Repository: Tencent/ncnn
Branch: master
Commit: 939f24fc2b44
Files: 3805
Total size: 33.0 MB

Directory structure:
gitextract_nmtq5ath/

├── .astylerc
├── .clang-format
├── .gitattributes
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug.md
│   │   ├── model-convert.md
│   │   ├── others.md
│   │   └── quantization.md
│   ├── dependabot.yml
│   ├── labeler.yml
│   └── workflows/
│       ├── android.yml
│       ├── code-format-msg.yml
│       ├── code-format.yml
│       ├── codeql-analysis.yml
│       ├── compare-binary-size-pr-comment.yml
│       ├── compare-binary-size.yml
│       ├── elf-riscv32.yml
│       ├── elf-riscv64.yml
│       ├── esp32.yml
│       ├── harmonyos.yml
│       ├── ios.yml
│       ├── labeler.yml
│       ├── linux-aarch64.yml
│       ├── linux-arm.yml
│       ├── linux-loongarch64.yml
│       ├── linux-mips.yml
│       ├── linux-mips64.yml
│       ├── linux-ppc64.yml
│       ├── linux-riscv32.yml
│       ├── linux-riscv64.yml
│       ├── linux-x64-cpu-clang.yml
│       ├── linux-x64-cpu-gcc-musl.yml
│       ├── linux-x64-cpu-gcc.yml
│       ├── linux-x64-gpu-clang.yml
│       ├── linux-x64-gpu-gcc.yml
│       ├── linux-x64-sde.yml
│       ├── linux-x86-cpu-clang.yml
│       ├── linux-x86-cpu-gcc.yml
│       ├── mac-catalyst.yml
│       ├── macos.yml
│       ├── pnnx.yml
│       ├── python.yml
│       ├── release-python.yml
│       ├── release.yml
│       ├── sync-wiki.yml
│       ├── test-coverage.yml
│       ├── tvos.yml
│       ├── visionos.yml
│       ├── watchos.yml
│       ├── web-assembly.yml
│       ├── windows-arm.yml
│       ├── windows-clang.yml
│       ├── windows-mingw.yml
│       ├── windows-xp.yml
│       └── windows.yml
├── .gitignore
├── .gitmodules
├── CITATION.cff
├── CMakeLists.txt
├── CONTRIBUTING.md
├── Info.plist
├── LICENSE.txt
├── MANIFEST.in
├── README.md
├── benchmark/
│   ├── CMakeLists.txt
│   ├── FastestDet.param
│   ├── README.md
│   ├── RankCards/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── Rcards.h
│   │   └── main.cpp
│   ├── alexnet.param
│   ├── benchncnn.cpp
│   ├── benchncnn_param_data.h.in
│   ├── blazeface.param
│   ├── efficientnet_b0.param
│   ├── efficientnetv2_b0.param
│   ├── googlenet.param
│   ├── googlenet_int8.param
│   ├── mnasnet.param
│   ├── mobilenet.param
│   ├── mobilenet_int8.param
│   ├── mobilenet_ssd.param
│   ├── mobilenet_ssd_int8.param
│   ├── mobilenet_v2.param
│   ├── mobilenet_v3.param
│   ├── mobilenet_yolo.param
│   ├── mobilenetv2_yolov3.param
│   ├── nanodet_m.param
│   ├── proxylessnasnet.param
│   ├── regnety_400m.param
│   ├── resnet18.param
│   ├── resnet18_int8.param
│   ├── resnet50.param
│   ├── resnet50_int8.param
│   ├── shufflenet.param
│   ├── shufflenet_v2.param
│   ├── squeezenet.param
│   ├── squeezenet_int8.param
│   ├── squeezenet_ssd.param
│   ├── squeezenet_ssd_int8.param
│   ├── vgg16.param
│   ├── vgg16_int8.param
│   ├── vision_transformer.param
│   ├── yolo-fastest-1.1.param
│   ├── yolo-fastestv2.param
│   └── yolov4-tiny.param
├── build-android.cmd
├── build.sh
├── cmake/
│   ├── ncnnConfig.cmake.in
│   ├── ncnn_add_layer.cmake
│   ├── ncnn_add_param.cmake
│   ├── ncnn_add_shader.cmake
│   ├── ncnn_generate_avx512_source.cmake
│   ├── ncnn_generate_avx_source.cmake
│   ├── ncnn_generate_fma_source.cmake
│   ├── ncnn_generate_lasx_source.cmake
│   ├── ncnn_generate_lsx_source.cmake
│   ├── ncnn_generate_msa_source.cmake
│   ├── ncnn_generate_param_header.cmake
│   ├── ncnn_generate_rvv_source.cmake
│   ├── ncnn_generate_shader_comp_header.cmake
│   ├── ncnn_generate_xtheadvector_source.cmake
│   └── run_test.cmake
├── codeformat.sh
├── docs/
│   ├── Home.md
│   ├── application-with-ncnn-inside.md
│   ├── benchmark/
│   │   ├── the-benchmark-of-caffe-android-lib,-mini-caffe,-and-ncnn.md
│   │   └── vulkan-conformance-test.md
│   ├── developer-guide/
│   │   ├── aarch64-mix-assembly-and-intrinsic.md
│   │   ├── add-custom-layer.zh.md
│   │   ├── arm-a53-a55-dual-issue.md
│   │   ├── armv7-mix-assembly-and-intrinsic.md
│   │   ├── binaryop-broadcasting.md
│   │   ├── build-ncnn-on-windows-xp.zh.md
│   │   ├── custom-allocator.md
│   │   ├── element-packing.md
│   │   ├── expression.md
│   │   ├── glsl-extension.md
│   │   ├── glsl-extension.zh.md
│   │   ├── how-to-be-a-contributor.zh.md
│   │   ├── how-to-implement-custom-layer-step-by-step.md
│   │   ├── how-to-write-a-neon-optimized-op-kernel.md
│   │   ├── how-to-write-a-sse-optimized-op-kernel.zh.md
│   │   ├── kvcache.md
│   │   ├── layer-feat-mask.md
│   │   ├── layer-support-behavior.md
│   │   ├── low-level-operation-api.md
│   │   ├── ncnn-tips-and-tricks.zh.md
│   │   ├── new-model-load-api.md
│   │   ├── new-param-load-api.md
│   │   ├── operation-param-weight-table.md
│   │   ├── operators.md
│   │   ├── param-and-model-file-structure.md
│   │   ├── preload-practice.zh.md
│   │   ├── tensorflow-op-combination.md
│   │   └── vulkan-driver-loader.md
│   ├── faq.en.md
│   ├── faq.md
│   ├── how-to-build/
│   │   ├── build-mlir2ncnn.md
│   │   └── how-to-build.md
│   └── how-to-use-and-FAQ/
│       ├── FAQ-ncnn-produce-wrong-result.md
│       ├── FAQ-ncnn-protobuf-problem.zh.md
│       ├── FAQ-ncnn-throw-error.md
│       ├── FAQ-ncnn-vulkan.md
│       ├── build-minimal-library.md
│       ├── efficient-roi-resize-rotate.md
│       ├── ncnn-load-model.md
│       ├── openmp-best-practice.md
│       ├── openmp-best-practice.zh.md
│       ├── quantized-int8-inference.md
│       ├── use-ncnn-with-alexnet.md
│       ├── use-ncnn-with-alexnet.zh.md
│       ├── use-ncnn-with-opencv.md
│       ├── use-ncnn-with-own-project.md
│       ├── use-ncnn-with-pytorch-or-onnx.md
│       ├── use-ncnnoptimize-to-optimize-model.md
│       └── vulkan-notes.md
├── examples/
│   ├── CMakeLists.txt
│   ├── arcface.cpp
│   ├── fasterrcnn.cpp
│   ├── mobilenetssd.cpp
│   ├── mobilenetv2ssdlite.cpp
│   ├── mobilenetv3ssdlite.cpp
│   ├── nanodet.cpp
│   ├── nanodetplus_pnnx.cpp
│   ├── p2pnet.cpp
│   ├── peleenetssd_seg.cpp
│   ├── piper.cpp
│   ├── ppocrv5.cpp
│   ├── ppocrv5_dict.h
│   ├── retinaface.cpp
│   ├── rfcn.cpp
│   ├── rvm.cpp
│   ├── scrfd.cpp
│   ├── scrfd_crowdhuman.cpp
│   ├── shufflenetv2.cpp
│   ├── simplepose.cpp
│   ├── squeezencnn/
│   │   └── README.md
│   ├── squeezenet.cpp
│   ├── squeezenet_c_api.cpp
│   ├── squeezenet_v1.1.caffemodel
│   ├── squeezenet_v1.1.param
│   ├── squeezenet_v1.1.prototxt
│   ├── squeezenetssd.cpp
│   ├── synset_words.txt
│   ├── whisper.cpp
│   ├── yolact.cpp
│   ├── yolo11.cpp
│   ├── yolo11_cls.cpp
│   ├── yolo11_obb.cpp
│   ├── yolo11_pose.cpp
│   ├── yolo11_seg.cpp
│   ├── yolov2.cpp
│   ├── yolov3.cpp
│   ├── yolov4.cpp
│   ├── yolov5.cpp
│   ├── yolov5_pnnx.cpp
│   ├── yolov7.cpp
│   ├── yolov7_pnnx.cpp
│   ├── yolov8.cpp
│   ├── yolov8_cls.cpp
│   ├── yolov8_obb.cpp
│   ├── yolov8_pose.cpp
│   ├── yolov8_seg.cpp
│   ├── yoloworld.cpp
│   └── yolox.cpp
├── package.sh
├── pyproject.toml
├── python/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── examples/
│   │   ├── fasterrcnn.py
│   │   ├── mobilenetssd.py
│   │   ├── mobilenetv2ssdlite.py
│   │   ├── mobilenetv3ssdlite.py
│   │   ├── model_zoo.py
│   │   ├── nanodet.py
│   │   ├── peleenetssd.py
│   │   ├── retinaface.py
│   │   ├── rfcn.py
│   │   ├── shufflenetv2.py
│   │   ├── simplepose.py
│   │   ├── squeezenet.py
│   │   ├── squeezenetssd.py
│   │   ├── yolact.py
│   │   ├── yolov2.py
│   │   ├── yolov3.py
│   │   ├── yolov4.py
│   │   ├── yolov5.py
│   │   └── yolov8.py
│   ├── ncnn/
│   │   ├── __init__.py
│   │   ├── model_zoo/
│   │   │   ├── __init__.py
│   │   │   ├── fasterrcnn.py
│   │   │   ├── mobilenetssd.py
│   │   │   ├── mobilenetv2ssdlite.py
│   │   │   ├── mobilenetv3ssdlite.py
│   │   │   ├── model_store.py
│   │   │   ├── model_zoo.py
│   │   │   ├── nanodet.py
│   │   │   ├── peleenetssd.py
│   │   │   ├── retinaface.py
│   │   │   ├── rfcn.py
│   │   │   ├── shufflenetv2.py
│   │   │   ├── simplepose.py
│   │   │   ├── squeezenet.py
│   │   │   ├── squeezenetssd.py
│   │   │   ├── yolact.py
│   │   │   ├── yolov2.py
│   │   │   ├── yolov3.py
│   │   │   ├── yolov4.py
│   │   │   ├── yolov5.py
│   │   │   ├── yolov7.py
│   │   │   └── yolov8.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── download.py
│   │       ├── functional.py
│   │       ├── objects.py
│   │       └── visual.py
│   ├── requirements.txt
│   ├── setup.py.i
│   ├── src/
│   │   ├── main.cpp
│   │   ├── pybind11_allocator.h
│   │   ├── pybind11_bind.h
│   │   ├── pybind11_datareader.h
│   │   ├── pybind11_layer.h
│   │   ├── pybind11_mat.h
│   │   └── pybind11_modelbin.h
│   └── tests/
│       ├── benchmark.py
│       ├── custom_layer.param
│       ├── test.param
│       ├── test_allocator.py
│       ├── test_blob.py
│       ├── test_extractor.py
│       ├── test_mat.py
│       ├── test_net.py
│       ├── test_option.py
│       ├── test_paramdict.py
│       ├── test_vulkan_allocator.py
│       └── test_vulkan_device.py
├── setup.py
├── src/
│   ├── CMakeLists.txt
│   ├── allocator.cpp
│   ├── allocator.h
│   ├── benchmark.cpp
│   ├── benchmark.h
│   ├── blob.cpp
│   ├── blob.h
│   ├── c_api.cpp
│   ├── c_api.h
│   ├── command.cpp
│   ├── command.h
│   ├── convert_ycbcr.comp
│   ├── cpu.cpp
│   ├── cpu.h
│   ├── datareader.cpp
│   ├── datareader.h
│   ├── expression.cpp
│   ├── expression.h
│   ├── gpu.cpp
│   ├── gpu.h
│   ├── layer/
│   │   ├── absval.cpp
│   │   ├── absval.h
│   │   ├── argmax.cpp
│   │   ├── argmax.h
│   │   ├── arm/
│   │   │   ├── absval_arm.cpp
│   │   │   ├── absval_arm.h
│   │   │   ├── arm_activation.h
│   │   │   ├── arm_usability.h
│   │   │   ├── batchnorm_arm.cpp
│   │   │   ├── batchnorm_arm.h
│   │   │   ├── batchnorm_arm_asimdhp.cpp
│   │   │   ├── bias_arm.cpp
│   │   │   ├── bias_arm.h
│   │   │   ├── binaryop_arm.cpp
│   │   │   ├── binaryop_arm.h
│   │   │   ├── binaryop_arm_asimdhp.cpp
│   │   │   ├── cast_arm.cpp
│   │   │   ├── cast_arm.h
│   │   │   ├── cast_arm_bf16.cpp
│   │   │   ├── cast_arm_vfpv4.cpp
│   │   │   ├── cast_bf16.h
│   │   │   ├── cast_fp16.h
│   │   │   ├── clip_arm.cpp
│   │   │   ├── clip_arm.h
│   │   │   ├── clip_arm_asimdhp.cpp
│   │   │   ├── concat_arm.cpp
│   │   │   ├── concat_arm.h
│   │   │   ├── convolution1d_arm.cpp
│   │   │   ├── convolution1d_arm.h
│   │   │   ├── convolution1d_arm_asimdhp.cpp
│   │   │   ├── convolution1d_packed.h
│   │   │   ├── convolution1d_packed_bf16s.h
│   │   │   ├── convolution1d_packed_fp16s.h
│   │   │   ├── convolution_1x1.h
│   │   │   ├── convolution_2x2.h
│   │   │   ├── convolution_3x3.h
│   │   │   ├── convolution_3x3_int8.h
│   │   │   ├── convolution_3x3_pack1to4.h
│   │   │   ├── convolution_3x3_pack1to4_bf16s.h
│   │   │   ├── convolution_3x3_pack1to4_fp16s.h
│   │   │   ├── convolution_3x3_pack1to8_fp16s.h
│   │   │   ├── convolution_3x3_pack4.h
│   │   │   ├── convolution_3x3_pack4_bf16s.h
│   │   │   ├── convolution_3x3_pack4_fp16s.h
│   │   │   ├── convolution_3x3_pack4to1.h
│   │   │   ├── convolution_3x3_pack8_fp16s.h
│   │   │   ├── convolution_3x3_winograd.h
│   │   │   ├── convolution_3x3_winograd_bf16s.h
│   │   │   ├── convolution_3x3_winograd_fp16s.h
│   │   │   ├── convolution_3x3_winograd_int8.h
│   │   │   ├── convolution_4x4.h
│   │   │   ├── convolution_5x5.h
│   │   │   ├── convolution_5x5_pack4.h
│   │   │   ├── convolution_5x5_pack4_bf16s.h
│   │   │   ├── convolution_5x5_pack8_fp16s.h
│   │   │   ├── convolution_7x7.h
│   │   │   ├── convolution_7x7_pack1to4.h
│   │   │   ├── convolution_7x7_pack1to4_bf16s.h
│   │   │   ├── convolution_7x7_pack1to8_fp16s.h
│   │   │   ├── convolution_arm.cpp
│   │   │   ├── convolution_arm.h
│   │   │   ├── convolution_arm_asimddp.cpp
│   │   │   ├── convolution_arm_asimdhp.cpp
│   │   │   ├── convolution_arm_i8mm.cpp
│   │   │   ├── convolution_im2col_gemm.h
│   │   │   ├── convolution_im2col_gemm_bf16s.h
│   │   │   ├── convolution_im2col_gemm_bf16s_fp16s.h
│   │   │   ├── convolution_im2col_gemm_fp16s.h
│   │   │   ├── convolution_im2col_gemm_int8.h
│   │   │   ├── convolution_packed.h
│   │   │   ├── convolution_packed_bf16s.h
│   │   │   ├── convolution_packed_fp16s.h
│   │   │   ├── convolution_packed_int8.h
│   │   │   ├── convolutiondepthwise_3x3.h
│   │   │   ├── convolutiondepthwise_3x3_fp16s.h
│   │   │   ├── convolutiondepthwise_3x3_int8.h
│   │   │   ├── convolutiondepthwise_3x3_pack4.h
│   │   │   ├── convolutiondepthwise_3x3_pack4_bf16s.h
│   │   │   ├── convolutiondepthwise_3x3_pack8_fp16s.h
│   │   │   ├── convolutiondepthwise_3x3_pack8_int8.h
│   │   │   ├── convolutiondepthwise_5x5.h
│   │   │   ├── convolutiondepthwise_5x5_pack4.h
│   │   │   ├── convolutiondepthwise_5x5_pack4_bf16s.h
│   │   │   ├── convolutiondepthwise_5x5_pack8_fp16s.h
│   │   │   ├── convolutiondepthwise_arm.cpp
│   │   │   ├── convolutiondepthwise_arm.h
│   │   │   ├── convolutiondepthwise_arm_asimdhp.cpp
│   │   │   ├── crop_arm.cpp
│   │   │   ├── crop_arm.h
│   │   │   ├── deconvolution_3x3.h
│   │   │   ├── deconvolution_4x4.h
│   │   │   ├── deconvolution_4x4_fp16s.h
│   │   │   ├── deconvolution_arm.cpp
│   │   │   ├── deconvolution_arm.h
│   │   │   ├── deconvolution_arm_asimdhp.cpp
│   │   │   ├── deconvolutiondepthwise_arm.cpp
│   │   │   ├── deconvolutiondepthwise_arm.h
│   │   │   ├── deconvolutiondepthwise_arm_asimdhp.cpp
│   │   │   ├── dequantize_arm.cpp
│   │   │   ├── dequantize_arm.h
│   │   │   ├── dequantize_arm_asimdhp.cpp
│   │   │   ├── dropout_arm.cpp
│   │   │   ├── dropout_arm.h
│   │   │   ├── eltwise_arm.cpp
│   │   │   ├── eltwise_arm.h
│   │   │   ├── eltwise_arm_asimdhp.cpp
│   │   │   ├── flatten_arm.cpp
│   │   │   ├── flatten_arm.h
│   │   │   ├── gelu_arm.cpp
│   │   │   ├── gelu_arm.h
│   │   │   ├── gelu_arm_asimdhp.cpp
│   │   │   ├── gemm_arm.cpp
│   │   │   ├── gemm_arm.h
│   │   │   ├── gemm_arm_asimddp.cpp
│   │   │   ├── gemm_arm_asimdfhm.cpp
│   │   │   ├── gemm_arm_asimdhp.cpp
│   │   │   ├── gemm_arm_i8mm.cpp
│   │   │   ├── gemm_arm_vfpv4.cpp
│   │   │   ├── gemm_bf16s.h
│   │   │   ├── gemm_bf16s_fp16s.h
│   │   │   ├── gemm_fp16s.h
│   │   │   ├── gemm_int8.h
│   │   │   ├── gemm_int8_bf16s.h
│   │   │   ├── gemm_int8_fp16s.h
│   │   │   ├── groupnorm_arm.cpp
│   │   │   ├── groupnorm_arm.h
│   │   │   ├── groupnorm_arm_asimdhp.cpp
│   │   │   ├── gru_arm.cpp
│   │   │   ├── gru_arm.h
│   │   │   ├── gru_arm_asimddp.cpp
│   │   │   ├── gru_arm_asimdhp.cpp
│   │   │   ├── gru_arm_vfpv4.cpp
│   │   │   ├── gru_int8.h
│   │   │   ├── hardsigmoid_arm.cpp
│   │   │   ├── hardsigmoid_arm.h
│   │   │   ├── hardsigmoid_arm_asimdhp.cpp
│   │   │   ├── hardswish_arm.cpp
│   │   │   ├── hardswish_arm.h
│   │   │   ├── hardswish_arm_asimdhp.cpp
│   │   │   ├── innerproduct_arm.cpp
│   │   │   ├── innerproduct_arm.h
│   │   │   ├── innerproduct_arm_asimdfhm.cpp
│   │   │   ├── innerproduct_arm_asimdhp.cpp
│   │   │   ├── innerproduct_arm_vfpv4.cpp
│   │   │   ├── innerproduct_fp16s.h
│   │   │   ├── innerproduct_gemm_fp16s.h
│   │   │   ├── instancenorm_arm.cpp
│   │   │   ├── instancenorm_arm.h
│   │   │   ├── instancenorm_arm_asimdhp.cpp
│   │   │   ├── interp_arm.cpp
│   │   │   ├── interp_arm.h
│   │   │   ├── interp_arm_asimdhp.cpp
│   │   │   ├── interp_bicubic.h
│   │   │   ├── interp_bicubic_bf16s.h
│   │   │   ├── interp_bicubic_fp16s.h
│   │   │   ├── interp_bicubic_pack4.h
│   │   │   ├── interp_bicubic_pack4_bf16s.h
│   │   │   ├── interp_bicubic_pack4_fp16s.h
│   │   │   ├── interp_bicubic_pack8_fp16s.h
│   │   │   ├── interp_bilinear.h
│   │   │   ├── interp_bilinear_bf16s.h
│   │   │   ├── interp_bilinear_fp16s.h
│   │   │   ├── interp_bilinear_pack4.h
│   │   │   ├── interp_bilinear_pack4_bf16s.h
│   │   │   ├── interp_bilinear_pack4_fp16s.h
│   │   │   ├── interp_bilinear_pack8_fp16s.h
│   │   │   ├── layernorm_arm.cpp
│   │   │   ├── layernorm_arm.h
│   │   │   ├── layernorm_arm_asimdhp.cpp
│   │   │   ├── lrn_arm.cpp
│   │   │   ├── lrn_arm.h
│   │   │   ├── lstm_arm.cpp
│   │   │   ├── lstm_arm.h
│   │   │   ├── lstm_arm_asimddp.cpp
│   │   │   ├── lstm_arm_asimdhp.cpp
│   │   │   ├── lstm_arm_vfpv4.cpp
│   │   │   ├── lstm_int8.h
│   │   │   ├── matmul_arm.cpp
│   │   │   ├── matmul_arm.h
│   │   │   ├── mish_arm.cpp
│   │   │   ├── mish_arm.h
│   │   │   ├── mish_arm_asimdhp.cpp
│   │   │   ├── multiheadattention_arm.cpp
│   │   │   ├── multiheadattention_arm.h
│   │   │   ├── neon_mathfun.h
│   │   │   ├── neon_mathfun_fp16s.h
│   │   │   ├── neon_mathfun_tanh.h
│   │   │   ├── packing_arm.cpp
│   │   │   ├── packing_arm.h
│   │   │   ├── padding_arm.cpp
│   │   │   ├── padding_arm.h
│   │   │   ├── padding_pack4.h
│   │   │   ├── padding_pack4_bf16s_fp16s.h
│   │   │   ├── padding_pack8_fp16s.h
│   │   │   ├── padding_pack8_int8.h
│   │   │   ├── pixelshuffle_arm.cpp
│   │   │   ├── pixelshuffle_arm.h
│   │   │   ├── pooling_2x2.h
│   │   │   ├── pooling_2x2_pack4.h
│   │   │   ├── pooling_2x2_pack4_bf16s.h
│   │   │   ├── pooling_3x3.h
│   │   │   ├── pooling_3x3_pack4.h
│   │   │   ├── pooling_3x3_pack4_bf16s.h
│   │   │   ├── pooling_arm.cpp
│   │   │   ├── pooling_arm.h
│   │   │   ├── pooling_arm_asimdhp.cpp
│   │   │   ├── prelu_arm.cpp
│   │   │   ├── prelu_arm.h
│   │   │   ├── prelu_arm_asimdhp.cpp
│   │   │   ├── quantize_arm.cpp
│   │   │   ├── quantize_arm.h
│   │   │   ├── quantize_arm_asimdhp.cpp
│   │   │   ├── relu_arm.cpp
│   │   │   ├── relu_arm.h
│   │   │   ├── relu_arm_asimdhp.cpp
│   │   │   ├── requantize_arm.cpp
│   │   │   ├── requantize_arm.h
│   │   │   ├── reshape_arm.cpp
│   │   │   ├── reshape_arm.h
│   │   │   ├── rmsnorm_arm.cpp
│   │   │   ├── rmsnorm_arm.h
│   │   │   ├── rmsnorm_arm_asimdhp.cpp
│   │   │   ├── rnn_arm.cpp
│   │   │   ├── rnn_arm.h
│   │   │   ├── rnn_arm_asimddp.cpp
│   │   │   ├── rnn_arm_asimdhp.cpp
│   │   │   ├── rnn_arm_vfpv4.cpp
│   │   │   ├── rnn_int8.h
│   │   │   ├── scale_arm.cpp
│   │   │   ├── scale_arm.h
│   │   │   ├── selu_arm.cpp
│   │   │   ├── selu_arm.h
│   │   │   ├── shufflechannel_arm.cpp
│   │   │   ├── shufflechannel_arm.h
│   │   │   ├── sigmoid_arm.cpp
│   │   │   ├── sigmoid_arm.h
│   │   │   ├── sigmoid_arm_asimdhp.cpp
│   │   │   ├── slice_arm.cpp
│   │   │   ├── slice_arm.h
│   │   │   ├── softmax_arm.cpp
│   │   │   ├── softmax_arm.h
│   │   │   ├── softmax_arm_asimdhp.cpp
│   │   │   ├── swish_arm.cpp
│   │   │   ├── swish_arm.h
│   │   │   ├── swish_arm_asimdhp.cpp
│   │   │   ├── tanh_arm.cpp
│   │   │   ├── tanh_arm.h
│   │   │   ├── tanh_arm_asimdhp.cpp
│   │   │   ├── unaryop_arm.cpp
│   │   │   ├── unaryop_arm.h
│   │   │   └── unaryop_arm_asimdhp.cpp
│   │   ├── batchnorm.cpp
│   │   ├── batchnorm.h
│   │   ├── bias.cpp
│   │   ├── bias.h
│   │   ├── binaryop.cpp
│   │   ├── binaryop.h
│   │   ├── bnll.cpp
│   │   ├── bnll.h
│   │   ├── cast.cpp
│   │   ├── cast.h
│   │   ├── celu.cpp
│   │   ├── celu.h
│   │   ├── clip.cpp
│   │   ├── clip.h
│   │   ├── concat.cpp
│   │   ├── concat.h
│   │   ├── convolution.cpp
│   │   ├── convolution.h
│   │   ├── convolution1d.cpp
│   │   ├── convolution1d.h
│   │   ├── convolution3d.cpp
│   │   ├── convolution3d.h
│   │   ├── convolutiondepthwise.cpp
│   │   ├── convolutiondepthwise.h
│   │   ├── convolutiondepthwise1d.cpp
│   │   ├── convolutiondepthwise1d.h
│   │   ├── convolutiondepthwise3d.cpp
│   │   ├── convolutiondepthwise3d.h
│   │   ├── copyto.cpp
│   │   ├── copyto.h
│   │   ├── crop.cpp
│   │   ├── crop.h
│   │   ├── cumulativesum.cpp
│   │   ├── cumulativesum.h
│   │   ├── deconvolution.cpp
│   │   ├── deconvolution.h
│   │   ├── deconvolution1d.cpp
│   │   ├── deconvolution1d.h
│   │   ├── deconvolution3d.cpp
│   │   ├── deconvolution3d.h
│   │   ├── deconvolutiondepthwise.cpp
│   │   ├── deconvolutiondepthwise.h
│   │   ├── deconvolutiondepthwise1d.cpp
│   │   ├── deconvolutiondepthwise1d.h
│   │   ├── deconvolutiondepthwise3d.cpp
│   │   ├── deconvolutiondepthwise3d.h
│   │   ├── deepcopy.cpp
│   │   ├── deepcopy.h
│   │   ├── deformableconv2d.cpp
│   │   ├── deformableconv2d.h
│   │   ├── dequantize.cpp
│   │   ├── dequantize.h
│   │   ├── detectionoutput.cpp
│   │   ├── detectionoutput.h
│   │   ├── diag.cpp
│   │   ├── diag.h
│   │   ├── dropout.cpp
│   │   ├── dropout.h
│   │   ├── einsum.cpp
│   │   ├── einsum.h
│   │   ├── eltwise.cpp
│   │   ├── eltwise.h
│   │   ├── elu.cpp
│   │   ├── elu.h
│   │   ├── embed.cpp
│   │   ├── embed.h
│   │   ├── erf.cpp
│   │   ├── erf.h
│   │   ├── exp.cpp
│   │   ├── exp.h
│   │   ├── expanddims.cpp
│   │   ├── expanddims.h
│   │   ├── flatten.cpp
│   │   ├── flatten.h
│   │   ├── flip.cpp
│   │   ├── flip.h
│   │   ├── fold.cpp
│   │   ├── fold.h
│   │   ├── fused_activation.h
│   │   ├── gelu.cpp
│   │   ├── gelu.h
│   │   ├── gemm.cpp
│   │   ├── gemm.h
│   │   ├── glu.cpp
│   │   ├── glu.h
│   │   ├── gridsample.cpp
│   │   ├── gridsample.h
│   │   ├── groupnorm.cpp
│   │   ├── groupnorm.h
│   │   ├── gru.cpp
│   │   ├── gru.h
│   │   ├── hardsigmoid.cpp
│   │   ├── hardsigmoid.h
│   │   ├── hardswish.cpp
│   │   ├── hardswish.h
│   │   ├── innerproduct.cpp
│   │   ├── innerproduct.h
│   │   ├── input.cpp
│   │   ├── input.h
│   │   ├── instancenorm.cpp
│   │   ├── instancenorm.h
│   │   ├── interp.cpp
│   │   ├── interp.h
│   │   ├── inversespectrogram.cpp
│   │   ├── inversespectrogram.h
│   │   ├── layernorm.cpp
│   │   ├── layernorm.h
│   │   ├── log.cpp
│   │   ├── log.h
│   │   ├── loongarch/
│   │   │   ├── absval_loongarch.cpp
│   │   │   ├── absval_loongarch.h
│   │   │   ├── batchnorm_loongarch.cpp
│   │   │   ├── batchnorm_loongarch.h
│   │   │   ├── bias_loongarch.cpp
│   │   │   ├── bias_loongarch.h
│   │   │   ├── binaryop_loongarch.cpp
│   │   │   ├── binaryop_loongarch.h
│   │   │   ├── cast_loongarch.cpp
│   │   │   ├── cast_loongarch.h
│   │   │   ├── clip_loongarch.cpp
│   │   │   ├── clip_loongarch.h
│   │   │   ├── concat_loongarch.cpp
│   │   │   ├── concat_loongarch.h
│   │   │   ├── convolution1d_loongarch.cpp
│   │   │   ├── convolution1d_loongarch.h
│   │   │   ├── convolution_1x1.h
│   │   │   ├── convolution_1x1_int8.h
│   │   │   ├── convolution_1x1_pack1to4_int8.h
│   │   │   ├── convolution_1x1_pack4.h
│   │   │   ├── convolution_1x1_pack4to1.h
│   │   │   ├── convolution_1x1_pack8to1_int8.h
│   │   │   ├── convolution_1x1_pack8to4_int8.h
│   │   │   ├── convolution_3x3.h
│   │   │   ├── convolution_3x3_int8.h
│   │   │   ├── convolution_3x3_pack1to4.h
│   │   │   ├── convolution_3x3_pack4.h
│   │   │   ├── convolution_3x3_pack8to1_int8.h
│   │   │   ├── convolution_3x3_pack8to4_int8.h
│   │   │   ├── convolution_7x7_pack1to4.h
│   │   │   ├── convolution_int8.h
│   │   │   ├── convolution_loongarch.cpp
│   │   │   ├── convolution_loongarch.h
│   │   │   ├── convolution_pack1to4.h
│   │   │   ├── convolution_pack1to4_int8.h
│   │   │   ├── convolution_pack4.h
│   │   │   ├── convolution_pack4to1.h
│   │   │   ├── convolution_pack8to1_int8.h
│   │   │   ├── convolution_pack8to4_int8.h
│   │   │   ├── convolution_sgemm.h
│   │   │   ├── convolution_sgemm_int8.h
│   │   │   ├── convolution_sgemm_pack1to4_int8.h
│   │   │   ├── convolution_sgemm_pack4.h
│   │   │   ├── convolution_sgemm_pack4to1.h
│   │   │   ├── convolution_sgemm_pack8to1_int8.h
│   │   │   ├── convolution_sgemm_pack8to4_int8.h
│   │   │   ├── convolution_winograd_dot.h
│   │   │   ├── convolution_winograd_dot_int8.h
│   │   │   ├── convolution_winograd_dot_pack4.h
│   │   │   ├── convolution_winograd_dot_pack8to1_int8.h
│   │   │   ├── convolution_winograd_dot_pack8to4_int8.h
│   │   │   ├── convolution_winograd_transform.h
│   │   │   ├── convolution_winograd_transform_int8.h
│   │   │   ├── convolution_winograd_transform_pack4.h
│   │   │   ├── convolution_winograd_transform_pack4_int8.h
│   │   │   ├── convolution_winograd_transform_pack8_int8.h
│   │   │   ├── convolutiondepthwise_3x3.h
│   │   │   ├── convolutiondepthwise_3x3_pack4.h
│   │   │   ├── convolutiondepthwise_5x5_pack4.h
│   │   │   ├── convolutiondepthwise_loongarch.cpp
│   │   │   ├── convolutiondepthwise_loongarch.h
│   │   │   ├── crop_loongarch.cpp
│   │   │   ├── crop_loongarch.h
│   │   │   ├── deconvolution_loongarch.cpp
│   │   │   ├── deconvolution_loongarch.h
│   │   │   ├── deconvolution_pack1to4.h
│   │   │   ├── deconvolution_pack4.h
│   │   │   ├── deconvolution_pack4to1.h
│   │   │   ├── deconvolutiondepthwise_loongarch.cpp
│   │   │   ├── deconvolutiondepthwise_loongarch.h
│   │   │   ├── dequantize_loongarch.cpp
│   │   │   ├── dequantize_loongarch.h
│   │   │   ├── dropout_loongarch.cpp
│   │   │   ├── dropout_loongarch.h
│   │   │   ├── eltwise_loongarch.cpp
│   │   │   ├── eltwise_loongarch.h
│   │   │   ├── flatten_loongarch.cpp
│   │   │   ├── flatten_loongarch.h
│   │   │   ├── hardsigmoid_loongarch.cpp
│   │   │   ├── hardsigmoid_loongarch.h
│   │   │   ├── hardswish_loongarch.cpp
│   │   │   ├── hardswish_loongarch.h
│   │   │   ├── innerproduct_loongarch.cpp
│   │   │   ├── innerproduct_loongarch.h
│   │   │   ├── interp_bicubic.h
│   │   │   ├── interp_bicubic_pack4.h
│   │   │   ├── interp_bilinear.h
│   │   │   ├── interp_bilinear_pack4.h
│   │   │   ├── interp_loongarch.cpp
│   │   │   ├── interp_loongarch.h
│   │   │   ├── lasx_mathfun.h
│   │   │   ├── loongarch_activation.h
│   │   │   ├── loongarch_usability.h
│   │   │   ├── lsx_mathfun.h
│   │   │   ├── mish_loongarch.cpp
│   │   │   ├── mish_loongarch.h
│   │   │   ├── packing_loongarch.cpp
│   │   │   ├── packing_loongarch.h
│   │   │   ├── padding_loongarch.cpp
│   │   │   ├── padding_loongarch.h
│   │   │   ├── padding_pack4.h
│   │   │   ├── padding_pack8_int8.h
│   │   │   ├── pooling_loongarch.cpp
│   │   │   ├── pooling_loongarch.h
│   │   │   ├── prelu_loongarch.cpp
│   │   │   ├── prelu_loongarch.h
│   │   │   ├── quantize_loongarch.cpp
│   │   │   ├── quantize_loongarch.h
│   │   │   ├── relu_loongarch.cpp
│   │   │   ├── relu_loongarch.h
│   │   │   ├── requantize_loongarch.cpp
│   │   │   ├── requantize_loongarch.h
│   │   │   ├── sigmoid_loongarch.cpp
│   │   │   ├── sigmoid_loongarch.h
│   │   │   ├── slice_loongarch.cpp
│   │   │   ├── slice_loongarch.h
│   │   │   ├── softmax_loongarch.cpp
│   │   │   ├── softmax_loongarch.h
│   │   │   ├── swish_loongarch.cpp
│   │   │   ├── swish_loongarch.h
│   │   │   ├── tanh_loongarch.cpp
│   │   │   ├── tanh_loongarch.h
│   │   │   ├── unaryop_loongarch.cpp
│   │   │   └── unaryop_loongarch.h
│   │   ├── lrn.cpp
│   │   ├── lrn.h
│   │   ├── lstm.cpp
│   │   ├── lstm.h
│   │   ├── matmul.cpp
│   │   ├── matmul.h
│   │   ├── memorydata.cpp
│   │   ├── memorydata.h
│   │   ├── mips/
│   │   │   ├── absval_mips.cpp
│   │   │   ├── absval_mips.h
│   │   │   ├── batchnorm_mips.cpp
│   │   │   ├── batchnorm_mips.h
│   │   │   ├── bias_mips.cpp
│   │   │   ├── bias_mips.h
│   │   │   ├── binaryop_mips.cpp
│   │   │   ├── binaryop_mips.h
│   │   │   ├── cast_mips.cpp
│   │   │   ├── cast_mips.h
│   │   │   ├── clip_mips.cpp
│   │   │   ├── clip_mips.h
│   │   │   ├── concat_mips.cpp
│   │   │   ├── concat_mips.h
│   │   │   ├── convolution1d_mips.cpp
│   │   │   ├── convolution1d_mips.h
│   │   │   ├── convolution_1x1.h
│   │   │   ├── convolution_1x1_int8.h
│   │   │   ├── convolution_1x1_pack1to4_int8.h
│   │   │   ├── convolution_1x1_pack4.h
│   │   │   ├── convolution_1x1_pack4to1.h
│   │   │   ├── convolution_1x1_pack8to1_int8.h
│   │   │   ├── convolution_1x1_pack8to4_int8.h
│   │   │   ├── convolution_3x3.h
│   │   │   ├── convolution_3x3_int8.h
│   │   │   ├── convolution_3x3_pack1to4.h
│   │   │   ├── convolution_3x3_pack4.h
│   │   │   ├── convolution_3x3_pack8to1_int8.h
│   │   │   ├── convolution_3x3_pack8to4_int8.h
│   │   │   ├── convolution_7x7_pack1to4.h
│   │   │   ├── convolution_int8.h
│   │   │   ├── convolution_mips.cpp
│   │   │   ├── convolution_mips.h
│   │   │   ├── convolution_mips_mmi.cpp
│   │   │   ├── convolution_pack1to4.h
│   │   │   ├── convolution_pack1to4_int8.h
│   │   │   ├── convolution_pack4.h
│   │   │   ├── convolution_pack4to1.h
│   │   │   ├── convolution_pack8to1_int8.h
│   │   │   ├── convolution_pack8to4_int8.h
│   │   │   ├── convolution_sgemm.h
│   │   │   ├── convolution_sgemm_int8.h
│   │   │   ├── convolution_sgemm_pack1to4_int8.h
│   │   │   ├── convolution_sgemm_pack4.h
│   │   │   ├── convolution_sgemm_pack4to1.h
│   │   │   ├── convolution_sgemm_pack8to1_int8.h
│   │   │   ├── convolution_sgemm_pack8to4_int8.h
│   │   │   ├── convolution_winograd_dot.h
│   │   │   ├── convolution_winograd_dot_int8.h
│   │   │   ├── convolution_winograd_dot_pack4.h
│   │   │   ├── convolution_winograd_dot_pack8to1_int8.h
│   │   │   ├── convolution_winograd_dot_pack8to4_int8.h
│   │   │   ├── convolution_winograd_transform.h
│   │   │   ├── convolution_winograd_transform_int8.h
│   │   │   ├── convolution_winograd_transform_pack4.h
│   │   │   ├── convolution_winograd_transform_pack4_int8.h
│   │   │   ├── convolution_winograd_transform_pack8_int8.h
│   │   │   ├── convolutiondepthwise_3x3.h
│   │   │   ├── convolutiondepthwise_3x3_pack4.h
│   │   │   ├── convolutiondepthwise_5x5_pack4.h
│   │   │   ├── convolutiondepthwise_mips.cpp
│   │   │   ├── convolutiondepthwise_mips.h
│   │   │   ├── crop_mips.cpp
│   │   │   ├── crop_mips.h
│   │   │   ├── deconvolution_mips.cpp
│   │   │   ├── deconvolution_mips.h
│   │   │   ├── deconvolution_pack1to4.h
│   │   │   ├── deconvolution_pack4.h
│   │   │   ├── deconvolution_pack4to1.h
│   │   │   ├── deconvolutiondepthwise_mips.cpp
│   │   │   ├── deconvolutiondepthwise_mips.h
│   │   │   ├── dequantize_mips.cpp
│   │   │   ├── dequantize_mips.h
│   │   │   ├── dropout_mips.cpp
│   │   │   ├── dropout_mips.h
│   │   │   ├── eltwise_mips.cpp
│   │   │   ├── eltwise_mips.h
│   │   │   ├── elu_mips.cpp
│   │   │   ├── elu_mips.h
│   │   │   ├── erf_mips.cpp
│   │   │   ├── erf_mips.h
│   │   │   ├── flatten_mips.cpp
│   │   │   ├── flatten_mips.h
│   │   │   ├── gelu_mips.cpp
│   │   │   ├── gelu_mips.h
│   │   │   ├── hardsigmoid_mips.cpp
│   │   │   ├── hardsigmoid_mips.h
│   │   │   ├── hardswish_mips.cpp
│   │   │   ├── hardswish_mips.h
│   │   │   ├── innerproduct_mips.cpp
│   │   │   ├── innerproduct_mips.h
│   │   │   ├── interp_bicubic.h
│   │   │   ├── interp_bicubic_pack4.h
│   │   │   ├── interp_bilinear.h
│   │   │   ├── interp_bilinear_pack4.h
│   │   │   ├── interp_mips.cpp
│   │   │   ├── interp_mips.h
│   │   │   ├── loongson_mmi.h
│   │   │   ├── mips_activation.h
│   │   │   ├── mips_usability.h
│   │   │   ├── mish_mips.cpp
│   │   │   ├── mish_mips.h
│   │   │   ├── msa_mathfun.h
│   │   │   ├── packing_mips.cpp
│   │   │   ├── packing_mips.h
│   │   │   ├── padding_mips.cpp
│   │   │   ├── padding_mips.h
│   │   │   ├── padding_pack4.h
│   │   │   ├── padding_pack8_int8.h
│   │   │   ├── pooling_mips.cpp
│   │   │   ├── pooling_mips.h
│   │   │   ├── prelu_mips.cpp
│   │   │   ├── prelu_mips.h
│   │   │   ├── quantize_mips.cpp
│   │   │   ├── quantize_mips.h
│   │   │   ├── relu_mips.cpp
│   │   │   ├── relu_mips.h
│   │   │   ├── requantize_mips.cpp
│   │   │   ├── requantize_mips.h
│   │   │   ├── selu_mips.cpp
│   │   │   ├── selu_mips.h
│   │   │   ├── sigmoid_mips.cpp
│   │   │   ├── sigmoid_mips.h
│   │   │   ├── slice_mips.cpp
│   │   │   ├── slice_mips.h
│   │   │   ├── softmax_mips.cpp
│   │   │   ├── softmax_mips.h
│   │   │   ├── swish_mips.cpp
│   │   │   ├── swish_mips.h
│   │   │   ├── tanh_mips.cpp
│   │   │   ├── tanh_mips.h
│   │   │   ├── unaryop_mips.cpp
│   │   │   └── unaryop_mips.h
│   │   ├── mish.cpp
│   │   ├── mish.h
│   │   ├── multiheadattention.cpp
│   │   ├── multiheadattention.h
│   │   ├── mvn.cpp
│   │   ├── mvn.h
│   │   ├── noop.cpp
│   │   ├── noop.h
│   │   ├── normalize.cpp
│   │   ├── normalize.h
│   │   ├── packing.cpp
│   │   ├── packing.h
│   │   ├── padding.cpp
│   │   ├── padding.h
│   │   ├── permute.cpp
│   │   ├── permute.h
│   │   ├── pixelshuffle.cpp
│   │   ├── pixelshuffle.h
│   │   ├── pooling.cpp
│   │   ├── pooling.h
│   │   ├── pooling1d.cpp
│   │   ├── pooling1d.h
│   │   ├── pooling3d.cpp
│   │   ├── pooling3d.h
│   │   ├── power.cpp
│   │   ├── power.h
│   │   ├── prelu.cpp
│   │   ├── prelu.h
│   │   ├── priorbox.cpp
│   │   ├── priorbox.h
│   │   ├── proposal.cpp
│   │   ├── proposal.h
│   │   ├── psroipooling.cpp
│   │   ├── psroipooling.h
│   │   ├── quantize.cpp
│   │   ├── quantize.h
│   │   ├── reduction.cpp
│   │   ├── reduction.h
│   │   ├── relu.cpp
│   │   ├── relu.h
│   │   ├── reorg.cpp
│   │   ├── reorg.h
│   │   ├── requantize.cpp
│   │   ├── requantize.h
│   │   ├── reshape.cpp
│   │   ├── reshape.h
│   │   ├── riscv/
│   │   │   ├── absval_riscv.cpp
│   │   │   ├── absval_riscv.h
│   │   │   ├── absval_riscv_zfh.cpp
│   │   │   ├── batchnorm_riscv.cpp
│   │   │   ├── batchnorm_riscv.h
│   │   │   ├── batchnorm_riscv_zfh.cpp
│   │   │   ├── bias_riscv.cpp
│   │   │   ├── bias_riscv.h
│   │   │   ├── bias_riscv_zfh.cpp
│   │   │   ├── binaryop_riscv.cpp
│   │   │   ├── binaryop_riscv.h
│   │   │   ├── binaryop_riscv_zfh.cpp
│   │   │   ├── bnll_riscv.cpp
│   │   │   ├── bnll_riscv.h
│   │   │   ├── bnll_riscv_zfh.cpp
│   │   │   ├── cast_riscv.cpp
│   │   │   ├── cast_riscv.h
│   │   │   ├── cast_riscv_zfh.cpp
│   │   │   ├── celu_riscv.cpp
│   │   │   ├── celu_riscv.h
│   │   │   ├── celu_riscv_zfh.cpp
│   │   │   ├── clip_riscv.cpp
│   │   │   ├── clip_riscv.h
│   │   │   ├── clip_riscv_zfh.cpp
│   │   │   ├── concat_riscv.cpp
│   │   │   ├── concat_riscv.h
│   │   │   ├── convolution1d_riscv.cpp
│   │   │   ├── convolution1d_riscv.h
│   │   │   ├── convolution1d_riscv_zfh.cpp
│   │   │   ├── convolution_1x1.h
│   │   │   ├── convolution_1x1_fp16s.h
│   │   │   ├── convolution_1x1_pack1ton.h
│   │   │   ├── convolution_1x1_pack1ton_fp16s.h
│   │   │   ├── convolution_1x1_packn.h
│   │   │   ├── convolution_1x1_packn_fp16s.h
│   │   │   ├── convolution_1x1_packnto1.h
│   │   │   ├── convolution_1x1_packnto1_fp16s.h
│   │   │   ├── convolution_3x3.h
│   │   │   ├── convolution_3x3_pack1ton.h
│   │   │   ├── convolution_3x3_pack1ton_fp16s.h
│   │   │   ├── convolution_3x3_packn.h
│   │   │   ├── convolution_3x3_packn_fp16s.h
│   │   │   ├── convolution_7x7_pack1ton.h
│   │   │   ├── convolution_7x7_pack1ton_fp16s.h
│   │   │   ├── convolution_fp16s.h
│   │   │   ├── convolution_pack1ton.h
│   │   │   ├── convolution_pack1ton_fp16s.h
│   │   │   ├── convolution_packn.h
│   │   │   ├── convolution_packn_fp16s.h
│   │   │   ├── convolution_packnto1.h
│   │   │   ├── convolution_packnto1_fp16s.h
│   │   │   ├── convolution_riscv.cpp
│   │   │   ├── convolution_riscv.h
│   │   │   ├── convolution_riscv_zfh.cpp
│   │   │   ├── convolution_sgemm.h
│   │   │   ├── convolution_sgemm_fp16s.h
│   │   │   ├── convolution_sgemm_pack1ton.h
│   │   │   ├── convolution_sgemm_pack1ton_fp16s.h
│   │   │   ├── convolution_sgemm_packn.h
│   │   │   ├── convolution_sgemm_packn_fp16s.h
│   │   │   ├── convolution_sgemm_packnto1.h
│   │   │   ├── convolution_sgemm_packnto1_fp16s.h
│   │   │   ├── convolution_winograd_dot.h
│   │   │   ├── convolution_winograd_dot_packn.h
│   │   │   ├── convolution_winograd_dot_packn_fp16s.h
│   │   │   ├── convolution_winograd_transform.h
│   │   │   ├── convolution_winograd_transform_packn.h
│   │   │   ├── convolution_winograd_transform_packn_fp16s.h
│   │   │   ├── convolutiondepthwise_3x3.h
│   │   │   ├── convolutiondepthwise_3x3_packn.h
│   │   │   ├── convolutiondepthwise_3x3_packn_fp16s.h
│   │   │   ├── convolutiondepthwise_5x5_packn.h
│   │   │   ├── convolutiondepthwise_5x5_packn_fp16s.h
│   │   │   ├── convolutiondepthwise_riscv.cpp
│   │   │   ├── convolutiondepthwise_riscv.h
│   │   │   ├── convolutiondepthwise_riscv_zfh.cpp
│   │   │   ├── crop_riscv.cpp
│   │   │   ├── crop_riscv.h
│   │   │   ├── deconvolution_fp16s.h
│   │   │   ├── deconvolution_pack1ton.h
│   │   │   ├── deconvolution_pack1ton_fp16s.h
│   │   │   ├── deconvolution_packn.h
│   │   │   ├── deconvolution_packn_fp16s.h
│   │   │   ├── deconvolution_packnto1.h
│   │   │   ├── deconvolution_packnto1_fp16s.h
│   │   │   ├── deconvolution_riscv.cpp
│   │   │   ├── deconvolution_riscv.h
│   │   │   ├── deconvolution_riscv_zfh.cpp
│   │   │   ├── deconvolutiondepthwise_riscv.cpp
│   │   │   ├── deconvolutiondepthwise_riscv.h
│   │   │   ├── deconvolutiondepthwise_riscv_zfh.cpp
│   │   │   ├── deformableconv2d_pack1ton.h
│   │   │   ├── deformableconv2d_packn.h
│   │   │   ├── deformableconv2d_packnto1.h
│   │   │   ├── deformableconv2d_riscv.cpp
│   │   │   ├── deformableconv2d_riscv.h
│   │   │   ├── dropout_riscv.cpp
│   │   │   ├── dropout_riscv.h
│   │   │   ├── eltwise_riscv.cpp
│   │   │   ├── eltwise_riscv.h
│   │   │   ├── eltwise_riscv_zfh.cpp
│   │   │   ├── flatten_riscv.cpp
│   │   │   ├── flatten_riscv.h
│   │   │   ├── gelu_riscv.cpp
│   │   │   ├── gelu_riscv.h
│   │   │   ├── gemm_bf16s_fp16s.h
│   │   │   ├── gemm_fp16s.h
│   │   │   ├── gemm_riscv.cpp
│   │   │   ├── gemm_riscv.h
│   │   │   ├── gemm_riscv_zfh.cpp
│   │   │   ├── gru_riscv.cpp
│   │   │   ├── gru_riscv.h
│   │   │   ├── gru_riscv_zfh.cpp
│   │   │   ├── hardsigmoid_riscv.cpp
│   │   │   ├── hardsigmoid_riscv.h
│   │   │   ├── hardsigmoid_riscv_zfh.cpp
│   │   │   ├── hardswish_riscv.cpp
│   │   │   ├── hardswish_riscv.h
│   │   │   ├── hardswish_riscv_zfh.cpp
│   │   │   ├── innerproduct_riscv.cpp
│   │   │   ├── innerproduct_riscv.h
│   │   │   ├── innerproduct_riscv_zfh.cpp
│   │   │   ├── instancenorm_riscv.cpp
│   │   │   ├── instancenorm_riscv.h
│   │   │   ├── instancenorm_riscv_zfh.cpp
│   │   │   ├── interp_bicubic.h
│   │   │   ├── interp_bicubic_fp16s.h
│   │   │   ├── interp_bicubic_packn.h
│   │   │   ├── interp_bicubic_packn_fp16s.h
│   │   │   ├── interp_bilinear.h
│   │   │   ├── interp_bilinear_fp16s.h
│   │   │   ├── interp_bilinear_packn.h
│   │   │   ├── interp_bilinear_packn_fp16s.h
│   │   │   ├── interp_riscv.cpp
│   │   │   ├── interp_riscv.h
│   │   │   ├── interp_riscv_zfh.cpp
│   │   │   ├── layernorm_riscv.cpp
│   │   │   ├── layernorm_riscv.h
│   │   │   ├── layernorm_riscv_zfh.cpp
│   │   │   ├── mish_riscv.cpp
│   │   │   ├── mish_riscv.h
│   │   │   ├── mish_riscv_zfh.cpp
│   │   │   ├── packing_riscv.cpp
│   │   │   ├── packing_riscv.h
│   │   │   ├── padding_packn.h
│   │   │   ├── padding_riscv.cpp
│   │   │   ├── padding_riscv.h
│   │   │   ├── pooling_riscv.cpp
│   │   │   ├── pooling_riscv.h
│   │   │   ├── pooling_riscv_zfh.cpp
│   │   │   ├── prelu_riscv.cpp
│   │   │   ├── prelu_riscv.h
│   │   │   ├── prelu_riscv_zfh.cpp
│   │   │   ├── relu_riscv.cpp
│   │   │   ├── relu_riscv.h
│   │   │   ├── relu_riscv_zfh.cpp
│   │   │   ├── riscv_activation.h
│   │   │   ├── riscv_usability.h
│   │   │   ├── rvv_mathfun.h
│   │   │   ├── rvv_mathfun_fp16s.h
│   │   │   ├── selu_riscv.cpp
│   │   │   ├── selu_riscv.h
│   │   │   ├── shufflechannel_riscv.cpp
│   │   │   ├── shufflechannel_riscv.h
│   │   │   ├── sigmoid_riscv.cpp
│   │   │   ├── sigmoid_riscv.h
│   │   │   ├── sigmoid_riscv_zfh.cpp
│   │   │   ├── softmax_riscv.cpp
│   │   │   ├── softmax_riscv.h
│   │   │   ├── swish_riscv.cpp
│   │   │   ├── swish_riscv.h
│   │   │   ├── swish_riscv_zfh.cpp
│   │   │   ├── tanh_riscv.cpp
│   │   │   ├── tanh_riscv.h
│   │   │   ├── tanh_riscv_zfh.cpp
│   │   │   ├── unaryop_riscv.cpp
│   │   │   ├── unaryop_riscv.h
│   │   │   └── unaryop_riscv_zfh.cpp
│   │   ├── rmsnorm.cpp
│   │   ├── rmsnorm.h
│   │   ├── rnn.cpp
│   │   ├── rnn.h
│   │   ├── roialign.cpp
│   │   ├── roialign.h
│   │   ├── roipooling.cpp
│   │   ├── roipooling.h
│   │   ├── rotaryembed.cpp
│   │   ├── rotaryembed.h
│   │   ├── scale.cpp
│   │   ├── scale.h
│   │   ├── sdpa.cpp
│   │   ├── sdpa.h
│   │   ├── selu.cpp
│   │   ├── selu.h
│   │   ├── shrink.cpp
│   │   ├── shrink.h
│   │   ├── shufflechannel.cpp
│   │   ├── shufflechannel.h
│   │   ├── sigmoid.cpp
│   │   ├── sigmoid.h
│   │   ├── slice.cpp
│   │   ├── slice.h
│   │   ├── softmax.cpp
│   │   ├── softmax.h
│   │   ├── softplus.cpp
│   │   ├── softplus.h
│   │   ├── spectrogram.cpp
│   │   ├── spectrogram.h
│   │   ├── split.cpp
│   │   ├── split.h
│   │   ├── spp.cpp
│   │   ├── spp.h
│   │   ├── squeeze.cpp
│   │   ├── squeeze.h
│   │   ├── statisticspooling.cpp
│   │   ├── statisticspooling.h
│   │   ├── swish.cpp
│   │   ├── swish.h
│   │   ├── tanh.cpp
│   │   ├── tanh.h
│   │   ├── threshold.cpp
│   │   ├── threshold.h
│   │   ├── tile.cpp
│   │   ├── tile.h
│   │   ├── unaryop.cpp
│   │   ├── unaryop.h
│   │   ├── unfold.cpp
│   │   ├── unfold.h
│   │   ├── vulkan/
│   │   │   ├── absval_vulkan.cpp
│   │   │   ├── absval_vulkan.h
│   │   │   ├── batchnorm_vulkan.cpp
│   │   │   ├── batchnorm_vulkan.h
│   │   │   ├── binaryop_vulkan.cpp
│   │   │   ├── binaryop_vulkan.h
│   │   │   ├── cast_vulkan.cpp
│   │   │   ├── cast_vulkan.h
│   │   │   ├── celu_vulkan.cpp
│   │   │   ├── celu_vulkan.h
│   │   │   ├── clip_vulkan.cpp
│   │   │   ├── clip_vulkan.h
│   │   │   ├── concat_vulkan.cpp
│   │   │   ├── concat_vulkan.h
│   │   │   ├── convolution1d_vulkan.cpp
│   │   │   ├── convolution1d_vulkan.h
│   │   │   ├── convolution_vulkan.cpp
│   │   │   ├── convolution_vulkan.h
│   │   │   ├── convolutiondepthwise_vulkan.cpp
│   │   │   ├── convolutiondepthwise_vulkan.h
│   │   │   ├── crop_vulkan.cpp
│   │   │   ├── crop_vulkan.h
│   │   │   ├── deconvolution_vulkan.cpp
│   │   │   ├── deconvolution_vulkan.h
│   │   │   ├── deconvolutiondepthwise_vulkan.cpp
│   │   │   ├── deconvolutiondepthwise_vulkan.h
│   │   │   ├── deepcopy_vulkan.cpp
│   │   │   ├── deepcopy_vulkan.h
│   │   │   ├── dequantize_vulkan.cpp
│   │   │   ├── dequantize_vulkan.h
│   │   │   ├── dropout_vulkan.cpp
│   │   │   ├── dropout_vulkan.h
│   │   │   ├── eltwise_vulkan.cpp
│   │   │   ├── eltwise_vulkan.h
│   │   │   ├── elu_vulkan.cpp
│   │   │   ├── elu_vulkan.h
│   │   │   ├── erf_vulkan.cpp
│   │   │   ├── erf_vulkan.h
│   │   │   ├── flatten_vulkan.cpp
│   │   │   ├── flatten_vulkan.h
│   │   │   ├── gelu_vulkan.cpp
│   │   │   ├── gelu_vulkan.h
│   │   │   ├── gemm_vulkan.cpp
│   │   │   ├── gemm_vulkan.h
│   │   │   ├── groupnorm_vulkan.cpp
│   │   │   ├── groupnorm_vulkan.h
│   │   │   ├── hardsigmoid_vulkan.cpp
│   │   │   ├── hardsigmoid_vulkan.h
│   │   │   ├── hardswish_vulkan.cpp
│   │   │   ├── hardswish_vulkan.h
│   │   │   ├── innerproduct_vulkan.cpp
│   │   │   ├── innerproduct_vulkan.h
│   │   │   ├── instancenorm_vulkan.cpp
│   │   │   ├── instancenorm_vulkan.h
│   │   │   ├── interp_vulkan.cpp
│   │   │   ├── interp_vulkan.h
│   │   │   ├── layernorm_vulkan.cpp
│   │   │   ├── layernorm_vulkan.h
│   │   │   ├── lrn_vulkan.cpp
│   │   │   ├── lrn_vulkan.h
│   │   │   ├── memorydata_vulkan.cpp
│   │   │   ├── memorydata_vulkan.h
│   │   │   ├── mish_vulkan.cpp
│   │   │   ├── mish_vulkan.h
│   │   │   ├── multiheadattention_vulkan.cpp
│   │   │   ├── multiheadattention_vulkan.h
│   │   │   ├── noop_vulkan.cpp
│   │   │   ├── noop_vulkan.h
│   │   │   ├── normalize_vulkan.cpp
│   │   │   ├── normalize_vulkan.h
│   │   │   ├── packing_vulkan.cpp
│   │   │   ├── packing_vulkan.h
│   │   │   ├── padding_vulkan.cpp
│   │   │   ├── padding_vulkan.h
│   │   │   ├── permute_vulkan.cpp
│   │   │   ├── permute_vulkan.h
│   │   │   ├── pixelshuffle_vulkan.cpp
│   │   │   ├── pixelshuffle_vulkan.h
│   │   │   ├── pooling_vulkan.cpp
│   │   │   ├── pooling_vulkan.h
│   │   │   ├── prelu_vulkan.cpp
│   │   │   ├── prelu_vulkan.h
│   │   │   ├── priorbox_vulkan.cpp
│   │   │   ├── priorbox_vulkan.h
│   │   │   ├── quantize_vulkan.cpp
│   │   │   ├── quantize_vulkan.h
│   │   │   ├── reduction_vulkan.cpp
│   │   │   ├── reduction_vulkan.h
│   │   │   ├── relu_vulkan.cpp
│   │   │   ├── relu_vulkan.h
│   │   │   ├── reorg_vulkan.cpp
│   │   │   ├── reorg_vulkan.h
│   │   │   ├── requantize_vulkan.cpp
│   │   │   ├── requantize_vulkan.h
│   │   │   ├── reshape_vulkan.cpp
│   │   │   ├── reshape_vulkan.h
│   │   │   ├── rmsnorm_vulkan.cpp
│   │   │   ├── rmsnorm_vulkan.h
│   │   │   ├── rotaryembed_vulkan.cpp
│   │   │   ├── rotaryembed_vulkan.h
│   │   │   ├── scale_vulkan.cpp
│   │   │   ├── scale_vulkan.h
│   │   │   ├── sdpa_vulkan.cpp
│   │   │   ├── sdpa_vulkan.h
│   │   │   ├── selu_vulkan.cpp
│   │   │   ├── selu_vulkan.h
│   │   │   ├── shader/
│   │   │   │   ├── .clang-format
│   │   │   │   ├── absval.comp
│   │   │   │   ├── batchnorm.comp
│   │   │   │   ├── batchnorm_pack4.comp
│   │   │   │   ├── binaryop.comp
│   │   │   │   ├── binaryop_broadcast.comp
│   │   │   │   ├── binaryop_broadcast_pack1to4.comp
│   │   │   │   ├── binaryop_broadcast_pack4.comp
│   │   │   │   ├── binaryop_pack4.comp
│   │   │   │   ├── cast_fp16_to_fp32.comp
│   │   │   │   ├── cast_fp16_to_fp32_pack4.comp
│   │   │   │   ├── cast_fp32_to_fp16.comp
│   │   │   │   ├── cast_fp32_to_fp16_pack4.comp
│   │   │   │   ├── celu.comp
│   │   │   │   ├── clip.comp
│   │   │   │   ├── concat.comp
│   │   │   │   ├── concat_pack4.comp
│   │   │   │   ├── concat_pack4to1.comp
│   │   │   │   ├── convolution1d_packed.comp
│   │   │   │   ├── convolution_1x1s1d1_cm.comp
│   │   │   │   ├── convolution_3x3s1d1_winograd23_transform_input.comp
│   │   │   │   ├── convolution_3x3s1d1_winograd23_transform_output.comp
│   │   │   │   ├── convolution_3x3s1d1_winograd43_transform_input.comp
│   │   │   │   ├── convolution_3x3s1d1_winograd43_transform_output.comp
│   │   │   │   ├── convolution_3x3s1d1_winograd_gemm.comp
│   │   │   │   ├── convolution_gemm_cm.comp
│   │   │   │   ├── convolution_pack1to4_3x3s1d1_winograd_gemm.comp
│   │   │   │   ├── convolution_pack4_3x3s1d1_winograd23_transform_input.comp
│   │   │   │   ├── convolution_pack4_3x3s1d1_winograd23_transform_output.comp
│   │   │   │   ├── convolution_pack4_3x3s1d1_winograd43_transform_input.comp
│   │   │   │   ├── convolution_pack4_3x3s1d1_winograd43_transform_output.comp
│   │   │   │   ├── convolution_pack4_3x3s1d1_winograd_gemm.comp
│   │   │   │   ├── convolution_pack4to1_3x3s1d1_winograd_gemm.comp
│   │   │   │   ├── convolution_packed.comp
│   │   │   │   ├── convolution_packed_1x1s1d1.comp
│   │   │   │   ├── convolution_packed_gemm.comp
│   │   │   │   ├── convolution_winograd_gemm_cm.comp
│   │   │   │   ├── convolutiondepthwise.comp
│   │   │   │   ├── convolutiondepthwise_group.comp
│   │   │   │   ├── convolutiondepthwise_group_pack1to4.comp
│   │   │   │   ├── convolutiondepthwise_group_pack4.comp
│   │   │   │   ├── convolutiondepthwise_group_pack4to1.comp
│   │   │   │   ├── convolutiondepthwise_pack4.comp
│   │   │   │   ├── crop.comp
│   │   │   │   ├── crop_pack1to4.comp
│   │   │   │   ├── crop_pack4.comp
│   │   │   │   ├── crop_pack4to1.comp
│   │   │   │   ├── deconvolution_col2im.comp
│   │   │   │   ├── deconvolution_gemm_cm.comp
│   │   │   │   ├── deconvolution_gemm_packed.comp
│   │   │   │   ├── deconvolution_pack4_col2im.comp
│   │   │   │   ├── deconvolution_packed.comp
│   │   │   │   ├── deconvolutiondepthwise.comp
│   │   │   │   ├── deconvolutiondepthwise_group.comp
│   │   │   │   ├── deconvolutiondepthwise_group_pack1to4.comp
│   │   │   │   ├── deconvolutiondepthwise_group_pack4.comp
│   │   │   │   ├── deconvolutiondepthwise_group_pack4to1.comp
│   │   │   │   ├── deconvolutiondepthwise_pack4.comp
│   │   │   │   ├── deepcopy.comp
│   │   │   │   ├── deepcopy_pack4.comp
│   │   │   │   ├── dequantize.comp
│   │   │   │   ├── dequantize_pack4.comp
│   │   │   │   ├── dropout.comp
│   │   │   │   ├── eltwise.comp
│   │   │   │   ├── elu.comp
│   │   │   │   ├── erf.comp
│   │   │   │   ├── flatten.comp
│   │   │   │   ├── flatten_pack1to4.comp
│   │   │   │   ├── flatten_pack4.comp
│   │   │   │   ├── gelu.comp
│   │   │   │   ├── gemm.comp
│   │   │   │   ├── gemm_cm.comp
│   │   │   │   ├── gemm_sg.comp
│   │   │   │   ├── groupnorm_coeffs.comp
│   │   │   │   ├── groupnorm_coeffs_pack4.comp
│   │   │   │   ├── groupnorm_norm.comp
│   │   │   │   ├── groupnorm_norm_pack4.comp
│   │   │   │   ├── groupnorm_reduce_mean.comp
│   │   │   │   ├── groupnorm_reduce_mean_pack4.comp
│   │   │   │   ├── groupnorm_reduce_sum4_fp16_to_fp32.comp
│   │   │   │   ├── groupnorm_reduce_sum4_fp16_to_fp32_pack4.comp
│   │   │   │   ├── groupnorm_reduce_sum4_fp32.comp
│   │   │   │   ├── groupnorm_reduce_sum4_fp32_pack4.comp
│   │   │   │   ├── groupnorm_sub_mean_square.comp
│   │   │   │   ├── groupnorm_sub_mean_square_pack4.comp
│   │   │   │   ├── hardsigmoid.comp
│   │   │   │   ├── hardswish.comp
│   │   │   │   ├── innerproduct.comp
│   │   │   │   ├── innerproduct_gemm.comp
│   │   │   │   ├── innerproduct_gemm_wp1to4.comp
│   │   │   │   ├── innerproduct_gemm_wp4.comp
│   │   │   │   ├── innerproduct_gemm_wp4to1.comp
│   │   │   │   ├── innerproduct_pack1to4.comp
│   │   │   │   ├── innerproduct_pack4.comp
│   │   │   │   ├── innerproduct_pack4to1.comp
│   │   │   │   ├── innerproduct_reduce_sum8.comp
│   │   │   │   ├── innerproduct_reduce_sum8_pack4.comp
│   │   │   │   ├── innerproduct_sum8.comp
│   │   │   │   ├── innerproduct_sum8_pack1to4.comp
│   │   │   │   ├── innerproduct_sum8_pack4.comp
│   │   │   │   ├── innerproduct_sum8_pack4to1.comp
│   │   │   │   ├── instancenorm_coeffs.comp
│   │   │   │   ├── instancenorm_coeffs_pack4.comp
│   │   │   │   ├── instancenorm_norm.comp
│   │   │   │   ├── instancenorm_norm_pack4.comp
│   │   │   │   ├── instancenorm_reduce_mean.comp
│   │   │   │   ├── instancenorm_reduce_mean_pack4.comp
│   │   │   │   ├── instancenorm_reduce_sum4_fp16_to_fp32.comp
│   │   │   │   ├── instancenorm_reduce_sum4_fp16_to_fp32_pack4.comp
│   │   │   │   ├── instancenorm_reduce_sum4_fp32.comp
│   │   │   │   ├── instancenorm_reduce_sum4_fp32_pack4.comp
│   │   │   │   ├── instancenorm_sub_mean_square.comp
│   │   │   │   ├── instancenorm_sub_mean_square_pack4.comp
│   │   │   │   ├── interp.comp
│   │   │   │   ├── interp_bicubic.comp
│   │   │   │   ├── interp_bicubic_coeffs.comp
│   │   │   │   ├── interp_bicubic_pack4.comp
│   │   │   │   ├── interp_pack4.comp
│   │   │   │   ├── layernorm_coeffs.comp
│   │   │   │   ├── layernorm_coeffs_pack4.comp
│   │   │   │   ├── layernorm_norm.comp
│   │   │   │   ├── layernorm_norm_pack4.comp
│   │   │   │   ├── layernorm_reduce_mean.comp
│   │   │   │   ├── layernorm_reduce_mean_pack4.comp
│   │   │   │   ├── layernorm_reduce_sum4_fp16_to_fp32.comp
│   │   │   │   ├── layernorm_reduce_sum4_fp16_to_fp32_pack4.comp
│   │   │   │   ├── layernorm_reduce_sum4_fp32.comp
│   │   │   │   ├── layernorm_reduce_sum4_fp32_pack4.comp
│   │   │   │   ├── layernorm_sub_mean_square.comp
│   │   │   │   ├── layernorm_sub_mean_square_pack4.comp
│   │   │   │   ├── lrn_norm.comp
│   │   │   │   ├── lrn_norm_across_channel_pack4.comp
│   │   │   │   ├── lrn_norm_within_channel_pack4.comp
│   │   │   │   ├── lrn_square_pad.comp
│   │   │   │   ├── lrn_square_pad_across_channel_pack4.comp
│   │   │   │   ├── lrn_square_pad_within_channel_pack4.comp
│   │   │   │   ├── mish.comp
│   │   │   │   ├── multiheadattention_qk_cross.comp
│   │   │   │   ├── multiheadattention_qk_cross_pack1to4.comp
│   │   │   │   ├── multiheadattention_qk_cross_pack4.comp
│   │   │   │   ├── multiheadattention_qk_cross_pack4to1.comp
│   │   │   │   ├── multiheadattention_qkv_cross.comp
│   │   │   │   ├── multiheadattention_qkv_cross_pack1to4.comp
│   │   │   │   ├── multiheadattention_qkv_cross_pack4.comp
│   │   │   │   ├── multiheadattention_qkv_cross_pack4to1.comp
│   │   │   │   ├── normalize_coeffs.comp
│   │   │   │   ├── normalize_coeffs_pack4.comp
│   │   │   │   ├── normalize_norm.comp
│   │   │   │   ├── normalize_norm_pack4.comp
│   │   │   │   ├── normalize_reduce_sum4_fp16_to_fp32.comp
│   │   │   │   ├── normalize_reduce_sum4_fp16_to_fp32_pack4.comp
│   │   │   │   ├── normalize_reduce_sum4_fp32.comp
│   │   │   │   ├── normalize_reduce_sum4_fp32_pack4.comp
│   │   │   │   ├── packing.comp
│   │   │   │   ├── packing_int8.comp
│   │   │   │   ├── packing_pack1to4.comp
│   │   │   │   ├── packing_pack1to4_int8.comp
│   │   │   │   ├── packing_pack4to1.comp
│   │   │   │   ├── packing_pack4to1_int8.comp
│   │   │   │   ├── padding.comp
│   │   │   │   ├── padding_3d.comp
│   │   │   │   ├── padding_3d_pack4.comp
│   │   │   │   ├── padding_pack1to4.comp
│   │   │   │   ├── padding_pack4.comp
│   │   │   │   ├── padding_pack4to1.comp
│   │   │   │   ├── permute.comp
│   │   │   │   ├── permute_pack1to4.comp
│   │   │   │   ├── permute_pack4.comp
│   │   │   │   ├── permute_pack4to1.comp
│   │   │   │   ├── pixelshuffle.comp
│   │   │   │   ├── pixelshuffle_pack4.comp
│   │   │   │   ├── pixelshuffle_pack4to1.comp
│   │   │   │   ├── pooling.comp
│   │   │   │   ├── pooling_adaptive.comp
│   │   │   │   ├── pooling_adaptive_pack4.comp
│   │   │   │   ├── pooling_global_reduce_max.comp
│   │   │   │   ├── pooling_global_reduce_max_first.comp
│   │   │   │   ├── pooling_global_reduce_max_first_pack4.comp
│   │   │   │   ├── pooling_global_reduce_max_last.comp
│   │   │   │   ├── pooling_global_reduce_max_last_pack4.comp
│   │   │   │   ├── pooling_global_reduce_max_pack4.comp
│   │   │   │   ├── pooling_global_reduce_sum.comp
│   │   │   │   ├── pooling_global_reduce_sum_first.comp
│   │   │   │   ├── pooling_global_reduce_sum_first_pack4.comp
│   │   │   │   ├── pooling_global_reduce_sum_last.comp
│   │   │   │   ├── pooling_global_reduce_sum_last_pack4.comp
│   │   │   │   ├── pooling_global_reduce_sum_pack4.comp
│   │   │   │   ├── pooling_pack4.comp
│   │   │   │   ├── prelu.comp
│   │   │   │   ├── prelu_pack4.comp
│   │   │   │   ├── priorbox.comp
│   │   │   │   ├── priorbox_mxnet.comp
│   │   │   │   ├── quantize.comp
│   │   │   │   ├── quantize_pack4.comp
│   │   │   │   ├── reduction.comp
│   │   │   │   ├── relu.comp
│   │   │   │   ├── reorg.comp
│   │   │   │   ├── reorg_pack1to4.comp
│   │   │   │   ├── reorg_pack4.comp
│   │   │   │   ├── requantize.comp
│   │   │   │   ├── requantize_pack4.comp
│   │   │   │   ├── reshape.comp
│   │   │   │   ├── reshape_pack1to4.comp
│   │   │   │   ├── reshape_pack4.comp
│   │   │   │   ├── reshape_pack4to1.comp
│   │   │   │   ├── rmsnorm_coeffs.comp
│   │   │   │   ├── rmsnorm_coeffs_pack4.comp
│   │   │   │   ├── rmsnorm_norm.comp
│   │   │   │   ├── rmsnorm_norm_pack4.comp
│   │   │   │   ├── rmsnorm_square.comp
│   │   │   │   ├── rmsnorm_square_pack4.comp
│   │   │   │   ├── rotaryembed.comp
│   │   │   │   ├── rotaryembed_pack4.comp
│   │   │   │   ├── scale.comp
│   │   │   │   ├── scale_pack4.comp
│   │   │   │   ├── sdpa_cross.comp
│   │   │   │   ├── sdpa_cross_cm.comp
│   │   │   │   ├── sdpa_fa.comp
│   │   │   │   ├── sdpa_fa_cm.comp
│   │   │   │   ├── selu.comp
│   │   │   │   ├── shrink.comp
│   │   │   │   ├── shufflechannel.comp
│   │   │   │   ├── shufflechannel_pack4.comp
│   │   │   │   ├── sigmoid.comp
│   │   │   │   ├── slice.comp
│   │   │   │   ├── slice_pack1to4.comp
│   │   │   │   ├── slice_pack4.comp
│   │   │   │   ├── softmax_div_sum.comp
│   │   │   │   ├── softmax_div_sum_pack4.comp
│   │   │   │   ├── softmax_exp_sub_max.comp
│   │   │   │   ├── softmax_exp_sub_max_pack4.comp
│   │   │   │   ├── softmax_reduce_max.comp
│   │   │   │   ├── softmax_reduce_max_pack4.comp
│   │   │   │   ├── softmax_reduce_sum.comp
│   │   │   │   ├── softmax_reduce_sum_pack4.comp
│   │   │   │   ├── softplus.comp
│   │   │   │   ├── swish.comp
│   │   │   │   ├── tanh.comp
│   │   │   │   ├── unaryop.comp
│   │   │   │   ├── unfold_im2col.comp
│   │   │   │   ├── unfold_im2col_pack1to4.comp
│   │   │   │   ├── unfold_im2col_pack4.comp
│   │   │   │   ├── unfold_im2col_pack4to1.comp
│   │   │   │   └── vulkan_activation.comp
│   │   │   ├── shrink_vulkan.cpp
│   │   │   ├── shrink_vulkan.h
│   │   │   ├── shufflechannel_vulkan.cpp
│   │   │   ├── shufflechannel_vulkan.h
│   │   │   ├── sigmoid_vulkan.cpp
│   │   │   ├── sigmoid_vulkan.h
│   │   │   ├── slice_vulkan.cpp
│   │   │   ├── slice_vulkan.h
│   │   │   ├── softmax_vulkan.cpp
│   │   │   ├── softmax_vulkan.h
│   │   │   ├── softplus_vulkan.cpp
│   │   │   ├── softplus_vulkan.h
│   │   │   ├── split_vulkan.cpp
│   │   │   ├── split_vulkan.h
│   │   │   ├── swish_vulkan.cpp
│   │   │   ├── swish_vulkan.h
│   │   │   ├── tanh_vulkan.cpp
│   │   │   ├── tanh_vulkan.h
│   │   │   ├── unaryop_vulkan.cpp
│   │   │   ├── unaryop_vulkan.h
│   │   │   ├── unfold_vulkan.cpp
│   │   │   └── unfold_vulkan.h
│   │   ├── x86/
│   │   │   ├── absval_x86.cpp
│   │   │   ├── absval_x86.h
│   │   │   ├── avx512_mathfun.h
│   │   │   ├── avx_mathfun.h
│   │   │   ├── batchnorm_bf16s.h
│   │   │   ├── batchnorm_x86.cpp
│   │   │   ├── batchnorm_x86.h
│   │   │   ├── batchnorm_x86_avx512bf16.cpp
│   │   │   ├── bias_x86.cpp
│   │   │   ├── bias_x86.h
│   │   │   ├── binaryop_bf16s.h
│   │   │   ├── binaryop_functor.h
│   │   │   ├── binaryop_x86.cpp
│   │   │   ├── binaryop_x86.h
│   │   │   ├── binaryop_x86_avx512bf16.cpp
│   │   │   ├── bnll_x86.cpp
│   │   │   ├── bnll_x86.h
│   │   │   ├── cast_bf16.h
│   │   │   ├── cast_fp16.h
│   │   │   ├── cast_x86.cpp
│   │   │   ├── cast_x86.h
│   │   │   ├── cast_x86_avx2.cpp
│   │   │   ├── cast_x86_avx512bf16.cpp
│   │   │   ├── cast_x86_f16c.cpp
│   │   │   ├── clip_bf16s.h
│   │   │   ├── clip_x86.cpp
│   │   │   ├── clip_x86.h
│   │   │   ├── clip_x86_avx512bf16.cpp
│   │   │   ├── concat_x86.cpp
│   │   │   ├── concat_x86.h
│   │   │   ├── convolution1d_packed.h
│   │   │   ├── convolution1d_x86.cpp
│   │   │   ├── convolution1d_x86.h
│   │   │   ├── convolution_1x1.h
│   │   │   ├── convolution_2x2_pack8.h
│   │   │   ├── convolution_3x3.h
│   │   │   ├── convolution_3x3_int8.h
│   │   │   ├── convolution_3x3_pack16to1.h
│   │   │   ├── convolution_3x3_pack1to4.h
│   │   │   ├── convolution_3x3_pack1to8.h
│   │   │   ├── convolution_3x3_pack8.h
│   │   │   ├── convolution_3x3_pack8to1.h
│   │   │   ├── convolution_3x3_winograd.h
│   │   │   ├── convolution_3x3_winograd_int8.h
│   │   │   ├── convolution_5x5.h
│   │   │   ├── convolution_im2col_gemm.h
│   │   │   ├── convolution_im2col_gemm_int8.h
│   │   │   ├── convolution_packed.h
│   │   │   ├── convolution_packed_int8.h
│   │   │   ├── convolution_x86.cpp
│   │   │   ├── convolution_x86.h
│   │   │   ├── convolution_x86_avx2.cpp
│   │   │   ├── convolution_x86_avx512vnni.cpp
│   │   │   ├── convolution_x86_avxvnni.cpp
│   │   │   ├── convolution_x86_avxvnniint8.cpp
│   │   │   ├── convolution_x86_xop.cpp
│   │   │   ├── convolutiondepthwise_3x3.h
│   │   │   ├── convolutiondepthwise_3x3_int8.h
│   │   │   ├── convolutiondepthwise_3x3_pack16.h
│   │   │   ├── convolutiondepthwise_3x3_pack4.h
│   │   │   ├── convolutiondepthwise_3x3_pack8.h
│   │   │   ├── convolutiondepthwise_5x5_pack16.h
│   │   │   ├── convolutiondepthwise_5x5_pack4.h
│   │   │   ├── convolutiondepthwise_5x5_pack8.h
│   │   │   ├── convolutiondepthwise_x86.cpp
│   │   │   ├── convolutiondepthwise_x86.h
│   │   │   ├── crop_x86.cpp
│   │   │   ├── crop_x86.h
│   │   │   ├── deconvolution_packed.h
│   │   │   ├── deconvolution_x86.cpp
│   │   │   ├── deconvolution_x86.h
│   │   │   ├── deconvolutiondepthwise_x86.cpp
│   │   │   ├── deconvolutiondepthwise_x86.h
│   │   │   ├── deformableconv2d_packed.h
│   │   │   ├── deformableconv2d_x86.cpp
│   │   │   ├── deformableconv2d_x86.h
│   │   │   ├── dequantize_x86.cpp
│   │   │   ├── dequantize_x86.h
│   │   │   ├── dropout_x86.cpp
│   │   │   ├── dropout_x86.h
│   │   │   ├── eltwise_x86.cpp
│   │   │   ├── eltwise_x86.h
│   │   │   ├── elu_x86.cpp
│   │   │   ├── elu_x86.h
│   │   │   ├── erf_x86.cpp
│   │   │   ├── erf_x86.h
│   │   │   ├── flatten_x86.cpp
│   │   │   ├── flatten_x86.h
│   │   │   ├── gelu_x86.cpp
│   │   │   ├── gelu_x86.h
│   │   │   ├── gemm_bf16s.h
│   │   │   ├── gemm_int8.h
│   │   │   ├── gemm_x86.cpp
│   │   │   ├── gemm_x86.h
│   │   │   ├── gemm_x86_avx2.cpp
│   │   │   ├── gemm_x86_avx512vnni.cpp
│   │   │   ├── gemm_x86_avxvnni.cpp
│   │   │   ├── gemm_x86_avxvnniint8.cpp
│   │   │   ├── gemm_x86_xop.cpp
│   │   │   ├── gridsample_bicubic_apply_interpolation.h
│   │   │   ├── gridsample_bicubic_compute_blob.h
│   │   │   ├── gridsample_bilinear_apply_interpolation.h
│   │   │   ├── gridsample_bilinear_compute_blob.h
│   │   │   ├── gridsample_compute_blob.h
│   │   │   ├── gridsample_nearest_apply_interpolation.h
│   │   │   ├── gridsample_nearest_compute_blob.h
│   │   │   ├── gridsample_x86.cpp
│   │   │   ├── gridsample_x86.h
│   │   │   ├── groupnorm_bf16s.h
│   │   │   ├── groupnorm_x86.cpp
│   │   │   ├── groupnorm_x86.h
│   │   │   ├── groupnorm_x86_avx512bf16.cpp
│   │   │   ├── hardsigmoid_x86.cpp
│   │   │   ├── hardsigmoid_x86.h
│   │   │   ├── hardswish_x86.cpp
│   │   │   ├── hardswish_x86.h
│   │   │   ├── innerproduct_fp.h
│   │   │   ├── innerproduct_gemm_fp.h
│   │   │   ├── innerproduct_x86.cpp
│   │   │   ├── innerproduct_x86.h
│   │   │   ├── innerproduct_x86_f16c.cpp
│   │   │   ├── instancenorm_bf16s.h
│   │   │   ├── instancenorm_x86.cpp
│   │   │   ├── instancenorm_x86.h
│   │   │   ├── instancenorm_x86_avx512bf16.cpp
│   │   │   ├── interp_bicubic.h
│   │   │   ├── interp_bicubic_pack16.h
│   │   │   ├── interp_bicubic_pack4.h
│   │   │   ├── interp_bicubic_pack8.h
│   │   │   ├── interp_bilinear.h
│   │   │   ├── interp_bilinear_pack16.h
│   │   │   ├── interp_bilinear_pack4.h
│   │   │   ├── interp_bilinear_pack8.h
│   │   │   ├── interp_x86.cpp
│   │   │   ├── interp_x86.h
│   │   │   ├── interp_x86_avx2.cpp
│   │   │   ├── layernorm_bf16s.h
│   │   │   ├── layernorm_x86.cpp
│   │   │   ├── layernorm_x86.h
│   │   │   ├── layernorm_x86_avx512bf16.cpp
│   │   │   ├── lrn_x86.cpp
│   │   │   ├── lrn_x86.h
│   │   │   ├── lstm_int8.h
│   │   │   ├── lstm_x86.cpp
│   │   │   ├── lstm_x86.h
│   │   │   ├── lstm_x86_avx2.cpp
│   │   │   ├── lstm_x86_avx512vnni.cpp
│   │   │   ├── lstm_x86_avxvnni.cpp
│   │   │   ├── lstm_x86_xop.cpp
│   │   │   ├── matmul_x86.cpp
│   │   │   ├── matmul_x86.h
│   │   │   ├── mish_x86.cpp
│   │   │   ├── mish_x86.h
│   │   │   ├── multiheadattention_x86.cpp
│   │   │   ├── multiheadattention_x86.h
│   │   │   ├── packing_x86.cpp
│   │   │   ├── packing_x86.h
│   │   │   ├── padding_pack16.h
│   │   │   ├── padding_pack16_bf16s_fp16s.h
│   │   │   ├── padding_pack4.h
│   │   │   ├── padding_pack4_bf16s_fp16s.h
│   │   │   ├── padding_pack8.h
│   │   │   ├── padding_pack8_bf16s_fp16s.h
│   │   │   ├── padding_pack8_int8.h
│   │   │   ├── padding_x86.cpp
│   │   │   ├── padding_x86.h
│   │   │   ├── pooling_2x2.h
│   │   │   ├── pooling_2x2_pack16.h
│   │   │   ├── pooling_2x2_pack4.h
│   │   │   ├── pooling_2x2_pack8.h
│   │   │   ├── pooling_3x3_pack16.h
│   │   │   ├── pooling_3x3_pack4.h
│   │   │   ├── pooling_3x3_pack8.h
│   │   │   ├── pooling_x86.cpp
│   │   │   ├── pooling_x86.h
│   │   │   ├── prelu_bf16s.h
│   │   │   ├── prelu_x86.cpp
│   │   │   ├── prelu_x86.h
│   │   │   ├── prelu_x86_avx512bf16.cpp
│   │   │   ├── quantize_x86.cpp
│   │   │   ├── quantize_x86.h
│   │   │   ├── relu_bf16s.h
│   │   │   ├── relu_x86.cpp
│   │   │   ├── relu_x86.h
│   │   │   ├── relu_x86_avx512bf16.cpp
│   │   │   ├── requantize_x86.cpp
│   │   │   ├── requantize_x86.h
│   │   │   ├── reshape_x86.cpp
│   │   │   ├── reshape_x86.h
│   │   │   ├── rmsnorm_bf16s.h
│   │   │   ├── rmsnorm_x86.cpp
│   │   │   ├── rmsnorm_x86.h
│   │   │   ├── rmsnorm_x86_avx512bf16.cpp
│   │   │   ├── roialign_x86.cpp
│   │   │   ├── roialign_x86.h
│   │   │   ├── rotaryembed_x86.cpp
│   │   │   ├── rotaryembed_x86.h
│   │   │   ├── scale_bf16s.h
│   │   │   ├── scale_x86.cpp
│   │   │   ├── scale_x86.h
│   │   │   ├── scale_x86_avx512bf16.cpp
│   │   │   ├── sdpa_x86.cpp
│   │   │   ├── sdpa_x86.h
│   │   │   ├── selu_x86.cpp
│   │   │   ├── selu_x86.h
│   │   │   ├── shufflechannel_x86.cpp
│   │   │   ├── shufflechannel_x86.h
│   │   │   ├── sigmoid_bf16s.h
│   │   │   ├── sigmoid_x86.cpp
│   │   │   ├── sigmoid_x86.h
│   │   │   ├── sigmoid_x86_avx512bf16.cpp
│   │   │   ├── slice_x86.cpp
│   │   │   ├── slice_x86.h
│   │   │   ├── softmax_bf16s.h
│   │   │   ├── softmax_x86.cpp
│   │   │   ├── softmax_x86.h
│   │   │   ├── softmax_x86_avx512bf16.cpp
│   │   │   ├── sse_mathfun.h
│   │   │   ├── swish_bf16s.h
│   │   │   ├── swish_x86.cpp
│   │   │   ├── swish_x86.h
│   │   │   ├── swish_x86_avx512bf16.cpp
│   │   │   ├── tanh_x86.cpp
│   │   │   ├── tanh_x86.h
│   │   │   ├── unaryop_bf16s.h
│   │   │   ├── unaryop_functor.h
│   │   │   ├── unaryop_x86.cpp
│   │   │   ├── unaryop_x86.h
│   │   │   ├── unaryop_x86_avx512bf16.cpp
│   │   │   ├── x86_activation.h
│   │   │   ├── x86_usability.h
│   │   │   ├── yolov3detectionoutput_x86.cpp
│   │   │   └── yolov3detectionoutput_x86.h
│   │   ├── yolodetectionoutput.cpp
│   │   ├── yolodetectionoutput.h
│   │   ├── yolov3detectionoutput.cpp
│   │   └── yolov3detectionoutput.h
│   ├── layer.cpp
│   ├── layer.h
│   ├── layer_declaration.h.in
│   ├── layer_registry.h.in
│   ├── layer_shader_registry.h.in
│   ├── layer_shader_spv_data.h.in
│   ├── layer_shader_type.h
│   ├── layer_shader_type_enum.h.in
│   ├── layer_type.h
│   ├── layer_type_enum.h.in
│   ├── mat.cpp
│   ├── mat.h
│   ├── mat_pixel.cpp
│   ├── mat_pixel_affine.cpp
│   ├── mat_pixel_android.cpp
│   ├── mat_pixel_drawing.cpp
│   ├── mat_pixel_drawing_font.h
│   ├── mat_pixel_resize.cpp
│   ├── mat_pixel_rotate.cpp
│   ├── modelbin.cpp
│   ├── modelbin.h
│   ├── ncnn.pc.in
│   ├── net.cpp
│   ├── net.h
│   ├── option.cpp
│   ├── option.h
│   ├── paramdict.cpp
│   ├── paramdict.h
│   ├── pipeline.cpp
│   ├── pipeline.h
│   ├── pipelinecache.cpp
│   ├── pipelinecache.h
│   ├── platform.h.in
│   ├── ruapu.h
│   ├── simplemath.cpp
│   ├── simplemath.h
│   ├── simpleocv.cpp
│   ├── simpleocv.h
│   ├── simpleomp.cpp
│   ├── simpleomp.h
│   ├── simplestl.cpp
│   ├── simplestl.h
│   ├── simplevk.cpp
│   ├── simplevk.h
│   ├── simplevk.tbd
│   ├── stb_image.h
│   ├── stb_image_write.h
│   └── vulkan_header_fix.h
├── tests/
│   ├── CMakeLists.txt
│   ├── perf/
│   │   ├── CMakeLists.txt
│   │   ├── perf_batchnorm.cpp
│   │   ├── perf_binaryop.cpp
│   │   ├── perf_concat.cpp
│   │   ├── perf_convolution.cpp
│   │   ├── perf_convolutiondepthwise.cpp
│   │   ├── perf_deconvolution.cpp
│   │   ├── perf_innerproduct.cpp
│   │   ├── perf_pooling.cpp
│   │   ├── perf_relu.cpp
│   │   ├── perf_sigmoid.cpp
│   │   ├── perf_softmax.cpp
│   │   ├── perfutil.cpp
│   │   └── perfutil.h
│   ├── prng.h
│   ├── test_absval.cpp
│   ├── test_batchnorm.cpp
│   ├── test_bias.cpp
│   ├── test_binaryop.cpp
│   ├── test_binaryop_1.cpp
│   ├── test_binaryop_2.cpp
│   ├── test_binaryop_3.cpp
│   ├── test_binaryop_4.cpp
│   ├── test_bnll.cpp
│   ├── test_c_api.cpp
│   ├── test_cast.cpp
│   ├── test_celu.cpp
│   ├── test_clip.cpp
│   ├── test_command.cpp
│   ├── test_concat.cpp
│   ├── test_concat_oom.cpp
│   ├── test_convolution.cpp
│   ├── test_convolution1d.cpp
│   ├── test_convolution3d.cpp
│   ├── test_convolution_1.cpp
│   ├── test_convolution_2.cpp
│   ├── test_convolution_3.cpp
│   ├── test_convolution_oom.cpp
│   ├── test_convolutiondepthwise.cpp
│   ├── test_convolutiondepthwise1d.cpp
│   ├── test_convolutiondepthwise3d.cpp
│   ├── test_convolutiondepthwise_1.cpp
│   ├── test_copyto.cpp
│   ├── test_copyto_1.cpp
│   ├── test_cpu.cpp
│   ├── test_crop.cpp
│   ├── test_crop_1.cpp
│   ├── test_crop_2.cpp
│   ├── test_crop_3.cpp
│   ├── test_crop_oom.cpp
│   ├── test_cumulativesum.cpp
│   ├── test_deconvolution.cpp
│   ├── test_deconvolution1d.cpp
│   ├── test_deconvolution3d.cpp
│   ├── test_deconvolutiondepthwise.cpp
│   ├── test_deconvolutiondepthwise1d.cpp
│   ├── test_deconvolutiondepthwise3d.cpp
│   ├── test_deconvolutiondepthwise_1.cpp
│   ├── test_deepcopy.cpp
│   ├── test_deformableconv2d.cpp
│   ├── test_deformableconv2d_1.cpp
│   ├── test_deformableconv2d_2.cpp
│   ├── test_deformableconv2d_3.cpp
│   ├── test_deformableconv2d_4.cpp
│   ├── test_dequantize.cpp
│   ├── test_diag.cpp
│   ├── test_dropout.cpp
│   ├── test_einsum.cpp
│   ├── test_eltwise.cpp
│   ├── test_elu.cpp
│   ├── test_embed.cpp
│   ├── test_erf.cpp
│   ├── test_expanddims.cpp
│   ├── test_expression.cpp
│   ├── test_flatten.cpp
│   ├── test_flip.cpp
│   ├── test_fold.cpp
│   ├── test_gelu.cpp
│   ├── test_gemm_0.h
│   ├── test_gemm_0a.cpp
│   ├── test_gemm_0b.cpp
│   ├── test_gemm_0c.cpp
│   ├── test_gemm_0d.cpp
│   ├── test_gemm_0e.cpp
│   ├── test_gemm_0f.cpp
│   ├── test_gemm_1.h
│   ├── test_gemm_1a.cpp
│   ├── test_gemm_1b.cpp
│   ├── test_gemm_2.h
│   ├── test_gemm_2a.cpp
│   ├── test_gemm_2b.cpp
│   ├── test_gemm_2c.cpp
│   ├── test_gemm_2d.cpp
│   ├── test_gemm_2e.cpp
│   ├── test_gemm_3.cpp
│   ├── test_gemm_4.cpp
│   ├── test_gemm_nt.cpp
│   ├── test_gemm_oom.cpp
│   ├── test_glu.cpp
│   ├── test_gridsample.cpp
│   ├── test_groupnorm.cpp
│   ├── test_gru.cpp
│   ├── test_hardsigmoid.cpp
│   ├── test_hardswish.cpp
│   ├── test_innerproduct.cpp
│   ├── test_instancenorm.cpp
│   ├── test_interp.cpp
│   ├── test_interp_1.cpp
│   ├── test_inversespectrogram.cpp
│   ├── test_layernorm.cpp
│   ├── test_lrn.cpp
│   ├── test_lstm.cpp
│   ├── test_mat_pixel.cpp
│   ├── test_mat_pixel_affine.cpp
│   ├── test_mat_pixel_drawing.cpp
│   ├── test_mat_pixel_resize.cpp
│   ├── test_mat_pixel_rotate.cpp
│   ├── test_matmul.cpp
│   ├── test_memorydata.cpp
│   ├── test_mish.cpp
│   ├── test_multiheadattention.cpp
│   ├── test_multiheadattention_1.cpp
│   ├── test_multiheadattention_kvcache.cpp
│   ├── test_multiheadattention_oom.cpp
│   ├── test_noop.cpp
│   ├── test_normalize.cpp
│   ├── test_packing.cpp
│   ├── test_padding.cpp
│   ├── test_paramdict.cpp
│   ├── test_permute.cpp
│   ├── test_pixelshuffle.cpp
│   ├── test_pooling.cpp
│   ├── test_pooling1d.cpp
│   ├── test_pooling3d.cpp
│   ├── test_power.cpp
│   ├── test_prelu.cpp
│   ├── test_priorbox.cpp
│   ├── test_quantize.cpp
│   ├── test_quantize_oom.cpp
│   ├── test_reduction.cpp
│   ├── test_relu.cpp
│   ├── test_reorg.cpp
│   ├── test_requantize.cpp
│   ├── test_requantize_oom.cpp
│   ├── test_reshape.cpp
│   ├── test_reshape_1.cpp
│   ├── test_reshape_oom.cpp
│   ├── test_rmsnorm.cpp
│   ├── test_rnn.cpp
│   ├── test_roialign.cpp
│   ├── test_roipooling.cpp
│   ├── test_rotaryembed.cpp
│   ├── test_rotaryembed_oom.cpp
│   ├── test_scale.cpp
│   ├── test_sdpa.cpp
│   ├── test_sdpa_kvcache.cpp
│   ├── test_sdpa_oom.cpp
│   ├── test_selu.cpp
│   ├── test_shrink.cpp
│   ├── test_shufflechannel.cpp
│   ├── test_sigmoid.cpp
│   ├── test_slice.cpp
│   ├── test_slice_oom.cpp
│   ├── test_softmax.cpp
│   ├── test_softmax_oom.cpp
│   ├── test_softplus.cpp
│   ├── test_spectrogram.cpp
│   ├── test_squeeze.cpp
│   ├── test_squeezenet.cpp
│   ├── test_swish.cpp
│   ├── test_tanh.cpp
│   ├── test_tile.cpp
│   ├── test_tile_oom.cpp
│   ├── test_unaryop.cpp
│   ├── test_unfold.cpp
│   ├── test_yolov3detectionoutput.cpp
│   ├── testutil.cpp
│   └── testutil.h
├── toolchains/
│   ├── aarch64-linux-gnu-c.toolchain.cmake
│   ├── aarch64-linux-gnu.toolchain.cmake
│   ├── aarch64-qnx.toolchain.cmake
│   ├── anykav500.toolchain.cmake
│   ├── arm-linux-gnueabi-c.toolchain.cmake
│   ├── arm-linux-gnueabi.toolchain.cmake
│   ├── arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake
│   ├── arm-linux-gnueabihf.toolchain.cmake
│   ├── c906-v310.toolchain.cmake
│   ├── c907-rv32-v310.toolchain.cmake
│   ├── c907-v310.toolchain.cmake
│   ├── c908-v310.toolchain.cmake
│   ├── c910-v310.toolchain.cmake
│   ├── esp32.toolchain.cmake
│   ├── himix100.toolchain.cmake
│   ├── himix200.toolchain.cmake
│   ├── himix210.toolchain.cmake
│   ├── hisiv300.toolchain.cmake
│   ├── hisiv500.toolchain.cmake
│   ├── hisiv600.toolchain.cmake
│   ├── host-c.clang.toolchain.cmake
│   ├── host-c.gcc.toolchain.cmake
│   ├── host.clang-m32.toolchain.cmake
│   ├── host.gcc-c++03.toolchain.cmake
│   ├── host.gcc-m32.toolchain.cmake
│   ├── host.gcc.toolchain.cmake
│   ├── ingenic-x2000.toolchain.cmake
│   ├── ios.toolchain.cmake
│   ├── iossimxc-x64.toolchain.cmake
│   ├── iossimxc.toolchain.cmake
│   ├── iosxc-arm64.toolchain.cmake
│   ├── iosxc.toolchain.cmake
│   ├── jetson.toolchain.cmake
│   ├── k1.llvm.toolchain.cmake
│   ├── k1.toolchain.cmake
│   ├── loongarch64-linux-gnu.toolchain.cmake
│   ├── loongarch64-unknown-linux-gnu.toolchain.cmake
│   ├── loongson2f-linux-gnuabi64.toolchain.cmake
│   ├── mips-mti-linux-gnu.toolchain.cmake
│   ├── mips32r2-linux-gnu.toolchain.cmake
│   ├── mips64el-linux-gnuabi64.toolchain.cmake
│   ├── mipsel-linux-gnu.toolchain.cmake
│   ├── mipsisa32r6el-linux-gnu.toolchain.cmake
│   ├── mipsisa64r6el-linux-gnuabi64.toolchain.cmake
│   ├── pi3.toolchain.cmake
│   ├── power8le-linux-gnu-vsx.clang.toolchain.cmake
│   ├── power8le-linux-gnu-vsx.toolchain.cmake
│   ├── power9le-linux-gnu-vsx.clang.toolchain.cmake
│   ├── power9le-linux-gnu-vsx.toolchain.cmake
│   ├── powerpc-linux-gnu.toolchain.cmake
│   ├── powerpc64le-linux-gnu.toolchain.cmake
│   ├── riscv32-unknown-elf.toolchain.cmake
│   ├── riscv64-linux-gnu.toolchain.cmake
│   ├── riscv64-unknown-elf.toolchain.cmake
│   ├── riscv64-unknown-linux-gnu.llvm-toolchain.cmake
│   ├── riscv64-unknown-linux-gnu.toolchain.cmake
│   ├── v831.toolchain.cmake
│   ├── windows-xp-clang.toolchain.cmake
│   ├── windows-xp-mingw.toolchain.cmake
│   └── windows-xp-msvc.toolchain.cmake
└── tools/
    ├── CMakeLists.txt
    ├── caffe/
    │   ├── CMakeLists.txt
    │   ├── caffe.proto
    │   └── caffe2ncnn.cpp
    ├── darknet/
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── darknet2ncnn.cpp
    ├── keras/
    │   └── readme.md
    ├── mlir/
    │   ├── CMakeLists.txt
    │   ├── fix_td.sh
    │   ├── mlir2ncnn.cpp
    │   ├── ncnn_dialect.cpp
    │   ├── ncnn_dialect.h
    │   ├── ncnn_ops.td
    │   ├── ncnn_rewriter.cpp
    │   ├── ncnn_rewriter.td
    │   ├── tf_attributes.cc
    │   ├── tf_attributes.h
    │   ├── tf_dialect.cpp
    │   ├── tf_dialect.h
    │   ├── tf_generated_ops.td
    │   ├── tf_op_base.td
    │   ├── tf_ops.td
    │   ├── tf_side_effects.h
    │   ├── tf_traits.h
    │   ├── tf_types.cc
    │   ├── tf_types.def
    │   └── tf_types.h
    ├── modelwriter.h
    ├── mxnet/
    │   ├── CMakeLists.txt
    │   └── mxnet2ncnn.cpp
    ├── ncnn2mem.cpp
    ├── ncnnmerge.cpp
    ├── ncnnoptimize.cpp
    ├── onnx/
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── onnx.proto
    │   └── onnx2ncnn.cpp
    ├── plugin/
    │   ├── ImageWatchNCNN.natvis
    │   ├── ImageWatchNNIE.natvis
    │   └── README.md
    ├── pnnx/
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── cmake/
    │   │   └── PNNXPyTorch.cmake
    │   ├── python/
    │   │   ├── README.md
    │   │   ├── examples/
    │   │   │   ├── convert.py
    │   │   │   └── export.py
    │   │   ├── pnnx/
    │   │   │   ├── __init__.py
    │   │   │   └── utils/
    │   │   │       ├── __init__.py
    │   │   │       ├── convert.py
    │   │   │       ├── export.py
    │   │   │       └── utils.py
    │   │   ├── requirements.txt
    │   │   ├── setup.py
    │   │   └── tests/
    │   │       ├── test_convert.py
    │   │       ├── test_dynamicinput_convert.py
    │   │       ├── test_dynamicinput_export.py
    │   │       ├── test_export.py
    │   │       ├── test_naiveinput_convert.py
    │   │       └── test_naiveinput_export.py
    │   ├── src/
    │   │   ├── CMakeLists.txt
    │   │   ├── ir.cpp
    │   │   ├── ir.h
    │   │   ├── load_onnx.cpp
    │   │   ├── load_onnx.h
    │   │   ├── load_tnn.cpp
    │   │   ├── load_tnn.h
    │   │   ├── load_torchscript.cpp
    │   │   ├── load_torchscript.h
    │   │   ├── main.cpp
    │   │   ├── onnx-data.proto
    │   │   ├── onnx-ml.proto
    │   │   ├── onnx-operators-ml.proto
    │   │   ├── pass_level0/
    │   │   │   ├── constant_unpooling.cpp
    │   │   │   ├── constant_unpooling.h
    │   │   │   ├── convert_half_to_float.cpp
    │   │   │   ├── convert_half_to_float.h
    │   │   │   ├── flatten_input.cpp
    │   │   │   ├── flatten_input.h
    │   │   │   ├── inline_block.cpp
    │   │   │   ├── inline_block.h
    │   │   │   ├── reset_device.cpp
    │   │   │   ├── reset_device.h
    │   │   │   ├── shape_inference.cpp
    │   │   │   └── shape_inference.h
    │   │   ├── pass_level0.cpp
    │   │   ├── pass_level0.h
    │   │   ├── pass_level1/
    │   │   │   ├── fuse_module_pass.cpp
    │   │   │   ├── fuse_module_pass.h
    │   │   │   ├── nn_AdaptiveAvgPool1d.cpp
    │   │   │   ├── nn_AdaptiveAvgPool2d.cpp
    │   │   │   ├── nn_AdaptiveAvgPool3d.cpp
    │   │   │   ├── nn_AdaptiveMaxPool1d.cpp
    │   │   │   ├── nn_AdaptiveMaxPool2d.cpp
    │   │   │   ├── nn_AdaptiveMaxPool3d.cpp
    │   │   │   ├── nn_AlphaDropout.cpp
    │   │   │   ├── nn_AvgPool1d.cpp
    │   │   │   ├── nn_AvgPool2d.cpp
    │   │   │   ├── nn_AvgPool3d.cpp
    │   │   │   ├── nn_BatchNorm1d.cpp
    │   │   │   ├── nn_BatchNorm2d.cpp
    │   │   │   ├── nn_BatchNorm3d.cpp
    │   │   │   ├── nn_CELU.cpp
    │   │   │   ├── nn_ChannelShuffle.cpp
    │   │   │   ├── nn_ConstantPad1d.cpp
    │   │   │   ├── nn_ConstantPad2d.cpp
    │   │   │   ├── nn_ConstantPad3d.cpp
    │   │   │   ├── nn_Conv1d.cpp
    │   │   │   ├── nn_Conv2d.cpp
    │   │   │   ├── nn_Conv3d.cpp
    │   │   │   ├── nn_ConvTranspose1d.cpp
    │   │   │   ├── nn_ConvTranspose2d.cpp
    │   │   │   ├── nn_ConvTranspose3d.cpp
    │   │   │   ├── nn_Dropout.cpp
    │   │   │   ├── nn_Dropout2d.cpp
    │   │   │   ├── nn_Dropout3d.cpp
    │   │   │   ├── nn_ELU.cpp
    │   │   │   ├── nn_Embedding.cpp
    │   │   │   ├── nn_Fold.cpp
    │   │   │   ├── nn_GELU.cpp
    │   │   │   ├── nn_GLU.cpp
    │   │   │   ├── nn_GRU.cpp
    │   │   │   ├── nn_GroupNorm.cpp
    │   │   │   ├── nn_Hardshrink.cpp
    │   │   │   ├── nn_Hardsigmoid.cpp
    │   │   │   ├── nn_Hardswish.cpp
    │   │   │   ├── nn_Hardtanh.cpp
    │   │   │   ├── nn_InstanceNorm1d.cpp
    │   │   │   ├── nn_InstanceNorm2d.cpp
    │   │   │   ├── nn_InstanceNorm3d.cpp
    │   │   │   ├── nn_LPPool1d.cpp
    │   │   │   ├── nn_LPPool2d.cpp
    │   │   │   ├── nn_LSTM.cpp
    │   │   │   ├── nn_LayerNorm.cpp
    │   │   │   ├── nn_LeakyReLU.cpp
    │   │   │   ├── nn_Linear.cpp
    │   │   │   ├── nn_LocalResponseNorm.cpp
    │   │   │   ├── nn_LogSigmoid.cpp
    │   │   │   ├── nn_LogSoftmax.cpp
    │   │   │   ├── nn_MaxPool1d.cpp
    │   │   │   ├── nn_MaxPool2d.cpp
    │   │   │   ├── nn_MaxPool3d.cpp
    │   │   │   ├── nn_Mish.cpp
    │   │   │   ├── nn_MultiheadAttention.cpp
    │   │   │   ├── nn_PReLU.cpp
    │   │   │   ├── nn_PixelShuffle.cpp
    │   │   │   ├── nn_PixelUnshuffle.cpp
    │   │   │   ├── nn_RMSNorm.cpp
    │   │   │   ├── nn_RNN.cpp
    │   │   │   ├── nn_RReLU.cpp
    │   │   │   ├── nn_ReLU.cpp
    │   │   │   ├── nn_ReLU6.cpp
    │   │   │   ├── nn_ReflectionPad1d.cpp
    │   │   │   ├── nn_ReflectionPad2d.cpp
    │   │   │   ├── nn_ReplicationPad1d.cpp
    │   │   │   ├── nn_ReplicationPad2d.cpp
    │   │   │   ├── nn_ReplicationPad3d.cpp
    │   │   │   ├── nn_SELU.cpp
    │   │   │   ├── nn_SiLU.cpp
    │   │   │   ├── nn_Sigmoid.cpp
    │   │   │   ├── nn_Softmax.cpp
    │   │   │   ├── nn_Softmax2d.cpp
    │   │   │   ├── nn_Softmin.cpp
    │   │   │   ├── nn_Softplus.cpp
    │   │   │   ├── nn_Softshrink.cpp
    │   │   │   ├── nn_Softsign.cpp
    │   │   │   ├── nn_Tanh.cpp
    │   │   │   ├── nn_Tanhshrink.cpp
    │   │   │   ├── nn_Threshold.cpp
    │   │   │   ├── nn_Unfold.cpp
    │   │   │   ├── nn_Upsample.cpp
    │   │   │   ├── nn_UpsamplingBilinear2d.cpp
    │   │   │   ├── nn_UpsamplingNearest2d.cpp
    │   │   │   ├── nn_ZeroPad2d.cpp
    │   │   │   ├── nn_maxunpool2d.cpp
    │   │   │   ├── nn_quantized_Conv2d.cpp
    │   │   │   ├── nn_quantized_DeQuantize.cpp
    │   │   │   ├── nn_quantized_Linear.cpp
    │   │   │   ├── nn_quantized_Quantize.cpp
    │   │   │   ├── torchvision_DeformConv2d.cpp
    │   │   │   └── torchvision_RoIAlign.cpp
    │   │   ├── pass_level1.cpp
    │   │   ├── pass_level1.h
    │   │   ├── pass_level2/
    │   │   │   ├── F_adaptive_avg_pool1d.cpp
    │   │   │   ├── F_adaptive_avg_pool2d.cpp
    │   │   │   ├── F_adaptive_avg_pool3d.cpp
    │   │   │   ├── F_adaptive_max_pool1d.cpp
    │   │   │   ├── F_adaptive_max_pool2d.cpp
    │   │   │   ├── F_adaptive_max_pool3d.cpp
    │   │   │   ├── F_affine_grid.cpp
    │   │   │   ├── F_alpha_dropout.cpp
    │   │   │   ├── F_avg_pool1d.cpp
    │   │   │   ├── F_avg_pool2d.cpp
    │   │   │   ├── F_avg_pool3d.cpp
    │   │   │   ├── F_batch_norm.cpp
    │   │   │   ├── F_celu.cpp
    │   │   │   ├── F_conv1d.cpp
    │   │   │   ├── F_conv2d.cpp
    │   │   │   ├── F_conv3d.cpp
    │   │   │   ├── F_conv_transpose1d.cpp
    │   │   │   ├── F_conv_transpose2d.cpp
    │   │   │   ├── F_conv_transpose3d.cpp
    │   │   │   ├── F_dropout.cpp
    │   │   │   ├── F_dropout23d.cpp
    │   │   │   ├── F_elu.cpp
    │   │   │   ├── F_embedding.cpp
    │   │   │   ├── F_feature_alpha_dropout.cpp
    │   │   │   ├── F_fold.cpp
    │   │   │   ├── F_gelu.cpp
    │   │   │   ├── F_glu.cpp
    │   │   │   ├── F_grid_sample.cpp
    │   │   │   ├── F_group_norm.cpp
    │   │   │   ├── F_hardshrink.cpp
    │   │   │   ├── F_hardsigmoid.cpp
    │   │   │   ├── F_hardswish.cpp
    │   │   │   ├── F_hardtanh.cpp
    │   │   │   ├── F_instance_norm.cpp
    │   │   │   ├── F_interpolate.cpp
    │   │   │   ├── F_layer_norm.cpp
    │   │   │   ├── F_leaky_relu.cpp
    │   │   │   ├── F_linear.cpp
    │   │   │   ├── F_local_response_norm.cpp
    │   │   │   ├── F_log_softmax.cpp
    │   │   │   ├── F_logsigmoid.cpp
    │   │   │   ├── F_lp_pool1d.cpp
    │   │   │   ├── F_lp_pool2d.cpp
    │   │   │   ├── F_max_pool1d.cpp
    │   │   │   ├── F_max_pool2d.cpp
    │   │   │   ├── F_max_pool3d.cpp
    │   │   │   ├── F_mish.cpp
    │   │   │   ├── F_normalize.cpp
    │   │   │   ├── F_pad.cpp
    │   │   │   ├── F_pairwise_distance.cpp
    │   │   │   ├── F_pixel_shuffle.cpp
    │   │   │   ├── F_pixel_unshuffle.cpp
    │   │   │   ├── F_prelu.cpp
    │   │   │   ├── F_relu.cpp
    │   │   │   ├── F_relu6.cpp
    │   │   │   ├── F_rms_norm.cpp
    │   │   │   ├── F_rrelu.cpp
    │   │   │   ├── F_scaled_dot_product_attention.cpp
    │   │   │   ├── F_selu.cpp
    │   │   │   ├── F_sigmoid.cpp
    │   │   │   ├── F_silu.cpp
    │   │   │   ├── F_softmax.cpp
    │   │   │   ├── F_softmin.cpp
    │   │   │   ├── F_softplus.cpp
    │   │   │   ├── F_softshrink.cpp
    │   │   │   ├── F_softsign.cpp
    │   │   │   ├── F_tanh.cpp
    │   │   │   ├── F_tanhshrink.cpp
    │   │   │   ├── F_threshold.cpp
    │   │   │   ├── F_unfold.cpp
    │   │   │   ├── F_upsample.cpp
    │   │   │   ├── F_upsample_bilinear.cpp
    │   │   │   ├── F_upsample_nearest.cpp
    │   │   │   ├── README.md
    │   │   │   ├── Tensor_copy.cpp
    │   │   │   ├── Tensor_expand.cpp
    │   │   │   ├── Tensor_expand_as.cpp
    │   │   │   ├── Tensor_fill.cpp
    │   │   │   ├── Tensor_index.cpp
    │   │   │   ├── Tensor_index_put.cpp
    │   │   │   ├── Tensor_masked_fill.cpp
    │   │   │   ├── Tensor_new_empty.cpp
    │   │   │   ├── Tensor_new_ones.cpp
    │   │   │   ├── Tensor_new_zeros.cpp
    │   │   │   ├── Tensor_permute.cpp
    │   │   │   ├── Tensor_repeat.cpp
    │   │   │   ├── Tensor_reshape.cpp
    │   │   │   ├── Tensor_reshape_as.cpp
    │   │   │   ├── Tensor_select.cpp
    │   │   │   ├── Tensor_size.cpp
    │   │   │   ├── Tensor_slice.cpp
    │   │   │   ├── Tensor_to.cpp
    │   │   │   ├── Tensor_type_as.cpp
    │   │   │   ├── Tensor_unflatten.cpp
    │   │   │   ├── eliminate_contiguous.cpp
    │   │   │   ├── eliminate_contiguous.h
    │   │   │   ├── eliminate_size_numtotensor_int.cpp
    │   │   │   ├── eliminate_size_numtotensor_int.h
    │   │   │   ├── functionize.cpp
    │   │   │   ├── functionize.h
    │   │   │   ├── fuse_constantlist.cpp
    │   │   │   ├── fuse_constantlist.h
    │   │   │   ├── nn_GRU.cpp
    │   │   │   ├── nn_LSTM.cpp
    │   │   │   ├── nn_RNN.cpp
    │   │   │   ├── nn_quantized_FloatFunctional.cpp
    │   │   │   ├── torch_addmm.cpp
    │   │   │   ├── torch_amax.cpp
    │   │   │   ├── torch_amin.cpp
    │   │   │   ├── torch_arange.cpp
    │   │   │   ├── torch_argmax.cpp
    │   │   │   ├── torch_argmin.cpp
    │   │   │   ├── torch_as_strided.cpp
    │   │   │   ├── torch_baddbmm.cpp
    │   │   │   ├── torch_bitwise_and.cpp
    │   │   │   ├── torch_bitwise_left_shift.cpp
    │   │   │   ├── torch_bitwise_not.cpp
    │   │   │   ├── torch_bitwise_or.cpp
    │   │   │   ├── torch_bitwise_right_shift.cpp
    │   │   │   ├── torch_bitwise_xor.cpp
    │   │   │   ├── torch_bmm.cpp
    │   │   │   ├── torch_cat.cpp
    │   │   │   ├── torch_chunk.cpp
    │   │   │   ├── torch_clamp.cpp
    │   │   │   ├── torch_clone.cpp
    │   │   │   ├── torch_complex.cpp
    │   │   │   ├── torch_cross.cpp
    │   │   │   ├── torch_cumprod.cpp
    │   │   │   ├── torch_cumsum.cpp
    │   │   │   ├── torch_dequantize.cpp
    │   │   │   ├── torch_diag.cpp
    │   │   │   ├── torch_einsum.cpp
    │   │   │   ├── torch_empty.cpp
    │   │   │   ├── torch_empty_like.cpp
    │   │   │   ├── torch_eq.cpp
    │   │   │   ├── torch_fft_fft.cpp
    │   │   │   ├── torch_fft_fft2.cpp
    │   │   │   ├── torch_fft_fftn.cpp
    │   │   │   ├── torch_fft_hfft.cpp
    │   │   │   ├── torch_fft_hfft2.cpp
    │   │   │   ├── torch_fft_hfftn.cpp
    │   │   │   ├── torch_fft_ifft.cpp
    │   │   │   ├── torch_fft_ifft2.cpp
    │   │   │   ├── torch_fft_ifftn.cpp
    │   │   │   ├── torch_fft_ihfft.cpp
    │   │   │   ├── torch_fft_ihfft2.cpp
    │   │   │   ├── torch_fft_ihfftn.cpp
    │   │   │   ├── torch_fft_irfft.cpp
    │   │   │   ├── torch_fft_irfft2.cpp
    │   │   │   ├── torch_fft_irfftn.cpp
    │   │   │   ├── torch_fft_rfft.cpp
    │   │   │   ├── torch_fft_rfft2.cpp
    │   │   │   ├── torch_fft_rfftn.cpp
    │   │   │   ├── torch_flatten.cpp
    │   │   │   ├── torch_flip.cpp
    │   │   │   ├── torch_full.cpp
    │   │   │   ├── torch_full_like.cpp
    │   │   │   ├── torch_gather.cpp
    │   │   │   ├── torch_ge.cpp
    │   │   │   ├── torch_gt.cpp
    │   │   │   ├── torch_imag.cpp
    │   │   │   ├── torch_index_select.cpp
    │   │   │   ├── torch_istft.cpp
    │   │   │   ├── torch_le.cpp
    │   │   │   ├── torch_lgamma.cpp
    │   │   │   ├── torch_logical_and.cpp
    │   │   │   ├── torch_logical_not.cpp
    │   │   │   ├── torch_logical_or.cpp
    │   │   │   ├── torch_logical_xor.cpp
    │   │   │   ├── torch_logsumexp.cpp
    │   │   │   ├── torch_lt.cpp
    │   │   │   ├── torch_masked_select.cpp
    │   │   │   ├── torch_matmul.cpp
    │   │   │   ├── torch_max.cpp
    │   │   │   ├── torch_mean.cpp
    │   │   │   ├── torch_min.cpp
    │   │   │   ├── torch_mm.cpp
    │   │   │   ├── torch_mv.cpp
    │   │   │   ├── torch_narrow.cpp
    │   │   │   ├── torch_ne.cpp
    │   │   │   ├── torch_norm.cpp
    │   │   │   ├── torch_normal.cpp
    │   │   │   ├── torch_ones.cpp
    │   │   │   ├── torch_ones_like.cpp
    │   │   │   ├── torch_positive.cpp
    │   │   │   ├── torch_prod.cpp
    │   │   │   ├── torch_quantize_per_tensor.cpp
    │   │   │   ├── torch_randn.cpp
    │   │   │   ├── torch_randn_like.cpp
    │   │   │   ├── torch_real.cpp
    │   │   │   ├── torch_repeat_interleave.cpp
    │   │   │   ├── torch_roll.cpp
    │   │   │   ├── torch_scatter_add.cpp
    │   │   │   ├── torch_slice_scatter.cpp
    │   │   │   ├── torch_split.cpp
    │   │   │   ├── torch_squeeze.cpp
    │   │   │   ├── torch_stack.cpp
    │   │   │   ├── torch_std.cpp
    │   │   │   ├── torch_stft.cpp
    │   │   │   ├── torch_sum.cpp
    │   │   │   ├── torch_t.cpp
    │   │   │   ├── torch_tensor_split.cpp
    │   │   │   ├── torch_tile.cpp
    │   │   │   ├── torch_topk.cpp
    │   │   │   ├── torch_transpose.cpp
    │   │   │   ├── torch_unbind.cpp
    │   │   │   ├── torch_unsqueeze.cpp
    │   │   │   ├── torch_var.cpp
    │   │   │   ├── torch_view_as_complex.cpp
    │   │   │   ├── torch_view_as_real.cpp
    │   │   │   ├── torch_where.cpp
    │   │   │   ├── torch_zeros.cpp
    │   │   │   ├── torch_zeros_like.cpp
    │   │   │   ├── torchaudio_F_inverse_spectrogram.cpp
    │   │   │   └── torchaudio_F_spectrogram.cpp
    │   │   ├── pass_level2.cpp
    │   │   ├── pass_level2.h
    │   │   ├── pass_level3/
    │   │   │   ├── assign_unique_name.cpp
    │   │   │   ├── assign_unique_name.h
    │   │   │   ├── eliminate_noop_math.cpp
    │   │   │   ├── eliminate_noop_math.h
    │   │   │   ├── eliminate_squeeze_unsqueeze_pair.cpp
    │   │   │   ├── eliminate_squeeze_unsqueeze_pair.h
    │   │   │   ├── eliminate_tuple_pair.cpp
    │   │   │   ├── eliminate_tuple_pair.h
    │   │   │   ├── expand_quantization_modules.cpp
    │   │   │   ├── expand_quantization_modules.h
    │   │   │   ├── fuse_dynamic_adaptive_pool.cpp
    │   │   │   ├── fuse_dynamic_adaptive_pool.h
    │   │   │   ├── fuse_einsum_operands.cpp
    │   │   │   ├── fuse_einsum_operands.h
    │   │   │   ├── fuse_expression.cpp
    │   │   │   ├── fuse_expression.h
    │   │   │   ├── fuse_index_expression.cpp
    │   │   │   ├── fuse_index_expression.h
    │   │   │   ├── fuse_maxpool_unpack.cpp
    │   │   │   ├── fuse_maxpool_unpack.h
    │   │   │   ├── fuse_multiheadattention_unpack.cpp
    │   │   │   ├── fuse_multiheadattention_unpack.h
    │   │   │   ├── fuse_op1ton_unpack.cpp
    │   │   │   ├── fuse_op1ton_unpack.h
    │   │   │   ├── fuse_opnto1_tensors.cpp
    │   │   │   ├── fuse_opnto1_tensors.h
    │   │   │   ├── fuse_rnn_unpack.cpp
    │   │   │   ├── fuse_rnn_unpack.h
    │   │   │   ├── rename_F_dropoutnd.cpp
    │   │   │   └── rename_F_dropoutnd.h
    │   │   ├── pass_level3.cpp
    │   │   ├── pass_level3.h
    │   │   ├── pass_level4/
    │   │   │   ├── attribute_pooling.cpp
    │   │   │   ├── attribute_pooling.h
    │   │   │   ├── canonicalize.cpp
    │   │   │   ├── canonicalize.h
    │   │   │   ├── dead_code_elimination.cpp
    │   │   │   ├── dead_code_elimination.h
    │   │   │   ├── fuse_custom_op.cpp
    │   │   │   └── fuse_custom_op.h
    │   │   ├── pass_level4.cpp
    │   │   ├── pass_level4.h
    │   │   ├── pass_level5/
    │   │   │   ├── attribute_unpooling.cpp
    │   │   │   ├── attribute_unpooling.h
    │   │   │   ├── eliminate_dropout.cpp
    │   │   │   ├── eliminate_dropout.h
    │   │   │   ├── eliminate_identity_operator.cpp
    │   │   │   ├── eliminate_identity_operator.h
    │   │   │   ├── eliminate_maxpool_indices.cpp
    │   │   │   ├── eliminate_maxpool_indices.h
    │   │   │   ├── eliminate_noop_cat.cpp
    │   │   │   ├── eliminate_noop_cat.h
    │   │   │   ├── eliminate_noop_einsum.cpp
    │   │   │   ├── eliminate_noop_einsum.h
    │   │   │   ├── eliminate_noop_expand.cpp
    │   │   │   ├── eliminate_noop_expand.h
    │   │   │   ├── eliminate_noop_expression.cpp
    │   │   │   ├── eliminate_noop_expression.h
    │   │   │   ├── eliminate_noop_pad.cpp
    │   │   │   ├── eliminate_noop_pad.h
    │   │   │   ├── eliminate_noop_permute.cpp
    │   │   │   ├── eliminate_noop_permute.h
    │   │   │   ├── eliminate_noop_reshape.cpp
    │   │   │   ├── eliminate_noop_reshape.h
    │   │   │   ├── eliminate_noop_slice.cpp
    │   │   │   ├── eliminate_noop_slice.h
    │   │   │   ├── eliminate_noop_upsample.cpp
    │   │   │   ├── eliminate_noop_upsample.h
    │   │   │   ├── eliminate_reshape_shape_expression.cpp
    │   │   │   ├── eliminate_reshape_shape_expression.h
    │   │   │   ├── eliminate_type_as.cpp
    │   │   │   ├── eliminate_type_as.h
    │   │   │   ├── eval_expression.cpp
    │   │   │   ├── eval_expression.h
    │   │   │   ├── fold_constants.cpp
    │   │   │   ├── fold_constants.h
    │   │   │   ├── fuse_adjacent_permute.cpp
    │   │   │   ├── fuse_adjacent_permute.h
    │   │   │   ├── fuse_adjacent_reshape.cpp
    │   │   │   ├── fuse_adjacent_reshape.h
    │   │   │   ├── fuse_channel_shuffle.cpp
    │   │   │   ├── fuse_channel_shuffle.h
    │   │   │   ├── fuse_constant_expression.cpp
    │   │   │   ├── fuse_constant_expression.h
    │   │   │   ├── fuse_conv1d_batchnorm1d.cpp
    │   │   │   ├── fuse_conv1d_batchnorm1d.h
    │   │   │   ├── fuse_conv2d_batchnorm2d.cpp
    │   │   │   ├── fuse_conv2d_batchnorm2d.h
    │   │   │   ├── fuse_conv3d_batchnorm3d.cpp
    │   │   │   ├── fuse_conv3d_batchnorm3d.h
    │   │   │   ├── fuse_convtranspose1d_batchnorm1d.cpp
    │   │   │   ├── fuse_convtranspose1d_batchnorm1d.h
    │   │   │   ├── fuse_convtranspose2d_batchnorm2d.cpp
    │   │   │   ├── fuse_convtranspose2d_batchnorm2d.h
    │   │   │   ├── fuse_convtranspose3d_batchnorm3d.cpp
    │   │   │   ├── fuse_convtranspose3d_batchnorm3d.h
    │   │   │   ├── fuse_layernorm.cpp
    │   │   │   ├── fuse_layernorm.h
    │   │   │   ├── fuse_linear_batchnorm1d.cpp
    │   │   │   ├── fuse_linear_batchnorm1d.h
    │   │   │   ├── fuse_multiheadattention.cpp
    │   │   │   ├── fuse_multiheadattention.h
    │   │   │   ├── fuse_multiheadattention_sameqkv.cpp
    │   │   │   ├── fuse_multiheadattention_sameqkv.h
    │   │   │   ├── fuse_pad_conv1d.cpp
    │   │   │   ├── fuse_pad_conv1d.h
    │   │   │   ├── fuse_pad_conv2d.cpp
    │   │   │   ├── fuse_pad_conv2d.h
    │   │   │   ├── fuse_pixel_shuffle.cpp
    │   │   │   ├── fuse_pixel_shuffle.h
    │   │   │   ├── fuse_pixel_unshuffle.cpp
    │   │   │   ├── fuse_pixel_unshuffle.h
    │   │   │   ├── fuse_rmsnorm.cpp
    │   │   │   ├── fuse_rmsnorm.h
    │   │   │   ├── fuse_scaled_dot_product_attention.cpp
    │   │   │   ├── fuse_scaled_dot_product_attention.h
    │   │   │   ├── fuse_select_to_unbind.cpp
    │   │   │   ├── fuse_select_to_unbind.h
    │   │   │   ├── fuse_silu.cpp
    │   │   │   ├── fuse_silu.h
    │   │   │   ├── fuse_slice_copy.cpp
    │   │   │   ├── fuse_slice_copy.h
    │   │   │   ├── fuse_slice_indices.cpp
    │   │   │   ├── fuse_slice_indices.h
    │   │   │   ├── fuse_slice_squeeze_to_select.cpp
    │   │   │   ├── fuse_slice_squeeze_to_select.h
    │   │   │   ├── fuse_slice_to_tensor_split.cpp
    │   │   │   ├── fuse_slice_to_tensor_split.h
    │   │   │   ├── fuse_static_batchnorm.cpp
    │   │   │   ├── fuse_static_batchnorm.h
    │   │   │   ├── fuse_static_conv.cpp
    │   │   │   ├── fuse_static_conv.h
    │   │   │   ├── fuse_static_convtranspose.cpp
    │   │   │   ├── fuse_static_convtranspose.h
    │   │   │   ├── fuse_static_embedding.cpp
    │   │   │   ├── fuse_static_embedding.h
    │   │   │   ├── fuse_static_groupnorm.cpp
    │   │   │   ├── fuse_static_groupnorm.h
    │   │   │   ├── fuse_static_instancenorm.cpp
    │   │   │   ├── fuse_static_instancenorm.h
    │   │   │   ├── fuse_static_layernorm.cpp
    │   │   │   ├── fuse_static_layernorm.h
    │   │   │   ├── fuse_static_linear.cpp
    │   │   │   ├── fuse_static_linear.h
    │   │   │   ├── fuse_static_prelu.cpp
    │   │   │   ├── fuse_static_prelu.h
    │   │   │   ├── fuse_static_rmsnorm.cpp
    │   │   │   ├── fuse_static_rmsnorm.h
    │   │   │   ├── fuse_transformers_multiheadattention.cpp
    │   │   │   ├── fuse_transformers_multiheadattention.h
    │   │   │   ├── fuse_transformers_scaled_dot_product_attention.cpp
    │   │   │   ├── fuse_transformers_scaled_dot_product_attention.h
    │   │   │   ├── normalize_einsum_equation.cpp
    │   │   │   ├── normalize_einsum_equation.h
    │   │   │   ├── unroll_rnn_op.cpp
    │   │   │   └── unroll_rnn_op.h
    │   │   ├── pass_level5.cpp
    │   │   ├── pass_level5.h
    │   │   ├── pass_ncnn/
    │   │   │   ├── F_adaptive_avg_pool1d.cpp
    │   │   │   ├── F_adaptive_avg_pool2d.cpp
    │   │   │   ├── F_adaptive_avg_pool3d.cpp
    │   │   │   ├── F_adaptive_max_pool1d.cpp
    │   │   │   ├── F_adaptive_max_pool2d.cpp
    │   │   │   ├── F_adaptive_max_pool3d.cpp
    │   │   │   ├── F_avg_pool1d.cpp
    │   │   │   ├── F_avg_pool2d.cpp
    │   │   │   ├── F_avg_pool3d.cpp
    │   │   │   ├── F_batch_norm.cpp
    │   │   │   ├── F_celu.cpp
    │   │   │   ├── F_conv1d.cpp
    │   │   │   ├── F_conv2d.cpp
    │   │   │   ├── F_conv3d.cpp
    │   │   │   ├── F_conv_transpose1d.cpp
    │   │   │   ├── F_conv_transpose2d.cpp
    │   │   │   ├── F_conv_transpose3d.cpp
    │   │   │   ├── F_elu.cpp
    │   │   │   ├── F_embedding.cpp
    │   │   │   ├── F_fold.cpp
    │   │   │   ├── F_gelu.cpp
    │   │   │   ├── F_glu.cpp
    │   │   │   ├── F_grid_sample.cpp
    │   │   │   ├── F_group_norm.cpp
    │   │   │   ├── F_hardshrink.cpp
    │   │   │   ├── F_hardsigmoid.cpp
    │   │   │   ├── F_hardswish.cpp
    │   │   │   ├── F_hardtanh.cpp
    │   │   │   ├── F_instance_norm.cpp
    │   │   │   ├── F_interpolate.cpp
    │   │   │   ├── F_layer_norm.cpp
    │   │   │   ├── F_leaky_relu.cpp
    │   │   │   ├── F_linear.cpp
    │   │   │   ├── F_local_response_norm.cpp
    │   │   │   ├── F_log_softmax.cpp
    │   │   │   ├── F_logsigmoid.cpp
    │   │   │   ├── F_max_pool1d.cpp
    │   │   │   ├── F_max_pool2d.cpp
    │   │   │   ├── F_max_pool3d.cpp
    │   │   │   ├── F_mish.cpp
    │   │   │   ├── F_normalize.cpp
    │   │   │   ├── F_pad.cpp
    │   │   │   ├── F_pixel_shuffle.cpp
    │   │   │   ├── F_pixel_unshuffle.cpp
    │   │   │   ├── F_prelu.cpp
    │   │   │   ├── F_relu.cpp
    │   │   │   ├── F_relu6.cpp
    │   │   │   ├── F_rms_norm.cpp
    │   │   │   ├── F_scaled_dot_product_attention.cpp
    │   │   │   ├── F_selu.cpp
    │   │   │   ├── F_sigmoid.cpp
    │   │   │   ├── F_silu.cpp
    │   │   │   ├── F_softmax.cpp
    │   │   │   ├── F_softplus.cpp
    │   │   │   ├── F_softshrink.cpp
    │   │   │   ├── F_tanh.cpp
    │   │   │   ├── F_unfold.cpp
    │   │   │   ├── F_upsample.cpp
    │   │   │   ├── F_upsample_bilinear.cpp
    │   │   │   ├── F_upsample_nearest.cpp
    │   │   │   ├── Tensor_expand.cpp
    │   │   │   ├── Tensor_permute.cpp
    │   │   │   ├── Tensor_repeat.cpp
    │   │   │   ├── Tensor_reshape.cpp
    │   │   │   ├── Tensor_reshape_as.cpp
    │   │   │   ├── Tensor_unflatten.cpp
    │   │   │   ├── chain_multi_output.cpp
    │   │   │   ├── chain_multi_output.h
    │   │   │   ├── convert_Tensor_select.cpp
    │   │   │   ├── convert_Tensor_select.h
    │   │   │   ├── convert_Tensor_slice.cpp
    │   │   │   ├── convert_Tensor_slice.h
    │   │   │   ├── convert_Tensor_slice_copy.cpp
    │   │   │   ├── convert_Tensor_slice_copy.h
    │   │   │   ├── convert_attribute.cpp
    │   │   │   ├── convert_attribute.h
    │   │   │   ├── convert_custom_op.cpp
    │   │   │   ├── convert_custom_op.h
    │   │   │   ├── convert_half_to_float.cpp
    │   │   │   ├── convert_half_to_float.h
    │   │   │   ├── convert_input.cpp
    │   │   │   ├── convert_input.h
    │   │   │   ├── convert_module_op.cpp
    │   │   │   ├── convert_module_op.h
    │   │   │   ├── convert_reshape_interp_expression.cpp
    │   │   │   ├── convert_reshape_interp_expression.h
    │   │   │   ├── convert_slice_expression.cpp
    │   │   │   ├── convert_slice_expression.h
    │   │   │   ├── convert_torch_cat.cpp
    │   │   │   ├── convert_torch_cat.h
    │   │   │   ├── convert_torch_chunk.cpp
    │   │   │   ├── convert_torch_chunk.h
    │   │   │   ├── convert_torch_einsum.cpp
    │   │   │   ├── convert_torch_einsum.h
    │   │   │   ├── convert_torch_split.cpp
    │   │   │   ├── convert_torch_split.h
    │   │   │   ├── convert_torch_stack.cpp
    │   │   │   ├── convert_torch_stack.h
    │   │   │   ├── convert_torch_tensor_split.cpp
    │   │   │   ├── convert_torch_tensor_split.h
    │   │   │   ├── convert_torch_unbind.cpp
    │   │   │   ├── convert_torch_unbind.h
    │   │   │   ├── eliminate_noop.cpp
    │   │   │   ├── eliminate_noop.h
    │   │   │   ├── eliminate_output.cpp
    │   │   │   ├── eliminate_output.h
    │   │   │   ├── expand_expression.cpp
    │   │   │   ├── expand_expression.h
    │   │   │   ├── fuse_binaryop_eltwise.cpp
    │   │   │   ├── fuse_binaryop_eltwise.h
    │   │   │   ├── fuse_convert_rotaryembed.cpp
    │   │   │   ├── fuse_convert_rotaryembed.h
    │   │   │   ├── fuse_convert_shufflechannel_slice.cpp
    │   │   │   ├── fuse_convert_shufflechannel_slice.h
    │   │   │   ├── fuse_convolution1d_activation.cpp
    │   │   │   ├── fuse_convolution1d_activation.h
    │   │   │   ├── fuse_convolution_activation.cpp
    │   │   │   ├── fuse_convolution_activation.h
    │   │   │   ├── fuse_convolutiondepthwise1d_activation.cpp
    │   │   │   ├── fuse_convolutiondepthwise1d_activation.h
    │   │   │   ├── fuse_convolutiondepthwise_activation.cpp
    │   │   │   ├── fuse_convolutiondepthwise_activation.h
    │   │   │   ├── fuse_deconvolution_activation.cpp
    │   │   │   ├── fuse_deconvolution_activation.h
    │   │   │   ├── fuse_deconvolutiondepthwise_activation.cpp
    │   │   │   ├── fuse_deconvolutiondepthwise_activation.h
    │   │   │   ├── fuse_innerproduct_activation.cpp
    │   │   │   ├── fuse_innerproduct_activation.h
    │   │   │   ├── fuse_padding_convolution.cpp
    │   │   │   ├── fuse_padding_convolution.h
    │   │   │   ├── fuse_padding_convolutiondepthwise.cpp
    │   │   │   ├── fuse_padding_convolutiondepthwise.h
    │   │   │   ├── fuse_transpose_matmul.cpp
    │   │   │   ├── fuse_transpose_matmul.h
    │   │   │   ├── insert_reshape_global_pooling.cpp
    │   │   │   ├── insert_reshape_global_pooling.h
    │   │   │   ├── insert_reshape_linear.cpp
    │   │   │   ├── insert_reshape_linear.h
    │   │   │   ├── insert_reshape_numpy_binaryop_broadcast.cpp
    │   │   │   ├── insert_reshape_numpy_binaryop_broadcast.h
    │   │   │   ├── insert_reshape_pooling.cpp
    │   │   │   ├── insert_reshape_pooling.h
    │   │   │   ├── insert_split.cpp
    │   │   │   ├── insert_split.h
    │   │   │   ├── nn_AdaptiveAvgPool1d.cpp
    │   │   │   ├── nn_AdaptiveAvgPool2d.cpp
    │   │   │   ├── nn_AdaptiveAvgPool3d.cpp
    │   │   │   ├── nn_AdaptiveMaxPool1d.cpp
    │   │   │   ├── nn_AdaptiveMaxPool2d.cpp
    │   │   │   ├── nn_AdaptiveMaxPool3d.cpp
    │   │   │   ├── nn_AvgPool1d.cpp
    │   │   │   ├── nn_AvgPool2d.cpp
    │   │   │   ├── nn_AvgPool3d.cpp
    │   │   │   ├── nn_BatchNorm1d.cpp
    │   │   │   ├── nn_BatchNorm2d.cpp
    │   │   │   ├── nn_BatchNorm3d.cpp
    │   │   │   ├── nn_CELU.cpp
    │   │   │   ├── nn_ChannelShuffle.cpp
    │   │   │   ├── nn_ConstantPad1d.cpp
    │   │   │   ├── nn_ConstantPad2d.cpp
    │   │   │   ├── nn_ConstantPad3d.cpp
    │   │   │   ├── nn_Conv1d.cpp
    │   │   │   ├── nn_Conv2d.cpp
    │   │   │   ├── nn_Conv3d.cpp
    │   │   │   ├── nn_ConvTranspose1d.cpp
    │   │   │   ├── nn_ConvTranspose2d.cpp
    │   │   │   ├── nn_ConvTranspose3d.cpp
    │   │   │   ├── nn_ELU.cpp
    │   │   │   ├── nn_Embedding.cpp
    │   │   │   ├── nn_Fold.cpp
    │   │   │   ├── nn_GELU.cpp
    │   │   │   ├── nn_GLU.cpp
    │   │   │   ├── nn_GRU.cpp
    │   │   │   ├── nn_GroupNorm.cpp
    │   │   │   ├── nn_Hardshrink.cpp
    │   │   │   ├── nn_Hardsigmoid.cpp
    │   │   │   ├── nn_Hardswish.cpp
    │   │   │   ├── nn_Hardtanh.cpp
    │   │   │   ├── nn_InstanceNorm2d.cpp
    │   │   │   ├── nn_LSTM.cpp
    │   │   │   ├── nn_LayerNorm.cpp
    │   │   │   ├── nn_LeakyReLU.cpp
    │   │   │   ├── nn_Linear.cpp
    │   │   │   ├── nn_LocalResponseNorm.cpp
    │   │   │   ├── nn_LogSigmoid.cpp
    │   │   │   ├── nn_LogSoftmax.cpp
    │   │   │   ├── nn_MaxPool1d.cpp
    │   │   │   ├── nn_MaxPool2d.cpp
    │   │   │   ├── nn_MaxPool3d.cpp
    │   │   │   ├── nn_Mish.cpp
    │   │   │   ├── nn_MultiheadAttention.cpp
    │   │   │   ├── nn_PReLU.cpp
    │   │   │   ├── nn_PixelShuffle.cpp
    │   │   │   ├── nn_PixelUnshuffle.cpp
    │   │   │   ├── nn_RMSNorm.cpp
    │   │   │   ├── nn_RNN.cpp
    │   │   │   ├── nn_ReLU.cpp
    │   │   │   ├── nn_ReLU6.cpp
    │   │   │   ├── nn_ReflectionPad1d.cpp
    │   │   │   ├── nn_ReflectionPad2d.cpp
    │   │   │   ├── nn_ReplicationPad1d.cpp
    │   │   │   ├── nn_ReplicationPad2d.cpp
    │   │   │   ├── nn_ReplicationPad3d.cpp
    │   │   │   ├── nn_SELU.cpp
    │   │   │   ├── nn_SiLU.cpp
    │   │   │   ├── nn_Sigmoid.cpp
    │   │   │   ├── nn_Softmax.cpp
    │   │   │   ├── nn_Softmax2d.cpp
    │   │   │   ├── nn_Softplus.cpp
    │   │   │   ├── nn_Softshrink.cpp
    │   │   │   ├── nn_Tanh.cpp
    │   │   │   ├── nn_Unfold.cpp
    │   │   │   ├── nn_Upsample.cpp
    │   │   │   ├── nn_UpsamplingBilinear2d.cpp
    │   │   │   ├── nn_UpsamplingNearest2d.cpp
    │   │   │   ├── nn_ZeroPad2d.cpp
    │   │   │   ├── solve_batch_index.cpp
    │   │   │   ├── solve_batch_index.h
    │   │   │   ├── torch_addmm.cpp
    │   │   │   ├── torch_amax.cpp
    │   │   │   ├── torch_amin.cpp
    │   │   │   ├── torch_bmm.cpp
    │   │   │   ├── torch_clamp.cpp
    │   │   │   ├── torch_clone.cpp
    │   │   │   ├── torch_cumsum.cpp
    │   │   │   ├── torch_diag.cpp
    │   │   │   ├── torch_flatten.cpp
    │   │   │   ├── torch_flip.cpp
    │   │   │   ├── torch_istft.cpp
    │   │   │   ├── torch_logsumexp.cpp
    │   │   │   ├── torch_matmul.cpp
    │   │   │   ├── torch_max.cpp
    │   │   │   ├── torch_mean.cpp
    │   │   │   ├── torch_min.cpp
    │   │   │   ├── torch_mm.cpp
    │   │   │   ├── torch_norm.cpp
    │   │   │   ├── torch_prod.cpp
    │   │   │   ├── torch_roll.cpp
    │   │   │   ├── torch_slice_scatter.cpp
    │   │   │   ├── torch_squeeze.cpp
    │   │   │   ├── torch_stft.cpp
    │   │   │   ├── torch_sum.cpp
    │   │   │   ├── torch_t.cpp
    │   │   │   ├── torch_transpose.cpp
    │   │   │   ├── torch_unsqueeze.cpp
    │   │   │   ├── torchaudio_F_inverse_spectrogram.cpp
    │   │   │   ├── torchaudio_F_spectrogram.cpp
    │   │   │   └── torchvision_DeformConv2d.cpp
    │   │   ├── pass_ncnn.cpp
    │   │   ├── pass_ncnn.h
    │   │   ├── pass_onnx/
    │   │   │   ├── canonicalize.cpp
    │   │   │   ├── canonicalize.h
    │   │   │   ├── dead_code_elimination.cpp
    │   │   │   ├── dead_code_elimination.h
    │   │   │   ├── eliminate_initializer_input.cpp
    │   │   │   ├── eliminate_initializer_input.h
    │   │   │   ├── eliminate_noop.cpp
    │   │   │   ├── eliminate_noop.h
    │   │   │   ├── fold_constants.cpp
    │   │   │   ├── fold_constants.h
    │   │   │   ├── fuse_constant_as_attribute.cpp
    │   │   │   ├── fuse_constant_as_attribute.h
    │   │   │   ├── inline_containers.cpp
    │   │   │   ├── inline_containers.h
    │   │   │   ├── inline_if_graph.cpp
    │   │   │   ├── inline_if_graph.h
    │   │   │   ├── model_stat.cpp
    │   │   │   ├── model_stat.h
    │   │   │   ├── shape_inference.cpp
    │   │   │   └── shape_inference.h
    │   │   ├── pass_onnx.cpp
    │   │   ├── pass_onnx.h
    │   │   ├── pass_tnn/
    │   │   │   ├── fuse_shape_list_construct.cpp
    │   │   │   ├── fuse_shape_list_construct.h
    │   │   │   ├── fuse_shape_size.cpp
    │   │   │   ├── fuse_shape_size.h
    │   │   │   ├── lower_concat.cpp
    │   │   │   ├── lower_concat.h
    │   │   │   ├── lower_convolution_activation.cpp
    │   │   │   ├── lower_convolution_activation.h
    │   │   │   ├── lower_power.cpp
    │   │   │   └── lower_power.h
    │   │   ├── save_ncnn.cpp
    │   │   ├── save_ncnn.h
    │   │   ├── save_onnx.cpp
    │   │   ├── save_onnx.h
    │   │   ├── storezip.cpp
    │   │   ├── storezip.h
    │   │   ├── utils.cpp
    │   │   └── utils.h
    │   └── tests/
    │       ├── CMakeLists.txt
    │       ├── ncnn/
    │       │   ├── CMakeLists.txt
    │       │   ├── test_F_adaptive_avg_pool1d.py
    │       │   ├── test_F_adaptive_avg_pool2d.py
    │       │   ├── test_F_adaptive_avg_pool3d.py
    │       │   ├── test_F_adaptive_max_pool1d.py
    │       │   ├── test_F_adaptive_max_pool2d.py
    │       │   ├── test_F_adaptive_max_pool3d.py
    │       │   ├── test_F_alpha_dropout.py
    │       │   ├── test_F_avg_pool1d.py
    │       │   ├── test_F_avg_pool2d.py
    │       │   ├── test_F_avg_pool3d.py
    │       │   ├── test_F_batch_norm.py
    │       │   ├── test_F_celu.py
    │       │   ├── test_F_conv1d.py
    │       │   ├── test_F_conv2d.py
    │       │   ├── test_F_conv3d.py
    │       │   ├── test_F_conv_transpose1d.py
    │       │   ├── test_F_conv_transpose2d.py
    │       │   ├── test_F_conv_transpose3d.py
    │       │   ├── test_F_dropout.py
    │       │   ├── test_F_dropout2d.py
    │       │   ├── test_F_dropout3d.py
    │       │   ├── test_F_elu.py
    │       │   ├── test_F_embedding.py
    │       │   ├── test_F_feature_alpha_dropout.py
    │       │   ├── test_F_fold.py
    │       │   ├── test_F_gelu.py
    │       │   ├── test_F_glu.py
    │       │   ├── test_F_grid_sample.py
    │       │   ├── test_F_group_norm.py
    │       │   ├── test_F_hardshrink.py
    │       │   ├── test_F_hardsigmoid.py
    │       │   ├── test_F_hardswish.py
    │       │   ├── test_F_hardtanh.py
    │       │   ├── test_F_interpolate.py
    │       │   ├── test_F_layer_norm.py
    │       │   ├── test_F_leaky_relu.py
    │       │   ├── test_F_local_response_norm.py
    │       │   ├── test_F_log_softmax.py
    │       │   ├── test_F_logsigmoid.py
    │       │   ├── test_F_max_pool1d.py
    │       │   ├── test_F_max_pool2d.py
    │       │   ├── test_F_max_pool3d.py
    │       │   ├── test_F_mish.py
    │       │   ├── test_F_normalize.py
    │       │   ├── test_F_pad.py
    │       │   ├── test_F_pixel_shuffle.py
    │       │   ├── test_F_pixel_unshuffle.py
    │       │   ├── test_F_prelu.py
    │       │   ├── test_F_relu.py
    │       │   ├── test_F_relu6.py
    │       │   ├── test_F_rms_norm.py
    │       │   ├── test_F_scaled_dot_product_attention.py
    │       │   ├── test_F_selu.py
    │       │   ├── test_F_sigmoid.py
    │       │   ├── test_F_silu.py
    │       │   ├── test_F_softmax.py
    │       │   ├── test_F_softshrink.py
    │       │   ├── test_F_tanh.py
    │       │   ├── test_F_unfold.py
    │       │   ├── test_F_upsample.py
    │       │   ├── test_F_upsample_bilinear.py
    │       │   ├── test_F_upsample_nearest.py
    │       │   ├── test_Tensor_expand.py
    │       │   ├── test_Tensor_permute.py
    │       │   ├── test_Tensor_repeat.py
    │       │   ├── test_Tensor_reshape.py
    │       │   ├── test_Tensor_reshape_as.py
    │       │   ├── test_Tensor_slice.py
    │       │   ├── test_Tensor_slice_copy.py
    │       │   ├── test_Tensor_unflatten.py
    │       │   ├── test_Tensor_view.py
    │       │   ├── test_convnext_tiny.py
    │       │   ├── test_mobilenet_v2.py
    │       │   ├── test_mobilenet_v3_small.py
    │       │   ├── test_ncnn_fuse_binaryop_eltwise.py
    │       │   ├── test_ncnn_fuse_pad_conv.py
    │       │   ├── test_ncnn_fuse_shufflechannel_slice.py
    │       │   ├── test_ncnn_fuse_transpose_matmul.py
    │       │   ├── test_ncnn_interp_expr.py
    │       │   ├── test_ncnn_numpy_binaryop_broadcast.py
    │       │   ├── test_ncnn_reshape_expr.py
    │       │   ├── test_ncnn_slice_expr.py
    │       │   ├── test_ncnn_solve_batch_index.py
    │       │   ├── test_nn_AdaptiveAvgPool1d.py
    │       │   ├── test_nn_AdaptiveAvgPool2d.py
    │       │   ├── test_nn_AdaptiveAvgPool3d.py
    │       │   ├── test_nn_AdaptiveMaxPool1d.py
    │       │   ├── test_nn_AdaptiveMaxPool2d.py
    │       │   ├── test_nn_AdaptiveMaxPool3d.py
    │       │   ├── test_nn_AlphaDropout.py
    │       │   ├── test_nn_AvgPool1d.py
    │       │   ├── test_nn_AvgPool2d.py
    │       │   ├── test_nn_AvgPool3d.py
    │       │   ├── test_nn_BatchNorm1d.py
    │       │   ├── test_nn_BatchNorm2d.py
    │       │   ├── test_nn_BatchNorm3d.py
    │       │   ├── test_nn_CELU.py
    │       │   ├── test_nn_ChannelShuffle.py
    │       │   ├── test_nn_ConstantPad1d.py
    │       │   ├── test_nn_ConstantPad2d.py
    │       │   ├── test_nn_ConstantPad3d.py
    │       │   ├── test_nn_Conv1d.py
    │       │   ├── test_nn_Conv2d.py
    │       │   ├── test_nn_Conv3d.py
    │       │   ├── test_nn_ConvTranspose1d.py
    │       │   ├── test_nn_ConvTranspose2d.py
    │       │   ├── test_nn_ConvTranspose3d.py
    │       │   ├── test_nn_Dropout.py
    │       │   ├── test_nn_Dropout2d.py
    │       │   ├── test_nn_Dropout3d.py
    │       │   ├── test_nn_ELU.py
    │       │   ├── test_nn_Embedding.py
    │       │   ├── test_nn_Fold.py
    │       │   ├── test_nn_GELU.py
    │       │   ├── test_nn_GLU.py
    │       │   ├── test_nn_GRU.py
    │       │   ├── test_nn_GroupNorm.py
    │       │   ├── test_nn_Hardshrink.py
    │       │   ├── test_nn_Hardsigmoid.py
    │       │   ├── test_nn_Hardswish.py
    │       │   ├── test_nn_Hardtanh.py
    │       │   ├── test_nn_Identity.py
    │       │   ├── test_nn_InstanceNorm2d.py
    │       │   ├── test_nn_LSTM.py
    │       │   ├── test_nn_LayerNorm.py
    │       │   ├── test_nn_LeakyReLU.py
    │       │   ├── test_nn_Linear.py
    │       │   ├── test_nn_LocalResponseNorm.py
    │       │   ├── test_nn_LogSigmoid.py
    │       │   ├── test_nn_LogSoftmax.py
    │       │   ├── test_nn_MaxPool1d.py
    │       │   ├── test_nn_MaxPool2d.py
    │       │   ├── test_nn_MaxPool3d.py
    │       │   ├── test_nn_Mish.py
    │       │   ├── test_nn_MultiheadAttention.py
    │       │   ├── test_nn_PReLU.py
    │       │   ├── test_nn_PixelShuffle.py
    │       │   ├── test_nn_PixelUnshuffle.py
    │       │   ├── test_nn_RMSNorm.py
    │       │   ├── test_nn_RNN.py
    │       │   ├── test_nn_ReLU.py
    │       │   ├── test_nn_ReLU6.py
    │       │   ├── test_nn_ReflectionPad1d.py
    │       │   ├── test_nn_ReflectionPad2d.py
    │       │   ├── test_nn_ReplicationPad1d.py
    │       │   ├── test_nn_ReplicationPad2d.py
    │       │   ├── test_nn_ReplicationPad3d.py
    │       │   ├── test_nn_SELU.py
    │       │   ├── test_nn_SiLU.py
    │       │   ├── test_nn_Sigmoid.py
    │       │   ├── test_nn_Softmax.py
    │       │   ├── test_nn_Softmax2d.py
    │       │   ├── test_nn_Softshrink.py
    │       │   ├── test_nn_Tanh.py
    │       │   ├── test_nn_Unfold.py
    │       │   ├── test_nn_Upsample.py
    │       │   ├── test_nn_UpsamplingBilinear2d.py
    │       │   ├── test_nn_UpsamplingNearest2d.py
    │       │   ├── test_nn_ZeroPad2d.py
    │       │   ├── test_resnet18.py
    │       │   ├── test_shufflenet_v2_x1_0.py
    │       │   ├── test_squeezenet1_1.py
    │       │   ├── test_torch_abs.py
    │       │   ├── test_torch_acos.py
    │       │   ├── test_torch_addmm.py
    │       │   ├── test_torch_amax.py
    │       │   ├── test_torch_amin.py
    │       │   ├── test_torch_asin.py
    │       │   ├── test_torch_atan.py
    │       │   ├── test_torch_atan2.py
    │       │   ├── test_torch_bmm.py
    │       │   ├── test_torch_cat.py
    │       │   ├── test_torch_ceil.py
    │       │   ├── test_torch_chunk.py
    │       │   ├── test_torch_clamp.py
    │       │   ├── test_torch_clone.py
    │       │   ├── test_torch_cos.py
    │       │   ├── test_torch_cumsum.py
    │       │   ├── test_torch_diag.py
    │       │   ├── test_torch_einsum.py
    │       │   ├── test_torch_exp.py
    │       │   ├── test_torch_flatten.py
    │       │   ├── test_torch_flip.py
    │       │   ├── test_torch_floor.py
    │       │   ├── test_torch_istft.py
    │       │   ├── test_torch_log.py
    │       │   ├── test_torch_log10.py
    │       │   ├── test_torch_logsumexp.py
    │       │   ├── test_torch_matmul.py
    │       │   ├── test_torch_max.py
    │       │   ├── test_torch_maximum.py
    │       │   ├── test_torch_mean.py
    │       │   ├── test_torch_min.py
    │       │   ├── test_torch_minimum.py
    │       │   ├── test_torch_mm.py
    │       │   ├── test_torch_neg.py
    │       │   ├── test_torch_norm.py
    │       │   ├── test_torch_pow.py
    │       │   ├── test_torch_prod.py
    │       │   ├── test_torch_reciprocal.py
    │       │   ├── test_torch_roll.py
    │       │   ├── test_torch_round.py
    │       │   ├── test_torch_rsqrt.py
    │       │   ├── test_torch_sin.py
    │       │   ├── test_torch_slice_scatter.py
    │       │   ├── test_torch_sqrt.py
    │       │   ├── test_torch_square.py
    │       │   ├── test_torch_squeeze.py
    │       │   ├── test_torch_stack.py
    │       │   ├── test_torch_stft.py
    │       │   ├── test_torch_sum.py
    │       │   ├── test_torch_t.py
    │       │   ├── test_torch_tan.py
    │       │   ├── test_torch_tanh.py
    │       │   ├── test_torch_tensor_split.py
    │       │   ├── test_torch_transpose.py
    │       │   ├── test_torch_trunc.py
    │       │   ├── test_torch_unbind.py
    │       │   ├── test_torch_unsqueeze.py
    │       │   ├── test_torchaudio_F_inverse_spectrogram.py
    │       │   ├── test_torchaudio_F_spectrogram.py
    │       │   ├── test_torchaudio_InverseSpectrogram.py
    │       │   ├── test_torchaudio_Spectrogram.py
    │       │   ├── test_torchvision_DeformConv2d.py
    │       │   ├── test_transformers_deepseek_v3_attention.py
    │       │   ├── test_transformers_qwen2_attention.py
    │       │   ├── test_transformers_qwen3_attention.py
    │       │   └── test_vit_b_32.py
    │       ├── onnx/
    │       │   ├── CMakeLists.txt
    │       │   ├── test_F_adaptive_avg_pool1d.py
    │       │   ├── test_F_adaptive_avg_pool2d.py
    │       │   ├── test_F_adaptive_avg_pool3d.py
    │       │   ├── test_F_adaptive_max_pool1d.py
    │       │   ├── test_F_adaptive_max_pool2d.py
    │       │   ├── test_F_adaptive_max_pool3d.py
    │       │   ├── test_F_avg_pool1d.py
    │       │   ├── test_F_avg_pool2d.py
    │       │   ├── test_F_avg_pool3d.py
    │       │   ├── test_F_batch_norm.py
    │       │   ├── test_F_celu.py
    │       │   ├── test_F_conv1d.py
    │       │   ├── test_F_conv2d.py
    │       │   ├── test_F_conv3d.py
    │       │   ├── test_F_conv_transpose1d.py
    │       │   ├── test_F_conv_transpose2d.py
    │       │   ├── test_F_conv_transpose3d.py
    │       │   ├── test_F_elu.py
    │       │   ├── test_F_gelu.py
    │       │   ├── test_F_group_norm.py
    │       │   ├── test_F_hardshrink.py
    │       │   ├── test_F_hardsigmoid.py
    │       │   ├── test_F_hardswish.py
    │       │   ├── test_F_hardtanh.py
    │       │   ├── test_F_interpolate.py
    │       │   ├── test_F_layer_norm.py
    │       │   ├── test_F_leaky_relu.py
    │       │   ├── test_F_linear.py
    │       │   ├── test_F_local_response_norm.py
    │       │   ├── test_F_log_softmax.py
    │       │   ├── test_F_logsigmoid.py
    │       │   ├── test_F_max_pool1d.py
    │       │   ├── test_F_max_pool2d.py
    │       │   ├── test_F_max_pool3d.py
    │       │   ├── test_F_mish.py
    │       │   ├── test_F_normalize.py
    │       │   ├── test_F_pad.py
    │       │   ├── test_F_pixel_shuffle.py
    │       │   ├── test_F_pixel_unshuffle.py
    │       │   ├── test_F_prelu.py
    │       │   ├── test_F_relu.py
    │       │   ├── test_F_relu6.py
    │       │   ├── test_F_scaled_dot_product_attention.py
    │       │   ├── test_F_selu.py
    │       │   ├── test_F_sigmoid.py
    │       │   ├── test_F_silu.py
    │       │   ├── test_F_softmax.py
    │       │   ├── test_F_softmin.py
    │       │   ├── test_F_softplus.py
    │       │   ├── test_F_softshrink.py
    │       │   ├── test_F_softsign.py
    │       │   ├── test_F_tanh.py
    │       │   ├── test_F_tanhshrink.py
    │       │   ├── test_F_upsample.py
    │       │   ├── test_F_upsample_bilinear.py
    │       │   ├── test_F_upsample_nearest.py
    │       │   ├── test_Tensor_expand.py
    │       │   ├── test_Tensor_permute.py
    │       │   ├── test_Tensor_repeat.py
    │       │   ├── test_Tensor_reshape.py
    │       │   ├── test_Tensor_reshape_as.py
    │       │   ├── test_Tensor_select.py
    │       │   ├── test_Tensor_slice.py
    │       │   ├── test_Tensor_unflatten.py
    │       │   ├── test_Tensor_view.py
    │       │   ├── test_convnext_tiny.py
    │       │   ├── test_mobilenet_v2.py
    │       │   ├── test_mobilenet_v3_small.py
    │       │   ├── test_nn_AdaptiveAvgPool1d.py
    │       │   ├── test_nn_AdaptiveAvgPool2d.py
    │       │   ├── test_nn_AdaptiveAvgPool3d.py
    │       │   ├── test_nn_AdaptiveMaxPool1d.py
    │       │   ├── test_nn_AdaptiveMaxPool2d.py
    │       │   ├── test_nn_AdaptiveMaxPool3d.py
    │       │   ├── test_nn_AvgPool1d.py
    │       │   ├── test_nn_AvgPool2d.py
    │       │   ├── test_nn_AvgPool3d.py
    │       │   ├── test_nn_BatchNorm1d.py
    │       │   ├── test_nn_BatchNorm2d.py
    │       │   ├── test_nn_BatchNorm3d.py
    │       │   ├── test_nn_CELU.py
    │       │   ├── test_nn_ConstantPad1d.py
    │       │   ├── test_nn_ConstantPad2d.py
    │       │   ├── test_nn_ConstantPad3d.py
    │       │   ├── test_nn_Conv1d.py
    │       │   ├── test_nn_Conv2d.py
    │       │   ├── test_nn_Conv3d.py
    │       │   ├── test_nn_ConvTranspose1d.py
    │       │   ├── test_nn_ConvTranspose2d.py
    │       │   ├── test_nn_ConvTranspose3d.py
    │       │   ├── test_nn_ELU.py
    │       │   ├── test_nn_GELU.py
    │       │   ├── test_nn_GRU.py
    │       │   ├── test_nn_GroupNorm.py
    │       │   ├── test_nn_Hardshrink.py
    │       │   ├── test_nn_Hardsigmoid.py
    │       │   ├── test_nn_Hardswish.py
    │       │   ├── test_nn_Hardtanh.py
    │       │   ├── test_nn_InstanceNorm1d.py
    │       │   ├── test_nn_InstanceNorm2d.py
    │       │   ├── test_nn_InstanceNorm3d.py
    │       │   ├── test_nn_LSTM.py
    │       │   ├── test_nn_LayerNorm.py
    │       │   ├── test_nn_LeakyReLU.py
    │       │   ├── test_nn_Linear.py
    │       │   ├── test_nn_LocalResponseNorm.py
    │       │   ├── test_nn_LogSigmoid.py
    │       │   ├── test_nn_LogSoftmax.py
    │       │   ├── test_nn_MaxPool1d.py
    │       │   ├── test_nn_MaxPool2d.py
    │       │   ├── test_nn_MaxPool3d.py
    │       │   ├── test_nn_Mish.py
    │       │   ├── test_nn_MultiheadAttention.py
    │       │   ├── test_nn_PReLU.py
    │       │   ├── test_nn_PixelShuffle.py
    │       │   ├── test_nn_PixelUnshuffle.py
    │       │   ├── test_nn_RNN.py
    │       │   ├── test_nn_ReLU.py
    │       │   ├── test_nn_ReLU6.py
    │       │   ├── test_nn_ReflectionPad1d.py
    │       │   ├── test_nn_ReflectionPad2d.py
    │       │   ├── test_nn_ReplicationPad1d.py
    │       │   ├── test_nn_ReplicationPad2d.py
    │       │   ├── test_nn_ReplicationPad3d.py
    │       │   ├── test_nn_SELU.py
    │       │   ├── test_nn_SiLU.py
    │       │   ├── test_nn_Sigmoid.py
    │       │   ├── test_nn_Softmax.py
    │       │   ├── test_nn_Softmin.py
    │       │   ├── test_nn_Softplus.py
    │       │   ├── test_nn_Softshrink.py
    │       │   ├── test_nn_Softsign.py
    │       │   ├── test_nn_Tanh.py
    │       │   ├── test_nn_Tanhshrink.py
    │       │   ├── test_nn_Upsample.py
    │       │   ├── test_nn_UpsamplingBilinear2d.py
    │       │   ├── test_nn_UpsamplingNearest2d.py
    │       │   ├── test_nn_ZeroPad2d.py
    │       │   ├── test_onnx_activation_ops.py
    │       │   ├── test_onnx_conv_ops.py
    │       │   ├── test_onnx_dense_ops.py
    │       │   ├── test_onnx_fuse_channel_shuffle.py
    │       │   ├── test_onnx_fuse_pixel_shuffle.py
    │       │   ├── test_onnx_fuse_pixel_unshuffle.py
    │       │   ├── test_onnx_layout_ops.py
    │       │   ├── test_onnx_math_ops.py
    │       │   ├── test_onnx_normalize_ops.py
    │       │   ├── test_onnx_opset21_ops.py
    │       │   ├── test_onnx_pool_ops.py
    │       │   ├── test_onnx_reduce_ops.py
    │       │   ├── test_onnx_rnn_ops.py
    │       │   ├── test_resnet18.py
    │       │   ├── test_shufflenet_v2_x1_0.py
    │       │   ├── test_squeezenet1_1.py
    │       │   ├── test_swin_t.py
    │       │   ├── test_torch_cat.py
    │       │   ├── test_torch_ceil.py
    │       │   ├── test_torch_chunk.py
    │       │   ├── test_torch_clamp.py
    │       │   ├── test_torch_flatten.py
    │       │   ├── test_torch_flip.py
    │       │   ├── test_torch_floor.py
    │       │   ├── test_torch_logical_and.py
    │       │   ├── test_torch_logical_not.py
    │       │   ├── test_torch_logical_or.py
    │       │   ├── test_torch_logical_xor.py
    │       │   ├── test_torch_max.py
    │       │   ├── test_torch_maximum.py
    │       │   ├── test_torch_mean.py
    │       │   ├── test_torch_min.py
    │       │   ├── test_torch_minimum.py
    │       │   ├── test_torch_norm.py
    │       │   ├── test_torch_prod.py
    │       │   ├── test_torch_roll.py
    │       │   ├── test_torch_split.py
    │       │   ├── test_torch_squeeze.py
    │       │   ├── test_torch_stack.py
    │       │   ├── test_torch_sum.py
    │       │   ├── test_torch_transpose.py
    │       │   ├── test_torch_unbind.py
    │       │   ├── test_torch_unsqueeze.py
    │       │   ├── test_transformers_albert_attention.py
    │       │   ├── test_transformers_bart_attention.py
    │       │   ├── test_transformers_bert_attention.py
    │       │   ├── test_transformers_bert_generation_attention.py
    │       │   ├── test_transformers_blenderbot_attention.py
    │       │   ├── test_transformers_camembert_attention.py
    │       │   ├── test_transformers_chinese_clip_attention.py
    │       │   ├── test_transformers_clip_attention.py
    │       │   ├── test_transformers_ctrl_attention.py
    │       │   ├── test_transformers_deberta_attention.py
    │       │   ├── test_transformers_distilbert_attention.py
    │       │   ├── test_transformers_electra_attention.py
    │       │   ├── test_transformers_flaubert_attention.py
    │       │   ├── test_transformers_fsmt_attention.py
    │       │   ├── test_transformers_funnel_attention.py
    │       │   ├── test_transformers_gpt2_attention.py
    │       │   ├── test_transformers_layoutlm_attention.py
    │       │   ├── test_transformers_lxmert_attention.py
    │       │   ├── test_transformers_m2m_100_attention.py
    │       │   ├── test_transformers_marian_attention.py
    │       │   ├── test_transformers_mbart_attention.py
    │       │   ├── test_transformers_mobilebert_attention.py
    │       │   ├── test_transformers_mt5_attention.py
    │       │   ├── test_transformers_openai_attention.py
    │       │   ├── test_transformers_pegasus_attention.py
    │       │   ├── test_transformers_prophetnet_attention.py
    │       │   ├── test_transformers_reformer_attention.py
    │       │   ├── test_transformers_roberta_attention.py
    │       │   ├── test_transformers_squeezebert_attention.py
    │       │   ├── test_transformers_t5_attention.py
    │       │   ├── test_transformers_xlm_attention.py
    │       │   ├── test_transformers_xlm_roberta_attention.py
    │       │   └── test_vit_b_32.py
    │       ├── run_test.cmake
    │       ├── test_F_adaptive_avg_pool1d.py
    │       ├── test_F_adaptive_avg_pool2d.py
    │       ├── test_F_adaptive_avg_pool3d.py
    │       ├── test_F_adaptive_max_pool1d.py
    │       ├── test_F_adaptive_max_pool2d.py
    │       ├── test_F_adaptive_max_pool3d.py
    │       ├── test_F_affine_grid.py
    │       ├── test_F_alpha_dropout.py
    │       ├── test_F_avg_pool1d.py
    │       ├── test_F_avg_pool2d.py
    │       ├── test_F_avg_pool3d.py
    │       ├── test_F_batch_norm.py
    │       ├── test_F_celu.py
    │       ├── test_F_conv1d.py
    │       ├── test_F_conv2d.py
    │       ├── test_F_conv3d.py
    │       ├── test_F_conv_transpose1d.py
    │       ├── test_F_conv_transpose2d.py
    │       ├── test_F_conv_transpose3d.py
    │       ├── test_F_dropout.py
    │       ├── test_F_dropout2d.py
    │       ├── test_F_dropout3d.py
    │       ├── test_F_elu.py
    │       ├── test_F_embedding.py
    │       ├── test_F_feature_alpha_dropout.py
    │       ├── test_F_fold.py
    │       ├── test_F_gelu.py
    │       ├── test_F_glu.py
    │       ├── test_F_grid_sample.py
    │       ├── test_F_group_norm.py
    │       ├── test_F_hardshrink.py
    │       ├── test_F_hardsigmoid.py
    │       ├── test_F_hardswish.py
    │       ├── test_F_hardtanh.py
    │       ├── test_F_instance_norm.py
    │       ├── test_F_interpolate.py
    │       ├── test_F_layer_norm.py
    │       ├── test_F_leaky_relu.py
    │       ├── test_F_linear.py
    │       ├── test_F_local_response_norm.py
    │       ├── test_F_log_softmax.py
    │       ├── test_F_logsigmoid.py
    │       ├── test_F_lp_pool1d.py
    │       ├── test_F_lp_pool2d.py
    │       ├── test_F_max_pool1d.py
    │       ├── test_F_max_pool2d.py
    │       ├── test_F_max_pool3d.py
    │       ├── test_F_mish.py
    │       ├── test_F_normalize.py
    │       ├── test_F_pad.py
    │       ├── test_F_pairwise_distance.py
    │       ├── test_F_pixel_shuffle.py
    │       ├── test_F_pixel_unshuffle.py
    │       ├── test_F_prelu.py
    │       ├── test_F_relu.py
    │       ├── test_F_relu6.py
    │       ├── test_F_rms_norm.py
    │       ├── test_F_rrelu.py
    │       ├── test_F_scaled_dot_product_attention.py
    │       ├── test_F_selu.py
    │       ├── test_F_sigmoid.py
    │       ├── test_F_silu.py
    │       ├── test_F_softmax.py
    │       ├── test_F_softmin.py
    │       ├── test_F_softplus.py
    │       ├── test_F_softshrink.py
    │       ├── test_F_softsign.py
    │       ├── test_F_tanh.py
    │       ├── test_F_tanhshrink.py
    │       ├── test_F_threshold.py
    │       ├── test_F_unfold.py
    │       ├── test_F_upsample.py
    │       ├── test_F_upsample_bilinear.py
    │       ├── test_F_upsample_nearest.py
    │       ├── test_Tensor_expand.py
    │       ├── test_Tensor_fill.py
    │       ├── test_Tensor_index.py
    │       ├── test_Tensor_index_put.py
    │       ├── test_Tensor_masked_fill.py
    │       ├── test_Tensor_new_empty.py
    │       ├── test_Tensor_new_full.py
    │       ├── test_Tensor_new_ones.py
    │       ├── test_Tensor_new_zeros.py
    │       ├── test_Tensor_permute.py
    │       ├── test_Tensor_repeat.py
    │       ├── test_Tensor_reshape.py
    │       ├── test_Tensor_reshape_as.py
    │       ├── test_Tensor_select.py
    │       ├── test_Tensor_slice.py
    │       ├── test_Tensor_slice_copy.py
    │       ├── test_Tensor_to.py
    │       ├── test_Tensor_type_as.py
    │       ├── test_Tensor_unflatten.py
    │       ├── test_Tensor_view.py
    │       ├── test_convnext_tiny.py
    │       ├── test_ir_complex.py
    │       ├── test_mobilenet_v2.py
    │       ├── test_mobilenet_v3_small.py
    │       ├── test_nn_AdaptiveAvgPool1d.py
    │       ├── test_nn_AdaptiveAvgPool2d.py
    │       ├── test_nn_AdaptiveAvgPool3d.py
    │       ├── test_nn_AdaptiveMaxPool1d.py
    │       ├── test_nn_AdaptiveMaxPool2d.py
    │       ├── test_nn_AdaptiveMaxPool3d.py
    │       ├── test_nn_AlphaDropout.py
    │       ├── test_nn_AvgPool1d.py
    │       ├── test_nn_AvgPool2d.py
    │       ├── test_nn_AvgPool3d.py
    │       ├── test_nn_BatchNorm1d.py
    │       ├── test_nn_BatchNorm2d.py
    │       ├── test_nn_BatchNorm3d.py
    │       ├── test_nn_CELU.py
    │       ├── test_nn_ChannelShuffle.py
    │       ├── test_nn_ConstantPad1d.py
    │       ├── test_nn_ConstantPad2d.py
    │       ├── test_nn_ConstantPad3d.py
    │       ├── test_nn_Conv1d.py
    │       ├── test_nn_Conv2d.py
    │       ├── test_nn_Conv3d.py
    │       ├── test_nn_ConvTranspose1d.py
    │       ├── test_nn_ConvTranspose2d.py
    │       ├── test_nn_ConvTranspose3d.py
    │       ├── test_nn_Dropout.py
    │       ├── test_nn_Dropout2d.py
    │       ├── test_nn_Dropout3d.py
    │       ├── test_nn_ELU.py
    │       ├── test_nn_Embedding.py
    │       ├── test_nn_Fold.py
    │       ├── test_nn_GELU.py
    │       ├── test_nn_GLU.py
    │       ├── test_nn_GRU.py
    │       ├── test_nn_GroupNorm.py
    │       ├── test_nn_Hardshrink.py
    │       ├── test_nn_Hardsigmoid.py
    │       ├── test_nn_Hardswish.py
    │       ├── test_nn_Hardtanh.py
    │       ├── test_nn_Identity.py
    │       ├── test_nn_InstanceNorm1d.py
    │       ├── test_nn_InstanceNorm2d.py
    │       ├── test_nn_InstanceNorm3d.py
    │       ├── test_nn_LPPool1d.py
    │       ├── test_nn_LPPool2d.py
    │       ├── test_nn_LSTM.py
    │       ├── test_nn_LayerNorm.py
    │       ├── test_nn_LeakyReLU.py
    │       ├── test_nn_Linear.py
    │       ├── test_nn_LocalResponseNorm.py
    │       ├── test_nn_LogSigmoid.py
    │       ├── test_nn_LogSoftmax.py
    │       ├── test_nn_MaxPool1d.py
    │       ├── test_nn_MaxPool2d.py
    │       ├── test_nn_MaxPool3d.py
    │       ├── test_nn_Mish.py
    │       ├── test_nn_MultiheadAttention.py
    │       ├── test_nn_PReLU.py
    │       ├── test_nn_PixelShuffle.py
    │       ├── test_nn_PixelUnshuffle.py
    │       ├── test_nn_RMSNorm.py
    │       ├── test_nn_RNN.py
    │       ├── test_nn_RReLU.py
    │       ├── test_nn_ReLU.py
    │       ├── test_nn_ReLU6.py
    │       ├── test_nn_ReflectionPad1d.py
    │       ├── test_nn_ReflectionPad2d.py
    │       ├── test_nn_ReplicationPad1d.py
    │       ├── test_nn_ReplicationPad2d.py
    │       ├── test_nn_ReplicationPad3d.py
    │       ├── test_nn_SELU.py
    │       ├── test_nn_SiLU.py
    │       ├── test_nn_Sigmoid.py
    │       ├── test_nn_Softmax.py
    │       ├── test_nn_Softmax2d.py
    │       ├── test_nn_Softmin.py
    │       ├── test_nn_Softplus.py
    │       ├── test_nn_Softshrink.py
    │       ├── test_nn_Softsign.py
    │       ├── test_nn_Tanh.py
    │       ├── test_nn_Tanhshrink.py
    │       ├── test_nn_Threshold.py
    │       ├── test_nn_Unfold.py
    │       ├── test_nn_Upsample.py
    │       ├── test_nn_UpsamplingBilinear2d.py
    │       ├── test_nn_UpsamplingNearest2d.py
    │       ├── test_nn_ZeroPad2d.py
    │       ├── test_pnnx_eliminate_noop_cat.py
    │       ├── test_pnnx_eliminate_noop_expand.py
    │       ├── test_pnnx_eliminate_noop_math.py
    │       ├── test_pnnx_eliminate_noop_upsample.py
    │       ├── test_pnnx_expression.py
    │       ├── test_pnnx_fold_constant.py
    │       ├── test_pnnx_fuse_adjacent_permute.py
    │       ├── test_pnnx_fuse_adjacent_reshape.py
    │       ├── test_pnnx_fuse_channel_shuffle.py
    │       ├── test_pnnx_fuse_conv1d_batchnorm1d.py
    │       ├── test_pnnx_fuse_conv2d_batchnorm2d.py
    │       ├── test_pnnx_fuse_conv3d_batchnorm3d.py
    │       ├── test_pnnx_fuse_convtranspose1d_batchnorm1d.py
    │       ├── test_pnnx_fuse_convtranspose2d_batchnorm2d.py
    │       ├── test_pnnx_fuse_convtranspose3d_batchnorm3d.py
    │       ├── test_pnnx_fuse_input_unpack.py
    │       ├── test_pnnx_fuse_layernorm.py
    │       ├── test_pnnx_fuse_linear_batchnorm1d.py
    │       ├── test_pnnx_fuse_multiheadattention.py
    │       ├── test_pnnx_fuse_pad_conv1d.py
    │       ├── test_pnnx_fuse_pad_conv2d.py
    │       ├── test_pnnx_fuse_pixel_shuffle.py
    │       ├── test_pnnx_fuse_pixel_unshuffle.py
    │       ├── test_pnnx_fuse_rmsnorm.py
    │       ├── test_pnnx_fuse_scaled_dot_product_attention.py
    │       ├── test_pnnx_fuse_select_to_unbind.py
    │       ├── test_pnnx_fuse_slice_to_tensor_split.py
    │       ├── test_quantization_shufflenet_v2_x1_0.py
    │       ├── test_resnet18.py
    │       ├── test_shufflenet_v2_x1_0.py
    │       ├── test_squeezenet1_1.py
    │       ├── test_swin_t.py
    │       ├── test_torch_abs.py
    │       ├── test_torch_acos.py
    │       ├── test_torch_acosh.py
    │       ├── test_torch_addmm.py
    │       ├── test_torch_amax.py
    │       ├── test_torch_amin.py
    │       ├── test_torch_arange.py
    │       ├── test_torch_argmax.py
    │       ├── test_torch_argmin.py
    │       ├── test_torch_asin.py
    │       ├── test_torch_asinh.py
    │       ├── test_torch_atan.py
    │       ├── test_torch_atan2.py
    │       ├── test_torch_atanh.py
    │       ├── test_torch_bitwise_and.py
    │       ├── test_torch_bitwise_left_shift.py
    │       ├── test_torch_bitwise_not.py
    │       ├── test_torch_bitwise_or.py
    │       ├── test_torch_bitwise_right_shift.py
    │       ├── test_torch_bitwise_xor.py
    │       ├── test_torch_bmm.py
    │       ├── test_torch_cat.py
    │       ├── test_torch_ceil.py
    │       ├── test_torch_chunk.py
    │       ├── test_torch_clamp.py
    │       ├── test_torch_clone.py
    │       ├── test_torch_complex.py
    │       ├── test_torch_cos.py
    │       ├── test_torch_cosh.py
    │       ├── test_torch_cross.py
    │       ├── test_torch_cumprod.py
    │       ├── test_torch_cumsum.py
    │       ├── test_torch_diag.py
    │       ├── test_torch_einsum.py
    │       ├── test_torch_eq.py
    │       ├── test_torch_exp.py
    │       ├── test_torch_fft_fft.py
    │       ├── test_torch_fft_fft2.py
    │       ├── test_torch_fft_fftn.py
    │       ├── test_torch_fft_hfft.py
    │       ├── test_torch_fft_hfft2.py
    │       ├── test_torch_fft_hfftn.py
    │       ├── test_torch_fft_ifft.py
    │       ├── test_torch_fft_ifft2.py
    │       ├── test_torch_fft_ifftn.py
    │       ├── test_torch_fft_ihfft.py
    │       ├── test_torch_fft_ihfft2.py
    │       ├── test_torch_fft_ihfftn.py
    │       ├── test_torch_fft_irfft.py
    │       ├── test_torch_fft_irfft2.py
    │       ├── test_torch_fft_irfftn.py
    │       ├── test_torch_fft_rfft.py
    │       ├── test_torch_fft_rfft2.py
    │       ├── test_torch_fft_rfftn.py
    │       ├── test_torch_flatten.py
    │       ├── test_torch_flip.py
    │       ├── test_torch_floor.py
    │       ├── test_torch_full.py
    │       ├── test_torch_full_like.py
    │       ├── test_torch_gather.py
    │       ├── test_torch_ge.py
    │       ├── test_torch_gt.py
    │       ├── test_torch_imag.py
    │       ├── test_torch_index_select.py
    │       ├── test_torch_istft.py
    │       ├── test_torch_le.py
    │       ├── test_torch_lgamma.py
    │       ├── test_torch_log.py
    │       ├── test_torch_log10.py
    │       ├── test_torch_logaddexp.py
    │       ├── test_torch_logical_and.py
    │       ├── test_torch_logical_not.py
    │       ├── test_torch_logical_or.py
    │       ├── test_torch_logical_xor.py
    │       ├── test_torch_logsumexp.py
    │       ├── test_torch_lt.py
    │       ├── test_torch_masked_select.py
    │       ├── test_torch_matmul.py
    │       ├── test_torch_max.py
    │       ├── test_torch_maximum.py
    │       ├── test_torch_mean.py
    │       ├── test_torch_min.py
    │       ├── test_torch_minimum.py
    │       ├── test_torch_mm.py
    │       ├── test_torch_mv.py
    │       ├── test_torch_narrow.py
    │       ├── test_torch_ne.py
    │       ├── test_torch_neg.py
    │       ├── test_torch_norm.py
    │       ├── test_torch_ones.py
    │       ├── test_torch_ones_like.py
    │       ├── test_torch_positive.py
    │       ├── test_torch_pow.py
    │       ├── test_torch_prod.py
    │       ├── test_torch_real.py
    │       ├── test_torch_reciprocal.py
    │       ├── test_torch_repeat_interleave.py
    │       ├── test_torch_roll.py
    │       ├── test_torch_round.py
    │       ├── test_torch_rsqrt.py
    │       ├── test_torch_scatter_add.py
    │       ├── test_torch_sign.py
    │       ├── test_torch_sin.py
    │       ├── test_torch_sinh.py
    │       ├── test_torch_slice_scatter.py
    │       ├── test_torch_split.py
    │       ├── test_torch_sqrt.py
    │       ├── test_torch_square.py
    │       ├── test_torch_squeeze.py
    │       ├── test_torch_stack.py
    │       ├── test_torch_std.py
    │       ├── test_torch_stft.py
    │       ├── test_torch_sum.py
    │       ├── test_torch_t.py
    │       ├── test_torch_tan.py
    │       ├── test_torch_tanh.py
    │       ├── test_torch_tensor_split.py
    │       ├── test_torch_tile.py
    │       ├── test_torch_topk.py
    │       ├── test_torch_transpose.py
    │       ├── test_torch_trunc.py
    │       ├── test_torch_unbind.py
    │       ├── test_torch_unsqueeze.py
    │       ├── test_torch_view_as_complex.py
    │       ├── test_torch_view_as_real.py
    │       ├── test_torch_where.py
    │       ├── test_torch_zeros.py
    │       ├── test_torch_zeros_like.py
    │       ├── test_torchaudio_F_inverse_spectrogram.py
    │       ├── test_torchaudio_F_spectrogram.py
    │       ├── test_torchaudio_InverseSpectrogram.py
    │       ├── test_torchaudio_Spectrogram.py
    │       ├── test_torchvision_DeformConv2d.py
    │       ├── test_torchvision_RoIAlign.py
    │       ├── test_transformers_albert_attention.py
    │       ├── test_transformers_bart_attention.py
    │       ├── test_transformers_bert_attention.py
    │       ├── test_transformers_bert_generation_attention.py
    │       ├── test_transformers_blenderbot_attention.py
    │       ├── test_transformers_camembert_attention.py
    │       ├── test_transformers_chinese_clip_attention.py
    │       ├── test_transformers_clip_attention.py
    │       ├── test_transformers_ctrl_attention.py
    │       ├── test_transformers_deberta_attention.py
    │       ├── test_transformers_deepseek_v3_attention.py
    │       ├── test_transformers_distilbert_attention.py
    │       ├── test_transformers_electra_attention.py
    │       ├── test_transformers_flaubert_attention.py
    │       ├── test_transformers_fsmt_attention.py
    │       ├── test_transformers_funnel_attention.py
    │       ├── test_transformers_gpt2_attention.py
    │       ├── test_transformers_layoutlm_attention.py
    │       ├── test_transformers_longformer_attention.py
    │       ├── test_transformers_lxmert_attention.py
    │       ├── test_transformers_m2m_100_attention.py
    │       ├── test_transformers_marian_attention.py
    │       ├── test_transformers_mbart_attention.py
    │       ├── test_transformers_mobilebert_attention.py
    │       ├── test_transformers_mt5_attention.py
    │       ├── test_transformers_openai_attention.py
    │       ├── test_transformers_pegasus_attention.py
    │       ├── test_transformers_prophetnet_attention.py
    │       ├── test_transformers_qwen2_attention.py
    │       ├── test_transformers_qwen3_attention.py
    │       ├── test_transformers_reformer_attention.py
    │       ├── test_transformers_roberta_attention.py
    │       ├── test_transformers_squeezebert_attention.py
    │       ├── test_transformers_t5_attention.py
    │       ├── test_transformers_xlm_attention.py
    │       ├── test_transformers_xlm_roberta_attention.py
    │       ├── test_transformers_xlnet_attention.py
    │       └── test_vit_b_32.py
    ├── pytorch/
    │   └── README.md
    ├── quantize/
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── imreadwrite.cpp
    │   ├── imreadwrite.h
    │   ├── ncnn2int8.cpp
    │   ├── ncnn2table.cpp
    │   └── npy.hpp
    └── tensorflow/
        └── readme.txt

================================================
FILE CONTENTS
================================================

================================================
FILE: .astylerc
================================================
# astyle -n -r "benchmark/*.h,*.cpp" "src/*.h,*.cpp" "tests/*.h,*.cpp" "tools/*.h,*.cpp" "examples/*.h,*.cpp"

# brace style
--style=allman

# brace modify
--attach-namespaces
--attach-extern-c
--attach-closing-while

# indentation
--indent-preproc-define
--indent-col1-comments
--min-conditional-indent=0
--max-continuation-indent=120

# padding
--pad-oper
--pad-comma
--pad-header
--align-pointer=type
--align-reference=type

# formatting
--break-closing-braces
--attach-return-type
--attach-return-type-decl
--keep-one-line-blocks
--keep-one-line-statements
--convert-tabs
--max-code-length=200
--mode=c

# other
--lineend=linux


================================================
FILE: .clang-format
================================================
# find src/ tools/ tests/ examples/ benchmark/ -type f \( -name '*.c' -o -name '*.cpp' -o -name '*.h' \) | xargs -I {} clang-format -i {}

# need clang-format >= 10.0

AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
# AlignConsecutiveBitFields: true
AlignConsecutiveDeclarations: false
AlignConsecutiveMacros: true
AlignEscapedNewlines: Left
# AlignOperands: AlignAfterOperator
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: Always
AllowShortCaseLabelsOnASingleLine: true
# AllowShortEnumsOnASingleLine: true
AllowShortFunctionsOnASingleLine: None
AllowShortIfStatementsOnASingleLine: WithoutElse
AllowShortLambdasOnASingleLine: All
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
  AfterCaseLabel: true
  AfterClass: true
  AfterControlStatement: Always
  AfterEnum: true
  AfterFunction: true
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: true
  AfterUnion: true
  AfterExternBlock: false
  BeforeCatch: true
  BeforeElse: true
#  BeforeLambdaBody: false
#  BeforeWhile: false
  IndentBraces: false
  SplitEmptyFunction: true
  SplitEmptyRecord: true
  SplitEmptyNamespace: false
BreakAfterJavaFieldAnnotations: true
BreakBeforeBinaryOperators: All
BreakBeforeBraces: Custom
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeColon
BreakInheritanceList: BeforeColon
BreakStringLiterals: false
ColumnLimit: 0
# CommentPragmas:
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DeriveLineEnding: false
DerivePointerAlignment: false
# DisableFormat:
# ExperimentalAutoDetectBinPacking:
FixNamespaceComments: true
# ForEachMacros:
IncludeBlocks: Regroup
# IncludeCategories:
# IncludeIsMainRegex:
# IncludeIsMainSourceRegex:
# IndentCaseBlocks: false
IndentCaseLabels: false
# IndentExternBlock: NoIndent
IndentGotoLabels: false
IndentPPDirectives: None
IndentWidth: 4
# IndentWrappedFunctionNames: 4
# InsertTrailingCommas: None
# JavaImportGroups:
# JavaScriptQuotes
# JavaScriptWrapImports:
KeepEmptyLinesAtTheStartOfBlocks: false
Language: Cpp
# MacroBlockBegin:
# MacroBlockEnd:
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
# NamespaceMacros:
# ObjCBinPackProtocolList:
# ObjCBlockIndentWidth:
# ObjCBreakBeforeNestedBlockParam:
# ObjCSpaceAfterProperty:
# ObjCSpaceBeforeProtocolList:
# PenaltyBreakAssignment:
# PenaltyBreakBeforeFirstCallParameter:
# PenaltyBreakComment:
# PenaltyBreakFirstLessLess:
# PenaltyBreakString:
# PenaltyBreakTemplateDeclaration:
# PenaltyExcessCharacter:
# PenaltyReturnTypeOnItsOwnLine:
PointerAlignment: Left
# RawStringFormats:
ReflowComments: false
SortIncludes: false
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceBeforeSquareBrackets: false
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
SpacesInConditionalStatement: false
SpacesInContainerLiterals: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: c++03
#StatementMacros:
TabWidth: 4
# TypenameMacros:
UseCRLF: false
UseTab: Never


================================================
FILE: .gitattributes
================================================
*.comp linguist-language=GLSL


================================================
FILE: .github/ISSUE_TEMPLATE/bug.md
================================================
---
name: "\U0001F41B bug issue"
about: submit a bug report +_+
---

## error log | 日志或报错信息 | ログ

## context | 编译/运行环境 | バックグラウンド

## how to reproduce | 复现步骤 | 再現方法
1.
2.
3.

## more | 其他 | その他


================================================
FILE: .github/ISSUE_TEMPLATE/model-convert.md
================================================
---
name: "\U0001F6B8 model convert issue"
about: "Life is Short, Use pnnx and convertmodel.com"
---

## error log | 日志或报错信息 | ログ

## model | 模型 | モデル
1. original model

## how to reproduce | 复现步骤 | 再現方法
1.
2.
3.


================================================
FILE: .github/ISSUE_TEMPLATE/others.md
================================================
---
name: "\U0001F4DD others"
about: discussion, suggestion and question
---

## detail | 详细描述 | 詳細な説明


================================================
FILE: .github/ISSUE_TEMPLATE/quantization.md
================================================
---
name: "\U0001F4C8 quantization"
about: best wishes that your low-bit quantization has a low accuracy loss...\(^▽^)/...2333...
---

## expectation | 诉求 | 期待する
1. speed 
2. precision

## model | 模型 | モデル
1. model.param and model.bin

## detail | 详细描述 | 詳細な説明


================================================
FILE: .github/dependabot.yml
================================================
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "daily"


================================================
FILE: .github/labeler.yml
================================================
cmake:
- changed-files:
  - any-glob-to-any-file: ['cmake/**', 'toolchains/**']

doc: 
- changed-files:
  - any-glob-to-any-file: docs/**

python: 
- changed-files:
  - any-glob-to-any-file: python/**

example: 
- changed-files:
  - any-glob-to-any-file: examples/**

test: 
- changed-files:
  - any-glob-to-any-file: tests/**

tool: 
- changed-files:
  - any-glob-to-any-file: tools/**
pnnx: 
- changed-files:
  - any-glob-to-any-file: tools/pnnx/**

core: 
- changed-files:
  - any-glob-to-any-file: src/*
layer: 
- changed-files:
  - any-glob-to-any-file: src/layer/*

arm: 
- changed-files:
  - any-glob-to-any-file: src/layer/arm/**
loongarch: 
- changed-files:
  - any-glob-to-any-file: src/layer/loongarch/**
mips: 
- changed-files:
  - any-glob-to-any-file: src/layer/mips/**
riscv: 
- changed-files:
  - any-glob-to-any-file: src/layer/riscv/**
vulkan: 
- changed-files:
  - any-glob-to-any-file: src/layer/vulkan/**
x86: 
- changed-files:
  - any-glob-to-any-file: src/layer/x86/**


================================================
FILE: .github/workflows/android.yml
================================================
# Android CI: configures and builds ncnn for several Android ABIs via the NDK CMake toolchain file.
name: android
# Trigger on pushes and pull requests against master, but only when the workflow
# itself, the CMake build files, the listed src/ paths or the glslang path change.
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/android.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/riscv/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'glslang'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/android.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/riscv/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'glslang'
# One active run per ref: a newer run cancels the one still in progress.
concurrency:
  group: android-${{ github.ref }}
  cancel-in-progress: true
# The workflow token only gets read access to repository contents.
permissions:
  contents: read

jobs:
  # Configures and builds ncnn for each Android ABI with the NDK located at
  # $ANDROID_NDK_LATEST_HOME, first with the default library type and then with
  # -DNCNN_SHARED_LIB=ON.
  build:
    runs-on: ubuntu-latest
    env:
      # CMake arguments shared by every step; each line ends in a backslash so the
      # step-specific flags appended after the expansion continue the same command.
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake \
        -DANDROID_PLATFORM=android-21 \
        -DANDROID_SUPPORT_FLEXIBLE_PAGE_SIZES=ON \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DNCNN_VULKAN=ON \

    steps:
    # Check out the repository together with its git submodules.
    - uses: actions/checkout@v6
      with:
        submodules: true

    # Default library configuration: one out-of-tree build directory per ABI.
    - name: armeabi-v7a
      run: |
        mkdir build-armeabi-v7a && cd build-armeabi-v7a
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON ..
        cmake --build . -j $(nproc)
    - name: arm64-v8a
      run: |
        mkdir build-arm64-v8a && cd build-arm64-v8a
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="arm64-v8a" ..
        cmake --build . -j $(nproc)
    - name: x86
      run: |
        mkdir build-x86 && cd build-x86
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86" ..
        cmake --build . -j $(nproc)
    - name: x86_64
      run: |
        mkdir build-x86_64 && cd build-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86_64" ..
        cmake --build . -j $(nproc)
    - name: riscv64
      run: |
        mkdir build-riscv64 && cd build-riscv64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="riscv64" ..
        cmake --build . -j $(nproc)

    # Same ABIs again, this time with -DNCNN_SHARED_LIB=ON.
    - name: armeabi-v7a-shared
      run: |
        mkdir build-armeabi-v7a-shared && cd build-armeabi-v7a-shared
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j $(nproc)
    - name: arm64-v8a-shared
      run: |
        mkdir build-arm64-v8a-shared && cd build-arm64-v8a-shared
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="arm64-v8a" -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j $(nproc)
    - name: x86-shared
      run: |
        mkdir build-x86-shared && cd build-x86-shared
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86" -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j $(nproc)
    - name: x86_64-shared
      run: |
        mkdir build-x86_64-shared && cd build-x86_64-shared
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86_64" -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j $(nproc)
    - name: riscv64-shared
      run: |
        mkdir build-riscv64-shared && cd build-riscv64-shared
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="riscv64" -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j $(nproc)

  # Repeats the ARM builds against Android NDK r16b, which the first step downloads
  # into $GITHUB_WORKSPACE, including armeabi-v7a variants with NEON switched off.
  ndk-r16b:
    runs-on: ubuntu-latest
    env:
      # CMake arguments shared by every step; the toolchain file comes from the
      # NDK unpacked under $GITHUB_WORKSPACE/android-ndk-r16b.
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/android-ndk-r16b/build/cmake/android.toolchain.cmake \
        -DANDROID_PLATFORM=android-21 \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DNCNN_VULKAN=ON \

    steps:
    # Check out the repository together with its git submodules.
    - uses: actions/checkout@v6
      with:
        submodules: true

    # Symlinks libncurses/libtinfo .so.6 under the .so.5 names, then downloads and
    # unzips NDK r16b. NOTE(review): the symlinks are presumably needed by the old
    # NDK host binaries - confirm.
    - name: ndk-r16b
      env:
        DEBIAN_FRONTEND: noninteractive
      run: |
        pushd /usr/lib/x86_64-linux-gnu/
        sudo ln -s libncurses.so.6 libncurses.so.5
        sudo ln -s libtinfo.so.6 libtinfo.so.5
        popd
        wget -q https://dl.google.com/android/repository/android-ndk-r16b-linux-x86_64.zip -O $GITHUB_WORKSPACE/android-ndk-r16b-linux-x86_64.zip
        cd $GITHUB_WORKSPACE && unzip -q android-ndk-r16b-linux-x86_64.zip

    # Default library configuration: one out-of-tree build directory per variant.
    - name: armeabi-v7a
      run: |
        mkdir build-armeabi-v7a && cd build-armeabi-v7a
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON ..
        cmake --build . -j $(nproc)
    - name: armeabi-v7a-no-neon
      run: |
        mkdir build-armeabi-v7a-no-neon && cd build-armeabi-v7a-no-neon
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF ..
        cmake --build . -j $(nproc)
    - name: arm64-v8a
      run: |
        mkdir build-arm64-v8a && cd build-arm64-v8a
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="arm64-v8a" ..
        cmake --build . -j $(nproc)

    # Same variants again, this time with -DNCNN_SHARED_LIB=ON.
    - name: armeabi-v7a-shared
      run: |
        mkdir build-armeabi-v7a-shared && cd build-armeabi-v7a-shared
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j $(nproc)
    - name: armeabi-v7a-no-neon-shared
      run: |
        mkdir build-armeabi-v7a-no-neon-shared && cd build-armeabi-v7a-no-neon-shared
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j $(nproc)
    - name: arm64-v8a-shared
      run: |
        mkdir build-arm64-v8a-shared && cd build-arm64-v8a-shared
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="arm64-v8a" -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j $(nproc)


================================================
FILE: .github/workflows/code-format-msg.yml
================================================
# Follow-up workflow: after a code-format run finishes, its jobs add or remove a
# reminder comment on the corresponding pull request.
name: code-format-msg

# Runs whenever a run of the code-format workflow completes.
on:
  workflow_run:
    workflows: [code-format]
    types: [completed]

# NOTE(review): github.head_ref is presumably empty for workflow_run events, in
# which case the group falls back to the unique run id - confirm.
concurrency:
  group: code-format-msg-${{ github.head_ref || github.run_id }}

# Read access to repository contents; write access to pull requests, which the
# jobs use to create and delete PR comments.
permissions:
  contents: read
  pull-requests: write

jobs:
  # Looks up the pull request behind the completed code-format run and exposes its
  # number and head commit SHA as job outputs for the jobs that depend on it.
  pr-context:
    name: acquire-pr-context
    runs-on: ubuntu-latest
    # Values are read from the step output file written by the get-pr-context step.
    outputs:
      PR_HEADSHA: ${{ steps.set-pr-context.outputs.head-sha }}
      PR_NUMBER:  ${{ steps.set-pr-context.outputs.number   }}
    # Only runs when the triggering code-format run was started by a pull_request event.
    if: ${{ github.event.workflow_run.event == 'pull_request' }}
    steps:
    - name: get-pr-context
      id: set-pr-context
      env:
        GH_TOKEN: ${{ github.token }}
        PR_TARGET_REPO: ${{ github.repository }}
        # Branch selector passed to `gh pr view`: "owner:branch" when the head
        # repository owner differs from the base repository owner, otherwise just
        # the branch name.
        PR_BRANCH: |-
          ${{
            (github.event.workflow_run.head_repository.owner.login != github.event.workflow_run.repository.owner.login)
              && format('{0}:{1}', github.event.workflow_run.head_repository.owner.login, github.event.workflow_run.head_branch)
              || github.event.workflow_run.head_branch
          }}
      # Queries the PR as JSON and appends "number=..." and "head-sha=..." lines to $GITHUB_OUTPUT.
      run: |
        gh pr view --repo "${PR_TARGET_REPO}" "${PR_BRANCH}" \
          --json 'number,headRefOid' \
          --jq '"number=\(.number)\nhead-sha=\(.headRefOid)"' \
          >> $GITHUB_OUTPUT

  # Deletes the "please enable github action" reminder comment from the pull
  # request once the code-format run has concluded successfully.
  remove-comment-if-success:
    if: ${{ github.event.workflow_run.conclusion == 'success' }}
    runs-on: ubuntu-latest
    needs: [pr-context]
    env:
      PR_HEADSHA: ${{ needs.pr-context.outputs.PR_HEADSHA }}
      PR_NUMBER:  ${{ needs.pr-context.outputs.PR_NUMBER  }}
    steps:
    - name: Remove existing "format check failed" comment
      uses: actions/github-script@v8
      with:
        script: |
          const owner = context.repo.owner;
          const repo = context.repo.repo;
          // A plain listComments call only returns the first page (30 comments by
          // default), so paginate to still find the reminder on long PR threads.
          const comments = await github.paginate(github.rest.issues.listComments, {
            owner,
            repo,
            issue_number: ${{ env.PR_NUMBER }},
            per_page: 100,
          });

          // The reminder is identified by its exact message text.
          const targetComment = comments.find(comment =>
            comment.body.includes("Please enable github action in **YOUR FORKED REPO** to make code-format workflow work")
          );

          if (targetComment) {
            await github.rest.issues.deleteComment({
              owner,
              repo,
              comment_id: targetComment.id,
            });
            console.log("Removed existing code-format failure comment.");
          } else {
            console.log("No existing format failure comment to remove.");
          }

  # Posts the "please enable github action" reminder on the pull request when the
  # code-format run failed, unless the same reminder is already present.
  post-comment-if-failure:
    if: ${{ github.event.workflow_run.conclusion == 'failure' }}
    runs-on: ubuntu-latest
    needs: [pr-context]
    env:
      PR_HEADSHA: ${{ needs.pr-context.outputs.PR_HEADSHA }}
      PR_NUMBER:  ${{ needs.pr-context.outputs.PR_NUMBER  }}
    steps:
    - name: Post comment on failed code-format if not existing
      uses: actions/github-script@v8
      with:
        script: |
          const owner = context.repo.owner;
          const repo = context.repo.repo;
          // A plain listComments call only returns the first page (30 comments by
          // default); paginate so an existing reminder further down a long PR
          // thread is detected and no duplicate is posted.
          const comments = await github.paginate(github.rest.issues.listComments, {
            owner,
            repo,
            issue_number: ${{ env.PR_NUMBER }},
            per_page: 100,
          });

          // The reminder is identified by its exact message text.
          const existingComment = comments.find(comment =>
            comment.body.includes("Please enable github action in **YOUR FORKED REPO** to make code-format workflow work")
          );

          if (existingComment) {
            console.log("A code-format failure comment already exists.");
          } else {
            await github.rest.issues.createComment({
              owner,
              repo,
              issue_number: ${{ env.PR_NUMBER }},
              body: "Please enable github action in **YOUR FORKED REPO** to make code-format workflow work",
            });
            console.log("Created code-format failure comment.");
          }


================================================
FILE: .github/workflows/code-format.yml
================================================
# Code formatting workflow; its job installs astyle and builds clang-format.
name: code-format

# Runs on every push and every pull request event.
on: [push, pull_request]

# One active run per ref: a newer run cancels the one still in progress.
concurrency:
  group: code-format-${{ github.ref }}
  cancel-in-progress: true

# NOTE(review): write access to contents is presumably needed so the job can push
# formatting fixes - the step that would use it is not visible here; confirm.
permissions:
  contents: write

jobs:
  code-format:
    runs-on: ubuntu-latest
    container: ubuntu:20.04
    steps:
    - name: astyle
      run: |
        export DEBIAN_FRONTEND=noninteractive
        apt-get update -y
        apt-get install -y astyle git

    - uses: actions/checkout@v6

    - name: cache-clang-format
      id: cache-clang-format
      uses: actions/cache@v5
      with:
        path: clang-format-install
        key: clang-format-install-5
    - name: clang-format
      if: steps.cache-clang-format.outputs.cache-hit != 'true'
      run: |
        export DEBIAN_FRONTEND=noninteractive
        apt-get update -y
        apt-get install -y build-essential wget curl cmake unzip zip python3-pip
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-10.0.1/llvm-project-10.0.1.tar.xz
        tar -xf llvm-project-10.0.1.tar.xz
        cd llvm-project-10.0.1
        mkdir build
        cd build
        cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_DOCS=OFF ../llvm/
        make -j4 clang-format
        mkdir $GITHUB_WORKSPACE/clang-format-install
        cp -r bin/clang-format $GITHUB_WORKSPACE/clang-format-install
        cd ../../
        rm -rf llvm-project-10.0.1
        rm llvm-project-10.0.1.tar.xz

    - name: cache-clang-format-21
      id: cache-clang-format-21
      uses: actions/cache@v5
      with:
        path: clang-format-21-install
        key: clang-format-21-install
    - name: clang-format-21
      if: steps.cache-clang-format-21.outputs.cache-hit != 'true'
      run: |
        export DEBIAN_FRONTEND=noninteractive
        apt-get update -y
        apt-get install -y build-essential wget curl cmake unzip zip python3-pip
        pip install cmake
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-21.1.8/llvm-project-21.1.8.src.tar.xz
        tar -xf llvm-project-21.1.8.src.tar.xz
        cd llvm-project-21.1.8.src
        mkdir build
        cd build
        cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_DOCS=OFF ../llvm/
        make -j4 clang-format
        mkdir $GITHUB_WORKSPACE/clang-format-21-install
        cp -r bin/clang-format $GITHUB_WORKSPACE/clang-format-21-install
        cd ../../
        rm -rf llvm-project-21.1.8.src
        rm llvm-project-21.1.8.src.tar.xz

    - name: code-format
      run: |
        mv $GITHUB_WORKSPACE/clang-format-install/clang-format /usr/local/bin/clang-format
        rm -rf $GITHUB_WORKSPACE/clang-format-install
        sh codeformat.sh

    - name: code-format-glsl
      run: |
        mv $GITHUB_WORKSPACE/clang-format-21-install/clang-format /usr/local/bin/clang-format-21
        rm -rf $GITHUB_WORKSPACE/clang-format-21-install
        cd src/layer/vulkan/shader
        find . -type f -name '*.comp' | xargs -i clang-format-21 -i -assume-filename=main.cpp {}

    - name: configure-git-safe-directory
      run: git config --global --add safe.directory /__w/ncnn/ncnn

    - uses: stefanzweifel/git-auto-commit-action@v7
      with:
        commit_message: apply code-format changes

    - name: restore-clang-format-cache
      run: |
        mkdir $GITHUB_WORKSPACE/clang-format-install
        cp -r /usr/local/bin/clang-format $GITHUB_WORKSPACE/clang-format-install
        mkdir $GITHUB_WORKSPACE/clang-format-21-install
        cp -r /usr/local/bin/clang-format-21 $GITHUB_WORKSPACE/clang-format-21-install/clang-format


================================================
FILE: .github/workflows/codeql-analysis.yml
================================================
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
name: "CodeQL"

# Triggers: pushes and pull requests targeting master (changes that only touch
# Markdown files are ignored) plus a weekly scheduled run.
on:
  push:
    branches: [master]
    paths-ignore: ['**.md']
  pull_request:
    # The branches below must be a subset of the branches above
    branches: [master]
    paths-ignore: ['**.md']
  schedule:
    # Minute 0, hour 20, every Thursday (GitHub evaluates cron schedules in UTC).
    - cron: '0 20 * * 4'

# One run per ref; a newer run cancels the one that is still in progress.
concurrency:
  group: CodeQL-${{ github.ref }}
  cancel-in-progress: true

# Workflow-wide default; the analyze job declares its own permissions.
permissions:
  contents: read

jobs:
  analyze:
    permissions:
      actions: read  # for github/codeql-action/init to get workflow details
      contents: read  # for actions/checkout to fetch code
      security-events: write  # for github/codeql-action/autobuild to send a status report
    name: Analyze
    runs-on: ubuntu-latest

    strategy:
      fail-fast: false
      matrix:
        # Override automatic language detection by changing the below list
        # Supported options are ['csharp', 'cpp', 'go', 'java', 'javascript', 'python']
        language: ['cpp']
        # Learn more...
        # https://docs.github.com/en/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#overriding-automatic-language-detection

    steps:
    # Use the default checkout. For pull requests this is the merge commit,
    # which is what code scanning expects to analyze; the former
    # `fetch-depth: 2` + `git checkout HEAD^2` pair is obsolete and makes the
    # CodeQL action emit a "git checkout HEAD^2 is no longer necessary" warning.
    - name: Checkout repository
      uses: actions/checkout@v6

    # Initializes the CodeQL tools for scanning.
    - name: Initialize CodeQL
      uses: github/codeql-action/init@v4
      with:
        languages: ${{ matrix.language }}
        # If you wish to specify custom queries, you can do so here or in a config file.
        # By default, queries listed here will override any specified in a config file. 
        # Prefix the list here with "+" to use these queries and those in the config file.
        # queries: ./path/to/local/query, your-org/your-repo/queries@main

    # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
    # If this step fails, then you should remove it and run the build manually (see below)
    - name: Autobuild
      uses: github/codeql-action/autobuild@v4

    # ℹ️ Command-line programs to run using the OS shell.
    # 📚 https://git.io/JvXDl

    # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
    #    and modify them (or add more) to build your code if your project
    #    uses a compiled language

    #- run: |
    #   make bootstrap
    #   make release

    - name: Perform CodeQL Analysis
      uses: github/codeql-action/analyze@v4


================================================
FILE: .github/workflows/compare-binary-size-pr-comment.yml
================================================
# Companion to the compare-binary-size workflow: starts when a run of that
# workflow completes and publishes its uploaded result as a PR comment.
name: compare-binary-size-pr-comment
on:
  workflow_run:
    workflows: ["compare-binary-size"]
    types:
      - completed

permissions:
  actions: read
  contents: read
  pull-requests: write

jobs:
  pr-comment:
    runs-on: ubuntu-latest
    steps:
    # jq and unzip are used by the steps below to parse API responses and to
    # unpack the downloaded artifact.
    - name: Setup tools
      run: |
        sudo apt-get update
        sudo apt-get install -y jq unzip

    # Sets the step output `skip`: 'true' when the triggering run has no
    # associated pull request, 'false' otherwise. Every later step is gated on it.
    - name: Ensure workflow_run is for a PR
      id: validate
      run: |
        # Use the event payload file provided by GitHub Actions directly
        echo "Using event payload from: $GITHUB_EVENT_PATH"
        echo "Event file size: $(wc -c < "$GITHUB_EVENT_PATH") bytes"

        # Safely compute number of associated PRs (use // 0 to default if missing)
        PR_COUNT=$(jq -r '.workflow_run.pull_requests | length // 0' "$GITHUB_EVENT_PATH")
        echo "Associated pull_request count: $PR_COUNT"

        if [ "$PR_COUNT" -eq 0 ]; then
          echo "No pull_request associated with this workflow_run; nothing to do."
          echo "skip=true" >> $GITHUB_OUTPUT
          exit 0
        fi

        echo "skip=false" >> $GITHUB_OUTPUT

    # Looks up the artifact named $ART_NAME among the artifacts of the
    # triggering run via the REST API, downloads its zip archive and unpacks it
    # into /tmp/artifact_contents. A missing artifact ends the step successfully
    # (exit 0) without creating that directory; an unparsable artifact list or a
    # failed unzip fails the step.
    - name: Download artifact zip for this run
      if: steps.validate.outputs.skip != 'true'
      env:
        RUN_ID: ${{ github.event.workflow_run.id }}
        OWNER: ${{ github.repository_owner }}
        REPO: ${{ github.repository }}
        TOKEN: ${{ secrets.COMMENTER_PAT }}
        ART_NAME: "compare-binary-size.md"
      run: |
        echo "Listing artifacts for run $RUN_ID"
        API="https://api.github.com/repos/$OWNER/${REPO#*/}/actions/runs/$RUN_ID/artifacts"

        # Save artifact list to a file (avoid pipe/echo issues)
        curl -s -H "Authorization: token $TOKEN" "$API" -o /tmp/art_list.json
        echo "Art list size: $(wc -c < /tmp/art_list.json) bytes"
        if ! jq . /tmp/art_list.json; then
          echo "Failed to parse /tmp/art_list.json with jq; aborting for safety."
          exit 1
        fi

        # find artifact archive_download_url by name (first match)
        ARCHIVE_URL=$(jq -r --arg name "$ART_NAME" '.artifacts[] | select(.name==$name) | .archive_download_url' /tmp/art_list.json | head -n1)
        if [ -z "$ARCHIVE_URL" ] || [ "$ARCHIVE_URL" = "null" ]; then
          echo "Artifact named '$ART_NAME' not found for run $RUN_ID. Exiting."
          exit 0
        fi
        echo "Downloading artifact from: $ARCHIVE_URL"

        # download and unzip to temp dir
        mkdir -p /tmp/artifact_contents
        curl -L -H "Authorization: token $TOKEN" -o /tmp/artifact.zip "$ARCHIVE_URL"
        if ! unzip -q /tmp/artifact.zip -d /tmp/artifact_contents; then
          echo "Failed to unzip /tmp/artifact.zip"; exit 1
        fi
        ls -la /tmp/artifact_contents

    # Exposes the report text (at most its first 1000 lines) as the multi-line
    # step output `compare_content`; falls back to a fixed notice when no
    # Markdown file is present in the unpacked artifact.
    - name: Read compare-binary-size.md content
      if: steps.validate.outputs.skip != 'true'
      id: read
      run: |
        # find file inside artifact_contents
        FILE=$(find /tmp/artifact_contents -type f -name "compare-binary-size.md" | head -n1 || true)
        if [ -z "$FILE" ]; then
          # If artifact name matched but internal filename differs, try any .md
          FILE=$(find /tmp/artifact_contents -type f -name "*.md" | head -n1 || true)
        fi

        # The artifact was produced by a pull request build, so its content is
        # untrusted. Use an unpredictable delimiter: with a fixed one, a line
        # equal to the delimiter would end the value early and let the rest of
        # the file define arbitrary additional step outputs.
        DELIM="EOF_$(od -An -N16 -tx1 /dev/urandom | tr -d ' \n')"

        {
          echo "compare_content<<$DELIM"
          if [ -z "$FILE" ]; then
            echo "No compare-binary-size.md found in artifact."
          else
            # Truncate to avoid overly long comments (adjust lines as needed).
            # awk re-terminates every line, so the closing delimiter always
            # starts on its own line even if the file lacks a final newline.
            awk 'NR > 1000 { exit } { print }' "$FILE" || true
          fi
          echo "$DELIM"
        } >> "$GITHUB_OUTPUT"

    # Creates the bot comment on the first associated pull request, or updates
    # it in place when a comment carrying the hidden marker already exists.
    - name: Post or update PR comment via actions/github-script
      if: steps.validate.outputs.skip != 'true'
      uses: actions/github-script@v8
      with:
        github-token: ${{ secrets.COMMENTER_PAT }}
        script: |
          const pr = context.payload.workflow_run.pull_requests[0];
          if (!pr) {
            core.info("No pull request found in workflow_run payload; skipping.");
            return;
          }

          const owner = context.repo.owner;
          const repo = context.repo.repo;
          const issue_number = pr.number;
          const marker = '<!-- compare-binary-size-bot -->';

          // Read the compare content from env (set in previous step outputs)
          const compare = process.env.COMPARE_CONTENT || "";

          const body = `${marker}\n**Binary size comparison** (from artifact)\n\n\`\`\`markdown\n${compare}\n\`\`\``;

          // Fetch every page of comments: with a single page of 100 the marker
          // comment is missed on busy pull requests and a duplicate is posted
          // on each run instead of updating the existing one.
          const comments = await github.paginate(github.rest.issues.listComments, {
            owner,
            repo,
            issue_number,
            per_page: 100
          });

          // Find our bot comment (by marker)
          const existing = comments.find(c => c.body && c.body.includes(marker));

          if (existing) {
            await github.rest.issues.updateComment({
              owner,
              repo,
              comment_id: existing.id,
              body
            });
            core.info(`Updated comment id=${existing.id}`);
          } else {
            await github.rest.issues.createComment({
              owner,
              repo,
              issue_number,
              body
            });
            core.info("Created new comment");
          }
      env:
        # pass the content from previous step into the github-script environment
        COMPARE_CONTENT: ${{ steps.read.outputs.compare_content }}


================================================
FILE: .github/workflows/compare-binary-size.yml
================================================
# Builds libncnn.so from both the pull request and its base branch for three
# architectures and uploads a Markdown table with the size difference.
name: compare-binary-size
on:
  pull_request:
    branches: [master]
    # Only pull requests touching build-relevant paths trigger this workflow.
    paths:
    - '.github/workflows/compare-binary-size.yml'
    - 'toolchains/**'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/**'
    - 'glslang'

# One run per ref; a newer run cancels the one that is still in progress.
concurrency:
  group: compare-binary-size-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read
  actions: read

jobs:
  compare-size:
    runs-on: ubuntu-latest
    steps:
    # The pull request's merge ref is checked out into ./pr ...
    - name: checkout-pr-branch
      uses: actions/checkout@v6
      with:
        ref: refs/pull/${{ github.event.pull_request.number }}/merge
        submodules: true
        path: pr

    # ... and the base branch of the pull request into ./base.
    - name: checkout-base-branch
      uses: actions/checkout@v6
      with:
        ref: ${{ github.event.pull_request.base.ref }}
        repository: ${{ github.event.pull_request.base.repo.full_name }}
        submodules: true
        path: base

    # Cross compilers for the armhf and aarch64 builds.
    - name: install-toolchain
      run: |
        sudo apt-get update
        sudo apt-get install g++-arm-linux-gnueabihf g++-aarch64-linux-gnu

    # For each architecture: configure and build in pr/build_<arch> and
    # base/build_<arch> with identical options, read the byte size of the
    # resolved src/libncnn.so, and append one table row to
    # compare-binary-size.md. A positive difference is flagged with :warning:,
    # zero or negative with :kissing_heart:.
    - name: compare-sizes
      env:
        COMMON_CMAKE_ARGS: -DNCNN_SHARED_LIB=ON -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF
      run: |
        # define target architectures
        archs=("x86_64" "armhf" "aarch64")

        # generate table
        echo "The binary size change of libncnn.so (bytes)" >> compare-binary-size.md
        echo "| architecture | base size | pr size | difference |" >> compare-binary-size.md
        echo "|--------------|-----------|---------|------------|" >> compare-binary-size.md

        for arch in "${archs[@]}"; do

          mkdir -p pr/build_$arch
          pushd pr/build_$arch
          if [ "$arch" = "armhf" ]; then
            cmake ${{env.COMMON_CMAKE_ARGS}} -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake ..
          elif [ "$arch" = "aarch64" ]; then
            cmake ${{env.COMMON_CMAKE_ARGS}} -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake ..
          else
            cmake ${{env.COMMON_CMAKE_ARGS}} ..
          fi
          cmake --build . -j $(nproc)
          PR_SIZE=$(stat -c%s $(readlink -f src/libncnn.so))
          popd

          mkdir -p base/build_$arch
          pushd base/build_$arch
          if [ "$arch" = "armhf" ]; then
            cmake ${{env.COMMON_CMAKE_ARGS}} -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake ..
          elif [ "$arch" = "aarch64" ]; then
            cmake ${{env.COMMON_CMAKE_ARGS}} -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake ..
          else
            cmake ${{env.COMMON_CMAKE_ARGS}} ..
          fi
          cmake --build . -j $(nproc)
          BASE_SIZE=$(stat -c%s $(readlink -f src/libncnn.so))
          popd

          DIFF=$(($PR_SIZE - $BASE_SIZE))
          if [ $DIFF -gt 0 ]; then
            DIFF_STR="+$DIFF :warning:"
          else
            DIFF_STR="$DIFF :kissing_heart:"
          fi

          echo "| $arch | $BASE_SIZE | $PR_SIZE | $DIFF_STR |" >> compare-binary-size.md
        done

        cat compare-binary-size.md

    # The report is uploaded as an artifact named compare-binary-size.md.
    - name: upload-compare-binary-size-md
      uses: actions/upload-artifact@v6
      with:
        name: compare-binary-size.md
        path: compare-binary-size.md


================================================
FILE: .github/workflows/elf-riscv32.yml
================================================
# Bare-metal (ELF) RISC-V 32-bit build and test on a self-hosted runner.
name: elf-riscv32
on:
  # push and pull_request use the same path filter. 'src/*' and 'src/layer/*'
  # match only files directly inside those directories; 'src/layer/riscv/**',
  # 'cmake/**' and 'tests/**' match recursively.
  push:
    branches: [master]
    paths:
    - '.github/workflows/elf-riscv32.yml'
    - 'toolchains/riscv32-unknown-elf.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/riscv/**'
    - 'tests/**'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/elf-riscv32.yml'
    - 'toolchains/riscv32-unknown-elf.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/riscv/**'
    - 'tests/**'
# One run per ref; a newer run cancels the one that is still in progress.
concurrency:
  group: elf-riscv32-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  rv32gc:
    # Requires a self-hosted runner carrying all three labels.
    runs-on: [self-hosted, linux, centos]
    steps:
    - uses: actions/checkout@v6

    #- name: riscv-gnu-toolchain
      #run: |
        #wget -c https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2025.01.20/riscv32-elf-ubuntu-22.04-gcc-nightly-2025.01.20-nightly.tar.xz
        #tar -xf riscv32-elf-ubuntu-22.04-gcc-nightly-2025.01.20-nightly.tar.xz
        #mv riscv riscv32-elf

    #- name: checkout-riscv-pk
      #uses: actions/checkout@v6
      #with:
        #repository: riscv/riscv-pk
        #path: riscv-pk
        #ref: d8659a4e8e888bdc9caf840ad17bfe83239b1d64
    #- name: riscv-pk
      #run: |
        #cd riscv-pk
        #mkdir build && cd build
        #export PATH=$GITHUB_WORKSPACE/riscv32-elf/bin:$PATH
        #export CFLAGS="-O3"
        #export CXXFLAGS="-O3"
        #../configure --prefix=$GITHUB_WORKSPACE/riscv32-elf --with-arch=rv32gc_zicsr_zifencei --host=riscv32-unknown-elf --with-abi=ilp32d
        #make -j4
        #make install

    #- name: checkout-riscv-isa-sim
      #uses: actions/checkout@v6
      #with:
        #repository: riscv-software-src/riscv-isa-sim
        #path: riscv-isa-sim
        #ref: 5ef9a61f5fecdb9bf77da155172c8018ce820308
    #- name: riscv-isa-sim
      #run: |
        #cd riscv-isa-sim
        #mkdir build && cd build
        #export PATH=$GITHUB_WORKSPACE/riscv32-elf/bin:$PATH
        #export CFLAGS="-O3"
        #export CXXFLAGS="-O3"
        #../configure --prefix=$GITHUB_WORKSPACE/riscv32-elf
        #make -j4
        #make install

    #- name: riscv-strip-install
      #run: find $GITHUB_WORKSPACE/riscv32-elf -type f | xargs -i strip -g {} || true

    # Configure and build with the riscv32 ELF toolchain file. The toolchain is
    # taken from a fixed path on the runner (RISCV_ROOT_PATH). Threads, OpenMP
    # and the RVV / XTheadVector / ZFH / ZVFH options are switched off; tests
    # are enabled.
    - name: build
      run: |
        export RISCV_ROOT_PATH=/data/action/osd/riscv32-elf
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv32-unknown-elf.toolchain.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_RVV=OFF -DNCNN_XTHEADVECTOR=OFF -DNCNN_ZFH=OFF -DNCNN_ZVFH=OFF ..
        cmake --build . -j 4

    # Run ctest with `spike` named as the executable loader and the rv32gc ISA
    # plus the pk binary passed as loader arguments.
    # NOTE(review): the TESTS_EXECUTABLE_LOADER* variables are presumably read
    # by the tests' CMake setup — confirm.
    - name: test
      run: |
        export PATH=/data/action/osd/riscv32-elf/bin:$PATH
        cd build
        TESTS_EXECUTABLE_LOADER=spike TESTS_EXECUTABLE_LOADER_ARGUMENTS="--isa=rv32gc;/data/action/osd/riscv32-elf/riscv32-unknown-elf/bin/pk" ctest --output-on-failure -j 4


================================================
FILE: .github/workflows/elf-riscv64.yml
================================================
# Bare-metal (ELF) RISC-V 64-bit build and test on a self-hosted runner.
name: elf-riscv64
on:
  # push and pull_request use the same path filter. 'src/*' and 'src/layer/*'
  # match only files directly inside those directories; 'src/layer/riscv/**',
  # 'cmake/**' and 'tests/**' match recursively.
  push:
    branches: [master]
    paths:
    - '.github/workflows/elf-riscv64.yml'
    - 'toolchains/riscv64-unknown-elf.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/riscv/**'
    - 'tests/**'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/elf-riscv64.yml'
    - 'toolchains/riscv64-unknown-elf.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/riscv/**'
    - 'tests/**'
# One run per ref; a newer run cancels the one that is still in progress.
concurrency:
  group: elf-riscv64-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  rv64gc:
    # Requires a self-hosted runner carrying all three labels.
    runs-on: [self-hosted, linux, centos]
    steps:
    - uses: actions/checkout@v6

    #- name: riscv-gnu-toolchain
      #run: |
        #wget -c https://github.com/riscv-collab/riscv-gnu-toolchain/releases/download/2025.01.20/riscv64-elf-ubuntu-22.04-gcc-nightly-2025.01.20-nightly.tar.xz
        #tar -xf riscv64-elf-ubuntu-22.04-gcc-nightly-2025.01.20-nightly.tar.xz
        #mv riscv riscv64-elf

    #- name: checkout-riscv-pk
      #uses: actions/checkout@v6
      #with:
        #repository: riscv/riscv-pk
        #path: riscv-pk
        #ref: d8659a4e8e888bdc9caf840ad17bfe83239b1d64
    #- name: riscv-pk
      #run: |
        #cd riscv-pk
        #mkdir build && cd build
        #export PATH=$GITHUB_WORKSPACE/riscv64-elf/bin:$PATH
        #export CFLAGS="-O3"
        #export CXXFLAGS="-O3"
        #../configure --prefix=$GITHUB_WORKSPACE/riscv64-elf --with-arch=rv64gc_zicsr_zifencei --host=riscv64-unknown-elf --with-abi=lp64d
        #make -j4
        #make install

    #- name: checkout-riscv-isa-sim
      #uses: actions/checkout@v6
      #with:
        #repository: riscv-software-src/riscv-isa-sim
        #path: riscv-isa-sim
        #ref: 5ef9a61f5fecdb9bf77da155172c8018ce820308
    #- name: riscv-isa-sim
      #run: |
        #cd riscv-isa-sim
        #mkdir build && cd build
        #export PATH=$GITHUB_WORKSPACE/riscv64-elf/bin:$PATH
        #export CFLAGS="-O3"
        #export CXXFLAGS="-O3"
        #../configure --prefix=$GITHUB_WORKSPACE/riscv64-elf
        #make -j4
        #make install

    #- name: riscv-strip-install
      #run: find $GITHUB_WORKSPACE/riscv64-elf -type f | xargs -i strip -g {} || true

    # Configure and build with the riscv64 ELF toolchain file. The toolchain is
    # taken from a fixed path on the runner (RISCV_ROOT_PATH). Threads, OpenMP
    # and XTheadVector are switched off; tests are enabled.
    - name: build
      run: |
        export RISCV_ROOT_PATH=/data/action/osd/riscv64-elf
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-elf.toolchain.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_XTHEADVECTOR=OFF ..
        cmake --build . -j 4

    # Run ctest with `spike` named as the executable loader and the rv64gc ISA
    # plus the pk binary passed as loader arguments.
    # NOTE(review): the TESTS_EXECUTABLE_LOADER* variables are presumably read
    # by the tests' CMake setup — confirm.
    - name: test
      run: |
        export PATH=/data/action/osd/riscv64-elf/bin:$PATH
        cd build
        TESTS_EXECUTABLE_LOADER=spike TESTS_EXECUTABLE_LOADER_ARGUMENTS="--isa=rv64gc;/data/action/osd/riscv64-elf/riscv64-unknown-elf/bin/pk" ctest --output-on-failure -j 4


================================================
FILE: .github/workflows/esp32.yml
================================================
# Cross-builds and installs ncnn with the ESP32 toolchain file, using an
# ESP-IDF checkout (release/v5.3) installed during the job.
name: ESP32
on:
  # 'src/*' and 'src/layer/*' match only files directly inside those
  # directories; 'cmake/**' matches recursively.
  push:
    branches: [master]
    paths:
      - '.github/workflows/esp32.yml'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'src/*'
      - 'src/layer/*'
  pull_request:
    branches: [master]
    paths:
      - '.github/workflows/esp32.yml'
      - 'CMakeLists.txt'
      - 'cmake/**'
      - 'src/*'
      - 'src/layer/*'

# One run per ref; a newer run cancels the one that is still in progress.
concurrency:
  group: esp32-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read

jobs:
  build:
    name: ESP32
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true

      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.8'

      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y cmake ninja-build ccache
            
      # ESP-IDF is cloned into esp-idf-install inside the workspace.
      - name: Checkout ESP-IDF
        uses: actions/checkout@v6
        with:
          repository: espressif/esp-idf
          path: esp-idf-install
          ref: release/v5.3
          
      # Fetch ESP-IDF's submodules and run its installer script.
      - name: Install ESP-IDF
        run: |
          cd esp-idf-install
          git submodule update --init --recursive
          ./install.sh

      # export.sh is sourced in the same step as the build so that its
      # environment is in effect for cmake and make. IDF_PATH and two IDF
      # directories are also written to GITHUB_ENV / GITHUB_PATH, which only
      # affects steps that run after this one.
      - name: Set environment and build NCNN for ESP32
        run: |
          source esp-idf-install/export.sh
          echo "IDF_PATH=$IDF_PATH" >> $GITHUB_ENV
          echo "${IDF_PATH}/tools" >> $GITHUB_PATH
          echo "${IDF_PATH}/components" >> $GITHUB_PATH
          mkdir -p build-esp32 && cd build-esp32
          cmake -DCMAKE_TOOLCHAIN_FILE="../toolchains/esp32.toolchain.cmake" -DCMAKE_BUILD_TYPE=Release -DNCNN_BUILD_EXAMPLES=OFF ..
          make -j 4
          make install


================================================
FILE: .github/workflows/harmonyos.yml
================================================
# Builds ncnn for HarmonyOS (static and shared, three architectures) with the
# OHOS SDK that is expected at a fixed path on the self-hosted runner.
name: harmonyos
on:
  # 'src/*' and 'src/layer/*' match only files directly inside those
  # directories; the '/**' patterns match recursively.
  # NOTE(review): 'glslang' is presumably the submodule entry — confirm.
  push:
    branches: [master]
    paths:
    - '.github/workflows/harmonyos.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'glslang'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/harmonyos.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'glslang'
# One run per ref; a newer run cancels the one that is still in progress.
concurrency:
  group: harmonyos-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  build:
    runs-on: [self-hosted, linux, centos]

    # OHOS_NDK_CMAKE is the cmake binary shipped inside the SDK; every build
    # step invokes it instead of a system cmake.
    # NCNN_CMAKE_OPTIONS is pasted verbatim into each command line through
    # ${{ env.NCNN_CMAKE_OPTIONS }}. Each of its lines ends with a backslash,
    # including the last one, so the shell joins them with the arguments that
    # follow the expression in the step.
    env:
      OHOS_NDK_HOME: /data/action/osd/ohos-sdk/linux/native
      OHOS_NDK_CMAKE: /data/action/osd/ohos-sdk/linux/native/build-tools/cmake/bin/cmake
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=/data/action/osd/ohos-sdk/linux/native/build/cmake/ohos.toolchain.cmake \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DNCNN_SIMPLEOMP=ON \
        -DNCNN_VULKAN=ON \

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true

    # - name: setup-sdk
    #   run: |
    #     cd /data/action/osd
    #     wget -q https://repo.huaweicloud.com/harmonyos/os/4.1.1-Release/ohos-sdk-windows_linux-public.tar.gz
    #     tar -xf ohos-sdk-windows_linux-public.tar.gz
    #     cd ohos-sdk/linux
    #     unzip -q native-linux-x64-4.1.7.8-Release.zip

    # Static-library builds, one build directory per OHOS_ARCH.
    - name: armeabi-v7a
      run: |
        mkdir build-armeabi-v7a && cd build-armeabi-v7a
        ${{ env.OHOS_NDK_CMAKE }} ${{ env.NCNN_CMAKE_OPTIONS }} -DOHOS_ARCH="armeabi-v7a" ..
        ${{ env.OHOS_NDK_CMAKE }} --build . -j 4
    - name: arm64-v8a
      run: |
        mkdir build-arm64-v8a && cd build-arm64-v8a
        ${{ env.OHOS_NDK_CMAKE }} ${{ env.NCNN_CMAKE_OPTIONS }} -DOHOS_ARCH="arm64-v8a" ..
        ${{ env.OHOS_NDK_CMAKE }} --build . -j 4
    - name: x86_64
      run: |
        mkdir build-x86_64 && cd build-x86_64
        ${{ env.OHOS_NDK_CMAKE }} ${{ env.NCNN_CMAKE_OPTIONS }} -DOHOS_ARCH="x86_64" ..
        ${{ env.OHOS_NDK_CMAKE }} --build . -j 4

    # The same three architectures again with -DNCNN_SHARED_LIB=ON.
    - name: armeabi-v7a-shared
      run: |
        mkdir build-armeabi-v7a-shared && cd build-armeabi-v7a-shared
        ${{ env.OHOS_NDK_CMAKE }} ${{ env.NCNN_CMAKE_OPTIONS }} -DOHOS_ARCH="armeabi-v7a" -DNCNN_SHARED_LIB=ON ..
        ${{ env.OHOS_NDK_CMAKE }} --build . -j 4
    - name: arm64-v8a-shared
      run: |
        mkdir build-arm64-v8a-shared && cd build-arm64-v8a-shared
        ${{ env.OHOS_NDK_CMAKE }} ${{ env.NCNN_CMAKE_OPTIONS }} -DOHOS_ARCH="arm64-v8a" -DNCNN_SHARED_LIB=ON ..
        ${{ env.OHOS_NDK_CMAKE }} --build . -j 4
    - name: x86_64-shared
      run: |
        mkdir build-x86_64-shared && cd build-x86_64-shared
        ${{ env.OHOS_NDK_CMAKE }} ${{ env.NCNN_CMAKE_OPTIONS }} -DOHOS_ARCH="x86_64" -DNCNN_SHARED_LIB=ON ..
        ${{ env.OHOS_NDK_CMAKE }} --build . -j 4


================================================
FILE: .github/workflows/ios.yml
================================================
name: ios
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/ios.yml'
    - 'toolchains/ios.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'glslang'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/ios.yml'
    - 'toolchains/ios.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'glslang'
concurrency:
  group: ios-${{ github.ref }}
  cancel-in-progress: true
env:
  DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer
  IOS_DEPLOYMENT_TARGET: '13.0'
  ENABLE_BITCODE: OFF
  ENABLE_ARC: OFF
  ENABLE_VISIBILITY: OFF
permissions:
  contents: read

jobs:
  build:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \

      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VULKAN=ON \

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true

    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-ios-install-20251004
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    - name: openmp-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=OS64 -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install
    - name: openmp-simulator-x86_64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-simulator-x86_64 && cd build-simulator-x86_64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR64 -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install
    - name: openmp-simulator-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-simulator-arm64 && cd build-simulator-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATORARM64 -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install
    # Assemble $GITHUB_WORKSPACE/openmp-install from the per-arch installs:
    #   ios/            headers + libomp.a from the arm64 device build
    #   ios-simulator/  headers from the x86_64 simulator build, plus a libomp.a
    #                   that lipo merges from the x86_64 and arm64 simulator builds
    # Skipped on a cache hit, like the build steps that produce its inputs.
    - name: openmp-merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/ios
        mkdir -p $GITHUB_WORKSPACE/openmp-install/ios-simulator

        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/include $GITHUB_WORKSPACE/openmp-install/ios
        mkdir -p $GITHUB_WORKSPACE/openmp-install/ios/lib
        cp openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a $GITHUB_WORKSPACE/openmp-install/ios/lib/libomp.a

        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-x86_64/install/include $GITHUB_WORKSPACE/openmp-install/ios-simulator
        mkdir -p $GITHUB_WORKSPACE/openmp-install/ios-simulator/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-x86_64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-arm64/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/ios-simulator/lib/libomp.a

    # Copy the OpenMP headers and libomp.a from openmp-install into the
    # iPhoneOS and iPhoneSimulator SDKs under $DEVELOPER_DIR. This step has no
    # `if:` guard, so it runs on every job execution.
    - name: install-openmp
      run: |
        sudo cp $GITHUB_WORKSPACE/openmp-install/ios/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include
        sudo cp $GITHUB_WORKSPACE/openmp-install/ios/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib

        sudo cp $GITHUB_WORKSPACE/openmp-install/ios-simulator/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include
        sudo cp $GITHUB_WORKSPACE/openmp-install/ios-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib

    # Configure and build ncnn for the iOS device target (PLATFORM=OS64, arm64).
    - name: arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=OS64 -DARCHS="arm64" ..
        cmake --build . -j 4
    # Configure and build ncnn for the x86_64 iOS simulator (PLATFORM=SIMULATOR64).
    - name: simulator-x86_64
      run: |
        mkdir build-simulator-x86_64 && cd build-simulator-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR64 -DARCHS="x86_64" ..
        cmake --build . -j 4
    # Configure and build ncnn for the arm64 iOS simulator (PLATFORM=SIMULATORARM64).
    - name: simulator-arm64
      run: |
        mkdir build-simulator-arm64 && cd build-simulator-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATORARM64 -DARCHS="arm64" ..
        cmake --build . -j 4


================================================
FILE: .github/workflows/labeler.yml
================================================
# Applies labels to pull requests via actions/labeler.
name: labeler
# NOTE(review): pull_request_target (rather than pull_request) is what lets the
# write-scoped token below be used for PRs coming from forks; no PR code is
# checked out or executed by this workflow.
on: [pull_request_target]

# Minimal token scope: read the repository, write labels onto pull requests.
permissions:
  contents: read
  pull-requests: write

jobs:
  label:
    runs-on: ubuntu-latest
    steps:
    # No `with:` inputs are given, so the action runs with its defaults
    # (presumably reading .github/labeler.yml; confirm against the action docs).
    - uses: actions/labeler@v6


================================================
FILE: .github/workflows/linux-aarch64.yml
================================================
name: linux-aarch64
# Run for pushes to master and pull requests targeting master, limited to
# changes under the paths below (this workflow, the aarch64 toolchain file,
# the CMake build, core/arm sources and tests).
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/linux-aarch64.yml'
    - 'toolchains/aarch64-linux-gnu.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'tests/**'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/linux-aarch64.yml'
    - 'toolchains/aarch64-linux-gnu.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'tests/**'
# One active run per ref: a newer run cancels the one still in progress.
concurrency:
  group: linux-aarch64-${{ github.ref }}
  cancel-in-progress: true
# Read-only repository token.
permissions:
  contents: read

jobs:
  # Native build + ctest on the ubuntu-24.04-arm runner in three configurations:
  # default options, NCNN_INT8=OFF, and NCNN_SIMPLESTL + NCNN_SIMPLEMATH.
  # Each configuration uses its own build directory.
  aarch64-native:
    runs-on: ubuntu-24.04-arm
    steps:
    - uses: actions/checkout@v6

    - name: build
      run: |
        mkdir build && cd build
        cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test
      run: cd build && ctest --output-on-failure -j $(nproc)

    - name: build-noint8
      run: |
        mkdir build-noint8 && cd build-noint8
        cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF ..
        cmake --build . -j $(nproc)
    - name: test-noint8
      run: cd build-noint8 && ctest --output-on-failure -j $(nproc)

    - name: build-simplestl-simplemath
      run: |
        mkdir build-simplestl-simplemath && cd build-simplestl-simplemath 
        cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEMATH=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . -j $(nproc)
    - name: test-simplestl-simplemath
      run: cd build-simplestl-simplemath && ctest --output-on-failure -j $(nproc)

  # AddressSanitizer run on the arm64 runner: shared-library build with
  # NCNN_ASAN=ON and CMAKE_BUILD_TYPE=relwithdebinfo, followed by ctest.
  asan:
    runs-on: ubuntu-24.04-arm
    steps:
    - uses: actions/checkout@v6
    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=relwithdebinfo -DNCNN_ASAN=ON -DNCNN_BUILD_TESTS=ON -DNCNN_SHARED_LIB=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . -j $(nproc)
    - name: test
      run: |
        cd build
        ctest --output-on-failure -j $(nproc)

  # Cross-compile once with g++-aarch64-linux-gnu on the ubuntu-24.04 runner,
  # then run the same ctest suite under qemu-aarch64-static once per emulated
  # CPU model (cortex-a53/a55/a72/a76/a710 and "max").
  aarch64:
    runs-on: ubuntu-24.04
    steps:
    - uses: actions/checkout@v6

    - name: aarch64-gnu-toolchain
      run: |
        sudo apt-get update
        sudo apt-get install g++-aarch64-linux-gnu qemu-user-static

    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)

    # The test steps differ only in the value passed after "-cpu".
    # TESTS_EXECUTABLE_LOADER_ARGUMENTS is written as a ';'-separated list
    # (presumably consumed as a CMake list by the test harness; confirm in tests/).
    - name: test-a53
      run: cd build && TESTS_EXECUTABLE_LOADER=qemu-aarch64-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu;-cpu;cortex-a53" ctest --output-on-failure -j $(nproc)

    - name: test-a55
      run: cd build && TESTS_EXECUTABLE_LOADER=qemu-aarch64-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu;-cpu;cortex-a55" ctest --output-on-failure -j $(nproc)

    - name: test-a72
      run: cd build && TESTS_EXECUTABLE_LOADER=qemu-aarch64-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu;-cpu;cortex-a72" ctest --output-on-failure -j $(nproc)

    - name: test-a76
      run: cd build && TESTS_EXECUTABLE_LOADER=qemu-aarch64-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu;-cpu;cortex-a76" ctest --output-on-failure -j $(nproc)

    - name: test-a710
      run: cd build && TESTS_EXECUTABLE_LOADER=qemu-aarch64-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu;-cpu;cortex-a710" ctest --output-on-failure -j $(nproc)

    - name: test-max
      run: cd build && TESTS_EXECUTABLE_LOADER=qemu-aarch64-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu;-cpu;max" ctest --output-on-failure -j $(nproc)


================================================
FILE: .github/workflows/linux-arm.yml
================================================
name: linux-arm
# Run for pushes to master and pull requests targeting master, limited to
# changes under the paths below. Every toolchain file consumed by a job in
# this workflow must be listed in BOTH path filters; the vfpv3-d16 toolchain
# (used by the armhf-vfpv3-d16 job) was previously missing, so edits to it
# never triggered this workflow.
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/linux-arm.yml'
    - 'toolchains/arm-linux-gnueabi.toolchain.cmake'
    - 'toolchains/arm-linux-gnueabihf.toolchain.cmake'
    - 'toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'tests/**'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/linux-arm.yml'
    - 'toolchains/arm-linux-gnueabi.toolchain.cmake'
    - 'toolchains/arm-linux-gnueabihf.toolchain.cmake'
    - 'toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'tests/**'
concurrency:
  group: linux-arm-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # Soft-float ARM (arm-linux-gnueabi): cross-compile, then run ctest under
  # qemu-arm-static, once with default options and once with NCNN_INT8=OFF.
  arm:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6

    - name: arm-gnu-toolchain
      run: |
        sudo apt-get update
        sudo apt-get install g++-arm-linux-gnueabi qemu-user-static

    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-arm-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j $(nproc)

    - name: build-noint8
      run: |
        mkdir build-noint8 && cd build-noint8
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF ..
        cmake --build . -j $(nproc)
    - name: test-noint8
      run: |
        cd build-noint8
        TESTS_EXECUTABLE_LOADER=qemu-arm-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j $(nproc)

  # Hard-float ARM (arm-linux-gnueabihf): cross-compile, then run ctest under
  # qemu-arm-static, once with default options and once with NCNN_INT8=OFF.
  armhf:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6

    - name: arm-gnu-toolchain
      run: |
        sudo apt-get update
        sudo apt-get install g++-arm-linux-gnueabihf qemu-user-static

    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-arm-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc)

    - name: build-noint8
      run: |
        mkdir build-noint8 && cd build-noint8
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF ..
        cmake --build . -j $(nproc)
    - name: test-noint8
      run: |
        cd build-noint8
        TESTS_EXECUTABLE_LOADER=qemu-arm-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc)

  # Same compiler package and sysroot as the armhf job, but configured through
  # the arm-linux-gnueabihf-vfpv3-d16 toolchain file. Builds and tests both the
  # default configuration and NCNN_INT8=OFF.
  armhf-vfpv3-d16:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6

    - name: arm-gnu-toolchain
      run: |
        sudo apt-get update
        sudo apt-get install g++-arm-linux-gnueabihf qemu-user-static

    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-arm-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc)

    - name: build-noint8
      run: |
        mkdir build-noint8 && cd build-noint8
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_INT8=OFF ..
        cmake --build . -j $(nproc)
    - name: test-noint8
      run: |
        cd build-noint8
        TESTS_EXECUTABLE_LOADER=qemu-arm-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc)


================================================
FILE: .github/workflows/linux-loongarch64.yml
================================================
name: linux-loongarch64
# Run for pushes to master and pull requests targeting master, limited to
# changes under the paths below (this workflow, the loongarch64 toolchain
# files, the CMake build, core/loongarch sources and tests).
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/linux-loongarch64.yml'
    - 'toolchains/loongarch64-linux-gnu.toolchain.cmake'
    - 'toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/loongarch/**'
    - 'tests/**'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/linux-loongarch64.yml'
    - 'toolchains/loongarch64-linux-gnu.toolchain.cmake'
    - 'toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/loongarch/**'
    - 'tests/**'
# One active run per ref: a newer run cancels the one still in progress.
concurrency:
  group: linux-loongarch64-${{ github.ref }}
  cancel-in-progress: true
# Read-only repository token.
permissions:
  contents: read

jobs:
  # Cross-build and qemu-test for loongarch64 on a self-hosted runner. The
  # cross toolchain is not installed by this job: it is expected to already
  # exist at /data/action/osd/cross-tools on the runner, and the build step
  # points LOONGARCH64_ROOT_PATH at it.
  gcc-loongarch64:
    runs-on: [self-hosted, linux, centos]

    steps:
    - uses: actions/checkout@v6

    # The two commented-out steps below appear to be an earlier setup that
    # installed qemu and downloaded the toolchain during the run; they are
    # kept disabled.
    # - name: qemu
    #   run: |
    #     sudo apt-get update
    #     sudo apt-get install -y qemu-user-static

    # - name: loongarch64-toolchain
    #   run: |
    #     wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.0/loongarch64-clfs-8.0-cross-tools-gcc-full.tar.xz
    #     tar -xf loongarch64-clfs-8.0-cross-tools-gcc-full.tar.xz

    - name: build
      run: |
        export LOONGARCH64_ROOT_PATH=/data/action/osd/cross-tools
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 4

    # Tests run under qemu-loongarch64-static with the sysroot taken from the
    # "target" directory of the same preinstalled toolchain.
    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-loongarch64-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/data/action/osd/cross-tools/target" ctest --output-on-failure -j 4


================================================
FILE: .github/workflows/linux-mips.yml
================================================
name: linux-mips
# Run for pushes to master and pull requests targeting master, limited to
# changes under the paths below (this workflow, the two 32-bit MIPS toolchain
# files, the CMake build, core/mips sources and tests).
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/linux-mips.yml'
    - 'toolchains/mipsel-linux-gnu.toolchain.cmake'
    - 'toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/mips/**'
    - 'tests/**'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/linux-mips.yml'
    - 'toolchains/mipsel-linux-gnu.toolchain.cmake'
    - 'toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/mips/**'
    - 'tests/**'
# One active run per ref: a newer run cancels the one still in progress.
concurrency:
  group: linux-mips-${{ github.ref }}
  cancel-in-progress: true
# Read-only repository token.
permissions:
  contents: read

jobs:
  # Little-endian 32-bit MIPS: cross-compile with g++-mipsel-linux-gnu and run
  # ctest under qemu-mipsel-static against the /usr/mipsel-linux-gnu sysroot.
  mipsel:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6

    - name: mipsel-gnu-toolchain
      run: |
        sudo apt-get update
        sudo apt-get install g++-mipsel-linux-gnu qemu-user-static

    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsel-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)

    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-mipsel-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsel-linux-gnu" ctest --output-on-failure -j $(nproc)

  # MIPS32 release 6, little-endian: cross-compile with
  # g++-mipsisa32r6el-linux-gnu. Tests reuse the qemu-mipsel-static loader and
  # pass no "-cpu" argument, only the r6 sysroot.
  mipsisa32r6el:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6

    - name: mipsisa32r6el-gnu-toolchain
      run: |
        sudo apt-get update
        sudo apt-get install g++-mipsisa32r6el-linux-gnu qemu-user-static

    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)

    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-mipsel-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa32r6el-linux-gnu" ctest --output-on-failure -j $(nproc)


================================================
FILE: .github/workflows/linux-mips64.yml
================================================
name: linux-mips64
# Run for pushes to master and pull requests targeting master, limited to
# changes under the paths below (this workflow, the two 64-bit MIPS toolchain
# files, the CMake build, core/mips sources and tests).
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/linux-mips64.yml'
    - 'toolchains/mips64el-linux-gnuabi64.toolchain.cmake'
    - 'toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/mips/**'
    - 'tests/**'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/linux-mips64.yml'
    - 'toolchains/mips64el-linux-gnuabi64.toolchain.cmake'
    - 'toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/mips/**'
    - 'tests/**'
# One active run per ref: a newer run cancels the one still in progress.
concurrency:
  group: linux-mips64-${{ github.ref }}
  cancel-in-progress: true
# Read-only repository token.
permissions:
  contents: read

jobs:
  # Little-endian MIPS64 (gnuabi64): cross-compile with
  # g++-mips64el-linux-gnuabi64 and run ctest under qemu-mips64el-static.
  mips64el:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6

    - name: mips64el-gnuabi64-toolchain
      run: |
        sudo apt-get update
        sudo apt-get install g++-mips64el-linux-gnuabi64 qemu-user-static

    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mips64el-linux-gnuabi64.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)

    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-mips64el-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mips64el-linux-gnuabi64" ctest --output-on-failure -j $(nproc)

  # MIPS64 release 6, little-endian: cross-compile with
  # g++-mipsisa64r6el-linux-gnuabi64. Tests reuse the qemu-mips64el-static
  # loader and pass no "-cpu" argument, only the r6 sysroot.
  mipsisa64r6el:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6

    - name: mipsisa64r6el-gnuabi64-toolchain
      run: |
        sudo apt-get update
        sudo apt-get install g++-mipsisa64r6el-linux-gnuabi64 qemu-user-static

    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)

    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-mips64el-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa64r6el-linux-gnuabi64" ctest --output-on-failure -j $(nproc)


================================================
FILE: .github/workflows/linux-ppc64.yml
================================================
name: linux-ppc64
# Run for pushes to master and pull requests targeting master, limited to
# changes under the paths below. Every toolchain file consumed by a job in
# this workflow must be listed in BOTH path filters. Previously only the
# powerpc64le toolchain was listed, so edits to the powerpc, power8le-vsx or
# power9le-vsx toolchain files never triggered the jobs that build with them.
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/linux-ppc64.yml'
    - 'toolchains/powerpc-linux-gnu.toolchain.cmake'
    - 'toolchains/powerpc64le-linux-gnu.toolchain.cmake'
    - 'toolchains/power8le-linux-gnu-vsx.toolchain.cmake'
    - 'toolchains/power9le-linux-gnu-vsx.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/x86/*'
    - 'tests/**'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/linux-ppc64.yml'
    - 'toolchains/powerpc-linux-gnu.toolchain.cmake'
    - 'toolchains/powerpc64le-linux-gnu.toolchain.cmake'
    - 'toolchains/power8le-linux-gnu-vsx.toolchain.cmake'
    - 'toolchains/power9le-linux-gnu-vsx.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/x86/*'
    - 'tests/**'
concurrency:
  group: linux-ppc64-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # 32-bit PowerPC: cross-compile with g++-powerpc-linux-gnu and run ctest
  # under qemu-ppc-static against the /usr/powerpc-linux-gnu sysroot.
  ppc:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6

    - name: powerpc-gnu-toolchain
      run: |
        sudo apt-get update
        sudo apt-get install g++-powerpc-linux-gnu qemu-user-static

    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/powerpc-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)

    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-ppc-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc-linux-gnu" ctest --output-on-failure -j $(nproc)

  # Little-endian 64-bit PowerPC with the generic powerpc64le toolchain file:
  # cross-compile, then run ctest under qemu-ppc64le-static.
  ppc64le:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6

    - name: powerpc64le-gnu-toolchain
      run: |
        sudo apt-get update
        sudo apt-get install g++-powerpc64le-linux-gnu qemu-user-static

    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/powerpc64le-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)

    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-ppc64le-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu" ctest --output-on-failure -j $(nproc)

  # Same compiler package and sysroot as ppc64le, but configured through the
  # power8le-linux-gnu-vsx toolchain file. Tests run under qemu-ppc64le-static
  # without an explicit "-cpu" argument.
  power8le-vsx:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6

    - name: powerpc64le-gnu-toolchain
      run: |
        sudo apt-get update
        sudo apt-get install g++-powerpc64le-linux-gnu qemu-user-static

    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/power8le-linux-gnu-vsx.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)

    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-ppc64le-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu" ctest --output-on-failure -j $(nproc)

  # Same compiler package and sysroot as ppc64le, but configured through the
  # power9le-linux-gnu-vsx toolchain file. Unlike the other ppc jobs, the test
  # step pins the emulated CPU with "-cpu power9_v2.0".
  power9le-vsx:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6

    - name: powerpc64le-gnu-toolchain
      run: |
        sudo apt-get update
        sudo apt-get install g++-powerpc64le-linux-gnu qemu-user-static

    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/power9le-linux-gnu-vsx.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)

    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-ppc64le-static TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/powerpc64le-linux-gnu;-cpu;power9_v2.0" ctest --output-on-failure -j $(nproc)


================================================
FILE: .github/workflows/linux-riscv32.yml
================================================
name: linux-riscv32
# Run for pushes to master and pull requests targeting master, limited to
# changes under the paths below (this workflow, the c907-rv32 toolchain file,
# the CMake build, core/riscv sources and tests).
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/linux-riscv32.yml'
    - 'toolchains/c907-rv32-v310.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/riscv/**'
    - 'tests/**'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/linux-riscv32.yml'
    - 'toolchains/c907-rv32-v310.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/riscv/**'
    - 'tests/**'
# One active run per ref: a newer run cancels the one still in progress.
concurrency:
  group: linux-riscv32-${{ github.ref }}
  cancel-in-progress: true
# Read-only repository token.
permissions:
  contents: read

jobs:
  # Xuantie RV32 build + qemu test on a self-hosted runner. The toolchain and
  # qemu are not installed by this job; both are expected under
  # /data/action/osd on the runner. The matrix currently has a single entry,
  # and its fields are forwarded to CMake as the NCNN_* feature switches.
  xuantie:
    name: xuantie-${{ matrix.cpu }}
    runs-on: [self-hosted, linux, ubuntu]
    strategy:
      fail-fast: false
      matrix:
        include:
          - { cpu: c907-rv32, QEMU_CPU: c907fdv-rv32,   OPENMP: ON,  RVV: ON,  XTHEADVECTOR: OFF, ZFH: ON, ZVFH: ON  }

    steps:
    - uses: actions/checkout@v6

    # The toolchain file name is derived from matrix.cpu
    # (toolchains/<cpu>-v310.toolchain.cmake). NCNN_THREADS follows the same
    # matrix value as NCNN_OPENMP.
    - name: build
      run: |
        export RISCV_ROOT_PATH=/data/action/osd/Xuantie-900-gcc-linux-6.6.36-glibc-x86_64-V3.3.0
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/${{ matrix.cpu }}-v310.toolchain.cmake -DCMAKE_BUILD_TYPE=release \
            -DNCNN_OPENMP=${{ matrix.OPENMP }} -DNCNN_THREADS=${{ matrix.OPENMP }} \
            -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_RVV=${{ matrix.RVV }} \
            -DNCNN_XTHEADVECTOR=${{ matrix.XTHEADVECTOR }} \
            -DNCNN_ZFH=${{ matrix.ZFH }} \
            -DNCNN_ZVFH=${{ matrix.ZVFH }} \
            -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8

    # Tests run under the vendor qemu-riscv32 with the CPU model from the matrix.
    - name: test
      run: |
        export PATH=/data/action/osd/Xuantie-qemu-x86_64-Ubuntu-20.04-V5.2.8-B20250721-0303/bin:$PATH
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-riscv32 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;${{ matrix.QEMU_CPU }}" ctest --output-on-failure -j 8


================================================
FILE: .github/workflows/linux-riscv64.yml
================================================
name: linux-riscv64
# Run for pushes to master and pull requests targeting master, limited to
# changes under the paths below. Every toolchain file consumed by a job in
# this workflow must be listed in BOTH path filters. The xuantie matrix
# resolves its toolchain as toolchains/<cpu>-v310.toolchain.cmake and has a
# c907 entry, so c907-v310.toolchain.cmake is listed alongside c906/c908/c910
# (it was previously missing, so edits to it never triggered this workflow).
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/linux-riscv64.yml'
    - 'toolchains/riscv64-linux-gnu.toolchain.cmake'
    - 'toolchains/riscv64-unknown-linux-gnu.toolchain.cmake'
    - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake'
    - 'toolchains/c906-v310.toolchain.cmake'
    - 'toolchains/c907-v310.toolchain.cmake'
    - 'toolchains/c908-v310.toolchain.cmake'
    - 'toolchains/c910-v310.toolchain.cmake'
    - 'toolchains/k1.toolchain.cmake'
    - 'toolchains/k1.llvm.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/riscv/**'
    - 'tests/**'
    - 'examples/**'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/linux-riscv64.yml'
    - 'toolchains/riscv64-linux-gnu.toolchain.cmake'
    - 'toolchains/riscv64-unknown-linux-gnu.toolchain.cmake'
    - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake'
    - 'toolchains/c906-v310.toolchain.cmake'
    - 'toolchains/c907-v310.toolchain.cmake'
    - 'toolchains/c908-v310.toolchain.cmake'
    - 'toolchains/c910-v310.toolchain.cmake'
    - 'toolchains/k1.toolchain.cmake'
    - 'toolchains/k1.llvm.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/riscv/**'
    - 'tests/**'
    - 'examples/**'
concurrency:
  group: linux-riscv64-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # Generic riscv64 cross-build on a hosted runner using the distro
  # g++-riscv64-linux-gnu package. qemu is built from a pinned upstream commit
  # with one extra patch applied, installed into qemu-install, and cached; all
  # qemu build steps are skipped on a cache hit.
  gcc-riscv64:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6

    # The cache key is a fixed string, so it must be changed by hand whenever
    # the qemu ref or patch below changes.
    - name: cache-qemu
      id: cache-qemu
      uses: actions/cache@v5
      with:
        path: qemu-install
        key: qemu-riscv64-install-20220502-4
    - name: install-qemu-build-deps
      if: steps.cache-qemu.outputs.cache-hit != 'true'
      run: |
        sudo apt-get update
        sudo apt-get install autoconf automake autotools-dev ninja-build build-essential pkg-config libglib2.0-dev libpixman-1-dev zlib1g-dev python3
    - name: checkout-qemu
      if: steps.cache-qemu.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
      with:
        repository: qemu/qemu
        path: qemu
        ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65
    # Only the riscv64 linux-user target is built (--disable-system).
    - name: qemu
      if: steps.cache-qemu.outputs.cache-hit != 'true'
      run: |
        cd qemu
        wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch
        patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch
        ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
        make -j$(nproc)
        make install

    - name: riscv64-gnu-toolchain
      run: |
        sudo apt-get update
        sudo apt-get install g++-riscv64-linux-gnu

    - name: configure
      run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
    - name: build
      run: cmake --build build -j $(nproc)

    # The cached/just-built qemu is put first on PATH so that "qemu-riscv64"
    # resolves to it.
    - name: test
      run: |
        export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j $(nproc)

  # Xuantie RV64 build + qemu test on a self-hosted runner, one matrix entry
  # per core (c906, c910, c908, c907). The toolchain and qemu are not installed
  # by this job; both are expected under /data/action/osd on the runner.
  # fail-fast is off, so one failing core does not cancel the others.
  xuantie:
    name: xuantie-${{ matrix.cpu }}
    runs-on: [self-hosted, linux, ubuntu]
    strategy:
      fail-fast: false
      matrix:
        include:
          - { cpu: c906, QEMU_CPU: c906fdv, OPENMP: OFF, RVV: OFF, XTHEADVECTOR: ON,  ZFH: ON, ZVFH: OFF }
          - { cpu: c910, QEMU_CPU: c910v,   OPENMP: ON,  RVV: OFF, XTHEADVECTOR: ON,  ZFH: ON, ZVFH: OFF }
          - { cpu: c908, QEMU_CPU: c908v,   OPENMP: ON,  RVV: ON,  XTHEADVECTOR: OFF, ZFH: ON, ZVFH: ON  }
          - { cpu: c907, QEMU_CPU: c907fdv-rv64,   OPENMP: ON,  RVV: ON,  XTHEADVECTOR: OFF, ZFH: ON, ZVFH: ON  }

    steps:
    - uses: actions/checkout@v6

    # The toolchain file name is derived from matrix.cpu
    # (toolchains/<cpu>-v310.toolchain.cmake); the remaining matrix fields are
    # forwarded as NCNN_* feature switches. NCNN_THREADS follows the same
    # matrix value as NCNN_OPENMP.
    - name: build
      run: |
        export RISCV_ROOT_PATH=/data/action/osd/Xuantie-900-gcc-linux-6.6.36-glibc-x86_64-V3.3.0
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/${{ matrix.cpu }}-v310.toolchain.cmake -DCMAKE_BUILD_TYPE=release \
            -DNCNN_OPENMP=${{ matrix.OPENMP }} -DNCNN_THREADS=${{ matrix.OPENMP }} \
            -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_RVV=${{ matrix.RVV }} \
            -DNCNN_XTHEADVECTOR=${{ matrix.XTHEADVECTOR }} \
            -DNCNN_ZFH=${{ matrix.ZFH }} \
            -DNCNN_ZVFH=${{ matrix.ZVFH }} \
            -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8

    # Tests run under the vendor qemu-riscv64 with the CPU model from the matrix.
    - name: test
      run: |
        export PATH=/data/action/osd/Xuantie-qemu-x86_64-Ubuntu-20.04-V5.2.8-B20250721-0303/bin:$PATH
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;${{ matrix.QEMU_CPU }}" ctest --output-on-failure -j 8

  # SpacemiT build + qemu test on a self-hosted runner. The same source tree
  # is built twice from one preinstalled toolchain directory: once through the
  # GCC toolchain file (k1.toolchain.cmake -> build-gcc) and once through the
  # LLVM one (k1.llvm.toolchain.cmake -> build-llvm). Each build is then tested
  # under qemu-riscv64 from /data/action/osd/jdsk-qemu.
  spacemit:
    name: spacemit-${{ matrix.cpu }}
    runs-on: [self-hosted, linux, ubuntu]
    strategy:
      fail-fast: false
      matrix:
        include:
          - { cpu: x60, QEMU_CPU: "max,vlen=256,elen=64,vext_spec=v1.0", OPENMP: ON, RVV: ON, XTHEADVECTOR: OFF, ZFH: ON, ZVFH: ON }

    steps:
    - uses: actions/checkout@v6

    # https://archive.spacemit.com/toolchain/spacemit-toolchain-linux-glibc-x86_64-v1.1.2.tar.xz
    - name: build-gcc
      run: |
        export RISCV_ROOT_PATH=/data/action/osd/spacemit-toolchain-linux-glibc-x86_64-v1.1.2
        mkdir build-gcc && cd build-gcc
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/k1.toolchain.cmake -DCMAKE_BUILD_TYPE=release \
            -DNCNN_OPENMP=${{ matrix.OPENMP }} -DNCNN_THREADS=${{ matrix.OPENMP }} \
            -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_RVV=${{ matrix.RVV }} \
            -DNCNN_XTHEADVECTOR=${{ matrix.XTHEADVECTOR }} \
            -DNCNN_ZFH=${{ matrix.ZFH }} \
            -DNCNN_ZVFH=${{ matrix.ZVFH }} \
            -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8

    - name: build-llvm
      run: |
        export RISCV_ROOT_PATH=/data/action/osd/spacemit-toolchain-linux-glibc-x86_64-v1.1.2
        mkdir build-llvm && cd build-llvm
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/k1.llvm.toolchain.cmake -DCMAKE_BUILD_TYPE=release \
            -DNCNN_OPENMP=${{ matrix.OPENMP }} -DNCNN_THREADS=${{ matrix.OPENMP }} \
            -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_RVV=${{ matrix.RVV }} \
            -DNCNN_XTHEADVECTOR=${{ matrix.XTHEADVECTOR }} \
            -DNCNN_ZFH=${{ matrix.ZFH }} \
            -DNCNN_ZVFH=${{ matrix.ZVFH }} \
            -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8

    # RISCV_ROOT_PATH is re-exported in each test step because environment
    # variables set with `export` do not carry over between steps; it is used
    # here to locate the sysroot passed to qemu via "-L".
    # https://archive.spacemit.com/spacemit-ai/qemu/jdsk-qemu-v0.0.14.tar.gz
    - name: test-gcc
      run: |
        export RISCV_ROOT_PATH=/data/action/osd/spacemit-toolchain-linux-glibc-x86_64-v1.1.2
        export PATH=/data/action/osd/jdsk-qemu/bin:$PATH
        cd build-gcc
        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;${{ matrix.QEMU_CPU }};-L;${RISCV_ROOT_PATH}/sysroot" ctest --output-on-failure -j 8

    - name: test-llvm
      run: |
        export RISCV_ROOT_PATH=/data/action/osd/spacemit-toolchain-linux-glibc-x86_64-v1.1.2
        export PATH=/data/action/osd/jdsk-qemu/bin:$PATH
        cd build-llvm
        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;${{ matrix.QEMU_CPU }};-L;${RISCV_ROOT_PATH}/sysroot" ctest --output-on-failure -j 8

  gcc-rvv:
    runs-on: [self-hosted, linux, ubuntu]
    steps:
    - uses: actions/checkout@v6

    #- name: cache-qemu
      #id: cache-qemu
      #uses: actions/cache@v5
      #with:
        #path: qemu-install
        #key: qemu-riscv64-install-20241202
    #- name: install-qemu-build-deps
      #if: steps.cache-qemu.outputs.cache-hit != 'true'
      #run: |
        #sudo apt-get update
        #sudo apt-get install autoconf automake autotools-dev ninja-build
    #- name: checkout-qemu
      #if: steps.cache-qemu.outputs.cache-hit != 'true'
      #uses: actions/checkout@v6
      #with:
        #repository: qemu/qemu
        #path: qemu
        #ref: 72b88908d12ee9347d13539c7dd9a252625158d1
    #- name: qemu
      #if: steps.cache-qemu.outputs.cache-hit != 'true'
      #run: |
        #cd qemu
        #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
        #make -j4
        #make install

    #- name: cache-riscv
      #id: cache-riscv
      #uses: actions/cache@v5
      #with:
        #path: riscv-install
        #key: riscv-linux-install-20241202

    #- name: install-riscv-build-deps
      #if: steps.cache-riscv.outputs.cache-hit != 'true'
      #run: |
        #sudo apt-get update
        #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler

    #- name: checkout-riscv-gnu-toolchain
      #if: steps.cache-riscv.outputs.cache-hit != 'true'
      #uses: actions/checkout@v6
      #with:
        #repository: riscv-collab/riscv-gnu-toolchain
        #path: riscv-gnu-toolchain
        #ref: 20f615317e2ce888dfc11b29ccde4a649494b654
    #- name: checkout-riscv-gnu-toolchain-submodules
      #if: steps.cache-riscv.outputs.cache-hit != 'true'
      #run: |
        #cd riscv-gnu-toolchain
        #git submodule update --init --recursive --depth 1 glibc
        #git submodule update --init --recursive --depth 1 newlib
        #git submodule update --init --recursive --depth 1 riscv-binutils
        #git submodule update --init --recursive --depth 1 riscv-gcc
        #git submodule update --init --recursive --depth 1 riscv-dejagnu
        #git submodule update --init --recursive --depth 1 riscv-gdb
    #- name: riscv-gnu-toolchain
      #if: steps.cache-riscv.outputs.cache-hit != 'true'
      #run: |
        #cd riscv-gnu-toolchain
        #./configure --prefix=$GITHUB_WORKSPACE/riscv
        #make linux -j4

    #- name: riscv-strip-install
      #if: steps.cache-riscv.outputs.cache-hit != 'true'
      #run: find $GITHUB_WORKSPACE/riscv -type f | xargs -i strip -g {} || true

    # Configure with the GNU RISC-V toolchain file; RISCV_ROOT_PATH points at a runner-local toolchain directory.
    - name: configure
      env:
        RISCV_ROOT_PATH: /data/action/osd/riscv
      run: |
        mkdir build
        cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
    - name: build
      working-directory: build
      run: cmake --build . -j 8

    # Run the test suite twice under qemu-riscv64, once per emulated vector length.
    - name: test-vlen256
      working-directory: build
      env:
        TESTS_EXECUTABLE_LOADER: qemu-riscv64
        TESTS_EXECUTABLE_LOADER_ARGUMENTS: "-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot"
      run: |
        export PATH=/data/action/osd/qemu-install/bin:$PATH
        ctest --output-on-failure -j 8

    - name: test-vlen128
      working-directory: build
      env:
        TESTS_EXECUTABLE_LOADER: qemu-riscv64
        TESTS_EXECUTABLE_LOADER_ARGUMENTS: "-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot"
      run: |
        export PATH=/data/action/osd/qemu-install/bin:$PATH
        ctest --output-on-failure -j 8

  # RISC-V vector build with the LLVM toolchain file; scheduled on a self-hosted Linux runner.
  clang-rvv:
    runs-on:
    - self-hosted
    - linux
    - ubuntu
    steps:
    - uses: actions/checkout@v6

    #- name: cache-qemu
      #id: cache-qemu
      #uses: actions/cache@v5
      #with:
        #path: qemu-install
        #key: qemu-riscv64-install-20241202
    #- name: install-qemu-build-deps
      #if: steps.cache-qemu.outputs.cache-hit != 'true'
      #run: |
        #sudo apt-get update
        #sudo apt-get install autoconf automake autotools-dev ninja-build
    #- name: checkout-qemu
      #if: steps.cache-qemu.outputs.cache-hit != 'true'
      #uses: actions/checkout@v6
      #with:
        #repository: qemu/qemu
        #path: qemu
        #ref: 72b88908d12ee9347d13539c7dd9a252625158d1
    #- name: qemu
      #if: steps.cache-qemu.outputs.cache-hit != 'true'
      #run: |
        #cd qemu
        #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
        #make -j4
        #make install

    #- name: cache-riscv
      #id: cache-riscv
      #uses: actions/cache@v5
      #with:
        #path: riscv-install
        #key: riscv-linux-install-20241202

    #- name: install-riscv-build-deps
      #if: steps.cache-riscv.outputs.cache-hit != 'true'
      #run: |
        #sudo apt-get update
        #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler

    #- name: checkout-riscv-gnu-toolchain
      #if: steps.cache-riscv.outputs.cache-hit != 'true'
      #uses: actions/checkout@v6
      #with:
        #repository: riscv-collab/riscv-gnu-toolchain
        #path: riscv-gnu-toolchain
        #ref: 20f615317e2ce888dfc11b29ccde4a649494b654
    #- name: checkout-riscv-gnu-toolchain-submodules
      #if: steps.cache-riscv.outputs.cache-hit != 'true'
      #run: |
        #cd riscv-gnu-toolchain
        #git submodule update --init --recursive --depth 1 glibc
        #git submodule update --init --recursive --depth 1 newlib
        #git submodule update --init --recursive --depth 1 riscv-binutils
        #git submodule update --init --recursive --depth 1 riscv-gcc
        #git submodule update --init --recursive --depth 1 riscv-dejagnu
        #git submodule update --init --recursive --depth 1 riscv-gdb
    #- name: riscv-gnu-toolchain
      #if: steps.cache-riscv.outputs.cache-hit != 'true'
      #run: |
        #cd riscv-gnu-toolchain
        #./configure --prefix=$GITHUB_WORKSPACE/riscv
        #make linux -j4

    #- name: riscv-strip-install
      #if: steps.cache-riscv.outputs.cache-hit != 'true'
      #run: find $GITHUB_WORKSPACE/riscv -type f | xargs -i strip -g {} || true

    # - name: install-clang
    #   run: |
    #     wget https://github.com/llvm/llvm-project/releases/download/llvmorg-19.1.4/llvm-project-19.1.4.src.tar.xz
    #     tar -xf llvm-project-19.1.4.src.tar.xz
    #     cd llvm-project-19.1.4.src
    #     mkdir build
    #     cd build
    #     cmake -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/riscv -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="RISCV" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF ../llvm/
    #     make -j16
    #     make install

    # Configure and compile in one step using the LLVM RISC-V toolchain file.
    - name: build
      env:
        RISCV_ROOT_PATH: /data/action/osd/riscv
      run: |
        mkdir build
        cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8

    # Run the test suite twice under qemu-riscv64, once per emulated vector length.
    - name: test-vlen256
      working-directory: build
      env:
        TESTS_EXECUTABLE_LOADER: qemu-riscv64
        TESTS_EXECUTABLE_LOADER_ARGUMENTS: "-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot"
      run: |
        export PATH=/data/action/osd/qemu-install/bin:$PATH
        ctest --output-on-failure -j 8

    - name: test-vlen128
      working-directory: build
      env:
        TESTS_EXECUTABLE_LOADER: qemu-riscv64
        TESTS_EXECUTABLE_LOADER_ARGUMENTS: "-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot"
      run: |
        export PATH=/data/action/osd/qemu-install/bin:$PATH
        ctest --output-on-failure -j 8


================================================
FILE: .github/workflows/linux-x64-cpu-clang.yml
================================================
# x86-64 Linux CPU builds and tests compiled with clang.
name: linux-x64-cpu-clang
on:
  # Pushes to master that touch this workflow, the build system, core/x86 sources, tests, tools or examples.
  push:
    branches:
    - master
    paths:
    - ".github/workflows/linux-x64-cpu-clang.yml"
    - "toolchains/host-c.clang.toolchain.cmake"
    - "CMakeLists.txt"
    - "cmake/**"
    - "src/*"
    - "src/layer/*"
    - "src/layer/x86/**"
    - "tests/**"
    - "tools/**"
    - "!tools/pnnx/**"  # negated pattern: pnnx changes do not trigger this workflow
    - "examples/**"
  # Pull requests against master use the same path filter.
  pull_request:
    branches:
    - master
    paths:
    - ".github/workflows/linux-x64-cpu-clang.yml"
    - "toolchains/host-c.clang.toolchain.cmake"
    - "CMakeLists.txt"
    - "cmake/**"
    - "src/*"
    - "src/layer/*"
    - "src/layer/x86/**"
    - "tests/**"
    - "tools/**"
    - "!tools/pnnx/**"
    - "examples/**"
# One run per ref: a newer run cancels the one still in progress.
concurrency:
  group: linux-x64-cpu-clang-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # clang builds for each x86 SIMD configuration (SSE2 only, AVX2, AVX, AVX+AVX2),
  # a shared-library build, and a build with int8 support disabled.
  linux-clang:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6
    - name: update
      run: sudo apt-get update
    - name: protobuf
      run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev

    # SSE2 baseline: AVX and AVX2 both off.
    - name: build-sse2
      env:
        CC: clang
        CXX: clang++
      run: |
        mkdir build-sse2
        cd build-sse2
        cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test-sse2
      working-directory: build-sse2
      run: ctest --output-on-failure -j $(nproc)

    # Shared library: compiled only, no tests are configured for it.
    - name: build-shared
      env:
        CC: clang
        CXX: clang++
      run: |
        mkdir build-shared
        cd build-shared
        cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j $(nproc)

    - name: build-avx2
      env:
        CC: clang
        CXX: clang++
      run: |
        mkdir build-avx2
        cd build-avx2
        cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test-avx2
      working-directory: build-avx2
      run: ctest --output-on-failure -j $(nproc)

    - name: build-avx
      env:
        CC: clang
        CXX: clang++
      run: |
        mkdir build-avx
        cd build-avx
        cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test-avx
      working-directory: build-avx
      run: ctest --output-on-failure -j $(nproc)

    - name: build-avx1-2
      env:
        CC: clang
        CXX: clang++
      run: |
        mkdir build-avx1-2
        cd build-avx1-2
        cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test-avx1-2
      working-directory: build-avx1-2
      run: ctest --output-on-failure -j $(nproc)

    - name: build-noint8
      env:
        CC: clang
        CXX: clang++
      run: |
        mkdir build-noint8
        cd build-noint8
        cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test-noint8
      working-directory: build-noint8
      run: ctest --output-on-failure -j $(nproc)

  # clang builds with NCNN_SIMPLESTL enabled, first on its own and then together with NCNN_SIMPLEOMP.
  linux-clang-simplestl:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6

    - name: build-simplestl
      env:
        CC: clang
        CXX: clang++
      run: |
        mkdir build-simplestl
        cd build-simplestl
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.clang.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . -j $(nproc)
    - name: test-simplestl
      working-directory: build-simplestl
      run: ctest --output-on-failure -j $(nproc)

    - name: build-simplestl-simpleomp
      env:
        CC: clang
        CXX: clang++
      run: |
        mkdir build-simplestl-simpleomp
        cd build-simplestl-simpleomp
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.clang.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . -j $(nproc)
    - name: test-simplestl-simpleomp
      working-directory: build-simplestl-simpleomp
      run: ctest --output-on-failure -j $(nproc)


================================================
FILE: .github/workflows/linux-x64-cpu-gcc-musl.yml
================================================
# x86-64 Linux CPU build and test inside an Alpine (musl) environment.
name: linux-x64-cpu-gcc-musl
on:
  # Pushes to master that touch this workflow, the build system, core/x86 sources, tests, tools or examples.
  push:
    branches:
    - master
    paths:
    - ".github/workflows/linux-x64-cpu-gcc-musl.yml"
    - "toolchains/host-c.gcc.toolchain.cmake"
    - "CMakeLists.txt"
    - "cmake/**"
    - "src/*"
    - "src/layer/*"
    - "src/layer/x86/**"
    - "tests/**"
    - "tools/**"
    - "!tools/pnnx/**"  # negated pattern: pnnx changes do not trigger this workflow
    - "examples/**"
  # Pull requests against master use the same path filter.
  pull_request:
    branches:
    - master
    paths:
    - ".github/workflows/linux-x64-cpu-gcc-musl.yml"
    - "toolchains/host-c.gcc.toolchain.cmake"
    - "CMakeLists.txt"
    - "cmake/**"
    - "src/*"
    - "src/layer/*"
    - "src/layer/x86/**"
    - "tests/**"
    - "tools/**"
    - "!tools/pnnx/**"
    - "examples/**"
# One run per ref: a newer run cancels the one still in progress.
concurrency:
  group: linux-x64-cpu-gcc-musl-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # Build and test ncnn inside the Alpine environment installed by jirutka/setup-alpine.
  # Only steps that declare `shell: alpine.sh {0}` execute inside Alpine; a step without it
  # runs in the default shell of the Ubuntu host runner.
  linux-gcc-musl:
    runs-on: ubuntu-latest
    steps:
    # Install Alpine together with the packages needed to compile the project.
    - uses: jirutka/setup-alpine@v1
      with:
        packages: >
          cmake
          clang
          clang-dev
          make
          gcc
          g++
          libc-dev
          linux-headers

    - uses: actions/checkout@v6
    - name: build
      shell: alpine.sh {0}
      run: |
        mkdir build && cd build
        cmake -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test
      shell: alpine.sh {0}
      run: cd build && ctest --output-on-failure -j $(nproc)
    # Fix: this step previously had no `shell:` key, so the shared-library build ran on the
    # Ubuntu host instead of inside Alpine like the build and test steps above.
    - name: build-shared
      shell: alpine.sh {0}
      run: |
        mkdir build-shared && cd build-shared
        cmake -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j $(nproc)


================================================
FILE: .github/workflows/linux-x64-cpu-gcc.yml
================================================
# x86-64 Linux CPU builds and tests compiled with gcc.
name: linux-x64-cpu-gcc
on:
  # Pushes to master that touch this workflow, the build system, core/x86 sources, tests, tools or examples.
  push:
    branches:
    - master
    paths:
    - ".github/workflows/linux-x64-cpu-gcc.yml"
    - "toolchains/host-c.gcc.toolchain.cmake"
    - "CMakeLists.txt"
    - "cmake/**"
    - "src/*"
    - "src/layer/*"
    - "src/layer/x86/**"
    - "tests/**"
    - "tools/**"
    - "!tools/pnnx/**"  # negated pattern: pnnx changes do not trigger this workflow
    - "examples/**"
  # Pull requests against master use the same path filter.
  pull_request:
    branches:
    - master
    paths:
    - ".github/workflows/linux-x64-cpu-gcc.yml"
    - "toolchains/host-c.gcc.toolchain.cmake"
    - "CMakeLists.txt"
    - "cmake/**"
    - "src/*"
    - "src/layer/*"
    - "src/layer/x86/**"
    - "tests/**"
    - "tools/**"
    - "!tools/pnnx/**"
    - "examples/**"
# One run per ref: a newer run cancels the one still in progress.
concurrency:
  group: linux-x64-cpu-gcc-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # Default-compiler builds for each x86 SIMD configuration (SSE2 only, AVX2, AVX, AVX+AVX2),
  # a shared-library build, and a build with int8 support disabled.
  linux-gcc:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6
    - name: update
      run: sudo apt-get update
    - name: protobuf
      run: sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev

    # SSE2 baseline: AVX and AVX2 both off.
    - name: build-sse2
      run: |
        mkdir build-sse2
        cd build-sse2
        cmake -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test-sse2
      working-directory: build-sse2
      run: ctest --output-on-failure -j $(nproc)

    # Shared library: compiled only, no tests are configured for it.
    - name: build-shared
      run: |
        mkdir build-shared
        cd build-shared
        cmake -DNCNN_AVX2=ON -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j $(nproc)

    - name: build-avx2
      run: |
        mkdir build-avx2
        cd build-avx2
        cmake -DNCNN_AVX2=ON -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test-avx2
      working-directory: build-avx2
      run: ctest --output-on-failure -j $(nproc)

    - name: build-avx
      run: |
        mkdir build-avx
        cd build-avx
        cmake -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test-avx
      working-directory: build-avx
      run: ctest --output-on-failure -j $(nproc)

    - name: build-avx1-2
      run: |
        mkdir build-avx1-2
        cd build-avx1-2
        cmake -DNCNN_AVX2=ON -DNCNN_AVX=ON -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test-avx1-2
      working-directory: build-avx1-2
      run: ctest --output-on-failure -j $(nproc)

    - name: build-noint8
      run: |
        mkdir build-noint8
        cd build-noint8
        cmake -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test-noint8
      working-directory: build-noint8
      run: ctest --output-on-failure -j $(nproc)

  # Shared-library build configured with NCNN_ASAN=ON, followed by the test suite.
  asan:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6
    - name: build
      run: |
        mkdir build
        cd build
        cmake -DCMAKE_BUILD_TYPE=relwithdebinfo -DNCNN_ASAN=ON -DNCNN_BUILD_TESTS=ON -DNCNN_SHARED_LIB=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . -j $(nproc)
    - name: test
      working-directory: build
      run: ctest --output-on-failure -j $(nproc)

  # Reduced-feature gcc builds: the c++03 toolchain file, stdio/string disabled,
  # NCNN_SIMPLESTL, and NCNN_SIMPLESTL combined with NCNN_SIMPLEOMP.
  linux-gcc-cpp03-nostdio-nostring-simplestl:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6

    - name: build-nostdio
      run: |
        mkdir build-nostdio
        cd build-nostdio
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-c++03.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . -j $(nproc)
    - name: test-nostdio
      working-directory: build-nostdio
      run: ctest --output-on-failure -j $(nproc)

    # Compile-only configuration: tests are switched off here, so there is no matching test step.
    - name: build-nostdio-nostring
      run: |
        mkdir build-nostdio-nostring
        cd build-nostdio-nostring
        cmake -DNCNN_STDIO=OFF -DNCNN_STRING=OFF -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . -j $(nproc)

    - name: build-simplestl
      run: |
        mkdir build-simplestl
        cd build-simplestl
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . -j $(nproc)
    - name: test-simplestl
      working-directory: build-simplestl
      run: ctest --output-on-failure -j $(nproc)

    - name: build-simplestl-simpleomp
      run: |
        mkdir build-simplestl-simpleomp
        cd build-simplestl-simpleomp
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . -j $(nproc)
    - name: test-simplestl-simpleomp
      working-directory: build-simplestl-simpleomp
      run: ctest --output-on-failure -j $(nproc)

  # AVX512 / AVX512-VNNI build and test on the self-hosted runner labelled t4.
  linux-gcc-avx512:
    runs-on:
    - self-hosted
    - linux
    - t4
    steps:
    - uses: actions/checkout@v6
    - name: build
      env:
        CC: gcc
        CXX: g++
        LD_LIBRARY_PATH: /data/action/install/lib64
      run: |
        mkdir build
        cd build
        cmake -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . -j 4
    - name: test
      working-directory: build
      env:
        LD_LIBRARY_PATH: /data/action/install/lib64
      run: ctest --output-on-failure -j 4


================================================
FILE: .github/workflows/linux-x64-gpu-clang.yml
================================================
# x86-64 Linux Vulkan builds and tests compiled with clang.
name: linux-x64-gpu-clang
on:
  # Pushes to master that touch this workflow, the build system, core/x86/vulkan sources,
  # tests, tools, examples or the glslang path.
  push:
    branches:
    - master
    paths:
    - ".github/workflows/linux-x64-gpu-clang.yml"
    - "CMakeLists.txt"
    - "cmake/**"
    - "src/*"
    - "src/layer/*"
    - "src/layer/x86/**"
    - "src/layer/vulkan/**"
    - "tests/**"
    - "tools/**"
    - "!tools/pnnx/**"  # negated pattern: pnnx changes do not trigger this workflow
    - "examples/**"
    - "glslang"
  # Pull requests against master use the same path filter.
  pull_request:
    branches:
    - master
    paths:
    - ".github/workflows/linux-x64-gpu-clang.yml"
    - "CMakeLists.txt"
    - "cmake/**"
    - "src/*"
    - "src/layer/*"
    - "src/layer/x86/**"
    - "src/layer/vulkan/**"
    - "tests/**"
    - "tools/**"
    - "!tools/pnnx/**"
    - "examples/**"
    - "glslang"
# One run per ref: a newer run cancels the one still in progress.
concurrency:
  group: linux-x64-gpu-clang-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # Vulkan build with clang. Tests run with VK_ICD_FILENAMES pointing at a SwiftShader ICD
  # that is restored from cache or, on a cache miss, built from a pinned commit.
  linux-clang-gpu:
    runs-on:
    - self-hosted
    - linux
    - ubuntu25
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true

    - name: cache-swiftshader
      id: cache-swiftshader
      uses: actions/cache@v5
      with:
        path: swiftshader-install
        key: swiftshader-linux-install-20250508
    # The next two steps are skipped when the cache step reported a hit.
    - name: checkout-swiftshader
      if: steps.cache-swiftshader.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
      with:
        repository: google/swiftshader
        path: swiftshader
        ref: 930d46d31b5d637f313fd5ef55da2bbf053c26c1
    - name: swiftshader
      if: steps.cache-swiftshader.outputs.cache-hit != 'true'
      working-directory: swiftshader
      run: |
        git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive
        mkdir -p build
        cd build
        cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
        cmake --build . -j 8
        mkdir $GITHUB_WORKSPACE/swiftshader-install
        cp Linux/* $GITHUB_WORKSPACE/swiftshader-install

    - name: build
      env:
        CC: clang
        CXX: clang++
      run: |
        mkdir build
        cd build
        cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    # Writes a SwiftShader.ini with ThreadCount=1 into build/tests before running ctest.
    - name: test
      run: |
        printf "[Processor]\nThreadCount=1\n" > build/tests/SwiftShader.ini
        export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
        cd build
        ctest --output-on-failure -j 8
    - name: build-shared
      env:
        CC: clang
        CXX: clang++
      run: |
        mkdir build-shared
        cd build-shared
        cmake -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j 8


================================================
FILE: .github/workflows/linux-x64-gpu-gcc.yml
================================================
# x86-64 Linux Vulkan builds and tests compiled with gcc.
name: linux-x64-gpu-gcc
on:
  # Pushes to master that touch this workflow, the build system, core/x86/vulkan sources,
  # tests, tools, examples or the glslang path.
  push:
    branches:
    - master
    paths:
    - ".github/workflows/linux-x64-gpu-gcc.yml"
    - "CMakeLists.txt"
    - "cmake/**"
    - "src/*"
    - "src/layer/*"
    - "src/layer/x86/**"
    - "src/layer/vulkan/**"
    - "tests/**"
    - "tools/**"
    - "!tools/pnnx/**"  # negated pattern: pnnx changes do not trigger this workflow
    - "examples/**"
    - "glslang"
  # Pull requests against master use the same path filter.
  pull_request:
    branches:
    - master
    paths:
    - ".github/workflows/linux-x64-gpu-gcc.yml"
    - "CMakeLists.txt"
    - "cmake/**"
    - "src/*"
    - "src/layer/*"
    - "src/layer/x86/**"
    - "src/layer/vulkan/**"
    - "tests/**"
    - "tools/**"
    - "!tools/pnnx/**"
    - "examples/**"
    - "glslang"
# One run per ref: a newer run cancels the one still in progress.
concurrency:
  group: linux-x64-gpu-gcc-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # Vulkan build with the default compiler. Tests run with VK_ICD_FILENAMES pointing at a
  # SwiftShader ICD that is restored from cache or, on a cache miss, built from a pinned commit.
  linux-gcc-gpu:
    runs-on:
    - self-hosted
    - linux
    - ubuntu25
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true

    - name: cache-swiftshader
      id: cache-swiftshader
      uses: actions/cache@v5
      with:
        path: swiftshader-install
        key: swiftshader-linux-install-20250508
    # The next two steps are skipped when the cache step reported a hit.
    - name: checkout-swiftshader
      if: steps.cache-swiftshader.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
      with:
        repository: google/swiftshader
        path: swiftshader
        ref: 930d46d31b5d637f313fd5ef55da2bbf053c26c1
    - name: swiftshader
      if: steps.cache-swiftshader.outputs.cache-hit != 'true'
      working-directory: swiftshader
      run: |
        git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive
        mkdir -p build
        cd build
        cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
        cmake --build . -j 8
        mkdir $GITHUB_WORKSPACE/swiftshader-install
        cp Linux/* $GITHUB_WORKSPACE/swiftshader-install

    - name: build
      run: |
        mkdir build
        cd build
        cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    # Writes a SwiftShader.ini with ThreadCount=1 into build/tests before running ctest.
    - name: test
      run: |
        printf "[Processor]\nThreadCount=1\n" > build/tests/SwiftShader.ini
        export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
        cd build
        ctest --output-on-failure -j 8
    - name: build-shared
      run: |
        mkdir build-shared
        cd build-shared
        cmake -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j 8

  # Vulkan build against the distribution's glslang packages (NCNN_SYSTEM_GLSLANG=ON); compile only.
  linux-gcc-gpu-system-glslang:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6

    - name: install-deps
      run: |
        sudo apt-get update
        sudo apt-get install libprotobuf-dev protobuf-compiler libopencv-dev libvulkan-dev glslang-dev glslang-tools spirv-tools

    # Static and shared builds both take glslang's CMake config from GLSLANG_TARGET_DIR.
    - name: build
      run: |
        mkdir build
        cd build
        cmake -DNCNN_VULKAN=ON -DNCNN_SYSTEM_GLSLANG=ON -DGLSLANG_TARGET_DIR=/usr/lib/x86_64-linux-gnu/cmake ..
        cmake --build . -j $(nproc)
    - name: build-shared
      run: |
        mkdir build-shared
        cd build-shared
        cmake -DNCNN_VULKAN=ON -DNCNN_SYSTEM_GLSLANG=ON -DGLSLANG_TARGET_DIR=/usr/lib/x86_64-linux-gnu/cmake -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j $(nproc)

  # Vulkan build and test on the self-hosted runner labelled t4.
  linux-gcc-gpu-t4:
    runs-on:
    - self-hosted
    - linux
    - t4
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    - name: build
      env:
        CC: gcc
        CXX: g++
        LD_LIBRARY_PATH: /data/action/install/lib64
      run: |
        mkdir build
        cd build
        cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . -j 4
    - name: test
      working-directory: build
      env:
        LD_LIBRARY_PATH: /data/action/install/lib64
      run: ctest --output-on-failure -j 4


================================================
FILE: .github/workflows/linux-x64-sde.yml
================================================
# x86-64 Linux tests executed through the SDE binaries for several CPU targets.
name: linux-x64-sde
on:
  # Pushes to master that touch this workflow, the build system, core/x86 sources, tests, tools or examples.
  push:
    branches:
    - master
    paths:
    - ".github/workflows/linux-x64-sde.yml"
    - "CMakeLists.txt"
    - "cmake/**"
    - "src/*"
    - "src/layer/*"
    - "src/layer/x86/**"
    - "tests/**"
    - "tools/**"
    - "!tools/pnnx/**"  # negated pattern: pnnx changes do not trigger this workflow
    - "examples/**"
  # Pull requests against master use the same path filter.
  pull_request:
    branches:
    - master
    paths:
    - ".github/workflows/linux-x64-sde.yml"
    - "CMakeLists.txt"
    - "cmake/**"
    - "src/*"
    - "src/layer/*"
    - "src/layer/x86/**"
    - "tests/**"
    - "tools/**"
    - "!tools/pnnx/**"
    - "examples/**"
# One run per ref: a newer run cancels the one still in progress.
concurrency:
  group: linux-x64-sde-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # Build once with gcc-14, then run the same test suite repeatedly with sde64 as the test loader,
  # passing a different CPU selector flag (-p4p, -snb, -hsw, ...) each time.
  gcc-sde:
    runs-on: ubuntu-24.04
    steps:
    - uses: actions/checkout@v6
    - name: update
      run: sudo apt-get update
    - name: gcc14
      run: sudo apt-get install gcc-14 g++-14
    - name: Setup SDE binaries
      uses: petarpetrovt/setup-sde@v3.0
    - name: build
      env:
        CC: gcc-14
        CXX: g++-14
      run: |
        mkdir build
        cd build
        cmake -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)

    # Each step below differs only in the CPU flag placed in TESTS_EXECUTABLE_LOADER_ARGUMENTS.
    - name: test-p4p
      working-directory: build
      run: |
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-p4p;--" ctest --output-on-failure -j $(nproc)
    - name: test-snb
      working-directory: build
      run: |
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-snb;--" ctest --output-on-failure -j $(nproc)
    - name: test-hsw
      working-directory: build
      run: |
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-hsw;--" ctest --output-on-failure -j $(nproc)
    - name: test-adl
      working-directory: build
      run: |
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-adl;--" ctest --output-on-failure -j $(nproc)
    - name: test-arl
      working-directory: build
      run: |
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-arl;--" ctest --output-on-failure -j $(nproc)
    - name: test-skx
      working-directory: build
      run: |
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-skx;--" ctest --output-on-failure -j $(nproc)
    - name: test-spr
      working-directory: build
      run: |
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j $(nproc)
    - name: test-gnr
      working-directory: build
      run: |
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-gnr;--" ctest --output-on-failure -j $(nproc)


================================================
FILE: .github/workflows/linux-x86-cpu-clang.yml
================================================
# Linux CPU builds and tests with clang using the host.clang-m32 toolchain file.
name: linux-x86-cpu-clang
on:
  # Pushes to master that touch this workflow, its toolchain file, the build system, core/x86 sources or tests.
  push:
    branches:
    - master
    paths:
    - ".github/workflows/linux-x86-cpu-clang.yml"
    - "toolchains/host.clang-m32.toolchain.cmake"
    - "CMakeLists.txt"
    - "cmake/**"
    - "src/*"
    - "src/layer/*"
    - "src/layer/x86/**"
    - "tests/**"
  # Pull requests against master use the same path filter.
  pull_request:
    branches:
    - master
    paths:
    - ".github/workflows/linux-x86-cpu-clang.yml"
    - "toolchains/host.clang-m32.toolchain.cmake"
    - "CMakeLists.txt"
    - "cmake/**"
    - "src/*"
    - "src/layer/*"
    - "src/layer/x86/**"
    - "tests/**"
# One run per ref: a newer run cancels the one still in progress.
concurrency:
  group: linux-x86-cpu-clang-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # clang builds through the host.clang-m32 toolchain file: default (tested), shared library
  # (compile only) and int8-disabled (tested).
  linux-clang:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6
    - name: update
      run: sudo apt-get update
    - name: gcc-multilib
      run: sudo apt-get install gcc-multilib g++-multilib

    - name: build
      env:
        CC: clang
        CXX: clang++
      run: |
        mkdir build
        cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . -j $(nproc)
    - name: test
      working-directory: build
      run: ctest --output-on-failure -j $(nproc)

    - name: build-shared
      env:
        CC: clang
        CXX: clang++
      run: |
        mkdir build-shared
        cd build-shared
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j $(nproc)

    - name: build-noint8
      env:
        CC: clang
        CXX: clang++
      run: |
        mkdir build-noint8
        cd build-noint8
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.clang-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_INT8=OFF ..
        cmake --build . -j $(nproc)
    - name: test-noint8
      working-directory: build-noint8
      run: ctest --output-on-failure -j $(nproc)


================================================
FILE: .github/workflows/linux-x86-cpu-gcc.yml
================================================
# Linux CPU builds and tests with gcc using the host.gcc-m32 toolchain file.
name: linux-x86-cpu-gcc
on:
  # Pushes to master that touch this workflow, its toolchain file, the build system, core/x86 sources or tests.
  push:
    branches:
    - master
    paths:
    - ".github/workflows/linux-x86-cpu-gcc.yml"
    - "toolchains/host.gcc-m32.toolchain.cmake"
    - "CMakeLists.txt"
    - "cmake/**"
    - "src/*"
    - "src/layer/*"
    - "src/layer/x86/**"
    - "tests/**"
  # Pull requests against master use the same path filter.
  pull_request:
    branches:
    - master
    paths:
    - ".github/workflows/linux-x86-cpu-gcc.yml"
    - "toolchains/host.gcc-m32.toolchain.cmake"
    - "CMakeLists.txt"
    - "cmake/**"
    - "src/*"
    - "src/layer/*"
    - "src/layer/x86/**"
    - "tests/**"
# One run per ref: a newer run cancels the one still in progress.
concurrency:
  group: linux-x86-cpu-gcc-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # gcc builds through the host.gcc-m32 toolchain file: default (tested), SSE2/AVX and runtime
  # CPU dispatch disabled (tested), shared library (compile only) and int8-disabled (tested).
  linux-gcc:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6
    - name: update
      run: sudo apt-get update
    - name: gcc-multilib
      run: sudo apt-get install gcc-multilib g++-multilib

    - name: build
      run: |
        mkdir build
        cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . -j $(nproc)
    - name: test
      working-directory: build
      run: ctest --output-on-failure -j $(nproc)

    - name: build-nosse
      run: |
        mkdir build-nosse
        cd build-nosse
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . -j $(nproc)
    - name: test-nosse
      working-directory: build-nosse
      run: ctest --output-on-failure -j $(nproc)

    - name: build-shared
      run: |
        mkdir build-shared
        cd build-shared
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j $(nproc)

    - name: build-noint8
      run: |
        mkdir build-noint8
        cd build-noint8
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_INT8=OFF ..
        cmake --build . -j $(nproc)
    - name: test-noint8
      working-directory: build-noint8
      run: ctest --output-on-failure -j $(nproc)


================================================
FILE: .github/workflows/mac-catalyst.yml
================================================
# CI workflow that cross-builds ncnn for Mac Catalyst (x86_64 and arm64).
name: mac-catalyst
on:
  # Run for pushes to master that touch this workflow, the iOS toolchain file,
  # the CMake build system, core/layer sources, the arm/x86/vulkan layer
  # sources or the 'glslang' path.
  push:
    branches: [master]
    paths:
    - '.github/workflows/mac-catalyst.yml'
    - 'toolchains/ios.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'glslang'
  # Same path filter for pull requests that target master.
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/mac-catalyst.yml'
    - 'toolchains/ios.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'glslang'
# One run per ref: a newer run in the same group cancels the one in progress.
concurrency:
  group: mac-catalyst-${{ github.ref }}
  cancel-in-progress: true
# Workflow-wide environment. The deployment target and ENABLE_* values are
# referenced as shell variables by the job's *_CMAKE_OPTIONS strings.
env:
  DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer
  MAC_CATALYST_DEPLOYMENT_TARGET: '13.1'
  ENABLE_BITCODE: OFF
  ENABLE_ARC: OFF
  ENABLE_VISIBILITY: OFF
# The workflow token only gets read access to repository contents.
permissions:
  contents: read

jobs:
  # Builds a static libomp for Mac Catalyst (cached), installs it into the
  # Xcode MacOSX SDK and then builds ncnn with NCNN_VULKAN=ON once per
  # architecture. This job only compiles; it does not run any tests.
  build:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      # CMake arguments shared by both libomp builds. The text is pasted into
      # the cmake command line of the openmp-* steps, which is why every line
      # ends in a backslash (shell line continuation).
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$MAC_CATALYST_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \

      # CMake arguments shared by both ncnn builds; pasted into the cmake
      # command line of the x86_64 and arm64 steps in the same way.
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$MAC_CATALYST_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VULKAN=ON \

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true

    # Cache of the merged libomp install tree. Every openmp* step below is
    # skipped when this cache is hit.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-mac-catalyst-install-20251004
    # Download the LLVM cmake and openmp source tarballs, move the cmake
    # modules into the openmp tree and apply two patches fetched from
    # nihui/llvm-project.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    # libomp for x86_64, installed into build-x86_64/install.
    - name: openmp-x86_64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-x86_64 && cd build-x86_64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install
    # libomp for arm64, installed into build-arm64/install.
    - name: openmp-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST_ARM64 -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install
    # Copy the headers from the x86_64 install and lipo both libomp.a files
    # into one library under openmp-install/mac-catalyst (the cached path).
    - name: openmp-merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/mac-catalyst

        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install/mac-catalyst
        mkdir -p $GITHUB_WORKSPACE/openmp-install/mac-catalyst/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/mac-catalyst/lib/libomp.a

    # Always runs: copies the libomp headers and libomp.a into usr/include and
    # usr/lib of the MacOSX SDK under DEVELOPER_DIR.
    - name: install-openmp
      run: |
        sudo cp $GITHUB_WORKSPACE/openmp-install/mac-catalyst/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include
        sudo cp $GITHUB_WORKSPACE/openmp-install/mac-catalyst/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib

    # ncnn builds, one build directory per architecture.
    - name: x86_64
      run: |
        mkdir build-x86_64 && cd build-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST -DARCHS="x86_64" ..
        cmake --build . -j 4
    - name: arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST_ARM64 -DARCHS="arm64" ..
        cmake --build . -j 4


================================================
FILE: .github/workflows/macos.yml
================================================
# CI workflow that builds ncnn for macOS (arm64 and x86_64) and runs the
# x86_64 unit tests.
name: macos
on:
  # Run for pushes to master that touch this workflow, the iOS toolchain file,
  # the CMake build system, core/layer sources, the arm/x86/vulkan layer
  # sources, tests, tools, examples or the 'glslang' path.
  push:
    branches: [master]
    paths:
    - '.github/workflows/macos.yml'
    - 'toolchains/ios.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'tests/**'
    - 'tools/**'
    # Negated pattern: excludes tools/pnnx from the 'tools/**' match above.
    - '!tools/pnnx/**'
    - 'examples/**'
    - 'glslang'
  # Same path filter for pull requests that target master.
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/macos.yml'
    - 'toolchains/ios.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'tests/**'
    - 'tools/**'
    - '!tools/pnnx/**'
    - 'examples/**'
    - 'glslang'
# One run per ref: a newer run in the same group cancels the one in progress.
concurrency:
  group: macos-${{ github.ref }}
  cancel-in-progress: true
# Workflow-wide environment. The deployment target and ENABLE_* values are
# referenced as shell variables by the job's *_CMAKE_OPTIONS strings.
env:
  DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer
  MAC_DEPLOYMENT_TARGET: '11.0'
  ENABLE_BITCODE: OFF
  ENABLE_ARC: OFF
  ENABLE_VISIBILITY: OFF
# The workflow token only gets read access to repository contents.
permissions:
  contents: read

jobs:
  # Builds a static libomp (cached) and SwiftShader (cached), then builds ncnn
  # with NCNN_VULKAN=ON for arm64 and x86_64, both static and shared, and
  # finally runs ctest on the x86_64 static build.
  build:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      # CMake arguments shared by both libomp builds. The text is pasted into
      # the cmake command line of the openmp-* steps, which is why every line
      # ends in a backslash (shell line continuation).
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \

      # CMake arguments shared by all ncnn builds; pasted into the cmake
      # command line of the build steps in the same way.
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VULKAN=ON \

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true

    # Cache of the merged libomp install tree. Every openmp* step below is
    # skipped when this cache is hit.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-macos-install-20251004
    # Download the LLVM cmake and openmp source tarballs, move the cmake
    # modules into the openmp tree and apply two patches fetched from
    # nihui/llvm-project.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    # libomp for x86_64, installed into build-x86_64/install.
    - name: openmp-x86_64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-x86_64 && cd build-x86_64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install
    # libomp for arm64, installed into build-arm64/install.
    - name: openmp-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC_ARM64 -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install
    # Copy the headers from the x86_64 install and lipo both libomp.a files
    # into one library under openmp-install (the cached path).
    - name: openmp-merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a

    # Always runs: copies the libomp headers and libomp.a into usr/include and
    # usr/lib of the MacOSX SDK under DEVELOPER_DIR.
    - name: install-openmp
      run: |
        sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include
        sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib

    # Cache of the SwiftShader build output. The three swiftshader steps below
    # are skipped when this cache is hit.
    - name: cache-swiftshader
      id: cache-swiftshader
      uses: actions/cache@v5
      with:
        path: swiftshader-install
        key: swiftshader-macos-install-20251004
    # google/swiftshader is checked out at a pinned commit.
    - name: checkout-swiftshader
      if: steps.cache-swiftshader.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
      with:
        repository: google/swiftshader
        path: swiftshader
        ref: de870ac7518fe2b6bb651ecc22fc36647cf7b986
    # Recursive submodule init with update=none set for third_party/git-hooks.
    - name: checkout-swiftshader-submodules
      if: steps.cache-swiftshader.outputs.cache-hit != 'true'
      run: |
        cd swiftshader
        git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive
    # Build SwiftShader with only the Vulkan target enabled, then copy the
    # contents of build/Darwin into swiftshader-install (the cached path).
    - name: swiftshader
      if: steps.cache-swiftshader.outputs.cache-hit != 'true'
      run: |
        cd swiftshader
        mkdir -p build; cd build
        cmake -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
        cmake --build . -j 4
        mkdir $GITHUB_WORKSPACE/swiftshader-install
        cp Darwin/* $GITHUB_WORKSPACE/swiftshader-install

    # Static ncnn builds. Only the x86_64 one enables NCNN_BUILD_TESTS.
    - name: arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_ARM64 -DARCHS="arm64" ..
        cmake --build . -j 4
    - name: x86_64
      run: |
        mkdir build-x86_64 && cd build-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC -DARCHS="x86_64" -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 4

    # Shared-library ncnn builds (NCNN_SHARED_LIB=ON); compile-only.
    - name: arm64-shared
      run: |
        mkdir build-arm64-shared && cd build-arm64-shared
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_ARM64 -DARCHS="arm64" -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j 4
    - name: x86_64-shared
      run: |
        mkdir build-x86_64-shared && cd build-x86_64-shared
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC -DARCHS="x86_64" -DNCNN_SHARED_LIB=ON ..
        cmake --build . -j 4

    # Run the x86_64 tests: writes a SwiftShader.ini with ThreadCount=1 into
    # the tests directory and points VK_ICD_FILENAMES at the SwiftShader ICD
    # manifest before invoking ctest.
    - name: x86_64-test
      run: |
        printf "[Processor]\nThreadCount=1\n" > build-x86_64/tests/SwiftShader.ini
        export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
        cd build-x86_64 && ctest --output-on-failure -j 4


================================================
FILE: .github/workflows/pnnx.yml
================================================
# CI workflow that builds the pnnx tool and tests it against a matrix of
# PyTorch releases.
name: pnnx
on:
  # Run for pushes to master that touch this workflow, files directly inside
  # src/layer, or anything under tools/pnnx except its README.
  push:
    branches: [master]
    paths:
    - '.github/workflows/pnnx.yml'
    - 'src/layer/*'
    - 'tools/pnnx/**'
    - '!tools/pnnx/README.md'
  # Same path filter for pull requests that target master.
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/pnnx.yml'
    - 'src/layer/*'
    - 'tools/pnnx/**'
    - '!tools/pnnx/README.md'
# One run per ref: a newer run in the same group cancels the one in progress.
concurrency:
  group: pnnx-${{ github.ref }}
  cancel-in-progress: true
# The workflow token only gets read access to repository contents.
permissions:
  contents: read

# Versions of the dependencies built from source by the build job. These
# values, together with CACHE_DATE, form the install directory names and the
# cache keys used by the build and test jobs.
env:
  LIBTORCH_VERSION: 2.10.0
  TORCHVISION_VERSION: 0.25.0
  PROTOBUF_VERSION: 21.12
  ONNXRUNTIME_VERSION: 1.24.3
  CACHE_DATE: 20260309
  # NOTE(review): presumably the actions/cache segment download timeout in
  # minutes - confirm against the actions/cache documentation.
  SEGMENT_DOWNLOAD_TIMEOUT_MINS: 15

jobs:
  # Smoke test on the GitHub-hosted Linux, macOS and Windows runners: installs
  # the CPU build of torch with pip, builds pnnx with its default CMake
  # configuration and runs only the ctest cases matching test_nn_Conv.
  quick-test:
    runs-on: ${{ matrix.os }}
    strategy:
      # Let the other operating systems finish when one of them fails.
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]

    env:
      # pip --user installs land in <workspace>/torch.
      PYTHONUSERBASE: ${{ github.workspace }}/torch
      UseMultiToolTask: true
    steps:
    - uses: actions/checkout@v6

    - uses: actions/setup-python@v6
      with:
        python-version: 3.12

    - name: setup-pytorch
      run: |
        python3 -m pip config set global.break-system-packages true
        pip3 install --user torch --index-url https://download.pytorch.org/whl/cpu
        pip3 install --user numpy packaging

    - name: build-pnnx
      run: |
        cd tools/pnnx
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=Release ..
        cmake --build . --config Release -j 4

    # The test step is skipped on Windows; the build above still runs there.
    - name: quick-test
      if: matrix.os != 'windows-latest'
      run: |
        cd tools/pnnx
        cd build && ctest -C Release --output-on-failure -R test_nn_Conv

  # Builds libtorch, torchvision and protobuf+onnxruntime from source on a
  # self-hosted runner (each one skipped when a cache provides its install
  # directory), then builds the pnnx binary against them and uploads it as the
  # 'pnnx' artifact.
  build:
    runs-on: [self-hosted, linux, ubuntu25]

    steps:
    - uses: actions/checkout@v6

    # First cache layer: maxnowack/local-cache lookups for the three install
    # directories.
    - name: local-cache-libtorch
      id: local-cache-libtorch
      uses: maxnowack/local-cache@v2
      with:
        path: libtorch-${{ env.LIBTORCH_VERSION }}-install
        key: libtorch-${{ env.LIBTORCH_VERSION }}-linux-install-${{ env.CACHE_DATE }}

    - name: local-cache-torchvision
      id: local-cache-torchvision
      uses: maxnowack/local-cache@v2
      with:
        path: torchvision-${{ env.TORCHVISION_VERSION }}-install
        key: torchvision-${{ env.TORCHVISION_VERSION }}-linux-install-${{ env.CACHE_DATE }}

    - name: local-cache-onnxruntime
      id: local-cache-onnxruntime
      uses: maxnowack/local-cache@v2
      with:
        path: onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install
        key: onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-linux-install-${{ env.CACHE_DATE }}

    # Second cache layer: actions/cache with the same paths and keys. These
    # steps have no 'if', so they run whether or not the local cache hit.
    # NOTE(review): pinned to actions/cache@v4 while the test job uses
    # actions/cache/restore@v5 - confirm whether the difference is intentional.
    - name: cache-libtorch
      id: cache-libtorch
      uses: actions/cache@v4
      with:
        path: libtorch-${{ env.LIBTORCH_VERSION }}-install
        key: libtorch-${{ env.LIBTORCH_VERSION }}-linux-install-${{ env.CACHE_DATE }}

    - name: cache-torchvision
      id: cache-torchvision
      uses: actions/cache@v4
      with:
        path: torchvision-${{ env.TORCHVISION_VERSION }}-install
        key: torchvision-${{ env.TORCHVISION_VERSION }}-linux-install-${{ env.CACHE_DATE }}

    - name: cache-onnxruntime
      id: cache-onnxruntime
      uses: actions/cache@v4
      with:
        path: onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install
        key: onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-linux-install-${{ env.CACHE_DATE }}

    # Check out the pnnx/pnnx repository (source of the .patch files applied
    # below) only when at least one dependency missed both cache layers.
    - name: pnnx-patches
      if: (steps.local-cache-libtorch.outputs.cache-hit != 'true' && steps.cache-libtorch.outputs.cache-hit != 'true') || (steps.local-cache-torchvision.outputs.cache-hit != 'true' && steps.cache-torchvision.outputs.cache-hit != 'true') || (steps.local-cache-onnxruntime.outputs.cache-hit != 'true' && steps.cache-onnxruntime.outputs.cache-hit != 'true')
      uses: actions/checkout@v6
      with:
        repository: pnnx/pnnx
        path: pnnx-patches

    # Build a static, MinSizeRel libtorch from the release tarball with five
    # patches applied; installs (stripped) into the cached install directory.
    - name: libtorch
      if: steps.local-cache-libtorch.outputs.cache-hit != 'true' && steps.cache-libtorch.outputs.cache-hit != 'true'
      run: |
        wget -q https://github.com/pytorch/pytorch/releases/download/v${{ env.LIBTORCH_VERSION }}/pytorch-v${{ env.LIBTORCH_VERSION }}.tar.gz
        tar -xf pytorch-v${{ env.LIBTORCH_VERSION }}.tar.gz
        cd pytorch-v${{ env.LIBTORCH_VERSION }}
        pip3 install -r requirements.txt --break-system-packages
        patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/pytorch-v${{ env.LIBTORCH_VERSION }}-fix-mobile-build.patch
        patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/pytorch-v${{ env.LIBTORCH_VERSION }}-no-link-system-lib.patch
        patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/pytorch-v${{ env.LIBTORCH_VERSION }}-fix-eigen-build.patch
        patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/pytorch-v${{ env.LIBTORCH_VERSION }}-fix-link-local-sleef.patch
        patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/pytorch-v${{ env.LIBTORCH_VERSION }}-revert-nativert-api.patch
        mkdir -p build && cd build
        cmake -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/libtorch-${{ env.LIBTORCH_VERSION }}-install \
            -DCMAKE_BUILD_TYPE=MinSizeRel \
            -DBUILD_SHARED_LIBS=OFF \
            -DCMAKE_POLICY_VERSION_MINIMUM=3.5 \
            -DBUILD_CUSTOM_PROTOBUF=OFF \
            -DBUILD_LITE_INTERPRETER=OFF \
            -DBUILD_PYTHON=OFF \
            -DINTERN_BUILD_MOBILE=ON \
            -DINTERN_DISABLE_AUTOGRAD=ON \
            -DINTERN_DISABLE_ONNX=ON \
            -DUSE_CUDA=OFF \
            -DUSE_DISTRIBUTED=OFF \
            -DUSE_ITT=OFF \
            -DUSE_KINETO=OFF \
            -DUSE_LITE_INTERPRETER_PROFILER=OFF \
            -DUSE_MKLDNN=OFF \
            -DUSE_MPS=OFF \
            -DUSE_NUMPY=OFF \
            -DUSE_OPENMP=OFF \
            -DUSE_SOURCE_DEBUG_ON_MOBILE=OFF \
            -DUSE_XNNPACK=OFF \
            -DBUILD_TEST=OFF \
            -DATEN_NO_TEST=ON \
            ..
        cmake --build . -j 8
        cmake --build . -j 8 --target install/strip

    # Build torchvision (two patches, PNG/JPEG off) against the libtorch
    # install directory.
    - name: torchvision
      if: steps.local-cache-torchvision.outputs.cache-hit != 'true' && steps.cache-torchvision.outputs.cache-hit != 'true'
      run: |
        wget -q https://github.com/pytorch/vision/archive/v${{ env.TORCHVISION_VERSION }}.zip -O vision-${{ env.TORCHVISION_VERSION }}.zip
        unzip -q vision-${{ env.TORCHVISION_VERSION }}.zip
        cd vision-${{ env.TORCHVISION_VERSION }}
        patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/vision-${{ env.TORCHVISION_VERSION }}-ops-only.patch
        patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/vision-${{ env.TORCHVISION_VERSION }}-no-cuda-version.patch
        mkdir -p build && cd build
        cmake -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/torchvision-${{ env.TORCHVISION_VERSION }}-install \
            -DTorch_DIR=$GITHUB_WORKSPACE/libtorch-${{ env.LIBTORCH_VERSION }}-install/share/cmake/Torch \
            -DCMAKE_BUILD_TYPE=MinSizeRel \
            -DWITH_PNG=OFF \
            -DWITH_JPEG=OFF ..
        cmake --build . -j 8
        cmake --build . -j 8 --target install/strip

    # Build protobuf and then onnxruntime (three patches); both are installed
    # into the same onnxruntime install directory.
    - name: onnxruntime
      if: steps.local-cache-onnxruntime.outputs.cache-hit != 'true' && steps.cache-onnxruntime.outputs.cache-hit != 'true'
      run: |
        wget -q https://github.com/protocolbuffers/protobuf/archive/v${{ env.PROTOBUF_VERSION }}.zip -O protobuf-${{ env.PROTOBUF_VERSION }}.zip
        unzip -q protobuf-${{ env.PROTOBUF_VERSION }}.zip
        cd protobuf-${{ env.PROTOBUF_VERSION }}
        mkdir -p build2 && cd build2
        cmake -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install \
            -Dprotobuf_BUILD_TESTS=OFF \
            -DCMAKE_BUILD_TYPE=MinSizeRel \
            -DCMAKE_POSITION_INDEPENDENT_CODE=ON ..
        cmake --build . -j 8
        cmake --build . -j 8 --target install/strip

        cd ../../
        wget -q https://github.com/microsoft/onnxruntime/archive/v${{ env.ONNXRUNTIME_VERSION }}.zip -O onnxruntime-${{ env.ONNXRUNTIME_VERSION }}.zip
        unzip -q onnxruntime-${{ env.ONNXRUNTIME_VERSION }}.zip
        cd onnxruntime-${{ env.ONNXRUNTIME_VERSION }}
        patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-less-mlas-features.patch
        patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-monolithic-static-library.patch
        patch -p1 -i $GITHUB_WORKSPACE/pnnx-patches/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-use-clog.patch
        mkdir -p build2 && cd build2
        cmake -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install \
            -DCMAKE_BUILD_TYPE=MinSizeRel \
            -Donnxruntime_USE_FULL_PROTOBUF=ON \
            -Donnxruntime_BUILD_SHARED_LIB=ON \
            -Donnxruntime_BUILD_UNIT_TESTS=OFF \
            -Donnxruntime_ENABLE_CPUINFO=OFF \
            -Donnxruntime_DISABLE_CONTRIB_OPS=ON \
            -Donnxruntime_DISABLE_ML_OPS=ON \
            -Donnxruntime_DISABLE_SPARSE_TENSORS=ON \
            -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
            --compile-no-warning-as-error ../cmake
        cmake --build . -j 8
        cmake --build . -j 8 --target install/strip

    # Build pnnx against the three install directories and strip the binary.
    - name: pnnx
      run: |
        cd tools/pnnx
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=MinSizeRel \
            -DTorch_INSTALL_DIR=$GITHUB_WORKSPACE/libtorch-${{ env.LIBTORCH_VERSION }}-install \
            -DTorchVision_INSTALL_DIR=$GITHUB_WORKSPACE/torchvision-${{ env.TORCHVISION_VERSION }}-install \
            -Donnxruntime_INSTALL_DIR=$GITHUB_WORKSPACE/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install \
            -Dprotobuf_DIR=$GITHUB_WORKSPACE/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install/lib/cmake/protobuf ..
        cmake --build . -j 8
        strip src/pnnx

    # Publish the binary; the test job downloads the artifact named 'pnnx'.
    - name: upload-pnnx
      uses: actions/upload-artifact@v5
      with:
        name: pnnx
        path: tools/pnnx/build/src/pnnx
        compression-level: 9

  # Runs the pnnx test-suite once per matrix entry. Each entry pins a Python
  # version together with matching numpy / opencv / torch / torchvision /
  # torchaudio / transformers versions. The pnnx binary itself is not rebuilt
  # here: the build tree is only configured with cmake and the binary is
  # downloaded from the 'pnnx' artifact produced by the build job.
  test:
    needs: [build]
    runs-on: [self-hosted, linux, ubuntu25]
    strategy:
      # Keep testing the remaining torch versions when one entry fails.
      fail-fast: false
      matrix:
        include:
          - { python: '3.8',  numpy: '1.24.4', opencv: '4.5.*',  torch: '1.8.1',  torchvision: '0.9.1',  torchaudio: '0.8.1',      transformers: '4.52.1' }
          - { python: '3.8',  numpy: '1.24.4', opencv: '4.5.*',  torch: '1.9.1',  torchvision: '0.10.1', torchaudio: '0.9.1',      transformers: '4.52.1' }
          - { python: '3.8',  numpy: '1.24.4', opencv: '4.6.*',  torch: '1.10.0', torchvision: '0.11.1', torchaudio: '0.10.0+cpu', transformers: '4.52.1' }
          - { python: '3.9',  numpy: '1.26.4', opencv: '4.6.*',  torch: '1.11.0', torchvision: '0.12.0', torchaudio: '0.11.0+cpu', transformers: '4.52.1' }
          - { python: '3.9',  numpy: '1.26.4', opencv: '4.7.*',  torch: '1.12.0', torchvision: '0.13.0', torchaudio: '0.12.0+cpu', transformers: '4.52.1' }
          - { python: '3.10', numpy: '1.26.4', opencv: '4.7.*',  torch: '1.13.0', torchvision: '0.14.0', torchaudio: '0.13.0+cpu', transformers: '4.52.1' }
          - { python: '3.10', numpy: '1.26.4', opencv: '4.8.*',  torch: '2.0.0',  torchvision: '0.15.1', torchaudio: '2.0.0+cpu',  transformers: '4.52.1' }
          - { python: '3.10', numpy: '1.26.4', opencv: '4.8.*',  torch: '2.1.0',  torchvision: '0.16.0', torchaudio: '2.1.0+cpu',  transformers: '4.52.1' }
          - { python: '3.11', numpy: '1.26.4', opencv: '4.9.*',  torch: '2.2.1',  torchvision: '0.17.1', torchaudio: '2.2.1+cpu',  transformers: '4.52.1' }
          - { python: '3.11', numpy: '1.26.4', opencv: '4.9.*',  torch: '2.3.0',  torchvision: '0.18.0', torchaudio: '2.3.0+cpu',  transformers: '4.52.1' }
          - { python: '3.11', numpy: '2.2.5',  opencv: '4.10.*', torch: '2.4.0',  torchvision: '0.19.0', torchaudio: '2.4.0+cpu',  transformers: '4.52.1' }
          - { python: '3.12', numpy: '2.2.5',  opencv: '4.10.*', torch: '2.5.0',  torchvision: '0.20.0', torchaudio: '2.5.0+cpu',  transformers: '4.52.1' }
          - { python: '3.12', numpy: '2.2.5',  opencv: '4.11.*', torch: '2.6.0',  torchvision: '0.21.0', torchaudio: '2.6.0+cpu',  transformers: '4.52.1' }
          - { python: '3.12', numpy: '2.2.5',  opencv: '4.11.*', torch: '2.7.0',  torchvision: '0.22.0', torchaudio: '2.7.0+cpu',  transformers: '4.52.1' }
          - { python: '3.13', numpy: '2.2.5',  opencv: '4.12.*', torch: '2.8.0',  torchvision: '0.23.0', torchaudio: '2.8.0+cpu',  transformers: '4.56.2' }
          - { python: '3.13', numpy: '2.2.5',  opencv: '4.12.*', torch: '2.9.0',  torchvision: '0.24.0', torchaudio: '2.9.0+cpu',  transformers: '4.56.2' }
          - { python: '3.13', numpy: '2.2.5',  opencv: '4.12.*', torch: '2.10.0', torchvision: '0.25.0', torchaudio: '2.10.0+cpu', transformers: '4.56.2' }

    name: test-${{ matrix.torch }}-py${{ matrix.python }}

    env:
      # pip --user installs land in a per-Python-version directory inside the
      # workspace.
      PYTHONUSERBASE: ${{ github.workspace }}/python-${{ matrix.python }}

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true

    # First cache layer: maxnowack/local-cache lookups for the three install
    # directories produced by the build job.
    - name: local-cache-libtorch
      id: local-cache-libtorch
      uses: maxnowack/local-cache@v2
      with:
        path: libtorch-${{ env.LIBTORCH_VERSION }}-install
        key: libtorch-${{ env.LIBTORCH_VERSION }}-linux-install-${{ env.CACHE_DATE }}

    - name: local-cache-torchvision
      id: local-cache-torchvision
      uses: maxnowack/local-cache@v2
      with:
        path: torchvision-${{ env.TORCHVISION_VERSION }}-install
        key: torchvision-${{ env.TORCHVISION_VERSION }}-linux-install-${{ env.CACHE_DATE }}

    - name: local-cache-onnxruntime
      id: local-cache-onnxruntime
      uses: maxnowack/local-cache@v2
      with:
        path: onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install
        key: onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-linux-install-${{ env.CACHE_DATE }}

    # Fallback: restore from actions/cache only when the local cache missed.
    # fail-on-cache-miss stops the job if the install directory is not in
    # either cache.
    - name: cache-libtorch
      if: steps.local-cache-libtorch.outputs.cache-hit != 'true'
      id: cache-libtorch
      uses: actions/cache/restore@v5
      with:
        path: libtorch-${{ env.LIBTORCH_VERSION }}-install
        key: libtorch-${{ env.LIBTORCH_VERSION }}-linux-install-${{ env.CACHE_DATE }}
        fail-on-cache-miss: true

    - name: cache-torchvision
      if: steps.local-cache-torchvision.outputs.cache-hit != 'true'
      id: cache-torchvision
      uses: actions/cache/restore@v5
      with:
        path: torchvision-${{ env.TORCHVISION_VERSION }}-install
        key: torchvision-${{ env.TORCHVISION_VERSION }}-linux-install-${{ env.CACHE_DATE }}
        fail-on-cache-miss: true

    - name: cache-onnxruntime
      if: steps.local-cache-onnxruntime.outputs.cache-hit != 'true'
      id: cache-onnxruntime
      uses: actions/cache/restore@v5
      with:
        path: onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install
        key: onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-linux-install-${{ env.CACHE_DATE }}
        fail-on-cache-miss: true

    - uses: actions/setup-python@v6
      with:
        python-version: ${{ matrix.python }}

    # Install the Python packages pinned by the matrix entry (torch wheels come
    # from the PyTorch CPU wheel index).
    - name: setup-pytorch
      run: |
        export PATH=${{ env.PYTHONUSERBASE }}/bin:$PATH
        pip3 install --user pytest wheel twine requests einops numpy==${{ matrix.numpy }} opencv-python==${{ matrix.opencv }}
        pip3 install --user torch==${{ matrix.torch }}+cpu torchvision==${{ matrix.torchvision }}+cpu torchaudio==${{ matrix.torchaudio }} --index-url https://download.pytorch.org/whl/cpu
        pip3 install --user onnx onnxscript onnxruntime
        pip3 install --user "transformers<=${{ matrix.transformers }}" diffusers "safetensors<=0.6.2"

    # Clear the executable-stack flag on libtorch_cpu.so for the Python 3.8 and
    # 3.9 entries. Both tools are tried and failures are ignored.
    # The condition is written as one plain expression on purpose: wrapping
    # only the operands in expression delimiters turns the whole value into a
    # non-empty string, which is always truthy, so the step would run for
    # every matrix entry regardless of the Python version.
    - name: setup-pytorch-execstack-or-patchelf
      if: matrix.python == '3.8' || matrix.python == '3.9'
      run: |
        execstack -c ${{ env.PYTHONUSERBASE }}/lib/python${{ matrix.python }}/site-packages/torch/lib/libtorch_cpu.so || true
        patchelf --clear-execstack ${{ env.PYTHONUSERBASE }}/lib/python${{ matrix.python }}/site-packages/torch/lib/libtorch_cpu.so || true

    # Build and install the ncnn Python package from the repository root.
    - name: python-ncnn
      run: |
        export CMAKE_BUILD_PARALLEL_LEVEL=8
        pip3 install --user . --verbose

    # Configure-only: generates the pnnx build tree (and its ctest metadata)
    # without compiling anything.
    - name: pnnx
      run: |
        cd tools/pnnx
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=Release \
            -DTorch_INSTALL_DIR=$GITHUB_WORKSPACE/libtorch-${{ env.LIBTORCH_VERSION }}-install \
            -DTorchVision_INSTALL_DIR=$GITHUB_WORKSPACE/torchvision-${{ env.TORCHVISION_VERSION }}-install \
            -Donnxruntime_INSTALL_DIR=$GITHUB_WORKSPACE/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install \
            -Dprotobuf_DIR=$GITHUB_WORKSPACE/onnxruntime-${{ env.ONNXRUNTIME_VERSION }}-install/lib/cmake/protobuf ..

    # Drop the prebuilt binary from the build job into the configured tree.
    - name: download-pnnx
      uses: actions/download-artifact@v8
      with:
        name: pnnx
        path: tools/pnnx/build/src

    # Run ctest with OpenMP and MKL limited to a single thread each.
    - name: test
      run: |
        export PATH=${{ env.PYTHONUSERBASE }}/bin:$PATH
        chmod +x tools/pnnx/build/src/pnnx
        export OMP_THREAD_LIMIT=1
        export OMP_NUM_THREADS=1
        export MKL_NUM_THREADS=1
        export MKL_ENABLE_INSTRUCTIONS=SSE4_2
        cd tools/pnnx/build
        ctest --output-on-failure -j 8

    # Install the pnnx Python package around the prebuilt binary and run its
    # pytest suite.
    - name: python-pnnx
      run: |
        export PATH=${{ env.PYTHONUSERBASE }}/bin:$PATH
        export PNNX_WHEEL_WITHOUT_BUILD=ON
        cd tools/pnnx/python
        cp ../build/src/pnnx pnnx/
        python3 setup.py install --user
        pytest tests


================================================
FILE: .github/workflows/python.yml
================================================
# CI workflow for the ncnn Python package.
name: python
on:
  # Run for pushes to master that touch this workflow, the CMake build system,
  # core/layer sources, the x86/vulkan layer sources, the python directory or
  # the 'glslang' path.
  push:
    branches: [master]
    paths:
    - '.github/workflows/python.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'python/**'
    - 'glslang'
  # Same path filter for pull requests that target master.
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/python.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'python/**'
    - 'glslang'
# One run per ref: a newer run in the same group cancels the one in progress.
concurrency:
  group: python-${{ github.ref }}
  cancel-in-progress: true
# Workflow-wide environment shared by every job and step.
env:
  DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer
  MAC_DEPLOYMENT_TARGET: '11.0'
  ENABLE_BITCODE: OFF
  ENABLE_ARC: OFF
  ENABLE_VISIBILITY: OFF
  CMAKE_BUILD_PARALLEL_LEVEL: 4
  UseMultiToolTask: true
# The workflow token only gets read access to repository contents.
permissions:
  contents: read

jobs:
  # Builds ncnn with the Python binding enabled and runs the pytest suite for
  # every combination of runner OS and Python version in the matrix.
  build:
    strategy:
      matrix:
        os: [ubuntu-latest, macos-15-intel, windows-latest]
        # Versions are quoted so YAML keeps them as strings: an unquoted entry
        # is parsed as a float, which would silently turn 3.10 into 3.1.
        python-version: ['3.9', '3.12']

    runs-on: ${{ matrix.os }}

    steps:
    # Submodules are checked out because the build uses sources vendored that way.
    - uses: actions/checkout@v6
      with:
        submodules: true

    # SwiftShader is only prepared on the Ubuntu runner. The `test` step for
    # Ubuntu points VK_ICD_FILENAMES at the ICD json stored in swiftshader-install.
    # The install directory is cached under a fixed, date-stamped key.
    - name: cache-swiftshader
      if: matrix.os == 'ubuntu-latest'
      id: cache-swiftshader
      uses: actions/cache@v5
      with:
        path: swiftshader-install
        key: swiftshader-linux-install-20240622
    # On a cache miss, check out google/swiftshader at a pinned commit.
    - name: checkout-swiftshader
      if: matrix.os == 'ubuntu-latest' && steps.cache-swiftshader.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
      with:
        repository: google/swiftshader
        path: swiftshader
        ref: de870ac7518fe2b6bb651ecc22fc36647cf7b986
    # Initialise SwiftShader's submodules recursively; the third_party/git-hooks
    # submodule is excluded by setting its update mode to `none`.
    - name: checkout-swiftshader-submodules
      if: matrix.os == 'ubuntu-latest' && steps.cache-swiftshader.outputs.cache-hit != 'true'
      run: |
        cd swiftshader
        git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive
    # Configure a Release build with only the Vulkan target enabled (EGL, GLES,
    # PVR, tests and ASTC are switched off), build it, and copy the contents of
    # the build's Linux/ directory into $GITHUB_WORKSPACE/swiftshader-install,
    # which is the path the cache step saves.
    - name: swiftshader
      if: matrix.os == 'ubuntu-latest' && steps.cache-swiftshader.outputs.cache-hit != 'true'
      run: |
        cd swiftshader
        mkdir -p build; cd build
        cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
        cmake --build . -j $(nproc)
        mkdir $GITHUB_WORKSPACE/swiftshader-install
        cp Linux/* $GITHUB_WORKSPACE/swiftshader-install

    # Install the interpreter selected by the matrix.
    - name: setup-python
      uses: actions/setup-python@v6
      with:
        python-version: ${{ matrix.python-version }}
    # Upgrade pip and install the packaging / test tools used by later steps.
    - name: install-deps
      run: |
        python -m pip install --upgrade pip
        pip install pytest setuptools wheel twine importlib-metadata

    # Exactly one of the three `build` steps runs per matrix entry, chosen by OS.
    # Ubuntu: compile with clang, Vulkan and the Python binding enabled,
    # tools and examples disabled.
    - name: build
      if: matrix.os == 'ubuntu-latest'
      env:
        CC: clang
        CXX: clang++
      run: |
        mkdir build && cd build
        cmake -DNCNN_VULKAN=ON -DNCNN_PYTHON=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . -j $(nproc)
    # macOS (Intel): configure through toolchains/ios.toolchain.cmake with
    # PLATFORM=MAC / x86_64 and the workflow-level deployment settings;
    # Vulkan is disabled.
    - name: build
      if: matrix.os == 'macos-15-intel'
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DPLATFORM=MAC -DARCHS="x86_64" \
            -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET -DENABLE_BITCODE=$ENABLE_BITCODE -DENABLE_ARC=$ENABLE_ARC -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
            -DNCNN_VULKAN=OFF -DNCNN_PYTHON=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . -j 4
    # Windows: x64 build with the v142 toolset, Release configuration;
    # Vulkan is disabled.
    - name: build
      if: matrix.os == 'windows-latest'
      run: |
        mkdir build; cd build
        cmake -T v142,host=x64 -A x64 -DNCNN_VULKAN=OFF -DNCNN_PYTHON=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . --config Release -j 4
    # Install the Python package from the python/ directory with pip.
    - name: build-python
      run: cd python && pip install .
    # Ubuntu tests select the SwiftShader Vulkan ICD via VK_ICD_FILENAMES
    # before running pytest.
    - name: test
      if: matrix.os == 'ubuntu-latest'
      run: |
        export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
        cd python && pytest tests
    # All other runners run pytest directly (their builds have Vulkan off).
    - name: test
      if: matrix.os != 'ubuntu-latest'
      run: |
        cd python && pytest tests


================================================
FILE: .github/workflows/release-python.yml
================================================
# Builds the source distribution and the binary wheels, then publishes them
# through the upload_all job.
name: release-python

# Runs on manual dispatch and for every pushed tag.
on:
  workflow_dispatch:
  push:
    tags: ["*"]

# Environment shared by all jobs.
# NOTE(review): CIBW_SKIP "cp3??t-*" presumably excludes the free-threaded
# CPython build identifiers - confirm against the cibuildwheel options docs.
env:
  DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer
  MAC_DEPLOYMENT_TARGET: "11.0"
  ENABLE_BITCODE: OFF
  ENABLE_ARC: OFF
  ENABLE_VISIBILITY: OFF
  CIBW_SKIP: 'cp3??t-*'

jobs:
  # Produces the source distribution, validates its metadata with twine and
  # uploads the tarball as the `sdist` artifact.
  build_sdist:
    runs-on: ubuntu-latest
    name: Build SDist
    steps:
      # Submodules are included in the checkout the sdist is built from.
      - uses: actions/checkout@v6
        with:
          submodules: true

      - uses: actions/setup-python@v6
        with:
          python-version: "3.x"

      - name: Install deps
        run: python -m pip install twine build

      # `-s` asks `build` for the source distribution only.
      - name: Build SDist
        run: python -m build -s

      - name: Check metadata
        run: twine check dist/*

      - uses: actions/upload-artifact@v6
        with:
          path: dist/*.tar.gz
          name: sdist

  # Builds binary wheels with cibuildwheel, one job per matrix entry.
  build_wheels:
    name: ${{ matrix.arch }} ${{ matrix.build_id }} on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
    strategy:
      # Let the remaining entries finish even if one of them fails.
      fail-fast: false
      matrix:
        # Per entry:
        #   os       - runner label; also used by the step conditions below
        #   arch     - forwarded to CIBW_ARCHS_LINUX / _WINDOWS / _MACOS
        #   build    - forwarded to CIBW_BUILD as the build selector
        #   build_id - suffix used in the job name and uploaded artifact name
        include:
          - { os: ubuntu-24.04,     arch: x86_64,     build: 'cp*-manylinux*', build_id: cp-manylinux }
          - { os: ubuntu-24.04,     arch: x86_64,     build: 'cp*-musllinux*', build_id: cp-musllinux }
          - { os: ubuntu-24.04,     arch: x86_64,     build: 'pp*',            build_id: pp           }
          - { os: ubuntu-24.04,     arch: i686,       build: 'cp*-manylinux*', build_id: cp-manylinux }
          - { os: ubuntu-24.04,     arch: i686,       build: 'cp*-musllinux*', build_id: cp-musllinux }
          - { os: ubuntu-24.04,     arch: i686,       build: 'pp*',            build_id: pp           }
          - { os: windows-2025,     arch: x86,        build: 'cp*',            build_id: cp           }
          - { os: windows-2025,     arch: AMD64,      build: 'cp*',            build_id: cp           }
          - { os: windows-2025,     arch: AMD64,      build: 'pp*',            build_id: pp           }
          - { os: windows-11-arm,   arch: ARM64,      build: 'cp*',            build_id: cp           }
          - { os: macos-15-intel,   arch: x86_64,     build: 'cp*',            build_id: cp           }
          - { os: macos-15,         arch: arm64,      build: 'cp*',            build_id: cp           }
          - { os: ubuntu-24.04-arm, arch: armv7l,     build: 'cp*-manylinux*', build_id: cp-manylinux }
          - { os: ubuntu-24.04-arm, arch: armv7l,     build: 'cp*-musllinux*', build_id: cp-musllinux }
          - { os: ubuntu-24.04-arm, arch: aarch64,    build: 'cp*-manylinux*', build_id: cp-manylinux }
          - { os: ubuntu-24.04-arm, arch: aarch64,    build: 'cp*-musllinux*', build_id: cp-musllinux }
          - { os: ubuntu-24.04-arm, arch: aarch64,    build: 'pp*',            build_id: pp           }

    env:
      # Version of the LLVM OpenMP sources downloaded by the macOS steps.
      OPENMP_VERSION: '18.1.2'
      # Shared cmake arguments for the macOS OpenMP builds. The value is pasted
      # verbatim into the `cmake ...` command lines of the openmp-build steps,
      # so every line, including the last one, ends with a shell line
      # continuation and the per-architecture arguments follow it.
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true

    # build wheels for ubuntu
    # Runs for the ubuntu-24.04 matrix entries (x86_64 and i686). The selected
    # architecture and build selector come from the matrix; wheels are written
    # to ./wheelhouse.
    - name: Build wheels for ubuntu
      if: matrix.os == 'ubuntu-24.04'
      uses: pypa/cibuildwheel@v3.3.1
      env:
        CIBW_ARCHS_LINUX: ${{ matrix.arch }}
        CIBW_BUILD: ${{ matrix.build }}
        CIBW_ENABLE: pypy
        CIBW_BUILD_VERBOSITY: 1
        CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4
      with:
        output-dir: wheelhouse

    # build wheels for ubuntu armv7l
    # Same as above on the ARM runner, with -mfpu=neon added to CFLAGS and
    # CXXFLAGS inside the build environment. CIBW_ENVIRONMENT is a multi-line
    # plain scalar, so YAML folds its two lines into one space-separated value.
    - name: Build wheels for ubuntu armv7l
      if: matrix.os == 'ubuntu-24.04-arm' && (matrix.arch == 'armv7l')
      uses: pypa/cibuildwheel@v3.3.1
      env:
        CIBW_ARCHS_LINUX: ${{ matrix.arch }}
        CIBW_BUILD: ${{ matrix.build }}
        CIBW_ENABLE: pypy
        CIBW_BUILD_VERBOSITY: 1
        CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4
          CFLAGS="-mfpu=neon" CXXFLAGS="-mfpu=neon"
      with:
        output-dir: wheelhouse

    # build wheels for ubuntu aarch64
    # aarch64 builds on the ARM runner with no extra compiler flags.
    - name: Build wheels for ubuntu aarch64
      if: matrix.os == 'ubuntu-24.04-arm' && (matrix.arch == 'aarch64')
      uses: pypa/cibuildwheel@v3.3.1
      env:
        CIBW_ARCHS_LINUX: ${{ matrix.arch }}
        CIBW_BUILD: ${{ matrix.build }}
        CIBW_ENABLE: pypy
        CIBW_BUILD_VERBOSITY: 1
        CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4
      with:
        output-dir: wheelhouse

    # build wheels for windows
    # x86 and AMD64 wheels on windows-2025. delvewheel is installed before each
    # build and used as the wheel repair command.
    - name: Build wheels for windows
      if: matrix.os == 'windows-2025' && (matrix.arch == 'AMD64' || matrix.arch == 'x86')
      uses: pypa/cibuildwheel@v3.3.1
      env:
        CIBW_ARCHS_WINDOWS: ${{ matrix.arch }}
        CIBW_BUILD: ${{ matrix.build }}
        CIBW_ENABLE: pypy
        CIBW_BUILD_VERBOSITY: 1
        CIBW_ENVIRONMENT_WINDOWS: CMAKE_BUILD_PARALLEL_LEVEL=4
        CIBW_BEFORE_BUILD: pip install delvewheel
        CIBW_REPAIR_WHEEL_COMMAND: delvewheel repair -w {dest_dir} {wheel}
      with:
        output-dir: wheelhouse

    # ARM64 wheels on windows-11-arm. The repair command differs from the one
    # above only by `--no-dll "msvcp140.dll;vcomp140.dll"`.
    # NOTE(review): presumably this keeps delvewheel from bundling those two
    # runtime DLLs - confirm against the delvewheel documentation.
    - name: Build wheels for windows ARM64
      if: matrix.os == 'windows-11-arm' && matrix.arch == 'ARM64'
      uses: pypa/cibuildwheel@v3.3.1
      env:
        CIBW_ARCHS_WINDOWS: ${{ matrix.arch }}
        CIBW_BUILD: ${{ matrix.build }}
        CIBW_ENABLE: pypy
        CIBW_BUILD_VERBOSITY: 1
        CIBW_ENVIRONMENT_WINDOWS: CMAKE_BUILD_PARALLEL_LEVEL=4
        CIBW_BEFORE_BUILD: pip install delvewheel
        CIBW_REPAIR_WHEEL_COMMAND: delvewheel repair -w {dest_dir} {wheel} --no-dll "msvcp140.dll;vcomp140.dll"
      with:
        output-dir: wheelhouse

    # build wheels for macos
    # The static OpenMP install is cached under a fixed, date-stamped key; the
    # download / build / merge steps below only run on a cache miss.
    - name: cache-openmp for macos
      if: matrix.os == 'macos-15-intel' || matrix.os == 'macos-15'
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-macos-install-20251004

    # Download the LLVM `cmake` and `openmp` source tarballs for OPENMP_VERSION,
    # move the shared cmake modules into the openmp tree, then apply two
    # patches fetched from the nihui/llvm-project fork.
    - name: openmp for macos
      if: (matrix.os == 'macos-15-intel' || matrix.os == 'macos-15') && steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch

    # Build and install libomp for x86_64 (PLATFORM=MAC) into build-x86_64/install.
    - name: openmp-build-x86_64 for macos
      if: (matrix.os == 'macos-15-intel' || matrix.os == 'macos-15') && steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-x86_64 && cd build-x86_64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install

    # Build and install libomp for arm64 (PLATFORM=MAC_ARM64) into build-arm64/install.
    - name: openmp-build-arm64 for macos
      if: (matrix.os == 'macos-15-intel' || matrix.os == 'macos-15') && steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC_ARM64 -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install

    # Assemble the cached directory: headers are taken from the x86_64 install,
    # and lipo merges the two single-architecture libomp.a files into one
    # universal archive.
    - name: openmp-merge-fat-library for macos
      if: (matrix.os == 'macos-15-intel' || matrix.os == 'macos-15') && steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a

    # Runs on every macOS entry, whether or not the cache was hit: copies the
    # OpenMP headers and libomp.a into the MacOSX SDK under $DEVELOPER_DIR.
    - name: install-openmp for macos
      if: matrix.os == 'macos-15-intel' || matrix.os == 'macos-15'
      run: |
        sudo cp $GITHUB_WORKSPACE/openmp-install/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include
        sudo cp $GITHUB_WORKSPACE/openmp-install/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib

    # Download the LunarG Vulkan SDK installer archive, unpack it and run the
    # installer unattended with $GITHUB_WORKSPACE/vulkansdk-macos-1.4.335.1 as
    # its root; the macOS wheel steps take libMoltenVK.dylib from that tree.
    # The URL is quoted because it contains `?`, which the shell would
    # otherwise treat as a filename-expansion pattern character.
    - name: vulkansdk for macos
      if: matrix.os == 'macos-15-intel' || matrix.os == 'macos-15'
      run: |
        wget -q "https://sdk.lunarg.com/sdk/download/1.4.335.1/mac/vulkansdk-macos-1.4.335.1.zip?Human=true" -O vulkansdk-macos-1.4.335.1.zip
        unzip vulkansdk-macos-1.4.335.1.zip
        sudo vulkansdk-macOS-1.4.335.1.app/Contents/MacOS/vulkansdk-macOS-1.4.335.1 --root $GITHUB_WORKSPACE/vulkansdk-macos-1.4.335.1 --accept-licenses --default-answer --confirm-command install

    # x86_64 wheels on the Intel runner. CIBW_ENVIRONMENT is one multi-line
    # plain scalar (YAML folds it into a single space-separated value) that
    # passes into the build environment: the iOS toolchain file with
    # PLATFORM=MAC, the deployment settings, the OpenMP flags and library
    # names, and the MoltenVK library from the Vulkan SDK installed earlier.
    - name: Build wheels for macos x86_64
      if: matrix.os == 'macos-15-intel' && matrix.arch == 'x86_64'
      uses: pypa/cibuildwheel@v3.3.1
      env:
        CIBW_ARCHS_MACOS: ${{ matrix.arch }}
        CIBW_BUILD: ${{ matrix.build }}
        CIBW_ENABLE: pypy
        CIBW_BUILD_VERBOSITY: 1
        CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4
          CMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/ios.toolchain.cmake PLATFORM=MAC ARCHS="x86_64"
          DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF
          OpenMP_C_FLAGS="-Xclang -fopenmp" OpenMP_CXX_FLAGS="-Xclang -fopenmp"
          OpenMP_C_LIB_NAMES="libomp" OpenMP_CXX_LIB_NAMES="libomp"
          OpenMP_libomp_LIBRARY="libomp.a"
          Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.4.335.1/macOS/lib/libMoltenVK.dylib
          MACOSX_DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET
      with:
        output-dir: wheelhouse

    # arm64 wheels on the macos-15 runner; identical to the x86_64 step except
    # for PLATFORM=MAC_ARM64 and ARCHS="arm64".
    - name: Build wheels for macos arm64
      if: matrix.os == 'macos-15' && matrix.arch == 'arm64'
      uses: pypa/cibuildwheel@v3.3.1
      env:
        CIBW_ARCHS_MACOS: ${{ matrix.arch }}
        CIBW_BUILD: ${{ matrix.build }}
        CIBW_ENABLE: pypy
        CIBW_BUILD_VERBOSITY: 1
        CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4
          CMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/toolchains/ios.toolchain.cmake PLATFORM=MAC_ARM64 ARCHS="arm64"
          DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET ENABLE_BITCODE=OFF ENABLE_ARC=OFF ENABLE_VISIBILITY=OFF
          OpenMP_C_FLAGS="-Xclang -fopenmp" OpenMP_CXX_FLAGS="-Xclang -fopenmp"
          OpenMP_C_LIB_NAMES="libomp" OpenMP_CXX_LIB_NAMES="libomp"
          OpenMP_libomp_LIBRARY="libomp.a"
          Vulkan_LIBRARY=$GITHUB_WORKSPACE/vulkansdk-macos-1.4.335.1/macOS/lib/libMoltenVK.dylib
          MACOSX_DEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET
      with:
        output-dir: wheelhouse

    # List the wheels that were produced; bash is forced so the same command
    # works on every runner OS in the matrix.
    - name: Show files
      shell: bash
      run: ls -lh wheelhouse

    # Fail the job if the build modified any tracked file in the checkout.
    - name: Verify clean directory
      shell: bash
      run: git diff --exit-code

    # One artifact per matrix entry, named after its os / arch / build_id.
    - name: Upload wheels
      uses: actions/upload-artifact@v6
      with:
        path: wheelhouse/*.whl
        name: wheels-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.build_id }}

  # Builds riscv64 CPython wheels under QEMU emulation on an x86 runner, one
  # job per (python version, libc flavour) pair.
  build_wheels_qemu_cp:
    runs-on: ubuntu-24.04
    name: ${{ matrix.arch }} ${{ matrix.build_cp }} ${{ matrix.build_sub }}

    strategy:
      fail-fast: false
      matrix:
        arch:
          - riscv64
        build_cp:
          - cp38
          - cp39
          - cp310
          - cp311
          - cp312
          - cp313
          - cp314
        build_sub:
          - manylinux
          - musllinux

    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
        with:
          platforms: all

      # The build selector is assembled from the matrix, e.g. cp312-manylinux*.
      # EXTRA_CMAKE_ARGS passes -DNCNN_XTHEADVECTOR=OFF into the build environment.
      - name: Build wheels with qemu
        uses: pypa/cibuildwheel@v3.3.1
        with:
          output-dir: wheelhouse
        env:
          CIBW_ARCHS_LINUX: ${{ matrix.arch }}
          CIBW_BUILD: ${{ matrix.build_cp }}-${{ matrix.build_sub }}*
          CIBW_BUILD_VERBOSITY: 1
          CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=4 EXTRA_CMAKE_ARGS="-DNCNN_XTHEADVECTOR=OFF"

      - name: Show files
        shell: bash
        run: ls -lh wheelhouse

      # Fail the job if the build modified any tracked file in the checkout.
      - name: Verify clean directory
        shell: bash
        run: git diff --exit-code

      - name: Upload wheels
        uses: actions/upload-artifact@v6
        with:
          path: wheelhouse/*.whl
          name: wheels_qemu_cp-${{ matrix.arch }}-${{ matrix.build_cp }}-${{ matrix.build_sub }}

  # Waits for the sdist and both wheel jobs, gathers every uploaded artifact
  # into a single dist/ directory and hands it to the PyPI publish action.
  upload_all:
    name: Upload
    runs-on: ubuntu-latest
    needs:
      - build_wheels
      - build_wheels_qemu_cp
      - build_sdist
    # This job does not need repository contents; it only works with artifacts.
    permissions:
      contents: none

    steps:
      # No artifact name is given, so all artifacts of the run are fetched;
      # merge-multiple places their files together directly under dist/.
      - uses: actions/download-artifact@v8
        with:
          merge-multiple: true
          path: dist

      - uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
          user: __token__


================================================
FILE: .github/workflows/release.yml
================================================
# Builds the release packages whenever a tag is pushed.
name: release

on:
  push:
    tags: ["*"]

# Default token scope for every job; individual jobs may narrow it further.
permissions:
  contents: read

# Environment shared by all jobs: the Xcode installation to use, the minimum
# OS version per Apple platform, the ENABLE_* switches forwarded to the iOS
# toolchain file, and the Emscripten version.
env:
  DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer
  IOS_DEPLOYMENT_TARGET: "13.0"
  MAC_DEPLOYMENT_TARGET: "11.0"
  MAC_CATALYST_DEPLOYMENT_TARGET: "13.1"
  WATCHOS_DEPLOYMENT_TARGET: "6.0"
  TVOS_DEPLOYMENT_TARGET: "11.0"
  VISIONOS_DEPLOYMENT_TARGET: "1.0"
  ENABLE_BITCODE: OFF
  ENABLE_ARC: OFF
  ENABLE_VISIBILITY: OFF
  EMSCRIPTEN_VERSION: 3.1.28

jobs:

  # Derives the release version from the pushed tag and exposes it as the
  # VERSION job output, which the packaging jobs read through `needs.setup`.
  setup:
    runs-on: ubuntu-latest
    permissions:
      contents: none
    outputs:
      VERSION: ${{ steps.get_version.outputs.VERSION }}
    steps:
      # Strips the leading "refs/tags/" from GITHUB_REF using bash substitution.
      - id: get_version
        name: get-version
        run: echo "VERSION=${GITHUB_REF/refs\/tags\//}" >> $GITHUB_OUTPUT

  # Packages the complete source tree, submodules included, into a zip file.
  full-source:
    runs-on: ubuntu-latest
    needs:
      - setup
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-full-source
    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true
      # The top-level .git directory is removed before archiving; the zip is
      # written to /tmp so the archive does not end up inside itself.
      - name: package
        run: |
          rm -rf .git
          rm -f /tmp/${{ env.PACKAGENAME }}.zip
          zip -9 -y -r /tmp/${{ env.PACKAGENAME }}.zip .
      - name: upload-zip
        uses: actions/upload-artifact@v6
        with:
          path: /tmp/${{ env.PACKAGENAME }}.zip
          name: ${{ env.PACKAGENAME }}

  # Builds the Linux release packages: static and shared variants on
  # Ubuntu 22.04 and 24.04, each zipped and uploaded as its own artifact.
  ubuntu:
    needs: [setup]
    strategy:
      matrix:
        # shared-lib is forwarded to -DNCNN_SHARED_LIB; id becomes the package suffix.
        opt:
          - { shared-lib: OFF, os: ubuntu-22.04, id: ubuntu-2204        }
          - { shared-lib: OFF, os: ubuntu-24.04, id: ubuntu-2404        }
          - { shared-lib: ON,  os: ubuntu-22.04, id: ubuntu-2204-shared }
          - { shared-lib: ON,  os: ubuntu-24.04, id: ubuntu-2404-shared }
    runs-on: ${{ matrix.opt.os }}
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }}
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    # Refresh the package index before installing: the index baked into the
    # runner image can be stale, which makes `apt-get install` fail with 404s
    # for packages that have since been superseded on the mirrors.
    - name: apt
      run: |
        sudo apt-get update
        sudo apt-get install -y libprotobuf-dev protobuf-compiler
    # Release build with Vulkan and the tools enabled, examples and benchmark
    # disabled; installed (stripped) into build/install.
    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
            -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TOOLS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    # Copy the install tree into a directory named after the package and zip it.
    - name: package
      run: |
        rm -rf ${{ env.PACKAGENAME }}
        mkdir -p ${{ env.PACKAGENAME }}
        cp -a build/install/* ${{ env.PACKAGENAME }}
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }}
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds a static, universal (x86_64 + arm64) libomp for macOS and uploads
  # it as the `openmp-macos` artifact, which the `macos` job downloads.
  openmp-macos:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      # Shared cmake arguments, pasted verbatim into the `cmake ...` command
      # lines of the build steps; every line, including the last, ends with a
      # shell line continuation so the per-architecture arguments follow it.
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \
    steps:
    # The finished install tree is cached; on a hit every step up to `upload`
    # is skipped, including the repository checkout.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-macos-release-18.1.2-20251004
    # The checkout provides toolchains/ios.toolchain.cmake for the builds below.
    - name: checkout
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
    # Download the LLVM `cmake` and `openmp` source tarballs, move the shared
    # cmake modules into the openmp tree, then apply two patches fetched from
    # the nihui/llvm-project fork.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    # x86_64 slice (PLATFORM=MAC), installed stripped into build-x86_64/install.
    - name: build-x86_64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-x86_64 && cd build-x86_64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # arm64 slice (PLATFORM=MAC_ARM64), installed stripped into build-arm64/install.
    - name: build-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC_ARM64 -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Recreate openmp-install from scratch: headers come from the x86_64
    # install, and lipo merges both libomp.a slices into one universal archive.
    - name: merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        rm -rf $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a
    # Runs unconditionally, so the artifact exists whether the tree came from
    # the cache or from a fresh build.
    - name: upload
      uses: actions/upload-artifact@v6
      with:
        name: openmp-macos
        path: openmp-install

  # Builds ncnn for macOS (x86_64 and arm64), wraps the results into
  # .framework bundles and uploads them as one zip; runs once without and once
  # with Vulkan.
  macos:
    needs: [setup, openmp-macos]
    strategy:
      matrix:
        # vulkan is forwarded to -DNCNN_VULKAN and selects the packaging steps;
        # id becomes the package name suffix.
        opt:
          - { vulkan: OFF, id: macos        }
          - { vulkan: ON,  id: macos-vulkan }
    runs-on: macos-15-intel
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }}
      # Shared cmake arguments, pasted verbatim into the `cmake ...` command
      # lines of the build steps; every line, including the last, ends with a
      # shell line continuation so the per-architecture arguments follow it.
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$MAC_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
        -DNCNN_BUILD_TOOLS=OFF \
        -DNCNN_BUILD_EXAMPLES=OFF \
        -DNCNN_BUILD_BENCHMARK=OFF \
        -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    # Fetch the universal libomp produced by the openmp-macos job.
    - name: download-openmp-macos
      uses: actions/download-artifact@v8
      with:
        name: openmp-macos
        path: openmp-macos
    # Copy the OpenMP headers and libomp.a into the MacOSX SDK under $DEVELOPER_DIR.
    - name: install-openmp
      run: |
        sudo cp openmp-macos/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include
        sudo cp openmp-macos/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib
    # x86_64 slice, installed stripped into build-x86_64/install.
    - name: build-x86_64
      run: |
        mkdir build-x86_64 && cd build-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # arm64 slice, installed stripped into build-arm64/install.
    - name: build-arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_ARM64 -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Assemble openmp.framework: a Versions/A tree with the usual Current /
    # Headers / Resources / binary symlinks, the downloaded libomp.a as the
    # framework binary, and an Info.plist generated from the repository's
    # Info.plist template by substituting name, identifier and version.
    - name: package-openmp
      run: |
        rm -rf openmp.framework
        mkdir -p openmp.framework/Versions/A/Headers
        mkdir -p openmp.framework/Versions/A/Resources
        ln -s A openmp.framework/Versions/Current
        ln -s Versions/Current/Headers openmp.framework/Headers
        ln -s Versions/Current/Resources openmp.framework/Resources
        ln -s Versions/Current/openmp openmp.framework/openmp
        cp openmp-macos/lib/libomp.a openmp.framework/Versions/A/openmp
        cp -a openmp-macos/include/* openmp.framework/Versions/A/Headers/
        sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist
    # Vulkan builds only: for each architecture, libtool combines libglslang.a
    # and libSPIRV.a into a single static archive, then lipo merges the two
    # architectures into the glslang.framework binary. Headers are taken from
    # the x86_64 install.
    - name: package-glslang
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -rf glslang.framework
        mkdir -p glslang.framework/Versions/A/Headers
        mkdir -p glslang.framework/Versions/A/Resources
        ln -s A glslang.framework/Versions/Current
        ln -s Versions/Current/Headers glslang.framework/Headers
        ln -s Versions/Current/Resources glslang.framework/Resources
        ln -s Versions/Current/glslang glslang.framework/glslang
        libtool -static \
            build-x86_64/install/lib/libglslang.a \
            build-x86_64/install/lib/libSPIRV.a \
            -o build-x86_64/install/lib/libglslang_combined.a
        libtool -static \
            build-arm64/install/lib/libglslang.a \
            build-arm64/install/lib/libSPIRV.a \
            -o build-arm64/install/lib/libglslang_combined.a
        lipo -create build-x86_64/install/lib/libglslang_combined.a build-arm64/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang
        cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/
        sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist
    # Assemble ncnn.framework: lipo merges the two libncnn.a slices into the
    # framework binary; headers are taken from the x86_64 install.
    - name: package-ncnn
      run: |
        rm -rf ncnn.framework
        mkdir -p ncnn.framework/Versions/A/Headers
        mkdir -p ncnn.framework/Versions/A/Resources
        ln -s A ncnn.framework/Versions/Current
        ln -s Versions/Current/Headers ncnn.framework/Headers
        ln -s Versions/Current/Resources ncnn.framework/Resources
        ln -s Versions/Current/ncnn ncnn.framework/ncnn
        lipo -create build-x86_64/install/lib/libncnn.a build-arm64/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn
        cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/
        sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
    # Exactly one of the two `package` steps runs: the Vulkan variant also
    # ships glslang.framework. `zip -y` stores the framework symlinks as links.
    - name: package
      if: matrix.opt.vulkan == 'OFF'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework
    - name: package
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds a static arm64 libomp for iOS and uploads it as the `openmp-ios`
  # artifact, which the `ios` job downloads.
  openmp-ios:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      # Shared cmake arguments, pasted verbatim into the `cmake ...` command
      # line of the build step; every line, including the last, ends with a
      # shell line continuation so the platform arguments follow it.
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \
    steps:
    # The finished install tree is cached; on a hit every step up to `upload`
    # is skipped, including the repository checkout.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-ios-release-18.1.2-20251004
    # The checkout provides toolchains/ios.toolchain.cmake for the build below.
    - name: checkout
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
    # Download the LLVM `cmake` and `openmp` source tarballs, move the shared
    # cmake modules into the openmp tree, then apply two patches fetched from
    # the nihui/llvm-project fork.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    # Single arm64 build (PLATFORM=OS64), installed stripped into build-arm64/install.
    - name: build-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=OS64 -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Despite the step name there is only one architecture here, so the
    # arm64 libomp.a is copied as-is instead of being merged with lipo.
    - name: merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        rm -rf $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/include $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/lib
        cp openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a $GITHUB_WORKSPACE/openmp-install/lib/libomp.a
    # Runs unconditionally, so the artifact exists whether the tree came from
    # the cache or from a fresh build.
    - name: upload
      uses: actions/upload-artifact@v6
      with:
        name: openmp-ios
        path: openmp-install

  # Builds ncnn for iOS devices (arm64 only) and packages it as frameworks.
  # The matrix produces two packages: one without and one with Vulkan.
  ios:
    needs: [setup, openmp-ios]
    strategy:
      matrix:
        opt:
          - { vulkan: OFF, id: ios        }
          - { vulkan: ON,  id: ios-vulkan }
    runs-on: macos-15-intel
    env:
      # Base name of the zip and of the uploaded artifact.
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }}
      # Shared cmake arguments. The value is pasted verbatim in front of the
      # per-architecture flags in the build step, which is why every line ends
      # with a backslash. The $... variables are not defined in this job
      # (NOTE(review): presumably workflow-level env; confirm).
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
        -DNCNN_BUILD_BENCHMARK=OFF \
        -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \

    steps:
    # Check out the repository including its git submodules.
    - uses: actions/checkout@v6
      with:
        submodules: true
    # Fetch the OpenMP headers and static library uploaded by the openmp-ios job.
    - name: download-openmp-ios
      uses: actions/download-artifact@v8
      with:
        name: openmp-ios
        path: openmp-ios
    # Copy the OpenMP headers and libomp.a into the iPhoneOS SDK of the active Xcode.
    - name: install-openmp
      run: |
        sudo cp openmp-ios/include/* $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include
        sudo cp openmp-ios/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib
    # Configure, build and install ncnn for arm64 devices into build-arm64/install.
    - name: build-arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=OS64 -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Assemble openmp.framework: real content lives in Versions/A, the top-level
    # entries are symlinks through Versions/Current. The static library is stored
    # under the framework name, and Info.plist is generated from the repository's
    # Info.plist template by substituting the __NAME__/__IDENTIFIER__/__VERSION__
    # placeholders.
    - name: package-openmp
      run: |
        rm -rf openmp.framework
        mkdir -p openmp.framework/Versions/A/Headers
        mkdir -p openmp.framework/Versions/A/Resources
        ln -s A openmp.framework/Versions/Current
        ln -s Versions/Current/Headers openmp.framework/Headers
        ln -s Versions/Current/Resources openmp.framework/Resources
        ln -s Versions/Current/openmp openmp.framework/openmp
        cp openmp-ios/lib/libomp.a openmp.framework/Versions/A/openmp
        cp -a openmp-ios/include/* openmp.framework/Versions/A/Headers/
        sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist
    # Vulkan variant only: assemble glslang.framework with the same layout.
    # libtool first combines libglslang.a and libSPIRV.a into a single static archive.
    - name: package-glslang
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -rf glslang.framework
        mkdir -p glslang.framework/Versions/A/Headers
        mkdir -p glslang.framework/Versions/A/Resources
        ln -s A glslang.framework/Versions/Current
        ln -s Versions/Current/Headers glslang.framework/Headers
        ln -s Versions/Current/Resources glslang.framework/Resources
        ln -s Versions/Current/glslang glslang.framework/glslang
        libtool -static \
            build-arm64/install/lib/libglslang.a \
            build-arm64/install/lib/libSPIRV.a \
            -o build-arm64/install/lib/libglslang_combined.a
        cp build-arm64/install/lib/libglslang_combined.a glslang.framework/Versions/A/glslang
        cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/
        sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist
    # Assemble ncnn.framework from the installed arm64 static library and headers.
    - name: package-ncnn
      run: |
        rm -rf ncnn.framework
        mkdir -p ncnn.framework/Versions/A/Headers
        mkdir -p ncnn.framework/Versions/A/Resources
        ln -s A ncnn.framework/Versions/Current
        ln -s Versions/Current/Headers ncnn.framework/Headers
        ln -s Versions/Current/Resources ncnn.framework/Resources
        ln -s Versions/Current/ncnn ncnn.framework/ncnn
        cp build-arm64/install/lib/libncnn.a ncnn.framework/Versions/A/ncnn
        cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/
        sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
    # Zip the frameworks; `-y` stores the framework symlinks as symlinks.
    # Non-Vulkan variant: openmp + ncnn only.
    - name: package
      if: matrix.opt.vulkan == 'OFF'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework
    # Vulkan variant: additionally ships glslang.framework.
    - name: package
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework
    # Publish the zip as an artifact named after the package.
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds a static LLVM OpenMP runtime for the iOS simulator (x86_64 + arm64),
  # merges both slices into one fat libomp.a and publishes it as an artifact.
  # The result is cached; the download/build steps are skipped on a cache hit.
  openmp-ios-simulator:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      # Shared cmake arguments, pasted verbatim in front of the per-architecture
      # flags in the build steps (hence the trailing backslashes). The $...
      # variables are not defined in this job
      # (NOTE(review): presumably workflow-level env; confirm).
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \
    steps:
    # Restore a previously built openmp-install directory. The key hard-codes the
    # OpenMP version and a date stamp, so it must be edited by hand to force a rebuild.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-ios-simulator-release-18.1.2-20251004
    # The repository is only needed for toolchains/ios.toolchain.cmake when building.
    - name: checkout
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
    # Download the LLVM cmake and openmp source tarballs, move the shared cmake
    # modules into the openmp tree, then apply two patches from nihui/llvm-project.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    # x86_64 simulator slice.
    - name: build-x86_64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-x86_64 && cd build-x86_64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR64 -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # arm64 simulator slice.
    - name: build-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATORARM64 -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Stage openmp-install: headers are taken from the x86_64 build, and lipo
    # merges both per-architecture libomp.a files into one fat static library.
    - name: merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        rm -rf $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a
    # Runs unconditionally, so the artifact is published from cache or fresh build.
    - name: upload
      uses: actions/upload-artifact@v6
      with:
        name: openmp-ios-simulator
        path: openmp-install

  # Builds ncnn for the iOS simulator (x86_64 + arm64), merges the slices into
  # fat libraries and packages them as frameworks. The matrix produces two
  # packages: one without and one with Vulkan.
  ios-simulator:
    needs: [setup, openmp-ios-simulator]
    strategy:
      matrix:
        opt:
          - { vulkan: OFF, id: ios-simulator        }
          - { vulkan: ON,  id: ios-simulator-vulkan }
    runs-on: macos-15-intel
    env:
      # Base name of the zip and of the uploaded artifact.
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }}
      # Shared cmake arguments, pasted verbatim in front of the per-architecture
      # flags in the build steps (hence the trailing backslashes). The $...
      # variables are not defined in this job
      # (NOTE(review): presumably workflow-level env; confirm).
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$IOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
        -DNCNN_BUILD_BENCHMARK=OFF \
        -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \

    steps:
    # Check out the repository including its git submodules.
    - uses: actions/checkout@v6
      with:
        submodules: true
    # Fetch the fat OpenMP library and headers uploaded by openmp-ios-simulator.
    - name: download-openmp-ios-simulator
      uses: actions/download-artifact@v8
      with:
        name: openmp-ios-simulator
        path: openmp-ios-simulator
    # Copy the OpenMP headers and libomp.a into the iPhoneSimulator SDK of the active Xcode.
    - name: install-openmp
      run: |
        sudo cp openmp-ios-simulator/include/* $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include
        sudo cp openmp-ios-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib
    # x86_64 simulator slice, installed into build-x86_64/install.
    - name: build-x86_64
      run: |
        mkdir build-x86_64 && cd build-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR64 -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # arm64 simulator slice, installed into build-arm64/install.
    - name: build-arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATORARM64 -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Assemble openmp.framework: real content lives in Versions/A, the top-level
    # entries are symlinks through Versions/Current. The downloaded libomp.a is
    # already fat, so it is copied unchanged. Info.plist is generated from the
    # repository's Info.plist template by substituting its placeholders.
    - name: package-openmp
      run: |
        rm -rf openmp.framework
        mkdir -p openmp.framework/Versions/A/Headers
        mkdir -p openmp.framework/Versions/A/Resources
        ln -s A openmp.framework/Versions/Current
        ln -s Versions/Current/Headers openmp.framework/Headers
        ln -s Versions/Current/Resources openmp.framework/Resources
        ln -s Versions/Current/openmp openmp.framework/openmp
        cp openmp-ios-simulator/lib/libomp.a openmp.framework/Versions/A/openmp
        cp -a openmp-ios-simulator/include/* openmp.framework/Versions/A/Headers/
        sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist
    # Vulkan variant only: per architecture, libtool combines libglslang.a and
    # libSPIRV.a into one archive; lipo then merges the two archives into the fat
    # framework binary. Headers are taken from the x86_64 install tree.
    - name: package-glslang
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -rf glslang.framework
        mkdir -p glslang.framework/Versions/A/Headers
        mkdir -p glslang.framework/Versions/A/Resources
        ln -s A glslang.framework/Versions/Current
        ln -s Versions/Current/Headers glslang.framework/Headers
        ln -s Versions/Current/Resources glslang.framework/Resources
        ln -s Versions/Current/glslang glslang.framework/glslang
        libtool -static \
            build-x86_64/install/lib/libglslang.a \
            build-x86_64/install/lib/libSPIRV.a \
            -o build-x86_64/install/lib/libglslang_combined.a
        libtool -static \
            build-arm64/install/lib/libglslang.a \
            build-arm64/install/lib/libSPIRV.a \
            -o build-arm64/install/lib/libglslang_combined.a
        lipo -create \
            build-x86_64/install/lib/libglslang_combined.a \
            build-arm64/install/lib/libglslang_combined.a \
            -o glslang.framework/Versions/A/glslang
        cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/
        sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist
    # Assemble ncnn.framework: lipo merges both libncnn.a slices into the fat
    # framework binary; headers are taken from the x86_64 install tree.
    - name: package-ncnn
      run: |
        rm -rf ncnn.framework
        mkdir -p ncnn.framework/Versions/A/Headers
        mkdir -p ncnn.framework/Versions/A/Resources
        ln -s A ncnn.framework/Versions/Current
        ln -s Versions/Current/Headers ncnn.framework/Headers
        ln -s Versions/Current/Resources ncnn.framework/Resources
        ln -s Versions/Current/ncnn ncnn.framework/ncnn
        lipo -create \
            build-x86_64/install/lib/libncnn.a \
            build-arm64/install/lib/libncnn.a \
            -o ncnn.framework/Versions/A/ncnn
        cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/
        sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
    # Zip the frameworks; `-y` stores the framework symlinks as symlinks.
    # Non-Vulkan variant: openmp + ncnn only.
    - name: package
      if: matrix.opt.vulkan == 'OFF'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework
    # Vulkan variant: additionally ships glslang.framework.
    - name: package
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework
    # Publish the zip as an artifact named after the package.
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds a static LLVM OpenMP runtime for Mac Catalyst (x86_64 + arm64),
  # merges both slices into one fat libomp.a and publishes it as an artifact.
  # The result is cached; the download/build steps are skipped on a cache hit.
  openmp-mac-catalyst:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      # Shared cmake arguments, pasted verbatim in front of the per-architecture
      # flags in the build steps (hence the trailing backslashes). The $...
      # variables are not defined in this job
      # (NOTE(review): presumably workflow-level env; confirm).
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$MAC_CATALYST_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \
    steps:
    # Restore a previously built openmp-install directory. The key hard-codes the
    # OpenMP version and a date stamp, so it must be edited by hand to force a rebuild.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-mac-catalyst-release-18.1.2-20251004
    # The repository is only needed for toolchains/ios.toolchain.cmake when building.
    - name: checkout
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
    # Download the LLVM cmake and openmp source tarballs, move the shared cmake
    # modules into the openmp tree, then apply two patches from nihui/llvm-project.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    # x86_64 Catalyst slice.
    - name: build-x86_64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-x86_64 && cd build-x86_64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # arm64 Catalyst slice; note the distinct MAC_CATALYST_ARM64 platform name.
    - name: build-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST_ARM64 -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Stage openmp-install: headers are taken from the x86_64 build, and lipo
    # merges both per-architecture libomp.a files into one fat static library.
    - name: merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        rm -rf $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a
    # Runs unconditionally, so the artifact is published from cache or fresh build.
    - name: upload
      uses: actions/upload-artifact@v6
      with:
        name: openmp-mac-catalyst
        path: openmp-install

  # Builds ncnn for Mac Catalyst (x86_64 + arm64), merges the slices into fat
  # libraries and packages them as frameworks. The matrix produces two packages:
  # one without and one with Vulkan.
  mac-catalyst:
    needs: [setup, openmp-mac-catalyst]
    strategy:
      matrix:
        opt:
          - { vulkan: OFF, id: mac-catalyst        }
          - { vulkan: ON,  id: mac-catalyst-vulkan }
    runs-on: macos-15-intel
    env:
      # Base name of the zip and of the uploaded artifact.
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }}
      # Shared cmake arguments, pasted verbatim in front of the per-architecture
      # flags in the build steps (hence the trailing backslashes). The $...
      # variables are not defined in this job
      # (NOTE(review): presumably workflow-level env; confirm).
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$MAC_CATALYST_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
        -DNCNN_BUILD_BENCHMARK=OFF \
        -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \

    steps:
    # Check out the repository including its git submodules.
    - uses: actions/checkout@v6
      with:
        submodules: true
    # Fetch the fat OpenMP library and headers uploaded by openmp-mac-catalyst.
    - name: download-openmp-mac-catalyst
      uses: actions/download-artifact@v8
      with:
        name: openmp-mac-catalyst
        path: openmp-mac-catalyst
    # Catalyst builds against the macOS SDK, so OpenMP is copied into MacOSX.sdk.
    - name: install-openmp
      run: |
        sudo cp openmp-mac-catalyst/include/* $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include
        sudo cp openmp-mac-catalyst/lib/libomp.a $DEVELOPER_DIR/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/lib
    # x86_64 Catalyst slice, installed into build-x86_64/install.
    - name: build-x86_64
      run: |
        mkdir build-x86_64 && cd build-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # arm64 Catalyst slice, installed into build-arm64/install. Uses the dedicated
    # MAC_CATALYST_ARM64 platform, matching the arm64 OpenMP slice built by the
    # openmp-mac-catalyst job, instead of the x86_64-oriented MAC_CATALYST one.
    - name: build-arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=MAC_CATALYST_ARM64 -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Assemble openmp.framework: real content lives in Versions/A, the top-level
    # entries are symlinks through Versions/Current. The downloaded libomp.a is
    # already fat, so it is copied unchanged. Info.plist is generated from the
    # repository's Info.plist template by substituting its placeholders.
    - name: package-openmp
      run: |
        rm -rf openmp.framework
        mkdir -p openmp.framework/Versions/A/Headers
        mkdir -p openmp.framework/Versions/A/Resources
        ln -s A openmp.framework/Versions/Current
        ln -s Versions/Current/Headers openmp.framework/Headers
        ln -s Versions/Current/Resources openmp.framework/Resources
        ln -s Versions/Current/openmp openmp.framework/openmp
        cp openmp-mac-catalyst/lib/libomp.a openmp.framework/Versions/A/openmp
        cp -a openmp-mac-catalyst/include/* openmp.framework/Versions/A/Headers/
        sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist
    # Vulkan variant only: per architecture, libtool combines libglslang.a and
    # libSPIRV.a into one archive; lipo then merges the two archives into the fat
    # framework binary. Headers are taken from the x86_64 install tree.
    - name: package-glslang
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -rf glslang.framework
        mkdir -p glslang.framework/Versions/A/Headers
        mkdir -p glslang.framework/Versions/A/Resources
        ln -s A glslang.framework/Versions/Current
        ln -s Versions/Current/Headers glslang.framework/Headers
        ln -s Versions/Current/Resources glslang.framework/Resources
        ln -s Versions/Current/glslang glslang.framework/glslang
        libtool -static \
            build-x86_64/install/lib/libglslang.a \
            build-x86_64/install/lib/libSPIRV.a \
            -o build-x86_64/install/lib/libglslang_combined.a
        libtool -static \
            build-arm64/install/lib/libglslang.a \
            build-arm64/install/lib/libSPIRV.a \
            -o build-arm64/install/lib/libglslang_combined.a
        lipo -create \
            build-x86_64/install/lib/libglslang_combined.a \
            build-arm64/install/lib/libglslang_combined.a \
            -o glslang.framework/Versions/A/glslang
        cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/
        sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist
    # Assemble ncnn.framework: lipo merges both libncnn.a slices into the fat
    # framework binary; headers are taken from the x86_64 install tree.
    - name: package-ncnn
      run: |
        rm -rf ncnn.framework
        mkdir -p ncnn.framework/Versions/A/Headers
        mkdir -p ncnn.framework/Versions/A/Resources
        ln -s A ncnn.framework/Versions/Current
        ln -s Versions/Current/Headers ncnn.framework/Headers
        ln -s Versions/Current/Resources ncnn.framework/Resources
        ln -s Versions/Current/ncnn ncnn.framework/ncnn
        lipo -create \
            build-x86_64/install/lib/libncnn.a \
            build-arm64/install/lib/libncnn.a \
            -o ncnn.framework/Versions/A/ncnn
        cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/
        sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
    # Zip the frameworks; `-y` stores the framework symlinks as symlinks.
    # Non-Vulkan variant: openmp + ncnn only.
    - name: package
      if: matrix.opt.vulkan == 'OFF'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework
    # Vulkan variant: additionally ships glslang.framework.
    - name: package
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework
    # Publish the zip as an artifact named after the package.
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds a static LLVM OpenMP runtime for watchOS devices (armv7k + arm64_32),
  # merges both slices into one fat libomp.a and publishes it as an artifact.
  # The result is cached; the download/build steps are skipped on a cache hit.
  openmp-watchos:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      # Shared cmake arguments, pasted verbatim in front of the per-architecture
      # flags in the build steps (hence the trailing backslashes). The $...
      # variables are not defined in this job
      # (NOTE(review): presumably workflow-level env; confirm).
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$WATCHOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \
    steps:
    # Restore a previously built openmp-install directory. The key hard-codes the
    # OpenMP version and a date stamp, so it must be edited by hand to force a rebuild.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-watchos-release-18.1.2-20251004
    # The repository is only needed for toolchains/ios.toolchain.cmake when building.
    - name: checkout
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
    # Download the LLVM cmake and openmp source tarballs, move the shared cmake
    # modules into the openmp tree, then apply two patches from nihui/llvm-project.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    # armv7k device slice.
    - name: build-armv7k
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-armv7k && cd build-armv7k
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=WATCHOS -DARCHS="armv7k" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # arm64_32 device slice.
    - name: build-arm64_32
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64_32 && cd build-arm64_32
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=WATCHOS -DARCHS="arm64_32" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Stage openmp-install: headers are taken from the arm64_32 build, and lipo
    # merges both per-architecture libomp.a files into one fat static library.
    - name: merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        rm -rf $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-arm64_32/install/include $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-armv7k/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64_32/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a
    # Runs unconditionally, so the artifact is published from cache or fresh build.
    - name: upload
      uses: actions/upload-artifact@v6
      with:
        name: openmp-watchos
        path: openmp-install

  # Builds ncnn for watchOS devices (armv7k + arm64_32), merges the slices into
  # a fat library and packages it as frameworks. There is no matrix and no
  # NCNN_VULKAN option here, so a single package is produced.
  watchos:
    needs: [setup, openmp-watchos]
    runs-on: macos-15-intel
    env:
      # Base name of the zip and of the uploaded artifact.
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-watchos
      # Shared cmake arguments, pasted verbatim in front of the per-architecture
      # flags in the build steps (hence the trailing backslashes). The $...
      # variables are not defined in this job
      # (NOTE(review): presumably workflow-level env; confirm).
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$WATCHOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
        -DNCNN_BUILD_BENCHMARK=OFF \

    steps:
    # Plain checkout; unlike the Vulkan-capable jobs, submodules are not fetched.
    - uses: actions/checkout@v6
    # Fetch the fat OpenMP library and headers uploaded by openmp-watchos.
    - name: download-openmp-watchos
      uses: actions/download-artifact@v8
      with:
        name: openmp-watchos
        path: openmp-watchos
    # Copy the OpenMP headers and libomp.a into the WatchOS SDK of the active Xcode.
    - name: install-openmp
      run: |
        sudo cp openmp-watchos/include/* $DEVELOPER_DIR/Platforms/WatchOS.platform/Developer/SDKs/WatchOS.sdk/usr/include
        sudo cp openmp-watchos/lib/libomp.a $DEVELOPER_DIR/Platforms/WatchOS.platform/Developer/SDKs/WatchOS.sdk/usr/lib
    # armv7k device slice, installed into build-armv7k/install.
    - name: build-armv7k
      run: |
        mkdir build-armv7k && cd build-armv7k
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=WATCHOS -DARCHS="armv7k" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # arm64_32 device slice, installed into build-arm64_32/install.
    - name: build-arm64_32
      run: |
        mkdir build-arm64_32 && cd build-arm64_32
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=WATCHOS -DARCHS="arm64_32" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Assemble openmp.framework: real content lives in Versions/A, the top-level
    # entries are symlinks through Versions/Current. The downloaded libomp.a is
    # already fat, so it is copied unchanged. Info.plist is generated from the
    # repository's Info.plist template by substituting its placeholders.
    - name: package-openmp
      run: |
        rm -rf openmp.framework
        mkdir -p openmp.framework/Versions/A/Headers
        mkdir -p openmp.framework/Versions/A/Resources
        ln -s A openmp.framework/Versions/Current
        ln -s Versions/Current/Headers openmp.framework/Headers
        ln -s Versions/Current/Resources openmp.framework/Resources
        ln -s Versions/Current/openmp openmp.framework/openmp
        cp openmp-watchos/lib/libomp.a openmp.framework/Versions/A/openmp
        cp -a openmp-watchos/include/* openmp.framework/Versions/A/Headers/
        sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist
    # Assemble ncnn.framework (lipo merges both libncnn.a slices; headers come
    # from the arm64_32 install tree) and zip it together with openmp.framework.
    # `zip -y` stores the framework symlinks as symlinks.
    - name: package
      run: |
        rm -rf ncnn.framework
        mkdir -p ncnn.framework/Versions/A/Headers
        mkdir -p ncnn.framework/Versions/A/Resources
        ln -s A ncnn.framework/Versions/Current
        ln -s Versions/Current/Headers ncnn.framework/Headers
        ln -s Versions/Current/Resources ncnn.framework/Resources
        ln -s Versions/Current/ncnn ncnn.framework/ncnn
        lipo -create \
            build-armv7k/install/lib/libncnn.a \
            build-arm64_32/install/lib/libncnn.a \
            -o ncnn.framework/Versions/A/ncnn
        cp -a build-arm64_32/install/include/* ncnn.framework/Versions/A/Headers/
        sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework
    # Publish the zip as an artifact named after the package.
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds a static LLVM OpenMP runtime for the watchOS simulator (x86_64 + arm64),
  # merges both slices into one fat libomp.a and publishes it as an artifact.
  # The result is cached; the download/build steps are skipped on a cache hit.
  openmp-watchos-simulator:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      # Shared cmake arguments, pasted verbatim in front of the per-architecture
      # flags in the build steps (hence the trailing backslashes). The $...
      # variables are not defined in this job
      # (NOTE(review): presumably workflow-level env; confirm).
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$WATCHOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \
    steps:
    # Restore a previously built openmp-install directory. The key hard-codes the
    # OpenMP version and a date stamp, so it must be edited by hand to force a rebuild.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-watchos-simulator-release-18.1.2-20251004
    # The repository is only needed for toolchains/ios.toolchain.cmake when building.
    - name: checkout
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
    # Download the LLVM cmake and openmp source tarballs, move the shared cmake
    # modules into the openmp tree, then apply two patches from nihui/llvm-project.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    # x86_64 simulator slice.
    - name: build-x86_64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-x86_64 && cd build-x86_64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_WATCHOS -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # arm64 simulator slice. The same SIMULATOR_WATCHOS platform is used for both
    # slices; only ARCHS differs.
    - name: build-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_WATCHOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Stage openmp-install: headers are taken from the x86_64 build, and lipo
    # merges both per-architecture libomp.a files into one fat static library.
    - name: merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        rm -rf $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a
    # Runs unconditionally, so the artifact is published from cache or fresh build.
    - name: upload
      uses: actions/upload-artifact@v6
      with:
        name: openmp-watchos-simulator
        path: openmp-install

  # Builds ncnn for the watchOS simulator (x86_64 and arm64 slices) and ships
  # it, together with the prebuilt OpenMP runtime, as two frameworks in one zip.
  # Needs `setup` for the VERSION output and `openmp-watchos-simulator` for the
  # libomp artifact.
  watchos-simulator:
    needs: [setup, openmp-watchos-simulator]
    runs-on: macos-15-intel
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-watchos-simulator
      # Shared cmake arguments. Every line ends with a backslash so that, once
      # this text is interpolated into `cmake ... -DPLATFORM=...`, the shell
      # joins it with the arguments that follow on the same command.
      # The $NAME variables are expanded by the shell at run time; they are
      # defined outside this job (presumably workflow-level env - confirm).
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$WATCHOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
        -DNCNN_BUILD_BENCHMARK=OFF \

    steps:
    - uses: actions/checkout@v6
    # Fetch the fat libomp.a and headers produced by the openmp job.
    - name: download-openmp-watchos-simulator
      uses: actions/download-artifact@v8
      with:
        name: openmp-watchos-simulator
        path: openmp-watchos-simulator
    # Copy the OpenMP headers and static library into the WatchSimulator SDK
    # of the selected Xcode; presumably this is what lets the bare "libomp.a"
    # name in NCNN_CMAKE_OPTIONS resolve at link time - confirm.
    - name: install-openmp
      run: |
        sudo cp openmp-watchos-simulator/include/* $DEVELOPER_DIR/Platforms/WatchSimulator.platform/Developer/SDKs/WatchSimulator.sdk/usr/include
        sudo cp openmp-watchos-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/WatchSimulator.platform/Developer/SDKs/WatchSimulator.sdk/usr/lib
    # One out-of-tree build per architecture; each installs into its own
    # build-<arch>/install prefix.
    - name: build-x86_64
      run: |
        mkdir build-x86_64 && cd build-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_WATCHOS -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    - name: build-arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_WATCHOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Lay out openmp.framework (Versions/A plus the Current/Headers/Resources/
    # binary symlinks) around the downloaded libomp.a, and fill in the
    # repository's Info.plist template via sed.
    - name: package-openmp
      run: |
        rm -rf openmp.framework
        mkdir -p openmp.framework/Versions/A/Headers
        mkdir -p openmp.framework/Versions/A/Resources
        ln -s A openmp.framework/Versions/Current
        ln -s Versions/Current/Headers openmp.framework/Headers
        ln -s Versions/Current/Resources openmp.framework/Resources
        ln -s Versions/Current/openmp openmp.framework/openmp
        cp openmp-watchos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp
        cp -a openmp-watchos-simulator/include/* openmp.framework/Versions/A/Headers/
        sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist
    # Build ncnn.framework the same way: lipo merges the two per-arch static
    # libraries into one fat binary; headers come from the x86_64 install tree.
    # The zip uses -y so the framework symlinks are stored as links.
    - name: package
      run: |
        rm -rf ncnn.framework
        mkdir -p ncnn.framework/Versions/A/Headers
        mkdir -p ncnn.framework/Versions/A/Resources
        ln -s A ncnn.framework/Versions/Current
        ln -s Versions/Current/Headers ncnn.framework/Headers
        ln -s Versions/Current/Resources ncnn.framework/Resources
        ln -s Versions/Current/ncnn ncnn.framework/ncnn
        lipo -create \
            build-x86_64/install/lib/libncnn.a \
            build-arm64/install/lib/libncnn.a \
            -o ncnn.framework/Versions/A/ncnn
        cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/
        sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds a static LLVM OpenMP runtime (libomp.a) for tvOS devices as a fat
  # arm64 + arm64e library and publishes it as the `openmp-tvos` artifact.
  # The result is cached in `openmp-install`; on a cache hit every build step
  # is skipped and the restored directory is uploaded directly.
  openmp-tvos:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      # Shared cmake arguments. Every line ends with a backslash so that, once
      # interpolated into `cmake ... -DPLATFORM=...`, the shell joins it with
      # the arguments that follow. The toolchain path is relative to
      # openmp-<version>.src/build-<arch>, i.e. two levels below the checkout.
      # The $NAME variables are expanded by the shell at run time; they are
      # defined outside this job (presumably workflow-level env - confirm).
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \
    steps:
    # The key is derived from OPENMP_VERSION so that bumping the version can
    # never silently reuse a libomp built from the previous release. The
    # trailing date looks like a manual cache-buster - bump it when the build
    # recipe changes.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-tvos-release-${{ env.OPENMP_VERSION }}-20251004
    # The repository is only needed for toolchains/ios.toolchain.cmake.
    - name: checkout
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
    # Download the standalone openmp and cmake source tarballs of the LLVM
    # release, move the shared cmake modules into the openmp tree, then apply
    # two patches from the nihui/llvm-project fork.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    # One out-of-tree build per architecture, each installing into
    # build-<arch>/install.
    - name: build-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    - name: build-arm64e
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64e && cd build-arm64e
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64e" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Assemble openmp-install/{include,lib}: headers from the arm64 build,
    # libomp.a as a lipo merge of both slices. This is the cached directory.
    - name: merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        rm -rf $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/include $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64e/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a
    # Runs unconditionally: the directory is either freshly built or restored
    # from the cache.
    - name: upload
      uses: actions/upload-artifact@v6
      with:
        name: openmp-tvos
        path: openmp-install

  # Builds ncnn for tvOS devices (arm64 and arm64e slices), once without and
  # once with Vulkan, and ships each variant as a zip of frameworks.
  # Needs `setup` for the VERSION output and `openmp-tvos` for the libomp
  # artifact.
  tvos:
    needs: [setup, openmp-tvos]
    strategy:
      matrix:
        # `vulkan` feeds -DNCNN_VULKAN and gates the glslang packaging steps;
        # `id` becomes the package-name suffix.
        opt:
          - { vulkan: OFF, id: tvos        }
          - { vulkan: ON,  id: tvos-vulkan }
    runs-on: macos-15-intel
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }}
      # Shared cmake arguments. Every line ends with a backslash so that, once
      # this text is interpolated into `cmake ... -DPLATFORM=...`, the shell
      # joins it with the arguments that follow on the same command.
      # The $NAME variables are expanded by the shell at run time; they are
      # defined outside this job (presumably workflow-level env - confirm).
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
        -DNCNN_BUILD_BENCHMARK=OFF \
        -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \

    steps:
    # Submodules are checked out as well (presumably for the glslang sources
    # used by the Vulkan variant - confirm).
    - uses: actions/checkout@v6
      with:
        submodules: true
    # Fetch the fat libomp.a and headers produced by the openmp job.
    - name: download-openmp-tvos
      uses: actions/download-artifact@v8
      with:
        name: openmp-tvos
        path: openmp-tvos
    # Copy the OpenMP headers and static library into the AppleTVOS SDK of
    # the selected Xcode; presumably this is what lets the bare "libomp.a"
    # name in NCNN_CMAKE_OPTIONS resolve at link time - confirm.
    - name: install-openmp
      run: |
        sudo cp openmp-tvos/include/* $DEVELOPER_DIR/Platforms/AppleTVOS.platform/Developer/SDKs/AppleTVOS.sdk/usr/include
        sudo cp openmp-tvos/lib/libomp.a $DEVELOPER_DIR/Platforms/AppleTVOS.platform/Developer/SDKs/AppleTVOS.sdk/usr/lib
    # One out-of-tree build per architecture; each installs into its own
    # build-<arch>/install prefix.
    - name: build-arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    - name: build-arm64e
      run: |
        mkdir build-arm64e && cd build-arm64e
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64e" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Lay out openmp.framework (Versions/A plus the Current/Headers/Resources/
    # binary symlinks) around the downloaded libomp.a, and fill in the
    # repository's Info.plist template via sed.
    - name: package-openmp
      run: |
        rm -rf openmp.framework
        mkdir -p openmp.framework/Versions/A/Headers
        mkdir -p openmp.framework/Versions/A/Resources
        ln -s A openmp.framework/Versions/Current
        ln -s Versions/Current/Headers openmp.framework/Headers
        ln -s Versions/Current/Resources openmp.framework/Resources
        ln -s Versions/Current/openmp openmp.framework/openmp
        cp openmp-tvos/lib/libomp.a openmp.framework/Versions/A/openmp
        cp -a openmp-tvos/include/* openmp.framework/Versions/A/Headers/
        sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist
    # Vulkan variant only. Per architecture, libtool first combines
    # libglslang.a and libSPIRV.a into one static archive; lipo then merges
    # the two per-arch archives into the framework binary.
    - name: package-glslang
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -rf glslang.framework
        mkdir -p glslang.framework/Versions/A/Headers
        mkdir -p glslang.framework/Versions/A/Resources
        ln -s A glslang.framework/Versions/Current
        ln -s Versions/Current/Headers glslang.framework/Headers
        ln -s Versions/Current/Resources glslang.framework/Resources
        ln -s Versions/Current/glslang glslang.framework/glslang
        libtool -static \
            build-arm64/install/lib/libglslang.a \
            build-arm64/install/lib/libSPIRV.a \
            -o build-arm64/install/lib/libglslang_combined.a
        libtool -static \
            build-arm64e/install/lib/libglslang.a \
            build-arm64e/install/lib/libSPIRV.a \
            -o build-arm64e/install/lib/libglslang_combined.a
        lipo -create \
            build-arm64/install/lib/libglslang_combined.a \
            build-arm64e/install/lib/libglslang_combined.a \
            -o glslang.framework/Versions/A/glslang
        cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/
        sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist
    # ncnn.framework: fat binary from both slices, headers from the arm64
    # install tree (everything under install/include is copied).
    - name: package-ncnn
      run: |
        rm -rf ncnn.framework
        mkdir -p ncnn.framework/Versions/A/Headers
        mkdir -p ncnn.framework/Versions/A/Resources
        ln -s A ncnn.framework/Versions/Current
        ln -s Versions/Current/Headers ncnn.framework/Headers
        ln -s Versions/Current/Resources ncnn.framework/Resources
        ln -s Versions/Current/ncnn ncnn.framework/ncnn
        lipo -create \
            build-arm64/install/lib/libncnn.a \
            build-arm64e/install/lib/libncnn.a \
            -o ncnn.framework/Versions/A/ncnn
        cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/
        sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
    # Exactly one of the two `package` steps runs per matrix entry; they
    # differ only in whether glslang.framework is added to the zip.
    # zip -y stores the framework symlinks as links.
    - name: package
      if: matrix.opt.vulkan == 'OFF'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework
    - name: package
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds a static LLVM OpenMP runtime (libomp.a) for the tvOS simulator as a
  # fat x86_64 + arm64 library and publishes it as the `openmp-tvos-simulator`
  # artifact. The result is cached in `openmp-install`; on a cache hit every
  # build step is skipped and the restored directory is uploaded directly.
  openmp-tvos-simulator:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      # Shared cmake arguments. Every line ends with a backslash so that, once
      # interpolated into `cmake ... -DPLATFORM=...`, the shell joins it with
      # the arguments that follow. The toolchain path is relative to
      # openmp-<version>.src/build-<arch>, i.e. two levels below the checkout.
      # The $NAME variables are expanded by the shell at run time; they are
      # defined outside this job (presumably workflow-level env - confirm).
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \
    steps:
    # The key is derived from OPENMP_VERSION so that bumping the version can
    # never silently reuse a libomp built from the previous release. The
    # trailing date looks like a manual cache-buster - bump it when the build
    # recipe changes.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-tvos-simulator-release-${{ env.OPENMP_VERSION }}-20251004
    # The repository is only needed for toolchains/ios.toolchain.cmake.
    - name: checkout
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
    # Download the standalone openmp and cmake source tarballs of the LLVM
    # release, move the shared cmake modules into the openmp tree, then apply
    # two patches from the nihui/llvm-project fork.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    # One out-of-tree build per architecture, each installing into
    # build-<arch>/install.
    - name: build-x86_64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-x86_64 && cd build-x86_64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    - name: build-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Assemble openmp-install/{include,lib}: headers from the x86_64 build,
    # libomp.a as a lipo merge of both slices. This is the cached directory.
    - name: merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        rm -rf $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a
    # Runs unconditionally: the directory is either freshly built or restored
    # from the cache.
    - name: upload
      uses: actions/upload-artifact@v6
      with:
        name: openmp-tvos-simulator
        path: openmp-install

  # Builds ncnn for the tvOS simulator (x86_64 and arm64 slices), once without
  # and once with Vulkan, and ships each variant as a zip of frameworks.
  # Needs `setup` for the VERSION output and `openmp-tvos-simulator` for the
  # libomp artifact.
  tvos-simulator:
    needs: [setup, openmp-tvos-simulator]
    strategy:
      matrix:
        # `vulkan` feeds -DNCNN_VULKAN and gates the glslang packaging steps;
        # `id` becomes the package-name suffix.
        opt:
          - { vulkan: OFF, id: tvos-simulator        }
          - { vulkan: ON,  id: tvos-simulator-vulkan }
    runs-on: macos-15-intel
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }}
      # Shared cmake arguments. Every line ends with a backslash so that, once
      # this text is interpolated into `cmake ... -DPLATFORM=...`, the shell
      # joins it with the arguments that follow on the same command.
      # The $NAME variables are expanded by the shell at run time; they are
      # defined outside this job (presumably workflow-level env - confirm).
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
        -DNCNN_BUILD_BENCHMARK=OFF \
        -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \

    steps:
    # Submodules are checked out as well (presumably for the glslang sources
    # used by the Vulkan variant - confirm).
    - uses: actions/checkout@v6
      with:
        submodules: true
    # Fetch the fat libomp.a and headers produced by the openmp job.
    - name: download-openmp-tvos-simulator
      uses: actions/download-artifact@v8
      with:
        name: openmp-tvos-simulator
        path: openmp-tvos-simulator
    # Copy the OpenMP headers and static library into the AppleTVSimulator SDK
    # of the selected Xcode; presumably this is what lets the bare "libomp.a"
    # name in NCNN_CMAKE_OPTIONS resolve at link time - confirm.
    - name: install-openmp
      run: |
        sudo cp openmp-tvos-simulator/include/* $DEVELOPER_DIR/Platforms/AppleTVSimulator.platform/Developer/SDKs/AppleTVSimulator.sdk/usr/include
        sudo cp openmp-tvos-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/AppleTVSimulator.platform/Developer/SDKs/AppleTVSimulator.sdk/usr/lib
    # One out-of-tree build per architecture; each installs into its own
    # build-<arch>/install prefix.
    - name: build-x86_64
      run: |
        mkdir build-x86_64 && cd build-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    - name: build-arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Lay out openmp.framework (Versions/A plus the Current/Headers/Resources/
    # binary symlinks) around the downloaded libomp.a, and fill in the
    # repository's Info.plist template via sed.
    - name: package-openmp
      run: |
        rm -rf openmp.framework
        mkdir -p openmp.framework/Versions/A/Headers
        mkdir -p openmp.framework/Versions/A/Resources
        ln -s A openmp.framework/Versions/Current
        ln -s Versions/Current/Headers openmp.framework/Headers
        ln -s Versions/Current/Resources openmp.framework/Resources
        ln -s Versions/Current/openmp openmp.framework/openmp
        cp openmp-tvos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp
        cp -a openmp-tvos-simulator/include/* openmp.framework/Versions/A/Headers/
        sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist
    # Vulkan variant only. Per architecture, libtool first combines
    # libglslang.a and libSPIRV.a into one static archive; lipo then merges
    # the two per-arch archives into the framework binary.
    - name: package-glslang
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -rf glslang.framework
        mkdir -p glslang.framework/Versions/A/Headers
        mkdir -p glslang.framework/Versions/A/Resources
        ln -s A glslang.framework/Versions/Current
        ln -s Versions/Current/Headers glslang.framework/Headers
        ln -s Versions/Current/Resources glslang.framework/Resources
        ln -s Versions/Current/glslang glslang.framework/glslang
        libtool -static \
            build-x86_64/install/lib/libglslang.a \
            build-x86_64/install/lib/libSPIRV.a \
            -o build-x86_64/install/lib/libglslang_combined.a
        libtool -static \
            build-arm64/install/lib/libglslang.a \
            build-arm64/install/lib/libSPIRV.a \
            -o build-arm64/install/lib/libglslang_combined.a
        lipo -create \
            build-x86_64/install/lib/libglslang_combined.a \
            build-arm64/install/lib/libglslang_combined.a \
            -o glslang.framework/Versions/A/glslang
        cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/
        sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist
    # ncnn.framework: fat binary from both slices, headers from the x86_64
    # install tree (everything under install/include is copied).
    - name: package-ncnn
      run: |
        rm -rf ncnn.framework
        mkdir -p ncnn.framework/Versions/A/Headers
        mkdir -p ncnn.framework/Versions/A/Resources
        ln -s A ncnn.framework/Versions/Current
        ln -s Versions/Current/Headers ncnn.framework/Headers
        ln -s Versions/Current/Resources ncnn.framework/Resources
        ln -s Versions/Current/ncnn ncnn.framework/ncnn
        lipo -create \
            build-x86_64/install/lib/libncnn.a \
            build-arm64/install/lib/libncnn.a \
            -o ncnn.framework/Versions/A/ncnn
        cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/
        sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
    # Exactly one of the two `package` steps runs per matrix entry; they
    # differ only in whether glslang.framework is added to the zip.
    # zip -y stores the framework symlinks as links.
    - name: package
      if: matrix.opt.vulkan == 'OFF'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework
    - name: package
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds a static LLVM OpenMP runtime (libomp.a) for visionOS devices
  # (arm64 only) and publishes it as the `openmp-visionos` artifact.
  # The result is cached in `openmp-install`; on a cache hit every build step
  # is skipped and the restored directory is uploaded directly.
  openmp-visionos:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      # Shared cmake arguments. Every line ends with a backslash so that, once
      # interpolated into `cmake ... -DPLATFORM=...`, the shell joins it with
      # the arguments that follow. The toolchain path is relative to
      # openmp-<version>.src/build-<arch>, i.e. two levels below the checkout.
      # The $NAME variables are expanded by the shell at run time; they are
      # defined outside this job (presumably workflow-level env - confirm).
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \
    steps:
    # The key is derived from OPENMP_VERSION so that bumping the version can
    # never silently reuse a libomp built from the previous release. The
    # trailing date looks like a manual cache-buster - bump it when the build
    # recipe changes.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-visionos-release-${{ env.OPENMP_VERSION }}-20251004
    # The repository is only needed for toolchains/ios.toolchain.cmake.
    - name: checkout
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
    # Download the standalone openmp and cmake source tarballs of the LLVM
    # release, move the shared cmake modules into the openmp tree, then apply
    # two patches from the nihui/llvm-project fork.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    # Single out-of-tree build, installing into build-arm64/install.
    - name: build-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=VISIONOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Despite the step name there is nothing to merge here: with a single
    # architecture the arm64 libomp.a is copied as-is into
    # openmp-install/lib next to the headers. This is the cached directory.
    - name: merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        rm -rf $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/include $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/lib
        cp openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a $GITHUB_WORKSPACE/openmp-install/lib/libomp.a
    # Runs unconditionally: the directory is either freshly built or restored
    # from the cache.
    - name: upload
      uses: actions/upload-artifact@v6
      with:
        name: openmp-visionos
        path: openmp-install

  # Builds ncnn for visionOS devices (arm64 only), once without and once with
  # Vulkan, and ships each variant as a zip of frameworks.
  # Needs `setup` for the VERSION output and `openmp-visionos` for the libomp
  # artifact.
  visionos:
    needs: [setup, openmp-visionos]
    strategy:
      matrix:
        # `vulkan` feeds -DNCNN_VULKAN and gates the glslang packaging steps;
        # `id` becomes the package-name suffix.
        opt:
          - { vulkan: OFF, id: visionos        }
          - { vulkan: ON,  id: visionos-vulkan }
    runs-on: macos-15-intel
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }}
      # Shared cmake arguments. Every line ends with a backslash so that, once
      # this text is interpolated into `cmake ... -DPLATFORM=...`, the shell
      # joins it with the arguments that follow on the same command.
      # The $NAME variables are expanded by the shell at run time; they are
      # defined outside this job (presumably workflow-level env - confirm).
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
        -DNCNN_BUILD_BENCHMARK=OFF \
        -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \

    steps:
    # Submodules are checked out as well (presumably for the glslang sources
    # used by the Vulkan variant - confirm).
    - uses: actions/checkout@v6
      with:
        submodules: true
    # Fetch the libomp.a and headers produced by the openmp job.
    - name: download-openmp-visionos
      uses: actions/download-artifact@v8
      with:
        name: openmp-visionos
        path: openmp-visionos
    # Copy the OpenMP headers and static library into the XROS SDK of the
    # selected Xcode; presumably this is what lets the bare "libomp.a" name in
    # NCNN_CMAKE_OPTIONS resolve at link time - confirm.
    - name: install-openmp
      run: |
        sudo cp openmp-visionos/include/* $DEVELOPER_DIR/Platforms/XROS.platform/Developer/SDKs/XROS.sdk/usr/include
        sudo cp openmp-visionos/lib/libomp.a $DEVELOPER_DIR/Platforms/XROS.platform/Developer/SDKs/XROS.sdk/usr/lib
    # Single out-of-tree build, installing into build-arm64/install.
    - name: build-arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=VISIONOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Lay out openmp.framework (Versions/A plus the Current/Headers/Resources/
    # binary symlinks) around the downloaded libomp.a, and fill in the
    # repository's Info.plist template via sed.
    - name: package-openmp
      run: |
        rm -rf openmp.framework
        mkdir -p openmp.framework/Versions/A/Headers
        mkdir -p openmp.framework/Versions/A/Resources
        ln -s A openmp.framework/Versions/Current
        ln -s Versions/Current/Headers openmp.framework/Headers
        ln -s Versions/Current/Resources openmp.framework/Resources
        ln -s Versions/Current/openmp openmp.framework/openmp
        cp openmp-visionos/lib/libomp.a openmp.framework/Versions/A/openmp
        cp -a openmp-visionos/include/* openmp.framework/Versions/A/Headers/
        sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist
    # Vulkan variant only. libtool combines libglslang.a and libSPIRV.a into
    # one static archive, which is copied in as the framework binary; with a
    # single architecture no lipo merge is needed.
    - name: package-glslang
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -rf glslang.framework
        mkdir -p glslang.framework/Versions/A/Headers
        mkdir -p glslang.framework/Versions/A/Resources
        ln -s A glslang.framework/Versions/Current
        ln -s Versions/Current/Headers glslang.framework/Headers
        ln -s Versions/Current/Resources glslang.framework/Resources
        ln -s Versions/Current/glslang glslang.framework/glslang
        libtool -static \
            build-arm64/install/lib/libglslang.a \
            build-arm64/install/lib/libSPIRV.a \
            -o build-arm64/install/lib/libglslang_combined.a
        cp build-arm64/install/lib/libglslang_combined.a glslang.framework/Versions/A/glslang
        cp -a build-arm64/install/include/glslang glslang.framework/Versions/A/Headers/
        sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist
    # ncnn.framework: the arm64 static library is copied in directly, headers
    # come from the same install tree (everything under install/include).
    - name: package-ncnn
      run: |
        rm -rf ncnn.framework
        mkdir -p ncnn.framework/Versions/A/Headers
        mkdir -p ncnn.framework/Versions/A/Resources
        ln -s A ncnn.framework/Versions/Current
        ln -s Versions/Current/Headers ncnn.framework/Headers
        ln -s Versions/Current/Resources ncnn.framework/Resources
        ln -s Versions/Current/ncnn ncnn.framework/ncnn
        cp build-arm64/install/lib/libncnn.a ncnn.framework/Versions/A/ncnn
        cp -a build-arm64/install/include/* ncnn.framework/Versions/A/Headers/
        sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
    # Exactly one of the two `package` steps runs per matrix entry; they
    # differ only in whether glslang.framework is added to the zip.
    # zip -y stores the framework symlinks as links.
    - name: package
      if: matrix.opt.vulkan == 'OFF'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework
    - name: package
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds a static LLVM OpenMP runtime (libomp.a) for the visionOS simulator
  # as a fat x86_64 + arm64 library and publishes it as the
  # `openmp-visionos-simulator` artifact. The result is cached in
  # `openmp-install`; on a cache hit every build step is skipped and the
  # restored directory is uploaded directly.
  openmp-visionos-simulator:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      # Shared cmake arguments. Every line ends with a backslash so that, once
      # interpolated into `cmake ... -DPLATFORM=...`, the shell joins it with
      # the arguments that follow. The toolchain path is relative to
      # openmp-<version>.src/build-<arch>, i.e. two levels below the checkout.
      # The $NAME variables are expanded by the shell at run time; they are
      # defined outside this job (presumably workflow-level env - confirm).
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \
    steps:
    # The key is derived from OPENMP_VERSION so that bumping the version can
    # never silently reuse a libomp built from the previous release. The
    # trailing date looks like a manual cache-buster - bump it when the build
    # recipe changes.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-visionos-simulator-release-${{ env.OPENMP_VERSION }}-20251004
    # The repository is only needed for toolchains/ios.toolchain.cmake.
    - name: checkout
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
    # Download the standalone openmp and cmake source tarballs of the LLVM
    # release, move the shared cmake modules into the openmp tree, then apply
    # two patches from the nihui/llvm-project fork.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    # One out-of-tree build per architecture, each installing into
    # build-<arch>/install.
    - name: build-x86_64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-x86_64 && cd build-x86_64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_VISIONOS -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    - name: build-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_VISIONOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Assemble openmp-install/{include,lib}: headers from the x86_64 build,
    # libomp.a as a lipo merge of both slices. This is the cached directory.
    - name: merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        rm -rf $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/include $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-x86_64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/lib/libomp.a
    # Runs unconditionally: the directory is either freshly built or restored
    # from the cache.
    - name: upload
      uses: actions/upload-artifact@v6
      with:
        name: openmp-visionos-simulator
        path: openmp-install

  # Builds ncnn for the visionOS simulator platform (SIMULATOR_VISIONOS) for
  # x86_64 and arm64, once without and once with Vulkan, then wraps the results
  # into versioned .framework bundles and uploads them as a single zip.
  visionos-simulator:
    # `setup` provides the VERSION output used in names below;
    # `openmp-visionos-simulator` produces the libomp artifact downloaded here.
    needs: [setup, openmp-visionos-simulator]
    strategy:
      matrix:
        opt:
          - { vulkan: OFF, id: visionos-simulator        }
          - { vulkan: ON,  id: visionos-simulator-vulkan }
    runs-on: macos-15-intel
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }}
      # Shared cmake arguments. Every line ends with a backslash so that, once
      # this value is spliced into a `cmake ...` command, the arguments written
      # after the splice continue the same shell command.
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
        -DNCNN_BUILD_BENCHMARK=OFF \
        -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    # Fetch the fat libomp.a and headers built by the openmp-visionos-simulator job.
    - name: download-openmp-visionos-simulator
      uses: actions/download-artifact@v8
      with:
        name: openmp-visionos-simulator
        path: openmp-visionos-simulator
    # Copy the OpenMP headers and static library into the XRSimulator SDK
    # (presumably so the OpenMP_* cmake options above resolve - confirm).
    - name: install-openmp
      run: |
        sudo cp openmp-visionos-simulator/include/* $DEVELOPER_DIR/Platforms/XRSimulator.platform/Developer/SDKs/XRSimulator.sdk/usr/include
        sudo cp openmp-visionos-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/XRSimulator.platform/Developer/SDKs/XRSimulator.sdk/usr/lib
    # One out-of-tree build per architecture; each installs into <build-dir>/install.
    - name: build-x86_64
      run: |
        mkdir build-x86_64 && cd build-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_VISIONOS -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    - name: build-arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_VISIONOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install/strip
    # Lay out openmp.framework (Versions/A + Current symlinks) around the
    # downloaded libomp.a; Info.plist is generated from the repository's
    # Info.plist template by substituting the __NAME__/__IDENTIFIER__/__VERSION__
    # placeholders.
    - name: package-openmp
      run: |
        rm -rf openmp.framework
        mkdir -p openmp.framework/Versions/A/Headers
        mkdir -p openmp.framework/Versions/A/Resources
        ln -s A openmp.framework/Versions/Current
        ln -s Versions/Current/Headers openmp.framework/Headers
        ln -s Versions/Current/Resources openmp.framework/Resources
        ln -s Versions/Current/openmp openmp.framework/openmp
        cp openmp-visionos-simulator/lib/libomp.a openmp.framework/Versions/A/openmp
        cp -a openmp-visionos-simulator/include/* openmp.framework/Versions/A/Headers/
        sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/18.1/g' Info.plist > openmp.framework/Versions/A/Resources/Info.plist
    # Vulkan builds only: merge libglslang.a and libSPIRV.a per architecture
    # with libtool, then join both architectures into one fat binary with lipo.
    # Headers are taken from the x86_64 install tree.
    - name: package-glslang
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -rf glslang.framework
        mkdir -p glslang.framework/Versions/A/Headers
        mkdir -p glslang.framework/Versions/A/Resources
        ln -s A glslang.framework/Versions/Current
        ln -s Versions/Current/Headers glslang.framework/Headers
        ln -s Versions/Current/Resources glslang.framework/Resources
        ln -s Versions/Current/glslang glslang.framework/glslang
        libtool -static \
            build-x86_64/install/lib/libglslang.a \
            build-x86_64/install/lib/libSPIRV.a \
            -o build-x86_64/install/lib/libglslang_combined.a
        libtool -static \
            build-arm64/install/lib/libglslang.a \
            build-arm64/install/lib/libSPIRV.a \
            -o build-arm64/install/lib/libglslang_combined.a
        lipo -create \
            build-x86_64/install/lib/libglslang_combined.a \
            build-arm64/install/lib/libglslang_combined.a \
            -o glslang.framework/Versions/A/glslang
        cp -a build-x86_64/install/include/glslang glslang.framework/Versions/A/Headers/
        sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist
    # Fat ncnn.framework: lipo joins the per-architecture libncnn.a files;
    # headers are taken from the x86_64 install tree.
    - name: package-ncnn
      run: |
        rm -rf ncnn.framework
        mkdir -p ncnn.framework/Versions/A/Headers
        mkdir -p ncnn.framework/Versions/A/Resources
        ln -s A ncnn.framework/Versions/Current
        ln -s Versions/Current/Headers ncnn.framework/Headers
        ln -s Versions/Current/Resources ncnn.framework/Resources
        ln -s Versions/Current/ncnn ncnn.framework/ncnn
        lipo -create \
            build-x86_64/install/lib/libncnn.a \
            build-arm64/install/lib/libncnn.a \
            -o ncnn.framework/Versions/A/ncnn
        cp -a build-x86_64/install/include/* ncnn.framework/Versions/A/Headers/
        sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
    # Exactly one of the two `package` steps runs per matrix entry; the Vulkan
    # variant additionally ships glslang.framework. `zip -y` stores the
    # framework symlinks as symlinks.
    - name: package
      if: matrix.opt.vulkan == 'OFF'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework ncnn.framework
    - name: package
      if: matrix.opt.vulkan == 'ON'
      run: |
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.framework glslang.framework ncnn.framework
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds ncnn for five Android ABIs in four flavours (Vulkan on/off x
  # static/shared), then bundles the per-ABI install trees into one zip.
  android:
    # `setup` provides the VERSION output used in the package name and cmake options.
    needs: [setup]
    strategy:
      matrix:
        opt:
          - { vulkan: OFF, shared-lib: OFF, id: android               }
          - { vulkan: OFF, shared-lib: ON,  id: android-shared        }
          - { vulkan: ON,  shared-lib: OFF, id: android-vulkan        }
          - { vulkan: ON,  shared-lib: ON,  id: android-vulkan-shared }
    runs-on: ubuntu-latest
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-${{ matrix.opt.id }}
      # Shared cmake arguments. Every line ends with a backslash so that, once
      # this value is spliced into a `cmake ...` command, the arguments written
      # after the splice continue the same shell command.
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK_LATEST_HOME/build/cmake/android.toolchain.cmake \
        -DANDROID_PLATFORM=android-21 \
        -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False \
        -DANDROID_SUPPORT_FLEXIBLE_PAGE_SIZES=ON \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_INSTALL_PREFIX=install \
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
        -DNCNN_BUILD_BENCHMARK=OFF \
        -DNCNN_VULKAN=${{ matrix.opt.vulkan }} \
        -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} \

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    # Deletes every line that is exactly "  -g" from the NDK's legacy toolchain file.
    # NOTE(review): the options above set ANDROID_USE_LEGACY_TOOLCHAIN_FILE=False;
    # confirm the legacy file is still consulted and this edit is still needed.
    - name: ndk-fix-debug
      run: sed -i -e '/^  -g$/d' $ANDROID_NDK_LATEST_HOME/build/cmake/android-legacy.toolchain.cmake
    # One out-of-tree build per ABI. The shared options and the ABI-specific
    # arguments are always separated by an explicit space, so the command does
    # not depend on how NCNN_CMAKE_OPTIONS happens to end.
    - name: build-armeabi-v7a
      run: |
        mkdir build-armeabi-v7a && cd build-armeabi-v7a
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    - name: build-arm64-v8a
      run: |
        mkdir build-arm64-v8a && cd build-arm64-v8a
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="arm64-v8a" ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    - name: build-x86
      run: |
        mkdir build-x86 && cd build-x86
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86" ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    - name: build-x86_64
      run: |
        mkdir build-x86_64 && cd build-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="x86_64" ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    - name: build-riscv64
      run: |
        mkdir build-riscv64 && cd build-riscv64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DANDROID_ABI="riscv64" ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    # Collect each ABI's install tree under <PACKAGENAME>/<abi> and zip the folder.
    - name: package
      run: |
        rm -rf ${{ env.PACKAGENAME }}
        mkdir -p ${{ env.PACKAGENAME }}
        cp -a build-armeabi-v7a/install ${{ env.PACKAGENAME }}/armeabi-v7a
        cp -a build-arm64-v8a/install ${{ env.PACKAGENAME }}/arm64-v8a
        cp -a build-x86/install ${{ env.PACKAGENAME }}/x86
        cp -a build-x86_64/install ${{ env.PACKAGENAME }}/x86_64
        cp -a build-riscv64/install ${{ env.PACKAGENAME }}/riscv64
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }}
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Produces four WebAssembly builds of ncnn with Emscripten - basic, simd,
  # threads and simd-threads - and ships their install trees in one zip.
  webassembly:
    needs:
      - setup
    runs-on: ubuntu-latest
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-webassembly
    steps:
    - uses: actions/checkout@v6
    # Install and activate the Emscripten SDK version named by $EMSCRIPTEN_VERSION.
    - name: emsdk
      run: |
        git clone https://github.com/emscripten-core/emsdk.git
        cd emsdk
        ./emsdk install $EMSCRIPTEN_VERSION
        ./emsdk activate $EMSCRIPTEN_VERSION
    # The four build steps differ only in the NCNN_THREADS / NCNN_OPENMP /
    # NCNN_SIMPLEOMP and NCNN_SSE2 switches and in the build directory name.
    - name: build
      run: |
        source emsdk/emsdk_env.sh
        mkdir build
        cd build
        cmake \
            -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake \
            -DCMAKE_BUILD_TYPE=Release \
            -DCMAKE_INSTALL_PREFIX=install \
            -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
            -DNCNN_THREADS=OFF \
            -DNCNN_OPENMP=OFF \
            -DNCNN_SIMPLEOMP=OFF \
            -DNCNN_SIMPLEOCV=ON \
            -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_SSE2=OFF \
            -DNCNN_AVX2=OFF \
            -DNCNN_AVX=OFF \
            -DNCNN_BUILD_TOOLS=OFF \
            -DNCNN_BUILD_EXAMPLES=OFF \
            -DNCNN_BUILD_BENCHMARK=OFF \
            ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    - name: build-simd
      run: |
        source emsdk/emsdk_env.sh
        mkdir build-simd
        cd build-simd
        cmake \
            -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake \
            -DCMAKE_BUILD_TYPE=Release \
            -DCMAKE_INSTALL_PREFIX=install \
            -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
            -DNCNN_THREADS=OFF \
            -DNCNN_OPENMP=OFF \
            -DNCNN_SIMPLEOMP=OFF \
            -DNCNN_SIMPLEOCV=ON \
            -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_SSE2=ON \
            -DNCNN_AVX2=OFF \
            -DNCNN_AVX=OFF \
            -DNCNN_BUILD_TOOLS=OFF \
            -DNCNN_BUILD_EXAMPLES=OFF \
            -DNCNN_BUILD_BENCHMARK=OFF \
            ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    - name: build-threads
      run: |
        source emsdk/emsdk_env.sh
        mkdir build-threads
        cd build-threads
        cmake \
            -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake \
            -DCMAKE_BUILD_TYPE=Release \
            -DCMAKE_INSTALL_PREFIX=install \
            -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
            -DNCNN_THREADS=ON \
            -DNCNN_OPENMP=ON \
            -DNCNN_SIMPLEOMP=ON \
            -DNCNN_SIMPLEOCV=ON \
            -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_SSE2=OFF \
            -DNCNN_AVX2=OFF \
            -DNCNN_AVX=OFF \
            -DNCNN_BUILD_TOOLS=OFF \
            -DNCNN_BUILD_EXAMPLES=OFF \
            -DNCNN_BUILD_BENCHMARK=OFF \
            ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    - name: build-simd-threads
      run: |
        source emsdk/emsdk_env.sh
        mkdir build-simd-threads
        cd build-simd-threads
        cmake \
            -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake \
            -DCMAKE_BUILD_TYPE=Release \
            -DCMAKE_INSTALL_PREFIX=install \
            -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" \
            -DNCNN_THREADS=ON \
            -DNCNN_OPENMP=ON \
            -DNCNN_SIMPLEOMP=ON \
            -DNCNN_SIMPLEOCV=ON \
            -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_SSE2=ON \
            -DNCNN_AVX2=OFF \
            -DNCNN_AVX=OFF \
            -DNCNN_BUILD_TOOLS=OFF \
            -DNCNN_BUILD_EXAMPLES=OFF \
            -DNCNN_BUILD_BENCHMARK=OFF \
            ..
        cmake --build . -j $(nproc)
        cmake --build . --target install/strip
    # Each install tree gets its own sub-folder inside the package directory.
    - name: package
      run: |
        rm -rf ${{ env.PACKAGENAME }}
        mkdir -p ${{ env.PACKAGENAME }}
        cp -a build/install ${{ env.PACKAGENAME }}/basic
        cp -a build-simd/install ${{ env.PACKAGENAME }}/simd
        cp -a build-threads/install ${{ env.PACKAGENAME }}/threads
        cp -a build-simd-threads/install ${{ env.PACKAGENAME }}/simd-threads
        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }}
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Builds ncnn with four MSVC toolsets (v140-v143), static and shared, for
  # x86 and x64 (plus arm64 on v142/v143), and uploads one zip per matrix entry.
  windows:
    # `setup` provides the VERSION output used in the package name and cmake options.
    needs: [setup]
    strategy:
      matrix:
        opt:
          # Each entry pairs a toolset with the Windows SDK build number that is
          # passed to cmake as `version=10.0.<sdk>.0`.
          - { shared-lib: OFF, os: windows-2022, toolset-version: v140, windows-sdk-version: 22621, id: vs2015 }
          - { shared-lib: OFF, os: windows-2022, toolset-version: v141, windows-sdk-version: 22621, id: vs2017 }
          - { shared-lib: OFF, os: windows-2022, toolset-version: v142, windows-sdk-version: 22621, id: vs2019 }
          - { shared-lib: OFF, os: windows-2022, toolset-version: v143, windows-sdk-version: 26100, id: vs2022 }
          - { shared-lib: ON,  os: windows-2022, toolset-version: v140, windows-sdk-version: 22621, id: vs2015-shared }
          - { shared-lib: ON,  os: windows-2022, toolset-version: v141, windows-sdk-version: 22621, id: vs2017-shared }
          - { shared-lib: ON,  os: windows-2022, toolset-version: v142, windows-sdk-version: 22621, id: vs2019-shared }
          - { shared-lib: ON,  os: windows-2022, toolset-version: v143, windows-sdk-version: 26100, id: vs2022-shared }
    runs-on: ${{ matrix.opt.os }}
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-${{ matrix.opt.id }}
      # NOTE(review): presumably enables MSBuild's multi-tool task scheduling - confirm.
      UseMultiToolTask: true
      # Shared cmake arguments. Every line ends with a backtick, PowerShell's
      # line-continuation character, so arguments written after the splice
      # continue the same command.
      NCNN_CMAKE_OPTIONS: |
        -T ${{ matrix.opt.toolset-version }},host=x64 `
        -DCMAKE_BUILD_TYPE=Release `
        -DCMAKE_INSTALL_PREFIX=install `
        -DNCNN_VERSION_STRING="${{ needs.setup.outputs.VERSION }}" `
        -DNCNN_BUILD_EXAMPLES=OFF `
        -DNCNN_BUILD_TOOLS=ON `
        -DNCNN_BUILD_BENCHMARK=OFF `
        -DNCNN_VULKAN=ON `
        -DNCNN_SHARED_LIB=${{ matrix.opt.shared-lib }} `

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true

    # v141 only: locate the newest Visual Studio installation with vswhere and
    # add the v141 x86/x64 toolset component to it through the VS installer.
    - name: Install VS 2017 (v141) Build Tools
      if: matrix.opt.toolset-version == 'v141'
      run: |
        $vsInstallPath = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -latest -property installationPath
        Start-Process -FilePath "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vs_installer.exe" -ArgumentList "modify --installPath `"$vsInstallPath`" --add Microsoft.VisualStudio.Component.VC.v141.x86.x64 --quiet --norestart --nocache" -Wait
    # v140 only: install standalone build tools with the VC.140 component into
    # C:/vs140_build_tools, run the first vcvars64.bat found there inside cmd,
    # print the resulting PATH / INCLUDE / LIB, and append them to GITHUB_ENV so
    # later steps inherit that environment.
    - name: Install and Setup VS 2015 (v140) Build Tools
      if: matrix.opt.toolset-version == 'v140'
      run: |
        $vs140Path = "C:/vs140_build_tools"
        Invoke-WebRequest -Uri "https://aka.ms/vs/15/release/vs_buildtools.exe" -OutFile vs_buildtools.exe
        Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--installPath `"$vs140Path`" --add Microsoft.VisualStudio.Workload.VCTools --add Microsoft.VisualStudio.Component.VC.140 --quiet --wait --norestart --nocache" -Wait

        $vcvarsPath = (Get-ChildItem -Path $vs140Path -Filter "vcvars64.bat" -Recurse | Select-Object -First 1).FullName
        $cmd = "`"$vcvarsPath`" && powershell -Command `"`$env:PATH;`$env:INCLUDE;`$env:LIB`""
        $output = cmd.exe /c $cmd
        $lines = $output -split "`r`n"

        echo "PATH=$($lines[0]);$($env:PATH)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
        echo "INCLUDE=$($lines[1])" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
        echo "LIB=$($lines[2])" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append

    # Install the Windows SDK build selected by the matrix entry.
    - uses: GuillaumeFalourd/setup-windows10-sdk-action@v2.4
      with:
        sdk-version: ${{ matrix.opt.windows-sdk-version }}

    # protobuf is cached per toolset; on a cache miss it is built from the
    # v3.11.2 source archive for Win32 and x64 and installed under
    # protobuf-install\x86 and protobuf-install\x64.
    - name: cache-protobuf
      id: cache-protobuf
      uses: actions/cache@v5
      with:
        path: "protobuf-install"
        key: protobuf-${{ matrix.opt.toolset-version }}-x86-x64-install
    - name: protobuf
      if: steps.cache-protobuf.outputs.cache-hit != 'true'
      run: |
        Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip
        7z x ./protobuf-3.11.2.zip
        cd protobuf-3.11.2
        mkdir build-x86; cd build-x86;
        cmake -T ${{ matrix.opt.toolset-version }},host=x64 -A Win32,version=10.0.${{ matrix.opt.windows-sdk-version }}.0 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x86" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
        cmake --build . --config Release -j 4
        cmake --build . --config Release --target install
        cd ..
        mkdir build-x64; cd build-x64;
        cmake -T ${{ matrix.opt.toolset-version }},host=x64 -A x64,version=10.0.${{ matrix.opt.windows-sdk-version }}.0 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install\x64" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
        cmake --build . --config Release -j 4
        cmake --build . --config Release --target install
    # ncnn builds: x86 and x64 point protobuf_DIR at the matching protobuf install.
    - name: build-x86
      run: |
        mkdir build-x86; cd build-x86
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A Win32,version=10.0.${{ matrix.opt.windows-sdk-version }}.0 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x86\cmake" ..
        cmake --build . --config Release -j 4
        cmake --build . --config Release --target install
    - name: build-x64
      run: |
        mkdir build-x64; cd build-x64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A x64,version=10.0.${{ matrix.opt.windows-sdk-version }}.0 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\x64\cmake" ..
        cmake --build . --config Release -j 4
        cmake --build . --config Release --target install
    # arm64 is built only with v142/v143 and receives no protobuf_DIR.
    - name: build-arm64
      if: matrix.opt.toolset-version == 'v142' || matrix.opt.toolset-version == 'v143'
      run: |
        mkdir build-arm64; cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A arm64,version=10.0.${{ matrix.opt.windows-sdk-version }}.0 ..
        cmake --build . --config Release -j 4
        cmake --build . --config Release --target install
    # Exactly one of the two `package` steps runs per matrix entry: v140/v141
    # ship x86 + x64, v142/v143 additionally ship arm64.
    - name: package
      if: matrix.opt.toolset-version == 'v140' || matrix.opt.toolset-version == 'v141'
      run: |
        mkdir ${{ env.PACKAGENAME }}
        mkdir ${{ env.PACKAGENAME }}/x86
        mkdir ${{ env.PACKAGENAME }}/x64
        Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86"
        Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64"
        7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }}
    - name: package
      if: matrix.opt.toolset-version == 'v142' || matrix.opt.toolset-version == 'v143'
      run: |
        mkdir ${{ env.PACKAGENAME }}
        mkdir ${{ env.PACKAGENAME }}/x86
        mkdir ${{ env.PACKAGENAME }}/x64
        mkdir ${{ env.PACKAGENAME }}/arm64
        Copy-Item -Verbose -Recurse -Path "build-x86\install\*" -Destination "${{ env.PACKAGENAME }}\x86"
        Copy-Item -Verbose -Recurse -Path "build-x64\install\*" -Destination "${{ env.PACKAGENAME }}\x64"
        Copy-Item -Verbose -Recurse -Path "build-arm64\install\*" -Destination "${{ env.PACKAGENAME }}\arm64"
        7z a -r ${{ env.PACKAGENAME }}.zip ${{ env.PACKAGENAME }}
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip

  # Merges the per-platform Apple framework zips into .xcframework bundles and
  # publishes two archives: one without Vulkan and one with Vulkan.
  apple:
    needs:
      - setup
      - macos
      - ios
      - ios-simulator
      - mac-catalyst
      - watchos
      - watchos-simulator
      - tvos
      - tvos-simulator
      - visionos
      - visionos-simulator
    runs-on: macos-15-intel
    env:
      PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-apple
    steps:
    - run: sudo xcode-select --switch /Applications/Xcode_16.4.0.app
    # No artifact name is given, so every artifact of the run lands in
    # artifacts/<artifact-name>/.
    - name: download
      uses: actions/download-artifact@v8
      with:
        path: artifacts

    # Unpack each platform zip into its own ncnn-<variant> directory.
    # watchos and watchos-simulator have no -vulkan counterpart.
    - name: unzip
      run: |
        for variant in \
            ios ios-vulkan \
            ios-simulator ios-simulator-vulkan \
            mac-catalyst mac-catalyst-vulkan \
            macos macos-vulkan \
            tvos tvos-vulkan \
            tvos-simulator tvos-simulator-vulkan \
            visionos visionos-vulkan \
            visionos-simulator visionos-simulator-vulkan \
            watchos watchos-simulator
        do
            mkdir -p ncnn-${variant}
            unzip -q artifacts/ncnn-${{ needs.setup.outputs.VERSION }}-${variant}/ncnn-${{ needs.setup.outputs.VERSION }}-${variant}.zip -d ncnn-${variant}
        done

    # Non-Vulkan bundle: openmp + ncnn xcframeworks from the plain variants.
    - name: create-xcframwork
      run: |
        rm -rf openmp.xcframework
        xcodebuild -create-xcframework \
            -framework ncnn-macos/openmp.framework \
            -framework ncnn-ios/openmp.framework \
            -framework ncnn-ios-simulator/openmp.framework \
            -framework ncnn-mac-catalyst/openmp.framework \
            -framework ncnn-watchos/openmp.framework \
            -framework ncnn-watchos-simulator/openmp.framework \
            -framework ncnn-tvos/openmp.framework \
            -framework ncnn-tvos-simulator/openmp.framework \
            -framework ncnn-visionos/openmp.framework \
            -framework ncnn-visionos-simulator/openmp.framework \
            -output openmp.xcframework

        rm -rf ncnn.xcframework
        xcodebuild -create-xcframework \
            -framework ncnn-macos/ncnn.framework \
            -framework ncnn-ios/ncnn.framework \
            -framework ncnn-ios-simulator/ncnn.framework \
            -framework ncnn-mac-catalyst/ncnn.framework \
            -framework ncnn-watchos/ncnn.framework \
            -framework ncnn-watchos-simulator/ncnn.framework \
            -framework ncnn-tvos/ncnn.framework \
            -framework ncnn-tvos-simulator/ncnn.framework \
            -framework ncnn-visionos/ncnn.framework \
            -framework ncnn-visionos-simulator/ncnn.framework \
            -output ncnn.xcframework

        rm -f ${{ env.PACKAGENAME }}.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}.zip openmp.xcframework ncnn.xcframework
    # Vulkan bundle: rebuilds openmp and ncnn xcframeworks from the -vulkan
    # variants (watchOS falls back to the plain variants) and adds glslang,
    # which has no watchOS slice.
    # NOTE(review): openmp.xcframework here takes the visionOS slices from the
    # plain ncnn-visionos / ncnn-visionos-simulator directories, while the other
    # platforms with a Vulkan variant use their -vulkan directories - confirm
    # this is intentional.
    - name: create-xcframwork-vulkan
      run: |
        rm -rf openmp.xcframework
        xcodebuild -create-xcframework \
            -framework ncnn-macos-vulkan/openmp.framework \
            -framework ncnn-ios-vulkan/openmp.framework \
            -framework ncnn-ios-simulator-vulkan/openmp.framework \
            -framework ncnn-mac-catalyst-vulkan/openmp.framework \
            -framework ncnn-watchos/openmp.framework \
            -framework ncnn-watchos-simulator/openmp.framework \
            -framework ncnn-tvos-vulkan/openmp.framework \
            -framework ncnn-tvos-simulator-vulkan/openmp.framework \
            -framework ncnn-visionos/openmp.framework \
            -framework ncnn-visionos-simulator/openmp.framework \
            -output openmp.xcframework

        rm -rf glslang.xcframework
        xcodebuild -create-xcframework \
            -framework ncnn-macos-vulkan/glslang.framework \
            -framework ncnn-ios-vulkan/glslang.framework \
            -framework ncnn-ios-simulator-vulkan/glslang.framework \
            -framework ncnn-mac-catalyst-vulkan/glslang.framework \
            -framework ncnn-tvos-vulkan/glslang.framework \
            -framework ncnn-tvos-simulator-vulkan/glslang.framework \
            -framework ncnn-visionos-vulkan/glslang.framework \
            -framework ncnn-visionos-simulator-vulkan/glslang.framework \
            -output glslang.xcframework

        rm -rf ncnn.xcframework
        xcodebuild -create-xcframework \
            -framework ncnn-macos-vulkan/ncnn.framework \
            -framework ncnn-ios-vulkan/ncnn.framework \
            -framework ncnn-ios-simulator-vulkan/ncnn.framework \
            -framework ncnn-mac-catalyst-vulkan/ncnn.framework \
            -framework ncnn-watchos/ncnn.framework \
            -framework ncnn-watchos-simulator/ncnn.framework \
            -framework ncnn-tvos-vulkan/ncnn.framework \
            -framework ncnn-tvos-simulator-vulkan/ncnn.framework \
            -framework ncnn-visionos-vulkan/ncnn.framework \
            -framework ncnn-visionos-simulator-vulkan/ncnn.framework \
            -output ncnn.xcframework

        rm -f ${{ env.PACKAGENAME }}-vulkan.zip
        zip -9 -y -r ${{ env.PACKAGENAME }}-vulkan.zip openmp.xcframework glslang.xcframework ncnn.xcframework
    - name: upload-zip
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}
        path: ${{ env.PACKAGENAME }}.zip
    - name: upload-zip-vulkan
      uses: actions/upload-artifact@v6
      with:
        name: ${{ env.PACKAGENAME }}-vulkan
        path: ${{ env.PACKAGENAME }}-vulkan.zip

  # Final job: gathers every artifact of the run and attaches all zip files to
  # a GitHub release tagged with the VERSION output of `setup`.
  release:
    permissions:
      contents: write  # for softprops/action-gh-release to create a release
    # visionos / visionos-simulator are not listed directly; they are
    # prerequisites of `apple`, which is listed.
    needs:
      - setup
      - full-source
      - ubuntu
      - macos
      - ios
      - ios-simulator
      - mac-catalyst
      - watchos
      - watchos-simulator
      - tvos
      - tvos-simulator
      - android
      - webassembly
      - windows
      - apple
    runs-on: ubuntu-latest
    steps:
    # No artifact name is given, so each artifact is placed in artifacts/<name>/.
    - name: download
      uses: actions/download-artifact@v8
      with:
        path: artifacts

    # The glob picks up the zip inside each per-artifact directory.
    - name: create-release
      uses: softprops/action-gh-release@v2
      with:
        token: ${{ secrets.GITHUB_TOKEN }}
        tag_name: ${{ needs.setup.outputs.VERSION }}
        name: Release ${{ needs.setup.outputs.VERSION }}
        files: artifacts/*/*.zip


================================================
FILE: .github/workflows/sync-wiki.yml
================================================
# Mirrors the docs/ directory into the Tencent/ncnn wiki repository whenever
# docs or this workflow change on master.
name: sync-wiki
on:
  push:
    branches:
      - master
    paths:
      - ".github/workflows/sync-wiki.yml"
      - "docs/**"
# A newer push to the same ref cancels a sync that is still running.
concurrency:
  group: sync-wiki-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read

jobs:
  sync-wiki:
    permissions:
      contents: write  # for Git to git push
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6
    # Copies docs/ to a new directory, turns it into a fresh single-commit
    # repository and force-pushes that commit to the wiki's master branch,
    # authenticating with the WIKI_SYNC_BOT_TOKEN secret.
    - name: sync
      run: |
        cp -r docs $GITHUB_WORKSPACE/ncnn.wiki
        cd $GITHUB_WORKSPACE/ncnn.wiki
        git config --global user.name "wiki-sync-bot"
        git config --global user.email "wiki-sync-bot@qq.com"
        git init
        git add .
        git commit -m "sync"
        git remote add upstream https://${{ secrets.WIKI_SYNC_BOT_TOKEN }}@github.com/Tencent/ncnn.wiki.git
        git push upstream master -f


================================================
FILE: .github/workflows/test-coverage.yml
================================================
# Coverage workflow: runs for pushes and pull requests against master that
# touch the build system, sources, tests, toolchains, the glslang path or this
# workflow file itself.
name: test-coverage
on:
  push:
    branches:
      - master
    paths:
      - ".github/workflows/test-coverage.yml"
      - "CMakeLists.txt"
      - "cmake/**"
      - "src/**"
      - "tests/**"
      - "toolchains/**"
      - "glslang"
  pull_request:
    branches:
      - master
    paths:
      - ".github/workflows/test-coverage.yml"
      - "CMakeLists.txt"
      - "cmake/**"
      - "src/**"
      - "tests/**"
      - "toolchains/**"
      - "glslang"
# A newer run on the same ref cancels one that is still in progress.
concurrency:
  group: test-coverage-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # Coverage run on the self-hosted runner labelled `t4`: debug build with
  # Vulkan and coverage instrumentation, ctest, lcov collection, Codecov upload.
  linux-gcc-gpu-t4:
    runs-on:
      - self-hosted
      - linux
      - t4
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    - name: build
      env:
        CC: gcc
        CXX: g++
        LD_LIBRARY_PATH: /data/action/install/lib64
      run: |
        mkdir build && cd build
        cmake \
            -DCMAKE_BUILD_TYPE=debug \
            -DNCNN_VULKAN=ON \
            -DNCNN_COVERAGE=ON \
            -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_AVX2=ON \
            -DNCNN_XOP=OFF \
            -DNCNN_AVXVNNI=OFF \
            -DNCNN_AVXNECONVERT=OFF \
            -DNCNN_AVX512=OFF \
            -DNCNN_OPENMP=OFF \
            -DNCNN_BUILD_TOOLS=OFF \
            -DNCNN_BUILD_EXAMPLES=OFF \
            -DNCNN_BUILD_TESTS=ON \
            ..
        cmake --build . -j 4
    - name: test
      env:
        LD_LIBRARY_PATH: /data/action/install/lib64
      run: cd build && ctest --output-on-failure -j 4
    # Capture coverage from build/src, then drop records for system headers,
    # install trees and generated files under build/ before listing the result.
    - name: lcov-collect
      run: |
        cd build
        lcov -d ./src -c -o lcov.info
        lcov -r lcov.info '/usr/*' -o lcov.info
        lcov -r lcov.info '*/install/*' -o lcov.info
        lcov -r lcov.info '*/build/*' -o lcov.info
        lcov --list lcov.info

    # Upload only the explicitly listed report, using the codecov binary
    # already present on the runner.
    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        disable_search: true
        plugins: noop
        binary: /data/action/.local/bin/codecov
        files: build/lcov.info

  # x86-64 coverage matrix. Every matrix row pins one ISA level through the
  # NCNN_* cmake switches (runtime CPU dispatch is turned off), and each row is
  # built and tested twice: once without OpenMP (build/) and once with OpenMP
  # (build-openmp/). Both lcov reports are uploaded to Codecov.
  linux-gcc-x64:
    name: x64-${{ matrix.name }}
    runs-on: [self-hosted, linux, ubuntu25]
    strategy:
      fail-fast: false
      matrix:
        include:
          - { name: 'none',       SSE2: OFF, AVX: OFF, F16C: OFF, FMA: OFF, AVX2: OFF, AVX512: OFF, AVX512VNNI: OFF }
          - { name: 'sse2',       SSE2: ON,  AVX: OFF, F16C: OFF, FMA: OFF, AVX2: OFF, AVX512: OFF, AVX512VNNI: OFF }
          - { name: 'avx',        SSE2: ON,  AVX: ON,  F16C: OFF, FMA: OFF, AVX2: OFF, AVX512: OFF, AVX512VNNI: OFF }
          - { name: 'avx2',       SSE2: ON,  AVX: ON,  F16C: ON,  FMA: ON,  AVX2: ON,  AVX512: OFF, AVX512VNNI: OFF }
          - { name: 'avx512',     SSE2: ON,  AVX: ON,  F16C: ON,  FMA: ON,  AVX2: ON,  AVX512: ON,  AVX512VNNI: OFF }
          - { name: 'avx512vnni', SSE2: ON,  AVX: ON,  F16C: ON,  FMA: ON,  AVX2: ON,  AVX512: ON,  AVX512VNNI: ON  }
    steps:
    - uses: actions/checkout@v6
    # Pass 1: OpenMP disabled.
    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_SSE2=${{ matrix.SSE2 }} \
            -DNCNN_AVX=${{ matrix.AVX }} \
            -DNCNN_F16C=${{ matrix.F16C }} \
            -DNCNN_FMA=${{ matrix.FMA }} \
            -DNCNN_AVX2=${{ matrix.AVX2 }} \
            -DNCNN_AVX512=${{ matrix.AVX512 }} \
            -DNCNN_AVX512VNNI=${{ matrix.AVX512VNNI }} \
            -DNCNN_XOP=OFF \
            -DNCNN_AVXVNNI=OFF \
            -DNCNN_AVX512BF16=OFF \
            -DNCNN_AVX512FP16=OFF \
            -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    - name: test
      run: |
        cd build
        ctest --output-on-failure -j 8
    - name: lcov-collect
      run: |
        cd build
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info
    # Pass 2: same ISA switches with OpenMP enabled, in a separate build tree.
    - name: build-openmp
      run: |
        mkdir build-openmp && cd build-openmp
        cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_SSE2=${{ matrix.SSE2 }} \
            -DNCNN_AVX=${{ matrix.AVX }} \
            -DNCNN_F16C=${{ matrix.F16C }} \
            -DNCNN_FMA=${{ matrix.FMA }} \
            -DNCNN_AVX2=${{ matrix.AVX2 }} \
            -DNCNN_AVX512=${{ matrix.AVX512 }} \
            -DNCNN_AVX512VNNI=${{ matrix.AVX512VNNI }} \
            -DNCNN_XOP=OFF \
            -DNCNN_AVXVNNI=OFF \
            -DNCNN_AVX512BF16=OFF \
            -DNCNN_AVX512FP16=OFF \
            -DNCNN_OPENMP=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    # Step names carry an -openmp suffix so the two passes can be told apart
    # in the Actions UI and logs. OpenMP is limited to one thread per test
    # process while ctest runs 8 tests in parallel.
    - name: test-openmp
      run: |
        export OMP_THREAD_LIMIT=1
        export OMP_NUM_THREADS=1
        cd build-openmp
        ctest --output-on-failure -j 8
    - name: lcov-collect-openmp
      run: |
        cd build-openmp
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build-openmp/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info
    # Upload the reports from both passes in a single Codecov submission.
    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        disable_search: true
        plugins: noop
        binary: /data/action/osd/codecov
        files: build/lcov.info,build-openmp/lcov.info

  # Coverage job for the NCNN_SIMPLESTL + NCNN_SIMPLEMATH configuration, built
  # through the host-c.gcc toolchain file with OpenMP and threads disabled.
  linux-gcc-x64-simplestl-simplemath:
    name: simplestl-simplemath
    runs-on:
      - self-hosted
      - linux
      - ubuntu25
    steps:
    - uses: actions/checkout@v6

    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake \
            -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEMATH=ON \
            -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8

    - name: test
      run: |
        cd build
        ctest --output-on-failure -j 8

    # Capture coverage from ./src and strip /usr/* and */build/* records.
    - name: lcov-collect
      run: |
        cd build
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info

    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        files: build/lcov.info
        binary: /data/action/osd/codecov
        plugins: noop
        disable_search: true
        token: ${{ secrets.CODECOV_TOKEN }}

  # Per-CPU coverage matrix run through the `sde64` binary found under
  # SDE_PATH. Each row fixes the ISA cmake switches at configure time (runtime
  # CPU dispatch is off) and passes `-<cpu>` to sde64 when the tests run.
  linux-gcc-x64-sde:
    name: sde-${{ matrix.cpu }}
    runs-on: [self-hosted, linux, ubuntu25]
    env:
      SDE_PATH: /data/action/osd/sde-external-9.33.0-2024-01-07-lin
    strategy:
      fail-fast: false
      matrix:
        include:
          # `cpu` is used both in the job name and as the sde64 CPU flag; the
          # remaining keys map one-to-one onto -DNCNN_<KEY> cmake options.
          - { cpu: hsw, AVX2: ON, AVXVNNI: OFF, AVXVNNIINT8: OFF, AVXNECONVERT: OFF, AVX512: OFF, AVX512VNNI: OFF, AVX512BF16: OFF, AVX512FP16: OFF }
          - { cpu: adl, AVX2: ON, AVXVNNI: ON,  AVXVNNIINT8: OFF, AVXNECONVERT: OFF, AVX512: OFF, AVX512VNNI: OFF, AVX512BF16: OFF, AVX512FP16: OFF }
          - { cpu: arl, AVX2: ON, AVXVNNI: ON,  AVXVNNIINT8: ON,  AVXNECONVERT: ON,  AVX512: OFF, AVX512VNNI: OFF, AVX512BF16: OFF, AVX512FP16: OFF }
          - { cpu: spr, AVX2: ON, AVXVNNI: OFF, AVXVNNIINT8: OFF, AVXNECONVERT: OFF, AVX512: ON,  AVX512VNNI: ON,  AVX512BF16: ON,  AVX512FP16: ON  }
    steps:
    - uses: actions/checkout@v6
    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_AVX=ON \
            -DNCNN_F16C=ON \
            -DNCNN_XOP=OFF \
            -DNCNN_AVX2=${{ matrix.AVX2 }} \
            -DNCNN_AVXVNNI=${{ matrix.AVXVNNI }} \
            -DNCNN_AVXVNNIINT8=${{ matrix.AVXVNNIINT8 }} \
            -DNCNN_AVXNECONVERT=${{ matrix.AVXNECONVERT }} \
            -DNCNN_AVX512=${{ matrix.AVX512 }} \
            -DNCNN_AVX512VNNI=${{ matrix.AVX512VNNI }} \
            -DNCNN_AVX512BF16=${{ matrix.AVX512BF16 }} \
            -DNCNN_AVX512FP16=${{ matrix.AVX512FP16 }} \
            -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    # TESTS_EXECUTABLE_LOADER / TESTS_EXECUTABLE_LOADER_ARGUMENTS are exported
    # for ctest; presumably the test setup uses them to launch each test binary
    # as `sde64 -<cpu> -- <test>` -- TODO confirm in the tests' CMake files.
    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-${{ matrix.cpu }};--" ctest --output-on-failure -j 8
    # Capture coverage from ./src and strip /usr/* and */build/* records.
    - name: lcov-collect
      run: |
        cd build
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info
    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        disable_search: true
        plugins: noop
        binary: /data/action/osd/codecov
        files: build/lcov.info

  # Single-build coverage job: ncnn is configured once without any ISA
  # switches on the cmake command line, and the same test binaries are then
  # run through sde64 once per CPU flag (p4p, snb, hsw, adl, arl, skx, spr,
  # gnr). Coverage is collected once, after all test passes.
  linux-gcc-x64-sde-combined:
    name: sde-combined
    runs-on: [self-hosted, linux, ubuntu25]
    env:
      SDE_PATH: /data/action/osd/sde-external-9.33.0-2024-01-07-lin
    steps:
    - uses: actions/checkout@v6
    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    # The test-* steps below differ only in the CPU flag handed to sde64.
    - name: test-p4p
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-p4p;--" ctest --output-on-failure -j 8
    - name: test-snb
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-snb;--" ctest --output-on-failure -j 8
    - name: test-hsw
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-hsw;--" ctest --output-on-failure -j 8
    - name: test-adl
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-adl;--" ctest --output-on-failure -j 8
    - name: test-arl
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-arl;--" ctest --output-on-failure -j 8
    - name: test-skx
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-skx;--" ctest --output-on-failure -j 8
    - name: test-spr
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j 8
    - name: test-gnr
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-gnr;--" ctest --output-on-failure -j 8
    # Capture coverage from ./src and strip /usr/* and */build/* records.
    - name: lcov-collect
      run: |
        cd build
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info
    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        disable_search: true
        plugins: noop
        binary: /data/action/osd/codecov
        files: build/lcov.info

  # RISC-V vector coverage job: cross-builds ncnn with NCNN_RVV, NCNN_ZFH and
  # NCNN_ZVFH enabled (once with OpenMP ON, once OFF) and runs the tests under
  # qemu-riscv64 with two vector lengths, vlen=256 and vlen=128.
  linux-gcc-riscv64-rvv:
    strategy:
      matrix:
        openmp: [ON, OFF]
    runs-on: [self-hosted, linux, ubuntu]
    steps:
    - uses: actions/checkout@v6
    # RISCV_ROOT_PATH is exported before configuring; presumably the toolchain
    # file reads it to locate the cross compiler -- TODO confirm.
    - name: build
      run: |
        export RISCV_ROOT_PATH=/data/action/osd/riscv
        mkdir build
        cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_ZFH=ON -DNCNN_ZVFH=ON -DNCNN_OPENMP=${{ matrix.openmp }} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8

    # Both test steps put the locally installed qemu first on PATH and differ
    # only in the vlen= value of the emulated CPU.
    - name: test-vlen256
      run: |
        export PATH=/data/action/osd/qemu-install/bin:$PATH
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot" ctest --output-on-failure -j 8

    - name: test-vlen128
      run: |
        export PATH=/data/action/osd/qemu-install/bin:$PATH
        cd build
        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,zfh=true,zvfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/riscv/sysroot" ctest --output-on-failure -j 8

    # Coverage is captured with the cross toolchain's gcov, then /usr/*,
    # */install/* and */build/* records are stripped.
    - name: lcov-collect
      run: |
        cd build
        lcov --gcov-tool /data/action/osd/riscv/bin/riscv64-unknown-linux-gnu-gcov -d ./src -c -o lcov.info
        lcov -r lcov.info '/usr/*' -o lcov.info
        lcov -r lcov.info '*/install/*' -o lcov.info
        lcov -r lcov.info '*/build/*' -o lcov.info
        lcov --list lcov.info

    # NOTE(review): unlike the other codecov steps in this file, no `binary:`
    # path is given here -- confirm this is intended for this runner.
    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        disable_search: true
        plugins: noop
        files: build/lcov.info

  # Vulkan coverage job that runs the test suite with LP_NUM_THREADS=4
  # exported; the job name suggests the llvmpipe software driver is the
  # Vulkan implementation on this runner (not verifiable from this file).
  linux-gpu-llvmpipe:
    runs-on:
      - self-hosted
      - linux
      - ubuntu25
    steps:
    # Submodules are fetched together with the repository.
    - uses: actions/checkout@v6
      with:
        submodules: true

    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVXVNNI=OFF -DNCNN_AVXNECONVERT=OFF -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_AVX512BF16=OFF -DNCNN_AVX512FP16=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8

    - name: test
      run: |
        export LP_NUM_THREADS=4
        cd build && ctest --output-on-failure -j 8

    # Capture coverage from ./src and strip /usr/* and */build/* records.
    - name: lcov-collect
      run: |
        cd build
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info

    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        files: build/lcov.info
        binary: /data/action/osd/codecov
        plugins: noop
        disable_search: true
        token: ${{ secrets.CODECOV_TOKEN }}

  # Vulkan coverage job that runs the tests against a SwiftShader build.
  # SwiftShader is built from a pinned commit and cached in
  # swiftshader-install; the tests select it through VK_ICD_FILENAMES.
  linux-gpu-swiftshader:
    runs-on: [self-hosted, linux, ubuntu25]
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true

    # The checkout and build steps for SwiftShader only run on a cache miss.
    - name: cache-swiftshader
      id: cache-swiftshader
      uses: actions/cache@v5
      with:
        path: swiftshader-install
        key: swiftshader-linux-install-20250508
    - name: checkout-swiftshader
      if: steps.cache-swiftshader.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
      with:
        repository: google/swiftshader
        path: swiftshader
        ref: 930d46d31b5d637f313fd5ef55da2bbf053c26c1
    # Submodules are initialised recursively except third_party/git-hooks,
    # whose update mode is set to `none`. The contents of the build's Linux/
    # directory are copied into the cached install directory.
    - name: swiftshader
      if: steps.cache-swiftshader.outputs.cache-hit != 'true'
      run: |
        cd swiftshader
        git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive
        mkdir -p build; cd build
        cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
        cmake --build . -j 8
        mkdir $GITHUB_WORKSPACE/swiftshader-install
        cp Linux/* $GITHUB_WORKSPACE/swiftshader-install

    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVXVNNI=OFF -DNCNN_AVXNECONVERT=OFF -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_AVX512BF16=OFF -DNCNN_AVX512FP16=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8
    # A SwiftShader.ini with ThreadCount=1 is written into build/tests before
    # ctest starts; ctest itself still runs 8 tests in parallel.
    - name: test
      run: |
        printf "[Processor]\nThreadCount=1\n" > build/tests/SwiftShader.ini
        export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json"
        cd build && ctest --output-on-failure -j 8
    # Capture coverage from ./src and strip /usr/* and */build/* records.
    - name: lcov-collect
      run: |
        cd build
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info

    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        disable_search: true
        plugins: noop
        binary: /data/action/osd/codecov
        files: build/lcov.info

  # Cross-compiled coverage matrix: every entry is built with its own
  # toolchain file and its tests are executed through a qemu user-mode binary.
  linux-gcc-cross:
    name: ${{ matrix.arch }}
    runs-on: [self-hosted, linux, ubuntu25]
    strategy:
      fail-fast: false
      matrix:
        # Keys of each entry:
        #   arch             - job display name
        #   toolchain        - file name under toolchains/ passed as CMAKE_TOOLCHAIN_FILE
        #   extra-cmake-args - additional cmake flags (left empty for some entries)
        #   qemu             - loader binary exported as TESTS_EXECUTABLE_LOADER
        #   qemu-args        - semicolon-separated loader arguments
        include:
          - arch: arm
            toolchain: arm-linux-gnueabi.toolchain.cmake
            extra-cmake-args: -DNCNN_VFPV4=ON
            qemu: qemu-arm-static
            qemu-args: "-L;/usr/arm-linux-gnueabi"

          - arch: arm-noinlineasm
            toolchain: arm-linux-gnueabi.toolchain.cmake
            extra-cmake-args: -DNCNN_GNU_INLINE_ASM=OFF -DNCNN_VFPV4=ON
            qemu: qemu-arm-static
            qemu-args: "-L;/usr/arm-linux-gnueabi"

          - arch: armhf-vfpv3-d16
            toolchain: arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake
            extra-cmake-args: -DNCNN_VFPV4=OFF
            qemu: qemu-arm-static
            qemu-args: "-L;/usr/arm-linux-gnueabihf"

          - arch: armhf-vfpv3-d16-noinlineasm
            toolchain: arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake
            extra-cmake-args: -DNCNN_GNU_INLINE_ASM=OFF -DNCNN_VFPV4=OFF
            qemu: qemu-arm-static
            qemu-args: "-L;/usr/arm-linux-gnueabihf"

          - arch: aarch64-armv8.0
            toolchain: aarch64-linux-gnu.toolchain.cmake
            extra-cmake-args: -DNCNN_ARM82=OFF
            qemu: qemu-aarch64-static
            qemu-args: "-L;/usr/aarch64-linux-gnu"

          - arch: aarch64-armv8.2
            toolchain: aarch64-linux-gnu.toolchain.cmake
            extra-cmake-args: -DNCNN_ARM82DOT=OFF -DNCNN_ARM82FP16FML=OFF
            qemu: qemu-aarch64-static
            qemu-args: "-L;/usr/aarch64-linux-gnu"

          - arch: aarch64-armv8.4
            toolchain: aarch64-linux-gnu.toolchain.cmake
            extra-cmake-args: -DNCNN_ARM84BF16=OFF -DNCNN_ARM84I8MM=OFF
            qemu: qemu-aarch64-static
            qemu-args: "-L;/usr/aarch64-linux-gnu"

          - arch: aarch64-armv8.6
            toolchain: aarch64-linux-gnu.toolchain.cmake
            extra-cmake-args: -DNCNN_ARM86SVE=OFF
            qemu: qemu-aarch64-static
            qemu-args: "-L;/usr/aarch64-linux-gnu"

          - arch: aarch64-armv8.6-noinlineasm
            toolchain: aarch64-linux-gnu.toolchain.cmake
            extra-cmake-args: -DNCNN_GNU_INLINE_ASM=OFF -DNCNN_ARM86SVE=OFF
            qemu: qemu-aarch64-static
            qemu-args: "-L;/usr/aarch64-linux-gnu"

          - arch: mipsisa32r6el
            toolchain: mipsisa32r6el-linux-gnu.toolchain.cmake
            extra-cmake-args: -DNCNN_MSA=OFF -DNCNN_MMI=OFF
            qemu: qemu-mipsel-static
            qemu-args: "-L;/usr/mipsisa32r6el-linux-gnu"

          - arch: mipsisa64r6el
            toolchain: mipsisa64r6el-linux-gnuabi64.toolchain.cmake
            extra-cmake-args: -DNCNN_MSA=ON -DNCNN_MMI=OFF
            qemu: qemu-mips64el-static
            qemu-args: "-L;/usr/mipsisa64r6el-linux-gnuabi64"

          - arch: powerpc
            toolchain: powerpc-linux-gnu.toolchain.cmake
            extra-cmake-args:
            qemu: qemu-ppc-static
            qemu-args: "-L;/usr/powerpc-linux-gnu"

          - arch: powerpc64le
            toolchain: powerpc64le-linux-gnu.toolchain.cmake
            extra-cmake-args:
            qemu: qemu-ppc64le-static
            qemu-args: "-L;/usr/powerpc64le-linux-gnu"

          - arch: riscv64
            toolchain: riscv64-linux-gnu.toolchain.cmake
            extra-cmake-args:
            qemu: qemu-riscv64-static
            qemu-args: "-L;/usr/riscv64-linux-gnu"

          - arch: loongarch64-la264
            toolchain: loongarch64-linux-gnu.toolchain.cmake
            extra-cmake-args: -DNCNN_LSX=ON -DNCNN_LASX=OFF
            qemu: qemu-loongarch64-static
            qemu-args: "-L;/usr/loongarch64-linux-gnu"

          - arch: loongarch64-la664
            toolchain: loongarch64-linux-gnu.toolchain.cmake
            extra-cmake-args: -DNCNN_LSX=ON -DNCNN_LASX=ON
            qemu: qemu-loongarch64-static
            qemu-args: "-L;/usr/loongarch64-linux-gnu"

    # Two passes per matrix entry: build/ with OpenMP off and build-openmp/
    # with OpenMP on. Each pass is configured, tested through the qemu loader,
    # and has its coverage captured; both reports are uploaded together.
    steps:
    - uses: actions/checkout@v6

    - name: build
      run: |
        mkdir build && cd build
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/${{ matrix.toolchain }} ${{ matrix.extra-cmake-args }} -DNCNN_OPENMP=OFF \
            -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8

    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=${{ matrix.qemu }} TESTS_EXECUTABLE_LOADER_ARGUMENTS="${{ matrix.qemu-args }}" ctest --output-on-failure -j 8

    # Capture coverage from ./src and strip /usr/* and */build/* records.
    - name: lcov-collect
      run: |
        cd build
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info

    - name: build-openmp
      run: |
        mkdir build-openmp && cd build-openmp
        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/${{ matrix.toolchain }} ${{ matrix.extra-cmake-args }} -DNCNN_OPENMP=ON \
            -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF \
            -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j 8

    # OpenMP is limited to one thread per test process while ctest runs 8
    # tests in parallel.
    - name: test-openmp
      run: |
        export OMP_THREAD_LIMIT=1
        export OMP_NUM_THREADS=1
        cd build-openmp
        TESTS_EXECUTABLE_LOADER=${{ matrix.qemu }} TESTS_EXECUTABLE_LOADER_ARGUMENTS="${{ matrix.qemu-args }}" ctest --output-on-failure -j 8

    # Same as lcov-collect, but the build-tree filter is */build-openmp/*.
    - name: lcov-collect-openmp
      run: |
        cd build-openmp
        lcov --ignore-errors inconsistent -d ./src -c -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '/usr/*' -o lcov.info
        lcov --ignore-errors inconsistent -r lcov.info '*/build-openmp/*' -o lcov.info
        lcov --ignore-errors inconsistent --list lcov.info

    - name: codecov
      uses: codecov/codecov-action@v5
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        disable_search: true
        plugins: noop
        binary: /data/action/osd/codecov
        files: build/lcov.info,build-openmp/lcov.info


================================================
FILE: .github/workflows/tvos.yml
================================================
# tvOS build workflow. Runs on pushes and pull requests to master that touch
# this workflow file, the iOS toolchain file, the top-level CMake files, core
# sources, the arm / x86 / vulkan layer sources, or the glslang path.
name: tvos
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/tvos.yml'
    - 'toolchains/ios.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'glslang'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/tvos.yml'
    - 'toolchains/ios.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'glslang'
# A newer run on the same ref cancels the one still in progress.
concurrency:
  group: tvos-${{ github.ref }}
  cancel-in-progress: true
# Workflow-wide environment: DEVELOPER_DIR points at the Xcode 16.4.0
# developer directory; the remaining values are referenced as shell variables
# by the cmake option strings defined in the build job.
env:
  DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer
  TVOS_DEPLOYMENT_TARGET: '11.0'
  ENABLE_BITCODE: OFF
  ENABLE_ARC: OFF
  ENABLE_VISIBILITY: OFF
# The workflow token only gets read access to repository contents.
permissions:
  contents: read

jobs:
  # Builds a static OpenMP runtime (libomp.a) for tvOS device and simulator
  # targets (cached between runs), copies it into the Xcode SDK directories,
  # and then builds ncnn for arm64, arm64e and both simulator architectures.
  build:
    runs-on: macos-15-intel
    # OPENMP_CMAKE_OPTIONS and NCNN_CMAKE_OPTIONS are multi-line cmake argument
    # lists. Every line, including the last, ends with a backslash, and the
    # steps splice the value directly in front of further arguments
    # (`cmake ${{ env.X }} -DPLATFORM=...`). The $NAME references inside are
    # left for the shell to expand from the workflow-level env.
    env:
      OPENMP_VERSION: '18.1.2'
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \

      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$TVOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VULKAN=ON \

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true

    # All openmp* steps below are skipped when the cache is hit.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-tvos-install-20251004
    # Download the LLVM cmake and openmp source tarballs for OPENMP_VERSION,
    # move the cmake modules into the openmp tree, and apply two patches
    # fetched from the nihui/llvm-project fork.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    # One OpenMP build and install per target: device arm64, device arm64e,
    # simulator x86_64 and simulator arm64.
    - name: openmp-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install
    - name: openmp-arm64e
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64e && cd build-arm64e
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64e" ..
        cmake --build . -j 4
        cmake --build . --target install
    - name: openmp-simulator-x86_64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-simulator-x86_64 && cd build-simulator-x86_64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install
    # NOTE(review): this uses PLATFORM=SIMULATOR_TVOS with ARCHS=arm64, while
    # the ncnn simulator-arm64 step uses SIMULATORARM64_TVOS -- confirm both
    # resolve to the same target in ios.toolchain.cmake.
    - name: openmp-simulator-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-simulator-arm64 && cd build-simulator-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install
    # Assemble the cached layout: openmp-install/tvos and
    # openmp-install/tvos-simulator, each with include/ and a libomp.a that
    # lipo creates from the two per-architecture archives.
    - name: openmp-merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/tvos
        mkdir -p $GITHUB_WORKSPACE/openmp-install/tvos-simulator

        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/include $GITHUB_WORKSPACE/openmp-install/tvos
        mkdir -p $GITHUB_WORKSPACE/openmp-install/tvos/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64e/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/tvos/lib/libomp.a

        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-x86_64/install/include $GITHUB_WORKSPACE/openmp-install/tvos-simulator
        mkdir -p $GITHUB_WORKSPACE/openmp-install/tvos-simulator/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-x86_64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-arm64/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/tvos-simulator/lib/libomp.a

    # Runs on every execution (cache hit or miss): copies the OpenMP headers
    # and libomp.a into the AppleTVOS and AppleTVSimulator SDKs under
    # DEVELOPER_DIR.
    - name: install-openmp
      run: |
        sudo cp $GITHUB_WORKSPACE/openmp-install/tvos/include/* $DEVELOPER_DIR/Platforms/AppleTVOS.platform/Developer/SDKs/AppleTVOS.sdk/usr/include
        sudo cp $GITHUB_WORKSPACE/openmp-install/tvos/lib/libomp.a $DEVELOPER_DIR/Platforms/AppleTVOS.platform/Developer/SDKs/AppleTVOS.sdk/usr/lib

        sudo cp $GITHUB_WORKSPACE/openmp-install/tvos-simulator/include/* $DEVELOPER_DIR/Platforms/AppleTVSimulator.platform/Developer/SDKs/AppleTVSimulator.sdk/usr/include
        sudo cp $GITHUB_WORKSPACE/openmp-install/tvos-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/AppleTVSimulator.platform/Developer/SDKs/AppleTVSimulator.sdk/usr/lib

    # ncnn builds, one per target; these only configure and compile (no
    # install or test commands are run).
    - name: arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64" ..
        cmake --build . -j 4
    - name: arm64e
      run: |
        mkdir build-arm64e && cd build-arm64e
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=TVOS -DARCHS="arm64e" ..
        cmake --build . -j 4
    - name: simulator-x86_64
      run: |
        mkdir build-simulator-x86_64 && cd build-simulator-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_TVOS -DARCHS="x86_64" ..
        cmake --build . -j 4
    - name: simulator-arm64
      run: |
        mkdir build-simulator-arm64 && cd build-simulator-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATORARM64_TVOS -DARCHS="arm64" ..
        cmake --build . -j 4


================================================
FILE: .github/workflows/visionos.yml
================================================
name: visionos
# Run on pushes and pull requests to master that touch files affecting the
# visionOS build. This workflow configures ncnn with -DNCNN_VULKAN=ON and
# checks out submodules, so the Vulkan layer sources and the glslang path are
# part of the filter; without them Vulkan-only changes would never trigger
# this workflow.
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/visionos.yml'
    - 'toolchains/ios.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'glslang'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/visionos.yml'
    - 'toolchains/ios.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'glslang'
concurrency:
  group: visionos-${{ github.ref }}
  cancel-in-progress: true
env:
  DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer
  VISIONOS_DEPLOYMENT_TARGET: '1.0'
  ENABLE_BITCODE: OFF
  ENABLE_ARC: OFF
  ENABLE_VISIBILITY: OFF
permissions:
  contents: read

jobs:
  build:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \

      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$VISIONOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \
        -DNCNN_VULKAN=ON \

    # Pipeline: check out the sources, build libomp.a for visionOS (or restore
    # it from cache), copy it into the Xcode SDKs, then build ncnn for the
    # device and for both simulator architectures.
    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true

    # Restore a previously built openmp-install directory. Every openmp* step
    # below is skipped when this step reports a cache hit.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-visionos-install-20251004
    # Download the LLVM cmake and openmp source tarballs for OPENMP_VERSION,
    # move the shared cmake modules into the openmp tree, and apply two patches
    # fetched from the nihui/llvm-project fork.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    # Build and install libomp for the visionOS device (arm64).
    - name: openmp-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64 && cd build-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=VISIONOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install
    # Build and install libomp for the visionOS simulator, x86_64 slice.
    - name: openmp-simulator-x86_64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-simulator-x86_64 && cd build-simulator-x86_64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_VISIONOS -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install
    # Build and install libomp for the visionOS simulator, arm64 slice.
    - name: openmp-simulator-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-simulator-arm64 && cd build-simulator-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_VISIONOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install
    # Assemble the openmp-install tree that the cache step stores: headers plus
    # libomp.a for the device, and headers plus a lipo-merged x86_64 + arm64
    # libomp.a for the simulator.
    - name: openmp-merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/visionos
        mkdir -p $GITHUB_WORKSPACE/openmp-install/visionos-simulator

        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/include $GITHUB_WORKSPACE/openmp-install/visionos
        mkdir -p $GITHUB_WORKSPACE/openmp-install/visionos/lib
        cp openmp-${{ env.OPENMP_VERSION }}.src/build-arm64/install/lib/libomp.a $GITHUB_WORKSPACE/openmp-install/visionos/lib/libomp.a

        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-x86_64/install/include $GITHUB_WORKSPACE/openmp-install/visionos-simulator
        mkdir -p $GITHUB_WORKSPACE/openmp-install/visionos-simulator/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-x86_64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-arm64/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/visionos-simulator/lib/libomp.a

    # Copy the OpenMP headers and libomp.a into the XROS and XRSimulator SDKs
    # under DEVELOPER_DIR. This step has no cache condition, so it runs on
    # every job, whether libomp was just built or restored from cache.
    - name: install-openmp
      run: |
        sudo cp $GITHUB_WORKSPACE/openmp-install/visionos/include/* $DEVELOPER_DIR/Platforms/XROS.platform/Developer/SDKs/XROS.sdk/usr/include
        sudo cp $GITHUB_WORKSPACE/openmp-install/visionos/lib/libomp.a $DEVELOPER_DIR/Platforms/XROS.platform/Developer/SDKs/XROS.sdk/usr/lib

        sudo cp $GITHUB_WORKSPACE/openmp-install/visionos-simulator/include/* $DEVELOPER_DIR/Platforms/XRSimulator.platform/Developer/SDKs/XRSimulator.sdk/usr/include
        sudo cp $GITHUB_WORKSPACE/openmp-install/visionos-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/XRSimulator.platform/Developer/SDKs/XRSimulator.sdk/usr/lib

    # Configure and build ncnn once per platform/architecture pair. These steps
    # only build; no tests are run.
    - name: arm64
      run: |
        mkdir build-arm64 && cd build-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=VISIONOS -DARCHS="arm64" ..
        cmake --build . -j 4
    - name: simulator-x86_64
      run: |
        mkdir build-simulator-x86_64 && cd build-simulator-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_VISIONOS -DARCHS="x86_64" ..
        cmake --build . -j 4
    - name: simulator-arm64
      run: |
        mkdir build-simulator-arm64 && cd build-simulator-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_VISIONOS -DARCHS="arm64" ..
        cmake --build . -j 4


================================================
FILE: .github/workflows/watchos.yml
================================================
name: watchos
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/watchos.yml'
    - 'toolchains/ios.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/watchos.yml'
    - 'toolchains/ios.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
concurrency:
  group: watchos-${{ github.ref }}
  cancel-in-progress: true
env:
  DEVELOPER_DIR: /Applications/Xcode_16.4.0.app/Contents/Developer
  WATCHOS_DEPLOYMENT_TARGET: '6.0'
  ENABLE_BITCODE: OFF
  ENABLE_ARC: OFF
  ENABLE_VISIBILITY: OFF
permissions:
  contents: read

jobs:
  # Builds ncnn for watchOS (armv7k, arm64_32) and the watchOS simulator
  # (x86_64, arm64). OpenMP comes from a static libomp built from the LLVM
  # openmp sources and cached between runs.
  build:
    runs-on: macos-15-intel
    env:
      OPENMP_VERSION: '18.1.2'
      # cmake arguments shared by all libomp builds. They reference the
      # workflow-level WATCHOS_DEPLOYMENT_TARGET and ENABLE_* variables and
      # disable the shared library, OMPT and hwloc.
      OPENMP_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$WATCHOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DLIBOMP_ENABLE_SHARED=OFF \
        -DLIBOMP_OMPT_SUPPORT=OFF \
        -DLIBOMP_USE_HWLOC=OFF \

      # cmake arguments shared by all ncnn builds. The OpenMP_* entries supply
      # the compiler flags and library name (libomp.a) explicitly.
      NCNN_CMAKE_OPTIONS: |
        -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake \
        -DDEPLOYMENT_TARGET=$WATCHOS_DEPLOYMENT_TARGET \
        -DENABLE_BITCODE=$ENABLE_BITCODE \
        -DENABLE_ARC=$ENABLE_ARC \
        -DENABLE_VISIBILITY=$ENABLE_VISIBILITY \
        -DCMAKE_INSTALL_PREFIX=install \
        -DCMAKE_BUILD_TYPE=Release \
        -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
        -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
        -DOpenMP_libomp_LIBRARY="libomp.a" \

    steps:
    - uses: actions/checkout@v6

    # Restore a previously built openmp-install directory. Every openmp* step
    # below is skipped when this step reports a cache hit.
    - name: cache-openmp
      id: cache-openmp
      uses: actions/cache@v5
      with:
        path: openmp-install
        key: openmp-watchos-install-20251004
    # Download the LLVM cmake and openmp source tarballs for OPENMP_VERSION,
    # move the shared cmake modules into the openmp tree, and apply two patches
    # fetched from the nihui/llvm-project fork.
    - name: openmp
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf cmake-${{ env.OPENMP_VERSION }}.src.tar.xz
        wget https://github.com/llvm/llvm-project/releases/download/llvmorg-${{ env.OPENMP_VERSION }}/openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        tar -xf openmp-${{ env.OPENMP_VERSION }}.src.tar.xz
        mv cmake-${{ env.OPENMP_VERSION }}.src/Modules/* openmp-${{ env.OPENMP_VERSION }}.src/cmake/
        cd openmp-${{ env.OPENMP_VERSION }}.src
        wget https://github.com/nihui/llvm-project/commit/ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        patch -p2 -i ef8c35bcf5d9cfdb0764ffde6a63c04ec715bc37.patch
        wget https://github.com/nihui/llvm-project/commit/5c12711f9a21f41bea70566bf15a4026804d6b20.patch
        patch -p2 -i 5c12711f9a21f41bea70566bf15a4026804d6b20.patch
    # Build and install libomp once per device architecture and once per
    # simulator architecture; each build installs into its own build dir.
    - name: openmp-armv7k
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-armv7k && cd build-armv7k
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=WATCHOS -DARCHS="armv7k" ..
        cmake --build . -j 4
        cmake --build . --target install
    - name: openmp-arm64_32
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-arm64_32 && cd build-arm64_32
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=WATCHOS -DARCHS="arm64_32" ..
        cmake --build . -j 4
        cmake --build . --target install
    - name: openmp-simulator-x86_64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-simulator-x86_64 && cd build-simulator-x86_64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_WATCHOS -DARCHS="x86_64" ..
        cmake --build . -j 4
        cmake --build . --target install
    - name: openmp-simulator-arm64
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        cd openmp-${{ env.OPENMP_VERSION }}.src
        mkdir -p build-simulator-arm64 && cd build-simulator-arm64
        cmake ${{ env.OPENMP_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_WATCHOS -DARCHS="arm64" ..
        cmake --build . -j 4
        cmake --build . --target install
    # Assemble the openmp-install tree that the cache step stores. Device:
    # headers from the arm64_32 build plus a lipo-merged armv7k + arm64_32
    # libomp.a. Simulator: headers from the x86_64 build plus a lipo-merged
    # x86_64 + arm64 libomp.a.
    - name: openmp-merge-fat-library
      if: steps.cache-openmp.outputs.cache-hit != 'true'
      run: |
        mkdir -p $GITHUB_WORKSPACE/openmp-install
        mkdir -p $GITHUB_WORKSPACE/openmp-install/watchos
        mkdir -p $GITHUB_WORKSPACE/openmp-install/watchos-simulator

        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-arm64_32/install/include $GITHUB_WORKSPACE/openmp-install/watchos
        mkdir -p $GITHUB_WORKSPACE/openmp-install/watchos/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-armv7k/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-arm64_32/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/watchos/lib/libomp.a

        cp -a openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-x86_64/install/include $GITHUB_WORKSPACE/openmp-install/watchos-simulator
        mkdir -p $GITHUB_WORKSPACE/openmp-install/watchos-simulator/lib
        lipo -create \
            openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-x86_64/install/lib/libomp.a \
            openmp-${{ env.OPENMP_VERSION }}.src/build-simulator-arm64/install/lib/libomp.a \
            -o $GITHUB_WORKSPACE/openmp-install/watchos-simulator/lib/libomp.a

    # Copy the OpenMP headers and libomp.a into the WatchOS and WatchSimulator
    # SDKs under DEVELOPER_DIR. This step has no cache condition, so it runs on
    # every job.
    - name: install-openmp
      run: |
        sudo cp $GITHUB_WORKSPACE/openmp-install/watchos/include/* $DEVELOPER_DIR/Platforms/WatchOS.platform/Developer/SDKs/WatchOS.sdk/usr/include
        sudo cp $GITHUB_WORKSPACE/openmp-install/watchos/lib/libomp.a $DEVELOPER_DIR/Platforms/WatchOS.platform/Developer/SDKs/WatchOS.sdk/usr/lib

        sudo cp $GITHUB_WORKSPACE/openmp-install/watchos-simulator/include/* $DEVELOPER_DIR/Platforms/WatchSimulator.platform/Developer/SDKs/WatchSimulator.sdk/usr/include
        sudo cp $GITHUB_WORKSPACE/openmp-install/watchos-simulator/lib/libomp.a $DEVELOPER_DIR/Platforms/WatchSimulator.platform/Developer/SDKs/WatchSimulator.sdk/usr/lib

    # Configure and build ncnn once per platform/architecture pair. These steps
    # only build; no tests are run.
    - name: armv7k
      run: |
        mkdir build-armv7k && cd build-armv7k
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=WATCHOS -DARCHS="armv7k" ..
        cmake --build . -j 4
    - name: arm64_32
      run: |
        mkdir build-arm64_32 && cd build-arm64_32
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=WATCHOS -DARCHS="arm64_32" ..
        cmake --build . -j 4

    - name: simulator-x86_64
      run: |
        mkdir build-simulator-x86_64 && cd build-simulator-x86_64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_WATCHOS -DARCHS="x86_64" ..
        cmake --build . -j 4
    - name: simulator-arm64
      run: |
        mkdir build-simulator-arm64 && cd build-simulator-arm64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DPLATFORM=SIMULATOR_WATCHOS -DARCHS="arm64" ..
        cmake --build . -j 4


================================================
FILE: .github/workflows/web-assembly.yml
================================================
name: web-assembly
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/web-assembly.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/x86/**'
    - 'tests/**'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/web-assembly.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/x86/**'
    - 'tests/**'

env:
  EMSCRIPTEN_VERSION: 3.1.28

concurrency:
  group: web-assembly-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # Builds and tests ncnn with Emscripten in three configurations:
  #   basic    - SSE2 off, threads/OpenMP/simpleomp off
  #   simd     - SSE2 on,  threads/OpenMP/simpleomp off
  #   simd-omp - SSE2 on,  threads/OpenMP/simpleomp on
  # All three disable runtime CPU dispatch, AVX and AVX2, and use simpleocv.
  webassembly:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v6
    # Clone emsdk and install/activate the version pinned in EMSCRIPTEN_VERSION.
    - name: emsdk
      run: |
        git clone https://github.com/emscripten-core/emsdk.git
        cd emsdk
        ./emsdk install $EMSCRIPTEN_VERSION
        ./emsdk activate $EMSCRIPTEN_VERSION
    # Each build step sources emsdk_env.sh itself, because environment changes
    # made inside one step's shell are not visible to the next step.
    - name: build-basic
      run: |
        source emsdk/emsdk_env.sh
        export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1"
        mkdir build-basic && cd build-basic
        cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    # ctest is invoked with TESTS_EXECUTABLE_LOADER=node; the test steps do not
    # source emsdk_env.sh.
    - name: test-basic
      run: |
        cd build-basic
        TESTS_EXECUTABLE_LOADER=node ctest --output-on-failure -j $(nproc)
    - name: build-simd
      run: |
        source emsdk/emsdk_env.sh
        export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1"
        mkdir build-simd && cd build-simd
        cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test-simd
      run: |
        cd build-simd
        TESTS_EXECUTABLE_LOADER=node ctest --output-on-failure -j $(nproc)
    - name: build-simd-omp
      run: |
        source emsdk/emsdk_env.sh
        export LDFLAGS="-sERROR_ON_WASM_CHANGES_AFTER_LINK -sWASM_BIGINT -O1"
        mkdir build-simd-omp && cd build-simd-omp
        cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_TESTS=ON ..
        cmake --build . -j $(nproc)
    - name: test-simd-omp
      run: |
        cd build-simd-omp
        TESTS_EXECUTABLE_LOADER=node ctest --output-on-failure -j $(nproc)


================================================
FILE: .github/workflows/windows-arm.yml
================================================
name: windows-arm
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/windows-arm.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/vulkan/**'
    - 'tests/**'
    - 'glslang'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/windows-arm.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/vulkan/**'
    - 'tests/**'
    - 'glslang'
concurrency:
  group: windows-arm-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # Cross-builds ncnn for arm64 with MSVC on an x64 host (host=x64, -A arm64),
  # once per matrix entry (v142 and v143 toolsets), as a static and as a
  # shared library. Tests, tools and examples are disabled; Vulkan is enabled.
  windows:
    name: ${{ matrix.vs-version }}
    runs-on: windows-2022
    strategy:
      matrix:
        # Each entry pairs a toolset with the Windows SDK build number that is
        # installed below and passed to cmake as version=10.0.<number>.0.
        include:
          - vs-version: vs2019
            toolset-version: v142
            windows-sdk-version: 22621

          - vs-version: vs2022
            toolset-version: v143
            windows-sdk-version: 26100

    env:
      UseMultiToolTask: true
      NCNN_CMAKE_OPTIONS: -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_VULKAN=ON

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    - uses: GuillaumeFalourd/setup-windows10-sdk-action@v2.4
      with:
        sdk-version: ${{ matrix.windows-sdk-version }}
    # Static library build.
    - name: arm64
      run: |
        mkdir build-arm64; cd build-arm64
        cmake -T ${{ matrix.toolset-version }},host=x64 -A arm64,version=10.0.${{ matrix.windows-sdk-version }}.0 ${{ env.NCNN_CMAKE_OPTIONS }} ..
        cmake --build . --config Release -j 4
    # Same configuration with NCNN_SHARED_LIB=ON.
    - name: arm64-shared
      run: |
        mkdir build-arm64-shared; cd build-arm64-shared
        cmake -T ${{ matrix.toolset-version }},host=x64 -A arm64,version=10.0.${{ matrix.windows-sdk-version }}.0 ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_SHARED_LIB=ON ..
        cmake --build . --config Release -j 4

  # Builds ncnn on Linux inside the linaro/wine-arm64 container, using the
  # MSVC toolchain downloaded by msvc-wine with CC/CXX set to cl, then runs
  # the tests with wine-arm64 as the test executable loader.
  woa-linux:
    name: woa-linux
    runs-on: ubuntu-latest
    container: linaro/wine-arm64
    steps:
    - uses: actions/checkout@v6
    # Install wine and the build tools, then download MSVC into /msvc and run
    # the msvc-wine install script on it.
    - name: msvc-wine
      env:
        WINEPREFIX: /tmp/wine-x64-prefix/
      run: |
        apt-get update
        apt-get install -y wine64 python3 msitools python3-simplejson python3-six ca-certificates winbind cmake ninja-build meson
        ln -s /usr/bin/wine /usr/bin/wine64
        xvfb-run winecfg &
        git clone --depth 1 https://github.com/mstorsjo/msvc-wine
        msvc-wine/vsdownload.py --accept-license --dest /msvc
        msvc-wine/install.sh /msvc
    # Configure with Ninja, putting the arm64 wrappers from /msvc/bin/arm64
    # first on PATH.
    - name: build
      env:
        WINEPREFIX: /tmp/wine-x64-prefix/
        CC: cl
        CXX: cl
      run: |
        export PATH=/msvc/bin/arm64:$PATH
        mkdir build && cd build
        cmake -GNinja -DCMAKE_BUILD_TYPE=Release -DCMAKE_SYSTEM_NAME=Windows -DNCNN_BUILD_TESTS=ON ..
        cmake --build . --config Release -j $(nproc)
    # NOTE(review): unlike the two steps above, this step does not set
    # WINEPREFIX - presumably intentional because the tests run under
    # wine-arm64 rather than the x64 wine used for the compiler; confirm.
    - name: test
      run: |
        cd build
        TESTS_EXECUTABLE_LOADER=wine-arm64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="" ctest --output-on-failure -j $(nproc)

  # Builds ncnn for arm64 on the windows-11-arm runner and runs the test suite
  # there. Vulkan, ARM82, tools and examples are disabled in this
  # configuration.
  windows-arm:
    runs-on: windows-11-arm
    env:
      UseMultiToolTask: true

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    - name: build
      run: |
        mkdir build; cd build
        cmake -A arm64 -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_VULKAN=OFF -DNCNN_ARM82=OFF ..
        cmake --build . --config Release -j 4
    - name: test
      run: cd build; ctest -C Release --output-on-failure -j 4


================================================
FILE: .github/workflows/windows-clang.yml
================================================
name: windows-clang
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/windows-clang.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'tests/**'
    - 'glslang'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/windows-clang.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/arm/**'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'tests/**'
    - 'glslang'
concurrency:
  group: windows-clang-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # Builds ncnn with the ClangCL toolset for arm64, Win32 and x64. Each
  # architecture gets a static build without Vulkan and a shared-library build
  # with Vulkan. Tests are built and run only for the Win32 and x64
  # non-Vulkan builds.
  windows:
    name: ClangCL
    runs-on: windows-2022

    env:
      UseMultiToolTask: true
      NCNN_CMAKE_OPTIONS: -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true

    # arm64: build only, no test step follows either arm64 build.
    - name: arm64
      run: |
        mkdir build-arm64; cd build-arm64
        cmake -T ClangCL -A arm64 ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_VULKAN=OFF ..
        cmake --build . --config Release -j 4

    - name: arm64-vulkan
      run: |
        mkdir build-arm64-vulkan; cd build-arm64-vulkan
        cmake -T ClangCL -A arm64 ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON ..
        cmake --build . --config Release -j 4

    # Win32: build with tests, run them, then a Vulkan shared build.
    - name: x86
      run: |
        mkdir build-x86; cd build-x86
        cmake -T ClangCL -A Win32 ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_BUILD_TESTS=ON -DNCNN_VULKAN=OFF ..
        cmake --build . --config Release -j 4
    - name: x86-test
      run: cd build-x86; ctest -C Release --output-on-failure -j 4

    - name: x86-vulkan
      run: |
        mkdir build-x86-vulkan; cd build-x86-vulkan
        cmake -T ClangCL -A Win32 ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON ..
        cmake --build . --config Release -j 4

    # x64: build with tests, run them, then a Vulkan shared build.
    - name: x64
      run: |
        mkdir build-x64; cd build-x64
        cmake -T ClangCL -A x64 ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_BUILD_TESTS=ON -DNCNN_VULKAN=OFF ..
        cmake --build . --config Release -j 4
    - name: x64-test
      run: cd build-x64; ctest -C Release --output-on-failure -j 4

    - name: x64-vulkan
      run: |
        mkdir build-x64-vulkan; cd build-x64-vulkan
        cmake -T ClangCL -A x64 ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON ..
        cmake --build . --config Release -j 4


================================================
FILE: .github/workflows/windows-mingw.yml
================================================
name: windows-mingw
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/windows-mingw.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'tests/**'
    - 'glslang'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/windows-mingw.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'tests/**'
    - 'glslang'
concurrency:
  group: windows-mingw-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # Builds ncnn with the "MinGW Makefiles" generator: a build with tests and
  # without Vulkan (tests are run), followed by a Vulkan shared-library build
  # that is only compiled.
  windows:
    name: MinGW-w64
    runs-on: windows-2022

    env:
      UseMultiToolTask: true
      NCNN_CMAKE_OPTIONS: -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true

    # NOTE(review): no CMAKE_BUILD_TYPE is passed here, and --config is only
    # meaningful for multi-config generators - presumably the project supplies
    # a default build type; confirm in the top-level CMakeLists.txt.
    - name: x64
      run: |
        mkdir build-x64; cd build-x64
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_BUILD_TESTS=ON -DNCNN_VULKAN=OFF -G "MinGW Makefiles" ..
        cmake --build . --config Release -j 4
    - name: x64-test
      run: cd build-x64; ctest -C Release --output-on-failure -j 4

    - name: x64-vulkan
      run: |
        mkdir build-x64-vulkan; cd build-x64-vulkan
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DNCNN_VULKAN=ON -DNCNN_SHARED_LIB=ON -G "MinGW Makefiles" ..
        cmake --build . --config Release -j 4


================================================
FILE: .github/workflows/windows-xp.yml
================================================
name: windows-xp
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/windows-xp.yml'
    - 'toolchains/windows-xp-msvc.toolchain.cmake'
    - 'toolchains/windows-xp-mingw.toolchain.cmake'
    - 'toolchains/windows-xp-clang.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/x86/**'
    - 'tests/**'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/windows-xp.yml'
    - 'toolchains/windows-xp-msvc.toolchain.cmake'
    - 'toolchains/windows-xp-mingw.toolchain.cmake'
    - 'toolchains/windows-xp-clang.toolchain.cmake'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/x86/**'
    - 'tests/**'
concurrency:
  group: windows-xp-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  # Windows XP build using the MSVC v141_xp toolset on top of the Visual
  # Studio installation at VS_INSTALL_DIR; builds Win32 with tests and runs
  # them.
  MSVC:
    runs-on: windows-2025

    env:
      VS_INSTALL_DIR: C:\Program Files\Microsoft Visual Studio\2022\Enterprise
      UseMultiToolTask: true
      NCNN_CMAKE_OPTIONS: -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    # Add the XP-targeting components to the Visual Studio installation. Each
    # component is requested exactly once.
    # NOTE(review): the environment set up by vcvarsall.bat is not written to
    # GITHUB_ENV, so it presumably does not carry over to the following steps;
    # confirm whether that call is still needed.
    - name: config
      shell: cmd
      run: |
        "C:\Program Files (x86)\Microsoft Visual Studio\Installer\setup.exe" modify --installPath "${{ env.VS_INSTALL_DIR }}" --channelId VisualStudio.17.Release --add Microsoft.VisualStudio.Component.WinXP --add Microsoft.VisualStudio.Component.VC.Tools.X86.X64.Spectre --add Microsoft.VisualStudio.Component.VC.Tools.X86.X64 --add Microsoft.VisualStudio.Component.VC.v141.xp --nocache --quiet
        call "${{ env.VS_INSTALL_DIR }}\VC\Auxiliary\Build\vcvarsall.bat" x86
    # Configure for Win32 with the v141_xp toolset and the XP MSVC toolchain
    # file; OpenMP and AVX are off and the static CRT is used.
    - name: build
      run: |
        mkdir build; cd build
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -A WIN32 -G "Visual Studio 17 2022" -T v141_xp -DNCNN_WINXP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_OPENMP=OFF -DNCNN_BUILD_WITH_STATIC_CRT=ON -DNCNN_AVX=OFF -DCMAKE_TOOLCHAIN_FILE="../toolchains/windows-xp-msvc.toolchain.cmake" ..
        cmake --build . --config Release -j 4
    - name: test
      run: cd build; ctest -C Release --output-on-failure -j 4

  # Windows XP build using a downloaded i686 MinGW 8.1.0 toolchain and the
  # "MinGW Makefiles" generator; builds with tests and runs them.
  MinGW-w32:
    runs-on: windows-2025

    env:
      UseMultiToolTask: true
      NCNN_CMAKE_OPTIONS: -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true

    # Download and unpack the toolchain archive, then export
    # MINGW32_ROOT_PATH via GITHUB_ENV and add mingw32\bin to PATH via
    # GITHUB_PATH so that later steps see both.
    - name: config
      run: |
        Invoke-WebRequest -Uri https://github.com/nihui/ncnn-assets/releases/download/toolchain/i686-8.1.0-release-posix-dwarf-rt_v6-rev0.7z -OutFile i686-8.1.0-release-posix-dwarf-rt_v6-rev0.7z
        7z x ./i686-8.1.0-release-posix-dwarf-rt_v6-rev0.7z
        Add-Content -Path $env:GITHUB_ENV -Value "MINGW32_ROOT_PATH=${{ github.workspace }}\mingw32"
        Add-Content -Path $env:GITHUB_PATH -Value "${{ github.workspace }}\mingw32\bin"
    - name: build
      run: |
        mkdir build; cd build
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DCMAKE_TOOLCHAIN_FILE="../toolchains/windows-xp-mingw.toolchain.cmake" -DNCNN_WINXP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_AVX=OFF .. -G "MinGW Makefiles"
        cmake --build . --config Release -j 4
    - name: test
      run: cd build; ctest -C Release --output-on-failure -j 4

  # Windows XP build using LLVM 6.0.0 (installed through Chocolatey) together
  # with the downloaded i686 MinGW 8.1.0 toolchain and the "MinGW Makefiles"
  # generator; builds with tests and runs them.
  Clang:
    runs-on: windows-2022

    env:
      UseMultiToolTask: true
      NCNN_CMAKE_OPTIONS: -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON

    steps:
    - uses: actions/checkout@v6
      with:
        submodules: true
    - name: Set up Clang
      run: choco install llvm --version=6.0.0 --allow-downgrade
    # Print the compiler versions so the log shows which clang is on PATH.
    - name: Verify Clang
      run: |
        clang --version
        clang++ --version
    # Download and unpack the MinGW toolchain, then export MINGW32_ROOT_PATH
    # via GITHUB_ENV and add mingw32\bin to PATH via GITHUB_PATH. GITHUB_PATH
    # takes one directory per line and prepends it to PATH for later steps, so
    # only the new directory is appended to the file (same as the MinGW-w32
    # job) rather than overwriting it with a copy of the whole current PATH.
    - name: config
      run: |
        Invoke-WebRequest -Uri https://github.com/nihui/ncnn-assets/releases/download/toolchain/i686-8.1.0-release-posix-dwarf-rt_v6-rev0.7z -OutFile i686-8.1.0-release-posix-dwarf-rt_v6-rev0.7z
        7z x ./i686-8.1.0-release-posix-dwarf-rt_v6-rev0.7z
        Add-Content -Path $env:GITHUB_ENV -Value "MINGW32_ROOT_PATH=${{ github.workspace }}\mingw32"
        Add-Content -Path $env:GITHUB_PATH -Value "${{ github.workspace }}\mingw32\bin"
    - name: build
      run: |
        mkdir build; cd build
        cmake ${{ env.NCNN_CMAKE_OPTIONS }} -DCMAKE_TOOLCHAIN_FILE="../toolchains/windows-xp-clang.toolchain.cmake" -DNCNN_WINXP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_AVX=OFF .. -G "MinGW Makefiles"
        cmake --build . --config Release -j 4
    - name: test
      run: cd build; ctest -C Release --output-on-failure -j 4


================================================
FILE: .github/workflows/windows.yml
================================================
name: windows
on:
  push:
    branches: [master]
    paths:
    - '.github/workflows/windows.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'tests/**'
    - 'tools/**'
    - '!tools/pnnx/**'
    - 'examples/**'
    - 'glslang'
  pull_request:
    branches: [master]
    paths:
    - '.github/workflows/windows.yml'
    - 'CMakeLists.txt'
    - 'cmake/**'
    - 'src/*'
    - 'src/layer/*'
    - 'src/layer/x86/**'
    - 'src/layer/vulkan/**'
    - 'tests/**'
    - 'tools/**'
    - '!tools/pnnx/**'
    - 'examples/**'
    - 'glslang'
concurrency:
  group: windows-${{ github.ref }}
  cancel-in-progress: true
permissions:
  contents: read

jobs:
  msvc:
    name: ${{ matrix.vs-version }}
    runs-on: windows-2022
    strategy:
      matrix:
        include:
          - vs-version: vs2015
            toolset-version: v140
            windows-sdk-version: 22621
          - vs-version: vs2017
            toolset-version: v141
            windows-sdk-version: 22621
          - vs-version: vs2019
            toolset-version: v142
            windows-sdk-version: 26100
          - vs-version: vs2022
            toolset-version: v143
            windows-sdk-version: 26100

    env:
      UseMultiToolTask: true
    steps:
    # Check out the repository together with its git submodules.
    - uses: actions/checkout@v6
      with:
        submodules: true
        
    # vs2017 only: add the v141 x86/x64 toolset component to the Visual Studio
    # installation reported by vswhere.
    - name: Install VS 2017 (v141) Build Tools
      if: matrix.vs-version == 'vs2017'
      run: |
        $vsInstallPath = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -latest -property installationPath
        Start-Process -FilePath "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vs_installer.exe" -ArgumentList "modify --installPath `"$vsInstallPath`" --add Microsoft.VisualStudio.Component.VC.v141.x86.x64 --quiet --norestart --nocache" -Wait
    # vs2015 only: install Build Tools with the VC.140 component into
    # C:/vs140_build_tools, run the vcvars64.bat found under that directory,
    # and append the resulting PATH, INCLUDE and LIB to GITHUB_ENV for the
    # following steps.
    # NOTE(review): the first three output lines are taken as PATH, INCLUDE and
    # LIB, which assumes vcvars64.bat itself prints nothing first - confirm.
    - name: Install and Setup VS 2015 (v140) Build Tools
      if: matrix.vs-version == 'vs2015'
      run: |
        $vs140Path = "C:/vs140_build_tools"
        Invoke-WebRequest -Uri "https://aka.ms/vs/15/release/vs_buildtools.exe" -OutFile vs_buildtools.exe
        Start-Process -FilePath "vs_buildtools.exe" -ArgumentList "--installPath `"$vs140Path`" --add Microsoft.VisualStudio.Workload.VCTools --add Microsoft.VisualStudio.Component.VC.140 --quiet --wait --norestart --nocache" -Wait

        $vcvarsPath = (Get-ChildItem -Path $vs140Path -Filter "vcvars64.bat" -Recurse | Select-Object -First 1).FullName
        $cmd = "`"$vcvarsPath`" && powershell -Command `"`$env:PATH;`$env:INCLUDE;`$env:LIB`""
        $output = cmd.exe /c $cmd
        $lines = $output -split "`r`n"
        
        echo "PATH=$($lines[0]);$($env:PATH)" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
        echo "INCLUDE=$($lines[1])" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
        echo "LIB=$($lines[2])" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append

    # Install the Windows SDK build selected by the matrix entry; the cmake
    # calls below request it as version=10.0.<sdk-version>.0.
    - uses: GuillaumeFalourd/setup-windows10-sdk-action@v2.4
      with:
        sdk-version: ${{ matrix.windows-sdk-version }}

    # Restore a previously built protobuf-install directory, keyed per
    # Visual Studio version. The build step below is skipped on a cache hit.
    - name: cache-protobuf
      id: cache-protobuf
      uses: actions/cache@v5
      with:
        path: "protobuf-install"
        key: protobuf-${{ matrix.vs-version }}-x64-install-3
    # Download protobuf 3.11.2, build it for x64 with the matrix toolset and
    # Windows SDK, and install it into protobuf-install in the workspace. Only
    # protobuf's own options are passed to its cmake project.
    - name: protobuf
      if: steps.cache-protobuf.outputs.cache-hit != 'true'
      run: |
        Invoke-WebRequest -Uri https://github.com/protocolbuffers/protobuf/archive/v3.11.2.zip -OutFile protobuf-3.11.2.zip
        7z x ./protobuf-3.11.2.zip
        cd protobuf-3.11.2
        mkdir build-${{ matrix.vs-version }}; cd build-${{ matrix.vs-version }}
        cmake -T ${{ matrix.toolset-version }},host=x64 -A x64,version=10.0.${{ matrix.windows-sdk-version }}.0 -DCMAKE_INSTALL_PREFIX="$env:GITHUB_WORKSPACE\protobuf-install" -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
        cmake --build . --config Release -j 4
        cmake --build . --config Release --target install
    # SwiftShader steps: all of them are skipped for the vs2015 and vs2017
    # matrix entries, and the checkout/build steps are additionally skipped
    # when swiftshader-install was restored from cache.
    - name: cache-swiftshader
      if: matrix.vs-version != 'vs2015' && matrix.vs-version != 'vs2017'
      id: cache-swiftshader
      uses: actions/cache@v5
      with:
        path: swiftshader-install
        key: swiftshader-${{ matrix.vs-version }}-x64-install-20251010
    # Check out google/swiftshader at a pinned commit into ./swiftshader.
    - name: checkout-swiftshader
      if: matrix.vs-version != 'vs2015' && matrix.vs-version != 'vs2017' && steps.cache-swiftshader.outputs.cache-hit != 'true'
      uses: actions/checkout@v6
      with:
        repository: google/swiftshader
        path: swiftshader
        ref: de870ac7518fe2b6bb651ecc22fc36647cf7b986
    # Initialise submodules recursively, with the third_party/git-hooks
    # submodule's update mode set to none.
    - name: checkout-swiftshader-submodules
      if: matrix.vs-version != 'vs2015' && matrix.vs-version != 'vs2017' && steps.cache-swiftshader.outputs.cache-hit != 'true'
      run: |
        cd swiftshader
        git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive
    # Build only the Vulkan part of SwiftShader (EGL/GLES/PVR/tests/ASTC off,
    # Subzero Reactor backend) and copy the contents of the build's Windows
    # directory into swiftshader-install, which is the cached path.
    - name: swiftshader
      if: matrix.vs-version != 'vs2015' && matrix.vs-version != 'vs2017' && steps.cache-swiftshader.outputs.cache-hit != 'true'
      run: |
        cd swiftshader
        mkdir build-${{ matrix.vs-version }}; cd build-${{ matrix.vs-version }}
        cmake -T ${{ matrix.toolset-version }},host=x64 -A x64,version=10.0.${{ matrix.windows-sdk-version }}.0 -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release ..
        cmake --build . --config Release -j 4
        mkdir "$env:GITHUB_WORKSPACE/swiftshader-install"
        Copy-Item -Path "Windows\*" -Destination "$env:GITHUB_WORKSPACE\swiftshader-install"

    # Default x64 build with Vulkan and tests enabled; protobuf is located through
    # the protobuf-install directory in the workspace.
    - name: x64
      run: |
        mkdir build-x64; cd build-x64
        cmake -T ${{ matrix.toolset-version }},host=x64 -A x64,version=10.0.${{ matrix.windows-sdk-version }}.0 -Dprotobuf_DIR="$env:GITHUB_WORKSPACE\protobuf-install\cmake" -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON ..
        cmake --build . --config Release -j 4
    # Run the x64 test suite against SwiftShader (skipped for vs2015/vs2017, where
    # SwiftShader is not built): write a SwiftShader.ini that sets ThreadCount=1 in
    # the [Processor] section, copy vulkan-1.dll from swiftshader-install into the
    # tests directory, then invoke ctest.
    - name: x64-test
      if: matrix.vs-version != 'vs2015' && matrix.vs-version != 'vs2017'
      run: |
        echo "[Processor]`nThreadCount=1`n" > build-x64/tests/Release/SwiftShader.ini
        Copy-Item -Path "$env:GITHUB_WORKSPACE\swiftshader-install\vulkan-1.dll" -Destination 'build-x64\tests'
        cd build-x64; ctest -C Release --output-on-failure -j 4

    # x64 build with runtime CPU dispatch, XOP, AVX, AVX2 and AVX512 all turned off,
    # RTTI and exceptions disabled, and tools/examples skipped; followed by its tests.
    - name: x64-sse2
      run: |
        mkdir build-x64-sse2; cd build-x64-sse2
        cmake -T ${{ matrix.toolset-version }},host=x64 -A x64,version=10.0.${{ matrix.windows-sdk-version }}.0 -DNCNN_RUNTIME_CPU=OFF -DNCNN_XOP=OFF -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_AVX512=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_DISABLE_RTTI=ON -DNCNN_DISABLE_EXCEPTION=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . --config Release -j 4
    - name: x64-sse2-test
      run: cd build-x64-sse2; ctest -C Release --output-on-failure -j 4

    # Same configuration as x64-sse2 except that NCNN_AVX is ON (AVX2/AVX512/XOP and
    # runtime CPU dispatch remain off); followed by its tests.
    - name: x64-avx
      run: |
        mkdir build-x64-avx; cd build-x64-avx
        cmake -T ${{ matrix.toolset-version }},host=x64 -A x64,version=10.0.${{ matrix.windows-sdk-version }}.0 -DNCNN_RUNTIME_CPU=OFF -DNCNN_XOP=OFF -DNCNN_AVX=ON -DNCNN_AVX2=OFF -DNCNN_AVX512=OFF -DNCNN_BUILD_TESTS=ON -DNCNN_DISABLE_RTTI=ON -DNCNN_DISABLE_EXCEPTION=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . --config Release -j 4
    - name: x64-avx-test
      run: cd build-x64-avx; ctest -C Release --output-on-failure -j 4

    # 32-bit (Win32) shared-library build with tests; tools and examples are skipped.
    # The test step copies the built ncnn.dll into the tests directory before running ctest.
    - name: x86
      run: |
        mkdir build-x86; cd build-x86
        cmake -T ${{ matrix.toolset-version }},host=x64 -A Win32,version=10.0.${{ matrix.windows-sdk-version }}.0 -DNCNN_SHARED_LIB=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
        cmake --build . --config Release -j 4
    - name: x86-test
      run: |
        Copy-Item -Path "build-x86\src\Release\ncnn.dll" -Destination 'build-x86\tests'
        cd build-x86; ctest -C Release --output-on-failure -j 4


================================================
FILE: .gitignore
================================================
# CMake build directory
build*/

# Backup files.
*~

# Prerequisites
*.d

# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

# MACOSX
.DS_Store

# IDE
.vs
.vscode
.idea
cmake-build-debug
cmake-build-release
CMakeSettings.json

# Compiled python
__pycache__
*.pyc
*.pyd
*.egg-info/
python/setup.py

# Clangd
.cache/

# Xmake
.xmake/


================================================
FILE: .gitmodules
================================================
[submodule "glslang"]
	path = glslang
	url = https://github.com/nihui/glslang
[submodule "python/pybind11"]
	path = python/pybind11
	url = https://github.com/pybind/pybind11.git


================================================
FILE: CITATION.cff
================================================
cff-version: 1.2.0
title: ncnn
message: >-
  If you use this software, please cite it using the
  metadata from this file.
type: software
authors:
  - family-names: "Ni"
    given-names: "Hui"
  - name: "The ncnn contributors"
abstract: >-
  ncnn is a high-performance neural network inference
  computing framework optimized for mobile platforms.
date-released: 2017-06-30
keywords:
  - "neural network"
  - "artificial intelligence"
  - "deep learning"
  - android
  - ios
  - windows
  - linux
  - macos
  - pnnx
  - simd
  - vulkan
  - riscv
  - x86
  - arm
  - mips
  - loongarch
license: BSD-3-Clause
repository-code: "https://github.com/Tencent/ncnn"


================================================
FILE: CMakeLists.txt
================================================
# When a toolchain file is given: expose LIBRARY_OUTPUT_PATH_ROOT (defaulting to the
# binary dir), then look the toolchain file up by its bare file name under
# CMAKE_SOURCE_DIR only (NO_DEFAULT_PATH) and print the resulting value.
if(CMAKE_TOOLCHAIN_FILE)
    set(LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR} CACHE PATH "root for library output, set this to change where android libs are compiled to")
    # get absolute path, but get_filename_component ABSOLUTE only refer with source dir, so find_file here :(
    get_filename_component(CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME)
    find_file(CMAKE_TOOLCHAIN_FILE ${CMAKE_TOOLCHAIN_FILE_NAME} PATHS ${CMAKE_SOURCE_DIR} NO_DEFAULT_PATH)
    message(STATUS "CMAKE_TOOLCHAIN_FILE = ${CMAKE_TOOLCHAIN_FILE}")
endif()

# Default the install prefix to <build dir>/install unless the caller already defined one.
if(NOT DEFINED CMAKE_INSTALL_PREFIX)
    set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Installation Directory")
endif()
message(STATUS "CMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}")

# NCNN_VERSION defaults to the configure date (YYYYMMDD) when not supplied.
if(NOT DEFINED NCNN_VERSION)
    string(TIMESTAMP NCNN_VERSION "%Y%m%d")
endif()

# The version string is 1.0.<NCNN_VERSION>; the numeric version is NCNN_VERSION itself.
set(NCNN_VERSION_MAJOR 1)
set(NCNN_VERSION_MINOR 0)
set(NCNN_VERSION_PATCH ${NCNN_VERSION})
set(NCNN_VERSION_STRING ${NCNN_VERSION_MAJOR}.${NCNN_VERSION_MINOR}.${NCNN_VERSION_PATCH})
set(NCNN_VERSION_NUMBER ${NCNN_VERSION})
message(STATUS "NCNN_VERSION_STRING = ${NCNN_VERSION_STRING}")

# Require at least CMake 2.8.12 and opt in to policies up to 3.10.
cmake_minimum_required(VERSION 2.8.12...3.10)

# Default to a release build when no build type was requested.
if(NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE release CACHE STRING "Choose the type of build" FORCE)
endif()

# The policies below are newer than the 3.10 ceiling above (or guarded for older
# CMake), so each is enabled explicitly and only when the running CMake knows it.
if(NOT CMAKE_VERSION VERSION_LESS "3.15")
    # enable CMAKE_MSVC_RUNTIME_LIBRARY
    cmake_policy(SET CMP0091 NEW)
endif()

if(POLICY CMP0025)
    # reference from https://cmake.org/cmake/help/latest/policy/CMP0025.html
    cmake_policy(SET CMP0025 NEW)
endif()

if(POLICY CMP0057)
    # reference from https://cmake.org/cmake/help/latest/policy/CMP0057.html
    cmake_policy(SET CMP0057 NEW)
endif()

project(ncnn)

# With MSVC and CMake >= 3.15, optionally link the static CRT
# (MultiThreaded / MultiThreadedDebug depending on the configuration).
if(MSVC AND NOT CMAKE_VERSION VERSION_LESS "3.15")
    option(NCNN_BUILD_WITH_STATIC_CRT "Enables use of statically linked CRT for statically linked ncnn" OFF)
    if(NCNN_BUILD_WITH_STATIC_CRT)
        # cmake before version 3.15 not work
        set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>")
    endif()
endif()

if(CMAKE_FIND_LIBRARY_SUFFIXES_INIT)
    # project() overwrite CMAKE_FIND_LIBRARY_SUFFIXES in toolchain, restore it
    set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES_INIT})
endif()

# User-facing build switches. Each option() only supplies a default; a value passed
# with -D on the command line or already in the cache takes precedence.
option(NCNN_SHARED_LIB "shared library support" OFF)
option(NCNN_ENABLE_LTO "enable link-time optimization" OFF)
option(NCNN_OPENMP "openmp support" ON)
option(NCNN_STDIO "load model from external file" ON)
option(NCNN_STRING "plain and verbose string" ON)
option(NCNN_INSTALL_SDK "install ncnn library and headers" ON)
# in-tree minimal replacements for external dependencies
option(NCNN_SIMPLEOCV "minimal opencv structure emulation" OFF)
option(NCNN_SIMPLEOMP "minimal openmp runtime emulation" OFF)
option(NCNN_SIMPLESTL "minimal cpp stl structure emulation" OFF)
option(NCNN_SIMPLEMATH "minimal cmath" OFF)
option(NCNN_THREADS "build with threads" ON)
option(NCNN_BENCHMARK "print benchmark information for every layer" OFF)
option(NCNN_C_API "build with C api" ON)
option(NCNN_PLATFORM_API "build with platform api candy" ON)
option(NCNN_WINXP "build with windows xp compatibility" OFF)
# image pixel helpers
option(NCNN_PIXEL "convert and resize from/to image pixel" ON)
option(NCNN_PIXEL_ROTATE "rotate image pixel orientation" ON)
option(NCNN_PIXEL_AFFINE "warp affine image pixel" ON)
option(NCNN_PIXEL_DRAWING "draw basic figure and text" ON)
option(NCNN_CMAKE_VERBOSE "print verbose cmake messages" OFF)
# vulkan backend
option(NCNN_VULKAN "vulkan compute support" OFF)
option(NCNN_SIMPLEVK "minimal in-house vulkan loader" ON)
option(NCNN_SYSTEM_GLSLANG "use system glslang library" OFF)
option(NCNN_RUNTIME_CPU "runtime dispatch cpu routines" ON)
option(NCNN_DISABLE_PIC "disable position-independent code" OFF)
# testing, instrumentation and extra targets
option(NCNN_BUILD_TESTS "build tests" OFF)
option(NCNN_COVERAGE "build for coverage" OFF)
option(NCNN_ASAN "build for address sanitizer" OFF)
option(NCNN_BUILD_BENCHMARK "build benchmark" ON)
option(NCNN_PYTHON "build python api" OFF)
# reduced-precision inference paths
option(NCNN_INT8 "int8 inference" ON)
option(NCNN_BF16 "bf16 inference" ON)
option(NCNN_FORCE_INLINE "force inline some function" ON)

# RTTI and exceptions are disabled by default on Android, iOS and simplestl builds,
# and left enabled by default everywhere else.
if(ANDROID OR IOS OR NCNN_SIMPLESTL)
    set(_ncnn_default_disable_rtti_eh ON)
else()
    set(_ncnn_default_disable_rtti_eh OFF)
endif()
option(NCNN_DISABLE_RTTI "disable rtti" ${_ncnn_default_disable_rtti_eh})
option(NCNN_DISABLE_EXCEPTION "disable exception" ${_ncnn_default_disable_rtti_eh})
unset(_ncnn_default_disable_rtti_eh)

# Tools and examples are built by default only for native builds that are not
# Android, iOS or simplestl.
if(ANDROID OR IOS OR NCNN_SIMPLESTL OR CMAKE_CROSSCOMPILING)
    set(_ncnn_default_build_extras OFF)
else()
    set(_ncnn_default_build_extras ON)
endif()
option(NCNN_BUILD_TOOLS "build tools" ${_ncnn_default_build_extras})
option(NCNN_BUILD_EXAMPLES "build examples" ${_ncnn_default_build_extras})
unset(_ncnn_default_build_extras)

# Link-time optimization is only attempted for shared-library builds that ask for it.
if(NCNN_SHARED_LIB AND NCNN_ENABLE_LTO)
    # enable global link time optimization
    cmake_policy(SET CMP0069 NEW)
    set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)
    include(CheckIPOSupported)
    check_ipo_supported(RESULT ipo_supported OUTPUT ipo_supported_output)
    if(NOT ipo_supported)
        # the toolchain cannot do IPO: report why and switch the feature off
        message(WARNING "IPO is not supported: ${ipo_supported_output}")
        set(NCNN_ENABLE_LTO OFF)
    else()
        set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
    endif()
endif()

# Tools, examples, benchmark and tests all need both NCNN_STDIO and NCNN_STRING.
# When either is disabled, warn about and switch off every one of them that is on.
if(NOT NCNN_STDIO OR NOT NCNN_STRING)
    foreach(_ncnn_dependent NCNN_BUILD_TOOLS NCNN_BUILD_EXAMPLES NCNN_BUILD_BENCHMARK NCNN_BUILD_TESTS)
        if(${_ncnn_dependent})
            message(WARNING "NCNN_STDIO or NCNN_STRING disabled, ${_ncnn_dependent} will be turned off.")
            set(${_ncnn_dependent} OFF)
        endif()
    endforeach()
endif()

##############################################

# Compiler feature probes below are compiled in release configuration and as a
# static library, so no link step against a runtime is required.
include(CheckCXXCompilerFlag)
set(CMAKE_TRY_COMPILE_CONFIGURATION release)
set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)

# gnu inline assembly in clang msvc does not work actually
# For every other compiler, try a minimal asm statement; NCNN_GNU_INLINE_ASM is only
# declared as an option when that snippet compiles.
if(NOT (CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")))
    check_cxx_source_compiles("int test(int a) { asm volatile(\"\" : \"=r\"(a) : \"0\"(a) : \"memory\"); return a; }" NCNN_COMPILER_SUPPORT_GNU_INLINE_ASM)
    if(NCNN_COMPILER_SUPPORT_GNU_INLINE_ASM)
        option(NCNN_GNU_INLINE_ASM "optimize platform with gnu style inline assembly" ON)
    else()
        message(WARNING "The compiler does not support gnu style inline assembly. NCNN_GNU_INLINE_ASM will be OFF.")
    endif()
endif()

if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm")
    OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
    OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch64)")
    OR (CMAKE_CXX_COMPILER_ARCHITECTURE_ID MATCHES "(ARMV7|ARM64)")
    OR ((CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")) AND (${CMAKE_GENERATOR_PLATFORM} MATCHES "^(arm|arm64)")))
    set(NCNN_TARGET_ARCH arm)

    if(APPLE AND CMAKE_OSX_ARCHITECTURES STREQUAL "arm64_32")
        set(NCNN_TARGET_ILP32 TRUE)
    endif()

    # 32-bit ARM (pointer size 4 and not the arm64_32 ILP32 target): check that NEON
    # intrinsics compile, then check for the fp32->fp16 conversion intrinsic under
    # vfpv4 flags. NCNN_VFPV4 is only declared when one of the vfpv4 checks passes.
    if(CMAKE_SIZEOF_VOID_P EQUAL 4 AND NOT NCNN_TARGET_ILP32)
        check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, float32x4_t a, float32x4_t b) { return vmlaq_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM_NEON)

        if(NCNN_COMPILER_SUPPORT_ARM_NEON)
            if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
                # MSVC-style command line
                set(CMAKE_REQUIRED_FLAGS "/arch:VFPv4")
                check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)

                unset(CMAKE_REQUIRED_FLAGS)
            else()
                # GNU-style command line
                set(CMAKE_REQUIRED_FLAGS "-mfpu=neon-vfpv4")
                check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)

                # retry with an explicit IEEE fp16 format when the plain flag is not enough
                if(NOT NCNN_COMPILER_SUPPORT_ARM_VFPV4)
                    set(CMAKE_REQUIRED_FLAGS "-mfpu=neon-vfpv4 -mfp16-format=ieee")
                    check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4_FP16)
                endif()

                unset(CMAKE_REQUIRED_FLAGS)
            endif()
        endif()

        if(NCNN_COMPILER_SUPPORT_ARM_VFPV4 OR NCNN_COMPILER_SUPPORT_ARM_VFPV4_FP16)
            option(NCNN_VFPV4 "optimize armv7 platform with vfpv4" ON)
        else()
            message(WARNING "The compiler does not support arm vfpv4. NCNN_VFPV4 will be OFF.")
        endif()
    endif()

    if(CMAKE_SIZEOF_VOID_P EQUAL 8 OR NCNN_TARGET_ILP32)
        # Probe which AArch64 extensions the compiler can target. Every check compiles a
        # one-function snippet that uses a representative intrinsic under the
        # CMAKE_REQUIRED_FLAGS set immediately before it. The three branches run the
        # same snippets and store into the same result variables; they differ only in
        # how the flags are spelled for each compiler front end.
        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
            # MSVC: only an /arch: level is passed
            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.0")
            check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
            check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x8_t test(float16x8_t s, float16x8_t a, float16x8_t b) { return vfmaq_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
            check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vdotq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2")
            check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, float16x8_t a, float16x8_t b) { return vfmlalq_low_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
            check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, bfloat16x8_t a, bfloat16x8_t b) { return vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(s, a, b))); }" NCNN_COMPILER_SUPPORT_ARM84_BF16)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4")
            check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vmmlaq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat16_t test(svfloat16_t s, svfloat16_t a, svfloat16_t b, svbool_t bp) { return svmla_f16_z(bp, s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
            check_cxx_source_compiles("#include <arm_sve.h>\nsvint16_t test(svint16_t s, svint8_t a, svint8_t b) { return svmlslb_s16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svbfloat16_t a, svbfloat16_t b) { return svbfmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
            check_cxx_source_compiles("#include <arm_sve.h>\nsvint32_t test(svint32_t s, svint8_t a, svint8_t b) { return svmmla_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6")
            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svfloat32_t a, svfloat32_t b) { return svmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)

            unset(CMAKE_REQUIRED_FLAGS)
        elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")
            # clang with the MSVC front end: pass the /arch: level plus a -march= string
            # that names the specific extension
            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.0")
            check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2 -march=armv8.2-a+fp16")
            check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x8_t test(float16x8_t s, float16x8_t a, float16x8_t b) { return vfmaq_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2 -march=armv8.2-a+dotprod")
            check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vdotq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.2 -march=armv8.2-a+fp16fml")
            check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, float16x8_t a, float16x8_t b) { return vfmlalq_low_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4 -march=armv8.4-a+bf16")
            check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, bfloat16x8_t a, bfloat16x8_t b) { return vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(s, a, b))); }" NCNN_COMPILER_SUPPORT_ARM84_BF16)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.4 -march=armv8.4-a+i8mm")
            check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vmmlaq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6 -march=armv8.6-a+sve")
            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat16_t test(svfloat16_t s, svfloat16_t a, svfloat16_t b, svbool_t bp) { return svmla_f16_z(bp, s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6 -march=armv8.6-a+sve2")
            check_cxx_source_compiles("#include <arm_sve.h>\nsvint16_t test(svint16_t s, svint8_t a, svint8_t b) { return svmlslb_s16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6 -march=armv8.6-a+sve+bf16")
            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svbfloat16_t a, svbfloat16_t b) { return svbfmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6 -march=armv8.6-a+sve+i8mm")
            check_cxx_source_compiles("#include <arm_sve.h>\nsvint32_t test(svint32_t s, svint8_t a, svint8_t b) { return svmmla_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)

            set(CMAKE_REQUIRED_FLAGS "/arch:armv8.6 -march=armv8.6-a+sve+f32mm")
            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svfloat32_t a, svfloat32_t b) { return svmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)

            unset(CMAKE_REQUIRED_FLAGS)
        else()
            # every other compiler: -march= strings only
            set(CMAKE_REQUIRED_FLAGS "-march=armv8-a")
            check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x4_t test(float32x4_t a) { return vcvt_f16_f32(a); }" NCNN_COMPILER_SUPPORT_ARM_VFPV4)

            set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+fp16")
            check_cxx_source_compiles("#include <arm_neon.h>\nfloat16x8_t test(float16x8_t s, float16x8_t a, float16x8_t b) { return vfmaq_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16)

            set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+dotprod")
            check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vdotq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)

            set(CMAKE_REQUIRED_FLAGS "-march=armv8.2-a+fp16fml")
            check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, float16x8_t a, float16x8_t b) { return vfmlalq_low_f16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML)

            set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+bf16")
            check_cxx_source_compiles("#include <arm_neon.h>\nfloat32x4_t test(float32x4_t s, bfloat16x8_t a, bfloat16x8_t b) { return vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(s, a, b))); }" NCNN_COMPILER_SUPPORT_ARM84_BF16)

            set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+i8mm")
            check_cxx_source_compiles("#include <arm_neon.h>\nint32x4_t test(int32x4_t s, int8x16_t a, int8x16_t b) { return vmmlaq_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM84_I8MM)

            set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve")
            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat16_t test(svfloat16_t s, svfloat16_t a, svfloat16_t b, svbool_t bp) { return svmla_f16_z(bp, s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE)

            set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve2")
            check_cxx_source_compiles("#include <arm_sve.h>\nsvint16_t test(svint16_t s, svint8_t a, svint8_t b) { return svmlslb_s16(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVE2)

            set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+bf16")
            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svbfloat16_t a, svbfloat16_t b) { return svbfmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)

            set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+i8mm")
            check_cxx_source_compiles("#include <arm_sve.h>\nsvint32_t test(svint32_t s, svint8_t a, svint8_t b) { return svmmla_s32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)

            set(CMAKE_REQUIRED_FLAGS "-march=armv8.6-a+sve+f32mm")
            check_cxx_source_compiles("#include <arm_sve.h>\nsvfloat32_t test(svfloat32_t s, svfloat32_t a, svfloat32_t b) { return svmmla_f32(s, a, b); }" NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)

            unset(CMAKE_REQUIRED_FLAGS)
        endif()

        # NCNN_VFPV4 is only declared when the fp16 conversion probe compiled.
        if(NCNN_COMPILER_SUPPORT_ARM_VFPV4)
            option(NCNN_VFPV4 "optimize aarch64 platform with vfpv4" ON)
        else()
            message(WARNING "The compiler does not support arm vfpv4. NCNN_VFPV4 will be OFF.")
        endif()

        if(NCNN_COMPILER_SUPPORT_ARM82_FP16)
            option(NCNN_ARM82 "optimize aarch64 platform with armv8.2 fp16" ON)
            # dotprod and fp16fml are each offered only when the compiler probe passed
            # and NCNN_ARM82 is enabled; a failed probe produces a warning instead.
            if(NCNN_COMPILER_SUPPORT_ARM82_DOTPROD)
                if(NCNN_ARM82)
                    option(NCNN_ARM82DOT "optimize aarch64 platform with armv8.2 dotprod" ON)
                endif()
            else()
                message(WARNING "The compiler does not support armv8.2 dotprod. NCNN_ARM82DOT will be OFF.")
            endif()
            if(NCNN_COMPILER_SUPPORT_ARM82_FP16FML)
                if(NCNN_ARM82)
                    option(NCNN_ARM82FP16FML "optimize aarch64 platform with armv8.2 fp16fml" ON)
                endif()
            else()
                message(WARNING "The compiler does not support armv8.2 fp16fml. NCNN_ARM82FP16FML will be OFF.")
            endif()
            # armv8.4 bf16 is offered only when the compiler probe passed and both
            # NCNN_ARM82DOT and NCNN_ARM82FP16FML are enabled.
            if(NCNN_COMPILER_SUPPORT_ARM84_BF16)
                if(NCNN_ARM82DOT AND NCNN_ARM82FP16FML)
                    option(NCNN_ARM84BF16 "optimize aarch64 platform with armv8.4 bf16" ON)
                endif()
            else()
                # the warning must name the option this branch controls, NCNN_ARM84BF16
                message(WARNING "The compiler does not support armv8.4 bf16. NCNN_ARM84BF16 will be OFF.")
            endif()
            # armv8.4 i8mm is offered only when the compiler probe passed and both
            # NCNN_ARM82DOT and NCNN_ARM82FP16FML are enabled.
            if(NCNN_COMPILER_SUPPORT_ARM84_I8MM)
                if(NCNN_ARM82DOT AND NCNN_ARM82FP16FML)
                    option(NCNN_ARM84I8MM "optimize aarch64 platform with armv8.4 i8mm" ON)
                endif()
            else()
                message(WARNING "The compiler does not support armv8.4 i8mm. NCNN_ARM84I8MM will be OFF.")
            endif()
            # SVE requires both armv8.4 options above to be enabled. The sve2, sve bf16,
            # sve i8mm and sve f32mm options (and their warnings) are only reached inside
            # that condition, and each additionally requires NCNN_ARM86SVE.
            if(NCNN_COMPILER_SUPPORT_ARM86_SVE)
                if(NCNN_ARM84BF16 AND NCNN_ARM84I8MM)
                    option(NCNN_ARM86SVE "optimize aarch64 platform with armv8.6 sve" ON)
                    if(NCNN_COMPILER_SUPPORT_ARM86_SVE2)
                        if(NCNN_ARM86SVE)
                            option(NCNN_ARM86SVE2 "optimize aarch64 platform with armv8.6 sve2" ON)
                        endif()
                    else()
                        message(WARNING "The compiler does not support armv8.6 sve2. NCNN_ARM86SVE2 will be OFF.")
                    endif()
                    if(NCNN_COMPILER_SUPPORT_ARM86_SVEBF16)
                        if(NCNN_ARM86SVE)
                            option(NCNN_ARM86SVEBF16 "optimize aarch64 platform with armv8.6 sve bf16" ON)
                        endif()
                    else()
                        message(WARNING "The compiler does not support armv8.6 sve bf16. NCNN_ARM86SVEBF16 will be OFF.")
                    endif()
                    if(NCNN_COMPILER_SUPPORT_ARM86_SVEI8MM)
                        if(NCNN_ARM86SVE)
                            option(NCNN_ARM86SVEI8MM "optimize aarch64 platform with armv8.6 sve i8mm" ON)
                        endif()
                    else()
                        message(WARNING "The compiler does not support armv8.6 sve i8mm. NCNN_ARM86SVEI8MM will be OFF.")
                    endif()
                    if(NCNN_COMPILER_SUPPORT_ARM86_SVEF32MM)
                        if(NCNN_ARM86SVE)
                            option(NCNN_ARM86SVEF32MM "optimize aarch64 platform with armv8.6 sve f32mm" ON)
                        endif()
                    else()
                        message(WARNING "The compiler does not support armv8.6 sve f32mm. NCNN_ARM86SVEF32MM will be OFF.")
                    endif()
                endif()
            else()
                message(WARNING "The compiler does not support armv8.6 sve. NCNN_ARM86SVE will be OFF.")
            endif()
        else()
            message(WARNING "The compiler does not support armv8.2 fp16. NCNN_ARM82 will be OFF.")
        endif()
    endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips)")
    # MIPS: probe the -mmsa flag and the Loongson MMI intrinsics, then declare the
    # matching option for each probe that passed.
    set(NCNN_TARGET_ARCH mips)

    check_cxx_compiler_flag("-mmsa" NCNN_COMPILER_SUPPORT_MIPS_MSA)

    # the MMI probe includes loongson_mmi.h from the in-tree src/layer/mips directory
    set(CMAKE_REQUIRED_FLAGS "-mloongson-mmi -I${CMAKE_CURRENT_SOURCE_DIR}/src/layer/mips")
    check_cxx_source_compiles("#include \"loongson_mmi.h\"\nint32x2_t test(int16x4_t a, int16x4_t b) { return __mmi_pmaddhw(a, b); }" NCNN_COMPILER_SUPPORT_LOONGSON_MMI)

    unset(CMAKE_REQUIRED_FLAGS)

    if(NCNN_COMPILER_SUPPORT_MIPS_MSA)
        option(NCNN_MSA "optimize mips platform with msa extension" ON)
    else()
        message(WARNING "The compiler does not support msa extension. NCNN_MSA will be OFF.")
    endif()
    if(NCNN_COMPILER_SUPPORT_LOONGSON_MMI)
        option(NCNN_MMI "optimize mips platform with loongson mmi extension" ON)
    else()
        message(WARNING "The compiler does not support loongson mmi extension. NCNN_MMI will be OFF.")
    endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64|loongarch32)")
    # LoongArch: probe LSX and LASX intrinsics. NCNN_LASX is only considered when
    # LSX is supported; without LSX only the LSX warning is printed.
    set(NCNN_TARGET_ARCH loongarch)

    set(CMAKE_REQUIRED_FLAGS "-mlsx")
    check_cxx_source_compiles("#include <lsxintrin.h>\n__m128 test(__m128 a, __m128 b, __m128 c) { return __lsx_vfmadd_s(a, b, c); }" NCNN_COMPILER_SUPPORT_LOONGARCH_LSX)

    set(CMAKE_REQUIRED_FLAGS "-mlasx")
    check_cxx_source_compiles("#include <lasxintrin.h>\n__m256 test(__m256 a, __m256 b, __m256 c) { return __lasx_xvfmadd_s(a, b, c); }" NCNN_COMPILER_SUPPORT_LOONGARCH_LASX)

    unset(CMAKE_REQUIRED_FLAGS)

    if(NCNN_COMPILER_SUPPORT_LOONGARCH_LSX)
        option(NCNN_LSX "optimize loongarch platform with lsx extension" ON)
        if(NCNN_COMPILER_SUPPORT_LOONGARCH_LASX)
            option(NCNN_LASX "optimize loongarch platform with lasx extension" ON)
        else()
            message(WARNING "The compiler does not support lasx extension. NCNN_LASX will be OFF.")
        endif()
    else()
        message(WARNING "The compiler does not support lsx extension. NCNN_LSX will be OFF.")
    endif()

elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
    set(NCNN_TARGET_ARCH riscv)

    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
        set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv")
        check_cxx_source_compiles("#include <riscv_vector.h>\nvfloat32m8_t test(vfloat32m8_t s, vfloat32m8_t w, float v, size_t vl) { return __riscv_vfmacc_vf_f32m8(s, v, w, vl); }\nvfloat32m1x2_t test2(vfloat32m1_t x) { return __riscv_vcreate_v_f32m1x2(x, x); }" NCNN_COMPILER_SUPPORT_RISCV_V)

        set(CMAKE_REQUIRED_FLAGS "-march=rv64gc_zfh -D__fp16=_Float16")
        check_cxx_source_compiles("__fp16 test(__fp16 a) { return a * a; }" NCNN_COMPILER_SUPPORT_RISCV_ZFH)

        set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16")
        check_cxx_source_compiles("#include <riscv_vector.h>\nvfloat16m8_t test(vfloat16m8_t s, vfloat16m8_t w, __fp16 v, size_t vl) { return __riscv_vfmacc_vf_f16m8(s, v, w, vl); }\nvfloat16m1x2_t test2(vfloat16m1_t x){ return __riscv_vcreate_v_f16m1x2(x, x); }" NCNN_COMPILER_SUPPORT_RISCV_ZVFH)

        set(CMAKE_REQUIRED_FLAGS "-march=rv64gc_zfh_xtheadvector -D__fp16=_Float16")
        check_cxx_source_compiles("#include <riscv_vector.h>\nvfloat16m8_t test(vfloat16m8_t s, vfloat16m8_t w, __fp16 v, size_t vl) { return __riscv_vfmacc_vf_f16m8(s, v, w, vl); }\nvfloat16m1x2_t test2(vfloat16m1_t x){ return __riscv_vcreate_v_f16m1x2(x, x); }" NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)

        unset(CMAKE_REQUIRED_FLAGS)

        if(NCNN_COMPILER_SUPPORT_RISCV_V OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
            option(NCNN_RVV "optimize risc-v platform with v extension" ON)
        else()
            message(WARNING "The compiler does not support risc-v v or xtheadvector extension. NCNN_RVV will be OFF.")
        endif()

        if(NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
            option(NCNN_XTHEADVECTOR "optimize risc-v platform with xtheadvector extension" ON)
        else()
            message(WARNING "The compiler does not support risc-v xtheadvector extension. NCNN_XTHEADVECTOR will be OFF.")
        endif()

        if(NCNN_COMPILER_SUPPORT_RISCV_ZFH)
            option(NCNN_ZFH "optimize risc-v platform with zfh extension" ON)
            if(NCNN_COMPILER_SUPPORT_RISCV_ZVFH OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
                if(NCNN_RVV AND NCNN_ZFH)
                    option(NCNN_ZVFH "optimize risc-v platform with zvfh extension" ON)
                endif()
            else()
                message(WARNING "The compiler does not support zvfh extension. NCNN_ZVFH will be OFF.")
            endif()
        else()
            message(WARNING "The compiler does not support risc-v zfh extension. NCNN_ZFH will be OFF.")
        endif()

    elseif(CMAKE_SIZEOF_VOID_P EQUAL 4)
        # Probe the compiler for 32-bit RISC-V extensions. Each probe sets
        # CMAKE_REQUIRED_FLAGS to the -march string under test and compiles a
        # tiny snippet; the outcome lands in the NCNN_COMPILER_SUPPORT_RISCV_* variable.

        # v: fp32 vector multiply-accumulate plus tuple creation intrinsics
        set(CMAKE_REQUIRED_FLAGS "-march=rv32gcv")
        check_cxx_source_compiles("#include <riscv_vector.h>\nvfloat32m8_t test(vfloat32m8_t s, vfloat32m8_t w, float v, size_t vl) { return __riscv_vfmacc_vf_f32m8(s, v, w, vl); }\nvfloat32m1x2_t test2(vfloat32m1_t x) { return __riscv_vcreate_v_f32m1x2(x, x); }" NCNN_COMPILER_SUPPORT_RISCV_V)

        # zfh: scalar half-precision arithmetic (__fp16 is mapped to _Float16 via -D)
        set(CMAKE_REQUIRED_FLAGS "-march=rv32gc_zfh -D__fp16=_Float16")
        check_cxx_source_compiles("__fp16 test(__fp16 a) { return a * a; }" NCNN_COMPILER_SUPPORT_RISCV_ZFH)

        # zvfh: fp16 vector multiply-accumulate plus tuple creation intrinsics
        set(CMAKE_REQUIRED_FLAGS "-march=rv32gcv_zfh_zvfh -D__fp16=_Float16")
        check_cxx_source_compiles("#include <riscv_vector.h>\nvfloat16m8_t test(vfloat16m8_t s, vfloat16m8_t w, __fp16 v, size_t vl) { return __riscv_vfmacc_vf_f16m8(s, v, w, vl); }\nvfloat16m1x2_t test2(vfloat16m1_t x){ return __riscv_vcreate_v_f16m1x2(x, x); }" NCNN_COMPILER_SUPPORT_RISCV_ZVFH)

        # xtheadvector: same fp16 vector snippet, compiled with the xtheadvector -march string
        set(CMAKE_REQUIRED_FLAGS "-march=rv32gc_zfh_xtheadvector -D__fp16=_Float16")
        check_cxx_source_compiles("#include <riscv_vector.h>\nvfloat16m8_t test(vfloat16m8_t s, vfloat16m8_t w, __fp16 v, size_t vl) { return __riscv_vfmacc_vf_f16m8(s, v, w, vl); }\nvfloat16m1x2_t test2(vfloat16m1_t x){ return __riscv_vcreate_v_f16m1x2(x, x); }" NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)

        # do not let the probe flags affect later checks
        unset(CMAKE_REQUIRED_FLAGS)

        # Vector support: NCNN_RVV is offered when either the standard v probe
        # or the xtheadvector probe succeeded.
        if(NOT NCNN_COMPILER_SUPPORT_RISCV_V AND NOT NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
            message(WARNING "The compiler does not support risc-v v or xtheadvector extension. NCNN_RVV will be OFF.")
        else()
            option(NCNN_RVV "optimize risc-v platform with v extension" ON)
        endif()

        # xtheadvector gets its own switch.
        if(NOT NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
            message(WARNING "The compiler does not support risc-v xtheadvector extension. NCNN_XTHEADVECTOR will be OFF.")
        else()
            option(NCNN_XTHEADVECTOR "optimize risc-v platform with xtheadvector extension" ON)
        endif()

        # Half precision: NCNN_ZFH first; NCNN_ZVFH additionally needs a passing
        # zvfh or xtheadvector probe and both NCNN_RVV and NCNN_ZFH enabled.
        if(NOT NCNN_COMPILER_SUPPORT_RISCV_ZFH)
            message(WARNING "The compiler does not support risc-v zfh extension. NCNN_ZFH will be OFF.")
        else()
            option(NCNN_ZFH "optimize risc-v platform with zfh extension" ON)
            if(NOT NCNN_COMPILER_SUPPORT_RISCV_ZVFH AND NOT NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
                message(WARNING "The compiler does not support zvfh extension. NCNN_ZVFH will be OFF.")
            elseif(NCNN_RVV AND NCNN_ZFH)
                option(NCNN_ZVFH "optimize risc-v platform with zvfh extension" ON)
            endif()
        endif()

    endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
    # PowerPC: the target arch is "powerpc" unless NCNN_PPC64LE_VSX is enabled,
    # in which case it is reported as "x86" and the compiler is probed for the
    # SSE2 / SSE4.1 intrinsic headers.
    if(NOT NCNN_PPC64LE_VSX)
        set(NCNN_TARGET_ARCH powerpc)
    else()
        set(NCNN_TARGET_ARCH x86)

        # probe <emmintrin.h> (SSE2) and <smmintrin.h> (SSE4.1) in turn
        set(CMAKE_REQUIRED_FLAGS "-DNO_WARN_X86_INTRINSICS -D__SSE2__")
        check_cxx_source_compiles("#include <emmintrin.h>\n__m128i test(__m128i a, __m128i b) { return _mm_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_PPC64LE_SSE2)
        set(CMAKE_REQUIRED_FLAGS "-DNO_WARN_X86_INTRINSICS -D__SSE4_1__")
        check_cxx_source_compiles("#include <smmintrin.h>\n__m128i test(__m128i a, __m128i b) { return _mm_packus_epi32(a, b); }" NCNN_COMPILER_SUPPORT_PPC64LE_SSE41)
        unset(CMAKE_REQUIRED_FLAGS)

        if(NOT NCNN_COMPILER_SUPPORT_PPC64LE_SSE2)
            message(WARNING "The compiler does not support sse2 extension. NCNN_VSX_SSE2 will be OFF.")
        else()
            option(NCNN_VSX_SSE2 "optimize ppc64le platform with sse2 extension" ON)
        endif()

        if(NOT NCNN_COMPILER_SUPPORT_PPC64LE_SSE41)
            message(WARNING "The compiler does not support sse4.1 extension. NCNN_VSX_SSE41 will be OFF.")
        else()
            option(NCNN_VSX_SSE41 "optimize ppc64le platform with sse4.1 extension" ON)
        endif()
    endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(xtensa)")
    set(NCNN_TARGET_ARCH xtensa)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x)")
    set(NCNN_TARGET_ARCH s390x)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(sw_64)")
    set(NCNN_TARGET_ARCH sw_64)
    # sw_64 is an Alpha-like platform; -mieee requests IEEE-compliant floating-point code
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mieee")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mieee")
else()
    set(NCNN_TARGET_ARCH x86)

    option(NCNN_SSE2 "optimize x86 platform with sse2 extension" ON)

    if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
        # MSVC: every probe only selects an /arch: level (AVX, AVX2 or AVX512)
        # and then checks whether the representative intrinsic compiles.
        # Results are stored in NCNN_COMPILER_SUPPORT_X86_*.

        # --- probes compiled with /arch:AVX: avx, fma, xop, f16c ---
        set(CMAKE_REQUIRED_FLAGS "/arch:AVX")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m256 a, __m256 b) { return _mm256_mul_ps(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX)

        set(CMAKE_REQUIRED_FLAGS "/arch:AVX")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m256 s, __m256 a, __m256 b) { return _mm256_fmadd_ps(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_FMA)

        # xop intrinsics are pulled in from <ammintrin.h> here
        set(CMAKE_REQUIRED_FLAGS "/arch:AVX")
        check_cxx_source_compiles("#include <immintrin.h>\n#include <ammintrin.h>\n__m128i test(__m128i s, __m128i a, __m128i b) { return _mm_maddd_epi16(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_XOP)

        set(CMAKE_REQUIRED_FLAGS "/arch:AVX")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m128i a) { return _mm256_cvtph_ps(a); }" NCNN_COMPILER_SUPPORT_X86_F16C)

        # --- avx2 and avx512 base levels ---
        set(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i a, __m256i b) { return _mm256_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX2)

        set(CMAKE_REQUIRED_FLAGS "/arch:AVX512")
        check_cxx_source_compiles("#include <immintrin.h>\n__m512i test(__m512i a, __m512i b) { return _mm512_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512)

        # --- 256-bit extensions probed with /arch:AVX2: avx vnni, vnni int8, vnni int16, ne convert ---
        set(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwssd_avx_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI)

        set(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpbssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT8)

        set(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwsud_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT16)

        set(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
        check_cxx_source_compiles("#include <immintrin.h>\n__m128bh test(__m256 a) { return _mm256_cvtneps_avx_pbh(a); }" NCNN_COMPILER_SUPPORT_X86_AVX_NE_CONVERT)

        # --- 512-bit extensions probed with /arch:AVX512: vnni, bf16, fp16 ---
        set(CMAKE_REQUIRED_FLAGS "/arch:AVX512")
        check_cxx_source_compiles("#include <immintrin.h>\n__m512i test(__m512i s, __m512i a, __m512i b) { return _mm512_dpwssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI)

        set(CMAKE_REQUIRED_FLAGS "/arch:AVX512")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256bh test(__m256bh s, __m512bh a, __m512bh b) { return _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(s), a, b)); }\n__m512i test2(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16)

        set(CMAKE_REQUIRED_FLAGS "/arch:AVX512")
        check_cxx_source_compiles("#include <immintrin.h>\n__m512h test(__m512h s, __m512h a, __m512h b) { return _mm512_fmadd_ph(s, a, b); }\n__m512 test2(__m512 a) { return _mm512_cvtxph_ps(_mm512_cvtxps_ph(a)); }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16)

        # do not let the probe flags affect later checks
        unset(CMAKE_REQUIRED_FLAGS)
    elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")
        # Clang with the MSVC front end: each probe combines an /arch: level
        # with GCC-style -m feature switches. Results are stored in
        # NCNN_COMPILER_SUPPORT_X86_*.

        # flag-only check: is -mrecip=none accepted?
        check_cxx_compiler_flag("-mrecip=none" NCNN_COMPILER_SUPPORT_X86_RECIP_NONE)

        # --- /arch:AVX based probes: avx, fma, xop, f16c ---
        set(CMAKE_REQUIRED_FLAGS "/arch:AVX")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m256 a, __m256 b) { return _mm256_mul_ps(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX)

        set(CMAKE_REQUIRED_FLAGS "/arch:AVX -mfma -mf16c")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m256 s, __m256 a, __m256 b) { return _mm256_fmadd_ps(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_FMA)

        # xop intrinsics are pulled in from <x86intrin.h> here
        set(CMAKE_REQUIRED_FLAGS "/arch:AVX -mxop")
        check_cxx_source_compiles("#include <x86intrin.h>\n__m128i test(__m128i s, __m128i a, __m128i b) { return _mm_maddd_epi16(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_XOP)

        set(CMAKE_REQUIRED_FLAGS "/arch:AVX -mf16c")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m128i a) { return _mm256_cvtph_ps(a); }" NCNN_COMPILER_SUPPORT_X86_F16C)

        # --- avx2 and avx512 base levels ---
        set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i a, __m256i b) { return _mm256_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX2)

        set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl")
        check_cxx_source_compiles("#include <immintrin.h>\n__m512i test(__m512i a, __m512i b) { return _mm512_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512)

        # --- 256-bit extensions on top of /arch:AVX2: avx vnni, vnni int8, vnni int16, ne convert ---
        set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c -mavxvnni")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwssd_avx_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI)

        set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c -mavxvnni -mavxvnniint8")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpbssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT8)

        set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c -mavxvnni -mavxvnniint16")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwsud_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT16)

        set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c -mavxneconvert")
        check_cxx_source_compiles("#include <immintrin.h>\n__m128bh test(__m256 a) { return _mm256_cvtneps_avx_pbh(a); }" NCNN_COMPILER_SUPPORT_X86_AVX_NE_CONVERT)

        # --- 512-bit extensions on top of /arch:AVX512: vnni, bf16, fp16 ---
        set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512vnni")
        check_cxx_source_compiles("#include <immintrin.h>\n__m512i test(__m512i s, __m512i a, __m512i b) { return _mm512_dpwssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI)

        set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256bh test(__m256bh s, __m512bh a, __m512bh b) { return _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(s), a, b)); }\n__m512i test2(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16)

        set(CMAKE_REQUIRED_FLAGS "/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512fp16")
        check_cxx_source_compiles("#include <immintrin.h>\n__m512h test(__m512h s, __m512h a, __m512h b) { return _mm512_fmadd_ph(s, a, b); }\n__m512 test2(__m512 a) { return _mm512_cvtxph_ps(_mm512_cvtxps_ph(a)); }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16)

        # do not let the probe flags affect later checks
        unset(CMAKE_REQUIRED_FLAGS)
    else()
        # All remaining compilers (GCC-style command line): each probe passes
        # the -m feature switches under test and compiles a representative
        # intrinsic. Results are stored in NCNN_COMPILER_SUPPORT_X86_*.

        # flag-only check: is -mrecip=none accepted?
        check_cxx_compiler_flag("-mrecip=none" NCNN_COMPILER_SUPPORT_X86_RECIP_NONE)

        # --- avx, fma, xop, f16c ---
        set(CMAKE_REQUIRED_FLAGS "-mavx")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m256 a, __m256 b) { return _mm256_mul_ps(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX)

        set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m256 s, __m256 a, __m256 b) { return _mm256_fmadd_ps(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_FMA)

        # xop intrinsics are pulled in from <x86intrin.h> here
        set(CMAKE_REQUIRED_FLAGS "-mfma -mxop")
        check_cxx_source_compiles("#include <x86intrin.h>\n__m128i test(__m128i s, __m128i a, __m128i b) { return _mm_maddd_epi16(a, b, s); }" NCNN_COMPILER_SUPPORT_X86_XOP)

        set(CMAKE_REQUIRED_FLAGS "-mf16c")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256 test(__m128i a) { return _mm256_cvtph_ps(a); }" NCNN_COMPILER_SUPPORT_X86_F16C)

        # --- avx2 and avx512 base levels ---
        set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx2")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i a, __m256i b) { return _mm256_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX2)

        set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl")
        check_cxx_source_compiles("#include <immintrin.h>\n__m512i test(__m512i a, __m512i b) { return _mm512_madd_epi16(a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512)

        # --- 256-bit extensions on top of -mavx2: avx vnni, vnni int8, vnni int16, ne convert ---
        set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx2 -mavxvnni")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwssd_avx_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI)

        set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx2 -mavxvnni -mavxvnniint8")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpbssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT8)

        set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx2 -mavxvnni -mavxvnniint16")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256i test(__m256i s, __m256i a, __m256i b) { return _mm256_dpwsud_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT16)

        set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx2 -mavxneconvert")
        check_cxx_source_compiles("#include <immintrin.h>\n__m128bh test(__m256 a) { return _mm256_cvtneps_avx_pbh(a); }" NCNN_COMPILER_SUPPORT_X86_AVX_NE_CONVERT)

        # --- 512-bit extensions on top of the avx512 f/cd/bw/dq/vl set: vnni, bf16, fp16 ---
        set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512vnni")
        check_cxx_source_compiles("#include <immintrin.h>\n__m512i test(__m512i s, __m512i a, __m512i b) { return _mm512_dpwssd_epi32(s, a, b); }" NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI)

        set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512bf16")
        check_cxx_source_compiles("#include <immintrin.h>\n__m256bh test(__m256bh s, __m512bh a, __m512bh b) { return _mm512_cvtneps_pbh(_mm512_dpbf16_ps(_mm512_cvtpbh_ps(s), a, b)); }\n__m512i test2(__m512 a) { __m256i _a = (__m256i)_mm512_cvtneps_pbh(a); return _mm512_inserti32x8(_mm512_castsi256_si512(_a), _a, 1); }" NCNN_COMPILER_SUPPORT_X86_AVX512_BF16)

        set(CMAKE_REQUIRED_FLAGS "-mfma -mf16c -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mavx512fp16")
        check_cxx_source_compiles("#include <immintrin.h>\n__m512h test(__m512h s, __m512h a, __m512h b) { return _mm512_fmadd_ph(s, a, b); }\n__m512 test2(__m512 a) { return _mm512_cvtxph_ps(_mm512_cvtxps_ph(a)); }" NCNN_COMPILER_SUPPORT_X86_AVX512_FP16)

        # do not let the probe flags affect later checks
        unset(CMAKE_REQUIRED_FLAGS)
    endif()

    # Turn the probe results into user-facing NCNN_* switches.
    # The switches form a tree: an option is only declared when the compiler
    # probe for it passed AND its parent switch is enabled; a failed probe
    # produces a warning instead. Nothing is offered on Emscripten / WASI.
    if(CMAKE_SYSTEM_NAME MATCHES "Emscripten|WASI" OR NOT NCNN_COMPILER_SUPPORT_X86_AVX)
        message(WARNING "The compiler does not support avx extension. NCNN_AVX will be OFF.")
    else()
        option(NCNN_AVX "optimize x86 platform with avx extension" ON)

        # children of NCNN_AVX: fma, xop, f16c
        if(NOT NCNN_COMPILER_SUPPORT_X86_FMA)
            message(WARNING "The compiler does not support fma extension. NCNN_FMA will be OFF.")
        elseif(NCNN_AVX)
            option(NCNN_FMA "optimize x86 platform with fma extension" ON)
        endif()

        if(NOT NCNN_COMPILER_SUPPORT_X86_XOP)
            message(WARNING "The compiler does not support xop extension. NCNN_XOP will be OFF.")
        elseif(NCNN_AVX)
            option(NCNN_XOP "optimize x86 platform with xop extension" ON)
        endif()

        if(NOT NCNN_COMPILER_SUPPORT_X86_F16C)
            message(WARNING "The compiler does not support f16c extension. NCNN_F16C will be OFF.")
        elseif(NCNN_AVX)
            option(NCNN_F16C "optimize x86 platform with f16c extension" ON)
        endif()

        # avx2 and everything that hangs below it
        if(NOT NCNN_COMPILER_SUPPORT_X86_AVX2)
            message(WARNING "The compiler does not support avx2 extension. NCNN_AVX2 will be OFF.")
        else()
            if(NCNN_AVX)
                option(NCNN_AVX2 "optimize x86 platform with avx2 extension" ON)
            endif()

            # avx vnni, with its int8 / int16 children
            if(NOT NCNN_COMPILER_SUPPORT_X86_AVX_VNNI)
                message(WARNING "The compiler does not support avx vnni extension. NCNN_AVXVNNI will be OFF.")
            else()
                if(NCNN_AVX2)
                    option(NCNN_AVXVNNI "optimize x86 platform with avx vnni extension" ON)
                endif()

                if(NOT NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT8)
                    message(WARNING "The compiler does not support avx vnni int8 extension. NCNN_AVXVNNIINT8 will be OFF.")
                elseif(NCNN_AVXVNNI)
                    option(NCNN_AVXVNNIINT8 "optimize x86 platform with avx vnni int8 extension" ON)
                endif()

                if(NOT NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT16)
                    message(WARNING "The compiler does not support avx vnni int16 extension. NCNN_AVXVNNIINT16 will be OFF.")
                elseif(NCNN_AVXVNNI)
                    option(NCNN_AVXVNNIINT16 "optimize x86 platform with avx vnni int16 extension" ON)
                endif()
            endif()

            # avx ne convert
            if(NOT NCNN_COMPILER_SUPPORT_X86_AVX_NE_CONVERT)
                message(WARNING "The compiler does not support avx ne convert extension. NCNN_AVXNECONVERT will be OFF.")
            elseif(NCNN_AVX2)
                option(NCNN_AVXNECONVERT "optimize x86 platform with avx ne convert extension" ON)
            endif()

            # avx512, with its vnni / bf16 / fp16 children
            if(NOT NCNN_COMPILER_SUPPORT_X86_AVX512)
                message(WARNING "The compiler does not support avx512 extension. NCNN_AVX512 will be OFF.")
            else()
                if(NCNN_AVX2)
                    option(NCNN_AVX512 "optimize x86 platform with avx512 extension" ON)
                endif()

                if(NOT NCNN_COMPILER_SUPPORT_X86_AVX512_VNNI)
                    message(WARNING "The compiler does not support avx512 vnni extension. NCNN_AVX512VNNI will be OFF.")
                elseif(NCNN_AVX512)
                    option(NCNN_AVX512VNNI "optimize x86 platform with avx512 vnni extension" ON)
                endif()

                if(NOT NCNN_COMPILER_SUPPORT_X86_AVX512_BF16)
                    message(WARNING "The compiler does not support avx512 bf16 extension. NCNN_AVX512BF16 will be OFF.")
                elseif(NCNN_AVX512)
                    option(NCNN_AVX512BF16 "optimize x86 platform with avx512 bf16 extension" ON)
                endif()

                if(NOT NCNN_COMPILER_SUPPORT_X86_AVX512_FP16)
                    message(WARNING "The compiler does not support avx512 fp16 extension. NCNN_AVX512FP16 will be OFF.")
                elseif(NCNN_AVX512)
                    option(NCNN_AVX512FP16 "optimize x86 platform with avx512 fp16 extension" ON)
                endif()
            endif()
        endif()
    endif()
endif()

# Architecture detection is done: clear the try_compile settings
# (presumably set earlier for the compiler probes - confirm above this chunk).
unset(CMAKE_TRY_COMPILE_TARGET_TYPE)
unset(CMAKE_TRY_COMPILE_CONFIGURATION)

# Report the detected architecture with its pointer-width suffix.
set(_ncnn_arch_width "32bit")
if(NCNN_TARGET_ILP32)
    set(_ncnn_arch_width "64bit ilp32")
elseif(CMAKE_SIZEOF_VOID_P EQUAL 8)
    set(_ncnn_arch_width "64bit")
endif()
message(STATUS "Target arch: ${NCNN_TARGET_ARCH} ${_ncnn_arch_width}")
unset(_ncnn_arch_width)

##############################################

# set cmake default folder name
# USE_FOLDERS lets IDE generators group targets into folders;
# PREDEFINED_TARGETS_FOLDER names the folder used for CMake's own predefined targets.
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "cmake")

# Emscripten builds: the -s settings are appended to the C and C++ compile
# flags and to the executable link flags.
# NOTE: the link-flag variable is CMAKE_EXE_LINKER_FLAGS; the former spelling
# CMAKE_EXECUTBLE_LINKER_FLAGS is not a CMake variable and was silently ignored.
if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -s FORCE_FILESYSTEM=1 -s INITIAL_MEMORY=256MB -s EXIT_RUNTIME=1")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -s FORCE_FILESYSTEM=1 -s INITIAL_MEMORY=256MB -s EXIT_RUNTIME=1")
    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -s FORCE_FILESYSTEM=1 -s INITIAL_MEMORY=256MB -s EXIT_RUNTIME=1")

    # extra pthread settings, added only when both NCNN_OPENMP and NCNN_SIMPLEOMP are enabled
    if(NCNN_OPENMP AND NCNN_SIMPLEOMP)
        # TODO better flags for emscripten
        # node --experimental-wasm-threads xxx.js
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -s USE_PTHREADS=1 -s PTHREAD_POOL_SIZE=15")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -s USE_PTHREADS=1 -s PTHREAD_POOL_SIZE=15")
        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -s USE_PTHREADS=1 -s PTHREAD_POOL_SIZE=15")
    endif()
endif()

# Vulkan support needs the glslang and SPIRV targets. They come either from an
# installed glslang (NCNN_SYSTEM_GLSLANG) or from the bundled glslang/ directory.
if(NCNN_VULKAN)
    if(NCNN_SYSTEM_GLSLANG)
        find_package(Threads)
        find_package(SPIRV-Tools QUIET)
        find_package(SPIRV-Tools-opt QUIET)
        find_package(glslang QUIET)
        if(glslang_FOUND)
            # expose the namespaced imported targets under the plain names used below
            add_library(glslang ALIAS glslang::glslang)
            add_library(SPIRV ALIAS glslang::SPIRV)
        else()
            # fallback: include the exported *Targets.cmake files from GLSLANG_TARGET_DIR
            set(GLSLANG_TARGET_DIR "GLSLANG-NOTFOUND" CACHE PATH "Absolute path to glslangTargets.cmake directory")
            # The directory may also be supplied through the environment. Adopt
            # that value when the CMake variable is unset, so the include()
            # calls below do not resolve against "GLSLANG-NOTFOUND".
            if(NOT GLSLANG_TARGET_DIR AND DEFINED ENV{GLSLANG_TARGET_DIR})
                set(GLSLANG_TARGET_DIR "$ENV{GLSLANG_TARGET_DIR}")
            endif()
            if(NOT GLSLANG_TARGET_DIR)
                message(WARNING "set glslang_DIR to glslang-config.cmake directory for using system glslang.")
                message(WARNING "GLSLANG_TARGET_DIR must be defined! NCNN_SYSTEM_GLSLANG will be turned off.")
                set(NCNN_SYSTEM_GLSLANG OFF)
            else()
                include("${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake")
                include("${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake")
                if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
                    # hlsl support can be optional
                    include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
                endif()
                include("${GLSLANG_TARGET_DIR}/glslangTargets.cmake")
                include("${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake")
            endif()
        endif()

        # whichever route was taken, both targets must exist now; otherwise
        # fall back to the bundled glslang
        if(TARGET glslang AND TARGET SPIRV)
            get_property(glslang_location TARGET glslang PROPERTY LOCATION)
            get_property(SPIRV_location TARGET SPIRV PROPERTY LOCATION)
            message(STATUS "Found glslang: ${glslang_location} (found version \"${glslang_VERSION}\")")
            message(STATUS "Found SPIRV: ${SPIRV_location} (found version \"${glslang_VERSION}\")")
        else()
            message(WARNING "glslang or SPIRV target not found! NCNN_SYSTEM_GLSLANG will be turned off.")
            set(NCNN_SYSTEM_GLSLANG OFF)
        endif()
    endif()

    # bundled glslang (also reached when the system lookup above gave up)
    if(NOT NCNN_SYSTEM_GLSLANG)
        if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/glslang/CMakeLists.txt")
            message(FATAL_ERROR "The submodules were not downloaded! Please update submodules with \"git submodule update --init\" and try again.")
        else()
            # glslang requires c++11
            set(CMAKE_CXX_STANDARD 11)

            # defaults for glslang's own build options, declared before add_subdirectory
            option(BUILD_EXTERNAL "" OFF)
            option(ENABLE_SPVREMAPPER "" OFF)
            option(ENABLE_GLSLANG_BINARIES "" OFF)
            option(ENABLE_HLSL "" OFF)
            option(ENABLE_RTTI "" OFF)
            option(ENABLE_EXCEPTIONS "" OFF)
            option(ENABLE_OPT "" OFF)
            option(ENABLE_PCH "" OFF)
            option(GLSLANG_TESTS "" OFF)
            # glslang install rules are enabled only for a non-shared ncnn build
            if(NCNN_SHARED_LIB)
                option(GLSLANG_ENABLE_INSTALL "" OFF)
            else()
                option(GLSLANG_ENABLE_INSTALL "" ON)
            endif()
            add_subdirectory(glslang)
            if(NCNN_SHARED_LIB)
                # hide glslang symbols (GNU, or Clang without the MSVC front end)
                if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NOT CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
                    target_compile_options(glslang PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
                    target_compile_options(glslang-default-resource-limits PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
                endif()
                if(NCNN_ENABLE_LTO)
                    set_target_properties(glslang PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON)
                    set_target_properties(glslang-default-resource-limits PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON)
                endif()
            endif()
        endif()
    endif()
endif()

# The library in src/ is always built; every other component is added only
# when its NCNN_* switch is enabled.
add_subdirectory(src)
if(NCNN_BUILD_BENCHMARK)
    add_subdirectory(benchmark)
endif()
if(NCNN_BUILD_EXAMPLES)
    add_subdirectory(examples)
endif()
if(NCNN_BUILD_TOOLS)
    add_subdirectory(tools)
endif()
if(NCNN_BUILD_TESTS)
    # enable_testing() is called before the test directories are added
    enable_testing()
    add_subdirectory(tests)
    add_subdirectory(tests/perf)
endif()
if(NCNN_PYTHON)
    add_subdirectory(python)
endif()


================================================
FILE: CONTRIBUTING.md
================================================

# Acknowledgements

- Thanks to bug1989 [https://github.com/bug1989] for contributing the initial quantized int8 inference code and benchmarks on a large variety of devices
- Thanks to zhiliu6 [https://github.com/zhiliu6] for contributing the darknet conversion tool, operators and YOLO examples
- Thanks to Tijmen Verhulsdonck [https://github.com/Timen] for contributing the massive AVX optimization for x86 platform


================================================
FILE: Info.plist
================================================
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
    <key>CFBundleName</key>
    <string>__NAME__</string>
    <key>CFBundleIdentifier</key>
    <string>__IDENTIFIER__</string>
    <key>CFBundleVersion</key>
    <string>__VERSION__</string>
    <key>CFBundleShortVersionString</key>
    <string>__VERSION__</string>
    <key>CFBundleSignature</key>
    <string>????</string>
    <key>CFBundlePackageType</key>
    <string>FMWK</string>
</dict>
</plist>


================================================
FILE: LICENSE.txt
================================================
Tencent is pleased to support the open source community by making ncnn available.
Copyright (C) 2017 Tencent.  All rights reserved.
If you have downloaded a copy of the ncnn binary from Tencent, please note that the ncnn binary is licensed under the BSD 3-Clause License.
If you have downloaded a copy of the ncnn source code from Tencent, please note that ncnn source code is licensed under the BSD 3-Clause License, except for the third-party components listed below which are subject to different license terms.  Your integration of ncnn into your own projects may require compliance with the BSD 3-Clause License, as well as the other licenses applicable to the third-party components included within ncnn.
A copy of the BSD 3-Clause License is included in this file.

Other dependencies and licenses:

Open Source Software Licensed Under the zlib License:
The below software in this distribution may have been modified by Tencent (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 Tencent.
----------------------------------------------------------------------------------------
1. neon_mathfun.h
Copyright (C) 2011 Julien Pommier

2. sse_mathfun.h
Copyright (C) 2007 Julien Pommier

3. avx_mathfun.h
Copyright (C) 2012 Giovanni Garberoglio
Interdisciplinary Laboratory for Computational Science (LISC)
Fondazione Bruno Kessler and University of Trento
via Sommarive, 18
I-38123 Trento (Italy)


Terms of the zlib License:
---------------------------------------------------
Copyright (c) <year> <copyright holders>

This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software.

Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.


Open Source Software Licensed Under the BSD 2-Clause License:
The below software in this distribution may have been modified by Tencent (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 Tencent.
----------------------------------------------------------------------------------------
1. squeezenet  1.1
Copyright (c) 2016 Forrest N. Iandola and Matthew W. Moskewicz and Khalid Ashraf and Song Han and William J. Dally and Kurt Keutzer
All rights reserved.

2. caffe.proto  master
All contributions by the University of California:
Copyright (c) 2014-2017 The Regents of the University of California (Regents)
All rights reserved.

All other contributions:
Copyright (c) 2014-2017, the respective contributors
All rights reserved.


Terms of the BSD 2-Clause License:
--------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


Open Source Software Licensed Under the BSD 3-Clause License:
The below software in this distribution may have been modified by Tencent (“Tencent Modifications”). All Tencent Modifications are Copyright (C) 2017 Tencent.
----------------------------------------------------------------------------------------
1. android.toolchain.cmake  master
Copyright (c) 2010-2011, Ethan Rublee
Copyright (c) 2011-2014, Andrey Kamaev
All rights reserved.


Terms of the BSD 3-Clause License:
--------------------------------------------------------------------

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
Neither the name of [copyright holder] nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


================================================
FILE: MANIFEST.in
================================================
recursive-include cmake *

recursive-include glslang *
prune glslang/Test

recursive-include src *

recursive-include python *
prune python/pybind11/tests

include CMakeLists.txt


================================================
FILE: README.md
================================================
![ncnn](https://raw.githubusercontent.com/Tencent/ncnn/master/images/256-ncnn.png)

# ncnn

[![License](https://img.shields.io/badge/license-BSD_3_Clause-blue.svg?style=for-the-badge)](LICENSE.txt)
[![Download Total Count](https://img.shields.io/github/downloads/Tencent/ncnn/total.svg?style=for-the-badge)](https://github.com/Tencent/ncnn/releases)
[![codecov](https://img.shields.io/codecov/c/github/Tencent/ncnn/master?style=for-the-badge)](https://codecov.io/gh/Tencent/ncnn)

ncnn is a high-performance neural network inference computing framework optimized for mobile platforms.
ncnn has been designed from the very beginning with deployment and use on mobile phones in mind.
ncnn does not have third-party dependencies.
It is cross-platform and runs faster than all known open-source frameworks on mobile phone CPUs.
Developers can easily deploy deep learning algorithm models to the mobile platform by using efficient ncnn implementation, creating intelligent APPs, and bringing artificial intelligence to your fingertips.
ncnn is currently being used in many Tencent applications, such as QQ, Qzone, WeChat, Pitu, and so on.

ncnn 是一个为手机端极致优化的高性能神经网络前向计算框架。
ncnn 从设计之初深刻考虑手机端的部署和使用。
无第三方依赖，跨平台，手机端 cpu 的速度快于目前所有已知的开源框架。
基于 ncnn，开发者能够将深度学习算法轻松移植到手机端高效执行，
开发出人工智能 APP，将 AI 带到你的指尖。
ncnn 目前已在腾讯多款应用中使用，如：QQ，Qzone，微信，天天 P 图等。

---

<table>
<tr>
<td>
<b>技术交流 QQ 群</b><br />
637093648 (超多大佬)<br />
答案：卷卷卷卷卷（已满）
</td>
<td rowspan=3>
<b>Telegram Group</b>

<https://t.me/ncnnyes>
</td>
<td rowspan=3>
<b>Discord Channel</b>

<https://discord.gg/YRsxgmF>
</td>
</tr>
<tr>
<td>
<b>Pocky QQ 群（MLIR YES!）</b><br />
677104663 (超多大佬)<br />
答案：multi-level intermediate representation
</td>
</tr>
<tr>
<td>
<b>他们都不知道 pnnx 有多好用群</b><br />
818998520 (新群！)
</td>
</tr>
</table>

---

## Download & Build status

https://github.com/Tencent/ncnn/releases/latest


<table>
<tr>
<td rowspan=2>
  <img src="https://user-images.githubusercontent.com/25181517/192108372-f71d70ac-7ae6-4c0d-8395-51d8870c2ef0.png" width="120" height="auto">
</td>
<td colspan=3>

  **[how to build ncnn library](https://github.com/Tencent/ncnn/wiki/how-to-build) on Linux / Windows / macOS / Raspberry Pi3, Pi4 / POWER / Android / NVIDIA Jetson / iOS / WebAssembly / AllWinner D1 / Loongson 2K1000**

</td>
</tr>
<tr>
<td>Source</td>
<td colspan=2>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-full-source.zip)

</td>
</tr>

<tr>
<td rowspan=3>
  <img src="https://user-images.githubusercontent.com/25181517/117269608-b7dcfb80-ae58-11eb-8e66-6cc8753553f0.png" width="120" height="auto">
</td>
<td colspan=3>

- [Build for Android](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-android)
- [Build for Termux on Android](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-termux-on-android)

</td>
</tr>
<tr>
<td>Android</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-android-vulkan.zip)
  [<img src="https://img.shields.io/badge/+cpuonly-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-android.zip)

</td>
<td rowspan=2>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/android.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Aandroid)

</td>
</tr>
<tr>
<td>Android shared</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-android-vulkan-shared.zip)
  [<img src="https://img.shields.io/badge/+cpuonly-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-android-shared.zip)

</td>
</tr>

<tr>
<td rowspan=3>
  <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/3/37/HMOS_Logo_Icon.svg/240px-HMOS_Logo_Icon.svg.png" width="120" height="auto">
</td>
<td colspan=3>

- [Build for HarmonyOS with cross-compiling](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-harmonyos-with-cross-compiling)

</td>
</tr>
<tr>
<td>HarmonyOS</td>
<td>

</td>
<td rowspan=2>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/harmonyos.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Aharmonyos)

</td>
</tr>
<tr>
<td>HarmonyOS shared</td>
<td>

</td>
</tr>

<tr>
<td rowspan=3>
  <img src="https://user-images.githubusercontent.com/25181517/121406611-a8246b80-c95e-11eb-9b11-b771486377f6.png" width="120" height="auto">
</td>
<td colspan=3>

- [Build for iOS on macOS with xcode](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-ios-on-macos-with-xcode)

</td>
</tr>
<tr>
<td>iOS</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-ios-vulkan.zip)
  [<img src="https://img.shields.io/badge/+cpuonly-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-ios.zip)

</td>
<td rowspan=2>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/ios.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Aios)

</td>
</tr>
<tr>
<td>iOS-Simulator</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-ios-simulator-vulkan.zip)
  [<img src="https://img.shields.io/badge/+cpuonly-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-ios-simulator.zip)

</td>
</tr>

<tr>
<td rowspan=10>
  <img src="https://user-images.githubusercontent.com/25181517/186884152-ae609cca-8cf1-4175-8d60-1ce1fa078ca2.png" width="120" height="auto">
</td>
<td colspan=3>

- [Build for macOS](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-macos)

</td>
</tr>
<tr>
<td>macOS</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-macos-vulkan.zip)
  [<img src="https://img.shields.io/badge/+cpuonly-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-macos.zip)

</td>
<td rowspan=1>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/macos.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Amacos)

</td>
</tr>
<tr>
<td>Mac-Catalyst</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-mac-catalyst-vulkan.zip)
  [<img src="https://img.shields.io/badge/+cpuonly-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-mac-catalyst.zip)

</td>
<td rowspan=1>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/mac-catalyst.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Amac-catalyst)

</td>
</tr>
<tr>
<td>watchOS</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-watchos.zip)

</td>
<td rowspan=2>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/watchos.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Awatchos)

</td>
</tr>
<tr>
<td>watchOS-Simulator</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-watchos-simulator.zip)

</td>
</tr>
<tr>
<td>tvOS</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-tvos-vulkan.zip)
  [<img src="https://img.shields.io/badge/+cpuonly-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-tvos.zip)

</td>
<td rowspan=2>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/tvos.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Atvos)

</td>
</tr>
<tr>
<td>tvOS-Simulator</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-tvos-simulator-vulkan.zip)
  [<img src="https://img.shields.io/badge/+cpuonly-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-tvos-simulator.zip)

</td>
</tr>
<tr>
<td>visionOS</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-visionos-vulkan.zip)
  [<img src="https://img.shields.io/badge/+cpuonly-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-visionos.zip)

</td>
<td rowspan=2>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/visionos.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Avisionos)

</td>
</tr>
<tr>
<td>visionOS-Simulator</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-visionos-simulator-vulkan.zip)
  [<img src="https://img.shields.io/badge/+cpuonly-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-visionos-simulator.zip)

</td>
</tr>
<tr>
<td>Apple xcframework</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-apple-vulkan.zip)
  [<img src="https://img.shields.io/badge/+cpuonly-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-apple.zip)

</td>
<td rowspan=1>

</td>
</tr>

<tr>
<td rowspan=3>
  <img src="https://user-images.githubusercontent.com/25181517/186884153-99edc188-e4aa-4c84-91b0-e2df260ebc33.png" width="120" height="auto">
</td>
<td colspan=3>

- [Build for Linux / NVIDIA Jetson / Raspberry Pi3, Pi4 / POWER](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-linux)

</td>
</tr>
<tr>
<td>Ubuntu 22.04</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-ubuntu-2204.zip)
  [<img src="https://img.shields.io/badge/+shared-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-ubuntu-2204-shared.zip)

</td>
<td rowspan=2>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/linux-x64-gpu-gcc.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-x64-gpu-gcc)

</td>
</tr>
<tr>
<td>Ubuntu 24.04</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-ubuntu-2404.zip)
  [<img src="https://img.shields.io/badge/+shared-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-ubuntu-2404-shared.zip)

</td>
</tr>

<tr>
<td rowspan=5>
  <img alt="windows" src="https://user-images.githubusercontent.com/25181517/186884150-05e9ff6d-340e-4802-9533-2c3f02363ee3.png" width="120" height="auto">
</td>
<td colspan=3>

- [Build for Windows x64 using VS2017](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-windows-x64-using-visual-studio-community-2017)
- [Build for Windows x64 using MinGW-w64](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-windows-x64-using-mingw-w64)

</td>
</tr>
<tr>
<td>VS2015</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-windows-vs2015.zip)
  [<img src="https://img.shields.io/badge/+shared-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-windows-vs2015-shared.zip)

</td>
<td rowspan=4>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/windows.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Awindows)

</td>
</tr>
<tr>
<td>VS2017</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-windows-vs2017.zip)
  [<img src="https://img.shields.io/badge/+shared-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-windows-vs2017-shared.zip)

</td>
</tr>
<tr>
<td>VS2019</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-windows-vs2019.zip)
  [<img src="https://img.shields.io/badge/+shared-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-windows-vs2019-shared.zip)

</td>
</tr>
<tr>
<td>VS2022</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-windows-vs2022.zip)
  [<img src="https://img.shields.io/badge/+shared-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-windows-vs2022-shared.zip)

</td>
</tr>

<tr>
<td rowspan=2>
  <img src="https://user-images.githubusercontent.com/25181517/188324036-d704ac9a-6e61-4722-b978-254b25b61bed.png" width="120" height="auto">
</td>
<td colspan=3>

- [Build for WebAssembly](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-webassembly)

</td>
</tr>
<tr>
<td>WebAssembly</td>
<td>

  [<img src="https://img.shields.io/badge/download-blue?style=for-the-badge">](https://github.com/Tencent/ncnn/releases/latest/download/ncnn-20260113-webassembly.zip)

</td>
<td>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/web-assembly.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Aweb-assembly)

</td>
</tr>

<tr>
<td rowspan=8>
  <img src="https://github.com/marwin1991/profile-technology-icons/assets/76662862/2481dc48-be6b-4ebb-9e8c-3b957efe69fa" width="120" height="auto">
</td>
<td colspan=3>

- [Build for ARM Cortex-A family with cross-compiling](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-arm-cortex-a-family-with-cross-compiling)
- [Build for Hisilicon platform with cross-compiling](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-hisilicon-platform-with-cross-compiling)
- [Build for AllWinner D1](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-allwinner-d1)
- [Build for Loongson 2K1000](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-loongson-2k1000)
- [Build for QNX](https://github.com/Tencent/ncnn/wiki/how-to-build#build-for-qnx)

</td>
</tr>
<tr>
<td>Linux (arm)</td>
<td></td>
<td>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/linux-arm.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-arm)

</td>
</tr>
<tr>
<td>Linux (aarch64)</td>
<td></td>
<td>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/linux-aarch64.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-aarch64)

</td>
</tr>
<tr>
<td>Linux (mips)</td>
<td></td>
<td>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/linux-mips.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-mips)

</td>
</tr>
<tr>
<td>Linux (mips64)</td>
<td></td>
<td>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/linux-mips64.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-mips64)

</td>
</tr>
<tr>
<td>Linux (ppc64)</td>
<td></td>
<td>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/linux-ppc64.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-ppc64)

</td>
</tr>
<tr>
<td>Linux (riscv64)</td>
<td></td>
<td>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/linux-riscv64.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-riscv64)

</td>
</tr>
<tr>
<td>Linux (loongarch64)</td>
<td></td>
<td>

  [<img src="https://img.shields.io/github/actions/workflow/status/Tencent/ncnn/linux-loongarch64.yml?branch=master&style=for-the-badge&label=build">](https://github.com/Tencent/ncnn/actions?query=workflow%3Alinux-loongarch64)

</td>
</tr>

</table>


---

## Support most commonly used CNN network

## 支持大部分常用的 CNN 网络

- Classical CNN:
  [VGG](https://github.com/BVLC/caffe/wiki/Model-Zoo#models-used-by-the-vgg-team-in-ilsvrc-2014)
  [AlexNet](https://github.com/BVLC/caffe/tree/9b891540183ddc834a02b2bd81b31afae71b2153/models/bvlc_alexnet)
  [GoogleNet](https://github.com/BVLC/caffe/tree/9b891540183ddc834a02b2bd81b31afae71b2153/models/bvlc_googlenet)
  Inception
  ...
- Practical CNN:
  [ResNet](https://github.com/tornadomeet/ResNet)
  [DenseNet](https://github.com/liuzhuang13/DenseNet)
  [SENet](https://github.com/hujie-frank/SENet)
  [FPN](https://github.com/unsky/FPN)
  ...
- Light-weight CNN:
  [SqueezeNet](https://github.com/forresti/SqueezeNet)
  [MobileNetV1](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md)
  [MobileNetV2/V3](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/README.md)
  [ShuffleNetV1](https://github.com/farmingyard/ShuffleNet)
  [ShuffleNetV2](https://github.com/opconty/keras-shufflenetV2)
  [MNasNet](https://github.com/tensorflow/models/tree/master/research/slim/nets/nasnet)
  ...
- Face Detection:
  [MTCNN](https://github.com/ipazc/mtcnn)
  [RetinaFace](https://github.com/biubug6/Pytorch_Retinaface)
  [scrfd](https://github.com/nihui/ncnn-android-scrfd)
  ...
- Detection:
  [VGG-SSD](https://github.com/lzx1413/CAFFE_SSD)
  [MobileNet-SSD](https://github.com/chuanqi305/MobileNet-SSD)
  [SqueezeNet-SSD](https://github.com/chuanqi305/SqueezeNet-SSD)
  [MobileNetV2-SSDLite](https://github.com/chuanqi305/MobileNetv2-SSDLite)
  [MobileNetV3-SSDLite](https://github.com/XiaoyuHuang96/MobilenetV3SSDLite-tfkeras)
  ...
- Detection:
  [Faster-RCNN](https://github.com/rbgirshick/py-faster-rcnn)
  [R-FCN](https://github.com/daijifeng001/R-FCN)
  ...
- Detection:
  [YOLOv2](https://github.com/longcw/yolo2-pytorch)
  [YOLOv3](https://github.com/ultralytics/yolov3)
  [MobileNet-YOLOv3](https://github.com/eric612/MobileNet-YOLO)
  [YOLOv4](https://github.com/Tianxiaomo/pytorch-YOLOv4)
  [YOLOv5](https://github.com/ultralytics/yolov5)
  [YOLOv7](https://github.com/WongKinYiu/yolov7)
  [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX)
  [YOLOv8](https://github.com/nihui/ncnn-android-yolov8)
  ...
- Detection:
  [NanoDet](https://github.com/RangiLyu/nanodet)
- Segmentation:
  [FCN](https://github.com/shelhamer/fcn.berkeleyvision.org)
  [PSPNet](https://github.com/hszhao/PSPNet)
  [UNet](https://github.com/zhixuhao/unet)
  [YOLACT](https://github.com/dbolya/yolact)
  ...
- Pose Estimation:
  [SimplePose](https://github.com/dog-qiuqiu/Ultralight-SimplePose)
  ...

---

## HowTo

**[use ncnn with alexnet](https://github.com/Tencent/ncnn/wiki/use-ncnn-with-alexnet) with detailed steps, recommended for beginners :)**

**[ncnn 组件使用指北 alexnet](https://github.com/Tencent/ncnn/wiki/use-ncnn-with-alexnet.zh) 附带详细步骤，新人强烈推荐 :)**

**[use netron for ncnn model visualization](https://netron.app)**

**[use ncnn with pytorch or onnx](https://github.com/Tencent/ncnn/wiki/use-ncnn-with-pytorch-or-onnx)**

[ncnn low-level operation api](https://github.com/Tencent/ncnn/wiki/low-level-operation-api)

[ncnn param and model file spec](https://github.com/Tencent/ncnn/wiki/param-and-model-file-structure)

[ncnn operation param weight table](https://github.com/Tencent/ncnn/wiki/operation-param-weight-table)

[how to implement custom layer step by step](https://github.com/Tencent/ncnn/wiki/how-to-implement-custom-layer-step-by-step)

---

## FAQ

**[ncnn deepwiki](https://deepwiki.com/Tencent/ncnn) LLM-powered question answering ;)**

**[ncnn throw error](https://github.com/Tencent/ncnn/wiki/FAQ-ncnn-throw-error)**

**[ncnn produce wrong result](https://github.com/Tencent/ncnn/wiki/FAQ-ncnn-produce-wrong-result)**

**[ncnn vulkan](https://github.com/Tencent/ncnn/wiki/FAQ-ncnn-vulkan)**

---

## Features

- Supports convolutional neural networks with multiple inputs and multi-branch structures, and can compute only a subset of the branches
- No third-party library dependencies, does not rely on BLAS / NNPACK or any other computing framework
- Pure C++ implementation, cross-platform, supports Android, iOS and so on
- Carefully optimized at the ARM NEON assembly level, so computation is extremely fast
- Sophisticated memory management and data structure design, very low memory footprint
- Supports multi-core parallel computing acceleration, ARM big.LITTLE CPU scheduling optimization
- Supports GPU acceleration via the next-generation low-overhead Vulkan API
- Extensible model design, supports 8bit [quantization](https://github.com/Tencent/ncnn/wiki/quantized-int8-inference) and half-precision floating point storage, can import caffe/pytorch/mxnet/onnx/darknet/keras/tensorflow(mlir) models
- Supports loading network models by direct zero-copy memory reference
- Custom layer implementations can be registered to extend the framework
- Well, it is strong, not afraid of being stuffed with 卷 QvQ

## 功能概述

- 支持卷积神经网络，支持多输入和多分支结构，可计算部分分支
- 无任何第三方库依赖，不依赖 BLAS/NNPACK 等计算框架
- 纯 C++ 实现，跨平台，支持 Android / iOS 等
- ARM Neon 汇编级良心优化，计算速度极快
- 精细的内存管理和数据结构设计，内存占用极低
- 支持多核并行计算加速，ARM big.LITTLE CPU 调度优化
- 支持基于全新低消耗的 Vulkan API GPU 加速
- 可扩展的模型设计，支持 8bit [量化](tools/quantize) 和半精度浮点存储，可导入 caffe/pytorch/mxnet/onnx/darknet/keras/tensorflow(mlir) 模型
- 支持直接内存零拷贝引用加载网络模型
- 可注册自定义层实现并扩展
- 恩，很强就是了，不怕被塞卷 QvQ

---

## Supported platform matrix

- ✅ = known to work and runs fast with good optimization
- ✔️ = known to work, but speed may not be fast enough
- ❔ = should work, not confirmed
- / = not applicable

|            | Windows | Linux | Android | macOS | iOS |
| ---------- | ------- | ----- | ------- | ----- | --- |
| intel-cpu  | ✔️      | ✔️    | ✔️      | ✔️    | /   |
| intel-gpu  | ✔️      | ✔️    | ✔️      | ✔️    | /   |
| amd-cpu    | ✔️      | ✔️    | ✔️      | ✔️    | /   |
| amd-gpu    | ✔️      | ✔️    | ✔️      | ✔️    | /   |
| nvidia-gpu | ✔️      | ✔️    | ✔️      | ✔️    | /   |
| qcom-cpu   | ✅      | ✅    | ✅      | /     | /   |
| qcom-gpu   | ✔️      | ✔️    | ✔️      | /     | /   |
| arm-cpu    | ✅      | ✅    | ✅      | /     | /   |
| arm-gpu    | ❔      | ✔️    | ✔️      | /     | /   |
| apple-cpu  | /       | /     | /       | ✔️    | ✅  |
| apple-gpu  | /       | /     | /       | ✔️    | ✔️  |
| ibm-cpu    | /       | ✔️    | /       | /     | /   |

---

## Project examples

- <https://github.com/nihui/ncnn-android-squeezenet>
- <https://github.com/nihui/ncnn-android-styletransfer>
- <https://github.com/nihui/ncnn-android-mobilenetssd>
- <https://github.com/moli232777144/mtcnn_ncnn>
- <https://github.com/nihui/ncnn-android-yolov5>
- <https://github.com/xiang-wuu/ncnn-android-yolov7>
- <https://github.com/nihui/ncnn-android-scrfd> 🤩
- <https://github.com/shaoshengsong/qt_android_ncnn_lib_encrypt_example>

<img src="https://github.com/nihui/ncnn-assets/raw/master/20181217/ncnn-2.jpg" height ="230"/><img src="https://github.com/nihui/ncnn-assets/raw/master/20181217/4.jpg" height ="230"/><img src="https://github.com/nihui/ncnn-assets/raw/master/20181217/ncnn-33.jpg" height ="230"/><img src="https://github.com/nihui/ncnn-assets/raw/master/20181217/ncnn-m.png" height ="230"/><img src="https://github.com/nihui/ncnn-android-yolov5/raw/master/screenshot.jpg" height ="230"/><img src="https://github.com/nihui/ncnn-android-scrfd/raw/master/screenshot.jpg" height ="230"/><br>

- <https://github.com/magicse/ncnn-colorization-siggraph17><br>
<img src="https://user-images.githubusercontent.com/13585785/189326958-f5a8d6f8-caef-49bf-88da-ae494371195d.jpg" width ="700"/>

- <https://github.com/mizu-bai/ncnn-fortran> Call ncnn from Fortran

- <https://github.com/k2-fsa/sherpa> Use ncnn for real-time speech
  recognition (i.e., speech-to-text); also supports embedded devices and provides
  mobile apps (e.g., an Android app)

---

## License

[BSD 3 Clause](LICENSE.txt)


================================================
FILE: benchmark/CMakeLists.txt
================================================

# Suppress MSVC warning C4996, the CRT deprecation notice of the form
# "'fopen': This function or variable may be unsafe. Consider using fopen_s
# instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS."
if(MSVC)
    add_definitions(/wd4996)
endif()

# Bring in the ncnn_add_param() helper from the top-level cmake/ directory.
include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_add_param.cmake)

# Model .param files handed to ncnn_add_param() below, in exactly this order.
set(benchncnn_PARAMS
    alexnet.param
    blazeface.param
    efficientnet_b0.param
    efficientnetv2_b0.param
    FastestDet.param
    googlenet_int8.param
    googlenet.param
    mnasnet.param
    mobilenet_int8.param
    mobilenet_ssd_int8.param
    mobilenet_ssd.param
    mobilenet_v2.param
    mobilenet_v3.param
    mobilenet_yolo.param
    mobilenet.param
    mobilenetv2_yolov3.param
    nanodet_m.param
    proxylessnasnet.param
    regnety_400m.param
    resnet18_int8.param
    resnet18.param
    resnet50_int8.param
    resnet50.param
    shufflenet_v2.param
    shufflenet.param
    squeezenet_int8.param
    squeezenet_ssd_int8.param
    squeezenet_ssd.param
    squeezenet.param
    vgg16_int8.param
    vgg16.param
    vision_transformer.param
    yolo-fastest-1.1.param
    yolo-fastestv2.param
    yolov4-tiny.param
)

# Pass every listed file, located in this source directory, to ncnn_add_param().
# NOTE(review): ncnn_add_param() presumably populates NCNN_PARAM_HEX_FILES and
# the data substituted into benchncnn_param_data.h.in - confirm in
# cmake/ncnn_add_param.cmake.
foreach(PARAM_FILE IN LISTS benchncnn_PARAMS)
    ncnn_add_param("${CMAKE_CURRENT_SOURCE_DIR}/${PARAM_FILE}")
endforeach()

# Umbrella target depending on everything listed in NCNN_PARAM_HEX_FILES.
add_custom_target(ncnn-generate-param DEPENDS ${NCNN_PARAM_HEX_FILES})

# Instantiate the header template into the build tree.
configure_file(benchncnn_param_data.h.in ${CMAKE_CURRENT_BINARY_DIR}/benchncnn_param_data.h)

# The benchmark executable. It must be built after ncnn-generate-param, and it
# looks for headers in the build tree, where the configured header is written.
add_executable(benchncnn benchncnn.cpp)
add_dependencies(benchncnn ncnn-generate-param)
target_include_directories(benchncnn PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
target_link_libraries(benchncnn PRIVATE ncnn)

# Emscripten builds additionally link nodefs.js.
if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
    target_link_libraries(benchncnn PRIVATE nodefs.js)
endif()

# Show benchncnn under the "benchmark" virtual folder in IDE project views.
set_target_properties(benchncnn PROPERTIES FOLDER "benchmark")


================================================
FILE: benchmark/FastestDet.param
================================================
7767517
127 150
Input                    in0                      0 1 in0
Convolution              convrelu_0               1 1 in0 1 0=24 1=3 11=3 12=1 13=2 14=1 2=1 3=2 4=1 5=1 6=648 9=1
Pooling                  maxpool2d_43             1 1 1 2 0=0 1=3 11=3 12=2 13=1 2=2 3=1 5=1
Split                    splitncnn_0              1 2 2 3 4
ConvolutionDepthWise     convdw_95                1 1 4 5 0=24 1=3 11=3 12=1 13=2 14=1 2=1 3=2 4=1 5=1 6=216 7=24
Convolution              convrelu_1               1 1 3 6 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1
ConvolutionDepthWise     convdw_96                1 1 6 7 0=24 1=3 11=3 12=1 13=2 14=1 2=1 3=2 4=1 5=1 6=216 7=24
Convolution              convrelu_3               1 1 5 8 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1
Convolution              convrelu_2               1 1 7 9 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1
Concat                   cat_0                    2 1 8 9 10 0=0
ShuffleChannel           shufflechannel_0         1 1 10 11 0=2 1=1
Slice                    shufflechannel_0_slice   1 2 11 12 13 -23300=2,-233,-233 1=0
Convolution              convrelu_4               1 1 13 14 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1
ConvolutionDepthWise     convdw_97                1 1 14 15 0=24 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=216 7=24
Convolution              convrelu_5               1 1 15 16 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1
Concat                   cat_1                    2 1 12 16 17 0=0
ShuffleChannel           shufflechannel_1         1 1 17 18 0=2 1=1
Slice                    shufflechannel_1_slice   1 2 18 19 20 -23300=2,-233,-233 1=0
Convolution              convrelu_6               1 1 20 21 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1
ConvolutionDepthWise     convdw_98                1 1 21 22 0=24 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=216 7=24
Convolution              convrelu_7               1 1 22 23 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1
Concat                   cat_2                    2 1 19 23 24 0=0
ShuffleChannel           shufflechannel_2         1 1 24 25 0=2 1=1
Slice                    shufflechannel_2_slice   1 2 25 26 27 -23300=2,-233,-233 1=0
Convolution              convrelu_8               1 1 27 28 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1
ConvolutionDepthWise     convdw_99                1 1 28 29 0=24 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=216 7=24
Convolution              convrelu_9               1 1 29 30 0=24 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=576 9=1
Concat                   cat_3                    2 1 26 30 31 0=0
Split                    splitncnn_1              1 3 31 32 33 34
ConvolutionDepthWise     convdw_100               1 1 34 35 0=48 1=3 11=3 12=1 13=2 14=1 2=1 3=2 4=1 5=1 6=432 7=48
Convolution              convrelu_10              1 1 33 36 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
ConvolutionDepthWise     convdw_101               1 1 36 37 0=48 1=3 11=3 12=1 13=2 14=1 2=1 3=2 4=1 5=1 6=432 7=48
Convolution              convrelu_12              1 1 35 38 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
Convolution              convrelu_11              1 1 37 39 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
Concat                   cat_4                    2 1 38 39 40 0=0
ShuffleChannel           shufflechannel_3         1 1 40 41 0=2 1=1
Slice                    shufflechannel_3_slice   1 2 41 42 43 -23300=2,-233,-233 1=0
Convolution              convrelu_13              1 1 43 44 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
ConvolutionDepthWise     convdw_102               1 1 44 45 0=48 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=432 7=48
Convolution              convrelu_14              1 1 45 46 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
Concat                   cat_5                    2 1 42 46 47 0=0
ShuffleChannel           shufflechannel_4         1 1 47 48 0=2 1=1
Slice                    shufflechannel_4_slice   1 2 48 49 50 -23300=2,-233,-233 1=0
Convolution              convrelu_15              1 1 50 51 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
ConvolutionDepthWise     convdw_103               1 1 51 52 0=48 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=432 7=48
Convolution              convrelu_16              1 1 52 53 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
Concat                   cat_6                    2 1 49 53 54 0=0
ShuffleChannel           shufflechannel_5         1 1 54 55 0=2 1=1
Slice                    shufflechannel_5_slice   1 2 55 56 57 -23300=2,-233,-233 1=0
Convolution              convrelu_17              1 1 57 58 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
ConvolutionDepthWise     convdw_104               1 1 58 59 0=48 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=432 7=48
Convolution              convrelu_18              1 1 59 60 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
Concat                   cat_7                    2 1 56 60 61 0=0
ShuffleChannel           shufflechannel_6         1 1 61 62 0=2 1=1
Slice                    shufflechannel_6_slice   1 2 62 63 64 -23300=2,-233,-233 1=0
Convolution              convrelu_19              1 1 64 65 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
ConvolutionDepthWise     convdw_105               1 1 65 66 0=48 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=432 7=48
Convolution              convrelu_20              1 1 66 67 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
Concat                   cat_8                    2 1 63 67 68 0=0
ShuffleChannel           shufflechannel_7         1 1 68 69 0=2 1=1
Slice                    shufflechannel_7_slice   1 2 69 70 71 -23300=2,-233,-233 1=0
Convolution              convrelu_21              1 1 71 72 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
ConvolutionDepthWise     convdw_106               1 1 72 73 0=48 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=432 7=48
Convolution              convrelu_22              1 1 73 74 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
Concat                   cat_9                    2 1 70 74 75 0=0
ShuffleChannel           shufflechannel_8         1 1 75 76 0=2 1=1
Slice                    shufflechannel_8_slice   1 2 76 77 78 -23300=2,-233,-233 1=0
Convolution              convrelu_23              1 1 78 79 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
ConvolutionDepthWise     convdw_107               1 1 79 80 0=48 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=432 7=48
Convolution              convrelu_24              1 1 80 81 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
Concat                   cat_10                   2 1 77 81 82 0=0
ShuffleChannel           shufflechannel_9         1 1 82 83 0=2 1=1
Slice                    shufflechannel_9_slice   1 2 83 84 85 -23300=2,-233,-233 1=0
Convolution              convrelu_25              1 1 85 86 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
ConvolutionDepthWise     convdw_108               1 1 86 87 0=48 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=432 7=48
Convolution              convrelu_26              1 1 87 88 0=48 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=2304 9=1
Concat                   cat_11                   2 1 84 88 89 0=0
Split                    splitncnn_2              1 3 89 90 91 92
ConvolutionDepthWise     convdw_109               1 1 92 93 0=96 1=3 11=3 12=1 13=2 14=1 2=1 3=2 4=1 5=1 6=864 7=96
Convolution              convrelu_27              1 1 91 94 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1
ConvolutionDepthWise     convdw_110               1 1 94 95 0=96 1=3 11=3 12=1 13=2 14=1 2=1 3=2 4=1 5=1 6=864 7=96
Convolution              convrelu_29              1 1 93 96 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1
Convolution              convrelu_28              1 1 95 97 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1
Concat                   cat_12                   2 1 96 97 98 0=0
ShuffleChannel           shufflechannel_10        1 1 98 99 0=2 1=1
Slice                    shufflechannel_10_slice  1 2 99 100 101 -23300=2,-233,-233 1=0
Convolution              convrelu_30              1 1 101 102 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1
ConvolutionDepthWise     convdw_111               1 1 102 103 0=96 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=864 7=96
Convolution              convrelu_31              1 1 103 104 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1
Concat                   cat_13                   2 1 100 104 105 0=0
ShuffleChannel           shufflechannel_11        1 1 105 106 0=2 1=1
Slice                    shufflechannel_11_slice  1 2 106 107 108 -23300=2,-233,-233 1=0
Convolution              convrelu_32              1 1 108 109 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1
ConvolutionDepthWise     convdw_112               1 1 109 110 0=96 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=864 7=96
Convolution              convrelu_33              1 1 110 111 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1
Concat                   cat_14                   2 1 107 111 112 0=0
ShuffleChannel           shufflechannel_12        1 1 112 113 0=2 1=1
Slice                    shufflechannel_12_slice  1 2 113 114 115 -23300=2,-233,-233 1=0
Convolution              convrelu_34              1 1 115 116 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1
ConvolutionDepthWise     convdw_113               1 1 116 117 0=96 1=3 11=3 12=1 13=1 14=1 2=1 3=1 4=1 5=1 6=864 7=96
Convolution              convrelu_35              1 1 117 118 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1
Concat                   cat_15                   2 1 114 118 119 0=0
Pooling                  avgpool2d_0              1 1 32 120 0=1 1=3 11=3 12=2 13=1 2=2 3=1 5=1 6=1
Interp                   upsample_94              1 1 119 121 0=1 1=2.000000e+00 2=2.000000e+00 6=0
Concat                   cat_16                   3 1 120 90 121 122 0=0
Convolution              convrelu_36              1 1 122 123 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=32256 9=1
Split                    splitncnn_3              1 4 123 124 125 126 127
ConvolutionDepthWise     convdwrelu_5             1 1 127 128 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1
ConvolutionDepthWise     convdwrelu_0             1 1 126 129 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1
ConvolutionDepthWise     convdwrelu_4             1 1 129 130 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1
ConvolutionDepthWise     convdwrelu_1             1 1 125 131 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1
ConvolutionDepthWise     convdwrelu_2             1 1 131 132 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1
ConvolutionDepthWise     convdwrelu_3             1 1 132 133 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1
Concat                   cat_17                   3 1 128 130 133 134 0=0
Convolution              conv_38                  1 1 134 135 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=27648
BinaryOp                 add_0                    2 1 124 135 136 0=0
ReLU                     relu_87                  1 1 136 137
Convolution              convrelu_37              1 1 137 138 0=96 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=9216 9=1
Split                    splitncnn_4              1 3 138 139 140 141
ConvolutionDepthWise     convdwrelu_7             1 1 139 142 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1
Convolution              conv_41                  1 1 142 143 0=80 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=7680
ConvolutionDepthWise     convdwrelu_8             1 1 140 144 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1
Convolution              conv_42                  1 1 144 145 0=4 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=384
Softmax                  softmax_93               1 1 143 146 0=0 1=1
ConvolutionDepthWise     convdwrelu_6             1 1 141 147 0=96 1=5 11=5 12=1 13=1 14=2 2=1 3=1 4=2 5=1 6=2400 7=96 9=1
Convolution              convsigmoid_38           1 1 147 148 0=1 1=1 11=1 12=1 13=1 14=0 2=1 3=1 4=0 5=1 6=96 9=4
Concat                   cat_18                   3 1 148 145 146 out0 0=0


================================================
FILE: benchmark/README.md
================================================
benchncnn can be used to test neural network inference performance

Only the network definition files (ncnn param) are required.

The large model binary files (ncnn bin) are not loaded; their contents are generated randomly for the speed test.

If no model is specified, the default built-in models are benchmarked. More model networks may be added later.

---
Build
```shell
# assume you have already built the ncnn library successfully
# uncomment the following line in <ncnn-root-dir>/CMakeLists.txt with your favorite editor

# add_subdirectory(benchmark)

cd <ncnn-root-dir>/<your-build-dir>
make -j4

# you can find benchncnn binary in <ncnn-root-dir>/<your-build-dir>/benchmark
```

Usage
```shell
# copy all param files to the current directory
./benchncnn [loop count] [num threads] [powersave] [gpu device] [cooling down] [(key=value)...]
  param=model.param
  shape=[227,227,3],..
```
run benchncnn on android device
```shell
# for running on android device, upload to /data/local/tmp/ folder
adb push benchncnn /data/local/tmp/

# (optional) upload your ncnn model param to /data/local/tmp/ folder
adb push model.param /data/local/tmp/

# executed in android adb shell
adb shell
cd /data/local/tmp/

# sample: benchmark built-in models on cpu, with 4 threads on big core, 4 loops and cooling_down
./benchncnn 4 4 2 -1 1

# sample: benchmark built-in models on gpu id 0, with 1 thread on big core, 8 loops, without cooling_down
./benchncnn 8 1 2 0 0

./benchncnn [loop count] [num threads] [powersave] [gpu device] [cooling down] [(key=value)...]
  param=model.param
  shape=[227,227,3],..
```

Parameter

|param|options|default|
|---|---|---|
|loop count|1~N|4|
|num threads|1~N|max_cpu_count|
|powersave|0=all cores, 1=little cores only, 2=big cores only|0|
|gpu device|-1=cpu-only, 0=gpu0, 1=gpu1 ...|-1|
|cooling down|0=disable, 1=enable|1|
|param|ncnn model.param filepath|-|
|shape|model input shapes, in whc format|-|

Tips: Disable android UI server and set CPU and GPU to max frequency
```shell
# stop the android ui server; it can be restarted later via adb shell start
adb root
adb shell stop

# executed in android adb shell
# set cpu performance mode
echo "performance" > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
echo "performance" > /sys/devices/system/cpu/cpu1/cpufreq/scaling_governor
echo "performance" > /sys/devices/system/cpu/cpu2/cpufreq/scaling_governor
echo "performance" > /sys/devices/system/cpu/cpu3/cpufreq/scaling_governor
echo "performance" > /sys/devices/system/cpu/cpu4/cpufreq/scaling_governor
echo "performance" > /sys/devices/system/cpu/cpu5/cpufreq/scaling_governor

# set gpu performance mode (eg. RK3399)
echo "performance" > /sys/class/misc/mali0/device/devfreq/ff9a0000.gpu/governor

# set gpu performance mode (eg. Android Adreno)
echo 1 > /sys/class/kgsl/kgsl-3d0/force_clk_on
echo 10000000 > /sys/class/kgsl/kgsl-3d0/idle_timer
echo "performance" > /sys/class/kgsl/kgsl-3d0/devfreq/governor
echo <max freq> > /sys/class/kgsl/kgsl-3d0/gpuclk
```

---

Typical output (executed in android adb shell)

### NVIDIA Jetson AGX Orin (Cortex-A78AE 2.2 GHz x 12 + Ampere@1.3 GHz Tensor Cores 64)
```
i@orin:~/projects/ncnn/benchmark$ ./benchncnn 64 1 0 -1 0
loop_count = 64
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   11.66  max =   11.80  avg =   11.74
     squeezenet_int8  min =   12.24  max =   12.39  avg =   12.31
           mobilenet  min =   19.56  max =   19.73  avg =   19.65
      mobilenet_int8  min =   16.06  max =   16.25  avg =   16.14
        mobilenet_v2  min =   13.20  max =   13.41  avg =   13.29
        mobilenet_v3  min =   11.39  max =   11.57  avg =   11.48
          shufflenet  min =    8.07  max =    8.18  avg =    8.11
       shufflenet_v2  min =    8.41  max =    8.51  avg =    8.45
             mnasnet  min =   12.74  max =   12.91  avg =   12.79
     proxylessnasnet  min =   15.18  max =   15.32  avg =   15.25
     efficientnet_b0  min =   26.86  max =   26.96  avg =   26.90
   efficientnetv2_b0  min =   35.99  max =   36.15  avg =   36.07
        regnety_400m  min =   16.81  max =   16.98  avg =   16.87
           blazeface  min =    4.25  max =    4.37  avg =    4.29
           googlenet  min =   48.73  max =   48.98  avg =   48.87
      googlenet_int8  min =   47.39  max =   47.60  avg =   47.49
            resnet18  min =   30.93  max =   31.24  avg =   31.08
       resnet18_int8  min =   55.44  max =   55.70  avg =   55.56
             alexnet  min =   44.19  max =   44.43  avg =   44.33
               vgg16  min =  173.94  max =  174.97  avg =  174.46
          vgg16_int8  min =  475.10  max =  479.37  avg =  477.33
            resnet50  min =   89.50  max =   90.11  avg =   89.80
       resnet50_int8  min =  106.77  max =  107.14  avg =  106.96
      squeezenet_ssd  min =   37.78  max =   38.35  avg =   37.93
 squeezenet_ssd_int8  min =   50.48  max =   50.88  avg =   50.74
       mobilenet_ssd  min =   45.62  max =   46.12  avg =   45.74
  mobilenet_ssd_int8  min =   37.77  max =   38.00  avg =   37.88
      mobilenet_yolo  min =   90.23  max =   90.49  avg =   90.35
  mobilenetv2_yolov3  min =   47.27  max =   47.48  avg =   47.33
         yolov4-tiny  min =   60.41  max =   60.75  avg =   60.57
           nanodet_m  min =   19.26  max =   19.43  avg =   19.35
    yolo-fastest-1.1  min =    8.16  max =    8.31  avg =    8.20
      yolo-fastestv2  min =    8.26  max =    8.39  avg =    8.32
i@orin:~/projects/ncnn/benchmark$ ./benchncnn 64 2 0 -1 0
loop_count = 64
num_threads = 2
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    6.83  max =    6.98  avg =    6.90
     squeezenet_int8  min =    7.39  max =    7.50  avg =    7.45
           mobilenet  min =   10.40  max =   10.50  avg =   10.45
      mobilenet_int8  min =    8.92  max =    9.09  avg =    8.99
        mobilenet_v2  min =    7.67  max =    7.80  avg =    7.74
        mobilenet_v3  min =    6.86  max =    7.01  avg =    6.93
          shufflenet  min =    6.34  max =    6.44  avg =    6.39
       shufflenet_v2  min =    5.71  max =    5.83  avg =    5.76
             mnasnet  min =    7.47  max =    7.58  avg =    7.53
     proxylessnasnet  min =    8.73  max =    8.83  avg =    8.78
     efficientnet_b0  min =   14.93  max =   15.13  avg =   15.03
   efficientnetv2_b0  min =   20.17  max =   20.70  avg =   20.29
        regnety_400m  min =   12.50  max =   12.62  avg =   12.57
           blazeface  min =    2.95  max =    3.06  avg =    3.00
           googlenet  min =   26.25  max =   26.53  avg =   26.37
      googlenet_int8  min =   26.54  max =   26.79  avg =   26.66
            resnet18  min =   16.69  max =   16.90  avg =   16.80
       resnet18_int8  min =   29.70  max =   29.93  avg =   29.81
             alexnet  min =   22.96  max =   23.12  avg =   23.03
               vgg16  min =   88.39  max =   89.16  avg =   88.79
          vgg16_int8  min =  245.86  max =  247.55  avg =  246.62
            resnet50  min =   46.55  max =   46.86  avg =   46.70
       resnet50_int8  min =   56.28  max =   56.63  avg =   56.43
      squeezenet_ssd  min =   23.65  max =   24.29  avg =   23.81
 squeezenet_ssd_int8  min =   30.86  max =   31.27  avg =   30.99
       mobilenet_ssd  min =   25.17  max =   25.31  avg =   25.24
  mobilenet_ssd_int8  min =   21.77  max =   21.97  avg =   21.84
      mobilenet_yolo  min =   48.03  max =   48.33  avg =   48.14
  mobilenetv2_yolov3  min =   26.58  max =   26.81  avg =   26.66
         yolov4-tiny  min =   35.31  max =   35.53  avg =   35.41
           nanodet_m  min =   12.93  max =   13.08  avg =   13.01
    yolo-fastest-1.1  min =    6.00  max =    6.10  avg =    6.04
      yolo-fastestv2  min =    6.46  max =    6.61  avg =    6.52
i@orin:~/projects/ncnn/benchmark$ ./benchncnn 64 4 0 -1 0
loop_count = 64
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    4.54  max =    4.84  avg =    4.61
     squeezenet_int8  min =    4.96  max =    5.41  avg =    5.05
           mobilenet  min =    5.96  max =    6.23  avg =    6.04
      mobilenet_int8  min =    5.21  max =    5.50  avg =    5.30
        mobilenet_v2  min =    5.05  max =    5.26  avg =    5.15
        mobilenet_v3  min =    4.83  max =    5.14  avg =    4.90
          shufflenet  min =    5.11  max =    5.34  avg =    5.18
       shufflenet_v2  min =    4.13  max =    4.44  avg =    4.18
             mnasnet  min =    4.93  max =    5.27  avg =    5.01
     proxylessnasnet  min =    5.64  max =    5.89  avg =    5.72
     efficientnet_b0  min =    9.47  max =   10.60  avg =    9.60
   efficientnetv2_b0  min =   12.67  max =   13.06  avg =   12.82
        regnety_400m  min =   10.27  max =   10.58  avg =   10.38
           blazeface  min =    2.05  max =    2.27  avg =    2.10
           googlenet  min =   15.57  max =   15.96  avg =   15.68
      googlenet_int8  min =   16.19  max =   16.65  avg =   16.32
            resnet18  min =   10.20  max =   11.76  avg =   10.35
       resnet18_int8  min =   16.89  max =   17.31  avg =   17.03
             alexnet  min =   13.13  max =   13.70  avg =   13.32
               vgg16  min =   51.03  max =   52.46  avg =   51.35
          vgg16_int8  min =  131.08  max =  139.44  avg =  133.78
            resnet50  min =   26.74  max =   28.32  avg =   26.91
       resnet50_int8  min =   32.15  max =   32.74  avg =   32.38
      squeezenet_ssd  min =   16.58  max =   16.99  avg =   16.70
 squeezenet_ssd_int8  min =   20.22  max =   21.67  avg =   20.51
       mobilenet_ssd  min =   14.68  max =   16.07  avg =   14.83
  mobilenet_ssd_int8  min =   12.89  max =   13.27  avg =   13.01
      mobilenet_yolo  min =   28.44  max =   28.85  avg =   28.58
  mobilenetv2_yolov3  min =   17.21  max =   21.31  avg =   17.44
         yolov4-tiny  min =   23.68  max =   24.38  avg =   23.88
           nanodet_m  min =    8.76  max =    9.17  avg =    8.86
    yolo-fastest-1.1  min =    4.83  max =    5.04  avg =    4.88
      yolo-fastestv2  min =    4.93  max =    5.17  avg =    5.00
i@orin:~/projects/ncnn/benchmark$ ./benchncnn 64 8 0 -1 0
loop_count = 64
num_threads = 8
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    3.52  max =    4.28  avg =    3.65
     squeezenet_int8  min =    3.85  max =    4.11  avg =    3.93
           mobilenet  min =    3.78  max =    4.12  avg =    3.85
      mobilenet_int8  min =    3.57  max =    3.85  avg =    3.63
        mobilenet_v2  min =    4.14  max =    4.44  avg =    4.22
        mobilenet_v3  min =    3.89  max =    4.26  avg =    3.97
          shufflenet  min =    4.78  max =    4.95  avg =    4.84
       shufflenet_v2  min =    3.49  max =    3.84  avg =    3.54
             mnasnet  min =    3.94  max =    4.09  avg =    3.99
     proxylessnasnet  min =    4.41  max =    4.68  avg =    4.47
     efficientnet_b0  min =    7.01  max =    7.85  avg =    7.13
   efficientnetv2_b0  min =    9.22  max =    9.46  avg =    9.32
        regnety_400m  min =    9.34  max =    9.66  avg =    9.44
           blazeface  min =    1.86  max =    1.98  avg =    1.89
           googlenet  min =   10.37  max =   10.76  avg =   10.48
      googlenet_int8  min =   11.03  max =   11.34  avg =   11.16
            resnet18  min =    6.83  max =    7.12  avg =    6.93
       resnet18_int8  min =   10.25  max =   11.50  avg =   10.42
             alexnet  min =    8.88  max =    9.71  avg =    9.01
               vgg16  min =   31.26  max =   31.97  avg =   31.44
          vgg16_int8  min =   71.31  max =   74.53  avg =   72.18
            resnet50  min =   16.43  max =   16.84  avg =   16.52
       resnet50_int8  min =   19.07  max =   20.28  avg =   19.42
      squeezenet_ssd  min =   13.50  max =   13.69  avg =   13.56
 squeezenet_ssd_int8  min =   15.16  max =   16.06  avg =   15.30
       mobilenet_ssd  min =    9.73  max =   10.85  avg =    9.90
  mobilenet_ssd_int8  min =    9.27  max =    9.46  avg =    9.36
      mobilenet_yolo  min =   17.58  max =   17.79  avg =   17.67
  mobilenetv2_yolov3  min =   12.80  max =   13.50  avg =   12.90
         yolov4-tiny  min =   17.98  max =   21.31  avg =   18.24
           nanodet_m  min =    7.01  max =    7.18  avg =    7.09
    yolo-fastest-1.1  min =    4.76  max =    4.86  avg =    4.80
      yolo-fastestv2  min =    4.76  max =    4.88  avg =    4.82
i@orin:~/projects/ncnn/benchmark$ ./benchncnn 64 12 0 -1 0
loop_count = 64
num_threads = 12
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    3.50  max =    5.21  avg =    3.65
     squeezenet_int8  min =    3.97  max =    4.44  avg =    4.12
           mobilenet  min =    3.49  max =    7.73  avg =    3.78
      mobilenet_int8  min =    3.40  max =    3.86  avg =    3.49
        mobilenet_v2  min =    4.07  max =    4.39  avg =    4.17
        mobilenet_v3  min =    3.92  max =    4.17  avg =    4.03
          shufflenet  min =    5.08  max =    6.63  avg =    5.18
       shufflenet_v2  min =    3.64  max =    5.11  avg =    3.75
             mnasnet  min =    3.86  max =    4.16  avg =    3.95
     proxylessnasnet  min =    4.30  max =    5.39  avg =    4.38
     efficientnet_b0  min =    6.42  max =    9.19  avg =    6.61
   efficientnetv2_b0  min =    8.96  max =    9.43  avg =    9.12
        regnety_400m  min =   10.11  max =   10.89  avg =   10.27
           blazeface  min =    1.93  max =    2.16  avg =    1.99
           googlenet  min =    9.72  max =   10.84  avg =   10.01
      googlenet_int8  min =   10.91  max =   13.03  avg =   11.17
            resnet18  min =    6.70  max =    7.27  avg =    6.92
       resnet18_int8  min =    9.62  max =   12.93  avg =   10.14
             alexnet  min =    7.21  max =    7.47  avg =    7.32
               vgg16  min =   29.61  max =   63.73  avg =   30.86
          vgg16_int8  min =   64.91  max =   75.06  avg =   68.72
            resnet50  min =   15.35  max =   16.28  avg =   15.73
       resnet50_int8  min =   17.47  max =   18.98  avg =   18.09
      squeezenet_ssd  min =   13.40  max =   28.74  avg =   14.07
 squeezenet_ssd_int8  min =   15.35  max =   16.77  avg =   15.67
       mobilenet_ssd  min =    9.51  max =   11.49  avg =    9.88
  mobilenet_ssd_int8  min =    9.43  max =   10.08  avg =    9.58
      mobilenet_yolo  min =   16.88  max =   17.45  avg =   17.09
  mobilenetv2_yolov3  min =   11.91  max =   31.90  avg =   12.50
         yolov4-tiny  min =   17.85  max =   18.87  avg =   18.36
           nanodet_m  min =    6.88  max =    7.64  avg =    7.06
    yolo-fastest-1.1  min =    5.02  max =    5.53  avg =    5.12
      yolo-fastestv2  min =    4.95  max =    5.60  avg =    5.05
i@orin:~/projects/ncnn/benchmark$ ./benchncnn 128 1 0 0 0
[0 NVIDIA Tegra Orin (nvgpu)]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[0 NVIDIA Tegra Orin (nvgpu)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 NVIDIA Tegra Orin (nvgpu)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 NVIDIA Tegra Orin (nvgpu)]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 128
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =    2.13  max =    3.37  avg =    2.31
     squeezenet_int8  min =   12.31  max =   12.51  avg =   12.42
           mobilenet  min =    2.03  max =    2.73  avg =    2.23
      mobilenet_int8  min =   16.86  max =   17.91  avg =   16.99
        mobilenet_v2  min =    2.59  max =    3.59  avg =    2.91
        mobilenet_v3  min =    3.22  max =    4.23  avg =    3.71
          shufflenet  min =    2.57  max =    3.27  avg =    2.80
       shufflenet_v2  min =    3.20  max =    4.03  avg =    3.47
             mnasnet  min =    2.45  max =    3.06  avg =    2.69
     proxylessnasnet  min =    2.50  max =    3.14  avg =    2.72
     efficientnet_b0  min =    4.23  max =    8.73  avg =    4.85
   efficientnetv2_b0  min =    8.15  max =    8.60  avg =    8.41
        regnety_400m  min =    3.25  max =    4.17  avg =    3.54
           blazeface  min =    1.29  max =    1.48  avg =    1.33
           googlenet  min =    4.95  max =   12.34  avg =    6.36
      googlenet_int8  min =   47.49  max =   47.78  avg =   47.61
            resnet18  min =    3.18  max =    9.49  avg =    4.04
       resnet18_int8  min =   55.57  max =   55.88  avg =   55.73
             alexnet  min =    3.22  max =   14.56  avg =    4.25
               vgg16  min =    6.82  max =   14.75  avg =    8.18
          vgg16_int8  min =  473.55  max =  479.07  avg =  476.22
            resnet50  min =    4.75  max =   15.06  avg =    6.08
       resnet50_int8  min =  106.99  max =  107.48  avg =  107.22
      squeezenet_ssd  min =    6.87  max =    9.12  avg =    7.76
 squeezenet_ssd_int8  min =   50.87  max =   51.17  avg =   51.01
       mobilenet_ssd  min =    4.44  max =    6.22  avg =    5.23
  mobilenet_ssd_int8  min =   37.80  max =   38.03  avg =   37.92
      mobilenet_yolo  min =    5.41  max =    7.36  avg =    6.29
  mobilenetv2_yolov3  min =    7.20  max =    9.96  avg =    7.30
         yolov4-tiny  min =   16.48  max =   28.81  avg =   18.40
           nanodet_m  min =    5.75  max =    8.54  avg =    6.85
    yolo-fastest-1.1  min =    4.03  max =    4.75  avg =    4.35
      yolo-fastestv2  min =    4.27  max =    5.23  avg =    4.71
```

### AMD Ryzen Threadripper 3970X (Zen2 3.7 GHz ~ 4.5 GHz x 32)
```
i@s:~/qtang/ncnn/benchmark$ ../build-vulkan/benchmark/benchncnn 10 1 0 -1 0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   11.73  max =   11.88  avg =   11.78
           mobilenet  min =   21.63  max =   21.73  avg =   21.68
        mobilenet_v2  min =   14.70  max =   14.95  avg =   14.82
        mobilenet_v3  min =   12.12  max =   12.17  avg =   12.15
          shufflenet  min =   14.08  max =   14.16  avg =   14.12
       shufflenet_v2  min =   25.99  max =   26.13  avg =   26.06
             mnasnet  min =   14.12  max =   14.17  avg =   14.14
     proxylessnasnet  min =   16.51  max =   16.71  avg =   16.61
     efficientnet_b0  min =   22.88  max =   22.97  avg =   22.93
        regnety_400m  min =   18.50  max =   18.61  avg =   18.56
           blazeface  min =    6.18  max =    6.27  avg =    6.21
           googlenet  min =   58.42  max =   58.60  avg =   58.49
            resnet18  min =   61.13  max =   61.84  avg =   61.40
             alexnet  min =   50.82  max =   50.98  avg =   50.92
               vgg16  min =  217.19  max =  218.40  avg =  217.87
            resnet50  min =  126.84  max =  137.46  avg =  128.21
      squeezenet_ssd  min =  114.24  max =  114.57  avg =  114.47
       mobilenet_ssd  min =   51.60  max =   51.89  avg =   51.77
      mobilenet_yolo  min =  125.09  max =  126.33  avg =  125.83
  mobilenetv2_yolov3  min =   57.51  max =   57.79  avg =   57.65
         yolov4-tiny  min =   85.65  max =   85.97  avg =   85.79
```

### NVIDIA Quadro RTX 8000 (TU102 SM x 72 + Tensor Core x 576)
```
i@s:~/qtang/ncnn/benchmark$ ../build-vulkan/benchmark/benchncnn 256 1 0 1 0
[0 Quadro RTX 8000]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[0 Quadro RTX 8000]  bugsbn1=0  bugcopc=0  bugihfa=0
[0 Quadro RTX 8000]  fp16p=1  fp16s=1  fp16a=1  int8s=1  int8a=1
[0 Quadro RTX 8000]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
[1 Quadro RTX 8000]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[1 Quadro RTX 8000]  bugsbn1=0  bugcopc=0  bugihfa=0
[1 Quadro RTX 8000]  fp16p=1  fp16s=1  fp16a=1  int8s=1  int8a=1
[1 Quadro RTX 8000]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 256
num_threads = 1
powersave = 0
gpu_device = 1
cooling_down = 0
          squeezenet  min =    0.84  max =    1.39  avg =    0.93
           mobilenet  min =    0.90  max =    2.30  avg =    0.91
        mobilenet_v2  min =    1.35  max =    9.59  avg =    1.46
        mobilenet_v3  min =    1.60  max =   77.94  avg =    2.12
          shufflenet  min =    0.86  max =    2.27  avg =    0.88
       shufflenet_v2  min =    1.25  max =    1.47  avg =    1.27
             mnasnet  min =    1.42  max =   20.77  avg =    1.72
     proxylessnasnet  min =    1.48  max =    1.67  avg =    1.49
     efficientnet_b0  min =    2.56  max =   12.86  avg =    2.77
        regnety_400m  min =    1.84  max =   14.98  avg =    2.42
           blazeface  min =    0.64  max =    0.90  avg =    0.65
           googlenet  min =    2.94  max =   76.82  avg =    3.45
            resnet18  min =    1.27  max =   10.56  avg =    1.56
             alexnet  min =    1.53  max =   71.76  avg =    1.96
               vgg16  min =    4.90  max =   78.12  avg =    5.80
            resnet50  min =    3.00  max =   12.51  avg =    3.07
      squeezenet_ssd  min =    5.60  max =   97.09  avg =    6.50
       mobilenet_ssd  min =    2.40  max =   93.64  avg =    3.30
      mobilenet_yolo  min =    2.96  max =   19.15  avg =    3.25
  mobilenetv2_yolov3  min =    4.52  max =   66.96  avg =    5.32
         yolov4-tiny  min =    9.32  max =   72.92  avg =   14.01

```

### NVIDIA RTX3090 (GA102 SM x 82 + Tensor Core 328)
```
(base) i@t:~/wls/ncnn/benchmark$ ../build/benchmark/benchncnn 32 1 0 0 0
[0 GeForce RTX 3090]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[0 GeForce RTX 3090]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 GeForce RTX 3090]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 GeForce RTX 3090]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
[1 GeForce RTX 3090]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[1 GeForce RTX 3090]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[1 GeForce RTX 3090]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[1 GeForce RTX 3090]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 32
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =    1.76  max =    2.74  avg =    1.80
     squeezenet_int8  min =   47.10  max =   47.75  avg =   47.21
           mobilenet  min =    4.77  max =    5.79  avg =    5.20
      mobilenet_int8  min =   64.19  max =   67.05  avg =   64.39
        mobilenet_v2  min =    2.44  max =   20.89  avg =    6.98
        mobilenet_v3  min =    2.75  max =    2.87  avg =    2.77
          shufflenet  min =    2.20  max =    2.62  avg =    2.46
       shufflenet_v2  min =    5.10  max =    7.43  avg =    5.75
             mnasnet  min =    3.47  max =    3.50  avg =    3.48
     proxylessnasnet  min =    2.59  max =    9.08  avg =    7.28
     efficientnet_b0  min =    3.87  max =    4.65  avg =    3.91
   efficientnetv2_b0  min =   29.48  max =   41.90  avg =   30.14
        regnety_400m  min =    2.89  max =    2.99  avg =    2.91
           blazeface  min =    1.55  max =    2.14  avg =    1.60
           googlenet  min =    4.33  max =   17.89  avg =    6.05
      googlenet_int8  min =  174.46  max =  178.19  avg =  174.74
            resnet18  min =    2.14  max =   11.04  avg =    5.33
       resnet18_int8  min =  193.37  max =  193.83  avg =  193.55
             alexnet  min =    2.37  max =   15.99  avg =    4.50
               vgg16  min =    4.55  max =   16.65  avg =    5.22
          vgg16_int8  min = 1538.76  max = 1544.81  avg = 1540.79
            resnet50  min =    4.13  max =   25.86  avg =    5.80
       resnet50_int8  min =  400.89  max =  401.72  avg =  401.29
      squeezenet_ssd  min =    6.95  max =    7.81  avg =    7.07
 squeezenet_ssd_int8  min =  158.51  max =  159.04  avg =  158.68
       mobilenet_ssd  min =    4.36  max =   18.98  avg =    9.40
  mobilenet_ssd_int8  min =  130.74  max =  130.92  avg =  130.83
      mobilenet_yolo  min =    3.96  max =   11.94  avg =    6.48
  mobilenetv2_yolov3  min =    6.07  max =    6.21  avg =    6.13
         yolov4-tiny  min =   13.01  max =   26.78  avg =   14.87

root@3090:~/Desktop/ncnn-20221128/build/benchmark$ ./benchncnn 100 10 2 0 0
[0 NVIDIA GeForce RTX 3090]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[0 NVIDIA GeForce RTX 3090]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 NVIDIA GeForce RTX 3090]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 NVIDIA GeForce RTX 3090]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 100
num_threads = 10
powersave = 2
gpu_device = 0
cooling_down = 0
          squeezenet  min =    0.64  max =    0.66  avg =    0.65
     squeezenet_int8  min =    4.30  max =    4.93  avg =    4.45
           mobilenet  min =    0.60  max =    1.85  avg =    1.32
      mobilenet_int8  min =    3.08  max =    3.17  avg =    3.12
        mobilenet_v2  min =    1.40  max =    1.46  avg =    1.42
        mobilenet_v3  min =    1.22  max =    6.10  avg =    3.02
          shufflenet  min =    0.90  max =    0.97  avg =    0.92
       shufflenet_v2  min =    1.06  max =    1.13  avg =    1.09
             mnasnet  min =    0.84  max =    0.98  avg =    0.91
     proxylessnasnet  min =    0.99  max =    3.01  avg =    2.45
     efficientnet_b0  min =    2.11  max =    2.85  avg =    2.16
   efficientnetv2_b0  min =    7.46  max =   28.58  avg =    8.55
        regnety_400m  min =    1.53  max =    1.75  avg =    1.59
           blazeface  min =    0.59  max =    0.94  avg =    0.63
           googlenet  min =    1.90  max =   12.22  avg =    2.63
      googlenet_int8  min =   17.45  max =   18.69  avg =   17.81
            resnet18  min =    0.90  max =   13.14  avg =    3.09
       resnet18_int8  min =   16.25  max =   17.34  avg =   16.50
             alexnet  min =    0.86  max =    4.77  avg =    2.59
               vgg16  min =    1.38  max =   11.20  avg =    2.91
          vgg16_int8  min =   47.17  max =   49.02  avg =   47.57
            resnet50  min =    1.54  max =    2.16  avg =    1.64
       resnet50_int8  min =   22.90  max =   24.46  avg =   23.23
      squeezenet_ssd  min =    2.25  max =   10.91  avg =    4.12
 squeezenet_ssd_int8  min =   11.98  max =   14.54  avg =   12.31
       mobilenet_ssd  min =    1.46  max =    8.98  avg =    3.38
  mobilenet_ssd_int8  min =    6.13  max =    6.65  avg =    6.23
      mobilenet_yolo  min =    1.29  max =    1.43  avg =    1.34
  mobilenetv2_yolov3  min =    3.64  max =    6.66  avg =    3.77
         yolov4-tiny  min =    9.04  max =   11.65  avg =    9.54
           nanodet_m  min =    1.43  max =   11.90  avg =    3.16
    yolo-fastest-1.1  min =    1.40  max =    1.82  avg =    1.57
      yolo-fastestv2  min =    1.36  max =    2.30  avg =    1.42
  vision_transformer  min =  202.71  max =  244.47  avg =  218.69
          FastestDet  min =    1.37  max =    5.37  avg =    2.77
```

### AMD Ryzen Embedded V1605B (Zen 2.0 GHz ~ 3.6 GHz x 4 + Radeon Vega 8 1.1GHz 8CU)
```
C:\Users\i\Desktop\benchmark>benchncnn.exe 32 1 0 -1 0
loop_count = 32
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   22.13  max =   24.07  avg =   22.88
     squeezenet_int8  min =   58.54  max =   62.21  avg =   59.55
           mobilenet  min =   40.99  max =   43.67  avg =   41.70
      mobilenet_int8  min =   98.06  max =  111.37  avg =  101.15
        mobilenet_v2  min =   26.53  max =   28.96  avg =   27.81
        mobilenet_v3  min =   22.96  max =   25.25  avg =   23.30
          shufflenet  min =   20.17  max =   28.78  avg =   21.09
       shufflenet_v2  min =   19.06  max =   19.72  avg =   19.47
             mnasnet  min =   25.11  max =   39.53  avg =   27.54
     proxylessnasnet  min =   28.84  max =   35.16  avg =   30.03
     efficientnet_b0  min =   43.16  max =   46.03  avg =   43.65
   efficientnetv2_b0  min =   48.64  max =   52.07  avg =   49.62
        regnety_400m  min =   33.43  max =   35.87  avg =   33.97
           blazeface  min =    5.43  max =    6.04  avg =    5.56
           googlenet  min =   85.80  max =   90.93  avg =   87.65
      googlenet_int8  min =  214.37  max =  230.75  avg =  219.50
            resnet18  min =   76.58  max =   80.38  avg =   77.34
       resnet18_int8  min =  231.16  max =  255.22  avg =  236.65
             alexnet  min =   60.69  max =   64.06  avg =   61.34
               vgg16  min =  286.45  max =  307.04  avg =  290.86
          vgg16_int8  min = 1797.58  max = 2079.73  avg = 1844.78
            resnet50  min =  198.27  max =  215.03  avg =  201.37
       resnet50_int8  min =  493.52  max =  499.67  avg =  496.95
      squeezenet_ssd  min =  189.97  max =  198.53  avg =  192.10
 squeezenet_ssd_int8  min =  198.81  max =  214.55  avg =  203.59
       mobilenet_ssd  min =   87.56  max =   92.72  avg =   89.03
  mobilenet_ssd_int8  min =  196.97  max =  209.51  avg =  201.95
      mobilenet_yolo  min =  206.87  max =  218.48  avg =  210.84
  mobilenetv2_yolov3  min =  102.72  max =  108.18  avg =  104.62
         yolov4-tiny  min =  117.97  max =  134.73  avg =  121.26

C:\Users\i\Desktop\benchmark>benchncnn.exe 32 2 0 -1 0
loop_count = 32
num_threads = 2
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   13.43  max =   14.35  avg =   13.62
     squeezenet_int8  min =   32.29  max =   50.76  avg =   33.56
           mobilenet  min =   23.42  max =   25.10  avg =   24.09
      mobilenet_int8  min =   51.99  max =   55.42  avg =   53.01
        mobilenet_v2  min =   15.45  max =   15.75  avg =   15.59
        mobilenet_v3  min =   14.32  max =   14.75  avg =   14.39
          shufflenet  min =   12.64  max =   12.83  avg =   12.69
       shufflenet_v2  min =   11.45  max =   12.44  avg =   11.60
             mnasnet  min =   14.43  max =   20.45  avg =   15.11
     proxylessnasnet  min =   16.18  max =   16.38  avg =   16.24
     efficientnet_b0  min =   25.25  max =   28.42  avg =   26.59
   efficientnetv2_b0  min =   27.57  max =   32.05  avg =   30.04
        regnety_400m  min =   22.74  max =   24.75  avg =   23.31
           blazeface  min =    3.44  max =    3.83  avg =    3.62
           googlenet  min =   49.39  max =   66.76  avg =   53.76
      googlenet_int8  min =  113.89  max =  136.75  avg =  119.29
            resnet18  min =   43.77  max =   67.24  avg =   46.14
       resnet18_int8  min =  121.44  max =  148.01  avg =  126.95
             alexnet  min =   34.46  max =   37.38  avg =   35.50
               vgg16  min =  177.16  max =  207.25  avg =  184.19
          vgg16_int8  min =  951.86  max = 1155.60  avg =  990.51
            resnet50  min =  112.28  max =  137.18  avg =  115.64
       resnet50_int8  min =  260.69  max =  272.26  avg =  265.89
      squeezenet_ssd  min =  108.07  max =  121.66  avg =  110.35
 squeezenet_ssd_int8  min =  109.01  max =  126.86  avg =  111.96
       mobilenet_ssd  min =   49.60  max =   52.62  avg =   50.46
  mobilenet_ssd_int8  min =  104.22  max =  111.07  avg =  106.33
      mobilenet_yolo  min =  117.42  max =  136.73  avg =  122.92
  mobilenetv2_yolov3  min =   61.66  max =   65.22  avg =   63.01
         yolov4-tiny  min =   72.64  max =   77.09  avg =   74.30

C:\Users\i\Desktop\benchmark>benchncnn.exe 32 4 0 -1 0
loop_count = 32
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    9.19  max =   14.82  avg =   11.15
     squeezenet_int8  min =   19.00  max =   40.30  avg =   24.80
           mobilenet  min =   18.02  max =   39.84  avg =   27.38
      mobilenet_int8  min =   28.04  max =   57.59  avg =   34.15
        mobilenet_v2  min =   10.26  max =   17.79  avg =   13.36
        mobilenet_v3  min =    8.87  max =   10.87  avg =    9.11
          shufflenet  min =    8.93  max =   11.96  avg =    9.34
       shufflenet_v2  min =    7.37  max =   13.10  avg =    8.72
             mnasnet  min =    9.24  max =   14.90  avg =   11.32
     proxylessnasnet  min =   10.21  max =   11.89  avg =   10.39
     efficientnet_b0  min =   16.22  max =   23.71  avg =   16.59
   efficientnetv2_b0  min =   17.44  max =   31.42  avg =   22.85
        regnety_400m  min =   18.32  max =   24.02  avg =   18.90
           blazeface  min =    2.22  max =    2.81  avg =    2.30
           googlenet  min =   31.52  max =   51.80  avg =   42.11
      googlenet_int8  min =   65.47  max =  114.41  avg =   75.98
            resnet18  min =   28.90  max =   64.62  avg =   37.58
       resnet18_int8  min =   71.29  max =  136.67  avg =  103.03
             alexnet  min =   23.67  max =   34.01  avg =   29.78
               vgg16  min =  142.18  max =  211.00  avg =  170.46
          vgg16_int8  min =  531.36  max =  871.25  avg =  625.60
            resnet50  min =   69.23  max =  108.67  avg =   73.68
       resnet50_int8  min =  149.18  max =  309.88  avg =  168.68
      squeezenet_ssd  min =   68.83  max =   81.70  avg =   71.01
 squeezenet_ssd_int8  min =   66.34  max =  118.16  avg =   74.34
       mobilenet_ssd  min =   29.96  max =   34.32  avg =   30.74
  mobilenet_ssd_int8  min =   56.87  max =   92.24  avg =   65.57
      mobilenet_yolo  min =   74.26  max =  113.91  avg =   81.28
  mobilenetv2_yolov3  min =   42.16  max =   63.49  avg =   45.34
         yolov4-tiny  min =   53.06  max =   69.84  avg =   55.81

C:\Users\i\Desktop\benchmark>benchncnn.exe 32 1 0 0 0
[0 AMD Radeon(TM) Vega 8 Graphics]  queueC=1[2]  queueG=0[1]  queueT=2[1]
[0 AMD Radeon(TM) Vega 8 Graphics]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 AMD Radeon(TM) Vega 8 Graphics]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 AMD Radeon(TM) Vega 8 Graphics]  subgroup=64  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 32
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =    6.78  max =    7.09  avg =    6.91
     squeezenet_int8  min =   58.93  max =   62.53  avg =   60.11
           mobilenet  min =    8.08  max =    8.39  avg =    8.25
      mobilenet_int8  min =   97.74  max =  116.77  avg =  100.17
        mobilenet_v2  min =    7.95  max =    8.27  avg =    8.14
        mobilenet_v3  min =    8.70  max =    9.70  avg =    9.02
          shufflenet  min =    6.36  max =    7.64  avg =    7.01
       shufflenet_v2  min =    7.04  max =    8.12  avg =    7.50
             mnasnet  min =    8.07  max =    9.08  avg =    8.38
     proxylessnasnet  min =    8.56  max =    9.66  avg =    8.81
     efficientnet_b0  min =   16.68  max =   18.00  avg =   17.30
   efficientnetv2_b0  min =  394.82  max =  404.88  avg =  401.05
        regnety_400m  min =   11.92  max =   12.17  avg =   12.03
           blazeface  min =    4.82  max =    6.50  avg =    5.42
           googlenet  min =   18.44  max =   19.66  avg =   19.18
      googlenet_int8  min =  213.41  max =  231.79  avg =  218.31
            resnet18  min =   14.27  max =   14.72  avg =   14.44
       resnet18_int8  min =  228.79  max =  249.65  avg =  236.06
             alexnet  min =   17.31  max =   18.31  avg =   17.69
               vgg16  min =  111.85  max =  123.35  avg =  112.98
          vgg16_int8  min = 1789.64  max = 1838.84  avg = 1826.05
            resnet50  min =   31.61  max =   32.86  avg =   32.12
       resnet50_int8  min =  483.57  max =  505.72  avg =  491.76
      squeezenet_ssd  min =   99.66  max =  105.68  avg =  104.57
 squeezenet_ssd_int8  min =  200.48  max =  208.71  avg =  203.02
       mobilenet_ssd  min =   33.45  max =   35.64  avg =   34.75
  mobilenet_ssd_int8  min =  195.14  max =  205.35  avg =  200.18
      mobilenet_yolo  min =   59.20  max =   61.06  avg =   60.47
  mobilenetv2_yolov3  min =   31.48  max =   33.25  avg =   32.84
         yolov4-tiny  min =   93.75  max =   97.45  avg =   96.00
```

### Qualcomm SM8150-AC Snapdragon 855+ (Kryo 485 2.96 GHz + 2.42 GHz x 3 + 1.80 GHz x 4 + Adreno 640)
```
OnePlus7T:/data/local/tmp # ./benchncnn 8 4 2 -1 1                                                                                                                                                                                      
loop_count = 8
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =    3.60  max =    3.70  avg =    3.64
     squeezenet_int8  min =    3.67  max =    3.78  avg =    3.71
           mobilenet  min =    5.32  max =    5.42  avg =    5.38
      mobilenet_int8  min =    4.20  max =    4.28  avg =    4.23
        mobilenet_v2  min =    4.64  max =    4.73  avg =    4.68
        mobilenet_v3  min =    4.13  max =    4.25  avg =    4.18
          shufflenet  min =    3.29  max =    3.40  avg =    3.33
       shufflenet_v2  min =    2.98  max =    3.07  avg =    3.01
             mnasnet  min =    4.26  max =    4.37  avg =    4.31
     proxylessnasnet  min =    4.67  max =    4.78  avg =    4.72
     efficientnet_b0  min =    7.23  max =    7.34  avg =    7.30
   efficientnetv2_b0  min =    8.74  max =    8.87  avg =    8.81
        regnety_400m  min =    7.88  max =    7.99  avg =    7.95
           blazeface  min =    1.19  max =    1.30  avg =    1.22
           googlenet  min =   13.07  max =   13.20  avg =   13.12
      googlenet_int8  min =   12.86  max =   12.98  avg =   12.93
            resnet18  min =   10.33  max =   10.36  avg =   10.35
       resnet18_int8  min =    9.42  max =    9.45  avg =    9.43
             alexnet  min =   11.88  max =   11.95  avg =   11.91
               vgg16  min =   59.34  max =   60.69  avg =   60.19
          vgg16_int8  min =   68.78  max =   69.07  avg =   68.93
            resnet50  min =   26.18  max =   26.28  avg =   26.24
       resnet50_int8  min =   20.86  max =   20.95  avg =   20.91
      squeezenet_ssd  min =   12.00  max =   12.76  avg =   12.19
 squeezenet_ssd_int8  min =   11.67  max =   13.13  avg =   12.03
       mobilenet_ssd  min =   11.88  max =   12.68  avg =   12.03
  mobilenet_ssd_int8  min =    9.28  max =    9.68  avg =    9.35
      mobilenet_yolo  min =   27.89  max =   28.06  avg =   27.96
  mobilenetv2_yolov3  min =   18.00  max =   18.13  avg =   18.06
         yolov4-tiny  min =   25.25  max =   25.36  avg =   25.29
           nanodet_m  min =    8.93  max =    9.00  avg =    8.96
    yolo-fastest-1.1  min =    3.73  max =    3.83  avg =    3.77
      yolo-fastestv2  min =    3.38  max =    3.47  avg =    3.41
  vision_transformer  min =  567.94  max =  572.31  avg =  569.66
          FastestDet  min =    3.28  max =    3.37  avg =    3.32

OnePlus7T:/data/local/tmp # ./benchncnn 8 1 2 -1 1                                                                                                                                                                                         
loop_count = 8
num_threads = 1
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =    8.24  max =    8.34  avg =    8.31
     squeezenet_int8  min =    8.23  max =    8.34  avg =    8.30
           mobilenet  min =   14.38  max =   14.56  avg =   14.45
      mobilenet_int8  min =   11.12  max =   11.24  avg =   11.17
        mobilenet_v2  min =    9.82  max =    9.88  avg =    9.84
        mobilenet_v3  min =    8.15  max =    8.24  avg =    8.21
          shufflenet  min =    5.32  max =    5.44  avg =    5.37
       shufflenet_v2  min =    5.38  max =    5.51  avg =    5.44
             mnasnet  min =    9.25  max =    9.36  avg =    9.31
     proxylessnasnet  min =   10.95  max =   11.01  avg =   10.98
     efficientnet_b0  min =   17.67  max =   17.79  avg =   17.73
   efficientnetv2_b0  min =   20.56  max =   20.70  avg =   20.60
        regnety_400m  min =   11.96  max =   12.07  avg =   12.00
           blazeface  min =    2.19  max =    2.87  avg =    2.47
           googlenet  min =   32.10  max =   32.20  avg =   32.15
      googlenet_int8  min =   32.00  max =   32.15  avg =   32.07
            resnet18  min =   22.02  max =   22.28  avg =   22.12
       resnet18_int8  min =   26.17  max =   26.26  avg =   26.22
             alexnet  min =   24.83  max =   24.99  avg =   24.92
               vgg16  min =  129.57  max =  129.95  avg =  129.78
          vgg16_int8  min =  202.08  max =  202.34  avg =  202.19
            resnet50  min =   65.85  max =   66.01  avg =   65.93
       resnet50_int8  min =   56.33  max =   56.49  avg =   56.42
      squeezenet_ssd  min =   22.52  max =   24.50  avg =   22.93
 squeezenet_ssd_int8  min =   24.51  max =   26.83  avg =   24.98
       mobilenet_ssd  min =   30.55  max =   32.68  avg =   30.85
  mobilenet_ssd_int8  min =   22.96  max =   23.75  avg =   23.09
      mobilenet_yolo  min =   68.74  max =   69.01  avg =   68.88
  mobilenetv2_yolov3  min =   36.98  max =   37.16  avg =   37.06
         yolov4-tiny  min =   47.36  max =   47.45  avg =   47.41
           nanodet_m  min =   15.08  max =   15.30  avg =   15.17
    yolo-fastest-1.1  min =    5.51  max =    5.61  avg =    5.55
      yolo-fastestv2  min =    4.92  max =    5.02  avg =    4.97
  vision_transformer  min =  990.13  max =  994.45  avg =  991.95
          FastestDet  min =    5.06  max =    5.17  avg =    5.11

OnePlus7T:/data/local/tmp $ ./benchncnn 8 1 2 0 1
[0 Adreno (TM) 640]  queueC=0[3]  queueG=0[3]  queueT=0[3]
[0 Adreno (TM) 640]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=1
[0 Adreno (TM) 640]  fp16-p/s/a=1/0/1  int8-p/s/a=1/0/0
[0 Adreno (TM) 640]  subgroup=64  basic=1  vote=1  ballot=0  shuffle=0
loop_count = 8
num_threads = 1
powersave = 2
gpu_device = 0
cooling_down = 1
          squeezenet  min =    8.59  max =    9.51  avg =    9.09
           mobilenet  min =   13.04  max =   13.45  avg =   13.22
        mobilenet_v2  min =   10.68  max =   11.38  avg =   10.85
        mobilenet_v3  min =   11.86  max =   12.37  avg =   12.08
          shufflenet  min =    8.21  max =    8.40  avg =    8.25
       shufflenet_v2  min =    8.84  max =    9.13  avg =    8.97
             mnasnet  min =   11.32  max =   11.72  avg =   11.45
     proxylessnasnet  min =   12.27  max =   12.86  avg =   12.55
     efficientnet_b0  min =   22.64  max =   22.82  avg =   22.75
   efficientnetv2_b0  min =   32.32  max =   38.20  avg =   35.79
        regnety_400m  min =   15.35  max =   15.86  avg =   15.64
           blazeface  min =    2.82  max =    2.93  avg =    2.86
           googlenet  min =   28.22  max =   28.34  avg =   28.26
            resnet18  min =   24.71  max =   24.96  avg =   24.82
             alexnet  min =   27.94  max =   28.10  avg =   28.01
               vgg16  min =  106.08  max =  106.53  avg =  106.30
            resnet50  min =   55.28  max =   56.03  avg =   55.68
      squeezenet_ssd  min =   29.77  max =   30.65  avg =   30.05
       mobilenet_ssd  min =   29.14  max =   29.39  avg =   29.25
      mobilenet_yolo  min =   49.78  max =   50.09  avg =   49.94
  mobilenetv2_yolov3  min =   31.11  max =   31.97  avg =   31.60
         yolov4-tiny  min =   46.22  max =   46.90  avg =   46.63
           nanodet_m  min =   15.96  max =   16.52  avg =   16.13
    yolo-fastest-1.1  min =    9.59  max =    9.66  avg =    9.61
      yolo-fastestv2  min =    7.99  max =    8.23  avg =    8.13
```

### Qualcomm MSM6150 Snapdragon 675 (Kryo 460 2.0GHz x 2 + Kryo 460 1.7GHz x 6 + Adreno 612)
```
violet:/data/local/tmp/ncnn $ ./benchncnn 8 2 0
loop_count = 8
num_threads = 2
powersave = 0
gpu_device = -1
          squeezenet  min =   23.29  max =   24.65  avg =   23.95
     squeezenet_int8  min =   23.24  max =   61.55  avg =   31.20
           mobilenet  min =   31.60  max =   32.10  avg =   31.80
      mobilenet_int8  min =   30.35  max =   32.03  avg =   30.95
        mobilenet_v2  min =   25.92  max =   26.45  avg =   26.08
          shufflenet  min =   11.91  max =   12.11  avg =   12.00
             mnasnet  min =   21.38  max =   21.71  avg =   21.51
     proxylessnasnet  min =   25.53  max =   25.78  avg =   25.62
           googlenet  min =   93.62  max =  100.67  avg =   94.86
      googlenet_int8  min =   90.74  max =   91.06  avg =   90.87
            resnet18  min =   85.84  max =   87.37  avg =   86.50
       resnet18_int8  min =   77.88  max =   78.11  avg =   78.00
             alexnet  min =  196.33  max =  201.73  avg =  200.19
               vgg16  min =  560.71  max =  571.75  avg =  564.84
          vgg16_int8  min =  651.51  max =  652.68  avg =  652.12
            resnet50  min =  178.25  max =  179.86  avg =  178.77
       resnet50_int8  min =  181.07  max =  183.26  avg =  181.64
      squeezenet_ssd  min =   64.86  max =   68.39  avg =   66.05
 squeezenet_ssd_int8  min =   69.61  max =   70.37  avg =   69.93
       mobilenet_ssd  min =   65.92  max =   67.03  avg =   66.41
  mobilenet_ssd_int8  min =   61.54  max =   63.38  avg =   62.27
      mobilenet_yolo  min =  143.42  max =  146.69  avg =  144.33
    mobilenet_yolov3  min =  150.45  max =  152.30  avg =  151.36

violet:/data/local/tmp/ncnn $ ./benchncnn 8 1 0
loop_count = 8
num_threads = 1
powersave = 0
gpu_device = -1
          squeezenet  min =   36.04  max =   37.25  avg =   36.48
     squeezenet_int8  min =   37.82  max =   79.20  avg =   43.13
           mobilenet  min =   54.29  max =   54.73  avg =   54.41
      mobilenet_int8  min =   58.90  max =   60.11  avg =   59.39
        mobilenet_v2  min =   38.64  max =   40.22  avg =   38.97
          shufflenet  min =   18.05  max =   18.39  avg =   18.19
             mnasnet  min =   34.65  max =   34.98  avg =   34.79
     proxylessnasnet  min =   42.61  max =   43.12  avg =   42.80
           googlenet  min =  164.74  max =  165.89  avg =  165.34
      googlenet_int8  min =  159.93  max =  160.38  avg =  160.12
            resnet18  min =  135.76  max =  137.93  avg =  136.98
       resnet18_int8  min =  140.22  max =  144.06  avg =  141.92
             alexnet  min =  391.01  max =  396.85  avg =  392.74
               vgg16  min = 1019.35  max = 1022.75  avg = 1021.26
          vgg16_int8  min = 1122.25  max = 1137.99  avg = 1124.78
            resnet50  min =  302.16  max =  304.22  avg =  303.05
       resnet50_int8  min =  318.35  max =  319.50  avg =  318.84
      squeezenet_ssd  min =   91.26  max =   94.86  avg =   92.39
 squeezenet_ssd_int8  min =  105.06  max =  106.17  avg =  105.56
       mobilenet_ssd  min =  105.01  max =  105.95  avg =  105.40
  mobilenet_ssd_int8  min =  119.93  max =  120.50  avg =  120.19
      mobilenet_yolo  min =  229.87  max =  230.76  avg =  230.21
    mobilenet_yolov3  min =  242.10  max =  242.91  avg =  242.47
```

### Kirin 970 (Cortex-A73 2.4GHz x 4 + Cortex-A53 1.8GHz x 4 + Mali-G72)
```
HWEML:/data/local/tmp/ncnnbench $ ./benchncnn 8 4 2 -1 1
[0 Mali-G72]  queueC=0[2]  queueG=0[2]  queueT=0[2]
[0 Mali-G72]  buglssc=0  bugsbn1=0  buglbia=0  bugihfa=1
[0 Mali-G72]  fp16p=1  fp16s=0  fp16a=1  int8s=0  int8a=0
loop_count = 8
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =   24.38  max =   28.03  avg =   25.83
     squeezenet_int8  min =   21.79  max =   24.80  avg =   22.60
           mobilenet  min =   34.09  max =   36.88  avg =   35.93
      mobilenet_int8  min =   52.62  max =   61.70  avg =   55.38
        mobilenet_v2  min =   23.71  max =   25.70  avg =   24.49
        mobilenet_v3  min =   20.66  max =   25.68  avg =   23.07
          shufflenet  min =   17.89  max =   19.91  avg =   18.53
       shufflenet_v2  min =   13.73  max =   16.54  avg =   15.37
             mnasnet  min =   24.36  max =   27.14  avg =   25.58
     proxylessnasnet  min =   27.19  max =   29.70  avg =   28.59
     efficientnet_b0  min =   49.31  max =   50.26  avg =   49.70
        regnety_400m  min =   42.54  max =   51.22  avg =   46.71
           blazeface  min =    5.49  max =    7.67  avg =    6.27
           googlenet  min =   72.67  max =   81.22  avg =   75.92
      googlenet_int8  min =   67.60  max =   74.50  avg =   71.21
            resnet18  min =   69.32  max =   81.59  avg =   73.45
       resnet18_int8  min =   60.92  max =   68.11  avg =   64.18
             alexnet  min =   60.90  max =   79.28  avg =   66.72
               vgg16  min =  337.01  max =  378.89  avg =  352.37
          vgg16_int8  min =  465.88  max =  505.19  avg =  489.76
            resnet50  min =  207.75  max =  220.74  avg =  214.42
       resnet50_int8  min =  165.67  max =  183.80  avg =  171.27
      squeezenet_ssd  min =   72.77  max =   84.45  avg =   79.09
 squeezenet_ssd_int8  min =   75.37  max =   86.58  avg =   78.70
       mobilenet_ssd  min =   88.88  max =   96.43  avg =   92.02
  mobilenet_ssd_int8  min =   89.04  max =  101.35  avg =   92.23
      mobilenet_yolo  min =  189.73  max =  206.55  avg =  193.64
  mobilenetv2_yolov3  min =   99.08  max =  111.64  avg =  104.23

HWEML:/data/local/tmp/ncnnbench $ ./benchncnn 8 1 2 -1 1
[0 Mali-G72]  queueC=0[2]  queueG=0[2]  queueT=0[2]
[0 Mali-G72]  buglssc=0  bugsbn1=0  buglbia=0  bugihfa=1
[0 Mali-G72]  fp16p=1  fp16s=0  fp16a=1  int8s=0  int8a=0
loop_count = 8
num_threads = 1
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =   73.47  max =   81.39  avg =   76.06
     squeezenet_int8  min =   62.63  max =   73.66  avg =   66.52
           mobilenet  min =  103.85  max =  112.83  avg =  108.98
      mobilenet_int8  min =  152.27  max =  161.26  avg =  157.17
        mobilenet_v2  min =   70.53  max =   87.26  avg =   76.67
        mobilenet_v3  min =   59.87  max =   68.59  avg =   63.08
          shufflenet  min =   36.69  max =   41.45  avg =   39.24
       shufflenet_v2  min =   33.97  max =   37.84  avg =   35.03
             mnasnet  min =   69.24  max =   79.73  avg =   74.20
     proxylessnasnet  min =   78.63  max =   88.57  avg =   81.83
     efficientnet_b0  min =  147.45  max =  159.07  avg =  152.09
        regnety_400m  min =   90.83  max =   98.51  avg =   93.82
           blazeface  min =   10.05  max =   11.59  avg =   10.78
           googlenet  min =  240.26  max =  277.71  avg =  259.61
      googlenet_int8  min =  214.64  max =  233.56  avg =  225.01
            resnet18  min =  245.62  max =  268.49  avg =  260.37
       resnet18_int8  min =  184.85  max =  194.91  avg =  190.60
             alexnet  min =  202.52  max =  241.12  avg =  211.51
               vgg16  min = 1632.98  max = 1769.05  avg = 1710.89
          vgg16_int8  min = 1237.01  max = 1316.40  avg = 1273.44
            resnet50  min =  558.41  max =  601.59  avg =  581.26
       resnet50_int8  min =  425.26  max =  445.19  avg =  436.22
      squeezenet_ssd  min =  228.50  max =  255.89  avg =  244.63
 squeezenet_ssd_int8  min =  166.97  max =  193.77  avg =  180.22
       mobilenet_ssd  min =  226.54  max =  246.62  avg =  235.75
  mobilenet_ssd_int8  min =  231.35  max =  249.63  avg =  241.29
      mobilenet_yolo  min =  469.71  max =  508.79  avg =  497.50
  mobilenetv2_yolov3  min =  242.88  max =  265.30  avg =  254.68

HWEML:/data/local/tmp/ncnnbench $ ./benchncnn 4 1 2 0 1
[0 Mali-G72]  queueC=0[2]  queueG=0[2]  queueT=0[2]
[0 Mali-G72]  buglssc=0  bugsbn1=0  buglbia=0  bugihfa=1
[0 Mali-G72]  fp16p=1  fp16s=0  fp16a=1  int8s=0  int8a=0
loop_count = 4
num_threads = 1
powersave = 2
gpu_device = 0
cooling_down = 1
          squeezenet  min =   24.54  max =   25.75  avg =   25.16
           mobilenet  min =   22.03  max =   29.61  avg =   27.31
        mobilenet_v2  min =   20.15  max =   28.05  avg =   25.35
        mobilenet_v3  min =   34.26  max =   37.49  avg =   35.51
          shufflenet  min =   26.29  max =   27.68  avg =   26.86
       shufflenet_v2  min =   29.60  max =   32.08  avg =   31.27
             mnasnet  min =   25.85  max =   29.38  avg =   27.98
     proxylessnasnet  min =   23.64  max =   30.09  avg =   26.36
     efficientnet_b0  min =   52.55  max =   58.51  avg =   55.56
        regnety_400m  min =   37.81  max =   43.22  avg =   40.30
           blazeface  min =    9.14  max =   10.93  avg =   10.08
           googlenet  min =   60.19  max =   62.84  avg =   61.51
            resnet18  min =   50.42  max =   52.93  avg =   51.70
             alexnet  min =  195.34  max =  196.98  avg =  196.14
               vgg16  min =  725.88  max =  751.20  avg =  739.99
            resnet50  min =  124.47  max =  125.93  avg =  125.02
      squeezenet_ssd  min =   91.79  max =   97.04  avg =   93.56
       mobilenet_ssd  min =   51.81  max =   59.31  avg =   54.09
      mobilenet_yolo  min =  124.67  max =  127.62  avg =  126.53
  mobilenetv2_yolov3  min =   53.11  max =   54.81  avg =   54.11
```

### Qualcomm MSM8998 Snapdragon 835 (Kryo 2.45GHz x 4 + Kryo 1.9GHz x 4 + Adreno 540)
```
taimen:/data/local/tmp/ncnnbench $ ./benchncnn 8 4 2 -1 0
[0 Adreno (TM) 540]  queueC=0[3]  queueG=0[3]  queueT=0[3]
[0 Adreno (TM) 540]  buglssc=0  bugsbn1=1  buglbia=0  bugihfa=0
[0 Adreno (TM) 540]  fp16p=1  fp16s=0  fp16a=0  int8s=0  int8a=0
loop_count = 8
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =   28.46  max =   30.89  avg =   29.77
     squeezenet_int8  min =   30.32  max =   32.92  avg =   31.68
           mobilenet  min =   36.65  max =   38.37  avg =   37.32
      mobilenet_int8  min =   62.91  max =   66.71  avg =   64.49
        mobilenet_v2  min =   27.85  max =   31.21  avg =   29.41
        mobilenet_v3  min =   23.83  max =   26.40  avg =   24.79
          shufflenet  min =   15.65  max =   16.88  avg =   16.27
       shufflenet_v2  min =   13.70  max =   14.49  avg =   14.08
             mnasnet  min =   25.04  max =   28.35  avg =   26.45
     proxylessnasnet  min =   27.49  max =   29.58  avg =   28.62
     efficientnet_b0  min =   48.43  max =   49.41  avg =   48.85
        regnety_400m  min =   42.48  max =   43.78  avg =   43.18
           blazeface  min =    4.39  max =    4.68  avg =    4.51
           googlenet  min =   75.98  max =   78.40  avg =   77.37
      googlenet_int8  min =   79.26  max =   83.20  avg =   80.55
            resnet18  min =   73.60  max =   76.97  avg =   75.63
       resnet18_int8  min =   62.93  max =   65.94  avg =   64.50
             alexnet  min =   64.18  max =   67.02  avg =   65.49
               vgg16  min =  389.39  max =  399.13  avg =  394.09
          vgg16_int8  min =  509.06  max =  524.41  avg =  514.76
            resnet50  min =  188.21  max =  194.58  avg =  191.98
       resnet50_int8  min =  182.84  max =  187.22  avg =  184.23
      squeezenet_ssd  min =   77.69  max =   81.17  avg =   79.24
 squeezenet_ssd_int8  min =   81.71  max =   84.12  avg =   82.90
       mobilenet_ssd  min =   78.35  max =   81.50  avg =   79.82
  mobilenet_ssd_int8  min =   96.84  max =  100.97  avg =   98.42
      mobilenet_yolo  min =  167.32  max =  170.71  avg =  168.87
  mobilenetv2_yolov3  min =   97.00  max =  102.11  avg =   99.01

taimen:/data/local/tmp/ncnnbench $ ./benchncnn 8 1 2 -1 1
[0 Adreno (TM) 540]  queueC=0[3]  queueG=0[3]  queueT=0[3]
[0 Adreno (TM) 540]  buglssc=0  bugsbn1=1  buglbia=0  bugihfa=0
[0 Adreno (TM) 540]  fp16p=1  fp16s=0  fp16a=0  int8s=0  int8a=0
loop_count = 8
num_threads = 1
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =   67.25  max =   71.39  avg =   69.35
     squeezenet_int8  min =   62.12  max =   66.35  avg =   63.73
           mobilenet  min =  103.30  max =  110.39  avg =  107.13
      mobilenet_int8  min =  155.24  max =  161.42  avg =  157.82
        mobilenet_v2  min =   71.89  max =   74.73  avg =   73.48
        mobilenet_v3  min =   58.35  max =   63.43  avg =   60.68
          shufflenet  min =   35.96  max =   39.43  avg =   36.94
       shufflenet_v2  min =   35.53  max =   39.86  avg =   37.10
             mnasnet  min =   66.71  max =   74.00  avg =   68.65
     proxylessnasnet  min =   76.50  max =   82.20  avg =   78.57
     efficientnet_b0  min =  142.32  max =  152.17  avg =  146.14
        regnety_400m  min =   89.60  max =   98.27  avg =   92.62
           blazeface  min =   10.45  max =   12.81  avg =   11.07
           googlenet  min =  222.75  max =  233.61  avg =  228.38
      googlenet_int8  min =  206.70  max =  212.20  avg =  209.24
            resnet18  min =  210.86  max =  220.25  avg =  213.65
       resnet18_int8  min =  176.04  max =  183.58  avg =  178.71
             alexnet  min =  185.97  max =  195.91  avg =  191.40
               vgg16  min = 1176.82  max = 1200.64  avg = 1187.88
          vgg16_int8  min = 1086.52  max = 1105.00  avg = 1095.53
            resnet50  min =  517.48  max =  533.99  avg =  526.04
       resnet50_int8  min =  417.30  max =  435.81  avg =  422.36
      squeezenet_ssd  min =  164.88  max =  171.21  avg =  167.51
 squeezenet_ssd_int8  min =  164.78  max =  171.77  avg =  168.36
       mobilenet_ssd  min =  221.41  max =  229.13  avg =  226.18
  mobilenet_ssd_int8  min =  234.15  max =  245.91  avg =  239.01
      mobilenet_yolo  min =  471.34  max =  484.99  avg =  477.15
  mobilenetv2_yolov3  min =  249.14  max =  257.61  avg =  252.54

taimen:/data/local/tmp/ncnnbench $ ./benchncnn 8 1 2 0 1
[0 Adreno (TM) 540]  queueC=0[3]  queueG=0[3]  queueT=0[3]
[0 Adreno (TM) 540]  buglssc=0  bugsbn1=1  buglbia=0  bugihfa=0
[0 Adreno (TM) 540]  fp16p=1  fp16s=0  fp16a=0  int8s=0  int8a=0
loop_count = 8
num_threads = 1
powersave = 2
gpu_device = 0
cooling_down = 1
          squeezenet  min =   18.74  max =   19.89  avg =   19.22
           mobilenet  min =   21.19  max =   25.61  avg =   22.94
        mobilenet_v2  min =   24.15  max =   34.68  avg =   30.12
        mobilenet_v3  min =   25.94  max =   33.15  avg =   30.09
          shufflenet  min =   25.05  max =   31.41  avg =   27.85
       shufflenet_v2  min =   28.82  max =   32.04  avg =   30.95
             mnasnet  min =   21.34  max =   27.69  avg =   24.17
     proxylessnasnet  min =   25.51  max =   30.03  avg =   28.01
     efficientnet_b0  min =   42.94  max =   47.44  avg =   45.28
        regnety_400m  min =   36.36  max =   55.73  avg =   41.82
           blazeface  min =   11.14  max =   13.11  avg =   12.20
           googlenet  min =   49.72  max =   56.92  avg =   51.79
            resnet18  min =   44.63  max =   47.37  avg =   45.86
             alexnet  min =   42.83  max =   46.34  avg =   44.63
               vgg16  min =  568.82  max =  586.75  avg =  578.60
            resnet50  min =  108.63  max =  115.76  avg =  110.38
      squeezenet_ssd  min =   85.22  max =  104.73  avg =   93.14
       mobilenet_ssd  min =   49.91  max =   56.86  avg =   52.33
      mobilenet_yolo  min =   98.76  max =  109.37  avg =  102.27
  mobilenetv2_yolov3  min =   57.49  max =   61.15  avg =   58.74
```

### Qualcomm SDM765G Snapdragon 765G (Kryo 1.8GHz x 6 + Kryo 2.2GHz x 2 + Adreno 620)
```
130|bramble:/data/local/tmp $ ./benchncnn 8 4 2 -1 1
loop_count = 8
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =    9.84  max =   11.72  avg =   10.36
     squeezenet_int8  min =   10.80  max =   11.13  avg =   10.96
           mobilenet  min =   14.04  max =   14.37  avg =   14.20
      mobilenet_int8  min =   13.39  max =   13.75  avg =   13.59
        mobilenet_v2  min =   13.04  max =   13.51  avg =   13.27
        mobilenet_v3  min =   11.00  max =   13.21  avg =   12.54
          shufflenet  min =   11.08  max =   11.22  avg =   11.16
       shufflenet_v2  min =    8.45  max =    8.50  avg =    8.47
             mnasnet  min =   14.15  max =   14.69  avg =   14.38
     proxylessnasnet  min =   14.49  max =   15.07  avg =   14.83
     efficientnet_b0  min =   28.99  max =   29.53  avg =   29.24
   efficientnetv2_b0  min =   38.92  max =   39.34  avg =   39.14
        regnety_400m  min =   33.46  max =   33.81  avg =   33.62
           blazeface  min =    4.22  max =    4.30  avg =    4.27
           googlenet  min =   35.24  max =   36.94  avg =   35.57
      googlenet_int8  min =   45.26  max =   46.46  avg =   45.78
            resnet18  min =   33.14  max =   33.75  avg =   33.31
       resnet18_int8  min =   43.26  max =   43.50  avg =   43.35
             alexnet  min =   25.40  max =   26.19  avg =   25.74
               vgg16  min =  121.39  max =  122.35  avg =  121.78
          vgg16_int8  min =  243.47  max =  249.94  avg =  245.56
            resnet50  min =   67.05  max =   70.16  avg =   68.20
       resnet50_int8  min =   76.95  max =   80.23  avg =   78.18
      squeezenet_ssd  min =   32.02  max =   33.27  avg =   32.51
 squeezenet_ssd_int8  min =   36.31  max =   38.35  avg =   37.09
       mobilenet_ssd  min =   32.02  max =   34.55  avg =   32.99
  mobilenet_ssd_int8  min =   32.31  max =   33.92  avg =   32.77
      mobilenet_yolo  min =   99.12  max =  109.81  avg =  103.00
  mobilenetv2_yolov3  min =   59.74  max =   60.95  avg =   60.21
         yolov4-tiny  min =   57.83  max =   72.15  avg =   68.75
           nanodet_m  min =   22.76  max =   22.97  avg =   22.85
    yolo-fastest-1.1  min =   13.58  max =   13.93  avg =   13.80
      yolo-fastestv2  min =   12.06  max =   12.27  avg =   12.15
  vision_transformer  min = 1274.67  max = 1597.52  avg = 1363.14
          FastestDet  min =    9.75  max =    9.86  avg =    9.81

130|bramble:/data/local/tmp $ ./benchncnn 8 4 2 0 1
[0 Adreno (TM) 620]  queueC=0[3]  queueG=0[3]  queueT=0[3]
[0 Adreno (TM) 620]  bugsbn1=1  bugbilz=0  bugcopc=0  bugihfa=0
[0 Adreno (TM) 620]  fp16-p/s/u/a=1/1/0/1  int8-p/s/u/a=1/0/0/1
[0 Adreno (TM) 620]  subgroup=64  basic/vote/ballot/shuffle=1/1/1/1
[0 Adreno (TM) 620]  fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0
loop_count = 8
num_threads = 4
powersave = 2
gpu_device = 0
cooling_down = 1
          squeezenet  min =   25.06  max =   25.80  avg =   25.53
     squeezenet_int8  min =    9.75  max =    9.82  avg =    9.78
           mobilenet  min =   43.43  max =   44.04  avg =   43.71
      mobilenet_int8  min =   11.12  max =   11.59  avg =   11.34
        mobilenet_v2  min =   32.14  max =   32.58  avg =   32.40
        mobilenet_v3  min =   32.75  max =   32.98  avg =   32.87
          shufflenet  min =   29.29  max =   29.63  avg =   29.40
       shufflenet_v2  min =   32.43  max =   33.18  avg =   32.69
             mnasnet  min =   34.58  max =   35.24  avg =   35.00
     proxylessnasnet  min =   40.61  max =   41.40  avg =   40.98
     efficientnet_b0  min =   49.44  max =   50.46  avg =   49.95
   efficientnetv2_b0  min =  185.31  max =  187.37  avg =  186.24
        regnety_400m  min =   41.43  max =   42.75  avg =   41.84
           blazeface  min =   13.47  max =   14.07  avg =   13.72
           googlenet  min =   78.12  max =   79.06  avg =   78.56
      googlenet_int8  min =   48.73  max =   50.13  avg =   49.20
            resnet18  min =   73.61  max =   74.05  avg =   73.75
       resnet18_int8  min =   21.87  max =   22.05  avg =   21.95
             alexnet  min =  128.58  max =  129.51  avg =  128.97
               vgg16  min =  437.64  max =  439.12  avg =  438.28
          vgg16_int8  min =  232.77  max =  243.06  avg =  239.54
            resnet50  min =  187.36  max =  188.47  avg =  188.01
       resnet50_int8  min =   75.79  max =   77.33  avg =   76.64
      squeezenet_ssd  min =   80.68  max =   84.50  avg =   81.93
 squeezenet_ssd_int8  min =   29.88  max =   30.77  avg =   30.30
       mobilenet_ssd  min =   94.77  max =   96.46  avg =   95.79
  mobilenet_ssd_int8  min =   29.03  max =   30.07  avg =   29.53
      mobilenet_yolo  min =  185.97  max =  188.11  avg =  186.59
  mobilenetv2_yolov3  min =  108.43  max =  164.75  avg =  121.55
         yolov4-tiny  min =  149.38  max =  158.39  avg =  153.92
           nanodet_m  min =   46.73  max =   48.85  avg =   47.73
    yolo-fastest-1.1  min =   26.32  max =   26.77  avg =   26.54
      yolo-fastestv2  min =   38.87  max =   39.31  avg =   39.13
  vision_transformer  min = 3392.80  max = 3397.79  avg = 3396.09
          FastestDet  min =   43.05  max =   43.81  avg =   43.45
```

### Qualcomm SDM660 Snapdragon 660 (Kryo 260 2.2GHz x 4 + Kryo 260 1.84GHz x 4 + Adreno 512)
```
lavender:/data/local/tmp/ncnnbench $ ./benchncnn 8 8 0 -1 1
[0 Adreno (TM) 512]  queueC=0[3]  queueG=0[3]  queueT=0[3]
[0 Adreno (TM) 512]  buglssc=0  bugsbn1=1  buglbia=0  bugihfa=0
[0 Adreno (TM) 512]  fp16p=1  fp16s=0  fp16a=0  int8s=0  int8a=0
loop_count = 8
num_threads = 8
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   29.05  max =   44.86  avg =   33.26
     squeezenet_int8  min =   35.47  max =   37.10  avg =   36.09
           mobilenet  min =   31.59  max =   33.47  avg =   32.33
      mobilenet_int8  min =   77.50  max =   91.15  avg =   82.98
        mobilenet_v2  min =   33.63  max =   35.43  avg =   34.54
        mobilenet_v3  min =   29.97  max =   49.80  avg =   34.81
          shufflenet  min =   28.52  max =   30.09  avg =   29.09
       shufflenet_v2  min =   19.15  max =   21.15  avg =   19.99
             mnasnet  min =   29.91  max =   35.11  avg =   31.46
     proxylessnasnet  min =   33.28  max =  117.09  avg =   55.22
     efficientnet_b0  min =   52.29  max =   57.93  avg =   55.04
        regnety_400m  min =   96.05  max =  116.42  avg =  102.07
           blazeface  min =    7.98  max =   11.83  avg =    8.89
           googlenet  min =   76.88  max =  103.99  avg =   84.54
      googlenet_int8  min =   97.68  max =  118.56  avg =  104.92
            resnet18  min =   75.93  max =   89.31  avg =   80.00
       resnet18_int8  min =   73.27  max =   80.84  avg =   76.19
             alexnet  min =   90.94  max =  114.57  avg =   96.42
               vgg16  min =  381.30  max =  615.62  avg =  555.96
          vgg16_int8  min =  803.75  max = 1126.53  avg =  886.03
            resnet50  min =  257.38  max =  285.19  avg =  266.59
       resnet50_int8  min =  304.81  max =  338.01  avg =  314.84
      squeezenet_ssd  min =  117.59  max =  145.79  avg =  123.79
 squeezenet_ssd_int8  min =  132.80  max =  163.00  avg =  149.99
       mobilenet_ssd  min =  103.98  max =  126.90  avg =  113.10
  mobilenet_ssd_int8  min =  167.86  max =  188.46  avg =  180.56
      mobilenet_yolo  min =  201.75  max =  263.92  avg =  240.17
  mobilenetv2_yolov3  min =  143.76  max =  167.77  avg =  151.94

lavender:/data/local/tmp/ncnnbench $ ./benchncnn 4 1 2 -1 1
[0 Adreno (TM) 512]  queueC=0[3]  queueG=0[3]  queueT=0[3]
[0 Adreno (TM) 512]  buglssc=0  bugsbn1=1  buglbia=0  bugihfa=0
[0 Adreno (TM) 512]  fp16p=1  fp16s=0  fp16a=0  int8s=0  int8a=0
loop_count = 4
num_threads = 1
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =   69.75  max =   71.33  avg =   70.38
     squeezenet_int8  min =   67.12  max =   68.07  avg =   67.59
           mobilenet  min =  107.65  max =  110.48  avg =  108.82
      mobilenet_int8  min =  163.13  max =  164.74  avg =  164.24
        mobilenet_v2  min =   75.50  max =   77.36  avg =   76.38
        mobilenet_v3  min =   59.05  max =   59.36  avg =   59.23
          shufflenet  min =   38.33  max =   38.74  avg =   38.57
       shufflenet_v2  min =   37.43  max =   38.97  avg =   38.32
             mnasnet  min =   69.29  max =   73.20  avg =   70.73
     proxylessnasnet  min =   80.81  max =   82.66  avg =   81.52
     efficientnet_b0  min =  151.20  max =  152.38  avg =  151.72
        regnety_400m  min =   93.53  max =   94.53  avg =   94.19
           blazeface  min =   12.15  max =   12.82  avg =   12.46
           googlenet  min =  239.63  max =  242.64  avg =  241.06
      googlenet_int8  min =  214.71  max =  216.53  avg =  215.79
            resnet18  min =  234.20  max =  238.74  avg =  236.90
       resnet18_int8  min =  181.57  max =  183.97  avg =  182.66
             alexnet  min =  205.94  max =  207.44  avg =  206.63
               vgg16  min = 1188.14  max = 1201.95  avg = 1196.93
          vgg16_int8  min = 1081.21  max = 1087.84  avg = 1085.17
            resnet50  min =  556.54  max =  566.68  avg =  561.21
       resnet50_int8  min =  433.19  max =  433.93  avg =  433.48
      squeezenet_ssd  min =  169.02  max =  170.54  avg =  169.73
 squeezenet_ssd_int8  min =  176.28  max =  177.90  avg =  176.87
       mobilenet_ssd  min =  228.15  max =  232.69  avg =  230.38
  mobilenet_ssd_int8  min =  236.97  max =  239.69  avg =  238.35
      mobilenet_yolo  min =  493.33  max =  506.34  avg =  499.79
  mobilenetv2_yolov3  min =  252.53  max =  261.58  avg =  256.30

lavender:/data/local/tmp/ncnnbench $ ./benchncnn 4 1 2 0 1
[0 Adreno (TM) 512]  queueC=0[3]  queueG=0[3]  queueT=0[3]
[0 Adreno (TM) 512]  buglssc=0  bugsbn1=1  buglbia=0  bugihfa=0
[0 Adreno (TM) 512]  fp16p=1  fp16s=0  fp16a=0  int8s=0  int8a=0
loop_count = 4
num_threads = 1
powersave = 2
gpu_device = 0
cooling_down = 1
          squeezenet  min =   34.49  max =   34.65  avg =   34.55
           mobilenet  min =   54.45  max =   55.52  avg =   54.75
        mobilenet_v2  min =   39.32  max =   39.58  avg =   39.50
        mobilenet_v3  min =   36.13  max =   36.28  avg =   36.19
          shufflenet  min =   35.25  max =   35.42  avg =   35.31
       shufflenet_v2  min =   31.38  max =   31.70  avg =   31.53
             mnasnet  min =   40.95  max =   41.32  avg =   41.13
     proxylessnasnet  min =   43.81  max =   44.05  avg =   43.90
     efficientnet_b0  min =   68.34  max =   68.56  avg =   68.47
        regnety_400m  min =   53.89  max =   54.23  avg =   54.02
           blazeface  min =   19.82  max =   27.74  avg =   22.01
           googlenet  min =  119.46  max =  119.98  avg =  119.80
            resnet18  min =  115.56  max =  120.28  avg =  116.88
             alexnet  min =  102.06  max =  105.56  avg =  102.97
               vgg16  min = 1192.29  max = 1202.17  avg = 1197.03
            resnet50  min =  294.87  max =  298.79  avg =  296.05
      squeezenet_ssd  min =  167.85  max =  168.42  avg =  168.09
       mobilenet_ssd  min =  120.30  max =  120.37  avg =  120.34
      mobilenet_yolo  min =  256.60  max =  260.21  avg =  257.54
  mobilenetv2_yolov3  min =  121.48  max =  125.22  avg =  122.53
```

### Qualcomm MSM8996 Pro Snapdragon 821 (Kryo 2.35GHz x 2 + Kryo 2.19GHz x 2)
```
natrium:/data/local/tmp # ./benchncnn 8 4 0 -1 1
loop_count = 8
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   18.46  max =   19.12  avg =   18.78
     squeezenet_int8  min =   16.69  max =   17.22  avg =   16.95
           mobilenet  min =   27.33  max =   28.74  avg =   27.88
      mobilenet_int8  min =   20.14  max =   20.71  avg =   20.46
        mobilenet_v2  min =   21.94  max =   23.09  avg =   22.38
        mobilenet_v3  min =   18.81  max =   19.45  avg =   19.04
          shufflenet  min =   14.07  max =   14.75  avg =   14.29
       shufflenet_v2  min =   11.52  max =   11.92  avg =   11.71
             mnasnet  min =   20.41  max =   21.75  avg =   20.74
     proxylessnasnet  min =   22.99  max =   23.63  avg =   23.13
     efficientnet_b0  min =   34.74  max =   35.26  avg =   34.91
   efficientnetv2_b0  min =   41.16  max =   41.60  avg =   41.39
        regnety_400m  min =   44.27  max =   45.01  avg =   44.69
           blazeface  min =    4.25  max =    4.71  avg =    4.43
           googlenet  min =   54.88  max =   55.55  avg =   55.12
      googlenet_int8  min =   51.88  max =   52.72  avg =   52.25
            resnet18  min =   44.33  max =   45.44  avg =   44.88
       resnet18_int8  min =   51.24  max =   51.94  avg =   51.54
             alexnet  min =   38.62  max =   39.31  avg =   38.88
               vgg16  min =  242.53  max =  244.23  avg =  243.16
          vgg16_int8  min =  183.15  max =  204.96  avg =  192.16
            resnet50  min =  122.14  max =  124.29  avg =  122.94
       resnet50_int8  min =  116.61  max =  118.47  avg =  117.56
      squeezenet_ssd  min =   47.92  max =   49.01  avg =   48.45
 squeezenet_ssd_int8  min =   43.21  max =   44.45  avg =   43.76
       mobilenet_ssd  min =   56.92  max =   58.21  avg =   57.56
  mobilenet_ssd_int8  min =   42.26  max =   42.92  avg =   42.48
      mobilenet_yolo  min =  126.20  max =  128.50  avg =  127.10
  mobilenetv2_yolov3  min =   75.49  max =   76.50  avg =   76.01
         yolov4-tiny  min =   94.24  max =   95.75  avg =   94.83
           nanodet_m  min =   31.30  max =   31.93  avg =   31.62
    yolo-fastest-1.1  min =   16.89  max =   17.56  avg =   17.23
      yolo-fastestv2  min =   12.97  max =   13.50  avg =   13.15

natrium:/data/local/tmp # ./benchncnn 4 1 0 -1 1
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   46.27  max =   46.60  avg =   46.45
     squeezenet_int8  min =   41.33  max =   41.73  avg =   41.56
           mobilenet  min =   80.89  max =   81.16  avg =   81.00
      mobilenet_int8  min =   60.33  max =   62.29  avg =   61.33
        mobilenet_v2  min =   51.78  max =   52.02  avg =   51.88
        mobilenet_v3  min =   43.71  max =   44.17  avg =   43.91
          shufflenet  min =   24.96  max =   25.08  avg =   25.02
       shufflenet_v2  min =   24.09  max =   24.26  avg =   24.17
             mnasnet  min =   51.28  max =   51.42  avg =   51.35
     proxylessnasnet  min =   59.25  max =   59.66  avg =   59.48
     efficientnet_b0  min =   92.16  max =   92.34  avg =   92.22
   efficientnetv2_b0  min =  112.27  max =  113.63  avg =  113.17
        regnety_400m  min =   68.59  max =   68.85  avg =   68.75
           blazeface  min =    7.36  max =    7.83  avg =    7.59
           googlenet  min =  151.15  max =  151.53  avg =  151.37
      googlenet_int8  min =  152.01  max =  158.63  avg =  154.18
            resnet18  min =  121.49  max =  121.90  avg =  121.77
       resnet18_int8  min =  154.54  max =  166.73  avg =  161.30
             alexnet  min =   97.41  max =   97.74  avg =   97.62
               vgg16  min =  674.80  max =  675.86  avg =  675.38
          vgg16_int8  min =  593.42  max =  602.98  avg =  596.93
            resnet50  min =  360.44  max =  364.31  avg =  362.01
       resnet50_int8  min =  371.21  max =  386.24  avg =  381.53
      squeezenet_ssd  min =   97.72  max =   98.32  avg =   98.01
 squeezenet_ssd_int8  min =   98.33  max =   99.15  avg =   98.63
       mobilenet_ssd  min =  161.72  max =  161.89  avg =  161.79
  mobilenet_ssd_int8  min =  122.44  max =  123.38  avg =  123.00
      mobilenet_yolo  min =  367.34  max =  369.59  avg =  368.97
  mobilenetv2_yolov3  min =  190.09  max =  190.77  avg =  190.31
         yolov4-tiny  min =  241.59  max =  242.29  avg =  241.81
           nanodet_m  min =   63.03  max =   63.22  avg =   63.12
    yolo-fastest-1.1  min =   29.06  max =   29.22  avg =   29.12
      yolo-fastestv2  min =   22.72  max =   22.80  avg =   22.77
```

### Qualcomm MSM8994 Snapdragon 810 (Cortex-A57 2.0GHz x 4 + Cortex-A53 1.55GHz x 4 + Adreno 430)
```
angler:/data/local/tmp $ ./benchncnn 8 8 0 -1 1
loop_count = 8
num_threads = 8
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   25.83  max =   29.17  avg =   27.69
     squeezenet_int8  min =   24.18  max =   26.31  avg =   25.18
           mobilenet  min =   33.94  max =   35.29  avg =   34.44
      mobilenet_int8  min =   24.99  max =   26.12  avg =   25.46
        mobilenet_v2  min =   32.63  max =   34.44  avg =   33.56
        mobilenet_v3  min =   27.72  max =   30.14  avg =   29.35
          shufflenet  min =   23.23  max =   26.78  avg =   24.58
       shufflenet_v2  min =   21.04  max =   22.25  avg =   21.68
             mnasnet  min =   29.51  max =   31.26  avg =   30.27
     proxylessnasnet  min =   34.21  max =   37.55  avg =   35.20
     efficientnet_b0  min =   54.75  max =   60.45  avg =   56.38
   efficientnetv2_b0  min =   63.60  max =   67.51  avg =   64.81
        regnety_400m  min =   60.80  max =   72.33  avg =   68.27
           blazeface  min =    5.96  max =    7.22  avg =    6.41
           googlenet  min =   80.62  max =   94.46  avg =   86.50
      googlenet_int8  min =   69.05  max =   75.75  avg =   71.47
            resnet18  min =   63.90  max =   75.96  avg =   69.64
       resnet18_int8  min =   46.43  max =   62.23  avg =   53.22
             alexnet  min =   82.67  max =   90.25  avg =   87.03
               vgg16  min =  562.23  max =  636.26  avg =  594.82
          vgg16_int8  min =  303.42  max =  358.03  avg =  325.60
            resnet50  min =  233.47  max =  279.99  avg =  248.49
       resnet50_int8  min =  170.11  max =  198.27  avg =  183.35
      squeezenet_ssd  min =   86.97  max =  112.21  avg =   96.84
 squeezenet_ssd_int8  min =   66.09  max =   77.00  avg =   70.57
       mobilenet_ssd  min =   76.95  max =  101.74  avg =   87.73
  mobilenet_ssd_int8  min =   53.27  max =   60.50  avg =   57.46
      mobilenet_yolo  min =  206.42  max =  260.06  avg =  227.84
  mobilenetv2_yolov3  min =  129.32  max =  147.76  avg =  138.90
         yolov4-tiny  min =  184.85  max =  213.03  avg =  203.52
           nanodet_m  min =   47.66  max =   60.55  avg =   53.00

angler:/data/local/tmp # ./benchncnn 4 4 2 -1 1
loop_count = 4
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =   41.39  max =   47.64  avg =   43.08
     squeezenet_int8  min =   36.92  max =   37.59  avg =   37.24
           mobilenet  min =   59.04  max =   59.43  avg =   59.22
      mobilenet_int8  min =   44.67  max =   46.60  avg =   45.58
        mobilenet_v2  min =   43.38  max =   43.71  avg =   43.62
        mobilenet_v3  min =   37.57  max =   37.82  avg =   37.65
          shufflenet  min =   30.67  max =   30.86  avg =   30.76
       shufflenet_v2  min =   27.80  max =   28.12  avg =   27.97
             mnasnet  min =   42.99  max =   46.41  avg =   44.21
     proxylessnasnet  min =   51.26  max =   53.52  avg =   52.04
     efficientnet_b0  min =   81.58  max =   82.30  avg =   82.03
   efficientnetv2_b0  min =   94.01  max =   94.48  avg =   94.27
        regnety_400m  min =   82.38  max =   83.86  avg =   82.95
           blazeface  min =   10.02  max =   10.42  avg =   10.18
           googlenet  min =  125.47  max =  126.72  avg =  125.92
      googlenet_int8  min =  109.92  max =  111.65  avg =  110.44
            resnet18  min =  110.14  max =  111.95  avg =  110.76
       resnet18_int8  min =   78.21  max =   79.65  avg =   79.07
             alexnet  min =   78.09  max =   80.34  avg =   78.87
               vgg16  min =  486.69  max =  494.97  avg =  490.35
          vgg16_int8  min =  370.66  max =  377.64  avg =  373.78
            resnet50  min =  272.31  max =  278.64  avg =  274.10
       resnet50_int8  min =  215.57  max =  218.55  avg =  217.27
      squeezenet_ssd  min =  112.98  max =  114.75  avg =  113.60
 squeezenet_ssd_int8  min =   91.85  max =   94.82  avg =   93.13
       mobilenet_ssd  min =  115.18  max =  116.56  avg =  115.95
  mobilenet_ssd_int8  min =   90.95  max =   92.21  avg =   91.39
      mobilenet_yolo  min =  255.07  max =  259.01  avg =  256.18
  mobilenetv2_yolov3  min =  155.52  max =  156.58  avg =  156.09
         yolov4-tiny  min =  231.89  max =  234.14  avg =  232.97
           nanodet_m  min =   72.74  max =   74.71  avg =   73.52
    yolo-fastest-1.1  min =   35.25  max =   36.51  avg =   35.77
      yolo-fastestv2  min =   29.94  max =   31.09  avg =   30.75

angler:/data/local/tmp # ./benchncnn 4 1 2 -1 1
loop_count = 4
num_threads = 1
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =   70.83  max =   72.68  avg =   71.77
     squeezenet_int8  min =   59.27  max =   59.60  avg =   59.51
           mobilenet  min =  110.70  max =  112.72  avg =  111.48
      mobilenet_int8  min =   79.69  max =   80.01  avg =   79.81
        mobilenet_v2  min =   77.85  max =   78.19  avg =   78.03
        mobilenet_v3  min =   63.49  max =   63.92  avg =   63.73
          shufflenet  min =   41.43  max =   41.60  avg =   41.49
       shufflenet_v2  min =   37.49  max =   38.26  avg =   37.97
             mnasnet  min =   73.91  max =   75.91  avg =   74.59
     proxylessnasnet  min =   94.13  max =   94.53  avg =   94.37
     efficientnet_b0  min =  161.91  max =  162.38  avg =  162.10
   efficientnetv2_b0  min =  179.33  max =  180.26  avg =  179.67
        regnety_400m  min =  100.35  max =  100.76  avg =  100.53
           blazeface  min =   12.57  max =   12.76  avg =   12.66
           googlenet  min =  232.77  max =  233.08  avg =  232.91
      googlenet_int8  min =  203.39  max =  205.25  avg =  204.77
            resnet18  min =  182.58  max =  183.17  avg =  182.91
       resnet18_int8  min =  150.40  max =  152.07  avg =  151.35
             alexnet  min =  147.27  max =  149.00  avg =  148.06
               vgg16  min =  986.93  max =  988.35  avg =  987.47
          vgg16_int8  min =  816.37  max =  819.93  avg =  817.79
            resnet50  min =  502.77  max =  510.88  avg =  508.53
       resnet50_int8  min =  393.33  max =  398.07  avg =  395.86
      squeezenet_ssd  min =  175.01  max =  175.61  avg =  175.32
 squeezenet_ssd_int8  min =  145.19  max =  145.94  avg =  145.66
       mobilenet_ssd  min =  231.04  max =  231.25  avg =  231.13
  mobilenet_ssd_int8  min =  159.81  max =  160.52  avg =  160.13
      mobilenet_yolo  min =  517.86  max =  523.71  avg =  521.85
  mobilenetv2_yolov3  min =  275.84  max =  279.16  avg =  277.13
         yolov4-tiny  min =  363.71  max =  366.14  avg =  364.56
           nanodet_m  min =   93.90  max =   95.09  avg =   94.40
    yolo-fastest-1.1  min =   45.94  max =   46.09  avg =   46.01
      yolo-fastestv2  min =   38.23  max =   38.33  avg =   38.29

angler:/data/local/tmp $ ./benchncnn 4 1 2 0 1
[0 Adreno (TM) 430]  queueC=0[3]  queueG=0[3]  queueT=0[3]
[0 Adreno (TM) 430]  buglssc=0  bugsbn1=1  buglbia=0  bugihfa=0
[0 Adreno (TM) 430]  fp16p=1  fp16s=0  fp16a=0  int8s=0  int8a=0
loop_count = 4
num_threads = 1
powersave = 2
gpu_device = 0
cooling_down = 1
          squeezenet  min =   39.49  max =   41.93  avg =   40.62
           mobilenet  min =   60.30  max =   61.81  avg =   60.88
        mobilenet_v2  min =   45.38  max =   47.10  avg =   45.88
        mobilenet_v3  min =   45.97  max =   47.39  avg =   46.69
          shufflenet  min =   29.12  max =   31.02  avg =   29.91
       shufflenet_v2  min =   47.58  max =   50.06  avg =   48.26
             mnasnet  min =   47.84  max =   49.17  avg =   48.26
     proxylessnasnet  min =   49.51  max =   51.03  avg =   49.97
     efficientnet_b0  min =  100.56  max =  105.60  avg =  102.45
        regnety_400m  min =   59.67  max =   61.24  avg =   60.56
           blazeface  min =   13.87  max =   13.98  avg =   13.93
           googlenet  min =  131.26  max =  136.33  avg =  133.40
            resnet18  min =  116.38  max =  117.92  avg =  116.93
             alexnet  min =   72.59  max =   73.94  avg =   73.29
               vgg16  min = 1090.07  max = 1101.71  avg = 1096.34
            resnet50  min =  299.76  max =  300.78  avg =  300.40
      squeezenet_ssd  min =  181.95  max =  182.83  avg =  182.39
       mobilenet_ssd  min =  148.44  max =  151.07  avg =  149.75
      mobilenet_yolo  min =  284.46  max =  285.74  avg =  285.39
  mobilenetv2_yolov3  min =  140.28  max =  148.62  avg =  144.83
```

### Qualcomm MSM8916 Snapdragon 410 (Cortex-A53 1.2GHz x 4)
```
HM2014812:/data/local/tmp # ./benchncnn 8 4 0 -1 1
loop_count = 8
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   65.45  max =   73.59  avg =   68.10
     squeezenet_int8  min =   59.39  max =   65.54  avg =   61.14
           mobilenet  min =   86.69  max =   94.10  avg =   90.03
      mobilenet_int8  min =   62.22  max =   69.67  avg =   64.13
        mobilenet_v2  min =   77.98  max =   89.53  avg =   82.00
        mobilenet_v3  min =   62.17  max =   68.31  avg =   63.90
          shufflenet  min =   47.52  max =   53.76  avg =   49.92
       shufflenet_v2  min =   39.77  max =   46.08  avg =   40.66
             mnasnet  min =   69.27  max =   75.73  avg =   71.73
     proxylessnasnet  min =   78.72  max =   85.37  avg =   81.33
     efficientnet_b0  min =  126.62  max =  136.67  avg =  130.69
   efficientnetv2_b0  min =  143.24  max =  150.97  avg =  146.89
        regnety_400m  min =  108.79  max =  116.22  avg =  112.99
           blazeface  min =   14.85  max =   15.02  avg =   14.94
           googlenet  min =  180.91  max =  190.37  avg =  186.36
      googlenet_int8  min =  160.07  max =  170.86  avg =  165.05
            resnet18  min =  137.91  max =  155.37  avg =  144.99
       resnet18_int8  min =  104.34  max =  110.20  avg =  106.76
             alexnet  min =  105.30  max =  114.73  avg =  109.53
               vgg16  min =  829.16  max =  942.94  avg =  853.28
          vgg16_int8  min =  515.61  max =  547.32  avg =  526.50
            resnet50  min =  380.46  max =  443.90  avg =  393.71
       resnet50_int8  min =  318.06  max =  327.13  avg =  323.23
      squeezenet_ssd  min =  178.22  max =  189.02  avg =  184.51
 squeezenet_ssd_int8  min =  153.75  max =  163.44  avg =  158.05
       mobilenet_ssd  min =  189.45  max =  195.17  avg =  193.10
  mobilenet_ssd_int8  min =  132.59  max =  139.63  avg =  137.23
      mobilenet_yolo  min =  404.52  max =  414.20  avg =  409.97
  mobilenetv2_yolov3  min =  271.33  max =  279.98  avg =  275.08
         yolov4-tiny  min =  349.36  max =  372.54  avg =  357.98
           nanodet_m  min =  103.01  max =  111.71  avg =  105.82

HM2014812:/data/local/tmp # ./benchncnn 4 1 0 -1 1
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  147.48  max =  149.35  avg =  148.40
     squeezenet_int8  min =  143.20  max =  144.55  avg =  143.98
           mobilenet  min =  243.78  max =  244.33  avg =  244.08
      mobilenet_int8  min =  206.23  max =  207.13  avg =  206.55
        mobilenet_v2  min =  168.04  max =  170.37  avg =  169.06
        mobilenet_v3  min =  147.10  max =  147.91  avg =  147.55
          shufflenet  min =   88.47  max =   89.31  avg =   88.85
       shufflenet_v2  min =   84.47  max =   84.80  avg =   84.60
             mnasnet  min =  162.81  max =  163.93  avg =  163.22
     proxylessnasnet  min =  208.18  max =  209.15  avg =  208.61
     efficientnet_b0  min =  370.06  max =  371.14  avg =  370.64
   efficientnetv2_b0  min =  418.28  max =  429.68  avg =  423.01
        regnety_400m  min =  216.42  max =  217.19  avg =  216.71
           blazeface  min =   27.63  max =   28.67  avg =   28.00
           googlenet  min =  525.25  max =  528.83  avg =  526.23
      googlenet_int8  min =  469.78  max =  472.51  avg =  470.76
            resnet18  min =  396.46  max =  399.66  avg =  397.57
       resnet18_int8  min =  324.07  max =  326.64  avg =  325.34
             alexnet  min =  362.44  max =  363.02  avg =  362.68
               vgg16  min = 2174.86  max = 2252.92  avg = 2215.62
          vgg16_int8  min = 1726.07  max = 1732.69  avg = 1729.18
            resnet50  min = 1136.96  max = 1142.94  avg = 1139.91
       resnet50_int8  min =  977.73  max =  983.64  avg =  980.71
      squeezenet_ssd  min =  350.46  max =  353.35  avg =  351.37
 squeezenet_ssd_int8  min =  333.91  max =  336.59  avg =  334.77
       mobilenet_ssd  min =  513.18  max =  519.05  avg =  516.22
  mobilenet_ssd_int8  min =  424.37  max =  426.89  avg =  426.03
      mobilenet_yolo  min = 1143.20  max = 1145.04  avg = 1144.31
  mobilenetv2_yolov3  min =  617.45  max =  619.30  avg =  618.37
         yolov4-tiny  min =  839.32  max =  847.57  avg =  844.61
           nanodet_m  min =  208.41  max =  211.31  avg =  210.03
```

### Qualcomm Snapdragon 888 (Cortex-X1 2.84GHz x 1 + Cortex-A78 2.4GHz x 3 + Cortex-A55 1.8GHz x 4 + Adreno 660)
```
venus:/data/local/tmp $ ./benchncnn 8 8 2 -1 1
loop_count = 8
num_threads = 8
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =    5.89  max =    6.04  avg =    5.98
     squeezenet_int8  min =    6.09  max =    6.29  avg =    6.25
           mobilenet  min =    9.27  max =   10.22  avg =    9.64
      mobilenet_int8  min =    5.90  max =    6.05  avg =    5.97
        mobilenet_v2  min =    6.87  max =    8.42  avg =    7.63
        mobilenet_v3  min =    8.93  max =   12.22  avg =    9.55
          shufflenet  min =    8.72  max =   11.44  avg =    9.20
       shufflenet_v2  min =    6.05  max =    8.24  avg =    7.40
             mnasnet  min =    7.83  max =    9.03  avg =    8.53
     proxylessnasnet  min =    7.03  max =    9.62  avg =    7.88
     efficientnet_b0  min =   12.62  max =   18.01  avg =   15.51
   efficientnetv2_b0  min =   14.96  max =   23.75  avg =   19.61
        regnety_400m  min =   23.58  max =   23.87  avg =   23.72
           blazeface  min =    4.62  max =    4.87  avg =    4.73
           googlenet  min =   17.23  max =   25.41  avg =   19.83
      googlenet_int8  min =   16.91  max =   17.05  avg =   16.99
            resnet18  min =   12.05  max =   14.90  avg =   13.47
       resnet18_int8  min =   15.10  max =   15.42  avg =   15.27
             alexnet  min =   13.85  max =   15.73  avg =   14.50
               vgg16  min =   56.85  max =   57.88  avg =   57.32
          vgg16_int8  min =   70.12  max =   72.99  avg =   71.53
            resnet50  min =   29.45  max =   29.78  avg =   29.64
       resnet50_int8  min =   24.99  max =   25.31  avg =   25.16
      squeezenet_ssd  min =   17.51  max =   22.63  avg =   19.25
 squeezenet_ssd_int8  min =   16.81  max =   17.26  avg =   16.98
       mobilenet_ssd  min =   15.96  max =   16.52  avg =   16.11
  mobilenet_ssd_int8  min =   13.70  max =   14.26  avg =   13.95
      mobilenet_yolo  min =   50.48  max =   52.88  avg =   51.76
  mobilenetv2_yolov3  min =   22.63  max =   22.99  avg =   22.85
         yolov4-tiny  min =   29.01  max =   38.20  avg =   32.50
           nanodet_m  min =   12.58  max =   15.53  avg =   13.86
    yolo-fastest-1.1  min =    8.57  max =    9.18  avg =    8.86
      yolo-fastestv2  min =    6.85  max =    8.47  avg =    8.05
  vision_transformer  min =  548.48  max =  703.29  avg =  614.47
          FastestDet  min =    7.71  max =    9.31  avg =    8.15
          
venus:/data/local/tmp $ ./benchncnn 8 8 2 0 1
./benchncnn 8 8 2 0 1
[0 Adreno (TM) 660]  queueC=0[3]  queueG=0[3]  queueT=0[3]
[0 Adreno (TM) 660]  bugsbn1=1  bugbilz=0  bugcopc=0  bugihfa=0
[0 Adreno (TM) 660]  fp16-p/s/u/a=1/1/0/1  int8-p/s/u/a=1/0/0/1
[0 Adreno (TM) 660]  subgroup=64  basic/vote/ballot/shuffle=1/1/1/1
[0 Adreno (TM) 660]  fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0
loop_count = 8
num_threads = 8
powersave = 2
gpu_device = 0
cooling_down = 1
          squeezenet  min =   10.63  max =   12.41  avg =   11.80
     squeezenet_int8  min =    6.93  max =    8.82  avg =    7.86
           mobilenet  min =   12.79  max =   14.12  avg =   13.48
      mobilenet_int8  min =    9.18  max =    9.70  avg =    9.44
        mobilenet_v2  min =   14.73  max =   15.62  avg =   15.13
        mobilenet_v3  min =   14.68  max =   16.72  avg =   15.70
          shufflenet  min =   11.28  max =   12.75  avg =   12.17
       shufflenet_v2  min =   11.44  max =   14.27  avg =   12.07
             mnasnet  min =   14.54  max =   15.94  avg =   15.35
     proxylessnasnet  min =   16.33  max =   17.31  avg =   16.71
     efficientnet_b0  min =   22.64  max =   25.42  avg =   24.35
   efficientnetv2_b0  min =   41.16  max =   52.08  avg =   45.61
        regnety_400m  min =   17.56  max =   18.08  avg =   17.85
           blazeface  min =    2.87  max =    3.89  avg =    3.34
           googlenet  min =   31.64  max =   33.38  avg =   32.14
      googlenet_int8  min =   18.29  max =   19.15  avg =   18.73
            resnet18  min =   23.47  max =   24.60  avg =   23.85
       resnet18_int8  min =   11.89  max =   17.17  avg =   14.54
             alexnet  min =   25.62  max =   26.23  avg =   25.98
               vgg16  min =   41.81  max =   42.69  avg =   42.12
          vgg16_int8  min =   79.43  max =  123.88  avg =   93.17
            resnet50  min =   41.28  max =   43.27  avg =   41.79
       resnet50_int8  min =   25.55  max =   26.34  avg =   25.97
      squeezenet_ssd  min =   30.10  max =   33.64  avg =   31.39
 squeezenet_ssd_int8  min =   18.12  max =   18.58  avg =   18.30
       mobilenet_ssd  min =   28.29  max =   28.90  avg =   28.66
  mobilenet_ssd_int8  min =   13.90  max =   14.31  avg =   14.02
      mobilenet_yolo  min =   43.88  max =   45.43  avg =   44.58
  mobilenetv2_yolov3  min =   16.49  max =   37.05  avg =   19.32
         yolov4-tiny  min =   22.70  max =   50.58  avg =   34.92
           nanodet_m  min =   19.31  max =   19.88  avg =   19.57
    yolo-fastest-1.1  min =   11.17  max =   11.33  avg =   11.26
      yolo-fastestv2  min =    9.72  max =   10.04  avg =    9.85
  vision_transformer  min =  744.98  max =  758.15  avg =  751.62
          FastestDet  min =   11.95  max =   13.12  avg =   12.46
```

### Qualcomm Snapdragon X Elite (X1E78100), Oryon 3.4GHz x 12 + Adreno X1-85

Test on Oryon CPU

```
loop_count = 10
num_threads = 12
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    5.13  max =    5.19  avg =    5.16
     squeezenet_int8  min =    4.31  max =    4.81  avg =    4.67
           mobilenet  min =    3.73  max =    3.85  avg =    3.78
      mobilenet_int8  min =    2.51  max =    3.11  avg =    2.64
        mobilenet_v2  min =    3.55  max =    3.70  avg =    3.60
        mobilenet_v3  min =    3.28  max =    3.88  avg =    3.40
          shufflenet  min =    3.77  max =    5.07  avg =    4.02
       shufflenet_v2  min =    3.24  max =    3.34  avg =    3.29
             mnasnet  min =    3.49  max =    4.09  avg =    3.58
     proxylessnasnet  min =    4.30  max =    4.93  avg =    4.41
     efficientnet_b0  min =    4.97  max =   17.26  avg =    6.28
   efficientnetv2_b0  min =    6.85  max =   10.19  avg =    7.39
        regnety_400m  min =   11.26  max =   11.36  avg =   11.31
           blazeface  min =    1.43  max =    1.48  avg =    1.44
           googlenet  min =    9.84  max =    9.96  avg =    9.89
      googlenet_int8  min =    8.04  max =    8.33  avg =    8.13
            resnet18  min =    6.63  max =    9.34  avg =    6.94
       resnet18_int8  min =    5.47  max =    6.24  avg =    5.59
             alexnet  min =    7.52  max =    7.61  avg =    7.54
               vgg16  min =   29.66  max =   32.27  avg =   30.07
          vgg16_int8  min =   32.97  max =   34.43  avg =   33.32
            resnet50  min =   16.54  max =   16.68  avg =   16.63
       resnet50_int8  min =   11.12  max =   13.84  avg =   11.42
      squeezenet_ssd  min =    9.20  max =    9.77  avg =    9.39
 squeezenet_ssd_int8  min =    8.50  max =    9.17  avg =    8.73
       mobilenet_ssd  min =    8.28  max =    8.67  avg =    8.36
  mobilenet_ssd_int8  min =    5.59  max =    6.25  avg =    5.74
      mobilenet_yolo  min =   21.42  max =   22.77  avg =   21.65
  mobilenetv2_yolov3  min =   14.03  max =   14.34  avg =   14.13
         yolov4-tiny  min =   23.60  max =   23.84  avg =   23.70
           nanodet_m  min =    6.64  max =    7.40  avg =    6.77
    yolo-fastest-1.1  min =    4.14  max =    7.15  avg =    4.53
      yolo-fastestv2  min =    3.63  max =    3.70  avg =    3.66
  vision_transformer  min =  384.74  max =  415.74  avg =  391.28
          FastestDet  min =    4.29  max =    4.94  avg =    4.40
```

Test on X1-85 GPU

```
[0 Adreno X1-85]  queueC=0[1]  queueT=0[1]
[0 Adreno X1-85]  fp16-p/s/u/a=1/1/0/1  int8-p/s/u/a=1/0/0/1  bf16-p/s=1/0
[0 Adreno X1-85]  subgroup=128(64~128)  ops=1/1/1/1/1/1/1/1/1/1
[0 Adreno X1-85]  fp16-cm=0  int8-cm=0  bf16-cm=0  fp8-cm=0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =    3.23  max =    3.99  avg =    3.63
           mobilenet  min =    3.33  max =    5.86  avg =    5.20
        mobilenet_v2  min =    4.06  max =    4.77  avg =    4.52
        mobilenet_v3  min =    4.61  max =    8.12  avg =    6.60
          shufflenet  min =    3.16  max =    7.45  avg =    4.65
       shufflenet_v2  min =    3.90  max =    6.00  avg =    5.02
             mnasnet  min =    4.44  max =    5.12  avg =    4.81
     proxylessnasnet  min =    4.91  max =    7.02  avg =    6.15
     efficientnet_b0  min =    6.61  max =    7.25  avg =    7.04
   efficientnetv2_b0  min =   21.48  max =   56.52  avg =   39.03
        regnety_400m  min =    7.33  max =    7.60  avg =    7.44
           blazeface  min =    2.83  max =    4.59  avg =    4.30
           googlenet  min =   11.00  max =   12.98  avg =   12.60
            resnet18  min =   12.11  max =   14.59  avg =   13.27
             alexnet  min =   11.64  max =   12.18  avg =   11.96
               vgg16  min =   40.06  max =   45.62  avg =   42.88
            resnet50  min =   18.99  max =   21.93  avg =   20.88
      squeezenet_ssd  min =   10.95  max =   14.73  avg =   13.03
       mobilenet_ssd  min =    7.92  max =    9.75  avg =    9.46
      mobilenet_yolo  min =    9.02  max =   12.54  avg =   11.38
  mobilenetv2_yolov3  min =   12.70  max =   14.70  avg =   13.95
         yolov4-tiny  min =   25.88  max =   30.26  avg =   28.12
           nanodet_m  min =    9.38  max =   33.46  avg =   20.29
    yolo-fastest-1.1  min =    6.08  max =    6.75  avg =    6.43
      yolo-fastestv2  min =    4.50  max =    6.47  avg =    6.04
  vision_transformer  min =  184.89  max =  191.78  avg =  189.07
          FastestDet  min =    6.01  max =    7.83  avg =    6.43
```

### Raspberry Pi 3 Model B+ Broadcom BCM2837B0, Cortex-A53 (ARMv8) (1.4GHz x 4)
```
pi@raspberrypi:~/ncnn/build/benchmark $ ./benchncnn 4 4 0 -1 1
loop_count = 4
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   84.74  max =   85.60  avg =   85.22
     squeezenet_int8  min =   74.48  max =   74.80  avg =   74.68
           mobilenet  min =  107.84  max =  110.13  avg =  108.66
      mobilenet_int8  min =   66.91  max =   67.12  avg =   67.03
        mobilenet_v2  min =  110.64  max =  112.73  avg =  111.68
        mobilenet_v3  min =   85.78  max =   86.74  avg =   86.44
          shufflenet  min =   58.38  max =   60.32  avg =   59.33
       shufflenet_v2  min =   46.76  max =   47.53  avg =   47.19
             mnasnet  min =   95.53  max =   95.88  avg =   95.78
     proxylessnasnet  min =  102.24  max =  105.58  avg =  103.38
     efficientnet_b0  min =  134.87  max =  136.98  avg =  135.86
   efficientnetv2_b0  min =  146.62  max =  148.06  avg =  147.13
        regnety_400m  min =  118.60  max =  119.51  avg =  119.03
           blazeface  min =   15.42  max =   15.61  avg =   15.52
           googlenet  min =  223.78  max =  224.85  avg =  224.22
      googlenet_int8  min =  188.23  max =  190.15  avg =  189.21
            resnet18  min =  270.86  max =  272.66  avg =  271.93
       resnet18_int8  min =  159.57  max =  160.39  avg =  160.07
             alexnet  min =  157.79  max =  160.77  avg =  159.09
            resnet50  min =  583.57  max =  591.41  avg =  587.42
       resnet50_int8  min =  383.96  max =  401.37  avg =  391.87
      squeezenet_ssd  min =  247.90  max =  249.77  avg =  248.98
 squeezenet_ssd_int8  min =  191.65  max =  192.81  avg =  192.17
       mobilenet_ssd  min =  240.11  max =  241.02  avg =  240.62
  mobilenet_ssd_int8  min =  136.30  max =  137.26  avg =  136.73
      mobilenet_yolo  min =  523.59  max =  539.91  avg =  529.98
  mobilenetv2_yolov3  min =  356.44  max =  366.85  avg =  362.06
         yolov4-tiny  min =  410.25  max =  422.18  avg =  417.17
           nanodet_m  min =  114.98  max =  115.83  avg =  115.40
    yolo-fastest-1.1  min =   79.85  max =   80.83  avg =   80.28
      yolo-fastestv2  min =   62.36  max =   62.91  avg =   62.60
          FastestDet  min =   67.11  max =   68.51  avg =   67.98

pi@raspberrypi:~/ncnn/build/benchmark $ ./benchncnn 4 1 0 -1 1
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  125.34  max =  125.81  avg =  125.58
     squeezenet_int8  min =  135.56  max =  136.34  avg =  135.98
           mobilenet  min =  204.62  max =  207.06  avg =  205.65
      mobilenet_int8  min =  181.34  max =  182.46  avg =  181.91
        mobilenet_v2  min =  158.69  max =  158.94  avg =  158.80
        mobilenet_v3  min =  127.13  max =  127.31  avg =  127.23
          shufflenet  min =   84.64  max =   85.29  avg =   84.89
       shufflenet_v2  min =   74.28  max =   74.64  avg =   74.44
             mnasnet  min =  148.12  max =  148.65  avg =  148.42
     proxylessnasnet  min =  199.56  max =  201.99  avg =  200.42
     efficientnet_b0  min =  240.94  max =  241.75  avg =  241.27
   efficientnetv2_b0  min =  270.71  max =  270.90  avg =  270.83
        regnety_400m  min =  186.89  max =  187.08  avg =  187.01
           blazeface  min =   22.75  max =   23.24  avg =   22.95
           googlenet  min =  450.64  max =  450.96  avg =  450.79
      googlenet_int8  min =  424.66  max =  426.83  avg =  425.78
            resnet18  min =  379.21  max =  380.01  avg =  379.57
       resnet18_int8  min =  312.23  max =  313.21  avg =  312.68
             alexnet  min =  270.13  max =  270.88  avg =  270.55
            resnet50  min =  977.51  max =  981.89  avg =  979.75
       resnet50_int8  min =  890.77  max =  896.89  avg =  893.83
      squeezenet_ssd  min =  331.52  max =  333.47  avg =  332.46
 squeezenet_ssd_int8  min =  317.71  max =  319.64  avg =  318.62
       mobilenet_ssd  min =  425.42  max =  426.52  avg =  425.93
  mobilenet_ssd_int8  min =  370.17  max =  370.90  avg =  370.66
      mobilenet_yolo  min =  930.40  max =  932.24  avg =  931.46
  mobilenetv2_yolov3  min =  534.79  max =  543.56  avg =  539.20
         yolov4-tiny  min =  675.33  max =  676.83  avg =  676.14
           nanodet_m  min =  178.13  max =  178.98  avg =  178.64
    yolo-fastest-1.1  min =  100.83  max =  101.96  avg =  101.49
      yolo-fastestv2  min =   79.73  max =   79.94  avg =   79.84
          FastestDet  min =   89.09  max =   90.07  avg =   89.78
```

### Raspberry Pi 4 Model B Broadcom BCM2711B0, Cortex-A72 (ARMv8) (1.8GHz x 4)
```
pi@raspberrypi:~/ncnn/build/benchmark $ ./benchncnn 10 4 0 -1 1
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   46.28  max =   46.91  avg =   46.65
     squeezenet_int8  min =   42.18  max =   44.98  avg =   42.59
           mobilenet  min =   60.74  max =   61.79  avg =   61.17
      mobilenet_int8  min =   34.19  max =   34.55  avg =   34.37
        mobilenet_v2  min =   61.63  max =   62.02  avg =   61.88
        mobilenet_v3  min =   47.08  max =   48.40  avg =   47.53
          shufflenet  min =   32.91  max =   33.30  avg =   33.09
       shufflenet_v2  min =   24.37  max =   24.73  avg =   24.56
             mnasnet  min =   51.80  max =   52.14  avg =   51.98
     proxylessnasnet  min =   53.02  max =   53.58  avg =   53.32
     efficientnet_b0  min =   73.92  max =   74.44  avg =   74.19
   efficientnetv2_b0  min =   79.10  max =   79.60  avg =   79.34
        regnety_400m  min =   65.27  max =   66.12  avg =   65.70
           blazeface  min =    8.62  max =    8.75  avg =    8.69
           googlenet  min =  113.74  max =  115.14  avg =  114.35
      googlenet_int8  min =  100.87  max =  101.71  avg =  101.25
            resnet18  min =  122.27  max =  125.39  avg =  123.12
       resnet18_int8  min =   82.19  max =   94.12  avg =   83.92
             alexnet  min =   75.75  max =   78.08  avg =   76.40
               vgg16  min =  541.66  max =  552.56  avg =  547.09
          vgg16_int8  min =  391.44  max =  395.73  avg =  394.23
            resnet50  min =  261.90  max =  263.91  avg =  262.83
       resnet50_int8  min =  195.60  max =  198.08  avg =  196.65
      squeezenet_ssd  min =  127.01  max =  129.85  avg =  127.61
 squeezenet_ssd_int8  min =  104.98  max =  107.67  avg =  105.47
       mobilenet_ssd  min =  120.43  max =  123.28  avg =  121.46
  mobilenet_ssd_int8  min =   70.70  max =   72.85  avg =   71.14
      mobilenet_yolo  min =  270.89  max =  273.42  avg =  272.33
  mobilenetv2_yolov3  min =  183.85  max =  185.73  avg =  184.88
         yolov4-tiny  min =  205.95  max =  209.90  avg =  207.22
           nanodet_m  min =   68.08  max =   68.69  avg =   68.38
    yolo-fastest-1.1  min =   47.97  max =   48.20  avg =   48.06
      yolo-fastestv2  min =   37.17  max =   37.69  avg =   37.47
  vision_transformer  min = 1872.31  max = 1964.95  avg = 1909.21
          FastestDet  min =   38.39  max =   39.17  avg =   38.69

pi@raspberrypi:~/ncnn/build/benchmark $ ./benchncnn 10 1 0 -1 1
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   73.35  max =   75.10  avg =   73.96
     squeezenet_int8  min =   69.17  max =   69.66  avg =   69.42
           mobilenet  min =  123.76  max =  125.35  avg =  124.32
      mobilenet_int8  min =   84.66  max =   85.24  avg =   84.82
        mobilenet_v2  min =   92.98  max =   94.05  avg =   93.48
        mobilenet_v3  min =   72.48  max =   73.14  avg =   72.81
          shufflenet  min =   47.17  max =   47.83  avg =   47.51
       shufflenet_v2  min =   41.62  max =   42.60  avg =   42.12
             mnasnet  min =   83.60  max =   84.35  avg =   83.98
     proxylessnasnet  min =   98.48  max =   99.33  avg =   98.78
     efficientnet_b0  min =  129.45  max =  130.02  avg =  129.73
   efficientnetv2_b0  min =  155.06  max =  156.70  avg =  155.76
        regnety_400m  min =  105.39  max =  106.03  avg =  105.70
           blazeface  min =   12.54  max =   12.84  avg =   12.65
           googlenet  min =  235.38  max =  236.34  avg =  235.94
      googlenet_int8  min =  209.63  max =  210.39  avg =  210.00
            resnet18  min =  190.80  max =  191.43  avg =  191.10
       resnet18_int8  min =  157.92  max =  158.97  avg =  158.50
             alexnet  min =  139.34  max =  139.44  avg =  139.40
               vgg16  min = 1066.58  max = 1079.30  avg = 1071.85
          vgg16_int8  min =  866.15  max =  873.75  avg =  869.84
            resnet50  min =  533.15  max =  535.12  avg =  534.11
       resnet50_int8  min =  423.72  max =  424.24  avg =  423.96
      squeezenet_ssd  min =  178.90  max =  179.53  avg =  179.30
 squeezenet_ssd_int8  min =  157.05  max =  159.06  avg =  157.89
       mobilenet_ssd  min =  250.71  max =  251.26  avg =  251.00
  mobilenet_ssd_int8  min =  170.21  max =  170.96  avg =  170.56
      mobilenet_yolo  min =  557.48  max =  560.08  avg =  558.80
  mobilenetv2_yolov3  min =  301.60  max =  307.98  avg =  306.52
         yolov4-tiny  min =  370.55  max =  375.69  avg =  372.99
           nanodet_m  min =  103.05  max =  103.74  avg =  103.45
    yolo-fastest-1.1  min =   56.58  max =   57.44  avg =   57.01
      yolo-fastestv2  min =   46.69  max =   47.34  avg =   47.03
  vision_transformer  min = 6605.19  max = 6606.66  avg = 6605.73
          FastestDet  min =   52.11  max =   52.97  avg =   52.61
```

### Raspberry Pi 5 Broadcom BCM2712, Cortex-A76 (ARMv8) (2.4GHz x 4)
```
pi@raspberrypi:~/ncnn/benchmark $ ./benchncnn 10 4 0 -1 -1 >> text.out
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =    6.74  max =    8.16  avg =    7.38
     squeezenet_int8  min =    6.97  max =    7.67  avg =    7.21
           mobilenet  min =    9.00  max =   72.98  avg =   33.88
      mobilenet_int8  min =    8.68  max =    8.80  avg =    8.74
        mobilenet_v2  min =   10.46  max =   10.63  avg =   10.52
        mobilenet_v3  min =    7.30  max =    7.44  avg =    7.35
          shufflenet  min =    4.14  max =    4.18  avg =    4.16
       shufflenet_v2  min =    3.37  max =    3.41  avg =    3.39
             mnasnet  min =    6.83  max =    8.55  avg =    7.10
     proxylessnasnet  min =    7.85  max =    7.97  avg =    7.88
     efficientnet_b0  min =   12.28  max =   12.37  avg =   12.33
   efficientnetv2_b0  min =   13.54  max =   13.84  avg =   13.69
        regnety_400m  min =   10.93  max =   11.07  avg =   10.99
           blazeface  min =    1.45  max =    1.48  avg =    1.47
           googlenet  min =   25.13  max =   25.47  avg =   25.35
      googlenet_int8  min =   24.00  max =   24.23  avg =   24.12
            resnet18  min =   19.84  max =   20.19  avg =   19.96
       resnet18_int8  min =   16.68  max =   16.83  avg =   16.74
             alexnet  min =   21.21  max =   21.54  avg =   21.36
               vgg16  min =  127.75  max =  134.00  avg =  129.24
          vgg16_int8  min =  106.39  max =  110.66  avg =  107.01
            resnet50  min =   45.94  max =   46.54  avg =   46.21
       resnet50_int8  min =   40.16  max =   42.58  avg =   40.75
      squeezenet_ssd  min =   30.10  max =   30.95  avg =   30.37
 squeezenet_ssd_int8  min =   27.71  max =   29.03  avg =   28.15
       mobilenet_ssd  min =   24.16  max =   24.89  avg =   24.52
  mobilenet_ssd_int8  min =   21.79  max =   22.37  avg =   22.05
      mobilenet_yolo  min =   58.06  max =   58.45  avg =   58.19
  mobilenetv2_yolov3  min =   37.49  max =   37.94  avg =   37.68
         yolov4-tiny  min =   44.45  max =   60.58  avg =   46.29
           nanodet_m  min =   11.01  max =   11.28  avg =   11.18
    yolo-fastest-1.1  min =    5.53  max =    5.97  avg =    5.62
      yolo-fastestv2  min =    4.76  max =    4.84  avg =    4.80
  vision_transformer  min =  600.65  max =  622.47  avg =  611.65
          FastestDet  min =    4.83  max =    6.94  avg =    5.34


pi@raspberrypi:~/ncnn/benchmark $ ./benchncnn 10 1 0 -1 -1 
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   11.77  max =   12.18  avg =   11.87
     squeezenet_int8  min =   11.67  max =   11.98  avg =   11.82
           mobilenet  min =   20.24  max =   20.59  avg =   20.30
      mobilenet_int8  min =   14.38  max =   14.51  avg =   14.44
        mobilenet_v2  min =   16.21  max =   16.49  avg =   16.38
        mobilenet_v3  min =   11.64  max =   12.12  avg =   11.80
          shufflenet  min =    7.17  max =    7.24  avg =    7.20
       shufflenet_v2  min =    7.07  max =    7.21  avg =    7.14
             mnasnet  min =   12.93  max =   13.03  avg =   12.99
     proxylessnasnet  min =   15.72  max =   15.80  avg =   15.74
     efficientnet_b0  min =   24.12  max =   24.53  avg =   24.20
   efficientnetv2_b0  min =   27.59  max =   28.04  avg =   27.75
        regnety_400m  min =   16.41  max =   16.66  avg =   16.49
           blazeface  min =    2.98  max =    3.04  avg =    3.02
           googlenet  min =   48.62  max =   48.87  avg =   48.71
      googlenet_int8  min =   49.07  max =   49.26  avg =   49.15
            resnet18  min =   29.54  max =   30.17  avg =   29.68
       resnet18_int8  min =   36.30  max =   36.55  avg =   36.42
             alexnet  min =   35.24  max =   35.86  avg =   35.62
               vgg16  min =  188.84  max =  190.87  avg =  189.63
          vgg16_int8  min =  272.27  max =  274.15  avg =  273.10
            resnet50  min =   89.04  max =   89.87  avg =   89.43
       resnet50_int8  min =   80.00  max =   80.50  avg =   80.16
      squeezenet_ssd  min =   38.02  max =   38.69  avg =   38.29
 squeezenet_ssd_int8  min =   40.58  max =   41.17  avg =   40.94
       mobilenet_ssd  min =   45.42  max =   47.08  avg =   45.90
  mobilenet_ssd_int8  min =   36.05  max =   37.02  avg =   36.35
      mobilenet_yolo  min =  104.82  max =  106.56  avg =  105.69
  mobilenetv2_yolov3  min =   60.11  max =   60.29  avg =   60.19
         yolov4-tiny  min =   67.61  max =   69.05  avg =   68.02
           nanodet_m  min =   19.63  max =   19.81  avg =   19.69
    yolo-fastest-1.1  min =    8.10  max =    8.14  avg =    8.12
      yolo-fastestv2  min =    7.21  max =    7.26  avg =    7.24
  vision_transformer  min = 1249.08  max = 1253.32  avg = 1250.30
          FastestDet  min =    7.33  max =    7.44  avg =    7.38
```

### Raspberry Pi 5 Broadcom BCM2712, VideoCore VII Graphics (Vulkan 1.2)
```
fan@raspberrypi:~/ncnn/benchmark $ ../build/benchmark/benchncnn 10 $(nproc) 0 0
[0 V3D 7.1.7]  queueC=0[1]  queueG=0[1]  queueT=0[1]
[0 V3D 7.1.7]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 V3D 7.1.7]  fp16-p/s/a=1/1/0  int8-p/s/a=1/1/0
[0 V3D 7.1.7]  subgroup=16  basic/vote/ballot/shuffle=1/0/0/0
[0 V3D 7.1.7]  fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0
[1 llvmpipe (LLVM 15.0.6, 128 bits)]  queueC=0[1]  queueG=0[1]  queueT=0[1]
[1 llvmpipe (LLVM 15.0.6, 128 bits)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[1 llvmpipe (LLVM 15.0.6, 128 bits)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[1 llvmpipe (LLVM 15.0.6, 128 bits)]  subgroup=4  basic/vote/ballot/shuffle=1/1/1/1
[1 llvmpipe (LLVM 15.0.6, 128 bits)]  fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = 0
cooling_down = 1
          squeezenet  min =  120.75  max =  121.31  avg =  120.94
     squeezenet_int8  min =    9.57  max =   24.49  avg =   11.23
           mobilenet  min =  160.32  max =  160.75  avg =  160.53
      mobilenet_int8  min =   11.29  max =   11.47  avg =   11.37
        mobilenet_v2  min =  121.05  max =  121.93  avg =  121.46
        mobilenet_v3  min =  117.90  max =  119.20  avg =  118.48
          shufflenet  min =   70.82  max =   71.55  avg =   71.04
       shufflenet_v2  min =   97.74  max =   98.58  avg =   98.00
             mnasnet  min =  118.21  max =  118.76  avg =  118.44
     proxylessnasnet  min =  124.28  max =  124.92  avg =  124.52
     efficientnet_b0  min =  187.48  max =  188.38  avg =  187.93
   efficientnetv2_b0  min =  270.11  max =  280.80  avg =  272.26
        regnety_400m  min =  142.14  max =  143.25  avg =  142.66
           blazeface  min =   31.97  max =   32.41  avg =   32.17
           googlenet  min =  346.30  max =  347.47  avg =  346.81
      googlenet_int8  min =   30.77  max =   32.26  avg =   31.52
            resnet18  min =  346.96  max =  347.50  avg =  347.26
       resnet18_int8  min =   19.95  max =   20.95  avg =   20.48
             alexnet  min =  181.57  max =  182.03  avg =  181.75
               vgg16  min = 1776.00  max = 1776.66  avg = 1776.40
          vgg16_int8  min =  134.10  max =  141.76  avg =  136.32
            resnet50  min =  841.90  max =  842.50  avg =  842.16
       resnet50_int8  min =   54.29  max =   55.22  avg =   54.54
      squeezenet_ssd  min =  461.71  max =  468.09  avg =  466.97
 squeezenet_ssd_int8  min =   38.05  max =   39.00  avg =   38.58
       mobilenet_ssd  min =  379.50  max =  381.66  avg =  380.14
  mobilenet_ssd_int8  min =   29.91  max =   30.77  avg =   30.13
      mobilenet_yolo  min =  753.61  max =  755.06  avg =  753.97
  mobilenetv2_yolov3  min =  382.18  max =  389.90  avg =  386.97
         yolov4-tiny  min =  673.87  max =  674.71  avg =  674.07
           nanodet_m  min =  206.55  max =  210.48  avg =  209.69
    yolo-fastest-1.1  min =  109.98  max =  111.18  avg =  110.45
      yolo-fastestv2  min =   86.07  max =   87.16  avg =   86.51
  vision_transformer  min = 20594.51  max = 20601.53  avg = 20596.59
          FastestDet  min =   90.25  max =   91.00  avg =   90.64
```

### Raspberry Pi 5 Broadcom BCM2712 Overclock to 2.9GHz, VideoCore VII Graphics Overclock to 1.1GHz (Vulkan 1.2)
```
pi@raspberrypi:~/ncnn/build/benchmark $ echo "arm_freq=2900" | sudo tee -a /boot/firmware/config.txt
pi@raspberrypi:~/ncnn/build/benchmark $ echo "gpu_freq=1100" | sudo tee -a /boot/firmware/config.txt
pi@raspberrypi:~/ncnn/build/benchmark $ sudo reboot

pi@raspberrypi:~/ncnn/build/benchmark $ ./benchncnn 10 4 0 0
[0 V3D 7.1.7]  queueC=0[1]  queueG=0[1]  queueT=0[1]
[0 V3D 7.1.7]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 V3D 7.1.7]  fp16-p/s/u/a=1/1/1/0  int8-p/s/u/a=1/1/1/0
[0 V3D 7.1.7]  subgroup=16  basic/vote/ballot/shuffle=1/0/0/0
[0 V3D 7.1.7]  fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0
[1 llvmpipe (LLVM 15.0.6, 128 bits)]  queueC=0[1]  queueG=0[1]  queueT=0[1]
[1 llvmpipe (LLVM 15.0.6, 128 bits)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[1 llvmpipe (LLVM 15.0.6, 128 bits)]  fp16-p/s/u/a=1/1/1/1  int8-p/s/u/a=1/1/1/1
[1 llvmpipe (LLVM 15.0.6, 128 bits)]  subgroup=4  basic/vote/ballot/shuffle=1/1/1/1
[1 llvmpipe (LLVM 15.0.6, 128 bits)]  fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = 0
cooling_down = 1
          squeezenet  min =  106.98  max =  107.05  avg =  107.02
     squeezenet_int8  min =    8.51  max =    8.83  avg =    8.65
           mobilenet  min =  147.66  max =  147.71  avg =  147.68
      mobilenet_int8  min =   10.21  max =   10.54  avg =   10.37
        mobilenet_v2  min =  110.11  max =  110.23  avg =  110.18
        mobilenet_v3  min =  101.84  max =  102.03  avg =  101.92
          shufflenet  min =   59.77  max =   59.84  avg =   59.80
       shufflenet_v2  min =   81.46  max =   81.60  avg =   81.51
             mnasnet  min =  105.88  max =  105.98  avg =  105.94
     proxylessnasnet  min =  108.82  max =  108.89  avg =  108.86
     efficientnet_b0  min =  168.79  max =  168.93  avg =  168.87
   efficientnetv2_b0  min =  232.52  max =  232.80  avg =  232.65
        regnety_400m  min =  130.33  max =  130.49  avg =  130.36
           blazeface  min =   22.23  max =   22.49  avg =   22.39
           googlenet  min =  299.25  max =  299.37  avg =  299.31
      googlenet_int8  min =   29.21  max =   29.97  avg =   29.58
            resnet18  min =  304.47  max =  304.64  avg =  304.58
       resnet18_int8  min =   19.31  max =   20.77  avg =   20.24
             alexnet  min =  203.68  max =  203.79  avg =  203.76
               vgg16  min = 1571.91  max = 1572.22  avg = 1572.06
          vgg16_int8  min =  128.46  max =  130.89  avg =  129.96
            resnet50  min =  754.16  max =  754.33  avg =  754.26
       resnet50_int8  min =   52.65  max =   53.48  avg =   53.09
      squeezenet_ssd  min =  398.22  max =  398.36  avg =  398.28
 squeezenet_ssd_int8  min =   34.26  max =   34.67  avg =   34.51
       mobilenet_ssd  min =  344.81  max =  344.99  avg =  344.89
  mobilenet_ssd_int8  min =   27.59  max =   28.01  avg =   27.77
      mobilenet_yolo  min =  712.53  max =  712.63  avg =  712.59
  mobilenetv2_yolov3  min =  362.81  max =  363.11  avg =  362.90
         yolov4-tiny  min =  589.30  max =  589.51  avg =  589.39
           nanodet_m  min =  178.83  max =  178.97  avg =  178.88
    yolo-fastest-1.1  min =   92.36  max =   92.58  avg =   92.45
      yolo-fastestv2  min =   70.68  max =   70.84  avg =   70.74
  vision_transformer  min = 18615.94  max = 18648.17  avg = 18633.77
          FastestDet  min =   74.59  max =   74.68  avg =   74.63

pi@raspberrypi:~/ncnn/build/benchmark $ ./benchncnn 10 4 0 -1
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =    7.61  max =    7.76  avg =    7.70
     squeezenet_int8  min =    7.97  max =    8.68  avg =    8.23
           mobilenet  min =    9.65  max =    9.91  avg =    9.80
      mobilenet_int8  min =   10.60  max =   36.93  avg =   13.29
        mobilenet_v2  min =   12.25  max =   12.64  avg =   12.40
        mobilenet_v3  min =    8.14  max =    8.26  avg =    8.20
          shufflenet  min =    3.72  max =    3.82  avg =    3.77
       shufflenet_v2  min =    2.99  max =    3.10  avg =    3.05
             mnasnet  min =    7.27  max =    7.46  avg =    7.37
     proxylessnasnet  min =    8.39  max =    8.55  avg =    8.48
     efficientnet_b0  min =   13.15  max =   13.59  avg =   13.39
   efficientnetv2_b0  min =   14.79  max =   15.30  avg =   14.91
        regnety_400m  min =    9.49  max =    9.71  avg =    9.57
           blazeface  min =    1.41  max =    1.46  avg =    1.43
           googlenet  min =   28.60  max =   28.87  avg =   28.73
      googlenet_int8  min =   27.09  max =   27.77  avg =   27.47
            resnet18  min =   21.47  max =   21.88  avg =   21.65
       resnet18_int8  min =   20.07  max =   20.30  avg =   20.24
             alexnet  min =   22.75  max =   23.47  avg =   23.05
               vgg16  min =  154.32  max =  158.51  avg =  157.40
          vgg16_int8  min =  127.78  max =  162.60  avg =  133.21
            resnet50  min =   49.36  max =   49.86  avg =   49.63
       resnet50_int8  min =   46.44  max =   46.89  avg =   46.74
      squeezenet_ssd  min =   37.31  max =   74.95  avg =   41.30
 squeezenet_ssd_int8  min =   32.62  max =   33.63  avg =   33.09
       mobilenet_ssd  min =   27.40  max =   27.99  avg =   27.68
  mobilenet_ssd_int8  min =   26.70  max =   27.71  avg =   27.23
      mobilenet_yolo  min =   60.25  max =   61.10  avg =   60.67
  mobilenetv2_yolov3  min =   43.51  max =   44.29  avg =   43.87
         yolov4-tiny  min =   51.63  max =   52.64  avg =   52.24
           nanodet_m  min =   11.89  max =   12.06  avg =   11.97
    yolo-fastest-1.1  min =    5.63  max =    5.78  avg =    5.69
      yolo-fastestv2  min =    5.34  max =    5.48  avg =    5.40
  vision_transformer  min =  481.78  max =  506.72  avg =  493.05
          FastestDet  min =    4.91  max =    5.14  avg =    5.01
```
### Raspberry Pi Zero 2 W Broadcom BCM2710A1, Cortex-A53 (ARMv8) (1.0GHz x 4)

```
loop_count = 8
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  119.52  max =  120.29  avg =  119.93
     squeezenet_int8  min =   96.32  max =   96.96  avg =   96.55
           mobilenet  min =  162.60  max =  165.49  avg =  163.19
      mobilenet_int8  min =   90.78  max =   91.39  avg =   91.03
        mobilenet_v2  min =  145.71  max =  148.83  avg =  147.39
        mobilenet_v3  min =  113.89  max =  151.95  avg =  119.04
          shufflenet  min =   72.72  max =   73.27  avg =   72.96
       shufflenet_v2  min =   63.64  max =   64.50  avg =   64.13
             mnasnet  min =  126.07  max =  126.93  avg =  126.53
     proxylessnasnet  min =  139.90  max =  140.84  avg =  140.35
     efficientnet_b0  min =  201.88  max =  202.55  avg =  202.14
   efficientnetv2_b0  min =  227.22  max =  228.84  avg =  228.09
        regnety_400m  min =  156.49  max =  157.47  avg =  156.96
           blazeface  min =   22.79  max =   23.28  avg =   23.10
           googlenet  min =  323.74  max =  324.90  avg =  324.45
      googlenet_int8  min =  250.86  max =  252.82  avg =  251.63
            resnet18  min =  351.37  max =  355.67  avg =  353.45
       resnet18_int8  min =  194.83  max =  196.68  avg =  195.51
             alexnet  min =  271.18  max =  273.53  avg =  272.18
            resnet50  min =  777.44  max =  797.47  avg =  782.63
       resnet50_int8  min =  496.78  max =  498.86  avg =  497.57
      squeezenet_ssd  min =  376.10  max =  382.41  avg =  379.13
 squeezenet_ssd_int8  min =  255.99  max =  257.57  avg =  256.78
       mobilenet_ssd  min =  338.64  max =  339.93  avg =  339.50
  mobilenet_ssd_int8  min =  190.24  max =  190.68  avg =  190.48
      mobilenet_yolo  min =  746.83  max =  748.14  avg =  747.53
  mobilenetv2_yolov3  min =  487.99  max =  491.18  avg =  489.37
         yolov4-tiny  min =  644.73  max =  652.24  avg =  646.64
           nanodet_m  min =  165.27  max =  167.12  avg =  166.27
    yolo-fastest-1.1  min =   98.74  max =  100.02  avg =   99.17
      yolo-fastestv2  min =   80.52  max =   81.86  avg =   81.29

loop_count = 8
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  240.53  max =  241.07  avg =  240.77
     squeezenet_int8  min =  212.63  max =  213.23  avg =  212.94
           mobilenet  min =  393.79  max =  394.04  avg =  393.94
      mobilenet_int8  min =  286.58  max =  286.95  avg =  286.75
        mobilenet_v2  min =  273.97  max =  274.51  avg =  274.23
        mobilenet_v3  min =  233.77  max =  234.59  avg =  234.20
          shufflenet  min =  133.05  max =  133.36  avg =  133.23
       shufflenet_v2  min =  128.86  max =  129.47  avg =  129.18
             mnasnet  min =  265.70  max =  266.17  avg =  265.93
     proxylessnasnet  min =  329.78  max =  330.54  avg =  330.13
     efficientnet_b0  min =  518.42  max =  519.38  avg =  519.00
   efficientnetv2_b0  min =  594.37  max =  595.17  avg =  594.74
        regnety_400m  min =  329.53  max =  330.44  avg =  329.87
           blazeface  min =   42.24  max =   45.56  avg =   43.96
           googlenet  min =  780.05  max =  780.63  avg =  780.39
      googlenet_int8  min =  663.83  max =  664.43  avg =  664.15
            resnet18  min =  653.62  max =  657.59  avg =  654.69
       resnet18_int8  min =  479.03  max =  479.72  avg =  479.40
             alexnet  min =  687.99  max =  690.34  avg =  689.15
            resnet50  min = 1800.97  max = 1806.11  avg = 1802.79
       resnet50_int8  min = 1311.68  max = 1314.56  avg = 1313.15
      squeezenet_ssd  min =  563.63  max =  565.57  avg =  564.44
 squeezenet_ssd_int8  min =  481.24  max =  483.97  avg =  482.20
       mobilenet_ssd  min =  799.21  max =  829.10  avg =  803.56
  mobilenet_ssd_int8  min =  568.11  max =  568.88  avg =  568.42
      mobilenet_yolo  min = 1815.60  max = 1816.44  avg = 1815.93
  mobilenetv2_yolov3  min =  951.34  max =  952.15  avg =  951.72
         yolov4-tiny  min = 1258.21  max = 1259.49  avg = 1258.66
           nanodet_m  min =  301.04  max =  304.09  avg =  301.70
    yolo-fastest-1.1  min =  155.04  max =  155.98  avg =  155.53
      yolo-fastestv2  min =  126.77  max =  127.40  avg =  127.05
```

### Banana Pi M2 Zero 2 Allwinner H2+, Cortex-A7 (ARMv7-A) (1.2GHz x 4)

```
loop_count = 8
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  230.97  max =  232.18  avg =  231.49
     squeezenet_int8  min =  171.12  max =  172.87  avg =  171.68
           mobilenet  min =  327.65  max =  340.92  avg =  329.88
      mobilenet_int8  min =  166.58  max =  169.55  avg =  167.47
        mobilenet_v2  min =  276.81  max =  278.67  avg =  277.55
        mobilenet_v3  min =  220.74  max =  225.14  avg =  222.08
          shufflenet  min =  147.97  max =  157.68  avg =  149.40
       shufflenet_v2  min =  146.56  max =  154.90  avg =  148.25
             mnasnet  min =  243.06  max =  244.47  avg =  243.80
     proxylessnasnet  min =  260.38  max =  261.47  avg =  260.66
     efficientnet_b0  min =  368.98  max =  371.03  avg =  369.96
   efficientnetv2_b0  min =  433.96  max =  459.25  avg =  437.52
        regnety_400m  min =  307.53  max =  312.29  avg =  308.68
           blazeface  min =   46.54  max =   47.35  avg =   46.98
           googlenet  min =  647.86  max =  669.20  avg =  651.19
      googlenet_int8  min =  439.90  max =  442.35  avg =  441.38
            resnet18  min =  642.53  max =  856.58  avg =  698.28
       resnet18_int8  min =  352.10  max =  354.51  avg =  353.44
             alexnet  min =  593.16  max =  624.20  avg =  598.66
            resnet50  min = 1556.12  max = 1782.22  avg = 1606.86
       resnet50_int8  min =  911.63  max =  999.42  avg =  924.37
      squeezenet_ssd  min =  653.85  max =  658.07  avg =  655.19
 squeezenet_ssd_int8  min =  456.26  max =  467.76  avg =  459.87
       mobilenet_ssd  min =  671.93  max =  682.64  avg =  674.88
  mobilenet_ssd_int8  min =  347.18  max =  349.07  avg =  347.81
      mobilenet_yolo  min = 1471.16  max = 1492.65  avg = 1479.30
  mobilenetv2_yolov3  min =  895.90  max =  906.60  avg =  899.74
         yolov4-tiny  min = 1178.53  max = 1205.79  avg = 1183.98
           nanodet_m  min =  358.89  max =  366.07  avg =  362.20
    yolo-fastest-1.1  min =  189.93  max =  192.18  avg =  190.91
      yolo-fastestv2  min =  158.60  max =  161.33  avg =  159.43

loop_count = 8
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  602.97  max =  604.97  avg =  603.46
     squeezenet_int8  min =  431.18  max =  432.42  avg =  431.77
           mobilenet  min =  971.52  max =  986.64  avg =  974.04
      mobilenet_int8  min =  556.74  max =  556.98  avg =  556.84
        mobilenet_v2  min =  682.85  max =  684.17  avg =  683.34
        mobilenet_v3  min =  585.10  max =  585.76  avg =  585.57
          shufflenet  min =  340.64  max =  342.63  avg =  341.26
       shufflenet_v2  min =  322.41  max =  324.13  avg =  323.35
             mnasnet  min =  644.30  max =  645.93  avg =  644.71
     proxylessnasnet  min =  732.50  max =  733.30  avg =  732.96
     efficientnet_b0  min = 1084.70  max = 1094.98  avg = 1086.52
   efficientnetv2_b0  min = 1282.27  max = 1283.67  avg = 1282.60
        regnety_400m  min =  764.60  max =  768.54  avg =  765.30
           blazeface  min =  100.48  max =  106.28  avg =  103.33
           googlenet  min = 1878.69  max = 1883.96  avg = 1880.76
      googlenet_int8  min = 1274.31  max = 1296.02  avg = 1279.59
            resnet18  min = 1837.91  max = 1843.95  avg = 1839.17
       resnet18_int8  min = 1011.98  max = 1014.43  avg = 1013.01
             alexnet  min = 1997.59  max = 2001.81  avg = 1999.42
            resnet50  min = 4844.31  max = 4857.05  avg = 4847.80
       resnet50_int8  min = 2792.59  max = 2810.08  avg = 2797.30
      squeezenet_ssd  min = 1438.96  max = 1443.31  avg = 1441.09
 squeezenet_ssd_int8  min = 1046.76  max = 1053.00  avg = 1049.22
       mobilenet_ssd  min = 2018.66  max = 2023.70  avg = 2019.67
  mobilenet_ssd_int8  min = 1129.16  max = 1130.62  avg = 1129.82
      mobilenet_yolo  min = 4724.90  max = 4728.57  avg = 4726.41
  mobilenetv2_yolov3  min = 2410.67  max = 2427.95  avg = 2413.89
         yolov4-tiny  min = 3177.27  max = 3185.52  avg = 3179.71
           nanodet_m  min =  761.38  max =  768.79  avg =  766.53
    yolo-fastest-1.1  min =  391.82  max =  393.32  avg =  392.39
      yolo-fastestv2  min =  316.93  max =  319.86  avg =  318.33
```

### Radxa Orion O6 (Big Cortex-A720 2.6GHz x 4 + Medium Cortex-A720 x 4 + Little Cortex-A520 x 4 + Arm Immortalis-G720 MC10 GPU 1.1GHz)

```
radxa@orion-o6:~/ncnn/build/benchmark$ ./benchncnn 4 1 2 -1 1
loop_count = 4
num_threads = 1
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =    8.52  max =    8.53  avg =    8.53
     squeezenet_int8  min =    6.49  max =    6.50  avg =    6.50
           mobilenet  min =   15.56  max =   15.61  avg =   15.58
      mobilenet_int8  min =    8.68  max =    8.70  avg =    8.69
        mobilenet_v2  min =    9.67  max =    9.68  avg =    9.67
        mobilenet_v3  min =    8.05  max =    8.07  avg =    8.06
          shufflenet  min =    5.30  max =    5.32  avg =    5.31
       shufflenet_v2  min =    5.55  max =    5.57  avg =    5.56
             mnasnet  min =    9.23  max =    9.26  avg =    9.25
     proxylessnasnet  min =   11.58  max =   11.58  avg =   11.58
     efficientnet_b0  min =   18.67  max =   18.68  avg =   18.67
   efficientnetv2_b0  min =   21.55  max =   21.59  avg =   21.57
        regnety_400m  min =   13.02  max =   13.07  avg =   13.05
           blazeface  min =    2.04  max =    2.06  avg =    2.05
           googlenet  min =   35.36  max =   35.49  avg =   35.40
      googlenet_int8  min =   27.86  max =   27.97  avg =   27.91
            resnet18  min =   21.68  max =   21.74  avg =   21.70
       resnet18_int8  min =   19.07  max =   19.12  avg =   19.09
             alexnet  min =   23.94  max =   24.06  avg =   24.02
               vgg16  min =  123.48  max =  124.36  avg =  123.87
          vgg16_int8  min =  139.53  max =  139.72  avg =  139.64
            resnet50  min =   68.07  max =   68.09  avg =   68.08
       resnet50_int8  min =   39.99  max =   40.07  avg =   40.03
      squeezenet_ssd  min =   20.35  max =   20.43  avg =   20.38
 squeezenet_ssd_int8  min =   18.62  max =   18.69  avg =   18.67
       mobilenet_ssd  min =   31.40  max =   31.56  avg =   31.48
  mobilenet_ssd_int8  min =   17.44  max =   17.54  avg =   17.49
      mobilenet_yolo  min =   70.84  max =   70.94  avg =   70.88
  mobilenetv2_yolov3  min =   35.24  max =   35.30  avg =   35.28
         yolov4-tiny  min =   42.96  max =   43.02  avg =   42.99
           nanodet_m  min =   13.05  max =   13.11  avg =   13.08
    yolo-fastest-1.1  min =    5.21  max =    5.22  avg =    5.22
      yolo-fastestv2  min =    4.48  max =    4.50  avg =    4.49
  vision_transformer  min = 1001.70  max = 1002.06  avg = 1001.90
          FastestDet  min =    4.65  max =    4.67  avg =    4.66
radxa@orion-o6:~/ncnn/build/benchmark$ ./benchncnn 4 12 2 -1 1
loop_count = 4
num_threads = 12
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =   38.01  max =   40.45  avg =   39.00
     squeezenet_int8  min =   45.53  max =   45.73  avg =   45.60
           mobilenet  min =   33.35  max =   37.73  avg =   35.96
      mobilenet_int8  min =   33.87  max =   34.05  avg =   33.93
        mobilenet_v2  min =   57.97  max =   61.42  avg =   59.74
        mobilenet_v3  min =   65.47  max =   65.76  avg =   65.65
          shufflenet  min =  110.95  max =  111.29  avg =  111.12
       shufflenet_v2  min =   63.97  max =   64.20  avg =   64.08
             mnasnet  min =   56.06  max =   56.44  avg =   56.23
     proxylessnasnet  min =   63.84  max =   64.36  avg =   64.10
     efficientnet_b0  min =   94.52  max =   94.79  avg =   94.65
   efficientnetv2_b0  min =  154.39  max =  158.08  avg =  156.57
        regnety_400m  min =  454.18  max =  457.25  avg =  455.08
           blazeface  min =   44.79  max =   45.03  avg =   44.92
           googlenet  min =   91.22  max =   93.72  avg =   92.01
      googlenet_int8  min =  115.45  max =  118.36  avg =  116.69
            resnet18  min =   42.81  max =   50.61  avg =   45.62
       resnet18_int8  min =   45.26  max =   47.70  avg =   46.52
             alexnet  min =   25.74  max =   28.83  avg =   26.66
               vgg16  min =   61.15  max =   64.72  avg =   63.09
          vgg16_int8  min =   67.75  max =   73.18  avg =   69.38
            resnet50  min =   90.29  max =  100.58  avg =   96.62
       resnet50_int8  min =   92.35  max =   97.42  avg =   94.64
      squeezenet_ssd  min =  105.26  max =  111.83  avg =  107.89
 squeezenet_ssd_int8  min =  117.49  max =  121.57  avg =  118.91
       mobilenet_ssd  min =   89.79  max =   95.18  avg =   92.15
  mobilenet_ssd_int8  min =   97.02  max =  103.84  avg =   99.86
      mobilenet_yolo  min =  603.04  max =  606.87  avg =  605.03
  mobilenetv2_yolov3  min =   75.32  max =   80.43  avg =   76.83
         yolov4-tiny  min =   51.46  max =   60.43  avg =   56.32
           nanodet_m  min =  104.05  max =  109.94  avg =  107.06
    yolo-fastest-1.1  min =   90.31  max =   90.50  avg =   90.41
      yolo-fastestv2  min =   94.72  max =   96.62  avg =   95.52
  vision_transformer  min =  323.38  max =  333.42  avg =  329.50
          FastestDet  min =   80.86  max =   83.37  avg =   81.84
radxa@orion-o6:~/ncnn/build/benchmark$ ./benchncnn 4 1 2 0 1
[0 Mali-G720-Immortalis]  queueC=0[2]  queueG=0[2]  queueT=0[2]
[0 Mali-G720-Immortalis]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 Mali-G720-Immortalis]  fp16-p/s/u/a=1/1/1/1  int8-p/s/u/a=1/1/1/1
[0 Mali-G720-Immortalis]  subgroup=16  basic/vote/ballot/shuffle=1/1/1/1
[0 Mali-G720-Immortalis]  fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0
loop_count = 4
num_threads = 1
powersave = 2
gpu_device = 0
cooling_down = 1
          squeezenet  min =   16.33  max =   16.59  avg =   16.45
     squeezenet_int8  min =    6.36  max =   10.08  avg =    7.32
           mobilenet  min =    3.45  max =   27.79  avg =   14.90
      mobilenet_int8  min =    8.71  max =    8.76  avg =    8.74
        mobilenet_v2  min =    4.31  max =    4.43  avg =    4.40
        mobilenet_v3  min =   19.81  max =   19.86  avg =   19.83
          shufflenet  min =   14.76  max =   14.83  avg =   14.79
       shufflenet_v2  min =   15.24  max =   15.33  avg =   15.28
             mnasnet  min =    3.71  max =   10.64  avg =    5.55
     proxylessnasnet  min =    4.82  max =    4.95  avg =    4.90
     efficientnet_b0  min =    6.58  max =    6.62  avg =    6.60
   efficientnetv2_b0  min =   56.26  max =   57.46  avg =   56.82
        regnety_400m  min =    5.30  max =   30.08  avg =   17.72
           blazeface  min =    4.36  max =    4.52  avg =    4.46
           googlenet  min =    9.03  max =    9.07  avg =    9.05
      googlenet_int8  min =   27.90  max =   27.94  avg =   27.92
            resnet18  min =    6.47  max =   28.26  avg =   11.93
       resnet18_int8  min =   19.79  max =   19.83  avg =   19.81
             alexnet  min =    7.76  max =    7.81  avg =    7.77
               vgg16  min =   27.58  max =   27.90  avg =   27.77
          vgg16_int8  min =  143.28  max =  144.19  avg =  143.68
            resnet50  min =   14.06  max =   14.22  avg =   14.15
       resnet50_int8  min =   41.37  max =   41.48  avg =   41.43
      squeezenet_ssd  min =   11.11  max =   60.31  avg =   47.93
 squeezenet_ssd_int8  min =   19.29  max =   19.39  avg =   19.35
       mobilenet_ssd  min =    8.78  max =    8.88  avg =    8.82
  mobilenet_ssd_int8  min =   17.60  max =   17.66  avg =   17.62
      mobilenet_yolo  min =   13.64  max =   13.91  avg =   13.76
  mobilenetv2_yolov3  min =   11.97  max =   15.79  avg =   14.01
         yolov4-tiny  min =   26.72  max =   32.41  avg =   28.27
           nanodet_m  min =    9.84  max =   13.42  avg =   10.76
    yolo-fastest-1.1  min =   15.38  max =   15.62  avg =   15.56
      yolo-fastestv2  min =   13.56  max =   13.67  avg =   13.61
  vision_transformer  min =  831.86  max =  835.66  avg =  833.83
          FastestDet  min =   13.85  max =   13.92  avg =   13.88
```

### Radxa Zero 3W, Cortex-A55 (ARMv8.2-A) (1.416 GHz x 4)
```
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   34.51  max =  106.19  avg =   79.43
     squeezenet_int8  min =   31.48  max =   49.87  avg =   34.65
           mobilenet  min =   42.23  max =   45.36  avg =   42.89
      mobilenet_int8  min =   35.97  max =   53.84  avg =   38.77
        mobilenet_v2  min =   39.61  max =   40.35  avg =   40.00
        mobilenet_v3  min =   31.19  max =   31.85  avg =   31.50
          shufflenet  min =   24.75  max =   27.74  avg =   25.55
       shufflenet_v2  min =   22.00  max =   22.70  avg =   22.31
             mnasnet  min =   34.95  max =   53.55  avg =   37.39
     proxylessnasnet  min =   39.96  max =   44.32  avg =   40.81
     efficientnet_b0  min =   49.76  max =   67.77  avg =   52.61
   efficientnetv2_b0  min =   64.00  max =   85.78  avg =   67.06
        regnety_400m  min =   55.23  max =   73.22  avg =   57.87
           blazeface  min =    7.80  max =   10.39  avg =    8.27
           googlenet  min =   98.24  max =  118.27  avg =  101.78
      googlenet_int8  min =   98.81  max =  115.66  avg =  101.52
            resnet18  min =   75.33  max =   88.59  avg =   78.19
       resnet18_int8  min =   76.31  max =   95.17  avg =   79.03
             alexnet  min =   65.07  max =   73.80  avg =   67.18
               vgg16  min =  423.20  max =  455.15  avg =  436.32
          vgg16_int8  min =  591.82  max =  620.22  avg =  607.55
            resnet50  min =  185.53  max =  207.10  avg =  193.03
       resnet50_int8  min =  176.84  max =  194.73  avg =  181.81
      squeezenet_ssd  min =   96.64  max =  118.46  avg =  100.86
 squeezenet_ssd_int8  min =   96.61  max =  123.48  avg =  104.64
       mobilenet_ssd  min =   95.38  max =  110.52  avg =   98.61
  mobilenet_ssd_int8  min =   76.21  max =   95.41  avg =   79.10
      mobilenet_yolo  min =  210.73  max =  235.47  avg =  221.72
  mobilenetv2_yolov3  min =  134.59  max =  154.33  avg =  139.54
         yolov4-tiny  min =  167.79  max =  191.60  avg =  171.25
           nanodet_m  min =   63.22  max =   80.73  avg =   66.25
    yolo-fastest-1.1  min =   32.87  max =   88.05  avg =   47.36
      yolo-fastestv2  min =   26.03  max =   27.01  avg =   26.54
  vision_transformer  min = 3682.51  max = 3882.79  avg = 3809.42
          FastestDet  min =   30.69  max =   50.65  avg =   33.65
```

### Avaota Aim T527, Allwinner T527 (Cortex-A55 2.2GHz x 4 + Cortex-A55 1.8GHz x 4)

```
./benchncnn 4 4 2 -1 1
loop_count = 4
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =   14.15  max =   14.21  avg =   14.17
     squeezenet_int8  min =   21.05  max =   21.12  avg =   21.09
           mobilenet  min =   19.22  max =   19.30  avg =   19.25
      mobilenet_int8  min =   18.65  max =   19.52  avg =   19.07
        mobilenet_v2  min =   20.23  max =   21.01  avg =   20.63
        mobilenet_v3  min =   15.34  max =   15.48  avg =   15.41
          shufflenet  min =   10.30  max =   10.37  avg =   10.33
       shufflenet_v2  min =    9.18  max =    9.34  avg =    9.23
             mnasnet  min =   15.58  max =   15.62  avg =   15.60
     proxylessnasnet  min =   19.64  max =   19.73  avg =   19.67
     efficientnet_b0  min =   25.62  max =   25.81  avg =   25.69
   efficientnetv2_b0  min =   36.95  max =   37.46  avg =   37.17
        regnety_400m  min =   23.75  max =   24.13  avg =   23.90
           blazeface  min =    3.37  max =    3.42  avg =    3.40
           googlenet  min =   57.36  max =   58.32  avg =   57.88
      googlenet_int8  min =   60.80  max =   62.30  avg =   61.50
            resnet18  min =   39.99  max =   40.34  avg =   40.17
       resnet18_int8  min =   54.18  max =   56.08  avg =   55.16
             alexnet  min =   41.87  max =   42.21  avg =   42.08
               vgg16  min =  260.14  max =  260.94  avg =  260.51
          vgg16_int8  min =  347.42  max =  348.90  avg =  348.30
            resnet50  min =   90.91  max =   91.26  avg =   91.07
       resnet50_int8  min =  121.94  max =  122.56  avg =  122.28
      squeezenet_ssd  min =   57.11  max =   57.57  avg =   57.37
 squeezenet_ssd_int8  min =   74.70  max =   75.18  avg =   74.91
       mobilenet_ssd  min =   49.60  max =   49.96  avg =   49.71
  mobilenet_ssd_int8  min =   49.45  max =   49.93  avg =   49.63
      mobilenet_yolo  min =  114.98  max =  115.37  avg =  115.18
  mobilenetv2_yolov3  min =   75.74  max =   75.97  avg =   75.87
         yolov4-tiny  min =   99.09  max =   99.43  avg =   99.25
           nanodet_m  min =   29.40  max =   29.77  avg =   29.60
    yolo-fastest-1.1  min =   13.78  max =   13.85  avg =   13.82
      yolo-fastestv2  min =   12.91  max =   13.10  avg =   12.98
  vision_transformer  min = 1641.78  max = 1648.71  avg = 1646.65
          FastestDet  min =   12.24  max =   12.61  avg =   12.42

```


### Khadas VIM3, Amlogic A311D (Cortex-A73 2.2GHz x 4 + Cortex-A53 1.8GHz x 2)

```
vim3:/data/local/tmp # ./benchncnn 8 4 2 -1 1
loop_count = 8
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =   30.98  max =   31.26  avg =   31.09
     squeezenet_int8  min =   24.70  max =   24.84  avg =   24.78
           mobilenet  min =   42.57  max =   43.37  avg =   42.96
      mobilenet_int8  min =   22.33  max =   22.52  avg =   22.44
        mobilenet_v2  min =   39.36  max =   39.77  avg =   39.56
        mobilenet_v3  min =   30.13  max =   30.45  avg =   30.28
          shufflenet  min =   21.62  max =   21.94  avg =   21.80
       shufflenet_v2  min =   18.83  max =   19.24  avg =   19.05
             mnasnet  min =   33.54  max =   34.08  avg =   33.80
     proxylessnasnet  min =   35.81  max =   36.05  avg =   35.95
     efficientnet_b0  min =   53.82  max =   54.44  avg =   54.21
   efficientnetv2_b0  min =   62.20  max =   62.60  avg =   62.43
        regnety_400m  min =   48.82  max =   49.27  avg =   49.05
           blazeface  min =    6.34  max =    6.51  avg =    6.43
           googlenet  min =   81.96  max =   82.53  avg =   82.23
      googlenet_int8  min =   64.42  max =   65.00  avg =   64.77
            resnet18  min =   77.00  max =   77.83  avg =   77.46
       resnet18_int8  min =   48.91  max =   49.14  avg =   49.05
             alexnet  min =   60.43  max =   60.93  avg =   60.69
               vgg16  min =  414.89  max =  423.00  avg =  418.75
          vgg16_int8  min =  245.58  max =  246.37  avg =  245.94
            resnet50  min =  185.53  max =  187.35  avg =  186.18
       resnet50_int8  min =  123.36  max =  124.75  avg =  124.17
      squeezenet_ssd  min =   85.87  max =   86.42  avg =   86.23
 squeezenet_ssd_int8  min =   64.90  max =   65.24  avg =   65.08
       mobilenet_ssd  min =   88.32  max =   90.02  avg =   89.10
  mobilenet_ssd_int8  min =   46.85  max =   47.18  avg =   46.98
      mobilenet_yolo  min =  192.33  max =  195.38  avg =  194.10
  mobilenetv2_yolov3  min =  127.33  max =  128.58  avg =  127.96
         yolov4-tiny  min =  150.44  max =  152.02  avg =  151.20
           nanodet_m  min =   54.22  max =   54.61  avg =   54.37
    yolo-fastest-1.1  min =   28.13  max =   28.76  avg =   28.40
      yolo-fastestv2  min =   22.10  max =   22.26  avg =   22.19

vim3:/data/local/tmp # ./benchncnn 4 1 2 -1 1
loop_count = 4
num_threads = 1
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =   68.25  max =   68.85  avg =   68.67
     squeezenet_int8  min =   51.92  max =   52.08  avg =   52.01
           mobilenet  min =  112.69  max =  113.72  avg =  113.33
      mobilenet_int8  min =   66.43  max =   66.89  avg =   66.68
        mobilenet_v2  min =   81.36  max =   81.77  avg =   81.62
        mobilenet_v3  min =   62.33  max =   63.39  avg =   62.94
          shufflenet  min =   37.84  max =   38.03  avg =   37.93
       shufflenet_v2  min =   37.33  max =   38.08  avg =   37.68
             mnasnet  min =   73.83  max =   74.32  avg =   74.03
     proxylessnasnet  min =   85.19  max =   86.43  avg =   85.84
     efficientnet_b0  min =  138.68  max =  139.69  avg =  139.19
   efficientnetv2_b0  min =  167.53  max =  167.99  avg =  167.75
        regnety_400m  min =   94.78  max =   95.81  avg =   95.21
           blazeface  min =   11.22  max =   11.43  avg =   11.28
           googlenet  min =  229.35  max =  230.91  avg =  229.89
      googlenet_int8  min =  173.04  max =  173.48  avg =  173.24
            resnet18  min =  191.54  max =  193.78  avg =  192.49
       resnet18_int8  min =  132.97  max =  133.51  avg =  133.25
             alexnet  min =  140.31  max =  141.95  avg =  141.18
               vgg16  min = 1093.71  max = 1100.95  avg = 1097.64
          vgg16_int8  min =  734.44  max =  736.16  avg =  735.05
            resnet50  min =  530.38  max =  533.93  avg =  531.87
       resnet50_int8  min =  332.88  max =  334.22  avg =  333.71
      squeezenet_ssd  min =  159.08  max =  160.98  avg =  160.16
 squeezenet_ssd_int8  min =  126.97  max =  127.96  avg =  127.43
       mobilenet_ssd  min =  238.92  max =  241.14  avg =  239.70
  mobilenet_ssd_int8  min =  135.57  max =  136.02  avg =  135.78
      mobilenet_yolo  min =  539.59  max =  543.88  avg =  541.90
  mobilenetv2_yolov3  min =  281.32  max =  285.05  avg =  283.24
         yolov4-tiny  min =  381.99  max =  384.93  avg =  383.53
           nanodet_m  min =   98.32  max =   98.85  avg =   98.60
    yolo-fastest-1.1  min =   44.59  max =   44.95  avg =   44.80
      yolo-fastestv2  min =   36.88  max =   37.11  avg =   36.98

vim3:/data/local/tmp $ ./benchncnn 8 6 2 0 1                               
[0 Mali-G52]  queueC=0[2]  queueG=0[2]  queueT=0[2]
[0 Mali-G52]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=1
[0 Mali-G52]  fp16-p/s/u/a=1/1/1/1  int8-p/s/u/a=1/0/0/0
[0 Mali-G52]  subgroup=8  basic/vote/ballot/shuffle=1/0/0/0
[0 Mali-G52]  fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0
loop_count = 8
num_threads = 6
powersave = 2
gpu_device = 0
cooling_down = 1
          squeezenet  min =   21.29  max =   21.81  avg =   21.56
     squeezenet_int8  min =   37.59  max =   37.85  avg =   37.70
           mobilenet  min =   32.08  max =   32.61  avg =   32.42
      mobilenet_int8  min =   40.12  max =   40.46  avg =   40.28
        mobilenet_v2  min =   24.55  max =   24.67  avg =   24.62
        mobilenet_v3  min =   25.35  max =   25.60  avg =   25.47
          shufflenet  min =   18.78  max =   89.48  avg =   35.41
       shufflenet_v2  min =   21.15  max =   21.33  avg =   21.22
             mnasnet  min =   25.08  max =   25.31  avg =   25.21
     proxylessnasnet  min =   26.97  max =   27.18  avg =   27.05
     efficientnet_b0  min =   40.70  max =   40.91  avg =   40.81
   efficientnetv2_b0  min =  189.26  max =  192.84  avg =  191.33
        regnety_400m  min =   30.88  max =   31.17  avg =   31.03
           blazeface  min =   24.34  max =   24.52  avg =   24.45
           googlenet  min =   67.14  max =   67.43  avg =   67.30
      googlenet_int8  min =   98.06  max =   98.57  avg =   98.35
            resnet18  min =   61.13  max =   61.63  avg =   61.44
       resnet18_int8  min =   72.63  max =   73.48  avg =   73.01
             alexnet  min =   68.88  max =   70.34  avg =   69.71
               vgg16  min =  347.48  max =  348.48  avg =  347.94
          vgg16_int8  min =  342.50  max =  357.78  avg =  353.13
            resnet50  min =  158.90  max =  160.10  avg =  159.76
       resnet50_int8  min =  211.35  max =  212.68  avg =  212.11
      squeezenet_ssd  min =   81.61  max =   82.17  avg =   81.91
 squeezenet_ssd_int8  min =   85.52  max =   85.98  avg =   85.79
       mobilenet_ssd  min =   73.38  max =   74.41  avg =   74.02
  mobilenet_ssd_int8  min =   85.13  max =   91.47  avg =   86.13
      mobilenet_yolo  min =  154.47  max =  155.23  avg =  154.74
  mobilenetv2_yolov3  min =  100.75  max =  101.96  avg =  101.27
         yolov4-tiny  min =  140.52  max =  161.68  avg =  153.85
           nanodet_m  min =   85.27  max =  110.53  avg =   94.81
    yolo-fastest-1.1  min =   23.56  max =   42.04  avg =   33.10
      yolo-fastestv2  min =   19.54  max =   21.66  avg =   21.01
  vision_transformer  min = 6395.34  max = 6418.70  avg = 6410.43
          FastestDet  min =   21.53  max =   23.21  avg =   22.98
```

### Rockchip RK3588 (Cortex-A76 2.4GHz x 4 + Cortex-A55 1.8GHz x 4)

```
rk3588_s:/data/local/tmp # ./benchncnn 8 4 2 -1 1
loop_count = 8
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =    7.57  max =    7.68  avg =    7.60
     squeezenet_int8  min =    8.43  max =    8.52  avg =    8.46
           mobilenet  min =   11.01  max =   11.08  avg =   11.05
      mobilenet_int8  min =    8.89  max =    8.96  avg =    8.91
        mobilenet_v2  min =    8.73  max =    8.78  avg =    8.76
        mobilenet_v3  min =    7.90  max =    7.95  avg =    7.92
          shufflenet  min =    7.95  max =    8.02  avg =    7.99
       shufflenet_v2  min =    6.09  max =    6.13  avg =    6.11
             mnasnet  min =    8.30  max =    8.35  avg =    8.33
     proxylessnasnet  min =    9.67  max =    9.72  avg =    9.69
     efficientnet_b0  min =   17.51  max =   17.60  avg =   17.56
   efficientnetv2_b0  min =   28.10  max =   28.17  avg =   28.14
        regnety_400m  min =   16.33  max =   16.39  avg =   16.35
           blazeface  min =    2.81  max =    2.89  avg =    2.83
           googlenet  min =   33.33  max =   33.41  avg =   33.37
      googlenet_int8  min =   33.62  max =   33.87  avg =   33.77
            resnet18  min =   18.83  max =   18.90  avg =   18.86
       resnet18_int8  min =   33.92  max =   34.10  avg =   34.00
             alexnet  min =   29.07  max =   29.11  avg =   29.09
               vgg16  min =  106.86  max =  107.40  avg =  107.06
          vgg16_int8  min =  283.66  max =  284.16  avg =  283.94
            resnet50  min =   53.70  max =   54.21  avg =   53.83
       resnet50_int8  min =   66.11  max =   66.24  avg =   66.15
      squeezenet_ssd  min =   34.88  max =   35.04  avg =   34.99
 squeezenet_ssd_int8  min =   43.25  max =   43.62  avg =   43.37
       mobilenet_ssd  min =   31.32  max =   31.42  avg =   31.37
  mobilenet_ssd_int8  min =   26.11  max =   26.18  avg =   26.13
      mobilenet_yolo  min =   58.89  max =   59.02  avg =   58.95
  mobilenetv2_yolov3  min =   37.53  max =   37.64  avg =   37.58
         yolov4-tiny  min =   52.95  max =   53.31  avg =   53.03
           nanodet_m  min =   16.06  max =   16.14  avg =   16.10
    yolo-fastest-1.1  min =    8.42  max =    8.47  avg =    8.45
      yolo-fastestv2  min =    7.81  max =    7.88  avg =    7.84

rk3588_s:/data/local/tmp # ./benchncnn 8 1 2 -1 1
loop_count = 8
num_threads = 1
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =   25.04  max =   25.14  avg =   25.07
     squeezenet_int8  min =   26.29  max =   26.38  avg =   26.33
           mobilenet  min =   41.17  max =   41.23  avg =   41.19
      mobilenet_int8  min =   32.51  max =   32.57  avg =   32.54
        mobilenet_v2  min =   27.27  max =   27.31  avg =   27.29
        mobilenet_v3  min =   22.49  max =   22.54  avg =   22.51
          shufflenet  min =   18.15  max =   18.22  avg =   18.18
       shufflenet_v2  min =   15.82  max =   15.86  avg =   15.85
             mnasnet  min =   26.45  max =   26.50  avg =   26.47
     proxylessnasnet  min =   31.60  max =   31.66  avg =   31.62
     efficientnet_b0  min =   55.53  max =   55.68  avg =   55.62
   efficientnetv2_b0  min =   96.84  max =   96.92  avg =   96.89
        regnety_400m  min =   33.66  max =   33.70  avg =   33.68
           blazeface  min =    8.80  max =    8.84  avg =    8.83
           googlenet  min =  116.89  max =  117.06  avg =  116.97
      googlenet_int8  min =  107.92  max =  108.03  avg =  107.98
            resnet18  min =   60.97  max =   61.18  avg =   61.05
       resnet18_int8  min =  118.95  max =  119.04  avg =  119.00
             alexnet  min =   93.49  max =   93.59  avg =   93.55
               vgg16  min =  333.81  max =  334.52  avg =  334.07
          vgg16_int8  min =  947.19  max =  947.55  avg =  947.35
            resnet50  min =  186.95  max =  187.42  avg =  187.15
       resnet50_int8  min =  225.72  max =  225.86  avg =  225.75
      squeezenet_ssd  min =   93.29  max =   93.66  avg =   93.47
 squeezenet_ssd_int8  min =  120.22  max =  120.95  avg =  120.49
       mobilenet_ssd  min =  105.84  max =  105.90  avg =  105.87
  mobilenet_ssd_int8  min =   85.95  max =   86.04  avg =   86.01
      mobilenet_yolo  min =  194.22  max =  194.64  avg =  194.41
  mobilenetv2_yolov3  min =  103.63  max =  103.72  avg =  103.69
         yolov4-tiny  min =  136.59  max =  137.14  avg =  136.91
           nanodet_m  min =   41.40  max =   41.49  avg =   41.43
    yolo-fastest-1.1  min =   18.73  max =   18.80  avg =   18.77
      yolo-fastestv2  min =   18.25  max =   18.31  avg =   18.28

rk3588_s:/data/local/tmp # ./benchncnn 8 4 1 -1 1
loop_count = 8
num_threads = 4
powersave = 1
gpu_device = -1
cooling_down = 1
          squeezenet  min =   25.54  max =   25.99  avg =   25.71
     squeezenet_int8  min =   30.88  max =   31.16  avg =   31.01
           mobilenet  min =   36.24  max =   62.95  avg =   39.89
      mobilenet_int8  min =   31.90  max =   32.37  avg =   32.06
        mobilenet_v2  min =   27.49  max =   27.82  avg =   27.64
        mobilenet_v3  min =   26.30  max =   26.69  avg =   26.45
          shufflenet  min =   25.49  max =   25.72  avg =   25.60
       shufflenet_v2  min =   21.59  max =   22.67  avg =   21.78
             mnasnet  min =   27.92  max =   28.10  avg =   28.00
     proxylessnasnet  min =   34.18  max =   34.42  avg =   34.28
     efficientnet_b0  min =   57.37  max =   57.60  avg =   57.45
   efficientnetv2_b0  min =   83.50  max =   84.03  avg =   83.66
        regnety_400m  min =   50.83  max =   51.27  avg =   50.98
           blazeface  min =   14.07  max =   14.29  avg =   14.17
           googlenet  min =  100.60  max =  101.00  avg =  100.87
      googlenet_int8  min =  106.58  max =  107.14  avg =  106.71
            resnet18  min =   58.60  max =   59.62  avg =   59.00
       resnet18_int8  min =   84.90  max =   85.15  avg =   84.99
             alexnet  min =   86.06  max =   86.58  avg =   86.22
               vgg16  min =  308.42  max =  309.18  avg =  308.81
          vgg16_int8  min =  543.61  max =  545.09  avg =  544.40
            resnet50  min =  163.45  max =  164.44  avg =  163.92
       resnet50_int8  min =  179.51  max =  180.16  avg =  179.83
      squeezenet_ssd  min =   96.32  max =   97.24  avg =   96.71
 squeezenet_ssd_int8  min =  116.48  max =  117.65  avg =  116.85
       mobilenet_ssd  min =   92.12  max =   93.09  avg =   92.55
  mobilenet_ssd_int8  min =   81.78  max =   82.42  avg =   81.95
      mobilenet_yolo  min =  174.95  max =  175.40  avg =  175.15
  mobilenetv2_yolov3  min =  110.63  max =  111.05  avg =  110.81
         yolov4-tiny  min =  163.37  max =  164.24  avg =  163.63
           nanodet_m  min =   52.96  max =   53.59  avg =   53.12
    yolo-fastest-1.1  min =   28.98  max =   29.33  avg =   29.20
      yolo-fastestv2  min =   23.52  max =   24.16  avg =   23.76

rk3588_s:/data/local/tmp # ./benchncnn 8 1 1 -1 1
loop_count = 8
num_threads = 1
powersave = 1
gpu_device = -1
cooling_down = 1
          squeezenet  min =   83.46  max =   83.63  avg =   83.53
     squeezenet_int8  min =  101.39  max =  102.29  avg =  101.77
           mobilenet  min =  131.78  max =  132.25  avg =  131.87
      mobilenet_int8  min =  111.66  max =  112.60  avg =  111.94
        mobilenet_v2  min =   92.92  max =  227.19  avg =  132.44
        mobilenet_v3  min =   78.38  max =   78.64  avg =   78.49
          shufflenet  min =   62.98  max =   63.17  avg =   63.09
       shufflenet_v2  min =   56.85  max =   57.23  avg =   57.00
             mnasnet  min =   87.53  max =   87.71  avg =   87.60
     proxylessnasnet  min =  113.25  max =  114.10  avg =  113.58
     efficientnet_b0  min =  180.95  max =  181.16  avg =  181.07
   efficientnetv2_b0  min =  285.34  max =  285.62  avg =  285.51
        regnety_400m  min =  109.24  max =  109.36  avg =  109.31
           blazeface  min =   41.12  max =   41.53  avg =   41.23
           googlenet  min =  358.94  max =  359.55  avg =  359.24
      googlenet_int8  min =  371.32  max =  371.84  avg =  371.51
            resnet18  min =  209.97  max =  210.42  avg =  210.22
       resnet18_int8  min =  302.93  max =  303.51  avg =  303.26
             alexnet  min =  318.95  max =  321.70  avg =  319.40
               vgg16  min = 1126.11  max = 1127.83  avg = 1126.98
          vgg16_int8  min = 2026.90  max = 2034.04  avg = 2029.35
            resnet50  min =  602.90  max =  603.70  avg =  603.30
       resnet50_int8  min =  647.33  max =  649.41  avg =  648.65
      squeezenet_ssd  min =  280.60  max =  281.50  avg =  281.02
 squeezenet_ssd_int8  min =  359.41  max =  362.07  avg =  360.66
       mobilenet_ssd  min =  319.11  max =  319.29  avg =  319.21
  mobilenet_ssd_int8  min =  272.16  max =  273.36  avg =  272.83
      mobilenet_yolo  min =  607.07  max =  607.38  avg =  607.21
  mobilenetv2_yolov3  min =  326.66  max =  326.95  avg =  326.80
         yolov4-tiny  min =  449.56  max =  450.45  avg =  450.04
           nanodet_m  min =  142.09  max =  142.54  avg =  142.32
    yolo-fastest-1.1  min =   63.74  max =   63.80  avg =   63.78
      yolo-fastestv2  min =   57.56  max =   58.17  avg =   57.97

rk3588_s:/data/local/tmp # ./benchncnn 8 1 2 0 0
[0 Mali-G610]  queueC=0[2]  queueG=0[2]  queueT=0[2]
[0 Mali-G610]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 Mali-G610]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 Mali-G610]  subgroup=16  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 8
num_threads = 1
powersave = 2
gpu_device = 0
cooling_down = 0
          squeezenet  min =    7.09  max =    7.20  avg =    7.13
           mobilenet  min =    9.16  max =    9.32  avg =    9.22
        mobilenet_v2  min =   10.18  max =   10.32  avg =   10.25
        mobilenet_v3  min =    8.01  max =    8.09  avg =    8.04
          shufflenet  min =    5.88  max =    5.93  avg =    5.89
       shufflenet_v2  min =    6.30  max =    6.33  avg =    6.31
             mnasnet  min =    7.91  max =    8.00  avg =    7.95
     proxylessnasnet  min =   11.20  max =   11.42  avg =   11.30
        regnety_400m  min =   11.65  max =   11.84  avg =   11.74
           blazeface  min =    2.50  max =    2.59  avg =    2.53
           googlenet  min =   17.69  max =   17.78  avg =   17.74
            resnet18  min =   16.04  max =   16.39  avg =   16.25
             alexnet  min =   15.47  max =   15.66  avg =   15.56
               vgg16  min =   64.74  max =   65.42  avg =   65.04
            resnet50  min =   37.83  max =   38.31  avg =   38.12
      squeezenet_ssd  min =   23.14  max =   23.44  avg =   23.26
       mobilenet_ssd  min =   22.48  max =   23.01  avg =   22.74
      mobilenet_yolo  min =   40.08  max =   40.72  avg =   40.32
  mobilenetv2_yolov3  min =   31.88  max =   32.57  avg =   32.12
         yolov4-tiny  min =   49.64  max =   50.73  avg =   50.13
           nanodet_m  min =   10.60  max =   10.70  avg =   10.64
    yolo-fastest-1.1  min =    7.63  max =    7.66  avg =    7.64
      yolo-fastestv2  min =    6.99  max =    7.02  avg =    7.00
```

### Station-M3/ROC-RK3588S-PC, Rockchip RK3588S (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz + Mali-G610) StationOS (Android)

```
roc_rk3588s_pc:/data/local/tmp # ./benchncnn 10 1 0 0 0
./benchncnn 10 1 0 0 0
[0 Mali-G610]  queueC=0[2]  queueG=0[2]  queueT=0[2]
[0 Mali-G610]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 Mali-G610]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 Mali-G610]  subgroup=16  basic/vote/ballot/shuffle=1/1/1/1
[0 Mali-G610]  fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =    7.83  max =   14.17  avg =    9.76
     squeezenet_int8  min =   13.41  max =   13.52  avg =   13.45
           mobilenet  min =    8.73  max =    9.68  avg =    9.07
      mobilenet_int8  min =   17.70  max =   17.89  avg =   17.80
        mobilenet_v2  min =   10.73  max =   21.20  avg =   18.93
        mobilenet_v3  min =    9.00  max =   13.36  avg =   10.64
          shufflenet  min =    7.79  max =    7.93  avg =    7.85
       shufflenet_v2  min =    8.01  max =    8.06  avg =    8.03
             mnasnet  min =    7.43  max =    8.71  avg =    8.28
     proxylessnasnet  min =   10.56  max =   12.07  avg =   11.70
     efficientnet_b0  min =    2.15  max =    2.19  avg =    2.17
   efficientnetv2_b0  min =    0.56  max =    0.62  avg =    0.57
        regnety_400m  min =    1.65  max =    1.69  avg =    1.67
           blazeface  min =    0.76  max =    0.79  avg =    0.78
           googlenet  min =    1.53  max =    1.60  avg =    1.56
      googlenet_int8  min =   60.85  max =   61.01  avg =   60.93
            resnet18  min =    0.63  max =    0.82  avg =    0.65
       resnet18_int8  min =   64.60  max =   65.13  avg =   64.78
             alexnet  min =    0.35  max =    0.40  avg =    0.37
               vgg16  min =    0.54  max =    0.60  avg =    0.56
          vgg16_int8  min =  445.21  max =  562.09  avg =  537.10
            resnet50  min =    0.95  max =    0.97  avg =    0.96
       resnet50_int8  min =  113.02  max =  113.38  avg =  113.17
      squeezenet_ssd  min =    1.94  max =    2.00  avg =    1.96
 squeezenet_ssd_int8  min =   52.09  max =   56.93  avg =   56.35
       mobilenet_ssd  min =    1.19  max =    1.26  avg =    1.21
  mobilenet_ssd_int8  min =   44.33  max =   44.87  avg =   44.66
      mobilenet_yolo  min =    1.05  max =    1.24  avg =    1.13
  mobilenetv2_yolov3  min =    1.18  max =    1.25  avg =    1.21
         yolov4-tiny  min =    0.78  max =    0.80  avg =    0.78
           nanodet_m  min =    3.43  max =    3.80  avg =    3.57
    yolo-fastest-1.1  min =    1.43  max =    1.50  avg =    1.47
      yolo-fastestv2  min =    2.03  max =    2.10  avg =    2.05
  vision_transformer  min =    0.32  max =    0.36  avg =    0.35
          FastestDet  min =    1.90  max =    1.95  avg =    1.93

roc_rk3588s_pc:/data/local/tmp # ./benchncnn 10 1 0 -1 0
./benchncnn 10 1 0 -1 0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   13.36  max =   13.50  avg =   13.40
     squeezenet_int8  min =   16.22  max =   16.34  avg =   16.30
           mobilenet  min =   22.41  max =   22.49  avg =   22.44
      mobilenet_int8  min =   17.76  max =   17.94  avg =   17.84
        mobilenet_v2  min =   17.60  max =   17.80  avg =   17.70
        mobilenet_v3  min =   13.55  max =   13.70  avg =   13.61
          shufflenet  min =    7.91  max =    7.95  avg =    7.93
       shufflenet_v2  min =    8.36  max =    8.40  avg =    8.38
             mnasnet  min =   14.50  max =   14.60  avg =   14.56
     proxylessnasnet  min =   16.99  max =   17.12  avg =   17.06
     efficientnet_b0  min =   26.55  max =   26.78  avg =   26.62
   efficientnetv2_b0  min =   46.96  max =   47.44  avg =   47.30
        regnety_400m  min =   18.53  max =   18.63  avg =   18.58
           blazeface  min =    2.98  max =    3.02  avg =    3.00
           googlenet  min =   62.69  max =   63.14  avg =   62.90
      googlenet_int8  min =   60.86  max =   61.54  avg =   61.05
            resnet18  min =   30.34  max =   31.39  avg =   31.22
       resnet18_int8  min =   57.42  max =   57.67  avg =   57.56
             alexnet  min =   40.81  max =   40.87  avg =   40.84
               vgg16  min =  192.71  max =  195.20  avg =  194.26
          vgg16_int8  min =  450.95  max =  534.38  avg =  482.27
            resnet50  min =  105.11  max =  105.64  avg =  105.30
       resnet50_int8  min =  105.94  max =  132.01  avg =  116.48
      squeezenet_ssd  min =   51.36  max =   51.59  avg =   51.51
 squeezenet_ssd_int8  min =   69.01  max =   69.83  avg =   69.37
       mobilenet_ssd  min =   53.19  max =   55.24  avg =   53.50
  mobilenet_ssd_int8  min =   44.49  max =   44.98  avg =   44.74
      mobilenet_yolo  min =  112.65  max =  113.28  avg =  112.94
  mobilenetv2_yolov3  min =   63.38  max =   63.83  avg =   63.55
         yolov4-tiny  min =   77.57  max =   78.20  avg =   77.90
           nanodet_m  min =   25.21  max =   25.81  avg =   25.58
    yolo-fastest-1.1  min =    8.76  max =    8.84  avg =    8.80
      yolo-fastestv2  min =    8.46  max =    8.53  avg =    8.50
  vision_transformer  min = 1499.53  max = 1501.32  avg = 1500.50
          FastestDet  min =    7.04  max =    7.08  avg =    7.06
```

### Station P2, Rockchip RK3568 (Cortex-A55 2.0GHz x 4)

```
./benchncnn 4 4 0 -1 1
loop_count = 4
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   26.02  max =   27.15  avg =   26.74
     squeezenet_int8  min =   44.69  max =   45.70  avg =   45.24
           mobilenet  min =   32.63  max =   33.49  avg =   33.10
      mobilenet_int8  min =   44.23  max =   45.86  avg =   44.99
        mobilenet_v2  min =   31.59  max =   32.02  avg =   31.86
        mobilenet_v3  min =   25.71  max =   26.44  avg =   26.10
          shufflenet  min =   22.12  max =   23.17  avg =   22.52
       shufflenet_v2  min =   17.84  max =   18.21  avg =   17.96
             mnasnet  min =   28.26  max =   28.70  avg =   28.45
     proxylessnasnet  min =   31.96  max =   32.25  avg =   32.13
     efficientnet_b0  min =   53.17  max =   54.48  avg =   53.60
   efficientnetv2_b0  min =   70.08  max =   70.69  avg =   70.30
        regnety_400m  min =   40.80  max =   41.79  avg =   41.10
           blazeface  min =   10.79  max =   11.57  avg =   11.11
           googlenet  min =   83.66  max =   92.22  avg =   86.23
      googlenet_int8  min =  116.44  max =  118.34  avg =  117.08
            resnet18  min =   61.38  max =   62.52  avg =   61.94
       resnet18_int8  min =   95.58  max =   96.93  avg =   96.28
             alexnet  min =   69.90  max =   70.59  avg =   70.19
               vgg16  min =  334.24  max =  343.89  avg =  337.24
          vgg16_int8  min =  464.88  max =  474.71  avg =  468.29
            resnet50  min =  141.65  max =  146.23  avg =  143.78
       resnet50_int8  min =  230.36  max =  254.75  avg =  241.24
      squeezenet_ssd  min =   98.38  max =  104.60  avg =  100.50
 squeezenet_ssd_int8  min =  134.73  max =  137.88  avg =  136.12
       mobilenet_ssd  min =   77.48  max =   79.92  avg =   78.64
  mobilenet_ssd_int8  min =  101.44  max =  102.61  avg =  102.06
      mobilenet_yolo  min =  149.12  max =  150.14  avg =  149.76
  mobilenetv2_yolov3  min =  103.71  max =  107.81  avg =  105.69
         yolov4-tiny  min =  145.75  max =  149.35  avg =  147.09
           nanodet_m  min =   52.91  max =   54.06  avg =   53.53

./benchncnn 4 2 0 -1 1
loop_count = 4
num_threads = 2
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   33.78  max =   34.38  avg =   34.16
     squeezenet_int8  min =   61.66  max =   62.11  avg =   61.85
           mobilenet  min =   46.53  max =   46.74  avg =   46.62
      mobilenet_int8  min =   71.06  max =   71.76  avg =   71.38
        mobilenet_v2  min =   39.05  max =   39.38  avg =   39.19
        mobilenet_v3  min =   32.20  max =   32.47  avg =   32.29
          shufflenet  min =   27.13  max =   27.40  avg =   27.27
       shufflenet_v2  min =   23.38  max =   23.92  avg =   23.62
             mnasnet  min =   35.51  max =   35.73  avg =   35.62
     proxylessnasnet  min =   42.98  max =   43.16  avg =   43.06
     efficientnet_b0  min =   75.34  max =   75.79  avg =   75.61
   efficientnetv2_b0  min =  107.34  max =  107.83  avg =  107.60
        regnety_400m  min =   47.91  max =   48.20  avg =   48.02
           blazeface  min =   16.38  max =   16.63  avg =   16.49
           googlenet  min =  124.27  max =  125.24  avg =  124.65
      googlenet_int8  min =  177.78  max =  178.39  avg =  178.06
            resnet18  min =   82.02  max =   82.70  avg =   82.38
       resnet18_int8  min =  148.06  max =  149.03  avg =  148.39
             alexnet  min =  105.20  max =  105.91  avg =  105.54
               vgg16  min =  459.65  max =  464.94  avg =  462.02
          vgg16_int8  min =  737.54  max =  750.64  avg =  742.90
            resnet50  min =  204.44  max =  205.20  avg =  204.84
       resnet50_int8  min =  364.47  max =  366.04  avg =  365.53
      squeezenet_ssd  min =  124.42  max =  128.01  avg =  125.80
 squeezenet_ssd_int8  min =  179.29  max =  183.83  avg =  181.43
       mobilenet_ssd  min =  113.85  max =  115.50  avg =  114.41
  mobilenet_ssd_int8  min =  161.35  max =  162.38  avg =  161.71
      mobilenet_yolo  min =  214.95  max =  216.62  avg =  215.72
  mobilenetv2_yolov3  min =  134.23  max =  136.26  avg =  135.07
         yolov4-tiny  min =  194.72  max =  195.49  avg =  195.18
           nanodet_m  min =   67.67  max =   68.09  avg =   67.90

./benchncnn 4 1 0 -1 1
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   54.31  max =   55.65  avg =   55.00
     squeezenet_int8  min =  103.96  max =  106.28  avg =  104.92
           mobilenet  min =   79.02  max =   79.46  avg =   79.25
      mobilenet_int8  min =  130.06  max =  130.61  avg =  130.36
        mobilenet_v2  min =   60.15  max =   60.66  avg =   60.31
        mobilenet_v3  min =   49.40  max =   49.57  avg =   49.49
          shufflenet  min =   39.39  max =   39.78  avg =   39.60
       shufflenet_v2  min =   35.48  max =   35.70  avg =   35.62
             mnasnet  min =   55.38  max =   56.10  avg =   55.71
     proxylessnasnet  min =   70.29  max =   70.48  avg =   70.35
     efficientnet_b0  min =  128.56  max =  129.96  avg =  129.26
   efficientnetv2_b0  min =  181.00  max =  181.56  avg =  181.24
        regnety_400m  min =   67.15  max =   69.62  avg =   67.95
           blazeface  min =   26.07  max =   26.58  avg =   26.33
           googlenet  min =  219.19  max =  221.32  avg =  220.01
      googlenet_int8  min =  317.62  max =  319.40  avg =  318.37
            resnet18  min =  135.33  max =  136.94  avg =  135.88
       resnet18_int8  min =  264.69  max =  265.51  avg =  265.16
             alexnet  min =  190.54  max =  193.50  avg =  191.88
               vgg16  min =  790.99  max =  809.24  avg =  795.85
          vgg16_int8  min = 1354.48  max = 1358.89  avg = 1357.40
            resnet50  min =  358.08  max =  362.96  avg =  360.29
       resnet50_int8  min =  667.92  max =  670.40  avg =  668.78
      squeezenet_ssd  min =  193.15  max =  194.02  avg =  193.49
 squeezenet_ssd_int8  min =  291.42  max =  294.70  avg =  293.16
       mobilenet_ssd  min =  189.54  max =  190.28  avg =  189.97
  mobilenet_ssd_int8  min =  289.94  max =  290.40  avg =  290.28
      mobilenet_yolo  min =  370.37  max =  384.69  avg =  375.11
  mobilenetv2_yolov3  min =  210.93  max =  211.70  avg =  211.40
         yolov4-tiny  min =  309.11  max =  310.74  avg =  309.89
           nanodet_m  min =  100.42  max =  112.25  avg =  103.66
```

### Rock3A, Rockchip RK3568 (Cortex-A55 2.0GHz x 4) Ubuntu 20.04

```
rock@rock3a:~/ncnn/build/benchmark$ ./benchncnn 8 4 0 -1 1
loop_count = 8
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   29.52  max =   30.30  avg =   29.76
     squeezenet_int8  min =   35.40  max =   36.19  avg =   35.88
           mobilenet  min =   34.47  max =   35.44  avg =   34.84
      mobilenet_int8  min =   34.19  max =   34.53  avg =   34.40
        mobilenet_v2  min =   35.75  max =   36.09  avg =   35.88
        mobilenet_v3  min =   28.12  max =   28.82  avg =   28.49
          shufflenet  min =   23.62  max =   24.08  avg =   23.84
       shufflenet_v2  min =   19.37  max =   19.64  avg =   19.52
             mnasnet  min =   30.84  max =   31.45  avg =   31.02
     proxylessnasnet  min =   35.73  max =   36.07  avg =   35.90
     efficientnet_b0  min =   48.16  max =   49.29  avg =   48.64
   efficientnetv2_b0  min =   66.62  max =   67.11  avg =   66.85
        regnety_400m  min =   41.11  max =   41.64  avg =   41.34
           blazeface  min =   12.38  max =   12.64  avg =   12.56
           googlenet  min =   86.73  max =   87.79  avg =   87.11
      googlenet_int8  min =  101.42  max =  103.87  avg =  102.55
            resnet18  min =   64.85  max =   65.84  avg =   65.23
       resnet18_int8  min =   93.55  max =   94.54  avg =   94.03
             alexnet  min =   70.89  max =   73.58  avg =   71.57
               vgg16  min =  356.13  max =  358.52  avg =  357.15
          vgg16_int8  min =  521.92  max =  524.13  avg =  523.11
            resnet50  min =  147.65  max =  150.33  avg =  148.52
       resnet50_int8  min =  191.94  max =  192.73  avg =  192.30
      squeezenet_ssd  min =  104.32  max =  105.75  avg =  105.00
 squeezenet_ssd_int8  min =  125.97  max =  127.53  avg =  126.70
       mobilenet_ssd  min =   82.29  max =   82.65  avg =   82.47
  mobilenet_ssd_int8  min =   79.26  max =   80.93  avg =   79.72
      mobilenet_yolo  min =  165.51  max =  165.86  avg =  165.72
  mobilenetv2_yolov3  min =  116.11  max =  116.83  avg =  116.43
         yolov4-tiny  min =  152.09  max =  153.39  avg =  152.60
           nanodet_m  min =   53.63  max =   54.14  avg =   53.92

rock@rock3a:~/ncnn/build/benchmark$ ./benchncnn 4 1 0 -1 1
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   62.47  max =   63.04  avg =   62.84
     squeezenet_int8  min =   67.23  max =   68.48  avg =   67.93
           mobilenet  min =   85.27  max =   85.69  avg =   85.49
      mobilenet_int8  min =   75.00  max =   75.48  avg =   75.26
        mobilenet_v2  min =   68.41  max =   69.09  avg =   68.76
        mobilenet_v3  min =   54.19  max =   54.52  avg =   54.34
          shufflenet  min =   45.90  max =   46.30  avg =   46.09
       shufflenet_v2  min =   39.64  max =   40.07  avg =   39.91
             mnasnet  min =   62.16  max =   62.41  avg =   62.30
     proxylessnasnet  min =   80.79  max =   81.41  avg =   81.12
     efficientnet_b0  min =  113.47  max =  113.68  avg =  113.57
   efficientnetv2_b0  min =  167.30  max =  167.58  avg =  167.44
        regnety_400m  min =   72.12  max =   72.24  avg =   72.17
           blazeface  min =   31.89  max =   32.04  avg =   31.95
           googlenet  min =  224.27  max =  224.86  avg =  224.55
      googlenet_int8  min =  240.02  max =  240.93  avg =  240.45
            resnet18  min =  150.25  max =  150.69  avg =  150.47
       resnet18_int8  min =  226.70  max =  228.19  avg =  227.56
             alexnet  min =  197.44  max =  199.16  avg =  198.17
               vgg16  min =  859.80  max =  860.79  avg =  860.35
          vgg16_int8  min = 1409.66  max = 1411.92  avg = 1411.07
            resnet50  min =  381.04  max =  382.73  avg =  381.86
       resnet50_int8  min =  441.78  max =  445.00  avg =  443.29
      squeezenet_ssd  min =  208.14  max =  208.67  avg =  208.41
 squeezenet_ssd_int8  min =  248.82  max =  250.80  avg =  249.89
       mobilenet_ssd  min =  200.95  max =  201.21  avg =  201.06
  mobilenet_ssd_int8  min =  173.81  max =  174.54  avg =  174.28
      mobilenet_yolo  min =  394.65  max =  395.00  avg =  394.78
  mobilenetv2_yolov3  min =  231.80  max =  232.27  avg =  232.08
         yolov4-tiny  min =  321.31  max =  322.43  avg =  321.79
           nanodet_m  min =  103.81  max =  104.61  avg =  104.25
```

### Station-M2/ROC-RK3566-PC, Rockchip RK3566 (Cortex-A55 1.8GHz x 4 + Mali-G52) StationOS (Android)

```
rk3566_roc_pc:/data/local/tmp # ./benchncnn 10 1 0 0 0
./benchncnn 10 1 0 0 0
[0 Mali-G52]  queueC=0[2]  queueG=0[2]  queueT=0[2]
[0 Mali-G52]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=1
[0 Mali-G52]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 Mali-G52]  subgroup=8  basic/vote/ballot/shuffle=1/1/1/1
[0 Mali-G52]  fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =   43.67  max =   44.15  avg =   43.82
     squeezenet_int8  min =   62.72  max =   63.99  avg =   63.49
           mobilenet  min =   74.32  max =   74.82  avg =   74.58
      mobilenet_int8  min =   64.42  max =   65.43  avg =   64.89
        mobilenet_v2  min =   52.96  max =   53.23  avg =   53.09
        mobilenet_v3  min =   51.55  max =   53.12  avg =   51.96
          shufflenet  min =   40.73  max =   41.28  avg =   40.98
       shufflenet_v2  min =   41.56  max =   43.62  avg =   42.22
             mnasnet  min =   54.37  max =   54.63  avg =   54.52
     proxylessnasnet  min =   57.91  max =   59.38  avg =   58.36
     efficientnet_b0  min =   38.40  max =   40.29  avg =   39.06
   efficientnetv2_b0  min =   36.91  max =   38.45  avg =   37.72
        regnety_400m  min =   69.07  max =   69.98  avg =   69.40
           blazeface  min =   12.26  max =   13.08  avg =   12.57
           googlenet  min =  147.08  max =  147.80  avg =  147.48
      googlenet_int8  min =  221.94  max =  225.99  avg =  223.12
            resnet18  min =  137.90  max =  138.50  avg =  138.19
       resnet18_int8  min =  187.84  max =  190.88  avg =  188.81
             alexnet  min =  167.56  max =  168.92  avg =  168.17
               vgg16  min =  713.42  max =  715.20  avg =  714.51
          vgg16_int8  min = 1279.97  max = 1302.95  avg = 1294.59
            resnet50  min =  369.74  max =  375.95  avg =  372.60
       resnet50_int8  min =  391.86  max =  397.49  avg =  395.17
      squeezenet_ssd  min =  155.18  max =  156.09  avg =  155.62
 squeezenet_ssd_int8  min =  218.83  max =  222.64  avg =  221.11
       mobilenet_ssd  min =  161.62  max =  163.22  avg =  162.27
  mobilenet_ssd_int8  min =  147.33  max =  149.16  avg =  148.23
      mobilenet_yolo  min =  344.09  max =  349.15  avg =  346.73
  mobilenetv2_yolov3  min =  168.72  max =  169.64  avg =  169.22
         yolov4-tiny  min =  239.44  max =  241.11  avg =  240.00
           nanodet_m  min =   88.06  max =   89.89  avg =   88.87
    yolo-fastest-1.1  min =   36.05  max =   37.86  avg =   36.47
      yolo-fastestv2  min =   34.80  max =   36.58  avg =   35.37
  vision_transformer  min =  356.42  max =  359.37  avg =  358.03
          FastestDet  min =   38.03  max =   38.52  avg =   38.24

rk3566_roc_pc:/data/local/tmp # ./benchncnn 10 1 0 -1 0
./benchncnn 10 1 0 -1 0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   47.01  max =   48.12  avg =   47.62
     squeezenet_int8  min =   63.30  max =   64.10  avg =   63.74
           mobilenet  min =   70.24  max =   71.52  avg =   70.63
      mobilenet_int8  min =   63.90  max =   65.25  avg =   64.41
        mobilenet_v2  min =   55.75  max =   56.26  avg =   56.02
        mobilenet_v3  min =   45.56  max =   46.47  avg =   46.17
          shufflenet  min =   34.16  max =   35.16  avg =   34.64
       shufflenet_v2  min =   32.58  max =   33.86  avg =   33.25
             mnasnet  min =   52.43  max =   53.15  avg =   52.80
     proxylessnasnet  min =   65.55  max =   67.04  avg =   66.36
     efficientnet_b0  min =   82.52  max =   82.97  avg =   82.64
   efficientnetv2_b0  min =  148.90  max =  150.47  avg =  149.64
        regnety_400m  min =   63.33  max =   64.29  avg =   63.70
           blazeface  min =   11.55  max =   12.35  avg =   11.77
           googlenet  min =  205.85  max =  208.74  avg =  207.17
      googlenet_int8  min =  222.72  max =  225.84  avg =  223.98
            resnet18  min =  134.19  max =  136.81  avg =  135.39
       resnet18_int8  min =  187.26  max =  189.45  avg =  188.36
             alexnet  min =  143.01  max =  144.97  avg =  143.42
               vgg16  min =  829.44  max =  839.46  avg =  835.37
          vgg16_int8  min = 1299.25  max = 1306.89  avg = 1301.71
            resnet50  min =  326.54  max =  330.21  avg =  328.27
       resnet50_int8  min =  391.67  max =  395.59  avg =  393.27
      squeezenet_ssd  min =  166.12  max =  168.33  avg =  167.08
 squeezenet_ssd_int8  min =  221.82  max =  223.85  avg =  222.69
       mobilenet_ssd  min =  163.17  max =  166.55  avg =  164.11
  mobilenet_ssd_int8  min =  146.16  max =  148.20  avg =  147.41
      mobilenet_yolo  min =  335.15  max =  338.32  avg =  336.66
  mobilenetv2_yolov3  min =  193.18  max =  195.51  avg =  194.33
         yolov4-tiny  min =  288.82  max =  292.16  avg =  290.36
           nanodet_m  min =   98.31  max =  100.30  avg =   99.20
    yolo-fastest-1.1  min =   37.73  max =   38.97  avg =   38.40
      yolo-fastestv2  min =   36.21  max =   37.90  avg =   37.13
  vision_transformer  min = 7385.59  max = 7410.59  avg = 7402.20
          FastestDet  min =   34.55  max =   35.42  avg =   35.06
```

### Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4)

```
nanopc-t4:/data/local/tmp # ./benchncnn 8 2 2 -1 1
loop_count = 8
num_threads = 2
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =   43.73  max =   44.30  avg =   43.97
     squeezenet_int8  min =   37.92  max =   38.39  avg =   38.09
           mobilenet  min =   64.28  max =   66.66  avg =   65.14
      mobilenet_int8  min =   43.17  max =   43.73  avg =   43.38
        mobilenet_v2  min =   51.30  max =   52.18  avg =   51.75
        mobilenet_v3  min =   41.51  max =   43.25  avg =   42.10
          shufflenet  min =   27.43  max =   28.27  avg =   27.75
       shufflenet_v2  min =   24.96  max =   25.79  avg =   25.55
             mnasnet  min =   45.44  max =   46.95  avg =   46.16
     proxylessnasnet  min =   51.98  max =   53.52  avg =   52.48
     efficientnet_b0  min =   83.79  max =   84.68  avg =   84.27
   efficientnetv2_b0  min =   97.89  max =   99.27  avg =   98.55
        regnety_400m  min =   65.15  max =   65.89  avg =   65.41
           blazeface  min =    8.74  max =    8.89  avg =    8.80
           googlenet  min =  131.46  max =  140.16  avg =  133.24
      googlenet_int8  min =  115.72  max =  118.34  avg =  116.60
            resnet18  min =  111.77  max =  113.18  avg =  112.37
       resnet18_int8  min =   84.27  max =   84.90  avg =   84.49
             alexnet  min =  105.74  max =  109.87  avg =  107.15
               vgg16  min =  619.88  max =  634.59  avg =  629.15
          vgg16_int8  min =  447.14  max =  451.09  avg =  448.53
            resnet50  min =  291.51  max =  296.55  avg =  293.08
       resnet50_int8  min =  224.09  max =  227.03  avg =  225.02
      squeezenet_ssd  min =  109.72  max =  112.09  avg =  110.78
 squeezenet_ssd_int8  min =   93.41  max =   94.83  avg =   93.97
       mobilenet_ssd  min =  131.30  max =  132.82  avg =  131.94
  mobilenet_ssd_int8  min =   87.52  max =   88.89  avg =   88.35
      mobilenet_yolo  min =  288.02  max =  289.84  avg =  288.61
  mobilenetv2_yolov3  min =  168.45  max =  170.94  avg =  169.79
         yolov4-tiny  min =  217.45  max =  226.39  avg =  219.76
           nanodet_m  min =   65.74  max =   66.84  avg =   66.49
    yolo-fastest-1.1  min =   32.91  max =   33.74  avg =   33.37
      yolo-fastestv2  min =   28.90  max =   37.31  avg =   30.27

nanopc-t4:/data/local/tmp # ./benchncnn 8 1 2 -1 1
loop_count = 8
num_threads = 1
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =   71.35  max =   73.02  avg =   71.83
     squeezenet_int8  min =   60.39  max =   60.96  avg =   60.69
           mobilenet  min =  111.12  max =  113.02  avg =  111.99
      mobilenet_int8  min =   80.14  max =   81.59  avg =   81.00
        mobilenet_v2  min =   78.18  max =   80.89  avg =   79.18
        mobilenet_v3  min =   63.49  max =   64.26  avg =   63.90
          shufflenet  min =   38.90  max =   40.28  avg =   39.26
       shufflenet_v2  min =   37.72  max =   38.45  avg =   38.02
             mnasnet  min =   72.34  max =   73.59  avg =   72.87
     proxylessnasnet  min =   87.33  max =   89.70  avg =   88.45
     efficientnet_b0  min =  145.14  max =  146.77  avg =  145.93
   efficientnetv2_b0  min =  169.33  max =  171.16  avg =  170.16
        regnety_400m  min =   99.08  max =   99.80  avg =   99.47
           blazeface  min =   12.28  max =   12.69  avg =   12.48
           googlenet  min =  228.18  max =  229.36  avg =  228.64
      googlenet_int8  min =  201.62  max =  203.71  avg =  202.25
            resnet18  min =  175.71  max =  180.53  avg =  176.85
       resnet18_int8  min =  151.42  max =  152.45  avg =  151.83
             alexnet  min =  160.81  max =  186.24  avg =  165.30
               vgg16  min = 1044.34  max = 1080.88  avg = 1062.34
          vgg16_int8  min =  844.53  max =  851.71  avg =  848.65
            resnet50  min =  503.25  max =  505.20  avg =  504.18
       resnet50_int8  min =  397.71  max =  400.19  avg =  398.63
      squeezenet_ssd  min =  162.98  max =  165.97  avg =  164.34
 squeezenet_ssd_int8  min =  145.93  max =  148.59  avg =  146.94
       mobilenet_ssd  min =  226.54  max =  229.80  avg =  227.80
  mobilenet_ssd_int8  min =  159.97  max =  163.18  avg =  161.06
      mobilenet_yolo  min =  512.90  max =  517.47  avg =  515.06
  mobilenetv2_yolov3  min =  274.88  max =  280.24  avg =  276.36
         yolov4-tiny  min =  351.97  max =  358.70  avg =  355.60
           nanodet_m  min =   95.32  max =   97.83  avg =   96.28
    yolo-fastest-1.1  min =   43.47  max =   46.52  avg =   44.55
      yolo-fastestv2  min =   37.22  max =   37.63  avg =   37.45

nanopc-t4:/data/local/tmp # ./benchncnn 8 4 1 -1 1
loop_count = 8
num_threads = 4
powersave = 1
gpu_device = -1
cooling_down = 1
          squeezenet  min =   48.11  max =   48.51  avg =   48.24
     squeezenet_int8  min =   43.19  max =   44.17  avg =   43.40
           mobilenet  min =   65.47  max =   66.40  avg =   65.68
      mobilenet_int8  min =   49.15  max =   51.65  avg =   49.76
        mobilenet_v2  min =   53.60  max =   54.19  avg =   53.87
        mobilenet_v3  min =   52.83  max =   92.92  avg =   66.25
          shufflenet  min =   35.71  max =   36.03  avg =   35.83
       shufflenet_v2  min =   31.88  max =   32.38  avg =   32.16
             mnasnet  min =   51.59  max =   54.01  avg =   52.30
     proxylessnasnet  min =   60.11  max =   60.40  avg =   60.24
     efficientnet_b0  min =   98.22  max =   99.40  avg =   98.56
   efficientnetv2_b0  min =  114.19  max =  123.90  avg =  115.89
        regnety_400m  min =   85.89  max =   86.20  avg =   86.03
           blazeface  min =   11.23  max =   11.37  avg =   11.31
           googlenet  min =  142.25  max =  160.88  avg =  145.26
      googlenet_int8  min =  125.45  max =  128.50  avg =  125.96
            resnet18  min =  116.68  max =  118.26  avg =  117.00
       resnet18_int8  min =   88.43  max =   90.95  avg =   89.08
             alexnet  min =  150.91  max =  160.01  avg =  152.51
               vgg16  min =  674.91  max =  684.83  avg =  679.08
          vgg16_int8  min =  417.60  max =  422.52  avg =  419.60
            resnet50  min =  297.23  max =  299.37  avg =  298.03
       resnet50_int8  min =  243.99  max =  251.39  avg =  245.99
      squeezenet_ssd  min =  127.92  max =  128.53  avg =  128.17
 squeezenet_ssd_int8  min =  112.54  max =  114.63  avg =  113.19
       mobilenet_ssd  min =  136.43  max =  140.14  avg =  137.33
  mobilenet_ssd_int8  min =  102.14  max =  105.00  avg =  102.77
      mobilenet_yolo  min =  291.45  max =  294.04  avg =  292.63
  mobilenetv2_yolov3  min =  183.13  max =  187.00  avg =  184.05
         yolov4-tiny  min =  257.46  max =  268.76  avg =  260.49
           nanodet_m  min =   83.16  max =   91.03  avg =   84.77
    yolo-fastest-1.1  min =   43.53  max =   43.87  avg =   43.74
      yolo-fastestv2  min =   35.04  max =   35.54  avg =   35.17

nanopc-t4:/data/local/tmp # ./benchncnn 8 1 1 -1 1
loop_count = 8
num_threads = 1
powersave = 1
gpu_device = -1
cooling_down = 1
          squeezenet  min =  129.63  max =  130.58  avg =  129.85
     squeezenet_int8  min =  124.10  max =  126.34  avg =  124.81
           mobilenet  min =  207.92  max =  208.72  avg =  208.41
      mobilenet_int8  min =  175.55  max =  176.11  avg =  175.84
        mobilenet_v2  min =  143.02  max =  143.56  avg =  143.25
        mobilenet_v3  min =  133.11  max =  134.05  avg =  133.33
          shufflenet  min =   77.97  max =   78.54  avg =   78.19
       shufflenet_v2  min =   75.59  max =   76.05  avg =   75.82
             mnasnet  min =  139.86  max =  141.77  avg =  140.19
     proxylessnasnet  min =  178.57  max =  179.57  avg =  179.03
     efficientnet_b0  min =  316.10  max =  317.82  avg =  316.86
   efficientnetv2_b0  min =  359.26  max =  362.03  avg =  360.31
        regnety_400m  min =  182.64  max =  183.03  avg =  182.82
           blazeface  min =   25.81  max =   26.53  avg =   26.20
           googlenet  min =  448.45  max =  450.80  avg =  449.35
      googlenet_int8  min =  406.07  max =  410.65  avg =  408.04
            resnet18  min =  351.64  max =  362.12  avg =  354.19
       resnet18_int8  min =  298.10  max =  300.45  avg =  299.26
             alexnet  min =  586.92  max =  588.73  avg =  587.80
               vgg16  min = 2170.12  max = 2202.80  avg = 2183.32
          vgg16_int8  min = 1533.65  max = 1542.01  avg = 1537.33
            resnet50  min =  975.40  max =  977.79  avg =  976.61
       resnet50_int8  min =  851.59  max =  855.22  avg =  853.75
      squeezenet_ssd  min =  306.35  max =  307.54  avg =  306.96
 squeezenet_ssd_int8  min =  291.32  max =  292.87  avg =  292.18
       mobilenet_ssd  min =  423.70  max =  424.63  avg =  424.11
  mobilenet_ssd_int8  min =  358.62  max =  359.42  avg =  359.04
      mobilenet_yolo  min =  928.06  max =  929.25  avg =  928.55
  mobilenetv2_yolov3  min =  496.96  max =  499.29  avg =  497.73
         yolov4-tiny  min =  712.80  max =  714.15  avg =  713.55
           nanodet_m  min =  179.42  max =  180.60  avg =  179.75
    yolo-fastest-1.1  min =   88.06  max =   88.85  avg =   88.35
      yolo-fastestv2  min =   68.68  max =   69.83  avg =   69.08

nanopc-t4:/data/local/tmp # ./benchncnn 4 1 2 0 0
[0 Mali-T860]  queueC=0[2]  queueG=0[2]  queueT=0[2]
[0 Mali-T860]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=1
[0 Mali-T860]  fp16-p/s/a=1/0/1  int8-p/s/a=1/0/0
[0 Mali-T860]  subgroup=0  basic=0  vote=0  ballot=0  shuffle=0
loop_count = 4
num_threads = 1
powersave = 2
gpu_device = 0
cooling_down = 0
          squeezenet  min =   24.57  max =   24.71  avg =   24.64
           mobilenet  min =   35.86  max =   36.14  avg =   36.04
        mobilenet_v2  min =   30.18  max =   30.19  avg =   30.19
        mobilenet_v3  min =   30.88  max =   31.12  avg =   31.01
          shufflenet  min =   33.90  max =   33.98  avg =   33.93
       shufflenet_v2  min =   29.10  max =   29.14  avg =   29.12
             mnasnet  min =   30.49  max =   30.59  avg =   30.53
     proxylessnasnet  min =   33.56  max =   33.61  avg =   33.59
     efficientnet_b0  min =   51.15  max =   51.54  avg =   51.38
   efficientnetv2_b0  min =   86.26  max =   87.36  avg =   86.91
        regnety_400m  min =   38.44  max =   38.54  avg =   38.49
           blazeface  min =    9.66  max =    9.74  avg =    9.70
           googlenet  min =   80.62  max =   80.96  avg =   80.81
            resnet18  min =   74.07  max =   74.36  avg =   74.23
             alexnet  min =   76.84  max =   77.26  avg =   77.08
               vgg16  min =  300.71  max =  300.89  avg =  300.80
            resnet50  min =  175.96  max =  176.72  avg =  176.23
      squeezenet_ssd  min =   71.20  max =   71.38  avg =   71.32
       mobilenet_ssd  min =   76.99  max =   77.47  avg =   77.19
      mobilenet_yolo  min =  160.41  max =  160.84  avg =  160.62
  mobilenetv2_yolov3  min =   91.31  max =   91.37  avg =   91.35
         yolov4-tiny  min =  130.78  max =  131.54  avg =  131.16
           nanodet_m  min =   55.90  max =   56.03  avg =   55.96
    yolo-fastest-1.1  min =   25.50  max =   25.66  avg =   25.59
      yolo-fastestv2  min =   24.94  max =   25.07  avg =   25.01
```

### MYIR RemiPi, Renesas RZG2L (Cortex-A55 1.5GHz x 2)

```
root@myir-remi-1g:~/ncnn# time ./benchncnn 10 4 0 -1 1
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   85.38  max =   87.72  avg =   86.78
     squeezenet_int8  min =   84.23  max =   86.46  avg =   85.59
           mobilenet  min =  121.01  max =  122.55  avg =  121.76
      mobilenet_int8  min =   95.64  max =   97.27  avg =   96.25
        mobilenet_v2  min =  101.35  max =  102.24  avg =  101.72
        mobilenet_v3  min =   84.09  max =   86.66  avg =   84.86
          shufflenet  min =   63.32  max =   65.16  avg =   64.53
       shufflenet_v2  min =   60.33  max =   62.35  avg =   61.04
             mnasnet  min =   95.51  max =   96.70  avg =   95.95
     proxylessnasnet  min =  124.46  max =  125.82  avg =  125.14
     efficientnet_b0  min =  144.94  max =  146.46  avg =  145.56
   efficientnetv2_b0  min =  182.87  max =  185.63  avg =  184.56
        regnety_400m  min =  105.31  max =  106.42  avg =  105.72
           blazeface  min =   21.34  max =   21.90  avg =   21.50
           googlenet  min =  313.01  max =  318.42  avg =  314.25
      googlenet_int8  min =  301.87  max =  304.93  avg =  303.66
            resnet18  min =  248.02  max =  253.93  avg =  250.12
       resnet18_int8  min =  244.65  max =  246.62  avg =  245.66
             alexnet  min =  204.00  max =  206.39  avg =  205.21
            resnet50  min =  583.13  max =  584.82  avg =  584.11
       resnet50_int8  min =  517.42  max =  520.97  avg =  519.07
      squeezenet_ssd  min =  266.63  max =  273.34  avg =  268.60
 squeezenet_ssd_int8  min =  255.42  max =  260.98  avg =  257.15
       mobilenet_ssd  min =  267.16  max =  270.41  avg =  268.20
  mobilenet_ssd_int8  min =  205.03  max =  206.43  avg =  205.53
      mobilenet_yolo  min =  571.08  max =  576.15  avg =  574.18
  mobilenetv2_yolov3  min =  342.52  max =  344.84  avg =  343.38
         yolov4-tiny  min =  499.74  max =  503.13  avg =  501.45
           nanodet_m  min =  161.87  max =  163.90  avg =  162.93
    yolo-fastest-1.1  min =   72.84  max =   74.81  avg =   73.35
      yolo-fastestv2  min =   68.24  max =   70.49  avg =   68.74
  vision_transformer  min = 12464.09  max = 12491.57  avg = 12475.63
          FastestDet  min =   67.92  max =   69.90  avg =   68.94
```

### OrangePi Zero 2, Allwinner H616 (Cortex-A53 1.5GHz x 4)

```
orangepi@zero2:~/ncnn/benchmark$ ./benchncnn 10 4 0 -1 1
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   76.25  max =   90.20  avg =   78.99
     squeezenet_int8  min =   59.92  max =   60.44  avg =   60.10
           mobilenet  min =  106.91  max =  132.22  avg =  109.99
      mobilenet_int8  min =   57.96  max =   59.06  avg =   58.19
        mobilenet_v2  min =   97.93  max =  124.48  avg =  100.91
        mobilenet_v3  min =   82.27  max =   83.93  avg =   83.00
          shufflenet  min =   55.27  max =   82.06  avg =   58.40
       shufflenet_v2  min =   44.94  max =   71.99  avg =   48.10
             mnasnet  min =   90.66  max =   91.41  avg =   90.92
     proxylessnasnet  min =   91.55  max =  118.74  avg =   94.71
     efficientnet_b0  min =  127.95  max =  155.13  avg =  131.25
   efficientnetv2_b0  min =  145.96  max =  173.67  avg =  149.36
        regnety_400m  min =  102.83  max =  103.52  avg =  103.08
           blazeface  min =   14.46  max =   14.95  avg =   14.77
           googlenet  min =  217.71  max =  244.16  avg =  221.38
      googlenet_int8  min =  163.04  max =  187.69  avg =  166.20
            resnet18  min =  251.45  max =  277.52  avg =  255.00
       resnet18_int8  min =  136.54  max =  161.95  avg =  141.60
             alexnet  min =  212.07  max =  233.27  avg =  215.34
               vgg16  min = 1206.92  max = 1981.79  avg = 1673.28
          vgg16_int8  min =  622.93  max =  702.12  avg =  661.83
            resnet50  min =  555.84  max =  643.69  avg =  576.17
       resnet50_int8  min =  348.11  max =  374.25  avg =  354.17
      squeezenet_ssd  min =  224.68  max =  251.32  avg =  230.59
 squeezenet_ssd_int8  min =  154.87  max =  182.66  avg =  159.08
       mobilenet_ssd  min =  238.49  max =  426.65  avg =  263.18
  mobilenet_ssd_int8  min =  118.36  max =  138.39  avg =  120.78
      mobilenet_yolo  min =  500.28  max =  615.83  avg =  553.59
  mobilenetv2_yolov3  min =  340.27  max =  369.13  avg =  347.17
         yolov4-tiny  min =  365.04  max =  408.48  avg =  383.93
           nanodet_m  min =  112.88  max =  141.85  avg =  116.13
    yolo-fastest-1.1  min =   72.05  max =   73.46  avg =   72.68
      yolo-fastestv2  min =   54.94  max =   55.35  avg =   55.15
  vision_transformer  min = 6842.19  max = 9125.07  avg = 7343.64
          FastestDet  min =   59.09  max =   59.87  avg =   59.35
```

### OrangePi4 LTS, Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4)
Tested on Ubuntu 22.04 with the GNOME desktop.
```
orangepi@orangepi4-lts:~/ncnn/benchmark$ ./benchncnn 10 6 0 -1 0
loop_count = 10
num_threads = 6
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   40.89  max =   50.29  avg =   45.15
     squeezenet_int8  min =   40.36  max =   48.57  avg =   43.56
           mobilenet  min =   55.81  max =   67.35  avg =   59.81
      mobilenet_int8  min =   39.96  max =   45.10  avg =   42.09
        mobilenet_v2  min =   53.29  max =   64.12  avg =   57.40
        mobilenet_v3  min =   38.94  max =   51.11  avg =   43.06
          shufflenet  min =   27.32  max =   38.53  avg =   31.85
       shufflenet_v2  min =   24.38  max =   31.17  avg =   28.32
             mnasnet  min =   47.02  max =   50.68  avg =   48.86
     proxylessnasnet  min =   52.31  max =   61.31  avg =   56.66
     efficientnet_b0  min =   68.14  max =   76.07  avg =   72.62
   efficientnetv2_b0  min =   77.23  max =   96.07  avg =   84.83
        regnety_400m  min =   60.81  max =   81.72  avg =   72.37
           blazeface  min =    7.24  max =    8.19  avg =    7.68
           googlenet  min =  122.99  max =  132.67  avg =  128.90
      googlenet_int8  min =  108.45  max =  121.17  avg =  115.37
            resnet18  min =  100.67  max =  115.30  avg =  107.65
       resnet18_int8  min =   80.17  max =   87.56  avg =   84.01
             alexnet  min =   71.00  max =   83.09  avg =   76.21
               vgg16  min =  557.67  max =  606.30  avg =  581.12
          vgg16_int8  min =  369.93  max =  393.20  avg =  384.86
            resnet50  min =  254.25  max =  272.90  avg =  265.18
       resnet50_int8  min =  220.70  max =  231.50  avg =  225.03
      squeezenet_ssd  min =  118.91  max =  131.52  avg =  123.91
 squeezenet_ssd_int8  min =   98.25  max =  116.42  avg =  110.13
       mobilenet_ssd  min =  126.62  max =  134.13  avg =  129.56
  mobilenet_ssd_int8  min =   83.83  max =   91.61  avg =   86.75
      mobilenet_yolo  min =  281.19  max =  299.79  avg =  290.05
  mobilenetv2_yolov3  min =  180.37  max =  194.10  avg =  185.61
         yolov4-tiny  min =  215.28  max =  227.29  avg =  221.61
           nanodet_m  min =   64.63  max =   75.86  avg =   70.46
    yolo-fastest-1.1  min =   39.54  max =   48.30  avg =   44.76
      yolo-fastestv2  min =   29.91  max =   53.15  avg =   37.32
  vision_transformer  min = 2520.25  max = 2595.28  avg = 2557.05
          FastestDet  min =   32.45  max =   47.38  avg =   40.55

orangepi@orangepi4-lts:~/ncnn/benchmark$ ./benchncnn 10 4 1 -1 0
loop_count = 10
num_threads = 4
powersave = 1
gpu_device = -1
cooling_down = 0
          squeezenet  min =   48.90  max =   56.65  avg =   53.09
     squeezenet_int8  min =   48.09  max =   54.69  avg =   51.26
           mobilenet  min =   66.06  max =   79.73  avg =   73.96
      mobilenet_int8  min =   51.33  max =   58.30  avg =   54.71
        mobilenet_v2  min =   61.06  max =   88.93  avg =   71.48
        mobilenet_v3  min =   50.41  max =   65.40  avg =   56.51
          shufflenet  min =   38.11  max =   63.95  avg =   44.03
       shufflenet_v2  min =   33.27  max =   36.43  avg =   34.89
             mnasnet  min =   60.02  max =   72.71  avg =   64.57
     proxylessnasnet  min =   66.61  max =   73.25  avg =   70.65
     efficientnet_b0  min =   87.27  max =   94.97  avg =   91.00
   efficientnetv2_b0  min =   99.89  max =  112.09  avg =  106.13
        regnety_400m  min =   84.65  max =   92.78  avg =   89.51
           blazeface  min =    9.73  max =   11.45  avg =   10.85
           googlenet  min =  154.74  max =  164.25  avg =  159.33
      googlenet_int8  min =  140.29  max =  148.08  avg =  144.18
            resnet18  min =  131.51  max =  244.02  avg =  150.56
       resnet18_int8  min =  102.11  max =  114.40  avg =  108.32
             alexnet  min =   81.13  max =   92.35  avg =   86.86
               vgg16  min =  649.91  max =  668.62  avg =  660.25
          vgg16_int8  min =  513.75  max =  523.77  avg =  518.17
            resnet50  min =  330.89  max =  378.23  avg =  344.07
       resnet50_int8  min =  280.38  max =  286.93  avg =  284.43
      squeezenet_ssd  min =  134.35  max =  146.97  avg =  141.17
 squeezenet_ssd_int8  min =  126.31  max =  137.29  avg =  130.73
       mobilenet_ssd  min =  146.83  max =  161.70  avg =  155.08
  mobilenet_ssd_int8  min =  105.74  max =  117.05  avg =  111.62
      mobilenet_yolo  min =  339.30  max =  352.16  avg =  345.22
  mobilenetv2_yolov3  min =  223.12  max =  234.18  avg =  229.81
         yolov4-tiny  min =  267.30  max =  272.95  avg =  270.47
           nanodet_m  min =   78.72  max =   86.18  avg =   81.81
    yolo-fastest-1.1  min =   47.96  max =   55.08  avg =   51.81
      yolo-fastestv2  min =   38.01  max =   44.32  avg =   42.29
  vision_transformer  min = 3499.34  max = 3526.15  avg = 3514.43
          FastestDet  min =   40.14  max =   44.37  avg =   42.30

orangepi@orangepi4-lts:~/ncnn/benchmark$ ./benchncnn 10 2 2 -1 0
loop_count = 10
num_threads = 2
powersave = 2
gpu_device = -1
cooling_down = 0
          squeezenet  min =   45.65  max =   46.72  avg =   46.15
     squeezenet_int8  min =   42.60  max =   43.01  avg =   42.76
           mobilenet  min =   69.35  max =   70.59  avg =   69.92
      mobilenet_int8  min =   46.08  max =   46.35  avg =   46.20
        mobilenet_v2  min =   57.47  max =   58.90  avg =   58.08
        mobilenet_v3  min =   44.72  max =   45.47  avg =   45.05
          shufflenet  min =   31.74  max =   32.16  avg =   31.97
       shufflenet_v2  min =   26.74  max =   26.98  avg =   26.86
             mnasnet  min =   50.47  max =   51.20  avg =   50.82
     proxylessnasnet  min =   57.31  max =   58.24  avg =   57.68
     efficientnet_b0  min =   79.61  max =   80.79  avg =   80.02
   efficientnetv2_b0  min =   92.67  max =   93.37  avg =   93.08
        regnety_400m  min =   67.08  max =   68.07  avg =   67.59
           blazeface  min =    8.56  max =    8.81  avg =    8.70
           googlenet  min =  136.82  max =  138.26  avg =  137.44
      googlenet_int8  min =  121.96  max =  122.64  avg =  122.36
            resnet18  min =  118.04  max =  119.24  avg =  118.49
       resnet18_int8  min =   89.55  max =   92.11  avg =   90.38
             alexnet  min =   80.75  max =   82.34  avg =   81.24
               vgg16  min =  602.11  max =  628.12  avg =  612.26
          vgg16_int8  min =  481.31  max =  484.49  avg =  482.84
            resnet50  min =  307.31  max =  310.10  avg =  308.88
       resnet50_int8  min =  240.45  max =  243.43  avg =  241.76
      squeezenet_ssd  min =  119.65  max =  122.93  avg =  121.34
 squeezenet_ssd_int8  min =  102.71  max =  103.45  avg =  103.20
       mobilenet_ssd  min =  142.16  max =  143.58  avg =  142.54
  mobilenet_ssd_int8  min =   93.20  max =   93.81  avg =   93.41
      mobilenet_yolo  min =  315.42  max =  318.06  avg =  317.00
  mobilenetv2_yolov3  min =  190.59  max =  191.74  avg =  190.96
         yolov4-tiny  min =  228.77  max =  230.49  avg =  229.78
           nanodet_m  min =   66.82  max =   67.23  avg =   67.02
    yolo-fastest-1.1  min =   38.20  max =   40.89  avg =   38.85
      yolo-fastestv2  min =   32.53  max =   33.48  avg =   33.03
  vision_transformer  min = 3372.17  max = 3516.54  avg = 3461.89
          FastestDet  min =   32.92  max =   35.55  avg =   33.62

```

### OrangePi CM4, Rockchip RK3566 (Cortex-A55 1.8GHz x 4)
```
orangepi@orangepicm4:~/code/ncnn-test$ ./benchncnn 10 4 0 -1 1
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   23.91  max =   91.49  avg =   31.03
     squeezenet_int8  min =   24.44  max =   25.39  avg =   24.75
           mobilenet  min =   30.67  max =   31.75  avg =   30.98
      mobilenet_int8  min =   27.87  max =   28.48  avg =   28.05
        mobilenet_v2  min =   31.82  max =   32.56  avg =   32.07
        mobilenet_v3  min =   24.63  max =   24.91  avg =   24.81
          shufflenet  min =   19.77  max =   20.19  avg =   20.01
       shufflenet_v2  min =   16.67  max =   40.81  avg =   28.79
             mnasnet  min =   27.48  max =   28.36  avg =   27.75
     proxylessnasnet  min =   33.04  max =   37.30  avg =   33.70
     efficientnet_b0  min =   39.21  max =  175.34  avg =   53.26
   efficientnetv2_b0  min =   48.94  max =   78.68  avg =   52.44
        regnety_400m  min =   39.81  max =   40.15  avg =   39.96
           blazeface  min =    6.22  max =    6.36  avg =    6.30
           googlenet  min =   75.48  max =  120.58  avg =   82.05
      googlenet_int8  min =   74.42  max =   78.70  avg =   75.29
            resnet18  min =   58.21  max =   99.04  avg =   66.07
       resnet18_int8  min =   54.18  max =   79.91  avg =   57.31
             alexnet  min =   49.18  max =  161.71  avg =   63.03
               vgg16  min =  323.82  max =  452.63  avg =  360.92
          vgg16_int8  min =  379.18  max =  527.82  avg =  432.99
            resnet50  min =  135.84  max =  200.71  avg =  142.54
       resnet50_int8  min =  126.06  max =  169.65  avg =  136.29
      squeezenet_ssd  min =   77.62  max =  137.89  avg =   86.87
 squeezenet_ssd_int8  min =   74.17  max =   76.22  avg =   74.91
       mobilenet_ssd  min =   68.60  max =  132.81  avg =   75.30
  mobilenet_ssd_int8  min =   58.01  max =   59.24  avg =   58.81
      mobilenet_yolo  min =  151.61  max =  247.03  avg =  168.31
  mobilenetv2_yolov3  min =  106.00  max =  163.45  avg =  111.92
         yolov4-tiny  min =  132.99  max =  193.53  avg =  139.88
           nanodet_m  min =   51.43  max =   87.10  avg =   58.17
    yolo-fastest-1.1  min =   26.10  max =   66.68  avg =   30.33
      yolo-fastestv2  min =   21.87  max =   69.79  avg =   35.55
  vision_transformer  min = 2301.36  max = 2513.89  avg = 2426.14
          FastestDet  min =   21.33  max =   21.59  avg =   21.47

orangepi@orangepicm4:~/code/ncnn-test$ ./benchncnn 10 1 0 -1 1
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   47.26  max =   48.21  avg =   47.68
     squeezenet_int8  min =   50.80  max =   54.79  avg =   51.64
           mobilenet  min =   68.18  max =   71.72  avg =   68.78
      mobilenet_int8  min =   58.34  max =   58.73  avg =   58.56
        mobilenet_v2  min =   56.56  max =   57.38  avg =   57.04
        mobilenet_v3  min =   45.52  max =   53.46  avg =   47.98
          shufflenet  min =   34.88  max =   75.06  avg =   46.15
       shufflenet_v2  min =   33.43  max =   49.65  avg =   36.86
             mnasnet  min =   53.87  max =   54.08  avg =   53.98
     proxylessnasnet  min =   70.99  max =   71.40  avg =   71.14
     efficientnet_b0  min =   83.79  max =   89.78  avg =   84.96
   efficientnetv2_b0  min =  103.89  max =  117.47  avg =  105.81
        regnety_400m  min =   63.68  max =   81.25  avg =   66.66
           blazeface  min =   12.18  max =   39.24  avg =   21.79
           googlenet  min =  179.41  max =  202.18  avg =  185.39
      googlenet_int8  min =  187.88  max =  198.49  avg =  191.01
            resnet18  min =  132.67  max =  148.94  avg =  136.09
       resnet18_int8  min =  150.37  max =  158.14  avg =  153.17
             alexnet  min =  115.00  max =  120.17  avg =  116.26
               vgg16  min =  809.99  max =  851.07  avg =  827.73
          vgg16_int8  min = 1149.74  max = 1161.37  avg = 1154.22
            resnet50  min =  327.19  max =  350.42  avg =  332.12
       resnet50_int8  min =  325.08  max =  332.46  avg =  327.17
      squeezenet_ssd  min =  150.33  max =  163.00  avg =  153.12
 squeezenet_ssd_int8  min =  152.21  max =  157.94  avg =  155.36
       mobilenet_ssd  min =  149.30  max =  150.23  avg =  149.72
  mobilenet_ssd_int8  min =  121.93  max =  127.07  avg =  123.03
      mobilenet_yolo  min =  330.91  max =  345.64  avg =  336.21
  mobilenetv2_yolov3  min =  193.25  max =  214.92  avg =  198.82
         yolov4-tiny  min =  284.38  max =  332.54  avg =  293.43
           nanodet_m  min =   90.69  max =  100.74  avg =   92.56
    yolo-fastest-1.1  min =   38.93  max =   51.96  avg =   42.11
      yolo-fastestv2  min =   35.74  max =   48.11  avg =   38.63
  vision_transformer  min = 7280.18  max = 7301.27  avg = 7292.38
          FastestDet  min =   36.54  max =   42.31  avg =   38.41
```

### OrangePi5, Rockchip RK3588S (Cortex-A76 2.4GHz x 4 + Cortex-A55 1.8GHz x 4)
```
orangepi@orangepi5:~/ncnn-master/benchmark$ ./benchncnn 10 8 0 -1 0
loop_count = 10
num_threads = 8
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    6.22  max =    6.69  avg =    6.37
     squeezenet_int8  min =    7.93  max =    8.32  avg =    8.07
           mobilenet  min =    9.08  max =   14.02  avg =    9.81
      mobilenet_int8  min =    7.89  max =    9.02  avg =    8.47
        mobilenet_v2  min =    7.77  max =    8.09  avg =    7.92
        mobilenet_v3  min =    6.87  max =    8.19  avg =    7.46
          shufflenet  min =    5.98  max =   10.21  avg =    7.23
       shufflenet_v2  min =    4.82  max =    5.04  avg =    4.93
             mnasnet  min =    6.15  max =    6.36  avg =    6.24
     proxylessnasnet  min =    9.50  max =   10.50  avg =    9.93
     efficientnet_b0  min =   11.46  max =   11.79  avg =   11.60
   efficientnetv2_b0  min =   18.61  max =   19.48  avg =   18.88
        regnety_400m  min =   10.54  max =   12.44  avg =   10.86
           blazeface  min =    1.96  max =    5.35  avg =    2.58
           googlenet  min =   26.62  max =   32.59  avg =   29.96
      googlenet_int8  min =   28.27  max =   32.80  avg =   30.01
            resnet18  min =   15.52  max =   18.29  avg =   16.37
       resnet18_int8  min =   23.33  max =   26.89  avg =   24.99
             alexnet  min =   19.92  max =   22.75  avg =   21.06
               vgg16  min =  101.18  max =  122.44  avg =  107.45
          vgg16_int8  min =  164.69  max =  227.98  avg =  189.73
            resnet50  min =   42.96  max =   59.26  avg =   50.83
       resnet50_int8  min =   54.46  max =   66.72  avg =   61.37
      squeezenet_ssd  min =   24.39  max =   31.19  avg =   27.69
 squeezenet_ssd_int8  min =   27.15  max =   41.55  avg =   33.68
       mobilenet_ssd  min =   22.26  max =   26.89  avg =   23.95
  mobilenet_ssd_int8  min =   21.18  max =   24.21  avg =   23.05
      mobilenet_yolo  min =   52.65  max =   65.53  avg =   58.47
  mobilenetv2_yolov3  min =   31.34  max =   45.15  avg =   34.63
         yolov4-tiny  min =   40.55  max =   49.32  avg =   43.85
           nanodet_m  min =   16.08  max =   19.51  avg =   17.58
    yolo-fastest-1.1  min =    6.48  max =    7.33  avg =    6.98
      yolo-fastestv2  min =    4.96  max =   11.66  avg =    7.30
  vision_transformer  min =  678.22  max =  815.73  avg =  729.16
          FastestDet  min =    4.95  max =   10.65  avg =    6.88


orangepi@orangepi5:~/ncnn-master/benchmark$ ./benchncnn 10 4 1 -1 0
loop_count = 10
num_threads = 4
powersave = 1
gpu_device = -1
cooling_down = 0
          squeezenet  min =   10.91  max =   11.14  avg =   11.03
     squeezenet_int8  min =   14.26  max =   14.55  avg =   14.30
           mobilenet  min =   15.92  max =   16.26  avg =   16.11
      mobilenet_int8  min =   14.71  max =   15.22  avg =   14.91
        mobilenet_v2  min =   12.28  max =   12.49  avg =   12.37
        mobilenet_v3  min =   11.31  max =   11.72  avg =   11.46
          shufflenet  min =   10.10  max =   10.33  avg =   10.24
       shufflenet_v2  min =    9.38  max =    9.70  avg =    9.55
             mnasnet  min =   12.28  max =   12.80  avg =   12.44
     proxylessnasnet  min =   16.54  max =   16.66  avg =   16.60
     efficientnet_b0  min =   19.56  max =   20.66  avg =   19.86
   efficientnetv2_b0  min =   34.06  max =   34.65  avg =   34.41
        regnety_400m  min =   23.97  max =   24.69  avg =   24.20
           blazeface  min =    3.39  max =    3.56  avg =    3.48
           googlenet  min =   46.96  max =   47.90  avg =   47.56
      googlenet_int8  min =   49.56  max =   50.23  avg =   49.79
            resnet18  min =   28.44  max =   29.54  avg =   28.77
       resnet18_int8  min =   41.32  max =   42.44  avg =   41.67
             alexnet  min =   31.83  max =   32.77  avg =   32.32
               vgg16  min =  170.32  max =  178.30  avg =  173.22
          vgg16_int8  min =  282.55  max =  299.32  avg =  287.78
            resnet50  min =   78.00  max =   81.57  avg =   78.79
       resnet50_int8  min =   89.12  max =   92.31  avg =   90.92
      squeezenet_ssd  min =   38.07  max =   39.07  avg =   38.59
 squeezenet_ssd_int8  min =   50.98  max =   52.56  avg =   51.68
       mobilenet_ssd  min =   38.79  max =   39.67  avg =   39.34
  mobilenet_ssd_int8  min =   33.53  max =   35.26  avg =   34.66
      mobilenet_yolo  min =   90.50  max =   92.32  avg =   90.99
  mobilenetv2_yolov3  min =   51.38  max =   51.93  avg =   51.56
         yolov4-tiny  min =   75.65  max =   76.80  avg =   76.17
           nanodet_m  min =   21.33  max =   21.68  avg =   21.50
    yolo-fastest-1.1  min =   11.18  max =   12.06  avg =   11.36
      yolo-fastestv2  min =    9.87  max =   10.33  avg =   10.15
  vision_transformer  min = 1475.77  max = 1477.97  avg = 1476.77
          FastestDet  min =    9.39  max =    9.73  avg =    9.53


orangepi@orangepi5:~/ncnn-master/benchmark$ ./benchncnn 10 4 2 -1 0
loop_count = 10
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 0
          squeezenet  min =    3.59  max =    3.70  avg =    3.66
     squeezenet_int8  min =    4.32  max =    4.42  avg =    4.36
           mobilenet  min =    5.50  max =    5.55  avg =    5.53
      mobilenet_int8  min =    4.52  max =    4.60  avg =    4.56
        mobilenet_v2  min =    4.50  max =    4.60  avg =    4.54
        mobilenet_v3  min =    4.09  max =    4.28  avg =    4.15
          shufflenet  min =    3.49  max =    3.58  avg =    3.51
       shufflenet_v2  min =    2.91  max =    3.07  avg =    2.97
             mnasnet  min =    4.18  max =    4.25  avg =    4.21
     proxylessnasnet  min =    4.94  max =    5.00  avg =    4.97
     efficientnet_b0  min =    7.50  max =    7.54  avg =    7.52
   efficientnetv2_b0  min =   11.32  max =   11.41  avg =   11.37
        regnety_400m  min =    7.92  max =    8.01  avg =    7.95
           blazeface  min =    1.21  max =    1.31  avg =    1.24
           googlenet  min =   15.03  max =   15.17  avg =   15.10
      googlenet_int8  min =   15.48  max =   15.61  avg =   15.55
            resnet18  min =    9.91  max =    9.97  avg =    9.93
       resnet18_int8  min =   15.80  max =   16.00  avg =   15.89
             alexnet  min =   12.35  max =   12.64  avg =   12.48
               vgg16  min =   61.92  max =   65.62  avg =   62.93
          vgg16_int8  min =  129.94  max =  131.65  avg =  130.65
            resnet50  min =   27.41  max =   27.62  avg =   27.52
       resnet50_int8  min =   33.01  max =   33.23  avg =   33.08
      squeezenet_ssd  min =   13.92  max =   14.27  avg =   14.02
 squeezenet_ssd_int8  min =   18.04  max =   18.40  avg =   18.15
       mobilenet_ssd  min =   13.69  max =   13.80  avg =   13.74
  mobilenet_ssd_int8  min =   10.95  max =   11.10  avg =   11.02
      mobilenet_yolo  min =   32.06  max =   32.30  avg =   32.17
  mobilenetv2_yolov3  min =   19.27  max =   20.68  avg =   19.97
         yolov4-tiny  min =   25.41  max =   29.51  avg =   27.76
           nanodet_m  min =    6.68  max =    6.73  avg =    6.70
    yolo-fastest-1.1  min =    3.77  max =    4.02  avg =    3.83
      yolo-fastestv2  min =    3.41  max =    3.65  avg =    3.48
  vision_transformer  min =  548.32  max =  654.71  avg =  579.48
          FastestDet  min =    3.38  max =    3.46  avg =    3.42

```

### OrangePi5 Plus, Rockchip RK3588 (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz)
```
orangepi@orangepi5plus:~/ncnn$ ./benchncnn 8 4 2 -1 1
loop_count = 8
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =    5.55  max =    5.67  avg =    5.61
     squeezenet_int8  min =    5.39  max =    5.76  avg =    5.60
           mobilenet  min =    7.43  max =    7.50  avg =    7.47
      mobilenet_int8  min =    6.91  max =    7.00  avg =    6.96
        mobilenet_v2  min =    8.24  max =    8.47  avg =    8.33
        mobilenet_v3  min =    6.63  max =    7.32  avg =    6.84
          shufflenet  min =    4.10  max =    4.23  avg =    4.14
       shufflenet_v2  min =    3.51  max =    3.61  avg =    3.56
             mnasnet  min =    5.76  max =    7.79  avg =    6.53
     proxylessnasnet  min =    6.66  max =    7.19  avg =    6.79
     efficientnet_b0  min =   10.32  max =   10.73  avg =   10.40
   efficientnetv2_b0  min =   11.48  max =   11.78  avg =   11.61
        regnety_400m  min =    9.73  max =    9.85  avg =    9.79
           blazeface  min =    1.39  max =    1.62  avg =    1.46
           googlenet  min =   21.48  max =   23.08  avg =   22.79
      googlenet_int8  min =   20.82  max =   21.78  avg =   21.01
            resnet18  min =    9.37  max =   10.05  avg =    9.50
       resnet18_int8  min =   14.88  max =   19.64  avg =   15.90
             alexnet  min =   24.74  max =   24.93  avg =   24.81
               vgg16  min =   58.75  max =   62.44  avg =   59.52
          vgg16_int8  min =   73.68  max =   75.89  avg =   74.14
            resnet50  min =   44.88  max =   45.10  avg =   44.98
       resnet50_int8  min =   35.54  max =   36.02  avg =   35.71
      squeezenet_ssd  min =   12.07  max =   26.66  avg =   19.03
 squeezenet_ssd_int8  min =   21.95  max =   25.51  avg =   23.21
       mobilenet_ssd  min =   12.62  max =   12.73  avg =   12.67
  mobilenet_ssd_int8  min =   17.21  max =   17.68  avg =   17.44
      mobilenet_yolo  min =   32.82  max =   32.98  avg =   32.91
  mobilenetv2_yolov3  min =   18.67  max =   20.52  avg =   19.57
         yolov4-tiny  min =   38.82  max =   40.84  avg =   39.82
           nanodet_m  min =    9.05  max =    9.22  avg =    9.13
    yolo-fastest-1.1  min =    4.67  max =    5.04  avg =    4.74
      yolo-fastestv2  min =    4.27  max =    4.32  avg =    4.29
  vision_transformer  min =  429.32  max =  431.02  avg =  430.20
          FastestDet  min =    4.28  max =    4.72  avg =    4.36

```

### RDK X3 Module (Cortex-A53 1.5GHz x 4) aarch64
```
root@ubuntu:/home/sunrise/ncnn-master/benchmark# ../build-aarch64-linux-gnu/benchmark/benchncnn 10 4 0 -1 1
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   49.83  max =   50.57  avg =   50.08
     squeezenet_int8  min =   48.43  max =   49.18  avg =   48.67
           mobilenet  min =   68.37  max =   69.09  avg =   68.63
      mobilenet_int8  min =   58.19  max =   58.72  avg =   58.37
        mobilenet_v2  min =   58.76  max =   60.62  avg =   59.20
        mobilenet_v3  min =   49.75  max =   50.60  avg =   50.06
          shufflenet  min =   37.17  max =   37.96  avg =   37.50
       shufflenet_v2  min =   32.08  max =   32.42  avg =   32.22
             mnasnet  min =   55.51  max =   57.02  avg =   55.90
     proxylessnasnet  min =   68.15  max =   69.53  avg =   68.78
     efficientnet_b0  min =   88.64  max =   90.16  avg =   89.43
   efficientnetv2_b0  min =  102.45  max =  103.42  avg =  102.92
        regnety_400m  min =   88.22  max =   89.09  avg =   88.62
           blazeface  min =    9.78  max =   10.15  avg =    9.93
           googlenet  min =  152.20  max =  153.92  avg =  153.28
      googlenet_int8  min =  141.80  max =  143.30  avg =  142.48
            resnet18  min =  116.70  max =  117.59  avg =  117.03
       resnet18_int8  min =  104.42  max =  105.85  avg =  104.94
             alexnet  min =   82.55  max =   83.23  avg =   82.82
               vgg16  min =  590.22  max =  598.18  avg =  594.35
          vgg16_int8  min =  504.56  max =  507.21  avg =  505.73
            resnet50  min =  307.36  max =  308.68  avg =  308.03
       resnet50_int8  min =  281.35  max =  283.87  avg =  282.30
      squeezenet_ssd  min =  124.93  max =  126.51  avg =  125.51
 squeezenet_ssd_int8  min =  118.07  max =  118.89  avg =  118.29
       mobilenet_ssd  min =  142.27  max =  142.57  avg =  142.44
  mobilenet_ssd_int8  min =  116.51  max =  117.60  avg =  117.04
      mobilenet_yolo  min =  314.64  max =  317.09  avg =  315.93
  mobilenetv2_yolov3  min =  204.55  max =  205.30  avg =  204.93
         yolov4-tiny  min =  246.69  max =  249.64  avg =  247.95
           nanodet_m  min =   77.73  max =   78.30  avg =   77.99
    yolo-fastest-1.1  min =   46.29  max =   47.52  avg =   46.93
      yolo-fastestv2  min =   36.55  max =   36.95  avg =   36.73
  vision_transformer  min = 3372.85  max = 3409.14  avg = 3377.75
          FastestDet  min =   38.23  max =   38.77  avg =   38.49

```

### NanoPi R2S, Rockchip RK3328 (Cortex-A53 1.3GHz x 4) Armbian focal (21.05.1) aarch64
```
root@nanopi-r2s:~/ncnn/build/benchmark# ./benchncnn 8 4 0
loop_count = 8
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   62.20  max =   62.81  avg =   62.49
     squeezenet_int8  min =   57.92  max =   71.46  avg =   59.76
           mobilenet  min =   82.88  max =   89.36  avg =   84.52
      mobilenet_int8  min =   57.16  max =   96.22  avg =   62.29
        mobilenet_v2  min =   73.68  max =   75.92  avg =   74.17
        mobilenet_v3  min =   59.57  max =   60.14  avg =   59.84
          shufflenet  min =   52.34  max =   52.70  avg =   52.53
       shufflenet_v2  min =   45.51  max =   45.92  avg =   45.73
             mnasnet  min =   67.75  max =   83.15  avg =   69.82
     proxylessnasnet  min =   81.70  max =   83.66  avg =   82.31
     efficientnet_b0  min =  121.10  max =  123.22  avg =  121.55
   efficientnetv2_b0  min =  138.93  max =  192.15  avg =  154.94
        regnety_400m  min =   99.62  max =  116.29  avg =  101.97
           blazeface  min =   18.80  max =   19.15  avg =   19.01
           googlenet  min =  176.36  max =  202.84  avg =  181.86
      googlenet_int8  min =  155.50  max =  190.50  avg =  161.20
            resnet18  min =  165.79  max =  201.57  avg =  172.56
       resnet18_int8  min =  122.24  max =  160.53  avg =  134.24
             alexnet  min =  227.07  max =  238.09  avg =  232.19
          vgg16_int8  min =  522.14  max =  551.75  avg =  531.68
            resnet50  min =  378.30  max =  440.21  avg =  388.56
       resnet50_int8  min =  315.76  max =  373.97  avg =  329.88
      squeezenet_ssd  min =  175.37  max =  200.86  avg =  179.01
 squeezenet_ssd_int8  min =  134.71  max =  147.57  avg =  136.57
       mobilenet_ssd  min =  174.43  max =  212.11  avg =  180.61
  mobilenet_ssd_int8  min =  119.41  max =  153.75  avg =  124.21
      mobilenet_yolo  min =  366.27  max =  422.67  avg =  383.65
  mobilenetv2_yolov3  min =  238.56  max =  281.97  avg =  247.56
         yolov4-tiny  min =  311.45  max =  333.32  avg =  316.79
           nanodet_m  min =  114.15  max =  122.39  avg =  115.44

root@nanopi-r2s:~/ncnn/build/benchmark# ./benchncnn 8 2 0
loop_count = 8
num_threads = 2
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   89.02  max =   90.52  avg =   89.35
     squeezenet_int8  min =   81.19  max =   81.90  avg =   81.42
           mobilenet  min =  131.47  max =  134.39  avg =  132.34
      mobilenet_int8  min =  102.20  max =  103.03  avg =  102.66
        mobilenet_v2  min =  102.40  max =  108.12  avg =  103.91
        mobilenet_v3  min =   89.17  max =   90.10  avg =   89.53
          shufflenet  min =   65.74  max =   68.86  avg =   66.50
       shufflenet_v2  min =   62.83  max =   64.41  avg =   63.25
             mnasnet  min =   98.01  max =   98.24  avg =   98.14
     proxylessnasnet  min =  121.10  max =  123.55  avg =  121.80
     efficientnet_b0  min =  187.79  max =  188.41  avg =  188.08
   efficientnetv2_b0  min =  211.96  max =  213.99  avg =  212.74
        regnety_400m  min =  124.98  max =  125.49  avg =  125.28
           blazeface  min =   24.91  max =   25.14  avg =   25.00
           googlenet  min =  278.47  max =  283.24  avg =  280.79
      googlenet_int8  min =  243.81  max =  247.82  avg =  245.30
            resnet18  min =  257.46  max =  259.29  avg =  258.29
       resnet18_int8  min =  187.18  max =  188.74  avg =  187.70
             alexnet  min =  384.52  max =  387.07  avg =  385.84
          vgg16_int8  min =  897.26  max =  901.68  avg =  899.19
            resnet50  min =  618.85  max =  623.92  avg =  620.85
       resnet50_int8  min =  512.33  max =  514.93  avg =  513.64
      squeezenet_ssd  min =  211.21  max =  218.71  avg =  213.02
 squeezenet_ssd_int8  min =  193.32  max =  193.97  avg =  193.70
       mobilenet_ssd  min =  271.11  max =  275.58  avg =  272.06
  mobilenet_ssd_int8  min =  208.80  max =  209.59  avg =  209.05
      mobilenet_yolo  min =  570.55  max =  575.98  avg =  572.73
  mobilenetv2_yolov3  min =  329.04  max =  353.84  avg =  340.42
         yolov4-tiny  min =  435.16  max =  463.68  avg =  457.69
           nanodet_m  min =  155.70  max =  159.13  avg =  156.50
```

### EAIDK 310, Rockchip RK3228H (Cortex-A53 1.3GHz x 4) fedora-28 aarch64
```
[openailab@MiWiFi-R1D-srv benchmark]$ ./benchncnn 8 4 0 -1 1
loop_count = 8
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   68.97  max =   71.42  avg =   69.65
     squeezenet_int8  min =   58.47  max =   59.58  avg =   58.77
           mobilenet  min =   90.87  max =  100.18  avg =   92.48
      mobilenet_int8  min =   59.46  max =   63.02  avg =   60.01
        mobilenet_v2  min =   82.92  max =  112.01  avg =   88.10
        mobilenet_v3  min =   66.65  max =   69.57  avg =   67.27
          shufflenet  min =   48.22  max =   48.49  avg =   48.34
       shufflenet_v2  min =   48.52  max =   52.88  avg =   49.17
             mnasnet  min =   75.63  max =   79.83  avg =   76.43
     proxylessnasnet  min =   84.73  max =   86.69  avg =   85.16
     efficientnet_b0  min =  125.69  max =  129.00  avg =  126.38
   efficientnetv2_b0  min =  144.44  max =  149.01  avg =  145.33
        regnety_400m  min =   99.69  max =  101.23  avg =  100.38
           blazeface  min =   15.84  max =   16.24  avg =   16.03
           googlenet  min =  194.64  max =  199.29  avg =  196.07
      googlenet_int8  min =  158.54  max =  165.64  avg =  160.25
            resnet18  min =  200.65  max =  221.60  avg =  204.30
       resnet18_int8  min =  122.69  max =  126.57  avg =  123.54
             alexnet  min =  175.54  max =  200.91  avg =  181.38
            resnet50  min =  428.75  max =  466.51  avg =  439.67
       resnet50_int8  min =  324.95  max =  347.47  avg =  329.74
      squeezenet_ssd  min =  199.86  max =  207.51  avg =  201.99
 squeezenet_ssd_int8  min =  150.35  max =  176.92  avg =  154.60
       mobilenet_ssd  min =  186.50  max =  189.92  avg =  188.09
  mobilenet_ssd_int8  min =  123.55  max =  127.17  avg =  124.63
      mobilenet_yolo  min =  393.83  max =  414.09  avg =  398.57
  mobilenetv2_yolov3  min =  263.49  max =  273.11  avg =  266.11
         yolov4-tiny  min =  342.33  max =  363.69  avg =  346.34
           nanodet_m  min =  119.66  max =  127.29  avg =  121.26
    yolo-fastest-1.1  min =   61.87  max =   90.26  avg =   65.77
      yolo-fastestv2  min =   48.48  max =   50.82  avg =   48.93

[openailab@MiWiFi-R1D-srv benchmark]$ ./benchncnn 4 1 0 -1 1
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  152.15  max =  152.67  avg =  152.43
     squeezenet_int8  min =  143.22  max =  144.24  avg =  143.61
           mobilenet  min =  237.77  max =  239.69  avg =  238.47
      mobilenet_int8  min =  199.91  max =  201.35  avg =  200.50
        mobilenet_v2  min =  169.67  max =  170.18  avg =  169.93
        mobilenet_v3  min =  150.06  max =  151.17  avg =  150.78
          shufflenet  min =   91.78  max =   92.38  avg =   92.06
       shufflenet_v2  min =  100.86  max =  101.75  avg =  101.50
             mnasnet  min =  165.10  max =  166.74  avg =  166.24
     proxylessnasnet  min =  218.42  max =  220.55  avg =  219.12
     efficientnet_b0  min =  348.00  max =  349.03  avg =  348.49
   efficientnetv2_b0  min =  404.06  max =  406.16  avg =  405.00
        regnety_400m  min =  209.48  max =  211.36  avg =  210.44
           blazeface  min =   31.31  max =   32.61  avg =   32.00
           googlenet  min =  510.38  max =  512.43  avg =  511.25
      googlenet_int8  min =  454.38  max =  456.19  avg =  455.02
            resnet18  min =  407.78  max =  409.45  avg =  408.34
       resnet18_int8  min =  357.01  max =  360.72  avg =  358.74
             alexnet  min =  504.12  max =  506.74  avg =  505.08
            resnet50  min = 1115.42  max = 1121.91  avg = 1118.67
       resnet50_int8  min =  973.38  max =  976.26  avg =  975.21
      squeezenet_ssd  min =  361.52  max =  363.69  avg =  362.38
 squeezenet_ssd_int8  min =  333.81  max =  337.16  avg =  335.24
       mobilenet_ssd  min =  477.43  max =  478.36  avg =  477.82
  mobilenet_ssd_int8  min =  409.33  max =  409.67  avg =  409.52
      mobilenet_yolo  min = 1048.79  max = 1057.72  avg = 1053.80
  mobilenetv2_yolov3  min =  567.04  max =  571.44  avg =  569.04
         yolov4-tiny  min =  788.40  max =  790.74  avg =  789.12
           nanodet_m  min =  253.68  max =  254.59  avg =  254.16
    yolo-fastest-1.1  min =  102.44  max =  103.11  avg =  102.67
      yolo-fastestv2  min =   82.19  max =   82.43  avg =   82.35
```

### NVIDIA Jetson Orin Nano
```
orin@nano:~/ncnn/benchmark$ ./benchncnn 8 6 0 0 1
[0 NVIDIA Tegra Orin (nvgpu)]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[0 NVIDIA Tegra Orin (nvgpu)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 NVIDIA Tegra Orin (nvgpu)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 NVIDIA Tegra Orin (nvgpu)]  subgroup=32  basic/vote/ballot/shuffle=1/1/1/1
[0 NVIDIA Tegra Orin (nvgpu)]  fp16-matrix-16_8_8/16_8_16/16_16_16=1/1/1
loop_count = 8
num_threads = 6
powersave = 0
gpu_device = 0
cooling_down = 1
          squeezenet  min =    5.31  max =    5.95  avg =    5.44
     squeezenet_int8  min =    5.13  max =    6.24  avg =    5.57
           mobilenet  min =    2.98  max =    5.52  avg =    3.66
      mobilenet_int8  min =    5.97  max =    7.76  avg =    6.98
        mobilenet_v2  min =    6.73  max =    6.98  avg =    6.91
        mobilenet_v3  min =    8.58  max =    8.77  avg =    8.71
          shufflenet  min =    7.33  max =    7.43  avg =    7.39
       shufflenet_v2  min =    7.59  max =    8.46  avg =    8.27
             mnasnet  min =    4.78  max =    6.81  avg =    5.41
     proxylessnasnet  min =    7.39  max =    7.65  avg =    7.52
     efficientnet_b0  min =   10.81  max =   15.28  avg =   12.27
   efficientnetv2_b0  min =   46.58  max =   48.56  avg =   47.70
        regnety_400m  min =    9.86  max =   10.46  avg =   10.04
           blazeface  min =    3.98  max =    4.66  avg =    4.31
           googlenet  min =   10.01  max =   14.44  avg =   11.48
      googlenet_int8  min =   18.07  max =   19.55  avg =   18.65
            resnet18  min =    6.52  max =    9.73  avg =    8.26
       resnet18_int8  min =   13.28  max =   20.58  avg =   14.96
             alexnet  min =    8.71  max =    9.05  avg =    8.84
               vgg16  min =   19.28  max =   19.49  avg =   19.35
          vgg16_int8  min =   98.14  max =  100.92  avg =   99.76
            resnet50  min =    9.25  max =    9.37  avg =    9.31
       resnet50_int8  min =   31.16  max =   34.44  avg =   32.59
      squeezenet_ssd  min =   13.60  max =   18.96  avg =   16.68
 squeezenet_ssd_int8  min =   17.81  max =   19.83  avg =   18.75
       mobilenet_ssd  min =   11.88  max =   13.86  avg =   13.27
  mobilenet_ssd_int8  min =   14.05  max =   21.16  avg =   15.64
      mobilenet_yolo  min =   14.18  max =   14.41  avg =   14.26
  mobilenetv2_yolov3  min =   16.65  max =   18.78  avg =   18.06
         yolov4-tiny  min =   25.60  max =   26.56  avg =   25.92
           nanodet_m  min =   15.71  max =   19.89  avg =   19.03
    yolo-fastest-1.1  min =    8.72  max =    9.18  avg =    8.96
      yolo-fastestv2  min =    7.97  max =    8.10  avg =    8.04
  vision_transformer  min =  821.34  max =  825.91  avg =  823.26
          FastestDet  min =    7.72  max =    8.15  avg =    7.81

orin@nano:~/ncnn/benchmark$ ./benchncnn 8 1 0 0 1
[0 NVIDIA Tegra Orin (nvgpu)]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[0 NVIDIA Tegra Orin (nvgpu)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 NVIDIA Tegra Orin (nvgpu)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 NVIDIA Tegra Orin (nvgpu)]  subgroup=32  basic/vote/ballot/shuffle=1/1/1/1
[0 NVIDIA Tegra Orin (nvgpu)]  fp16-matrix-16_8_8/16_8_16/16_16_16=1/1/1
loop_count = 8
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 1
          squeezenet  min =    5.05  max =    5.23  avg =    5.09
     squeezenet_int8  min =   15.93  max =   16.09  avg =   16.00
           mobilenet  min =    2.97  max =    5.49  avg =    3.84
      mobilenet_int8  min =   23.27  max =   23.38  avg =   23.33
        mobilenet_v2  min =    3.61  max =    4.01  avg =    3.83
        mobilenet_v3  min =    6.12  max =    8.36  avg =    6.67
          shufflenet  min =    4.07  max =    7.25  avg =    6.22
       shufflenet_v2  min =    8.49  max =    8.82  avg =    8.67
             mnasnet  min =    3.70  max =    8.23  avg =    5.37
     proxylessnasnet  min =    6.36  max =    9.16  avg =    7.52
     efficientnet_b0  min =   10.55  max =   10.81  avg =   10.65
   efficientnetv2_b0  min =   28.22  max =   28.62  avg =   28.54
        regnety_400m  min =    7.22  max =   10.04  avg =    8.50
           blazeface  min =    3.70  max =    3.86  avg =    3.76
           googlenet  min =    7.18  max =    9.76  avg =    8.21
      googlenet_int8  min =   63.19  max =   63.54  avg =   63.32
            resnet18  min =    4.67  max =    4.73  avg =    4.70
       resnet18_int8  min =   50.51  max =   50.81  avg =   50.65
             alexnet  min =    8.56  max =   10.64  avg =    9.02
               vgg16  min =   19.24  max =   19.50  avg =   19.31
          vgg16_int8  min =  411.02  max =  412.40  avg =  411.60
            resnet50  min =    9.14  max =    9.52  avg =    9.41
       resnet50_int8  min =  112.04  max =  112.43  avg =  112.25
      squeezenet_ssd  min =   13.23  max =   13.79  avg =   13.52
 squeezenet_ssd_int8  min =   46.52  max =   46.98  avg =   46.77
       mobilenet_ssd  min =    8.89  max =   12.51  avg =    9.95
  mobilenet_ssd_int8  min =   47.66  max =   48.73  avg =   48.13
      mobilenet_yolo  min =    9.68  max =    9.75  avg =    9.70
  mobilenetv2_yolov3  min =   15.84  max =   17.54  avg =   16.83
         yolov4-tiny  min =   23.32  max =   25.49  avg =   24.56
           nanodet_m  min =   13.59  max =   19.53  avg =   15.85
    yolo-fastest-1.1  min =    7.68  max =   11.32  avg =    8.20
      yolo-fastestv2  min =    7.75  max =    7.84  avg =    7.78
  vision_transformer  min =  822.27  max =  829.73  avg =  825.74
          FastestDet  min =    7.51  max =    8.05  avg =    7.68

orin@nano:~/ncnn/benchmark$ ./benchncnn 8 6 0 -1 1
loop_count = 8
num_threads = 6
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =    5.07  max =    6.99  avg =    5.69
     squeezenet_int8  min =    5.08  max =    5.79  avg =    5.42
           mobilenet  min =    6.96  max =    8.20  avg =    7.45
      mobilenet_int8  min =    5.91  max =    7.33  avg =    6.37
        mobilenet_v2  min =    5.86  max =    7.55  avg =    6.51
        mobilenet_v3  min =    5.60  max =    7.22  avg =    6.14
          shufflenet  min =    5.20  max =    5.79  avg =    5.44
       shufflenet_v2  min =    4.56  max =    5.90  avg =    4.86
             mnasnet  min =    5.43  max =    6.44  avg =    5.83
     proxylessnasnet  min =    5.92  max =    8.70  avg =    6.83
     efficientnet_b0  min =   10.09  max =   11.57  avg =   10.65
   efficientnetv2_b0  min =   12.79  max =   15.96  avg =   14.12
        regnety_400m  min =   14.04  max =   21.23  avg =   15.88
           blazeface  min =    1.76  max =    1.90  avg =    1.81
           googlenet  min =   19.45  max =   25.43  avg =   21.21
      googlenet_int8  min =   17.67  max =   18.59  avg =   18.20
            resnet18  min =   12.26  max =   19.47  avg =   15.13
       resnet18_int8  min =   13.02  max =   14.78  avg =   13.86
             alexnet  min =   12.27  max =   19.18  avg =   15.02
               vgg16  min =   59.43  max =   89.43  avg =   65.11
          vgg16_int8  min =   97.71  max =  141.28  avg =  108.00
            resnet50  min =   38.69  max =   40.67  avg =   39.26
       resnet50_int8  min =   28.67  max =   31.63  avg =   29.93
      squeezenet_ssd  min =   14.52  max =   26.92  avg =   17.89
 squeezenet_ssd_int8  min =   16.61  max =   19.27  avg =   17.82
       mobilenet_ssd  min =   16.61  max =   22.65  avg =   17.89
  mobilenet_ssd_int8  min =   13.22  max =   14.83  avg =   14.04
      mobilenet_yolo  min =   40.10  max =   44.28  avg =   41.48
  mobilenetv2_yolov3  min =   21.48  max =   22.83  avg =   22.01
         yolov4-tiny  min =   33.30  max =   37.31  avg =   34.59
           nanodet_m  min =   10.80  max =   12.62  avg =   11.54
    yolo-fastest-1.1  min =    5.51  max =    6.03  avg =    5.75
      yolo-fastestv2  min =    4.98  max =    6.35  avg =    5.44
  vision_transformer  min =  610.40  max =  681.89  avg =  628.84
          FastestDet  min =    4.82  max =    6.19  avg =    5.32

orin@nano:~/ncnn/benchmark$ ./benchncnn 8 1 0 -1 1
loop_count = 8
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   15.94  max =   16.23  avg =   16.04
     squeezenet_int8  min =   15.91  max =   16.09  avg =   15.98
           mobilenet  min =   28.77  max =   28.91  avg =   28.83
      mobilenet_int8  min =   23.29  max =   23.63  avg =   23.46
        mobilenet_v2  min =   19.32  max =   19.43  avg =   19.37
        mobilenet_v3  min =   16.57  max =   16.65  avg =   16.61
          shufflenet  min =   10.39  max =   10.48  avg =   10.44
       shufflenet_v2  min =   10.61  max =   10.69  avg =   10.65
             mnasnet  min =   18.61  max =   18.69  avg =   18.65
     proxylessnasnet  min =   21.97  max =   22.17  avg =   22.05
     efficientnet_b0  min =   36.73  max =   36.89  avg =   36.83
   efficientnetv2_b0  min =   41.72  max =   41.97  avg =   41.83
        regnety_400m  min =   25.71  max =   26.03  avg =   25.85
           blazeface  min =    3.59  max =    3.63  avg =    3.60
           googlenet  min =   66.85  max =   67.38  avg =   67.12
      googlenet_int8  min =   63.65  max =   63.85  avg =   63.74
            resnet18  min =   48.49  max =   49.21  avg =   48.83
       resnet18_int8  min =   50.82  max =   51.16  avg =   50.92
             alexnet  min =   57.67  max =   58.24  avg =   58.03
               vgg16  min =  280.03  max =  281.34  avg =  280.77
          vgg16_int8  min =  413.51  max =  414.67  avg =  414.08
            resnet50  min =  138.19  max =  138.94  avg =  138.48
       resnet50_int8  min =  112.53  max =  112.86  avg =  112.68
      squeezenet_ssd  min =   46.26  max =   46.46  avg =   46.37
 squeezenet_ssd_int8  min =   47.56  max =   48.33  avg =   47.85
       mobilenet_ssd  min =   60.51  max =   60.81  avg =   60.68
  mobilenet_ssd_int8  min =   47.47  max =   47.76  avg =   47.58
      mobilenet_yolo  min =  136.20  max =  136.54  avg =  136.37
  mobilenetv2_yolov3  min =   69.80  max =   70.04  avg =   69.93
         yolov4-tiny  min =   87.71  max =   88.63  avg =   88.12
           nanodet_m  min =   25.73  max =   26.06  avg =   25.85
    yolo-fastest-1.1  min =   10.25  max =   10.35  avg =   10.29
      yolo-fastestv2  min =    9.25  max =    9.38  avg =    9.33
  vision_transformer  min = 2282.07  max = 2690.34  avg = 2481.94
          FastestDet  min =    9.80  max =    9.88  avg =    9.84
```

### NVIDIA Jetson Nano
```
[0 NVIDIA Tegra X1 (nvgpu)]  queueC=0[16]  queueG=0[16]  queueT=0[16]
[0 NVIDIA Tegra X1 (nvgpu)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 NVIDIA Tegra X1 (nvgpu)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 NVIDIA Tegra X1 (nvgpu)]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 8
num_threads = 4
powersave = 0
gpu_device = 0
cooling_down = 1
          squeezenet  min =   12.15  max =   26.48  avg =   18.11
     squeezenet_int8  min =   27.60  max =   42.50  avg =   29.89
           mobilenet  min =   16.07  max =   16.10  avg =   16.09
      mobilenet_int8  min =   30.65  max =   32.15  avg =   31.07
        mobilenet_v2  min =   12.87  max =   13.15  avg =   12.99
        mobilenet_v3  min =   13.32  max =   16.65  avg =   14.57
          shufflenet  min =   14.21  max =   14.34  avg =   14.29
       shufflenet_v2  min =   13.03  max =   21.97  avg =   19.02
             mnasnet  min =   13.33  max =   13.64  avg =   13.49
     proxylessnasnet  min =   14.65  max =   14.91  avg =   14.76
     efficientnet_b0  min =   21.26  max =   21.41  avg =   21.35
   efficientnetv2_b0  min =   54.66  max =   60.81  avg =   57.16
        regnety_400m  min =   17.91  max =   18.08  avg =   18.01
           blazeface  min =    6.87  max =    7.03  avg =    6.94
           googlenet  min =   43.30  max =   43.54  avg =   43.43
      googlenet_int8  min =   80.07  max =   84.28  avg =   81.10
            resnet18  min =   43.89  max =   44.06  avg =   43.98
       resnet18_int8  min =   60.70  max =   63.43  avg =   61.60
             alexnet  min =   74.21  max =   75.20  avg =   74.45
               vgg16  min =  310.39  max =  310.65  avg =  310.52
          vgg16_int8  min =  293.15  max =  297.28  avg =  294.93
            resnet50  min =   93.03  max =   93.22  avg =   93.12
       resnet50_int8  min =  158.54  max =  161.25  avg =  159.56
      squeezenet_ssd  min =   55.88  max =   57.43  avg =   56.46
 squeezenet_ssd_int8  min =   72.42  max =   73.25  avg =   72.73
       mobilenet_ssd  min =   35.38  max =   37.57  avg =   36.63
  mobilenet_ssd_int8  min =   62.92  max =   64.97  avg =   63.63
      mobilenet_yolo  min =   76.56  max =   80.44  avg =   78.05
  mobilenetv2_yolov3  min =   46.35  max =   48.14  avg =   47.26
         yolov4-tiny  min =   95.38  max =   97.55  avg =   96.45
           nanodet_m  min =   22.82  max =   26.01  avg =   24.48
    yolo-fastest-1.1  min =   20.23  max =   25.51  avg =   21.52
      yolo-fastestv2  min =   20.67  max =   20.82  avg =   20.75

[0 NVIDIA Tegra X1 (nvgpu)]  queueC=0[16]  queueG=0[16]  queueT=0[16]
[0 NVIDIA Tegra X1 (nvgpu)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 NVIDIA Tegra X1 (nvgpu)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 NVIDIA Tegra X1 (nvgpu)]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 8
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 1
          squeezenet  min =   12.00  max =   15.41  avg =   13.55
     squeezenet_int8  min =   78.76  max =   79.14  avg =   78.91
           mobilenet  min =   16.03  max =   16.25  avg =   16.15
      mobilenet_int8  min =  107.58  max =  107.68  avg =  107.61
        mobilenet_v2  min =   12.84  max =   13.13  avg =   12.99
        mobilenet_v3  min =   13.29  max =   16.64  avg =   14.38
          shufflenet  min =   14.23  max =   14.54  avg =   14.34
       shufflenet_v2  min =   12.94  max =   13.21  avg =   13.02
             mnasnet  min =   13.42  max =   13.66  avg =   13.53
     proxylessnasnet  min =   14.64  max =   14.94  avg =   14.76
     efficientnet_b0  min =   21.28  max =   21.51  avg =   21.36
   efficientnetv2_b0  min =   74.32  max =   78.50  avg =   77.79
        regnety_400m  min =   17.94  max =   18.26  avg =   18.07
           blazeface  min =    6.83  max =    6.94  avg =    6.89
           googlenet  min =   43.45  max =   43.63  avg =   43.52
      googlenet_int8  min =  255.68  max =  256.33  avg =  255.92
            resnet18  min =   43.96  max =   44.06  avg =   44.01
       resnet18_int8  min =  192.01  max =  192.64  avg =  192.33
             alexnet  min =   74.04  max =   74.23  avg =   74.14
               vgg16  min =  310.32  max =  310.64  avg =  310.44
          vgg16_int8  min = 1003.05  max = 1004.27  avg = 1003.66
            resnet50  min =   93.05  max =   93.34  avg =   93.21
       resnet50_int8  min =  516.27  max =  517.12  avg =  516.69
      squeezenet_ssd  min =   56.67  max =   56.86  avg =   56.73
 squeezenet_ssd_int8  min =  182.96  max =  184.26  avg =  183.71
       mobilenet_ssd  min =   35.61  max =   35.70  avg =   35.65
  mobilenet_ssd_int8  min =  217.02  max =  217.50  avg =  217.23
      mobilenet_yolo  min =   78.10  max =   78.36  avg =   78.20
  mobilenetv2_yolov3  min =   49.86  max =   57.83  avg =   53.18
         yolov4-tiny  min =   96.76  max =   96.86  avg =   96.82
           nanodet_m  min =   25.26  max =   25.36  avg =   25.31
    yolo-fastest-1.1  min =   21.55  max =   24.22  avg =   23.78
      yolo-fastestv2  min =   20.80  max =   21.01  avg =   20.90

loop_count = 8
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   30.03  max =   31.41  avg =   30.59
     squeezenet_int8  min =   27.32  max =   27.76  avg =   27.50
           mobilenet  min =   41.74  max =   42.57  avg =   42.05
      mobilenet_int8  min =   30.48  max =   31.57  avg =   30.85
        mobilenet_v2  min =   33.49  max =   34.18  avg =   33.83
        mobilenet_v3  min =   30.59  max =   30.96  avg =   30.79
          shufflenet  min =   21.07  max =   31.68  avg =   22.53
       shufflenet_v2  min =   19.55  max =   20.01  avg =   19.71
             mnasnet  min =   31.70  max =   32.26  avg =   31.93
     proxylessnasnet  min =   36.90  max =   38.55  avg =   37.27
     efficientnet_b0  min =   68.42  max =   77.60  avg =   70.60
   efficientnetv2_b0  min =   73.72  max =   81.05  avg =   75.31
        regnety_400m  min =   56.67  max =   66.82  avg =   58.24
           blazeface  min =    6.55  max =    6.96  avg =    6.74
           googlenet  min =   92.74  max =   94.22  avg =   93.12
      googlenet_int8  min =   80.86  max =   87.28  avg =   82.41
            resnet18  min =   83.10  max =   84.30  avg =   83.44
       resnet18_int8  min =   59.40  max =   65.86  avg =   60.70
             alexnet  min =   89.21  max =   92.45  avg =   89.98
               vgg16  min =  445.72  max =  451.09  avg =  447.39
          vgg16_int8  min =  292.81  max =  295.55  avg =  294.34
            resnet50  min =  203.42  max =  204.45  avg =  204.08
       resnet50_int8  min =  157.87  max =  160.30  avg =  158.67
      squeezenet_ssd  min =   85.60  max =   87.24  avg =   86.18
 squeezenet_ssd_int8  min =   73.10  max =   85.64  avg =   74.94
       mobilenet_ssd  min =   86.75  max =   96.51  avg =   88.49
  mobilenet_ssd_int8  min =   63.40  max =   71.57  avg =   64.97
      mobilenet_yolo  min =  193.84  max =  195.24  avg =  194.62
  mobilenetv2_yolov3  min =  115.80  max =  117.27  avg =  116.27
         yolov4-tiny  min =  156.30  max =  158.26  avg =  156.81
           nanodet_m  min =   46.64  max =   47.97  avg =   47.12
    yolo-fastest-1.1  min =   25.78  max =   27.86  avg =   26.29
      yolo-fastestv2  min =   20.54  max =   30.73  avg =   22.18

loop_count = 8
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   85.91  max =   86.86  avg =   86.14
     squeezenet_int8  min =   77.57  max =   78.10  avg =   77.69
           mobilenet  min =  137.43  max =  138.03  avg =  137.63
      mobilenet_int8  min =  108.06  max =  108.21  avg =  108.13
        mobilenet_v2  min =   93.81  max =   94.70  avg =   93.99
        mobilenet_v3  min =   81.77  max =   82.49  avg =   81.99
          shufflenet  min =   47.84  max =   48.46  avg =   48.17
       shufflenet_v2  min =   47.93  max =   48.23  avg =   48.09
             mnasnet  min =   91.73  max =   92.55  avg =   91.98
     proxylessnasnet  min =  115.41  max =  115.75  avg =  115.56
     efficientnet_b0  min =  225.64  max =  226.21  avg =  225.94
   efficientnetv2_b0  min =  239.71  max =  240.20  avg =  239.89
        regnety_400m  min =  118.46  max =  118.84  avg =  118.61
           blazeface  min =   15.58  max =   17.14  avg =   16.21
           googlenet  min =  286.85  max =  287.51  avg =  287.11
      googlenet_int8  min =  256.44  max =  256.74  avg =  256.53
            resnet18  min =  221.27  max =  221.93  avg =  221.60
       resnet18_int8  min =  189.95  max =  191.34  avg =  190.74
             alexnet  min =  284.30  max =  285.40  avg =  284.87
               vgg16  min = 1241.51  max = 1244.53  avg = 1242.90
          vgg16_int8  min = 1003.92  max = 1004.47  avg = 1004.29
            resnet50  min =  624.43  max =  625.34  avg =  624.84
       resnet50_int8  min =  516.64  max =  517.26  avg =  516.99
      squeezenet_ssd  min =  190.21  max =  191.35  avg =  190.71
 squeezenet_ssd_int8  min =  182.97  max =  184.19  avg =  183.38
       mobilenet_ssd  min =  275.60  max =  276.17  avg =  275.90
  mobilenet_ssd_int8  min =  216.67  max =  217.58  avg =  216.94
      mobilenet_yolo  min =  616.16  max =  617.45  avg =  616.71
  mobilenetv2_yolov3  min =  324.88  max =  325.73  avg =  325.19
         yolov4-tiny  min =  421.01  max =  423.52  avg =  422.14
           nanodet_m  min =  117.39  max =  117.75  avg =  117.54
    yolo-fastest-1.1  min =   54.55  max =   55.61  avg =   54.87
      yolo-fastestv2  min =   44.40  max =   44.78  avg =   44.57
```

### NVIDIA Jetson TX2 NX (NV-Denver2 2.0GHz x 2 + Cortex-A57 2.0GHz x 4 + 256-core NVIDIA Pascal iGPU)
```
fan@ubuntu:~/ncnn/benchmark$ ../build/benchmark/benchncnn 10 $(nproc) 0 0
[0 NVIDIA Tegra X2 (nvgpu)]  queueC=0[16]  queueG=0[16]  queueT=0[16]
[0 NVIDIA Tegra X2 (nvgpu)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 NVIDIA Tegra X2 (nvgpu)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 NVIDIA Tegra X2 (nvgpu)]  subgroup=32  basic/vote/ballot/shuffle=1/1/1/1
[0 NVIDIA Tegra X2 (nvgpu)]  fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0
loop_count = 10
num_threads = 6
powersave = 0
gpu_device = 0
cooling_down = 1
          squeezenet  min =    4.84  max =    6.12  avg =    5.33
     squeezenet_int8  min =   23.14  max =  148.62  avg =   52.65
           mobilenet  min =    7.23  max =    7.57  avg =    7.40
      mobilenet_int8  min =   19.69  max =  101.50  avg =   44.15
        mobilenet_v2  min =    6.65  max =    6.86  avg =    6.76
        mobilenet_v3  min =    7.22  max =    8.34  avg =    8.01
          shufflenet  min =    6.14  max =    6.73  avg =    6.51
       shufflenet_v2  min =    5.33  max =    5.43  avg =    5.39
             mnasnet  min =    6.98  max =    7.47  avg =    7.16
     proxylessnasnet  min =    6.90  max =    7.52  avg =    7.09
     efficientnet_b0  min =   11.42  max =   11.89  avg =   11.67
   efficientnetv2_b0  min =   26.48  max =   51.57  avg =   36.25
        regnety_400m  min =    8.94  max =    9.45  avg =    9.13
           blazeface  min =    2.08  max =    3.21  avg =    2.42
           googlenet  min =   15.33  max =   15.78  avg =   15.53
      googlenet_int8  min =   64.02  max =  158.22  avg =   79.32
            resnet18  min =   12.25  max =   13.28  avg =   12.78
       resnet18_int8  min =   41.89  max =  156.59  avg =   57.07
             alexnet  min =   20.15  max =   20.51  avg =   20.32
               vgg16  min =   62.45  max =   64.63  avg =   63.06
          vgg16_int8  min =  198.24  max =  271.71  avg =  217.63
            resnet50  min =   30.05  max =   31.11  avg =   30.39
       resnet50_int8  min =  129.03  max =  205.33  avg =  154.72
      squeezenet_ssd  min =   18.48  max =   22.90  avg =   20.26
 squeezenet_ssd_int8  min =   48.18  max =   71.20  avg =   60.89
       mobilenet_ssd  min =   15.56  max =   15.76  avg =   15.67
  mobilenet_ssd_int8  min =   55.10  max =  114.34  avg =   67.41
      mobilenet_yolo  min =   28.75  max =   32.54  avg =   30.30
  mobilenetv2_yolov3  min =   26.15  max =   32.36  avg =   29.57
         yolov4-tiny  min =   23.08  max =   37.19  avg =   25.43
           nanodet_m  min =   15.81  max =   19.99  avg =   18.10
    yolo-fastest-1.1  min =    7.35  max =   11.26  avg =    8.69
      yolo-fastestv2  min =    6.16  max =    6.61  avg =    6.31
  vision_transformer  min = 1301.45  max = 1356.58  avg = 1321.51
          FastestDet  min =    5.64  max =    6.60  avg =    5.90
fan@ubuntu:~/ncnn/benchmark$ ../build/benchmark/benchncnn 10 1 0 0
[0 NVIDIA Tegra X2 (nvgpu)]  queueC=0[16]  queueG=0[16]  queueT=0[16]
[0 NVIDIA Tegra X2 (nvgpu)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 NVIDIA Tegra X2 (nvgpu)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 NVIDIA Tegra X2 (nvgpu)]  subgroup=32  basic/vote/ballot/shuffle=1/1/1/1
[0 NVIDIA Tegra X2 (nvgpu)]  fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 1
          squeezenet  min =    5.10  max =    6.33  avg =    5.51
     squeezenet_int8  min =   56.36  max =   59.23  avg =   57.79
           mobilenet  min =    6.61  max =    9.93  avg =    7.27
      mobilenet_int8  min =   95.73  max =  107.69  avg =   99.35
        mobilenet_v2  min =    6.66  max =    9.87  avg =    7.22
        mobilenet_v3  min =    7.20  max =    8.77  avg =    7.61
          shufflenet  min =    5.87  max =    6.13  avg =    5.97
       shufflenet_v2  min =    5.63  max =    8.24  avg =    6.10
             mnasnet  min =    6.55  max =    9.05  avg =    7.10
     proxylessnasnet  min =    7.29  max =    7.86  avg =    7.50
     efficientnet_b0  min =   11.22  max =   12.13  avg =   11.49
   efficientnetv2_b0  min =   20.21  max =   24.55  avg =   21.42
        regnety_400m  min =    8.94  max =   10.77  avg =    9.37
           blazeface  min =    2.30  max =    2.45  avg =    2.35
           googlenet  min =   15.48  max =   17.88  avg =   16.32
      googlenet_int8  min =  197.08  max =  205.18  avg =  200.93
            resnet18  min =   12.69  max =   13.38  avg =   13.01
       resnet18_int8  min =  147.42  max =  154.63  avg =  149.94
             alexnet  min =   20.49  max =   20.83  avg =   20.62
               vgg16  min =   62.43  max =   63.41  avg =   62.81
          vgg16_int8  min =  802.28  max =  810.33  avg =  805.66
            resnet50  min =   29.96  max =   30.56  avg =   30.26
       resnet50_int8  min =  488.38  max =  494.67  avg =  491.09
      squeezenet_ssd  min =   18.35  max =   18.84  avg =   18.59
 squeezenet_ssd_int8  min =  121.27  max =  124.52  avg =  122.21
       mobilenet_ssd  min =   15.13  max =   15.60  avg =   15.30
  mobilenet_ssd_int8  min =  206.22  max =  225.98  avg =  222.55
      mobilenet_yolo  min =   30.12  max =   31.28  avg =   30.41
  mobilenetv2_yolov3  min =   26.65  max =   27.08  avg =   26.87
         yolov4-tiny  min =   22.91  max =   23.32  avg =   23.04
           nanodet_m  min =   11.57  max =   11.99  avg =   11.75
    yolo-fastest-1.1  min =    7.06  max =    7.49  avg =    7.25
      yolo-fastestv2  min =    6.17  max =    6.65  avg =    6.34
  vision_transformer  min = 1185.13  max = 1193.94  avg = 1189.50
          FastestDet  min =    5.78  max =    6.87  avg =    6.11
fan@ubuntu:~/ncnn/benchmark$ ../build/benchmark/benchncnn 10 $(nproc) 0 -1
loop_count = 10
num_threads = 6
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   19.92  max =   22.96  avg =   21.43
     squeezenet_int8  min =   20.33  max =   25.17  avg =   22.63
           mobilenet  min =   27.25  max =   80.19  avg =   36.64
      mobilenet_int8  min =   21.22  max =   31.14  avg =   27.05
        mobilenet_v2  min =   21.95  max =   25.77  avg =   24.10
        mobilenet_v3  min =   20.10  max =   34.13  avg =   25.30
          shufflenet  min =   14.96  max =  108.36  avg =   28.88
       shufflenet_v2  min =   13.25  max =   29.33  avg =   16.43
             mnasnet  min =   19.41  max =  111.63  avg =   30.57
     proxylessnasnet  min =   22.58  max =   27.29  avg =   24.43
     efficientnet_b0  min =   32.95  max =   35.53  avg =   34.46
   efficientnetv2_b0  min =   36.91  max =   52.12  avg =   41.72
        regnety_400m  min =   43.87  max =  152.33  avg =   56.15
           blazeface  min =    4.51  max =   16.71  avg =    6.79
           googlenet  min =   59.37  max =   93.96  avg =   70.88
      googlenet_int8  min =   57.95  max =  124.06  avg =   71.47
            resnet18  min =   51.99  max =  134.81  avg =   68.50
       resnet18_int8  min =   40.54  max =  130.18  avg =   54.10
             alexnet  min =   41.42  max =   67.03  avg =   52.66
               vgg16  min =  253.75  max =  295.39  avg =  265.01
          vgg16_int8  min =  183.96  max =  334.83  avg =  206.81
            resnet50  min =  305.79  max =  330.68  avg =  316.55
       resnet50_int8  min =  120.10  max =  133.19  avg =  125.92
      squeezenet_ssd  min =   51.06  max =  125.69  avg =   67.34
 squeezenet_ssd_int8  min =   44.56  max =  156.68  avg =   61.47
       mobilenet_ssd  min =   52.27  max =  123.50  avg =   64.86
  mobilenet_ssd_int8  min =   48.18  max =  183.44  avg =   63.25
      mobilenet_yolo  min =  120.27  max =  160.73  avg =  130.75
  mobilenetv2_yolov3  min =   74.39  max =  167.08  avg =   86.50
         yolov4-tiny  min =  108.39  max =  123.62  avg =  112.81
           nanodet_m  min =   32.38  max =   91.62  avg =   42.01
    yolo-fastest-1.1  min =   17.97  max =  157.78  avg =   34.93
      yolo-fastestv2  min =   16.12  max =   19.55  avg =   18.03
  vision_transformer  min = 2317.30  max = 2437.95  avg = 2375.98
          FastestDet  min =   15.52  max =  127.95  avg =   27.40
fan@ubuntu:~/ncnn/benchmark$ ../build/benchmark/benchncnn 10 1 0 -1
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   48.72  max =   50.66  avg =   49.98
     squeezenet_int8  min =   56.50  max =   61.58  avg =   58.64
           mobilenet  min =   88.10  max =   89.76  avg =   88.92
      mobilenet_int8  min =   95.08  max =   96.92  avg =   95.82
        mobilenet_v2  min =   58.72  max =   61.48  avg =   59.54
        mobilenet_v3  min =   48.58  max =   49.95  avg =   49.24
          shufflenet  min =   30.42  max =   32.03  avg =   31.17
       shufflenet_v2  min =   28.27  max =   29.37  avg =   28.65
             mnasnet  min =   56.85  max =   58.22  avg =   57.37
     proxylessnasnet  min =   68.67  max =   71.23  avg =   69.64
     efficientnet_b0  min =   89.27  max =   92.67  avg =   90.33
   efficientnetv2_b0  min =  107.72  max =  109.86  avg =  108.53
        regnety_400m  min =   85.19  max =   91.74  avg =   86.95
           blazeface  min =    8.60  max =    8.80  avg =    8.71
           googlenet  min =  161.58  max =  166.70  avg =  163.60
      googlenet_int8  min =  183.79  max =  189.43  avg =  186.17
            resnet18  min =  123.43  max =  126.29  avg =  124.86
       resnet18_int8  min =  140.80  max =  144.92  avg =  142.60
             alexnet  min =   93.16  max =  100.47  avg =   96.44
               vgg16  min =  664.14  max =  671.67  avg =  667.90
          vgg16_int8  min =  799.67  max =  813.66  avg =  803.50
            resnet50  min =  384.10  max =  388.46  avg =  386.49
       resnet50_int8  min =  448.11  max =  473.27  avg =  465.12
      squeezenet_ssd  min =  106.58  max =  109.62  avg =  107.39
 squeezenet_ssd_int8  min =  118.39  max =  122.62  avg =  120.43
       mobilenet_ssd  min =  178.89  max =  183.37  avg =  180.47
  mobilenet_ssd_int8  min =  201.46  max =  207.18  avg =  203.00
      mobilenet_yolo  min =  407.54  max =  411.12  avg =  409.33
  mobilenetv2_yolov3  min =  211.83  max =  214.46  avg =  213.20
         yolov4-tiny  min =  249.11  max =  254.22  avg =  251.38
           nanodet_m  min =   69.41  max =   71.26  avg =   70.28
    yolo-fastest-1.1  min =   30.99  max =   33.29  avg =   32.03
      yolo-fastestv2  min =   27.70  max =   28.90  avg =   27.93
  vision_transformer  min = 3203.45  max = 3402.10  avg = 3286.58
          FastestDet  min =   29.05  max =   32.57  avg =   30.53
```

### Rockchip RK3288-CG.W (Cortex-A17 1.8GHz x 4)
```
WW_Tinker_Board:/data/local/tmp # ./benchncnn 8 4 0 -1 1
loop_count = 8
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   56.61  max =   56.80  avg =   56.69
     squeezenet_int8  min =   40.63  max =   41.05  avg =   40.89
           mobilenet  min =   83.91  max =   84.59  avg =   84.23
      mobilenet_int8  min =   36.15  max =   36.44  avg =   36.25
        mobilenet_v2  min =   71.12  max =   71.73  avg =   71.54
        mobilenet_v3  min =   56.08  max =   56.56  avg =   56.28
          shufflenet  min =   37.39  max =   37.75  avg =   37.55
       shufflenet_v2  min =   35.19  max =   35.52  avg =   35.34
             mnasnet  min =   62.08  max =   62.36  avg =   62.24
     proxylessnasnet  min =   66.98  max =   67.38  avg =   67.16
     efficientnet_b0  min =  109.95  max =  110.71  avg =  110.15
   efficientnetv2_b0  min =  122.56  max =  123.31  avg =  122.94
        regnety_400m  min =   88.84  max =   89.19  avg =   88.99
           blazeface  min =   11.79  max =   11.92  avg =   11.85
           googlenet  min =  162.56  max =  165.39  avg =  163.19
      googlenet_int8  min =  110.35  max =  110.91  avg =  110.60
            resnet18  min =  172.39  max =  173.99  avg =  173.24
       resnet18_int8  min =   84.00  max =   84.40  avg =   84.19
             alexnet  min =  156.71  max =  158.23  avg =  157.59
               vgg16  min =  956.95  max =  964.32  avg =  960.60
          vgg16_int8  min =  388.10  max =  389.52  avg =  388.68
            resnet50  min =  403.05  max =  404.80  avg =  404.01
       resnet50_int8  min =  205.12  max =  207.42  avg =  206.19
      squeezenet_ssd  min =  163.61  max =  165.79  avg =  164.93
 squeezenet_ssd_int8  min =  125.88  max =  126.35  avg =  126.12
       mobilenet_ssd  min =  175.97  max =  176.86  avg =  176.39
  mobilenet_ssd_int8  min =   76.90  max =   77.74  avg =   77.35
      mobilenet_yolo  min =  385.59  max =  387.19  avg =  386.60
  mobilenetv2_yolov3  min =  234.88  max =  236.22  avg =  235.66
         yolov4-tiny  min =  307.44  max =  310.64  avg =  308.54
           nanodet_m  min =   92.54  max =   93.15  avg =   92.82
    yolo-fastest-1.1  min =   46.69  max =   47.02  avg =   46.83
      yolo-fastestv2  min =   38.37  max =   38.68  avg =   38.54

WW_Tinker_Board:/data/local/tmp # ./benchncnn 4 1 0 -1 1
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  138.27  max =  138.57  avg =  138.41
     squeezenet_int8  min =   85.97  max =   86.23  avg =   86.05
           mobilenet  min =  234.90  max =  235.08  avg =  235.00
      mobilenet_int8  min =   99.92  max =  100.45  avg =  100.12
        mobilenet_v2  min =  157.76  max =  157.99  avg =  157.86
        mobilenet_v3  min =  130.05  max =  130.23  avg =  130.17
          shufflenet  min =   74.48  max =   74.62  avg =   74.55
       shufflenet_v2  min =   74.05  max =   74.25  avg =   74.13
             mnasnet  min =  150.74  max =  151.03  avg =  150.87
     proxylessnasnet  min =  171.09  max =  171.23  avg =  171.16
     efficientnet_b0  min =  306.85  max =  307.02  avg =  306.97
   efficientnetv2_b0  min =  347.40  max =  347.87  avg =  347.64
        regnety_400m  min =  190.26  max =  190.33  avg =  190.29
           blazeface  min =   25.25  max =   25.68  avg =   25.47
           googlenet  min =  432.09  max =  432.48  avg =  432.32
      googlenet_int8  min =  275.55  max =  276.07  avg =  275.88
            resnet18  min =  355.11  max =  358.56  avg =  356.90
       resnet18_int8  min =  205.80  max =  206.68  avg =  206.26
             alexnet  min =  330.09  max =  330.29  avg =  330.15
               vgg16  min = 2122.95  max = 2124.45  avg = 2123.68
          vgg16_int8  min = 1048.53  max = 1049.29  avg = 1048.86
            resnet50  min = 1047.27  max = 1048.33  avg = 1047.63
       resnet50_int8  min =  517.75  max =  519.28  avg =  518.81
      squeezenet_ssd  min =  304.69  max =  305.75  avg =  305.16
 squeezenet_ssd_int8  min =  219.16  max =  219.94  avg =  219.45
       mobilenet_ssd  min =  483.73  max =  484.12  avg =  484.01
  mobilenet_ssd_int8  min =  208.89  max =  209.19  avg =  209.09
      mobilenet_yolo  min = 1092.75  max = 1093.70  avg = 1093.13
  mobilenetv2_yolov3  min =  560.66  max =  560.92  avg =  560.77
         yolov4-tiny  min =  704.69  max =  705.38  avg =  705.12
           nanodet_m  min =  187.13  max =  187.57  avg =  187.39
    yolo-fastest-1.1  min =   83.05  max =   83.11  avg =   83.08
      yolo-fastestv2  min =   72.19  max =   72.23  avg =   72.21

WW_Tinker_Board:/data/local/tmp # ./benchncnn 4 1 0 0 0
[0 Mali-T760]  queueC=0[2]  queueG=0[2]  queueT=0[2]
[0 Mali-T760]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=1
[0 Mali-T760]  fp16-p/s/a=1/0/1  int8-p/s/a=1/0/0
[0 Mali-T760]  subgroup=0  basic=0  vote=0  ballot=0  shuffle=0
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =   41.78  max =   41.82  avg =   41.79
           mobilenet  min =   62.67  max =   62.80  avg =   62.74
        mobilenet_v2  min =   51.08  max =   51.26  avg =   51.17
        mobilenet_v3  min =   51.43  max =   51.70  avg =   51.51
          shufflenet  min =   56.83  max =   56.94  avg =   56.87
       shufflenet_v2  min =   48.46  max =   48.63  avg =   48.53
             mnasnet  min =   52.31  max =   52.63  avg =   52.42
     proxylessnasnet  min =   57.33  max =   57.46  avg =   57.41
     efficientnet_b0  min =   87.52  max =   87.80  avg =   87.62
   efficientnetv2_b0  min =  123.83  max =  124.67  avg =  124.34
        regnety_400m  min =   65.52  max =   65.81  avg =   65.64
           blazeface  min =   14.56  max =   14.73  avg =   14.62
           googlenet  min =  138.52  max =  139.39  avg =  138.89
            resnet18  min =  124.45  max =  124.81  avg =  124.58
             alexnet  min =  130.46  max =  130.68  avg =  130.54
```

### HiSilicon Hi3519V101 (Cortex-A17 1.2GHz x 1)
```
root@Hi3519:/ncnn-benchmark # taskset 2 ./benchncnn 8 1 0
loop_count = 8
num_threads = 1
powersave = 0
      squeezenet  min =  272.97  max =  275.84  avg =  274.85
 squeezenet-int8  min =  200.87  max =  202.47  avg =  201.74
       mobilenet  min =  480.90  max =  482.16  avg =  481.64
    mobilenet_v2  min =  350.01  max =  352.39  avg =  350.81
      shufflenet  min =  152.40  max =  153.17  avg =  152.80
       googlenet  min = 1096.65  max = 1101.35  avg = 1099.21
        resnet18  min =  983.92  max =  987.00  avg =  985.25
         alexnet  min = 1140.30  max = 1141.55  avg = 1140.92
  squeezenet-ssd  min =  574.62  max =  580.12  avg =  577.23
   mobilenet-ssd  min =  960.26  max =  969.13  avg =  965.93
  mobilenet-yolo  min = 1867.78  max = 1880.08  avg = 1873.89
```

### iPhone 5S (Apple A7 1.3GHz x 2)
```
iPhone:~ root# ./benchncnn 8 2 0 -1
[0 Apple A7 GPU]  queueC=0[8]  queueT=0[8]  memU=1  memDL=1  memHV=1
[0 Apple A7 GPU]  fp16p=1  fp16s=0  fp16a=0  int8s=0  int8a=0
loop_count = 8
num_threads = 2
powersave = 0
gpu_device = -1
          squeezenet  min =   49.21  max =   50.40  avg =   49.74
     squeezenet_int8  min =   54.73  max =   57.39  avg =   56.70
           mobilenet  min =   79.03  max =   80.00  avg =   79.44
      mobilenet_int8  min =  109.95  max =  112.69  avg =  111.38
        mobilenet_v2  min =   57.34  max =   57.88  avg =   57.47
        mobilenet_v3  min =   52.66  max =   53.73  avg =   53.12
          shufflenet  min =   32.78  max =   36.12  avg =   35.12
       shufflenet_v2  min =   31.25  max =   32.10  avg =   31.61
             mnasnet  min =   54.58  max =   56.12  avg =   55.44
     proxylessnasnet  min =   69.52  max =   72.42  avg =   70.40
           googlenet  min =  192.82  max =  194.20  avg =  193.35
      googlenet_int8  min =  235.43  max =  244.71  avg =  239.64
            resnet18  min =  164.33  max =  167.27  avg =  165.51
       resnet18_int8  min =  176.16  max =  179.73  avg =  178.60
             alexnet  min =  224.50  max =  228.21  avg =  226.51
               vgg16  min = 4262.28  max = 4400.29  avg = 4300.34
          vgg16_int8  min = 2835.84  max = 2955.22  avg = 2890.26
            resnet50  min =  542.66  max = 1344.49  avg =  737.05
       resnet50_int8  min =  426.08  max =  435.34  avg =  431.87
      squeezenet_ssd  min =  129.03  max =  131.44  avg =  129.99
 squeezenet_ssd_int8  min =  155.52  max =  161.42  avg =  158.51
       mobilenet_ssd  min =  168.18  max =  170.17  avg =  169.42
  mobilenet_ssd_int8  min =  205.78  max =  212.07  avg =  209.66
      mobilenet_yolo  min =  347.32  max =  363.15  avg =  355.72
  mobilenetv2_yolov3  min =  193.11  max =  196.64  avg =  194.31

iPhone:~ root# ./benchncnn 4 1 0 -1
[0 Apple A7 GPU]  queueC=0[8]  queueT=0[8]  memU=1  memDL=1  memHV=1
[0 Apple A7 GPU]  fp16p=1  fp16s=0  fp16a=0  int8s=0  int8a=0
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = -1
          squeezenet  min =   86.36  max =   86.81  avg =   86.57
     squeezenet_int8  min =   99.62  max =  100.07  avg =   99.83
           mobilenet  min =  143.11  max =  146.50  avg =  145.38
      mobilenet_int8  min =  202.25  max =  203.32  avg =  203.02
        mobilenet_v2  min =   97.56  max =   98.55  avg =   98.09
        mobilenet_v3  min =   87.45  max =   87.68  avg =   87.52
          shufflenet  min =   54.01  max =   54.13  avg =   54.08
       shufflenet_v2  min =   48.11  max =   48.65  avg =   48.36
             mnasnet  min =   95.02  max =   95.77  avg =   95.25
     proxylessnasnet  min =  123.91  max =  124.61  avg =  124.18
           googlenet  min =  344.23  max =  348.95  avg =  345.97
      googlenet_int8  min =  420.30  max =  420.99  avg =  420.65
            resnet18  min =  300.44  max =  301.36  avg =  300.99
       resnet18_int8  min =  308.60  max =  310.52  avg =  309.70
             alexnet  min =  423.92  max =  429.84  avg =  427.24
               vgg16  min = 4787.59  max = 5015.23  avg = 4900.43
          vgg16_int8  min = 3560.59  max = 3722.75  avg = 3639.88
            resnet50  min =  797.88  max = 1294.57  avg =  985.63
       resnet50_int8  min =  751.15  max =  760.25  avg =  757.89
      squeezenet_ssd  min =  193.75  max =  196.13  avg =  195.29
 squeezenet_ssd_int8  min =  243.78  max =  245.19  avg =  244.74
       mobilenet_ssd  min =  299.69  max =  307.22  avg =  305.12
  mobilenet_ssd_int8  min =  385.91  max =  389.82  avg =  388.48
      mobilenet_yolo  min =  657.00  max =  659.31  avg =  658.08
  mobilenetv2_yolov3  min =  335.59  max =  342.22  avg =  339.37

iPhone:~ root# ./benchncnn 4 1 0 0
[0 Apple A7 GPU]  queueC=0[8]  queueT=0[8]  memU=1  memDL=1  memHV=1
[0 Apple A7 GPU]  fp16p=1  fp16s=0  fp16a=0  int8s=0  int8a=0
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = 0
          squeezenet  min =  260.18  max =  262.55  avg =  261.09
           mobilenet  min =  288.73  max =  291.83  avg =  289.67
        mobilenet_v2  min =  265.72  max =  267.05  avg =  266.14
        mobilenet_v3  min =  255.86  max =  257.35  avg =  256.43
          shufflenet  min =  236.66  max =  239.49  avg =  237.98
       shufflenet_v2  min =  244.92  max =  247.75  avg =  246.22
             mnasnet  min =  254.75  max =  256.48  avg =  255.85
     proxylessnasnet  min =  281.42  max =  282.62  avg =  282.11
           googlenet  min =  745.36  max =  764.91  avg =  754.16
            resnet18  min =  721.26  max =  741.98  avg =  734.78
             alexnet  min =  521.43  max =  530.95  avg =  527.01
            resnet50  min = 1494.86  max = 1505.79  avg = 1501.49
      squeezenet_ssd  min = 1096.45  max = 1102.84  avg = 1098.55
       mobilenet_ssd  min =  639.50  max =  641.81  avg =  640.83
      mobilenet_yolo  min = 1445.16  max = 1450.94  avg = 1447.42
  mobilenetv2_yolov3  min = 1047.24  max = 1060.97  avg = 1052.86
```

### Freescale i.MX7 Dual (Cortex-A7 1.0GHz x 2)
```
imx7d_pico:/data/local/tmp $ ./benchncnn 8 2 0 -1 1
loop_count = 8
num_threads = 2
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  220.10  max =  226.46  avg =  222.89
     squeezenet_int8  min =  159.26  max =  165.25  avg =  161.71
           mobilenet  min =  366.92  max =  373.78  avg =  371.55
      mobilenet_int8  min =  223.14  max =  229.66  avg =  225.66
        mobilenet_v2  min =  252.32  max =  259.41  avg =  255.54
        mobilenet_v3  min =  214.05  max =  222.24  avg =  217.53
          shufflenet  min =  137.02  max =  144.79  avg =  138.85
       shufflenet_v2  min =  134.89  max =  140.75  avg =  137.18
             mnasnet  min =  250.64  max =  256.75  avg =  253.33
     proxylessnasnet  min =  285.35  max =  291.43  avg =  288.37
     efficientnet_b0  min =  430.47  max =  436.63  avg =  434.75
        regnety_400m  min =  317.69  max =  325.77  avg =  321.24
           blazeface  min =   42.93  max =   43.30  avg =   43.14
           googlenet  min =  721.84  max =  728.40  avg =  724.23
      googlenet_int8  min =  504.07  max =  511.06  avg =  507.39
            resnet18  min =  645.61  max =  653.08  avg =  648.51
       resnet18_int8  min =  370.84  max =  514.38  avg =  392.80
             alexnet  min =  783.64  max =  794.83  avg =  786.95
      squeezenet_ssd  min =  508.71  max =  513.70  avg =  511.29
 squeezenet_ssd_int8  min =  402.85  max =  409.32  avg =  406.45
       mobilenet_ssd  min =  763.70  max =  771.52  avg =  767.61
  mobilenet_ssd_int8  min =  457.99  max =  460.85  avg =  459.76
      mobilenet_yolo  min = 1730.90  max = 1746.52  avg = 1741.26
  mobilenetv2_yolov3  min =  884.00  max =  892.97  avg =  889.38
         yolov4-tiny  min = 1181.20  max = 1218.20  avg = 1202.28
           nanodet_m  min =  331.53  max =  339.89  avg =  334.62

imx7d_pico:/data/local/tmp $ ./benchncnn 4 1 0 -1 1
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  408.39  max =  410.27  avg =  408.95
     squeezenet_int8  min =  290.25  max =  290.95  avg =  290.61
           mobilenet  min =  707.10  max =  711.64  avg =  708.47
      mobilenet_int8  min =  434.95  max =  436.16  avg =  435.66
        mobilenet_v2  min =  466.52  max =  467.41  avg =  466.96
        mobilenet_v3  min =  407.03  max =  408.29  avg =  407.56
          shufflenet  min =  240.65  max =  241.07  avg =  240.85
       shufflenet_v2  min =  229.27  max =  235.66  avg =  231.51
             mnasnet  min =  471.21  max =  471.48  avg =  471.35
     proxylessnasnet  min =  544.74  max =  547.62  avg =  546.20
     efficientnet_b0  min =  824.09  max =  824.44  avg =  824.20
        regnety_400m  min =  570.20  max =  571.73  avg =  570.82
           blazeface  min =   76.46  max =   77.05  avg =   76.81
           googlenet  min = 1368.82  max = 1369.99  avg = 1369.33
      googlenet_int8  min =  945.51  max =  946.61  avg =  945.91
            resnet18  min = 1237.79  max = 1257.12  avg = 1246.80
       resnet18_int8  min =  705.09  max =  706.72  avg =  705.63
             alexnet  min = 1516.35  max = 1522.82  avg = 1519.52
      squeezenet_ssd  min =  906.97  max =  908.48  avg =  907.68
 squeezenet_ssd_int8  min =  727.15  max =  728.16  avg =  727.77
       mobilenet_ssd  min = 1475.19  max = 1478.52  avg = 1476.81
  mobilenet_ssd_int8  min =  883.88  max =  890.68  avg =  885.90
      mobilenet_yolo  min = 3408.43  max = 3418.63  avg = 3412.52
  mobilenetv2_yolov3  min = 1685.18  max = 1695.89  avg = 1689.23
         yolov4-tiny  min = 2168.24  max = 2183.24  avg = 2175.93
           nanodet_m  min =  561.56  max =  562.05  avg =  561.72
```

### Z7-Lite 7020 XC7Z020CLG400-2 (Cortex-A9 766MHz x 2)
```
root@petalinux_hdmi:~# LD_LIBRARY_PATH=. ./benchncnn 8 2 0 -1 1
loop_count = 8
num_threads = 2
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  389.18  max =  390.13  avg =  389.60
     squeezenet_int8  min =  254.33  max =  255.24  avg =  254.85
           mobilenet  min =  623.71  max =  625.01  avg =  624.46
      mobilenet_int8  min =  240.40  max =  241.03  avg =  240.87
        mobilenet_v2  min =  450.00  max =  450.89  avg =  450.40
        mobilenet_v3  min =  362.99  max =  363.66  avg =  363.28
          shufflenet  min =  212.20  max =  213.28  avg =  212.84
       shufflenet_v2  min =  210.26  max =  212.64  avg =  211.53
             mnasnet  min =  408.67  max =  409.64  avg =  409.17
     proxylessnasnet  min =  449.86  max =  450.94  avg =  450.45
     efficientnet_b0  min =  737.40  max =  739.58  avg =  738.32
   efficientnetv2_b0  min =  848.58  max =  849.74  avg =  849.24
        regnety_400m  min =  501.32  max =  503.02  avg =  501.87
           blazeface  min =   70.89  max =   72.22  avg =   71.61
      squeezenet_ssd  min =  978.55  max =  979.86  avg =  979.22
 squeezenet_ssd_int8  min =  691.90  max =  694.18  avg =  692.73
       mobilenet_ssd  min = 1353.12  max = 1354.13  avg = 1353.53
  mobilenet_ssd_int8  min =  496.26  max =  497.29  avg =  496.61
           nanodet_m  min =  542.04  max =  546.29  avg =  544.73
    yolo-fastest-1.1  min =  282.75  max =  286.11  avg =  284.24
      yolo-fastestv2  min =  230.91  max =  232.74  avg =  231.56

root@petalinux_hdmi:~# LD_LIBRARY_PATH=. ./benchncnn 4 1 0 -1 1
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  637.19  max =  639.33  avg =  637.82
     squeezenet_int8  min =  390.31  max =  391.63  avg =  390.94
           mobilenet  min = 1085.54  max = 1085.96  avg = 1085.71
      mobilenet_int8  min =  437.28  max =  437.65  avg =  437.44
        mobilenet_v2  min =  716.03  max =  716.75  avg =  716.35
        mobilenet_v3  min =  587.83  max =  588.55  avg =  588.21
          shufflenet  min =  331.28  max =  331.97  avg =  331.63
       shufflenet_v2  min =  331.03  max =  333.19  avg =  331.76
             mnasnet  min =  682.68  max =  683.11  avg =  682.82
     proxylessnasnet  min =  763.89  max =  764.80  avg =  764.35
     efficientnet_b0  min = 1288.61  max = 1289.10  avg = 1288.81
   efficientnetv2_b0  min = 1499.12  max = 1500.11  avg = 1499.65
        regnety_400m  min =  852.03  max =  853.16  avg =  852.68
           blazeface  min =  109.40  max =  111.51  avg =  110.41
      squeezenet_ssd  min = 1493.25  max = 1497.00  avg = 1494.87
 squeezenet_ssd_int8  min = 1016.77  max = 1019.31  avg = 1017.99
       mobilenet_ssd  min = 2379.20  max = 2379.83  avg = 2379.64
  mobilenet_ssd_int8  min =  881.70  max =  881.89  avg =  881.83
           nanodet_m  min =  831.13  max =  832.58  avg =  831.87
    yolo-fastest-1.1  min =  466.80  max =  469.90  avg =  468.79
      yolo-fastestv2  min =  352.07  max =  355.20  avg =  353.36
```

### Loongson 2K1000 (GS264 1.0GHz x 2)
```
root@ls2k:~/ncnn/build/benchmark# ./benchncnn 10 2 0 -1 1
loop_count = 10
num_threads = 2
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  184.33  max =  184.94  avg =  184.65
     squeezenet_int8  min =  201.42  max =  201.99  avg =  201.72
           mobilenet  min =  277.17  max =  278.04  avg =  277.66
      mobilenet_int8  min =  234.61  max =  235.17  avg =  234.81
        mobilenet_v2  min =  223.10  max =  274.92  avg =  228.71
        mobilenet_v3  min =  185.79  max =  201.76  avg =  187.60
          shufflenet  min =  129.78  max =  131.09  avg =  130.28
       shufflenet_v2  min =  115.86  max =  116.77  avg =  116.42
             mnasnet  min =  213.92  max =  214.72  avg =  214.26
     proxylessnasnet  min =  240.05  max =  242.02  avg =  240.86
     efficientnet_b0  min =  347.52  max =  348.53  avg =  348.13
   efficientnetv2_b0  min =  382.78  max =  479.58  avg =  398.18
        regnety_400m  min =  270.00  max =  312.84  avg =  274.66
           blazeface  min =   37.60  max =   38.02  avg =   37.79
           googlenet  min =  659.55  max =  693.17  avg =  666.17
      googlenet_int8  min =  678.26  max =  718.39  avg =  682.79
            resnet18  min =  499.75  max =  766.88  avg =  532.49
       resnet18_int8  min =  500.38  max =  533.97  avg =  504.56
             alexnet  min =  508.49  max =  542.94  avg =  516.13
               vgg16  min = 2654.06  max = 3082.44  avg = 2762.51
          vgg16_int8  min = 2628.96  max = 2665.35  avg = 2647.12
            resnet50  min = 1256.97  max = 1417.45  avg = 1283.04
       resnet50_int8  min = 1232.55  max = 1276.94  avg = 1244.59
      squeezenet_ssd  min =  538.83  max =  588.03  avg =  553.44
 squeezenet_ssd_int8  min =  501.67  max =  532.61  avg =  505.72
       mobilenet_ssd  min =  571.14  max =  600.93  avg =  578.22
  mobilenet_ssd_int8  min =  478.67  max =  515.39  avg =  483.06
      mobilenet_yolo  min = 1644.48  max = 1729.17  avg = 1669.18
  mobilenetv2_yolov3  min =  752.22  max =  792.40  avg =  760.10
         yolov4-tiny  min =  994.48  max = 1096.10  avg = 1016.49
           nanodet_m  min =  299.12  max =  343.99  avg =  303.98
    yolo-fastest-1.1  min =  141.56  max =  142.93  avg =  142.04
      yolo-fastestv2  min =  125.66  max =  168.88  avg =  130.28

root@ls2k:~/ncnn/build/benchmark# ./benchncnn 4 1 0 -1 1
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  295.48  max =  296.42  avg =  295.98
     squeezenet_int8  min =  334.05  max =  336.31  avg =  335.35
           mobilenet  min =  476.33  max =  479.00  avg =  477.41
      mobilenet_int8  min =  446.03  max =  448.21  avg =  446.73
        mobilenet_v2  min =  343.26  max =  343.97  avg =  343.69
        mobilenet_v3  min =  296.84  max =  297.31  avg =  297.11
          shufflenet  min =  202.31  max =  203.96  avg =  202.79
       shufflenet_v2  min =  181.69  max =  182.42  avg =  182.08
             mnasnet  min =  353.73  max =  354.12  avg =  353.99
     proxylessnasnet  min =  404.49  max =  405.00  avg =  404.75
     efficientnet_b0  min =  592.54  max =  593.81  avg =  593.14
   efficientnetv2_b0  min =  649.91  max =  651.49  avg =  650.54
        regnety_400m  min =  425.96  max =  426.33  avg =  426.12
           blazeface  min =   59.74  max =   60.19  avg =   59.90
           googlenet  min = 1120.13  max = 1217.54  avg = 1146.27
      googlenet_int8  min = 1205.17  max = 1213.43  avg = 1208.13
            resnet18  min =  803.07  max =  997.37  avg =  856.09
       resnet18_int8  min =  911.74  max =  916.16  avg =  913.31
             alexnet  min =  883.47  max =  903.08  avg =  889.06
               vgg16  min = 4425.52  max = 4587.36  avg = 4467.61
          vgg16_int8  min = 4896.90  max = 4993.15  avg = 4924.44
            resnet50  min = 2163.22  max = 2169.90  avg = 2167.49
       resnet50_int8  min = 2202.87  max = 2218.00  avg = 2210.51
      squeezenet_ssd  min =  831.06  max =  926.94  avg =  856.24
 squeezenet_ssd_int8  min =  800.52  max =  803.28  avg =  801.72
       mobilenet_ssd  min =  979.74  max =  980.82  avg =  980.22
  mobilenet_ssd_int8  min =  893.79  max =  895.41  avg =  894.51
      mobilenet_yolo  min = 2578.17  max = 2586.30  avg = 2582.55
  mobilenetv2_yolov3  min = 1190.77  max = 1207.67  avg = 1196.06
         yolov4-tiny  min = 1558.29  max = 1570.18  avg = 1561.52
           nanodet_m  min =  442.90  max =  444.27  avg =  443.72
    yolo-fastest-1.1  min =  203.60  max =  208.43  avg =  205.20
      yolo-fastestv2  min =  184.61  max =  185.05  avg =  184.75
```

### Loongson 2K1000LA (LA264 1.0GHz * 2)
```
root@ls2kla:~/ncnn/build/benchmark# ./benchncnn 10 2 0 -1 1
loop_count = 10
num_threads = 2
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  151.11  max =  162.36  avg =  153.30
     squeezenet_int8  min =  195.32  max =  198.63  avg =  196.12
           mobilenet  min =  279.27  max =  283.42  avg =  280.40
      mobilenet_int8  min =  264.78  max =  268.41  avg =  265.76
        mobilenet_v2  min =  204.39  max =  207.69  avg =  205.77
        mobilenet_v3  min =  171.32  max =  187.07  avg =  173.15
          shufflenet  min =  147.43  max =  150.72  avg =  147.89
       shufflenet_v2  min =  169.42  max =  172.58  avg =  170.35
             mnasnet  min =  204.87  max =  208.01  avg =  205.63
     proxylessnasnet  min =  226.79  max =  237.74  avg =  229.02
     efficientnet_b0  min =  302.30  max =  310.91  avg =  303.87
   efficientnetv2_b0  min =  327.65  max =  361.15  avg =  334.45
        regnety_400m  min =  264.08  max =  278.49  avg =  266.35
           blazeface  min =   31.80  max =   39.18  avg =   32.88
           googlenet  min =  562.95  max =  578.42  avg =  566.28
      googlenet_int8  min =  598.16  max =  613.56  avg =  601.68
            resnet18  min =  466.73  max =  472.08  avg =  469.58
       resnet18_int8  min =  489.69  max =  493.74  avg =  491.63
             alexnet  min =  381.35  max =  388.12  avg =  384.78
               vgg16  min = 2321.29  max = 2345.89  avg = 2330.29
          vgg16_int8  min = 2562.86  max = 2568.06  avg = 2565.68
            resnet50  min = 1219.09  max = 1225.67  avg = 1221.36
       resnet50_int8  min = 1263.44  max = 1266.74  avg = 1265.09
      squeezenet_ssd  min =  433.23  max =  441.06  avg =  437.07
 squeezenet_ssd_int8  min =  438.69  max =  443.17  avg =  440.81
       mobilenet_ssd  min =  587.37  max =  598.57  avg =  589.99
  mobilenet_ssd_int8  min =  539.62  max =  552.57  avg =  542.87
      mobilenet_yolo  min = 1485.30  max = 1491.17  avg = 1487.81
  mobilenetv2_yolov3  min =  711.57  max =  722.91  avg =  715.07
         yolov4-tiny  min =  954.76  max =  961.66  avg =  957.28
           nanodet_m  min =  364.22  max =  369.32  avg =  365.94
    yolo-fastest-1.1  min =  154.81  max =  160.45  avg =  156.23
      yolo-fastestv2  min =  157.39  max =  168.82  avg =  159.51
  vision_transformer  min = 18926.46  max = 18980.43  avg = 18951.29
          FastestDet  min =  168.81  max =  176.77  avg =  170.26

root@ls2kla:~/ncnn/build/benchmark# ./benchncnn 4 1 0 -1 1
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  272.76  max =  280.89  avg =  275.29
     squeezenet_int8  min =  352.02  max =  353.25  avg =  352.40
           mobilenet  min =  519.09  max =  519.68  avg =  519.34
      mobilenet_int8  min =  509.85  max =  510.23  avg =  510.04
        mobilenet_v2  min =  352.06  max =  352.74  avg =  352.37
        mobilenet_v3  min =  295.13  max =  295.70  avg =  295.39
          shufflenet  min =  241.58  max =  241.94  avg =  241.73
       shufflenet_v2  min =  282.88  max =  283.39  avg =  283.18
             mnasnet  min =  357.74  max =  358.21  avg =  357.98
     proxylessnasnet  min =  403.26  max =  411.69  avg =  406.02
     efficientnet_b0  min =  546.11  max =  546.88  avg =  546.53
   efficientnetv2_b0  min =  596.83  max =  597.05  avg =  596.93
        regnety_400m  min =  441.94  max =  442.02  avg =  441.98
           blazeface  min =   54.08  max =   54.59  avg =   54.38
           googlenet  min = 1042.19  max = 1048.03  avg = 1044.40
      googlenet_int8  min = 1118.22  max = 1121.18  avg = 1119.79
            resnet18  min =  838.79  max =  839.81  avg =  839.43
       resnet18_int8  min =  939.62  max =  940.72  avg =  940.23
             alexnet  min =  729.36  max =  740.65  avg =  734.19
               vgg16  min = 4326.68  max = 4335.10  avg = 4330.97
          vgg16_int8  min = 4896.71  max = 4909.63  avg = 4905.14
            resnet50  min = 2277.36  max = 2280.34  avg = 2279.14
       resnet50_int8  min = 2399.07  max = 2402.21  avg = 2400.78
      squeezenet_ssd  min =  751.49  max =  753.79  avg =  752.20
 squeezenet_ssd_int8  min =  771.01  max =  774.08  avg =  771.91
       mobilenet_ssd  min = 1063.41  max = 1065.65  avg = 1064.16
  mobilenet_ssd_int8  min = 1031.59  max = 1033.03  avg = 1032.09
      mobilenet_yolo  min = 2585.33  max = 2586.65  avg = 2586.11
  mobilenetv2_yolov3  min = 1246.35  max = 1248.43  avg = 1247.32
         yolov4-tiny  min = 1639.13  max = 1642.47  avg = 1640.87
           nanodet_m  min =  606.40  max =  607.14  avg =  606.86
    yolo-fastest-1.1  min =  242.15  max =  244.64  avg =  243.43
      yolo-fastestv2  min =  246.92  max =  247.84  avg =  247.27
  vision_transformer  min = 36607.51  max = 36870.44  avg = 36724.88
          FastestDet  min =  266.96  max =  268.86  avg =  267.94
```

### Loongson 2K2000 (LA364 1.5GHz * 2 with lsx)
```
loongson@loongson-pc:~/ncnn/build/benchmark$ ./benchncnn 4 2 0 -1 1
loop_count = 4
num_threads = 2
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   58.54  max =   61.57  avg =   60.37
     squeezenet_int8  min =   66.79  max =   72.05  avg =   70.49
           mobilenet  min =  110.46  max =  112.72  avg =  111.84
      mobilenet_int8  min =  117.83  max =  126.51  avg =  123.42
        mobilenet_v2  min =   65.19  max =   70.78  avg =   67.73
        mobilenet_v3  min =   51.30  max =   56.61  avg =   54.52
          shufflenet  min =   32.78  max =   35.11  avg =   33.99
       shufflenet_v2  min =   31.58  max =   32.59  avg =   32.15
             mnasnet  min =   64.18  max =   78.53  avg =   68.72
     proxylessnasnet  min =   73.49  max =   85.30  avg =   77.35
     efficientnet_b0  min =  101.83  max =  106.26  avg =  104.91
   efficientnetv2_b0  min =  126.55  max =  131.95  avg =  127.91
        regnety_400m  min =   88.19  max =   92.58  avg =   89.60
           blazeface  min =    8.57  max =    8.68  avg =    8.63
           googlenet  min =  207.97  max =  214.47  avg =  211.07
      googlenet_int8  min =  237.92  max =  241.06  avg =  239.76
            resnet18  min =  153.42  max =  161.54  avg =  158.21
       resnet18_int8  min =  177.77  max =  183.83  avg =  181.90
             alexnet  min =  145.71  max =  149.41  avg =  147.97
               vgg16  min =  937.03  max =  961.65  avg =  945.20
          vgg16_int8  min =  850.20  max =  869.47  avg =  859.99
            resnet50  min =  497.95  max =  524.29  avg =  511.85
       resnet50_int8  min =  541.22  max =  549.09  avg =  544.30
      squeezenet_ssd  min =  155.11  max =  163.01  avg =  159.72
 squeezenet_ssd_int8  min =  136.11  max =  138.38  avg =  137.36
       mobilenet_ssd  min =  226.97  max =  231.33  avg =  229.20
  mobilenet_ssd_int8  min =  248.61  max =  253.10  avg =  250.83
      mobilenet_yolo  min =  613.25  max =  626.75  avg =  619.83
  mobilenetv2_yolov3  min =  249.50  max =  258.17  avg =  255.75
         yolov4-tiny  min =  312.41  max =  349.24  avg =  328.38
           nanodet_m  min =   81.50  max =   84.20  avg =   83.14
    yolo-fastest-1.1  min =   30.46  max =   30.91  avg =   30.71
      yolo-fastestv2  min =   26.78  max =   28.80  avg =   28.10
  vision_transformer  min = 4483.37  max = 4519.06  avg = 4507.04
          FastestDet  min =   31.15  max =   32.37  avg =   32.06
```

### Loongson 3A3000 (GS464E 1.45GHz * 4)
```
root@3A3K:~/Desktop/ncnn-20221128/build/benchmark$ ./benchncnn 5 4 2 -1 0
loop_count = 5
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 0
          squeezenet  min =   88.82  max =  116.74  avg =   94.92
     squeezenet_int8  min =  140.62  max =  162.48  avg =  146.32
           mobilenet  min =  144.80  max =  244.58  avg =  172.14
      mobilenet_int8  min =  265.21  max =  293.89  avg =  281.80
        mobilenet_v2  min =  109.80  max =  156.74  avg =  120.48
        mobilenet_v3  min =   90.18  max =   93.25  avg =   91.50
          shufflenet  min =   56.64  max =  216.12  avg =  100.68
       shufflenet_v2  min =   45.70  max =  142.00  avg =   65.20
             mnasnet  min =  106.99  max =  229.11  avg =  134.22
     proxylessnasnet  min =  123.68  max =  261.01  avg =  155.97
     efficientnet_b0  min =  160.98  max =  191.14  avg =  171.55
   efficientnetv2_b0  min =  162.75  max =  187.67  avg =  176.19
        regnety_400m  min =  135.06  max =  174.12  avg =  151.30
           blazeface  min =   15.26  max =   43.81  avg =   23.91
           googlenet  min =  327.16  max =  386.02  avg =  350.25
      googlenet_int8  min =  500.45  max =  637.39  avg =  540.62
            resnet18  min =  254.45  max =  421.56  avg =  304.48
       resnet18_int8  min =  385.14  max =  559.01  avg =  439.74
             alexnet  min =  179.19  max =  220.91  avg =  190.63
               vgg16  min = 1563.99  max = 1645.01  avg = 1619.63
          vgg16_int8  min = 1436.00  max = 1530.45  avg = 1473.00
            resnet50  min =  702.35  max =  833.23  avg =  764.14
       resnet50_int8  min = 1099.40  max = 1208.84  avg = 1154.51
      squeezenet_ssd  min =  191.40  max =  270.10  avg =  218.75
 squeezenet_ssd_int8  min =  304.51  max =  387.51  avg =  344.98
       mobilenet_ssd  min =  315.77  max =  417.37  avg =  344.40
  mobilenet_ssd_int8  min =  554.28  max =  656.07  avg =  580.72
      mobilenet_yolo  min =  806.48  max =  851.22  avg =  825.50
  mobilenetv2_yolov3  min =  382.38  max =  503.38  avg =  421.03
         yolov4-tiny  min =  502.87  max =  620.30  avg =  550.08
           nanodet_m  min =  126.00  max =  314.03  avg =  184.93
    yolo-fastest-1.1  min =   64.68  max =  189.47  avg =  110.89
      yolo-fastestv2  min =   69.03  max =  116.31  avg =   82.36
  vision_transformer  min = 14737.56  max = 15012.35  avg = 14890.56
          FastestDet  min =   84.30  max =  139.87  avg =  102.23
```

### Loongson 3A4000 (GS464V 1.8GHz * 4 with MSA128)
```
root@3A4K:~/Desktop/ncnn-20221128/build/benchmark$ ./benchncnn 10 4 2 -1 0
loop_count = 10
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 0
          squeezenet  min =   17.04  max =   39.86  avg =   20.39
     squeezenet_int8  min =   21.77  max =   25.93  avg =   23.02
           mobilenet  min =   26.34  max =   97.11  avg =   38.24
      mobilenet_int8  min =   32.93  max =   33.31  avg =   33.07
        mobilenet_v2  min =   19.40  max =   19.91  avg =   19.63
        mobilenet_v3  min =   16.48  max =   45.31  avg =   19.68
          shufflenet  min =   12.23  max =  116.79  avg =   22.86
       shufflenet_v2  min =   11.14  max =   11.59  avg =   11.37
             mnasnet  min =   18.33  max =   51.66  avg =   24.52
     proxylessnasnet  min =   22.03  max =   22.46  avg =   22.19
     efficientnet_b0  min =   34.94  max =  129.52  avg =   45.76
   efficientnetv2_b0  min =   38.58  max =   67.86  avg =   41.84
        regnety_400m  min =   35.53  max =   38.59  avg =   36.14
           blazeface  min =    4.08  max =    4.34  avg =    4.17
           googlenet  min =   72.60  max =  100.31  avg =   76.25
      googlenet_int8  min =   82.09  max =  107.09  avg =   86.78
            resnet18  min =   53.99  max =  100.21  avg =   63.52
       resnet18_int8  min =   57.20  max =   77.00  avg =   60.47
             alexnet  min =   61.95  max =   80.86  avg =   65.01
               vgg16  min =  329.58  max =  438.99  avg =  360.40
          vgg16_int8  min =  293.27  max =  366.16  avg =  311.23
            resnet50  min =  138.06  max =  260.50  avg =  169.27
       resnet50_int8  min =  154.06  max =  244.31  avg =  173.37
      squeezenet_ssd  min =   60.44  max =   97.92  avg =   65.41
 squeezenet_ssd_int8  min =   55.34  max =  136.72  avg =   68.15
       mobilenet_ssd  min =   57.97  max =  139.16  avg =   69.27
  mobilenet_ssd_int8  min =   66.66  max =   89.91  avg =   71.00
      mobilenet_yolo  min =  169.38  max =  711.10  avg =  242.62
  mobilenetv2_yolov3  min =   75.61  max =   97.83  avg =   80.23
         yolov4-tiny  min =  110.52  max =  143.67  avg =  118.53
           nanodet_m  min =   24.04  max =   92.81  avg =   32.45
    yolo-fastest-1.1  min =   10.97  max =   32.77  avg =   15.05
      yolo-fastestv2  min =   11.54  max =   12.09  avg =   11.84
  vision_transformer  min = 4193.41  max = 4274.03  avg = 4213.64
          FastestDet  min =   12.54  max =   13.01  avg =   12.78
```


### Loongson 3A4000 (GS464V 1.8GHz * 4 with MSA128)

Tested on UOS V20 E1050

```
uos@uos-PC:~/ncnn/benchmark$ ./benchncnn 10 4 2 -1 0
loop_count = 10
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 0
          squeezenet  min =   25.28  max =   38.19  avg =   27.81
     squeezenet_int8  min =   21.61  max =   22.13  avg =   21.85
           mobilenet  min =   44.77  max =   69.54  avg =   55.37
      mobilenet_int8  min =   32.96  max =   44.00  avg =   36.08
        mobilenet_v2  min =   29.21  max =   52.70  avg =   35.47
        mobilenet_v3  min =   24.62  max =   27.32  avg =   25.18
          shufflenet  min =   18.90  max =   49.70  avg =   22.95
       shufflenet_v2  min =   15.87  max =   22.38  avg =   17.67
             mnasnet  min =   29.08  max =   69.37  avg =   35.53
     proxylessnasnet  min =   33.30  max =   94.15  avg =   42.81
     efficientnet_b0  min =   49.34  max =   61.22  avg =   52.01
   efficientnetv2_b0  min =   57.89  max =   72.55  avg =   60.72
        regnety_400m  min =   50.65  max =   74.16  avg =   57.56
           blazeface  min =    4.97  max =    5.33  avg =    5.11
           googlenet  min =  101.45  max =  119.73  avg =  106.85
      googlenet_int8  min =   83.94  max =   99.75  avg =   87.36
            resnet18  min =   81.65  max =   99.76  avg =   85.96
       resnet18_int8  min =   58.60  max =   75.88  avg =   60.62
             alexnet  min =   77.05  max =  208.05  avg =  120.39
               vgg16  min =  427.51  max =  676.57  avg =  531.53
          vgg16_int8  min =  326.59  max =  487.96  avg =  417.74
            resnet50  min =  221.51  max =  580.11  avg =  305.64
       resnet50_int8  min =  158.00  max =  190.71  avg =  167.50
      squeezenet_ssd  min =   98.87  max =  135.55  avg =  115.54
 squeezenet_ssd_int8  min =   66.33  max =  361.40  avg =  148.19
       mobilenet_ssd  min =   94.12  max =  340.16  avg =  184.85
  mobilenet_ssd_int8  min =   88.26  max =  150.47  avg =  112.35
      mobilenet_yolo  min =  252.07  max =  510.61  avg =  327.21
  mobilenetv2_yolov3  min =  115.31  max =  183.63  avg =  147.28
         yolov4-tiny  min =  153.92  max =  259.18  avg =  196.70
           nanodet_m  min =   34.95  max =   66.15  avg =   46.41
    yolo-fastest-1.1  min =   15.34  max =   15.94  avg =   15.62
      yolo-fastestv2  min =   15.53  max =   16.06  avg =   15.80
  vision_transformer  min = 4200.48  max = 5853.43  avg = 4555.42
          FastestDet  min =   16.73  max =   18.72  avg =   17.08


uos@uos-PC:~/ncnn/benchmark$ ./benchncnn 10 4 1 -1 0
loop_count = 10
num_threads = 4
powersave = 1
gpu_device = -1
cooling_down = 0
          squeezenet  min =   25.93  max =   47.61  avg =   28.45
     squeezenet_int8  min =   21.84  max =   27.09  avg =   22.84
           mobilenet  min =   44.61  max =   83.44  avg =   52.52
      mobilenet_int8  min =   32.91  max =   45.99  avg =   34.52
        mobilenet_v2  min =   29.44  max =   37.14  avg =   30.43
        mobilenet_v3  min =   24.54  max =   42.68  avg =   27.25
          shufflenet  min =   17.16  max =   42.10  avg =   20.08
       shufflenet_v2  min =   15.99  max =   16.43  avg =   16.29
             mnasnet  min =   29.14  max =   43.37  avg =   30.79
     proxylessnasnet  min =   33.15  max =   34.12  avg =   33.52
     efficientnet_b0  min =   49.35  max =   87.75  avg =   54.03
   efficientnetv2_b0  min =   57.69  max =   84.67  avg =   64.12
        regnety_400m  min =   50.55  max =   75.35  avg =   55.31
           blazeface  min =    5.01  max =    5.16  avg =    5.05
           googlenet  min =  101.51  max =  116.33  avg =  105.38
      googlenet_int8  min =   84.34  max =  102.58  avg =   89.89
            resnet18  min =   80.58  max =   94.47  avg =   86.27
       resnet18_int8  min =   59.00  max =   76.66  avg =   62.15
             alexnet  min =   91.72  max =  117.98  avg =  102.20
               vgg16  min =  435.57  max =  453.90  avg =  441.39
          vgg16_int8  min =  308.39  max =  332.69  avg =  321.09
            resnet50  min =  219.93  max =  249.30  avg =  231.93
       resnet50_int8  min =  156.78  max =  179.34  avg =  163.43
      squeezenet_ssd  min =  109.48  max =  153.84  avg =  123.75
 squeezenet_ssd_int8  min =   74.33  max =  117.03  avg =   93.81
       mobilenet_ssd  min =   94.91  max =  161.38  avg =  127.78
  mobilenet_ssd_int8  min =   82.35  max =  112.79  avg =   91.86
      mobilenet_yolo  min =  252.05  max =  285.16  avg =  266.33
  mobilenetv2_yolov3  min =  113.98  max =  173.83  avg =  139.60
         yolov4-tiny  min =  150.06  max =  210.96  avg =  164.94
           nanodet_m  min =   34.62  max =   67.81  avg =   48.43
    yolo-fastest-1.1  min =   15.78  max =   16.09  avg =   15.93
      yolo-fastestv2  min =   15.54  max =   32.82  avg =   17.62
  vision_transformer  min = 4202.89  max = 5573.15  avg = 4426.38
          FastestDet  min =   16.39  max =   17.06  avg =   16.75


uos@uos-PC:~/ncnn/benchmark$ ./benchncnn 10 4 0 -1 0
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   25.98  max =   36.75  avg =   28.86
     squeezenet_int8  min =   22.04  max =   30.86  avg =   23.28
           mobilenet  min =   44.82  max =   60.73  avg =   46.72
      mobilenet_int8  min =   33.00  max =   48.45  avg =   34.70
        mobilenet_v2  min =   29.53  max =   56.78  avg =   33.98
        mobilenet_v3  min =   24.69  max =   45.60  avg =   28.13
          shufflenet  min =   17.25  max =   24.72  avg =   18.18
       shufflenet_v2  min =   16.00  max =   31.27  avg =   17.62
             mnasnet  min =   28.95  max =   44.73  avg =   32.58
     proxylessnasnet  min =   32.99  max =   45.42  avg =   34.66
     efficientnet_b0  min =   49.71  max =   53.47  avg =   50.25
   efficientnetv2_b0  min =   57.51  max =   78.56  avg =   61.47
        regnety_400m  min =   50.18  max =   71.85  avg =   54.77
           blazeface  min =    4.98  max =    9.36  avg =    5.48
           googlenet  min =  101.25  max =  121.71  avg =  105.71
      googlenet_int8  min =   82.97  max =  111.81  avg =   89.49
            resnet18  min =   75.66  max =   87.19  avg =   78.72
       resnet18_int8  min =   58.92  max =  108.67  avg =   76.70
             alexnet  min =   79.12  max =  144.22  avg =  101.91
               vgg16  min =  430.14  max =  460.46  avg =  444.56
          vgg16_int8  min =  308.08  max =  350.15  avg =  324.86
            resnet50  min =  219.60  max =  258.59  avg =  237.46
       resnet50_int8  min =  156.54  max =  180.28  avg =  163.11
      squeezenet_ssd  min =   77.71  max =  137.36  avg =  119.68
 squeezenet_ssd_int8  min =   78.88  max =  113.64  avg =   95.83
       mobilenet_ssd  min =   94.82  max =  156.99  avg =  119.67
  mobilenet_ssd_int8  min =   77.17  max =   98.29  avg =   86.90
      mobilenet_yolo  min =  252.29  max =  295.62  avg =  265.58
  mobilenetv2_yolov3  min =  114.28  max =  159.82  avg =  140.03
         yolov4-tiny  min =  150.99  max =  203.07  avg =  165.18
           nanodet_m  min =   34.48  max =   71.56  avg =   49.84
    yolo-fastest-1.1  min =   15.36  max =   30.00  avg =   17.11
      yolo-fastestv2  min =   15.42  max =   26.96  avg =   16.78
  vision_transformer  min = 4187.60  max = 4319.84  avg = 4220.05
          FastestDet  min =   16.30  max =   24.88  avg =   17.49

```


### Loongson 3A5000 (LA464 2.5GHz * 4)
```
root@3A5K:~/Desktop/ncnn-20230223/build/benchmark$ ./benchncnn 10 4 2 -1 0
loop_count = 10
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 0
          squeezenet  min =   11.97  max =   19.38  avg =   13.61
     squeezenet_int8  min =   14.96  max =   15.36  avg =   15.12
           mobilenet  min =   20.14  max =   27.50  avg =   21.12
      mobilenet_int8  min =   25.28  max =   35.06  avg =   27.37
        mobilenet_v2  min =   12.82  max =   13.20  avg =   12.98
        mobilenet_v3  min =   11.39  max =   25.03  avg =   12.86
          shufflenet  min =    7.35  max =    7.50  avg =    7.40
       shufflenet_v2  min =    7.12  max =    7.23  avg =    7.18
             mnasnet  min =   12.85  max =   21.69  avg =   13.83
     proxylessnasnet  min =   15.35  max =   15.79  avg =   15.43
     efficientnet_b0  min =   24.20  max =   24.46  avg =   24.30
   efficientnetv2_b0  min =   26.80  max =   42.43  avg =   29.25
        regnety_400m  min =   22.85  max =   38.30  avg =   24.51
           blazeface  min =    2.57  max =    2.67  avg =    2.60
           googlenet  min =   49.09  max =   85.91  avg =   67.57
      googlenet_int8  min =   64.89  max =   95.28  avg =   76.41
            resnet18  min =   42.43  max =   62.39  avg =   52.38
       resnet18_int8  min =   47.96  max =   68.69  avg =   56.75
             alexnet  min =   46.01  max =   59.26  avg =   49.20
               vgg16  min =  246.82  max =  261.80  avg =  252.81
          vgg16_int8  min =  247.13  max =  256.81  avg =  252.37
            resnet50  min =  102.17  max =  138.16  avg =  117.65
       resnet50_int8  min =  115.09  max =  151.30  avg =  129.13
      squeezenet_ssd  min =   43.62  max =   70.64  avg =   53.89
 squeezenet_ssd_int8  min =   38.66  max =   60.12  avg =   47.66
       mobilenet_ssd  min =   42.67  max =   68.78  avg =   53.95
  mobilenet_ssd_int8  min =   56.29  max =   68.31  avg =   59.86
      mobilenet_yolo  min =  129.04  max =  188.26  avg =  149.64
  mobilenetv2_yolov3  min =   61.80  max =   71.41  avg =   66.43
         yolov4-tiny  min =   88.64  max =  108.17  avg =   95.48
           nanodet_m  min =   16.24  max =   16.57  avg =   16.34
    yolo-fastest-1.1  min =    6.98  max =    7.16  avg =    7.05
      yolo-fastestv2  min =    6.95  max =    7.29  avg =    7.08
  vision_transformer  min = 2910.63  max = 3109.29  avg = 2949.04
          FastestDet  min =    7.66  max =    7.90  avg =    7.80
```

### Loongson 3A6000 (LA664 2.5GHz * 4+4)

```
~/ncnn/build/benchmark$ ./benchncnn 10 8 2 -1 0
loop_count = 10
num_threads = 8
powersave = 2
gpu_device = -1
cooling_down = 0
          squeezenet  min =    7.12  max =    7.20  avg =    7.16
     squeezenet_int8  min =    8.93  max =    9.20  avg =    8.98
           mobilenet  min =   11.81  max =   11.88  avg =   11.84
      mobilenet_int8  min =   14.25  max =   14.33  avg =   14.28
        mobilenet_v2  min =    8.06  max =    8.16  avg =    8.08
        mobilenet_v3  min =    6.84  max =    6.90  avg =    6.87
          shufflenet  min =    5.38  max =    5.44  avg =    5.39
       shufflenet_v2  min =    5.20  max =    5.22  avg =    5.20
             mnasnet  min =    8.06  max =    8.10  avg =    8.07
     proxylessnasnet  min =    8.94  max =    9.09  avg =    8.99
     efficientnet_b0  min =   13.43  max =   13.65  avg =   13.48
   efficientnetv2_b0  min =   16.06  max =   16.18  avg =   16.11
        regnety_400m  min =   18.11  max =   18.18  avg =   18.14
           blazeface  min =    1.59  max =    1.61  avg =    1.60
           googlenet  min =   26.08  max =   26.24  avg =   26.17
      googlenet_int8  min =   31.25  max =   31.42  avg =   31.34
            resnet18  min =   19.65  max =   19.73  avg =   19.69
       resnet18_int8  min =   25.55  max =   25.66  avg =   25.60
             alexnet  min =   19.56  max =   19.81  avg =   19.67
               vgg16  min =  115.32  max =  116.38  avg =  115.99
          vgg16_int8  min =  135.94  max =  136.73  avg =  136.34
            resnet50  min =   56.46  max =   56.96  avg =   56.81
       resnet50_int8  min =   66.13  max =   66.40  avg =   66.27
      squeezenet_ssd  min =   22.84  max =   22.99  avg =   22.89
 squeezenet_ssd_int8  min =   22.34  max =   22.76  avg =   22.54
       mobilenet_ssd  min =   24.67  max =   24.75  avg =   24.71
  mobilenet_ssd_int8  min =   29.32  max =   29.37  avg =   29.34
      mobilenet_yolo  min =   82.82  max =   84.02  avg =   83.40
  mobilenetv2_yolov3  min =   30.31  max =   30.45  avg =   30.38
         yolov4-tiny  min =   42.49  max =   42.74  avg =   42.62
           nanodet_m  min =   11.00  max =   11.08  avg =   11.02
    yolo-fastest-1.1  min =    5.28  max =    5.40  avg =    5.31
      yolo-fastestv2  min =    5.09  max =    5.10  avg =    5.10
  vision_transformer  min =  869.40  max =  898.18  avg =  874.07
          FastestDet  min =    5.28  max =    5.37  avg =    5.31
```

### Phytium FT-2000/4 (FTC663 armv8 2.2GHz x 4)

Tested on Kylin OS V10

```
mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 1 0 -1 0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   40.92  max =   43.43  avg =   41.34
     squeezenet_int8  min =   35.48  max =   36.07  avg =   35.75
           mobilenet  min =   72.23  max =   72.53  avg =   72.39
      mobilenet_int8  min =   48.10  max =   48.59  avg =   48.31
        mobilenet_v2  min =   47.94  max =   48.45  avg =   48.13
        mobilenet_v3  min =   37.95  max =   39.59  avg =   38.41
          shufflenet  min =   21.51  max =   21.84  avg =   21.64
       shufflenet_v2  min =   21.10  max =   21.45  avg =   21.26
             mnasnet  min =   44.53  max =   45.15  avg =   44.74
     proxylessnasnet  min =   53.02  max =   53.62  avg =   53.21
     efficientnet_b0  min =   79.81  max =   80.51  avg =   80.15
   efficientnetv2_b0  min =   92.55  max =  103.10  avg =   97.53
        regnety_400m  min =   58.52  max =   70.04  avg =   64.20
           blazeface  min =    6.06  max =    9.85  avg =    6.88
           googlenet  min =  146.49  max =  162.69  avg =  152.98
      googlenet_int8  min =  127.38  max =  132.11  avg =  128.51
            resnet18  min =  107.79  max =  108.83  avg =  108.37
       resnet18_int8  min =   97.28  max =   99.03  avg =   97.73
             alexnet  min =   89.95  max =   91.63  avg =   90.28
               vgg16  min =  642.27  max =  647.16  avg =  644.09
          vgg16_int8  min =  567.03  max =  574.11  avg =  568.74
            resnet50  min =  329.12  max =  331.79  avg =  330.10
       resnet50_int8  min =  252.48  max =  253.65  avg =  252.93
      squeezenet_ssd  min =   96.46  max =   96.95  avg =   96.69
 squeezenet_ssd_int8  min =   92.35  max =   93.24  avg =   92.72
       mobilenet_ssd  min =  149.14  max =  150.56  avg =  149.40
  mobilenet_ssd_int8  min =   97.56  max =   98.03  avg =   97.82
      mobilenet_yolo  min =  339.71  max =  340.60  avg =  339.89
  mobilenetv2_yolov3  min =  174.53  max =  175.80  avg =  175.01
         yolov4-tiny  min =  213.72  max =  214.94  avg =  214.08
           nanodet_m  min =   49.95  max =   50.47  avg =   50.19
    yolo-fastest-1.1  min =   23.80  max =   24.42  avg =   23.91
      yolo-fastestv2  min =   19.78  max =   19.95  avg =   19.84
  vision_transformer  min = 3927.51  max = 4025.76  avg = 3947.06
          FastestDet  min =   21.78  max =   22.17  avg =   21.88

mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 4 1 -1 0
loop_count = 10
num_threads = 4
powersave = 1
gpu_device = -1
cooling_down = 0
          squeezenet  min =   70.80  max =   76.55  avg =   72.49
     squeezenet_int8  min =  110.36  max =  133.06  avg =  114.23
           mobilenet  min =   77.97  max =   85.73  avg =   79.98
      mobilenet_int8  min =   80.05  max =   84.09  avg =   81.76
        mobilenet_v2  min =  101.07  max =  192.92  avg =  139.32
        mobilenet_v3  min =  108.60  max =  129.37  avg =  113.80
          shufflenet  min =  160.96  max =  188.96  avg =  168.62
       shufflenet_v2  min =   96.20  max =  190.31  avg =  119.77
             mnasnet  min =   97.34  max =  104.00  avg =   99.85
     proxylessnasnet  min =  112.58  max =  276.49  avg =  145.74
     efficientnet_b0  min =  171.01  max =  238.15  avg =  195.53
   efficientnetv2_b0  min =  235.31  max =  299.00  avg =  254.12
        regnety_400m  min = 1059.87  max = 1173.49  avg = 1084.13
           blazeface  min =   58.69  max =   64.83  avg =   60.83
           googlenet  min =  190.47  max =  257.76  avg =  207.71
      googlenet_int8  min =  285.67  max =  327.20  avg =  300.87
            resnet18  min =  111.87  max =  118.36  avg =  114.48
       resnet18_int8  min =  143.08  max =  147.98  avg =  144.93
             alexnet  min =   72.83  max =   76.52  avg =   74.01
               vgg16  min =  390.35  max =  406.58  avg =  397.19
          vgg16_int8  min =  358.54  max =  369.89  avg =  364.31
            resnet50  min =  275.57  max =  300.14  avg =  283.21
       resnet50_int8  min =  315.18  max =  371.22  avg =  328.43
      squeezenet_ssd  min =  170.14  max =  200.18  avg =  175.23
 squeezenet_ssd_int8  min =  259.01  max =  271.23  avg =  263.35
       mobilenet_ssd  min =  166.85  max =  170.64  avg =  168.74
  mobilenet_ssd_int8  min =  191.71  max =  195.91  avg =  193.44
      mobilenet_yolo  min =  960.70  max = 1080.81  avg =  983.68
  mobilenetv2_yolov3  min =  187.72  max =  207.92  avg =  192.60
         yolov4-tiny  min =  172.72  max =  177.62  avg =  174.63
           nanodet_m  min =  128.79  max =  137.31  avg =  131.04
    yolo-fastest-1.1  min =  132.39  max =  148.06  avg =  137.90
      yolo-fastestv2  min =  130.97  max =  137.73  avg =  133.53
  vision_transformer  min = 2229.10  max = 2392.59  avg = 2304.21
          FastestDet  min =  119.98  max =  126.26  avg =  122.40

mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 4 2 -1 0
loop_count = 10
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 0
          squeezenet  min =   70.93  max =   75.55  avg =   72.93
     squeezenet_int8  min =  109.65  max =  153.48  avg =  124.20
           mobilenet  min =   78.02  max =   85.80  avg =   81.97
      mobilenet_int8  min =   80.34  max =   89.31  avg =   83.20
        mobilenet_v2  min =   99.51  max =  110.36  avg =  102.54
        mobilenet_v3  min =  109.04  max =  116.28  avg =  111.75
          shufflenet  min =  160.04  max =  166.21  avg =  163.59
       shufflenet_v2  min =   88.90  max =   91.82  avg =   90.24
             mnasnet  min =   97.02  max =  103.09  avg =   98.70
     proxylessnasnet  min =  111.21  max =  117.47  avg =  113.97
     efficientnet_b0  min =  167.99  max =  175.35  avg =  171.26
   efficientnetv2_b0  min =  228.59  max =  245.97  avg =  232.79
        regnety_400m  min = 1049.34  max = 1085.18  avg = 1064.68
           blazeface  min =   59.35  max =   64.91  avg =   60.35
           googlenet  min =  187.87  max =  195.29  avg =  190.56
      googlenet_int8  min =  283.22  max =  301.69  avg =  287.66
            resnet18  min =  111.48  max =  116.76  avg =  112.88
       resnet18_int8  min =  142.41  max =  148.79  avg =  145.14
             alexnet  min =   72.59  max =   75.37  avg =   73.62
               vgg16  min =  389.61  max =  452.95  avg =  424.36
          vgg16_int8  min =  365.57  max =  465.13  avg =  422.84
            resnet50  min =  283.07  max =  411.14  avg =  332.88
       resnet50_int8  min =  323.21  max =  381.13  avg =  340.59
      squeezenet_ssd  min =  178.21  max =  252.82  avg =  211.62
 squeezenet_ssd_int8  min =  263.82  max =  372.38  avg =  284.38
       mobilenet_ssd  min =  166.29  max =  281.36  avg =  195.16
  mobilenet_ssd_int8  min =  194.00  max =  220.95  avg =  204.07
      mobilenet_yolo  min =  964.99  max = 1027.13  avg =  989.45
  mobilenetv2_yolov3  min =  218.58  max =  512.86  avg =  265.12
         yolov4-tiny  min =  172.20  max =  177.27  avg =  174.14
           nanodet_m  min =  128.78  max =  222.66  avg =  150.88
    yolo-fastest-1.1  min =  132.52  max =  196.41  avg =  149.03
      yolo-fastestv2  min =  131.39  max =  138.72  avg =  134.96
  vision_transformer  min = 2243.31  max = 2659.56  avg = 2395.76
          FastestDet  min =  119.44  max =  126.07  avg =  122.27

```


### Phytium FT-2000+/64 (FTC662 armv8 2.4GHz x 8)
```
[root@bogon benchmark]# ./benchncnn 10 1 0 -1 0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   57.60  max =   59.78  avg =   58.51
     squeezenet_int8  min =   47.05  max =   47.89  avg =   47.40
           mobilenet  min =   91.08  max =   95.16  avg =   91.89
      mobilenet_int8  min =   60.27  max =   61.17  avg =   60.74
        mobilenet_v2  min =   63.38  max =   68.12  avg =   66.96
        mobilenet_v3  min =   53.34  max =   54.71  avg =   54.01
          shufflenet  min =   37.87  max =   41.78  avg =   39.37
       shufflenet_v2  min =   35.89  max =   37.30  avg =   36.40
             mnasnet  min =   59.57  max =   63.23  avg =   60.25
     proxylessnasnet  min =   71.24  max =   71.93  avg =   71.51
     efficientnet_b0  min =  134.34  max =  141.14  avg =  137.74
   efficientnetv2_b0  min =  143.82  max =  145.63  avg =  144.36
        regnety_400m  min =   76.96  max =   77.66  avg =   77.27
           blazeface  min =   11.57  max =   11.90  avg =   11.70
           googlenet  min =  188.10  max =  191.27  avg =  189.02
      googlenet_int8  min =  167.54  max =  169.63  avg =  168.38
            resnet18  min =  144.76  max =  163.39  avg =  154.95
       resnet18_int8  min =  124.14  max =  129.84  avg =  127.83
             alexnet  min =  198.22  max =  208.86  avg =  205.35
               vgg16  min =  848.10  max =  891.00  avg =  859.94
          vgg16_int8  min =  686.54  max =  742.77  avg =  704.74
            resnet50  min =  413.45  max =  428.84  avg =  417.81
       resnet50_int8  min =  306.32  max =  324.27  avg =  316.47
      squeezenet_ssd  min =  147.62  max =  149.58  avg =  148.48
 squeezenet_ssd_int8  min =  116.18  max =  134.86  avg =  126.93
       mobilenet_ssd  min =  188.49  max =  191.97  avg =  189.48
  mobilenet_ssd_int8  min =  120.28  max =  121.36  avg =  120.83
      mobilenet_yolo  min =  421.79  max =  425.68  avg =  423.51
  mobilenetv2_yolov3  min =  222.86  max =  225.58  avg =  224.01
         yolov4-tiny  min =  303.77  max =  310.70  avg =  307.45
           nanodet_m  min =   80.87  max =   82.11  avg =   81.35

[root@bogon benchmark]# ./benchncnn 10 8 0 -1 0
loop_count = 10
num_threads = 8
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   14.53  max =   14.92  avg =   14.68
     squeezenet_int8  min =   11.67  max =   11.89  avg =   11.82
           mobilenet  min =   17.60  max =   20.05  avg =   18.34
      mobilenet_int8  min =    9.94  max =   10.22  avg =   10.08
        mobilenet_v2  min =   18.46  max =   19.18  avg =   18.81
        mobilenet_v3  min =   16.30  max =   16.71  avg =   16.45
          shufflenet  min =   14.65  max =   14.93  avg =   14.78
       shufflenet_v2  min =   11.23  max =   11.56  avg =   11.35
             mnasnet  min =   15.65  max =   16.08  avg =   15.92
     proxylessnasnet  min =   18.78  max =   21.72  avg =   19.68
     efficientnet_b0  min =   29.16  max =   29.62  avg =   29.37
   efficientnetv2_b0  min =   33.28  max =   35.48  avg =   34.23
        regnety_400m  min =   44.90  max =   47.36  avg =   46.32
           blazeface  min =    4.23  max =    4.43  avg =    4.30
           googlenet  min =   42.11  max =   42.98  avg =   42.38
      googlenet_int8  min =   33.24  max =   38.21  avg =   34.10
            resnet18  min =   33.27  max =   34.00  avg =   33.57
       resnet18_int8  min =   23.66  max =   24.78  avg =   24.24
             alexnet  min =   35.78  max =   37.68  avg =   36.46
               vgg16  min =  219.60  max =  235.79  avg =  222.11
          vgg16_int8  min =  128.64  max =  135.19  avg =  130.73
            resnet50  min =   84.15  max =   85.48  avg =   84.66
       resnet50_int8  min =   58.87  max =   61.98  avg =   59.85
      squeezenet_ssd  min =   47.60  max =   50.24  avg =   48.54
 squeezenet_ssd_int8  min =   36.42  max =   37.89  avg =   36.99
       mobilenet_ssd  min =   39.37  max =   42.63  avg =   41.06
  mobilenet_ssd_int8  min =   21.59  max =   22.05  avg =   21.83
      mobilenet_yolo  min =   83.16  max =   88.75  avg =   85.29
  mobilenetv2_yolov3  min =   58.13  max =   59.50  avg =   58.62
         yolov4-tiny  min =   74.18  max =   76.56  avg =   75.13
           nanodet_m  min =   25.16  max =   31.45  avg =   26.71

root@FT2K:~/Desktop/ncnn-20221128/build/benchmark$ ./benchncnn 10 4 2 -1 0
loop_count = 10
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 0
          squeezenet  min =   14.19  max =   21.46  avg =   15.16
     squeezenet_int8  min =   11.63  max =   12.08  avg =   11.91
           mobilenet  min =   20.52  max =   37.00  avg =   23.66
      mobilenet_int8  min =   13.38  max =   25.95  avg =   15.01
        mobilenet_v2  min =   15.80  max =   16.59  avg =   16.12
        mobilenet_v3  min =   13.38  max =   17.62  avg =   14.21
          shufflenet  min =   10.62  max =   11.10  avg =   10.85
       shufflenet_v2  min =    9.09  max =   12.30  avg =    9.66
             mnasnet  min =   14.85  max =   15.67  avg =   15.14
     proxylessnasnet  min =   16.83  max =   17.10  avg =   16.98
     efficientnet_b0  min =   24.59  max =   26.40  avg =   25.06
   efficientnetv2_b0  min =   30.25  max =   34.46  avg =   31.42
        regnety_400m  min =   32.37  max =   41.10  avg =   35.17
           blazeface  min =    3.00  max =    3.56  avg =    3.18
           googlenet  min =   49.52  max =   64.98  avg =   56.29
      googlenet_int8  min =   38.65  max =   52.51  avg =   43.90
            resnet18  min =   42.81  max =   53.94  avg =   45.38
       resnet18_int8  min =   32.53  max =   53.62  avg =   37.26
             alexnet  min =   33.92  max =   47.88  avg =   37.12
               vgg16  min =  214.19  max =  228.96  avg =  220.16
          vgg16_int8  min =  164.22  max =  224.51  avg =  180.15
            resnet50  min =  106.90  max =  189.61  avg =  133.34
       resnet50_int8  min =   79.62  max =   94.41  avg =   83.56
      squeezenet_ssd  min =   48.00  max =   49.11  avg =   48.43
 squeezenet_ssd_int8  min =   33.59  max =   47.60  avg =   37.57
       mobilenet_ssd  min =   43.97  max =   58.84  avg =   49.64
  mobilenet_ssd_int8  min =   27.94  max =   32.89  avg =   29.56
      mobilenet_yolo  min =  107.29  max =  118.80  avg =  114.24
  mobilenetv2_yolov3  min =   63.44  max =  106.75  avg =   70.69
         yolov4-tiny  min =   89.93  max =  155.39  avg =  101.90
           nanodet_m  min =   20.34  max =   28.67  avg =   21.44
    yolo-fastest-1.1  min =   11.74  max =   12.24  avg =   11.96
      yolo-fastestv2  min =    9.81  max =    9.98  avg =    9.91
  vision_transformer  min = 1617.60  max = 1634.13  avg = 1625.87
          FastestDet  min =   10.19  max =   10.55  avg =   10.36
```

### HUAWEI KunPeng 920 2251K (x8 cores)
Test on UOS 1050
```
mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 1 0 -1 0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   12.11  max =   12.40  avg =   12.25
     squeezenet_int8  min =   14.24  max =   14.50  avg =   14.36
           mobilenet  min =   20.52  max =   21.11  avg =   20.63
      mobilenet_int8  min =   18.29  max =   18.63  avg =   18.45
        mobilenet_v2  min =   13.73  max =   13.90  avg =   13.79
        mobilenet_v3  min =   11.37  max =   11.49  avg =   11.41
          shufflenet  min =    7.90  max =    7.96  avg =    7.92
       shufflenet_v2  min =    8.09  max =    8.13  avg =    8.11
             mnasnet  min =   13.26  max =   13.44  avg =   13.30
     proxylessnasnet  min =   16.19  max =   16.39  avg =   16.26
     efficientnet_b0  min =   34.92  max =   35.22  avg =   35.04
   efficientnetv2_b0  min =   43.82  max =   44.39  avg =   43.94
        regnety_400m  min =   17.55  max =   18.02  avg =   17.65
           blazeface  min =    3.05  max =    3.08  avg =    3.07
           googlenet  min =   58.65  max =   59.26  avg =   58.89
      googlenet_int8  min =   60.55  max =   63.00  avg =   61.96
            resnet18  min =   34.27  max =   35.43  avg =   34.84
       resnet18_int8  min =   60.79  max =   62.15  avg =   61.47
             alexnet  min =   42.01  max =   44.43  avg =   43.36
               vgg16  min =  174.46  max =  177.33  avg =  175.57
          vgg16_int8  min =  453.93  max =  457.03  avg =  454.79
            resnet50  min =   95.36  max =   96.27  avg =   95.55
       resnet50_int8  min =  119.77  max =  121.26  avg =  120.46
      squeezenet_ssd  min =   39.05  max =   39.69  avg =   39.20
 squeezenet_ssd_int8  min =   55.06  max =   56.23  avg =   55.72
       mobilenet_ssd  min =   45.20  max =   45.96  avg =   45.49
  mobilenet_ssd_int8  min =   39.40  max =   40.13  avg =   39.76
      mobilenet_yolo  min =   98.86  max =   99.85  avg =   99.34
  mobilenetv2_yolov3  min =   51.17  max =   52.89  avg =   51.89
         yolov4-tiny  min =   66.43  max =   67.23  avg =   66.70
           nanodet_m  min =   20.59  max =   20.79  avg =   20.71
    yolo-fastest-1.1  min =    7.90  max =    7.99  avg =    7.93
      yolo-fastestv2  min =    7.45  max =    7.49  avg =    7.47
  vision_transformer  min = 1586.33  max = 1595.34  avg = 1589.76
          FastestDet  min =    7.45  max =    7.52  avg =    7.47

mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 8 0 -1 0
loop_count = 10
num_threads = 8
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    2.93  max =    3.10  avg =    3.00
     squeezenet_int8  min =    3.47  max =    3.56  avg =    3.52
           mobilenet  min =    3.89  max =    4.04  avg =    3.94
      mobilenet_int8  min =    3.29  max =    3.39  avg =    3.33
        mobilenet_v2  min =    3.95  max =    4.08  avg =    3.98
        mobilenet_v3  min =    3.45  max =    3.59  avg =    3.49
          shufflenet  min =    3.42  max =    4.66  avg =    3.62
       shufflenet_v2  min =    2.60  max =    2.94  avg =    2.68
             mnasnet  min =    3.46  max =    3.57  avg =    3.52
     proxylessnasnet  min =    3.94  max =   12.34  avg =    4.88
     efficientnet_b0  min =    7.31  max =    7.60  avg =    7.38
   efficientnetv2_b0  min =    9.01  max =    9.22  avg =    9.08
        regnety_400m  min =    8.56  max =    9.36  avg =    8.70
           blazeface  min =    1.36  max =    3.52  avg =    1.60
           googlenet  min =   11.80  max =   12.02  avg =   11.93
      googlenet_int8  min =   11.87  max =   23.09  avg =   13.16
            resnet18  min =    7.27  max =    7.64  avg =    7.38
       resnet18_int8  min =   11.02  max =   11.73  avg =   11.20
             alexnet  min =    9.05  max =    9.35  avg =    9.17
               vgg16  min =   44.13  max =   50.84  avg =   46.89
          vgg16_int8  min =   75.15  max =   80.73  avg =   77.52
            resnet50  min =   18.72  max =   27.49  avg =   19.96
       resnet50_int8  min =   22.72  max =   36.80  avg =   26.78
      squeezenet_ssd  min =   13.96  max =   27.42  avg =   15.62
 squeezenet_ssd_int8  min =   15.01  max =   29.53  avg =   19.51
       mobilenet_ssd  min =    9.37  max =   13.34  avg =   10.44
  mobilenet_ssd_int8  min =    8.07  max =   24.28  avg =    9.83
      mobilenet_yolo  min =   22.06  max =   24.89  avg =   22.91
  mobilenetv2_yolov3  min =   14.41  max =   15.97  avg =   14.78
         yolov4-tiny  min =   20.71  max =   23.96  avg =   21.42
           nanodet_m  min =    6.37  max =    6.59  avg =    6.45
    yolo-fastest-1.1  min =    4.27  max =    4.52  avg =    4.34
      yolo-fastestv2  min =    3.53  max =    3.63  avg =    3.58
  vision_transformer  min =  435.60  max =  523.43  avg =  479.70
          FastestDet  min =    3.54  max =    7.95  avg =    5.24

mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 4 2 -1 0
loop_count = 10
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 0
          squeezenet  min =    4.04  max =    4.22  avg =    4.09
     squeezenet_int8  min =    4.64  max =    4.76  avg =    4.69
           mobilenet  min =    6.04  max =    6.06  avg =    6.05
      mobilenet_int8  min =    5.23  max =    5.32  avg =    5.25
        mobilenet_v2  min =    5.00  max =    5.03  avg =    5.01
        mobilenet_v3  min =    4.49  max =    4.69  avg =    4.52
          shufflenet  min =    3.90  max =    3.94  avg =    3.91
       shufflenet_v2  min =    3.27  max =    3.48  avg =    3.33
             mnasnet  min =    4.80  max =    4.83  avg =    4.82
     proxylessnasnet  min =    5.20  max =    5.28  avg =    5.23
     efficientnet_b0  min =   10.53  max =   11.06  avg =   10.68
   efficientnetv2_b0  min =   13.18  max =   13.37  avg =   13.25
        regnety_400m  min =    9.20  max =    9.25  avg =    9.22
           blazeface  min =    1.43  max =    1.45  avg =    1.44
           googlenet  min =   17.63  max =   17.78  avg =   17.71
      googlenet_int8  min =   17.63  max =   18.03  avg =   17.85
            resnet18  min =   10.34  max =   10.59  avg =   10.40
       resnet18_int8  min =   17.93  max =   18.84  avg =   18.25
             alexnet  min =   13.28  max =   13.37  avg =   13.31
               vgg16  min =   55.41  max =   56.60  avg =   55.70
          vgg16_int8  min =  123.71  max =  125.34  avg =  124.48
            resnet50  min =   27.82  max =   28.22  avg =   27.95
       resnet50_int8  min =   34.50  max =   34.89  avg =   34.70
      squeezenet_ssd  min =   14.67  max =   15.19  avg =   14.85
 squeezenet_ssd_int8  min =   19.76  max =   20.32  avg =   19.87
       mobilenet_ssd  min =   13.15  max =   13.38  avg =   13.21
  mobilenet_ssd_int8  min =   11.52  max =   11.70  avg =   11.60
      mobilenet_yolo  min =   30.95  max =   31.28  avg =   31.05
  mobilenetv2_yolov3  min =   20.04  max =   20.36  avg =   20.16
         yolov4-tiny  min =   25.61  max =   26.73  avg =   25.80
           nanodet_m  min =    7.93  max =    7.97  avg =    7.95
    yolo-fastest-1.1  min =    4.52  max =    4.59  avg =    4.53
      yolo-fastestv2  min =    3.74  max =    3.88  avg =    3.77
  vision_transformer  min =  546.94  max =  726.81  avg =  698.27
          FastestDet  min =    3.59  max =    3.61  avg =    3.60
```

### HUAWEI KunPeng 920 3211K (x24 cores)
Test on Ubuntu 22.04
```
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 1 0 -1 0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   12.11  max =   12.20  avg =   12.14
     squeezenet_int8  min =   14.34  max =   14.46  avg =   14.41
           mobilenet  min =   20.27  max =   20.36  avg =   20.31
      mobilenet_int8  min =   17.45  max =   17.74  avg =   17.58
        mobilenet_v2  min =   13.72  max =   13.87  avg =   13.78
        mobilenet_v3  min =   11.51  max =   11.69  avg =   11.61
          shufflenet  min =    8.07  max =    8.36  avg =    8.20
       shufflenet_v2  min =    8.13  max =    8.17  avg =    8.14
             mnasnet  min =   13.34  max =   13.45  avg =   13.41
     proxylessnasnet  min =   16.22  max =   16.35  avg =   16.29
     efficientnet_b0  min =   34.69  max =   35.14  avg =   34.82
   efficientnetv2_b0  min =   44.54  max =   44.68  avg =   44.61
        regnety_400m  min =   18.06  max =   18.15  avg =   18.10
           blazeface  min =    3.06  max =    3.22  avg =    3.12
           googlenet  min =   56.80  max =   57.60  avg =   57.08
      googlenet_int8  min =   58.64  max =   59.98  avg =   59.42
            resnet18  min =   35.02  max =   35.35  avg =   35.10
       resnet18_int8  min =   61.13  max =   61.68  avg =   61.33
             alexnet  min =   42.56  max =   43.05  avg =   42.69
               vgg16  min =  186.32  max =  188.73  avg =  187.20
          vgg16_int8  min =  459.01  max =  461.48  avg =  460.29
            resnet50  min =   97.59  max =   98.32  avg =   97.83
       resnet50_int8  min =  118.67  max =  120.45  avg =  119.78
      squeezenet_ssd  min =   39.62  max =   39.95  avg =   39.81
 squeezenet_ssd_int8  min =   56.72  max =   57.63  avg =   57.00
       mobilenet_ssd  min =   45.44  max =   45.82  avg =   45.63
  mobilenet_ssd_int8  min =   38.99  max =   40.08  avg =   39.39
      mobilenet_yolo  min =   98.71  max =   99.27  avg =   98.94
  mobilenetv2_yolov3  min =   51.50  max =   52.41  avg =   51.87
         yolov4-tiny  min =   68.02  max =   68.43  avg =   68.24
           nanodet_m  min =   20.49  max =   20.64  avg =   20.59
    yolo-fastest-1.1  min =    8.17  max =    8.45  avg =    8.23
      yolo-fastestv2  min =    7.73  max =    8.06  avg =    7.87
  vision_transformer  min = 1620.65  max = 1630.45  avg = 1625.64
          FastestDet  min =    7.65  max =    7.77  avg =    7.69
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 2 0 -1 0
loop_count = 10
num_threads = 2
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    6.77  max =    6.85  avg =    6.81
     squeezenet_int8  min =    7.98  max =    8.07  avg =    8.03
           mobilenet  min =   10.70  max =   10.78  avg =   10.73
      mobilenet_int8  min =    9.21  max =    9.36  avg =    9.28
        mobilenet_v2  min =    7.91  max =    7.99  avg =    7.94
        mobilenet_v3  min =    6.72  max =    6.92  avg =    6.78
          shufflenet  min =    5.34  max =    5.55  avg =    5.38
       shufflenet_v2  min =    5.12  max =    5.15  avg =    5.14
             mnasnet  min =    7.74  max =    7.86  avg =    7.80
     proxylessnasnet  min =    9.00  max =    9.03  avg =    9.02
     efficientnet_b0  min =   18.51  max =   18.58  avg =   18.54
   efficientnetv2_b0  min =   23.68  max =   23.83  avg =   23.74
        regnety_400m  min =   12.65  max =   12.68  avg =   12.66
           blazeface  min =    1.99  max =    2.14  avg =    2.03
           googlenet  min =   30.83  max =   31.29  avg =   30.91
      googlenet_int8  min =   31.97  max =   33.12  avg =   32.45
            resnet18  min =   18.81  max =   18.87  avg =   18.84
       resnet18_int8  min =   32.80  max =   32.99  avg =   32.90
             alexnet  min =   22.88  max =   23.16  avg =   22.94
               vgg16  min =  100.58  max =  101.12  avg =  100.90
          vgg16_int8  min =  235.81  max =  237.97  avg =  236.20
            resnet50  min =   51.12  max =   51.43  avg =   51.28
       resnet50_int8  min =   62.46  max =   63.02  avg =   62.72
      squeezenet_ssd  min =   23.26  max =   23.73  avg =   23.38
 squeezenet_ssd_int8  min =   31.91  max =   32.30  avg =   32.13
       mobilenet_ssd  min =   24.73  max =   24.95  avg =   24.84
  mobilenet_ssd_int8  min =   20.99  max =   21.52  avg =   21.21
      mobilenet_yolo  min =   54.91  max =   55.70  avg =   55.15
  mobilenetv2_yolov3  min =   30.18  max =   30.52  avg =   30.31
         yolov4-tiny  min =   40.46  max =   40.61  avg =   40.55
           nanodet_m  min =   12.56  max =   12.72  avg =   12.62
    yolo-fastest-1.1  min =    6.00  max =    6.15  avg =    6.04
      yolo-fastestv2  min =    5.32  max =    5.59  avg =    5.43
  vision_transformer  min =  894.51  max =  896.28  avg =  895.57
          FastestDet  min =    5.33  max =    5.42  avg =    5.36
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 4 0 -1 0
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    4.18  max =    4.35  avg =    4.22
     squeezenet_int8  min =    4.85  max =    4.98  avg =    4.89
           mobilenet  min =    5.80  max =    5.95  avg =    5.89
      mobilenet_int8  min =    4.86  max =    4.94  avg =    4.89
        mobilenet_v2  min =    4.66  max =    4.73  avg =    4.69
        mobilenet_v3  min =    4.46  max =    4.50  avg =    4.48
          shufflenet  min =    4.01  max =    4.17  avg =    4.04
       shufflenet_v2  min =    3.39  max =    3.41  avg =    3.39
             mnasnet  min =    4.81  max =    4.93  avg =    4.85
     proxylessnasnet  min =    5.47  max =    5.54  avg =    5.49
     efficientnet_b0  min =   10.49  max =   10.55  avg =   10.52
   efficientnetv2_b0  min =   13.67  max =   13.77  avg =   13.72
        regnety_400m  min =   10.20  max =   10.24  avg =   10.21
           blazeface  min =    1.52  max =    1.58  avg =    1.54
           googlenet  min =   17.65  max =   17.69  avg =   17.68
      googlenet_int8  min =   18.14  max =   18.27  avg =   18.19
            resnet18  min =   10.52  max =   10.63  avg =   10.57
       resnet18_int8  min =   17.42  max =   17.53  avg =   17.49
             alexnet  min =   13.12  max =   13.20  avg =   13.16
               vgg16  min =   55.24  max =   55.45  avg =   55.35
          vgg16_int8  min =  123.46  max =  124.23  avg =  123.75
            resnet50  min =   28.31  max =   28.57  avg =   28.39
       resnet50_int8  min =   34.10  max =   34.39  avg =   34.23
      squeezenet_ssd  min =   14.85  max =   14.96  avg =   14.91
 squeezenet_ssd_int8  min =   19.71  max =   19.88  avg =   19.82
       mobilenet_ssd  min =   13.49  max =   13.58  avg =   13.52
  mobilenet_ssd_int8  min =   11.60  max =   11.70  avg =   11.66
      mobilenet_yolo  min =   31.74  max =   31.96  avg =   31.81
  mobilenetv2_yolov3  min =   17.87  max =   18.03  avg =   17.93
         yolov4-tiny  min =   25.63  max =   25.78  avg =   25.72
           nanodet_m  min =    8.16  max =    8.22  avg =    8.20
    yolo-fastest-1.1  min =    4.72  max =    4.86  avg =    4.75
      yolo-fastestv2  min =    3.98  max =    4.15  avg =    4.00
  vision_transformer  min =  501.18  max =  503.51  avg =  502.12
          FastestDet  min =    3.74  max =    3.76  avg =    3.75
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 8 0 -1 0
loop_count = 10
num_threads = 8
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    2.91  max =    3.10  avg =    2.97
     squeezenet_int8  min =    3.42  max =    3.74  avg =    3.51
           mobilenet  min =    3.57  max =    3.70  avg =    3.61
      mobilenet_int8  min =    3.06  max =    3.14  avg =    3.10
        mobilenet_v2  min =    3.73  max =    3.75  avg =    3.75
        mobilenet_v3  min =    3.50  max =    3.66  avg =    3.56
          shufflenet  min =    3.63  max =    3.65  avg =    3.64
       shufflenet_v2  min =    2.85  max =    3.02  avg =    2.95
             mnasnet  min =    3.60  max =    3.67  avg =    3.62
     proxylessnasnet  min =    4.00  max =    4.08  avg =    4.03
     efficientnet_b0  min =    7.31  max =    7.34  avg =    7.33
   efficientnetv2_b0  min =    9.44  max =    9.51  avg =    9.47
        regnety_400m  min =    9.76  max =   10.07  avg =    9.90
           blazeface  min =    1.56  max =    1.75  avg =    1.61
           googlenet  min =   11.22  max =   11.28  avg =   11.25
      googlenet_int8  min =   11.40  max =   12.82  avg =   11.76
            resnet18  min =    6.83  max =    6.96  avg =    6.90
       resnet18_int8  min =   10.28  max =   10.38  avg =   10.33
             alexnet  min =    8.75  max =    8.88  avg =    8.80
               vgg16  min =   36.00  max =   36.72  avg =   36.29
          vgg16_int8  min =   67.38  max =   67.72  avg =   67.54
            resnet50  min =   17.63  max =   17.82  avg =   17.68
       resnet50_int8  min =   20.05  max =   20.21  avg =   20.15
      squeezenet_ssd  min =   11.18  max =   11.45  avg =   11.26
 squeezenet_ssd_int8  min =   14.09  max =   14.23  avg =   14.18
       mobilenet_ssd  min =    8.60  max =    8.69  avg =    8.64
  mobilenet_ssd_int8  min =    7.75  max =    7.87  avg =    7.81
      mobilenet_yolo  min =   21.97  max =   22.25  avg =   22.09
  mobilenetv2_yolov3  min =   14.04  max =   14.18  avg =   14.12
         yolov4-tiny  min =   19.66  max =   19.93  avg =   19.81
           nanodet_m  min =    6.52  max =    6.67  avg =    6.57
    yolo-fastest-1.1  min =    4.61  max =    4.76  avg =    4.66
      yolo-fastestv2  min =    3.78  max =    3.91  avg =    3.82
  vision_transformer  min =  323.01  max =  327.38  avg =  323.75
          FastestDet  min =    3.50  max =    3.54  avg =    3.51
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 16 0 -1 0
loop_count = 10
num_threads = 16
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    3.00  max =    3.25  avg =    3.08
     squeezenet_int8  min =    4.13  max =    4.47  avg =    4.21
           mobilenet  min =    3.27  max =    3.42  avg =    3.34
      mobilenet_int8  min =    3.49  max =    3.58  avg =    3.56
        mobilenet_v2  min =    3.86  max =    4.10  avg =    3.97
        mobilenet_v3  min =    3.72  max =    3.80  avg =    3.76
          shufflenet  min =    4.67  max =    4.78  avg =    4.72
       shufflenet_v2  min =    3.16  max =    3.24  avg =    3.20
             mnasnet  min =    3.51  max =    3.65  avg =    3.57
     proxylessnasnet  min =    4.08  max =    4.35  avg =    4.15
     efficientnet_b0  min =    7.51  max =    7.80  avg =    7.63
   efficientnetv2_b0  min =    8.92  max =    9.39  avg =    9.05
        regnety_400m  min =   14.80  max =   15.05  avg =   14.89
           blazeface  min =    2.14  max =    2.28  avg =    2.20
           googlenet  min =    9.91  max =   10.00  avg =    9.96
      googlenet_int8  min =   11.51  max =   11.65  avg =   11.60
            resnet18  min =    6.39  max =    6.56  avg =    6.46
       resnet18_int8  min =    9.76  max =    9.91  avg =    9.84
             alexnet  min =    6.99  max =    7.10  avg =    7.04
               vgg16  min =   27.52  max =   28.64  avg =   27.88
          vgg16_int8  min =   45.64  max =   45.93  avg =   45.78
            resnet50  min =   13.96  max =   14.17  avg =   14.07
       resnet50_int8  min =   16.82  max =   16.93  avg =   16.89
      squeezenet_ssd  min =   11.11  max =   11.54  avg =   11.23
 squeezenet_ssd_int8  min =   13.77  max =   14.00  avg =   13.88
       mobilenet_ssd  min =    8.21  max =    8.46  avg =    8.35
  mobilenet_ssd_int8  min =    8.87  max =    9.03  avg =    8.94
      mobilenet_yolo  min =   30.77  max =   31.35  avg =   31.08
  mobilenetv2_yolov3  min =   12.11  max =   13.10  avg =   12.43
         yolov4-tiny  min =   18.25  max =   18.68  avg =   18.41
           nanodet_m  min =    6.55  max =    6.68  avg =    6.59
    yolo-fastest-1.1  min =    6.00  max =    6.22  avg =    6.09
      yolo-fastestv2  min =    4.86  max =    5.01  avg =    4.94
  vision_transformer  min =  218.18  max =  220.49  avg =  218.79
          FastestDet  min =    5.01  max =    5.14  avg =    5.07
(base) mobtgzhang@mobtgzhang-PC:~/ncnn/benchmark$ ./benchncnn 10 24 0 -1 0
loop_count = 10
num_threads = 24
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    3.52  max =    3.96  avg =    3.70
     squeezenet_int8  min =    5.49  max =    5.83  avg =    5.65
           mobilenet  min =    3.42  max =    3.83  avg =    3.55
      mobilenet_int8  min =    3.69  max =   45.17  avg =   11.59
        mobilenet_v2  min =    4.63  max =    5.44  avg =    4.84
        mobilenet_v3  min =    4.51  max =    4.89  avg =    4.68
          shufflenet  min =    6.21  max =    6.52  avg =    6.36
       shufflenet_v2  min =    3.98  max =   17.54  avg =    5.45
             mnasnet  min =    4.28  max =    4.56  avg =    4.39
     proxylessnasnet  min =    4.76  max =    5.13  avg =    4.92
     efficientnet_b0  min =    7.45  max =  111.76  avg =   22.59
   efficientnetv2_b0  min =   10.87  max =   33.13  avg =   13.51
        regnety_400m  min =   20.97  max =   21.73  avg =   21.46
           blazeface  min =    2.56  max =    2.82  avg =    2.67
           googlenet  min =   10.54  max =  105.87  avg =   21.85
      googlenet_int8  min =   14.21  max =   77.02  avg =   22.23
            resnet18  min =    7.08  max =    7.51  avg =    7.31
       resnet18_int8  min =   11.25  max =   50.66  avg =   19.14
             alexnet  min =    7.13  max =    8.67  avg =    7.44
               vgg16  min =   27.59  max =   35.35  avg =   29.12
          vgg16_int8  min =   44.43  max =   51.76  avg =   46.90
            resnet50  min =   15.16  max =  105.98  avg =   24.91
       resnet50_int8  min =   19.82  max =   20.50  avg =   20.16
      squeezenet_ssd  min =   13.03  max =   13.69  avg =   13.40
 squeezenet_ssd_int8  min =   17.62  max =  187.55  avg =   39.92
       mobilenet_ssd  min =    8.83  max =   71.97  avg =   15.37
  mobilenet_ssd_int8  min =   10.22  max =   49.61  avg =   15.26
      mobilenet_yolo  min =   35.19  max =   46.43  avg =   36.93
  mobilenetv2_yolov3  min =   12.96  max =   15.57  avg =   13.41
         yolov4-tiny  min =   19.22  max =   21.43  avg =   19.89
           nanodet_m  min =    7.71  max =    8.74  avg =    8.09
    yolo-fastest-1.1  min =    6.71  max =   78.72  avg =   14.16
      yolo-fastestv2  min =    5.72  max =    6.08  avg =    5.88
  vision_transformer  min =  192.16  max =  221.86  avg =  202.73
          FastestDet  min =    5.13  max =    5.47  avg =    5.30
```

### HUAWEI Kunpeng 920 7260 (64 cores)
Tested on Ubuntu 20.04 (gcc 9.4.0)
```
root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 1 0 -1 0
loop_count = 300
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   11.64  max =   12.11  avg =   11.71
     squeezenet_int8  min =   12.22  max =   13.22  avg =   12.37
           mobilenet  min =   20.00  max =   20.79  avg =   20.08
      mobilenet_int8  min =   17.44  max =   19.09  avg =   17.64
        mobilenet_v2  min =   13.29  max =   14.25  avg =   13.39
        mobilenet_v3  min =   11.06  max =   11.84  avg =   11.11
          shufflenet  min =    7.56  max =    7.74  avg =    7.59
       shufflenet_v2  min =    7.84  max =    8.37  avg =    7.88
             mnasnet  min =   13.07  max =   13.78  avg =   13.14
     proxylessnasnet  min =   15.71  max =   16.31  avg =   15.77
     efficientnet_b0  min =   34.79  max =   35.98  avg =   34.92
   efficientnetv2_b0  min =   35.28  max =   36.36  avg =   35.41
        regnety_400m  min =   17.06  max =   17.74  avg =   17.16
           blazeface  min =    2.99  max =    3.04  avg =    3.01
           googlenet  min =   50.76  max =   51.74  avg =   51.00
      googlenet_int8  min =   50.31  max =   52.27  avg =   50.65
            resnet18  min =   34.97  max =   37.17  avg =   35.82
       resnet18_int8  min =   40.47  max =   42.03  avg =   40.78
             alexnet  min =   39.19  max =   39.80  avg =   39.32
               vgg16  min =  176.62  max =  181.29  avg =  177.07
          vgg16_int8  min =  352.35  max =  358.38  avg =  355.15
            resnet50  min =   96.76  max =   98.63  avg =   97.09
       resnet50_int8  min =   90.00  max =   92.74  avg =   90.81
      squeezenet_ssd  min =   33.23  max =   33.99  avg =   33.39
 squeezenet_ssd_int8  min =   38.50  max =   41.53  avg =   39.28
       mobilenet_ssd  min =   42.49  max =   44.78  avg =   42.72
  mobilenet_ssd_int8  min =   37.06  max =   39.97  avg =   37.57
      mobilenet_yolo  min =   96.34  max =   98.91  avg =   96.73
  mobilenetv2_yolov3  min =   50.88  max =   52.97  avg =   51.15
         yolov4-tiny  min =   65.56  max =   67.13  avg =   65.80
           nanodet_m  min =   19.94  max =   20.82  avg =   20.04
    yolo-fastest-1.1  min =    7.66  max =    7.81  avg =    7.71
      yolo-fastestv2  min =    6.82  max =    7.23  avg =    6.87
  vision_transformer  min = 1535.03  max = 1552.84  avg = 1543.73
          FastestDet  min =    7.17  max =    7.50  avg =    7.21
root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 2 0 -1 0
loop_count = 300
num_threads = 2
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    6.35  max =    9.15  avg =    7.33
     squeezenet_int8  min =    8.06  max =    8.60  avg =    8.14
           mobilenet  min =   10.30  max =   11.86  avg =   11.48
      mobilenet_int8  min =    8.93  max =   11.87  avg =   10.47
        mobilenet_v2  min =    9.05  max =   11.50  avg =    9.19
        mobilenet_v3  min =    6.32  max =    6.42  avg =    6.36
          shufflenet  min =    6.73  max =    8.55  avg =    6.81
       shufflenet_v2  min =    4.94  max =    6.65  avg =    6.32
             mnasnet  min =    7.38  max =   10.77  avg =    8.82
     proxylessnasnet  min =    8.57  max =    9.72  avg =    8.63
     efficientnet_b0  min =   18.61  max =   22.53  avg =   20.42
   efficientnetv2_b0  min =   18.75  max =   21.93  avg =   20.79
        regnety_400m  min =   11.86  max =   15.09  avg =   14.60
           blazeface  min =    1.95  max =    3.37  avg =    2.06
           googlenet  min =   28.66  max =   32.24  avg =   28.94
      googlenet_int8  min =   27.64  max =   32.15  avg =   30.84
            resnet18  min =   20.33  max =   20.77  avg =   20.47
       resnet18_int8  min =   22.63  max =   23.72  avg =   22.88
             alexnet  min =   20.41  max =   29.37  avg =   27.22
               vgg16  min =  101.72  max =  140.33  avg =  103.29
          vgg16_int8  min =  187.56  max =  211.44  avg =  189.92
            resnet50  min =   51.07  max =   59.25  avg =   58.35
       resnet50_int8  min =   46.50  max =   52.55  avg =   48.93
      squeezenet_ssd  min =   22.48  max =   28.59  avg =   22.98
 squeezenet_ssd_int8  min =   25.56  max =   26.82  avg =   25.99
       mobilenet_ssd  min =   22.81  max =   26.21  avg =   24.88
  mobilenet_ssd_int8  min =   19.31  max =   25.53  avg =   21.74
      mobilenet_yolo  min =   59.58  max =   62.04  avg =   59.99
  mobilenetv2_yolov3  min =   33.26  max =   35.74  avg =   33.51
         yolov4-tiny  min =   41.14  max =   45.34  avg =   42.46
           nanodet_m  min =   12.10  max =   16.69  avg =   15.02
    yolo-fastest-1.1  min =    5.44  max =    7.78  avg =    7.24
      yolo-fastestv2  min =    5.03  max =    8.08  avg =    6.75
  vision_transformer  min =  994.46  max = 1090.68  avg = 1045.50
          FastestDet  min =    6.76  max =    6.91  avg =    6.83
root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 4 0 -1 0
loop_count = 300
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    3.79  max =    6.99  avg =    4.55
     squeezenet_int8  min =    5.13  max =    5.68  avg =    5.20
           mobilenet  min =    6.25  max =    6.55  avg =    6.30
      mobilenet_int8  min =    5.96  max =    6.10  avg =    6.03
        mobilenet_v2  min =    5.34  max =    7.15  avg =    5.62
        mobilenet_v3  min =    4.05  max =    5.74  avg =    5.01
          shufflenet  min =    3.69  max =    5.81  avg =    5.15
       shufflenet_v2  min =    4.31  max =    6.02  avg =    4.56
             mnasnet  min =    4.48  max =    6.05  avg =    5.54
     proxylessnasnet  min =    5.05  max =    8.08  avg =    6.03
     efficientnet_b0  min =   10.17  max =   12.21  avg =   11.58
   efficientnetv2_b0  min =   10.86  max =   15.78  avg =   12.70
        regnety_400m  min =    9.24  max =   14.13  avg =   11.98
           blazeface  min =    1.89  max =    1.97  avg =    1.93
           googlenet  min =   15.19  max =   20.31  avg =   16.90
      googlenet_int8  min =   17.97  max =   19.40  avg =   18.11
            resnet18  min =   11.18  max =   11.48  avg =   11.29
       resnet18_int8  min =   12.26  max =   12.78  avg =   12.44
             alexnet  min =   14.43  max =   16.94  avg =   14.68
               vgg16  min =   62.40  max =   78.42  avg =   64.96
          vgg16_int8  min =  101.52  max =  109.42  avg =  104.46
            resnet50  min =   29.19  max =   39.69  avg =   32.99
       resnet50_int8  min =   26.94  max =   28.82  avg =   27.16
      squeezenet_ssd  min =   12.90  max =   16.52  avg =   15.20
 squeezenet_ssd_int8  min =   15.58  max =   18.40  avg =   16.28
       mobilenet_ssd  min =   13.68  max =   14.45  avg =   13.87
  mobilenet_ssd_int8  min =   12.20  max =   14.58  avg =   12.84
      mobilenet_yolo  min =   34.85  max =   36.54  avg =   35.05
  mobilenetv2_yolov3  min =   18.61  max =   20.93  avg =   19.92
         yolov4-tiny  min =   26.09  max =   32.32  avg =   28.03
           nanodet_m  min =    7.85  max =   12.48  avg =   11.00
    yolo-fastest-1.1  min =    6.19  max =    6.49  avg =    6.31
      yolo-fastestv2  min =    3.66  max =    6.83  avg =    5.11
  vision_transformer  min =  605.95  max =  624.99  avg =  609.79
          FastestDet  min =    4.32  max =    5.41  avg =    5.17
root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 8 0 -1 0
loop_count = 300
num_threads = 8
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    2.72  max =    3.74  avg =    3.05
     squeezenet_int8  min =    3.80  max =    4.71  avg =    4.03
           mobilenet  min =    3.94  max =    5.15  avg =    4.00
      mobilenet_int8  min =    3.73  max =    3.87  avg =    3.80
        mobilenet_v2  min =    4.51  max =    6.57  avg =    4.68
        mobilenet_v3  min =    4.12  max =    4.38  avg =    4.28
          shufflenet  min =    4.60  max =    6.27  avg =    4.88
       shufflenet_v2  min =    4.07  max =    4.20  avg =    4.11
             mnasnet  min =    4.26  max =    4.51  avg =    4.36
     proxylessnasnet  min =    4.71  max =    7.40  avg =    4.80
     efficientnet_b0  min =    8.49  max =    8.74  avg =    8.56
   efficientnetv2_b0  min =    9.34  max =    9.68  avg =    9.41
        regnety_400m  min =    8.00  max =   12.85  avg =   10.64
           blazeface  min =    1.76  max =    1.84  avg =    1.80
           googlenet  min =   10.89  max =   11.33  avg =   10.98
      googlenet_int8  min =   11.66  max =   14.07  avg =   11.83
            resnet18  min =    6.48  max =    6.61  avg =    6.54
       resnet18_int8  min =    7.30  max =    7.79  avg =    7.51
             alexnet  min =    8.33  max =    8.95  avg =    8.62
               vgg16  min =   29.94  max =   47.54  avg =   31.95
          vgg16_int8  min =   54.67  max =   60.76  avg =   56.03
            resnet50  min =   16.13  max =   20.79  avg =   20.03
       resnet50_int8  min =   15.64  max =   20.13  avg =   16.11
      squeezenet_ssd  min =   11.58  max =   12.02  avg =   11.77
 squeezenet_ssd_int8  min =   11.14  max =   13.72  avg =   12.10
       mobilenet_ssd  min =    8.27  max =   10.77  avg =    8.76
  mobilenet_ssd_int8  min =    8.13  max =    9.09  avg =    8.29
      mobilenet_yolo  min =   23.90  max =   24.69  avg =   24.17
  mobilenetv2_yolov3  min =   14.83  max =   15.72  avg =   15.19
         yolov4-tiny  min =   19.78  max =   23.66  avg =   20.05
           nanodet_m  min =    8.92  max =   10.76  avg =    9.09
    yolo-fastest-1.1  min =    5.49  max =    5.77  avg =    5.63
      yolo-fastestv2  min =    5.04  max =    5.21  avg =    5.10
  vision_transformer  min =  318.42  max =  379.40  avg =  363.66
          FastestDet  min =    4.18  max =    4.54  avg =    4.38
root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 16 0 -1 0
loop_count = 300
num_threads = 16
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    2.70  max =    3.14  avg =    2.81
     squeezenet_int8  min =    3.21  max =    4.22  avg =    3.39
           mobilenet  min =    3.13  max =    3.26  avg =    3.20
      mobilenet_int8  min =    3.17  max =    5.05  avg =    3.30
        mobilenet_v2  min =    4.31  max =    6.24  avg =    4.62
        mobilenet_v3  min =    3.57  max =    3.77  avg =    3.68
          shufflenet  min =    4.70  max =    6.45  avg =    4.80
       shufflenet_v2  min =    3.73  max =    4.27  avg =    3.87
             mnasnet  min =    3.67  max =    3.87  avg =    3.75
     proxylessnasnet  min =    4.28  max =    4.81  avg =    4.35
     efficientnet_b0  min =    7.31  max =    7.77  avg =    7.53
   efficientnetv2_b0  min =    9.87  max =   12.33  avg =   10.07
        regnety_400m  min =   17.95  max =   18.53  avg =   18.26
           blazeface  min =    2.26  max =    2.40  avg =    2.33
           googlenet  min =    9.51  max =    9.99  avg =    9.68
      googlenet_int8  min =   10.98  max =   11.36  avg =   11.18
            resnet18  min =    5.59  max =    6.08  avg =    5.71
       resnet18_int8  min =    6.55  max =    7.28  avg =    6.77
             alexnet  min =    6.26  max =    6.50  avg =    6.36
               vgg16  min =   23.98  max =   27.37  avg =   24.89
          vgg16_int8  min =   38.07  max =   39.66  avg =   39.02
            resnet50  min =   12.81  max =   14.19  avg =   13.76
       resnet50_int8  min =   12.42  max =   12.84  avg =   12.55
      squeezenet_ssd  min =   10.80  max =   11.49  avg =   11.12
 squeezenet_ssd_int8  min =   11.57  max =   12.21  avg =   11.74
       mobilenet_ssd  min =    7.46  max =    8.08  avg =    7.84
  mobilenet_ssd_int8  min =    7.47  max =    8.07  avg =    7.63
      mobilenet_yolo  min =   21.70  max =   23.43  avg =   21.92
  mobilenetv2_yolov3  min =   12.55  max =   14.56  avg =   12.90
         yolov4-tiny  min =   17.68  max =   19.85  avg =   18.18
           nanodet_m  min =    8.35  max =    8.70  avg =    8.45
    yolo-fastest-1.1  min =    5.70  max =    7.11  avg =    6.05
      yolo-fastestv2  min =    4.85  max =    5.70  avg =    5.37
  vision_transformer  min =  214.36  max =  259.56  avg =  245.47
          FastestDet  min =    5.01  max =    5.42  avg =    5.17
root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 32 0 -1 0
loop_count = 300
num_threads = 32
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    2.30  max =    2.94  avg =    2.46
     squeezenet_int8  min =    3.08  max =    4.88  avg =    4.03
           mobilenet  min =    2.49  max =    2.76  avg =    2.53
      mobilenet_int8  min =    2.86  max =    3.73  avg =    2.95
        mobilenet_v2  min =    4.51  max =    5.20  avg =    4.74
        mobilenet_v3  min =    5.11  max =    6.91  avg =    6.10
          shufflenet  min =    5.57  max =    6.51  avg =    5.78
       shufflenet_v2  min =    4.37  max =    4.66  avg =    4.48
             mnasnet  min =    3.72  max =    4.08  avg =    3.90
     proxylessnasnet  min =    4.19  max =    6.18  avg =    4.79
     efficientnet_b0  min =    6.80  max =    7.22  avg =    6.89
   efficientnetv2_b0  min =   13.98  max =   17.55  avg =   15.06
        regnety_400m  min =   16.10  max =   16.72  avg =   16.26
           blazeface  min =    2.12  max =    2.53  avg =    2.17
           googlenet  min =    8.63  max =    9.89  avg =    8.77
      googlenet_int8  min =    9.90  max =   11.09  avg =   10.08
            resnet18  min =    6.54  max =    6.99  avg =    6.73
       resnet18_int8  min =    8.34  max =    9.00  avg =    8.67
             alexnet  min =    6.64  max =    7.15  avg =    6.93
               vgg16  min =   22.79  max =   23.91  avg =   23.50
          vgg16_int8  min =   32.37  max =   37.51  avg =   33.13
            resnet50  min =   11.19  max =   16.40  avg =   11.47
       resnet50_int8  min =   11.92  max =   12.55  avg =   12.13
      squeezenet_ssd  min =   10.75  max =   12.28  avg =   11.12
 squeezenet_ssd_int8  min =   11.31  max =   12.29  avg =   11.57
       mobilenet_ssd  min =   10.25  max =   11.26  avg =   10.79
  mobilenet_ssd_int8  min =   11.39  max =   16.99  avg =   11.98
      mobilenet_yolo  min =   52.11  max =   60.46  avg =   53.84
  mobilenetv2_yolov3  min =   12.07  max =   12.47  avg =   12.20
         yolov4-tiny  min =   17.48  max =   17.79  avg =   17.58
           nanodet_m  min =   13.06  max =   14.71  avg =   13.64
    yolo-fastest-1.1  min =    5.70  max =    5.89  avg =    5.79
      yolo-fastestv2  min =    8.89  max =    9.99  avg =    9.21
  vision_transformer  min =  158.92  max =  187.40  avg =  168.21
          FastestDet  min =    8.70  max =    9.43  avg =    9.00
root@8d46e508165f:/home/lkl/ARM_CHAR/ncnn/benchmark# ../build/benchmark/benchncnn 300 64 0 -1 0
loop_count = 300
num_threads = 64
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    6.85  max =   78.56  avg =    7.81
     squeezenet_int8  min =    8.06  max =   88.91  avg =    9.23
           mobilenet  min =    3.02  max =   86.86  avg =    5.89
      mobilenet_int8  min =    3.58  max =    4.55  avg =    3.68
        mobilenet_v2  min =    5.05  max =  150.06  avg =   13.04
        mobilenet_v3  min =    4.85  max =  125.22  avg =    8.34
          shufflenet  min =   17.80  max =  220.55  avg =   21.01
       shufflenet_v2  min =   11.23  max =  381.95  avg =   13.71
             mnasnet  min =    9.83  max =  128.42  avg =   11.10
     proxylessnasnet  min =   10.53  max =   68.52  avg =   12.03
     efficientnet_b0  min =   16.78  max =  968.87  avg =   23.94
   efficientnetv2_b0  min =   26.23  max =  551.18  avg =   31.34
        regnety_400m  min =   70.14  max =  407.92  avg =   78.30
           blazeface  min =    7.27  max =  191.44  avg =    9.37
           googlenet  min =   16.69  max =  820.58  avg =   25.06
      googlenet_int8  min =   20.58  max =  849.09  avg =   29.87
            resnet18  min =    8.67  max =  349.00  avg =   11.33
       resnet18_int8  min =   10.40  max =  128.98  avg =   11.45
             alexnet  min =    6.15  max =  196.01  avg =   10.24
               vgg16  min =   21.11  max =  288.66  avg =   29.37
          vgg16_int8  min =   30.72  max =  251.95  avg =   37.68
            resnet50  min =   19.10  max =  114.08  avg =   22.00
       resnet50_int8  min =   18.99  max =  436.89  avg =   24.36
      squeezenet_ssd  min =   22.22  max =  510.52  avg =   28.76
 squeezenet_ssd_int8  min =   23.42  max =  614.70  avg =   30.82
       mobilenet_ssd  min =    7.62  max =  202.66  avg =   14.59
  mobilenet_ssd_int8  min =    7.89  max =  109.82  avg =    8.80
      mobilenet_yolo  min =   31.43  max =  742.10  avg =   45.52
  mobilenetv2_yolov3  min =   18.31  max =  273.05  avg =   20.78
         yolov4-tiny  min =   21.03  max =  400.05  avg =   33.64
           nanodet_m  min =   19.94  max =  114.18  avg =   21.89
    yolo-fastest-1.1  min =    7.20  max =  174.60  avg =    9.13
      yolo-fastestv2  min =    7.50  max =  170.55  avg =    9.01
  vision_transformer  min =  126.90  max =  335.71  avg =  157.38
          FastestDet  min =    6.59  max =   19.77  avg =    6.77
```

### Intel Atom x5-Z8350
```
nihui@nihui-ROCK-Pi-X:~/ncnn/build/benchmark$ ./benchncnn 20 4 0 -1 1
loop_count = 20
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   50.22  max =   50.53  avg =   50.32
     squeezenet_int8  min =   77.92  max =   78.37  avg =   78.07
           mobilenet  min =   80.12  max =   81.53  avg =   80.35
      mobilenet_int8  min =  120.54  max =  124.10  avg =  120.84
        mobilenet_v2  min =   56.62  max =   60.12  avg =   58.37
        mobilenet_v3  min =   50.19  max =   50.41  avg =   50.27
          shufflenet  min =   37.96  max =   38.28  avg =   38.10
       shufflenet_v2  min =   35.28  max =   35.59  avg =   35.45
             mnasnet  min =   54.91  max =   55.10  avg =   55.01
     proxylessnasnet  min =   62.25  max =   62.59  avg =   62.40
     efficientnet_b0  min =  101.92  max =  105.73  avg =  102.27
   efficientnetv2_b0  min =  115.48  max =  117.25  avg =  115.89
        regnety_400m  min =   79.66  max =   81.70  avg =   79.95
           blazeface  min =   10.43  max =   10.60  avg =   10.49
           googlenet  min =  170.41  max =  173.44  avg =  170.68
      googlenet_int8  min =  253.06  max =  257.34  avg =  253.57
            resnet18  min =  127.19  max =  130.69  avg =  127.65
       resnet18_int8  min =  200.54  max =  204.25  avg =  200.88
             alexnet  min =  104.89  max =  110.89  avg =  105.56
               vgg16  min =  653.78  max =  661.34  avg =  655.44
          vgg16_int8  min =  974.72  max = 1006.48  avg =  978.76
            resnet50  min =  367.63  max =  371.74  avg =  368.27
       resnet50_int8  min =  574.94  max =  584.08  avg =  576.18
      squeezenet_ssd  min =  115.35  max =  116.47  avg =  115.62
 squeezenet_ssd_int8  min =  169.95  max =  170.75  avg =  170.26
       mobilenet_ssd  min =  167.00  max =  172.02  avg =  168.95
  mobilenet_ssd_int8  min =  244.91  max =  248.30  avg =  245.27
      mobilenet_yolo  min =  382.80  max =  393.23  avg =  385.79
  mobilenetv2_yolov3  min =  208.23  max =  211.54  avg =  209.64
         yolov4-tiny  min =  251.10  max =  263.77  avg =  256.37
           nanodet_m  min =   84.48  max =   84.95  avg =   84.70
    yolo-fastest-1.1  min =   44.11  max =   45.15  avg =   44.26
      yolo-fastestv2  min =   37.95  max =   38.52  avg =   38.34

nihui@nihui-ROCK-Pi-X:~/ncnn/build/benchmark$ ./benchncnn 10 1 0 -1 1
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  130.52  max =  131.08  avg =  130.64
     squeezenet_int8  min =  231.03  max =  231.38  avg =  231.19
           mobilenet  min =  231.40  max =  231.74  avg =  231.61
      mobilenet_int8  min =  409.74  max =  410.02  avg =  409.85
        mobilenet_v2  min =  150.23  max =  150.72  avg =  150.47
        mobilenet_v3  min =  119.08  max =  119.34  avg =  119.20
          shufflenet  min =   72.62  max =   72.81  avg =   72.73
       shufflenet_v2  min =   73.63  max =   73.71  avg =   73.68
             mnasnet  min =  140.87  max =  141.09  avg =  140.98
     proxylessnasnet  min =  166.39  max =  166.75  avg =  166.54
     efficientnet_b0  min =  280.55  max =  281.30  avg =  280.77
   efficientnetv2_b0  min =  321.05  max =  321.24  avg =  321.16
        regnety_400m  min =  183.78  max =  184.64  avg =  183.91
           blazeface  min =   18.94  max =   19.08  avg =   19.01
           googlenet  min =  453.56  max =  454.71  avg =  454.15
      googlenet_int8  min =  791.40  max =  791.93  avg =  791.61
            resnet18  min =  365.87  max =  366.40  avg =  366.15
       resnet18_int8  min =  652.86  max =  653.39  avg =  653.09
             alexnet  min =  289.15  max =  290.25  avg =  289.65
               vgg16  min = 1887.16  max = 1887.73  avg = 1887.41
          vgg16_int8  min = 3211.44  max = 3213.39  avg = 3212.55
            resnet50  min = 1060.37  max = 1061.40  avg = 1060.80
       resnet50_int8  min = 1869.41  max = 1870.59  avg = 1870.17
      squeezenet_ssd  min =  277.23  max =  277.83  avg =  277.50
 squeezenet_ssd_int8  min =  455.54  max =  458.06  avg =  456.28
       mobilenet_ssd  min =  478.03  max =  478.83  avg =  478.32
  mobilenet_ssd_int8  min =  822.61  max =  822.96  avg =  822.79
      mobilenet_yolo  min = 1136.89  max = 1138.51  avg = 1137.74
  mobilenetv2_yolov3  min =  551.81  max =  552.53  avg =  552.14
         yolov4-tiny  min =  685.49  max =  686.15  avg =  685.79
           nanodet_m  min =  181.21  max =  181.52  avg =  181.32
    yolo-fastest-1.1  min =   82.21  max =   82.68  avg =   82.30
      yolo-fastestv2  min =   67.62  max =   68.36  avg =   68.10

root@nihui-ROCK-Pi-X:/home/nihui/osd/ncnn/build/benchmark# ./benchncnn 10 1 0 0 0
[0 Intel(R) HD Graphics (CHV)]  queueC=0[1]  queueG=0[1]  queueT=0[1]
[0 Intel(R) HD Graphics (CHV)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 Intel(R) HD Graphics (CHV)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 Intel(R) HD Graphics (CHV)]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =   29.14  max =   29.76  avg =   29.45
           mobilenet  min =   36.19  max =   37.03  avg =   36.52
        mobilenet_v2  min =   30.39  max =   31.62  avg =   30.76
        mobilenet_v3  min =   31.60  max =   32.25  avg =   31.92
          shufflenet  min =   22.47  max =   23.19  avg =   22.70
       shufflenet_v2  min =   22.30  max =   24.16  avg =   23.12
             mnasnet  min =   29.40  max =   30.23  avg =   29.84
     proxylessnasnet  min =   31.00  max =   31.91  avg =   31.41
     efficientnet_b0  min =   58.03  max =   58.74  avg =   58.42
   efficientnetv2_b0  min =  131.17  max =  191.61  avg =  161.37
        regnety_400m  min =   40.30  max =   42.27  avg =   41.04
           blazeface  min =   15.06  max =   15.96  avg =   15.48
           googlenet  min =   85.37  max =   86.49  avg =   85.84
            resnet18  min =   93.87  max =   95.00  avg =   94.53
             alexnet  min =  110.96  max =  120.83  avg =  115.14
               vgg16  min =  798.75  max =  812.60  avg =  804.93
            resnet50  min =  213.12  max =  214.81  avg =  213.79
      squeezenet_ssd  min =  124.48  max =  125.18  avg =  124.87
       mobilenet_ssd  min =   84.04  max =   84.70  avg =   84.49
      mobilenet_yolo  min =  186.52  max =  189.61  avg =  188.53
  mobilenetv2_yolov3  min =  102.07  max =  102.97  avg =  102.39
         yolov4-tiny  min =  212.49  max =  214.75  avg =  213.77
           nanodet_m  min =   42.97  max =   45.58  avg =   44.05
    yolo-fastest-1.1  min =   27.14  max =   32.53  avg =   28.76
      yolo-fastestv2  min =   20.73  max =   25.90  avg =   22.97
```

### Intel Celeron N5105
```
loop_count = 8
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   18.06  max =   18.21  avg =   18.12
     squeezenet_int8  min =   24.55  max =   25.16  avg =   24.69
           mobilenet  min =   32.22  max =   32.70  avg =   32.40
      mobilenet_int8  min =   40.52  max =   40.59  avg =   40.54
        mobilenet_v2  min =   22.54  max =   22.71  avg =   22.65
        mobilenet_v3  min =   17.86  max =   19.02  avg =   18.09
          shufflenet  min =   11.23  max =   11.30  avg =   11.28
       shufflenet_v2  min =   11.04  max =   11.19  avg =   11.13
             mnasnet  min =   19.93  max =   20.09  avg =   20.01
     proxylessnasnet  min =   21.91  max =   22.00  avg =   21.95
     efficientnet_b0  min =   33.29  max =   33.66  avg =   33.50
   efficientnetv2_b0  min =   40.16  max =   40.63  avg =   40.34
        regnety_400m  min =   27.38  max =   27.59  avg =   27.50
           blazeface  min =    3.01  max =    3.11  avg =    3.04
           googlenet  min =   64.78  max =   65.16  avg =   65.01
      googlenet_int8  min =   80.11  max =   80.79  avg =   80.46
            resnet18  min =   53.91  max =   54.28  avg =   54.07
       resnet18_int8  min =   63.95  max =   64.20  avg =   64.06
             alexnet  min =   51.84  max =   52.17  avg =   52.00
               vgg16  min =  322.01  max =  324.34  avg =  322.72
          vgg16_int8  min =  323.83  max =  324.17  avg =  324.02
            resnet50  min =  152.66  max =  153.33  avg =  153.03
       resnet50_int8  min =  193.40  max =  194.55  avg =  194.03
      squeezenet_ssd  min =   44.07  max =   44.51  avg =   44.37
 squeezenet_ssd_int8  min =   51.08  max =   52.26  avg =   51.60
       mobilenet_ssd  min =   67.73  max =   68.21  avg =   67.98
  mobilenet_ssd_int8  min =   82.41  max =   82.70  avg =   82.55
      mobilenet_yolo  min =  157.38  max =  159.44  avg =  158.23
  mobilenetv2_yolov3  min =   83.35  max =   83.68  avg =   83.55
         yolov4-tiny  min =  107.25  max =  107.72  avg =  107.50
           nanodet_m  min =   26.93  max =   27.24  avg =   27.09
    yolo-fastest-1.1  min =   12.47  max =   12.71  avg =   12.61
      yolo-fastestv2  min =   10.65  max =   10.95  avg =   10.81

loop_count = 4
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   54.43  max =   54.48  avg =   54.46
     squeezenet_int8  min =   79.32  max =   79.64  avg =   79.43
           mobilenet  min =  105.92  max =  106.12  avg =  106.03
      mobilenet_int8  min =  152.24  max =  152.28  avg =  152.26
        mobilenet_v2  min =   62.44  max =   62.83  avg =   62.57
        mobilenet_v3  min =   49.47  max =   49.55  avg =   49.50
          shufflenet  min =   27.32  max =   27.37  avg =   27.34
       shufflenet_v2  min =   29.85  max =   30.00  avg =   29.93
             mnasnet  min =   59.83  max =   60.09  avg =   59.98
     proxylessnasnet  min =   66.66  max =   66.84  avg =   66.76
     efficientnet_b0  min =  104.00  max =  104.19  avg =  104.08
   efficientnetv2_b0  min =  128.05  max =  128.39  avg =  128.21
        regnety_400m  min =   77.95  max =   78.03  avg =   78.00
           blazeface  min =    6.66  max =    6.77  avg =    6.70
           googlenet  min =  195.32  max =  195.75  avg =  195.52
      googlenet_int8  min =  275.81  max =  276.25  avg =  275.98
            resnet18  min =  160.94  max =  161.17  avg =  161.03
       resnet18_int8  min =  223.88  max =  224.12  avg =  224.03
             alexnet  min =  120.96  max =  121.16  avg =  121.05
               vgg16  min =  852.50  max =  853.66  avg =  853.04
          vgg16_int8  min = 1081.07  max = 1083.31  avg = 1082.18
            resnet50  min =  497.54  max =  497.85  avg =  497.67
       resnet50_int8  min =  681.79  max =  682.60  avg =  682.29
      squeezenet_ssd  min =  101.81  max =  102.49  avg =  102.13
 squeezenet_ssd_int8  min =  147.77  max =  148.52  avg =  148.04
       mobilenet_ssd  min =  215.63  max =  216.07  avg =  215.91
  mobilenet_ssd_int8  min =  305.65  max =  305.97  avg =  305.78
      mobilenet_yolo  min =  494.99  max =  495.41  avg =  495.16
  mobilenetv2_yolov3  min =  233.51  max =  234.26  avg =  233.84
         yolov4-tiny  min =  287.26  max =  287.89  avg =  287.50
           nanodet_m  min =   70.48  max =   70.73  avg =   70.61
    yolo-fastest-1.1  min =   27.32  max =   27.36  avg =   27.34
      yolo-fastestv2  min =   23.51  max =   23.85  avg =   23.76

[0 Intel(R) UHD Graphics (JSL)]  queueC=0[1]  queueG=0[1]  queueT=0[1]
[0 Intel(R) UHD Graphics (JSL)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 Intel(R) UHD Graphics (JSL)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 Intel(R) UHD Graphics (JSL)]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =   14.71  max =   15.37  avg =   14.90
           mobilenet  min =   15.38  max =   16.34  avg =   16.07
        mobilenet_v2  min =   13.58  max =   14.52  avg =   14.23
        mobilenet_v3  min =   14.95  max =   15.81  avg =   15.20
          shufflenet  min =   11.93  max =   12.73  avg =   12.31
       shufflenet_v2  min =   14.47  max =   14.74  avg =   14.60
             mnasnet  min =   15.32  max =   17.13  avg =   15.95
     proxylessnasnet  min =   15.34  max =   16.25  avg =   15.66
     efficientnet_b0  min =   26.02  max =   26.19  avg =   26.11
   efficientnetv2_b0  min =   75.92  max =   76.18  avg =   76.07
        regnety_400m  min =   17.79  max =   18.00  avg =   17.91
           blazeface  min =    5.03  max =    5.96  avg =    5.65
           googlenet  min =   35.20  max =   35.40  avg =   35.32
            resnet18  min =   35.49  max =   35.61  avg =   35.56
             alexnet  min =   40.93  max =   41.25  avg =   41.11
               vgg16  min =  220.66  max =  222.18  avg =  221.42
            resnet50  min =   78.10  max =   78.48  avg =   78.28
      squeezenet_ssd  min =   46.90  max =   47.46  avg =   47.26
       mobilenet_ssd  min =   33.33  max =   33.54  avg =   33.44
      mobilenet_yolo  min =   67.54  max =   67.77  avg =   67.64
  mobilenetv2_yolov3  min =   38.98  max =   39.69  avg =   39.37
         yolov4-tiny  min =   68.01  max =   69.74  avg =   68.86
           nanodet_m  min =   17.41  max =   18.13  avg =   17.78
    yolo-fastest-1.1  min =   13.91  max =   14.18  avg =   14.03
      yolo-fastestv2  min =   15.94  max =   16.02  avg =   15.97
```

### NVIDIA RTX2060 of Notebook
```
C:\Users\ai\AppData\Local\Temp\benchmark>benchncnn.exe 64 1 0 0 0
[0 GeForce RTX 2060]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[0 GeForce RTX 2060]  buglssc=0  bugihfa=0
[0 GeForce RTX 2060]  fp16p=1  fp16s=1  fp16a=1  int8s=1  int8a=1
loop_count = 64
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =    2.14  max =    2.93  avg =    2.26
           mobilenet  min =    2.08  max =    2.53  avg =    2.22
        mobilenet_v2  min =    2.81  max =    4.03  avg =    3.05
        mobilenet_v3  min =    2.90  max =    3.53  avg =    3.08
          shufflenet  min =    1.94  max =    4.27  avg =    2.55
       shufflenet_v2  min =    2.34  max =    2.97  avg =    2.49
             mnasnet  min =    2.11  max =    2.86  avg =    2.37
     proxylessnasnet  min =    2.27  max =    3.25  avg =    2.49
           googlenet  min =    4.34  max =    6.79  avg =    5.25
            resnet18  min =    2.60  max =    4.36  avg =    2.90
             alexnet  min =    2.79  max =    4.70  avg =    3.04
               vgg16  min =   11.40  max =   14.32  avg =   12.42
            resnet50  min =    5.26  max =    5.86  avg =    5.51
      squeezenet_ssd  min =    5.58  max =    7.94  avg =    6.56
       mobilenet_ssd  min =    3.47  max =    5.29  avg =    3.77
      mobilenet_yolo  min =    5.49  max =    6.19  avg =    5.70
  mobilenetv2_yolov3  min =    3.69  max =    5.14  avg =    3.91
```

### NVIDIA RTX A3000 of Notebook (6GB)
```
cx@HP-ZBook-Fury-15-6-inch-G8-Mobile-Workstation-PC:~/ncnn/build/benchmark$ ./benchncnn 10 1 0 1
[0 Intel(R) UHD Graphics (TGL GT1)]  queueC=0[1]  queueG=0[1]  queueT=0[1]
[0 Intel(R) UHD Graphics (TGL GT1)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 Intel(R) UHD Graphics (TGL GT1)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 Intel(R) UHD Graphics (TGL GT1)]  subgroup=32  basic/vote/ballot/shuffle=1/1/1/1
[0 Intel(R) UHD Graphics (TGL GT1)]  fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0
[1 NVIDIA RTX A3000 Laptop GPU]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[1 NVIDIA RTX A3000 Laptop GPU]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[1 NVIDIA RTX A3000 Laptop GPU]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[1 NVIDIA RTX A3000 Laptop GPU]  subgroup=32  basic/vote/ballot/shuffle=1/1/1/1
[1 NVIDIA RTX A3000 Laptop GPU]  fp16-matrix-16_8_8/16_8_16/16_16_16=1/1/1
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = 1
cooling_down = 1
          squeezenet  min =    1.49  max =    1.94  avg =    1.74
     squeezenet_int8  min =    6.13  max =    6.20  avg =    6.16
           mobilenet  min =    4.05  max =    4.82  avg =    4.65
      mobilenet_int8  min =   10.24  max =   10.29  avg =   10.26
        mobilenet_v2  min =    0.98  max =    1.14  avg =    1.03
        mobilenet_v3  min =    1.74  max =    1.82  avg =    1.77
          shufflenet  min =    1.43  max =   30.51  avg =    9.51
       shufflenet_v2  min =    3.43  max =    3.89  avg =    3.77
             mnasnet  min =    6.50  max =    6.75  avg =    6.62
     proxylessnasnet  min =    6.46  max =    7.28  avg =    7.00
     efficientnet_b0  min =    3.14  max =   15.11  avg =    7.29
   efficientnetv2_b0  min =   18.50  max =   20.13  avg =   19.17
        regnety_400m  min =    2.16  max =    3.57  avg =    2.70
           blazeface  min =    2.52  max =    2.76  avg =    2.65
           googlenet  min =    2.67  max =   14.67  avg =    9.85
      googlenet_int8  min =   19.08  max =   19.40  avg =   19.19
            resnet18  min =    5.19  max =    9.44  avg =    8.48
       resnet18_int8  min =   16.57  max =   17.69  avg =   16.96
             alexnet  min =    1.98  max =    3.24  avg =    2.23
               vgg16  min =    3.59  max =   12.34  avg =   10.99
          vgg16_int8  min =  110.63  max =  124.31  avg =  118.16
            resnet50  min =    3.01  max =    4.93  avg =    3.77
       resnet50_int8  min =   41.58  max =   44.80  avg =   43.24
      squeezenet_ssd  min =    4.08  max =    4.70  avg =    4.32
 squeezenet_ssd_int8  min =   17.32  max =   17.92  avg =   17.46
       mobilenet_ssd  min =    2.26  max =    8.23  avg =    5.57
  mobilenet_ssd_int8  min =   20.35  max =   21.89  avg =   20.76
      mobilenet_yolo  min =    2.14  max =   16.94  avg =    6.44
  mobilenetv2_yolov3  min =    3.64  max =    5.09  avg =    4.02
         yolov4-tiny  min =   10.94  max =   17.46  avg =   13.58
           nanodet_m  min =    6.57  max =   13.91  avg =    9.82
    yolo-fastest-1.1  min =    5.40  max =   14.22  avg =   10.78
      yolo-fastestv2  min =    7.49  max =    9.43  avg =    7.99
  vision_transformer  min =   76.04  max =   76.96  avg =   76.43
          FastestDet  min =    6.31  max =    6.60  avg =    6.43
```

### NVIDIA RTX2080 of Desktop
```
E:\projects\framework\ncnn\benchmark>benchncnn.exe 4096 1 0 0 0
[0 GeForce RTX 2080]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[0 GeForce RTX 2080]  buglssc=0  bugihfa=0
[0 GeForce RTX 2080]  fp16p=1  fp16s=1  fp16a=1  int8s=1  int8a=1
loop_count = 4096
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =    1.39  max =   16.70  avg =    1.49
           mobilenet  min =    1.32  max =    2.55  avg =    1.42
        mobilenet_v2  min =    1.88  max =    5.02  avg =    2.00
        mobilenet_v3  min =    2.31  max =    3.58  avg =    2.45
          shufflenet  min =    1.45  max =    2.65  avg =    1.55
       shufflenet_v2  min =    1.90  max =    3.21  avg =    2.03
             mnasnet  min =    1.95  max =    3.17  avg =    2.09
     proxylessnasnet  min =    2.02  max =    2.95  avg =    2.16
           googlenet  min =    3.81  max =    5.91  avg =    4.05
            resnet18  min =    2.10  max =    3.28  avg =    2.24
             alexnet  min =    2.15  max =    3.35  avg =    2.30
               vgg16  min =    7.33  max =   11.12  avg =    7.80
            resnet50  min =    4.21  max =    6.70  avg =    4.49
      squeezenet_ssd  min =    4.58  max =    6.86  avg =    4.88
       mobilenet_ssd  min =    2.90  max =    4.52  avg =    3.09
      mobilenet_yolo  min =    4.15  max =    6.09  avg =    4.40
  mobilenetv2_yolov3  min =    3.04  max =    9.13  avg =    3.28
```

### NVIDIA Jetson AGX Xavier (Carmel 2.2 GHz x 8 + Volta Tensor Cores 64)
```
i@ubuntu:~/projects/ncnn/benchmark$ ./benchncnn 32 1 0 -1 0
loop_count = 32
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   22.31  max =   23.29  avg =   22.68
     squeezenet_int8  min =   47.64  max =   52.88  avg =   49.72
           mobilenet  min =   37.50  max =   38.45  avg =   37.85
      mobilenet_int8  min =   89.14  max =   92.38  avg =   90.95
        mobilenet_v2  min =   24.31  max =   25.53  avg =   24.68
        mobilenet_v3  min =   20.20  max =   21.21  avg =   20.56
          shufflenet  min =   14.85  max =   15.64  avg =   15.15
       shufflenet_v2  min =   14.34  max =   16.11  avg =   14.86
             mnasnet  min =   23.42  max =   23.86  avg =   23.56
     proxylessnasnet  min =   27.44  max =   28.83  avg =   27.83
     efficientnet_b0  min =   34.57  max =   37.84  avg =   35.13
   efficientnetv2_b0  min =   65.16  max =   68.67  avg =   66.76
        regnety_400m  min =   33.86  max =   34.49  avg =   34.17
           blazeface  min =   11.86  max =   14.15  avg =   12.52
           googlenet  min =   83.19  max =   89.84  avg =   85.14
      googlenet_int8  min =  146.74  max =  155.25  avg =  151.14
            resnet18  min =   50.46  max =   57.80  avg =   53.40
       resnet18_int8  min =  108.43  max =  116.14  avg =  110.78
             alexnet  min =   56.59  max =   64.93  avg =   59.51
               vgg16  min =  266.78  max =  272.16  avg =  269.14
          vgg16_int8  min =  538.71  max =  551.55  avg =  544.78
            resnet50  min =  169.11  max =  172.26  avg =  170.51
       resnet50_int8  min =  370.55  max =  384.36  avg =  377.75
      squeezenet_ssd  min =   58.51  max =   67.88  avg =   62.78
 squeezenet_ssd_int8  min =   95.34  max =  106.49  avg =   97.99
       mobilenet_ssd  min =   83.52  max =   86.84  avg =   84.86
  mobilenet_ssd_int8  min =  172.70  max =  181.84  avg =  176.25
      mobilenet_yolo  min =  165.26  max =  167.74  avg =  166.51
  mobilenetv2_yolov3  min =   88.11  max =   90.29  avg =   89.19
         yolov4-tiny  min =  105.44  max =  109.24  avg =  107.07
           nanodet_m  min =   33.60  max =   37.02  avg =   34.39
    yolo-fastest-1.1  min =   13.56  max =   14.22  avg =   13.75
      yolo-fastestv2  min =   13.76  max =   14.59  avg =   14.02
i@ubuntu:~/projects/ncnn/benchmark$ ./benchncnn 32 2 0 -1 0
loop_count = 32
num_threads = 2
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   13.05  max =   13.76  avg =   13.36
     squeezenet_int8  min =   26.08  max =   28.09  avg =   26.69
           mobilenet  min =   20.61  max =   21.21  avg =   20.81
      mobilenet_int8  min =   44.72  max =   47.33  avg =   45.76
        mobilenet_v2  min =   14.67  max =   15.23  avg =   14.86
        mobilenet_v3  min =   12.59  max =   15.50  avg =   13.36
          shufflenet  min =   12.74  max =   14.14  avg =   13.31
       shufflenet_v2  min =   10.05  max =   10.89  avg =   10.40
             mnasnet  min =   14.02  max =   14.75  avg =   14.19
     proxylessnasnet  min =   16.05  max =   16.94  avg =   16.31
     efficientnet_b0  min =   20.47  max =   23.05  avg =   20.81
   efficientnetv2_b0  min =   37.51  max =   41.53  avg =   39.19
        regnety_400m  min =   25.21  max =   25.73  avg =   25.39
           blazeface  min =    7.30  max =    8.44  avg =    7.43
           googlenet  min =   42.52  max =   47.38  avg =   44.39
      googlenet_int8  min =   76.38  max =   81.63  avg =   77.93
            resnet18  min =   26.76  max =   28.72  avg =   27.22
       resnet18_int8  min =   55.97  max =   61.57  avg =   57.26
             alexnet  min =   29.29  max =   33.20  avg =   31.03
               vgg16  min =  134.32  max =  138.65  avg =  136.05
          vgg16_int8  min =  267.70  max =  281.71  avg =  272.79
            resnet50  min =   87.22  max =   88.75  avg =   87.65
       resnet50_int8  min =  183.80  max =  192.17  avg =  187.25
      squeezenet_ssd  min =   35.80  max =   39.00  avg =   37.32
 squeezenet_ssd_int8  min =   53.56  max =   60.43  avg =   55.58
       mobilenet_ssd  min =   44.17  max =   48.30  avg =   44.70
  mobilenet_ssd_int8  min =   90.32  max =   94.09  avg =   92.27
      mobilenet_yolo  min =   87.50  max =   89.63  avg =   88.33
  mobilenetv2_yolov3  min =   49.76  max =   51.58  avg =   50.44
         yolov4-tiny  min =   61.17  max =   64.41  avg =   62.15
           nanodet_m  min =   21.43  max =   22.47  avg =   21.82
    yolo-fastest-1.1  min =   10.90  max =   12.63  avg =   11.12
      yolo-fastestv2  min =   10.61  max =   11.11  avg =   10.82
i@ubuntu:~/projects/ncnn/benchmark$ ./benchncnn 32 4 0 -1 0
loop_count = 32
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    8.06  max =    8.79  avg =    8.39
     squeezenet_int8  min =   14.96  max =   16.64  avg =   15.37
           mobilenet  min =   11.24  max =   11.91  avg =   11.48
      mobilenet_int8  min =   23.63  max =   24.75  avg =   23.81
        mobilenet_v2  min =    9.27  max =    9.97  avg =    9.44
        mobilenet_v3  min =    8.81  max =   10.06  avg =    9.07
          shufflenet  min =   11.22  max =   11.53  avg =   11.37
       shufflenet_v2  min =    7.81  max =    8.17  avg =    7.97
             mnasnet  min =    9.40  max =   10.49  avg =   10.06
     proxylessnasnet  min =   10.53  max =   10.73  avg =   10.62
     efficientnet_b0  min =   13.55  max =   15.14  avg =   13.80
   efficientnetv2_b0  min =   19.83  max =   21.95  avg =   21.09
        regnety_400m  min =   21.80  max =   22.91  avg =   22.13
           blazeface  min =    5.17  max =    6.27  avg =    5.31
           googlenet  min =   22.67  max =   25.35  avg =   23.10
      googlenet_int8  min =   43.19  max =   45.68  avg =   43.72
            resnet18  min =   15.19  max =   16.14  avg =   15.42
       resnet18_int8  min =   31.22  max =   34.76  avg =   31.81
             alexnet  min =   15.20  max =   17.65  avg =   15.56
               vgg16  min =   70.76  max =   73.21  avg =   71.70
          vgg16_int8  min =  137.94  max =  143.50  avg =  139.54
            resnet50  min =   47.15  max =   47.91  avg =   47.40
       resnet50_int8  min =   99.80  max =  102.94  avg =  100.29
      squeezenet_ssd  min =   22.10  max =   24.11  avg =   22.46
 squeezenet_ssd_int8  min =   33.21  max =   35.98  avg =   33.98
       mobilenet_ssd  min =   25.09  max =   26.81  avg =   25.50
  mobilenet_ssd_int8  min =   48.15  max =   50.96  avg =   49.49
      mobilenet_yolo  min =   48.63  max =   49.02  avg =   48.84
  mobilenetv2_yolov3  min =   30.93  max =   31.41  avg =   31.13
         yolov4-tiny  min =   38.43  max =   41.20  avg =   39.28
           nanodet_m  min =   14.95  max =   15.74  avg =   15.35
    yolo-fastest-1.1  min =    8.89  max =    9.18  avg =    9.01
      yolo-fastestv2  min =    8.36  max =    9.28  avg =    8.50
i@ubuntu:~/projects/ncnn/benchmark$ ./benchncnn 32 8 0 -1 0
loop_count = 32
num_threads = 8
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    6.52  max =   74.10  avg =   12.94
     squeezenet_int8  min =   10.44  max =   18.81  avg =   12.15
           mobilenet  min =    7.49  max =   14.63  avg =    8.67
      mobilenet_int8  min =   13.80  max =   15.89  avg =   14.53
        mobilenet_v2  min =    8.15  max =   11.42  avg =    8.78
        mobilenet_v3  min =    7.60  max =   10.92  avg =    8.38
          shufflenet  min =   11.51  max =   19.48  avg =   12.97
       shufflenet_v2  min =    7.06  max =   15.58  avg =    9.48
             mnasnet  min =    7.77  max =   15.12  avg =    8.68
     proxylessnasnet  min =    8.54  max =   42.73  avg =   10.00
     efficientnet_b0  min =   11.11  max =   12.86  avg =   11.89
   efficientnetv2_b0  min =   17.17  max =   29.03  avg =   20.48
        regnety_400m  min =   22.41  max =   36.72  avg =   25.49
           blazeface  min =    4.93  max =   11.62  avg =    6.13
           googlenet  min =   17.02  max =   31.61  avg =   19.92
      googlenet_int8  min =   27.70  max =   35.49  avg =   29.18
            resnet18  min =    9.74  max =   18.78  avg =   11.40
       resnet18_int8  min =   18.52  max =   24.70  avg =   19.32
             alexnet  min =   10.70  max =   15.41  avg =   11.39
               vgg16  min =   40.80  max =   54.47  avg =   42.72
          vgg16_int8  min =   74.71  max =   79.66  avg =   76.37
            resnet50  min =   28.21  max =   36.62  avg =   29.41
       resnet50_int8  min =   54.53  max =   76.02  avg =   56.81
      squeezenet_ssd  min =   19.01  max =   30.68  avg =   24.89
 squeezenet_ssd_int8  min =   27.61  max =   35.87  avg =   29.22
       mobilenet_ssd  min =   17.35  max =   22.87  avg =   18.55
  mobilenet_ssd_int8  min =   29.92  max =   36.35  avg =   31.15
      mobilenet_yolo  min =   31.63  max =   55.61  avg =   34.31
  mobilenetv2_yolov3  min =   23.75  max =   35.45  avg =   25.68
         yolov4-tiny  min =   29.23  max =   70.12  avg =   31.94
           nanodet_m  min =   13.00  max =   21.72  avg =   15.39
    yolo-fastest-1.1  min =    9.72  max =   17.94  avg =   11.45
      yolo-fastestv2  min =    9.16  max =   16.35  avg =   11.08
i@ubuntu:~/projects/ncnn/benchmark$ ./benchncnn 128 1 0 0 0
[0 NVIDIA Tegra Xavier (nvgpu)]  queueC=2[8]  queueG=0[16]  queueT=1[1]
[0 NVIDIA Tegra Xavier (nvgpu)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 NVIDIA Tegra Xavier (nvgpu)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 NVIDIA Tegra Xavier (nvgpu)]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 128
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =    4.85  max =   19.65  avg =    6.83
     squeezenet_int8  min =   46.38  max =   49.70  avg =   47.22
           mobilenet  min =    5.62  max =    6.61  avg =    6.33
      mobilenet_int8  min =   87.42  max =   92.95  avg =   90.52
        mobilenet_v2  min =    5.96  max =    7.53  avg =    6.50
        mobilenet_v3  min =    6.77  max =    7.83  avg =    7.01
          shufflenet  min =   10.58  max =   18.46  avg =   13.68
       shufflenet_v2  min =   20.06  max =   21.09  avg =   20.37
             mnasnet  min =    6.49  max =   26.49  avg =    8.26
     proxylessnasnet  min =    6.75  max =   27.37  avg =    7.88
     efficientnet_b0  min =   12.11  max =   48.35  avg =   14.63
   efficientnetv2_b0  min =   24.61  max =   69.68  avg =   34.33
        regnety_400m  min =    9.02  max =   34.40  avg =   10.84
           blazeface  min =    7.55  max =    8.10  avg =    7.78
           googlenet  min =   12.57  max =   65.14  avg =   18.91
      googlenet_int8  min =  145.74  max =  155.87  avg =  151.06
            resnet18  min =    8.88  max =   30.48  avg =    9.34
       resnet18_int8  min =  109.19  max =  116.78  avg =  111.52
             alexnet  min =    9.06  max =   54.53  avg =   19.04
               vgg16  min =   18.12  max =   37.31  avg =   19.65
          vgg16_int8  min =  530.60  max =  551.58  avg =  542.33
            resnet50  min =   11.62  max =   20.64  avg =   12.17
       resnet50_int8  min =  374.83  max =  384.79  avg =  379.50
      squeezenet_ssd  min =   14.01  max =   55.88  avg =   23.64
 squeezenet_ssd_int8  min =   89.86  max =   95.80  avg =   92.18
       mobilenet_ssd  min =   13.20  max =   13.61  avg =   13.37
  mobilenet_ssd_int8  min =  170.17  max =  181.48  avg =  174.93
      mobilenet_yolo  min =   11.78  max =   20.42  avg =   13.34
  mobilenetv2_yolov3  min =   18.08  max =   62.94  avg =   26.70
         yolov4-tiny  min =   26.44  max =   34.83  avg =   31.83
           nanodet_m  min =    7.93  max =    9.91  avg =    9.01
    yolo-fastest-1.1  min =    6.03  max =   20.85  avg =    8.42
      yolo-fastestv2  min =    9.01  max =   20.60  avg =   12.51
```

### MacBook Pro (13-inch, M1, 2020)
```
MacBook-Pro benchmark % ./benchncnn 10 1 0 -1 0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    4.80  max =    5.05  avg =    4.86
     squeezenet_int8  min =    4.02  max =    4.13  avg =    4.04
           mobilenet  min =    9.09  max =    9.41  avg =    9.22
      mobilenet_int8  min =    4.65  max =    4.76  avg =    4.70
        mobilenet_v2  min =    5.64  max =    5.83  avg =    5.73
        mobilenet_v3  min =    4.64  max =    4.85  avg =    4.76
          shufflenet  min =    3.48  max =    3.63  avg =    3.56
       shufflenet_v2  min =    3.69  max =    3.81  avg =    3.73
             mnasnet  min =    5.67  max =    5.94  avg =    5.77
     proxylessnasnet  min =    7.03  max =    7.28  avg =    7.20
     efficientnet_b0  min =    9.13  max =    9.53  avg =    9.28
   efficientnetv2_b0  min =   17.37  max =   18.47  avg =   17.63
        regnety_400m  min =    7.64  max =    8.08  avg =    7.72
           blazeface  min =    1.80  max =    1.89  avg =    1.83
           googlenet  min =   25.71  max =   25.90  avg =   25.81
      googlenet_int8  min =   16.89  max =   17.10  avg =   16.97
            resnet18  min =   17.16  max =   17.28  avg =   17.20
       resnet18_int8  min =   15.55  max =   15.75  avg =   15.64
             alexnet  min =   30.60  max =   31.11  avg =   30.69
               vgg16  min =   73.41  max =   75.37  avg =   73.91
          vgg16_int8  min =  103.81  max =  105.15  avg =  104.19
            resnet50  min =   43.47  max =   44.24  avg =   43.68
       resnet50_int8  min =   30.37  max =   35.25  avg =   31.61
      squeezenet_ssd  min =   20.97  max =   21.21  avg =   21.12
 squeezenet_ssd_int8  min =   19.34  max =   19.54  avg =   19.42
       mobilenet_ssd  min =   22.18  max =   22.58  avg =   22.28
  mobilenet_ssd_int8  min =   13.27  max =   15.31  avg =   14.05
      mobilenet_yolo  min =   40.78  max =   41.04  avg =   40.89
  mobilenetv2_yolov3  min =   20.87  max =   21.92  avg =   21.02
         yolov4-tiny  min =   30.73  max =   32.37  avg =   31.29
           nanodet_m  min =    8.54  max =    8.86  avg =    8.65


MacBook-Pro benchmark % ./benchncnn 10 8 0 0 0
[0 Apple M1]  queueC=0[1]  queueG=0[1]  queueT=0[1]
[0 Apple M1]  bugsbn1=0  bugbilz=151  bugcopc=0  bugihfa=0
[0 Apple M1]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 Apple M1]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 10
num_threads = 8
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =    1.86  max =    2.22  avg =    2.01
     squeezenet_int8  min =    2.38  max =    8.40  avg =    5.13
           mobilenet  min =    2.50  max =    2.91  avg =    2.64
      mobilenet_int8  min =    2.29  max =    5.26  avg =    3.54
        mobilenet_v2  min =    2.93  max =    3.12  avg =    2.98
        mobilenet_v3  min =    3.36  max =    3.61  avg =    3.48
          shufflenet  min =    1.99  max =    2.54  avg =    2.18
       shufflenet_v2  min =    2.35  max =    2.84  avg =    2.52
             mnasnet  min =    2.81  max =    3.33  avg =    2.92
     proxylessnasnet  min =    3.21  max =    3.62  avg =    3.36
     efficientnet_b0  min =    4.74  max =    5.73  avg =    5.07
   efficientnetv2_b0  min =   12.04  max =   13.04  avg =   12.61
        regnety_400m  min =    3.86  max =    4.04  avg =    3.98
           blazeface  min =    0.98  max =    1.11  avg =    1.03
           googlenet  min =    4.86  max =    5.38  avg =    5.02
      googlenet_int8  min =    9.43  max =   15.72  avg =   10.44
            resnet18  min =    3.92  max =    4.59  avg =    4.24
       resnet18_int8  min =    6.83  max =    7.57  avg =    7.35
             alexnet  min =    7.49  max =    7.87  avg =    7.65
               vgg16  min =   34.10  max =   35.29  avg =   34.60
          vgg16_int8  min =   40.09  max =   44.66  avg =   41.95
            resnet50  min =    7.22  max =    7.83  avg =    7.42
       resnet50_int8  min =   14.52  max =   20.56  avg =   15.78
      squeezenet_ssd  min =    8.52  max =   13.79  avg =    9.98
 squeezenet_ssd_int8  min =   12.38  max =   15.44  avg =   13.37
       mobilenet_ssd  min =    4.83  max =    6.00  avg =    5.31
  mobilenet_ssd_int8  min =    7.26  max =   13.12  avg =    9.01
      mobilenet_yolo  min =    7.22  max =    8.66  avg =    7.99
  mobilenetv2_yolov3  min =    7.46  max =    8.06  avg =    7.80
         yolov4-tiny  min =   12.17  max =   13.95  avg =   12.82
           nanodet_m  min =    3.54  max =    4.78  avg =    3.86
```

### MacBook Air (13-inch, M3, 2024)
```
MacBook-Air benchmark % ./benchncnn 10 1 0 -1 0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    3.59  max =    4.20  avg =    3.80
     squeezenet_int8  min =    2.61  max =    2.82  avg =    2.74
           mobilenet  min =    6.67  max =    6.92  avg =    6.85
      mobilenet_int8  min =    3.61  max =    3.66  avg =    3.62
        mobilenet_v2  min =    4.08  max =    4.15  avg =    4.10
        mobilenet_v3  min =    3.32  max =    3.44  avg =    3.34
          shufflenet  min =    2.08  max =    2.13  avg =    2.10
       shufflenet_v2  min =    2.35  max =    2.44  avg =    2.37
             mnasnet  min =    4.14  max =    4.23  avg =    4.18
     proxylessnasnet  min =    5.09  max =    5.15  avg =    5.11
     efficientnet_b0  min =    6.67  max =    6.75  avg =    6.70
   efficientnetv2_b0  min =    8.79  max =    8.83  avg =    8.81
        regnety_400m  min =    5.68  max =    5.73  avg =    5.69
           blazeface  min =    0.75  max =    0.77  avg =    0.76
           googlenet  min =   15.94  max =   15.97  avg =   15.96
      googlenet_int8  min =   10.88  max =   10.92  avg =   10.89
            resnet18  min =   12.60  max =   12.63  avg =   12.61
       resnet18_int8  min =    9.88  max =    9.95  avg =    9.90
             alexnet  min =   12.72  max =   12.82  avg =   12.77
               vgg16  min =   57.85  max =   61.44  avg =   58.40
          vgg16_int8  min =   78.53  max =   79.85  avg =   78.83
            resnet50  min =   34.79  max =   34.85  avg =   34.81
       resnet50_int8  min =   20.56  max =   20.62  avg =   20.58
      squeezenet_ssd  min =    9.64  max =    9.82  avg =    9.69
 squeezenet_ssd_int8  min =    8.21  max =    8.34  avg =    8.25
       mobilenet_ssd  min =   14.21  max =   14.34  avg =   14.25
  mobilenet_ssd_int8  min =    7.35  max =    7.41  avg =    7.37
      mobilenet_yolo  min =   31.61  max =   31.74  avg =   31.64
  mobilenetv2_yolov3  min =   15.79  max =   15.87  avg =   15.83
         yolov4-tiny  min =   22.93  max =   22.99  avg =   22.96
           nanodet_m  min =    5.58  max =    5.62  avg =    5.59
    yolo-fastest-1.1  min =    2.00  max =    2.05  avg =    2.01
      yolo-fastestv2  min =    1.75  max =    1.77  avg =    1.76
  vision_transformer  min = 1020.57  max = 1046.02  avg = 1028.75
          FastestDet  min =    1.88  max =    1.93  avg =    1.89

MacBook-Air benchmark % ./benchncnn 10 8 0 0 0
[0 Apple M3]  queueC=0[1]  queueG=0[1]  queueT=0[1]
[0 Apple M3]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 Apple M3]  fp16-p/s/u/a=1/1/1/1  int8-p/s/u/a=1/1/1/1
[0 Apple M3]  subgroup=32  basic/vote/ballot/shuffle=1/1/1/1
[0 Apple M3]  fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0
loop_count = 10
num_threads = 8
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =    1.79  max =    2.48  avg =    2.16
     squeezenet_int8  min =    2.78  max =    2.93  avg =    2.80
           mobilenet  min =    1.40  max =    1.85  avg =    1.68
      mobilenet_int8  min =    3.60  max =    3.67  avg =    3.61
        mobilenet_v2  min =    1.68  max =    2.28  avg =    1.97
        mobilenet_v3  min =    1.71  max =    2.29  avg =    2.00
          shufflenet  min =    1.18  max =    2.49  avg =    1.78
       shufflenet_v2  min =    1.45  max =    2.09  avg =    1.70
             mnasnet  min =    1.74  max =    2.25  avg =    2.05
     proxylessnasnet  min =    1.75  max =    2.18  avg =    2.02
     efficientnet_b0  min =    2.71  max =    3.19  avg =    2.99
   efficientnetv2_b0  min =    6.77  max =    7.04  avg =    6.88
        regnety_400m  min =    1.94  max =    2.40  avg =    2.10
           blazeface  min =    1.05  max =    1.43  avg =    1.24
           googlenet  min =    3.99  max =    4.42  avg =    4.27
      googlenet_int8  min =   10.83  max =   10.86  avg =   10.85
            resnet18  min =    2.50  max =    2.77  avg =    2.70
       resnet18_int8  min =    9.86  max =    9.91  avg =    9.88
             alexnet  min =    2.99  max =    3.28  avg =    3.11
               vgg16  min =   12.41  max =   13.13  avg =   12.54
          vgg16_int8  min =   78.52  max =   78.67  avg =   78.61
            resnet50  min =    5.46  max =    5.52  avg =    5.49
       resnet50_int8  min =   20.57  max =   20.59  avg =   20.58
      squeezenet_ssd  min =    3.86  max =    4.53  avg =    4.17
 squeezenet_ssd_int8  min =    8.20  max =    8.35  avg =    8.25
       mobilenet_ssd  min =    3.19  max =    3.75  avg =    3.52
  mobilenet_ssd_int8  min =    7.35  max =    7.41  avg =    7.37
      mobilenet_yolo  min =    4.77  max =    4.88  avg =    4.81
  mobilenetv2_yolov3  min =    4.28  max =    4.88  avg =    4.62
         yolov4-tiny  min =    6.76  max =    7.38  avg =    7.21
           nanodet_m  min =    2.92  max =    4.71  avg =    3.46
    yolo-fastest-1.1  min =    1.48  max =    2.04  avg =    1.87
      yolo-fastestv2  min =    1.41  max =    1.97  avg =    1.74
  vision_transformer  min =   80.34  max =   80.66  avg =   80.44
          FastestDet  min =    1.43  max =    2.04  avg =    1.73
```

### Ingenic T40XP XBurst2 Core X2 1.4GHz (without MSA)
```
loop_count = 8
num_threads = 2
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =  921.23  max =  944.03  avg =  930.71
     squeezenet_int8  min = 3280.89  max = 3404.83  avg = 3359.68
           mobilenet  min = 1277.61  max = 1298.51  avg = 1284.38
      mobilenet_int8  min = 4342.67  max = 4350.21  avg = 4345.85
        mobilenet_v2  min =  780.92  max =  783.93  avg =  782.79
        mobilenet_v3  min =  650.59  max =  655.08  avg =  652.06
          shufflenet  min =  352.75  max =  353.69  avg =  353.24
       shufflenet_v2  min =  362.82  max =  364.08  avg =  363.38
             mnasnet  min =  790.45  max =  791.89  avg =  790.99
     proxylessnasnet  min =  868.71  max =  870.47  avg =  869.17
     efficientnet_b0  min = 1491.44  max = 1492.36  avg = 1491.95
   efficientnetv2_b0  min = 2135.04  max = 2148.02  avg = 2139.99
        regnety_400m  min = 1000.53  max = 1005.29  avg = 1001.81
           blazeface  min =  102.72  max =  104.18  avg =  103.51
           googlenet  min = 3652.89  max = 3705.40  avg = 3675.43
      googlenet_int8  min = 8067.30  max = 8070.22  avg = 8069.21
```

### MacBook Pro (15-inch, 2019) - 2.6GHz six-core Intel Core i7, Radeon Pro 555X 4GB, Intel UHD Graphics 630 1536MB
```

➜  benchmark git:(master) ✗ ./benchncnn 10 1 0 -1
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   14.68  max =   17.06  avg =   15.55
     squeezenet_int8  min =   51.64  max =   57.85  avg =   54.01
           mobilenet  min =   20.74  max =   25.38  avg =   22.77
      mobilenet_int8  min =   66.84  max =   91.01  avg =   75.69
        mobilenet_v2  min =   14.04  max =   20.06  avg =   16.36
        mobilenet_v3  min =   11.89  max =   16.22  avg =   13.58
          shufflenet  min =   13.74  max =   17.10  avg =   15.02
       shufflenet_v2  min =   12.73  max =   14.36  avg =   13.53
             mnasnet  min =   11.05  max =   17.79  avg =   13.82
     proxylessnasnet  min =   12.60  max =   27.38  avg =   17.55
     efficientnet_b0  min =   23.73  max =   26.82  avg =   25.45
   efficientnetv2_b0  min =   27.03  max =   33.89  avg =   30.78
        regnety_400m  min =   13.81  max =   21.50  avg =   15.40
           blazeface  min =    3.72  max =    4.98  avg =    4.43
           googlenet  min =   65.88  max =   76.62  avg =   69.40
      googlenet_int8  min =  192.07  max =  227.85  avg =  203.81
            resnet18  min =   79.45  max =   90.41  avg =   85.32
       resnet18_int8  min =  201.71  max =  222.31  avg =  207.39
             alexnet  min =   70.67  max =   80.13  avg =   74.43
               vgg16  min =  233.74  max =  261.62  avg =  250.99
          vgg16_int8  min = 1722.78  max = 1997.14  avg = 1772.71
            resnet50  min =  130.39  max =  135.31  avg =  133.27
       resnet50_int8  min =  439.69  max =  483.78  avg =  461.33
      squeezenet_ssd  min =  108.54  max =  122.15  avg =  115.02
 squeezenet_ssd_int8  min =  175.58  max =  185.09  avg =  181.33
       mobilenet_ssd  min =   51.89  max =   59.32  avg =   54.30
  mobilenet_ssd_int8  min =  140.15  max =  192.10  avg =  164.47
      mobilenet_yolo  min =  117.37  max =  131.89  avg =  126.34
  mobilenetv2_yolov3  min =   57.57  max =   72.29  avg =   64.92
         yolov4-tiny  min =  114.45  max =  123.15  avg =  116.91
           nanodet_m  min =   25.65  max =   33.27  avg =   28.75

➜  benchmark git:(master) ✗ ./benchncnn 10 1 0 0
[0 AMD Radeon Pro 555X]  queueC=0[1]  queueG=0[1]  queueT=0[1]
[0 AMD Radeon Pro 555X]  bugsbn1=0  bugbilz=196  bugcopc=0  bugihfa=0
[0 AMD Radeon Pro 555X]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 AMD Radeon Pro 555X]  subgroup=64  basic=0  vote=0  ballot=0  shuffle=0
[1 Intel(R) UHD Graphics 630]  queueC=0[1]  queueG=0[1]  queueT=0[1]
[1 Intel(R) UHD Graphics 630]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[1 Intel(R) UHD Graphics 630]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[1 Intel(R) UHD Graphics 630]  subgroup=32  basic=0  vote=0  ballot=0  shuffle=0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 1
          squeezenet  min =    6.66  max =    7.30  avg =    6.91
     squeezenet_int8  min =   49.97  max =   60.92  avg =   53.86
           mobilenet  min =    6.99  max =    7.48  avg =    7.17
      mobilenet_int8  min =   70.46  max =   83.20  avg =   79.33
        mobilenet_v2  min =    9.56  max =   10.87  avg =   10.34
        mobilenet_v3  min =   11.48  max =   12.20  avg =   11.94
          shufflenet  min =    4.52  max =    5.25  avg =    4.96
       shufflenet_v2  min =    7.29  max =    9.65  avg =    7.99
             mnasnet  min =    9.82  max =   11.88  avg =   10.62
     proxylessnasnet  min =    7.85  max =    8.41  avg =    8.07
     efficientnet_b0  min =   17.34  max =   17.85  avg =   17.56
   efficientnetv2_b0  min =   21.95  max =   24.10  avg =   23.15
        regnety_400m  min =   13.54  max =   14.83  avg =   14.11
           blazeface  min =    3.26  max =    6.59  avg =    5.50
           googlenet  min =   17.62  max =   19.47  avg =   18.27
      googlenet_int8  min =  198.88  max =  247.97  avg =  223.31
            resnet18  min =   11.10  max =   12.01  avg =   11.59
       resnet18_int8  min =  225.56  max =  259.39  avg =  238.97
             alexnet  min =   17.66  max =   19.19  avg =   18.24
               vgg16  min =   53.20  max =   54.88  avg =   53.73
          vgg16_int8  min = 1747.52  max = 2130.08  avg = 1880.42
            resnet50  min =   27.38  max =   28.84  avg =   28.34
       resnet50_int8  min =  461.86  max =  579.83  avg =  528.15
      squeezenet_ssd  min =   19.99  max =   20.98  avg =   20.50
 squeezenet_ssd_int8  min =  185.20  max =  209.66  avg =  196.81
       mobilenet_ssd  min =   12.81  max =   14.21  avg =   13.48
  mobilenet_ssd_int8  min =  139.29  max =  168.38  avg =  148.20
      mobilenet_yolo  min =   19.50  max =   20.51  avg =   19.97
  mobilenetv2_yolov3  min =   15.95  max =   19.28  avg =   16.85
         yolov4-tiny  min =   21.43  max =   23.42  avg =   22.28
           nanodet_m  min =    7.95  max =    9.23  avg =    8.48

➜  benchmark git:(master) ✗ ./benchncnn 10 1 0 1
[0 AMD Radeon Pro 555X]  queueC=0[1]  queueG=0[1]  queueT=0[1]
[0 AMD Radeon Pro 555X]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 AMD Radeon Pro 555X]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 AMD Radeon Pro 555X]  subgroup=64  basic=0  vote=0  ballot=0  shuffle=0
[1 Intel(R) UHD Graphics 630]  queueC=0[1]  queueG=0[1]  queueT=0[1]
[1 Intel(R) UHD Graphics 630]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[1 Intel(R) UHD Graphics 630]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[1 Intel(R) UHD Graphics 630]  subgroup=32  basic=0  vote=0  ballot=0  shuffle=0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = 1
cooling_down = 1
          squeezenet  min =   11.06  max =   13.22  avg =   12.09
     squeezenet_int8  min =   54.87  max =   64.55  avg =   59.84
           mobilenet  min =   13.65  max =   16.70  avg =   14.81
      mobilenet_int8  min =   72.36  max =   93.58  avg =   86.40
        mobilenet_v2  min =   11.88  max =   15.90  avg =   13.47
        mobilenet_v3  min =   12.68  max =   16.16  avg =   14.56
          shufflenet  min =   13.87  max =   16.68  avg =   14.93
       shufflenet_v2  min =   11.73  max =   13.65  avg =   12.87
             mnasnet  min =   12.71  max =   15.56  avg =   14.22
     proxylessnasnet  min =   14.03  max =   17.28  avg =   15.37
     efficientnet_b0  min =   17.50  max =   21.46  avg =   19.30
   efficientnetv2_b0  min =   35.47  max =   38.58  avg =   36.89
        regnety_400m  min =   16.00  max =   19.45  avg =   17.48
           blazeface  min =    6.08  max =    7.18  avg =    6.39
           googlenet  min =   23.35  max =   29.68  avg =   25.77
      googlenet_int8  min =  198.49  max =  254.38  avg =  222.77
            resnet18  min =   21.85  max =   28.10  avg =   24.70
       resnet18_int8  min =  211.21  max =  279.55  avg =  222.64
             alexnet  min =   24.45  max =   30.47  avg =   26.87
               vgg16  min =  115.20  max =  117.76  avg =  116.48
          vgg16_int8  min = 1715.92  max = 1960.02  avg = 1800.21
            resnet50  min =   45.65  max =   46.25  avg =   46.05
       resnet50_int8  min =  448.13  max =  555.53  avg =  485.47
      squeezenet_ssd  min =   28.43  max =   33.26  avg =   29.85
 squeezenet_ssd_int8  min =  180.91  max =  202.51  avg =  190.84
       mobilenet_ssd  min =   21.03  max =   26.93  avg =   23.48
  mobilenet_ssd_int8  min =  154.41  max =  184.64  avg =  165.04
      mobilenet_yolo  min =   37.04  max =   38.64  avg =   37.52
  mobilenetv2_yolov3  min =   24.98  max =   30.03  avg =   27.70
         yolov4-tiny  min =   39.29  max =   50.25  avg =   44.18
           nanodet_m  min =   15.97  max =   20.27  avg =   17.93
```

### Sunway SW421 (sw_64 1.7GHz * 4)
```
root@SW421:~/Desktop/ncnn-20220420/ncnn-20220420/build/benchmark$ ./benchncnn
loop_count = 4
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  943.61  max =  966.98  avg =  955.24
     squeezenet_int8  min =  654.75  max =  731.28  avg =  674.87
           mobilenet  min = 1584.87  max = 1612.88  avg = 1597.47
      mobilenet_int8  min = 1198.21  max = 1204.82  avg = 1201.61
        mobilenet_v2  min =  733.94  max =  754.79  avg =  744.48
        mobilenet_v3  min =  665.26  max =  683.81  avg =  675.18
          shufflenet  min =  401.53  max =  435.21  avg =  420.32
       shufflenet_v2  min =  294.65  max =  316.50  avg =  309.08
             mnasnet  min =  671.22  max =  808.46  avg =  713.01
     proxylessnasnet  min =  686.12  max =  698.13  avg =  692.29
     efficientnet_b0  min = 1151.75  max = 1184.86  avg = 1161.33
   efficientnetv2_b0  min = 1372.05  max = 1395.22  avg = 1379.47
        regnety_400m  min =  933.93  max =  949.42  avg =  942.43
           blazeface  min =  104.72  max =  136.77  avg =  112.86
           googlenet  min = 2574.02  max = 4330.81  avg = 3015.56
      googlenet_int8  min = 2136.42  max = 2183.61  avg = 2166.45
            resnet18  min = 2511.12  max = 2537.42  avg = 2526.08
       resnet18_int8  min = 2003.84  max = 2027.50  avg = 2012.48
             alexnet  min =  668.28  max =  686.35  avg =  673.95
               vgg16  min = 24863.92  max = 24967.94  avg = 24907.39
          vgg16_int8  min = 18735.54  max = 18926.83  avg = 18859.32
            resnet50  min = 9896.47  max = 9981.13  avg = 9929.77
       resnet50_int8  min = 6971.01  max = 7085.29  avg = 7017.88
      squeezenet_ssd  min = 1798.23  max = 1814.25  avg = 1806.57
 squeezenet_ssd_int8  min = 1586.11  max = 1606.83  avg = 1596.75
       mobilenet_ssd  min = 3995.54  max = 4018.27  avg = 4002.78
  mobilenet_ssd_int8  min = 2753.65  max = 2766.06  avg = 2760.04
      mobilenet_yolo  min = 10892.22  max = 10978.84  avg = 10921.00
  mobilenetv2_yolov3  min = 3600.80  max = 3607.72  avg = 3603.18
         yolov4-tiny  min = 5565.82  max = 5582.22  avg = 5571.78
           nanodet_m  min = 1182.97  max = 1220.47  avg = 1199.30
    yolo-fastest-1.1  min =  340.63  max =  360.95  avg =  349.15
      yolo-fastestv2  min =  255.47  max =  281.79  avg =  268.82
```

### Sunway SW831 (sw_64 2.5GHz * 8)
```
root@SW831:~/Desktop/ncnn_20221128/build/benchmark$ ./benchncnn 5 8 2 -1 0
loop_count = 5
num_threads = 8
powersave = 2
gpu_device = -1
cooling_down = 0
          squeezenet  min =  343.27  max =  420.86  avg =  364.97
     squeezenet_int8  min =  237.91  max =  251.71  avg =  243.84
           mobilenet  min =  607.80  max =  696.04  avg =  646.61
      mobilenet_int8  min =  428.37  max =  499.32  avg =  460.21
        mobilenet_v2  min =  291.29  max =  381.93  avg =  311.76
        mobilenet_v3  min =  262.01  max =  287.93  avg =  277.29
          shufflenet  min =  144.89  max =  169.10  avg =  150.84
       shufflenet_v2  min =  121.44  max =  139.62  avg =  126.96
             mnasnet  min =  265.59  max =  353.84  avg =  288.79
     proxylessnasnet  min =  272.08  max =  293.19  avg =  284.61
     efficientnet_b0  min =  445.40  max =  508.36  avg =  467.84
   efficientnetv2_b0  min =  550.57  max =  619.16  avg =  581.85
        regnety_400m  min =  374.02  max =  460.64  avg =  394.49
           blazeface  min =   39.93  max =   59.19  avg =   44.14
           googlenet  min =  941.35  max = 1014.23  avg =  976.37
      googlenet_int8  min =  770.66  max =  827.44  avg =  797.93
            resnet18  min =  815.02  max =  895.13  avg =  843.57
       resnet18_int8  min =  701.10  max =  776.40  avg =  729.49
             alexnet  min =  216.74  max =  273.39  avg =  228.99
               vgg16  min = 8645.55  max = 8699.60  avg = 8681.61
          vgg16_int8  min = 6786.91  max = 6930.90  avg = 6854.29
            resnet50  min = 3624.02  max = 3698.91  avg = 3652.31
       resnet50_int8  min = 2537.92  max = 2618.10  avg = 2567.88
      squeezenet_ssd  min =  635.25  max =  693.23  avg =  663.56
 squeezenet_ssd_int8  min =  577.37  max =  641.12  avg =  603.34
       mobilenet_ssd  min = 1529.35  max = 1711.54  avg = 1582.10
  mobilenet_ssd_int8  min =  982.65  max = 1042.82  avg = 1016.62
      mobilenet_yolo  min = 4053.62  max = 4124.84  avg = 4094.38
  mobilenetv2_yolov3  min = 1367.81  max = 1527.79  avg = 1433.04
         yolov4-tiny  min = 1943.20  max = 2028.02  avg = 1978.31
           nanodet_m  min =  433.66  max =  498.83  avg =  457.77
    yolo-fastest-1.1  min =  140.07  max =  284.35  avg =  192.46
      yolo-fastestv2  min =  123.91  max =  225.70  avg =  152.54
  vision_transformer  min = 2470.70  max = 2509.73  avg = 2486.40
          FastestDet  min =  145.30  max =  163.43  avg =  154.35
```

### AXERA AX620A (Cortex-A7 1.0GHz * 4)
```
/root/axera # ./benchncnn 4 1 0 -1 0
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =  530.57  max =  533.11  avg =  532.22
     squeezenet_int8  min =  359.74  max =  360.02  avg =  359.86
           mobilenet  min =  920.12  max =  921.04  avg =  920.52
      mobilenet_int8  min =  532.60  max =  533.08  avg =  532.81
        mobilenet_v2  min =  608.81  max =  609.49  avg =  609.18
        mobilenet_v3  min =  531.43  max =  532.34  avg =  531.90
          shufflenet  min =  297.91  max =  300.08  avg =  299.06
       shufflenet_v2  min =  288.44  max =  289.30  avg =  288.79
             mnasnet  min =  590.29  max =  590.99  avg =  590.63
     proxylessnasnet  min =  678.22  max =  679.22  avg =  678.63
     efficientnet_b0  min = 1041.41  max = 1043.79  avg = 1042.61
   efficientnetv2_b0  min = 1222.41  max = 1223.63  avg = 1222.91
        regnety_400m  min =  723.83  max =  725.37  avg =  724.64
           blazeface  min =   86.77  max =   87.21  avg =   86.92
           googlenet  min = 1740.32  max = 1741.44  avg = 1740.81
      googlenet_int8  min = 1167.95  max = 1169.18  avg = 1168.54
            resnet18  min = 1584.41  max = 1585.36  avg = 1584.97
       resnet18_int8  min =  915.78  max =  918.77  avg =  917.16
             alexnet  min = 1811.30  max = 1812.86  avg = 1812.07
            resnet50  min = 4516.48  max = 4523.48  avg = 4519.03
       resnet50_int8  min = 2573.18  max = 2574.29  avg = 2573.69
      squeezenet_ssd  min = 1191.79  max = 1193.71  avg = 1193.02
 squeezenet_ssd_int8  min =  862.36  max =  863.69  avg =  862.83
       mobilenet_ssd  min = 1950.48  max = 1950.98  avg = 1950.65
  mobilenet_ssd_int8  min = 1081.70  max = 1082.64  avg = 1082.20
      mobilenet_yolo  min = 4629.22  max = 4630.23  avg = 4629.69
  mobilenetv2_yolov3  min = 2233.05  max = 2234.14  avg = 2233.42
         yolov4-tiny  min = 2942.58  max = 2946.55  avg = 2944.81
           nanodet_m  min =  692.19  max =  693.36  avg =  692.79
    yolo-fastest-1.1  min =  333.62  max =  334.43  avg =  334.00
      yolo-fastestv2  min =  256.41  max =  257.32  avg =  256.83


/root/axera # ./benchncnn 4 4 0 -1 0
loop_count = 4
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =  150.38  max =  179.83  avg =  157.90
     squeezenet_int8  min =  106.97  max =  107.43  avg =  107.22
           mobilenet  min =  248.92  max =  273.98  avg =  255.72
      mobilenet_int8  min =  139.49  max =  139.65  avg =  139.60
        mobilenet_v2  min =  174.67  max =  204.35  avg =  182.30
        mobilenet_v3  min =  152.17  max =  152.54  avg =  152.30
          shufflenet  min =   98.74  max =  125.99  avg =  105.74
       shufflenet_v2  min =  103.44  max =  103.88  avg =  103.65
             mnasnet  min =  167.63  max =  197.54  avg =  175.28
     proxylessnasnet  min =  186.02  max =  186.32  avg =  186.15
     efficientnet_b0  min =  284.35  max =  318.17  avg =  292.90
   efficientnetv2_b0  min =  329.56  max =  359.71  avg =  337.22
        regnety_400m  min =  246.91  max =  277.08  avg =  254.71
           blazeface  min =   30.95  max =   31.31  avg =   31.16
           googlenet  min =  474.87  max =  504.38  avg =  489.43
      googlenet_int8  min =  322.06  max =  331.97  avg =  324.57
            resnet18  min =  440.03  max =  475.28  avg =  456.70
       resnet18_int8  min =  252.01  max =  280.64  avg =  259.22
             alexnet  min =  453.16  max =  478.80  avg =  465.88
            resnet50  min = 1214.70  max = 1252.42  avg = 1229.22
       resnet50_int8  min =  684.53  max =  715.65  avg =  706.14
      squeezenet_ssd  min =  358.84  max =  393.45  avg =  367.77
 squeezenet_ssd_int8  min =  281.56  max =  312.86  avg =  289.85
       mobilenet_ssd  min =  519.11  max =  559.14  avg =  538.41
  mobilenet_ssd_int8  min =  284.58  max =  310.02  avg =  291.02
      mobilenet_yolo  min = 1238.87  max = 1284.74  avg = 1260.51
  mobilenetv2_yolov3  min =  624.42  max =  665.81  avg =  642.15
         yolov4-tiny  min =  826.46  max =  852.97  avg =  844.88
           nanodet_m  min =  246.76  max =  279.09  avg =  255.04
    yolo-fastest-1.1  min =  116.12  max =  116.95  avg =  116.50
      yolo-fastestv2  min =   91.08  max =  102.93  avg =   94.41
```

### AMD Ryzen 7 5700G (Zen3 3.8 GHz ~ 4.6 GHz x 8)
Tested in WSL2 with Ubuntu 20.04
```
$ ./benchncnn  10 1 0 -1 0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    6.53  max =    7.05  avg =    6.77
     squeezenet_int8  min =   17.72  max =   17.86  avg =   17.79
           mobilenet  min =   11.43  max =   11.98  avg =   11.64
      mobilenet_int8  min =   22.91  max =   24.48  avg =   23.26
        mobilenet_v2  min =    8.28  max =    9.29  avg =    8.66
        mobilenet_v3  min =    6.86  max =    6.98  avg =    6.94
          shufflenet  min =    3.75  max =    4.64  avg =    3.91
       shufflenet_v2  min =    5.08  max =    5.80  avg =    5.22
             mnasnet  min =    7.54  max =    8.60  avg =    7.81
     proxylessnasnet  min =    9.18  max =   10.33  avg =    9.41
     efficientnet_b0  min =   22.57  max =   23.67  avg =   22.93
   efficientnetv2_b0  min =   21.23  max =   22.08  avg =   21.45
        regnety_400m  min =   10.56  max =   10.80  avg =   10.63
           blazeface  min =    1.08  max =    1.17  avg =    1.11
           googlenet  min =   27.91  max =   29.51  avg =   28.28
      googlenet_int8  min =   71.00  max =   86.86  avg =   72.74
            resnet18  min =   20.11  max =   20.56  avg =   20.26
       resnet18_int8  min =   63.80  max =   65.13  avg =   64.19
             alexnet  min =   20.64  max =   24.25  avg =   21.65
               vgg16  min =  119.99  max =  125.45  avg =  121.59
          vgg16_int8  min =  268.11  max =  270.41  avg =  269.15
            resnet50  min =   55.42  max =   56.29  avg =   55.70
       resnet50_int8  min =  126.73  max =  132.37  avg =  128.72
      squeezenet_ssd  min =   28.41  max =   30.30  avg =   29.20
 squeezenet_ssd_int8  min =   41.12  max =   42.53  avg =   41.52
       mobilenet_ssd  min =   24.15  max =   24.91  avg =   24.33
  mobilenet_ssd_int8  min =   46.06  max =   59.19  avg =   49.87
      mobilenet_yolo  min =   67.58  max =   73.19  avg =   68.99
  mobilenetv2_yolov3  min =   29.44  max =   30.46  avg =   29.78
         yolov4-tiny  min =   41.89  max =   43.47  avg =   42.37
           nanodet_m  min =   11.23  max =   11.47  avg =   11.36
    yolo-fastest-1.1  min =    3.86  max =    4.64  avg =    4.04
      yolo-fastestv2  min =    3.43  max =    3.99  avg =    3.56
  vision_transformer  min = 1590.86  max = 1593.97  avg = 1591.91


$ ./benchncnn  10 16 0 -1 0
loop_count = 10
num_threads = 16
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    2.94  max =    4.66  avg =    3.31
     squeezenet_int8  min =    3.53  max =    5.26  avg =    3.92
           mobilenet  min =    3.96  max =    5.30  avg =    4.21
      mobilenet_int8  min =    4.27  max =    4.56  avg =    4.35
        mobilenet_v2  min =    3.63  max =    4.20  avg =    3.82
        mobilenet_v3  min =    3.25  max =    4.79  avg =    3.58
          shufflenet  min =    2.98  max =    3.59  avg =    3.12
       shufflenet_v2  min =    2.62  max =    5.93  avg =    3.04
             mnasnet  min =    3.09  max =    3.49  avg =    3.28
     proxylessnasnet  min =    3.57  max =    4.18  avg =    3.76
     efficientnet_b0  min =    5.98  max =    6.48  avg =    6.18
   efficientnetv2_b0  min =    6.96  max =    7.48  avg =    7.13
        regnety_400m  min =    8.71  max =   11.89  avg =    9.61
           blazeface  min =    0.86  max =    0.96  avg =    0.89
           googlenet  min =   10.75  max =   11.33  avg =   11.00
      googlenet_int8  min =   12.75  max =   15.47  avg =   13.50
            resnet18  min =    8.92  max =   16.08  avg =   10.08
       resnet18_int8  min =   10.55  max =   10.99  avg =   10.69
             alexnet  min =    9.95  max =   10.45  avg =   10.17
               vgg16  min =   52.28  max =   53.69  avg =   52.89
          vgg16_int8  min =   44.90  max =   47.90  avg =   45.61
            resnet50  min =   17.80  max =   21.43  avg =   18.66
       resnet50_int8  min =   21.80  max =   25.42  avg =   22.75
      squeezenet_ssd  min =   14.49  max =   16.36  avg =   14.90
 squeezenet_ssd_int8  min =   10.02  max =   10.49  avg =   10.28
       mobilenet_ssd  min =    7.20  max =    7.86  avg =    7.51
  mobilenet_ssd_int8  min =    8.51  max =   10.90  avg =    9.09
      mobilenet_yolo  min =   35.67  max =   44.84  avg =   37.33
  mobilenetv2_yolov3  min =   12.72  max =   17.16  avg =   13.67
         yolov4-tiny  min =   20.81  max =   22.11  avg =   21.33
           nanodet_m  min =    5.13  max =   42.12  avg =    9.07
    yolo-fastest-1.1  min =    3.05  max =    4.72  avg =    3.39
      yolo-fastestv2  min =    3.33  max =    3.73  avg =    3.44
  vision_transformer  min =  214.91  max =  229.91  avg =  220.82
```

### Intel Celeron M 420 (Yonah 1.60 GHz x 1)

Tested on `Debian GNU/Linux 11 (bullseye) i686` with `cmake -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_AVX512=OFF -DNCNN_BUILD_TESTS=ON ..`.

```
mouri@Mouri-Laptop-2:~/ncnn/benchmark$ ./../build/benchmark/benchncnn
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  289.23  max =  301.83  avg =  292.90
     squeezenet_int8  min =  442.82  max =  457.21  avg =  446.89
           mobilenet  min =  549.62  max =  561.20  avg =  554.78
      mobilenet_int8  min =  823.92  max =  837.70  avg =  830.52
        mobilenet_v2  min =  341.72  max =  353.77  avg =  345.34
        mobilenet_v3  min =  267.68  max =  282.08  avg =  273.10
          shufflenet  min =  151.66  max =  153.02  avg =  152.24
       shufflenet_v2  min =  161.54  max =  163.38  avg =  162.13
             mnasnet  min =  322.66  max =  336.91  avg =  326.86
     proxylessnasnet  min =  356.63  max =  368.79  avg =  360.66
     efficientnet_b0  min =  489.92  max =  505.11  avg =  497.32
   efficientnetv2_b0  min =  618.16  max =  632.02  avg =  622.82
        regnety_400m  min =  414.83  max =  428.42  avg =  419.28
           blazeface  min =   38.56  max =   40.05  avg =   39.05
           googlenet  min = 1022.54  max = 1037.53  avg = 1029.48
      googlenet_int8  min = 1493.35  max = 1495.46  avg = 1494.31
            resnet18  min =  803.32  max =  818.27  avg =  812.49
       resnet18_int8  min = 1188.26  max = 1200.88  avg = 1192.56
             alexnet  min =  613.78  max =  623.88  avg =  619.99
               vgg16  min = 4465.44  max = 4478.12  avg = 4474.16
          vgg16_int8  min = 6042.40  max = 6114.37  avg = 6077.07
            resnet50  min = 2517.75  max = 2528.42  avg = 2522.83
       resnet50_int8  min = 3746.28  max = 3771.09  avg = 3756.88
      squeezenet_ssd  min =  585.56  max =  636.01  avg =  602.62
 squeezenet_ssd_int8  min =  822.43  max =  968.77  avg =  862.33
       mobilenet_ssd  min = 1116.98  max = 1139.17  avg = 1127.65
  mobilenet_ssd_int8  min = 1665.03  max = 1670.55  avg = 1668.37
      mobilenet_yolo  min = 2638.61  max = 2666.54  avg = 2652.26
  mobilenetv2_yolov3  min = 1248.56  max = 1255.98  avg = 1251.22
         yolov4-tiny  min = 1507.31  max = 1525.56  avg = 1514.66
           nanodet_m  min =  386.41  max =  400.63  avg =  391.21
    yolo-fastest-1.1  min =  159.97  max =  164.53  avg =  161.41
      yolo-fastestv2  min =  134.29  max =  135.47  avg =  134.70
  vision_transformer  min = 22201.32  max = 22510.75  avg = 22315.09
          FastestDet  min =  146.94  max =  148.50  avg =  147.44
```

### VisionFive2, JH7110 (SiFive-U74(RV64GC) 1.5GHz x 4) riscv64 with PowerVR B-Series BXE-4-32
Tested on Debian 11 with g++ 12.2.0 and Vulkan 1.3.231
```
user@starfive:~/Downloads/ncnn-master/benchmark$ ./benchncnn 10 4 0 -1 0
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =  149.06  max =  149.33  avg =  149.17
     squeezenet_int8  min = 1318.66  max = 1349.04  avg = 1328.87
           mobilenet  min =  255.13  max =  255.71  avg =  255.39
      mobilenet_int8  min = 2025.40  max = 2036.00  avg = 2031.67
        mobilenet_v2  min =  173.92  max =  174.60  avg =  174.31
        mobilenet_v3  min =  166.58  max =  167.30  avg =  167.02
          shufflenet  min =   91.36  max =   91.72  avg =   91.57
       shufflenet_v2  min =   83.50  max =   83.95  avg =   83.76
             mnasnet  min =  190.42  max =  191.15  avg =  190.66
     proxylessnasnet  min =  226.35  max =  226.81  avg =  226.52
     efficientnet_b0  min =  342.74  max =  343.62  avg =  343.15
   efficientnetv2_b0  min =  343.31  max =  344.23  avg =  343.80
        regnety_400m  min =  227.04  max =  227.75  avg =  227.43
           blazeface  min =   26.18  max =   26.43  avg =   26.28
           googlenet  min =  506.76  max =  508.58  avg =  507.84
      googlenet_int8  min = 3827.36  max = 3856.05  avg = 3835.67
            resnet18  min =  401.12  max =  402.27  avg =  401.61
       resnet18_int8  min = 4053.06  max = 4069.98  avg = 4061.63
             alexnet  min =  297.81  max =  320.09  avg =  301.39
               vgg16  min = 2338.76  max = 2351.23  avg = 2346.19
          vgg16_int8  min = 36846.41  max = 36929.56  avg = 36886.26
            resnet50  min = 1189.88  max = 1211.10  avg = 1193.34
       resnet50_int8  min = 11819.59  max = 11884.94  avg = 11845.22
      squeezenet_ssd  min =  351.71  max =  352.73  avg =  352.30
 squeezenet_ssd_int8  min = 2872.00  max = 2903.35  avg = 2891.01
       mobilenet_ssd  min =  530.92  max =  531.73  avg =  531.28
  mobilenet_ssd_int8  min = 4511.56  max = 4553.41  avg = 4523.51
      mobilenet_yolo  min = 1357.14  max = 1359.82  avg = 1358.83
  mobilenetv2_yolov3  min =  621.15  max =  622.29  avg =  621.66
         yolov4-tiny  min =  803.06  max =  809.19  avg =  805.79
           nanodet_m  min =  220.82  max =  221.18  avg =  221.06
    yolo-fastest-1.1  min =  102.59  max =  103.98  avg =  102.93
      yolo-fastestv2  min =   89.61  max =   90.03  avg =   89.76
  vision_transformer  min = 15862.96  max = 15897.17  avg = 15878.22
          FastestDet  min =  108.69  max =  109.00  avg =  108.84

user@starfive:~/Downloads/ncnn-master/benchmark$ ./benchncnn 10 4 1 -1 0
loop_count = 10
num_threads = 4
powersave = 1
gpu_device = -1
cooling_down = 0
          squeezenet  min =  148.62  max =  148.95  avg =  148.82
     squeezenet_int8  min = 1324.10  max = 1339.58  avg = 1332.57
           mobilenet  min =  255.67  max =  256.20  avg =  255.93
      mobilenet_int8  min = 2024.72  max = 2028.23  avg = 2026.29
        mobilenet_v2  min =  173.76  max =  174.73  avg =  174.31
        mobilenet_v3  min =  166.66  max =  167.28  avg =  166.99
          shufflenet  min =   91.18  max =   91.68  avg =   91.46
       shufflenet_v2  min =   83.88  max =   84.84  avg =   84.26
             mnasnet  min =  190.23  max =  190.84  avg =  190.45
     proxylessnasnet  min =  226.02  max =  226.82  avg =  226.38
     efficientnet_b0  min =  342.95  max =  343.52  avg =  343.25
   efficientnetv2_b0  min =  343.07  max =  343.80  avg =  343.39
        regnety_400m  min =  226.96  max =  227.62  avg =  227.24
           blazeface  min =   26.08  max =   26.32  avg =   26.18
           googlenet  min =  508.30  max =  510.34  avg =  509.27
      googlenet_int8  min = 3825.65  max = 3858.90  avg = 3833.79
            resnet18  min =  400.69  max =  403.18  avg =  401.74
       resnet18_int8  min = 4055.41  max = 4123.79  avg = 4067.55
             alexnet  min =  296.35  max =  300.46  avg =  299.11
               vgg16  min = 2337.68  max = 2349.78  avg = 2344.77
          vgg16_int8  min = 36760.47  max = 36985.40  avg = 36918.31
            resnet50  min = 1190.13  max = 1221.98  avg = 1196.77
       resnet50_int8  min = 11816.03  max = 11869.41  avg = 11843.72
      squeezenet_ssd  min =  351.24  max =  352.20  avg =  351.89
 squeezenet_ssd_int8  min = 2873.40  max = 2902.55  avg = 2891.58
       mobilenet_ssd  min =  530.45  max =  531.85  avg =  530.91
  mobilenet_ssd_int8  min = 4504.87  max = 4564.64  avg = 4528.56
      mobilenet_yolo  min = 1357.83  max = 1360.48  avg = 1358.75
  mobilenetv2_yolov3  min =  621.00  max =  621.76  avg =  621.35
         yolov4-tiny  min =  803.54  max =  808.00  avg =  806.16
           nanodet_m  min =  221.08  max =  222.57  avg =  221.72
    yolo-fastest-1.1  min =  102.79  max =  103.15  avg =  102.95
      yolo-fastestv2  min =   89.56  max =   89.79  avg =   89.70
  vision_transformer  min = 15874.12  max = 15907.97  avg = 15883.26
          FastestDet  min =  108.22  max =  108.64  avg =  108.36

user@starfive:~/Downloads/ncnn-master/benchmark$ ./benchncnn 10 1 1 0 0
[0 PowerVR B-Series BXE-4-32]  queueC=0[2]  queueG=0[2]  queueT=0[2]
[0 PowerVR B-Series BXE-4-32]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 PowerVR B-Series BXE-4-32]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 PowerVR B-Series BXE-4-32]  subgroup=1  basic/vote/ballot/shuffle=1/1/1/1
[0 PowerVR B-Series BXE-4-32]  fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0
loop_count = 10
num_threads = 1
powersave = 1
gpu_device = 0
cooling_down = 0
          squeezenet  min =  355.26  max =  356.42  avg =  355.75
     squeezenet_int8  min = 5171.49  max = 5187.42  avg = 5178.45
           mobilenet  min =  757.04  max =  762.74  avg =  759.77
      mobilenet_int8  min = 7695.03  max = 7715.39  avg = 7705.16
        mobilenet_v2  min =  476.20  max =  477.19  avg =  476.94
        mobilenet_v3  min =  403.12  max =  405.44  avg =  405.09
          shufflenet  min =  181.02  max =  182.32  avg =  181.96
       shufflenet_v2  min =  257.29  max =  259.06  avg =  258.57
             mnasnet  min =  495.78  max =  497.44  avg =  496.89
     proxylessnasnet  min =  562.60  max =  563.02  avg =  562.83
     efficientnet_b0  min =  660.29  max =  664.73  avg =  662.97
   efficientnetv2_b0  min =  856.88  max =  864.96  avg =  861.30
        regnety_400m  min =  492.79  max =  495.44  avg =  494.51
           blazeface  min =   65.95  max =   68.72  avg =   68.19
           googlenet  min = 1132.70  max = 1134.65  avg = 1133.50
      googlenet_int8  min = 14978.60  max = 15000.89  avg = 14988.56
            resnet18  min = 1155.15  max = 1172.06  avg = 1160.64
       resnet18_int8  min = 15776.36  max = 15790.48  avg = 15782.76
             alexnet  min =  601.09  max =  606.63  avg =  603.81
               vgg16  min = 5558.47  max = 5613.23  avg = 5586.98
          vgg16_int8  min = 143936.04  max = 144068.45  avg = 143991.58
            resnet50  min = 3425.81  max = 3440.51  avg = 3434.73
       resnet50_int8  min = 44780.92  max = 45144.97  avg = 45038.46
      squeezenet_ssd  min =  967.46  max =  978.39  avg =  972.76
 squeezenet_ssd_int8  min = 10842.39  max = 10999.00  avg = 10940.15
       mobilenet_ssd  min = 1565.15  max = 1570.11  avg = 1568.87
  mobilenet_ssd_int8  min = 17317.40  max = 17386.46  avg = 17361.80
      mobilenet_yolo  min = 3559.36  max = 3570.38  avg = 3568.84
  mobilenetv2_yolov3  min = 1731.98  max = 1739.52  avg = 1735.33
         yolov4-tiny  min = 1984.22  max = 2001.65  avg = 1993.20
           nanodet_m  min =  603.06  max =  609.65  avg =  607.79
    yolo-fastest-1.1  min =  306.30  max =  312.33  avg =  310.63
      yolo-fastestv2  min =  201.45  max =  207.44  avg =  205.93
  vision_transformer  min = 27310.74  max = 27358.54  avg = 27327.23
          FastestDet  min =  245.07  max =  248.81  avg =  248.14
```

### T-Head TH1520 (C910V, 1.848 GHz x 4 + BXM-4-64 PowerVR)

Tested on `Linux anolis-riscv 5.10.112-00579-g8e3db308d5a5 #23 SMP PREEMPT Fri Aug 12 10:17:32 CST 2022 riscv64 riscv64 riscv64 GNU/Linux`

```
[root@anolis-riscv benchmark]# ./benchncnn
syscall error -1
loop_count = 4
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =  187.88  max =  188.82  avg =  188.13
     squeezenet_int8  min = 2388.26  max = 2446.92  avg = 2411.46
           mobilenet  min =  321.46  max =  323.34  avg =  322.19
      mobilenet_int8  min = 2318.93  max = 2458.55  avg = 2400.99
        mobilenet_v2  min =  214.01  max =  216.00  avg =  215.35
        mobilenet_v3  min =  247.71  max =  248.18  avg =  247.96
          shufflenet  min =  155.58  max =  155.85  avg =  155.67
       shufflenet_v2  min =   99.50  max =   99.75  avg =   99.63
             mnasnet  min =  261.46  max =  263.83  avg =  262.53
     proxylessnasnet  min =  315.40  max =  316.89  avg =  316.28
     efficientnet_b0  min =  484.97  max =  486.16  avg =  485.55
   efficientnetv2_b0  min =  453.03  max =  453.40  avg =  453.21
        regnety_400m  min =  314.09  max =  315.33  avg =  314.77
           blazeface  min =   46.14  max =   46.69  avg =   46.39
           googlenet  min =  650.99  max =  653.60  avg =  651.69
      googlenet_int8  min = 5435.11  max = 6391.98  avg = 6012.81
            resnet18  min =  505.48  max =  506.70  avg =  506.06
       resnet18_int8  min = 5053.33  max = 6599.94  avg = 6001.86
             alexnet  min =  403.68  max =  404.60  avg =  404.23
               vgg16  min = 2731.55  max = 2746.48  avg = 2738.82
```

Tested on BeagleV-Ahead (`Linux ahead 5.10.113-ahead #2023.08.02.13.12+2c2096a98 SMP PREEMPT Wed Aug 2 13:13:02 UTC 2 riscv64 GNU/Linux`)

```
debian@ahead:~/ncnn/build/benchmark$ sudo ./benchncnn 10 1 0 0 0
[0 PowerVR B-Series BXM-4-64]  queueC=0[2]  queueG=0[2]  queueT=0[2]
[0 PowerVR B-Series BXM-4-64]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 PowerVR B-Series BXM-4-64]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 PowerVR B-Series BXM-4-64]  subgroup=1  basic/vote/ballot/shuffle=1/1/1/1
[0 PowerVR B-Series BXM-4-64]  fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =  287.88  max =  296.84  avg =  295.68
     squeezenet_int8  min = 2289.46  max = 2320.97  avg = 2306.60
           mobilenet  min =  584.32  max =  588.48  avg =  587.41
      mobilenet_int8  min = 2487.91  max = 2492.12  avg = 2489.64
        mobilenet_v2  min =  380.02  max =  386.67  avg =  385.75
        mobilenet_v3  min =  314.73  max =  328.84  avg =  325.76
          shufflenet  min =  146.96  max =  158.29  avg =  156.38
       shufflenet_v2  min =  203.94  max =  211.77  avg =  210.82
             mnasnet  min =  395.80  max =  404.95  avg =  403.80
     proxylessnasnet  min =  447.74  max =  456.89  avg =  454.87
     efficientnet_b0  min =  532.23  max =  543.05  avg =  538.53
   efficientnetv2_b0  min =  659.43  max =  681.64  avg =  669.13
        regnety_400m  min =  393.16  max =  407.27  avg =  403.81
           blazeface  min =   50.41  max =   61.83  avg =   56.92
           googlenet  min =  890.79  max =  898.09  avg =  896.25
      googlenet_int8  min = 4713.76  max = 5296.61  avg = 5044.39
            resnet18  min =  814.16  max =  824.53  avg =  820.35
       resnet18_int8  min = 4800.73  max = 6015.34  avg = 5765.47
             alexnet  min =  453.80  max =  465.51  avg =  462.11
               vgg16  min = 4016.26  max = 4027.30  avg = 4021.94
          vgg16_int8  min = 55069.69  max = 64814.86  avg = 59096.20
            resnet50  min = 2494.42  max = 2502.38  avg = 2500.28
       resnet50_int8  min = 15366.90  max = 17179.36  avg = 16701.20
      squeezenet_ssd  min =  724.36  max =  738.28  avg =  730.44
 squeezenet_ssd_int8  min = 4550.62  max = 5235.87  avg = 4684.19
       mobilenet_ssd  min = 1207.04  max = 1218.80  avg = 1212.86
  mobilenet_ssd_int8  min = 6019.61  max = 6349.35  avg = 6184.49
      mobilenet_yolo  min = 2736.28  max = 2747.06  avg = 2743.21
  mobilenetv2_yolov3  min = 1339.16  max = 1349.46  avg = 1344.81
         yolov4-tiny  min = 1457.05  max = 1459.04  avg = 1457.81
           nanodet_m  min =  443.40  max =  444.58  avg =  444.00
    yolo-fastest-1.1  min =  240.39  max =  248.05  avg =  247.04
      yolo-fastestv2  min =  162.71  max =  173.30  avg =  169.39
  vision_transformer  min = 17148.14  max = 17250.66  avg = 17202.60
          FastestDet  min =  199.71  max =  200.38  avg =  199.90
```

### CVITEK SG2000 (C906, 1 GHz x 1 + 700MHz x 1)
```
[root@milkv-duo]~/ncnn# ./benchncnn 4 1 2 -1 0
loop_count = 4
num_threads = 1
powersave = 2
gpu_device = -1
cooling_down = 0
          squeezenet  min =  221.53  max =  229.14  avg =  225.53
     squeezenet_int8  min = 8153.49  max = 8163.26  avg = 8160.17
           mobilenet  min =  329.60  max =  338.58  avg =  335.00
      mobilenet_int8  min = 12725.12  max = 12733.70  avg = 12728.52
        mobilenet_v2  min =  253.83  max =  260.60  avg =  257.20
        mobilenet_v3  min =  205.51  max =  212.72  avg =  209.26
          shufflenet  min =  358.73  max =  367.05  avg =  364.52
       shufflenet_v2  min =  238.44  max =  246.05  avg =  242.09
             mnasnet  min =  254.39  max =  258.26  avg =  255.63
     proxylessnasnet  min =  294.99  max =  302.80  avg =  300.65
        regnety_400m  min =  407.72  max =  409.69  avg =  409.03
           blazeface  min =  117.08  max =  124.26  avg =  119.00
           googlenet  min =  817.28  max =  824.70  avg =  820.70
      googlenet_int8  min = 18246.97  max = 18276.23  avg = 18261.11
            resnet18  min =  610.81  max =  618.87  avg =  613.91
       resnet18_int8  min = 18772.96  max = 18808.53  avg = 18786.88
             alexnet  min =  568.11  max =  577.02  avg =  570.66
      squeezenet_ssd  min =  890.76  max =  896.30  avg =  893.57
 squeezenet_ssd_int8  min = 31680.48  max = 31938.09  avg = 31810.68
       mobilenet_ssd  min =  746.38  max =  762.07  avg =  752.19
  mobilenet_ssd_int8  min = 41140.62  max = 41540.85  avg = 41356.70
      mobilenet_yolo  min = 1744.59  max = 1755.90  avg = 1750.05
  mobilenetv2_yolov3  min =  890.20  max =  897.86  avg =  895.14
         yolov4-tiny  min = 1056.03  max = 1059.44  avg = 1058.21
           nanodet_m  min =  547.85  max =  554.80  avg =  549.81
    yolo-fastest-1.1  min =  290.89  max =  298.31  avg =  296.24
      yolo-fastestv2  min =  188.59  max =  196.79  avg =  190.96
          FastestDet  min =  196.19  max =  205.96  avg =  200.99
```

### Rockchip RK3588 (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz)
Tested on a ROCK 5 Model B board.

```
rock@rock-5b:~/ncnn/build/benchmark$ ./benchncnn  10 1 0 -1 0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   15.22  max =   16.03  avg =   15.70
     squeezenet_int8  min =   16.77  max =   16.96  avg =   16.86
           mobilenet  min =   23.07  max =   23.58  avg =   23.36
      mobilenet_int8  min =   18.58  max =   18.90  avg =   18.72
        mobilenet_v2  min =   18.74  max =   19.10  avg =   18.96
        mobilenet_v3  min =   14.40  max =   14.65  avg =   14.50
          shufflenet  min =    9.74  max =    9.88  avg =    9.84
       shufflenet_v2  min =    9.44  max =    9.55  avg =    9.50
             mnasnet  min =   14.73  max =   15.03  avg =   14.87
     proxylessnasnet  min =   18.37  max =   18.59  avg =   18.46
     efficientnet_b0  min =   29.11  max =   30.18  avg =   29.63
   efficientnetv2_b0  min =   46.40  max =   46.95  avg =   46.76
        regnety_400m  min =   19.18  max =   19.39  avg =   19.28
           blazeface  min =    5.16  max =    5.23  avg =    5.20
           googlenet  min =   64.64  max =   65.33  avg =   65.00
      googlenet_int8  min =   61.86  max =   63.41  avg =   62.42
            resnet18  min =   42.00  max =   43.34  avg =   42.48
       resnet18_int8  min =   67.22  max =   67.80  avg =   67.45
             alexnet  min =   57.65  max =   58.21  avg =   58.01
               vgg16  min =  192.35  max =  193.36  avg =  192.84
          vgg16_int8  min =  570.86  max =  578.81  avg =  574.50
            resnet50  min =  107.86  max =  109.52  avg =  108.70
       resnet50_int8  min =  134.41  max =  135.86  avg =  135.18
      squeezenet_ssd  min =   40.85  max =   41.24  avg =   41.02
 squeezenet_ssd_int8  min =   52.23  max =   53.70  avg =   52.54
       mobilenet_ssd  min =   45.11  max =   45.50  avg =   45.32
  mobilenet_ssd_int8  min =   36.53  max =   36.63  avg =   36.59
      mobilenet_yolo  min =   95.18  max =   96.79  avg =   95.90
  mobilenetv2_yolov3  min =   65.50  max =   65.88  avg =   65.72
         yolov4-tiny  min =   86.13  max =   88.84  avg =   87.29
           nanodet_m  min =   22.57  max =   22.87  avg =   22.74
    yolo-fastest-1.1  min =    9.23  max =    9.35  avg =    9.29
      yolo-fastestv2  min =    8.62  max =    8.83  avg =    8.73
  vision_transformer  min = 3077.54  max = 3396.13  avg = 3339.58
          FastestDet  min =    9.11  max =    9.30  avg =    9.20

rock@rock-5b:~/ncnn/build/benchmark$ ./benchncnn  10 8 0 -1 0
loop_count = 10
num_threads = 8
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   10.02  max =   11.01  avg =   10.43
     squeezenet_int8  min =   11.78  max =   13.77  avg =   12.55
           mobilenet  min =   12.75  max =   13.58  avg =   13.12
      mobilenet_int8  min =   12.23  max =   14.29  avg =   13.54
        mobilenet_v2  min =   12.76  max =   14.27  avg =   13.40
        mobilenet_v3  min =    9.51  max =    9.81  avg =    9.71
          shufflenet  min =    7.06  max =    7.23  avg =    7.13
       shufflenet_v2  min =    6.21  max =    7.32  avg =    6.38
             mnasnet  min =    9.32  max =   12.49  avg =   10.75
     proxylessnasnet  min =   13.79  max =   15.51  avg =   14.70
     efficientnet_b0  min =   16.59  max =   17.99  avg =   17.08
   efficientnetv2_b0  min =   28.26  max =   32.26  avg =   30.52
        regnety_400m  min =   13.43  max =   15.00  avg =   13.72
           blazeface  min =    3.87  max =    7.38  avg =    5.65
           googlenet  min =   29.18  max =   44.00  avg =   36.31
      googlenet_int8  min =   31.14  max =   37.48  avg =   34.58
            resnet18  min =   21.47  max =   24.40  avg =   22.35
       resnet18_int8  min =   26.68  max =   29.89  avg =   28.45
             alexnet  min =   29.35  max =   38.09  avg =   31.65
               vgg16  min =  112.37  max =  122.94  avg =  117.05
          vgg16_int8  min =  161.08  max =  215.29  avg =  176.89
            resnet50  min =   54.54  max =   57.50  avg =   55.71
       resnet50_int8  min =   54.76  max =   65.05  avg =   60.59
      squeezenet_ssd  min =   26.21  max =   35.05  avg =   30.76
 squeezenet_ssd_int8  min =   33.34  max =   40.88  avg =   36.19
       mobilenet_ssd  min =   26.71  max =   28.85  avg =   27.88
  mobilenet_ssd_int8  min =   22.03  max =   25.31  avg =   24.21
      mobilenet_yolo  min =   60.51  max =   74.65  avg =   65.45
  mobilenetv2_yolov3  min =   37.27  max =   44.13  avg =   41.20
         yolov4-tiny  min =   49.84  max =   58.12  avg =   53.93
           nanodet_m  min =   16.54  max =   22.41  avg =   20.60
    yolo-fastest-1.1  min =    8.49  max =   13.50  avg =    9.91
      yolo-fastestv2  min =    6.28  max =   11.22  avg =    8.00
  vision_transformer  min =  968.62  max = 1063.47  avg = 1019.12
          FastestDet  min =    6.14  max =   11.92  avg =    7.85

rock@rock-5b:~/ncnn/build/benchmark$ ./benchncnn 10 4 2 -1 0
loop_count = 10
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 0
          squeezenet  min =    6.78  max =    7.27  avg =    7.07
     squeezenet_int8  min =    4.58  max =    4.73  avg =    4.63
           mobilenet  min =    5.67  max =    5.78  avg =    5.72
      mobilenet_int8  min =    5.01  max =    5.20  avg =    5.15
        mobilenet_v2  min =    5.44  max =    5.76  avg =    5.50
        mobilenet_v3  min =    4.67  max =    5.03  avg =    4.74
          shufflenet  min =    4.22  max =    4.30  avg =    4.27
       shufflenet_v2  min =    3.48  max =    3.60  avg =    3.53
             mnasnet  min =    4.52  max =    4.83  avg =    4.61
     proxylessnasnet  min =    5.44  max =    6.01  avg =    5.56
     efficientnet_b0  min =    8.33  max =    8.52  avg =    8.41
   efficientnetv2_b0  min =   12.95  max =   13.08  avg =   13.02
        regnety_400m  min =    8.60  max =    8.73  avg =    8.66
           blazeface  min =    1.86  max =    1.95  avg =    1.90
           googlenet  min =   16.58  max =   16.85  avg =   16.65
      googlenet_int8  min =   16.99  max =   17.13  avg =   17.06
            resnet18  min =   14.98  max =   15.30  avg =   15.08
       resnet18_int8  min =   20.10  max =   20.22  avg =   20.15
             alexnet  min =   19.78  max =   20.21  avg =   19.87
               vgg16  min =   66.35  max =   94.16  avg =   75.24
          vgg16_int8  min =  131.02  max =  131.98  avg =  131.51
            resnet50  min =   28.07  max =   28.78  avg =   28.28
       resnet50_int8  min =   33.56  max =   35.53  avg =   33.84
      squeezenet_ssd  min =   16.40  max =   16.80  avg =   16.49
 squeezenet_ssd_int8  min =   18.64  max =   19.00  avg =   18.76
       mobilenet_ssd  min =   13.66  max =   13.78  avg =   13.72
  mobilenet_ssd_int8  min =   11.23  max =   11.42  avg =   11.33
      mobilenet_yolo  min =   30.76  max =   31.03  avg =   30.86
  mobilenetv2_yolov3  min =   19.28  max =   21.07  avg =   20.30
         yolov4-tiny  min =   33.44  max =   37.68  avg =   34.70
           nanodet_m  min =    8.28  max =    8.55  avg =    8.38
    yolo-fastest-1.1  min =    4.30  max =    4.40  avg =    4.34
      yolo-fastestv2  min =    4.07  max =    4.18  avg =    4.13
  vision_transformer  min =  815.67  max =  819.27  avg =  817.49
          FastestDet  min =    4.34  max =    7.47  avg =    5.18
```

### AWS c5.4xlarge Instance

- OS: Ubuntu 20.04.6 LTS x86_64
- CPU: Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz
- Compiler: gcc version 9.4.0 (Ubuntu 9.4.0-1ubuntu1~20.04.2)
- ncnn tag: 20240102

```
loop_count = 4
num_threads = 8
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =    3.31  max =    3.33  avg =    3.32
     squeezenet_int8  min =    3.87  max =    4.34  avg =    4.07
           mobilenet  min =    3.12  max =    3.20  avg =    3.17
      mobilenet_int8  min =    3.32  max =    3.45  avg =    3.38
        mobilenet_v2  min =    4.23  max =    4.43  avg =    4.33
        mobilenet_v3  min =    3.82  max =    3.92  avg =    3.87
          shufflenet  min =    3.67  max =    3.72  avg =    3.69
       shufflenet_v2  min =    4.08  max =    4.22  avg =    4.15
             mnasnet  min =    3.62  max =    3.69  avg =    3.64
     proxylessnasnet  min =    4.29  max =    4.59  avg =    4.37
     efficientnet_b0  min =    5.32  max =    5.64  avg =    5.50
   efficientnetv2_b0  min =    6.81  max =    6.88  avg =    6.85
        regnety_400m  min =    9.71  max =    9.77  avg =    9.74
           blazeface  min =    1.71  max =    2.57  avg =    2.10
           googlenet  min =   10.00  max =   10.09  avg =   10.05
      googlenet_int8  min =    8.76  max =    8.79  avg =    8.77
            resnet18  min =    6.55  max =    6.91  avg =    6.70
       resnet18_int8  min =    5.63  max =    5.95  avg =    5.81
             alexnet  min =    4.88  max =    4.91  avg =    4.89
               vgg16  min =   36.99  max =   37.04  avg =   37.01
          vgg16_int8  min =   28.13  max =   28.57  avg =   28.31
            resnet50  min =   13.99  max =   14.13  avg =   14.06
       resnet50_int8  min =   12.49  max =   12.56  avg =   12.53
      squeezenet_ssd  min =    9.93  max =   10.04  avg =    9.98
 squeezenet_ssd_int8  min =    9.51  max =    9.70  avg =    9.59
       mobilenet_ssd  min =    6.60  max =    6.63  avg =    6.61
  mobilenet_ssd_int8  min =    6.95  max =    7.10  avg =    7.02
      mobilenet_yolo  min =   18.28  max =   18.44  avg =   18.35
  mobilenetv2_yolov3  min =   13.26  max =   13.39  avg =   13.32
         yolov4-tiny  min =   25.14  max =   25.58  avg =   25.37
           nanodet_m  min =    7.71  max =    7.77  avg =    7.75
    yolo-fastest-1.1  min =    4.69  max =    4.96  avg =    4.81
      yolo-fastestv2  min =    4.84  max =    5.17  avg =    5.01
  vision_transformer  min =  139.34  max =  140.38  avg =  139.96
          FastestDet  min =    4.95  max =    5.12  avg =    5.06
```

### Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-11800H, NVIDIA GeForce RTX 3070 Laptop GPU)

- Host OS: Microsoft Windows 11 Enterprise (10.0.22621.1635)
- Guest OS: openSUSE Tumbleweed x86_64 20230507
- Mesa 3D source tree: https://gitlab.freedesktop.org/mesa/mesa/-/tree/ce6430067613e3e64cabf79918a3d96122b0c4c4
- Mesa 3D configuration command
  > meson --prefix="${PWD}/build/install" -D gallium-drivers=swrast,d3d12 -D vulkan-drivers=swrast,microsoft-experimental build/
- ncnn configuration command
  > cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON ..

```
mouri@MouriVM-openSUSE:~/Workspace/ncnn/benchmark> VK_ICD_FILENAMES=/home/mouri/Workspace/mesa/build/install/share/vulkan/icd.d/dzn_icd.x86_64.json ./../build/benchmark/benchncnn 10 1 0 0 0
WARNING: dzn is not a conformant Vulkan implementation, testing use only.
WARNING: dzn is not a conformant Vulkan implementation, testing use only.
[0 Microsoft Direct3D12 (NVIDIA GeForce RTX 3070 Laptop GPU)]  queueC=1[8]  queueG=0[4]  queueT=2[1]
[0 Microsoft Direct3D12 (NVIDIA GeForce RTX 3070 Laptop GPU)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 Microsoft Direct3D12 (NVIDIA GeForce RTX 3070 Laptop GPU)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/0/0
[0 Microsoft Direct3D12 (NVIDIA GeForce RTX 3070 Laptop GPU)]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
[1 Microsoft Direct3D12 (Intel(R) UHD Graphics)]  queueC=1[8]  queueG=0[4]  queueT=2[1]
[1 Microsoft Direct3D12 (Intel(R) UHD Graphics)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[1 Microsoft Direct3D12 (Intel(R) UHD Graphics)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/0/0
[1 Microsoft Direct3D12 (Intel(R) UHD Graphics)]  subgroup=16  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =   52.30  max =   65.51  avg =   56.65
     squeezenet_int8  min =   14.53  max =   15.55  avg =   14.88
           mobilenet  min =   37.42  max =   52.07  avg =   42.48
      mobilenet_int8  min =   19.01  max =   19.82  avg =   19.46
        mobilenet_v2  min =   55.34  max =   73.39  avg =   63.94
        mobilenet_v3  min =   97.02  max =  123.14  avg =  109.90
          shufflenet  min =   72.75  max =  100.26  avg =   88.26
       shufflenet_v2  min =   93.34  max =  119.64  avg =  105.76
             mnasnet  min =   63.49  max =   74.11  avg =   69.05
     proxylessnasnet  min =   65.87  max =   83.87  avg =   76.33
     efficientnet_b0  min =  162.86  max =  210.51  avg =  184.03
   efficientnetv2_b0  min =  200.88  max =  220.40  avg =  210.85
        regnety_400m  min =  106.92  max =  134.68  avg =  123.04
           blazeface  min =   58.64  max =   66.50  avg =   60.54
           googlenet  min =  117.34  max =  145.28  avg =  134.84
      googlenet_int8  min =   62.50  max =   65.07  avg =   63.44
            resnet18  min =   67.30  max =   92.40  avg =   80.23
       resnet18_int8  min =   56.09  max =   58.40  avg =   56.97
             alexnet  min =   29.94  max =   47.51  avg =   38.83
               vgg16  min =   59.72  max =   73.08  avg =   65.46
          vgg16_int8  min =  136.35  max =  148.39  avg =  143.96
            resnet50  min =  115.92  max =  152.34  avg =  129.64
       resnet50_int8  min =   93.86  max =  101.51  avg =   97.96
      squeezenet_ssd  min =  139.82  max =  149.15  avg =  144.78
 squeezenet_ssd_int8  min =   32.09  max =   35.96  avg =   33.41
       mobilenet_ssd  min =   88.14  max =  102.62  avg =   97.79
  mobilenet_ssd_int8  min =   33.93  max =   36.42  avg =   34.41
      mobilenet_yolo  min =   52.22  max =   65.25  avg =   58.81
  mobilenetv2_yolov3  min =   75.09  max =   94.12  avg =   85.23
         yolov4-tiny  min =   73.27  max =   88.69  avg =   81.44
           nanodet_m  min =  110.98  max =  150.70  avg =  127.60
    yolo-fastest-1.1  min =  104.72  max =  135.40  avg =  116.92
      yolo-fastestv2  min =  113.84  max =  142.19  avg =  128.24
  vision_transformer  min =  412.19  max =  474.25  avg =  444.15
          FastestDet  min =   96.31  max =  131.51  avg =  117.27
mouri@MouriVM-openSUSE:~/Workspace/ncnn/benchmark> VK_ICD_FILENAMES=/home/mouri/Workspace/mesa/build/install/share/vulkan/icd.d/dzn_icd.x86_64.json ./../build/benchmark/benchncnn 10 1 0 1 0
WARNING: dzn is not a conformant Vulkan implementation, testing use only.
WARNING: dzn is not a conformant Vulkan implementation, testing use only.
[0 Microsoft Direct3D12 (NVIDIA GeForce RTX 3070 Laptop GPU)]  queueC=1[8]  queueG=0[4]  queueT=2[1]
[0 Microsoft Direct3D12 (NVIDIA GeForce RTX 3070 Laptop GPU)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 Microsoft Direct3D12 (NVIDIA GeForce RTX 3070 Laptop GPU)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/0/0
[0 Microsoft Direct3D12 (NVIDIA GeForce RTX 3070 Laptop GPU)]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
[1 Microsoft Direct3D12 (Intel(R) UHD Graphics)]  queueC=1[8]  queueG=0[4]  queueT=2[1]
[1 Microsoft Direct3D12 (Intel(R) UHD Graphics)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[1 Microsoft Direct3D12 (Intel(R) UHD Graphics)]  fp16-p/s/a=1/1/1  int8-p/s/a=1/0/0
[1 Microsoft Direct3D12 (Intel(R) UHD Graphics)]  subgroup=16  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = 1
cooling_down = 0
          squeezenet  min =   36.86  max =   62.04  avg =   44.48
     squeezenet_int8  min =   15.31  max =   16.14  avg =   15.63
           mobilenet  min =   30.79  max =   34.67  avg =   32.95
      mobilenet_int8  min =   19.23  max =   19.72  avg =   19.42
        mobilenet_v2  min =   36.56  max =   40.53  avg =   38.20
        mobilenet_v3  min =   52.11  max =   61.72  avg =   56.58
          shufflenet  min =   41.50  max =   74.61  avg =   49.24
       shufflenet_v2  min =   44.49  max =   52.30  avg =   49.04
             mnasnet  min =   35.66  max =   43.45  avg =   37.98
     proxylessnasnet  min =   41.27  max =   47.63  avg =   43.63
     efficientnet_b0  min =   67.66  max =   80.88  avg =   73.64
   efficientnetv2_b0  min =  111.10  max =  156.52  avg =  126.70
        regnety_400m  min =   62.66  max =   89.16  avg =   68.99
           blazeface  min =   24.86  max =   33.52  avg =   26.91
           googlenet  min =   70.55  max =   84.22  avg =   75.19
      googlenet_int8  min =   58.78  max =   64.81  avg =   62.99
            resnet18  min =   44.17  max =   49.37  avg =   46.73
       resnet18_int8  min =   59.99  max =   66.91  avg =   62.35
             alexnet  min =   41.54  max =   57.16  avg =   44.30
               vgg16  min =  138.74  max =  165.03  avg =  146.90
          vgg16_int8  min =  135.36  max =  165.89  avg =  142.61
            resnet50  min =   97.46  max =  107.18  avg =  100.89
       resnet50_int8  min =   92.90  max =  100.45  avg =   95.91
      squeezenet_ssd  min =   72.27  max =   90.71  avg =   76.09
 squeezenet_ssd_int8  min =   34.66  max =   40.46  avg =   36.58
       mobilenet_ssd  min =   59.90  max =   68.74  avg =   62.40
  mobilenet_ssd_int8  min =   37.02  max =   38.59  avg =   37.82
      mobilenet_yolo  min =   73.19  max =   80.40  avg =   76.42
  mobilenetv2_yolov3  min =   58.56  max =   66.71  avg =   62.02
         yolov4-tiny  min =   63.75  max =   84.29  avg =   69.54
           nanodet_m  min =   54.66  max =   67.89  avg =   60.82
    yolo-fastest-1.1  min =   40.89  max =   51.03  avg =   43.15
      yolo-fastestv2  min =   50.43  max =   77.46  avg =   60.66
  vision_transformer  min = 1330.82  max = 1388.73  avg = 1354.10
          FastestDet  min =   85.75  max =  112.67  avg =   98.62
mouri@MouriVM-openSUSE:~/Workspace/ncnn/benchmark> VK_ICD_FILENAMES=/home/mouri/Workspace/mesa/build/install/share/vulkan/icd.d/dzn_icd.x86_64.json ./../build/benchmark/benchncnn 10 1 0 -1 0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    6.30  max =   10.16  avg =    8.21
     squeezenet_int8  min =   14.53  max =   14.94  avg =   14.67
           mobilenet  min =   10.71  max =   11.26  avg =   10.91
      mobilenet_int8  min =   17.66  max =   18.46  avg =   17.91
        mobilenet_v2  min =    7.74  max =    8.05  avg =    7.89
        mobilenet_v3  min =    6.25  max =    6.70  avg =    6.38
          shufflenet  min =    3.78  max =    7.87  avg =    5.37
       shufflenet_v2  min =    4.19  max =    7.83  avg =    5.25
             mnasnet  min =    7.29  max =    7.61  avg =    7.44
     proxylessnasnet  min =    8.10  max =    8.43  avg =    8.24
     efficientnet_b0  min =   11.77  max =   12.66  avg =   12.06
   efficientnetv2_b0  min =   13.80  max =   15.02  avg =   14.11
        regnety_400m  min =   10.09  max =   10.26  avg =   10.17
           blazeface  min =    1.24  max =    4.02  avg =    2.45
           googlenet  min =   24.05  max =   25.78  avg =   24.64
      googlenet_int8  min =   58.75  max =   62.45  avg =   59.54
            resnet18  min =   20.31  max =   21.48  avg =   20.74
       resnet18_int8  min =   53.82  max =   55.27  avg =   54.43
             alexnet  min =   17.37  max =   18.69  avg =   17.66
               vgg16  min =  114.49  max =  117.62  avg =  115.96
          vgg16_int8  min =  133.82  max =  144.40  avg =  137.07
            resnet50  min =   54.40  max =   58.74  avg =   55.54
       resnet50_int8  min =   92.95  max =  104.71  avg =   99.18
      squeezenet_ssd  min =   17.30  max =   18.65  avg =   17.71
 squeezenet_ssd_int8  min =   32.27  max =   33.88  avg =   32.82
       mobilenet_ssd  min =   24.01  max =   25.94  avg =   25.02
  mobilenet_ssd_int8  min =   34.68  max =   36.09  avg =   35.43
      mobilenet_yolo  min =   53.32  max =   63.48  avg =   56.58
  mobilenetv2_yolov3  min =   30.06  max =   34.24  avg =   31.46
         yolov4-tiny  min =   41.49  max =   43.55  avg =   42.50
           nanodet_m  min =   10.24  max =   11.08  avg =   10.43
    yolo-fastest-1.1  min =    3.85  max =    8.34  avg =    5.40
      yolo-fastestv2  min =    4.33  max =    7.61  avg =    6.01
  vision_transformer  min =  556.38  max =  599.49  avg =  567.98
          FastestDet  min =    4.20  max =   11.37  avg =    6.51
mouri@MouriVM-openSUSE:~/Workspace/ncnn/benchmark>
```

### Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-7700K, NVIDIA GeForce GTX 1050 Ti)

- Host OS: Microsoft Windows 10 Enterprise LTSC 2021 (10.0.19044.2846)
- Guest OS: openSUSE Tumbleweed x86_64 20230507
- Mesa 3D source tree: https://gitlab.freedesktop.org/mesa/mesa/-/tree/ce6430067613e3e64cabf79918a3d96122b0c4c4
- Mesa 3D configuration command
  > meson --prefix="${PWD}/build/install" -D gallium-drivers=swrast,d3d12 -D vulkan-drivers=swrast,microsoft-experimental build/
- ncnn configuration command
  > cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_TESTS=ON ..

```
mouri@MouriVM-openSUSE:~/Workspace/ncnn/benchmark> VK_ICD_FILENAMES=/home/mouri/Workspace/mesa/build/install/share/vulkan/icd.d/dzn_icd.x86_64.json ./../build/benchmark/benchncnn 10 1 0 0 0
WARNING: dzn is not a conformant Vulkan implementation, testing use only.
[0 Microsoft Direct3D12 (NVIDIA GeForce GTX 1050 Ti)]  queueC=1[8]  queueG=0[4]  queueT=2[1]
[0 Microsoft Direct3D12 (NVIDIA GeForce GTX 1050 Ti)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 Microsoft Direct3D12 (NVIDIA GeForce GTX 1050 Ti)]  fp16-p/s/a=1/0/0  int8-p/s/a=1/0/0
[0 Microsoft Direct3D12 (NVIDIA GeForce GTX 1050 Ti)]  subgroup=32  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =   53.80  max =   64.22  avg =   59.91
     squeezenet_int8  min =   23.21  max =   25.98  avg =   24.44
           mobilenet  min =   47.63  max =   55.22  avg =   49.79
      mobilenet_int8  min =   23.27  max =   25.05  avg =   23.77
        mobilenet_v2  min =   58.17  max =   83.14  avg =   68.48
        mobilenet_v3  min =   92.14  max =  114.74  avg =  101.66
          shufflenet  min =   75.96  max =  106.54  avg =   89.64
       shufflenet_v2  min =   90.66  max =  114.69  avg =  103.25
             mnasnet  min =   58.40  max =   85.74  avg =   67.75
     proxylessnasnet  min =   66.73  max =   84.82  avg =   77.73
     efficientnet_b0  min =  134.28  max =  164.39  avg =  155.40
   efficientnetv2_b0  min =  171.97  max =  220.43  avg =  198.26
        regnety_400m  min =  124.15  max =  145.61  avg =  135.99
           blazeface  min =   53.18  max =   72.10  avg =   60.21
           googlenet  min =  119.34  max =  159.93  avg =  134.71
      googlenet_int8  min =   96.71  max =  102.44  avg =   98.57
            resnet18  min =   68.14  max =   89.99  avg =   80.76
       resnet18_int8  min =   88.07  max =  108.62  avg =   91.09
             alexnet  min =   44.12  max =   51.57  avg =   48.09
               vgg16  min =   88.49  max =   99.87  avg =   93.42
          vgg16_int8  min =  196.17  max =  211.99  avg =  201.27
            resnet50  min =  115.36  max =  138.65  avg =  125.57
       resnet50_int8  min =  138.15  max =  148.55  avg =  141.08
      squeezenet_ssd  min =  138.42  max =  168.49  avg =  155.66
 squeezenet_ssd_int8  min =   46.01  max =   47.83  avg =   46.85
       mobilenet_ssd  min =   82.39  max =  134.74  avg =  101.22
  mobilenet_ssd_int8  min =   45.53  max =   46.67  avg =   45.96
      mobilenet_yolo  min =   70.39  max =   87.83  avg =   80.01
  mobilenetv2_yolov3  min =   75.71  max =   90.59  avg =   84.04
         yolov4-tiny  min =   72.16  max =   87.76  avg =   76.81
           nanodet_m  min =   98.27  max =  129.60  avg =  112.34
    yolo-fastest-1.1  min =  101.01  max =  118.45  avg =  106.47
      yolo-fastestv2  min =  109.89  max =  137.23  avg =  123.97
  vision_transformer  min =  688.60  max =  750.54  avg =  723.30
          FastestDet  min =  104.16  max =  139.23  avg =  123.75
mouri@MouriVM-openSUSE:~/Workspace/ncnn/benchmark> VK_ICD_FILENAMES=/home/mouri/Workspace/mesa/build/install/share/vulkan/icd.d/dzn_icd.x86_64.json ./../build/benchmark/benchncnn 10 1 0 -1 0
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    8.90  max =    9.48  avg =    9.15
     squeezenet_int8  min =   22.54  max =   24.13  avg =   22.85
           mobilenet  min =   14.85  max =   16.15  avg =   15.18
      mobilenet_int8  min =   23.56  max =   23.98  avg =   23.74
        mobilenet_v2  min =   11.03  max =   11.73  avg =   11.22
        mobilenet_v3  min =    8.61  max =    9.29  avg =    8.79
          shufflenet  min =    5.26  max =    5.96  avg =    5.42
       shufflenet_v2  min =    5.56  max =    7.06  avg =    5.82
             mnasnet  min =   10.46  max =   11.04  avg =   10.68
     proxylessnasnet  min =   12.18  max =   12.55  avg =   12.33
     efficientnet_b0  min =   22.46  max =   23.15  avg =   22.86
   efficientnetv2_b0  min =   23.33  max =   23.80  avg =   23.55
        regnety_400m  min =   13.03  max =   14.25  avg =   13.28
           blazeface  min =    1.49  max =    1.95  avg =    1.61
           googlenet  min =   35.26  max =   46.31  avg =   39.63
      googlenet_int8  min =   96.25  max =   98.15  avg =   96.93
            resnet18  min =   29.34  max =   31.00  avg =   29.92
       resnet18_int8  min =   87.84  max =   89.85  avg =   88.73
             alexnet  min =   22.91  max =   23.87  avg =   23.18
               vgg16  min =  151.26  max =  174.79  avg =  155.94
          vgg16_int8  min =  193.66  max =  210.63  avg =  199.14
            resnet50  min =   74.89  max =   77.27  avg =   75.91
       resnet50_int8  min =  136.59  max =  162.13  avg =  141.22
      squeezenet_ssd  min =   24.48  max =   34.00  avg =   26.19
 squeezenet_ssd_int8  min =   46.31  max =   48.87  avg =   47.09
       mobilenet_ssd  min =   31.56  max =   34.45  avg =   32.50
  mobilenet_ssd_int8  min =   45.15  max =   46.53  avg =   45.93
      mobilenet_yolo  min =   72.09  max =   78.05  avg =   74.31
  mobilenetv2_yolov3  min =   40.44  max =   41.54  avg =   40.86
         yolov4-tiny  min =   56.73  max =   60.59  avg =   57.93
           nanodet_m  min =   13.22  max =   19.28  avg =   14.65
    yolo-fastest-1.1  min =    5.47  max =    5.70  avg =    5.58
      yolo-fastestv2  min =    5.68  max =    7.20  avg =    5.88
  vision_transformer  min =  600.83  max =  666.35  avg =  617.33
          FastestDet  min =    6.05  max =    6.72  avg =    6.23
```

### AMD Ryzen 9 5950X 16-Core of Desktop[2023-10-12]
```
E:\github\ncnn\build-ncnn-vs2019\benchmark\Release>benchncnn.exe 100 16 0 -1 0
loop_count = 100
num_threads = 16
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    2.68  max =    3.10  avg =    2.77
     squeezenet_int8  min =    3.57  max =    4.72  avg =    4.04
           mobilenet  min =    3.09  max =    5.44  avg =    3.38
      mobilenet_int8  min =    2.36  max =    3.40  avg =    2.74
        mobilenet_v2  min =    4.24  max =    4.81  avg =    4.40
        mobilenet_v3  min =    3.46  max =    3.93  avg =    3.58
          shufflenet  min =    3.21  max =    4.54  avg =    4.01
       shufflenet_v2  min =    2.99  max =    4.49  avg =    3.34
             mnasnet  min =    3.62  max =    4.31  avg =    3.83
     proxylessnasnet  min =    4.06  max =    5.70  avg =    4.23
     efficientnet_b0  min =    5.60  max =    6.55  avg =    5.81
   efficientnetv2_b0  min =    6.83  max =    8.82  avg =    7.12
        regnety_400m  min =    8.02  max =    9.75  avg =    8.34
           blazeface  min =    1.34  max =    1.77  avg =    1.46
           googlenet  min =   11.62  max =   15.95  avg =   12.70
      googlenet_int8  min =    7.43  max =   10.06  avg =    7.92
            resnet18  min =    8.39  max =   10.39  avg =    9.04
       resnet18_int8  min =    6.23  max =    8.64  avg =    6.75
             alexnet  min =    7.78  max =   12.51  avg =    8.51
               vgg16  min =   53.85  max =   63.39  avg =   56.36
          vgg16_int8  min =   35.61  max =   46.94  avg =   38.08
            resnet50  min =   18.55  max =   24.46  avg =   19.81
       resnet50_int8  min =   11.95  max =   23.21  avg =   13.51
      squeezenet_ssd  min =   10.01  max =   13.16  avg =   10.69
 squeezenet_ssd_int8  min =    9.29  max =   14.02  avg =   10.47
       mobilenet_ssd  min =    6.38  max =   10.26  avg =    7.15
  mobilenet_ssd_int8  min =    4.69  max =    6.98  avg =    5.42
      mobilenet_yolo  min =   17.63  max =   22.59  avg =   19.45
  mobilenetv2_yolov3  min =   11.79  max =   15.67  avg =   12.76
         yolov4-tiny  min =   21.53  max =   25.79  avg =   22.46
           nanodet_m  min =    7.16  max =    9.99  avg =    8.01
    yolo-fastest-1.1  min =    3.66  max =    5.00  avg =    4.38
      yolo-fastestv2  min =    3.52  max =    5.20  avg =    4.60
  vision_transformer  min =   67.01  max =   93.71  avg =   78.48
          FastestDet  min =    4.44  max =    8.62  avg =    4.69
```

### AMD Radeon RX 6900 XT of Desktop[2023-10-12]
```
E:\github\ncnn\build-ncnn-vs2019\benchmark\Release>benchncnn.exe 100 16 0 0 0
[0 AMD Radeon RX 6900 XT]  queueC=1[2]  queueG=0[1]  queueT=2[2]
[0 AMD Radeon RX 6900 XT]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 AMD Radeon RX 6900 XT]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 AMD Radeon RX 6900 XT]  subgroup=64  basic/vote/ballot/shuffle=1/1/1/1
[0 AMD Radeon RX 6900 XT]  fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0
loop_count = 100
num_threads = 16
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =    2.19  max =    2.70  avg =    2.47
     squeezenet_int8  min =    3.94  max =    4.51  avg =    4.18
           mobilenet  min =    2.03  max =    2.63  avg =    2.28
      mobilenet_int8  min =    2.56  max =    3.34  avg =    2.69
        mobilenet_v2  min =    2.29  max =    2.98  avg =    2.62
        mobilenet_v3  min =    2.31  max =    3.10  avg =    2.75
          shufflenet  min =    1.89  max =    2.61  avg =    2.30
       shufflenet_v2  min =    2.17  max =    3.04  avg =    2.59
             mnasnet  min =    2.19  max =    2.98  avg =    2.69
     proxylessnasnet  min =    2.12  max =    4.08  avg =    2.62
     efficientnet_b0  min =    3.62  max =    5.27  avg =    4.21
   efficientnetv2_b0  min =    6.09  max =    7.15  avg =    6.49
        regnety_400m  min =    2.55  max =    3.82  avg =    3.00
           blazeface  min =    1.93  max =    2.56  avg =    2.28
           googlenet  min =    3.35  max =    4.46  avg =    3.75
      googlenet_int8  min =    8.02  max =   12.84  avg =    9.15
            resnet18  min =    2.46  max =    3.14  avg =    2.84
       resnet18_int8  min =    6.37  max =    9.15  avg =    7.30
             alexnet  min =    2.31  max =    2.91  avg =    2.69
               vgg16  min =    4.76  max =    5.79  avg =    5.24
          vgg16_int8  min =   35.94  max =   46.27  avg =   39.05
            resnet50  min =    3.25  max =    4.09  avg =    3.75
       resnet50_int8  min =   12.04  max =   20.53  avg =   14.61
      squeezenet_ssd  min =    3.03  max =    5.31  avg =    3.66
 squeezenet_ssd_int8  min =    9.74  max =   13.46  avg =   10.42
       mobilenet_ssd  min =    2.82  max =    4.75  avg =    3.39
  mobilenet_ssd_int8  min =    4.67  max =    6.76  avg =    5.30
      mobilenet_yolo  min =    3.01  max =    3.67  avg =    3.34
  mobilenetv2_yolov3  min =    4.04  max =    6.46  avg =    4.55
         yolov4-tiny  min =    5.75  max =    8.05  avg =    6.52
           nanodet_m  min =   10.16  max =   14.97  avg =   13.11
    yolo-fastest-1.1  min =    2.36  max =    3.80  avg =    2.88
      yolo-fastestv2  min =    2.24  max =    3.19  avg =    2.80
  vision_transformer  min =   20.43  max =   25.06  avg =   21.07
          FastestDet  min =    2.49  max =    3.18  avg =    2.93
```

### NVIDIA GeForce RTX 3060 Ti of Desktop[2023-10-12]
```
E:\github\ncnn\build-ncnn-vs2019\benchmark\Release>benchncnn.exe 100 16 0 0 0
[0 NVIDIA GeForce RTX 3060 Ti]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[0 NVIDIA GeForce RTX 3060 Ti]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 NVIDIA GeForce RTX 3060 Ti]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 NVIDIA GeForce RTX 3060 Ti]  subgroup=32  basic/vote/ballot/shuffle=1/1/1/1
[0 NVIDIA GeForce RTX 3060 Ti]  fp16-matrix-16_8_8/16_8_16/16_16_16=1/1/1
[1 Intel(R) UHD Graphics 770]  queueC=0[1]  queueG=0[1]  queueT=0[1]
[1 Intel(R) UHD Graphics 770]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[1 Intel(R) UHD Graphics 770]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[1 Intel(R) UHD Graphics 770]  subgroup=32  basic/vote/ballot/shuffle=1/1/1/1
[1 Intel(R) UHD Graphics 770]  fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0
loop_count = 100
num_threads = 16
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =    0.80  max =    2.51  avg =    0.89
     squeezenet_int8  min =    2.81  max =    3.51  avg =    2.96
           mobilenet  min =    0.70  max =    0.79  avg =    0.71
      mobilenet_int8  min =    2.95  max =    3.44  avg =    3.03
        mobilenet_v2  min =    1.09  max =    1.25  avg =    1.12
        mobilenet_v3  min =    1.33  max =    2.04  avg =    1.56
          shufflenet  min =    1.20  max =    1.39  avg =    1.27
       shufflenet_v2  min =    1.50  max =    1.66  avg =    1.57
             mnasnet  min =    1.11  max =    1.22  avg =    1.15
     proxylessnasnet  min =    1.20  max =    1.63  avg =    1.24
     efficientnet_b0  min =    2.38  max =    3.21  avg =    2.61
   efficientnetv2_b0  min =    9.16  max =   11.35  avg =    9.63
        regnety_400m  min =    1.86  max =    2.03  avg =    1.94
           blazeface  min =    0.70  max =    1.10  avg =    0.76
           googlenet  min =    2.11  max =    2.40  avg =    2.30
      googlenet_int8  min =    6.91  max =    7.88  avg =    7.17
            resnet18  min =    1.14  max =    1.47  avg =    1.19
       resnet18_int8  min =    4.96  max =    6.82  avg =    5.40
             alexnet  min =    1.10  max =    1.85  avg =    1.19
               vgg16  min =    2.27  max =    3.97  avg =    2.46
          vgg16_int8  min =   19.02  max =   22.20  avg =   20.28
            resnet50  min =    2.00  max =    2.99  avg =    2.10
       resnet50_int8  min =   10.66  max =   13.30  avg =   11.29
      squeezenet_ssd  min =    2.74  max =    3.44  avg =    2.90
 squeezenet_ssd_int8  min =    6.93  max =    7.95  avg =    7.19
       mobilenet_ssd  min =    1.86  max =    2.07  avg =    1.96
  mobilenet_ssd_int8  min =    5.92  max =    6.48  avg =    6.09
      mobilenet_yolo  min =    1.65  max =    2.58  avg =    1.78
  mobilenetv2_yolov3  min =    3.85  max =    4.11  avg =    3.96
         yolov4-tiny  min =    6.54  max =    7.05  avg =    6.69
           nanodet_m  min =    2.38  max =    3.28  avg =    2.72
    yolo-fastest-1.1  min =    1.73  max =    2.07  avg =    1.83
      yolo-fastestv2  min =    1.72  max =    1.92  avg =    1.80
  vision_transformer  min =   53.91  max =   56.59  avg =   55.27
          FastestDet  min =    1.48  max =    1.83  avg =    1.69
```

### Intel(R) UHD Graphics 770 of Desktop[2023-10-12]
```
E:\github\ncnn\build-ncnn-vs2019\benchmark\Release>benchncnn.exe 100 16 0 1 0
[0 NVIDIA GeForce RTX 3060 Ti]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[0 NVIDIA GeForce RTX 3060 Ti]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 NVIDIA GeForce RTX 3060 Ti]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 NVIDIA GeForce RTX 3060 Ti]  subgroup=32  basic/vote/ballot/shuffle=1/1/1/1
[0 NVIDIA GeForce RTX 3060 Ti]  fp16-matrix-16_8_8/16_8_16/16_16_16=1/1/1
[1 Intel(R) UHD Graphics 770]  queueC=0[1]  queueG=0[1]  queueT=0[1]
[1 Intel(R) UHD Graphics 770]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[1 Intel(R) UHD Graphics 770]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[1 Intel(R) UHD Graphics 770]  subgroup=32  basic/vote/ballot/shuffle=1/1/1/1
[1 Intel(R) UHD Graphics 770]  fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0
loop_count = 100
num_threads = 16
powersave = 0
gpu_device = 1
cooling_down = 0
          squeezenet  min =    3.11  max =    4.47  avg =    3.45
     squeezenet_int8  min =    1.89  max =    2.84  avg =    2.23
           mobilenet  min =    4.98  max =    5.67  avg =    5.18
      mobilenet_int8  min =    2.54  max =    3.17  avg =    2.98
        mobilenet_v2  min =    4.03  max =    4.89  avg =    4.37
        mobilenet_v3  min =    4.45  max =    5.68  avg =    4.86
          shufflenet  min =    3.42  max =    4.42  avg =    3.79
       shufflenet_v2  min =    3.00  max =    4.01  avg =    3.30
             mnasnet  min =    4.21  max =    5.12  avg =    4.51
     proxylessnasnet  min =    4.62  max =    5.64  avg =    4.90
     efficientnet_b0  min =    7.82  max =    8.63  avg =    8.10
   efficientnetv2_b0  min =   34.52  max =   36.34  avg =   35.29
        regnety_400m  min =    6.07  max =    7.31  avg =    6.44
           blazeface  min =    1.54  max =    1.67  avg =    1.59
           googlenet  min =   11.53  max =   12.64  avg =   11.89
      googlenet_int8  min =   13.71  max =   15.52  avg =   14.38
            resnet18  min =   10.75  max =   12.94  avg =   11.07
       resnet18_int8  min =    9.04  max =   11.05  avg =    9.53
             alexnet  min =   13.64  max =   14.37  avg =   13.98
               vgg16  min =   38.53  max =   40.16  avg =   39.22
          vgg16_int8  min =   16.04  max =   21.16  avg =   19.35
            resnet50  min =   25.61  max =   28.22  avg =   26.62
       resnet50_int8  min =    7.72  max =   12.83  avg =   10.29
      squeezenet_ssd  min =   10.34  max =   15.88  avg =   14.75
 squeezenet_ssd_int8  min =    4.63  max =    7.13  avg =    5.66
       mobilenet_ssd  min =   11.35  max =   13.06  avg =   12.44
  mobilenet_ssd_int8  min =    4.21  max =    6.31  avg =    5.32
      mobilenet_yolo  min =   20.14  max =   22.92  avg =   21.94
  mobilenetv2_yolov3  min =   12.58  max =   14.88  avg =   14.21
         yolov4-tiny  min =   20.62  max =   25.58  avg =   24.39
           nanodet_m  min =    7.75  max =   12.49  avg =   11.42
    yolo-fastest-1.1  min =    3.68  max =    6.49  avg =    5.54
      yolo-fastestv2  min =    4.32  max =    5.39  avg =    4.51
  vision_transformer  min =  796.51  max =  805.29  avg =  802.39
          FastestDet  min =    2.89  max =    4.83  avg =    3.95
```

### Intel® Core™ i7-13700K of Desktop[2023-10-12]
```
E:\github\ncnn\build-ncnn-vs2019\benchmark\Release>benchncnn.exe 100 16 0 -1 0
loop_count = 100
num_threads = 16
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    1.69  max =    2.63  avg =    2.12
     squeezenet_int8  min =    1.83  max =    3.03  avg =    2.26
           mobilenet  min =    1.69  max =    2.64  avg =    2.24
      mobilenet_int8  min =    2.47  max =    3.06  avg =    2.84
        mobilenet_v2  min =    1.94  max =    3.47  avg =    2.47
        mobilenet_v3  min =    1.49  max =    2.74  avg =    1.87
          shufflenet  min =    1.57  max =    3.00  avg =    1.82
       shufflenet_v2  min =    1.41  max =    1.72  avg =    1.51
             mnasnet  min =    1.73  max =    2.94  avg =    2.13
     proxylessnasnet  min =    2.08  max =    3.31  avg =    2.69
     efficientnet_b0  min =    3.20  max =    4.99  avg =    3.78
   efficientnetv2_b0  min =    3.51  max =    5.16  avg =    4.08
        regnety_400m  min =    4.51  max =   10.29  avg =    6.18
           blazeface  min =    0.52  max =    0.92  avg =    0.59
           googlenet  min =    5.49  max =    7.48  avg =    6.26
      googlenet_int8  min =    4.83  max =    7.54  avg =    5.90
            resnet18  min =    4.05  max =    6.61  avg =    4.83
       resnet18_int8  min =    3.77  max =    5.70  avg =    4.57
             alexnet  min =    3.60  max =    5.09  avg =    4.26
               vgg16  min =   25.19  max =   28.79  avg =   26.81
          vgg16_int8  min =   17.52  max =   21.79  avg =   19.80
            resnet50  min =    9.23  max =   13.15  avg =   11.34
       resnet50_int8  min =    7.77  max =   12.00  avg =   10.18
      squeezenet_ssd  min =    4.33  max =    6.73  avg =    4.96
 squeezenet_ssd_int8  min =    4.77  max =    7.62  avg =    5.71
       mobilenet_ssd  min =    3.70  max =    6.43  avg =    4.53
  mobilenet_ssd_int8  min =    4.16  max =    6.53  avg =    5.38
      mobilenet_yolo  min =   11.27  max =   14.93  avg =   12.90
  mobilenetv2_yolov3  min =    7.41  max =   11.52  avg =    9.11
         yolov4-tiny  min =   12.05  max =   18.96  avg =   14.15
           nanodet_m  min =    3.39  max =    5.77  avg =    4.07
    yolo-fastest-1.1  min =    1.95  max =    3.85  avg =    2.30
      yolo-fastestv2  min =    1.91  max =    3.52  avg =    2.27
  vision_transformer  min =   79.50  max =   99.93  avg =   88.91
          FastestDet  min =    1.92  max =    2.72  avg =    2.19
```

### Amlogic S805 (Cortex-A5, 4 × 1.536GHz)

- Platform: Xunlei OneCloud (玩客云)
- OS: Armbian buster (20.12) armv7l
- Compiler: gcc version 8.3.0 (Debian 8.3.0-6)
- ncnn tag: 20240102

```
mizu-bai@aml-s812:~/ncnn-20240102/benchmark$ ../build/benchmark/benchncnn
loop_count = 4
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =  376.45  max =  445.48  avg =  408.08
     squeezenet_int8  min =  247.06  max =  340.34  avg =  281.40
           mobilenet  min =  696.71  max =  745.63  avg =  718.49
      mobilenet_int8  min =  355.78  max =  472.06  avg =  401.17
        mobilenet_v2  min =  428.86  max =  491.25  avg =  458.45
        mobilenet_v3  min =  361.78  max =  425.90  avg =  396.94
          shufflenet  min =  245.90  max =  333.41  avg =  293.46
       shufflenet_v2  min =  210.69  max =  329.51  avg =  260.73
             mnasnet  min =  418.49  max =  493.40  avg =  448.95
     proxylessnasnet  min =  542.20  max =  566.65  avg =  554.75
     efficientnet_b0  min =  727.72  max =  785.47  avg =  750.72
   efficientnetv2_b0  min =  805.70  max =  874.57  avg =  843.87
        regnety_400m  min =  627.74  max =  686.57  avg =  660.60
           blazeface  min =   62.14  max =  121.32  avg =   82.10
           googlenet  min = 1295.31  max = 1411.88  avg = 1342.26
      googlenet_int8  min =  796.39  max =  860.28  avg =  823.76
            resnet18  min = 1076.93  max = 1125.12  avg = 1099.37
       resnet18_int8  min =  587.12  max =  634.97  avg =  605.29
             alexnet  min =  701.70  max =  729.68  avg =  718.99
               vgg16  min = 5584.13  max = 5748.84  avg = 5660.70
          vgg16_int8  min = 3107.89  max = 3138.78  avg = 3121.28
            resnet50  min = 3378.84  max = 3461.61  avg = 3425.38
       resnet50_int8  min = 2044.93  max = 2067.70  avg = 2061.38
      squeezenet_ssd  min =  908.77  max =  972.68  avg =  939.98
 squeezenet_ssd_int8  min =  609.58  max =  703.88  avg =  662.43
       mobilenet_ssd  min = 1524.69  max = 1589.79  avg = 1552.12
  mobilenet_ssd_int8  min =  817.70  max =  885.45  avg =  840.30
      mobilenet_yolo  min = 3497.13  max = 3605.83  avg = 3543.72
  mobilenetv2_yolov3  min = 1734.10  max = 1824.98  avg = 1795.42
         yolov4-tiny  min = 2093.70  max = 2163.44  avg = 2128.30
           nanodet_m  min =  593.75  max =  647.03  avg =  608.03
    yolo-fastest-1.1  min =  228.68  max =  318.40  avg =  265.74
      yolo-fastestv2  min =  194.29  max =  258.78  avg =  219.82
  vision_transformer  min = 14836.43  max = 15238.27  avg = 15125.26
          FastestDet  min =  215.60  max =  264.69  avg =  239.85
```

### Qualcomm SM8550-AB Snapdragon 8 Gen 2 (Kryo 3.20 GHz + 2.80 GHz x 2 + 2.80 GHz x 2 + 2.00 GHz x 3 + Adreno 740)
```
./benchncnn 4 1 2 -1 1
loop_count = 4
num_threads = 1
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =    8.44  max =    8.51  avg =    8.47
     squeezenet_int8  min =    6.91  max =    7.13  avg =    7.00
           mobilenet  min =   15.45  max =   15.53  avg =   15.49
      mobilenet_int8  min =    8.76  max =    9.03  avg =    8.88
        mobilenet_v2  min =    9.52  max =   10.71  avg =   10.02
        mobilenet_v3  min =    7.89  max =    8.02  avg =    7.93
          shufflenet  min =    5.07  max =    5.61  avg =    5.25
       shufflenet_v2  min =    5.28  max =    5.41  avg =    5.37
             mnasnet  min =    9.52  max =    9.58  avg =    9.54
     proxylessnasnet  min =   11.26  max =   11.41  avg =   11.36
     efficientnet_b0  min =   18.84  max =   18.91  avg =   18.88
   efficientnetv2_b0  min =   28.60  max =   28.73  avg =   28.66
        regnety_400m  min =   12.35  max =   12.39  avg =   12.37
           blazeface  min =    1.83  max =    2.23  avg =    1.94
           googlenet  min =   32.07  max =   37.37  avg =   35.59
      googlenet_int8  min =   28.50  max =   28.57  avg =   28.53
            resnet18  min =   21.88  max =   22.05  avg =   21.94
       resnet18_int8  min =   24.43  max =   40.52  avg =   32.04
             alexnet  min =   23.69  max =   24.22  avg =   23.98
               vgg16  min =   91.85  max =  100.71  avg =   94.80
          vgg16_int8  min =  206.66  max =  325.74  avg =  258.40
            resnet50  min =   53.59  max =   54.20  avg =   53.96
       resnet50_int8  min =   44.39  max =   45.11  avg =   44.74
      squeezenet_ssd  min =   23.80  max =   24.12  avg =   23.94
 squeezenet_ssd_int8  min =   30.17  max =   30.42  avg =   30.31
       mobilenet_ssd  min =   33.49  max =   33.69  avg =   33.59
  mobilenet_ssd_int8  min =   19.37  max =   19.76  avg =   19.56
      mobilenet_yolo  min =   72.63  max =   73.00  avg =   72.77
  mobilenetv2_yolov3  min =   36.86  max =   37.40  avg =   37.08
         yolov4-tiny  min =   44.94  max =   45.46  avg =   45.22
           nanodet_m  min =   13.65  max =   13.99  avg =   13.82
    yolo-fastest-1.1  min =    3.84  max =    3.93  avg =    3.89
      yolo-fastestv2  min =    4.78  max =    4.93  avg =    4.84
  vision_transformer  min = 1042.50  max = 1043.06  avg = 1042.80
          FastestDet  min =    4.67  max =    4.75  avg =    4.70
./benchncnn 4 4 2 -1 1
loop_count = 4
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =    2.60  max =    2.66  avg =    2.64
     squeezenet_int8  min =    2.38  max =    2.43  avg =    2.40
           mobilenet  min =    4.17  max =    4.25  avg =    4.21
      mobilenet_int8  min =    2.59  max =    2.60  avg =    2.60
        mobilenet_v2  min =    3.13  max =    3.44  avg =    3.23
        mobilenet_v3  min =    2.90  max =    5.07  avg =    3.46
          shufflenet  min =    2.34  max =    2.44  avg =    2.38
       shufflenet_v2  min =    2.06  max =    2.15  avg =    2.11
             mnasnet  min =    3.19  max =    3.20  avg =    3.20
     proxylessnasnet  min =    3.53  max =    3.61  avg =    3.57
     efficientnet_b0  min =    5.72  max =    5.75  avg =    5.74
   efficientnetv2_b0  min =    8.61  max =    8.67  avg =    8.64
        regnety_400m  min =    6.22  max =    6.27  avg =    6.25
           blazeface  min =    0.82  max =    0.92  avg =    0.86
           googlenet  min =   10.62  max =   14.39  avg =   11.59
      googlenet_int8  min =    8.84  max =    8.99  avg =    8.92
            resnet18  min =    6.61  max =    6.66  avg =    6.63
       resnet18_int8  min =   21.41  max =   23.48  avg =   22.57
             alexnet  min =    8.18  max =    8.24  avg =    8.21
               vgg16  min =   36.99  max =   39.65  avg =   37.75
          vgg16_int8  min =   86.21  max =   89.00  avg =   86.95
            resnet50  min =   18.90  max =   18.98  avg =   18.94
       resnet50_int8  min =   19.18  max =   19.28  avg =   19.22
      squeezenet_ssd  min =    8.26  max =    8.42  avg =    8.32
 squeezenet_ssd_int8  min =   21.02  max =   21.15  avg =   21.09
       mobilenet_ssd  min =    9.29  max =    9.42  avg =    9.34
  mobilenet_ssd_int8  min =    5.85  max =    5.91  avg =    5.87
      mobilenet_yolo  min =   21.64  max =   21.71  avg =   21.69
  mobilenetv2_yolov3  min =   11.50  max =   11.62  avg =   11.57
         yolov4-tiny  min =   14.91  max =   14.99  avg =   14.95
           nanodet_m  min =    4.93  max =    5.02  avg =    4.98
    yolo-fastest-1.1  min =    2.19  max =    2.26  avg =    2.21
      yolo-fastestv2  min =    2.29  max =    2.44  avg =    2.39
  vision_transformer  min =  242.50  max =  301.91  avg =  271.32
          FastestDet  min =    2.01  max =    2.12  avg =    2.05
./benchncnn 4 8 0 -1 1
loop_count = 4
num_threads = 8
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =    4.53  max =    6.34  avg =    5.48
     squeezenet_int8  min =    5.48  max =    7.02  avg =    6.14
           mobilenet  min =    6.89  max =    8.44  avg =    7.61
      mobilenet_int8  min =    4.89  max =    6.39  avg =    5.43
        mobilenet_v2  min =    6.01  max =    7.28  avg =    6.53
        mobilenet_v3  min =    4.85  max =   12.13  avg =    7.16
          shufflenet  min =    4.41  max =    6.20  avg =    5.25
       shufflenet_v2  min =    3.50  max =    4.34  avg =    3.74
             mnasnet  min =    5.52  max =    7.03  avg =    6.18
     proxylessnasnet  min =    6.21  max =    7.76  avg =    6.94
     efficientnet_b0  min =    9.49  max =   10.57  avg =    9.94
   efficientnetv2_b0  min =   15.26  max =   19.50  avg =   17.42
        regnety_400m  min =    9.89  max =   14.30  avg =   12.02
           blazeface  min =    2.25  max =    3.44  avg =    2.66
           googlenet  min =   18.98  max =   23.38  avg =   21.07
      googlenet_int8  min =   17.99  max =   20.47  avg =   19.45
            resnet18  min =   34.98  max =   84.52  avg =   69.50
       resnet18_int8  min =   14.58  max =   15.43  avg =   15.04
             alexnet  min =   13.56  max =   15.05  avg =   14.29
               vgg16  min =   63.32  max =   73.69  avg =   67.01
          vgg16_int8  min =   91.17  max =   99.80  avg =   94.81
            resnet50  min =   32.01  max =   42.22  avg =   36.06
       resnet50_int8  min =   30.16  max =   32.25  avg =   30.72
      squeezenet_ssd  min =   14.72  max =   21.45  avg =   17.51
 squeezenet_ssd_int8  min =   18.21  max =   23.93  avg =   21.45
       mobilenet_ssd  min =   16.38  max =   17.92  avg =   16.97
  mobilenet_ssd_int8  min =   10.15  max =   15.88  avg =   12.92
      mobilenet_yolo  min =   35.88  max =   37.10  avg =   36.26
  mobilenetv2_yolov3  min =   21.92  max =   27.60  avg =   24.12
         yolov4-tiny  min =   32.03  max =   34.45  avg =   33.51
           nanodet_m  min =    9.49  max =   14.35  avg =   11.20
    yolo-fastest-1.1  min =    3.97  max =    5.16  avg =    4.40
      yolo-fastestv2  min =    5.13  max =    7.84  avg =    6.18
  vision_transformer  min =  364.37  max =  391.13  avg =  374.55
          FastestDet  min =    3.01  max =    7.36  avg =    4.76
./benchncnn 4 1 2 0 0
[0 Adreno (TM) 740]  queueC=0[3]  queueG=0[3]  queueT=0[3]
[0 Adreno (TM) 740]  bugsbn1=1  bugbilz=0  bugcopc=0  bugihfa=0
[0 Adreno (TM) 740]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 Adreno (TM) 740]  subgroup=64  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 4
num_threads = 1
powersave = 2
gpu_device = 0
cooling_down = 0
          squeezenet  min =    9.73  max =   11.72  avg =   10.55
     squeezenet_int8  min =    7.21  max =    7.34  avg =    7.27
           mobilenet  min =   10.87  max =   13.09  avg =   12.01
      mobilenet_int8  min =    8.82  max =    9.23  avg =    9.11
        mobilenet_v2  min =   15.77  max =   16.21  avg =   15.96
        mobilenet_v3  min =   18.04  max =   18.68  avg =   18.40
          shufflenet  min =    9.82  max =   11.92  avg =   10.79
       shufflenet_v2  min =   14.41  max =   15.41  avg =   14.96
             mnasnet  min =   16.01  max =   16.43  avg =   16.27
     proxylessnasnet  min =   14.18  max =   16.28  avg =   15.51
     efficientnet_b0  min =   36.38  max =   37.06  avg =   36.83
   efficientnetv2_b0  min =   55.98  max =   66.59  avg =   59.54
        regnety_400m  min =   21.94  max =   22.46  avg =   22.30
           blazeface  min =    3.92  max =    4.47  avg =    4.08
           googlenet  min =   31.79  max =   35.63  avg =   33.04
      googlenet_int8  min =   23.21  max =   29.38  avg =   26.60
            resnet18  min =   22.61  max =   24.05  avg =   23.09
       resnet18_int8  min =   24.56  max =   24.78  avg =   24.62
             alexnet  min =   25.98  max =   27.05  avg =   26.49
               vgg16  min =   39.00  max =   39.82  avg =   39.29
          vgg16_int8  min =  207.47  max =  208.56  avg =  207.90
            resnet50  min =   44.07  max =   44.43  avg =   44.29
       resnet50_int8  min =   44.77  max =   47.04  avg =   45.44
      squeezenet_ssd  min =   33.71  max =   34.27  avg =   34.09
 squeezenet_ssd_int8  min =   22.53  max =   30.33  avg =   25.07
       mobilenet_ssd  min =   26.91  max =   28.35  avg =   27.42
  mobilenet_ssd_int8  min =   19.43  max =   19.82  avg =   19.69
      mobilenet_yolo  min =   28.03  max =   29.19  avg =   28.65
  mobilenetv2_yolov3  min =   33.54  max =   34.65  avg =   34.31
         yolov4-tiny  min =   49.77  max =   51.21  avg =   50.55
           nanodet_m  min =   17.35  max =   18.83  avg =   18.06
    yolo-fastest-1.1  min =    9.45  max =    9.59  avg =    9.51
      yolo-fastestv2  min =   13.13  max =   13.63  avg =   13.36
  vision_transformer  min =  671.13  max =  679.90  avg =  675.27
          FastestDet  min =    8.62  max =    9.01  avg =    8.86
./benchncnn 64 1 2 0 0
[0 Adreno (TM) 740]  queueC=0[3]  queueG=0[3]  queueT=0[3]
[0 Adreno (TM) 740]  bugsbn1=1  bugbilz=0  bugcopc=0  bugihfa=0
[0 Adreno (TM) 740]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 Adreno (TM) 740]  subgroup=64  basic=1  vote=1  ballot=1  shuffle=1
loop_count = 64
num_threads = 1
powersave = 2
gpu_device = 0
cooling_down = 0
          squeezenet  min =    9.56  max =   12.14  avg =   11.48
     squeezenet_int8  min =    6.78  max =    8.47  avg =    7.04
           mobilenet  min =   11.59  max =   12.90  avg =   12.44
      mobilenet_int8  min =    8.69  max =    9.42  avg =    8.90
        mobilenet_v2  min =   14.00  max =   16.08  avg =   15.12
        mobilenet_v3  min =   16.66  max =   19.62  avg =   18.51
          shufflenet  min =    8.72  max =   13.02  avg =   11.86
       shufflenet_v2  min =   12.82  max =   14.66  avg =   14.03
             mnasnet  min =   15.06  max =   17.55  avg =   16.12
     proxylessnasnet  min =   15.42  max =   17.28  avg =   16.59
     efficientnet_b0  min =   35.96  max =   41.24  avg =   37.89
   efficientnetv2_b0  min =   46.11  max =   65.75  avg =   58.52
        regnety_400m  min =   22.07  max =   26.40  avg =   24.43
           blazeface  min =    3.61  max =    6.26  avg =    4.53
           googlenet  min =   32.60  max =   37.05  avg =   34.55
      googlenet_int8  min =   21.79  max =   30.65  avg =   24.84
            resnet18  min =   19.46  max =   24.26  avg =   22.76
       resnet18_int8  min =   38.09  max =   40.42  avg =   38.44
             alexnet  min =   20.80  max =   28.44  avg =   26.86
               vgg16  min =   36.00  max =   44.01  avg =   39.18
          vgg16_int8  min =  201.54  max =  209.87  avg =  207.06
            resnet50  min =   42.50  max =   46.82  avg =   44.26
       resnet50_int8  min =   44.63  max =   47.47  avg =   45.15
      squeezenet_ssd  min =   33.19  max =   36.74  avg =   34.62
 squeezenet_ssd_int8  min =   22.40  max =   31.99  avg =   25.65
       mobilenet_ssd  min =   26.35  max =   29.79  avg =   28.09
  mobilenet_ssd_int8  min =   19.15  max =   20.86  avg =   19.48
      mobilenet_yolo  min =   28.42  max =   31.16  avg =   29.06
  mobilenetv2_yolov3  min =   33.86  max =   36.54  avg =   35.36
         yolov4-tiny  min =   46.51  max =   49.29  avg =   48.01
           nanodet_m  min =   17.14  max =   19.79  avg =   18.49
    yolo-fastest-1.1  min =    9.49  max =   15.00  avg =   13.59
      yolo-fastestv2  min =   11.65  max =   15.61  avg =   14.36
  vision_transformer  min =  650.85  max =  696.67  avg =  671.13
          FastestDet  min =    8.63  max =   13.12  avg =   11.39
```

### MediaTek Dimensity 9300 (MT6989) (Cortex-X4 3.25 GHz + 2.85 GHz x 3 + Cortex-A720 2.0 GHz x 4 + Mali-G720-Immortalis MC12)
```
k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 8 8 0 -1 1                                           
loop_count = 8
num_threads = 8
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =    1.87  max =    2.18  avg =    2.01
     squeezenet_int8  min =    1.52  max =    1.98  avg =    1.77
           mobilenet  min =    3.02  max =    3.34  avg =    3.15
      mobilenet_int8  min =    1.90  max =    2.27  avg =    2.04
        mobilenet_v2  min =    2.72  max =    3.13  avg =    2.89
        mobilenet_v3  min =    2.20  max =    3.82  avg =    2.78
          shufflenet  min =    1.97  max =    2.56  avg =    2.20
       shufflenet_v2  min =    1.77  max =    2.29  avg =    1.96
             mnasnet  min =    2.61  max =    3.48  avg =    2.90
     proxylessnasnet  min =    2.72  max =    3.06  avg =    2.89
     efficientnet_b0  min =    4.57  max =    5.17  avg =    4.89
   efficientnetv2_b0  min =    5.24  max =    6.72  avg =    5.81
        regnety_400m  min =    4.94  max =    6.78  avg =    5.70
           blazeface  min =    0.80  max =    1.02  avg =    0.91
           googlenet  min =    7.76  max =    8.53  avg =    8.12
      googlenet_int8  min =    5.68  max =    6.62  avg =    6.19
            resnet18  min =    5.35  max =    6.06  avg =    5.61
       resnet18_int8  min =    4.20  max =    4.40  avg =    4.29
             alexnet  min =    5.96  max =    7.30  avg =    6.77
               vgg16  min =   29.27  max =   30.58  avg =   29.93
          vgg16_int8  min =   26.72  max =   28.12  avg =   27.27
            resnet50  min =   15.21  max =   19.16  avg =   16.09
       resnet50_int8  min =    8.57  max =    9.16  avg =    8.91
      squeezenet_ssd  min =    6.29  max =    7.56  avg =    6.82
 squeezenet_ssd_int8  min =    5.57  max =    6.96  avg =    6.12
       mobilenet_ssd  min =    6.90  max =    8.90  avg =    7.55
  mobilenet_ssd_int8  min =    4.53  max =    5.22  avg =    4.86
      mobilenet_yolo  min =   16.88  max =   19.71  avg =   17.88
  mobilenetv2_yolov3  min =   10.51  max =   14.19  avg =   11.95
         yolov4-tiny  min =   12.81  max =   16.23  avg =   14.22
           nanodet_m  min =    4.38  max =    5.96  avg =    5.19
    yolo-fastest-1.1  min =    2.22  max =    3.08  avg =    2.73
      yolo-fastestv2  min =    2.09  max =    2.73  avg =    2.41
  vision_transformer  min =  193.39  max =  203.13  avg =  198.32
          FastestDet  min =    1.98  max =    2.35  avg =    2.16
k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 8 4 2 -1 1                                           
loop_count = 8
num_threads = 4
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =    2.23  max =    2.31  avg =    2.27
     squeezenet_int8  min =    1.68  max =    1.73  avg =    1.70
           mobilenet  min =    3.76  max =    3.86  avg =    3.81
      mobilenet_int8  min =    2.07  max =    2.16  avg =    2.11
        mobilenet_v2  min =    2.72  max =    2.95  avg =    2.80
        mobilenet_v3  min =    2.43  max =    2.51  avg =    2.47
          shufflenet  min =    1.78  max =    1.87  avg =    1.81
       shufflenet_v2  min =    1.61  max =    1.66  avg =    1.63
             mnasnet  min =    2.69  max =    2.82  avg =    2.76
     proxylessnasnet  min =    2.95  max =    3.13  avg =    3.05
     efficientnet_b0  min =    4.99  max =    5.29  avg =    5.08
   efficientnetv2_b0  min =    5.73  max =    5.86  avg =    5.79
        regnety_400m  min =    4.97  max =    5.04  avg =    5.00
           blazeface  min =    1.07  max =    1.17  avg =    1.10
           googlenet  min =    8.51  max =    9.43  avg =    8.75
      googlenet_int8  min =    6.01  max =    6.13  avg =    6.07
            resnet18  min =    6.72  max =    7.04  avg =    6.95
       resnet18_int8  min =    4.31  max =    4.40  avg =    4.34
             alexnet  min =    7.41  max =    7.71  avg =    7.57
               vgg16  min =   33.77  max =   34.68  avg =   34.08
          vgg16_int8  min =   32.61  max =   33.83  avg =   33.12
            resnet50  min =   18.76  max =   19.53  avg =   19.05
       resnet50_int8  min =    9.56  max =    9.70  avg =    9.61
      squeezenet_ssd  min =    6.86  max =    7.26  avg =    7.01
 squeezenet_ssd_int8  min =    5.42  max =    6.17  avg =    5.64
       mobilenet_ssd  min =    8.38  max =    9.14  avg =    8.62
  mobilenet_ssd_int8  min =    4.60  max =    4.90  avg =    4.69
      mobilenet_yolo  min =   19.59  max =   20.06  avg =   19.78
  mobilenetv2_yolov3  min =   10.46  max =   11.01  avg =   10.70
         yolov4-tiny  min =   13.46  max =   14.18  avg =   13.86
           nanodet_m  min =    4.52  max =    4.59  avg =    4.55
    yolo-fastest-1.1  min =    1.88  max =    1.94  avg =    1.91
      yolo-fastestv2  min =    1.73  max =    1.79  avg =    1.76
  vision_transformer  min =  220.32  max =  229.49  avg =  223.92
          FastestDet  min =    1.67  max =    1.73  avg =    1.70
k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 8 4 1 -1 1                                           
loop_count = 8
num_threads = 4
powersave = 1
gpu_device = -1
cooling_down = 1
          squeezenet  min =    3.42  max =    4.25  avg =    3.62
     squeezenet_int8  min =    2.63  max =    2.78  avg =    2.73
           mobilenet  min =    5.66  max =    6.25  avg =    5.82
      mobilenet_int8  min =    3.13  max =    5.66  avg =    3.58
        mobilenet_v2  min =    4.40  max =    4.46  avg =    4.42
        mobilenet_v3  min =    3.74  max =    4.07  avg =    3.94
          shufflenet  min =    2.77  max =    2.86  avg =    2.82
       shufflenet_v2  min =    2.52  max =    2.62  avg =    2.57
             mnasnet  min =    4.24  max =    4.37  avg =    4.28
     proxylessnasnet  min =    4.65  max =    4.91  avg =    4.74
     efficientnet_b0  min =    7.71  max =   10.00  avg =    8.08
   efficientnetv2_b0  min =    9.24  max =   10.34  avg =    9.87
        regnety_400m  min =    7.87  max =    8.35  avg =    8.02
           blazeface  min =    2.38  max =    2.46  avg =    2.40
           googlenet  min =   13.21  max =   13.78  avg =   13.40
      googlenet_int8  min =   10.23  max =   10.65  avg =   10.36
            resnet18  min =    9.25  max =    9.68  avg =    9.49
       resnet18_int8  min =    6.86  max =    6.97  avg =    6.91
             alexnet  min =    9.73  max =   10.53  avg =    9.97
               vgg16  min =   47.43  max =   48.12  avg =   47.78
          vgg16_int8  min =   47.08  max =   48.18  avg =   47.46
            resnet50  min =   26.82  max =   27.14  avg =   26.99
       resnet50_int8  min =   15.01  max =   15.57  avg =   15.20
      squeezenet_ssd  min =    9.96  max =   12.66  avg =   10.83
 squeezenet_ssd_int8  min =    8.47  max =    9.26  avg =    8.88
       mobilenet_ssd  min =   12.54  max =   13.25  avg =   12.82
  mobilenet_ssd_int8  min =    7.03  max =   10.91  avg =    7.94
      mobilenet_yolo  min =   29.73  max =   30.45  avg =   30.23
  mobilenetv2_yolov3  min =   16.64  max =   17.71  avg =   17.13
         yolov4-tiny  min =   22.25  max =   22.65  avg =   22.45
           nanodet_m  min =    7.56  max =    7.86  avg =    7.69
    yolo-fastest-1.1  min =    3.32  max =    3.45  avg =    3.39
      yolo-fastestv2  min =    2.76  max =    2.96  avg =    2.84
  vision_transformer  min =  328.11  max =  337.26  avg =  332.12
          FastestDet  min =    2.66  max =    2.77  avg =    2.71
k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 8 1 2 -1 1                                           
loop_count = 8
num_threads = 1
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =    5.27  max =    5.35  avg =    5.32
     squeezenet_int8  min =    3.06  max =    3.22  avg =    3.16
           mobilenet  min =    9.59  max =    9.85  avg =    9.74
      mobilenet_int8  min =    4.29  max =    4.45  avg =    4.37
        mobilenet_v2  min =    5.14  max =    5.33  avg =    5.20
        mobilenet_v3  min =    4.28  max =    4.54  avg =    4.42
          shufflenet  min =    3.18  max =    3.34  avg =    3.27
       shufflenet_v2  min =    2.78  max =    3.23  avg =    3.05
             mnasnet  min =    5.01  max =    5.38  avg =    5.19
     proxylessnasnet  min =    6.11  max =    6.30  avg =    6.21
     efficientnet_b0  min =   11.53  max =   11.78  avg =   11.66
   efficientnetv2_b0  min =   13.88  max =   14.28  avg =   14.13
        regnety_400m  min =    8.11  max =    8.18  avg =    8.16
           blazeface  min =    0.99  max =    1.08  avg =    1.01
           googlenet  min =   19.68  max =   20.71  avg =   20.25
      googlenet_int8  min =   13.42  max =   13.86  avg =   13.60
            resnet18  min =   18.10  max =   18.84  avg =   18.53
       resnet18_int8  min =    9.67  max =   10.17  avg =    9.99
             alexnet  min =   15.76  max =   16.35  avg =   16.03
               vgg16  min =   70.22  max =   72.85  avg =   71.58
          vgg16_int8  min =   76.83  max =   79.70  avg =   78.45
            resnet50  min =   39.73  max =   41.24  avg =   40.30
       resnet50_int8  min =   20.76  max =   21.54  avg =   21.27
      squeezenet_ssd  min =   12.63  max =   18.67  avg =   15.20
 squeezenet_ssd_int8  min =   10.29  max =   16.13  avg =   14.13
       mobilenet_ssd  min =   17.21  max =   18.43  avg =   17.68
  mobilenet_ssd_int8  min =    8.92  max =    9.49  avg =    9.07
      mobilenet_yolo  min =   37.45  max =   38.29  avg =   37.88
  mobilenetv2_yolov3  min =   19.18  max =   19.83  avg =   19.58
         yolov4-tiny  min =   27.06  max =   27.86  avg =   27.45
           nanodet_m  min =    9.33  max =    9.50  avg =    9.42
    yolo-fastest-1.1  min =    3.48  max =    3.59  avg =    3.54
      yolo-fastestv2  min =    2.29  max =    2.37  avg =    2.33
  vision_transformer  min =  730.38  max =  739.99  avg =  735.77
          FastestDet  min =    2.40  max =    2.48  avg =    2.43
k6989v1_64:/data/local/tmp/benchmark # ../build-android/benchmark/benchncnn 64 1 2 0 0                                           
[0 Mali-G720-Immortalis MC12]  queueC=0[2]  queueG=0[2]  queueT=0[2]
[0 Mali-G720-Immortalis MC12]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 Mali-G720-Immortalis MC12]  fp16-p/s/a=1/1/1  int8-p/s/a=1/1/1
[0 Mali-G720-Immortalis MC12]  subgroup=16  basic/vote/ballot/shuffle=1/1/1/1
[0 Mali-G720-Immortalis MC12]  fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0
loop_count = 64
num_threads = 1
powersave = 2
gpu_device = 0
cooling_down = 0
          squeezenet  min =   11.26  max =   13.58  avg =   12.32
     squeezenet_int8  min =    3.08  max =    3.29  avg =    3.17
           mobilenet  min =   11.96  max =   14.52  avg =   13.48
      mobilenet_int8  min =    4.20  max =    4.58  avg =    4.34
        mobilenet_v2  min =   13.62  max =   16.46  avg =   14.62
        mobilenet_v3  min =   13.98  max =   17.16  avg =   15.25
          shufflenet  min =   10.22  max =   11.82  avg =   11.07
       shufflenet_v2  min =   12.42  max =   15.39  avg =   14.35
             mnasnet  min =   12.94  max =   16.30  avg =   14.91
     proxylessnasnet  min =   13.18  max =   16.55  avg =   15.05
     efficientnet_b0  min =   16.70  max =   20.35  avg =   18.27
   efficientnetv2_b0  min =   54.09  max =   70.05  avg =   58.68
        regnety_400m  min =   16.20  max =   18.42  avg =   17.27
           blazeface  min =    6.50  max =    7.86  avg =    6.93
           googlenet  min =   15.29  max =   17.54  avg =   16.19
      googlenet_int8  min =   20.38  max =   22.08  avg =   20.98
            resnet18  min =   12.22  max =   15.63  avg =   14.27
       resnet18_int8  min =    9.50  max =   10.46  avg =    9.75
             alexnet  min =   12.00  max =   16.09  avg =   13.65
               vgg16  min =   31.06  max =   32.77  avg =   31.85
          vgg16_int8  min =  115.72  max =  123.71  avg =  118.23
            resnet50  min =   15.74  max =   16.53  avg =   16.10
       resnet50_int8  min =   32.43  max =   33.78  avg =   33.07
      squeezenet_ssd  min =   17.24  max =   21.80  avg =   20.68
 squeezenet_ssd_int8  min =    9.69  max =   10.52  avg =    9.97
       mobilenet_ssd  min =   15.32  max =   17.63  avg =   16.62
  mobilenet_ssd_int8  min =    8.84  max =    9.54  avg =    9.05
      mobilenet_yolo  min =   16.67  max =   18.21  avg =   17.25
  mobilenetv2_yolov3  min =   20.08  max =   25.40  avg =   23.12
         yolov4-tiny  min =   21.98  max =   29.67  avg =   24.75
           nanodet_m  min =   23.19  max =   29.95  avg =   25.69
    yolo-fastest-1.1  min =   15.07  max =   17.78  avg =   16.49
      yolo-fastestv2  min =   14.67  max =   16.07  avg =   15.44
  vision_transformer  min =  768.04  max =  801.48  avg =  786.79
          FastestDet  min =    8.33  max =   16.07  avg =   14.38
```

### Xeon Phi 3120A (1.10 GHz 57-core 228-thread)

- Host: CentOS 7.9
- Compiler: icc & icpc (ICC) 17.0.2 20170213
- ncnn tag: 20240102

Build command

```bash
$ CC=icc CXX=icpc CFLAGS="-mmic" CXXFLAGS="-mmic" cmake .. -DCMAKE_BUILD_TYPE=Release -DNCNN_SSE2=OFF -DNCNN_AVX=OFF -DNCNN_AVX2=OFF
```

Copy the whole `ncnn` directory and the libraries in `/opt/intel/compilers_and_libraries_2017/linux/lib/mic/lib` to `mic0`, then set the `LD_LIBRARY_PATH` environment variable. Some tools cannot be built, but `benchncnn` should work. The built `benchncnn` binary targets the Intel Xeon Phi coprocessor (k1om).

```bash
[mizu-bai@DESKTOP-1D9QDE1-mic0 benchmark]$ file benchncnn 
benchncnn: ELF 64-bit LSB executable, Intel Xeon Phi coprocessor (k1om), version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.32, not stripped
```

The benchmark is run in native mode: ssh into the Xeon Phi with `ssh user@mic0`, then run `benchncnn` as you would on any ordinary Linux system.

```
[mizu-bai@DESKTOP-1D9QDE1-mic0 benchmark]$ KMP_AFFINITY=scatter ../build/benchmark/benchncnn 4 56 0 -1 1
loop_count = 4
num_threads = 56
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   43.42  max =   44.20  avg =   43.64
     squeezenet_int8  min =  161.92  max =  162.41  avg =  162.15
           mobilenet  min =   44.49  max =   46.90  avg =   45.68
      mobilenet_int8  min =  230.47  max =  232.40  avg =  231.77
        mobilenet_v2  min =   57.22  max =   62.03  avg =   59.42
        mobilenet_v3  min =  301.16  max =  306.62  avg =  303.90
          shufflenet  min =   65.80  max =   70.18  avg =   67.70
       shufflenet_v2  min =   49.54  max =   53.17  avg =   51.22
             mnasnet  min =  521.87  max =  527.76  avg =  524.63
     proxylessnasnet  min =  745.79  max =  748.55  avg =  746.92
     efficientnet_b0  min =  582.21  max =  584.64  avg =  583.34
   efficientnetv2_b0  min =   84.13  max =   86.13  avg =   85.19
        regnety_400m  min =  209.67  max =  214.84  avg =  212.39
           blazeface  min =   26.33  max =   27.39  avg =   26.74
           googlenet  min =  124.14  max =  125.72  avg =  124.83
      googlenet_int8  min =  498.36  max =  502.37  avg =  500.29
            resnet18  min =   87.86  max =   88.83  avg =   88.35
       resnet18_int8  min =  359.50  max =  360.71  avg =  360.11
             alexnet  min =   49.87  max =   51.25  avg =   50.76
               vgg16  min =  341.87  max =  343.92  avg =  342.42
          vgg16_int8  min = 1649.34  max = 1655.37  avg = 1652.98
            resnet50  min =  198.91  max =  202.32  avg =  200.58
       resnet50_int8  min =  983.48  max =  988.73  avg =  986.22
      squeezenet_ssd  min =  108.33  max =  111.45  avg =  110.18
 squeezenet_ssd_int8  min =  368.96  max =  370.30  avg =  369.54
       mobilenet_ssd  min =   98.29  max =  101.49  avg =   99.99
  mobilenet_ssd_int8  min =  462.18  max =  466.20  avg =  464.85
      mobilenet_yolo  min =  262.42  max =  266.84  avg =  263.91
  mobilenetv2_yolov3  min =  159.20  max =  161.58  avg =  160.66
         yolov4-tiny  min =  229.22  max =  230.48  avg =  229.87
           nanodet_m  min =  115.10  max =  116.78  avg =  115.86
    yolo-fastest-1.1  min =  154.48  max =  155.33  avg =  154.79
      yolo-fastestv2  min =  161.10  max =  163.98  avg =  161.88
  vision_transformer  min =  848.51  max =  863.03  avg =  854.92
          FastestDet  min =  251.64  max =  253.22  avg =  252.38
[mizu-bai@DESKTOP-1D9QDE1-mic0 benchmark]$ KMP_AFFINITY=scatter ../build/benchmark/benchncnn 4 112 0 -1 1
loop_count = 4
num_threads = 112
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   41.07  max =   41.19  avg =   41.12
     squeezenet_int8  min =  161.73  max =  163.90  avg =  162.74
           mobilenet  min =   36.82  max =   37.53  avg =   37.11
      mobilenet_int8  min =  231.50  max =  233.81  avg =  232.65
        mobilenet_v2  min =   53.12  max =   55.87  avg =   54.44
        mobilenet_v3  min =  277.82  max =  280.61  avg =  279.66
          shufflenet  min =   64.11  max =   64.92  avg =   64.63
       shufflenet_v2  min =   48.23  max =   50.00  avg =   49.19
             mnasnet  min =  532.09  max =  534.73  avg =  533.34
     proxylessnasnet  min =  760.43  max =  763.94  avg =  762.34
     efficientnet_b0  min =  534.29  max =  547.51  avg =  541.29
   efficientnetv2_b0  min =   75.94  max =   76.88  avg =   76.39
        regnety_400m  min =  226.37  max =  227.81  avg =  227.23
           blazeface  min =   26.03  max =   26.93  avg =   26.51
           googlenet  min =  106.53  max =  107.54  avg =  107.06
      googlenet_int8  min =  503.01  max =  505.16  avg =  504.13
            resnet18  min =   73.63  max =   76.61  avg =   75.11
       resnet18_int8  min =  358.18  max =  359.50  avg =  358.99
             alexnet  min =   37.40  max =   38.17  avg =   37.83
               vgg16  min =  244.95  max =  250.05  avg =  247.24
          vgg16_int8  min = 1511.89  max = 1512.66  avg = 1512.35
            resnet50  min =  151.99  max =  154.66  avg =  153.37
       resnet50_int8  min =  954.16  max =  957.63  avg =  956.55
      squeezenet_ssd  min =   91.46  max =   97.18  avg =   94.00
 squeezenet_ssd_int8  min =  368.03  max =  375.96  avg =  370.99
       mobilenet_ssd  min =   79.61  max =   81.38  avg =   80.33
  mobilenet_ssd_int8  min =  458.93  max =  463.41  avg =  461.63
      mobilenet_yolo  min =  234.59  max =  236.91  avg =  235.43
  mobilenetv2_yolov3  min =  145.82  max =  146.92  avg =  146.23
         yolov4-tiny  min =  219.22  max =  220.51  avg =  219.83
           nanodet_m  min =  109.43  max =  113.94  avg =  112.20
    yolo-fastest-1.1  min =  158.13  max =  160.59  avg =  159.20
      yolo-fastestv2  min =  162.05  max =  162.80  avg =  162.47
  vision_transformer  min =  615.14  max =  625.35  avg =  618.47
          FastestDet  min =  279.98  max =  282.49  avg =  281.14
[mizu-bai@DESKTOP-1D9QDE1-mic0 benchmark]$ KMP_AFFINITY=scatter ../build/benchmark/benchncnn 4 224 0 -1 1
loop_count = 4
num_threads = 224
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   45.54  max =   46.81  avg =   46.13
     squeezenet_int8  min =  186.81  max =  187.14  avg =  186.97
           mobilenet  min =   38.33  max =   39.11  avg =   38.64
      mobilenet_int8  min =  251.06  max =  251.91  avg =  251.40
        mobilenet_v2  min =   56.57  max =   57.15  avg =   56.88
        mobilenet_v3  min =  365.04  max =  366.87  avg =  365.94
          shufflenet  min =   71.16  max =   72.02  avg =   71.68
       shufflenet_v2  min =   52.14  max =   53.60  avg =   52.92
             mnasnet  min =  596.37  max =  603.62  avg =  600.50
     proxylessnasnet  min =  911.84  max =  912.23  avg =  912.04
     efficientnet_b0  min =  611.77  max =  614.32  avg =  612.69
   efficientnetv2_b0  min =   82.16  max =   83.05  avg =   82.62
        regnety_400m  min =  253.43  max =  255.79  avg =  254.66
           blazeface  min =   30.54  max =   30.91  avg =   30.70
           googlenet  min =  111.68  max =  112.65  avg =  112.11
      googlenet_int8  min =  594.07  max =  597.09  avg =  596.03
            resnet18  min =   78.14  max =   79.12  avg =   78.75
       resnet18_int8  min =  412.69  max =  413.92  avg =  413.46
             alexnet  min =   40.93  max =   41.43  avg =   41.17
               vgg16  min =  242.45  max =  244.46  avg =  243.47
          vgg16_int8  min = 1545.61  max = 1548.72  avg = 1547.47
            resnet50  min =  147.73  max =  148.56  avg =  148.07
       resnet50_int8  min = 1034.47  max = 1042.31  avg = 1038.41
      squeezenet_ssd  min =  107.82  max =  110.53  avg =  108.98
 squeezenet_ssd_int8  min =  423.30  max =  426.91  avg =  425.67
       mobilenet_ssd  min =   74.54  max =   77.13  avg =   75.97
  mobilenet_ssd_int8  min =  510.95  max =  513.33  avg =  512.40
      mobilenet_yolo  min =  238.83  max =  239.64  avg =  239.27
  mobilenetv2_yolov3  min =  159.80  max =  160.31  avg =  160.04
         yolov4-tiny  min =  233.89  max =  237.41  avg =  236.22
           nanodet_m  min =  122.39  max =  123.42  avg =  122.89
    yolo-fastest-1.1  min =  194.49  max =  195.25  avg =  194.94
      yolo-fastestv2  min =  193.06  max =  195.03  avg =  194.05
  vision_transformer  min =  547.36  max =  554.17  avg =  549.99
          FastestDet  min =  317.76  max =  321.38  avg =  320.18
```

### PhytiumPi, Phytium E2000 (FTC664@1.8GHz x2 + FTC310@1.5GHz x2)
```
loop_count = 4
num_threads = 2
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =   43.84  max =   43.95  avg =   43.88
     squeezenet_int8  min =   35.48  max =   35.77  avg =   35.66
           mobilenet  min =   69.31  max =   70.03  avg =   69.66
      mobilenet_int8  min =   42.30  max =   42.40  avg =   42.35
        mobilenet_v2  min =   59.07  max =   59.35  avg =   59.19
        mobilenet_v3  min =   46.02  max =   46.37  avg =   46.19
          shufflenet  min =   31.52  max =   31.61  avg =   31.56
       shufflenet_v2  min =   23.99  max =   24.07  avg =   24.04
             mnasnet  min =   49.40  max =   50.45  avg =   49.92
     proxylessnasnet  min =   53.24  max =   53.85  avg =   53.53
     efficientnet_b0  min =   77.49  max =   77.84  avg =   77.62
   efficientnetv2_b0  min =   88.51  max =   88.92  avg =   88.69
        regnety_400m  min =   66.99  max =   67.05  avg =   67.03
           blazeface  min =    7.74  max =    8.14  avg =    7.98
           googlenet  min =  126.62  max =  127.23  avg =  126.91
      googlenet_int8  min =  102.87  max =  103.16  avg =  103.01
            resnet18  min =  102.28  max =  102.63  avg =  102.48
       resnet18_int8  min =   72.01  max =   72.45  avg =   72.29
             alexnet  min =   76.00  max =  124.61  avg =   88.24
               vgg16  min =  597.75  max =  601.99  avg =  599.44
          vgg16_int8  min =  421.40  max =  423.83  avg =  423.01
            resnet50  min =  278.16  max =  280.64  avg =  279.37
       resnet50_int8  min =  207.26  max =  207.47  avg =  207.36
      squeezenet_ssd  min =  108.69  max =  109.26  avg =  108.99
 squeezenet_ssd_int8  min =   84.05  max =   84.60  avg =   84.28
       mobilenet_ssd  min =  141.65  max =  142.46  avg =  142.14
  mobilenet_ssd_int8  min =   84.43  max =   84.99  avg =   84.73
      mobilenet_yolo  min =  322.53  max =  325.15  avg =  323.51
  mobilenetv2_yolov3  min =  194.84  max =  196.98  avg =  196.07
         yolov4-tiny  min =  208.29  max =  213.26  avg =  210.77
           nanodet_m  min =   64.78  max =   65.38  avg =   65.08
    yolo-fastest-1.1  min =   37.89  max =   38.23  avg =   38.07
      yolo-fastestv2  min =   29.75  max =   30.33  avg =   30.09
  vision_transformer  min = 4257.71  max = 4263.73  avg = 4260.60
          FastestDet  min =   30.86  max =   44.67  avg =   34.41
```

### AMD EPYC 7742 (2.25GHz) Ubuntu 22.04 AOCC_4.2.0-Build#89

single core

```
# nice -20 ../build-host-aocc-linux/benchmark/benchncnn 100 1 0 -1 0
loop_count = 100
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    9.26  max =   10.05  avg =    9.45
     squeezenet_int8  min =    9.54  max =   13.35  avg =    9.67
           mobilenet  min =   16.20  max =   16.83  avg =   16.35
      mobilenet_int8  min =   16.79  max =   17.28  avg =   16.89
        mobilenet_v2  min =   10.69  max =   11.13  avg =   10.78
        mobilenet_v3  min =    8.87  max =   14.09  avg =    9.03
          shufflenet  min =    4.99  max =    5.29  avg =    5.06
       shufflenet_v2  min =    5.61  max =    7.14  avg =    5.66
             mnasnet  min =   11.94  max =   12.39  avg =   12.05
     proxylessnasnet  min =   13.48  max =   16.57  avg =   13.62
     efficientnet_b0  min =   19.58  max =   20.34  avg =   19.73
   efficientnetv2_b0  min =   22.66  max =   23.63  avg =   22.89
        regnety_400m  min =   14.89  max =   18.76  avg =   15.11
           blazeface  min =    1.45  max =    1.59  avg =    1.51
           googlenet  min =   35.38  max =   36.94  avg =   35.79
      googlenet_int8  min =   30.55  max =   42.18  avg =   30.88
            resnet18  min =   34.73  max =   48.15  avg =   35.43
       resnet18_int8  min =   27.39  max =   28.22  avg =   27.61
             alexnet  min =   31.42  max =   32.26  avg =   31.64
               vgg16  min =  160.38  max =  172.02  avg =  162.52
          vgg16_int8  min =  134.03  max =  153.69  avg =  135.12
            resnet50  min =   85.47  max =   87.90  avg =   86.21
       resnet50_int8  min =   71.18  max =   80.37  avg =   71.70
      squeezenet_ssd  min =   24.66  max =   25.71  avg =   24.84
 squeezenet_ssd_int8  min =   23.61  max =   24.28  avg =   23.78
       mobilenet_ssd  min =   34.48  max =   35.69  avg =   34.64
  mobilenet_ssd_int8  min =   33.26  max =   34.32  avg =   33.45
      mobilenet_yolo  min =   77.25  max =   86.54  avg =   77.73
  mobilenetv2_yolov3  min =   41.72  max =   42.92  avg =   42.02
         yolov4-tiny  min =   57.61  max =   59.49  avg =   58.46
           nanodet_m  min =   12.92  max =   13.39  avg =   13.03
    yolo-fastest-1.1  min =    5.02  max =    5.26  avg =    5.11
      yolo-fastestv2  min =    5.06  max =    5.20  avg =    5.09
  vision_transformer  min =  637.63  max =  670.46  avg =  640.60
          FastestDet  min =    5.59  max =    5.82  avg =    5.66
```

64 cores

```
# nice -20 ../build-host-aocc-linux/benchmark/benchncnn 300 64 0 -1 0
loop_count = 300
num_threads = 64
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =    4.19  max =   13.94  avg =    5.06
     squeezenet_int8  min =    4.93  max =   13.59  avg =    5.14
           mobilenet  min =    3.29  max =    5.28  avg =    3.39
      mobilenet_int8  min =    2.32  max =    3.32  avg =    2.40
        mobilenet_v2  min =    4.58  max =    8.64  avg =    4.76
        mobilenet_v3  min =    4.11  max =    6.89  avg =    4.88
          shufflenet  min =    5.67  max =    8.60  avg =    5.92
       shufflenet_v2  min =    4.83  max =    6.29  avg =    5.02
             mnasnet  min =    4.08  max =   12.75  avg =    4.29
     proxylessnasnet  min =    4.46  max =    7.28  avg =    4.68
     efficientnet_b0  min =    5.51  max =   11.67  avg =    6.33
   efficientnetv2_b0  min =    7.50  max =   11.30  avg =    9.34
        regnety_400m  min =   12.50  max =   20.88  avg =   12.76
           blazeface  min =    1.67  max =    3.37  avg =    1.76
           googlenet  min =   10.64  max =   11.59  avg =   10.87
      googlenet_int8  min =    8.49  max =   17.88  avg =    9.90
            resnet18  min =    6.36  max =    6.88  avg =    6.48
       resnet18_int8  min =    4.65  max =   13.03  avg =    4.77
             alexnet  min =    3.88  max =    4.62  avg =    3.97
               vgg16  min =   26.00  max =   36.86  avg =   27.25
          vgg16_int8  min =   17.75  max =   19.63  avg =   18.42
            resnet50  min =   13.94  max =   23.10  avg =   14.17
       resnet50_int8  min =    8.73  max =   18.32  avg =    8.92
      squeezenet_ssd  min =   10.39  max =   12.10  avg =   10.77
 squeezenet_ssd_int8  min =   11.53  max =   20.24  avg =   12.01
       mobilenet_ssd  min =    6.80  max =    8.16  avg =    6.96
  mobilenet_ssd_int8  min =    4.98  max =    5.21  avg =    5.07
      mobilenet_yolo  min =   17.75  max =   30.34  avg =   18.29
  mobilenetv2_yolov3  min =   13.74  max =   15.69  avg =   14.18
         yolov4-tiny  min =   21.27  max =   29.53  avg =   22.81
           nanodet_m  min =   10.22  max =   12.25  avg =   10.89
    yolo-fastest-1.1  min =    5.56  max =    6.03  avg =    5.66
      yolo-fastestv2  min =    5.61  max =    5.78  avg =    5.67
  vision_transformer  min =   69.07  max =  508.15  avg =   71.73
          FastestDet  min =    5.74  max =    6.83  avg =    5.81
```

### NVIDIA Tesla V100-PCIE-32GB  (GV100 SM x 80 + Tensor Core x 640)

```
# ../build-host-gcc-vk-linux/benchmark/benchncnn 300 1 0 0 0
[0 Tesla V100-PCIE-32GB]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[0 Tesla V100-PCIE-32GB]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 Tesla V100-PCIE-32GB]  fp16-p/s/u/a=1/1/1/1  int8-p/s/u/a=1/1/1/1
[0 Tesla V100-PCIE-32GB]  subgroup=32  basic/vote/ballot/shuffle=1/1/1/1
[0 Tesla V100-PCIE-32GB]  fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0
[1 llvmpipe (LLVM 15.0.7, 256 bits)]  queueC=0[1]  queueG=0[1]  queueT=0[1]
[1 llvmpipe (LLVM 15.0.7, 256 bits)]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[1 llvmpipe (LLVM 15.0.7, 256 bits)]  fp16-p/s/u/a=1/1/1/1  int8-p/s/u/a=1/1/1/1
[1 llvmpipe (LLVM 15.0.7, 256 bits)]  subgroup=8  basic/vote/ballot/shuffle=1/1/1/1
[1 llvmpipe (LLVM 15.0.7, 256 bits)]  fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0
[2 Tesla V100-PCIE-32GB]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[2 Tesla V100-PCIE-32GB]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[2 Tesla V100-PCIE-32GB]  fp16-p/s/u/a=1/1/1/1  int8-p/s/u/a=1/1/1/1
[2 Tesla V100-PCIE-32GB]  subgroup=32  basic/vote/ballot/shuffle=1/1/1/1
[2 Tesla V100-PCIE-32GB]  fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0
[3 Tesla V100-PCIE-32GB]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[3 Tesla V100-PCIE-32GB]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[3 Tesla V100-PCIE-32GB]  fp16-p/s/u/a=1/1/1/1  int8-p/s/u/a=1/1/1/1
[3 Tesla V100-PCIE-32GB]  subgroup=32  basic/vote/ballot/shuffle=1/1/1/1
[3 Tesla V100-PCIE-32GB]  fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0
[4 Tesla V100-PCIE-32GB]  queueC=2[8]  queueG=0[16]  queueT=1[2]
[4 Tesla V100-PCIE-32GB]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[4 Tesla V100-PCIE-32GB]  fp16-p/s/u/a=1/1/1/1  int8-p/s/u/a=1/1/1/1
[4 Tesla V100-PCIE-32GB]  subgroup=32  basic/vote/ballot/shuffle=1/1/1/1
[4 Tesla V100-PCIE-32GB]  fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0
loop_count = 300
num_threads = 1
powersave = 0
gpu_device = 0
cooling_down = 0
          squeezenet  min =    1.16  max =   16.79  avg =    1.64
     squeezenet_int8  min =    9.03  max =   10.06  avg =    9.15
           mobilenet  min =    1.05  max =    2.60  avg =    1.25
      mobilenet_int8  min =   16.78  max =   19.89  avg =   16.93
        mobilenet_v2  min =    1.60  max =    3.29  avg =    1.76
        mobilenet_v3  min =    1.84  max =    8.43  avg =    2.04
          shufflenet  min =    1.35  max =    3.73  avg =    1.54
       shufflenet_v2  min =    1.66  max =    8.02  avg =    1.93
             mnasnet  min =    1.69  max =    3.31  avg =    1.82
     proxylessnasnet  min =    1.74  max =    3.70  avg =    1.89
     efficientnet_b0  min =    2.86  max =    5.21  avg =    3.02
   efficientnetv2_b0  min =   60.41  max =   80.28  avg =   69.51
        regnety_400m  min =    2.38  max =    6.84  avg =    2.57
           blazeface  min =    0.85  max =    3.50  avg =    0.96
           googlenet  min =    3.69  max =   16.66  avg =    4.10
      googlenet_int8  min =   33.66  max =   47.27  avg =   34.32
            resnet18  min =    1.76  max =    7.58  avg =    1.95
       resnet18_int8  min =   27.12  max =   36.43  avg =   27.62
             alexnet  min =    1.33  max =    2.97  avg =    1.49
               vgg16  min =    2.98  max =    4.60  avg =    3.17
          vgg16_int8  min =  133.97  max =  154.41  avg =  136.22
            resnet50  min =    3.42  max =   17.05  avg =    3.72
       resnet50_int8  min =   70.53  max =   93.57  avg =   71.96
      squeezenet_ssd  min =   16.88  max =   22.55  avg =   18.49
 squeezenet_ssd_int8  min =   23.12  max =   30.45  avg =   23.50
       mobilenet_ssd  min =    5.44  max =    7.09  avg =    5.93
  mobilenet_ssd_int8  min =   33.28  max =   38.92  avg =   33.62
      mobilenet_yolo  min =    5.67  max =    7.66  avg =    6.26
  mobilenetv2_yolov3  min =    6.33  max =    7.89  avg =    6.67
         yolov4-tiny  min =   14.66  max =   17.29  avg =   15.57
           nanodet_m  min =    5.36  max =   16.11  avg =    5.95
    yolo-fastest-1.1  min =    5.60  max =    7.45  avg =    6.13
      yolo-fastestv2  min =    3.48  max =    5.29  avg =    3.96
  vision_transformer  min =  153.75  max =  198.81  avg =  165.58
          FastestDet  min =    3.01  max =    5.01  avg =    3.29
```

### AXERA AX630C (Cortex-A53 1.2GHz * 2)

```
# ~/ncnn/build-aarch64-linux-gnu/benchmark # ./benchncnn 4 1 0 -1 0
loop_count = 4
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =  129.78  max =  130.30  avg =  130.09
     squeezenet_int8  min =  123.08  max =  123.48  avg =  123.22
           mobilenet  min =  211.46  max =  221.68  avg =  214.14
      mobilenet_int8  min =  196.00  max =  212.73  avg =  200.23
        mobilenet_v2  min =  149.15  max =  149.21  avg =  149.17
        mobilenet_v3  min =  124.70  max =  125.54  avg =  125.08
          shufflenet  min =   80.75  max =   80.88  avg =   80.81
       shufflenet_v2  min =   74.30  max =   74.50  avg =   74.37
             mnasnet  min =  148.87  max =  165.85  avg =  153.26
     proxylessnasnet  min =  203.05  max =  213.50  avg =  205.82
     efficientnet_b0  min =  270.39  max =  280.59  avg =  273.13
   efficientnetv2_b0  min =  302.93  max =  318.07  avg =  307.30
        regnety_400m  min =  187.47  max =  187.90  avg =  187.60
           blazeface  min =   22.64  max =   22.78  avg =   22.72
           googlenet  min =  487.36  max =  503.50  avg =  493.93
      googlenet_int8  min =  418.16  max =  434.44  avg =  426.09
       resnet18_int8  min =  290.39  max =  301.90  avg =  293.70
       resnet50_int8  min =  888.81  max =  898.34  avg =  895.92
      squeezenet_ssd  min =  320.78  max =  330.33  avg =  323.54
 squeezenet_ssd_int8  min =  281.52  max =  299.11  avg =  286.89
       mobilenet_ssd  min =  435.79  max =  452.66  avg =  444.19
  mobilenet_ssd_int8  min =  394.38  max =  411.09  avg =  398.65
      mobilenet_yolo  min =  955.48  max =  972.38  avg =  967.52
  mobilenetv2_yolov3  min =  519.47  max =  536.58  avg =  524.25
      yolo-fastestv2  min =   73.94  max =   74.15  avg =   74.05
          FastestDet  min =   81.89  max =   82.07  avg =   81.98
          
# ~/ncnn/build-aarch64-linux-gnu/benchmark # ./benchncnn 4 2 0 -1 0
loop_count = 4
num_threads = 2
powersave = 0
gpu_device = -1
cooling_down = 0
          squeezenet  min =   75.14  max =   88.89  avg =   79.06
     squeezenet_int8  min =   70.11  max =   85.48  avg =   74.32
           mobilenet  min =  112.72  max =  124.85  avg =  115.87
      mobilenet_int8  min =  100.35  max =  100.58  avg =  100.49
        mobilenet_v2  min =   85.92  max =   86.20  avg =   86.03
        mobilenet_v3  min =   73.94  max =   74.34  avg =   74.20
          shufflenet  min =   53.99  max =   66.11  avg =   57.63
       shufflenet_v2  min =   47.47  max =   47.72  avg =   47.59
             mnasnet  min =   85.96  max =   86.27  avg =   86.13
     proxylessnasnet  min =  111.15  max =  121.84  avg =  113.92
     efficientnet_b0  min =  149.72  max =  150.00  avg =  149.85
   efficientnetv2_b0  min =  168.84  max =  170.57  avg =  169.35
        regnety_400m  min =  120.42  max =  135.50  avg =  124.26
           blazeface  min =   14.27  max =   14.48  avg =   14.39
           googlenet  min =  263.82  max =  274.74  avg =  266.84
      googlenet_int8  min =  226.91  max =  227.36  avg =  227.23
       resnet18_int8  min =  157.66  max =  168.11  avg =  160.57
       resnet50_int8  min =  469.84  max =  484.00  avg =  476.59
      squeezenet_ssd  min =  190.23  max =  204.41  avg =  193.99
 squeezenet_ssd_int8  min =  162.73  max =  174.30  avg =  165.79
       mobilenet_ssd  min =  236.26  max =  251.16  avg =  240.34
  mobilenet_ssd_int8  min =  203.22  max =  212.01  avg =  206.00
      mobilenet_yolo  min =  522.45  max =  537.99  avg =  529.95
  mobilenetv2_yolov3  min =  300.33  max =  316.59  avg =  304.89
      yolo-fastestv2  min =   50.27  max =   50.62  avg =   50.43
          FastestDet  min =   53.34  max =   53.64  avg =   53.51
```

### Spacemit MUSE Pi Pro Spacemit M1 (Spacemit X60 *8 + PowerVR B-Series BXE-2-32 MC1)
```
root@spacemit-k1-x-MUSE-Pi-Pro-board:/home/yingxi/ncnn/build/benchmark# ./benchncnn 4 8 2 -1 1
loop_count = 4
num_threads = 8
powersave = 2
gpu_device = -1
cooling_down = 1
          squeezenet  min =  192.55  max =  203.73  avg =  195.61
     squeezenet_int8  min =  863.38  max =  875.44  avg =  867.96
           mobilenet  min =  260.32  max =  274.70  avg =  266.42
      mobilenet_int8  min = 1287.80  max = 1606.98  avg = 1461.52
        mobilenet_v2  min =  168.08  max =  173.99  avg =  169.97
        mobilenet_v3  min =  141.06  max =  166.83  avg =  147.74
          shufflenet  min =   82.91  max =   92.83  avg =   85.57
       shufflenet_v2  min =   83.11  max =   83.35  avg =   83.26
             mnasnet  min =  168.99  max =  180.35  avg =  171.95
     proxylessnasnet  min =  186.14  max =  194.56  avg =  188.91
     efficientnet_b0  min =  257.93  max =  263.18  avg =  259.94
   efficientnetv2_b0  min =  385.35  max =  394.09  avg =  388.57
        regnety_400m  min =  228.02  max =  229.55  avg =  228.88
           blazeface  min =   26.78  max =   27.43  avg =   26.97
           googlenet  min =  781.12  max =  796.37  avg =  788.60
      googlenet_int8  min = 2422.82  max = 2441.75  avg = 2432.78
            resnet18  min =  864.67  max =  874.15  avg =  869.32
       resnet18_int8  min = 2409.34  max = 2728.57  avg = 2530.44
             alexnet  min =  389.93  max =  393.67  avg =  391.77
               vgg16  min = 8213.96  max = 8957.49  avg = 8405.27
          vgg16_int8  min = 34268.94  max = 36044.89  avg = 35244.72
            resnet50  min = 1798.75  max = 1859.80  avg = 1825.00
       resnet50_int8  min = 7364.21  max = 7500.24  avg = 7428.21
      squeezenet_ssd  min =  693.59  max =  701.68  avg =  697.60
 squeezenet_ssd_int8  min = 1447.64  max = 1461.21  avg = 1455.02
       mobilenet_ssd  min =  530.90  max =  542.81  avg =  534.42
  mobilenet_ssd_int8  min = 4347.45  max = 4391.44  avg = 4377.68
      mobilenet_yolo  min = 1285.07  max = 1369.59  avg = 1312.64
  mobilenetv2_yolov3  min =  605.19  max =  628.05  avg =  616.37
         yolov4-tiny  min = 1743.00  max = 1751.39  avg = 1748.09
           nanodet_m  min =  201.46  max =  202.80  avg =  202.03
    yolo-fastest-1.1  min =   97.02  max =   98.29  avg =   97.71
      yolo-fastestv2  min =   75.53  max =   76.62  avg =   76.20
  vision_transformer  min = 11328.10  max = 11334.80  avg = 11332.34
          FastestDet  min =   85.01  max =   86.04  avg =   85.45

root@spacemit-k1-x-MUSE-Pi-Pro-board:/home/yingxi/ncnn/build/benchmark# ./benchncnn 4 8 2 0 1
[0 PowerVR B-Series BXE-2-32 MC1]  queueC=0[2]  queueG=0[2]  queueT=0[2]
[0 PowerVR B-Series BXE-2-32 MC1]  bugsbn1=0  bugbilz=0  bugcopc=0  bugihfa=0
[0 PowerVR B-Series BXE-2-32 MC1]  fp16-p/s/u/a=1/1/1/1  int8-p/s/u/a=1/1/1/1
[0 PowerVR B-Series BXE-2-32 MC1]  subgroup=1(1~1)  ops=1/1/1/1/1/1/0/0/1/1
[0 PowerVR B-Series BXE-2-32 MC1]  fp16-8x8x16/16x8x8/16x8x16/16x16x16=0/0/0/0
loop_count = 4
num_threads = 8
powersave = 2
gpu_device = 0
cooling_down = 1
          squeezenet  min =  381.51  max =  382.05  avg =  381.73
     squeezenet_int8  min =  862.26  max =  890.38  avg =  879.94
           mobilenet  min =  795.29  max =  796.41  avg =  795.80
      mobilenet_int8  min = 1284.16  max = 1298.86  avg = 1290.31
        mobilenet_v2  min =  512.00  max =  512.59  avg =  512.19
        mobilenet_v3  min =  428.55  max =  428.95  avg =  428.76
          shufflenet  min =  198.17  max =  198.83  avg =  198.39
       shufflenet_v2  min =  272.36  max =  272.73  avg =  272.55
             mnasnet  min =  526.92  max =  527.44  avg =  527.12
     proxylessnasnet  min =  601.43  max =  602.65  avg =  602.05
     efficientnet_b0  min =  704.94  max =  705.23  avg =  705.13
   efficientnetv2_b0  min =  854.83  max =  866.51  avg =  859.85
        regnety_400m  min =  526.46  max =  527.04  avg =  526.65
           blazeface  min =   69.74  max =   69.84  avg =   69.80
           googlenet  min = 1230.07  max = 1231.04  avg = 1230.53
      googlenet_int8  min = 2409.25  max = 2423.38  avg = 2416.76
            resnet18  min = 1134.72  max = 1136.35  avg = 1135.44
       resnet18_int8  min = 2431.48  max = 2552.62  avg = 2473.90
             alexnet  min =  692.35  max =  697.08  avg =  695.61
               vgg16  min = 5790.33  max = 5805.37  avg = 5796.20
          vgg16_int8  min = 34057.43  max = 35714.99  avg = 35080.62
            resnet50  min = 3426.54  max = 3429.97  avg = 3427.94
       resnet50_int8  min = 7370.03  max = 7409.63  avg = 7390.83
      squeezenet_ssd  min = 1057.50  max = 1061.42  avg = 1059.26
 squeezenet_ssd_int8  min = 1454.99  max = 1469.47  avg = 1462.61
       mobilenet_ssd  min = 1670.02  max = 1673.22  avg = 1671.34
  mobilenet_ssd_int8  min = 4372.23  max = 4424.18  avg = 4400.11
      mobilenet_yolo  min = 3794.02  max = 3796.52  avg = 3795.21
  mobilenetv2_yolov3  min = 1841.86  max = 1844.70  avg = 1843.49
         yolov4-tiny  min = 2099.86  max = 2104.18  avg = 2102.34
           nanodet_m  min =  646.19  max =  647.41  avg =  646.69
    yolo-fastest-1.1  min =  322.08  max =  323.71  avg =  323.22
      yolo-fastestv2  min =  209.42  max =  209.72  avg =  209.56
  vision_transformer  min = 26499.86  max = 26548.73  avg = 26528.54
          FastestDet  min =  251.68  max =  252.52  avg =  252.14
```

### Arduino UNO Q - QRB2210 (ARM Cortex-A53 @ 2.0GHz x 4)
```
arduino@noivis-uno-q:~/ncnn/benchmark$ ../build/benchmark/benchncnn 10 4 0 -1 -1
loop_count = 10
num_threads = 4
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   35.57  max =  111.57  avg =   43.99
     squeezenet_int8  min =   31.61  max =   32.34  avg =   31.91
           mobilenet  min =   47.82  max =  133.12  avg =   56.77
      mobilenet_int8  min =   33.96  max =  102.49  avg =   44.91
        mobilenet_v2  min =   42.62  max =  119.38  avg =   51.88
        mobilenet_v3  min =   34.53  max =   35.91  avg =   35.27
          shufflenet  min =   26.18  max =   26.47  avg =   26.32
       shufflenet_v2  min =   22.02  max =   88.82  avg =   30.98
             mnasnet  min =   38.96  max =   92.30  avg =   50.98
     proxylessnasnet  min =   47.04  max =  137.34  avg =   56.91
     efficientnet_b0  min =   58.75  max =  141.67  avg =   76.36
   efficientnetv2_b0  min =   79.72  max =  175.06  avg =   99.54
        regnety_400m  min =   65.97  max =  184.19  avg =   96.94
           blazeface  min =    6.43  max =    7.84  avg =    6.76
           googlenet  min =  105.37  max =  197.46  avg =  130.49
      googlenet_int8  min =   89.68  max =  179.01  avg =  107.28
            resnet18  min =   86.52  max =  166.67  avg =  102.49
       resnet18_int8  min =   57.96  max =  107.52  avg =   66.63
             alexnet  min =   56.77  max =  127.20  avg =   67.50
               vgg16  min =  463.45  max =  557.00  avg =  511.24
          vgg16_int8  min =  323.15  max =  415.10  avg =  367.00
            resnet50  min =  219.89  max =  298.83  avg =  250.55
       resnet50_int8  min =  177.14  max =  261.74  avg =  208.69
      squeezenet_ssd  min =   96.95  max =  195.33  avg =  123.10
 squeezenet_ssd_int8  min =   79.66  max =  179.98  avg =   97.71
       mobilenet_ssd  min =  100.40  max =  191.42  avg =  119.07
  mobilenet_ssd_int8  min =   71.88  max =  173.69  avg =   92.27
      mobilenet_yolo  min =  216.49  max =  301.24  avg =  248.78
  mobilenetv2_yolov3  min =  154.69  max =  245.76  avg =  179.31
         yolov4-tiny  min =  191.17  max =  261.76  avg =  218.64
           nanodet_m  min =   57.66  max =  113.14  avg =   67.66
    yolo-fastest-1.1  min =   34.72  max =  131.85  avg =   49.81
      yolo-fastestv2  min =   26.91  max =   28.23  avg =   27.46
  vision_transformer  min = 2529.77  max = 2703.20  avg = 2601.17
          FastestDet  min =   28.09  max =   29.11  avg =   28.48

arduino@noivis-uno-q:~/ncnn/benchmark$ ../build/benchmark/benchncnn 10 1 0 -1 -1
loop_count = 10
num_threads = 1
powersave = 0
gpu_device = -1
cooling_down = 1
          squeezenet  min =   94.15  max =  111.95  avg =   99.15
     squeezenet_int8  min =   78.23  max =   86.76  avg =   80.23
           mobilenet  min =  146.45  max =  165.20  avg =  153.61
      mobilenet_int8  min =  123.70  max =  133.75  avg =  126.28
        mobilenet_v2  min =   99.85  max =  108.01  avg =  103.90
        mobilenet_v3  min =   93.31  max =  102.90  avg =   96.41
          shufflenet  min =   61.80  max =   79.39  avg =   65.28
       shufflenet_v2  min =   47.57  max =   56.28  avg =   49.89
             mnasnet  min =  106.41  max =  119.18  avg =  109.83
     proxylessnasnet  min =  143.93  max =  164.33  avg =  151.37
     efficientnet_b0  min =  164.14  max =  173.38  avg =  167.91
   efficientnetv2_b0  min =  206.05  max =  225.26  avg =  211.93
        regnety_400m  min =  133.84  max =  144.94  avg =  137.26
           blazeface  min =   13.90  max =   14.97  avg =   14.25
           googlenet  min =  337.11  max =  364.05  avg =  347.30
      googlenet_int8  min =  281.64  max =  293.46  avg =  288.34
            resnet18  min =  276.23  max =  304.36  avg =  289.94
       resnet18_int8  min =  190.11  max =  217.07  avg =  199.87
             alexnet  min =  196.14  max =  203.26  avg =  198.63
               vgg16  min = 1391.13  max = 1626.54  avg = 1502.86
          vgg16_int8  min = 1128.65  max = 1290.60  avg = 1200.60
            resnet50  min =  739.44  max =  774.68  avg =  750.76
       resnet50_int8  min =  591.32  max =  612.44  avg =  603.38
      squeezenet_ssd  min =  245.57  max =  280.32  avg =  262.18
 squeezenet_ssd_int8  min =  182.86  max =  228.61  avg =  199.68
       mobilenet_ssd  min =  308.26  max =  320.81  avg =  314.58
  mobilenet_ssd_int8  min =  246.33  max =  265.22  avg =  253.05
      mobilenet_yolo  min =  682.76  max =  703.99  avg =  696.30
  mobilenetv2_yolov3  min =  346.53  max =  365.76  avg =  355.41
         yolov4-tiny  min =  527.86  max =  558.38  avg =  542.25
           nanodet_m  min =  135.87  max =  153.99  avg =  145.11
    yolo-fastest-1.1  min =   58.92  max =   76.24  avg =   65.08
      yolo-fastestv2  min =   48.54  max =   59.97  avg =   53.21
  vision_transformer  min = 9218.64  max = 10723.27  avg = 10253.49
          FastestDet  min =   51.52  max =   62.65  avg =   55.04
```


================================================
FILE: benchmark/RankCards/CMakeLists.txt
================================================
# Stand-alone build script for the RankCards tool (one executable built from main.cpp).
cmake_minimum_required(VERSION 3.10)

project(RankCards CXX)

# Compile as C++11 and treat that standard as a hard requirement
# (no silent fallback to an older one).
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Output location of the executable.
# NOTE(review): EXECUTABLE_OUTPUT_PATH is a legacy CMake variable and "../" is a
# relative path; presumably it is resolved against the build directory - confirm.
set(EXECUTABLE_OUTPUT_PATH "../")

add_executable(RankCards main.cpp)


================================================
FILE: benchmark/RankCards/README.md
================================================
### Rank the boards.
The table below is generated by RankCards from the timings found in the /ncnn/benchmark/README.md file.<br>
First, the best set of timings is selected for each board.<br>
That set is then compared with a reference set: the ratio is calculated for each model individually, and all ratios are averaged.<br>
Finally, the boards are ranked from fast to slow.<br>
|      | Board | Ratio | 
| :--: | :---- | :---  | 
| 1 | NVIDIA Quadro RTX 8000 (TU102 SM x 72 + Tensor Core x 576) | 0.147 | 
| 2 | nVIDIA RTX2080 of Desktop | 0.15 | 
| 3 | NVIDIA GeForce RTX 3060 Ti of Desktop[2023-10-12] | 0.18 | 
| 4 | nVIDIA RTX2060 of Notebook | 0.198 | 
| 5 | Intel® Core™ i7-13700K of Desktop[2023-10-12] | 0.255 | 
| 6 | AMD Radeon RX 6900 XT of Desktop[2023-10-12] | 0.275 | 
| 7 | NVIDIA RTX3090 (GA102 SM x 82 + Tensor Core 328) | 0.277 | 
| 8 | MediaTek Dimensity 9300 (MT6989) (Cortex-X4 3.25 GHz + 2.85 GHz x 3 + Cortex-A720 2.0 GHz x 4 + Mali-G720-Immortalis MC12) | 0.309 | 
| 9 | MacBook Pro (13-inch, M1, 2020) | 0.346 | 
| 10 | AWS c5.4xlarge Instance | 0.418 | 
| 11 | AMD Ryzen 9 5950X 16-Core of Desktop[2023-10-12] | 0.427 | 
| 12 | Qualcomm SM8550-AB Snapdragon 8 Gen 2 (Kyro 3.20 GHz + 2.8 GHz x 2 + 2.80 GHz x 2 + 2.00 GHz * 3 + Adreno 740) | 0.45 | 
| 13 | AMD Ryzen 5700g (Zen3 3.8 GHz ~ 4.6 GHz x 8) | 0.478 | 
| 14 | HUAWEI KunPeng 920 3211K (x24 cores) | 0.482 | 
| 15 | NVIDIA Jetson AGX Orin (Cortex-A78AE 2.2 GHz x 12 + Ampere@1.3 GHz Tensor Cores 64) | 0.485 | 
| 16 | HUAWEI KunPeng 920 2251K (x8 cores) | 0.54 | 
| 17 | nVIDIA RTX A3000 of Notebook (6GB) | 0.577 | 
| 18 | Intel(R) UHD Graphics 770 of Desktop[2023-10-12] | 0.593 | 
| 19 | OrangePi5, Rockchip RK3588s (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) | 0.642 | 
| 20 | Qualcomm SM8150-AC Snapdragon 855+ (Kyro485 2.96 GHz + 2.42 GHz x 3 + 1.80 GHz x 4 + Adreno 640) | 0.665 | 
| 21 | Rockchip RK3588 (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) | 0.753 | 
| 22 | NVIDIA Jetson Orin Nano | 0.819 | 
| 23 | Raspberry Pi 5 Broadcom BCM2712, Cortex-A76 (ARMv8) (2.4GHz x 4) | 1 | 
| 24 | Station-M3/ROC-RK3588S-PC, Rockchip RK3588S (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz + Mali-G610) StationOS (Android) | 1 | 
| 25 | NVIDIA Jetson AGX Xavier (Carmel 2.2 GHz x 8 + Volta Tensor Cores 64) | 1.05 | 
| 26 | Loongson 3A6000 (LA664 2.5GHz * 4+4) | 1.11 | 
| 27 | Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-11800H, NVIDIA GeForce RTX 3070 Laptop GPU) | 1.19 | 
| 28 | Rockchip RK3588 (Cortex-A76 2.4GHz x 4 + Cortex-A55 1.8GHz x 4) | 1.35 | 
| 29 | NVIDIA Jetson TX2 NX(NV-Denver2 2.0Ghz x 2 +  Cortex-A57 2.0Ghz x 4 + 256-core NVIDIA Pascal iGPU) | 1.59 | 
| 30 | Hyper-V Linux Guest with GPU-PV enabled (Intel Core i7-7700K, NVIDIA GeForce GTX 1050 Ti) | 1.66 | 
| 31 | Phytium FT-2000+/64 (FTC662 armv8 2.4GHz x 8) | 1.75 | 
| 32 | AMD Ryzen Threadripper 3970X (Zen2 3.7 GHz ~ 4.5 GHz x 32) | 2.19 | 
| 33 | AMD Ryzen Embedded V1605B (Zen 2.0 GHz ~ 3.6 GHz x 4 + Radeon Vega 8 1.1GHz 8CU) | 2.23 | 
| 34 | Avaota Aim T527, Allwinner T527 (Cortex-A55 2.2GHz x 4 + Cortex-A55 1.8GHz x 4) | 2.28 | 
| 35 | Loongson 3A5000 (LA464 2.5GHz * 4) | 2.31 | 
| 36 | Qualcomm MSM8996 Pro Snapdragon 821 (Kyro 2.35GHz x 2 + Kyro 2.19GHz x 2) | 2.37 | 
| 37 | NVIDIA Jetson Nano | 2.44 | 
| 38 | Intel Celeron N5105 | 2.8 | 
| 39 | Loongson 3A4000 (GS464V 1.8GHz * 4 with MSA128) | 3.24 | 
| 40 | Khadas VIM3, Amlogic A311D (Cortex-A73 2.2GHz x 4 + Cortex-A53 1.8GHz x 2) | 3.48 | 
| 41 | Kirin 970 (Cortex-A73 2.4GHz x 4 + Cortex-A53 1.8GHz x 4) | 3.58 | 
| 42 | Qualcomm MSM8998 Snapdragon 835 (Kyro 2.45GHz x 4 + Kyro 1.9GHz x 4 + Adreno 540) | 3.63 | 
| 43 | MacBook Pro (15-inch, 2019) - 2.6GHz six cores Intel Core i7 && Radeon Pro 555X 4GB && Intel UHD Graphics 630 1536MB | 3.75 | 
| 44 | Qualcomm MSM6150 Snapdragon 675 (Kyro460 2.0GHz x 2 + Kyro460 1.7GHz x 6 + Adreno 612) | 3.75 | 
| 45 | Qualcomm MSM8994 Snapdragon 810 (Cortex-A57 2.0GHz x 4 + Cortex-A53 1.55GHz x 4) | 3.82 | 
| 46 | Station P2, Rockchip RK3568 (Cortex-A55 2.0GHz x 4) | 3.85 | 
| 47 | Rock3A, Rockchip RK3568 (Cortex-A55 2.0GHz x 4) ubuntu 20.04 | 3.86 | 
| 48 | Loongson 3A4000 (GS464V 1.8GHz * 4 with MSA128) | 4.08 | 
| 49 | Radxa Zero 3W, Cortex-A55 (ARMv82) (1.416 GHz x 4) | 4.5 | 
| 50 | Raspberry Pi 4 Model B Broadcom BCM2711B0, Cortex-A72 (ARMv8) (1.8GHz x 4) | 4.95 | 
| 51 | OrangePi4 LTS, Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4) | 5.11 | 
| 52 | Rockchip RK3399 (Cortex-A72 1.8GHz x 2 + Cortex-A53 1.5GHz x 4) | 5.16 | 
| 53 | PhytiumPi, Phytium E2000 (FTC664@1.8GHz x2 + FTC310@1.5GHz x2) | 5.16 | 
| 54 | Qualcomm SDM660 Snapdragon 660 (Kyro260 2.2GHz x 4 + Kyro260 1.84GHz x 4 + Adreno 512) | 5.26 | 
| 55 | Phytium FT-2000/4 (FTC663 armv8 2.2GHz x 4) | 5.27 | 
| 56 | RDK X3 Module (Cortex-A53 1.5GHz x 4) aarch64 | 5.88 | 
| 57 | Station-M2/ROC-RK3566-PC, Rockchip RK3566 (Cortex-A55 1.8GHz x 4 + Mali-G52) StationOS (Android) | 6.51 | 
| 58 | Rockchip RK3288-CG.W (Cortex-A17 1.8GHz x 4) | 6.66 | 
| 59 | Qualcomm MSM8916 Snapdragon 410 (Cortex-A53 1.2GHz x 4) | 7.63 | 
| 60 | NanoPi R2S, Rockchip RK3328 (Cortex-A53 1.3GHz x 4) Armbian focal (21.05.1) aarch64 | 7.66 | 
| 61 | Intel Atom x5-Z8350 | 7.74 | 
| 62 | Loongson 2K2000 (LA364 1.5GHz * 2 with lsx) | 8.23 | 
| 63 | EAIDK 310, Rockchip RK3228H (Cortex-A53 1.3GHz x 4) fedora-28 aarch64 | 8.34 | 
| 64 | OrangePi Zero 2, Allwinner H616 (Cortex-A53 1.5GHz x 4) | 9.51 | 
| 65 | Raspberry Pi 3 Model B+ Broadcom BCM2837B0, Cortex-A53 (ARMv8) (1.4GHz x 4) | 9.87 | 
| 66 | iPhone 5S (Apple A7 1.3GHz x 2) | 11 | 
| 67 | MYIR RemiPi,Renesas RZG2L(Cortex-A55 1.5GHz x 2) | 11.9 | 
| 68 | Raspberry Pi 5 Broadcom BCM2712, VideoCore VII Graphics (Vulkan 1.2) | 12.5 | 
| 69 | Raspberry Pi Zero 2 W Broadcom BCM2710A1, Cortex-A53 (ARMv8) (1.0GHz x 4) | 13.7 | 
| 70 | Xeon Phi 3120A (1.10 GHz 57-core 228-thread) | 15.1 | 
| 71 | Loongson 3A3000 (GS464E 1.45GHz * 4) | 16.3 | 
| 72 | AXERA AX620A (Cortex-A7 1.0GHz * 4) | 18.8 | 
| 73 | Loongson 2K1000LA (LA264 1.0GHz * 2) | 24.4 | 
| 74 | Loongson 2K1000 (GS264 1.0GHz x 2) | 24.8 | 
| 75 | Freescale i.MX7 Dual (Cortex A7 1.0GHz x 2) | 26.7 | 
| 76 | Banana Pi M2 Zero 2 AllWinner H2+, Cortex-A7 (ARMv7-A) (1.2GHz x 4) | 26.8 | 
| 77 | HiSilicon Hi3519V101 (Cortex-A17 1.2GHz x 1) | 36.2 | 
| 78 | Sunway SW831 (sw_64 2.5GHz * 8) | 40.7 | 
| 79 | Z7-Lite 7020 XC7Z020CLG400-2 (Cortex-A9 766MHz x 2) | 43.2 | 
| 80 | Intel Celeron M 420 (Yonah 1.60 GHz x 1) | 43.9 | 
| 81 | Amlogic S805 (Cortex-A5, 4 × 1.536GHz) | 45.9 | 
| 82 | VisionFive2 , JH7110 (SiFive-U74(RV64GC) 1.5GHz x 4) riscv64 with PowerVR B-Series BXE-4-32 | 72.4 | 
| 83 | T-Head TH1520 (C910V, 1.848 GHz x 4 + BXM-4-64 PowerVR) | 83.3 | 
| 84 | Sunway SW421 (sw_64 1.7GHz * 4) | 116 | 
| 85 | Ingenic T40XP Xburst2 Core X2 1.4Ghz (without MSA) | 165 | 


================================================
FILE: benchmark/RankCards/Rcards.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause
#ifndef RCARDS_H
#define RCARDS_H

#include <algorithm>
#include <array>
#include <cctype>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <deque>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <istream>
#include <list>
#include <memory>
#include <sstream>
#include <string>
#include <thread>
#include <vector>

#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

//---------------------------------------------------------------------------
// Global hardcoded parameters
//---------------------------------------------------------------------------
// MIN(a,b)      = the smaller of 'a' and 'b'
// MAX(a,b)      = the larger of 'a' and 'b'
// LIM(a,b,c)    = 'a' clamped to the range [b, c]
// LERP(a,b,c)   = linear interpolation macro, is 'a' when c == 0.0 and 'b' when c == 1.0
// ROUND(a)      = 'a' + 0.5 converted to int; the conversion truncates toward zero,
//                 so this rounds to nearest only for non-negative 'a'
// EUCLIDEAN(..) = straight-line distance between the points (x1,y1) and (x2,y2)
// NOTE: these are function-like macros, an argument may be evaluated more than once.
#define MIN(a, b)                 ((a) > (b) ? (b) : (a))
#define MAX(a, b)                 ((a) < (b) ? (b) : (a))
#define LIM(a, b, c)              (((a) > (c)) ? (c) : ((a) < (b)) ? (b) : (a))
#define LERP(a, b, c)             (((b) - (a)) * (c) + (a))
#define ROUND(a)                  (static_cast<int>((a) + 0.5))
#define EUCLIDEAN(x1, y1, x2, y2) sqrt(((x1) - (x2)) * ((x1) - (x2)) + ((y1) - (y2)) * ((y1) - (y2)))
//---------------------------------------------------------------------------
// One benchmark entry: a model name together with its average timing.
struct TModel
{
    // Model name (the text found in front of "min =" on a result line).
    std::string Name;
    // Average timing; stays 0 until a timing has been stored.
    float AvrTime = 0.0f;
};
//---------------------------------------------------------------------------
struct TModelSet
{
    std::vector<TModel> Mset;

    //use push_back to prevent <brace-enclosed initializer list> issues with CMake
    inline TModelSet(void)
    {
        TModel model;
        model.Name = "squeezenet";
        Mset.push_back(model);
        model.Name = "squeezenet_int8";
        Mset.push_back(model);
        model.Name = "mobilenet";
        Mset.push_back(model);
        model.Name = "mobilenet_int8";
        Mset.push_back(model);
        model.Name = "mobilenet_v2";
        Mset.push_back(model);
        model.Name = "mobilenet_v3";
        Mset.push_back(model);
        model.Name = "shufflenet";
        Mset.push_back(model);
        model.Name = "shufflenet_v2";
        Mset.push_back(model);
        model.Name = "mnasnet";
        Mset.push_back(model);
        model.Name = "proxylessnasnet";
        Mset.push_back(model);
        model.Name = "efficientnet_b0";
        Mset.push_back(model);
        model.Name = "efficientnetv2_b0";
        Mset.push_back(model);
        model.Name = "regnety_400m";
        Mset.push_back(model);
        model.Name = "blazeface";
        Mset.push_back(model);
        model.Name = "googlenet";
        Mset.push_back(model);
        model.Name = "googlenet_int8";
        Mset.push_back(model);
        model.Name = "resnet18";
        Mset.push_back(model);
        model.Name = "resnet18_int8";
        Mset.push_back(model);
        model.Name = "alexnet";
        Mset.push_back(model);
        model.Name = "vgg16";
        Mset.push_back(model);
        model.Name = "vgg16_int8";
        Mset.push_back(model);
        model.Name = "resnet50";
        Mset.push_back(model);
        model.Name = "resnet50_int8";
        Mset.push_back(model);
        model.Name = "squeezenet_ssd";
        Mset.push_back(model);
        model.Name = "squeezenet_ssd_int8";
        Mset.push_back(model);
        model.Name = "mobilenet_ssd";
        Mset.push_back(model);
        model.Name = "mobilenet_ssd_int8";
        Mset.push_back(model);
        model.Name = "mobilenet_yolo";
        Mset.push_back(model);
        model.Name = "mobilenetv2_yolov3";
        Mset.push_back(model);
        model.Name = "yolov4-tiny";
        Mset.push_back(model);
        model.Name = "nanodet_m";
        Mset.push_back(model);
        model.Name = "yolo-fastest-1.1";
        Mset.push_back(model);
        model.Name = "yolo-fastestv2";
        Mset.push_back(model);
        model.Name = "vision_transformer";
        Mset.push_back(model);
        model.Name = "FastestDet";
        Mset.push_back(model);
    }

    void Store(const TModel& model)
    {
        for (size_t i = 0; i < Mset.size(); i++)
        {
            if (Mset[i].Name == model.Name)
            {
                Mset[i].AvrTime = model.AvrTime;
                break;
            }
        }
    }

    float Sum(void)
    {
        float t = 0;

        for (size_t i = 0; i < Mset.size(); i++) t += Mset[i].AvrTime;

        return t;
    }

    float Ratio(const TModelSet& Rset)
    {
        float w;
        float s = 0;
        float t = 0;

        for (size_t r = 0; r < Rset.Mset.size(); r++)
        {
            if (Rset.Mset[r].AvrTime > 0.0)
            {
                for (size_t i = 0; i < Mset.size(); i++)
                {
                    if (Mset[i].AvrTime > 0.0)
                    {
                        if (Mset[i].Name == Rset.Mset[r].Name)
                        {
                            w = log(Rset.Mset[r].AvrTime);
                            s += w * (Mset[i].AvrTime / Rset.Mset[r].AvrTime);
                            t += w;
                        }
                    }
                }
            }
        }
        if (t > 0) s /= t;
        return s;
    }
};
//---------------------------------------------------------------------------
// One board section of the benchmark README.
// All scalar members have default values, so a default-constructed TBoard
// can be copied (e.g. pushed into a vector) before every field is filled in.
struct TBoard
{
    // Heading text of the section (the part after the leading "### ").
    std::string Name;
    // Index of the first line after the heading.
    size_t StartLine = 0;
    // One past the last line of the section.
    size_t EndLine = 0;
    // Benchmark sets collected for this board.
    std::vector<TModelSet> BenchSet;
    // NOTE(review): presumably the index into BenchSet of the selected set - confirm.
    int BestSet = 0;
    // Value the boards are ordered by.
    float Ratio = 0.0f;
};
//---------------------------------------------------------------------------
// Returns true when stat() succeeds for the given path, false otherwise.
inline bool FileExists(const std::string& name)
{
    struct stat info;
    const int rc = stat(name.c_str(), &info);
    return rc == 0;
}
//---------------------------------------------------------------------------
// Copies the content of file Src to file Dst (binary mode).
// Dst is created or overwritten. If Src cannot be opened nothing is done,
// so an existing Dst is not truncated by a failed copy.
inline void FileCopy(const std::string& Src, const std::string& Dst)
{
    std::ifstream src(Src, std::ios::binary);
    // Check the source before the destination is opened: opening an
    // ofstream truncates the file, which would destroy Dst for nothing.
    if (!src.is_open()) return;

    std::ofstream dst(Dst, std::ios::binary);
    if (!dst.is_open()) return;

    dst << src.rdbuf();
}
//---------------------------------------------------------------------------
// to lower case (in place): every character of s is replaced by its
// std::tolower() counterpart
static inline void lcase(std::string& s)
{
    for (std::string::iterator it = s.begin(); it != s.end(); ++it)
    {
        // go through unsigned char so std::tolower never receives a negative value
        const unsigned char c = static_cast<unsigned char>(*it);
        *it = std::tolower(c);
    }
}
//---------------------------------------------------------------------------
// to lower case (copying): the argument is taken by value, lower-cased with
// lcase() and returned, so the caller's string is left untouched
static inline std::string lcase_copy(std::string s)
{
    lcase(s);
    return s;
}
//---------------------------------------------------------------------------
// trim from start (in place): erases every leading character for which
// std::isspace() is true; a string of only whitespace becomes empty
static inline void ltrim(std::string& s)
{
    // The lambda takes unsigned char: std::isspace has undefined behaviour for
    // negative values, which a plain (signed) char holds for non-ASCII bytes.
    s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
        return !std::isspace(ch);
    }));
}
//---------------------------------------------------------------------------
// trim from end (in place): erases every trailing character for which
// std::isspace() is true; a string of only whitespace becomes empty
static inline void rtrim(std::string& s)
{
    // The lambda takes unsigned char: std::isspace has undefined behaviour for
    // negative values, which a plain (signed) char holds for non-ASCII bytes.
    s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
        return !std::isspace(ch);
    }).base(),
    s.end());
}
//---------------------------------------------------------------------------
// trim from both ends (in place): removes the leading whitespace with
// ltrim() and then the trailing whitespace with rtrim()
static inline void trim(std::string& s)
{
    ltrim(s);
    rtrim(s);
}
//---------------------------------------------------------------------------
// trim from start (copying): works on the by-value copy of the argument and
// returns it; the caller's string is left untouched
static inline std::string ltrim_copy(std::string s)
{
    ltrim(s);
    return s;
}
//---------------------------------------------------------------------------
// trim from end (copying): works on the by-value copy of the argument and
// returns it; the caller's string is left untouched
static inline std::string rtrim_copy(std::string s)
{
    rtrim(s);
    return s;
}
//---------------------------------------------------------------------------
// trim from both ends (copying): works on the by-value copy of the argument
// and returns it; the caller's string is left untouched
static inline std::string trim_copy(std::string s)
{
    trim(s);
    return s;
}
//---------------------------------------------------------------------------
// Parses one benchmark result line into model.
//
// line example: squeezenet  min =   46.28  max =   46.91  avg =   46.65
//
// model.Name    : trimmed text in front of "min =", or "" when the line
//                 contains no "min =".
// model.AvrTime : number that follows "avg =", or 0.0 when "avg =" is
//                 missing or the number cannot be converted.
// Both fields are always overwritten, so a TModel that is reused for many
// lines never keeps a value from an earlier line.
static inline void GetNameAver(std::string line, TModel& model)
{
    model.Name = "";
    model.AvrTime = 0.0;

    const size_t minPos = line.find("min =");
    if (minPos == std::string::npos) return; // not a result line

    model.Name = trim_copy(line.substr(0, minPos));

    const size_t avgPos = line.find("avg =");
    if (avgPos == std::string::npos) return; // name only, no average

    try
    {
        // skip the 5 characters of "avg =" and convert the rest of the line
        model.AvrTime = std::stof(trim_copy(line.substr(avgPos + 5)));
    }
    catch (...)
    {
        // conversion failed (no number / out of range): report "no timing"
        // instead of leaving whatever the caller's model held before
        model.AvrTime = 0.0;
    }
}
//---------------------------------------------------------------------------
#endif // RCARDS_H


================================================
FILE: benchmark/RankCards/main.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include <iostream>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <cfloat>
#include "Rcards.h"
//---------------------------------------------------------------------------
using namespace std;
//---------------------------------------------------------------------------
#define REF_BOARD "Raspberry Pi 5 Broadcom BCM2712, Cortex-A76 (ARMv8)"
//---------------------------------------------------------------------------
// Define a custom comparator function for sorting based on Ratio
// Returns true when board a has a strictly smaller Ratio than board b
// (a "less than" predicate, i.e. ascending order of Ratio).
bool compareByRatio(const TBoard& a, const TBoard& b)
{
    return a.Ratio < b.Ratio;
}
//---------------------------------------------------------------------------
int main(int argc, char** argv)
{
    size_t i, t, n, r;
    int RefBoard;
    float f, x;
    string Line;
    TModel Model;
    vector<string> Lines;  // Vector to store strings
    vector<TBoard> Boards; // Vector to store boards
    ifstream inputFile;

    // Check existence of the ../README.md file
    inputFile.open("../README.md");
    if (!inputFile.is_open())
    {
        if (argc != 2)
        {
            fprintf(stderr, "Usage: ./RankCards <your README.md> \n");
            return -1;
        }
        const char* imagepath = argv[1];
        // Open the file given as argument
        inputFile.open(imagepath);
        // Check if the file is open
        if (!inputFile.is_open())
        {
            cerr << "Error opening file" << endl;
            return 1; // Return an error code
        }
    }

    // Read each Line from the file and add it to the vector
    while (std::getline(inputFile, Line))
    {
        Lines.push_back(Line);
    }
    // Close the file
    inputFile.close();

    // Get the boards.
    for (i = 0; i < Lines.size(); i++)
    {
        TBoard Brd;
        if (Lines[i].find("###") != string::npos)
        {
            Brd.Name = Lines[i].substr(4, Lines[i].length() - 4);
            Brd.StartLine = i + 1;
            Boards.push_back(Brd);
        }
    }
    // Get the boards end Line.
    for (t = 0; t < Boards.size() - 1; t++)
    {
        Boards[t].EndLine = Boards[t + 1].StartLine;
    }
    Boards[t].EndLine = Lines.size();

    // Get the bench sets (must always start with squeezenet)
    for (t = 0; t < Boards.size(); t++)
    {
        TModelSet MdSet;
        bool FirstSet = true;
        for (n = Boards[t].StartLine; n < Boards[t].EndLine; n++)
        {
            GetNameAver(Lines[n], Model);
            MdSet.Store(Model);

            if (Model.Name == "squeezenet")
            {
                //start of new set, check if it is the first set
                if (FirstSet)
                    FirstSet = false;
                else
                    Boards[t].BenchSet.push_back(MdSet);
            }
        }
        Boards[t].BenchSet.push_back(MdSet);
    }

    // Get the total AvrTime of the bench sets and set the lowest as best set
    for (t = 0; t < Boards.size(); t++)
    {
        x = FLT_MAX;
        for (n = 0; n < Boards[t].BenchSet.size(); n++)
        {
            f = Boards[t].BenchSet[n].Sum();
            if (f < x)
            {
                x = f;
                Boards[t].BestSet = n;
            }
        }
    }

    // Get the reference set
    RefBoard = -1;
    for (t = 0; t < Boards.size(); t++)
    {
        if (Boards[t].Name.find(REF_BOARD) != string::npos)
        {
            RefBoard = static_cast<int>(t);
        }
    }
    if (RefBoard == -1)
    {
        cerr << "Error finding reference board :" << endl;
        cerr << REF_BOARD << endl;
        return 1; // Return an error code
    }

    // Get the ratios between the best bench sets and reference
    r = Boards[RefBoard].BestSet;
    for (t = 0; t < Boards.size(); t++)
    {
        n = Boards[t].BestSet;
        Boards[t].Ratio = Boards[t].BenchSet[n].Ratio(Boards[RefBoard].BenchSet[r]);
    }

    // Sort the vector using the custom comparator
    std::sort(Boards.begin(), Boards.end(), compareByRatio);

    // Open an output README.md file
    std::ofstream outputFile("README.md");

    // Check if the file is successfully opened
    if (outputFile.is_open())
    {
        outputFile << "### Rank the boards." << endl;
        outputFile << "The table below is generated by RankCards, using the timings found in the /ncnn/benchmark/README.md file.<br>" << endl;
        outputFile << "First, the best set of timings is selected from each board.<br>" << endl;
        outputFile << "The set is then compared to a reference set by calculating the ratio of each model one by one and averaging all results.<br>" << endl;
        outputFile << "Finally, the boards are ranked from fast to slow.<br>" << endl;
        outputFile << "|      | Board | Ratio | " << endl;
        outputFile << "| :--: | :---- | :---  | " << endl;
        // Write the sorted vector to the file
        for (t = 0; t < Boards.size(); t++)
        {
            outputFile << "| " << t + 1 << " | " << Boards[t].Name << " | " << setprecision(3) << Boards[t].Ratio << " | " << endl;
        }
        // Close the file stream
        outputFile.close();
        cout << "Sorted data has been written to README.md" << endl;
    }
    else
    {
        cerr << "Error opening the file." << endl;
        return 1; // Return an error code
    }

    return 0; // Return success
}
//---------------------------------------------------------------------------


================================================
FILE: benchmark/alexnet.param
================================================
7767517
15 15
Input                    data                     0 1 data -23330=4,3,227,227,3 0=227 1=227 2=3
Convolution              conv1                    1 1 data conv1_relu1 -23330=4,3,55,55,96 0=96 1=11 3=4 5=1 6=34848 9=1
LRN                      norm1                    1 1 conv1_relu1 norm1 -23330=4,3,55,55,96 2=1.000000e-04
Pooling                  pool1                    1 1 norm1 pool1 -23330=4,3,27,27,96 1=3 2=2
ConvolutionDepthWise     conv2                    1 1 pool1 conv2_relu2 -23330=4,3,27,27,256 0=256 1=5 4=2 5=1 6=307200 7=2 9=1
LRN                      norm2                    1 1 conv2_relu2 norm2 -23330=4,3,27,27,256 2=1.000000e-04
Pooling                  pool2                    1 1 norm2 pool2 -23330=4,3,13,13,256 1=3 2=2
Convolution              conv3                    1 1 pool2 conv3_relu3 -23330=4,3,13,13,384 0=384 1=3 4=1 5=1 6=884736 9=1
ConvolutionDepthWise     conv4                    1 1 conv3_relu3 conv4_relu4 -23330=4,3,13,13,384 0=384 1=3 4=1 5=1 6=663552 7=2 9=1
ConvolutionDepthWise     conv5                    1 1 conv4_relu4 conv5_relu5 -23330=4,3,13,13,256 0=256 1=3 4=1 5=1 6=442368 7=2 9=1
Pooling                  pool5                    1 1 conv5_relu5 pool5 -23330=4,3,6,6,256 1=3 2=2
InnerProduct             fc6                      1 1 pool5 fc6_drop6 -23330=4,1,4096,1,1 0=4096 1=1 2=37748736 9=1
InnerProduct             fc7                      1 1 fc6_drop6 fc7_drop7 -23330=4,1,4096,1,1 0=4096 1=1 2=16777216 9=1
InnerProduct             fc8                      1 1 fc7_drop7 fc8 -23330=4,1,1000,1,1 0=1000 1=1 2=4096000
Softmax                  prob                     1 1 fc8 output -23330=4,1,1000,1,1


================================================
FILE: benchmark/benchncnn.cpp
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include <float.h>
#include <stdio.h>
#include <string.h>

#ifdef __EMSCRIPTEN__
#include <emscripten.h>
#endif

#include "benchmark.h"
#include "cpu.h"
#include "datareader.h"
#include "net.h"
#include "gpu.h"

#include "benchncnn_param_data.h"

#ifndef NCNN_SIMPLESTL
#include <vector>
#endif

// Data reader without any backing storage.
// It lets the benchmark load a network from its param description alone:
// every requested weight buffer is handed back zero filled.
class DataReaderFromEmpty : public ncnn::DataReader
{
public:
    // Text scanning always reports 0 and leaves the destination untouched.
    virtual int scan(const char* /*format*/, void* /*p*/) const
    {
        return 0;
    }

    // Zero the first `size` bytes of `buf` and report all of them as read.
    virtual size_t read(void* buf, size_t size) const
    {
        memset(buf, 0, size);

        return size;
    }
};

// Benchmark settings shared by main() and benchmark().
// Number of untimed inferences run before measuring; main() raises it to 10 when a GPU device is selected.
static int g_warmup_loop_count = 8;
// Number of timed inferences per model; overwritten in main() with the loop count taken from argv[1].
static int g_loop_count = 4;
// When true, benchmark() sleeps 10 seconds before running each model; set in main() from argv[5].
static bool g_enable_cooling_down = true;

// CPU allocators handed to ncnn::Option in main(); benchmark() clears them before every model.
static ncnn::UnlockedPoolAllocator g_blob_pool_allocator;
static ncnn::PoolAllocator g_workspace_pool_allocator;

#if NCNN_VULKAN
// Vulkan device and allocators; created in main() only when a GPU device is
// requested and deleted at the end of main(). They stay 0 otherwise.
static ncnn::VulkanDevice* g_vkdev = 0;
static ncnn::VkAllocator* g_blob_vkallocator = 0;
static ncnn::VkAllocator* g_staging_vkallocator = 0;
#endif // NCNN_VULKAN

// Benchmark one model and print its min / max / avg inference time to stderr.
//
// comment          - model name used in the report; when model_param_data is
//                    NULL it is also the path of the param file to load
// _in              - input tensors, one per model input, in model input order
// opt              - ncnn options used for the network
// model_param_data - param description held in memory, or NULL to load the
//                    param file named by comment
//
// Weights are not loaded from disk, DataReaderFromEmpty supplies zeros.
// A model that cannot be loaded is reported and skipped instead of being
// measured as an empty network.
void benchmark(const char* comment, const std::vector<ncnn::Mat>& _in, const ncnn::Option& opt, const char* model_param_data = NULL)
{
    // Skip if int8 model name and using GPU
    if (opt.use_vulkan_compute && strstr(comment, "int8") != NULL)
    {
        // only user supplied models get a message, built-in ones are skipped silently
        if (!model_param_data)
            fprintf(stderr, "%20s  skipped (int8+GPU not supported)\n", comment);
        return;
    }

    g_blob_pool_allocator.clear();
    g_workspace_pool_allocator.clear();

#if NCNN_VULKAN
    if (opt.use_vulkan_compute)
    {
        g_blob_vkallocator->clear();
        g_staging_vkallocator->clear();
    }
#endif // NCNN_VULKAN

    ncnn::Net net;

    net.opt = opt;

#if NCNN_VULKAN
    if (net.opt.use_vulkan_compute)
    {
        net.set_vulkan_device(g_vkdev);
    }
#endif // NCNN_VULKAN

    // both loaders return 0 on success
    int load_ret;
    if (model_param_data)
    {
        load_ret = net.load_param_mem(model_param_data);
    }
    else
    {
        load_ret = net.load_param(comment);
    }
    if (load_ret != 0)
    {
        fprintf(stderr, "%20s  load param failed\n", comment);
        return;
    }

    DataReaderFromEmpty dr;
    if (net.load_model(dr) != 0)
    {
        fprintf(stderr, "%20s  load model failed\n", comment);
        return;
    }

    const std::vector<const char*>& input_names = net.input_names();
    const std::vector<const char*>& output_names = net.output_names();

    // validate before cooling down, there is no point in waiting for an error
    if (input_names.size() > _in.size())
    {
        fprintf(stderr, "input %zu tensors while model has %zu inputs\n", _in.size(), input_names.size());
        return;
    }

    if (g_enable_cooling_down)
    {
        // sleep 10 seconds for cooling down SOC  :(
        ncnn::sleep(10 * 1000);
    }

    // initialize input
    for (size_t j = 0; j < input_names.size(); ++j)
    {
        ncnn::Mat in = _in[j];
        in.fill(0.01f);
    }

    // warm up
    for (int i = 0; i < g_warmup_loop_count; i++)
    {
        ncnn::Extractor ex = net.create_extractor();
        for (size_t j = 0; j < input_names.size(); ++j)
        {
            ncnn::Mat in = _in[j];
            ex.input(input_names[j], in);
        }

        for (size_t j = 0; j < output_names.size(); ++j)
        {
            ncnn::Mat out;
            ex.extract(output_names[j], out);
        }
    }

    double time_min = DBL_MAX;
    double time_max = -DBL_MAX;
    double time_avg = 0;

    for (int i = 0; i < g_loop_count; i++)
    {
        double start = ncnn::get_current_time();
        {
            // the extractor lives in its own scope so that its teardown is part of the measurement
            ncnn::Extractor ex = net.create_extractor();
            for (size_t j = 0; j < input_names.size(); ++j)
            {
                ncnn::Mat in = _in[j];
                ex.input(input_names[j], in);
            }

            for (size_t j = 0; j < output_names.size(); ++j)
            {
                ncnn::Mat out;
                ex.extract(output_names[j], out);
            }
        }

        double end = ncnn::get_current_time();

        double time = end - start;

        time_min = std::min(time_min, time);
        time_max = std::max(time_max, time);
        time_avg += time;
    }

    time_avg /= g_loop_count;

    fprintf(stderr, "%20s  min = %7.2f  max = %7.2f  avg = %7.2f\n", comment, time_min, time_max, time_avg);
}

// Single input convenience overload: wraps `_in` into a one element list
// and forwards everything to the multi input benchmark().
void benchmark(const char* comment, const ncnn::Mat& _in, const ncnn::Option& opt, const char* model_param_data = NULL)
{
    std::vector<ncnn::Mat> single_input;
    single_input.push_back(_in);

    benchmark(comment, single_input, opt, model_param_data);
}

// Print the command line synopsis and the supported key=value options to stderr.
void show_usage()
{
    fprintf(stderr,
            "Usage: benchncnn [loop count] [num threads] [powersave] [gpu device] [cooling down] [(key=value)...]\n"
            "  param=model.param\n"
            "  shape=[227,227,3],...\n");
}

// Turn a shape list such as "[227,227,3],[1,10]" into one ncnn::Mat per
// bracketed group.
//
// s is split in place with strtok(), so the buffer is modified.
// Groups of 1 to 4 integers become a Mat constructed from those values in
// the given order; a group of any other length is reported on stderr and
// produces no Mat. Pieces that do not start with an integer (for example
// the "," between two groups) are ignored.
static std::vector<ncnn::Mat> parse_shape_list(char* s)
{
    std::vector<ncnn::Mat> mats;

    for (char* token = strtok(s, "[]"); token != NULL; token = strtok(NULL, "[]"))
    {
        int value;
        int consumed = 0;

        // a shape has to start with a number
        if (sscanf(token, "%d%n", &value, &consumed) != 1)
            continue;

        // collect the first number and every following ",number"
        std::vector<int> dims;
        const char* cursor = token;
        do
        {
            cursor += consumed;
            dims.push_back(value);
        } while (sscanf(cursor, ",%d%n", &value, &consumed) == 1);

        // shape end
        switch (dims.size())
        {
        case 1:
            mats.push_back(ncnn::Mat(dims[0]));
            break;
        case 2:
            mats.push_back(ncnn::Mat(dims[0], dims[1]));
            break;
        case 3:
            mats.push_back(ncnn::Mat(dims[0], dims[1], dims[2]));
            break;
        case 4:
            mats.push_back(ncnn::Mat(dims[0], dims[1], dims[2], dims[3]));
            break;
        default:
            fprintf(stderr, "unsupported input shape size %zu\n", dims.size());
            break;
        }
    }

    return mats;
}

// benchncnn entry point.
//
// Positional arguments (all optional, see show_usage()):
//   argv[1] loop count, argv[2] num threads, argv[3] powersave,
//   argv[4] gpu device (-1 selects the CPU), argv[5] cooling down (0 disables it)
// followed by key=value pairs:
//   param=<model.param>  benchmark this model instead of the built-in list
//   shape=[w,h,c],...    input shapes, required together with param
//
// Returns 0 on success and -1 on a usage or setup error.
int main(int argc, char** argv)
{
    int loop_count = 4;
    int num_threads = ncnn::get_physical_big_cpu_count();
    int powersave = 2;
    int gpu_device = -1;
    int cooling_down = 1;
    char* model = 0;
    std::vector<ncnn::Mat> inputs;

    for (int i = 1; i < argc; i++)
    {
        if (argv[i][0] == '-' && argv[i][1] == 'h')
        {
            show_usage();
            return -1;
        }

        if (strcmp(argv[i], "--help") == 0)
        {
            show_usage();
            return -1;
        }
    }

    if (argc >= 2)
    {
        loop_count = atoi(argv[1]);
    }
    if (argc >= 3)
    {
        num_threads = atoi(argv[2]);
    }
    if (argc >= 4)
    {
        powersave = atoi(argv[3]);
    }
    if (argc >= 5)
    {
        gpu_device = atoi(argv[4]);
    }
    if (argc >= 6)
    {
        cooling_down = atoi(argv[5]);
    }

    // benchmark() divides the accumulated time by the loop count,
    // zero or negative values would only produce meaningless numbers
    if (loop_count <= 0)
    {
        fprintf(stderr, "invalid loop count %d, it must be positive\n", loop_count);
        show_usage();
        return -1;
    }

    for (int i = 6; i < argc; i++)
    {
        // key=value
        char* kv = argv[i];

        char* eqs = strchr(kv, '=');
        if (eqs == NULL)
        {
            fprintf(stderr, "unrecognized arg %s\n", kv);
            continue;
        }

        // split k v
        eqs[0] = '\0';
        const char* key = kv;
        char* value = eqs + 1;

        if (strcmp(key, "param") == 0)
            model = value;
        else if (strcmp(key, "shape") == 0)
            inputs = parse_shape_list(value);
        else
            fprintf(stderr, "unrecognized key %s\n", key);
    }

    if (model && inputs.empty())
    {
        fprintf(stderr, "input tensor shape empty!\n");
        return -1;
    }

#ifdef __EMSCRIPTEN__
    EM_ASM(
        FS.mkdir('/working');
        FS.mount(NODEFS, {root: '.'}, '/working'););
#endif // __EMSCRIPTEN__

    bool use_vulkan_compute = gpu_device != -1;

    g_enable_cooling_down = cooling_down != 0;

    g_loop_count = loop_count;

    g_blob_pool_allocator.set_size_compare_ratio(0.f);
    g_workspace_pool_allocator.set_size_compare_ratio(0.f);

#if NCNN_VULKAN
    if (use_vulkan_compute)
    {
        g_warmup_loop_count = 10;

        g_vkdev = ncnn::get_gpu_device(gpu_device);
        if (!g_vkdev)
        {
            // unknown device index or no usable vulkan driver
            fprintf(stderr, "failed to get gpu device %d\n", gpu_device);
            return -1;
        }

        g_blob_vkallocator = new ncnn::VkBlobAllocator(g_vkdev);
        g_staging_vkallocator = new ncnn::VkStagingAllocator(g_vkdev);
    }
#else
    if (use_vulkan_compute)
    {
        // benchmark() skips all int8 models when use_vulkan_compute is set,
        // which makes no sense in a build that cannot use the GPU at all
        fprintf(stderr, "gpu device %d requested but this build has no vulkan support, using cpu\n", gpu_device);
        use_vulkan_compute = false;
    }
#endif // NCNN_VULKAN

    ncnn::set_cpu_powersave(powersave);

    ncnn::set_omp_dynamic(0);
    ncnn::set_omp_num_threads(num_threads);

    // default option
    ncnn::Option opt;
    opt.lightmode = true;
    opt.num_threads = num_threads;
    opt.blob_allocator = &g_blob_pool_allocator;
    opt.workspace_allocator = &g_workspace_pool_allocator;
#if NCNN_VULKAN
    opt.blob_vkallocator = g_blob_vkallocator;
    opt.workspace_vkallocator = g_blob_vkallocator;
    opt.staging_vkallocator = g_staging_vkallocator;
#endif // NCNN_VULKAN
    opt.use_winograd_convolution = true;
    opt.use_sgemm_convolution = true;
    opt.use_int8_inference = true;
    opt.use_vulkan_compute = use_vulkan_compute;
    opt.use_fp16_packed = true;
    opt.use_fp16_storage = true;
    opt.use_fp16_arithmetic = true;
    opt.use_int8_storage = true;
    opt.use_int8_arithmetic = true;
    opt.use_packing_layout = true;

    fprintf(stderr, "loop_count = %d\n", g_loop_count);
    fprintf(stderr, "num_threads = %d\n", num_threads);
    fprintf(stderr, "powersave = %d\n", ncnn::get_cpu_powersave());
    fprintf(stderr, "gpu_device = %d\n", gpu_device);
    fprintf(stderr, "cooling_down = %d\n", (int)g_enable_cooling_down);

    if (model != 0)
    {
        // run user defined benchmark
        benchmark(model, inputs, opt);
    }
    else
    {
        // run default cases
        benchmark("squeezenet", ncnn::Mat(227, 227, 3), opt, squeezenet_param_data);

        benchmark("squeezenet_int8", ncnn::Mat(227, 227, 3), opt, squeezenet_int8_param_data);

        benchmark("mobilenet", ncnn::Mat(224, 224, 3), opt, mobilenet_param_data);

        benchmark("mobilenet_int8", ncnn::Mat(224, 224, 3), opt, mobilenet_int8_param_data);

        benchmark("mobilenet_v2", ncnn::Mat(224, 224, 3), opt, mobilenet_v2_param_data);

        // benchmark("mobilenet_v2_int8", ncnn::Mat(224, 224, 3), opt, mobilenet_v2_int8_param_data);

        benchmark("mobilenet_v3", ncnn::Mat(224, 224, 3), opt, mobilenet_v3_param_data);

        benchmark("shufflenet", ncnn::Mat(224, 224, 3), opt, shufflenet_param_data);

        benchmark("shufflenet_v2", ncnn::Mat(224, 224, 3), opt, shufflenet_v2_param_data);

        benchmark("mnasnet", ncnn::Mat(224, 224, 3), opt, mnasnet_param_data);

        benchmark("proxylessnasnet", ncnn::Mat(224, 224, 3), opt, proxylessnasnet_param_data);

        benchmark("efficientnet_b0", ncnn::Mat(224, 224, 3), opt, efficientnet_b0_param_data);

        benchmark("efficientnetv2_b0", ncnn::Mat(224, 224, 3), opt, efficientnetv2_b0_param_data);

        benchmark("regnety_400m", ncnn::Mat(224, 224, 3), opt, regnety_400m_param_data);

        benchmark("blazeface", ncnn::Mat(128, 128, 3), opt, blazeface_param_data);

        benchmark("googlenet", ncnn::Mat(224, 224, 3), opt, googlenet_param_data);

        benchmark("googlenet_int8", ncnn::Mat(224, 224, 3), opt, googlenet_int8_param_data);

        benchmark("resnet18", ncnn::Mat(224, 224, 3), opt, resnet18_param_data);

        benchmark("resnet18_int8", ncnn::Mat(224, 224, 3), opt, resnet18_int8_param_data);

        benchmark("alexnet", ncnn::Mat(227, 227, 3), opt, alexnet_param_data);

        benchmark("vgg16", ncnn::Mat(224, 224, 3), opt, vgg16_param_data);

        benchmark("vgg16_int8", ncnn::Mat(224, 224, 3), opt, vgg16_int8_param_data);

        benchmark("resnet50", ncnn::Mat(224, 224, 3), opt, resnet50_param_data);

        benchmark("resnet50_int8", ncnn::Mat(224, 224, 3), opt, resnet50_int8_param_data);

        benchmark("squeezenet_ssd", ncnn::Mat(300, 300, 3), opt, squeezenet_ssd_param_data);

        benchmark("squeezenet_ssd_int8", ncnn::Mat(300, 300, 3), opt, squeezenet_ssd_int8_param_data);

        benchmark("mobilenet_ssd", ncnn::Mat(300, 300, 3), opt, mobilenet_ssd_param_data);

        benchmark("mobilenet_ssd_int8", ncnn::Mat(300, 300, 3), opt, mobilenet_ssd_int8_param_data);

        benchmark("mobilenet_yolo", ncnn::Mat(416, 416, 3), opt, mobilenet_yolo_param_data);

        benchmark("mobilenetv2_yolov3", ncnn::Mat(352, 352, 3), opt, mobilenetv2_yolov3_param_data);

        benchmark("yolov4-tiny", ncnn::Mat(416, 416, 3), opt, yolov4_tiny_param_data);

        benchmark("nanodet_m", ncnn::Mat(320, 320, 3), opt, nanodet_m_param_data);

        benchmark("yolo-fastest-1.1", ncnn::Mat(320, 320, 3), opt, yolo_fastest_1_1_param_data);

        benchmark("yolo-fastestv2", ncnn::Mat(352, 352, 3), opt, yolo_fastestv2_param_data);

        benchmark("vision_transformer", ncnn::Mat(384, 384, 3), opt, vision_transformer_param_data);

        benchmark("FastestDet", ncnn::Mat(352, 352, 3), opt, FastestDet_param_data);
    }
#if NCNN_VULKAN
    // both pointers are still 0 when no gpu device was selected, deleting them is a no-op then
    delete g_blob_vkallocator;
    delete g_staging_vkallocator;
#endif // NCNN_VULKAN

    return 0;
}


================================================
FILE: benchmark/benchncnn_param_data.h.in
================================================
// Benchncnn Param Data header
//
// This file is auto-generated by cmake, don't edit it.

@param_header_data@


================================================
FILE: benchmark/blazeface.param
================================================
7767517
101 117
Input            data                    0 1 data 0=128 1=128 2=3
Padding          75                       1 1 data 75 0=1 1=2 2=1 3=2 4=0 5=0.000000e+00 7=0 8=0
Convolution      76                       1 1 75 76 0=24 1=5 11=5 2=1 12=1 3=2 13=2 4=0 14=0 15=0 16=0 5=1 6=1800
ReLU             77                       1 1 76 77
Split            splitncnn_0              1 2 77 77_splitncnn_0 77_splitncnn_1
ConvolutionDepthWise 78                       1 1 77_splitncnn_1 78 0=24 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=216 7=24
Convolution      79                       1 1 78 79 0=24 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=576
BinaryOp         80                       2 1 79 77_splitncnn_0 80 0=0
ReLU             81                       1 1 80 81
Split            splitncnn_1              1 2 81 81_splitncnn_0 81_splitncnn_1
Padding          82                       1 1 81_splitncnn_1 82 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=4
ConvolutionDepthWise 83                       1 1 81_splitncnn_0 83 0=24 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=216 7=24
Convolution      84                       1 1 83 84 0=28 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=672
BinaryOp         85                       2 1 84 82 85 0=0
ReLU             86                       1 1 85 86
Split            splitncnn_2              1 2 86 86_splitncnn_0 86_splitncnn_1
Padding          87                       1 1 86_splitncnn_1 87 0=0 1=2 2=0 3=2 4=0 5=0.000000e+00 7=0 8=0
Pooling          88                       1 1 86_splitncnn_0 88 0=0 1=2 11=2 2=2 12=2 3=0 13=0 14=0 15=0 5=1
Padding          89                       1 1 88 89 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=4
ConvolutionDepthWise 90                       1 1 87 90 0=28 1=3 11=3 2=1 12=1 3=2 13=2 4=0 14=0 15=0 16=0 5=1 6=252 7=28
Convolution      91                       1 1 90 91 0=32 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=896
BinaryOp         92                       2 1 91 89 92 0=0
ReLU             93                       1 1 92 93
Split            splitncnn_3              1 2 93 93_splitncnn_0 93_splitncnn_1
Padding          94                       1 1 93_splitncnn_1 94 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=4
ConvolutionDepthWise 95                       1 1 93_splitncnn_0 95 0=32 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=288 7=32
Convolution      96                       1 1 95 96 0=36 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=1152
BinaryOp         97                       2 1 96 94 97 0=0
ReLU             98                       1 1 97 98
Split            splitncnn_4              1 2 98 98_splitncnn_0 98_splitncnn_1
Padding          99                       1 1 98_splitncnn_1 99 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=6
ConvolutionDepthWise 100                      1 1 98_splitncnn_0 100 0=36 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=324 7=36
Convolution      101                      1 1 100 101 0=42 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=1512
BinaryOp         102                      2 1 101 99 102 0=0
ReLU             103                      1 1 102 103
Split            splitncnn_5              1 2 103 103_splitncnn_0 103_splitncnn_1
Padding          104                      1 1 103_splitncnn_1 104 0=0 1=2 2=0 3=2 4=0 5=0.000000e+00 7=0 8=0
Pooling          105                      1 1 103_splitncnn_0 105 0=0 1=2 11=2 2=2 12=2 3=0 13=0 14=0 15=0 5=1
Padding          106                      1 1 105 106 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=6
ConvolutionDepthWise 107                      1 1 104 107 0=42 1=3 11=3 2=1 12=1 3=2 13=2 4=0 14=0 15=0 16=0 5=1 6=378 7=42
Convolution      108                      1 1 107 108 0=48 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=2016
BinaryOp         109                      2 1 108 106 109 0=0
ReLU             110                      1 1 109 110
Split            splitncnn_6              1 2 110 110_splitncnn_0 110_splitncnn_1
Padding          111                      1 1 110_splitncnn_1 111 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=8
ConvolutionDepthWise 112                      1 1 110_splitncnn_0 112 0=48 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=432 7=48
Convolution      113                      1 1 112 113 0=56 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=2688
BinaryOp         114                      2 1 113 111 114 0=0
ReLU             115                      1 1 114 115
Split            splitncnn_7              1 2 115 115_splitncnn_0 115_splitncnn_1
Padding          116                      1 1 115_splitncnn_1 116 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=8
ConvolutionDepthWise 117                      1 1 115_splitncnn_0 117 0=56 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=504 7=56
Convolution      118                      1 1 117 118 0=64 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=3584
BinaryOp         119                      2 1 118 116 119 0=0
ReLU             120                      1 1 119 120
Split            splitncnn_8              1 2 120 120_splitncnn_0 120_splitncnn_1
Padding          121                      1 1 120_splitncnn_1 121 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=8
ConvolutionDepthWise 122                      1 1 120_splitncnn_0 122 0=64 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=576 7=64
Convolution      123                      1 1 122 123 0=72 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=4608
BinaryOp         124                      2 1 123 121 124 0=0
ReLU             125                      1 1 124 125
Split            splitncnn_9              1 2 125 125_splitncnn_0 125_splitncnn_1
Padding          126                      1 1 125_splitncnn_1 126 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=8
ConvolutionDepthWise 127                      1 1 125_splitncnn_0 127 0=72 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=648 7=72
Convolution      128                      1 1 127 128 0=80 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=5760
BinaryOp         129                      2 1 128 126 129 0=0
ReLU             130                      1 1 129 130
Split            splitncnn_10             1 2 130 130_splitncnn_0 130_splitncnn_1
Padding          131                      1 1 130_splitncnn_1 131 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=8
ConvolutionDepthWise 132                      1 1 130_splitncnn_0 132 0=80 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=720 7=80
Convolution      133                      1 1 132 133 0=88 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=7040
BinaryOp         134                      2 1 133 131 134 0=0
ReLU             135                      1 1 134 135
Split            splitncnn_11             1 2 135 135_splitncnn_0 135_splitncnn_1
Padding          136                      1 1 135_splitncnn_1 136 0=0 1=2 2=0 3=2 4=0 5=0.000000e+00 7=0 8=0
Pooling          137                      1 1 135_splitncnn_0 137 0=0 1=2 11=2 2=2 12=2 3=0 13=0 14=0 15=0 5=1
Padding          138                      1 1 137 138 0=0 1=0 2=0 3=0 4=0 5=0.000000e+00 7=0 8=8
ConvolutionDepthWise 139                      1 1 136 139 0=88 1=3 11=3 2=1 12=1 3=2 13=2 4=0 14=0 15=0 16=0 5=1 6=792 7=88
Convolution      140                      1 1 139 140 0=96 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=8448
BinaryOp         141                      2 1 140 138 141 0=0
ReLU             142                      1 1 141 142
Split            splitncnn_12             1 2 142 142_splitncnn_0 142_splitncnn_1
ConvolutionDepthWise 143                      1 1 142_splitncnn_1 143 0=96 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=864 7=96
Convolution      144                      1 1 143 144 0=96 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=9216
BinaryOp         145                      2 1 144 142_splitncnn_0 145 0=0
ReLU             146                      1 1 145 146
Split            splitncnn_13             1 2 146 146_splitncnn_0 146_splitncnn_1
ConvolutionDepthWise 147                      1 1 146_splitncnn_1 147 0=96 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=864 7=96
Convolution      148                      1 1 147 148 0=96 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=9216
BinaryOp         149                      2 1 148 146_splitncnn_0 149 0=0
ReLU             150                      1 1 149 150
Split            splitncnn_14             1 2 150 150_splitncnn_0 150_splitncnn_1
ConvolutionDepthWise 151                      1 1 150_splitncnn_1 151 0=96 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=864 7=96
Convolution      152                      1 1 151 152 0=96 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=9216
BinaryOp         153                      2 1 152 150_splitncnn_0 153 0=0
ReLU             154                      1 1 153 154
Split            splitncnn_15             1 2 154 154_splitncnn_0 154_splitncnn_1
ConvolutionDepthWise 155                      1 1 154_splitncnn_1 155 0=96 1=3 11=3 2=1 12=1 3=1 13=1 4=1 14=1 15=1 16=1 5=1 6=864 7=96
Convolution      156                      1 1 155 156 0=96 1=1 11=1 2=1 12=1 3=1 13=1 4=0 14=0 15=0 16=0 5=1 6=9216
BinaryOp         157                      2 1 156 154_splitncnn_0 157 0=0
ReLU             output                   1 1 157 output


================================================
FILE: benchmark/efficientnet_b0.param
================================================
7767517
200 225
Input                    input.1                  0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
Convolution              Conv_0                   1 1 data 362 -23330=4,3,112,112,32 0=32 1=3 3=2 4=1 5=1 6=864
Swish                    Mul_3                    1 1 362 364 -23330=4,3,112,112,32
ConvolutionDepthWise     Conv_4                   1 1 364 366 -23330=4,3,112,112,32 0=32 1=3 4=1 5=1 6=288 7=32
Swish                    Mul_7                    1 1 366 368 -23330=4,3,112,112,32
Split                    splitncnn_0              1 2 368 368_splitncnn_0 368_splitncnn_1 -23330=8,3,112,112,32,3,112,112,32
Pooling                  GlobalAveragePool_8      1 1 368_splitncnn_1 369 -23330=4,1,32,1,1 0=1 4=1
InnerProduct             Conv_9                   1 1 369 370 -23330=4,1,8,1,1 0=8 1=1 2=256
Swish                    Mul_11                   1 1 370 372 -23330=4,1,8,1,1
Convolution              Conv_12                  1 1 372 374 -23330=4,1,32,1,1 0=32 1=1 5=1 6=256 9=4
BinaryOp                 Mul_14                   2 1 368_splitncnn_0 374 375 -23330=4,3,112,112,32 0=2
Convolution              Conv_15                  1 1 375 377 -23330=4,3,112,112,16 0=16 1=1 5=1 6=512
Convolution              Conv_17                  1 1 377 379 -23330=4,3,112,112,96 0=96 1=1 5=1 6=1536
Swish                    Mul_20                   1 1 379 381 -23330=4,3,112,112,96
ConvolutionDepthWise     Conv_21                  1 1 381 383 -23330=4,3,56,56,96 0=96 1=3 3=2 4=1 5=1 6=864 7=96
Swish                    Mul_24                   1 1 383 385 -23330=4,3,56,56,96
Split                    splitncnn_1              1 2 385 385_splitncnn_0 385_splitncnn_1 -23330=8,3,56,56,96,3,56,56,96
Pooling                  GlobalAveragePool_25     1 1 385_splitncnn_1 386 -23330=4,1,96,1,1 0=1 4=1
InnerProduct             Conv_26                  1 1 386 387 -23330=4,1,4,1,1 0=4 1=1 2=384
Swish                    Mul_28                   1 1 387 389 -23330=4,1,4,1,1
Convolution              Conv_29                  1 1 389 391 -23330=4,1,96,1,1 0=96 1=1 5=1 6=384 9=4
BinaryOp                 Mul_31                   2 1 385_splitncnn_0 391 392 -23330=4,3,56,56,96 0=2
Convolution              Conv_32                  1 1 392 394 -23330=4,3,56,56,24 0=24 1=1 5=1 6=2304
Split                    splitncnn_2              1 2 394 394_splitncnn_0 394_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24
Convolution              Conv_34                  1 1 394_splitncnn_1 396 -23330=4,3,56,56,144 0=144 1=1 5=1 6=3456
Swish                    Mul_37                   1 1 396 398 -23330=4,3,56,56,144
ConvolutionDepthWise     Conv_38                  1 1 398 400 -23330=4,3,56,56,144 0=144 1=3 4=1 5=1 6=1296 7=144
Swish                    Mul_41                   1 1 400 402 -23330=4,3,56,56,144
Split                    splitncnn_3              1 2 402 402_splitncnn_0 402_splitncnn_1 -23330=8,3,56,56,144,3,56,56,144
Pooling                  GlobalAveragePool_42     1 1 402_splitncnn_1 403 -23330=4,1,144,1,1 0=1 4=1
InnerProduct             Conv_43                  1 1 403 404 -23330=4,1,6,1,1 0=6 1=1 2=864
Swish                    Mul_45                   1 1 404 406 -23330=4,1,6,1,1
Convolution              Conv_46                  1 1 406 408 -23330=4,1,144,1,1 0=144 1=1 5=1 6=864 9=4
BinaryOp                 Mul_48                   2 1 402_splitncnn_0 408 409 -23330=4,3,56,56,144 0=2
Convolution              Conv_49                  1 1 409 411 -23330=4,3,56,56,24 0=24 1=1 5=1 6=3456
BinaryOp                 Add_51                   2 1 394_splitncnn_0 411 412 -23330=4,3,56,56,24
Convolution              Conv_52                  1 1 412 414 -23330=4,3,56,56,144 0=144 1=1 5=1 6=3456
Swish                    Mul_55                   1 1 414 416 -23330=4,3,56,56,144
ConvolutionDepthWise     Conv_56                  1 1 416 418 -23330=4,3,28,28,144 0=144 1=5 3=2 4=2 5=1 6=3600 7=144
Swish                    Mul_59                   1 1 418 420 -23330=4,3,28,28,144
Split                    splitncnn_4              1 2 420 420_splitncnn_0 420_splitncnn_1 -23330=8,3,28,28,144,3,28,28,144
Pooling                  GlobalAveragePool_60     1 1 420_splitncnn_1 421 -23330=4,1,144,1,1 0=1 4=1
InnerProduct             Conv_61                  1 1 421 422 -23330=4,1,6,1,1 0=6 1=1 2=864
Swish                    Mul_63                   1 1 422 424 -23330=4,1,6,1,1
Convolution              Conv_64                  1 1 424 426 -23330=4,1,144,1,1 0=144 1=1 5=1 6=864 9=4
BinaryOp                 Mul_66                   2 1 420_splitncnn_0 426 427 -23330=4,3,28,28,144 0=2
Convolution              Conv_67                  1 1 427 429 -23330=4,3,28,28,40 0=40 1=1 5=1 6=5760
Split                    splitncnn_5              1 2 429 429_splitncnn_0 429_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
Convolution              Conv_69                  1 1 429_splitncnn_1 431 -23330=4,3,28,28,240 0=240 1=1 5=1 6=9600
Swish                    Mul_72                   1 1 431 433 -23330=4,3,28,28,240
ConvolutionDepthWise     Conv_73                  1 1 433 435 -23330=4,3,28,28,240 0=240 1=5 4=2 5=1 6=6000 7=240
Swish                    Mul_76                   1 1 435 437 -23330=4,3,28,28,240
Split                    splitncnn_6              1 2 437 437_splitncnn_0 437_splitncnn_1 -23330=8,3,28,28,240,3,28,28,240
Pooling                  GlobalAveragePool_77     1 1 437_splitncnn_1 438 -23330=4,1,240,1,1 0=1 4=1
InnerProduct             Conv_78                  1 1 438 439 -23330=4,1,10,1,1 0=10 1=1 2=2400
Swish                    Mul_80                   1 1 439 441 -23330=4,1,10,1,1
Convolution              Conv_81                  1 1 441 443 -23330=4,1,240,1,1 0=240 1=1 5=1 6=2400 9=4
BinaryOp                 Mul_83                   2 1 437_splitncnn_0 443 444 -23330=4,3,28,28,240 0=2
Convolution              Conv_84                  1 1 444 446 -23330=4,3,28,28,40 0=40 1=1 5=1 6=9600
BinaryOp                 Add_86                   2 1 429_splitncnn_0 446 447 -23330=4,3,28,28,40
Convolution              Conv_87                  1 1 447 449 -23330=4,3,28,28,240 0=240 1=1 5=1 6=9600
Swish                    Mul_90                   1 1 449 451 -23330=4,3,28,28,240
ConvolutionDepthWise     Conv_91                  1 1 451 453 -23330=4,3,14,14,240 0=240 1=3 3=2 4=1 5=1 6=2160 7=240
Swish                    Mul_94                   1 1 453 455 -23330=4,3,14,14,240
Split                    splitncnn_7              1 2 455 455_splitncnn_0 455_splitncnn_1 -23330=8,3,14,14,240,3,14,14,240
Pooling                  GlobalAveragePool_95     1 1 455_splitncnn_1 456 -23330=4,1,240,1,1 0=1 4=1
InnerProduct             Conv_96                  1 1 456 457 -23330=4,1,10,1,1 0=10 1=1 2=2400
Swish                    Mul_98                   1 1 457 459 -23330=4,1,10,1,1
Convolution              Conv_99                  1 1 459 461 -23330=4,1,240,1,1 0=240 1=1 5=1 6=2400 9=4
BinaryOp                 Mul_101                  2 1 455_splitncnn_0 461 462 -23330=4,3,14,14,240 0=2
Convolution              Conv_102                 1 1 462 464 -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200
Split                    splitncnn_8              1 2 464 464_splitncnn_0 464_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
Convolution              Conv_104                 1 1 464_splitncnn_1 466 -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400
Swish                    Mul_107                  1 1 466 468 -23330=4,3,14,14,480
ConvolutionDepthWise     Conv_108                 1 1 468 470 -23330=4,3,14,14,480 0=480 1=3 4=1 5=1 6=4320 7=480
Swish                    Mul_111                  1 1 470 472 -23330=4,3,14,14,480
Split                    splitncnn_9              1 2 472 472_splitncnn_0 472_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
Pooling                  GlobalAveragePool_112    1 1 472_splitncnn_1 473 -23330=4,1,480,1,1 0=1 4=1
InnerProduct             Conv_113                 1 1 473 474 -23330=4,1,20,1,1 0=20 1=1 2=9600
Swish                    Mul_115                  1 1 474 476 -23330=4,1,20,1,1
Convolution              Conv_116                 1 1 476 478 -23330=4,1,480,1,1 0=480 1=1 5=1 6=9600 9=4
BinaryOp                 Mul_118                  2 1 472_splitncnn_0 478 479 -23330=4,3,14,14,480 0=2
Convolution              Conv_119                 1 1 479 481 -23330=4,3,14,14,80 0=80 1=1 5=1 6=38400
BinaryOp                 Add_121                  2 1 464_splitncnn_0 481 482 -23330=4,3,14,14,80
Split                    splitncnn_10             1 2 482 482_splitncnn_0 482_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
Convolution              Conv_122                 1 1 482_splitncnn_1 484 -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400
Swish                    Mul_125                  1 1 484 486 -23330=4,3,14,14,480
ConvolutionDepthWise     Conv_126                 1 1 486 488 -23330=4,3,14,14,480 0=480 1=3 4=1 5=1 6=4320 7=480
Swish                    Mul_129                  1 1 488 490 -23330=4,3,14,14,480
Split                    splitncnn_11             1 2 490 490_splitncnn_0 490_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
Pooling                  GlobalAveragePool_130    1 1 490_splitncnn_1 491 -23330=4,1,480,1,1 0=1 4=1
InnerProduct             Conv_131                 1 1 491 492 -23330=4,1,20,1,1 0=20 1=1 2=9600
Swish                    Mul_133                  1 1 492 494 -23330=4,1,20,1,1
Convolution              Conv_134                 1 1 494 496 -23330=4,1,480,1,1 0=480 1=1 5=1 6=9600 9=4
BinaryOp                 Mul_136                  2 1 490_splitncnn_0 496 497 -23330=4,3,14,14,480 0=2
Convolution              Conv_137                 1 1 497 499 -23330=4,3,14,14,80 0=80 1=1 5=1 6=38400
BinaryOp                 Add_139                  2 1 482_splitncnn_0 499 500 -23330=4,3,14,14,80
Convolution              Conv_140                 1 1 500 502 -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400
Swish                    Mul_143                  1 1 502 504 -23330=4,3,14,14,480
ConvolutionDepthWise     Conv_144                 1 1 504 506 -23330=4,3,14,14,480 0=480 1=5 4=2 5=1 6=12000 7=480
Swish                    Mul_147                  1 1 506 508 -23330=4,3,14,14,480
Split                    splitncnn_12             1 2 508 508_splitncnn_0 508_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
Pooling                  GlobalAveragePool_148    1 1 508_splitncnn_1 509 -23330=4,1,480,1,1 0=1 4=1
InnerProduct             Conv_149                 1 1 509 510 -23330=4,1,20,1,1 0=20 1=1 2=9600
Swish                    Mul_151                  1 1 510 512 -23330=4,1,20,1,1
Convolution              Conv_152                 1 1 512 514 -23330=4,1,480,1,1 0=480 1=1 5=1 6=9600 9=4
BinaryOp                 Mul_154                  2 1 508_splitncnn_0 514 515 -23330=4,3,14,14,480 0=2
Convolution              Conv_155                 1 1 515 517 -23330=4,3,14,14,112 0=112 1=1 5=1 6=53760
Split                    splitncnn_13             1 2 517 517_splitncnn_0 517_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112
Convolution              Conv_157                 1 1 517_splitncnn_1 519 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264
Swish                    Mul_160                  1 1 519 521 -23330=4,3,14,14,672
ConvolutionDepthWise     Conv_161                 1 1 521 523 -23330=4,3,14,14,672 0=672 1=5 4=2 5=1 6=16800 7=672
Swish                    Mul_164                  1 1 523 525 -23330=4,3,14,14,672
Split                    splitncnn_14             1 2 525 525_splitncnn_0 525_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
Pooling                  GlobalAveragePool_165    1 1 525_splitncnn_1 526 -23330=4,1,672,1,1 0=1 4=1
InnerProduct             Conv_166                 1 1 526 527 -23330=4,1,28,1,1 0=28 1=1 2=18816
Swish                    Mul_168                  1 1 527 529 -23330=4,1,28,1,1
Convolution              Conv_169                 1 1 529 531 -23330=4,1,672,1,1 0=672 1=1 5=1 6=18816 9=4
BinaryOp                 Mul_171                  2 1 525_splitncnn_0 531 532 -23330=4,3,14,14,672 0=2
Convolution              Conv_172                 1 1 532 534 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264
BinaryOp                 Add_174                  2 1 517_splitncnn_0 534 535 -23330=4,3,14,14,112
Split                    splitncnn_15             1 2 535 535_splitncnn_0 535_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112
Convolution              Conv_175                 1 1 535_splitncnn_1 537 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264
Swish                    Mul_178                  1 1 537 539 -23330=4,3,14,14,672
ConvolutionDepthWise     Conv_179                 1 1 539 541 -23330=4,3,14,14,672 0=672 1=5 4=2 5=1 6=16800 7=672
Swish                    Mul_182                  1 1 541 543 -23330=4,3,14,14,672
Split                    splitncnn_16             1 2 543 543_splitncnn_0 543_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
Pooling                  GlobalAveragePool_183    1 1 543_splitncnn_1 544 -23330=4,1,672,1,1 0=1 4=1
InnerProduct             Conv_184                 1 1 544 545 -23330=4,1,28,1,1 0=28 1=1 2=18816
Swish                    Mul_186                  1 1 545 547 -23330=4,1,28,1,1
Convolution              Conv_187                 1 1 547 549 -23330=4,1,672,1,1 0=672 1=1 5=1 6=18816 9=4
BinaryOp                 Mul_189                  2 1 543_splitncnn_0 549 550 -23330=4,3,14,14,672 0=2
Convolution              Conv_190                 1 1 550 552 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264
BinaryOp                 Add_192                  2 1 535_splitncnn_0 552 553 -23330=4,3,14,14,112
Convolution              Conv_193                 1 1 553 555 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264
Swish                    Mul_196                  1 1 555 557 -23330=4,3,14,14,672
ConvolutionDepthWise     Conv_197                 1 1 557 559 -23330=4,3,7,7,672 0=672 1=5 3=2 4=2 5=1 6=16800 7=672
Swish                    Mul_200                  1 1 559 561 -23330=4,3,7,7,672
Split                    splitncnn_17             1 2 561 561_splitncnn_0 561_splitncnn_1 -23330=8,3,7,7,672,3,7,7,672
Pooling                  GlobalAveragePool_201    1 1 561_splitncnn_1 562 -23330=4,1,672,1,1 0=1 4=1
InnerProduct             Conv_202                 1 1 562 563 -23330=4,1,28,1,1 0=28 1=1 2=18816
Swish                    Mul_204                  1 1 563 565 -23330=4,1,28,1,1
Convolution              Conv_205                 1 1 565 567 -23330=4,1,672,1,1 0=672 1=1 5=1 6=18816 9=4
BinaryOp                 Mul_207                  2 1 561_splitncnn_0 567 568 -23330=4,3,7,7,672 0=2
Convolution              Conv_208                 1 1 568 570 -23330=4,3,7,7,192 0=192 1=1 5=1 6=129024
Split                    splitncnn_18             1 2 570 570_splitncnn_0 570_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
Convolution              Conv_210                 1 1 570_splitncnn_1 572 -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184
Swish                    Mul_213                  1 1 572 574 -23330=4,3,7,7,1152
ConvolutionDepthWise     Conv_214                 1 1 574 576 -23330=4,3,7,7,1152 0=1152 1=5 4=2 5=1 6=28800 7=1152
Swish                    Mul_217                  1 1 576 578 -23330=4,3,7,7,1152
Split                    splitncnn_19             1 2 578 578_splitncnn_0 578_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152
Pooling                  GlobalAveragePool_218    1 1 578_splitncnn_1 579 -23330=4,1,1152,1,1 0=1 4=1
InnerProduct             Conv_219                 1 1 579 580 -23330=4,1,48,1,1 0=48 1=1 2=55296
Swish                    Mul_221                  1 1 580 582 -23330=4,1,48,1,1
Convolution              Conv_222                 1 1 582 584 -23330=4,1,1152,1,1 0=1152 1=1 5=1 6=55296 9=4
BinaryOp                 Mul_224                  2 1 578_splitncnn_0 584 585 -23330=4,3,7,7,1152 0=2
Convolution              Conv_225                 1 1 585 587 -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184
BinaryOp                 Add_227                  2 1 570_splitncnn_0 587 588 -23330=4,3,7,7,192
Split                    splitncnn_20             1 2 588 588_splitncnn_0 588_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
Convolution              Conv_228                 1 1 588_splitncnn_1 590 -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184
Swish                    Mul_231                  1 1 590 592 -23330=4,3,7,7,1152
ConvolutionDepthWise     Conv_232                 1 1 592 594 -23330=4,3,7,7,1152 0=1152 1=5 4=2 5=1 6=28800 7=1152
Swish                    Mul_235                  1 1 594 596 -23330=4,3,7,7,1152
Split                    splitncnn_21             1 2 596 596_splitncnn_0 596_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152
Pooling                  GlobalAveragePool_236    1 1 596_splitncnn_1 597 -23330=4,1,1152,1,1 0=1 4=1
InnerProduct             Conv_237                 1 1 597 598 -23330=4,1,48,1,1 0=48 1=1 2=55296
Swish                    Mul_239                  1 1 598 600 -23330=4,1,48,1,1
Convolution              Conv_240                 1 1 600 602 -23330=4,1,1152,1,1 0=1152 1=1 5=1 6=55296 9=4
BinaryOp                 Mul_242                  2 1 596_splitncnn_0 602 603 -23330=4,3,7,7,1152 0=2
Convolution              Conv_243                 1 1 603 605 -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184
BinaryOp                 Add_245                  2 1 588_splitncnn_0 605 606 -23330=4,3,7,7,192
Split                    splitncnn_22             1 2 606 606_splitncnn_0 606_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
Convolution              Conv_246                 1 1 606_splitncnn_1 608 -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184
Swish                    Mul_249                  1 1 608 610 -23330=4,3,7,7,1152
ConvolutionDepthWise     Conv_250                 1 1 610 612 -23330=4,3,7,7,1152 0=1152 1=5 4=2 5=1 6=28800 7=1152
Swish                    Mul_253                  1 1 612 614 -23330=4,3,7,7,1152
Split                    splitncnn_23             1 2 614 614_splitncnn_0 614_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152
Pooling                  GlobalAveragePool_254    1 1 614_splitncnn_1 615 -23330=4,1,1152,1,1 0=1 4=1
InnerProduct             Conv_255                 1 1 615 616 -23330=4,1,48,1,1 0=48 1=1 2=55296
Swish                    Mul_257                  1 1 616 618 -23330=4,1,48,1,1
Convolution              Conv_258                 1 1 618 620 -23330=4,1,1152,1,1 0=1152 1=1 5=1 6=55296 9=4
BinaryOp                 Mul_260                  2 1 614_splitncnn_0 620 621 -23330=4,3,7,7,1152 0=2
Convolution              Conv_261                 1 1 621 623 -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184
BinaryOp                 Add_263                  2 1 606_splitncnn_0 623 624 -23330=4,3,7,7,192
Convolution              Conv_264                 1 1 624 626 -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184
Swish                    Mul_267                  1 1 626 628 -23330=4,3,7,7,1152
ConvolutionDepthWise     Conv_268                 1 1 628 630 -23330=4,3,7,7,1152 0=1152 1=3 4=1 5=1 6=10368 7=1152
Swish                    Mul_271                  1 1 630 632 -23330=4,3,7,7,1152
Split                    splitncnn_24             1 2 632 632_splitncnn_0 632_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152
Pooling                  GlobalAveragePool_272    1 1 632_splitncnn_1 633 -23330=4,1,1152,1,1 0=1 4=1
InnerProduct             Conv_273                 1 1 633 634 -23330=4,1,48,1,1 0=48 1=1 2=55296
Swish                    Mul_275                  1 1 634 636 -23330=4,1,48,1,1
Convolution              Conv_276                 1 1 636 638 -23330=4,1,1152,1,1 0=1152 1=1 5=1 6=55296 9=4
BinaryOp                 Mul_278                  2 1 632_splitncnn_0 638 639 -23330=4,3,7,7,1152 0=2
Convolution              Conv_279                 1 1 639 641 -23330=4,3,7,7,320 0=320 1=1 5=1 6=368640
Convolution              Conv_281                 1 1 641 643 -23330=4,3,7,7,1280 0=1280 1=1 5=1 6=409600
Swish                    Mul_284                  1 1 643 645 -23330=4,3,7,7,1280
Pooling                  GlobalAveragePool_285    1 1 645 654 -23330=4,1,1280,1,1 0=1 4=1
InnerProduct             Gemm_292                 1 1 654 655 -23330=4,1,1000,1,1 0=1000 1=1 2=1280000
Softmax                  prob                     1 1 655 output -23330=4,1,1000,1,1


================================================
FILE: benchmark/efficientnetv2_b0.param
================================================
7767517
257 288
MemoryData               110:12                   0 1 110:12 -23330=4,1,112,1,1 0=112
MemoryData               133:12                   0 1 133:12 -23330=4,1,192,1,1 0=192
MemoryData               144:12                   0 1 144:12 -23330=4,1,192,1,1 0=192
MemoryData               14:11                    0 1 14:11 -23330=4,1,32,1,1 0=32
MemoryData               155:12                   0 1 155:12 -23330=4,1,192,1,1 0=192
MemoryData               166:12                   0 1 166:12 -23330=4,1,192,1,1 0=192
MemoryData               177:12                   0 1 177:12 -23330=4,1,192,1,1 0=192
MemoryData               188:12                   0 1 188:12 -23330=4,1,192,1,1 0=192
MemoryData               199:12                   0 1 199:12 -23330=4,1,192,1,1 0=192
MemoryData               22:11                    0 1 22:11 -23330=4,1,48,1,1 0=48
MemoryData               33:11                    0 1 33:11 -23330=4,1,112,1,1 0=112
MemoryData               44:11                    0 1 44:11 -23330=4,1,112,1,1 0=112
MemoryData               55:11                    0 1 55:11 -23330=4,1,112,1,1 0=112
MemoryData               77:11                    0 1 77:11 -23330=4,1,96,1,1 0=96
MemoryData               88:11                    0 1 88:11 -23330=4,1,96,1,1 0=96
Input                    op_201                   0 1 204:12 -23330=4,3,224,224,3 0=224 1=224 2=3
Convolution              op_202                   1 1 204:12 206:12 -23330=4,3,112,112,32 0=32 1=3 3=2 4=-233 5=1 6=864
Swish                    op_203                   1 1 206:12 208:12 -23330=4,3,112,112,32
Convolution              op_204                   1 1 208:12 210:12 -23330=4,3,112,112,16 0=16 1=3 4=-233 5=1 6=4608
Swish                    op_205                   1 1 210:12 212:12_splitncnn_0 -23330=4,3,112,112,16
Convolution              op_207                   1 1 212:12_splitncnn_0 215:12 -23330=4,3,56,56,64 0=64 1=3 3=2 4=-233 5=1 6=9216
Swish                    op_208                   1 1 215:12 217:12 -23330=4,3,56,56,64
Convolution              op_209                   1 1 217:12 219:12 -23330=4,3,56,56,32 0=32 1=1 4=-233 5=1 6=2048
Split                    splitncnn_1              1 2 219:12 219:12_splitncnn_0 219:12_splitncnn_1 -23330=8,3,56,56,32,3,56,56,32
Convolution              op_210                   1 1 219:12_splitncnn_1 221:12 -23330=4,3,56,56,128 0=128 1=3 4=-233 5=1 6=36864
Swish                    op_211                   1 1 221:12 223:12 -23330=4,3,56,56,128
Convolution              op_212                   1 1 223:12 224:12 -23330=4,3,56,56,32 0=32 1=1 4=-233 6=4096
Eltwise                  op_213                   2 1 219:12_splitncnn_0 224:12 225:12 -23330=4,3,56,56,32 0=1
BinaryOp                 op_214                   2 1 225:12 14:11 226:12_splitncnn_0 -23330=4,3,56,56,32
Convolution              op_216                   1 1 226:12_splitncnn_0 229:12 -23330=4,3,28,28,128 0=128 1=3 3=2 4=-233 5=1 6=36864
Swish                    op_217                   1 1 229:12 231:12 -23330=4,3,28,28,128
Convolution              op_218                   1 1 231:12 233:12 -23330=4,3,28,28,48 0=48 1=1 4=-233 5=1 6=6144
Split                    splitncnn_3              1 2 233:12 233:12_splitncnn_0 233:12_splitncnn_1 -23330=8,3,28,28,48,3,28,28,48
Convolution              op_219                   1 1 233:12_splitncnn_1 235:12 -23330=4,3,28,28,192 0=192 1=3 4=-233 5=1 6=82944
Swish                    op_220                   1 1 235:12 237:12 -23330=4,3,28,28,192
Convolution              op_221                   1 1 237:12 238:12 -23330=4,3,28,28,48 0=48 1=1 4=-233 6=9216
Eltwise                  op_222                   2 1 233:12_splitncnn_0 238:12 239:12 -23330=4,3,28,28,48 0=1
BinaryOp                 op_223                   2 1 239:12 22:11 240:12_splitncnn_0 -23330=4,3,28,28,48
Convolution              op_225                   1 1 240:12_splitncnn_0 243:12 -23330=4,3,28,28,192 0=192 1=1 4=-233 5=1 6=9216
Swish                    op_226                   1 1 243:12 245:12 -23330=4,3,28,28,192
ConvolutionDepthWise     op_227                   1 1 245:12 248:12 -23330=4,3,14,14,192 0=192 1=3 3=2 4=-233 5=1 6=1728 7=192
Swish                    op_229                   1 1 248:12 250:12 -23330=4,3,14,14,192
Split                    splitncnn_5              1 2 250:12 250:12_splitncnn_0 250:12_splitncnn_1 -23330=8,3,14,14,192,3,14,14,192
Reduction                op_230                   1 1 250:12_splitncnn_1 251:12 -23330=4,3,1,1,192 0=3 1=0 -23303=2,1,2 4=1 5=1
Convolution              op_231                   1 1 251:12 253:12 -23330=4,3,1,1,12 0=12 1=1 4=-233 5=1 6=2304
Swish                    op_232                   1 1 253:12 255:12 -23330=4,3,1,1,12
Convolution              op_233                   1 1 255:12 258:12 -23330=4,3,1,1,192 0=192 1=1 4=-233 5=1 6=2304 9=4
BinaryOp                 op_235                   2 1 250:12_splitncnn_0 258:12 259:12 -23330=4,3,14,14,192 0=2
Convolution              op_236                   1 1 259:12 261:12 -23330=4,3,14,14,96 0=96 1=1 4=-233 5=1 6=18432
Split                    splitncnn_6              1 2 261:12 261:12_splitncnn_0 261:12_splitncnn_1 -23330=8,3,14,14,96,3,14,14,96
Convolution              op_237                   1 1 261:12_splitncnn_1 263:12 -23330=4,3,14,14,384 0=384 1=1 4=-233 5=1 6=36864
Swish                    op_238                   1 1 263:12 265:12 -23330=4,3,14,14,384
ConvolutionDepthWise     op_239                   1 1 265:12 268:12 -23330=4,3,14,14,384 0=384 1=3 4=-233 5=1 6=3456 7=384
Swish                    op_241                   1 1 268:12 270:12 -23330=4,3,14,14,384
Split                    splitncnn_7              1 2 270:12 270:12_splitncnn_0 270:12_splitncnn_1 -23330=8,3,14,14,384,3,14,14,384
Reduction                op_242                   1 1 270:12_splitncnn_1 271:12 -23330=4,3,1,1,384 0=3 1=0 -23303=2,1,2 4=1 5=1
Convolution              op_243                   1 1 271:12 273:12 -23330=4,3,1,1,24 0=24 1=1 4=-233 5=1 6=9216
Swish                    op_244                   1 1 273:12 275:12 -23330=4,3,1,1,24
Convolution              op_245                   1 1 275:12 278:12 -23330=4,3,1,1,384 0=384 1=1 4=-233 5=1 6=9216 9=4
BinaryOp                 op_247                   2 1 270:12_splitncnn_0 278:12 279:12 -23330=4,3,14,14,384 0=2
Convolution              op_248                   1 1 279:12 280:12 -23330=4,3,14,14,96 0=96 1=1 4=-233 6=36864
Eltwise                  op_249                   2 1 261:12_splitncnn_0 280:12 281:12 -23330=4,3,14,14,96 0=1
BinaryOp                 op_250                   2 1 281:12 77:11 282:12 -23330=4,3,14,14,96
Split                    splitncnn_8              1 2 282:12 282:12_splitncnn_0 282:12_splitncnn_1 -23330=8,3,14,14,96,3,14,14,96
Convolution              op_251                   1 1 282:12_splitncnn_1 284:12 -23330=4,3,14,14,384 0=384 1=1 4=-233 5=1 6=36864
Swish                    op_252                   1 1 284:12 286:12 -23330=4,3,14,14,384
ConvolutionDepthWise     op_253                   1 1 286:12 289:12 -23330=4,3,14,14,384 0=384 1=3 4=-233 5=1 6=3456 7=384
Swish                    op_255                   1 1 289:12 291:12 -23330=4,3,14,14,384
Split                    splitncnn_9              1 2 291:12 291:12_splitncnn_0 291:12_splitncnn_1 -23330=8,3,14,14,384,3,14,14,384
Reduction                op_256                   1 1 291:12_splitncnn_1 292:12 -23330=4,3,1,1,384 0=3 1=0 -23303=2,1,2 4=1 5=1
Convolution              op_257                   1 1 292:12 294:12 -23330=4,3,1,1,24 0=24 1=1 4=-233 5=1 6=9216
Swish                    op_258                   1 1 294:12 296:12 -23330=4,3,1,1,24
Convolution              op_259                   1 1 296:12 299:12 -23330=4,3,1,1,384 0=384 1=1 4=-233 5=1 6=9216 9=4
BinaryOp                 op_261                   2 1 291:12_splitncnn_0 299:12 300:12 -23330=4,3,14,14,384 0=2
Convolution              op_262                   1 1 300:12 301:12 -23330=4,3,14,14,96 0=96 1=1 4=-233 6=36864
Eltwise                  op_263                   2 1 282:12_splitncnn_0 301:12 302:12 -23330=4,3,14,14,96 0=1
BinaryOp                 op_264                   2 1 302:12 88:11 303:12 -23330=4,3,14,14,96
Convolution              op_265                   1 1 303:12 305:12 -23330=4,3,14,14,576 0=576 1=1 4=-233 5=1 6=55296
Swish                    op_266                   1 1 305:12 307:12 -23330=4,3,14,14,576
ConvolutionDepthWise     op_267                   1 1 307:12 310:12 -23330=4,3,14,14,576 0=576 1=3 4=-233 5=1 6=5184 7=576
Swish                    op_269                   1 1 310:12 312:12 -23330=4,3,14,14,576
Split                    splitncnn_10             1 2 312:12 312:12_splitncnn_0 312:12_splitncnn_1 -23330=8,3,14,14,576,3,14,14,576
Reduction                op_270                   1 1 312:12_splitncnn_1 313:12 -23330=4,3,1,1,576 0=3 1=0 -23303=2,1,2 4=1 5=1
Convolution              op_271                   1 1 313:12 315:12 -23330=4,3,1,1,24 0=24 1=1 4=-233 5=1 6=13824
Swish                    op_272                   1 1 315:12 317:12 -23330=4,3,1,1,24
Convolution              op_273                   1 1 317:12 320:12 -23330=4,3,1,1,576 0=576 1=1 4=-233 5=1 6=13824 9=4
BinaryOp                 op_275                   2 1 312:12_splitncnn_0 320:12 321:12 -23330=4,3,14,14,576 0=2
Convolution              op_276                   1 1 321:12 323:12 -23330=4,3,14,14,112 0=112 1=1 4=-233 5=1 6=64512
Split                    splitncnn_11             1 2 323:12 323:12_splitncnn_0 323:12_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112
Convolution              op_277                   1 1 323:12_splitncnn_1 325:12 -23330=4,3,14,14,672 0=672 1=1 4=-233 5=1 6=75264
Swish                    op_278                   1 1 325:12 327:12 -23330=4,3,14,14,672
ConvolutionDepthWise     op_279                   1 1 327:12 330:12 -23330=4,3,14,14,672 0=672 1=3 4=-233 5=1 6=6048 7=672
Swish                    op_281                   1 1 330:12 332:12 -23330=4,3,14,14,672
Split                    splitncnn_12             1 2 332:12 332:12_splitncnn_0 332:12_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
Reduction                op_282                   1 1 332:12_splitncnn_1 333:12 -23330=4,3,1,1,672 0=3 1=0 -23303=2,1,2 4=1 5=1
Convolution              op_283                   1 1 333:12 335:12 -23330=4,3,1,1,28 0=28 1=1 4=-233 5=1 6=18816
Swish                    op_284                   1 1 335:12 337:12 -23330=4,3,1,1,28
Convolution              op_285                   1 1 337:12 340:12 -23330=4,3,1,1,672 0=672 1=1 4=-233 5=1 6=18816 9=4
BinaryOp                 op_287                   2 1 332:12_splitncnn_0 340:12 341:12 -23330=4,3,14,14,672 0=2
Convolution              op_288                   1 1 341:12 342:12 -23330=4,3,14,14,112 0=112 1=1 4=-233 6=75264
Eltwise                  op_289                   2 1 323:12_splitncnn_0 342:12 343:12 -23330=4,3,14,14,112 0=1
BinaryOp                 op_290                   2 1 343:12 110:12 344:12 -23330=4,3,14,14,112
Split                    splitncnn_13             1 2 344:12 344:12_splitncnn_0 344:12_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112
Convolution              op_291                   1 1 344:12_splitncnn_1 346:12 -23330=4,3,14,14,672 0=672 1=1 4=-233 5=1 6=75264
Swish                    op_292                   1 1 346:12 348:12 -23330=4,3,14,14,672
ConvolutionDepthWise     op_293                   1 1 348:12 351:12 -23330=4,3,14,14,672 0=672 1=3 4=-233 5=1 6=6048 7=672
Swish                    op_295                   1 1 351:12 353:12 -23330=4,3,14,14,672
Split                    splitncnn_14             1 2 353:12 353:12_splitncnn_0 353:12_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
Reduction                op_296                   1 1 353:12_splitncnn_1 354:12 -23330=4,3,1,1,672 0=3 1=0 -23303=2,1,2 4=1 5=1
Convolution              op_297                   1 1 354:12 356:12 -23330=4,3,1,1,28 0=28 1=1 4=-233 5=1 6=18816
Swish                    op_298                   1 1 356:12 358:12 -23330=4,3,1,1,28
Convolution              op_299                   1 1 358:12 361:12 -23330=4,3,1,1,672 0=672 1=1 4=-233 5=1 6=18816 9=4
BinaryOp                 op_301                   2 1 353:12_splitncnn_0 361:12 362:12 -23330=4,3,14,14,672 0=2
Convolution              op_302                   1 1 362:12 363:12 -23330=4,3,14,14,112 0=112 1=1 4=-233 6=75264
Eltwise                  op_303                   2 1 363:12 344:12_splitncnn_0 364:12 -23330=4,3,14,14,112 0=1
BinaryOp                 op_304                   2 1 364:12 33:11 365:12 -23330=4,3,14,14,112
Split                    splitncnn_15             1 2 365:12 365:12_splitncnn_0 365:12_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112
Convolution              op_305                   1 1 365:12_splitncnn_1 367:12 -23330=4,3,14,14,672 0=672 1=1 4=-233 5=1 6=75264
Swish                    op_306                   1 1 367:12 369:12 -23330=4,3,14,14,672
ConvolutionDepthWise     op_307                   1 1 369:12 372:12 -23330=4,3,14,14,672 0=672 1=3 4=-233 5=1 6=6048 7=672
Swish                    op_309                   1 1 372:12 374:12 -23330=4,3,14,14,672
Split                    splitncnn_16             1 2 374:12 374:12_splitncnn_0 374:12_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
Reduction                op_310                   1 1 374:12_splitncnn_1 375:12 -23330=4,3,1,1,672 0=3 1=0 -23303=2,1,2 4=1 5=1
Convolution              op_311                   1 1 375:12 377:12 -23330=4,3,1,1,28 0=28 1=1 4=-233 5=1 6=18816
Swish                    op_312                   1 1 377:12 379:12 -23330=4,3,1,1,28
Convolution              op_313                   1 1 379:12 382:12 -23330=4,3,1,1,672 0=672 1=1 4=-233 5=1 6=18816 9=4
BinaryOp                 op_315                   2 1 374:12_splitncnn_0 382:12 383:12 -23330=4,3,14,14,672 0=2
Convolution              op_316                   1 1 383:12 384:12 -23330=4,3,14,14,112 0=112 1=1 4=-233 6=75264
Eltwise                  op_317                   2 1 365:12_splitncnn_0 384:12 385:12 -23330=4,3,14,14,112 0=1
BinaryOp                 op_318                   2 1 385:12 44:11 386:12 -23330=4,3,14,14,112
Split                    splitncnn_17             1 2 386:12 386:12_splitncnn_0 386:12_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112
Convolution              op_319                   1 1 386:12_splitncnn_1 388:12 -23330=4,3,14,14,672 0=672 1=1 4=-233 5=1 6=75264
Swish                    op_320                   1 1 388:12 390:12 -23330=4,3,14,14,672
ConvolutionDepthWise     op_321                   1 1 390:12 393:12 -23330=4,3,14,14,672 0=672 1=3 4=-233 5=1 6=6048 7=672
Swish                    op_323                   1 1 393:12 395:12 -23330=4,3,14,14,672
Split                    splitncnn_18             1 2 395:12 395:12_splitncnn_0 395:12_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
Reduction                op_324                   1 1 395:12_splitncnn_1 396:12 -23330=4,3,1,1,672 0=3 1=0 -23303=2,1,2 4=1 5=1
Convolution              op_325                   1 1 396:12 398:12 -23330=4,3,1,1,28 0=28 1=1 4=-233 5=1 6=18816
Swish                    op_326                   1 1 398:12 400:12 -23330=4,3,1,1,28
Convolution              op_327                   1 1 400:12 403:12 -23330=4,3,1,1,672 0=672 1=1 4=-233 5=1 6=18816 9=4
BinaryOp                 op_329                   2 1 395:12_splitncnn_0 403:12 404:12 -23330=4,3,14,14,672 0=2
Convolution              op_330                   1 1 404:12 405:12 -23330=4,3,14,14,112 0=112 1=1 4=-233 6=75264
Eltwise                  op_331                   2 1 386:12_splitncnn_0 405:12 406:12 -23330=4,3,14,14,112 0=1
BinaryOp                 op_332                   2 1 406:12 55:11 407:12_splitncnn_0 -23330=4,3,14,14,112
Convolution              op_334                   1 1 407:12_splitncnn_0 410:12 -23330=4,3,14,14,672 0=672 1=1 4=-233 5=1 6=75264
Swish                    op_335                   1 1 410:12 412:12 -23330=4,3,14,14,672
ConvolutionDepthWise     op_336                   1 1 412:12 415:12 -23330=4,3,7,7,672 0=672 1=3 3=2 4=-233 5=1 6=6048 7=672
Swish                    op_338                   1 1 415:12 417:12 -23330=4,3,7,7,672
Split                    splitncnn_20             1 2 417:12 417:12_splitncnn_0 417:12_splitncnn_1 -23330=8,3,7,7,672,3,7,7,672
Reduction                op_339                   1 1 417:12_splitncnn_1 418:12 -23330=4,3,1,1,672 0=3 1=0 -23303=2,1,2 4=1 5=1
Convolution              op_340                   1 1 418:12 420:12 -23330=4,3,1,1,28 0=28 1=1 4=-233 5=1 6=18816
Swish                    op_341                   1 1 420:12 422:12 -23330=4,3,1,1,28
Convolution              op_342                   1 1 422:12 425:12 -23330=4,3,1,1,672 0=672 1=1 4=-233 5=1 6=18816 9=4
BinaryOp                 op_344                   2 1 417:12_splitncnn_0 425:12 426:12 -23330=4,3,7,7,672 0=2
Convolution              op_345                   1 1 426:12 428:12 -23330=4,3,7,7,192 0=192 1=1 4=-233 5=1 6=129024
Split                    splitncnn_21             1 2 428:12 428:12_splitncnn_0 428:12_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
Convolution              op_346                   1 1 428:12_splitncnn_1 430:12 -23330=4,3,7,7,1152 0=1152 1=1 4=-233 5=1 6=221184
Swish                    op_347                   1 1 430:12 432:12 -23330=4,3,7,7,1152
ConvolutionDepthWise     op_348                   1 1 432:12 435:12 -23330=4,3,7,7,1152 0=1152 1=3 4=-233 5=1 6=10368 7=1152
Swish                    op_350                   1 1 435:12 437:12 -23330=4,3,7,7,1152
Split                    splitncnn_22             1 2 437:12 437:12_splitncnn_0 437:12_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152
Reduction                op_351                   1 1 437:12_splitncnn_1 438:12 -23330=4,3,1,1,1152 0=3 1=0 -23303=2,1,2 4=1 5=1
Convolution              op_352                   1 1 438:12 440:12 -23330=4,3,1,1,48 0=48 1=1 4=-233 5=1 6=55296
Swish                    op_353                   1 1 440:12 442:12 -23330=4,3,1,1,48
Convolution              op_354                   1 1 442:12 445:12 -23330=4,3,1,1,1152 0=1152 1=1 4=-233 5=1 6=55296 9=4
BinaryOp                 op_356                   2 1 437:12_splitncnn_0 445:12 446:12 -23330=4,3,7,7,1152 0=2
Convolution              op_357                   1 1 446:12 447:12 -23330=4,3,7,7,192 0=192 1=1 4=-233 6=221184
Eltwise                  op_358                   2 1 428:12_splitncnn_0 447:12 448:12 -23330=4,3,7,7,192 0=1
BinaryOp                 op_359                   2 1 448:12 133:12 449:12 -23330=4,3,7,7,192
Split                    splitncnn_23             1 2 449:12 449:12_splitncnn_0 449:12_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
Convolution              op_360                   1 1 449:12_splitncnn_1 451:12 -23330=4,3,7,7,1152 0=1152 1=1 4=-233 5=1 6=221184
Swish                    op_361                   1 1 451:12 453:12 -23330=4,3,7,7,1152
ConvolutionDepthWise     op_362                   1 1 453:12 456:12 -23330=4,3,7,7,1152 0=1152 1=3 4=-233 5=1 6=10368 7=1152
Swish                    op_364                   1 1 456:12 458:12 -23330=4,3,7,7,1152
Split                    splitncnn_24             1 2 458:12 458:12_splitncnn_0 458:12_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152
Reduction                op_365                   1 1 458:12_splitncnn_1 459:12 -23330=4,3,1,1,1152 0=3 1=0 -23303=2,1,2 4=1 5=1
Convolution              op_366                   1 1 459:12 461:12 -23330=4,3,1,1,48 0=48 1=1 4=-233 5=1 6=55296
Swish                    op_367                   1 1 461:12 463:12 -23330=4,3,1,1,48
Convolution              op_368                   1 1 463:12 466:12 -23330=4,3,1,1,1152 0=1152 1=1 4=-233 5=1 6=55296 9=4
BinaryOp                 op_370                   2 1 458:12_splitncnn_0 466:12 467:12 -23330=4,3,7,7,1152 0=2
Convolution              op_371                   1 1 467:12 468:12 -23330=4,3,7,7,192 0=192 1=1 4=-233 6=221184
Eltwise                  op_372                   2 1 449:12_splitncnn_0 468:12 469:12 -23330=4,3,7,7,192 0=1
BinaryOp                 op_373                   2 1 469:12 144:12 470:12 -23330=4,3,7,7,192
Split                    splitncnn_25             1 2 470:12 470:12_splitncnn_0 470:12_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
Convolution              op_374                   1 1 470:12_splitncnn_1 472:12 -23330=4,3,7,7,1152 0=1152 1=1 4=-233 5=1 6=221184
Swish                    op_375                   1 1 472:12 474:12 -23330=4,3,7,7,1152
ConvolutionDepthWise     op_376                   1 1 474:12 477:12 -23330=4,3,7,7,1152 0=1152 1=3 4=-233 5=1 6=10368 7=1152
Swish                    op_378                   1 1 477:12 479:12 -23330=4,3,7,7,1152
Split                    splitncnn_26             1 2 479:12 479:12_splitncnn_0 479:12_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152
Reduction                op_379                   1 1 479:12_splitncnn_1 480:12 -23330=4,3,1,1,1152 0=3 1=0 -23303=2,1,2 4=1 5=1
Convolution              op_380                   1 1 480:12 482:12 -23330=4,3,1,1,48 0=48 1=1 4=-233 5=1 6=55296
Swish                    op_381                   1 1 482:12 484:12 -23330=4,3,1,1,48
Convolution              op_382                   1 1 484:12 487:12 -23330=4,3,1,1,1152 0=1152 1=1 4=-233 5=1 6=55296 9=4
BinaryOp                 op_384                   2 1 479:12_splitncnn_0 487:12 488:12 -23330=4,3,7,7,1152 0=2
Convolution              op_385                   1 1 488:12 489:12 -23330=4,3,7,7,192 0=192 1=1 4=-233 6=221184
Eltwise                  op_386                   2 1 470:12_splitncnn_0 489:12 490:12 -23330=4,3,7,7,192 0=1
BinaryOp                 op_387                   2 1 490:12 155:12 491:12 -23330=4,3,7,7,192
Split                    splitncnn_27             1 2 491:12 491:12_splitncnn_0 491:12_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
Convolution              op_388                   1 1 491:12_splitncnn_1 493:12 -23330=4,3,7,7,1152 0=1152 1=1 4=-233 5=1 6=221184
Swish                    op_389                   1 1 493:12 495:12 -23330=4,3,7,7,1152
ConvolutionDepthWise     op_390                   1 1 495:12 498:12 -23330=4,3,7,7,1152 0=1152 1=3 4=-233 5=1 6=10368 7=1152
Swish                    op_392                   1 1 498:12 500:12 -23330=4,3,7,7,1152
Split                    splitncnn_28             1 2 500:12 500:12_splitncnn_0 500:12_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152
Reduction                op_393                   1 1 500:12_splitncnn_1 501:12 -23330=4,3,1,1,1152 0=3 1=0 -23303=2,1,2 4=1 5=1
Convolution              op_394                   1 1 501:12 503:12 -23330=4,3,1,1,48 0=48 1=1 4=-233 5=1 6=55296
Swish                    op_395                   1 1 503:12 505:12 -23330=4,3,1,1,48
Convolution              op_396                   1 1 505:12 508:12 -23330=4,3,1,1,1152 0=1152 1=1 4=-233 5=1 6=55296 9=4
BinaryOp                 op_398                   2 1 500:12_splitncnn_0 508:12 509:12 -23330=4,3,7,7,1152 0=2
Convolution              op_399                   1 1 509:12 510:12 -23330=4,3,7,7,192 0=192 1=1 4=-233 6=221184
Eltwise                  op_400                   2 1 491:12_splitncnn_0 510:12 511:12 -23330=4,3,7,7,192 0=1
BinaryOp                 op_401                   2 1 511:12 166:12 512:12 -23330=4,3,7,7,192
Split                    splitncnn_29             1 2 512:12 512:12_splitncnn_0 512:12_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
Convolution              op_402                   1 1 512:12_splitncnn_1 514:12 -23330=4,3,7,7,1152 0=1152 1=1 4=-233 5=1 6=221184
Swish                    op_403                   1 1 514:12 516:12 -23330=4,3,7,7,1152
ConvolutionDepthWise     op_404                   1 1 516:12 519:12 -23330=4,3,7,7,1152 0=1152 1=3 4=-233 5=1 6=10368 7=1152
Swish                    op_406                   1 1 519:12 521:12 -23330=4,3,7,7,1152
Split                    splitncnn_30             1 2 521:12 521:12_splitncnn_0 521:12_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152
Reduction                op_407                   1 1 521:12_splitncnn_1 522:12 -23330=4,3,1,1,1152 0=3 1=0 -23303=2,1,2 4=1 5=1
Convolution              op_408                   1 1 522:12 524:12 -23330=4,3,1,1,48 0=48 1=1 4=-233 5=1 6=55296
Swish                    op_409                   1 1 524:12 526:12 -23330=4,3,1,1,48
Convolution              op_410                   1 1 526:12 529:12 -23330=4,3,1,1,1152 0=1152 1=1 4=-233 5=1 6=55296 9=4
BinaryOp                 op_412                   2 1 521:12_splitncnn_0 529:12 530:12 -23330=4,3,7,7,1152 0=2
Convolution              op_413                   1 1 530:12 531:12 -23330=4,3,7,7,192 0=192 1=1 4=-233 6=221184
Eltwise                  op_414                   2 1 512:12_splitncnn_0 531:12 532:12 -23330=4,3,7,7,192 0=1
BinaryOp                 op_415                   2 1 532:12 177:12 533:12 -23330=4,3,7,7,192
Split                    splitncnn_31             1 2 533:12 533:12_splitncnn_0 533:12_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
Convolution              op_416                   1 1 533:12_splitncnn_1 535:12 -23330=4,3,7,7,1152 0=1152 1=1 4=-233 5=1 6=221184
Swish                    op_417                   1 1 535:12 537:12 -23330=4,3,7,7,1152
ConvolutionDepthWise     op_418                   1 1 537:12 540:12 -23330=4,3,7,7,1152 0=1152 1=3 4=-233 5=1 6=10368 7=1152
Swish                    op_420                   1 1 540:12 542:12 -23330=4,3,7,7,1152
Split                    splitncnn_32             1 2 542:12 542:12_splitncnn_0 542:12_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152
Reduction                op_421                   1 1 542:12_splitncnn_1 543:12 -23330=4,3,1,1,1152 0=3 1=0 -23303=2,1,2 4=1 5=1
Convolution              op_422                   1 1 543:12 545:12 -23330=4,3,1,1,48 0=48 1=1 4=-233 5=1 6=55296
Swish                    op_423                   1 1 545:12 547:12 -23330=4,3,1,1,48
Convolution              op_424                   1 1 547:12 550:12 -23330=4,3,1,1,1152 0=1152 1=1 4=-233 5=1 6=55296 9=4
BinaryOp                 op_426                   2 1 542:12_splitncnn_0 550:12 551:12 -23330=4,3,7,7,1152 0=2
Convolution              op_427                   1 1 551:12 552:12 -23330=4,3,7,7,192 0=192 1=1 4=-233 6=221184
Eltwise                  op_428                   2 1 533:12_splitncnn_0 552:12 553:12 -23330=4,3,7,7,192 0=1
BinaryOp                 op_429                   2 1 553:12 188:12 554:12 -23330=4,3,7,7,192
Split                    splitncnn_33             1 2 554:12 554:12_splitncnn_0 554:12_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
Convolution              op_430                   1 1 554:12_splitncnn_1 556:12 -23330=4,3,7,7,1152 0=1152 1=1 4=-233 5=1 6=221184
Swish                    op_431                   1 1 556:12 558:12 -23330=4,3,7,7,1152
ConvolutionDepthWise     op_432                   1 1 558:12 561:12 -23330=4,3,7,7,1152 0=1152 1=3 4=-233 5=1 6=10368 7=1152
Swish                    op_434                   1 1 561:12 563:12 -23330=4,3,7,7,1152
Split                    splitncnn_34             1 2 563:12 563:12_splitncnn_0 563:12_splitncnn_1 -23330=8,3,7,7,1152,3,7,7,1152
Reduction                op_435                   1 1 563:12_splitncnn_1 564:12 -23330=4,3,1,1,1152 0=3 1=0 -23303=2,1,2 4=1 5=1
Convolution              op_436                   1 1 564:12 566:12 -23330=4,3,1,1,48 0=48 1=1 4=-233 5=1 6=55296
Swish                    op_437                   1 1 566:12 568:12 -23330=4,3,1,1,48
Convolution              op_438                   1 1 568:12 571:12 -23330=4,3,1,1,1152 0=1152 1=1 4=-233 5=1 6=55296 9=4
BinaryOp                 op_440                   2 1 563:12_splitncnn_0 571:12 572:12 -23330=4,3,7,7,1152 0=2
Convolution              op_441                   1 1 572:12 573:12 -23330=4,3,7,7,192 0=192 1=1 4=-233 6=221184
Eltwise                  op_442                   2 1 554:12_splitncnn_0 573:12 574:12 -23330=4,3,7,7,192 0=1
BinaryOp                 op_443                   2 1 574:12 199:12 575:12_splitncnn_0 -23330=4,3,7,7,192
Convolution              op_445                   1 1 575:12_splitncnn_0 578:12 -23330=4,3,7,7,1280 0=1280 1=1 4=-233 5=1 6=245760
Swish                    op_446                   1 1 578:12 580:12 -23330=4,3,7,7,1280
Pooling                  op_447                   1 1 580:12 581:12 -23330=4,1,1280,1,1 0=1 4=1
InnerProduct             op_448                   1 1 581:12 584:12 -23330=4,1,1000,1,1 0=1000 1=1 2=1280000


================================================
FILE: benchmark/googlenet.param
================================================
7767517
94 121
Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
Convolution              conv1/7x7_s2             1 1 data conv1/7x7_s2_conv1/relu_7x7 -23330=4,3,112,112,64 0=64 1=7 3=2 4=3 5=1 6=9408 9=1
Pooling                  pool1/3x3_s2             1 1 conv1/7x7_s2_conv1/relu_7x7 pool1/3x3_s2 -23330=4,3,56,56,64 1=3 2=2
LRN                      pool1/norm1              1 1 pool1/3x3_s2 pool1/norm1 -23330=4,3,56,56,64 2=1.000000e-04
Convolution              conv2/3x3_reduce         1 1 pool1/norm1 conv2/3x3_reduce_conv2/relu_3x3_reduce -23330=4,3,56,56,64 0=64 1=1 5=1 6=4096 9=1
Convolution              conv2/3x3                1 1 conv2/3x3_reduce_conv2/relu_3x3_reduce conv2/3x3_conv2/relu_3x3 -23330=4,3,56,56,192 0=192 1=3 4=1 5=1 6=110592 9=1
LRN                      conv2/norm2              1 1 conv2/3x3_conv2/relu_3x3 conv2/norm2 -23330=4,3,56,56,192 2=1.000000e-04
Pooling                  pool2/3x3_s2             1 1 conv2/norm2 pool2/3x3_s2 -23330=4,3,28,28,192 1=3 2=2
Split                    splitncnn_0              1 4 pool2/3x3_s2 pool2/3x3_s2_splitncnn_0 pool2/3x3_s2_splitncnn_1 pool2/3x3_s2_splitncnn_2 pool2/3x3_s2_splitncnn_3 -23330=16,3,28,28,192,3,28,28,192,3,28,28,192,3,28,28,192
Convolution              inception_3a/1x1         1 1 pool2/3x3_s2_splitncnn_3 inception_3a/1x1_inception_3a/relu_1x1 -23330=4,3,28,28,64 0=64 1=1 5=1 6=12288 9=1
Convolution              inception_3a/3x3_reduce  1 1 pool2/3x3_s2_splitncnn_2 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce -23330=4,3,28,28,96 0=96 1=1 5=1 6=18432 9=1
Convolution              inception_3a/3x3         1 1 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce inception_3a/3x3_inception_3a/relu_3x3 -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=110592 9=1
Convolution              inception_3a/5x5_reduce  1 1 pool2/3x3_s2_splitncnn_1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce -23330=4,3,28,28,16 0=16 1=1 5=1 6=3072 9=1
Convolution              inception_3a/5x5         1 1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce inception_3a/5x5_inception_3a/relu_5x5 -23330=4,3,28,28,32 0=32 1=5 4=2 5=1 6=12800 9=1
Pooling                  inception_3a/pool        1 1 pool2/3x3_s2_splitncnn_0 inception_3a/pool -23330=4,3,28,28,192 1=3 3=1
Convolution              inception_3a/pool_proj   1 1 inception_3a/pool inception_3a/pool_proj_inception_3a/relu_pool_proj -23330=4,3,28,28,32 0=32 1=1 5=1 6=6144 9=1
Concat                   inception_3a/output      4 1 inception_3a/1x1_inception_3a/relu_1x1 inception_3a/3x3_inception_3a/relu_3x3 inception_3a/5x5_inception_3a/relu_5x5 inception_3a/pool_proj_inception_3a/relu_pool_proj inception_3a/output -23330=4,3,28,28,256
Split                    splitncnn_1              1 4 inception_3a/output inception_3a/output_splitncnn_0 inception_3a/output_splitncnn_1 inception_3a/output_splitncnn_2 inception_3a/output_splitncnn_3 -23330=16,3,28,28,256,3,28,28,256,3,28,28,256,3,28,28,256
Convolution              inception_3b/1x1         1 1 inception_3a/output_splitncnn_3 inception_3b/1x1_inception_3b/relu_1x1 -23330=4,3,28,28,128 0=128 1=1 5=1 6=32768 9=1
Convolution              inception_3b/3x3_reduce  1 1 inception_3a/output_splitncnn_2 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce -23330=4,3,28,28,128 0=128 1=1 5=1 6=32768 9=1
Convolution              inception_3b/3x3         1 1 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce inception_3b/3x3_inception_3b/relu_3x3 -23330=4,3,28,28,192 0=192 1=3 4=1 5=1 6=221184 9=1
Convolution              inception_3b/5x5_reduce  1 1 inception_3a/output_splitncnn_1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce -23330=4,3,28,28,32 0=32 1=1 5=1 6=8192 9=1
Convolution              inception_3b/5x5         1 1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce inception_3b/5x5_inception_3b/relu_5x5 -23330=4,3,28,28,96 0=96 1=5 4=2 5=1 6=76800 9=1
Pooling                  inception_3b/pool        1 1 inception_3a/output_splitncnn_0 inception_3b/pool -23330=4,3,28,28,256 1=3 3=1
Convolution              inception_3b/pool_proj   1 1 inception_3b/pool inception_3b/pool_proj_inception_3b/relu_pool_proj -23330=4,3,28,28,64 0=64 1=1 5=1 6=16384 9=1
Concat                   inception_3b/output      4 1 inception_3b/1x1_inception_3b/relu_1x1 inception_3b/3x3_inception_3b/relu_3x3 inception_3b/5x5_inception_3b/relu_5x5 inception_3b/pool_proj_inception_3b/relu_pool_proj inception_3b/output -23330=4,3,28,28,480
Pooling                  pool3/3x3_s2             1 1 inception_3b/output pool3/3x3_s2 -23330=4,3,14,14,480 1=3 2=2
Split                    splitncnn_2              1 4 pool3/3x3_s2 pool3/3x3_s2_splitncnn_0 pool3/3x3_s2_splitncnn_1 pool3/3x3_s2_splitncnn_2 pool3/3x3_s2_splitncnn_3 -23330=16,3,14,14,480,3,14,14,480,3,14,14,480,3,14,14,480
Convolution              inception_4a/1x1         1 1 pool3/3x3_s2_splitncnn_3 inception_4a/1x1_inception_4a/relu_1x1 -23330=4,3,14,14,192 0=192 1=1 5=1 6=92160 9=1
Convolution              inception_4a/3x3_reduce  1 1 pool3/3x3_s2_splitncnn_2 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce -23330=4,3,14,14,96 0=96 1=1 5=1 6=46080 9=1
Convolution              inception_4a/3x3         1 1 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce inception_4a/3x3_inception_4a/relu_3x3 -23330=4,3,14,14,208 0=208 1=3 4=1 5=1 6=179712 9=1
Convolution              inception_4a/5x5_reduce  1 1 pool3/3x3_s2_splitncnn_1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce -23330=4,3,14,14,16 0=16 1=1 5=1 6=7680 9=1
Convolution              inception_4a/5x5         1 1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce inception_4a/5x5_inception_4a/relu_5x5 -23330=4,3,14,14,48 0=48 1=5 4=2 5=1 6=19200 9=1
Pooling                  inception_4a/pool        1 1 pool3/3x3_s2_splitncnn_0 inception_4a/pool -23330=4,3,14,14,480 1=3 3=1
Convolution              inception_4a/pool_proj   1 1 inception_4a/pool inception_4a/pool_proj_inception_4a/relu_pool_proj -23330=4,3,14,14,64 0=64 1=1 5=1 6=30720 9=1
Concat                   inception_4a/output      4 1 inception_4a/1x1_inception_4a/relu_1x1 inception_4a/3x3_inception_4a/relu_3x3 inception_4a/5x5_inception_4a/relu_5x5 inception_4a/pool_proj_inception_4a/relu_pool_proj inception_4a/output -23330=4,3,14,14,512
Split                    splitncnn_3              1 4 inception_4a/output inception_4a/output_splitncnn_0 inception_4a/output_splitncnn_1 inception_4a/output_splitncnn_2 inception_4a/output_splitncnn_3 -23330=16,3,14,14,512,3,14,14,512,3,14,14,512,3,14,14,512
Convolution              inception_4b/1x1         1 1 inception_4a/output_splitncnn_3 inception_4b/1x1_inception_4b/relu_1x1 -23330=4,3,14,14,160 0=160 1=1 5=1 6=81920 9=1
Convolution              inception_4b/3x3_reduce  1 1 inception_4a/output_splitncnn_2 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce -23330=4,3,14,14,112 0=112 1=1 5=1 6=57344 9=1
Convolution              inception_4b/3x3         1 1 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce inception_4b/3x3_inception_4b/relu_3x3 -23330=4,3,14,14,224 0=224 1=3 4=1 5=1 6=225792 9=1
Convolution              inception_4b/5x5_reduce  1 1 inception_4a/output_splitncnn_1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce -23330=4,3,14,14,24 0=24 1=1 5=1 6=12288 9=1
Convolution              inception_4b/5x5         1 1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce inception_4b/5x5_inception_4b/relu_5x5 -23330=4,3,14,14,64 0=64 1=5 4=2 5=1 6=38400 9=1
Pooling                  inception_4b/pool        1 1 inception_4a/output_splitncnn_0 inception_4b/pool -23330=4,3,14,14,512 1=3 3=1
Convolution              inception_4b/pool_proj   1 1 inception_4b/pool inception_4b/pool_proj_inception_4b/relu_pool_proj -23330=4,3,14,14,64 0=64 1=1 5=1 6=32768 9=1
Concat                   inception_4b/output      4 1 inception_4b/1x1_inception_4b/relu_1x1 inception_4b/3x3_inception_4b/relu_3x3 inception_4b/5x5_inception_4b/relu_5x5 inception_4b/pool_proj_inception_4b/relu_pool_proj inception_4b/output -23330=4,3,14,14,512
Split                    splitncnn_4              1 4 inception_4b/output inception_4b/output_splitncnn_0 inception_4b/output_splitncnn_1 inception_4b/output_splitncnn_2 inception_4b/output_splitncnn_3 -23330=16,3,14,14,512,3,14,14,512,3,14,14,512,3,14,14,512
Convolution              inception_4c/1x1         1 1 inception_4b/output_splitncnn_3 inception_4c/1x1_inception_4c/relu_1x1 -23330=4,3,14,14,128 0=128 1=1 5=1 6=65536 9=1
Convolution              inception_4c/3x3_reduce  1 1 inception_4b/output_splitncnn_2 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce -23330=4,3,14,14,128 0=128 1=1 5=1 6=65536 9=1
Convolution              inception_4c/3x3         1 1 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce inception_4c/3x3_inception_4c/relu_3x3 -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=294912 9=1
Convolution              inception_4c/5x5_reduce  1 1 inception_4b/output_splitncnn_1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce -23330=4,3,14,14,24 0=24 1=1 5=1 6=12288 9=1
Convolution              inception_4c/5x5         1 1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce inception_4c/5x5_inception_4c/relu_5x5 -23330=4,3,14,14,64 0=64 1=5 4=2 5=1 6=38400 9=1
Pooling                  inception_4c/pool        1 1 inception_4b/output_splitncnn_0 inception_4c/pool -23330=4,3,14,14,512 1=3 3=1
Convolution              inception_4c/pool_proj   1 1 inception_4c/pool inception_4c/pool_proj_inception_4c/relu_pool_proj -23330=4,3,14,14,64 0=64 1=1 5=1 6=32768 9=1
Concat                   inception_4c/output      4 1 inception_4c/1x1_inception_4c/relu_1x1 inception_4c/3x3_inception_4c/relu_3x3 inception_4c/5x5_inception_4c/relu_5x5 inception_4c/pool_proj_inception_4c/relu_pool_proj inception_4c/output -23330=4,3,14,14,512
Split                    splitncnn_5              1 4 inception_4c/output inception_4c/output_splitncnn_0 inception_4c/output_splitncnn_1 inception_4c/output_splitncnn_2 inception_4c/output_splitncnn_3 -23330=16,3,14,14,512,3,14,14,512,3,14,14,512,3,14,14,512
Convolution              inception_4d/1x1         1 1 inception_4c/output_splitncnn_3 inception_4d/1x1_inception_4d/relu_1x1 -23330=4,3,14,14,112 0=112 1=1 5=1 6=57344 9=1
Convolution              inception_4d/3x3_reduce  1 1 inception_4c/output_splitncnn_2 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce -23330=4,3,14,14,144 0=144 1=1 5=1 6=73728 9=1
Convolution              inception_4d/3x3         1 1 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce inception_4d/3x3_inception_4d/relu_3x3 -23330=4,3,14,14,288 0=288 1=3 4=1 5=1 6=373248 9=1
Convolution              inception_4d/5x5_reduce  1 1 inception_4c/output_splitncnn_1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce -23330=4,3,14,14,32 0=32 1=1 5=1 6=16384 9=1
Convolution              inception_4d/5x5         1 1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce inception_4d/5x5_inception_4d/relu_5x5 -23330=4,3,14,14,64 0=64 1=5 4=2 5=1 6=51200 9=1
Pooling                  inception_4d/pool        1 1 inception_4c/output_splitncnn_0 inception_4d/pool -23330=4,3,14,14,512 1=3 3=1
Convolution              inception_4d/pool_proj   1 1 inception_4d/pool inception_4d/pool_proj_inception_4d/relu_pool_proj -23330=4,3,14,14,64 0=64 1=1 5=1 6=32768 9=1
Concat                   inception_4d/output      4 1 inception_4d/1x1_inception_4d/relu_1x1 inception_4d/3x3_inception_4d/relu_3x3 inception_4d/5x5_inception_4d/relu_5x5 inception_4d/pool_proj_inception_4d/relu_pool_proj inception_4d/output -23330=4,3,14,14,528
Split                    splitncnn_6              1 4 inception_4d/output inception_4d/output_splitncnn_0 inception_4d/output_splitncnn_1 inception_4d/output_splitncnn_2 inception_4d/output_splitncnn_3 -23330=16,3,14,14,528,3,14,14,528,3,14,14,528,3,14,14,528
Convolution              inception_4e/1x1         1 1 inception_4d/output_splitncnn_3 inception_4e/1x1_inception_4e/relu_1x1 -23330=4,3,14,14,256 0=256 1=1 5=1 6=135168 9=1
Convolution              inception_4e/3x3_reduce  1 1 inception_4d/output_splitncnn_2 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce -23330=4,3,14,14,160 0=160 1=1 5=1 6=84480 9=1
Convolution              inception_4e/3x3         1 1 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce inception_4e/3x3_inception_4e/relu_3x3 -23330=4,3,14,14,320 0=320 1=3 4=1 5=1 6=460800 9=1
Convolution              inception_4e/5x5_reduce  1 1 inception_4d/output_splitncnn_1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce -23330=4,3,14,14,32 0=32 1=1 5=1 6=16896 9=1
Convolution              inception_4e/5x5         1 1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce inception_4e/5x5_inception_4e/relu_5x5 -23330=4,3,14,14,128 0=128 1=5 4=2 5=1 6=102400 9=1
Pooling                  inception_4e/pool        1 1 inception_4d/output_splitncnn_0 inception_4e/pool -23330=4,3,14,14,528 1=3 3=1
Convolution              inception_4e/pool_proj   1 1 inception_4e/pool inception_4e/pool_proj_inception_4e/relu_pool_proj -23330=4,3,14,14,128 0=128 1=1 5=1 6=67584 9=1
Concat                   inception_4e/output      4 1 inception_4e/1x1_inception_4e/relu_1x1 inception_4e/3x3_inception_4e/relu_3x3 inception_4e/5x5_inception_4e/relu_5x5 inception_4e/pool_proj_inception_4e/relu_pool_proj inception_4e/output -23330=4,3,14,14,832
Pooling                  pool4/3x3_s2             1 1 inception_4e/output pool4/3x3_s2 -23330=4,3,7,7,832 1=3 2=2
Split                    splitncnn_7              1 4 pool4/3x3_s2 pool4/3x3_s2_splitncnn_0 pool4/3x3_s2_splitncnn_1 pool4/3x3_s2_splitncnn_2 pool4/3x3_s2_splitncnn_3 -23330=16,3,7,7,832,3,7,7,832,3,7,7,832,3,7,7,832
Convolution              inception_5a/1x1         1 1 pool4/3x3_s2_splitncnn_3 inception_5a/1x1_inception_5a/relu_1x1 -23330=4,3,7,7,256 0=256 1=1 5=1 6=212992 9=1
Convolution              inception_5a/3x3_reduce  1 1 pool4/3x3_s2_splitncnn_2 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce -23330=4,3,7,7,160 0=160 1=1 5=1 6=133120 9=1
Convolution              inception_5a/3x3         1 1 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce inception_5a/3x3_inception_5a/relu_3x3 -23330=4,3,7,7,320 0=320 1=3 4=1 5=1 6=460800 9=1
Convolution              inception_5a/5x5_reduce  1 1 pool4/3x3_s2_splitncnn_1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce -23330=4,3,7,7,32 0=32 1=1 5=1 6=26624 9=1
Convolution              inception_5a/5x5         1 1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce inception_5a/5x5_inception_5a/relu_5x5 -23330=4,3,7,7,128 0=128 1=5 4=2 5=1 6=102400 9=1
Pooling                  inception_5a/pool        1 1 pool4/3x3_s2_splitncnn_0 inception_5a/pool -23330=4,3,7,7,832 1=3 3=1
Convolution              inception_5a/pool_proj   1 1 inception_5a/pool inception_5a/pool_proj_inception_5a/relu_pool_proj -23330=4,3,7,7,128 0=128 1=1 5=1 6=106496 9=1
Concat                   inception_5a/output      4 1 inception_5a/1x1_inception_5a/relu_1x1 inception_5a/3x3_inception_5a/relu_3x3 inception_5a/5x5_inception_5a/relu_5x5 inception_5a/pool_proj_inception_5a/relu_pool_proj inception_5a/output -23330=4,3,7,7,832
Split                    splitncnn_8              1 4 inception_5a/output inception_5a/output_splitncnn_0 inception_5a/output_splitncnn_1 inception_5a/output_splitncnn_2 inception_5a/output_splitncnn_3 -23330=16,3,7,7,832,3,7,7,832,3,7,7,832,3,7,7,832
Convolution              inception_5b/1x1         1 1 inception_5a/output_splitncnn_3 inception_5b/1x1_inception_5b/relu_1x1 -23330=4,3,7,7,384 0=384 1=1 5=1 6=319488 9=1
Convolution              inception_5b/3x3_reduce  1 1 inception_5a/output_splitncnn_2 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce -23330=4,3,7,7,192 0=192 1=1 5=1 6=159744 9=1
Convolution              inception_5b/3x3         1 1 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce inception_5b/3x3_inception_5b/relu_3x3 -23330=4,3,7,7,384 0=384 1=3 4=1 5=1 6=663552 9=1
Convolution              inception_5b/5x5_reduce  1 1 inception_5a/output_splitncnn_1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce -23330=4,3,7,7,48 0=48 1=1 5=1 6=39936 9=1
Convolution              inception_5b/5x5         1 1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce inception_5b/5x5_inception_5b/relu_5x5 -23330=4,3,7,7,128 0=128 1=5 4=2 5=1 6=153600 9=1
Pooling                  inception_5b/pool        1 1 inception_5a/output_splitncnn_0 inception_5b/pool -23330=4,3,7,7,832 1=3 3=1
Convolution              inception_5b/pool_proj   1 1 inception_5b/pool inception_5b/pool_proj_inception_5b/relu_pool_proj -23330=4,3,7,7,128 0=128 1=1 5=1 6=106496 9=1
Concat                   inception_5b/output      4 1 inception_5b/1x1_inception_5b/relu_1x1 inception_5b/3x3_inception_5b/relu_3x3 inception_5b/5x5_inception_5b/relu_5x5 inception_5b/pool_proj_inception_5b/relu_pool_proj inception_5b/output -23330=4,3,7,7,1024
Pooling                  pool5/7x7_s1             1 1 inception_5b/output pool5/7x7_s1_pool5/drop_7x7_s1 -23330=4,3,1,1,1024 0=1 1=7
InnerProduct             loss3/classifier         1 1 pool5/7x7_s1_pool5/drop_7x7_s1 loss3/classifier -23330=4,1,1000,1,1 0=1000 1=1 2=1024000
Softmax                  prob                     1 1 loss3/classifier output -23330=4,1,1000,1,1


================================================
FILE: benchmark/googlenet_int8.param
================================================
7767517
94 121
Input                    data                     0 1 data 0=224 1=224 2=3
Convolution              conv1/7x7_s2             1 1 data conv1/7x7_s2_conv1/relu_7x7 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1
Pooling                  pool1/3x3_s2             1 1 conv1/7x7_s2_conv1/relu_7x7 pool1/3x3_s2 1=3 2=2
LRN                      pool1/norm1              1 1 pool1/3x3_s2 pool1/norm1 2=0.000100
Convolution              conv2/3x3_reduce         1 1 pool1/norm1 conv2/3x3_reduce_conv2/relu_3x3_reduce 0=64 1=1 5=1 6=4096 8=102 9=1
Convolution              conv2/3x3                1 1 conv2/3x3_reduce_conv2/relu_3x3_reduce conv2/3x3_conv2/relu_3x3 0=192 1=3 4=1 5=1 6=110592 8=2 9=1
LRN                      conv2/norm2              1 1 conv2/3x3_conv2/relu_3x3 conv2/norm2 2=0.000100
Pooling                  pool2/3x3_s2             1 1 conv2/norm2 pool2/3x3_s2 1=3 2=2
Split                    splitncnn_0              1 4 pool2/3x3_s2 pool2/3x3_s2_splitncnn_0 pool2/3x3_s2_splitncnn_1 pool2/3x3_s2_splitncnn_2 pool2/3x3_s2_splitncnn_3
Convolution              inception_3a/1x1         1 1 pool2/3x3_s2_splitncnn_3 inception_3a/1x1_inception_3a/relu_1x1 0=64 1=1 5=1 6=12288 8=2 9=1
Convolution              inception_3a/3x3_reduce  1 1 pool2/3x3_s2_splitncnn_2 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce 0=96 1=1 5=1 6=18432 8=102 9=1
Convolution              inception_3a/3x3         1 1 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce inception_3a/3x3_inception_3a/relu_3x3 0=128 1=3 4=1 5=1 6=110592 8=2 9=1
Convolution              inception_3a/5x5_reduce  1 1 pool2/3x3_s2_splitncnn_1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce 0=16 1=1 5=1 6=3072 8=102 9=1
Convolution              inception_3a/5x5         1 1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce inception_3a/5x5_inception_3a/relu_5x5 0=32 1=5 4=2 5=1 6=12800 8=2 9=1
Pooling                  inception_3a/pool        1 1 pool2/3x3_s2_splitncnn_0 inception_3a/pool 1=3 3=1
Convolution              inception_3a/pool_proj   1 1 inception_3a/pool inception_3a/pool_proj_inception_3a/relu_pool_proj 0=32 1=1 5=1 6=6144 8=2 9=1
Concat                   inception_3a/output      4 1 inception_3a/1x1_inception_3a/relu_1x1 inception_3a/3x3_inception_3a/relu_3x3 inception_3a/5x5_inception_3a/relu_5x5 inception_3a/pool_proj_inception_3a/relu_pool_proj inception_3a/output
Split                    splitncnn_1              1 4 inception_3a/output inception_3a/output_splitncnn_0 inception_3a/output_splitncnn_1 inception_3a/output_splitncnn_2 inception_3a/output_splitncnn_3
Convolution              inception_3b/1x1         1 1 inception_3a/output_splitncnn_3 inception_3b/1x1_inception_3b/relu_1x1 0=128 1=1 5=1 6=32768 8=2 9=1
Convolution              inception_3b/3x3_reduce  1 1 inception_3a/output_splitncnn_2 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce 0=128 1=1 5=1 6=32768 8=102 9=1
Convolution              inception_3b/3x3         1 1 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce inception_3b/3x3_inception_3b/relu_3x3 0=192 1=3 4=1 5=1 6=221184 8=2 9=1
Convolution              inception_3b/5x5_reduce  1 1 inception_3a/output_splitncnn_1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce 0=32 1=1 5=1 6=8192 8=102 9=1
Convolution              inception_3b/5x5         1 1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce inception_3b/5x5_inception_3b/relu_5x5 0=96 1=5 4=2 5=1 6=76800 8=2 9=1
Pooling                  inception_3b/pool        1 1 inception_3a/output_splitncnn_0 inception_3b/pool 1=3 3=1
Convolution              inception_3b/pool_proj   1 1 inception_3b/pool inception_3b/pool_proj_inception_3b/relu_pool_proj 0=64 1=1 5=1 6=16384 8=2 9=1
Concat                   inception_3b/output      4 1 inception_3b/1x1_inception_3b/relu_1x1 inception_3b/3x3_inception_3b/relu_3x3 inception_3b/5x5_inception_3b/relu_5x5 inception_3b/pool_proj_inception_3b/relu_pool_proj inception_3b/output
Pooling                  pool3/3x3_s2             1 1 inception_3b/output pool3/3x3_s2 1=3 2=2
Split                    splitncnn_2              1 4 pool3/3x3_s2 pool3/3x3_s2_splitncnn_0 pool3/3x3_s2_splitncnn_1 pool3/3x3_s2_splitncnn_2 pool3/3x3_s2_splitncnn_3
Convolution              inception_4a/1x1         1 1 pool3/3x3_s2_splitncnn_3 inception_4a/1x1_inception_4a/relu_1x1 0=192 1=1 5=1 6=92160 8=2 9=1
Convolution              inception_4a/3x3_reduce  1 1 pool3/3x3_s2_splitncnn_2 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce 0=96 1=1 5=1 6=46080 8=102 9=1
Convolution              inception_4a/3x3         1 1 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce inception_4a/3x3_inception_4a/relu_3x3 0=208 1=3 4=1 5=1 6=179712 8=2 9=1
Convolution              inception_4a/5x5_reduce  1 1 pool3/3x3_s2_splitncnn_1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce 0=16 1=1 5=1 6=7680 8=102 9=1
Convolution              inception_4a/5x5         1 1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce inception_4a/5x5_inception_4a/relu_5x5 0=48 1=5 4=2 5=1 6=19200 8=2 9=1
Pooling                  inception_4a/pool        1 1 pool3/3x3_s2_splitncnn_0 inception_4a/pool 1=3 3=1
Convolution              inception_4a/pool_proj   1 1 inception_4a/pool inception_4a/pool_proj_inception_4a/relu_pool_proj 0=64 1=1 5=1 6=30720 8=2 9=1
Concat                   inception_4a/output      4 1 inception_4a/1x1_inception_4a/relu_1x1 inception_4a/3x3_inception_4a/relu_3x3 inception_4a/5x5_inception_4a/relu_5x5 inception_4a/pool_proj_inception_4a/relu_pool_proj inception_4a/output
Split                    splitncnn_3              1 4 inception_4a/output inception_4a/output_splitncnn_0 inception_4a/output_splitncnn_1 inception_4a/output_splitncnn_2 inception_4a/output_splitncnn_3
Convolution              inception_4b/1x1         1 1 inception_4a/output_splitncnn_3 inception_4b/1x1_inception_4b/relu_1x1 0=160 1=1 5=1 6=81920 8=2 9=1
Convolution              inception_4b/3x3_reduce  1 1 inception_4a/output_splitncnn_2 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce 0=112 1=1 5=1 6=57344 8=102 9=1
Convolution              inception_4b/3x3         1 1 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce inception_4b/3x3_inception_4b/relu_3x3 0=224 1=3 4=1 5=1 6=225792 8=2 9=1
Convolution              inception_4b/5x5_reduce  1 1 inception_4a/output_splitncnn_1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce 0=24 1=1 5=1 6=12288 8=102 9=1
Convolution              inception_4b/5x5         1 1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce inception_4b/5x5_inception_4b/relu_5x5 0=64 1=5 4=2 5=1 6=38400 8=2 9=1
Pooling                  inception_4b/pool        1 1 inception_4a/output_splitncnn_0 inception_4b/pool 1=3 3=1
Convolution              inception_4b/pool_proj   1 1 inception_4b/pool inception_4b/pool_proj_inception_4b/relu_pool_proj 0=64 1=1 5=1 6=32768 8=2 9=1
Concat                   inception_4b/output      4 1 inception_4b/1x1_inception_4b/relu_1x1 inception_4b/3x3_inception_4b/relu_3x3 inception_4b/5x5_inception_4b/relu_5x5 inception_4b/pool_proj_inception_4b/relu_pool_proj inception_4b/output
Split                    splitncnn_4              1 4 inception_4b/output inception_4b/output_splitncnn_0 inception_4b/output_splitncnn_1 inception_4b/output_splitncnn_2 inception_4b/output_splitncnn_3
Convolution              inception_4c/1x1         1 1 inception_4b/output_splitncnn_3 inception_4c/1x1_inception_4c/relu_1x1 0=128 1=1 5=1 6=65536 8=2 9=1
Convolution              inception_4c/3x3_reduce  1 1 inception_4b/output_splitncnn_2 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce 0=128 1=1 5=1 6=65536 8=102 9=1
Convolution              inception_4c/3x3         1 1 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce inception_4c/3x3_inception_4c/relu_3x3 0=256 1=3 4=1 5=1 6=294912 8=2 9=1
Convolution              inception_4c/5x5_reduce  1 1 inception_4b/output_splitncnn_1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce 0=24 1=1 5=1 6=12288 8=102 9=1
Convolution              inception_4c/5x5         1 1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce inception_4c/5x5_inception_4c/relu_5x5 0=64 1=5 4=2 5=1 6=38400 8=2 9=1
Pooling                  inception_4c/pool        1 1 inception_4b/output_splitncnn_0 inception_4c/pool 1=3 3=1
Convolution              inception_4c/pool_proj   1 1 inception_4c/pool inception_4c/pool_proj_inception_4c/relu_pool_proj 0=64 1=1 5=1 6=32768 8=2 9=1
Concat                   inception_4c/output      4 1 inception_4c/1x1_inception_4c/relu_1x1 inception_4c/3x3_inception_4c/relu_3x3 inception_4c/5x5_inception_4c/relu_5x5 inception_4c/pool_proj_inception_4c/relu_pool_proj inception_4c/output
Split                    splitncnn_5              1 4 inception_4c/output inception_4c/output_splitncnn_0 inception_4c/output_splitncnn_1 inception_4c/output_splitncnn_2 inception_4c/output_splitncnn_3
Convolution              inception_4d/1x1         1 1 inception_4c/output_splitncnn_3 inception_4d/1x1_inception_4d/relu_1x1 0=112 1=1 5=1 6=57344 8=2 9=1
Convolution              inception_4d/3x3_reduce  1 1 inception_4c/output_splitncnn_2 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce 0=144 1=1 5=1 6=73728 8=102 9=1
Convolution              inception_4d/3x3         1 1 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce inception_4d/3x3_inception_4d/relu_3x3 0=288 1=3 4=1 5=1 6=373248 8=2 9=1
Convolution              inception_4d/5x5_reduce  1 1 inception_4c/output_splitncnn_1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce 0=32 1=1 5=1 6=16384 8=102 9=1
Convolution              inception_4d/5x5         1 1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce inception_4d/5x5_inception_4d/relu_5x5 0=64 1=5 4=2 5=1 6=51200 8=2 9=1
Pooling                  inception_4d/pool        1 1 inception_4c/output_splitncnn_0 inception_4d/pool 1=3 3=1
Convolution              inception_4d/pool_proj   1 1 inception_4d/pool inception_4d/pool_proj_inception_4d/relu_pool_proj 0=64 1=1 5=1 6=32768 8=2 9=1
Concat                   inception_4d/output      4 1 inception_4d/1x1_inception_4d/relu_1x1 inception_4d/3x3_inception_4d/relu_3x3 inception_4d/5x5_inception_4d/relu_5x5 inception_4d/pool_proj_inception_4d/relu_pool_proj inception_4d/output
Split                    splitncnn_6              1 4 inception_4d/output inception_4d/output_splitncnn_0 inception_4d/output_splitncnn_1 inception_4d/output_splitncnn_2 inception_4d/output_splitncnn_3
Convolution              inception_4e/1x1         1 1 inception_4d/output_splitncnn_3 inception_4e/1x1_inception_4e/relu_1x1 0=256 1=1 5=1 6=135168 8=2 9=1
Convolution              inception_4e/3x3_reduce  1 1 inception_4d/output_splitncnn_2 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce 0=160 1=1 5=1 6=84480 8=102 9=1
Convolution              inception_4e/3x3         1 1 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce inception_4e/3x3_inception_4e/relu_3x3 0=320 1=3 4=1 5=1 6=460800 8=2 9=1
Convolution              inception_4e/5x5_reduce  1 1 inception_4d/output_splitncnn_1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce 0=32 1=1 5=1 6=16896 8=102 9=1
Convolution              inception_4e/5x5         1 1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce inception_4e/5x5_inception_4e/relu_5x5 0=128 1=5 4=2 5=1 6=102400 8=2 9=1
Pooling                  inception_4e/pool        1 1 inception_4d/output_splitncnn_0 inception_4e/pool 1=3 3=1
Convolution              inception_4e/pool_proj   1 1 inception_4e/pool inception_4e/pool_proj_inception_4e/relu_pool_proj 0=128 1=1 5=1 6=67584 8=2 9=1
Concat                   inception_4e/output      4 1 inception_4e/1x1_inception_4e/relu_1x1 inception_4e/3x3_inception_4e/relu_3x3 inception_4e/5x5_inception_4e/relu_5x5 inception_4e/pool_proj_inception_4e/relu_pool_proj inception_4e/output
Pooling                  pool4/3x3_s2             1 1 inception_4e/output pool4/3x3_s2 1=3 2=2
Split                    splitncnn_7              1 4 pool4/3x3_s2 pool4/3x3_s2_splitncnn_0 pool4/3x3_s2_splitncnn_1 pool4/3x3_s2_splitncnn_2 pool4/3x3_s2_splitncnn_3
Convolution              inception_5a/1x1         1 1 pool4/3x3_s2_splitncnn_3 inception_5a/1x1_inception_5a/relu_1x1 0=256 1=1 5=1 6=212992 8=2 9=1
Convolution              inception_5a/3x3_reduce  1 1 pool4/3x3_s2_splitncnn_2 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce 0=160 1=1 5=1 6=133120 8=102 9=1
Convolution              inception_5a/3x3         1 1 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce inception_5a/3x3_inception_5a/relu_3x3 0=320 1=3 4=1 5=1 6=460800 8=2 9=1
Convolution              inception_5a/5x5_reduce  1 1 pool4/3x3_s2_splitncnn_1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce 0=32 1=1 5=1 6=26624 8=102 9=1
Convolution              inception_5a/5x5         1 1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce inception_5a/5x5_inception_5a/relu_5x5 0=128 1=5 4=2 5=1 6=102400 8=2 9=1
Pooling                  inception_5a/pool        1 1 pool4/3x3_s2_splitncnn_0 inception_5a/pool 1=3 3=1
Convolution              inception_5a/pool_proj   1 1 inception_5a/pool inception_5a/pool_proj_inception_5a/relu_pool_proj 0=128 1=1 5=1 6=106496 8=2 9=1
Concat                   inception_5a/output      4 1 inception_5a/1x1_inception_5a/relu_1x1 inception_5a/3x3_inception_5a/relu_3x3 inception_5a/5x5_inception_5a/relu_5x5 inception_5a/pool_proj_inception_5a/relu_pool_proj inception_5a/output
Split                    splitncnn_8              1 4 inception_5a/output inception_5a/output_splitncnn_0 inception_5a/output_splitncnn_1 inception_5a/output_splitncnn_2 inception_5a/output_splitncnn_3
Convolution              inception_5b/1x1         1 1 inception_5a/output_splitncnn_3 inception_5b/1x1_inception_5b/relu_1x1 0=384 1=1 5=1 6=319488 8=2 9=1
Convolution              inception_5b/3x3_reduce  1 1 inception_5a/output_splitncnn_2 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce 0=192 1=1 5=1 6=159744 8=102 9=1
Convolution              inception_5b/3x3         1 1 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce inception_5b/3x3_inception_5b/relu_3x3 0=384 1=3 4=1 5=1 6=663552 8=2 9=1
Convolution              inception_5b/5x5_reduce  1 1 inception_5a/output_splitncnn_1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce 0=48 1=1 5=1 6=39936 8=102 9=1
Convolution              inception_5b/5x5         1 1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce inception_5b/5x5_inception_5b/relu_5x5 0=128 1=5 4=2 5=1 6=153600 8=2 9=1
Pooling                  inception_5b/pool        1 1 inception_5a/output_splitncnn_0 inception_5b/pool 1=3 3=1
Convolution              inception_5b/pool_proj   1 1 inception_5b/pool inception_5b/pool_proj_inception_5b/relu_pool_proj 0=128 1=1 5=1 6=106496 8=2 9=1
Concat                   inception_5b/output      4 1 inception_5b/1x1_inception_5b/relu_1x1 inception_5b/3x3_inception_5b/relu_3x3 inception_5b/5x5_inception_5b/relu_5x5 inception_5b/pool_proj_inception_5b/relu_pool_proj inception_5b/output
Pooling                  pool5/7x7_s1             1 1 inception_5b/output pool5/7x7_s1_pool5/drop_7x7_s1 0=1 1=7
InnerProduct             loss3/classifier         1 1 pool5/7x7_s1_pool5/drop_7x7_s1 loss3/classifier 0=1000 1=1 2=1024000
Softmax                  prob                     1 1 loss3/classifier output


================================================
FILE: benchmark/mnasnet.param
================================================
7767517
76 86
Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
Convolution              first-3x3-conv           1 1 data first-3x3-conv_relu -23330=4,3,112,112,32 0=32 1=3 3=2 4=1 5=1 6=864 9=1
ConvolutionDepthWise     A0_dw                    1 1 first-3x3-conv_relu A0_dw_relu -23330=4,3,112,112,32 0=32 1=3 4=1 5=1 6=288 7=32 9=1
Convolution              A0_linear                1 1 A0_dw_relu A0_linear_bn -23330=4,3,112,112,16 0=16 1=1 5=1 6=512
Convolution              B0_expand                1 1 A0_linear_bn B0_expand_relu -23330=4,3,112,112,48 0=48 1=1 5=1 6=768 9=1
ConvolutionDepthWise     B0_dw                    1 1 B0_expand_relu B0_dw_relu -23330=4,3,56,56,48 0=48 1=3 3=2 4=1 5=1 6=432 7=48 9=1
Convolution              B0_linear                1 1 B0_dw_relu B0_linear_bn -23330=4,3,56,56,24 0=24 1=1 5=1 6=1152
Split                    splitncnn_0              1 2 B0_linear_bn B0_linear_bn_splitncnn_0 B0_linear_bn_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24
Convolution              B1_expand                1 1 B0_linear_bn_splitncnn_1 B1_expand_relu -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1
ConvolutionDepthWise     B1_dw                    1 1 B1_expand_relu B1_dw_relu -23330=4,3,56,56,72 0=72 1=3 4=1 5=1 6=648 7=72 9=1
Convolution              B1_linear                1 1 B1_dw_relu B1_linear_bn -23330=4,3,56,56,24 0=24 1=1 5=1 6=1728
BinaryOp                 unknownncnn_0            2 1 B0_linear_bn_splitncnn_0 B1_linear_bn unknownncnn_0 -23330=4,3,56,56,24
Split                    splitncnn_1              1 2 unknownncnn_0 unknownncnn_0_splitncnn_0 unknownncnn_0_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24
Convolution              B2_expand                1 1 unknownncnn_0_splitncnn_1 B2_expand_relu -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1
ConvolutionDepthWise     B2_dw                    1 1 B2_expand_relu B2_dw_relu -23330=4,3,56,56,72 0=72 1=3 4=1 5=1 6=648 7=72 9=1
Convolution              B2_linear                1 1 B2_dw_relu B2_linear_bn -23330=4,3,56,56,24 0=24 1=1 5=1 6=1728
BinaryOp                 unknownncnn_1            2 1 unknownncnn_0_splitncnn_0 B2_linear_bn unknownncnn_1 -23330=4,3,56,56,24
Convolution              C0_expand                1 1 unknownncnn_1 C0_expand_relu -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1
ConvolutionDepthWise     C0_dw                    1 1 C0_expand_relu C0_dw_relu -23330=4,3,28,28,72 0=72 1=5 3=2 4=2 5=1 6=1800 7=72 9=1
Convolution              C0_linear                1 1 C0_dw_relu C0_linear_bn -23330=4,3,28,28,40 0=40 1=1 5=1 6=2880
Split                    splitncnn_2              1 2 C0_linear_bn C0_linear_bn_splitncnn_0 C0_linear_bn_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
Convolution              C1_expand                1 1 C0_linear_bn_splitncnn_1 C1_expand_relu -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
ConvolutionDepthWise     C1_dw                    1 1 C1_expand_relu C1_dw_relu -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120 9=1
Convolution              C1_linear                1 1 C1_dw_relu C1_linear_bn -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
BinaryOp                 unknownncnn_2            2 1 C0_linear_bn_splitncnn_0 C1_linear_bn unknownncnn_2 -23330=4,3,28,28,40
Split                    splitncnn_3              1 2 unknownncnn_2 unknownncnn_2_splitncnn_0 unknownncnn_2_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
Convolution              C2_expand                1 1 unknownncnn_2_splitncnn_1 C2_expand_relu -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
ConvolutionDepthWise     C2_dw                    1 1 C2_expand_relu C2_dw_relu -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120 9=1
Convolution              C2_linear                1 1 C2_dw_relu C2_linear_bn -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
BinaryOp                 unknownncnn_3            2 1 unknownncnn_2_splitncnn_0 C2_linear_bn unknownncnn_3 -23330=4,3,28,28,40
Convolution              D0_expand                1 1 unknownncnn_3 D0_expand_relu -23330=4,3,28,28,240 0=240 1=1 5=1 6=9600 9=1
ConvolutionDepthWise     D0_dw                    1 1 D0_expand_relu D0_dw_relu -23330=4,3,14,14,240 0=240 1=5 3=2 4=2 5=1 6=6000 7=240 9=1
Convolution              D0_linear                1 1 D0_dw_relu D0_linear_bn -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200
Split                    splitncnn_4              1 2 D0_linear_bn D0_linear_bn_splitncnn_0 D0_linear_bn_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
Convolution              D1_expand                1 1 D0_linear_bn_splitncnn_1 D1_expand_relu -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400 9=1
ConvolutionDepthWise     D1_dw                    1 1 D1_expand_relu D1_dw_relu -23330=4,3,14,14,480 0=480 1=5 4=2 5=1 6=12000 7=480 9=1
Convolution              D1_linear                1 1 D1_dw_relu D1_linear_bn -23330=4,3,14,14,80 0=80 1=1 5=1 6=38400
BinaryOp                 unknownncnn_4            2 1 D0_linear_bn_splitncnn_0 D1_linear_bn unknownncnn_4 -23330=4,3,14,14,80
Split                    splitncnn_5              1 2 unknownncnn_4 unknownncnn_4_splitncnn_0 unknownncnn_4_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
Convolution              D2_expand                1 1 unknownncnn_4_splitncnn_1 D2_expand_relu -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400 9=1
ConvolutionDepthWise     D2_dw                    1 1 D2_expand_relu D2_dw_relu -23330=4,3,14,14,480 0=480 1=5 4=2 5=1 6=12000 7=480 9=1
Convolution              D2_linear                1 1 D2_dw_relu D2_linear_bn -23330=4,3,14,14,80 0=80 1=1 5=1 6=38400
BinaryOp                 unknownncnn_5            2 1 unknownncnn_4_splitncnn_0 D2_linear_bn unknownncnn_5 -23330=4,3,14,14,80
Convolution              E0_expand                1 1 unknownncnn_5 E0_expand_relu -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400 9=1
ConvolutionDepthWise     E0_dw                    1 1 E0_expand_relu E0_dw_relu -23330=4,3,14,14,480 0=480 1=3 4=1 5=1 6=4320 7=480 9=1
Convolution              E0_linear                1 1 E0_dw_relu E0_linear_bn -23330=4,3,14,14,96 0=96 1=1 5=1 6=46080
Split                    splitncnn_6              1 2 E0_linear_bn E0_linear_bn_splitncnn_0 E0_linear_bn_splitncnn_1 -23330=8,3,14,14,96,3,14,14,96
Convolution              E1_expand                1 1 E0_linear_bn_splitncnn_1 E1_expand_relu -23330=4,3,14,14,576 0=576 1=1 5=1 6=55296 9=1
ConvolutionDepthWise     E1_dw                    1 1 E1_expand_relu E1_dw_relu -23330=4,3,14,14,576 0=576 1=3 4=1 5=1 6=5184 7=576 9=1
Convolution              E1_linear                1 1 E1_dw_relu E1_linear_bn -23330=4,3,14,14,96 0=96 1=1 5=1 6=55296
BinaryOp                 unknownncnn_6            2 1 E0_linear_bn_splitncnn_0 E1_linear_bn unknownncnn_6 -23330=4,3,14,14,96
Convolution              F0_expand                1 1 unknownncnn_6 F0_expand_relu -23330=4,3,14,14,576 0=576 1=1 5=1 6=55296 9=1
ConvolutionDepthWise     F0_dw                    1 1 F0_expand_relu F0_dw_relu -23330=4,3,7,7,576 0=576 1=5 3=2 4=2 5=1 6=14400 7=576 9=1
Convolution              F0_linear                1 1 F0_dw_relu F0_linear_bn -23330=4,3,7,7,192 0=192 1=1 5=1 6=110592
Split                    splitncnn_7              1 2 F0_linear_bn F0_linear_bn_splitncnn_0 F0_linear_bn_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
Convolution              F1_expand                1 1 F0_linear_bn_splitncnn_1 F1_expand_relu -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184 9=1
ConvolutionDepthWise     F1_dw                    1 1 F1_expand_relu F1_dw_relu -23330=4,3,7,7,1152 0=1152 1=5 4=2 5=1 6=28800 7=1152 9=1
Convolution              F1_linear                1 1 F1_dw_relu F1_linear_bn -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184
BinaryOp                 unknownncnn_7            2 1 F0_linear_bn_splitncnn_0 F1_linear_bn unknownncnn_7 -23330=4,3,7,7,192
Split                    splitncnn_8              1 2 unknownncnn_7 unknownncnn_7_splitncnn_0 unknownncnn_7_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
Convolution              F2_expand                1 1 unknownncnn_7_splitncnn_1 F2_expand_relu -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184 9=1
ConvolutionDepthWise     F2_dw                    1 1 F2_expand_relu F2_dw_relu -23330=4,3,7,7,1152 0=1152 1=5 4=2 5=1 6=28800 7=1152 9=1
Convolution              F2_linear                1 1 F2_dw_relu F2_linear_bn -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184
BinaryOp                 unknownncnn_8            2 1 unknownncnn_7_splitncnn_0 F2_linear_bn unknownncnn_8 -23330=4,3,7,7,192
Split                    splitncnn_9              1 2 unknownncnn_8 unknownncnn_8_splitncnn_0 unknownncnn_8_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
Convolution              F3_expand                1 1 unknownncnn_8_splitncnn_1 F3_expand_relu -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184 9=1
ConvolutionDepthWise     F3_dw                    1 1 F3_expand_relu F3_dw_relu -23330=4,3,7,7,1152 0=1152 1=5 4=2 5=1 6=28800 7=1152 9=1
Convolution              F3_linear                1 1 F3_dw_relu F3_linear_bn -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184
BinaryOp                 unknownncnn_9            2 1 unknownncnn_8_splitncnn_0 F3_linear_bn unknownncnn_9 -23330=4,3,7,7,192
Convolution              G0_expand                1 1 unknownncnn_9 G0_expand_relu -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184 9=1
ConvolutionDepthWise     G0_dw                    1 1 G0_expand_relu G0_dw_relu -23330=4,3,7,7,1152 0=1152 1=3 4=1 5=1 6=10368 7=1152 9=1
Convolution              G0_linear                1 1 G0_dw_relu G0_linear_bn -23330=4,3,7,7,320 0=320 1=1 5=1 6=368640
Convolution              last-1x1-conv            1 1 G0_linear_bn last-1x1-conv_relu -23330=4,3,7,7,1280 0=1280 1=1 5=1 6=409600 9=1
Pooling                  avgpool                  1 1 last-1x1-conv_relu flatten -23330=4,1,1280,1,1 0=1 1=7 4=1 5=1
InnerProduct             fc                       1 1 flatten fc -23330=4,1,1000,1,1 0=1000 1=1 2=1280000
Softmax                  prob                     1 1 fc output -23330=4,1,1000,1,1


================================================
FILE: benchmark/mobilenet.param
================================================
7767517
31 31
Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
Convolution              conv1                    1 1 data conv1_relu1 -23330=4,3,112,112,32 0=32 1=3 3=2 4=1 5=1 6=864 9=1
ConvolutionDepthWise     conv2_1/dw               1 1 conv1_relu1 conv2_1/dw_relu2_1/dw -23330=4,3,112,112,32 0=32 1=3 4=1 5=1 6=288 7=32 9=1
Convolution              conv2_1/sep              1 1 conv2_1/dw_relu2_1/dw conv2_1/sep_relu2_1/sep -23330=4,3,112,112,64 0=64 1=1 5=1 6=2048 9=1
ConvolutionDepthWise     conv2_2/dw               1 1 conv2_1/sep_relu2_1/sep conv2_2/dw_relu2_2/dw -23330=4,3,56,56,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 9=1
Convolution              conv2_2/sep              1 1 conv2_2/dw_relu2_2/dw conv2_2/sep_relu2_2/sep -23330=4,3,56,56,128 0=128 1=1 5=1 6=8192 9=1
ConvolutionDepthWise     conv3_1/dw               1 1 conv2_2/sep_relu2_2/sep conv3_1/dw_relu3_1/dw -23330=4,3,56,56,128 0=128 1=3 4=1 5=1 6=1152 7=128 9=1
Convolution              conv3_1/sep              1 1 conv3_1/dw_relu3_1/dw conv3_1/sep_relu3_1/sep -23330=4,3,56,56,128 0=128 1=1 5=1 6=16384 9=1
ConvolutionDepthWise     conv3_2/dw               1 1 conv3_1/sep_relu3_1/sep conv3_2/dw_relu3_2/dw -23330=4,3,28,28,128 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 9=1
Convolution              conv3_2/sep              1 1 conv3_2/dw_relu3_2/dw conv3_2/sep_relu3_2/sep -23330=4,3,28,28,256 0=256 1=1 5=1 6=32768 9=1
ConvolutionDepthWise     conv4_1/dw               1 1 conv3_2/sep_relu3_2/sep conv4_1/dw_relu4_1/dw -23330=4,3,28,28,256 0=256 1=3 4=1 5=1 6=2304 7=256 9=1
Convolution              conv4_1/sep              1 1 conv4_1/dw_relu4_1/dw conv4_1/sep_relu4_1/sep -23330=4,3,28,28,256 0=256 1=1 5=1 6=65536 9=1
ConvolutionDepthWise     conv4_2/dw               1 1 conv4_1/sep_relu4_1/sep conv4_2/dw_relu4_2/dw -23330=4,3,14,14,256 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 9=1
Convolution              conv4_2/sep              1 1 conv4_2/dw_relu4_2/dw conv4_2/sep_relu4_2/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=131072 9=1
ConvolutionDepthWise     conv5_1/dw               1 1 conv4_2/sep_relu4_2/sep conv5_1/dw_relu5_1/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1
Convolution              conv5_1/sep              1 1 conv5_1/dw_relu5_1/dw conv5_1/sep_relu5_1/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 9=1
ConvolutionDepthWise     conv5_2/dw               1 1 conv5_1/sep_relu5_1/sep conv5_2/dw_relu5_2/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1
Convolution              conv5_2/sep              1 1 conv5_2/dw_relu5_2/dw conv5_2/sep_relu5_2/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 9=1
ConvolutionDepthWise     conv5_3/dw               1 1 conv5_2/sep_relu5_2/sep conv5_3/dw_relu5_3/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1
Convolution              conv5_3/sep              1 1 conv5_3/dw_relu5_3/dw conv5_3/sep_relu5_3/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 9=1
ConvolutionDepthWise     conv5_4/dw               1 1 conv5_3/sep_relu5_3/sep conv5_4/dw_relu5_4/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1
Convolution              conv5_4/sep              1 1 conv5_4/dw_relu5_4/dw conv5_4/sep_relu5_4/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 9=1
ConvolutionDepthWise     conv5_5/dw               1 1 conv5_4/sep_relu5_4/sep conv5_5/dw_relu5_5/dw -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1
Convolution              conv5_5/sep              1 1 conv5_5/dw_relu5_5/dw conv5_5/sep_relu5_5/sep -23330=4,3,14,14,512 0=512 1=1 5=1 6=262144 9=1
ConvolutionDepthWise     conv5_6/dw               1 1 conv5_5/sep_relu5_5/sep conv5_6/dw_relu5_6/dw -23330=4,3,7,7,512 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 9=1
Convolution              conv5_6/sep              1 1 conv5_6/dw_relu5_6/dw conv5_6/sep_relu5_6/sep -23330=4,3,7,7,1024 0=1024 1=1 5=1 6=524288 9=1
ConvolutionDepthWise     conv6/dw                 1 1 conv5_6/sep_relu5_6/sep conv6/dw_relu6/dw -23330=4,3,7,7,1024 0=1024 1=3 4=1 5=1 6=9216 7=1024 9=1
Convolution              conv6/sep                1 1 conv6/dw_relu6/dw conv6/sep_relu6/sep -23330=4,3,7,7,1024 0=1024 1=1 5=1 6=1048576 9=1
Pooling                  pool6                    1 1 conv6/sep_relu6/sep pool6 -23330=4,1,1024,1,1 0=1 4=1
InnerProduct             fc7                      1 1 pool6 fc7 -23330=4,1,1000,1,1 0=1000 1=1 2=1024000
Softmax                  prob                     1 1 fc7 output -23330=4,1,1000,1,1


================================================
FILE: benchmark/mobilenet_int8.param
================================================
7767517
31 31
Input                    data                     0 1 data 0=224 1=224 2=3
Convolution              conv1                    1 1 data conv1_relu1 0=32 1=3 3=2 4=1 5=1 6=864 8=102 9=1
ConvolutionDepthWise     conv2_1/dw               1 1 conv1_relu1 conv2_1/dw_relu2_1/dw 0=32 1=3 4=1 5=1 6=288 7=32 8=101 9=1
Convolution              conv2_1/sep              1 1 conv2_1/dw_relu2_1/dw conv2_1/sep_relu2_1/sep 0=64 1=1 5=1 6=2048 8=102 9=1
ConvolutionDepthWise     conv2_2/dw               1 1 conv2_1/sep_relu2_1/sep conv2_2/dw_relu2_2/dw 0=64 1=3 3=2 4=1 5=1 6=576 7=64 8=101 9=1
Convolution              conv2_2/sep              1 1 conv2_2/dw_relu2_2/dw conv2_2/sep_relu2_2/sep 0=128 1=1 5=1 6=8192 8=102 9=1
ConvolutionDepthWise     conv3_1/dw               1 1 conv2_2/sep_relu2_2/sep conv3_1/dw_relu3_1/dw 0=128 1=3 4=1 5=1 6=1152 7=128 8=101 9=1
Convolution              conv3_1/sep              1 1 conv3_1/dw_relu3_1/dw conv3_1/sep_relu3_1/sep 0=128 1=1 5=1 6=16384 8=102 9=1
ConvolutionDepthWise     conv3_2/dw               1 1 conv3_1/sep_relu3_1/sep conv3_2/dw_relu3_2/dw 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 8=101 9=1
Convolution              conv3_2/sep              1 1 conv3_2/dw_relu3_2/dw conv3_2/sep_relu3_2/sep 0=256 1=1 5=1 6=32768 8=102 9=1
ConvolutionDepthWise     conv4_1/dw               1 1 conv3_2/sep_relu3_2/sep conv4_1/dw_relu4_1/dw 0=256 1=3 4=1 5=1 6=2304 7=256 8=101 9=1
Convolution              conv4_1/sep              1 1 conv4_1/dw_relu4_1/dw conv4_1/sep_relu4_1/sep 0=256 1=1 5=1 6=65536 8=102 9=1
ConvolutionDepthWise     conv4_2/dw               1 1 conv4_1/sep_relu4_1/sep conv4_2/dw_relu4_2/dw 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 8=101 9=1
Convolution              conv4_2/sep              1 1 conv4_2/dw_relu4_2/dw conv4_2/sep_relu4_2/sep 0=512 1=1 5=1 6=131072 8=102 9=1
ConvolutionDepthWise     conv5_1/dw               1 1 conv4_2/sep_relu4_2/sep conv5_1/dw_relu5_1/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
Convolution              conv5_1/sep              1 1 conv5_1/dw_relu5_1/dw conv5_1/sep_relu5_1/sep 0=512 1=1 5=1 6=262144 8=102 9=1
ConvolutionDepthWise     conv5_2/dw               1 1 conv5_1/sep_relu5_1/sep conv5_2/dw_relu5_2/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
Convolution              conv5_2/sep              1 1 conv5_2/dw_relu5_2/dw conv5_2/sep_relu5_2/sep 0=512 1=1 5=1 6=262144 8=102 9=1
ConvolutionDepthWise     conv5_3/dw               1 1 conv5_2/sep_relu5_2/sep conv5_3/dw_relu5_3/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
Convolution              conv5_3/sep              1 1 conv5_3/dw_relu5_3/dw conv5_3/sep_relu5_3/sep 0=512 1=1 5=1 6=262144 8=102 9=1
ConvolutionDepthWise     conv5_4/dw               1 1 conv5_3/sep_relu5_3/sep conv5_4/dw_relu5_4/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
Convolution              conv5_4/sep              1 1 conv5_4/dw_relu5_4/dw conv5_4/sep_relu5_4/sep 0=512 1=1 5=1 6=262144 8=102 9=1
ConvolutionDepthWise     conv5_5/dw               1 1 conv5_4/sep_relu5_4/sep conv5_5/dw_relu5_5/dw 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
Convolution              conv5_5/sep              1 1 conv5_5/dw_relu5_5/dw conv5_5/sep_relu5_5/sep 0=512 1=1 5=1 6=262144 8=102 9=1
ConvolutionDepthWise     conv5_6/dw               1 1 conv5_5/sep_relu5_5/sep conv5_6/dw_relu5_6/dw 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 8=101 9=1
Convolution              conv5_6/sep              1 1 conv5_6/dw_relu5_6/dw conv5_6/sep_relu5_6/sep 0=1024 1=1 5=1 6=524288 8=102 9=1
ConvolutionDepthWise     conv6/dw                 1 1 conv5_6/sep_relu5_6/sep conv6/dw_relu6/dw 0=1024 1=3 4=1 5=1 6=9216 7=1024 8=101 9=1
Convolution              conv6/sep                1 1 conv6/dw_relu6/dw conv6/sep_relu6/sep 0=1024 1=1 5=1 6=1048576 8=2 9=1
Pooling                  pool6                    1 1 conv6/sep_relu6/sep pool6 0=1 4=1
InnerProduct             fc7                      1 1 pool6 fc7 0=1000 1=1 2=1024000 8=2
Softmax                  prob                     1 1 fc7 output


================================================
FILE: benchmark/mobilenet_ssd.param
================================================
7767517
92 115
Input                    input                    0 1 data -23330=4,3,300,300,3 0=300 1=300 2=3
Split                    splitncnn_0              1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6 -23330=28,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3
Convolution              conv0                    1 1 data_splitncnn_6 conv0_conv0/relu -23330=4,3,150,150,32 0=32 1=3 3=2 4=1 5=1 6=864 9=1
ConvolutionDepthWise     conv1/dw                 1 1 conv0_conv0/relu conv1/dw_conv1/dw/relu -23330=4,3,150,150,32 0=32 1=3 4=1 5=1 6=288 7=32 9=1
Convolution              conv1                    1 1 conv1/dw_conv1/dw/relu conv1_conv1/relu -23330=4,3,150,150,64 0=64 1=1 5=1 6=2048 9=1
ConvolutionDepthWise     conv2/dw                 1 1 conv1_conv1/relu conv2/dw_conv2/dw/relu -23330=4,3,75,75,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 9=1
Convolution              conv2                    1 1 conv2/dw_conv2/dw/relu conv2_conv2/relu -23330=4,3,75,75,128 0=128 1=1 5=1 6=8192 9=1
ConvolutionDepthWise     conv3/dw                 1 1 conv2_conv2/relu conv3/dw_conv3/dw/relu -23330=4,3,75,75,128 0=128 1=3 4=1 5=1 6=1152 7=128 9=1
Convolution              conv3                    1 1 conv3/dw_conv3/dw/relu conv3_conv3/relu -23330=4,3,75,75,128 0=128 1=1 5=1 6=16384 9=1
ConvolutionDepthWise     conv4/dw                 1 1 conv3_conv3/relu conv4/dw_conv4/dw/relu -23330=4,3,38,38,128 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 9=1
Convolution              conv4                    1 1 conv4/dw_conv4/dw/relu conv4_conv4/relu -23330=4,3,38,38,256 0=256 1=1 5=1 6=32768 9=1
ConvolutionDepthWise     conv5/dw                 1 1 conv4_conv4/relu conv5/dw_conv5/dw/relu -23330=4,3,38,38,256 0=256 1=3 4=1 5=1 6=2304 7=256 9=1
Convolution              conv5                    1 1 conv5/dw_conv5/dw/relu conv5_conv5/relu -23330=4,3,38,38,256 0=256 1=1 5=1 6=65536 9=1
ConvolutionDepthWise     conv6/dw                 1 1 conv5_conv5/relu conv6/dw_conv6/dw/relu -23330=4,3,19,19,256 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 9=1
Convolution              conv6                    1 1 conv6/dw_conv6/dw/relu conv6_conv6/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=131072 9=1
ConvolutionDepthWise     conv7/dw                 1 1 conv6_conv6/relu conv7/dw_conv7/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1
Convolution              conv7                    1 1 conv7/dw_conv7/dw/relu conv7_conv7/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 9=1
ConvolutionDepthWise     conv8/dw                 1 1 conv7_conv7/relu conv8/dw_conv8/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1
Convolution              conv8                    1 1 conv8/dw_conv8/dw/relu conv8_conv8/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 9=1
ConvolutionDepthWise     conv9/dw                 1 1 conv8_conv8/relu conv9/dw_conv9/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1
Convolution              conv9                    1 1 conv9/dw_conv9/dw/relu conv9_conv9/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 9=1
ConvolutionDepthWise     conv10/dw                1 1 conv9_conv9/relu conv10/dw_conv10/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1
Convolution              conv10                   1 1 conv10/dw_conv10/dw/relu conv10_conv10/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 9=1
ConvolutionDepthWise     conv11/dw                1 1 conv10_conv10/relu conv11/dw_conv11/dw/relu -23330=4,3,19,19,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1
Convolution              conv11                   1 1 conv11/dw_conv11/dw/relu conv11_conv11/relu -23330=4,3,19,19,512 0=512 1=1 5=1 6=262144 9=1
Split                    splitncnn_1              1 4 conv11_conv11/relu conv11_conv11/relu_splitncnn_0 conv11_conv11/relu_splitncnn_1 conv11_conv11/relu_splitncnn_2 conv11_conv11/relu_splitncnn_3 -23330=16,3,19,19,512,3,19,19,512,3,19,19,512,3,19,19,512
ConvolutionDepthWise     conv12/dw                1 1 conv11_conv11/relu_splitncnn_3 conv12/dw_conv12/dw/relu -23330=4,3,10,10,512 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 9=1
Convolution              conv12                   1 1 conv12/dw_conv12/dw/relu conv12_conv12/relu -23330=4,3,10,10,1024 0=1024 1=1 5=1 6=524288 9=1
ConvolutionDepthWise     conv13/dw                1 1 conv12_conv12/relu conv13/dw_conv13/dw/relu -23330=4,3,10,10,1024 0=1024 1=3 4=1 5=1 6=9216 7=1024 9=1
Convolution              conv13                   1 1 conv13/dw_conv13/dw/relu conv13_conv13/relu -23330=4,3,10,10,1024 0=1024 1=1 5=1 6=1048576 9=1
Split                    splitncnn_2              1 4 conv13_conv13/relu conv13_conv13/relu_splitncnn_0 conv13_conv13/relu_splitncnn_1 conv13_conv13/relu_splitncnn_2 conv13_conv13/relu_splitncnn_3 -23330=16,3,10,10,1024,3,10,10,1024,3,10,10,1024,3,10,10,1024
Convolution              conv14_1                 1 1 conv13_conv13/relu_splitncnn_3 conv14_1_conv14_1/relu -23330=4,3,10,10,256 0=256 1=1 5=1 6=262144 9=1
Convolution              conv14_2                 1 1 conv14_1_conv14_1/relu conv14_2_conv14_2/relu -23330=4,3,5,5,512 0=512 1=3 3=2 4=1 5=1 6=1179648 9=1
Split                    splitncnn_3              1 4 conv14_2_conv14_2/relu conv14_2_conv14_2/relu_splitncnn_0 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_conv14_2/relu_splitncnn_3 -23330=16,3,5,5,512,3,5,5,512,3,5,5,512,3,5,5,512
Convolution              conv15_1                 1 1 conv14_2_conv14_2/relu_splitncnn_3 conv15_1_conv15_1/relu -23330=4,3,5,5,128 0=128 1=1 5=1 6=65536 9=1
Convolution              conv15_2                 1 1 conv15_1_conv15_1/relu conv15_2_conv15_2/relu -23330=4,3,3,3,256 0=256 1=3 3=2 4=1 5=1 6=294912 9=1
Split                    splitncnn_4              1 4 conv15_2_conv15_2/relu conv15_2_conv15_2/relu_splitncnn_0 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_conv15_2/relu_splitncnn_3 -23330=16,3,3,3,256,3,3,3,256,3,3,3,256,3,3,3,256
Convolution              conv16_1                 1 1 conv15_2_conv15_2/relu_splitncnn_3 conv16_1_conv16_1/relu -23330=4,3,3,3,128 0=128 1=1 5=1 6=32768 9=1
Convolution              conv16_2                 1 1 conv16_1_conv16_1/relu conv16_2_conv16_2/relu -23330=4,3,2,2,256 0=256 1=3 3=2 4=1 5=1 6=294912 9=1
Split                    splitncnn_5              1 4 conv16_2_conv16_2/relu conv16_2_conv16_2/relu_splitncnn_0 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_conv16_2/relu_splitncnn_3 -23330=16,3,2,2,256,3,2,2,256,3,2,2,256,3,2,2,256
Convolution              conv17_1                 1 1 conv16_2_conv16_2/relu_splitncnn_3 conv17_1_conv17_1/relu -23330=4,3,2,2,64 0=64 1=1 5=1 6=16384 9=1
Convolution              conv17_2                 1 1 conv17_1_conv17_1/relu conv17_2_conv17_2/relu -23330=4,3,1,1,128 0=128 1=3 3=2 4=1 5=1 6=73728 9=1
Split                    splitncnn_6              1 3 conv17_2_conv17_2/relu conv17_2_conv17_2/relu_splitncnn_0 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_conv17_2/relu_splitncnn_2 -23330=12,3,1,1,128,3,1,1,128,3,1,1,128
Convolution              conv11_mbox_loc          1 1 conv11_conv11/relu_splitncnn_2 conv11_mbox_loc -23330=4,3,19,19,12 0=12 1=1 5=1 6=6144
Permute                  conv11_mbox_loc_perm     1 1 conv11_mbox_loc conv11_mbox_loc_perm -23330=4,3,12,19,19 0=3
Flatten                  conv11_mbox_loc_flat     1 1 conv11_mbox_loc_perm conv11_mbox_loc_flat -23330=4,1,4332,1,1
Convolution              conv11_mbox_conf         1 1 conv11_conv11/relu_splitncnn_1 conv11_mbox_conf -23330=4,3,19,19,63 0=63 1=1 5=1 6=32256
Permute                  conv11_mbox_conf_perm    1 1 conv11_mbox_conf conv11_mbox_conf_perm -23330=4,3,63,19,19 0=3
Flatten                  conv11_mbox_conf_flat    1 1 conv11_mbox_conf_perm conv11_mbox_conf_flat -23330=4,1,22743,1,1
PriorBox                 conv11_mbox_priorbox     2 1 conv11_conv11/relu_splitncnn_0 data_splitncnn_5 conv11_mbox_priorbox -23330=4,2,4332,2,1 -23300=1,6.000000e+01 -23302=1,2.000000e+00 9=-233 10=-233 13=5.000000e-01
Convolution              conv13_mbox_loc          1 1 conv13_conv13/relu_splitncnn_2 conv13_mbox_loc -23330=4,3,10,10,24 0=24 1=1 5=1 6=24576
Permute                  conv13_mbox_loc_perm     1 1 conv13_mbox_loc conv13_mbox_loc_perm -23330=4,3,24,10,10 0=3
Flatten                  conv13_mbox_loc_flat     1 1 conv13_mbox_loc_perm conv13_mbox_loc_flat -23330=4,1,2400,1,1
Convolution              conv13_mbox_conf         1 1 conv13_conv13/relu_splitncnn_1 conv13_mbox_conf -23330=4,3,10,10,126 0=126 1=1 5=1 6=129024
Permute                  conv13_mbox_conf_perm    1 1 conv13_mbox_conf conv13_mbox_conf_perm -23330=4,3,126,10,10 0=3
Flatten                  conv13_mbox_conf_flat    1 1 conv13_mbox_conf_perm conv13_mbox_conf_flat -23330=4,1,12600,1,1
PriorBox                 conv13_mbox_priorbox     2 1 conv13_conv13/relu_splitncnn_0 data_splitncnn_4 conv13_mbox_priorbox -23330=4,2,2400,2,1 -23300=1,1.050000e+02 -23301=1,1.500000e+02 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 13=5.000000e-01
Convolution              conv14_2_mbox_loc        1 1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_mbox_loc -23330=4,3,5,5,24 0=24 1=1 5=1 6=12288
Permute                  conv14_2_mbox_loc_perm   1 1 conv14_2_mbox_loc conv14_2_mbox_loc_perm -23330=4,3,24,5,5 0=3
Flatten                  conv14_2_mbox_loc_flat   1 1 conv14_2_mbox_loc_perm conv14_2_mbox_loc_flat -23330=4,1,600,1,1
Convolution              conv14_2_mbox_conf       1 1 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_mbox_conf -23330=4,3,5,5,126 0=126 1=1 5=1 6=64512
Permute                  conv14_2_mbox_conf_perm  1 1 conv14_2_mbox_conf conv14_2_mbox_conf_perm -23330=4,3,126,5,5 0=3
Flatten                  conv14_2_mbox_conf_flat  1 1 conv14_2_mbox_conf_perm conv14_2_mbox_conf_flat -23330=4,1,3150,1,1
PriorBox                 conv14_2_mbox_priorbox   2 1 conv14_2_conv14_2/relu_splitncnn_0 data_splitncnn_3 conv14_2_mbox_priorbox -23330=4,2,600,2,1 -23300=1,1.500000e+02 -23301=1,1.950000e+02 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 13=5.000000e-01
Convolution              conv15_2_mbox_loc        1 1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_mbox_loc -23330=4,3,3,3,24 0=24 1=1 5=1 6=6144
Permute                  conv15_2_mbox_loc_perm   1 1 conv15_2_mbox_loc conv15_2_mbox_loc_perm -23330=4,3,24,3,3 0=3
Flatten                  conv15_2_mbox_loc_flat   1 1 conv15_2_mbox_loc_perm conv15_2_mbox_loc_flat -23330=4,1,216,1,1
Convolution              conv15_2_mbox_conf       1 1 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_mbox_conf -23330=4,3,3,3,126 0=126 1=1 5=1 6=32256
Permute                  conv15_2_mbox_conf_perm  1 1 conv15_2_mbox_conf conv15_2_mbox_conf_perm -23330=4,3,126,3,3 0=3
Flatten                  conv15_2_mbox_conf_flat  1 1 conv15_2_mbox_conf_perm conv15_2_mbox_conf_flat -23330=4,1,1134,1,1
PriorBox                 conv15_2_mbox_priorbox   2 1 conv15_2_conv15_2/relu_splitncnn_0 data_splitncnn_2 conv15_2_mbox_priorbox -23330=4,2,216,2,1 -23300=1,1.950000e+02 -23301=1,2.400000e+02 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 13=5.000000e-01
Convolution              conv16_2_mbox_loc        1 1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_mbox_loc -23330=4,3,2,2,24 0=24 1=1 5=1 6=6144
Permute                  conv16_2_mbox_loc_perm   1 1 conv16_2_mbox_loc conv16_2_mbox_loc_perm -23330=4,3,24,2,2 0=3
Flatten                  conv16_2_mbox_loc_flat   1 1 conv16_2_mbox_loc_perm conv16_2_mbox_loc_flat -23330=4,1,96,1,1
Convolution              conv16_2_mbox_conf       1 1 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_mbox_conf -23330=4,3,2,2,126 0=126 1=1 5=1 6=32256
Permute                  conv16_2_mbox_conf_perm  1 1 conv16_2_mbox_conf conv16_2_mbox_conf_perm -23330=4,3,126,2,2 0=3
Flatten                  conv16_2_mbox_conf_flat  1 1 conv16_2_mbox_conf_perm conv16_2_mbox_conf_flat -23330=4,1,504,1,1
PriorBox                 conv16_2_mbox_priorbox   2 1 conv16_2_conv16_2/relu_splitncnn_0 data_splitncnn_1 conv16_2_mbox_priorbox -23330=4,2,96,2,1 -23300=1,2.400000e+02 -23301=1,2.850000e+02 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 13=5.000000e-01
Convolution              conv17_2_mbox_loc        1 1 conv17_2_conv17_2/relu_splitncnn_2 conv17_2_mbox_loc -23330=4,3,1,1,24 0=24 1=1 5=1 6=3072
Permute                  conv17_2_mbox_loc_perm   1 1 conv17_2_mbox_loc conv17_2_mbox_loc_perm -23330=4,3,24,1,1 0=3
Flatten                  conv17_2_mbox_loc_flat   1 1 conv17_2_mbox_loc_perm conv17_2_mbox_loc_flat -23330=4,1,24,1,1
Convolution              conv17_2_mbox_conf       1 1 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_mbox_conf -23330=4,3,1,1,126 0=126 1=1 5=1 6=16128
Permute                  conv17_2_mbox_conf_perm  1 1 conv17_2_mbox_conf conv17_2_mbox_conf_perm -23330=4,3,126,1,1 0=3
Flatten                  conv17_2_mbox_conf_flat  1 1 conv17_2_mbox_conf_perm conv17_2_mbox_conf_flat -23330=4,1,126,1,1
PriorBox                 conv17_2_mbox_priorbox   2 1 conv17_2_conv17_2/relu_splitncnn_0 data_splitncnn_0 conv17_2_mbox_priorbox -23330=4,2,24,2,1 -23300=1,2.850000e+02 -23301=1,3.000000e+02 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 13=5.000000e-01
Concat                   mbox_loc                 6 1 conv11_mbox_loc_flat conv13_mbox_loc_flat conv14_2_mbox_loc_flat conv15_2_mbox_loc_flat conv16_2_mbox_loc_flat conv17_2_mbox_loc_flat mbox_loc -23330=4,1,7668,1,1
Concat                   mbox_conf                6 1 conv11_mbox_conf_flat conv13_mbox_conf_flat conv14_2_mbox_conf_flat conv15_2_mbox_conf_flat conv16_2_mbox_conf_flat conv17_2_mbox_conf_flat mbox_conf -23330=4,1,40257,1,1
Concat                   mbox_priorbox            6 1 conv11_mbox_priorbox conv13_mbox_priorbox conv14_2_mbox_priorbox conv15_2_mbox_priorbox conv16_2_mbox_priorbox conv17_2_mbox_priorbox mbox_priorbox -23330=4,2,7668,2,1 0=1
Reshape                  mbox_conf_reshape        1 1 mbox_conf mbox_conf_reshape -23330=4,2,21,1917,1 0=21 1=-1
Softmax                  mbox_conf_softmax        1 1 mbox_conf_reshape mbox_conf_softmax -23330=4,2,21,1917,1 0=1 1=1
Flatten                  mbox_conf_flatten        1 1 mbox_conf_softmax mbox_conf_flatten -23330=4,1,40257,1,1
DetectionOutput          detection_out            3 1 mbox_loc mbox_conf_flatten mbox_priorbox output 0=21 1=4.500000e-01 2=100 4=2.500000e-01


================================================
FILE: benchmark/mobilenet_ssd_int8.param
================================================
7767517
92 115
Input                    input                    0 1 data 0=300 1=300 2=3
Split                    splitncnn_0              1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6
Convolution              conv0                    1 1 data_splitncnn_6 conv0_conv0/relu 0=32 1=3 3=2 4=1 5=1 6=864 8=102 9=1
ConvolutionDepthWise     conv1/dw                 1 1 conv0_conv0/relu conv1/dw_conv1/dw/relu 0=32 1=3 4=1 5=1 6=288 7=32 8=101 9=1
Convolution              conv1                    1 1 conv1/dw_conv1/dw/relu conv1_conv1/relu 0=64 1=1 5=1 6=2048 8=102 9=1
ConvolutionDepthWise     conv2/dw                 1 1 conv1_conv1/relu conv2/dw_conv2/dw/relu 0=64 1=3 3=2 4=1 5=1 6=576 7=64 8=101 9=1
Convolution              conv2                    1 1 conv2/dw_conv2/dw/relu conv2_conv2/relu 0=128 1=1 5=1 6=8192 8=102 9=1
ConvolutionDepthWise     conv3/dw                 1 1 conv2_conv2/relu conv3/dw_conv3/dw/relu 0=128 1=3 4=1 5=1 6=1152 7=128 8=101 9=1
Convolution              conv3                    1 1 conv3/dw_conv3/dw/relu conv3_conv3/relu 0=128 1=1 5=1 6=16384 8=102 9=1
ConvolutionDepthWise     conv4/dw                 1 1 conv3_conv3/relu conv4/dw_conv4/dw/relu 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 8=101 9=1
Convolution              conv4                    1 1 conv4/dw_conv4/dw/relu conv4_conv4/relu 0=256 1=1 5=1 6=32768 8=102 9=1
ConvolutionDepthWise     conv5/dw                 1 1 conv4_conv4/relu conv5/dw_conv5/dw/relu 0=256 1=3 4=1 5=1 6=2304 7=256 8=101 9=1
Convolution              conv5                    1 1 conv5/dw_conv5/dw/relu conv5_conv5/relu 0=256 1=1 5=1 6=65536 8=102 9=1
ConvolutionDepthWise     conv6/dw                 1 1 conv5_conv5/relu conv6/dw_conv6/dw/relu 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 8=101 9=1
Convolution              conv6                    1 1 conv6/dw_conv6/dw/relu conv6_conv6/relu 0=512 1=1 5=1 6=131072 8=102 9=1
ConvolutionDepthWise     conv7/dw                 1 1 conv6_conv6/relu conv7/dw_conv7/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
Convolution              conv7                    1 1 conv7/dw_conv7/dw/relu conv7_conv7/relu 0=512 1=1 5=1 6=262144 8=102 9=1
ConvolutionDepthWise     conv8/dw                 1 1 conv7_conv7/relu conv8/dw_conv8/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
Convolution              conv8                    1 1 conv8/dw_conv8/dw/relu conv8_conv8/relu 0=512 1=1 5=1 6=262144 8=102 9=1
ConvolutionDepthWise     conv9/dw                 1 1 conv8_conv8/relu conv9/dw_conv9/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
Convolution              conv9                    1 1 conv9/dw_conv9/dw/relu conv9_conv9/relu 0=512 1=1 5=1 6=262144 8=102 9=1
ConvolutionDepthWise     conv10/dw                1 1 conv9_conv9/relu conv10/dw_conv10/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
Convolution              conv10                   1 1 conv10/dw_conv10/dw/relu conv10_conv10/relu 0=512 1=1 5=1 6=262144 8=102 9=1
ConvolutionDepthWise     conv11/dw                1 1 conv10_conv10/relu conv11/dw_conv11/dw/relu 0=512 1=3 4=1 5=1 6=4608 7=512 8=101 9=1
Convolution              conv11                   1 1 conv11/dw_conv11/dw/relu conv11_conv11/relu 0=512 1=1 5=1 6=262144 8=2 9=1
Split                    splitncnn_1              1 4 conv11_conv11/relu conv11_conv11/relu_splitncnn_0 conv11_conv11/relu_splitncnn_1 conv11_conv11/relu_splitncnn_2 conv11_conv11/relu_splitncnn_3
ConvolutionDepthWise     conv12/dw                1 1 conv11_conv11/relu_splitncnn_3 conv12/dw_conv12/dw/relu 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 8=101 9=1
Convolution              conv12                   1 1 conv12/dw_conv12/dw/relu conv12_conv12/relu 0=1024 1=1 5=1 6=524288 8=102 9=1
ConvolutionDepthWise     conv13/dw                1 1 conv12_conv12/relu conv13/dw_conv13/dw/relu 0=1024 1=3 4=1 5=1 6=9216 7=1024 8=101 9=1
Convolution              conv13                   1 1 conv13/dw_conv13/dw/relu conv13_conv13/relu 0=1024 1=1 5=1 6=1048576 8=2 9=1
Split                    splitncnn_2              1 4 conv13_conv13/relu conv13_conv13/relu_splitncnn_0 conv13_conv13/relu_splitncnn_1 conv13_conv13/relu_splitncnn_2 conv13_conv13/relu_splitncnn_3
Convolution              conv14_1                 1 1 conv13_conv13/relu_splitncnn_3 conv14_1_conv14_1/relu 0=256 1=1 5=1 6=262144 8=102 9=1
Convolution              conv14_2                 1 1 conv14_1_conv14_1/relu conv14_2_conv14_2/relu 0=512 1=3 3=2 4=1 5=1 6=1179648 8=2 9=1
Split                    splitncnn_3              1 4 conv14_2_conv14_2/relu conv14_2_conv14_2/relu_splitncnn_0 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_conv14_2/relu_splitncnn_3
Convolution              conv15_1                 1 1 conv14_2_conv14_2/relu_splitncnn_3 conv15_1_conv15_1/relu 0=128 1=1 5=1 6=65536 8=102 9=1
Convolution              conv15_2                 1 1 conv15_1_conv15_1/relu conv15_2_conv15_2/relu 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1
Split                    splitncnn_4              1 4 conv15_2_conv15_2/relu conv15_2_conv15_2/relu_splitncnn_0 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_conv15_2/relu_splitncnn_3
Convolution              conv16_1                 1 1 conv15_2_conv15_2/relu_splitncnn_3 conv16_1_conv16_1/relu 0=128 1=1 5=1 6=32768 8=102 9=1
Convolution              conv16_2                 1 1 conv16_1_conv16_1/relu conv16_2_conv16_2/relu 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1
Split                    splitncnn_5              1 4 conv16_2_conv16_2/relu conv16_2_conv16_2/relu_splitncnn_0 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_conv16_2/relu_splitncnn_3
Convolution              conv17_1                 1 1 conv16_2_conv16_2/relu_splitncnn_3 conv17_1_conv17_1/relu 0=64 1=1 5=1 6=16384 8=102 9=1
Convolution              conv17_2                 1 1 conv17_1_conv17_1/relu conv17_2_conv17_2/relu 0=128 1=3 3=2 4=1 5=1 6=73728 8=2 9=1
Split                    splitncnn_6              1 3 conv17_2_conv17_2/relu conv17_2_conv17_2/relu_splitncnn_0 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_conv17_2/relu_splitncnn_2
Convolution              conv11_mbox_loc          1 1 conv11_conv11/relu_splitncnn_2 conv11_mbox_loc 0=12 1=1 5=1 6=6144 8=2
Permute                  conv11_mbox_loc_perm     1 1 conv11_mbox_loc conv11_mbox_loc_perm 0=3
Flatten                  conv11_mbox_loc_flat     1 1 conv11_mbox_loc_perm conv11_mbox_loc_flat
Convolution              conv11_mbox_conf         1 1 conv11_conv11/relu_splitncnn_1 conv11_mbox_conf 0=63 1=1 5=1 6=32256 8=2
Permute                  conv11_mbox_conf_perm    1 1 conv11_mbox_conf conv11_mbox_conf_perm 0=3
Flatten                  conv11_mbox_conf_flat    1 1 conv11_mbox_conf_perm conv11_mbox_conf_flat
PriorBox                 conv11_mbox_priorbox     2 1 conv11_conv11/relu_splitncnn_0 data_splitncnn_5 conv11_mbox_priorbox -23300=1,60.000000 -23302=1,2.000000 9=-233 10=-233 13=0.500000
Convolution              conv13_mbox_loc          1 1 conv13_conv13/relu_splitncnn_2 conv13_mbox_loc 0=24 1=1 5=1 6=24576 8=2
Permute                  conv13_mbox_loc_perm     1 1 conv13_mbox_loc conv13_mbox_loc_perm 0=3
Flatten                  conv13_mbox_loc_flat     1 1 conv13_mbox_loc_perm conv13_mbox_loc_flat
Convolution              conv13_mbox_conf         1 1 conv13_conv13/relu_splitncnn_1 conv13_mbox_conf 0=126 1=1 5=1 6=129024 8=2
Permute                  conv13_mbox_conf_perm    1 1 conv13_mbox_conf conv13_mbox_conf_perm 0=3
Flatten                  conv13_mbox_conf_flat    1 1 conv13_mbox_conf_perm conv13_mbox_conf_flat
PriorBox                 conv13_mbox_priorbox     2 1 conv13_conv13/relu_splitncnn_0 data_splitncnn_4 conv13_mbox_priorbox -23300=1,105.000000 -23301=1,150.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000
Convolution              conv14_2_mbox_loc        1 1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_mbox_loc 0=24 1=1 5=1 6=12288 8=2
Permute                  conv14_2_mbox_loc_perm   1 1 conv14_2_mbox_loc conv14_2_mbox_loc_perm 0=3
Flatten                  conv14_2_mbox_loc_flat   1 1 conv14_2_mbox_loc_perm conv14_2_mbox_loc_flat
Convolution              conv14_2_mbox_conf       1 1 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_mbox_conf 0=126 1=1 5=1 6=64512 8=2
Permute                  conv14_2_mbox_conf_perm  1 1 conv14_2_mbox_conf conv14_2_mbox_conf_perm 0=3
Flatten                  conv14_2_mbox_conf_flat  1 1 conv14_2_mbox_conf_perm conv14_2_mbox_conf_flat
PriorBox                 conv14_2_mbox_priorbox   2 1 conv14_2_conv14_2/relu_splitncnn_0 data_splitncnn_3 conv14_2_mbox_priorbox -23300=1,150.000000 -23301=1,195.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000
Convolution              conv15_2_mbox_loc        1 1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_mbox_loc 0=24 1=1 5=1 6=6144 8=2
Permute                  conv15_2_mbox_loc_perm   1 1 conv15_2_mbox_loc conv15_2_mbox_loc_perm 0=3
Flatten                  conv15_2_mbox_loc_flat   1 1 conv15_2_mbox_loc_perm conv15_2_mbox_loc_flat
Convolution              conv15_2_mbox_conf       1 1 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_mbox_conf 0=126 1=1 5=1 6=32256 8=2
Permute                  conv15_2_mbox_conf_perm  1 1 conv15_2_mbox_conf conv15_2_mbox_conf_perm 0=3
Flatten                  conv15_2_mbox_conf_flat  1 1 conv15_2_mbox_conf_perm conv15_2_mbox_conf_flat
PriorBox                 conv15_2_mbox_priorbox   2 1 conv15_2_conv15_2/relu_splitncnn_0 data_splitncnn_2 conv15_2_mbox_priorbox -23300=1,195.000000 -23301=1,240.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000
Convolution              conv16_2_mbox_loc        1 1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_mbox_loc 0=24 1=1 5=1 6=6144 8=2
Permute                  conv16_2_mbox_loc_perm   1 1 conv16_2_mbox_loc conv16_2_mbox_loc_perm 0=3
Flatten                  conv16_2_mbox_loc_flat   1 1 conv16_2_mbox_loc_perm conv16_2_mbox_loc_flat
Convolution              conv16_2_mbox_conf       1 1 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_mbox_conf 0=126 1=1 5=1 6=32256 8=2
Permute                  conv16_2_mbox_conf_perm  1 1 conv16_2_mbox_conf conv16_2_mbox_conf_perm 0=3
Flatten                  conv16_2_mbox_conf_flat  1 1 conv16_2_mbox_conf_perm conv16_2_mbox_conf_flat
PriorBox                 conv16_2_mbox_priorbox   2 1 conv16_2_conv16_2/relu_splitncnn_0 data_splitncnn_1 conv16_2_mbox_priorbox -23300=1,240.000000 -23301=1,285.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000
Convolution              conv17_2_mbox_loc        1 1 conv17_2_conv17_2/relu_splitncnn_2 conv17_2_mbox_loc 0=24 1=1 5=1 6=3072 8=2
Permute                  conv17_2_mbox_loc_perm   1 1 conv17_2_mbox_loc conv17_2_mbox_loc_perm 0=3
Flatten                  conv17_2_mbox_loc_flat   1 1 conv17_2_mbox_loc_perm conv17_2_mbox_loc_flat
Convolution              conv17_2_mbox_conf       1 1 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_mbox_conf 0=126 1=1 5=1 6=16128 8=2
Permute                  conv17_2_mbox_conf_perm  1 1 conv17_2_mbox_conf conv17_2_mbox_conf_perm 0=3
Flatten                  conv17_2_mbox_conf_flat  1 1 conv17_2_mbox_conf_perm conv17_2_mbox_conf_flat
PriorBox                 conv17_2_mbox_priorbox   2 1 conv17_2_conv17_2/relu_splitncnn_0 data_splitncnn_0 conv17_2_mbox_priorbox -23300=1,285.000000 -23301=1,300.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 13=0.500000
Concat                   mbox_loc                 6 1 conv11_mbox_loc_flat conv13_mbox_loc_flat conv14_2_mbox_loc_flat conv15_2_mbox_loc_flat conv16_2_mbox_loc_flat conv17_2_mbox_loc_flat mbox_loc
Concat                   mbox_conf                6 1 conv11_mbox_conf_flat conv13_mbox_conf_flat conv14_2_mbox_conf_flat conv15_2_mbox_conf_flat conv16_2_mbox_conf_flat conv17_2_mbox_conf_flat mbox_conf
Concat                   mbox_priorbox            6 1 conv11_mbox_priorbox conv13_mbox_priorbox conv14_2_mbox_priorbox conv15_2_mbox_priorbox conv16_2_mbox_priorbox conv17_2_mbox_priorbox mbox_priorbox 0=1
Reshape                  mbox_conf_reshape        1 1 mbox_conf mbox_conf_reshape 0=21 1=-1
Softmax                  mbox_conf_softmax        1 1 mbox_conf_reshape mbox_conf_softmax 0=1 1=1
Flatten                  mbox_conf_flatten        1 1 mbox_conf_softmax mbox_conf_flatten
DetectionOutput          detection_out            3 1 mbox_loc mbox_conf_flatten mbox_priorbox output 0=21 1=0.450000 2=100 4=0.250000


================================================
FILE: benchmark/mobilenet_v2.param
================================================
7767517
77 87
Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
Convolution              conv1                    1 1 data conv1/bn_relu1 -23330=4,3,112,112,32 0=32 1=3 3=2 4=1 5=1 6=864 9=1
Convolution              conv2_1/expand           1 1 conv1/bn_relu1 conv2_1/expand/bn_relu2_1/expand -23330=4,3,112,112,32 0=32 1=1 5=1 6=1024 9=1
ConvolutionDepthWise     conv2_1/dwise            1 1 conv2_1/expand/bn_relu2_1/expand conv2_1/dwise/bn_relu2_1/dwise -23330=4,3,112,112,32 0=32 1=3 4=1 5=1 6=288 7=32 9=1
Convolution              conv2_1/linear           1 1 conv2_1/dwise/bn_relu2_1/dwise conv2_1/linear/bn_conv2_1/linear/scale -23330=4,3,112,112,16 0=16 1=1 5=1 6=512
Convolution              conv2_2/expand           1 1 conv2_1/linear/bn_conv2_1/linear/scale conv2_2/expand/bn_relu2_2/expand -23330=4,3,112,112,96 0=96 1=1 5=1 6=1536 9=1
ConvolutionDepthWise     conv2_2/dwise            1 1 conv2_2/expand/bn_relu2_2/expand conv2_2/dwise/bn_relu2_2/dwise -23330=4,3,56,56,96 0=96 1=3 3=2 4=1 5=1 6=864 7=96 9=1
Convolution              conv2_2/linear           1 1 conv2_2/dwise/bn_relu2_2/dwise conv2_2/linear/bn_conv2_2/linear/scale -23330=4,3,56,56,24 0=24 1=1 5=1 6=2304
Split                    splitncnn_0              1 2 conv2_2/linear/bn_conv2_2/linear/scale conv2_2/linear/bn_conv2_2/linear/scale_splitncnn_0 conv2_2/linear/bn_conv2_2/linear/scale_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24
Convolution              conv3_1/expand           1 1 conv2_2/linear/bn_conv2_2/linear/scale_splitncnn_1 conv3_1/expand/bn_relu3_1/expand -23330=4,3,56,56,144 0=144 1=1 5=1 6=3456 9=1
ConvolutionDepthWise     conv3_1/dwise            1 1 conv3_1/expand/bn_relu3_1/expand conv3_1/dwise/bn_relu3_1/dwise -23330=4,3,56,56,144 0=144 1=3 4=1 5=1 6=1296 7=144 9=1
Convolution              conv3_1/linear           1 1 conv3_1/dwise/bn_relu3_1/dwise conv3_1/linear/bn_conv3_1/linear/scale -23330=4,3,56,56,24 0=24 1=1 5=1 6=3456
Eltwise                  block_3_1                2 1 conv2_2/linear/bn_conv2_2/linear/scale_splitncnn_0 conv3_1/linear/bn_conv3_1/linear/scale block_3_1 -23330=4,3,56,56,24 0=1
Convolution              conv3_2/expand           1 1 block_3_1 conv3_2/expand/bn_relu3_2/expand -23330=4,3,56,56,144 0=144 1=1 5=1 6=3456 9=1
ConvolutionDepthWise     conv3_2/dwise            1 1 conv3_2/expand/bn_relu3_2/expand conv3_2/dwise/bn_relu3_2/dwise -23330=4,3,28,28,144 0=144 1=3 3=2 4=1 5=1 6=1296 7=144 9=1
Convolution              conv3_2/linear           1 1 conv3_2/dwise/bn_relu3_2/dwise conv3_2/linear/bn_conv3_2/linear/scale -23330=4,3,28,28,32 0=32 1=1 5=1 6=4608
Split                    splitncnn_1              1 2 conv3_2/linear/bn_conv3_2/linear/scale conv3_2/linear/bn_conv3_2/linear/scale_splitncnn_0 conv3_2/linear/bn_conv3_2/linear/scale_splitncnn_1 -23330=8,3,28,28,32,3,28,28,32
Convolution              conv4_1/expand           1 1 conv3_2/linear/bn_conv3_2/linear/scale_splitncnn_1 conv4_1/expand/bn_relu4_1/expand -23330=4,3,28,28,192 0=192 1=1 5=1 6=6144 9=1
ConvolutionDepthWise     conv4_1/dwise            1 1 conv4_1/expand/bn_relu4_1/expand conv4_1/dwise/bn_relu4_1/dwise -23330=4,3,28,28,192 0=192 1=3 4=1 5=1 6=1728 7=192 9=1
Convolution              conv4_1/linear           1 1 conv4_1/dwise/bn_relu4_1/dwise conv4_1/linear/bn_conv4_1/linear/scale -23330=4,3,28,28,32 0=32 1=1 5=1 6=6144
Eltwise                  block_4_1                2 1 conv3_2/linear/bn_conv3_2/linear/scale_splitncnn_0 conv4_1/linear/bn_conv4_1/linear/scale block_4_1 -23330=4,3,28,28,32 0=1
Split                    splitncnn_2              1 2 block_4_1 block_4_1_splitncnn_0 block_4_1_splitncnn_1 -23330=8,3,28,28,32,3,28,28,32
Convolution              conv4_2/expand           1 1 block_4_1_splitncnn_1 conv4_2/expand/bn_relu4_2/expand -23330=4,3,28,28,192 0=192 1=1 5=1 6=6144 9=1
ConvolutionDepthWise     conv4_2/dwise            1 1 conv4_2/expand/bn_relu4_2/expand conv4_2/dwise/bn_relu4_2/dwise -23330=4,3,28,28,192 0=192 1=3 4=1 5=1 6=1728 7=192 9=1
Convolution              conv4_2/linear           1 1 conv4_2/dwise/bn_relu4_2/dwise conv4_2/linear/bn_conv4_2/linear/scale -23330=4,3,28,28,32 0=32 1=1 5=1 6=6144
Eltwise                  block_4_2                2 1 block_4_1_splitncnn_0 conv4_2/linear/bn_conv4_2/linear/scale block_4_2 -23330=4,3,28,28,32 0=1
Convolution              conv4_3/expand           1 1 block_4_2 conv4_3/expand/bn_relu4_3/expand -23330=4,3,28,28,192 0=192 1=1 5=1 6=6144 9=1
ConvolutionDepthWise     conv4_3/dwise            1 1 conv4_3/expand/bn_relu4_3/expand conv4_3/dwise/bn_relu4_3/dwise -23330=4,3,14,14,192 0=192 1=3 3=2 4=1 5=1 6=1728 7=192 9=1
Convolution              conv4_3/linear           1 1 conv4_3/dwise/bn_relu4_3/dwise conv4_3/linear/bn_conv4_3/linear/scale -23330=4,3,14,14,64 0=64 1=1 5=1 6=12288
Split                    splitncnn_3              1 2 conv4_3/linear/bn_conv4_3/linear/scale conv4_3/linear/bn_conv4_3/linear/scale_splitncnn_0 conv4_3/linear/bn_conv4_3/linear/scale_splitncnn_1 -23330=8,3,14,14,64,3,14,14,64
Convolution              conv4_4/expand           1 1 conv4_3/linear/bn_conv4_3/linear/scale_splitncnn_1 conv4_4/expand/bn_relu4_4/expand -23330=4,3,14,14,384 0=384 1=1 5=1 6=24576 9=1
ConvolutionDepthWise     conv4_4/dwise            1 1 conv4_4/expand/bn_relu4_4/expand conv4_4/dwise/bn_relu4_4/dwise -23330=4,3,14,14,384 0=384 1=3 4=1 5=1 6=3456 7=384 9=1
Convolution              conv4_4/linear           1 1 conv4_4/dwise/bn_relu4_4/dwise conv4_4/linear/bn_conv4_4/linear/scale -23330=4,3,14,14,64 0=64 1=1 5=1 6=24576
Eltwise                  block_4_4                2 1 conv4_3/linear/bn_conv4_3/linear/scale_splitncnn_0 conv4_4/linear/bn_conv4_4/linear/scale block_4_4 -23330=4,3,14,14,64 0=1
Split                    splitncnn_4              1 2 block_4_4 block_4_4_splitncnn_0 block_4_4_splitncnn_1 -23330=8,3,14,14,64,3,14,14,64
Convolution              conv4_5/expand           1 1 block_4_4_splitncnn_1 conv4_5/expand/bn_relu4_5/expand -23330=4,3,14,14,384 0=384 1=1 5=1 6=24576 9=1
ConvolutionDepthWise     conv4_5/dwise            1 1 conv4_5/expand/bn_relu4_5/expand conv4_5/dwise/bn_relu4_5/dwise -23330=4,3,14,14,384 0=384 1=3 4=1 5=1 6=3456 7=384 9=1
Convolution              conv4_5/linear           1 1 conv4_5/dwise/bn_relu4_5/dwise conv4_5/linear/bn_conv4_5/linear/scale -23330=4,3,14,14,64 0=64 1=1 5=1 6=24576
Eltwise                  block_4_5                2 1 block_4_4_splitncnn_0 conv4_5/linear/bn_conv4_5/linear/scale block_4_5 -23330=4,3,14,14,64 0=1
Split                    splitncnn_5              1 2 block_4_5 block_4_5_splitncnn_0 block_4_5_splitncnn_1 -23330=8,3,14,14,64,3,14,14,64
Convolution              conv4_6/expand           1 1 block_4_5_splitncnn_1 conv4_6/expand/bn_relu4_6/expand -23330=4,3,14,14,384 0=384 1=1 5=1 6=24576 9=1
ConvolutionDepthWise     conv4_6/dwise            1 1 conv4_6/expand/bn_relu4_6/expand conv4_6/dwise/bn_relu4_6/dwise -23330=4,3,14,14,384 0=384 1=3 4=1 5=1 6=3456 7=384 9=1
Convolution              conv4_6/linear           1 1 conv4_6/dwise/bn_relu4_6/dwise conv4_6/linear/bn_conv4_6/linear/scale -23330=4,3,14,14,64 0=64 1=1 5=1 6=24576
Eltwise                  block_4_6                2 1 block_4_5_splitncnn_0 conv4_6/linear/bn_conv4_6/linear/scale block_4_6 -23330=4,3,14,14,64 0=1
Convolution              conv4_7/expand           1 1 block_4_6 conv4_7/expand/bn_relu4_7/expand -23330=4,3,14,14,384 0=384 1=1 5=1 6=24576 9=1
ConvolutionDepthWise     conv4_7/dwise            1 1 conv4_7/expand/bn_relu4_7/expand conv4_7/dwise/bn_relu4_7/dwise -23330=4,3,14,14,384 0=384 1=3 4=1 5=1 6=3456 7=384 9=1
Convolution              conv4_7/linear           1 1 conv4_7/dwise/bn_relu4_7/dwise conv4_7/linear/bn_conv4_7/linear/scale -23330=4,3,14,14,96 0=96 1=1 5=1 6=36864
Split                    splitncnn_6              1 2 conv4_7/linear/bn_conv4_7/linear/scale conv4_7/linear/bn_conv4_7/linear/scale_splitncnn_0 conv4_7/linear/bn_conv4_7/linear/scale_splitncnn_1 -23330=8,3,14,14,96,3,14,14,96
Convolution              conv5_1/expand           1 1 conv4_7/linear/bn_conv4_7/linear/scale_splitncnn_1 conv5_1/expand/bn_relu5_1/expand -23330=4,3,14,14,576 0=576 1=1 5=1 6=55296 9=1
ConvolutionDepthWise     conv5_1/dwise            1 1 conv5_1/expand/bn_relu5_1/expand conv5_1/dwise/bn_relu5_1/dwise -23330=4,3,14,14,576 0=576 1=3 4=1 5=1 6=5184 7=576 9=1
Convolution              conv5_1/linear           1 1 conv5_1/dwise/bn_relu5_1/dwise conv5_1/linear/bn_conv5_1/linear/scale -23330=4,3,14,14,96 0=96 1=1 5=1 6=55296
Eltwise                  block_5_1                2 1 conv4_7/linear/bn_conv4_7/linear/scale_splitncnn_0 conv5_1/linear/bn_conv5_1/linear/scale block_5_1 -23330=4,3,14,14,96 0=1
Split                    splitncnn_7              1 2 block_5_1 block_5_1_splitncnn_0 block_5_1_splitncnn_1 -23330=8,3,14,14,96,3,14,14,96
Convolution              conv5_2/expand           1 1 block_5_1_splitncnn_1 conv5_2/expand/bn_relu5_2/expand -23330=4,3,14,14,576 0=576 1=1 5=1 6=55296 9=1
ConvolutionDepthWise     conv5_2/dwise            1 1 conv5_2/expand/bn_relu5_2/expand conv5_2/dwise/bn_relu5_2/dwise -23330=4,3,14,14,576 0=576 1=3 4=1 5=1 6=5184 7=576 9=1
Convolution              conv5_2/linear           1 1 conv5_2/dwise/bn_relu5_2/dwise conv5_2/linear/bn_conv5_2/linear/scale -23330=4,3,14,14,96 0=96 1=1 5=1 6=55296
Eltwise                  block_5_2                2 1 block_5_1_splitncnn_0 conv5_2/linear/bn_conv5_2/linear/scale block_5_2 -23330=4,3,14,14,96 0=1
Convolution              conv5_3/expand           1 1 block_5_2 conv5_3/expand/bn_relu5_3/expand -23330=4,3,14,14,576 0=576 1=1 5=1 6=55296 9=1
ConvolutionDepthWise     conv5_3/dwise            1 1 conv5_3/expand/bn_relu5_3/expand conv5_3/dwise/bn_relu5_3/dwise -23330=4,3,7,7,576 0=576 1=3 3=2 4=1 5=1 6=5184 7=576 9=1
Convolution              conv5_3/linear           1 1 conv5_3/dwise/bn_relu5_3/dwise conv5_3/linear/bn_conv5_3/linear/scale -23330=4,3,7,7,160 0=160 1=1 5=1 6=92160
Split                    splitncnn_8              1 2 conv5_3/linear/bn_conv5_3/linear/scale conv5_3/linear/bn_conv5_3/linear/scale_splitncnn_0 conv5_3/linear/bn_conv5_3/linear/scale_splitncnn_1 -23330=8,3,7,7,160,3,7,7,160
Convolution              conv6_1/expand           1 1 conv5_3/linear/bn_conv5_3/linear/scale_splitncnn_1 conv6_1/expand/bn_relu6_1/expand -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600 9=1
ConvolutionDepthWise     conv6_1/dwise            1 1 conv6_1/expand/bn_relu6_1/expand conv6_1/dwise/bn_relu6_1/dwise -23330=4,3,7,7,960 0=960 1=3 4=1 5=1 6=8640 7=960 9=1
Convolution              conv6_1/linear           1 1 conv6_1/dwise/bn_relu6_1/dwise conv6_1/linear/bn_conv6_1/linear/scale -23330=4,3,7,7,160 0=160 1=1 5=1 6=153600
Eltwise                  block_6_1                2 1 conv5_3/linear/bn_conv5_3/linear/scale_splitncnn_0 conv6_1/linear/bn_conv6_1/linear/scale block_6_1 -23330=4,3,7,7,160 0=1
Split                    splitncnn_9              1 2 block_6_1 block_6_1_splitncnn_0 block_6_1_splitncnn_1 -23330=8,3,7,7,160,3,7,7,160
Convolution              conv6_2/expand           1 1 block_6_1_splitncnn_1 conv6_2/expand/bn_relu6_2/expand -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600 9=1
ConvolutionDepthWise     conv6_2/dwise            1 1 conv6_2/expand/bn_relu6_2/expand conv6_2/dwise/bn_relu6_2/dwise -23330=4,3,7,7,960 0=960 1=3 4=1 5=1 6=8640 7=960 9=1
Convolution              conv6_2/linear           1 1 conv6_2/dwise/bn_relu6_2/dwise conv6_2/linear/bn_conv6_2/linear/scale -23330=4,3,7,7,160 0=160 1=1 5=1 6=153600
Eltwise                  block_6_2                2 1 block_6_1_splitncnn_0 conv6_2/linear/bn_conv6_2/linear/scale block_6_2 -23330=4,3,7,7,160 0=1
Convolution              conv6_3/expand           1 1 block_6_2 conv6_3/expand/bn_relu6_3/expand -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600 9=1
ConvolutionDepthWise     conv6_3/dwise            1 1 conv6_3/expand/bn_relu6_3/expand conv6_3/dwise/bn_relu6_3/dwise -23330=4,3,7,7,960 0=960 1=3 4=1 5=1 6=8640 7=960 9=1
Convolution              conv6_3/linear           1 1 conv6_3/dwise/bn_relu6_3/dwise conv6_3/linear/bn_conv6_3/linear/scale -23330=4,3,7,7,320 0=320 1=1 5=1 6=307200
Convolution              conv6_4                  1 1 conv6_3/linear/bn_conv6_3/linear/scale conv6_4/bn_relu6_4 -23330=4,3,7,7,1280 0=1280 1=1 5=1 6=409600 9=1
Pooling                  pool6                    1 1 conv6_4/bn_relu6_4 pool6 -23330=4,1,1280,1,1 0=1 4=1
InnerProduct             fc7                      1 1 pool6 fc7 -23330=4,1,1000,1,1 0=1000 1=1 2=1280000
Softmax                  prob                     1 1 fc7 output -23330=4,1,1000,1,1


================================================
FILE: benchmark/mobilenet_v3.param
================================================
7767517
145 163
Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
Convolution              313                      1 1 data 313 -23330=4,3,112,112,16 0=16 1=3 3=2 4=1 5=1 6=432
Split                    splitncnn_0              1 2 313 313_splitncnn_0 313_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16
HardSigmoid              319                      1 1 313_splitncnn_1 319 -23330=4,3,112,112,16
BinaryOp                 320                      2 1 313_splitncnn_0 319 320 -23330=4,3,112,112,16 0=2
Split                    splitncnn_1              1 2 320 320_splitncnn_0 320_splitncnn_1 -23330=8,3,112,112,16,3,112,112,16
ConvolutionDepthWise     321                      1 1 320_splitncnn_1 323 -23330=4,3,112,112,16 0=16 1=3 4=1 5=1 6=144 7=16 9=1
Convolution              324                      1 1 323 324 -23330=4,3,112,112,16 0=16 1=1 5=1 6=256
BinaryOp                 326                      2 1 320_splitncnn_0 324 326 -23330=4,3,112,112,16
Convolution              327                      1 1 326 329 -23330=4,3,112,112,64 0=64 1=1 5=1 6=1024 9=1
ConvolutionDepthWise     330                      1 1 329 332 -23330=4,3,56,56,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 9=1
Convolution              333                      1 1 332 333 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1536
Split                    splitncnn_2              1 2 333 333_splitncnn_0 333_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24
Convolution              335                      1 1 333_splitncnn_1 337 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1
ConvolutionDepthWise     338                      1 1 337 340 -23330=4,3,56,56,72 0=72 1=3 4=1 5=1 6=648 7=72 9=1
Convolution              341                      1 1 340 341 -23330=4,3,56,56,24 0=24 1=1 5=1 6=1728
BinaryOp                 343                      2 1 333_splitncnn_0 341 343 -23330=4,3,56,56,24
Convolution              344                      1 1 343 346 -23330=4,3,56,56,72 0=72 1=1 5=1 6=1728 9=1
ConvolutionDepthWise     347                      1 1 346 347 -23330=4,3,28,28,72 0=72 1=5 3=2 4=2 5=1 6=1800 7=72
Split                    splitncnn_3              1 2 347 347_splitncnn_0 347_splitncnn_1 -23330=8,3,28,28,72,3,28,28,72
Pooling                  355                      1 1 347_splitncnn_1 359 -23330=4,1,72,1,1 0=1 4=1
InnerProduct             360                      1 1 359 361 -23330=4,1,18,1,1 0=18 1=1 2=1296 9=1
InnerProduct             362                      1 1 361 362 -23330=4,1,72,1,1 0=72 1=1 2=1296
HardSigmoid              367                      1 1 362 367 -23330=4,1,72,1,1
BinaryOp                 376                      2 1 347_splitncnn_0 367 376 -23330=4,3,28,28,72 0=2
ReLU                     377                      1 1 376 377 -23330=4,3,28,28,72
Convolution              378                      1 1 377 378 -23330=4,3,28,28,40 0=40 1=1 5=1 6=2880
Split                    splitncnn_4              1 2 378 378_splitncnn_0 378_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
Convolution              380                      1 1 378_splitncnn_1 382 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
ConvolutionDepthWise     383                      1 1 382 383 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120
Split                    splitncnn_5              1 2 383 383_splitncnn_0 383_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120
Pooling                  391                      1 1 383_splitncnn_1 395 -23330=4,1,120,1,1 0=1 4=1
InnerProduct             396                      1 1 395 397 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1
InnerProduct             398                      1 1 397 398 -23330=4,1,120,1,1 0=120 1=1 2=3600
HardSigmoid              403                      1 1 398 403 -23330=4,1,120,1,1
BinaryOp                 412                      2 1 383_splitncnn_0 403 412 -23330=4,3,28,28,120 0=2
ReLU                     413                      1 1 412 413 -23330=4,3,28,28,120
Convolution              414                      1 1 413 414 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
BinaryOp                 416                      2 1 378_splitncnn_0 414 416 -23330=4,3,28,28,40
Split                    splitncnn_6              1 2 416 416_splitncnn_0 416_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
Convolution              417                      1 1 416_splitncnn_1 419 -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
ConvolutionDepthWise     420                      1 1 419 420 -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120
Split                    splitncnn_7              1 2 420 420_splitncnn_0 420_splitncnn_1 -23330=8,3,28,28,120,3,28,28,120
Pooling                  428                      1 1 420_splitncnn_1 432 -23330=4,1,120,1,1 0=1 4=1
InnerProduct             433                      1 1 432 434 -23330=4,1,30,1,1 0=30 1=1 2=3600 9=1
InnerProduct             435                      1 1 434 435 -23330=4,1,120,1,1 0=120 1=1 2=3600
HardSigmoid              440                      1 1 435 440 -23330=4,1,120,1,1
BinaryOp                 449                      2 1 420_splitncnn_0 440 449 -23330=4,3,28,28,120 0=2
ReLU                     450                      1 1 449 450 -23330=4,3,28,28,120
Convolution              451                      1 1 450 451 -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
BinaryOp                 453                      2 1 416_splitncnn_0 451 453 -23330=4,3,28,28,40
Convolution              454                      1 1 453 454 -23330=4,3,28,28,240 0=240 1=1 5=1 6=9600
HardSwish                461                      1 1 454 461 -23330=4,3,28,28,240
ConvolutionDepthWise     462                      1 1 461 462 -23330=4,3,14,14,240 0=240 1=3 3=2 4=1 5=1 6=2160 7=240
HardSwish                469                      1 1 462 469 -23330=4,3,14,14,240
Convolution              470                      1 1 469 470 -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200
Split                    splitncnn_8              1 2 470 470_splitncnn_0 470_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
Convolution              472                      1 1 470_splitncnn_1 472 -23330=4,3,14,14,200 0=200 1=1 5=1 6=16000
HardSwish                479                      1 1 472 479 -23330=4,3,14,14,200
ConvolutionDepthWise     480                      1 1 479 480 -23330=4,3,14,14,200 0=200 1=3 4=1 5=1 6=1800 7=200
HardSwish                487                      1 1 480 487 -23330=4,3,14,14,200
Convolution              488                      1 1 487 488 -23330=4,3,14,14,80 0=80 1=1 5=1 6=16000
BinaryOp                 490                      2 1 470_splitncnn_0 488 490 -23330=4,3,14,14,80
Split                    splitncnn_9              1 2 490 490_splitncnn_0 490_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
Convolution              491                      1 1 490_splitncnn_1 491 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720
HardSwish                498                      1 1 491 498 -23330=4,3,14,14,184
ConvolutionDepthWise     499                      1 1 498 499 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184
HardSwish                506                      1 1 499 506 -23330=4,3,14,14,184
Convolution              507                      1 1 506 507 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720
BinaryOp                 509                      2 1 490_splitncnn_0 507 509 -23330=4,3,14,14,80
Split                    splitncnn_10             1 2 509 509_splitncnn_0 509_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
Convolution              510                      1 1 509_splitncnn_1 510 -23330=4,3,14,14,184 0=184 1=1 5=1 6=14720
HardSwish                517                      1 1 510 517 -23330=4,3,14,14,184
ConvolutionDepthWise     518                      1 1 517 518 -23330=4,3,14,14,184 0=184 1=3 4=1 5=1 6=1656 7=184
HardSwish                525                      1 1 518 525 -23330=4,3,14,14,184
Convolution              526                      1 1 525 526 -23330=4,3,14,14,80 0=80 1=1 5=1 6=14720
BinaryOp                 528                      2 1 509_splitncnn_0 526 528 -23330=4,3,14,14,80
Convolution              529                      1 1 528 529 -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400
HardSwish                536                      1 1 529 536 -23330=4,3,14,14,480
ConvolutionDepthWise     537                      1 1 536 537 -23330=4,3,14,14,480 0=480 1=3 4=1 5=1 6=4320 7=480
Split                    splitncnn_11             1 2 537 537_splitncnn_0 537_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
Pooling                  545                      1 1 537_splitncnn_1 549 -23330=4,1,480,1,1 0=1 4=1
InnerProduct             550                      1 1 549 551 -23330=4,1,120,1,1 0=120 1=1 2=57600 9=1
InnerProduct             552                      1 1 551 552 -23330=4,1,480,1,1 0=480 1=1 2=57600
HardSigmoid              557                      1 1 552 557 -23330=4,1,480,1,1
BinaryOp                 566                      2 1 537_splitncnn_0 557 566 -23330=4,3,14,14,480 0=2
HardSwish                572                      1 1 566 572 -23330=4,3,14,14,480
Convolution              573                      1 1 572 573 -23330=4,3,14,14,112 0=112 1=1 5=1 6=53760
Split                    splitncnn_12             1 2 573 573_splitncnn_0 573_splitncnn_1 -23330=8,3,14,14,112,3,14,14,112
Convolution              575                      1 1 573_splitncnn_1 575 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264
HardSwish                582                      1 1 575 582 -23330=4,3,14,14,672
ConvolutionDepthWise     583                      1 1 582 583 -23330=4,3,14,14,672 0=672 1=3 4=1 5=1 6=6048 7=672
Split                    splitncnn_13             1 2 583 583_splitncnn_0 583_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
Pooling                  591                      1 1 583_splitncnn_1 595 -23330=4,1,672,1,1 0=1 4=1
InnerProduct             596                      1 1 595 597 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
InnerProduct             598                      1 1 597 598 -23330=4,1,672,1,1 0=672 1=1 2=112896
HardSigmoid              603                      1 1 598 603 -23330=4,1,672,1,1
BinaryOp                 612                      2 1 583_splitncnn_0 603 612 -23330=4,3,14,14,672 0=2
HardSwish                618                      1 1 612 618 -23330=4,3,14,14,672
Convolution              619                      1 1 618 619 -23330=4,3,14,14,112 0=112 1=1 5=1 6=75264
BinaryOp                 621                      2 1 573_splitncnn_0 619 621 -23330=4,3,14,14,112
Convolution              622                      1 1 621 622 -23330=4,3,14,14,672 0=672 1=1 5=1 6=75264
HardSwish                629                      1 1 622 629 -23330=4,3,14,14,672
ConvolutionDepthWise     630                      1 1 629 630 -23330=4,3,14,14,672 0=672 1=5 4=2 5=1 6=16800 7=672
Split                    splitncnn_14             1 2 630 630_splitncnn_0 630_splitncnn_1 -23330=8,3,14,14,672,3,14,14,672
Pooling                  638                      1 1 630_splitncnn_1 642 -23330=4,1,672,1,1 0=1 4=1
InnerProduct             643                      1 1 642 644 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
InnerProduct             645                      1 1 644 645 -23330=4,1,672,1,1 0=672 1=1 2=112896
HardSigmoid              650                      1 1 645 650 -23330=4,1,672,1,1
BinaryOp                 659                      2 1 630_splitncnn_0 650 659 -23330=4,3,14,14,672 0=2
HardSwish                665                      1 1 659 665 -23330=4,3,14,14,672
Convolution              666                      1 1 665 666 -23330=4,3,14,14,160 0=160 1=1 5=1 6=107520
Convolution              668                      1 1 666 668 -23330=4,3,14,14,672 0=672 1=1 5=1 6=107520
HardSwish                675                      1 1 668 675 -23330=4,3,14,14,672
ConvolutionDepthWise     676                      1 1 675 676 -23330=4,3,7,7,672 0=672 1=5 3=2 4=2 5=1 6=16800 7=672
Split                    splitncnn_15             1 2 676 676_splitncnn_0 676_splitncnn_1 -23330=8,3,7,7,672,3,7,7,672
Pooling                  684                      1 1 676_splitncnn_1 688 -23330=4,1,672,1,1 0=1 4=1
InnerProduct             689                      1 1 688 690 -23330=4,1,168,1,1 0=168 1=1 2=112896 9=1
InnerProduct             691                      1 1 690 691 -23330=4,1,672,1,1 0=672 1=1 2=112896
HardSigmoid              696                      1 1 691 696 -23330=4,1,672,1,1
BinaryOp                 705                      2 1 676_splitncnn_0 696 705 -23330=4,3,7,7,672 0=2
HardSwish                711                      1 1 705 711 -23330=4,3,7,7,672
Convolution              712                      1 1 711 712 -23330=4,3,7,7,160 0=160 1=1 5=1 6=107520
Split                    splitncnn_16             1 2 712 712_splitncnn_0 712_splitncnn_1 -23330=8,3,7,7,160,3,7,7,160
Convolution              714                      1 1 712_splitncnn_1 714 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600
HardSwish                721                      1 1 714 721 -23330=4,3,7,7,960
ConvolutionDepthWise     722                      1 1 721 722 -23330=4,3,7,7,960 0=960 1=5 4=2 5=1 6=24000 7=960
Split                    splitncnn_17             1 2 722 722_splitncnn_0 722_splitncnn_1 -23330=8,3,7,7,960,3,7,7,960
Pooling                  730                      1 1 722_splitncnn_1 734 -23330=4,1,960,1,1 0=1 4=1
InnerProduct             735                      1 1 734 736 -23330=4,1,240,1,1 0=240 1=1 2=230400 9=1
InnerProduct             737                      1 1 736 737 -23330=4,1,960,1,1 0=960 1=1 2=230400
HardSigmoid              742                      1 1 737 742 -23330=4,1,960,1,1
BinaryOp                 751                      2 1 722_splitncnn_0 742 751 -23330=4,3,7,7,960 0=2
HardSwish                757                      1 1 751 757 -23330=4,3,7,7,960
Convolution              758                      1 1 757 758 -23330=4,3,7,7,160 0=160 1=1 5=1 6=153600
BinaryOp                 760                      2 1 712_splitncnn_0 758 760 -23330=4,3,7,7,160
Convolution              761                      1 1 760 761 -23330=4,3,7,7,960 0=960 1=1 5=1 6=153600
HardSwish                768                      1 1 761 768 -23330=4,3,7,7,960
Pooling                  769                      1 1 768 769 -23330=4,1,960,1,1 0=1 4=1
HardSwish                775                      1 1 769 775 -23330=4,1,960,1,1
Reshape                  783                      1 1 775 783 -23330=4,1,960,1,1 0=-1
InnerProduct             784                      1 1 783 784 -23330=4,1,1280,1,1 0=1280 1=1 2=1228800
HardSwish                790                      1 1 784 790 -23330=4,1,1280,1,1
InnerProduct             791                      1 1 790 791 -23330=4,1,1000,1,1 0=1000 1=1 2=1280000
Softmax                  prob                     1 1 791 output -23330=4,1,1000,1,1


================================================
FILE: benchmark/mobilenet_yolo.param
================================================
7767517
39 41
Input                    data                     0 1 data -23330=4,3,416,416,3 0=416 1=416 2=3
Convolution              conv0                    1 1 data conv0_conv0/relu -23330=4,3,208,208,32 0=32 1=3 3=2 4=1 5=1 6=864 9=1
ConvolutionDepthWise     conv1/dw                 1 1 conv0_conv0/relu conv1/dw_conv1/dw/relu -23330=4,3,208,208,32 0=32 1=3 4=1 5=1 6=288 7=32 9=1
Convolution              conv1                    1 1 conv1/dw_conv1/dw/relu conv1_conv1/relu -23330=4,3,208,208,64 0=64 1=1 5=1 6=2048 9=1
ConvolutionDepthWise     conv2/dw                 1 1 conv1_conv1/relu conv2/dw_conv2/dw/relu -23330=4,3,104,104,64 0=64 1=3 3=2 4=1 5=1 6=576 7=64 9=1
Convolution              conv2                    1 1 conv2/dw_conv2/dw/relu conv2_conv2/relu -23330=4,3,104,104,128 0=128 1=1 5=1 6=8192 9=1
ConvolutionDepthWise     conv3/dw                 1 1 conv2_conv2/relu conv3/dw_conv3/dw/relu -23330=4,3,104,104,128 0=128 1=3 4=1 5=1 6=1152 7=128 9=1
Convolution              conv3                    1 1 conv3/dw_conv3/dw/relu conv3_conv3/relu -23330=4,3,104,104,128 0=128 1=1 5=1 6=16384 9=1
ConvolutionDepthWise     conv4/dw                 1 1 conv3_conv3/relu conv4/dw_conv4/dw/relu -23330=4,3,52,52,128 0=128 1=3 3=2 4=1 5=1 6=1152 7=128 9=1
Convolution              conv4                    1 1 conv4/dw_conv4/dw/relu conv4_conv4/relu -23330=4,3,52,52,256 0=256 1=1 5=1 6=32768 9=1
ConvolutionDepthWise     conv5/dw                 1 1 conv4_conv4/relu conv5/dw_conv5/dw/relu -23330=4,3,52,52,256 0=256 1=3 4=1 5=1 6=2304 7=256 9=1
Convolution              conv5                    1 1 conv5/dw_conv5/dw/relu conv5_conv5/relu -23330=4,3,52,52,256 0=256 1=1 5=1 6=65536 9=1
ConvolutionDepthWise     conv6/dw                 1 1 conv5_conv5/relu conv6/dw_conv6/dw/relu -23330=4,3,26,26,256 0=256 1=3 3=2 4=1 5=1 6=2304 7=256 9=1
Convolution              conv6                    1 1 conv6/dw_conv6/dw/relu conv6_conv6/relu -23330=4,3,26,26,512 0=512 1=1 5=1 6=131072 9=1
ConvolutionDepthWise     conv7/dw                 1 1 conv6_conv6/relu conv7/dw_conv7/dw/relu -23330=4,3,26,26,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1
Convolution              conv7                    1 1 conv7/dw_conv7/dw/relu conv7_conv7/relu -23330=4,3,26,26,512 0=512 1=1 5=1 6=262144 9=1
ConvolutionDepthWise     conv8/dw                 1 1 conv7_conv7/relu conv8/dw_conv8/dw/relu -23330=4,3,26,26,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1
Convolution              conv8                    1 1 conv8/dw_conv8/dw/relu conv8_conv8/relu -23330=4,3,26,26,512 0=512 1=1 5=1 6=262144 9=1
ConvolutionDepthWise     conv9/dw                 1 1 conv8_conv8/relu conv9/dw_conv9/dw/relu -23330=4,3,26,26,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1
Convolution              conv9                    1 1 conv9/dw_conv9/dw/relu conv9_conv9/relu -23330=4,3,26,26,512 0=512 1=1 5=1 6=262144 9=1
ConvolutionDepthWise     conv10/dw                1 1 conv9_conv9/relu conv10/dw_conv10/dw/relu -23330=4,3,26,26,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1
Convolution              conv10                   1 1 conv10/dw_conv10/dw/relu conv10_conv10/relu -23330=4,3,26,26,512 0=512 1=1 5=1 6=262144 9=1
ConvolutionDepthWise     conv11/dw                1 1 conv10_conv10/relu conv11/dw_conv11/dw/relu -23330=4,3,26,26,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1
Convolution              conv11                   1 1 conv11/dw_conv11/dw/relu conv11_conv11/relu -23330=4,3,26,26,512 0=512 1=1 5=1 6=262144 9=1
Split                    splitncnn_0              1 2 conv11_conv11/relu conv11_conv11/relu_splitncnn_0 conv11_conv11/relu_splitncnn_1 -23330=8,3,26,26,512,3,26,26,512
ConvolutionDepthWise     conv12/dw                1 1 conv11_conv11/relu_splitncnn_1 conv12/dw_conv12/dw/relu -23330=4,3,13,13,512 0=512 1=3 3=2 4=1 5=1 6=4608 7=512 9=1
Convolution              conv12                   1 1 conv12/dw_conv12/dw/relu conv12_conv12/relu -23330=4,3,13,13,1024 0=1024 1=1 5=1 6=524288 9=1
ConvolutionDepthWise     conv13/dw                1 1 conv12_conv12/relu conv13/dw_conv13/dw/relu -23330=4,3,13,13,1024 0=1024 1=3 4=1 5=1 6=9216 7=1024 9=1
Convolution              conv13                   1 1 conv13/dw_conv13/dw/relu conv13_conv13/relu -23330=4,3,13,13,1024 0=1024 1=1 5=1 6=1048576 9=1
ConvolutionDepthWise     conv16/dw                1 1 conv13_conv13/relu conv16/dw_conv16/dw/relu -23330=4,3,13,13,1024 0=1024 1=3 4=1 5=1 6=9216 7=1024 9=1
Convolution              conv17                   1 1 conv16/dw_conv16/dw/relu conv17_conv17/relu -23330=4,3,13,13,1024 0=1024 1=1 5=1 6=1048576 9=1
Split                    splitncnn_1              1 2 conv17_conv17/relu conv17_conv17/relu_splitncnn_0 conv17_conv17/relu_splitncnn_1 -23330=8,3,13,13,1024,3,13,13,1024
DeconvolutionDepthWise   upsample                 1 1 conv17_conv17/relu_splitncnn_1 upsample -23330=4,3,26,26,512 0=512 1=4 3=2 4=1 6=16384 7=512
Eltwise                  conv_18/sum              2 1 conv11_conv11/relu_splitncnn_0 upsample conv_18/sum -23330=4,3,26,26,512 0=1
ConvolutionDepthWise     conv19/dw                1 1 conv_18/sum conv19/dw_conv19/dw/relu -23330=4,3,26,26,512 0=512 1=3 4=1 5=1 6=4608 7=512 9=1
Convolution              conv20                   1 1 conv19/dw_conv19/dw/relu conv20_conv20/relu -23330=4,3,26,26,1024 0=1024 1=1 5=1 6=524288 9=1
Convolution              conv22_indoor            1 1 conv17_conv17/relu_splitncnn_0 conv22 -23330=4,3,13,13,125 0=125 1=1 5=1 6=128000
Convolution              conv23_indoor            1 1 conv20_conv20/relu conv23 -23330=4,3,26,26,125 0=125 1=1 5=1 6=128000
YoloDetectionOutput      detection_out            2 1 conv22 conv23 output -23330=4,3,13,13,125 2=4.000000e-01 -23304=10,1.080000e+00,1.190000e+00,3.420000e+00,4.410000e+00,6.630000e+00,1.138000e+01,9.420000e+00,5.110000e+00,1.662000e+01,1.052000e+01


================================================
FILE: benchmark/mobilenetv2_yolov3.param
================================================
7767517
87 99
Input                    data                     0 1 data -23330=4,3,352,352,3 0=352 1=352 2=3
Convolution              conv1                    1 1 data conv1_relu1 -23330=4,3,176,176,32 0=32 1=3 3=2 4=1 5=1 6=864 9=1
ConvolutionDepthWise     conv2                    1 1 conv1_relu1 conv2_relu2 -23330=4,3,176,176,32 0=32 1=3 4=1 5=1 6=288 7=32 9=1
Convolution              conv3                    1 1 conv2_relu2 conv3 -23330=4,3,176,176,16 0=16 1=1 5=1 6=512
Convolution              conv4                    1 1 conv3 conv4_relu3 -23330=4,3,176,176,96 0=96 1=1 5=1 6=1536 9=1
ConvolutionDepthWise     conv5                    1 1 conv4_relu3 conv5_relu4 -23330=4,3,88,88,96 0=96 1=3 3=2 4=1 5=1 6=864 7=96 9=1
Convolution              conv6                    1 1 conv5_relu4 conv6 -23330=4,3,88,88,24 0=24 1=1 5=1 6=2304
Split                    splitncnn_0              1 2 conv6 conv6_splitncnn_0 conv6_splitncnn_1 -23330=8,3,88,88,24,3,88,88,24
Convolution              conv7                    1 1 conv6_splitncnn_1 conv7_relu5 -23330=4,3,88,88,144 0=144 1=1 5=1 6=3456 9=1
ConvolutionDepthWise     conv8                    1 1 conv7_relu5 conv8_relu6 -23330=4,3,88,88,144 0=144 1=3 4=1 5=1 6=1296 7=144 9=1
Convolution              conv9                    1 1 conv8_relu6 conv9 -23330=4,3,88,88,24 0=24 1=1 5=1 6=3456
Eltwise                  add1                     2 1 conv6_splitncnn_0 conv9 add1 -23330=4,3,88,88,24 0=1
Convolution              conv10                   1 1 add1 conv10_relu7 -23330=4,3,88,88,144 0=144 1=1 5=1 6=3456 9=1
ConvolutionDepthWise     conv11                   1 1 conv10_relu7 conv11_relu8 -23330=4,3,44,44,144 0=144 1=3 3=2 4=1 5=1 6=1296 7=144 9=1
Convolution              conv12                   1 1 conv11_relu8 conv12 -23330=4,3,44,44,32 0=32 1=1 5=1 6=4608
Split                    splitncnn_1              1 2 conv12 conv12_splitncnn_0 conv12_splitncnn_1 -23330=8,3,44,44,32,3,44,44,32
Convolution              conv13                   1 1 conv12_splitncnn_1 conv13_relu9 -23330=4,3,44,44,192 0=192 1=1 5=1 6=6144 9=1
ConvolutionDepthWise     conv14                   1 1 conv13_relu9 conv14_relu10 -23330=4,3,44,44,192 0=192 1=3 4=1 5=1 6=1728 7=192 9=1
Convolution              conv15                   1 1 conv14_relu10 conv15 -23330=4,3,44,44,32 0=32 1=1 5=1 6=6144
Eltwise                  add2                     2 1 conv12_splitncnn_0 conv15 add2 -23330=4,3,44,44,32 0=1
Split                    splitncnn_2              1 2 add2 add2_splitncnn_0 add2_splitncnn_1 -23330=8,3,44,44,32,3,44,44,32
Convolution              conv16                   1 1 add2_splitncnn_1 conv16_relu11 -23330=4,3,44,44,192 0=192 1=1 5=1 6=6144 9=1
ConvolutionDepthWise     conv17                   1 1 conv16_relu11 conv17_relu12 -23330=4,3,44,44,192 0=192 1=3 4=1 5=1 6=1728 7=192 9=1
Convolution              conv18                   1 1 conv17_relu12 conv18 -23330=4,3,44,44,32 0=32 1=1 5=1 6=6144
Eltwise                  add3                     2 1 add2_splitncnn_0 conv18 add3 -23330=4,3,44,44,32 0=1
Convolution              conv19                   1 1 add3 conv19_relu13 -23330=4,3,44,44,192 0=192 1=1 5=1 6=6144 9=1
ConvolutionDepthWise     conv20                   1 1 conv19_relu13 conv20_relu14 -23330=4,3,22,22,192 0=192 1=3 3=2 4=1 5=1 6=1728 7=192 9=1
Convolution              conv21                   1 1 conv20_relu14 conv21 -23330=4,3,22,22,64 0=64 1=1 5=1 6=12288
Split                    splitncnn_3              1 2 conv21 conv21_splitncnn_0 conv21_splitncnn_1 -23330=8,3,22,22,64,3,22,22,64
Convolution              conv22                   1 1 conv21_splitncnn_1 conv22_relu15 -23330=4,3,22,22,384 0=384 1=1 5=1 6=24576 9=1
ConvolutionDepthWise     conv23                   1 1 conv22_relu15 conv23_relu16 -23330=4,3,22,22,384 0=384 1=3 4=1 5=1 6=3456 7=384 9=1
Convolution              conv24                   1 1 conv23_relu16 conv24 -23330=4,3,22,22,64 0=64 1=1 5=1 6=24576
Eltwise                  add4                     2 1 conv21_splitncnn_0 conv24 add4 -23330=4,3,22,22,64 0=1
Split                    splitncnn_4              1 2 add4 add4_splitncnn_0 add4_splitncnn_1 -23330=8,3,22,22,64,3,22,22,64
Convolution              conv25                   1 1 add4_splitncnn_1 conv25_relu17 -23330=4,3,22,22,384 0=384 1=1 5=1 6=24576 9=1
ConvolutionDepthWise     conv26                   1 1 conv25_relu17 conv26_relu18 -23330=4,3,22,22,384 0=384 1=3 4=1 5=1 6=3456 7=384 9=1
Convolution              conv27                   1 1 conv26_relu18 conv27 -23330=4,3,22,22,64 0=64 1=1 5=1 6=24576
Eltwise                  add5                     2 1 add4_splitncnn_0 conv27 add5 -23330=4,3,22,22,64 0=1
Split                    splitncnn_5              1 2 add5 add5_splitncnn_0 add5_splitncnn_1 -23330=8,3,22,22,64,3,22,22,64
Convolution              conv28                   1 1 add5_splitncnn_1 conv28_relu19 -23330=4,3,22,22,384 0=384 1=1 5=1 6=24576 9=1
ConvolutionDepthWise     conv29                   1 1 conv28_relu19 conv29_relu20 -23330=4,3,22,22,384 0=384 1=3 4=1 5=1 6=3456 7=384 9=1
Convolution              conv30                   1 1 conv29_relu20 conv30 -23330=4,3,22,22,64 0=64 1=1 5=1 6=24576
Eltwise                  add6                     2 1 add5_splitncnn_0 conv30 add6 -23330=4,3,22,22,64 0=1
Convolution              conv31                   1 1 add6 conv31_relu21 -23330=4,3,22,22,384 0=384 1=1 5=1 6=24576 9=1
ConvolutionDepthWise     conv32                   1 1 conv31_relu21 conv32_relu22 -23330=4,3,22,22,384 0=384 1=3 4=1 5=1 6=3456 7=384 9=1
Convolution              conv33                   1 1 conv32_relu22 conv33 -23330=4,3,22,22,96 0=96 1=1 5=1 6=36864
Split                    splitncnn_6              1 2 conv33 conv33_splitncnn_0 conv33_splitncnn_1 -23330=8,3,22,22,96,3,22,22,96
Convolution              conv34                   1 1 conv33_splitncnn_1 conv34_relu23 -23330=4,3,22,22,576 0=576 1=1 5=1 6=55296 9=1
ConvolutionDepthWise     conv35                   1 1 conv34_relu23 conv35_relu24 -23330=4,3,22,22,576 0=576 1=3 4=1 5=1 6=5184 7=576 9=1
Convolution              conv36                   1 1 conv35_relu24 conv36 -23330=4,3,22,22,96 0=96 1=1 5=1 6=55296
Eltwise                  add7                     2 1 conv33_splitncnn_0 conv36 add7 -23330=4,3,22,22,96 0=1
Split                    splitncnn_7              1 2 add7 add7_splitncnn_0 add7_splitncnn_1 -23330=8,3,22,22,96,3,22,22,96
Convolution              conv37                   1 1 add7_splitncnn_1 conv37_relu25 -23330=4,3,22,22,576 0=576 1=1 5=1 6=55296 9=1
ConvolutionDepthWise     conv38                   1 1 conv37_relu25 conv38_relu26 -23330=4,3,22,22,576 0=576 1=3 4=1 5=1 6=5184 7=576 9=1
Convolution              conv39                   1 1 conv38_relu26 conv39 -23330=4,3,22,22,96 0=96 1=1 5=1 6=55296
Eltwise                  add8                     2 1 add7_splitncnn_0 conv39 add8 -23330=4,3,22,22,96 0=1
Convolution              conv40                   1 1 add8 conv40_relu27 -23330=4,3,22,22,576 0=576 1=1 5=1 6=55296 9=1
Split                    splitncnn_8              1 2 conv40_relu27 conv40_relu27_splitncnn_0 conv40_relu27_splitncnn_1 -23330=8,3,22,22,576,3,22,22,576
ConvolutionDepthWise     conv41                   1 1 conv40_relu27_splitncnn_1 conv41_relu28 -23330=4,3,11,11,576 0=576 1=3 3=2 4=1 5=1 6=5184 7=576 9=1
Convolution              conv42                   1 1 conv41_relu28 conv42 -23330=4,3,11,11,160 0=160 1=1 5=1 6=92160
Split                    splitncnn_9              1 2 conv42 conv42_splitncnn_0 conv42_splitncnn_1 -23330=8,3,11,11,160,3,11,11,160
Convolution              conv43                   1 1 conv42_splitncnn_1 conv43_relu29 -23330=4,3,11,11,960 0=960 1=1 5=1 6=153600 9=1
ConvolutionDepthWise     conv44                   1 1 conv43_relu29 conv44_relu30 -23330=4,3,11,11,960 0=960 1=3 4=1 5=1 6=8640 7=960 9=1
Convolution              conv45                   1 1 conv44_relu30 conv45 -23330=4,3,11,11,160 0=160 1=1 5=1 6=153600
Eltwise                  add9                     2 1 conv42_splitncnn_0 conv45 add9 -23330=4,3,11,11,160 0=1
Split                    splitncnn_10             1 2 add9 add9_splitncnn_0 add9_splitncnn_1 -23330=8,3,11,11,160,3,11,11,160
Convolution              conv46                   1 1 add9_splitncnn_1 conv46_relu31 -23330=4,3,11,11,960 0=960 1=1 5=1 6=153600 9=1
ConvolutionDepthWise     conv47                   1 1 conv46_relu31 conv47_relu32 -23330=4,3,11,11,960 0=960 1=3 4=1 5=1 6=8640 7=960 9=1
Convolution              conv48                   1 1 conv47_relu32 conv48 -23330=4,3,11,11,160 0=160 1=1 5=1 6=153600
Eltwise                  add10                    2 1 add9_splitncnn_0 conv48 add10 -23330=4,3,11,11,160 0=1
Convolution              conv49                   1 1 add10 conv49_relu33 -23330=4,3,11,11,960 0=960 1=1 5=1 6=153600 9=1
ConvolutionDepthWise     conv50                   1 1 conv49_relu33 conv50_relu34 -23330=4,3,11,11,960 0=960 1=3 4=1 5=1 6=8640 7=960 9=1
Convolution              conv51                   1 1 conv50_relu34 conv51 -23330=4,3,11,11,320 0=320 1=1 5=1 6=307200
Convolution              conv52                   1 1 conv51 conv52_relu35 -23330=4,3,11,11,1280 0=1280 1=1 5=1 6=409600 9=1
ConvolutionDepthWise     yolo/conv1/dw            1 1 conv52_relu35 yolo/conv1/dw_yolo/conv1/dw/relu -23330=4,3,11,11,1280 0=1280 1=3 4=1 5=1 6=11520 7=1280 9=1
Convolution              yolo/conv1               1 1 yolo/conv1/dw_yolo/conv1/dw/relu yolo/conv1_yolo/conv1/relu -23330=4,3,11,11,576 0=576 1=1 5=1 6=737280 9=1
Split                    splitncnn_11             1 2 yolo/conv1_yolo/conv1/relu yolo/conv1_yolo/conv1/relu_splitncnn_0 yolo/conv1_yolo/conv1/relu_splitncnn_1 -23330=8,3,11,11,576,3,11,11,576
DeconvolutionDepthWise   upsample                 1 1 yolo/conv1_yolo/conv1/relu_splitncnn_1 upsample -23330=4,3,21,21,576 0=576 1=1 3=2 6=576 7=576
Pooling                  maxpool                  1 1 upsample maxpool -23330=4,3,22,22,576 1=2 3=1
ConvolutionDepthWise     yolo/conv2/dw            1 1 conv40_relu27_splitncnn_0 yolo/conv2/dw_yolo/conv2/dw/relu -23330=4,3,22,22,576 0=576 1=3 4=1 5=1 6=5184 7=576 9=1
Convolution              yolo/conv2               1 1 yolo/conv2/dw_yolo/conv2/dw/relu yolo/conv2_yolo/conv2/relu -23330=4,3,22,22,576 0=576 1=1 5=1 6=331776 9=1
Eltwise                  yolo/conv2/sum           2 1 maxpool yolo/conv2_yolo/conv2/relu yolo/conv2/sum -23330=4,3,22,22,576 0=1
ConvolutionDepthWise     yolo/conv3/dw            1 1 yolo/conv2/sum yolo/conv3/dw_yolo/conv3/dw/relu -23330=4,3,22,22,576 0=576 1=3 4=1 5=1 6=5184 7=576 9=1
Convolution              yolo/conv3               1 1 yolo/conv3/dw_yolo/conv3/dw/relu yolo/conv3_yolo/conv3/relu -23330=4,3,22,22,576 0=576 1=1 5=1 6=331776 9=1
Convolution              yolo/conv4               1 1 yolo/conv1_yolo/conv1/relu_splitncnn_0 yolo/conv4 -23330=4,3,11,11,75 0=75 1=1 5=1 6=43200
Convolution              yolo/conv5               1 1 yolo/conv3_yolo/conv3/relu yolo/conv5 -23330=4,3,22,22,75 0=75 1=1 5=1 6=43200
Yolov3DetectionOutput    detection_out            2 1 yolo/conv4 yolo/conv5 output 1=3 2=3.000000e-01 -23304=12,2.000000e+01,3.700000e+01,4.900000e+01,9.400000e+01,7.300000e+01,2.010000e+02,1.430000e+02,2.650000e+02,1.530000e+02,1.210000e+02,2.800000e+02,2.790000e+02 -23305=6,1077936128,1082130432,1084227584,0,1065353216,1073741824 -23306=2,3.200000e+01,1.600000e+01


================================================
FILE: benchmark/nanodet_m.param
================================================
7767517
179 204
Input                    input.1                  0 1 input.1 -23330=4,3,320,320,3 0=320 1=320 2=3
Convolution              Conv_0                   1 1 input.1 424 -23330=4,3,160,160,24 0=24 1=3 3=2 4=1 5=1 6=648 9=2 -23310=1,1.000000e-01
Pooling                  MaxPool_2                1 1 424 425 -23330=4,3,80,80,24 1=3 2=2 3=1 5=1
Split                    splitncnn_0              1 2 425 425_splitncnn_0 425_splitncnn_1 -23330=8,3,80,80,24,3,80,80,24
ConvolutionDepthWise     Conv_3                   1 1 425_splitncnn_1 943 -23330=4,3,40,40,24 0=24 1=3 3=2 4=1 5=1 6=216 7=24
Convolution              Conv_4                   1 1 943 430 -23330=4,3,40,40,58 0=58 1=1 5=1 6=1392 9=2 -23310=1,1.000000e-01
Convolution              Conv_6                   1 1 425_splitncnn_0 433 -23330=4,3,80,80,58 0=58 1=1 5=1 6=1392 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_8                   1 1 433 952 -23330=4,3,40,40,58 0=58 1=3 3=2 4=1 5=1 6=522 7=58
Convolution              Conv_9                   1 1 952 438 -23330=4,3,40,40,58 0=58 1=1 5=1 6=3364 9=2 -23310=1,1.000000e-01
Concat                   Concat_11                2 1 430 438 439 -23330=4,3,40,40,116
ShuffleChannel           Reshape_16               1 1 439 444 -23330=4,3,40,40,116 0=2
Split                    splitncnn_1              1 2 444 444_splitncnn_0 444_splitncnn_1 -23330=8,3,40,40,116,3,40,40,116
Crop                     Slice_27                 1 1 444_splitncnn_1 455 -23330=4,3,40,40,58 -23309=1,0 -23310=1,58 -23311=1,0
Crop                     Slice_30                 1 1 444_splitncnn_0 458 -23330=4,3,40,40,58 -23309=1,58 -23310=1,116 -23311=1,0
Convolution              Conv_31                  1 1 458 461 -23330=4,3,40,40,58 0=58 1=1 5=1 6=3364 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_33                  1 1 461 961 -23330=4,3,40,40,58 0=58 1=3 4=1 5=1 6=522 7=58
Convolution              Conv_34                  1 1 961 466 -23330=4,3,40,40,58 0=58 1=1 5=1 6=3364 9=2 -23310=1,1.000000e-01
Concat                   Concat_36                2 1 455 466 467 -23330=4,3,40,40,116
ShuffleChannel           Reshape_41               1 1 467 472 -23330=4,3,40,40,116 0=2
Split                    splitncnn_2              1 2 472 472_splitncnn_0 472_splitncnn_1 -23330=8,3,40,40,116,3,40,40,116
Crop                     Slice_52                 1 1 472_splitncnn_1 483 -23330=4,3,40,40,58 -23309=1,0 -23310=1,58 -23311=1,0
Crop                     Slice_55                 1 1 472_splitncnn_0 486 -23330=4,3,40,40,58 -23309=1,58 -23310=1,116 -23311=1,0
Convolution              Conv_56                  1 1 486 489 -23330=4,3,40,40,58 0=58 1=1 5=1 6=3364 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_58                  1 1 489 970 -23330=4,3,40,40,58 0=58 1=3 4=1 5=1 6=522 7=58
Convolution              Conv_59                  1 1 970 494 -23330=4,3,40,40,58 0=58 1=1 5=1 6=3364 9=2 -23310=1,1.000000e-01
Concat                   Concat_61                2 1 483 494 495 -23330=4,3,40,40,116
ShuffleChannel           Reshape_66               1 1 495 500 -23330=4,3,40,40,116 0=2
Split                    splitncnn_3              1 2 500 500_splitncnn_0 500_splitncnn_1 -23330=8,3,40,40,116,3,40,40,116
Crop                     Slice_77                 1 1 500_splitncnn_1 511 -23330=4,3,40,40,58 -23309=1,0 -23310=1,58 -23311=1,0
Crop                     Slice_80                 1 1 500_splitncnn_0 514 -23330=4,3,40,40,58 -23309=1,58 -23310=1,116 -23311=1,0
Convolution              Conv_81                  1 1 514 517 -23330=4,3,40,40,58 0=58 1=1 5=1 6=3364 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_83                  1 1 517 979 -23330=4,3,40,40,58 0=58 1=3 4=1 5=1 6=522 7=58
Convolution              Conv_84                  1 1 979 522 -23330=4,3,40,40,58 0=58 1=1 5=1 6=3364 9=2 -23310=1,1.000000e-01
Concat                   Concat_86                2 1 511 522 523 -23330=4,3,40,40,116
ShuffleChannel           Reshape_91               1 1 523 528 -23330=4,3,40,40,116 0=2
Split                    splitncnn_4              1 3 528 528_splitncnn_0 528_splitncnn_1 528_splitncnn_2 -23330=12,3,40,40,116,3,40,40,116,3,40,40,116
ConvolutionDepthWise     Conv_92                  1 1 528_splitncnn_2 985 -23330=4,3,20,20,116 0=116 1=3 3=2 4=1 5=1 6=1044 7=116
Convolution              Conv_93                  1 1 985 533 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
Convolution              Conv_95                  1 1 528_splitncnn_1 536 -23330=4,3,40,40,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_97                  1 1 536 994 -23330=4,3,20,20,116 0=116 1=3 3=2 4=1 5=1 6=1044 7=116
Convolution              Conv_98                  1 1 994 541 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
Concat                   Concat_100               2 1 533 541 542 -23330=4,3,20,20,232
ShuffleChannel           Reshape_105              1 1 542 547 -23330=4,3,20,20,232 0=2
Split                    splitncnn_5              1 2 547 547_splitncnn_0 547_splitncnn_1 -23330=8,3,20,20,232,3,20,20,232
Crop                     Slice_116                1 1 547_splitncnn_1 558 -23330=4,3,20,20,116 -23309=1,0 -23310=1,116 -23311=1,0
Crop                     Slice_119                1 1 547_splitncnn_0 561 -23330=4,3,20,20,116 -23309=1,116 -23310=1,232 -23311=1,0
Convolution              Conv_120                 1 1 561 564 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_122                 1 1 564 1003 -23330=4,3,20,20,116 0=116 1=3 4=1 5=1 6=1044 7=116
Convolution              Conv_123                 1 1 1003 569 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
Concat                   Concat_125               2 1 558 569 570 -23330=4,3,20,20,232
ShuffleChannel           Reshape_130              1 1 570 575 -23330=4,3,20,20,232 0=2
Split                    splitncnn_6              1 2 575 575_splitncnn_0 575_splitncnn_1 -23330=8,3,20,20,232,3,20,20,232
Crop                     Slice_141                1 1 575_splitncnn_1 586 -23330=4,3,20,20,116 -23309=1,0 -23310=1,116 -23311=1,0
Crop                     Slice_144                1 1 575_splitncnn_0 589 -23330=4,3,20,20,116 -23309=1,116 -23310=1,232 -23311=1,0
Convolution              Conv_145                 1 1 589 592 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_147                 1 1 592 1012 -23330=4,3,20,20,116 0=116 1=3 4=1 5=1 6=1044 7=116
Convolution              Conv_148                 1 1 1012 597 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
Concat                   Concat_150               2 1 586 597 598 -23330=4,3,20,20,232
ShuffleChannel           Reshape_155              1 1 598 603 -23330=4,3,20,20,232 0=2
Split                    splitncnn_7              1 2 603 603_splitncnn_0 603_splitncnn_1 -23330=8,3,20,20,232,3,20,20,232
Crop                     Slice_166                1 1 603_splitncnn_1 614 -23330=4,3,20,20,116 -23309=1,0 -23310=1,116 -23311=1,0
Crop                     Slice_169                1 1 603_splitncnn_0 617 -23330=4,3,20,20,116 -23309=1,116 -23310=1,232 -23311=1,0
Convolution              Conv_170                 1 1 617 620 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_172                 1 1 620 1021 -23330=4,3,20,20,116 0=116 1=3 4=1 5=1 6=1044 7=116
Convolution              Conv_173                 1 1 1021 625 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
Concat                   Concat_175               2 1 614 625 626 -23330=4,3,20,20,232
ShuffleChannel           Reshape_180              1 1 626 631 -23330=4,3,20,20,232 0=2
Split                    splitncnn_8              1 2 631 631_splitncnn_0 631_splitncnn_1 -23330=8,3,20,20,232,3,20,20,232
Crop                     Slice_191                1 1 631_splitncnn_1 642 -23330=4,3,20,20,116 -23309=1,0 -23310=1,116 -23311=1,0
Crop                     Slice_194                1 1 631_splitncnn_0 645 -23330=4,3,20,20,116 -23309=1,116 -23310=1,232 -23311=1,0
Convolution              Conv_195                 1 1 645 648 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_197                 1 1 648 1030 -23330=4,3,20,20,116 0=116 1=3 4=1 5=1 6=1044 7=116
Convolution              Conv_198                 1 1 1030 653 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
Concat                   Concat_200               2 1 642 653 654 -23330=4,3,20,20,232
ShuffleChannel           Reshape_205              1 1 654 659 -23330=4,3,20,20,232 0=2
Split                    splitncnn_9              1 2 659 659_splitncnn_0 659_splitncnn_1 -23330=8,3,20,20,232,3,20,20,232
Crop                     Slice_216                1 1 659_splitncnn_1 670 -23330=4,3,20,20,116 -23309=1,0 -23310=1,116 -23311=1,0
Crop                     Slice_219                1 1 659_splitncnn_0 673 -23330=4,3,20,20,116 -23309=1,116 -23310=1,232 -23311=1,0
Convolution              Conv_220                 1 1 673 676 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_222                 1 1 676 1039 -23330=4,3,20,20,116 0=116 1=3 4=1 5=1 6=1044 7=116
Convolution              Conv_223                 1 1 1039 681 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
Concat                   Concat_225               2 1 670 681 682 -23330=4,3,20,20,232
ShuffleChannel           Reshape_230              1 1 682 687 -23330=4,3,20,20,232 0=2
Split                    splitncnn_10             1 2 687 687_splitncnn_0 687_splitncnn_1 -23330=8,3,20,20,232,3,20,20,232
Crop                     Slice_241                1 1 687_splitncnn_1 698 -23330=4,3,20,20,116 -23309=1,0 -23310=1,116 -23311=1,0
Crop                     Slice_244                1 1 687_splitncnn_0 701 -23330=4,3,20,20,116 -23309=1,116 -23310=1,232 -23311=1,0
Convolution              Conv_245                 1 1 701 704 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_247                 1 1 704 1048 -23330=4,3,20,20,116 0=116 1=3 4=1 5=1 6=1044 7=116
Convolution              Conv_248                 1 1 1048 709 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
Concat                   Concat_250               2 1 698 709 710 -23330=4,3,20,20,232
ShuffleChannel           Reshape_255              1 1 710 715 -23330=4,3,20,20,232 0=2
Split                    splitncnn_11             1 2 715 715_splitncnn_0 715_splitncnn_1 -23330=8,3,20,20,232,3,20,20,232
Crop                     Slice_266                1 1 715_splitncnn_1 726 -23330=4,3,20,20,116 -23309=1,0 -23310=1,116 -23311=1,0
Crop                     Slice_269                1 1 715_splitncnn_0 729 -23330=4,3,20,20,116 -23309=1,116 -23310=1,232 -23311=1,0
Convolution              Conv_270                 1 1 729 732 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_272                 1 1 732 1057 -23330=4,3,20,20,116 0=116 1=3 4=1 5=1 6=1044 7=116
Convolution              Conv_273                 1 1 1057 737 -23330=4,3,20,20,116 0=116 1=1 5=1 6=13456 9=2 -23310=1,1.000000e-01
Concat                   Concat_275               2 1 726 737 738 -23330=4,3,20,20,232
ShuffleChannel           Reshape_280              1 1 738 743 -23330=4,3,20,20,232 0=2
Split                    splitncnn_12             1 3 743 743_splitncnn_0 743_splitncnn_1 743_splitncnn_2 -23330=12,3,20,20,232,3,20,20,232,3,20,20,232
ConvolutionDepthWise     Conv_281                 1 1 743_splitncnn_2 1063 -23330=4,3,10,10,232 0=232 1=3 3=2 4=1 5=1 6=2088 7=232
Convolution              Conv_282                 1 1 1063 748 -23330=4,3,10,10,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01
Convolution              Conv_284                 1 1 743_splitncnn_1 751 -23330=4,3,20,20,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_286                 1 1 751 1072 -23330=4,3,10,10,232 0=232 1=3 3=2 4=1 5=1 6=2088 7=232
Convolution              Conv_287                 1 1 1072 756 -23330=4,3,10,10,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01
Concat                   Concat_289               2 1 748 756 757 -23330=4,3,10,10,464
ShuffleChannel           Reshape_294              1 1 757 762 -23330=4,3,10,10,464 0=2
Split                    splitncnn_13             1 2 762 762_splitncnn_0 762_splitncnn_1 -23330=8,3,10,10,464,3,10,10,464
Crop                     Slice_305                1 1 762_splitncnn_1 773 -23330=4,3,10,10,232 -23309=1,0 -23310=1,232 -23311=1,0
Crop                     Slice_308                1 1 762_splitncnn_0 776 -23330=4,3,10,10,232 -23309=1,232 -23310=1,464 -23311=1,0
Convolution              Conv_309                 1 1 776 779 -23330=4,3,10,10,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_311                 1 1 779 1081 -23330=4,3,10,10,232 0=232 1=3 4=1 5=1 6=2088 7=232
Convolution              Conv_312                 1 1 1081 784 -23330=4,3,10,10,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01
Concat                   Concat_314               2 1 773 784 785 -23330=4,3,10,10,464
ShuffleChannel           Reshape_319              1 1 785 790 -23330=4,3,10,10,464 0=2
Split                    splitncnn_14             1 2 790 790_splitncnn_0 790_splitncnn_1 -23330=8,3,10,10,464,3,10,10,464
Crop                     Slice_330                1 1 790_splitncnn_1 801 -23330=4,3,10,10,232 -23309=1,0 -23310=1,232 -23311=1,0
Crop                     Slice_333                1 1 790_splitncnn_0 804 -23330=4,3,10,10,232 -23309=1,232 -23310=1,464 -23311=1,0
Convolution              Conv_334                 1 1 804 807 -23330=4,3,10,10,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_336                 1 1 807 1090 -23330=4,3,10,10,232 0=232 1=3 4=1 5=1 6=2088 7=232
Convolution              Conv_337                 1 1 1090 812 -23330=4,3,10,10,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01
Concat                   Concat_339               2 1 801 812 813 -23330=4,3,10,10,464
ShuffleChannel           Reshape_344              1 1 813 818 -23330=4,3,10,10,464 0=2
Split                    splitncnn_15             1 2 818 818_splitncnn_0 818_splitncnn_1 -23330=8,3,10,10,464,3,10,10,464
Crop                     Slice_355                1 1 818_splitncnn_1 829 -23330=4,3,10,10,232 -23309=1,0 -23310=1,232 -23311=1,0
Crop                     Slice_358                1 1 818_splitncnn_0 832 -23330=4,3,10,10,232 -23309=1,232 -23310=1,464 -23311=1,0
Convolution              Conv_359                 1 1 832 835 -23330=4,3,10,10,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_361                 1 1 835 1099 -23330=4,3,10,10,232 0=232 1=3 4=1 5=1 6=2088 7=232
Convolution              Conv_362                 1 1 1099 840 -23330=4,3,10,10,232 0=232 1=1 5=1 6=53824 9=2 -23310=1,1.000000e-01
Concat                   Concat_364               2 1 829 840 841 -23330=4,3,10,10,464
ShuffleChannel           Reshape_369              1 1 841 846 -23330=4,3,10,10,464 0=2
Convolution              Conv_370                 1 1 528_splitncnn_0 847 -23330=4,3,40,40,96 0=96 1=1 5=1 6=11136
Convolution              Conv_371                 1 1 743_splitncnn_0 848 -23330=4,3,20,20,96 0=96 1=1 5=1 6=22272
Convolution              Conv_372                 1 1 846 849 -23330=4,3,10,10,96 0=96 1=1 5=1 6=44544
Split                    splitncnn_16             1 2 849 849_splitncnn_0 849_splitncnn_1 -23330=8,3,10,10,96,3,10,10,96
Interp                   Resize_374               1 1 849_splitncnn_1 854 -23330=4,3,20,20,96 0=2 1=2.000000e+00 2=2.000000e+00
BinaryOp                 Add_375                  2 1 848 854 855 -23330=4,3,20,20,96
Split                    splitncnn_17             1 2 855 855_splitncnn_0 855_splitncnn_1 -23330=8,3,20,20,96,3,20,20,96
Interp                   Resize_377               1 1 855_splitncnn_1 860 -23330=4,3,40,40,96 0=2 1=2.000000e+00 2=2.000000e+00
BinaryOp                 Add_378                  2 1 847 860 861 -23330=4,3,40,40,96
Split                    splitncnn_18             1 2 861 861_splitncnn_0 861_splitncnn_1 -23330=8,3,40,40,96,3,40,40,96
Interp                   Resize_380               1 1 861_splitncnn_1 866 -23330=4,3,20,20,96 0=2 1=5.000000e-01 2=5.000000e-01
BinaryOp                 Add_381                  2 1 855_splitncnn_0 866 867 -23330=4,3,20,20,96
Split                    splitncnn_19             1 2 867 867_splitncnn_0 867_splitncnn_1 -23330=8,3,20,20,96,3,20,20,96
Interp                   Resize_383               1 1 867_splitncnn_1 872 -23330=4,3,10,10,96 0=2 1=5.000000e-01 2=5.000000e-01
BinaryOp                 Add_384                  2 1 849_splitncnn_0 872 873 -23330=4,3,10,10,96
ConvolutionDepthWise     Conv_385                 1 1 861_splitncnn_0 876 -23330=4,3,40,40,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01
Convolution              Conv_387                 1 1 876 879 -23330=4,3,40,40,96 0=96 1=1 5=1 6=9216 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_389                 1 1 879 882 -23330=4,3,40,40,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01
Convolution              Conv_391                 1 1 882 885 -23330=4,3,40,40,96 0=96 1=1 5=1 6=9216 9=2 -23310=1,1.000000e-01
Convolution              Conv_393                 1 1 885 886 -23330=4,3,40,40,112 0=112 1=1 5=1 6=10752
Slice                    Split_394                1 2 886 887 888 -23330=8,3,40,40,80,3,40,40,32 -23300=2,80,-233
Sigmoid                  Sigmoid_395              1 1 887 889 -23330=4,3,40,40,80
Reshape                  Reshape_397              1 1 889 891 -23330=4,2,1600,80,1 0=-1 1=80
Permute                  Transpose_398            1 1 891 cls_pred_stride_8 -23330=4,2,80,1600,1 0=1
Reshape                  Reshape_400              1 1 888 894 -23330=4,2,1600,32,1 0=-1 1=32
Permute                  Transpose_401            1 1 894 dis_pred_stride_8 -23330=4,2,32,1600,1 0=1
ConvolutionDepthWise     Conv_402                 1 1 867_splitncnn_0 898 -23330=4,3,20,20,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01
Convolution              Conv_404                 1 1 898 901 -23330=4,3,20,20,96 0=96 1=1 5=1 6=9216 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_406                 1 1 901 904 -23330=4,3,20,20,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01
Convolution              Conv_408                 1 1 904 907 -23330=4,3,20,20,96 0=96 1=1 5=1 6=9216 9=2 -23310=1,1.000000e-01
Convolution              Conv_410                 1 1 907 908 -23330=4,3,20,20,112 0=112 1=1 5=1 6=10752
Slice                    Split_411                1 2 908 909 910 -23330=8,3,20,20,80,3,20,20,32 -23300=2,80,-233
Sigmoid                  Sigmoid_412              1 1 909 911 -23330=4,3,20,20,80
Reshape                  Reshape_414              1 1 911 913 -23330=4,2,400,80,1 0=-1 1=80
Permute                  Transpose_415            1 1 913 cls_pred_stride_16 -23330=4,2,80,400,1 0=1
Reshape                  Reshape_417              1 1 910 916 -23330=4,2,400,32,1 0=-1 1=32
Permute                  Transpose_418            1 1 916 dis_pred_stride_16 -23330=4,2,32,400,1 0=1
ConvolutionDepthWise     Conv_419                 1 1 873 920 -23330=4,3,10,10,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01
Convolution              Conv_421                 1 1 920 923 -23330=4,3,10,10,96 0=96 1=1 5=1 6=9216 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     Conv_423                 1 1 923 926 -23330=4,3,10,10,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01
Convolution              Conv_425                 1 1 926 929 -23330=4,3,10,10,96 0=96 1=1 5=1 6=9216 9=2 -23310=1,1.000000e-01
Convolution              Conv_427                 1 1 929 930 -23330=4,3,10,10,112 0=112 1=1 5=1 6=10752
Slice                    Split_428                1 2 930 931 932 -23330=8,3,10,10,80,3,10,10,32 -23300=2,80,-233
Sigmoid                  Sigmoid_429              1 1 931 933 -23330=4,3,10,10,80
Reshape                  Reshape_431              1 1 933 935 -23330=4,2,100,80,1 0=-1 1=80
Permute                  Transpose_432            1 1 935 cls_pred_stride_32 -23330=4,2,80,100,1 0=1
Reshape                  Reshape_434              1 1 932 938 -23330=4,2,100,32,1 0=-1 1=32
Permute                  Transpose_435            1 1 938 dis_pred_stride_32 -23330=4,2,32,100,1 0=1
Noop                     Output                   6 1 cls_pred_stride_8 cls_pred_stride_16 cls_pred_stride_32 dis_pred_stride_8 dis_pred_stride_16 dis_pred_stride_32 output


================================================
FILE: benchmark/proxylessnasnet.param
================================================
7767517
91 104
Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
Convolution              first-3x3-conv           1 1 data first-3x3-conv_relu -23330=4,3,112,112,32 0=32 1=3 3=2 4=1 5=1 6=864 9=1
ConvolutionDepthWise     A0_dw                    1 1 first-3x3-conv_relu A0_dw_relu -23330=4,3,112,112,32 0=32 1=3 4=1 5=1 6=288 7=32 9=1
Convolution              A0_linear                1 1 A0_dw_relu A0_linear_bn -23330=4,3,112,112,32 0=32 1=1 5=1 6=1024
Convolution              B0_expand                1 1 A0_linear_bn B0_expand_relu -23330=4,3,112,112,48 0=48 1=1 5=1 6=1536 9=1
ConvolutionDepthWise     B0_dw                    1 1 B0_expand_relu B0_dw_relu -23330=4,3,56,56,48 0=48 1=5 3=2 4=2 5=1 6=1200 7=48 9=1
Convolution              B0_linear                1 1 B0_dw_relu B0_linear_bn -23330=4,3,56,56,32 0=32 1=1 5=1 6=1536
Split                    splitncnn_0              1 2 B0_linear_bn B0_linear_bn_splitncnn_0 B0_linear_bn_splitncnn_1 -23330=8,3,56,56,32,3,56,56,32
Convolution              B1_expand                1 1 B0_linear_bn_splitncnn_1 B1_expand_relu -23330=4,3,56,56,96 0=96 1=1 5=1 6=3072 9=1
ConvolutionDepthWise     B1_dw                    1 1 B1_expand_relu B1_dw_relu -23330=4,3,56,56,96 0=96 1=3 4=1 5=1 6=864 7=96 9=1
Convolution              B1_linear                1 1 B1_dw_relu B1_linear_bn -23330=4,3,56,56,32 0=32 1=1 5=1 6=3072
BinaryOp                 unknownncnn_0            2 1 B0_linear_bn_splitncnn_0 B1_linear_bn unknownncnn_0 -23330=4,3,56,56,32
Convolution              C0_expand                1 1 unknownncnn_0 C0_expand_relu -23330=4,3,56,56,96 0=96 1=1 5=1 6=3072 9=1
ConvolutionDepthWise     C0_dw                    1 1 C0_expand_relu C0_dw_relu -23330=4,3,28,28,96 0=96 1=7 3=2 4=3 5=1 6=4704 7=96 9=1
Convolution              C0_linear                1 1 C0_dw_relu C0_linear_bn -23330=4,3,28,28,40 0=40 1=1 5=1 6=3840
Split                    splitncnn_1              1 2 C0_linear_bn C0_linear_bn_splitncnn_0 C0_linear_bn_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
Convolution              C1_expand                1 1 C0_linear_bn_splitncnn_1 C1_expand_relu -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
ConvolutionDepthWise     C1_dw                    1 1 C1_expand_relu C1_dw_relu -23330=4,3,28,28,120 0=120 1=3 4=1 5=1 6=1080 7=120 9=1
Convolution              C1_linear                1 1 C1_dw_relu C1_linear_bn -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
BinaryOp                 unknownncnn_1            2 1 C0_linear_bn_splitncnn_0 C1_linear_bn unknownncnn_1 -23330=4,3,28,28,40
Split                    splitncnn_2              1 2 unknownncnn_1 unknownncnn_1_splitncnn_0 unknownncnn_1_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
Convolution              C2_expand                1 1 unknownncnn_1_splitncnn_1 C2_expand_relu -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
ConvolutionDepthWise     C2_dw                    1 1 C2_expand_relu C2_dw_relu -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120 9=1
Convolution              C2_linear                1 1 C2_dw_relu C2_linear_bn -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
BinaryOp                 unknownncnn_2            2 1 unknownncnn_1_splitncnn_0 C2_linear_bn unknownncnn_2 -23330=4,3,28,28,40
Split                    splitncnn_3              1 2 unknownncnn_2 unknownncnn_2_splitncnn_0 unknownncnn_2_splitncnn_1 -23330=8,3,28,28,40,3,28,28,40
Convolution              C3_expand                1 1 unknownncnn_2_splitncnn_1 C3_expand_relu -23330=4,3,28,28,120 0=120 1=1 5=1 6=4800 9=1
ConvolutionDepthWise     C3_dw                    1 1 C3_expand_relu C3_dw_relu -23330=4,3,28,28,120 0=120 1=5 4=2 5=1 6=3000 7=120 9=1
Convolution              C3_linear                1 1 C3_dw_relu C3_linear_bn -23330=4,3,28,28,40 0=40 1=1 5=1 6=4800
BinaryOp                 unknownncnn_3            2 1 unknownncnn_2_splitncnn_0 C3_linear_bn unknownncnn_3 -23330=4,3,28,28,40
Convolution              D0_expand                1 1 unknownncnn_3 D0_expand_relu -23330=4,3,28,28,240 0=240 1=1 5=1 6=9600 9=1
ConvolutionDepthWise     D0_dw                    1 1 D0_expand_relu D0_dw_relu -23330=4,3,14,14,240 0=240 1=7 3=2 4=3 5=1 6=11760 7=240 9=1
Convolution              D0_linear                1 1 D0_dw_relu D0_linear_bn -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200
Split                    splitncnn_4              1 2 D0_linear_bn D0_linear_bn_splitncnn_0 D0_linear_bn_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
Convolution              D1_expand                1 1 D0_linear_bn_splitncnn_1 D1_expand_relu -23330=4,3,14,14,240 0=240 1=1 5=1 6=19200 9=1
ConvolutionDepthWise     D1_dw                    1 1 D1_expand_relu D1_dw_relu -23330=4,3,14,14,240 0=240 1=5 4=2 5=1 6=6000 7=240 9=1
Convolution              D1_linear                1 1 D1_dw_relu D1_linear_bn -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200
BinaryOp                 unknownncnn_4            2 1 D0_linear_bn_splitncnn_0 D1_linear_bn unknownncnn_4 -23330=4,3,14,14,80
Split                    splitncnn_5              1 2 unknownncnn_4 unknownncnn_4_splitncnn_0 unknownncnn_4_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
Convolution              D2_expand                1 1 unknownncnn_4_splitncnn_1 D2_expand_relu -23330=4,3,14,14,240 0=240 1=1 5=1 6=19200 9=1
ConvolutionDepthWise     D2_dw                    1 1 D2_expand_relu D2_dw_relu -23330=4,3,14,14,240 0=240 1=5 4=2 5=1 6=6000 7=240 9=1
Convolution              D2_linear                1 1 D2_dw_relu D2_linear_bn -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200
BinaryOp                 unknownncnn_5            2 1 unknownncnn_4_splitncnn_0 D2_linear_bn unknownncnn_5 -23330=4,3,14,14,80
Split                    splitncnn_6              1 2 unknownncnn_5 unknownncnn_5_splitncnn_0 unknownncnn_5_splitncnn_1 -23330=8,3,14,14,80,3,14,14,80
Convolution              D3_expand                1 1 unknownncnn_5_splitncnn_1 D3_expand_relu -23330=4,3,14,14,240 0=240 1=1 5=1 6=19200 9=1
ConvolutionDepthWise     D3_dw                    1 1 D3_expand_relu D3_dw_relu -23330=4,3,14,14,240 0=240 1=5 4=2 5=1 6=6000 7=240 9=1
Convolution              D3_linear                1 1 D3_dw_relu D3_linear_bn -23330=4,3,14,14,80 0=80 1=1 5=1 6=19200
BinaryOp                 unknownncnn_6            2 1 unknownncnn_5_splitncnn_0 D3_linear_bn unknownncnn_6 -23330=4,3,14,14,80
Convolution              E0_expand                1 1 unknownncnn_6 E0_expand_relu -23330=4,3,14,14,480 0=480 1=1 5=1 6=38400 9=1
ConvolutionDepthWise     E0_dw                    1 1 E0_expand_relu E0_dw_relu -23330=4,3,14,14,480 0=480 1=5 4=2 5=1 6=12000 7=480 9=1
Convolution              E0_linear                1 1 E0_dw_relu E0_linear_bn -23330=4,3,14,14,96 0=96 1=1 5=1 6=46080
Split                    splitncnn_7              1 2 E0_linear_bn E0_linear_bn_splitncnn_0 E0_linear_bn_splitncnn_1 -23330=8,3,14,14,96,3,14,14,96
Convolution              E1_expand                1 1 E0_linear_bn_splitncnn_1 E1_expand_relu -23330=4,3,14,14,288 0=288 1=1 5=1 6=27648 9=1
ConvolutionDepthWise     E1_dw                    1 1 E1_expand_relu E1_dw_relu -23330=4,3,14,14,288 0=288 1=5 4=2 5=1 6=7200 7=288 9=1
Convolution              E1_linear                1 1 E1_dw_relu E1_linear_bn -23330=4,3,14,14,96 0=96 1=1 5=1 6=27648
BinaryOp                 unknownncnn_7            2 1 E0_linear_bn_splitncnn_0 E1_linear_bn unknownncnn_7 -23330=4,3,14,14,96
Split                    splitncnn_8              1 2 unknownncnn_7 unknownncnn_7_splitncnn_0 unknownncnn_7_splitncnn_1 -23330=8,3,14,14,96,3,14,14,96
Convolution              E2_expand                1 1 unknownncnn_7_splitncnn_1 E2_expand_relu -23330=4,3,14,14,288 0=288 1=1 5=1 6=27648 9=1
ConvolutionDepthWise     E2_dw                    1 1 E2_expand_relu E2_dw_relu -23330=4,3,14,14,288 0=288 1=5 4=2 5=1 6=7200 7=288 9=1
Convolution              E2_linear                1 1 E2_dw_relu E2_linear_bn -23330=4,3,14,14,96 0=96 1=1 5=1 6=27648
BinaryOp                 unknownncnn_8            2 1 unknownncnn_7_splitncnn_0 E2_linear_bn unknownncnn_8 -23330=4,3,14,14,96
Split                    splitncnn_9              1 2 unknownncnn_8 unknownncnn_8_splitncnn_0 unknownncnn_8_splitncnn_1 -23330=8,3,14,14,96,3,14,14,96
Convolution              E3_expand                1 1 unknownncnn_8_splitncnn_1 E3_expand_relu -23330=4,3,14,14,288 0=288 1=1 5=1 6=27648 9=1
ConvolutionDepthWise     E3_dw                    1 1 E3_expand_relu E3_dw_relu -23330=4,3,14,14,288 0=288 1=5 4=2 5=1 6=7200 7=288 9=1
Convolution              E3_linear                1 1 E3_dw_relu E3_linear_bn -23330=4,3,14,14,96 0=96 1=1 5=1 6=27648
BinaryOp                 unknownncnn_9            2 1 unknownncnn_8_splitncnn_0 E3_linear_bn unknownncnn_9 -23330=4,3,14,14,96
Convolution              F0_expand                1 1 unknownncnn_9 F0_expand_relu -23330=4,3,14,14,576 0=576 1=1 5=1 6=55296 9=1
ConvolutionDepthWise     F0_dw                    1 1 F0_expand_relu F0_dw_relu -23330=4,3,7,7,576 0=576 1=7 3=2 4=3 5=1 6=28224 7=576 9=1
Convolution              F0_linear                1 1 F0_dw_relu F0_linear_bn -23330=4,3,7,7,192 0=192 1=1 5=1 6=110592
Split                    splitncnn_10             1 2 F0_linear_bn F0_linear_bn_splitncnn_0 F0_linear_bn_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
Convolution              F1_expand                1 1 F0_linear_bn_splitncnn_1 F1_expand_relu -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184 9=1
ConvolutionDepthWise     F1_dw                    1 1 F1_expand_relu F1_dw_relu -23330=4,3,7,7,1152 0=1152 1=7 4=3 5=1 6=56448 7=1152 9=1
Convolution              F1_linear                1 1 F1_dw_relu F1_linear_bn -23330=4,3,7,7,192 0=192 1=1 5=1 6=221184
BinaryOp                 unknownncnn_10           2 1 F0_linear_bn_splitncnn_0 F1_linear_bn unknownncnn_10 -23330=4,3,7,7,192
Split                    splitncnn_11             1 2 unknownncnn_10 unknownncnn_10_splitncnn_0 unknownncnn_10_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
Convolution              F2_expand                1 1 unknownncnn_10_splitncnn_1 F2_expand_relu -23330=4,3,7,7,576 0=576 1=1 5=1 6=110592 9=1
ConvolutionDepthWise     F2_dw                    1 1 F2_expand_relu F2_dw_relu -23330=4,3,7,7,576 0=576 1=7 4=3 5=1 6=28224 7=576 9=1
Convolution              F2_linear                1 1 F2_dw_relu F2_linear_bn -23330=4,3,7,7,192 0=192 1=1 5=1 6=110592
BinaryOp                 unknownncnn_11           2 1 unknownncnn_10_splitncnn_0 F2_linear_bn unknownncnn_11 -23330=4,3,7,7,192
Split                    splitncnn_12             1 2 unknownncnn_11 unknownncnn_11_splitncnn_0 unknownncnn_11_splitncnn_1 -23330=8,3,7,7,192,3,7,7,192
Convolution              F3_expand                1 1 unknownncnn_11_splitncnn_1 F3_expand_relu -23330=4,3,7,7,576 0=576 1=1 5=1 6=110592 9=1
ConvolutionDepthWise     F3_dw                    1 1 F3_expand_relu F3_dw_relu -23330=4,3,7,7,576 0=576 1=7 4=3 5=1 6=28224 7=576 9=1
Convolution              F3_linear                1 1 F3_dw_relu F3_linear_bn -23330=4,3,7,7,192 0=192 1=1 5=1 6=110592
BinaryOp                 unknownncnn_12           2 1 unknownncnn_11_splitncnn_0 F3_linear_bn unknownncnn_12 -23330=4,3,7,7,192
Convolution              G0_expand                1 1 unknownncnn_12 G0_expand_relu -23330=4,3,7,7,1152 0=1152 1=1 5=1 6=221184 9=1
ConvolutionDepthWise     G0_dw                    1 1 G0_expand_relu G0_dw_relu -23330=4,3,7,7,1152 0=1152 1=7 4=3 5=1 6=56448 7=1152 9=1
Convolution              G0_linear                1 1 G0_dw_relu G0_linear_bn -23330=4,3,7,7,320 0=320 1=1 5=1 6=368640
Convolution              last-1x1-conv            1 1 G0_linear_bn last-1x1-conv_relu -23330=4,3,7,7,1280 0=1280 1=1 5=1 6=409600 9=1
Pooling                  avgpool                  1 1 last-1x1-conv_relu flatten -23330=4,1,1280,1,1 0=1 1=7 4=1 5=1
InnerProduct             fc                       1 1 flatten fc -23330=4,1,1000,1,1 0=1000 1=1 2=1280000
Softmax                  prob                     1 1 fc output -23330=4,1,1000,1,1


================================================
FILE: benchmark/regnety_400m.param
================================================
7767517
185 217
Input                    input.1                  0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
Convolution              Conv_0                   1 1 data 387 -23330=4,3,112,112,32 0=32 1=3 3=2 4=1 5=1 6=864 9=1
Split                    splitncnn_0              1 2 387 387_splitncnn_0 387_splitncnn_1 -23330=8,3,112,112,32,3,112,112,32
Convolution              Conv_3                   1 1 387_splitncnn_1 389 -23330=4,3,56,56,48 0=48 1=1 3=2 5=1 6=1536
Convolution              Conv_5                   1 1 387_splitncnn_0 392 -23330=4,3,112,112,48 0=48 1=1 5=1 6=1536 9=1
ConvolutionDepthWise     Conv_8                   1 1 392 395 -23330=4,3,56,56,48 0=48 1=3 3=2 4=1 5=1 6=3456 7=6 9=1
Split                    splitncnn_1              1 2 395 395_splitncnn_0 395_splitncnn_1 -23330=8,3,56,56,48,3,56,56,48
Pooling                  GlobalAveragePool_11     1 1 395_splitncnn_1 396 -23330=4,1,48,1,1 0=1 4=1
InnerProduct             Conv_12                  1 1 396 398 -23330=4,1,8,1,1 0=8 1=1 2=384 9=1
InnerProduct             Conv_14                  1 1 398 400 -23330=4,1,48,1,1 0=48 1=1 2=384 9=4
BinaryOp                 Mul_16                   2 1 395_splitncnn_0 400 401 -23330=4,3,56,56,48 0=2
Convolution              Conv_17                  1 1 401 403 -23330=4,3,56,56,48 0=48 1=1 5=1 6=2304
BinaryOp                 Add_19                   2 1 389 403 404 -23330=4,3,56,56,48
ReLU                     Relu_20                  1 1 404 405 -23330=4,3,56,56,48
Split                    splitncnn_2              1 2 405 405_splitncnn_0 405_splitncnn_1 -23330=8,3,56,56,48,3,56,56,48
Convolution              Conv_21                  1 1 405_splitncnn_1 407 -23330=4,3,28,28,104 0=104 1=1 3=2 5=1 6=4992
Convolution              Conv_23                  1 1 405_splitncnn_0 410 -23330=4,3,56,56,104 0=104 1=1 5=1 6=4992 9=1
ConvolutionDepthWise     Conv_26                  1 1 410 413 -23330=4,3,28,28,104 0=104 1=3 3=2 4=1 5=1 6=7488 7=13 9=1
Split                    splitncnn_3              1 2 413 413_splitncnn_0 413_splitncnn_1 -23330=8,3,28,28,104,3,28,28,104
Pooling                  GlobalAveragePool_29     1 1 413_splitncnn_1 414 -23330=4,1,104,1,1 0=1 4=1
InnerProduct             Conv_30                  1 1 414 416 -23330=4,1,12,1,1 0=12 1=1 2=1248 9=1
InnerProduct             Conv_32                  1 1 416 418 -23330=4,1,104,1,1 0=104 1=1 2=1248 9=4
BinaryOp                 Mul_34                   2 1 413_splitncnn_0 418 419 -23330=4,3,28,28,104 0=2
Convolution              Conv_35                  1 1 419 421 -23330=4,3,28,28,104 0=104 1=1 5=1 6=10816
BinaryOp                 Add_37                   2 1 407 421 422 -23330=4,3,28,28,104
ReLU                     Relu_38                  1 1 422 423 -23330=4,3,28,28,104
Split                    splitncnn_4              1 2 423 423_splitncnn_0 423_splitncnn_1 -23330=8,3,28,28,104,3,28,28,104
Convolution              Conv_39                  1 1 423_splitncnn_1 426 -23330=4,3,28,28,104 0=104 1=1 5=1 6=10816 9=1
ConvolutionDepthWise     Conv_42                  1 1 426 429 -23330=4,3,28,28,104 0=104 1=3 4=1 5=1 6=7488 7=13 9=1
Split                    splitncnn_5              1 2 429 429_splitncnn_0 429_splitncnn_1 -23330=8,3,28,28,104,3,28,28,104
Pooling                  GlobalAveragePool_45     1 1 429_splitncnn_1 430 -23330=4,1,104,1,1 0=1 4=1
InnerProduct             Conv_46                  1 1 430 432 -23330=4,1,26,1,1 0=26 1=1 2=2704 9=1
InnerProduct             Conv_48                  1 1 432 434 -23330=4,1,104,1,1 0=104 1=1 2=2704 9=4
BinaryOp                 Mul_50                   2 1 429_splitncnn_0 434 435 -23330=4,3,28,28,104 0=2
Convolution              Conv_51                  1 1 435 437 -23330=4,3,28,28,104 0=104 1=1 5=1 6=10816
BinaryOp                 Add_53                   2 1 423_splitncnn_0 437 438 -23330=4,3,28,28,104
ReLU                     Relu_54                  1 1 438 439 -23330=4,3,28,28,104
Split                    splitncnn_6              1 2 439 439_splitncnn_0 439_splitncnn_1 -23330=8,3,28,28,104,3,28,28,104
Convolution              Conv_55                  1 1 439_splitncnn_1 442 -23330=4,3,28,28,104 0=104 1=1 5=1 6=10816 9=1
ConvolutionDepthWise     Conv_58                  1 1 442 445 -23330=4,3,28,28,104 0=104 1=3 4=1 5=1 6=7488 7=13 9=1
Split                    splitncnn_7              1 2 445 445_splitncnn_0 445_splitncnn_1 -23330=8,3,28,28,104,3,28,28,104
Pooling                  GlobalAveragePool_61     1 1 445_splitncnn_1 446 -23330=4,1,104,1,1 0=1 4=1
InnerProduct             Conv_62                  1 1 446 448 -23330=4,1,26,1,1 0=26 1=1 2=2704 9=1
InnerProduct             Conv_64                  1 1 448 450 -23330=4,1,104,1,1 0=104 1=1 2=2704 9=4
BinaryOp                 Mul_66                   2 1 445_splitncnn_0 450 451 -23330=4,3,28,28,104 0=2
Convolution              Conv_67                  1 1 451 453 -23330=4,3,28,28,104 0=104 1=1 5=1 6=10816
BinaryOp                 Add_69                   2 1 439_splitncnn_0 453 454 -23330=4,3,28,28,104
ReLU                     Relu_70                  1 1 454 455 -23330=4,3,28,28,104
Split                    splitncnn_8              1 2 455 455_splitncnn_0 455_splitncnn_1 -23330=8,3,28,28,104,3,28,28,104
Convolution              Conv_71                  1 1 455_splitncnn_1 457 -23330=4,3,14,14,208 0=208 1=1 3=2 5=1 6=21632
Convolution              Conv_73                  1 1 455_splitncnn_0 460 -23330=4,3,28,28,208 0=208 1=1 5=1 6=21632 9=1
ConvolutionDepthWise     Conv_76                  1 1 460 463 -23330=4,3,14,14,208 0=208 1=3 3=2 4=1 5=1 6=14976 7=26 9=1
Split                    splitncnn_9              1 2 463 463_splitncnn_0 463_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208
Pooling                  GlobalAveragePool_79     1 1 463_splitncnn_1 464 -23330=4,1,208,1,1 0=1 4=1
InnerProduct             Conv_80                  1 1 464 466 -23330=4,1,26,1,1 0=26 1=1 2=5408 9=1
InnerProduct             Conv_82                  1 1 466 468 -23330=4,1,208,1,1 0=208 1=1 2=5408 9=4
BinaryOp                 Mul_84                   2 1 463_splitncnn_0 468 469 -23330=4,3,14,14,208 0=2
Convolution              Conv_85                  1 1 469 471 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264
BinaryOp                 Add_87                   2 1 457 471 472 -23330=4,3,14,14,208
ReLU                     Relu_88                  1 1 472 473 -23330=4,3,14,14,208
Split                    splitncnn_10             1 2 473 473_splitncnn_0 473_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208
Convolution              Conv_89                  1 1 473_splitncnn_1 476 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264 9=1
ConvolutionDepthWise     Conv_92                  1 1 476 479 -23330=4,3,14,14,208 0=208 1=3 4=1 5=1 6=14976 7=26 9=1
Split                    splitncnn_11             1 2 479 479_splitncnn_0 479_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208
Pooling                  GlobalAveragePool_95     1 1 479_splitncnn_1 480 -23330=4,1,208,1,1 0=1 4=1
InnerProduct             Conv_96                  1 1 480 482 -23330=4,1,52,1,1 0=52 1=1 2=10816 9=1
InnerProduct             Conv_98                  1 1 482 484 -23330=4,1,208,1,1 0=208 1=1 2=10816 9=4
BinaryOp                 Mul_100                  2 1 479_splitncnn_0 484 485 -23330=4,3,14,14,208 0=2
Convolution              Conv_101                 1 1 485 487 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264
BinaryOp                 Add_103                  2 1 473_splitncnn_0 487 488 -23330=4,3,14,14,208
ReLU                     Relu_104                 1 1 488 489 -23330=4,3,14,14,208
Split                    splitncnn_12             1 2 489 489_splitncnn_0 489_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208
Convolution              Conv_105                 1 1 489_splitncnn_1 492 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264 9=1
ConvolutionDepthWise     Conv_108                 1 1 492 495 -23330=4,3,14,14,208 0=208 1=3 4=1 5=1 6=14976 7=26 9=1
Split                    splitncnn_13             1 2 495 495_splitncnn_0 495_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208
Pooling                  GlobalAveragePool_111    1 1 495_splitncnn_1 496 -23330=4,1,208,1,1 0=1 4=1
InnerProduct             Conv_112                 1 1 496 498 -23330=4,1,52,1,1 0=52 1=1 2=10816 9=1
InnerProduct             Conv_114                 1 1 498 500 -23330=4,1,208,1,1 0=208 1=1 2=10816 9=4
BinaryOp                 Mul_116                  2 1 495_splitncnn_0 500 501 -23330=4,3,14,14,208 0=2
Convolution              Conv_117                 1 1 501 503 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264
BinaryOp                 Add_119                  2 1 489_splitncnn_0 503 504 -23330=4,3,14,14,208
ReLU                     Relu_120                 1 1 504 505 -23330=4,3,14,14,208
Split                    splitncnn_14             1 2 505 505_splitncnn_0 505_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208
Convolution              Conv_121                 1 1 505_splitncnn_1 508 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264 9=1
ConvolutionDepthWise     Conv_124                 1 1 508 511 -23330=4,3,14,14,208 0=208 1=3 4=1 5=1 6=14976 7=26 9=1
Split                    splitncnn_15             1 2 511 511_splitncnn_0 511_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208
Pooling                  GlobalAveragePool_127    1 1 511_splitncnn_1 512 -23330=4,1,208,1,1 0=1 4=1
InnerProduct             Conv_128                 1 1 512 514 -23330=4,1,52,1,1 0=52 1=1 2=10816 9=1
InnerProduct             Conv_130                 1 1 514 516 -23330=4,1,208,1,1 0=208 1=1 2=10816 9=4
BinaryOp                 Mul_132                  2 1 511_splitncnn_0 516 517 -23330=4,3,14,14,208 0=2
Convolution              Conv_133                 1 1 517 519 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264
BinaryOp                 Add_135                  2 1 505_splitncnn_0 519 520 -23330=4,3,14,14,208
ReLU                     Relu_136                 1 1 520 521 -23330=4,3,14,14,208
Split                    splitncnn_16             1 2 521 521_splitncnn_0 521_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208
Convolution              Conv_137                 1 1 521_splitncnn_1 524 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264 9=1
ConvolutionDepthWise     Conv_140                 1 1 524 527 -23330=4,3,14,14,208 0=208 1=3 4=1 5=1 6=14976 7=26 9=1
Split                    splitncnn_17             1 2 527 527_splitncnn_0 527_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208
Pooling                  GlobalAveragePool_143    1 1 527_splitncnn_1 528 -23330=4,1,208,1,1 0=1 4=1
InnerProduct             Conv_144                 1 1 528 530 -23330=4,1,52,1,1 0=52 1=1 2=10816 9=1
InnerProduct             Conv_146                 1 1 530 532 -23330=4,1,208,1,1 0=208 1=1 2=10816 9=4
BinaryOp                 Mul_148                  2 1 527_splitncnn_0 532 533 -23330=4,3,14,14,208 0=2
Convolution              Conv_149                 1 1 533 535 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264
BinaryOp                 Add_151                  2 1 521_splitncnn_0 535 536 -23330=4,3,14,14,208
ReLU                     Relu_152                 1 1 536 537 -23330=4,3,14,14,208
Split                    splitncnn_18             1 2 537 537_splitncnn_0 537_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208
Convolution              Conv_153                 1 1 537_splitncnn_1 540 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264 9=1
ConvolutionDepthWise     Conv_156                 1 1 540 543 -23330=4,3,14,14,208 0=208 1=3 4=1 5=1 6=14976 7=26 9=1
Split                    splitncnn_19             1 2 543 543_splitncnn_0 543_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208
Pooling                  GlobalAveragePool_159    1 1 543_splitncnn_1 544 -23330=4,1,208,1,1 0=1 4=1
InnerProduct             Conv_160                 1 1 544 546 -23330=4,1,52,1,1 0=52 1=1 2=10816 9=1
InnerProduct             Conv_162                 1 1 546 548 -23330=4,1,208,1,1 0=208 1=1 2=10816 9=4
BinaryOp                 Mul_164                  2 1 543_splitncnn_0 548 549 -23330=4,3,14,14,208 0=2
Convolution              Conv_165                 1 1 549 551 -23330=4,3,14,14,208 0=208 1=1 5=1 6=43264
BinaryOp                 Add_167                  2 1 537_splitncnn_0 551 552 -23330=4,3,14,14,208
ReLU                     Relu_168                 1 1 552 553 -23330=4,3,14,14,208
Split                    splitncnn_20             1 2 553 553_splitncnn_0 553_splitncnn_1 -23330=8,3,14,14,208,3,14,14,208
Convolution              Conv_169                 1 1 553_splitncnn_1 555 -23330=4,3,7,7,440 0=440 1=1 3=2 5=1 6=91520
Convolution              Conv_171                 1 1 553_splitncnn_0 558 -23330=4,3,14,14,440 0=440 1=1 5=1 6=91520 9=1
ConvolutionDepthWise     Conv_174                 1 1 558 561 -23330=4,3,7,7,440 0=440 1=3 3=2 4=1 5=1 6=31680 7=55 9=1
Split                    splitncnn_21             1 2 561 561_splitncnn_0 561_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440
Pooling                  GlobalAveragePool_177    1 1 561_splitncnn_1 562 -23330=4,1,440,1,1 0=1 4=1
InnerProduct             Conv_178                 1 1 562 564 -23330=4,1,52,1,1 0=52 1=1 2=22880 9=1
InnerProduct             Conv_180                 1 1 564 566 -23330=4,1,440,1,1 0=440 1=1 2=22880 9=4
BinaryOp                 Mul_182                  2 1 561_splitncnn_0 566 567 -23330=4,3,7,7,440 0=2
Convolution              Conv_183                 1 1 567 569 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600
BinaryOp                 Add_185                  2 1 555 569 570 -23330=4,3,7,7,440
ReLU                     Relu_186                 1 1 570 571 -23330=4,3,7,7,440
Split                    splitncnn_22             1 2 571 571_splitncnn_0 571_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440
Convolution              Conv_187                 1 1 571_splitncnn_1 574 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600 9=1
ConvolutionDepthWise     Conv_190                 1 1 574 577 -23330=4,3,7,7,440 0=440 1=3 4=1 5=1 6=31680 7=55 9=1
Split                    splitncnn_23             1 2 577 577_splitncnn_0 577_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440
Pooling                  GlobalAveragePool_193    1 1 577_splitncnn_1 578 -23330=4,1,440,1,1 0=1 4=1
InnerProduct             Conv_194                 1 1 578 580 -23330=4,1,110,1,1 0=110 1=1 2=48400 9=1
InnerProduct             Conv_196                 1 1 580 582 -23330=4,1,440,1,1 0=440 1=1 2=48400 9=4
BinaryOp                 Mul_198                  2 1 577_splitncnn_0 582 583 -23330=4,3,7,7,440 0=2
Convolution              Conv_199                 1 1 583 585 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600
BinaryOp                 Add_201                  2 1 571_splitncnn_0 585 586 -23330=4,3,7,7,440
ReLU                     Relu_202                 1 1 586 587 -23330=4,3,7,7,440
Split                    splitncnn_24             1 2 587 587_splitncnn_0 587_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440
Convolution              Conv_203                 1 1 587_splitncnn_1 590 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600 9=1
ConvolutionDepthWise     Conv_206                 1 1 590 593 -23330=4,3,7,7,440 0=440 1=3 4=1 5=1 6=31680 7=55 9=1
Split                    splitncnn_25             1 2 593 593_splitncnn_0 593_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440
Pooling                  GlobalAveragePool_209    1 1 593_splitncnn_1 594 -23330=4,1,440,1,1 0=1 4=1
InnerProduct             Conv_210                 1 1 594 596 -23330=4,1,110,1,1 0=110 1=1 2=48400 9=1
InnerProduct             Conv_212                 1 1 596 598 -23330=4,1,440,1,1 0=440 1=1 2=48400 9=4
BinaryOp                 Mul_214                  2 1 593_splitncnn_0 598 599 -23330=4,3,7,7,440 0=2
Convolution              Conv_215                 1 1 599 601 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600
BinaryOp                 Add_217                  2 1 587_splitncnn_0 601 602 -23330=4,3,7,7,440
ReLU                     Relu_218                 1 1 602 603 -23330=4,3,7,7,440
Split                    splitncnn_26             1 2 603 603_splitncnn_0 603_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440
Convolution              Conv_219                 1 1 603_splitncnn_1 606 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600 9=1
ConvolutionDepthWise     Conv_222                 1 1 606 609 -23330=4,3,7,7,440 0=440 1=3 4=1 5=1 6=31680 7=55 9=1
Split                    splitncnn_27             1 2 609 609_splitncnn_0 609_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440
Pooling                  GlobalAveragePool_225    1 1 609_splitncnn_1 610 -23330=4,1,440,1,1 0=1 4=1
InnerProduct             Conv_226                 1 1 610 612 -23330=4,1,110,1,1 0=110 1=1 2=48400 9=1
InnerProduct             Conv_228                 1 1 612 614 -23330=4,1,440,1,1 0=440 1=1 2=48400 9=4
BinaryOp                 Mul_230                  2 1 609_splitncnn_0 614 615 -23330=4,3,7,7,440 0=2
Convolution              Conv_231                 1 1 615 617 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600
BinaryOp                 Add_233                  2 1 603_splitncnn_0 617 618 -23330=4,3,7,7,440
ReLU                     Relu_234                 1 1 618 619 -23330=4,3,7,7,440
Split                    splitncnn_28             1 2 619 619_splitncnn_0 619_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440
Convolution              Conv_235                 1 1 619_splitncnn_1 622 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600 9=1
ConvolutionDepthWise     Conv_238                 1 1 622 625 -23330=4,3,7,7,440 0=440 1=3 4=1 5=1 6=31680 7=55 9=1
Split                    splitncnn_29             1 2 625 625_splitncnn_0 625_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440
Pooling                  GlobalAveragePool_241    1 1 625_splitncnn_1 626 -23330=4,1,440,1,1 0=1 4=1
InnerProduct             Conv_242                 1 1 626 628 -23330=4,1,110,1,1 0=110 1=1 2=48400 9=1
InnerProduct             Conv_244                 1 1 628 630 -23330=4,1,440,1,1 0=440 1=1 2=48400 9=4
BinaryOp                 Mul_246                  2 1 625_splitncnn_0 630 631 -23330=4,3,7,7,440 0=2
Convolution              Conv_247                 1 1 631 633 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600
BinaryOp                 Add_249                  2 1 619_splitncnn_0 633 634 -23330=4,3,7,7,440
ReLU                     Relu_250                 1 1 634 635 -23330=4,3,7,7,440
Split                    splitncnn_30             1 2 635 635_splitncnn_0 635_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440
Convolution              Conv_251                 1 1 635_splitncnn_1 638 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600 9=1
ConvolutionDepthWise     Conv_254                 1 1 638 641 -23330=4,3,7,7,440 0=440 1=3 4=1 5=1 6=31680 7=55 9=1
Split                    splitncnn_31             1 2 641 641_splitncnn_0 641_splitncnn_1 -23330=8,3,7,7,440,3,7,7,440
Pooling                  GlobalAveragePool_257    1 1 641_splitncnn_1 642 -23330=4,1,440,1,1 0=1 4=1
InnerProduct             Conv_258                 1 1 642 644 -23330=4,1,110,1,1 0=110 1=1 2=48400 9=1
InnerProduct             Conv_260                 1 1 644 646 -23330=4,1,440,1,1 0=440 1=1 2=48400 9=4
BinaryOp                 Mul_262                  2 1 641_splitncnn_0 646 647 -23330=4,3,7,7,440 0=2
Convolution              Conv_263                 1 1 647 649 -23330=4,3,7,7,440 0=440 1=1 5=1 6=193600
BinaryOp                 Add_265                  2 1 635_splitncnn_0 649 650 -23330=4,3,7,7,440
ReLU                     Relu_266                 1 1 650 651 -23330=4,3,7,7,440
Pooling                  GlobalAveragePool_267    1 1 651 660 -23330=4,1,440,1,1 0=1 4=1
InnerProduct             Gemm_274                 1 1 660 661 -23330=4,1,1000,1,1 0=1000 1=1 2=440000
Softmax                  prob                     1 1 661 output -23330=4,1,1000,1,1


================================================
FILE: benchmark/resnet18.param
================================================
7767517
50 58
Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
Convolution              conv1                    1 1 data conv1_conv1_relu -23330=4,3,112,112,64 0=64 1=7 3=2 4=3 5=1 6=9408 9=1
Pooling                  pool1                    1 1 conv1_conv1_relu pool1 -23330=4,3,56,56,64 1=3 2=2
Split                    splitncnn_0              1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 -23330=8,3,56,56,64,3,56,56,64
Convolution              res2a_branch1            1 1 pool1_splitncnn_1 res2a_branch1_scale2a_branch1 -23330=4,3,56,56,64 0=64 1=1 5=1 6=4096
Convolution              res2a_branch2a           1 1 pool1_splitncnn_0 res2a_branch2a_res2a_branch2a_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 9=1
Convolution              res2a_branch2b           1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b_scale2a_branch2b -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864
Eltwise                  res2a                    2 1 res2a_branch1_scale2a_branch1 res2a_branch2b_scale2a_branch2b res2a -23330=4,3,56,56,64 0=1
ReLU                     res2a_relu               1 1 res2a res2a_res2a_relu -23330=4,3,56,56,64
Split                    splitncnn_1              1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 -23330=8,3,56,56,64,3,56,56,64
Convolution              res2b_branch2a           1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a_res2b_branch2a_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 9=1
Convolution              res2b_branch2b           1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b_scale2b_branch2b -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864
Eltwise                  res2b                    2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2b_scale2b_branch2b res2b -23330=4,3,56,56,64 0=1
ReLU                     res2b_relu               1 1 res2b res2b_res2b_relu -23330=4,3,56,56,64
Split                    splitncnn_2              1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 -23330=8,3,56,56,64,3,56,56,64
Convolution              res3a_branch1            1 1 res2b_res2b_relu_splitncnn_1 res3a_branch1_scale3a_branch1 -23330=4,3,28,28,128 0=128 1=1 3=2 5=1 6=8192
Convolution              res3a_branch2a           1 1 res2b_res2b_relu_splitncnn_0 res3a_branch2a_res3a_branch2a_relu -23330=4,3,28,28,128 0=128 1=3 3=2 4=1 5=1 6=73728 9=1
Convolution              res3a_branch2b           1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b_scale3a_branch2b -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456
Eltwise                  res3a                    2 1 res3a_branch1_scale3a_branch1 res3a_branch2b_scale3a_branch2b res3a -23330=4,3,28,28,128 0=1
ReLU                     res3a_relu               1 1 res3a res3a_res3a_relu -23330=4,3,28,28,128
Split                    splitncnn_3              1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 -23330=8,3,28,28,128,3,28,28,128
Convolution              res3b_branch2a           1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a_res3b_branch2a_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 9=1
Convolution              res3b_branch2b           1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b_scale3b_branch2b -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456
Eltwise                  res3b                    2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2b_scale3b_branch2b res3b -23330=4,3,28,28,128 0=1
ReLU                     res3b_relu               1 1 res3b res3b_res3b_relu -23330=4,3,28,28,128
Split                    splitncnn_4              1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 -23330=8,3,28,28,128,3,28,28,128
Convolution              res4a_branch1            1 1 res3b_res3b_relu_splitncnn_1 res4a_branch1_scale4a_branch1 -23330=4,3,14,14,256 0=256 1=1 3=2 5=1 6=32768
Convolution              res4a_branch2a           1 1 res3b_res3b_relu_splitncnn_0 res4a_branch2a_res4a_branch2a_relu -23330=4,3,14,14,256 0=256 1=3 3=2 4=1 5=1 6=294912 9=1
Convolution              res4a_branch2b           1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b_scale4a_branch2b -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824
Eltwise                  res4a                    2 1 res4a_branch1_scale4a_branch1 res4a_branch2b_scale4a_branch2b res4a -23330=4,3,14,14,256 0=1
ReLU                     res4a_relu               1 1 res4a res4a_res4a_relu -23330=4,3,14,14,256
Split                    splitncnn_5              1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 -23330=8,3,14,14,256,3,14,14,256
Convolution              res4b_branch2a           1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a_res4b_branch2a_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 9=1
Convolution              res4b_branch2b           1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b_scale4b_branch2b -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824
Eltwise                  res4b                    2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2b_scale4b_branch2b res4b -23330=4,3,14,14,256 0=1
ReLU                     res4b_relu               1 1 res4b res4b_res4b_relu -23330=4,3,14,14,256
Split                    splitncnn_6              1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 -23330=8,3,14,14,256,3,14,14,256
Convolution              res5a_branch1            1 1 res4b_res4b_relu_splitncnn_1 res5a_branch1_scale5a_branch1 -23330=4,3,7,7,512 0=512 1=1 3=2 5=1 6=131072
Convolution              res5a_branch2a           1 1 res4b_res4b_relu_splitncnn_0 res5a_branch2a_res5a_branch2a_relu -23330=4,3,7,7,512 0=512 1=3 3=2 4=1 5=1 6=1179648 9=1
Convolution              res5a_branch2b           1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b_scale5a_branch2b -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296
Eltwise                  res5a                    2 1 res5a_branch1_scale5a_branch1 res5a_branch2b_scale5a_branch2b res5a -23330=4,3,7,7,512 0=1
ReLU                     res5a_relu               1 1 res5a res5a_res5a_relu -23330=4,3,7,7,512
Split                    splitncnn_7              1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 -23330=8,3,7,7,512,3,7,7,512
Convolution              res5b_branch2a           1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a_res5b_branch2a_relu -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 9=1
Convolution              res5b_branch2b           1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b_scale5b_branch2b -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296
Eltwise                  res5b                    2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2b_scale5b_branch2b res5b -23330=4,3,7,7,512 0=1
ReLU                     res5b_relu               1 1 res5b res5b_res5b_relu -23330=4,3,7,7,512
Pooling                  pool5                    1 1 res5b_res5b_relu pool5 -23330=4,3,1,1,512 0=1 1=7
InnerProduct             fc1000                   1 1 pool5 fc1000 -23330=4,1,1000,1,1 0=1000 1=1 2=512000
Softmax                  prob                     1 1 fc1000 output -23330=4,1,1000,1,1


================================================
FILE: benchmark/resnet18_int8.param
================================================
7767517
50 58
Input                    data                     0 1 data 0=224 1=224 2=3
Convolution              conv1                    1 1 data conv1_conv1_relu 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1
Pooling                  pool1                    1 1 conv1_conv1_relu pool1 1=3 2=2
Split                    splitncnn_0              1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1
Convolution              res2a_branch1            1 1 pool1_splitncnn_1 res2a_branch1_scale2a_branch1 0=64 1=1 5=1 6=4096 8=2
Convolution              res2a_branch2a           1 1 pool1_splitncnn_0 res2a_branch2a_res2a_branch2a_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1
Convolution              res2a_branch2b           1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b_scale2a_branch2b 0=64 1=3 4=1 5=1 6=36864 8=2
Eltwise                  res2a                    2 1 res2a_branch1_scale2a_branch1 res2a_branch2b_scale2a_branch2b res2a 0=1
ReLU                     res2a_relu               1 1 res2a res2a_res2a_relu
Split                    splitncnn_1              1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1
Convolution              res2b_branch2a           1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a_res2b_branch2a_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1
Convolution              res2b_branch2b           1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b_scale2b_branch2b 0=64 1=3 4=1 5=1 6=36864 8=2
Eltwise                  res2b                    2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2b_scale2b_branch2b res2b 0=1
ReLU                     res2b_relu               1 1 res2b res2b_res2b_relu
Split                    splitncnn_2              1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1
Convolution              res3a_branch1            1 1 res2b_res2b_relu_splitncnn_1 res3a_branch1_scale3a_branch1 0=128 1=1 3=2 5=1 6=8192 8=2
Convolution              res3a_branch2a           1 1 res2b_res2b_relu_splitncnn_0 res3a_branch2a_res3a_branch2a_relu 0=128 1=3 3=2 4=1 5=1 6=73728 8=102 9=1
Convolution              res3a_branch2b           1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b_scale3a_branch2b 0=128 1=3 4=1 5=1 6=147456 8=2
Eltwise                  res3a                    2 1 res3a_branch1_scale3a_branch1 res3a_branch2b_scale3a_branch2b res3a 0=1
ReLU                     res3a_relu               1 1 res3a res3a_res3a_relu
Split                    splitncnn_3              1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1
Convolution              res3b_branch2a           1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a_res3b_branch2a_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1
Convolution              res3b_branch2b           1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b_scale3b_branch2b 0=128 1=3 4=1 5=1 6=147456 8=2
Eltwise                  res3b                    2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2b_scale3b_branch2b res3b 0=1
ReLU                     res3b_relu               1 1 res3b res3b_res3b_relu
Split                    splitncnn_4              1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1
Convolution              res4a_branch1            1 1 res3b_res3b_relu_splitncnn_1 res4a_branch1_scale4a_branch1 0=256 1=1 3=2 5=1 6=32768 8=2
Convolution              res4a_branch2a           1 1 res3b_res3b_relu_splitncnn_0 res4a_branch2a_res4a_branch2a_relu 0=256 1=3 3=2 4=1 5=1 6=294912 8=102 9=1
Convolution              res4a_branch2b           1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b_scale4a_branch2b 0=256 1=3 4=1 5=1 6=589824 8=2
Eltwise                  res4a                    2 1 res4a_branch1_scale4a_branch1 res4a_branch2b_scale4a_branch2b res4a 0=1
ReLU                     res4a_relu               1 1 res4a res4a_res4a_relu
Split                    splitncnn_5              1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1
Convolution              res4b_branch2a           1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a_res4b_branch2a_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
Convolution              res4b_branch2b           1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b_scale4b_branch2b 0=256 1=3 4=1 5=1 6=589824 8=2
Eltwise                  res4b                    2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2b_scale4b_branch2b res4b 0=1
ReLU                     res4b_relu               1 1 res4b res4b_res4b_relu
Split                    splitncnn_6              1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1
Convolution              res5a_branch1            1 1 res4b_res4b_relu_splitncnn_1 res5a_branch1_scale5a_branch1 0=512 1=1 3=2 5=1 6=131072 8=2
Convolution              res5a_branch2a           1 1 res4b_res4b_relu_splitncnn_0 res5a_branch2a_res5a_branch2a_relu 0=512 1=3 3=2 4=1 5=1 6=1179648 8=102 9=1
Convolution              res5a_branch2b           1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b_scale5a_branch2b 0=512 1=3 4=1 5=1 6=2359296 8=2
Eltwise                  res5a                    2 1 res5a_branch1_scale5a_branch1 res5a_branch2b_scale5a_branch2b res5a 0=1
ReLU                     res5a_relu               1 1 res5a res5a_res5a_relu
Split                    splitncnn_7              1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1
Convolution              res5b_branch2a           1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a_res5b_branch2a_relu 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
Convolution              res5b_branch2b           1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b_scale5b_branch2b 0=512 1=3 4=1 5=1 6=2359296 8=2
Eltwise                  res5b                    2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2b_scale5b_branch2b res5b 0=1
ReLU                     res5b_relu               1 1 res5b res5b_res5b_relu
Pooling                  pool5                    1 1 res5b_res5b_relu pool5 0=1 1=7
InnerProduct             fc1000                   1 1 pool5 fc1000 0=1000 1=1 2=512000
Softmax                  prob                     1 1 fc1000 output


================================================
FILE: benchmark/resnet50.param
================================================
7767517
106 122
Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
Convolution              conv1                    1 1 data conv1_conv1_relu -23330=4,3,112,112,64 0=64 1=7 3=2 4=3 5=1 6=9408 9=1
Pooling                  pool1                    1 1 conv1_conv1_relu pool1 -23330=4,3,56,56,64 1=3 2=2
Split                    splitncnn_0              1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 -23330=8,3,56,56,64,3,56,56,64
Convolution              res2a_branch1            1 1 pool1_splitncnn_1 res2a_branch1_scale2a_branch1 -23330=4,3,56,56,256 0=256 1=1 5=1 6=16384
Convolution              res2a_branch2a           1 1 pool1_splitncnn_0 res2a_branch2a_res2a_branch2a_relu -23330=4,3,56,56,64 0=64 1=1 5=1 6=4096 9=1
Convolution              res2a_branch2b           1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b_res2a_branch2b_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 9=1
Convolution              res2a_branch2c           1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c_scale2a_branch2c -23330=4,3,56,56,256 0=256 1=1 5=1 6=16384
Eltwise                  res2a                    2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a -23330=4,3,56,56,256 0=1
ReLU                     res2a_relu               1 1 res2a res2a_res2a_relu -23330=4,3,56,56,256
Split                    splitncnn_1              1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 -23330=8,3,56,56,256,3,56,56,256
Convolution              res2b_branch2a           1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a_res2b_branch2a_relu -23330=4,3,56,56,64 0=64 1=1 5=1 6=16384 9=1
Convolution              res2b_branch2b           1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b_res2b_branch2b_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 9=1
Convolution              res2b_branch2c           1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c_scale2b_branch2c -23330=4,3,56,56,256 0=256 1=1 5=1 6=16384
Eltwise                  res2b                    2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b -23330=4,3,56,56,256 0=1
ReLU                     res2b_relu               1 1 res2b res2b_res2b_relu -23330=4,3,56,56,256
Split                    splitncnn_2              1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 -23330=8,3,56,56,256,3,56,56,256
Convolution              res2c_branch2a           1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a_res2c_branch2a_relu -23330=4,3,56,56,64 0=64 1=1 5=1 6=16384 9=1
Convolution              res2c_branch2b           1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b_res2c_branch2b_relu -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=36864 9=1
Convolution              res2c_branch2c           1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c_scale2c_branch2c -23330=4,3,56,56,256 0=256 1=1 5=1 6=16384
Eltwise                  res2c                    2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c -23330=4,3,56,56,256 0=1
ReLU                     res2c_relu               1 1 res2c res2c_res2c_relu -23330=4,3,56,56,256
Split                    splitncnn_3              1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1 -23330=8,3,56,56,256,3,56,56,256
Convolution              res3a_branch1            1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1_scale3a_branch1 -23330=4,3,28,28,512 0=512 1=1 3=2 5=1 6=131072
Convolution              res3a_branch2a           1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a_res3a_branch2a_relu -23330=4,3,28,28,128 0=128 1=1 3=2 5=1 6=32768 9=1
Convolution              res3a_branch2b           1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b_res3a_branch2b_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 9=1
Convolution              res3a_branch2c           1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c_scale3a_branch2c -23330=4,3,28,28,512 0=512 1=1 5=1 6=65536
Eltwise                  res3a                    2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a -23330=4,3,28,28,512 0=1
ReLU                     res3a_relu               1 1 res3a res3a_res3a_relu -23330=4,3,28,28,512
Split                    splitncnn_4              1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 -23330=8,3,28,28,512,3,28,28,512
Convolution              res3b_branch2a           1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a_res3b_branch2a_relu -23330=4,3,28,28,128 0=128 1=1 5=1 6=65536 9=1
Convolution              res3b_branch2b           1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b_res3b_branch2b_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 9=1
Convolution              res3b_branch2c           1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c_scale3b_branch2c -23330=4,3,28,28,512 0=512 1=1 5=1 6=65536
Eltwise                  res3b                    2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b -23330=4,3,28,28,512 0=1
ReLU                     res3b_relu               1 1 res3b res3b_res3b_relu -23330=4,3,28,28,512
Split                    splitncnn_5              1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 -23330=8,3,28,28,512,3,28,28,512
Convolution              res3c_branch2a           1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a_res3c_branch2a_relu -23330=4,3,28,28,128 0=128 1=1 5=1 6=65536 9=1
Convolution              res3c_branch2b           1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b_res3c_branch2b_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 9=1
Convolution              res3c_branch2c           1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c_scale3c_branch2c -23330=4,3,28,28,512 0=512 1=1 5=1 6=65536
Eltwise                  res3c                    2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c -23330=4,3,28,28,512 0=1
ReLU                     res3c_relu               1 1 res3c res3c_res3c_relu -23330=4,3,28,28,512
Split                    splitncnn_6              1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1 -23330=8,3,28,28,512,3,28,28,512
Convolution              res3d_branch2a           1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a_res3d_branch2a_relu -23330=4,3,28,28,128 0=128 1=1 5=1 6=65536 9=1
Convolution              res3d_branch2b           1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b_res3d_branch2b_relu -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=147456 9=1
Convolution              res3d_branch2c           1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c_scale3d_branch2c -23330=4,3,28,28,512 0=512 1=1 5=1 6=65536
Eltwise                  res3d                    2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d -23330=4,3,28,28,512 0=1
ReLU                     res3d_relu               1 1 res3d res3d_res3d_relu -23330=4,3,28,28,512
Split                    splitncnn_7              1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1 -23330=8,3,28,28,512,3,28,28,512
Convolution              res4a_branch1            1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1_scale4a_branch1 -23330=4,3,14,14,1024 0=1024 1=1 3=2 5=1 6=524288
Convolution              res4a_branch2a           1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a_res4a_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 3=2 5=1 6=131072 9=1
Convolution              res4a_branch2b           1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b_res4a_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 9=1
Convolution              res4a_branch2c           1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c_scale4a_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144
Eltwise                  res4a                    2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a -23330=4,3,14,14,1024 0=1
ReLU                     res4a_relu               1 1 res4a res4a_res4a_relu -23330=4,3,14,14,1024
Split                    splitncnn_8              1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024
Convolution              res4b_branch2a           1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a_res4b_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 9=1
Convolution              res4b_branch2b           1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b_res4b_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 9=1
Convolution              res4b_branch2c           1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c_scale4b_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144
Eltwise                  res4b                    2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b -23330=4,3,14,14,1024 0=1
ReLU                     res4b_relu               1 1 res4b res4b_res4b_relu -23330=4,3,14,14,1024
Split                    splitncnn_9              1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024
Convolution              res4c_branch2a           1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a_res4c_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 9=1
Convolution              res4c_branch2b           1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b_res4c_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 9=1
Convolution              res4c_branch2c           1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c_scale4c_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144
Eltwise                  res4c                    2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c -23330=4,3,14,14,1024 0=1
ReLU                     res4c_relu               1 1 res4c res4c_res4c_relu -23330=4,3,14,14,1024
Split                    splitncnn_10             1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024
Convolution              res4d_branch2a           1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a_res4d_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 9=1
Convolution              res4d_branch2b           1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b_res4d_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 9=1
Convolution              res4d_branch2c           1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c_scale4d_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144
Eltwise                  res4d                    2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d -23330=4,3,14,14,1024 0=1
ReLU                     res4d_relu               1 1 res4d res4d_res4d_relu -23330=4,3,14,14,1024
Split                    splitncnn_11             1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024
Convolution              res4e_branch2a           1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a_res4e_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 9=1
Convolution              res4e_branch2b           1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b_res4e_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 9=1
Convolution              res4e_branch2c           1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c_scale4e_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144
Eltwise                  res4e                    2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e -23330=4,3,14,14,1024 0=1
ReLU                     res4e_relu               1 1 res4e res4e_res4e_relu -23330=4,3,14,14,1024
Split                    splitncnn_12             1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024
Convolution              res4f_branch2a           1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a_res4f_branch2a_relu -23330=4,3,14,14,256 0=256 1=1 5=1 6=262144 9=1
Convolution              res4f_branch2b           1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b_res4f_branch2b_relu -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=589824 9=1
Convolution              res4f_branch2c           1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c_scale4f_branch2c -23330=4,3,14,14,1024 0=1024 1=1 5=1 6=262144
Eltwise                  res4f                    2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f -23330=4,3,14,14,1024 0=1
ReLU                     res4f_relu               1 1 res4f res4f_res4f_relu -23330=4,3,14,14,1024
Split                    splitncnn_13             1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1 -23330=8,3,14,14,1024,3,14,14,1024
Convolution              res5a_branch1            1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1_scale5a_branch1 -23330=4,3,7,7,2048 0=2048 1=1 3=2 5=1 6=2097152
Convolution              res5a_branch2a           1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a_res5a_branch2a_relu -23330=4,3,7,7,512 0=512 1=1 3=2 5=1 6=524288 9=1
Convolution              res5a_branch2b           1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b_res5a_branch2b_relu -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 9=1
Convolution              res5a_branch2c           1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c_scale5a_branch2c -23330=4,3,7,7,2048 0=2048 1=1 5=1 6=1048576
Eltwise                  res5a                    2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a -23330=4,3,7,7,2048 0=1
ReLU                     res5a_relu               1 1 res5a res5a_res5a_relu -23330=4,3,7,7,2048
Split                    splitncnn_14             1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 -23330=8,3,7,7,2048,3,7,7,2048
Convolution              res5b_branch2a           1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a_res5b_branch2a_relu -23330=4,3,7,7,512 0=512 1=1 5=1 6=1048576 9=1
Convolution              res5b_branch2b           1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b_res5b_branch2b_relu -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 9=1
Convolution              res5b_branch2c           1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c_scale5b_branch2c -23330=4,3,7,7,2048 0=2048 1=1 5=1 6=1048576
Eltwise                  res5b                    2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b -23330=4,3,7,7,2048 0=1
ReLU                     res5b_relu               1 1 res5b res5b_res5b_relu -23330=4,3,7,7,2048
Split                    splitncnn_15             1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1 -23330=8,3,7,7,2048,3,7,7,2048
Convolution              res5c_branch2a           1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a_res5c_branch2a_relu -23330=4,3,7,7,512 0=512 1=1 5=1 6=1048576 9=1
Convolution              res5c_branch2b           1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b_res5c_branch2b_relu -23330=4,3,7,7,512 0=512 1=3 4=1 5=1 6=2359296 9=1
Convolution              res5c_branch2c           1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c_scale5c_branch2c -23330=4,3,7,7,2048 0=2048 1=1 5=1 6=1048576
Eltwise                  res5c                    2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c -23330=4,3,7,7,2048 0=1
ReLU                     res5c_relu               1 1 res5c res5c_res5c_relu -23330=4,3,7,7,2048
Pooling                  pool5                    1 1 res5c_res5c_relu pool5 -23330=4,3,1,1,2048 0=1 1=7
InnerProduct             fc1000                   1 1 pool5 fc1000 -23330=4,1,1000,1,1 0=1000 1=1 2=2048000
Softmax                  prob                     1 1 fc1000 output -23330=4,1,1000,1,1


================================================
FILE: benchmark/resnet50_int8.param
================================================
7767517
106 122
Input                    data                     0 1 data 0=224 1=224 2=3
Convolution              conv1                    1 1 data conv1_conv1_relu 0=64 1=7 3=2 4=3 5=1 6=9408 8=2 9=1
Pooling                  pool1                    1 1 conv1_conv1_relu pool1 1=3 2=2
Split                    splitncnn_0              1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1
Convolution              res2a_branch1            1 1 pool1_splitncnn_1 res2a_branch1_scale2a_branch1 0=256 1=1 5=1 6=16384 8=2
Convolution              res2a_branch2a           1 1 pool1_splitncnn_0 res2a_branch2a_res2a_branch2a_relu 0=64 1=1 5=1 6=4096 8=102 9=1
Convolution              res2a_branch2b           1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b_res2a_branch2b_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1
Convolution              res2a_branch2c           1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c_scale2a_branch2c 0=256 1=1 5=1 6=16384 8=2
Eltwise                  res2a                    2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a 0=1
ReLU                     res2a_relu               1 1 res2a res2a_res2a_relu
Split                    splitncnn_1              1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1
Convolution              res2b_branch2a           1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a_res2b_branch2a_relu 0=64 1=1 5=1 6=16384 8=102 9=1
Convolution              res2b_branch2b           1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b_res2b_branch2b_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1
Convolution              res2b_branch2c           1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c_scale2b_branch2c 0=256 1=1 5=1 6=16384 8=2
Eltwise                  res2b                    2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b 0=1
ReLU                     res2b_relu               1 1 res2b res2b_res2b_relu
Split                    splitncnn_2              1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1
Convolution              res2c_branch2a           1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a_res2c_branch2a_relu 0=64 1=1 5=1 6=16384 8=102 9=1
Convolution              res2c_branch2b           1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b_res2c_branch2b_relu 0=64 1=3 4=1 5=1 6=36864 8=102 9=1
Convolution              res2c_branch2c           1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c_scale2c_branch2c 0=256 1=1 5=1 6=16384 8=2
Eltwise                  res2c                    2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c 0=1
ReLU                     res2c_relu               1 1 res2c res2c_res2c_relu
Split                    splitncnn_3              1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1
Convolution              res3a_branch1            1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1_scale3a_branch1 0=512 1=1 3=2 5=1 6=131072 8=2
Convolution              res3a_branch2a           1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a_res3a_branch2a_relu 0=128 1=1 3=2 5=1 6=32768 8=102 9=1
Convolution              res3a_branch2b           1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b_res3a_branch2b_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1
Convolution              res3a_branch2c           1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c_scale3a_branch2c 0=512 1=1 5=1 6=65536 8=2
Eltwise                  res3a                    2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a 0=1
ReLU                     res3a_relu               1 1 res3a res3a_res3a_relu
Split                    splitncnn_4              1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1
Convolution              res3b_branch2a           1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a_res3b_branch2a_relu 0=128 1=1 5=1 6=65536 8=102 9=1
Convolution              res3b_branch2b           1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b_res3b_branch2b_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1
Convolution              res3b_branch2c           1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c_scale3b_branch2c 0=512 1=1 5=1 6=65536 8=2
Eltwise                  res3b                    2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b 0=1
ReLU                     res3b_relu               1 1 res3b res3b_res3b_relu
Split                    splitncnn_5              1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1
Convolution              res3c_branch2a           1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a_res3c_branch2a_relu 0=128 1=1 5=1 6=65536 8=102 9=1
Convolution              res3c_branch2b           1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b_res3c_branch2b_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1
Convolution              res3c_branch2c           1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c_scale3c_branch2c 0=512 1=1 5=1 6=65536 8=2
Eltwise                  res3c                    2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c 0=1
ReLU                     res3c_relu               1 1 res3c res3c_res3c_relu
Split                    splitncnn_6              1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1
Convolution              res3d_branch2a           1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a_res3d_branch2a_relu 0=128 1=1 5=1 6=65536 8=102 9=1
Convolution              res3d_branch2b           1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b_res3d_branch2b_relu 0=128 1=3 4=1 5=1 6=147456 8=102 9=1
Convolution              res3d_branch2c           1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c_scale3d_branch2c 0=512 1=1 5=1 6=65536 8=2
Eltwise                  res3d                    2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d 0=1
ReLU                     res3d_relu               1 1 res3d res3d_res3d_relu
Split                    splitncnn_7              1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1
Convolution              res4a_branch1            1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1_scale4a_branch1 0=1024 1=1 3=2 5=1 6=524288 8=2
Convolution              res4a_branch2a           1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a_res4a_branch2a_relu 0=256 1=1 3=2 5=1 6=131072 8=102 9=1
Convolution              res4a_branch2b           1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b_res4a_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
Convolution              res4a_branch2c           1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c_scale4a_branch2c 0=1024 1=1 5=1 6=262144 8=2
Eltwise                  res4a                    2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a 0=1
ReLU                     res4a_relu               1 1 res4a res4a_res4a_relu
Split                    splitncnn_8              1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1
Convolution              res4b_branch2a           1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a_res4b_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1
Convolution              res4b_branch2b           1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b_res4b_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
Convolution              res4b_branch2c           1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c_scale4b_branch2c 0=1024 1=1 5=1 6=262144 8=2
Eltwise                  res4b                    2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b 0=1
ReLU                     res4b_relu               1 1 res4b res4b_res4b_relu
Split                    splitncnn_9              1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1
Convolution              res4c_branch2a           1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a_res4c_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1
Convolution              res4c_branch2b           1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b_res4c_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
Convolution              res4c_branch2c           1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c_scale4c_branch2c 0=1024 1=1 5=1 6=262144 8=2
Eltwise                  res4c                    2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c 0=1
ReLU                     res4c_relu               1 1 res4c res4c_res4c_relu
Split                    splitncnn_10             1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1
Convolution              res4d_branch2a           1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a_res4d_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1
Convolution              res4d_branch2b           1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b_res4d_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
Convolution              res4d_branch2c           1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c_scale4d_branch2c 0=1024 1=1 5=1 6=262144 8=2
Eltwise                  res4d                    2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d 0=1
ReLU                     res4d_relu               1 1 res4d res4d_res4d_relu
Split                    splitncnn_11             1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1
Convolution              res4e_branch2a           1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a_res4e_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1
Convolution              res4e_branch2b           1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b_res4e_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
Convolution              res4e_branch2c           1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c_scale4e_branch2c 0=1024 1=1 5=1 6=262144 8=2
Eltwise                  res4e                    2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e 0=1
ReLU                     res4e_relu               1 1 res4e res4e_res4e_relu
Split                    splitncnn_12             1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1
Convolution              res4f_branch2a           1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a_res4f_branch2a_relu 0=256 1=1 5=1 6=262144 8=102 9=1
Convolution              res4f_branch2b           1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b_res4f_branch2b_relu 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
Convolution              res4f_branch2c           1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c_scale4f_branch2c 0=1024 1=1 5=1 6=262144 8=2
Eltwise                  res4f                    2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f 0=1
ReLU                     res4f_relu               1 1 res4f res4f_res4f_relu
Split                    splitncnn_13             1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1
Convolution              res5a_branch1            1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1_scale5a_branch1 0=2048 1=1 3=2 5=1 6=2097152 8=2
Convolution              res5a_branch2a           1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a_res5a_branch2a_relu 0=512 1=1 3=2 5=1 6=524288 8=102 9=1
Convolution              res5a_branch2b           1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b_res5a_branch2b_relu 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
Convolution              res5a_branch2c           1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c_scale5a_branch2c 0=2048 1=1 5=1 6=1048576 8=2
Eltwise                  res5a                    2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a 0=1
ReLU                     res5a_relu               1 1 res5a res5a_res5a_relu
Split                    splitncnn_14             1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1
Convolution              res5b_branch2a           1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a_res5b_branch2a_relu 0=512 1=1 5=1 6=1048576 8=102 9=1
Convolution              res5b_branch2b           1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b_res5b_branch2b_relu 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
Convolution              res5b_branch2c           1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c_scale5b_branch2c 0=2048 1=1 5=1 6=1048576 8=2
Eltwise                  res5b                    2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b 0=1
ReLU                     res5b_relu               1 1 res5b res5b_res5b_relu
Split                    splitncnn_15             1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1
Convolution              res5c_branch2a           1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a_res5c_branch2a_relu 0=512 1=1 5=1 6=1048576 8=102 9=1
Convolution              res5c_branch2b           1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b_res5c_branch2b_relu 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
Convolution              res5c_branch2c           1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c_scale5c_branch2c 0=2048 1=1 5=1 6=1048576 8=2
Eltwise                  res5c                    2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c 0=1
ReLU                     res5c_relu               1 1 res5c res5c_res5c_relu
Pooling                  pool5                    1 1 res5c_res5c_relu pool5 0=1 1=7
InnerProduct             fc1000                   1 1 pool5 fc1000 0=1000 1=1 2=2048000
Softmax                  prob                     1 1 fc1000 output


================================================
FILE: benchmark/shufflenet.param
================================================
7767517
120 136
Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
Convolution              conv1                    1 1 data conv1_conv1_relu -23330=4,3,112,112,24 0=24 1=3 3=2 4=1 5=1 6=648 9=1
Pooling                  pool1                    1 1 conv1_conv1_relu pool1 -23330=4,3,56,56,24 1=3 2=2
Split                    splitncnn_0              1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24
Pooling                  resx1_match_conv         1 1 pool1_splitncnn_1 resx1_match_conv -23330=4,3,28,28,24 0=1 1=3 2=2
Convolution              resx1_conv1              1 1 pool1_splitncnn_0 resx1_conv1_resx1_conv1_relu -23330=4,3,56,56,54 0=54 1=1 5=1 6=1296 9=1
ConvolutionDepthWise     resx1_conv2              1 1 resx1_conv1_resx1_conv1_relu resx1_conv2_resx1_conv2_scale -23330=4,3,28,28,54 0=54 1=3 3=2 4=1 5=1 6=486 7=54
ConvolutionDepthWise     resx1_conv3              1 1 resx1_conv2_resx1_conv2_scale resx1_conv3_resx1_conv3_scale -23330=4,3,28,28,216 0=216 1=1 5=1 6=3888 7=3
Concat                   resx1_concat             2 1 resx1_match_conv resx1_conv3_resx1_conv3_scale resx1_concat -23330=4,3,28,28,240
ReLU                     resx1_concat_relu        1 1 resx1_concat resx1_concat_resx1_concat_relu -23330=4,3,28,28,240
Split                    splitncnn_1              1 2 resx1_concat_resx1_concat_relu resx1_concat_resx1_concat_relu_splitncnn_0 resx1_concat_resx1_concat_relu_splitncnn_1 -23330=8,3,28,28,240,3,28,28,240
ConvolutionDepthWise     resx2_conv1              1 1 resx1_concat_resx1_concat_relu_splitncnn_1 resx2_conv1_resx2_conv1_relu -23330=4,3,28,28,60 0=60 1=1 5=1 6=4800 7=3 9=1
ShuffleChannel           shuffle2                 1 1 resx2_conv1_resx2_conv1_relu shuffle2 -23330=4,3,28,28,60 0=3
ConvolutionDepthWise     resx2_conv2              1 1 shuffle2 resx2_conv2_resx2_conv2_scale -23330=4,3,28,28,60 0=60 1=3 4=1 5=1 6=540 7=60
ConvolutionDepthWise     resx2_conv3              1 1 resx2_conv2_resx2_conv2_scale resx2_conv3_resx2_conv3_scale -23330=4,3,28,28,240 0=240 1=1 5=1 6=4800 7=3
Eltwise                  resx2_elewise            2 1 resx1_concat_resx1_concat_relu_splitncnn_0 resx2_conv3_resx2_conv3_scale resx2_elewise -23330=4,3,28,28,240 0=1
ReLU                     resx2_elewise_relu       1 1 resx2_elewise resx2_elewise_resx2_elewise_relu -23330=4,3,28,28,240
Split                    splitncnn_2              1 2 resx2_elewise_resx2_elewise_relu resx2_elewise_resx2_elewise_relu_splitncnn_0 resx2_elewise_resx2_elewise_relu_splitncnn_1 -23330=8,3,28,28,240,3,28,28,240
ConvolutionDepthWise     resx3_conv1              1 1 resx2_elewise_resx2_elewise_relu_splitncnn_1 resx3_conv1_resx3_conv1_relu -23330=4,3,28,28,60 0=60 1=1 5=1 6=4800 7=3 9=1
ShuffleChannel           shuffle3                 1 1 resx3_conv1_resx3_conv1_relu shuffle3 -23330=4,3,28,28,60 0=3
ConvolutionDepthWise     resx3_conv2              1 1 shuffle3 resx3_conv2_resx3_conv2_scale -23330=4,3,28,28,60 0=60 1=3 4=1 5=1 6=540 7=60
ConvolutionDepthWise     resx3_conv3              1 1 resx3_conv2_resx3_conv2_scale resx3_conv3_resx3_conv3_scale -23330=4,3,28,28,240 0=240 1=1 5=1 6=4800 7=3
Eltwise                  resx3_elewise            2 1 resx2_elewise_resx2_elewise_relu_splitncnn_0 resx3_conv3_resx3_conv3_scale resx3_elewise -23330=4,3,28,28,240 0=1
ReLU                     resx3_elewise_relu       1 1 resx3_elewise resx3_elewise_resx3_elewise_relu -23330=4,3,28,28,240
Split                    splitncnn_3              1 2 resx3_elewise_resx3_elewise_relu resx3_elewise_resx3_elewise_relu_splitncnn_0 resx3_elewise_resx3_elewise_relu_splitncnn_1 -23330=8,3,28,28,240,3,28,28,240
ConvolutionDepthWise     resx4_conv1              1 1 resx3_elewise_resx3_elewise_relu_splitncnn_1 resx4_conv1_resx4_conv1_relu -23330=4,3,28,28,60 0=60 1=1 5=1 6=4800 7=3 9=1
ShuffleChannel           shuffle4                 1 1 resx4_conv1_resx4_conv1_relu shuffle4 -23330=4,3,28,28,60 0=3
ConvolutionDepthWise     resx4_conv2              1 1 shuffle4 resx4_conv2_resx4_conv2_scale -23330=4,3,28,28,60 0=60 1=3 4=1 5=1 6=540 7=60
ConvolutionDepthWise     resx4_conv3              1 1 resx4_conv2_resx4_conv2_scale resx4_conv3_resx4_conv3_scale -23330=4,3,28,28,240 0=240 1=1 5=1 6=4800 7=3
Eltwise                  resx4_elewise            2 1 resx3_elewise_resx3_elewise_relu_splitncnn_0 resx4_conv3_resx4_conv3_scale resx4_elewise -23330=4,3,28,28,240 0=1
ReLU                     resx4_elewise_relu       1 1 resx4_elewise resx4_elewise_resx4_elewise_relu -23330=4,3,28,28,240
Split                    splitncnn_4              1 2 resx4_elewise_resx4_elewise_relu resx4_elewise_resx4_elewise_relu_splitncnn_0 resx4_elewise_resx4_elewise_relu_splitncnn_1 -23330=8,3,28,28,240,3,28,28,240
Pooling                  resx5_match_conv         1 1 resx4_elewise_resx4_elewise_relu_splitncnn_1 resx5_match_conv -23330=4,3,14,14,240 0=1 1=3 2=2
ConvolutionDepthWise     resx5_conv1              1 1 resx4_elewise_resx4_elewise_relu_splitncnn_0 resx5_conv1_resx5_conv1_relu -23330=4,3,28,28,60 0=60 1=1 5=1 6=4800 7=3 9=1
ShuffleChannel           shuffle5                 1 1 resx5_conv1_resx5_conv1_relu shuffle5 -23330=4,3,28,28,60 0=3
ConvolutionDepthWise     resx5_conv2              1 1 shuffle5 resx5_conv2_resx5_conv2_scale -23330=4,3,14,14,60 0=60 1=3 3=2 4=1 5=1 6=540 7=60
ConvolutionDepthWise     resx5_conv3              1 1 resx5_conv2_resx5_conv2_scale resx5_conv3_resx5_conv3_scale -23330=4,3,14,14,240 0=240 1=1 5=1 6=4800 7=3
Concat                   resx5_concat             2 1 resx5_match_conv resx5_conv3_resx5_conv3_scale resx5_concat -23330=4,3,14,14,480
ReLU                     resx5_concat_relu        1 1 resx5_concat resx5_concat_resx5_concat_relu -23330=4,3,14,14,480
Split                    splitncnn_5              1 2 resx5_concat_resx5_concat_relu resx5_concat_resx5_concat_relu_splitncnn_0 resx5_concat_resx5_concat_relu_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
ConvolutionDepthWise     resx6_conv1              1 1 resx5_concat_resx5_concat_relu_splitncnn_1 resx6_conv1_resx6_conv1_relu -23330=4,3,14,14,120 0=120 1=1 5=1 6=19200 7=3 9=1
ShuffleChannel           shuffle6                 1 1 resx6_conv1_resx6_conv1_relu shuffle6 -23330=4,3,14,14,120 0=3
ConvolutionDepthWise     resx6_conv2              1 1 shuffle6 resx6_conv2_resx6_conv2_scale -23330=4,3,14,14,120 0=120 1=3 4=1 5=1 6=1080 7=120
ConvolutionDepthWise     resx6_conv3              1 1 resx6_conv2_resx6_conv2_scale resx6_conv3_resx6_conv3_scale -23330=4,3,14,14,480 0=480 1=1 5=1 6=19200 7=3
Eltwise                  resx6_elewise            2 1 resx5_concat_resx5_concat_relu_splitncnn_0 resx6_conv3_resx6_conv3_scale resx6_elewise -23330=4,3,14,14,480 0=1
ReLU                     resx6_elewise_relu       1 1 resx6_elewise resx6_elewise_resx6_elewise_relu -23330=4,3,14,14,480
Split                    splitncnn_6              1 2 resx6_elewise_resx6_elewise_relu resx6_elewise_resx6_elewise_relu_splitncnn_0 resx6_elewise_resx6_elewise_relu_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
ConvolutionDepthWise     resx7_conv1              1 1 resx6_elewise_resx6_elewise_relu_splitncnn_1 resx7_conv1_resx7_conv1_relu -23330=4,3,14,14,120 0=120 1=1 5=1 6=19200 7=3 9=1
ShuffleChannel           shuffle7                 1 1 resx7_conv1_resx7_conv1_relu shuffle7 -23330=4,3,14,14,120 0=3
ConvolutionDepthWise     resx7_conv2              1 1 shuffle7 resx7_conv2_resx7_conv2_scale -23330=4,3,14,14,120 0=120 1=3 4=1 5=1 6=1080 7=120
ConvolutionDepthWise     resx7_conv3              1 1 resx7_conv2_resx7_conv2_scale resx7_conv3_resx7_conv3_scale -23330=4,3,14,14,480 0=480 1=1 5=1 6=19200 7=3
Eltwise                  resx7_elewise            2 1 resx6_elewise_resx6_elewise_relu_splitncnn_0 resx7_conv3_resx7_conv3_scale resx7_elewise -23330=4,3,14,14,480 0=1
ReLU                     resx7_elewise_relu       1 1 resx7_elewise resx7_elewise_resx7_elewise_relu -23330=4,3,14,14,480
Split                    splitncnn_7              1 2 resx7_elewise_resx7_elewise_relu resx7_elewise_resx7_elewise_relu_splitncnn_0 resx7_elewise_resx7_elewise_relu_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
ConvolutionDepthWise     resx8_conv1              1 1 resx7_elewise_resx7_elewise_relu_splitncnn_1 resx8_conv1_resx8_conv1_relu -23330=4,3,14,14,120 0=120 1=1 5=1 6=19200 7=3 9=1
ShuffleChannel           shuffle8                 1 1 resx8_conv1_resx8_conv1_relu shuffle8 -23330=4,3,14,14,120 0=3
ConvolutionDepthWise     resx8_conv2              1 1 shuffle8 resx8_conv2_resx8_conv2_scale -23330=4,3,14,14,120 0=120 1=3 4=1 5=1 6=1080 7=120
ConvolutionDepthWise     resx8_conv3              1 1 resx8_conv2_resx8_conv2_scale resx8_conv3_resx8_conv3_scale -23330=4,3,14,14,480 0=480 1=1 5=1 6=19200 7=3
Eltwise                  resx8_elewise            2 1 resx7_elewise_resx7_elewise_relu_splitncnn_0 resx8_conv3_resx8_conv3_scale resx8_elewise -23330=4,3,14,14,480 0=1
ReLU                     resx8_elewise_relu       1 1 resx8_elewise resx8_elewise_resx8_elewise_relu -23330=4,3,14,14,480
Split                    splitncnn_8              1 2 resx8_elewise_resx8_elewise_relu resx8_elewise_resx8_elewise_relu_splitncnn_0 resx8_elewise_resx8_elewise_relu_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
ConvolutionDepthWise     resx9_conv1              1 1 resx8_elewise_resx8_elewise_relu_splitncnn_1 resx9_conv1_resx9_conv1_relu -23330=4,3,14,14,120 0=120 1=1 5=1 6=19200 7=3 9=1
ShuffleChannel           shuffle9                 1 1 resx9_conv1_resx9_conv1_relu shuffle9 -23330=4,3,14,14,120 0=3
ConvolutionDepthWise     resx9_conv2              1 1 shuffle9 resx9_conv2_resx9_conv2_scale -23330=4,3,14,14,120 0=120 1=3 4=1 5=1 6=1080 7=120
ConvolutionDepthWise     resx9_conv3              1 1 resx9_conv2_resx9_conv2_scale resx9_conv3_resx9_conv3_scale -23330=4,3,14,14,480 0=480 1=1 5=1 6=19200 7=3
Eltwise                  resx9_elewise            2 1 resx8_elewise_resx8_elewise_relu_splitncnn_0 resx9_conv3_resx9_conv3_scale resx9_elewise -23330=4,3,14,14,480 0=1
ReLU                     resx9_elewise_relu       1 1 resx9_elewise resx9_elewise_resx9_elewise_relu -23330=4,3,14,14,480
Split                    splitncnn_9              1 2 resx9_elewise_resx9_elewise_relu resx9_elewise_resx9_elewise_relu_splitncnn_0 resx9_elewise_resx9_elewise_relu_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
ConvolutionDepthWise     resx10_conv1             1 1 resx9_elewise_resx9_elewise_relu_splitncnn_1 resx10_conv1_resx10_conv1_relu -23330=4,3,14,14,120 0=120 1=1 5=1 6=19200 7=3 9=1
ShuffleChannel           shuffle10                1 1 resx10_conv1_resx10_conv1_relu shuffle10 -23330=4,3,14,14,120 0=3
ConvolutionDepthWise     resx10_conv2             1 1 shuffle10 resx10_conv2_resx10_conv2_scale -23330=4,3,14,14,120 0=120 1=3 4=1 5=1 6=1080 7=120
ConvolutionDepthWise     resx10_conv3             1 1 resx10_conv2_resx10_conv2_scale resx10_conv3_resx10_conv3_scale -23330=4,3,14,14,480 0=480 1=1 5=1 6=19200 7=3
Eltwise                  resx10_elewise           2 1 resx9_elewise_resx9_elewise_relu_splitncnn_0 resx10_conv3_resx10_conv3_scale resx10_elewise -23330=4,3,14,14,480 0=1
ReLU                     resx10_elewise_relu      1 1 resx10_elewise resx10_elewise_resx10_elewise_relu -23330=4,3,14,14,480
Split                    splitncnn_10             1 2 resx10_elewise_resx10_elewise_relu resx10_elewise_resx10_elewise_relu_splitncnn_0 resx10_elewise_resx10_elewise_relu_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
ConvolutionDepthWise     resx11_conv1             1 1 resx10_elewise_resx10_elewise_relu_splitncnn_1 resx11_conv1_resx11_conv1_relu -23330=4,3,14,14,120 0=120 1=1 5=1 6=19200 7=3 9=1
ShuffleChannel           shuffle11                1 1 resx11_conv1_resx11_conv1_relu shuffle11 -23330=4,3,14,14,120 0=3
ConvolutionDepthWise     resx11_conv2             1 1 shuffle11 resx11_conv2_resx11_conv2_scale -23330=4,3,14,14,120 0=120 1=3 4=1 5=1 6=1080 7=120
ConvolutionDepthWise     resx11_conv3             1 1 resx11_conv2_resx11_conv2_scale resx11_conv3_resx11_conv3_scale -23330=4,3,14,14,480 0=480 1=1 5=1 6=19200 7=3
Eltwise                  resx11_elewise           2 1 resx10_elewise_resx10_elewise_relu_splitncnn_0 resx11_conv3_resx11_conv3_scale resx11_elewise -23330=4,3,14,14,480 0=1
ReLU                     resx11_elewise_relu      1 1 resx11_elewise resx11_elewise_resx11_elewise_relu -23330=4,3,14,14,480
Split                    splitncnn_11             1 2 resx11_elewise_resx11_elewise_relu resx11_elewise_resx11_elewise_relu_splitncnn_0 resx11_elewise_resx11_elewise_relu_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
ConvolutionDepthWise     resx12_conv1             1 1 resx11_elewise_resx11_elewise_relu_splitncnn_1 resx12_conv1_resx12_conv1_relu -23330=4,3,14,14,120 0=120 1=1 5=1 6=19200 7=3 9=1
ShuffleChannel           shuffle12                1 1 resx12_conv1_resx12_conv1_relu shuffle12 -23330=4,3,14,14,120 0=3
ConvolutionDepthWise     resx12_conv2             1 1 shuffle12 resx12_conv2_resx12_conv2_scale -23330=4,3,14,14,120 0=120 1=3 4=1 5=1 6=1080 7=120
ConvolutionDepthWise     resx12_conv3             1 1 resx12_conv2_resx12_conv2_scale resx12_conv3_resx12_conv3_scale -23330=4,3,14,14,480 0=480 1=1 5=1 6=19200 7=3
Eltwise                  resx12_elewise           2 1 resx11_elewise_resx11_elewise_relu_splitncnn_0 resx12_conv3_resx12_conv3_scale resx12_elewise -23330=4,3,14,14,480 0=1
ReLU                     resx12_elewise_relu      1 1 resx12_elewise resx12_elewise_resx12_elewise_relu -23330=4,3,14,14,480
Split                    splitncnn_12             1 2 resx12_elewise_resx12_elewise_relu resx12_elewise_resx12_elewise_relu_splitncnn_0 resx12_elewise_resx12_elewise_relu_splitncnn_1 -23330=8,3,14,14,480,3,14,14,480
Pooling                  resx13_match_conv        1 1 resx12_elewise_resx12_elewise_relu_splitncnn_1 resx13_match_conv -23330=4,3,7,7,480 0=1 1=3 2=2
ConvolutionDepthWise     resx13_conv1             1 1 resx12_elewise_resx12_elewise_relu_splitncnn_0 resx13_conv1_resx13_conv1_relu -23330=4,3,14,14,120 0=120 1=1 5=1 6=19200 7=3 9=1
ShuffleChannel           shuffle13                1 1 resx13_conv1_resx13_conv1_relu shuffle13 -23330=4,3,14,14,120 0=3
ConvolutionDepthWise     resx13_conv2             1 1 shuffle13 resx13_conv2_resx13_conv2_scale -23330=4,3,7,7,120 0=120 1=3 3=2 4=1 5=1 6=1080 7=120
ConvolutionDepthWise     resx13_conv3             1 1 resx13_conv2_resx13_conv2_scale resx13_conv3_resx13_conv3_scale -23330=4,3,7,7,480 0=480 1=1 5=1 6=19200 7=3
Concat                   resx13_concat            2 1 resx13_match_conv resx13_conv3_resx13_conv3_scale resx13_concat -23330=4,3,7,7,960
ReLU                     resx13_concat_relu       1 1 resx13_concat resx13_concat_resx13_concat_relu -23330=4,3,7,7,960
Split                    splitncnn_13             1 2 resx13_concat_resx13_concat_relu resx13_concat_resx13_concat_relu_splitncnn_0 resx13_concat_resx13_concat_relu_splitncnn_1 -23330=8,3,7,7,960,3,7,7,960
ConvolutionDepthWise     resx14_conv1             1 1 resx13_concat_resx13_concat_relu_splitncnn_1 resx14_conv1_resx14_conv1_relu -23330=4,3,7,7,240 0=240 1=1 5=1 6=76800 7=3 9=1
ShuffleChannel           shuffle14                1 1 resx14_conv1_resx14_conv1_relu shuffle14 -23330=4,3,7,7,240 0=3
ConvolutionDepthWise     resx14_conv2             1 1 shuffle14 resx14_conv2_resx14_conv2_scale -23330=4,3,7,7,240 0=240 1=3 4=1 5=1 6=2160 7=240
ConvolutionDepthWise     resx14_conv3             1 1 resx14_conv2_resx14_conv2_scale resx14_conv3_resx14_conv3_scale -23330=4,3,7,7,960 0=960 1=1 5=1 6=76800 7=3
Eltwise                  resx14_elewise           2 1 resx13_concat_resx13_concat_relu_splitncnn_0 resx14_conv3_resx14_conv3_scale resx14_elewise -23330=4,3,7,7,960 0=1
ReLU                     resx14_elewise_relu      1 1 resx14_elewise resx14_elewise_resx14_elewise_relu -23330=4,3,7,7,960
Split                    splitncnn_14             1 2 resx14_elewise_resx14_elewise_relu resx14_elewise_resx14_elewise_relu_splitncnn_0 resx14_elewise_resx14_elewise_relu_splitncnn_1 -23330=8,3,7,7,960,3,7,7,960
ConvolutionDepthWise     resx15_conv1             1 1 resx14_elewise_resx14_elewise_relu_splitncnn_1 resx15_conv1_resx15_conv1_relu -23330=4,3,7,7,240 0=240 1=1 5=1 6=76800 7=3 9=1
ShuffleChannel           shuffle15                1 1 resx15_conv1_resx15_conv1_relu shuffle15 -23330=4,3,7,7,240 0=3
ConvolutionDepthWise     resx15_conv2             1 1 shuffle15 resx15_conv2_resx15_conv2_scale -23330=4,3,7,7,240 0=240 1=3 4=1 5=1 6=2160 7=240
ConvolutionDepthWise     resx15_conv3             1 1 resx15_conv2_resx15_conv2_scale resx15_conv3_resx15_conv3_scale -23330=4,3,7,7,960 0=960 1=1 5=1 6=76800 7=3
Eltwise                  resx15_elewise           2 1 resx14_elewise_resx14_elewise_relu_splitncnn_0 resx15_conv3_resx15_conv3_scale resx15_elewise -23330=4,3,7,7,960 0=1
ReLU                     resx15_elewise_relu      1 1 resx15_elewise resx15_elewise_resx15_elewise_relu -23330=4,3,7,7,960
Split                    splitncnn_15             1 2 resx15_elewise_resx15_elewise_relu resx15_elewise_resx15_elewise_relu_splitncnn_0 resx15_elewise_resx15_elewise_relu_splitncnn_1 -23330=8,3,7,7,960,3,7,7,960
ConvolutionDepthWise     resx16_conv1             1 1 resx15_elewise_resx15_elewise_relu_splitncnn_1 resx16_conv1_resx16_conv1_relu -23330=4,3,7,7,240 0=240 1=1 5=1 6=76800 7=3 9=1
ShuffleChannel           shuffle16                1 1 resx16_conv1_resx16_conv1_relu shuffle16 -23330=4,3,7,7,240 0=3
ConvolutionDepthWise     resx16_conv2             1 1 shuffle16 resx16_conv2_resx16_conv2_scale -23330=4,3,7,7,240 0=240 1=3 4=1 5=1 6=2160 7=240
ConvolutionDepthWise     resx16_conv3             1 1 resx16_conv2_resx16_conv2_scale resx16_conv3_resx16_conv3_scale -23330=4,3,7,7,960 0=960 1=1 5=1 6=76800 7=3
Eltwise                  resx16_elewise           2 1 resx15_elewise_resx15_elewise_relu_splitncnn_0 resx16_conv3_resx16_conv3_scale resx16_elewise -23330=4,3,7,7,960 0=1
ReLU                     resx16_elewise_relu      1 1 resx16_elewise resx16_elewise_resx16_elewise_relu -23330=4,3,7,7,960
Pooling                  pool_ave                 1 1 resx16_elewise_resx16_elewise_relu pool_ave -23330=4,1,960,1,1 0=1 4=1
InnerProduct             fc1000                   1 1 pool_ave fc1000 -23330=4,1,1000,1,1 0=1000 1=1 2=960000
Softmax                  prob                     1 1 fc1000 output -23330=4,1,1000,1,1


================================================
FILE: benchmark/shufflenet_v2.param
================================================
7767517
109 125
Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
Convolution              conv1                    1 1 data conv1_conv1_relu -23330=4,3,112,112,24 0=24 1=3 3=2 4=1 5=1 6=648 9=1
Pooling                  pool1                    1 1 conv1_conv1_relu pool1 -23330=4,3,56,56,24 1=3 2=2
Split                    splitncnn_0              1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 -23330=8,3,56,56,24,3,56,56,24
ConvolutionDepthWise     branch1_1_conv1          1 1 pool1_splitncnn_1 branch1_1_conv1_branch1_1_conv1_scale -23330=4,3,28,28,24 0=24 1=3 3=2 4=1 5=1 6=216 7=24
Convolution              branch1_1_conv2          1 1 branch1_1_conv1_branch1_1_conv1_scale branch1_1_conv2_branch1_1_conv2_relu -23330=4,3,28,28,58 0=58 1=1 5=1 6=1392 9=1
Convolution              branch1_2_conv1          1 1 pool1_splitncnn_0 branch1_2_conv1_branch1_2_conv1_relu -23330=4,3,56,56,58 0=58 1=1 5=1 6=1392 9=1
ConvolutionDepthWise     branch1_2_conv2          1 1 branch1_2_conv1_branch1_2_conv1_relu branch1_2_conv2_branch1_2_conv2_scale -23330=4,3,28,28,58 0=58 1=3 3=2 4=1 5=1 6=522 7=58
Convolution              branch1_2_conv3          1 1 branch1_2_conv2_branch1_2_conv2_scale branch1_2_conv3_branch1_2_conv3_relu -23330=4,3,28,28,58 0=58 1=1 5=1 6=3364 9=1
Concat                   concat1                  2 1 branch1_1_conv2_branch1_1_conv2_relu branch1_2_conv3_branch1_2_conv3_relu concat1 -23330=4,3,28,28,116
ShuffleChannel           shuffle1                 1 1 concat1 shuffle1 -23330=4,3,28,28,116 0=2
Slice                    slice2                   1 2 shuffle1 branch2_1 branch2_2 -23330=8,3,28,28,58,3,28,28,58 -23300=2,58,-233
Convolution              branch2_2_conv1          1 1 branch2_2 branch2_2_conv1_branch2_2_conv1_relu -23330=4,3,28,28,58 0=58 1=1 5=1 6=3364 9=1
ConvolutionDepthWise     branch2_2_conv2          1 1 branch2_2_conv1_branch2_2_conv1_relu branch2_2_conv2_branch2_2_conv2_scale -23330=4,3,28,28,58 0=58 1=3 4=1 5=1 6=522 7=58
Convolution              branch2_2_conv3          1 1 branch2_2_conv2_branch2_2_conv2_scale branch2_2_conv3_branch2_2_conv3_relu -23330=4,3,28,28,58 0=58 1=1 5=1 6=3364 9=1
Concat                   concat2                  2 1 branch2_1 branch2_2_conv3_branch2_2_conv3_relu concat2 -23330=4,3,28,28,116
ShuffleChannel           shuffle2                 1 1 concat2 shuffle2 -23330=4,3,28,28,116 0=2
Slice                    slice3                   1 2 shuffle2 branch3_1 branch3_2 -23330=8,3,28,28,58,3,28,28,58 -23300=2,58,-233
Convolution              branch3_2_conv1          1 1 branch3_2 branch3_2_conv1_branch3_2_conv1_relu -23330=4,3,28,28,58 0=58 1=1 5=1 6=3364 9=1
ConvolutionDepthWise     branch3_2_conv2          1 1 branch3_2_conv1_branch3_2_conv1_relu branch3_2_conv2_branch3_2_conv2_scale -23330=4,3,28,28,58 0=58 1=3 4=1 5=1 6=522 7=58
Convolution              branch3_2_conv3          1 1 branch3_2_conv2_branch3_2_conv2_scale branch3_2_conv3_branch3_2_conv3_relu -23330=4,3,28,28,58 0=58 1=1 5=1 6=3364 9=1
Concat                   concat3                  2 1 branch3_1 branch3_2_conv3_branch3_2_conv3_relu concat3 -23330=4,3,28,28,116
ShuffleChannel           shuffle3                 1 1 concat3 shuffle3 -23330=4,3,28,28,116 0=2
Slice                    slice4                   1 2 shuffle3 branch4_1 branch4_2 -23330=8,3,28,28,58,3,28,28,58 -23300=2,58,-233
Convolution              branch4_2_conv1          1 1 branch4_2 branch4_2_conv1_branch4_2_conv1_relu -23330=4,3,28,28,58 0=58 1=1 5=1 6=3364 9=1
ConvolutionDepthWise     branch4_2_conv2          1 1 branch4_2_conv1_branch4_2_conv1_relu branch4_2_conv2_branch4_2_conv2_scale -23330=4,3,28,28,58 0=58 1=3 4=1 5=1 6=522 7=58
Convolution              branch4_2_conv3          1 1 branch4_2_conv2_branch4_2_conv2_scale branch4_2_conv3_branch4_2_conv3_relu -23330=4,3,28,28,58 0=58 1=1 5=1 6=3364 9=1
Concat                   concat4                  2 1 branch4_1 branch4_2_conv3_branch4_2_conv3_relu concat4 -23330=4,3,28,28,116
ShuffleChannel           shuffle4                 1 1 concat4 shuffle4 -23330=4,3,28,28,116 0=2
Split                    splitncnn_1              1 2 shuffle4 shuffle4_splitncnn_0 shuffle4_splitncnn_1 -23330=8,3,28,28,116,3,28,28,116
ConvolutionDepthWise     branch5_1_conv1          1 1 shuffle4_splitncnn_1 branch5_1_conv1_branch5_1_conv1_scale -23330=4,3,14,14,116 0=116 1=3 3=2 4=1 5=1 6=1044 7=116
Convolution              branch5_1_conv2          1 1 branch5_1_conv1_branch5_1_conv1_scale branch5_1_conv2_branch5_1_conv2_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1
Convolution              branch5_2_conv1          1 1 shuffle4_splitncnn_0 branch5_2_conv1_branch5_2_conv1_relu -23330=4,3,28,28,116 0=116 1=1 5=1 6=13456 9=1
ConvolutionDepthWise     branch5_2_conv2          1 1 branch5_2_conv1_branch5_2_conv1_relu branch5_2_conv2_branch5_2_conv2_scale -23330=4,3,14,14,116 0=116 1=3 3=2 4=1 5=1 6=1044 7=116
Convolution              branch5_2_conv3          1 1 branch5_2_conv2_branch5_2_conv2_scale branch5_2_conv3_branch5_2_conv3_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1
Concat                   concat5                  2 1 branch5_1_conv2_branch5_1_conv2_relu branch5_2_conv3_branch5_2_conv3_relu concat5 -23330=4,3,14,14,232
ShuffleChannel           shuffle5                 1 1 concat5 shuffle5 -23330=4,3,14,14,232 0=2
Slice                    slice6                   1 2 shuffle5 branch6_1 branch6_2 -23330=8,3,14,14,116,3,14,14,116 -23300=2,116,-233
Convolution              branch6_2_conv1          1 1 branch6_2 branch6_2_conv1_branch6_2_conv1_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1
ConvolutionDepthWise     branch6_2_conv2          1 1 branch6_2_conv1_branch6_2_conv1_relu branch6_2_conv2_branch6_2_conv2_scale -23330=4,3,14,14,116 0=116 1=3 4=1 5=1 6=1044 7=116
Convolution              branch6_2_conv3          1 1 branch6_2_conv2_branch6_2_conv2_scale branch6_2_conv3_branch6_2_conv3_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1
Concat                   concat6                  2 1 branch6_1 branch6_2_conv3_branch6_2_conv3_relu concat6 -23330=4,3,14,14,232
ShuffleChannel           shuffle6                 1 1 concat6 shuffle6 -23330=4,3,14,14,232 0=2
Slice                    slice7                   1 2 shuffle6 branch7_1 branch7_2 -23330=8,3,14,14,116,3,14,14,116 -23300=2,116,-233
Convolution              branch7_2_conv1          1 1 branch7_2 branch7_2_conv1_branch7_2_conv1_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1
ConvolutionDepthWise     branch7_2_conv2          1 1 branch7_2_conv1_branch7_2_conv1_relu branch7_2_conv2_branch7_2_conv2_scale -23330=4,3,14,14,116 0=116 1=3 4=1 5=1 6=1044 7=116
Convolution              branch7_2_conv3          1 1 branch7_2_conv2_branch7_2_conv2_scale branch7_2_conv3_branch7_2_conv3_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1
Concat                   concat7                  2 1 branch7_1 branch7_2_conv3_branch7_2_conv3_relu concat7 -23330=4,3,14,14,232
ShuffleChannel           shuffle7                 1 1 concat7 shuffle7 -23330=4,3,14,14,232 0=2
Slice                    slice8                   1 2 shuffle7 branch8_1 branch8_2 -23330=8,3,14,14,116,3,14,14,116 -23300=2,116,-233
Convolution              branch8_2_conv1          1 1 branch8_2 branch8_2_conv1_branch8_2_conv1_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1
ConvolutionDepthWise     branch8_2_conv2          1 1 branch8_2_conv1_branch8_2_conv1_relu branch8_2_conv2_branch8_2_conv2_scale -23330=4,3,14,14,116 0=116 1=3 4=1 5=1 6=1044 7=116
Convolution              branch8_2_conv3          1 1 branch8_2_conv2_branch8_2_conv2_scale branch8_2_conv3_branch8_2_conv3_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1
Concat                   concat8                  2 1 branch8_1 branch8_2_conv3_branch8_2_conv3_relu concat8 -23330=4,3,14,14,232
ShuffleChannel           shuffle8                 1 1 concat8 shuffle8 -23330=4,3,14,14,232 0=2
Slice                    slice9                   1 2 shuffle8 branch9_1 branch9_2 -23330=8,3,14,14,116,3,14,14,116 -23300=2,116,-233
Convolution              branch9_2_conv1          1 1 branch9_2 branch9_2_conv1_branch9_2_conv1_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1
ConvolutionDepthWise     branch9_2_conv2          1 1 branch9_2_conv1_branch9_2_conv1_relu branch9_2_conv2_branch9_2_conv2_scale -23330=4,3,14,14,116 0=116 1=3 4=1 5=1 6=1044 7=116
Convolution              branch9_2_conv3          1 1 branch9_2_conv2_branch9_2_conv2_scale branch9_2_conv3_branch9_2_conv3_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1
Concat                   concat9                  2 1 branch9_1 branch9_2_conv3_branch9_2_conv3_relu concat9 -23330=4,3,14,14,232
ShuffleChannel           shuffle9                 1 1 concat9 shuffle9 -23330=4,3,14,14,232 0=2
Slice                    slice10                  1 2 shuffle9 branch10_1 branch10_2 -23330=8,3,14,14,116,3,14,14,116 -23300=2,116,-233
Convolution              branch10_2_conv1         1 1 branch10_2 branch10_2_conv1_branch10_2_conv1_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1
ConvolutionDepthWise     branch10_2_conv2         1 1 branch10_2_conv1_branch10_2_conv1_relu branch10_2_conv2_branch10_2_conv2_scale -23330=4,3,14,14,116 0=116 1=3 4=1 5=1 6=1044 7=116
Convolution              branch10_2_conv3         1 1 branch10_2_conv2_branch10_2_conv2_scale branch10_2_conv3_branch10_2_conv3_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1
Concat                   concat10                 2 1 branch10_1 branch10_2_conv3_branch10_2_conv3_relu concat10 -23330=4,3,14,14,232
ShuffleChannel           shuffle10                1 1 concat10 shuffle10 -23330=4,3,14,14,232 0=2
Slice                    slice11                  1 2 shuffle10 branch11_1 branch11_2 -23330=8,3,14,14,116,3,14,14,116 -23300=2,116,-233
Convolution              branch11_2_conv1         1 1 branch11_2 branch11_2_conv1_branch11_2_conv1_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1
ConvolutionDepthWise     branch11_2_conv2         1 1 branch11_2_conv1_branch11_2_conv1_relu branch11_2_conv2_branch11_2_conv2_scale -23330=4,3,14,14,116 0=116 1=3 4=1 5=1 6=1044 7=116
Convolution              branch11_2_conv3         1 1 branch11_2_conv2_branch11_2_conv2_scale branch11_2_conv3_branch11_2_conv3_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1
Concat                   concat11                 2 1 branch11_1 branch11_2_conv3_branch11_2_conv3_relu concat11 -23330=4,3,14,14,232
ShuffleChannel           shuffle11                1 1 concat11 shuffle11 -23330=4,3,14,14,232 0=2
Slice                    slice12                  1 2 shuffle11 branch12_1 branch12_2 -23330=8,3,14,14,116,3,14,14,116 -23300=2,116,-233
Convolution              branch12_2_conv1         1 1 branch12_2 branch12_2_conv1_branch12_2_conv1_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1
ConvolutionDepthWise     branch12_2_conv2         1 1 branch12_2_conv1_branch12_2_conv1_relu branch12_2_conv2_branch12_2_conv2_scale -23330=4,3,14,14,116 0=116 1=3 4=1 5=1 6=1044 7=116
Convolution              branch12_2_conv3         1 1 branch12_2_conv2_branch12_2_conv2_scale branch12_2_conv3_branch12_2_conv3_relu -23330=4,3,14,14,116 0=116 1=1 5=1 6=13456 9=1
Concat                   concat12                 2 1 branch12_1 branch12_2_conv3_branch12_2_conv3_relu concat12 -23330=4,3,14,14,232
ShuffleChannel           shuffle12                1 1 concat12 shuffle12 -23330=4,3,14,14,232 0=2
Split                    splitncnn_2              1 2 shuffle12 shuffle12_splitncnn_0 shuffle12_splitncnn_1 -23330=8,3,14,14,232,3,14,14,232
ConvolutionDepthWise     branch13_1_conv1         1 1 shuffle12_splitncnn_1 branch13_1_conv1_branch13_1_conv1_scale -23330=4,3,7,7,232 0=232 1=3 3=2 4=1 5=1 6=2088 7=232
Convolution              branch13_1_conv2         1 1 branch13_1_conv1_branch13_1_conv1_scale branch13_1_conv2_branch13_1_conv2_relu -23330=4,3,7,7,232 0=232 1=1 5=1 6=53824 9=1
Convolution              branch13_2_conv1         1 1 shuffle12_splitncnn_0 branch13_2_conv1_branch13_2_conv1_relu -23330=4,3,14,14,232 0=232 1=1 5=1 6=53824 9=1
ConvolutionDepthWise     branch13_2_conv2         1 1 branch13_2_conv1_branch13_2_conv1_relu branch13_2_conv2_branch13_2_conv2_scale -23330=4,3,7,7,232 0=232 1=3 3=2 4=1 5=1 6=2088 7=232
Convolution              branch13_2_conv3         1 1 branch13_2_conv2_branch13_2_conv2_scale branch13_2_conv3_branch13_2_conv3_relu -23330=4,3,7,7,232 0=232 1=1 5=1 6=53824 9=1
Concat                   concat13                 2 1 branch13_1_conv2_branch13_1_conv2_relu branch13_2_conv3_branch13_2_conv3_relu concat13 -23330=4,3,7,7,464
ShuffleChannel           shuffle13                1 1 concat13 shuffle13 -23330=4,3,7,7,464 0=2
Slice                    slice14                  1 2 shuffle13 branch14_1 branch14_2 -23330=8,3,7,7,232,3,7,7,232 -23300=2,232,-233
Convolution              branch14_2_conv1         1 1 branch14_2 branch14_2_conv1_branch14_2_conv1_relu -23330=4,3,7,7,232 0=232 1=1 5=1 6=53824 9=1
ConvolutionDepthWise     branch14_2_conv2         1 1 branch14_2_conv1_branch14_2_conv1_relu branch14_2_conv2_branch14_2_conv2_scale -23330=4,3,7,7,232 0=232 1=3 4=1 5=1 6=2088 7=232
Convolution              branch14_2_conv3         1 1 branch14_2_conv2_branch14_2_conv2_scale branch14_2_conv3_branch14_2_conv3_relu -23330=4,3,7,7,232 0=232 1=1 5=1 6=53824 9=1
Concat                   concat14                 2 1 branch14_1 branch14_2_conv3_branch14_2_conv3_relu concat14 -23330=4,3,7,7,464
ShuffleChannel           shuffle14                1 1 concat14 shuffle14 -23330=4,3,7,7,464 0=2
Slice                    slice15                  1 2 shuffle14 branch15_1 branch15_2 -23330=8,3,7,7,232,3,7,7,232 -23300=2,232,-233
Convolution              branch15_2_conv1         1 1 branch15_2 branch15_2_conv1_branch15_2_conv1_relu -23330=4,3,7,7,232 0=232 1=1 5=1 6=53824 9=1
ConvolutionDepthWise     branch15_2_conv2         1 1 branch15_2_conv1_branch15_2_conv1_relu branch15_2_conv2_branch15_2_conv2_scale -23330=4,3,7,7,232 0=232 1=3 4=1 5=1 6=2088 7=232
Convolution              branch15_2_conv3         1 1 branch15_2_conv2_branch15_2_conv2_scale branch15_2_conv3_branch15_2_conv3_relu -23330=4,3,7,7,232 0=232 1=1 5=1 6=53824 9=1
Concat                   concat15                 2 1 branch15_1 branch15_2_conv3_branch15_2_conv3_relu concat15 -23330=4,3,7,7,464
ShuffleChannel           shuffle15                1 1 concat15 shuffle15 -23330=4,3,7,7,464 0=2
Slice                    slice16                  1 2 shuffle15 branch16_1 branch16_2 -23330=8,3,7,7,232,3,7,7,232 -23300=2,232,-233
Convolution              branch16_2_conv1         1 1 branch16_2 branch16_2_conv1_branch16_2_conv1_relu -23330=4,3,7,7,232 0=232 1=1 5=1 6=53824 9=1
ConvolutionDepthWise     branch16_2_conv2         1 1 branch16_2_conv1_branch16_2_conv1_relu branch16_2_conv2_branch16_2_conv2_scale -23330=4,3,7,7,232 0=232 1=3 4=1 5=1 6=2088 7=232
Convolution              branch16_2_conv3         1 1 branch16_2_conv2_branch16_2_conv2_scale branch16_2_conv3_branch16_2_conv3_relu -23330=4,3,7,7,232 0=232 1=1 5=1 6=53824 9=1
Concat                   concat16                 2 1 branch16_1 branch16_2_conv3_branch16_2_conv3_relu concat16 -23330=4,3,7,7,464
ShuffleChannel           shuffle16                1 1 concat16 shuffle16 -23330=4,3,7,7,464 0=2
Convolution              conv5                    1 1 shuffle16 conv5_conv5_relu -23330=4,3,7,7,1024 0=1024 1=1 5=1 6=475136 9=1
Pooling                  pool_ave                 1 1 conv5_conv5_relu pool_ave -23330=4,1,1024,1,1 0=1 4=1
InnerProduct             fc1000                   1 1 pool_ave fc1000 -23330=4,1,1000,1,1 0=1000 1=1 2=1024000
Softmax                  prob                     1 1 fc1000 output -23330=4,1,1000,1,1


================================================
FILE: benchmark/squeezenet.param
================================================
7767517
48 56
Input                    data                     0 1 data -23330=4,3,227,227,3 0=227 1=227 2=3
Convolution              conv1                    1 1 data conv1_relu_conv1 -23330=4,3,113,113,64 0=64 1=3 3=2 5=1 6=1728 9=1
Pooling                  pool1                    1 1 conv1_relu_conv1 pool1 -23330=4,3,56,56,64 1=3 2=2
Convolution              fire2/squeeze1x1         1 1 pool1 fire2/squeeze1x1_fire2/relu_squeeze1x1 -23330=4,3,56,56,16 0=16 1=1 5=1 6=1024 9=1
Split                    splitncnn_0              1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 -23330=8,3,56,56,16,3,56,56,16
Convolution              fire2/expand1x1          1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1_fire2/relu_expand1x1 -23330=4,3,56,56,64 0=64 1=1 5=1 6=1024 9=1
Convolution              fire2/expand3x3          1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3_fire2/relu_expand3x3 -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=9216 9=1
Concat                   fire2/concat             2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat -23330=4,3,56,56,128
Convolution              fire3/squeeze1x1         1 1 fire2/concat fire3/squeeze1x1_fire3/relu_squeeze1x1 -23330=4,3,56,56,16 0=16 1=1 5=1 6=2048 9=1
Split                    splitncnn_1              1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 -23330=8,3,56,56,16,3,56,56,16
Convolution              fire3/expand1x1          1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1_fire3/relu_expand1x1 -23330=4,3,56,56,64 0=64 1=1 5=1 6=1024 9=1
Convolution              fire3/expand3x3          1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3_fire3/relu_expand3x3 -23330=4,3,56,56,64 0=64 1=3 4=1 5=1 6=9216 9=1
Concat                   fire3/concat             2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat -23330=4,3,56,56,128
Pooling                  pool3                    1 1 fire3/concat pool3 -23330=4,3,28,28,128 1=3 2=2
Convolution              fire4/squeeze1x1         1 1 pool3 fire4/squeeze1x1_fire4/relu_squeeze1x1 -23330=4,3,28,28,32 0=32 1=1 5=1 6=4096 9=1
Split                    splitncnn_2              1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 -23330=8,3,28,28,32,3,28,28,32
Convolution              fire4/expand1x1          1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1_fire4/relu_expand1x1 -23330=4,3,28,28,128 0=128 1=1 5=1 6=4096 9=1
Convolution              fire4/expand3x3          1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3_fire4/relu_expand3x3 -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=36864 9=1
Concat                   fire4/concat             2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat -23330=4,3,28,28,256
Convolution              fire5/squeeze1x1         1 1 fire4/concat fire5/squeeze1x1_fire5/relu_squeeze1x1 -23330=4,3,28,28,32 0=32 1=1 5=1 6=8192 9=1
Split                    splitncnn_3              1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 -23330=8,3,28,28,32,3,28,28,32
Convolution              fire5/expand1x1          1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1_fire5/relu_expand1x1 -23330=4,3,28,28,128 0=128 1=1 5=1 6=4096 9=1
Convolution              fire5/expand3x3          1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3_fire5/relu_expand3x3 -23330=4,3,28,28,128 0=128 1=3 4=1 5=1 6=36864 9=1
Concat                   fire5/concat             2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat -23330=4,3,28,28,256
Pooling                  pool5                    1 1 fire5/concat pool5 -23330=4,3,14,14,256 1=3 2=2
Convolution              fire6/squeeze1x1         1 1 pool5 fire6/squeeze1x1_fire6/relu_squeeze1x1 -23330=4,3,14,14,48 0=48 1=1 5=1 6=12288 9=1
Split                    splitncnn_4              1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 -23330=8,3,14,14,48,3,14,14,48
Convolution              fire6/expand1x1          1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1_fire6/relu_expand1x1 -23330=4,3,14,14,192 0=192 1=1 5=1 6=9216 9=1
Convolution              fire6/expand3x3          1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3_fire6/relu_expand3x3 -23330=4,3,14,14,192 0=192 1=3 4=1 5=1 6=82944 9=1
Concat                   fire6/concat             2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat -23330=4,3,14,14,384
Convolution              fire7/squeeze1x1         1 1 fire6/concat fire7/squeeze1x1_fire7/relu_squeeze1x1 -23330=4,3,14,14,48 0=48 1=1 5=1 6=18432 9=1
Split                    splitncnn_5              1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 -23330=8,3,14,14,48,3,14,14,48
Convolution              fire7/expand1x1          1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1_fire7/relu_expand1x1 -23330=4,3,14,14,192 0=192 1=1 5=1 6=9216 9=1
Convolution              fire7/expand3x3          1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3_fire7/relu_expand3x3 -23330=4,3,14,14,192 0=192 1=3 4=1 5=1 6=82944 9=1
Concat                   fire7/concat             2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat -23330=4,3,14,14,384
Convolution              fire8/squeeze1x1         1 1 fire7/concat fire8/squeeze1x1_fire8/relu_squeeze1x1 -23330=4,3,14,14,64 0=64 1=1 5=1 6=24576 9=1
Split                    splitncnn_6              1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 -23330=8,3,14,14,64,3,14,14,64
Convolution              fire8/expand1x1          1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1_fire8/relu_expand1x1 -23330=4,3,14,14,256 0=256 1=1 5=1 6=16384 9=1
Convolution              fire8/expand3x3          1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3_fire8/relu_expand3x3 -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=147456 9=1
Concat                   fire8/concat             2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat -23330=4,3,14,14,512
Convolution              fire9/squeeze1x1         1 1 fire8/concat fire9/squeeze1x1_fire9/relu_squeeze1x1 -23330=4,3,14,14,64 0=64 1=1 5=1 6=32768 9=1
Split                    splitncnn_7              1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 -23330=8,3,14,14,64,3,14,14,64
Convolution              fire9/expand1x1          1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1_fire9/relu_expand1x1 -23330=4,3,14,14,256 0=256 1=1 5=1 6=16384 9=1
Convolution              fire9/expand3x3          1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3_fire9/relu_expand3x3 -23330=4,3,14,14,256 0=256 1=3 4=1 5=1 6=147456 9=1
Concat                   fire9/concat             2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat_drop9 -23330=4,3,14,14,512
Convolution              conv10                   1 1 fire9/concat_drop9 conv10_relu_conv10 -23330=4,3,16,16,1000 0=1000 1=1 4=1 5=1 6=512000 9=1
Pooling                  pool10                   1 1 conv10_relu_conv10 pool10 -23330=4,1,1000,1,1 0=1 4=1
Softmax                  prob                     1 1 pool10 output -23330=4,1,1000,1,1


================================================
FILE: benchmark/squeezenet_int8.param
================================================
7767517
48 56
Input                    data                     0 1 data 0=227 1=227 2=3
Convolution              conv1                    1 1 data conv1_relu_conv1 0=64 1=3 3=2 5=1 6=1728 8=2 9=1
Pooling                  pool1                    1 1 conv1_relu_conv1 pool1 1=3 2=2
Convolution              fire2/squeeze1x1         1 1 pool1 fire2/squeeze1x1_fire2/relu_squeeze1x1 0=16 1=1 5=1 6=1024 8=102 9=1
Split                    splitncnn_0              1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
Convolution              fire2/expand1x1          1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1_fire2/relu_expand1x1 0=64 1=1 5=1 6=1024 8=2 9=1
Convolution              fire2/expand3x3          1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3_fire2/relu_expand3x3 0=64 1=3 4=1 5=1 6=9216 8=2 9=1
Concat                   fire2/concat             2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat
Convolution              fire3/squeeze1x1         1 1 fire2/concat fire3/squeeze1x1_fire3/relu_squeeze1x1 0=16 1=1 5=1 6=2048 8=102 9=1
Split                    splitncnn_1              1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
Convolution              fire3/expand1x1          1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1_fire3/relu_expand1x1 0=64 1=1 5=1 6=1024 8=2 9=1
Convolution              fire3/expand3x3          1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3_fire3/relu_expand3x3 0=64 1=3 4=1 5=1 6=9216 8=2 9=1
Concat                   fire3/concat             2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat
Pooling                  pool3                    1 1 fire3/concat pool3 1=3 2=2
Convolution              fire4/squeeze1x1         1 1 pool3 fire4/squeeze1x1_fire4/relu_squeeze1x1 0=32 1=1 5=1 6=4096 8=102 9=1
Split                    splitncnn_2              1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
Convolution              fire4/expand1x1          1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1_fire4/relu_expand1x1 0=128 1=1 5=1 6=4096 8=2 9=1
Convolution              fire4/expand3x3          1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3_fire4/relu_expand3x3 0=128 1=3 4=1 5=1 6=36864 8=2 9=1
Concat                   fire4/concat             2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat
Convolution              fire5/squeeze1x1         1 1 fire4/concat fire5/squeeze1x1_fire5/relu_squeeze1x1 0=32 1=1 5=1 6=8192 8=102 9=1
Split                    splitncnn_3              1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
Convolution              fire5/expand1x1          1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1_fire5/relu_expand1x1 0=128 1=1 5=1 6=4096 8=2 9=1
Convolution              fire5/expand3x3          1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3_fire5/relu_expand3x3 0=128 1=3 4=1 5=1 6=36864 8=2 9=1
Concat                   fire5/concat             2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat
Pooling                  pool5                    1 1 fire5/concat pool5 1=3 2=2
Convolution              fire6/squeeze1x1         1 1 pool5 fire6/squeeze1x1_fire6/relu_squeeze1x1 0=48 1=1 5=1 6=12288 8=102 9=1
Split                    splitncnn_4              1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
Convolution              fire6/expand1x1          1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1_fire6/relu_expand1x1 0=192 1=1 5=1 6=9216 8=2 9=1
Convolution              fire6/expand3x3          1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3_fire6/relu_expand3x3 0=192 1=3 4=1 5=1 6=82944 8=2 9=1
Concat                   fire6/concat             2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat
Convolution              fire7/squeeze1x1         1 1 fire6/concat fire7/squeeze1x1_fire7/relu_squeeze1x1 0=48 1=1 5=1 6=18432 8=102 9=1
Split                    splitncnn_5              1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
Convolution              fire7/expand1x1          1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1_fire7/relu_expand1x1 0=192 1=1 5=1 6=9216 8=2 9=1
Convolution              fire7/expand3x3          1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3_fire7/relu_expand3x3 0=192 1=3 4=1 5=1 6=82944 8=2 9=1
Concat                   fire7/concat             2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat
Convolution              fire8/squeeze1x1         1 1 fire7/concat fire8/squeeze1x1_fire8/relu_squeeze1x1 0=64 1=1 5=1 6=24576 8=102 9=1
Split                    splitncnn_6              1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
Convolution              fire8/expand1x1          1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1_fire8/relu_expand1x1 0=256 1=1 5=1 6=16384 8=2 9=1
Convolution              fire8/expand3x3          1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3_fire8/relu_expand3x3 0=256 1=3 4=1 5=1 6=147456 8=2 9=1
Concat                   fire8/concat             2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat
Convolution              fire9/squeeze1x1         1 1 fire8/concat fire9/squeeze1x1_fire9/relu_squeeze1x1 0=64 1=1 5=1 6=32768 8=102 9=1
Split                    splitncnn_7              1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
Convolution              fire9/expand1x1          1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1_fire9/relu_expand1x1 0=256 1=1 5=1 6=16384 8=2 9=1
Convolution              fire9/expand3x3          1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3_fire9/relu_expand3x3 0=256 1=3 4=1 5=1 6=147456 8=2 9=1
Concat                   fire9/concat             2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat_drop9
Convolution              conv10                   1 1 fire9/concat_drop9 conv10_relu_conv10 0=1000 1=1 4=1 5=1 6=512000 8=2 9=1
Pooling                  pool10                   1 1 conv10_relu_conv10 pool10 0=1 4=1
Softmax                  prob                     1 1 pool10 output


================================================
FILE: benchmark/squeezenet_ssd.param
================================================
7767517
119 152
Input                    data                     0 1 data -23330=4,3,300,300,3 0=300 1=300 2=3
Split                    splitncnn_0              1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6 -23330=28,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3,3,300,300,3
Convolution              conv1                    1 1 data_splitncnn_6 conv1_relu_conv1 -23330=4,3,149,149,64 0=64 1=3 3=2 5=1 6=1728 9=1
Pooling                  pool1                    1 1 conv1_relu_conv1 pool1 -23330=4,3,74,74,64 1=3 2=2
Convolution              fire2/squeeze1x1         1 1 pool1 fire2/squeeze1x1_fire2/relu_squeeze1x1 -23330=4,3,74,74,16 0=16 1=1 5=1 6=1024 9=1
Split                    splitncnn_1              1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 -23330=8,3,74,74,16,3,74,74,16
Convolution              fire2/expand1x1          1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1_fire2/relu_expand1x1 -23330=4,3,74,74,64 0=64 1=1 5=1 6=1024 9=1
Convolution              fire2/expand3x3          1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3_fire2/relu_expand3x3 -23330=4,3,74,74,64 0=64 1=3 4=1 5=1 6=9216 9=1
Concat                   fire2/concat             2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat -23330=4,3,74,74,128
Convolution              fire3/squeeze1x1         1 1 fire2/concat fire3/squeeze1x1_fire3/relu_squeeze1x1 -23330=4,3,74,74,16 0=16 1=1 5=1 6=2048 9=1
Split                    splitncnn_2              1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 -23330=8,3,74,74,16,3,74,74,16
Convolution              fire3/expand1x1          1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1_fire3/relu_expand1x1 -23330=4,3,74,74,64 0=64 1=1 5=1 6=1024 9=1
Convolution              fire3/expand3x3          1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3_fire3/relu_expand3x3 -23330=4,3,74,74,64 0=64 1=3 4=1 5=1 6=9216 9=1
Concat                   fire3/concat             2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat -23330=4,3,74,74,128
Pooling                  pool3                    1 1 fire3/concat pool3 -23330=4,3,37,37,128 1=3 2=2
Convolution              fire4/squeeze1x1         1 1 pool3 fire4/squeeze1x1_fire4/relu_squeeze1x1 -23330=4,3,37,37,32 0=32 1=1 5=1 6=4096 9=1
Split                    splitncnn_3              1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 -23330=8,3,37,37,32,3,37,37,32
Convolution              fire4/expand1x1          1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1_fire4/relu_expand1x1 -23330=4,3,37,37,128 0=128 1=1 5=1 6=4096 9=1
Convolution              fire4/expand3x3          1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3_fire4/relu_expand3x3 -23330=4,3,37,37,128 0=128 1=3 4=1 5=1 6=36864 9=1
Concat                   fire4/concat             2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat -23330=4,3,37,37,256
Convolution              fire5/squeeze1x1         1 1 fire4/concat fire5/squeeze1x1_fire5/relu_squeeze1x1 -23330=4,3,37,37,32 0=32 1=1 5=1 6=8192 9=1
Split                    splitncnn_4              1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 -23330=8,3,37,37,32,3,37,37,32
Convolution              fire5/expand1x1          1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1_fire5/relu_expand1x1 -23330=4,3,37,37,128 0=128 1=1 5=1 6=4096 9=1
Convolution              fire5/expand3x3          1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3_fire5/relu_expand3x3 -23330=4,3,37,37,128 0=128 1=3 4=1 5=1 6=36864 9=1
Concat                   fire5/concat             2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat -23330=4,3,37,37,256
Split                    splitncnn_5              1 2 fire5/concat fire5/concat_splitncnn_0 fire5/concat_splitncnn_1 -23330=8,3,37,37,256,3,37,37,256
Pooling                  pool5                    1 1 fire5/concat_splitncnn_1 pool5 -23330=4,3,18,18,256 1=3 2=2
Convolution              fire6/squeeze1x1         1 1 pool5 fire6/squeeze1x1_fire6/relu_squeeze1x1 -23330=4,3,18,18,48 0=48 1=1 5=1 6=12288 9=1
Split                    splitncnn_6              1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 -23330=8,3,18,18,48,3,18,18,48
Convolution              fire6/expand1x1          1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1_fire6/relu_expand1x1 -23330=4,3,18,18,192 0=192 1=1 5=1 6=9216 9=1
Convolution              fire6/expand3x3          1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3_fire6/relu_expand3x3 -23330=4,3,18,18,192 0=192 1=3 4=1 5=1 6=82944 9=1
Concat                   fire6/concat             2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat -23330=4,3,18,18,384
Convolution              fire7/squeeze1x1         1 1 fire6/concat fire7/squeeze1x1_fire7/relu_squeeze1x1 -23330=4,3,18,18,48 0=48 1=1 5=1 6=18432 9=1
Split                    splitncnn_7              1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 -23330=8,3,18,18,48,3,18,18,48
Convolution              fire7/expand1x1          1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1_fire7/relu_expand1x1 -23330=4,3,18,18,192 0=192 1=1 5=1 6=9216 9=1
Convolution              fire7/expand3x3          1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3_fire7/relu_expand3x3 -23330=4,3,18,18,192 0=192 1=3 4=1 5=1 6=82944 9=1
Concat                   fire7/concat             2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat -23330=4,3,18,18,384
Convolution              fire8/squeeze1x1         1 1 fire7/concat fire8/squeeze1x1_fire8/relu_squeeze1x1 -23330=4,3,18,18,64 0=64 1=1 5=1 6=24576 9=1
Split                    splitncnn_8              1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 -23330=8,3,18,18,64,3,18,18,64
Convolution              fire8/expand1x1          1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1_fire8/relu_expand1x1 -23330=4,3,18,18,256 0=256 1=1 5=1 6=16384 9=1
Convolution              fire8/expand3x3          1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3_fire8/relu_expand3x3 -23330=4,3,18,18,256 0=256 1=3 4=1 5=1 6=147456 9=1
Concat                   fire8/concat             2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat -23330=4,3,18,18,512
Convolution              fire9/squeeze1x1         1 1 fire8/concat fire9/squeeze1x1_fire9/relu_squeeze1x1 -23330=4,3,18,18,64 0=64 1=1 5=1 6=32768 9=1
Split                    splitncnn_9              1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 -23330=8,3,18,18,64,3,18,18,64
Convolution              fire9/expand1x1          1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1_fire9/relu_expand1x1 -23330=4,3,18,18,256 0=256 1=1 5=1 6=16384 9=1
Convolution              fire9/expand3x3          1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3_fire9/relu_expand3x3 -23330=4,3,18,18,256 0=256 1=3 4=1 5=1 6=147456 9=1
Concat                   fire9/concat             2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat -23330=4,3,18,18,512
Split                    splitncnn_10             1 4 fire9/concat fire9/concat_splitncnn_0 fire9/concat_splitncnn_1 fire9/concat_splitncnn_2 fire9/concat_splitncnn_3 -23330=16,3,18,18,512,3,18,18,512,3,18,18,512,3,18,18,512
Pooling                  pool9                    1 1 fire9/concat_splitncnn_3 pool9 -23330=4,3,9,9,512 1=3 2=2
Convolution              fire10/squeeze1x1        1 1 pool9 fire10/squeeze1x1_fire10/relu_squeeze1x1 -23330=4,3,9,9,96 0=96 1=1 5=1 6=49152 9=1
Split                    splitncnn_11             1 2 fire10/squeeze1x1_fire10/relu_squeeze1x1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 -23330=8,3,9,9,96,3,9,9,96
Convolution              fire10/expand1x1         1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 fire10/expand1x1_fire10/relu_expand1x1 -23330=4,3,9,9,384 0=384 1=1 5=1 6=36864 9=1
Convolution              fire10/expand3x3         1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/expand3x3_fire10/relu_expand3x3 -23330=4,3,9,9,384 0=384 1=3 4=1 5=1 6=331776 9=1
Concat                   fire10/concat            2 1 fire10/expand1x1_fire10/relu_expand1x1 fire10/expand3x3_fire10/relu_expand3x3 fire10/concat -23330=4,3,9,9,768
Split                    splitncnn_12             1 4 fire10/concat fire10/concat_splitncnn_0 fire10/concat_splitncnn_1 fire10/concat_splitncnn_2 fire10/concat_splitncnn_3 -23330=16,3,9,9,768,3,9,9,768,3,9,9,768,3,9,9,768
Pooling                  pool10                   1 1 fire10/concat_splitncnn_3 pool10 -23330=4,3,4,4,768 1=3 2=2
Convolution              fire11/squeeze1x1        1 1 pool10 fire11/squeeze1x1_fire11/relu_squeeze1x1 -23330=4,3,4,4,96 0=96 1=1 5=1 6=73728 9=1
Split                    splitncnn_13             1 2 fire11/squeeze1x1_fire11/relu_squeeze1x1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 -23330=8,3,4,4,96,3,4,4,96
Convolution              fire11/expand1x1         1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 fire11/expand1x1_fire11/relu_expand1x1 -23330=4,3,4,4,384 0=384 1=1 5=1 6=36864 9=1
Convolution              fire11/expand3x3         1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/expand3x3_fire11/relu_expand3x3 -23330=4,3,4,4,384 0=384 1=3 4=1 5=1 6=331776 9=1
Concat                   fire11/concat            2 1 fire11/expand1x1_fire11/relu_expand1x1 fire11/expand3x3_fire11/relu_expand3x3 fire11/concat -23330=4,3,4,4,768
Split                    splitncnn_14             1 4 fire11/concat fire11/concat_splitncnn_0 fire11/concat_splitncnn_1 fire11/concat_splitncnn_2 fire11/concat_splitncnn_3 -23330=16,3,4,4,768,3,4,4,768,3,4,4,768,3,4,4,768
Convolution              conv12_1                 1 1 fire11/concat_splitncnn_3 conv12_1_conv12_1/relu -23330=4,3,4,4,128 0=128 1=1 5=1 6=98304 9=1
Convolution              conv12_2                 1 1 conv12_1_conv12_1/relu conv12_2_conv12_2/relu -23330=4,3,2,2,256 0=256 1=3 3=2 4=1 5=1 6=294912 9=1
Split                    splitncnn_15             1 4 conv12_2_conv12_2/relu conv12_2_conv12_2/relu_splitncnn_0 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_conv12_2/relu_splitncnn_3 -23330=16,3,2,2,256,3,2,2,256,3,2,2,256,3,2,2,256
Convolution              conv13_1                 1 1 conv12_2_conv12_2/relu_splitncnn_3 conv13_1_conv13_1/relu -23330=4,3,2,2,64 0=64 1=1 5=1 6=16384 9=1
Convolution              conv13_2                 1 1 conv13_1_conv13_1/relu conv13_2_conv13_2/relu -23330=4,3,1,1,128 0=128 1=3 3=2 4=1 5=1 6=73728 9=1
Split                    splitncnn_16             1 3 conv13_2_conv13_2/relu conv13_2_conv13_2/relu_splitncnn_0 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_conv13_2/relu_splitncnn_2 -23330=12,3,1,1,128,3,1,1,128,3,1,1,128
BatchNorm                fire5/bn                 1 1 fire5/concat_splitncnn_0 fire5/normal_fire5/scale -23330=4,3,37,37,256 0=256
Split                    splitncnn_17             1 3 fire5/normal_fire5/scale fire5/normal_fire5/scale_splitncnn_0 fire5/normal_fire5/scale_splitncnn_1 fire5/normal_fire5/scale_splitncnn_2 -23330=12,3,37,37,256,3,37,37,256,3,37,37,256
Convolution              fire5_mbox_loc           1 1 fire5/normal_fire5/scale_splitncnn_2 fire5_mbox_loc -23330=4,3,37,37,16 0=16 1=3 4=1 5=1 6=36864
Permute                  fire5_mbox_loc_perm      1 1 fire5_mbox_loc fire5_mbox_loc_perm -23330=4,3,16,37,37 0=3
Flatten                  fire5_mbox_loc_flat      1 1 fire5_mbox_loc_perm fire5_mbox_loc_flat -23330=4,1,21904,1,1
Convolution              fire5_mbox_conf          1 1 fire5/normal_fire5/scale_splitncnn_1 fire5_mbox_conf -23330=4,3,37,37,84 0=84 1=3 4=1 5=1 6=193536
Permute                  fire5_mbox_conf_perm     1 1 fire5_mbox_conf fire5_mbox_conf_perm -23330=4,3,84,37,37 0=3
Flatten                  fire5_mbox_conf_flat     1 1 fire5_mbox_conf_perm fire5_mbox_conf_flat -23330=4,1,114996,1,1
PriorBox                 fire5_mbox_priorbox      2 1 fire5/normal_fire5/scale_splitncnn_0 data_splitncnn_5 fire5_mbox_priorbox -23330=4,2,21904,2,1 -23300=1,2.100000e+01 -23301=1,4.500000e+01 -23302=1,2.000000e+00 9=-233 10=-233 11=8.000000e+00 12=8.000000e+00 13=5.000000e-01
Convolution              fire9_mbox_loc           1 1 fire9/concat_splitncnn_2 fire9_mbox_loc -23330=4,3,18,18,24 0=24 1=3 4=1 5=1 6=110592
Permute                  fire9_mbox_loc_perm      1 1 fire9_mbox_loc fire9_mbox_loc_perm -23330=4,3,24,18,18 0=3
Flatten                  fire9_mbox_loc_flat      1 1 fire9_mbox_loc_perm fire9_mbox_loc_flat -23330=4,1,7776,1,1
Convolution              fire9_mbox_conf          1 1 fire9/concat_splitncnn_1 fire9_mbox_conf -23330=4,3,18,18,126 0=126 1=3 4=1 5=1 6=580608
Permute                  fire9_mbox_conf_perm     1 1 fire9_mbox_conf fire9_mbox_conf_perm -23330=4,3,126,18,18 0=3
Flatten                  fire9_mbox_conf_flat     1 1 fire9_mbox_conf_perm fire9_mbox_conf_flat -23330=4,1,40824,1,1
PriorBox                 fire9_mbox_priorbox      2 1 fire9/concat_splitncnn_0 data_splitncnn_4 fire9_mbox_priorbox -23330=4,2,7776,2,1 -23300=1,4.500000e+01 -23301=1,9.900000e+01 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 11=1.600000e+01 12=1.600000e+01 13=5.000000e-01
Convolution              fire10_mbox_loc          1 1 fire10/concat_splitncnn_2 fire10_mbox_loc -23330=4,3,9,9,24 0=24 1=3 4=1 5=1 6=165888
Permute                  fire10_mbox_loc_perm     1 1 fire10_mbox_loc fire10_mbox_loc_perm -23330=4,3,24,9,9 0=3
Flatten                  fire10_mbox_loc_flat     1 1 fire10_mbox_loc_perm fire10_mbox_loc_flat -23330=4,1,1944,1,1
Convolution              fire10_mbox_conf         1 1 fire10/concat_splitncnn_1 fire10_mbox_conf -23330=4,3,9,9,126 0=126 1=3 4=1 5=1 6=870912
Permute                  fire10_mbox_conf_perm    1 1 fire10_mbox_conf fire10_mbox_conf_perm -23330=4,3,126,9,9 0=3
Flatten                  fire10_mbox_conf_flat    1 1 fire10_mbox_conf_perm fire10_mbox_conf_flat -23330=4,1,10206,1,1
PriorBox                 fire10_mbox_priorbox     2 1 fire10/concat_splitncnn_0 data_splitncnn_3 fire10_mbox_priorbox -23330=4,2,1944,2,1 -23300=1,9.900000e+01 -23301=1,1.530000e+02 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 11=3.200000e+01 12=3.200000e+01 13=5.000000e-01
Convolution              fire11_mbox_loc          1 1 fire11/concat_splitncnn_2 fire11_mbox_loc -23330=4,3,4,4,24 0=24 1=3 4=1 5=1 6=165888
Permute                  fire11_mbox_loc_perm     1 1 fire11_mbox_loc fire11_mbox_loc_perm -23330=4,3,24,4,4 0=3
Flatten                  fire11_mbox_loc_flat     1 1 fire11_mbox_loc_perm fire11_mbox_loc_flat -23330=4,1,384,1,1
Convolution              fire11_mbox_conf         1 1 fire11/concat_splitncnn_1 fire11_mbox_conf -23330=4,3,4,4,126 0=126 1=3 4=1 5=1 6=870912
Permute                  fire11_mbox_conf_perm    1 1 fire11_mbox_conf fire11_mbox_conf_perm -23330=4,3,126,4,4 0=3
Flatten                  fire11_mbox_conf_flat    1 1 fire11_mbox_conf_perm fire11_mbox_conf_flat -23330=4,1,2016,1,1
PriorBox                 fire11_mbox_priorbox     2 1 fire11/concat_splitncnn_0 data_splitncnn_2 fire11_mbox_priorbox -23330=4,2,384,2,1 -23300=1,1.530000e+02 -23301=1,2.070000e+02 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 11=6.400000e+01 12=6.400000e+01 13=5.000000e-01
Convolution              conv12_2_mbox_loc        1 1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_mbox_loc -23330=4,3,2,2,24 0=24 1=3 4=1 5=1 6=55296
Permute                  conv12_2_mbox_loc_perm   1 1 conv12_2_mbox_loc conv12_2_mbox_loc_perm -23330=4,3,24,2,2 0=3
Flatten                  conv12_2_mbox_loc_flat   1 1 conv12_2_mbox_loc_perm conv12_2_mbox_loc_flat -23330=4,1,96,1,1
Convolution              conv12_2_mbox_conf       1 1 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_mbox_conf -23330=4,3,2,2,126 0=126 1=3 4=1 5=1 6=290304
Permute                  conv12_2_mbox_conf_perm  1 1 conv12_2_mbox_conf conv12_2_mbox_conf_perm -23330=4,3,126,2,2 0=3
Flatten                  conv12_2_mbox_conf_flat  1 1 conv12_2_mbox_conf_perm conv12_2_mbox_conf_flat -23330=4,1,504,1,1
PriorBox                 conv12_2_mbox_priorbox   2 1 conv12_2_conv12_2/relu_splitncnn_0 data_splitncnn_1 conv12_2_mbox_priorbox -23330=4,2,96,2,1 -23300=1,2.070000e+02 -23301=1,2.610000e+02 -23302=2,2.000000e+00,3.000000e+00 9=-233 10=-233 11=1.000000e+02 12=1.000000e+02 13=5.000000e-01
Convolution              conv13_2_mbox_loc        1 1 conv13_2_conv13_2/relu_splitncnn_2 conv13_2_mbox_loc -23330=4,3,1,1,16 0=16 1=3 4=1 5=1 6=18432
Permute                  conv13_2_mbox_loc_perm   1 1 conv13_2_mbox_loc conv13_2_mbox_loc_perm -23330=4,3,16,1,1 0=3
Flatten                  conv13_2_mbox_loc_flat   1 1 conv13_2_mbox_loc_perm conv13_2_mbox_loc_flat -23330=4,1,16,1,1
Convolution              conv13_2_mbox_conf       1 1 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_mbox_conf -23330=4,3,1,1,84 0=84 1=3 4=1 5=1 6=96768
Permute                  conv13_2_mbox_conf_perm  1 1 conv13_2_mbox_conf conv13_2_mbox_conf_perm -23330=4,3,84,1,1 0=3
Flatten                  conv13_2_mbox_conf_flat  1 1 conv13_2_mbox_conf_perm conv13_2_mbox_conf_flat -23330=4,1,84,1,1
PriorBox                 conv13_2_mbox_priorbox   2 1 conv13_2_conv13_2/relu_splitncnn_0 data_splitncnn_0 conv13_2_mbox_priorbox -23330=4,2,16,2,1 -23300=1,2.610000e+02 -23301=1,3.150000e+02 -23302=1,2.000000e+00 9=-233 10=-233 11=3.000000e+02 12=3.000000e+02 13=5.000000e-01
Concat                   mbox_loc                 6 1 fire5_mbox_loc_flat fire9_mbox_loc_flat fire10_mbox_loc_flat fire11_mbox_loc_flat conv12_2_mbox_loc_flat conv13_2_mbox_loc_flat mbox_loc -23330=4,1,32120,1,1
Concat                   mbox_conf                6 1 fire5_mbox_conf_flat fire9_mbox_conf_flat fire10_mbox_conf_flat fire11_mbox_conf_flat conv12_2_mbox_conf_flat conv13_2_mbox_conf_flat mbox_conf -23330=4,1,168630,1,1
Concat                   mbox_priorbox            6 1 fire5_mbox_priorbox fire9_mbox_priorbox fire10_mbox_priorbox fire11_mbox_priorbox conv12_2_mbox_priorbox conv13_2_mbox_priorbox mbox_priorbox -23330=4,2,32120,2,1 0=1
Reshape                  mbox_conf_reshape        1 1 mbox_conf mbox_conf_reshape -23330=4,2,21,8030,1 0=21 1=-1
Softmax                  mbox_conf_softmax        1 1 mbox_conf_reshape mbox_conf_softmax -23330=4,2,21,8030,1 0=1 1=1
Flatten                  mbox_conf_flatten        1 1 mbox_conf_softmax mbox_conf_flatten -23330=4,1,168630,1,1
DetectionOutput          detection_out            3 1 mbox_loc mbox_conf_flatten mbox_priorbox output 0=21 1=4.500000e-01 2=100 4=2.500000e-01


================================================
FILE: benchmark/squeezenet_ssd_int8.param
================================================
7767517
119 152
Input                    data                     0 1 data 0=300 1=300 2=3
Split                    splitncnn_0              1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6
Convolution              conv1                    1 1 data_splitncnn_6 conv1_relu_conv1 0=64 1=3 3=2 5=1 6=1728 8=2 9=1
Pooling                  pool1                    1 1 conv1_relu_conv1 pool1 1=3 2=2
Convolution              fire2/squeeze1x1         1 1 pool1 fire2/squeeze1x1_fire2/relu_squeeze1x1 0=16 1=1 5=1 6=1024 8=102 9=1
Split                    splitncnn_1              1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
Convolution              fire2/expand1x1          1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1_fire2/relu_expand1x1 0=64 1=1 5=1 6=1024 8=2 9=1
Convolution              fire2/expand3x3          1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3_fire2/relu_expand3x3 0=64 1=3 4=1 5=1 6=9216 8=2 9=1
Concat                   fire2/concat             2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat
Convolution              fire3/squeeze1x1         1 1 fire2/concat fire3/squeeze1x1_fire3/relu_squeeze1x1 0=16 1=1 5=1 6=2048 8=102 9=1
Split                    splitncnn_2              1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
Convolution              fire3/expand1x1          1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1_fire3/relu_expand1x1 0=64 1=1 5=1 6=1024 8=2 9=1
Convolution              fire3/expand3x3          1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3_fire3/relu_expand3x3 0=64 1=3 4=1 5=1 6=9216 8=2 9=1
Concat                   fire3/concat             2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat
Pooling                  pool3                    1 1 fire3/concat pool3 1=3 2=2
Convolution              fire4/squeeze1x1         1 1 pool3 fire4/squeeze1x1_fire4/relu_squeeze1x1 0=32 1=1 5=1 6=4096 8=102 9=1
Split                    splitncnn_3              1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
Convolution              fire4/expand1x1          1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1_fire4/relu_expand1x1 0=128 1=1 5=1 6=4096 8=2 9=1
Convolution              fire4/expand3x3          1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3_fire4/relu_expand3x3 0=128 1=3 4=1 5=1 6=36864 8=2 9=1
Concat                   fire4/concat             2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat
Convolution              fire5/squeeze1x1         1 1 fire4/concat fire5/squeeze1x1_fire5/relu_squeeze1x1 0=32 1=1 5=1 6=8192 8=102 9=1
Split                    splitncnn_4              1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
Convolution              fire5/expand1x1          1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1_fire5/relu_expand1x1 0=128 1=1 5=1 6=4096 8=2 9=1
Convolution              fire5/expand3x3          1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3_fire5/relu_expand3x3 0=128 1=3 4=1 5=1 6=36864 8=2 9=1
Concat                   fire5/concat             2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat
Split                    splitncnn_5              1 2 fire5/concat fire5/concat_splitncnn_0 fire5/concat_splitncnn_1
Pooling                  pool5                    1 1 fire5/concat_splitncnn_1 pool5 1=3 2=2
Convolution              fire6/squeeze1x1         1 1 pool5 fire6/squeeze1x1_fire6/relu_squeeze1x1 0=48 1=1 5=1 6=12288 8=102 9=1
Split                    splitncnn_6              1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
Convolution              fire6/expand1x1          1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1_fire6/relu_expand1x1 0=192 1=1 5=1 6=9216 8=2 9=1
Convolution              fire6/expand3x3          1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3_fire6/relu_expand3x3 0=192 1=3 4=1 5=1 6=82944 8=2 9=1
Concat                   fire6/concat             2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat
Convolution              fire7/squeeze1x1         1 1 fire6/concat fire7/squeeze1x1_fire7/relu_squeeze1x1 0=48 1=1 5=1 6=18432 8=102 9=1
Split                    splitncnn_7              1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
Convolution              fire7/expand1x1          1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1_fire7/relu_expand1x1 0=192 1=1 5=1 6=9216 8=2 9=1
Convolution              fire7/expand3x3          1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3_fire7/relu_expand3x3 0=192 1=3 4=1 5=1 6=82944 8=2 9=1
Concat                   fire7/concat             2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat
Convolution              fire8/squeeze1x1         1 1 fire7/concat fire8/squeeze1x1_fire8/relu_squeeze1x1 0=64 1=1 5=1 6=24576 8=102 9=1
Split                    splitncnn_8              1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
Convolution              fire8/expand1x1          1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1_fire8/relu_expand1x1 0=256 1=1 5=1 6=16384 8=2 9=1
Convolution              fire8/expand3x3          1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3_fire8/relu_expand3x3 0=256 1=3 4=1 5=1 6=147456 8=2 9=1
Concat                   fire8/concat             2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat
Convolution              fire9/squeeze1x1         1 1 fire8/concat fire9/squeeze1x1_fire9/relu_squeeze1x1 0=64 1=1 5=1 6=32768 8=102 9=1
Split                    splitncnn_9              1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
Convolution              fire9/expand1x1          1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1_fire9/relu_expand1x1 0=256 1=1 5=1 6=16384 8=2 9=1
Convolution              fire9/expand3x3          1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3_fire9/relu_expand3x3 0=256 1=3 4=1 5=1 6=147456 8=2 9=1
Concat                   fire9/concat             2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat
Split                    splitncnn_10             1 4 fire9/concat fire9/concat_splitncnn_0 fire9/concat_splitncnn_1 fire9/concat_splitncnn_2 fire9/concat_splitncnn_3
Pooling                  pool9                    1 1 fire9/concat_splitncnn_3 pool9 1=3 2=2
Convolution              fire10/squeeze1x1        1 1 pool9 fire10/squeeze1x1_fire10/relu_squeeze1x1 0=96 1=1 5=1 6=49152 8=102 9=1
Split                    splitncnn_11             1 2 fire10/squeeze1x1_fire10/relu_squeeze1x1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1
Convolution              fire10/expand1x1         1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 fire10/expand1x1_fire10/relu_expand1x1 0=384 1=1 5=1 6=36864 8=2 9=1
Convolution              fire10/expand3x3         1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/expand3x3_fire10/relu_expand3x3 0=384 1=3 4=1 5=1 6=331776 8=2 9=1
Concat                   fire10/concat            2 1 fire10/expand1x1_fire10/relu_expand1x1 fire10/expand3x3_fire10/relu_expand3x3 fire10/concat
Split                    splitncnn_12             1 4 fire10/concat fire10/concat_splitncnn_0 fire10/concat_splitncnn_1 fire10/concat_splitncnn_2 fire10/concat_splitncnn_3
Pooling                  pool10                   1 1 fire10/concat_splitncnn_3 pool10 1=3 2=2
Convolution              fire11/squeeze1x1        1 1 pool10 fire11/squeeze1x1_fire11/relu_squeeze1x1 0=96 1=1 5=1 6=73728 8=102 9=1
Split                    splitncnn_13             1 2 fire11/squeeze1x1_fire11/relu_squeeze1x1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1
Convolution              fire11/expand1x1         1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 fire11/expand1x1_fire11/relu_expand1x1 0=384 1=1 5=1 6=36864 8=2 9=1
Convolution              fire11/expand3x3         1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/expand3x3_fire11/relu_expand3x3 0=384 1=3 4=1 5=1 6=331776 8=2 9=1
Concat                   fire11/concat            2 1 fire11/expand1x1_fire11/relu_expand1x1 fire11/expand3x3_fire11/relu_expand3x3 fire11/concat
Split                    splitncnn_14             1 4 fire11/concat fire11/concat_splitncnn_0 fire11/concat_splitncnn_1 fire11/concat_splitncnn_2 fire11/concat_splitncnn_3
Convolution              conv12_1                 1 1 fire11/concat_splitncnn_3 conv12_1_conv12_1/relu 0=128 1=1 5=1 6=98304 8=102 9=1
Convolution              conv12_2                 1 1 conv12_1_conv12_1/relu conv12_2_conv12_2/relu 0=256 1=3 3=2 4=1 5=1 6=294912 8=2 9=1
Split                    splitncnn_15             1 4 conv12_2_conv12_2/relu conv12_2_conv12_2/relu_splitncnn_0 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_conv12_2/relu_splitncnn_3
Convolution              conv13_1                 1 1 conv12_2_conv12_2/relu_splitncnn_3 conv13_1_conv13_1/relu 0=64 1=1 5=1 6=16384 8=102 9=1
Convolution              conv13_2                 1 1 conv13_1_conv13_1/relu conv13_2_conv13_2/relu 0=128 1=3 3=2 4=1 5=1 6=73728 8=2 9=1
Split                    splitncnn_16             1 3 conv13_2_conv13_2/relu conv13_2_conv13_2/relu_splitncnn_0 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_conv13_2/relu_splitncnn_2
BatchNorm                fire5/bn                 1 1 fire5/concat_splitncnn_0 fire5/normal_fire5/scale 0=256
Split                    splitncnn_17             1 3 fire5/normal_fire5/scale fire5/normal_fire5/scale_splitncnn_0 fire5/normal_fire5/scale_splitncnn_1 fire5/normal_fire5/scale_splitncnn_2
Convolution              fire5_mbox_loc           1 1 fire5/normal_fire5/scale_splitncnn_2 fire5_mbox_loc 0=16 1=3 4=1 5=1 6=36864 8=2
Permute                  fire5_mbox_loc_perm      1 1 fire5_mbox_loc fire5_mbox_loc_perm 0=3
Flatten                  fire5_mbox_loc_flat      1 1 fire5_mbox_loc_perm fire5_mbox_loc_flat
Convolution              fire5_mbox_conf          1 1 fire5/normal_fire5/scale_splitncnn_1 fire5_mbox_conf 0=84 1=3 4=1 5=1 6=193536 8=2
Permute                  fire5_mbox_conf_perm     1 1 fire5_mbox_conf fire5_mbox_conf_perm 0=3
Flatten                  fire5_mbox_conf_flat     1 1 fire5_mbox_conf_perm fire5_mbox_conf_flat
PriorBox                 fire5_mbox_priorbox      2 1 fire5/normal_fire5/scale_splitncnn_0 data_splitncnn_5 fire5_mbox_priorbox -23300=1,21.000000 -23301=1,45.000000 -23302=1,2.000000 9=-233 10=-233 11=8.000000 12=8.000000 13=0.500000
Convolution              fire9_mbox_loc           1 1 fire9/concat_splitncnn_2 fire9_mbox_loc 0=24 1=3 4=1 5=1 6=110592 8=2
Permute                  fire9_mbox_loc_perm      1 1 fire9_mbox_loc fire9_mbox_loc_perm 0=3
Flatten                  fire9_mbox_loc_flat      1 1 fire9_mbox_loc_perm fire9_mbox_loc_flat
Convolution              fire9_mbox_conf          1 1 fire9/concat_splitncnn_1 fire9_mbox_conf 0=126 1=3 4=1 5=1 6=580608 8=2
Permute                  fire9_mbox_conf_perm     1 1 fire9_mbox_conf fire9_mbox_conf_perm 0=3
Flatten                  fire9_mbox_conf_flat     1 1 fire9_mbox_conf_perm fire9_mbox_conf_flat
PriorBox                 fire9_mbox_priorbox      2 1 fire9/concat_splitncnn_0 data_splitncnn_4 fire9_mbox_priorbox -23300=1,45.000000 -23301=1,99.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=16.000000 12=16.000000 13=0.500000
Convolution              fire10_mbox_loc          1 1 fire10/concat_splitncnn_2 fire10_mbox_loc 0=24 1=3 4=1 5=1 6=165888 8=2
Permute                  fire10_mbox_loc_perm     1 1 fire10_mbox_loc fire10_mbox_loc_perm 0=3
Flatten                  fire10_mbox_loc_flat     1 1 fire10_mbox_loc_perm fire10_mbox_loc_flat
Convolution              fire10_mbox_conf         1 1 fire10/concat_splitncnn_1 fire10_mbox_conf 0=126 1=3 4=1 5=1 6=870912 8=2
Permute                  fire10_mbox_conf_perm    1 1 fire10_mbox_conf fire10_mbox_conf_perm 0=3
Flatten                  fire10_mbox_conf_flat    1 1 fire10_mbox_conf_perm fire10_mbox_conf_flat
PriorBox                 fire10_mbox_priorbox     2 1 fire10/concat_splitncnn_0 data_splitncnn_3 fire10_mbox_priorbox -23300=1,99.000000 -23301=1,153.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=32.000000 12=32.000000 13=0.500000
Convolution              fire11_mbox_loc          1 1 fire11/concat_splitncnn_2 fire11_mbox_loc 0=24 1=3 4=1 5=1 6=165888 8=2
Permute                  fire11_mbox_loc_perm     1 1 fire11_mbox_loc fire11_mbox_loc_perm 0=3
Flatten                  fire11_mbox_loc_flat     1 1 fire11_mbox_loc_perm fire11_mbox_loc_flat
Convolution              fire11_mbox_conf         1 1 fire11/concat_splitncnn_1 fire11_mbox_conf 0=126 1=3 4=1 5=1 6=870912 8=2
Permute                  fire11_mbox_conf_perm    1 1 fire11_mbox_conf fire11_mbox_conf_perm 0=3
Flatten                  fire11_mbox_conf_flat    1 1 fire11_mbox_conf_perm fire11_mbox_conf_flat
PriorBox                 fire11_mbox_priorbox     2 1 fire11/concat_splitncnn_0 data_splitncnn_2 fire11_mbox_priorbox -23300=1,153.000000 -23301=1,207.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=64.000000 12=64.000000 13=0.500000
Convolution              conv12_2_mbox_loc        1 1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_mbox_loc 0=24 1=3 4=1 5=1 6=55296 8=2
Permute                  conv12_2_mbox_loc_perm   1 1 conv12_2_mbox_loc conv12_2_mbox_loc_perm 0=3
Flatten                  conv12_2_mbox_loc_flat   1 1 conv12_2_mbox_loc_perm conv12_2_mbox_loc_flat
Convolution              conv12_2_mbox_conf       1 1 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_mbox_conf 0=126 1=3 4=1 5=1 6=290304 8=2
Permute                  conv12_2_mbox_conf_perm  1 1 conv12_2_mbox_conf conv12_2_mbox_conf_perm 0=3
Flatten                  conv12_2_mbox_conf_flat  1 1 conv12_2_mbox_conf_perm conv12_2_mbox_conf_flat
PriorBox                 conv12_2_mbox_priorbox   2 1 conv12_2_conv12_2/relu_splitncnn_0 data_splitncnn_1 conv12_2_mbox_priorbox -23300=1,207.000000 -23301=1,261.000000 -23302=2,2.000000,3.000000 9=-233 10=-233 11=100.000000 12=100.000000 13=0.500000
Convolution              conv13_2_mbox_loc        1 1 conv13_2_conv13_2/relu_splitncnn_2 conv13_2_mbox_loc 0=16 1=3 4=1 5=1 6=18432 8=2
Permute                  conv13_2_mbox_loc_perm   1 1 conv13_2_mbox_loc conv13_2_mbox_loc_perm 0=3
Flatten                  conv13_2_mbox_loc_flat   1 1 conv13_2_mbox_loc_perm conv13_2_mbox_loc_flat
Convolution              conv13_2_mbox_conf       1 1 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_mbox_conf 0=84 1=3 4=1 5=1 6=96768 8=2
Permute                  conv13_2_mbox_conf_perm  1 1 conv13_2_mbox_conf conv13_2_mbox_conf_perm 0=3
Flatten                  conv13_2_mbox_conf_flat  1 1 conv13_2_mbox_conf_perm conv13_2_mbox_conf_flat
PriorBox                 conv13_2_mbox_priorbox   2 1 conv13_2_conv13_2/relu_splitncnn_0 data_splitncnn_0 conv13_2_mbox_priorbox -23300=1,261.000000 -23301=1,315.000000 -23302=1,2.000000 9=-233 10=-233 11=300.000000 12=300.000000 13=0.500000
Concat                   mbox_loc                 6 1 fire5_mbox_loc_flat fire9_mbox_loc_flat fire10_mbox_loc_flat fire11_mbox_loc_flat conv12_2_mbox_loc_flat conv13_2_mbox_loc_flat mbox_loc
Concat                   mbox_conf                6 1 fire5_mbox_conf_flat fire9_mbox_conf_flat fire10_mbox_conf_flat fire11_mbox_conf_flat conv12_2_mbox_conf_flat conv13_2_mbox_conf_flat mbox_conf
Concat                   mbox_priorbox            6 1 fire5_mbox_priorbox fire9_mbox_priorbox fire10_mbox_priorbox fire11_mbox_priorbox conv12_2_mbox_priorbox conv13_2_mbox_priorbox mbox_priorbox 0=1
Reshape                  mbox_conf_reshape        1 1 mbox_conf mbox_conf_reshape 0=21 1=-1
Softmax                  mbox_conf_softmax        1 1 mbox_conf_reshape mbox_conf_softmax 0=1 1=1
Flatten                  mbox_conf_flatten        1 1 mbox_conf_softmax mbox_conf_flatten
DetectionOutput          detection_out            3 1 mbox_loc mbox_conf_flatten mbox_priorbox output 0=21 1=0.450000 2=100 4=0.250000


================================================
FILE: benchmark/vgg16.param
================================================
7767517
23 23
Input                    data                     0 1 data -23330=4,3,224,224,3 0=224 1=224 2=3
Convolution              conv1_1                  1 1 data conv1_1_relu1_1 -23330=4,3,224,224,64 0=64 1=3 4=1 5=1 6=1728 9=1
Convolution              conv1_2                  1 1 conv1_1_relu1_1 conv1_2_relu1_2 -23330=4,3,224,224,64 0=64 1=3 4=1 5=1 6=36864 9=1
Pooling                  pool1                    1 1 conv1_2_relu1_2 pool1 -23330=4,3,112,112,64 1=2 2=2
Convolution              conv2_1                  1 1 pool1 conv2_1_relu2_1 -23330=4,3,112,112,128 0=128 1=3 4=1 5=1 6=73728 9=1
Convolution              conv2_2                  1 1 conv2_1_relu2_1 conv2_2_relu2_2 -23330=4,3,112,112,128 0=128 1=3 4=1 5=1 6=147456 9=1
Pooling                  pool2                    1 1 conv2_2_relu2_2 pool2 -23330=4,3,56,56,128 1=2 2=2
Convolution              conv3_1                  1 1 pool2 conv3_1_relu3_1 -23330=4,3,56,56,256 0=256 1=3 4=1 5=1 6=294912 9=1
Convolution              conv3_2                  1 1 conv3_1_relu3_1 conv3_2_relu3_2 -23330=4,3,56,56,256 0=256 1=3 4=1 5=1 6=589824 9=1
Convolution              conv3_3                  1 1 conv3_2_relu3_2 conv3_3_relu3_3 -23330=4,3,56,56,256 0=256 1=3 4=1 5=1 6=589824 9=1
Pooling                  pool3                    1 1 conv3_3_relu3_3 pool3 -23330=4,3,28,28,256 1=2 2=2
Convolution              conv4_1                  1 1 pool3 conv4_1_relu4_1 -23330=4,3,28,28,512 0=512 1=3 4=1 5=1 6=1179648 9=1
Convolution              conv4_2                  1 1 conv4_1_relu4_1 conv4_2_relu4_2 -23330=4,3,28,28,512 0=512 1=3 4=1 5=1 6=2359296 9=1
Convolution              conv4_3                  1 1 conv4_2_relu4_2 conv4_3_relu4_3 -23330=4,3,28,28,512 0=512 1=3 4=1 5=1 6=2359296 9=1
Pooling                  pool4                    1 1 conv4_3_relu4_3 pool4 -23330=4,3,14,14,512 1=2 2=2
Convolution              conv5_1                  1 1 pool4 conv5_1_relu5_1 -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=2359296 9=1
Convolution              conv5_2                  1 1 conv5_1_relu5_1 conv5_2_relu5_2 -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=2359296 9=1
Convolution              conv5_3                  1 1 conv5_2_relu5_2 conv5_3_relu5_3 -23330=4,3,14,14,512 0=512 1=3 4=1 5=1 6=2359296 9=1
Pooling                  pool5                    1 1 conv5_3_relu5_3 pool5 -23330=4,3,7,7,512 1=2 2=2
InnerProduct             fc6                      1 1 pool5 fc6_drop6 -23330=4,1,4096,1,1 0=4096 1=1 2=102760448 9=1
InnerProduct             fc7                      1 1 fc6_drop6 fc7_drop7 -23330=4,1,4096,1,1 0=4096 1=1 2=16777216 9=1
InnerProduct             fc8                      1 1 fc7_drop7 fc8 -23330=4,1,1000,1,1 0=1000 1=1 2=4096000
Softmax                  prob                     1 1 fc8 output -23330=4,1,1000,1,1


================================================
FILE: benchmark/vgg16_int8.param
================================================
7767517
23 23
Input                    data                     0 1 data 0=224 1=224 2=3
Convolution              conv1_1                  1 1 data conv1_1_relu1_1 0=64 1=3 4=1 5=1 6=1728 8=102 9=1
Convolution              conv1_2                  1 1 conv1_1_relu1_1 conv1_2_relu1_2 0=64 1=3 4=1 5=1 6=36864 8=2 9=1
Pooling                  pool1                    1 1 conv1_2_relu1_2 pool1 1=2 2=2
Convolution              conv2_1                  1 1 pool1 conv2_1_relu2_1 0=128 1=3 4=1 5=1 6=73728 8=102 9=1
Convolution              conv2_2                  1 1 conv2_1_relu2_1 conv2_2_relu2_2 0=128 1=3 4=1 5=1 6=147456 8=2 9=1
Pooling                  pool2                    1 1 conv2_2_relu2_2 pool2 1=2 2=2
Convolution              conv3_1                  1 1 pool2 conv3_1_relu3_1 0=256 1=3 4=1 5=1 6=294912 8=102 9=1
Convolution              conv3_2                  1 1 conv3_1_relu3_1 conv3_2_relu3_2 0=256 1=3 4=1 5=1 6=589824 8=102 9=1
Convolution              conv3_3                  1 1 conv3_2_relu3_2 conv3_3_relu3_3 0=256 1=3 4=1 5=1 6=589824 8=2 9=1
Pooling                  pool3                    1 1 conv3_3_relu3_3 pool3 1=2 2=2
Convolution              conv4_1                  1 1 pool3 conv4_1_relu4_1 0=512 1=3 4=1 5=1 6=1179648 8=102 9=1
Convolution              conv4_2                  1 1 conv4_1_relu4_1 conv4_2_relu4_2 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
Convolution              conv4_3                  1 1 conv4_2_relu4_2 conv4_3_relu4_3 0=512 1=3 4=1 5=1 6=2359296 8=2 9=1
Pooling                  pool4                    1 1 conv4_3_relu4_3 pool4 1=2 2=2
Convolution              conv5_1                  1 1 pool4 conv5_1_relu5_1 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
Convolution              conv5_2                  1 1 conv5_1_relu5_1 conv5_2_relu5_2 0=512 1=3 4=1 5=1 6=2359296 8=102 9=1
Convolution              conv5_3                  1 1 conv5_2_relu5_2 conv5_3_relu5_3 0=512 1=3 4=1 5=1 6=2359296 8=2 9=1
Pooling                  pool5                    1 1 conv5_3_relu5_3 pool5 1=2 2=2
InnerProduct             fc6                      1 1 pool5 fc6_drop6 0=4096 1=1 2=102760448 8=2 9=1
InnerProduct             fc7                      1 1 fc6_drop6 fc7_drop7 0=4096 1=1 2=16777216 8=2 9=1
InnerProduct             fc8                      1 1 fc7_drop7 fc8 0=1000 1=1 2=4096000 8=2
Softmax                  prob                     1 1 fc8 output


================================================
FILE: benchmark/vision_transformer.param
================================================
7767517
144 192
Input            input                    0 1 input
MemoryData       backbone.cls_token       0 1 backbone.cls_token 0=768 1=1
MemoryData       backbone.pos_embed       0 1 backbone.pos_embed 0=768 1=145
Convolution      Conv_0                   1 1 input onnx::Shape_153 0=768 1=32 11=32 2=1 12=1 3=32 13=32 4=0 14=0 15=0 16=0 5=1 6=2359296
Reshape          Reshape_8                1 1 onnx::Shape_153 onnx::Transpose_161 0=-1 1=768
Permute          Transpose_9              1 1 onnx::Transpose_161 onnx::Concat_162 0=1
Concat           Concat_10                2 1 backbone.cls_token onnx::Concat_162 onnx::Add_163 0=0
BinaryOp         Add_11                   2 1 onnx::Add_163 backbone.pos_embed input.1 0=0
Split            splitncnn_0              1 2 input.1 input.1_splitncnn_0 input.1_splitncnn_1
LayerNorm        LayerNorm_12             1 1 input.1_splitncnn_1 qkv_input 0=768 1=1.000000e-06 2=1
Split            splitncnn_1              1 3 qkv_input qkv_input_splitncnn_0 qkv_input_splitncnn_1 qkv_input_splitncnn_2
MultiHeadAttention MultiHeadAttention_21    3 1 qkv_input_splitncnn_2 qkv_input_splitncnn_1 qkv_input_splitncnn_0 onnx::Add_174 0=768 1=12 2=589824
BinaryOp         Add_22                   2 1 input.1_splitncnn_0 onnx::Add_174 input.4 0=0
Split            splitncnn_2              1 2 input.4 input.4_splitncnn_0 input.4_splitncnn_1
LayerNorm        LayerNorm_23             1 1 input.4_splitncnn_1 mmdeploy::Gemm_176 0=768 1=1.000000e-06 2=1
InnerProduct     Gemm_24                  1 1 mmdeploy::Gemm_176 mmdeploy::Gelu_177 0=3072 1=1 2=2359296
GELU             Gelu_25                  1 1 mmdeploy::Gelu_177 input.8 0=1
InnerProduct     Gemm_26                  1 1 input.8 input.12 0=768 1=1 2=2359296
BinaryOp         Add_27                   2 1 input.4_splitncnn_0 input.12 input.16 0=0
Split            splitncnn_3              1 2 input.16 input.16_splitncnn_0 input.16_splitncnn_1
LayerNorm        LayerNorm_28             1 1 input.16_splitncnn_1 qkv_input.3 0=768 1=1.000000e-06 2=1
Split            splitncnn_4              1 3 qkv_input.3 qkv_input.3_splitncnn_0 qkv_input.3_splitncnn_1 qkv_input.3_splitncnn_2
MultiHeadAttention MultiHeadAttention_37    3 1 qkv_input.3_splitncnn_2 qkv_input.3_splitncnn_1 qkv_input.3_splitncnn_0 onnx::Add_190 0=768 1=12 2=589824
BinaryOp         Add_38                   2 1 input.16_splitncnn_0 onnx::Add_190 input.20 0=0
Split            splitncnn_5              1 2 input.20 input.20_splitncnn_0 input.20_splitncnn_1
LayerNorm        LayerNorm_39             1 1 input.20_splitncnn_1 mmdeploy::Gemm_192 0=768 1=1.000000e-06 2=1
InnerProduct     Gemm_40                  1 1 mmdeploy::Gemm_192 mmdeploy::Gelu_193 0=3072 1=1 2=2359296
GELU             Gelu_41                  1 1 mmdeploy::Gelu_193 input.24 0=1
InnerProduct     Gemm_42                  1 1 input.24 input.28 0=768 1=1 2=2359296
BinaryOp         Add_43                   2 1 input.20_splitncnn_0 input.28 input.32 0=0
Split            splitncnn_6              1 2 input.32 input.32_splitncnn_0 input.32_splitncnn_1
LayerNorm        LayerNorm_44             1 1 input.32_splitncnn_1 qkv_input.7 0=768 1=1.000000e-06 2=1
Split            splitncnn_7              1 3 qkv_input.7 qkv_input.7_splitncnn_0 qkv_input.7_splitncnn_1 qkv_input.7_splitncnn_2
MultiHeadAttention MultiHeadAttention_53    3 1 qkv_input.7_splitncnn_2 qkv_input.7_splitncnn_1 qkv_input.7_splitncnn_0 onnx::Add_206 0=768 1=12 2=589824
BinaryOp         Add_54                   2 1 input.32_splitncnn_0 onnx::Add_206 input.36 0=0
Split            splitncnn_8              1 2 input.36 input.36_splitncnn_0 input.36_splitncnn_1
LayerNorm        LayerNorm_55             1 1 input.36_splitncnn_1 mmdeploy::Gemm_208 0=768 1=1.000000e-06 2=1
InnerProduct     Gemm_56                  1 1 mmdeploy::Gemm_208 mmdeploy::Gelu_209 0=3072 1=1 2=2359296
GELU             Gelu_57                  1 1 mmdeploy::Gelu_209 input.40 0=1
InnerProduct     Gemm_58                  1 1 input.40 input.44 0=768 1=1 2=2359296
BinaryOp         Add_59                   2 1 input.36_splitncnn_0 input.44 input.48 0=0
Split            splitncnn_9              1 2 input.48 input.48_splitncnn_0 input.48_splitncnn_1
LayerNorm        LayerNorm_60             1 1 input.48_splitncnn_1 qkv_input.11 0=768 1=1.000000e-06 2=1
Split            splitncnn_10             1 3 qkv_input.11 qkv_input.11_splitncnn_0 qkv_input.11_splitncnn_1 qkv_input.11_splitncnn_2
MultiHeadAttention MultiHeadAttention_69    3 1 qkv_input.11_splitncnn_2 qkv_input.11_splitncnn_1 qkv_input.11_splitncnn_0 onnx::Add_222 0=768 1=12 2=589824
BinaryOp         Add_70                   2 1 input.48_splitncnn_0 onnx::Add_222 input.52 0=0
Split            splitncnn_11             1 2 input.52 input.52_splitncnn_0 input.52_splitncnn_1
LayerNorm        LayerNorm_71             1 1 input.52_splitncnn_1 mmdeploy::Gemm_224 0=768 1=1.000000e-06 2=1
InnerProduct     Gemm_72                  1 1 mmdeploy::Gemm_224 mmdeploy::Gelu_225 0=3072 1=1 2=2359296
GELU             Gelu_73                  1 1 mmdeploy::Gelu_225 input.56 0=1
InnerProduct     Gemm_74                  1 1 input.56 input.60 0=768 1=1 2=2359296
BinaryOp         Add_75                   2 1 input.52_splitncnn_0 input.60 input.64 0=0
Split            splitncnn_12             1 2 input.64 input.64_splitncnn_0 input.64_splitncnn_1
LayerNorm        LayerNorm_76             1 1 input.64_splitncnn_1 qkv_input.15 0=768 1=1.000000e-06 2=1
Split            splitncnn_13             1 3 qkv_input.15 qkv_input.15_splitncnn_0 qkv_input.15_splitncnn_1 qkv_input.15_splitncnn_2
MultiHeadAttention MultiHeadAttention_85    3 1 qkv_input.15_splitncnn_2 qkv_input.15_splitncnn_1 qkv_input.15_splitncnn_0 onnx::Add_238 0=768 1=12 2=589824
BinaryOp         Add_86                   2 1 input.64_splitncnn_0 onnx::Add_238 input.68 0=0
Split            splitncnn_14             1 2 input.68 input.68_splitncnn_0 input.68_splitncnn_1
LayerNorm        LayerNorm_87             1 1 input.68_splitncnn_1 mmdeploy::Gemm_240 0=768 1=1.000000e-06 2=1
InnerProduct     Gemm_88                  1 1 mmdeploy::Gemm_240 mmdeploy::Gelu_241 0=3072 1=1 2=2359296
GELU             Gelu_89                  1 1 mmdeploy::Gelu_241 input.72 0=1
InnerProduct     Gemm_90                  1 1 input.72 input.76 0=768 1=1 2=2359296
BinaryOp         Add_91                   2 1 input.68_splitncnn_0 input.76 input.80 0=0
Split            splitncnn_15             1 2 input.80 input.80_splitncnn_0 input.80_splitncnn_1
LayerNorm        LayerNorm_92             1 1 input.80_splitncnn_1 qkv_input.19 0=768 1=1.000000e-06 2=1
Split            splitncnn_16             1 3 qkv_input.19 qkv_input.19_splitncnn_0 qkv_input.19_splitncnn_1 qkv_input.19_splitncnn_2
MultiHeadAttention MultiHeadAttention_101   3 1 qkv_input.19_splitncnn_2 qkv_input.19_splitncnn_1 qkv_input.19_splitncnn_0 onnx::Add_254 0=768 1=12 2=589824
BinaryOp         Add_102                  2 1 input.80_splitncnn_0 onnx::Add_254 input.84 0=0
Split            splitncnn_17             1 2 input.84 input.84_splitncnn_0 input.84_splitncnn_1
LayerNorm        LayerNorm_103            1 1 input.84_splitncnn_1 mmdeploy::Gemm_256 0=768 1=1.000000e-06 2=1
InnerProduct     Gemm_104                 1 1 mmdeploy::Gemm_256 mmdeploy::Gelu_257 0=3072 1=1 2=2359296
GELU             Gelu_105                 1 1 mmdeploy::Gelu_257 input.88 0=1
InnerProduct     Gemm_106                 1 1 input.88 input.92 0=768 1=1 2=2359296
BinaryOp         Add_107                  2 1 input.84_splitncnn_0 input.92 input.96 0=0
Split            splitncnn_18             1 2 input.96 input.96_splitncnn_0 input.96_splitncnn_1
LayerNorm        LayerNorm_108            1 1 input.96_splitncnn_1 qkv_input.23 0=768 1=1.000000e-06 2=1
Split            splitncnn_19             1 3 qkv_input.23 qkv_input.23_splitncnn_0 qkv_input.23_splitncnn_1 qkv_input.23_splitncnn_2
MultiHeadAttention MultiHeadAttention_117   3 1 qkv_input.23_splitncnn_2 qkv_input.23_splitncnn_1 qkv_input.23_splitncnn_0 onnx::Add_270 0=768 1=12 2=589824
BinaryOp         Add_118                  2 1 input.96_splitncnn_0 onnx::Add_270 input.100 0=0
Split            splitncnn_20             1 2 input.100 input.100_splitncnn_0 input.100_splitncnn_1
LayerNorm        LayerNorm_119            1 1 input.100_splitncnn_1 mmdeploy::Gemm_272 0=768 1=1.000000e-06 2=1
InnerProduct     Gemm_120                 1 1 mmdeploy::Gemm_272 mmdeploy::Gelu_273 0=3072 1=1 2=2359296
GELU             Gelu_121                 1 1 mmdeploy::Gelu_273 input.104 0=1
InnerProduct     Gemm_122                 1 1 input.104 input.108 0=768 1=1 2=2359296
BinaryOp         Add_123                  2 1 input.100_splitncnn_0 input.108 input.112 0=0
Split            splitncnn_21             1 2 input.112 input.112_splitncnn_0 input.112_splitncnn_1
LayerNorm        LayerNorm_124            1 1 input.112_splitncnn_1 qkv_input.27 0=768 1=1.000000e-06 2=1
Split            splitncnn_22             1 3 qkv_input.27 qkv_input.27_splitncnn_0 qkv_input.27_splitncnn_1 qkv_input.27_splitncnn_2
MultiHeadAttention MultiHeadAttention_133   3 1 qkv_input.27_splitncnn_2 qkv_input.27_splitncnn_1 qkv_input.27_splitncnn_0 onnx::Add_286 0=768 1=12 2=589824
BinaryOp         Add_134                  2 1 input.112_splitncnn_0 onnx::Add_286 input.116 0=0
Split            splitncnn_23             1 2 input.116 input.116_splitncnn_0 input.116_splitncnn_1
LayerNorm        LayerNorm_135            1 1 input.116_splitncnn_1 mmdeploy::Gemm_288 0=768 1=1.000000e-06 2=1
InnerProduct     Gemm_136                 1 1 mmdeploy::Gemm_288 mmdeploy::Gelu_289 0=3072 1=1 2=2359296
GELU             Gelu_137                 1 1 mmdeploy::Gelu_289 input.120 0=1
InnerProduct     Gemm_138                 1 1 input.120 input.124 0=768 1=1 2=2359296
BinaryOp         Add_139                  2 1 input.116_splitncnn_0 input.124 input.128 0=0
Split            splitncnn_24             1 2 input.128 input.128_splitncnn_0 input.128_splitncnn_1
LayerNorm        LayerNorm_140            1 1 input.128_splitncnn_1 qkv_input.31 0=768 1=1.000000e-06 2=1
Split            splitncnn_25             1 3 qkv_input.31 qkv_input.31_splitncnn_0 qkv_input.31_splitncnn_1 qkv_input.31_splitncnn_2
MultiHeadAttention MultiHeadAttention_149   3 1 qkv_input.31_splitncnn_2 qkv_input.31_splitncnn_1 qkv_input.31_splitncnn_0 onnx::Add_302 0=768 1=12 2=589824
BinaryOp         Add_150                  2 1 input.128_splitncnn_0 onnx::Add_302 input.132 0=0
Split            splitncnn_26             1 2 input.132 input.132_splitncnn_0 input.132_splitncnn_1
LayerNorm        LayerNorm_151            1 1 input.132_splitncnn_1 mmdeploy::Gemm_304 0=768 1=1.000000e-06 2=1
InnerProduct     Gemm_152                 1 1 mmdeploy::Gemm_304 mmdeploy::Gelu_305 0=3072 1=1 2=2359296
GELU             Gelu_153                 1 1 mmdeploy::Gelu_305 input.136 0=1
InnerProduct     Gemm_154                 1 1 input.136 input.140 0=768 1=1 2=2359296
BinaryOp         Add_155                  2 1 input.132_splitncnn_0 input.140 input.144 0=0
Split            splitncnn_27             1 2 input.144 input.144_splitncnn_0 input.144_splitncnn_1
LayerNorm        LayerNorm_156            1 1 input.144_splitncnn_1 qkv_input.35 0=768 1=1.000000e-06 2=1
Split            splitncnn_28             1 3 qkv_input.35 qkv_input.35_splitncnn_0 qkv_input.35_splitncnn_1 qkv_input.35_splitncnn_2
MultiHeadAttention MultiHeadAttention_165   3 1 qkv_input.35_splitncnn_2 qkv_input.35_splitncnn_1 qkv_input.35_splitncnn_0 onnx::Add_318 0=768 1=12 2=589824
BinaryOp         Add_166                  2 1 input.144_splitncnn_0 onnx::Add_318 input.148 0=0
Split            splitncnn_29             1 2 input.148 input.148_splitncnn_0 input.148_splitncnn_1
LayerNorm        LayerNorm_167            1 1 input.148_splitncnn_1 mmdeploy::Gemm_320 0=768 1=1.000000e-06 2=1
InnerProduct     Gemm_168                 1 1 mmdeploy::Gemm_320 mmdeploy::Gelu_321 0=3072 1=1 2=2359296
GELU             Gelu_169                 1 1 mmdeploy::Gelu_321 input.152 0=1
InnerProduct     Gemm_170                 1 1 input.152 input.156 0=768 1=1 2=2359296
BinaryOp         Add_171                  2 1 input.148_splitncnn_0 input.156 input.160 0=0
Split            splitncnn_30             1 2 input.160 input.160_splitncnn_0 input.160_splitncnn_1
LayerNorm        LayerNorm_172            1 1 input.160_splitncnn_1 qkv_input.39 0=768 1=1.000000e-06 2=1
Split            splitncnn_31             1 3 qkv_input.39 qkv_input.39_splitncnn_0 qkv_input.39_splitncnn_1 qkv_input.39_splitncnn_2
MultiHeadAttention MultiHeadAttention_181   3 1 qkv_input.39_splitncnn_2 qkv_input.39_splitncnn_1 qkv_input.39_splitncnn_0 onnx::Add_334 0=768 1=12 2=589824
BinaryOp         Add_182                  2 1 input.160_splitncnn_0 onnx::Add_334 input.164 0=0
Split            splitncnn_32             1 2 input.164 input.164_splitncnn_0 input.164_splitncnn_1
LayerNorm        LayerNorm_183            1 1 input.164_splitncnn_1 mmdeploy::Gemm_336 0=768 1=1.000000e-06 2=1
InnerProduct     Gemm_184                 1 1 mmdeploy::Gemm_336 mmdeploy::Gelu_337 0=3072 1=1 2=2359296
GELU             Gelu_185                 1 1 mmdeploy::Gelu_337 input.168 0=1
InnerProduct     Gemm_186                 1 1 input.168 input.172 0=768 1=1 2=2359296
BinaryOp         Add_187                  2 1 input.164_splitncnn_0 input.172 input.176 0=0
Split            splitncnn_33             1 2 input.176 input.176_splitncnn_0 input.176_splitncnn_1
LayerNorm        LayerNorm_188            1 1 input.176_splitncnn_1 qkv_input.43 0=768 1=1.000000e-06 2=1
Split            splitncnn_34             1 3 qkv_input.43 qkv_input.43_splitncnn_0 qkv_input.43_splitncnn_1 qkv_input.43_splitncnn_2
MultiHeadAttention MultiHeadAttention_197   3 1 qkv_input.43_splitncnn_2 qkv_input.43_splitncnn_1 qkv_input.43_splitncnn_0 onnx::Add_350 0=768 1=12 2=589824
BinaryOp         Add_198                  2 1 input.176_splitncnn_0 onnx::Add_350 input.180 0=0
Split            splitncnn_35             1 2 input.180 input.180_splitncnn_0 input.180_splitncnn_1
LayerNorm        LayerNorm_199            1 1 input.180_splitncnn_1 mmdeploy::Gemm_352 0=768 1=1.000000e-06 2=1
InnerProduct     Gemm_200                 1 1 mmdeploy::Gemm_352 mmdeploy::Gelu_353 0=3072 1=1 2=2359296
GELU             Gelu_201                 1 1 mmdeploy::Gelu_353 input.184 0=1
InnerProduct     Gemm_202                 1 1 input.184 input.188 0=768 1=1 2=2359296
BinaryOp         Add_203                  2 1 input.180_splitncnn_0 input.188 input.192 0=0
LayerNorm        LayerNorm_204            1 1 input.192 onnx::Gather_357 0=768 1=1.000000e-06 2=1
Crop             Gather_206               1 1 onnx::Gather_357 mmdeploy::Gemm_359 -23309=1,0 -23310=1,1 -23311=1,0
InnerProduct     Gemm_207                 1 1 mmdeploy::Gemm_359 cls_score 0=1000 1=1 2=768000
Softmax          Softmax_208              1 1 cls_score output 0=0 1=1


================================================
FILE: benchmark/yolo-fastest-1.1.param
================================================
7767517
131 154
Input                    data                     0 1 data -23330=4,3,320,320,3 0=320 1=320 2=3
Convolution              0_22                     1 1 data 0_22_bn_leaky -23330=4,3,160,160,8 0=8 1=3 3=2 4=1 5=1 6=216 9=2 -23310=1,1.000000e-01
Convolution              1_31                     1 1 0_22_bn_leaky 1_31_bn_leaky -23330=4,3,160,160,8 0=8 1=1 5=1 6=64 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     2_39                     1 1 1_31_bn_leaky 2_39_bn_leaky -23330=4,3,160,160,8 0=8 1=3 4=1 5=1 6=72 7=8 9=2 -23310=1,1.000000e-01
Convolution              3_48                     1 1 2_39_bn_leaky 3_48_bn -23330=4,3,160,160,4 0=4 1=1 5=1 6=32
Split                    3_48_bn_split            1 2 3_48_bn 3_48_bn_split_0 3_48_bn_split_1 -23330=8,3,160,160,4,3,160,160,4
Convolution              4_57                     1 1 3_48_bn_split_0 4_57_bn_leaky -23330=4,3,160,160,8 0=8 1=1 5=1 6=32 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     5_65                     1 1 4_57_bn_leaky 5_65_bn_leaky -23330=4,3,160,160,8 0=8 1=3 4=1 5=1 6=72 7=8 9=2 -23310=1,1.000000e-01
Convolution              6_74                     1 1 5_65_bn_leaky 6_74_bn -23330=4,3,160,160,4 0=4 1=1 5=1 6=32
Eltwise                  8_86                     2 1 6_74_bn 3_48_bn_split_1 8_86 -23330=4,3,160,160,4 0=1
Convolution              9_90                     1 1 8_86 9_90_bn_leaky -23330=4,3,160,160,24 0=24 1=1 5=1 6=96 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     10_98                    1 1 9_90_bn_leaky 10_98_bn_leaky -23330=4,3,80,80,24 0=24 1=3 3=2 4=1 5=1 6=216 7=24 9=2 -23310=1,1.000000e-01
Convolution              11_107                   1 1 10_98_bn_leaky 11_107_bn -23330=4,3,80,80,8 0=8 1=1 5=1 6=192
Split                    11_107_bn_split          1 2 11_107_bn 11_107_bn_split_0 11_107_bn_split_1 -23330=8,3,80,80,8,3,80,80,8
Convolution              12_116                   1 1 11_107_bn_split_0 12_116_bn_leaky -23330=4,3,80,80,32 0=32 1=1 5=1 6=256 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     13_124                   1 1 12_116_bn_leaky 13_124_bn_leaky -23330=4,3,80,80,32 0=32 1=3 4=1 5=1 6=288 7=32 9=2 -23310=1,1.000000e-01
Convolution              14_133                   1 1 13_124_bn_leaky 14_133_bn -23330=4,3,80,80,8 0=8 1=1 5=1 6=256
Eltwise                  16_145                   2 1 14_133_bn 11_107_bn_split_1 16_145 -23330=4,3,80,80,8 0=1
Split                    16_145_split             1 2 16_145 16_145_split_0 16_145_split_1 -23330=8,3,80,80,8,3,80,80,8
Convolution              17_149                   1 1 16_145_split_0 17_149_bn_leaky -23330=4,3,80,80,32 0=32 1=1 5=1 6=256 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     18_157                   1 1 17_149_bn_leaky 18_157_bn_leaky -23330=4,3,80,80,32 0=32 1=3 4=1 5=1 6=288 7=32 9=2 -23310=1,1.000000e-01
Convolution              19_166                   1 1 18_157_bn_leaky 19_166_bn -23330=4,3,80,80,8 0=8 1=1 5=1 6=256
Eltwise                  21_179                   2 1 19_166_bn 16_145_split_1 21_179 -23330=4,3,80,80,8 0=1
Convolution              22_183                   1 1 21_179 22_183_bn_leaky -23330=4,3,80,80,32 0=32 1=1 5=1 6=256 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     23_191                   1 1 22_183_bn_leaky 23_191_bn_leaky -23330=4,3,40,40,32 0=32 1=3 3=2 4=1 5=1 6=288 7=32 9=2 -23310=1,1.000000e-01
Convolution              24_200                   1 1 23_191_bn_leaky 24_200_bn -23330=4,3,40,40,8 0=8 1=1 5=1 6=256
Split                    24_200_bn_split          1 2 24_200_bn 24_200_bn_split_0 24_200_bn_split_1 -23330=8,3,40,40,8,3,40,40,8
Convolution              25_209                   1 1 24_200_bn_split_0 25_209_bn_leaky -23330=4,3,40,40,48 0=48 1=1 5=1 6=384 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     26_217                   1 1 25_209_bn_leaky 26_217_bn_leaky -23330=4,3,40,40,48 0=48 1=3 4=1 5=1 6=432 7=48 9=2 -23310=1,1.000000e-01
Convolution              27_226                   1 1 26_217_bn_leaky 27_226_bn -23330=4,3,40,40,8 0=8 1=1 5=1 6=384
Eltwise                  29_238                   2 1 27_226_bn 24_200_bn_split_1 29_238 -23330=4,3,40,40,8 0=1
Split                    29_238_split             1 2 29_238 29_238_split_0 29_238_split_1 -23330=8,3,40,40,8,3,40,40,8
Convolution              30_242                   1 1 29_238_split_0 30_242_bn_leaky -23330=4,3,40,40,48 0=48 1=1 5=1 6=384 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     31_250                   1 1 30_242_bn_leaky 31_250_bn_leaky -23330=4,3,40,40,48 0=48 1=3 4=1 5=1 6=432 7=48 9=2 -23310=1,1.000000e-01
Convolution              32_259                   1 1 31_250_bn_leaky 32_259_bn -23330=4,3,40,40,8 0=8 1=1 5=1 6=384
Eltwise                  34_273                   2 1 32_259_bn 29_238_split_1 34_273 -23330=4,3,40,40,8 0=1
Convolution              35_277                   1 1 34_273 35_277_bn_leaky -23330=4,3,40,40,48 0=48 1=1 5=1 6=384 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     36_285                   1 1 35_277_bn_leaky 36_285_bn_leaky -23330=4,3,40,40,48 0=48 1=3 4=1 5=1 6=432 7=48 9=2 -23310=1,1.000000e-01
Convolution              37_294                   1 1 36_285_bn_leaky 37_294_bn -23330=4,3,40,40,16 0=16 1=1 5=1 6=768
Split                    37_294_bn_split          1 2 37_294_bn 37_294_bn_split_0 37_294_bn_split_1 -23330=8,3,40,40,16,3,40,40,16
Convolution              38_303                   1 1 37_294_bn_split_0 38_303_bn_leaky -23330=4,3,40,40,96 0=96 1=1 5=1 6=1536 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     39_311                   1 1 38_303_bn_leaky 39_311_bn_leaky -23330=4,3,40,40,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01
Convolution              40_320                   1 1 39_311_bn_leaky 40_320_bn -23330=4,3,40,40,16 0=16 1=1 5=1 6=1536
Eltwise                  42_332                   2 1 40_320_bn 37_294_bn_split_1 42_332 -23330=4,3,40,40,16 0=1
Split                    42_332_split             1 2 42_332 42_332_split_0 42_332_split_1 -23330=8,3,40,40,16,3,40,40,16
Convolution              43_336                   1 1 42_332_split_0 43_336_bn_leaky -23330=4,3,40,40,96 0=96 1=1 5=1 6=1536 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     44_344                   1 1 43_336_bn_leaky 44_344_bn_leaky -23330=4,3,40,40,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01
Convolution              45_353                   1 1 44_344_bn_leaky 45_353_bn -23330=4,3,40,40,16 0=16 1=1 5=1 6=1536
Eltwise                  47_365                   2 1 45_353_bn 42_332_split_1 47_365 -23330=4,3,40,40,16 0=1
Split                    47_365_split             1 2 47_365 47_365_split_0 47_365_split_1 -23330=8,3,40,40,16,3,40,40,16
Convolution              48_369                   1 1 47_365_split_0 48_369_bn_leaky -23330=4,3,40,40,96 0=96 1=1 5=1 6=1536 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     49_377                   1 1 48_369_bn_leaky 49_377_bn_leaky -23330=4,3,40,40,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01
Convolution              50_386                   1 1 49_377_bn_leaky 50_386_bn -23330=4,3,40,40,16 0=16 1=1 5=1 6=1536
Eltwise                  52_399                   2 1 50_386_bn 47_365_split_1 52_399 -23330=4,3,40,40,16 0=1
Split                    52_399_split             1 2 52_399 52_399_split_0 52_399_split_1 -23330=8,3,40,40,16,3,40,40,16
Convolution              53_403                   1 1 52_399_split_0 53_403_bn_leaky -23330=4,3,40,40,96 0=96 1=1 5=1 6=1536 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     54_411                   1 1 53_403_bn_leaky 54_411_bn_leaky -23330=4,3,40,40,96 0=96 1=3 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01
Convolution              55_420                   1 1 54_411_bn_leaky 55_420_bn -23330=4,3,40,40,16 0=16 1=1 5=1 6=1536
Eltwise                  57_433                   2 1 55_420_bn 52_399_split_1 57_433 -23330=4,3,40,40,16 0=1
Convolution              58_437                   1 1 57_433 58_437_bn_leaky -23330=4,3,40,40,96 0=96 1=1 5=1 6=1536 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     59_445                   1 1 58_437_bn_leaky 59_445_bn_leaky -23330=4,3,20,20,96 0=96 1=3 3=2 4=1 5=1 6=864 7=96 9=2 -23310=1,1.000000e-01
Convolution              60_454                   1 1 59_445_bn_leaky 60_454_bn -23330=4,3,20,20,24 0=24 1=1 5=1 6=2304
Split                    60_454_bn_split          1 2 60_454_bn 60_454_bn_split_0 60_454_bn_split_1 -23330=8,3,20,20,24,3,20,20,24
Convolution              61_463                   1 1 60_454_bn_split_0 61_463_bn_leaky -23330=4,3,20,20,136 0=136 1=1 5=1 6=3264 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     62_471                   1 1 61_463_bn_leaky 62_471_bn_leaky -23330=4,3,20,20,136 0=136 1=3 4=1 5=1 6=1224 7=136 9=2 -23310=1,1.000000e-01
Convolution              63_480                   1 1 62_471_bn_leaky 63_480_bn -23330=4,3,20,20,24 0=24 1=1 5=1 6=3264
Eltwise                  65_492                   2 1 63_480_bn 60_454_bn_split_1 65_492 -23330=4,3,20,20,24 0=1
Split                    65_492_split             1 2 65_492 65_492_split_0 65_492_split_1 -23330=8,3,20,20,24,3,20,20,24
Convolution              66_496                   1 1 65_492_split_0 66_496_bn_leaky -23330=4,3,20,20,136 0=136 1=1 5=1 6=3264 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     67_504                   1 1 66_496_bn_leaky 67_504_bn_leaky -23330=4,3,20,20,136 0=136 1=3 4=1 5=1 6=1224 7=136 9=2 -23310=1,1.000000e-01
Convolution              68_513                   1 1 67_504_bn_leaky 68_513_bn -23330=4,3,20,20,24 0=24 1=1 5=1 6=3264
Eltwise                  70_526                   2 1 68_513_bn 65_492_split_1 70_526 -23330=4,3,20,20,24 0=1
Split                    70_526_split             1 2 70_526 70_526_split_0 70_526_split_1 -23330=8,3,20,20,24,3,20,20,24
Convolution              71_530                   1 1 70_526_split_0 71_530_bn_leaky -23330=4,3,20,20,136 0=136 1=1 5=1 6=3264 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     72_538                   1 1 71_530_bn_leaky 72_538_bn_leaky -23330=4,3,20,20,136 0=136 1=3 4=1 5=1 6=1224 7=136 9=2 -23310=1,1.000000e-01
Convolution              73_547                   1 1 72_538_bn_leaky 73_547_bn -23330=4,3,20,20,24 0=24 1=1 5=1 6=3264
Eltwise                  75_559                   2 1 73_547_bn 70_526_split_1 75_559 -23330=4,3,20,20,24 0=1
Split                    75_559_split             1 2 75_559 75_559_split_0 75_559_split_1 -23330=8,3,20,20,24,3,20,20,24
Convolution              76_563                   1 1 75_559_split_0 76_563_bn_leaky -23330=4,3,20,20,136 0=136 1=1 5=1 6=3264 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     77_571                   1 1 76_563_bn_leaky 77_571_bn_leaky -23330=4,3,20,20,136 0=136 1=3 4=1 5=1 6=1224 7=136 9=2 -23310=1,1.000000e-01
Convolution              78_580                   1 1 77_571_bn_leaky 78_580_bn -23330=4,3,20,20,24 0=24 1=1 5=1 6=3264
Eltwise                  80_593                   2 1 78_580_bn 75_559_split_1 80_593 -23330=4,3,20,20,24 0=1
Split                    80_593_split             1 2 80_593 80_593_split_0 80_593_split_1 -23330=8,3,20,20,24,3,20,20,24
Convolution              81_597                   1 1 80_593_split_0 81_597_bn_leaky -23330=4,3,20,20,136 0=136 1=1 5=1 6=3264 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     82_605                   1 1 81_597_bn_leaky 82_605_bn_leaky -23330=4,3,10,10,136 0=136 1=3 3=2 4=1 5=1 6=1224 7=136 9=2 -23310=1,1.000000e-01
Convolution              83_615                   1 1 82_605_bn_leaky 83_615_bn -23330=4,3,10,10,48 0=48 1=1 5=1 6=6528
Split                    83_615_bn_split          1 2 83_615_bn 83_615_bn_split_0 83_615_bn_split_1 -23330=8,3,10,10,48,3,10,10,48
Convolution              84_624                   1 1 83_615_bn_split_0 84_624_bn_leaky -23330=4,3,10,10,224 0=224 1=1 5=1 6=10752 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     85_632                   1 1 84_624_bn_leaky 85_632_bn_leaky -23330=4,3,10,10,224 0=224 1=3 4=1 5=1 6=2016 7=224 9=2 -23310=1,1.000000e-01
Convolution              86_641                   1 1 85_632_bn_leaky 86_641_bn -23330=4,3,10,10,48 0=48 1=1 5=1 6=10752
Eltwise                  88_653                   2 1 86_641_bn 83_615_bn_split_1 88_653 -23330=4,3,10,10,48 0=1
Split                    88_653_split             1 2 88_653 88_653_split_0 88_653_split_1 -23330=8,3,10,10,48,3,10,10,48
Convolution              89_657                   1 1 88_653_split_0 89_657_bn_leaky -23330=4,3,10,10,224 0=224 1=1 5=1 6=10752 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     90_665                   1 1 89_657_bn_leaky 90_665_bn_leaky -23330=4,3,10,10,224 0=224 1=3 4=1 5=1 6=2016 7=224 9=2 -23310=1,1.000000e-01
Convolution              91_674                   1 1 90_665_bn_leaky 91_674_bn -23330=4,3,10,10,48 0=48 1=1 5=1 6=10752
Eltwise                  93_686                   2 1 91_674_bn 88_653_split_1 93_686 -23330=4,3,10,10,48 0=1
Split                    93_686_split             1 2 93_686 93_686_split_0 93_686_split_1 -23330=8,3,10,10,48,3,10,10,48
Convolution              94_690                   1 1 93_686_split_0 94_690_bn_leaky -23330=4,3,10,10,224 0=224 1=1 5=1 6=10752 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     95_698                   1 1 94_690_bn_leaky 95_698_bn_leaky -23330=4,3,10,10,224 0=224 1=3 4=1 5=1 6=2016 7=224 9=2 -23310=1,1.000000e-01
Convolution              96_707                   1 1 95_698_bn_leaky 96_707_bn -23330=4,3,10,10,48 0=48 1=1 5=1 6=10752
Eltwise                  98_719                   2 1 96_707_bn 93_686_split_1 98_719 -23330=4,3,10,10,48 0=1
Split                    98_719_split             1 2 98_719 98_719_split_0 98_719_split_1 -23330=8,3,10,10,48,3,10,10,48
Convolution              99_723                   1 1 98_719_split_0 99_723_bn_leaky -23330=4,3,10,10,224 0=224 1=1 5=1 6=10752 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     100_731                  1 1 99_723_bn_leaky 100_731_bn_leaky -23330=4,3,10,10,224 0=224 1=3 4=1 5=1 6=2016 7=224 9=2 -23310=1,1.000000e-01
Convolution              101_740                  1 1 100_731_bn_leaky 101_740_bn -23330=4,3,10,10,48 0=48 1=1 5=1 6=10752
Eltwise                  103_752                  2 1 101_740_bn 98_719_split_1 103_752 -23330=4,3,10,10,48 0=1
Split                    103_752_split            1 2 103_752 103_752_split_0 103_752_split_1 -23330=8,3,10,10,48,3,10,10,48
Convolution              104_756                  1 1 103_752_split_0 104_756_bn_leaky -23330=4,3,10,10,224 0=224 1=1 5=1 6=10752 9=2 -23310=1,1.000000e-01
ConvolutionDepthWise     105_764                  1 1 104_756_bn_leaky 105_764_bn_leaky -23330=4,3,10,10,224 0=224 1=3 4=1 5=1 6=2016 7=224 9=2 -23310=1,1.000000e-01
Convolution              106_773                  1 1 105_764_bn_leaky 106_773_bn -23330=4,3,10,10,48 0=48 1=1 5=1 6=10752
Eltwise                  108_784                  2 1 106_773_bn 103_752_split_1 108_784 -23330=4,3,10,10,48 0=1
Split                    108_784_split            1 4 108_784 108_784_split_0 108_784_split_1 108_784_split_2 108_784_split_3 -23330=16,3,10,10,48,3,10,10,48,3,10,10,48,3,10,10,48
Pooling                  109_788                  1 1 108_784_split_0 109_788 -23330=4,3,10,10,48 1=3 3=1 5=1
Pooling                  111_795                  1 1 108_784_split_1 111_795 -23330=4,3,10,10,48 1=5 3=2 5=1
Pooling                  113_802                  1 1 108_784_split_2 113_802 -23330=4,3,10,10,48 1=9 3=4 5=1
Concat                   114_806                  4 1 113_802 111_795 109_788 108_784_split_3 114_806 -23330=4,3,10,10,192
Convolution              115_811                  1 1 114_806 115_811_bn_leaky -23330=4,3,10,10,96 0=96 1=1 5=1 6=18432 9=2 -23310=1,1.000000e-01
Split                    115_811_bn_leaky_split   1 2 115_811_bn_leaky 115_811_bn_leaky_split_0 115_811_bn_leaky_split_1 -23330=8,3,10,10,96,3,10,10,96
ConvolutionDepthWise     116_819                  1 1 115_811_bn_leaky_split_0 116_819_bn_leaky -23330=4,3,10,10,96 0=96 1=5 4=2 5=1 6=2400 7=96 9=2 -23310=1,1.000000e-01
Convolution              117_828                  1 1 116_819_bn_leaky 117_828_bn -23330=4,3,10,10,96 0=96 1=1 5=1 6=9216
ConvolutionDepthWise     118_836                  1 1 117_828_bn 118_836_bn_leaky -23330=4,3,10,10,96 0=96 1=5 4=2 5=1 6=2400 7=96 9=2 -23310=1,1.000000e-01
Convolution              119_845                  1 1 118_836_bn_leaky 119_845_bn -23330=4,3,10,10,96 0=96 1=1 5=1 6=9216
Convolution              120_854                  1 1 119_845_bn 120_854 -23330=4,3,10,10,255 0=255 1=1 5=1 6=24480
Interp                   123_882                  1 1 115_811_bn_leaky_split_1 123_882 -23330=4,3,20,20,96 0=1 1=2.000000e+00 2=2.000000e+00
Concat                   124_885                  2 1 123_882 80_593_split_1 124_885 -23330=4,3,20,20,120
ConvolutionDepthWise     125_888                  1 1 124_885 125_888_bn_leaky -23330=4,3,20,20,120 0=120 1=5 4=2 5=1 6=3000 7=120 9=2 -23310=1,1.000000e-01
Convolution              126_897                  1 1 125_888_bn_leaky 126_897_bn -23330=4,3,20,20,120 0=120 1=1 5=1 6=14400
ConvolutionDepthWise     127_905                  1 1 126_897_bn 127_905_bn_leaky -23330=4,3,20,20,120 0=120 1=5 4=2 5=1 6=3000 7=120 9=2 -23310=1,1.000000e-01
Convolution              128_914                  1 1 127_905_bn_leaky 128_914_bn -23330=4,3,20,20,120 0=120 1=1 5=1 6=14400
Convolution              129_922                  1 1 128_914_bn 129_922 -23330=4,3,20,20,255 0=255 1=1 5=1 6=30600
Yolov3DetectionOutput    detection_out            2 1 120_854 129_922 output -23330=4,2,6,1431,1 0=80 1=3 2=5.500000e-01 -23304=12,1.200000e+01,1.800000e+01,3.700000e+01,4.900000e+01,5.200000e+01,1.320000e+02,1.150000e+02,7.300000e+01,1.190000e+02,1.990000e+02,2.420000e+02,2.380000e+02 -23305=6,1077936128,1082130432,1084227584,0,1065353216,1073741824 -23306=2,3.200000e+01,1.600000e+01


================================================
FILE: benchmark/yolo-fastestv2.param
================================================
7767517
144 166
Input                    input.1                  0 1 input.1 -23330=4,3,352,352,3 0=352 1=352 2=3
Convolution              Conv_0                   1 1 input.1 447 -23330=4,3,176,176,24 0=24 1=3 3=2 4=1 5=1 6=648 9=1
Pooling                  MaxPool_2                1 1 447 448 -23330=4,3,88,88,24 1=3 2=2 3=1 5=1
Split                    splitncnn_0              1 2 448 448_splitncnn_0 448_splitncnn_1 -23330=8,3,88,88,24,3,88,88,24
ConvolutionDepthWise     Conv_3                   1 1 448_splitncnn_1 800 -23330=4,3,44,44,24 0=24 1=3 3=2 4=1 5=1 6=216 7=24
Convolution              Conv_4                   1 1 800 453 -23330=4,3,44,44,24 0=24 1=1 5=1 6=576 9=1
Convolution              Conv_6                   1 1 448_splitncnn_0 456 -23330=4,3,88,88,24 0=24 1=1 5=1 6=576 9=1
ConvolutionDepthWise     Conv_8                   1 1 456 809 -23330=4,3,44,44,24 0=24 1=3 3=2 4=1 5=1 6=216 7=24
Convolution              Conv_9                   1 1 809 461 -23330=4,3,44,44,24 0=24 1=1 5=1 6=576 9=1
Concat                   Concat_11                2 1 453 461 462 -23330=4,3,44,44,48
ShuffleChannel           Reshape_16               1 1 462 467 -23330=4,3,44,44,48 0=2 1=1
Slice                    Gather_20                1 2 467 469 471 -23330=8,3,44,44,24,3,44,44,24 -23300=2,-233,-233
Convolution              Conv_21                  1 1 471 474 -23330=4,3,44,44,24 0=24 1=1 5=1 6=576 9=1
ConvolutionDepthWise     Conv_23                  1 1 474 818 -23330=4,3,44,44,24 0=24 1=3 4=1 5=1 6=216 7=24
Convolution              Conv_24                  1 1 818 479 -23330=4,3,44,44,24 0=24 1=1 5=1 6=576 9=1
Concat                   Concat_26                2 1 469 479 480 -23330=4,3,44,44,48
ShuffleChannel           Reshape_31               1 1 480 485 -23330=4,3,44,44,48 0=2 1=1
Slice                    Gather_35                1 2 485 487 489 -23330=8,3,44,44,24,3,44,44,24 -23300=2,-233,-233
Convolution              Conv_36                  1 1 489 492 -23330=4,3,44,44,24 0=24 1=1 5=1 6=576 9=1
ConvolutionDepthWise     Conv_38                  1 1 492 827 -23330=4,3,44,44,24 0=24 1=3 4=1 5=1 6=216 7=24
Convolution              Conv_39                  1 1 827 497 -23330=4,3,44,44,24 0=24 1=1 5=1 6=576 9=1
Concat                   Concat_41                2 1 487 497 498 -23330=4,3,44,44,48
ShuffleChannel           Reshape_46               1 1 498 503 -23330=4,3,44,44,48 0=2 1=1
Slice                    Gather_50                1 2 503 505 507 -23330=8,3,44,44,24,3,44,44,24 -23300=2,-233,-233
Convolution              Conv_51                  1 1 507 510 -23330=4,3,44,44,24 0=24 1=1 5=1 6=576 9=1
ConvolutionDepthWise     Conv_53                  1 1 510 836 -23330=4,3,44,44,24 0=24 1=3 4=1 5=1 6=216 7=24
Convolution              Conv_54                  1 1 836 515 -23330=4,3,44,44,24 0=24 1=1 5=1 6=576 9=1
Concat                   Concat_56                2 1 505 515 516 -23330=4,3,44,44,48
Split                    splitncnn_1              1 2 516 516_splitncnn_0 516_splitncnn_1 -23330=8,3,44,44,48,3,44,44,48
ConvolutionDepthWise     Conv_57                  1 1 516_splitncnn_1 842 -23330=4,3,22,22,48 0=48 1=3 3=2 4=1 5=1 6=432 7=48
Convolution              Conv_58                  1 1 842 521 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1
Convolution              Conv_60                  1 1 516_splitncnn_0 524 -23330=4,3,44,44,48 0=48 1=1 5=1 6=2304 9=1
ConvolutionDepthWise     Conv_62                  1 1 524 851 -23330=4,3,22,22,48 0=48 1=3 3=2 4=1 5=1 6=432 7=48
Convolution              Conv_63                  1 1 851 529 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1
Concat                   Concat_65                2 1 521 529 530 -23330=4,3,22,22,96
ShuffleChannel           Reshape_70               1 1 530 535 -23330=4,3,22,22,96 0=2 1=1
Slice                    Gather_74                1 2 535 537 539 -23330=8,3,22,22,48,3,22,22,48 -23300=2,-233,-233
Convolution              Conv_75                  1 1 539 542 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1
ConvolutionDepthWise     Conv_77                  1 1 542 860 -23330=4,3,22,22,48 0=48 1=3 4=1 5=1 6=432 7=48
Convolution              Conv_78                  1 1 860 547 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1
Concat                   Concat_80                2 1 537 547 548 -23330=4,3,22,22,96
ShuffleChannel           Reshape_85               1 1 548 553 -23330=4,3,22,22,96 0=2 1=1
Slice                    Gather_89                1 2 553 555 557 -23330=8,3,22,22,48,3,22,22,48 -23300=2,-233,-233
Convolution              Conv_90                  1 1 557 560 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1
ConvolutionDepthWise     Conv_92                  1 1 560 869 -23330=4,3,22,22,48 0=48 1=3 4=1 5=1 6=432 7=48
Convolution              Conv_93                  1 1 869 565 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1
Concat                   Concat_95                2 1 555 565 566 -23330=4,3,22,22,96
ShuffleChannel           Reshape_100              1 1 566 571 -23330=4,3,22,22,96 0=2 1=1
Slice                    Gather_104               1 2 571 573 575 -23330=8,3,22,22,48,3,22,22,48 -23300=2,-233,-233
Convolution              Conv_105                 1 1 575 578 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1
ConvolutionDepthWise     Conv_107                 1 1 578 878 -23330=4,3,22,22,48 0=48 1=3 4=1 5=1 6=432 7=48
Convolution              Conv_108                 1 1 878 583 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1
Concat                   Concat_110               2 1 573 583 584 -23330=4,3,22,22,96
ShuffleChannel           Reshape_115              1 1 584 589 -23330=4,3,22,22,96 0=2 1=1
Slice                    Gather_119               1 2 589 591 593 -23330=8,3,22,22,48,3,22,22,48 -23300=2,-233,-233
Convolution              Conv_120                 1 1 593 596 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1
ConvolutionDepthWise     Conv_122                 1 1 596 887 -23330=4,3,22,22,48 0=48 1=3 4=1 5=1 6=432 7=48
Convolution              Conv_123                 1 1 887 601 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1
Concat                   Concat_125               2 1 591 601 602 -23330=4,3,22,22,96
ShuffleChannel           Reshape_130              1 1 602 607 -23330=4,3,22,22,96 0=2 1=1
Slice                    Gather_134               1 2 607 609 611 -23330=8,3,22,22,48,3,22,22,48 -23300=2,-233,-233
Convolution              Conv_135                 1 1 611 614 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1
ConvolutionDepthWise     Conv_137                 1 1 614 896 -23330=4,3,22,22,48 0=48 1=3 4=1 5=1 6=432 7=48
Convolution              Conv_138                 1 1 896 619 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1
Concat                   Concat_140               2 1 609 619 620 -23330=4,3,22,22,96
ShuffleChannel           Reshape_145              1 1 620 625 -23330=4,3,22,22,96 0=2 1=1
Slice                    Gather_149               1 2 625 627 629 -23330=8,3,22,22,48,3,22,22,48 -23300=2,-233,-233
Convolution              Conv_150                 1 1 629 632 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1
ConvolutionDepthWise     Conv_152                 1 1 632 905 -23330=4,3,22,22,48 0=48 1=3 4=1 5=1 6=432 7=48
Convolution              Conv_153                 1 1 905 637 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1
Concat                   Concat_155               2 1 627 637 638 -23330=4,3,22,22,96
ShuffleChannel           Reshape_160              1 1 638 643 -23330=4,3,22,22,96 0=2 1=1
Slice                    Gather_164               1 2 643 645 647 -23330=8,3,22,22,48,3,22,22,48 -23300=2,-233,-233
Convolution              Conv_165                 1 1 647 650 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1
ConvolutionDepthWise     Conv_167                 1 1 650 914 -23330=4,3,22,22,48 0=48 1=3 4=1 5=1 6=432 7=48
Convolution              Conv_168                 1 1 914 655 -23330=4,3,22,22,48 0=48 1=1 5=1 6=2304 9=1
Concat                   Concat_170               2 1 645 655 656 -23330=4,3,22,22,96
Split                    splitncnn_2              1 3 656 656_splitncnn_0 656_splitncnn_1 656_splitncnn_2 -23330=12,3,22,22,96,3,22,22,96,3,22,22,96
ConvolutionDepthWise     Conv_171                 1 1 656_splitncnn_2 920 -23330=4,3,11,11,96 0=96 1=3 3=2 4=1 5=1 6=864 7=96
Convolution              Conv_172                 1 1 920 661 -23330=4,3,11,11,96 0=96 1=1 5=1 6=9216 9=1
Convolution              Conv_174                 1 1 656_splitncnn_1 664 -23330=4,3,22,22,96 0=96 1=1 5=1 6=9216 9=1
ConvolutionDepthWise     Conv_176                 1 1 664 929 -23330=4,3,11,11,96 0=96 1=3 3=2 4=1 5=1 6=864 7=96
Convolution              Conv_177                 1 1 929 669 -23330=4,3,11,11,96 0=96 1=1 5=1 6=9216 9=1
Concat                   Concat_179               2 1 661 669 670 -23330=4,3,11,11,192
ShuffleChannel           Reshape_184              1 1 670 675 -23330=4,3,11,11,192 0=2 1=1
Slice                    Gather_188               1 2 675 677 679 -23330=8,3,11,11,96,3,11,11,96 -23300=2,-233,-233
Convolution              Conv_189                 1 1 679 682 -23330=4,3,11,11,96 0=96 1=1 5=1 6=9216 9=1
ConvolutionDepthWise     Conv_191                 1 1 682 938 -23330=4,3,11,11,96 0=96 1=3 4=1 5=1 6=864 7=96
Convolution              Conv_192                 1 1 938 687 -23330=4,3,11,11,96 0=96 1=1 5=1 6=9216 9=1
Concat                   Concat_194               2 1 677 687 688 -23330=4,3,11,11,192
ShuffleChannel           Reshape_199              1 1 688 693 -23330=4,3,11,11,192 0=2 1=1
Slice                    Gather_203               1 2 693 695 697 -23330=8,3,11,11,96,3,11,11,96 -23300=2,-233,-233
Convolution              Conv_204                 1 1 697 700 -23330=4,3,11,11,96 0=96 1=1 5=1 6=9216 9=1
ConvolutionDepthWise     Conv_206                 1 1 700 947 -23330=4,3,11,11,96 0=96 1=3 4=1 5=1 6=864 7=96
Convolution              Conv_207                 1 1 947 705 -23330=4,3,11,11,96 0=96 1=1 5=1 6=9216 9=1
Concat                   Concat_209               2 1 695 705 706 -23330=4,3,11,11,192
ShuffleChannel           Reshape_214              1 1 706 711 -23330=4,3,11,11,192 0=2 1=1
Slice                    Gather_218               1 2 711 713 715 -23330=8,3,11,11,96,3,11,11,96 -23300=2,-233,-233
Convolution              Conv_219                 1 1 715 718 -23330=4,3,11,11,96 0=96 1=1 5=1 6=9216 9=1
ConvolutionDepthWise     Conv_221                 1 1 718 956 -23330=4,3,11,11,96 0=96 1=3 4=1 5=1 6=864 7=96
Convolution              Conv_222                 1 1 956 723 -23330=4,3,11,11,96 0=96 1=1 5=1 6=9216 9=1
Concat                   Concat_224               2 1 713 723 724 -23330=4,3,11,11,192
Split                    splitncnn_3              1 2 724 724_splitncnn_0 724_splitncnn_1 -23330=8,3,11,11,192,3,11,11,192
Convolution              Conv_225                 1 1 724_splitncnn_1 727 -23330=4,3,11,11,72 0=72 1=1 5=1 6=13824 9=1
Split                    splitncnn_4              1 2 727 727_splitncnn_0 727_splitncnn_1 -23330=8,3,11,11,72,3,11,11,72
ConvolutionDepthWise     Conv_227                 1 1 727_splitncnn_1 730 -23330=4,3,11,11,72 0=72 1=5 4=2 5=1 6=1800 7=72 9=1
Convolution              Conv_229                 1 1 730 968 -23330=4,3,11,11,72 0=72 1=1 5=1 6=5184
ConvolutionDepthWise     Conv_230                 1 1 968 735 -23330=4,3,11,11,72 0=72 1=5 4=2 5=1 6=1800 7=72 9=1
Convolution              Conv_232                 1 1 735 974 -23330=4,3,11,11,72 0=72 1=1 5=1 6=5184
Split                    splitncnn_5              1 2 974 974_splitncnn_0 974_splitncnn_1 -23330=8,3,11,11,72,3,11,11,72
ConvolutionDepthWise     Conv_233                 1 1 727_splitncnn_0 740 -23330=4,3,11,11,72 0=72 1=5 4=2 5=1 6=1800 7=72 9=1
Convolution              Conv_235                 1 1 740 980 -23330=4,3,11,11,72 0=72 1=1 5=1 6=5184
ConvolutionDepthWise     Conv_236                 1 1 980 745 -23330=4,3,11,11,72 0=72 1=5 4=2 5=1 6=1800 7=72 9=1
Convolution              Conv_238                 1 1 745 986 -23330=4,3,11,11,72 0=72 1=1 5=1 6=5184
Interp                   Resize_240               1 1 724_splitncnn_0 752 -23330=4,3,22,22,192 0=1 1=2.000000e+00 2=2.000000e+00
Concat                   Concat_241               2 1 752 656_splitncnn_0 753 -23330=4,3,22,22,288
Convolution              Conv_242                 1 1 753 756 -23330=4,3,22,22,72 0=72 1=1 5=1 6=20736 9=1
Split                    splitncnn_6              1 2 756 756_splitncnn_0 756_splitncnn_1 -23330=8,3,22,22,72,3,22,22,72
ConvolutionDepthWise     Conv_244                 1 1 756_splitncnn_1 759 -23330=4,3,22,22,72 0=72 1=5 4=2 5=1 6=1800 7=72 9=1
Convolution              Conv_246                 1 1 759 995 -23330=4,3,22,22,72 0=72 1=1 5=1 6=5184
ConvolutionDepthWise     Conv_247                 1 1 995 764 -23330=4,3,22,22,72 0=72 1=5 4=2 5=1 6=1800 7=72 9=1
Convolution              Conv_249                 1 1 764 1001 -23330=4,3,22,22,72 0=72 1=1 5=1 6=5184
Split                    splitncnn_7              1 2 1001 1001_splitncnn_0 1001_splitncnn_1 -23330=8,3,22,22,72,3,22,22,72
ConvolutionDepthWise     Conv_250                 1 1 756_splitncnn_0 769 -23330=4,3,22,22,72 0=72 1=5 4=2 5=1 6=1800 7=72 9=1
Convolution              Conv_252                 1 1 769 1007 -23330=4,3,22,22,72 0=72 1=1 5=1 6=5184
ConvolutionDepthWise     Conv_253                 1 1 1007 774 -23330=4,3,22,22,72 0=72 1=5 4=2 5=1 6=1800 7=72 9=1
Convolution              Conv_255                 1 1 774 1013 -23330=4,3,22,22,72 0=72 1=1 5=1 6=5184
Convolution              Conv_256                 1 1 1013 783 -23330=4,3,22,22,12 0=12 1=1 5=1 6=864 9=4
Convolution              Conv_257                 1 1 1001_splitncnn_1 784 -23330=4,3,22,22,3 0=3 1=1 5=1 6=216 9=4
Convolution              Conv_258                 1 1 1001_splitncnn_0 779 -23330=4,3,22,22,80 0=80 1=1 5=1 6=5760
Convolution              Conv_259                 1 1 986 788 -23330=4,3,11,11,12 0=12 1=1 5=1 6=864 9=4
Convolution              Conv_260                 1 1 974_splitncnn_1 789 -23330=4,3,11,11,3 0=3 1=1 5=1 6=216 9=4
Convolution              Conv_261                 1 1 974_splitncnn_0 782 -23330=4,3,11,11,80 0=80 1=1 5=1 6=5760
Permute                  Transpose_264            1 1 779 785 -23330=4,3,80,22,22 0=5
Softmax                  Softmax_265              1 1 785 786 -23330=4,3,80,22,22 0=2 1=1
Permute                  Transpose_266            1 1 786 787 -23330=4,3,22,22,80 0=5
Permute                  Transpose_269            1 1 782 790 -23330=4,3,80,11,11 0=5
Softmax                  Softmax_270              1 1 790 791 -23330=4,3,80,11,11 0=2 1=1
Permute                  Transpose_271            1 1 791 792 -23330=4,3,11,11,80 0=5
Concat                   Concat_272               3 1 783 784 787 793 -23330=4,3,22,22,95
Permute                  Transpose_273            1 1 793 794 -23330=4,3,95,22,22 0=3
Concat                   Concat_274               3 1 788 789 792 795 -23330=4,3,11,11,95
Permute                  Transpose_275            1 1 795 796 -23330=4,3,95,11,11 0=3
Noop                     output                   2 1 794 796 output


================================================
FILE: benchmark/yolov4-tiny.param
================================================
7767517
45 53
Input                    data                     0 1 data -23330=4,3,416,416,3 0=416 1=416 2=3
Convolution              0_25                     1 1 data 0_25_bn_leaky -23330=4,3,208,208,32 0=32 1=3 3=2 4=1 5=1 6=864 9=2 -23310=1,1.000000e-01
Convolution              1_33                     1 1 0_25_bn_leaky 1_33_bn_leaky -23330=4,3,104,104,64 0=64 1=3 3=2 4=1 5=1 6=18432 9=2 -23310=1,1.000000e-01
Convolution              2_41                     1 1 1_33_bn_leaky 2_41_bn_leaky -23330=4,3,104,104,64 0=64 1=3 4=1 5=1 6=36864 9=2 -23310=1,1.000000e-01
Split                    2_41_bn_leaky_split      1 2 2_41_bn_leaky 2_41_bn_leaky_split_0 2_41_bn_leaky_split_1 -23330=8,3,104,104,64,3,104,104,64
Crop                     3_49                     1 1 2_41_bn_leaky_split_0 3_49 -23330=4,3,104,104,32 2=32 3=104 4=104 5=32
Convolution              4_54                     1 1 3_49 4_54_bn_leaky -23330=4,3,104,104,32 0=32 1=3 4=1 5=1 6=9216 9=2 -23310=1,1.000000e-01
Split                    4_54_bn_leaky_split      1 2 4_54_bn_leaky 4_54_bn_leaky_split_0 4_54_bn_leaky_split_1 -23330=8,3,104,104,32,3,104,104,32
Convolution              5_62                     1 1 4_54_bn_leaky_split_0 5_62_bn_leaky -23330=4,3,104,104,32 0=32 1=3 4=1 5=1 6=9216 9=2 -23310=1,1.000000e-01
Concat                   6_70                     2 1 5_62_bn_leaky 4_54_bn_leaky_split_1 6_70 -23330=4,3,104,104,64
Convolution              7_73                     1 1 6_70 7_73_bn_leaky -23330=4,3,104,104,64 0=64 1=1 5=1 6=4096 9=2 -23310=1,1.000000e-01
Concat                   8_81                     2 1 2_41_bn_leaky_split_1 7_73_bn_leaky 8_81 -23330=4,3,104,104,128
Pooling                  9_84                     1 1 8_81 9_84 -23330=4,3,52,52,128 1=2 2=2 14=1 15=1 5=1
Convolution              10_88                    1 1 9_84 10_88_bn_leaky -23330=4,3,52,52,128 0=128 1=3 4=1 5=1 6=147456 9=2 -23310=1,1.000000e-01
Split                    10_88_bn_leaky_split     1 2 10_88_bn_leaky 10_88_bn_leaky_split_0 10_88_bn_leaky_split_1 -23330=8,3,52,52,128,3,52,52,128
Crop                     11_96                    1 1 10_88_bn_leaky_split_0 11_96 -23330=4,3,52,52,64 2=64 3=52 4=52 5=64
Convolution              12_101                   1 1 11_96 12_101_bn_leaky -23330=4,3,52,52,64 0=64 1=3 4=1 5=1 6=36864 9=2 -23310=1,1.000000e-01
Split                    12_101_bn_leaky_split    1 2 12_101_bn_leaky 12_101_bn_leaky_split_0 12_101_bn_leaky_split_1 -23330=8,3,52,52,64,3,52,52,64
Convolution              13_109                   1 1 12_101_bn_leaky_split_0 13_109_bn_leaky -23330=4,3,52,52,64 0=64 1=3 4=1 5=1 6=36864 9=2 -23310=1,1.000000e-01
Concat                   14_117                   2 1 13_109_bn_leaky 12_101_bn_leaky_split_1 14_117 -23330=4,3,52,52,128
Convolution              15_120                   1 1 14_117 15_120_bn_leaky -23330=4,3,52,52,128 0=128 1=1 5=1 6=16384 9=2 -23310=1,1.000000e-01
Concat                   16_128                   2 1 10_88_bn_leaky_split_1 15_120_bn_leaky 16_128 -23330=4,3,52,52,256
Pooling                  17_131                   1 1 16_128 17_131 -23330=4,3,26,26,256 1=2 2=2 14=1 15=1 5=1
Convolution              18_135                   1 1 17_131 18_135_bn_leaky -23330=4,3,26,26,256 0=256 1=3 4=1 5=1 6=589824 9=2 -23310=1,1.000000e-01
Split                    18_135_bn_leaky_split    1 2 18_135_bn_leaky 18_135_bn_leaky_split_0 18_135_bn_leaky_split_1 -23330=8,3,26,26,256,3,26,26,256
Crop                     19_143                   1 1 18_135_bn_leaky_split_0 19_143 -23330=4,3,26,26,128 2=128 3=26 4=26 5=128
Convolution              20_148                   1 1 19_143 20_148_bn_leaky -23330=4,3,26,26,128 0=128 1=3 4=1 5=1 6=147456 9=2 -23310=1,1.000000e-01
Split                    20_148_bn_leaky_split    1 2 20_148_bn_leaky 20_148_bn_leaky_split_0 20_148_bn_leaky_split_1 -23330=8,3,26,26,128,3,26,26,128
Convolution              21_156                   1 1 20_148_bn_leaky_split_0 21_156_bn_leaky -23330=4,3,26,26,128 0=128 1=3 4=1 5=1 6=147456 9=2 -23310=1,1.000000e-01
Concat                   22_164                   2 1 21_156_bn_leaky 20_148_bn_leaky_split_1 22_164 -23330=4,3,26,26,256
Convolution              23_167                   1 1 22_164 23_167_bn_leaky -23330=4,3,26,26,256 0=256 1=1 5=1 6=65536 9=2 -23310=1,1.000000e-01
Split                    23_167_bn_leaky_split    1 2 23_167_bn_leaky 23_167_bn_leaky_split_0 23_167_bn_leaky_split_1 -23330=8,3,26,26,256,3,26,26,256
Concat                   24_175                   2 1 18_135_bn_leaky_split_1 23_167_bn_leaky_split_0 24_175 -23330=4,3,26,26,512
Pooling                  25_178                   1 1 24_175 25_178 -23330=4,3,13,13,512 1=2 2=2 14=1 15=1 5=1
Convolution              26_182                   1 1 25_178 26_182_bn_leaky -23330=4,3,13,13,512 0=512 1=3 4=1 5=1 6=2359296 9=2 -23310=1,1.000000e-01
Convolution              27_192                   1 1 26_182_bn_leaky 27_192_bn_leaky -23330=4,3,13,13,256 0=256 1=1 5=1 6=131072 9=2 -23310=1,1.000000e-01
Split                    27_192_bn_leaky_split    1 2 27_192_bn_leaky 27_192_bn_leaky_split_0 27_192_bn_leaky_split_1 -23330=8,3,13,13,256,3,13,13,256
Convolution              28_200                   1 1 27_192_bn_leaky_split_0 28_200_bn_leaky -23330=4,3,13,13,512 0=512 1=3 4=1 5=1 6=1179648 9=2 -23310=1,1.000000e-01
Convolution              29_208                   1 1 28_200_bn_leaky 29_208 -23330=4,3,13,13,255 0=255 1=1 5=1 6=130560
Convolution              32_237                   1 1 27_192_bn_leaky_split_1 32_237_bn_leaky -23330=4,3,13,13,128 0=128 1=1 5=1 6=32768 9=2 -23310=1,1.000000e-01
Interp                   33_245                   1 1 32_237_bn_leaky 33_245 -23330=4,3,26,26,128 0=1 1=2.000000e+00 2=2.000000e+00
Concat                   34_248                   2 1 33_245 23_167_bn_leaky_split_1 34_248 -23330=4,3,26,26,384
Convolution              35_251                   1 1 34_248 35_251_bn_leaky -23330=4,3,26,26,256 0=256 1=3 4=1 5=1 6=884736 9=2 -23310=1,1.000000e-01
Convolution              36_259                   1 1 35_251_bn_leaky 36_259 -23330=4,3,26,26,255 0=255 1=1 5=1 6=65280
Yolov3DetectionOutput    detection_out            2 1 29_208 36_259 output -23330=4,2,6,1637,1 0=80 1=3 2=3.000001e-01 -23304=12,1.000000e+01,1.400000e+01,2.300000e+01,2.700000e+01,3.700000e+01,5.800000e+01,8.100000e+01,8.200000e+01,1.350000e+02,1.690000e+02,3.440000e+02,3.190000e+02 -23305=6,1077936128,1082130432,1084227584,1065353216,1073741824,1077936128 -23306=2,3.360000e+01,1.680000e+01


================================================
FILE: build-android.cmd
================================================
:: Set android ndk root
:: Builds and installs ncnn for every Android ABI listed below.
:: Each section creates its own build directory, configures with the NDK
:: CMake toolchain file using the "Unix Makefiles" generator and the make.exe
:: shipped inside the NDK, builds in parallel, then runs the install target.
:: Edit ANDROID_NDK below before running.
:: Paths derived from ANDROID_NDK are quoted so an NDK root containing spaces
:: is passed to cmake as a single argument.
@ECHO OFF
@SETLOCAL
@SET ANDROID_NDK=<your-ndk-root_path, such as"E:\android-ndk-r27">

:: Set ninja.exe
:: @SET NINJA_EXE=<your-ninja-exe_path, such as"D:\android\sdk\cmake\3.10.2.4988404\bin\ninja.exe">

:: android armv7 (NEON enabled, API level 19)
mkdir build-android-armv7-vulkan
pushd build-android-armv7-vulkan
cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE="%ANDROID_NDK%/build/cmake/android.toolchain.cmake" -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd

:: android aarch64 (API level 21)
mkdir build-android-aarch64-vulkan
pushd build-android-aarch64-vulkan
cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE="%ANDROID_NDK%/build/cmake/android.toolchain.cmake" -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd

:: android x86 (API level 19)
mkdir build-android-x86
pushd build-android-x86
cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE="%ANDROID_NDK%/build/cmake/android.toolchain.cmake" -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd

:: android x86_64 (API level 21)
mkdir build-android-x86_64
pushd build-android-x86_64
cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE="%ANDROID_NDK%/build/cmake/android.toolchain.cmake" -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd

:: android riscv64 (API level 35)
mkdir build-android-riscv64
pushd build-android-riscv64
cmake -G "Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE="%ANDROID_NDK%/build/cmake/android.toolchain.cmake" -DCMAKE_MAKE_PROGRAM="%ANDROID_NDK%/prebuilt/windows-x86_64/bin/make.exe" -DANDROID_ABI="riscv64" -DANDROID_PLATFORM=android-35 -DNCNN_VULKAN=ON ..
cmake --build . --parallel %NUMBER_OF_PROCESSORS%
cmake --build . --target install
popd

@ENDLOCAL


================================================
FILE: build.sh
================================================
#!/usr/bin/env bash

# Collection of build recipes. Every section creates its own build-* directory,
# configures ncnn with cmake from inside it, then runs "make" and "make install".
# The Android sections expect ANDROID_NDK to hold the NDK root; the toolchain
# path is quoted so an NDK root containing spaces is not word-split.
# A failing section does not abort the script; later sections still run.

##### android armv7 without neon
mkdir -p build-android-armv7-without-neon
pushd build-android-armv7-without-neon
cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=OFF -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
make -j4
make install
popd

##### android armv7
mkdir -p build-android-armv7
pushd build-android-armv7
cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
make -j4
make install
popd

##### android aarch64
mkdir -p build-android-aarch64
pushd build-android-aarch64
cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
make -j4
make install
popd

##### android x86
mkdir -p build-android-x86
pushd build-android-x86
cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" -DANDROID_ABI="x86" -DANDROID_PLATFORM=android-19 -DNCNN_VULKAN=ON ..
make -j4
make install
popd

##### android x86_64
mkdir -p build-android-x86_64
pushd build-android-x86_64
cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" -DANDROID_ABI="x86_64" -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..
make -j4
make install
popd

##### android riscv64
mkdir -p build-android-riscv64
pushd build-android-riscv64
cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" -DANDROID_ABI="riscv64" -DANDROID_PLATFORM=android-35 -DNCNN_VULKAN=ON ..
make -j4
make install
popd

# Configure, build and install ncnn inside a dedicated build directory.
#   $1   build directory, created when missing
#   $2   job count handed to "make -j"
#   $3+  extra arguments placed before the source dir ".." on the cmake line
ncnn_build() {
    local build_dir=$1
    local jobs=$2
    shift 2

    mkdir -p "$build_dir"
    pushd "$build_dir"
    cmake "$@" ..
    make -j"$jobs"
    make install
    popd
}

##### linux of hisiv300 (forgot the chip name) toolchain with neon and openmp
ncnn_build build-hisiv300-linux 4 -DCMAKE_TOOLCHAIN_FILE=../toolchains/hisiv300.toolchain.cmake

##### linux of hisiv500 (Hi3516CV200 and Hi3519V101) toolchain with neon and openmp
ncnn_build build-hisiv500-linux 4 -DCMAKE_TOOLCHAIN_FILE=../toolchains/hisiv500.toolchain.cmake

##### linux of hisiv600 (Hi3559V100) toolchain with neon and no openmp (single cpu, so openmp is disabled)
ncnn_build build-hisiv600-linux 4 -DCMAKE_TOOLCHAIN_FILE=../toolchains/hisiv600.toolchain.cmake

##### linux of himix100 (Hi3559a) toolchain with neon and openmp
ncnn_build build-himix100-linux 4 -DCMAKE_TOOLCHAIN_FILE=../toolchains/himix100.toolchain.cmake

##### linux of arm-linux-gnueabi toolchain
ncnn_build build-arm-linux-gnueabi 4 -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake

##### linux of arm-linux-gnueabihf toolchain
ncnn_build build-arm-linux-gnueabihf 4 -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake

##### linux of v831 toolchain with neon and openmp
ncnn_build build-v831-linux 4 -DCMAKE_TOOLCHAIN_FILE=../toolchains/v831.toolchain.cmake

##### linux for aarch64-linux-gnu toolchain
ncnn_build build-aarch64-linux-gnu 4 -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake

##### linux host system with gcc/g++
ncnn_build build-host-gcc-linux 4 -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc.toolchain.cmake

##### MacOS (OpenMP off, benchmark on, 8 make jobs)
ncnn_build build-mac 8 \
        -DNCNN_OPENMP=OFF \
        -DNCNN_BENCHMARK=ON


================================================
FILE: cmake/ncnnConfig.cmake.in
================================================
# Package configuration template for ncnn.
# The @VAR@ placeholders below are filled in when this template is configured,
# recording the options the installed library was built with. The rest of the
# file locates the dependencies those options imply and then loads the exported
# ncnn targets.
set(NCNN_VERSION @NCNN_VERSION@)
set(NCNN_OPENMP @NCNN_OPENMP@)
set(NCNN_THREADS @NCNN_THREADS@)
set(NCNN_VULKAN @NCNN_VULKAN@)
set(NCNN_SHARED_LIB @NCNN_SHARED_LIB@)
set(NCNN_SYSTEM_GLSLANG @NCNN_SYSTEM_GLSLANG@)
set(NCNN_SIMPLEVK @NCNN_SIMPLEVK@)

# OpenMP is looked up without REQUIRED, so a missing OpenMP is not fatal here.
if(NCNN_OPENMP)
    find_package(OpenMP)
endif()

# Threads is mandatory when enabled; pthread is preferred where available.
if(NCNN_THREADS)
    set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
    set(THREADS_PREFER_PTHREAD_FLAG TRUE)
    find_package(Threads REQUIRED)
endif()

if(NCNN_VULKAN)
    # The Vulkan package is only required when NCNN_SIMPLEVK is off.
    if(NOT NCNN_SIMPLEVK)
        find_package(Vulkan REQUIRED)
    endif()

    # glslang is only located for non-shared builds.
    # NOTE(review): presumably because static consumers must link glslang
    # themselves - confirm against the export logic.
    if(NOT NCNN_SHARED_LIB)
        if(NCNN_SYSTEM_GLSLANG)
            # Try the system packages quietly first.
            find_package(SPIRV-Tools QUIET)
            find_package(SPIRV-Tools-opt QUIET)
            find_package(glslang QUIET)
            if(NOT glslang_FOUND)
                # Fall back to including the individual target files from the
                # directory recorded at configure time.
                set(GLSLANG_TARGET_DIR "@GLSLANG_TARGET_DIR@")
                include(${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake)
                include(${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake)
                if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
                    # hlsl support can be optional
                    include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake")
                endif()
                include(${GLSLANG_TARGET_DIR}/glslangTargets.cmake)
                include(${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake)
            endif()
        else()
            # Point find_package at a glslang config located relative to this
            # file, under the install libdir.
            set(glslang_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../@CMAKE_INSTALL_LIBDIR@/cmake/glslang")
            find_package(glslang QUIET)
        endif()
    endif()
endif()

# Load the ncnn targets file that sits next to this config file.
include(${CMAKE_CURRENT_LIST_DIR}/ncnn.cmake)

# Report success only if the ncnn target now exists.
if(TARGET ncnn)
    set(ncnn_FOUND TRUE)
    if(NOT ncnn_FIND_QUIETLY)
        message(STATUS "Found ncnn: ${NCNN_VERSION}")
    endif()
endif()


================================================
FILE: cmake/ncnn_add_layer.cmake
================================================

# ncnn_add_arch_opt_layer(<class> <NCNN_TARGET_ARCH_OPT> <NCNN_TARGET_ARCH_OPT_CFLAGS>)
#
# Registers an ISA-specific variant of a layer, generated from the layer's
# base arch implementation.
#
#   class                        layer class name, used in generated declarations and registry entries
#   NCNN_TARGET_ARCH_OPT         ISA suffix (e.g. avx512); selects the generator script and registry variable
#   NCNN_TARGET_ARCH_OPT_CFLAGS  compile flags applied to the generated .cpp
#
# Reads `name` and `NCNN_TARGET_ARCH` from the calling scope; they are not
# parameters. Because this is a macro, every variable set here is visible to
# the caller. Appends to `ncnn_SRCS`, `layer_declaration` and
# `layer_registry_<opt>`.
macro(ncnn_add_arch_opt_layer class NCNN_TARGET_ARCH_OPT NCNN_TARGET_ARCH_OPT_CFLAGS)
    # Hand-written base arch implementation in the source tree.
    set(NCNN_${NCNN_TARGET_ARCH}_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.h)
    set(NCNN_${NCNN_TARGET_ARCH}_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.cpp)

    if(WITH_LAYER_${name} AND EXISTS ${NCNN_${NCNN_TARGET_ARCH}_HEADER} AND EXISTS ${NCNN_${NCNN_TARGET_ARCH}_SOURCE})

        # Generated ISA-specific copies are written to the binary tree.
        set(NCNN_${NCNN_TARGET_ARCH_OPT}_HEADER ${CMAKE_CURRENT_BINARY_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}.h)
        set(NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE ${CMAKE_CURRENT_BINARY_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}.cpp)

        # Build-time rule: run ncnn_generate_<opt>_source.cmake in script mode
        # with SRC/DST/CLASS to produce the header from the base arch header.
        add_custom_command(
            OUTPUT ${NCNN_${NCNN_TARGET_ARCH_OPT}_HEADER}
            COMMAND ${CMAKE_COMMAND} -DSRC=${NCNN_${NCNN_TARGET_ARCH}_HEADER} -DDST=${NCNN_${NCNN_TARGET_ARCH_OPT}_HEADER} -DCLASS=${class} -P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_${NCNN_TARGET_ARCH_OPT}_source.cmake"
            DEPENDS ${NCNN_${NCNN_TARGET_ARCH}_HEADER}
            COMMENT "Generating source ${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}.h"
            VERBATIM
        )
        set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT}_HEADER} PROPERTIES GENERATED TRUE)

        # Same generator script, applied to the .cpp.
        add_custom_command(
            OUTPUT ${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE}
            COMMAND ${CMAKE_COMMAND} -DSRC=${NCNN_${NCNN_TARGET_ARCH}_SOURCE} -DDST=${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE} -DCLASS=${class} -P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_${NCNN_TARGET_ARCH_OPT}_source.cmake"
            DEPENDS ${NCNN_${NCNN_TARGET_ARCH}_SOURCE}
            COMMENT "Generating source ${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}.cpp"
            VERBATIM
        )
        set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE} PROPERTIES GENERATED TRUE)

        # Only the generated .cpp is compiled with the ISA flags.
        set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE} PROPERTIES COMPILE_FLAGS ${NCNN_TARGET_ARCH_OPT_CFLAGS})

        list(APPEND ncnn_SRCS ${NCNN_${NCNN_TARGET_ARCH_OPT}_HEADER} ${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE})

        # generate layer_declaration and layer_registry file
        set(layer_declaration "${layer_declaration}#include \"layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}.h\"\n")
        set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}) }\n")

        set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", ${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}_layer_creator},\n#else\n{${class}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}_layer_creator},\n#endif\n")
    else()
        # no isa optimized version
        # The registry still receives one entry per layer in this branch:
        # the generic creator when the layer is enabled, otherwise 0.
        if(WITH_LAYER_${name})
            set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", ${class}_layer_creator},\n#else\n{${class}_layer_creator},\n#endif\n")
        else()
            set(layer_registry_${NCNN_TARGET_ARCH_OPT} "${layer_registry_${NCNN_TARGET_ARCH_OPT}}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n")
        endif()
    endif()
endmacro()

# ncnn_add_arch_opt_source(<class> <NCNN_TARGET_ARCH_OPT> <NCNN_TARGET_ARCH_OPT_CFLAGS>)
#
# Adds an existing ISA-specific source file
# layer/<arch>/<name>_<arch>_<opt>.cpp from the source tree to `ncnn_SRCS`,
# if the layer is enabled and the file exists. Nothing is generated and no
# registry entry is written.
#
#   class                        accepted for signature parity; not referenced in the body
#   NCNN_TARGET_ARCH_OPT         ISA suffix used in the file name
#   NCNN_TARGET_ARCH_OPT_CFLAGS  compile flags, applied only when NCNN_RUNTIME_CPU is set
#
# Reads `name` and `NCNN_TARGET_ARCH` from the calling scope.
macro(ncnn_add_arch_opt_source class NCNN_TARGET_ARCH_OPT NCNN_TARGET_ARCH_OPT_CFLAGS)
    set(NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT}.cpp)

    if(WITH_LAYER_${name} AND EXISTS ${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE})
        # Per-file ISA flags are only set for runtime CPU dispatch builds.
        if(NCNN_RUNTIME_CPU)
            set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE} PROPERTIES COMPILE_FLAGS ${NCNN_TARGET_ARCH_OPT_CFLAGS})
        endif()
        list(APPEND ncnn_SRCS ${NCNN_${NCNN_TARGET_ARCH_OPT}_SOURCE})
    endif()
endmacro()

# ncnn_add_arch_opt_layer_source(<class> <NCNN_TARGET_ARCH_OPT_BASE> <NCNN_TARGET_ARCH_OPT> <NCNN_TARGET_ARCH_OPT_CFLAGS>)
#
# Generates layer/<arch>/<name>_<arch>_<base>_<opt>.cpp in the binary tree
# from the existing layer/<arch>/<name>_<arch>_<base>.cpp in the source tree,
# and appends the generated file to `ncnn_SRCS`. Does nothing when the layer
# is disabled or the base source file does not exist.
#
#   class                        passed to the generator script as CLASS
#   NCNN_TARGET_ARCH_OPT_BASE    suffix of the existing source file used as input
#   NCNN_TARGET_ARCH_OPT         ISA suffix; selects ncnn_generate_<opt>_source.cmake
#   NCNN_TARGET_ARCH_OPT_CFLAGS  compile flags, applied only when NCNN_RUNTIME_CPU is set
#
# Reads `name` and `NCNN_TARGET_ARCH` from the calling scope.
macro(ncnn_add_arch_opt_layer_source class NCNN_TARGET_ARCH_OPT_BASE NCNN_TARGET_ARCH_OPT NCNN_TARGET_ARCH_OPT_CFLAGS)
    set(NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT_BASE}.cpp)

    if(WITH_LAYER_${name} AND EXISTS ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE})

        set(NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE ${CMAKE_CURRENT_BINARY_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}.cpp)

        # Build-time rule: run the generator script in script mode with
        # SRC/DST/CLASS to produce the output from the base source.
        add_custom_command(
            OUTPUT ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE}
            COMMAND ${CMAKE_COMMAND} -DSRC=${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE} -DDST=${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE} -DCLASS=${class} -P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_${NCNN_TARGET_ARCH_OPT}_source.cmake"
            DEPENDS ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_SOURCE}
            COMMENT "Generating source ${name}_${NCNN_TARGET_ARCH}_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}.cpp"
            VERBATIM
        )
        set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE} PROPERTIES GENERATED TRUE)

        # Per-file ISA flags are only set for runtime CPU dispatch builds.
        if(NCNN_RUNTIME_CPU)
            set_source_files_properties(${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE} PROPERTIES COMPILE_FLAGS ${NCNN_TARGET_ARCH_OPT_CFLAGS})
        endif()
        list(APPEND ncnn_SRCS ${NCNN_${NCNN_TARGET_ARCH_OPT_BASE}_${NCNN_TARGET_ARCH_OPT}_SOURCE})
    endif()
endmacro()

macro(ncnn_add_layer class)
    string(TOLOWER ${class} name)

    # WITH_LAYER_xxx option
    if(${ARGC} EQUAL 2)
        option(WITH_LAYER_${name} "build with layer ${name}" ${ARGV1})
    else()
        option(WITH_LAYER_${name} "build with layer ${name}" ON)
    endif()

    if(NCNN_CMAKE_VERBOSE)
        message(STATUS "WITH_LAYER_${name} = ${WITH_LAYER_${name}}")
    endif()

    if(WITH_LAYER_${name})
        list(APPEND ncnn_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/layer/${name}.cpp)

        # look for arch specific implementation and append source
        # optimized implementation for armv7, aarch64 or x86
        set(LAYER_ARCH_SRC ${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.cpp)
        if(EXISTS ${LAYER_ARCH_SRC})
            set(WITH_LAYER_${name}_${NCNN_TARGET_ARCH} 1)
            list(APPEND ncnn_SRCS ${LAYER_ARCH_SRC})
        endif()

        set(LAYER_VULKAN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/layer/vulkan/${name}_vulkan.cpp)
        if(NCNN_VULKAN AND EXISTS ${LAYER_VULKAN_SRC})
            set(WITH_LAYER_${name}_vulkan 1)
            list(APPEND ncnn_SRCS ${LAYER_VULKAN_SRC})
        endif()
    endif()

    # generate layer_declaration and layer_registry file
    if(WITH_LAYER_${name})
        set(layer_declaration "${layer_declaration}#include \"layer/${name}.h\"\n")
        set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}) }\n")

        source_group ("sources\\\\layers" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/${name}.cpp")
    endif()

    if(WITH_LAYER_${name}_${NCNN_TARGET_ARCH})
        set(layer_declaration "${layer_declaration}#include \"layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.h\"\n")
        set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_${NCNN_TARGET_ARCH}) }\n")

        source_group ("sources\\\\layers\\\\${NCNN_TARGET_ARCH}" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}/${name}_${NCNN_TARGET_ARCH}.cpp")
    endif()

    if(WITH_LAYER_${name}_vulkan)
        set(layer_declaration "${layer_declaration}#include \"layer/vulkan/${name}_vulkan.h\"\n")
        set(layer_declaration "${layer_declaration}namespace ncnn { DEFINE_LAYER_CREATOR(${class}_vulkan) }\n")

        file(GLOB NCNN_SHADER_SRCS "layer/vulkan/shader/${name}.comp")
        file(GLOB NCNN_SHADER_SUBSRCS "layer/vulkan/shader/${name}_*.comp")
        list(APPEND NCNN_SHADER_SRCS ${NCNN_SHADER_SUBSRCS})
        foreach(NCNN_SHADER_SRC ${NCNN_SHADER_SRCS})
            ncnn_add_shader(${NCNN_SHADER_SRC})
        endforeach()

        source_group ("sources\\\\layers\\\\vulkan" FILES "${CMAKE_CURRENT_SOURCE_DIR}/layer/vulkan/${name}_vulkan.cpp")
    endif()

    if(WITH_LAYER_${name})
        set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", ${class}_layer_creator},\n#else\n{${class}_layer_creator},\n#endif\n")
    else()
        set(layer_registry "${layer_registry}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n")
    endif()

    if(WITH_LAYER_${name}_${NCNN_TARGET_ARCH})
        set(layer_registry_arch "${layer_registry_arch}#if NCNN_STRING\n{\"${class}\", ${class}_${NCNN_TARGET_ARCH}_layer_creator},\n#else\n{${class}_${NCNN_TARGET_ARCH}_layer_creator},\n#endif\n")
    else()
        set(layer_registry_arch "${layer_registry_arch}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n")
    endif()

    if(WITH_LAYER_${name}_vulkan)
        set(layer_registry_vulkan "${layer_registry_vulkan}#if NCNN_STRING\n{\"${class}\", ${class}_vulkan_layer_creator},\n#else\n{${class}_vulkan_layer_creator},\n#endif\n")
    else()
        set(layer_registry_vulkan "${layer_registry_vulkan}#if NCNN_STRING\n{\"${class}\", 0},\n#else\n{0},\n#endif\n")
    endif()

    if(NCNN_TARGET_ARCH STREQUAL "x86")
        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
            if(NCNN_RUNTIME_CPU AND NCNN_AVX512)
                ncnn_add_arch_opt_layer(${class} avx512 "/arch:AVX512 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_FMA)
                ncnn_add_arch_opt_layer(${class} fma "/arch:AVX /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVX)
                ncnn_add_arch_opt_layer(${class} avx "/arch:AVX /D__SSSE3__ /D__SSE4_1__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVX512VNNI)
                ncnn_add_arch_opt_source(${class} avx512vnni "/arch:AVX512 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVX512VNNI__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVX512BF16)
                ncnn_add_arch_opt_source(${class} avx512bf16 "/arch:AVX512 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVX512BF16__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVX512FP16)
                ncnn_add_arch_opt_source(${class} avx512fp16 "/arch:AVX512 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVX512FP16__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNI)
                ncnn_add_arch_opt_source(${class} avxvnni "/arch:AVX2 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXVNNI__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNIINT8)
                ncnn_add_arch_opt_source(${class} avxvnniint8 "/arch:AVX2 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXVNNI__ /D__AVXVNNIINT8__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNIINT16)
                ncnn_add_arch_opt_source(${class} avxvnniint16 "/arch:AVX2 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXVNNI__ /D__AVXVNNIINT16__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVXNECONVERT)
                ncnn_add_arch_opt_source(${class} avxneconvert "/arch:AVX2 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXNECONVERT__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVX2)
                ncnn_add_arch_opt_source(${class} avx2 "/arch:AVX2 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_XOP)
                ncnn_add_arch_opt_source(${class} xop "/arch:AVX /D__SSSE3__ /D__SSE4_1__ /D__XOP__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_F16C)
                ncnn_add_arch_opt_source(${class} f16c "/arch:AVX /D__SSSE3__ /D__SSE4_1__ /D__F16C__")
            endif()
        elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")
            if(NCNN_RUNTIME_CPU AND NCNN_AVX512)
                ncnn_add_arch_opt_layer(${class} avx512 "/arch:AVX512 -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_FMA)
                ncnn_add_arch_opt_layer(${class} fma "/arch:AVX -mfma -mf16c /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVX)
                ncnn_add_arch_opt_layer(${class} avx "/arch:AVX /D__SSSE3__ /D__SSE4_1__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVX512VNNI)
                ncnn_add_arch_opt_source(${class} avx512vnni "/arch:AVX512 -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c -mavx512vnni /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVX512VNNI__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVX512BF16)
                ncnn_add_arch_opt_source(${class} avx512bf16 "/arch:AVX512 -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c -mavx512bf16 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVX512BF16__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVX512FP16)
                ncnn_add_arch_opt_source(${class} avx512fp16 "/arch:AVX512 -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c -mavx512fp16 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVX512FP16__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNI)
                ncnn_add_arch_opt_source(${class} avxvnni "/arch:AVX2 -mfma -mf16c -mavxvnni /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXVNNI__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNIINT8)
                ncnn_add_arch_opt_source(${class} avxvnniint8 "/arch:AVX2 -mfma -mf16c -mavxvnni -mavxvnniint8 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXVNNI__ /D__AVXVNNIINT8__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNIINT16)
                ncnn_add_arch_opt_source(${class} avxvnniint16 "/arch:AVX2 -mfma -mf16c -mavxvnni -mavxvnniint16 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXVNNI__ /D__AVXVNNIINT16__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVXNECONVERT)
                ncnn_add_arch_opt_source(${class} avxneconvert "/arch:AVX2 -mfma -mf16c -mavxneconvert /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__ /D__AVXNECONVERT__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVX2)
                ncnn_add_arch_opt_source(${class} avx2 "/arch:AVX2 -mfma -mf16c /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_XOP)
                ncnn_add_arch_opt_source(${class} xop "/arch:AVX -mxop /D__SSSE3__ /D__SSE4_1__ /D__XOP__")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_F16C)
                ncnn_add_arch_opt_source(${class} f16c "/arch:AVX -mf16c /D__SSSE3__ /D__SSE4_1__ /D__F16C__")
            endif()
        else()
            if(NCNN_RUNTIME_CPU AND NCNN_AVX512)
                ncnn_add_arch_opt_layer(${class} avx512 "-mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_FMA)
                ncnn_add_arch_opt_layer(${class} fma "-mavx -mfma -mf16c")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVX)
                ncnn_add_arch_opt_layer(${class} avx "-mavx")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVX512VNNI)
                ncnn_add_arch_opt_source(${class} avx512vnni "-mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c -mavx512vnni")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVX512BF16)
                ncnn_add_arch_opt_source(${class} avx512bf16 "-mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c -mavx512bf16")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVX512FP16)
                ncnn_add_arch_opt_source(${class} avx512fp16 "-mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c -mavx512fp16")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNI)
                ncnn_add_arch_opt_source(${class} avxvnni "-mavx2 -mfma -mf16c -mavxvnni")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNIINT8)
                ncnn_add_arch_opt_source(${class} avxvnniint8 "-mavx2 -mfma -mf16c -mavxvnni -mavxvnniint8")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVXVNNIINT16)
                ncnn_add_arch_opt_source(${class} avxvnniint16 "-mavx2 -mfma -mf16c -mavxvnni -mavxvnniint16")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVXNECONVERT)
                ncnn_add_arch_opt_source(${class} avxneconvert "-mavx2 -mfma -mf16c -mavxneconvert")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_AVX2)
                ncnn_add_arch_opt_source(${class} avx2 "-mavx2 -mfma -mf16c")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_XOP)
                ncnn_add_arch_opt_source(${class} xop "-mavx -mxop")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_F16C)
                ncnn_add_arch_opt_source(${class} f16c "-mavx -mf16c")
            endif()
        endif()
    endif()

    if(NCNN_TARGET_ARCH STREQUAL "arm" AND (CMAKE_SIZEOF_VOID_P EQUAL 4 AND NOT NCNN_TARGET_ILP32))
        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
            if(NCNN_VFPV4)
                ncnn_add_arch_opt_source(${class} vfpv4 "/arch:VFPv4 /D__ARM_FP=0x0E")
            endif()
        else()
            if(NCNN_VFPV4)
                if(NCNN_COMPILER_SUPPORT_ARM_VFPV4)
                    ncnn_add_arch_opt_source(${class} vfpv4 "-mfpu=neon-vfpv4")
                elseif(NCNN_COMPILER_SUPPORT_ARM_VFPV4_FP16)
                    ncnn_add_arch_opt_source(${class} vfpv4 "-mfpu=neon-vfpv4 -mfp16-format=ieee")
                endif()
            endif()
        endif()
    endif()

    if(NCNN_TARGET_ARCH STREQUAL "arm" AND (CMAKE_SIZEOF_VOID_P EQUAL 8 OR NCNN_TARGET_ILP32))
        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
            if(NCNN_VFPV4)
                ncnn_add_arch_opt_source(${class} vfpv4 " ")
            endif()
            if(NCNN_ARM82)
                ncnn_add_arch_opt_source(${class} asimdhp "/arch:armv8.2 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM82DOT)
                ncnn_add_arch_opt_source(${class} asimddp "/arch:armv8.2 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM82FP16FML)
                ncnn_add_arch_opt_source(${class} asimdfhm "/arch:armv8.2 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_FP16_FML")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM84BF16)
                ncnn_add_arch_opt_source(${class} bf16 "/arch:armv8.4 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD /D__ARM_FEATURE_FP16_FML /D__ARM_FEATURE_BF16_VECTOR_ARITHMETIC")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM84I8MM)
                ncnn_add_arch_opt_source(${class} i8mm "/arch:armv8.4 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD /D__ARM_FEATURE_FP16_FML /D__ARM_FEATURE_MATMUL_INT8")
            endif()
            # TODO add support for sve family
            if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVE)
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVE2)
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEBF16)
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEI8MM)
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEF32MM)
            endif()
        elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")
            if(NCNN_VFPV4)
                ncnn_add_arch_opt_source(${class} vfpv4 " ")
            endif()
            if(NCNN_ARM82)
                ncnn_add_arch_opt_source(${class} asimdhp "/arch:armv8.2 -march=armv8.2-a+fp16 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM82DOT)
                ncnn_add_arch_opt_source(${class} asimddp "/arch:armv8.2 -march=armv8.2-a+fp16+dotprod /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM82FP16FML)
                ncnn_add_arch_opt_source(${class} asimdfhm "/arch:armv8.2 -march=armv8.2-a+fp16+fp16fml /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_FP16_FML")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM84BF16)
                ncnn_add_arch_opt_source(${class} bf16 "/arch:armv8.4 -march=armv8.4-a+fp16+dotprod+bf16 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD /D__ARM_FEATURE_FP16_FML /D__ARM_FEATURE_BF16_VECTOR_ARITHMETIC")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM84I8MM)
                ncnn_add_arch_opt_source(${class} i8mm "/arch:armv8.4 -march=armv8.4-a+fp16+dotprod+i8mm /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD /D__ARM_FEATURE_FP16_FML /D__ARM_FEATURE_MATMUL_INT8")
            endif()
            # TODO add support for sve family
            if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVE)
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVE2)
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEBF16)
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEI8MM)
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEF32MM)
            endif()
        else()
            if(NCNN_VFPV4)
                ncnn_add_arch_opt_source(${class} vfpv4 " ")
            endif()
            if(NCNN_ARM82)
                ncnn_add_arch_opt_source(${class} asimdhp "-march=armv8.2-a+fp16")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM82DOT)
                ncnn_add_arch_opt_source(${class} asimddp "-march=armv8.2-a+fp16+dotprod")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM82FP16FML)
                # clang 9.0.9 shipped with android ndk-r21 is missing __ARM_FEATURE_FP16_FML macro for asimdfhm target
                ncnn_add_arch_opt_source(${class} asimdfhm "-march=armv8.2-a+fp16+fp16fml -D__ARM_FEATURE_FP16_FML")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM84BF16)
                ncnn_add_arch_opt_source(${class} bf16 "-march=armv8.4-a+fp16+dotprod+bf16")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM84I8MM)
                ncnn_add_arch_opt_source(${class} i8mm "-march=armv8.4-a+fp16+dotprod+i8mm")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVE)
                ncnn_add_arch_opt_source(${class} sve "-march=armv8.6-a+fp16+dotprod+sve")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVE2)
                ncnn_add_arch_opt_source(${class} sve2 "-march=armv8.6-a+fp16+dotprod+sve2")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEBF16)
                ncnn_add_arch_opt_source(${class} svebf16 "-march=armv8.6-a+fp16+dotprod+sve+bf16")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEI8MM)
                ncnn_add_arch_opt_source(${class} svei8mm "-march=armv8.6-a+fp16+dotprod+sve+i8mm")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ARM86SVEF32MM)
                ncnn_add_arch_opt_source(${class} svef32mm "-march=armv8.6-a+fp16+dotprod+sve+f32mm")
            endif()
        endif()
    endif()

    if(NCNN_TARGET_ARCH STREQUAL "mips")
        if(NCNN_RUNTIME_CPU AND NCNN_MSA)
            ncnn_add_arch_opt_layer(${class} msa "-mmsa")
        endif()
        if(NCNN_MMI)
            ncnn_add_arch_opt_source(${class} mmi "-mloongson-mmi")
        endif()
    endif()

    if(NCNN_TARGET_ARCH STREQUAL "loongarch")
        if(NCNN_RUNTIME_CPU AND NCNN_LASX)
            ncnn_add_arch_opt_layer(${class} lasx "-mlasx -mlsx")
        endif()
        if(NCNN_RUNTIME_CPU AND NCNN_LSX)
            ncnn_add_arch_opt_layer(${class} lsx "-mlsx")
        endif()
    endif()

    if(NCNN_TARGET_ARCH STREQUAL "riscv")
        if(CMAKE_SIZEOF_VOID_P EQUAL 8)
            if(NCNN_RUNTIME_CPU AND NCNN_RVV)
                ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv")
            endif()
            if(NCNN_ZFH)
                if(NOT NCNN_RUNTIME_CPU AND NCNN_ZVFH)
                    ncnn_add_arch_opt_source(${class} zfh "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16")
                elseif(NOT NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR)
                    ncnn_add_arch_opt_source(${class} zfh "-march=rv64gc_zfh_xtheadvector -D__riscv_zvfh=1 -D__fp16=_Float16")
                else()
                    ncnn_add_arch_opt_source(${class} zfh "-march=rv64gc_zfh -D__fp16=_Float16")
                endif()
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR)
                # linker complains the conflict of v and xtheadvector, so disable generating any riscv attributes
                ncnn_add_arch_opt_layer(${class} xtheadvector "-march=rv64gc_xtheadvector -mno-riscv-attribute -Wa,-mno-arch-attr")
                ncnn_add_arch_opt_layer_source(${class} zfh xtheadvector "-march=rv64gc_zfh_xtheadvector -mno-riscv-attribute -Wa,-mno-arch-attr -D__fp16=_Float16")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ZVFH)
                ncnn_add_arch_opt_layer_source(${class} zfh rvv "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16")
            endif()
        elseif(CMAKE_SIZEOF_VOID_P EQUAL 4)
            if(NCNN_RUNTIME_CPU AND NCNN_RVV)
                ncnn_add_arch_opt_layer(${class} rvv "-march=rv32gcv")
            endif()
            if(NCNN_ZFH)
                if(NOT NCNN_RUNTIME_CPU AND NCNN_ZVFH)
                    ncnn_add_arch_opt_source(${class} zfh "-march=rv32gcv_zfh_zvfh -D__fp16=_Float16")
                elseif(NOT NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR)
                    ncnn_add_arch_opt_source(${class} zfh "-march=rv32gc_zfh_xtheadvector -D__riscv_zvfh=1 -D__fp16=_Float16")
                else()
                    ncnn_add_arch_opt_source(${class} zfh "-march=rv32gc_zfh -D__fp16=_Float16")
                endif()
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR)
                # linker complains the conflict of v and xtheadvector, so disable generating any riscv attributes
                ncnn_add_arch_opt_layer(${class} xtheadvector "-march=rv32gc_xtheadvector -mno-riscv-attribute -Wa,-mno-arch-attr")
                ncnn_add_arch_opt_layer_source(${class} zfh xtheadvector "-march=rv32gc_zfh_xtheadvector -mno-riscv-attribute -Wa,-mno-arch-attr -D__fp16=_Float16")
            endif()
            if(NCNN_RUNTIME_CPU AND NCNN_ZVFH)
                ncnn_add_arch_opt_layer_source(${class} zfh rvv "-march=rv32gcv_zfh_zvfh -D__fp16=_Float16")
            endif()
        endif()
    endif()

    # generate layer_type_enum file
    set(layer_type_enum "${layer_type_enum}${class} = ${__LAYER_TYPE_ENUM_INDEX},\n")
    math(EXPR __LAYER_TYPE_ENUM_INDEX "${__LAYER_TYPE_ENUM_INDEX}+1")
endmacro()


================================================
FILE: cmake/ncnn_add_param.cmake
================================================

# Register a model .param file so that it is embedded into the build as a C array.
#
# NCNN_PARAM_SRC - path of the .param file.
#
# Because this is a macro, every variable below is set in the caller's scope:
#   NCNN_PARAM_SRC_NAME_WE - file name with ".param" removed and '.' / '-' mapped to '_'
#   NCNN_PARAM_HEADER      - generated header, ${CMAKE_CURRENT_BINARY_DIR}/param/<name>.hex.h
#   param_header_data      - gets an #include line for the generated header appended
#   NCNN_PARAM_HEX_FILES   - gets the generated header path appended
macro(ncnn_add_param NCNN_PARAM_SRC)
    # Get the file name with extension
    get_filename_component(NCNN_PARAM_SRC_NAME_WE ${NCNN_PARAM_SRC} NAME)
    # Manually remove ".param" since NAME_WE treats ".1.param" as a multi-extension
    string(REPLACE ".param" "" NCNN_PARAM_SRC_NAME_WE "${NCNN_PARAM_SRC_NAME_WE}")
    # Replace characters invalid in C identifiers ('.' and '-') with underscores
    string(REPLACE "." "_" NCNN_PARAM_SRC_NAME_WE "${NCNN_PARAM_SRC_NAME_WE}")
    string(REPLACE "-" "_" NCNN_PARAM_SRC_NAME_WE "${NCNN_PARAM_SRC_NAME_WE}")
    # Check if the result is empty.
    # Compare against "" explicitly: if(NOT <variable>) is also true for values
    # that CMake treats as false constants (e.g. "N", "NO", "OFF", "FALSE"),
    # which are perfectly valid file names and C identifiers.
    if(NCNN_PARAM_SRC_NAME_WE STREQUAL "")
        message(FATAL_ERROR "Failed to extract valid filename from '${NCNN_PARAM_SRC}'")
    endif()
    # Check if the extracted filename is a valid C identifier.
    # MATCHES is used directly so the verdict does not depend on the truthiness
    # of the matched text.
    if(NOT NCNN_PARAM_SRC_NAME_WE MATCHES "^[A-Za-z_][A-Za-z0-9_]*$")
        message(FATAL_ERROR "Extracted filename '${NCNN_PARAM_SRC_NAME_WE}' is not a valid C identifier")
    endif()

    set(NCNN_PARAM_HEADER ${CMAKE_CURRENT_BINARY_DIR}/param/${NCNN_PARAM_SRC_NAME_WE}.hex.h)

    # The header is produced at build time by running the generator script in -P mode
    add_custom_command(
        OUTPUT ${NCNN_PARAM_HEADER}
        COMMAND ${CMAKE_COMMAND} -DPARAM_SRC=${NCNN_PARAM_SRC} -DPARAM_SRC_NAME_WE=${NCNN_PARAM_SRC_NAME_WE} -DPARAM_HEADER=${NCNN_PARAM_HEADER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_param_header.cmake"
        DEPENDS ${NCNN_PARAM_SRC}
        COMMENT "Preprocessing param source ${NCNN_PARAM_SRC_NAME_WE}.param"
        VERBATIM
    )
    set_source_files_properties(${NCNN_PARAM_HEADER} PROPERTIES GENERATED TRUE)

    get_filename_component(NCNN_PARAM_HEADER_NAME ${NCNN_PARAM_HEADER} NAME)
    string(APPEND param_header_data "#include \"param/${NCNN_PARAM_HEADER_NAME}\"\n")

    list(APPEND NCNN_PARAM_HEX_FILES ${NCNN_PARAM_HEADER})
endmacro()


================================================
FILE: cmake/ncnn_add_shader.cmake
================================================

# Register one shader source so that it is embedded into the build as a C array.
#
# NCNN_SHADER_SRC - path of the shader source file.
#
# As a macro this updates the caller's variables: the generated header is appended to
# NCNN_SHADER_SPV_HEX_FILES, and one entry each is appended to layer_shader_spv_data,
# layer_shader_registry and layer_shader_type_enum; __LAYER_SHADER_TYPE_ENUM_INDEX is
# incremented.
macro(ncnn_add_shader NCNN_SHADER_SRC)
    # The base name is reused for the header file, the C array and the enum entry
    get_filename_component(NCNN_SHADER_SRC_NAME_WE ${NCNN_SHADER_SRC} NAME_WE)

    set(NCNN_SHADER_COMP_HEADER ${CMAKE_CURRENT_BINARY_DIR}/layer/vulkan/shader/${NCNN_SHADER_SRC_NAME_WE}.comp.hex.h)
    get_filename_component(NCNN_SHADER_COMP_HEADER_NAME ${NCNN_SHADER_COMP_HEADER} NAME)

    # The header is produced at build time by running the generator script in -P mode
    add_custom_command(
        OUTPUT ${NCNN_SHADER_COMP_HEADER}
        COMMAND ${CMAKE_COMMAND}
                -DSHADER_SRC=${NCNN_SHADER_SRC}
                -DSHADER_COMP_HEADER=${NCNN_SHADER_COMP_HEADER}
                -P "${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_generate_shader_comp_header.cmake"
        DEPENDS ${NCNN_SHADER_SRC}
        COMMENT "Preprocessing shader source ${NCNN_SHADER_SRC_NAME_WE}.comp"
        VERBATIM
    )
    set_source_files_properties(${NCNN_SHADER_COMP_HEADER} PROPERTIES GENERATED TRUE)
    list(APPEND NCNN_SHADER_SPV_HEX_FILES ${NCNN_SHADER_COMP_HEADER})

    # #include line for the generated header
    set(layer_shader_spv_data "${layer_shader_spv_data}#include \"layer/vulkan/shader/${NCNN_SHADER_COMP_HEADER_NAME}\"\n")

    # {data, size} registry entry
    set(layer_shader_registry "${layer_shader_registry}{${NCNN_SHADER_SRC_NAME_WE}_comp_data,sizeof(${NCNN_SHADER_SRC_NAME_WE}_comp_data)},\n")

    # generate layer_shader_type_enum file
    string(APPEND layer_shader_type_enum "${NCNN_SHADER_SRC_NAME_WE} = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n")
    math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1")
endmacro()


================================================
FILE: cmake/ncnn_generate_avx512_source.cmake
================================================

# Derive the avx512 variant of a layer's x86 file by text substitution.
#
# must define SRC DST CLASS
#   SRC   - file to read
#   DST   - file to write
#   CLASS - layer class name; its upper/lower-case forms are used below

set(_variant avx512)
string(TOUPPER ${_variant} _variant_upper)

string(TOLOWER ${CLASS} CLASS_LOWER)
string(TOUPPER ${CLASS} CLASS_UPPER)

file(READ ${SRC} source_data)

# include guard: LAYER_<CLASS>_X86_H -> LAYER_<CLASS>_X86_AVX512_H
set(_guard_old "LAYER_${CLASS_UPPER}_X86_H")
set(_guard_new "LAYER_${CLASS_UPPER}_X86_${_variant_upper}_H")
string(REGEX REPLACE "${_guard_old}" "${_guard_new}" source_data "${source_data}")

# class name: <CLASS>_x86 -> <CLASS>_x86_avx512
string(REGEX REPLACE "${CLASS}_x86" "${CLASS}_x86_${_variant}" source_data "${source_data}")

# header include: "<class>_x86.h" -> "<class>_x86_avx512.h"
set(_include_old "#include \"${CLASS_LOWER}_x86.h\"")
set(_include_new "#include \"${CLASS_LOWER}_x86_${_variant}.h\"")
string(REGEX REPLACE "${_include_old}" "${_include_new}" source_data "${source_data}")

file(WRITE ${DST} "${source_data}")


================================================
FILE: cmake/ncnn_generate_avx_source.cmake
================================================

# Derive the avx variant of a layer's x86 file by text substitution.
#
# must define SRC DST CLASS
#   SRC   - file to read
#   DST   - file to write
#   CLASS - layer class name; its upper/lower-case forms are used below

set(_variant avx)
string(TOUPPER ${_variant} _variant_upper)

string(TOLOWER ${CLASS} CLASS_LOWER)
string(TOUPPER ${CLASS} CLASS_UPPER)

file(READ ${SRC} source_data)

# include guard: LAYER_<CLASS>_X86_H -> LAYER_<CLASS>_X86_AVX_H
set(_guard_old "LAYER_${CLASS_UPPER}_X86_H")
set(_guard_new "LAYER_${CLASS_UPPER}_X86_${_variant_upper}_H")
string(REGEX REPLACE "${_guard_old}" "${_guard_new}" source_data "${source_data}")

# class name: <CLASS>_x86 -> <CLASS>_x86_avx
string(REGEX REPLACE "${CLASS}_x86" "${CLASS}_x86_${_variant}" source_data "${source_data}")

# header include: "<class>_x86.h" -> "<class>_x86_avx.h"
set(_include_old "#include \"${CLASS_LOWER}_x86.h\"")
set(_include_new "#include \"${CLASS_LOWER}_x86_${_variant}.h\"")
string(REGEX REPLACE "${_include_old}" "${_include_new}" source_data "${source_data}")

file(WRITE ${DST} "${source_data}")


================================================
FILE: cmake/ncnn_generate_fma_source.cmake
================================================

# Derive the fma variant of a layer's x86 file by text substitution.
#
# must define SRC DST CLASS
#   SRC   - file to read
#   DST   - file to write
#   CLASS - layer class name; its upper/lower-case forms are used below

set(_variant fma)
string(TOUPPER ${_variant} _variant_upper)

string(TOLOWER ${CLASS} CLASS_LOWER)
string(TOUPPER ${CLASS} CLASS_UPPER)

file(READ ${SRC} source_data)

# include guard: LAYER_<CLASS>_X86_H -> LAYER_<CLASS>_X86_FMA_H
set(_guard_old "LAYER_${CLASS_UPPER}_X86_H")
set(_guard_new "LAYER_${CLASS_UPPER}_X86_${_variant_upper}_H")
string(REGEX REPLACE "${_guard_old}" "${_guard_new}" source_data "${source_data}")

# class name: <CLASS>_x86 -> <CLASS>_x86_fma
string(REGEX REPLACE "${CLASS}_x86" "${CLASS}_x86_${_variant}" source_data "${source_data}")

# header include: "<class>_x86.h" -> "<class>_x86_fma.h"
set(_include_old "#include \"${CLASS_LOWER}_x86.h\"")
set(_include_new "#include \"${CLASS_LOWER}_x86_${_variant}.h\"")
string(REGEX REPLACE "${_include_old}" "${_include_new}" source_data "${source_data}")

file(WRITE ${DST} "${source_data}")


================================================
FILE: cmake/ncnn_generate_lasx_source.cmake
================================================

# Derive the lasx variant of a layer's loongarch file by text substitution.
#
# must define SRC DST CLASS
#   SRC   - file to read
#   DST   - file to write
#   CLASS - layer class name; its upper/lower-case forms are used below

set(_variant lasx)
string(TOUPPER ${_variant} _variant_upper)

string(TOLOWER ${CLASS} CLASS_LOWER)
string(TOUPPER ${CLASS} CLASS_UPPER)

file(READ ${SRC} source_data)

# include guard: LAYER_<CLASS>_LOONGARCH_H -> LAYER_<CLASS>_LOONGARCH_LASX_H
set(_guard_old "LAYER_${CLASS_UPPER}_LOONGARCH_H")
set(_guard_new "LAYER_${CLASS_UPPER}_LOONGARCH_${_variant_upper}_H")
string(REGEX REPLACE "${_guard_old}" "${_guard_new}" source_data "${source_data}")

# class name: <CLASS>_loongarch -> <CLASS>_loongarch_lasx
string(REGEX REPLACE "${CLASS}_loongarch" "${CLASS}_loongarch_${_variant}" source_data "${source_data}")

# header include: "<class>_loongarch.h" -> "<class>_loongarch_lasx.h"
set(_include_old "#include \"${CLASS_LOWER}_loongarch.h\"")
set(_include_new "#include \"${CLASS_LOWER}_loongarch_${_variant}.h\"")
string(REGEX REPLACE "${_include_old}" "${_include_new}" source_data "${source_data}")

file(WRITE ${DST} "${source_data}")


================================================
FILE: cmake/ncnn_generate_lsx_source.cmake
================================================

# Derive the lsx variant of a layer's loongarch file by text substitution.
#
# must define SRC DST CLASS
#   SRC   - file to read
#   DST   - file to write
#   CLASS - layer class name; its upper/lower-case forms are used below

set(_variant lsx)
string(TOUPPER ${_variant} _variant_upper)

string(TOLOWER ${CLASS} CLASS_LOWER)
string(TOUPPER ${CLASS} CLASS_UPPER)

file(READ ${SRC} source_data)

# include guard: LAYER_<CLASS>_LOONGARCH_H -> LAYER_<CLASS>_LOONGARCH_LSX_H
set(_guard_old "LAYER_${CLASS_UPPER}_LOONGARCH_H")
set(_guard_new "LAYER_${CLASS_UPPER}_LOONGARCH_${_variant_upper}_H")
string(REGEX REPLACE "${_guard_old}" "${_guard_new}" source_data "${source_data}")

# class name: <CLASS>_loongarch -> <CLASS>_loongarch_lsx
string(REGEX REPLACE "${CLASS}_loongarch" "${CLASS}_loongarch_${_variant}" source_data "${source_data}")

# header include: "<class>_loongarch.h" -> "<class>_loongarch_lsx.h"
set(_include_old "#include \"${CLASS_LOWER}_loongarch.h\"")
set(_include_new "#include \"${CLASS_LOWER}_loongarch_${_variant}.h\"")
string(REGEX REPLACE "${_include_old}" "${_include_new}" source_data "${source_data}")

file(WRITE ${DST} "${source_data}")


================================================
FILE: cmake/ncnn_generate_msa_source.cmake
================================================

# Derive the msa variant of a layer's mips file by text substitution.
#
# must define SRC DST CLASS
#   SRC   - file to read
#   DST   - file to write
#   CLASS - layer class name; its upper/lower-case forms are used below

set(_variant msa)
string(TOUPPER ${_variant} _variant_upper)

string(TOLOWER ${CLASS} CLASS_LOWER)
string(TOUPPER ${CLASS} CLASS_UPPER)

file(READ ${SRC} source_data)

# include guard: LAYER_<CLASS>_MIPS_H -> LAYER_<CLASS>_MIPS_MSA_H
set(_guard_old "LAYER_${CLASS_UPPER}_MIPS_H")
set(_guard_new "LAYER_${CLASS_UPPER}_MIPS_${_variant_upper}_H")
string(REGEX REPLACE "${_guard_old}" "${_guard_new}" source_data "${source_data}")

# class name: <CLASS>_mips -> <CLASS>_mips_msa
string(REGEX REPLACE "${CLASS}_mips" "${CLASS}_mips_${_variant}" source_data "${source_data}")

# header include: "<class>_mips.h" -> "<class>_mips_msa.h"
set(_include_old "#include \"${CLASS_LOWER}_mips.h\"")
set(_include_new "#include \"${CLASS_LOWER}_mips_${_variant}.h\"")
string(REGEX REPLACE "${_include_old}" "${_include_new}" source_data "${source_data}")

file(WRITE ${DST} "${source_data}")


================================================
FILE: cmake/ncnn_generate_param_header.cmake
================================================

# Convert a text .param file into a C header that holds its bytes as a char array.
#
# must define PARAM_HEADER PARAM_SRC PARAM_SRC_NAME_WE
#   PARAM_SRC         - param file to read
#   PARAM_SRC_NAME_WE - name used for the scratch file and for the generated array
#   PARAM_HEADER      - header file to write

file(READ ${PARAM_SRC} param_data)

# An empty input cannot be turned into a meaningful array; report it explicitly
# instead of failing later with an argument-count error from string().
if(param_data STREQUAL "")
    message(FATAL_ERROR "Param source '${PARAM_SRC}' is empty")
endif()

# remove whitespace
# The input must be quoted: an unquoted expansion is split on ';' and
# string(REGEX REPLACE) concatenates the pieces, silently dropping every semicolon.
string(REGEX REPLACE "\n +" "\n" param_data "${param_data}")

# replace more spaces to one space
string(REGEX REPLACE "[ \t]+" " " param_data "${param_data}")

# remove empty line
string(REGEX REPLACE "\n[\n]+" "\n" param_data "${param_data}")

# text to hex (round-trip through a scratch file so file(READ ... HEX) can be used)
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/param/${PARAM_SRC_NAME_WE}.text2hex.txt "${param_data}")
file(READ ${CMAKE_CURRENT_BINARY_DIR}/param/${PARAM_SRC_NAME_WE}.text2hex.txt param_data_hex HEX)
# every byte "hh" becomes "0xhh,"
string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," param_data_hex "${param_data_hex}")
# drop the trailing comma
string(FIND "${param_data_hex}" "," tail_comma REVERSE)
string(SUBSTRING "${param_data_hex}" 0 ${tail_comma} param_data_hex)

# generate model param header file (a terminating 0x00 byte is appended)
file(WRITE ${PARAM_HEADER} "static const char ${PARAM_SRC_NAME_WE}_param_data[] = {${param_data_hex},0x00};\n")


================================================
FILE: cmake/ncnn_generate_rvv_source.cmake
================================================

# Derive the rvv variant of a layer's riscv file by text substitution.
#
# must define SRC DST CLASS
#   SRC   - file to read
#   DST   - file to write
#   CLASS - layer class name; its upper/lower-case forms are used below

set(_variant rvv)
string(TOUPPER ${_variant} _variant_upper)

string(TOLOWER ${CLASS} CLASS_LOWER)
string(TOUPPER ${CLASS} CLASS_UPPER)

file(READ ${SRC} source_data)

# include guard: LAYER_<CLASS>_RISCV_H -> LAYER_<CLASS>_RISCV_RVV_H
set(_guard_old "LAYER_${CLASS_UPPER}_RISCV_H")
set(_guard_new "LAYER_${CLASS_UPPER}_RISCV_${_variant_upper}_H")
string(REGEX REPLACE "${_guard_old}" "${_guard_new}" source_data "${source_data}")

# class name: <CLASS>_riscv -> <CLASS>_riscv_rvv
string(REGEX REPLACE "${CLASS}_riscv" "${CLASS}_riscv_${_variant}" source_data "${source_data}")

# header include: "<class>_riscv.h" -> "<class>_riscv_rvv.h"
set(_include_old "#include \"${CLASS_LOWER}_riscv.h\"")
set(_include_new "#include \"${CLASS_LOWER}_riscv_${_variant}.h\"")
string(REGEX REPLACE "${_include_old}" "${_include_new}" source_data "${source_data}")

file(WRITE ${DST} "${source_data}")


================================================
FILE: cmake/ncnn_generate_shader_comp_header.cmake
================================================

# Minify a shader source and write it out as a C header holding its bytes as a char array.
#
# must define SHADER_COMP_HEADER SHADER_SRC
#   SHADER_SRC         - shader source file to read
#   SHADER_COMP_HEADER - header file to write

file(READ ${SHADER_SRC} comp_data)

# skip leading comment (everything before the first "#version", when present)
string(FIND "${comp_data}" "#version" version_start)
if(NOT ${version_start} EQUAL -1)
    string(SUBSTRING "${comp_data}" ${version_start} -1 comp_data)
endif()

# remove whitespace
string(REGEX REPLACE "\n +" "\n" comp_data "${comp_data}")

# remove comments
string(REGEX REPLACE "//[^\n]*" "" comp_data "${comp_data}")

# replace more spaces to one space
string(REGEX REPLACE "[ \t]+" " " comp_data "${comp_data}")

# remove empty line
string(REGEX REPLACE "\n[\n]+" "\n" comp_data "${comp_data}")

get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE)

# text to hex (round-trip through a scratch file so file(READ ... HEX) can be used)
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/layer/vulkan/shader/${SHADER_SRC_NAME_WE}.text2hex.txt "${comp_data}")
file(READ ${CMAKE_CURRENT_BINARY_DIR}/layer/vulkan/shader/${SHADER_SRC_NAME_WE}.text2hex.txt comp_data_hex HEX)

# A shader that is empty after minification has no bytes to embed. Fail with a clear
# message; previously the unquoted expansion below made string() abort with an
# argument-count error that did not mention the shader.
if(comp_data_hex STREQUAL "")
    message(FATAL_ERROR "Shader source '${SHADER_SRC}' is empty after preprocessing")
endif()

# every byte "hh" becomes "0xhh,"
string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," comp_data_hex "${comp_data_hex}")
# drop the trailing comma
string(FIND "${comp_data_hex}" "," tail_comma REVERSE)
string(SUBSTRING "${comp_data_hex}" 0 ${tail_comma} comp_data_hex)

file(WRITE ${SHADER_COMP_HEADER} "static const char ${SHADER_SRC_NAME_WE}_comp_data[] = {${comp_data_hex}};\n")


================================================
FILE: cmake/ncnn_generate_xtheadvector_source.cmake
================================================

# Derive the xtheadvector variant of a layer's riscv file by text substitution.
#
# must define SRC DST CLASS
#   SRC   - file to read
#   DST   - file to write
#   CLASS - layer class name; its upper/lower-case forms are used below

set(_variant xtheadvector)
string(TOUPPER ${_variant} _variant_upper)

string(TOLOWER ${CLASS} CLASS_LOWER)
string(TOUPPER ${CLASS} CLASS_UPPER)

file(READ ${SRC} source_data)

# include guard: LAYER_<CLASS>_RISCV_H -> LAYER_<CLASS>_RISCV_XTHEADVECTOR_H
set(_guard_old "LAYER_${CLASS_UPPER}_RISCV_H")
set(_guard_new "LAYER_${CLASS_UPPER}_RISCV_${_variant_upper}_H")
string(REGEX REPLACE "${_guard_old}" "${_guard_new}" source_data "${source_data}")

# class name: <CLASS>_riscv -> <CLASS>_riscv_xtheadvector
string(REGEX REPLACE "${CLASS}_riscv" "${CLASS}_riscv_${_variant}" source_data "${source_data}")

# header include: "<class>_riscv.h" -> "<class>_riscv_xtheadvector.h"
set(_include_old "#include \"${CLASS_LOWER}_riscv.h\"")
set(_include_new "#include \"${CLASS_LOWER}_riscv_${_variant}.h\"")
string(REGEX REPLACE "${_include_old}" "${_include_new}" source_data "${source_data}")

file(WRITE ${DST} "${source_data}")


================================================
FILE: cmake/run_test.cmake
================================================

# Run one test executable and abort the script unless it reports result 0.
# The command line is assembled from the TESTS_EXECUTABLE_LOADER,
# TESTS_EXECUTABLE_LOADER_ARGUMENTS and TESTS_ARGUMENTS environment variables
# around the TEST_EXECUTABLE variable.
execute_process(
    COMMAND
        $ENV{TESTS_EXECUTABLE_LOADER}
        $ENV{TESTS_EXECUTABLE_LOADER_ARGUMENTS}
        ${TEST_EXECUTABLE}
        $ENV{TESTS_ARGUMENTS}
    RESULT_VARIABLE result
)

# result is compared as a string, so a non-numeric failure description also counts as failure
if(NOT "${result}" STREQUAL "0")
    message(FATAL_ERROR "Test failed with return value '${result}'")
endif()


================================================
FILE: codeformat.sh
================================================
#!/usr/bin/env bash

# we run clang-format and astyle twice to get stable format output

# Format the project's C/C++ sources in place: clang-format first, then astyle.
# Paths matching python/pybind11, stb_image and ruapu are left untouched.
# Returns the exit status of the last astyle invocation.
format_code() {
    # The -name alternatives are grouped with \( ... \) so that -type f applies to all
    # of them; ungrouped, "-type f -name A -o -name B" restricts only the first pattern.
    # xargs -I {} replaces the deprecated GNU-only "-i" spelling.
    find src/ tools/ tests/ examples/ benchmark/ python/ -type f \( -name '*.c' -o -name '*.cpp' -o -name '*.cc' -o -name '*.h' \) | grep -v python/pybind11 | grep -v stb_image | grep -v ruapu | xargs -I {} clang-format -i {}
    astyle -n -r "benchmark/*.h,*.cpp,*.cc" "tests/*.h,*.cpp,*.cc" "tools/*.h,*.cpp,*.cc" "examples/*.h,*.cpp,*.cc"
    astyle -n -r "src/*.h,*.cpp,*.cc" --exclude=src/stb_image.h --exclude=src/stb_image_write.h --exclude=src/ruapu.h
    astyle -n -r "python/*.h,*.cpp,*.cc" --exclude=python/pybind11
}

# Two passes: the formatters are run twice to get stable format output.
for _pass in first second; do
    if ! format_code; then
        echo 'Formatting failed'
        exit 1
    fi
done


================================================
FILE: docs/Home.md
================================================
### input data and extract output
```cpp
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include "net.h"

int main()
{
    // Read the input as a single-channel image.
    // cv::IMREAD_GRAYSCALE is used instead of the legacy CV_LOAD_IMAGE_GRAYSCALE constant,
    // which the default OpenCV 4 headers no longer provide.
    cv::Mat img = cv::imread("image.ppm", cv::IMREAD_GRAYSCALE);
    if (img.empty())
    {
        // imread returns an empty Mat when the file is missing or cannot be decoded;
        // img.data must not be handed to ncnn in that case
        return -1;
    }

    int w = img.cols;
    int h = img.rows;

    // convert to ncnn::Mat, resized to 60x60
    ncnn::Mat in = ncnn::Mat::from_pixels_resize(img.data, ncnn::Mat::PIXEL_GRAY, w, h, 60, 60);

    // subtract 128, norm to -1 ~ 1
    float mean[1] = { 128.f };
    float norm[1] = { 1/128.f };
    in.substract_mean_normalize(mean, norm);

    ncnn::Net net;
    net.load_param("model.param");
    net.load_model("model.bin");

    ncnn::Extractor ex = net.create_extractor();

    // feed the blob named "data"
    ex.input("data", in);

    // fetch the blob named "output"
    ncnn::Mat feat;
    ex.extract("output", feat);

    return 0;
}

```

### print Mat content
```cpp
// Print the content of a Mat as text: one line per row, a blank line after each
// depth slice and a dashed separator after each channel.
// Elements are read as float.
void pretty_print(const ncnn::Mat& m)
{
    for (int q = 0; q < m.c; q++)
    {
        // row cursor, advanced by m.w after every printed row
        const float* row = m.channel(q);

        for (int z = 0; z < m.d; z++)
        {
            for (int y = 0; y < m.h; y++, row += m.w)
            {
                for (int x = 0; x < m.w; x++)
                    printf("%f ", row[x]);

                printf("\n");
            }

            printf("\n");
        }

        printf("------------------------\n");
    }
}
```

### print VkMat content
```cpp
// Print the content of a VkMat: download it into a Mat through cmd, then print that Mat.
// cmd is submitted, waited on and reset as part of the call.
void pretty_print(const ncnn::VkMat& m, ncnn::VkCompute& cmd, const ncnn::Option& opt)
{
    // same options as the caller's, except that packing layout is turned off for the download
    ncnn::Option download_opt(opt);
    download_opt.use_packing_layout = false;

    ncnn::Mat host_copy;
    cmd.record_download(m, host_copy, download_opt);
    cmd.submit_and_wait();
    cmd.reset();

    // print Mat content
    pretty_print(host_copy);
}
```

### visualize Mat content
```cpp
// Show all channels of m as one tiled image in the window named title.
// Each channel is min-max normalized to 0..255 on its own, NaN values are
// painted red, and the tiled result is enlarged 2x with nearest-neighbor
// interpolation. The caller is still responsible for pumping the GUI event
// loop (e.g. cv::waitKey) after this call.
void visualize(const char* title, const ncnn::Mat& m)
{
    std::vector<cv::Mat> normed_feats(m.c);

    for (int i=0; i<m.c; i++)
    {
        // wrap the channel memory in a cv::Mat header, no copy is made
        cv::Mat tmp(m.h, m.w, CV_32FC1, (void*)(const float*)m.channel(i));

        cv::normalize(tmp, normed_feats[i], 0, 255, cv::NORM_MINMAX, CV_8U);

        cv::cvtColor(normed_feats[i], normed_feats[i], cv::COLOR_GRAY2BGR);

        // check NaN
        for (int y=0; y<m.h; y++)
        {
            const float* tp = tmp.ptr<float>(y);
            uchar* sp = normed_feats[i].ptr<uchar>(y);
            for (int x=0; x<m.w; x++)
            {
                float v = tp[x];
                // NaN is the only value that compares unequal to itself
                if (v != v)
                {
                    // BGR red
                    sp[0] = 0;
                    sp[1] = 0;
                    sp[2] = 255;
                }

                sp += 3;
            }
        }
    }

    // tiles per row: the narrower the feature map, the more tiles fit in a row
    int tw = m.w < 10 ? 32 : m.w < 20 ? 16 : m.w < 40 ? 8 : m.w < 80 ? 4 : m.w < 160 ? 2 : 1;
    // number of tile rows, rounded up
    int th = (m.c - 1) / tw + 1;

    cv::Mat show_map(m.h * th, m.w * tw, CV_8UC3);
    // gray background for unused tiles; cv::Scalar(127) would only set the
    // first (blue) channel and leave the other two at zero
    show_map = cv::Scalar::all(127);

    // tile
    for (int i=0; i<m.c; i++)
    {
        int ty = i / tw;
        int tx = i % tw;

        normed_feats[i].copyTo(show_map(cv::Rect(tx * m.w, ty * m.h, m.w, m.h)));
    }

    cv::resize(show_map, show_map, cv::Size(0,0), 2, 2, cv::INTER_NEAREST);
    cv::imshow(title, show_map);
}
```

### FAQ
Q ncnn的起源

A 深度学习算法要在手机上落地，caffe依赖太多，手机上也没有cuda，需要个又快又小的前向网络实现


Q ncnn名字的来历

A cnn就是卷积神经网络的缩写，开头的n算是一语n关。比如new/next(全新的实现)，naive(ncnn是naive实现)，neon(ncnn最初为手机优化)，up主名字(←_←)


Q 支持哪些平台

A 跨平台，支持 android / ios / linux / windows / macos，也支持裸机跑


Q 计算精度如何

A armv7 neon float 不遵照 ieee754 标准，有些采用快速实现(如exp sin等)，速度快但确保精度足够高


Q logo

A up主是mc玩家，所以灵魂手绘像素猫，还可以找到ncnn...


================================================
FILE: docs/application-with-ncnn-inside.md
================================================
![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.azarlive.android.png) Azar-视频交友与聊天 June 20, 2018

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.cyberlink.youcammakeup.png) 玩美彩妆 - 自拍美颜 & 智能美妆相机 June 21, 2018

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.fotoable.makeup.png) You Makeup Photo Camera 2.1.5

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.fotoable.cartoon.cam.png) 滤镜相机 Cartoon Camera- Paintlab January 24, 2018

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.pipcamera.activity.png) 画中画相机 January 30, 2018

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.hefe.pro.editor.png) Photo Editor Pro 1.1.4.1029

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.apus.camera.id.png) Air Camera 1.7.3.1002

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.fotoable.fotobeauty.png) 美丽拍－懂你的自拍美颜相机 February 1, 2018

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.perfectcorp.ycf.png) 玩美Fun-特效动图自拍滤镜&分享相片！ May 15, 2018

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.ufotosoft.justshot.png) Sweet Snap - 生活贴纸&图像编辑器,实时滤镜,录制视频和有趣表情包,美容效果 June 22, 2018

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.wantu.activity.png) 玩图 - 美图相机 March 29, 2018

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.meitu.meiyancamera.png) 美颜相机 7.6.95

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.lyrebirdstudio.colorizer.lite.png) 自拍相机 - 照片编辑器和过滤器和贴纸 April 27, 2018

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.apusapps.fulakora.png) APUS Camera 1.7.2.1001

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/video.like.png) LIKE短视频 — 魔法视频自拍神器 2.2.4

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.qiyi.video.png) 爱奇艺 9.6.0

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.eg.android.AlipayGphone.png) 支付宝 10.1.25.752

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.perfectcorp.beautycircle.png) YouCam Shop - World's First AR Makeup Shopping App 3.4.0

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.lyrebirdstudio.beauty.png) 美容化妆自拍相机和自拍照片编辑器 1.4.8

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.jingdong.app.mall.png) 京东-挑好物，上京东 7.0.8

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.versa.png) Versa 2.9.2

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.tencent.weishi.png) 微视 4.3.1.88

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.smile.gifmaker.png) 快手短视频—国民短视频平台 5.4.2.5360

![](https://github.com/nihui/ncnn-assets/raw/master/20180626/com.sdu.didi.psnger.png) 滴滴出行 5.3.0


================================================
FILE: docs/benchmark/the-benchmark-of-caffe-android-lib,-mini-caffe,-and-ncnn.md
================================================
caffe-android-lib https://github.com/sh1r0/caffe-android-lib

mini-caffe https://github.com/luoyetx/mini-caffe

openblas-0.2.20 https://github.com/xianyi/OpenBLAS

ncnn https://github.com/Tencent/ncnn

***

squeezenet_v1.1 https://github.com/DeepScale/SqueezeNet/tree/master/SqueezeNet_v1.1

mobilenet_v1 https://github.com/shicai/MobileNet-Caffe

vgg16 https://gist.github.com/ksimonyan/211839e770f7b538e2d8

***

Host platform and compiler configuration: 

fedora 27, android-ndk-r15c, target arch = arm64-v8a

we manually update openblas package to version 0.2.20 in caffe-android-lib for better performance


***

Device: Nexus 6p

OS: LineageOS 15.1(Android 8.1.0), ROM newly flashed without any third-party APP installed

CPU: Snapdragon 810 (Cortex-A57 2.0GHz x 4 + Cortex-A53 1.55GHz x 4)

RAM: 3G


***

Benchmark method: 

Run squeezenet, mobilenet inference 23 times in a loop, discard the first three warmup records, and then calculate the average inference time

Run vgg16 9 times in a loop, discard the first warmup record, and then calculate the average inference time

Since the system may force the SoC to lower its frequency when the temperature goes high, sleep for over 1 minute before each benchmark to prevent this issue.

fps performance: fps = 1000 / avgtime(ms)

cpu usage: take the CPU value in top utility output

memory usage: take the RES value in top utility output

the overall power consumption and performance per watt: 

Disable usb charging: adb shell echo 0 > /sys/class/power_supply/battery/charging_enabled

current(μA) = adb shell cat /sys/class/power_supply/battery/current_now (multiply -1 for 810 chip)

voltage(μV) = adb shell cat /sys/class/power_supply/battery/voltage_now

power consumption(mW) = current / 1000 * voltage / 1000 / 1000

performance per watt(1000fps/W) = fps / power consumption * 1000


***

The binary size after debug stripping

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/1.jpg)

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/2.jpg)

***

squeezenet

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/3.jpg)

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/4.jpg)

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/5.jpg)

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/6.jpg)

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/7.jpg)

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/8.jpg)
***

mobilenet

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/9.jpg)

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/10.jpg)

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/11.jpg)

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/12.jpg)

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/13.jpg)

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/14.jpg)
***

vgg16

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/15.jpg)

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/16.jpg)

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/17.jpg)

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/18.jpg)

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/19.jpg)

![](https://github.com/nihui/ncnn-assets/raw/master/20180413/20.jpg)


================================================
FILE: docs/benchmark/vulkan-conformance-test.md
================================================

|device|gpu|api version|driver version|squeezenet|mobilenetssd|yolov3|
|---|---|---|---|---|---|---|
|intel-i7-7700|Intel(R) HD Graphics 630 (Kaby Lake GT2)|1.1.90|18.3.4|y|y|y|
|GTX-1060|GeForce GTX 1060 3GB|1.1.95|418.172.0|y|y|y|
|AMD-Radeon R9 M290X|AMD RADV PITCAIRN (LLVM 7.0.1)|1.1.70|18.3.4|y|y|y|
|iphone-5s|Apple A7 GPU|1.0.82|0.2.1825|y|y|y|
|huawei-nexus6p|Adreno (TM) 430|1.0.49|35.601.2388|y|y|y|
|vivo-y1731ca|Adreno (TM) 505|1.0.61|37.845.1429|y|n|n|
|vivo-y85a|Adreno (TM) 506|1.0.61|2.944.3349|y|n|n|
|vivo-x9s|Adreno (TM) 510|1.0.61|42.917.1172|y|y|y|
|meizu-15|Adreno (TM) 512|1.0.38|29.189.223|n|n|n|
|chuizi-jianguo-pro2|Adreno (TM) 512|1.0.38|21.219.2615|n|n|n|
|xiaomi-note3|Adreno (TM) 512|1.0.38|39.369.2305|n|n|n|
|oppo-r11|Adreno (TM) 512|1.0.38|42.977.756|n|n|n|
|xiaomi-6x|Adreno (TM) 512|1.0.61|14.322.3739|y|y|y|
|oppo-r11s+|Adreno (TM) 512|1.0.61|35.1004.3936|y|y|y|
|vivo-x20a|Adreno (TM) 512|1.0.61|43.10.3141|y|y|y|
|vivo-v1816a|Adreno (TM) 512|1.0.61|43.10.3141|y|y|y|
|vivo-z1|Adreno (TM) 512|1.0.61|43.10.3141|y|y|y|
|xiaomi-redmi-note5|Adreno (TM) 512|1.0.61|63.219.2354|y|y|y|
|google-pixel|Adreno (TM) 530|1.1.87|512.354.0|y|y|y|
|nubia-z17|Adreno (TM) 540|1.0.38|1.28.32|n|n|n|
|samsung-galaxys8+|Adreno (TM) 540|1.0.61|29.896.3583|y|y|y|
|oneplus-5t|Adreno (TM) 540|1.0.61|18.1023.2233|y|y|y|
|google-pixel2|Adreno (TM) 540|1.1.66|512.313.0|y|y|y|
|essential-ph-1|Adreno (TM) 540|1.1.66|512.319.0|y|y|y|
|vivo-x23|Adreno (TM) 615|1.0.66|33.870.3328|y|y|y|
|vivo-v1813ba|Adreno (TM) 615|1.0.66|33.870.3328|y|y|y|
|xiaomi-8se|Adreno (TM) 616|1.0.66|30.913.18|y|y|y|
|vivo-nex-a|Adreno (TM) 616|1.0.66|33.870.3328|y|y|y|
|xiaomi-mix2s|Adreno (TM) 630|1.0.61|4.91.2976|y|y|y|
|heisha-SKR-A0|Adreno (TM) 630|1.0.61|36.173.3586|y|y|y|
|heisha-SKR-A0|Adreno (TM) 630|1.0.66|47.448.1532|y|y|y|
|oneplus-6|Adreno (TM) 630|1.1.66|512.324.0|y|y|y|
|vivo-iQOO|Adreno (TM) 640|1.1.87|512.361.0|y|y|y|
|meitu-m8s|Mali-T880|1.0.14|500.910.1017|n|n|n|
|huawei-p10|Mali-G71|1.0.53|151.949.2145|n|n|n|
|huawei-mate9|Mali-G71|1.0.53|151.949.2145|n|n|n|
|oppo-a73|Mali-G71|1.0.47|575.795.1934|n|n|n|
|vivo-y97|Mali-G72|1.0.58|240.537.3580|n|n|n|
|huawei-mate10|Mali-G72|1.0.66|14.0.0|y|y|y|
|huawei-v10|Mali-G72|1.0.66|14.0.0|y|y|y|
|huawei-vce-al00|Mali-G72|1.0.66|14.0.0|y|y|y|
|huawei-mate20|Mali-G76|1.0.66|14.0.0|y|y|y|
|huawei-pct-al10|Mali-G76|1.0.66|14.0.0|y|y|y|

================================================
FILE: docs/developer-guide/aarch64-mix-assembly-and-intrinsic.md
================================================
```c
// use %.4s for every operand to address the full 128-bit v register
// a += b * c
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
    "fmla  %0.4s, %2.4s, %3.4s"
    : "=w"(_a) // %0
    : "0"(_a), // %1, tied to %0 so the accumulator _a is both read and written
      "w"(_b), // %2
      "w"(_c)  // %3
    :
);
```
```c
// use %.2s to address only the low 64 bits of a v register
// a += b * c
float32x2_t _a = vld1_f32(a);
float32x2_t _b = vld1_f32(b);
float32x2_t _c = vld1_f32(c);
asm volatile(
    "fmla  %0.2s, %2.2s, %3.2s"
    : "=w"(_a) // %0
    : "0"(_a), // %1, tied to %0 so the accumulator _a is both read and written
      "w"(_b), // %2
      "w"(_c)  // %3
    :
);
```
```c
// use %.s[0] %.s[1] %.s[2] %.s[3] to address a single 32-bit lane of a v register
// a += b * c[0]
// a += b * c[1]
// a += b * c[2]
// a += b * c[3]
// all three operands are 128-bit, so they must be loaded with the q variant vld1q_f32
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
    // adjacent string literals are concatenated by the compiler, so every
    // instruction needs its own trailing newline
    "fmla  %0.4s, %2.4s, %3.s[0] \n"
    "fmla  %0.4s, %2.4s, %3.s[1] \n"
    "fmla  %0.4s, %2.4s, %3.s[2] \n"
    "fmla  %0.4s, %2.4s, %3.s[3] \n"
    : "=w"(_a) // %0
    : "0"(_a), // %1, tied to %0 so the accumulator _a is both read and written
      "w"(_b), // %2
      "w"(_c)  // %3
    :
);
```


qwq


================================================
FILE: docs/developer-guide/add-custom-layer.zh.md
================================================
# NCNN增加自定义层

## 举例

这里举个例子添加自定义层次 如Relu6，即 std::min(6.f, std::max(0.f, val))

```
Input            input   0 1 input
Convolution      conv2d  1 1 input conv2d 0=32 1=1 2=1 3=1 4=0 5=0 6=768
Relu6            relu6   1 1 conv2d relu6
Pooling          maxpool 1 1 relu6 maxpool 0=0 1=3 2=2 3=-233 4=0
```


## 定义源码h文件：src/layer/relu6.h

```CPP
#ifndef LAYER_RELU6_H
#define LAYER_RELU6_H

#include "layer.h"

namespace ncnn {

// Custom activation layer computing min(6, max(0, x)) for every element.
class Relu6 : public Layer
{
public:
    Relu6();

    // Applies the activation to bottom_top_blob in place.
    // The implementation in relu6.cpp always returns 0.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_RELU6_H
```


## 定义源码CPP文件：src/layer/relu6.cpp

```CPP
#include "relu6.h"

#include <math.h>

#include <algorithm> // std::min, std::max

namespace ncnn {

Relu6::Relu6()
{
    // the layer works on one blob and overwrites it in place
    one_blob_only = true;
    support_inplace = true;
}

// Clamp every element of bottom_top_blob to the range [0, 6].
// Channels are processed in parallel with opt.num_threads threads.
// Always returns 0.
int Relu6::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    // elements per channel; d is included so that 4-dim blobs are fully covered
    int size = w * h * d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i=0; i<size; i++)
        {
            ptr[i] = std::min(6.f, std::max(0.f, ptr[i]));
        }
    }

    return 0;
}

} // namespace ncnn

```


## 修改 src/CMakeLists.txt 注册Relu6

```cmake
ncnn_add_layer(GroupNorm)
ncnn_add_layer(LayerNorm)
ncnn_add_layer(Relu6)
```


## 定义测试用例CPP文件 tests/test_relu6.cpp 

```CPP
#include "layer/relu6.h"
#include "testutil.h"

// Run the Relu6 layer test on input a.
// Returns the result of test_layer and logs the input shape to stderr when it is non-zero.
static int test_relu6(const ncnn::Mat& a)
{
    // Relu6 has no parameters and no weights
    ncnn::ParamDict pd;

    std::vector<ncnn::Mat> weights(0);

    // NOTE(review): newer testutil.h revisions may expose test_layer as a
    // non-template function taking only the layer name - confirm against the
    // testutil.h of the checkout in use
    int ret = test_layer<ncnn::Relu6>("Relu6", pd, weights, a);
    if (ret != 0)
    {
        fprintf(stderr, "test_relu6 failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c);
    }

    return ret;
}

// 3-dim inputs; stops at the first failing shape and returns 1, otherwise 0
static int test_relu6_0()
{
    if (test_relu6(RandomMat(5, 7, 24)) != 0)
        return 1;
    if (test_relu6(RandomMat(7, 9, 12)) != 0)
        return 1;
    if (test_relu6(RandomMat(3, 5, 13)) != 0)
        return 1;

    return 0;
}

// 2-dim inputs; stops at the first failing shape and returns 1, otherwise 0
static int test_relu6_1()
{
    if (test_relu6(RandomMat(15, 24)) != 0)
        return 1;
    if (test_relu6(RandomMat(17, 12)) != 0)
        return 1;
    if (test_relu6(RandomMat(19, 15)) != 0)
        return 1;

    return 0;
}

// 1-dim inputs; stops at the first failing shape and returns 1, otherwise 0
static int test_relu6_2()
{
    if (test_relu6(RandomMat(128)) != 0)
        return 1;
    if (test_relu6(RandomMat(124)) != 0)
        return 1;
    if (test_relu6(RandomMat(127)) != 0)
        return 1;

    return 0;
}

// Test entry point: exit code 0 when every group passes, 1 at the first failure
int main()
{
    // seed the test random generator with a fixed value
    SRAND(7767517);

    if (test_relu6_0() != 0)
        return 1;
    if (test_relu6_1() != 0)
        return 1;
    if (test_relu6_2() != 0)
        return 1;

    return 0;
}

```


## 修改tests/CMakeLists.txt 注册Relu6测试用例

```cmake
ncnn_add_layer_test(LSTM)
ncnn_add_layer_test(Yolov3DetectionOutput)
ncnn_add_layer_test(Relu6)
```


## 编译

```
按原NCNN步骤编译
```


## 单元测试

```
./test_relu6
```


================================================
FILE: docs/developer-guide/arm-a53-a55-dual-issue.md
================================================
## natural assembly
* no register dependency, no penalty
```
ld1     {v0.4s}, [r0], #16
fmla    v10.4s, v16.4s, v24.s[0]
fmla    v11.4s, v16.4s, v24.s[1]
fmla    v12.4s, v16.4s, v24.s[2]
fmla    v13.4s, v16.4s, v24.s[3]
```

## A53
* 128bit vector load cannot be dual issued with fmla, wait 2 cycles
* 64bit vector load cannot be dual issued with fmla, wait 1 cycle
* 64bit integer load can be dual issued with fmla, no penalty
* pointer update can be dual issued with fmla, no penalty
* 64bit vector load and 64bit vector insert can be dual issued, no penalty
* any vector load cannot be issued on the 4th cycle of each fmla (enters the accumulator pipeline)

### practical guide
* use 64bit vector load only
* issue vector load every three fmla
* 1 cycle to load 64bit, dual issue with the previous interleaved 64bit insert
* load the remaining 64bit into integer register, dual issue with fmla
* update pointer, dual issue with fmla
* insert 64bit into vector from integer register, dual issue with the next interleaved 64bit load
* add nop every three fmla if no load, seems to be faster
```
ldr     d0, [r0] // 1 cycle, v0 first 64bit
fmla
ldr     x23, [r0, #8] // 0 cycle, v0 second 64bit to temp register
fmla
add     r0, r0, #16 // 0 cycle, update pointer
fmla
ldr     d1, [r0] // 1 cycle, v1 first 64bit
ins     v0.d[1], x23 // 0 cycle, v0 second 64bit complete
fmla
ldr     x23, [r0, #8] // 0 cycle, v1 second 64bit to temp register
fmla
add     r0, r0, #16 // 0 cycle, update pointer
fmla
ins     v1.d[1], x23 // 1 cycle, v1 second 64bit complete
nop
fmla
fmla
fmla
nop
nop
fmla
fmla
fmla
```

## A55
* Limited by the number of neon register read and write ports, most neon instructions cannot be dual-issued.
* neon instructions have different latencies
* 128bit vector load cannot be issued with fmla, WAR wait 2 cycles
* 64bit integer load can be dual issued with fmla, no penalty
* pointer update can be dual issued with fmla, no penalty
* 64bit vector insert can be dual issued with fmla, no penalty

### practical guide
* A55 supports a 128bit load and a 256bit write in one clock. It can dual issue two 64bit vector loads or single issue one 128bit vector load
* `ldr`, dual issue with fmla
* load the remaining 64bit into integer register, dual issue with fmla
* update pointer, dual issue with fmla
* insert 64bit into vector from integer register, dual issue with fmla
* interleaved loads loosen register dependencies
* nop trick is not needed
* Loop unrolling fma reduces pipeline bubbles
* Some data type conversion neon instructions can be dual issued, such as `fsvts`
```
ldr     d0, [r0] // 0 cycle, v0 first 64bit
fmla
ldr     x23, [r0, #8] // 0 cycle, v0 second 64bit to temp register
fmla
add     r0, r0, #16 // 0 cycle, update pointer
fmla
ldr     d1, [r0] // 0 cycle, v1 first 64bit
fmla
ins     v0.d[1], x23 // 0 cycle, v0 second 64bit complete
fmla
ldr     x23, [r0, #8] // 0 cycle, v1 second 64bit to temp register
fmla
add     r0, r0, #16 // 0 cycle, update pointer
fmla
ins     v1.d[1], x23 // 0 cycle, v1 second 64bit complete
fmla
```


================================================
FILE: docs/developer-guide/armv7-mix-assembly-and-intrinsic.md
================================================
```c
// use %P for every operand to address a d register
// a += b * c
float32x2_t _a = vld1_f32(a);
float32x2_t _b = vld1_f32(b);
float32x2_t _c = vld1_f32(c);
asm volatile(
    "vmla.f32  %P0, %P2, %P3"
    : "=w"(_a) // %0
    : "0"(_a), // %1, tied to %0 so the accumulator _a is both read and written
      "w"(_b), // %2
      "w"(_c)  // %3
    :
);
```
```c
// use %q for every operand to address a q register
// a += b * c
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
    "vmla.f32  %q0, %q2, %q3"
    : "=w"(_a) // %0
    : "0"(_a), // %1, tied to %0 so the accumulator _a is both read and written
      "w"(_b), // %2
      "w"(_c)  // %3
    :
);
```
```c
// use %P[0] %P[1] to address a single 32-bit lane of a d register
// a += b * c[0]
// a += b * c[1]
float32x2_t _a = vld1_f32(a);
float32x2_t _b = vld1_f32(b);
float32x2_t _c = vld1_f32(c);
asm volatile(
    // adjacent string literals are concatenated by the compiler, so every
    // instruction needs its own trailing newline
    "vmla.f32  %P0, %P2, %P3[0] \n"
    "vmla.f32  %P0, %P2, %P3[1] \n"
    : "=w"(_a) // %0
    : "0"(_a), // %1, tied to %0 so the accumulator _a is both read and written
      "w"(_b), // %2
      "w"(_c)  // %3
    :
);
```
```c
// use %e[0] %e[1] %f[0] %f[1] to address a single 32-bit lane of a q register
// (%e selects the low d half, %f the high d half)
// a += b * c[0]
// a += b * c[1]
// a += b * c[2]
// a += b * c[3]
float32x4_t _a = vld1q_f32(a);
float32x4_t _b = vld1q_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
    // adjacent string literals are concatenated by the compiler, so every
    // instruction needs its own trailing newline
    "vmla.f32  %q0, %q2, %e3[0] \n"
    "vmla.f32  %q0, %q2, %e3[1] \n"
    "vmla.f32  %q0, %q2, %f3[0] \n"
    "vmla.f32  %q0, %q2, %f3[1] \n"
    : "=w"(_a) // %0
    : "0"(_a), // %1, tied to %0 so the accumulator _a is both read and written
      "w"(_b), // %2
      "w"(_c)  // %3
    :
);
```
```c
// use %e %f to split q reg into two d regs
// a += b * c[0]c[1]
// a += b * c[2]c[3]
// _a and _b are 64-bit d values, so they are loaded with vld1_f32;
// only _c is a 128-bit q value
float32x2_t _a = vld1_f32(a);
float32x2_t _b = vld1_f32(b);
float32x4_t _c = vld1q_f32(c);
asm volatile(
    // adjacent string literals are concatenated by the compiler, so every
    // instruction needs its own trailing newline
    "vmla.f32  %P0, %P2, %e3 \n"
    "vmla.f32  %P0, %P2, %f3 \n"
    : "=w"(_a) // %0
    : "0"(_a), // %1, tied to %0 so the accumulator _a is both read and written
      "w"(_b), // %2
      "w"(_c)  // %3
    :
);
```
```c
// bind each variable to a specific d register
// intended instruction: vmla.f32  d0, d2, d4
register float32x2_t _a asm("d0") = vld1_f32(a);
register float32x2_t _b asm("d2") = vld1_f32(b);
register float32x2_t _c asm("d4") = vld1_f32(c);

asm volatile(
    "vmla.f32  %P0, %P2, %P3"
    : "=w"(_a) // %0
    : "0"(_a), // %1, tied to %0 so the accumulator _a is both read and written
      "w"(_b), // %2
      "w"(_c)  // %3
    :
);
```
```c
// bind each variable to a specific q register
// intended instruction: vmla.f32  q0, q1, q2
register float32x4_t _a asm("q0") = vld1q_f32(a);
register float32x4_t _b asm("q1") = vld1q_f32(b);
register float32x4_t _c asm("q2") = vld1q_f32(c);

asm volatile(
    "vmla.f32  %q0, %q2, %q3"
    : "=w"(_a) // %0
    : "0"(_a), // %1, tied to %0 so the accumulator _a is both read and written
      "w"(_b), // %2
      "w"(_c)  // %3
    :
);
```

如果不是因为编译器的bug，寄存器绑定是用不着的，然而。。。

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=41538

qwq


================================================
FILE: docs/developer-guide/binaryop-broadcasting.md
================================================
### broadcasting rule

ncnn BinaryOp accepts blobs with different shape

C = BinaryOp(A, B)

shape notation convention is [w], [w,h], [w,h,c], [w,h,d,c]

* binaryop with scalar and scalar-like

|A|B|C|
|---|---|---|
|[2]|scalar / [1]|[2]|
|[2,3]|scalar / [1] / [1,1]|[2,3]|
|[2,3,4]|scalar / [1] / [1,1] / [1,1,1]|[2,3,4]|
|[2,3,4,5]|scalar / [1] / [1,1] / [1,1,1] / [1,1,1,1]|[2,3,4,5]|

* no broadcast

|A|B|C|
|---|---|---|
|[2]|[2]|[2]|
|[2,3]|[2,3]|[2,3]|
|[2,3,4]|[2,3,4]|[2,3,4]|
|[2,3,4,5]|[2,3,4,5]|[2,3,4,5]|

* explicit broadcast B

|A|B|C|
|---|---|---|
|[2,3]|[1,3]|[2,3]|
|[2,3]|[2,1]|[2,3]|
|[2,3,4]|[1,3,4]|[2,3,4]|
|[2,3,4]|[2,1,4]|[2,3,4]|
|[2,3,4]|[2,3,1]|[2,3,4]|
|[2,3,4]|[1,1,4]|[2,3,4]|
|[2,3,4]|[1,3,1]|[2,3,4]|
|[2,3,4]|[2,1,1]|[2,3,4]|
|[2,3,4,5]|[1,3,4,5]|[2,3,4,5]|
|[2,3,4,5]|[2,1,4,5]|[2,3,4,5]|
|[2,3,4,5]|[2,3,1,5]|[2,3,4,5]|
|[2,3,4,5]|[2,3,4,1]|[2,3,4,5]|
|[2,3,4,5]|[1,1,4,5]|[2,3,4,5]|
|[2,3,4,5]|[1,3,1,5]|[2,3,4,5]|
|[2,3,4,5]|[1,3,4,1]|[2,3,4,5]|
|[2,3,4,5]|[2,1,1,5]|[2,3,4,5]|
|[2,3,4,5]|[2,1,4,1]|[2,3,4,5]|
|[2,3,4,5]|[2,3,1,1]|[2,3,4,5]|
|[2,3,4,5]|[1,1,1,5]|[2,3,4,5]|
|[2,3,4,5]|[1,1,4,1]|[2,3,4,5]|
|[2,3,4,5]|[1,3,1,1]|[2,3,4,5]|
|[2,3,4,5]|[2,1,1,1]|[2,3,4,5]|

* implicit broadcast B for inner axis

It broadcasts in the opposite direction of the numpy's implicit broadcasting behavior.

pnnx will insert reshape operator at the appropriate position to convert it to explicit broadcast automatically.

|A|B|C|
|---|---|---|
|[2,3]|[3]|[2,3]|
|[2,3,4]|[4]|[2,3,4]|
|[2,3,4]|[3,4]|[2,3,4]|
|[2,3,4,5]|[5]|[2,3,4,5]|
|[2,3,4,5]|[4,5]|[2,3,4,5]|
|[2,3,4,5]|[3,4,5]|[2,3,4,5]|

* implicit broadcast B with 1 dimension rank for outer axis

This exists only for compatibility.

When the size is the same, eg. [2,2] and [2], broadcast B for inner axis will be prioritized.

|A|B|C|
|---|---|---|
|[2,3]|[2]|[2,3]|
|[2,3,4]|[2]|[2,3,4]|
|[2,3,4,5]|[2]|[2,3,4,5]|


================================================
FILE: docs/developer-guide/build-ncnn-on-windows-xp.zh.md
================================================
# Build ncnn on Windows XP

> **Contributors:** [@Sugar-Baby](https://github.com/Sugar-Baby) and [@AtomAlpaca](https://github.com/AtomAlpaca)

## 0. 环境准备

#### 0.1 虚拟机设置

我使用的是[我的MSDN](https://www.imsdn.cn/)提供的[Windows XP SP3 x64版本](https://www.imsdn.cn/operating-systems/windows-xp/)。虚拟机使用Oracle VM VirtualBox，内存4GB，存储空间64GB（C盘16GB，D盘48GB）。

**在虚拟机关机的情况下**，点击虚拟机管理器界面的"设置"-"网络"-"高级"，将控制芯片改为PCnet-FAST III，混杂模式设置为拒绝，勾选接入网线，点击"OK"保存。重启虚拟机就可以连接上网络了。

点击虚拟机界面的"设备"-"安装增强功能..."，在虚拟机中进入"我的电脑"，刷新后出现"VirtualBox Guest Additions (D: )"，右键选择"自动播放"，完成安装后重启。

点击虚拟机界面的"设备"-"共享粘贴板"，设置为"双向"。点击"设备"-"共享文件夹"-"共享文件夹.."，点击右侧加号，在"共享文件夹路径"中选择"其他..."，然后选择需要共享的主机文件夹。勾选"自动挂载"和"固定分配"，点击"OK"保存。在虚拟机中进入"我的电脑"，刷新后出现'VBoxSvr' 上的 <主机文件夹名称>，双击进入就可以双向传输文件了。

#### 0.2 开发环境配置

浏览器推荐[Mypal 68](https://www.mypal-browser.org/download.html)，注意要选择32位版本。Windows XP自带ZIP文件解压。安装后就可以访问互联网了。

从Github下载[w64devkit](https://github.com/skeeto/w64devkit)，选择x86版本。这里下载的是一个自解压的7z文件，在虚拟机中解压即可。

在"开始"-"控制面板"-"切换到经典视图"-"系统"-"高级"-"环境变量"-"系统变量"中，选择Path，点击"编辑"，在字符串末尾加入一个分号(;)，然后粘贴w64devkit下bin文件夹的目录。点击"确定"保存之后可以打开命令提示符输入例如c++的命令验证是否成功加入环境变量。

由于年代过于久远，Git的官方release已经没有兼容Windows XP的版本了。最后一个兼容的版本(1.9.5)可以在[这里](https://www.xiazaiba.com/html/29352.html)下载。

为了使用Git，需要安装[Win32 OpenSSL](https://slproweb.com/products/Win32OpenSSL.html)。选择Win32 OpenSSL Light版本。这个过程中会附带安装VC++ 2022运行时库。

如果因为协议、代理等问题不能在虚拟机中使用Git，也可以下载ZIP版本后在虚拟机中解压。

需要手动下载[CMake最后支持Windows XP的版本](https://github.com/Kitware/CMake/releases/download/v3.10.3/cmake-3.10.3-win32-x86.zip)。建议解压在C:\Program Files下，并且需要设置系统变量，到CMake目录下的bin文件夹。具体可以参考上面w64devkit的方法。

## 1. 编译

### 1.1 使用 MinGW-w64

运行

```bash
cd <ncnn-root-dir>
mkdir build
cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/windows-xp-mingw.toolchain.cmake -DNCNN_VULKAN=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_AVX=OFF -DCMAKE_BUILD_TYPE=Release -G "MinGW Makefiles" ..
make -j2
make install
```

由于平台性能的限制，Vulkan SDK 最低要求 Windows 7 SP1，XP 无法安装官方驱动和工具链，因此需要关闭Vulkan选项。同时需要使用简化版 OpenCV 替代库NCNN_SIMPLEOCV。

### 1.2 使用 Clang

需要先配置 MinGW-w64 环境，然后安装 Clang 6.0 或更高版本。

```bash
cd <ncnn-root-dir>
mkdir build
cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/windows-xp-clang.toolchain.cmake -DNCNN_SIMPLEOCV=ON -DNCNN_SIMPLEOMP=ON -DNCNN_AVX=OFF -DCMAKE_BUILD_TYPE=Release -G "MinGW Makefiles" ..
make -j2
make install
```

### 1.3 使用 Visual Studio (MSVC)

需要安装支持 Windows XP 的 v141_xp 工具集：

1. 打开 Visual Studio 安装程序（工具 → 获取工具和功能）
2. 选择"使用 C++ 的桌面开发"
3. 在摘要部分选择"对 C++ 的 Windows XP 支持"
4. 点击修改

```bash
cd <ncnn-root-dir>
mkdir build
cd build
cmake -A WIN32 -G "Visual Studio 17 2022" -T v141_xp -DNCNN_SIMPLEOCV=ON -DNCNN_OPENMP=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_WITH_STATIC_CRT=ON -DCMAKE_TOOLCHAIN_FILE=../toolchains/windows-xp-msvc.toolchain.cmake ..
cmake --build . --config Release -j 2
cmake --build . --config Release --target install
```

## 2. 测试

### 2.1 benchncnn

将benchmark目录下的所有文件复制到build/benchmark目录下。在命令提示符中cd到build/benchmark， 然后运行

```bash
benchncnn [测试的循环次数] [线程数] [节能模式]
```

其中，节能模式取值为0时关闭，为1时打开。

### 2.2 examples

从[这里](https://github.com/nihui/ncnn-assets/tree/master/models)可以下载到所有需要的param和bin文件。需要注意的是，ZF_faster_rcnn_final.bin开头的三个文件（.zip，.z01，.z02）最好先放在主机上解压出bin文件再传进虚拟机。

把这些文件放在build/examples目录下。

我写了一个bat脚本来批量测试这些模型：

```batch
@echo off
rem Run every example executable against one test image and collect the
rem output of all runs in a single log file.
setlocal enabledelayedexpansion

rem Replace <ncnn-root-dir> with the real ncnn directory before running.
set EXAMPLES_DIR=<ncnn-root-dir>\BUILD\EXAMPLES
set IMAGE_PATH=<ncnn-root-dir>\IMAGES\256-ncnn.png
set LOG_FILE=test_results.log

echo NCNN Examples Test Results > %LOG_FILE%
echo ========================= >> %LOG_FILE%
echo Test started: %date% %time% >> %LOG_FILE%
echo. >> %LOG_FILE%

for %%f in ("%EXAMPLES_DIR%\*.exe") do (
    set "EXE_NAME=%%~nf"
    set "EXE_PATH=%%f"
    echo Testing: !EXE_NAME! >> %LOG_FILE%
    echo -------------------------------- >> %LOG_FILE%

    rem Quote the executable path so that directories containing spaces still work.
    "!EXE_PATH!" "%IMAGE_PATH%" >> %LOG_FILE% 2>&1

    if errorlevel 1 (
        echo [ERROR] !EXE_NAME! failed to run. >> %LOG_FILE%
    ) else (
        echo [SUCCESS] !EXE_NAME! completed. >> %LOG_FILE%
    )
    echo. >> %LOG_FILE%
)

echo Test finished: %date% %time% >> %LOG_FILE%
echo Results saved to %LOG_FILE%
endlocal
```

把这个bat脚本放在build/examples目录下，替换掉所有的`<ncnn-root-dir>`，双击运行。通过生成的test_results.log即可查看所有模型的结果。

通过修改`set IMAGE_PATH=<ncnn-root-dir>\IMAGES\256-ncnn.png`中的路径来更换需要测试的文件。

================================================
FILE: docs/developer-guide/custom-allocator.md
================================================
Mat structure is now allocator-aware via an extra allocator parameter with default zero value.

The good-old ncnn::fastMalloc()/ncnn::fastFree() will be used for a null allocator.

You could pass a custom allocator to delegate all memory allocation and deallocation.

```cpp
// Interface for delegating all memory allocation and deallocation.
class Allocator
{
public:
    // virtual destructor so that an implementation can be destroyed safely
    // through an Allocator pointer
    virtual ~Allocator() {}

    // allocate size bytes and return the pointer
    virtual void* fastMalloc(size_t size) = 0;
    // release memory previously returned by fastMalloc
    virtual void fastFree(void* ptr) = 0;
};
```

ncnn already implements two simple pooled Allocator classes, one with a mutex lock and one without.

```cpp
ncnn::PoolAllocator locked_mempool;
ncnn::UnlockedPoolAllocator unlocked_mempool;
```

the two allocator types in ncnn

* blob allocator

    used to allocate memory for all named blobs, which you could retrieve by Extractor::extract()
* workspace allocator

    used to allocate memory for internal temporary use in layer implementation, such as the temp blob after padding in convolution

By default, all Extractor instances use the two allocators in the default option.
You can alter them with ncnn::set_default_option(),
or you can set them per Extractor with Extractor::set_blob_allocator()/Extractor::set_workspace_allocator().

The blob allocator is guaranteed to be called in order in the layer implementation during each Extractor lifecycle,
while the workspace allocator may be called concurrently from multiple threads.

the practical usage

* one network, one-by-one inference

    shared unlocked blob allocator for all Extractor

    shared locked workspace allocator for all Extractor

* one network, concurrent inference

    shared unlocked blob allocator for all Extractor in each thread

    shared locked workspace allocator for all Extractor among all threads

* concurrent multiple networks, one-by-one inference for each network

    shared unlocked blob allocator for all Extractor of each network

    shared locked workspace allocator for all Extractor among all networks (for saving memory)

* concurrent multiple networks, concurrent inference for each network

    shared unlocked blob allocator for all Extractor of each network in each thread

    shared locked workspace allocator for all Extractor among all networks (for saving memory)


================================================
FILE: docs/developer-guide/element-packing.md
================================================
### what is packing and why

packing is the form of storing multiple short-sized values as one long-sized value.

Element packing maps well onto the underlying SIMD registers, which usually use one very wide register to store several values of the same type.

|C|elemsize|elempack|
|---|---|---|
|double|8|1|
|float|4|1|
|int|4|1|
|short|2|1|
|signed char|1|1|

|arm neon|elemsize|elempack|
|---|---|---|
|float64x2_t|16|2|
|float32x4_t|16|4|
|int32x4_t|16|4|
|float16x4_t|8|4|
|int8x8_t|8|8|

Though the real count of values doubles when elempack is two, the wide-sized value is still treated as one value in the view of Mat structure. For example, we want to store 40 float values in Mat object, if elempack 1 is used, Mat width is then 40, while 10 if elempack 4 is used.

|dims|w|h|c|cstep|elemsize|elempack|
|---|---|---|---|---|---|---|
|1|40|1|1|40|4|1|
|1|10|1|1|10|16|4|

### packing style convention

In practice, elempack 1, 4, 8 are the most common cases. It is possible to use any other packing style in theory.

The following table shows the packing axis used in ncnn for each number of dimensions.

|dims|packing axis|shape before packing|shape after packing|
|---|---|---|---|
|1|w|w|w/elempack|
|2|h|w, h|w, h/elempack|
|3|c|w, h, c|w, h, c/elempack|

If the packing axis dim is not evenly divisible by elempack, zero padding may be used.

```
outw = (w + elempack - 1) / elempack;
```

The following snippet shows the memory layout after elempack=4 on 3-dim Mat

```
// w=2 h=3 c=4 elempack=1
0 1
2 3
4 5

6 7
8 9
10 11

12 13
14 15
16 17

18 19
20 21
22 23

// w=2 h=3 c=1 elempack=4
(0,6,12,18) (1,7,13,19)
(2,8,14,20) (3,9,15,21)
(4,10,16,22) (5,11,17,23)
```

### how to convert elempack

There is a convenient wrapper function provided
```
// convert to elempack 4 if packing axis dim is evenly divisible by elempack
// return the identity Mat otherwise
ncnn::Mat a;
ncnn::Mat a_packed;
ncnn::convert_packing(a, a_packed, 4);
if (a_packed.elempack == 4)
{
    // packing succeeded, a_packed now holds 4 values per element
}

// convert to elempack 1, aka unpacking, which always succeeds
ncnn::Mat b;
ncnn::Mat b_unpacked;
ncnn::convert_packing(b, b_unpacked, 1);
```

### handle general interleaved data

Here is an example of using convert packing to convert RGB interleaved data to planar

**NOTE:** The following code is just presented to explain what packing is and the conversion process. Do not use it in production due to its poor performance. Do use ncnn::Mat::from_pixels()

```cpp
// rgb_interleaved_u8 is RGB RGB RGB ...
// rgb_interleaved_u8.w = w;
// rgb_interleaved_u8.h = h;
// rgb_interleaved_u8.c = 1;
// rgb_interleaved_u8.elemsize = 3;
// rgb_interleaved_u8.elempack = 3;

ncnn::Mat rgb_interleaved_u8(w, h, 1, 3, 3);
ncnn::Mat rgb_planar_u8;

ncnn::convert_packing(rgb_interleaved_u8, rgb_planar_u8, 1);

// rgb_planar_u8 is now RRR ... GGG ... BBB ...
// rgb_planar_u8.w = w;
// rgb_planar_u8.h = h;
// rgb_planar_u8.c = 3;
// rgb_planar_u8.elemsize = 1;
// rgb_planar_u8.elempack = 1;
```


================================================
FILE: docs/developer-guide/expression.md
================================================
### expression

expression is used in the reshape slice parameter to express the dynamic shape or subscript value based on the expression formula and input shape

Compared with directly converting the expression calculation process into multiple operators, using expression has the following advantages:
* No additional shape concat and other operators will be generated due to dynamic calculation, which greatly reduces the number of layers of the ncnn model and makes it easier to view the model structure and modify expression
* Shape or subscript evaluations are usually single-digit operations, which are more suitable for direct completion on the CPU without layout conversion and kernel call overhead

In the param file, `Reshape` layer can contain 6=expression

The pnnx tool can automatically convert `pnnx.Expression` to the expr parameter of ncnn `Reshape`

* Convert to 0w, 0h, 0d or 0c according to the input shape rank and `size(@0,1)`
* Automatically remove the batch dimension according to the input batch index
* Fuse the two operators `pnnx.Expression` and `Tensor.reshape`/`Tensor.view` into a single ncnn `Reshape`
* Automatically summarize the number of references, exclude duplicate references and sort the indexes of references
* Convert the customary shape representation order, such as CHW to WHC

Example pnnx.param where A and B are 3D tensors
```
pnnx.Expression  expr     2 1 A B shape expr=[add(size(@1,0),2),mul(size(@0,1),2),-1]
Tensor.reshape   reshape  2 1 A shape out
```

pnnx.py
```python
shape = [(B.size(0) + 2), (A.size(1) * 2), -1]
out = A.reshape(*shape)
```

Converted to ncnn.param
```
Reshape          reshape  2 1 A B out 6="-1,*(0h,2),+(1c,2)"
```

### syntax

Use prefix (function-call style) expression, format is `op(arg0,arg1,...)`, multiple operations can be nested, multiple sizes are separated by commas, and numbers can be integers or decimals

Among them, the commonly used `add` `sub` `mul` `div` `floor_div` are abbreviated as `+` `-` `*` `/` `//`, and other arithmetic operations use names, such as `sin` `ceil` `max`, etc.

* `max(2,3)`
* `floor(sin(3.14))`
* `+(*(-2,1),10)` means (-2 * 1) + 10
* `1,2,+(3,2)` list can represent output shape with 3-rank

The input shape can be referenced at runtime, format is `id(w|h|d|c)`, the maximum id is 9, which means that up to 10 inputs can be referenced

Assuming that the Reshape layer has two input blobs, A and B, then

* `0w,1h` means A.w, B.h
* `*(+(0c,1c),2)` means (A.c + B.c) * 2

### helper api

```cpp
#include "expression.h"

int count_expression_blobs(const std::string& expr);

int eval_list_expression(const std::string& expr, const std::vector<Mat>& blobs, std::vector<int>& outlist);
```

* `count_expression_blobs`

Pass expression to get the number of inputs it references, such as `0w,1h` returns 2

* `eval_list_expression`

Evaluate the expression against the input blobs and store the resulting list in `outlist`. If a calculation result is a floating point number, it will be automatically truncated to an integer.

### supported operator

|type|operators|
|---|---|
|float to int|`trunc` `ceil` `floor` `round`|
|binary arithmetic|`+` `-` `*` `/` `//` `max` `min` `pow` `fmod` `remainder` `atan2` `logaddexp`|
|unary arithmetic|`abs` `neg` `sign` `square` `sqrt` `rsqrt` `reciprocal` `exp` `log` `log10` `sin` `asin` `cos` `acos` `tan` `atan` `sinh` `asinh` `cosh` `acosh` `tanh` `atanh`|
|integer bitwise|`and` `or` `xor` `lshift` `rshift`|


================================================
FILE: docs/developer-guide/glsl-extension.md
================================================
# ncnn GLSL extension

## rationale
Different GPUs support different features, some support fp16 as buffer storage type, some support fp16 as operand variable, some old GPUs only support fp32

When the GPU supports the `VK_KHR_16bit_storage` extension, in order to minimize the memory bandwidth consumption of the GPU, we will give priority to using fp16 as the storage type. Otherwise, we use `packHalf2x16` and `unpackHalf2x16` in GLSL 4.2 to compress 2 fp32 to uint, reducing read and write bandwidth.

Similarly, when the gpu supports the `VK_KHR_shader_float16_int8` extension, in order to improve computational efficiency, we will give priority to using fp16 as the arithmetic operand, which usually doubles the speed. Otherwise, we use fp32.

To ensure the widest compatibility, the following code for declaring descriptor binding and loading data will be written

```c
#if NCNN_fp16_storage // gpu supports 16bit storage
layout (binding = 0) buffer blob { f16vec4 blob_data[]; };
#elif NCNN_fp16_packed // gpu supports GLSL 4.2
layout (binding = 0) buffer blob { uvec2 blob_data[]; };
#else // gpu only supports fp32
layout (binding = 0) buffer blob { vec4 blob_data[]; };
#endif

void main()
{
    const int i = int(gl_GlobalInvocationID.x);

#if NCNN_fp16_storage && NCNN_fp16_arithmetic // gpu supports 16bit storage and shader float16
    f16vec4 x = blob_data[i];
#elif NCNN_fp16_storage // gpu supports 16bit storage but no shader float16
    vec4 x = vec4(blob_data[i]);
#elif NCNN_fp16_packed && NCNN_fp16_arithmetic // gpu supports GLSL 4.2 and shader float16
    f16vec4 x = f16vec4(unpackFloat2x16(blob_data[i].x), unpackFloat2x16(blob_data[i].y));
#elif NCNN_fp16_packed // gpu supports GLSL 4.2
    vec4 x = vec4(unpackHalf2x16(blob_data[i].x), unpackHalf2x16(blob_data[i].y));
#else // gpu only supports fp32
    vec4 x = blob_data[i];
#endif
}
```

As you can see, just declaring the buffer type and reading a value consumes a lot of lines of code, which is a maintenance nightmare. Therefore, ncnn adds more flexible data types and auxiliary functions to reduce the size of the code and improve readability, and will automatically expand to the most efficient implementation according to the feature level supported by the GPU.

The above code, by using the ncnn glsl extension, can be simplified to

```c
layout (binding = 0) buffer blob { sfpvec4 blob_data[]; };

void main()
{
    const int i = int(gl_GlobalInvocationID.x);

    afpvec4 x = buffer_ld4(blob_data, i);
}
```

The ncnn glsl extension provides the necessary data types for storage, computation, shared memory, and load, store, conversion functions for buffers and images. We also provide some buffer and image copy functions to prevent loss of precision when using fp16 as the intermediate data type, and to avoid unnecessary `unpackHalf2x16` and `packHalf2x16` pair.

# entrypoint for compiling GLSL

The gpu.h header in the ncnn library exposes 3 APIs for compiling glsl code into spir-v binary. They all support the ncnn glsl extension and accept an opt switch to control the expansion form of the ncnn glsl extension. The first two accept raw glsl code strings, and the last one is used to create ncnn's built-in shader.

```cpp
namespace ncnn {

// online spirv compilation
NCNN_EXPORT int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv);
NCNN_EXPORT int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv);
NCNN_EXPORT int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv);

} // namespace ncnn
```

## compile ncnn extended GLSL code directly

You can write shader code with ncnn glsl extension, compiled to spir-v using ncnn functions. The compiled product is a standard-compliant spir-v binary, which can be directly used to create a pipeline object in the vulkan api

```cpp
static const char my_glsl_data[] = R"(
#version 450

layout (binding = 0) readonly buffer a_blob { sfpvec4 a_blob_data[]; };
layout (binding = 1) writeonly buffer b_blob { sfpvec4 b_blob_data[]; };

void main()
{
    const int i = int(gl_GlobalInvocationID.x);

    afpvec4 v = buffer_ld4(a_blob_data, i);

    v = v + 123;

    buffer_st4(b_blob_data, i, v);
}
)";

Option opt;
 // you can control the extension behavior
 // even if the gpu supports 16bit storage
opt.use_fp16_storage = false;

std::vector<uint32_t> spirv;
ncnn::compile_spirv_module(my_glsl_data, sizeof(my_glsl_data) - 1, opt, spirv);

// To create pipeline object later
// ncnn::Pipeline pipeline(vkdev);
// pipeline.set_local_size_xyz(64, 1, 1);
// pipeline.create(spirv.data(), spirv.size() * 4, specializations);
```

## ncnn built-in shader

The shader index inside ncnn is exposed in the `layer_shader_type.h` header and can be used if needed

```cpp
#include "layer_shader_type.h"

int shader_type_index = LayerShaderType::convert_ycbcr;

Option opt;

std::vector<uint32_t> spirv;
int retc = compile_spirv_module(shader_type_index, opt, spirv);
```

# data types

## storage type

declare buffer data layout in descriptor binding

```c
layout (binding = 0) buffer top_blob { sfpvec4 top_blob_data[]; };
```

|storage type|fp32|fp16p|fp16s|bf16p|bf16s|
|---|---|---|---|---|---|
|sfp|float|uint|float16_t|uint|bfloat16_t|
|sfpvec2|vec2|uint|f16vec2|uint|bf16vec2|
|sfpvec4|vec4|uvec2|f16vec4|uvec2|bf16vec4|

## arithmetic type

declare local variable in glsl code

```c
void main()
{
    afpvec4 v = a * b;
}
```

|arithmetic type|fp32|fp16a|
|---|---|---|
|afp|float|float16_t|
|afpvec2|vec2|f16vec2|
|afpvec4|vec4|f16vec4|

## local type

declare variable in shared local memory

```c
shared lfp tmp_a[8][4][2];
```

|local type|fp32|fp16p / fp16s only|fp16s+fp16a|fp16s+fp16u|bf16p|bf16s|
|---|---|---|---|---|---|---|
|lfp|float|float|float|float16_t|float|bfloat16_t|
|lfpvec4|vec4|uvec2|uint64_t|f16vec4|uvec2|bf16vec4|

# buffer functions

- load typed value from src[offset]

```c
afp buffer_ld1(sfp src, int offset);
afpvec2 buffer_ld2(sfpvec2 src, int offset);
afpvec4 buffer_ld4(sfpvec4 src, int offset);
```

- store typed value to dst[offset]

```c
void buffer_st1(sfp dst, int offset, afp v);
void buffer_st2(sfpvec2 dst, int offset, afpvec2 v);
void buffer_st4(sfpvec4 dst, int offset, afpvec4 v);
```

- copy typed value from src[src_offset] to dst[dst_offset]

```c
void buffer_cp1(sfp dst, int dst_offset, sfp src, int src_offset);
void buffer_cp2(sfpvec2 dst, int dst_offset, sfpvec2 src, int src_offset);
void buffer_cp4(sfpvec4 dst, int dst_offset, sfpvec4 src, int src_offset);
```

- copy and pack value from src[src_offsets[0],src_offsets[1],...] to dst[dst_offset]

```c
void buffer_cp1to4(sfpvec4 dst, int dst_offset, sfp src, ivec4 src_offsets);
```

- copy and unpack value from src[src_offset] to dst[dst_offsets[0],dst_offsets[1],...]

```c
void buffer_cp4to1(sfp dst, ivec4 dst_offsets, sfpvec4 src, int src_offset);
```
# local data conversion functions

- storage buffer to local memory

```c
lfp buffer_sm1(sfp src, int offset);
lfpvec4 buffer_sm4(sfpvec4 src, int offset);
```

- local memory to local variable

```c
afp lfp2afp(lfp v);
afpvec4 lfp2afpvec4(lfpvec4 v);
```

- local variable to local memory

```c
lfp afp2lfp(afp v);
lfpvec4 afp2lfpvec4(afpvec4 v);
```

Note: The common usage of local memory is to read from global memory first, store it in local memory, and then read local variables from local memory for subsequent use. Therefore, only storage type to local type and local type to arithmetic type conversion functions are provided here.

# misc functions

- prefer specialization constant over push constant

```c
T psc(T x)
```

Declare the same variable in both the specialization constant AND the push constant section. `psc(x)` then becomes a compile-time constant when the specialization constant is given a non-zero value, or a dynamic value read from the push constant otherwise. This is often used for tensor shape specialization. We can usually resolve all shape information and make them compile-time constants for more aggressive shader optimization.

```c
layout (constant_id = 0) const int size = 0;

layout (push_constant) uniform parameter
{
    int size;
} p;

void main()
{
    const int s = psc(size);
}
```

# platform macros

Check whether the current platform is moltenvk, in order to enable platform-specific workarounds

```c
#if NCNN_moltenvk
// enable workaround for moltenvk
#endif
```

ncnn adds additional macro definitions in newer versions, which may conflict with or cause confusion in existing glsl code. To keep your code compatible across ncnn versions, you can switch between the old and new code according to the version given by the `ncnn_glsl_version` macro.

```c
#if ncnn_glsl_version >= 1
// use device macros introduced since version 1
#endif
```

ncnn additionally defines most of the vulkan device-related features as macros, which we can use to distinguish different platforms, device extensions, features, and properties

### extension macros

When the device supports an extension, `ncnn_<extension_name>` is defined as the extension version

```c
void main()
{
#if ncnn_VK_KHR_16bit_storage
    // here is the code for any device that supports VK_KHR_16bit_storage
#endif

#if ncnn_VK_KHR_sampler_ycbcr_conversion >= 10
    // here is the code for any device that supports VK_KHR_sampler_ycbcr_conversion and version >= 10
#endif
}
```

### device feature and property macros

ncnn will query device features and properties and then define them as macros.

The macro name is `ncnn_<feature_name>` or `ncnn_<property_name>`

The `GL_EXT_shader_explicit_arithmetic_types_int64` extension will be automatically enabled without explicit code indication when the device supports `shaderInt64`

The `GL_EXT_shader_explicit_arithmetic_types_int16` extension will be automatically enabled without explicit code indication when the device supports `shaderInt16`

```c
void main()
{
#if ncnn_robustBufferAccess
    // here is the code for any device that supports robustBufferAccess feature
#endif

#if ncnn_vendorID == 4318
    // here is the vendor specific code, 4318 is nvidia graphics
#endif

#if ncnn_subgroupSize == 32
    // here is the code path optimized for subgroup_size == 32
#endif

    // use macro definitions
    uint size; // dynamic value from some previous routines
    if (size < ncnn_subgroupSize)
    {
#if ncnn_supportedOperations & 4
        // subgroup support arithmetic
#endif

#if ncnn_subgroup_arithmetic
        // shorthand style for checking subgroup arithmetic :P
#endif
    }
}
```

### validation layer macros

ncnn will define some additional convenient macros when the vulkan validation layer is enabled

* `ncnn_enable_validation_layer`
* `NCNN_LOGE`

Currently, you have to change the `ENABLE_VALIDATION_LAYER` definition at the beginning of `src/gpu.cpp` to `1` to enable these macros.

The `GL_EXT_debug_printf` extension will be enabled automatically without explicitly specifying it in your code.

```c
void main()
{
    int gx = int(gl_GlobalInvocationID.x);

#if ncnn_enable_validation_layer
    NCNN_LOGE("gx = %d\n", gx);
#endif
}
```

At runtime, `NCNN_LOGE` will print out the value of `gx`

### option macros

Enable glsl extensions only if the user enables certain options

The `GL_EXT_shader_16bit_storage` extension will be automatically enabled without explicit code indication when the device supports 16-bit storage and the user turns on `opt.use_fp16_storage` or `opt.use_bf16_storage`

The `GL_EXT_shader_explicit_arithmetic_types_float16` extension will be automatically enabled without explicit code indication when the device supports 16-bit arithmetic and the user turns on `opt.use_fp16_arithmetic`

The `GL_EXT_shader_8bit_storage` extension will be automatically enabled without explicit code indication when the device supports 8-bit storage and the user turns on `opt.use_int8_storage`

The `GL_EXT_shader_explicit_arithmetic_types_int8` extension will be automatically enabled without explicit code indication when the device supports 8-bit arithmetic and the user turns on `opt.use_int8_arithmetic`

The `GL_EXT_bfloat16` extension will be automatically enabled without explicit code indication when the device supports bfloat16 storage and the user turns on `opt.use_bf16_storage`

```c
void main()
{
#if NCNN_fp16_storage
    // the user enable fp16 storage option and the device has fp16 storage support
#endif

#if NCNN_fp16_arithmetic
    // the user enable fp16 arithmetic option and the device has fp16 arithmetic support
#endif
}
```

|macro|defined by option|
|---|---|
|NCNN_fp16_packed|opt.use_fp16_packed|
|NCNN_fp16_storage|opt.use_fp16_storage|
|NCNN_fp16_arithmetic|opt.use_fp16_arithmetic|
|NCNN_int8_packed|opt.use_int8_packed|
|NCNN_int8_storage|opt.use_int8_storage|
|NCNN_int8_arithmetic|opt.use_int8_arithmetic|
|NCNN_bf16_packed|opt.use_bf16_packed|
|NCNN_bf16_storage|opt.use_bf16_storage|
|NCNN_shader_local_memory|opt.use_shader_local_memory|


================================================
FILE: docs/developer-guide/glsl-extension.zh.md
================================================
# ncnn GLSL 扩展

## 理由
不同的 GPU 支持不同的功能，有的支持 fp16 作为缓冲存储类型，有的支持 fp16 作为操作数变量，有的老 GPU 只支持 fp32。

当 GPU 支持 `VK_KHR_16bit_storage` 扩展时，为了尽量减少 GPU 的内存带宽消耗，我们会优先使用 fp16 作为存储类型。否则，我们使用 `packHalf2x16` 和 `unpackHalf2x16` 在 GLSL 4.2 中将 2 个 fp32 压缩为 uint，从而减少读写带宽。

同样，当 GPU 支持 `VK_KHR_shader_float16_int8` 扩展时，为了加快计算效率，我们会优先使用 fp16 作为运算操作数，这通常会使速度翻倍。否则，我们使用 fp32。

为了确保最广泛的兼容性，将编写以下用于声明描述符绑定和加载数据的代码

```c
#if NCNN_fp16_storage // GPU支持 16bit storage
layout (binding = 0) buffer blob { f16vec4 blob_data[]; };
#elif NCNN_fp16_packed // GPU支持 GLSL 4.2
layout (binding = 0) buffer blob { uvec2 blob_data[]; };
#else // GPU仅支持 fp32
layout (binding = 0) buffer blob { vec4 blob_data[]; };
#endif

void main()
{
    const int i = int(gl_GlobalInvocationID.x);

#if NCNN_fp16_storage && NCNN_fp16_arithmetic // GPU支持 16bit storage 和 shader float16
    f16vec4 x = blob_data[i];
#elif NCNN_fp16_storage // GPU支持 16bit storage 但不包含 shader float16
    vec4 x = vec4(blob_data[i]);
#elif NCNN_fp16_packed && NCNN_fp16_arithmetic // GPU支持 GLSL 4.2 和 shader float16
    f16vec4 x = f16vec4(unpackFloat2x16(blob_data[i].x), unpackFloat2x16(blob_data[i].y));
#elif NCNN_fp16_packed // GPU支持 GLSL 4.2
    vec4 x = vec4(unpackHalf2x16(blob_data[i].x), unpackHalf2x16(blob_data[i].y));
#else // GPU仅支持 fp32
    vec4 x = blob_data[i];
#endif
}
```

如您所见，仅声明缓冲区类型并读取值会消耗大量代码行，这是项目维护的噩梦。因此，ncnn 增加了更灵活的数据类型和辅助函数，以减小代码的大小并提高可读性，并且会根据 GPU 支持的功能级别自动扩展到最高效的实现。

上面的代码，通过使用 ncnn GLSL 扩展，可以简化为

```c
layout (binding = 0) buffer blob { sfpvec4 blob_data[]; };

void main()
{
    const int i = int(gl_GlobalInvocationID.x);

    afpvec4 x = buffer_ld4(blob_data, i);
}
```

ncnn GLSL 扩展为存储、计算、共享内存以及缓冲区和图像的加载、存储、转换函数提供了必要的数据类型。我们还提供了一些缓冲区和图像复制函数，以防止在使用 fp16 作为中间数据类型时丢失精度，并避免不必要的 `unpackHalf2x16` 和 `packHalf2x16` 配对。

# 编译GLSL的入口点

ncnn库中的 gpu.h 头文件公开了3个用于将 GLSL 代码编译为 Spir-V 二进制的API函数，它们支持 ncnn GLSL 扩展，这3个函数接受 opt switch 来控制 ncnn GLSL 扩展形式。前两个函数接受原始 GLSL 代码字符串作为参数，最后一个函数用于创建 ncnn 的已存在的内置着色器。

```cpp
namespace ncnn {

// 在线 Spir-V 编译器
NCNN_EXPORT int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv);
NCNN_EXPORT int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv);
NCNN_EXPORT int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv);

} // namespace ncnn
```

## 直接编译ncnn扩展GLSL代码

您可以使用 ncnn GLSL 扩展编写着色器代码，使用 ncnn 函数编译为 Spir-V。编译后的产品是符合标准的 Spir-V 二进制文件，可以直接用于在 Vulkan API 中创建流水线对象

```cpp
static const char my_glsl_data[] = R"(
#version 450

layout (binding = 0) readonly buffer a_blob { sfpvec4 a_blob_data[]; };
layout (binding = 1) writeonly buffer b_blob { sfpvec4 b_blob_data[]; };

void main()
{
    const int i = int(gl_GlobalInvocationID.x);

    afpvec4 v = buffer_ld4(a_blob_data, i);

    v = v + 123;

    buffer_st4(b_blob_data, i, v);
}
)";

Option opt;
 // 您可以控制Vulkan扩展行为
 // 当GPU支持16位存储的话
opt.use_fp16_storage = false;

std::vector<uint32_t> spirv;
ncnn::compile_spirv_module(my_glsl_data, sizeof(my_glsl_data) - 1, opt, spirv);

// 稍后再创建管道对象
// ncnn::Pipeline pipeline(vkdev);
// pipeline.set_local_size_xyz(64, 1, 1);
// pipeline.create(spirv.data(), spirv.size() * 4, specializations);
```

## ncnn内置着色器

ncnn 内部的着色器索引在 `layer_shader_type.h` 头文件中公开，如果需要可以使用

```cpp
#include "layer_shader_type.h"

int shader_type_index = LayerShaderType::convert_ycbcr;

Option opt;

std::vector<uint32_t> spirv;
int retc = compile_spirv_module(shader_type_index, opt, spirv);
```

# 数据类型

## 存储类型(storage type)

在描述符绑定中声明缓冲区数据布局

```c
layout (binding = 0) buffer top_blob { sfpvec4 top_blob_data[]; };
```

|存储类型|fp32|fp16p|fp16s|bf16p|bf16s|
|---|---|---|---|---|---|
|sfp|float|uint|float16_t|uint|bfloat16_t|
|sfpvec2|vec2|uint|f16vec2|uint|bf16vec2|
|sfpvec4|vec4|uvec2|f16vec4|uvec2|bf16vec4|

## 算术类型(arithmetic type)

在 GLSL 代码中声明局部变量

```c
void main()
{
    afpvec4 v = a * b;
}
```

|算术类型|fp32|fp16a|
|---|---|---|
|afp|float|float16_t|
|afpvec2|vec2|f16vec2|
|afpvec4|vec4|f16vec4|

## 本地类型(local type)

在共享本地内存中声明变量

```c
shared lfp tmp_a[8][4][2];
```

|本地类型|fp32|fp16p / fp16s only|fp16s+fp16a|fp16s+fp16u|bf16p|bf16s|
|---|---|---|---|---|---|---|
|lfp|float|float|float|float16_t|float|bfloat16_t|
|lfpvec4|vec4|uvec2|uint64_t|f16vec4|uvec2|bf16vec4|

# 缓冲区函数(buffer functions)

- 从 src[offset] 加载已经确定类型的值

```c
afp buffer_ld1(sfp src, int offset);
afpvec2 buffer_ld2(sfpvec2 src, int offset);
afpvec4 buffer_ld4(sfpvec4 src, int offset);
```

- 将已确定类型的值存储到 dst[offset]

```c
void buffer_st1(sfp dst, int offset, afp v);
void buffer_st2(sfpvec2 dst, int offset, afpvec2 v);
void buffer_st4(sfpvec4 dst, int offset, afpvec4 v);
```

- 从已确定类型 src[src_offset] 的值拷贝到 dst[dst_offset]

```c
void buffer_cp1(sfp dst, int dst_offset, sfp src, int src_offset);
void buffer_cp2(sfpvec2 dst, int dst_offset, sfpvec2 src, int src_offset);
void buffer_cp4(sfpvec4 dst, int dst_offset, sfpvec4 src, int src_offset);
```

- 从 src[src_offsets[0],src_offsets[1],...] 的值拷贝并打包到 dst[dst_offset]

```c
void buffer_cp1to4(sfpvec4 dst, int dst_offset, sfp src, ivec4 src_offsets);
```

- 从 src[src_offset] 的值拷贝并解包到 dst[dst_offsets[0],dst_offsets[1],...]

```c
void buffer_cp4to1(sfp dst, ivec4 dst_offsets, sfpvec4 src, int src_offset);
```

# 本地数据转换函数

- 存储缓冲区转换到本地内存

```c
lfp buffer_sm1(sfp src, int offset);
lfpvec4 buffer_sm4(sfpvec4 src, int offset);
```

- 本地内存转换到局部变量

```c
afp lfp2afp(lfp v);
afpvec4 lfp2afpvec4(lfpvec4 v);
```

- 局部变量转换到本地内存

```c
lfp afp2lfp(afp v);
lfpvec4 afp2lfpvec4(afpvec4 v);
```

注意：本地内存的常见用法是先从全局内存中读取，存储在本地内存中，然后再从本地内存中读取局部变量以供后续使用。因此，此处仅提供存储类型到本地类型和本地类型到算术类型的转换函数。

# 杂项函数

- 更推荐使用专用常量(specialization constants)，而不是推送常量(push constants)

```c
T psc(T x)
```

在 `专用常量` 和 `推送常量` 部分中声明相同的变量，然后在专用常量给定非零值时 `psc(x)` 将成为编译时常量，否则将通过推送常量动态取值。这通常用于张量形状特化。我们通常可以解析所有形状信息，并使它们成为编译时常量，以便让着色器得到更积极的优化。

```c
layout (constant_id = 0) const int size = 0;

layout (push_constant) uniform parameter
{
    int size;
} p;

void main()
{
    const int s = psc(size);
}
```

# 平台宏定义

判断当前平台是否为 moltenvk，以启用对于某些特定于平台的解决方法

```c
#if NCNN_moltenvk
// 启用moltenvk的解决方法
#endif
```

ncnn 在新版本中添加了额外的宏定义，可能与现在的 glsl 代码冲突或引起混淆。为了实现  ncnn 的跨版本兼容性，可以根据  `ncnn_glsl_version` 宏的版本号在新旧代码之间进行切换 。

```c
#if ncnn_glsl_version >= 1
// 使用自版本 1 起引入的设备宏
#endif
```

ncnn 额外定义了大多数 vulkan 设备相关功能作为宏，我们可以用来区分不同的平台、设备扩展、功能和属性。

### 扩展宏定义

当设备支持某个扩展时，`ncnn_<extension_name>` 被定义为扩展版本

```c
void main()
{
#if ncnn_VK_KHR_16bit_storage
    // 支持 VK_KHR_16bit_storage 设备的代码
#endif

#if ncnn_VK_KHR_sampler_ycbcr_conversion >= 10
    // 支持 VK_KHR_sampler_ycbcr_conversion 且版本 >=10 的代码
#endif
}
```

### 设备特性和属性宏

ncnn 会查询设备特性和属性，然后将它们定义为宏。

宏名称为 `ncnn_<feature_name>` 或 `ncnn_<property_name>`

当设备支持 `shaderInt64` 时，`GL_EXT_shader_explicit_arithmetic_types_int64` 扩展会自动启用，无需显式代码指示。

当设备支持 `shaderInt16` 时，`GL_EXT_shader_explicit_arithmetic_types_int16` 扩展会自动启用，无需显式代码指示。

```c
void main()
{
#if ncnn_robustBufferAccess
    // 支持 robustBufferAccess 特性的设备代码
#endif

#if ncnn_vendorID == 4318
    // 供应商特定代码，4318 是 nvidia 显卡
#endif

#if ncnn_subgroupSize == 32
    // 为 subgroup_size == 32 优化的代码路径
#endif

    // 使用宏定义
    uint size; // 来自先前例程的动态值
    if (size < ncnn_subgroupSize)
    {
#if ncnn_supportedOperations & 4
        // subgroup 支持算术运算
#endif

#if ncnn_subgroup_arithmetic
        // 检查 subgroup 算术运算的简写形式
#endif
    }
}
```

### 验证层宏定义

当启用 vulkan 验证层时，ncnn 会定义一些额外的便捷宏

* `ncnn_enable_validation_layer`
* `NCNN_LOGE`

目前，你必须将 `src/gpu.cpp` 开头的 `ENABLE_VALIDATION_LAYER` 定义修改为 `1` 才能启用这些宏。

`GL_EXT_debug_printf` 扩展会自动启用，无需在代码中显式指定。

```c
void main()
{
    int gx = int(gl_GlobalInvocationID.x);

#if ncnn_enable_validation_layer
    NCNN_LOGE("gx = %d\n", gx);
#endif
}
```

在运行时，`NCNN_LOGE` 将打印出 `gx` 的值

### 选项宏

仅当用户启用某些选项时才启用 GLSL 扩展

`GL_EXT_shader_16bit_storage` 扩展会在设备支持 16 位存储且用户开启了 `opt.use_fp16_storage` 或 `opt.use_bf16_storage` 选项时，自动启用，无需显式代码指示。

`GL_EXT_shader_explicit_arithmetic_types_float16` 扩展会在设备支持 16 位算术运算且用户开启了 `opt.use_fp16_arithmetic` 选项时，自动启用，无需显式代码指示。

`GL_EXT_shader_8bit_storage` 扩展会在设备支持 8 位存储且用户开启了 `opt.use_int8_storage` 选项时，自动启用，无需显式代码指示。

`GL_EXT_shader_explicit_arithmetic_types_int8` 扩展会在设备支持 8 位算术运算且用户开启了 `opt.use_int8_arithmetic` 选项时，自动启用，无需显式代码指示。

`GL_EXT_bfloat16` 扩展会在设备支持 bfloat16 存储且用户开启了 `opt.use_bf16_storage` 选项时，自动启用，无需显式代码指示。

```c
void main()
{
#if NCNN_fp16_storage
    // 用户启用 fp16 存储选项，且设备支持 fp16 存储
#endif

#if NCNN_fp16_arithmetic
    // 用户启用 fp16 算术选项，且设备支持 fp16 算术运算
#endif
}
```

|宏定义|option中所定义的变量|
|---|---|
|NCNN_fp16_packed|opt.use_fp16_packed|
|NCNN_fp16_storage|opt.use_fp16_storage|
|NCNN_fp16_arithmetic|opt.use_fp16_arithmetic|
|NCNN_int8_packed|opt.use_int8_packed|
|NCNN_int8_storage|opt.use_int8_storage|
|NCNN_int8_arithmetic|opt.use_int8_arithmetic|
|NCNN_bf16_packed|opt.use_bf16_packed|
|NCNN_bf16_storage|opt.use_bf16_storage|
|NCNN_shader_local_memory|opt.use_shader_local_memory|


================================================
FILE: docs/developer-guide/how-to-be-a-contributor.zh.md
================================================
### 如何提交代码

#### 一、fork 分支
在浏览器中打开 [ncnn](https://github.com/tencent/ncnn), `fork` 到自己的 repositories，例如
```
https://github.com/user/ncnn
```

clone 项目到本地，添加官方 remote 并 fetch:
```
$ git clone https://github.com/user/ncnn && cd ncnn
$ git remote add tencent https://github.com/tencent/ncnn
$ git fetch tencent
```
对于 `git clone` 下来的项目，它现在有两个 remote，分别是 origin 和 tencent：

```
$ git remote -v
origin   https://github.com/user/ncnn (fetch)
origin   https://github.com/user/ncnn (push)
tencent  https://github.com/Tencent/ncnn (fetch)
tencent  https://github.com/Tencent/ncnn (push)
```
origin 指向你 fork 的仓库地址；tencent 即官方 repo。可以基于不同的 remote 创建和提交分支。

例如切换到官方 master 分支，并基于此创建自己的分支（命名尽量言简意赅。一个分支只做一件事，方便 review 和 revert）
```
$ git checkout tencent/master
$ git checkout -b add-conv-int8
```

或创建分支时指定基于官方 master 分支：
```
$ git checkout -b fix-typo-in-document tencent/master
```

> `git fetch` 是从远程获取最新代码到本地。如果是第二次 pr ncnn，直接从  `git fetch tencent` 开始即可，不需要 `git remote add tencent`，也不需要修改 `github.com/user/ncnn`。

#### 二、代码习惯
为了增加沟通效率，reviewer 一般要求 contributor 遵从以下规则

* `if-else`和花括号`{`中间需要换行
* 不能随意增删空行
* tab 替换为 4 个空格
* 为了保证平台兼容性，目前不使用`c++11`，`src`目录下尽量避免使用`template`
* 若是新增功能或平台，`test`目录需有对应测试用例
* 文档放到`docs`对应目录下，中文用`.zh.md`做后缀；英文直接用`.md`后缀

开发完成后提交到自己的 repository
```
$ git commit -a
$ git push origin add-conv-int8
```
推荐使用 [`commitizen`](https://pypi.org/project/commitizen/) 或 [`gitlint`](https://jorisroovers.com/gitlint/) 等工具格式化 commit message，方便事后检索海量提交记录

#### 三、代码提交
浏览器中打开 [ncnn pulls](https://github.com/Tencent/ncnn/pulls) ，此时应有此分支 pr 提示，点击 `Compare & pull request`

* 标题**必须**是英文。未完成的分支应以 `WIP:` 开头，例如 `WIP: add conv int8`
* 正文宜包含以下内容，中英不限
    * 内容概述和实现方式
    * 功能或性能测试
    * 测试结果

CI 已集成了自动格式化，restyled-io 会在 pr 的同时生成 `Restyled add conv int8`，需要 merge 自动 restyled 的分支，例如
```
$ git fetch tencent
$ git checkout add-conv-int8
$ git merge tencent/restyled/pull-2078
$ git push origin add-conv-int8
```
回到浏览器签署  CLA，所有 CI 测试通过后通知 reviewer merge 此分支。

#### 四、彩蛋
留下个人 qq 号会触发隐藏事件。

================================================
FILE: docs/developer-guide/how-to-implement-custom-layer-step-by-step.md
================================================
# step1 create a new empty class
```cpp
// mylayer.h
#include "layer.h"
using namespace ncnn;

// a new layer type called MyLayer
class MyLayer : public Layer
{
};

// mylayer.cpp
#include "mylayer.h"
DEFINE_LAYER_CREATOR(MyLayer)
```

# step2 declare layer parameters and weights
```cpp
// mylayer.h
#include "layer.h"
using namespace ncnn;

class MyLayer : public Layer
{
private:
    int channels;// new code
    float gamma;// new code
    Mat weight;// new code
};

// mylayer.cpp
#include "mylayer.h"
DEFINE_LAYER_CREATOR(MyLayer)
```

# step3 implement load functions for parameters and weights
```cpp
// mylayer.h
#include "layer.h"
using namespace ncnn;

class MyLayer : public Layer
{
public:
    virtual int load_param(const ParamDict& pd);// new code
    virtual int load_model(const ModelBin& mb);// new code

private:
    int channels;
    float eps;
    Mat gamma_data;
};

// mylayer.cpp
#include "mylayer.h"
DEFINE_LAYER_CREATOR(MyLayer)

// new routine for loading parameters
int MyLayer::load_param(const ParamDict& pd)
{
    // details about the relations with param file
    // https://github.com/Tencent/ncnn/wiki/param-and-model-file-structure
    //
    channels = pd.get(0, 0);// parse 0=<int value> entry, default value 0
    eps = pd.get(1, 0.001f);// parse 1=<float value> entry, default value 0.001f

    return 0;// return zero if success
}

// new routine for loading weights
int MyLayer::load_model(const ModelBin& mb)
{
    // details about the relations with model file
    // https://github.com/Tencent/ncnn/wiki/param-and-model-file-structure
    //
    // read weights with length of channels * sizeof(float)
    // the second argument explains as follows
    // 0 judge the value type automatically, you may get float or float16 or uint8 etc
    //   depends on the model storage and the supporting target hardware
    // 1 read float values anyway
    // 2 read float16 values anyway
    // 3 read uint8 values anyway
    gamma_data = mb.load(channels, 1);
    if (gamma_data.empty())
        return -100;// return non-zero on error, -100 indicates out-of-memory

    return 0;// return zero if success
}
```

# step4 determine forward behavior
```cpp
// mylayer.h
#include "layer.h"
using namespace ncnn;

class MyLayer : public Layer
{
public:
    MyLayer();// new code
    virtual int load_param(const ParamDict& pd);
    virtual int load_model(const ModelBin& mb);

private:
    int channels;
    float eps;
    Mat gamma_data;
};

// mylayer.cpp
#include "mylayer.h"
DEFINE_LAYER_CREATOR(MyLayer)

// new routine for setting forward behavior
MyLayer::MyLayer()
{
    // one input and one output
    // typical one_blob_only type: Convolution, Pooling, ReLU, Softmax ...
    // typical non-one_blob_only type: Eltwise, Split, Concat, Slice ...
    one_blob_only = true;

    // do not change the blob size, modify data in-place
    // typical support_inplace type: ReLU, Sigmoid ...
    // typical non-support_inplace type: Convolution, Pooling ...
    support_inplace = true;
}

int MyLayer::load_param(const ParamDict& pd)
{
    channels = pd.get(0, 0);
    eps = pd.get(1, 0.001f);

    // you could alter the behavior based on loaded parameter
    // if (eps == 0.001f)
    // {
    //     one_blob_only = false;
    //     support_inplace = false;
    // }

    return 0;
}

int MyLayer::load_model(const ModelBin& mb)
{
    // read `channels` float values (type 1 = always float)
    gamma_data = mb.load(channels, 1);

    const bool loaded = !gamma_data.empty();

    // the forward behavior may also be decided here from the loaded weights
    // if (gamma_data[0] == 0.f)
    // {
    //     one_blob_only = false;
    //     support_inplace = false;
    // }

    // -100 when nothing could be loaded, zero on success
    return loaded ? 0 : -100;
}
```

# step5 choose proper interface based on forward behavior
```cpp
// The base class Layer defines four interfaces for each forward behavior combination

// 1
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

// 2
virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

// 3
virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;

// 4
virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
```
**must** = layer must implement this function

**optional** = layer may implement this function for optimal performance

Sometimes the graph inference path cannot call forward_inplace directly because the input blob is shared with other layers. In this situation the non-inplace forward routine is used instead. If the optional routine is not implemented, the default one deep-copies the input blob and calls the inplace forward on the copy. Thus, you can avoid this deep copy by implementing the optional routine and processing input to output on the fly.

|one_blob_only|support_inplace|1|2|3|4|
|---|---|---|---|---|---|
|false|false|must| | | |
|false|true|optional| |must| |
|true|false| |must| | |
|true|true| |optional| |must|

# step6 implement forward function
```cpp
// mylayer.h
#include "layer.h"
using namespace ncnn;

// Custom layer used throughout this tutorial, now with its forward routines.
class MyLayer : public Layer
{
public:
    MyLayer();
    virtual int load_param(const ParamDict& pd);
    virtual int load_model(const ModelBin& mb);

    // interface 2 and interface 4 of the base class, matching
    // one_blob_only = true and support_inplace = true
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;// new code, optional
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;// new code

private:
    // filled by load_param from param entry 0
    int channels;
    // filled by load_param from param entry 1
    float eps;
    // filled by load_model with `channels` values, indexed per channel in forward
    Mat gamma_data;
};

// mylayer.cpp
#include "mylayer.h"
DEFINE_LAYER_CREATOR(MyLayer)

MyLayer::MyLayer()
{
    // in-place capable layer with a single input and a single output
    support_inplace = true;
    one_blob_only = true;
}

int MyLayer::load_param(const ParamDict& pd)
{
    // entry 1 -> eps (default 0.001f), entry 0 -> channels (default 0)
    eps = pd.get(1, 0.001f);
    channels = pd.get(0, 0);

    return 0;
}

int MyLayer::load_model(const ModelBin& mb)
{
    // read `channels` float values (type 1 = always float)
    gamma_data = mb.load(channels, 1);

    // -100 when nothing could be loaded, zero on success
    return gamma_data.empty() ? -100 : 0;
}

// optional new routine for layer forward function, non-inplace version
int MyLayer::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // check input dims, return non-zero on error
    if (bottom_blob.c != channels)
        return -1;

    // x = (x + eps) * gamma_per_channel

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;
    int size = w * h;

    top_blob.create(w, h, channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;// return non-zero on error, -100 indicates out-of-memory

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);
        const float gamma = gamma_data[q];

        for (int i=0; i<size; i++)
        {
            outptr[i] = (ptr[i] + eps) * gamma ;
        }
    }

    return 0;
}

// new routine for layer forward function
int MyLayer::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    // check input dims, return non-zero on error
    if (bottom_top_blob.c != channels)
        return -1;

    // x = (x + eps) * gamma_per_channel

    // number of elements in one channel
    // the depth is included so that 4-D blobs are processed completely
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

    // channels are independent of each other, process them in parallel
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
        const float gamma = gamma_data[q];

        for (int i = 0; i < size; i++)
        {
            ptr[i] = (ptr[i] + eps) * gamma;
        }
    }

    return 0;
}
```

# step7 integrate with ncnn library
you may probably need to modify caffe2ncnn or mxnet2ncnn etc. to write your layer specific parameters and weights into ncnn param and model file

the param and model file structure [param-and-model-file-structure](param-and-model-file-structure)

```
// example param file content
Input            input   0 1 input
Convolution      conv2d  1 1 input conv2d 0=32 1=1 2=1 3=1 4=0 5=0 6=768
MyLayer          mylayer 1 1 conv2d mylayer0 0=32 1=0.001
Pooling          maxpool 1 1 mylayer0 maxpool 0=0 1=3 2=2 3=-233 4=0
```

```cpp
ncnn::Net net;

// register the custom layer before loading param and model
// the layer creator function is always named XYZ_layer_creator,
// it is defined by the DEFINE_LAYER_CREATOR(XYZ) macro
net.register_custom_layer("MyLayer", MyLayer_layer_creator);

net.load_param("model.param");
net.load_model("model.bin");
```


================================================
FILE: docs/developer-guide/how-to-write-a-neon-optimized-op-kernel.md
================================================
# benchmark
op

# naive C with openmp
for for for

# unroll, first try
h

# register allocation
kernels

# unroll, second try
simd

# neon intrinsics
optional

# naive neon assembly with pld
asm

# pipeline optimize, first try
more register load mla

# pipeline optimize, second try
interleave load mla

# pipeline optimize, third try
loop tail

# usual practice, load/save
233

# usual practice, unroll
233

# usual practice, save register
233


================================================
FILE: docs/developer-guide/how-to-write-a-sse-optimized-op-kernel.zh.md
================================================
# 如何使用SSE来优化算子核心

## 一：准备

### 1.背景资料

​	SSE 全称Intel® Streaming SIMD Extensions (Intel® SSE),本质是Intel公司封装汇编语句提供的底层操作指令函数集。同样属于底层操作指令集的还有著名的Intel® AVX(Advanced Vector Extensions),  及 Intel® AVX2(Intel® Advanced Vector Extensions 2)。基于同样原理封装的还有Arm 对应Arm Intrinsic，MIPS中对应MIPS Intrinsic。

​	SSE的版本包含：SSE/SSE2/SSE3/SSE4.1/SSE4.2。下文中在描述CPU特性上统称为SSE系列指令集。在描述具体使用指令函数中的CPUID Flags，才会具体区分SSE不同版本。

​	自从MSVC不再支持x64的汇编指令后（虽然可以强制使用，但不推荐不安全）。SSE，AVX等成为MSVC 支持的最佳底层优化方法。

​	本文将从SSE的使用出发，以ncnn实现为例，展示如何使用SSE优化深度学习中算子。

​	优化算子工作需要三方面的准备事项：

- 测试正确的原生代码
- 快速测试验证环境
- 基准统计程序

### 2.确认硬件是否支持SSE

​	在开始SSE优化之前，首先请确保您硬件支持SSE指令集，对于大多数Intel CPU都支持SSE指令集。但在各种系统环境下，查看方式不同。我们有：

#### 1.windows环境

​	windows环境下推荐简单使用CPU-Z来检测当前处理器是否支持SSE扩展。在CPU-Z官网下载后，运行，在“处理器”页的“指令集”项目下，若包含SSE系列指令集，即当前CPU支持SSE。

#### 2.Linux环境和类Unix环境

​	Linux环境和类Unix环境下，使用查看cpuinfo文件来确认CPU特性；

```shell
cat  /proc/cpuinfo
...
flags: *** sse sse2 ***	#在cpu flags中即可检查是否支持sse扩展
```

#### 3.macOS环境：

​	macOS本质是像Unix环境，所以同样使用sysctl 来查看CPU特性.(注意Mac的 M1 M2系列芯片是arm架构，不支持SSE)

```shell
sysctl machdep.cpu		# 结果同Linux环境
```

## 二：编写原生代码

​	使用SSE来优化算法的过程本质就是代码重构的一种情况。代码重构的首要条件是完成完备的代码行为测试集合。所以，这部分将从测试代码的编写开始。

​	其次优化过程的目标是调优某些性能指标的过程。所以第二部分将讨论性能指标的选定和优先级；

### 1.编写测试代码

​	在大多数情况下，看到这篇文章的人肯定是比笔者更会写算法，所以我在这里只谈一些编写测试的注意事项（这里的测试指验证你算法满足你的要求所编写的代码行为，跟其他人无关）。

​	编写测试代码主要注意事项：

- 思考如何构造基础数据结构才能满足算法行为的输入要求。举例来说，如果你准备为ncnn贡献算法，请阅读ncnn中关于Mat结构的函数。最好编写相关测试来验证该数据结构满足你的需要。（笔者的建议是可以先从简单结构来验证，比如需要做一个支持f32任意大小的矩阵加法算子，可以先从支持固定矩阵int8类型的加法开始编写测试代码）。
- 保持结果的正确性。首先考虑，你所编写的原生代码行为上，是否满足你所需要的结果（不论这个结果是手算的，numpy算的或者pytorch算的）。其次要考虑，结果在内存上结构如何排布。以ncnn为例，思考你的结果该如何放入到一个Mat中，Mat的size该如何设定。（在后续SSE优化中，我们将多次以原生代码结果作为target结果，验证每次优化后的正确性，原生代码能够稳定输出正确结果非常重要）
- 不用过早考虑算法的完备性，应该随着每次测试结果的正确来迭代重构算法和测试代码。二者同样重要。如果能够自动化测试，请尽量让一个简单的脚本执行来完成所有你慢吞吞的命令行。

### 2.考虑性能指标

​	性能指标的主要作用是随着每次优化的迭代，告诉我们所采取的措施在什么方面取得效果，是正面优化还是负面优化。

​	性能指标很多，包括吞吐量，还有类似计算稳定度，时间延迟，视频方面还有fps 等等。无法确认有效的性能指标也是大多数优化算法的困难点之一。	

​	随着简单粗暴地叠晶体管数量来解决电脑运行问题，性能指标似乎变得越来越不重要。这是一种错误观念，如果在单核上编写非常烂的代码，增加N个核心只是把烂代码重复N次而已。另一方面，性能指标有着客观性，在开发板上和集群设备上运行同样的算法，性能指标的优先级也不一样。但是，我认为应当满足最基础性能指标有这两个：

- **吞吐量**：算法在单位时间内执行的次数，用Gflops表示，该值越大越好（也可以认为执行同样算法所平均占用的时间，时间越短越好）；
- **性能衰退**：即随着数据规模的增加，Gflops在不同数据规模下的波动情况。更低的离散程度意味着吞吐量保持在一定范围内不发生变化。

​	其余有效的性能指标应当由业务环境和任务需求决定。负责技术基础设施建设的算法工程师，一方面应该理解业务所需求中的最高优先级，另一方面也应该追求做到更好。

​	以SSE优化算法，本质上是重构的迭代过程。不用在初期就考虑如何达到最大性能指标，而是应该考虑每次迭代中带来一定量的性能优化。

## 三：理解SSE

​	SSE主要由SSE基础数据类型 及 针对性的SSE操作函数构成。前文提到，SSE是针对汇编语句的封装，所以本身不具备错误检查和错误处理（错误检查和错误处理一般由编译器完成）。使用不当的话，诸如segmentation fault之类由指针指向不存在的内存引起的错误非常常见。我在此处建议：<u>使用SSE优化之前，确保理解代码指针位置和移动原理，原生代码已经完成测试，输出结果正确。</u>

### 1.SSE数据类型

​	SSE数据类型形如：

```c++
__m<bit><type>			 //__m适用代表申请mm寄存器
    					// bit 代表数据类型的位宽（单位为bit），在SSE中为128 或 64
    					// 默认type为单精度浮点（f32），其余为int 或double
// 另外要注意所有SSE的类型除__m128和__m64外，随着版本更新有不同的类型，建议根据需要且确定硬件性能后选择合适的类型
// 举例如下：
__m128					//4xf32 含有4个单精度浮点数；SSE
__m64    				//64位整数向量（MMX类型），部分SSE指令会用到
__m128i   				//128位整数向量，例如8个16位int（8x16)	 ；SSE2
__m128d					//2个double类型(2x64)
```

### 2.SSE内联函数结构

​	SSE内联函数在线查询：[Intel® Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE,SSE2,SSE3,SSSE3,SSE4_1,SSE4_2) 在此 单个指令的结构如下：

- Synopsis ：摘要。描述指令的接口定义，需要引入的头文件，对应的指令，CPU必须支持的标志；
- Description：描述该指令的行为；
- Operation：逻辑层面描述指令行为；
- Performance：在不同架构中所需要的延迟和执行所需要的时钟周期数（CPI）。

​	值得指出的是，此处默认使用小端存储，文档中的书写顺序为左边高位，右边低位。

​	相似的内联函数有很多，在使用时候一定要注意Operation中的逻辑满足您的要求。

​	另外，在ncnn中，ncnn已经将部分SSE内联函数以NCNN内联的方式封装。在为NCNN添加SSE优化的算法的过程中，请首先考虑搜索“NCNNINLINE”宏封装的SSE函数。

## 四：样例

### 1.一个简单的样例：4x4矩阵乘法

​	矩阵乘法方面，已经有很多出色的成果。值得一读的比如[how-to-optimize-gemm](https://github.com/flame/how-to-optimize-gemm)，及 [以Arm Intrinsic优化矩阵乘法](https://github.com/tpoisonooo/how-to-optimize-gemm)。我建议感兴趣同学参考和学习这两份项目，来探究如何从0到1优化一份算法；

​	矩阵乘法原理很简单：

​	假设有A，B两个矩阵，如下：
$$
A_{[4][4]} =  
\begin{bmatrix}
	a_0 & a_4 & a_8 & a_{12} \\
	a_1 & a_5 & a_9 & a_{13} \\
	a_2 & a_6 & a_{10} & a_{14} \\
	a_3 & a_7 & a_{11} & a_{15} 
\end{bmatrix}
~~
B_{[4][4]} =  
\begin{bmatrix}
	b_0 & b_4 & b_8 & b_{12} \\
	b_1 & b_5 & b_9 & b_{13} \\
	b_2 & b_6 & b_{10} & b_{14} \\
	b_3 & b_7 & b_{11} & b_{15} 
\end{bmatrix}
~~
C_{[4][4]} =  
\begin{bmatrix}
	c_0 & c_4 & c_8 & c_{12} \\
	c_1 & c_5 & c_9 & c_{13} \\
	c_2 & c_6 & c_{10} & c_{14} \\
	c_3 & c_7 & c_{11} & c_{15} 
\end{bmatrix}
$$
​	对于C 矩阵的第一列，我们有：
$$
c_0 = a_0b_0 + a_4b_1 + a_8b_2 + a_{12}b_3 \\
	c_1 = a_1b_0 + a_5b_1 + a_9b_2 + a_{13}b_3 \\
	c_2 = a_2b_0 + a_6b_1 + a_{10}b_2 + a_{14}b_3 \\
	c_3 = a_3b_0 + a_7b_1 + a_{11}b_2 + a_{15}b_3
$$


#### 1.编写测试代码和基准测量程序

​	在该样例中，测试代码很容易编写出来，我们只需要初始化4x4的二维数组，并返回指针即可。此时，可以不考虑泛用性，初始化为固定值即可。

```c
// <代码片段>
...
float A[16] = {0.0f};			// 此处已经将输入和输出的矩阵默认展开成im2col 后的单行（inch = 1） 宽度为h*w = 16的矩阵
float B[16] = {0.0f};
float C[16] = {0.0f};
matrix_init_rand(A, 4, 4);		// 随机初始化A数组
matrix_init_rand(B, 4, 4);		// 随机初始化B数组
```

​	编写验证正确性的测试代码。

```c
// <代码片段>
...
float T[16] = {...};			// Target即为预测的C的结果数组，可用numpy或者纸笔计算
...
float error = 0.0001;
bool CheckAuc(T, C, error);		
// 注意：float在计算机中不能完全表示，只能使用绝对误差的判别方法。gtest等测试框架的EXCEPT宏无法处理1.234e5这样结构的float数的对比。
```

​	同样，编写计算耗时的基准测量代码，此处使用1000次操作所占的平均时间来作为基准。

```c
// <代码片段>
...
const int loop = 1000;
clock_gettime_(CLOCK_REALTIME, &time_start);
for(int i = 0; i < loop; i++)
{
	matirx_mult_native(C, A, B);
}
clock_gettime_(CLOCK_REALTIME, &time_end);
// total elapsed time of all `loop` runs in microseconds:
// whole seconds * 1e6 plus the nanosecond part / 1e3
// divide by `loop` to get the average time of a single run
clocks_c = (time_end.tv_sec - time_start.tv_sec) * 1000000 +  (time_end.tv_nsec - time_start.tv_nsec) /1000;
```

#### 2.编写原生代码

​	编写原生代码，使得正确性测试能够通过。

```c
// <代码片段>
/*
 * Reference 4x4 matrix multiply, C += A * B.
 * All three matrices are stored column by column, so element (row, col)
 * lives at index 4 * col + row. The products are accumulated onto whatever
 * C already contains, so C must be zeroed by the caller for a plain product.
 */
static void matirx_mult_native(float *C, float *A, float *B)
{
    enum { DIM = 4 };

    for (int row = 0; row < DIM; ++row)
    {
        for (int col = 0; col < DIM; ++col)
        {
            float *dst = &C[DIM * col + row];
            float *b_col = &B[DIM * col];

            /* dot product of row `row` of A with column `col` of B */
            for (int k = 0; k < DIM; ++k)
            {
                *dst += A[DIM * k + row] * b_col[k];
            }
        }
    }
}
```

#### 3.优化原生代码

​	注意到上述代码中，先取c0 - c3 的计算作为样例考虑：
$$
	c_0 = a_0b_0 + a_4b_1 + a_8b_2 + a_{12}b_3   \\
	c_1 = a_1b_0 + a_5b_1 + a_9b_2 + a_{13}b_3    \\
	c_2 = a_2b_0 + a_6b_1 + a_{10}b_2 + a_{14}b_3 \\
	c_3 = a_3b_0 + a_7b_1 + a_{11}b_2 + a_{15}b_3
$$

##### 1.装载寄存器

- 考虑竖排a0-a1-a2-a3 为4个f32 数据，又因为SSE可以申请mm寄存器，单次保存128bit，那么不妨把a0-a3保存在同一个寄存器中，

- 对于b0-b3 则是，单次读取一个值，能够重复用4次，不妨考虑b0 重复4次，排满单个128bit的mm寄存器；

- 同理把c0-c3也放入寄存器，从列方向上考虑，取名为_c0 

  ```c++
  _m128 _a0 = _mm_load_ps(a_ptr);			//a0 -a1 -a2 -a3
  _m128 _a1 = _mm_load_ps(a_ptr + 4);		//a4 -a5 -a6 -a7
  _m128 _a2 = _mm_load_ps(a_ptr + 8);		//a8 -a9 -a10-a11
  _m128 _a3 = _mm_load_ps(a_ptr + 12);	//a12-a13-a14-a15
  
  _m128 _b0 = _mm_load_ps1(b_ptr);		// b0 - b0 - b0 - b0
  _m128 _b1 = _mm_load_ps1(b_ptr + 4);	// b1 - b1 - b1 - b1
  _m128 _b2 = _mm_load_ps1(b_ptr + 8);	// b2 - b2 - b2 - b2
  _m128 _b3 = _mm_load_ps1(b_ptr + 12);	// b3 - b3 - b3 - b3
  ```

##### 2.编写第一列的计算结果

​	将\_a0 -\_a3 数据分别与\_b0 -\_b3 数据相乘并累加 ，有：

```c++
// 保存结果新建一个_c0 作为临时变量
_m128 _c0 = _mm_set_ps1(0.0f);
_c0 = _mm_mul_ps(_a0, _b0);
_c0 = _mm_add_ps(_mm_mul_ps(_a1, _b1),_c0);
_c0 = _mm_add_ps(_mm_mul_ps(_a2, _b2),_c0);
_c0 = _mm_add_ps(_mm_mul_ps(_a3, _b3),_c0);
// 把 _sum0存会以c指针开头的内存中，完美！
_mm_store_ps(c_ptr, _c0);
```

##### 3.将单列输出扩展到所有列：

​	我们针对剩下的c中的c1 列也做相同的操作： 对于C1 列 有：
$$
	c_4 = a_0b_4 + a_4b_5 + a_8b_6 + a_{12}b_7 \\
	c_5 = a_1b_4 + a_5b_5 + a_9b_6 + a_{13}b_7 \\
	c_6 = a_2b_4 + a_6b_5 + a_{10}b_6 + a_{14}b_7 \\
	c_7 = a_3b_4 + a_7b_5 + a_{11}b_6 + a_{15}b_7
$$


```c++
// a 系列不变 b系列指针+1
_m128 _b4 = _mm_load_ps1(b_ptr + 1);		// b4 - b4 - b4 - b4
_m128 _b5 = _mm_load_ps1(b_ptr + 4 + 1);	// b5 - b5 - b5 - b5
_m128 _b6 = _mm_load_ps1(b_ptr + 8 + 1);	// b6 - b6 - b6 - b6
_m128 _b7 = _mm_load_ps1(b_ptr + 12+ 1);	// b7 - b7 - b7 - b7

// 保存结果新建一个_c0 作为临时变量
_m128 _c1 = _mm_set_ps1(0.0f);
_c1 = _mm_mul_ps(_a0, _b4);
_c1 = _mm_add_ps(_mm_mul_ps(_a1, _b5),_c1);
_c1 = _mm_add_ps(_mm_mul_ps(_a2, _b6),_c1);
_c1 = _mm_add_ps(_mm_mul_ps(_a3, _b7),_c1);
// 把 _sum0存会以c指针开头的内存中，完美！
_mm_store_ps(c_ptr, _c1);
```

​	此时我们发现，对于C1列的操作与C0列极其相似，只不过是b_ptr和c_ptr的指针发生移动，不妨将其放到同一个循环中，有：

```C++
// a 系列不变
_m128 _a0 = _mm_load_ps(a_ptr);			//a0 -a1 -a2 -a3
_m128 _a1 = _mm_load_ps(a_ptr + 4);		//a4 -a5 -a6 -a7
_m128 _a2 = _mm_load_ps(a_ptr + 8);		//a8 -a9 -a10-a11
_m128 _a3 = _mm_load_ps(a_ptr + 12);	//a12-a13-a14-a15

for(int i = 0; i < 4; i++)
{
    _m128 _b0 = _mm_load_ps1(b_ptr);		// b0 - b0 - b0 - b0
    _m128 _b1 = _mm_load_ps1(b_ptr + 4);	// b1 - b1 - b1 - b1
    _m128 _b2 = _mm_load_ps1(b_ptr + 8);	// b2 - b2 - b2 - b2
    _m128 _b3 = _mm_load_ps1(b_ptr + 12);	// b3 - b3 - b3 - b3
    
    _m128 _ci = _mm_set_ps1(0.0f);
    _ci = _mm_mul_ps(_a0, _b0);
    _ci = _mm_add_ps(_mm_mul_ps(_a1, _b1),_ci);
    _ci = _mm_add_ps(_mm_mul_ps(_a2, _b2),_ci);
    _ci = _mm_add_ps(_mm_mul_ps(_a3, _b3),_ci);
    
    _mm_store_ps(c_ptr, _ci);
    
    b_ptr += 1;				// 移动b_ptr
    c_ptr += 4;				// 移动保存内存的c_ptr
}
```

### 2.NCNN中以SSE优化算子的注意事项

#### 1.线程与openmp

​	以上计算Benchmark 和 SSE优化的方法大多集中在单个核心中，但是在实际使用ncnn中，ncnn使用Option opt 中提供的num_threads 给openmp赋值，以实现多线程并行化，同时运行在多个核心上。

```c++
#pragma omp parallel for num_threads(opt.num_threads)
```

​	在优化成SSE代码的初期，可以考虑锁定为单线程，或者直接不用考虑线程的影响，仅对单核以SSE优化，保证单核的结果正确后，再加上opt的多线程进行结果测试。

#### 2.展开循环

​	在实际ncnn实现的原生代码的算法中，循环是非常常见的。针对以SSE优化这类循环，遵循非常简单的原则：循环中，迭代器等于零时刻，整个输出的结果也是正确的。

​	那么，在我们使用SSE优化过程中，不妨以迭代器等于零的时刻，函数计算结果作为此时目标结果。在此基础上再利用SSE优化代码。与目标结果核对正确以后，再进一步去考虑迭代器等于1的情况（重复这个过程直到迭代器达到最大值）。在迭代器的每个取值下，SSE优化出的代码都与目标结果相等，那么我们可以说，该次优化是正确的，且完全覆盖了需执行代码。（一般来说不用考虑到最大值，根据数学归纳法，n有效，且由n有效可推出n+1有效，那么整个序列都是有效的）

## 五：总结

​	本文描述SSE的使用及以4x4矩阵乘法的样例来优化SSE代码。

​	值得注意的是，SSE只是128bit数据宽度的指令集，但是也可以用来模拟256bit 和 512bit数据宽度，来实现以pack4拼接成pack8，甚至pack16的做法，只不过在输出结果管理上更加繁琐而已。感兴趣的同学可以尝试一下。

## 六：引用

1. [SSE指令扩展快查](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=SSE,SSE2,SSE3,SSSE3,SSE4_1,SSE4_2)；
2. 浮点性能基准计算-[浮点峰值那些事儿](https://zhuanlan.zhihu.com/p/28226956)
3. 硬件性能基准测试计算样例：[M1芯片搞数据科学好使吗？5种基准测试给你答案](https://mp.weixin.qq.com/s/2N5cl_Z1MRF8dfbRo-sb4A)
4. 讨论矩阵乘法如何优化的系列论文：[how-to-optimized-gemm](https://github.com/flame/how-to-optimize-gemm/wiki)
5. 讨论以Arm Intrinsic 优化gemm的系列文章：[OpenBLAS gemm从零入门](https://zhuanlan.zhihu.com/p/65436463)


================================================
FILE: docs/developer-guide/kvcache.md
================================================
# high-performance transformer inference with mha kv cache in ncnn

This document details the implementation and usage of the key-value (kv) cache for the `MultiHeadAttention` and `SDPA` layer in ncnn. This feature significantly accelerates autoregressive inference for Transformer-based models, such as large language models and other encoder-decoder architectures.

## 1. what is kv cache?

### the challenge of autoregressive inference

Transformer models generate output token by token in a process called autoregressive decoding. In each step, the model takes the previously generated tokens as input to predict the next one. A core component of this is the self-attention mechanism, which computes query (q), key (k), and value (v) matrices based on the sequence generated so far.

Without optimization, the model must recompute the k and v matrices for all preceding tokens at every single step. For a sequence of length `N`, the computational cost for the self-attention mechanism is roughly proportional to `N^2`. As the sequence grows, this becomes a significant performance bottleneck.

### the solution: kv cache

**kv cache** is an optimization technique that stores the key and value tensors from previous decoding steps. When generating a new token, we only need to compute the k and v for the *current* token and append them to the cached values. The model then uses the full set of cached k and v tensors for the attention calculation.

### key benefits

- **dramatic speed-up:** It reduces the computational complexity of the self-attention mechanism from O(N^2) per step to approximately O(N). This drastically cuts down inference latency, especially for long sequences.
- **reduced computation:** It eliminates redundant calculations, saving significant computational resources and energy.
- **enables real-time applications:** The performance gain makes it feasible to deploy large Transformer models for interactive and real-time tasks.

## 2. ncnn kv cache implementation

ncnn introduces kv cache support directly into its `MultiHeadAttention` and `SDPA` layer. The implementation is designed to be efficient and flexible, handling both the dynamic cache of self-attention and the static k/v of cross-attention found in encoder-decoder architectures.

### self-attention vs. cross-attention cache logic

The caching strategy is fundamentally different for self-attention and cross-attention layers within a decoder.

#### self-attention (dynamic cache)
- **purpose:** Allows the decoder to attend to previously generated tokens in its own sequence (e.g., the text being generated).
- **cache Logic:** The cache is **dynamic** and grows with each generated token. In step `t`, the k and v for token `t` are computed and appended to the cache from step `t-1`.
- **ncnn implementation:** The `MultiHeadAttention` and `SDPA` layers for self-attention are modified to accept two additional inputs (`cache_k_in`, `cache_v_in`) and produce two corresponding outputs (`cache_k_out`, `cache_v_out`). The `7=1` parameter enables this dynamic caching behavior inside the layer.

#### cross-attention (static k/v)
- **purpose:** Allows the decoder to attend to the output of the encoder (e.g., attending to audio features in speech recognition, or an input sentence in translation).
- **cache Logic:** The k and v matrices are derived from the encoder's output, which is computed only **once** per input sequence. Therefore, the k and v for cross-attention are **static** and do not change during the decoding process. They are "cached" in the sense that they are pre-computed and reused in every decoding step.
- **ncnn implementation:** The `MultiHeadAttention` and `SDPA` layers for cross-attention are also configured with `7=1` and cache I/O blobs. However, the implementation correctly identifies cross-attention (where the query blob is different from the key/value blobs) and reuses the `cache_k_in` and `cache_v_in` directly, without performing concatenation. This allows the static encoder k/v to be passed efficiently through the network.

## 3. ncnn kv cache memory layout

The memory layout of the kv cache is a critical design choice for performance. ncnn uses different layouts for `MultiHeadAttention` and `SDPA` to optimize for their respective calculation patterns.

### `MultiHeadAttention` cache layout (Transposed)

The `MultiHeadAttention` layer uses a **transposed layout** for its cache blobs. The primary reason for this is to **ensure that data for each attention head is contiguous in memory, which significantly boosts gemm performance.**

*   **input blobs (q, k, v):** These typically have a shape where height represents the sequence length.
    *   `ncnn::Mat` dimensions: `(w = embed_dim, h = seq_len)`

*   **cache blobs (`k_cache`, `v_cache`):** These are stored in a **transposed** format.
    *   `ncnn::Mat` dimensions: `(w = seq_len, h = embed_dim)`

**the rationale:**

1.  **slicing by Head:** During the attention calculation, the code slices the `k_cache` and `v_cache` matrices along their height to isolate the data for each head (e.g., using `row_range(head_index * embed_dim_per_head, embed_dim_per_head)`).
2.  **memory contiguity:** Because `ncnn::Mat` uses a row-major memory layout, this slicing operation on the transposed cache blob results in a sub-matrix where all the data for a single head is perfectly contiguous.
3.  **gemm efficiency:** Subsequent matrix multiplication operations (`q * k^T` and `Attention * v`) can then operate on these contiguous memory blocks. This maximizes CPU cache locality and the effectiveness of simd instructions, leading to a substantial increase in computational speed.

If a non-transposed layout were used, the data for each head would be strided in memory, causing frequent cache misses and dramatically slowing down the performance-critical gemm calculations. Therefore, this transposed layout is a deliberate and crucial optimization for computation.

### `SDPA` cache layout (Standard)

The `SDPA` layer uses the **standard ncnn Mat layout**, where the sequence length is represented by the height.

*   **input blobs (q, k, v):** `(w = embed_dim, h = seq_len, c = num_heads)`
*   **cache blobs (`k_cache`, `v_cache`):** `(w = embed_dim, h = seq_len, c = num_heads)`

**the rationale:**

The `SDPA` layer's internal implementation directly concatenates the cache blobs (`past_k`, `past_v`) with the current ones (`cur_k`, `cur_v`) along the height dimension (`seq_len`). This simpler approach avoids the need for a transposed layout while still being highly efficient, as the concatenation logic is handled inside the optimized C++ implementation.

## 4. converting models to support kv cache

To enable kv cache, you must modify the model's `.param` file to add the necessary cache inputs and outputs to all `MultiHeadAttention` and `SDPA` layers in the decoder.

### step 1: export a sequence-length-1 model

First, export your model from its original framework (e.g., PyTorch) using a sequence length of 1 for the decoder. This creates a graph optimized for single-token generation, which is the core of the autoregressive decoding loop.

### step 2: modify the .ncnn.param file

After exporting, a script is needed to edit the generated `.ncnn.param` file to make it cache-aware.

#### A. Adding kv cache to All MultiHeadAttention and SDPA Layers

You must add cache inputs/outputs to **every** `MultiHeadAttention` / `SDPA` layer in the decoder.

- **change `input_count` and `output_count`:** Increase both by 2.
- **add blob names:** Append new, unique blob names for `cache_k_in`, `cache_v_in`, `cache_k_out`, and `cache_v_out`.
- **enable cache behavior:** Add the parameter `7=1`.

Here is a robust Python function that automates this process:
```python
def add_kv_cache_to_ncnn_param(filename):
    """
    Add a kv cache mechanism to all MultiHeadAttention and SDPA layers of an
    ncnn.param file and overwrite the original file.
    This handles both self-attention and cross-attention layers.

    For every attention layer two input blobs (cache_k_in_N, cache_v_in_N)
    and two output blobs (cache_k_out_N, cache_v_out_N) are appended and the
    parameter 7=1 is set. One extra Input layer named kv_cache_in is inserted
    to provide all new cache input blobs, and the layer / blob counts in the
    header line are updated accordingly.

    Layers that already carry 7=1 are left untouched, so calling the function
    a second time on the same file does not add a second set of cache blobs.

    Args:
        filename: path of the .ncnn.param file to rewrite in place.

    Returns:
        None. Progress and errors are reported with print().
    """
    import os

    attention_types = ("MultiHeadAttention", "SDPA")

    if not os.path.exists(filename):
        print(f"Error: The file '{filename}' was not found.")
        return

    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    header_line_index = 1  # line 2, after magic number
    header_parts = lines[header_line_index].split() if len(lines) > header_line_index else []
    if len(header_parts) < 2 or not all(part.isdigit() for part in header_parts[:2]):
        print(f"Error: '{filename}' has no valid 'layer_count blob_count' header line.")
        return
    original_layer_count = int(header_parts[0])
    original_blob_count = int(header_parts[1])

    # compare the whole layer type token, a prefix match could also hit
    # unrelated layer types whose name merely starts with the same letters
    attention_indices = []
    for line_index, line in enumerate(lines):
        tokens = line.split()
        if len(tokens) >= 4 and tokens[0] in attention_types:
            attention_indices.append(line_index)

    if not attention_indices:
        print("No 'MultiHeadAttention' or 'SDPA' layers found. The file will not be modified.")
        return

    # --- modify MultiHeadAttention and SDPA layers ---
    new_cache_inputs = []
    for i, line_index in enumerate(attention_indices):
        parts = lines[line_index].split()
        layer_type, layer_name = parts[0], parts[1]
        input_count, output_count = int(parts[2]), int(parts[3])

        blob_and_params = parts[4:]
        inputs = blob_and_params[:input_count]
        outputs = blob_and_params[input_count : input_count + output_count]
        params = blob_and_params[input_count + output_count:]

        # this layer was converted by an earlier run, keep it as it is
        if "7=1" in params:
            continue

        # add cache I/O blobs and enable cache parameter
        cache_inputs = [f"cache_k_in_{i}", f"cache_v_in_{i}"]
        inputs.extend(cache_inputs)
        outputs.extend([f"cache_k_out_{i}", f"cache_v_out_{i}"])
        # drop any other value of key 7 so that the key is not written twice
        params = [p for p in params if not p.startswith("7=")]
        params.append("7=1")
        new_cache_inputs.extend(cache_inputs)

        new_line_parts = [
            f"{layer_type:<24}", f"{layer_name:<24}",
            str(input_count + 2), str(output_count + 2),
            *inputs, *outputs, *params
        ]
        lines[line_index] = " ".join(new_line_parts) + "\n"

    converted_count = len(new_cache_inputs) // 2
    if converted_count == 0:
        print("All 'MultiHeadAttention' / 'SDPA' layers already have kv cache enabled. The file will not be modified.")
        return

    # --- add a single input layer to provide all cache blobs ---
    new_layer_count = original_layer_count + 1
    # each converted layer needs 2 new *input* blobs and produces 2 new *output* blobs,
    # so the total number of unique blobs increases by 4 per converted layer.
    new_blob_count = original_blob_count + (converted_count * 4)
    lines[header_line_index] = f"{new_layer_count} {new_blob_count}\n"

    # find where to insert the new input layer (after existing ones)
    insert_pos = header_line_index + 1
    while insert_pos < len(lines) and lines[insert_pos].strip().startswith("Input"):
        insert_pos += 1

    input_layer_line = (
        f"{'Input':<24} {'kv_cache_in':<24} 0 {len(new_cache_inputs)} "
        f"{' '.join(new_cache_inputs)}\n"
    )
    lines.insert(insert_pos, input_layer_line)

    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(lines)

    print(f"Successfully added kv cache to {converted_count} MultiHeadAttention / SDPA layers.")

# usage:
# add_kv_cache_to_ncnn_param("your_model_decoder.ncnn.param")
```

#### B. Supporting Dynamic Sequence Length in Gemm
Feed-forward networks (`Gemm` layers) that process the output of attention blocks must support dynamic sequence lengths, as the cache grows. To achieve this, change the parameter `7=1` (constant input shape) to `7=0` (dynamic input shape) for the relevant `Gemm` layers.

```python
def update_gemm_params(param_file_path):
    """
    Finds all 'Gemm' layers and changes parameter '7=1' to '7=0'
    to support dynamic input shapes. The file is rewritten in place.

    Only lines whose layer type is exactly 'Gemm' are touched, and only a
    whole whitespace-delimited '7=1' token is rewritten, so entries such as
    '17=1' or '7=10' stay as they are.

    Args:
        param_file_path: path of the .ncnn.param file to rewrite in place.

    Returns:
        None. A summary line is printed when done.
    """
    import re

    # a complete '7=1' token, not part of a longer key or value
    constant_shape_token = re.compile(r'(?<!\S)7=1(?!\S)')

    # same explicit encoding as used when adding the kv cache blobs
    with open(param_file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    new_lines = []
    for line in lines:
        tokens = line.split()
        if tokens and tokens[0] == 'Gemm':
            line = constant_shape_token.sub('7=0', line)
        new_lines.append(line)

    with open(param_file_path, 'w', encoding='utf-8') as f:
        f.writelines(new_lines)
    print(f"Updated Gemm layers in '{param_file_path}' to support dynamic inputs.")

# usage:
# update_gemm_params("your_model_decoder.ncnn.param")
```

## 5. implementing kv cache inference logic

Your C++ inference code must manage the cache blobs across decoding steps.

### step 1: identify cache blob indices
After loading the network, identify the input and output blob indices for the cache. You can iterate through the mha layers and find the blobs you named in the conversion script.

```cpp
#include "net.h"
#include <vector>
#include <string>

// Blob indices of the kv cache inputs and outputs, collected by
// find_mha_kvcache_blobs. Entries are appended in pairs, k first and then v,
// one pair per cache-enabled attention layer.
struct kvcache_info
{
    // indices of the cache_k_in / cache_v_in blobs
    std::vector<int> input_indices;
    // indices of the cache_k_out / cache_v_out blobs
    std::vector<int> output_indices;
};

void find_mha_kvcache_blobs(const ncnn::Net& net, kvcache_info& info)
{
    for (const ncnn::Layer* layer : net.layers())
    {
        const bool is_attention = layer->typeindex == ncnn::LayerType::MultiHeadAttention
                                  || layer->typeindex == ncnn::LayerType::SDPA;

        // a cache-enabled attention layer has 3 outputs (out, cache_k_out, cache_v_out) instead of 1
        if (!is_attention || layer->tops.size() != 3)
            continue;

        // the conversion script appends cache_k and cache_v as the final two inputs / outputs
        const auto& bottoms = layer->bottoms;
        const auto& tops = layer->tops;
        const int n_in = bottoms.size();
        const int n_out = tops.size();

        info.input_indices.push_back(bottoms[n_in - 2]); // cache_k_in
        info.input_indices.push_back(bottoms[n_in - 1]); // cache_v_in

        info.output_indices.push_back(tops[n_out - 2]); // cache_k_out, i.e., tops[1]
        info.output_indices.push_back(tops[n_out - 1]); // cache_v_out, i.e., tops[2]
    }
}
```

### step 2: prefill and decode loop
The inference process is split into two phases: "prefill" for the initial prompt and "decode" for subsequent single-token generation.

- **prefill (`run_decoder_pre`):**
  - input: The entire initial sequence of token IDs
  - the kv cache is empty
  - run the decoder once
  - extract the output logits for the *last* token to predict the next token
  - extract the `out_cache_k` and `out_cache_v` blobs from all mha layers and store them

- **decode (`run_decoder_step`):**
  - input: The single, most recently generated token ID
  - the kv cache blobs from the previous step are fed as input
  - run the decoder
  - extract the output logits to predict the next token
  - extract and store the updated kv cache blobs for the next step

Here is a conceptual C++ implementation:

```cpp
// assume 'decoder_net' is loaded and 'kvcache_info' is populated.

// --- prefill step (processes a sequence of tokens) ---
void run_decoder_pre(const std::vector<int>& tokens, const ncnn::Mat& encoder_states, std::vector<ncnn::Mat>& out_kv_cache)
{
    ncnn::Extractor ex = decoder_net.create_extractor();

    // embed the whole prompt with your own embedding logic
    ncnn::Mat input_embeds = prepare_input_embeds(tokens);
    ex.input("in0", input_embeds); // use your input blob name
    ex.input("encoder_out", encoder_states); // use your encoder output blob name

    // no cache is fed in here, only the cache outputs of every attention layer are collected
    const size_t cache_count = kvcache_info.output_indices.size();
    out_kv_cache.resize(cache_count);
    for (size_t n = 0; n != cache_count; ++n)
    {
        ex.extract(kvcache_info.output_indices[n], out_kv_cache[n]);
    }

    ncnn::Mat all_logits;
    ex.extract("out0", all_logits); // use your output blob name
    // ... process logits for the last token ...
}

// --- decode step (processes a single token) ---
void run_decoder_step(int token, const ncnn::Mat& encoder_states, const std::vector<ncnn::Mat>& kv_cache, std::vector<ncnn::Mat>& out_kv_cache)
{
    ncnn::Extractor ex = decoder_net.create_extractor();

    // embed only the single most recent token
    ncnn::Mat input_embeds = prepare_input_embeds({token});
    ex.input("in0", input_embeds);
    ex.input("encoder_out", encoder_states);

    // hand the cache of the previous step to the cache input blobs
    const size_t cache_in_count = kvcache_info.input_indices.size();
    for (size_t n = 0; n != cache_in_count; ++n)
    {
        ex.input(kvcache_info.input_indices[n], kv_cache[n]);
    }

    // collect the updated cache from the cache output blobs
    const size_t cache_out_count = kvcache_info.output_indices.size();
    out_kv_cache.resize(cache_out_count);
    for (size_t n = 0; n != cache_out_count; ++n)
    {
        ex.extract(kvcache_info.output_indices[n], out_kv_cache[n]);
    }

    ncnn::Mat logits;
    ex.extract("out0", logits);
    // ... process logits to get the next token ...
}

// --- main inference loop ---
// Drives one full generation: a single prefill call followed by repeated
// single-token decode calls until EOT_TOKEN is produced or MAX_LENGTH is reached.
// NOTE(review): sequence_length and the get_next_token_* helpers are not defined
// in this snippet; the caller is expected to supply and maintain them.
void generate_sequence()
{
    std::vector<int> initial_tokens = { /* SOT and prompt tokens */ };
    ncnn::Mat encoder_states = run_encoder(); // compute encoder output once

    // 1. prefill stage
    // kv_cache receives the cache blobs extracted by the prefill pass
    std::vector<ncnn::Mat> kv_cache;
    run_decoder_pre(initial_tokens, encoder_states, kv_cache);
    int next_token = get_next_token_from_prefill_logits();

    // 2. autoregressive decoding loop
    while (next_token != EOT_TOKEN && sequence_length < MAX_LENGTH)
    {
        // each step reads kv_cache and writes a fresh set of blobs
        std::vector<ncnn::Mat> next_kv_cache;
        run_decoder_step(next_token, encoder_states, kv_cache, next_kv_cache);
        kv_cache = next_kv_cache; // update cache for the next iteration

        next_token = get_next_token_from_step_logits();
        // append next_token to your generated sequence
    }
}
```
This structured approach allows ncnn to perform highly efficient Transformer inference, correctly handling both dynamic self-attention and static cross-attention caches with an optimized memory layout.


================================================
FILE: docs/developer-guide/layer-feat-mask.md
================================================
# layer feature mask

Each ncnn layer allows a special parameter pair `31=X` to control specific behavior.

X is an unsigned integer with each bit contributing a feature mask.

We usually use it to configure fine-grained behaviors for certain layers to maintain accuracy, reduce memory usage or optimize performance.

|bit|value|mask|rationale|
|---|---|---|---|
|1<<0|1|no fp16 arithmetic|precision concern|
|1<<1|2|no fp16 storage|precision concern|
|1<<2|4|no bf16 storage|precision concern|
|1<<3|8|no int8|debug dynamic quantized model|
|1<<4|16|no vulkan|reduce overhead for cpu op - gpu split - cpu op|
|1<<5|32|no sgemm|reduce some memory|
|1<<6|64|no winograd|reduce some memory|
|1<<7|128|no threading|force single thread|

These bits can be OR-combined into one value to control multiple behaviors simultaneously.

For example, `31=17` means disabling both vulkan and fp16 arithmetic.

## disable fp16 for certain layer to fix overflow

```ruby
7767517
3 3
Input           input   0 1 input0 0=22 1=22 2=32
Convolution     conv0   1 1 input0 conv0 0=32 1=1 6=1024 9=1
Convolution     conv1   1 1 conv0 conv1 0=128 1=3 6=36864 9=1
```

Typically, we use fp16 computation to improve inference speed.
However, since the weight values of `conv1` are very large, fp16 accumulation may cause numerical overflow, so fp16 needs to be disabled individually for `conv1`, while other layers continue to use fp16 mode.

Add `31=3` to disable fp16 storage and arithmetic.

```ruby
7767517
3 3
Input           input   0 1 input0 0=22 1=22 2=32
Convolution     conv0   1 1 input0 conv0 0=32 1=1 6=1024 9=1
Convolution     conv1   1 1 conv0 conv1 0=128 1=3 6=36864 9=1 31=3
```

## disable vulkan for certain layer to improve performance

```ruby
7767517
5 5
Input           input   0 1 input0 0=22 1=22 2=32
Convolution     conv0   1 1 input0 conv0 0=32 1=1 6=1024 9=1
SomeCPULayer    c0      1 1 conv0 c0 0=32
ReLU            relu0   1 1 c0 relu0
SomeCPULayer    c1      1 1 relu0 c1 0=32
```

Between the CPU layers, there is a simple calculation layer that supports vulkan. We can set `31=16` to force it to run on CPU. This can avoid the overhead of data upload, download and storage layout conversion between CPU and GPU. After all, CPU is fast enough for simple operations.

```ruby
7767517
5 5
Input           input   0 1 input0 0=22 1=22 2=32
Convolution     conv0   1 1 input0 conv0 0=32 1=1 6=1024 9=1
SomeCPULayer    c0      1 1 conv0 c0 0=32
ReLU            relu0   1 1 c0 relu0 31=16
SomeCPULayer    c1      1 1 relu0 c1 0=32
```

## disable winograd for certain layer to reduce memory usage

```ruby
7767517
3 3
Input           input   0 1 input0 0=22 1=22 2=32
Convolution     conv0   1 1 input0 conv0 0=32 1=1 6=1024 9=1
Convolution     conv1   1 1 conv0 conv1 0=128 1=3 6=36864 9=1
```

The winograd technique uses more memory in order to improve convolution performance, but the trade-off does not always pay off. In memory-constrained situations, or when memory IO is the bottleneck, we can disable winograd on some layers in exchange for a smaller memory footprint. Add `31=64` to the Convolution layer, which forces it to use the implicit-gemm or tiled im2col-gemm implementation, reducing memory usage and sometimes improving vulkan performance.

```ruby
7767517
3 3
Input           input   0 1 input0 0=22 1=22 2=32
Convolution     conv0   1 1 input0 conv0 0=32 1=1 6=1024 9=1
Convolution     conv1   1 1 conv0 conv1 0=128 1=3 6=36864 9=1 31=64
```

## disable threading for certain layer to improve performance

```ruby
7767517
4 4
Input           input   0 1 input0 0=22 1=22 2=3
Convolution     conv0   1 1 input0 conv0 0=16 1=3 6=432
HardSigmoid     hs      1 1 conv0 hs0
Convolution     conv1   1 1 hs0 conv1 0=16 1=3 6=2304
```

The overhead of multi-thread dispatch and merging is too large for small tensors. Add `31=128` to HardSigmoid layer, which forces it to execute in a single thread, reducing power consumption and improving performance.

```ruby
7767517
4 4
Input           input   0 1 input0 0=22 1=22 2=3
Convolution     conv0   1 1 input0 conv0 0=16 1=3 6=432
HardSigmoid     hs      1 1 conv0 hs0 31=128
Convolution     conv1   1 1 hs0 conv1 0=16 1=3 6=2304
```


================================================
FILE: docs/developer-guide/layer-support-behavior.md
================================================
# Understanding `support_XYZ` Properties in ncnn's `Layer` Class

This document is for developers implementing new layers in `ncnn`. It explains the `support_XYZ` boolean properties in the `ncnn::Layer` base class. Correctly setting these properties declares the capabilities of your layer to the `ncnn` inference engine. This allows the engine to apply specific optimizations, such as enabling SIMD, half-precision floating-point computation, or Vulkan GPU acceleration, to achieve optimal performance and memory efficiency.

## When to Set `support` Properties

A layer can set its `support` properties in two ways:

1.  **Statically in the constructor**: If the layer's capabilities are fixed, the simplest way is to set them in its constructor.
2.  **Dynamically in `create_pipeline`**: If the layer's capabilities depend on parameters loaded from `load_param` or `load_model` (e.g., the data type of weights), you can set these properties dynamically within the `create_pipeline` method.

---

## Property Details

Here is a detailed breakdown of each `support` property and what it means for your layer's implementation.

### `one_blob_only`

*   **Purpose**: Declares that the layer accepts only one input `blob` and produces only one output `blob`.
*   **Requirements if `true`**: You must implement the single-input, single-output version of the `forward` method:
    ```cpp
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    ```
*   **Behavior**: When `true`, `ncnn` calls this overload. If `false` (default), the `std::vector<Mat>` version of `forward` is called.

### `support_inplace`

*   **Purpose**: Declares that the layer supports in-place computation, meaning the input and output can share the same memory. This significantly reduces memory overhead.
*   **Requirements if `true`**: You must implement the `forward_inplace` method. Depending on whether `one_blob_only` is also enabled, implement the corresponding version:
    ```cpp
    // If one_blob_only is true
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

    // If one_blob_only is false
    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
    ```

### `support_vulkan`

*   **Purpose**: Declares that the layer has a Vulkan implementation for GPU-accelerated inference.
*   **Requirements if `true`**:
    *   Implement `forward` / `forward_inplace` methods that accept `VkMat` for input and output.
    *   Implement `upload_model` to transfer weight data to the GPU.
    *   Implement `create_pipeline` and `destroy_pipeline` to manage Vulkan `Pipeline` objects and other GPU resources.

### `support_packing` (for CPU)

*   **Purpose**: Declares that the layer's **CPU implementation** can handle `Mat` data with a "packing" memory layout (i.e., `elempack > 1`). This is crucial for SIMD optimizations (e.g., processing 4 or 8 floats at once with NEON or AVX).
*   **Behavior if `true`**:
    *   When the input `Mat` channel count is a multiple of the SIMD width, the `ncnn` engine ensures that the input `Mat` passed to `forward` / `forward_inplace` is packed (e.g., `elempack=4` or `elempack=8`).
    *   Your implementation must correctly handle `Mat` data where `cstep` and `elempack` are not their default values.
*   **Behavior if `false`**:
    *   The `ncnn` engine guarantees that the input `Mat` passed to your layer will always have `elempack=1`. The engine will automatically insert conversions if the preceding layer produced a packed output.
*   **Output**: Regardless of the property's value, your layer can output a `Mat` with any `elempack`. However, it is highly recommended to output a `Mat` with an adaptive `elempack` to avoid unnecessary conversions in subsequent layers.

### `support_any_packing` (for CPU)

*   **Purpose**: An extension of `support_packing`. It declares that the layer's **CPU implementation** is flexible enough to handle a `Mat` with **any** `elempack` value (`1`, `4`, `8`, etc.).
*   **Behavior if `true`**:
    *   The `ncnn` engine can pass an input `Mat` with any packing layout to your `forward` method, without forcing a conversion to the hardware's "optimal" `elempack`. For example, on an AVX512 system where `elempack=16` is optimal, your layer can still accept `elempack=1`, `4`, or `8`.
    *   This gives the engine more flexibility to avoid unnecessary packing/unpacking conversions between layers.
*   **Behavior if `false`**: If `false` (but `support_packing` is `true`), the engine will try to provide an input `Mat` with an optimal `elempack` for the target architecture.
*   **Output**: This property does not enforce any constraint on the output `Mat`, which can have any `elempack`.

### `support_vulkan_packing` (for Vulkan)

*   **Purpose**: This is the Vulkan equivalent of `support_packing`. It declares that the layer's **Vulkan implementation** can handle `VkMat` with `elempack=4`.
*   **Behavior if `true`**: When the input `VkMat` has a channel count that is a multiple of 4, the `ncnn` engine will provide a packed `VkMat` (with `elempack=4`) to your Vulkan `forward` methods.
*   **Behavior if `false`**: The engine will ensure the input `VkMat` has `elempack=1`.
*   **Note**: `support_packing` and `support_vulkan_packing` are independent. A layer can support packing on CPU but not on Vulkan, or vice-versa.

### `support_vulkan_any_packing` (for Vulkan)

*   **Purpose**: An extension of `support_vulkan_packing`. It declares that the layer's **Vulkan implementation** can handle a `VkMat` with **any** supported `elempack` value (e.g., `1`, `4`).
*   **Behavior if `true`**:
    *   The `ncnn` engine can pass an input `VkMat` with any supported packing layout to your Vulkan `forward` method. This allows the engine to avoid unnecessary repacking operations on the GPU.
    *   This is particularly useful for optimizing shader dispatch and memory access patterns.
*   **Behavior if `false`**: If `false` (but `support_vulkan_packing` is `true`), the engine will try to provide a `VkMat` with `elempack=4` if the channel count is a multiple of 4.
*   **Note**: This property is independent of its CPU counterpart, `support_any_packing`.

### `support_bf16_storage`

*   **Purpose**: Declares that the layer can process `bfloat16` data.
*   **Behavior if `true`**:
    *   The `forward` method may receive an input `Mat` of type `bfloat16` (`elembits() == 16`) or `fp32`.
    *   Inside your `forward` implementation, you must check `opt.use_bf16_storage` and `bottom_blob.elembits()` to determine whether to use a `bfloat16`-optimized code path.
*   **Behavior if `false`**: The `ncnn` engine ensures your layer will **not** receive a `bfloat16` `Mat`.
*   **Output**: Your layer can output either a `bfloat16` or `fp32` `Mat`. When `opt.use_bf16_storage` is active, outputting `bfloat16` is recommended to maintain precision and performance across the network.

### `support_fp16_storage`

*   **Purpose**: Declares that the layer can process `float16` data for half-precision inference.
*   **Behavior if `true`**:
    *   Similar to `support_bf16_storage`, the `forward` method may receive an `fp16` or `fp32` `Mat`.
    *   Your implementation should check `opt.use_fp16_storage` and `bottom_blob.elembits()` to select the correct code path.
*   **Behavior if `false`**: The `ncnn` engine ensures your layer will **not** receive an `fp16` `Mat`.
*   **Output**: Your layer can output either a `fp16` or `fp32` `Mat`. When `opt.use_fp16_storage` is active, outputting an `fp16` `Mat` is recommended.

### `support_int8_storage`

*   **Purpose**: Declares that the layer supports `int8` quantized inference.
*   **Behavior if `true`**:
    *   When `opt.use_int8_inference` is `true`, the `forward` method may receive an `int8` or `fp32` `Mat`.
    *   **Important**: If the input is `fp32`, your `forward` implementation is responsible for dynamically quantizing it to `int8` before performing computations.
*   **Behavior if `false`**: The `ncnn` engine ensures your layer will **not** receive an `int8` `Mat`.
*   **Output**: The output can be `int8` or `fp32`, depending on your layer's design.

---

## Practical Implementation and Priorities

### Handling Multiple Precision Types

A layer can set `support_fp16_storage` and `support_bf16_storage` to `true` simultaneously. The `ncnn` engine prioritizes these formats based on the `Option` flags. As seen in the `convert_layout` function in `src/net.cpp`, if `opt.use_bf16_storage` is true, the engine will prefer converting inputs to `bfloat16`. Otherwise, it falls back to `fp16` if `opt.use_fp16_storage` is true.

The chosen `elempack` also depends on the precision. For instance, with SIMD, the priority might be:
*   FP16: `elempack=8` (if supported), then `elempack=4`, then `1`.
*   BF16: `elempack=4`, then `1`.

Your `forward` implementation should reflect this by checking `elembits()` and `elempack` to dispatch to the correct kernel.

### Code Example: `Clip_arm`

The `Clip_arm` layer provides a great example of these concepts in practice.

1.  **Declaring Support in the Constructor**:
    It declares support for packing and, conditionally, for fp16 and bf16 storage.
    ```cpp
    // From: src/layer/arm/clip_arm.cpp
    // Constructor: declares which layouts and storage types this layer accepts.
    Clip_arm::Clip_arm()
    {
    #if __ARM_NEON
        // packed input is accepted only in builds where __ARM_NEON is set
        support_packing = true;
    #if NCNN_ARM82
        // fp16 storage support takes the value returned by cpu_support_arm_asimdhp()
        support_fp16_storage = cpu_support_arm_asimdhp();
    #endif
    #endif // __ARM_NEON

    #if NCNN_BF16
        // bf16 storage is enabled in every build that defines NCNN_BF16
        support_bf16_storage = true;
    #endif
    }
    ```

2.  **Dispatching in `forward_inplace`**:
    The `forward_inplace` method acts as a dispatcher. It first checks the element size (`elembits`) and the corresponding `opt` flag to decide whether to call a specialized low-precision implementation (`fp16s` or `bf16s`). If neither is applicable, it defaults to the standard `fp32` implementation.

    ```cpp
    // From: src/layer/arm/clip_arm.cpp
    // Dispatcher: routes 16-bit blobs to the fp16 or bf16 path according to the
    // option flags, and otherwise continues with the fp32 code below.
    int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
    {
        int elembits = bottom_top_blob.elembits();

    #if NCNN_ARM82
        // fp16 path: needs layer support, the option flag and a 16-bit blob
        if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
            return forward_inplace_fp16s(bottom_top_blob, opt);
    #endif

    #if NCNN_BF16
        // bf16 path: checked after fp16, so it only sees 16-bit blobs the fp16 branch did not take
        if (opt.use_bf16_storage && elembits == 16)
            return forward_inplace_bf16s(bottom_top_blob, opt);
    #endif

        // Default fp32 implementation follows...
        int w = bottom_top_blob.w;
        // ...
    }
    ```

### An Incremental Development Workflow

Adopting a gradual approach can simplify the development of a new layer:

1.  **Implement the Core Algorithm**: Start with all `support_XYZ` properties set to `false`. Focus on getting the mathematical logic correct using standard `fp32` data and `elempack=1`.
2.  **Add Packing Support**: Once the core logic is validated, set `support_packing = true`. Modify your code to handle `elempack > 1` and implement SIMD optimizations (e.g., using NEON intrinsics).
3.  **Add Low-Precision Support**: Next, add support for `fp16`, `bf16`, or `int8`. Set the corresponding `support_*_storage` flags to `true` and add branches in your `forward` method to handle these data types based on the `opt` flags.
4.  **Add Vulkan Support**: Finally, if GPU acceleration is desired, set `support_vulkan = true` and implement the Vulkan-specific methods.

This incremental process allows you to tackle one challenge at a time, making it easier to develop a highly optimized and feature-rich layer.


================================================
FILE: docs/developer-guide/low-level-operation-api.md
================================================
# implement elementwise addition with/without broadcast using BinaryOp operation

* input must be fp32 storage without packing
* output is expected to be fp32 storage without packing

```cpp
void binary_add(const ncnn::Mat& a, const ncnn::Mat& b, ncnn::Mat& c)
{
    // run with fp32 storage, no packing, two threads
    ncnn::Option opt;
    opt.use_packing_layout = false;
    opt.use_fp16_storage = false;
    opt.num_threads = 2;

    ncnn::Layer* add_op = ncnn::create_layer("BinaryOp");

    // configure the layer
    {
        ncnn::ParamDict params;
        params.set(0, 0); // op_type
        add_op->load_param(params);
    }

    add_op->create_pipeline(opt);

    // forward: two inputs in, one output out
    std::vector<ncnn::Mat> inputs;
    inputs.push_back(a);
    inputs.push_back(b);

    std::vector<ncnn::Mat> outputs(1);
    add_op->forward(inputs, outputs, opt);

    c = outputs[0];

    // release the layer
    add_op->destroy_pipeline(opt);
    delete add_op;
}
```

# implement 3x3 box blur on three channel image using ConvolutionDepthWise operation

* input must be fp32 storage without packing
* output is expected to be fp32 storage without packing

```cpp
void convolution_3x3_boxblur_RGB(const ncnn::Mat& rgb, ncnn::Mat& out)
{
    // three channels, each blurred with its own 3x3 kernel
    const int channels = 3;
    const int ksize = 3;
    const int weight_count = ksize * ksize * channels;

    // run with fp32 storage, no packing, two threads
    ncnn::Option opt;
    opt.use_packing_layout = false;
    opt.use_fp16_storage = false;
    opt.num_threads = 2;

    ncnn::Layer* blur_op = ncnn::create_layer("ConvolutionDepthWise");

    // configure the layer
    {
        ncnn::ParamDict params;
        params.set(0, channels);     // num_output
        params.set(1, ksize);        // kernel_w
        params.set(5, 0);            // bias_term
        params.set(6, weight_count); // weight_data_size
        params.set(7, channels);     // group
        blur_op->load_param(params);
    }

    // every tap of the box kernel has the same weight
    ncnn::Mat weights[1];
    weights[0].create(weight_count); // weight_data

    int i = 0;
    while (i < weight_count)
    {
        weights[0][i] = 1.f / 9;
        ++i;
    }

    blur_op->load_model(ncnn::ModelBinFromMatArray(weights));

    blur_op->create_pipeline(opt);

    // forward
    blur_op->forward(rgb, out, opt);

    // release the layer
    blur_op->destroy_pipeline(opt);
    delete blur_op;
}
```
# transpose Mat, chw to cwh

* input must be fp32 storage with/without packing
* output is expected to be fp32 storage packed

```cpp
// Permutes `in` with a Permute layer (order_type 1) and writes the result to `out`.
// The input is repacked first to the elempack the layer is expected to receive.
void transpose(const ncnn::Mat& in, ncnn::Mat& out)
{
    ncnn::Option opt;
    opt.num_threads = 2;
    opt.use_fp16_storage = false;
    opt.use_packing_layout = true;

    ncnn::Layer* op = ncnn::create_layer("Permute");

    // set param
    ncnn::ParamDict pd;
    pd.set(0, 1);// order_type

    op->load_param(pd);

    op->create_pipeline(opt);

    ncnn::Mat in_packed = in;
    {
        // resolve dst_elempack
        // elemcount is the unpacked element count along the packed axis
        int dims = in.dims;
        int elemcount = 0;
        if (dims == 1) elemcount = in.elempack * in.w;
        if (dims == 2) elemcount = in.elempack * in.h;
        // 4-D blobs use c like 3-D ones; without this case elemcount would stay 0
        // and the modulo tests below would always pick a packed layout
        if (dims == 3 || dims == 4) elemcount = in.elempack * in.c;

        int dst_elempack = 1;
        if (op->support_packing)
        {
            if (elemcount % 8 == 0 && (ncnn::cpu_support_x86_avx2() || ncnn::cpu_support_x86_avx()))
                dst_elempack = 8;
            else if (elemcount % 4 == 0)
                dst_elempack = 4;
        }

        // repack only when the layout actually differs
        if (in.elempack != dst_elempack)
        {
            convert_packing(in, in_packed, dst_elempack, opt);
        }
    }

    // forward
    op->forward(in_packed, out, opt);

    op->destroy_pipeline(opt);

    delete op;
}
```
# apply instance normalization
// x = (x - mean) / sqrt(var)

* input can be fp32/fp16 storage with/without packing
* output is expected to be fp16 storage packed when supported, or fp32 storage packed otherwise

```cpp
// Runs an InstanceNorm layer with gamma = 1, beta = 0 and eps = 0 over `in`.
// The input is first cast to the storage type the layer supports, then repacked.
void normalize(const ncnn::Mat& in, ncnn::Mat& out)
{
    ncnn::Option opt;
    opt.num_threads = 2;
    opt.use_fp16_storage = true;
    opt.use_packing_layout = true;

    ncnn::Layer* op = ncnn::create_layer("InstanceNorm");

    // set param
    ncnn::ParamDict pd;
    pd.set(0, in.c);// channels
    pd.set(1, 0.f);// eps

    op->load_param(pd);

    // set weights
    ncnn::Mat weights[2];
    weights[0].create(in.c);// gamma_data
    weights[1].create(in.c);// beta_data

    weights[0].fill(1.f);
    weights[1].fill(0.f);

    op->load_model(ncnn::ModelBinFromMatArray(weights));

    op->create_pipeline(opt);

    // match the storage type to what the layer reports it supports
    ncnn::Mat in_fp16 = in;
    if (in.elembits() == 32 && op->support_fp16_storage)
    {
        cast_float32_to_float16(in, in_fp16, opt);
    }
    if (in.elembits() == 16 && !op->support_fp16_storage)
    {
        cast_float16_to_float32(in, in_fp16, opt);
    }

    ncnn::Mat in_fp16_packed = in_fp16;
    {
        // resolve dst_elempack
        // elemcount is the unpacked element count along the packed axis
        int dims = in_fp16.dims;
        int elemcount = 0;
        if (dims == 1) elemcount = in_fp16.elempack * in_fp16.w;
        if (dims == 2) elemcount = in_fp16.elempack * in_fp16.h;
        // 4-D blobs use c like 3-D ones; without this case elemcount would stay 0
        // and the modulo tests below would always pick a packed layout
        if (dims == 3 || dims == 4) elemcount = in_fp16.elempack * in_fp16.c;

        int dst_elempack = 1;
        if (op->support_packing)
        {
            if (elemcount % 8 == 0 && (ncnn::cpu_support_x86_avx2() || ncnn::cpu_support_x86_avx()))
                dst_elempack = 8;
            else if (elemcount % 4 == 0)
                dst_elempack = 4;
        }

        // repack only when the layout actually differs
        if (in_fp16.elempack != dst_elempack)
        {
            convert_packing(in_fp16, in_fp16_packed, dst_elempack, opt);
        }
    }

    // forward
    op->forward(in_fp16_packed, out, opt);

    op->destroy_pipeline(opt);

    delete op;
}
```

# cpu -> gpu -> forward -> gpu -> cpu

```cpp
// Runs one Convolution layer on the GPU: upload input, forward, download output.
// w, h, inch, outch and ksize are expected to be defined by the surrounding code.
ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device();

// allocators for blobs and staging buffers are borrowed from the device
ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();

// weight allocators are owned here and deleted at the end
ncnn::VkWeightAllocator* weight_vkallocator = new ncnn::VkWeightAllocator(vkdev);
ncnn::VkWeightStagingAllocator* weight_staging_vkallocator = new ncnn::VkWeightStagingAllocator(vkdev);

// create layer
ncnn::Layer* convolution = ncnn::create_layer("Convolution");
convolution->vkdev = vkdev;

// set option
ncnn::Option opt;
opt.num_threads = 4;
opt.use_vulkan_compute = true;
opt.blob_vkallocator = blob_vkallocator;
opt.workspace_vkallocator = blob_vkallocator;
opt.staging_vkallocator = staging_vkallocator;

// load param
{
    ncnn::ParamDict pd;
    pd.set(0, outch);// num_output
    pd.set(1, ksize);// kernel_w
    pd.set(5, 1);// bias_term, defaults to 0; needed so the bias in weights[1] is loaded
    pd.set(6, outch*inch*ksize*ksize);// weight_data_size
    pd.use_vulkan_compute = 1;

    convolution->load_param(pd);
}

// load model
{
    ncnn::Mat weights[2];
    weights[0] = random_mat(outch*inch*ksize*ksize);// weight
    weights[1] = random_mat(outch);// bias

    ncnn::ModelBinFromMatArray mb(weights);
    convolution->load_model(mb);
}

// create pipeline
convolution->create_pipeline(opt);

// upload model
{
    ncnn::VkTransfer cmd(vkdev);

    // weights go through the dedicated weight allocators, not the blob ones
    ncnn::Option opt_upload = opt;
    opt_upload.blob_vkallocator = weight_vkallocator;
    opt_upload.workspace_vkallocator = weight_vkallocator;
    opt_upload.staging_vkallocator = weight_staging_vkallocator;

    convolution->upload_model(cmd, opt_upload);

    cmd.submit_and_wait();
}

ncnn::Mat bottom = random_mat(w, h, inch);

ncnn::Mat top;

// forward
{
    ncnn::VkCompute cmd(vkdev);

    // upload, forward and download are all recorded on cmd before the submit below
    ncnn::VkMat bottom_gpu;
    cmd.record_upload(bottom, bottom_gpu, opt);

    ncnn::VkMat top_gpu;
    convolution->forward(bottom_gpu, top_gpu, cmd, opt);

    cmd.record_download(top_gpu, top, opt);

    cmd.submit_and_wait();
}

convolution->destroy_pipeline(opt);

delete convolution;

// hand the borrowed allocators back to the device
vkdev->reclaim_blob_allocator(blob_vkallocator);
vkdev->reclaim_staging_allocator(staging_vkallocator);

weight_vkallocator->clear();
weight_staging_vkallocator->clear();
delete weight_vkallocator;
delete weight_staging_vkallocator;
```


================================================
FILE: docs/developer-guide/ncnn-tips-and-tricks.zh.md
================================================
### blob内存是隐含共享的

ncnn的blob最初直接使用opencv的cv::Mat，后发现blob最多只支持三维，因此实现了类似的Mat
Mat的data每个通道内存16字节对齐，并且有原子的引用计数，a=b不复制数据，超级快
Mat支持直接引用外部的内存块，不复制数据，加快模型加载和输入输出

举个例子：split layer 将一个blob复制成n个，ncnn中实现为单纯的增加引用计数，没有任何数据复制

### 只运算一部分并保留中间结果

ncnn的net在解决分支依赖时是自上而下深度优先的，因此当网络有多个分支时，运算只会在需要结果的那个分支中进行，节约时间
当多个分支有重合部分时，运算其中一个分支后会自动保留其余分支所需的中间结果，隐含共享，以便运算其余分支时利用

举个例子：某网络结构为 A -> B -> C1 + C2，向ncnn索要C1结果时，运算过程是 A -> B -> C1，同时B结果引用计数加1自动保留，后面还需要C2结果时，只运算C2就足够了

### 开启轻模式省内存

每个layer都会产生blob，除了最后的结果和多分支中间结果，大部分blob都不值得保留，开启轻模式可以在运算后自动回收，省下内存

举个例子：某网络结构为 A -> B -> C，在轻模式下，向ncnn索要C结果时，A结果会在运算B时自动回收，而B结果会在运算C时自动回收，最后只保留C结果，后面再需要C结果会直接获得，满足绝大部分深度网络的使用方式

### 网络和运算是分开的

ncnn的net是网络模型，实际使用的是extractor，也就是同个net可以有很多个运算实例，而且运算实例互不影响，中间结果保留在extractor内部，在多线程使用时共用网络的结构和参数数据，初始化网络模型和参数只需要一遍

举个例子：全局静态的net实例，初始化一次后，就能不停地生成extractor使用

### openmp虽快但未必合适

ncnn中几乎所有运算都能用上openmp多线程加速，而且性能很赞
不过系统有时候会突然慢一下，比如手机太热自动降频，界面操作等等，ncnn耗时也会偶尔抖动变长，在计算耗时稳定性比较重要的时候建议关闭openmp，或者设置下extractor线程数

举个例子：手机自拍时，用ncnn进行人脸实时定位，如果耗时突然涨一下就会感觉到掉帧，而稳定的帧率体验更好

### NCNN_STDIO/NCNN_STRING禁用模型文件

ncnn支持加载自有的模型文件和模型内存，NCNN_STDIO控制是否需要支持加载模型文件，设成0能禁用这部分代码，从而减小库的体积，NCNN_STRING设成0能清除大部分可见的字符串和解析过程
模型内存加载时的参数数据是直接引用的，速度更快，通常在手机上使用这种方式

### 削减 ncnn 内置的层实现

cmake的时候，加参数 -DWITH_LAYER_xxx=OFF 就可以完全不编译对应的内置层，这样可以进一步减小库的体积

### 关于 ARM big.LITTLE 调度

调用set_cpu_powersave可以把ncnn运算线程控制在特定的cpu核心上，大核心速度快耗电多，小核心速度慢点但省电，大小一起用手机热得快


================================================
FILE: docs/developer-guide/new-model-load-api.md
================================================
## current model load api
### Cons
#### long and awful code
#### two functions
#### deal float32 float16 quantized-u8
#### deal alignment size
```cpp
#if NCNN_STDIO
// Legacy loader: reads the convolution weights, and the bias when bias_term is set,
// from an open binary file.
// A 4-byte header selects the weight encoding: half-precision, quantized or raw float.
// Returns 0 on success, -1 when a read fails, -100 when a Mat allocation fails.
int Convolution::load_model(FILE* binfp)
{
    int nread;

    // the 4-byte header is viewable as single bytes or as one 32-bit tag
    union
    {
        struct
        {
            unsigned char f0;
            unsigned char f1;
            unsigned char f2;
            unsigned char f3;
        };
        unsigned int tag;
    } flag_struct;

    nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp);
    if (nread != 1)
    {
        fprintf(stderr, "Convolution read flag_struct failed %d\n", nread);
        return -1;
    }

    // sum of the four header bytes; zero only when every byte is zero
    unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;

    weight_data.create(weight_data_size);
    if (weight_data.empty())
        return -100;

    if (flag_struct.tag == 0x01306B47)
    {
        // half-precision weight data
        // the stored size is the byte count rounded by alignSize(..., 4)
        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned short), 4);
        std::vector<unsigned short> float16_weights;
        // NOTE(review): resize() takes an element count but is given a byte count,
        // so the buffer is larger than the read below requires
        float16_weights.resize(align_weight_data_size);
        nread = fread(float16_weights.data(), align_weight_data_size, 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Convolution read float16_weights failed %d\n", nread);
            return -1;
        }

        // replaces the Mat created above with the converted data
        weight_data = Mat::from_float16(float16_weights.data(), weight_data_size);
        if (weight_data.empty())
            return -100;
    }
    else if (flag != 0)
    {
        // quantized weight data
        // a 256-entry float table followed by one table index per weight
        float quantization_value[256];
        nread = fread(quantization_value, 256 * sizeof(float), 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Convolution read quantization_value failed %d\n", nread);
            return -1;
        }

        int align_weight_data_size = alignSize(weight_data_size * sizeof(unsigned char), 4);
        std::vector<unsigned char> index_array;
        index_array.resize(align_weight_data_size);
        nread = fread(index_array.data(), align_weight_data_size, 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Convolution read index_array failed %d\n", nread);
            return -1;
        }

        // expand each index into its float value
        float* weight_data_ptr = weight_data;
        for (int i = 0; i < weight_data_size; i++)
        {
            weight_data_ptr[i] = quantization_value[ index_array[i] ];
        }
    }
    else if (flag_struct.f0 == 0)
    {
        // raw weight data
        nread = fread(weight_data, weight_data_size * sizeof(float), 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Convolution read weight_data failed %d\n", nread);
            return -1;
        }
    }

    // bias is read as raw floats with no header of its own
    if (bias_term)
    {
        bias_data.create(num_output);
        if (bias_data.empty())
            return -100;
        nread = fread(bias_data, num_output * sizeof(float), 1, binfp);
        if (nread != 1)
        {
            fprintf(stderr, "Convolution read bias_data failed %d\n", nread);
            return -1;
        }
    }

    return 0;
}
#endif // NCNN_STDIO

// Legacy loader: reads the same layout as the FILE* variant from a memory buffer.
// `mem` is advanced past every field consumed.
// Returns 0 on success, -100 when a Mat allocation fails.
int Convolution::load_model(const unsigned char*& mem)
{
    // the 4-byte header is viewable as single bytes or as one 32-bit tag
    union
    {
        struct
        {
            unsigned char f0;
            unsigned char f1;
            unsigned char f2;
            unsigned char f3;
        };
        unsigned int tag;
    } flag_struct;

    memcpy(&flag_struct, mem, sizeof(flag_struct));
    mem += sizeof(flag_struct);

    // sum of the four header bytes; zero only when every byte is zero
    unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;

    if (flag_struct.tag == 0x01306B47)
    {
        // half-precision weight data
        weight_data = Mat::from_float16((unsigned short*)mem, weight_data_size);
        // skip the stored size, which is rounded by alignSize(..., 4)
        mem += alignSize(weight_data_size * sizeof(unsigned short), 4);
        if (weight_data.empty())
            return -100;
    }
    else if (flag != 0)
    {
        // quantized weight data
        // a 256-entry float table followed by one table index per weight
        const float* quantization_value = (const float*)mem;
        mem += 256 * sizeof(float);

        const unsigned char* index_array = (const unsigned char*)mem;
        mem += alignSize(weight_data_size * sizeof(unsigned char), 4);

        weight_data.create(weight_data_size);
        if (weight_data.empty())
            return -100;
        // expand each index into its float value
        float* weight_data_ptr = weight_data;
        for (int i = 0; i < weight_data_size; i++)
        {
            weight_data_ptr[i] = quantization_value[ index_array[i] ];
        }
    }
    else if (flag_struct.f0 == 0)
    {
        // raw weight data
        // the Mat is constructed directly over the caller's buffer pointer
        weight_data = Mat(weight_data_size, (float*)mem);
        mem += weight_data_size * sizeof(float);
    }

    // bias follows as raw floats, again constructed over the buffer pointer
    if (bias_term)
    {
        bias_data = Mat(num_output, (float*)mem);
        mem += num_output * sizeof(float);
    }

    return 0;
}
```

## new model load api proposed
### Pros
#### clean and simple api
#### element type detection
```cpp
int Convolution::load_model(const ModelBin& mb)
{
    // weights: type 0, element type is detected automatically
    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    // nothing more to read without a bias
    if (!bias_term)
        return 0;

    // bias: type 1, a specific element type is requested
    bias_data = mb.load(num_output, 1);
    return bias_data.empty() ? -100 : 0;
}
```


================================================
FILE: docs/developer-guide/new-param-load-api.md
================================================
## current param load api
### Cons
#### long and awful code
#### three functions
#### not extensible
#### no default value
#### no variable length array
```
MyLayer  mylayer 1 1 in out 100 1.250000
```
```
binary 100
binary 1.250000
```
```cpp
#if NCNN_STDIO
#if NCNN_STRING
int MyLayer::load_param(FILE* paramfp)
{
    // text param: exactly one int followed by one float must be scanned
    const int nscan = fscanf(paramfp, "%d %f", &a, &b);
    if (nscan == 2)
        return 0;

    // report how many fields were actually converted
    fprintf(stderr, "MyLayer load_param failed %d\n", nscan);
    return -1;
}
#endif // NCNN_STRING
int MyLayer::load_param_bin(FILE* paramfp)
{
    // binary param: one raw int followed by one raw float, in file order
    // fread returns the number of items read, so anything but 1 is a short read
    size_t nread = fread(&a, sizeof(int), 1, paramfp);
    if (nread != 1)
    {
        fprintf(stderr, "MyLayer load_param_bin failed reading a\n");
        return -1;
    }

    nread = fread(&b, sizeof(float), 1, paramfp);
    if (nread != 1)
    {
        fprintf(stderr, "MyLayer load_param_bin failed reading b\n");
        return -1;
    }

    return 0;
}
#endif // NCNN_STDIO

int MyLayer::load_param(const unsigned char*& mem)
{
    // a and b sit back to back at the cursor, 4 bytes each
    a = *(int*)(mem);
    b = *(float*)(mem + 4);

    // step the caller's cursor past both values
    mem += 8;

    return 0;
}
```

## new param load api proposed
### Pros
#### clean and simple api
#### default value
#### extensible
#### variable length array
```
7767517
MyLayer  mylayer 1 1 in out 0=100 1=1.250000 -23303=5,0.1,0.2,0.4,0.8,1.0
```
```
binary 0xDD857600(magic)

binary 0
binary 100
binary 1
binary 1.250000
binary -23303
binary 5
binary 0.1
binary 0.2
binary 0.4
binary 0.8
binary 1.0
binary -233(EOP)
```
```cpp
int MyLayer::load_param(const ParamDict& pd)
{
    // pd.get( param id (seq), default value );
    a = pd.get(0, 100);
    b = pd.get(1, 1.25f);

    // get default value for c if not specified in param file
    // NOTE: pd.get is selected by the type of the default (int / float / Mat
    // in this function), so a float default needs the f suffix; a bare double
    // literal converts equally well to int and float
    c = pd.get(2, 0.001f);

    // get array
    d = pd.get(3, Mat(len, array));
    return 0;
}
```


================================================
FILE: docs/developer-guide/operation-param-weight-table.md
================================================

|operation|param id|param phase|default value|weight order|
|:---:|:---:|:---:|:---:|:---:|
|AbsVal|||
|ArgMax|0|out_max_val|0|
||1|topk|1|
|BatchNorm|0|channels|0|slope mean variance bias|
||1|eps|0.f|
|Bias|0|bias_data_size|0|
|BinaryOp|0|op_type|0|
||1|with_scalar|0|
||2|b|0.f|
|BNLL|||
|Cast|0|type_from|0|
||1|type_to|0|
|Clip|0|min|-FLT_MAX|
||1|max|FLT_MAX|
|Concat|0|axis|0|
|Convolution|0|num_output|0|weight bias|
||1|kernel_w|0|
||2|dilation_w|1|
||3|stride_w|1|
||4|pad_left|0|
||5|bias_term|0|
||6|weight_data_size|0|
||8|int8_scale_term|0|
||9|activation_type|0|
||10|activation_params|[ ]|
||11|kernel_h|kernel_w|
||12|dilation_h|dilation_w|
||13|stride_h|stride_w|
||15|pad_right|pad_left|
||14|pad_top|pad_left|
||16|pad_bottom|pad_top|
||17|impl_type|0|
||18|pad_value|0.f|
|ConvolutionDepthWise|0|num_output|0|weight bias|
||1|kernel_w|0|
||2|dilation_w|1|
||3|stride_w|1|
||4|pad_left|0|
||5|bias_term|0|
||6|weight_data_size|0|
||7|group|1|
||8|int8_scale_term|0|
||9|activation_type|0|
||10|activation_params|[ ]|
||11|kernel_h|kernel_w|
||12|dilation_h|dilation_w|
||13|stride_h|stride_w|
||15|pad_right|pad_left|
||14|pad_top|pad_left|
||16|pad_bottom|pad_top|
||18|pad_value|0.f|
|Crop|0|woffset|0|
||1|hoffset|0|
||2|coffset|0|
||3|outw|0|
||4|outh|0|
||5|outc|0|
||6|woffset2|0|
||7|hoffset2|0|
||8|coffset2|0|
||9|starts|[ ]|
||10|ends|[ ]|
||11|axes|[ ]|
|Deconvolution|0|num_output|0|weight bias|
||1|kernel_w|0|
||2|dilation_w|1|
||3|stride_w|1|
||4|pad_left|0|
||5|bias_term|0|
||6|weight_data_size|0|
||9|activation_type|0|
||10|activation_params|[ ]|
||11|kernel_h|kernel_w|
||12|dilation_h|dilation_w|
||13|stride_h|stride_w|
||15|pad_right|pad_left|
||14|pad_top|pad_left|
||16|pad_bottom|pad_top|
||18|output_pad_right|0|
||19|output_pad_bottom|output_pad_right|
||20|output_w|0|
||21|output_h|output_w|
|DeconvolutionDepthWise|0|num_output|0|weight bias|
||1|kernel_w|0|
||2|dilation_w|1|
||3|stride_w|1|
||4|pad_left|0|
||5|bias_term|0|
||6|weight_data_size|0|
||7|group|1|
||9|activation_type|0|
||10|activation_params|[ ]|
||11|kernel_h|kernel_w|
||12|dilation_h|dilation_w|
||13|stride_h|stride_w|
||15|pad_right|pad_left|
||14|pad_top|pad_left|
||16|pad_bottom|pad_top|
||18|output_pad_right|0|
||19|output_pad_bottom|output_pad_right|
||20|output_w|0|
||21|output_h|output_w|
|Dequantize|0|scale|1.f|bias|
||1|bias_term|0|
||2|bias_data_size|0|
|DetectionOutput|0|num_class|0|
||1|nms_threshold|0.05f|
||2|nms_top_k|300|
||3|keep_top_k|100|
||4|confidence_threshold|0.5f|
||5|variances[0]|0.1f|
||6|variances[1]|0.1f|
||7|variances[2]|0.2f|
||8|variances[3]|0.2f|
|Dropout|0|scale|1.f|
|Eltwise|0|op_type|0|
||1|coeffs|[ ]|
|ELU|0|alpha|0.1f|
|Embed|0|num_output|0|weight bias|
||1|input_dim|0|
||2|bias_term|0|
||3|weight_data_size|0|
|Exp|0|base|-1.f|
||1|scale|1.f|
||2|shift|0.f|
|ExpandDims|0|expand_w|0|
||1|expand_h|0|
||2|expand_c|0|
||3|axes|[ ]|
|Flatten|||
|HardSigmoid|0|alpha|0.2f||
||1|beta|0.5f|
|HardSwish|0|alpha|0.2f||
||1|beta|0.5f|
|InnerProduct|0|num_output|0|weight bias|
||1|bias_term|0|
||2|weight_data_size|0|
||8|int8_scale_term|0|
||9|activation_type|0|
||10|activation_params|[ ]|
|Input|0|w|0|
||1|h|0|
||2|c|0|
|InstanceNorm|0|channels|0|gamma bias|
||1|eps|0.001f|
|Interp|0|resize_type|0|
||1|height_scale|1.f|
||2|width_scale|1.f|
||3|output_height|0|
||4|output_width|0|
|Log|0|base|-1.f|
||1|scale|1.f|
||2|shift|0.f|
|LRN|0|region_type|0|
||1|local_size|5|
||2|alpha|1.f|
||3|beta|0.75f|
||4|bias|1.f|
|LSTM|0|num_output|0|
||1|weight_data_size|1|
||2|direction|0|
|MemoryData|0|w|0|
||1|h|0|
||2|c|0|
|Mish|||
|MVN|0|normalize_variance|0|
||1|across_channels|0|
||2|eps|0.0001f|
|Noop|||
|Normalize|0|across_spatial|0|scale|
||4|across_channel|0|
||1|channel_shared|0|
||2|eps|0.0001f|
||9|eps_mode|0|
||3|scale_data_size|0|
|Packing|0|out_packing|1|
||1|use_padding|0|
||2|cast_type_from|0|
||3|cast_type_to|0|
||4|storage_type_from|0|
||5|storage_type_to|0|
|Padding|0|top|0|per_channel_pad_data|
||1|bottom|0|
||2|left|0|
||3|right|0|
||4|type|0|
||5|value|0.f|
||6|per_channel_pad_data_size|0|
||7|front|0|
||8|behind|0|
|Permute|0|order_type|0|
|PixelShuffle|0|upscale_factor|1|
|Pooling|0|pooling_type(0: max 1: avg)|0|
||1|kernel_w|0|
||11|kernel_h|kernel_w|
||2|stride_w|1|
||12|stride_h|stride_w|
||3|pad_left|0|
||14|pad_right|pad_left|
||13|pad_top|pad_left|
||15|pad_bottom|pad_top|
||4|global_pooling|0|
||5|pad_mode|0|
|Power|0|power|1.f|
||1|scale|1.f|
||2|shift|0.f|
|PReLU|0|num_slope|0|slope|
|PriorBox|0|min_sizes|[ ]|
||1|max_sizes|[ ]|
||2|aspect_ratios|[ ]|
||3|variances[0]|0.f|
||4|variances[1]|0.f|
||5|variances[2]|0.f|
||6|variances[3]|0.f|
||7|flip|1|
||8|clip|0|
||9|image_width|0|
||10|image_height|0|
||11|step_width|-233.f|
||12|step_height|-233.f|
||13|offset|0.f|
||14|step_mmdetection|0|
||15|center_mmdetection|0|
|Proposal|0|feat_stride|16|
||1|base_size|16|
||2|pre_nms_topN|6000|
||3|after_nms_topN|300|
||4|nms_thresh|0.7f|
||5|min_size|16|
|PSROIPooling|0|pooled_width|7|
||1|pooled_height|7|
||2|spatial_scale|0.0625f|
||3|output_dim|0|
|Quantize|0|scale|1.f|
|Reduction|0|operation|0|
||1|dim|0|
||2|coeff|1.f|
||3|axes|[ ]|
||4|keepdims|0|
|ReLU|0|slope|0.f|
|Reorg|0|stride|0|
|Requantize|0|scale_in|1.f|bias|
||1|scale_out|1.f|
||2|bias_term|0|
||3|bias_data_size|0|
||4|fusion_relu|0|
|Reshape|0|w|-233|
||1|h|-233|
||2|c|-233|
||3|permute|0|
|ROIAlign|0|pooled_width|0|
||1|pooled_height|0|
||2|spatial_scale|1.f|
||3|sampling_ratio|0|
||4|aligned|0|
||5|version|0|
|ROIPooling|0|pooled_width|0|
||1|pooled_height|0|
||2|spatial_scale|1.f|
|Scale|0|scale_data_size|0|scale bias|
||1|bias_term|0|
|SELU|0|alpha|1.67326324f||
||1|lambda|1.050700987f|
|ShuffleChannel|0|group|1|
|Sigmoid|||
|Slice|0|slices|[ ]|
||1|axis|0|
|Softmax|0|axis|0|
|Split|||
|SPP|0|pooling_type|0|
||1|pyramid_height|1|
|Squeeze|0|squeeze_w|0|
||1|squeeze_h|0|
||2|squeeze_c|0|
||3|axes|[ ]|
|StatisticsPooling|0|include_stddev|0|
|Swish|||
|TanH|||
|Threshold|0|threshold|0.f|
|Tile|0|dim|0|
||1|tiles|1|
|UnaryOp|0|op_type|0|
|YoloDetectionOutput|0|num_class|20|
||1|num_box|5|
||2|confidence_threshold|0.01f|
||3|nms_threshold|0.45f|
||4|biases|[]|
|Yolov3DetectionOutput|0|num_class|20|
||1|num_box|5|
||2|confidence_threshold|0.01f|
||3|nms_threshold|0.45f|
||4|biases|[]|
||5|mask|[]|
||6|anchors_scale|[]|
|RNN|0|num_output|0|
||1|weight_data_size|0|
||2|direction|0|
|MultiHeadAttention|0|embed_dim|0|
||1|num_head|1|
||2|weight_data_size|0|


================================================
FILE: docs/developer-guide/operators.md
================================================

* [AbsVal](#absval)
* [ArgMax](#argmax)
* [BatchNorm](#batchnorm)
* [Bias](#bias)
* [BinaryOp](#binaryop)
* [BNLL](#bnll)
* [Cast](#cast)
* [CELU](#celu)
* [Clip](#clip)
* [Concat](#concat)
* [Convolution](#convolution)
* [Convolution1D](#convolution1d)
* [Convolution3D](#convolution3d)
* [ConvolutionDepthWise](#convolutiondepthwise)
* [ConvolutionDepthWise1D](#convolutiondepthwise1d)
* [ConvolutionDepthWise3D](#convolutiondepthwise3d)
* [CopyTo](#copyto)
* [Crop](#crop)
* [CumulativeSum](#cumulativesum)
* [Deconvolution](#deconvolution)
* [Deconvolution1D](#deconvolution1d)
* [Deconvolution3D](#deconvolution3d)
* [DeconvolutionDepthWise](#deconvolutiondepthwise)
* [DeconvolutionDepthWise1D](#deconvolutiondepthwise1d)
* [DeconvolutionDepthWise3D](#deconvolutiondepthwise3d)
* [DeformableConv2D](#deformableconv2d)
* [Dequantize](#dequantize)
* [Diag](#diag)
* [Dropout](#dropout)
* [Eltwise](#eltwise)
* [ELU](#elu)
* [Embed](#embed)
* [Exp](#exp)
* [ExpandDims](#expanddims)
* [Flatten](#flatten)
* [Flip](#flip)
* [Fold](#fold)
* [GELU](#gelu)
* [GLU](#glu)
* [Gemm](#gemm)
* [GridSample](#gridsample)
* [GroupNorm](#groupnorm)
* [GRU](#gru)
* [HardSigmoid](#hardsigmoid)
* [HardSwish](#hardswish)
* [InnerProduct](#innerproduct)
* [Input](#input)
* [InstanceNorm](#instancenorm)
* [Interp](#interp)
* [InverseSpectrogram](#inversespectrogram)
* [LayerNorm](#layernorm)
* [Log](#log)
* [LRN](#lrn)
* [LSTM](#lstm)
* [MemoryData](#memorydata)
* [Mish](#mish)
* [MultiHeadAttention](#multiheadattention)
* [MVN](#mvn)
* [Noop](#noop)
* [Normalize](#normalize)
* [Packing](#packing)
* [Padding](#padding)
* [Permute](#permute)
* [PixelShuffle](#pixelshuffle)
* [Pooling](#pooling)
* [Pooling1D](#pooling1d)
* [Pooling3D](#pooling3d)
* [Power](#power)
* [PReLU](#prelu)
* [Quantize](#quantize)
* [Reduction](#reduction)
* [ReLU](#relu)
* [Reorg](#reorg)
* [Requantize](#requantize)
* [Reshape](#reshape)
* [RMSNorm](#rmsnorm)
* [RNN](#rnn)
* [RotaryEmbed](#rotaryembed)
* [Scale](#scale)
* [SDPA](#sdpa)
* [SELU](#selu)
* [Shrink](#shrink)
* [ShuffleChannel](#shufflechannel)
* [Sigmoid](#sigmoid)
* [Slice](#slice)
* [Softmax](#softmax)
* [Softplus](#softplus)
* [Spectrogram](#spectrogram)
* [Split](#split)
* [Squeeze](#squeeze)
* [Swish](#swish)
* [TanH](#tanh)
* [Threshold](#threshold)
* [Tile](#tile)
* [UnaryOp](#unaryop)
* [Unfold](#unfold)

# AbsVal
```
y = abs(x)
```

* one_blob_only
* support_inplace

# ArgMax
```
y = argmax(x, out_max_val, topk)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | out_max_val   | int   | 0         |                   |
| 1         | topk          | int   | 1         |                   |

# BatchNorm
```
y = (x - mean) / sqrt(var + eps) * slope + bias
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | channels      | int   | 0         |                   |
| 1         | eps           | float | 0.f       |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| slope_data    | float | [channels]            |
| mean_data     | float | [channels]            |
| var_data      | float | [channels]            |
| bias_data     | float | [channels]            |

# Bias
```
y = x + bias
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | bias_data_size| int   | 0         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| bias_data     | float | [channels]            |

# BinaryOp
 This operation is used for binary computation, and the calculation rule depends on the [broadcasting rule](https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting).
```
C = binaryop(A, B)
```
if with_scalar = 1:
- one_blob_only
- support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | op_type       | int   | 0         | Operation type as follows |
| 1         | with_scalar   | int   | 0         | with_scalar=0 B is a matrix, with_scalar=1 B is a scalar |
| 2         | b             | float | 0.f       | When B is a scalar, B = b |

Operation type:
- 0 = ADD
- 1 = SUB
- 2 = MUL
- 3 = DIV
- 4 = MAX
- 5 = MIN
- 6 = POW
- 7 = RSUB
- 8 = RDIV
- 9 = RPOW
- 10 = ATAN2
- 11 = RATAN2

# BNLL
```
y = x + log(1 + e^(-x)),  x > 0
y = log(1 + e^x),         x <= 0
```

* one_blob_only
* support_inplace

# Cast
```
y = cast(x)
```

* one_blob_only
* support_packing

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | type_from     | int   | 0         |                   |
| 1         | type_to       | int   | 0         |                   |

Element type:
- 0 = auto
- 1 = float32
- 2 = float16
- 3 = int8
- 4 = bfloat16

# CELU
```
if x < 0    y = (exp(x / alpha) - 1.f) * alpha
else        y = x
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | alpha         | float | 1.f       |                   |

# Clip
```
y = clamp(x, min, max)
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | min           | float | -FLT_MAX  |                   |
| 1         | max           | float | FLT_MAX   |                   |

# Concat
```
y = concat(x0, x1, x2, ...) by axis
```

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | axis          | int   | 0         |                   |

# Convolution
```
x2 = pad(x, pads, pad_value)
x3 = conv(x2, weight, kernel, stride, dilation) + bias
y = activation(x3, act_type, act_params)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | kernel_w      | int   | 0         |                   |
| 2         | dilation_w    | int   | 1         |                   |
| 3         | stride_w      | int   | 1         |                   |
| 4         | pad_left      | int   | 0         |                   |
| 5         | bias_term     | int   | 0         |                   |
| 6         | weight_data_size| int | 0         |                   |
| 8         | int8_scale_term| int  | 0         |                   |
| 9         | activation_type| int  | 0         |                   |
| 10        | activation_params| array | [ ]    |                   |
| 11        | kernel_h      | int   | kernel_w  |                   |
| 12        | dilation_h    | int   | dilation_w |                  |
| 13        | stride_h      | int   | stride_w  |                   |
| 14        | pad_top       | int   | pad_left  |                   |
| 15        | pad_right     | int   | pad_left  |                   |
| 16        | pad_bottom    | int   | pad_top   |                   |
| 18        | pad_value     | float | 0.f       |                   |
| 19        | dynamic_weight| int   | 0         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_data   | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] |
| bias_data     | float | [num_output]          |
| weight_data_int8_scales| float | [num_output] |
| bottom_blob_int8_scales| float | [1]          |
| top_blob_int8_scales| float | [1]             |

# Convolution1D
```
x2 = pad(x, pads, pad_value)
x3 = conv1d(x2, weight, kernel, stride, dilation) + bias
y = activation(x3, act_type, act_params)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | kernel_w      | int   | 0         |                   |
| 2         | dilation_w    | int   | 1         |                   |
| 3         | stride_w      | int   | 1         |                   |
| 4         | pad_left      | int   | 0         |                   |
| 5         | bias_term     | int   | 0         |                   |
| 6         | weight_data_size| int | 0         |                   |
| 9         | activation_type| int  | 0         |                   |
| 10        | activation_params| array | [ ]    |                   |
| 15        | pad_right     | int   | pad_left  |                   |
| 18        | pad_value     | float | 0.f       |                   |
| 19        | dynamic_weight| int   | 0         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_data   | float/fp16/int8 | [kernel_w, num_input, num_output] |
| bias_data     | float | [num_output]          |

# Convolution3D
```
x2 = pad(x, pads, pad_value)
x3 = conv3d(x2, weight, kernel, stride, dilation) + bias
y = activation(x3, act_type, act_params)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | kernel_w      | int   | 0         |                   |
| 2         | dilation_w    | int   | 1         |                   |
| 3         | stride_w      | int   | 1         |                   |
| 4         | pad_left      | int   | 0         |                   |
| 5         | bias_term     | int   | 0         |                   |
| 6         | weight_data_size| int | 0         |                   |
| 9         | activation_type| int  | 0         |                   |
| 10        | activation_params| array | [ ]    |                   |
| 11        | kernel_h      | int   | kernel_w  |                   |
| 12        | dilation_h    | int   | dilation_w |                  |
| 13        | stride_h      | int   | stride_w  |                   |
| 14        | pad_top       | int   | pad_left  |                   |
| 15        | pad_right     | int   | pad_left  |                   |
| 16        | pad_bottom    | int   | pad_top   |                   |
| 17        | pad_behind    | int   | pad_front |                   |
| 18        | pad_value     | float | 0.f       |                   |
| 21        | kernel_d      | int   | kernel_w  |                   |
| 22        | dilation_d    | int   | dilation_w |                  |
| 23        | stride_d      | int   | stride_w  |                   |
| 24        | pad_front     | int   | pad_left  |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_data   | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input, num_output] |
| bias_data     | float | [num_output]          |

# ConvolutionDepthWise
```
x2 = pad(x, pads, pad_value)
x3 = conv(x2, weight, kernel, stride, dilation, group) + bias
y = activation(x3, act_type, act_params)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | kernel_w      | int   | 0         |                   |
| 2         | dilation_w    | int   | 1         |                   |
| 3         | stride_w      | int   | 1         |                   |
| 4         | pad_left      | int   | 0         |                   |
| 5         | bias_term     | int   | 0         |                   |
| 6         | weight_data_size| int | 0         |                   |
| 7         | group         | int   | 1         |                   |
| 8         | int8_scale_term| int  | 0         |                   |
| 9         | activation_type| int  | 0         |                   |
| 10        | activation_params| array | [ ]    |                   |
| 11        | kernel_h      | int   | kernel_w  |                   |
| 12        | dilation_h    | int   | dilation_w |                  |
| 13        | stride_h      | int   | stride_w  |                   |
| 14        | pad_top       | int   | pad_left  |                   |
| 15        | pad_right     | int   | pad_left  |                   |
| 16        | pad_bottom    | int   | pad_top   |                   |
| 18        | pad_value     | float | 0.f       |                   |
| 19        | dynamic_weight| int   | 0         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_data   | float/fp16/int8 | [kernel_w, kernel_h, num_input / group, num_output / group, group] |
| bias_data     | float | [num_output]          |
| weight_data_int8_scales| float | [group]      |
| bottom_blob_int8_scales| float | [1]          |
| top_blob_int8_scales| float | [1]             |

# ConvolutionDepthWise1D
```
x2 = pad(x, pads, pad_value)
x3 = conv1d(x2, weight, kernel, stride, dilation, group) + bias
y = activation(x3, act_type, act_params)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | kernel_w      | int   | 0         |                   |
| 2         | dilation_w    | int   | 1         |                   |
| 3         | stride_w      | int   | 1         |                   |
| 4         | pad_left      | int   | 0         |                   |
| 5         | bias_term     | int   | 0         |                   |
| 6         | weight_data_size| int | 0         |                   |
| 7         | group         | int   | 1         |                   |
| 9         | activation_type| int  | 0         |                   |
| 10        | activation_params| array | [ ]    |                   |
| 15        | pad_right     | int   | pad_left  |                   |
| 18        | pad_value     | float | 0.f       |                   |
| 19        | dynamic_weight| int   | 0         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_data   | float/fp16/int8 | [kernel_w, num_input / group, num_output / group, group] |
| bias_data     | float | [num_output]          |

# ConvolutionDepthWise3D
```
x2 = pad(x, pads, pad_value)
x3 = conv3d(x2, weight, kernel, stride, dilation, group) + bias
y = activation(x3, act_type, act_params)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | kernel_w      | int   | 0         |                   |
| 2         | dilation_w    | int   | 1         |                   |
| 3         | stride_w      | int   | 1         |                   |
| 4         | pad_left      | int   | 0         |                   |
| 5         | bias_term     | int   | 0         |                   |
| 6         | weight_data_size| int | 0         |                   |
| 7         | group         | int   | 1         |                   |
| 9         | activation_type| int  | 0         |                   |
| 10        | activation_params| array | [ ]    |                   |
| 11        | kernel_h      | int   | kernel_w  |                   |
| 12        | dilation_h    | int   | dilation_w |                  |
| 13        | stride_h      | int   | stride_w  |                   |
| 14        | pad_top       | int   | pad_left  |                   |
| 15        | pad_right     | int   | pad_left  |                   |
| 16        | pad_bottom    | int   | pad_top   |                   |
| 17        | pad_behind    | int   | pad_front |                   |
| 18        | pad_value     | float | 0.f       |                   |
| 21        | kernel_d      | int   | kernel_w  |                   |
| 22        | dilation_d    | int   | dilation_w |                  |
| 23        | stride_d      | int   | stride_w  |                   |
| 24        | pad_front     | int   | pad_left  |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_data   | float/fp16/int8 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] |
| bias_data     | float | [num_output]          |

# CopyTo
```
self[offset] = src
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | woffset       | int   | 0         |                   |
| 1         | hoffset       | int   | 0         |                   |
| 13        | doffset       | int   | 0         |                   |
| 2         | coffset       | int   | 0         |                   |
| 9         | starts        | array | [ ]       |                   |
| 11        | axes          | array | [ ]       |                   |

# Crop
```
y = crop(x)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | woffset       | int   | 0         |                   |
| 1         | hoffset       | int   | 0         |                   |
| 13        | doffset       | int   | 0         |                   |
| 2         | coffset       | int   | 0         |                   |
| 3         | outw          | int   | 0         |                   |
| 4         | outh          | int   | 0         |                   |
| 14        | outd          | int   | 0         |                   |
| 5         | outc          | int   | 0         |                   |
| 6         | woffset2      | int   | 0         |                   |
| 7         | hoffset2      | int   | 0         |                   |
| 15        | doffset2      | int   | 0         |                   |
| 8         | coffset2      | int   | 0         |                   |
| 9         | starts        | array | [ ]       |                   |
| 10        | ends          | array | [ ]       |                   |
| 11        | axes          | array | [ ]       |                   |
| 19        | starts_expr   | str   | ""        |                   |
| 20        | ends_expr     | str   | ""        |                   |
| 21        | axes_expr     | str   | ""        |                   |

# CumulativeSum

If axis < 0, we use axis = x.dims + axis

It implements https://pytorch.org/docs/stable/generated/torch.cumsum.html

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | axis          | int   | 0         |                   |


# Deconvolution
```
x2 = deconv(x, weight, kernel, stride, dilation) + bias
x3 = depad(x2, pads, pad_value)
y = activation(x3, act_type, act_params)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | kernel_w      | int   | 0         |                   |
| 2         | dilation_w    | int   | 1         |                   |
| 3         | stride_w      | int   | 1         |                   |
| 4         | pad_left      | int   | 0         |                   |
| 5         | bias_term     | int   | 0         |                   |
| 6         | weight_data_size| int | 0         |                   |
| 9         | activation_type| int  | 0         |                   |
| 10        | activation_params| array | [ ]    |                   |
| 11        | kernel_h      | int   | kernel_w  |                   |
| 12        | dilation_h    | int   | dilation_w |                  |
| 13        | stride_h      | int   | stride_w  |                   |
| 14        | pad_top       | int   | pad_left  |                   |
| 15        | pad_right     | int   | pad_left  |                   |
| 16        | pad_bottom    | int   | pad_top   |                   |
| 18        | output_pad_right| int | 0         |                   |
| 19        | output_pad_bottom| int | output_pad_right |           |
| 20        | output_w      | int   | 0         |                   |
| 21        | output_h      | int   | output_w  |                   |
| 28        | dynamic_weight| int   | 0         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_data   | float/fp16 | [kernel_w, kernel_h, num_input, num_output] |
| bias_data     | float | [num_output]          |

# Deconvolution1D
```
x2 = deconv1d(x, weight, kernel, stride, dilation) + bias
x3 = depad(x2, pads, pad_value)
y = activation(x3, act_type, act_params)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | kernel_w      | int   | 0         |                   |
| 2         | dilation_w    | int   | 1         |                   |
| 3         | stride_w      | int   | 1         |                   |
| 4         | pad_left      | int   | 0         |                   |
| 5         | bias_term     | int   | 0         |                   |
| 6         | weight_data_size| int | 0         |                   |
| 9         | activation_type| int  | 0         |                   |
| 10        | activation_params| array | [ ]    |                   |
| 15        | pad_right     | int   | pad_left  |                   |
| 18        | output_pad_right| int | 0         |                   |
| 20        | output_w      | int   | 0         |                   |
| 28        | dynamic_weight| int   | 0         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_data   | float/fp16 | [kernel_w, num_input, num_output] |
| bias_data     | float | [num_output]          |

# Deconvolution3D
```
x2 = deconv3d(x, weight, kernel, stride, dilation) + bias
x3 = depad(x2, pads, pad_value)
y = activation(x3, act_type, act_params)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | kernel_w      | int   | 0         |                   |
| 2         | dilation_w    | int   | 1         |                   |
| 3         | stride_w      | int   | 1         |                   |
| 4         | pad_left      | int   | 0         |                   |
| 5         | bias_term     | int   | 0         |                   |
| 6         | weight_data_size| int | 0         |                   |
| 9         | activation_type| int  | 0         |                   |
| 10        | activation_params| array | [ ]    |                   |
| 11        | kernel_h      | int   | kernel_w  |                   |
| 12        | dilation_h    | int   | dilation_w |                  |
| 13        | stride_h      | int   | stride_w  |                   |
| 14        | pad_top       | int   | pad_left  |                   |
| 15        | pad_right     | int   | pad_left  |                   |
| 16        | pad_bottom    | int   | pad_top   |                   |
| 17        | pad_behind    | int   | pad_front |                   |
| 18        | output_pad_right| int | 0         |                   |
| 19        | output_pad_bottom| int | output_pad_right |           |
| 20        | output_pad_behind| int | output_pad_right |           |
| 21        | kernel_d      | int   | kernel_w  |                   |
| 22        | dilation_d    | int   | dilation_w |                  |
| 23        | stride_d      | int   | stride_w  |                   |
| 24        | pad_front     | int   | pad_left  |                   |
| 25        | output_w      | int   | 0         |                   |
| 26        | output_h      | int   | output_w  |                   |
| 27        | output_d      | int   | output_w  |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_data   | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input, num_output] |
| bias_data     | float | [num_output]          |

# DeconvolutionDepthWise
```
x2 = deconv(x, weight, kernel, stride, dilation, group) + bias
x3 = depad(x2, pads, pad_value)
y = activation(x3, act_type, act_params)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | kernel_w      | int   | 0         |                   |
| 2         | dilation_w    | int   | 1         |                   |
| 3         | stride_w      | int   | 1         |                   |
| 4         | pad_left      | int   | 0         |                   |
| 5         | bias_term     | int   | 0         |                   |
| 6         | weight_data_size| int | 0         |                   |
| 7         | group         | int   | 1         |                   |
| 9         | activation_type| int  | 0         |                   |
| 10        | activation_params| array | [ ]    |                   |
| 11        | kernel_h      | int   | kernel_w  |                   |
| 12        | dilation_h    | int   | dilation_w |                  |
| 13        | stride_h      | int   | stride_w  |                   |
| 14        | pad_top       | int   | pad_left  |                   |
| 15        | pad_right     | int   | pad_left  |                   |
| 16        | pad_bottom    | int   | pad_top   |                   |
| 18        | output_pad_right| int | 0         |                   |
| 19        | output_pad_bottom| int | output_pad_right |           |
| 20        | output_w      | int   | 0         |                   |
| 21        | output_h      | int   | output_w  |                   |
| 28        | dynamic_weight| int   | 0         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_data   | float/fp16 | [kernel_w, kernel_h, num_input / group, num_output / group, group] |
| bias_data     | float | [num_output]          |

# DeconvolutionDepthWise1D
```
x2 = deconv1d(x, weight, kernel, stride, dilation, group) + bias
x3 = depad(x2, pads, pad_value)
y = activation(x3, act_type, act_params)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | kernel_w      | int   | 0         |                   |
| 2         | dilation_w    | int   | 1         |                   |
| 3         | stride_w      | int   | 1         |                   |
| 4         | pad_left      | int   | 0         |                   |
| 5         | bias_term     | int   | 0         |                   |
| 6         | weight_data_size| int | 0         |                   |
| 7         | group         | int   | 1         |                   |
| 9         | activation_type| int  | 0         |                   |
| 10        | activation_params| array | [ ]    |                   |
| 15        | pad_right     | int   | pad_left  |                   |
| 18        | output_pad_right| int | 0         |                   |
| 20        | output_w      | int   | 0         |                   |
| 28        | dynamic_weight| int   | 0         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_data   | float/fp16 | [kernel_w, num_input / group, num_output / group, group] |
| bias_data     | float | [num_output]          |

# DeconvolutionDepthWise3D
```
x2 = deconv3d(x, weight, kernel, stride, dilation, group) + bias
x3 = depad(x2, pads, pad_value)
y = activation(x3, act_type, act_params)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | kernel_w      | int   | 0         |                   |
| 2         | dilation_w    | int   | 1         |                   |
| 3         | stride_w      | int   | 1         |                   |
| 4         | pad_left      | int   | 0         |                   |
| 5         | bias_term     | int   | 0         |                   |
| 6         | weight_data_size| int | 0         |                   |
| 7         | group         | int   | 1         |                   |
| 9         | activation_type| int  | 0         |                   |
| 10        | activation_params| array | [ ]    |                   |
| 11        | kernel_h      | int   | kernel_w  |                   |
| 12        | dilation_h    | int   | dilation_w |                  |
| 13        | stride_h      | int   | stride_w  |                   |
| 14        | pad_top       | int   | pad_left  |                   |
| 15        | pad_right     | int   | pad_left  |                   |
| 16        | pad_bottom    | int   | pad_top   |                   |
| 17        | pad_behind    | int   | pad_front |                   |
| 18        | output_pad_right| int | 0         |                   |
| 19        | output_pad_bottom| int | output_pad_right |           |
| 20        | output_pad_behind| int | output_pad_right |           |
| 21        | kernel_d      | int   | kernel_w  |                   |
| 22        | dilation_d    | int   | dilation_w |                  |
| 23        | stride_d      | int   | stride_w  |                   |
| 24        | pad_front     | int   | pad_left  |                   |
| 25        | output_w      | int   | 0         |                   |
| 26        | output_h      | int   | output_w  |                   |
| 27        | output_d      | int   | output_w  |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_data   | float/fp16 | [kernel_w, kernel_h, kernel_d, num_input / group, num_output / group, group] |
| bias_data     | float | [num_output]          |

# DeformableConv2D
```
x2 = deformableconv2d(x, offset, mask, weight, kernel, stride, dilation) + bias
y = activation(x2, act_type, act_params)
```

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | kernel_w      | int   | 0         |                   |
| 2         | dilation_w    | int   | 1         |                   |
| 3         | stride_w      | int   | 1         |                   |
| 4         | pad_left      | int   | 0         |                   |
| 5         | bias_term     | int   | 0         |                   |
| 6         | weight_data_size| int | 0         |                   |
| 9         | activation_type| int  | 0         |                   |
| 10        | activation_params| array | [ ]    |                   |
| 11        | kernel_h      | int   | kernel_w  |                   |
| 12        | dilation_h    | int   | dilation_w |                  |
| 13        | stride_h      | int   | stride_w  |                   |
| 14        | pad_top       | int   | pad_left  |                   |
| 15        | pad_right     | int   | pad_left  |                   |
| 16        | pad_bottom    | int   | pad_top   |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_data   | float/fp16/int8 | [kernel_w, kernel_h, num_input, num_output] |
| bias_data     | float | [num_output]          |

# Dequantize
```
y = x * scale + bias
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | scale_data_size| int  | 1         |                   |
| 1         | bias_data_size| int   | 0         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| scale_data    | float | [scale_data_size]     |
| bias_data     | float | [bias_data_size]      |

# Diag
```
y = diag(x, diagonal)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | diagonal      | int   | 0         |                   |

# Dropout
```
y = x * scale
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | scale         | float | 1.f       |                   |

# Eltwise
```
y = elementwise_op(x0, x1, ...)
```

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | op_type       | int   | 0         |                   |
| 1         | coeffs        | array | [ ]       |                   |

Operation type:
- 0 = PROD
- 1 = SUM
- 2 = MAX

# ELU
```
if x < 0    y = (exp(x) - 1) * alpha
else        y = x
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | alpha         | float | 0.1f      |                   |

# Embed
```
y = embedding(x)
```

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | input_dim     | int   | 0         |                   |
| 2         | bias_term     | int   | 0         |                   |
| 3         | weight_data_size | int | 0        |                   |
| 18        | int8_scale_term| int  | 0         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_data   | float | [weight_data_size]    |
| bias_data     | float | [num_output]          |
| weight_data_int8_scales| float | [1]          |

# Exp
```
if base == -1   y = exp(shift + x * scale)
else            y = pow(base, (shift + x * scale))
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | base          | float | -1.f      |                   |
| 1         | scale         | float | 1.f       |                   |
| 2         | shift         | float | 0.f       |                   |

# ExpandDims

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 3         | axes          | array | [ ]       |                   |

# Flatten
Reshape blob to 1 dimension

* one_blob_only

# Flip

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | axes          | array | [ ]       |                   |

# Fold
```
y = fold(x)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | kernel_w      | int   | 0         |                   |
| 2         | dilation_w    | int   | 1         |                   |
| 3         | stride_w      | int   | 1         |                   |
| 4         | pad_left      | int   | 0         |                   |
| 11        | kernel_h      | int   | kernel_w  |                   |
| 12        | dilation_h    | int   | dilation_w |                  |
| 13        | stride_h      | int   | stride_w  |                   |
| 14        | pad_top       | int   | pad_left  |                   |
| 15        | pad_right     | int   | pad_left  |                   |
| 16        | pad_bottom    | int   | pad_top   |                   |
| 20        | output_w      | int   | 0         |                   |
| 21        | output_h      | int   | output_w  |                   |

# GELU
```
if fast_gelu == 1   y = 0.5 * x * (1 + tanh(0.79788452 * (x + 0.044715 * x * x * x)));
else                y = 0.5 * x * erfc(-0.70710678 * x)
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | fast_gelu     | int   | 0         | use approximation |

# GLU

If axis < 0, we use axis = x.dims + axis

GLU(a,b)=a⊗σ(b)

where a is the first half of the input matrix and b is the second half.

axis specifies the dimension to split the input

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | axis          | int   | 0         |                   |

# Gemm
```
a = transA ? transpose(x0) : x0
b = transB ? transpose(x1) : x1
c = x2
y = (gemm(a, b) + c * beta) * alpha
```

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | alpha         | float | 1.f       |                   |
| 1         | beta          | float | 1.f       |                   |
| 2         | transA        | int   | 0         |                   |
| 3         | transB        | int   | 0         |                   |
| 4         | constantA     | int   | 0         |                   |
| 5         | constantB     | int   | 0         |                   |
| 6         | constantC     | int   | 0         |                   |
| 7         | constantM     | int   | 0         |                   |
| 8         | constantN     | int   | 0         |                   |
| 9         | constantK     | int   | 0         |                   |
| 10        | constant_broadcast_type_C | int | 0 |                 |
| 11        | output_N1M    | int   | 0         |                   |
| 12        | output_elempack | int | 0         |                   |
| 13        | output_elemtype | int | 0         |                   |
| 14        | output_transpose | int| 0         |                   |
| 18        | int8_scale_term | int | 0         |                   |
| 20        | constant_TILE_M | int | 0         |                   |
| 21        | constant_TILE_N | int | 0         |                   |
| 22        | constant_TILE_K | int | 0         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| A_data        | float/fp16/int8 | [M, K] or [K, M] |
| B_data        | float/fp16/int8 | [N, K] or [K, N] |
| C_data        | float | [1], [M] or [N] or [1, M] or [N,1] or [N, M] |
| A_data_int8_scales| float | [M]               |
| B_data_int8_scales| float | [1]               |

# GridSample
```
Given an input and a flow-field grid, computes the output using input values and pixel locations from grid.

For each output location output[:, h2, w2], the size-2 vector grid[h2, w2, 2] specifies input pixel[:, h1, w1] locations x and y, 
which are used to interpolate the output value output[:, h2, w2]

This function is often used in conjunction with affine_grid() to build Spatial Transformer Networks.
```

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | sample_type   | int   | 1         |                   |
| 1         | padding_mode  | int   | 1         |                   |
| 2         | align_corner  | int   | 0         |                   |
| 3         | permute_fusion| int   | 0         | fuse with permute |


Sample type:
- 1 = Bilinear
- 2 = Nearest
- 3 = Bicubic

Padding mode:
- 1 = zeros
- 2 = border
- 3 = reflection


# GroupNorm
```
split x along channel axis into group x0, x1 ...
l2 normalize for each group x0, x1 ...
y = x * gamma + beta
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | group         | int   | 1         |                   |
| 1         | channels      | int   | 0         |                   |
| 2         | eps           | float | 0.001f    | x = x / sqrt(var + eps) |
| 3         | affine        | int   | 1         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| gamma_data    | float | [channels]            |
| beta_data     | float | [channels]            |

# GRU
Apply a single-layer GRU to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`.

```
y = gru(x)
y0, hidden y1 = gru(x0, hidden x1)
```

* one_blob_only if bidirectional

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         | hidden size of output |
| 1         | weight_data_size| int | 0         | total size of weight matrix |
| 2         | direction     | int   | 0         | 0=forward, 1=reverse, 2=bidirectional |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_xc_data| float/fp16/int8 | [input_size, num_output * 3, num_directions] |
| bias_c_data   | float/fp16/int8 | [num_output, 4, num_directions] |
| weight_hc_data| float/fp16/int8 | [num_output, num_output * 3, num_directions] |

Direction flag:
- 0 = forward only
- 1 = reverse only
- 2 = bidirectional

# HardSigmoid
```
y = clamp(x * alpha + beta, 0, 1)
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | alpha         | float | 0.2f      |                   |
| 1         | beta          | float | 0.5f      |                   |

# HardSwish
```
y = x * clamp(x * alpha + beta, 0, 1)
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | alpha         | float | 0.2f      |                   |
| 1         | beta          | float | 0.5f      |                   |

# InnerProduct
```
x2 = innerproduct(x, weight) + bias
y = activation(x2, act_type, act_params)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | bias_term     | int   | 0         |                   |
| 2         | weight_data_size| int | 0         |                   |
| 8         | int8_scale_term| int  | 0         |                   |
| 9         | activation_type| int  | 0         |                   |
| 10        | activation_params| array | [ ]    |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_data   | float/fp16/int8 | [num_input, num_output] |
| bias_data     | float | [num_output]          |
| weight_data_int8_scales| float | [num_output] |
| bottom_blob_int8_scales| float | [1]          |

# Input
```
y = input
```

* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | w             | int   | 0         |                   |
| 1         | h             | int   | 0         |                   |
| 11        | d             | int   | 0         |                   |
| 2         | c             | int   | 0         |                   |

# InstanceNorm
```
split x along channel axis into instance x0, x1 ...
l2 normalize for each channel instance x0, x1 ...
y = x * gamma + beta
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | channels      | int   | 0         |                   |
| 1         | eps           | float | 0.001f    | x = x / sqrt(var + eps) |
| 2         | affine        | int   | 1         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| gamma_data    | float | [channels]            |
| beta_data     | float | [channels]            |

# Interp
```
if dynamic_target_size == 0     y = resize(x) by fixed size or scale
else                            y = resize(x0, size(x1))
```

* one_blob_only if dynamic_target_size == 0

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | resize_type   | int   | 0         |                   |
| 1         | height_scale  | float | 1.f       |                   |
| 2         | width_scale   | float | 1.f       |                   |
| 3         | output_height | int   | 0         |                   |
| 4         | output_width  | int   | 0         |                   |
| 5         | dynamic_target_size| int | 0      |                   |
| 6         | align_corner  | int   | 0         |                   |
| 9         | size_expr     | str   | ""        |                   |

Resize type:
- 1 = Nearest
- 2 = Bilinear
- 3 = Bicubic

# InverseSpectrogram
```
x1 = x as complex
x1 = x1 * sqrt(norm) if normalized
y = istft(x1)
y1 = unpad(y) if center

if returns == 0 return y1 as complex
if returns == 1 return y1 real
if returns == 2 return y1 imag
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | n_fft         | int   | 0         |                   |
| 1         | returns       | int   | 1         |                   |
| 2         | hoplen        | int   | n_fft / 4 |                   |
| 3         | winlen        | int   | n_fft     |                   |
| 4         | window_type   | int   | 0         | 0=ones 1=hann 2=hamming |
| 5         | center        | int   | 1         |                   |
| 7         | normalized    | int   | 0         | 0=no 1=n_fft 2=window-l2-energy |

# LayerNorm
```
split x along outmost axis into part x0, x1 ...
l2 normalize for each part x0, x1 ...
y = x * gamma + beta by elementwise
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | affine_size   | int   | 0         |                   |
| 1         | eps           | float | 0.001f    | x = x / sqrt(var + eps) |
| 2         | affine        | int   | 1         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| gamma_data    | float | [affine_size]         |
| beta_data     | float | [affine_size]         |

# Log
```
if base == -1   y = log(shift + x * scale)
else            y = log(shift + x * scale) / log(base)
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | base          | float | -1.f      |                   |
| 1         | scale         | float | 1.f       |                   |
| 2         | shift         | float | 0.f       |                   |

# LRN
```
if region_type == ACROSS_CHANNELS   square_sum = sum of channel window of local_size
if region_type == WITHIN_CHANNEL    square_sum = sum of spatial window of local_size
y = x * pow(bias + alpha * square_sum / (local_size * local_size), -beta)
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | region_type   | int   | 0         |                   |
| 1         | local_size    | int   | 5         |                   |
| 2         | alpha         | float | 1.f       |                   |
| 3         | beta          | float | 0.75f     |                   |
| 4         | bias          | float | 1.f       |                   |

Region type:
- 0 = ACROSS_CHANNELS
- 1 = WITHIN_CHANNEL

# LSTM
Apply a single-layer LSTM to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`.

```
y = lstm(x)
y0, hidden y1, cell y2 = lstm(x0, hidden x1, cell x2)
```

* one_blob_only if bidirectional

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         | output size of output |
| 1         | weight_data_size| int | 0         | total size of IFOG weight matrix |
| 2         | direction     | int   | 0         | 0=forward, 1=reverse, 2=bidirectional |
| 3         | hidden_size   | int   | num_output| hidden size       |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_xc_data| float/fp16/int8 | [input_size, hidden_size * 4, num_directions] |
| bias_c_data   | float/fp16/int8 | [hidden_size, 4, num_directions] |
| weight_hc_data| float/fp16/int8 | [num_output, hidden_size * 4, num_directions] |
| weight_hr_data| float/fp16/int8 | [hidden_size, num_output, num_directions] |

Direction flag:
- 0 = forward only
- 1 = reverse only
- 2 = bidirectional

# MemoryData
```
y = data
```

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | w             | int   | 0         |                   |
| 1         | h             | int   | 0         |                   |
| 11        | d             | int   | 0         |                   |
| 2         | c             | int   | 0         |                   |
| 21        | load_type     | int   | 1         | 1=fp32            |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| data          | float | [w, h, d, c]          |

# Mish
```
y = x * tanh(log(exp(x) + 1))
```

* one_blob_only
* support_inplace

# MultiHeadAttention
```
q_affine = affine(q) * scale
k_affine = affine(k) or reuse kv_cache part
v_affine = affine(v) or reuse kv_cache part
split q k v into num_head part q0, k0, v0, q1, k1, v1 ...
for each num_head part
    qk = q * k
    qk = qk + attn_mask if attn_mask exists
    softmax(qk)
    qkv = qk * v
    merge qkv to out
y = affine(out)
```

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | embed_dim     | int   | 0         |                   |
| 1         | num_heads     | int   | 1         |                   |
| 2         | weight_data_size| int | 0         | qdim = weight_data_size / embed_dim |
| 3         | kdim          | int   | embed_dim |                   |
| 4         | vdim          | int   | embed_dim |                   |
| 5         | attn_mask     | int   | 0         |                   |
| 6         | scale         | float | 1.f / sqrt(embed_dim / num_heads) | |
| 7         | kv_cache      | int   | 0         |                   |
| 18        | int8_scale_term | int | 0         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| q_weight_data | float/fp16/int8 | [embed_dim * qdim] |
| q_bias_data   | float | [embed_dim]           |
| k_weight_data | float/fp16/int8 | [embed_dim * kdim] |
| k_bias_data   | float | [embed_dim]           |
| v_weight_data | float/fp16/int8 | [embed_dim * vdim] |
| v_bias_data   | float | [embed_dim]           |
| out_weight_data| float/fp16/int8 | [qdim * embed_dim] |
| out_bias_data | float | [qdim]                |
| q_weight_data_int8_scales| float | [embed_dim] |
| k_weight_data_int8_scales| float | [embed_dim] |
| v_weight_data_int8_scales| float | [embed_dim] |
| out_weight_data_int8_scales| float | [1]      |

# MVN
```
if normalize_variance == 1 && across_channels == 1      y = (x - mean) / (sqrt(var) + eps) of whole blob
if normalize_variance == 1 && across_channels == 0      y = (x - mean) / (sqrt(var) + eps) of each channel
if normalize_variance == 0 && across_channels == 1      y = x - mean of whole blob
if normalize_variance == 0 && across_channels == 0      y = x - mean of each channel
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | normalize_variance| int | 0       |                   |
| 1         | across_channels| int  | 0         |                   |
| 2         | eps           | float | 0.0001f   | x = x / (sqrt(var) + eps) |

# Noop
```
y = x
```

# Normalize
```
if across_spatial == 1 && across_channel == 1      x2 = normalize(x) of whole blob
if across_spatial == 1 && across_channel == 0      x2 = normalize(x) of each channel
if across_spatial == 0 && across_channel == 1      x2 = normalize(x) of each position
y = x2 * scale
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | across_spatial| int   | 0         |                   |
| 1         | channel_shared| int   | 0         |                   |
| 2         | eps           | float | 0.0001f   | see eps mode      |
| 3         | scale_data_size| int  | 0         |                   |
| 4         | across_channel| int   | 0         |                   |
| 9         | eps_mode      | int   | 0         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| scale_data    | float | [scale_data_size]     |

Eps Mode:
- 0 = caffe/mxnet   x = x / sqrt(var + eps)
- 1 = pytorch       x = x / max(sqrt(var), eps)
- 2 = tensorflow    x = x / sqrt(max(var, eps))

# Packing
```
y = wrap_packing(x)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | out_elempack  | int   | 1         |                   |
| 1         | use_padding   | int   | 0         |                   |
| 2         | cast_type_from| int   | 0         |                   |
| 3         | cast_type_to  | int   | 0         |                   |
| 4         | storage_type_from| int | 0        |                   |
| 5         | storage_type_to| int  | 0         |                   |

# Padding
```
y = pad(x, pads)
```

| param id  | name          | type | default   | description       |
| --------- | ------------- | ---- | --------- | ----------------- |
| 0         | top           | int  | 0         |                   |
| 1         | bottom        | int  | 0         |                   |
| 2         | left          | int  | 0         |                   |
| 3         | right         | int  | 0         |                   |
| 4         | type          | int  | 0         |                   |
| 5         | value         | float | 0         |                   |
| 6         | per_channel_pad_data_size| int | 0 |                 |
| 7         | front         | int  | 0         |                   |
| 8         | behind        | int  | front     |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| per_channel_pad_data| float | [per_channel_pad_data_size] |

Padding type:
- 0 = CONSTANT
- 1 = REPLICATE
- 2 = REFLECT

# Permute
```
y = reorder(x)
```

| param id  | name          | type | default   | description       |
| --------- | ------------- | ---- | --------- | ----------------- |
| 0         | order_type    | int  | 0         |                   |

Order Type:
- 0 = WH WHC WHDC
- 1 = HW HWC HWDC
- 2 = WCH WDHC
- 3 = CWH DWHC
- 4 = HCW HDWC
- 5 = CHW DHWC
- 6 = WHCD
- 7 = HWCD
- 8 = WCHD
- 9 = CWHD
- 10 = HCWD
- 11 = CHWD
- 12 = WDCH
- 13 = DWCH
- 14 = WCDH
- 15 = CWDH
- 16 = DCWH
- 17 = CDWH
- 18 = HDCW
- 19 = DHCW
- 20 = HCDW
- 21 = CHDW
- 22 = DCHW
- 23 = CDHW

# PixelShuffle
```
if mode == 0    y = depth_to_space(x) where x channel order is sw-sh-outc
if mode == 1    y = depth_to_space(x) where x channel order is outc-sw-sh
```

* one_blob_only

| param id  | name          | type | default   | description       |
| --------- | ------------- | ---- | --------- | ----------------- |
| 0         | upscale_factor| int  | 1         |                   |
| 1         | mode          | int  | 0         |                   |

# Pooling
```
x2 = pad(x, pads)
x3 = pooling(x2, kernel, stride)
```

| param id  | name          | type | default   | description       |
| --------- | --------------| ---- | --------- | ----------------- |
| 0         | pooling_type  | int  | 0         |                   |
| 1         | kernel_w      | int  | 0         |                   |
| 2         | stride_w      | int  | 1         |                   |
| 3         | pad_left      | int  | 0         |                   |
| 4         | global_pooling| int  | 0         |                   |
| 5         | pad_mode      | int  | 0         |                   |
| 6         | avgpool_count_include_pad| int | 0 |                 |
| 7         | adaptive_pooling| int | 0        |                   |
| 8         | out_w         | int  | 0         |                   |
| 11        | kernel_h      | int  | kernel_w  |                   |
| 12        | stride_h      | int  | stride_w  |                   |
| 13        | pad_top       | int  | pad_left  |                   |
| 14        | pad_right     | int  | pad_left  |                   |
| 15        | pad_bottom    | int  | pad_top   |                   |
| 18        | out_h         | int  | out_w     |                   |

Pooling type:
- 0 = MAX
- 1 = AVG

Pad mode:
- 0 = full padding
- 1 = valid padding
- 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER
- 3 = onnx padding=SAME_LOWER

# Pooling1D
```
x2 = pad(x, pads)
x3 = pooling1d(x2, kernel, stride)
```

| param id  | name          | type | default   | description       |
| --------- | --------------| ---- | --------- | ----------------- |
| 0         | pooling_type  | int  | 0         |                   |
| 1         | kernel_w      | int  | 0         |                   |
| 2         | stride_w      | int  | 1         |                   |
| 3         | pad_left      | int  | 0         |                   |
| 4         | global_pooling| int  | 0         |                   |
| 5         | pad_mode      | int  | 0         |                   |
| 6         | avgpool_count_include_pad| int | 0 |                 |
| 7         | adaptive_pooling| int | 0        |                   |
| 8         | out_w         | int  | 0         |                   |
| 14        | pad_right     | int  | pad_left  |                   |

Pooling type:
- 0 = MAX
- 1 = AVG

Pad mode:
- 0 = full padding
- 1 = valid padding
- 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER
- 3 = onnx padding=SAME_LOWER

# Pooling3D
```
x2 = pad(x, pads)
x3 = pooling3d(x2, kernel, stride)
```

| param id  | name          | type | default   | description       |
| --------- | --------------| ---- | --------- | ----------------- |
| 0         | pooling_type  | int  | 0         |                   |
| 1         | kernel_w      | int  | 0         |                   |
| 2         | stride_w      | int  | 1         |                   |
| 3         | pad_left      | int  | 0         |                   |
| 4         | global_pooling| int  | 0         |                   |
| 5         | pad_mode      | int  | 0         |                   |
| 6         | avgpool_count_include_pad| int | 0 |                 |
| 7         | adaptive_pooling| int | 0        |                   |
| 8         | out_w         | int  | 0         |                   |
| 11        | kernel_h      | int  | kernel_w  |                   |
| 12        | stride_h      | int  | stride_w  |                   |
| 13        | pad_top       | int  | pad_left  |                   |
| 14        | pad_right     | int  | pad_left  |                   |
| 15        | pad_bottom    | int  | pad_top   |                   |
| 16        | pad_behind    | int  | pad_front |                   |
| 18        | out_h         | int  | out_w     |                   |
| 21        | kernel_d      | int  | kernel_w  |                   |
| 22        | stride_d      | int  | stride_w  |                   |
| 23        | pad_front     | int  | pad_left  |                   |
| 28        | out_d         | int  | out_w     |                   |

Pooling type:
- 0 = MAX
- 1 = AVG

Pad mode:
- 0 = full padding
- 1 = valid padding
- 2 = tensorflow padding=SAME or onnx padding=SAME_UPPER
- 3 = onnx padding=SAME_LOWER

# Power
```
y = pow((shift + x * scale), power)
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | power         | float | 1.f       |                   |
| 1         | scale         | float | 1.f       |                   |
| 2         | shift         | float | 0.f       |                   |

# PReLU
```
if x < 0    y = x * slope
else        y = x
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_slope     | int   | 0         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| slope_data    | float | [num_slope]           |

# Quantize
```
y = float2int8(x * scale)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | scale_data_size| int  | 1         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| scale_data    | float | [scale_data_size]     |

# Reduction
```
y = reduce_op(x * coeff)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | operation     | int   | 0         |                   |
| 1         | reduce_all    | int   | 1         |                   |
| 2         | coeff         | float | 1.f       |                   |
| 3         | axes          | array | [ ]       |                   |
| 4         | keepdims      | int   | 0         |                   |
| 5         | fixbug0       | int   | 0         | hack for bug fix, should be 1 |

Operation type:
- 0 = SUM
- 1 = ASUM
- 2 = SUMSQ
- 3 = MEAN
- 4 = MAX
- 5 = MIN
- 6 = PROD
- 7 = L1
- 8 = L2
- 9 = LogSum
- 10 = LogSumExp

# ReLU
```
if x < 0    y = x * slope
else        y = x
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | slope         | float | 0.f       |                   |

# Reorg
```
if mode == 0    y = space_to_depth(x) where x channel order is sw-sh-outc
if mode == 1    y = space_to_depth(x) where x channel order is outc-sw-sh
```

* one_blob_only

| param id  | name          | type | default   | description       |
| --------- | ------------- | ---- | --------- | ----------------- |
| 0         | stride        | int  | 1         |                   |
| 1         | mode          | int  | 0         |                   |

# Requantize
```
x2 = x * scale_in + bias
x3 = activation(x2)
y = float2int8(x3 * scale_out)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | scale_in_data_size| int | 1       |                   |
| 1         | scale_out_data_size| int | 1      |                   |
| 2         | bias_data_size| int   | 0         |                   |
| 3         | activation_type| int  | 0         |                   |
| 4         | activation_params| array | [ ]    |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| scale_in_data | float | [scale_in_data_size]  |
| scale_out_data| float | [scale_out_data_size] |
| bias_data     | float | [bias_data_size]      |

# Reshape
```
y = reshape(x)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | w             | int   | -233      |                   |
| 1         | h             | int   | -233      |                   |
| 11        | d             | int   | -233      |                   |
| 2         | c             | int   | -233      |                   |
| 6         | shape_expr    | str   | ""        |                   |

Reshape flag:
- 0 = copy from bottom
- -1 = remaining
- -233 = drop this dim(default)

# RMSNorm
```
split x along outmost axis into part x0, x1 ...
root mean square normalize for each part x0, x1 ...
y = x * gamma by elementwise
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | affine_size   | int   | 0         |                   |
| 1         | eps           | float | 0.001f    | x = x / sqrt(var + eps) |
| 2         | affine        | int   | 1         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| gamma_data    | float | [affine_size]         |

# RNN
Apply a single-layer RNN to a feature sequence of `T` timesteps. The input blob shape is `[w=input_size, h=T]` and the output blob shape is `[w=num_output, h=T]`.

```
y = rnn(x)
y0, hidden y1 = rnn(x0, hidden x1)
```

* one_blob_only if bidirectional

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         | hidden size of output |
| 1         | weight_data_size| int | 0         | total size of weight matrix |
| 2         | direction     | int   | 0         | 0=forward, 1=reverse, 2=bidirectional |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| weight_xc_data| float/fp16/int8 | [input_size, num_output, num_directions] |
| bias_c_data   | float/fp16/int8 | [num_output, 1, num_directions] |
| weight_hc_data| float/fp16/int8 | [num_output, num_output, num_directions] |

Direction flag:
- 0 = forward only
- 1 = reverse only
- 2 = bidirectional

# RotaryEmbed
Apply rotary positional embeddings with cos and sin cache

```
y1 = x1 * cos - x2 * sin
y2 = x1 * sin + x2 * cos
```

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | interleaved   | int   | 0         |                   |

# Scale
```
if scale_data_size == -233  y = x0 * x1
else                        y = x * scale + bias
```

* one_blob_only if scale_data_size != -233
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | scale_data_size| int  | 0         |                   |
| 1         | bias_term     | int   | 0         |                   |

| weight        | type  | shape                 |
| ------------- | ----- | --------------------- |
| scale_data    | float | [scale_data_size]     |
| bias_data     | float | [scale_data_size]     |

# SDPA
```
scaled dot product attention
for each num_head part
    qk = q * k
    qk = qk + attn_mask if attn_mask exists
    softmax(qk)
    qkv = qk * v
```

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 5         | attn_mask     | int   | 0         |                   |
| 6         | scale         | float | 0.f       | auto = 1.f / sqrt(embed_dim) |
| 7         | kv_cache      | int   | 0         |                   |
| 18        | int8_scale_term | int | 0         |                   |

# SELU
```
if x < 0    y = (exp(x) - 1.f) * alpha * lambda
else        y = x * lambda
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | alpha         | float | 1.67326324f|                  |
| 1         | lambda        | float | 1.050700987f|                 |

# Shrink
```
if x < -lambd y = x + bias
if x >  lambd y = x - bias
else          y = x
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | bias          | float | 0.0f      |                   |
| 1         | lambd         | float | 0.5f      |                   |

# ShuffleChannel
```
if reverse == 0     y = shufflechannel(x) by group
if reverse == 1     y = shufflechannel(x) by channel / group
```

* one_blob_only

| param id  | name          | type | default   | description       |
| --------- | ------------- | ---- | --------- | ----------------- |
| 0         | group         | int  | 1         |                   |
| 1         | reverse       | int  | 0         |                   |

# Sigmoid
```
y = 1 / (1 + exp(-x))
```

* one_blob_only
* support_inplace

# Slice
```
split x along axis into slices, each part slice size is based on slices array
```

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | slices        | array | [ ]       |                   |
| 1         | axis          | int   | 0         |                   |
| 2         | indices       | array | [ ]       |                   |

# Softmax
```
softmax(x, axis)
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | axis          | int   | 0         |                   |
| 1         | fixbug0       | int   | 0         | hack for bug fix, should be 1 |

# Softplus
```
y = log(exp(x) + 1)
```

* one_blob_only
* support_inplace

# Spectrogram
```
x1 = pad(x) if center
y = stft(x1)
y = y / sqrt(norm) if normalized

if power == 0 return y as real
if power == 1 return magnitude
if power == 2 return square of magnitude
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | n_fft         | int   | 0         |                   |
| 1         | power         | int   | 0         |                   |
| 2         | hoplen        | int   | n_fft / 4 |                   |
| 3         | winlen        | int   | n_fft     |                   |
| 4         | window_type   | int   | 0         | 0=ones 1=hann 2=hamming |
| 5         | center        | int   | 1         |                   |
| 6         | pad_type      | int   | 2         | 0=CONSTANT 1=REPLICATE 2=REFLECT |
| 7         | normalized    | int   | 0         | 0=no 1=n_fft 2=window-l2-energy |
| 8         | onesided      | int   | 1         |                   |

# Split
```
y0, y1 ... = x
```

# Squeeze

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | squeeze_w     | int   | 0         |                   |
| 1         | squeeze_h     | int   | 0         |                   |
| 11        | squeeze_d     | int   | 0         |                   |
| 2         | squeeze_c     | int   | 0         |                   |
| 3         | axes          | array | [ ]       |                   |

# Swish
```
y = x / (1 + exp(-x))
```

* one_blob_only
* support_inplace

# TanH
```
y = tanh(x)
```

* one_blob_only
* support_inplace

# Threshold
```
if x > threshold    y = 1
else                y = 0
```

* one_blob_only
* support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | threshold     | float | 0.f       |                   |

# Tile
```
y = repeat tiles along axis for x
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | axis          | int   | 0         |                   |
| 1         | tiles         | int   | 1         |                   |
| 2         | repeats       | array | [ ]       |                   |

# UnaryOp
```
y = unaryop(x)
```

- one_blob_only
- support_inplace

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | op_type       | int   | 0         | Operation type as follows |

Operation type:
- 0 = ABS
- 1 = NEG
- 2 = FLOOR
- 3 = CEIL
- 4 = SQUARE
- 5 = SQRT
- 6 = RSQ
- 7 = EXP
- 8 = LOG
- 9 = SIN
- 10 = COS
- 11 = TAN
- 12 = ASIN
- 13 = ACOS
- 14 = ATAN
- 15 = RECIPROCAL
- 16 = TANH
- 17 = LOG10
- 18 = ROUND
- 19 = TRUNC

# Unfold
```
y = unfold(x)
```

* one_blob_only

| param id  | name          | type  | default   | description       |
| --------- | ------------- | ----- | --------- | ----------------- |
| 0         | num_output    | int   | 0         |                   |
| 1         | kernel_w      | int   | 0         |                   |
| 2         | dilation_w    | int   | 1         |                   |
| 3         | stride_w      | int   | 1         |                   |
| 4         | pad_left      | int   | 0         |                   |
| 11        | kernel_h      | int   | kernel_w  |                   |
| 12        | dilation_h    | int   | dilation_w |                  |
| 13        | stride_h      | int   | stride_w  |                   |
| 14        | pad_top       | int   | pad_left  |                   |
| 15        | pad_right     | int   | pad_left  |                   |
| 16        | pad_bottom    | int   | pad_top   |                   |


================================================
FILE: docs/developer-guide/param-and-model-file-structure.md
================================================
## net.param
### example
```
7767517
3 3
Input         input    0 1 data 0=4 1=4 2=1
InnerProduct  ip       1 1 data fc 0=10 1=1 2=80
Softmax       softmax  1 1 fc prob 0=0
```
### overview
```
[magic]
```
* magic number : 7767517
```
[layer count] [blob count]
```
* layer count : count of the layer lines that follow, should be exactly the count of all layer names
* blob count : count of all blobs, usually greater than or equals to the layer count
### layer line
```
[layer type] [layer name] [input count] [output count] [input blobs] [output blobs] [layer specific params]
```
* layer type : type name, such as Convolution Softmax etc
* layer name : name of this layer, must be unique among all layer names
* input count : count of the blobs this layer needs as input
* output count : count of the blobs this layer produces as output
* input blobs : name list of all the input blob names, separated by space, must be unique among input blob names of all layers
* output blobs : name list of all the output blob names, separated by space, must be unique among output blob names of all layers
* layer specific params : key=value pair list, separated by space
### layer param
```
0=1 1=2.5 -23303=2,2.0,3.0
```
key index should be unique in each layer line, pair can be omitted if the default value used

the meaning of existing param key index can be looked up at [operation-param-weight-table](operation-param-weight-table)

* integer or float key : index 0 ~ 19
* integer value : int
* float value : float
* integer array or float array key : -23300 minus index 0 ~ 19
* integer array value : [array size],int,int,...,int
* float array value : [array size],float,float,...,float

In modern ncnn param file

* array could be represented as `3=2.0,3.0` that is much more human friendly
* string typed value: `4=hello` and the string is no longer than 255

## net.bin
```
  +---------+---------+---------+---------+---------+---------+
  | weight1 | weight2 | weight3 | weight4 | ....... | weightN |
  +---------+---------+---------+---------+---------+---------+
  ^         ^         ^         ^
  0x0      0x80      0x140     0x1C0
```
the model binary is the concatenation of all weight data, each weight buffer is aligned by 32bit

### weight buffer
```
[flag] (optional)
[raw data]
[padding] (optional)
```
* flag : unsigned int,  little-endian, indicating the weight storage type, 0 => float32, 0x01306B47 => float16, otherwise => quantized int8, may be omitted if the layer implementation forced the storage type explicitly
* raw data : raw weight data, little-endian, float32 data or float16 data or quantized table and indexes depending on the storage type flag
* padding : padding space for 32bit alignment, may be omitted if already aligned


================================================
FILE: docs/developer-guide/preload-practice.zh.md
================================================
## 只是实践经验，没有理论，不一定正确

```
prfm pldl1keep, [x0, #256]
```
* 放在 ld1 [x0] 前面 0~8 条指令
* #256 表示把 x0+256 的内容放进 L1 cache
* ldp 也适用
* (经验)不写 offset 不如写个 #128
* (经验)pldl1strm 似乎没啥意思，也没 pldl1keep 快
* (经验)x0 ~ x0+256 的内容也会进来
* (经验)load 128bit 用 #128，256bit或更多用 #256
* (经验)避免 pld a，pld b，load a，load b 顺序，可能相互干扰
* (经验)提前太多会失效
* (经验)适合连续读

```
prfm pldl2strm, [x0, #256]
```
* 放在 ld1 [x0] 前面 N 条指令，N 尽量大些
* #256 表示把 x0+256 的内容放进 L2 cache
* ldp 也适用
* (经验)不写 offset 不如写个 #128
* (经验)pldl2strm 效果稍好于 pldl2keep
* (经验)x0 ~ x0+256 的内容也会进来
* (经验)load 128bit 用 #128，256bit 用 #256
* (经验)读很多数据，用不同 offset 连续两次 pldl2strm
* (经验)后面不要对同位置再 pldl1keep，会变慢
* (经验)适合提前准备要跳到很远的地方读，比如换 channel


================================================
FILE: docs/developer-guide/tensorflow-op-combination.md
================================================
## batchnorm
```
Input       A            0 1 A 0 0 0
MemoryData  sub/y        0 1 sub/y 16 0 0
BinaryOp    sub          2 1 A sub/y sub 1
MemoryData  div/y        0 1 div/y 16 0 0
BinaryOp    div          2 1 sub div/y div 3
MemoryData  mul/y        0 1 mul/y 16 0 0
BinaryOp    mul          2 1 div mul/y mul 2
MemoryData  BiasAdd/bias 0 1 BiasAdd/bias 16 0 0
BinaryOp    BiasAdd      2 1 mul BiasAdd/bias BiasAdd 0
```
## convolution
```
Input       A            0 1 A 0 0 0
Convolution Conv2D       1 1 A Conv2D 10 3 1 1 0 0 270
MemoryData  biases/read  0 1 biases/read 10 0 0
BinaryOp    BiasAdd      2 1 Conv2D biases/read BiasAdd 0
```
## innerproduct
```
Input        A           0 1 A 0 0 0
MemoryData   biases/read 0 1 biases/read 10 0 0
InnerProduct MatMul      1 1 A MatMul 10 0 2560
BinaryOp     conv6       2 1 MatMul biases/read conv6 0
```
## leakyrelu
```
Input       A            0 1 A 0 0 0
Split       splitncnn_0  1 2 A A_splitncnn_0 A_splitncnn_1
MemoryData  mul_1/x      0 1 mul_1/x 0 0 0
BinaryOp    mul_1        2 1 mul_1/x A_splitncnn_1 mul_1 2
BinaryOp    leaky        2 1 mul_1 A_splitncnn_0 leaky 4
```
## prelu
```
Input       A            0 1 A 0 0 0
Split       splitncnn_0  1 2 A A_splitncnn_0 A_splitncnn_1
MemoryData  prelu/alpha  0 1 prelu/alpha 10 0 0
ReLU        prelu/Relu   1 1 A_splitncnn_1 prelu/Relu 0.000000
UnaryOp     prelu/Neg    1 1 A_splitncnn_0 prelu/Neg 1
ReLU        prelu/Relu_1 1 1 prelu/Neg prelu/Relu_1 0.000000
UnaryOp     prelu/Neg_1  1 1 prelu/Relu_1 prelu/Neg_1 1
BinaryOp    prelu/Mul    2 1 prelu/alpha prelu/Neg_1 prelu/Mul 2
BinaryOp    prelu/add    2 1 prelu/Relu prelu/Mul prelu/add 0
```
## softmax
```
Input       A            0 1 A 0 0 0
Split       splitncnn_4  1 2 A A_splitncnn_0 A_splitncnn_1
Reduction   Max          1 1 A_splitncnn_1 Max 4 -2 1.000000
BinaryOp    sub          2 1 A_splitncnn_0 Max sub 1
UnaryOp     Exp          1 1 sub Exp 7
Split       splitncnn_5  1 2 Exp Exp_splitncnn_0 Exp_splitncnn_1
Reduction   Sum          1 1 Exp_splitncnn_1 Sum 0 -2 1.000000
BinaryOp    prob         2 1 Exp_splitncnn_0 Sum prob 3
```

================================================
FILE: docs/developer-guide/vulkan-driver-loader.md
================================================
# ncnn vulkan driver loader

ncnn turns on the ```NCNN_SIMPLEVK``` cmake option by default, when ```NCNN_VULKAN``` is enabled

simplevk is ncnn's built-in vulkan loader. It provides vulkan function declarations and function entries that meet ncnn's needs. It allows the use and compilation of vulkan-related code without relying on vulkan-sdk. It can dynamically load the vulkan runtime library at runtime or directly load the graphics card's vulkan driver. When distributing ncnn applications, it is not required that the target system has a vulkan driver.

Usually you don't need to care about how simplevk loads the vulkan driver, because ncnn will automatically load and initialize when using vulkan related functions. It is sufficient to set the `Option` switch before loading the model.

Typical code

```cpp
ncnn::Net net;
net.opt.use_vulkan_compute = true;
net.load_param("model.param");
net.load_model("model.bin");
```

Using the in-house vulkan loader instead of the standard libvulkan has the following benefits

- Can compile ncnn vulkan code without installing vulkan-sdk
- Can deploy and distribute applications without libvulkan linkage
- Can load external vulkan driver instead of system driver
- Can directly load android hal module
- Can directly load graphics card driver files via NCNN_VULKAN_DRIVER env
- Able to actively search for graphics card driver files in the system and load them
- Can compile android libraries supporting vulkan under the platform of android-api<24

## Create and manage gpu context

```cpp
int create_gpu_instance(const char* driver_path = 0);

void destroy_gpu_instance();

VkInstance get_gpu_instance();
```

## Loading order

```
If driver_path == 0
  1a from env VK_ICD_FILENAMES
  1b from env NCNN_VULKAN_DRIVER

If driver_path != 0
  1 from specified driver_path

2 from vulkan-1.dll / libvulkan.so / libvulkan.dylib in system

3 search driver by name nvoglv64.dll / amdvlk64.dll / libGLX_nvidia.so.0 .... and load it
```

## Load from system vulkan library or graphics driver

This is the default behavior and it should work on most systems

sample usage
```cpp
int ret = create_gpu_instance();
```

Load from system-installed libvulkan

#### Windows
vulkan-1.dll

#### Linux Android
libvulkan.so

#### macOS iOS and other APPLE platforms
libvulkan.dylib

If static moltenvk driver linked, should always succeed

If failed, it will try to find graphics driver object and load it

#### Windows
for 64bit applications. search in ```%SystemRoot%\System32\DriverStore\FileRepository```
- nvoglv64.dll
- amdvlk64.dll
- igvk64.dll
- qcvkarm64xum.dll

for 32bit applications. search in ```%SystemRoot%\System32\DriverStore\FileRepository```
- nvoglv32.dll
- amdvlk32.dll
- igvk32.dll

#### Linux
`dlopen()` search for
- libGLX_nvidia.so.0
- libvulkan_radeon.so
- libvulkan_intel.so
- libMaliVulkan.so.1
- libVK_IMG.so

#### Android
for 64bit applications
- /vendor/lib64/hw/vulkan.adreno.so
- /vendor/lib64/egl/libGLES_mali.so

for 32bit applications
- /vendor/lib/hw/vulkan.adreno.so
- /vendor/lib/egl/libGLES_mali.so

#### macOS iOS and other APPLE platforms
`dlopen()` search for
- libMoltenVK.dylib
- libvulkan_kosmickrisp.dylib

## Load from driver_path

for advanced developer

sample usage
```cpp
int ret = create_gpu_instance("libvulkan.so");
int ret = create_gpu_instance("/usr/lib64/libvulkan_radeon.so");
int ret = create_gpu_instance("/vendor/lib64/hw/vulkan.adreno.so");
int ret = create_gpu_instance("/data/local/tmp/vulkan.ad07XX.so");
```

## Load from env VK_ICD_FILENAMES

for debug purpose

sample usage
```sh
export VK_ICD_FILENAMES=./vk_swiftshader_icd.json
export VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/lvp_icd.x86_64.json
export VK_ICD_FILENAMES=/etc/vulkan/icd.d/nvidia_icd.json
```

## Load from env NCNN_VULKAN_DRIVER

for debug purpose

sample usage
```sh
export NCNN_VULKAN_DRIVER=/data/local/tmp/vulkan.ad07XX.so
```


================================================
FILE: docs/faq.en.md
================================================


# How to join the technical community QQ groups?

- Open QQ -> click the group chat search-> search group number 637093648, enter the answer to the question: conv conv conv conv conv → join the group chat → ready to accept the Turing test(a joke)
- Open QQ -> search for the Pocky group: 677104663 (lots of experts), the answer to the question: multi level intermediate representation

# How to watch the author's live stream on Bilibili?

- nihui：[水竹院落](https://live.bilibili.com/1264617)

# Compilation

- ## How to download the full source code？

   git clone --recursive https://github.com/Tencent/ncnn/

   or

   download [ncnn-xxxxx-full-source.zip](https://github.com/Tencent/ncnn/releases)

- ## How to cross-compile？How to set the cmake toolchain？

   See https://github.com/Tencent/ncnn/wiki/how-to-build

- ## The submodules were not downloaded! Please update submodules with "git submodule update --init" and try again

   As above, download the full source code. Or follow the prompts to execute: git submodule update --init

- ## Could NOT find Protobuf (missing: Protobuf_INCLUDE_DIR)

   sudo apt-get install libprotobuf-dev protobuf-compiler

- ## Could NOT find CUDA (missing: CUDA_TOOLKIT_ROOT_DIR CUDA_INCLUDE_DIRS CUDA_CUDART_LIBRARY)

   https://github.com/Tencent/ncnn/issues/1873

- ## Could not find a package configuration file provided by "OpenCV" with any of the following names: OpenCVConfig.cmake opencv-config.cmake

   sudo apt-get install libopencv-dev

   or customized compile and install ，with set(OpenCV_DIR {the dir OpenCVConfig.cmake exist})

- ## Could not find a package configuration file provided by "ncnn" with any of the following names: ncnnConfig.cmake ncnn-config.cmake

   set(ncnn_DIR { the dir ncnnConfig.cmake exist})

- ## xxx.lib not found（be specified by system/compiler）

   undefined reference to __kmpc_for_static_init_4 __kmpc_for_static_fini __kmpc_fork_call ...

   Need to link openmp

   undefined reference to glslang::InitializeProcess() glslang::TShader::TShader(EShLanguage) ...

   need glslang.lib glslang-default-resource-limits.lib

   undefined reference to AAssetManager_fromJava AAssetManager_open AAsset_seek ...

   Add android to find_library and target_link_libraries

   find_package(ncnn)

- ## undefined reference to typeinfo for ncnn::Layer

   opencv rtti -> opencv-mobile

- ## undefined reference to __cpu_model

   upgrade compiler / libgcc_s libgcc

- ## unrecognized command line option "-mavx2"

   upgrade gcc

- ## Why is the compiled ncnn-android library so large？

   See https://github.com/Tencent/ncnn/wiki/build-for-android.zh and see How to trim smaller ncnn

- ## ncnnoptimize and custom layer

   Run ncnnoptimize first and add the custom layer afterwards, so that ncnnoptimize does not have to save a model containing custom layers it cannot handle.


- ## rtti/exceptions Conflict

   The reason for the conflict is that the libraries used in the project are configured differently, so analyze whether you need to turn them on or off according to your actual situation. ncnn is ON by default, add the following two parameters when recompiling ncnn.
   - ON: -DNCNN_DISABLE_RTTI=OFF -DNCNN_DISABLE_EXCEPTION=OFF
   - OFF: -DNCNN_DISABLE_RTTI=ON -DNCNN_DISABLE_EXCEPTION=ON


- ## error: undefined symbol: ncnn::Extractor::extract(char const*, ncnn::Mat&)

   Possible scenarios.
   - Try upgrading the NDK version of Android Studio


# How do I add the ncnn library to my project and how does the cmake method work?

Compile ncnn and run make install. On linux/windows, set/export ncnn_DIR so that it points to the directory containing ncnnConfig.cmake under the install directory

- ## android

- ## ios

- ## linux

- ## windows

- ## macos

- ## arm linux


# Convert model issues

- ## caffe

   `./caffe2ncnn caffe.prototxt caffe.caffemodel ncnn.param ncnn.bin`

- ## mxnet

   ` ./mxnet2ncnn mxnet-symbol.json mxnet.params ncnn.param ncnn.bin`

- ## darknet

   [https://github.com/xiangweizeng/darknet2ncnn](https://github.com/xiangweizeng/darknet2ncnn)

- ## pytorch - onnx

   [use ncnn with pytorch or onnx](https://github.com/Tencent/ncnn/wiki/use-ncnn-with-pytorch-or-onnx)

- ## tensorflow 1.x/2.x - keras

   [https://github.com/MarsTechHAN/keras2ncnn](https://github.com/MarsTechHAN/keras2ncnn) **[@MarsTechHAN](https://github.com/MarsTechHAN)**

- ## tensorflow 2.x - mlir

   [Converting tensorflow2 models to ncnn via MLIR](https://zhuanlan.zhihu.com/p/152535430) **@[nihui](https://www.zhihu.com/people/nihui-2)**

- ## netron

   [https://github.com/lutzroeder/netron](https://github.com/lutzroeder/netron)

- ## How to generate a model with fixed shape？

   Input      0=w 1=h 2=c

- ## why gpu can speedup

- ## How to use ncnnoptimize to convert to an fp16 model

   `ncnnoptimize model.param model.bin yolov5s-opt.param yolov5s-opt.bin 65536`

- ## How to use ncnnoptimize to check the FLOPS / memory usage of your model

- ## How to modify the model to support dynamic shape？

   Interp Reshape

- ## How to convert a model into code embedded in a program？

   use ncnn2mem

- ## How to encrypt the model？

   See https://zhuanlan.zhihu.com/p/268327784

- ## Can an ncnn model converted on Linux be used directly on Windows/MacOS/Android/...?

   Yes, for all platforms

- ## How to remove post-processing and export onnx？

   Ref：

   Referring to an article by UP <https://zhuanlan.zhihu.com/p/128974102>, step 3 is to remove the post-processing and then export the onnx, where removing the post-processing can be the result of removing the subsequent steps when testing within the project.

- ## pytorch layers can't export to onnx？

 Method 1:

   ONNX_ATEN_FALLBACK
For a fully custom op, first replace it with one that can be exported (e.g. concat slice), convert to ncnn and then modify the param

 Method 2:

 You can try this with PNNX, see the following article for a general description:

   1. [Windows/Linux/macOS steps for compiling PNNX](https://zhuanlan.zhihu.com/p/431833958)

   2. [Learn in 5 minutes! Converting TorchScript models to ncnn models with PNNX](https://zhuanlan.zhihu.com/p/427512763)

# Using

- ## vkEnumeratePhysicalDevices failed -3

- ## vkCreateInstance failed -9

   Please upgrade your GPU driver if you meet this crash or error.
   Here are the driver download pages for some GPU brands.
   [Intel](https://downloadcenter.intel.com/product/80939/Graphics-Drivers), [AMD](https://www.amd.com/en/support), [Nvidia](https://www.nvidia.com/Download/index.aspx)

- ## ModuleNotFoundError: No module named 'ncnn.ncnn'

   python setup.py develop

- ## fopen nanodet-m.param failed

   path should be working dir

   File not found or not readable. Make sure that XYZ.param/XYZ.bin is accessible.

- ## find_blob_index_by_name data / output / ... failed

   layer name vs blob name

   param.bin use xxx.id.h enum

- ## parse magic failed

- ## param is too old, please regenerate

   The model may have problems

   Your model file is in the old format, converted by an old caffe2ncnn tool.

   Checkout the latest ncnn code, build it and regenerate param and model binary files, and that should work.

   Make sure that your param file starts with the magic number 7767517.

   you may find more info on use-ncnn-with-alexnet

   When adding the softmax layer yourself, you need to add 1=1

- ## set_vulkan_compute failed, network use_vulkan_compute disabled

   Set net.opt.use_vulkan_compute = true before load_param / load_model;

- ## How to execute multiple blob inputs, multiple blob outputs？
   Call `ex.input()` and `ex.extract()` multiple times, as follows
    ```
    ex.input("data1", in_1);
    ex.input("data2", in_2);
    ex.extract("output1", out_1);
    ex.extract("output2", out_2);
    ```
- ## Does calling Extractor extract multiple times repeat the calculation？

   No

- ## How to see the elapsed time for every layer？

   cmake -DNCNN_BENCHMARK=ON ..

- ## How to convert a cv::Mat CV_8UC3 BGR image

   from_pixels to_pixels

- ## How to convert float data to ncnn::Mat

   First of all, you need to manage the memory you request yourself, at this point ncnn::Mat will not automatically free up the float data you pass over to it
   ``` c++
   std::vector<float> testData(60, 1.0); // use std::vector<float> to manage memory requests and releases yourself
   ncnn::Mat in1 = ncnn::Mat(60, (void*)testData.data()).reshape(4, 5, 3); // just pass the pointer to the float data as a void*, and even specify the dimension (up says it's best to use reshape to solve the channel gap)
   float* a = new float[60]; // New a piece of memory yourself, you need to release it later
   ncnn::Mat in2 = ncnn::Mat(60, (void*)a).reshape(4, 5, 3).clone(); // use the same method as above, clone() to transfer data owner
   ```


================================================
FILE: docs/faq.md
================================================


# 如何加入技术交流QQ群？

- 打开QQ→点击群聊搜索→搜索群号637093648→输入问题答案：卷卷卷卷卷→进入群聊→准备接受图灵测试（bushi）
- 前往QQ搜索Pocky群：677104663(超多大佬)，问题答案：multi level intermediate representation

# 如何看作者b站直播？

- nihui的bilibili直播间：[水竹院落](https://live.bilibili.com/1264617)

# 编译

- ## 怎样下载完整源码？

   git clone --recursive https://github.com/Tencent/ncnn/
   
   或者
   
   下载 [ncnn-xxxxx-full-source.zip](https://github.com/Tencent/ncnn/releases)

- ## 怎么交叉编译？cmake 工具链怎么设置啊？
  
   参见 https://github.com/Tencent/ncnn/wiki/how-to-build

- ## The submodules were not downloaded! Please update submodules with "git submodule update --init" and try again

   如上，下载完整源码。或者按提示执行: git submodule update --init

- ## Could NOT find Protobuf (missing: Protobuf_INCLUDE_DIR)
  
   sudo apt-get install libprotobuf-dev protobuf-compiler

- ## Could NOT find CUDA (missing: CUDA_TOOLKIT_ROOT_DIR CUDA_INCLUDE_DIRS CUDA_CUDART_LIBRARY)

   https://github.com/Tencent/ncnn/issues/1873

- ## Could not find a package configuration file provided by "OpenCV" with any of the following names: OpenCVConfig.cmake opencv-config.cmake

   sudo apt-get install libopencv-dev

   或者自行编译安装，set(OpenCV_DIR {OpenCVConfig.cmake所在目录})

- ## Could not find a package configuration file provided by "ncnn" with any of the following names: ncnnConfig.cmake ncnn-config.cmake

   set(ncnn_DIR {ncnnConfig.cmake所在目录})

- ## 找不到库（需要根据系统/编译器指定）

   undefined reference to __kmpc_for_static_init_4 __kmpc_for_static_fini __kmpc_fork_call ...

   需要链接openmp库 

   undefined reference to glslang::InitializeProcess() glslang::TShader::TShader(EShLanguage) ...

   需要 glslang.lib glslang-default-resource-limits.lib

   undefined reference to AAssetManager_fromJava AAssetManager_open AAsset_seek ...

   find_library和target_link_libraries中增加 android 

   find_package(ncnn)

- ## undefined reference to typeinfo for ncnn::Layer

   opencv rtti -> opencv-mobile

- ## undefined reference to __cpu_model

   升级编译器 / libgcc_s libgcc

- ## unrecognized command line option "-mavx2"

   升级 gcc

- ## 为啥自己编译的ncnn android库特别大？

   https://github.com/Tencent/ncnn/wiki/build-for-android.zh 以及见 如何裁剪更小的 ncnn 库

- ## ncnnoptimize和自定义层

   先ncnnoptimize再增加自定义层，避免ncnnoptimize不能处理自定义层保存。


- ## rtti/exceptions冲突

   产生原因是项目工程中使用的库配置不一样导致冲突，根据自己的实际情况分析是需要开启还是关闭。ncnn默认是ON，在重新编译ncnn时增加以下2个参数即可：
   - 开启：-DNCNN_DISABLE_RTTI=OFF -DNCNN_DISABLE_EXCEPTION=OFF
   - 关闭：-DNCNN_DISABLE_RTTI=ON -DNCNN_DISABLE_EXCEPTION=ON


- ## error: undefined symbol: ncnn::Extractor::extract(char const*, ncnn::Mat&)

   可能的情况：
   - 尝试升级 Android Studio 的 NDK 版本

- ## CMake 3.14.0 or higher is required.  You are running version 2.8.12.2
```shell
wget https://github.com/Kitware/CMake/releases/download/v3.18.2/cmake-3.18.2-Linux-x86_64.tar.gz
tar zxvf cmake-3.18.2-Linux-x86_64.tar.gz
mv cmake-3.18.2-Linux-x86_64 /opt/cmake-3.18.2
ln -sf /opt/cmake-3.18.2/bin/* /usr/bin/
```

# 怎样添加ncnn库到项目中？cmake方式怎么用？

编译ncnn，make install。linux/windows set/export ncnn_DIR 指向 install目录下包含ncnnConfig.cmake 的目录

- ## android

- ## ios

- ## linux

- ## windows

- ## macos

- ## arm linux


# 转模型问题

- ## caffe

   `./caffe2ncnn caffe.prototxt caffe.caffemodel ncnn.param ncnn.bin`

- ## mxnet

   ` ./mxnet2ncnn mxnet-symbol.json mxnet.params ncnn.param ncnn.bin`

- ## darknet

   [https://github.com/xiangweizeng/darknet2ncnn](https://github.com/xiangweizeng/darknet2ncnn)

- ## pytorch - onnx

   [use ncnn with pytorch or onnx](https://github.com/Tencent/ncnn/wiki/use-ncnn-with-pytorch-or-onnx)

- ## tensorflow 1.x/2.x - keras

   [https://github.com/MarsTechHAN/keras2ncnn](https://github.com/MarsTechHAN/keras2ncnn) **[@MarsTechHAN](https://github.com/MarsTechHAN)**

- ## tensorflow 2.x - mlir

   [通过MLIR将tensorflow2模型转换到ncnn](https://zhuanlan.zhihu.com/p/152535430) **@[nihui](https://www.zhihu.com/people/nihui-2)**

- ## netron

   [https://github.com/lutzroeder/netron](https://github.com/lutzroeder/netron)

- ## 怎么生成有固定 shape 信息的模型？

   Input      0=w 1=h 2=c

- ## why gpu能更快

- ## ncnnoptimize 怎么转成 fp16 模型

   `ncnnoptimize model.param model.bin yolov5s-opt.param yolov5s-opt.bin 65536`

- ## ncnnoptimize 怎样查看模型的 FLOPS / 内存占用情况

- ## 怎么修改模型支持动态 shape？

   Interp Reshape

- ## 如何将模型转换为代码内嵌到程序里？

   ncnn2mem

- ## 如何加密模型？

   https://zhuanlan.zhihu.com/p/268327784

- ## Linux下转的ncnn模型，Windows/MacOS/Android/.. 也能直接用吗？

   Yes，全平台通用

- ## 如何去掉后处理，再导出 onnx？

   检测：

   参考up的一篇文章<https://zhuanlan.zhihu.com/p/128974102>，步骤三就是去掉后处理,再导出onnx,其中去掉后处理可以是项目内测试时去掉后续步骤的结果。

- ## pytorch 有的层导不出 onnx 怎么办？

 方式一:

   ONNX_ATEN_FALLBACK
完全自定义的op，先改成能导出的（如 concat slice），转到 ncnn 后再修改 param

 方式二：

 可以使用PNNX来试试，参考以下文章大概说明:

   1. [Windows/Linux/macOS 编译 PNNX 步骤](https://zhuanlan.zhihu.com/p/431833958)

   2. [5分钟学会！用 PNNX 转换 TorchScript 模型到 ncnn 模型](https://zhuanlan.zhihu.com/p/427512763)

# 使用

- ## vkEnumeratePhysicalDevices failed -3

- ## vkCreateInstance failed -9

   出现此类问题请先更新GPU驱动。Please upgrade your GPU driver if you encounter this crash or error.
   这里提供了一些品牌的GPU驱动下载网址.We have provided some drivers' download pages here.
   [Intel](https://downloadcenter.intel.com/product/80939/Graphics-Drivers)，[AMD](https://www.amd.com/en/support)，[Nvidia](https://www.nvidia.com/Download/index.aspx)

- ## docker 环境里面 nvidia-smi 能看到显卡也能跑 cuda 却不能跑 vulkan

   因为这个docker环境的nvidia驱动没有安装opengl/vulkan支持

  首先运行 nvidia-smi 查看当前驱动版本

```
NVIDIA-SMI 535.161.07
Driver Version: 535.161.07
CUDA Version: 12.2
```

然后去下载对应版本的NVIDIA驱动，安装用户态驱动文件，跳过内核部分

```
wget https://us.download.nvidia.com/tesla/535.161.07/NVIDIA-Linux-x86_64-535.161.07.run
chmod +x NVIDIA-Linux-x86_64-535.161.07.run
./NVIDIA-Linux-x86_64-535.161.07.run --silent --no-kernel-module
```

安装时会报一些文件权限错误，不用管，安装完成后 vulkan 支持就可用了。最后安装 vulkaninfo 查看gpu信息

```
dnf install vulkan-tools
vulkaninfo
```

- ## ModuleNotFoundError: No module named 'ncnn.ncnn'

   python setup.py develop

- ## fopen nanodet-m.param failed

   文件路径 working dir

   File not found or not readable. Make sure that XYZ.param/XYZ.bin is accessible.

- ## find_blob_index_by_name data / output / ... failed

   layer name vs blob name
   
   param.bin 应该用 xxx.id.h 的枚举

- ## parse magic failed

- ## param is too old, please regenerate

   模型本身有问题

   Your model file is in the old format, converted by an old caffe2ncnn tool.

   Checkout the latest ncnn code, build it and regenerate param and model binary files, and that should work.

   Make sure that your param file starts with the magic number 7767517.

   you may find more info on use-ncnn-with-alexnet
   
   When adding the softmax layer yourself, you need to add 1=1

- ## set_vulkan_compute failed, network use_vulkan_compute disabled

   你应该在 load_param / load_model 之前设置 net.opt.use_vulkan_compute = true;

- ## 多个blob输入，多个blob输出，怎么做？
   多次执行`ex.input()` 和 `ex.extract()`
```
ex.input("data1", in_1);
ex.input("data2", in_2);
ex.extract("output1", out_1);
ex.extract("output2", out_2);
```
- ## Extractor extract 多次会重复计算吗？

   不会

- ## 如何看每一层的耗时？

   cmake -DNCNN_BENCHMARK=ON ..

- ## 如何转换 cv::Mat CV_8UC3 BGR 图片

   from_pixels to_pixels

- ## 如何转换 float 数据为 ncnn::Mat

   首先，自己申请的内存需要自己管理，此时ncnn::Mat不会自动给你释放你传过来的float数据
   ``` c++
   std::vector<float> testData(60, 1.0);                                      // 利用std::vector<float>自己管理内存的申请和释放
   ncnn::Mat in1 = ncnn::Mat(60, (void*)testData.data()).reshape(4, 5, 3);    // 把float数据的指针转成void*传过去即可，甚至还可以指定维度(up说最好使用reshape用来解决channel gap)
   float* a = new float[60];                                                  // 自己new一块内存，后续需要自己释放
   ncnn::Mat in2 = ncnn::Mat(60, (void*)a).reshape(4, 5, 3).clone();          // 使用方法和上面相同，clone() to transfer data owner
   ```

- ## 如何初始化 ncnn::Mat 为全 0

   `mat.fill(0.f);`

- ## 如何查看／获取版本号

   cmake时会打印

   c_api.h ncnn_version()

   自己拼 1.0+yyyymmdd

- ## 如何转换 yuv 数据

   yuv420sp2rgb yuv420sp2rgb_nv12

   **[@metarutaiga](https://github.com/metarutaiga/xxYUV)**

- ## 如何 resize crop rotate 图片

   [efficient roi resize rotate](https://github.com/Tencent/ncnn/wiki/efficient-roi-resize-rotate)

- ## 如何人脸5点对齐

   get_affine_transform

   warpaffine_bilinear_c3

```c
// 计算变换矩阵 并且求逆变换
int type = 0;       // 0->区域外填充为v[0],v[1],v[2], -233->区域外不处理
unsigned int v = 0;
float tm[6];
float tm_inv[6];
// 人脸区域在原图上的坐标和宽高
float src_x = target->det.rect.x / target->det.w * pIveImageU8C3->u32Width;
float src_y = target->det.rect.y / target->det.h * pIveImageU8C3->u32Height;
float src_w = target->det.rect.w / target->det.w * pIveImageU8C3->u32Width;
float src_h = target->det.rect.h / target->det.h * pIveImageU8C3->u32Height;
float point_src[10] = {
src_x + src_w * target->attr.land[0][0], src_y + src_h * target->attr.land[0][1],
src_x + src_w * target->attr.land[1][0], src_y + src_h * target->attr.land[1][1],
src_x + src_w * target->attr.land[2][0], src_y + src_h * target->attr.land[2][1],
src_x + src_w * target->attr.land[3][0], src_y + src_h * target->attr.land[3][1],
src_x + src_w * target->attr.land[4][0], src_y + src_h * target->attr.land[4][1],
};
float point_dst[10] = { // +8 是因为我们处理112*112的图
30.2946f + 8.0f, 51.6963f,
65.5318f + 8.0f, 51.5014f,
48.0252f + 8.0f, 71.7366f,
33.5493f + 8.0f, 92.3655f,
62.7299f + 8.0f, 92.2041f,
};
// 第一种方式：先计算变换再求逆
AffineTrans::get_affine_transform(point_src, point_dst, 5, tm);
AffineTrans::invert_affine_transform(tm, tm_inv);
// 第二种方式：直接拿到求逆的结果
// AffineTrans::get_affine_transform(point_dst, point_src, 5, tm_inv);
// rgb 分离的，所以要单独处理
for(int c = 0; c < 3; c++)
{
    unsigned char* pSrc = malloc(xxx);
    unsigned char* pDst = malloc(xxx);
    ncnn::warpaffine_bilinear_c1(pSrc, SrcWidth, SrcHeight, SrcStride[c], pDst, DstWidth, DstHeight, DstStride[c], tm_inv, type, v);
}
// rgb packed则可以一次处理
ncnn::warpaffine_bilinear_c3(pSrc, SrcWidth, SrcHeight, SrcStride, pDst, DstWidth, DstHeight, DstStride, tm_inv, type, v);
```

- ## 如何获得中间层的blob输出
  
   ncnn::Mat output;
   
   ex.extract("your_blob_name", output);

- ## 为什么我使用GPU，但是GPU占用为0

   windows 10 任务管理器 - 性能选项卡 - GPU - 选择其中一个视图左上角的下拉箭头切换到 Compute_0 / Compute_1 / Cuda

   你还可以安装软件：GPU-Z 

- ## layer XYZ not exists or registered

   Your network contains some operations that are not implemented in ncnn.

   You may implement them as custom layer followed in how-to-implement-custom-layer-step-by-step.

   Or you could simply register them as no-op if you are sure those operations make no sense.

```
class Noop : public ncnn::Layer {};
DEFINE_LAYER_CREATOR(Noop)

net.register_custom_layer("LinearRegressionOutput", Noop_layer_creator);
net.register_custom_layer("MAERegressionOutput", Noop_layer_creator);
```

- ## network graph not ready

   You shall call Net::load_param() first, then Net::load_model().

   This error may also happen when Net::load_param() failed, but was not properly handled.

   For more information about the ncnn model load api, see ncnn-load-model

- ## memory not 32-bit aligned at XYZ

   The pointer passed to Net::load_param() or Net::load_model() is not 32bit aligned.

   In practice, the head pointer of std::vector is not guaranteed to be 32bit aligned.

   you can store your binary buffer in ncnn::Mat structure, its internal memory is aligned.

- ## crash on android with '__kmp_abort_process'

   This usually happens if you bundle multiple shared library with openmp linked

   It is actually an issue of the android ndk https://github.com/android/ndk/issues/1028

   On old android ndk, modify the link flags as

   -Wl,-Bstatic -lomp -Wl,-Bdynamic

   For recent ndk >= 21

   -fstatic-openmp

- ## dlopen failed: library "libomp.so" not found
   Newer android ndk defaults to dynamic openmp runtime

   modify the link flags as

   -fstatic-openmp -fopenmp

- ## crash when freeing a ncnn dynamic library(.dll/.so) built with openMP

   for optimal performance, the openmp threadpool spin waits for about a second prior to shutting down in case more work becomes available.

   If you unload a dynamic library that's in the process of spin-waiting, it will crash in the manner you see (most of the time).

   Just set OMP_WAIT_POLICY=passive in your environment, before calling loadlibrary. or Just wait a few seconds before calling freelibrary.

   You can also use the following method to set environment variables in your code:

   for msvc++:

      SetEnvironmentVariable(_T("OMP_WAIT_POLICY"), _T("passive"));

   for g++:

      setenv("OMP_WAIT_POLICY", "passive", 1)
   
      reference: https://stackoverflow.com/questions/34439956/vc-crash-when-freeing-a-dll-built-with-openmp

# 跑出来的结果对不上

[ncnn-produce-wrong-result](https://github.com/Tencent/ncnn/wiki/FAQ-ncnn-produce-wrong-result)

- ## 如何打印 ncnn::Mat 的值？

```C++
void pretty_print(const ncnn::Mat& m)
{
    for (int q=0; q<m.c; q++)
    {
        const float* ptr = m.channel(q);
        for (int y=0; y<m.h; y++)
        {
            for (int x=0; x<m.w; x++)
            {
                printf("%f ", ptr[x]);
            }
            ptr += m.w;
            printf("\n");
        }
        printf("------------------------\n");
    }
}
```
In Android Studio, `printf` will not work, you can use `__android_log_print` instead. Example :
```C++
#include <android/log.h>  // Don't forget this

void pretty_print(const ncnn::Mat& m)
{
    for (int q=0; q<m.c; q++)
    {
        for (int y=0; y<m.h; y++)
        {
            for (int x=0; x<m.w; x++)
            {
                __android_log_print(ANDROID_LOG_DEBUG,"LOG_TAG","ncnn Mat is : %f", m.channel(q).row(y)[x]);
            }
        }
    }
}
```

- ## 如何可视化 ncnn::Mat 的值？

```
void visualize(const char* title, const ncnn::Mat& m)
{
    std::vector<cv::Mat> normed_feats(m.c);

    for (int i=0; i<m.c; i++)
    {
        cv::Mat tmp(m.h, m.w, CV_32FC1, (void*)(const float*)m.channel(i));

        cv::normalize(tmp, normed_feats[i], 0, 255, cv::NORM_MINMAX, CV_8U);

        cv::cvtColor(normed_feats[i], normed_feats[i], cv::COLOR_GRAY2BGR);

        // check NaN
        for (int y=0; y<m.h; y++)
        {
            const float* tp = tmp.ptr<float>(y);
            uchar* sp = normed_feats[i].ptr<uchar>(y);
            for (int x=0; x<m.w; x++)
            {
                float v = tp[x];
                if (v != v)
                {
                    sp[0] = 0;
                    sp[1] = 0;
                    sp[2] = 255;
                }

                sp += 3;
            }
        }
    }

    int tw = m.w < 10 ? 32 : m.w < 20 ? 16 : m.w < 40 ? 8 : m.w < 80 ? 4 : m.w < 160 ? 2 : 1;
    int th = (m.c - 1) / tw + 1;

    cv::Mat show_map(m.h * th, m.w * tw, CV_8UC3);
    show_map = cv::Scalar(127);

    // tile
    for (int i=0; i<m.c; i++)
    {
        int ty = i / tw;
        int tx = i % tw;

        normed_feats[i].copyTo(show_map(cv::Rect(tx * m.w, ty * m.h, m.w, m.h)));
    }

    cv::resize(show_map, show_map, cv::Size(0,0), 2, 2, cv::INTER_NEAREST);
    cv::imshow(title, show_map);
}
```

- ## 总是输出第一张图的结果

   复用 Extractor？！

- ## 启用fp16时的精度有差异

   net.opt.use_fp16_packed = false;

   net.opt.use_fp16_storage = false;

   net.opt.use_fp16_arithmetic = false;

   [ncnn-produce-wrong-result](https://github.com/Tencent/ncnn/wiki/FAQ-ncnn-produce-wrong-result)


# 如何跑得更快？内存占用更少？库体积更小？

- ## fp32 fp16

- ## 大小核绑定
   ncnn::set_cpu_powersave(int)绑定大核或小核
   注意windows系统不支持绑核。
   ncnn支持不同的模型运行在不同的核心。假设硬件平台有2个大核，4个小核，你想把netA运行在大核，netB运行在小核。
   可以通过std::thread or pthread创建两个线程，运行如下代码：
   0:全部
   1:小核
   2:大核
```
   void thread_1()
   {
      ncnn::set_cpu_powersave(2); // bind to big cores
      netA.opt.num_threads = 2;
   }

   void thread_2()
   {
      ncnn::set_cpu_powersave(1); // bind to little cores
      netB.opt.num_threads = 4;
   }
```

   [openmp-best-practice.zh.md](https://github.com/Tencent/ncnn/blob/master/docs/how-to-use-and-FAQ/openmp-best-practice.zh.md)

- ## 查看 CPU 或 GPU 数量
   get_cpu_count
   
   get_gpu_count

- ## ncnnoptimize

   使用方式一：
    - ./ncnnoptimize ncnn.param ncnn.bin new.param new.bin flag
    <br/>注意这里的flag指的是fp32和fp16，其中0指的是fp32，1指的是fp16

   使用方式二：
    - ./ncnnoptimize ncnn.param ncnn.bin new.param new.bin flag cutstartname cutendname
    <br/>cutstartname：模型截取的起点
     <br/>cutendname：模型截取的终点


- ## 如何使用量化工具？

   [Post Training Quantization Tools](https://github.com/Tencent/ncnn/tree/master/tools/quantize)

- ## 如何设置线程数？

   opt.num_threads

- ## 如何降低CPU占用率？

   net.opt.openmp_blocktime = 0;
   
   OMP_WAIT_POLICY=passive

- ## 如何 batch inference？

```
   int max_batch_size = vkdev->info.compute_queue_count;
   
   ncnn::Mat inputs[1000];
   ncnn::Mat outputs[1000];
   
   #pragma omp parallel for num_threads(max_batch_size)
   for (int i=0; i<1000; i++)
   {
       ncnn::Extractor ex = net1.create_extractor();
       ex.input("data", inputs[i]);
       ex.extract("prob", outputs[i]);
   }
```

   
- ## partial graph inference

   先 extract 分类，判断后，再 extract bbox

- ## 如何启用 bf16s 加速？

```
net.opt.use_packing_layout = true;
net.opt.use_bf16_storage = true;
```

   [用bf16加速ncnn](https://zhuanlan.zhihu.com/p/112564372) **@[nihui](https://www.zhihu.com/people/nihui-2)**

   A53

- ## 如何裁剪更小的 ncnn 库？

   [build-minimal-library](https://github.com/Tencent/ncnn/wiki/build-minimal-library)

- ## net.opt sgemm winograd fp16_storage 各是有什么作用？

   对内存消耗的影响

- ## 如何解决显卡进入节能模式造成的一系列问题？

   nVidia显卡（Intel和AMD估计也有）会在它认为的所谓空闲模式下，自动进入 `节能模式`，显存和核心频率就都会降低。
   
   简单来说就是如果你的计算任务是 `非连续的`，那么可能会让耗时看起来非常 `不均匀`，当期间有运算空闲间隔发生，显卡进入节能模式，则会在下一次冷启动时发生计算耗时远超正常耗时几倍的情况，如下日志所示：

   ```cpp
   //开始播放
   Total: 162ms, Diff: 0ms, GLTex2Mat: 7ms, calc: 152ms, Mat2GLTex: 3ms
   Total: 43ms, Diff: 0ms, GLTex2Mat: 3ms, calc: 35ms, Mat2GLTex: 2ms
   Total: 45ms, Diff: 0ms, GLTex2Mat: 3ms, calc: 37ms, Mat2GLTex: 3ms
   Total: 40ms, Diff: 0ms, GLTex2Mat: 3ms, calc: 32ms, Mat2GLTex: 4ms
   //暂停3秒
   //继续播放
   Total: 190ms, Diff: 0ms, GLTex2Mat: 9ms, calc: 177ms, Mat2GLTex: 3ms
   Total: 134ms, Diff: 0ms, GLTex2Mat: 5ms, calc: 110ms, Mat2GLTex: 18ms
   Total: 40ms, Diff: 0ms, GLTex2Mat: 3ms, calc: 34ms, Mat2GLTex: 2ms
   Total: 42ms, Diff: 0ms, GLTex2Mat: 3ms, calc: 36ms, Mat2GLTex: 2ms
   Total: 47ms, Diff: 0ms, GLTex2Mat: 5ms, calc: 38ms, Mat2GLTex: 3ms
   ...
   ```

   在对时间不敏感的项目上，这个问题没什么大不了的，完全可以忽略，但是有些业务场景上必须精准推估下一帧及其未来几帧的从上传、计算到渲染的耗时情况，则这种现象将会给开发者带来些许困扰。

   ### 3种解决方法
   * 联系显卡厂商，让其更新驱动将你的应用加入到免节能模式的白名单。
     * 优点：你什么都不用改。缺点：沟通困难，很可能显卡厂商根本不理你。
   * [显卡控制面板] - [管理3D设置] - [电源管理模式]，改成：[最高性能优先]。
     * 优点：不用改代码。缺点：如果是部署端是小白用户，需要编写手册手把手教他。
   * 可以空闲（暂停）时定期灌一些心跳计算包的任务进去（放1x1小图）让GPU维持在高性能状态。
     * 优点：需要改代码。缺点：不低碳不环保。

# 白嫖项目

- ## nanodet

# 其他

- ## up主用的什么系统/编辑器/开发环境？

   | 软件类型     |   软件名称  |
   | ------------| ----------- |
   | 系统        | Fedora       |
   | 桌面环境     | KDE         |
   | 编辑器       | Kate        |
   | 画草图       | kolourpaint |
   | 画函数图像   | kmplot      |
   | bilibili直播 |  OBS         |


================================================
FILE: docs/how-to-build/build-mlir2ncnn.md
================================================
# mlir2ncnn

## Compile

**Clone LLVM**
```bash
git clone https://github.com/llvm/llvm-project.git
git -C llvm-project checkout -b mlir <a_working_commit_id>
```
Current working commit id is 74e6030bcbcc8e628f9a99a424342a0c656456f9:
```bash
$ git log

commit 74e6030bcbcc8e628f9a99a424342a0c656456f9 (HEAD -> main, origin/main, origin/HEAD)
Author: Craig Topper <craig.topper@sifive.com>
Date:   Thu Mar 4 22:30:38 2021 -0800

    [TargetLowering] Use HandleSDNodes to prevent nodes from being deleted by recursive calls in getNegatedExpression.
```

It is determined by querying the latest git commit date of the `tools/mlir` directory.


**Compile mlir**
```bash
cd llvm-project
mkdir build
cd build
cmake -G Ninja -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DLLVM_ENABLE_PROJECTS="mlir" -DLLVM_TARGETS_TO_BUILD="" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF ../llvm/
ninja -j8
ninja install
```

**Compile mlir2ncnn**
```bash
cd tools/mlir
mkdir build
cd build
cmake .. -D LLVM_DIR=<path/to/your/llvm_install/lib/cmake/llvm>
make
```

## Usage

**Export `.mlir`**

See https://zhuanlan.zhihu.com/p/152535430


**Usage mlir2ncnn**

```bash
./mlir2ncnn pix2pix.mlir pix2pix.param pix2pix.bin
```


================================================
FILE: docs/how-to-build/how-to-build.md
================================================
### Git clone ncnn repo with submodule

```
git clone https://github.com/Tencent/ncnn.git
cd ncnn
git submodule update --init
```

- [Git clone ncnn repo with submodule](#git-clone-ncnn-repo-with-submodule)
- [Build for Linux](#build-for-linux)
  - [Nvidia Jetson](#nvidia-jetson)
  - [Raspberry Pi](#raspberry-pi)
  - [POWER](#power)
  - [Intel oneAPI](#intel-oneapi)
  - [Cross compile: Riscv-gnu-toolchain](#cross-compile-riscv-gnu-toolchain)
  - [Verification](#verification)
- [Build for Windows x64 using Visual Studio Community 2017](#build-for-windows-x64-using-visual-studio-community-2017)
- [Build for Windows x64 using MinGW-w64](#build-for-windows-x64-using-mingw-w64)
- [Build for Windows XP (x86)](#build-for-windows-xp-x86)
  - [Using MinGW-w64](#using-mingw-w64)
  - [Using Clang](#using-clang)
  - [Using Visual Studio (MSVC)](#using-visual-studio-msvc)
- [Build for macOS](#build-for-macos)
- [Build for ARM Cortex-A family with cross-compiling](#build-for-arm-cortex-a-family-with-cross-compiling)
- [Build for Hisilicon platform with cross-compiling](#build-for-hisilicon-platform-with-cross-compiling)
- [Build for AnyCloud platform with cross-compiling](#build-for-anycloud-platform-with-cross-compiling)
- [Build for Android](#build-for-android)
- [Build for iOS on macOS with xcode](#build-for-ios-on-macos-with-xcode)
- [Build for WebAssembly](#build-for-webassembly)
- [Build for AllWinner D1](#build-for-allwinner-d1)
- [Build for Loongson 2K1000](#build-for-loongson-2k1000)
- [Build for Termux on Android](#build-for-termux-on-android)
- [Build for QNX](#build-for-qnx)
- [Build for Nintendo 3DS Homebrew Launcher](#build-for-nintendo-3ds-homebrew-launcher)
- [Build for HarmonyOS with cross-compiling](#build-for-harmonyos-with-cross-compiling)
- [Build for ESP32 with cross-compiling](#build-for-esp32-with-cross-compiling)

***

### Build for Linux

Install required build dependencies:

* git
* g++
* cmake
* protocol buffer (protobuf) headers files and protobuf compiler
* (optional) LLVM OpenMP header files # If building with Clang, and multithreaded CPU inference is desired
* (optional) opencv  # For building examples

Generally if you have Intel, AMD or Nvidia GPU from last 10 years, Vulkan can be easily used.

On some systems there are no Vulkan drivers easily available at the moment (October 2020), so you might need to disable use of Vulkan on them. This applies to Raspberry Pi 3 (but there is experimental open source Vulkan driver in the works, which is not ready yet). Nvidia Tegra series devices (like Nvidia Jetson) should support Vulkan. Ensure you have most recent software installed for best experience.

On Debian, Ubuntu, or Raspberry Pi OS, you can install all required dependencies using:
```shell
sudo apt install build-essential git cmake libprotobuf-dev protobuf-compiler libomp-dev libopencv-dev
```
On Redhat or Centos, you can install all required dependencies using:
```shell
sudo yum install build-essential git cmake libprotobuf-dev protobuf-compiler libopencv-dev
```

To use Vulkan after building ncnn later, you will also need to have Vulkan driver for your GPU. For AMD and Intel GPUs these can be found in Mesa graphics driver, which usually is installed by default on all distros (i.e. `sudo apt install mesa-vulkan-drivers` on Debian/Ubuntu). For Nvidia GPUs the proprietary Nvidia driver must be downloaded and installed (some distros will allow easier installation in some way). After installing Vulkan driver, confirm Vulkan libraries and driver are working, by using `vulkaninfo` or `vulkaninfo | grep deviceType`, it should list GPU device type. If there are more than one GPU installed (including the case of integrated GPU and discrete GPU, commonly found in laptops), you might need to note the order of devices to use later on.

#### Nvidia Jetson

The Vulkan driver is a default component of the Linux For Tegra BSP release, check [the device list](https://developer.nvidia.com/embedded/vulkan).

```shell
cd ncnn
mkdir -p build
cd build
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../toolchains/jetson.toolchain.cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=ON ..
make -j$(nproc)
```

#### Raspberry Pi

Vulkan drivers do exist, but are not mature. You are free to experiment at your own discretion, and report results and performance.

```shell
cd ncnn
mkdir -p build
cd build
cmake -DCMAKE_BUILD_TYPE=Release -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=ON ..
make -j$(nproc)
```

You can add `-GNinja` to `cmake` above to use Ninja build system (invoke build using `ninja` or `cmake --build .`).

For Raspberry Pi 3 on 32bit OS, add `-DCMAKE_TOOLCHAIN_FILE=../toolchains/pi3.toolchain.cmake` to cmake. You can also consider disabling Vulkan support, as the Vulkan drivers for Raspberry Pi are still not mature; however, it doesn't hurt to build the support in and simply not use it.

#### POWER

For POWER9 with Clang:

```shell
cd ncnn
mkdir -p build
cd build
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../toolchains/power9le-linux-gnu-vsx.clang.toolchain.cmake -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=ON ..
make -j$(nproc)
```

To use GCC instead, use the `power9le-linux-gnu-vsx.toolchain.cmake` toolchain file instead. Note that according to benchmarks, Clang appears to produce noticeably faster CPU inference than GCC for POWER9 targets. For fastest inference, use Clang 18 or higher; earlier versions of Clang may have impaired inference speed due to [Bug 49864](https://github.com/llvm/llvm-project/issues/49864) and [Bug 64664](https://github.com/llvm/llvm-project/issues/64664).

For POWER8 instead of POWER9, use the `power8le-linux-gnu-vsx.clang.toolchain.cmake` or `power8le-linux-gnu-vsx.toolchain.cmake` toolchain file instead. POWER8 will be slower than POWER9.

Note that the POWER toolchain files only support little-endian mode.

#### Intel oneAPI

Besides the prerequisites in this section, Intel oneAPI BaseKit and HPCKit should be installed. They are freely available from https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html and https://www.intel.com/content/www/us/en/developer/tools/oneapi/hpc-toolkit.html.

Intel oneAPI offers two kinds of compilers, the classic `icc/icpc` and the LLVM based `icx/icpx`. To build with these compilers, add `CC=icc CXX=icpc` or `CC=icx CXX=icpx` before the `cmake` command. When compiling with `icc/icpc`, cmake will warn that `xop`, `avx512`, and `bf16` extensions are not supported by the compiler, while `icx/icpx` works well.

Both of these compilers have been tested and passed the ncnn benchmark successfully. The results have been included in ncnn benchmark readme. Generally, `icx/icpx` are likely to show better performance than `icc/icpc` and the quantized models can benefit from the extensions `icx/icpx` supports.

#### Cross compile: Riscv-gnu-toolchain
Before compiling the whole project, toolchain must be installed.
[Reference: Riscv-gnu-toolchain build guide](https://github.com/riscv-collab/riscv-gnu-toolchain/blob/master/README.md)
```shell

# configure with vector extension.
./configure --prefix=/opt/riscv --enable-multilib --with-arch=rv64gcv

# configure without vector extension.
./configure --prefix=/opt/riscv --enable-multilib --with-arch=rv64gc

# it takes quite a long time:(
sudo make linux

```
Now you can build the project:
```shell
mkdir build-riscv
cd build-riscv
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_EXAMPLES=ON ..
make -j$(nproc) # or `make -j2` if your cpu isn't powerful enough.
```

#### Verification

Verify build by running some examples:

```shell
cd ../examples
../build/examples/squeezenet ../images/256-ncnn.png
[0 AMD RADV FIJI (LLVM 10.0.1)]  queueC=1[4]  queueG=0[1]  queueT=0[1]
[0 AMD RADV FIJI (LLVM 10.0.1)]  bugsbn1=0  buglbia=0  bugcopc=0  bugihfa=0
[0 AMD RADV FIJI (LLVM 10.0.1)]  fp16p=1  fp16s=1  fp16a=0  int8s=1  int8a=1
532 = 0.163452
920 = 0.093140
716 = 0.061584
```

You can also run benchmarks (the 4th argument is a GPU device index to use, refer to `vulkaninfo`, if you have more than one GPU):

```shell
cd ../benchmark
../build/benchmark/benchncnn 10 $(nproc) 0 0
[0 AMD RADV FIJI (LLVM 10.0.1)]  queueC=1[4]  queueG=0[1]  queueT=0[1]
[0 AMD RADV FIJI (LLVM 10.0.1)]  bugsbn1=0  buglbia=0  bugcopc=0  bugihfa=0
[0 AMD RADV FIJI (LLVM 10.0.1)]  fp16p=1  fp16s=1  fp16a=0  int8s=1  int8a=1
num_threads = 4
powersave = 0
gpu_device = 0
cooling_down = 1
          squeezenet  min =    4.68  max =    4.99  avg =    4.85
     squeezenet_int8  min =   38.52  max =   66.90  avg =   48.52
...
```

To run benchmarks on a CPU, set the 4th argument (the GPU device index) to `-1`.


***

### Build for Windows x64 using Visual Studio Community 2017

Download and Install Visual Studio Community 2017 from https://visualstudio.microsoft.com/vs/community/

Start the command prompt: `Start → Programs → Visual Studio 2017 → Visual Studio Tools → x64 Native Tools Command Prompt for VS 2017`

> You can also search `x64 Native Tools Command Prompt for VS 2017` directly.

Download protobuf-3.11.2 from https://github.com/google/protobuf/archive/v3.11.2.zip

Build protobuf library:

```shell
cd <protobuf-root-dir>
mkdir protobuf_build
cd protobuf_build
cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
cmake --build . --config Release -j 2
cmake --build . --config Release --target install
```

Build ncnn library (replace `<protobuf-root-dir>` with a proper path):

```shell
cd <ncnn-root-dir>
mkdir -p build
cd build
cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install -Dprotobuf_DIR=<protobuf-root-dir>/protobuf_build/install/cmake -DNCNN_VULKAN=ON ..
cmake --build . --config Release -j 2
cmake --build . --config Release --target install
```

Note: To speed up compilation process on multi core machines, configuring `cmake` to use `jom` or `ninja` using `-G` flag is recommended.

Note: For protobuf >=22.0 (Take v25.3 for example):

Build zlib:
```shell
git clone -b v1.3.1 https://github.com/madler/zlib.git
cd zlib
mkdir build
cd build
cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install ..
cmake --build . --config Release -j 2
cmake --build . --config Release --target install
```

Build protobuf library (replace `<zlib-root-dir>` with a proper path):
```shell
git clone -b v25.3 https://github.com/protocolbuffers/protobuf.git
cd protobuf
git submodule update --init --recursive

mkdir protobuf_build
cd protobuf_build
cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install -DCMAKE_CXX_STANDARD=14 -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF -DZLIB_INCLUDE_DIR=<zlib-root-dir>\build\install\include -DZLIB_LIBRARY=<zlib-root-dir>\build\install\lib\zlib.lib -DABSL_PROPAGATE_CXX_STD=ON ../cmake
cmake --build . --config Release -j 2
cmake --build . --config Release --target install
```

Build ncnn library (replace `<zlib-root-dir>` and `<protobuf-root-dir>` with a proper path):

```shell
cd <ncnn-root-dir>
mkdir -p build
cd build
cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install -DCMAKE_PREFIX_PATH=<protobuf-root-dir>/protobuf_build\install\cmake -DZLIB_INCLUDE_DIR=<zlib-root-dir>\build\install\include -DZLIB_LIBRARY=<zlib-root-dir>\build\install\lib\zlib.lib -Dabsl_DIR=<protobuf-root-dir>/protobuf_build\install\lib\cmake\absl -Dutf8_range_DIR=<protobuf-root-dir>/protobuf_build\install\lib\cmake\utf8_range -DNCNN_VULKAN=ON ..
cmake --build . --config Release -j 2
cmake --build . --config Release --target install
```

***

### Build for Windows x64 using MinGW-w64

Download MinGW-w64 toolchain from [winlibs](https://winlibs.com/) or [w64devkit](https://github.com/skeeto/w64devkit), add `bin` folder to environment variables.

Build ncnn library:

```shell
cd <ncnn-root-dir>
mkdir build
cd build
cmake -DNCNN_VULKAN=ON -G "MinGW Makefiles" ..
cmake --build . --config Release -j 4
cmake --build . --config Release --target install
```

***

### Build for Windows XP (x86)

> **Note:** Windows XP support is provided through collaborative contributions from [@Sugar-Baby](https://github.com/Sugar-Baby) and [@AtomAlpaca](https://github.com/AtomAlpaca).

#### Using MinGW-w64

Download mingw toolchain targeting 32 bit from [sourceforge](https://jaist.dl.sourceforge.net/project/mingw-w64/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/8.1.0/threads-posix/dwarf/i686-8.1.0-release-posix-dwarf-rt_v6-rev0.7z), extract and add environment variable named `MINGW32_ROOT_PATH` valued by `<your-path-to-mingw-root-path>`, and add `<your-path-to-mingw-root-path>/bin` to `PATH`.

```shell
mkdir build
cd build
cmake -DCMAKE_TOOLCHAIN_FILE="../toolchains/windows-xp-mingw.toolchain.cmake" -DNCNN_WINXP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_AVX=OFF .. -G "MinGW Makefiles"
cmake --build . --config Release -j 4
cmake --build . --config Release --target install
```

#### Using Clang

Clang requires libraries from mingw. Configure mingw toolchain targeting 32-bit as described in the [MinGW-w64 section](#using-mingw-w64).

Install Clang 6.0 or later.

```shell
mkdir build
cd build
cmake -DCMAKE_TOOLCHAIN_FILE="../toolchains/windows-xp-clang.toolchain.cmake" -DNCNN_WINXP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_AVX=OFF .. -G "MinGW Makefiles"
cmake --build . --config Release -j 4
cmake --build . --config Release --target install
```

#### Using Visual Studio (MSVC)

Install v141_xp toolset for Windows XP:

1. Bring up the Visual Studio installer (Tools → Get Tools and Features)
2. Select Desktop development with C++
3. Select Windows XP support for C++ from the Summary section
4. Click Modify

```shell
mkdir build
cd build
cmake -A WIN32 -G "Visual Studio 17 2022" -T v141_xp -DNCNN_WINXP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_OPENMP=OFF -DNCNN_AVX=OFF -DNCNN_BUILD_WITH_STATIC_CRT=ON -DCMAKE_TOOLCHAIN_FILE="../toolchains/windows-xp-msvc.toolchain.cmake" ..
cmake --build . --config Release -j 4
cmake --build . --config Release --target install
```

**Note:** The MSVC toolchain uses the `v141_xp` platform toolset for Windows XP compatibility. Vulkan is disabled for XP compatibility, and advanced CPU features (AVX, AVX2, AVX512) are disabled to ensure compatibility with older processors.

***

### Build for macOS

We've published ncnn to [brew](https://formulae.brew.sh/formula/ncnn#default) now, you can just use following method to install ncnn if you have the Xcode Command Line Tools installed.

```shell
brew update
brew install ncnn
```

Or if you want to compile and build ncnn locally, first install Xcode or Xcode Command Line Tools according to your needs.

Then install `protobuf` and `libomp` via homebrew

```shell
brew install protobuf libomp
```

Download and install Vulkan SDK from <https://vulkan.lunarg.com/sdk/home>


```shell
wget https://sdk.lunarg.com/sdk/download/1.3.280.1/mac/vulkansdk-macos-1.3.280.1.dmg -O vulkansdk-macos-1.3.280.1.dmg
hdiutil attach vulkansdk-macos-1.3.280.1.dmg
sudo /Volumes/vulkansdk-macos-1.3.280.1/InstallVulkan.app/Contents/MacOS/InstallVulkan --root `pwd`/vulkansdk-macos-1.3.280.1 --accept-licenses --default-answer --confirm-command install
hdiutil detach /Volumes/vulkansdk-macos-1.3.280.1

# setup env
export VULKAN_SDK=`pwd`/vulkansdk-macos-1.3.280.1/macOS
```

```shell
cd <ncnn-root-dir>
mkdir -p build
cd build

cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DPLATFORM=MAC -DARCHS="x86_64;arm64" \
    -DVulkan_LIBRARY=`pwd`/../vulkansdk-macos-1.3.280.1/macOS/lib/libMoltenVK.dylib \
    -DNCNN_VULKAN=ON -DNCNN_BUILD_EXAMPLES=ON ..

cmake --build . -j 4
cmake --build . --target install
```

*Note: If you encounter `libomp` related errors during installation, you can also check our GitHub Actions at [here](https://github.com/Tencent/ncnn/blob/d91cccf/.github/workflows/macos-x64-gpu.yml#L50-L68) to install and use `openmp`.*
***

### Build for ARM Cortex-A family with cross-compiling
Download ARM toolchain from https://developer.arm.com/open-source/gnu-toolchain/gnu-a/downloads

```shell
export PATH="<your-toolchain-compiler-path>:${PATH}"
```

Alternatively install a cross-compiler provided by the distribution (i.e. on Debian / Ubuntu, you can do `sudo apt install g++-arm-linux-gnueabi g++-arm-linux-gnueabihf g++-aarch64-linux-gnu`).

Depending on your needs build one or more of the below targets.

AArch32 target with soft float (arm-linux-gnueabi)
```shell
cd <ncnn-root-dir>
mkdir -p build-arm-linux-gnueabi
cd build-arm-linux-gnueabi
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake ..
make -j$(nproc)
make install
```

AArch32 target with hard float (arm-linux-gnueabihf)
```shell
cd <ncnn-root-dir>
mkdir -p build-arm-linux-gnueabihf
cd build-arm-linux-gnueabihf
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf.toolchain.cmake ..
make -j$(nproc)
make install
```

AArch64 GNU/Linux target (aarch64-linux-gnu)
```shell
cd <ncnn-root-dir>
mkdir -p build-aarch64-linux-gnu
cd build-aarch64-linux-gnu
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake ..
make -j$(nproc)
make install
```

***

### Build for Hisilicon platform with cross-compiling
Download and install the Hisilicon SDK. The toolchain should be in `/opt/hisi-linux/x86-arm`;
newer versions of the Hisilicon toolchain should be in `/opt/linux/x86-arm/`.

```shell
cd <ncnn-root-dir>
mkdir -p build
cd build

# Choose one cmake toolchain file depends on your target platform
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/hisiv300.toolchain.cmake ..
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/hisiv500.toolchain.cmake ..
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/himix100.toolchain.cmake ..
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/himix200.toolchain.cmake ..
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/himix210.toolchain.cmake ..

make -j$(nproc)
make install
```

***

### Build for AnyCloud platform with cross-compiling
Download and install the AnyCloud SDK, then load its environment setup so that the toolchain is accessible from your shell.

```shell
cd <ncnn-root-dir>
mkdir -p build
cd build

# Choose one cmake toolchain file depends on your target platform
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/anykav500.toolchain.cmake ..

make -j$(nproc)
make install
```

***

### Build for Android
You can use the prebuilt ncnn-android-lib.zip from https://github.com/Tencent/ncnn/releases

Download Android NDK from http://developer.android.com/ndk/downloads/index.html and install it, for example:

```shell
unzip android-ndk-r21d-linux-x86_64.zip
export ANDROID_NDK=<your-ndk-root-path>
```

(optional) remove the hardcoded debug flag in Android NDK [android-ndk issue](https://github.com/android-ndk/ndk/issues/243)
```
# open $ANDROID_NDK/build/cmake/android.toolchain.cmake for ndk < r23
# or $ANDROID_NDK/build/cmake/android-legacy.toolchain.cmake for ndk >= r23
# delete "-g" line
list(APPEND ANDROID_COMPILER_FLAGS
  -g
  -DANDROID
```

Build armv7 library

```shell
cd <ncnn-root-dir>
mkdir -p build-android-armv7
cd build-android-armv7

cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" \
    -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON \
    -DANDROID_PLATFORM=android-14 -DNCNN_VULKAN=ON ..

# If you use cmake >= 3.21 and ndk-r23
# you need to add -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False option for working optimization flags

make -j$(nproc)
make install
```

Pick `build-android-armv7/install` folder for further JNI usage.


Build aarch64 library:

```shell
cd <ncnn-root-dir>
mkdir -p build-android-aarch64
cd build-android-aarch64

cmake -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake"\
    -DANDROID_ABI="arm64-v8a" \
    -DANDROID_PLATFORM=android-21 -DNCNN_VULKAN=ON ..

# If you use cmake >= 3.21 and ndk-r23
# you need to add -DANDROID_USE_LEGACY_TOOLCHAIN_FILE=False option for working optimization flags

make -j$(nproc)
make install
```

Pick `build-android-aarch64/install` folder for further JNI usage.

***

### Build for iOS on macOS with xcode
You can use the prebuilt ncnn.framework, glslang.framework and openmp.framework from https://github.com/Tencent/ncnn/releases

Install xcode

You can replace ```-DENABLE_BITCODE=0``` with ```-DENABLE_BITCODE=1``` in the following cmake arguments if you want to build bitcode enabled libraries.

Download and install openmp for multithreading inference feature on iPhoneOS
```shell
wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz
tar -xf openmp-11.0.0.src.tar.xz
cd openmp-11.0.0.src

# apply some compilation fix
sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S
sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S

mkdir -p build-ios
cd build-ios

cmake -DCMAKE_TOOLCHAIN_FILE=<ncnn-root-dir>/toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \
    -DPLATFORM=OS64 -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DARCHS="arm64;arm64e" \
    -DPERL_EXECUTABLE=/usr/local/bin/perl \
    -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF ..

cmake --build . -j 4
cmake --build . --target install

# copy openmp library and header files to xcode toolchain sysroot
# <xcode-dir> is usually /Applications/Xcode.app or /Applications/Xcode-beta.app depends on your Xcode version
sudo cp install/include/* <xcode-dir>/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/include
sudo cp install/lib/libomp.a <xcode-dir>/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib
```

Download and install openmp for multithreading inference feature on iPhoneSimulator
```shell
wget https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.0/openmp-11.0.0.src.tar.xz
tar -xf openmp-11.0.0.src.tar.xz
cd openmp-11.0.0.src

# apply some compilation fix
sed -i'' -e '/.size __kmp_unnamed_critical_addr/d' runtime/src/z_Linux_asm.S
sed -i'' -e 's/__kmp_unnamed_critical_addr/___kmp_unnamed_critical_addr/g' runtime/src/z_Linux_asm.S

mkdir -p build-ios-sim
cd build-ios-sim

cmake -DCMAKE_TOOLCHAIN_FILE=<ncnn-root-dir>/toolchains/ios.toolchain.cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install \
    -DPLATFORM=SIMULATORARM64 -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 -DARCHS="x86_64;arm64" \
    -DPERL_EXECUTABLE=/usr/local/bin/perl \
    -DLIBOMP_ENABLE_SHARED=OFF -DLIBOMP_OMPT_SUPPORT=OFF -DLIBOMP_USE_HWLOC=OFF ..

cmake --build . -j 4
cmake --build . --target install

# copy openmp library and header files to xcode toolchain sysroot
# <xcode-dir> is usually /Applications/Xcode.app or /Applications/Xcode-beta.app depends on your Xcode version
sudo cp install/include/* <xcode-dir>/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/include
sudo cp install/lib/libomp.a <xcode-dir>/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib
```

Package openmp framework:
```shell
cd <openmp-root-dir>

mkdir -p openmp.framework/Versions/A/Headers
mkdir -p openmp.framework/Versions/A/Resources
ln -s A openmp.framework/Versions/Current
ln -s Versions/Current/Headers openmp.framework/Headers
ln -s Versions/Current/Resources openmp.framework/Resources
ln -s Versions/Current/openmp openmp.framework/openmp
lipo -create build-ios/install/lib/libomp.a build-ios-sim/install/lib/libomp.a -o openmp.framework/Versions/A/openmp
cp -r build-ios/install/include/* openmp.framework/Versions/A/Headers/
sed -e 's/__NAME__/openmp/g' -e 's/__IDENTIFIER__/org.llvm.openmp/g' -e 's/__VERSION__/11.0/g' <ncnn-root-dir>/Info.plist > openmp.framework/Versions/A/Resources/Info.plist
```

Download and install Vulkan SDK from https://vulkan.lunarg.com/sdk/home
```shell
wget https://sdk.lunarg.com/sdk/download/1.2.189.0/mac/vulkansdk-macos-1.2.189.0.dmg?Human=true -O vulkansdk-macos-1.2.189.0.dmg
hdiutil attach vulkansdk-macos-1.2.189.0.dmg
sudo /Volumes/vulkansdk-macos-1.2.189.0/InstallVulkan.app/Contents/MacOS/InstallVulkan --root `pwd`/vulkansdk-macos-1.2.189.0 --accept-licenses --default-answer --confirm-command install
hdiutil detach /Volumes/vulkansdk-macos-1.2.189.0

# setup env
export VULKAN_SDK=`pwd`/vulkansdk-macos-1.2.189.0/macOS
```

Build library for iPhoneOS:

```shell
cd <ncnn-root-dir>
git submodule update --init
mkdir -p build-ios
cd build-ios

cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DPLATFORM=OS64 -DARCHS="arm64;arm64e" \
    -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \
    -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
    -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
    -DOpenMP_libomp_LIBRARY="/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/usr/lib/libomp.a" \
    -DNCNN_VULKAN=ON -DNCNN_BUILD_BENCHMARK=OFF ..

cmake --build . -j 4
cmake --build . --target install
```

Build library for iPhoneSimulator:

```shell
cd <ncnn-root-dir>
mkdir -p build-ios-sim
cd build-ios-sim

cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/ios.toolchain.cmake -DPLATFORM=SIMULATORARM64 -DARCHS="x86_64;arm64" \
    -DENABLE_BITCODE=0 -DENABLE_ARC=0 -DENABLE_VISIBILITY=0 \
    -DOpenMP_C_FLAGS="-Xclang -fopenmp" -DOpenMP_CXX_FLAGS="-Xclang -fopenmp" \
    -DOpenMP_C_LIB_NAMES="libomp" -DOpenMP_CXX_LIB_NAMES="libomp" \
    -DOpenMP_libomp_LIBRARY="/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/usr/lib/libomp.a" \
    -DNCNN_BUILD_BENCHMARK=OFF ..

cmake --build . -j 4
cmake --build . --target install
```

Package glslang framework for iPhoneOS:
```shell
cd <ncnn-root-dir>

mkdir -p glslang.framework/Versions/A/Headers
mkdir -p glslang.framework/Versions/A/Resources
ln -s A glslang.framework/Versions/Current
ln -s Versions/Current/Headers glslang.framework/Headers
ln -s Versions/Current/Resources glslang.framework/Resources
ln -s Versions/Current/glslang glslang.framework/glslang
libtool -static build-ios/install/lib/libglslang.a build-ios/install/lib/libSPIRV.a -o build-ios/install/lib/libglslang_combined.a
lipo -create build-ios/install/lib/libglslang_combined.a -o glslang.framework/Versions/A/glslang
cp -r build-ios/install/include/glslang glslang.framework/Versions/A/Headers/
sed -e 's/__NAME__/glslang/g' -e 's/__IDENTIFIER__/org.khronos.glslang/g' -e 's/__VERSION__/1.0/g' Info.plist > glslang.framework/Versions/A/Resources/Info.plist
```

Package ncnn framework for iPhoneOS:
```shell
cd <ncnn-root-dir>

mkdir -p ncnn.framework/Versions/A/Headers
mkdir -p ncnn.framework/Versions/A/Resources
ln -s A ncnn.framework/Versions/Current
ln -s Versions/Current/Headers ncnn.framework/Headers
ln -s Versions/Current/Resources ncnn.framework/Resources
ln -s Versions/Current/ncnn ncnn.framework/ncnn
lipo -create build-ios/install/lib/libncnn.a -o ncnn.framework/Versions/A/ncnn
cp -r build-ios/install/include/* ncnn.framework/Versions/A/Headers/
sed -e 's/__NAME__/ncnn/g' -e 's/__IDENTIFIER__/com.tencent.ncnn/g' -e 's/__VERSION__/1.0/g' Info.plist > ncnn.framework/Versions/A/Resources/Info.plist
```

Pick `ncnn.framework` `glslang.framework` and `openmp.framework` folder for app development.

***

### Build for WebAssembly

Install Emscripten

```shell
git clone https://github.com/emscripten-core/emsdk.git
cd emsdk
./emsdk install 3.1.28
./emsdk activate 3.1.28

source emsdk_env.sh
```

Build without any extension for general compatibility:
```shell
mkdir -p build
cd build
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake \
    -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \
    -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF ..
cmake --build . -j 4
cmake --build . --target install
```

Build with WASM SIMD extension:
```shell
mkdir -p build-simd
cd build-simd
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake \
    -DNCNN_THREADS=OFF -DNCNN_OPENMP=OFF -DNCNN_SIMPLEOMP=OFF -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \
    -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF ..
cmake --build . -j 4
cmake --build . --target install
```

Build with WASM Thread extension:
```shell
mkdir -p build-threads
cd build-threads
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake \
    -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \
    -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF ..
cmake --build . -j 4
cmake --build . --target install
```

Build with WASM SIMD and Thread extension:
```shell
mkdir -p build-simd-threads
cd build-simd-threads
cmake -DCMAKE_TOOLCHAIN_FILE=$EMSDK/upstream/emscripten/cmake/Modules/Platform/Emscripten.cmake \
    -DNCNN_THREADS=ON -DNCNN_OPENMP=ON -DNCNN_SIMPLEOMP=ON -DNCNN_SIMPLEOCV=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=ON -DNCNN_AVX2=OFF -DNCNN_AVX=OFF \
    -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_BENCHMARK=OFF ..
cmake --build . -j 4
cmake --build . --target install
```

Pick `build-XYZ/install` folder for further usage.

***

### Build for AllWinner D1

Download c906 toolchain package from https://www.xrvm.cn/community/download?id=4453617141140230144

```shell
tar -xf Xuantie-900-gcc-linux-6.6.0-glibc-x86_64-V3.1.0-20250522.tar.gz
export RISCV_ROOT_PATH=/home/nihui/osd/Xuantie-900-gcc-linux-6.6.0-glibc-x86_64-V3.1.0
```

Build ncnn with riscv-v vector and simpleocv enabled:
```shell
mkdir -p build-c906
cd build-c906
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c906-v310.toolchain.cmake \
    -DCMAKE_BUILD_TYPE=release -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=OFF -DNCNN_XTHEADVECTOR=ON -DNCNN_ZFH=ON -DNCNN_ZVFH=OFF \
    -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON ..
cmake --build . -j 4
cmake --build . --target install
```

Pick `build-c906/install` folder for further usage.

You can upload binary inside `build-c906/examples` folder and run on D1 board for testing.

***

### Build for Loongson 2K1000

For gcc version < 8.5, you need to fix the msa.h header to work around the msa fmadd/fmsub/maddv/msubv bug.

Open ```/usr/lib/gcc/mips64el-linux-gnuabi64/8/include/msa.h```, find ```__msa_fmadd``` and ```__msa_fmsub``` and apply changes as the following
```c
// #define __msa_fmadd_w __builtin_msa_fmadd_w
// #define __msa_fmadd_d __builtin_msa_fmadd_d
// #define __msa_fmsub_w __builtin_msa_fmsub_w
// #define __msa_fmsub_d __builtin_msa_fmsub_d
#define __msa_fmadd_w(a, b, c) __builtin_msa_fmadd_w(c, b, a)
#define __msa_fmadd_d(a, b, c) __builtin_msa_fmadd_d(c, b, a)
#define __msa_fmsub_w(a, b, c) __builtin_msa_fmsub_w(c, b, a)
#define __msa_fmsub_d(a, b, c) __builtin_msa_fmsub_d(c, b, a)
```

find ```__msa_maddv``` and ```__msa_msubv``` and apply changes as the following
```c
// #define __msa_maddv_b __builtin_msa_maddv_b
// #define __msa_maddv_h __builtin_msa_maddv_h
// #define __msa_maddv_w __builtin_msa_maddv_w
// #define __msa_maddv_d __builtin_msa_maddv_d
// #define __msa_msubv_b __builtin_msa_msubv_b
// #define __msa_msubv_h __builtin_msa_msubv_h
// #define __msa_msubv_w __builtin_msa_msubv_w
// #define __msa_msubv_d __builtin_msa_msubv_d
#define __msa_maddv_b(a, b, c) __builtin_msa_maddv_b(c, b, a)
#define __msa_maddv_h(a, b, c) __builtin_msa_maddv_h(c, b, a)
#define __msa_maddv_w(a, b, c) __builtin_msa_maddv_w(c, b, a)
#define __msa_maddv_d(a, b, c) __builtin_msa_maddv_d(c, b, a)
#define __msa_msubv_b(a, b, c) __builtin_msa_msubv_b(c, b, a)
#define __msa_msubv_h(a, b, c) __builtin_msa_msubv_h(c, b, a)
#define __msa_msubv_w(a, b, c) __builtin_msa_msubv_w(c, b, a)
#define __msa_msubv_d(a, b, c) __builtin_msa_msubv_d(c, b, a)
```

Build ncnn with mips msa and simpleocv enabled:
```shell
mkdir -p build
cd build
cmake -DNCNN_DISABLE_RTTI=ON -DNCNN_DISABLE_EXCEPTION=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_MSA=ON -DNCNN_MMI=ON -DNCNN_SIMPLEOCV=ON ..
cmake --build . -j 2
cmake --build . --target install
```

Pick `build/install` folder for further usage.

You can run binary inside `build/examples` folder for testing.

***

### Build for Termux on Android

Install the Termux app on your phone, and install Ubuntu in Termux.

If you want to use ssh, just install openssh in Termux.

```shell
pkg install proot-distro
proot-distro install ubuntu
```

or you can see what system can be installed using `proot-distro list`

Once Ubuntu is installed successfully, use `proot-distro login ubuntu` to log in to Ubuntu.

Then build ncnn; there is no need to install any other dependencies.

```shell
git clone https://github.com/Tencent/ncnn.git
cd ncnn
git submodule update --init
mkdir -p build
cd build
cmake -DCMAKE_BUILD_TYPE=Release -DNCNN_BUILD_EXAMPLES=ON -DNCNN_PLATFORM_API=OFF -DNCNN_SIMPLEOCV=ON ..
make -j$(nproc)
```

Then you can run a test

> on my Pixel 3 XL using Qualcomm 845,cant load `256-ncnn.png`

```shell
cd ../examples
../build/examples/squeezenet ../images/128-ncnn.png
```

### Build for QNX

Request license and download SDP from QNX Software Center: https://www.qnx.com/products/everywhere/ .

Setup QNX environment by invoking SDP's bundled script:

on Windows, open cmd and run
```batch
call C:\Users\zz\qnx800\qnxsdp-env.bat
```

on Linux, use /bin/bash and run
```shell
source /home/zz/qnx800/qnxsdp-env.sh
```

If it gives the error `cannot find ld` on Linux, solve it by creating a link file:
```shell
cd ${QNX_HOST}/usr/bin/
ln -s aarch64-unknown-nto-qnx7.1.0-ld ld
```

Build ncnn with cmake in same shell:

```shell
git clone https://github.com/Tencent/ncnn.git
cd ncnn
git submodule update --init
mkdir -p build-qnx
cd build-qnx
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-qnx.toolchain.cmake ..
make -j$(nproc)
make install
```

Pick `build-qnx/install` folder for further usage.

### Build for Nintendo 3DS Homebrew Launcher
Install DevkitPRO toolchains
- If you are working on windows, download DevkitPro installer from [DevkitPro](https://devkitpro.org/wiki/Getting_Started).
- If you are using Ubuntu, the official guidelines from DevkitPro might not work for you. Try using the lines below to install
```shell
sudo apt-get update
sudo apt-get upgrade
wget https://apt.devkitpro.org/install-devkitpro-pacman
chmod +x ./install-devkitpro-pacman
sudo ./install-devkitpro-pacman
```

```shell
export DEVKITPRO=/opt/devkitpro
export DEVKITARM=/opt/devkitpro/devkitARM
export DEVKITPPC=/opt/devkitpro/devkitPPC
export PATH=${DEVKITPRO}/tools/bin:$PATH
source ~/.profile
```
```shell
sudo dkp-pacman -Sy
sudo dkp-pacman -Syu
sudo dkp-pacman -S 3ds-dev
```
Copy the toolchain files from [3DS-cmake](https://github.com/Xtansia/3ds-cmake) (DevkitArm3DS.cmake and the cmake folder) to NCNN's toolchains folder.
```
├── toolchains
│   ├── cmake
│   │   ├── bin2s_header.h.in
│   │   ├── FindCITRO3D.cmake
│   │   ├── FindCTRULIB.cmake
│   │   ├── FindFreetype.cmake
│   │   ├── FindJPEG.cmake
│   │   ├── FindPNG.cmake
│   │   ├── FindSF2D.cmake
│   │   ├── FindSFIL.cmake
│   │   ├── FindSFTD.cmake
│   │   ├── FindZLIB.cmake
│   │   ├── LibFindMacros.cmake
│   │   ├── Tools3DS.cmake
│   │   ├── ToolsGBA.cmake
│   │   └── try_add_imported_target.cmake
│   ├── DevkitArm3DS.cmake
...

```
Build with:
```shell
cd ncnn
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/DevkitArm3DS.cmake -DNCNN_SIMPLEOCV=ON -DNCNN_OPENMP=OFF -DNCNN_VFPV4=OFF ..
make -j4
make install
```
Modify the Makefile in Homebrew example to link and use NCNN in your 3DS Homebrew app.

***

### Build for HarmonyOS with cross-compiling
Download and install HarmonyOS SDK. The sdk installation directory is `/opt/ohos-sdk/linux`

```shell
cd <ncnn-root-dir>
mkdir -p build
cd build

export HM_SDK=/opt/ohos-sdk/linux

# Choose HarmonyOS sdk cmake toolchain file.
# If you want to enable vulkan, set -DNCNN_VULKAN=ON
# The HarmonyOS sdk does not support openmp, use ncnn simpleomp instead.
# Cross-compiling with CMake must use the one provided by the HarmonyOS SDK; otherwise, it won't recognize parameters like OHOS_PLATFORM, leading to compilation errors.
${HM_SDK}/native/build-tools/cmake/bin/cmake -DOHOS_STL=c++_static -DOHOS_ARCH=arm64-v8a -DOHOS_PLATFORM=OHOS -DCMAKE_TOOLCHAIN_FILE=${HM_SDK}/native/build/cmake/ohos.toolchain.cmake -DNCNN_VULKAN=ON -DNCNN_SIMPLEOMP=ON ..

make -j$(nproc)
make install
```

***

### Build for ESP32 with cross-compiling
Download esp-idf sdk
```shell
git clone https://github.com/espressif/esp-idf
cd esp-idf
git submodule update --init --recursive
```
Install esp-idf sdk and configure the environment
```shell
./install.sh
source export.sh
```
And for Windows, you should use:
```bash
install.bat # or `install.ps1`
export.bat
```
Note: python>=3.8, cmake>=3.24.0

Build ncnn library:
```shell
mkdir build-esp32
cd build-esp32
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/esp32.toolchain.cmake -DCMAKE_BUILD_TYPE=Release ..
make -j 4
make install
```
Note: Make sure to compile in esp-idf environment.

The compiled ncnn library and headers can be put to the esp32 project to test.


================================================
FILE: docs/how-to-use-and-FAQ/FAQ-ncnn-produce-wrong-result.md
================================================
### caffemodel should be row-major

`caffe2ncnn` tool assumes the caffemodel is row-major (produced by c++ caffe train command).

The kernel 3x3 weights should be stored as
```
a b c
d e f
g h i
```

However, matlab caffe produced col-major caffemodel.

You have to transpose all the kernel weights by yourself or re-training using c++ caffe train command.

Besides, you may be interested in https://github.com/conanhujinming/matcaffe2caffe

### check input is RGB or BGR

If your caffemodel is trained using c++ caffe and opencv, then the input image should be BGR order.

If your model is trained using matlab caffe or pytorch or mxnet or tensorflow, the input image would probably be RGB order.

The channel order can be changed on-the-fly through proper pixel type enum
```
// construct RGB blob from rgb image
ncnn::Mat in_rgb = ncnn::Mat::from_pixels(rgb_data, ncnn::Mat::PIXEL_RGB, w, h);

// construct BGR blob from bgr image
ncnn::Mat in_bgr = ncnn::Mat::from_pixels(bgr_data, ncnn::Mat::PIXEL_BGR, w, h);

// construct BGR blob from rgb image
ncnn::Mat in_bgr = ncnn::Mat::from_pixels(rgb_data, ncnn::Mat::PIXEL_RGB2BGR, w, h);

// construct RGB blob from bgr image
ncnn::Mat in_rgb = ncnn::Mat::from_pixels(bgr_data, ncnn::Mat::PIXEL_BGR2RGB, w, h);
```


### image decoding

JPEG (`.jpg`, `.jpeg`) is a lossy compression format, so different decoders may produce different pixel values for the same position in the same image.

`.bmp` images are recommended instead.

### interpolation / resizing

There are several image resizing methods, which may generate different result for same input image.

Even if we specify the same interpolation method, different frameworks/libraries and their various versions may still introduce differences.

A good practice is to feed an image of exactly the size the input layer expects, e.g. read a 224x224 bmp image when the input layer needs 224x224 size.


### Mat::from_pixels/from_pixels_resize assume that the pixel data is continuous

You shall pass continuous pixel buffer to from_pixels family.

If your image is an opencv submat from an image roi, call clone() to get a continuous one.
```
cv::Mat image;// the image
cv::Rect facerect;// the face rectangle

cv::Mat faceimage = image(facerect).clone();// get a continuous sub image

ncnn::Mat in = ncnn::Mat::from_pixels(faceimage.data, ncnn::Mat::PIXEL_BGR, faceimage.cols, faceimage.rows);
```

### pre process
Apply pre process according to your training configuration

Different model has different pre process config, you may find the following transform config in Data layer section
```
transform_param {
    mean_value: 103.94
    mean_value: 116.78
    mean_value: 123.68
    scale: 0.017
}
```
Then the corresponding code for ncnn pre process is
```cpp
const float mean_vals[3] = { 103.94f, 116.78f, 123.68f };
const float norm_vals[3] = { 0.017f, 0.017f, 0.017f };
in.substract_mean_normalize(mean_vals, norm_vals);
```

Mean file is not supported currently

So you have to pre process the input data by yourself (use opencv or something)
```
transform_param {
    mean_file: "imagenet_mean.binaryproto"
}
```

For pytorch or mxnet-gluon
```python
transforms.ToTensor(),
transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
```
Then the corresponding code for ncnn pre process is
```cpp
// R' = (R / 255 - 0.485) / 0.229 = (R - 0.485 * 255) / 0.229 / 255
// G' = (G / 255 - 0.456) / 0.224 = (G - 0.456 * 255) / 0.224 / 255
// B' = (B / 255 - 0.406) / 0.225 = (B - 0.406 * 255) / 0.225 / 255
const float mean_vals[3] = {0.485f*255.f, 0.456f*255.f, 0.406f*255.f};
const float norm_vals[3] = {1/0.229f/255.f, 1/0.224f/255.f, 1/0.225f/255.f};
in.substract_mean_normalize(mean_vals, norm_vals);
```

### use the desired blob
The blob names for input and extract differ among models.

For example, squeezenet v1.1 use "data" as input blob and "prob" as output blob while mobilenet-ssd use "data" as input blob and "detection_out" as output blob.

Some models may need multiple input or produce multiple output.

```cpp
ncnn::Extractor ex = net.create_extractor();

ex.input("data", in);// change "data" to yours
ex.input("mask", mask);// change "mask" to yours

ex.extract("output1", out1);// change "output1" to yours
ex.extract("output2", out2);// change "output2" to yours
```

### blob may have channel gap
Each channel pointer is aligned by 128bit in ncnn Mat structure.

A blob may have gaps between channels if (width x height) cannot be divided exactly by 4

Prefer using ncnn::Mat::from_pixels or ncnn::Mat::from_pixels_resize for constructing input blob from image data

If you do need a continuous blob buffer, reshape the output.
```cpp
// out is the output blob extracted
ncnn::Mat flattened_out = out.reshape(out.w * out.h * out.c);

// plain array, C-H-W
const float* outptr = flattened_out;
```

### create new Extractor for each image
The `ncnn::Extractor` object is stateful. If you reuse it for a different input, you will always get exactly the same result that is cached inside.

Always create new Extractor to process images in loop unless you do know how the stateful Extractor works.
```cpp
for (int i=0; i<count; i++)
{
    // always create Extractor
    // it's cheap and almost instantly !
    ncnn::Extractor ex = net.create_extractor();

    // use
    ex.input(your_data[i]);
}
```

### use proper loading api

If you want to load plain param file buffer, you shall use Net::load_param_mem instead of Net::load_param.

For more information about the ncnn model load api, see [ncnn-load-model](ncnn-load-model)

```cpp
ncnn::Net net;

// param_buffer is the content buffer of XYZ.param file
net.load_param_mem(param_buffer);
```


### disable fp16

Some models may overflow fp16, resulting in a nan result.

So try to turn off fp16 lower-precision optimizations, and the precision will be improved to fp32 to investigate and solve the overflow problem caused by this.

You can set it as follows
```cpp
ncnn::Net net;

net.opt.use_fp16_packed = false;
net.opt.use_fp16_storage = false;
net.opt.use_fp16_arithmetic = false;
```

### make data contiguous
If you find that the outputs of pnnx.py and ncnn.py (generated by pnnx) are different, this may be due to non-contiguous data. You can modify ncnn.py as follows and adjust the other code accordingly:
``` python
def test_inference():
    torch.manual_seed(0)
    in0 = torch.rand(1, 3, 224, 224, dtype=torch.float)
    in0 = in0.contiguous()
```


================================================
FILE: docs/how-to-use-and-FAQ/FAQ-ncnn-protobuf-problem.zh.md
================================================
# Protobuf 类问题解决方法

## 问题分析

protobuf 有关的报错，一般都是两个原因：

1. 需要的 pb 没安装/`FindProtobuf.cmake`不存在，最终 `find_package` 失败
2. 系统不止一套 pb，导致 bin/lib/include 三者不匹配

如果你遇到了这些报错，都可以通过本文档解决：

1. Linux 编译 `caffe2ncnn` 时报 `Protobuf not found`
2. 编译 `caffe2ncnn` 时报 protoc 和 protobuf.so 版本不匹配

## （推荐）通用处理办法

这个办法包治百病，**不管什么情况一定生效**

1. 编译下载 protobuf，以 3.20.0 版本为例

```bash
$ wget https://github.com/protocolbuffers/protobuf/releases/download/v3.20.0/protobuf-cpp-3.20.0.tar.gz
$ tar xvf protobuf-cpp-3.20.0.tar.gz
$ cd protobuf-3.20.0/
$ ./configure --prefix=/path/to/install
$ make && make install
```
注意需要 `--prefix`，不要装到系统里。能遇到这些错，说明本来系统环境就有问题，再给系统环境装 lib 就更乱了。

2. 修改 cmake

找到报错的 CMakeLists.txt，在 `find_package` 前插入 protobuf 路径。

```bash
# 加入下面 1 行
list(APPEND CMAKE_PREFIX_PATH "/path/to/install")

find_package(Protobuf REQUIRED)
...
```

3. 调整 cmake 选项

`cmake ..` 时，额外加入选项 `-DProtobuf_PROTOC_EXECUTABLE=/path/to/install/bin/protoc`

```bash
$ cd /path/to/ncnn/build
$ rm -rf CMakeCache.txt
# 加入新选项
$ cmake .. -DProtobuf_PROTOC_EXECUTABLE=/path/to/install/bin/protoc 
$ ...
```

## （不推荐）自己改环境变量

### 一、遇到 `Protobuf not found`

是因为 protobuf 未安装或环境变量未设置

1. 安装 protobuf

Ubuntu 系统尝试以下命令
```bash
$ sudo apt-get install libprotobuf-dev protobuf-compiler
```

CentOS 尝试
```bash
$ sudo yum install protobuf-devel.x86_64 protobuf-compiler.x86_64
```

2. 然后设置 C++ 环境

在 LD_LIBRARY_PATH 增加参数

```bash
$ export LD_LIBRARY_PATH=${YOUR_PROTOBUF_LIB_PATH}:$LD_LIBRARY_PATH
```

### 二、遇到 protoc 和 protobuf.so 版本不匹配

1. 先看 protoc 需要的 so 版本号
```bash
$ ldd `whereis protoc| awk '{print $2}'` | grep libprotobuf.so
```

例如是 libprotobuf.so.10

2. 然后搜这个文件所在的路径
```bash
$ cd / && find . -type f | grep libprotobuf.so.10
```

假设在`/home/user/mydir`

3. 设置 protobuf.so 的搜索目录
```bash
$ export LD_LIBRARY_PATH=/home/user/mydir:$LD_LIBRARY_PATH
```

### 三、行走江湖必备
关于环境变量设置、工具和技巧，强烈建议学习下 https://missing.csail.mit.edu/ 


================================================
FILE: docs/how-to-use-and-FAQ/FAQ-ncnn-throw-error.md
================================================
### param is too old, please regenerate

Your model file is in the old format, converted by an old caffe2ncnn tool.

Checkout the latest ncnn code, build it and regenerate param and model binary files, and that should work.

Make sure that your param file starts with the magic number 7767517.

you may find more info on [use-ncnn-with-alexnet](use-ncnn-with-alexnet)

If the original model is missing, you can try to manually fix the layer specific parameters in param file

1. **Softmax** append `1=1`

before
```
Softmax xxx 1 1 in out ...
```
after
```
Softmax xxx 1 1 in out ... 1=1
```

2. **Reduction** subtract 1 from every axis value (except the leading array count) and append `5=1`

before
```
Reduction xxx 1 1 in out ... -23303=2,2,3 ...
```
after
```
Reduction xxx 1 1 in out ... -23303=2,1,2 ... 5=1
```

### find_blob_index_by_name XYZ failed

That means ncnn couldn't find the XYZ blob in the network. 

You shall call Extractor::input()/extract() by blob name instead of layer name.

For models loaded from binary param file or external memory, you shall call Extractor::input()/extract() by the enum defined in xxx.id.h because all the visible string literals have been stripped in binary form.

This error usually happens when the input layer is not properly converted.

You shall upgrade caffe prototxt/caffemodel before converting it to ncnn. Following snippet type shall be ok. 

```
layer {
  name: "data"
  type: "Input"
  top: "data"
  input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } }
}
```

you may find more info on [use-ncnn-with-alexnet](use-ncnn-with-alexnet).

### layer XYZ not exists or registered

Your network contains some operations that are not implemented in ncnn.

You may implement them as custom layer followed in [how-to-implement-custom-layer-step-by-step](how-to-implement-custom-layer-step-by-step).

Or you could simply register them as no-op if you are sure those operations make no sense.

```cpp
class Noop : public ncnn::Layer {};
DEFINE_LAYER_CREATOR(Noop)

net.register_custom_layer("LinearRegressionOutput", Noop_layer_creator);
net.register_custom_layer("MAERegressionOutput", Noop_layer_creator);
```

### fopen XYZ.param/XYZ.bin failed

File not found or not readable. Make sure that XYZ.param/XYZ.bin is accessible.

### network graph not ready

You shall call Net::load_param() first, then Net::load_model().

This error may also happen when Net::load_param() failed and the failure was not properly handled.

For more information about the ncnn model load api, see [ncnn-load-model](ncnn-load-model)

### memory not 32-bit aligned at XYZ

The pointer passed to Net::load_param() or Net::load_model() is not 32bit aligned.

In practice, the head pointer of std::vector<unsigned char> is not guaranteed to be 32bit aligned.

you can store your binary buffer in ncnn::Mat structure, its internal memory is aligned.

### undefined reference to '__kmpc_XYZ_XYZ'

use clang for building android shared library

comment the following line in your Application.mk
```
NDK_TOOLCHAIN_VERSION := 4.9
```

### crash on android with '__kmp_abort_process'

This usually happens if you bundle multiple shared libraries that each link openmp

It is actually an issue of the android ndk https://github.com/android/ndk/issues/1028

On old android ndk, modify the link flags as

```
-Wl,-Bstatic -lomp -Wl,-Bdynamic
```

For recent ndk >= 21

```
-fstatic-openmp
```

### dlopen failed: library "libomp.so" not found

Newer android ndk defaults to dynamic openmp runtime

modify the link flags as

```
-fstatic-openmp -fopenmp
```

### crash when freeing a ncnn dynamic library(*.dll/*.so) built with openMP

for optimal performance, the openmp threadpool spin waits for about a second prior to shutting down in case more work becomes available. 

If you unload a dynamic library that's in the process of spin-waiting, it will crash in the manner you see (most of the time).

Just set OMP_WAIT_POLICY=passive in your environment before calling LoadLibrary, or wait a few seconds before calling FreeLibrary.

You can also use the following method to set environment variables in your code:

for msvc++:

```
SetEnvironmentVariable(_T("OMP_WAIT_POLICY"), _T("passive"));
```

for g++:

```
setenv("OMP_WAIT_POLICY", "passive", 1)
```

reference: https://stackoverflow.com/questions/34439956/vc-crash-when-freeing-a-dll-built-with-openmp


================================================
FILE: docs/how-to-use-and-FAQ/FAQ-ncnn-vulkan.md
================================================
### how to enable ncnn vulkan capability

follow [the build and install instruction](https://github.com/Tencent/ncnn/blob/master/docs/how-to-build/how-to-build.md)

make sure you have installed vulkan sdk from [lunarg vulkan sdk website](https://vulkan.lunarg.com/sdk/home)

Usually, you can enable the vulkan compute inference feature by adding only one line of code to your application.

```cpp
// enable vulkan compute feature before loading
ncnn::Net net;
net.opt.use_vulkan_compute = 1;
```

### does my graphics device support vulkan

Some platforms have been tested and are known to work. In theory, if your platform supports the vulkan api, either 1.0 or 1.1, it shall work.

* Y = known work
* ? = shall work, not confirmed
* / = not applied

|    |windows|linux|android|mac|ios|
|---|---|---|---|---|---|
|intel|Y|Y|?|?|/|
|amd|Y|Y|/|?|/|
|nvidia|Y|Y|?|/|/|
|qcom|/|/|Y|/|/|
|apple|/|/|/|Y|Y|
|arm|/|?|Y|/|/|

You can search [the vulkan database](https://vulkan.gpuinfo.org) to see if your device supports vulkan.

Some old buggy drivers may produce wrong results; they are blacklisted in ncnn and treated as non-vulkan capable devices.
You can check whether your device and driver have this issue with [my conformance test here](vulkan-conformance-test).
Most of these systems are android with version lower than 8.1.

### why using vulkan over cuda/opencl/metal

In the beginning, I had no GPGPU programming experience, and I had to learn one.

vulkan is considered more portable and well supported by vendors and the cross-platform low-overhead graphics api. As a contrast, cuda is only available on nvidia device, metal is only available on macos and ios, while loading opencl library is banned in android 7.0+ and does not work on ios.

### I got errors like "vkCreateComputePipelines failed -1000012000" or random stalls or crashes

Upgrade your vulkan driver.

[intel https://downloadcenter.intel.com/product/80939/Graphics-Drivers](https://downloadcenter.intel.com/product/80939/Graphics-Drivers)

[amd https://www.amd.com/en/support](https://www.amd.com/en/support)

[nvidia https://www.nvidia.com/Download/index.aspx](https://www.nvidia.com/Download/index.aspx)

### how to use ncnn vulkan on android

minimum android ndk version: android-ndk-r18b

minimum sdk platform api version: android-24

link your jni project with libvulkan.so

[The squeezencnn example](https://github.com/Tencent/ncnn/tree/master/examples/squeezencnn) has gpu inference enabled; you can take it as a reference.

### how to use ncnn vulkan on ios

setup vulkan sdk (https://vulkan.lunarg.com/sdk/home#mac)

metal only works on real device with arm64 cpu (iPhone 5s and later)

link your project with MoltenVK framework and Metal

### what about the layers without vulkan support

These layers have vulkan support currently

AbsVal, BatchNorm, BinaryOp, Cast, Clip, Concat, Convolution, ConvolutionDepthWise, Crop, Deconvolution, DeconvolutionDepthWise, Dropout, Eltwise, Flatten, HardSigmoid, InnerProduct, Interp, LRN, Packing, Padding, Permute, Pooling(pad SAME not supported), PReLU, PriorBox, ReLU, Reorg, Reshape, Scale, ShuffleChannel, Sigmoid, Softmax, TanH, UnaryOp

For these layers without vulkan support, ncnn inference engine will automatically fallback to cpu path.

Thus, it is usually not a serious issue if your network only has some special head layers like SSD or YOLO. All examples in ncnn are known working properly with vulkan enabled.

### my model runs slower on gpu than cpu

The current vulkan inference implementation is far from the preferred state. Many handy optimization techniques are planned, such as winograd convolution, operator fusion, fp16 storage and arithmetic etc.

It is common that your model runs slower on gpu than cpu on arm devices like mobile phones, since we have quite good arm optimization in ncnn ;)

### vulkan device not found / extra high cpu utility while vulkan is enabled on nvidia gpu

Several reasons could lead to this outcome. First, please check your driver status with `nvidia-smi`. If you have correctly installed your driver, you should see something like this:

```bash
$ nvidia-smi
Sat Mar 06 19:53:16 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 451.48       Driver Version: 451.48       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX 1060   WDDM  | 00000000:02:00.0 Off |                  N/A |
| N/A   31C    P8     5W /  N/A |     90MiB /  6144MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
```

If `nvidia-smi` crashes or cannot be found, please reinstall your graphics driver.

If ncnn *is* utilizing the Tesla GPU, you can see your program in the `Processes` block at the bottom. In that case, it's likely that some operators are not yet supported in Vulkan and have fallen back to the CPU, leading to low GPU utilization.

If you *couldn't* find your process running, please check the active driver model, which can be found to the right of your device name. For GeForce and Titan GPUs, the default driver model is WDDM (Windows Display Driver Model), which supports both graphics rendering and computing. But for Tesla GPUs, without configuration, the driver model defaults to TCC ([Tesla Compute Cluster](https://docs.nvidia.com/gameworks/content/developertools/desktop/tesla_compute_cluster.htm)). NVIDIA's TCC driver does not support Vulkan, so you need to use the following command to set the driver model back to WDDM in order to use Vulkan:

```bash
$ nvidia-smi -g 0 -dm 0
```

The number following `-g` is the GPU ID (which can be found to the left of your device name in `nvidia-smi` output); and `-dm` stands for driver model, 0 refers to WDDM and 1 means TCC.


================================================
FILE: docs/how-to-use-and-FAQ/build-minimal-library.md
================================================
For some reason, if you're not happy with the binary size of the ncnn library, then here is the cheatsheet that helps you to build a minimal ncnn :P

### disable c++ rtti and exceptions

```
cmake -DNCNN_DISABLE_RTTI=ON -DNCNN_DISABLE_EXCEPTION=ON ..
```
* Cannot use RTTI and Exceptions when ncnn functions are called.

### disable vulkan support

```
cmake -DNCNN_VULKAN=OFF ..
```

* Cannot use GPU acceleration.

### disable NCNN_STDIO

```
cmake -DNCNN_STDIO=OFF ..
```

* Cannot load model from files, but can load model from memory or by Android Assets.

    Read more [here](https://github.com/Tencent/ncnn/blob/master/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md#load-model).

### disable NCNN_STRING

```
cmake -DNCNN_STRING=OFF ..
```

* Cannot load human-readable param files with visible strings, but can load binary param.bin files.

    Read more [here](https://github.com/Tencent/ncnn/blob/master/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md#strip-visible-string)

* Cannot identify blobs by string name when calling `Extractor::input / extract`, but can identify them by enum value in `id.h`.

    Read more [here](https://github.com/Tencent/ncnn/blob/master/docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md#input-and-output).

### disable NCNN_BF16

```
cmake -DNCNN_BF16=OFF ..
```

* Cannot use bf16 storage type in inference.


### disable NCNN_INT8

```
cmake -DNCNN_INT8=OFF ..
```

* Cannot use quantized int8 inference.


### drop pixel drawing functions

```
cmake -DNCNN_PIXEL_DRAWING=OFF ..
```

* Cannot use functions doing drawing basic shape and text like `ncnn::draw_rectangle_xx / ncnn::draw_circle_xx / ncnn::draw_text_xx`, but functions like `Mat::from_pixels / from_pixels_resize` are still available.


### drop pixel rotate and affine functions

```
cmake -DNCNN_PIXEL_ROTATE=OFF -DNCNN_PIXEL_AFFINE=OFF ..
```

* Cannot use functions doing rotation and affine transformation like `ncnn::kanna_rotate_xx / ncnn::warpaffine_bilinear_xx`, but functions like `Mat::from_pixels / from_pixels_resize` are still available.

### drop pixel functions

```
cmake -DNCNN_PIXEL=OFF ..
```

* Cannot use functions transferring from image to pixels like `Mat::from_pixels / from_pixels_resize / to_pixels / to_pixels_resize`, and need create a Mat and fill in data by hand.

### disable openmp

```
cmake -DNCNN_OPENMP=OFF ..
```

* Cannot use openmp multi-threading acceleration. If you want to run a model in single thread on your target machine, it is recommended to close the option.

### disable avx2 and arm82 optimized kernel

```
cmake -DNCNN_AVX2=OFF -DNCNN_ARM82=OFF ..
```

* Do not compile optimized kernels using avx2 / arm82 instruction set extensions. If your target machine does not support some of them, it is recommended to close the related options.

### disable runtime cpu instruction dispatch

```
cmake -DNCNN_RUNTIME_CPU=OFF ..
```

* Cannot check supported cpu instruction set extensions and use related optimized kernels in runtime.
* If you know which instruction set extensions are supported on your target machine like avx2 / arm82, you can open related options like `-DNCNN_AVX2=ON / -DNCNN_ARM82=ON` by hand and then sse2 / arm8 version kernels will not be compiled.

### drop layers not used

```
cmake -DWITH_LAYER_absval=OFF -DWITH_LAYER_bnll=OFF ..
```

* If your model does not include some layers, taking absval / bnll as an example above, you can drop them.
* Some key or dependency layers should not be dropped, like convolution / innerproduct, their dependency like padding / flatten, and activation like relu / clip.

### disable c++ stl

```
cmake -DNCNN_SIMPLESTL=ON ..
```

* STL provided by compiler is no longer depended on, and use `simplestl` provided by ncnn as a replacement. Users also can only use `simplestl` when ncnn functions are called.
* Usually with compiler parameters `-nodefaultlibs -fno-builtin -nostdinc++ -lc`
* Need cmake parameters `cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake -DANDROID_STL=system` to avoid STL conflict when compiling to Android.

### drop optimized kernel not used

* Modify the source code under `ncnn/src/layer/arm/` to delete unnecessary optimized kernels or replace them with empty functions.
* You can also drop layers and related optimized kernels by `-DWITH_LAYER_absval=OFF` as mentioned above.

### drop operators from BinaryOp UnaryOp

* Modify `ncnn/src/layer/binaryop.cpp unaryop.cpp` and `ncnn/src/layer/arm/binaryop_arm.cpp unaryop_arm.cpp` by hand to delete unnecessary operators.


================================================
FILE: docs/how-to-use-and-FAQ/efficient-roi-resize-rotate.md
================================================

### image roi crop + convert to ncnn::Mat

```
+--------------+
|   y          |           /-------/
| x +-------+  |          +-------+|
|   |     roih |im_h  =>  |      roih
|   +-roiw--+  |          +-roiw--+/
|              |
+-----im_w-----+
```
```cpp
ncnn::Mat in = ncnn::Mat::from_pixels_roi(im.data, ncnn::Mat::PIXEL_RGB, im_w, im_h, x, y, roiw, roih);
```
For Android Application, it is :
```cpp
ncnn::Mat in = ncnn::Mat::from_android_bitmap_roi(env, image, ncnn::Mat::PIXEL_RGBA2RGB, x, y, roiw, roih);
```

### image roi crop + resize + convert to ncnn::Mat

```
+--------------+
|   y          |           /----/
| x +-------+  |          +----+|
|   |     roih |im_h  =>  |  target_h
|   +-roiw--+  |          |    ||
|              |          +----+/
+-----im_w-----+         target_w
```
```cpp
ncnn::Mat in = ncnn::Mat::from_pixels_roi_resize(im.data, ncnn::Mat::PIXEL_RGB, im_w, im_h, x, y, roiw, roih, target_w, target_h);
```
For Android Application, it is :
```cpp
ncnn::Mat in = ncnn::Mat::from_android_bitmap_roi_resize(env, image, ncnn::Mat::PIXEL_RGBA2RGB, x, y, roiw, roih, target_w, target_h);
```

### ncnn::Mat export image + offset paste

```
                +--------------+
 /-------/      |   y          |
+-------+|      | x +-------+  |
|       h|  =>  |   |       h  |im_h
+---w---+/      |   +---w---+  |
                |              |
                +-----im_w-----+
```
```cpp
unsigned char* data = im.data + (y * im_w + x) * 3;
out.to_pixels(data, ncnn::Mat::PIXEL_RGB, im_w * 3);
```

### ncnn::Mat export image + resize + roi paste

```
            +--------------+
 /----/     |   y          |
+----+|     | x +-------+  |
|    h| =>  |   |      roih|im_h
|    ||     |   +-roiw--+  |
+-w--+/     |              |
            +-----im_w-----+
```
```cpp
unsigned char* data = im.data + (y * im_w + x) * 3;
out.to_pixels_resize(data, ncnn::Mat::PIXEL_RGB, roiw, roih, im_w * 3);
```

### image roi crop + resize
```
+--------------+
|   y          |
| x +-------+  |          +----+
|   |      roih|im_h  =>  |  target_h
|   +-roiw--+  |          |    |
|              |          +----+
+-----im_w-----+         target_w
```
```cpp
const unsigned char* data = im.data + (y * im_w + x) * 3;
ncnn::resize_bilinear_c3(data, roiw, roih, im_w * 3, outdata, target_w, target_h, target_w * 3);
```

### image resize + offset paste
```
            +--------------+
            |   y          |
+----+      | x +-------+  |
|    h  =>  |   |     roih |im_h
|    |      |   +-roiw--+  |
+-w--+      |              |
            +-----im_w-----+
```
```cpp
unsigned char* outdata = im.data + (y * im_w + x) * 3;
ncnn::resize_bilinear_c3(data, w, h, w * 3, outdata, roiw, roih, im_w * 3);
```

### image roi crop + resize + roi paste
```
+--------------+         +-----------------+
|   y          |         |  roiy           |
| x +-------+  |         |roix----------+  |
|   |       h  |im_h  => |   |     target_h|outim_h
|   +---w---+  |         |   |          |  |
|              |         |   +-target_w-+  |
+-----im_w-----+         +-----outim_w-----+
```
```cpp
const unsigned char* data = im.data + (y * im_w + x) * 3;
unsigned char* outdata = outim.data + (roiy * outim_w + roix) * 3;
ncnn::resize_bilinear_c3(data, w, h, im_w * 3, outdata, target_w, target_h, outim_w * 3);
```

### image roi crop + rotate
```
+--------------+
|   y          |
| x +-------+  |          +---+
|   |  < <  h  |im_h  =>  | ^ |w
|   +---w---+  |          | ^ |
|              |          +---+
+-----im_w-----+            h
```
```cpp
const unsigned char* data = im.data + (y * im_w + x) * 3;
ncnn::kanna_rotate_c3(data, w, h, im_w * 3, outdata, h, w, h * 3, 6);
```

### image rotate + offset paste
```
             +--------------+
             |   y          |
 +---+       | x +-------+  |
 | ^ |h  =>  |   |  < <  w  |im_h
 | ^ |       |   +---h---+  |
 +---+       |              |
   w         +-----im_w-----+
```
```cpp
unsigned char* outdata = im.data + (y * im_w + x) * 3;
ncnn::kanna_rotate_c3(data, w, h, w * 3, outdata, h, w, im_w * 3, 7);
```

### image roi crop + rotate + roi paste
```
+--------------+         +-----------------+
|   y          |         |        roiy     |
| x +-------+  |         |   roix  +---+   |
|   |  < <  h  |im_h  => |         | ^ w   |outim_h
|   +---w---+  |         |         | ^ |   |
|              |         |         +-h-+   |
+-----im_w-----+         +-----outim_w-----+
```
```cpp
const unsigned char* data = im.data + (y * im_w + x) * 3;
unsigned char* outdata = outim.data + (roiy * outim_w + roix) * 3;
ncnn::kanna_rotate_c3(data, w, h, im_w * 3, outdata, h, w, outim_w * 3, 6);
```


================================================
FILE: docs/how-to-use-and-FAQ/ncnn-load-model.md
================================================
### the comprehensive model loading api table

|load from|alexnet.param|alexnet.param.bin|alexnet.bin|
|---|---|---|---|
|file path|load_param(const char*)|load_param_bin(const char*)|load_model(const char*)|
|file path<br/>(wchar_t for windows)|load_param(const wchar_t*)|load_param_bin(const wchar_t*)|load_model(const wchar_t*)|
|file descriptor|load_param(FILE*)|load_param_bin(FILE*)|load_model(FILE*)|
|file memory|load_param_mem(const char*)|load_param(const unsigned char*)|load_model(const unsigned char*)|
|android asset|load_param(AAsset*)|load_param_bin(AAsset*)|load_model(AAsset*)|
|android asset path|load_param(AAssetManager*, const char*)|load_param_bin(AAssetManager*, const char*)|load_model(AAssetManager*, const char*)|
|custom IO reader|load_param(const DataReader&)|load_param_bin(const DataReader&)|load_model(const DataReader&)|

### points to note

1. Either of the following combinations is enough for loading a model
    * alexnet.param + alexnet.bin
    * alexnet.param.bin + alexnet.bin

2. Never modify Net opt member after loading

3. Most loading functions return 0 if success, except loading alexnet.param.bin and alexnet.bin from file memory, which returns the bytes consumed after loading
    * size_t Net::load_param(const unsigned char*)
    * size_t Net::load_model(const unsigned char*)

4. It is recommended to load model from Android asset directly to avoid copying them to sdcard on Android platform

5. The custom IO reader interface can be used to implement on-the-fly model decryption and loading


================================================
FILE: docs/how-to-use-and-FAQ/openmp-best-practice.md
================================================
ncnn openmp best practice

### CPU loadaverage is too high with ncnn.

   When running neural network inference with ncnn, the cpu occupancy is very high, with the occupancy of all CPU cores even close to 100%.

   If there are other threads or processes that require more cpu resources, the running speed of the program will drop severely.

### The root cause of high CPU usage

1. ncnn uses the openmp API to speed up inference computation. By default, the thread count equals the cpu core count. If the computing work needs to run frequently, it inevitably consumes a lot of cpu resources.

2. There is a thread pool managed by openmp, and the pool size is equal to the cpu core count. (the max value is 15 if there are many more cpu cores?)
   Openmp needs to synchronize when acquiring threads from and returning threads to the pool. In order to improve efficiency, almost all omp implementations use spinlock synchronization (except for simpleomp).
   The default spin time of the spinlock is 200ms. So after a thread is scheduled, the thread needs to busy-wait up to 200ms.

### Why the CPU usage is still high even using vulkan GPU acceleration.

1. Openmp is also used when loading the param bin file, and this part runs on cpu.

2. The fp32/fp16 conversion before GPU memory upload and after download is executed on the cpu, and this part of the logic also uses openmp.

### Solution
```
1. Bind to the specific cpu core.
```
   If you use a device with big and little CPU cores, it is recommended to bind to the big or little cores through ncnn::set_cpu_powersave(int). Note that Windows does not support binding cores. By the way, it's possible to have multiple thread pools using openmp. A new thread pool will be created for a new thread scope.
Suppose your platform has 2 big cores + 4 little cores, and you want to execute model A on the 2 big cores and model B on the 4 little cores concurrently.

create two threads via std::thread or pthread
   ```
   void thread_1()
   {
      ncnn::set_cpu_powersave(2); // bind to big cores
      netA.opt.num_threads = 2;
   }

   void thread_2()
   {
      ncnn::set_cpu_powersave(1); // bind to little cores
      netB.opt.num_threads = 4;
   }
   ```
   
```
2. Use fewer threads.
```
   Set the number of threads to half of the cpu cores count or less through ncnn::set_omp_num_threads(int)  or change net.opt.num_threads field. If you are coding with clang libomp, it's recommended that the number of threads does not exceed 8. If you use other omp libraries, it is recommended that the number of threads does not exceed 4.
```
3. Reduce openmp spinlock blocktime.
```
   You can modify the openmp blocktime by calling the ncnn::set_kmp_blocktime(int) method or by modifying the net.opt.openmp_blocktime field.
   This argument is the spin time set by the ncnn API, and the default is 20ms. You can set a smaller value according to
   the situation, or directly change it to 0.

   Limitations: At present, this is only implemented for the libomp library of clang. Neither vcomp nor libgomp has a corresponding interface.
   If the program is not compiled with clang, this value is still 200ms by default.
   If you use vcomp or libgomp, you can use the environment variable OMP_WAIT_POLICY=PASSIVE to disable spin time. If you use simpleomp,
   there is no need to set this parameter.
```
4. Limit the number of threads available in the openmp thread pool.
```
   Even if the number of openmp threads is reduced, the CPU occupancy rate may still be high. This is more common on servers with
   particularly many CPU cores. 
   This is because the waiting threads in the thread pool use a spinlock to busy-wait, which can be reduced by limiting the number of
   threads available in the thread pool.

   Generally, you can set the OMP_THREAD_LIMIT environment variable. simpleomp currently does not support this feature, so there is no need to set it.
   Note that this environment variable is only valid if it is set before the program starts.
```
5. Disable openmp completely
```
   If there is only one cpu core, or if you use vulkan gpu acceleration, it is recommended to disable openmp; just specify -DNCNN_OPENMP=OFF
   when compiling with cmake.

================================================
FILE: docs/how-to-use-and-FAQ/openmp-best-practice.zh.md
================================================
ncnn openmp 最佳实践

### ncnn占用过多cpu资源

   使用ncnn推理运算，cpu占用非常高甚至所有核心占用都接近100%。

   如果还有其它线程或进程需要较多的cpu资源，运行速度下降严重。

### cpu占用高的根本原因

1. ncnn使用openmp API控制多线程加速推理计算。默认情况下，线程数等于cpu内核数。如果推理需要高频率运行，必然占用大部分
   cpu资源。

2. openmp内部维护一个线程池，线程池最大可用线程数等于cpu内核数。(核心过多时最大限制是15？）获取和归还线程时需要同步。

   为了提高效率，几乎所有omp实现都使用了自旋锁同步(simpleomp除外)。自旋锁默认的spin time是200ms。因此一个线程被调度后，
   需要忙等待最多200ms。

### 为什么使用vulkan加速后cpu占用依然很高。

1. 加载参数文件时也使用了openmp，这部分是在cpu上运行的。

2. 显存上传前和下载后的 fp32 fp16转换是在cpu上执行的，这部分逻辑也使用了openmp。

### 解决方法

```
1. 绑核
```
   如果使用有大小核cpu的设备，建议通过ncnn::set_cpu_powersave(int)绑定大核或小核，注意windows系统不支持绑核。顺便说一下，ncnn支持不同的模型运行在不同的核心。假设硬件平台有2个大核，4个小核，你想把netA运行在大核，netB运行在小核。
   可以通过std::thread or pthread创建两个线程，运行如下代码：
   
   ```
   void thread_1()
   {
      ncnn::set_cpu_powersave(2); // bind to big cores
      netA.opt.num_threads = 2;
   }

   void thread_2()
   {
      ncnn::set_cpu_powersave(1); // bind to little cores
      netB.opt.num_threads = 4;
   }
   ```

```
2. 使用更少的线程数。
```
   通过ncnn::set_omp_num_threads(int)或者net.opt.num_threads字段设置线程数为cpu内核数的一半或更小。如果使用clang的libomp，
   建议线程数不超过8，如果使用其它omp库，建议线程数不超过4。
```
3. 减小openmp blocktime。
```
   可以修改ncnn::set_kmp_blocktime(int)或者修改net.opt.openmp_blocktime，这个参数是ncnn API设置的spin time，默认是20ms。
   可以根据情况设置更小的值，或者直接改为0。

   局限：目前只有clang的libomp库有实现，vcomp和libgomp都没有相应接口，如果不是使用clang编译的，这个值默认还是200ms。
   如果使用vcomp或libgomp, 可以使用环境变量OMP_WAIT_POLICY=PASSIVE禁用spin time，如果使用simpleomp,不需要设置这个参数。
```
4. 限制openmp线程池可用线程数量。
```
   即使减小了openmp线程数量，cpu占用率仍然可能会很高。这在cpu核心特别多的服务器上比较常见。这是因为线程池中的等待线程使用
   自旋锁忙等待，可以通过限制线程池可用线程数量减轻这种影响。

   一般可以通过设置OMP_THREAD_LIMIT环境变量。simpleomp目前不支持这一特性，不需要设置。注意这个环境变量仅在程序启动前设置才有效。
```
5. 完全禁用openmp
```
   如果只有一个cpu核心，或者使用vulkan加速，建议关闭openmp, cmake编译时指定-DNCNN_OPENMP=OFF即可。

================================================
FILE: docs/how-to-use-and-FAQ/quantized-int8-inference.md
================================================
# Post Training Quantization Tools

To support int8 model deployment on mobile devices, we provide universal post-training quantization tools which can convert a float32 model to an int8 model.

## User Guide

Example with mobilenet, just need three steps.

### 1. Optimize model

NOTE: **If your model is converted via pnnx, skip this step.**

```shell
./ncnnoptimize mobilenet.param mobilenet.bin mobilenet-opt.param mobilenet-opt.bin 0
```

### 2. Create the calibration table file

#### 2.1 From image

We suggest using the validation dataset, with more than 5000 images, for calibration.

Some imagenet sample images here https://github.com/nihui/imagenet-sample-images

```shell
find images/ -type f > imagelist.txt
./ncnn2table mobilenet-opt.param mobilenet-opt.bin imagelist.txt mobilenet.table mean=[104,117,123] norm=[0.017,0.017,0.017] shape=[224,224,3] pixel=BGR thread=8 method=kl
```

* mean and norm are the values you passed to ```Mat::substract_mean_normalize()```
* shape is the blob shape of your model, [w,h] or [w,h,c]

>
    * if w and h both are given, image will be resized to exactly that size.
    * if w and h both are zero or negative, image will not be resized.
    * if only h is zero or negative, image will be scaled so that its width equals w, keeping aspect ratio.
    * if only w is zero or negative, image will be scaled so that its height equals h, keeping aspect ratio.

* pixel is the pixel format of your model, image pixels will be converted to this type before ```Extractor::input()```
* thread is the CPU thread count that could be used for parallel inference
* method is the post training quantization algorithm, kl and aciq are currently supported

If your model has multiple input nodes, you can use multiple list files and other parameters

```shell
./ncnn2table mobilenet-opt.param mobilenet-opt.bin imagelist-bgr.txt,imagelist-depth.txt mobilenet.table mean=[104,117,123],[128] norm=[0.017,0.017,0.017],[0.0078125] shape=[224,224,3],[224,224,1] pixel=BGR,GRAY thread=8 method=kl
```

#### 2.2 From npy

We suggest using the validation (development) set for calibration.

Use the same preprocessing as for the training set to get the input vectors. With batchsize=1, store each input vector as an npy file, so that n inputs correspond to n npy files; the stored vectors must have the batch dimension removed.


test net, shape is in NCHW format, but there's no `N`.
```txt
in0, shape=[512]
in1, shape=[2, 1, 64]
in2, shape=[2, 1, 64]
```

filelist_in0.txt
```txt
0_in0.npy
1_in0.npy
2_in0.npy
...
```

filelist_in1.txt
```txt
0_in1.npy
1_in1.npy
2_in1.npy
...
```

filelist_in2.txt
```txt
0_in2.npy
1_in2.npy
2_in2.npy
...
```

```shell
./ncnn2table test.param test.bin filelist_in0.txt,filelist_in1.txt,filelist_in2.txt test.table shape=[512],[64,1,2],[64,1,2] thread=8 method=kl type=1
```
**Here shape is WHC, because of the order of the arguments to `ncnn::Mat`.**

### 3. Quantize model

```shell
./ncnn2int8 mobilenet-opt.param mobilenet-opt.bin mobilenet-int8.param mobilenet-int8.bin mobilenet.table
```

If you don’t need static quantization, ncnn supports RNN/LSTM/GRU dynamic quantization. In this case, you can omit the table file.

```shell
./ncnn2int8 rnn-model.param rnn-model.bin rnn-model-int8.param rnn-model-int8.bin
```

## use ncnn int8 inference

the ncnn library will use int8 inference automatically; nothing needs to change in your code

```cpp
ncnn::Net mobilenet;
mobilenet.load_param("mobilenet-int8.param");
mobilenet.load_model("mobilenet-int8.bin");
```

## mixed precision inference

Before quantizing your model, comment out the layer's weight scale line in the table file; the layer will then run float32 inference

```
conv1_param_0 156.639840536
```

```
#conv1_param_0 156.639840536
```


================================================
FILE: docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.md
================================================
We use alexnet as an example

### prepare caffe prototxt and model

These files will usually be generated when training with caffe
```
train.prototxt
deploy.prototxt
snapshot_10000.caffemodel
```
deploy.prototxt and caffemodel file are enough for TEST phase

alexnet deploy.prototxt can be downloaded here

https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet

alexnet caffemodel can be downloaded here

http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel

### convert to ncnn model

Convert old caffe prototxt and caffemodel to new ones using tools in caffe

because the ncnn convert tool needs the new format
```
upgrade_net_proto_text [old prototxt] [new prototxt]
upgrade_net_proto_binary [old caffemodel] [new caffemodel]
```

Use Input layer as input, set N dim as 1 since only one image can be processed each time
```
layer {
  name: "data"
  type: "Input"
  top: "data"
  input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } }
}
```
Use caffe2ncnn tool to convert caffe model to ncnn model
```
caffe2ncnn deploy.prototxt bvlc_alexnet.caffemodel alexnet.param alexnet.bin
```

### strip visible string

The param and bin files alone are already enough for deployment, but the param file contains visible strings, and it may not be suitable to distribute plain neural network information in your APP.

You can use ncnn2mem tool to convert plain model file to binary representation. It will generate alexnet.param.bin and two static array code files.
```
ncnn2mem alexnet.param alexnet.bin alexnet.id.h alexnet.mem.h
```

### load model

Load param and bin file, the easy way
```cpp
ncnn::Net net;
net.load_param("alexnet.param");
net.load_model("alexnet.bin");
```
Load binary param.bin and bin file, no visible strings included, suitable for bundling as an APP resource
```cpp
ncnn::Net net;
net.load_param_bin("alexnet.param.bin");
net.load_model("alexnet.bin");
```
Load network and model from external memory, no visible strings included, no external resource files bundled, the whole model is hardcoded in your program

You may use this way to load from android asset resource
```cpp
#include "alexnet.mem.h"
ncnn::Net net;
net.load_param(alexnet_param_bin);
net.load_model(alexnet_bin);
```
You can choose any of these ways to load the model. Loading from external memory is zero-copy, which means you must keep your memory buffer alive during processing

### unload model
```cpp
net.clear();
```

### input and output

ncnn Mat is the data structure for input and output data

Input image should be converted to Mat, and subtracted mean values and normalized when needed

```cpp
#include "mat.h"
unsigned char* rgbdata;// data pointer to RGB image pixels
int w;// image width
int h;// image height
ncnn::Mat in = ncnn::Mat::from_pixels(rgbdata, ncnn::Mat::PIXEL_RGB, w, h);

const float mean_vals[3] = {104.f, 117.f, 123.f};
in.substract_mean_normalize(mean_vals, 0);
```
Execute the network inference and retrieve the result
```cpp
#include "net.h"
ncnn::Mat in;// input blob as above
ncnn::Mat out;
ncnn::Extractor ex = net.create_extractor();
ex.input("data", in);
ex.extract("prob", out);
```
If you load model with binary param.bin file, you should use the enum value in alexnet.id.h file instead of the blob name
```cpp
#include "net.h"
#include "alexnet.id.h"
ncnn::Mat in;// input blob as above
ncnn::Mat out;
ncnn::Extractor ex = net.create_extractor();
ex.input(alexnet_param_id::BLOB_data, in);
ex.extract(alexnet_param_id::BLOB_prob, out);
```
Read the data in the output Mat. Iterate data to get all classification scores.
```cpp
ncnn::Mat out_flatterned = out.reshape(out.w * out.h * out.c);
std::vector<float> scores;
scores.resize(out_flatterned.w);
for (int j=0; j<out_flatterned.w; j++)
{
    scores[j] = out_flatterned[j];
}
```

### some tricks

Convert image colorspace and resize image with Mat convenient function, these functions are well optimized

Support RGB2GRAY GRAY2RGB RGB2BGR etc, support scale up and scale down
```cpp
#include "mat.h"
unsigned char* rgbdata;// data pointer to RGB image pixels
int w;// image width
int h;// image height
int target_width = 227;// target resized width
int target_height = 227;// target resized height
ncnn::Mat in = ncnn::Mat::from_pixels_resize(rgbdata, ncnn::Mat::PIXEL_RGB2GRAY, w, h, target_width, target_height);
```
You can concat multiple model files into one, and load this single file from FILE* interface.

It should ease the distribution of param and model files.

> $ cat alexnet.param.bin alexnet.bin > alexnet-all.bin

```cpp
#include "net.h"
FILE* fp = fopen("alexnet-all.bin", "rb");
net.load_param_bin(fp);
net.load_model(fp);
fclose(fp);
```


================================================
FILE: docs/how-to-use-and-FAQ/use-ncnn-with-alexnet.zh.md
================================================
首先，非常感谢大家对 ncnn 组件的关注
为了方便大家使用 ncnn 组件，up主特意写了这篇使用指北，以烂大街的 alexnet 作为例子


### 准备caffe网络和模型

caffe 的网络和模型通常是搞深度学习的研究者训练出来的，一般来说训练完会有
```
train.prototxt
deploy.prototxt
snapshot_10000.caffemodel
```
部署的时候只需要 TEST 过程，所以有 deploy.prototxt 和 caffemodel 就足够了

alexnet 的 deploy.prototxt 可以在这里下载
https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet

alexnet 的 caffemodel 可以在这里下载
http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel

### 转换ncnn网络和模型

caffe 自带了工具可以把老版本的 caffe 网络和模型转换为新版（ncnn的工具只认识新版
```
upgrade_net_proto_text [老prototxt] [新prototxt]
upgrade_net_proto_binary [老caffemodel] [新caffemodel]
```
输入层改用 Input，因为每次只需要做一个图片，所以第一个 dim 设为 1
```
layer {
  name: "data"
  type: "Input"
  top: "data"
  input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } }
}
```
使用 caffe2ncnn 工具转换为 ncnn 的网络描述和模型
```
caffe2ncnn deploy.prototxt bvlc_alexnet.caffemodel alexnet.param alexnet.bin
```
### 去除可见字符串

有 param 和 bin 文件其实已经可以用了，但是 param 描述文件是明文的，如果放在 APP 分发出去容易被窥探到网络结构（说得好像不明文就看不到一样
使用 ncnn2mem 工具转换为二进制描述文件和内存模型，生成 alexnet.param.bin 和两个静态数组的代码文件
```
ncnn2mem alexnet.param alexnet.bin alexnet.id.h alexnet.mem.h
```
### 加载模型

直接加载 param 和 bin，适合快速验证效果使用
```cpp
ncnn::Net net;
net.load_param("alexnet.param");
net.load_model("alexnet.bin");
```
加载二进制的 param.bin 和 bin，没有可见字符串，适合 APP 分发模型资源
```cpp
ncnn::Net net;
net.load_param_bin("alexnet.param.bin");
net.load_model("alexnet.bin");
```
从内存引用加载网络和模型，没有可见字符串，模型数据全在代码里头，没有任何外部文件
另外，android apk 打包的资源文件读出来也是内存块
```cpp
#include "alexnet.mem.h"
ncnn::Net net;
net.load_param(alexnet_param_bin);
net.load_model(alexnet_bin);
```
以上三种都可以加载模型，其中内存引用方式加载是 zero-copy 的，所以使用 net 模型的来源内存块必须存在

### 卸载模型
```cpp
net.clear();
```

### 输入和输出

ncnn 用自己的数据结构 Mat 来存放输入和输出数据
输入图像的数据要转换为 Mat，依需要减去均值和乘系数
```cpp
#include "mat.h"
unsigned char* rgbdata;// data pointer to RGB image pixels
int w;// image width
int h;// image height
ncnn::Mat in = ncnn::Mat::from_pixels(rgbdata, ncnn::Mat::PIXEL_RGB, w, h);

const float mean_vals[3] = {104.f, 117.f, 123.f};
in.substract_mean_normalize(mean_vals, 0);
```
执行前向网络，获得计算结果
```cpp
#include "net.h"
ncnn::Mat in;// input blob as above
ncnn::Mat out;
ncnn::Extractor ex = net.create_extractor();
ex.input("data", in);
ex.extract("prob", out);
```
如果是二进制的 param.bin 方式，没有可见字符串，利用 alexnet.id.h 的枚举来代替 blob 的名字
```cpp
#include "net.h"
#include "alexnet.id.h"
ncnn::Mat in;// input blob as above
ncnn::Mat out;
ncnn::Extractor ex = net.create_extractor();
ex.input(alexnet_param_id::BLOB_data, in);
ex.extract(alexnet_param_id::BLOB_prob, out);
```
获取 Mat 中的输出数据，Mat 内部的数据通常是三维的，c / h / w，遍历所有获得全部分类的分数
```cpp
ncnn::Mat out_flatterned = out.reshape(out.w * out.h * out.c);
std::vector<float> scores;
scores.resize(out_flatterned.w);
for (int j=0; j<out_flatterned.w; j++)
{
    scores[j] = out_flatterned[j];
}
```
### 某些使用技巧

Mat 转换图像的时候可以顺便转换颜色和缩放大小，这些顺带的操作也是有优化的
支持 RGB2GRAY GRAY2RGB RGB2BGR 等常用转换，支持缩小和放大
```cpp
#include "mat.h"
unsigned char* rgbdata;// data pointer to RGB image pixels
int w;// image width
int h;// image height
int target_width = 227;// target resized width
int target_height = 227;// target resized height
ncnn::Mat in = ncnn::Mat::from_pixels_resize(rgbdata, ncnn::Mat::PIXEL_RGB2GRAY, w, h, target_width, target_height);
```
Net 有从 FILE* 文件描述加载的接口，可以利用这点把多个网络和模型文件合并为一个，分发时能方便些，内存引用就无所谓了

> $ cat alexnet.param.bin alexnet.bin > alexnet-all.bin

```cpp
#include "net.h"
FILE* fp = fopen("alexnet-all.bin", "rb");
net.load_param_bin(fp);
net.load_model(fp);
fclose(fp);
```


================================================
FILE: docs/how-to-use-and-FAQ/use-ncnn-with-opencv.md
================================================
### opencv to ncnn

* cv::Mat CV_8UC3 -> ncnn::Mat 3 channel + swap RGB/BGR

```cpp
// cv::Mat a(h, w, CV_8UC3);
ncnn::Mat in = ncnn::Mat::from_pixels(a.data, ncnn::Mat::PIXEL_BGR2RGB, a.cols, a.rows);
```

* cv::Mat CV_8UC3 -> ncnn::Mat 3 channel + keep RGB/BGR order

```cpp
// cv::Mat a(h, w, CV_8UC3);
ncnn::Mat in = ncnn::Mat::from_pixels(a.data, ncnn::Mat::PIXEL_RGB, a.cols, a.rows);
```

* cv::Mat CV_8UC3 -> ncnn::Mat 1 channel + do RGB2GRAY/BGR2GRAY

```cpp
// cv::Mat rgb(h, w, CV_8UC3);
ncnn::Mat inrgb = ncnn::Mat::from_pixels(rgb.data, ncnn::Mat::PIXEL_RGB2GRAY, rgb.cols, rgb.rows);

// cv::Mat bgr(h, w, CV_8UC3);
ncnn::Mat inbgr = ncnn::Mat::from_pixels(bgr.data, ncnn::Mat::PIXEL_BGR2GRAY, bgr.cols, bgr.rows);
```

* cv::Mat CV_8UC1 -> ncnn::Mat 1 channel

```cpp
// cv::Mat a(h, w, CV_8UC1);
ncnn::Mat in = ncnn::Mat::from_pixels(a.data, ncnn::Mat::PIXEL_GRAY, a.cols, a.rows);
```

* cv::Mat CV_32FC1 -> ncnn::Mat 1 channel

  * **You could construct ncnn::Mat and fill data into it directly to avoid data copy**

```cpp
// cv::Mat a(h, w, CV_32FC1);
ncnn::Mat in(a.cols, a.rows, 1, (void*)a.data);
in = in.clone();
```

* cv::Mat CV_32FC3 -> ncnn::Mat 3 channel

  * **You could construct ncnn::Mat and fill data into it directly to avoid data copy**

```cpp
// cv::Mat a(h, w, CV_32FC3);
ncnn::Mat in_pack3(a.cols, a.rows, 1, (void*)a.data, (size_t)4u * 3, 3);
ncnn::Mat in;
ncnn::convert_packing(in_pack3, in, 1);
```

* std::vector < cv::Mat > + CV_32FC1 -> ncnn::Mat multiple channels

  * **You could construct ncnn::Mat and fill data into it directly to avoid data copy**

```cpp
// std::vector<cv::Mat> a(channels, cv::Mat(h, w, CV_32FC1));
int channels = a.size();
ncnn::Mat in(a[0].cols, a[0].rows, channels);
for (int p=0; p<in.c; p++)
{
    memcpy(in.channel(p), (const uchar*)a[p].data, in.w * in.h * sizeof(float));
}
```

### ncnn to opencv

* ncnn::Mat 3 channel -> cv::Mat CV_8UC3 + swap RGB/BGR

  * **You may need to call in.substract_mean_normalize() first to scale values from 0..1 to 0..255**

```cpp
// ncnn::Mat in(w, h, 3);
cv::Mat a(in.h, in.w, CV_8UC3);
in.to_pixels(a.data, ncnn::Mat::PIXEL_BGR2RGB);
```

* ncnn::Mat 3 channel -> cv::Mat CV_8UC3 + keep RGB/BGR order

  * **You may need to call in.substract_mean_normalize() first to scale values from 0..1 to 0..255**

```cpp
// ncnn::Mat in(w, h, 3);
cv::Mat a(in.h, in.w, CV_8UC3);
in.to_pixels(a.data, ncnn::Mat::PIXEL_RGB);
```

* ncnn::Mat 1 channel -> cv::Mat CV_8UC1

  * **You may need to call in.substract_mean_normalize() first to scale values from 0..1 to 0..255**

```cpp
// ncnn::Mat in(w, h, 1);
cv::Mat a(in.h, in.w, CV_8UC1);
in.to_pixels(a.data, ncnn::Mat::PIXEL_GRAY);
```

* ncnn::Mat 1 channel -> cv::Mat CV_32FC1

  * **You could consume or manipulate ncnn::Mat data directly to avoid data copy**

```cpp
// ncnn::Mat in;
cv::Mat a(in.h, in.w, CV_32FC1);
memcpy((uchar*)a.data, in.data, in.w * in.h * sizeof(float));
```

* ncnn::Mat 3 channel -> cv::Mat CV_32FC3

  * **You could consume or manipulate ncnn::Mat data directly to avoid data copy**

```cpp
// ncnn::Mat in(w, h, 3);
ncnn::Mat in_pack3;
ncnn::convert_packing(in, in_pack3, 3);
cv::Mat a(in.h, in.w, CV_32FC3);
memcpy((uchar*)a.data, in_pack3.data, in.w * in.h * 3 * sizeof(float));
```

* ncnn::Mat multiple channels -> std::vector < cv::Mat > + CV_32FC1

  * **You could consume or manipulate ncnn::Mat data directly to avoid data copy**

```cpp
// ncnn::Mat in(w, h, channels);
std::vector<cv::Mat> a(in.c);
for (int p=0; p<in.c; p++)
{
    a[p] = cv::Mat(in.h, in.w, CV_32FC1);
    memcpy((uchar*)a[p].data, in.channel(p), in.w * in.h * sizeof(float));
}
```


================================================
FILE: docs/how-to-use-and-FAQ/use-ncnn-with-own-project.md
================================================
### use ncnn with own project

After building ncnn, one or more library files are generated. To integrate ncnn into your own project, you may use the cmake config file provided by ncnn's installation, or manually specify the library path(s).

**with cmake**

Ensure your project is built by cmake. Then in your project's CMakeLists.txt, add these lines:

```cmake
set(ncnn_DIR "<ncnn_install_dir>/lib/cmake/ncnn" CACHE PATH "Directory that contains ncnnConfig.cmake")
find_package(ncnn REQUIRED)
target_link_libraries(my_target ncnn)
```
After this, both the header file search path ("including directories") and library paths are configured automatically, including vulkan related dependencies.

Note: you have to change `<ncnn_install_dir>` to your machine's directory, it is the directory that contains `ncnnConfig.cmake`.

For the prebuilt ncnn release packages, ncnnConfig is located in:
- for `ncnn-YYYYMMDD-windows-vs2019`, it is `lib/cmake/ncnn`
- for `ncnn-YYYYMMDD-android-vulkan`, it is `${ANDROID_ABI}/lib/cmake/ncnn` (`${ANDROID_ABI}` is defined in NDK's cmake toolchain file)
- other prebuilt release packages are with similar condition

**manually specify**

You may also manually specify ncnn library path and including directory. Note that if you use ncnn with vulkan, it is also required to specify vulkan related dependencies.

For example, on Visual Studio debug mode with vulkan required, the lib paths are:
```
E:\github\ncnn\build\vs2019-x64\install\lib\ncnnd.lib
E:\github\ncnn\build\vs2019-x64\install\lib\glslangd.lib
```
And for its release mode, lib paths are:
```
E:\github\ncnn\build\vs2019-x64\install\lib\ncnn.lib
E:\github\ncnn\build\vs2019-x64\install\lib\glslang.lib
```


================================================
FILE: docs/how-to-use-and-FAQ/use-ncnn-with-pytorch-or-onnx.md
================================================
# A Guide to Converting pytorch / onnx Models to ncnn

This guide is designed to help pytorch and onnx users use the new-generation model conversion tool, **pnnx**, to efficiently and reliably convert models to the ncnn format for high-performance inference on the edge.

This document is written and revised based on the **official pnnx documentation**.

* pnnx project: https://github.com/pnnx/pnnx
* ncnn project: https://github.com/Tencent/ncnn
* supported pytorch operators: https://github.com/Tencent/ncnn/tree/master/tools/pnnx#supported-pytorch-operator-status
* supported onnx operators: https://github.com/Tencent/ncnn/tree/master/tools/pnnx#supported-onnx-operator-status

---

## Why is pnnx Highly Recommended?

Regardless of which framework you come from, pnnx offers significant advantages over traditional tools (like `onnx2ncnn`):

*   **Forget the Hassles of onnx**: The traditional `pytorch -> onnx -> ncnn` pipeline often fails due to onnx operator compatibility issues and dynamic shape problems. pnnx can convert directly from pytorch, completely bypassing the unstable intermediate step of onnx.
*   **Core Framework Support**: pnnx focuses on supporting **pytorch** and **onnx**, providing you with a unified and consistent conversion experience.
*   **More Stable and Powerful**: pnnx can handle a wider range of modern operators and complex model architectures, generating cleaner and more accurate ncnn graphs.
*   **Active and Continuous Development**: pnnx is under active development, constantly adding support for the latest operators and features from both source frameworks and the ncnn engine.
*   **Richer Graph Information**: pnnx preserves the original model's structural information during the conversion process, which is highly beneficial for model analysis and subsequent optimization.

---

## Workflow 1: Guide for pytorch Users (Recommended)

For pytorch users, converting directly from a pytorch model is the most stable and efficient path.

### Method A: Direct Conversion in Python with `pnnx.export` (Most Recommended)

This is the simplest and most recommended workflow, allowing you to complete the model conversion with a single command without leaving your Python environment.

#### 1. Install pnnx

First, install the pnnx Python package. This command installs both the `pnnx` Python library and the `pnnx` command-line tool.

```bash
pip3 install pnnx
```

#### 2. Call `pnnx.export` in Your Python Script

Calling the `pnnx.export` function will generate both a TorchScript (`.pt`) file and the `.param` and `.bin` files required by ncnn.

**Complete Code Example:**

```python
import torch
import torch.nn as nn
import pnnx

# 1. Define or load your pytorch model
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, 3, 1, 1)
        self.relu = nn.ReLU()
        self.fc = nn.Linear(16 * 224 * 224, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

# 2. Instantiate the model and set it to evaluation mode
model = MyModel()
model.eval()

# 3. Create a dummy input tensor with the correct input shape
input_tensor = torch.rand(1, 3, 224, 224)

# 4. Call pnnx.export to export the model
pnnx.export(model, "my_model.pt", (input_tensor,))

print("Conversion complete!")
print("Please check for the generated my_model.pt, my_model.ncnn.param, and my_model.ncnn.bin files.")
```

### Method B: Using the Command-Line Tool (Alternative)

#### 1. Get the pnnx Command-Line Tool

If you have already run `pip install pnnx`, the `pnnx` command is available, and you can proceed to the next step.

For non-Python environments or users who prefer a standalone executable, you can manually download the latest binary from the [pnnx Releases page](https://github.com/pnnx/pnnx/releases).

#### 2. Export to TorchScript (Skip if you already have a .pt file)

```python
import torch
# ... (model definition from above)
model = MyModel()
model.eval()
input_tensor = torch.rand(1, 3, 224, 224)
traced_script_module = torch.jit.trace(model, input_tensor)
traced_script_module.save("my_model.pt")
```

#### 3. Run the pnnx Command for Conversion

Run the following command in your terminal.

```bash
# Syntax: pnnx <torchscript_model_path>
pnnx my_model.pt
```

---

## Workflow 2: Guide for onnx Users

For users who already have an `.onnx` file, please use pnnx for conversion.

### 1. Get the pnnx Command-Line Tool

*   **Method 1 (Recommended):** If you have Python in your environment, install it directly via pip.
    ```bash
    pip3 install pnnx
    ```
    The `pnnx` command will be automatically added to your system's path.

*   **Method 2 (Alternative):** For non-Python environments or to use a standalone program, you can download the latest executable from the [pnnx Releases page](https://github.com/pnnx/pnnx/releases).

### 2. Run the Command-Line Conversion

Open a terminal, navigate to the directory containing your model file, and run the following command.

**Basic Command Example:**

```bash
# Syntax: pnnx <onnx_model_path>
pnnx my_model.onnx
```
After the command executes successfully, you will get the `my_model.ncnn.param` and `my_model.ncnn.bin` files, which can be directly loaded and used in your ncnn project.


================================================
FILE: docs/how-to-use-and-FAQ/use-ncnnoptimize-to-optimize-model.md
================================================

the typical usage
```
ncnnoptimize mobilenet.param mobilenet.bin mobilenet-opt.param mobilenet-opt.bin 65536 
```

operator fusion
* batchnorm - scale
* convolution - batchnorm
* convolutiondepthwise - batchnorm
* deconvolution - batchnorm
* deconvolutiondepthwise - batchnorm
* innerproduct - batchnorm
* convolution - relu
* convolutiondepthwise - relu
* deconvolution - relu
* deconvolutiondepthwise - relu
* innerproduct - relu

eliminate noop operator
* innerproduct - dropout
* flatten after global pooling

prefer better operator
* replace convolution with innerproduct after global pooling


================================================
FILE: docs/how-to-use-and-FAQ/vulkan-notes.md
================================================
## supported platform

* Y = known work
* ? = shall work, not confirmed
* / = not applied

|    |windows|linux|android|mac|ios|
|---|---|---|---|---|---|
|intel|Y|Y|Y|Y|/|
|amd|Y|Y|/|Y|/|
|nvidia|Y|Y|?|/|/|
|qcom|/|/|Y|/|/|
|apple|/|/|/|Y|Y|
|arm|/|Y|Y|/|/|

## enable vulkan compute support
```
$ cmake -DNCNN_VULKAN=ON ..
```

## enable vulkan compute inference
```cpp
ncnn::Net net;
net.opt.use_vulkan_compute = 1;
```

## proper allocator usage
```cpp
ncnn::VkAllocator* blob_vkallocator = vkdev.acquire_blob_allocator();
ncnn::VkAllocator* staging_vkallocator = vkdev.acquire_staging_allocator();

net.opt.blob_vkallocator = blob_vkallocator;
net.opt.workspace_vkallocator = blob_vkallocator;
net.opt.staging_vkallocator = staging_vkallocator;

// ....

// after inference
vkdev.reclaim_blob_allocator(blob_vkallocator);
vkdev.reclaim_staging_allocator(staging_vkallocator);
```

## select gpu device
```cpp
// get gpu count
int gpu_count = ncnn::get_gpu_count();

// set specified vulkan device before loading param and model
net.set_vulkan_device(0); // use device-0
net.set_vulkan_device(1); // use device-1

// or set opt.vulkan_device_index field before loading param and model
net.opt.vulkan_device_index = 0; // use device-0
net.opt.vulkan_device_index = 1; // use device-1
```

## zero-copy on unified memory device
```cpp
ncnn::VkMat blob_gpu;
ncnn::Mat mapped = blob_gpu.mapped();

// use mapped.data directly
```

## hybrid cpu/gpu inference
```cpp
ncnn::Net net_cpu;
ncnn::Net net_gpu;
net_cpu.opt.use_vulkan_compute = false;
net_gpu.opt.use_vulkan_compute = true;
net_cpu.load_param();
net_cpu.load_model();
net_gpu.load_param();
net_gpu.load_model();

ncnn::Extractor ex_cpu = net_cpu.create_extractor();
ncnn::Extractor ex_gpu = net_gpu.create_extractor();

#pragma omp parallel sections
{
    #pragma omp section
    {
        ex_cpu.input();
        ex_cpu.extract();
    }
    #pragma omp section
    {
        ex_gpu.input();
        ex_gpu.extract();
    }
}
```

## zero-copy gpu inference chaining
```cpp
ncnn::Extractor ex1 = net1.create_extractor();
ncnn::Extractor ex2 = net2.create_extractor();

ncnn::VkCompute cmd(&vkdev);

ncnn::VkMat conv1;
ncnn::VkMat conv2;
ncnn::VkMat conv3;

ex1.input("conv1", conv1);
ex1.extract("conv2", conv2, cmd);

ex2.input("conv2", conv2);
ex2.extract("conv3", conv3, cmd);

cmd.submit_and_wait();
```

## batch inference
```cpp
int max_batch_size = vkdev->info.compute_queue_count();

ncnn::Mat inputs[1000];
ncnn::Mat outputs[1000];

#pragma omp parallel for num_threads(max_batch_size)
for (int i=0; i<1000; i++)
{
    ncnn::Extractor ex = net1.create_extractor();
    ex.input("data", inputs[i]);
    ex.extract("prob", outputs[i]);
}
```

## control storage and arithmetic precision

disable all lower-precision optimizations, get full fp32 precision

```cpp
ncnn::Net net;
net.opt.use_fp16_packed = false;
net.opt.use_fp16_storage = false;
net.opt.use_fp16_arithmetic = false;
net.opt.use_int8_storage = false;
net.opt.use_int8_arithmetic = false;
```

## debugging tips
```cpp
#define ENABLE_VALIDATION_LAYER 1 // modify to 1 in gpu.cpp
```

## add vulkan compute support to layer
1. add vulkan shader in src/layer/shader/

2. upload model weight data in Layer::upload_model()

3. setup pipeline in Layer::create_pipeline()

4. destroy pipeline in Layer::destroy_pipeline()

5. record command in Layer::forward()

## add optimized shader path
1. add vulkan shader in src/layer/shader/ named XXX_abc.comp

2. create pipeline with "XXX_abc"

3. record command using XXX_abc pipeline

## low-level op api
1. create layer

2. load param and load model

3. upload model

4. create pipeline

5. new command

6. record

7. submit and wait


================================================
FILE: examples/CMakeLists.txt
================================================
# ncnn_add_example(<name>)
#
# Declares an executable target <name> built from <name>.cpp and links it
# against ncnn. When OpenCV_FOUND is set the target also gets the OpenCV include
# directories and libraries; otherwise, when NCNN_SIMPLEOCV is set, the target is
# compiled with USE_NCNN_SIMPLEOCV defined so the sources can switch to
# simpleocv.h. If neither condition holds, the target is created without any
# link libraries.
macro(ncnn_add_example name)
    add_executable(${name} ${name}.cpp)
    if(OpenCV_FOUND)
        target_include_directories(${name} PRIVATE ${OpenCV_INCLUDE_DIRS})
        target_link_libraries(${name} PRIVATE ncnn ${OpenCV_LIBS})
    elseif(NCNN_SIMPLEOCV)
        target_compile_definitions(${name} PUBLIC USE_NCNN_SIMPLEOCV)
        target_link_libraries(${name} PRIVATE ncnn)
    endif()

    # add test to a virtual project group
    set_property(TARGET ${name} PROPERTY FOLDER "examples")
endmacro()

# Examples are only configured when NCNN_PIXEL is enabled; otherwise a warning
# is printed and nothing is built.
if(NCNN_PIXEL)
    # Unless NCNN_SIMPLEOCV is requested, look for OpenCV, trying progressively
    # smaller component sets until one of them is found.
    if(NOT NCNN_SIMPLEOCV)
        find_package(OpenCV QUIET COMPONENTS opencv_world)
        # for opencv 2.4 on ubuntu 16.04, there is no opencv_world but OpenCV_FOUND will be TRUE
        if("${OpenCV_LIBS}" STREQUAL "")
            set(OpenCV_FOUND FALSE)
        endif()
        if(NOT OpenCV_FOUND)
            find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs videoio)
        endif()
        if(NOT OpenCV_FOUND)
            find_package(OpenCV QUIET COMPONENTS core highgui imgproc)
        endif()
    endif()

    if(OpenCV_FOUND OR NCNN_SIMPLEOCV)
        if(OpenCV_FOUND)
            # Report which OpenCV installation was picked up.
            message(STATUS "OpenCV library: ${OpenCV_INSTALL_PATH}")
            message(STATUS "    version: ${OpenCV_VERSION}")
            message(STATUS "    libraries: ${OpenCV_LIBS}")
            message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")

            # OpenCV major versions above 3 get CMAKE_CXX_STANDARD set to 11
            # for the targets declared below.
            if(${OpenCV_VERSION_MAJOR} GREATER 3)
                set(CMAKE_CXX_STANDARD 11)
            endif()
        endif()

        # Make the ncnn sources and the generated headers in the build tree
        # visible to every example.
        include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src)
        include_directories(${CMAKE_CURRENT_BINARY_DIR}/../src)
        # Examples declared for both the OpenCV and the simpleocv configuration.
        ncnn_add_example(arcface)
        ncnn_add_example(squeezenet)
        ncnn_add_example(squeezenet_c_api)
        ncnn_add_example(fasterrcnn)
        ncnn_add_example(rfcn)
        ncnn_add_example(yolov2)
        ncnn_add_example(yolov3)
        ncnn_add_example(yolov5)
        ncnn_add_example(yolov5_pnnx)
        ncnn_add_example(yolov7_pnnx)
        ncnn_add_example(yolov7)
        ncnn_add_example(yolov8)
        ncnn_add_example(yolov8_seg)
        ncnn_add_example(yolov8_pose)
        ncnn_add_example(yolov8_cls)
        ncnn_add_example(yolox)
        ncnn_add_example(yolo11)
        ncnn_add_example(yolo11_seg)
        ncnn_add_example(yolo11_pose)
        ncnn_add_example(yolo11_cls)
        ncnn_add_example(yoloworld)
        ncnn_add_example(mobilenetv2ssdlite)
        ncnn_add_example(mobilenetssd)
        ncnn_add_example(squeezenetssd)
        ncnn_add_example(shufflenetv2)
        ncnn_add_example(peleenetssd_seg)
        ncnn_add_example(simplepose)
        ncnn_add_example(retinaface)
        ncnn_add_example(yolact)
        ncnn_add_example(nanodet)
        ncnn_add_example(nanodetplus_pnnx)
        ncnn_add_example(scrfd)
        ncnn_add_example(scrfd_crowdhuman)
        ncnn_add_example(piper)
        ncnn_add_example(whisper)
        # Examples that are only declared when a real OpenCV was found.
        if(OpenCV_FOUND)
            ncnn_add_example(yolov4)
            ncnn_add_example(yolov8_obb)
            ncnn_add_example(yolo11_obb)
            ncnn_add_example(rvm)
            ncnn_add_example(p2pnet)
            ncnn_add_example(ppocrv5)
        endif()
    else()
        message(WARNING "OpenCV not found and NCNN_SIMPLEOCV disabled, examples won't be built")
    endif()
else()
    message(WARNING "NCNN_PIXEL not enabled, examples won't be built")
endif()


================================================
FILE: examples/arcface.cpp
================================================
// Copyright 2025 heabeounMKTO
// SPDX-License-Identifier: BSD-3-Clause
/* ncnn example using yolo-face and arcface to extract embeddings from a face
 *
 *
 *  the arcface model is converted from
 * https://github.com/onnx/models/tree/main/validated/vision/body_analysis/arcface
 * 1. first simplify the arcface.onnx using onnxsim
 * 2. then convert it using ncnn's onnx exporter onnx2ncnn
 *  using pnnx to convert would cause -nan output!
 *
 *  the yolov8-face model is converted from
 *  https://github.com/derronqi/yolov8-face
 *
 *
 * you can find the models preconverted at
 * https://drive.google.com/drive/folders/1P0RDzj9V7FHEL8w_-yqls5RHeVpO-2PS?usp=sharing
 *
 * */

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <algorithm>
#include <string>
#include <vector>
#include "layer.h"
#include "net.h"
#include "mat.h"

#ifndef ARCFACE_EXAMPLE_YOLO_INFER_SIZE
#define ARCFACE_EXAMPLE_YOLO_INFER_SIZE 320
#endif

/// Axis-aligned detection box stored as two corners (x1, y1)-(x2, y2) together
/// with a confidence score and an integer class label.
struct Bbox
{
    float x1, y1, x2, y2, confidence;
    int label;

    /// Creates an all-zero box with label 0.
    Bbox()
        : x1(0.0f), y1(0.0f), x2(0.0f), y2(0.0f), confidence(0.0f), label(0)
    {
    }

    /// Creates a box from its corners, score and class label.
    /// `label_name` is accepted for source compatibility but is not stored.
    Bbox(float x1,
         float y1,
         float x2,
         float y2,
         float confidence,
         int label = 0,
         std::string label_name = "")
        : x1(x1), y1(y1), x2(x2), y2(y2), confidence(confidence), label(label)
    {
    }

    /// Maps the box from padded/resized network coordinates back to
    /// `original_image` coordinates: subtracts the padding, divides by
    /// `scale_factor` and clamps the corners to the image size.
    /// The box is modified in place and a copy of the updated box is returned.
    Bbox apply_image_scale(const cv::Mat& original_image,
                           const float scale_factor,
                           const int pad_w,
                           const int pad_h)
    {
        int img_w = original_image.cols;
        int img_h = original_image.rows;

        x1 = (x1 - pad_w) / scale_factor;
        y1 = (y1 - pad_h) / scale_factor;
        x2 = (x2 - pad_w) / scale_factor;
        y2 = (y2 - pad_h) / scale_factor;

        // clamp
        x1 = std::max(0.0f, std::min(x1, (float)img_w));
        y1 = std::max(0.0f, std::min(y1, (float)img_h));
        x2 = std::max(0.0f, std::min(x2, (float)img_w));
        y2 = std::max(0.0f, std::min(y2, (float)img_h));
        return Bbox(x1, y1, x2, y2, confidence, label);
    }

    /// Returns the class name for this box's label, or an empty string when
    /// the label does not index into `classes`.
    std::string get_label_name(const std::vector<std::string>& classes)
    {
        if (label < 0 || (size_t)label >= classes.size())
        {
            return std::string();
        }
        return classes[label];
    }

    /// Returns (x2 - x1) * (y2 - y1).
    float area() const
    {
        float width = x2 - x1;
        float height = y2 - y1;
        return width * height;
    }

    /// Returns a deep copy of the part of `originalImage` covered by the box,
    /// or an empty cv::Mat when the box has no usable area inside the image.
    cv::Mat crop_bbox(const cv::Mat& originalImage) const
    {
        // Calculate width and height
        int bbox_width = static_cast<int>(x2 - x1);
        int bbox_height = static_cast<int>(y2 - y1);

        // Ensure valid dimensions
        if (bbox_width <= 0 || bbox_height <= 0)
        {
            fprintf(stderr, "Invalid bounding box dimensions\n");
            return cv::Mat();
        }

        int x1_int = static_cast<int>(x1);
        int y1_int = static_cast<int>(y1);
        int x2_int = static_cast<int>(x2);
        int y2_int = static_cast<int>(y2);

        // Clamp to image bounds
        x1_int = std::max(0, x1_int);
        y1_int = std::max(0, y1_int);
        x2_int = std::min(originalImage.cols, x2_int);
        y2_int = std::min(originalImage.rows, y2_int);

        // Clamping can collapse the box (e.g. a box lying entirely outside the
        // image), so the size has to be re-validated before building the ROI.
        const int roi_width = x2_int - x1_int;
        const int roi_height = y2_int - y1_int;
        if (roi_width <= 0 || roi_height <= 0)
        {
            fprintf(stderr, "Invalid bounding box dimensions\n");
            return cv::Mat();
        }

        // Create ROI and return cropped image
        cv::Rect roi(x1_int, y1_int, roi_width, roi_height);
        return originalImage(roi).clone();
    }

    /// Returns the box as a rectangle with integer-truncated origin and size,
    /// or a zero rectangle when the truncated width or height is not positive.
    cv::Rect_<float> get_rect() const
    {
        int x1_int = static_cast<int>(x1);
        int y1_int = static_cast<int>(y1);
        int width = static_cast<int>(x2 - x1);
        int height = static_cast<int>(y2 - y1);

        // Ensure valid dimensions
        if (width <= 0 || height <= 0)
        {
            return cv::Rect(0, 0, 0, 0); // Return invalid rect
        }

        return cv::Rect(x1_int, y1_int, width, height);
    }
};

/// Writes a one-line, human-readable description of `bbox` to stdout.
static void print_bbox(Bbox& bbox)
{
    const Bbox& b = bbox;
    fprintf(stdout,
            "Bbox(x1=%.2f, y1=%.2f, x2=%.2f, y2=%.2f, conf=%.4f, label=%d)\n",
            b.x1,
            b.y1,
            b.x2,
            b.y2,
            b.confidence,
            b.label);
}

/// Recursive quicksort of faceobjects[left..right] (both inclusive) into
/// descending order of `confidence`, using the middle element as pivot.
static void qsort_descent_inplace(std::vector<Bbox>& faceobjects, int left, int right)
{
    int lo = left;
    int hi = right;
    const float pivot = faceobjects[(left + right) / 2].confidence;

    // Partition: larger confidences move to the front, smaller to the back.
    while (lo <= hi)
    {
        while (faceobjects[lo].confidence > pivot)
            ++lo;

        while (faceobjects[hi].confidence < pivot)
            --hi;

        if (lo > hi)
            break;

        std::swap(faceobjects[lo], faceobjects[hi]);
        ++lo;
        --hi;
    }

    // Sort the two partitions; they are processed one after the other here.
    if (left < hi)
        qsort_descent_inplace(faceobjects, left, hi);

    if (lo < right)
        qsort_descent_inplace(faceobjects, lo, right);
}

/// Sorts all of `faceobjects` by descending confidence; an empty vector is
/// left untouched.
static void qsort_descent_inplace(std::vector<Bbox>& faceobjects)
{
    if (!faceobjects.empty())
    {
        qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
    }
}

/// Intersection-over-union of two boxes. Returns 0 when the boxes do not
/// overlap (touching edges count as no overlap).
float calculate_iou(const Bbox& box1, const Bbox& box2)
{
    // Corners of the overlapping region.
    const float left = std::max(box1.x1, box2.x1);
    const float top = std::max(box1.y1, box2.y1);
    const float right = std::min(box1.x2, box2.x2);
    const float bottom = std::min(box1.y2, box2.y2);

    if (right <= left || bottom <= top)
    {
        return 0.0f; // no intersect
    }

    const float overlap = (right - left) * (bottom - top);
    const float area1 = (box1.x2 - box1.x1) * (box1.y2 - box1.y1);
    const float area2 = (box2.x2 - box2.x1) * (box2.y2 - box2.y1);
    const float combined = area1 + area2 - overlap;

    return overlap / combined;
}

static std::vector<int>
non_maximum_supression(const std::vector<Bbox>& bbox, float iou_thresh, bool class_agnostic = false)
{
    std::vector<int> picked;
    const int n = bbox.size();
    if (n == 0) return picked;

    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = bbox[i].area();
    }

    for (int i = 0; i < n; i++)
    {
        const Bbox& a = bbox[i];
        bool keep = true;

        for (int j : picked)
        {
            const Bbox& b = bbox[j];

            // Enhanced class comparison logic using labels
            if (!class_agnostic)
            {
                if (a.label != b.label)
                {
                    continue; // Different classes, don't suppress
                }
            }

            float iou = calculate_iou(a, b);
            if (iou > iou_thresh)
            {
                keep = false;
                break;
            }
        }

        if (keep)
        {
            picked.push_back(i);
        }
    }

    return picked;
}

/// Computes the aspect-preserving scale that fits a (w0, h0) image inside a
/// (w1, h1) target.
///
/// Returns three values: { scale, round(w0 * scale), round(h0 * scale) }.
static std::vector<float> scale_wh(float w0, float h0, float w1, float h1)
{
    const float ratio_w = w1 / w0;
    const float ratio_h = h1 / h0;
    const float r = std::min(ratio_w, ratio_h);

    std::vector<float> fitted;
    fitted.reserve(3);
    fitted.push_back(r);
    fitted.push_back((float)std::round(w0 * r));
    fitted.push_back((float)std::round(h0 * r));
    return fitted;
}

/// Bundle returned by the preprocessing step: the network input blob plus the
/// scale and padding values that were used to produce it.
struct ImagePreProcessResults
{
    ncnn::Mat result; // preprocessed input blob
    float img_scale;  // resize factor applied to the source image
    float pad_w;      // padding value stored for the horizontal axis
    float pad_h;      // padding value stored for the vertical axis

    ImagePreProcessResults(ncnn::Mat result, float img_scale, float pad_w, float pad_h)
        : result(result),
          img_scale(img_scale),
          pad_w(pad_w),
          pad_h(pad_h)
    {
    }
};

/// Output of face detection: `bboxes[i]` and `keypoints[i]` are filled in
/// together, one entry per detection.
struct DetectionResult
{
    // detected boxes
    std::vector<Bbox> bboxes;
    // one flat float list of keypoint values per detection
    std::vector<std::vector<float> > keypoints;
};

/// Letterboxes `input_image` into an `infer_size` x `infer_size` network input.
///
/// The image is resized with its aspect ratio preserved (BGR to RGB), padded
/// with the constant 114 on all sides so that the content is centred, and then
/// scaled by 1/255 per channel. The returned bundle carries the blob together
/// with the scale factor and the left/top padding.
static ImagePreProcessResults preprocess_yolo_kpts(cv::Mat& input_image, int infer_size) noexcept
{
    const int src_w = input_image.cols;
    const int src_h = input_image.rows;

    // fit = { scale, resized width, resized height }
    const std::vector<float> fit = scale_wh(src_w, src_h, (float)infer_size, (float)infer_size);
    const float scale_factor = fit[0];
    const float new_w = fit[1];
    const float new_h = fit[2];

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(input_image.data,
                   ncnn::Mat::PIXEL_BGR2RGB,
                   src_w, src_h,
                   static_cast<int>(new_w), static_cast<int>(new_h));

    // padding calculation: half on the left/top, the remainder on the right/bottom
    const int pad_w = static_cast<int>((infer_size - new_w) / 2);
    const int pad_h = static_cast<int>((infer_size - new_h) / 2);
    const int pad_bottom = static_cast<int>(infer_size - new_h - pad_h);
    const int pad_right = static_cast<int>(infer_size - new_w - pad_w);

    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, pad_h, pad_bottom, pad_w, pad_right,
                           ncnn::BORDER_CONSTANT, 114.f);

    const float mean_vals[3] = {0.f, 0.f, 0.f};
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(mean_vals, norm_vals);

    return ImagePreProcessResults(in_pad, scale_factor, pad_w, pad_h);
}

/// Decodes the raw face-detector output into boxes plus 5 keypoints per box.
///
/// `result` is transposed so that each candidate becomes one row, which is
/// read as:
///   [cx, cy, w, h, class_scores[num_classes],
///    kp1_x, kp1_y, kp1_conf, ..., kp5_x, kp5_y, kp5_conf]
/// Candidates whose best class score is below `confidence_threshold` are
/// dropped. Boxes and keypoints are mapped back to `original_image`
/// coordinates using `preproc_img`, keypoint confidences go through a sigmoid,
/// and class-aware NMS with `iou_threshold` selects the final detections.
///
/// Returns the surviving boxes with their matching keypoints, in descending
/// score order. An empty result is returned when `class_names` is empty or the
/// output rows are too short for the layout above.
static DetectionResult parse_yolo_keypoints_results(ncnn::Mat& result,
        cv::Mat& original_image,
        ImagePreProcessResults& preproc_img,
        float confidence_threshold,
        float iou_threshold,
        std::vector<std::string> class_names)
{
    DetectionResult res;

    const int num_classes = (int)class_names.size();
    const int kp_stride = 3;
    const int num_keypoints = 5;
    const int row_len = 4 + num_classes + num_keypoints * kp_stride;

    // Every row is read through raw pointers below, so refuse outputs that
    // are too short for the expected layout instead of reading out of bounds.
    if (num_classes <= 0 || (int)result.h < row_len)
    {
        fprintf(stderr, "unexpected detector output shape\n");
        return res;
    }

    // Transpose: one row per candidate, one column per attribute.
    cv::Mat output((int)result.w, (int)result.h, CV_32FC1);
    for (int i = 0; i < output.cols; i++)
    {
        for (int j = 0; j < output.rows; j++)
        {
            output.ptr<float>(j)[i] = result.row(i)[j];
        }
    }

    std::vector<Bbox> detections;
    std::vector<std::vector<float> > all_keypoints;

    for (int i = 0; i < output.rows; i++)
    {
        const float* row_ptr = output.ptr<float>(i);
        const float* bboxes_ptr = row_ptr;
        const float* classes_ptr = row_ptr + 4;
        const float* max_s_ptr = std::max_element(classes_ptr, classes_ptr + num_classes);

        float score = *max_s_ptr;
        int class_id = (int)(max_s_ptr - classes_ptr);

        if (score < confidence_threshold)
        {
            continue;
        }

        // centre/size -> corner representation
        float x = bboxes_ptr[0];
        float y = bboxes_ptr[1];
        float w = bboxes_ptr[2];
        float h = bboxes_ptr[3];
        float x1 = x - w / 2.0f;
        float y1 = y - h / 2.0f;
        float x2 = x + w / 2.0f;
        float y2 = y + h / 2.0f;

        if (!(x2 > x1 && y2 > y1))
        {
            continue;
        }

        Bbox bbox = Bbox(x1, y1, x2, y2, score, class_id)
                    .apply_image_scale(original_image, preproc_img.img_scale,
                                       preproc_img.pad_w, preproc_img.pad_h);

        // Parse exactly 5 keypoints for this face model
        std::vector<float> face_keypoints;
        face_keypoints.reserve(num_keypoints * kp_stride);
        const float* kp_ptr = row_ptr + 4 + num_classes;
        float scale = 1.0f / preproc_img.img_scale;

        for (int k = 0; k < num_keypoints; k++)
        {
            float kp_x = kp_ptr[k * kp_stride];
            float kp_y = kp_ptr[k * kp_stride + 1];
            float kp_conf_raw = kp_ptr[k * kp_stride + 2];

            // Apply sigmoid to convert logit to probability
            float kp_conf = 1.0f / (1.0f + expf(-kp_conf_raw));

            // Scale keypoints to original
            kp_x = (kp_x - preproc_img.pad_w) * scale;
            kp_y = (kp_y - preproc_img.pad_h) * scale;

            face_keypoints.push_back(kp_x);
            face_keypoints.push_back(kp_y);
            face_keypoints.push_back(kp_conf);
        }

        detections.push_back(bbox);
        all_keypoints.push_back(face_keypoints);
    }

    // Sort an index permutation rather than the boxes themselves, so that each
    // box stays paired with its own keypoints after ordering by score.
    std::vector<int> order(detections.size());
    for (size_t i = 0; i < order.size(); i++)
    {
        order[i] = (int)i;
    }
    std::stable_sort(order.begin(), order.end(), [&detections](int a, int b) {
        return detections[a].confidence > detections[b].confidence;
    });

    std::vector<Bbox> sorted_detections;
    std::vector<std::vector<float> > sorted_keypoints;
    sorted_detections.reserve(order.size());
    sorted_keypoints.reserve(order.size());
    for (size_t i = 0; i < order.size(); i++)
    {
        sorted_detections.push_back(detections[order[i]]);
        sorted_keypoints.push_back(all_keypoints[order[i]]);
    }

    // nms
    std::vector<int> picked = non_maximum_supression(sorted_detections, iou_threshold, false);
    for (size_t i = 0; i < picked.size(); i++)
    {
        int idx = picked[i];
        res.bboxes.push_back(sorted_detections[idx]);
        res.keypoints.push_back(sorted_keypoints[idx]);
    }

    return res;
}

/// Dot product of two feature vectors.
///
/// For L2-normalized embeddings this is their cosine similarity. When the
/// vectors differ in length only the common leading part is used, so the
/// shorter vector is never read out of bounds. Two empty vectors give 0.
static inline float get_similarity(const std::vector<float>& f1, const std::vector<float>& f2)
{
    const size_t n = f1.size() < f2.size() ? f1.size() : f2.size();

    float sim = 0.0f;
    for (size_t i = 0; i < n; i++)
    {
        sim += f1[i] * f2[i];
    }
    return sim;
}

// Ported from insightface's face alignment helper:
// https://github.com/deepinsight/insightface/blob/master/python-package/insightface/utils/face_align.py
//
// Fills `transform_matrix` (6 floats) with the affine transform computed by
// ncnn::get_affine_transform between the scaled ArcFace reference landmarks
// and the 5 landmarks in `lmk`. `lmk` is read with a stride of 3 floats per
// landmark, using only the first two of each triple.
// Returns 0 on success, -1 when `image_size` is neither a multiple of 112 nor
// a multiple of 128.
static int estimate_norm(float* transform_matrix, const float* lmk, int image_size = 112)
{
    const bool fits_112 = (image_size % 112 == 0);
    const bool fits_128 = (image_size % 128 == 0);
    if (!fits_112 && !fits_128)
    {
        return -1;
    }

    // The 112 layout is preferred; the 128 layout shifts the template by 8 px in x.
    const float base = fits_112 ? 112.0f : 128.0f;
    const float ratio = static_cast<float>(image_size) / base;
    const float diff_x = fits_112 ? 0.0f : 8.0f * ratio;

    const float ARCFACE_DST[10] = {
        38.2946f, 51.6963f, // left eye
        73.5318f, 51.5014f, // right eye
        56.0252f, 71.7366f, // nose
        41.5493f, 92.3655f, // left mouth
        70.7299f, 92.2041f  // right mouth
    };

    float src_points[10];
    float dst_points[10];
    for (int p = 0; p < 5; ++p)
    {
        src_points[2 * p] = lmk[3 * p];
        src_points[2 * p + 1] = lmk[3 * p + 1];

        dst_points[2 * p] = ARCFACE_DST[2 * p] * ratio + diff_x;
        dst_points[2 * p + 1] = ARCFACE_DST[2 * p + 1] * ratio;
    }

    ncnn::get_affine_transform(dst_points, src_points, 5, transform_matrix);

    return 0;
}

/// Produces an `image_size` x `image_size` 3-channel crop of `input` by warping
/// it with the transform that estimate_norm() derives from `lmk`.
/// Returns 0 on success, or the non-zero status from estimate_norm(), in which
/// case `output` is left unchanged.
static int norm_crop(cv::Mat& output, const cv::Mat& input, const float* lmk, int image_size = 112)
{
    float tm[6];

    const int rc = estimate_norm(tm, lmk, image_size);
    if (rc != 0)
        return rc;

    output = cv::Mat(image_size, image_size, CV_8UC3);
    ncnn::warpaffine_bilinear_c3(input.data, input.cols, input.rows,
                                 output.data, image_size, image_size,
                                 tm);
    return 0;
}

/// Scales `feature` in place to unit L2 norm. Empty vectors and vectors whose
/// norm is exactly zero are left unchanged.
void normalize_arcface(std::vector<float>& feature)
{
    if (feature.empty())
        return;

    float norm = 0;
    for (size_t i = 0; i < feature.size(); i++)
    {
        norm += feature[i] * feature[i];
    }
    norm = sqrt(norm);

    if (norm == 0.0f)
        return;

    for (size_t i = 0; i < feature.size(); i++)
    {
        feature[i] /= norm;
    }
}

/// Runs the yolov8-face detector on `rgb` and stores boxes and keypoints in
/// `result`.
///
/// The model files "yolov8-face.param" / "yolov8-face.bin" are loaded from the
/// current directory on every call. The pixel data is converted with
/// PIXEL_BGR2RGB during preprocessing.
///
/// Returns 0 when at least one face was found, the ncnn status code when
/// loading or inference fails, and -1 when no face was detected.
static int get_face(const cv::Mat& rgb, DetectionResult& result)
{
    ncnn::Net yoloface;
    yoloface.opt.use_vulkan_compute = true;

    int status = yoloface.load_param("yolov8-face.param");
    if (status != 0)
    {
        fprintf(stderr, "couldn't load params\n");
        return status;
    }

    status = yoloface.load_model("yolov8-face.bin");
    if (status != 0)
    {
        fprintf(stderr, "couldn't load model\n");
        return status;
    }

    // preprocess_yolo_kpts takes a mutable reference, so work on a copy.
    cv::Mat input_image = rgb.clone();
    ImagePreProcessResults preproc_img = preprocess_yolo_kpts(input_image, ARCFACE_EXAMPLE_YOLO_INFER_SIZE);

    ncnn::Extractor ex = yoloface.create_extractor();
    status = ex.input("in0", preproc_img.result);
    if (status != 0)
    {
        fprintf(stderr, "couldn't set detector input\n");
        return status;
    }

    ncnn::Mat out;
    status = ex.extract("out0", out);
    if (status != 0)
    {
        fprintf(stderr, "couldn't extract detector output\n");
        return status;
    }

    std::vector<std::string> class_names = {"face"};
    result = parse_yolo_keypoints_results(out, input_image, preproc_img, 0.5f, 0.4f, class_names);
    if (result.bboxes.size() < 1)
    {
        fprintf(stderr, "no faces are found!\n");
        return -1;
    }
    return 0;
}

/// Computes the L2-normalized 512-value ArcFace embedding of an aligned face
/// crop.
///
/// `rgb` must be a non-empty CV_8UC3 image; it is resized to 112x112 and
/// converted with PIXEL_BGR2RGB. The model files "arcfaceresnet.param" /
/// "arcfaceresnet.bin" are loaded from the current directory on every call.
/// `result` is resized to 512 entries and overwritten.
///
/// Returns 0 on success, the ncnn status code when loading or inference fails,
/// and -1 for an invalid input image or an unexpectedly small network output.
static int get_embedding(const cv::Mat& rgb, std::vector<float>& result)
{
    const int embedding_dim = 512;

    // Validate the input before paying for the model load.
    if (rgb.empty() || rgb.type() != CV_8UC3)
    {
        fprintf(stderr, "invalid input image!\n");
        return -1;
    }

    ncnn::Net arcface;
    arcface.opt.use_vulkan_compute = true;
    int status = arcface.load_param("arcfaceresnet.param");
    if (status != 0)
    {
        fprintf(stderr, "couldn't load arcface params\n");
        return status;
    }
    status = arcface.load_model("arcfaceresnet.bin");
    if (status != 0)
    {
        fprintf(stderr, "couldn't load arcface model\n");
        return status;
    }

    /*
    * the arcface model provided in the link has builtin normalization layers,
    * no need to run substract_mean_normalize
    *
    *  reference from .param
    BinaryOp         _minusscalar0            2 1 data scalar_op2 _minusscalar0 0=1
    BinaryOp         _mulscalar0              2 1 _minusscalar0 scalar_op3 _mulscalar0 0=2
    * */
    ncnn::Mat in = ncnn::Mat::from_pixels_resize(
                       rgb.data,
                       ncnn::Mat::PIXEL_BGR2RGB,
                       rgb.cols,
                       rgb.rows,
                       112,
                       112);

    ncnn::Extractor ex = arcface.create_extractor();
    status = ex.input("data", in);
    if (status != 0)
    {
        fprintf(stderr, "couldn't set arcface input\n");
        return status;
    }

    ncnn::Mat out;
    status = ex.extract("fc1", out);
    if (status != 0)
    {
        fprintf(stderr, "couldn't extract arcface output\n");
        return status;
    }

    // A failed or differently shaped output must not be read past its end.
    if (out.data == 0 || out.w * out.h * out.c < embedding_dim)
    {
        fprintf(stderr, "unexpected arcface output size\n");
        return -1;
    }

    // Do not rely on the caller having pre-sized the destination.
    result.resize(embedding_dim);
    const float* ptr = (const float*)out.data;
    for (int i = 0; i < embedding_dim; i++)
    {
        result[i] = ptr[i];
    }
    normalize_arcface(result);
    return 0;
}

/// Compares two face images.
///
/// For each image: detect faces, align the first detected face to a 112x112
/// crop using its keypoints, and compute its embedding. The dot product of the
/// two normalized embeddings is printed as the similarity.
///
/// Usage: <program> <face1_path> <face2_path>
/// Returns 0 on success and -1 on any failure.
int main(int argc, char** argv)
{
    if (argc != 3)
    {
        fprintf(stderr, "Usage: %s <face1_path> <face2_path>\n", argv[0]);
        return -1;
    }

    const char* face1_path = argv[1];
    const char* face2_path = argv[2];

    int status = 0;
    cv::Mat face_img1 = cv::imread(face1_path);
    cv::Mat face_img2 = cv::imread(face2_path);

    if (face_img1.empty())
    {
        fprintf(stderr, "Failed to load image: %s\n", face1_path);
        return -1;
    }
    if (face_img2.empty())
    {
        fprintf(stderr, "Failed to load image: %s\n", face2_path);
        return -1;
    }

    cv::Mat input_embed1, input_embed2;
    DetectionResult res1, res2;
    std::vector<float> embedding1(512), embedding2(512);

    // --- detection ---
    status = get_face(face_img1, res1);
    if (status != 0)
    {
        fprintf(stderr, "get face failed for %s!\n", face1_path);
        return -1;
    }
    fprintf(stdout, "found faces in face1: %d\n", (int)res1.bboxes.size());
    for (size_t i = 0; i < res1.bboxes.size(); i++)
    {
        print_bbox(res1.bboxes[i]);
    }

    status = get_face(face_img2, res2);
    if (status != 0)
    {
        fprintf(stderr, "get face failed for %s!\n", face2_path);
        return -1;
    }
    fprintf(stdout, "found faces in face2: %d\n", (int)res2.bboxes.size());
    for (size_t i = 0; i < res2.bboxes.size(); i++)
    {
        print_bbox(res2.bboxes[i]);
    }

    // --- alignment + embedding, first detected face of each image ---
    status = norm_crop(input_embed1, face_img1, res1.keypoints[0].data());
    if (status != 0)
    {
        // Without this check a failed alignment would feed an empty crop
        // into get_embedding and be reported as an embedding failure.
        fprintf(stderr, "norm_crop failed for face1!\n");
        return -1;
    }
    status = get_embedding(input_embed1, embedding1);
    if (status != 0)
    {
        fprintf(stderr, "get embedding failed for %s!\n", face1_path);
        return -1;
    }

    status = norm_crop(input_embed2, face_img2, res2.keypoints[0].data());
    if (status != 0)
    {
        fprintf(stderr, "norm_crop failed for face2!\n");
        return -1;
    }
    status = get_embedding(input_embed2, embedding2);
    if (status != 0)
    {
        fprintf(stderr, "get embedding failed for %s!\n", face2_path);
        return -1;
    }

    float similarity = get_similarity(embedding1, embedding2);
    fprintf(stdout, "Similarity: %f\n", similarity);
    return 0;
}


================================================
FILE: examples/fasterrcnn.cpp
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#include <math.h>
#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdio.h>

// One detection: a rectangle, a class label index and a score.
struct Object
{
    // rectangle in floating-point pixel coordinates
    cv::Rect_<float> rect;
    // class index
    int label;
    // score used for ordering and thresholding
    float prob;
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Recursive quicksort of objects[left..right] (both inclusive) into descending
// order of prob, using the middle element as pivot. The two partitions are
// handed to separate OpenMP sections.
static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
{
    int lo = left;
    int hi = right;
    const float pivot = objects[(left + right) / 2].prob;

    // Partition: larger scores move to the front, smaller to the back.
    while (lo <= hi)
    {
        while (objects[lo].prob > pivot)
            ++lo;

        while (objects[hi].prob < pivot)
            --hi;

        if (lo > hi)
            break;

        std::swap(objects[lo], objects[hi]);
        ++lo;
        --hi;
    }

    #pragma omp parallel sections
    {
        #pragma omp section
        {
            if (left < hi)
                qsort_descent_inplace(objects, left, hi);
        }
        #pragma omp section
        {
            if (lo < right)
                qsort_descent_inplace(objects, lo, right);
        }
    }
}

// Sorts all of objects by descending prob; an empty vector is left untouched.
static void qsort_descent_inplace(std::vector<Object>& objects)
{
    if (!objects.empty())
    {
        qsort_descent_inplace(objects, 0, objects.size() - 1);
    }
}

// Greedy non-maximum suppression over boxes visited in the given order.
// picked is cleared and then filled with the indices of the boxes that are
// kept: a box is dropped when its IoU with any already kept box exceeds
// nms_threshold. Unless agnostic is set, only boxes with the same label are
// compared.
static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int count = faceobjects.size();

    // Cache every rectangle area once.
    std::vector<float> areas;
    areas.reserve(count);
    for (int idx = 0; idx < count; idx++)
    {
        areas.push_back(faceobjects[idx].rect.area());
    }

    for (int cur = 0; cur < count; cur++)
    {
        const Object& a = faceobjects[cur];

        bool suppressed = false;
        for (size_t k = 0; k < picked.size(); k++)
        {
            const int prev = picked[k];
            const Object& b = faceobjects[prev];

            if (agnostic || a.label == b.label)
            {
                // intersection over union
                const float inter_area = intersection_area(a, b);
                const float union_area = areas[cur] + areas[prev] - inter_area;
                if (inter_area / union_area > nms_threshold)
                    suppressed = true;
            }
        }

        if (!suppressed)
            picked.push_back(cur);
    }
}

// Runs the ZF Faster R-CNN model on bgr and fills objects with the detections.
//
// The network is evaluated in two stages: one extractor pass produces the
// shared feature map and the list of rois, then one extractor pass per roi
// produces class probabilities and box regression deltas. Candidates are
// grouped per class, sorted, filtered with NMS, merged, sorted again and
// truncated to max_per_image entries.
//
// The process exits with -1 when the model files cannot be loaded; otherwise
// the function returns 0.
static int detect_fasterrcnn(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net fasterrcnn;

    fasterrcnn.opt.use_vulkan_compute = true;

    // original pretrained model from https://github.com/rbgirshick/py-faster-rcnn
    // py-faster-rcnn/models/pascal_voc/ZF/faster_rcnn_alt_opt/faster_rcnn_test.pt
    // https://dl.dropboxusercontent.com/s/o6ii098bu51d139/faster_rcnn_models.tgz?dl=0
    // ZF_faster_rcnn_final.caffemodel
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    if (fasterrcnn.load_param("ZF_faster_rcnn_final.param"))
        exit(-1);
    if (fasterrcnn.load_model("ZF_faster_rcnn_final.bin"))
        exit(-1);

    // hyper parameters taken from
    // py-faster-rcnn/lib/fast_rcnn/config.py
    // py-faster-rcnn/lib/fast_rcnn/test.py
    const int target_size = 600; // __C.TEST.SCALES

    const int max_per_image = 100;
    const float confidence_thresh = 0.05f;

    const float nms_threshold = 0.3f; // __C.TEST.NMS

    // scale to target detect size
    // the shorter image side becomes target_size, the other side is scaled by
    // the same factor (and truncated to int)
    int w = bgr.cols;
    int h = bgr.rows;
    float scale = 1.f;
    if (w < h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, w, h);

    // subtract per-channel means only; no normalization factors are passed
    const float mean_vals[3] = {102.9801f, 115.9465f, 122.7717f};
    in.substract_mean_normalize(mean_vals, 0);

    // second network input: resized height, resized width and the scale factor
    ncnn::Mat im_info(3);
    im_info[0] = h;
    im_info[1] = w;
    im_info[2] = scale;

    // step1, extract feature and all rois
    ncnn::Extractor ex1 = fasterrcnn.create_extractor();

    ex1.input("data", in);
    ex1.input("im_info", im_info);

    ncnn::Mat conv5_relu5; // feature
    ncnn::Mat rois;        // all rois
    ex1.extract("conv5_relu5", conv5_relu5);
    ex1.extract("rois", rois);

    // step2, extract bbox and score for each roi
    // class_candidates[label] collects the boxes predicted for that class
    std::vector<std::vector<Object> > class_candidates;
    for (int i = 0; i < rois.c; i++)
    {
        // a fresh extractor per roi, fed with the shared feature map
        ncnn::Extractor ex2 = fasterrcnn.create_extractor();

        ncnn::Mat roi = rois.channel(i); // get single roi
        ex2.input("conv5_relu5", conv5_relu5);
        ex2.input("rois", roi);

        ncnn::Mat bbox_pred;
        ncnn::Mat cls_prob;
        ex2.extract("bbox_pred", bbox_pred);
        ex2.extract("cls_prob", cls_prob);

        int num_class = cls_prob.w;
        class_candidates.resize(num_class);

        // find class id with highest score
        // (the inner loop variable shadows the outer roi index i)
        int label = 0;
        float score = 0.f;
        for (int i = 0; i < num_class; i++)
        {
            float class_score = cls_prob[i];
            if (class_score > score)
            {
                label = i;
                score = class_score;
            }
        }

        // ignore background or low score
        if (label == 0 || score <= confidence_thresh)
            continue;

        //         fprintf(stderr, "%d = %f\n", label, score);

        // unscale to image size
        float x1 = roi[0] / scale;
        float y1 = roi[1] / scale;
        float x2 = roi[2] / scale;
        float y2 = roi[3] / scale;

        // proposal width/height, counting both end pixels
        float pb_w = x2 - x1 + 1;
        float pb_h = y2 - y1 + 1;

        // apply bbox regression
        // four deltas per class, stored consecutively starting at label * 4
        float dx = bbox_pred[label * 4];
        float dy = bbox_pred[label * 4 + 1];
        float dw = bbox_pred[label * 4 + 2];
        float dh = bbox_pred[label * 4 + 3];

        float cx = x1 + pb_w * 0.5f;
        float cy = y1 + pb_h * 0.5f;

        // centre is shifted proportionally to the proposal size,
        // size is scaled by exp(delta)
        float obj_cx = cx + pb_w * dx;
        float obj_cy = cy + pb_h * dy;

        float obj_w = pb_w * exp(dw);
        float obj_h = pb_h * exp(dh);

        float obj_x1 = obj_cx - obj_w * 0.5f;
        float obj_y1 = obj_cy - obj_h * 0.5f;
        float obj_x2 = obj_cx + obj_w * 0.5f;
        float obj_y2 = obj_cy + obj_h * 0.5f;

        // clip
        obj_x1 = std::max(std::min(obj_x1, (float)(bgr.cols - 1)), 0.f);
        obj_y1 = std::max(std::min(obj_y1, (float)(bgr.rows - 1)), 0.f);
        obj_x2 = std::max(std::min(obj_x2, (float)(bgr.cols - 1)), 0.f);
        obj_y2 = std::max(std::min(obj_y2, (float)(bgr.rows - 1)), 0.f);

        // append object
        Object obj;
        obj.rect = cv::Rect_<float>(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1);
        obj.label = label;
        obj.prob = score;

        class_candidates[label].push_back(obj);
    }

    // post process
    // per class: sort by score, run NMS, keep the survivors
    objects.clear();
    for (int i = 0; i < (int)class_candidates.size(); i++)
    {
        std::vector<Object>& candidates = class_candidates[i];

        qsort_descent_inplace(candidates);

        std::vector<int> picked;
        nms_sorted_bboxes(candidates, picked, nms_threshold);

        for (int j = 0; j < (int)picked.size(); j++)
        {
            int z = picked[j];
            objects.push_back(candidates[z]);
        }
    }

    // global ordering across classes, best score first
    qsort_descent_inplace(objects);

    // keep at most max_per_image detections
    if (max_per_image > 0 && max_per_image < objects.size())
    {
        objects.resize(max_per_image);
    }

    return 0;
}

// Log every detection to stderr, draw its box and a "<class> <score>%" caption
// on a copy of bgr, then show the annotated copy until a key is pressed.
// A label that falls outside the class_names table is captioned "unknown"
// instead of being used to index past the end of the table.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {"background",
                                        "aeroplane", "bicycle", "bird", "boat",
                                        "bottle", "bus", "car", "cat", "chair",
                                        "cow", "diningtable", "dog", "horse",
                                        "motorbike", "person", "pottedplant",
                                        "sheep", "sofa", "train", "tvmonitor"
                                       };
    const int num_class_names = (int)(sizeof(class_names) / sizeof(class_names[0]));

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));

        // the label is supplied by the caller; never index the table blindly
        const char* class_name = "unknown";
        if (obj.label >= 0 && obj.label < num_class_names)
            class_name = class_names[obj.label];

        char text[256];
        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // caption goes just above the box, pulled back inside the top and right edges
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        // filled white background behind the black caption text
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_fasterrcnn(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/mobilenetssd.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdio.h>
#include <vector>

// One detection produced by detect_mobilenet and consumed by draw_objects.
struct Object
{
    cv::Rect_<float> rect; // bounding box (x, y, width, height)
    int label;             // class index; draw_objects uses it to index class_names
    float prob;            // detection score; draw_objects prints it multiplied by 100 as a percentage
};

// Run the MobileNet-SSD VOC model on bgr and fill objects with one entry per
// row of the "detection_out" blob. Always returns 0; the process exits with
// status -1 when the param or model file cannot be loaded.
static int detect_mobilenet(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net net;

    net.opt.use_vulkan_compute = true;

    // model is converted from https://github.com/chuanqi305/MobileNet-SSD
    // and can be downloaded from https://drive.google.com/open?id=0ByaKLD9QaPtucWk0Y0dha1VVY0U
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    if (net.load_param("mobilenet_ssd_voc_ncnn.param") != 0)
        exit(-1);
    if (net.load_model("mobilenet_ssd_voc_ncnn.bin") != 0)
        exit(-1);

    const int target_size = 300;

    const int src_w = bgr.cols;
    const int src_h = bgr.rows;

    // the whole image is resized to target_size x target_size
    ncnn::Mat input = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, src_w, src_h, target_size, target_size);

    const float mean_vals[3] = {127.5f, 127.5f, 127.5f};
    const float norm_vals[3] = {1.0 / 127.5, 1.0 / 127.5, 1.0 / 127.5};
    input.substract_mean_normalize(mean_vals, norm_vals);

    ncnn::Extractor extractor = net.create_extractor();
    extractor.input("data", input);

    ncnn::Mat detections;
    extractor.extract("detection_out", detections);

    //     printf("%d %d %d\n", detections.w, detections.h, detections.c);
    objects.clear();
    for (int r = 0; r < detections.h; r++)
    {
        // row fields as read here: [0] label, [1] score, [2..5] corner
        // coordinates that get multiplied by the source image width/height
        const float* det = detections.row(r);

        const float left = det[2] * src_w;
        const float top = det[3] * src_h;

        Object found;
        found.label = static_cast<int>(det[0]);
        found.prob = det[1];
        found.rect.x = left;
        found.rect.y = top;
        found.rect.width = det[4] * src_w - left;
        found.rect.height = det[5] * src_h - top;

        objects.push_back(found);
    }

    return 0;
}

// Log every detection to stderr, draw its box and a "<class> <score>%" caption
// on a copy of bgr, then show the annotated copy until a key is pressed.
// A label that falls outside the class_names table is captioned "unknown"
// instead of being used to index past the end of the table.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {"background",
                                        "aeroplane", "bicycle", "bird", "boat",
                                        "bottle", "bus", "car", "cat", "chair",
                                        "cow", "diningtable", "dog", "horse",
                                        "motorbike", "person", "pottedplant",
                                        "sheep", "sofa", "train", "tvmonitor"
                                       };
    const int num_class_names = (int)(sizeof(class_names) / sizeof(class_names[0]));

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));

        // the label is copied from the network output; never index the table blindly
        const char* class_name = "unknown";
        if (obj.label >= 0 && obj.label < num_class_names)
            class_name = class_names[obj.label];

        char text[256];
        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // caption goes just above the box, pulled back inside the top and right edges
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        // filled white background behind the black caption text
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_mobilenet(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/mobilenetv2ssdlite.cpp
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdio.h>
#include <vector>

// Layer subclass that adds nothing on top of ncnn::Layer.
// detect_mobilenetv2 registers it under the custom layer name "Silence".
class Noop : public ncnn::Layer
{
};
// Supplies Noop_layer_creator, the creator passed to register_custom_layer.
DEFINE_LAYER_CREATOR(Noop)

// One detection produced by detect_mobilenetv2 and consumed by draw_objects.
struct Object
{
    cv::Rect_<float> rect; // bounding box (x, y, width, height)
    int label;             // class index; draw_objects uses it to index class_names
    float prob;            // detection score; draw_objects prints it multiplied by 100 as a percentage
};

// Run the MobileNetV2-SSDLite VOC model on bgr and fill objects with one entry
// per row of the "detection_out" blob. Always returns 0; the process exits
// with status -1 when the param or model file cannot be loaded.
static int detect_mobilenetv2(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net net;

    net.opt.use_vulkan_compute = true;

    // "Silence" layers in the param file are created through the Noop creator
    net.register_custom_layer("Silence", Noop_layer_creator);

    // original pretrained model from https://github.com/chuanqi305/MobileNetv2-SSDLite
    // https://github.com/chuanqi305/MobileNetv2-SSDLite/blob/master/ssdlite/voc/deploy.prototxt
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    if (net.load_param("mobilenetv2_ssdlite_voc.param") != 0)
        exit(-1);
    if (net.load_model("mobilenetv2_ssdlite_voc.bin") != 0)
        exit(-1);

    const int target_size = 300;

    const int src_w = bgr.cols;
    const int src_h = bgr.rows;

    // the whole image is resized to target_size x target_size
    ncnn::Mat input = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, src_w, src_h, target_size, target_size);

    const float mean_vals[3] = {127.5f, 127.5f, 127.5f};
    const float norm_vals[3] = {1.0 / 127.5, 1.0 / 127.5, 1.0 / 127.5};
    input.substract_mean_normalize(mean_vals, norm_vals);

    ncnn::Extractor extractor = net.create_extractor();
    extractor.input("data", input);

    ncnn::Mat detections;
    extractor.extract("detection_out", detections);

    //     printf("%d %d %d\n", detections.w, detections.h, detections.c);
    objects.clear();
    for (int r = 0; r < detections.h; r++)
    {
        // row fields as read here: [0] label, [1] score, [2..5] corner
        // coordinates that get multiplied by the source image width/height
        const float* det = detections.row(r);

        const float left = det[2] * src_w;
        const float top = det[3] * src_h;

        Object found;
        found.label = static_cast<int>(det[0]);
        found.prob = det[1];
        found.rect.x = left;
        found.rect.y = top;
        found.rect.width = det[4] * src_w - left;
        found.rect.height = det[5] * src_h - top;

        objects.push_back(found);
    }

    return 0;
}

// Log every detection to stderr, draw its box and a "<class> <score>%" caption
// on a copy of bgr, then show the annotated copy until a key is pressed.
// A label that falls outside the class_names table is captioned "unknown"
// instead of being used to index past the end of the table.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {"background",
                                        "aeroplane", "bicycle", "bird", "boat",
                                        "bottle", "bus", "car", "cat", "chair",
                                        "cow", "diningtable", "dog", "horse",
                                        "motorbike", "person", "pottedplant",
                                        "sheep", "sofa", "train", "tvmonitor"
                                       };
    const int num_class_names = (int)(sizeof(class_names) / sizeof(class_names[0]));

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));

        // the label is copied from the network output; never index the table blindly
        const char* class_name = "unknown";
        if (obj.label >= 0 && obj.label < num_class_names)
            class_name = class_names[obj.label];

        char text[256];
        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // caption goes just above the box, pulled back inside the top and right edges
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        // filled white background behind the black caption text
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_mobilenetv2(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/mobilenetv3ssdlite.cpp
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"
#include "platform.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdio.h>
#include <vector>
#if NCNN_VULKAN
#include "gpu.h"
#endif // NCNN_VULKAN

// Limit v to the closed range [lo, hi] and return a reference to whichever of
// the three arguments is the result. Asserts that hi is not less than lo.
template<class T>
const T& clamp(const T& v, const T& lo, const T& hi)
{
    assert(!(hi < lo));

    if (v < lo)
        return lo;
    if (hi < v)
        return hi;
    return v;
}

// One detection produced by detect_mobilenetv3 and consumed by draw_objects.
struct Object
{
    cv::Rect_<float> rect; // bounding box (x, y, width, height)
    int label;             // class index; draw_objects uses it to index class_names
    float prob;            // detection score; draw_objects only draws entries above 0.6
};

// Run the MobileNetV3-SSDLite VOC model on bgr and fill objects with one entry
// per row of the "detection_out" blob. Always returns 0; the process exits
// with status -1 when the param or model file cannot be loaded.
static int detect_mobilenetv3(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net net;

#if NCNN_VULKAN
    net.opt.use_vulkan_compute = true;
#endif // NCNN_VULKAN

    // converted ncnn model from https://github.com/ujsyehao/mobilenetv3-ssd
    if (net.load_param("./mobilenetv3_ssdlite_voc.param") != 0)
        exit(-1);
    if (net.load_model("./mobilenetv3_ssdlite_voc.bin") != 0)
        exit(-1);

    const int target_size = 300;

    const int src_w = bgr.cols;
    const int src_h = bgr.rows;

    // resized to target_size x target_size with the channel order swapped BGR -> RGB
    ncnn::Mat input = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, src_w, src_h, target_size, target_size);

    const float mean_vals[3] = {123.675f, 116.28f, 103.53f};
    const float norm_vals[3] = {1.0f, 1.0f, 1.0f};
    input.substract_mean_normalize(mean_vals, norm_vals);

    ncnn::Extractor extractor = net.create_extractor();
    extractor.input("input", input);

    ncnn::Mat detections;
    extractor.extract("detection_out", detections);

    // coordinates are clamped to this range in network-input pixels
    const float coord_min = 0.f;
    const float coord_max = float(target_size - 1);

    //     printf("%d %d %d\n", detections.w, detections.h, detections.c);
    objects.clear();
    for (int r = 0; r < detections.h; r++)
    {
        const float* det = detections.row(r);

        // scale to network-input pixels, clamp, then rescale to the source image
        const float x1 = clamp(det[2] * target_size, coord_min, coord_max) / target_size * src_w;
        const float y1 = clamp(det[3] * target_size, coord_min, coord_max) / target_size * src_h;
        const float x2 = clamp(det[4] * target_size, coord_min, coord_max) / target_size * src_w;
        const float y2 = clamp(det[5] * target_size, coord_min, coord_max) / target_size * src_h;

        Object found;
        found.label = static_cast<int>(det[0]);
        found.prob = det[1];
        found.rect.x = x1;
        found.rect.y = y1;
        found.rect.width = x2 - x1;
        found.rect.height = y2 - y1;

        objects.push_back(found);
    }

    return 0;
}

// For every detection whose score is above 0.6: log it to stderr and draw its
// box and a "<class> <score>%" caption on a copy of bgr. The annotated copy is
// then shown until a key is pressed. A label that falls outside the
// class_names table is captioned "unknown" instead of being used to index
// past the end of the table.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {"background",
                                        "aeroplane", "bicycle", "bird", "boat",
                                        "bottle", "bus", "car", "cat", "chair",
                                        "cow", "diningtable", "dog", "horse",
                                        "motorbike", "person", "pottedplant",
                                        "sheep", "sofa", "train", "tvmonitor"
                                       };
    const int num_class_names = (int)(sizeof(class_names) / sizeof(class_names[0]));

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        if (objects[i].prob > 0.6)
        {
            const Object& obj = objects[i];

            fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                    obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

            cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));

            // the label is copied from the network output; never index the table blindly
            const char* class_name = "unknown";
            if (obj.label >= 0 && obj.label < num_class_names)
                class_name = class_names[obj.label];

            char text[256];
            sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);

            int baseLine = 0;
            cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

            // caption goes just above the box, pulled back inside the top and right edges
            int x = obj.rect.x;
            int y = obj.rect.y - label_size.height - baseLine;
            if (y < 0)
                y = 0;
            if (x + label_size.width > image.cols)
                x = image.cols - label_size.width;

            // filled white background behind the black caption text
            cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                          cv::Scalar(255, 255, 255), -1);

            cv::putText(image, text, cv::Point(x, y + label_size.height),
                        cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
        }
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_mobilenetv3(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/nanodet.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdlib.h>
#include <float.h>
#include <stdio.h>
#include <vector>

// One detection: produced by generate_proposals, filtered by
// nms_sorted_bboxes and drawn by draw_objects.
struct Object
{
    cv::Rect_<float> rect; // bounding box (x, y, width, height)
    int label;             // class index; draw_objects uses it to index class_names
    float prob;            // detection score; the sort key of qsort_descent_inplace
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Recursive in-place quicksort of faceobjects[left..right] (both inclusive)
// into descending order of prob. The pivot is the prob of the middle element;
// the two partitions are handed to separate OpenMP sections.
static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
{
    int lo = left;
    int hi = right;
    const float pivot = faceobjects[(left + right) / 2].prob;

    while (lo <= hi)
    {
        // skip entries that already sit on the correct side of the pivot
        for (; faceobjects[lo].prob > pivot; lo++)
        {
        }
        for (; faceobjects[hi].prob < pivot; hi--)
        {
        }

        if (lo > hi)
            break;

        std::swap(faceobjects[lo], faceobjects[hi]);
        lo++;
        hi--;
    }

    #pragma omp parallel sections
    {
        #pragma omp section
        {
            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
        }
        #pragma omp section
        {
            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
        }
    }
}

// Sort the whole vector into descending order of prob; an empty vector is
// left untouched.
static void qsort_descent_inplace(std::vector<Object>& faceobjects)
{
    if (!faceobjects.empty())
        qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
}

// Greedy non-maximum suppression.
//
// Walks faceobjects front to back and keeps a box unless its IoU with an
// already kept box exceeds nms_threshold. detect_nanodet sorts the proposals
// by descending prob before calling, so earlier (higher scoring) boxes win.
//
//   faceobjects    candidate boxes
//   picked         output: cleared, then filled with the indices of kept boxes
//   nms_threshold  IoU above which a box is suppressed
//   agnostic       when false, only boxes with the same label suppress each other
static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = faceobjects.size();

    // every area is needed many times, so compute each one once
    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = faceobjects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = faceobjects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = faceobjects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
            {
                keep = 0;
                // one overlapping kept box is enough; the remaining
                // comparisons cannot change the decision
                break;
            }
        }

        if (keep)
            picked.push_back(i);
    }
}

// Decode the outputs of one detection head into candidate boxes.
//
// For every grid cell the class with the highest score is selected; when that
// score reaches prob_threshold, the four distance distributions of the cell
// are passed through softmax, their expected values (scaled by stride) are
// taken as the left/top/right/bottom distances from the cell centre, and the
// resulting box is appended to objects.
//
//   cls_pred        one row per grid cell, one column per class
//   dis_pred        one row per grid cell, 4 * reg_max_1 values per row
//   stride          cell size in pixels of the padded input
//   in_pad          padded network input; only its w/h are read
//   prob_threshold  minimum score for a cell to produce a box
//   objects         output: boxes are appended, existing entries are kept
//
// The Softmax layer is created on first use and shared by all cells of the
// call rather than being rebuilt for each accepted cell.
static void generate_proposals(const ncnn::Mat& cls_pred, const ncnn::Mat& dis_pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    // a non-positive stride cannot describe a grid and would divide by zero below
    if (stride <= 0)
        return;

    const int num_grid = cls_pred.h;

    // derive the grid shape from the longer side of the padded input
    int num_grid_x;
    int num_grid_y;
    if (in_pad.w > in_pad.h)
    {
        num_grid_x = in_pad.w / stride;
        if (num_grid_x <= 0)
            return;
        num_grid_y = num_grid / num_grid_x;
    }
    else
    {
        num_grid_y = in_pad.h / stride;
        if (num_grid_y <= 0)
            return;
        num_grid_x = num_grid / num_grid_y;
    }

    const int num_class = cls_pred.w;
    const int reg_max_1 = dis_pred.w / 4;

    ncnn::Option softmax_opt;
    softmax_opt.num_threads = 1;
    softmax_opt.use_packing_layout = false;

    ncnn::Layer* softmax = 0; // created lazily, released after the loops

    for (int i = 0; i < num_grid_y; i++)
    {
        for (int j = 0; j < num_grid_x; j++)
        {
            const int idx = i * num_grid_x + j;

            const float* scores = cls_pred.row(idx);

            // find label with max score
            int label = -1;
            float score = -FLT_MAX;
            for (int k = 0; k < num_class; k++)
            {
                if (scores[k] > score)
                {
                    label = k;
                    score = scores[k];
                }
            }

            if (score < prob_threshold)
                continue;

            if (!softmax)
            {
                softmax = ncnn::create_layer("Softmax");

                ncnn::ParamDict pd;
                pd.set(0, 1); // axis
                pd.set(1, 1);
                softmax->load_param(pd);

                softmax->create_pipeline(softmax_opt);
            }

            // 4 rows of reg_max_1 values built on the pointer to row idx of
            // dis_pred; NOTE(review): the in-place softmax presumably
            // overwrites that row of dis_pred - confirm against ncnn::Mat
            ncnn::Mat bbox_pred(reg_max_1, 4, (void*)dis_pred.row(idx));
            softmax->forward_inplace(bbox_pred, softmax_opt);

            // expected value of each distribution, converted to pixels
            float pred_ltrb[4];
            for (int k = 0; k < 4; k++)
            {
                float dis = 0.f;
                const float* dis_after_sm = bbox_pred.row(k);
                for (int l = 0; l < reg_max_1; l++)
                {
                    dis += l * dis_after_sm[l];
                }

                pred_ltrb[k] = dis * stride;
            }

            // centre of the grid cell in padded-input pixels
            float pb_cx = (j + 0.5f) * stride;
            float pb_cy = (i + 0.5f) * stride;

            float x0 = pb_cx - pred_ltrb[0];
            float y0 = pb_cy - pred_ltrb[1];
            float x1 = pb_cx + pred_ltrb[2];
            float y1 = pb_cy + pred_ltrb[3];

            Object obj;
            obj.rect.x = x0;
            obj.rect.y = y0;
            obj.rect.width = x1 - x0;
            obj.rect.height = y1 - y0;
            obj.label = label;
            obj.prob = score;

            objects.push_back(obj);
        }
    }

    if (softmax)
    {
        softmax->destroy_pipeline(softmax_opt);

        delete softmax;
    }
}

// Run the nanodet_m model on bgr and store the detections that survive NMS in
// objects, with boxes mapped back to the source image and clipped to it.
// Always returns 0; the process exits with status -1 when the param or model
// file cannot be loaded.
static int detect_nanodet(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net net;

    net.opt.use_vulkan_compute = true;
    // net.opt.use_bf16_storage = true;

    // original pretrained model from https://github.com/RangiLyu/nanodet
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    if (net.load_param("nanodet_m.param") != 0)
        exit(-1);
    if (net.load_model("nanodet_m.bin") != 0)
        exit(-1);

    const int src_w = bgr.cols;
    const int src_h = bgr.rows;

    const int target_size = 320;
    const float prob_threshold = 0.4f;
    const float nms_threshold = 0.5f;

    // scale so that the longer side becomes target_size
    const bool wider_than_tall = src_w > src_h;
    const float scale = (float)target_size / (wider_than_tall ? src_w : src_h);
    const int w = wider_than_tall ? target_size : (int)(src_w * scale);
    const int h = wider_than_tall ? (int)(src_h * scale) : target_size;

    ncnn::Mat resized = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, src_w, src_h, w, h);

    // pad both sides up to the next multiple of 32, split evenly around the image
    const int wpad = (w + 31) / 32 * 32 - w;
    const int hpad = (h + 31) / 32 * 32 - h;
    const int pad_top = hpad / 2;
    const int pad_bottom = hpad - hpad / 2;
    const int pad_left = wpad / 2;
    const int pad_right = wpad - wpad / 2;

    ncnn::Mat in_pad;
    ncnn::copy_make_border(resized, in_pad, pad_top, pad_bottom, pad_left, pad_right, ncnn::BORDER_CONSTANT, 0.f);

    const float mean_vals[3] = {103.53f, 116.28f, 123.675f};
    const float norm_vals[3] = {0.017429f, 0.017507f, 0.017125f};
    in_pad.substract_mean_normalize(mean_vals, norm_vals);

    ncnn::Extractor extractor = net.create_extractor();

    extractor.input("input.1", in_pad);

    // one entry per detection head: stride plus the names of its two output blobs
    struct Head
    {
        int stride;
        const char* cls_blob;
        const char* dis_blob;
    };
    static const Head heads[3] = {
        {8, "792", "795"},
        {16, "814", "817"},
        {32, "836", "839"}
    };

    std::vector<Object> proposals;
    for (int k = 0; k < 3; k++)
    {
        ncnn::Mat cls_pred;
        ncnn::Mat dis_pred;
        extractor.extract(heads[k].cls_blob, cls_pred);
        extractor.extract(heads[k].dis_blob, dis_pred);

        std::vector<Object> head_objects;
        generate_proposals(cls_pred, dis_pred, heads[k].stride, in_pad, prob_threshold, head_objects);

        proposals.insert(proposals.end(), head_objects.begin(), head_objects.end());
    }

    // highest score first, as nms_sorted_bboxes is fed in that order
    qsort_descent_inplace(proposals);

    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    const int count = picked.size();

    const float max_x = (float)(src_w - 1);
    const float max_y = (float)(src_h - 1);

    objects.resize(count);
    for (int n = 0; n < count; n++)
    {
        Object& det = objects[n];
        det = proposals[picked[n]];

        // remove the padding offset and undo the resize
        float x0 = (det.rect.x - pad_left) / scale;
        float y0 = (det.rect.y - pad_top) / scale;
        float x1 = (det.rect.x + det.rect.width - pad_left) / scale;
        float y1 = (det.rect.y + det.rect.height - pad_top) / scale;

        // keep every corner inside the source image
        x0 = std::max(std::min(x0, max_x), 0.f);
        y0 = std::max(std::min(y0, max_y), 0.f);
        x1 = std::max(std::min(x1, max_x), 0.f);
        y1 = std::max(std::min(y1, max_y), 0.f);

        det.rect.x = x0;
        det.rect.y = y0;
        det.rect.width = x1 - x0;
        det.rect.height = y1 - y0;
    }

    return 0;
}

// Log every detection to stderr, draw its box and a "<class> <score>%" caption
// on a copy of bgr, then show the annotated copy until a key is pressed.
// A label that falls outside the class_names table is captioned "unknown"
// instead of being used to index past the end of the table.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };
    const int num_class_names = (int)(sizeof(class_names) / sizeof(class_names[0]));

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));

        // the label range depends on the loaded model; never index the table blindly
        const char* class_name = "unknown";
        if (obj.label >= 0 && obj.label < num_class_names)
            class_name = class_names[obj.label];

        char text[256];
        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // caption goes just above the box, pulled back inside the top and right edges
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        // filled white background behind the black caption text
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_nanodet(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/nanodetplus_pnnx.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdlib.h>
#include <float.h>
#include <stdio.h>
#include <vector>

// One detection; the element type handled by qsort_descent_inplace and
// nms_sorted_bboxes.
struct Object
{
    cv::Rect_<float> rect; // bounding box (x, y, width, height)
    int label;             // class index; compared by nms_sorted_bboxes unless agnostic is set
    float prob;            // detection score; the sort key of qsort_descent_inplace
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Recursive in-place quicksort of faceobjects[left..right] (both inclusive)
// into descending order of prob. The pivot is the prob of the middle element;
// the two partitions are handed to separate OpenMP sections.
static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
{
    int lo = left;
    int hi = right;
    const float pivot = faceobjects[(left + right) / 2].prob;

    while (lo <= hi)
    {
        // skip entries that already sit on the correct side of the pivot
        for (; faceobjects[lo].prob > pivot; lo++)
        {
        }
        for (; faceobjects[hi].prob < pivot; hi--)
        {
        }

        if (lo > hi)
            break;

        std::swap(faceobjects[lo], faceobjects[hi]);
        lo++;
        hi--;
    }

    #pragma omp parallel sections
    {
        #pragma omp section
        {
            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
        }
        #pragma omp section
        {
            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
        }
    }
}

// Sort the whole vector into descending order of prob; an empty vector is
// left untouched.
static void qsort_descent_inplace(std::vector<Object>& faceobjects)
{
    if (!faceobjects.empty())
        qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
}

// Greedy non-maximum suppression.
//
// Walks faceobjects front to back and keeps a box unless its IoU with an
// already kept box exceeds nms_threshold, so entries earlier in the vector
// take precedence over later ones.
//
//   faceobjects    candidate boxes, in priority order
//   picked         output: cleared, then filled with the indices of kept boxes
//   nms_threshold  IoU above which a box is suppressed
//   agnostic       when false, only boxes with the same label suppress each other
static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = faceobjects.size();

    // every area is needed many times, so compute each one once
    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = faceobjects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = faceobjects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = faceobjects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
            {
                keep = 0;
                // one overlapping kept box is enough; the remaining
                // comparisons cannot change the decision
                break;
            }
        }

        if (keep)
            picked.push_back(i);
    }
}

// Logistic function 1 / (1 + e^-x).
static inline float sigmoid(float x)
{
    return 1.0f / (exp(-x) + 1.0f);
}

// Decode one detection head output into boxes.
//
// pred           head output; channels [0, 80) hold the per-class logits and
//                the remaining channels hold 4 groups of reg_max_1 distance
//                logits (left, top, right, bottom)
// stride         size of one grid cell in input pixels
// in_pad         unused, kept so existing callers keep compiling
// prob_threshold minimum sigmoid(class logit) for a cell to emit a box
// objects        output: decoded boxes are appended (not cleared first)
static void generate_proposals(const ncnn::Mat& pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    (void)in_pad;

    const int num_grid_x = pred.w;
    const int num_grid_y = pred.h;

    const int num_class = 80; // number of classes. 80 for COCO
    const int reg_max_1 = (pred.c - num_class) / 4;

    // The softmax layer is the same for every grid cell, so build it once per
    // call instead of once per accepted proposal.
    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_packing_layout = false;

    ncnn::Layer* softmax = ncnn::create_layer("Softmax");
    {
        ncnn::ParamDict pd;
        pd.set(0, 1); // axis
        pd.set(1, 1); // NOTE(review): presumably the Softmax "fixbug0" flag - confirm
        softmax->load_param(pd);
    }
    softmax->create_pipeline(opt);

    for (int i = 0; i < num_grid_y; i++)
    {
        for (int j = 0; j < num_grid_x; j++)
        {
            // find label with max score
            int label = -1;
            float score = -FLT_MAX;
            for (int k = 0; k < num_class; k++)
            {
                float s = pred.channel(k).row(i)[j];
                if (s > score)
                {
                    label = k;
                    score = s;
                }
            }

            score = sigmoid(score);

            if (score < prob_threshold)
                continue;

            // gather the 4 x reg_max_1 distance logits of this cell
            ncnn::Mat bbox_pred(reg_max_1, 4);
            for (int k = 0; k < reg_max_1 * 4; k++)
            {
                bbox_pred[k] = pred.channel(num_class + k).row(i)[j];
            }

            // turn each row into a discrete distribution over the bins
            softmax->forward_inplace(bbox_pred, opt);

            // expected bin index per side, scaled to input pixels
            float pred_ltrb[4];
            for (int k = 0; k < 4; k++)
            {
                float dis = 0.f;
                const float* dis_after_sm = bbox_pred.row(k);
                for (int l = 0; l < reg_max_1; l++)
                {
                    dis += l * dis_after_sm[l];
                }

                pred_ltrb[k] = dis * stride;
            }

            // the distances are measured from the cell's top-left corner
            float pb_cx = j * stride;
            float pb_cy = i * stride;

            float x0 = pb_cx - pred_ltrb[0];
            float y0 = pb_cy - pred_ltrb[1];
            float x1 = pb_cx + pred_ltrb[2];
            float y1 = pb_cy + pred_ltrb[3];

            Object obj;
            obj.rect.x = x0;
            obj.rect.y = y0;
            obj.rect.width = x1 - x0;
            obj.rect.height = y1 - y0;
            obj.label = label;
            obj.prob = score;

            objects.push_back(obj);
        }
    }

    softmax->destroy_pipeline(opt);
    delete softmax;
}

// Run NanoDet-Plus on a BGR image and write the final detections, in original
// image coordinates, to `objects`. The model files are loaded from the current
// directory; the process exits with -1 when they cannot be loaded.
// Always returns 0.
static int detect_nanodet(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net nanodet;

    nanodet.opt.use_vulkan_compute = true;
    // nanodet.opt.use_bf16_storage = true;

    // original pretrained model from https://github.com/RangiLyu/nanodet
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    // (a 320 variant exists as nanodet-plus-m_320.torchscript.ncnn.param/.bin,
    //  use target_size = 320 with it)
    if (nanodet.load_param("nanodet-plus-m_416.torchscript.ncnn.param"))
        exit(-1);
    if (nanodet.load_model("nanodet-plus-m_416.torchscript.ncnn.bin"))
        exit(-1);

    const int width = bgr.cols;
    const int height = bgr.rows;

    const int target_size = 416;
    const float prob_threshold = 0.4f;
    const float nms_threshold = 0.5f;

    // scale so that the longer side becomes target_size, keeping aspect ratio
    const bool wider_than_tall = width > height;
    const float scale = (float)target_size / (wider_than_tall ? width : height);
    int w;
    int h;
    if (wider_than_tall)
    {
        w = target_size;
        h = height * scale;
    }
    else
    {
        h = target_size;
        w = width * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, width, height, w, h);

    // pad both sides up to the next multiple of 32, image kept centred
    const int wpad = (w + 31) / 32 * 32 - w;
    const int hpad = (h + 31) / 32 * 32 - h;
    const int pad_left = wpad / 2;
    const int pad_top = hpad / 2;

    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, pad_top, hpad - pad_top, pad_left, wpad - pad_left, ncnn::BORDER_CONSTANT, 0.f);

    const float mean_vals[3] = {103.53f, 116.28f, 123.675f};
    const float norm_vals[3] = {0.017429f, 0.017507f, 0.017125f};
    in_pad.substract_mean_normalize(mean_vals, norm_vals);

    ncnn::Extractor ex = nanodet.create_extractor();

    ex.input("in0", in_pad);

    // one output blob per feature-map stride
    struct HeadOutput
    {
        const char* blob_name;
        int stride;
    };
    static const HeadOutput heads[] = {
        {"231", 8},
        {"228", 16},
        {"225", 32},
        {"222", 64}
    };

    std::vector<Object> proposals;
    for (size_t k = 0; k < sizeof(heads) / sizeof(heads[0]); k++)
    {
        ncnn::Mat pred;
        ex.extract(heads[k].blob_name, pred);

        std::vector<Object> level_objects;
        generate_proposals(pred, heads[k].stride, in_pad, prob_threshold, level_objects);

        proposals.insert(proposals.end(), level_objects.begin(), level_objects.end());
    }

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    const int count = picked.size();
    const float max_x = (float)(width - 1);
    const float max_y = (float)(height - 1);

    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        Object& det = objects[i];
        det = proposals[picked[i]];

        // undo the padding offset and the resize
        float x0 = (det.rect.x - pad_left) / scale;
        float y0 = (det.rect.y - pad_top) / scale;
        float x1 = (det.rect.x + det.rect.width - pad_left) / scale;
        float y1 = (det.rect.y + det.rect.height - pad_top) / scale;

        // clip to the original image
        x0 = std::max(std::min(x0, max_x), 0.f);
        y0 = std::max(std::min(y0, max_y), 0.f);
        x1 = std::max(std::min(x1, max_x), 0.f);
        y1 = std::max(std::min(y1, max_y), 0.f);

        det.rect.x = x0;
        det.rect.y = y0;
        det.rect.width = x1 - x0;
        det.rect.height = y1 - y0;
    }

    return 0;
}

// Print every detection to stderr, draw its box and a "<class> <percent>%"
// caption on a copy of the image, then show the result and wait for a key.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };

    cv::Mat image = bgr.clone();

    for (std::vector<Object>::const_iterator it = objects.begin(); it != objects.end(); ++it)
    {
        const Object& det = *it;

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", det.label, det.prob,
                det.rect.x, det.rect.y, det.rect.width, det.rect.height);

        cv::rectangle(image, det.rect, cv::Scalar(255, 0, 0));

        char text[256];
        sprintf(text, "%s %.1f%%", class_names[det.label], det.prob * 100);

        int baseLine = 0;
        const cv::Size text_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // caption sits just above the box, moved as needed to stay on screen
        int text_x = det.rect.x;
        int text_y = det.rect.y - text_size.height - baseLine;
        if (text_y < 0)
        {
            text_y = 0;
        }
        if (text_x + text_size.width > image.cols)
        {
            text_x = image.cols - text_size.width;
        }

        // white background, then black text on top of it
        const cv::Rect text_box = cv::Rect(cv::Point(text_x, text_y), cv::Size(text_size.width, text_size.height + baseLine));
        cv::rectangle(image, text_box, cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(text_x, text_y + text_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_nanodet(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/p2pnet.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"
#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdlib.h>
#include <float.h>
#include <stdio.h>
#include <vector>

// One point predicted by the crowd-counting network.
struct CrowdPoint
{
    // predicted location; detect_crowd() stores it in original image pixels
    cv::Point pt;
    // score of this point; draw_result() only draws points with prob > 0.5
    float prob;
};

// Replicate a per-cell anchor pattern over a w x h grid.
//
// anchor_points         interleaved (x, y) offsets relative to a cell centre
// shifted_anchor_points output, interleaved (x, y): for every grid cell in
//                       row-major order, every anchor offset added to that
//                       cell's centre ((col + 0.5) * stride, (row + 0.5) * stride)
static void shift(int w, int h, int stride, std::vector<float> anchor_points, std::vector<float>& shifted_anchor_points)
{
    // cell centres along each axis
    std::vector<float> centers_x, centers_y;
    for (int col = 0; col < w; col++)
    {
        centers_x.push_back((col + 0.5) * stride);
    }
    for (int row = 0; row < h; row++)
    {
        centers_y.push_back((row + 0.5) * stride);
    }

    // interleaved (x, y) centre of every cell, row-major
    std::vector<float> shifts((size_t)w * h * 2, 0);
    for (int row = 0; row < h; row++)
    {
        for (int col = 0; col < w; col++)
        {
            const int cell = row * w + col;
            shifts[cell * 2] = centers_x[col];
            shifts[cell * 2 + 1] = centers_y[row];
        }
    }

    const size_t anchor_len = anchor_points.size();
    shifted_anchor_points.resize((size_t)2 * w * h * anchor_len / 2, 0);

    const int cell_count = w * h;
    for (int cell = 0; cell < cell_count; cell++)
    {
        const float center_x = shifts[cell * 2];
        const float center_y = shifts[cell * 2 + 1];
        const size_t base = cell * anchor_len / 2 * 2;

        for (size_t k = 0; k < anchor_len / 2; k++)
        {
            shifted_anchor_points[base + k * 2] = anchor_points[k * 2] + center_x;
            shifted_anchor_points[base + k * 2 + 1] = anchor_points[k * 2 + 1] + center_y;
        }
    }
}
// Build the anchor pattern of a single cell: a row x line grid of points
// spread evenly over a stride x stride square centred on the origin.
//
// stride        side length of the cell in pixels
// row, line     number of anchor rows / anchors per row; when either is not
//               positive the output is emptied
// anchor_points output, interleaved (x, y), row-major (x varies fastest)
static void generate_anchor_points(int stride, int row, int line, std::vector<float>& anchor_points)
{
    if (row <= 0 || line <= 0)
    {
        anchor_points.clear();
        return;
    }

    const float row_step = (float)stride / row;
    const float line_step = (float)stride / line;

    // Half the cell size as a real number. The integer expression stride / 2
    // would truncate for odd strides and move the pattern off centre.
    const double half_stride = stride * 0.5;

    std::vector<float> xs, ys;
    for (int i = 1; i <= line; i++)
    {
        xs.push_back((i - 0.5) * line_step - half_stride);
    }
    for (int i = 1; i <= row; i++)
    {
        ys.push_back((i - 0.5) * row_step - half_stride);
    }

    anchor_points.resize((size_t)row * line * 2, 0);
    for (int r = 0; r < row; r++)
    {
        for (int c = 0; c < line; c++)
        {
            const int k = r * line + c;
            anchor_points[k * 2] = xs[c];
            anchor_points[k * 2 + 1] = ys[r];
        }
    }
}
static void generate_anchor_points(int img_w, int img_h, std::vector<int> pyramid_levels, int row, int line, std::vector<float>& all_anchor_points)
{
    std::vector<std::pair<int, int> > image_shapes;
    std::vector<int> strides;
    for (int i = 0; i < pyramid_levels.size(); i++)
    {
        int new_h = std::floor((img_h + std::pow(2, pyramid_levels[i]) - 1) / std::pow(2, pyramid_levels[i]));
        int new_w = std::floor((img_w + std::pow(2, pyramid_levels[i]) - 1) / std::pow(2, pyramid_levels[i]));
        image_shapes.push_back(std::make_pair(new_w, new_h));
        strides.push_back(std::pow(2, pyramid_levels[i]));
    }

    all_anchor_points.clear();
    for (int i = 0; i < pyramid_levels.size(); i++)
    {
        std::vector<float> anchor_points;
        generate_anchor_points(std::pow(2, pyramid_levels[i]), row, line, anchor_points);
        std::vector<float> shifted_anchor_points;
        shift(image_shapes[i].first, image_shapes[i].second, strides[i], anchor_points, shifted_anchor_points);
        all_anchor_points.insert(all_anchor_points.end(), shifted_anchor_points.begin(), shifted_anchor_points.end());
    }
}

// Run P2PNet on a BGR image and append one CrowdPoint per predicted point to
// crowd_points (all of them, no score filtering is done here).
//
// The model files are loaded from the current directory; the process exits
// with -1 when they cannot be loaded.
// Returns 0 on success, -1 when the image is smaller than 128 x 128 or when
// the network outputs cannot be extracted.
static int detect_crowd(const cv::Mat& bgr, std::vector<CrowdPoint>& crowd_points)
{
    ncnn::Option opt;
    opt.num_threads = 4;
    opt.use_vulkan_compute = false;
    opt.use_bf16_storage = false;

    ncnn::Net net;
    net.opt = opt;

    // model is converted from
    // https://github.com/TencentYoutuResearch/CrowdCounting-P2PNet
    // the ncnn model  https://pan.baidu.com/s/1O1CBgvY6yJkrK8Npxx3VMg pwd: ezhx
    if (net.load_param("p2pnet.param"))
        exit(-1);
    if (net.load_model("p2pnet.bin"))
        exit(-1);

    const int width = bgr.cols;
    const int height = bgr.rows;

    // the network input is rounded down to a multiple of 128 on both sides
    const int new_width = width / 128 * 128;
    const int new_height = height / 128 * 128;
    if (new_width <= 0 || new_height <= 0)
    {
        // rounding down would request a zero-sized resize
        fprintf(stderr, "image %d x %d is too small, at least 128 x 128 is required\n", width, height);
        return -1;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, width, height, new_width, new_height);

    // a single pyramid level 3 (stride 8) with a 2 x 2 anchor grid per cell
    std::vector<int> pyramid_levels(1, 3);
    std::vector<float> all_anchor_points;
    generate_anchor_points(in.w, in.h, pyramid_levels, 2, 2, all_anchor_points);

    // wraps the vector's memory without copying; all_anchor_points must stay
    // alive until inference is done
    ncnn::Mat anchor_points = ncnn::Mat(2, all_anchor_points.size() / 2, all_anchor_points.data());

    ncnn::Extractor ex = net.create_extractor();
    const float mean_vals1[3] = {123.675f, 116.28f, 103.53f};
    const float norm_vals1[3] = {0.01712475f, 0.0175f, 0.01742919f};

    in.substract_mean_normalize(mean_vals1, norm_vals1);

    ex.input("input", in);
    ex.input("anchor", anchor_points);

    ncnn::Mat score, points;
    if (ex.extract("pred_scores", score) != 0 || ex.extract("pred_points", points) != 0)
    {
        fprintf(stderr, "p2pnet extract failed\n");
        return -1;
    }

    // never read past the shorter of the two outputs
    const int num_points = points.h < score.h ? points.h : score.h;
    for (int i = 0; i < num_points; i++)
    {
        const float* score_data = score.row(i);
        const float* points_data = points.row(i);

        // map from network input coordinates back to the original image
        CrowdPoint cp;
        int x = points_data[0] / new_width * width;
        int y = points_data[1] / new_height * height;
        cp.pt = cv::Point(x, y);
        cp.prob = score_data[1];
        crowd_points.push_back(cp);
    }

    return 0;
}

// Draw a filled red dot for every point whose score exceeds 0.5 on a copy of
// the image, then show it and wait for a key press.
static void draw_result(const cv::Mat& bgr, const std::vector<CrowdPoint>& crowd_points)
{
    const float threshold = 0.5f;

    cv::Mat image = bgr.clone();
    for (size_t k = 0; k < crowd_points.size(); k++)
    {
        const CrowdPoint& cp = crowd_points[k];
        if (!(cp.prob > threshold))
            continue;

        cv::circle(image, cp.pt, 4, cv::Scalar(0, 0, 255), -1, 8, 0);
    }

    cv::imshow("image", image);
    cv::waitKey();
}
int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat bgr = cv::imread(imagepath, 1);
    if (bgr.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<CrowdPoint> crowd_points;
    detect_crowd(bgr, crowd_points);
    draw_result(bgr, crowd_points);

    return 0;
}


================================================
FILE: examples/peleenetssd_seg.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdio.h>
#include <vector>

// One detection produced by detect_peleenet().
struct Object
{
    // bounding box; detect_peleenet() stores it in original image pixels
    cv::Rect_<float> rect;
    // class id, used by draw_objects() as an index into its class_names table
    int label;
    // detection score; draw_objects() prints it as a percentage
    float prob;
};

// Run the Pelee SSD + segmentation network on a BGR image.
//
// objects  output: replaced with the detections, boxes in image pixels
// resized  output: the "sigmoid" segmentation blob, bilinearly resized to the
//          image size
// The model files are loaded from the current directory; the process exits
// with -1 when they cannot be loaded. Always returns 0.
static int detect_peleenet(const cv::Mat& bgr, std::vector<Object>& objects, ncnn::Mat& resized)
{
    ncnn::Net peleenet;

    peleenet.opt.use_vulkan_compute = true;

    // model is converted from https://github.com/eric612/MobileNet-YOLO
    // and can be downloaded from https://drive.google.com/open?id=1Wt6jKv13sBRMHgrGAJYlOlRF-o80pC0g
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    if (peleenet.load_param("pelee.param"))
        exit(-1);
    if (peleenet.load_model("pelee.bin"))
        exit(-1);

    const int target_size = 304;

    const int img_w = bgr.cols;
    const int img_h = bgr.rows;

    // the network takes a fixed square input, aspect ratio is not preserved
    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, target_size, target_size);

    const float mean_vals[3] = {103.9f, 116.7f, 123.6f};
    const float norm_vals[3] = {0.017f, 0.017f, 0.017f};
    in.substract_mean_normalize(mean_vals, norm_vals);

    ncnn::Extractor ex = peleenet.create_extractor();

    ex.input("data", in);

    // detection branch: one row per box
    ncnn::Mat out;
    ex.extract("detection_out", out);

    objects.clear();
    for (int row = 0; row < out.h; row++)
    {
        // row layout: label, score, then x0 y0 x1 y1 scaled by the image size below
        const float* det = out.row(row);

        Object object;
        object.label = det[0];
        object.prob = det[1];
        object.rect.x = det[2] * img_w;
        object.rect.y = det[3] * img_h;
        object.rect.width = det[4] * img_w - object.rect.x;
        object.rect.height = det[5] * img_h - object.rect.y;

        objects.push_back(object);
    }

    // segmentation branch, brought back to image resolution
    ncnn::Mat seg_out;
    ex.extract("sigmoid", seg_out);
    resize_bilinear(seg_out, resized, img_w, img_h);
    // resize_bicubic(seg_out, resized, img_w, img_h) would give a sharper map

    return 0;
}

// Draw detections and the segmentation overlay on a copy of the image, then
// show it and wait for a key press.
//
// bgr      source image, expected to be 3-channel 8-bit (as loaded by main)
// objects  detections to print to stderr and draw
// map      segmentation scores, one channel per class, image resolution;
//          a pixel is tinted with the colour of its best class when that
//          score is above 0.45 and a colour is defined for the class
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects, ncnn::Mat map)
{
    static const char* class_names[] = {"background",
                                        "person", "rider", "car", "bus",
                                        "truck", "bike", "motor",
                                        "traffic light", "traffic sign", "train"
                                       };
    const int class_count = sizeof(class_names) / sizeof(class_names[0]);

    cv::Mat image = bgr.clone();
    // BGR triplets, one per segmentation class that gets an overlay
    const int color[] = {128, 255, 128, 244, 35, 232};
    const int color_count = sizeof(color) / sizeof(int);

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));

        // never index the name table with a label the model should not produce
        const char* class_name = (obj.label >= 0 && obj.label < class_count) ? class_names[obj.label] : "unknown";

        char text[256];
        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    // stay inside both the map and the image
    const int width = map.w < image.cols ? map.w : image.cols;
    const int height = map.h < image.rows ? map.h : image.rows;
    const int size = map.c;
    const float threshold = 0.45f;

    std::vector<const float*> channel_rows(size > 0 ? size : 0);
    for (int i = 0; i < height; i++)
    {
        unsigned char* ptr1 = image.ptr<unsigned char>(i);

        // Channel planes of an ncnn::Mat are cstep elements apart, which can
        // be more than w * h, so each plane is addressed through channel().
        for (int c = 0; c < size; c++)
        {
            channel_rows[c] = map.channel(c).row(i);
        }

        for (int j = 0; j < width; j++)
        {
            // best class above the threshold, -1 when there is none
            float maxima = threshold;
            int index = -1;
            for (int c = 0; c < size; c++)
            {
                const float v = channel_rows[c][j];
                if (v > maxima)
                {
                    maxima = v;
                    index = c;
                }
            }

            if (index > -1)
            {
                const int color_index = index * 3;
                if (color_index + 2 < color_count)
                {
                    // 50/50 blend of the class colour and the pixel
                    unsigned char* px = ptr1 + j * 3;
                    px[0] = color[color_index] / 2 + px[0] / 2;
                    px[1] = color[color_index + 1] / 2 + px[1] / 2;
                    px[2] = color[color_index + 2] / 2 + px[2] / 2;
                }
            }
        }
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    ncnn::Mat seg_out;
    detect_peleenet(m, objects, seg_out);

    draw_objects(m, objects, seg_out);

    return 0;
}


================================================
FILE: examples/piper.cpp
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// convert piper checkpoints to ncnn models
//  1. checkout https://github.com/OHF-Voice/piper1-gpl (113931937cf235fc881afd1ca4be209bc6919bc7)
//  2. apply patch piper1-gpl.patch from https://github.com/nihui/ncnn-android-piper
//  3. setup piper with
//      python3 -m venv .venv
//      source .venv/bin/activate
//      python3 -m pip install -e .[train]
//  4. download piper checkpoint file (*.ckpt) from https://huggingface.co/datasets/rhasspy/piper-checkpoints
//  5. install pnnx via pip install -U pnnx
//  6. obtain export_ncnn.py script from https://github.com/nihui/ncnn-android-piper
//      python export_ncnn.py en.ckpt

// convert word list to simple phonemizer dict
//  1. prepare word list from https://github.com/Alexir/CMUdict
//  2. for each word, get phonemes via command "./espeak-ng -q -v en-us --ipa word"
//  3. obtain config.json file from https://huggingface.co/datasets/rhasspy/piper-checkpoints
//  4. replace phonemes with ids according to phoneme_id_map in config.json
//  5. write dict binary
//      word1 \0x00 ids1 \0xff word2 \0x00 ids2 \0xff .....

#include "layer.h"
#include "mat.h"
#include "net.h"

#include <ctype.h>
#include <stdio.h>
#include <map>
#include <vector>

// Custom layer: scatters per-row relative-position values into absolute
// positions.
//
// Input  : w = number of relative offsets, h = len, c = heads
// Output : w = len, h = len, c = heads, zero filled; for row i the input
//          entry for relative offset d (d = 0 is window_size positions to the
//          left) lands in output column i - window_size + d, entries falling
//          outside [0, len) are dropped.
class relative_embeddings_k_module : public ncnn::Layer
{
public:
    relative_embeddings_k_module()
    {
        one_blob_only = true;
    }

    virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const
    {
        const int window_size = 4;

        const int wsize = bottom_blob.w;
        const int len = bottom_blob.h;
        const int num_heads = bottom_blob.c;

        top_blob.create(len, len, num_heads);
        top_blob.fill(0.f);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < num_heads; q++)
        {
            const ncnn::Mat x0 = bottom_blob.channel(q);
            ncnn::Mat out0 = top_blob.channel(q);

            for (int i = 0; i < len; i++)
            {
                // absolute columns covered by the window of row i, clipped
                const int dst_begin = std::max(i - window_size, 0);
                const int dst_end = std::min(len, i - window_size + wsize);
                // relative offsets left of column 0 are skipped
                const int src_begin = std::max(0, window_size - i);

                const float* src = x0.row(i) + src_begin;
                float* dst = out0.row(i) + dst_begin;

                const int count = dst_end - dst_begin;
                for (int j = 0; j < count; j++)
                {
                    dst[j] = src[j];
                }
            }
        }

        return 0;
    }
};

DEFINE_LAYER_CREATOR(relative_embeddings_k_module)

// Custom layer: gathers, for every row, the values inside a fixed window
// around the diagonal into relative-position order.
//
// Input  : h = len rows, c = heads; columns are read as absolute positions
// Output : w = 2 * window_size + 1, h = len, c = heads, zero filled; for row i
//          the input column i - window_size + d is stored at output column d,
//          columns outside [0, len) are left at zero.
class relative_embeddings_v_module : public ncnn::Layer
{
public:
    relative_embeddings_v_module()
    {
        one_blob_only = true;
    }

    virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const
    {
        const int window_size = 4;

        const int wsize = window_size * 2 + 1;
        const int len = bottom_blob.h;
        const int num_heads = bottom_blob.c;

        top_blob.create(wsize, len, num_heads);
        top_blob.fill(0.f);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < num_heads; q++)
        {
            const ncnn::Mat x0 = bottom_blob.channel(q);
            ncnn::Mat out0 = top_blob.channel(q);

            for (int i = 0; i < len; i++)
            {
                // absolute columns covered by the window of row i, clipped
                const int src_begin = std::max(i - window_size, 0);
                const int src_end = std::min(len, i - window_size + wsize);
                // relative slots left of column 0 stay zero
                const int dst_begin = std::max(0, window_size - i);

                const float* src = x0.row(i) + src_begin;
                float* dst = out0.row(i) + dst_begin;

                const int count = src_end - src_begin;
                for (int j = 0; j < count; j++)
                {
                    dst[j] = src[j];
                }
            }
        }

        return 0;
    }
};

DEFINE_LAYER_CREATOR(relative_embeddings_v_module)

// Custom layer: element-wise piecewise rational-quadratic spline transform
// with linear (identity) tails.
//
// bottom_blobs[0] (h)  : spline parameters, one row per element; each row is
//                        read as num_bins widths, num_bins heights and
//                        num_bins - 1 interior derivatives (29 floats)
// bottom_blobs[1] (x1) : the values to transform, x1.w of them are processed
// top_blobs[0]         : clone of x1 with every value inside
//                        [-tail_bound, tail_bound] replaced by its transform
//
// All hyper-parameters (num_bins, filter_channels, tail_bound, reverse, ...)
// are hard-coded below; `reverse` is true, so only the inverse branch runs.
class piecewise_rational_quadratic_transform_module : public ncnn::Layer
{
public:
    piecewise_rational_quadratic_transform_module()
    {
        // the layer takes two inputs, so the multi-blob forward() is used
        one_blob_only = false;
    }

    virtual int forward(const std::vector<ncnn::Mat>& bottom_blobs, std::vector<ncnn::Mat>& top_blobs, const ncnn::Option& opt) const
    {
        const ncnn::Mat& h = bottom_blobs[0];
        const ncnn::Mat& x1 = bottom_blobs[1];
        ncnn::Mat& outputs = top_blobs[0];

        const int num_bins = 10;
        const int filter_channels = 192;
        const bool reverse = true;
        const float tail_bound = 5.0f;
        const float DEFAULT_MIN_BIN_WIDTH = 1e-3f;
        const float DEFAULT_MIN_BIN_HEIGHT = 1e-3f;
        const float DEFAULT_MIN_DERIVATIVE = 1e-3f;

        const int batch_size = x1.w;
        // documents the row layout of h; not referenced by the code below
        const int h_params_per_item = 2 * num_bins + (num_bins - 1); // 29

        // start from a copy so that out-of-range values pass through unchanged
        outputs = x1.clone();

        float* out_ptr = outputs;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < batch_size; ++i)
        {
            const float current_x = ((const float*)x1)[i];

            const float* h_data = h.row(i);

            // outside the spline range the transform is the identity
            if (current_x < -tail_bound || current_x > tail_bound)
            {
                continue;
            }

            std::vector<float> unnormalized_widths(num_bins);
            std::vector<float> unnormalized_heights(num_bins);
            std::vector<float> unnormalized_derivatives(num_bins + 1);

            // widths and heights are scaled by 1 / sqrt(filter_channels),
            // the derivatives are taken as they are
            const float inv_sqrt_filter_channels = 1.0f / sqrtf(filter_channels);
            for (int j = 0; j < num_bins; ++j)
            {
                unnormalized_widths[j] = h_data[j] * inv_sqrt_filter_channels;
            }
            for (int j = 0; j < num_bins; ++j)
            {
                unnormalized_heights[j] = h_data[num_bins + j] * inv_sqrt_filter_channels;
            }
            for (int j = 0; j < num_bins - 1; ++j)
            {
                unnormalized_derivatives[j + 1] = h_data[2 * num_bins + j];
            }

            // Boundary derivatives: softplus(constant) = 1 - DEFAULT_MIN_DERIVATIVE,
            // so after the softplus step below both end derivatives come out as 1.
            const float constant = logf(expf(1.f - DEFAULT_MIN_DERIVATIVE) - 1.f);
            unnormalized_derivatives[0] = constant;
            unnormalized_derivatives[num_bins] = constant;

            const float left = -tail_bound, right = tail_bound;
            const float bottom = -tail_bound, top = tail_bound;

            // Softmax + Affine
            // (max-subtracted softmax, then rescaled so every bin is at least
            //  DEFAULT_MIN_BIN_WIDTH wide and the widths still sum to 1)
            std::vector<float> widths(num_bins);
            float w_max = -INFINITY;
            for (float val : unnormalized_widths) w_max = std::max(w_max, val);
            float w_sum = 0.f;
            for (int j = 0; j < num_bins; ++j)
            {
                widths[j] = expf(unnormalized_widths[j] - w_max);
                w_sum += widths[j];
            }
            for (int j = 0; j < num_bins; ++j)
            {
                widths[j] = DEFAULT_MIN_BIN_WIDTH + (1.f - DEFAULT_MIN_BIN_WIDTH * num_bins) * (widths[j] / w_sum);
            }

            // cumwidths
            // (knot x positions in [left, right]; both ends are pinned exactly)
            std::vector<float> cumwidths(num_bins + 1);
            cumwidths[0] = left;
            float current_w_sum = 0.f;
            for (int j = 0; j < num_bins - 1; ++j)
            {
                current_w_sum += widths[j];
                cumwidths[j + 1] = left + (right - left) * current_w_sum;
            }
            cumwidths[num_bins] = right;

            // heights
            // (same softmax + affine treatment as the widths)
            std::vector<float> heights(num_bins);
            float h_max = -INFINITY;
            for (float val : unnormalized_heights) h_max = std::max(h_max, val);
            float h_sum = 0.f;
            for (int j = 0; j < num_bins; ++j)
            {
                heights[j] = expf(unnormalized_heights[j] - h_max);
                h_sum += heights[j];
            }
            for (int j = 0; j < num_bins; ++j)
            {
                heights[j] = DEFAULT_MIN_BIN_HEIGHT + (1.f - DEFAULT_MIN_BIN_HEIGHT * num_bins) * (heights[j] / h_sum);
            }

            // cumheights
            // (knot y positions in [bottom, top]; both ends are pinned exactly)
            std::vector<float> cumheights(num_bins + 1);
            cumheights[0] = bottom;
            float current_h_sum = 0.f;
            for (int j = 0; j < num_bins - 1; ++j)
            {
                current_h_sum += heights[j];
                cumheights[j + 1] = bottom + (top - bottom) * current_h_sum;
            }
            cumheights[num_bins] = top;

            // Softplus
            // (both branches equal log(1 + exp(x)); the split keeps the
            //  argument of expf non-positive)
            std::vector<float> derivatives(num_bins + 1);
            for (int j = 0; j < num_bins + 1; ++j)
            {
                float x = unnormalized_derivatives[j];
                derivatives[j] = DEFAULT_MIN_DERIVATIVE + (x > 0 ? x + logf(1.f + expf(-x)) : logf(1.f + expf(x)));
            }

            // bin_idx
            // (the inverse direction searches the y knots, the forward
            //  direction the x knots; the result is clamped to a valid bin)
            int bin_idx = 0;
            if (reverse)
            {
                auto it = std::upper_bound(cumheights.begin(), cumheights.end(), current_x);
                bin_idx = std::distance(cumheights.begin(), it) - 1;
            }
            else
            {
                auto it = std::upper_bound(cumwidths.begin(), cumwidths.end(), current_x);
                bin_idx = std::distance(cumwidths.begin(), it) - 1;
            }
            bin_idx = std::max(0, std::min(bin_idx, num_bins - 1));

            // collect coeffs
            const float input_cumwidths = cumwidths[bin_idx];
            const float input_bin_widths = cumwidths[bin_idx + 1] - cumwidths[bin_idx];
            const float input_cumheights = cumheights[bin_idx];
            const float input_heights = cumheights[bin_idx + 1] - cumheights[bin_idx];
            const float input_derivatives = derivatives[bin_idx];
            const float input_derivatives_plus_one = derivatives[bin_idx + 1];
            // average slope of the selected bin
            const float delta = input_heights / input_bin_widths;

            // apply transform
            if (reverse)
            {
                // solve a * r^2 + b * r + c = 0 for the position r inside the
                // bin, using the root 2c / (-b - sqrt(b^2 - 4ac)); a negative
                // discriminant is clamped to zero
                float a = (current_x - input_cumheights) * (input_derivatives + input_derivatives_plus_one - 2 * delta) + input_heights * (delta - input_derivatives);
                float b = input_heights * input_derivatives - (current_x - input_cumheights) * (input_derivatives + input_derivatives_plus_one - 2 * delta);
                float c = -delta * (current_x - input_cumheights);
                float discriminant = b * b - 4 * a * c;
                discriminant = std::max(0.f, discriminant);
                float root = (2 * c) / (-b - sqrtf(discriminant));
                out_ptr[i] = root * input_bin_widths + input_cumwidths;
            }
            else
            {
                // forward direction: rational-quadratic interpolation at the
                // relative position theta inside the bin
                float theta = (current_x - input_cumwidths) / input_bin_widths;
                float theta_one_minus_theta = theta * (1 - theta);
                float numerator = input_heights * (delta * theta * theta + input_derivatives * theta_one_minus_theta);
                float denominator = delta + ((input_derivatives + input_derivatives_plus_one - 2 * delta) * theta_one_minus_theta);
                out_ptr[i] = input_cumheights + numerator / denominator;
            }
        }

        return 0;
    }
};

DEFINE_LAYER_CREATOR(piecewise_rational_quadratic_transform_module)

// Tell whether a token starts with a sentence-ending punctuation mark.
// Only the first character of word is inspected.
static bool is_word_eos(const char* word)
{
    switch (word[0])
    {
    case ',':
    case '.':
    case ';':
    case '?':
    case '!':
        return true;
    default:
        return false;
    }
}

// Look up word (case-insensitively) in the dictionary index.
//
// dict maps the upper-cased first byte of a word to the list of dictionary
// entries starting with that byte. Each entry is a NUL-terminated word that is
// immediately followed in memory by its id bytes.
//
// On a hit, ids is set to the first byte after the entry's terminating NUL;
// on a miss, ids is set to 0.
static void find_word_id(const std::map<unsigned int, std::vector<const char*> >& dict, const char* word, const unsigned char*& ids)
{
    ids = 0;

    // toupper() is only defined for values representable as unsigned char (or EOF),
    // so convert before calling it instead of passing a possibly negative char
    const unsigned char first_char = (unsigned char)toupper((unsigned char)word[0]);

    // single lookup instead of find() followed by at()
    const std::map<unsigned int, std::vector<const char*> >::const_iterator bucket = dict.find(first_char);
    if (bucket == dict.end())
        return;

    const std::vector<const char*>& wordlist = bucket->second;
    for (size_t i = 0; i < wordlist.size(); i++)
    {
        const char* entry = wordlist[i];
        if (strcasecmp(entry, word) == 0)
        {
            // hit, the ids start right after the word's NUL terminator
            ids = (const unsigned char*)(entry + strlen(entry) + 1);
            return;
        }
    }
}

// Convert text into the id sequence fed to the encoder network.
//
// This is a very simple g2p function, it works for english only.
//
// The dictionary is read from "en-word_id.bin" in the current directory. As
// parsed here, the file is a sequence of records, each being a NUL-terminated
// word followed by id bytes and closed by a 0xff byte.
//
// Ids are appended to sequence_ids, each one followed by ID_PAD. Words that are
// not in the dictionary are spelled character by character. If the dictionary
// cannot be read, a message is printed and sequence_ids is left untouched.
static void simple_phonemize(const char* text, std::vector<int>& sequence_ids)
{
    // load dict buffer
    std::vector<unsigned char> dictbinbuf;
    {
        FILE* fp = fopen("en-word_id.bin", "rb");
        if (!fp)
        {
            fprintf(stderr, "fopen en-word_id.bin failed\n");
            return;
        }

        // ftell() returns -1 on failure, which must never reach resize()
        long len = -1;
        if (fseek(fp, 0, SEEK_END) == 0)
            len = ftell(fp);
        if (len < 0)
        {
            fprintf(stderr, "cannot determine the size of en-word_id.bin\n");
            fclose(fp);
            return;
        }
        rewind(fp);

        dictbinbuf.resize((size_t)len);

        size_t nread = 0;
        if (len > 0)
            nread = fread(dictbinbuf.data(), 1, (size_t)len, fp);

        fclose(fp);

        if (nread != (size_t)len)
        {
            // keep only what was actually read, records are indexed by their 0xff terminator
            fprintf(stderr, "short read on en-word_id.bin\n");
            dictbinbuf.resize(nread);
        }
    }

    // build dict, entries point into dictbinbuf and are grouped by upper-cased first byte
    std::map<unsigned int, std::vector<const char*> > dict;
    {
        const unsigned char* p = dictbinbuf.data();
        const char* word = (const char*)p;
        for (size_t i = 0; i < dictbinbuf.size(); i++)
        {
            if (dictbinbuf[i] == 0xff)
            {
                // convert to unsigned char first, toupper() is undefined for negative char values
                unsigned int first_char = (unsigned char)toupper((unsigned char)word[0]);
                dict[first_char].push_back(word);

                // the next record starts right after the 0xff terminator
                word = (const char*)(p + i + 1);
            }
        }
    }

    // phonemize mainpart
    {
        const int ID_PAD = 0;   // interleaved
        const int ID_BOS = 1;   // beginning of sentence
        const int ID_EOS = 2;   // end of sentence
        const int ID_SPACE = 3; // space

        bool last_char_is_control = false;
        bool sentence_begin = true;
        bool sentence_end = true;

        char word[256];

        const char* p = text;
        while (*p)
        {
            if (sentence_end && !last_char_is_control)
            {
                sequence_ids.push_back(ID_BOS);
                sequence_ids.push_back(ID_PAD);
                sentence_end = false;
            }

            // separate from the previous word, except for the very first word
            // and while skipping a run of non-alphanumeric characters
            if (!sentence_begin && !last_char_is_control)
            {
                // space id
                sequence_ids.push_back(ID_SPACE);
                sequence_ids.push_back(ID_PAD);
            }

            if (isalnum((unsigned char)*p))
            {
                char* pword = word;

                // alpha or number
                *pword++ = *p++;

                // consume word, capped well below sizeof(word)
                int wordlen = 1;
                while (isalnum((unsigned char)*p) && wordlen < 233)
                {
                    *pword++ = *p++;
                    wordlen++;
                }

                *pword = '\0';

                // NOTE(review): word[0] is always alphanumeric here, so this punctuation
                // test cannot succeed and punctuation is handled by the skip branch below.
                // Kept as is, confirm the intended sentence splitting before changing it.
                if (is_word_eos(word))
                {
                    if (!sentence_end)
                        sequence_ids.push_back(ID_EOS);
                    sentence_end = true;
                    last_char_is_control = false;
                    sentence_begin = false;
                    continue;
                }

                const unsigned char* ids = 0;
                find_word_id(dict, word, ids);
                if (ids)
                {
                    // known word, copy its ids up to the 0xff terminator
                    const unsigned char* pids = ids;
                    while (*pids != 0xff)
                    {
                        sequence_ids.push_back(*pids);
                        sequence_ids.push_back(ID_PAD);
                        pids++;
                    }
                }
                else
                {
                    // no such word, spell alphabet one by one
                    const size_t nchars = strlen(word);
                    char tmp[2] = {'\0', '\0'};
                    for (size_t i = 0; i < nchars; i++)
                    {
                        tmp[0] = word[i];
                        find_word_id(dict, tmp, ids);
                        if (ids)
                        {
                            const unsigned char* pids = ids;
                            while (*pids != 0xff)
                            {
                                sequence_ids.push_back(*pids);
                                sequence_ids.push_back(ID_PAD);
                                pids++;
                            }

                            // space between spelled characters, but not after the last one
                            if (i + 1 != nchars)
                            {
                                sequence_ids.push_back(ID_SPACE);
                                sequence_ids.push_back(ID_PAD);
                            }
                        }
                        else
                        {
                            fprintf(stderr, "word char %c not recognized\n", word[i]);
                        }
                    }
                }

                last_char_is_control = false;
                sentence_begin = false;
                continue;
            }
            else
            {
                // skip control character
                p++;
                last_char_is_control = true;
            }
        }

        if (!sentence_end)
            sequence_ids.push_back(ID_EOS);
    }
}

// Expand per-token statistics into a per-frame latent z_p.
//
// Token t is repeated ceil(exp(logw[t]) * length_scale) times. Every output
// element is m_p + u * exp(logs_p) * noise_scale, with u = rand() / RAND_MAX.
// z_p is created with one row per row of m_p and one column per output frame.
static void path_attention(const ncnn::Mat& logw, const ncnn::Mat& m_p, const ncnn::Mat& logs_p, float noise_scale, float length_scale, ncnn::Mat& z_p)
{
    const int num_tokens = logw.w;

    // m_p and logs_p are expected to have the same number of rows
    const int num_rows = m_p.h;

    // frame count of every token and their total
    std::vector<int> frames(num_tokens);
    int total_frames = 0;
    for (int t = 0; t < num_tokens; t++)
    {
        const int count = (int)ceilf(expf(logw[t]) * length_scale);
        frames[t] = count;
        total_frames += count;
    }

    z_p.create(total_frames, num_rows);

    for (int r = 0; r < num_rows; r++)
    {
        const float* mean_row = m_p.row(r);
        const float* logs_row = logs_p.row(r);
        float* out_row = z_p.row(r);

        int offset = 0;
        for (int t = 0; t < num_tokens; t++)
        {
            const float mean = mean_row[t];
            const float spread = expf(logs_row[t]) * noise_scale;
            const int count = frames[t];

            // fill the frames that belong to token t
            float* dst = out_row + offset;
            for (int k = 0; k < count; k++)
            {
                dst[k] = mean + (rand() / (float)RAND_MAX) * spread;
            }

            offset += count;
        }
    }
}

// Synthesize speech for text with the given speaker and store 16bit samples in pcm.
//
// Pipeline: simple_phonemize -> enc_p -> emb_g -> dp -> path_attention -> flow -> dec,
// followed by peak normalization and conversion to 16bit pcm. The model files
// (en_*.ncnn.param / en_*.ncnn.bin) are loaded from the current directory.
//
// Returns 0 on success. Returns -1, leaving pcm empty, when the text yields no
// ids, a model file cannot be loaded, or a network output cannot be extracted.
static int tts_piper(const char* text, int speaker_id, std::vector<short>& pcm)
{
    // zh models could be found at
    // https://github.com/nihui/ncnn-android-piper/tree/master/app/src/main/assets

    // never hand back stale samples on failure
    pcm.clear();

    // hyper parameters from https://huggingface.co/datasets/rhasspy/piper-checkpoints/blob/main/en/en_US/libritts_r/medium/config.json
    const float noise_scale = 0.333f;
    const float length_scale = 1.f;
    const float noise_scale_w = 0.333f;

    // phonemize
    ncnn::Mat sequence;
    {
        std::vector<int> sequence_ids;
        simple_phonemize(text, sequence_ids);

        // nothing to synthesize, e.g. empty text or missing dictionary
        if (sequence_ids.empty())
        {
            fprintf(stderr, "phonemize produced no ids\n");
            return -1;
        }

        const int sequence_length = (int)sequence_ids.size();

        sequence.create(sequence_length);
        if (sequence.empty())
        {
            fprintf(stderr, "allocate sequence failed\n");
            return -1;
        }

        // the ids are stored as raw 32bit integers inside the mat
        memcpy(sequence, sequence_ids.data(), sequence_length * sizeof(int));
    }

    // enc_p
    ncnn::Mat x;
    ncnn::Mat m_p;
    ncnn::Mat logs_p;
    {
        ncnn::Net enc_p;
        enc_p.opt.use_vulkan_compute = true;
        enc_p.register_custom_layer("piper.train.vits.attentions.relative_embeddings_k_module", relative_embeddings_k_module_layer_creator);
        enc_p.register_custom_layer("piper.train.vits.attentions.relative_embeddings_v_module", relative_embeddings_v_module_layer_creator);
        if (enc_p.load_param("en_enc_p.ncnn.param") != 0 || enc_p.load_model("en_enc_p.ncnn.bin") != 0)
        {
            fprintf(stderr, "load en_enc_p model failed\n");
            return -1;
        }

        ncnn::Extractor ex = enc_p.create_extractor();

        ex.input("in0", sequence);

        if (ex.extract("out0", x) != 0 || ex.extract("out1", m_p) != 0 || ex.extract("out2", logs_p) != 0)
        {
            fprintf(stderr, "extract en_enc_p outputs failed\n");
            return -1;
        }
    }

    // emb_g
    ncnn::Mat g;
    {
        ncnn::Net emb_g;
        emb_g.opt.use_vulkan_compute = true;
        if (emb_g.load_param("en_emb_g.ncnn.param") != 0 || emb_g.load_model("en_emb_g.ncnn.bin") != 0)
        {
            fprintf(stderr, "load en_emb_g model failed\n");
            return -1;
        }

        // the speaker id is passed as a single raw integer
        ncnn::Mat speaker_id_mat(1);
        {
            int* p = speaker_id_mat;
            p[0] = speaker_id;
        }

        ncnn::Extractor ex = emb_g.create_extractor();

        ex.input("in0", speaker_id_mat);

        if (ex.extract("out0", g) != 0)
        {
            fprintf(stderr, "extract en_emb_g output failed\n");
            return -1;
        }

        // turn the embedding vector into a column
        g = g.reshape(1, g.w);
    }

    // dp
    ncnn::Mat logw;
    {
        ncnn::Net dp;
        dp.opt.use_vulkan_compute = true;
        dp.register_custom_layer("piper.train.vits.modules.piecewise_rational_quadratic_transform_module", piecewise_rational_quadratic_transform_module_layer_creator);
        if (dp.load_param("en_dp.ncnn.param") != 0 || dp.load_model("en_dp.ncnn.bin") != 0)
        {
            fprintf(stderr, "load en_dp model failed\n");
            return -1;
        }

        ncnn::Mat noise(x.w, 2);
        for (int i = 0; i < noise.w * noise.h; i++)
        {
            noise[i] = rand() / (float)RAND_MAX * noise_scale_w;
        }

        ncnn::Extractor ex = dp.create_extractor();

        ex.input("in0", x);
        ex.input("in1", noise);
        ex.input("in2", g);

        if (ex.extract("out0", logw) != 0)
        {
            fprintf(stderr, "extract en_dp output failed\n");
            return -1;
        }
    }

    // path attention
    ncnn::Mat z_p;
    {
        path_attention(logw, m_p, logs_p, noise_scale, length_scale, z_p);
    }

    // flow
    ncnn::Mat z;
    {
        ncnn::Net flow;
        flow.opt.use_vulkan_compute = true;
        if (flow.load_param("en_flow.ncnn.param") != 0 || flow.load_model("en_flow.ncnn.bin") != 0)
        {
            fprintf(stderr, "load en_flow model failed\n");
            return -1;
        }

        ncnn::Extractor ex = flow.create_extractor();

        ex.input("in0", z_p);
        ex.input("in1", g);

        if (ex.extract("out0", z) != 0)
        {
            fprintf(stderr, "extract en_flow output failed\n");
            return -1;
        }
    }

    // dec
    ncnn::Mat o;
    {
        ncnn::Net dec;
        dec.opt.use_vulkan_compute = true;
        if (dec.load_param("en_dec.ncnn.param") != 0 || dec.load_model("en_dec.ncnn.bin") != 0)
        {
            fprintf(stderr, "load en_dec model failed\n");
            return -1;
        }

        ncnn::Extractor ex = dec.create_extractor();

        ex.input("in0", z);
        ex.input("in1", g);

        if (ex.extract("out0", o) != 0)
        {
            fprintf(stderr, "extract en_dec output failed\n");
            return -1;
        }
    }

    // normalize and clip
    {
        float volume = 1.f;
        float absmax = 0.f;
        for (int i = 0; i < o.w; i++)
        {
            absmax = std::max(absmax, fabs(o[i]));
        }

        // leave (near) silent output untouched instead of dividing by ~0
        if (absmax > 1e-8)
        {
            for (int i = 0; i < o.w; i++)
            {
                float v = o[i] / absmax * volume;
                v = std::min(std::max(v, -1.f), 1.f);
                o[i] = v;
            }
        }
    }

    // 16bit pcm
    {
        pcm.resize(o.w);
        for (int i = 0; i < o.w; i++)
        {
            pcm[i] = (short)(o[i] * 32767);
        }
    }

    return 0;
}

// Store the low 16 bits of v at dst in little-endian byte order.
static void wav_store_u16le(unsigned char* dst, uint32_t v)
{
    dst[0] = (unsigned char)(v & 0xff);
    dst[1] = (unsigned char)((v >> 8) & 0xff);
}

// Store a 32bit value at dst in little-endian byte order.
static void wav_store_u32le(unsigned char* dst, uint32_t v)
{
    dst[0] = (unsigned char)(v & 0xff);
    dst[1] = (unsigned char)((v >> 8) & 0xff);
    dst[2] = (unsigned char)((v >> 16) & 0xff);
    dst[3] = (unsigned char)((v >> 24) & 0xff);
}

// Write num_samples mono 16bit samples to path as a PCM wav file.
//
// The 44-byte header and the samples are serialized byte by byte in
// little-endian order, so the file is the same on any host byte order.
// A null pcm or a negative num_samples produces a header-only file.
// Failures are reported on stderr, the function itself returns nothing.
static void save_pcm_to_wav(const char* path, const short* pcm, int num_samples, int sample_rate)
{
    if (!pcm || num_samples < 0)
        num_samples = 0;

    FILE* f = fopen(path, "wb");
    if (!f)
    {
        fprintf(stderr, "fopen %s failed\n", path);
        return;
    }

    // unsigned arithmetic, the original int math overflowed for very long clips
    const uint32_t num_channels = 1;
    const uint32_t bits_per_sample = 16;
    const uint32_t block_align = num_channels * bits_per_sample / 8;
    const uint32_t byte_rate = (uint32_t)sample_rate * block_align;
    const uint32_t data_chunk_size = (uint32_t)num_samples * block_align;
    const uint32_t chunk_size = 36 + data_chunk_size;

    bool ok = true;

    // write wav header
    {
        unsigned char header[44];

        // RIFF header
        memcpy(header + 0, "RIFF", 4);
        wav_store_u32le(header + 4, chunk_size);
        memcpy(header + 8, "WAVE", 4);

        // fmt subchunk
        memcpy(header + 12, "fmt ", 4);
        wav_store_u32le(header + 16, 16); // subchunk size
        wav_store_u16le(header + 20, 1);  // PCM
        wav_store_u16le(header + 22, num_channels);
        wav_store_u32le(header + 24, (uint32_t)sample_rate);
        wav_store_u32le(header + 28, byte_rate);
        wav_store_u16le(header + 32, block_align);
        wav_store_u16le(header + 34, bits_per_sample);

        // data subchunk
        memcpy(header + 36, "data", 4);
        wav_store_u32le(header + 40, data_chunk_size);

        ok = fwrite(header, 1, sizeof(header), f) == sizeof(header);
    }

    // write samples in chunks through a small staging buffer
    {
        const int chunk_samples = 2048;
        unsigned char buf[2048 * 2];

        int written = 0;
        while (ok && written < num_samples)
        {
            int n = num_samples - written;
            if (n > chunk_samples)
                n = chunk_samples;

            for (int j = 0; j < n; j++)
            {
                wav_store_u16le(buf + j * 2, (uint32_t)(unsigned short)pcm[written + j]);
            }

            ok = fwrite(buf, 2, (size_t)n, f) == (size_t)n;
            written += n;
        }
    }

    if (fclose(f) != 0)
        ok = false;

    if (!ok)
        fprintf(stderr, "write %s failed\n", path);
}

// Command line entry: synthesize argv[1] with speaker argv[2] and save it to argv[3].
// Returns 0 on success and -1 on wrong usage or when no audio was produced.
int main(int argc, char** argv)
{
    if (argc != 4)
    {
        fprintf(stderr, "Usage: %s [sentences] [speaker id 0~903] [out path]\n", argv[0]);
        fprintf(stderr, "       %s \"Hello World\" 0 out.wav\n", argv[0]);
        fprintf(stderr, "       %s \"Happy New Year\" 123 out.wav\n", argv[0]);
        return -1;
    }

    const char* text = argv[1];
    const int speaker_id = atoi(argv[2]);
    const char* outpath = argv[3];

    // do not write a header-only wav when synthesis failed or produced nothing
    std::vector<short> pcm;
    if (tts_piper(text, speaker_id, pcm) != 0 || pcm.empty())
    {
        fprintf(stderr, "tts_piper failed\n");
        return -1;
    }

    // "sample_rate": 22050
    save_pcm_to_wav(outpath, pcm.data(), (int)pcm.size(), 22050);

    return 0;
}


================================================
FILE: examples/ppocrv5.cpp
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// pip install paddlepaddle==3.0.0
// pip install paddleocr==3.0.0
// paddlex --install paddle2onnx
// paddleocr ocr -i test.png
// paddlex --paddle2onnx --paddle_model_dir ~/.paddlex/official_models/PP-OCRv5_mobile_det --onnx_model_dir PP-OCRv5_mobile_det
// paddlex --paddle2onnx --paddle_model_dir ~/.paddlex/official_models/PP-OCRv5_mobile_rec --onnx_model_dir PP-OCRv5_mobile_rec
// pnnx PP-OCRv5_mobile_det.onnx inputshape=[1,3,320,320] inputshape2=[1,3,256,256]
// pnnx PP-OCRv5_mobile_rec.onnx inputshape=[1,3,48,160] inputshape2=[1,3,48,256]
// pnnx PP-OCRv5_server_det.onnx inputshape=[1,3,320,320] inputshape2=[1,3,256,256] fp16=0
// pnnx PP-OCRv5_server_rec.onnx inputshape=[1,3,48,160] inputshape2=[1,3,48,256] fp16=0

#include "layer.h"
#include "net.h"

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

#include <float.h>
#include <stdio.h>
#include <vector>

#include "ppocrv5_dict.h"

// One character decoded by PPOCRv5::recognize().
struct Character
{
    int id;     // winning output index minus one, used as index into character_dict
    float prob; // score of the winning index at that output row
};

// One detected text region together with its recognized characters.
struct Object
{
    cv::RotatedRect rrect;       // text box, detect() maps it back to the unpadded, unscaled input image
    int orientation;             // 0 = horizontal text, 1 = vertical text
    float prob;                  // contour_score() of the region
    std::vector<Character> text; // characters appended by recognize()
};

// Average value of binary inside the polygon described by contour, divided by 255.
static double contour_score(const cv::Mat& binary, const std::vector<cv::Point>& contour)
{
    // bounding box of the contour, kept inside the image
    cv::Rect box = cv::boundingRect(contour);
    box.x = box.x < 0 ? 0 : box.x;
    box.y = box.y < 0 ? 0 : box.y;
    if (box.x + box.width > binary.cols)
        box.width = binary.cols - box.x;
    if (box.y + box.height > binary.rows)
        box.height = binary.rows - box.y;

    const cv::Mat patch = binary(box);

    // contour translated into box-local coordinates
    const cv::Point origin(box.x, box.y);
    std::vector<cv::Point> local;
    local.reserve(contour.size());
    for (size_t i = 0; i < contour.size(); i++)
    {
        local.push_back(contour[i] - origin);
    }

    // rasterize the polygon as a 0/255 mask
    cv::Mat mask = cv::Mat::zeros(box.height, box.width, CV_8U);
    const std::vector<std::vector<cv::Point> > polygons(1, local);
    cv::fillPoly(mask, polygons, cv::Scalar(255));

    // mean over the masked pixels only
    const cv::Scalar average = cv::mean(patch, mask);
    return average.val[0] / 255.f;
}

// Cut the rotated text box of object out of bgr and straighten it into an
// image that is 48 pixels high, with the width following the box aspect ratio.
//
// Returns an empty cv::Mat when the box has a non-positive (or NaN) side,
// since no meaningful transform exists for it.
static cv::Mat get_rotate_crop_image(const cv::Mat& bgr, const Object& object)
{
    const int orientation = object.orientation;
    const float rw = object.rrect.size.width;
    const float rh = object.rrect.size.height;

    cv::Mat dst;

    // degenerate box, dividing by rw would yield inf/nan and an undefined int conversion
    if (!(rw > 0.f) || !(rh > 0.f))
        return dst;

    const int target_height = 48;
    const float target_width = rh * target_height / rw;

    // the output needs at least one column, a truncated width of 0 is not a usable size
    int out_width = (int)target_width;
    if (out_width < 1)
        out_width = 1;

    // warpperspective shall be used to rotate the image
    // but actually they are all rectangles, so warpaffine is almost enough  :P

    cv::Point2f corners[4];
    object.rrect.points(corners);

    // corner indices that map to output top-left, top-right and bottom-left
    int i_origin;
    int i_right;
    int i_bottom;
    if (orientation == 0)
    {
        // horizontal text
        // corner points order
        //  0--------1
        //  |        |rw  -> as angle=90
        //  3--------2
        //      rh
        i_origin = 0;
        i_right = 1;
        i_bottom = 3;
    }
    else
    {
        // vertical text
        // corner points order
        //  1----2
        //  |    |
        //  |    |
        //  |    |rh  -> as angle=0
        //  |    |
        //  |    |
        //  0----3
        //    rw
        i_origin = 2;
        i_right = 3;
        i_bottom = 1;
    }

    std::vector<cv::Point2f> src_pts(3);
    src_pts[0] = corners[i_origin];
    src_pts[1] = corners[i_right];
    src_pts[2] = corners[i_bottom];

    std::vector<cv::Point2f> dst_pts(3);
    dst_pts[0] = cv::Point2f(0, 0);
    dst_pts[1] = cv::Point2f(target_width, 0);
    dst_pts[2] = cv::Point2f(0, target_height);

    cv::Mat tm = cv::getAffineTransform(src_pts, dst_pts);

    cv::warpAffine(bgr, dst, tm, cv::Size(out_width, target_height), cv::INTER_LINEAR, cv::BORDER_REPLICATE);

    return dst;
}

// Text detection + recognition pipeline built on two ncnn networks.
class PPOCRv5
{
public:
    // Configure both networks and load their param/bin files.
    void init();

    // Find text regions in bgr and append one Object per region to objects.
    void detect(const cv::Mat& bgr, std::vector<Object>& objects);

    // Crop the region described by object from bgr and append the decoded characters to object.text.
    void recognize(const cv::Mat& bgr, Object& object);

protected:
    ncnn::Net ppocrv5_det; // detection network, used by detect()
    ncnn::Net ppocrv5_rec; // recognition network, used by recognize()
};

// Set the runtime options of both networks, then load the mobile det/rec models
// from the current directory.
void PPOCRv5::init()
{
    // prebuilt ncnn models are available from
    //   https://github.com/nihui/ncnn-assets/tree/master/models
    //   https://github.com/nihui/ncnn-android-ppocrv5/tree/master/app/src/main/assets

    // options
    ppocrv5_det.opt.use_vulkan_compute = true;
    ppocrv5_rec.opt.use_vulkan_compute = true;

    // optional bf16 storage
    // ppocrv5_det.opt.use_bf16_storage = true;
    // ppocrv5_rec.opt.use_bf16_storage = true;

    // fp16 must be disabled for server model
    // ppocrv5_det.opt.use_fp16_packed = false;
    // ppocrv5_det.opt.use_fp16_storage = false;
    // ppocrv5_rec.opt.use_fp16_packed = false;
    // ppocrv5_rec.opt.use_fp16_storage = false;

    // detection model
    ppocrv5_det.load_param("PP_OCRv5_mobile_det.ncnn.param");
    ppocrv5_det.load_model("PP_OCRv5_mobile_det.ncnn.bin");
    // server variant
    // ppocrv5_det.load_param("PP_OCRv5_server_det.ncnn.param");
    // ppocrv5_det.load_model("PP_OCRv5_server_det.ncnn.bin");

    // recognition model
    ppocrv5_rec.load_param("PP_OCRv5_mobile_rec.ncnn.param");
    ppocrv5_rec.load_model("PP_OCRv5_mobile_rec.ncnn.bin");
    // server variant
    // ppocrv5_rec.load_param("PP_OCRv5_server_rec.ncnn.param");
    // ppocrv5_rec.load_model("PP_OCRv5_server_rec.ncnn.bin");
}

// Detect text regions in bgr and append them to objects.
//
// The image is shrunk so its longer side is at most 960, padded to a multiple
// of 32, normalized and run through the detection network. The resulting map
// is thresholded, contours are extracted, and every contour that passes the
// score and size filters becomes an Object whose rrect is expressed in the
// coordinates of the original image.
//
// When the network output cannot be extracted, a message is printed and
// objects is left unchanged.
void PPOCRv5::detect(const cv::Mat& bgr, std::vector<Object>& objects)
{
    const int target_size = 960;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    const int target_stride = 32;

    // letterbox pad to multiple of target_stride
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (std::max(w, h) > target_size)
    {
        if (w > h)
        {
            scale = (float)target_size / w;
            w = target_size;
            h = h * scale;
        }
        else
        {
            scale = (float)target_size / h;
            h = target_size;
            w = w * scale;
        }
    }

    // the truncated short side of an extremely elongated image could reach 0
    if (w < 1)
        w = 1;
    if (h < 1)
        h = 1;

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, w, h);

    // pad evenly on both sides with a constant value
    int wpad = (w + target_stride - 1) / target_stride * target_stride - w;
    int hpad = (h + target_stride - 1) / target_stride * target_stride - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    const float mean_vals[3] = {0.485f * 255.f, 0.456f * 255.f, 0.406f * 255.f};
    const float norm_vals[3] = {1 / 0.229f / 255.f, 1 / 0.224f / 255.f, 1 / 0.225f / 255.f};
    in_pad.substract_mean_normalize(mean_vals, norm_vals);

    ncnn::Extractor ex = ppocrv5_det.create_extractor();

    ex.input("in0", in_pad);

    // bail out early, an empty map must not reach the opencv calls below
    ncnn::Mat out;
    if (ex.extract("out0", out) != 0 || out.empty())
    {
        fprintf(stderr, "ppocrv5_det extract out0 failed\n");
        return;
    }

    // rescale by 255 before storing the map as 8bit pixels
    const float denorm_vals[1] = {255.f};
    out.substract_mean_normalize(0, denorm_vals);

    cv::Mat pred(out.h, out.w, CV_8UC1);
    out.to_pixels(pred.data, ncnn::Mat::PIXEL_GRAY);

    // threshold binary
    cv::Mat bitmap;
    const float threshold = 0.3f;
    cv::threshold(pred, bitmap, threshold * 255, 255, cv::THRESH_BINARY);

    // boxes from bitmap
    {
        // should use dbnet post process, but I think unclip process is difficult to write
        // so simply implement expansion. This may lose detection accuracy
        // original implementation can be referenced
        // https://github.com/MhLiao/DB/blob/master/structure/representers/seg_detector_representer.py

        const float box_thresh = 0.6f;
        const float enlarge_ratio = 1.95f;

        const float min_size = 3 * scale;
        const int max_candidates = 1000;

        std::vector<std::vector<cv::Point> > contours;
        std::vector<cv::Vec4i> hierarchy;

        cv::findContours(bitmap, contours, hierarchy, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);

        // keep at most max_candidates contours
        contours.resize(std::min(contours.size(), (size_t)max_candidates));

        for (size_t i = 0; i < contours.size(); i++)
        {
            const std::vector<cv::Point>& contour = contours[i];
            if (contour.size() <= 2)
                continue;

            // mean of the prediction map inside the contour
            double score = contour_score(pred, contour);
            if (score < box_thresh)
                continue;

            cv::RotatedRect rrect = cv::minAreaRect(contour);

            float rrect_maxwh = std::max(rrect.size.width, rrect.size.height);
            if (rrect_maxwh < min_size)
                continue;

            // a box that is much taller than wide (taking its angle into account) is vertical text
            int orientation = 0;
            if (rrect.angle >= -30 && rrect.angle <= 30 && rrect.size.height > rrect.size.width * 2.7)
            {
                // vertical text
                orientation = 1;
            }
            if ((rrect.angle <= -60 || rrect.angle >= 60) && rrect.size.width > rrect.size.height * 2.7)
            {
                // vertical text
                orientation = 1;
            }

            if (rrect.angle < -30)
            {
                // make orientation from -90 ~ -30 to 90 ~ 150
                rrect.angle += 180;
            }
            if (orientation == 0 && rrect.angle < 30)
            {
                // make it horizontal
                rrect.angle += 90;
                std::swap(rrect.size.width, rrect.size.height);
            }
            if (orientation == 1 && rrect.angle >= 60)
            {
                // make it vertical
                rrect.angle -= 90;
                std::swap(rrect.size.width, rrect.size.height);
            }

            // enlarge, both sides grow by the same absolute amount derived from the width
            rrect.size.height += rrect.size.width * (enlarge_ratio - 1);
            rrect.size.width *= enlarge_ratio;

            // adjust offset to original unpadded
            rrect.center.x = (rrect.center.x - (wpad / 2)) / scale;
            rrect.center.y = (rrect.center.y - (hpad / 2)) / scale;
            rrect.size.width = (rrect.size.width) / scale;
            rrect.size.height = (rrect.size.height) / scale;

            Object obj;
            obj.rrect = rrect;
            obj.orientation = orientation;
            obj.prob = score;
            objects.push_back(obj);
        }
    }
}

// Recognize the text inside the box of object.
//
// The box is cropped and straightened, normalized and fed to the recognition
// network. Each output row is reduced to its best scoring index, consecutive
// repeats are merged, index 0 is dropped, and the remaining indices are
// appended to object.text as Character entries.
void PPOCRv5::recognize(const cv::Mat& bgr, Object& object)
{
    const cv::Mat crop = get_rotate_crop_image(bgr, object);

    ncnn::Mat in = ncnn::Mat::from_pixels(crop.data, ncnn::Mat::PIXEL_BGR, crop.cols, crop.rows);

    // ~/.paddlex/official_models/PP-OCRv5_mobile_rec/inference.yml
    const float mean_vals[3] = {127.5, 127.5, 127.5};
    const float norm_vals[3] = {1.0 / 127.5, 1.0 / 127.5, 1.0 / 127.5};
    in.substract_mean_normalize(mean_vals, norm_vals);

    ncnn::Extractor ex = ppocrv5_rec.create_extractor();
    ex.input("in0", in);

    ncnn::Mat out;
    ex.extract("out0", out);

    // 18385 x len
    int previous = 0;
    for (int row = 0; row < out.h; row++)
    {
        const float* scores = out.row(row);

        // argmax over the row, the first maximum wins
        int best = 0;
        float best_score = -9999.f;
        for (int k = 0; k < out.w; k++)
        {
            if (scores[k] > best_score)
            {
                best_score = scores[k];
                best = k;
            }
        }

        // CTC rule, a token equal to the previous one is merged into it
        const bool repeated = (best == previous);
        previous = best;

        // index 0 and below never produces a character
        if (repeated || best <= 0)
            continue;

        Character ch;
        ch.id = best - 1;
        ch.prob = best_score;
        object.text.push_back(ch);
    }
}

// Run the whole pipeline on bgr: detect the text regions, then recognize each of them.
// The results are appended to objects. Always returns 0.
static int detect_ppocrv5(const cv::Mat& bgr, std::vector<Object>& objects)
{
    PPOCRv5 pipeline;
    pipeline.init();

    // stage 1, text boxes
    pipeline.detect(bgr, objects);

    // stage 2, text of every box
    for (std::vector<Object>::iterator it = objects.begin(); it != objects.end(); ++it)
    {
        pipeline.recognize(bgr, *it);
    }

    return 0;
}

// Print every detected object to stderr, draw its box and a text label on a
// copy of bgr, then show the result in a window until a key is pressed.
// Always returns 0.
static int draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    // box color palette, cycled by object index
    static const cv::Scalar colors[] = {
        cv::Scalar(156, 39, 176),
        cv::Scalar(103, 58, 183),
        cv::Scalar(63, 81, 181),
        cv::Scalar(33, 150, 243),
        cv::Scalar(3, 169, 244),
        cv::Scalar(0, 188, 212),
        cv::Scalar(0, 150, 136),
        cv::Scalar(76, 175, 80),
        cv::Scalar(139, 195, 74),
        cv::Scalar(205, 220, 57),
        cv::Scalar(255, 235, 59),
        cv::Scalar(255, 193, 7),
        cv::Scalar(255, 152, 0),
        cv::Scalar(255, 87, 34),
        cv::Scalar(121, 85, 72),
        cv::Scalar(158, 158, 158),
        cv::Scalar(96, 125, 139)
    };
    const size_t num_colors = sizeof(colors) / sizeof(colors[0]);

    cv::Mat image = bgr.clone();

    // first pass, log every object and draw its outline
    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];
        const cv::Scalar& color = colors[i % num_colors];

        fprintf(stderr, "%s %.5f at %.2f %.2f %.2f x %.2f  @ %.2f  =  ", obj.orientation == 0 ? "H" : "V", obj.prob,
                obj.rrect.center.x, obj.rrect.center.y, obj.rrect.size.width, obj.rrect.size.height, obj.rrect.angle);

        // closed polygon through the four corners
        cv::Point2f corners[4];
        obj.rrect.points(corners);
        for (int k = 0; k < 4; k++)
        {
            cv::line(image, corners[k], corners[(k + 1) % 4], color);
        }

        // ids outside the dictionary are left out of the log line
        std::string line;
        for (size_t j = 0; j < obj.text.size(); j++)
        {
            const int id = obj.text[j].id;
            if (id < character_dict_size)
                line += character_dict[id];
        }
        fprintf(stderr, "%s\n", line.c_str());
    }

    fprintf(stderr, "opencv putText can not draw non-latin characters, you may see question marks instead\n");
    fprintf(stderr, "see opencv-mobile for drawing non-latin characters\n");

    // second pass, draw a label on top of every box
    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];
        const bool horizontal = (obj.orientation == 0);

        std::string label;
        for (size_t j = 0; j < obj.text.size(); j++)
        {
            const int id = obj.text[j].id;
            if (id >= character_dict_size)
            {
                // ids outside the dictionary become a single space, never doubled or leading
                if (!label.empty() && label.back() != ' ')
                    label += " ";
                continue;
            }

            label += character_dict[id];

            // vertical text gets one character per line
            if (!horizontal && j + 1 < obj.text.size())
                label += "\n";
        }

        int baseLine = 0;
        const cv::Size label_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // center the label on the box, then keep it inside the image
        int x = obj.rrect.center.x - label_size.width / 2;
        int y = obj.rrect.center.y - label_size.height / 2 - baseLine;
        if (y < 0)
            y = 0;
        if (y + label_size.height > image.rows)
            y = image.rows - label_size.height;
        if (x < 0)
            x = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        // white background behind the text
        const cv::Rect background(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine));
        cv::rectangle(image, background, cv::Scalar(255, 255, 255), -1);

        // the vertical offset uses the label height for horizontal text and the label width otherwise
        const int text_dy = horizontal ? label_size.height : label_size.width;
        cv::putText(image, label, cv::Point(x, y + text_dy), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);

    return 0;
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_ppocrv5(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/ppocrv5_dict.h
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static const char* character_dict[] = {
    "　",
    "一",
    "乙",
    "二",
    "十",
    "丁",
    "厂",
    "七",
    "卜",
    "八",
    "人",
    "入",
    "儿",
    "匕",
    "几",
    "九",
    "刁",
    "了",
    "刀",
    "力",
    "乃",
    "又",
    "三",
    "干",
    "于",
    "亏",
    "工",
    "土",
    "士",
    "才",
    "下",
    "寸",
    "大",
    "丈",
    "与",
    "万",
    "上",
    "小",
    "口",
    "山",
    "巾",
    "千",
    "乞",
    "川",
    "亿",
    "个",
    "夕",
    "久",
    "么",
    "勺",
    "凡",
    "丸",
    "及",
    "广",
    "亡",
    "门",
    "丫",
    "义",
    "之",
    "尸",
    "己",
    "已",
    "巳",
    "弓",
    "子",
    "卫",
    "也",
    "女",
    "刃",
    "飞",
    "习",
    "叉",
    "马",
    "乡",
    "丰",
    "王",
    "开",
    "井",
    "天",
    "夫",
    "元",
    "无",
    "云",
    "专",
    "丐",
    "扎",
    "艺",
    "木",
    "五",
    "支",
    "厅",
    "不",
    "犬",
    "太",
    "区",
    "历",
    "歹",
    "友",
    "尤",
    "匹",
    "车",
    "巨",
    "牙",
    "屯",
    "戈",
    "比",
    "互",
    "切",
    "瓦",
    "止",
    "少",
    "曰",
    "日",
    "中",
    "贝",
    "冈",
    "内",
    "水",
    "见",
    "午",
    "牛",
    "手",
    "气",
    "毛",
    "壬",
    "升",
    "夭",
    "长",
    "仁",
    "什",
    "片",
    "仆",
    "化",
    "仇",
    "币",
    "仍",
    "仅",
    "斤",
    "爪",
    "反",
    "介",
    "父",
    "从",
    "仑",
    "今",
    "凶",
    "分",
    "乏",
    "公",
    "仓",
    "月",
    "氏",
    "勿",
    "欠",
    "风",
    "丹",
    "匀",
    "乌",
    "勾",
    "凤",
    "六",
    "文",
    "亢",
    "方",
    "火",
    "为",
    "斗",
    "忆",
    "计",
    "订",
    "户",
    "认",
    "冗",
    "讥",
    "心",
    "尺",
    "引",
    "丑",
    "巴",
    "孔",
    "队",
    "办",
    "以",
    "允",
    "予",
    "邓",
    "劝",
    "双",
    "书",
    "幻",
    "玉",
    "刊",
    "未",
    "末",
    "示",
    "击",
    "打",
    "巧",
    "正",
    "扑",
    "卉",
    "扒",
    "功",
    "扔",
    "去",
    "甘",
    "世",
    "艾",
    "古",
    "节",
    "本",
    "术",
    "可",
    "丙",
    "左",
    "厉",
    "石",
    "右",
    "布",
    "夯",
    "戊",
    "龙",
    "平",
    "灭",
    "轧",
    "东",
    "卡",
    "北",
    "占",
    "凸",
    "卢",
    "业",
    "旧",
    "帅",
    "归",
    "旦",
    "目",
    "且",
    "叶",
    "甲",
    "申",
    "叮",
    "电",
    "号",
    "田",
    "由",
    "只",
    "叭",
    "史",
    "央",
    "兄",
    "叽",
    "叼",
    "叫",
    "叩",
    "叨",
    "另",
    "叹",
    "冉",
    "皿",
    "凹",
    "囚",
    "四",
    "生",
    "矢",
    "失",
    "乍",
    "禾",
    "丘",
    "付",
    "仗",
    "代",
    "仙",
    "们",
    "仪",
    "白",
    "仔",
    "他",
    "斥",
    "瓜",
    "乎",
    "丛",
    "令",
    "用",
    "甩",
    "印",
    "尔",
    "乐",
    "句",
    "匆",
    "册",
    "卯",
    "犯",
    "外",
    "处",
    "冬",
    "鸟",
    "务",
    "包",
    "饥",
    "主",
    "市",
    "立",
    "冯",
    "玄",
    "闪",
    "兰",
    "半",
    "汁",
    "汇",
    "头",
    "汉",
    "宁",
    "穴",
    "它",
    "讨",
    "写",
    "让",
    "礼",
    "训",
    "议",
    "必",
    "讯",
    "记",
    "永",
    "司",
    "尼",
    "民",
    "弗",
    "弘",
    "出",
    "辽",
    "奶",
    "奴",
    "召",
    "加",
    "皮",
    "边",
    "孕",
    "发",
    "圣",
    "对",
    "台",
    "矛",
    "纠",
    "母",
    "幼",
    "丝",
    "邦",
    "式",
    "迂",
    "刑",
    "戎",
    "动",
    "扛",
    "寺",
    "吉",
    "扣",
    "考",
    "托",
    "老",
    "巩",
    "圾",
    "执",
    "扩",
    "扫",
    "地",
    "场",
    "扬",
    "耳",
    "芋",
    "共",
    "芒",
    "亚",
    "芝",
    "朽",
    "朴",
    "机",
    "权",
    "过",
    "臣",
    "吏",
    "再",
    "协",
    "西",
    "压",
    "厌",
    "戌",
    "在",
    "百",
    "有",
    "存",
    "而",
    "页",
    "匠",
    "夸",
    "夺",
    "灰",
    "达",
    "列",
    "死",
    "成",
    "夹",
    "夷",
    "轨",
    "邪",
    "尧",
    "划",
    "迈",
    "毕",
    "至",
    "此",
    "贞",
    "师",
    "尘",
    "尖",
    "劣",
    "光",
    "当",
    "早",
    "吁",
    "吐",
    "吓",
    "虫",
    "曲",
    "团",
    "吕",
    "同",
    "吊",
    "吃",
    "因",
    "吸",
    "吗",
    "吆",
    "屿",
    "屹",
    "岁",
    "帆",
    "回",
    "岂",
    "则",
    "刚",
    "网",
    "肉",
    "年",
    "朱",
    "先",
    "丢",
    "廷",
    "舌",
    "竹",
    "迁",
    "乔",
    "迄",
    "伟",
    "传",
    "乒",
    "乓",
    "休",
    "伍",
    "伏",
    "优",
    "臼",
    "伐",
    "延",
    "仲",
    "件",
    "任",
    "伤",
    "价",
    "伦",
    "份",
    "华",
    "仰",
    "仿",
    "伙",
    "伪",
    "自",
    "伊",
    "血",
    "向",
    "似",
    "后",
    "行",
    "舟",
    "全",
    "会",
    "杀",
    "合",
    "兆",
    "企",
    "众",
    "爷",
    "伞",
    "创",
    "肌",
    "肋",
    "朵",
    "杂",
    "危",
    "旬",
    "旨",
    "旭",
    "负",
    "匈",
    "名",
    "各",
    "多",
    "争",
    "色",
    "壮",
    "冲",
    "妆",
    "冰",
    "庄",
    "庆",
    "亦",
    "刘",
    "齐",
    "交",
    "衣",
    "次",
    "产",
    "决",
    "亥",
    "充",
    "妄",
    "闭",
    "问",
    "闯",
    "羊",
    "并",
    "关",
    "米",
    "灯",
    "州",
    "汗",
    "污",
    "江",
    "汛",
    "池",
    "汝",
    "汤",
    "忙",
    "兴",
    "宇",
    "守",
    "宅",
    "字",
    "安",
    "讲",
    "讳",
    "军",
    "讶",
    "许",
    "讹",
    "论",
    "讼",
    "农",
    "讽",
    "设",
    "访",
    "诀",
    "寻",
    "那",
    "迅",
    "尽",
    "导",
    "异",
    "弛",
    "孙",
    "阵",
    "阳",
    "收",
    "阶",
    "阴",
    "防",
    "奸",
    "如",
    "妇",
    "妃",
    "好",
    "她",
    "妈",
    "戏",
    "羽",
    "观",
    "欢",
    "买",
    "红",
    "驮",
    "纤",
    "驯",
    "约",
    "级",
    "纪",
    "驰",
    "纫",
    "巡",
    "寿",
    "弄",
    "麦",
    "玖",
    "玛",
    "形",
    "进",
    "戒",
    "吞",
    "远",
    "违",
    "韧",
    "运",
    "扶",
    "抚",
    "坛",
    "技",
    "坏",
    "抠",
    "扰",
    "扼",
    "拒",
    "找",
    "批",
    "址",
    "扯",
    "走",
    "抄",
    "贡",
    "汞",
    "坝",
    "攻",
    "赤",
    "折",
    "抓",
    "扳",
    "抡",
    "扮",
    "抢",
    "孝",
    "坎",
    "均",
    "抑",
    "抛",
    "投",
    "坟",
    "坑",
    "抗",
    "坊",
    "抖",
    "护",
    "壳",
    "志",
    "块",
    "扭",
    "声",
    "把",
    "报",
    "拟",
    "却",
    "抒",
    "劫",
    "芙",
    "芜",
    "苇",
    "芽",
    "花",
    "芹",
    "芥",
    "芬",
    "苍",
    "芳",
    "严",
    "芦",
    "芯",
    "劳",
    "克",
    "芭",
    "苏",
    "杆",
    "杠",
    "杜",
    "材",
    "村",
    "杖",
    "杏",
    "杉",
    "巫",
    "极",
    "李",
    "杨",
    "求",
    "甫",
    "匣",
    "更",
    "束",
    "吾",
    "豆",
    "两",
    "酉",
    "丽",
    "医",
    "辰",
    "励",
    "否",
    "还",
    "尬",
    "歼",
    "来",
    "连",
    "轩",
    "步",
    "卤",
    "坚",
    "肖",
    "旱",
    "盯",
    "呈",
    "时",
    "吴",
    "助",
    "县",
    "里",
    "呆",
    "吱",
    "吠",
    "呕",
    "园",
    "旷",
    "围",
    "呀",
    "吨",
    "足",
    "邮",
    "男",
    "困",
    "吵",
    "串",
    "员",
    "呐",
    "听",
    "吟",
    "吩",
    "呛",
    "吻",
    "吹",
    "呜",
    "吭",
    "吧",
    "邑",
    "吼",
    "囤",
    "别",
    "吮",
    "岖",
    "岗",
    "帐",
    "财",
    "针",
    "钉",
    "牡",
    "告",
    "我",
    "乱",
    "利",
    "秃",
    "秀",
    "私",
    "每",
    "兵",
    "估",
    "体",
    "何",
    "佐",
    "佑",
    "但",
    "伸",
    "佃",
    "作",
    "伯",
    "伶",
    "佣",
    "低",
    "你",
    "住",
    "位",
    "伴",
    "身",
    "皂",
    "伺",
    "佛",
    "囱",
    "近",
    "彻",
    "役",
    "返",
    "余",
    "希",
    "坐",
    "谷",
    "妥",
    "含",
    "邻",
    "岔",
    "肝",
    "肛",
    "肚",
    "肘",
    "肠",
    "龟",
    "甸",
    "免",
    "狂",
    "犹",
    "狈",
    "角",
    "删",
    "条",
    "彤",
    "卵",
    "灸",
    "岛",
    "刨",
    "迎",
    "饭",
    "饮",
    "系",
    "言",
    "冻",
    "状",
    "亩",
    "况",
    "床",
    "库",
    "庇",
    "疗",
    "吝",
    "应",
    "这",
    "冷",
    "庐",
    "序",
    "辛",
    "弃",
    "冶",
    "忘",
    "闰",
    "闲",
    "间",
    "闷",
    "判",
    "兑",
    "灶",
    "灿",
    "灼",
    "弟",
    "汪",
    "沐",
    "沛",
    "汰",
    "沥",
    "沙",
    "汽",
    "沃",
    "沦",
    "汹",
    "泛",
    "沧",
    "没",
    "沟",
    "沪",
    "沈",
    "沉",
    "沁",
    "怀",
    "忧",
    "忱",
    "快",
    "完",
    "宋",
    "宏",
    "牢",
    "究",
    "穷",
    "灾",
    "良",
    "证",
    "启",
    "评",
    "补",
    "初",
    "社",
    "祀",
    "识",
    "诈",
    "诉",
    "罕",
    "诊",
    "词",
    "译",
    "君",
    "灵",
    "即",
    "层",
    "屁",
    "尿",
    "尾",
    "迟",
    "局",
    "改",
    "张",
    "忌",
    "际",
    "陆",
    "阿",
    "陈",
    "阻",
    "附",
    "坠",
    "妓",
    "妙",
    "妖",
    "姊",
    "妨",
    "妒",
    "努",
    "忍",
    "劲",
    "矣",
    "鸡",
    "纬",
    "驱",
    "纯",
    "纱",
    "纲",
    "纳",
    "驳",
    "纵",
    "纷",
    "纸",
    "纹",
    "纺",
    "驴",
    "纽",
    "奉",
    "玩",
    "环",
    "武",
    "青",
    "责",
    "现",
    "玫",
    "表",
    "规",
    "抹",
    "卦",
    "坷",
    "坯",
    "拓",
    "拢",
    "拔",
    "坪",
    "拣",
    "坦",
    "担",
    "坤",
    "押",
    "抽",
    "拐",
    "拖",
    "者",
    "拍",
    "顶",
    "拆",
    "拎",
    "拥",
    "抵",
    "拘",
    "势",
    "抱",
    "拄",
    "垃",
    "拉",
    "拦",
    "幸",
    "拌",
    "拧",
    "拂",
    "拙",
    "招",
    "坡",
    "披",
    "拨",
    "择",
    "抬",
    "拇",
    "拗",
    "其",
    "取",
    "茉",
    "苦",
    "昔",
    "苛",
    "若",
    "茂",
    "苹",
    "苗",
    "英",
    "苟",
    "苑",
    "苞",
    "范",
    "直",
    "茁",
    "茄",
    "茎",
    "苔",
    "茅",
    "枉",
    "林",
    "枝",
    "杯",
    "枢",
    "柜",
    "枚",
    "析",
    "板",
    "松",
    "枪",
    "枫",
    "构",
    "杭",
    "杰",
    "述",
    "枕",
    "丧",
    "或",
    "画",
    "卧",
    "事",
    "刺",
    "枣",
    "雨",
    "卖",
    "郁",
    "矾",
    "矿",
    "码",
    "厕",
    "奈",
    "奔",
    "奇",
    "奋",
    "态",
    "欧",
    "殴",
    "垄",
    "妻",
    "轰",
    "顷",
    "转",
    "斩",
    "轮",
    "软",
    "到",
    "非",
    "叔",
    "歧",
    "肯",
    "齿",
    "些",
    "卓",
    "虎",
    "虏",
    "肾",
    "贤",
    "尚",
    "旺",
    "具",
    "味",
    "果",
    "昆",
    "国",
    "哎",
    "咕",
    "昌",
    "呵",
    "畅",
    "明",
    "易",
    "咙",
    "昂",
    "迪",
    "典",
    "固",
    "忠",
    "呻",
    "咒",
    "咋",
    "咐",
    "呼",
    "鸣",
    "咏",
    "呢",
    "咄",
    "咖",
    "岸",
    "岩",
    "帖",
    "罗",
    "帜",
    "帕",
    "岭",
    "凯",
    "败",
    "账",
    "贩",
    "贬",
    "购",
    "贮",
    "图",
    "钓",
    "制",
    "知",
    "迭",
    "氛",
    "垂",
    "牧",
    "物",
    "乖",
    "刮",
    "秆",
    "和",
    "季",
    "委",
    "秉",
    "佳",
    "侍",
    "岳",
    "供",
    "使",
    "例",
    "侠",
    "侥",
    "版",
    "侄",
    "侦",
    "侣",
    "侧",
    "凭",
    "侨",
    "佩",
    "货",
    "侈",
    "依",
    "卑",
    "的",
    "迫",
    "质",
    "欣",
    "征",
    "往",
    "爬",
    "彼",
    "径",
    "所",
    "舍",
    "金",
    "刹",
    "命",
    "肴",
    "斧",
    "爸",
    "采",
    "觅",
    "受",
    "乳",
    "贪",
    "念",
    "贫",
    "忿",
    "肤",
    "肺",
    "肢",
    "肿",
    "胀",
    "朋",
    "股",
    "肮",
    "肪",
    "肥",
    "服",
    "胁",
    "周",
    "昏",
    "鱼",
    "兔",
    "狐",
    "忽",
    "狗",
    "狞",
    "备",
    "饰",
    "饱",
    "饲",
    "变",
    "京",
    "享",
    "庞",
    "店",
    "夜",
    "庙",
    "府",
    "底",
    "疟",
    "疙",
    "疚",
    "剂",
    "卒",
    "郊",
    "庚",
    "废",
    "净",
    "盲",
    "放",
    "刻",
    "育",
    "氓",
    "闸",
    "闹",
    "郑",
    "券",
    "卷",
    "单",
    "炬",
    "炒",
    "炊",
    "炕",
    "炎",
    "炉",
    "沫",
    "浅",
    "法",
    "泄",
    "沽",
    "河",
    "沾",
    "泪",
    "沮",
    "油",
    "泊",
    "沿",
    "泡",
    "注",
    "泣",
    "泞",
    "泻",
    "泌",
    "泳",
    "泥",
    "沸",
    "沼",
    "波",
    "泼",
    "泽",
    "治",
    "怔",
    "怯",
    "怖",
    "性",
    "怕",
    "怜",
    "怪",
    "怡",
    "学",
    "宝",
    "宗",
    "定",
    "宠",
    "宜",
    "审",
    "宙",
    "官",
    "空",
    "帘",
    "宛",
    "实",
    "试",
    "郎",
    "诗",
    "肩",
    "房",
    "诚",
    "衬",
    "衫",
    "视",
    "祈",
    "话",
    "诞",
    "诡",
    "询",
    "该",
    "详",
    "建",
    "肃",
    "录",
    "隶",
    "帚",
    "屉",
    "居",
    "届",
    "刷",
    "屈",
    "弧",
    "弥",
    "弦",
    "承",
    "孟",
    "陋",
    "陌",
    "孤",
    "陕",
    "降",
    "函",
    "限",
    "妹",
    "姑",
    "姐",
    "姓",
    "妮",
    "始",
    "姆",
    "迢",
    "驾",
    "叁",
    "参",
    "艰",
    "线",
    "练",
    "组",
    "绅",
    "细",
    "驶",
    "织",
    "驹",
    "终",
    "驻",
    "绊",
    "驼",
    "绍",
    "绎",
    "经",
    "贯",
    "契",
    "贰",
    "奏",
    "春",
    "帮",
    "玷",
    "珍",
    "玲",
    "玻",
    "毒",
    "型",
    "拭",
    "挂",
    "封",
    "持",
    "拷",
    "拱",
    "项",
    "垮",
    "挎",
    "城",
    "挟",
    "挠",
    "政",
    "赴",
    "赵",
    "挡",
    "拽",
    "哉",
    "挺",
    "括",
    "垢",
    "拴",
    "拾",
    "挑",
    "垛",
    "指",
    "垫",
    "挣",
    "挤",
    "拼",
    "挖",
    "按",
    "挥",
    "挪",
    "拯",
    "某",
    "甚",
    "荆",
    "茸",
    "革",
    "茬",
    "荐",
    "巷",
    "带",
    "草",
    "茧",
    "茵",
    "茶",
    "荒",
    "茫",
    "荡",
    "荣",
    "荤",
    "荧",
    "故",
    "胡",
    "荫",
    "荔",
    "南",
    "药",
    "标",
    "栈",
    "柑",
    "枯",
    "柄",
    "栋",
    "相",
    "查",
    "柏",
    "栅",
    "柳",
    "柱",
    "柿",
    "栏",
    "柠",
    "树",
    "勃",
    "要",
    "柬",
    "咸",
    "威",
    "歪",
    "研",
    "砖",
    "厘",
    "厚",
    "砌",
    "砂",
    "泵",
    "砚",
    "砍",
    "面",
    "耐",
    "耍",
    "牵",
    "鸥",
    "残",
    "殃",
    "轴",
    "轻",
    "鸦",
    "皆",
    "韭",
    "背",
    "战",
    "点",
    "虐",
    "临",
    "览",
    "竖",
    "省",
    "削",
    "尝",
    "昧",
    "盹",
    "是",
    "盼",
    "眨",
    "哇",
    "哄",
    "哑",
    "显",
    "冒",
    "映",
    "星",
    "昨",
    "咧",
    "昭",
    "畏",
    "趴",
    "胃",
    "贵",
    "界",
    "虹",
    "虾",
    "蚁",
    "思",
    "蚂",
    "虽",
    "品",
    "咽",
    "骂",
    "勋",
    "哗",
    "咱",
    "响",
    "哈",
    "哆",
    "咬",
    "咳",
    "咪",
    "哪",
    "哟",
    "炭",
    "峡",
    "罚",
    "贱",
    "贴",
    "贻",
    "骨",
    "幽",
    "钙",
    "钝",
    "钞",
    "钟",
    "钢",
    "钠",
    "钥",
    "钦",
    "钧",
    "钩",
    "钮",
    "卸",
    "缸",
    "拜",
    "看",
    "矩",
    "毡",
    "氢",
    "怎",
    "牲",
    "选",
    "适",
    "秒",
    "香",
    "种",
    "秋",
    "科",
    "重",
    "复",
    "竿",
    "段",
    "便",
    "俩",
    "贷",
    "顺",
    "修",
    "俏",
    "保",
    "促",
    "俄",
    "俐",
    "侮",
    "俭",
    "俗",
    "俘",
    "信",
    "皇",
    "泉",
    "鬼",
    "侵",
    "禹",
    "侯",
    "追",
    "俊",
    "盾",
    "待",
    "徊",
    "衍",
    "律",
    "很",
    "须",
    "叙",
    "剑",
    "逃",
    "食",
    "盆",
    "胚",
    "胧",
    "胆",
    "胜",
    "胞",
    "胖",
    "脉",
    "胎",
    "勉",
    "狭",
    "狮",
    "独",
    "狰",
    "狡",
    "狱",
    "狠",
    "贸",
    "怨",
    "急",
    "饵",
    "饶",
    "蚀",
    "饺",
    "饼",
    "峦",
    "弯",
    "将",
    "奖",
    "哀",
    "亭",
    "亮",
    "度",
    "迹",
    "庭",
    "疮",
    "疯",
    "疫",
    "疤",
    "咨",
    "姿",
    "亲",
    "音",
    "帝",
    "施",
    "闺",
    "闻",
    "闽",
    "阀",
    "阁",
    "差",
    "养",
    "美",
    "姜",
    "叛",
    "送",
    "类",
    "迷",
    "籽",
    "娄",
    "前",
    "首",
    "逆",
    "兹",
    "总",
    "炼",
    "炸",
    "烁",
    "炮",
    "炫",
    "烂",
    "剃",
    "洼",
    "洁",
    "洪",
    "洒",
    "柒",
    "浇",
    "浊",
    "洞",
    "测",
    "洗",
    "活",
    "派",
    "洽",
    "染",
    "洛",
    "浏",
    "济",
    "洋",
    "洲",
    "浑",
    "浓",
    "津",
    "恃",
    "恒",
    "恢",
    "恍",
    "恬",
    "恤",
    "恰",
    "恼",
    "恨",
    "举",
    "觉",
    "宣",
    "宦",
    "室",
    "宫",
    "宪",
    "突",
    "穿",
    "窃",
    "客",
    "诫",
    "冠",
    "诬",
    "语",
    "扁",
    "袄",
    "祖",
    "神",
    "祝",
    "祠",
    "误",
    "诱",
    "诲",
    "说",
    "诵",
    "垦",
    "退",
    "既",
    "屋",
    "昼",
    "屏",
    "屎",
    "费",
    "陡",
    "逊",
    "眉",
    "孩",
    "陨",
    "除",
    "险",
    "院",
    "娃",
    "姥",
    "姨",
    "姻",
    "娇",
    "姚",
    "娜",
    "怒",
    "架",
    "贺",
    "盈",
    "勇",
    "怠",
    "癸",
    "蚤",
    "柔",
    "垒",
    "绑",
    "绒",
    "结",
    "绕",
    "骄",
    "绘",
    "给",
    "绚",
    "骆",
    "络",
    "绝",
    "绞",
    "骇",
    "统",
    "耕",
    "耘",
    "耗",
    "耙",
    "艳",
    "泰",
    "秦",
    "珠",
    "班",
    "素",
    "匿",
    "蚕",
    "顽",
    "盏",
    "匪",
    "捞",
    "栽",
    "捕",
    "埂",
    "捂",
    "振",
    "载",
    "赶",
    "起",
    "盐",
    "捎",
    "捍",
    "捏",
    "埋",
    "捉",
    "捆",
    "捐",
    "损",
    "袁",
    "捌",
    "都",
    "哲",
    "逝",
    "捡",
    "挫",
    "换",
    "挽",
    "挚",
    "热",
    "恐",
    "捣",
    "壶",
    "捅",
    "埃",
    "挨",
    "耻",
    "耿",
    "耽",
    "聂",
    "恭",
    "莽",
    "莱",
    "莲",
    "莫",
    "莉",
    "荷",
    "获",
    "晋",
    "恶",
    "莹",
    "莺",
    "真",
    "框",
    "梆",
    "桂",
    "桔",
    "栖",
    "档",
    "桐",
    "株",
    "桥",
    "桦",
    "栓",
    "桃",
    "格",
    "桩",
    "校",
    "核",
    "样",
    "根",
    "索",
    "哥",
    "速",
    "逗",
    "栗",
    "贾",
    "酌",
    "配",
    "翅",
    "辱",
    "唇",
    "夏",
    "砸",
    "砰",
    "砾",
    "础",
    "破",
    "原",
    "套",
    "逐",
    "烈",
    "殊",
    "殉",
    "顾",
    "轿",
    "较",
    "顿",
    "毙",
    "致",
    "柴",
    "桌",
    "虑",
    "监",
    "紧",
    "党",
    "逞",
    "晒",
    "眠",
    "晓",
    "哮",
    "唠",
    "鸭",
    "晃",
    "哺",
    "晌",
    "剔",
    "晕",
    "蚌",
    "畔",
    "蚣",
    "蚊",
    "蚪",
    "蚓",
    "哨",
    "哩",
    "圃",
    "哭",
    "哦",
    "恩",
    "鸯",
    "唤",
    "唁",
    "哼",
    "唧",
    "啊",
    "唉",
    "唆",
    "罢",
    "峭",
    "峨",
    "峰",
    "圆",
    "峻",
    "贼",
    "贿",
    "赂",
    "赃",
    "钱",
    "钳",
    "钻",
    "钾",
    "铁",
    "铃",
    "铅",
    "缺",
    "氧",
    "氨",
    "特",
    "牺",
    "造",
    "乘",
    "敌",
    "秤",
    "租",
    "积",
    "秧",
    "秩",
    "称",
    "秘",
    "透",
    "笔",
    "笑",
    "笋",
    "债",
    "借",
    "值",
    "倚",
    "俺",
    "倾",
    "倒",
    "倘",
    "俱",
    "倡",
    "候",
    "赁",
    "俯",
    "倍",
    "倦",
    "健",
    "臭",
    "射",
    "躬",
    "息",
    "倔",
    "徒",
    "徐",
    "殷",
    "舰",
    "舱",
    "般",
    "航",
    "途",
    "拿",
    "耸",
    "爹",
    "舀",
    "爱",
    "豺",
    "豹",
    "颁",
    "颂",
    "翁",
    "胰",
    "脆",
    "脂",
    "胸",
    "胳",
    "脏",
    "脐",
    "胶",
    "脑",
    "脓",
    "逛",
    "狸",
    "狼",
    "卿",
    "逢",
    "鸵",
    "留",
    "鸳",
    "皱",
    "饿",
    "馁",
    "凌",
    "凄",
    "恋",
    "桨",
    "浆",
    "衰",
    "衷",
    "高",
    "郭",
    "席",
    "准",
    "座",
    "症",
    "病",
    "疾",
    "斋",
    "疹",
    "疼",
    "疲",
    "脊",
    "效",
    "离",
    "紊",
    "唐",
    "瓷",
    "资",
    "凉",
    "站",
    "剖",
    "竞",
    "部",
    "旁",
    "旅",
    "畜",
    "阅",
    "羞",
    "羔",
    "瓶",
    "拳",
    "粉",
    "料",
    "益",
    "兼",
    "烤",
    "烘",
    "烦",
    "烧",
    "烛",
    "烟",
    "烙",
    "递",
    "涛",
    "浙",
    "涝",
    "浦",
    "酒",
    "涉",
    "消",
    "涡",
    "浩",
    "海",
    "涂",
    "浴",
    "浮",
    "涣",
    "涤",
    "流",
    "润",
    "涧",
    "涕",
    "浪",
    "浸",
    "涨",
    "烫",
    "涩",
    "涌",
    "悖",
    "悟",
    "悄",
    "悍",
    "悔",
    "悯",
    "悦",
    "害",
    "宽",
    "家",
    "宵",
    "宴",
    "宾",
    "窍",
    "窄",
    "容",
    "宰",
    "案",
    "请",
    "朗",
    "诸",
    "诺",
    "读",
    "扇",
    "诽",
    "袜",
    "袖",
    "袍",
    "被",
    "祥",
    "课",
    "冥",
    "谁",
    "调",
    "冤",
    "谅",
    "谆",
    "谈",
    "谊",
    "剥",
    "恳",
    "展",
    "剧",
    "屑",
    "弱",
    "陵",
    "祟",
    "陶",
    "陷",
    "陪",
    "娱",
    "娟",
    "恕",
    "娥",
    "娘",
    "通",
    "能",
    "难",
    "预",
    "桑",
    "绢",
    "绣",
    "验",
    "继",
    "骏",
    "球",
    "琐",
    "理",
    "琉",
    "琅",
    "捧",
    "堵",
    "措",
    "描",
    "域",
    "捺",
    "掩",
    "捷",
    "排",
    "焉",
    "掉",
    "捶",
    "赦",
    "堆",
    "推",
    "埠",
    "掀",
    "授",
    "捻",
    "教",
    "掏",
    "掐",
    "掠",
    "掂",
    "培",
    "接",
    "掷",
    "控",
    "探",
    "据",
    "掘",
    "掺",
    "职",
    "基",
    "聆",
    "勘",
    "聊",
    "娶",
    "著",
    "菱",
    "勒",
    "黄",
    "菲",
    "萌",
    "萝",
    "菌",
    "萎",
    "菜",
    "萄",
    "菊",
    "菩",
    "萍",
    "菠",
    "萤",
    "营",
    "乾",
    "萧",
    "萨",
    "菇",
    "械",
    "彬",
    "梦",
    "婪",
    "梗",
    "梧",
    "梢",
    "梅",
    "检",
    "梳",
    "梯",
    "桶",
    "梭",
    "救",
    "曹",
    "副",
    "票",
    "酝",
    "酗",
    "厢",
    "戚",
    "硅",
    "硕",
    "奢",
    "盔",
    "爽",
    "聋",
    "袭",
    "盛",
    "匾",
    "雪",
    "辅",
    "辆",
    "颅",
    "虚",
    "彪",
    "雀",
    "堂",
    "常",
    "眶",
    "匙",
    "晨",
    "睁",
    "眯",
    "眼",
    "悬",
    "野",
    "啪",
    "啦",
    "曼",
    "晦",
    "晚",
    "啄",
    "啡",
    "距",
    "趾",
    "啃",
    "跃",
    "略",
    "蚯",
    "蛀",
    "蛇",
    "唬",
    "累",
    "鄂",
    "唱",
    "患",
    "啰",
    "唾",
    "唯",
    "啤",
    "啥",
    "啸",
    "崖",
    "崎",
    "崭",
    "逻",
    "崔",
    "帷",
    "崩",
    "崇",
    "崛",
    "婴",
    "圈",
    "铐",
    "铛",
    "铝",
    "铜",
    "铭",
    "铲",
    "银",
    "矫",
    "甜",
    "秸",
    "梨",
    "犁",
    "秽",
    "移",
    "笨",
    "笼",
    "笛",
    "笙",
    "符",
    "第",
    "敏",
    "做",
    "袋",
    "悠",
    "偿",
    "偶",
    "偎",
    "偷",
    "您",
    "售",
    "停",
    "偏",
    "躯",
    "兜",
    "假",
    "衅",
    "徘",
    "徙",
    "得",
    "衔",
    "盘",
    "舶",
    "船",
    "舵",
    "斜",
    "盒",
    "鸽",
    "敛",
    "悉",
    "欲",
    "彩",
    "领",
    "脚",
    "脖",
    "脯",
    "豚",
    "脸",
    "脱",
    "象",
    "够",
    "逸",
    "猜",
    "猪",
    "猎",
    "猫",
    "凰",
    "猖",
    "猛",
    "祭",
    "馅",
    "馆",
    "凑",
    "减",
    "毫",
    "烹",
    "庶",
    "麻",
    "庵",
    "痊",
    "痒",
    "痕",
    "廊",
    "康",
    "庸",
    "鹿",
    "盗",
    "章",
    "竟",
    "商",
    "族",
    "旋",
    "望",
    "率",
    "阎",
    "阐",
    "着",
    "羚",
    "盖",
    "眷",
    "粘",
    "粗",
    "粒",
    "断",
    "剪",
    "兽",
    "焊",
    "焕",
    "清",
    "添",
    "鸿",
    "淋",
    "涯",
    "淹",
    "渠",
    "渐",
    "淑",
    "淌",
    "混",
    "淮",
    "淆",
    "渊",
    "淫",
    "渔",
    "淘",
    "淳",
    "液",
    "淤",
    "淡",
    "淀",
    "深",
    "涮",
    "涵",
    "婆",
    "梁",
    "渗",
    "情",
    "惜",
    "惭",
    "悼",
    "惧",
    "惕",
    "惟",
    "惊",
    "惦",
    "悴",
    "惋",
    "惨",
    "惯",
    "寇",
    "寅",
    "寄",
    "寂",
    "宿",
    "窒",
    "窑",
    "密",
    "谋",
    "谍",
    "谎",
    "谐",
    "袱",
    "祷",
    "祸",
    "谓",
    "谚",
    "谜",
    "逮",
    "敢",
    "尉",
    "屠",
    "弹",
    "隋",
    "堕",
    "随",
    "蛋",
    "隅",
    "隆",
    "隐",
    "婚",
    "婶",
    "婉",
    "颇",
    "颈",
    "绩",
    "绪",
    "续",
    "骑",
    "绰",
    "绳",
    "维",
    "绵",
    "绷",
    "绸",
    "综",
    "绽",
    "绿",
    "缀",
    "巢",
    "琴",
    "琳",
    "琢",
    "琼",
    "斑",
    "替",
    "揍",
    "款",
    "堪",
    "塔",
    "搭",
    "堰",
    "揩",
    "越",
    "趁",
    "趋",
    "超",
    "揽",
    "堤",
    "提",
    "博",
    "揭",
    "喜",
    "彭",
    "揣",
    "插",
    "揪",
    "搜",
    "煮",
    "援",
    "搀",
    "裁",
    "搁",
    "搓",
    "搂",
    "搅",
    "壹",
    "握",
    "搔",
    "揉",
    "斯",
    "期",
    "欺",
    "联",
    "葫",
    "散",
    "惹",
    "葬",
    "募",
    "葛",
    "董",
    "葡",
    "敬",
    "葱",
    "蒋",
    "蒂",
    "落",
    "韩",
    "朝",
    "辜",
    "葵",
    "棒",
    "棱",
    "棋",
    "椰",
    "植",
    "森",
    "焚",
    "椅",
    "椒",
    "棵",
    "棍",
    "椎",
    "棉",
    "棚",
    "棕",
    "棺",
    "榔",
    "椭",
    "惠",
    "惑",
    "逼",
    "粟",
    "棘",
    "酣",
    "酥",
    "厨",
    "厦",
    "硬",
    "硝",
    "确",
    "硫",
    "雁",
    "殖",
    "裂",
    "雄",
    "颊",
    "雳",
    "暂",
    "雅",
    "翘",
    "辈",
    "悲",
    "紫",
    "凿",
    "辉",
    "敞",
    "棠",
    "赏",
    "掌",
    "晴",
    "睐",
    "暑",
    "最",
    "晰",
    "量",
    "鼎",
    "喷",
    "喳",
    "晶",
    "喇",
    "遇",
    "喊",
    "遏",
    "晾",
    "景",
    "畴",
    "践",
    "跋",
    "跌",
    "跑",
    "跛",
    "遗",
    "蛙",
    "蛛",
    "蜓",
    "蜒",
    "蛤",
    "喝",
    "鹃",
    "喂",
    "喘",
    "喉",
    "喻",
    "啼",
    "喧",
    "嵌",
    "幅",
    "帽",
    "赋",
    "赌",
    "赎",
    "赐",
    "赔",
    "黑",
    "铸",
    "铺",
    "链",
    "销",
    "锁",
    "锄",
    "锅",
    "锈",
    "锋",
    "锌",
    "锐",
    "甥",
    "掰",
    "短",
    "智",
    "氮",
    "毯",
    "氯",
    "鹅",
    "剩",
    "稍",
    "程",
    "稀",
    "税",
    "筐",
    "等",
    "筑",
    "策",
    "筛",
    "筒",
    "筏",
    "答",
    "筋",
    "筝",
    "傲",
    "傅",
    "牌",
    "堡",
    "集",
    "焦",
    "傍",
    "储",
    "皓",
    "皖",
    "粤",
    "奥",
    "街",
    "惩",
    "御",
    "循",
    "艇",
    "舒",
    "逾",
    "番",
    "释",
    "禽",
    "腊",
    "脾",
    "腋",
    "腔",
    "腕",
    "鲁",
    "猩",
    "猬",
    "猾",
    "猴",
    "惫",
    "然",
    "馈",
    "馋",
    "装",
    "蛮",
    "就",
    "敦",
    "斌",
    "痘",
    "痢",
    "痪",
    "痛",
    "童",
    "竣",
    "阔",
    "善",
    "翔",
    "羡",
    "普",
    "粪",
    "尊",
    "奠",
    "道",
    "遂",
    "曾",
    "焰",
    "港",
    "滞",
    "湖",
    "湘",
    "渣",
    "渤",
    "渺",
    "湿",
    "温",
    "渴",
    "溃",
    "溅",
    "滑",
    "湃",
    "渝",
    "湾",
    "渡",
    "游",
    "滋",
    "渲",
    "溉",
    "愤",
    "慌",
    "惰",
    "愕",
    "愣",
    "惶",
    "愧",
    "愉",
    "慨",
    "割",
    "寒",
    "富",
    "寓",
    "窜",
    "窝",
    "窖",
    "窗",
    "窘",
    "遍",
    "雇",
    "裕",
    "裤",
    "裙",
    "禅",
    "禄",
    "谢",
    "谣",
    "谤",
    "谦",
    "犀",
    "属",
    "屡",
    "强",
    "粥",
    "疏",
    "隔",
    "隙",
    "隘",
    "媒",
    "絮",
    "嫂",
    "媚",
    "婿",
    "登",
    "缅",
    "缆",
    "缉",
    "缎",
    "缓",
    "缔",
    "缕",
    "骗",
    "编",
    "骚",
    "缘",
    "瑟",
    "鹉",
    "瑞",
    "瑰",
    "瑙",
    "魂",
    "肆",
    "摄",
    "摸",
    "填",
    "搏",
    "塌",
    "鼓",
    "摆",
    "携",
    "搬",
    "摇",
    "搞",
    "塘",
    "摊",
    "聘",
    "斟",
    "蒜",
    "勤",
    "靴",
    "靶",
    "鹊",
    "蓝",
    "墓",
    "幕",
    "蓬",
    "蓄",
    "蒲",
    "蓉",
    "蒙",
    "蒸",
    "献",
    "椿",
    "禁",
    "楚",
    "楷",
    "榄",
    "想",
    "槐",
    "榆",
    "楼",
    "概",
    "赖",
    "酪",
    "酬",
    "感",
    "碍",
    "碘",
    "碑",
    "碎",
    "碰",
    "碗",
    "碌",
    "尴",
    "雷",
    "零",
    "雾",
    "雹",
    "辐",
    "辑",
    "输",
    "督",
    "频",
    "龄",
    "鉴",
    "睛",
    "睹",
    "睦",
    "瞄",
    "睫",
    "睡",
    "睬",
    "嗜",
    "鄙",
    "嗦",
    "愚",
    "暖",
    "盟",
    "歇",
    "暗",
    "暇",
    "照",
    "畸",
    "跨",
    "跷",
    "跳",
    "跺",
    "跪",
    "路",
    "跤",
    "跟",
    "遣",
    "蜈",
    "蜗",
    "蛾",
    "蜂",
    "蜕",
    "嗅",
    "嗡",
    "嗓",
    "署",
    "置",
    "罪",
    "罩",
    "蜀",
    "幌",
    "错",
    "锚",
    "锡",
    "锣",
    "锤",
    "锥",
    "锦",
    "键",
    "锯",
    "锰",
    "矮",
    "辞",
    "稚",
    "稠",
    "颓",
    "愁",
    "筹",
    "签",
    "简",
    "筷",
    "毁",
    "舅",
    "鼠",
    "催",
    "傻",
    "像",
    "躲",
    "魁",
    "衙",
    "微",
    "愈",
    "遥",
    "腻",
    "腰",
    "腥",
    "腮",
    "腹",
    "腺",
    "鹏",
    "腾",
    "腿",
    "鲍",
    "猿",
    "颖",
    "触",
    "解",
    "煞",
    "雏",
    "馍",
    "馏",
    "酱",
    "禀",
    "痹",
    "廓",
    "痴",
    "痰",
    "廉",
    "靖",
    "新",
    "韵",
    "意",
    "誊",
    "粮",
    "数",
    "煎",
    "塑",
    "慈",
    "煤",
    "煌",
    "满",
    "漠",
    "滇",
    "源",
    "滤",
    "滥",
    "滔",
    "溪",
    "溜",
    "漓",
    "滚",
    "溢",
    "溯",
    "滨",
    "溶",
    "溺",
    "粱",
    "滩",
    "慎",
    "誉",
    "塞",
    "寞",
    "窥",
    "窟",
    "寝",
    "谨",
    "褂",
    "裸",
    "福",
    "谬",
    "群",
    "殿",
    "辟",
    "障",
    "媳",
    "嫉",
    "嫌",
    "嫁",
    "叠",
    "缚",
    "缝",
    "缠",
    "缤",
    "剿",
    "静",
    "碧",
    "璃",
    "赘",
    "熬",
    "墙",
    "墟",
    "嘉",
    "摧",
    "赫",
    "截",
    "誓",
    "境",
    "摘",
    "摔",
    "撇",
    "聚",
    "慕",
    "暮",
    "摹",
    "蔓",
    "蔑",
    "蔡",
    "蔗",
    "蔽",
    "蔼",
    "熙",
    "蔚",
    "兢",
    "模",
    "槛",
    "榴",
    "榜",
    "榨",
    "榕",
    "歌",
    "遭",
    "酵",
    "酷",
    "酿",
    "酸",
    "碟",
    "碱",
    "碳",
    "磁",
    "愿",
    "需",
    "辖",
    "辗",
    "雌",
    "裳",
    "颗",
    "瞅",
    "墅",
    "嗽",
    "踊",
    "蜻",
    "蜡",
    "蝇",
    "蜘",
    "蝉",
    "嘛",
    "嘀",
    "赚",
    "锹",
    "锻",
    "镀",
    "舞",
    "舔",
    "稳",
    "熏",
    "箕",
    "算",
    "箩",
    "管",
    "箫",
    "舆",
    "僚",
    "僧",
    "鼻",
    "魄",
    "魅",
    "貌",
    "膜",
    "膊",
    "膀",
    "鲜",
    "疑",
    "孵",
    "馒",
    "裹",
    "敲",
    "豪",
    "膏",
    "遮",
    "腐",
    "瘩",
    "瘟",
    "瘦",
    "辣",
    "彰",
    "竭",
    "端",
    "旗",
    "精",
    "粹",
    "歉",
    "弊",
    "熄",
    "熔",
    "煽",
    "潇",
    "漆",
    "漱",
    "漂",
    "漫",
    "滴",
    "漾",
    "演",
    "漏",
    "慢",
    "慷",
    "寨",
    "赛",
    "寡",
    "察",
    "蜜",
    "寥",
    "谭",
    "肇",
    "褐",
    "褪",
    "谱",
    "隧",
    "嫩",
    "翠",
    "熊",
    "凳",
    "骡",
    "缩",
    "慧",
    "撵",
    "撕",
    "撒",
    "撩",
    "趣",
    "趟",
    "撑",
    "撮",
    "撬",
    "播",
    "擒",
    "墩",
    "撞",
    "撤",
    "增",
    "撰",
    "聪",
    "鞋",
    "鞍",
    "蕉",
    "蕊",
    "蔬",
    "蕴",
    "横",
    "槽",
    "樱",
    "橡",
    "樟",
    "橄",
    "敷",
    "豌",
    "飘",
    "醋",
    "醇",
    "醉",
    "磕",
    "磊",
    "磅",
    "碾",
    "震",
    "霄",
    "霉",
    "瞒",
    "题",
    "暴",
    "瞎",
    "嘻",
    "嘶",
    "嘲",
    "嘹",
    "影",
    "踢",
    "踏",
    "踩",
    "踪",
    "蝶",
    "蝴",
    "蝠",
    "蝎",
    "蝌",
    "蝗",
    "蝙",
    "嘿",
    "嘱",
    "幢",
    "墨",
    "镇",
    "镐",
    "镑",
    "靠",
    "稽",
    "稻",
    "黎",
    "稿",
    "稼",
    "箱",
    "篓",
    "箭",
    "篇",
    "僵",
    "躺",
    "僻",
    "德",
    "艘",
    "膝",
    "膛",
    "鲤",
    "鲫",
    "熟",
    "摩",
    "褒",
    "瘪",
    "瘤",
    "瘫",
    "凛",
    "颜",
    "毅",
    "糊",
    "遵",
    "憋",
    "潜",
    "澎",
    "潮",
    "潭",
    "鲨",
    "澳",
    "潘",
    "澈",
    "澜",
    "澄",
    "懂",
    "憔",
    "懊",
    "憎",
    "额",
    "翩",
    "褥",
    "谴",
    "鹤",
    "憨",
    "慰",
    "劈",
    "履",
    "豫",
    "缭",
    "撼",
    "擂",
    "操",
    "擅",
    "燕",
    "蕾",
    "薯",
    "薛",
    "薇",
    "擎",
    "薪",
    "薄",
    "颠",
    "翰",
    "噩",
    "橱",
    "橙",
    "橘",
    "整",
    "融",
    "瓢",
    "醒",
    "霍",
    "霎",
    "辙",
    "冀",
    "餐",
    "嘴",
    "踱",
    "蹄",
    "蹂",
    "蟆",
    "螃",
    "器",
    "噪",
    "鹦",
    "赠",
    "默",
    "黔",
    "镜",
    "赞",
    "穆",
    "篮",
    "篡",
    "篷",
    "篱",
    "儒",
    "邀",
    "衡",
    "膨",
    "雕",
    "鲸",
    "磨",
    "瘾",
    "瘸",
    "凝",
    "辨",
    "辩",
    "糙",
    "糖",
    "糕",
    "燃",
    "濒",
    "澡",
    "激",
    "懒",
    "憾",
    "懈",
    "窿",
    "壁",
    "避",
    "缰",
    "缴",
    "戴",
    "擦",
    "藉",
    "鞠",
    "藏",
    "藐",
    "檬",
    "檐",
    "檀",
    "礁",
    "磷",
    "霜",
    "霞",
    "瞭",
    "瞧",
    "瞬",
    "瞳",
    "瞩",
    "瞪",
    "曙",
    "蹋",
    "蹈",
    "螺",
    "蟋",
    "蟀",
    "嚎",
    "赡",
    "穗",
    "魏",
    "簧",
    "簇",
    "繁",
    "徽",
    "爵",
    "朦",
    "臊",
    "鳄",
    "癌",
    "辫",
    "赢",
    "糟",
    "糠",
    "燥",
    "懦",
    "豁",
    "臀",
    "臂",
    "翼",
    "骤",
    "藕",
    "鞭",
    "藤",
    "覆",
    "瞻",
    "蹦",
    "嚣",
    "镰",
    "翻",
    "鳍",
    "鹰",
    "瀑",
    "襟",
    "璧",
    "戳",
    "孽",
    "警",
    "蘑",
    "藻",
    "攀",
    "曝",
    "蹲",
    "蹭",
    "蹬",
    "巅",
    "簸",
    "簿",
    "蟹",
    "颤",
    "靡",
    "癣",
    "瓣",
    "羹",
    "鳖",
    "爆",
    "疆",
    "鬓",
    "壤",
    "馨",
    "耀",
    "躁",
    "蠕",
    "嚼",
    "嚷",
    "巍",
    "籍",
    "鳞",
    "魔",
    "糯",
    "灌",
    "譬",
    "蠢",
    "霸",
    "露",
    "霹",
    "躏",
    "黯",
    "髓",
    "赣",
    "囊",
    "镶",
    "瓤",
    "罐",
    "矗",
    "乂",
    "乜",
    "兀",
    "弋",
    "孑",
    "孓",
    "幺",
    "亓",
    "韦",
    "廿",
    "丏",
    "卅",
    "仄",
    "厄",
    "仃",
    "仉",
    "仂",
    "兮",
    "刈",
    "爻",
    "卞",
    "闩",
    "讣",
    "尹",
    "夬",
    "爿",
    "毋",
    "邗",
    "邛",
    "艽",
    "艿",
    "札",
    "叵",
    "匝",
    "丕",
    "匜",
    "劢",
    "卟",
    "叱",
    "叻",
    "仨",
    "仕",
    "仟",
    "仡",
    "仫",
    "仞",
    "卮",
    "氐",
    "犰",
    "刍",
    "邝",
    "邙",
    "汀",
    "讦",
    "讧",
    "讪",
    "讫",
    "尻",
    "阡",
    "尕",
    "弁",
    "驭",
    "匡",
    "耒",
    "玎",
    "玑",
    "邢",
    "圩",
    "圬",
    "圭",
    "扦",
    "圪",
    "圳",
    "圹",
    "扪",
    "圮",
    "圯",
    "芊",
    "芍",
    "芄",
    "芨",
    "芑",
    "芎",
    "芗",
    "亘",
    "厍",
    "夼",
    "戍",
    "尥",
    "乩",
    "旯",
    "曳",
    "岌",
    "屺",
    "凼",
    "囡",
    "钇",
    "缶",
    "氘",
    "氖",
    "牝",
    "伎",
    "伛",
    "伢",
    "佤",
    "仵",
    "伥",
    "伧",
    "伉",
    "伫",
    "囟",
    "汆",
    "刖",
    "夙",
    "旮",
    "刎",
    "犷",
    "犸",
    "舛",
    "凫",
    "邬",
    "饧",
    "汕",
    "汔",
    "汐",
    "汲",
    "汜",
    "汊",
    "忖",
    "忏",
    "讴",
    "讵",
    "祁",
    "讷",
    "聿",
    "艮",
    "厾",
    "阱",
    "阮",
    "阪",
    "丞",
    "妁",
    "牟",
    "纡",
    "纣",
    "纥",
    "纨",
    "玕",
    "玙",
    "抟",
    "抔",
    "圻",
    "坂",
    "坍",
    "坞",
    "抃",
    "抉",
    "㧐",
    "芫",
    "邯",
    "芸",
    "芾",
    "苈",
    "苣",
    "芷",
    "芮",
    "苋",
    "芼",
    "苌",
    "苁",
    "芩",
    "芪",
    "芡",
    "芟",
    "苄",
    "苎",
    "苡",
    "杌",
    "杓",
    "杞",
    "杈",
    "忑",
    "孛",
    "邴",
    "邳",
    "矶",
    "奁",
    "豕",
    "忒",
    "欤",
    "轫",
    "迓",
    "邶",
    "忐",
    "卣",
    "邺",
    "旰",
    "呋",
    "呒",
    "呓",
    "呔",
    "呖",
    "呃",
    "旸",
    "吡",
    "町",
    "虬",
    "呗",
    "吽",
    "吣",
    "吲",
    "帏",
    "岐",
    "岈",
    "岘",
    "岑",
    "岚",
    "兕",
    "囵",
    "囫",
    "钊",
    "钋",
    "钌",
    "迕",
    "氙",
    "氚",
    "牤",
    "佞",
    "邱",
    "攸",
    "佚",
    "佝",
    "佟",
    "佗",
    "伽",
    "彷",
    "佘",
    "佥",
    "孚",
    "豸",
    "坌",
    "肟",
    "邸",
    "奂",
    "劬",
    "狄",
    "狁",
    "鸠",
    "邹",
    "饨",
    "饩",
    "饪",
    "饫",
    "饬",
    "亨",
    "庑",
    "庋",
    "疔",
    "疖",
    "肓",
    "闱",
    "闳",
    "闵",
    "羌",
    "炀",
    "沣",
    "沅",
    "沔",
    "沤",
    "沌",
    "沏",
    "沚",
    "汩",
    "汨",
    "沂",
    "汾",
    "沨",
    "汴",
    "汶",
    "沆",
    "沩",
    "泐",
    "怃",
    "怄",
    "忡",
    "忤",
    "忾",
    "怅",
    "忻",
    "忪",
    "怆",
    "忭",
    "忸",
    "诂",
    "诃",
    "诅",
    "诋",
    "诌",
    "诏",
    "诒",
    "孜",
    "陇",
    "陀",
    "陂",
    "陉",
    "妍",
    "妩",
    "妪",
    "妣",
    "妊",
    "妗",
    "妫",
    "妞",
    "姒",
    "妤",
    "邵",
    "劭",
    "刭",
    "甬",
    "邰",
    "纭",
    "纰",
    "纴",
    "纶",
    "纾",
    "玮",
    "玡",
    "玭",
    "玠",
    "玢",
    "玥",
    "玦",
    "盂",
    "忝",
    "匦",
    "坩",
    "抨",
    "拤",
    "坫",
    "拈",
    "垆",
    "抻",
    "劼",
    "拃",
    "拊",
    "坼",
    "坻",
    "㧟",
    "坨",
    "坭",
    "抿",
    "坳",
    "耶",
    "苷",
    "苯",
    "苤",
    "茏",
    "苫",
    "苜",
    "苴",
    "苒",
    "苘",
    "茌",
    "苻",
    "苓",
    "茚",
    "茆",
    "茑",
    "茓",
    "茔",
    "茕",
    "茀",
    "苕",
    "枥",
    "枇",
    "杪",
    "杳",
    "枧",
    "杵",
    "枨",
    "枞",
    "枋",
    "杻",
    "杷",
    "杼",
    "矸",
    "砀",
    "刳",
    "奄",
    "瓯",
    "殁",
    "郏",
    "轭",
    "郅",
    "鸢",
    "盱",
    "昊",
    "昙",
    "杲",
    "昃",
    "咂",
    "呸",
    "昕",
    "昀",
    "旻",
    "昉",
    "炅",
    "咔",
    "畀",
    "虮",
    "咀",
    "呷",
    "黾",
    "呱",
    "呤",
    "咚",
    "咆",
    "咛",
    "呶",
    "呣",
    "呦",
    "咝",
    "岢",
    "岿",
    "岬",
    "岫",
    "帙",
    "岣",
    "峁",
    "刿",
    "迥",
    "岷",
    "剀",
    "帔",
    "峄",
    "沓",
    "囹",
    "罔",
    "钍",
    "钎",
    "钏",
    "钒",
    "钕",
    "钗",
    "邾",
    "迮",
    "牦",
    "竺",
    "迤",
    "佶",
    "佬",
    "佰",
    "侑",
    "侉",
    "臾",
    "岱",
    "侗",
    "侃",
    "侏",
    "侩",
    "佻",
    "佾",
    "侪",
    "佼",
    "佯",
    "侬",
    "帛",
    "阜",
    "侔",
    "徂",
    "刽",
    "郄",
    "怂",
    "籴",
    "瓮",
    "戗",
    "肼",
    "䏝",
    "肽",
    "肱",
    "肫",
    "剁",
    "迩",
    "郇",
    "狙",
    "狎",
    "狍",
    "狒",
    "咎",
    "炙",
    "枭",
    "饯",
    "饴",
    "冽",
    "冼",
    "庖",
    "疠",
    "疝",
    "疡",
    "兖",
    "妾",
    "劾",
    "炜",
    "𬉼",
    "炖",
    "炘",
    "炝",
    "炔",
    "泔",
    "沭",
    "泷",
    "泸",
    "泱",
    "泅",
    "泗",
    "泠",
    "泺",
    "泖",
    "泫",
    "泮",
    "沱",
    "泯",
    "泓",
    "泾",
    "怙",
    "怵",
    "怦",
    "怛",
    "怏",
    "怍",
    "㤘",
    "怩",
    "怫",
    "怿",
    "宕",
    "穹",
    "宓",
    "诓",
    "诔",
    "诖",
    "诘",
    "戾",
    "诙",
    "戽",
    "郓",
    "衩",
    "祆",
    "祎",
    "祉",
    "祇",
    "诛",
    "诜",
    "诟",
    "诠",
    "诣",
    "诤",
    "诧",
    "诨",
    "诩",
    "戕",
    "孢",
    "亟",
    "陔",
    "妲",
    "妯",
    "姗",
    "帑",
    "弩",
    "孥",
    "驽",
    "虱",
    "迦",
    "迨",
    "绀",
    "绁",
    "绂",
    "驷",
    "驸",
    "绉",
    "绌",
    "驿",
    "骀",
    "甾",
    "珏",
    "珐",
    "珂",
    "珑",
    "玳",
    "珀",
    "顸",
    "珉",
    "珈",
    "拮",
    "垭",
    "挝",
    "垣",
    "挞",
    "垤",
    "赳",
    "贲",
    "垱",
    "垌",
    "郝",
    "垧",
    "垓",
    "挦",
    "垠",
    "茜",
    "荚",
    "荑",
    "贳",
    "荜",
    "莒",
    "茼",
    "茴",
    "茱",
    "莛",
    "荞",
    "茯",
    "荏",
    "荇",
    "荃",
    "荟",
    "荀",
    "茗",
    "荠",
    "茭",
    "茨",
    "垩",
    "荥",
    "荦",
    "荨",
    "荩",
    "剋",
    "荪",
    "茹",
    "荬",
    "荮",
    "柰",
    "栉",
    "柯",
    "柘",
    "栊",
    "柩",
    "枰",
    "栌",
    "柙",
    "枵",
    "柚",
    "枳",
    "柞",
    "柝",
    "栀",
    "柢",
    "栎",
    "枸",
    "柈",
    "柁",
    "枷",
    "柽",
    "剌",
    "酊",
    "郦",
    "甭",
    "砗",
    "砘",
    "砒",
    "斫",
    "砭",
    "砜",
    "奎",
    "耷",
    "虺",
    "殂",
    "殇",
    "殄",
    "殆",
    "轱",
    "轲",
    "轳",
    "轶",
    "轸",
    "虿",
    "毖",
    "觇",
    "尜",
    "哐",
    "眄",
    "眍",
    "𠳐",
    "郢",
    "眇",
    "眊",
    "眈",
    "禺",
    "哂",
    "咴",
    "曷",
    "昴",
    "昱",
    "昵",
    "咦",
    "哓",
    "哔",
    "畎",
    "毗",
    "呲",
    "胄",
    "畋",
    "畈",
    "虼",
    "虻",
    "盅",
    "咣",
    "哕",
    "剐",
    "郧",
    "咻",
    "囿",
    "咿",
    "哌",
    "哙",
    "哚",
    "咯",
    "咩",
    "咤",
    "哝",
    "哏",
    "哞",
    "峙",
    "峣",
    "罘",
    "帧",
    "峒",
    "峤",
    "峋",
    "峥",
    "贶",
    "钚",
    "钛",
    "钡",
    "钣",
    "钤",
    "钨",
    "钫",
    "钯",
    "氡",
    "氟",
    "牯",
    "郜",
    "秕",
    "秭",
    "竽",
    "笈",
    "笃",
    "俦",
    "俨",
    "俅",
    "俪",
    "叟",
    "垡",
    "牮",
    "俣",
    "俚",
    "皈",
    "俑",
    "俟",
    "逅",
    "徇",
    "徉",
    "舢",
    "俞",
    "郗",
    "俎",
    "郤",
    "爰",
    "郛",
    "瓴",
    "胨",
    "胪",
    "胛",
    "胂",
    "胙",
    "胍",
    "胗",
    "胝",
    "朐",
    "胫",
    "鸨",
    "匍",
    "狨",
    "狯",
    "飑",
    "狩",
    "狲",
    "訇",
    "逄",
    "昝",
    "饷",
    "饸",
    "饹",
    "胤",
    "孪",
    "娈",
    "弈",
    "奕",
    "庥",
    "疬",
    "疣",
    "疥",
    "疭",
    "庠",
    "竑",
    "彦",
    "飒",
    "闼",
    "闾",
    "闿",
    "阂",
    "羑",
    "迸",
    "籼",
    "酋",
    "炳",
    "炻",
    "炽",
    "炯",
    "烀",
    "炷",
    "烃",
    "洱",
    "洹",
    "洧",
    "洌",
    "浃",
    "洇",
    "洄",
    "洙",
    "涎",
    "洎",
    "洫",
    "浍",
    "洮",
    "洵",
    "浒",
    "浔",
    "浕",
    "洳",
    "恸",
    "恓",
    "恹",
    "恫",
    "恺",
    "恻",
    "恂",
    "恪",
    "恽",
    "宥",
    "扃",
    "衲",
    "衽",
    "衿",
    "袂",
    "祛",
    "祜",
    "祓",
    "祚",
    "诮",
    "祗",
    "祢",
    "诰",
    "诳",
    "鸩",
    "昶",
    "郡",
    "咫",
    "弭",
    "牁",
    "胥",
    "陛",
    "陟",
    "娅",
    "姮",
    "娆",
    "姝",
    "姣",
    "姘",
    "姹",
    "怼",
    "羿",
    "炱",
    "矜",
    "绔",
    "骁",
    "骅",
    "绗",
    "绛",
    "骈",
    "耖",
    "挈",
    "珥",
    "珙",
    "顼",
    "珰",
    "珩",
    "珧",
    "珣",
    "珞",
    "琤",
    "珲",
    "敖",
    "恚",
    "埔",
    "埕",
    "埘",
    "埙",
    "埚",
    "挹",
    "耆",
    "耄",
    "埒",
    "捋",
    "贽",
    "垸",
    "捃",
    "盍",
    "荸",
    "莆",
    "莳",
    "莴",
    "莪",
    "莠",
    "莓",
    "莜",
    "莅",
    "荼",
    "莩",
    "荽",
    "莸",
    "荻",
    "莘",
    "莎",
    "莞",
    "莨",
    "渇",
    "鸪",
    "莼",
    "栲",
    "栳",
    "郴",
    "桓",
    "桡",
    "桎",
    "桢",
    "桤",
    "梃",
    "栝",
    "桕",
    "桁",
    "桧",
    "桅",
    "栟",
    "桉",
    "栩",
    "逑",
    "逋",
    "彧",
    "鬲",
    "豇",
    "酐",
    "逦",
    "厝",
    "孬",
    "砝",
    "砹",
    "砺",
    "砧",
    "砷",
    "砟",
    "砼",
    "砥",
    "砣",
    "剞",
    "砻",
    "轼",
    "轾",
    "辂",
    "鸫",
    "趸",
    "龀",
    "鸬",
    "虔",
    "逍",
    "眬",
    "唛",
    "晟",
    "眩",
    "眙",
    "哧",
    "哽",
    "唔",
    "晁",
    "晏",
    "鸮",
    "趵",
    "趿",
    "畛",
    "蚨",
    "蚜",
    "蚍",
    "蚋",
    "蚬",
    "蚝",
    "蚧",
    "唢",
    "圄",
    "唣",
    "唏",
    "盎",
    "唑",
    "崂",
    "崃",
    "罡",
    "罟",
    "峪",
    "觊",
    "赅",
    "钰",
    "钲",
    "钴",
    "钵",
    "钹",
    "钺",
    "钽",
    "钼",
    "钿",
    "铀",
    "铂",
    "铄",
    "铆",
    "铈",
    "铉",
    "铊",
    "铋",
    "铌",
    "铍",
    "䥽",
    "铎",
    "氩",
    "氤",
    "氦",
    "毪",
    "舐",
    "秣",
    "秫",
    "盉",
    "笄",
    "笕",
    "笊",
    "笏",
    "笆",
    "俸",
    "倩",
    "俵",
    "偌",
    "俳",
    "俶",
    "倬",
    "倏",
    "恁",
    "倭",
    "倪",
    "俾",
    "倜",
    "隼",
    "隽",
    "倌",
    "倥",
    "臬",
    "皋",
    "郫",
    "倨",
    "衄",
    "颀",
    "徕",
    "舫",
    "釜",
    "奚",
    "衾",
    "胯",
    "胱",
    "胴",
    "胭",
    "脍",
    "胼",
    "朕",
    "脒",
    "胺",
    "鸱",
    "玺",
    "鸲",
    "狷",
    "猁",
    "狳",
    "猃",
    "狺",
    "逖",
    "桀",
    "袅",
    "饽",
    "凇",
    "栾",
    "挛",
    "亳",
    "疳",
    "疴",
    "疸",
    "疽",
    "痈",
    "疱",
    "痂",
    "痉",
    "衮",
    "凋",
    "颃",
    "恣",
    "旆",
    "旄",
    "旃",
    "阃",
    "阄",
    "訚",
    "阆",
    "恙",
    "粑",
    "朔",
    "郸",
    "烜",
    "烨",
    "烩",
    "烊",
    "剡",
    "郯",
    "烬",
    "涑",
    "浯",
    "涞",
    "涟",
    "娑",
    "涅",
    "涠",
    "浞",
    "涓",
    "浥",
    "涔",
    "浜",
    "浠",
    "浣",
    "浚",
    "悚",
    "悭",
    "悝",
    "悒",
    "悌",
    "悛",
    "宸",
    "窈",
    "剜",
    "诹",
    "冢",
    "诼",
    "袒",
    "袢",
    "祯",
    "诿",
    "谀",
    "谂",
    "谄",
    "谇",
    "屐",
    "屙",
    "陬",
    "勐",
    "奘",
    "牂",
    "蚩",
    "陲",
    "姬",
    "娠",
    "娌",
    "娉",
    "娲",
    "娩",
    "娴",
    "娣",
    "娓",
    "婀",
    "畚",
    "逡",
    "绠",
    "骊",
    "绡",
    "骋",
    "绥",
    "绦",
    "绨",
    "骎",
    "邕",
    "鸶",
    "彗",
    "耜",
    "焘",
    "舂",
    "琏",
    "琇",
    "麸",
    "揶",
    "埴",
    "埯",
    "捯",
    "掳",
    "掴",
    "埸",
    "埵",
    "赧",
    "埤",
    "捭",
    "逵",
    "埝",
    "堋",
    "堍",
    "掬",
    "鸷",
    "掖",
    "捽",
    "掊",
    "堉",
    "掸",
    "捩",
    "掮",
    "悫",
    "埭",
    "埽",
    "掇",
    "掼",
    "聃",
    "菁",
    "萁",
    "菘",
    "堇",
    "萘",
    "萋",
    "菽",
    "菖",
    "萜",
    "萸",
    "萑",
    "棻",
    "菔",
    "菟",
    "萏",
    "萃",
    "菏",
    "菹",
    "菪",
    "菅",
    "菀",
    "萦",
    "菰",
    "菡",
    "梵",
    "梿",
    "梏",
    "觋",
    "桴",
    "桷",
    "梓",
    "棁",
    "桫",
    "棂",
    "啬",
    "郾",
    "匮",
    "敕",
    "豉",
    "鄄",
    "酞",
    "酚",
    "戛",
    "硎",
    "硭",
    "硒",
    "硖",
    "硗",
    "硐",
    "硇",
    "硌",
    "鸸",
    "瓠",
    "匏",
    "厩",
    "龚",
    "殒",
    "殓",
    "殍",
    "赉",
    "雩",
    "辄",
    "堑",
    "眭",
    "眦",
    "啧",
    "晡",
    "晤",
    "眺",
    "眵",
    "眸",
    "圊",
    "喏",
    "喵",
    "啉",
    "勖",
    "晞",
    "唵",
    "晗",
    "冕",
    "啭",
    "畦",
    "趺",
    "啮",
    "跄",
    "蚶",
    "蛄",
    "蛎",
    "蛆",
    "蚰",
    "蛊",
    "圉",
    "蚱",
    "蛉",
    "蛏",
    "蚴",
    "啁",
    "啕",
    "唿",
    "啐",
    "唼",
    "唷",
    "啖",
    "啵",
    "啶",
    "啷",
    "唳",
    "唰",
    "啜",
    "帻",
    "崚",
    "崦",
    "帼",
    "崮",
    "崤",
    "崆",
    "赇",
    "赈",
    "赊",
    "铑",
    "铒",
    "铗",
    "铙",
    "铟",
    "铠",
    "铡",
    "铢",
    "铣",
    "铤",
    "铧",
    "铨",
    "铩",
    "铪",
    "铫",
    "铬",
    "铮",
    "铯",
    "铰",
    "铱",
    "铳",
    "铵",
    "铷",
    "氪",
    "牾",
    "鸹",
    "秾",
    "逶",
    "笺",
    "筇",
    "笸",
    "笪",
    "笮",
    "笠",
    "笥",
    "笤",
    "笳",
    "笾",
    "笞",
    "偾",
    "偃",
    "偕",
    "偈",
    "傀",
    "偬",
    "偻",
    "皑",
    "皎",
    "鸻",
    "徜",
    "舸",
    "舻",
    "舴",
    "舷",
    "龛",
    "翎",
    "脬",
    "脘",
    "脲",
    "匐",
    "猗",
    "猡",
    "猞",
    "猝",
    "斛",
    "猕",
    "馗",
    "馃",
    "馄",
    "鸾",
    "孰",
    "庹",
    "庾",
    "痔",
    "痍",
    "疵",
    "翊",
    "旌",
    "旎",
    "袤",
    "阇",
    "阈",
    "阉",
    "阊",
    "阋",
    "阍",
    "阏",
    "羟",
    "粝",
    "粕",
    "敝",
    "焐",
    "烯",
    "焓",
    "烽",
    "焖",
    "烷",
    "焗",
    "渍",
    "渚",
    "淇",
    "淅",
    "淞",
    "渎",
    "涿",
    "淖",
    "挲",
    "淠",
    "涸",
    "渑",
    "淦",
    "淝",
    "淬",
    "涪",
    "淙",
    "涫",
    "渌",
    "淄",
    "惬",
    "悻",
    "悱",
    "惝",
    "惘",
    "悸",
    "惆",
    "惚",
    "惇",
    "惮",
    "窕",
    "谌",
    "谏",
    "扈",
    "皲",
    "谑",
    "裆",
    "袷",
    "裉",
    "谒",
    "谔",
    "谕",
    "谖",
    "谗",
    "谙",
    "谛",
    "谝",
    "逯",
    "郿",
    "隈",
    "粜",
    "隍",
    "隗",
    "婧",
    "婊",
    "婕",
    "娼",
    "婢",
    "婵",
    "胬",
    "袈",
    "翌",
    "恿",
    "欸",
    "绫",
    "骐",
    "绮",
    "绯",
    "绱",
    "骒",
    "绲",
    "骓",
    "绶",
    "绺",
    "绻",
    "绾",
    "骖",
    "缁",
    "耠",
    "琫",
    "琵",
    "琶",
    "琪",
    "瑛",
    "琦",
    "琥",
    "琨",
    "靓",
    "琰",
    "琮",
    "琯",
    "琬",
    "琛",
    "琚",
    "辇",
    "鼋",
    "揳",
    "堞",
    "搽",
    "揸",
    "揠",
    "堙",
    "趄",
    "揖",
    "颉",
    "塄",
    "揿",
    "耋",
    "揄",
    "蛩",
    "蛰",
    "塆",
    "摒",
    "揆",
    "掾",
    "聒",
    "葑",
    "葚",
    "靰",
    "靸",
    "葳",
    "葺",
    "葸",
    "萼",
    "葆",
    "葩",
    "葶",
    "蒌",
    "萱",
    "戟",
    "葭",
    "楮",
    "棼",
    "椟",
    "棹",
    "椤",
    "棰",
    "赍",
    "椋",
    "椁",
    "椪",
    "棣",
    "椐",
    "鹁",
    "覃",
    "酤",
    "酢",
    "酡",
    "鹂",
    "厥",
    "殚",
    "殛",
    "雯",
    "雱",
    "辊",
    "辋",
    "椠",
    "辍",
    "辎",
    "斐",
    "睄",
    "睑",
    "睇",
    "睃",
    "戢",
    "喋",
    "嗒",
    "喃",
    "喱",
    "喹",
    "晷",
    "喈",
    "跖",
    "跗",
    "跞",
    "跚",
    "跎",
    "跏",
    "跆",
    "蛱",
    "蛲",
    "蛭",
    "蛳",
    "蛐",
    "蛔",
    "蛞",
    "蛴",
    "蛟",
    "蛘",
    "喁",
    "喟",
    "啾",
    "嗖",
    "喑",
    "嗟",
    "喽",
    "嗞",
    "喀",
    "喔",
    "喙",
    "嵘",
    "嵖",
    "崴",
    "遄",
    "詈",
    "嵎",
    "崽",
    "嵬",
    "嵛",
    "嵯",
    "嵝",
    "嵫",
    "幄",
    "嵋",
    "赕",
    "铻",
    "铼",
    "铿",
    "锃",
    "锂",
    "锆",
    "锇",
    "锉",
    "锏",
    "锑",
    "锒",
    "锔",
    "锕",
    "掣",
    "矬",
    "氰",
    "毳",
    "毽",
    "犊",
    "犄",
    "犋",
    "鹄",
    "犍",
    "嵇",
    "黍",
    "稃",
    "稂",
    "筚",
    "筵",
    "筌",
    "傣",
    "傈",
    "舄",
    "牍",
    "傥",
    "傧",
    "遑",
    "傩",
    "遁",
    "徨",
    "媭",
    "畲",
    "弑",
    "颌",
    "翕",
    "釉",
    "鹆",
    "舜",
    "貂",
    "腈",
    "腌",
    "腓",
    "腆",
    "腴",
    "腑",
    "腚",
    "腱",
    "鱿",
    "鲀",
    "鲂",
    "颍",
    "猢",
    "猹",
    "猥",
    "飓",
    "觞",
    "觚",
    "猱",
    "颎",
    "飧",
    "馇",
    "馊",
    "亵",
    "脔",
    "裒",
    "痣",
    "痨",
    "痦",
    "痞",
    "痤",
    "痫",
    "痧",
    "赓",
    "竦",
    "瓿",
    "啻",
    "颏",
    "鹇",
    "阑",
    "阒",
    "阕",
    "粞",
    "遒",
    "孳",
    "焯",
    "焜",
    "焙",
    "焱",
    "鹈",
    "湛",
    "渫",
    "湮",
    "湎",
    "湜",
    "渭",
    "湍",
    "湫",
    "溲",
    "湟",
    "溆",
    "湲",
    "湔",
    "湉",
    "渥",
    "湄",
    "滁",
    "愠",
    "惺",
    "愦",
    "惴",
    "愀",
    "愎",
    "愔",
    "喾",
    "寐",
    "谟",
    "扉",
    "裢",
    "裎",
    "裥",
    "祾",
    "祺",
    "谠",
    "幂",
    "谡",
    "谥",
    "谧",
    "遐",
    "孱",
    "弼",
    "巽",
    "骘",
    "媪",
    "媛",
    "婷",
    "巯",
    "翚",
    "皴",
    "婺",
    "骛",
    "缂",
    "缃",
    "缄",
    "彘",
    "缇",
    "缈",
    "缌",
    "缑",
    "缒",
    "缗",
    "飨",
    "耢",
    "瑚",
    "瑁",
    "瑜",
    "瑗",
    "瑄",
    "瑕",
    "遨",
    "骜",
    "韫",
    "髡",
    "塬",
    "鄢",
    "趔",
    "趑",
    "摅",
    "摁",
    "蜇",
    "搋",
    "搪",
    "搐",
    "搛",
    "搠",
    "摈",
    "彀",
    "毂",
    "搦",
    "搡",
    "蓁",
    "戡",
    "蓍",
    "鄞",
    "靳",
    "蓐",
    "蓦",
    "鹋",
    "蒽",
    "蓓",
    "蓖",
    "蓊",
    "蒯",
    "蓟",
    "蓑",
    "蒿",
    "蒺",
    "蓠",
    "蒟",
    "蒡",
    "蒹",
    "蒴",
    "蒗",
    "蓥",
    "颐",
    "楔",
    "楠",
    "楂",
    "楝",
    "楫",
    "楸",
    "椴",
    "槌",
    "楯",
    "皙",
    "榈",
    "槎",
    "榉",
    "楦",
    "楣",
    "楹",
    "椽",
    "裘",
    "剽",
    "甄",
    "酮",
    "酰",
    "酯",
    "酩",
    "蜃",
    "碛",
    "碓",
    "硼",
    "碉",
    "碚",
    "碇",
    "碜",
    "鹌",
    "辏",
    "龃",
    "龅",
    "訾",
    "粲",
    "虞",
    "睚",
    "嗪",
    "韪",
    "嗷",
    "嗉",
    "睨",
    "睢",
    "雎",
    "睥",
    "嘟",
    "嗑",
    "嗫",
    "嗬",
    "嗔",
    "嗝",
    "戥",
    "嗄",
    "煦",
    "暄",
    "遢",
    "暌",
    "跬",
    "跶",
    "跸",
    "跐",
    "跣",
    "跹",
    "跻",
    "蛸",
    "蜊",
    "蜍",
    "蜉",
    "蜣",
    "畹",
    "蛹",
    "嗣",
    "嗯",
    "嗥",
    "嗲",
    "嗳",
    "嗌",
    "嗍",
    "嗨",
    "嗐",
    "嗤",
    "嗵",
    "罨",
    "嵊",
    "嵩",
    "嵴",
    "骰",
    "锗",
    "锛",
    "锜",
    "锝",
    "锞",
    "锟",
    "锢",
    "锨",
    "锩",
    "锭",
    "锱",
    "雉",
    "氲",
    "犏",
    "歃",
    "稞",
    "稗",
    "稔",
    "筠",
    "筢",
    "筮",
    "筲",
    "筱",
    "牒",
    "煲",
    "敫",
    "徭",
    "愆",
    "艄",
    "觎",
    "毹",
    "貊",
    "貅",
    "貉",
    "颔",
    "腠",
    "腩",
    "腼",
    "腭",
    "腧",
    "塍",
    "媵",
    "詹",
    "鲅",
    "鲆",
    "鲇",
    "鲈",
    "稣",
    "鲋",
    "鲐",
    "肄",
    "鹐",
    "飕",
    "觥",
    "遛",
    "馐",
    "鹑",
    "亶",
    "瘃",
    "痱",
    "痼",
    "痿",
    "瘐",
    "瘁",
    "瘆",
    "麂",
    "裔",
    "歆",
    "旒",
    "雍",
    "阖",
    "阗",
    "阙",
    "羧",
    "豢",
    "粳",
    "猷",
    "煳",
    "煜",
    "煨",
    "煅",
    "煊",
    "煸",
    "煺",
    "滟",
    "溱",
    "溘",
    "漭",
    "滢",
    "溥",
    "溧",
    "溽",
    "裟",
    "溻",
    "溷",
    "滗",
    "滫",
    "溴",
    "滏",
    "滃",
    "滦",
    "溏",
    "滂",
    "滓",
    "溟",
    "滪",
    "愫",
    "慑",
    "慊",
    "鲎",
    "骞",
    "窦",
    "窠",
    "窣",
    "裱",
    "褚",
    "裨",
    "裾",
    "裰",
    "禊",
    "谩",
    "谪",
    "媾",
    "嫫",
    "媲",
    "嫒",
    "嫔",
    "媸",
    "缙",
    "缜",
    "缛",
    "辔",
    "骝",
    "缟",
    "缡",
    "缢",
    "缣",
    "骟",
    "耥",
    "璈",
    "瑶",
    "瑭",
    "獒",
    "觏",
    "慝",
    "嫠",
    "韬",
    "叆",
    "髦",
    "摽",
    "墁",
    "撂",
    "摞",
    "撄",
    "翥",
    "踅",
    "摭",
    "墉",
    "墒",
    "榖",
    "綦",
    "蔫",
    "蔷",
    "靺",
    "靼",
    "鞅",
    "靿",
    "甍",
    "蔸",
    "蔟",
    "蔺",
    "戬",
    "蕖",
    "蔻",
    "蓿",
    "斡",
    "鹕",
    "蓼",
    "榛",
    "榧",
    "榻",
    "榫",
    "榭",
    "槔",
    "榱",
    "槁",
    "槟",
    "槠",
    "榷",
    "僰",
    "酽",
    "酶",
    "酹",
    "厮",
    "碡",
    "碴",
    "碣",
    "碲",
    "磋",
    "臧",
    "豨",
    "殡",
    "霆",
    "霁",
    "辕",
    "蜚",
    "裴",
    "翡",
    "龇",
    "龈",
    "睿",
    "䁖",
    "睽",
    "嘞",
    "嘈",
    "嘌",
    "嘁",
    "嘎",
    "暧",
    "暝",
    "踌",
    "踉",
    "蜞",
    "蜥",
    "蜮",
    "蝈",
    "蜴",
    "蜱",
    "蜩",
    "蜷",
    "蜿",
    "螂",
    "蜢",
    "嘘",
    "嘡",
    "鹗",
    "嘣",
    "嘤",
    "嘚",
    "嗾",
    "嘧",
    "罴",
    "罱",
    "幔",
    "嶂",
    "幛",
    "赙",
    "罂",
    "骷",
    "骶",
    "鹘",
    "锲",
    "锴",
    "锶",
    "锷",
    "锸",
    "锵",
    "镁",
    "镂",
    "犒",
    "箐",
    "箦",
    "箧",
    "箍",
    "箸",
    "箬",
    "箅",
    "箪",
    "箔",
    "箜",
    "箢",
    "箓",
    "毓",
    "僖",
    "儆",
    "僳",
    "僭",
    "劁",
    "僮",
    "魃",
    "魆",
    "睾",
    "艋",
    "鄱",
    "膈",
    "膑",
    "鲑",
    "鲔",
    "鲚",
    "鲛",
    "鲟",
    "獐",
    "觫",
    "雒",
    "夤",
    "馑",
    "銮",
    "塾",
    "麽",
    "瘌",
    "瘊",
    "瘘",
    "瘙",
    "廖",
    "韶",
    "旖",
    "膂",
    "阚",
    "鄯",
    "鲞",
    "粿",
    "粼",
    "粽",
    "糁",
    "槊",
    "鹚",
    "熘",
    "熥",
    "潢",
    "漕",
    "滹",
    "漯",
    "漶",
    "潋",
    "潴",
    "漪",
    "漉",
    "漳",
    "漩",
    "澉",
    "潍",
    "慵",
    "搴",
    "窨",
    "寤",
    "綮",
    "谮",
    "褡",
    "褙",
    "褓",
    "褛",
    "褊",
    "谯",
    "谰",
    "谲",
    "暨",
    "屣",
    "鹛",
    "嫣",
    "嫱",
    "嫖",
    "嫦",
    "嫚",
    "嫘",
    "嫡",
    "鼐",
    "翟",
    "瞀",
    "鹜",
    "骠",
    "缥",
    "缦",
    "缧",
    "缨",
    "骢",
    "缪",
    "缫",
    "耦",
    "耧",
    "瑾",
    "璜",
    "璀",
    "璎",
    "璁",
    "璋",
    "璇",
    "奭",
    "髯",
    "髫",
    "撷",
    "撅",
    "赭",
    "撸",
    "鋆",
    "撙",
    "撺",
    "墀",
    "聩",
    "觐",
    "鞑",
    "蕙",
    "鞒",
    "蕈",
    "蕨",
    "蕤",
    "蕞",
    "蕺",
    "瞢",
    "蕃",
    "蕲",
    "赜",
    "槿",
    "樯",
    "槭",
    "樗",
    "樘",
    "樊",
    "槲",
    "醌",
    "醅",
    "靥",
    "魇",
    "餍",
    "磔",
    "磙",
    "霈",
    "辘",
    "龉",
    "龊",
    "觑",
    "瞌",
    "瞋",
    "瞑",
    "嘭",
    "噎",
    "噶",
    "颙",
    "暹",
    "噘",
    "踔",
    "踝",
    "踟",
    "踒",
    "踬",
    "踮",
    "踯",
    "踺",
    "踞",
    "蝽",
    "蝾",
    "蝻",
    "蝰",
    "蝮",
    "螋",
    "蝓",
    "蝣",
    "蝼",
    "噗",
    "嘬",
    "颚",
    "噍",
    "噢",
    "噙",
    "噜",
    "噌",
    "噔",
    "颛",
    "幞",
    "幡",
    "嶙",
    "嶝",
    "骺",
    "骼",
    "骸",
    "镊",
    "镉",
    "镌",
    "镍",
    "镏",
    "镒",
    "镓",
    "镔",
    "稷",
    "箴",
    "篑",
    "篁",
    "篌",
    "篆",
    "牖",
    "儋",
    "徵",
    "磐",
    "虢",
    "鹞",
    "膘",
    "滕",
    "鲠",
    "鲡",
    "鲢",
    "鲣",
    "鲥",
    "鲧",
    "鲩",
    "獗",
    "獠",
    "觯",
    "馓",
    "馔",
    "麾",
    "廛",
    "瘛",
    "瘼",
    "瘢",
    "瘠",
    "齑",
    "羯",
    "羰",
    "𥻗",
    "遴",
    "糌",
    "糍",
    "糅",
    "熜",
    "熵",
    "熠",
    "澍",
    "澌",
    "潸",
    "潦",
    "潲",
    "鋈",
    "潟",
    "潼",
    "潺",
    "憬",
    "憧",
    "寮",
    "窳",
    "谳",
    "褴",
    "褟",
    "褫",
    "谵",
    "熨",
    "屦",
    "嬉",
    "勰",
    "戮",
    "蝥",
    "缬",
    "缮",
    "缯",
    "骣",
    "畿",
    "耩",
    "耨",
    "耪",
    "璞",
    "璟",
    "靛",
    "璠",
    "璘",
    "聱",
    "螯",
    "髻",
    "髭",
    "髹",
    "擀",
    "熹",
    "甏",
    "擞",
    "縠",
    "磬",
    "颞",
    "蕻",
    "鞘",
    "颟",
    "薤",
    "薨",
    "檠",
    "薏",
    "薮",
    "薜",
    "薅",
    "樾",
    "橛",
    "橇",
    "樵",
    "檎",
    "橹",
    "樽",
    "樨",
    "橼",
    "墼",
    "橐",
    "翮",
    "醛",
    "醐",
    "醍",
    "醚",
    "磲",
    "赝",
    "飙",
    "殪",
    "霖",
    "霏",
    "霓",
    "錾",
    "辚",
    "臻",
    "遽",
    "氅",
    "瞟",
    "瞠",
    "瞰",
    "嚄",
    "嚆",
    "噤",
    "暾",
    "蹀",
    "踹",
    "踵",
    "踽",
    "蹉",
    "蹁",
    "螨",
    "蟒",
    "螈",
    "螅",
    "螭",
    "螠",
    "螟",
    "噱",
    "噬",
    "噫",
    "噻",
    "噼",
    "罹",
    "圜",
    "䦃",
    "镖",
    "镗",
    "镘",
    "镚",
    "镛",
    "镝",
    "镞",
    "镠",
    "氇",
    "氆",
    "憩",
    "穑",
    "篝",
    "篥",
    "篦",
    "篪",
    "篙",
    "盥",
    "劓",
    "翱",
    "魉",
    "魈",
    "徼",
    "歙",
    "膳",
    "膦",
    "膙",
    "鲮",
    "鲱",
    "鲲",
    "鲳",
    "鲴",
    "鲵",
    "鲷",
    "鲻",
    "獴",
    "獭",
    "獬",
    "邂",
    "鹧",
    "廨",
    "赟",
    "瘰",
    "廪",
    "瘿",
    "瘵",
    "瘴",
    "癃",
    "瘳",
    "斓",
    "麇",
    "麈",
    "嬴",
    "壅",
    "羲",
    "糗",
    "瞥",
    "甑",
    "燎",
    "燠",
    "燔",
    "燧",
    "濑",
    "濉",
    "潞",
    "澧",
    "澹",
    "澥",
    "澶",
    "濂",
    "褰",
    "寰",
    "窸",
    "褶",
    "禧",
    "嬖",
    "犟",
    "隰",
    "嬗",
    "颡",
    "缱",
    "缲",
    "缳",
    "璨",
    "璩",
    "璐",
    "璪",
    "螫",
    "擤",
    "壕",
    "觳",
    "罄",
    "擢",
    "薹",
    "鞡",
    "鞬",
    "薷",
    "薰",
    "藓",
    "藁",
    "檄",
    "檩",
    "懋",
    "醢",
    "翳",
    "礅",
    "磴",
    "鹩",
    "龋",
    "龌",
    "豳",
    "壑",
    "黻",
    "嚏",
    "嚅",
    "蹑",
    "蹒",
    "蹊",
    "蟥",
    "螬",
    "螵",
    "疃",
    "螳",
    "蟑",
    "嚓",
    "羁",
    "罽",
    "罾",
    "嶷",
    "黜",
    "黝",
    "髁",
    "髀",
    "镡",
    "镢",
    "镣",
    "镦",
    "镧",
    "镩",
    "镪",
    "镫",
    "罅",
    "黏",
    "簌",
    "篾",
    "篼",
    "簖",
    "簋",
    "鼢",
    "黛",
    "儡",
    "鹪",
    "鼾",
    "皤",
    "魍",
    "龠",
    "繇",
    "貘",
    "邈",
    "貔",
    "臌",
    "膻",
    "臆",
    "臃",
    "鲼",
    "鲽",
    "鳀",
    "鳃",
    "鳅",
    "鳇",
    "鳊",
    "螽",
    "燮",
    "鹫",
    "襄",
    "糜",
    "縻",
    "膺",
    "癍",
    "麋",
    "懑",
    "濡",
    "濮",
    "濞",
    "濠",
    "濯",
    "蹇",
    "謇",
    "邃",
    "襁",
    "檗",
    "擘",
    "孺",
    "隳",
    "嬷",
    "蟊",
    "鹬",
    "鍪",
    "鏊",
    "鳌",
    "鬈",
    "鬃",
    "瞽",
    "鞯",
    "鞨",
    "鞫",
    "鞧",
    "鞣",
    "藜",
    "藠",
    "藩",
    "醪",
    "蹙",
    "礓",
    "燹",
    "餮",
    "瞿",
    "曛",
    "颢",
    "曜",
    "躇",
    "蹚",
    "鹭",
    "蟛",
    "蟪",
    "蟠",
    "蟮",
    "鹮",
    "黠",
    "黟",
    "髅",
    "髂",
    "镬",
    "镭",
    "镯",
    "馥",
    "簟",
    "簪",
    "鼬",
    "雠",
    "艟",
    "鳎",
    "鳏",
    "鳐",
    "癞",
    "癔",
    "癜",
    "癖",
    "糨",
    "蹩",
    "鎏",
    "懵",
    "彝",
    "邋",
    "鬏",
    "攉",
    "攒",
    "鞲",
    "鞴",
    "藿",
    "蘧",
    "蘅",
    "麓",
    "醮",
    "醯",
    "酃",
    "霪",
    "霭",
    "霨",
    "黼",
    "嚯",
    "蹰",
    "蹶",
    "蹽",
    "蹼",
    "蹴",
    "蹾",
    "蹿",
    "蠖",
    "蠓",
    "蟾",
    "蠊",
    "黢",
    "髋",
    "髌",
    "镲",
    "籀",
    "籁",
    "齁",
    "魑",
    "艨",
    "鳓",
    "鳔",
    "鳕",
    "鳗",
    "鳙",
    "麒",
    "鏖",
    "羸",
    "㸆",
    "瀚",
    "瀣",
    "瀛",
    "襦",
    "谶",
    "襞",
    "骥",
    "缵",
    "瓒",
    "攘",
    "蘩",
    "蘖",
    "醴",
    "霰",
    "酆",
    "矍",
    "曦",
    "躅",
    "鼍",
    "巉",
    "黩",
    "黥",
    "黪",
    "镳",
    "镴",
    "黧",
    "纂",
    "璺",
    "鼯",
    "臜",
    "鳜",
    "鳝",
    "鳟",
    "獾",
    "孀",
    "骧",
    "瓘",
    "鼙",
    "醺",
    "礴",
    "颦",
    "曩",
    "鳢",
    "癫",
    "麝",
    "夔",
    "爝",
    "灏",
    "禳",
    "鐾",
    "羼",
    "蠡",
    "耱",
    "懿",
    "蘸",
    "鹳",
    "霾",
    "氍",
    "饕",
    "躐",
    "髑",
    "镵",
    "穰",
    "饔",
    "鬻",
    "鬟",
    "趱",
    "攫",
    "攥",
    "颧",
    "躜",
    "鼹",
    "癯",
    "麟",
    "蠲",
    "蠹",
    "躞",
    "衢",
    "鑫",
    "灞",
    "襻",
    "纛",
    "鬣",
    "攮",
    "囔",
    "馕",
    "戆",
    "爨",
    "齉",
    "亍",
    "尢",
    "彳",
    "卬",
    "殳",
    "𠙶",
    "毌",
    "邘",
    "戋",
    "圢",
    "氕",
    "伋",
    "仝",
    "冮",
    "氿",
    "汈",
    "氾",
    "忉",
    "宄",
    "讱",
    "扞",
    "圲",
    "圫",
    "芏",
    "芃",
    "朳",
    "朸",
    "𨙸",
    "邨",
    "吒",
    "吖",
    "屼",
    "屾",
    "辿",
    "钆",
    "仳",
    "伣",
    "伈",
    "癿",
    "甪",
    "邠",
    "犴",
    "冱",
    "邡",
    "闫",
    "汋",
    "䜣",
    "讻",
    "孖",
    "纩",
    "玒",
    "玓",
    "玘",
    "玚",
    "刬",
    "坜",
    "坉",
    "扽",
    "坋",
    "扺",
    "㧑",
    "毐",
    "芰",
    "芣",
    "苊",
    "苉",
    "芘",
    "芴",
    "芠",
    "芤",
    "杕",
    "杙",
    "杄",
    "杧",
    "杩",
    "尪",
    "尨",
    "轪",
    "坒",
    "芈",
    "旴",
    "旵",
    "呙",
    "㕮",
    "岍",
    "岠",
    "岜",
    "呇",
    "冏",
    "觃",
    "岙",
    "伾",
    "㑇",
    "伭",
    "佖",
    "伲",
    "佁",
    "飏",
    "狃",
    "闶",
    "汧",
    "汫",
    "𣲘",
    "𣲗",
    "沄",
    "沘",
    "汭",
    "㳇",
    "沇",
    "忮",
    "忳",
    "忺",
    "祃",
    "诇",
    "邲",
    "诎",
    "诐",
    "屃",
    "岊",
    "阽",
    "䢺",
    "阼",
    "妧",
    "妘",
    "𨚕",
    "纮",
    "驲",
    "纻",
    "纼",
    "玤",
    "玞",
    "玱",
    "玟",
    "邽",
    "邿",
    "坥",
    "坰",
    "坬",
    "坽",
    "弆",
    "耵",
    "䢼",
    "𦭜",
    "茋",
    "苧",
    "苾",
    "苠",
    "枅",
    "㭎",
    "枘",
    "枍",
    "矼",
    "矻",
    "匼",
    "旿",
    "昇",
    "昄",
    "昒",
    "昈",
    "咉",
    "咇",
    "咍",
    "岵",
    "岽",
    "岨",
    "岞",
    "峂",
    "㟃",
    "囷",
    "钐",
    "钔",
    "钖",
    "牥",
    "佴",
    "垈",
    "侁",
    "侹",
    "佸",
    "佺",
    "隹",
    "㑊",
    "侂",
    "佽",
    "侘",
    "郈",
    "舠",
    "郐",
    "郃",
    "攽",
    "肭",
    "肸",
    "肷",
    "狉",
    "狝",
    "饳",
    "忞",
    "於",
    "炌",
    "炆",
    "泙",
    "沺",
    "泂",
    "泜",
    "泃",
    "泇",
    "怊",
    "峃",
    "穸",
    "祋",
    "祊",
    "鸤",
    "弢",
    "弨",
    "陑",
    "陎",
    "卺",
    "乸",
    "妭",
    "姈",
    "迳",
    "叕",
    "驵",
    "䌹",
    "驺",
    "绋",
    "绐",
    "砉",
    "耔",
    "㛃",
    "玶",
    "珇",
    "珅",
    "珋",
    "玹",
    "珌",
    "玿",
    "韨",
    "垚",
    "垯",
    "垙",
    "垲",
    "埏",
    "垍",
    "耇",
    "垎",
    "垴",
    "垟",
    "垞",
    "挓",
    "垵",
    "垏",
    "拶",
    "荖",
    "荁",
    "荙",
    "荛",
    "茈",
    "茽",
    "荄",
    "茺",
    "荓",
    "茳",
    "𦰡",
    "茛",
    "荭",
    "㭕",
    "柷",
    "柃",
    "柊",
    "枹",
    "栐",
    "柖",
    "郚",
    "剅",
    "䴓",
    "迺",
    "厖",
    "砆",
    "砑",
    "砄",
    "耏",
    "奓",
    "䶮",
    "轵",
    "轷",
    "轹",
    "轺",
    "昺",
    "昽",
    "盷",
    "咡",
    "咺",
    "昳",
    "昣",
    "哒",
    "昤",
    "昫",
    "昡",
    "咥",
    "昪",
    "虷",
    "虸",
    "哃",
    "峘",
    "耑",
    "峛",
    "峗",
    "峧",
    "帡",
    "钘",
    "钜",
    "钪",
    "钬",
    "钭",
    "矧",
    "秬",
    "俫",
    "舁",
    "俜",
    "俙",
    "俍",
    "垕",
    "衎",
    "舣",
    "弇",
    "侴",
    "鸧",
    "䏡",
    "胠",
    "𦙶",
    "胈",
    "胩",
    "胣",
    "朏",
    "飐",
    "訄",
    "饻",
    "庤",
    "疢",
    "炣",
    "炟",
    "㶲",
    "洭",
    "洘",
    "洓",
    "洿",
    "㳚",
    "泚",
    "浈",
    "浉",
    "洸",
    "洑",
    "洢",
    "洈",
    "洚",
    "洺",
    "洨",
    "浐",
    "㳘",
    "洴",
    "洣",
    "恔",
    "宬",
    "窀",
    "扂",
    "袆",
    "祏",
    "祐",
    "祕",
    "叚",
    "陧",
    "陞",
    "娀",
    "姞",
    "姱",
    "姤",
    "姶",
    "姽",
    "枲",
    "绖",
    "骃",
    "彖",
    "骉",
    "恝",
    "珪",
    "珛",
    "珹",
    "琊",
    "玼",
    "珖",
    "珽",
    "珦",
    "珫",
    "珒",
    "珢",
    "珕",
    "珝",
    "埗",
    "垾",
    "垺",
    "埆",
    "垿",
    "埌",
    "埇",
    "莰",
    "茝",
    "鄀",
    "莶",
    "莝",
    "䓖",
    "莙",
    "栻",
    "桠",
    "桄",
    "梠",
    "栴",
    "梴",
    "栒",
    "酎",
    "酏",
    "砵",
    "砠",
    "砫",
    "砬",
    "硁",
    "恧",
    "翃",
    "郪",
    "𨐈",
    "辀",
    "辁",
    "剕",
    "赀",
    "哢",
    "晅",
    "晊",
    "唝",
    "哳",
    "哱",
    "冔",
    "晔",
    "晐",
    "晖",
    "畖",
    "蚄",
    "蚆",
    "帱",
    "崁",
    "峿",
    "崄",
    "帨",
    "崀",
    "赆",
    "钷",
    "眚",
    "甡",
    "笫",
    "倻",
    "倴",
    "脩",
    "倮",
    "倕",
    "倞",
    "倓",
    "倧",
    "衃",
    "虒",
    "舭",
    "舯",
    "舥",
    "瓞",
    "鬯",
    "鸰",
    "脎",
    "朓",
    "胲",
    "虓",
    "鱽",
    "狴",
    "峱",
    "狻",
    "眢",
    "勍",
    "痄",
    "疰",
    "痃",
    "竘",
    "羖",
    "羓",
    "桊",
    "敉",
    "烠",
    "烔",
    "烶",
    "烻",
    "涍",
    "浡",
    "浭",
    "浬",
    "涄",
    "涢",
    "涐",
    "浰",
    "浟",
    "浛",
    "浼",
    "浲",
    "涘",
    "悈",
    "悃",
    "悢",
    "宧",
    "窅",
    "窊",
    "窎",
    "扅",
    "扆",
    "袪",
    "袗",
    "袯",
    "祧",
    "隺",
    "堲",
    "疍",
    "𨺙",
    "陴",
    "烝",
    "砮",
    "㛚",
    "哿",
    "翀",
    "翂",
    "剟",
    "绤",
    "骍",
    "䂮",
    "琎",
    "珸",
    "珵",
    "琄",
    "琈",
    "琀",
    "珺",
    "掭",
    "堎",
    "堐",
    "埼",
    "掎",
    "埫",
    "堌",
    "晢",
    "掞",
    "埪",
    "壸",
    "㙍",
    "聍",
    "菝",
    "萚",
    "菥",
    "莿",
    "䓫",
    "勚",
    "䓬",
    "萆",
    "菂",
    "菍",
    "菼",
    "萣",
    "䓨",
    "菉",
    "䓛",
    "梼",
    "梽",
    "桲",
    "梾",
    "桯",
    "梣",
    "梌",
    "桹",
    "敔",
    "厣",
    "硔",
    "硙",
    "硚",
    "硊",
    "硍",
    "勔",
    "䴕",
    "龁",
    "逴",
    "唪",
    "啫",
    "翈",
    "㫰",
    "晙",
    "畤",
    "趼",
    "跂",
    "蛃",
    "蚲",
    "蚺",
    "啴",
    "䎃",
    "崧",
    "崟",
    "崞",
    "崒",
    "崌",
    "崡",
    "铏",
    "铕",
    "铖",
    "铘",
    "铚",
    "铞",
    "铥",
    "铴",
    "牻",
    "牿",
    "稆",
    "笱",
    "笯",
    "偰",
    "偡",
    "鸺",
    "偭",
    "偲",
    "偁",
    "㿠",
    "鄅",
    "偓",
    "徛",
    "衒",
    "舳",
    "舲",
    "鸼",
    "悆",
    "鄃",
    "瓻",
    "䝙",
    "脶",
    "脞",
    "脟",
    "䏲",
    "鱾",
    "猇",
    "猊",
    "猄",
    "觖",
    "𠅤",
    "庱",
    "庼",
    "庳",
    "痓",
    "䴔",
    "竫",
    "堃",
    "阌",
    "羝",
    "羕",
    "焆",
    "烺",
    "焌",
    "淏",
    "淟",
    "淜",
    "淴",
    "淯",
    "湴",
    "涴",
    "㥄",
    "惛",
    "惔",
    "悰",
    "惙",
    "寁",
    "逭",
    "袼",
    "裈",
    "祲",
    "谞",
    "艴",
    "弸",
    "弶",
    "隃",
    "婞",
    "娵",
    "婼",
    "媖",
    "婳",
    "婍",
    "婌",
    "婫",
    "婤",
    "婘",
    "婠",
    "绹",
    "骕",
    "絜",
    "珷",
    "琲",
    "琡",
    "琟",
    "琔",
    "琭",
    "堾",
    "堼",
    "揕",
    "㙘",
    "堧",
    "喆",
    "堨",
    "塅",
    "堠",
    "絷",
    "𡎚",
    "葜",
    "惎",
    "萳",
    "葙",
    "靬",
    "葴",
    "蒇",
    "蒈",
    "鄚",
    "蒉",
    "蓇",
    "萩",
    "蒐",
    "葰",
    "葎",
    "鄑",
    "蒎",
    "葖",
    "蒄",
    "萹",
    "棤",
    "棽",
    "棫",
    "椓",
    "椑",
    "鹀",
    "椆",
    "棓",
    "棬",
    "棪",
    "椀",
    "楗",
    "甦",
    "酦",
    "觌",
    "奡",
    "皕",
    "硪",
    "欹",
    "詟",
    "辌",
    "棐",
    "龂",
    "黹",
    "牚",
    "睎",
    "晫",
    "晪",
    "晱",
    "𧿹",
    "蛑",
    "畯",
    "斝",
    "喤",
    "崶",
    "嵁",
    "崾",
    "嵅",
    "崿",
    "嵚",
    "翙",
    "圌",
    "圐",
    "赑",
    "淼",
    "赒",
    "铹",
    "铽",
    "𨱇",
    "锊",
    "锍",
    "锎",
    "锓",
    "犇",
    "颋",
    "稌",
    "筀",
    "筘",
    "筜",
    "筥",
    "筅",
    "傃",
    "傉",
    "翛",
    "傒",
    "傕",
    "舾",
    "畬",
    "脿",
    "腘",
    "䐃",
    "腙",
    "腒",
    "鲃",
    "猰",
    "猯",
    "㺄",
    "馉",
    "鄗",
    "廋",
    "廆",
    "鄌",
    "粢",
    "遆",
    "旐",
    "焞",
    "欻",
    "𣸣",
    "溚",
    "溁",
    "湝",
    "渰",
    "湓",
    "㴔",
    "渟",
    "溠",
    "渼",
    "溇",
    "湣",
    "湑",
    "溞",
    "愐",
    "愃",
    "敩",
    "甯",
    "棨",
    "扊",
    "裣",
    "祼",
    "婻",
    "媆",
    "媞",
    "㛹",
    "媓",
    "媂",
    "媄",
    "毵",
    "矞",
    "缊",
    "缐",
    "骙",
    "瑃",
    "瑓",
    "瑅",
    "瑆",
    "䴖",
    "瑖",
    "瑝",
    "瑔",
    "瑀",
    "𤧛",
    "瑳",
    "瑂",
    "嶅",
    "瑑",
    "遘",
    "髢",
    "塥",
    "堽",
    "赪",
    "摛",
    "塝",
    "搒",
    "搌",
    "蒱",
    "蒨",
    "蓏",
    "蔀",
    "蓢",
    "蓂",
    "蒻",
    "蓣",
    "椹",
    "楪",
    "榃",
    "榅",
    "楒",
    "楞",
    "楩",
    "榇",
    "椸",
    "楙",
    "歅",
    "碃",
    "碏",
    "碈",
    "䃅",
    "硿",
    "鄠",
    "辒",
    "龆",
    "觜",
    "䣘",
    "暕",
    "鹍",
    "㬊",
    "暅",
    "跱",
    "蜐",
    "蜎",
    "嵲",
    "赗",
    "骱",
    "锖",
    "锘",
    "锳",
    "锧",
    "锪",
    "锫",
    "锬",
    "稑",
    "稙",
    "䅟",
    "筻",
    "筼",
    "筶",
    "筦",
    "筤",
    "傺",
    "鹎",
    "僇",
    "艅",
    "艉",
    "谼",
    "貆",
    "腽",
    "腨",
    "腯",
    "鲉",
    "鲊",
    "鲌",
    "䲟",
    "鲏",
    "雊",
    "猺",
    "飔",
    "觟",
    "𦝼",
    "馌",
    "裛",
    "廒",
    "瘀",
    "瘅",
    "鄘",
    "鹒",
    "鄜",
    "麀",
    "鄣",
    "阘",
    "煁",
    "煃",
    "煴",
    "煋",
    "煟",
    "煓",
    "滠",
    "溍",
    "溹",
    "滆",
    "滉",
    "溦",
    "溵",
    "漷",
    "滧",
    "滘",
    "滍",
    "愭",
    "慥",
    "慆",
    "塱",
    "裼",
    "禋",
    "禔",
    "禘",
    "禒",
    "谫",
    "鹔",
    "愍",
    "嫄",
    "媱",
    "戤",
    "戣",
    "缞",
    "耤",
    "瑧",
    "瑨",
    "瑱",
    "瑷",
    "瑢",
    "斠",
    "摏",
    "墕",
    "墈",
    "墐",
    "墘",
    "摴",
    "銎",
    "𡐓",
    "墚",
    "撖",
    "靽",
    "鞁",
    "蔌",
    "蔈",
    "蓰",
    "蔹",
    "蔊",
    "嘏",
    "榰",
    "榑",
    "槚",
    "𣗋",
    "槜",
    "榍",
    "疐",
    "酺",
    "酾",
    "酲",
    "酴",
    "碶",
    "䃎",
    "碨",
    "𥔲",
    "碹",
    "碥",
    "劂",
    "䴗",
    "夥",
    "瞍",
    "鹖",
    "㬎",
    "跽",
    "蜾",
    "幖",
    "嶍",
    "圙",
    "𨱏",
    "锺",
    "锼",
    "锽",
    "锾",
    "锿",
    "镃",
    "镄",
    "镅",
    "馝",
    "鹙",
    "箨",
    "箖",
    "劄",
    "僬",
    "僦",
    "僔",
    "僎",
    "槃",
    "㙦",
    "鲒",
    "鲕",
    "鲖",
    "鲗",
    "鲘",
    "鲙",
    "𩽾",
    "夐",
    "獍",
    "飗",
    "凘",
    "廑",
    "廙",
    "瘗",
    "瘥",
    "瘕",
    "鲝",
    "鄫",
    "熇",
    "漹",
    "漖",
    "潆",
    "漤",
    "潩",
    "漼",
    "漴",
    "㽏",
    "漈",
    "漋",
    "漻",
    "慬",
    "窬",
    "窭",
    "㮾",
    "褕",
    "禛",
    "禚",
    "隩",
    "嫕",
    "嫭",
    "嫜",
    "嫪",
    "㻬",
    "麹",
    "璆",
    "漦",
    "叇",
    "墣",
    "墦",
    "墡",
    "劐",
    "薁",
    "蕰",
    "蔃",
    "鼒",
    "槱",
    "鹝",
    "磏",
    "磉",
    "殣",
    "慭",
    "霅",
    "暵",
    "暲",
    "暶",
    "踦",
    "踣",
    "䗖",
    "蝘",
    "蝲",
    "蝤",
    "噇",
    "噂",
    "噀",
    "罶",
    "嶲",
    "嶓",
    "㠇",
    "嶟",
    "嶒",
    "镆",
    "镈",
    "镋",
    "镎",
    "镕",
    "稹",
    "儇",
    "皞",
    "皛",
    "䴘",
    "艎",
    "艏",
    "鹟",
    "𩾃",
    "鲦",
    "鲪",
    "鲬",
    "橥",
    "觭",
    "鹠",
    "鹡",
    "糇",
    "糈",
    "翦",
    "鹢",
    "鹣",
    "熛",
    "潖",
    "潵",
    "㵐",
    "澂",
    "澛",
    "瑬",
    "潽",
    "潾",
    "潏",
    "憭",
    "憕",
    "戭",
    "褯",
    "禤",
    "嫽",
    "遹",
    "璥",
    "璲",
    "璒",
    "憙",
    "擐",
    "鄹",
    "薳",
    "鞔",
    "黇",
    "蕗",
    "薢",
    "蕹",
    "橞",
    "橑",
    "橦",
    "醑",
    "觱",
    "磡",
    "𥕢",
    "磜",
    "豮",
    "鹾",
    "虤",
    "暿",
    "曌",
    "曈",
    "㬚",
    "蹅",
    "踶",
    "䗛",
    "螗",
    "疁",
    "㠓",
    "幪",
    "嶦",
    "𨱑",
    "馞",
    "穄",
    "篚",
    "篯",
    "簉",
    "鼽",
    "衠",
    "盦",
    "螣",
    "縢",
    "鲭",
    "鲯",
    "鲰",
    "鲺",
    "鲹",
    "亸",
    "癀",
    "瘭",
    "羱",
    "糒",
    "燋",
    "熻",
    "燊",
    "燚",
    "燏",
    "濩",
    "濋",
    "澪",
    "澽",
    "澴",
    "澭",
    "澼",
    "憷",
    "憺",
    "懔",
    "黉",
    "嬛",
    "鹨",
    "翯",
    "璱",
    "𤩽",
    "璬",
    "璮",
    "髽",
    "擿",
    "薿",
    "薸",
    "檑",
    "櫆",
    "檞",
    "醨",
    "繄",
    "磹",
    "磻",
    "瞫",
    "瞵",
    "蹐",
    "蟏",
    "㘎",
    "镤",
    "镥",
    "镨",
    "𨱔",
    "矰",
    "穙",
    "穜",
    "穟",
    "簕",
    "簃",
    "簏",
    "儦",
    "魋",
    "斶",
    "艚",
    "谿",
    "䲠",
    "鲾",
    "鲿",
    "鳁",
    "鳂",
    "鳈",
    "鳉",
    "獯",
    "䗪",
    "馘",
    "襕",
    "襚",
    "螱",
    "甓",
    "嬬",
    "嬥",
    "𦈡",
    "瓀",
    "釐",
    "鬶",
    "爇",
    "鞳",
    "鞮",
    "藟",
    "藦",
    "藨",
    "鹲",
    "檫",
    "黡",
    "礞",
    "礌",
    "𥖨",
    "蹢",
    "蹜",
    "蟫",
    "䗴",
    "嚚",
    "髃",
    "镮",
    "镱",
    "酂",
    "馧",
    "簠",
    "簝",
    "簰",
    "鼫",
    "鼩",
    "皦",
    "臑",
    "䲢",
    "鳑",
    "鳒",
    "鹱",
    "鹯",
    "癗",
    "𦒍",
    "旞",
    "翷",
    "冁",
    "䎖",
    "瀔",
    "瀍",
    "瀌",
    "襜",
    "䴙",
    "嚭",
    "㰀",
    "鬷",
    "醭",
    "蹯",
    "蠋",
    "翾",
    "鳘",
    "儳",
    "儴",
    "鼗",
    "𩾌",
    "鳚",
    "鳛",
    "麑",
    "麖",
    "蠃",
    "彟",
    "嬿",
    "鬒",
    "蘘",
    "欂",
    "醵",
    "颥",
    "甗",
    "𨟠",
    "巇",
    "酅",
    "髎",
    "犨",
    "𨭉",
    "㸌",
    "爔",
    "瀱",
    "瀹",
    "瀼",
    "瀵",
    "襫",
    "孅",
    "骦",
    "耰",
    "𤫉",
    "瓖",
    "鬘",
    "趯",
    "罍",
    "鼱",
    "鳠",
    "鳡",
    "鳣",
    "爟",
    "爚",
    "灈",
    "韂",
    "糵",
    "蘼",
    "礵",
    "鹴",
    "躔",
    "皭",
    "龢",
    "鳤",
    "亹",
    "籥",
    "鼷",
    "玃",
    "醾",
    "齇",
    "觿",
    "蠼",
    "𬣙",
    "𬇕",
    "𬣞",
    "𬘓",
    "𫭟",
    "𫭢",
    "𫇭",
    "𫐄",
    "𫵷",
    "𬇙",
    "𬣡",
    "𫸩",
    "𫘜",
    "𬘘",
    "𫘝",
    "𬨂",
    "𬀩",
    "𬀪",
    "𬬩",
    "𫍣",
    "𬣳",
    "𬩽",
    "𬮿",
    "𬯀",
    "𫰛",
    "𬳵",
    "𬳶",
    "𫠊",
    "𬍛",
    "鿍",
    "𬜬",
    "𪾢",
    "𪨰",
    "𫓧",
    "𬬮",
    "𬬱",
    "𬬭",
    "𬘡",
    "𬳽",
    "𬘩",
    "𫄧",
    "𪟝",
    "𬍤",
    "𫭼",
    "𬜯",
    "𬂩",
    "𫠆",
    "𬌗",
    "𫑡",
    "𪨶",
    "𬬸",
    "𬬻",
    "𬬹",
    "𬬿",
    "𬭁",
    "𫢸",
    "𫗧",
    "𬊈",
    "𬒈",
    "𬳿",
    "𫄨",
    "𬘫",
    "𫮃",
    "鿎",
    "𬱖",
    "𬟽",
    "𫓯",
    "𫟹",
    "𫟼",
    "𬇹",
    "𬍡",
    "𬤇",
    "𫍯",
    "𬤊",
    "𫍲",
    "𬯎",
    "𬘬",
    "𬘭",
    "𬴂",
    "𫘦",
    "𫟅",
    "𬘯",
    "𫘧",
    "𪣻",
    "𬃊",
    "𬷕",
    "𫐐",
    "𬹼",
    "𫶇",
    "𫖮",
    "鿏",
    "𬭊",
    "𫓶",
    "𬭎",
    "𫖯",
    "𬱟",
    "𫛭",
    "𫷷",
    "𬮱",
    "𬊤",
    "𬴃",
    "𫘨",
    "𬪩",
    "𬒔",
    "𬨎",
    "𫐓",
    "𫫇",
    "𫓹",
    "𬭚",
    "𬭛",
    "𬕂",
    "𬶋",
    "𬶍",
    "𫔶",
    "𫌀",
    "𫖳",
    "𫘪",
    "𫘬",
    "𫞩",
    "𪤗",
    "𬸘",
    "𬒗",
    "𫚖",
    "𬭤",
    "𫚕",
    "𬶐",
    "𬶏",
    "𬸚",
    "𬤝",
    "𬙂",
    "𬭩",
    "𬸣",
    "𫍽",
    "𬴊",
    "𬞟",
    "𫟦",
    "𬺈",
    "𫠜",
    "𪩘",
    "𬭬",
    "𬭯",
    "𫗴",
    "𬸦",
    "𫄷",
    "𬭳",
    "𬭶",
    "𫔍",
    "𬭸",
    "𬭼",
    "𫔎",
    "𬸪",
    "𬶟",
    "𬶠",
    "𬶨",
    "𫄸",
    "𬟁",
    "𬙊",
    "𬶭",
    "𬶮",
    "𬙋",
    "𬺓",
    "𫚭",
    "廠",
    "蔔",
    "兒",
    "幾",
    "幹",
    "虧",
    "纔",
    "與",
    "萬",
    "韆",
    "億",
    "個",
    "廣",
    "門",
    "義",
    "衛",
    "飛",
    "習",
    "馬",
    "鄉",
    "豐",
    "開",
    "無",
    "雲",
    "專",
    "藝",
    "廳",
    "區",
    "歷",
    "曆",
    "車",
    "貝",
    "岡",
    "見",
    "氣",
    "長",
    "僕",
    "幣",
    "僅",
    "從",
    "侖",
    "倉",
    "風",
    "烏",
    "鳳",
    "爲",
    "鬥",
    "憶",
    "計",
    "訂",
    "認",
    "譏",
    "醜",
    "隊",
    "辦",
    "鄧",
    "勸",
    "雙",
    "書",
    "擊",
    "撲",
    "節",
    "術",
    "厲",
    "龍",
    "滅",
    "軋",
    "東",
    "盧",
    "業",
    "舊",
    "帥",
    "歸",
    "葉",
    "電",
    "號",
    "衹",
    "隻",
    "嘰",
    "嘆",
    "們",
    "儀",
    "叢",
    "爾",
    "樂",
    "處",
    "鼕",
    "鳥",
    "務",
    "飢",
    "饑",
    "馮",
    "閃",
    "蘭",
    "匯",
    "彙",
    "頭",
    "漢",
    "寧",
    "討",
    "寫",
    "讓",
    "禮",
    "訓",
    "議",
    "訊",
    "記",
    "齣",
    "遼",
    "邊",
    "發",
    "髮",
    "聖",
    "對",
    "臺",
    "颱",
    "檯",
    "糾",
    "絲",
    "動",
    "鞏",
    "執",
    "擴",
    "掃",
    "場",
    "揚",
    "亞",
    "樸",
    "機",
    "權",
    "過",
    "協",
    "壓",
    "厭",
    "頁",
    "誇",
    "奪",
    "達",
    "夾",
    "軌",
    "堯",
    "劃",
    "邁",
    "畢",
    "貞",
    "師",
    "塵",
    "當",
    "噹",
    "籲",
    "嚇",
    "蟲",
    "麯",
    "團",
    "糰",
    "嗎",
    "嶼",
    "歲",
    "迴",
    "豈",
    "則",
    "剛",
    "網",
    "硃",
    "遷",
    "喬",
    "偉",
    "傳",
    "優",
    "傷",
    "價",
    "倫",
    "華",
    "僞",
    "嚮",
    "後",
    "會",
    "殺",
    "閤",
    "衆",
    "爺",
    "傘",
    "創",
    "雜",
    "負",
    "壯",
    "衝",
    "妝",
    "莊",
    "慶",
    "劉",
    "齊",
    "産",
    "閉",
    "問",
    "闖",
    "關",
    "燈",
    "湯",
    "興",
    "講",
    "諱",
    "軍",
    "訝",
    "許",
    "訛",
    "論",
    "訟",
    "農",
    "諷",
    "設",
    "訪",
    "訣",
    "尋",
    "盡",
    "儘",
    "導",
    "孫",
    "陣",
    "陽",
    "階",
    "陰",
    "婦",
    "媽",
    "戲",
    "觀",
    "歡",
    "買",
    "紅",
    "馱",
    "纖",
    "縴",
    "馴",
    "約",
    "級",
    "紀",
    "馳",
    "紉",
    "壽",
    "麥",
    "瑪",
    "進",
    "遠",
    "違",
    "韌",
    "運",
    "撫",
    "壇",
    "罎",
    "壞",
    "摳",
    "擾",
    "貢",
    "垻",
    "壩",
    "摺",
    "掄",
    "搶",
    "墳",
    "護",
    "殻",
    "塊",
    "聲",
    "報",
    "擬",
    "蕪",
    "葦",
    "蒼",
    "嚴",
    "蘆",
    "勞",
    "蘇",
    "囌",
    "極",
    "楊",
    "兩",
    "麗",
    "醫",
    "勵",
    "還",
    "殲",
    "來",
    "連",
    "軒",
    "鹵",
    "滷",
    "堅",
    "時",
    "縣",
    "裏",
    "嘔",
    "園",
    "曠",
    "圍",
    "噸",
    "郵",
    "睏",
    "員",
    "聽",
    "嗆",
    "嗚",
    "彆",
    "嶇",
    "崗",
    "帳",
    "財",
    "針",
    "釘",
    "亂",
    "體",
    "傭",
    "徹",
    "餘",
    "穀",
    "鄰",
    "腸",
    "龜",
    "猶",
    "狽",
    "條",
    "島",
    "飯",
    "飲",
    "係",
    "繫",
    "凍",
    "狀",
    "畝",
    "庫",
    "療",
    "應",
    "這",
    "廬",
    "閏",
    "閑",
    "間",
    "悶",
    "竈",
    "燦",
    "瀝",
    "淪",
    "滄",
    "溝",
    "滬",
    "瀋",
    "懷",
    "憂",
    "窮",
    "證",
    "啓",
    "評",
    "補",
    "識",
    "詐",
    "訴",
    "診",
    "詞",
    "譯",
    "靈",
    "層",
    "遲",
    "張",
    "際",
    "陸",
    "陳",
    "墜",
    "勁",
    "鷄",
    "緯",
    "驅",
    "純",
    "紗",
    "綱",
    "納",
    "駁",
    "縱",
    "紛",
    "紙",
    "紋",
    "紡",
    "驢",
    "紐",
    "環",
    "責",
    "現",
    "錶",
    "規",
    "攏",
    "揀",
    "擔",
    "頂",
    "擁",
    "勢",
    "攔",
    "擰",
    "撥",
    "擇",
    "蘋",
    "範",
    "莖",
    "樞",
    "櫃",
    "闆",
    "鬆",
    "槍",
    "楓",
    "構",
    "喪",
    "畫",
    "棗",
    "賣",
    "鬱",
    "礬",
    "礦",
    "碼",
    "厠",
    "奮",
    "態",
    "歐",
    "毆",
    "壟",
    "轟",
    "頃",
    "轉",
    "斬",
    "輪",
    "軟",
    "齒",
    "虜",
    "腎",
    "賢",
    "國",
    "暢",
    "嚨",
    "鳴",
    "羅",
    "幟",
    "嶺",
    "凱",
    "敗",
    "賬",
    "販",
    "貶",
    "購",
    "貯",
    "圖",
    "釣",
    "製",
    "颳",
    "俠",
    "僥",
    "偵",
    "側",
    "憑",
    "僑",
    "貨",
    "質",
    "徑",
    "捨",
    "覓",
    "貪",
    "貧",
    "膚",
    "腫",
    "脹",
    "骯",
    "脅",
    "魚",
    "獰",
    "備",
    "飾",
    "飽",
    "飼",
    "變",
    "龐",
    "廟",
    "瘧",
    "劑",
    "廢",
    "閘",
    "鬧",
    "鄭",
    "捲",
    "單",
    "爐",
    "淺",
    "濘",
    "瀉",
    "潑",
    "澤",
    "憐",
    "學",
    "寶",
    "寵",
    "審",
    "簾",
    "實",
    "試",
    "詩",
    "誠",
    "襯",
    "視",
    "話",
    "誕",
    "詭",
    "詢",
    "該",
    "詳",
    "肅",
    "録",
    "隸",
    "彌",
    "瀰",
    "陝",
    "駕",
    "參",
    "艱",
    "綫",
    "練",
    "組",
    "紳",
    "細",
    "駛",
    "織",
    "駒",
    "終",
    "駐",
    "絆",
    "駝",
    "紹",
    "繹",
    "經",
    "貫",
    "貳",
    "幫",
    "項",
    "挾",
    "撓",
    "趙",
    "擋",
    "墊",
    "擠",
    "揮",
    "薦",
    "帶",
    "繭",
    "蕩",
    "榮",
    "葷",
    "熒",
    "鬍",
    "蔭",
    "藥",
    "標",
    "棧",
    "棟",
    "欄",
    "檸",
    "樹",
    "鹹",
    "磚",
    "硯",
    "麵",
    "牽",
    "鷗",
    "殘",
    "軸",
    "輕",
    "鴉",
    "戰",
    "點",
    "臨",
    "覽",
    "竪",
    "嘗",
    "啞",
    "顯",
    "貴",
    "蝦",
    "蟻",
    "螞",
    "雖",
    "駡",
    "勛",
    "嘩",
    "響",
    "喲",
    "峽",
    "罰",
    "賤",
    "貼",
    "貽",
    "鈣",
    "鈍",
    "鈔",
    "鍾",
    "鐘",
    "鋼",
    "鈉",
    "鑰",
    "欽",
    "鈞",
    "鈎",
    "鈕",
    "氈",
    "氫",
    "選",
    "適",
    "種",
    "鞦",
    "復",
    "複",
    "倆",
    "貸",
    "順",
    "儉",
    "須",
    "鬚",
    "劍",
    "朧",
    "膽",
    "勝",
    "狹",
    "獅",
    "獨",
    "獄",
    "貿",
    "餌",
    "饒",
    "蝕",
    "餃",
    "餅",
    "巒",
    "彎",
    "將",
    "奬",
    "瘡",
    "瘋",
    "親",
    "閨",
    "聞",
    "閩",
    "閥",
    "閣",
    "養",
    "薑",
    "類",
    "婁",
    "總",
    "煉",
    "爍",
    "爛",
    "窪",
    "潔",
    "灑",
    "澆",
    "濁",
    "測",
    "瀏",
    "濟",
    "渾",
    "濃",
    "惱",
    "舉",
    "覺",
    "憲",
    "竊",
    "誡",
    "誣",
    "語",
    "襖",
    "誤",
    "誘",
    "誨",
    "説",
    "誦",
    "墾",
    "晝",
    "費",
    "遜",
    "隕",
    "險",
    "嬌",
    "賀",
    "壘",
    "綁",
    "絨",
    "結",
    "繞",
    "驕",
    "繪",
    "給",
    "絢",
    "駱",
    "絡",
    "絶",
    "絞",
    "駭",
    "統",
    "艷",
    "蠶",
    "頑",
    "盞",
    "撈",
    "載",
    "趕",
    "鹽",
    "損",
    "撿",
    "摯",
    "剝",
    "熱",
    "搗",
    "壺",
    "聶",
    "萊",
    "蓮",
    "獲",
    "穫",
    "惡",
    "噁",
    "瑩",
    "鶯",
    "檔",
    "橋",
    "樺",
    "樁",
    "樣",
    "賈",
    "礫",
    "礎",
    "顧",
    "轎",
    "較",
    "頓",
    "斃",
    "緻",
    "慮",
    "監",
    "緊",
    "黨",
    "曬",
    "曉",
    "嘮",
    "鴨",
    "暈",
    "鴦",
    "罷",
    "圓",
    "賊",
    "賄",
    "賂",
    "贜",
    "錢",
    "鉗",
    "鑽",
    "鉀",
    "鐵",
    "鈴",
    "鉛",
    "犧",
    "敵",
    "積",
    "稱",
    "筆",
    "債",
    "傾",
    "賃",
    "艦",
    "艙",
    "聳",
    "愛",
    "頒",
    "頌",
    "臟",
    "髒",
    "臍",
    "膠",
    "腦",
    "膿",
    "鴕",
    "鴛",
    "皺",
    "餓",
    "餒",
    "戀",
    "槳",
    "漿",
    "準",
    "癥",
    "齋",
    "離",
    "資",
    "競",
    "閲",
    "煩",
    "燒",
    "燭",
    "遞",
    "濤",
    "澇",
    "渦",
    "塗",
    "滌",
    "潤",
    "澗",
    "漲",
    "燙",
    "澀",
    "憫",
    "寬",
    "傢",
    "賓",
    "竅",
    "請",
    "諸",
    "諾",
    "讀",
    "誹",
    "襪",
    "課",
    "誰",
    "調",
    "諒",
    "諄",
    "談",
    "誼",
    "懇",
    "劇",
    "難",
    "預",
    "絹",
    "綉",
    "驗",
    "繼",
    "駿",
    "瑣",
    "擲",
    "據",
    "摻",
    "職",
    "蘿",
    "螢",
    "營",
    "蕭",
    "薩",
    "夢",
    "檢",
    "醖",
    "碩",
    "聾",
    "襲",
    "輔",
    "輛",
    "顱",
    "懸",
    "躍",
    "纍",
    "囉",
    "嘯",
    "嶄",
    "邏",
    "嬰",
    "銬",
    "鐺",
    "鋁",
    "銅",
    "銘",
    "鏟",
    "銀",
    "矯",
    "穢",
    "籠",
    "償",
    "軀",
    "釁",
    "銜",
    "盤",
    "鴿",
    "斂",
    "領",
    "臉",
    "獵",
    "餡",
    "館",
    "癢",
    "鏇",
    "閻",
    "闡",
    "蓋",
    "斷",
    "獸",
    "鴻",
    "漸",
    "淵",
    "漁",
    "澱",
    "滲",
    "慚",
    "懼",
    "驚",
    "慘",
    "慣",
    "謀",
    "諜",
    "謊",
    "諧",
    "禱",
    "禍",
    "謂",
    "諺",
    "謎",
    "彈",
    "墮",
    "隨",
    "隱",
    "嬸",
    "頗",
    "頸",
    "績",
    "緒",
    "續",
    "騎",
    "綽",
    "繩",
    "維",
    "綿",
    "綳",
    "綢",
    "綜",
    "綻",
    "緑",
    "綴",
    "瓊",
    "趨",
    "攬",
    "攙",
    "擱",
    "摟",
    "攪",
    "聯",
    "蔣",
    "韓",
    "橢",
    "確",
    "頰",
    "靂",
    "暫",
    "翹",
    "輩",
    "鑿",
    "輝",
    "賞",
    "睞",
    "噴",
    "疇",
    "踐",
    "遺",
    "鵑",
    "賦",
    "賭",
    "贖",
    "賜",
    "賠",
    "鑄",
    "鋪",
    "鏈",
    "銷",
    "鎖",
    "鋤",
    "鍋",
    "銹",
    "鋒",
    "鋅",
    "鋭",
    "鵝",
    "築",
    "篩",
    "儲",
    "懲",
    "禦",
    "釋",
    "臘",
    "魯",
    "憊",
    "饋",
    "饞",
    "裝",
    "蠻",
    "闊",
    "糞",
    "滯",
    "濕",
    "潰",
    "濺",
    "灣",
    "憤",
    "竄",
    "窩",
    "褲",
    "禪",
    "謝",
    "謡",
    "謗",
    "謙",
    "屬",
    "屢",
    "緬",
    "纜",
    "緝",
    "緞",
    "緩",
    "締",
    "縷",
    "騙",
    "編",
    "騷",
    "緣",
    "鵡",
    "攝",
    "擺",
    "襬",
    "攤",
    "鵲",
    "藍",
    "濛",
    "懞",
    "矇",
    "獻",
    "欖",
    "樓",
    "賴",
    "礙",
    "尷",
    "霧",
    "輻",
    "輯",
    "輸",
    "頻",
    "齡",
    "鑒",
    "蹺",
    "蝸",
    "錯",
    "錨",
    "錫",
    "鑼",
    "錘",
    "錐",
    "錦",
    "鍵",
    "鋸",
    "錳",
    "辭",
    "頽",
    "籌",
    "簽",
    "籤",
    "簡",
    "膩",
    "鵬",
    "騰",
    "鮑",
    "穎",
    "觸",
    "雛",
    "饃",
    "餾",
    "醬",
    "謄",
    "糧",
    "數",
    "滿",
    "濾",
    "濫",
    "灕",
    "濱",
    "灘",
    "譽",
    "窺",
    "寢",
    "謹",
    "謬",
    "闢",
    "縛",
    "縫",
    "纏",
    "繽",
    "贅",
    "墻",
    "衊",
    "藹",
    "檻",
    "釀",
    "願",
    "轄",
    "輾",
    "顆",
    "踴",
    "蠟",
    "蠅",
    "蟬",
    "賺",
    "鍬",
    "鍛",
    "鍍",
    "穩",
    "籮",
    "簫",
    "輿",
    "鮮",
    "饅",
    "瀟",
    "賽",
    "譚",
    "譜",
    "騾",
    "縮",
    "攆",
    "聰",
    "藴",
    "櫻",
    "飄",
    "黴",
    "瞞",
    "題",
    "囑",
    "鎮",
    "鎬",
    "鎊",
    "簍",
    "鯉",
    "鯽",
    "癟",
    "癱",
    "顔",
    "鯊",
    "瀾",
    "額",
    "譴",
    "鶴",
    "繚",
    "顛",
    "轍",
    "鸚",
    "贈",
    "鏡",
    "贊",
    "籃",
    "籬",
    "鯨",
    "癮",
    "辯",
    "瀕",
    "懶",
    "繮",
    "繳",
    "矚",
    "贍",
    "鰐",
    "辮",
    "贏",
    "驟",
    "囂",
    "鐮",
    "鰭",
    "鷹",
    "巔",
    "顫",
    "癬",
    "鱉",
    "鬢",
    "鱗",
    "躪",
    "贛",
    "鑲",
    "韋",
    "閂",
    "訃",
    "勱",
    "芻",
    "鄺",
    "訐",
    "訌",
    "訕",
    "訖",
    "馭",
    "璣",
    "壙",
    "捫",
    "薌",
    "厙",
    "釔",
    "傴",
    "倀",
    "傖",
    "獷",
    "獁",
    "鳬",
    "鄔",
    "餳",
    "懺",
    "謳",
    "詎",
    "訥",
    "紆",
    "紂",
    "紇",
    "紈",
    "璵",
    "摶",
    "塢",
    "㩳",
    "蕓",
    "藶",
    "莧",
    "萇",
    "蓯",
    "磯",
    "奩",
    "歟",
    "軔",
    "鄴",
    "嘸",
    "囈",
    "嚦",
    "暘",
    "唄",
    "幃",
    "峴",
    "嵐",
    "圇",
    "釗",
    "釙",
    "釕",
    "僉",
    "鳩",
    "鄒",
    "飩",
    "餼",
    "飪",
    "飫",
    "飭",
    "廡",
    "癤",
    "闈",
    "閎",
    "閔",
    "煬",
    "灃",
    "漚",
    "渢",
    "潙",
    "憮",
    "慪",
    "愾",
    "悵",
    "愴",
    "詁",
    "訶",
    "詛",
    "詆",
    "謅",
    "詔",
    "詒",
    "隴",
    "陘",
    "嫵",
    "嫗",
    "嬀",
    "剄",
    "紜",
    "紕",
    "紝",
    "綸",
    "紓",
    "瑋",
    "匭",
    "壚",
    "擓",
    "蘢",
    "蔦",
    "塋",
    "煢",
    "櫪",
    "梘",
    "棖",
    "樅",
    "碭",
    "甌",
    "郟",
    "軛",
    "鳶",
    "曇",
    "蟣",
    "黽",
    "嚀",
    "噝",
    "巋",
    "劌",
    "剴",
    "嶧",
    "釷",
    "釺",
    "釧",
    "釩",
    "釹",
    "釵",
    "儈",
    "儕",
    "儂",
    "劊",
    "慫",
    "糴",
    "戧",
    "膞",
    "邇",
    "梟",
    "餞",
    "飴",
    "癘",
    "瘍",
    "煒",
    "熰",
    "熗",
    "瀧",
    "瀘",
    "濼",
    "涇",
    "㥮",
    "懌",
    "誆",
    "誄",
    "詿",
    "詰",
    "詼",
    "鄆",
    "禕",
    "誅",
    "詵",
    "詬",
    "詮",
    "詣",
    "諍",
    "詫",
    "諢",
    "詡",
    "駑",
    "紺",
    "紲",
    "紱",
    "駟",
    "駙",
    "縐",
    "絀",
    "驛",
    "駘",
    "瓏",
    "頇",
    "埡",
    "撾",
    "撻",
    "賁",
    "壋",
    "撏",
    "莢",
    "貰",
    "蓽",
    "蕎",
    "薈",
    "薺",
    "堊",
    "滎",
    "犖",
    "蕁",
    "藎",
    "蓀",
    "蕒",
    "葤",
    "櫛",
    "櫳",
    "櫨",
    "櫟",
    "檉",
    "酈",
    "硨",
    "碸",
    "殤",
    "軲",
    "軻",
    "轤",
    "軼",
    "軫",
    "蠆",
    "覘",
    "瞘",
    "嘵",
    "嗶",
    "噦",
    "剮",
    "鄖",
    "噲",
    "噥",
    "嶢",
    "幀",
    "嶠",
    "貺",
    "鈈",
    "鈦",
    "鋇",
    "鈑",
    "鈐",
    "鎢",
    "鈁",
    "鈀",
    "篤",
    "儔",
    "儼",
    "儷",
    "腖",
    "臚",
    "脛",
    "鴇",
    "獪",
    "颮",
    "猻",
    "餉",
    "餄",
    "餎",
    "孿",
    "孌",
    "癧",
    "瘲",
    "颯",
    "闥",
    "閭",
    "闓",
    "閡",
    "熾",
    "烴",
    "浹",
    "澮",
    "滸",
    "潯",
    "濜",
    "慟",
    "懨",
    "愷",
    "惻",
    "惲",
    "誚",
    "禰",
    "誥",
    "誑",
    "鴆",
    "婭",
    "嬈",
    "懟",
    "絝",
    "驍",
    "驊",
    "絎",
    "絳",
    "駢",
    "頊",
    "璫",
    "琿",
    "塒",
    "塤",
    "堝",
    "贄",
    "蒔",
    "萵",
    "蕕",
    "鴣",
    "蒓",
    "橈",
    "楨",
    "榿",
    "檜",
    "邐",
    "礪",
    "礱",
    "軾",
    "輊",
    "輅",
    "鶇",
    "躉",
    "齔",
    "鸕",
    "矓",
    "嘜",
    "鴞",
    "蜆",
    "嗩",
    "嶗",
    "崍",
    "覬",
    "賅",
    "鈺",
    "鉦",
    "鈷",
    "鉢",
    "鈸",
    "鉞",
    "鉭",
    "鉬",
    "鈿",
    "鈾",
    "鉑",
    "鑠",
    "鉚",
    "鈰",
    "鉉",
    "鉈",
    "鉍",
    "鈮",
    "鈹",
    "鏺",
    "鐸",
    "氬",
    "筧",
    "頎",
    "徠",
    "膾",
    "鴟",
    "璽",
    "鴝",
    "獫",
    "裊",
    "餑",
    "欒",
    "攣",
    "癰",
    "痙",
    "頏",
    "閫",
    "鬮",
    "誾",
    "閬",
    "鄲",
    "燁",
    "燴",
    "燼",
    "淶",
    "漣",
    "潿",
    "慳",
    "諏",
    "諑",
    "禎",
    "諉",
    "諛",
    "諗",
    "諂",
    "誶",
    "媧",
    "嫻",
    "綆",
    "驪",
    "綃",
    "騁",
    "綏",
    "縧",
    "綈",
    "駸",
    "鷥",
    "燾",
    "璉",
    "麩",
    "擄",
    "摑",
    "鷙",
    "撣",
    "慤",
    "摜",
    "縈",
    "槤",
    "覡",
    "欞",
    "嗇",
    "匱",
    "硤",
    "磽",
    "鴯",
    "龔",
    "殞",
    "殮",
    "賚",
    "輒",
    "塹",
    "嘖",
    "囀",
    "嚙",
    "蹌",
    "蠣",
    "蠱",
    "蟶",
    "幘",
    "幗",
    "賕",
    "賑",
    "賒",
    "銠",
    "鉺",
    "鋏",
    "鐃",
    "銦",
    "鎧",
    "鍘",
    "銖",
    "銑",
    "鋌",
    "鏵",
    "銓",
    "鎩",
    "鉿",
    "銚",
    "鉻",
    "錚",
    "銫",
    "鉸",
    "銥",
    "銃",
    "銨",
    "銣",
    "鴰",
    "穠",
    "箋",
    "籩",
    "僨",
    "僂",
    "皚",
    "鴴",
    "艫",
    "龕",
    "玀",
    "獼",
    "餜",
    "餛",
    "鸞",
    "闍",
    "閾",
    "閹",
    "閶",
    "鬩",
    "閽",
    "閼",
    "羥",
    "糲",
    "燜",
    "漬",
    "瀆",
    "澠",
    "愜",
    "憚",
    "諶",
    "諫",
    "皸",
    "謔",
    "襠",
    "謁",
    "諤",
    "諭",
    "諼",
    "讒",
    "諳",
    "諦",
    "諞",
    "糶",
    "嬋",
    "綾",
    "騏",
    "綺",
    "緋",
    "緔",
    "騍",
    "緄",
    "騅",
    "綬",
    "綹",
    "綣",
    "綰",
    "驂",
    "緇",
    "靚",
    "輦",
    "黿",
    "頡",
    "撳",
    "蟄",
    "壪",
    "蔞",
    "櫝",
    "欏",
    "賫",
    "鵓",
    "鸝",
    "殫",
    "輥",
    "輞",
    "槧",
    "輟",
    "輜",
    "瞼",
    "躒",
    "蛺",
    "蟯",
    "螄",
    "蠐",
    "嘍",
    "嶸",
    "嶁",
    "賧",
    "鋙",
    "錸",
    "鏗",
    "鋥",
    "鋰",
    "鋯",
    "鋨",
    "銼",
    "鐧",
    "銻",
    "鋃",
    "鋦",
    "錒",
    "犢",
    "鵠",
    "篳",
    "牘",
    "儻",
    "儐",
    "儺",
    "嬃",
    "頜",
    "鵒",
    "魷",
    "魨",
    "魴",
    "潁",
    "颶",
    "觴",
    "熲",
    "餷",
    "餿",
    "褻",
    "臠",
    "癆",
    "癇",
    "賡",
    "頦",
    "鷳",
    "闌",
    "闃",
    "闋",
    "鵜",
    "憒",
    "嚳",
    "謨",
    "褳",
    "襇",
    "讜",
    "謖",
    "謚",
    "謐",
    "騭",
    "巰",
    "翬",
    "騖",
    "緙",
    "緗",
    "緘",
    "緹",
    "緲",
    "緦",
    "緱",
    "縋",
    "緡",
    "饗",
    "耮",
    "驁",
    "韞",
    "攄",
    "擯",
    "轂",
    "驀",
    "鶓",
    "薊",
    "蘺",
    "鎣",
    "頤",
    "櫚",
    "櫸",
    "磧",
    "磣",
    "鵪",
    "輳",
    "齟",
    "齙",
    "韙",
    "囁",
    "躂",
    "蹕",
    "躚",
    "躋",
    "噯",
    "鍺",
    "錛",
    "錡",
    "鍀",
    "錁",
    "錕",
    "錮",
    "鍁",
    "錈",
    "錠",
    "錙",
    "覦",
    "頷",
    "鮁",
    "鮃",
    "鮎",
    "鱸",
    "穌",
    "鮒",
    "鮐",
    "鵮",
    "颼",
    "饈",
    "鶉",
    "瘮",
    "闔",
    "闐",
    "闕",
    "灧",
    "瀅",
    "潷",
    "灤",
    "澦",
    "懾",
    "鱟",
    "騫",
    "竇",
    "謾",
    "謫",
    "嬡",
    "嬪",
    "縉",
    "縝",
    "縟",
    "轡",
    "騮",
    "縞",
    "縭",
    "縊",
    "縑",
    "騸",
    "覯",
    "韜",
    "靉",
    "攖",
    "薔",
    "藺",
    "鶘",
    "檳",
    "櫧",
    "釅",
    "殯",
    "霽",
    "轅",
    "齜",
    "齦",
    "瞜",
    "曖",
    "躊",
    "蟈",
    "鶚",
    "嚶",
    "羆",
    "賻",
    "罌",
    "鶻",
    "鍥",
    "鍇",
    "鍶",
    "鍔",
    "鍤",
    "鏘",
    "鎂",
    "鏤",
    "簀",
    "篋",
    "簞",
    "籙",
    "臏",
    "鮭",
    "鮪",
    "鱭",
    "鮫",
    "鱘",
    "饉",
    "鑾",
    "瘻",
    "闞",
    "鮝",
    "糝",
    "鷀",
    "瀲",
    "濰",
    "譖",
    "褸",
    "譙",
    "讕",
    "譎",
    "鶥",
    "嬙",
    "鶩",
    "驃",
    "縹",
    "縵",
    "縲",
    "纓",
    "驄",
    "繆",
    "繅",
    "耬",
    "瓔",
    "擷",
    "擼",
    "攛",
    "聵",
    "覲",
    "韃",
    "鞽",
    "蘄",
    "賾",
    "檣",
    "靨",
    "魘",
    "饜",
    "轆",
    "齬",
    "齪",
    "覷",
    "顒",
    "躓",
    "躑",
    "蠑",
    "螻",
    "顎",
    "嚕",
    "顓",
    "鑷",
    "鎘",
    "鎸",
    "鎳",
    "鎦",
    "鎰",
    "鎵",
    "鑌",
    "簣",
    "鷂",
    "鯁",
    "鱺",
    "鰱",
    "鰹",
    "鰣",
    "鯀",
    "鯇",
    "觶",
    "饊",
    "饌",
    "齏",
    "讞",
    "襤",
    "譫",
    "屨",
    "纈",
    "繕",
    "繒",
    "驏",
    "擻",
    "顳",
    "顢",
    "藪",
    "櫓",
    "櫞",
    "贋",
    "飆",
    "鏨",
    "轔",
    "蟎",
    "鐯",
    "鏢",
    "鏜",
    "鏝",
    "鏰",
    "鏞",
    "鏑",
    "鏃",
    "鏐",
    "氌",
    "穡",
    "魎",
    "鯪",
    "鯡",
    "鯤",
    "鯧",
    "鯝",
    "鯢",
    "鯛",
    "鯔",
    "獺",
    "鷓",
    "贇",
    "癭",
    "斕",
    "瀨",
    "顙",
    "繾",
    "繰",
    "繯",
    "蘚",
    "鷯",
    "齲",
    "齷",
    "躡",
    "蹣",
    "羈",
    "鐔",
    "鐝",
    "鐐",
    "鐓",
    "鑭",
    "鑹",
    "鏹",
    "鐙",
    "籪",
    "鷦",
    "鱝",
    "鰈",
    "鯷",
    "鰓",
    "鰍",
    "鰉",
    "鯿",
    "鷲",
    "懣",
    "鷸",
    "鰲",
    "韉",
    "顥",
    "鷺",
    "䴉",
    "髏",
    "鑊",
    "鐳",
    "鐲",
    "讎",
    "鰨",
    "鰥",
    "鰩",
    "癩",
    "攢",
    "靄",
    "躥",
    "髖",
    "髕",
    "鑔",
    "籟",
    "鰳",
    "鰾",
    "鱈",
    "鰻",
    "鱅",
    "讖",
    "驥",
    "纘",
    "瓚",
    "鼉",
    "黷",
    "黲",
    "鑣",
    "鑞",
    "臢",
    "鱖",
    "鱔",
    "鱒",
    "驤",
    "顰",
    "鱧",
    "癲",
    "灝",
    "鸛",
    "鑱",
    "趲",
    "顴",
    "躦",
    "饢",
    "戇",
    "戔",
    "訏",
    "訒",
    "釓",
    "俔",
    "閆",
    "澫",
    "訢",
    "訩",
    "詝",
    "紃",
    "纊",
    "瑒",
    "剗",
    "塸",
    "壢",
    "埨",
    "撝",
    "蔿",
    "榪",
    "軑",
    "軏",
    "咼",
    "㠣",
    "覎",
    "㑳",
    "颺",
    "閌",
    "潕",
    "湋",
    "澐",
    "浿",
    "諓",
    "禡",
    "詗",
    "詘",
    "詖",
    "屓",
    "彄",
    "紘",
    "馹",
    "馼",
    "紵",
    "紞",
    "駃",
    "紖",
    "瑲",
    "薴",
    "棡",
    "軝",
    "暐",
    "晛",
    "崬",
    "釴",
    "釤",
    "鍆",
    "鍚",
    "鄶",
    "獮",
    "飿",
    "嶨",
    "詷",
    "詪",
    "鄩",
    "鳲",
    "隑",
    "隮",
    "娙",
    "逕",
    "駓",
    "駔",
    "駉",
    "絅",
    "騶",
    "䮄",
    "紼",
    "紿",
    "瓅",
    "韍",
    "墶",
    "塏",
    "薘",
    "蕘",
    "蔄",
    "葒",
    "鳾",
    "龑",
    "軹",
    "軤",
    "轢",
    "軺",
    "睍",
    "曨",
    "噠",
    "鈃",
    "鈇",
    "鉅",
    "鋹",
    "釿",
    "錀",
    "鈧",
    "鈥",
    "鈄",
    "倈",
    "艤",
    "鶬",
    "颭",
    "餏",
    "湞",
    "溮",
    "滻",
    "褘",
    "絰",
    "駰",
    "絪",
    "駪",
    "綎",
    "綖",
    "驫",
    "勣",
    "璕",
    "𡑍",
    "䓣",
    "薟",
    "藭",
    "椏",
    "梜",
    "頍",
    "硜",
    "輄",
    "輈",
    "輇",
    "貲",
    "嗊",
    "曄",
    "暉",
    "鄳",
    "幬",
    "輋",
    "嶮",
    "贐",
    "鉥",
    "鉕",
    "鑪",
    "鉮",
    "鉊",
    "鉧",
    "僤",
    "鴒",
    "魛",
    "餗",
    "燖",
    "溳",
    "礐",
    "窵",
    "襏",
    "駼",
    "絺",
    "綌",
    "騂",
    "綄",
    "璡",
    "墠",
    "壼",
    "聹",
    "蘀",
    "勩",
    "罃",
    "檮",
    "棶",
    "厴",
    "䃮",
    "磑",
    "礄",
    "鴷",
    "齕",
    "頔",
    "廼",
    "凢",
    "亾",
    "枒",
    "屍",
    "匃",
    "匄",
    "紥",
    "紮",
    "疋",
    "殀",
    "讐",
    "觔",
    "兇",
    "宂",
    "㕥",
    "㠯",
    "栞",
    "佈",
    "佔",
    "呌",
    "敂",
    "冄",
    "坵",
    "僊",
    "怱",
    "悤",
    "冊",
    "夘",
    "戼",
    "牠",
    "妳",
    "嬭",
    "摃",
    "釦",
    "攷",
    "託",
    "衺",
    "衕",
    "弔",
    "喫",
    "囙",
    "㠶",
    "颿",
    "秊",
    "倣",
    "髣",
    "佀",
    "朶",
    "氷",
    "決",
    "併",
    "並",
    "竝",
    "汙",
    "汚",
    "異",
    "姦",
    "廵",
    "挵",
    "衖",
    "搤",
    "阯",
    "撦",
    "埳",
    "阬",
    "誌",
    "㕁",
    "卻",
    "刦",
    "刧",
    "刼",
    "芲",
    "蘤",
    "桿",
    "槓",
    "荳",
    "獃",
    "唫",
    "脗",
    "皁",
    "彿",
    "髴",
    "疘",
    "刪",
    "鉋",
    "鑤",
    "況",
    "牀",
    "恡",
    "棄",
    "洶",
    "汎",
    "災",
    "烖",
    "菑",
    "禩",
    "侷",
    "跼",
    "坿",
    "玅",
    "姉",
    "妬",
    "翫",
    "搨",
    "柺",
    "拕",
    "牴",
    "觝",
    "倖",
    "抝",
    "盃",
    "桮",
    "傑",
    "逩",
    "肎",
    "菓",
    "崐",
    "崑",
    "呪",
    "虖",
    "嘑",
    "謼",
    "詠",
    "㟁",
    "嵒",
    "巗",
    "巖",
    "雰",
    "稈",
    "咊",
    "嶽",
    "妷",
    "姪",
    "廹",
    "徃",
    "餚",
    "採",
    "寀",
    "唸",
    "週",
    "昬",
    "兎",
    "兔",
    "亯",
    "亱",
    "䘚",
    "淨",
    "劵",
    "匟",
    "㳒",
    "灋",
    "洩",
    "霑",
    "淚",
    "註",
    "恠",
    "箒",
    "屆",
    "絃",
    "圅",
    "旾",
    "珎",
    "掛",
    "垜",
    "艸",
    "茘",
    "査",
    "栢",
    "柵",
    "栁",
    "桺",
    "柹",
    "韮",
    "揹",
    "昰",
    "閧",
    "鬨",
    "冐",
    "暎",
    "嚥",
    "倃",
    "𠴰",
    "偺",
    "喒",
    "齩",
    "欬",
    "榘",
    "㑺",
    "儁",
    "敍",
    "敘",
    "肧",
    "脈",
    "䘑",
    "衇",
    "跡",
    "蹟",
    "砲",
    "礮",
    "薙",
    "鬀",
    "恆",
    "怳",
    "卹",
    "䘏",
    "賉",
    "婣",
    "畊",
    "揑",
    "綑",
    "輓",
    "恥",
    "躭",
    "晉",
    "棲",
    "覈",
    "慄",
    "翄",
    "脣",
    "槕",
    "㨪",
    "螡",
    "蟁",
    "㤙",
    "陗",
    "峩",
    "峯",
    "乗",
    "椉",
    "咲",
    "筍",
    "俛",
    "頫",
    "勌",
    "䠶",
    "躳",
    "慇",
    "拏",
    "㧱",
    "挐",
    "脃",
    "胷",
    "肐",
    "貍",
    "㽞",
    "畱",
    "淒",
    "悽",
    "蓆",
    "効",
    "傚",
    "涼",
    "缾",
    "菸",
    "煙",
    "淛",
    "湧",
    "誖",
    "猂",
    "醼",
    "讌",
    "㝠",
    "寃",
    "孃",
    "桒",
    "毬",
    "瑠",
    "璢",
    "瑯",
    "㨗",
    "搥",
    "搯",
    "蔆",
    "惏",
    "楳",
    "槑",
    "捄",
    "廂",
    "慽",
    "慼",
    "瞇",
    "埜",
    "畧",
    "虵",
    "稭",
    "棃",
    "犂",
    "迻",
    "媮",
    "兠",
    "舩",
    "慾",
    "綵",
    "腳",
    "𩓐",
    "夠",
    "豬",
    "貓",
    "湊",
    "減",
    "庻",
    "蔴",
    "菴",
    "朢",
    "睠",
    "觕",
    "麤",
    "釬",
    "銲",
    "痳",
    "殽",
    "婬",
    "滛",
    "湻",
    "㴱",
    "樑",
    "顇",
    "㝛",
    "窰",
    "窯",
    "琹",
    "欵",
    "墖",
    "趂",
    "隄",
    "愽",
    "揷",
    "揫",
    "煑",
    "朞",
    "㪚",
    "塟",
    "蔥",
    "蔕",
    "稜",
    "棊",
    "碁",
    "椶",
    "偪",
    "㕑",
    "廚",
    "廈",
    "鴈",
    "冣",
    "㝡",
    "晳",
    "鼃",
    "餧",
    "餵",
    "嗁",
    "諠",
    "㡌",
    "賸",
    "筴",
    "筞",
    "筩",
    "栰",
    "暠",
    "皜",
    "踰",
    "蝟",
    "㪟",
    "燄",
    "遊",
    "媿",
    "嘅",
    "庽",
    "窓",
    "牎",
    "牕",
    "窻",
    "徧",
    "僱",
    "帬",
    "裠",
    "強",
    "彊",
    "疎",
    "壻",
    "瓌",
    "䰟",
    "皷",
    "擕",
    "㩗",
    "㩦",
    "攜",
    "懃",
    "鞾",
    "幙",
    "㮣",
    "酧",
    "詶",
    "醻",
    "掽",
    "踫",
    "㼝",
    "盌",
    "磟",
    "覩",
    "倸",
    "㬉",
    "煗",
    "煖",
    "晻",
    "闇",
    "炤",
    "跥",
    "䗬",
    "蠭",
    "寘",
    "辠",
    "稺",
    "穉",
    "燬",
    "譭",
    "瘉",
    "癒",
    "顋",
    "骽",
    "猨",
    "蝯",
    "稟",
    "痺",
    "癡",
    "亷",
    "㢘",
    "韻",
    "泝",
    "遡",
    "昚",
    "躶",
    "臝",
    "羣",
    "㬪",
    "曡",
    "疊",
    "勦",
    "琍",
    "瓈",
    "𤋮",
    "熈",
    "牓",
    "搾",
    "謌",
    "堿",
    "鹻",
    "鹼",
    "矁",
    "燻",
    "髈",
    "𤺥",
    "辢",
    "旂",
    "𡚁",
    "潄",
    "砦",
    "詧",
    "嫰",
    "櫈",
    "撐",
    "墪",
    "譔",
    "鞵",
    "鞌",
    "蕋",
    "橤",
    "蘂",
    "醕",
    "譆",
    "跴",
    "蹤",
    "蜨",
    "蠍",
    "稾",
    "殭",
    "惪",
    "厀",
    "襃",
    "癅",
    "䊀",
    "餬",
    "潛",
    "癄",
    "顦",
    "鷰",
    "藷",
    "櫥",
    "螎",
    "蹏",
    "蟇",
    "譟",
    "簒",
    "彫",
    "琱",
    "鵰",
    "餹",
    "餻",
    "簷",
    "粦",
    "燐",
    "緐",
    "幑",
    "蹧",
    "粇",
    "穅",
    "臋",
    "籐",
    "繙",
    "飜",
    "孼",
    "蠏",
    "燿",
    "蝡",
    "稬",
    "穤",
    "惷",
    "覇",
    "鑵",
    "戹",
    "阨",
    "剳",
    "帀",
    "巵",
    "亙",
    "佇",
    "竚",
    "穽",
    "岅",
    "虯",
    "𦍑",
    "羗",
    "啎",
    "姙",
    "㘭",
    "袟",
    "袠",
    "逈",
    "㒺",
    "犛",
    "氂",
    "偘",
    "甕",
    "罋",
    "冺",
    "姍",
    "蝨",
    "琺",
    "瑇",
    "尅",
    "梔",
    "斮",
    "斲",
    "斵",
    "暱",
    "毘",
    "蝱",
    "吚",
    "哶",
    "峝",
    "粃",
    "竢",
    "狥",
    "秈",
    "烱",
    "㳄",
    "袵",
    "盇",
    "涖",
    "蒞",
    "碪",
    "蠔",
    "唕",
    "倐",
    "儵",
    "雋",
    "皐",
    "臯",
    "衂",
    "䶊",
    "臙",
    "獧",
    "痾",
    "皰",
    "湼",
    "澣",
    "濬",
    "塚",
    "襢",
    "娿",
    "勅",
    "勑",
    "戞",
    "廐",
    "廄",
    "眥",
    "覜",
    "勗",
    "啗",
    "噉",
    "傯",
    "挱",
    "㥫",
    "惥",
    "慂",
    "陻",
    "蕚",
    "萲",
    "蕿",
    "蘐",
    "藼",
    "櫂",
    "箠",
    "槨",
    "啑",
    "蹠",
    "蚘",
    "痐",
    "蛕",
    "蜖",
    "瘖",
    "遯",
    "醃",
    "飱",
    "冪",
    "簑",
    "枏",
    "柟",
    "檝",
    "楥",
    "矴",
    "椗",
    "嘷",
    "獋",
    "粺",
    "䈰",
    "諐",
    "齶",
    "堘",
    "疿",
    "雝",
    "秔",
    "稉",
    "槀",
    "搉",
    "廝",
    "叡",
    "嘠",
    "蜋",
    "筯",
    "篛",
    "麞",
    "糉",
    "緥",
    "璿",
    "髥",
    "臕",
    "餈",
    "剹",
    "橜",
    "罇",
    "蜺",
    "矙",
    "憇",
    "翺",
    "饍",
    "瞖",
    "羴",
    "羶",
    "爕",
    "繦",
    "騌",
    "鬉",
    "騣",
    "蔾",
    "䠀",
    "簮",
    "躕",
    "蹵",
    "䝔",
    "貛",
    "鼴",
    "麐",
    "塡",
    "あ",
    "い",
    "う",
    "え",
    "お",
    "か",
    "き",
    "く",
    "け",
    "こ",
    "さ",
    "し",
    "す",
    "せ",
    "そ",
    "た",
    "ち",
    "つ",
    "て",
    "と",
    "な",
    "に",
    "ぬ",
    "ね",
    "の",
    "は",
    "ひ",
    "ふ",
    "へ",
    "ほ",
    "ま",
    "み",
    "む",
    "め",
    "も",
    "や",
    "ゆ",
    "よ",
    "ら",
    "り",
    "る",
    "れ",
    "ろ",
    "わ",
    "を",
    "ん",
    "が",
    "ぎ",
    "ぐ",
    "げ",
    "ご",
    "ざ",
    "じ",
    "ず",
    "ぜ",
    "ぞ",
    "だ",
    "ぢ",
    "づ",
    "で",
    "ど",
    "ば",
    "び",
    "ぶ",
    "べ",
    "ぼ",
    "ぱ",
    "ぴ",
    "ぷ",
    "ぺ",
    "ぽ",
    "ぁ",
    "ぃ",
    "ぅ",
    "ぇ",
    "ぉ",
    "っ",
    "ゃ",
    "ゅ",
    "ょ",
    "ゎ",
    "ゕ",
    "ゖ",
    "ア",
    "イ",
    "ウ",
    "エ",
    "オ",
    "カ",
    "キ",
    "ク",
    "ケ",
    "コ",
    "サ",
    "シ",
    "ス",
    "セ",
    "ソ",
    "タ",
    "チ",
    "ツ",
    "テ",
    "ト",
    "ナ",
    "ニ",
    "ヌ",
    "ネ",
    "ノ",
    "ハ",
    "ヒ",
    "フ",
    "ヘ",
    "ホ",
    "マ",
    "ミ",
    "ム",
    "メ",
    "モ",
    "ヤ",
    "ユ",
    "ヨ",
    "ラ",
    "リ",
    "ル",
    "レ",
    "ロ",
    "ワ",
    "ヲ",
    "ン",
    "ガ",
    "ギ",
    "グ",
    "ゲ",
    "ゴ",
    "ザ",
    "ジ",
    "ズ",
    "ゼ",
    "ゾ",
    "ダ",
    "ヂ",
    "ヅ",
    "デ",
    "ド",
    "バ",
    "ビ",
    "ブ",
    "ベ",
    "ボ",
    "パ",
    "ピ",
    "プ",
    "ペ",
    "ポ",
    "ァ",
    "ィ",
    "ゥ",
    "ェ",
    "ォ",
    "ッ",
    "ャ",
    "ュ",
    "ョ",
    "ヮ",
    "ヵ",
    "ヶ",
    "ヷ",
    "ヸ",
    "ヹ",
    "ヺ",
    "・",
    "ー",
    "ヽ",
    "ヾ",
    "ヿ",
    "ｱ",
    "ｲ",
    "ｳ",
    "ｴ",
    "ｵ",
    "ｶ",
    "ｷ",
    "ｸ",
    "ｹ",
    "ｺ",
    "ｻ",
    "ｼ",
    "ｽ",
    "ｾ",
    "ｿ",
    "ﾀ",
    "ﾁ",
    "ﾂ",
    "ﾃ",
    "ﾄ",
    "ﾅ",
    "ﾆ",
    "ﾇ",
    "ﾈ",
    "ﾉ",
    "ﾊ",
    "ﾋ",
    "ﾌ",
    "ﾍ",
    "ﾎ",
    "ﾏ",
    "ﾐ",
    "ﾑ",
    "ﾒ",
    "ﾓ",
    "ﾔ",
    "ﾕ",
    "ﾖ",
    "ﾗ",
    "ﾘ",
    "ﾙ",
    "ﾚ",
    "ﾛ",
    "ﾜ",
    "ｦ",
    "ﾝ",
    "ﾞ",
    "ﾟ",
    "ｧ",
    "ｨ",
    "ｩ",
    "ｪ",
    "ｫ",
    "ｯ",
    "ｬ",
    "ｭ",
    "ｮ",
    "円",
    "気",
    "糸",
    "絵",
    "楽",
    "帰",
    "戸",
    "広",
    "黒",
    "図",
    "線",
    "読",
    "売",
    "歩",
    "毎",
    "亜",
    "悪",
    "圧",
    "扱",
    "囲",
    "為",
    "壱",
    "隠",
    "栄",
    "営",
    "駅",
    "塩",
    "縁",
    "艶",
    "応",
    "桜",
    "穏",
    "仮",
    "価",
    "箇",
    "ゑ",
    "ゝ",
    "ゞ",
    "ヰ",
    "ヴ",
    "㈱",
    "両",
    "丼",
    "丿",
    "亀",
    "仏",
    "伝",
    "侶",
    "俤",
    "値",
    "倶",
    "倹",
    "偐",
    "偽",
    "働",
    "儛",
    "兌",
    "児",
    "冑",
    "冨",
    "凞",
    "処",
    "凪",
    "別",
    "剣",
    "剤",
    "剰",
    "劔",
    "労",
    "勧",
    "勲",
    "匁",
    "匂",
    "匲",
    "卍",
    "単",
    "厳",
    "収",
    "呂",
    "呉",
    "呑",
    "呰",
    "唖",
    "喚",
    "喩",
    "喰",
    "噛",
    "噺",
    "嚢",
    "囃",
    "団",
    "圀",
    "圏",
    "堀",
    "堺",
    "塀",
    "塁",
    "塙",
    "増",
    "墺",
    "壊",
    "壌",
    "壷",
    "変",
    "奨",
    "姫",
    "娯",
    "嫐",
    "嬢",
    "嬾",
    "孁",
    "宍",
    "実",
    "宮",
    "寔",
    "寛",
    "対",
    "専",
    "尭",
    "峠",
    "崋",
    "嶋",
    "巀",
    "巌",
    "巣",
    "巻",
    "帯",
    "幇",
    "庁",
    "廃",
    "廻",
    "弉",
    "弌",
    "弐",
    "弖",
    "弾",
    "従",
    "徳",
    "徴",
    "忯",
    "恵",
    "悩",
    "惣",
    "懐",
    "懽",
    "戦",
    "戯",
    "戻",
    "払",
    "抜",
    "択",
    "拝",
    "拠",
    "拡",
    "拵",
    "挙",
    "挿",
    "捗",
    "捜",
    "掟",
    "掲",
    "掻",
    "揃",
    "換",
    "揺",
    "摂",
    "撃",
    "撹",
    "斉",
    "斎",
    "旛",
    "旡",
    "晧",
    "晩",
    "暁",
    "暦",
    "曽",
    "杁",
    "杢",
    "杣",
    "杮",
    "枓",
    "枠",
    "枡",
    "柾",
    "栂",
    "栃",
    "桝",
    "桟",
    "桾",
    "梛",
    "梱",
    "梲",
    "梶",
    "椙",
    "検",
    "椥",
    "楕",
    "楡",
    "楢",
    "榊",
    "榎",
    "槇",
    "様",
    "槙",
    "槻",
    "樋",
    "権",
    "樫",
    "橿",
    "檥",
    "欅",
    "歎",
    "歓",
    "歯",
    "歳",
    "歴",
    "毀",
    "沖",
    "沢",
    "浄",
    "涙",
    "済",
    "渉",
    "渋",
    "渓",
    "渕",
    "満",
    "滝",
    "漑",
    "潅",
    "澁",
    "瀞",
    "瀬",
    "焔",
    "焼",
    "煇",
    "煕",
    "煥",
    "燗",
    "爼",
    "犠",
    "狛",
    "猟",
    "獏",
    "獣",
    "珊",
    "瑤",
    "甞",
    "畑",
    "畠",
    "畳",
    "畷",
    "畺",
    "痩",
    "癪",
    "発",
    "県",
    "眞",
    "砕",
    "碕",
    "礒",
    "禖",
    "禿",
    "稲",
    "穂",
    "穣",
    "竃",
    "竜",
    "竴",
    "笹",
    "筈",
    "筬",
    "筰",
    "箆",
    "箏",
    "箙",
    "篠",
    "篭",
    "簺",
    "籾",
    "粂",
    "粋",
    "粛",
    "粧",
    "糺",
    "紬",
    "絁",
    "経",
    "絖",
    "絣",
    "絽",
    "継",
    "続",
    "綟",
    "総",
    "縄",
    "縅",
    "縒",
    "縦",
    "繊",
    "繋",
    "繍",
    "繝",
    "繧",
    "纐",
    "纒",
    "罠",
    "罧",
    "罵",
    "羂",
    "羇",
    "羨",
    "聟",
    "聡",
    "聨",
    "聴",
    "脇",
    "脳",
    "膣",
    "膵",
    "臈",
    "臓",
    "臥",
    "舎",
    "舖",
    "舗",
    "舘",
    "芿",
    "苅",
    "茲",
    "荊",
    "荘",
    "莬",
    "莵",
    "菫",
    "萠",
    "蔵",
    "薗",
    "薫",
    "薬",
    "薭",
    "蘊",
    "蛍",
    "蝋",
    "蝿",
    "蟷",
    "衞",
    "衵",
    "袙",
    "袞",
    "袰",
    "袴",
    "袿",
    "裃",
    "裡",
    "裲",
    "褄",
    "褌",
    "襴",
    "襷",
    "覗",
    "覚",
    "覧",
    "観",
    "訳",
    "証",
    "諌",
    "諚",
    "諟",
    "諡",
    "諮",
    "譛",
    "譲",
    "讃",
    "豅",
    "豊",
    "豎",
    "賎",
    "賛",
    "贔",
    "躙",
    "躰",
    "転",
    "軽",
    "輌",
    "辥",
    "辺",
    "辻",
    "込",
    "逓",
    "遅",
    "遙",
    "邉",
    "郷",
    "酔",
    "醗",
    "醤",
    "醸",
    "釈",
    "鉄",
    "鉇",
    "鉤",
    "鉱",
    "鉾",
    "銈",
    "銕",
    "銭",
    "鋲",
    "鋳",
    "鋺",
    "錆",
    "錍",
    "錣",
    "錬",
    "錵",
    "鍑",
    "鍮",
    "鍼",
    "鎌",
    "鎗",
    "鎚",
    "鎹",
    "鐇",
    "鐚",
    "鐡",
    "鑁",
    "鑑",
    "鑚",
    "鑢",
    "閇",
    "関",
    "閦",
    "闘",
    "陥",
    "険",
    "隣",
    "隷",
    "雑",
    "雫",
    "霊",
    "靜",
    "靫",
    "靭",
    "靱",
    "鞄",
    "鞆",
    "頚",
    "頬",
    "頴",
    "頼",
    "顕",
    "顗",
    "餝",
    "饂",
    "駄",
    "駆",
    "駈",
    "騒",
    "験",
    "騨",
    "髄",
    "髙",
    "髪",
    "髷",
    "鯖",
    "鯰",
    "鯱",
    "鰒",
    "鰯",
    "鰰",
    "鳰",
    "鴎",
    "鴫",
    "鵄",
    "鵞",
    "鵺",
    "鶏",
    "鹸",
    "麁",
    "麺",
    "麿",
    "黌",
    "黙",
    "鼈",
    "齢",
    "龗",
    "縯",
    "蟅",
    "坖",
    "祂",
    "鼂",
    "鱚",
    "蛻",
    "屌",
    "呾",
    "煔",
    "吶",
    "扥",
    "蚖",
    "銂",
    "尃",
    "夋",
    "鵼",
    "徬",
    "寳",
    "彡",
    "舨",
    "湳",
    "麼",
    "鍈",
    "崈",
    "鱣",
    "盺",
    "拺",
    "瑥",
    "茷",
    "焻",
    "奀",
    "驎",
    "鱰",
    "砢",
    "痟",
    "廱",
    "僜",
    "瘺",
    "鱊",
    "擥",
    "嶰",
    "淓",
    "跅",
    "浵",
    "媗",
    "璦",
    "煠",
    "檊",
    "媃",
    "峅",
    "躄",
    "鉟",
    "塽",
    "蟴",
    "鯮",
    "弍",
    "烒",
    "鵵",
    "妑",
    "孋",
    "蚡",
    "恊",
    "輭",
    "廞",
    "產",
    "曅",
    "盜",
    "騤",
    "囪",
    "鱀",
    "茇",
    "葊",
    "逹",
    "狓",
    "崢",
    "趖",
    "凃",
    "羙",
    "鮸",
    "昞",
    "楿",
    "渽",
    "圗",
    "麪",
    "屇",
    "鍉",
    "葝",
    "沯",
    "爭",
    "幵",
    "筭",
    "寊",
    "銋",
    "貮",
    "鎭",
    "熺",
    "昜",
    "鍱",
    "墬",
    "愒",
    "磺",
    "嚈",
    "稘",
    "珮",
    "釆",
    "殑",
    "鍩",
    "䲁",
    "蕷",
    "鐿",
    "僡",
    "佹",
    "輶",
    "冴",
    "襶",
    "賔",
    "猙",
    "辧",
    "絛",
    "磾",
    "韁",
    "螔",
    "譳",
    "礑",
    "鋱",
    "魩",
    "嚗",
    "棆",
    "牆",
    "敟",
    "柶",
    "瓛",
    "魣",
    "巎",
    "轘",
    "襌",
    "枼",
    "鸌",
    "逺",
    "錏",
    "縡",
    "帢",
    "騄",
    "媼",
    "埅",
    "鄤",
    "萐",
    "祙",
    "旼",
    "詥",
    "鶲",
    "燉",
    "卲",
    "銱",
    "庲",
    "伱",
    "氽",
    "嵿",
    "挻",
    "煵",
    "窋",
    "鐤",
    "鮊",
    "鱬",
    "鰧",
    "嬤",
    "譞",
    "諲",
    "脭",
    "悳",
    "崘",
    "阭",
    "內",
    "袾",
    "冚",
    "壐",
    "咗",
    "礠",
    "孮",
    "痲",
    "埈",
    "肹",
    "鰮",
    "鮓",
    "濊",
    "塜",
    "凜",
    "蒢",
    "噰",
    "桼",
    "峍",
    "焴",
    "鶒",
    "鋮",
    "綠",
    "鶹",
    "熿",
    "毴",
    "咟",
    "嘥",
    "睺",
    "繡",
    "郎",
    "瘞",
    "鉶",
    "蔎",
    "秠",
    "緤",
    "蝀",
    "躝",
    "蟜",
    "繃",
    "囮",
    "墫",
    "乭",
    "胊",
    "濙",
    "瘓",
    "榣",
    "鑛",
    "鐫",
    "嶴",
    "甹",
    "坮",
    "銾",
    "蒭",
    "睜",
    "俋",
    "餠",
    "榢",
    "蓳",
    "盋",
    "堷",
    "鍏",
    "苝",
    "巛",
    "蚵",
    "暏",
    "熤",
    "嬨",
    "墎",
    "鏽",
    "戶",
    "菺",
    "膮",
    "熖",
    "睪",
    "栜",
    "捱",
    "榗",
    "鍷",
    "曧",
    "犽",
    "韑",
    "袓",
    "䖝",
    "焄",
    "喦",
    "髲",
    "疌",
    "㴪",
    "侊",
    "貐",
    "蕅",
    "禠",
    "蕑",
    "囯",
    "暊",
    "儞",
    "佋",
    "柎",
    "㐱",
    "鰤",
    "苳",
    "鱥",
    "謤",
    "遶",
    "眀",
    "鑀",
    "羋",
    "顏",
    "陜",
    "銩",
    "黶",
    "苼",
    "蒤",
    "棛",
    "儫",
    "咁",
    "抦",
    "衚",
    "棩",
    "焿",
    "脫",
    "麅",
    "玏",
    "埧",
    "淸",
    "黁",
    "淽",
    "彠",
    "鮨",
    "沜",
    "糀",
    "厓",
    "楧",
    "嶌",
    "簹",
    "檵",
    "鱇",
    "嶬",
    "廸",
    "卽",
    "樀",
    "贌",
    "酼",
    "籛",
    "沒",
    "晸",
    "諪",
    "蕡",
    "妏",
    "鄋",
    "蒍",
    "奧",
    "抇",
    "蓨",
    "薆",
    "鱷",
    "巘",
    "䝉",
    "亰",
    "寈",
    "槩",
    "誒",
    "麴",
    "蕟",
    "溎",
    "蘗",
    "榦",
    "斿",
    "暟",
    "炲",
    "拚",
    "娖",
    "繖",
    "橚",
    "寜",
    "爀",
    "饟",
    "悅",
    "鯏",
    "彜",
    "眾",
    "葯",
    "嬝",
    "埮",
    "獇",
    "馛",
    "溙",
    "瀦",
    "熼",
    "硓",
    "鈢",
    "樆",
    "輬",
    "鰜",
    "蔘",
    "渙",
    "澔",
    "嗮",
    "旉",
    "籜",
    "媊",
    "燘",
    "儚",
    "頹",
    "缽",
    "俽",
    "逨",
    "鱓",
    "郞",
    "歊",
    "杴",
    "珡",
    "杋",
    "醁",
    "鰏",
    "鵾",
    "鐽",
    "鮋",
    "巶",
    "荅",
    "薾",
    "囓",
    "蹻",
    "獎",
    "禑",
    "鎓",
    "榲",
    "僴",
    "綞",
    "尓",
    "敭",
    "曔",
    "褔",
    "鬅",
    "亊",
    "鏦",
    "蓘",
    "裬",
    "鱲",
    "薡",
    "鰗",
    "箑",
    "鬪",
    "縂",
    "璸",
    "甙",
    "茮",
    "辵",
    "岻",
    "覿",
    "滈",
    "鯶",
    "鑂",
    "囶",
    "舺",
    "溋",
    "拋",
    "菾",
    "敾",
    "虨",
    "綝",
    "蝍",
    "醂",
    "禨",
    "賹",
    "廧",
    "絕",
    "槗",
    "徫",
    "鎔",
    "曮",
    "蠂",
    "捒",
    "堈",
    "莕",
    "蓪",
    "敎",
    "禃",
    "櫱",
    "綧",
    "瀶",
    "逌",
    "浤",
    "碻",
    "刄",
    "逤",
    "剏",
    "氹",
    "菈",
    "娫",
    "蜛",
    "嵗",
    "糎",
    "螶",
    "譓",
    "鏳",
    "嵙",
    "瑊",
    "隲",
    "檨",
    "緈",
    "畵",
    "砯",
    "簗",
    "彅",
    "鰺",
    "騋",
    "窶",
    "嚒",
    "嵻",
    "尙",
    "頵",
    "槰",
    "虉",
    "醞",
    "巂",
    "彔",
    "偊",
    "畇",
    "鱨",
    "妸",
    "塲",
    "畐",
    "鈫",
    "錟",
    "磪",
    "摠",
    "彥",
    "璙",
    "囝",
    "寗",
    "耎",
    "鮡",
    "蘓",
    "弅",
    "焃",
    "飥",
    "戙",
    "塰",
    "儱",
    "槺",
    "噏",
    "魟",
    "禵",
    "佧",
    "咘",
    "盪",
    "瑈",
    "鉲",
    "睭",
    "鏌",
    "鼇",
    "郋",
    "魮",
    "朖",
    "滽",
    "渃",
    "滙",
    "熯",
    "醿",
    "鎅",
    "褀",
    "鬬",
    "巄",
    "螥",
    "眜",
    "釚",
    "柉",
    "壎",
    "峇",
    "姸",
    "唭",
    "鮜",
    "鈖",
    "嫈",
    "壄",
    "洤",
    "黃",
    "伕",
    "堦",
    "嶔",
    "鮰",
    "鞞",
    "漎",
    "鉓",
    "鮗",
    "壴",
    "阝",
    "妀",
    "矽",
    "獢",
    "倗",
    "銪",
    "鴓",
    "橒",
    "凈",
    "哖",
    "屚",
    "偍",
    "瑺",
    "媯",
    "淍",
    "驌",
    "椇",
    "赬",
    "薐",
    "糹",
    "碽",
    "濲",
    "釭",
    "晭",
    "纕",
    "寖",
    "閞",
    "歿",
    "呎",
    "鶆",
    "屄",
    "櫿",
    "犎",
    "旲",
    "㙟",
    "龎",
    "翜",
    "螾",
    "說",
    "衜",
    "泆",
    "軎",
    "鵂",
    "荎",
    "嚧",
    "硂",
    "桖",
    "褭",
    "筊",
    "鰷",
    "秳",
    "戩",
    "轀",
    "鬹",
    "飬",
    "卋",
    "暸",
    "狦",
    "搢",
    "娋",
    "鏴",
    "溫",
    "毉",
    "淰",
    "謩",
    "餺",
    "鵙",
    "鳽",
    "鮀",
    "狶",
    "氻",
    "轝",
    "妺",
    "袛",
    "蓭",
    "梂",
    "娛",
    "牼",
    "稅",
    "兿",
    "玾",
    "煚",
    "僩",
    "鶿",
    "鬄",
    "崠",
    "鉆",
    "鯓",
    "蚢",
    "庀",
    "鵟",
    "坣",
    "殼",
    "悞",
    "熅",
    "敻",
    "鍠",
    "曶",
    "愼",
    "搳",
    "姃",
    "砳",
    "槼",
    "臞",
    "韾",
    "靑",
    "鸊",
    "薲",
    "虛",
    "蠄",
    "啟",
    "鶺",
    "苺",
    "滾",
    "褞",
    "仺",
    "胇",
    "憻",
    "郳",
    "烉",
    "驩",
    "冇",
    "枖",
    "夌",
    "搵",
    "匸",
    "盨",
    "櫾",
    "霤",
    "麊",
    "貒",
    "噓",
    "嗢",
    "笩",
    "晈",
    "冂",
    "銳",
    "毿",
    "慜",
    "囧",
    "閜",
    "娸",
    "庢",
    "壆",
    "馯",
    "桱",
    "兗",
    "葃",
    "侅",
    "煐",
    "鐦",
    "藸",
    "鷎",
    "嵰",
    "逎",
    "弒",
    "匋",
    "鐭",
    "廔",
    "砩",
    "孆",
    "灴",
    "伷",
    "兪",
    "鴗",
    "澯",
    "幚",
    "旙",
    "勻",
    "礽",
    "婑",
    "鱮",
    "娍",
    "銶",
    "吳",
    "鍟",
    "仼",
    "鳧",
    "彞",
    "娽",
    "昛",
    "鰼",
    "剎",
    "佉",
    "鉏",
    "偸",
    "鰆",
    "讙",
    "橪",
    "啱",
    "岀",
    "孻",
    "釪",
    "乹",
    "鈳",
    "漇",
    "檦",
    "埻",
    "祿",
    "爌",
    "禇",
    "鱵",
    "㸃",
    "梉",
    "燝",
    "霙",
    "炁",
    "飮",
    "蠙",
    "勷",
    "鵎",
    "儥",
    "鐠",
    "唻",
    "廰",
    "嚿",
    "嵕",
    "墱",
    "紑",
    "搖",
    "瘜",
    "皝",
    "鸑",
    "瀁",
    "粵",
    "撚",
    "巑",
    "梀",
    "啯",
    "眛",
    "諴",
    "夊",
    "僙",
    "鍝",
    "裖",
    "鮣",
    "凬",
    "飡",
    "灊",
    "橓",
    "嫳",
    "筳",
    "咑",
    "粍",
    "瓑",
    "璌",
    "伃",
    "閰",
    "傜",
    "黐",
    "謢",
    "驒",
    "橫",
    "蛯",
    "寕",
    "蠵",
    "瞓",
    "旳",
    "翏",
    "硏",
    "寯",
    "韡",
    "楤",
    "鰃",
    "朿",
    "侞",
    "鵯",
    "愨",
    "祹",
    "厔",
    "丌",
    "盩",
    "謏",
    "魕",
    "啣",
    "閱",
    "曺",
    "枛",
    "罉",
    "卐",
    "樻",
    "鷉",
    "鯒",
    "鋡",
    "磱",
    "枱",
    "攴",
    "蠷",
    "穈",
    "嚟",
    "檽",
    "趐",
    "奐",
    "鋐",
    "檇",
    "薀",
    "峼",
    "咭",
    "訔",
    "韠",
    "鑴",
    "鸐",
    "唃",
    "捦",
    "鸜",
    "誴",
    "罳",
    "璄",
    "暃",
    "夀",
    "賨",
    "鞥",
    "鈊",
    "灡",
    "鮍",
    "懮",
    "籣",
    "昐",
    "陁",
    "襾",
    "鮠",
    "鈏",
    "囍",
    "婯",
    "艔",
    "貭",
    "䰾",
    "姁",
    "禼",
    "堖",
    "鋶",
    "仛",
    "鏷",
    "謜",
    "鑅",
    "忬",
    "蘶",
    "謠",
    "觙",
    "奫",
    "狟",
    "泩",
    "桙",
    "飈",
    "垰",
    "啍",
    "嚞",
    "鯕",
    "蒧",
    "榞",
    "徸",
    "璹",
    "揔",
    "欉",
    "魞",
    "菶",
    "玧",
    "鳯",
    "廍",
    "侚",
    "岰",
    "岧",
    "鋕",
    "凵",
    "彣",
    "崱",
    "媜",
    "倢",
    "鵐",
    "砋",
    "鷚",
    "鱠",
    "鮻",
    "繻",
    "摵",
    "贓",
    "磵",
    "錻",
    "痠",
    "粩",
    "胅",
    "奣",
    "塨",
    "瀠",
    "鸘",
    "啚",
    "娳",
    "霶",
    "壔",
    "峚",
    "甂",
    "廁",
    "覌",
    "鰂",
    "猳",
    "鱻",
    "盫",
    "裿",
    "杬",
    "歛",
    "澋",
    "蘞",
    "嵜",
    "尐",
    "旽",
    "鉌",
    "鎛",
    "豿",
    "凖",
    "榤",
    "禓",
    "龝",
    "悧",
    "鷟",
    "鮟",
    "吋",
    "喢",
    "岪",
    "吥",
    "漵",
    "頠",
    "豔",
    "巿",
    "鑨",
    "醣",
    "熳",
    "懍",
    "湥",
    "檡",
    "韺",
    "戱",
    "緖",
    "鐈",
    "凉",
    "緃",
    "鮹",
    "媐",
    "爯",
    "巆",
    "褍",
    "鐬",
    "昍",
    "扙",
    "鍳",
    "芛",
    "蟳",
    "嬅",
    "糬",
    "吔",
    "塭",
    "譿",
    "冧",
    "鏓",
    "嶪",
    "嗹",
    "椵",
    "姀",
    "閿",
    "褧",
    "錞",
    "玆",
    "笘",
    "篔",
    "萡",
    "鶡",
    "螐",
    "鮄",
    "鰟",
    "脷",
    "啲",
    "杤",
    "蓚",
    "尗",
    "娎",
    "殟",
    "淥",
    "蝚",
    "蓧",
    "彐",
    "嚤",
    "銍",
    "囒",
    "坶",
    "淩",
    "鶼",
    "鱂",
    "喼",
    "燫",
    "肏",
    "姵",
    "廌",
    "禟",
    "籝",
    "迵",
    "嵨",
    "堮",
    "蟌",
    "憍",
    "廕",
    "蜑",
    "緁",
    "唘",
    "竩",
    "崙",
    "璚",
    "粄",
    "栨",
    "罈",
    "梫",
    "貤",
    "藔",
    "蜯",
    "訁",
    "斖",
    "煶",
    "馦",
    "妠",
    "閟",
    "疕",
    "夆",
    "鎪",
    "膥",
    "澻",
    "嘢",
    "嚐",
    "靁",
    "鎻",
    "鰛",
    "穵",
    "烋",
    "縕",
    "褎",
    "疒",
    "壠",
    "溼",
    "圂",
    "咅",
    "鯭",
    "鯙",
    "磘",
    "玨",
    "珤",
    "朊",
    "蚼",
    "濶",
    "薞",
    "嚩",
    "丟",
    "嫺",
    "鯻",
    "椲",
    "鰕",
    "刂",
    "蠘",
    "踎",
    "瀴",
    "琁",
    "鰶",
    "瑴",
    "肜",
    "㐂",
    "欥",
    "媺",
    "竻",
    "讚",
    "𣇉",
    "裵",
    "緜",
    "廩",
    "齧",
    "叄",
    "俌",
    "厰",
    "滀",
    "錄",
    "鷫",
    "鯗",
    "攞",
    "姌",
    "蔝",
    "幷",
    "縤",
    "屻",
    "鯃",
    "雞",
    "纁",
    "嫲",
    "嵮",
    "屭",
    "嶃",
    "跩",
    "鋗",
    "蕢",
    "篊",
    "俬",
    "淎",
    "暻",
    "鏻",
    "憓",
    "玗",
    "溈",
    "笭",
    "糢",
    "勳",
    "閒",
    "沍",
    "咾",
    "鉷",
    "蘵",
    "俁",
    "崵",
    "毸",
    "苪",
    "掙",
    "鴡",
    "萭",
    "俴",
    "屜",
    "蒾",
    "艹",
    "剷",
    "慍",
    "朮",
    "枴",
    "氳",
    "猓",
    "甽",
    "箝",
    "譁",
    "贗",
    "迆",
    "鈽",
    "鍊",
    "鍰",
    "鏍",
    "靦",
    "餽",
    "丮",
    "丱",
    "仜",
    "仩",
    "伬",
    "伔",
    "仱",
    "伀",
    "伻",
    "佢",
    "佒",
    "侀",
    "侇",
    "佷",
    "佌",
    "佪",
    "侐",
    "侜",
    "俓",
    "侲",
    "俉",
    "侻",
    "侳",
    "俇",
    "倅",
    "倇",
    "倰",
    "倛",
    "倳",
    "倷",
    "俷",
    "倠",
    "偯",
    "偞",
    "偠",
    "偋",
    "偝",
    "偛",
    "偢",
    "偅",
    "偟",
    "偩",
    "偫",
    "傛",
    "傔",
    "傞",
    "傋",
    "傌",
    "傎",
    "傝",
    "偨",
    "傂",
    "傽",
    "傿",
    "僆",
    "傮",
    "僄",
    "僈",
    "傰",
    "僁",
    "傱",
    "僋",
    "僗",
    "僛",
    "僪",
    "僝",
    "僓",
    "僿",
    "儃",
    "儰",
    "僸",
    "僶",
    "僾",
    "儌",
    "僽",
    "儜",
    "儓",
    "儗",
    "儑",
    "儢",
    "儤",
    "儠",
    "儸",
    "儹",
    "儽",
    "冓",
    "冘",
    "冞",
    "凊",
    "凅",
    "凔",
    "刌",
    "刉",
    "刓",
    "刜",
    "刞",
    "刵",
    "刲",
    "剆",
    "刱",
    "剉",
    "剚",
    "剒",
    "剫",
    "剭",
    "剬",
    "剺",
    "剸",
    "剻",
    "剼",
    "劀",
    "劋",
    "劖",
    "劘",
    "劗",
    "劙",
    "劦",
    "勴",
    "匊",
    "匢",
    "匰",
    "匴",
    "匷",
    "匽",
    "卌",
    "卼",
    "厎",
    "厒",
    "厗",
    "厞",
    "厜",
    "厤",
    "厬",
    "厹",
    "吰",
    "吷",
    "吪",
    "呿",
    "咈",
    "呫",
    "呺",
    "呥",
    "呬",
    "呴",
    "茍",
    "咷",
    "咮",
    "咶",
    "哅",
    "咠",
    "咢",
    "唦",
    "唗",
    "唒",
    "哤",
    "唚",
    "唈",
    "哫",
    "唅",
    "唴",
    "啢",
    "唶",
    "啒",
    "啅",
    "唌",
    "唲",
    "喨",
    "喥",
    "喭",
    "噅",
    "喓",
    "喣",
    "啽",
    "喌",
    "嗃",
    "嗛",
    "嗋",
    "嗀",
    "喿",
    "喍",
    "嗏",
    "嗕",
    "嗈",
    "嘕",
    "嘒",
    "嗼",
    "嘐",
    "嘓",
    "嘂",
    "嗺",
    "嘝",
    "嘄",
    "嗿",
    "噈",
    "噊",
    "噆",
    "噚",
    "嘳",
    "嘽",
    "嘾",
    "噮",
    "噳",
    "噣",
    "噭",
    "噞",
    "嚌",
    "嚍",
    "嚃",
    "嚘",
    "嚜",
    "嚫",
    "嚪",
    "嚬",
    "嚲",
    "嚵",
    "嚽",
    "嚾",
    "囆",
    "囅",
    "囋",
    "囗",
    "圁",
    "圞",
    "圠",
    "坁",
    "坅",
    "坲",
    "坱",
    "垀",
    "坴",
    "垗",
    "垝",
    "垔",
    "垘",
    "垽",
    "垼",
    "埢",
    "埶",
    "堩",
    "堣",
    "塈",
    "堥",
    "塓",
    "塉",
    "塯",
    "塕",
    "塼",
    "墆",
    "塿",
    "塴",
    "墋",
    "塺",
    "墝",
    "墯",
    "壈",
    "墽",
    "壖",
    "壝",
    "壛",
    "壾",
    "壿",
    "夃",
    "夎",
    "夒",
    "夗",
    "奅",
    "奊",
    "奰",
    "奲",
    "奼",
    "妦",
    "妎",
    "妢",
    "妐",
    "妵",
    "姏",
    "姎",
    "㚷",
    "姡",
    "姺",
    "姼",
    "娭",
    "婐",
    "婟",
    "婥",
    "婓",
    "婗",
    "媔",
    "媟",
    "媢",
    "婸",
    "媦",
    "媥",
    "媬",
    "媕",
    "娷",
    "嫇",
    "嫋",
    "媰",
    "媻",
    "嫮",
    "嫥",
    "嫢",
    "嫛",
    "嫿",
    "嫴",
    "嫷",
    "嫶",
    "嬎",
    "嬓",
    "嬐",
    "嬲",
    "嬽",
    "孈",
    "屘",
    "孲",
    "孷",
    "宎",
    "宨",
    "寪",
    "寍",
    "寋",
    "寑",
    "寙",
    "寠",
    "寱",
    "尌",
    "尒",
    "尟",
    "尰",
    "尳",
    "屖",
    "屔",
    "屝",
    "屧",
    "屩",
    "屮",
    "屴",
    "岏",
    "岋",
    "岉",
    "岒",
    "岮",
    "岤",
    "岯",
    "岟",
    "岝",
    "峐",
    "峌",
    "峞",
    "峉",
    "峊",
    "峬",
    "峮",
    "峷",
    "崝",
    "崨",
    "崥",
    "崏",
    "崰",
    "崣",
    "崷",
    "嵃",
    "嵑",
    "崳",
    "崺",
    "嵂",
    "嵱",
    "嵣",
    "嵥",
    "嵞",
    "嶀",
    "嵽",
    "嶆",
    "嵺",
    "嵷",
    "嶊",
    "嶉",
    "嶈",
    "嵾",
    "嶕",
    "嶜",
    "嶡",
    "嶚",
    "嶞",
    "嶱",
    "嶩",
    "嶵",
    "嶭",
    "巃",
    "巏",
    "巕",
    "巟",
    "巹",
    "帊",
    "帗",
    "帟",
    "帣",
    "帠",
    "帤",
    "帩",
    "帾",
    "帴",
    "幏",
    "幎",
    "幓",
    "幩",
    "幝",
    "幠",
    "幧",
    "幨",
    "幦",
    "幭",
    "幰",
    "庂",
    "庉",
    "庌",
    "庈",
    "庰",
    "庛",
    "庣",
    "庨",
    "庮",
    "庪",
    "庬",
    "庴",
    "廅",
    "廇",
    "廘",
    "廗",
    "廎",
    "廜",
    "緳",
    "廦",
    "廥",
    "廮",
    "廯",
    "蠯",
    "廾",
    "弚",
    "弝",
    "弣",
    "弤",
    "弮",
    "弳",
    "彃",
    "彉",
    "彋",
    "彏",
    "彯",
    "彴",
    "彸",
    "彾",
    "徦",
    "徥",
    "徯",
    "徲",
    "徾",
    "徿",
    "忀",
    "忁",
    "忔",
    "忕",
    "忨",
    "忣",
    "忷",
    "忥",
    "怭",
    "怲",
    "怋",
    "怴",
    "怗",
    "怚",
    "怞",
    "怬",
    "怢",
    "怐",
    "怮",
    "怓",
    "怷",
    "怹",
    "恲",
    "恞",
    "恅",
    "恇",
    "恉",
    "恛",
    "恌",
    "恀",
    "恟",
    "悀",
    "悁",
    "悕",
    "悗",
    "悇",
    "悊",
    "悐",
    "悾",
    "悺",
    "惓",
    "惤",
    "惈",
    "悷",
    "惉",
    "悹",
    "惌",
    "惢",
    "惄",
    "愊",
    "愖",
    "愅",
    "惵",
    "愓",
    "惸",
    "惼",
    "惾",
    "慉",
    "慅",
    "愶",
    "愲",
    "愮",
    "愯",
    "愬",
    "慁",
    "慞",
    "慱",
    "慒",
    "慓",
    "慲",
    "憀",
    "慴",
    "慔",
    "慺",
    "慛",
    "憃",
    "慹",
    "憱",
    "憰",
    "憢",
    "憉",
    "憛",
    "憯",
    "憟",
    "憪",
    "憡",
    "憝",
    "憖",
    "懅",
    "憴",
    "懆",
    "懁",
    "憿",
    "憸",
    "憵",
    "憼",
    "懧",
    "懠",
    "懥",
    "懤",
    "懘",
    "懭",
    "懱",
    "懪",
    "懰",
    "懫",
    "懻",
    "戁",
    "戃",
    "戄",
    "戉",
    "戠",
    "酨",
    "戺",
    "扐",
    "扜",
    "扤",
    "扡",
    "扢",
    "抆",
    "抌",
    "抎",
    "抏",
    "扻",
    "抭",
    "抴",
    "拑",
    "抾",
    "抪",
    "抶",
    "抮",
    "挍",
    "挋",
    "挃",
    "拫",
    "拹",
    "挏",
    "挌",
    "拸",
    "挀",
    "拲",
    "捖",
    "挬",
    "挶",
    "揤",
    "捊",
    "挼",
    "挩",
    "捁",
    "挴",
    "捘",
    "捔",
    "捥",
    "掝",
    "掗",
    "掫",
    "掯",
    "捵",
    "掜",
    "捼",
    "掤",
    "掔",
    "掱",
    "揎",
    "揥",
    "揨",
    "揯",
    "揊",
    "揲",
    "揵",
    "摡",
    "揟",
    "揝",
    "揜",
    "揘",
    "揅",
    "揱",
    "搆",
    "搟",
    "搕",
    "搘",
    "搹",
    "搷",
    "搣",
    "搰",
    "搊",
    "搚",
    "摀",
    "搧",
    "搫",
    "摍",
    "摝",
    "摲",
    "摦",
    "摎",
    "摋",
    "摓",
    "摐",
    "摿",
    "摮",
    "摰",
    "撢",
    "撠",
    "撗",
    "撜",
    "撋",
    "撊",
    "撌",
    "撟",
    "擗",
    "擖",
    "擏",
    "擉",
    "撽",
    "擩",
    "擣",
    "擫",
    "擭",
    "擨",
    "擽",
    "擸",
    "攇",
    "攐",
    "攍",
    "攌",
    "攗",
    "攕",
    "攓",
    "攡",
    "攠",
    "攦",
    "攩",
    "攭",
    "攲",
    "攳",
    "敁",
    "敊",
    "敆",
    "敓",
    "敧",
    "敪",
    "敤",
    "敜",
    "敯",
    "敳",
    "敶",
    "敺",
    "敹",
    "敿",
    "斁",
    "斀",
    "斄",
    "斒",
    "斔",
    "斞",
    "斨",
    "斪",
    "斻",
    "旍",
    "旓",
    "旚",
    "旝",
    "旟",
    "昲",
    "昦",
    "昢",
    "晇",
    "晥",
    "晜",
    "晼",
    "晬",
    "暀",
    "暆",
    "暍",
    "暋",
    "暡",
    "暰",
    "暩",
    "曀",
    "曊",
    "曋",
    "曏",
    "曒",
    "曚",
    "曣",
    "曭",
    "朁",
    "朅",
    "朄",
    "朒",
    "朘",
    "朣",
    "朾",
    "朹",
    "朻",
    "朼",
    "杅",
    "杇",
    "杝",
    "杗",
    "枎",
    "杶",
    "枆",
    "枌",
    "柲",
    "枺",
    "枻",
    "柸",
    "柀",
    "柅",
    "柫",
    "柤",
    "柍",
    "柮",
    "柣",
    "柂",
    "柧",
    "栚",
    "桋",
    "桏",
    "栱",
    "栵",
    "栫",
    "栭",
    "栯",
    "栘",
    "栔",
    "梡",
    "梇",
    "梐",
    "桭",
    "梮",
    "楖",
    "梬",
    "梩",
    "桵",
    "梒",
    "椌",
    "椄",
    "棜",
    "棷",
    "棳",
    "棌",
    "椈",
    "楰",
    "棯",
    "椔",
    "棸",
    "楟",
    "楎",
    "楱",
    "楅",
    "楺",
    "楈",
    "楛",
    "楉",
    "楬",
    "椳",
    "楀",
    "楄",
    "楶",
    "楘",
    "榶",
    "槉",
    "榠",
    "榬",
    "榼",
    "榙",
    "榩",
    "榾",
    "榯",
    "槄",
    "榽",
    "榹",
    "槥",
    "槸",
    "樕",
    "樠",
    "槬",
    "槢",
    "樛",
    "樝",
    "槾",
    "樧",
    "槮",
    "樔",
    "槷",
    "橀",
    "樴",
    "橉",
    "橧",
    "樲",
    "橨",
    "橝",
    "橭",
    "橶",
    "樿",
    "橁",
    "檍",
    "檖",
    "檁",
    "檟",
    "橾",
    "檛",
    "檓",
    "檕",
    "檃",
    "櫅",
    "檹",
    "櫡",
    "櫠",
    "櫌",
    "櫑",
    "櫙",
    "櫋",
    "櫜",
    "櫐",
    "櫫",
    "櫬",
    "櫰",
    "櫹",
    "櫺",
    "櫼",
    "欃",
    "欋",
    "欈",
    "欐",
    "欑",
    "欘",
    "欨",
    "欴",
    "欯",
    "欭",
    "欱",
    "欶",
    "欳",
    "欷",
    "欿",
    "歂",
    "歈",
    "歍",
    "歋",
    "歕",
    "歔",
    "歜",
    "歠",
    "歭",
    "歾",
    "肂",
    "殈",
    "殏",
    "殔",
    "殗",
    "殙",
    "殠",
    "殥",
    "殢",
    "殦",
    "殧",
    "殰",
    "殶",
    "毃",
    "毄",
    "毈",
    "毇",
    "毊",
    "毚",
    "毞",
    "毦",
    "毤",
    "毨",
    "毣",
    "毰",
    "毲",
    "毻",
    "毼",
    "毾",
    "氁",
    "氀",
    "氄",
    "氠",
    "氶",
    "汃",
    "汒",
    "汏",
    "汍",
    "汸",
    "沋",
    "汱",
    "汯",
    "沕",
    "汦",
    "汳",
    "泬",
    "沶",
    "沬",
    "泧",
    "沷",
    "泭",
    "泲",
    "泒",
    "沴",
    "洟",
    "洊",
    "洀",
    "浺",
    "浶",
    "洍",
    "涒",
    "浘",
    "浢",
    "涊",
    "涆",
    "浧",
    "涗",
    "涳",
    "涬",
    "淢",
    "涷",
    "淔",
    "渀",
    "淈",
    "涾",
    "淊",
    "涽",
    "淭",
    "湆",
    "湇",
    "湅",
    "湢",
    "渿",
    "湁",
    "渜",
    "渳",
    "湀",
    "渻",
    "渮",
    "湨",
    "湡",
    "渱",
    "渨",
    "湠",
    "湱",
    "湩",
    "渹",
    "溛",
    "滖",
    "溓",
    "溔",
    "滒",
    "溰",
    "溾",
    "滜",
    "滵",
    "滱",
    "漃",
    "漥",
    "漮",
    "潎",
    "漙",
    "漧",
    "漘",
    "漒",
    "滭",
    "漊",
    "潳",
    "滮",
    "潀",
    "漰",
    "潃",
    "漅",
    "濆",
    "澒",
    "澅",
    "潚",
    "潠",
    "澖",
    "潶",
    "潬",
    "潒",
    "潐",
    "潗",
    "澓",
    "潝",
    "濇",
    "濎",
    "濈",
    "濄",
    "澞",
    "澨",
    "瀄",
    "濌",
    "澩",
    "濴",
    "濔",
    "濣",
    "濭",
    "濧",
    "濦",
    "瀇",
    "瀎",
    "濿",
    "瀀",
    "濻",
    "瀙",
    "瀖",
    "瀫",
    "瀡",
    "瀢",
    "瀩",
    "瀯",
    "瀷",
    "灂",
    "瀸",
    "瀿",
    "瀺",
    "灄",
    "灉",
    "灖",
    "灗",
    "灛",
    "灟",
    "灨",
    "灩",
    "灪",
    "炾",
    "炰",
    "烓",
    "烑",
    "缹",
    "焍",
    "烰",
    "焠",
    "焮",
    "焣",
    "煆",
    "煣",
    "煝",
    "熐",
    "熉",
    "熀",
    "熂",
    "熚",
    "燅",
    "燂",
    "熸",
    "燀",
    "燡",
    "爁",
    "爊",
    "爂",
    "爓",
    "爞",
    "爢",
    "爣",
    "牄",
    "牉",
    "牋",
    "牏",
    "牣",
    "牬",
    "牰",
    "牸",
    "牷",
    "犈",
    "犉",
    "犆",
    "犅",
    "犌",
    "犑",
    "犐",
    "犗",
    "犕",
    "犓",
    "犘",
    "犚",
    "犝",
    "犞",
    "犥",
    "犦",
    "犤",
    "犣",
    "犩",
    "犪",
    "犮",
    "犵",
    "犿",
    "狆",
    "狖",
    "狋",
    "狘",
    "狜",
    "狔",
    "狚",
    "狌",
    "狑",
    "狊",
    "狤",
    "狫",
    "狪",
    "狣",
    "猀",
    "狾",
    "猑",
    "猘",
    "猈",
    "狿",
    "猏",
    "猋",
    "猒",
    "猧",
    "猲",
    "猭",
    "猦",
    "猣",
    "猵",
    "猼",
    "獂",
    "獀",
    "獊",
    "獑",
    "獌",
    "獘",
    "獞",
    "獟",
    "獝",
    "獛",
    "獡",
    "獩",
    "獦",
    "獥",
    "獳",
    "獶",
    "獽",
    "獿",
    "玂",
    "玁",
    "玈",
    "玊",
    "玔",
    "珓",
    "珶",
    "琖",
    "瑵",
    "璊",
    "瑽",
    "璅",
    "瑿",
    "璗",
    "瓁",
    "瓋",
    "瓝",
    "瓟",
    "瓡",
    "瓥",
    "瓨",
    "瓬",
    "瓵",
    "瓾",
    "瓽",
    "甀",
    "甃",
    "甈",
    "甋",
    "甐",
    "甒",
    "甔",
    "甖",
    "甝",
    "甮",
    "甿",
    "畟",
    "畣",
    "畽",
    "疀",
    "疧",
    "痁",
    "疻",
    "痀",
    "痎",
    "痏",
    "痋",
    "痌",
    "痑",
    "痚",
    "痡",
    "痝",
    "痗",
    "痯",
    "瘏",
    "痷",
    "痸",
    "痻",
    "瘈",
    "瘑",
    "瘝",
    "瘣",
    "瘯",
    "瘱",
    "瘽",
    "癈",
    "癉",
    "癙",
    "癐",
    "癓",
    "癠",
    "癵",
    "癹",
    "皊",
    "皏",
    "皫",
    "皯",
    "皵",
    "皻",
    "皽",
    "皾",
    "盄",
    "盓",
    "盝",
    "盬",
    "盭",
    "盳",
    "眃",
    "眅",
    "盻",
    "眝",
    "眐",
    "眓",
    "眒",
    "眣",
    "眑",
    "眕",
    "眹",
    "眱",
    "眲",
    "眴",
    "眳",
    "眽",
    "睆",
    "睅",
    "睊",
    "睋",
    "睌",
    "睕",
    "睟",
    "睒",
    "睖",
    "睩",
    "睧",
    "睔",
    "瞁",
    "睼",
    "瞂",
    "睮",
    "睯",
    "瞏",
    "瞉",
    "瞚",
    "瞝",
    "瞡",
    "瞛",
    "瞲",
    "瞷",
    "瞶",
    "瞴",
    "矂",
    "矉",
    "矊",
    "矌",
    "矎",
    "矏",
    "矐",
    "矔",
    "矕",
    "矘",
    "矠",
    "矱",
    "矲",
    "矹",
    "矺",
    "砅",
    "砐",
    "砏",
    "砎",
    "砨",
    "硈",
    "硉",
    "硠",
    "硥",
    "硱",
    "硰",
    "硩",
    "碔",
    "碄",
    "碅",
    "碆",
    "硾",
    "碫",
    "碞",
    "磍",
    "磌",
    "磎",
    "磈",
    "磃",
    "磝",
    "磩",
    "磥",
    "磞",
    "磛",
    "磳",
    "磼",
    "磿",
    "礔",
    "礉",
    "礝",
    "礛",
    "礜",
    "礥",
    "礣",
    "礧",
    "礨",
    "礭",
    "礿",
    "祌",
    "祅",
    "祔",
    "祒",
    "祑",
    "祤",
    "祩",
    "祪",
    "祣",
    "祫",
    "祡",
    "祴",
    "祳",
    "禂",
    "禗",
    "禜",
    "禫",
    "禭",
    "禬",
    "禴",
    "禷",
    "禸",
    "歶",
    "秅",
    "秏",
    "秖",
    "秎",
    "秮",
    "秪",
    "秺",
    "秶",
    "稊",
    "稒",
    "稫",
    "穊",
    "稰",
    "稯",
    "穋",
    "穛",
    "穖",
    "穧",
    "穨",
    "穮",
    "穬",
    "穭",
    "穱",
    "穾",
    "窆",
    "窉",
    "窌",
    "窏",
    "窔",
    "窐",
    "窙",
    "窢",
    "窞",
    "窫",
    "窲",
    "窴",
    "窱",
    "窾",
    "竀",
    "竁",
    "竷",
    "笐",
    "笓",
    "笅",
    "笵",
    "笻",
    "笴",
    "笰",
    "笢",
    "笝",
    "笲",
    "筄",
    "筡",
    "箈",
    "箊",
    "箌",
    "箛",
    "箎",
    "箘",
    "箄",
    "箷",
    "箾",
    "篎",
    "箯",
    "箹",
    "篞",
    "篣",
    "篧",
    "篕",
    "篨",
    "篹",
    "簅",
    "篲",
    "篿",
    "篻",
    "簎",
    "篴",
    "簂",
    "簁",
    "篸",
    "篽",
    "簜",
    "簩",
    "簙",
    "簭",
    "簦",
    "簨",
    "簢",
    "簥",
    "簳",
    "簼",
    "簬",
    "簻",
    "籉",
    "籈",
    "籊",
    "籔",
    "籗",
    "籧",
    "籦",
    "籯",
    "籺",
    "籸",
    "籹",
    "粊",
    "粔",
    "粻",
    "糔",
    "糪",
    "糱",
    "糷",
    "紎",
    "紟",
    "紒",
    "紽",
    "紸",
    "紶",
    "紩",
    "絇",
    "紾",
    "絘",
    "絯",
    "絓",
    "絧",
    "絏",
    "絭",
    "絫",
    "綀",
    "綍",
    "絿",
    "綅",
    "絻",
    "絼",
    "綔",
    "綷",
    "緂",
    "綪",
    "緀",
    "緅",
    "緎",
    "緆",
    "緌",
    "綯",
    "綼",
    "緷",
    "緛",
    "緪",
    "緧",
    "縃",
    "緺",
    "緶",
    "緰",
    "縗",
    "縌",
    "縓",
    "縎",
    "縜",
    "縚",
    "縏",
    "縼",
    "繂",
    "縳",
    "顈",
    "繈",
    "縸",
    "縪",
    "繉",
    "繀",
    "縩",
    "緵",
    "縰",
    "縿",
    "縶",
    "繜",
    "繐",
    "繣",
    "繘",
    "繢",
    "繟",
    "繑",
    "繠",
    "繶",
    "繵",
    "繸",
    "繷",
    "繺",
    "繲",
    "繴",
    "纀",
    "纇",
    "纋",
    "纆",
    "纑",
    "纗",
    "纚",
    "缿",
    "罊",
    "罏",
    "罜",
    "罞",
    "罝",
    "罛",
    "罣",
    "罥",
    "罦",
    "罭",
    "罫",
    "罬",
    "罻",
    "罼",
    "罺",
    "罿",
    "羃",
    "羉",
    "羍",
    "羒",
    "羜",
    "羛",
    "羢",
    "羠",
    "羦",
    "羬",
    "羭",
    "羵",
    "羳",
    "羷",
    "羺",
    "羾",
    "翋",
    "翍",
    "翐",
    "翑",
    "翇",
    "翢",
    "翣",
    "翭",
    "翪",
    "翨",
    "翴",
    "翲",
    "翽",
    "翿",
    "耟",
    "耞",
    "耡",
    "耴",
    "耾",
    "耹",
    "聇",
    "聈",
    "聑",
    "聏",
    "聝",
    "肕",
    "肙",
    "肒",
    "肣",
    "肵",
    "胘",
    "胑",
    "胐",
    "胕",
    "胉",
    "胏",
    "胹",
    "胵",
    "脁",
    "胻",
    "脀",
    "胾",
    "胔",
    "脰",
    "脥",
    "脤",
    "脙",
    "脡",
    "脕",
    "脧",
    "腃",
    "腏",
    "腄",
    "腇",
    "脽",
    "腍",
    "腤",
    "腷",
    "腜",
    "腛",
    "腢",
    "腲",
    "朡",
    "腞",
    "腶",
    "膉",
    "膆",
    "膃",
    "膇",
    "膍",
    "膌",
    "膋",
    "膟",
    "膕",
    "膢",
    "膱",
    "膹",
    "膫",
    "膰",
    "膬",
    "膴",
    "膲",
    "臇",
    "膷",
    "臄",
    "臅",
    "臒",
    "臐",
    "臗",
    "臛",
    "臡",
    "臦",
    "臩",
    "臮",
    "臲",
    "臷",
    "臸",
    "臿",
    "舋",
    "舑",
    "舕",
    "舝",
    "舡",
    "舼",
    "舽",
    "艀",
    "艂",
    "艓",
    "艒",
    "艐",
    "艑",
    "艕",
    "艛",
    "艵",
    "艼",
    "芀",
    "芐",
    "芅",
    "芓",
    "芔",
    "苀",
    "芚",
    "芵",
    "芧",
    "芞",
    "芺",
    "苙",
    "苨",
    "苖",
    "苬",
    "苲",
    "苵",
    "苶",
    "茙",
    "茥",
    "茿",
    "茦",
    "茢",
    "荂",
    "茪",
    "荍",
    "茖",
    "茤",
    "茠",
    "茩",
    "茻",
    "莐",
    "莣",
    "莍",
    "荺",
    "莤",
    "荴",
    "莏",
    "莁",
    "荵",
    "莔",
    "莃",
    "莌",
    "莋",
    "荾",
    "莥",
    "菨",
    "萒",
    "菧",
    "菤",
    "菆",
    "菣",
    "菿",
    "菋",
    "菎",
    "菵",
    "萉",
    "菞",
    "菳",
    "菕",
    "蓱",
    "萿",
    "葹",
    "葥",
    "葀",
    "葧",
    "萰",
    "葍",
    "葽",
    "蔇",
    "葞",
    "萷",
    "萺",
    "萴",
    "葅",
    "菙",
    "葋",
    "萯",
    "葂",
    "葟",
    "葌",
    "蓎",
    "蒬",
    "蒮",
    "蒫",
    "蒪",
    "蒚",
    "蒝",
    "蓌",
    "蒛",
    "蒩",
    "蒘",
    "蒶",
    "蒠",
    "蔤",
    "蔏",
    "蔩",
    "蔉",
    "蔍",
    "蔧",
    "蔜",
    "蓻",
    "蓺",
    "蓴",
    "蔪",
    "蓲",
    "蓷",
    "蓫",
    "蔒",
    "蓩",
    "蔖",
    "蓾",
    "蔨",
    "蔮",
    "蔂",
    "蓶",
    "蔱",
    "蓹",
    "蔠",
    "蔰",
    "蕫",
    "蕍",
    "蕀",
    "蕆",
    "蕄",
    "蕇",
    "蕣",
    "蕛",
    "蕱",
    "蕵",
    "蕮",
    "蕧",
    "蕠",
    "蕦",
    "蕝",
    "薃",
    "薧",
    "薕",
    "薠",
    "薋",
    "薣",
    "薚",
    "蕼",
    "薉",
    "蕸",
    "薎",
    "薖",
    "薍",
    "薝",
    "薂",
    "藆",
    "藀",
    "藃",
    "藂",
    "薵",
    "薽",
    "藇",
    "藄",
    "藋",
    "藈",
    "藅",
    "薱",
    "薶",
    "藒",
    "藫",
    "藱",
    "藙",
    "藡",
    "藚",
    "藗",
    "藲",
    "藬",
    "藘",
    "藣",
    "藑",
    "藰",
    "蘁",
    "藾",
    "蘛",
    "蘉",
    "蘌",
    "蘪",
    "蘦",
    "蘟",
    "蘣",
    "蘜",
    "蘙",
    "蘮",
    "蘡",
    "蘠",
    "蘥",
    "蘴",
    "蘳",
    "蘬",
    "虀",
    "蘹",
    "蘱",
    "蘻",
    "蘾",
    "虃",
    "虆",
    "虇",
    "虈",
    "虌",
    "虋",
    "虙",
    "虡",
    "虣",
    "虩",
    "虪",
    "虰",
    "虭",
    "虴",
    "蚑",
    "蚞",
    "蚇",
    "蚗",
    "蚚",
    "蚅",
    "蚥",
    "蚙",
    "蚿",
    "蚷",
    "蛂",
    "蛁",
    "蛅",
    "蛈",
    "蚹",
    "蚳",
    "蚸",
    "蛌",
    "蚻",
    "蛢",
    "蛦",
    "蛓",
    "蛣",
    "蛚",
    "蛪",
    "蛝",
    "蛫",
    "蛜",
    "蛬",
    "蛗",
    "蜄",
    "蛷",
    "蜌",
    "蛖",
    "蛵",
    "蜁",
    "蛶",
    "蜳",
    "蝫",
    "蜙",
    "蝃",
    "蜬",
    "蝁",
    "蝆",
    "蜠",
    "蜲",
    "蜪",
    "蜭",
    "蜼",
    "蜵",
    "蝂",
    "蜦",
    "蜧",
    "蜸",
    "蜤",
    "蜰",
    "蝖",
    "蝷",
    "蟡",
    "蝳",
    "蝔",
    "蝛",
    "蝒",
    "蝑",
    "蝞",
    "蝭",
    "蝪",
    "蝐",
    "蝝",
    "蝬",
    "蝺",
    "蝜",
    "螛",
    "螏",
    "螓",
    "螒",
    "螁",
    "螖",
    "螘",
    "蝹",
    "螇",
    "螑",
    "螝",
    "螜",
    "螚",
    "螪",
    "螰",
    "螹",
    "螼",
    "螮",
    "蟉",
    "蟃",
    "蟂",
    "螷",
    "螴",
    "螿",
    "螸",
    "蟞",
    "蟧",
    "蟦",
    "蟢",
    "蟟",
    "蟤",
    "蟔",
    "蟓",
    "蟭",
    "蟘",
    "螤",
    "蟗",
    "蟙",
    "蠁",
    "蟨",
    "蠀",
    "蟺",
    "蠉",
    "蠌",
    "蟼",
    "蠈",
    "蟿",
    "蠗",
    "蠩",
    "蠝",
    "蠛",
    "蠠",
    "蠤",
    "蠜",
    "蠫",
    "蠬",
    "蠨",
    "蠦",
    "蠪",
    "蠥",
    "蠰",
    "蠮",
    "蠳",
    "蠸",
    "蠾",
    "蠽",
    "蠿",
    "衁",
    "衈",
    "衋",
    "衧",
    "衪",
    "衭",
    "衶",
    "袀",
    "衱",
    "衯",
    "袃",
    "袉",
    "袕",
    "袨",
    "袚",
    "袑",
    "袡",
    "袘",
    "袧",
    "袬",
    "袌",
    "袺",
    "裗",
    "袹",
    "袸",
    "裀",
    "袶",
    "袽",
    "袲",
    "裋",
    "裍",
    "裞",
    "裚",
    "裷",
    "裧",
    "裺",
    "裮",
    "裶",
    "裯",
    "裻",
    "褁",
    "褅",
    "褋",
    "褗",
    "褆",
    "褖",
    "褑",
    "褦",
    "褮",
    "褱",
    "褢",
    "褩",
    "褵",
    "褼",
    "褾",
    "襒",
    "褷",
    "襂",
    "褽",
    "襓",
    "襋",
    "襆",
    "襐",
    "襛",
    "襗",
    "襡",
    "襘",
    "襝",
    "襣",
    "襭",
    "襩",
    "襮",
    "襳",
    "襹",
    "襺",
    "覂",
    "覅",
    "覕",
    "覛",
    "覝",
    "覢",
    "覤",
    "覣",
    "覭",
    "覮",
    "覶",
    "觓",
    "觤",
    "觡",
    "觠",
    "觢",
    "觩",
    "觰",
    "觬",
    "觲",
    "觷",
    "觺",
    "觻",
    "觼",
    "觾",
    "訑",
    "訰",
    "訧",
    "訬",
    "訞",
    "詍",
    "訹",
    "詙",
    "詀",
    "詄",
    "詅",
    "訿",
    "誂",
    "詻",
    "誃",
    "誫",
    "誙",
    "誋",
    "諆",
    "誸",
    "諔",
    "諕",
    "誻",
    "諀",
    "諅",
    "諵",
    "諝",
    "諰",
    "諈",
    "謞",
    "謘",
    "謑",
    "謋",
    "謒",
    "謕",
    "謍",
    "謈",
    "謪",
    "謧",
    "謣",
    "謰",
    "謵",
    "譇",
    "謯",
    "謱",
    "謥",
    "謷",
    "謦",
    "譐",
    "譈",
    "譊",
    "譀",
    "譋",
    "譕",
    "譑",
    "譠",
    "譪",
    "譝",
    "譨",
    "譣",
    "譥",
    "譹",
    "譸",
    "譅",
    "譺",
    "譻",
    "譾",
    "讄",
    "讂",
    "讆",
    "讋",
    "讔",
    "讘",
    "讟",
    "谹",
    "谻",
    "谽",
    "谾",
    "豃",
    "豋",
    "豍",
    "豏",
    "豗",
    "豜",
    "豝",
    "豟",
    "豥",
    "豤",
    "豦",
    "豭",
    "豰",
    "豲",
    "豱",
    "豯",
    "豵",
    "豷",
    "豶",
    "豻",
    "豽",
    "貁",
    "貀",
    "貄",
    "貏",
    "貑",
    "貕",
    "貙",
    "貗",
    "貜",
    "貣",
    "貾",
    "賌",
    "賥",
    "賟",
    "賙",
    "賵",
    "賮",
    "贆",
    "贕",
    "贙",
    "赨",
    "赩",
    "赮",
    "赸",
    "趀",
    "趌",
    "趎",
    "趏",
    "趍",
    "趓",
    "趠",
    "趜",
    "趡",
    "趥",
    "趧",
    "趬",
    "趪",
    "趭",
    "趫",
    "趮",
    "趷",
    "趹",
    "跘",
    "跓",
    "跍",
    "跇",
    "跜",
    "跕",
    "跙",
    "跈",
    "跰",
    "跠",
    "跮",
    "跦",
    "跢",
    "跧",
    "跲",
    "跫",
    "踂",
    "跿",
    "踍",
    "踃",
    "踇",
    "踆",
    "跾",
    "踠",
    "踥",
    "踤",
    "踡",
    "踕",
    "踛",
    "踖",
    "踑",
    "踙",
    "踧",
    "踘",
    "踓",
    "踳",
    "踾",
    "踸",
    "踼",
    "蹎",
    "蹍",
    "蹓",
    "蹗",
    "蹖",
    "蹞",
    "蹥",
    "蹛",
    "蹡",
    "蹝",
    "蹔",
    "蹸",
    "蹳",
    "蹪",
    "躆",
    "躈",
    "躖",
    "躗",
    "躟",
    "躠",
    "躤",
    "躣",
    "躩",
    "躨",
    "躽",
    "軓",
    "軘",
    "軞",
    "軯",
    "軷",
    "軦",
    "軮",
    "軥",
    "軵",
    "軧",
    "軨",
    "軶",
    "軱",
    "軬",
    "輆",
    "軿",
    "輁",
    "輀",
    "輂",
    "輐",
    "輑",
    "輤",
    "輘",
    "輚",
    "輠",
    "輣",
    "輖",
    "輗",
    "輮",
    "輵",
    "輲",
    "輹",
    "輷",
    "輴",
    "轃",
    "轇",
    "轈",
    "轒",
    "轑",
    "轏",
    "轐",
    "轓",
    "轙",
    "轖",
    "轗",
    "轕",
    "轚",
    "轞",
    "轛",
    "轠",
    "辴",
    "迉",
    "迒",
    "迋",
    "迍",
    "迖",
    "迣",
    "迡",
    "迾",
    "迿",
    "逜",
    "逿",
    "遝",
    "遳",
    "遰",
    "遻",
    "邆",
    "邅",
    "遾",
    "邍",
    "邔",
    "邟",
    "邥",
    "邞",
    "邧",
    "郱",
    "郕",
    "郖",
    "郠",
    "郙",
    "郣",
    "郥",
    "郘",
    "郰",
    "郲",
    "郔",
    "鄬",
    "郼",
    "鄈",
    "郹",
    "郻",
    "鄁",
    "鄇",
    "郺",
    "鄐",
    "鄍",
    "鄏",
    "鄎",
    "鄟",
    "鄝",
    "鄡",
    "鄛",
    "鄨",
    "鄪",
    "鄦",
    "鄮",
    "鄵",
    "鄸",
    "鄻",
    "鄾",
    "酀",
    "酁",
    "酄",
    "酇",
    "酖",
    "酘",
    "酓",
    "酟",
    "酳",
    "醆",
    "醊",
    "醓",
    "醙",
    "醟",
    "醥",
    "醧",
    "醰",
    "醱",
    "醷",
    "醲",
    "醳",
    "醹",
    "醽",
    "釂",
    "釃",
    "釢",
    "釱",
    "釳",
    "釸",
    "鈚",
    "鈌",
    "鈒",
    "釽",
    "鈆",
    "鉒",
    "鉠",
    "鉯",
    "鈶",
    "鉼",
    "銤",
    "銛",
    "銔",
    "鉹",
    "銗",
    "鋄",
    "鋀",
    "鋟",
    "鋘",
    "鋩",
    "鋝",
    "鋂",
    "鋊",
    "錧",
    "錼",
    "錭",
    "錎",
    "鋋",
    "鎡",
    "鎃",
    "鎯",
    "鍖",
    "鍜",
    "鍐",
    "鍭",
    "鍌",
    "鎒",
    "鎷",
    "鎝",
    "鎉",
    "鎎",
    "鎞",
    "鏏",
    "鏂",
    "鏚",
    "鏬",
    "鏙",
    "鐋",
    "鐏",
    "鏾",
    "鐕",
    "鐨",
    "鐍",
    "鐀",
    "鐎",
    "鐖",
    "鐻",
    "鐶",
    "鑐",
    "鑋",
    "鑕",
    "鑮",
    "鑯",
    "钂",
    "钀",
    "钁",
    "钃",
    "镺",
    "镻",
    "镼",
    "镽",
    "閈",
    "閍",
    "閺",
    "閵",
    "闀",
    "闉",
    "闅",
    "閷",
    "闒",
    "闑",
    "闚",
    "闛",
    "闠",
    "闟",
    "闤",
    "阞",
    "阢",
    "阤",
    "阠",
    "阰",
    "阹",
    "阸",
    "阺",
    "陏",
    "陓",
    "陊",
    "陼",
    "陭",
    "陫",
    "隇",
    "陾",
    "隉",
    "隒",
    "隓",
    "隞",
    "隤",
    "隿",
    "雂",
    "雈",
    "雓",
    "雔",
    "雗",
    "雚",
    "雟",
    "雘",
    "雺",
    "雽",
    "雿",
    "霂",
    "霋",
    "霒",
    "霐",
    "霠",
    "霣",
    "霢",
    "霩",
    "霫",
    "霬",
    "霮",
    "霵",
    "霿",
    "靆",
    "靃",
    "靪",
    "靮",
    "靷",
    "靲",
    "靾",
    "鞃",
    "鞀",
    "鞂",
    "靻",
    "鞊",
    "鞎",
    "鞈",
    "鞙",
    "鞗",
    "鞚",
    "鞜",
    "鞤",
    "鞪",
    "鞷",
    "鞶",
    "鞹",
    "鞻",
    "鞿",
    "韄",
    "韅",
    "韇",
    "韎",
    "韐",
    "韏",
    "韕",
    "韔",
    "韗",
    "韝",
    "韟",
    "韣",
    "韥",
    "韰",
    "韱",
    "韹",
    "韽",
    "頄",
    "頖",
    "頞",
    "頝",
    "頩",
    "頨",
    "頯",
    "頲",
    "顁",
    "顄",
    "顊",
    "顉",
    "顅",
    "顐",
    "顑",
    "顜",
    "顝",
    "顠",
    "顣",
    "顟",
    "顤",
    "顪",
    "顩",
    "顲",
    "颬",
    "颲",
    "颸",
    "颽",
    "颻",
    "颾",
    "飁",
    "飂",
    "飉",
    "飋",
    "飌",
    "飣",
    "飶",
    "餂",
    "餀",
    "飺",
    "餔",
    "餖",
    "餕",
    "餤",
    "餟",
    "餥",
    "餫",
    "餪",
    "餲",
    "餯",
    "餭",
    "餱",
    "餰",
    "饁",
    "饇",
    "饐",
    "饎",
    "饙",
    "饘",
    "饛",
    "饡",
    "馣",
    "馲",
    "馰",
    "馵",
    "馻",
    "馺",
    "駂",
    "馽",
    "駜",
    "駍",
    "駏",
    "駎",
    "駖",
    "駮",
    "駬",
    "駥",
    "駤",
    "駣",
    "駩",
    "駺",
    "駴",
    "駷",
    "駹",
    "駶",
    "駻",
    "駽",
    "駾",
    "騃",
    "騉",
    "騑",
    "騊",
    "騇",
    "騚",
    "騕",
    "騥",
    "騝",
    "騛",
    "騢",
    "騠",
    "騧",
    "騞",
    "騜",
    "騵",
    "騲",
    "騴",
    "騱",
    "騬",
    "騪",
    "騩",
    "騹",
    "騽",
    "驆",
    "騺",
    "驓",
    "驔",
    "驈",
    "驉",
    "驖",
    "驞",
    "驠",
    "驦",
    "驨",
    "骭",
    "骫",
    "骹",
    "骿",
    "骴",
    "骾",
    "髇",
    "髊",
    "髆",
    "髍",
    "髐",
    "髟",
    "髧",
    "髬",
    "髳",
    "髶",
    "髺",
    "髾",
    "鬁",
    "髼",
    "鬋",
    "鬊",
    "鬎",
    "鬌",
    "鬐",
    "鬕",
    "鬗",
    "鬖",
    "鬙",
    "鬞",
    "鬠",
    "鬤",
    "鬫",
    "鬳",
    "鬵",
    "鬺",
    "鬾",
    "鬿",
    "魊",
    "魌",
    "魖",
    "魠",
    "魡",
    "魧",
    "魱",
    "魦",
    "魶",
    "魵",
    "鮅",
    "鮇",
    "魼",
    "魾",
    "魻",
    "鮂",
    "鮚",
    "鮞",
    "鮛",
    "鮦",
    "鮥",
    "鮤",
    "鮆",
    "鯆",
    "鮿",
    "鮵",
    "鯈",
    "鯫",
    "鯠",
    "鯞",
    "鯦",
    "鯬",
    "鰌",
    "鰋",
    "鰅",
    "鯸",
    "鰫",
    "鰝",
    "鰬",
    "鱆",
    "鰿",
    "鱄",
    "鱁",
    "鰴",
    "鱐",
    "鱍",
    "鱋",
    "鱕",
    "鱦",
    "鱢",
    "鱞",
    "鱴",
    "鱳",
    "鱹",
    "鳦",
    "鳪",
    "鳭",
    "鳱",
    "鳵",
    "鳼",
    "鳺",
    "鳿",
    "鳷",
    "鴀",
    "鳹",
    "鳻",
    "鴅",
    "鴃",
    "鴥",
    "鴠",
    "鴔",
    "鴩",
    "鴘",
    "鴢",
    "鴐",
    "鴳",
    "鵁",
    "鵧",
    "鴶",
    "鴮",
    "鴱",
    "鴸",
    "鵅",
    "鵃",
    "鴾",
    "鵀",
    "鴽",
    "鵏",
    "鵊",
    "鵛",
    "鵋",
    "鵖",
    "鵌",
    "鵗",
    "鵔",
    "鵷",
    "鶁",
    "鶊",
    "鶄",
    "鶈",
    "鵱",
    "鶀",
    "鵸",
    "鶋",
    "鶌",
    "鵽",
    "鵫",
    "鵴",
    "鵩",
    "鶅",
    "鵳",
    "鵻",
    "鶂",
    "鵹",
    "鶟",
    "鶙",
    "鶤",
    "鶝",
    "鶐",
    "鶛",
    "鶠",
    "鶔",
    "鶜",
    "鶪",
    "鶗",
    "鶢",
    "鶨",
    "鶞",
    "鶣",
    "鶖",
    "鶷",
    "鶶",
    "鷁",
    "鷇",
    "鷊",
    "鷏",
    "鶾",
    "鷅",
    "鷃",
    "鶵",
    "鷈",
    "鶱",
    "鶭",
    "鷛",
    "鷒",
    "鷞",
    "鷋",
    "鷐",
    "鷜",
    "鷑",
    "鷩",
    "鷘",
    "鷖",
    "鷵",
    "鷕",
    "鷻",
    "鷷",
    "鷣",
    "鷤",
    "鷶",
    "鷡",
    "鷮",
    "鷢",
    "鸂",
    "鷾",
    "鸇",
    "鸃",
    "鸆",
    "鸅",
    "鸀",
    "鸁",
    "鸉",
    "鷿",
    "鷽",
    "鸄",
    "鸋",
    "鸍",
    "鸏",
    "鸒",
    "鸔",
    "鸓",
    "鸗",
    "鸙",
    "鹺",
    "麃",
    "麆",
    "麉",
    "麎",
    "麌",
    "麔",
    "麙",
    "麛",
    "麚",
    "麜",
    "麠",
    "麡",
    "麧",
    "麮",
    "麰",
    "麶",
    "麷",
    "黀",
    "黂",
    "黈",
    "黓",
    "黕",
    "黖",
    "黚",
    "黤",
    "黫",
    "黮",
    "黭",
    "黰",
    "黳",
    "黵",
    "黺",
    "鼁",
    "鼀",
    "鼆",
    "鼊",
    "鼏",
    "鼖",
    "鼛",
    "鼘",
    "鼜",
    "鼤",
    "鼣",
    "鼥",
    "鼪",
    "鼨",
    "鼭",
    "鼰",
    "鼮",
    "鼵",
    "鼳",
    "鼲",
    "鼸",
    "鼶",
    "齀",
    "齂",
    "齃",
    "齌",
    "齍",
    "齎",
    "齖",
    "齗",
    "齘",
    "齛",
    "齠",
    "齞",
    "齝",
    "齥",
    "齤",
    "齫",
    "齱",
    "齰",
    "齮",
    "齯",
    "齴",
    "齵",
    "齸",
    "齻",
    "齺",
    "齹",
    "齾",
    "龒",
    "龤",
    "堔",
    "礂",
    "蒏",
    "蒆",
    "兙",
    "兛",
    "兞",
    "兝",
    "兡",
    "兣",
    "嗧",
    "瓩",
    "忼",
    "擡",
    "氊",
    "穇",
    "擧",
    "譌",
    "!",
    "\"",
    "#",
    "$",
    "%",
    "&",
    "'",
    "(",
    ")",
    "*",
    "+",
    ",",
    "-",
    ".",
    "/",
    "0",
    "1",
    "2",
    "3",
    "4",
    "5",
    "6",
    "7",
    "8",
    "9",
    ":",
    ";",
    "<",
    "=",
    ">",
    "?",
    "A",
    "B",
    "C",
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
    "K",
    "L",
    "M",
    "N",
    "O",
    "P",
    "Q",
    "R",
    "S",
    "T",
    "U",
    "V",
    "W",
    "X",
    "Y",
    "Z",
    "[",
    "]",
    "_",
    "`",
    "a",
    "b",
    "c",
    "d",
    "e",
    "f",
    "g",
    "h",
    "i",
    "j",
    "k",
    "l",
    "m",
    "n",
    "o",
    "p",
    "q",
    "r",
    "s",
    "t",
    "u",
    "v",
    "w",
    "x",
    "y",
    "z",
    "©",
    "°",
    "²",
    "´",
    "½",
    "Á",
    "Ä",
    "Å",
    "Ç",
    "È",
    "É",
    "Í",
    "Ó",
    "Ö",
    "×",
    "Ü",
    "ß",
    "à",
    "á",
    "â",
    "ã",
    "ä",
    "å",
    "æ",
    "ç",
    "è",
    "é",
    "ê",
    "ë",
    "í",
    "ð",
    "ñ",
    "ò",
    "ó",
    "ô",
    "õ",
    "ö",
    "ø",
    "ú",
    "û",
    "ü",
    "ý",
    "ā",
    "ă",
    "ą",
    "ć",
    "Č",
    "č",
    "đ",
    "ē",
    "ė",
    "ę",
    "ğ",
    "ī",
    "ı",
    "Ł",
    "ł",
    "ń",
    "ň",
    "ō",
    "ř",
    "Ş",
    "ş",
    "Š",
    "š",
    "ţ",
    "ū",
    "ż",
    "Ž",
    "ž",
    "Ș",
    "ș",
    "ț",
    "Δ",
    "α",
    "λ",
    "μ",
    "φ",
    "Г",
    "О",
    "а",
    "в",
    "л",
    "о",
    "р",
    "с",
    "т",
    "я",
    "ồ",
    "—",
    "―",
    "’",
    "“",
    "”",
    "…",
    "℃",
    "→",
    "∇",
    "−",
    "■",
    "☆",
    "、",
    "。",
    "々",
    "〆",
    "〈",
    "〉",
    "「",
    "」",
    "『",
    "』",
    "〔",
    "〕",
    "〜",
    "！",
    "＃",
    "％",
    "＆",
    "（",
    "）",
    "＋",
    "，",
    "－",
    "．",
    "／",
    "０",
    "１",
    "２",
    "３",
    "４",
    "５",
    "６",
    "７",
    "８",
    "９",
    "：",
    "；",
    "＝",
    "？",
    "＠",
    "Ａ",
    "Ｂ",
    "Ｃ",
    "Ｄ",
    "Ｅ",
    "Ｆ",
    "Ｇ",
    "Ｈ",
    "Ｉ",
    "Ｊ",
    "Ｋ",
    "Ｌ",
    "Ｍ",
    "Ｎ",
    "Ｏ",
    "Ｐ",
    "Ｒ",
    "Ｓ",
    "Ｔ",
    "Ｕ",
    "Ｖ",
    "Ｗ",
    "Ｘ",
    "Ｚ",
    "ａ",
    "ｂ",
    "ｃ",
    "ｄ",
    "ｅ",
    "ｆ",
    "ｇ",
    "ｈ",
    "ｉ",
    "ｊ",
    "ｋ",
    "ｌ",
    "ｍ",
    "ｎ",
    "ｏ",
    "ｐ",
    "ｑ",
    "ｒ",
    "ｓ",
    "ｔ",
    "ｕ",
    "ｖ",
    "ｗ",
    "ｘ",
    "ｙ",
    "ｚ",
    "～",
    "･",
    "ǎ",
    "ǒ",
    "ě",
    "ǐ",
    "ì",
    "ǔ",
    "ù",
    "ǖ",
    "ǘ",
    "ǚ",
    "ǜ",
    "【",
    "】",
    "《",
    "》",
    "‥",
    "{",
    "}",
    "\\",
    "|",
    "@",
    "^",
    "~",
    "÷",
    "∕",
    "∙",
    "⋅",
    "·",
    "⊕",
    "⊖",
    "⊗",
    "⊘",
    "⊙",
    "±",
    "∓",
    "∩",
    "∪",
    "□",
    "⊎",
    "⊓",
    "⊔",
    "≠",
    "≈",
    "≡",
    "≤",
    "≥",
    "≪",
    "≫",
    "≲",
    "≳",
    "≶",
    "≷",
    "≺",
    "≻",
    "≼",
    "≽",
    "∈",
    "∉",
    "⊂",
    "⊃",
    "⊆",
    "⊇",
    "⊄",
    "⊅",
    "∅",
    "∖",
    "∁",
    "∆",
    "∧",
    "∨",
    "¬",
    "⊻",
    "⊼",
    "⊽",
    "←",
    "↔",
    "⇒",
    "⇐",
    "⇔",
    "∀",
    "∃",
    "∄",
    "∴",
    "∵",
    "∝",
    "∞",
    "⊥",
    "∟",
    "∠",
    "∡",
    "∢",
    "′",
    "″",
    "∥",
    "⊾",
    "⊿",
    "∂",
    "∫",
    "∬",
    "∭",
    "∮",
    "∯",
    "∰",
    "∑",
    "∏",
    "√",
    "∛",
    "∜",
    "∱",
    "∲",
    "∳",
    "∶",
    "∷",
    "∼",
    "®",
    "≄",
    "≅",
    "≃",
    "≦",
    "≧",
    "⊈",
    "⊉",
    "⊢",
    "⊤",
    "⊨",
    "⊧",
    "℉",
    "Ω",
    "℧",
    "Å",
    "⌀",
    "ℏ",
    "⅀",
    "⍺",
    "⍵",
    "¢",
    "€",
    "£",
    "¥",
    "￥",
    "₿",
    "↑",
    "↓",
    "↕",
    "↖",
    "↗",
    "↘",
    "↙",
    "↺",
    "↻",
    "↼",
    "↽",
    "↾",
    "↿",
    "⇀",
    "⇁",
    "⇂",
    "⇃",
    "⇋",
    "⇌",
    "ª",
    "º",
    "⁰",
    "¹",
    "³",
    "⁴",
    "⁵",
    "⁶",
    "⁷",
    "⁸",
    "⁹",
    "⁺",
    "⁻",
    "⁼",
    "⁽",
    "⁾",
    "ⁿ",
    "₀",
    "₁",
    "₂",
    "₃",
    "₄",
    "₅",
    "₆",
    "₇",
    "₈",
    "₉",
    "₊",
    "₋",
    "₌",
    "₍",
    "₎",
    "Ⅰ",
    "Ⅱ",
    "Ⅲ",
    "Ⅳ",
    "Ⅴ",
    "Ⅵ",
    "Ⅶ",
    "Ⅷ",
    "Ⅸ",
    "Ⅹ",
    "Ⅺ",
    "Ⅻ",
    "ⅰ",
    "ⅱ",
    "ⅲ",
    "ⅳ",
    "ⅴ",
    "ⅵ",
    "ⅶ",
    "ⅷ",
    "ⅸ",
    "ⅹ",
    "ⅺ",
    "ⅻ",
    "☰",
    "☱",
    "☲",
    "☳",
    "☴",
    "☵",
    "☶",
    "☷",
    "♀",
    "♂",
    "♳",
    "♴",
    "♵",
    "♶",
    "♷",
    "♸",
    "♹",
    "♺",
    "♩",
    "♪",
    "♫",
    "♬",
    "⚪",
    "⚫",
    "⚬",
    "✶",
    "✷",
    "✸",
    "➀",
    "➁",
    "➂",
    "➃",
    "➄",
    "➅",
    "➆",
    "➇",
    "➈",
    "➉",
    "➊",
    "➋",
    "➌",
    "➍",
    "➎",
    "➏",
    "➐",
    "➑",
    "➒",
    "➓",
    "⏀",
    "⏁",
    "⏂",
    "⏃",
    "⏄",
    "⏅",
    "⏆",
    "⏇",
    "⏈",
    "⏉",
    "⏊",
    "⏋",
    "⏌",
    "⏚",
    "⏴",
    "⏵",
    "⏶",
    "⏷",
    "⏸",
    "⏹",
    "⏺",
    "⏻",
    "⏼",
    "Α",
    "Β",
    "Γ",
    "Ε",
    "Ζ",
    "Η",
    "Θ",
    "Ι",
    "Κ",
    "Λ",
    "Μ",
    "Ν",
    "Ξ",
    "Ο",
    "Π",
    "Ρ",
    "Σ",
    "Τ",
    "Υ",
    "Φ",
    "Χ",
    "Ψ",
    "β",
    "γ",
    "δ",
    "ε",
    "ζ",
    "η",
    "θ",
    "ι",
    "κ",
    "ν",
    "ξ",
    "ο",
    "π",
    "ρ",
    "σ",
    "τ",
    "υ",
    "χ",
    "ψ",
    "ω",
    "ϐ",
    "ϑ",
    "ϒ",
    "ϕ",
    "█",
    "ϖ",
    "ϰ",
    "ϱ",
    "ϴ",
    "ϵ",
    "ϝ",
    "Ϟ",
    "ϟ",
    "Ϡ",
    "ϡ",
    "Ϣ",
    "ϣ",
    "Ϥ",
    "ϥ",
    "Ϧ",
    "ϧ",
    "Ϩ",
    "ϩ",
    "Ϫ",
    "ϫ",
    "Ϭ",
    "ϭ",
    "Ϯ",
    "ϯ",
    "∸",
    "∹",
    "∺",
    "∻",
    "∽",
    "∾",
    "∿",
    "≀",
    "≁",
    "≂",
    "≆",
    "≇",
    "≉",
    "≊",
    "≋",
    "≌",
    "≍",
    "≎",
    "≏",
    "≐",
    "≑",
    "≒",
    "≓",
    "≔",
    "≕",
    "≖",
    "≗",
    "≘",
    "≙",
    "≚",
    "≛",
    "≜",
    "≝",
    "≞",
    "≟",
    "≢",
    "≣",
    "≨",
    "≩",
    "≬",
    "≭",
    "≮",
    "≯",
    "≰",
    "≱",
    "≴",
    "≵",
    "≸",
    "≹",
    "≾",
    "≿",
    "⊀",
    "⊁",
    "⊊",
    "⊋",
    "⊌",
    "⊍",
    "⊏",
    "⊐",
    "⊑",
    "⊒",
    "⊚",
    "⊛",
    "⊜",
    "⊝",
    "⊞",
    "⊟",
    "⊠",
    "⊡",
    "⊣",
    "⊦",
    "⊩",
    "⊪",
    "⊫",
    "⊬",
    "⊭",
    "⊮",
    "⊯",
    "⊰",
    "⊱",
    "⊲",
    "⊳",
    "⊴",
    "⊵",
    "⊶",
    "⊷",
    "⊸",
    "⊹",
    "⊺",
    "ℎ",
    "℘",
    "ℜ",
    "ℑ",
    "ℵ",
    "ℶ",
    "ℷ",
    "ℸ",
    "⌬",
    "⌭",
    "⌮",
    "⌯",
    "⎔",
    "¤",
    "₠",
    "₡",
    "₢",
    "₣",
    "₤",
    "₥",
    "₦",
    "₧",
    "₨",
    "₩",
    "₪",
    "₫",
    "₭",
    "₮",
    "₯",
    "₰",
    "₱",
    "₲",
    "₳",
    "₴",
    "₵",
    "₶",
    "₷",
    "₸",
    "₹",
    "₺",
    "₻",
    "₼",
    "₽",
    "₾",
    "↚",
    "↛",
    "↜",
    "↝",
    "↞",
    "↟",
    "↠",
    "↡",
    "↢",
    "↣",
    "↤",
    "↥",
    "↦",
    "↧",
    "↨",
    "↩",
    "↪",
    "↫",
    "↬",
    "↭",
    "↮",
    "↯",
    "↰",
    "↱",
    "↲",
    "↳",
    "↴",
    "↵",
    "↶",
    "↷",
    "↸",
    "↹",
    "⇄",
    "⇅",
    "⇆",
    "⇇",
    "⇈",
    "⇉",
    "⇊",
    "⇍",
    "⇎",
    "⇏",
    "⇑",
    "⇓",
    "⇕",
    "⇖",
    "⇗",
    "⇘",
    "⇙",
    "⇚",
    "⇛",
    "⇜",
    "⇝",
    "⇞",
    "⇟",
    "⇠",
    "⇡",
    "⇢",
    "⇣",
    "⇤",
    "⇥",
    "⇦",
    "⇧",
    "⇨",
    "⇩",
    "⇪",
    "⇫",
    "⇬",
    "⇭",
    "⇮",
    "⇯",
    "⇰",
    "⇱",
    "⇲",
    "⇳",
    "⇴",
    "⇵",
    "⇶",
    "⇷",
    "⇸",
    "⇹",
    "⇺",
    "⇻",
    "⇼",
    "⇽",
    "⇾",
    "⇿",
    "ↀ",
    "ↁ",
    "ↂ",
    "☀",
    "☁",
    "☂",
    "☃",
    "☄",
    "★",
    "☇",
    "☈",
    "☉",
    "☊",
    "☋",
    "☌",
    "☍",
    "☎",
    "☏",
    "☐",
    "☑",
    "☒",
    "☓",
    "☔",
    "☕",
    "☖",
    "☗",
    "☘",
    "☙",
    "☚",
    "☛",
    "☜",
    "☝",
    "☞",
    "☟",
    "☠",
    "☡",
    "☢",
    "☣",
    "☤",
    "☥",
    "☦",
    "☧",
    "☨",
    "☩",
    "☪",
    "☫",
    "☬",
    "☭",
    "☮",
    "☯",
    "☸",
    "☹",
    "☺",
    "☻",
    "☼",
    "☽",
    "☾",
    "☿",
    "♁",
    "♃",
    "♄",
    "♅",
    "♆",
    "♇",
    "♔",
    "♕",
    "♖",
    "♗",
    "♘",
    "♙",
    "♚",
    "♛",
    "♜",
    "♝",
    "♞",
    "♟",
    "♠",
    "♡",
    "♢",
    "♣",
    "♤",
    "♥",
    "♦",
    "♧",
    "♨",
    "♭",
    "♮",
    "♯",
    "♰",
    "♱",
    "♲",
    "♻",
    "♼",
    "♽",
    "♾",
    "⚀",
    "⚁",
    "⚂",
    "⚃",
    "⚄",
    "⚅",
    "⚆",
    "⚇",
    "⚈",
    "⚉",
    "⚊",
    "⚋",
    "⚌",
    "⚍",
    "⚎",
    "⚏",
    "⚐",
    "⚑",
    "⚒",
    "⚓",
    "⚔",
    "⚕",
    "⚖",
    "⚗",
    "⚘",
    "⚙",
    "⚚",
    "⚛",
    "⚜",
    "⚝",
    "⚞",
    "⚟",
    "⚠",
    "⚡",
    "⚢",
    "⚣",
    "⚤",
    "⚥",
    "⚦",
    "⚧",
    "⚨",
    "⚩",
    "⚭",
    "⚮",
    "⚯",
    "⚰",
    "⚱",
    "⚲",
    "⚳",
    "⚴",
    "⚵",
    "⚶",
    "⚷",
    "⚸",
    "⚹",
    "⚺",
    "⚻",
    "⚼",
    "⚿",
    "⛀",
    "⛁",
    "⛂",
    "⛃",
    "⛆",
    "⛇",
    "⛈",
    "⛉",
    "⛊",
    "⛋",
    "⛌",
    "⛍",
    "⛏",
    "⛐",
    "⛑",
    "⛒",
    "⛓",
    "⛕",
    "⛖",
    "⛗",
    "⛘",
    "⛙",
    "⛚",
    "⛛",
    "⛜",
    "⛝",
    "⛞",
    "⛠",
    "⛡",
    "⛢",
    "⛣",
    "⛤",
    "⛥",
    "⛦",
    "⛧",
    "⛨",
    "⛩",
    "⛪",
    "⛫",
    "⛬",
    "⛭",
    "⛮",
    "⛯",
    "⛶",
    "⛾",
    "⛿",
    "✆",
    "✇",
    "✈",
    "✉",
    "✌",
    "✍",
    "✎",
    "✏",
    "✐",
    "✑",
    "✒",
    "✓",
    "✔",
    "✕",
    "✙",
    "✚",
    "✛",
    "✜",
    "✝",
    "✞",
    "✟",
    "✠",
    "✡",
    "✢",
    "✣",
    "✤",
    "✥",
    "✦",
    "✧",
    "✩",
    "✪",
    "✫",
    "✬",
    "✭",
    "✮",
    "✯",
    "✰",
    "✱",
    "✲",
    "✳",
    "✴",
    "✵",
    "✹",
    "✺",
    "✻",
    "✼",
    "✽",
    "✾",
    "✿",
    "❀",
    "❁",
    "❂",
    "❃",
    "❄",
    "❅",
    "❆",
    "❇",
    "❈",
    "❉",
    "❊",
    "❋",
    "❍",
    "❏",
    "❐",
    "❑",
    "❒",
    "❖",
    "❘",
    "❙",
    "❚",
    "❛",
    "❜",
    "❝",
    "❞",
    "❡",
    "❢",
    "❣",
    "❤",
    "❥",
    "❦",
    "❧",
    "❨",
    "❩",
    "❪",
    "❫",
    "❬",
    "❭",
    "❮",
    "❯",
    "❰",
    "❱",
    "❲",
    "❳",
    "❴",
    "❵",
    "❶",
    "❷",
    "❸",
    "❹",
    "❺",
    "❻",
    "❼",
    "❽",
    "❾",
    "❿",
    "①",
    "②",
    "③",
    "④",
    "⑤",
    "⑥",
    "⑦",
    "⑧",
    "⑨",
    "⑩",
    "➔",
    "➕",
    "➖",
    "➗",
    "➘",
    "➙",
    "➚",
    "➛",
    "➜",
    "➝",
    "➞",
    "➟",
    "➠",
    "➡",
    "➢",
    "➣",
    "➤",
    "➥",
    "➦",
    "➧",
    "➨",
    "➩",
    "➪",
    "➫",
    "➬",
    "➭",
    "➮",
    "➯",
    "➰",
    "➱",
    "➲",
    "➳",
    "➴",
    "➵",
    "➶",
    "➷",
    "➸",
    "➹",
    "➺",
    "➻",
    "➼",
    "➽",
    "➾",
    "➿",
    "⌘",
    "⌥",
    "⌃",
    "⎋",
    "⌫",
    "⌦",
    "⏏",
    "⌤",
    "⌧",
    "⌨",
    "⎆",
    "⎇",
    "⎈",
    "⎉",
    "⎊",
    "⎌",
    "⎍",
    "⎎",
    "⎏",
    "⎐",
    "⎑",
    "⎒",
    "⎓",
    "⎕",
    "⎖",
    "⎗",
    "⎘",
    "⎙",
    "⎚",
    "⎛",
    "⎜",
    "⎝",
    "⎞",
    "⎟",
    "⎠",
    "⎡",
    "⎢",
    "⎣",
    "⎤",
    "⎥",
    "⎦",
    "⎧",
    "⎨",
    "⎩",
    "⎪",
    "⎫",
    "⎬",
    "⎭",
    "⎮",
    "⎯",
    "⎰",
    "⎱",
    "⎲",
    "⎳",
    "⎴",
    "⎵",
    "⎶",
    "⎷",
    "⎸",
    "⎹",
    "⎺",
    "⎻",
    "⎼",
    "⎽",
    "⎾",
    "⎿",
    "⏍",
    "⏎",
    "⏐",
    "⏑",
    "⏒",
    "⏓",
    "⏔",
    "⏕",
    "⏖",
    "⏗",
    "⏘",
    "⏙",
    "⏛",
    "⏜",
    "⏝",
    "⏞",
    "⏟",
    "⏠",
    "⏡",
    "⏢",
    "⏣",
    "⏤",
    "⏥",
    "⏦",
    "⏧",
    "⏨",
    "⏭",
    "⏮",
    "⏯",
    "⏱",
    "⏲",
    "▲",
    "▽",
    "◐",
    "⏽",
    "⏾",
    "⏿",
    "ɐ",
    "ɑ",
    "ɒ",
    "ɓ",
    "ɔ",
    "ɕ",
    "ɖ",
    "ɗ",
    "ɘ",
    "ə",
    "ɚ",
    "ɛ",
    "ɜ",
    "ɝ",
    "ɞ",
    "ɟ",
    "ɠ",
    "ɡ",
    "ɢ",
    "ɣ",
    "ɤ",
    "ɥ",
    "ɦ",
    "ɧ",
    "ɨ",
    "ɩ",
    "ɪ",
    "ɫ",
    "ɬ",
    "ɭ",
    "ɮ",
    "ɯ",
    "ɰ",
    "ɱ",
    "ɲ",
    "ɳ",
    "ɴ",
    "ɵ",
    "ɶ",
    "ɷ",
    "ɸ",
    "ɹ",
    "ɺ",
    "ɻ",
    "ɼ",
    "ɽ",
    "ɾ",
    "ɿ",
    "ʀ",
    "ʁ",
    "ʂ",
    "ʃ",
    "ʄ",
    "ʅ",
    "ʆ",
    "ʇ",
    "ʈ",
    "ʉ",
    "ʊ",
    "ʋ",
    "ʌ",
    "ʍ",
    "ʎ",
    "ʏ",
    "ʐ",
    "ʑ",
    "ʒ",
    "ʓ",
    "ʔ",
    "ʕ",
    "ʖ",
    "ʗ",
    "ʘ",
    "ʙ",
    "ʚ",
    "ʛ",
    "ʜ",
    "ʝ",
    "ʞ",
    "ʟ",
    "ʠ",
    "ʡ",
    "ʢ",
    "ʣ",
    "ʤ",
    "ʥ",
    "ʦ",
    "ʧ",
    "ʨ",
    "ʩ",
    "ʪ",
    "ʫ",
    "ʬ",
    "ʭ",
    "ʮ",
    "ʯ",
    "━",
    "Ǝ",
    "Ã",
    "●",
    "▶",
    "｜",
    "𝑢",
    "〖",
    "〗",
    "︽",
    "–",
    "﹥",
    "𝜓",
    "•",
    "∋",
    "ƒ",
    "०",
    "✘",
    "Е",
    "◉",
    "〒",
    "𝒱",
    "𝜆",
    "⟹",
    "﹪",
    "◊",
    "╆",
    "오",
    "˂",
    "〉",
    "𝝎",
    "▪",
    "△",
    "▁",
    "◼",
    "〇",
    "▷",
    "▬",
    "𝒮",
    "†",
    "ₒ",
    "⼁",
    "〵",
    "⭐",
    "╳",
    "⟶",
    "으",
    "⬆",
    "Ạ",
    "◀",
    "",
    "▫",
    "丄",
    "︾",
    "◥",
    "‖",
    "𝜌",
    "ⅼ",
    "▼",
    "⁎",
    "﹏",
    "😁",
    "😂",
    "😃",
    "😄",
    "😅",
    "😆",
    "😉",
    "😊",
    "😋",
    "😌",
    "😍",
    "😏",
    "😒",
    "😓",
    "😔",
    "😖",
    "😘",
    "😚",
    "😜",
    "😝",
    "😞",
    "😠",
    "😡",
    "😢",
    "😣",
    "😤",
    "😥",
    "😨",
    "😩",
    "😪",
    "😫",
    "😭",
    "😰",
    "😱",
    "😲",
    "😳",
    "😵",
    "😷",
    "😸",
    "😹",
    "😺",
    "😻",
    "😼",
    "😽",
    "😾",
    "😿",
    "🙀",
    "🙅",
    "🙆",
    "🙇",
    "🙈",
    "🙉",
    "🙊",
    "🙋",
    "🙌",
    "🙍",
    "🙎",
    "🙏",
    "✂",
    "✅",
    "✊",
    "✋",
    "✖",
    "✨",
    "❌",
    "❎",
    "❓",
    "❔",
    "❕",
    "❗",
    "🚀",
    "🚃",
    "🚄",
    "🚅",
    "🚇",
    "🚉",
    "🚌",
    "🚏",
    "🚑",
    "🚒",
    "🚓",
    "🚕",
    "🚗",
    "🚙",
    "🚚",
    "🚢",
    "🚤",
    "🚥",
    "🚧",
    "🚨",
    "🚩",
    "🚪",
    "🚫",
    "🚬",
    "🚭",
    "🚲",
    "🚶",
    "🚹",
    "🚺",
    "🚻",
    "🚼",
    "🚽",
    "🚾",
    "🛀",
    "Ⓜ",
    "🅰",
    "🅱",
    "🅾",
    "🅿",
    "🆎",
    "🆑",
    "🆒",
    "🆓",
    "🆔",
    "🆕",
    "🆖",
    "🆗",
    "🆘",
    "🆙",
    "🆚",
    "🇩🇪",
    "🇬🇧",
    "🇨🇳",
    "🇯🇵",
    "🇫🇷",
    "🇰🇷",
    "🇪🇸",
    "🇮🇹",
    "🇷🇺",
    "🇺🇸",
    "🈁",
    "ℹ",
    "⌚",
    "⌛",
    "⏩",
    "⏪",
    "⏫",
    "⏬",
    "⏰",
    "⏳",
    "◻",
    "◽",
    "◾",
    "♈",
    "♉",
    "♊",
    "♋",
    "♌",
    "♍",
    "♎",
    "♏",
    "♐",
    "♑",
    "♒",
    "♓",
    "♿",
    "⚽",
    "⚾",
    "⛄",
    "⛅",
    "⛎",
    "⛔",
    "⛲",
    "⛳",
    "⛵",
    "⛺",
    "⛽",
    "⤴",
    "⤵",
    "⬅",
    "⬇",
    "⬛",
    "⬜",
    "⭕",
    "〰",
    "〽",
    "㊗",
    "㊙",
    "🀄",
    "🃏",
    "🌀",
    "🌁",
    "🌂",
    "🌃",
    "🌄",
    "🌅",
    "🌆",
    "🌇",
    "🌈",
    "🌉",
    "🌊",
    "🌋",
    "🌌",
    "🌏",
    "🌑",
    "🌓",
    "🌔",
    "🌕",
    "🌙",
    "🌛",
    "🌟",
    "🌠",
    "🌰",
    "🌱",
    "🌴",
    "🌵",
    "🌷",
    "🌸",
    "🌹",
    "🌺",
    "🌻",
    "🌼",
    "🌽",
    "🌾",
    "🌿",
    "🍀",
    "🍁",
    "🍂",
    "🍃",
    "🍄",
    "🍅",
    "🍆",
    "🍇",
    "🍈",
    "🍉",
    "🍊",
    "🍌",
    "🍍",
    "🍎",
    "🍏",
    "🍑",
    "🍒",
    "🍓",
    "🍔",
    "🍕",
    "🍖",
    "🍗",
    "🍘",
    "🍙",
    "🍚",
    "🍛",
    "🍜",
    "🍝",
    "🍞",
    "🍟",
    "🍠",
    "🍡",
    "🍢",
    "🍣",
    "🍤",
    "🍥",
    "🍦",
    "🍧",
    "🍨",
    "🍩",
    "🍪",
    "🍫",
    "🍬",
    "🍭",
    "🍮",
    "🍯",
    "🍰",
    "🍱",
    "🍲",
    "🍳",
    "🍴",
    "🍵",
    "🍶",
    "🍷",
    "🍸",
    "🍹",
    "🍺",
    "🍻",
    "🎀",
    "🎁",
    "🎂",
    "🎃",
    "🎄",
    "🎅",
    "🎆",
    "🎇",
    "🎈",
    "🎉",
    "🎊",
    "🎋",
    "🎌",
    "🎍",
    "🎎",
    "🎏",
    "🎐",
    "🎑",
    "🎒",
    "🎓",
    "🎠",
    "🎡",
    "🎢",
    "🎣",
    "🎤",
    "🎥",
    "🎦",
    "🎧",
    "🎨",
    "🎩",
    "🎪",
    "🎫",
    "🎬",
    "🎭",
    "🎮",
    "🎯",
    "🎰",
    "🎱",
    "🎲",
    "🎳",
    "🎴",
    "🎵",
    "🎶",
    "🎷",
    "🎸",
    "🎹",
    "🎺",
    "🎻",
    "🎼",
    "🎽",
    "🎾",
    "🎿",
    "🏀",
    "🏁",
    "🏂",
    "🏃",
    "🏄",
    "🏆",
    "🏈",
    "🏊",
    "🏠",
    "🏡",
    "🏢",
    "🏣",
    "🏥",
    "🏦",
    "🏧",
    "🏨",
    "🏩",
    "🏪",
    "🏫",
    "🏬",
    "🏭",
    "🏮",
    "🏯",
    "🏰",
    "🐌",
    "🐍",
    "🐎",
    "🐑",
    "🐒",
    "🐔",
    "🐗",
    "🐘",
    "🐙",
    "🐚",
    "🐛",
    "🐜",
    "🐝",
    "🐞",
    "🐟",
    "🐠",
    "🐡",
    "🐢",
    "🐣",
    "🐤",
    "🐥",
    "🐦",
    "🐧",
    "🐨",
    "🐩",
    "🐫",
    "🐬",
    "🐭",
    "🐮",
    "🐯",
    "🐰",
    "🐱",
    "🐲",
    "🐳",
    "🐴",
    "🐵",
    "🐶",
    "🐷",
    "🐸",
    "🐹",
    "🐺",
    "🐻",
    "🐼",
    "🐽",
    "🐾",
    "👀",
    "👂",
    "👃",
    "👄",
    "👅",
    "👆",
    "👇",
    "👈",
    "👉",
    "👊",
    "👋",
    "👌",
    "👍",
    "👎",
    "👏",
    "👐",
    "👑",
    "👒",
    "👓",
    "👔",
    "👕",
    "👖",
    "👗",
    "👘",
    "👙",
    "👚",
    "👛",
    "👜",
    "👝",
    "👞",
    "👟",
    "👠",
    "👡",
    "👢",
    "👣",
    "👤",
    "👦",
    "👧",
    "👨",
    "👩",
    "👪",
    "👫",
    "👮",
    "👯",
    "👰",
    "👱",
    "👲",
    "👳",
    "👴",
    "👵",
    "👶",
    "👷",
    "👸",
    "👹",
    "👺",
    "👻",
    "👼",
    "👽",
    "👾",
    "👿",
    "💀",
    "💁",
    "💂",
    "💃",
    "💄",
    "💅",
    "💆",
    "💇",
    "💈",
    "💉",
    "💊",
    "💋",
    "💌",
    "💍",
    "💎",
    "💏",
    "💐",
    "💑",
    "💒",
    "💓",
    "💔",
    "💕",
    "💖",
    "💗",
    "💘",
    "💙",
    "💚",
    "💛",
    "💜",
    "💝",
    "💞",
    "💟",
    "💠",
    "💡",
    "💢",
    "💣",
    "💤",
    "💥",
    "💦",
    "💧",
    "💨",
    "💩",
    "💪",
    "💫",
    "💬",
    "💮",
    "💯",
    "💰",
    "💲",
    "💳",
    "💴",
    "💵",
    "💸",
    "💹",
    "💺",
    "💻",
    "💼",
    "💽",
    "💾",
    "💿",
    "📀",
    "📁",
    "📂",
    "📃",
    "📄",
    "📅",
    "📆",
    "📇",
    "📈",
    "📉",
    "📊",
    "📋",
    "📌",
    "📍",
    "📎",
    "📏",
    "📐",
    "📑",
    "📒",
    "📓",
    "📔",
    "📕",
    "📖",
    "📗",
    "📘",
    "📙",
    "📚",
    "📛",
    "📜",
    "📝",
    "📞",
    "📟",
    "📠",
    "📡",
    "📢",
    "📣",
    "📤",
    "📥",
    "📦",
    "📧",
    "📨",
    "📩",
    "📪",
    "📫",
    "📮",
    "📰",
    "📱",
    "📲",
    "📳",
    "📴",
    "📶",
    "📷",
    "📹",
    "📺",
    "📻",
    "📼",
    "🔃",
    "🔊",
    "🔋",
    "🔌",
    "🔍",
    "🔎",
    "🔏",
    "🔐",
    "🔑",
    "🔒",
    "🔓",
    "🔔",
    "🔖",
    "🔗",
    "🔘",
    "🔙",
    "🔚",
    "🔛",
    "🔜",
    "🔝",
    "🔞",
    "🔟",
    "🔠",
    "🔡",
    "🔢",
    "🔣",
    "🔤",
    "🔥",
    "🔦",
    "🔧",
    "🔨",
    "🔩",
    "🔪",
    "🔫",
    "🔮",
    "🔯",
    "🔰",
    "🔱",
    "🔲",
    "🔳",
    "🔴",
    "🔵",
    "🔶",
    "🔷",
    "🔸",
    "🔹",
    "🔺",
    "🔻",
    "🔼",
    "🔽",
    "🕐",
    "🕑",
    "🕒",
    "🕓",
    "🕔",
    "🕕",
    "🕖",
    "🕗",
    "🕘",
    "🕙",
    "🕚",
    "🕛",
    "🗻",
    "🗼",
    "🗽",
    "🗾",
    "🗿",
    "😀",
    "😇",
    "😈",
    "😎",
    "😐",
    "😑",
    "😕",
    "😗",
    "😙",
    "😛",
    "😟",
    "😦",
    "😧",
    "😬",
    "😮",
    "😯",
    "😴",
    "😶",
    "🚁",
    "🚂",
    "🚆",
    "🚈",
    "🚊",
    "🚍",
    "🚎",
    "🚐",
    "🚔",
    "🚖",
    "🚘",
    "🚛",
    "🚜",
    "🚝",
    "🚞",
    "🚟",
    "🚠",
    "🚡",
    "🚣",
    "🚦",
    "🚮",
    "🚯",
    "🚰",
    "🚱",
    "🚳",
    "🚴",
    "🚵",
    "🚷",
    "🚸",
    "🚿",
    "🛁",
    "🛂",
    "🛃",
    "🛄",
    "🛅",
    "🌍",
    "🌎",
    "🌐",
    "🌒",
    "🌖",
    "🌗",
    "🌘",
    "🌚",
    "🌜",
    "🌝",
    "🌞",
    "🌲",
    "🌳",
    "🍋",
    "🍐",
    "🍼",
    "🏇",
    "🏉",
    "🏤",
    "🐀",
    "🐁",
    "🐂",
    "🐃",
    "🐄",
    "🐅",
    "🐆",
    "🐇",
    "🐈",
    "🐉",
    "🐊",
    "🐋",
    "🐏",
    "🐐",
    "🐓",
    "🐕",
    "🐖",
    "🐪",
    "👥",
    "👬",
    "👭",
    "💭",
    "💶",
    "💷",
    "📬",
    "📭",
    "📯",
    "📵",
    "🔀",
    "🔁",
    "🔂",
    "🔄",
    "🔅",
    "🔆",
    "🔇",
    "🔉",
    "🔕",
    "🔬",
    "🔭",
    "🕜",
    "🕝",
    "🕞",
    "🕟",
    "🕠",
    "🕡",
    "🕢",
    "🕣",
    "🕤",
    "🕥",
    "🕦",
    "🕧"
};

// Number of entries in character_dict, computed at compile time as the array's
// byte size divided by the size of one const char* element.
// NOTE(review): assumes character_dict is declared as an array of const char* - its declaration is not visible here.
static const int character_dict_size = sizeof(character_dict) / sizeof(const char*);


================================================
FILE: examples/retinaface.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdio.h>
#include <vector>

// One face detection: bounding box, five landmark points and a score.
struct FaceObject
{
    cv::Rect_<float> rect;   // bounding box (x, y, width, height)
    cv::Point2f landmark[5]; // five landmark points decoded from the landmark blob
    float prob;              // score; used as the sort key and for thresholding
};

static inline float intersection_area(const FaceObject& a, const FaceObject& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// In-place quicksort of faceobjects[left..right] (both ends inclusive) into
// descending prob order. The pivot is the prob of the middle element, and the
// two resulting partitions are sorted recursively inside OpenMP sections.
static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects, int left, int right)
{
    int lo = left;
    int hi = right;
    const float pivot = faceobjects[(left + right) / 2].prob;

    while (lo <= hi)
    {
        // advance past elements that already sit on the correct side of the pivot
        while (faceobjects[lo].prob > pivot)
            ++lo;

        while (pivot > faceobjects[hi].prob)
            --hi;

        // cursors crossed: partitioning is finished
        if (lo > hi)
            break;

        std::swap(faceobjects[lo], faceobjects[hi]);
        ++lo;
        --hi;
    }

    #pragma omp parallel sections
    {
        #pragma omp section
        {
            if (hi > left) qsort_descent_inplace(faceobjects, left, hi);
        }
        #pragma omp section
        {
            if (right > lo) qsort_descent_inplace(faceobjects, lo, right);
        }
    }
}

static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects)
{
    if (faceobjects.empty())
        return;

    qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
}

// Greedy non-maximum suppression.
//
// Candidates are visited in index order; a candidate is kept unless its IoU
// with an already kept candidate is greater than nms_threshold. The caller in
// this file sorts the candidates by descending prob before calling.
//
//   faceobjects    candidate detections
//   picked         output: cleared, then filled with the indices of kept
//                  candidates in ascending order
//   nms_threshold  IoU above which a candidate is suppressed
static void nms_sorted_bboxes(const std::vector<FaceObject>& faceobjects, std::vector<int>& picked, float nms_threshold)
{
    picked.clear();

    const int n = faceobjects.size();

    // box areas are reused for every pair, so compute them once
    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = faceobjects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const FaceObject& a = faceobjects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const FaceObject& b = faceobjects[picked[j]];

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
            {
                keep = 0;
                // one overlapping kept box is enough; the remaining
                // comparisons cannot change the outcome
                break;
            }
        }

        if (keep)
            picked.push_back(i);
    }
}

// copy from src/layer/proposal.cpp
// Builds one anchor box for every (ratio, scale) combination, all centred on
// the middle of a base_size x base_size cell.
// The result has ratios.w * scales.w rows of four floats: x0, y0, x1, y1.
// Rows are ordered ratio-major (row = ratio_index * scales.w + scale_index).
static ncnn::Mat generate_anchors(int base_size, const ncnn::Mat& ratios, const ncnn::Mat& scales)
{
    const int ratio_count = ratios.w;
    const int scale_count = scales.w;

    ncnn::Mat anchors;
    anchors.create(4, ratio_count * scale_count);

    // shared centre of every anchor
    const float center_x = base_size * 0.5f;
    const float center_y = base_size * 0.5f;

    for (int ri = 0; ri < ratio_count; ri++)
    {
        const float ar = ratios[ri];

        // integer-rounded base box for this aspect ratio
        int r_w = round(base_size / sqrt(ar));
        int r_h = round(r_w * ar); //round(base_size * sqrt(ar));

        for (int si = 0; si < scale_count; si++)
        {
            const float scale = scales[si];

            const float half_w = r_w * scale * 0.5f;
            const float half_h = r_h * scale * 0.5f;

            float* anchor = anchors.row(ri * scale_count + si);

            anchor[0] = center_x - half_w;
            anchor[1] = center_y - half_h;
            anchor[2] = center_x + half_w;
            anchor[3] = center_y + half_h;
        }
    }

    return anchors;
}

// Decodes face proposals for one feature level and appends them to faceobjects.
//
// For every anchor and every feature-map cell whose score is at least
// prob_threshold, the anchor is shifted by feat_stride per cell, the bbox
// deltas (dx, dy, dw, dh) are applied in centre/size form, and the five
// landmark offsets are applied relative to the shifted anchor centre.
//
//   anchors        one row of x0, y0, x1, y1 per anchor
//   feat_stride    step between neighbouring cells
//   score_blob     scores; the channel read for anchor q is q + num_anchors
//   bbox_blob      4 channels per anchor
//   landmark_blob  10 channels per anchor (x, y for 5 points)
static void generate_proposals(const ncnn::Mat& anchors, int feat_stride, const ncnn::Mat& score_blob, const ncnn::Mat& bbox_blob, const ncnn::Mat& landmark_blob, float prob_threshold, std::vector<FaceObject>& faceobjects)
{
    const int feat_w = score_blob.w;
    const int feat_h = score_blob.h;

    const int num_anchors = anchors.h;

    for (int a = 0; a < num_anchors; a++)
    {
        const float* anchor = anchors.row(a);

        const ncnn::Mat score = score_blob.channel(a + num_anchors);
        const ncnn::Mat bbox = bbox_blob.channel_range(a * 4, 4);
        const ncnn::Mat landmark = landmark_blob.channel_range(a * 10, 10);

        const float anchor_w = anchor[2] - anchor[0];
        const float anchor_h = anchor[3] - anchor[1];

        // the anchor origin walks over the feature map one stride at a time
        float anchor_y = anchor[1];
        for (int row = 0; row < feat_h; row++, anchor_y += feat_stride)
        {
            float anchor_x = anchor[0];
            for (int col = 0; col < feat_w; col++, anchor_x += feat_stride)
            {
                const int index = row * feat_w + col;

                const float prob = score[index];
                if (!(prob >= prob_threshold))
                    continue;

                // centre of the shifted anchor
                const float cx = anchor_x + anchor_w * 0.5f;
                const float cy = anchor_y + anchor_h * 0.5f;

                // apply center size
                const float dx = bbox.channel(0)[index];
                const float dy = bbox.channel(1)[index];
                const float dw = bbox.channel(2)[index];
                const float dh = bbox.channel(3)[index];

                const float pb_cx = cx + anchor_w * dx;
                const float pb_cy = cy + anchor_h * dy;

                const float pb_w = anchor_w * exp(dw);
                const float pb_h = anchor_h * exp(dh);

                const float x0 = pb_cx - pb_w * 0.5f;
                const float y0 = pb_cy - pb_h * 0.5f;
                const float x1 = pb_cx + pb_w * 0.5f;
                const float y1 = pb_cy + pb_h * 0.5f;

                FaceObject obj;
                obj.rect.x = x0;
                obj.rect.y = y0;
                obj.rect.width = x1 - x0 + 1;
                obj.rect.height = y1 - y0 + 1;

                // landmark channels are interleaved as x0, y0, x1, y1, ...
                for (int k = 0; k < 5; k++)
                {
                    obj.landmark[k].x = cx + (anchor_w + 1) * landmark.channel(2 * k)[index];
                    obj.landmark[k].y = cy + (anchor_h + 1) * landmark.channel(2 * k + 1)[index];
                }

                obj.prob = prob;

                faceobjects.push_back(obj);
            }
        }
    }
}

// Runs the RetinaFace model on a BGR image and stores the detected faces in
// faceobjects (previous contents are replaced).
//
// Proposals are decoded for strides 32, 16 and 8 in that order, sorted by
// descending score, filtered with NMS, and finally clipped to the image.
// Always returns 0; the process exits with -1 if the model files fail to load.
static int detect_retinaface(const cv::Mat& bgr, std::vector<FaceObject>& faceobjects)
{
    ncnn::Net retinaface;

    retinaface.opt.use_vulkan_compute = true;

    // model is converted from
    // https://github.com/deepinsight/insightface/tree/master/RetinaFace#retinaface-pretrained-models
    // https://github.com/deepinsight/insightface/issues/669
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    //     retinaface.load_param("retinaface-R50.param");
    //     retinaface.load_model("retinaface-R50.bin");
    if (retinaface.load_param("mnet.25-opt.param"))
        exit(-1);
    if (retinaface.load_model("mnet.25-opt.bin"))
        exit(-1);

    const float prob_threshold = 0.8f;
    const float nms_threshold = 0.4f;

    const int img_w = bgr.cols;
    const int img_h = bgr.rows;

    ncnn::Mat in = ncnn::Mat::from_pixels(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h);

    ncnn::Extractor ex = retinaface.create_extractor();

    ex.input("data", in);

    // per-level settings, listed in processing order: stride 32, 16, 8
    static const int level_count = 3;
    static const int feat_strides[level_count] = {32, 16, 8};
    static const float anchor_scales[level_count][2] = {
        {32.f, 16.f},
        {8.f, 4.f},
        {2.f, 1.f}
    };
    static const char* const score_names[level_count] = {
        "face_rpn_cls_prob_reshape_stride32",
        "face_rpn_cls_prob_reshape_stride16",
        "face_rpn_cls_prob_reshape_stride8"
    };
    static const char* const bbox_names[level_count] = {
        "face_rpn_bbox_pred_stride32",
        "face_rpn_bbox_pred_stride16",
        "face_rpn_bbox_pred_stride8"
    };
    static const char* const landmark_names[level_count] = {
        "face_rpn_landmark_pred_stride32",
        "face_rpn_landmark_pred_stride16",
        "face_rpn_landmark_pred_stride8"
    };

    const int base_size = 16;

    std::vector<FaceObject> faceproposals;

    for (int level = 0; level < level_count; level++)
    {
        ncnn::Mat score_blob, bbox_blob, landmark_blob;
        ex.extract(score_names[level], score_blob);
        ex.extract(bbox_names[level], bbox_blob);
        ex.extract(landmark_names[level], landmark_blob);

        // a single square aspect ratio with two scales per level
        ncnn::Mat ratios(1);
        ratios[0] = 1.f;
        ncnn::Mat scales(2);
        scales[0] = anchor_scales[level][0];
        scales[1] = anchor_scales[level][1];
        ncnn::Mat anchors = generate_anchors(base_size, ratios, scales);

        std::vector<FaceObject> level_faces;
        generate_proposals(anchors, feat_strides[level], score_blob, bbox_blob, landmark_blob, prob_threshold, level_faces);

        faceproposals.insert(faceproposals.end(), level_faces.begin(), level_faces.end());
    }

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(faceproposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(faceproposals, picked, nms_threshold);

    const int face_count = picked.size();

    // largest coordinate still inside the image
    const float max_x = (float)img_w - 1;
    const float max_y = (float)img_h - 1;

    faceobjects.resize(face_count);
    for (int i = 0; i < face_count; i++)
    {
        FaceObject& face = faceobjects[i];
        face = faceproposals[picked[i]];

        // clip both corners to the image, then rebuild the rect from them
        const float x0 = std::max(std::min(face.rect.x, max_x), 0.f);
        const float y0 = std::max(std::min(face.rect.y, max_y), 0.f);
        const float x1 = std::max(std::min(face.rect.x + face.rect.width, max_x), 0.f);
        const float y1 = std::max(std::min(face.rect.y + face.rect.height, max_y), 0.f);

        face.rect.x = x0;
        face.rect.y = y0;
        face.rect.width = x1 - x0;
        face.rect.height = y1 - y0;
    }

    return 0;
}

// Prints every detection to stderr, draws its box, landmarks and a score
// label on a copy of the input image, then shows the result in a window and
// waits for a key press.
static void draw_faceobjects(const cv::Mat& bgr, const std::vector<FaceObject>& faceobjects)
{
    cv::Mat image = bgr.clone();

    const size_t face_count = faceobjects.size();
    for (size_t n = 0; n < face_count; n++)
    {
        const FaceObject& obj = faceobjects[n];

        fprintf(stderr, "%.5f at %.2f %.2f %.2f x %.2f\n", obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, cv::Scalar(0, 255, 0));

        // filled dots for the five landmarks
        for (int k = 0; k < 5; k++)
        {
            cv::circle(image, obj.landmark[k], 2, cv::Scalar(0, 255, 255), -1);
        }

        char text[256];
        sprintf(text, "%.1f%%", obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // place the label above the box, pulled back inside the image at the
        // top and right edges
        int label_x = obj.rect.x;
        int label_y = obj.rect.y - label_size.height - baseLine;
        if (label_y < 0)
            label_y = 0;
        if (label_x + label_size.width > image.cols)
            label_x = image.cols - label_size.width;

        // white background behind the black text
        cv::rectangle(image, cv::Rect(cv::Point(label_x, label_y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(label_x, label_y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<FaceObject> faceobjects;
    detect_retinaface(m, faceobjects);

    draw_faceobjects(m, faceobjects);

    return 0;
}


================================================
FILE: examples/rfcn.cpp
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#include <math.h>
#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdio.h>

// One detection: bounding box, class index and score.
struct Object
{
    cv::Rect_<float> rect; // bounding box (x, y, width, height)
    int label;             // class index; used to index class_names when drawing
    float prob;            // score; used as the sort key
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// In-place quicksort of objects[left..right] (both ends inclusive) into
// descending prob order. The pivot is the prob of the middle element, and the
// two resulting partitions are sorted recursively inside OpenMP sections.
static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
{
    int lo = left;
    int hi = right;
    const float pivot = objects[(left + right) / 2].prob;

    while (lo <= hi)
    {
        // advance past elements that already sit on the correct side of the pivot
        while (objects[lo].prob > pivot)
            ++lo;

        while (pivot > objects[hi].prob)
            --hi;

        // cursors crossed: partitioning is finished
        if (lo > hi)
            break;

        std::swap(objects[lo], objects[hi]);
        ++lo;
        --hi;
    }

    #pragma omp parallel sections
    {
        #pragma omp section
        {
            if (hi > left) qsort_descent_inplace(objects, left, hi);
        }
        #pragma omp section
        {
            if (right > lo) qsort_descent_inplace(objects, lo, right);
        }
    }
}

static void qsort_descent_inplace(std::vector<Object>& objects)
{
    if (objects.empty())
        return;

    qsort_descent_inplace(objects, 0, objects.size() - 1);
}

// Greedy non-maximum suppression.
//
// Candidates are visited in index order; a candidate is kept unless its IoU
// with an already kept candidate is greater than nms_threshold. The caller in
// this file sorts the candidates by descending prob before calling.
//
//   faceobjects    candidate detections
//   picked         output: cleared, then filled with the indices of kept
//                  candidates in ascending order
//   nms_threshold  IoU above which a candidate is suppressed
//   agnostic       when false (default), only boxes with the same label can
//                  suppress each other; when true, labels are ignored
static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = faceobjects.size();

    // box areas are reused for every pair, so compute them once
    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = faceobjects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = faceobjects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = faceobjects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
            {
                keep = 0;
                // one overlapping kept box is enough; the remaining
                // comparisons cannot change the outcome
                break;
            }
        }

        if (keep)
            picked.push_back(i);
    }
}

// Runs the R-FCN detector on a BGR image and stores the detections in objects
// (previous contents are replaced), sorted by descending score and limited to
// max_per_image entries.
//
// The network is evaluated in two steps: one pass extracts the shared feature
// blobs and all ROIs, then each ROI is fed through a second extractor to get
// its class scores and bbox regression. Per-class NMS is applied afterwards.
// Always returns 0; the process exits with -1 if the model files fail to load.
static int detect_rfcn(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net rfcn;

    rfcn.opt.use_vulkan_compute = true;

    // original pretrained model from https://github.com/YuwenXiong/py-R-FCN
    // https://github.com/YuwenXiong/py-R-FCN/blob/master/models/pascal_voc/ResNet-50/rfcn_end2end/test_agnostic.prototxt
    // https://1drv.ms/u/s!AoN7vygOjLIQqUWHpY67oaC7mopf
    // resnet50_rfcn_final.caffemodel
    if (rfcn.load_param("rfcn_end2end.param"))
        exit(-1);
    if (rfcn.load_model("rfcn_end2end.bin"))
        exit(-1);

    const int target_size = 224;

    const int max_per_image = 100;
    const float confidence_thresh = 0.6f; // CONF_THRESH

    const float nms_threshold = 0.3f; // NMS_THRESH

    // scale to target detect size: the shorter side becomes target_size
    int w = bgr.cols;
    int h = bgr.rows;
    float scale = 1.f;
    if (w < h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, w, h);

    const float mean_vals[3] = {102.9801f, 115.9465f, 122.7717f};
    in.substract_mean_normalize(mean_vals, 0);

    ncnn::Mat im_info(3);
    im_info[0] = h;
    im_info[1] = w;
    im_info[2] = scale;

    // step1, extract feature and all rois
    ncnn::Extractor ex1 = rfcn.create_extractor();

    ex1.input("data", in);
    ex1.input("im_info", im_info);

    ncnn::Mat rfcn_cls;
    ncnn::Mat rfcn_bbox;
    ncnn::Mat rois; // all rois
    ex1.extract("rfcn_cls", rfcn_cls);
    ex1.extract("rfcn_bbox", rfcn_bbox);
    ex1.extract("rois", rois);

    // step2, extract bbox and score for each roi
    std::vector<std::vector<Object> > class_candidates;
    for (int i = 0; i < rois.c; i++)
    {
        ncnn::Extractor ex2 = rfcn.create_extractor();

        ncnn::Mat roi = rois.channel(i); // get single roi
        ex2.input("rfcn_cls", rfcn_cls);
        ex2.input("rfcn_bbox", rfcn_bbox);
        ex2.input("rois", roi);

        ncnn::Mat bbox_pred;
        ncnn::Mat cls_prob;
        ex2.extract("bbox_pred", bbox_pred);
        ex2.extract("cls_prob", cls_prob);

        int num_class = cls_prob.w;
        class_candidates.resize(num_class);

        // find class id with highest score
        // (uses its own index k so the ROI index i is not shadowed)
        int label = 0;
        float score = 0.f;
        for (int k = 0; k < num_class; k++)
        {
            float class_score = cls_prob[k];
            if (class_score > score)
            {
                label = k;
                score = class_score;
            }
        }

        // ignore background or low score
        if (label == 0 || score <= confidence_thresh)
            continue;

        //         fprintf(stderr, "%d = %f\n", label, score);

        // unscale to image size
        float x1 = roi[0] / scale;
        float y1 = roi[1] / scale;
        float x2 = roi[2] / scale;
        float y2 = roi[3] / scale;

        float pb_w = x2 - x1 + 1;
        float pb_h = y2 - y1 + 1;

        // apply bbox regression (values 4..7 of bbox_pred are read)
        float dx = bbox_pred[4];
        float dy = bbox_pred[4 + 1];
        float dw = bbox_pred[4 + 2];
        float dh = bbox_pred[4 + 3];

        float cx = x1 + pb_w * 0.5f;
        float cy = y1 + pb_h * 0.5f;

        float obj_cx = cx + pb_w * dx;
        float obj_cy = cy + pb_h * dy;

        float obj_w = pb_w * exp(dw);
        float obj_h = pb_h * exp(dh);

        float obj_x1 = obj_cx - obj_w * 0.5f;
        float obj_y1 = obj_cy - obj_h * 0.5f;
        float obj_x2 = obj_cx + obj_w * 0.5f;
        float obj_y2 = obj_cy + obj_h * 0.5f;

        // clip
        obj_x1 = std::max(std::min(obj_x1, (float)(bgr.cols - 1)), 0.f);
        obj_y1 = std::max(std::min(obj_y1, (float)(bgr.rows - 1)), 0.f);
        obj_x2 = std::max(std::min(obj_x2, (float)(bgr.cols - 1)), 0.f);
        obj_y2 = std::max(std::min(obj_y2, (float)(bgr.rows - 1)), 0.f);

        // append object
        Object obj;
        obj.rect = cv::Rect_<float>(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1);
        obj.label = label;
        obj.prob = score;

        class_candidates[label].push_back(obj);
    }

    // post process: sort and NMS each class separately, then merge
    objects.clear();
    for (int i = 0; i < (int)class_candidates.size(); i++)
    {
        std::vector<Object>& candidates = class_candidates[i];

        qsort_descent_inplace(candidates);

        std::vector<int> picked;
        nms_sorted_bboxes(candidates, picked, nms_threshold);

        for (int j = 0; j < (int)picked.size(); j++)
        {
            int z = picked[j];
            objects.push_back(candidates[z]);
        }
    }

    qsort_descent_inplace(objects);

    // keep only the best max_per_image detections
    // (compared as int to avoid a signed/unsigned comparison)
    if (max_per_image > 0 && (int)objects.size() > max_per_image)
    {
        objects.resize(max_per_image);
    }

    return 0;
}

// Prints every detection to stderr, draws its box and a "class score%" label
// on a copy of the input image, then shows the result in a window and waits
// for a key press.
//
// A label outside the class_names table is drawn as "unknown" instead of
// indexing past the end of the table.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {"background",
                                        "aeroplane", "bicycle", "bird", "boat",
                                        "bottle", "bus", "car", "cat", "chair",
                                        "cow", "diningtable", "dog", "horse",
                                        "motorbike", "person", "pottedplant",
                                        "sheep", "sofa", "train", "tvmonitor"
                                       };
    static const int class_name_count = sizeof(class_names) / sizeof(class_names[0]);

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));

        // the label comes from the model output, so guard the table lookup
        const char* class_name = "unknown";
        if (obj.label >= 0 && obj.label < class_name_count)
            class_name = class_names[obj.label];

        char text[256];
        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // place the label above the box, pulled back inside the image at the
        // top and right edges
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        // white background behind the black text
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_rfcn(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/rvm.cpp
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// ncnn model exported from https://github.com/PeterL1n/RobustVideoMatting
//
// import torch
// from torch import nn
// from model import MattingNetwork
// from model.fast_guided_filter import FastGuidedFilterRefiner
// from model.deep_guided_filter import DeepGuidedFilterRefiner
//
// class Model(nn.Module):
//     def __init__(self):
//         super().__init__()
//
//         self.rvm = MattingNetwork('mobilenetv3').eval()
//         self.rvm.load_state_dict(torch.load('rvm_mobilenetv3.pth'))
//
//         self.refiner_deep = DeepGuidedFilterRefiner()
//         self.refiner_fast = FastGuidedFilterRefiner()
//
//     def forward_first_frame(self, src):
//         return self.rvm(src)
//
//     def forward(self, src, src_sm, r1, r2, r3, r4):
//
//         f1, f2, f3, f4 = self.rvm.backbone(src_sm)
//         f4 = self.rvm.aspp(f4)
//         hid, *rec = self.rvm.decoder(src_sm, f1, f2, f3, f4, r1, r2, r3, r4)
//
//         # downsample
//         fgr_residual, pha = self.rvm.project_mat(hid).split([3, 1], dim=-3)
//         fgr = fgr_residual + src_sm
//
//         # downsample + refiner_deep
//         fgr_residual_deep, pha_deep = self.refiner_deep(src, src_sm, fgr_residual, pha, hid)
//         fgr_deep = fgr_residual_deep + src
//
//         # downsample + refiner_fast
//         fgr_residual_fast, pha_fast = self.refiner_fast(src, src_sm, fgr_residual, pha, hid)
//         fgr_fast = fgr_residual_fast + src
//
//         # downsample + segmentation
//         seg = self.rvm.project_seg(hid)
//
//         return fgr, pha, fgr_deep, pha_deep, fgr_fast, pha_fast, seg, *rec
//
// import pnnx
//
// model = Model().eval()
//
// x = torch.rand(1, 3, 512, 512)
// x2 = torch.rand(1, 3, 256, 256)
// x2_hr = torch.rand(1, 3, 1024, 1024)
//
// # generate feats via forward_first_frame, with different shapes
// fgr, pha, r1, r2, r3, r4 = model.forward_first_frame(x)
// fgr2, pha2, r12, r22, r32, r42 = model.forward_first_frame(x2)
//
// # export with dynamic shape
// pnnx.export(model, "rvm_mobilenetv3.pt", (x, x, r1, r2, r3, r4), (x2_hr, x2, r12, r22, r32, r42))
//
// and then fix refiner_fast fp16 overflow issue in ncnn.param via appending 31=1 layer feat mask
//
// BinaryOp   div_58    2 1 401 399 402 0=3 31=1
//

#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif

// Runs the RVM matting model on a single BGR image.
//
//   bgr  input image
//   fgr  output foreground, 8-bit BGR, cropped to the input size
//   pha  output alpha matte, 8-bit single channel, cropped to the input size
//   seg  output segmentation, 8-bit single channel, cropped to the input size
//
// Images whose longer side exceeds target_size are additionally resized to a
// small copy that is fed to the network together with the full-size image;
// smaller images are used as both inputs. Both inputs are letterbox padded.
// Always returns 0; the process exits with -1 if the model files fail to load.
static int detect_rvm(const cv::Mat& bgr, cv::Mat& fgr, cv::Mat& pha, cv::Mat& seg)
{
    ncnn::Net rvm;

    rvm.opt.use_vulkan_compute = true;

    // https://github.com/nihui/ncnn-android-rvm/tree/master/app/src/main/assets
    // you shall also change r1,r2,r3,r4 shape below when model changed
    if (rvm.load_param("rvm_mobilenetv3.ncnn.param"))
        exit(-1);
    if (rvm.load_model("rvm_mobilenetv3.ncnn.bin"))
        exit(-1);
    // if (rvm.load_param("rvm_resnet50.ncnn.param"))
    //     exit(-1);
    // if (rvm.load_model("rvm_resnet50.ncnn.bin"))
    //     exit(-1);

    const int w = bgr.cols;
    const int h = bgr.rows;

    const int target_size = 512;
    const int max_stride = 16;

    bool refine_deep = true;
    // bool refine_fast = true;

    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};

    ncnn::Mat in_pad;
    ncnn::Mat in_small_pad;

    // total letterbox padding added to the full-size input; also used to crop
    // the outputs back to the original size
    int wpad = 0;
    int hpad = 0;

    bool downsample = std::max(w, h) > target_size;
    if (downsample)
    {
        // resize so the longer side becomes target_size
        int w2 = w;
        int h2 = h;
        float scale = 1.f;
        if (w > h)
        {
            scale = (float)target_size / w;
            w2 = target_size;
            h2 = h2 * scale;
        }
        else
        {
            scale = (float)target_size / h;
            h2 = target_size;
            w2 = w2 * scale;
        }

        ncnn::Mat in_small = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, w, h, w2, h2);

        // letterbox pad the small input to a multiple of max_stride
        int w2pad = (w2 + max_stride - 1) / max_stride * max_stride - w2;
        int h2pad = (h2 + max_stride - 1) / max_stride * max_stride - h2;
        ncnn::copy_make_border(in_small, in_small_pad, h2pad / 2, h2pad - h2pad / 2, w2pad / 2, w2pad - w2pad / 2, ncnn::BORDER_CONSTANT, 114.f);

        in_small_pad.substract_mean_normalize(0, norm_vals);

        // pad the full-size input by the amount that corresponds to the padded
        // small input. The padded size is mapped back with a truncating
        // float division, which can land one pixel below the original size
        // (e.g. 1999x1000), so the padding is clamped to be non-negative.
        if (w > h)
        {
            const int h3 = in_small_pad.h / scale;
            wpad = 0;
            hpad = std::max(h3 - h, 0);
        }
        else
        {
            const int w3 = in_small_pad.w / scale;
            wpad = std::max(w3 - w, 0);
            hpad = 0;
        }

        ncnn::Mat in = ncnn::Mat::from_pixels(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, w, h);

        ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

        in_pad.substract_mean_normalize(0, norm_vals);
    }
    else
    {
        ncnn::Mat in = ncnn::Mat::from_pixels(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, w, h);

        // letterbox pad to a multiple of max_stride
        wpad = (w + max_stride - 1) / max_stride * max_stride - w;
        hpad = (h + max_stride - 1) / max_stride * max_stride - h;
        ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

        in_pad.substract_mean_normalize(0, norm_vals);

        // no separate small input: the padded image is used for both
        in_small_pad = in_pad;
    }

    // rvm_mobilenetv3
    ncnn::Mat r1(in_small_pad.w / 2, in_small_pad.h / 2, 16);
    ncnn::Mat r2(in_small_pad.w / 4, in_small_pad.h / 4, 20);
    ncnn::Mat r3(in_small_pad.w / 8, in_small_pad.h / 8, 40);
    ncnn::Mat r4(in_small_pad.w / 16, in_small_pad.h / 16, 64);

    // rvm_resnet50
    // ncnn::Mat r1(in_small_pad.w / 2, in_small_pad.h / 2, 16);
    // ncnn::Mat r2(in_small_pad.w / 4, in_small_pad.h / 4, 32);
    // ncnn::Mat r3(in_small_pad.w / 8, in_small_pad.h / 8, 64);
    // ncnn::Mat r4(in_small_pad.w / 16, in_small_pad.h / 16, 128);

    // the recurrent inputs start as zeros
    r1.fill(0.f);
    r2.fill(0.f);
    r3.fill(0.f);
    r4.fill(0.f);

    ncnn::Extractor ex = rvm.create_extractor();

    ex.input("in0", in_pad);
    ex.input("in1", in_small_pad);

    ex.input("in2", r1);
    ex.input("in3", r2);
    ex.input("in4", r3);
    ex.input("in5", r4);

    ncnn::Mat out_fgr;
    ncnn::Mat out_pha;

    if (downsample)
    {
        if (refine_deep)
        {
            // downsample + refine deep
            ex.extract("out2", out_fgr);
            ex.extract("out3", out_pha);
        }
        else // if (refine_fast)
        {
            // downsample + refine fast
            ex.extract("out4", out_fgr);
            ex.extract("out5", out_pha);
        }
    }
    else
    {
        // no downsample
        ex.extract("out0", out_fgr);
        ex.extract("out1", out_pha);
    }

    ncnn::Mat out_seg;

    // segmentation
    ex.extract("out6", out_seg);

    // feats
    ex.extract("out7", r1);
    ex.extract("out8", r2);
    ex.extract("out9", r3);
    ex.extract("out10", r4);

    // scale the outputs by 255 before converting them to 8-bit pixels
    const float denorm_vals[3] = {255.f, 255.f, 255.f};

    out_fgr.substract_mean_normalize(0, denorm_vals);
    fgr.create(out_fgr.h, out_fgr.w, CV_8UC3);
    out_fgr.to_pixels(fgr.data, ncnn::Mat::PIXEL_RGB2BGR);

    out_pha.substract_mean_normalize(0, denorm_vals);
    pha.create(out_pha.h, out_pha.w, CV_8UC1);
    out_pha.to_pixels(pha.data, ncnn::Mat::PIXEL_GRAY);

    // the segmentation output is resized to the padded full-size input
    out_seg.substract_mean_normalize(0, denorm_vals);
    seg.create(in_pad.h, in_pad.w, CV_8UC1);
    out_seg.to_pixels_resize(seg.data, ncnn::Mat::PIXEL_GRAY, in_pad.w, in_pad.h);

    // cut letterbox pad
    fgr = fgr(cv::Rect(wpad / 2, hpad / 2, w, h));
    pha = pha(cv::Rect(wpad / 2, hpad / 2, w, h));
    seg = seg(cv::Rect(wpad / 2, hpad / 2, w, h));

    return 0;
}

// Alpha-blend the first rows x cols pixels of a 3-channel 8-bit image over a
// constant (155, 255, 120) backdrop, using a 1-channel 8-bit mask as alpha.
// Returns a freshly allocated CV_8UC3 image.
static cv::Mat blend_over_backdrop(const cv::Mat& color, const cv::Mat& mask, int cols, int rows)
{
    cv::Mat blended(rows, cols, CV_8UC3);

    for (int row = 0; row < rows; row++)
    {
        const uchar* src = color.ptr<const uchar>(row);
        const uchar* msk = mask.ptr<const uchar>(row);
        uchar* dst = blended.ptr<uchar>(row);

        for (int col = 0; col < cols; col++)
        {
            // mask value 255 keeps the source pixel, 0 shows only the backdrop
            const float alpha = msk[col] / 255.f;
            const int k = col * 3;

            dst[k + 0] = cv::saturate_cast<uchar>(src[k + 0] * alpha + (1 - alpha) * 155);
            dst[k + 1] = cv::saturate_cast<uchar>(src[k + 1] * alpha + (1 - alpha) * 255);
            dst[k + 2] = cv::saturate_cast<uchar>(src[k + 2] * alpha + (1 - alpha) * 120);
        }
    }

    return blended;
}

// Show two composites and block until a key is pressed:
//   "comp"     - fgr blended over the backdrop using pha as alpha
//   "comp_seg" - the input image bgr blended over the backdrop using seg as alpha
// All four images are indexed with the width and height of bgr.
static void draw_objects(const cv::Mat& bgr, const cv::Mat& fgr, const cv::Mat& pha, const cv::Mat& seg)
{
    const int cols = bgr.cols;
    const int rows = bgr.rows;

    // composite
    cv::Mat comp = blend_over_backdrop(fgr, pha, cols, rows);

    // composite seg
    cv::Mat comp_seg = blend_over_backdrop(bgr, seg, cols, rows);

    cv::imshow("comp", comp);
    cv::imshow("comp_seg", comp_seg);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    cv::Mat fgr;
    cv::Mat pha;
    cv::Mat seg;
    detect_rvm(m, fgr, pha, seg);

    draw_objects(m, fgr, pha, seg);

    return 0;
}


================================================
FILE: examples/scrfd.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdio.h>
#include <vector>

// One detection candidate: a floating-point box together with its score.
struct FaceObject
{
    // box stored as x, y, width, height
    cv::Rect_<float> rect;
    // score read from the network's score blob for this box
    float prob;
};

static inline float intersection_area(const FaceObject& a, const FaceObject& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Recursive quicksort of faceobjects[left..right] (both bounds inclusive) into
// descending order of prob. The two halves are handed to separate OpenMP
// sections when OpenMP is enabled.
static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects, int left, int right)
{
    // pivot is the score of the middle element
    const float pivot = faceobjects[(left + right) / 2].prob;

    int lo = left;
    int hi = right;

    for (; lo <= hi;)
    {
        // skip elements that are already on the correct side of the pivot
        for (; faceobjects[lo].prob > pivot; ++lo) {}
        for (; faceobjects[hi].prob < pivot; --hi) {}

        if (lo > hi)
            break;

        std::swap(faceobjects[lo], faceobjects[hi]);
        ++lo;
        --hi;
    }

    #pragma omp parallel sections
    {
        #pragma omp section
        {
            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
        }
        #pragma omp section
        {
            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
        }
    }
}

// Sort the whole vector by descending prob; an empty vector is left alone.
static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects)
{
    if (!faceobjects.empty())
        qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
}

static void nms_sorted_bboxes(const std::vector<FaceObject>& faceobjects, std::vector<int>& picked, float nms_threshold)
{
    picked.clear();

    const int n = faceobjects.size();

    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = faceobjects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const FaceObject& a = faceobjects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const FaceObject& b = faceobjects[picked[j]];

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            //             float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
                keep = 0;
        }

        if (keep)
            picked.push_back(i);
    }
}

// insightface/detection/scrfd/mmdet/core/anchor/anchor_generator.py gen_single_level_base_anchors()
//
// Build the base anchor boxes, all centred on (0, 0), one per (ratio, scale)
// pair. The result has ratios.w * scales.w rows of four floats: x0, y0, x1, y1.
// Rows are ordered ratio-major, scale-minor.
static ncnn::Mat generate_anchors(int base_size, const ncnn::Mat& ratios, const ncnn::Mat& scales)
{
    const int ratio_count = ratios.w;
    const int scale_count = scales.w;

    ncnn::Mat anchors;
    anchors.create(4, ratio_count * scale_count);

    // shared centre of every base anchor
    const float cx = 0;
    const float cy = 0;

    int row_index = 0;
    for (int ri = 0; ri < ratio_count; ri++)
    {
        const float ar = ratios[ri];

        // integer side lengths for this aspect ratio
        const int r_w = round(base_size / sqrt(ar));
        const int r_h = round(r_w * ar); //round(base_size * sqrt(ar));

        for (int si = 0; si < scale_count; si++, row_index++)
        {
            const float half_w = r_w * scales[si] * 0.5f;
            const float half_h = r_h * scales[si] * 0.5f;

            float* box = anchors.row(row_index);
            box[0] = cx - half_w;
            box[1] = cy - half_h;
            box[2] = cx + half_w;
            box[3] = cy + half_h;
        }
    }

    return anchors;
}

// Decode one detection head into boxes and append them to faceobjects.
//
// anchors        - base anchors, one row of x0, y0, x1, y1 per anchor
// feat_stride    - step between neighbouring feature-map cells; also the
//                  multiplier applied to the raw box distances
// score_blob     - one channel of scores per anchor
// bbox_blob      - four channels (left, top, right, bottom distance) per anchor
// prob_threshold - cells with a score below this value are skipped
// faceobjects    - output; new boxes are appended, existing entries are kept
static void generate_proposals(const ncnn::Mat& anchors, int feat_stride, const ncnn::Mat& score_blob, const ncnn::Mat& bbox_blob, float prob_threshold, std::vector<FaceObject>& faceobjects)
{
    const int cols = score_blob.w;
    const int rows = score_blob.h;

    // generate face proposal from bbox deltas and shifted anchors
    const int anchor_count = anchors.h;

    for (int a = 0; a < anchor_count; a++)
    {
        const float* base = anchors.row(a);

        const ncnn::Mat score = score_blob.channel(a);
        const ncnn::Mat bbox = bbox_blob.channel_range(a * 4, 4);

        const float base_w = base[2] - base[0];
        const float base_h = base[3] - base[1];

        // the anchor is shifted by feat_stride per cell, row by row
        float anchor_y = base[1];
        for (int r = 0; r < rows; r++, anchor_y += feat_stride)
        {
            float anchor_x = base[0];
            for (int c = 0; c < cols; c++, anchor_x += feat_stride)
            {
                const int index = r * cols + c;

                const float prob = score[index];
                if (!(prob >= prob_threshold))
                    continue;

                // insightface/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py _get_bboxes_single()
                const float dist_left = bbox.channel(0)[index] * feat_stride;
                const float dist_top = bbox.channel(1)[index] * feat_stride;
                const float dist_right = bbox.channel(2)[index] * feat_stride;
                const float dist_bottom = bbox.channel(3)[index] * feat_stride;

                // insightface/detection/scrfd/mmdet/core/bbox/transforms.py distance2bbox()
                const float cx = anchor_x + base_w * 0.5f;
                const float cy = anchor_y + base_h * 0.5f;

                const float x0 = cx - dist_left;
                const float y0 = cy - dist_top;
                const float x1 = cx + dist_right;
                const float y1 = cy + dist_bottom;

                FaceObject candidate;
                candidate.rect.x = x0;
                candidate.rect.y = y0;
                candidate.rect.width = x1 - x0 + 1;
                candidate.rect.height = y1 - y0 + 1;
                candidate.prob = prob;

                faceobjects.push_back(candidate);
            }
        }
    }
}

// Detect faces in an image with the scrfd_500m-opt2 ncnn model.
//
// bgr         - input image; bgr.data is read as packed 8-bit BGR pixels
// faceobjects - output; resized to the number of boxes that survive NMS, with
//               boxes mapped back to the original image and clipped to it
//
// Returns 0. The process exits with -1 when the param or bin file cannot be
// loaded from the current directory.
static int detect_scrfd(const cv::Mat& bgr, std::vector<FaceObject>& faceobjects)
{
    ncnn::Net scrfd;

    scrfd.opt.use_vulkan_compute = true;

    // model is converted from
    // https://github.com/deepinsight/insightface/tree/master/detection/scrfd
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    if (scrfd.load_param("scrfd_500m-opt2.param"))
        exit(-1);
    if (scrfd.load_model("scrfd_500m-opt2.bin"))
        exit(-1);

    int width = bgr.cols;
    int height = bgr.rows;

    // insightface/detection/scrfd/configs/scrfd/scrfd_500m.py
    const int target_size = 640;
    const float prob_threshold = 0.3f;
    const float nms_threshold = 0.45f;

    // resize so that the longer side becomes target_size, keeping aspect ratio
    int w = width;
    int h = height;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, width, height, w, h);

    // pad both dimensions up to a multiple of 32, splitting the border evenly
    int wpad = (w + 31) / 32 * 32 - w;
    int hpad = (h + 31) / 32 * 32 - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 0.f);

    const float mean_vals[3] = {127.5f, 127.5f, 127.5f};
    const float norm_vals[3] = {1 / 128.f, 1 / 128.f, 1 / 128.f};
    in_pad.substract_mean_normalize(mean_vals, norm_vals);

    ncnn::Extractor ex = scrfd.create_extractor();

    ex.input("input.1", in_pad);

    // every head uses a single aspect ratio and two scales per location
    ncnn::Mat ratios(1);
    ratios[0] = 1.f;
    ncnn::Mat scales(2);
    scales[0] = 1.f;
    scales[1] = 2.f;

    // one entry per detection head, in extraction order (stride 8, 16, 32)
    static const struct
    {
        const char* score_name;
        const char* bbox_name;
        int base_size;
        int feat_stride;
    } heads[] = {
        {"412", "415", 16, 8},
        {"474", "477", 64, 16},
        {"536", "539", 256, 32},
    };
    const int num_heads = sizeof(heads) / sizeof(heads[0]);

    std::vector<FaceObject> faceproposals;

    for (int k = 0; k < num_heads; k++)
    {
        ncnn::Mat score_blob, bbox_blob;
        ex.extract(heads[k].score_name, score_blob);
        ex.extract(heads[k].bbox_name, bbox_blob);

        ncnn::Mat anchors = generate_anchors(heads[k].base_size, ratios, scales);

        // generate_proposals appends, so the proposals of all heads
        // accumulate in faceproposals in head order
        generate_proposals(anchors, heads[k].feat_stride, score_blob, bbox_blob, prob_threshold, faceproposals);
    }

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(faceproposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(faceproposals, picked, nms_threshold);

    int face_count = picked.size();

    faceobjects.resize(face_count);
    for (int i = 0; i < face_count; i++)
    {
        faceobjects[i] = faceproposals[picked[i]];

        // undo the padding offset and the resize to get original image coordinates
        float x0 = (faceobjects[i].rect.x - (wpad / 2)) / scale;
        float y0 = (faceobjects[i].rect.y - (hpad / 2)) / scale;
        float x1 = (faceobjects[i].rect.x + faceobjects[i].rect.width - (wpad / 2)) / scale;
        float y1 = (faceobjects[i].rect.y + faceobjects[i].rect.height - (hpad / 2)) / scale;

        // clip to the image
        x0 = std::max(std::min(x0, (float)width - 1), 0.f);
        y0 = std::max(std::min(y0, (float)height - 1), 0.f);
        x1 = std::max(std::min(x1, (float)width - 1), 0.f);
        y1 = std::max(std::min(y1, (float)height - 1), 0.f);

        faceobjects[i].rect.x = x0;
        faceobjects[i].rect.y = y0;
        faceobjects[i].rect.width = x1 - x0;
        faceobjects[i].rect.height = y1 - y0;
    }

    return 0;
}

// Print every detection to stderr, draw its box and a percentage label on a
// copy of bgr, then show the result and wait for a key press.
static void draw_faceobjects(const cv::Mat& bgr, const std::vector<FaceObject>& faceobjects)
{
    // draw on a copy, the input image is const
    cv::Mat canvas = bgr.clone();

    for (std::vector<FaceObject>::const_iterator it = faceobjects.begin(); it != faceobjects.end(); ++it)
    {
        const FaceObject& face = *it;

        fprintf(stderr, "%.5f at %.2f %.2f %.2f x %.2f\n", face.prob,
                face.rect.x, face.rect.y, face.rect.width, face.rect.height);

        cv::rectangle(canvas, face.rect, cv::Scalar(0, 255, 0));

        char label[256];
        sprintf(label, "%.1f%%", face.prob * 100);

        int baseline = 0;
        const cv::Size text_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseline);

        // the label sits directly above the box, kept inside the top and right edges
        int label_x = face.rect.x;
        int label_y = face.rect.y - text_size.height - baseline;
        label_y = std::max(label_y, 0);
        label_x = std::min(label_x, canvas.cols - text_size.width);

        // white filled background, then black text on top
        const cv::Rect background(cv::Point(label_x, label_y), cv::Size(text_size.width, text_size.height + baseline));
        cv::rectangle(canvas, background, cv::Scalar(255, 255, 255), -1);

        cv::putText(canvas, label, cv::Point(label_x, label_y + text_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", canvas);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<FaceObject> faceobjects;
    detect_scrfd(m, faceobjects);

    draw_faceobjects(m, faceobjects);

    return 0;
}


================================================
FILE: examples/scrfd_crowdhuman.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdio.h>
#include <vector>

// One detection candidate: a floating-point box together with its score.
struct FaceObject
{
    // box stored as x, y, width, height
    cv::Rect_<float> rect;
    // score read from the network's score blob for this box
    float prob;
};

static inline float intersection_area(const FaceObject& a, const FaceObject& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Recursive quicksort of faceobjects[left..right] (both bounds inclusive) into
// descending order of prob. The two halves are handed to separate OpenMP
// sections when OpenMP is enabled.
static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects, int left, int right)
{
    // pivot is the score of the middle element
    const float pivot = faceobjects[(left + right) / 2].prob;

    int lo = left;
    int hi = right;

    for (; lo <= hi;)
    {
        // skip elements that are already on the correct side of the pivot
        for (; faceobjects[lo].prob > pivot; ++lo) {}
        for (; faceobjects[hi].prob < pivot; --hi) {}

        if (lo > hi)
            break;

        std::swap(faceobjects[lo], faceobjects[hi]);
        ++lo;
        --hi;
    }

    #pragma omp parallel sections
    {
        #pragma omp section
        {
            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
        }
        #pragma omp section
        {
            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
        }
    }
}

// Sort the whole vector by descending prob; an empty vector is left alone.
static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects)
{
    if (!faceobjects.empty())
        qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
}

static void nms_sorted_bboxes(const std::vector<FaceObject>& faceobjects, std::vector<int>& picked, float nms_threshold)
{
    picked.clear();

    const int n = faceobjects.size();

    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = faceobjects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const FaceObject& a = faceobjects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const FaceObject& b = faceobjects[picked[j]];

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            //             float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
                keep = 0;
        }

        if (keep)
            picked.push_back(i);
    }
}

// insightface/detection/scrfd/mmdet/core/anchor/anchor_generator.py gen_single_level_base_anchors()
//
// Build the base anchor boxes, all centred on (0, 0), one per (ratio, scale)
// pair. The result has ratios.w * scales.w rows of four floats: x0, y0, x1, y1.
// Rows are ordered ratio-major, scale-minor.
static ncnn::Mat generate_anchors(int base_size, const ncnn::Mat& ratios, const ncnn::Mat& scales)
{
    const int ratio_count = ratios.w;
    const int scale_count = scales.w;

    ncnn::Mat anchors;
    anchors.create(4, ratio_count * scale_count);

    // shared centre of every base anchor
    const float cx = 0;
    const float cy = 0;

    int row_index = 0;
    for (int ri = 0; ri < ratio_count; ri++)
    {
        const float ar = ratios[ri];

        // integer side lengths for this aspect ratio
        const int r_w = round(base_size / sqrt(ar));
        const int r_h = round(r_w * ar); //round(base_size * sqrt(ar));

        for (int si = 0; si < scale_count; si++, row_index++)
        {
            const float half_w = r_w * scales[si] * 0.5f;
            const float half_h = r_h * scales[si] * 0.5f;

            float* box = anchors.row(row_index);
            box[0] = cx - half_w;
            box[1] = cy - half_h;
            box[2] = cx + half_w;
            box[3] = cy + half_h;
        }
    }

    return anchors;
}

// Decode one detection head into boxes and append them to faceobjects.
//
// anchors        - base anchors, one row of x0, y0, x1, y1 per anchor
// feat_stride    - step between neighbouring feature-map cells; also the
//                  multiplier applied to the raw box distances
// score_blob     - one channel of scores per anchor
// bbox_blob      - four channels (left, top, right, bottom distance) per anchor
// prob_threshold - cells with a score below this value are skipped
// faceobjects    - output; new boxes are appended, existing entries are kept
static void generate_proposals(const ncnn::Mat& anchors, int feat_stride, const ncnn::Mat& score_blob, const ncnn::Mat& bbox_blob, float prob_threshold, std::vector<FaceObject>& faceobjects)
{
    const int cols = score_blob.w;
    const int rows = score_blob.h;

    // generate face proposal from bbox deltas and shifted anchors
    const int anchor_count = anchors.h;

    for (int a = 0; a < anchor_count; a++)
    {
        const float* base = anchors.row(a);

        const ncnn::Mat score = score_blob.channel(a);
        const ncnn::Mat bbox = bbox_blob.channel_range(a * 4, 4);

        const float base_w = base[2] - base[0];
        const float base_h = base[3] - base[1];

        // the anchor is shifted by feat_stride per cell, row by row
        float anchor_y = base[1];
        for (int r = 0; r < rows; r++, anchor_y += feat_stride)
        {
            float anchor_x = base[0];
            for (int c = 0; c < cols; c++, anchor_x += feat_stride)
            {
                const int index = r * cols + c;

                const float prob = score[index];
                if (!(prob >= prob_threshold))
                    continue;

                // insightface/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py _get_bboxes_single()
                const float dist_left = bbox.channel(0)[index] * feat_stride;
                const float dist_top = bbox.channel(1)[index] * feat_stride;
                const float dist_right = bbox.channel(2)[index] * feat_stride;
                const float dist_bottom = bbox.channel(3)[index] * feat_stride;

                // insightface/detection/scrfd/mmdet/core/bbox/transforms.py distance2bbox()
                const float cx = anchor_x + base_w * 0.5f;
                const float cy = anchor_y + base_h * 0.5f;

                const float x0 = cx - dist_left;
                const float y0 = cy - dist_top;
                const float x1 = cx + dist_right;
                const float y1 = cy + dist_bottom;

                FaceObject candidate;
                candidate.rect.x = x0;
                candidate.rect.y = y0;
                candidate.rect.width = x1 - x0 + 1;
                candidate.rect.height = y1 - y0 + 1;
                candidate.prob = prob;

                faceobjects.push_back(candidate);
            }
        }
    }
}

// Run the scrfd_crowdhuman ncnn model on an image and collect its detections.
//
// bgr         - input image; bgr.data is read as packed 8-bit BGR pixels
// faceobjects - output; resized to the number of boxes that survive NMS, with
//               boxes mapped back to the original image and clipped to it
//
// Returns 0. The process exits with -1 when the param or bin file cannot be
// loaded from the current directory.
static int detect_scrfd(const cv::Mat& bgr, std::vector<FaceObject>& faceobjects)
{
    ncnn::Net scrfd;

    scrfd.opt.use_vulkan_compute = true;

    // InsightFace does not provide a trained scrfd_crowdhuman model,
    // but I have one for detecting cat faces, you can give it a try here:
    // https://drive.google.com/file/d/1JogkKa0f_09HkENbCnXy9hRYxm35wKTn

    if (scrfd.load_param("scrfd_crowdhuman.param"))
        exit(-1);
    if (scrfd.load_model("scrfd_crowdhuman.bin"))
        exit(-1);

    int width = bgr.cols;
    int height = bgr.rows;

    const int target_size = 640;
    const float prob_threshold = 0.3f;
    const float nms_threshold = 0.45f;

    // resize so that the longer side becomes target_size, keeping aspect ratio
    int w = width;
    int h = height;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, width, height, w, h);

    // pad both dimensions up to a multiple of 32, splitting the border evenly
    int wpad = (w + 31) / 32 * 32 - w;
    int hpad = (h + 31) / 32 * 32 - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 0.f);

    const float mean_vals[3] = {127.5f, 127.5f, 127.5f};
    const float norm_vals[3] = {1 / 128.f, 1 / 128.f, 1 / 128.f};
    in_pad.substract_mean_normalize(mean_vals, norm_vals);

    ncnn::Extractor ex = scrfd.create_extractor();

    ex.input("input.1", in_pad);

    // every head uses a single aspect ratio and a single scale per location
    ncnn::Mat ratios(1);
    ratios[0] = 2.f;
    ncnn::Mat scales(1);
    scales[0] = 3.f;

    // one entry per detection head, in extraction order (stride 8 to 128)
    static const struct
    {
        const char* score_name;
        const char* bbox_name;
        int base_size;
        int feat_stride;
    } heads[] = {
        {"490", "493", 8, 8},
        {"510", "513", 16, 16},
        {"530", "533", 32, 32},
        {"550", "553", 64, 64},
        {"570", "573", 128, 128},
    };
    const int num_heads = sizeof(heads) / sizeof(heads[0]);

    std::vector<FaceObject> faceproposals;

    for (int k = 0; k < num_heads; k++)
    {
        ncnn::Mat score_blob, bbox_blob;
        ex.extract(heads[k].score_name, score_blob);
        ex.extract(heads[k].bbox_name, bbox_blob);

        ncnn::Mat anchors = generate_anchors(heads[k].base_size, ratios, scales);

        // generate_proposals appends, so the proposals of all heads
        // accumulate in faceproposals in head order
        generate_proposals(anchors, heads[k].feat_stride, score_blob, bbox_blob, prob_threshold, faceproposals);
    }

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(faceproposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(faceproposals, picked, nms_threshold);

    int face_count = picked.size();

    faceobjects.resize(face_count);
    for (int i = 0; i < face_count; i++)
    {
        faceobjects[i] = faceproposals[picked[i]];

        // undo the padding offset and the resize to get original image coordinates
        float x0 = (faceobjects[i].rect.x - (wpad / 2)) / scale;
        float y0 = (faceobjects[i].rect.y - (hpad / 2)) / scale;
        float x1 = (faceobjects[i].rect.x + faceobjects[i].rect.width - (wpad / 2)) / scale;
        float y1 = (faceobjects[i].rect.y + faceobjects[i].rect.height - (hpad / 2)) / scale;

        // clip to the image
        x0 = std::max(std::min(x0, (float)width - 1), 0.f);
        y0 = std::max(std::min(y0, (float)height - 1), 0.f);
        x1 = std::max(std::min(x1, (float)width - 1), 0.f);
        y1 = std::max(std::min(y1, (float)height - 1), 0.f);

        faceobjects[i].rect.x = x0;
        faceobjects[i].rect.y = y0;
        faceobjects[i].rect.width = x1 - x0;
        faceobjects[i].rect.height = y1 - y0;
    }

    return 0;
}

// Print every detection to stderr, draw its box and a percentage label on a
// copy of bgr, then show the result and wait for a key press.
static void draw_faceobjects(const cv::Mat& bgr, const std::vector<FaceObject>& faceobjects)
{
    // draw on a copy, the input image is const
    cv::Mat canvas = bgr.clone();

    for (std::vector<FaceObject>::const_iterator it = faceobjects.begin(); it != faceobjects.end(); ++it)
    {
        const FaceObject& face = *it;

        fprintf(stderr, "%.5f at %.2f %.2f %.2f x %.2f\n", face.prob,
                face.rect.x, face.rect.y, face.rect.width, face.rect.height);

        cv::rectangle(canvas, face.rect, cv::Scalar(0, 255, 0));

        char label[256];
        sprintf(label, "%.1f%%", face.prob * 100);

        int baseline = 0;
        const cv::Size text_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseline);

        // the label sits directly above the box, kept inside the top and right edges
        int label_x = face.rect.x;
        int label_y = face.rect.y - text_size.height - baseline;
        label_y = std::max(label_y, 0);
        label_x = std::min(label_x, canvas.cols - text_size.width);

        // white filled background, then black text on top
        const cv::Rect background(cv::Point(label_x, label_y), cv::Size(text_size.width, text_size.height + baseline));
        cv::rectangle(canvas, background, cv::Scalar(255, 255, 255), -1);

        cv::putText(canvas, label, cv::Point(label_x, label_y + text_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", canvas);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<FaceObject> faceobjects;
    detect_scrfd(m, faceobjects);

    draw_faceobjects(m, faceobjects);

    return 0;
}


================================================
FILE: examples/shufflenetv2.cpp
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#include <algorithm>
#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#endif
#include <stdio.h>
#include <vector>

// Classify an image with the shufflenet_v2_x0.5 ncnn model.
//
// bgr        - input image; bgr.data is read as packed 8-bit BGR pixels
// cls_scores - output; resized to the flattened length of the "fc" output and
//              filled with the softmax of that output
//
// Returns 0. The process exits with -1 when the param or bin file cannot be
// loaded from the current directory.
static int detect_shufflenetv2(const cv::Mat& bgr, std::vector<float>& cls_scores)
{
    ncnn::Net net;

    net.opt.use_vulkan_compute = true;

    // https://github.com/miaow1988/ShuffleNet_V2_pytorch_caffe
    // models can be downloaded from https://github.com/miaow1988/ShuffleNet_V2_pytorch_caffe/releases
    if (net.load_param("shufflenet_v2_x0.5.param") != 0)
        exit(-1);
    if (net.load_model("shufflenet_v2_x0.5.bin") != 0)
        exit(-1);

    // resize to the 224x224 network input and scale pixel values by 1/255
    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, 224, 224);

    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = net.create_extractor();
    ex.input("data", in);

    ncnn::Mat out;
    ex.extract("fc", out);

    // manually call softmax on the fc output
    // convert result into probability
    // skip if your model already has softmax operation
    {
        ncnn::Layer* softmax = ncnn::create_layer("Softmax");

        ncnn::ParamDict pd;
        softmax->load_param(pd);

        softmax->forward_inplace(out, net.opt);

        delete softmax;
    }

    // flatten to one dimension, then copy element by element
    const int total = out.w * out.h * out.c;
    out = out.reshape(total);

    const int num_scores = out.w;
    cls_scores.resize(num_scores);
    for (int k = 0; k < num_scores; k++)
        cls_scores[k] = out[k];

    return 0;
}

// Print the topk highest scores to stderr, best first, one "index = score"
// line per entry.
//
// cls_scores - scores; the position in the vector is the printed index
// topk       - number of entries to print; clamped to [0, cls_scores.size()]
//              so that fewer scores than requested is not an error
//
// Always returns 0.
static int print_topk(const std::vector<float>& cls_scores, int topk)
{
    const int size = cls_scores.size();

    // std::partial_sort requires the middle iterator to lie inside
    // [first, last]; an unclamped topk would also read past the end below
    if (topk > size)
        topk = size;
    if (topk < 0)
        topk = 0;

    // partial sort topk with index
    std::vector<std::pair<float, int> > vec(size);
    for (int i = 0; i < size; i++)
    {
        vec[i] = std::make_pair(cls_scores[i], i);
    }

    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
                      std::greater<std::pair<float, int> >());

    // print topk and score
    for (int i = 0; i < topk; i++)
    {
        float score = vec[i].first;
        int index = vec[i].second;
        fprintf(stderr, "%d = %f\n", index, score);
    }

    return 0;
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<float> cls_scores;
    detect_shufflenetv2(m, cls_scores);

    print_topk(cls_scores, 3);

    return 0;
}


================================================
FILE: examples/simplepose.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#include <algorithm>
#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdio.h>
#include <vector>

// A 2D point paired with a score.
// NOTE(review): presumably one entry per heatmap channel, filled by
// detect_posenet from the per-channel maximum - confirm against the rest of
// that function.
struct KeyPoint
{
    // point coordinates (float x, y)
    cv::Point2f p;
    // score attached to the point
    float prob;
};

// Run the pose network on `bgr` and fill `keypoints` with one entry per
// channel of the "conv3_fwd" output: the position of the channel's largest
// value (rescaled to the source image size) and that value.
// Calls exit(-1) if either model file fails to load. Returns 0.
static int detect_posenet(const cv::Mat& bgr, std::vector<KeyPoint>& keypoints)
{
    ncnn::Net posenet;

    posenet.opt.use_vulkan_compute = true;

    // the simple baseline human pose estimation from gluon-cv
    // https://gluon-cv.mxnet.io/build/examples_pose/demo_simple_pose.html
    // mxnet model exported via
    //      pose_net.hybridize()
    //      pose_net.export('pose')
    // then mxnet2ncnn
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    // (|| short-circuits, so the weights are only read after the param file loaded)
    if (posenet.load_param("pose.param") || posenet.load_model("pose.bin"))
        exit(-1);

    const int img_w = bgr.cols;
    const int img_h = bgr.rows;

    // network input is 192 wide by 256 high, converted BGR -> RGB
    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, 192, 256);

    // transforms.ToTensor(),
    // transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    // R' = (R / 255 - 0.485) / 0.229 = (R - 0.485 * 255) / 0.229 / 255
    // G' = (G / 255 - 0.456) / 0.224 = (G - 0.456 * 255) / 0.224 / 255
    // B' = (B / 255 - 0.406) / 0.225 = (B - 0.406 * 255) / 0.225 / 255
    const float mean_vals[3] = {0.485f * 255.f, 0.456f * 255.f, 0.406f * 255.f};
    const float norm_vals[3] = {1 / 0.229f / 255.f, 1 / 0.224f / 255.f, 1 / 0.225f / 255.f};
    in.substract_mean_normalize(mean_vals, norm_vals);

    ncnn::Extractor ex = posenet.create_extractor();
    ex.input("data", in);

    ncnn::Mat heatmaps;
    ex.extract("conv3_fwd", heatmaps);

    // resolve point from heatmap: one keypoint per output channel
    keypoints.clear();
    for (int joint = 0; joint < heatmaps.c; joint++)
    {
        const ncnn::Mat heatmap = heatmaps.channel(joint);

        // arg-max scan; the running best starts at 0, so only values above 0
        // can move the peak away from (0, 0)
        float best = 0.f;
        int best_x = 0;
        int best_y = 0;
        for (int row = 0; row < heatmaps.h; row++)
        {
            const float* line = heatmap.row(row);
            for (int col = 0; col < heatmaps.w; col++)
            {
                const float value = line[col];
                if (!(value > best))
                    continue;

                best = value;
                best_x = col;
                best_y = row;
            }
        }

        // map the heatmap cell back to source-image coordinates
        KeyPoint found;
        found.p = cv::Point2f(best_x * img_w / (float)heatmaps.w, best_y * img_h / (float)heatmaps.h);
        found.prob = best;

        keypoints.push_back(found);
    }

    return 0;
}

// Draw the skeleton described by `keypoints` on a copy of `bgr`, print every
// keypoint to stderr as "x y = prob", then pass the annotated copy to
// cv::imshow followed by cv::waitKey(0).
//
// bgr:       source image; it is cloned, never modified.
// keypoints: joints as produced by detect_posenet. The bone table below
//            addresses joint indices 0..16; bones that reference an index
//            not present in `keypoints` are skipped instead of being read
//            out of bounds.
// Bones and joints whose prob is below 0.2 are not drawn.
static void draw_pose(const cv::Mat& bgr, const std::vector<KeyPoint>& keypoints)
{
    cv::Mat image = bgr.clone();

    // draw bone
    static const int joint_pairs[16][2] = {
        {0, 1}, {1, 3}, {0, 2}, {2, 4}, {5, 6}, {5, 7}, {7, 9}, {6, 8}, {8, 10}, {5, 11}, {6, 12}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}
    };

    const size_t joint_count = keypoints.size();

    for (int i = 0; i < 16; i++)
    {
        const size_t a = (size_t)joint_pairs[i][0];
        const size_t b = (size_t)joint_pairs[i][1];

        // the network may have produced fewer joints than the table expects
        // (e.g. an empty result); never index past the end of the vector
        if (a >= joint_count || b >= joint_count)
            continue;

        const KeyPoint& p1 = keypoints[a];
        const KeyPoint& p2 = keypoints[b];

        if (p1.prob < 0.2f || p2.prob < 0.2f)
            continue;

        cv::line(image, p1.p, p2.p, cv::Scalar(255, 0, 0), 2);
    }

    // draw joint
    for (size_t i = 0; i < joint_count; i++)
    {
        const KeyPoint& keypoint = keypoints[i];

        // every joint is logged, even those too weak to be drawn
        fprintf(stderr, "%.2f %.2f = %.5f\n", keypoint.p.x, keypoint.p.y, keypoint.prob);

        if (keypoint.prob < 0.2f)
            continue;

        cv::circle(image, keypoint.p, 3, cv::Scalar(0, 255, 0), -1);
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<KeyPoint> keypoints;
    detect_posenet(m, keypoints);

    draw_pose(m, keypoints);

    return 0;
}


================================================
FILE: examples/squeezencnn/README.md
================================================
The SqueezeNet Android example project has moved to https://github.com/nihui/ncnn-android-squeezenet


================================================
FILE: examples/squeezenet.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#include <algorithm>
#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#endif
#include <stdio.h>
#include <vector>

// Classify `bgr` with SqueezeNet v1.1 and copy the values of the "prob"
// output blob into `cls_scores` (one entry per element along the blob's w).
// Calls exit(-1) if either model file fails to load. Returns 0.
static int detect_squeezenet(const cv::Mat& bgr, std::vector<float>& cls_scores)
{
    ncnn::Net squeezenet;

    squeezenet.opt.use_vulkan_compute = true;

    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    // (|| short-circuits, so the weights are only read after the param file loaded)
    if (squeezenet.load_param("squeezenet_v1.1.param") || squeezenet.load_model("squeezenet_v1.1.bin"))
        exit(-1);

    // the image is resized to a square input of this edge length
    const int input_size = 227;
    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, input_size, input_size);

    // per-channel mean subtraction only; the null pointer skips normalization
    const float mean_vals[3] = {104.f, 117.f, 123.f};
    in.substract_mean_normalize(mean_vals, 0);

    ncnn::Extractor ex = squeezenet.create_extractor();
    ex.input("data", in);

    ncnn::Mat prob;
    ex.extract("prob", prob);

    // copy the blob out so the caller does not depend on ncnn::Mat
    const int num_scores = prob.w;
    cls_scores.resize(num_scores);
    for (int k = 0; k < num_scores; k++)
    {
        cls_scores[k] = prob[k];
    }

    return 0;
}

// Print the `topk` highest-scoring classes to stderr, best first, one
// "index = score" line per class.
//
// cls_scores: one score per class; the element position is the class index.
// topk:       number of entries to print. It is clamped to
//             [0, cls_scores.size()] so that a score vector shorter than
//             topk (including an empty one) never moves the partial_sort
//             middle iterator past end().
// Returns 0 always.
static int print_topk(const std::vector<float>& cls_scores, int topk)
{
    const int size = (int)cls_scores.size();

    // never request more entries than exist, nor a negative count
    if (topk > size)
        topk = size;
    if (topk < 0)
        topk = 0;

    // pair every score with its class index so the index survives sorting
    std::vector<std::pair<float, int> > vec;
    vec.resize(size);
    for (int i = 0; i < size; i++)
    {
        vec[i] = std::make_pair(cls_scores[i], i);
    }

    // partial sort topk with index, highest score first
    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
                      std::greater<std::pair<float, int> >());

    // print topk and score
    for (int i = 0; i < topk; i++)
    {
        float score = vec[i].first;
        int index = vec[i].second;
        fprintf(stderr, "%d = %f\n", index, score);
    }

    return 0;
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<float> cls_scores;
    detect_squeezenet(m, cls_scores);

    print_topk(cls_scores, 3);

    return 0;
}


================================================
FILE: examples/squeezenet_c_api.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "c_api.h"

#include <algorithm>
#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#endif
#include <stdio.h>
#include <vector>

// Classify `bgr` with SqueezeNet v1.1 through the ncnn C API and copy the
// values of the "prob" output into `cls_scores`.
// Calls exit(-1) if either model file fails to load. Every C API object
// created here is destroyed before returning. Returns 0.
static int detect_squeezenet(const cv::Mat& bgr, std::vector<float>& cls_scores)
{
    ncnn_net_t net = ncnn_net_create();

    ncnn_option_t option = ncnn_option_create();
    ncnn_option_set_use_vulkan_compute(option, 1);
    ncnn_net_set_option(net, option);

    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    // (|| short-circuits, so the weights are only read after the param file loaded)
    if (ncnn_net_load_param(net, "squeezenet_v1.1.param") || ncnn_net_load_model(net, "squeezenet_v1.1.bin"))
        exit(-1);

    // the row stride passed here is cols * 3, i.e. 3 bytes per pixel
    const int stride = bgr.cols * 3;
    ncnn_mat_t input = ncnn_mat_from_pixels_resize(bgr.data, NCNN_MAT_PIXEL_BGR, bgr.cols, bgr.rows, stride, 227, 227, NULL);

    // per-channel mean subtraction only; the null pointer skips normalization
    const float mean_vals[3] = {104.f, 117.f, 123.f};
    ncnn_mat_substract_mean_normalize(input, mean_vals, 0);

    ncnn_extractor_t extractor = ncnn_extractor_create(net);
    ncnn_extractor_input(extractor, "data", input);

    ncnn_mat_t output;
    ncnn_extractor_extract(extractor, "prob", &output);

    // copy the scores out before the output mat is released
    const float* scores = (const float*)ncnn_mat_get_data(output);
    const int count = ncnn_mat_get_w(output);

    cls_scores.resize(count);
    for (int k = 0; k < count; k++)
    {
        cls_scores[k] = scores[k];
    }

    // release everything created above
    ncnn_mat_destroy(input);
    ncnn_mat_destroy(output);
    ncnn_extractor_destroy(extractor);
    ncnn_option_destroy(option);
    ncnn_net_destroy(net);

    return 0;
}

// Print the `topk` highest-scoring classes to stderr, best first, one
// "index = score" line per class.
//
// cls_scores: one score per class; the element position is the class index.
// topk:       number of entries to print. It is clamped to
//             [0, cls_scores.size()] so that a score vector shorter than
//             topk (including an empty one) never moves the partial_sort
//             middle iterator past end().
// Returns 0 always.
static int print_topk(const std::vector<float>& cls_scores, int topk)
{
    const int size = (int)cls_scores.size();

    // never request more entries than exist, nor a negative count
    if (topk > size)
        topk = size;
    if (topk < 0)
        topk = 0;

    // pair every score with its class index so the index survives sorting
    std::vector<std::pair<float, int> > vec;
    vec.resize(size);
    for (int i = 0; i < size; i++)
    {
        vec[i] = std::make_pair(cls_scores[i], i);
    }

    // partial sort topk with index, highest score first
    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
                      std::greater<std::pair<float, int> >());

    // print topk and score
    for (int i = 0; i < topk; i++)
    {
        float score = vec[i].first;
        int index = vec[i].second;
        fprintf(stderr, "%d = %f\n", index, score);
    }

    return 0;
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<float> cls_scores;
    detect_squeezenet(m, cls_scores);

    print_topk(cls_scores, 3);

    return 0;
}


================================================
FILE: examples/squeezenet_v1.1.param
================================================
7767517
75 83
Input            data             0 1 data 0=227 1=227 2=3
Convolution      conv1            1 1 data conv1 0=64 1=3 2=1 3=2 4=0 5=1 6=1728
ReLU             relu_conv1       1 1 conv1 conv1_relu_conv1 0=0.000000
Pooling          pool1            1 1 conv1_relu_conv1 pool1 0=0 1=3 2=2 3=0 4=0
Convolution      fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=1024
ReLU             fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1 0=0.000000
Split            splitncnn_0      1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
Convolution      fire2/expand1x1  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024
ReLU             fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1 0=0.000000
Convolution      fire2/expand3x3  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216
ReLU             fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3 0=0.000000
Concat           fire2/concat     2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat 0=0
Convolution      fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=2048
ReLU             fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1 0=0.000000
Split            splitncnn_1      1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
Convolution      fire3/expand1x1  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024
ReLU             fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1 0=0.000000
Convolution      fire3/expand3x3  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216
ReLU             fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3 0=0.000000
Concat           fire3/concat     2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat 0=0
Pooling          pool3            1 1 fire3/concat pool3 0=0 1=3 2=2 3=0 4=0
Convolution      fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=4096
ReLU             fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1 0=0.000000
Split            splitncnn_2      1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
Convolution      fire4/expand1x1  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096
ReLU             fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1 0=0.000000
Convolution      fire4/expand3x3  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864
ReLU             fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3 0=0.000000
Concat           fire4/concat     2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat 0=0
Convolution      fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=8192
ReLU             fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1 0=0.000000
Split            splitncnn_3      1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
Convolution      fire5/expand1x1  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096
ReLU             fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1 0=0.000000
Convolution      fire5/expand3x3  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864
ReLU             fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3 0=0.000000
Concat           fire5/concat     2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat 0=0
Pooling          pool5            1 1 fire5/concat pool5 0=0 1=3 2=2 3=0 4=0
Convolution      fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=12288
ReLU             fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1 0=0.000000
Split            splitncnn_4      1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
Convolution      fire6/expand1x1  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216
ReLU             fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1 0=0.000000
Convolution      fire6/expand3x3  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944
ReLU             fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3 0=0.000000
Concat           fire6/concat     2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat 0=0
Convolution      fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=18432
ReLU             fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1 0=0.000000
Split            splitncnn_5      1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
Convolution      fire7/expand1x1  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216
ReLU             fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1 0=0.000000
Convolution      fire7/expand3x3  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944
ReLU             fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3 0=0.000000
Concat           fire7/concat     2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat 0=0
Convolution      fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=24576
ReLU             fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1 0=0.000000
Split            splitncnn_6      1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
Convolution      fire8/expand1x1  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384
ReLU             fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1 0=0.000000
Convolution      fire8/expand3x3  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456
ReLU             fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3 0=0.000000
Concat           fire8/concat     2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat 0=0
Convolution      fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=32768
ReLU             fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1 0=0.000000
Split            splitncnn_7      1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
Convolution      fire9/expand1x1  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384
ReLU             fire9/relu_expand1x1 1 1 fire9/expand1x1 fire9/expand1x1_fire9/relu_expand1x1 0=0.000000
Convolution      fire9/expand3x3  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456
ReLU             fire9/relu_expand3x3 1 1 fire9/expand3x3 fire9/expand3x3_fire9/relu_expand3x3 0=0.000000
Concat           fire9/concat     2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat 0=0
Dropout          drop9            1 1 fire9/concat fire9/concat_drop9
Convolution      conv10           1 1 fire9/concat_drop9 conv10 0=1000 1=1 2=1 3=1 4=1 5=1 6=512000
ReLU             relu_conv10      1 1 conv10 conv10_relu_conv10 0=0.000000
Pooling          pool10           1 1 conv10_relu_conv10 pool10 0=1 1=0 2=1 3=0 4=1
Softmax          prob             1 1 pool10 prob 0=0


================================================
FILE: examples/squeezenet_v1.1.prototxt
================================================
name: "squeezenet_v1.1_deploy"

layer {
  name: "data"
  type: "Input"
  top: "data"
  input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } }
}
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  convolution_param {
    num_output: 64
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "relu_conv1"
  type: "ReLU"
  bottom: "conv1"
  top: "conv1"
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "conv1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "fire2/squeeze1x1"
  type: "Convolution"
  bottom: "pool1"
  top: "fire2/squeeze1x1"
  convolution_param {
    num_output: 16
    kernel_size: 1
  }
}
layer {
  name: "fire2/relu_squeeze1x1"
  type: "ReLU"
  bottom: "fire2/squeeze1x1"
  top: "fire2/squeeze1x1"
}
layer {
  name: "fire2/expand1x1"
  type: "Convolution"
  bottom: "fire2/squeeze1x1"
  top: "fire2/expand1x1"
  convolution_param {
    num_output: 64
    kernel_size: 1
  }
}
layer {
  name: "fire2/relu_expand1x1"
  type: "ReLU"
  bottom: "fire2/expand1x1"
  top: "fire2/expand1x1"
}
layer {
  name: "fire2/expand3x3"
  type: "Convolution"
  bottom: "fire2/squeeze1x1"
  top: "fire2/expand3x3"
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
  }
}
layer {
  name: "fire2/relu_expand3x3"
  type: "ReLU"
  bottom: "fire2/expand3x3"
  top: "fire2/expand3x3"
}
layer {
  name: "fire2/concat"
  type: "Concat"
  bottom: "fire2/expand1x1"
  bottom: "fire2/expand3x3"
  top: "fire2/concat"
}
layer {
  name: "fire3/squeeze1x1"
  type: "Convolution"
  bottom: "fire2/concat"
  top: "fire3/squeeze1x1"
  convolution_param {
    num_output: 16
    kernel_size: 1
  }
}
layer {
  name: "fire3/relu_squeeze1x1"
  type: "ReLU"
  bottom: "fire3/squeeze1x1"
  top: "fire3/squeeze1x1"
}
layer {
  name: "fire3/expand1x1"
  type: "Convolution"
  bottom: "fire3/squeeze1x1"
  top: "fire3/expand1x1"
  convolution_param {
    num_output: 64
    kernel_size: 1
  }
}
layer {
  name: "fire3/relu_expand1x1"
  type: "ReLU"
  bottom: "fire3/expand1x1"
  top: "fire3/expand1x1"
}
layer {
  name: "fire3/expand3x3"
  type: "Convolution"
  bottom: "fire3/squeeze1x1"
  top: "fire3/expand3x3"
  convolution_param {
    num_output: 64
    pad: 1
    kernel_size: 3
  }
}
layer {
  name: "fire3/relu_expand3x3"
  type: "ReLU"
  bottom: "fire3/expand3x3"
  top: "fire3/expand3x3"
}
layer {
  name: "fire3/concat"
  type: "Concat"
  bottom: "fire3/expand1x1"
  bottom: "fire3/expand3x3"
  top: "fire3/concat"
}
layer {
  name: "pool3"
  type: "Pooling"
  bottom: "fire3/concat"
  top: "pool3"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "fire4/squeeze1x1"
  type: "Convolution"
  bottom: "pool3"
  top: "fire4/squeeze1x1"
  convolution_param {
    num_output: 32
    kernel_size: 1
  }
}
layer {
  name: "fire4/relu_squeeze1x1"
  type: "ReLU"
  bottom: "fire4/squeeze1x1"
  top: "fire4/squeeze1x1"
}
layer {
  name: "fire4/expand1x1"
  type: "Convolution"
  bottom: "fire4/squeeze1x1"
  top: "fire4/expand1x1"
  convolution_param {
    num_output: 128
    kernel_size: 1
  }
}
layer {
  name: "fire4/relu_expand1x1"
  type: "ReLU"
  bottom: "fire4/expand1x1"
  top: "fire4/expand1x1"
}
layer {
  name: "fire4/expand3x3"
  type: "Convolution"
  bottom: "fire4/squeeze1x1"
  top: "fire4/expand3x3"
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
  }
}
layer {
  name: "fire4/relu_expand3x3"
  type: "ReLU"
  bottom: "fire4/expand3x3"
  top: "fire4/expand3x3"
}
layer {
  name: "fire4/concat"
  type: "Concat"
  bottom: "fire4/expand1x1"
  bottom: "fire4/expand3x3"
  top: "fire4/concat"
}
layer {
  name: "fire5/squeeze1x1"
  type: "Convolution"
  bottom: "fire4/concat"
  top: "fire5/squeeze1x1"
  convolution_param {
    num_output: 32
    kernel_size: 1
  }
}
layer {
  name: "fire5/relu_squeeze1x1"
  type: "ReLU"
  bottom: "fire5/squeeze1x1"
  top: "fire5/squeeze1x1"
}
layer {
  name: "fire5/expand1x1"
  type: "Convolution"
  bottom: "fire5/squeeze1x1"
  top: "fire5/expand1x1"
  convolution_param {
    num_output: 128
    kernel_size: 1
  }
}
layer {
  name: "fire5/relu_expand1x1"
  type: "ReLU"
  bottom: "fire5/expand1x1"
  top: "fire5/expand1x1"
}
layer {
  name: "fire5/expand3x3"
  type: "Convolution"
  bottom: "fire5/squeeze1x1"
  top: "fire5/expand3x3"
  convolution_param {
    num_output: 128
    pad: 1
    kernel_size: 3
  }
}
layer {
  name: "fire5/relu_expand3x3"
  type: "ReLU"
  bottom: "fire5/expand3x3"
  top: "fire5/expand3x3"
}
layer {
  name: "fire5/concat"
  type: "Concat"
  bottom: "fire5/expand1x1"
  bottom: "fire5/expand3x3"
  top: "fire5/concat"
}
layer {
  name: "pool5"
  type: "Pooling"
  bottom: "fire5/concat"
  top: "pool5"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "fire6/squeeze1x1"
  type: "Convolution"
  bottom: "pool5"
  top: "fire6/squeeze1x1"
  convolution_param {
    num_output: 48
    kernel_size: 1
  }
}
layer {
  name: "fire6/relu_squeeze1x1"
  type: "ReLU"
  bottom: "fire6/squeeze1x1"
  top: "fire6/squeeze1x1"
}
layer {
  name: "fire6/expand1x1"
  type: "Convolution"
  bottom: "fire6/squeeze1x1"
  top: "fire6/expand1x1"
  convolution_param {
    num_output: 192
    kernel_size: 1
  }
}
layer {
  name: "fire6/relu_expand1x1"
  type: "ReLU"
  bottom: "fire6/expand1x1"
  top: "fire6/expand1x1"
}
layer {
  name: "fire6/expand3x3"
  type: "Convolution"
  bottom: "fire6/squeeze1x1"
  top: "fire6/expand3x3"
  convolution_param {
    num_output: 192
    pad: 1
    kernel_size: 3
  }
}
layer {
  name: "fire6/relu_expand3x3"
  type: "ReLU"
  bottom: "fire6/expand3x3"
  top: "fire6/expand3x3"
}
layer {
  name: "fire6/concat"
  type: "Concat"
  bottom: "fire6/expand1x1"
  bottom: "fire6/expand3x3"
  top: "fire6/concat"
}
layer {
  name: "fire7/squeeze1x1"
  type: "Convolution"
  bottom: "fire6/concat"
  top: "fire7/squeeze1x1"
  convolution_param {
    num_output: 48
    kernel_size: 1
  }
}
layer {
  name: "fire7/relu_squeeze1x1"
  type: "ReLU"
  bottom: "fire7/squeeze1x1"
  top: "fire7/squeeze1x1"
}
layer {
  name: "fire7/expand1x1"
  type: "Convolution"
  bottom: "fire7/squeeze1x1"
  top: "fire7/expand1x1"
  convolution_param {
    num_output: 192
    kernel_size: 1
  }
}
layer {
  name: "fire7/relu_expand1x1"
  type: "ReLU"
  bottom: "fire7/expand1x1"
  top: "fire7/expand1x1"
}
layer {
  name: "fire7/expand3x3"
  type: "Convolution"
  bottom: "fire7/squeeze1x1"
  top: "fire7/expand3x3"
  convolution_param {
    num_output: 192
    pad: 1
    kernel_size: 3
  }
}
layer {
  name: "fire7/relu_expand3x3"
  type: "ReLU"
  bottom: "fire7/expand3x3"
  top: "fire7/expand3x3"
}
layer {
  name: "fire7/concat"
  type: "Concat"
  bottom: "fire7/expand1x1"
  bottom: "fire7/expand3x3"
  top: "fire7/concat"
}
layer {
  name: "fire8/squeeze1x1"
  type: "Convolution"
  bottom: "fire7/concat"
  top: "fire8/squeeze1x1"
  convolution_param {
    num_output: 64
    kernel_size: 1
  }
}
layer {
  name: "fire8/relu_squeeze1x1"
  type: "ReLU"
  bottom: "fire8/squeeze1x1"
  top: "fire8/squeeze1x1"
}
layer {
  name: "fire8/expand1x1"
  type: "Convolution"
  bottom: "fire8/squeeze1x1"
  top: "fire8/expand1x1"
  convolution_param {
    num_output: 256
    kernel_size: 1
  }
}
layer {
  name: "fire8/relu_expand1x1"
  type: "ReLU"
  bottom: "fire8/expand1x1"
  top: "fire8/expand1x1"
}
layer {
  name: "fire8/expand3x3"
  type: "Convolution"
  bottom: "fire8/squeeze1x1"
  top: "fire8/expand3x3"
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
  }
}
layer {
  name: "fire8/relu_expand3x3"
  type: "ReLU"
  bottom: "fire8/expand3x3"
  top: "fire8/expand3x3"
}
layer {
  name: "fire8/concat"
  type: "Concat"
  bottom: "fire8/expand1x1"
  bottom: "fire8/expand3x3"
  top: "fire8/concat"
}
layer {
  name: "fire9/squeeze1x1"
  type: "Convolution"
  bottom: "fire8/concat"
  top: "fire9/squeeze1x1"
  convolution_param {
    num_output: 64
    kernel_size: 1
  }
}
layer {
  name: "fire9/relu_squeeze1x1"
  type: "ReLU"
  bottom: "fire9/squeeze1x1"
  top: "fire9/squeeze1x1"
}
layer {
  name: "fire9/expand1x1"
  type: "Convolution"
  bottom: "fire9/squeeze1x1"
  top: "fire9/expand1x1"
  convolution_param {
    num_output: 256
    kernel_size: 1
  }
}
layer {
  name: "fire9/relu_expand1x1"
  type: "ReLU"
  bottom: "fire9/expand1x1"
  top: "fire9/expand1x1"
}
layer {
  name: "fire9/expand3x3"
  type: "Convolution"
  bottom: "fire9/squeeze1x1"
  top: "fire9/expand3x3"
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
  }
}
layer {
  name: "fire9/relu_expand3x3"
  type: "ReLU"
  bottom: "fire9/expand3x3"
  top: "fire9/expand3x3"
}
layer {
  name: "fire9/concat"
  type: "Concat"
  bottom: "fire9/expand1x1"
  bottom: "fire9/expand3x3"
  top: "fire9/concat"
}
layer {
  name: "drop9"
  type: "Dropout"
  bottom: "fire9/concat"
  top: "fire9/concat"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layer {
  name: "conv10"
  type: "Convolution"
  bottom: "fire9/concat"
  top: "conv10"
  convolution_param {
    num_output: 1000
    pad: 1
    kernel_size: 1
  }
}
layer {
  name: "relu_conv10"
  type: "ReLU"
  bottom: "conv10"
  top: "conv10"
}
layer {
  name: "pool10"
  type: "Pooling"
  bottom: "conv10"
  top: "pool10"
  pooling_param {
    pool: AVE
    global_pooling: true
  }
}
layer {
  name: "prob"
  type: "Softmax"
  bottom: "pool10"
  top: "prob"
}


================================================
FILE: examples/squeezenetssd.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdio.h>
#include <vector>

// One detection row decoded by detect_squeezenet.
struct Object
{
    // bounding box; detect_squeezenet scales the network's box values by the
    // source image width/height before storing them here
    cv::Rect_<float> rect;
    // class index; draw_objects uses it to index its class_names table
    int label;
    // detection score (draw_objects prints it multiplied by 100 as a percentage)
    float prob;
};

// Run SqueezeNet-SSD on `bgr` and fill `objects` with one entry per row of
// the "detection_out" blob. Each row is read as
// [label, prob, x0, y0, x1, y1]; the box values are multiplied by the source
// image width/height.
// Calls exit(-1) if either model file fails to load. Returns 0.
static int detect_squeezenet(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net squeezenet;

    squeezenet.opt.use_vulkan_compute = true;

    // original pretrained model from https://github.com/chuanqi305/SqueezeNet-SSD
    // squeezenet_ssd_voc_deploy.prototxt
    // https://drive.google.com/open?id=0B3gersZ2cHIxdGpyZlZnbEQ5Snc
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    // (|| short-circuits, so the weights are only read after the param file loaded)
    if (squeezenet.load_param("squeezenet_ssd_voc.param") || squeezenet.load_model("squeezenet_ssd_voc.bin"))
        exit(-1);

    const int target_size = 300;

    const int img_w = bgr.cols;
    const int img_h = bgr.rows;

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, target_size, target_size);

    // per-channel mean subtraction only; the null pointer skips normalization
    const float mean_vals[3] = {104.f, 117.f, 123.f};
    in.substract_mean_normalize(mean_vals, 0);

    ncnn::Extractor ex = squeezenet.create_extractor();
    ex.input("data", in);

    ncnn::Mat detections;
    ex.extract("detection_out", detections);

    //     printf("%d %d %d\n", detections.w, detections.h, detections.c);
    objects.clear();
    for (int row = 0; row < detections.h; row++)
    {
        const float* det = detections.row(row);

        Object found;
        found.label = det[0];
        found.prob = det[1];

        // corners -> origin + extent, in source-image pixels
        found.rect.x = det[2] * img_w;
        found.rect.y = det[3] * img_h;
        found.rect.width = det[4] * img_w - found.rect.x;
        found.rect.height = det[5] * img_h - found.rect.y;

        objects.push_back(found);
    }

    return 0;
}

// Draw every detection in `objects` on a copy of `bgr` (box plus a
// "<class> <prob>%" caption), print each detection to stderr, then pass the
// annotated copy to cv::imshow followed by cv::waitKey(0).
//
// bgr:     source image; it is cloned, never modified.
// objects: detections as produced by detect_squeezenet. A label outside the
//          class_names table is captioned "unknown" instead of indexing the
//          table out of bounds.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {"background",
                                        "aeroplane", "bicycle", "bird", "boat",
                                        "bottle", "bus", "car", "cat", "chair",
                                        "cow", "diningtable", "dog", "horse",
                                        "motorbike", "person", "pottedplant",
                                        "sheep", "sofa", "train", "tvmonitor"
                                       };
    static const int num_classes = (int)(sizeof(class_names) / sizeof(class_names[0]));

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));

        // the label comes straight from network output; validate it before
        // using it as a table index
        const bool label_ok = obj.label >= 0 && obj.label < num_classes;
        const char* class_name = label_ok ? class_names[obj.label] : "unknown";

        char text[256];
        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // place the caption above the box, pushed back inside the image at
        // the top and right edges
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        // filled white background behind the caption text
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_squeezenet(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/synset_words.txt
================================================
n01440764 tench, Tinca tinca
n01443537 goldfish, Carassius auratus
n01484850 great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias
n01491361 tiger shark, Galeocerdo cuvieri
n01494475 hammerhead, hammerhead shark
n01496331 electric ray, crampfish, numbfish, torpedo
n01498041 stingray
n01514668 cock
n01514859 hen
n01518878 ostrich, Struthio camelus
n01530575 brambling, Fringilla montifringilla
n01531178 goldfinch, Carduelis carduelis
n01532829 house finch, linnet, Carpodacus mexicans
n01534433 junco, snowbird
n01537544 indigo bunting, indigo finch, indigo bird, Passerina cyanea
n01558993 robin, American robin, Turdus migratorius
n01560419 bulbul
n01580077 jay
n01582220 magpie
n01592084 chickadee
n01601694 water ouzel, dipper
n01608432 kite
n01614925 bald eagle, American eagle, Haliaeetus leucocephalus
n01616318 vulture
n01622779 great grey owl, great gray owl, Strix nebulosa
n01629819 European fire salamander, Salamandra salamandra
n01630670 common newt, Triturus vulgaris
n01631663 eft
n01632458 spotted salamander, Ambystoma maculatum
n01632777 axolotl, mud puppy, Ambystoma mexicanum
n01641577 bullfrog, Rana catesbeiana
n01644373 tree frog, tree-frog
n01644900 tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui
n01664065 loggerhead, loggerhead turtle, Caretta caretta
n01665541 leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea
n01667114 mud turtle
n01667778 terrapin
n01669191 box turtle, box tortoise
n01675722 banded gecko
n01677366 common iguana, iguana, Iguana iguana
n01682714 American chameleon, anole, Anolis carolinensis
n01685808 whiptail, whiptail lizard
n01687978 agama
n01688243 frilled lizard, Chlamydosaurus kingi
n01689811 alligator lizard
n01692333 Gila monster, Heloderma suspectum
n01693334 green lizard, Lacerta viridis
n01694178 African chameleon, Chamaeleo chamaeleon
n01695060 Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis
n01697457 African crocodile, Nile crocodile, Crocodylus niloticus
n01698640 American alligator, Alligator mississipiensis
n01704323 triceratops
n01728572 thunder snake, worm snake, Carphophis amoenus
n01728920 ringneck snake, ring-necked snake, ring snake
n01729322 hognose snake, puff adder, sand viper
n01729977 green snake, grass snake
n01734418 king snake, kingsnake
n01735189 garter snake, grass snake
n01737021 water snake
n01739381 vine snake
n01740131 night snake, Hypsiglena torquata
n01742172 boa constrictor, Constrictor constrictor
n01744401 rock python, rock snake, Python sebae
n01748264 Indian cobra, Naja naja
n01749939 green mamba
n01751748 sea snake
n01753488 horned viper, cerastes, sand viper, horned asp, Cerastes cornutus
n01755581 diamondback, diamondback rattlesnake, Crotalus adamanteus
n01756291 sidewinder, horned rattlesnake, Crotalus cerastes
n01768244 trilobite
n01770081 harvestman, daddy longlegs, Phalangium opilio
n01770393 scorpion
n01773157 black and gold garden spider, Argiope aurantia
n01773549 barn spider, Araneus cavaticus
n01773797 garden spider, Aranea diademata
n01774384 black widow, Latrodectus mactans
n01774750 tarantula
n01775062 wolf spider, hunting spider
n01776313 tick
n01784675 centipede
n01795545 black grouse
n01796340 ptarmigan
n01797886 ruffed grouse, partridge, Bonasa umbellus
n01798484 prairie chicken, prairie grouse, prairie fowl
n01806143 peacock
n01806567 quail
n01807496 partridge
n01817953 African grey, African gray, Psittacus erithacus
n01818515 macaw
n01819313 sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita
n01820546 lorikeet
n01824575 coucal
n01828970 bee eater
n01829413 hornbill
n01833805 hummingbird
n01843065 jacamar
n01843383 toucan
n01847000 drake
n01855032 red-breasted merganser, Mergus serrator
n01855672 goose
n01860187 black swan, Cygnus atratus
n01871265 tusker
n01872401 echidna, spiny anteater, anteater
n01873310 platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus
n01877812 wallaby, brush kangaroo
n01882714 koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus
n01883070 wombat
n01910747 jellyfish
n01914609 sea anemone, anemone
n01917289 brain coral
n01924916 flatworm, platyhelminth
n01930112 nematode, nematode worm, roundworm
n01943899 conch
n01944390 snail
n01945685 slug
n01950731 sea slug, nudibranch
n01955084 chiton, coat-of-mail shell, sea cradle, polyplacophore
n01968897 chambered nautilus, pearly nautilus, nautilus
n01978287 Dungeness crab, Cancer magister
n01978455 rock crab, Cancer irroratus
n01980166 fiddler crab
n01981276 king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica
n01983481 American lobster, Northern lobster, Maine lobster, Homarus americans
n01984695 spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish
n01985128 crayfish, crawfish, crawdad, crawdaddy
n01986214 hermit crab
n01990800 isopod
n02002556 white stork, Ciconia ciconia
n02002724 black stork, Ciconia nigra
n02006656 spoonbill
n02007558 flamingo
n02009229 little blue heron, Egretta caerulea
n02009912 American egret, great white heron, Egretta albus
n02011460 bittern
n02012849 crane
n02013706 limpkin, Aramus pictus
n02017213 European gallinule, Porphyrio porphyrio
n02018207 American coot, marsh hen, mud hen, water hen, Fulica americana
n02018795 bustard
n02025239 ruddy turnstone, Arenaria interpres
n02027492 red-backed sandpiper, dunlin, Erolia alpina
n02028035 redshank, Tringa totanus
n02033041 dowitcher
n02037110 oystercatcher, oyster catcher
n02051845 pelican
n02056570 king penguin, Aptenodytes patagonica
n02058221 albatross, mollymawk
n02066245 grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus
n02071294 killer whale, killer, orca, grampus, sea wolf, Orcinus orca
n02074367 dugong, Dugong dugon
n02077923 sea lion
n02085620 Chihuahua
n02085782 Japanese spaniel
n02085936 Maltese dog, Maltese terrier, Maltese
n02086079 Pekinese, Pekingese, Peke
n02086240 Shih-Tzu
n02086646 Blenheim spaniel
n02086910 papillon
n02087046 toy terrier
n02087394 Rhodesian ridgeback
n02088094 Afghan hound, Afghan
n02088238 basset, basset hound
n02088364 beagle
n02088466 bloodhound, sleuthhound
n02088632 bluetick
n02089078 black-and-tan coonhound
n02089867 Walker hound, Walker foxhound
n02089973 English foxhound
n02090379 redbone
n02090622 borzoi, Russian wolfhound
n02090721 Irish wolfhound
n02091032 Italian greyhound
n02091134 whippet
n02091244 Ibizan hound, Ibizan Podenco
n02091467 Norwegian elkhound, elkhound
n02091635 otterhound, otter hound
n02091831 Saluki, gazelle hound
n02092002 Scottish deerhound, deerhound
n02092339 Weimaraner
n02093256 Staffordshire bullterrier, Staffordshire bull terrier
n02093428 American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier
n02093647 Bedlington terrier
n02093754 Border terrier
n02093859 Kerry blue terrier
n02093991 Irish terrier
n02094114 Norfolk terrier
n02094258 Norwich terrier
n02094433 Yorkshire terrier
n02095314 wire-haired fox terrier
n02095570 Lakeland terrier
n02095889 Sealyham terrier, Sealyham
n02096051 Airedale, Airedale terrier
n02096177 cairn, cairn terrier
n02096294 Australian terrier
n02096437 Dandie Dinmont, Dandie Dinmont terrier
n02096585 Boston bull, Boston terrier
n02097047 miniature schnauzer
n02097130 giant schnauzer
n02097209 standard schnauzer
n02097298 Scotch terrier, Scottish terrier, Scottie
n02097474 Tibetan terrier, chrysanthemum dog
n02097658 silky terrier, Sydney silky
n02098105 soft-coated wheaten terrier
n02098286 West Highland white terrier
n02098413 Lhasa, Lhasa apso
n02099267 flat-coated retriever
n02099429 curly-coated retriever
n02099601 golden retriever
n02099712 Labrador retriever
n02099849 Chesapeake Bay retriever
n02100236 German short-haired pointer
n02100583 vizsla, Hungarian pointer
n02100735 English setter
n02100877 Irish setter, red setter
n02101006 Gordon setter
n02101388 Brittany spaniel
n02101556 clumber, clumber spaniel
n02102040 English springer, English springer spaniel
n02102177 Welsh springer spaniel
n02102318 cocker spaniel, English cocker spaniel, cocker
n02102480 Sussex spaniel
n02102973 Irish water spaniel
n02104029 kuvasz
n02104365 schipperke
n02105056 groenendael
n02105162 malinois
n02105251 briard
n02105412 kelpie
n02105505 komondor
n02105641 Old English sheepdog, bobtail
n02105855 Shetland sheepdog, Shetland sheep dog, Shetland
n02106030 collie
n02106166 Border collie
n02106382 Bouvier des Flandres, Bouviers des Flandres
n02106550 Rottweiler
n02106662 German shepherd, German shepherd dog, German police dog, alsatian
n02107142 Doberman, Doberman pinscher
n02107312 miniature pinscher
n02107574 Greater Swiss Mountain dog
n02107683 Bernese mountain dog
n02107908 Appenzeller
n02108000 EntleBucher
n02108089 boxer
n02108422 bull mastiff
n02108551 Tibetan mastiff
n02108915 French bulldog
n02109047 Great Dane
n02109525 Saint Bernard, St Bernard
n02109961 Eskimo dog, husky
n02110063 malamute, malemute, Alaskan malamute
n02110185 Siberian husky
n02110341 dalmatian, coach dog, carriage dog
n02110627 affenpinscher, monkey pinscher, monkey dog
n02110806 basenji
n02110958 pug, pug-dog
n02111129 Leonberg
n02111277 Newfoundland, Newfoundland dog
n02111500 Great Pyrenees
n02111889 Samoyed, Samoyede
n02112018 Pomeranian
n02112137 chow, chow chow
n02112350 keeshond
n02112706 Brabancon griffon
n02113023 Pembroke, Pembroke Welsh corgi
n02113186 Cardigan, Cardigan Welsh corgi
n02113624 toy poodle
n02113712 miniature poodle
n02113799 standard poodle
n02113978 Mexican hairless
n02114367 timber wolf, grey wolf, gray wolf, Canis lupus
n02114548 white wolf, Arctic wolf, Canis lupus tundrarum
n02114712 red wolf, maned wolf, Canis rufus, Canis niger
n02114855 coyote, prairie wolf, brush wolf, Canis latrans
n02115641 dingo, warrigal, warragal, Canis dingo
n02115913 dhole, Cuon alpinus
n02116738 African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus
n02117135 hyena, hyaena
n02119022 red fox, Vulpes vulpes
n02119789 kit fox, Vulpes macrotis
n02120079 Arctic fox, white fox, Alopex lagopus
n02120505 grey fox, gray fox, Urocyon cinereoargenteus
n02123045 tabby, tabby cat
n02123159 tiger cat
n02123394 Persian cat
n02123597 Siamese cat, Siamese
n02124075 Egyptian cat
n02125311 cougar, puma, catamount, mountain lion, painter, panther, Felis concolor
n02127052 lynx, catamount
n02128385 leopard, Panthera pardus
n02128757 snow leopard, ounce, Panthera uncia
n02128925 jaguar, panther, Panthera onca, Felis onca
n02129165 lion, king of beasts, Panthera leo
n02129604 tiger, Panthera tigris
n02130308 cheetah, chetah, Acinonyx jubatus
n02132136 brown bear, bruin, Ursus arctos
n02133161 American black bear, black bear, Ursus americans, Euarctos americans
n02134084 ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus
n02134418 sloth bear, Melursus ursinus, Ursus ursinus
n02137549 mongoose
n02138441 meerkat, mierkat
n02165105 tiger beetle
n02165456 ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle
n02167151 ground beetle, carabid beetle
n02168699 long-horned beetle, longicorn, longicorn beetle
n02169497 leaf beetle, chrysomelid
n02172182 dung beetle
n02174001 rhinoceros beetle
n02177972 weevil
n02190166 fly
n02206856 bee
n02219486 ant, emmet, pismire
n02226429 grasshopper, hopper
n02229544 cricket
n02231487 walking stick, walkingstick, stick insect
n02233338 cockroach, roach
n02236044 mantis, mantid
n02256656 cicada, cicala
n02259212 leafhopper
n02264363 lacewing, lacewing fly
n02268443 dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk
n02268853 damselfly
n02276258 admiral
n02277742 ringlet, ringlet butterfly
n02279972 monarch, monarch butterfly, milkweed butterfly, Danaus plexippus
n02280649 cabbage butterfly
n02281406 sulphur butterfly, sulfur butterfly
n02281787 lycaenid, lycaenid butterfly
n02317335 starfish, sea star
n02319095 sea urchin
n02321529 sea cucumber, holothurian
n02325366 wood rabbit, cottontail, cottontail rabbit
n02326432 hare
n02328150 Angora, Angora rabbit
n02342885 hamster
n02346627 porcupine, hedgehog
n02356798 fox squirrel, eastern fox squirrel, Sciurus niger
n02361337 marmot
n02363005 beaver
n02364673 guinea pig, Cavia cobaya
n02389026 sorrel
n02391049 zebra
n02395406 hog, pig, grunter, squealer, Sus scrofa
n02396427 wild boar, boar, Sus scrofa
n02397096 warthog
n02398521 hippopotamus, hippo, river horse, Hippopotamus amphibius
n02403003 ox
n02408429 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis
n02410509 bison
n02412080 ram, tup
n02415577 bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis
n02417914 ibex, Capra ibex
n02422106 hartebeest
n02422699 impala, Aepyceros melampus
n02423022 gazelle
n02437312 Arabian camel, dromedary, Camelus dromedarius
n02437616 llama
n02441942 weasel
n02442845 mink
n02443114 polecat, fitch, foulmart, foumart, Mustela putorius
n02443484 black-footed ferret, ferret, Mustela nigripes
n02444819 otter
n02445715 skunk, polecat, wood pussy
n02447366 badger
n02454379 armadillo
n02457408 three-toed sloth, ai, Bradypus tridactylus
n02480495 orangutan, orang, orangutang, Pongo pygmaeus
n02480855 gorilla, Gorilla gorilla
n02481823 chimpanzee, chimp, Pan troglodytes
n02483362 gibbon, Hylobates lar
n02483708 siamang, Hylobates syndactylus, Symphalangus syndactylus
n02484975 guenon, guenon monkey
n02486261 patas, hussar monkey, Erythrocebus patas
n02486410 baboon
n02487347 macaque
n02488291 langur
n02488702 colobus, colobus monkey
n02489166 proboscis monkey, Nasalis larvatus
n02490219 marmoset
n02492035 capuchin, ringtail, Cebus capucinus
n02492660 howler monkey, howler
n02493509 titi, titi monkey
n02493793 spider monkey, Ateles geoffroyi
n02494079 squirrel monkey, Saimiri sciureus
n02497673 Madagascar cat, ring-tailed lemur, Lemur catta
n02500267 indri, indris, Indri indri, Indri brevicaudatus
n02504013 Indian elephant, Elephas maximus
n02504458 African elephant, Loxodonta africana
n02509815 lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens
n02510455 giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca
n02514041 barracouta, snoek
n02526121 eel
n02536864 coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch
n02606052 rock beauty, Holocanthus tricolor
n02607072 anemone fish
n02640242 sturgeon
n02641379 gar, garfish, garpike, billfish, Lepisosteus osseus
n02643566 lionfish
n02655020 puffer, pufferfish, blowfish, globefish
n02666196 abacus
n02667093 abaya
n02669723 academic gown, academic robe, judge's robe
n02672831 accordion, piano accordion, squeeze box
n02676566 acoustic guitar
n02687172 aircraft carrier, carrier, flattop, attack aircraft carrier
n02690373 airliner
n02692877 airship, dirigible
n02699494 altar
n02701002 ambulance
n02704792 amphibian, amphibious vehicle
n02708093 analog clock
n02727426 apiary, bee house
n02730930 apron
n02747177 ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin
n02749479 assault rifle, assault gun
n02769748 backpack, back pack, knapsack, packsack, rucksack, haversack
n02776631 bakery, bakeshop, bakehouse
n02777292 balance beam, beam
n02782093 balloon
n02783161 ballpoint, ballpoint pen, ballpen, Biro
n02786058 Band Aid
n02787622 banjo
n02788148 bannister, banister, balustrade, balusters, handrail
n02790996 barbell
n02791124 barber chair
n02791270 barbershop
n02793495 barn
n02794156 barometer
n02795169 barrel, cask
n02797295 barrow, garden cart, lawn cart, wheelbarrow
n02799071 baseball
n02802426 basketball
n02804414 bassinet
n02804610 bassoon
n02807133 bathing cap, swimming cap
n02808304 bath towel
n02808440 bathtub, bathing tub, bath, tub
n02814533 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon
n02814860 beacon, lighthouse, beacon light, pharos
n02815834 beaker
n02817516 bearskin, busby, shako
n02823428 beer bottle
n02823750 beer glass
n02825657 bell cote, bell cot
n02834397 bib
n02835271 bicycle-built-for-two, tandem bicycle, tandem
n02837789 bikini, two-piece
n02840245 binder, ring-binder
n02841315 binoculars, field glasses, opera glasses
n02843684 birdhouse
n02859443 boathouse
n02860847 bobsled, bobsleigh, bob
n02865351 bolo tie, bolo, bola tie, bola
n02869837 bonnet, poke bonnet
n02870880 bookcase
n02871525 bookshop, bookstore, bookstall
n02877765 bottlecap
n02879718 bow
n02883205 bow tie, bow-tie, bowtie
n02892201 brass, memorial tablet, plaque
n02892767 brassiere, bra, bandeau
n02894605 breakwater, groin, groyne, mole, bulwark, seawall, jetty
n02895154 breastplate, aegis, egis
n02906734 broom
n02909870 bucket, pail
n02910353 buckle
n02916936 bulletproof vest
n02917067 bullet train, bullet
n02927161 butcher shop, meat market
n02930766 cab, hack, taxi, taxicab
n02939185 caldron, cauldron
n02948072 candle, taper, wax light
n02950826 cannon
n02951358 canoe
n02951585 can opener, tin opener
n02963159 cardigan
n02965783 car mirror
n02966193 carousel, carrousel, merry-go-round, roundabout, whirligig
n02966687 carpenter's kit, tool kit
n02971356 carton
n02974003 car wheel
n02977058 cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM
n02978881 cassette
n02979186 cassette player
n02980441 castle
n02981792 catamaran
n02988304 CD player
n02992211 cello, violoncello
n02992529 cellular telephone, cellular phone, cellphone, cell, mobile phone
n02999410 chain
n03000134 chainlink fence
n03000247 chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour
n03000684 chain saw, chainsaw
n03014705 chest
n03016953 chiffonier, commode
n03017168 chime, bell, gong
n03018349 china cabinet, china closet
n03026506 Christmas stocking
n03028079 church, church building
n03032252 cinema, movie theater, movie theatre, movie house, picture palace
n03041632 cleaver, meat cleaver, chopper
n03042490 cliff dwelling
n03045698 cloak
n03047690 clog, geta, patten, sabot
n03062245 cocktail shaker
n03063599 coffee mug
n03063689 coffeepot
n03065424 coil, spiral, volute, whorl, helix
n03075370 combination lock
n03085013 computer keyboard, keypad
n03089624 confectionery, confectionary, candy store
n03095699 container ship, containership, container vessel
n03100240 convertible
n03109150 corkscrew, bottle screw
n03110669 cornet, horn, trumpet, trump
n03124043 cowboy boot
n03124170 cowboy hat, ten-gallon hat
n03125729 cradle
n03126707 crane
n03127747 crash helmet
n03127925 crate
n03131574 crib, cot
n03133878 Crock Pot
n03134739 croquet ball
n03141823 crutch
n03146219 cuirass
n03160309 dam, dike, dyke
n03179701 desk
n03180011 desktop computer
n03187595 dial telephone, dial phone
n03188531 diaper, nappy, napkin
n03196217 digital clock
n03197337 digital watch
n03201208 dining table, board
n03207743 dishrag, dishcloth
n03207941 dishwasher, dish washer, dishwashing machine
n03208938 disk brake, disc brake
n03216828 dock, dockage, docking facility
n03218198 dogsled, dog sled, dog sleigh
n03220513 dome
n03223299 doormat, welcome mat
n03240683 drilling platform, offshore rig
n03249569 drum, membranophone, tympan
n03250847 drumstick
n03255030 dumbbell
n03259280 Dutch oven
n03271574 electric fan, blower
n03272010 electric guitar
n03272562 electric locomotive
n03290653 entertainment center
n03291819 envelope
n03297495 espresso maker
n03314780 face powder
n03325584 feather boa, boa
n03337140 file, file cabinet, filing cabinet
n03344393 fireboat
n03345487 fire engine, fire truck
n03347037 fire screen, fireguard
n03355925 flagpole, flagstaff
n03372029 flute, transverse flute
n03376595 folding chair
n03379051 football helmet
n03384352 forklift
n03388043 fountain
n03388183 fountain pen
n03388549 four-poster
n03393912 freight car
n03394916 French horn, horn
n03400231 frying pan, frypan, skillet
n03404251 fur coat
n03417042 garbage truck, dustcart
n03424325 gasmask, respirator, gas helmet
n03425413 gas pump, gasoline pump, petrol pump, island dispenser
n03443371 goblet
n03444034 go-kart
n03445777 golf ball
n03445924 golfcart, golf cart
n03447447 gondola
n03447721 gong, tam-tam
n03450230 gown
n03452741 grand piano, grand
n03457902 greenhouse, nursery, glasshouse
n03459775 grille, radiator grille
n03461385 grocery store, grocery, food market, market
n03467068 guillotine
n03476684 hair slide
n03476991 hair spray
n03478589 half track
n03481172 hammer
n03482405 hamper
n03483316 hand blower, blow dryer, blow drier, hair dryer, hair drier
n03485407 hand-held computer, hand-held microcomputer
n03485794 handkerchief, hankie, hanky, hankey
n03492542 hard disc, hard disk, fixed disk
n03494278 harmonica, mouth organ, harp, mouth harp
n03495258 harp
n03496892 harvester, reaper
n03498962 hatchet
n03527444 holster
n03529860 home theater, home theatre
n03530642 honeycomb
n03532672 hook, claw
n03534580 hoopskirt, crinoline
n03535780 horizontal bar, high bar
n03538406 horse cart, horse-cart
n03544143 hourglass
n03584254 iPod
n03584829 iron, smoothing iron
n03590841 jack-o'-lantern
n03594734 jean, blue jean, denim
n03594945 jeep, landrover
n03595614 jersey, T-shirt, tee shirt
n03598930 jigsaw puzzle
n03599486 jinrikisha, ricksha, rickshaw
n03602883 joystick
n03617480 kimono
n03623198 knee pad
n03627232 knot
n03630383 lab coat, laboratory coat
n03633091 ladle
n03637318 lampshade, lamp shade
n03642806 laptop, laptop computer
n03649909 lawn mower, mower
n03657121 lens cap, lens cover
n03658185 letter opener, paper knife, paperknife
n03661043 library
n03662601 lifeboat
n03666591 lighter, light, igniter, ignitor
n03670208 limousine, limo
n03673027 liner, ocean liner
n03676483 lipstick, lip rouge
n03680355 Loafer
n03690938 lotion
n03691459 loudspeaker, speaker, speaker unit, loudspeaker system, speaker system
n03692522 loupe, jeweler's loupe
n03697007 lumbermill, sawmill
n03706229 magnetic compass
n03709823 mailbag, postbag
n03710193 mailbox, letter box
n03710637 maillot
n03710721 maillot, tank suit
n03717622 manhole cover
n03720891 maraca
n03721384 marimba, xylophone
n03724870 mask
n03729826 matchstick
n03733131 maypole
n03733281 maze, labyrinth
n03733805 measuring cup
n03742115 medicine chest, medicine cabinet
n03743016 megalith, megalithic structure
n03759954 microphone, mike
n03761084 microwave, microwave oven
n03763968 military uniform
n03764736 milk can
n03769881 minibus
n03770439 miniskirt, mini
n03770679 minivan
n03773504 missile
n03775071 mitten
n03775546 mixing bowl
n03776460 mobile home, manufactured home
n03777568 Model T
n03777754 modem
n03781244 monastery
n03782006 monitor
n03785016 moped
n03786901 mortar
n03787032 mortarboard
n03788195 mosque
n03788365 mosquito net
n03791053 motor scooter, scooter
n03792782 mountain bike, all-terrain bike, off-roader
n03792972 mountain tent
n03793489 mouse, computer mouse
n03794056 mousetrap
n03796401 moving van
n03803284 muzzle
n03804744 nail
n03814639 neck brace
n03814906 necklace
n03825788 nipple
n03832673 notebook, notebook computer
n03837869 obelisk
n03838899 oboe, hautboy, hautbois
n03840681 ocarina, sweet potato
n03841143 odometer, hodometer, mileometer, milometer
n03843555 oil filter
n03854065 organ, pipe organ
n03857828 oscilloscope, scope, cathode-ray oscilloscope, CRO
n03866082 overskirt
n03868242 oxcart
n03868863 oxygen mask
n03871628 packet
n03873416 paddle, boat paddle
n03874293 paddlewheel, paddle wheel
n03874599 padlock
n03876231 paintbrush
n03877472 pajama, pyjama, pj's, jammies
n03877845 palace
n03884397 panpipe, pandean pipe, syrinx
n03887697 paper towel
n03888257 parachute, chute
n03888605 parallel bars, bars
n03891251 park bench
n03891332 parking meter
n03895866 passenger car, coach, carriage
n03899768 patio, terrace
n03902125 pay-phone, pay-station
n03903868 pedestal, plinth, footstall
n03908618 pencil box, pencil case
n03908714 pencil sharpener
n03916031 perfume, essence
n03920288 Petri dish
n03924679 photocopier
n03929660 pick, plectrum, plectron
n03929855 pickelhaube
n03930313 picket fence, paling
n03930630 pickup, pickup truck
n03933933 pier
n03935335 piggy bank, penny bank
n03937543 pill bottle
n03938244 pillow
n03942813 ping-pong ball
n03944341 pinwheel
n03947888 pirate, pirate ship
n03950228 pitcher, ewer
n03954731 plane, carpenter's plane, woodworking plane
n03956157 planetarium
n03958227 plastic bag
n03961711 plate rack
n03967562 plow, plough
n03970156 plunger, plumber's helper
n03976467 Polaroid camera, Polaroid Land camera
n03976657 pole
n03977966 police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria
n03980874 poncho
n03982430 pool table, billiard table, snooker table
n03983396 pop bottle, soda bottle
n03991062 pot, flowerpot
n03992509 potter's wheel
n03995372 power drill
n03998194 prayer rug, prayer mat
n04004767 printer
n04005630 prison, prison house
n04008634 projectile, missile
n04009552 projector
n04019541 puck, hockey puck
n04023962 punching bag, punch bag, punching ball, punchball
n04026417 purse
n04033901 quill, quill pen
n04033995 quilt, comforter, comfort, puff
n04037443 racer, race car, racing car
n04039381 racket, racquet
n04040759 radiator
n04041544 radio, wireless
n04044716 radio telescope, radio reflector
n04049303 rain barrel
n04065272 recreational vehicle, RV, R.V.
n04067472 reel
n04069434 reflex camera
n04070727 refrigerator, icebox
n04074963 remote control, remote
n04081281 restaurant, eating house, eating place, eatery
n04086273 revolver, six-gun, six-shooter
n04090263 rifle
n04099969 rocking chair, rocker
n04111531 rotisserie
n04116512 rubber eraser, rubber, pencil eraser
n04118538 rugby ball
n04118776 rule, ruler
n04120489 running shoe
n04125021 safe
n04127249 safety pin
n04131690 saltshaker, salt shaker
n04133789 sandal
n04136333 sarong
n04141076 sax, saxophone
n04141327 scabbard
n04141975 scale, weighing machine
n04146614 school bus
n04147183 schooner
n04149813 scoreboard
n04152593 screen, CRT screen
n04153751 screw
n04154565 screwdriver
n04162706 seat belt, seatbelt
n04179913 sewing machine
n04192698 shield, buckler
n04200800 shoe shop, shoe-shop, shoe store
n04201297 shoji
n04204238 shopping basket
n04204347 shopping cart
n04208210 shovel
n04209133 shower cap
n04209239 shower curtain
n04228054 ski
n04229816 ski mask
n04235860 sleeping bag
n04238763 slide rule, slipstick
n04239074 sliding door
n04243546 slot, one-armed bandit
n04251144 snorkel
n04252077 snowmobile
n04252225 snowplow, snowplough
n04254120 soap dispenser
n04254680 soccer ball
n04254777 sock
n04258138 solar dish, solar collector, solar furnace
n04259630 sombrero
n04263257 soup bowl
n04264628 space bar
n04265275 space heater
n04266014 space shuttle
n04270147 spatula
n04273569 speedboat
n04275548 spider web, spider's web
n04277352 spindle
n04285008 sports car, sport car
n04286575 spotlight, spot
n04296562 stage
n04310018 steam locomotive
n04311004 steel arch bridge
n04311174 steel drum
n04317175 stethoscope
n04325704 stole
n04326547 stone wall
n04328186 stopwatch, stop watch
n04330267 stove
n04332243 strainer
n04335435 streetcar, tram, tramcar, trolley, trolley car
n04336792 stretcher
n04344873 studio couch, day bed
n04346328 stupa, tope
n04347754 submarine, pigboat, sub, U-boat
n04350905 suit, suit of clothes
n04355338 sundial
n04355933 sunglass
n04356056 sunglasses, dark glasses, shades
n04357314 sunscreen, sunblock, sun blocker
n04366367 suspension bridge
n04367480 swab, swob, mop
n04370456 sweatshirt
n04371430 swimming trunks, bathing trunks
n04371774 swing
n04372370 switch, electric switch, electrical switch
n04376876 syringe
n04380533 table lamp
n04389033 tank, army tank, armored combat vehicle, armoured combat vehicle
n04392985 tape player
n04398044 teapot
n04399382 teddy, teddy bear
n04404412 television, television system
n04409515 tennis ball
n04417672 thatch, thatched roof
n04418357 theater curtain, theatre curtain
n04423845 thimble
n04428191 thresher, thrasher, threshing machine
n04429376 throne
n04435653 tile roof
n04442312 toaster
n04443257 tobacco shop, tobacconist shop, tobacconist
n04447861 toilet seat
n04456115 torch
n04458633 totem pole
n04461696 tow truck, tow car, wrecker
n04462240 toyshop
n04465501 tractor
n04467665 trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi
n04476259 tray
n04479046 trench coat
n04482393 tricycle, trike, velocipede
n04483307 trimaran
n04485082 tripod
n04486054 triumphal arch
n04487081 trolleybus, trolley coach, trackless trolley
n04487394 trombone
n04493381 tub, vat
n04501370 turnstile
n04505470 typewriter keyboard
n04507155 umbrella
n04509417 unicycle, monocycle
n04515003 upright, upright piano
n04517823 vacuum, vacuum cleaner
n04522168 vase
n04523525 vault
n04525038 velvet
n04525305 vending machine
n04532106 vestment
n04532670 viaduct
n04536866 violin, fiddle
n04540053 volleyball
n04542943 waffle iron
n04548280 wall clock
n04548362 wallet, billfold, notecase, pocketbook
n04550184 wardrobe, closet, press
n04552348 warplane, military plane
n04553703 washbasin, handbasin, washbowl, lavabo, wash-hand basin
n04554684 washer, automatic washer, washing machine
n04557648 water bottle
n04560804 water jug
n04562935 water tower
n04579145 whiskey jug
n04579432 whistle
n04584207 wig
n04589890 window screen
n04590129 window shade
n04591157 Windsor tie
n04591713 wine bottle
n04592741 wing
n04596742 wok
n04597913 wooden spoon
n04599235 wool, woolen, woollen
n04604644 worm fence, snake fence, snake-rail fence, Virginia fence
n04606251 wreck
n04612504 yawl
n04613696 yurt
n06359193 web site, website, internet site, site
n06596364 comic book
n06785654 crossword puzzle, crossword
n06794110 street sign
n06874185 traffic light, traffic signal, stoplight
n07248320 book jacket, dust cover, dust jacket, dust wrapper
n07565083 menu
n07579787 plate
n07583066 guacamole
n07584110 consomme
n07590611 hot pot, hotpot
n07613480 trifle
n07614500 ice cream, icecream
n07615774 ice lolly, lolly, lollipop, popsicle
n07684084 French loaf
n07693725 bagel, beigel
n07695742 pretzel
n07697313 cheeseburger
n07697537 hotdog, hot dog, red hot
n07711569 mashed potato
n07714571 head cabbage
n07714990 broccoli
n07715103 cauliflower
n07716358 zucchini, courgette
n07716906 spaghetti squash
n07717410 acorn squash
n07717556 butternut squash
n07718472 cucumber, cuke
n07718747 artichoke, globe artichoke
n07720875 bell pepper
n07730033 cardoon
n07734744 mushroom
n07742313 Granny Smith
n07745940 strawberry
n07747607 orange
n07749582 lemon
n07753113 fig
n07753275 pineapple, ananas
n07753592 banana
n07754684 jackfruit, jak, jack
n07760859 custard apple
n07768694 pomegranate
n07802026 hay
n07831146 carbonara
n07836838 chocolate sauce, chocolate syrup
n07860988 dough
n07871810 meat loaf, meatloaf
n07873807 pizza, pizza pie
n07875152 potpie
n07880968 burrito
n07892512 red wine
n07920052 espresso
n07930864 cup
n07932039 eggnog
n09193705 alp
n09229709 bubble
n09246464 cliff, drop, drop-off
n09256479 coral reef
n09288635 geyser
n09332890 lakeside, lakeshore
n09399592 promontory, headland, head, foreland
n09421951 sandbar, sand bar
n09428293 seashore, coast, seacoast, sea-coast
n09468604 valley, vale
n09472597 volcano
n09835506 ballplayer, baseball player
n10148035 groom, bridegroom
n10565667 scuba diver
n11879895 rapeseed
n11939491 daisy
n12057211 yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum
n12144580 corn
n12267677 acorn
n12620546 hip, rose hip, rosehip
n12768682 buckeye, horse chestnut, conker
n12985857 coral fungus
n12998815 agaric
n13037406 gyromitra
n13040303 stinkhorn, carrion fungus
n13044778 earthstar
n13052670 hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa
n13054560 bolete
n13133613 ear, spike, capitulum
n15075141 toilet tissue, toilet paper, bathroom tissue


================================================
FILE: examples/whisper.cpp
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// whisper speech recognition implemented with ncnn library

// convert openai-whisper checkpoints to ncnn models
//  1. install pnnx via pip install -U pnnx
//  2. obtain export_ncnn.py script from https://github.com/nihui/ncnn-android-whisper
//  3. edit export_ncnn.py for changing the models among tiny/base/small/medium/large-v3-turbo
//  4. make sure you have good internet connection
//      python export_ncnn.py

// convert vocab.json to simple whisper_vocab.txt
//  1. obtain vocab.json file from https://huggingface.co/openai/whisper-tiny/blob/main/vocab.json
//  2. convert json dict into plain list, save to whisper_vocab.txt

// NOTE large-v3-turbo uses different special token ids than the other models, has one more language (yue) and does not support translation

#include "net.h"
#include "layer.h"
#include "layer_type.h"

#include <float.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <algorithm>
#include <functional>
#include <string>
#include <utility>
#include <vector>

// https://huggingface.co/openai/whisper-tiny/blob/main/tokenizer_config.json
// special token ids used by this example
// ids below token_endoftext are ordinary text tokens that the tokenizer looks up in the vocab file
static const int token_endoftext = 50257;
static const int token_startoftranscript = 50258;
// one token per language, token id = token_lang_first + index into token_langs
static const int token_lang_first = 50259;
static const int token_lang_last = 50357;
static const int token_lang_count = token_lang_last - token_lang_first + 1;
// clang-format off
// *INDENT-OFF*
// language codes, in the same order as the language token ids
static const char* token_langs[] = {
    "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl", "ar", "sv",
    "it", "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no",
    "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn", "sr",
    "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw",
    "gl", "mr", "pa", "si", "km", "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu",
    "am", "yi", "lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl",
    "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su"
};
// *INDENT-ON*
// clang-format on
// task and control tokens, only token_transcribe and token_notimestamps are placed in the prompt by this example
static const int token_translate = 50358;
static const int token_transcribe = 50359;
static const int token_startoflm = 50360;
static const int token_startofprev = 50361;
static const int token_nocaptions = 50362;
static const int token_notimestamps = 50363;
// timestamp tokens, the tokenizer prints (id - token_timestamp_first) * 2 as hundredths of a second
static const int token_timestamp_first = 50364;
static const int token_timestamp_last = 51864;

// https://huggingface.co/openai/whisper-large-v3-turbo/blob/main/tokenizer_config.json
// static const int token_endoftext = 50257;
// static const int token_startoftranscript = 50258;
// static const int token_lang_first = 50259;
// static const int token_lang_last = 50357;
// static const int token_lang_count = token_lang_last - token_lang_first + 1;
// // clang-format off
// // *INDENT-OFF*
// static const char* token_langs[] = {
//     "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl", "ar", "sv",
//     "it", "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no",
//     "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn", "sr",
//     "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw",
//     "gl", "mr", "pa", "si", "km", "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu",
//     "am", "yi", "lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl",
//     "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su", "yue"
// };
// // *INDENT-ON*
// // clang-format on
// static const int token_translate = 50359;
// static const int token_transcribe = 50360;
// static const int token_startoflm = 50361;
// static const int token_startofprev = 50362;
// static const int token_nospeech = 50363;
// static const int token_notimestamps = 50364;
// static const int token_timestamp_first = 50365;
// static const int token_timestamp_last = 51865;

// tokenizer for handling text tokens
//
// The vocab file holds one token per line, written in the byte-level BPE
// "printable unicode" form: every raw byte is represented by one unicode
// code point. decode() concatenates the vocab entries of the text tokens and
// then maps each code point back to the byte it stands for.
class Tokenizer
{
public:
    // token id -> vocab entry (still in the "printable unicode" form)
    std::vector<std::string> reverse_vocab;

    uint8_t byte_decoder[512]; // unicode code point to byte value

    // generate byte decoder for tokenization
    void generate_byte_decoder()
    {
        // initialize array to 0
        memset(byte_decoder, 0, sizeof(byte_decoder));

        // define function to check if char is in "printable" range
        auto is_printable = [](int b) {
            return (b >= '!' && b <= '~')     // '!' to '~'
                   || (b >= 161 && b <= 172)  // '¡' to '¬'
                   || (b >= 174 && b <= 255); // '®' to 'ÿ'
        };

        // "printable" bytes are represented by the code point of the same value,
        // every other byte gets the next free code point starting from 256
        int n = 0;
        for (int b = 0; b < 256; ++b)
        {
            if (is_printable(b))
            {
                byte_decoder[b] = static_cast<uint8_t>(b);
            }
            else
            {
                byte_decoder[256 + n] = static_cast<uint8_t>(b);
                n++;
            }
        }
    }

    // convert utf-8 string to code points
    //
    // Invalid start bytes are skipped. A multi-byte sequence that is cut off
    // by the end of the string is dropped instead of reading past the end.
    // Continuation bytes are not validated.
    std::vector<uint32_t> utf8_to_codepoints(const std::string& s) const
    {
        std::vector<uint32_t> codepoints;

        const size_t size = s.length();
        size_t i = 0;
        while (i < size)
        {
            const unsigned char c = static_cast<unsigned char>(s[i]);

            uint32_t cp = 0;
            size_t len = 0;
            if (c < 0x80) // 1-byte
            {
                cp = c;
                len = 1;
            }
            else if ((c & 0xE0) == 0xC0) // 2-byte
            {
                cp = c & 0x1F;
                len = 2;
            }
            else if ((c & 0xF0) == 0xE0) // 3-byte
            {
                cp = c & 0x0F;
                len = 3;
            }
            else if ((c & 0xF8) == 0xF0) // 4-byte
            {
                cp = c & 0x07;
                len = 4;
            }
            else
            {
                // invalid utf-8 start byte, skip
                i++;
                continue;
            }

            if (len > size - i)
            {
                // truncated sequence at the end of the string
                break;
            }

            // each continuation byte contributes its low 6 bits
            for (size_t k = 1; k < len; k++)
            {
                cp = (cp << 6) | (static_cast<unsigned char>(s[i + k]) & 0x3F);
            }

            codepoints.push_back(cp);
            i += len;
        }

        return codepoints;
    }

    // load the vocab list, one token per line, line number == token id
    // returns false when the file cannot be opened
    bool load(const char* vocab_path)
    {
        // generate decoder when loading
        generate_byte_decoder();

        FILE* fp = fopen(vocab_path, "rb");
        if (!fp)
        {
            fprintf(stderr, "fopen %s failed\n", vocab_path);
            return false;
        }

        // start from scratch so that loading twice does not shift token ids
        reverse_vocab.clear();

        std::string entry;
        char buf[256];
        while (fgets(buf, sizeof(buf), fp))
        {
            size_t len = strlen(buf);
            const bool end_of_line = len > 0 && buf[len - 1] == '\n';
            if (end_of_line)
                len -= 1;

            entry.append(buf, len);

            if (!end_of_line)
            {
                // the line is longer than buf or it is the last line without
                // a newline, keep accumulating so token ids stay aligned
                continue;
            }

            // tolerate CRLF line endings
            if (!entry.empty() && entry.back() == '\r')
                entry.pop_back();

            reverse_vocab.push_back(entry);
            entry.clear();
        }

        // last line without a trailing newline
        if (!entry.empty())
        {
            if (entry.back() == '\r')
                entry.pop_back();

            reverse_vocab.push_back(entry);
        }

        fclose(fp);

        return true;
    }

    // decode token ids to text
    //
    // Text tokens (id < token_endoftext) are looked up in reverse_vocab.
    // Timestamp tokens come in pairs: the first one is printed as " [s.cc] ",
    // the second one flushes the text collected so far followed by its own
    // " [s.cc] " and a newline. All other special tokens and ids that are not
    // present in the vocab are ignored.
    std::string decode(const std::vector<int>& tokens) const
    {
        std::string outstring;
        bool in_timestamp = false;

        // step 1: concatenate token ids to a string with special unicode characters
        std::string text_buffer;
        for (int token_id : tokens)
        {
            if (token_id < token_endoftext)
            {
                if (token_id >= 0 && (size_t)token_id < reverse_vocab.size())
                {
                    text_buffer += reverse_vocab[token_id];
                }
                continue;
            }

            // handle timestamp tokens
            if (token_id >= token_timestamp_first && token_id <= token_timestamp_last)
            {
                // hundredths of a second
                const int timestamp = (token_id - token_timestamp_first) * 2;

                char tmp[64];
                snprintf(tmp, sizeof(tmp), " [%d.%02d] ", timestamp / 100, timestamp % 100);

                if (in_timestamp)
                {
                    // step 2: translate the special string back to original byte stream
                    outstring += decode_bytes(text_buffer);
                    text_buffer.clear();

                    outstring += tmp;
                    outstring += "\n";

                    in_timestamp = false;
                }
                else
                {
                    outstring += tmp;
                    in_timestamp = true;
                }
            }

            // ignore functional/special tokens
        }

        if (!text_buffer.empty())
        {
            // step 2: translate the special string back to original byte stream
            outstring += decode_bytes(text_buffer);
        }

        return outstring;
    }

protected:
    // translate a string of "printable unicode" code points back to raw bytes
    std::string decode_bytes(const std::string& text) const
    {
        const std::vector<uint32_t> codepoints = utf8_to_codepoints(text);

        std::string bytes;
        bytes.reserve(codepoints.size());
        for (uint32_t cp : codepoints)
        {
            // code points outside the table cannot be produced by the byte
            // encoder, skip them instead of reading out of bounds
            if (cp < sizeof(byte_decoder) / sizeof(byte_decoder[0]))
            {
                bytes.push_back(static_cast<char>(byte_decoder[cp]));
            }
        }

        return bytes;
    }
};

// one beam search hypothesis
struct Result
{
    // prompt tokens followed by the tokens generated so far
    std::vector<int> ids;

    // sum of the log-probabilities of the generated tokens
    float score;

    // decoder kv cache produced by the step that generated the last token
    std::vector<ncnn::Mat> kvcache;
};

// main whisper implementation class
//
// Wraps the ncnn networks of one whisper model (feature extractor, encoder,
// token/position embedding, decoder, output projection) plus the tokenizer.
class Whisper
{
public:
    // load all networks and the vocab list from the current working directory
    // returns a status code, 0 means success
    int load();

    // detect the spoken language of samples, lang receives one of the codes in token_langs
    int detect_lang(const std::vector<short>& samples, std::string& lang) const;
    // transcribe samples spoken in language lang (a code from token_langs) into text
    // returns -1 when lang is not in token_langs
    int transcribe(const std::vector<short>& samples, const char* lang, std::string& text) const;

protected:
    // 16-bit samples -> input features for the encoder
    int extract_fbank_feature(const std::vector<short>& samples, ncnn::Mat& input_features) const;
    // input features -> encoder hidden states
    int run_encoder(const ncnn::Mat& input_features, ncnn::Mat& encoder_states) const;
    // run the decoder over all tokens at once, produces the logits of the last token and the initial kv cache
    int run_decoder_prefill(const std::vector<int>& tokens, const ncnn::Mat& encoder_states, ncnn::Mat& last_logits, std::vector<ncnn::Mat>& out_kvcache) const;
    // run the decoder for the last token only, reusing kvcache from the previous call
    int run_decoder_step(const std::vector<int>& tokens, const ncnn::Mat& encoder_states, ncnn::Mat& last_logits, const std::vector<ncnn::Mat>& kvcache, std::vector<ncnn::Mat>& out_kvcache) const;

protected:
    ncnn::Net fbank;

    ncnn::Net encoder;

    ncnn::Net embed_token;
    ncnn::Net embed_position;
    ncnn::Net decoder;

    ncnn::Net proj_out;

    Tokenizer tokenizer;

protected:
    // decoder blob indexes of the kv cache inputs / outputs, resolved in load()
    // entry i of one list pairs with entry i of the other
    std::vector<int> kv_cache_indexes;
    std::vector<int> out_kv_cache_indexes;
};

int Whisper::load()
{
    // whisper models could be found at
    // https://github.com/nihui/ncnn-android-whisper/releases
    // https://github.com/nihui/ncnn-android-whisper/tree/master/app/src/main/assets

    fbank.opt.use_vulkan_compute = true;
    fbank.opt.use_fp16_packed = false;
    fbank.opt.use_fp16_storage = false;
    fbank.opt.use_fp16_arithmetic = false;

    encoder.opt.use_vulkan_compute = true;
    encoder.opt.use_fp16_packed = false;
    encoder.opt.use_fp16_storage = false;
    encoder.opt.use_fp16_arithmetic = false;

    decoder.opt.use_vulkan_compute = true;
    decoder.opt.use_fp16_packed = false;
    decoder.opt.use_fp16_storage = false;
    decoder.opt.use_fp16_arithmetic = false;

    proj_out.opt.use_vulkan_compute = true;
    proj_out.opt.use_fp16_packed = false;
    proj_out.opt.use_fp16_storage = false;
    proj_out.opt.use_fp16_arithmetic = false;

    fbank.load_param("whisper_tiny_fbank.ncnn.param");
    fbank.load_model("whisper_tiny_fbank.ncnn.bin");

    encoder.load_param("whisper_tiny_encoder.ncnn.param");
    encoder.load_model("whisper_tiny_encoder.ncnn.bin");

    embed_token.load_param("whisper_tiny_embed_token.ncnn.param");
    embed_token.load_model("whisper_tiny_embed_token.ncnn.bin");

    embed_position.load_param("whisper_tiny_embed_position.ncnn.param");
    embed_position.load_model("whisper_tiny_embed_position.ncnn.bin");

    decoder.load_param("whisper_tiny_decoder.ncnn.param");
    decoder.load_model("whisper_tiny_decoder.ncnn.bin");

    proj_out.load_param("whisper_tiny_proj_out.ncnn.param");
    proj_out.load_model("whisper_tiny_proj_out.ncnn.bin");

    // fbank.load_param("whisper_large_v3_turbo_fbank.ncnn.param");
    // fbank.load_model("whisper_large_v3_turbo_fbank.ncnn.bin");
    //
    // encoder.load_param("whisper_large_v3_turbo_encoder.ncnn.param");
    // encoder.load_model("whisper_large_v3_turbo_encoder.ncnn.bin");
    //
    // embed_token.load_param("whisper_large_v3_turbo_embed_token.ncnn.param");
    // embed_token.load_model("whisper_large_v3_turbo_embed_token.ncnn.bin");
    //
    // embed_position.load_param("whisper_large_v3_turbo_embed_position.ncnn.param");
    // embed_position.load_model("whisper_large_v3_turbo_embed_position.ncnn.bin");
    //
    // decoder.load_param("whisper_large_v3_turbo_decoder.ncnn.param");
    // decoder.load_model("whisper_large_v3_turbo_decoder.ncnn.bin");
    //
    // proj_out.load_param("whisper_large_v3_turbo_proj_out.ncnn.param");
    // proj_out.load_model("whisper_large_v3_turbo_proj_out.ncnn.bin");

    tokenizer.load("whisper_vocab.txt");

    // resolve kv cache blob indexes
    for (size_t i = 0; i < decoder.layers().size(); i++)
    {
        const ncnn::Layer* mha = decoder.layers()[i];
        if (mha->typeindex != ncnn::LayerType::MultiHeadAttention)
            continue;

        const size_t input_count = mha->bottoms.size();
        const size_t output_count = mha->tops.size();

        if (output_count == 3)
        {
            kv_cache_indexes.push_back(mha->bottoms[input_count - 2]);
            kv_cache_indexes.push_back(mha->bottoms[input_count - 1]);
            out_kv_cache_indexes.push_back(mha->tops[output_count - 2]);
            out_kv_cache_indexes.push_back(mha->tops[output_count - 1]);
        }
    }

    return 0;
}

// replace every value of m by log(softmax(m)), in-place
//
// Implemented by running two ncnn cpu layers one after the other: Softmax
// along axis 0, then the UnaryOp log operation.
static void log_softmax_inplace(ncnn::Mat& m)
{
    ncnn::Option opt;
    opt.use_packing_layout = false;
    opt.use_fp16_storage = false;

    struct Stage
    {
        const char* layer_type;
        int param0;
    };

    const Stage stages[2] = {
        {"Softmax", 0}, // param 0 = axis
        {"UnaryOp", 8}  // param 0 = op type, 8 is log
    };

    for (int i = 0; i < 2; i++)
    {
        ncnn::Layer* op = ncnn::create_layer_cpu(stages[i].layer_type);

        ncnn::ParamDict pd;
        pd.set(0, stages[i].param0);

        op->load_param(pd);
        op->forward_inplace(m, opt);

        delete op;
    }
}

// detect the spoken language of samples
//
// Runs the decoder on the single start-of-transcript token and picks the
// language token with the highest logit.
//
// samples  16-bit audio samples, see extract_fbank_feature
// lang     receives the language code from token_langs
// returns 0 on success, -1 when any stage fails or the logits are too short
int Whisper::detect_lang(const std::vector<short>& samples, std::string& lang) const
{
    std::vector<int> ids(1);
    ids[0] = token_startoftranscript;

    ncnn::Mat input_features;
    if (extract_fbank_feature(samples, input_features) != 0)
        return -1;

    ncnn::Mat encoder_states;
    if (run_encoder(input_features, encoder_states) != 0)
        return -1;

    ncnn::Mat logits;
    std::vector<ncnn::Mat> out_kvcache;
    if (run_decoder_prefill(ids, encoder_states, logits, out_kvcache) != 0)
        return -1;

    // the language tokens must be covered by the logits before indexing them
    if (logits.empty() || logits.w <= token_lang_last)
    {
        fprintf(stderr, "unexpected logits size %d\n", logits.w);
        return -1;
    }

    // find the lang token with highest score
    // only the language part of the logits is looked at
    int lang_id = token_lang_first;
    float max_prob = logits[token_lang_first];
    for (int i = token_lang_first + 1; i <= token_lang_last; i++)
    {
        const float prob = logits[i];
        if (prob > max_prob)
        {
            max_prob = prob;
            lang_id = i;
        }
    }

    lang = token_langs[lang_id - token_lang_first];

    return 0;
}

int Whisper::transcribe(const std::vector<short>& samples, const char* lang, std::string& text) const
{
    // find lang token id by lang string
    int token_lang = -1;
    for (int i = 0; i < token_lang_count; i++)
    {
        if (strcmp(token_langs[i], lang) == 0)
        {
            token_lang = token_lang_first + i;
            break;
        }
    }

    if (token_lang == -1)
    {
        fprintf(stderr, "language %s not supported\n", lang);
        return -1;
    }

    // initialize with prompt tokens
    std::vector<int> ids(4);
    ids[0] = token_startoftranscript;
    ids[1] = token_lang;
    ids[2] = token_transcribe;
    ids[3] = token_notimestamps;

    ncnn::Mat input_features;
    extract_fbank_feature(samples, input_features);

    ncnn::Mat encoder_states;
    run_encoder(input_features, encoder_states);

    const int beam_size = 5;
    const int max_candidates = 5;

    std::vector<Result> finished_beams;

    std::vector<Result> beams(1);
    beams[0].ids = ids;
    beams[0].score = 0.f;

    int step = 0;

    // beam search loop
    for (;;)
    {
        std::vector<Result> candidates;

        for (size_t i = 0; i < beams.size(); i++)
        {
            const Result& beam = beams[i];

            ncnn::Mat logits;
            std::vector<ncnn::Mat> out_kvcache;
            if (step == 0)
            {
                run_decoder_prefill(beam.ids, encoder_states, logits, out_kvcache);
            }
            else
            {
                run_decoder_step(beam.ids, encoder_states, logits, beam.kvcache, out_kvcache);
            }

            log_softmax_inplace(logits);

            // get topk candidates
            const int topk = 5;
            std::vector<std::pair<float, int> > vec(logits.w);
            for (int j = 0; j < logits.w; j++)
            {
                vec[j] = std::make_pair(logits[j], j);
            }
            std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(), std::greater<std::pair<float, int> >());

            for (int j = 0; j < topk; j++)
            {
                int next_id = vec[j].second;
                float next_id_score = vec[j].first;

                Result candidate;
                candidate.ids = beam.ids;
                candidate.ids.push_back(next_id);
                candidate.score = beam.score + next_id_score;
                candidate.kvcache = out_kvcache;

                candidates.push_back(candidate);
            }
        }

        // sort candidates by score
        std::sort(candidates.begin(), candidates.end(), [](const Result& a, const Result& b) {
            return a.score > b.score;
        });

        beams.clear();
        for (size_t i = 0; i < candidates.size(); i++)
        {
            const Result& candidate = candidates[i];

            if (candidate.ids.back() == token_endoftext)
            {
                finished_beams.push_back(candidate);
            }
            else
            {
                beams.push_back(candidate);
            }
        }

        if (beams.size() > beam_size)
        {
            beams.resize(beam_size);
        }

        step++;

        if (beams.empty())
        {
            break;
        }

        if (finished_beams.size() >= max_candidates)
        {
            break;
        }
    }

    if (finished_beams.empty())
    {
        // no results
        return 0;
    }

    // find the best result based on average score
    int max_avg_score_index = 0;
    float max_avg_score = -FLT_MAX;
    for (size_t i = 0; i < finished_beams.size(); i++)
    {
        const Result& result = finished_beams[i];
        float avg_score = result.score / result.ids.size();
        if (avg_score > max_avg_score)
        {
            max_avg_score_index = (int)i;
            max_avg_score = avg_score;
        }
    }

    const Result& best_result = finished_beams[max_avg_score_index];

    text = tokenizer.decode(best_result.ids);

    return 0;
}

// convert 16-bit samples to the input features of the encoder
//
// samples         16-bit audio samples, anything beyond 480000 samples is ignored
// input_features  receives the output of the fbank net without its last frame
// returns 0 on success, -1 when the fbank net fails or produces too few frames
int Whisper::extract_fbank_feature(const std::vector<short>& samples, ncnn::Mat& input_features) const
{
    const size_t max_samples = 480000;

    // never write past the fixed size waveform buffer
    const size_t samples_size = std::min(samples.size(), max_samples);

    // pad to 480000, normalize samples to -1~1
    ncnn::Mat waveform((int)max_samples);
    if (waveform.empty())
        return -1;

    waveform.fill(0.f);
    for (size_t i = 0; i < samples_size; i++)
    {
        waveform[i] = samples[i] / 32768.0f;
    }

    ncnn::Extractor ex = fbank.create_extractor();

    if (ex.input("in0", waveform) != 0)
        return -1;

    if (ex.extract("out0", input_features) != 0)
        return -1;

    if (input_features.empty() || input_features.w < 2)
    {
        fprintf(stderr, "unexpected fbank output size %d x %d\n", input_features.w, input_features.h);
        return -1;
    }

    // drop the last frame
    {
        const int frames = input_features.w - 1;

        ncnn::Mat input_features_3k(frames, input_features.h);
        for (int i = 0; i < input_features.h; i++)
        {
            memcpy(input_features_3k.row(i), input_features.row(i), frames * sizeof(float));
        }
        input_features = input_features_3k;
    }

    return 0;
}

// run the encoder net on input_features
//
// encoder_states receives the "out0" blob of the encoder
// returns 0 on success, -1 when feeding the input or extracting the output fails
int Whisper::run_encoder(const ncnn::Mat& input_features, ncnn::Mat& encoder_states) const
{
    ncnn::Extractor ex = encoder.create_extractor();

    if (ex.input("in0", input_features) != 0)
        return -1;

    if (ex.extract("out0", encoder_states) != 0)
        return -1;

    return 0;
}

// run the decoder over the whole token sequence at once
//
// tokens          prompt token ids, must not be empty
// encoder_states  output of run_encoder
// last_logits     receives the 1-d logits predicted after the last token
// out_kvcache     receives one Mat per entry of out_kv_cache_indexes
// returns 0 on success, -1 on empty input or when any net fails
int Whisper::run_decoder_prefill(const std::vector<int>& tokens, const ncnn::Mat& encoder_states, ncnn::Mat& last_logits, std::vector<ncnn::Mat>& out_kvcache) const
{
    if (tokens.empty())
    {
        fprintf(stderr, "run_decoder_prefill needs at least one token\n");
        return -1;
    }

    const int dst_seqlen = (int)tokens.size();

    // token embedding
    ncnn::Mat token_embeds;
    {
        // the ids are stored as raw int values inside a 4-byte-per-element Mat
        ncnn::Mat input_tokens(dst_seqlen);
        int* p = input_tokens;
        memcpy(p, tokens.data(), tokens.size() * sizeof(int));

        ncnn::Extractor ex = embed_token.create_extractor();
        if (ex.input("in0", input_tokens) != 0 || ex.extract("out0", token_embeds) != 0)
            return -1;
    }

    // position embedding
    ncnn::Mat position_embeds;
    {
        ncnn::Mat input_positions(dst_seqlen);
        int* p = input_positions;
        for (int i = 0; i < dst_seqlen; i++)
        {
            p[i] = i;
        }

        ncnn::Extractor ex = embed_position.create_extractor();
        if (ex.input("in0", input_positions) != 0 || ex.extract("out0", position_embeds) != 0)
            return -1;
    }

    // the element-wise sum below requires both embeddings to have the same size
    if (token_embeds.empty() || token_embeds.total() != position_embeds.total())
    {
        fprintf(stderr, "token / position embedding size mismatch\n");
        return -1;
    }

    // input embedding = token + position
    ncnn::Mat input_embeds;
    {
        input_embeds.create_like(token_embeds);
        const size_t total = input_embeds.total();
        for (size_t i = 0; i < total; i++)
        {
            input_embeds[i] = token_embeds[i] + position_embeds[i];
        }
    }

    // create attention mask (causal mask)
    // row i masks out every position after i
    ncnn::Mat attention_mask(dst_seqlen, dst_seqlen);
    attention_mask.fill(0.f);
    for (int i = 0; i < dst_seqlen; i++)
    {
        float* mask_row = attention_mask.row(i);
        for (int j = i + 1; j < dst_seqlen; j++)
        {
            mask_row[j] = -INFINITY;
        }
    }

    ncnn::Mat output_states;
    {
        ncnn::Extractor ex = decoder.create_extractor();
        if (ex.input("in0", input_embeds) != 0)
            return -1;
        if (ex.input("in1", encoder_states) != 0)
            return -1;
        if (ex.input("in2", attention_mask) != 0)
            return -1;

        // extract the kv cache blobs, the third argument is kept as in the
        // per-step decoder call so both produce the cache in the same form
        out_kvcache.resize(out_kv_cache_indexes.size());
        for (size_t i = 0; i < out_kv_cache_indexes.size(); i++)
        {
            if (ex.extract(out_kv_cache_indexes[i], out_kvcache[i], 1) != 0)
                return -1;
        }

        if (ex.extract("out0", output_states) != 0)
            return -1;
    }

    if (output_states.empty() || output_states.h < dst_seqlen)
    {
        fprintf(stderr, "unexpected decoder output size %d x %d\n", output_states.w, output_states.h);
        return -1;
    }

    // get last token's state for next token prediction
    ncnn::Mat last_state = output_states.row_range(dst_seqlen - 1, 1).clone();
    {
        ncnn::Extractor ex = proj_out.create_extractor();
        if (ex.input("in0", last_state) != 0 || ex.extract("out0", last_logits) != 0)
            return -1;
    }

    // flatten to 1-d so callers can index the logits by token id
    last_logits = last_logits.reshape(last_logits.w);

    return 0;
}

// run the decoder for the newest token only, reusing the kv cache
//
// tokens          all token ids so far, only the last one is fed to the decoder
// encoder_states  output of run_encoder
// last_logits     receives the 1-d logits predicted after the last token
// kvcache         cache returned by the previous prefill/step call, one Mat per entry of kv_cache_indexes
// out_kvcache     receives the updated cache, one Mat per entry of out_kv_cache_indexes
// returns 0 on success, -1 on invalid input or when any net fails
int Whisper::run_decoder_step(const std::vector<int>& tokens, const ncnn::Mat& encoder_states, ncnn::Mat& last_logits, const std::vector<ncnn::Mat>& kvcache, std::vector<ncnn::Mat>& out_kvcache) const
{
    if (tokens.empty())
    {
        fprintf(stderr, "run_decoder_step needs at least one token\n");
        return -1;
    }

    if (kvcache.size() < kv_cache_indexes.size())
    {
        fprintf(stderr, "kv cache has %d entries, %d expected\n", (int)kvcache.size(), (int)kv_cache_indexes.size());
        return -1;
    }

    const int token_id = tokens.back();
    const int position = (int)tokens.size() - 1;
    const int dst_seqlen = 1;

    // token embedding
    ncnn::Mat token_embeds;
    {
        // the id is stored as a raw int value inside a 4-byte-per-element Mat
        ncnn::Mat input_tokens(dst_seqlen);
        ((int*)input_tokens)[0] = token_id;

        ncnn::Extractor ex = embed_token.create_extractor();
        if (ex.input("in0", input_tokens) != 0 || ex.extract("out0", token_embeds) != 0)
            return -1;
    }

    // position embedding
    ncnn::Mat position_embeds;
    {
        ncnn::Mat input_positions(dst_seqlen);
        ((int*)input_positions)[0] = position;

        ncnn::Extractor ex = embed_position.create_extractor();
        if (ex.input("in0", input_positions) != 0 || ex.extract("out0", position_embeds) != 0)
            return -1;
    }

    // the element-wise sum below requires both embeddings to have the same size
    if (token_embeds.empty() || token_embeds.total() != position_embeds.total())
    {
        fprintf(stderr, "token / position embedding size mismatch\n");
        return -1;
    }

    // input embedding = token + position
    ncnn::Mat input_embeds;
    {
        input_embeds.create_like(token_embeds);
        const size_t total = input_embeds.total();
        for (size_t i = 0; i < total; i++)
        {
            input_embeds[i] = token_embeds[i] + position_embeds[i];
        }
    }

    // single token doesn't need masking, the decoder still takes a mask input
    // so an all-zero 1x1 mask is passed
    ncnn::Mat attention_mask(dst_seqlen, dst_seqlen);
    attention_mask.fill(0.f);

    ncnn::Mat output_states;
    {
        ncnn::Extractor ex = decoder.create_extractor();
        if (ex.input("in0", input_embeds) != 0)
            return -1;
        if (ex.input("in1", encoder_states) != 0)
            return -1;
        if (ex.input("in2", attention_mask) != 0)
            return -1;

        // pass in kv cache from previous steps
        for (size_t i = 0; i < kv_cache_indexes.size(); i++)
        {
            if (ex.input(kv_cache_indexes[i], kvcache[i]) != 0)
                return -1;
        }

        // extract updated kv cache
        out_kvcache.resize(out_kv_cache_indexes.size());
        for (size_t i = 0; i < out_kv_cache_indexes.size(); i++)
        {
            if (ex.extract(out_kv_cache_indexes[i], out_kvcache[i], 1) != 0)
                return -1;
        }

        if (ex.extract("out0", output_states) != 0)
            return -1;
    }

    if (output_states.empty() || output_states.h < dst_seqlen)
    {
        fprintf(stderr, "unexpected decoder output size %d x %d\n", output_states.w, output_states.h);
        return -1;
    }

    // get last token's state for prediction
    ncnn::Mat last_state = output_states.row_range(dst_seqlen - 1, 1).clone();
    {
        ncnn::Extractor ex = proj_out.create_extractor();
        if (ex.input("in0", last_state) != 0 || ex.extract("out0", last_logits) != 0)
            return -1;
    }

    // flatten to 1-d so callers can index the logits by token id
    last_logits = last_logits.reshape(last_logits.w);

    return 0;
}

// load the samples of a mono 16 kHz 16-bit pcm wav file
//
// Only files with the canonical 44-byte header (RIFF, 16-byte "fmt " chunk
// directly followed by the "data" chunk) are accepted.
//
// wavpath  path of the wav file
// samples  receives the samples, cleared on failure
// returns 0 on success, -1 on failure
static int load_wav_samples(const char* wavpath, std::vector<short>& samples)
{
    samples.clear();

    FILE* fp = fopen(wavpath, "rb");
    if (!fp)
    {
        fprintf(stderr, "open %s failed\n", wavpath);
        return -1;
    }

    // canonical wav header layout, multi-byte fields are little-endian
    //    0 "RIFF"               4 chunk size (u32)       8 "WAVE"
    //   12 "fmt "              16 subchunk1 size (u32)  20 audio format (u16)
    //   22 num channels (u16)  24 sample rate (u32)     28 byte rate (u32)
    //   32 block align (u16)   34 bits per sample (u16)
    //   36 "data"              40 data size (u32)
    const size_t header_size = 44;
    unsigned char header[44];
    if (fread(header, header_size, 1, fp) != 1)
    {
        fprintf(stderr, "failed to read wav header from %s\n", wavpath);
        fclose(fp);
        return -1;
    }

    // the fields are assembled byte by byte so the result does not depend on
    // struct packing or on the byte order of the host
    auto read_u16 = [&header](size_t offset) -> uint32_t {
        return (uint32_t)header[offset] | ((uint32_t)header[offset + 1] << 8);
    };
    auto read_u32 = [&header](size_t offset) -> uint32_t {
        return (uint32_t)header[offset] | ((uint32_t)header[offset + 1] << 8)
               | ((uint32_t)header[offset + 2] << 16) | ((uint32_t)header[offset + 3] << 24);
    };

    if (memcmp(header + 0, "RIFF", 4) != 0 || memcmp(header + 8, "WAVE", 4) != 0
            || memcmp(header + 12, "fmt ", 4) != 0 || memcmp(header + 36, "data", 4) != 0)
    {
        fprintf(stderr, "%s is not a valid wav file\n", wavpath);
        fclose(fp);
        return -1;
    }

    const uint32_t subchunk1_size = read_u32(16);
    const uint32_t audio_format = read_u16(20);
    const uint32_t num_channels = read_u16(22);
    const uint32_t sample_rate = read_u32(24);
    const uint32_t bits_per_sample = read_u16(34);
    const uint32_t data_size = read_u32(40);

    if (subchunk1_size != 16 || audio_format != 1 || num_channels != 1
            || sample_rate != 16000 || bits_per_sample != 16)
    {
        fprintf(stderr, "%s is not pcm s16le 16k wav\n", wavpath);
        fprintf(stderr, "ffmpeg -i input.xxx -vn -c:a pcm_s16le -ac 1 -ar 16000 -fflags bitexact output.wav\n");
        fclose(fp);
        return -1;
    }

    // number of bytes that follow the header
    long len = -1;
    if (fseek(fp, 0, SEEK_END) == 0)
        len = ftell(fp);

    if (len < (long)header_size)
    {
        fprintf(stderr, "failed to get the size of %s\n", wavpath);
        fclose(fp);
        return -1;
    }

    size_t data_bytes = (size_t)len - header_size;

    // honor the size of the data chunk so that trailing chunks are not read
    // as audio, a zero or oversized value (streamed files) means "until the
    // end of the file"
    if (data_size != 0 && data_size < data_bytes)
        data_bytes = data_size;

    // only whole samples are read, a dangling odd byte is ignored
    const size_t sample_count = data_bytes / sizeof(short);
    if (sample_count == 0)
    {
        fclose(fp);
        return 0;
    }

    samples.resize(sample_count);

    if (fseek(fp, (long)header_size, SEEK_SET) != 0
            || fread(samples.data(), sizeof(short), sample_count, fp) != sample_count)
    {
        fprintf(stderr, "failed to read samples from %s\n", wavpath);
        samples.clear();
        fclose(fp);
        return -1;
    }

    fclose(fp);

    // the file stores little-endian samples, reassemble them in-place
    // this leaves the data unchanged on little-endian hosts
    const unsigned char* raw = (const unsigned char*)samples.data();
    for (size_t i = 0; i < sample_count; i++)
    {
        const unsigned int lo = raw[i * 2];
        const unsigned int hi = raw[i * 2 + 1];
        samples[i] = (short)(uint16_t)(lo | (hi << 8));
    }

    return 0;
}

// usage: whisper [wavpath]
//
// Loads a 16 kHz mono 16-bit wav file, detects the spoken language of its
// first 30 seconds and prints the transcription to stderr.
// returns 0 on success, -1 on any failure
int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [wavpath]\n", argv[0]);
        return -1;
    }

    const char* wavpath = argv[1];

    std::vector<short> samples;
    if (load_wav_samples(wavpath, samples) != 0)
    {
        fprintf(stderr, "load wav failed\n");
        return -1;
    }

    // 480000 samples = 30 seconds at 16 kHz
    if (samples.size() > 480000)
    {
        fprintf(stderr, "audio duration too long, truncate to 30s\n");
        samples.resize(480000);
    }

    Whisper whisper;
    if (whisper.load() != 0)
    {
        fprintf(stderr, "load whisper model failed\n");
        return -1;
    }

    // detect language first
    std::string lang;
    if (whisper.detect_lang(samples, lang) != 0)
    {
        fprintf(stderr, "detect language failed\n");
        return -1;
    }
    fprintf(stderr, "lang = %s\n", lang.c_str());

    // transcribe audio to text
    std::string text;
    if (whisper.transcribe(samples, lang.c_str(), text) != 0)
    {
        fprintf(stderr, "transcribe failed\n");
        return -1;
    }
    fprintf(stderr, "text = %s\n", text.c_str());

    return 0;
}


================================================
FILE: examples/yolact.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdio.h>
#include <vector>

// one detected object
struct Object
{
    cv::Rect_<float> rect; // bounding box, used for the area / overlap computation in nms
    int label;             // class label, nms only compares boxes of the same label unless agnostic
    float prob;            // score the objects are sorted by, highest first
    std::vector<float> maskdata; // presumably the per-object mask coefficients - TODO confirm against detect_yolact
    cv::Mat mask;                // presumably the final object mask - TODO confirm against detect_yolact
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// sort objects[left..right] (both inclusive) by prob in descending order, in-place
//
// Quicksort with the prob of the middle element as pivot. The caller must
// pass a non-empty range, objects[(left + right) / 2] is read unconditionally.
static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
{
    int i = left;
    int j = right;
    float p = objects[(left + right) / 2].prob;

    // partition: move elements with prob above the pivot to the left side
    // and elements with prob below the pivot to the right side
    while (i <= j)
    {
        while (objects[i].prob > p)
            i++;

        while (objects[j].prob < p)
            j--;

        if (i <= j)
        {
            // swap
            std::swap(objects[i], objects[j]);

            i++;
            j--;
        }
    }

    // the two partitions do not overlap, so they are sorted as independent
    // openmp sections
    #pragma omp parallel sections
    {
        #pragma omp section
        {
            if (left < j) qsort_descent_inplace(objects, left, j);
        }
        #pragma omp section
        {
            if (i < right) qsort_descent_inplace(objects, i, right);
        }
    }
}

// sort all objects by prob in descending order, in-place
// an empty vector is left untouched
static void qsort_descent_inplace(std::vector<Object>& objects)
{
    if (!objects.empty())
    {
        qsort_descent_inplace(objects, 0, objects.size() - 1);
    }
}

static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = faceobjects.size();

    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = faceobjects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = faceobjects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = faceobjects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
                keep = 0;
        }

        if (keep)
            picked.push_back(i);
    }
}

// Run YOLACT instance segmentation on an image.
//
// bgr      input image; its data pointer, cols and rows are handed to ncnn as BGR pixels
// objects  output; cleared, then filled with at most keep_top_k detections sorted by
//          descending prob, each carrying an image-sized 8-bit mask holding 0 or 255
//
// Returns 0. A failure to load the model files terminates the process via exit(-1).
// NOTE(review): the results of ex.input() / ex.extract() are not checked.
static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net yolact;

    yolact.opt.use_vulkan_compute = true;

    // original model converted from https://github.com/dbolya/yolact
    // yolact_resnet50_54_800000.pth
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    // a non-zero result from load_param / load_model is treated as fatal
    if (yolact.load_param("yolact.param"))
        exit(-1);
    if (yolact.load_model("yolact.bin"))
        exit(-1);

    // the image is resized to target_size x target_size before inference
    const int target_size = 550;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, target_size, target_size);

    // per-channel mean subtraction followed by scaling
    const float mean_vals[3] = {123.68f, 116.78f, 103.94f};
    const float norm_vals[3] = {1.0 / 58.40f, 1.0 / 57.12f, 1.0 / 57.38f};
    in.substract_mean_normalize(mean_vals, norm_vals);

    ncnn::Extractor ex = yolact.create_extractor();

    ex.input("input.1", in);

    ncnn::Mat maskmaps;
    ncnn::Mat location;
    ncnn::Mat mask;
    ncnn::Mat confidence;

    ex.extract("619", maskmaps); // 138x138 x 32

    ex.extract("816", location);   // 4 x 19248
    ex.extract("818", mask);       // maskdim 32 x 19248
    ex.extract("820", confidence); // 81 x 19248

    // one row per prior, one column per class (column 0 is skipped as background below)
    int num_class = confidence.w;
    int num_priors = confidence.h;

    // make priorbox
    // each row is (cx, cy, w, h), all divided by the grid size / 550
    // NOTE(review): the loops below always write (69*69 + 35*35 + 18*18 + 9*9 + 5*5) * 3 = 19248
    // rows, independent of num_priors - confirm the model always yields 19248 priors
    ncnn::Mat priorbox(4, num_priors);
    {
        // feature map sizes of the 5 prediction levels
        const int conv_ws[5] = {69, 35, 18, 9, 5};
        const int conv_hs[5] = {69, 35, 18, 9, 5};

        const float aspect_ratios[3] = {1.f, 0.5f, 2.f};
        const float scales[5] = {24.f, 48.f, 96.f, 192.f, 384.f};

        // write cursor into priorbox, advanced by 4 floats per prior
        float* pb = priorbox;

        for (int p = 0; p < 5; p++)
        {
            int conv_w = conv_ws[p];
            int conv_h = conv_hs[p];

            float scale = scales[p];

            for (int i = 0; i < conv_h; i++)
            {
                for (int j = 0; j < conv_w; j++)
                {
                    // +0.5 because priors are in center-size notation
                    float cx = (j + 0.5f) / conv_w;
                    float cy = (i + 0.5f) / conv_h;

                    for (int k = 0; k < 3; k++)
                    {
                        float ar = aspect_ratios[k];

                        ar = sqrt(ar);

                        // 550 is the same value as target_size
                        float w = scale * ar / 550;
                        float h = scale / ar / 550;

                        // This is for backward compatibility with a bug where I made everything square by accident
                        // cfg.backbone.use_square_anchors:
                        h = w;

                        pb[0] = cx;
                        pb[1] = cy;
                        pb[2] = w;
                        pb[3] = h;

                        pb += 4;
                    }
                }
            }
        }
    }

    const float confidence_thresh = 0.05f;
    const float nms_threshold = 0.5f;
    const int keep_top_k = 200;

    // candidates are bucketed by label so that NMS runs per class
    std::vector<std::vector<Object> > class_candidates;
    class_candidates.resize(num_class);

    for (int i = 0; i < num_priors; i++)
    {
        const float* conf = confidence.row(i);
        const float* loc = location.row(i);
        const float* pb = priorbox.row(i);
        const float* maskdata = mask.row(i);

        // find class id with highest score
        // start from 1 to skip background
        int label = 0;
        float score = 0.f;
        for (int j = 1; j < num_class; j++)
        {
            float class_score = conf[j];
            if (class_score > score)
            {
                label = j;
                score = class_score;
            }
        }

        // ignore background or low score
        if (label == 0 || score <= confidence_thresh)
            continue;

        // CENTER_SIZE
        // scaling factors applied to the raw regression values
        float var[4] = {0.1f, 0.1f, 0.2f, 0.2f};

        float pb_cx = pb[0];
        float pb_cy = pb[1];
        float pb_w = pb[2];
        float pb_h = pb[3];

        // center is an offset relative to the prior size, width/height are exponential scales
        float bbox_cx = var[0] * loc[0] * pb_w + pb_cx;
        float bbox_cy = var[1] * loc[1] * pb_h + pb_cy;
        float bbox_w = (float)(exp(var[2] * loc[2]) * pb_w);
        float bbox_h = (float)(exp(var[3] * loc[3]) * pb_h);

        // center-size -> corner form
        float obj_x1 = bbox_cx - bbox_w * 0.5f;
        float obj_y1 = bbox_cy - bbox_h * 0.5f;
        float obj_x2 = bbox_cx + bbox_w * 0.5f;
        float obj_y2 = bbox_cy + bbox_h * 0.5f;

        // clip
        // scale by the image size, then clamp into [0, size - 1]
        obj_x1 = std::max(std::min(obj_x1 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
        obj_y1 = std::max(std::min(obj_y1 * bgr.rows, (float)(bgr.rows - 1)), 0.f);
        obj_x2 = std::max(std::min(obj_x2 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
        obj_y2 = std::max(std::min(obj_y2 * bgr.rows, (float)(bgr.rows - 1)), 0.f);

        // append object
        Object obj;
        obj.rect = cv::Rect_<float>(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1);
        obj.label = label;
        obj.prob = score;
        // keep this prior's mask coefficients (mask.w values) for the mask assembly below
        obj.maskdata = std::vector<float>(maskdata, maskdata + mask.w);

        class_candidates[label].push_back(obj);
    }

    // per-class: sort by score, run NMS, collect the survivors
    objects.clear();
    for (int i = 0; i < (int)class_candidates.size(); i++)
    {
        std::vector<Object>& candidates = class_candidates[i];

        qsort_descent_inplace(candidates);

        std::vector<int> picked;
        nms_sorted_bboxes(candidates, picked, nms_threshold);

        for (int j = 0; j < (int)picked.size(); j++)
        {
            int z = picked[j];
            objects.push_back(candidates[z]);
        }
    }

    // order the merged detections of all classes by descending score
    qsort_descent_inplace(objects);

    // keep_top_k
    if (keep_top_k < (int)objects.size())
    {
        objects.resize(keep_top_k);
    }

    // generate mask
    for (int i = 0; i < (int)objects.size(); i++)
    {
        Object& obj = objects[i];

        // weighted sum of the maskmaps channels using the object's coefficients
        // NOTE(review): indexes obj.maskdata up to maskmaps.c - 1, so mask.w is assumed >= maskmaps.c
        cv::Mat mask(maskmaps.h, maskmaps.w, CV_32FC1);
        {
            mask = cv::Scalar(0.f);

            for (int p = 0; p < maskmaps.c; p++)
            {
                const float* maskmap = maskmaps.channel(p);
                float coeff = obj.maskdata[p];
                float* mp = (float*)mask.data;

                // mask += m * coeff
                for (int j = 0; j < maskmaps.w * maskmaps.h; j++)
                {
                    mp[j] += maskmap[j] * coeff;
                }
            }
        }

        // bring the low resolution mask up to the input image size
        cv::Mat mask2;
        cv::resize(mask, mask2, cv::Size(img_w, img_h));

        // crop obj box and binarize
        // pixels outside the box stay 0, pixels inside become 255 when the value exceeds 0.5
        obj.mask = cv::Mat(img_h, img_w, CV_8UC1);
        {
            obj.mask = cv::Scalar(0);

            for (int y = 0; y < img_h; y++)
            {
                if (y < obj.rect.y || y > obj.rect.y + obj.rect.height)
                    continue;

                const float* mp2 = mask2.ptr<const float>(y);
                uchar* bmp = obj.mask.ptr<uchar>(y);

                for (int x = 0; x < img_w; x++)
                {
                    if (x < obj.rect.x || x > obj.rect.x + obj.rect.width)
                        continue;

                    bmp[x] = mp2[x] > 0.5f ? 255 : 0;
                }
            }
        }
    }

    return 0;
}

// Draw the detections on a copy of the image, write it to "result.png",
// show it in a window named "image" and block until a key is pressed.
//
// bgr      image the detections were computed on; it is cloned, not modified
// objects  detections; entries with prob below 0.15 are skipped. For each drawn
//          entry the box, a text label and a half-transparent mask overlay are
//          rendered and one line is logged to stderr.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    // indexed by Object::label; entry 0 is "background"
    static const char* class_names[] = {"background",
                                        "person", "bicycle", "car", "motorcycle", "airplane", "bus",
                                        "train", "truck", "boat", "traffic light", "fire hydrant",
                                        "stop sign", "parking meter", "bench", "bird", "cat", "dog",
                                        "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
                                        "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
                                        "skis", "snowboard", "sports ball", "kite", "baseball bat",
                                        "baseball glove", "skateboard", "surfboard", "tennis racket",
                                        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
                                        "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
                                        "hot dog", "pizza", "donut", "cake", "chair", "couch",
                                        "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
                                        "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
                                        "toaster", "sink", "refrigerator", "book", "clock", "vase",
                                        "scissors", "teddy bear", "hair drier", "toothbrush"
                                       };

    // drawing palette; colors are handed out per drawn object (see color_index), not per class
    static const unsigned char colors[81][3] = {
        {56, 0, 255},
        {226, 255, 0},
        {0, 94, 255},
        {0, 37, 255},
        {0, 255, 94},
        {255, 226, 0},
        {0, 18, 255},
        {255, 151, 0},
        {170, 0, 255},
        {0, 255, 56},
        {255, 0, 75},
        {0, 75, 255},
        {0, 255, 169},
        {255, 0, 207},
        {75, 255, 0},
        {207, 0, 255},
        {37, 0, 255},
        {0, 207, 255},
        {94, 0, 255},
        {0, 255, 113},
        {255, 18, 0},
        {255, 0, 56},
        {18, 0, 255},
        {0, 255, 226},
        {170, 255, 0},
        {255, 0, 245},
        {151, 255, 0},
        {132, 255, 0},
        {75, 0, 255},
        {151, 0, 255},
        {0, 151, 255},
        {132, 0, 255},
        {0, 255, 245},
        {255, 132, 0},
        {226, 0, 255},
        {255, 37, 0},
        {207, 255, 0},
        {0, 255, 207},
        {94, 255, 0},
        {0, 226, 255},
        {56, 255, 0},
        {255, 94, 0},
        {255, 113, 0},
        {0, 132, 255},
        {255, 0, 132},
        {255, 170, 0},
        {255, 0, 188},
        {113, 255, 0},
        {245, 0, 255},
        {113, 0, 255},
        {255, 188, 0},
        {0, 113, 255},
        {255, 0, 0},
        {0, 56, 255},
        {255, 0, 113},
        {0, 255, 188},
        {255, 0, 94},
        {255, 0, 18},
        {18, 255, 0},
        {0, 255, 132},
        {0, 188, 255},
        {0, 245, 255},
        {0, 169, 255},
        {37, 255, 0},
        {255, 0, 151},
        {188, 0, 255},
        {0, 255, 37},
        {0, 255, 0},
        {255, 0, 170},
        {255, 0, 37},
        {255, 75, 0},
        {0, 0, 255},
        {255, 207, 0},
        {255, 0, 226},
        {255, 245, 0},
        {188, 255, 0},
        {0, 255, 18},
        {0, 255, 75},
        {0, 255, 151},
        {255, 56, 0},
        {245, 255, 0}
    };

    // draw on a copy so the caller's image stays untouched
    cv::Mat image = bgr.clone();

    // counts drawn objects only; wraps around the 81 palette entries
    int color_index = 0;

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        // display threshold
        if (obj.prob < 0.15)
            continue;

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        const unsigned char* color = colors[color_index % 81];
        color_index++;

        cv::rectangle(image, obj.rect, cv::Scalar(color[0], color[1], color[2]));

        // label text: class name followed by the score as a percentage
        char text[256];
        sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // place the label above the box, clamped to the top and right image borders
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        // white filled background behind the text
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));

        // draw mask
        // the loop variables below shadow the label position x / y declared above
        // NOTE(review): p += 3 assumes an 8-bit image with 3 channels per pixel
        for (int y = 0; y < image.rows; y++)
        {
            const uchar* mp = obj.mask.ptr(y);
            uchar* p = image.ptr(y);
            for (int x = 0; x < image.cols; x++)
            {
                if (mp[x] == 255)
                {
                    // 50/50 blend of the pixel with the object color
                    p[0] = cv::saturate_cast<uchar>(p[0] * 0.5 + color[0] * 0.5);
                    p[1] = cv::saturate_cast<uchar>(p[1] * 0.5 + color[1] * 0.5);
                    p[2] = cv::saturate_cast<uchar>(p[2] * 0.5 + color[2] * 0.5);
                }
                p += 3;
            }
        }
    }

    cv::imwrite("result.png", image);
    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_yolact(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/yolo11.cpp
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 1. install
//      pip3 install -U ultralytics pnnx ncnn
// 2. export yolo11 torchscript
//      yolo export model=yolo11n.pt format=torchscript
// 3. convert torchscript with static shape
//      pnnx yolo11n.torchscript
// 4. modify yolo11n_pnnx.py for dynamic shape inference
//      A. modify reshape to support dynamic image sizes
//      B. permute tensor before concat and adjust concat axis
//      C. drop post-process part
//      before:
//          v_235 = v_204.view(1, 144, 6400)
//          v_236 = v_219.view(1, 144, 1600)
//          v_237 = v_234.view(1, 144, 400)
//          v_238 = torch.cat((v_235, v_236, v_237), dim=2)
//          ...
//      after:
//          v_235 = v_204.view(1, 144, -1).transpose(1, 2)
//          v_236 = v_219.view(1, 144, -1).transpose(1, 2)
//          v_237 = v_234.view(1, 144, -1).transpose(1, 2)
//          v_238 = torch.cat((v_235, v_236, v_237), dim=1)
//          return v_238
//      D. modify area attention for dynamic shape inference
//      before:
//          v_95 = self.model_10_m_0_attn_qkv_conv(v_94)
//          v_96 = v_95.view(1, 2, 128, 400)
//          v_97, v_98, v_99 = torch.split(tensor=v_96, dim=2, split_size_or_sections=(32,32,64))
//          v_100 = torch.transpose(input=v_97, dim0=-2, dim1=-1)
//          v_101 = torch.matmul(input=v_100, other=v_98)
//          v_102 = (v_101 * 0.176777)
//          v_103 = F.softmax(input=v_102, dim=-1)
//          v_104 = torch.transpose(input=v_103, dim0=-2, dim1=-1)
//          v_105 = torch.matmul(input=v_99, other=v_104)
//          v_106 = v_105.view(1, 128, 20, 20)
//          v_107 = v_99.reshape(1, 128, 20, 20)
//          v_108 = self.model_10_m_0_attn_pe_conv(v_107)
//          v_109 = (v_106 + v_108)
//          v_110 = self.model_10_m_0_attn_proj_conv(v_109)
//      after:
//          v_95 = self.model_10_m_0_attn_qkv_conv(v_94)
//          v_96 = v_95.view(1, 2, 128, -1)
//          v_97, v_98, v_99 = torch.split(tensor=v_96, dim=2, split_size_or_sections=(32,32,64))
//          v_100 = torch.transpose(input=v_97, dim0=-2, dim1=-1)
//          v_101 = torch.matmul(input=v_100, other=v_98)
//          v_102 = (v_101 * 0.176777)
//          v_103 = F.softmax(input=v_102, dim=-1)
//          v_104 = torch.transpose(input=v_103, dim0=-2, dim1=-1)
//          v_105 = torch.matmul(input=v_99, other=v_104)
//          v_106 = v_105.view(1, 128, v_95.size(2), v_95.size(3))
//          v_107 = v_99.reshape(1, 128, v_95.size(2), v_95.size(3))
//          v_108 = self.model_10_m_0_attn_pe_conv(v_107)
//          v_109 = (v_106 + v_108)
//          v_110 = self.model_10_m_0_attn_proj_conv(v_109)
// 5. re-export yolo11 torchscript
//      python3 -c 'import yolo11n_pnnx; yolo11n_pnnx.export_torchscript()'
// 6. convert new torchscript with dynamic shape
//      pnnx yolo11n_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320]
// 7. now you get ncnn model files
//      mv yolo11n_pnnx.py.ncnn.param yolo11n.ncnn.param
//      mv yolo11n_pnnx.py.ncnn.bin yolo11n.ncnn.bin

// the out blob would be a 2-dim tensor with w=144 h=8400
//
//        | bbox-reg 16 x 4       | per-class scores(80) |
//        +-----+-----+-----+-----+----------------------+
//        | dx0 | dy0 | dx1 | dy1 |0.1 0.0 0.0 0.5 ......|
//   all /|     |     |     |     |           .          |
//  boxes |  .. |  .. |  .. |  .. |0.0 0.9 0.0 0.0 ......|
//  (8400)|     |     |     |     |           .          |
//       \|     |     |     |     |           .          |
//        +-----+-----+-----+-----+----------------------+
//

#include "layer.h"
#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>

// One detected object.
struct Object
{
    cv::Rect_<float> rect; // bounding box as x, y, width, height
    int label;             // class index, used to look up class_names when drawing
    float prob;            // confidence score of the detection
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Quicksort objects[left..right] (both bounds inclusive) in place so that
// prob is in descending order. Both partitions are sorted recursively.
static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
{
    // pivot is the score of the middle element of the range
    const float pivot = objects[(left + right) / 2].prob;

    int lo = left;
    int hi = right;

    while (lo <= hi)
    {
        // skip elements that are already on the correct side of the pivot
        while (objects[lo].prob > pivot)
            ++lo;

        while (objects[hi].prob < pivot)
            --hi;

        if (lo > hi)
            continue;

        // both cursors point at misplaced elements: exchange them
        std::swap(objects[lo], objects[hi]);

        ++lo;
        --hi;
    }

    // #pragma omp parallel sections
    {
        // #pragma omp section
        {
            // left partition
            if (left < hi) qsort_descent_inplace(objects, left, hi);
        }
        // #pragma omp section
        {
            // right partition
            if (lo < right) qsort_descent_inplace(objects, lo, right);
        }
    }
}

static void qsort_descent_inplace(std::vector<Object>& objects)
{
    if (objects.empty())
        return;

    qsort_descent_inplace(objects, 0, objects.size() - 1);
}

// Greedy non-maximum suppression.
//
// objects        candidates, visited in index order (callers sort them by
//                descending prob first, so earlier entries win)
// picked         output; cleared, then filled with the indices of the kept candidates
// nms_threshold  a candidate is dropped when its IoU with an already kept box exceeds this
// agnostic       when false, only boxes with the same label can suppress each other
static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = objects.size();

    // box areas are reused by every IoU computation below
    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = objects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = objects[i];

        bool keep = true;
        for (size_t j = 0; j < picked.size(); j++)
        {
            const int k = picked[j];
            const Object& b = objects[k];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[k] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
            {
                // one overlapping kept box is enough; no need to test the rest
                keep = false;
                break;
            }
        }

        if (keep)
            picked.push_back(i);
    }
}

// Logistic function 1 / (1 + e^-x), evaluated in single precision.
static inline float sigmoid(float x)
{
    const float e = expf(-x);
    return 1.0f / (1.0f + e);
}

// Decode the predictions of a single stride into candidate boxes.
//
// pred            one row per grid cell in row-major (y, x) order; each row holds
//                 4 * 16 box distribution values followed by the per-class scores
// stride          size of one grid cell in pixels of in_pad
// in_pad          the padded network input; only its w / h are read
// prob_threshold  cells whose best class probability (after sigmoid) is below this are skipped
// objects         output; accepted candidates are appended, in in_pad pixel coordinates
//
// Note: the softmax runs in place on a view of pred's data, as in the original code.
static void generate_proposals(const ncnn::Mat& pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    const int w = in_pad.w;
    const int h = in_pad.h;

    const int num_grid_x = w / stride;
    const int num_grid_y = h / stride;

    const int reg_max_1 = 16;
    const int num_class = pred.w - reg_max_1 * 4; // number of classes. 80 for COCO

    // The softmax layer is identical for every grid cell, so it is set up once
    // per call instead of being created and destroyed for each accepted cell.
    ncnn::Layer* softmax = ncnn::create_layer("Softmax");
    if (!softmax)
    {
        fprintf(stderr, "generate_proposals: create_layer Softmax failed\n");
        return;
    }

    ncnn::ParamDict pd;
    pd.set(0, 1); // axis
    pd.set(1, 1);
    softmax->load_param(pd);

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_packing_layout = false;

    softmax->create_pipeline(opt);

    for (int y = 0; y < num_grid_y; y++)
    {
        for (int x = 0; x < num_grid_x; x++)
        {
            const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1);

            // find label with max score
            int label = -1;
            float score = -FLT_MAX;
            {
                const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class);

                for (int k = 0; k < num_class; k++)
                {
                    float s = pred_score[k];
                    if (s > score)
                    {
                        label = k;
                        score = s;
                    }
                }

                // only the winning raw score needs to go through the sigmoid
                score = sigmoid(score);
            }

            if (score < prob_threshold)
                continue;

            // view the 64 box values as 4 sides x 16 bins and normalize each side
            ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4);

            softmax->forward_inplace(pred_bbox, opt);

            // expected bin index per side, scaled to pixels: left, top, right, bottom distances
            float pred_ltrb[4];
            for (int k = 0; k < 4; k++)
            {
                float dis = 0.f;
                const float* dis_after_sm = pred_bbox.row(k);
                for (int l = 0; l < reg_max_1; l++)
                {
                    dis += l * dis_after_sm[l];
                }

                pred_ltrb[k] = dis * stride;
            }

            // center of the grid cell in in_pad pixels
            float pb_cx = (x + 0.5f) * stride;
            float pb_cy = (y + 0.5f) * stride;

            float x0 = pb_cx - pred_ltrb[0];
            float y0 = pb_cy - pred_ltrb[1];
            float x1 = pb_cx + pred_ltrb[2];
            float y1 = pb_cy + pred_ltrb[3];

            Object obj;
            obj.rect.x = x0;
            obj.rect.y = y0;
            obj.rect.width = x1 - x0;
            obj.rect.height = y1 - y0;
            obj.label = label;
            obj.prob = score;

            objects.push_back(obj);
        }
    }

    softmax->destroy_pipeline(opt);

    delete softmax;
}

// Decode the predictions of all strides.
// pred stacks the grid cells of every stride one after another, in the order
// given by strides; each stride owns (in_pad.w / stride) * (in_pad.h / stride) rows.
// Accepted candidates are appended to objects.
static void generate_proposals(const ncnn::Mat& pred, const std::vector<int>& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    // first row of pred that belongs to the current stride
    int row_begin = 0;

    for (std::vector<int>::const_iterator it = strides.begin(); it != strides.end(); ++it)
    {
        const int stride = *it;

        const int cells_x = in_pad.w / stride;
        const int cells_y = in_pad.h / stride;
        const int cells = cells_x * cells_y;

        const ncnn::Mat level_pred = pred.row_range(row_begin, cells);
        generate_proposals(level_pred, stride, in_pad, prob_threshold, objects);

        row_begin += cells;
    }
}

// Run YOLO11 object detection on an image.
//
// bgr      input image; its data pointer, cols and rows are handed to ncnn as BGR pixels
// objects  output; replaced by the detections that survive NMS, ordered by
//          descending prob, with boxes in coordinates of the original image
//
// Returns 0 on success. Returns -1 (with objects left empty) when the image is
// empty, the model files cannot be loaded or inference fails.
static int detect_yolo11(const cv::Mat& bgr, std::vector<Object>& objects)
{
    objects.clear();

    if (bgr.empty())
    {
        fprintf(stderr, "detect_yolo11: empty input image\n");
        return -1;
    }

    ncnn::Net yolo11;

    yolo11.opt.use_vulkan_compute = true;
    // yolo11.opt.use_bf16_storage = true;

    // https://github.com/nihui/ncnn-android-yolo11/tree/master/app/src/main/assets
    // a non-zero result means the file could not be loaded
    if (yolo11.load_param("yolo11n.ncnn.param"))
    {
        fprintf(stderr, "detect_yolo11: load_param yolo11n.ncnn.param failed\n");
        return -1;
    }
    if (yolo11.load_model("yolo11n.ncnn.bin"))
    {
        fprintf(stderr, "detect_yolo11: load_model yolo11n.ncnn.bin failed\n");
        return -1;
    }
    // yolo11.load_param("yolo11s.ncnn.param");
    // yolo11.load_model("yolo11s.ncnn.bin");
    // yolo11.load_param("yolo11m.ncnn.param");
    // yolo11.load_model("yolo11m.ncnn.bin");

    const int target_size = 640;
    const float prob_threshold = 0.25f;
    const float nms_threshold = 0.45f;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // strides of the detection scales; pred rows are laid out in this order
    std::vector<int> strides(3);
    strides[0] = 8;
    strides[1] = 16;
    strides[2] = 32;
    const int max_stride = 32;

    // scale the longer side to target_size, keeping the aspect ratio
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // letterbox: pad both sides up to the next multiple of max_stride,
    // splitting the padding evenly between the two borders
    int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
    int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    // no mean subtraction, scale pixel values by 1/255
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolo11.create_extractor();

    if (ex.input("in0", in_pad))
    {
        fprintf(stderr, "detect_yolo11: setting input in0 failed\n");
        return -1;
    }

    ncnn::Mat out;
    if (ex.extract("out0", out) || out.empty())
    {
        fprintf(stderr, "detect_yolo11: extracting out0 failed\n");
        return -1;
    }

    std::vector<Object> proposals;
    generate_proposals(out, strides, in_pad, prob_threshold, proposals);

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    int count = picked.size();

    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];

        // undo the letterbox: remove the left/top padding, then undo the resize
        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;

        // clip to the original image
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);

        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;
    }

    return 0;
}

// Render the detections on a copy of the image and show it in a window named
// "image", blocking until a key is pressed. One line per detection is logged
// to stderr. The input image itself is not modified.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    // names indexed by Object::label
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };

    // palette, cycled through by detection index
    static cv::Scalar colors[] = {
        cv::Scalar(244, 67, 54),
        cv::Scalar(233, 30, 99),
        cv::Scalar(156, 39, 176),
        cv::Scalar(103, 58, 183),
        cv::Scalar(63, 81, 181),
        cv::Scalar(33, 150, 243),
        cv::Scalar(3, 169, 244),
        cv::Scalar(0, 188, 212),
        cv::Scalar(0, 150, 136),
        cv::Scalar(76, 175, 80),
        cv::Scalar(139, 195, 74),
        cv::Scalar(205, 220, 57),
        cv::Scalar(255, 235, 59),
        cv::Scalar(255, 193, 7),
        cv::Scalar(255, 152, 0),
        cv::Scalar(255, 87, 34),
        cv::Scalar(121, 85, 72),
        cv::Scalar(158, 158, 158),
        cv::Scalar(96, 125, 139)
    };
    const size_t num_colors = sizeof(colors) / sizeof(colors[0]);

    cv::Mat canvas = bgr.clone();

    const size_t count = objects.size();
    for (size_t idx = 0; idx < count; ++idx)
    {
        const Object& det = objects[idx];
        const cv::Scalar& box_color = colors[idx % num_colors];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", det.label, det.prob,
                det.rect.x, det.rect.y, det.rect.width, det.rect.height);

        // bounding box outline
        cv::rectangle(canvas, det.rect, box_color);

        // caption: class name and score as a percentage
        char caption[256];
        sprintf(caption, "%s %.1f%%", class_names[det.label], det.prob * 100);

        int baseline = 0;
        const cv::Size caption_size = cv::getTextSize(caption, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseline);

        // caption sits above the box, clamped to the top and right image borders
        int left = det.rect.x;
        int top = det.rect.y - caption_size.height - baseline;
        if (top < 0)
            top = 0;
        if (left + caption_size.width > canvas.cols)
            left = canvas.cols - caption_size.width;

        // white filled background, then black text on top of it
        const cv::Rect caption_bg(cv::Point(left, top), cv::Size(caption_size.width, caption_size.height + baseline));
        cv::rectangle(canvas, caption_bg, cv::Scalar(255, 255, 255), -1);

        cv::putText(canvas, caption, cv::Point(left, top + caption_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", canvas);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_yolo11(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/yolo11_cls.cpp
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 1. install
//      pip3 install -U ultralytics pnnx ncnn
// 2. export yolo11-cls torchscript
//      yolo export model=yolo11n-cls.pt format=torchscript
// 3. convert torchscript with static shape
//      pnnx yolo11n-cls.torchscript
// 4. now you get ncnn model files
//      yolo11n_cls.ncnn.param
//      yolo11n_cls.ncnn.bin

#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>

// One classification result.
struct Object
{
    int label;  // class index (position of the score in the output blob)
    float prob; // score of that class as read from the output blob
};

// Select the topk highest scores.
//
// cls_scores  blob with one score per class; cls_scores.w entries are read
// topk        number of results wanted; clamped to [0, cls_scores.w] so that a
//             short or empty blob cannot cause out-of-range access
// objects     output; resized to the (clamped) topk and filled with
//             (label = class index, prob = score) in descending score order
static void get_topk(const ncnn::Mat& cls_scores, int topk, std::vector<Object>& objects)
{
    const int size = cls_scores.w;

    // never ask for more entries than there are scores
    if (topk > size)
        topk = size;
    if (topk < 0)
        topk = 0;

    // partial sort topk with index
    std::vector<std::pair<float, int> > vec;
    vec.resize(size);
    for (int i = 0; i < size; i++)
    {
        vec[i] = std::make_pair(cls_scores[i], i);
    }

    // pairs compare by score first, then by index
    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
                      std::greater<std::pair<float, int> >());

    objects.resize(topk);
    for (int i = 0; i < topk; i++)
    {
        objects[i].label = vec[i].second;
        objects[i].prob = vec[i].first;
    }
}

// Run YOLO11 image classification on an image.
//
// bgr      input image; its data pointer, cols and rows are handed to ncnn as BGR pixels
// objects  output; replaced by up to 5 (label, prob) results in descending score order
//
// Returns 0 on success. Returns -1 (with objects left empty) when the image is
// empty, the model files cannot be loaded or inference fails.
static int detect_yolo11_cls(const cv::Mat& bgr, std::vector<Object>& objects)
{
    objects.clear();

    if (bgr.empty())
    {
        fprintf(stderr, "detect_yolo11_cls: empty input image\n");
        return -1;
    }

    ncnn::Net yolo11;

    yolo11.opt.use_vulkan_compute = true;
    // yolo11.opt.use_bf16_storage = true;

    // https://github.com/nihui/ncnn-android-yolo11/tree/master/app/src/main/assets
    // a non-zero result means the file could not be loaded
    if (yolo11.load_param("yolo11n_cls.ncnn.param"))
    {
        fprintf(stderr, "detect_yolo11_cls: load_param yolo11n_cls.ncnn.param failed\n");
        return -1;
    }
    if (yolo11.load_model("yolo11n_cls.ncnn.bin"))
    {
        fprintf(stderr, "detect_yolo11_cls: load_model yolo11n_cls.ncnn.bin failed\n");
        return -1;
    }
    // yolo11.load_param("yolo11s_cls.ncnn.param");
    // yolo11.load_model("yolo11s_cls.ncnn.bin");
    // yolo11.load_param("yolo11m_cls.ncnn.param");
    // yolo11.load_model("yolo11m_cls.ncnn.bin");

    const int target_size = 224;
    const int topk = 5;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // scale the longer side to target_size, keeping the aspect ratio
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // letterbox pad to a target_size x target_size square,
    // splitting the padding evenly between the two borders
    int wpad = target_size - w;
    int hpad = target_size - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    // no mean subtraction, scale pixel values by 1/255
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolo11.create_extractor();

    if (ex.input("in0", in_pad))
    {
        fprintf(stderr, "detect_yolo11_cls: setting input in0 failed\n");
        return -1;
    }

    ncnn::Mat out;
    if (ex.extract("out0", out) || out.empty())
    {
        fprintf(stderr, "detect_yolo11_cls: extracting out0 failed\n");
        return -1;
    }

    // return top-5, or fewer when the model has fewer than 5 classes
    get_topk(out, topk < out.w ? topk : out.w, objects);

    return 0;
}

// Print the classification results to stderr and show them as a stacked list
// of text labels in the top-left corner of a copy of the image.
//
//   bgr      image the labels are drawn on (a clone is modified, bgr is untouched)
//   objects  classification results; label is used as an index into the class name table
//
// Blocks in cv::waitKey(0) until a key is pressed.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "tench", "goldfish", "great white shark", "tiger shark", "hammerhead", "electric ray", "stingray", "cock",
        "hen", "ostrich", "brambling", "goldfinch", "house finch", "junco", "indigo bunting", "robin", "bulbul",
        "jay", "magpie", "chickadee", "water ouzel", "kite", "bald eagle", "vulture", "great grey owl",
        "European fire salamander", "common newt", "eft", "spotted salamander", "axolotl", "bullfrog", "tree frog",
        "tailed frog", "loggerhead", "leatherback turtle", "mud turtle", "terrapin", "box turtle", "banded gecko",
        "common iguana", "American chameleon", "whiptail", "agama", "frilled lizard", "alligator lizard",
        "Gila monster", "green lizard", "African chameleon", "Komodo dragon", "African crocodile",
        "American alligator", "triceratops", "thunder snake", "ringneck snake", "hognose snake", "green snake",
        "king snake", "garter snake", "water snake", "vine snake", "night snake", "boa constrictor", "rock python",
        "Indian cobra", "green mamba", "sea snake", "horned viper", "diamondback", "sidewinder", "trilobite",
        "harvestman", "scorpion", "black and gold garden spider", "barn spider", "garden spider", "black widow",
        "tarantula", "wolf spider", "tick", "centipede", "black grouse", "ptarmigan", "ruffed grouse",
        "prairie chicken", "peacock", "quail", "partridge", "African grey", "macaw", "sulphur-crested cockatoo",
        "lorikeet", "coucal", "bee eater", "hornbill", "hummingbird", "jacamar", "toucan", "drake",
        "red-breasted merganser", "goose", "black swan", "tusker", "echidna", "platypus", "wallaby", "koala",
        "wombat", "jellyfish", "sea anemone", "brain coral", "flatworm", "nematode", "conch", "snail", "slug",
        "sea slug", "chiton", "chambered nautilus", "Dungeness crab", "rock crab", "fiddler crab", "king crab",
        "American lobster", "spiny lobster", "crayfish", "hermit crab", "isopod", "white stork", "black stork",
        "spoonbill", "flamingo", "little blue heron", "American egret", "bittern", "crane (bird)", "limpkin",
        "European gallinule", "American coot", "bustard", "ruddy turnstone", "red-backed sandpiper", "redshank",
        "dowitcher", "oystercatcher", "pelican", "king penguin", "albatross", "grey whale", "killer whale",
        "dugong", "sea lion", "Chihuahua", "Japanese spaniel", "Maltese dog", "Pekinese", "Shih-Tzu",
        "Blenheim spaniel", "papillon", "toy terrier", "Rhodesian ridgeback", "Afghan hound", "basset", "beagle",
        "bloodhound", "bluetick", "black-and-tan coonhound", "Walker hound", "English foxhound", "redbone",
        "borzoi", "Irish wolfhound", "Italian greyhound", "whippet", "Ibizan hound", "Norwegian elkhound",
        "otterhound", "Saluki", "Scottish deerhound", "Weimaraner", "Staffordshire bullterrier",
        "American Staffordshire terrier", "Bedlington terrier", "Border terrier", "Kerry blue terrier",
        "Irish terrier", "Norfolk terrier", "Norwich terrier", "Yorkshire terrier", "wire-haired fox terrier",
        "Lakeland terrier", "Sealyham terrier", "Airedale", "cairn", "Australian terrier", "Dandie Dinmont",
        "Boston bull", "miniature schnauzer", "giant schnauzer", "standard schnauzer", "Scotch terrier",
        "Tibetan terrier", "silky terrier", "soft-coated wheaten terrier", "West Highland white terrier",
        "Lhasa", "flat-coated retriever", "curly-coated retriever", "golden retriever", "Labrador retriever",
        "Chesapeake Bay retriever", "German short-haired pointer", "vizsla", "English setter", "Irish setter",
        "Gordon setter", "Brittany spaniel", "clumber", "English springer", "Welsh springer spaniel",
        "cocker spaniel", "Sussex spaniel", "Irish water spaniel", "kuvasz", "schipperke", "groenendael",
        "malinois", "briard", "kelpie", "komondor", "Old English sheepdog", "Shetland sheepdog", "collie",
        "Border collie", "Bouvier des Flandres", "Rottweiler", "German shepherd", "Doberman",
        "miniature pinscher", "Greater Swiss Mountain dog", "Bernese mountain dog", "Appenzeller", "EntleBucher",
        "boxer", "bull mastiff", "Tibetan mastiff", "French bulldog", "Great Dane", "Saint Bernard",
        "Eskimo dog", "malamute", "Siberian husky", "dalmatian", "affenpinscher", "basenji", "pug", "Leonberg",
        "Newfoundland", "Great Pyrenees", "Samoyed", "Pomeranian", "chow", "keeshond", "Brabancon griffon",
        "Pembroke", "Cardigan", "toy poodle", "miniature poodle", "standard poodle", "Mexican hairless",
        "timber wolf", "white wolf", "red wolf", "coyote", "dingo", "dhole", "African hunting dog", "hyena",
        "red fox", "kit fox", "Arctic fox", "grey fox", "tabby", "tiger cat", "Persian cat", "Siamese cat",
        "Egyptian cat", "cougar", "lynx", "leopard", "snow leopard", "jaguar", "lion", "tiger", "cheetah",
        "brown bear", "American black bear", "ice bear", "sloth bear", "mongoose", "meerkat", "tiger beetle",
        "ladybug", "ground beetle", "long-horned beetle", "leaf beetle", "dung beetle", "rhinoceros beetle",
        "weevil", "fly", "bee", "ant", "grasshopper", "cricket", "walking stick", "cockroach", "mantis",
        "cicada", "leafhopper", "lacewing", "dragonfly", "damselfly", "admiral", "ringlet", "monarch",
        "cabbage butterfly", "sulphur butterfly", "lycaenid", "starfish", "sea urchin", "sea cucumber",
        "wood rabbit", "hare", "Angora", "hamster", "porcupine", "fox squirrel", "marmot", "beaver",
        "guinea pig", "sorrel", "zebra", "hog", "wild boar", "warthog", "hippopotamus", "ox", "water buffalo",
        "bison", "ram", "bighorn", "ibex", "hartebeest", "impala", "gazelle", "Arabian camel", "llama",
        "weasel", "mink", "polecat", "black-footed ferret", "otter", "skunk", "badger", "armadillo",
        "three-toed sloth", "orangutan", "gorilla", "chimpanzee", "gibbon", "siamang", "guenon", "patas",
        "baboon", "macaque", "langur", "colobus", "proboscis monkey", "marmoset", "capuchin", "howler monkey",
        "titi", "spider monkey", "squirrel monkey", "Madagascar cat", "indri", "Indian elephant",
        "African elephant", "lesser panda", "giant panda", "barracouta", "eel", "coho", "rock beauty",
        "anemone fish", "sturgeon", "gar", "lionfish", "puffer", "abacus", "abaya", "academic gown",
        "accordion", "acoustic guitar", "aircraft carrier", "airliner", "airship", "altar", "ambulance",
        "amphibian", "analog clock", "apiary", "apron", "ashcan", "assault rifle", "backpack", "bakery",
        "balance beam", "balloon", "ballpoint", "Band Aid", "banjo", "bannister", "barbell", "barber chair",
        "barbershop", "barn", "barometer", "barrel", "barrow", "baseball", "basketball", "bassinet", "bassoon",
        "bathing cap", "bath towel", "bathtub", "beach wagon", "beacon", "beaker", "bearskin", "beer bottle",
        "beer glass", "bell cote", "bib", "bicycle-built-for-two", "bikini", "binder", "binoculars",
        "birdhouse", "boathouse", "bobsled", "bolo tie", "bonnet", "bookcase", "bookshop", "bottlecap", "bow",
        "bow tie", "brass", "brassiere", "breakwater", "breastplate", "broom", "bucket", "buckle",
        "bulletproof vest", "bullet train", "butcher shop", "cab", "caldron", "candle", "cannon", "canoe",
        "can opener", "cardigan", "car mirror", "carousel", "carpenter's kit", "carton", "car wheel",
        "cash machine", "cassette", "cassette player", "castle", "catamaran", "CD player", "cello",
        "cellular telephone", "chain", "chainlink fence", "chain mail", "chain saw", "chest", "chiffonier",
        "chime", "china cabinet", "Christmas stocking", "church", "cinema", "cleaver", "cliff dwelling",
        "cloak", "clog", "cocktail shaker", "coffee mug", "coffeepot", "coil", "combination lock",
        "computer keyboard", "confectionery", "container ship", "convertible", "corkscrew", "cornet",
        "cowboy boot", "cowboy hat", "cradle", "crane (machine)", "crash helmet", "crate", "crib",
        "Crock Pot", "croquet ball", "crutch", "cuirass", "dam", "desk", "desktop computer", "dial telephone",
        "diaper", "digital clock", "digital watch", "dining table", "dishrag", "dishwasher", "disk brake",
        "dock", "dogsled", "dome", "doormat", "drilling platform", "drum", "drumstick", "dumbbell",
        "Dutch oven", "electric fan", "electric guitar", "electric locomotive", "entertainment center",
        "envelope", "espresso maker", "face powder", "feather boa", "file", "fireboat", "fire engine",
        "fire screen", "flagpole", "flute", "folding chair", "football helmet", "forklift", "fountain",
        "fountain pen", "four-poster", "freight car", "French horn", "frying pan", "fur coat", "garbage truck",
        "gasmask", "gas pump", "goblet", "go-kart", "golf ball", "golfcart", "gondola", "gong", "gown",
        "grand piano", "greenhouse", "grille", "grocery store", "guillotine", "hair slide", "hair spray",
        "half track", "hammer", "hamper", "hand blower", "hand-held computer", "handkerchief", "hard disc",
        "harmonica", "harp", "harvester", "hatchet", "holster", "home theater", "honeycomb", "hook",
        "hoopskirt", "horizontal bar", "horse cart", "hourglass", "iPod", "iron", "jack-o'-lantern", "jean",
        "jeep", "jersey", "jigsaw puzzle", "jinrikisha", "joystick", "kimono", "knee pad", "knot", "lab coat",
        "ladle", "lampshade", "laptop", "lawn mower", "lens cap", "letter opener", "library", "lifeboat",
        "lighter", "limousine", "liner", "lipstick", "Loafer", "lotion", "loudspeaker", "loupe", "lumbermill",
        "magnetic compass", "mailbag", "mailbox", "maillot (tights)", "maillot (tank suit)", "manhole cover",
        "maraca", "marimba", "mask", "matchstick", "maypole", "maze", "measuring cup", "medicine chest",
        "megalith", "microphone", "microwave", "military uniform", "milk can", "minibus", "miniskirt",
        "minivan", "missile", "mitten", "mixing bowl", "mobile home", "Model T", "modem", "monastery",
        "monitor", "moped", "mortar", "mortarboard", "mosque", "mosquito net", "motor scooter", "mountain bike",
        "mountain tent", "mouse", "mousetrap", "moving van", "muzzle", "nail", "neck brace", "necklace",
        "nipple", "notebook", "obelisk", "oboe", "ocarina", "odometer", "oil filter", "organ", "oscilloscope",
        "overskirt", "oxcart", "oxygen mask", "packet", "paddle", "paddlewheel", "padlock", "paintbrush",
        "pajama", "palace", "panpipe", "paper towel", "parachute", "parallel bars", "park bench",
        "parking meter", "passenger car", "patio", "pay-phone", "pedestal", "pencil box", "pencil sharpener",
        "perfume", "Petri dish", "photocopier", "pick", "pickelhaube", "picket fence", "pickup", "pier",
        "piggy bank", "pill bottle", "pillow", "ping-pong ball", "pinwheel", "pirate", "pitcher", "plane",
        "planetarium", "plastic bag", "plate rack", "plow", "plunger", "Polaroid camera", "pole",
        "police van", "poncho", "pool table", "pop bottle", "pot", "potter's wheel", "power drill",
        "prayer rug", "printer", "prison", "projectile", "projector", "puck", "punching bag", "purse",
        "quill", "quilt", "racer", "racket", "radiator", "radio", "radio telescope", "rain barrel",
        "recreational vehicle", "reel", "reflex camera", "refrigerator", "remote control", "restaurant",
        "revolver", "rifle", "rocking chair", "rotisserie", "rubber eraser", "rugby ball", "rule",
        "running shoe", "safe", "safety pin", "saltshaker", "sandal", "sarong", "sax", "scabbard", "scale",
        "school bus", "schooner", "scoreboard", "screen", "screw", "screwdriver", "seat belt", "sewing machine",
        "shield", "shoe shop", "shoji", "shopping basket", "shopping cart", "shovel", "shower cap",
        "shower curtain", "ski", "ski mask", "sleeping bag", "slide rule", "sliding door", "slot", "snorkel",
        "snowmobile", "snowplow", "soap dispenser", "soccer ball", "sock", "solar dish", "sombrero",
        "soup bowl", "space bar", "space heater", "space shuttle", "spatula", "speedboat", "spider web",
        "spindle", "sports car", "spotlight", "stage", "steam locomotive", "steel arch bridge", "steel drum",
        "stethoscope", "stole", "stone wall", "stopwatch", "stove", "strainer", "streetcar", "stretcher",
        "studio couch", "stupa", "submarine", "suit", "sundial", "sunglass", "sunglasses", "sunscreen",
        "suspension bridge", "swab", "sweatshirt", "swimming trunks", "swing", "switch", "syringe",
        "table lamp", "tank", "tape player", "teapot", "teddy", "television", "tennis ball", "thatch",
        "theater curtain", "thimble", "thresher", "throne", "tile roof", "toaster", "tobacco shop",
        "toilet seat", "torch", "totem pole", "tow truck", "toyshop", "tractor", "trailer truck", "tray",
        "trench coat", "tricycle", "trimaran", "tripod", "triumphal arch", "trolleybus", "trombone", "tub",
        "turnstile", "typewriter keyboard", "umbrella", "unicycle", "upright", "vacuum", "vase", "vault",
        "velvet", "vending machine", "vestment", "viaduct", "violin", "volleyball", "waffle iron", "wall clock",
        "wallet", "wardrobe", "warplane", "washbasin", "washer", "water bottle", "water jug", "water tower",
        "whiskey jug", "whistle", "wig", "window screen", "window shade", "Windsor tie", "wine bottle", "wing",
        "wok", "wooden spoon", "wool", "worm fence", "wreck", "yawl", "yurt", "web site", "comic book",
        "crossword puzzle", "street sign", "traffic light", "book jacket", "menu", "plate", "guacamole",
        "consomme", "hot pot", "trifle", "ice cream", "ice lolly", "French loaf", "bagel", "pretzel",
        "cheeseburger", "hotdog", "mashed potato", "head cabbage", "broccoli", "cauliflower", "zucchini",
        "spaghetti squash", "acorn squash", "butternut squash", "cucumber", "artichoke", "bell pepper",
        "cardoon", "mushroom", "Granny Smith", "strawberry", "orange", "lemon", "fig", "pineapple", "banana",
        "jackfruit", "custard apple", "pomegranate", "hay", "carbonara", "chocolate sauce", "dough",
        "meat loaf", "pizza", "potpie", "burrito", "red wine", "espresso", "cup", "eggnog", "alp", "bubble",
        "cliff", "coral reef", "geyser", "lakeside", "promontory", "sandbar", "seashore", "valley", "volcano",
        "ballplayer", "groom", "scuba diver", "rapeseed", "daisy", "yellow lady's slipper", "corn", "acorn",
        "hip", "buckeye", "coral fungus", "agaric", "gyromitra", "stinkhorn", "earthstar", "hen-of-the-woods",
        "bolete", "ear", "toilet tissue"
    };

    // number of entries in the table above, used to validate labels before indexing
    const int num_class_names = (int)(sizeof(class_names) / sizeof(class_names[0]));

    cv::Mat image = bgr.clone();

    int y_offset = 0;
    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        fprintf(stderr, "%d = %.5f\n", obj.label, obj.prob);

        // a model with a different class count must not read past the name table
        const char* class_name = "unknown";
        if (obj.label >= 0 && obj.label < num_class_names)
            class_name = class_names[obj.label];

        char text[256];
        sprintf(text, "%4.1f%% %s", obj.prob * 100, class_name);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        int x = 0;
        int y = y_offset;

        // white background box covering the text including the part below the baseline
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));

        // advance by the full box height, otherwise the next background box
        // is painted over the descenders of this line
        y_offset += label_size.height + baseLine;
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

// Entry point: classify the image given on the command line and display the result.
// Returns 0 on success and -1 on bad usage, unreadable image or failed detection.
int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    // flag 1 asks OpenCV for a 3-channel color image
    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    // do not draw anything when detection reports an error
    if (detect_yolo11_cls(m, objects) != 0)
    {
        fprintf(stderr, "detect_yolo11_cls %s failed\n", imagepath);
        return -1;
    }

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/yolo11_obb.cpp
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 1. install
//      pip3 install -U ultralytics pnnx ncnn
// 2. export yolo11-obb torchscript
//      yolo export model=yolo11n-obb.pt format=torchscript
// 3. convert torchscript with static shape
//      pnnx yolo11n-obb.torchscript
// 4. modify yolo11n_obb_pnnx.py for dynamic shape inference
//      A. modify reshape to support dynamic image sizes
//      B. permute tensor before concat and adjust concat axis
//      C. drop post-process part
//      before:
//          v_195 = v_194.view(1, 1, 16384)
//          v_201 = v_200.view(1, 1, 4096)
//          v_207 = v_206.view(1, 1, 1024)
//          v_208 = torch.cat((v_195, v_201, v_207), dim=2)
//          ...
//          v_256 = v_225.view(1, 79, 16384)
//          v_257 = v_240.view(1, 79, 4096)
//          v_258 = v_255.view(1, 79, 1024)
//          v_259 = torch.cat((v_256, v_257, v_258), dim=2)
//          ...
//      after:
//          v_195 = v_194.view(1, 1, -1).transpose(1, 2)
//          v_201 = v_200.view(1, 1, -1).transpose(1, 2)
//          v_207 = v_206.view(1, 1, -1).transpose(1, 2)
//          v_208 = torch.cat((v_195, v_201, v_207), dim=1)
//          ...
//          v_256 = v_225.view(1, 79, -1).transpose(1, 2)
//          v_257 = v_240.view(1, 79, -1).transpose(1, 2)
//          v_258 = v_255.view(1, 79, -1).transpose(1, 2)
//          v_259 = torch.cat((v_256, v_257, v_258), dim=1)
//          return v_259, v_208
//      D. modify area attention for dynamic shape inference
//      before:
//          v_95 = self.model_10_m_0_attn_qkv_conv(v_94)
//          v_96 = v_95.view(1, 2, 128, 1024)
//          v_97, v_98, v_99 = torch.split(tensor=v_96, dim=2, split_size_or_sections=(32,32,64))
//          v_100 = torch.transpose(input=v_97, dim0=-2, dim1=-1)
//          v_101 = torch.matmul(input=v_100, other=v_98)
//          v_102 = (v_101 * 0.176777)
//          v_103 = F.softmax(input=v_102, dim=-1)
//          v_104 = torch.transpose(input=v_103, dim0=-2, dim1=-1)
//          v_105 = torch.matmul(input=v_99, other=v_104)
//          v_106 = v_105.view(1, 128, 32, 32)
//          v_107 = v_99.reshape(1, 128, 32, 32)
//          v_108 = self.model_10_m_0_attn_pe_conv(v_107)
//          v_109 = (v_106 + v_108)
//          v_110 = self.model_10_m_0_attn_proj_conv(v_109)
//      after:
//          v_95 = self.model_10_m_0_attn_qkv_conv(v_94)
//          v_96 = v_95.view(1, 2, 128, -1)
//          v_97, v_98, v_99 = torch.split(tensor=v_96, dim=2, split_size_or_sections=(32,32,64))
//          v_100 = torch.transpose(input=v_97, dim0=-2, dim1=-1)
//          v_101 = torch.matmul(input=v_100, other=v_98)
//          v_102 = (v_101 * 0.176777)
//          v_103 = F.softmax(input=v_102, dim=-1)
//          v_104 = torch.transpose(input=v_103, dim0=-2, dim1=-1)
//          v_105 = torch.matmul(input=v_99, other=v_104)
//          v_106 = v_105.view(1, 128, v_95.size(2), v_95.size(3))
//          v_107 = v_99.reshape(1, 128, v_95.size(2), v_95.size(3))
//          v_108 = self.model_10_m_0_attn_pe_conv(v_107)
//          v_109 = (v_106 + v_108)
//          v_110 = self.model_10_m_0_attn_proj_conv(v_109)
// 5. re-export yolo11-obb torchscript
//      python3 -c 'import yolo11n_obb_pnnx; yolo11n_obb_pnnx.export_torchscript()'
// 6. convert new torchscript with dynamic shape
//      pnnx yolo11n_obb_pnnx.py.pt inputshape=[1,3,1024,1024] inputshape2=[1,3,512,512]
// 7. now you get ncnn model files
//      mv yolo11n_obb_pnnx.py.ncnn.param yolo11n_obb.ncnn.param
//      mv yolo11n_obb_pnnx.py.ncnn.bin yolo11n_obb.ncnn.bin

// the out0 blob would be a 2-dim tensor with w=79 h=21504 (for a 1024x1024 input)
//
//        | bbox-reg 16 x 4       |score(15)|
//        +-----+-----+-----+-----+---------+
//        | dx0 | dy0 | dx1 | dy1 | 0.1 ... |
//   all /|     |     |     |     |     ... |
//  boxes |  .. |  .. |  .. |  .. | 0.0 ... |
// (21504)|     |     |     |     |  .  ... |
//       \|     |     |     |     |  .  ... |
//        +-----+-----+-----+-----+---------+
//

// the out1 blob would be a 2-dim tensor with w=1 h=21504 (for a 1024x1024 input)
//
//        | angle(1) |
//        +----------+
//        |    0.1   |
//   all /|          |
//  boxes |    0.0   |
// (21504)|     .    |
//       \|     .    |
//        +----------+
//

#include "layer.h"
#include "net.h"

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

#include <float.h>
#include <math.h>
#include <stdio.h>
#include <vector>

// One oriented (rotated) detection.
struct Object
{
    // rotated box; generate_proposals() fills it in padded network-input
    // coordinates with the angle in degrees, detect_yolo11_obb() rescales
    // center and size back to the original image
    cv::RotatedRect rrect;
    // index of the best scoring class for this box
    int label;
    // sigmoid of the best class score
    float prob;
};

static inline float intersection_area(const Object& a, const Object& b)
{
    std::vector<cv::Point2f> intersection;
    cv::rotatedRectangleIntersection(a.rrect, b.rrect, intersection);
    if (intersection.empty())
        return 0.f;

    return cv::contourArea(intersection);
}

// Quicksort objects[left..right] (both bounds inclusive) in place so that
// prob is in descending order. The pivot is the prob of the middle element.
static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
{
    const float pivot = objects[(left + right) / 2].prob;

    int lo = left;
    int hi = right;

    while (lo <= hi)
    {
        // skip elements that already sit on the correct side of the pivot
        for (; objects[lo].prob > pivot; ++lo)
        {
        }
        for (; objects[hi].prob < pivot; --hi)
        {
        }

        if (lo > hi)
            break;

        std::swap(objects[lo], objects[hi]);
        ++lo;
        --hi;
    }

    // the two partitions are independent of each other
    if (left < hi)
        qsort_descent_inplace(objects, left, hi);

    if (lo < right)
        qsort_descent_inplace(objects, lo, right);
}

// Sort all objects in place by prob, highest first. An empty vector is left untouched.
static void qsort_descent_inplace(std::vector<Object>& objects)
{
    if (!objects.empty())
    {
        // inclusive index range of the whole vector
        qsort_descent_inplace(objects, 0, objects.size() - 1);
    }
}

// Greedy non-maximum suppression on rotated boxes.
//
//   objects        candidates, visited in index order (the caller sorts them by score first)
//   picked         receives the indices of the kept candidates; cleared on entry
//   nms_threshold  a candidate is dropped when its IoU with an already kept box exceeds this
//   agnostic       when false, only boxes with the same label can suppress each other
static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = objects.size();

    // box areas are needed for every pair, compute them once
    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = objects[i].rrect.size.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = objects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = objects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area;
            if (inter_area / union_area > nms_threshold)
            {
                keep = 0;
                // one overlapping kept box is enough, skip the remaining
                // rotated-rectangle intersections
                break;
            }
        }

        if (keep)
            picked.push_back(i);
    }
}

// Logistic function 1 / (1 + e^-x), evaluated in single precision.
static inline float sigmoid(float x)
{
    const float decay = expf(-x);
    return 1.0f / (decay + 1.0f);
}

// Decode the predictions of one detection level into rotated box proposals.
//
//   pred            one row per grid cell: 16 x 4 box distribution values followed by the class scores
//   pred_angle      one row per grid cell, first value is the raw angle prediction
//   stride          size of one grid cell in pixels of the padded input
//   in_pad          padded network input, only its width and height are used
//   prob_threshold  cells whose best class probability is below this are skipped
//   objects         proposals are appended here, in padded-input coordinates
static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_angle, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    const int w = in_pad.w;
    const int h = in_pad.h;

    const int num_grid_x = w / stride;
    const int num_grid_y = h / stride;

    const int reg_max_1 = 16;
    const int num_class = pred.w - reg_max_1 * 4; // number of classes. 15 for DOTAv1

    // The softmax layer does not depend on the grid cell: build it once here
    // instead of creating and destroying it for every accepted proposal.
    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_packing_layout = false;

    ncnn::Layer* softmax = ncnn::create_layer("Softmax");
    if (!softmax)
    {
        fprintf(stderr, "generate_proposals: create_layer Softmax failed\n");
        return;
    }

    {
        ncnn::ParamDict pd;
        pd.set(0, 1); // axis
        pd.set(1, 1);
        softmax->load_param(pd);
    }

    softmax->create_pipeline(opt);

    for (int y = 0; y < num_grid_y; y++)
    {
        for (int x = 0; x < num_grid_x; x++)
        {
            const int grid_index = y * num_grid_x + x;

            const ncnn::Mat pred_grid = pred.row_range(grid_index, 1);

            // find label with max score
            int label = -1;
            float score = -FLT_MAX;
            {
                const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class);

                for (int k = 0; k < num_class; k++)
                {
                    float s = pred_score[k];
                    if (s > score)
                    {
                        label = k;
                        score = s;
                    }
                }

                score = sigmoid(score);
            }

            if (score < prob_threshold)
                continue;

            // 4 rows (one per box side) of reg_max_1 values; cloned because softmax runs in place
            ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone();

            softmax->forward_inplace(pred_bbox, opt);

            // expected value of each softmax distribution, scaled to pixels
            float pred_ltrb[4];
            for (int k = 0; k < 4; k++)
            {
                float dis = 0.f;
                const float* dis_after_sm = pred_bbox.row(k);
                for (int l = 0; l < reg_max_1; l++)
                {
                    dis += l * dis_after_sm[l];
                }

                pred_ltrb[k] = dis * stride;
            }

            // center of the grid cell in padded-input pixels
            float pb_cx = (x + 0.5f) * stride;
            float pb_cy = (y + 0.5f) * stride;

            // sigmoid maps to (0, 1); shifted by -0.25 and expressed as a fraction of pi
            const float angle = sigmoid(pred_angle.row(grid_index)[0]) - 0.25f;

            const float angle_rad = angle * 3.14159265358979323846f;
            const float angle_degree = angle * 180.f;

            // named so that they do not shadow the math library functions
            const float cos_a = cosf(angle_rad);
            const float sin_a = sinf(angle_rad);

            // offset of the box center from the cell center, rotated by the box angle
            float xx = (pred_ltrb[2] - pred_ltrb[0]) * 0.5f;
            float yy = (pred_ltrb[3] - pred_ltrb[1]) * 0.5f;
            float xr = xx * cos_a - yy * sin_a;
            float yr = xx * sin_a + yy * cos_a;
            const float cx = pb_cx + xr;
            const float cy = pb_cy + yr;
            const float ww = pred_ltrb[2] + pred_ltrb[0];
            const float hh = pred_ltrb[3] + pred_ltrb[1];

            Object obj;
            obj.rrect = cv::RotatedRect(cv::Point2f(cx, cy), cv::Size_<float>(ww, hh), angle_degree);
            obj.label = label;
            obj.prob = score;

            objects.push_back(obj);
        }
    }

    softmax->destroy_pipeline(opt);

    delete softmax;
}

// Decode every detection level. The rows of pred / pred_angle are consumed
// level by level in the order of strides; each level owns
// (in_pad.w / stride) * (in_pad.h / stride) consecutive rows.
static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_angle, const std::vector<int>& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    const int pad_w = in_pad.w;
    const int pad_h = in_pad.h;

    int first_row = 0;
    for (std::vector<int>::const_iterator it = strides.begin(); it != strides.end(); ++it)
    {
        const int stride = *it;

        // number of grid cells, and therefore rows, belonging to this level
        const int cells = (pad_w / stride) * (pad_h / stride);

        const ncnn::Mat level_pred = pred.row_range(first_row, cells);
        const ncnn::Mat level_angle = pred_angle.row_range(first_row, cells);

        generate_proposals(level_pred, level_angle, stride, in_pad, prob_threshold, objects);

        first_row += cells;
    }
}

// Run the yolo11 oriented-bounding-box model on one image.
//
//   bgr      input image, interpreted as packed BGR pixels (converted to RGB on load)
//   objects  receives the detections that survive NMS, in original image
//            coordinates; emptied when nothing is detected
//
// Returns 0 on success and -1 when the input is empty, the model files cannot
// be loaded, or inference fails.
static int detect_yolo11_obb(const cv::Mat& bgr, std::vector<Object>& objects)
{
    if (bgr.empty())
    {
        fprintf(stderr, "detect_yolo11_obb: empty input image\n");
        return -1;
    }

    ncnn::Net yolo11;

    yolo11.opt.use_vulkan_compute = true;
    // yolo11.opt.use_bf16_storage = true;

    // https://github.com/nihui/ncnn-android-yolo11/tree/master/app/src/main/assets
    // load_param / load_model report failure through a non-zero return value
    if (yolo11.load_param("yolo11n_obb.ncnn.param") != 0 || yolo11.load_model("yolo11n_obb.ncnn.bin") != 0)
    {
        fprintf(stderr, "detect_yolo11_obb: loading yolo11n_obb.ncnn.param/.bin failed\n");
        return -1;
    }
    // yolo11.load_param("yolo11s_obb.ncnn.param");
    // yolo11.load_model("yolo11s_obb.ncnn.bin");
    // yolo11.load_param("yolo11m_obb.ncnn.param");
    // yolo11.load_model("yolo11m_obb.ncnn.bin");

    const int target_size = 1024;
    const float prob_threshold = 0.25f;
    const float nms_threshold = 0.45f;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // ultralytics/cfg/models/v8/yolo11.yaml
    std::vector<int> strides(3);
    strides[0] = 8;
    strides[1] = 16;
    strides[2] = 32;
    const int max_stride = 32;

    // scale the longer side to target_size, keep the aspect ratio
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    // a very elongated image can truncate the shorter side to 0, keep it resizable
    if (w < 1) w = 1;
    if (h < 1) h = 1;

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // letterbox pad each side up to the next multiple of max_stride,
    // padding split evenly on both sides
    int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
    int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    // scale pixel values by 1/255, no mean subtraction
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolo11.create_extractor();

    if (ex.input("in0", in_pad) != 0)
    {
        fprintf(stderr, "detect_yolo11_obb: setting input blob in0 failed\n");
        return -1;
    }

    // out0: box distributions and class scores, out1: angle prediction
    ncnn::Mat out;
    ncnn::Mat out_angle;
    if (ex.extract("out0", out) != 0 || ex.extract("out1", out_angle) != 0)
    {
        fprintf(stderr, "detect_yolo11_obb: extracting output blobs out0/out1 failed\n");
        return -1;
    }

    std::vector<Object> proposals;
    generate_proposals(out, out_angle, strides, in_pad, prob_threshold, proposals);

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    // resizing to zero also drops stale entries when nothing was detected
    const int count = picked.size();
    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        Object obj = proposals[picked[i]];

        // adjust offset to original unpadded
        obj.rrect.center.x = (obj.rrect.center.x - (wpad / 2)) / scale;
        obj.rrect.center.y = (obj.rrect.center.y - (hpad / 2)) / scale;
        obj.rrect.size.width = (obj.rrect.size.width) / scale;
        obj.rrect.size.height = (obj.rrect.size.height) / scale;

        objects[i] = obj;
    }

    return 0;
}

// Print the detections to stderr and show them on a copy of the image:
// first the outline of every rotated box, then a text label centered on each box.
//
//   bgr      image to draw on (a clone is modified, bgr is untouched)
//   objects  detections in image coordinates; label selects the class name and color
//
// Blocks in cv::waitKey(0) until a key is pressed.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "plane", "ship", "storage tank", "baseball diamond", "tennis court",
        "basketball court", "ground track field", "harbor", "bridge", "large vehicle",
        "small vehicle", "helicopter", "roundabout", "soccer ball field", "swimming pool"
    };

    static const cv::Scalar colors[] = {
        cv::Scalar(156, 39, 176),
        cv::Scalar(103, 58, 183),
        cv::Scalar(63, 81, 181),
        cv::Scalar(33, 150, 243),
        cv::Scalar(3, 169, 244),
        cv::Scalar(0, 188, 212),
        cv::Scalar(0, 150, 136),
        cv::Scalar(76, 175, 80),
        cv::Scalar(139, 195, 74),
        cv::Scalar(205, 220, 57),
        cv::Scalar(255, 235, 59),
        cv::Scalar(255, 193, 7),
        cv::Scalar(255, 152, 0),
        cv::Scalar(255, 87, 34),
        cv::Scalar(121, 85, 72),
        cv::Scalar(158, 158, 158),
        cv::Scalar(96, 125, 139)
    };

    // table sizes, used to keep lookups in range for models with other class counts
    const int num_class_names = (int)(sizeof(class_names) / sizeof(class_names[0]));
    const int num_colors = (int)(sizeof(colors) / sizeof(colors[0]));

    cv::Mat image = bgr.clone();

    // pass 1: box outlines
    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        // labels inside the table keep their own color, others wrap around
        const cv::Scalar& color = colors[obj.label >= 0 ? obj.label % num_colors : 0];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f  @ %.2f\n", obj.label, obj.prob,
                obj.rrect.center.x, obj.rrect.center.y, obj.rrect.size.width, obj.rrect.size.height, obj.rrect.angle);

        cv::Point2f corners[4];
        obj.rrect.points(corners);
        for (int k = 0; k < 4; k++)
        {
            cv::line(image, corners[k], corners[(k + 1) % 4], color);
        }
    }

    // pass 2: text labels, drawn last so that no box outline crosses them
    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        const char* class_name = "unknown";
        if (obj.label >= 0 && obj.label < num_class_names)
            class_name = class_names[obj.label];

        char text[256];
        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // center the label on the box, then clamp it into the image
        int x = obj.rrect.center.x - label_size.width / 2;
        int y = obj.rrect.center.y - label_size.height / 2 - baseLine;
        if (y < 0)
            y = 0;
        if (y + label_size.height > image.rows)
            y = image.rows - label_size.height;
        if (x < 0)
            x = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

// Entry point: load the image given on the command line, run the detector on it
// and display the annotated result.
int main(int argc, char** argv)
{
    // exactly one argument is expected: the path of the image to process
    if (argc == 2)
    {
        const char* const imagepath = argv[1];

        cv::Mat m = cv::imread(imagepath, 1);
        if (!m.empty())
        {
            // detect first, then draw the detections on top of the image
            std::vector<Object> objects;
            detect_yolo11_obb(m, objects);
            draw_objects(m, objects);

            return 0;
        }

        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
    return -1;
}


================================================
FILE: examples/yolo11_pose.cpp
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 1. install
//      pip3 install -U ultralytics pnnx ncnn
// 2. export yolo11-pose torchscript
//      yolo export model=yolo11n-pose.pt format=torchscript
// 3. convert torchscript with static shape
//      pnnx yolo11n-pose.torchscript
// 4. modify yolo11n_pose_pnnx.py for dynamic shape inference
//      A. modify reshape to support dynamic image sizes
//      B. permute tensor before concat and adjust concat axis
//      C. drop post-process part
//      before:
//          v_195 = v_194.view(1, 51, 6400)
//          v_201 = v_200.view(1, 51, 1600)
//          v_207 = v_206.view(1, 51, 400)
//          v_208 = torch.cat((v_195, v_201, v_207), dim=-1)
//          ...
//          v_254 = v_223.view(1, 65, 6400)
//          v_255 = v_238.view(1, 65, 1600)
//          v_256 = v_253.view(1, 65, 400)
//          v_257 = torch.cat((v_254, v_255, v_256), dim=2)
//          ...
//      after:
//          v_195 = v_194.view(1, 51, -1).transpose(1, 2)
//          v_201 = v_200.view(1, 51, -1).transpose(1, 2)
//          v_207 = v_206.view(1, 51, -1).transpose(1, 2)
//          v_208 = torch.cat((v_195, v_201, v_207), dim=1)
//          ...
//          v_254 = v_223.view(1, 65, -1).transpose(1, 2)
//          v_255 = v_238.view(1, 65, -1).transpose(1, 2)
//          v_256 = v_253.view(1, 65, -1).transpose(1, 2)
//          v_257 = torch.cat((v_254, v_255, v_256), dim=1)
//          return v_257, v_208
//      D. modify area attention for dynamic shape inference
//      before:
//          v_95 = self.model_10_m_0_attn_qkv_conv(v_94)
//          v_96 = v_95.view(1, 2, 128, 400)
//          v_97, v_98, v_99 = torch.split(tensor=v_96, dim=2, split_size_or_sections=(32,32,64))
//          v_100 = torch.transpose(input=v_97, dim0=-2, dim1=-1)
//          v_101 = torch.matmul(input=v_100, other=v_98)
//          v_102 = (v_101 * 0.176777)
//          v_103 = F.softmax(input=v_102, dim=-1)
//          v_104 = torch.transpose(input=v_103, dim0=-2, dim1=-1)
//          v_105 = torch.matmul(input=v_99, other=v_104)
//          v_106 = v_105.view(1, 128, 20, 20)
//          v_107 = v_99.reshape(1, 128, 20, 20)
//          v_108 = self.model_10_m_0_attn_pe_conv(v_107)
//          v_109 = (v_106 + v_108)
//          v_110 = self.model_10_m_0_attn_proj_conv(v_109)
//      after:
//          v_95 = self.model_10_m_0_attn_qkv_conv(v_94)
//          v_96 = v_95.view(1, 2, 128, -1)
//          v_97, v_98, v_99 = torch.split(tensor=v_96, dim=2, split_size_or_sections=(32,32,64))
//          v_100 = torch.transpose(input=v_97, dim0=-2, dim1=-1)
//          v_101 = torch.matmul(input=v_100, other=v_98)
//          v_102 = (v_101 * 0.176777)
//          v_103 = F.softmax(input=v_102, dim=-1)
//          v_104 = torch.transpose(input=v_103, dim0=-2, dim1=-1)
//          v_105 = torch.matmul(input=v_99, other=v_104)
//          v_106 = v_105.view(1, 128, v_95.size(2), v_95.size(3))
//          v_107 = v_99.reshape(1, 128, v_95.size(2), v_95.size(3))
//          v_108 = self.model_10_m_0_attn_pe_conv(v_107)
//          v_109 = (v_106 + v_108)
//          v_110 = self.model_10_m_0_attn_proj_conv(v_109)
// 5. re-export yolo11-pose torchscript
//      python3 -c 'import yolo11n_pose_pnnx; yolo11n_pose_pnnx.export_torchscript()'
// 6. convert new torchscript with dynamic shape
//      pnnx yolo11n_pose_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320]
// 7. now you get ncnn model files
//      mv yolo11n_pose_pnnx.py.ncnn.param yolo11n_pose.ncnn.param
//      mv yolo11n_pose_pnnx.py.ncnn.bin yolo11n_pose.ncnn.bin

// the out blob would be a 2-dim tensor with w=65 h=8400
//
//        | bbox-reg 16 x 4       |score(1)|
//        +-----+-----+-----+-----+--------+
//        | dx0 | dy0 | dx1 | dy1 |   0.1  |
//   all /|     |     |     |     |        |
//  boxes |  .. |  .. |  .. |  .. |   0.0  |
//  (8400)|     |     |     |     |   .    |
//       \|     |     |     |     |   .    |
//        +-----+-----+-----+-----+--------+
//

//
//        | pose (51) |
//        +-----------+
//        |0.1........|
//   all /|           |
//  boxes |0.0........|
//  (8400)|     .     |
//       \|     .     |
//        +-----------+
//

#include "layer.h"
#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>

// One predicted keypoint of a detected object.
struct KeyPoint
{
    // 2D position; generate_proposals() writes it in padded network-input pixel
    // coordinates and detect_yolo11_pose() later maps it back to the source image
    cv::Point2f p;
    // keypoint confidence, i.e. sigmoid of the raw per-keypoint score
    float prob;
};

// One detection: bounding box, class, confidence and the keypoints of the object.
struct Object
{
    // bounding box as x, y, width, height
    cv::Rect_<float> rect;
    // class index; generate_proposals() always stores 0 here
    int label;
    // detection confidence (sigmoid of the raw score); used for thresholding and sorting
    float prob;
    // keypoints predicted for this object, one entry per keypoint triplet of the model output
    std::vector<KeyPoint> keypoints;
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Sort objects[left..right] (both bounds inclusive) in place by descending prob.
static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
{
    // the pivot score comes from the element in the middle of the range
    const float pivot = objects[(left + right) / 2].prob;

    int lo = left;
    int hi = right;

    while (lo <= hi)
    {
        // skip entries that already sit on the correct side of the pivot
        for (; objects[lo].prob > pivot; ++lo)
        {
        }
        for (; objects[hi].prob < pivot; --hi)
        {
        }

        if (lo > hi)
            break;

        std::swap(objects[lo], objects[hi]);
        ++lo;
        --hi;
    }

    // the two remaining partitions do not overlap and are sorted independently
    if (left < hi)
        qsort_descent_inplace(objects, left, hi);

    if (lo < right)
        qsort_descent_inplace(objects, lo, right);
}

// Sort the whole vector in place by descending prob.
static void qsort_descent_inplace(std::vector<Object>& objects)
{
    // an empty vector has no valid [0, size - 1] range to hand to the range overload
    if (!objects.empty())
        qsort_descent_inplace(objects, 0, static_cast<int>(objects.size() - 1));
}

static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = objects.size();

    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = objects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = objects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = objects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
                keep = 0;
        }

        if (keep)
            picked.push_back(i);
    }
}

// Logistic function 1 / (1 + e^-x).
static inline float sigmoid(float x)
{
    const float e = expf(-x);
    return 1.0f / (1.0f + e);
}

// Decode the predictions of one stride level into candidate objects.
//
// pred            one row per grid cell (row index y * num_grid_x + x); every row holds
//                 reg_max_1 * 4 box distribution values followed by the raw object score
// pred_points     one row per grid cell; every row holds num_points triplets
//                 (x offset, y offset, raw keypoint score)
// stride          size of one grid cell in network-input pixels
// in_pad          padded network input; only its w and h are read, to derive the grid size
// prob_threshold  cells whose sigmoid score is below this value are skipped
// objects         output; candidates are appended, with boxes and keypoints expressed in
//                 in_pad pixel coordinates
static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_points, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    const int w = in_pad.w;
    const int h = in_pad.h;

    const int num_grid_x = w / stride;
    const int num_grid_y = h / stride;

    const int reg_max_1 = 16;
    const int num_points = pred_points.w / 3;

    // The softmax layer is the same for every grid cell, so it is built once per
    // call instead of being created and destroyed for each candidate.
    ncnn::Layer* softmax = ncnn::create_layer("Softmax");
    if (!softmax)
    {
        fprintf(stderr, "create_layer Softmax failed\n");
        return;
    }

    {
        ncnn::ParamDict pd;
        pd.set(0, 1); // axis
        pd.set(1, 1);
        softmax->load_param(pd);
    }

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_packing_layout = false;

    softmax->create_pipeline(opt);

    for (int y = 0; y < num_grid_y; y++)
    {
        for (int x = 0; x < num_grid_x; x++)
        {
            const int grid_index = y * num_grid_x + x;

            const ncnn::Mat pred_grid = pred.row_range(grid_index, 1);

            // there is a single score per cell, stored right after the box distribution
            const int label = 0;
            const float score = sigmoid(pred_grid[reg_max_1 * 4]);

            if (score < prob_threshold)
                continue;

            // work on a private (w = reg_max_1, h = 4) copy: one row per box side
            ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone();

            softmax->forward_inplace(pred_bbox, opt);

            // distance of each box side = sum of bin index * softmax weight, scaled by stride
            float pred_ltrb[4];
            for (int k = 0; k < 4; k++)
            {
                float dis = 0.f;
                const float* dis_after_sm = pred_bbox.row(k);
                for (int l = 0; l < reg_max_1; l++)
                {
                    dis += l * dis_after_sm[l];
                }

                pred_ltrb[k] = dis * stride;
            }

            // the distances are measured from the center of the grid cell
            const float pb_cx = (x + 0.5f) * stride;
            const float pb_cy = (y + 0.5f) * stride;

            const float x0 = pb_cx - pred_ltrb[0];
            const float y0 = pb_cy - pred_ltrb[1];
            const float x1 = pb_cx + pred_ltrb[2];
            const float y1 = pb_cy + pred_ltrb[3];

            Object obj;
            obj.rect.x = x0;
            obj.rect.y = y0;
            obj.rect.width = x1 - x0;
            obj.rect.height = y1 - y0;
            obj.label = label;
            obj.prob = score;

            // keypoints are only decoded for cells that passed the score threshold
            const ncnn::Mat pred_points_grid = pred_points.row_range(grid_index, 1).reshape(3, num_points);

            obj.keypoints.reserve(num_points);
            for (int k = 0; k < num_points; k++)
            {
                const float* point = pred_points_grid.row(k);

                KeyPoint keypoint;
                keypoint.p.x = (x + point[0] * 2) * stride;
                keypoint.p.y = (y + point[1] * 2) * stride;
                keypoint.prob = sigmoid(point[2]);
                obj.keypoints.push_back(keypoint);
            }

            objects.push_back(obj);
        }
    }

    softmax->destroy_pipeline(opt);

    delete softmax;
}

// Decode the predictions of all stride levels.
// The rows of pred and pred_points are consumed level by level, in the order of
// strides; every level owns (in_pad.w / stride) * (in_pad.h / stride) rows.
static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_points, const std::vector<int>& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    // first row of the level that is processed next
    int row_cursor = 0;

    for (std::vector<int>::const_iterator it = strides.begin(); it != strides.end(); ++it)
    {
        const int stride = *it;

        // number of grid cells, and therefore rows, of this level
        const int cells = (in_pad.w / stride) * (in_pad.h / stride);

        const ncnn::Mat level_pred = pred.row_range(row_cursor, cells);
        const ncnn::Mat level_points = pred_points.row_range(row_cursor, cells);

        generate_proposals(level_pred, level_points, stride, in_pad, prob_threshold, objects);

        row_cursor += cells;
    }
}

// Run yolo11 pose estimation on an image.
//
// bgr      input image, read as packed BGR pixels
// objects  output; cleared first, then filled with the detections that survive
//          thresholding and NMS, with boxes and keypoints in bgr pixel coordinates
//
// Returns 0 on success (also when nothing is detected) and -1 when the model
// cannot be loaded or inference fails.
static int detect_yolo11_pose(const cv::Mat& bgr, std::vector<Object>& objects)
{
    // never hand back stale results from a previous call
    objects.clear();

    ncnn::Net yolo11;

    yolo11.opt.use_vulkan_compute = true;
    // yolo11.opt.use_bf16_storage = true;

    // https://github.com/nihui/ncnn-android-yolo11/tree/master/app/src/main/assets
    if (yolo11.load_param("yolo11n_pose.ncnn.param") != 0 || yolo11.load_model("yolo11n_pose.ncnn.bin") != 0)
    {
        // without a model the output blobs would stay empty and must not be decoded
        fprintf(stderr, "load yolo11n_pose model failed\n");
        return -1;
    }
    // yolo11.load_param("yolo11s_pose.ncnn.param");
    // yolo11.load_model("yolo11s_pose.ncnn.bin");
    // yolo11.load_param("yolo11m_pose.ncnn.param");
    // yolo11.load_model("yolo11m_pose.ncnn.bin");

    const int target_size = 640;
    const float prob_threshold = 0.25f;
    const float nms_threshold = 0.45f;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // ultralytics/cfg/models/11/yolo11-pose.yaml
    std::vector<int> strides(3);
    strides[0] = 8;
    strides[1] = 16;
    strides[2] = 32;
    const int max_stride = 32;

    // resize so that the longer side becomes target_size, keeping the aspect ratio
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // letterbox pad both sides up to the next multiple of max_stride,
    // splitting the padding evenly between the two borders
    int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
    int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolo11.create_extractor();

    ex.input("in0", in_pad);

    ncnn::Mat out;
    if (ex.extract("out0", out) != 0)
    {
        fprintf(stderr, "extract out0 failed\n");
        return -1;
    }

    ncnn::Mat out_points;
    if (ex.extract("out1", out_points) != 0)
    {
        fprintf(stderr, "extract out1 failed\n");
        return -1;
    }

    std::vector<Object> proposals;
    generate_proposals(out, out_points, strides, in_pad, prob_threshold, proposals);

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    int count = picked.size();
    if (count == 0)
        return 0;

    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        Object& obj = objects[i];
        obj = proposals[picked[i]];

        // adjust offset to original unpadded
        float x0 = (obj.rect.x - (wpad / 2)) / scale;
        float y0 = (obj.rect.y - (hpad / 2)) / scale;
        float x1 = (obj.rect.x + obj.rect.width - (wpad / 2)) / scale;
        float y1 = (obj.rect.y + obj.rect.height - (hpad / 2)) / scale;

        // keypoints get the same mapping as the box, but are not clipped
        for (size_t j = 0; j < obj.keypoints.size(); j++)
        {
            obj.keypoints[j].p.x = (obj.keypoints[j].p.x - (wpad / 2)) / scale;
            obj.keypoints[j].p.y = (obj.keypoints[j].p.y - (hpad / 2)) / scale;
        }

        // clip
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);

        obj.rect.x = x0;
        obj.rect.y = y0;
        obj.rect.width = x1 - x0;
        obj.rect.height = y1 - y0;
    }

    return 0;
}

// Print every detection to stderr, draw skeleton, keypoints, box and label on a
// copy of the image, and show the result in a window until a key is pressed.
//
// bgr      image the detections refer to; it is not modified
// objects  detections to draw, in bgr pixel coordinates
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {"person"};

    static const cv::Scalar colors[] = {
        cv::Scalar(244, 67, 54),
        cv::Scalar(233, 30, 99),
        cv::Scalar(156, 39, 176),
        cv::Scalar(103, 58, 183),
        cv::Scalar(63, 81, 181),
        cv::Scalar(33, 150, 243),
        cv::Scalar(3, 169, 244),
        cv::Scalar(0, 188, 212),
        cv::Scalar(0, 150, 136),
        cv::Scalar(76, 175, 80),
        cv::Scalar(139, 195, 74),
        cv::Scalar(205, 220, 57),
        cv::Scalar(255, 235, 59),
        cv::Scalar(255, 193, 7),
        cv::Scalar(255, 152, 0),
        cv::Scalar(255, 87, 34),
        cv::Scalar(121, 85, 72),
        cv::Scalar(158, 158, 158),
        cv::Scalar(96, 125, 139)
    };
    static const size_t num_colors = sizeof(colors) / sizeof(colors[0]);

    // every bone connects two keypoints, given by their indices in Object::keypoints
    static const int joint_pairs[16][2] = {
        {0, 1}, {1, 3}, {0, 2}, {2, 4}, {5, 6}, {5, 7}, {7, 9}, {6, 8}, {8, 10}, {5, 11}, {6, 12}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}
    };
    // one color per bone, in the same order as joint_pairs
    static const cv::Scalar bone_colors[] = {
        cv::Scalar(0, 255, 0),
        cv::Scalar(0, 255, 0),
        cv::Scalar(0, 255, 0),
        cv::Scalar(0, 255, 0),
        cv::Scalar(255, 128, 0),
        cv::Scalar(255, 128, 0),
        cv::Scalar(255, 128, 0),
        cv::Scalar(255, 128, 0),
        cv::Scalar(255, 128, 0),
        cv::Scalar(255, 51, 255),
        cv::Scalar(255, 51, 255),
        cv::Scalar(255, 51, 255),
        cv::Scalar(51, 153, 255),
        cv::Scalar(51, 153, 255),
        cv::Scalar(51, 153, 255),
        cv::Scalar(51, 153, 255),
    };
    static const int num_bones = sizeof(joint_pairs) / sizeof(joint_pairs[0]);

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        // cycle through the palette so that neighbouring objects differ in color
        const cv::Scalar& color = colors[i % num_colors];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        // draw bone
        const size_t num_keypoints = obj.keypoints.size();
        for (int j = 0; j < num_bones; j++)
        {
            const size_t index1 = joint_pairs[j][0];
            const size_t index2 = joint_pairs[j][1];

            // skip bones whose joints are not present, instead of reading past
            // the end of the keypoint list
            if (index1 >= num_keypoints || index2 >= num_keypoints)
                continue;

            const KeyPoint& p1 = obj.keypoints[index1];
            const KeyPoint& p2 = obj.keypoints[index2];

            if (p1.prob < 0.2f || p2.prob < 0.2f)
                continue;

            cv::line(image, p1.p, p2.p, bone_colors[j], 2);
        }

        // draw joint
        for (size_t j = 0; j < num_keypoints; j++)
        {
            const KeyPoint& keypoint = obj.keypoints[j];

            fprintf(stderr, "%.2f %.2f = %.5f\n", keypoint.p.x, keypoint.p.y, keypoint.prob);

            if (keypoint.prob < 0.2f)
                continue;

            cv::circle(image, keypoint.p, 3, color, -1);
        }

        cv::rectangle(image, obj.rect, color);

        char text[256];
        sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // place the label above the box and keep it inside the image
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;
        if (x < 0)
            x = 0;

        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

// Entry point: load the image given on the command line, run pose detection on it
// and display the annotated result.
// Returns 0 on success and -1 on wrong usage, unreadable image or failed detection.
int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    // do not show a result window when detection itself reported an error
    if (detect_yolo11_pose(m, objects) != 0)
    {
        fprintf(stderr, "detect_yolo11_pose failed\n");
        return -1;
    }

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/yolo11_seg.cpp
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 1. install
//      pip3 install -U ultralytics pnnx ncnn
// 2. export yolo11-seg torchscript
//      yolo export model=yolo11n-seg.pt format=torchscript
// 3. convert torchscript with static shape
//      pnnx yolo11n-seg.torchscript
// 4. modify yolo11n_seg_pnnx.py for dynamic shape inference
//      A. modify reshape to support dynamic image sizes
//      B. permute tensor before concat and adjust concat axis
//      C. drop post-process part
//      before:
//          v_202 = v_201.view(1, 32, 6400)
//          v_208 = v_207.view(1, 32, 1600)
//          v_214 = v_213.view(1, 32, 400)
//          v_215 = torch.cat((v_202, v_208, v_214), dim=2)
//          ...
//          v_261 = v_230.view(1, 144, 6400)
//          v_262 = v_245.view(1, 144, 1600)
//          v_263 = v_260.view(1, 144, 400)
//          v_264 = torch.cat((v_261, v_262, v_263), dim=2)
//          ...
//          v_285 = (v_284, v_196, )
//          return v_285
//      after:
//          v_202 = v_201.view(1, 32, -1).transpose(1, 2)
//          v_208 = v_207.view(1, 32, -1).transpose(1, 2)
//          v_214 = v_213.view(1, 32, -1).transpose(1, 2)
//          v_215 = torch.cat((v_202, v_208, v_214), dim=1)
//          ...
//          v_261 = v_230.view(1, 144, -1).transpose(1, 2)
//          v_262 = v_245.view(1, 144, -1).transpose(1, 2)
//          v_263 = v_260.view(1, 144, -1).transpose(1, 2)
//          v_264 = torch.cat((v_261, v_262, v_263), dim=1)
//          return v_264, v_215, v_196
//      D. modify area attention for dynamic shape inference
//      before:
//          v_95 = self.model_10_m_0_attn_qkv_conv(v_94)
//          v_96 = v_95.view(1, 2, 128, 400)
//          v_97, v_98, v_99 = torch.split(tensor=v_96, dim=2, split_size_or_sections=(32,32,64))
//          v_100 = torch.transpose(input=v_97, dim0=-2, dim1=-1)
//          v_101 = torch.matmul(input=v_100, other=v_98)
//          v_102 = (v_101 * 0.176777)
//          v_103 = F.softmax(input=v_102, dim=-1)
//          v_104 = torch.transpose(input=v_103, dim0=-2, dim1=-1)
//          v_105 = torch.matmul(input=v_99, other=v_104)
//          v_106 = v_105.view(1, 128, 20, 20)
//          v_107 = v_99.reshape(1, 128, 20, 20)
//          v_108 = self.model_10_m_0_attn_pe_conv(v_107)
//          v_109 = (v_106 + v_108)
//          v_110 = self.model_10_m_0_attn_proj_conv(v_109)
//      after:
//          v_95 = self.model_10_m_0_attn_qkv_conv(v_94)
//          v_96 = v_95.view(1, 2, 128, -1)
//          v_97, v_98, v_99 = torch.split(tensor=v_96, dim=2, split_size_or_sections=(32,32,64))
//          v_100 = torch.transpose(input=v_97, dim0=-2, dim1=-1)
//          v_101 = torch.matmul(input=v_100, other=v_98)
//          v_102 = (v_101 * 0.176777)
//          v_103 = F.softmax(input=v_102, dim=-1)
//          v_104 = torch.transpose(input=v_103, dim0=-2, dim1=-1)
//          v_105 = torch.matmul(input=v_99, other=v_104)
//          v_106 = v_105.view(1, 128, v_95.size(2), v_95.size(3))
//          v_107 = v_99.reshape(1, 128, v_95.size(2), v_95.size(3))
//          v_108 = self.model_10_m_0_attn_pe_conv(v_107)
//          v_109 = (v_106 + v_108)
//          v_110 = self.model_10_m_0_attn_proj_conv(v_109)
// 5. re-export yolo11-seg torchscript
//      python3 -c 'import yolo11n_seg_pnnx; yolo11n_seg_pnnx.export_torchscript()'
// 6. convert new torchscript with dynamic shape
//      pnnx yolo11n_seg_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320]
// 7. now you get ncnn model files
//      mv yolo11n_seg_pnnx.py.ncnn.param yolo11n_seg.ncnn.param
//      mv yolo11n_seg_pnnx.py.ncnn.bin yolo11n_seg.ncnn.bin

// the out blob would be a 2-dim tensor with w=176 h=8400
//
//        | bbox-reg 16 x 4       | per-class scores(80) |
//        +-----+-----+-----+-----+----------------------+
//        | dx0 | dy0 | dx1 | dy1 |0.1 0.0 0.0 0.5 ......|
//   all /|     |     |     |     |           .          |
//  boxes |  .. |  .. |  .. |  .. |0.0 0.9 0.0 0.0 ......|
//  (8400)|     |     |     |     |           .          |
//       \|     |     |     |     |           .          |
//        +-----+-----+-----+-----+----------------------+
//

//
//        | mask (32) |
//        +-----------+
//        |0.1........|
//   all /|           |
//  boxes |0.0........|
//  (8400)|     .     |
//       \|     .     |
//        +-----------+
//

#include "layer.h"
#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>

// One detection: bounding box, class, confidence and mask related data.
struct Object
{
    // bounding box as x, y, width, height
    cv::Rect_<float> rect;
    // index of the class with the highest score
    int label;
    // detection confidence (sigmoid of the best raw class score)
    float prob;
    // row of the prediction blob this object was decoded from: grid cell index
    // within its stride level plus the row offset of that level
    int gindex;
    // NOTE(review): presumably the segmentation mask of the object; it is not
    // written by the code visible in this part of the file - confirm in detect_yolo11_seg
    cv::Mat mask;
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Sort objects[left..right] (both bounds inclusive) in place by descending prob.
static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
{
    int head = left;
    int tail = right;

    // score of the middle element acts as the pivot
    const float pivot_prob = objects[(left + right) / 2].prob;

    while (head <= tail)
    {
        // advance over scores above the pivot from the front ...
        while (objects[head].prob > pivot_prob)
            ++head;

        // ... and over scores below the pivot from the back
        while (objects[tail].prob < pivot_prob)
            --tail;

        if (head > tail)
            break;

        std::swap(objects[head], objects[tail]);
        ++head;
        --tail;
    }

    // both remaining partitions are disjoint and can be sorted on their own
    if (left < tail)
        qsort_descent_inplace(objects, left, tail);

    if (head < right)
        qsort_descent_inplace(objects, head, right);
}

// Sort the whole vector in place by descending prob.
static void qsort_descent_inplace(std::vector<Object>& objects)
{
    // nothing to sort, and no valid last index, when the vector is empty
    const bool has_elements = !objects.empty();
    if (has_elements)
    {
        qsort_descent_inplace(objects, 0, static_cast<int>(objects.size() - 1));
    }
}

static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = objects.size();

    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = objects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = objects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = objects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
                keep = 0;
        }

        if (keep)
            picked.push_back(i);
    }
}

// Logistic function 1 / (1 + e^-x).
static inline float sigmoid(float x)
{
    const float denominator = 1.0f + expf(-x);
    return 1.0f / denominator;
}

// Decode the predictions of one stride level into candidate objects.
//
// pred            one row per grid cell (row index y * num_grid_x + x); every row holds
//                 reg_max_1 * 4 box distribution values followed by the raw class scores
// stride          size of one grid cell in network-input pixels
// in_pad          padded network input; only its w and h are read, to derive the grid size
// prob_threshold  cells whose best sigmoid class score is below this value are skipped
// objects         output; candidates are appended with boxes in in_pad pixel coordinates
//                 and gindex set to the cell index inside this level
static void generate_proposals(const ncnn::Mat& pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    const int w = in_pad.w;
    const int h = in_pad.h;

    const int num_grid_x = w / stride;
    const int num_grid_y = h / stride;

    const int reg_max_1 = 16;
    const int num_class = pred.w - reg_max_1 * 4; // number of classes. 80 for COCO

    // The softmax layer is the same for every grid cell, so it is built once per
    // call instead of being created and destroyed for each candidate.
    ncnn::Layer* softmax = ncnn::create_layer("Softmax");
    if (!softmax)
    {
        fprintf(stderr, "create_layer Softmax failed\n");
        return;
    }

    {
        ncnn::ParamDict pd;
        pd.set(0, 1); // axis
        pd.set(1, 1);
        softmax->load_param(pd);
    }

    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_packing_layout = false;

    softmax->create_pipeline(opt);

    for (int y = 0; y < num_grid_y; y++)
    {
        for (int x = 0; x < num_grid_x; x++)
        {
            const int grid_index = y * num_grid_x + x;

            const ncnn::Mat pred_grid = pred.row_range(grid_index, 1);

            // find label with max score
            int label = -1;
            float score = -FLT_MAX;
            {
                const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class);

                for (int k = 0; k < num_class; k++)
                {
                    float s = pred_score[k];
                    if (s > score)
                    {
                        label = k;
                        score = s;
                    }
                }

                // only the winning raw score has to go through sigmoid
                score = sigmoid(score);
            }

            if (score < prob_threshold)
                continue;

            // work on a private (w = reg_max_1, h = 4) copy: one row per box side
            ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone();

            softmax->forward_inplace(pred_bbox, opt);

            // distance of each box side = sum of bin index * softmax weight, scaled by stride
            float pred_ltrb[4];
            for (int k = 0; k < 4; k++)
            {
                float dis = 0.f;
                const float* dis_after_sm = pred_bbox.row(k);
                for (int l = 0; l < reg_max_1; l++)
                {
                    dis += l * dis_after_sm[l];
                }

                pred_ltrb[k] = dis * stride;
            }

            // the distances are measured from the center of the grid cell
            const float pb_cx = (x + 0.5f) * stride;
            const float pb_cy = (y + 0.5f) * stride;

            const float x0 = pb_cx - pred_ltrb[0];
            const float y0 = pb_cy - pred_ltrb[1];
            const float x1 = pb_cx + pred_ltrb[2];
            const float y1 = pb_cy + pred_ltrb[3];

            Object obj;
            obj.rect.x = x0;
            obj.rect.y = y0;
            obj.rect.width = x1 - x0;
            obj.rect.height = y1 - y0;
            obj.label = label;
            obj.prob = score;
            obj.gindex = grid_index;

            objects.push_back(obj);
        }
    }

    softmax->destroy_pipeline(opt);

    delete softmax;
}

// Decode the predictions of all stride levels.
// The rows of pred are consumed level by level, in the order of strides; every
// level owns (in_pad.w / stride) * (in_pad.h / stride) rows. The gindex of each
// appended object is shifted by the row offset of its level.
static void generate_proposals(const ncnn::Mat& pred, const std::vector<int>& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    // first row of the level that is processed next
    int row_cursor = 0;

    for (std::vector<int>::const_iterator it = strides.begin(); it != strides.end(); ++it)
    {
        const int stride = *it;

        // number of grid cells, and therefore rows, of this level
        const int cells = (in_pad.w / stride) * (in_pad.h / stride);

        // decode the level into a scratch list first: its gindex values are
        // relative to the level and still need the row offset
        std::vector<Object> level_objects;
        const ncnn::Mat level_pred = pred.row_range(row_cursor, cells);
        generate_proposals(level_pred, stride, in_pad, prob_threshold, level_objects);

        for (size_t j = 0; j < level_objects.size(); j++)
        {
            level_objects[j].gindex += row_cursor;
        }

        objects.insert(objects.end(), level_objects.begin(), level_objects.end());

        row_cursor += cells;
    }
}

// Run the YOLO11 segmentation model on a BGR image and fill objects with boxes, labels,
// scores and per-object binary masks.
//
// bgr     : input image; result rects are expressed in its pixel coordinates
// objects : resized to the number of detections kept after NMS and overwritten;
//           left untouched when nothing is detected or the model cannot be loaded
// returns : 0 on success (also when nothing is detected), -1 when a model file fails to load
static int detect_yolo11_seg(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net yolo11;

    yolo11.opt.use_vulkan_compute = true;
    // yolo11.opt.use_bf16_storage = true;

    // https://github.com/nihui/ncnn-android-yolo11/tree/master/app/src/main/assets
    // bail out early on a load failure instead of decoding an empty output blob
    if (yolo11.load_param("yolo11n_seg.ncnn.param"))
        return -1;
    if (yolo11.load_model("yolo11n_seg.ncnn.bin"))
        return -1;
    // yolo11.load_param("yolo11s_seg.ncnn.param");
    // yolo11.load_model("yolo11s_seg.ncnn.bin");
    // yolo11.load_param("yolo11m_seg.ncnn.param");
    // yolo11.load_model("yolo11m_seg.ncnn.bin");

    const int target_size = 640;
    const float prob_threshold = 0.25f;
    const float nms_threshold = 0.45f;
    const float mask_threshold = 0.5f;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // ultralytics/cfg/models/v8/yolo11.yaml
    std::vector<int> strides(3);
    strides[0] = 8;
    strides[1] = 16;
    strides[2] = 32;
    const int max_stride = 32;

    // scale the longer side to target_size, keeping the aspect ratio
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // letterbox: pad both sides up to the next multiple of max_stride, split evenly around the image
    int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
    int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    // no mean subtraction, only scale pixel values by 1/255
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolo11.create_extractor();

    ex.input("in0", in_pad);

    ncnn::Mat out;
    ex.extract("out0", out);

    std::vector<Object> proposals;
    generate_proposals(out, strides, in_pad, prob_threshold, proposals);

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    int count = picked.size();
    if (count == 0)
        return 0;

    // the mask outputs are only extracted once at least one box survived
    ncnn::Mat mask_feat;
    ex.extract("out1", mask_feat);

    ncnn::Mat mask_protos;
    ex.extract("out2", mask_protos);

    // one channel per kept object, each holding a single row of mask_feat
    ncnn::Mat objects_mask_feat(mask_feat.w, 1, count);

    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];

        // adjust offset to original unpadded
        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;

        // clip
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);

        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;

        // pick mask feat: gindex selects the mask_feat row that belongs to this proposal
        memcpy(objects_mask_feat.channel(i), mask_feat.row(objects[i].gindex), mask_feat.w * sizeof(float));
    }

    // process mask
    ncnn::Mat objects_mask;
    {
        // multiply the picked mask features with the flattened prototypes using a standalone Gemm layer
        ncnn::Layer* gemm = ncnn::create_layer("Gemm");

        ncnn::ParamDict pd;
        pd.set(6, 1);                             // constantC
        pd.set(7, count);                         // constantM
        pd.set(8, mask_protos.w * mask_protos.h); // constantN
        pd.set(9, mask_feat.w);                   // constantK
        pd.set(10, -1);                           // constant_broadcast_type_C
        pd.set(11, 1);                            // output_N1M
        gemm->load_param(pd);

        ncnn::Option opt;
        opt.num_threads = 1;
        opt.use_packing_layout = false;

        gemm->create_pipeline(opt);

        std::vector<ncnn::Mat> gemm_inputs(2);
        gemm_inputs[0] = objects_mask_feat;
        gemm_inputs[1] = mask_protos.reshape(mask_protos.w * mask_protos.h, 1, mask_protos.c);
        std::vector<ncnn::Mat> gemm_outputs(1);
        gemm->forward(gemm_inputs, gemm_outputs, opt);
        // back to one prototype-sized map per object
        objects_mask = gemm_outputs[0].reshape(mask_protos.w, mask_protos.h, count);

        gemm->destroy_pipeline(opt);

        delete gemm;
    }
    {
        // apply a Sigmoid layer in place so the maps can be compared against mask_threshold
        ncnn::Layer* sigmoid = ncnn::create_layer("Sigmoid");

        ncnn::Option opt;
        opt.num_threads = 1;
        opt.use_packing_layout = false;

        sigmoid->create_pipeline(opt);

        sigmoid->forward_inplace(objects_mask, opt);

        sigmoid->destroy_pipeline(opt);

        delete sigmoid;
    }

    // resize mask map to the padded input size expressed in original-image scale
    {
        ncnn::Mat objects_mask_resized;
        ncnn::resize_bilinear(objects_mask, objects_mask_resized, in_pad.w / scale, in_pad.h / scale);
        objects_mask = objects_mask_resized;
    }

    // create per-object mask
    for (int i = 0; i < count; i++)
    {
        Object& obj = objects[i];

        const ncnn::Mat mm = objects_mask.channel(i);

        obj.mask = cv::Mat((int)obj.rect.height, (int)obj.rect.width, CV_8UC1);

        // adjust offset to original unpadded and clip inside object box
        for (int y = 0; y < (int)obj.rect.height; y++)
        {
            // the resized map still contains the padding border, so shift by the scaled pad
            const float* pmm = mm.row((int)(hpad / 2 / scale + obj.rect.y + y)) + (int)(wpad / 2 / scale + obj.rect.x);
            uchar* pmask = obj.mask.ptr<uchar>(y);
            for (int x = 0; x < (int)obj.rect.width; x++)
            {
                pmask[x] = pmm[x] > mask_threshold ? 1 : 0;
            }
        }
    }

    return 0;
}

// Render segmentation results on a copy of bgr and display it until a key is pressed.
// Every object is logged to stderr, its mask is blended at 50% with a palette colour,
// and its box plus a "name score%" caption are drawn on top.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };

    static cv::Scalar colors[] = {
        cv::Scalar(244, 67, 54),
        cv::Scalar(233, 30, 99),
        cv::Scalar(156, 39, 176),
        cv::Scalar(103, 58, 183),
        cv::Scalar(63, 81, 181),
        cv::Scalar(33, 150, 243),
        cv::Scalar(3, 169, 244),
        cv::Scalar(0, 188, 212),
        cv::Scalar(0, 150, 136),
        cv::Scalar(76, 175, 80),
        cv::Scalar(139, 195, 74),
        cv::Scalar(205, 220, 57),
        cv::Scalar(255, 235, 59),
        cv::Scalar(255, 193, 7),
        cv::Scalar(255, 152, 0),
        cv::Scalar(255, 87, 34),
        cv::Scalar(121, 85, 72),
        cv::Scalar(158, 158, 158),
        cv::Scalar(96, 125, 139)
    };
    // palette length (19 entries); objects cycle through it
    const size_t num_colors = sizeof(colors) / sizeof(colors[0]);

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];
        const cv::Scalar& color = colors[i % num_colors];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        // integer box geometry used to walk the mask and the image in lock-step
        const int box_x = (int)obj.rect.x;
        const int box_y = (int)obj.rect.y;
        const int box_w = (int)obj.rect.width;
        const int box_h = (int)obj.rect.height;

        for (int row = 0; row < box_h; row++)
        {
            const uchar* mask_row = obj.mask.ptr<const uchar>(row);
            uchar* pixel = image.ptr<uchar>(box_y + row) + box_x * 3;
            for (int col = 0; col < box_w; col++, pixel += 3)
            {
                if (!mask_row[col])
                    continue;

                // equal-weight blend of the image pixel and the object colour, channel by channel
                for (int c = 0; c < 3; c++)
                {
                    pixel[c] = pixel[c] * 0.5 + color[c] * 0.5;
                }
            }
        }

        cv::rectangle(image, obj.rect, color);

        char text[256];
        sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // caption sits above the box, pushed back inside the image at the top and right edges
        int text_x = obj.rect.x;
        int text_y = obj.rect.y - label_size.height - baseLine;
        if (text_y < 0)
        {
            text_y = 0;
        }
        if (text_x + label_size.width > image.cols)
        {
            text_x = image.cols - label_size.width;
        }

        // filled white background, then black text
        cv::rectangle(image, cv::Rect(cv::Point(text_x, text_y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(text_x, text_y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_yolo11_seg(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/yolov2.cpp
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdio.h>
#include <vector>

// One detection produced by detect_yolov2 and consumed by draw_objects.
struct Object
{
    cv::Rect_<float> rect; // bounding box; detect_yolov2 scales it by the source image width/height
    int label;             // class id read from the network output; indexes class_names when drawing
    float prob;            // score read from the network output; printed as a percentage when drawing
};

// Run the MobileNet-YOLOv2 model on a BGR image.
//
// bgr     : input image
// objects : cleared, then filled with one entry per row of the "detection_out" blob
// returns : always 0; the process exits with -1 if a model file cannot be loaded
static int detect_yolov2(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net net;

    net.opt.use_vulkan_compute = true;

    // original pretrained model from https://github.com/eric612/MobileNet-YOLO
    // https://github.com/eric612/MobileNet-YOLO/blob/master/models/yolov2/mobilenet_yolo_deploy.prototxt
    // https://github.com/eric612/MobileNet-YOLO/blob/master/models/yolov2/mobilenet_yolo_deploy_iter_80000.caffemodel
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    // short-circuit keeps the original order: the model is only loaded once the param file loaded
    if (net.load_param("mobilenet_yolo.param") != 0 || net.load_model("mobilenet_yolo.bin") != 0)
    {
        exit(-1);
    }

    const int target_size = 416;

    const int img_w = bgr.cols;
    const int img_h = bgr.rows;

    // the image is stretched to a target_size x target_size square
    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, target_size, target_size);

    // the Caffe-YOLOv2-Windows style
    // X' = X * scale - mean
    // done in two passes because the scale has to be applied before the mean is subtracted
    const float norm_vals[3] = {0.007843f, 0.007843f, 0.007843f};
    const float mean_vals[3] = {1.0f, 1.0f, 1.0f};
    in.substract_mean_normalize(0, norm_vals);
    in.substract_mean_normalize(mean_vals, 0);

    ncnn::Extractor ex = net.create_extractor();
    ex.input("data", in);

    ncnn::Mat out;
    ex.extract("detection_out", out);

    //     printf("%d %d %d\n", out.w, out.h, out.c);
    objects.clear();
    for (int r = 0; r < out.h; r++)
    {
        // row layout as read below: label, score, then two corner points multiplied by the image size
        const float* row = out.row(r);

        const float left = row[2] * img_w;
        const float top = row[3] * img_h;

        Object det;
        det.label = (int)row[0];
        det.prob = row[1];
        det.rect.x = left;
        det.rect.y = top;
        det.rect.width = row[4] * img_w - left;
        det.rect.height = row[5] * img_h - top;

        objects.push_back(det);
    }

    return 0;
}

// Draw every detection (box plus "name score%" caption) on a copy of bgr, log it to stderr,
// and show the result in a window until a key is pressed.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {"background",
                                        "aeroplane", "bicycle", "bird", "boat",
                                        "bottle", "bus", "car", "cat", "chair",
                                        "cow", "diningtable", "dog", "horse",
                                        "motorbike", "person", "pottedplant",
                                        "sheep", "sofa", "train", "tvmonitor"
                                       };

    cv::Mat canvas = bgr.clone();

    for (size_t n = 0; n < objects.size(); n++)
    {
        const Object& det = objects[n];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", det.label, det.prob,
                det.rect.x, det.rect.y, det.rect.width, det.rect.height);

        cv::rectangle(canvas, det.rect, cv::Scalar(255, 0, 0));

        char caption[256];
        sprintf(caption, "%s %.1f%%", class_names[det.label], det.prob * 100);

        int baseline = 0;
        cv::Size caption_size = cv::getTextSize(caption, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseline);

        // caption goes above the box; keep it inside the image at the top and right edges
        int text_x = det.rect.x;
        int text_y = det.rect.y - caption_size.height - baseline;
        if (text_y < 0)
        {
            text_y = 0;
        }
        if (text_x + caption_size.width > canvas.cols)
        {
            text_x = canvas.cols - caption_size.width;
        }

        // white filled background first, black text on top
        cv::rectangle(canvas, cv::Rect(cv::Point(text_x, text_y), cv::Size(caption_size.width, caption_size.height + baseline)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(canvas, caption, cv::Point(text_x, text_y + caption_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", canvas);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_yolov2(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/yolov3.cpp
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <stdio.h>
#include <vector>

// One detection produced by detect_yolov3 and consumed by draw_objects.
struct Object
{
    cv::Rect_<float> rect; // bounding box; detect_yolov3 scales it by the source image width/height
    int label;             // class id read from the network output; indexes class_names when drawing
    float prob;            // score read from the network output; printed as a percentage when drawing
};

// Run the MobileNetV2-YOLOv3 model on a BGR image.
//
// bgr     : input image
// objects : cleared, then filled with one entry per row of the "detection_out" blob
// returns : always 0; the process exits with -1 if a model file cannot be loaded
static int detect_yolov3(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net net;

    net.opt.use_vulkan_compute = true;

    // original pretrained model from https://github.com/eric612/MobileNet-YOLO
    // param : https://drive.google.com/open?id=1V9oKHP6G6XvXZqhZbzNKL6FI_clRWdC-
    // bin : https://drive.google.com/open?id=1DBcuFCr-856z3FRQznWL_S5h-Aj3RawA
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    // short-circuit keeps the original order: the model is only loaded once the param file loaded
    if (net.load_param("mobilenetv2_yolov3.param") != 0 || net.load_model("mobilenetv2_yolov3.bin") != 0)
    {
        exit(-1);
    }

    const int target_size = 352;

    const int img_w = bgr.cols;
    const int img_h = bgr.rows;

    // the image is stretched to a target_size x target_size square
    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, target_size, target_size);

    // mean and scale are applied in a single call
    const float mean_vals[3] = {127.5f, 127.5f, 127.5f};
    const float norm_vals[3] = {0.007843f, 0.007843f, 0.007843f};
    in.substract_mean_normalize(mean_vals, norm_vals);

    ncnn::Extractor ex = net.create_extractor();
    ex.input("data", in);

    ncnn::Mat out;
    ex.extract("detection_out", out);

    //     printf("%d %d %d\n", out.w, out.h, out.c);
    objects.clear();
    for (int r = 0; r < out.h; r++)
    {
        // row layout as read below: label, score, then two corner points multiplied by the image size
        const float* row = out.row(r);

        const float left = row[2] * img_w;
        const float top = row[3] * img_h;

        Object det;
        det.label = (int)row[0];
        det.prob = row[1];
        det.rect.x = left;
        det.rect.y = top;
        det.rect.width = row[4] * img_w - left;
        det.rect.height = row[5] * img_h - top;

        objects.push_back(det);
    }

    return 0;
}

// Draw every detection (box plus "name score%" caption) on a copy of bgr, log it to stderr,
// and show the result in a window until a key is pressed.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {"background",
                                        "aeroplane", "bicycle", "bird", "boat",
                                        "bottle", "bus", "car", "cat", "chair",
                                        "cow", "diningtable", "dog", "horse",
                                        "motorbike", "person", "pottedplant",
                                        "sheep", "sofa", "train", "tvmonitor"
                                       };

    cv::Mat canvas = bgr.clone();

    const size_t total = objects.size();
    for (size_t n = 0; n < total; n++)
    {
        const Object& det = objects[n];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", det.label, det.prob,
                det.rect.x, det.rect.y, det.rect.width, det.rect.height);

        cv::rectangle(canvas, det.rect, cv::Scalar(255, 0, 0));

        char caption[256];
        sprintf(caption, "%s %.1f%%", class_names[det.label], det.prob * 100);

        int baseline = 0;
        cv::Size caption_size = cv::getTextSize(caption, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseline);
        const int caption_h = caption_size.height + baseline;

        // anchor the caption just above the box, then pull it back inside the image if needed
        int text_x = det.rect.x;
        int text_y = det.rect.y - caption_size.height - baseline;
        if (text_y < 0)
        {
            text_y = 0;
        }
        if (text_x + caption_size.width > canvas.cols)
        {
            text_x = canvas.cols - caption_size.width;
        }

        // white filled background first, black text on top
        cv::rectangle(canvas, cv::Rect(cv::Point(text_x, text_y), cv::Size(caption_size.width, caption_h)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(canvas, caption, cv::Point(text_x, text_y + caption_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", canvas);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_yolov3(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/yolov4.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "net.h"

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

#if CV_MAJOR_VERSION >= 3
#include <opencv2/videoio/videoio.hpp>
#endif

#include <vector>

#include <stdio.h>

#define NCNN_PROFILING
#define YOLOV4_TINY //Using yolov4_tiny, if undef, using original yolov4

#ifdef NCNN_PROFILING
#include "benchmark.h"
#endif

// One detection produced by detect_yolov4 and consumed by draw_objects.
struct Object
{
    cv::Rect_<float> rect; // bounding box; detect_yolov4 scales it by the source image width/height
    int label;             // class id read from the network output; indexes class_names when drawing
    float prob;            // score read from the network output; printed as a percentage when drawing
};

// Configure the net options and load the YOLOv4 (or YOLOv4-tiny) model files.
//
// yolov4      : net to configure and load into
// target_size : receives the square input size to use (416 for tiny, 608 otherwise)
// returns     : 0 on success, otherwise the non-zero status reported by load_param / load_model
//               so the caller can report the failure instead of the process exiting here
static int init_yolov4(ncnn::Net* yolov4, int* target_size)
{
    /* --> Set the params you need for the ncnn inference <-- */

    yolov4->opt.num_threads = 4; //You need to compile with libgomp for multi thread support

    yolov4->opt.use_vulkan_compute = true; //You need to compile with libvulkan for gpu support

    /* --> End of setting params <-- */

    // original pretrained model from https://github.com/AlexeyAB/darknet
    // the ncnn model https://drive.google.com/drive/folders/1YzILvh0SKQPS_lrb33dmGNq7aVTKPWS0?usp=sharing
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
#ifdef YOLOV4_TINY
    const char* yolov4_param = "yolov4-tiny-opt.param";
    const char* yolov4_model = "yolov4-tiny-opt.bin";
    *target_size = 416;
#else
    const char* yolov4_param = "yolov4-opt.param";
    const char* yolov4_model = "yolov4-opt.bin";
    *target_size = 608;
#endif

    // propagate the first failure; the model is only loaded once the param file loaded
    int ret = yolov4->load_param(yolov4_param);
    if (ret != 0)
        return ret;

    ret = yolov4->load_model(yolov4_model);
    if (ret != 0)
        return ret;

    return 0;
}

// Run an already loaded YOLOv4 net on a BGR image.
//
// bgr         : input image
// objects     : cleared, then filled with one entry per row of the "output" blob
// target_size : side length of the square network input
// yolov4      : loaded net; a fresh extractor is created on every call
// returns     : always 0
static int detect_yolov4(const cv::Mat& bgr, std::vector<Object>& objects, int target_size, ncnn::Net* yolov4)
{
    const int img_w = bgr.cols;
    const int img_h = bgr.rows;

    // the image is stretched to a target_size x target_size square and converted to RGB
    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, target_size, target_size);

    // zero mean, scale pixel values by 1/255
    const float mean_vals[3] = {0, 0, 0};
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in.substract_mean_normalize(mean_vals, norm_vals);

    ncnn::Extractor ex = yolov4->create_extractor();
    ex.input("data", in);

    ncnn::Mat out;
    ex.extract("output", out);

    objects.clear();
    for (int r = 0; r < out.h; r++)
    {
        // row layout as read below: label, score, then two corner points multiplied by the image size
        const float* row = out.row(r);

        const float left = row[2] * img_w;
        const float top = row[3] * img_h;

        Object det;
        det.label = (int)row[0];
        det.prob = row[1];
        det.rect.x = left;
        det.rect.y = top;
        det.rect.width = row[4] * img_w - left;
        det.rect.height = row[5] * img_h - top;

        objects.push_back(det);
    }

    return 0;
}

// Draw every detection (box plus "name score%" caption) on a copy of bgr, log it to stderr
// and show the result.
//
// is_streaming : non-zero waits 1 ms for a key so a capture loop keeps running,
//                zero blocks until a key is pressed
// returns      : always 0
static int draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects, int is_streaming)
{
    static const char* class_names[] = {"background", "person", "bicycle",
                                        "car", "motorbike", "aeroplane", "bus", "train", "truck",
                                        "boat", "traffic light", "fire hydrant", "stop sign",
                                        "parking meter", "bench", "bird", "cat", "dog", "horse",
                                        "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
                                        "backpack", "umbrella", "handbag", "tie", "suitcase",
                                        "frisbee", "skis", "snowboard", "sports ball", "kite",
                                        "baseball bat", "baseball glove", "skateboard", "surfboard",
                                        "tennis racket", "bottle", "wine glass", "cup", "fork",
                                        "knife", "spoon", "bowl", "banana", "apple", "sandwich",
                                        "orange", "broccoli", "carrot", "hot dog", "pizza", "donut",
                                        "cake", "chair", "sofa", "pottedplant", "bed", "diningtable",
                                        "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard",
                                        "cell phone", "microwave", "oven", "toaster", "sink",
                                        "refrigerator", "book", "clock", "vase", "scissors",
                                        "teddy bear", "hair drier", "toothbrush"
                                       };

    cv::Mat canvas = bgr.clone();

    for (size_t n = 0; n < objects.size(); n++)
    {
        const Object& det = objects[n];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", det.label, det.prob,
                det.rect.x, det.rect.y, det.rect.width, det.rect.height);

        cv::rectangle(canvas, det.rect, cv::Scalar(255, 0, 0));

        char caption[256];
        sprintf(caption, "%s %.1f%%", class_names[det.label], det.prob * 100);

        int baseline = 0;
        cv::Size caption_size = cv::getTextSize(caption, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseline);

        // caption goes above the box; keep it inside the image at the top and right edges
        int text_x = det.rect.x;
        int text_y = det.rect.y - caption_size.height - baseline;
        if (text_y < 0)
        {
            text_y = 0;
        }
        if (text_x + caption_size.width > canvas.cols)
        {
            text_x = canvas.cols - caption_size.width;
        }

        // white filled background first, black text on top
        cv::rectangle(canvas, cv::Rect(cv::Point(text_x, text_y), cv::Size(caption_size.width, caption_size.height + baseline)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(canvas, caption, cv::Point(text_x, text_y + caption_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", canvas);

    // 1 ms poll while streaming, wait for a key press on a still image
    cv::waitKey(is_streaming ? 1 : 0);

    return 0;
}

// Entry point: run YOLOv4 on a still image or, when the argument contains "/dev/video",
// on frames captured from that device in an endless loop.
// With NCNN_PROFILING defined, load / capture / detect / draw timings are printed to stdout.
// Returns 0 after a still image has been shown, -1 on any usage, load, open or capture error.
int main(int argc, char** argv)
{
    cv::Mat frame;
    std::vector<Object> objects;

    cv::VideoCapture cap;

    ncnn::Net yolov4;

    const char* devicepath;

    int target_size = 0;
    int is_streaming = 0;

    if (argc < 2)
    {
        fprintf(stderr, "Usage: %s [v4l input device or image]\n", argv[0]);
        return -1;
    }

    devicepath = argv[1];

#ifdef NCNN_PROFILING
    double t_load_start = ncnn::get_current_time();
#endif

    int ret = init_yolov4(&yolov4, &target_size); //We load model and param first!
    if (ret != 0)
    {
        // newline added so the message does not run into whatever is printed next
        fprintf(stderr, "Failed to load model or param, error %d\n", ret);
        return -1;
    }

#ifdef NCNN_PROFILING
    double t_load_end = ncnn::get_current_time();
    fprintf(stdout, "NCNN Init time %.02lfms\n", t_load_end - t_load_start);
#endif

    // anything that does not look like a video device path is treated as an image file
    if (strstr(devicepath, "/dev/video") == NULL)
    {
        frame = cv::imread(argv[1], 1);
        if (frame.empty())
        {
            fprintf(stderr, "Failed to read image %s.\n", argv[1]);
            return -1;
        }
    }
    else
    {
        cap.open(devicepath);

        if (!cap.isOpened())
        {
            // newline added for the same reason as above
            fprintf(stderr, "Failed to open %s\n", devicepath);
            return -1;
        }

        // grab one frame up front to verify the device delivers data;
        // the loop below captures again before the first detection
        cap >> frame;

        if (frame.empty())
        {
            fprintf(stderr, "Failed to read from device %s.\n", devicepath);
            return -1;
        }

        is_streaming = 1;
    }

    while (1)
    {
        if (is_streaming)
        {
#ifdef NCNN_PROFILING
            double t_capture_start = ncnn::get_current_time();
#endif

            cap >> frame;

#ifdef NCNN_PROFILING
            double t_capture_end = ncnn::get_current_time();
            fprintf(stdout, "NCNN OpenCV capture time %.02lfms\n", t_capture_end - t_capture_start);
#endif
            if (frame.empty())
            {
                fprintf(stderr, "OpenCV Failed to Capture from device %s\n", devicepath);
                return -1;
            }
        }

#ifdef NCNN_PROFILING
        double t_detect_start = ncnn::get_current_time();
#endif

        detect_yolov4(frame, objects, target_size, &yolov4); //Create an extractor and run detection

#ifdef NCNN_PROFILING
        double t_detect_end = ncnn::get_current_time();
        fprintf(stdout, "NCNN detection time %.02lfms\n", t_detect_end - t_detect_start);
#endif

#ifdef NCNN_PROFILING
        double t_draw_start = ncnn::get_current_time();
#endif

        draw_objects(frame, objects, is_streaming); //Draw detection results on opencv image

#ifdef NCNN_PROFILING
        double t_draw_end = ncnn::get_current_time();
        fprintf(stdout, "NCNN OpenCV draw result time %.02lfms\n", t_draw_end - t_draw_start);
#endif

        if (!is_streaming)
        {   //If it is a still image, exit!
            return 0;
        }
    }

    // not reached: the loop only leaves through the returns above
    return 0;
}


================================================
FILE: examples/yolov5.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "layer.h"
#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>

//#define YOLOV5_V60 1 //YOLOv5 v6.0
#define YOLOV5_V62 1 //YOLOv5 v6.2 export  onnx model method https://github.com/shaoshengsong/yolov5_62_export_ncnn

#if YOLOV5_V60 || YOLOV5_V62
#define MAX_STRIDE 64
#else
#define MAX_STRIDE 32
// Custom layer that halves width and height and quadruples the channel count by
// sampling every second element of the input, starting from four different offsets.
class YoloV5Focus : public ncnn::Layer
{
public:
    YoloV5Focus()
    {
        one_blob_only = true;
    }

    // Fills top_blob (w/2, h/2, c*4) from bottom_blob. Returns 0, or -100 when top_blob could not be allocated.
    virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const
    {
        const int in_w = bottom_blob.w;
        const int in_h = bottom_blob.h;
        const int in_c = bottom_blob.c;

        const int out_w = in_w / 2;
        const int out_h = in_h / 2;
        const int out_c = in_c * 4;

        top_blob.create(out_w, out_h, out_c, 4u, 1, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // source advance per output row: out_w steps of two elements plus one full input row
        const int src_row_step = out_w * 2 + in_w;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < out_c; p++)
        {
            // phase 0..3 picks the starting row (phase % 2) and starting column (phase / 2)
            const int phase = p / in_c;
            const float* src = bottom_blob.channel(p % in_c).row(phase % 2) + (phase / 2);
            float* dst = top_blob.channel(p);

            for (int i = 0; i < out_h; i++)
            {
                for (int j = 0; j < out_w; j++)
                {
                    dst[j] = src[j * 2];
                }

                dst += out_w;
                src += src_row_step;
            }
        }

        return 0;
    }
};

DEFINE_LAYER_CREATOR(YoloV5Focus)
#endif //YOLOV5_V60    YOLOV5_V62

// One detection candidate.
struct Object
{
    cv::Rect_<float> rect; // bounding box; its area and intersection drive the IoU test in nms_sorted_bboxes
    int label;             // class id; class-aware NMS only compares boxes with equal labels
    float prob;            // score; sort key of qsort_descent_inplace (highest first)
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Quicksort faceobjects[left..right] (both inclusive) in place by prob, highest first.
// The two partitions are sorted recursively inside OpenMP sections.
static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
{
    // pivot score is taken from the middle element of the range
    const float pivot = faceobjects[(left + right) / 2].prob;

    int lo = left;
    int hi = right;

    while (lo <= hi)
    {
        // skip elements that already sit on the correct side of the pivot
        while (faceobjects[lo].prob > pivot)
            ++lo;

        while (faceobjects[hi].prob < pivot)
            --hi;

        // cursors crossed: partitioning is finished
        if (lo > hi)
            break;

        std::swap(faceobjects[lo], faceobjects[hi]);
        ++lo;
        --hi;
    }

    #pragma omp parallel sections
    {
        #pragma omp section
        {
            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
        }
        #pragma omp section
        {
            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
        }
    }
}

static void qsort_descent_inplace(std::vector<Object>& faceobjects)
{
    if (faceobjects.empty())
        return;

    qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
}

static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = faceobjects.size();

    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = faceobjects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = faceobjects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = faceobjects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
                keep = 0;
        }

        if (keep)
            picked.push_back(i);
    }
}

// Logistic function 1 / (1 + e^-x), returned as float.
static inline float sigmoid(float x)
{
    return static_cast<float>(1.f / (exp(-x) + 1.f));
}

// Decode the raw output of one detection head into box proposals.
//
// anchors         flat list of anchor sizes laid out as w0, h0, w1, h1, ...
// stride          factor that converts grid coordinates into input pixels
// in_pad          padded network input; only its w / h are read, to recover the grid shape
// feat_blob       head output: one channel per anchor, one row per grid cell, each row
//                 holding x, y, w, h, objectness followed by the per-class scores
// prob_threshold  minimum value for both the objectness and the final confidence
// objects         decoded proposals are appended here
static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
{
    const int num_grid = feat_blob.h;

    // the longer side of the padded input fixes one grid dimension,
    // the other one follows from the total number of rows
    const bool wider_than_tall = in_pad.w > in_pad.h;
    const int cells_on_long_side = (wider_than_tall ? in_pad.w : in_pad.h) / stride;
    const int cells_on_short_side = num_grid / cells_on_long_side;
    const int num_grid_x = wider_than_tall ? cells_on_long_side : cells_on_short_side;
    const int num_grid_y = wider_than_tall ? cells_on_short_side : cells_on_long_side;

    const int num_anchors = anchors.w / 2;
    const int num_class = feat_blob.w - 5;

    for (int a = 0; a < num_anchors; a++)
    {
        const ncnn::Mat feat = feat_blob.channel(a);

        const float anchor_w = anchors[a * 2];
        const float anchor_h = anchors[a * 2 + 1];

        for (int gy = 0; gy < num_grid_y; gy++)
        {
            for (int gx = 0; gx < num_grid_x; gx++)
            {
                const float* cell = feat.row(gy * num_grid_x + gx);

                // cheap rejection on objectness before looking at the classes
                const float box_confidence = sigmoid(cell[4]);
                if (!(box_confidence >= prob_threshold))
                    continue;

                // best scoring class for this cell
                const float* class_scores = cell + 5;
                int class_index = 0;
                float class_score = -FLT_MAX;
                for (int c = 0; c < num_class; c++)
                {
                    const float score = class_scores[c];
                    if (score > class_score)
                    {
                        class_score = score;
                        class_index = c;
                    }
                }

                const float confidence = box_confidence * sigmoid(class_score);
                if (!(confidence >= prob_threshold))
                    continue;

                // yolov5/models/yolo.py Detect forward
                // y = x[i].sigmoid()
                // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
                // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                const float dx = sigmoid(cell[0]);
                const float dy = sigmoid(cell[1]);
                const float dw = sigmoid(cell[2]);
                const float dh = sigmoid(cell[3]);

                const float pb_cx = (dx * 2.f - 0.5f + gx) * stride;
                const float pb_cy = (dy * 2.f - 0.5f + gy) * stride;

                const float pb_w = pow(dw * 2.f, 2) * anchor_w;
                const float pb_h = pow(dh * 2.f, 2) * anchor_h;

                // centre / size -> corner coordinates
                const float x0 = pb_cx - pb_w * 0.5f;
                const float y0 = pb_cy - pb_h * 0.5f;
                const float x1 = pb_cx + pb_w * 0.5f;
                const float y1 = pb_cy + pb_h * 0.5f;

                Object obj;
                obj.label = class_index;
                obj.prob = confidence;
                obj.rect.x = x0;
                obj.rect.y = y0;
                obj.rect.width = x1 - x0;
                obj.rect.height = y1 - y0;

                objects.push_back(obj);
            }
        }
    }
}

// Run the yolov5s detector on a BGR image and write the detections to objects.
//
// bgr      input image; its pixel data is read as packed BGR
// objects  output; resized to the number of kept boxes, with rects expressed
//          in the coordinates of the original image
//
// Returns 0. If the model files cannot be loaded the process is terminated
// with exit(-1) instead of returning.
static int detect_yolov5(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net yolov5;

    yolov5.opt.use_vulkan_compute = true;
    // yolov5.opt.use_bf16_storage = true;

    // original pretrained model from https://github.com/ultralytics/yolov5
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    // the model files are looked up by relative path, one pair per supported version
#if YOLOV5_V62
    if (yolov5.load_param("yolov5s_6.2.param"))
        exit(-1);
    if (yolov5.load_model("yolov5s_6.2.bin"))
        exit(-1);
#elif YOLOV5_V60
    if (yolov5.load_param("yolov5s_6.0.param"))
        exit(-1);
    if (yolov5.load_model("yolov5s_6.0.bin"))
        exit(-1);
#else
    // this variant needs the custom YoloV5Focus layer, which must be
    // registered before the param file is loaded
    yolov5.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator);

    if (yolov5.load_param("yolov5s.param"))
        exit(-1);
    if (yolov5.load_model("yolov5s.bin"))
        exit(-1);
#endif

    const int target_size = 640;
    const float prob_threshold = 0.25f;
    const float nms_threshold = 0.45f;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // letterbox pad to multiple of MAX_STRIDE
    // first scale so the longer side becomes target_size, keeping the aspect
    // ratio; the shorter side is truncated to an integer
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    // resize and convert BGR -> RGB in one step
    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // pad each side up to the next multiple of MAX_STRIDE, splitting the
    // padding between both edges and filling it with the value 114
    // yolov5/utils/datasets.py letterbox
    int wpad = (w + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - w;
    int hpad = (h + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    // no mean subtraction, only scaling of every channel by 1/255
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolov5.create_extractor();

    ex.input("images", in_pad);

    // proposals from all three heads are collected here before sorting / nms
    std::vector<Object> proposals;

    // anchor setting from yolov5/models/yolov5s.yaml
    // NOTE(review): the return values of ex.extract() are not checked below

    // stride 8
    {
        ncnn::Mat out;
        ex.extract("output", out);

        // three anchors stored as (w, h) pairs
        ncnn::Mat anchors(6);
        anchors[0] = 10.f;
        anchors[1] = 13.f;
        anchors[2] = 16.f;
        anchors[3] = 30.f;
        anchors[4] = 33.f;
        anchors[5] = 23.f;

        std::vector<Object> objects8;
        generate_proposals(anchors, 8, in_pad, out, prob_threshold, objects8);

        proposals.insert(proposals.end(), objects8.begin(), objects8.end());
    }

    // stride 16
    {
        ncnn::Mat out;

        // the output blob name differs between the supported model versions
#if YOLOV5_V62
        ex.extract("353", out);
#elif YOLOV5_V60
        ex.extract("376", out);
#else
        ex.extract("781", out);
#endif

        ncnn::Mat anchors(6);
        anchors[0] = 30.f;
        anchors[1] = 61.f;
        anchors[2] = 62.f;
        anchors[3] = 45.f;
        anchors[4] = 59.f;
        anchors[5] = 119.f;

        std::vector<Object> objects16;
        generate_proposals(anchors, 16, in_pad, out, prob_threshold, objects16);

        proposals.insert(proposals.end(), objects16.begin(), objects16.end());
    }

    // stride 32
    {
        ncnn::Mat out;
        // the output blob name differs between the supported model versions
#if YOLOV5_V62
        ex.extract("367", out);
#elif YOLOV5_V60
        ex.extract("401", out);
#else
        ex.extract("801", out);
#endif
        ncnn::Mat anchors(6);
        anchors[0] = 116.f;
        anchors[1] = 90.f;
        anchors[2] = 156.f;
        anchors[3] = 198.f;
        anchors[4] = 373.f;
        anchors[5] = 326.f;

        std::vector<Object> objects32;
        generate_proposals(anchors, 32, in_pad, out, prob_threshold, objects32);

        proposals.insert(proposals.end(), objects32.begin(), objects32.end());
    }

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    // agnostic is left at its default, so suppression happens per class label
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    int count = picked.size();

    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];

        // adjust offset to original unpadded
        // i.e. remove the left / top padding, then undo the resize scale
        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;

        // clip
        // every corner is clamped to [0, img_w - 1] x [0, img_h - 1]
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);

        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;
    }

    return 0;
}

// Log every detection to stderr, draw its box and a "<class> <percent>" caption
// on a copy of bgr, then show the result in a window and wait for a key press.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };

    // the input is const, so all drawing goes onto a private copy
    cv::Mat image = bgr.clone();

    const size_t object_count = objects.size();
    for (size_t idx = 0; idx < object_count; idx++)
    {
        const Object& obj = objects[idx];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));

        char text[256];
        sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // caption goes just above the box, pulled back inside the image when
        // it would stick out at the top or at the right
        int text_x = obj.rect.x;
        int text_y = obj.rect.y - label_size.height - baseLine;
        if (text_y < 0)
            text_y = 0;
        if (text_x + label_size.width > image.cols)
            text_x = image.cols - label_size.width;

        // filled white background, then black text on top of it
        const cv::Point caption_origin(text_x, text_y);
        const cv::Size caption_size(label_size.width, label_size.height + baseLine);
        cv::rectangle(image, cv::Rect(caption_origin, caption_size),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(text_x, text_y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_yolov5(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/yolov5_pnnx.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "layer.h"
#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>

// One detection: a box together with its class and score.
struct Object
{
    // bounding box with floating-point x, y, width and height
    cv::Rect_<float> rect;
    // index of the best scoring class
    int label;
    // confidence of the detection (objectness multiplied by the class score)
    float prob;
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Quicksort of faceobjects[left..right] (both ends inclusive) into descending
// order of prob, using the middle element's prob as the pivot.
static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
{
    int up = left;
    int down = right;
    const float pivot_prob = faceobjects[(left + right) / 2].prob;

    while (up <= down)
    {
        // advance past entries larger than the pivot on the left side ...
        for (; faceobjects[up].prob > pivot_prob; up++)
        {
        }

        // ... and past entries smaller than the pivot on the right side
        for (; faceobjects[down].prob < pivot_prob; down--)
        {
        }

        if (up <= down)
        {
            std::swap(faceobjects[up], faceobjects[down]);
            up++;
            down--;
        }
    }

    // the two remaining ranges do not overlap; each gets its own section
    #pragma omp parallel sections
    {
        #pragma omp section
        {
            if (left < down) qsort_descent_inplace(faceobjects, left, down);
        }
        #pragma omp section
        {
            if (up < right) qsort_descent_inplace(faceobjects, up, right);
        }
    }
}

// Convenience overload: sort every element by descending prob.
// Does nothing for an empty vector.
static void qsort_descent_inplace(std::vector<Object>& faceobjects)
{
    const bool has_elements = !faceobjects.empty();
    if (has_elements)
        qsort_descent_inplace(faceobjects, 0, static_cast<int>(faceobjects.size() - 1));
}

// Greedy non-maximum suppression.
//
// Candidates are visited in index order, so an earlier entry always wins over
// a later one it overlaps with; callers in this file sort by descending prob
// before calling.
//
// faceobjects    candidate boxes
// picked         output; cleared first, then filled with the indices of kept boxes
// nms_threshold  a candidate is dropped when its IoU with a kept box exceeds this
// agnostic       when false, only boxes carrying the same label can suppress each other
static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = faceobjects.size();

    // every area is needed for many pairs, so compute each one only once
    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = faceobjects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = faceobjects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = faceobjects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
            {
                // a single overlapping kept box settles it; the remaining
                // kept boxes cannot change the outcome
                keep = 0;
                break;
            }
        }

        if (keep)
            picked.push_back(i);
    }
}

// Logistic function 1 / (1 + e^-x).
static inline float sigmoid(float x)
{
    return static_cast<float>(1.f / (exp(-x) + 1.f));
}

// Decode the raw output of one detection head into box proposals.
//
// anchors         flat list of anchor sizes laid out as w0, h0, w1, h1, ...
// stride          factor that converts grid coordinates into input pixels
// in_pad          unused by this variant; the grid size is read from feat_blob
// feat_blob       head output with the grid in w x h and, for every anchor, a run of
//                 (5 + num_class) channels: x, y, w, h, objectness, class scores
// prob_threshold  minimum confidence for a proposal to be emitted
// objects         decoded proposals are appended here
static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
{
    (void)in_pad;

    const int num_grid_x = feat_blob.w;
    const int num_grid_y = feat_blob.h;

    const int num_anchors = anchors.w / 2;

    // nothing to decode without anchors; this also keeps the division below safe
    if (num_anchors <= 0)
        return;

    const int num_class = feat_blob.c / num_anchors - 5;

    // every anchor needs at least the five box / objectness channels
    if (num_class < 0)
        return;

    const int feat_offset = num_class + 5;

    for (int q = 0; q < num_anchors; q++)
    {
        const float anchor_w = anchors[q * 2];
        const float anchor_h = anchors[q * 2 + 1];

        // first channel that belongs to this anchor
        const int base = q * feat_offset;

        for (int i = 0; i < num_grid_y; i++)
        {
            // the objectness row does not depend on j, so look it up once per row
            const float* box_score_row = feat_blob.channel(base + 4).row(i);

            for (int j = 0; j < num_grid_x; j++)
            {
                // sigmoid() of the class score is at most 1, so the final
                // confidence can never exceed the objectness: cells failing
                // here would fail the final test too, and the scan over all
                // class channels can be skipped for them
                const float box_confidence = sigmoid(box_score_row[j]);
                if (!(box_confidence >= prob_threshold))
                    continue;

                // find class index with max class score
                int class_index = 0;
                float class_score = -FLT_MAX;
                for (int k = 0; k < num_class; k++)
                {
                    float score = feat_blob.channel(base + 5 + k).row(i)[j];
                    if (score > class_score)
                    {
                        class_index = k;
                        class_score = score;
                    }
                }

                float confidence = box_confidence * sigmoid(class_score);

                if (confidence >= prob_threshold)
                {
                    // yolov5/models/yolo.py Detect forward
                    // y = x[i].sigmoid()
                    // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
                    // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh

                    float dx = sigmoid(feat_blob.channel(base + 0).row(i)[j]);
                    float dy = sigmoid(feat_blob.channel(base + 1).row(i)[j]);
                    float dw = sigmoid(feat_blob.channel(base + 2).row(i)[j]);
                    float dh = sigmoid(feat_blob.channel(base + 3).row(i)[j]);

                    float pb_cx = (dx * 2.f - 0.5f + j) * stride;
                    float pb_cy = (dy * 2.f - 0.5f + i) * stride;

                    float pb_w = pow(dw * 2.f, 2) * anchor_w;
                    float pb_h = pow(dh * 2.f, 2) * anchor_h;

                    // centre / size -> corner coordinates
                    float x0 = pb_cx - pb_w * 0.5f;
                    float y0 = pb_cy - pb_h * 0.5f;
                    float x1 = pb_cx + pb_w * 0.5f;
                    float y1 = pb_cy + pb_h * 0.5f;

                    Object obj;
                    obj.rect.x = x0;
                    obj.rect.y = y0;
                    obj.rect.width = x1 - x0;
                    obj.rect.height = y1 - y0;
                    obj.label = class_index;
                    obj.prob = confidence;

                    objects.push_back(obj);
                }
            }
        }
    }
}

// Run the pnnx-converted yolov5s detector on a BGR image.
//
// bgr      input image; its pixel data is read as packed BGR
// objects  output; resized to the number of kept boxes, with rects expressed
//          in the coordinates of the original image
//
// Returns 0. If the model files cannot be loaded the process is terminated
// with exit(-1) instead of returning.
static int detect_yolov5(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net yolov5;

    yolov5.opt.use_vulkan_compute = true;
    // yolov5.opt.use_bf16_storage = true;

    // original pretrained model from https://github.com/ultralytics/yolov5
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    if (yolov5.load_param("yolov5s.ncnn.param") != 0)
        exit(-1);
    if (yolov5.load_model("yolov5s.ncnn.bin") != 0)
        exit(-1);

    const int target_size = 640;
    const float prob_threshold = 0.25f;
    const float nms_threshold = 0.45f;

    // yolov5/models/common.py DetectMultiBackend
    const int max_stride = 64;

    const int img_w = bgr.cols;
    const int img_h = bgr.rows;

    // scale so that the longer side becomes target_size, keeping the aspect
    // ratio; the shorter side is truncated to an integer
    const bool wider_than_tall = img_w > img_h;
    const float scale = (float)target_size / (wider_than_tall ? img_w : img_h);
    const int w = wider_than_tall ? target_size : (int)(img_w * scale);
    const int h = wider_than_tall ? (int)(img_h * scale) : target_size;

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // letterbox: pad both sides up to the next multiple of max_stride
    // yolov5/utils/datasets.py letterbox
    const int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
    const int hpad = (h + max_stride - 1) / max_stride * max_stride - h;

    // the padding is split between the two opposite edges
    const int pad_top = hpad / 2;
    const int pad_bottom = hpad - hpad / 2;
    const int pad_left = wpad / 2;
    const int pad_right = wpad - wpad / 2;

    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, pad_top, pad_bottom, pad_left, pad_right, ncnn::BORDER_CONSTANT, 114.f);

    // no mean subtraction, only scaling of every channel by 1/255
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolov5.create_extractor();

    ex.input("in0", in_pad);

    // the three detection heads: output blob name, stride and anchors
    // anchor setting from yolov5/models/yolov5s.yaml
    static const char* const head_blobs[3] = {"out0", "out1", "out2"};
    static const int head_strides[3] = {8, 16, 32};
    static const float head_anchors[3][6] = {
        {10.f, 13.f, 16.f, 30.f, 33.f, 23.f},
        {30.f, 61.f, 62.f, 45.f, 59.f, 119.f},
        {116.f, 90.f, 156.f, 198.f, 373.f, 326.f}
    };

    std::vector<Object> proposals;

    for (int head = 0; head < 3; head++)
    {
        ncnn::Mat out;
        ex.extract(head_blobs[head], out);

        ncnn::Mat anchors(6);
        for (int a = 0; a < 6; a++)
        {
            anchors[a] = head_anchors[head][a];
        }

        std::vector<Object> head_objects;
        generate_proposals(anchors, head_strides[head], in_pad, out, prob_threshold, head_objects);

        proposals.insert(proposals.end(), head_objects.begin(), head_objects.end());
    }

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    const int count = picked.size();

    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        Object& obj = objects[i];
        obj = proposals[picked[i]];

        // remove the left / top padding, then undo the resize scale
        float x0 = (obj.rect.x - pad_left) / scale;
        float y0 = (obj.rect.y - pad_top) / scale;
        float x1 = (obj.rect.x + obj.rect.width - pad_left) / scale;
        float y1 = (obj.rect.y + obj.rect.height - pad_top) / scale;

        // clamp every corner to the original image
        const float max_x = (float)(img_w - 1);
        const float max_y = (float)(img_h - 1);
        x0 = std::max(std::min(x0, max_x), 0.f);
        y0 = std::max(std::min(y0, max_y), 0.f);
        x1 = std::max(std::min(x1, max_x), 0.f);
        y1 = std::max(std::min(y1, max_y), 0.f);

        obj.rect.x = x0;
        obj.rect.y = y0;
        obj.rect.width = x1 - x0;
        obj.rect.height = y1 - y0;
    }

    return 0;
}

// Print each detection to stderr and render it (box plus "<class> <percent>"
// caption) on a copy of bgr, which is then shown until a key is pressed.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };

    // draw on a copy, the caller's image stays untouched
    cv::Mat canvas = bgr.clone();

    for (size_t n = 0; n < objects.size(); n++)
    {
        const Object& det = objects[n];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", det.label, det.prob,
                det.rect.x, det.rect.y, det.rect.width, det.rect.height);

        cv::rectangle(canvas, det.rect, cv::Scalar(255, 0, 0));

        char text[256];
        sprintf(text, "%s %.1f%%", class_names[det.label], det.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // place the caption above the box; clamp at the top edge and shift
        // left when it would run past the right edge
        int left = det.rect.x;
        int top = det.rect.y - label_size.height - baseLine;
        if (top < 0)
        {
            top = 0;
        }
        if (left + label_size.width > canvas.cols)
        {
            left = canvas.cols - label_size.width;
        }

        // white filled background for the caption
        cv::rectangle(canvas, cv::Rect(cv::Point(left, top), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        // black caption text
        cv::putText(canvas, text, cv::Point(left, top + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", canvas);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_yolov5(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/yolov7.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "layer.h"
#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>

#define MAX_STRIDE 32

// One detection: a box together with its class and score.
struct Object
{
    // bounding box with floating-point x, y, width and height
    cv::Rect_<float> rect;
    // index of the best scoring class
    int label;
    // confidence of the detection (objectness multiplied by the class score)
    float prob;
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Quicksort of objects[left..right] (both ends inclusive) into descending
// order of prob. The pivot is the prob of the element in the middle of the range.
static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
{
    const float pivot = objects[(left + right) / 2].prob;

    int head = left;
    int tail = right;

    while (head <= tail)
    {
        // entries already on the correct side of the pivot are skipped
        while (objects[head].prob > pivot)
            ++head;

        while (pivot > objects[tail].prob)
            --tail;

        // once the cursors have crossed the partition is complete
        if (head > tail)
            break;

        std::swap(objects[head], objects[tail]);
        ++head;
        --tail;
    }

    // [left, tail] and [head, right] do not overlap, so each half is handed
    // to its own section
    #pragma omp parallel sections
    {
        #pragma omp section
        {
            if (left < tail) qsort_descent_inplace(objects, left, tail);
        }
        #pragma omp section
        {
            if (head < right) qsort_descent_inplace(objects, head, right);
        }
    }
}

static void qsort_descent_inplace(std::vector<Object>& objects)
{
    if (objects.empty())
        return;

    qsort_descent_inplace(objects, 0, objects.size() - 1);
}

// Greedy non-maximum suppression.
//
// Candidates are visited in index order, so an earlier entry always wins over
// a later one it overlaps with; callers in this file sort by descending prob
// before calling.
//
// faceobjects    candidate boxes
// picked         output; cleared first, then filled with the indices of kept boxes
// nms_threshold  a candidate is dropped when its IoU with a kept box exceeds this
// agnostic       when false, only boxes carrying the same label can suppress each other
static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = faceobjects.size();

    // every area is needed for many pairs, so compute each one only once
    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = faceobjects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = faceobjects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = faceobjects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
            {
                // a single overlapping kept box settles it; the remaining
                // kept boxes cannot change the outcome
                keep = 0;
                break;
            }
        }

        if (keep)
            picked.push_back(i);
    }
}

// Logistic function: squashes x into the range [0, 1].
static inline float sigmoid(float x)
{
    const float negated = -x;
    return static_cast<float>(1.f / (1.f + exp(negated)));
}

// Decode the raw output of one detection head into box proposals.
//
// anchors         flat list of anchor sizes laid out as w0, h0, w1, h1, ...
// stride          factor that converts grid coordinates into input pixels
// in_pad          padded network input; only its w / h are read, to recover the grid shape
// feat_blob       head output: one channel per anchor, one row per grid cell, each row
//                 holding x, y, w, h, objectness followed by the per-class scores
// prob_threshold  minimum value for both the objectness and the final confidence
// objects         decoded proposals are appended here
static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
{
    const int total_cells = feat_blob.h;

    // one grid dimension comes from the longer side of the padded input,
    // the other from the total number of rows in the blob
    int grid_w;
    int grid_h;
    if (in_pad.w > in_pad.h)
    {
        grid_w = in_pad.w / stride;
        grid_h = total_cells / grid_w;
    }
    else
    {
        grid_h = in_pad.h / stride;
        grid_w = total_cells / grid_h;
    }

    const int class_count = feat_blob.w - 5;
    const int anchor_count = anchors.w / 2;

    for (int a = 0; a < anchor_count; a++)
    {
        const float anchor_w = anchors[a * 2];
        const float anchor_h = anchors[a * 2 + 1];

        const ncnn::Mat feat = feat_blob.channel(a);

        for (int row = 0; row < grid_h; row++)
        {
            for (int col = 0; col < grid_w; col++)
            {
                const float* featptr = feat.row(row * grid_w + col);

                // cheap rejection on objectness before scanning the classes
                const float box_confidence = sigmoid(featptr[4]);
                if (!(box_confidence >= prob_threshold))
                    continue;

                // best scoring class of this cell
                int best_class = 0;
                float best_score = -FLT_MAX;
                for (int c = 0; c < class_count; c++)
                {
                    const float score = featptr[5 + c];
                    if (score > best_score)
                    {
                        best_score = score;
                        best_class = c;
                    }
                }

                const float confidence = box_confidence * sigmoid(best_score);
                if (!(confidence >= prob_threshold))
                    continue;

                const float dx = sigmoid(featptr[0]);
                const float dy = sigmoid(featptr[1]);
                const float dw = sigmoid(featptr[2]);
                const float dh = sigmoid(featptr[3]);

                // box centre in input pixels
                const float pb_cx = (dx * 2.f - 0.5f + col) * stride;
                const float pb_cy = (dy * 2.f - 0.5f + row) * stride;

                // box size, relative to the anchor
                const float pb_w = pow(dw * 2.f, 2) * anchor_w;
                const float pb_h = pow(dh * 2.f, 2) * anchor_h;

                // centre / size -> corner coordinates
                const float x0 = pb_cx - pb_w * 0.5f;
                const float y0 = pb_cy - pb_h * 0.5f;
                const float x1 = pb_cx + pb_w * 0.5f;
                const float y1 = pb_cy + pb_h * 0.5f;

                Object obj;
                obj.prob = confidence;
                obj.label = best_class;
                obj.rect.x = x0;
                obj.rect.y = y0;
                obj.rect.width = x1 - x0;
                obj.rect.height = y1 - y0;

                objects.push_back(obj);
            }
        }
    }
}

// Run the yolov7-tiny detector on a BGR image and write the detections to objects.
//
// bgr      input image; its pixel data is read as packed BGR
// objects  output; on success it is resized to the number of kept boxes, with
//          rects expressed in the coordinates of the original image. It is
//          left untouched when the model cannot be loaded.
//
// Returns 0 on success, -1 when the param or model file cannot be loaded.
static int detect_yolov7(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net yolov7;

    yolov7.opt.use_vulkan_compute = true;
    // yolov7.opt.use_bf16_storage = true;

    // original pretrained model from https://github.com/WongKinYiu/yolov7
    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    // a non-zero return means the file could not be loaded; report it and stop
    // instead of silently running inference on an empty network
    if (yolov7.load_param("yolov7-tiny.param"))
    {
        fprintf(stderr, "load_param yolov7-tiny.param failed\n");
        return -1;
    }
    if (yolov7.load_model("yolov7-tiny.bin"))
    {
        fprintf(stderr, "load_model yolov7-tiny.bin failed\n");
        return -1;
    }

    const int target_size = 640;
    const float prob_threshold = 0.25f;
    const float nms_threshold = 0.45f;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // letterbox pad to multiple of MAX_STRIDE
    // first scale so the longer side becomes target_size, keeping the aspect
    // ratio; the shorter side is truncated to an integer
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    // resize and convert BGR -> RGB in one step
    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // pad each side up to the next multiple of MAX_STRIDE, splitting the
    // padding between both edges and filling it with the value 114
    int wpad = (w + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - w;
    int hpad = (h + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    // no mean subtraction, only scaling of every channel by 1/255
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolov7.create_extractor();

    ex.input("images", in_pad);

    // proposals from all three heads are collected here before sorting / nms
    std::vector<Object> proposals;

    // stride 8
    {
        ncnn::Mat out;
        ex.extract("output", out);

        // three anchors stored as (w, h) pairs
        ncnn::Mat anchors(6);
        anchors[0] = 12.f;
        anchors[1] = 16.f;
        anchors[2] = 19.f;
        anchors[3] = 36.f;
        anchors[4] = 40.f;
        anchors[5] = 28.f;

        std::vector<Object> objects8;
        generate_proposals(anchors, 8, in_pad, out, prob_threshold, objects8);

        proposals.insert(proposals.end(), objects8.begin(), objects8.end());
    }

    // stride 16
    {
        ncnn::Mat out;

        ex.extract("288", out);

        ncnn::Mat anchors(6);
        anchors[0] = 36.f;
        anchors[1] = 75.f;
        anchors[2] = 76.f;
        anchors[3] = 55.f;
        anchors[4] = 72.f;
        anchors[5] = 146.f;

        std::vector<Object> objects16;
        generate_proposals(anchors, 16, in_pad, out, prob_threshold, objects16);

        proposals.insert(proposals.end(), objects16.begin(), objects16.end());
    }

    // stride 32
    {
        ncnn::Mat out;

        ex.extract("302", out);

        ncnn::Mat anchors(6);
        anchors[0] = 142.f;
        anchors[1] = 110.f;
        anchors[2] = 192.f;
        anchors[3] = 243.f;
        anchors[4] = 459.f;
        anchors[5] = 401.f;

        std::vector<Object> objects32;
        generate_proposals(anchors, 32, in_pad, out, prob_threshold, objects32);

        proposals.insert(proposals.end(), objects32.begin(), objects32.end());
    }

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    // agnostic is left at its default, so suppression happens per class label
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    int count = picked.size();

    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];

        // adjust offset to original unpadded:
        // remove the left / top padding, then undo the resize scale
        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;

        // clip every corner to [0, img_w - 1] x [0, img_h - 1]
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);

        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;
    }

    return 0;
}

// Print every detection to stderr, draw its box and caption on a copy of
// the input image, then show the result and wait for a key press.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    // category names, looked up by Object::label
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };

    // 19-entry palette; detection number n uses entry n % 19
    static const unsigned char colors[19][3] = {
        {54, 67, 244},
        {99, 30, 233},
        {176, 39, 156},
        {183, 58, 103},
        {181, 81, 63},
        {243, 150, 33},
        {244, 169, 3},
        {212, 188, 0},
        {136, 150, 0},
        {80, 175, 76},
        {74, 195, 139},
        {57, 220, 205},
        {59, 235, 255},
        {7, 193, 255},
        {0, 152, 255},
        {34, 87, 255},
        {72, 85, 121},
        {158, 158, 158},
        {139, 125, 96}
    };

    // all drawing happens on a clone, the caller's image is not modified
    cv::Mat image = bgr.clone();

    const size_t num_objects = objects.size();
    for (size_t idx = 0; idx < num_objects; ++idx)
    {
        const Object& det = objects[idx];

        const unsigned char* c = colors[idx % 19];
        cv::Scalar cc(c[0], c[1], c[2]);

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", det.label, det.prob,
                det.rect.x, det.rect.y, det.rect.width, det.rect.height);

        cv::rectangle(image, det.rect, cc, 2);

        // caption: "<class name> <score in percent>%"
        char text[256];
        sprintf(text, "%s %.1f%%", class_names[det.label], det.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // caption sits just above the box, clamped at the top edge and
        // shifted left when it would run past the right edge
        int text_x = det.rect.x;
        int text_y = det.rect.y - label_size.height - baseLine;
        if (text_y < 0)
        {
            text_y = 0;
        }
        if (text_x + label_size.width > image.cols)
        {
            text_x = image.cols - label_size.width;
        }

        // filled background in the box color, white text on top
        const cv::Size bg_size(label_size.width, label_size.height + baseLine);
        cv::rectangle(image, cv::Rect(cv::Point(text_x, text_y), bg_size), cc, -1);

        cv::putText(image, text, cv::Point(text_x, text_y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(255, 255, 255));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_yolov7(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/yolov7_pnnx.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "layer.h"
#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>

// One detection: a bounding box plus its class and score.
struct Object
{
    // box stored as x, y, width, height (filled in by generate_proposals,
    // later rescaled to original image coordinates by detect_yolov7)
    cv::Rect_<float> rect;
    // class index; draw_objects uses it to index its class_names table
    int label;
    // confidence, sigmoid(box score) * sigmoid(best class score)
    float prob;
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Recursive quicksort of faceobjects[left..right] (both ends inclusive)
// by prob, highest first. The two sub-ranges are handed to OpenMP sections.
static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
{
    // pivot score is taken from the middle element of the range
    const float pivot = faceobjects[(left + right) / 2].prob;

    int lo = left;
    int hi = right;

    while (lo <= hi)
    {
        // advance past entries that already sit on the correct side
        while (faceobjects[lo].prob > pivot)
        {
            ++lo;
        }
        while (faceobjects[hi].prob < pivot)
        {
            --hi;
        }

        // cursors crossed: partitioning is finished
        if (lo > hi)
            break;

        std::swap(faceobjects[lo], faceobjects[hi]);
        ++lo;
        --hi;
    }

    #pragma omp parallel sections
    {
        #pragma omp section
        {
            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
        }
        #pragma omp section
        {
            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
        }
    }
}

// Sort the whole vector by prob, highest first; an empty vector is left alone.
static void qsort_descent_inplace(std::vector<Object>& faceobjects)
{
    if (!faceobjects.empty())
    {
        // the range overload takes inclusive bounds
        qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
    }
}

// Greedy non-maximum suppression.
//
// faceobjects   candidates; they are visited in index order, so the caller is
//               expected to have sorted them by descending score beforehand
// picked        output, cleared first, then filled with the indices of the
//               candidates that survive
// nms_threshold a candidate is dropped when its IoU with an already picked
//               box is strictly greater than this value
// agnostic      when false, only boxes with the same label suppress each other
static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = faceobjects.size();

    // box areas are needed repeatedly for the union term, compute them once
    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = faceobjects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = faceobjects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = faceobjects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
            {
                keep = 0;
                // already rejected, the remaining picked boxes cannot change that
                break;
            }
        }

        if (keep)
            picked.push_back(i);
    }
}

// Logistic function 1 / (1 + e^-x), returned as float.
static inline float sigmoid(float x)
{
    const float neg_x = -x;
    return static_cast<float>(1.f / (1.f + exp(neg_x)));
}

// Decode one detection head into candidate boxes and append them to objects.
//
// anchors        flat list of (w, h) pairs, so anchors.w / 2 anchors
// stride         scale factor applied to grid coordinates when decoding centers
// in_pad         not read by this function
// feat_blob      raw head output; each anchor owns 85 consecutive channels:
//                4 box values, 1 box score, then 80 class scores
// prob_threshold minimum sigmoid(box score) * sigmoid(best class score)
// objects        output, candidates are appended (not cleared)
static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
{
    const int num_class = 80;
    const int num_anchors = anchors.w / 2;

    const int grid_w = feat_blob.w;
    const int grid_h = feat_blob.h;

    for (int a = 0; a < num_anchors; a++)
    {
        // first channel belonging to this anchor
        const int base = a * 85;

        const float anchor_w = anchors[a * 2];
        const float anchor_h = anchors[a * 2 + 1];

        for (int gy = 0; gy < grid_h; gy++)
        {
            for (int gx = 0; gx < grid_w; gx++)
            {
                // pick the class with the largest raw score
                int best_class = 0;
                float best_score = -FLT_MAX;
                for (int c = 0; c < num_class; c++)
                {
                    const float raw = feat_blob.channel(base + 5 + c).row(gy)[gx];
                    if (raw > best_score)
                    {
                        best_score = raw;
                        best_class = c;
                    }
                }

                const float box_score = feat_blob.channel(base + 4).row(gy)[gx];

                const float confidence = sigmoid(box_score) * sigmoid(best_score);

                // only cells that reach the threshold are decoded
                if (!(confidence >= prob_threshold))
                    continue;

                // yolov5/models/yolo.py Detect forward
                // y = x[i].sigmoid()
                // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
                // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh

                const float dx = sigmoid(feat_blob.channel(base + 0).row(gy)[gx]);
                const float dy = sigmoid(feat_blob.channel(base + 1).row(gy)[gx]);
                const float dw = sigmoid(feat_blob.channel(base + 2).row(gy)[gx]);
                const float dh = sigmoid(feat_blob.channel(base + 3).row(gy)[gx]);

                // box center
                const float pb_cx = (dx * 2.f - 0.5f + gx) * stride;
                const float pb_cy = (dy * 2.f - 0.5f + gy) * stride;

                // box size relative to the anchor
                const float pb_w = pow(dw * 2.f, 2) * anchor_w;
                const float pb_h = pow(dh * 2.f, 2) * anchor_h;

                // corners
                const float x0 = pb_cx - pb_w * 0.5f;
                const float y0 = pb_cy - pb_h * 0.5f;
                const float x1 = pb_cx + pb_w * 0.5f;
                const float y1 = pb_cy + pb_h * 0.5f;

                Object candidate;
                candidate.label = best_class;
                candidate.prob = confidence;
                candidate.rect.x = x0;
                candidate.rect.y = y0;
                candidate.rect.width = x1 - x0;
                candidate.rect.height = y1 - y0;

                objects.push_back(candidate);
            }
        }
    }
}

// Run the pnnx-converted yolov7 model on a BGR image.
//
// bgr      input image; its data is handed to ncnn as BGR pixels
// objects  output, replaced with the detections that survive NMS, with boxes
//          expressed in the coordinates of the original image
//
// Returns 0 on success and -1 when the model files cannot be loaded or an
// output blob cannot be extracted (objects is left empty in that case).
static int detect_yolov7(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net yolov7;

    yolov7.opt.use_vulkan_compute = true;
    // yolov7.opt.use_bf16_storage = true;

    // git clone https://github.com/WongKinYiu/yolov7
    // cd yolov7
    // wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt
    // python models/export.py --weights yolov7.pt
    // pnnx yolov7.torchscript.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320]
    if (yolov7.load_param("yolov7.param") != 0 || yolov7.load_model("yolov7.bin") != 0)
    {
        // without a model there is nothing to run, report instead of going on
        fprintf(stderr, "load yolov7.param / yolov7.bin failed\n");
        objects.clear();
        return -1;
    }

    const int target_size = 640;
    const float prob_threshold = 0.25f;
    const float nms_threshold = 0.45f;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // yolov5/models/common.py DetectMultiBackend
    const int max_stride = 64;

    // scale the longer side to target_size, keeping the aspect ratio
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // letterbox: pad both sides up to the next multiple of max_stride,
    // splitting the padding evenly between the two borders
    // yolov5/utils/datasets.py letterbox
    int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
    int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    // scale pixel values by 1/255, no mean subtraction
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolov7.create_extractor();

    ex.input("in0", in_pad);

    std::vector<Object> proposals;

    // one detection head per stride: output blob name, stride and the three
    // anchor (w, h) pairs used to decode it
    // NOTE(review): the original comment credited yolov5/models/yolov5s.yaml for
    // these anchors; the values look like the yolov7 model config - confirm
    static const char* const head_blobs[3] = {"out0", "out1", "out2"};
    static const int head_strides[3] = {8, 16, 32};
    static const float head_anchors[3][6] = {
        {12.f, 16.f, 19.f, 36.f, 40.f, 28.f},
        {36.f, 75.f, 76.f, 55.f, 72.f, 146.f},
        {142.f, 110.f, 192.f, 243.f, 459.f, 401.f}
    };

    for (int s = 0; s < 3; s++)
    {
        ncnn::Mat out;
        if (ex.extract(head_blobs[s], out) != 0)
        {
            fprintf(stderr, "extract %s failed\n", head_blobs[s]);
            objects.clear();
            return -1;
        }

        ncnn::Mat anchors(6);
        for (int k = 0; k < 6; k++)
        {
            anchors[k] = head_anchors[s][k];
        }

        // generate_proposals appends, so all heads accumulate in proposals
        generate_proposals(anchors, head_strides[s], in_pad, out, prob_threshold, proposals);
    }

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    int count = picked.size();

    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];

        // adjust offset to original unpadded
        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;

        // clip
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);

        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;
    }

    return 0;
}

// Print every detection to stderr, draw box and caption on a copy of the
// input image, then display it and block until a key is pressed.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    // category names, looked up by Object::label
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };

    // the caller's image stays untouched, drawing goes to a clone
    cv::Mat image = bgr.clone();

    const size_t num_objects = objects.size();
    for (size_t idx = 0; idx < num_objects; ++idx)
    {
        const Object& det = objects[idx];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", det.label, det.prob,
                det.rect.x, det.rect.y, det.rect.width, det.rect.height);

        cv::rectangle(image, det.rect, cv::Scalar(255, 0, 0));

        // caption: "<class name> <score in percent>%"
        char text[256];
        sprintf(text, "%s %.1f%%", class_names[det.label], det.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // caption goes above the box; clamp at the top edge and pull it
        // left when it would overflow the right edge
        int text_x = det.rect.x;
        int text_y = det.rect.y - label_size.height - baseLine;
        if (text_y < 0)
        {
            text_y = 0;
        }
        if (text_x + label_size.width > image.cols)
        {
            text_x = image.cols - label_size.width;
        }

        // white filled background, black text
        const cv::Size bg_size(label_size.width, label_size.height + baseLine);
        cv::rectangle(image, cv::Rect(cv::Point(text_x, text_y), bg_size),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(text_x, text_y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_yolov7(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/yolov8.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 1. install
//      pip3 install -U ultralytics pnnx ncnn
// 2. export yolov8 torchscript
//      yolo export model=yolov8n.pt format=torchscript
// 3. convert torchscript with static shape
//      pnnx yolov8n.torchscript
// 4. modify yolov8n_pnnx.py for dynamic shape inference
//      A. modify reshape to support dynamic image sizes
//      B. permute tensor before concat and adjust concat axis
//      C. drop post-process part
//      before:
//          v_165 = v_142.view(1, 144, 6400)
//          v_166 = v_153.view(1, 144, 1600)
//          v_167 = v_164.view(1, 144, 400)
//          v_168 = torch.cat((v_165, v_166, v_167), dim=2)
//          ...
//      after:
//          v_165 = v_142.view(1, 144, -1).transpose(1, 2)
//          v_166 = v_153.view(1, 144, -1).transpose(1, 2)
//          v_167 = v_164.view(1, 144, -1).transpose(1, 2)
//          v_168 = torch.cat((v_165, v_166, v_167), dim=1)
//          return v_168
// 5. re-export yolov8 torchscript
//      python3 -c 'import yolov8n_pnnx; yolov8n_pnnx.export_torchscript()'
// 6. convert new torchscript with dynamic shape
//      pnnx yolov8n_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320]
// 7. now you get ncnn model files
//      mv yolov8n_pnnx.py.ncnn.param yolov8n.ncnn.param
//      mv yolov8n_pnnx.py.ncnn.bin yolov8n.ncnn.bin

// the out blob would be a 2-dim tensor with w=144 h=8400
//
//        | bbox-reg 16 x 4       | per-class scores(80) |
//        +-----+-----+-----+-----+----------------------+
//        | dx0 | dy0 | dx1 | dy1 |0.1 0.0 0.0 0.5 ......|
//   all /|     |     |     |     |           .          |
//  boxes |  .. |  .. |  .. |  .. |0.0 0.9 0.0 0.0 ......|
//  (8400)|     |     |     |     |           .          |
//       \|     |     |     |     |           .          |
//        +-----+-----+-----+-----+----------------------+
//

#include "layer.h"
#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>

// One detection: a bounding box plus its class and score.
struct Object
{
    // box stored as x, y, width, height (filled in by generate_proposals,
    // later rescaled to original image coordinates by detect_yolov8)
    cv::Rect_<float> rect;
    // index of the class with the highest score for this box
    int label;
    // sigmoid of that highest class score
    float prob;
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Recursive quicksort of objects[left..right] (both ends inclusive) by prob,
// highest first.
static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
{
    // pivot score is taken from the middle element of the range
    const float pivot = objects[(left + right) / 2].prob;

    int lo = left;
    int hi = right;

    while (lo <= hi)
    {
        // advance past entries that already sit on the correct side
        while (objects[lo].prob > pivot)
        {
            ++lo;
        }
        while (objects[hi].prob < pivot)
        {
            --hi;
        }

        // cursors crossed: partitioning is finished
        if (lo > hi)
            break;

        std::swap(objects[lo], objects[hi]);
        ++lo;
        --hi;
    }

    // the OpenMP sections around the two recursive calls are left disabled:
    // #pragma omp parallel sections / #pragma omp section
    if (left < hi)
    {
        qsort_descent_inplace(objects, left, hi);
    }
    if (lo < right)
    {
        qsort_descent_inplace(objects, lo, right);
    }
}

// Sort the whole vector by prob, highest first; an empty vector is left alone.
static void qsort_descent_inplace(std::vector<Object>& objects)
{
    if (!objects.empty())
    {
        // the range overload takes inclusive bounds
        qsort_descent_inplace(objects, 0, objects.size() - 1);
    }
}

// Greedy non-maximum suppression.
//
// objects       candidates; they are visited in index order, so the caller is
//               expected to have sorted them by descending score beforehand
// picked        output, cleared first, then filled with the indices of the
//               candidates that survive
// nms_threshold a candidate is dropped when its IoU with an already picked
//               box is strictly greater than this value
// agnostic      when false, only boxes with the same label suppress each other
static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = objects.size();

    // box areas are needed repeatedly for the union term, compute them once
    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = objects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = objects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = objects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
            {
                keep = 0;
                // already rejected, the remaining picked boxes cannot change that
                break;
            }
        }

        if (keep)
            picked.push_back(i);
    }
}

// Logistic function 1 / (1 + e^-x) in single precision.
static inline float sigmoid(float x)
{
    const float e = expf(-x);
    return 1.0f / (1.0f + e);
}

// Decode the predictions of one stride level into candidate boxes.
//
// pred           one row per grid cell (row index y * num_grid_x + x); each row
//                holds 4 * 16 box distribution values followed by the class
//                scores. The box part of rows that pass the threshold is
//                normalized with softmax in place.
// stride         cell size in pixels of this level
// in_pad         padded network input; only its w / h are read, to derive the
//                grid size
// prob_threshold minimum sigmoid(best class score) for a cell to be decoded
// objects        output, candidates are appended (not cleared)
static void generate_proposals(const ncnn::Mat& pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    const int w = in_pad.w;
    const int h = in_pad.h;

    const int num_grid_x = w / stride;
    const int num_grid_y = h / stride;

    const int reg_max_1 = 16;
    const int num_class = pred.w - reg_max_1 * 4; // number of classes. 80 for COCO

    // The softmax layer does not depend on the cell, so it is built once, on
    // the first cell that needs it, and shared by every later cell.
    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_packing_layout = false;

    ncnn::Layer* softmax = 0;

    for (int y = 0; y < num_grid_y; y++)
    {
        for (int x = 0; x < num_grid_x; x++)
        {
            const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1);

            // find label with max score
            int label = -1;
            float score = -FLT_MAX;
            {
                const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class);

                for (int k = 0; k < num_class; k++)
                {
                    float s = pred_score[k];
                    if (s > score)
                    {
                        label = k;
                        score = s;
                    }
                }

                score = sigmoid(score);
            }

            if (score >= prob_threshold)
            {
                if (!softmax)
                {
                    softmax = ncnn::create_layer("Softmax");
                    if (!softmax)
                    {
                        fprintf(stderr, "create Softmax layer failed\n");
                        return;
                    }

                    ncnn::ParamDict pd;
                    pd.set(0, 1); // axis
                    // param id 1: presumably the legacy "fixbug0" switch of the
                    // Softmax layer - confirm against the layer's load_param
                    pd.set(1, 1);
                    softmax->load_param(pd);

                    softmax->create_pipeline(opt);
                }

                // view the 64 box values as 4 rows (one per box side) of 16 bins
                ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4);

                softmax->forward_inplace(pred_bbox, opt);

                // expected bin index of each side's distribution, scaled to pixels
                float pred_ltrb[4];
                for (int k = 0; k < 4; k++)
                {
                    float dis = 0.f;
                    const float* dis_after_sm = pred_bbox.row(k);
                    for (int l = 0; l < reg_max_1; l++)
                    {
                        dis += l * dis_after_sm[l];
                    }

                    pred_ltrb[k] = dis * stride;
                }

                // cell center in padded-input pixels
                float pb_cx = (x + 0.5f) * stride;
                float pb_cy = (y + 0.5f) * stride;

                // the four distances go left, top, right, bottom from the center
                float x0 = pb_cx - pred_ltrb[0];
                float y0 = pb_cy - pred_ltrb[1];
                float x1 = pb_cx + pred_ltrb[2];
                float y1 = pb_cy + pred_ltrb[3];

                Object obj;
                obj.rect.x = x0;
                obj.rect.y = y0;
                obj.rect.width = x1 - x0;
                obj.rect.height = y1 - y0;
                obj.label = label;
                obj.prob = score;

                objects.push_back(obj);
            }
        }
    }

    if (softmax)
    {
        softmax->destroy_pipeline(opt);

        delete softmax;
    }
}

// Decode every stride level of the concatenated prediction blob.
//
// The rows of pred are laid out level after level in the order of strides;
// a level owns (in_pad.w / stride) * (in_pad.h / stride) consecutive rows.
// Candidates from all levels are appended to objects.
static void generate_proposals(const ncnn::Mat& pred, const std::vector<int>& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    const int pad_w = in_pad.w;
    const int pad_h = in_pad.h;

    // first row of the level currently being decoded
    int row_begin = 0;

    for (std::vector<int>::const_iterator it = strides.begin(); it != strides.end(); ++it)
    {
        const int stride = *it;

        const int cells_x = pad_w / stride;
        const int cells_y = pad_h / stride;
        const int num_cells = cells_x * cells_y;

        const ncnn::Mat level_pred = pred.row_range(row_begin, num_cells);
        generate_proposals(level_pred, stride, in_pad, prob_threshold, objects);

        row_begin += num_cells;
    }
}

// Run the yolov8 model on a BGR image.
//
// bgr      input image; its data is handed to ncnn as BGR pixels
// objects  output, replaced with the detections that survive NMS, with boxes
//          expressed in the coordinates of the original image
//
// Returns 0 on success and -1 when the model files cannot be loaded or the
// output blob cannot be extracted (objects is left empty in that case).
static int detect_yolov8(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net yolov8;

    yolov8.opt.use_vulkan_compute = true;
    // yolov8.opt.use_bf16_storage = true;

    // https://github.com/nihui/ncnn-android-yolov8/tree/master/app/src/main/assets
    const char* parampath = "yolov8n.ncnn.param";
    const char* modelpath = "yolov8n.ncnn.bin";
    // parampath = "yolov8s.ncnn.param"; modelpath = "yolov8s.ncnn.bin";
    // parampath = "yolov8m.ncnn.param"; modelpath = "yolov8m.ncnn.bin";

    // if you use oiv7 models, you shall call draw_objects_oiv() instead
    // parampath = "yolov8n_oiv7.ncnn.param"; modelpath = "yolov8n_oiv7.ncnn.bin";
    // parampath = "yolov8s_oiv7.ncnn.param"; modelpath = "yolov8s_oiv7.ncnn.bin";
    // parampath = "yolov8m_oiv7.ncnn.param"; modelpath = "yolov8m_oiv7.ncnn.bin";

    if (yolov8.load_param(parampath) != 0 || yolov8.load_model(modelpath) != 0)
    {
        // without a model there is nothing to run, report instead of going on
        fprintf(stderr, "load %s / %s failed\n", parampath, modelpath);
        objects.clear();
        return -1;
    }

    const int target_size = 640;
    const float prob_threshold = 0.25f;
    const float nms_threshold = 0.45f;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // ultralytics/cfg/models/v8/yolov8.yaml
    std::vector<int> strides(3);
    strides[0] = 8;
    strides[1] = 16;
    strides[2] = 32;
    const int max_stride = 32;

    // scale the longer side to target_size, keeping the aspect ratio
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // letterbox: pad both sides up to the next multiple of max_stride,
    // splitting the padding evenly between the two borders
    int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
    int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    // scale pixel values by 1/255, no mean subtraction
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolov8.create_extractor();

    ex.input("in0", in_pad);

    ncnn::Mat out;
    if (ex.extract("out0", out) != 0)
    {
        fprintf(stderr, "extract out0 failed\n");
        objects.clear();
        return -1;
    }

    std::vector<Object> proposals;
    generate_proposals(out, strides, in_pad, prob_threshold, proposals);

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    int count = picked.size();

    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];

        // adjust offset to original unpadded
        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;

        // clip
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);

        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;
    }

    return 0;
}

// Print every detection to stderr, draw box and caption (using the COCO
// class names) on a copy of the input image, then display it and block
// until a key is pressed.
//
// A label outside the class name table - which happens when a model with a
// different class set is loaded - is captioned with its numeric index
// instead of reading past the end of the table.
static void draw_objects_coco(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };
    const int num_class_names = (int)(sizeof(class_names) / sizeof(class_names[0]));

    static cv::Scalar colors[] = {
        cv::Scalar(244, 67, 54),
        cv::Scalar(233, 30, 99),
        cv::Scalar(156, 39, 176),
        cv::Scalar(103, 58, 183),
        cv::Scalar(63, 81, 181),
        cv::Scalar(33, 150, 243),
        cv::Scalar(3, 169, 244),
        cv::Scalar(0, 188, 212),
        cv::Scalar(0, 150, 136),
        cv::Scalar(76, 175, 80),
        cv::Scalar(139, 195, 74),
        cv::Scalar(205, 220, 57),
        cv::Scalar(255, 235, 59),
        cv::Scalar(255, 193, 7),
        cv::Scalar(255, 152, 0),
        cv::Scalar(255, 87, 34),
        cv::Scalar(121, 85, 72),
        cv::Scalar(158, 158, 158),
        cv::Scalar(96, 125, 139)
    };
    // palette size follows the table, so adding or removing a color is safe
    const size_t num_colors = sizeof(colors) / sizeof(colors[0]);

    // the caller's image stays untouched, drawing goes to a clone
    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        const cv::Scalar& color = colors[i % num_colors];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, color);

        // caption: "<class name> <score in percent>%", bounded by the buffer size
        char text[256];
        if (obj.label >= 0 && obj.label < num_class_names)
        {
            snprintf(text, sizeof(text), "%s %.1f%%", class_names[obj.label], obj.prob * 100);
        }
        else
        {
            snprintf(text, sizeof(text), "%d %.1f%%", obj.label, obj.prob * 100);
        }

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // caption goes above the box; clamp at the top edge and pull it
        // left when it would overflow the right edge
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        // white filled background, black text
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

// Draw detection results on a copy of bgr, using the class name table below,
// and show the result in a window named "image" until a key is pressed.
//
// bgr      source image; it is cloned, the original is not modified
// objects  detections to draw; each one is also logged to stderr
//
// A label outside the class name table is drawn as "unknown" instead of
// reading past the end of the table.
static void draw_objects_oiv(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "Accordion", "Adhesive tape", "Aircraft", "Airplane", "Alarm clock", "Alpaca", "Ambulance", "Animal",
        "Ant", "Antelope", "Apple", "Armadillo", "Artichoke", "Auto part", "Axe", "Backpack", "Bagel",
        "Baked goods", "Balance beam", "Ball", "Balloon", "Banana", "Band-aid", "Banjo", "Barge", "Barrel",
        "Baseball bat", "Baseball glove", "Bat (Animal)", "Bathroom accessory", "Bathroom cabinet", "Bathtub",
        "Beaker", "Bear", "Bed", "Bee", "Beehive", "Beer", "Beetle", "Bell pepper", "Belt", "Bench", "Bicycle",
        "Bicycle helmet", "Bicycle wheel", "Bidet", "Billboard", "Billiard table", "Binoculars", "Bird",
        "Blender", "Blue jay", "Boat", "Bomb", "Book", "Bookcase", "Boot", "Bottle", "Bottle opener",
        "Bow and arrow", "Bowl", "Bowling equipment", "Box", "Boy", "Brassiere", "Bread", "Briefcase",
        "Broccoli", "Bronze sculpture", "Brown bear", "Building", "Bull", "Burrito", "Bus", "Bust", "Butterfly",
        "Cabbage", "Cabinetry", "Cake", "Cake stand", "Calculator", "Camel", "Camera", "Can opener", "Canary",
        "Candle", "Candy", "Cannon", "Canoe", "Cantaloupe", "Car", "Carnivore", "Carrot", "Cart", "Cassette deck",
        "Castle", "Cat", "Cat furniture", "Caterpillar", "Cattle", "Ceiling fan", "Cello", "Centipede",
        "Chainsaw", "Chair", "Cheese", "Cheetah", "Chest of drawers", "Chicken", "Chime", "Chisel", "Chopsticks",
        "Christmas tree", "Clock", "Closet", "Clothing", "Coat", "Cocktail", "Cocktail shaker", "Coconut",
        "Coffee", "Coffee cup", "Coffee table", "Coffeemaker", "Coin", "Common fig", "Common sunflower",
        "Computer keyboard", "Computer monitor", "Computer mouse", "Container", "Convenience store", "Cookie",
        "Cooking spray", "Corded phone", "Cosmetics", "Couch", "Countertop", "Cowboy hat", "Crab", "Cream",
        "Cricket ball", "Crocodile", "Croissant", "Crown", "Crutch", "Cucumber", "Cupboard", "Curtain",
        "Cutting board", "Dagger", "Dairy Product", "Deer", "Desk", "Dessert", "Diaper", "Dice", "Digital clock",
        "Dinosaur", "Dishwasher", "Dog", "Dog bed", "Doll", "Dolphin", "Door", "Door handle", "Doughnut",
        "Dragonfly", "Drawer", "Dress", "Drill (Tool)", "Drink", "Drinking straw", "Drum", "Duck", "Dumbbell",
        "Eagle", "Earrings", "Egg (Food)", "Elephant", "Envelope", "Eraser", "Face powder", "Facial tissue holder",
        "Falcon", "Fashion accessory", "Fast food", "Fax", "Fedora", "Filing cabinet", "Fire hydrant",
        "Fireplace", "Fish", "Flag", "Flashlight", "Flower", "Flowerpot", "Flute", "Flying disc", "Food",
        "Food processor", "Football", "Football helmet", "Footwear", "Fork", "Fountain", "Fox", "French fries",
        "French horn", "Frog", "Fruit", "Frying pan", "Furniture", "Garden Asparagus", "Gas stove", "Giraffe",
        "Girl", "Glasses", "Glove", "Goat", "Goggles", "Goldfish", "Golf ball", "Golf cart", "Gondola",
        "Goose", "Grape", "Grapefruit", "Grinder", "Guacamole", "Guitar", "Hair dryer", "Hair spray", "Hamburger",
        "Hammer", "Hamster", "Hand dryer", "Handbag", "Handgun", "Harbor seal", "Harmonica", "Harp",
        "Harpsichord", "Hat", "Headphones", "Heater", "Hedgehog", "Helicopter", "Helmet", "High heels",
        "Hiking equipment", "Hippopotamus", "Home appliance", "Honeycomb", "Horizontal bar", "Horse", "Hot dog",
        "House", "Houseplant", "Human arm", "Human beard", "Human body", "Human ear", "Human eye", "Human face",
        "Human foot", "Human hair", "Human hand", "Human head", "Human leg", "Human mouth", "Human nose",
        "Humidifier", "Ice cream", "Indoor rower", "Infant bed", "Insect", "Invertebrate", "Ipod", "Isopod",
        "Jacket", "Jacuzzi", "Jaguar (Animal)", "Jeans", "Jellyfish", "Jet ski", "Jug", "Juice", "Kangaroo",
        "Kettle", "Kitchen & dining room table", "Kitchen appliance", "Kitchen knife", "Kitchen utensil",
        "Kitchenware", "Kite", "Knife", "Koala", "Ladder", "Ladle", "Ladybug", "Lamp", "Land vehicle",
        "Lantern", "Laptop", "Lavender (Plant)", "Lemon", "Leopard", "Light bulb", "Light switch", "Lighthouse",
        "Lily", "Limousine", "Lion", "Lipstick", "Lizard", "Lobster", "Loveseat", "Luggage and bags", "Lynx",
        "Magpie", "Mammal", "Man", "Mango", "Maple", "Maracas", "Marine invertebrates", "Marine mammal",
        "Measuring cup", "Mechanical fan", "Medical equipment", "Microphone", "Microwave oven", "Milk",
        "Miniskirt", "Mirror", "Missile", "Mixer", "Mixing bowl", "Mobile phone", "Monkey", "Moths and butterflies",
        "Motorcycle", "Mouse", "Muffin", "Mug", "Mule", "Mushroom", "Musical instrument", "Musical keyboard",
        "Nail (Construction)", "Necklace", "Nightstand", "Oboe", "Office building", "Office supplies", "Orange",
        "Organ (Musical Instrument)", "Ostrich", "Otter", "Oven", "Owl", "Oyster", "Paddle", "Palm tree",
        "Pancake", "Panda", "Paper cutter", "Paper towel", "Parachute", "Parking meter", "Parrot", "Pasta",
        "Pastry", "Peach", "Pear", "Pen", "Pencil case", "Pencil sharpener", "Penguin", "Perfume", "Person",
        "Personal care", "Personal flotation device", "Piano", "Picnic basket", "Picture frame", "Pig",
        "Pillow", "Pineapple", "Pitcher (Container)", "Pizza", "Pizza cutter", "Plant", "Plastic bag", "Plate",
        "Platter", "Plumbing fixture", "Polar bear", "Pomegranate", "Popcorn", "Porch", "Porcupine", "Poster",
        "Potato", "Power plugs and sockets", "Pressure cooker", "Pretzel", "Printer", "Pumpkin", "Punching bag",
        "Rabbit", "Raccoon", "Racket", "Radish", "Ratchet (Device)", "Raven", "Rays and skates", "Red panda",
        "Refrigerator", "Remote control", "Reptile", "Rhinoceros", "Rifle", "Ring binder", "Rocket",
        "Roller skates", "Rose", "Rugby ball", "Ruler", "Salad", "Salt and pepper shakers", "Sandal",
        "Sandwich", "Saucer", "Saxophone", "Scale", "Scarf", "Scissors", "Scoreboard", "Scorpion",
        "Screwdriver", "Sculpture", "Sea lion", "Sea turtle", "Seafood", "Seahorse", "Seat belt", "Segway",
        "Serving tray", "Sewing machine", "Shark", "Sheep", "Shelf", "Shellfish", "Shirt", "Shorts",
        "Shotgun", "Shower", "Shrimp", "Sink", "Skateboard", "Ski", "Skirt", "Skull", "Skunk", "Skyscraper",
        "Slow cooker", "Snack", "Snail", "Snake", "Snowboard", "Snowman", "Snowmobile", "Snowplow",
        "Soap dispenser", "Sock", "Sofa bed", "Sombrero", "Sparrow", "Spatula", "Spice rack", "Spider",
        "Spoon", "Sports equipment", "Sports uniform", "Squash (Plant)", "Squid", "Squirrel", "Stairs",
        "Stapler", "Starfish", "Stationary bicycle", "Stethoscope", "Stool", "Stop sign", "Strawberry",
        "Street light", "Stretcher", "Studio couch", "Submarine", "Submarine sandwich", "Suit", "Suitcase",
        "Sun hat", "Sunglasses", "Surfboard", "Sushi", "Swan", "Swim cap", "Swimming pool", "Swimwear",
        "Sword", "Syringe", "Table", "Table tennis racket", "Tablet computer", "Tableware", "Taco", "Tank",
        "Tap", "Tart", "Taxi", "Tea", "Teapot", "Teddy bear", "Telephone", "Television", "Tennis ball",
        "Tennis racket", "Tent", "Tiara", "Tick", "Tie", "Tiger", "Tin can", "Tire", "Toaster", "Toilet",
        "Toilet paper", "Tomato", "Tool", "Toothbrush", "Torch", "Tortoise", "Towel", "Tower", "Toy",
        "Traffic light", "Traffic sign", "Train", "Training bench", "Treadmill", "Tree", "Tree house",
        "Tripod", "Trombone", "Trousers", "Truck", "Trumpet", "Turkey", "Turtle", "Umbrella", "Unicycle",
        "Van", "Vase", "Vegetable", "Vehicle", "Vehicle registration plate", "Violin", "Volleyball (Ball)",
        "Waffle", "Waffle iron", "Wall clock", "Wardrobe", "Washing machine", "Waste container", "Watch",
        "Watercraft", "Watermelon", "Weapon", "Whale", "Wheel", "Wheelchair", "Whisk", "Whiteboard", "Willow",
        "Window", "Window blind", "Wine", "Wine glass", "Wine rack", "Winter melon", "Wok", "Woman",
        "Wood-burning stove", "Woodpecker", "Worm", "Wrench", "Zebra", "Zucchini"
    };

    static cv::Scalar colors[] = {
        cv::Scalar(244, 67, 54),
        cv::Scalar(233, 30, 99),
        cv::Scalar(156, 39, 176),
        cv::Scalar(103, 58, 183),
        cv::Scalar(63, 81, 181),
        cv::Scalar(33, 150, 243),
        cv::Scalar(3, 169, 244),
        cv::Scalar(0, 188, 212),
        cv::Scalar(0, 150, 136),
        cv::Scalar(76, 175, 80),
        cv::Scalar(139, 195, 74),
        cv::Scalar(205, 220, 57),
        cv::Scalar(255, 235, 59),
        cv::Scalar(255, 193, 7),
        cv::Scalar(255, 152, 0),
        cv::Scalar(255, 87, 34),
        cv::Scalar(121, 85, 72),
        cv::Scalar(158, 158, 158),
        cv::Scalar(96, 125, 139)
    };

    // derive the table sizes from the tables so they cannot drift apart
    const int num_classes = (int)(sizeof(class_names) / sizeof(class_names[0]));
    const size_t num_colors = sizeof(colors) / sizeof(colors[0]);

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        // cycle through the palette by detection index
        const cv::Scalar& color = colors[i % num_colors];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, color);

        // guard the table lookup, the label comes straight from the model output
        const char* class_name = "unknown";
        if (obj.label >= 0 && obj.label < num_classes)
            class_name = class_names[obj.label];

        char text[256];
        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // place the caption just above the box, then pull it back inside the image
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;
        // a caption wider than the image would otherwise start left of the image
        if (x < 0)
            x = 0;

        // white background for the caption
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_yolov8(m, objects);

    draw_objects_coco(m, objects);
    // draw_objects_oiv(m, objects);

    return 0;
}


================================================
FILE: examples/yolov8_cls.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 1. install
//      pip3 install -U ultralytics pnnx ncnn
// 2. export yolov8-cls torchscript
//      yolo export model=yolov8n-cls.pt format=torchscript
// 3. convert torchscript with static shape
//      pnnx yolov8n-cls.torchscript
// 4. now you get ncnn model files
//      yolov8n_cls.ncnn.param
//      yolov8n_cls.ncnn.bin

#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>

// One classification result as produced by get_topk().
struct Object
{
    int label;  // index of the entry in the score blob this result was taken from
    float prob; // the score stored at that index
};

// Select the topk highest scores from cls_scores together with their indices.
//
// cls_scores  score blob; its first cls_scores.w elements are considered
// topk        number of results wanted; clamped to the number of scores
// objects     output; resized to the number of results and filled in
//             descending score order (label = index, prob = score)
static void get_topk(const ncnn::Mat& cls_scores, int topk, std::vector<Object>& objects)
{
    const int size = cls_scores.w;

    // vec.begin() + topk must stay inside [begin, end], otherwise
    // std::partial_sort and the copy loop below are undefined behaviour
    if (topk > size)
        topk = size;
    if (topk <= 0)
    {
        objects.clear();
        return;
    }

    // partial sort topk with index
    std::vector<std::pair<float, int> > vec;
    vec.resize(size);
    for (int i = 0; i < size; i++)
    {
        vec[i] = std::make_pair(cls_scores[i], i);
    }

    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
                      std::greater<std::pair<float, int> >());

    objects.resize(topk);
    for (int i = 0; i < topk; i++)
    {
        objects[i].label = vec[i].second;
        objects[i].prob = vec[i].first;
    }
}

// Run the yolov8n-cls ncnn model on a BGR image and store the top-5 results.
//
// bgr      input image; pixels are converted BGR->RGB while being resized
// objects  output; cleared first, then filled by get_topk() on success
//
// Returns 0 on success and -1 when the input image is empty, the model files
// cannot be loaded, or inference does not produce an output blob.
static int detect_yolov8_cls(const cv::Mat& bgr, std::vector<Object>& objects)
{
    objects.clear();

    if (bgr.empty())
    {
        fprintf(stderr, "detect_yolov8_cls: empty input image\n");
        return -1;
    }

    ncnn::Net yolov8;

    yolov8.opt.use_vulkan_compute = true;
    // yolov8.opt.use_bf16_storage = true;

    // https://github.com/nihui/ncnn-android-yolov8/tree/master/app/src/main/assets
    if (yolov8.load_param("yolov8n_cls.ncnn.param") != 0)
    {
        fprintf(stderr, "load_param yolov8n_cls.ncnn.param failed\n");
        return -1;
    }
    if (yolov8.load_model("yolov8n_cls.ncnn.bin") != 0)
    {
        fprintf(stderr, "load_model yolov8n_cls.ncnn.bin failed\n");
        return -1;
    }
    // yolov8.load_param("yolov8s_cls.ncnn.param");
    // yolov8.load_model("yolov8s_cls.ncnn.bin");
    // yolov8.load_param("yolov8m_cls.ncnn.param");
    // yolov8.load_model("yolov8m_cls.ncnn.bin");

    const int target_size = 224;
    const int topk = 5;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // letterbox pad: scale the longer side to target_size, keep the aspect ratio
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    // the truncating multiply above yields 0 for extreme aspect ratios;
    // keep the resize target at least one pixel wide and tall
    if (w < 1)
        w = 1;
    if (h < 1)
        h = 1;

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // letterbox pad to target_size rectangle, splitting the padding between both sides
    int wpad = target_size - w;
    int hpad = target_size - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
    if (in_pad.empty())
    {
        fprintf(stderr, "detect_yolov8_cls: preparing the input blob failed\n");
        return -1;
    }

    // no mean subtraction, scale every channel by 1/255
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolov8.create_extractor();

    ex.input("in0", in_pad);

    ncnn::Mat out;
    if (ex.extract("out0", out) != 0 || out.empty())
    {
        fprintf(stderr, "detect_yolov8_cls: extracting out0 failed\n");
        return -1;
    }

    // return top-5
    get_topk(out, topk, objects);

    return 0;
}

// Draw the classification results as a list of text rows in the top-left
// corner of a copy of bgr and show it in a window named "image" until a key
// is pressed.
//
// bgr      source image; it is cloned, the original is not modified
// objects  results to list, one row each; every entry is also logged to stderr
//
// A label outside the class name table is drawn as "unknown" instead of
// reading past the end of the table.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "tench", "goldfish", "great white shark", "tiger shark", "hammerhead", "electric ray", "stingray", "cock",
        "hen", "ostrich", "brambling", "goldfinch", "house finch", "junco", "indigo bunting", "robin", "bulbul",
        "jay", "magpie", "chickadee", "water ouzel", "kite", "bald eagle", "vulture", "great grey owl",
        "European fire salamander", "common newt", "eft", "spotted salamander", "axolotl", "bullfrog", "tree frog",
        "tailed frog", "loggerhead", "leatherback turtle", "mud turtle", "terrapin", "box turtle", "banded gecko",
        "common iguana", "American chameleon", "whiptail", "agama", "frilled lizard", "alligator lizard",
        "Gila monster", "green lizard", "African chameleon", "Komodo dragon", "African crocodile",
        "American alligator", "triceratops", "thunder snake", "ringneck snake", "hognose snake", "green snake",
        "king snake", "garter snake", "water snake", "vine snake", "night snake", "boa constrictor", "rock python",
        "Indian cobra", "green mamba", "sea snake", "horned viper", "diamondback", "sidewinder", "trilobite",
        "harvestman", "scorpion", "black and gold garden spider", "barn spider", "garden spider", "black widow",
        "tarantula", "wolf spider", "tick", "centipede", "black grouse", "ptarmigan", "ruffed grouse",
        "prairie chicken", "peacock", "quail", "partridge", "African grey", "macaw", "sulphur-crested cockatoo",
        "lorikeet", "coucal", "bee eater", "hornbill", "hummingbird", "jacamar", "toucan", "drake",
        "red-breasted merganser", "goose", "black swan", "tusker", "echidna", "platypus", "wallaby", "koala",
        "wombat", "jellyfish", "sea anemone", "brain coral", "flatworm", "nematode", "conch", "snail", "slug",
        "sea slug", "chiton", "chambered nautilus", "Dungeness crab", "rock crab", "fiddler crab", "king crab",
        "American lobster", "spiny lobster", "crayfish", "hermit crab", "isopod", "white stork", "black stork",
        "spoonbill", "flamingo", "little blue heron", "American egret", "bittern", "crane (bird)", "limpkin",
        "European gallinule", "American coot", "bustard", "ruddy turnstone", "red-backed sandpiper", "redshank",
        "dowitcher", "oystercatcher", "pelican", "king penguin", "albatross", "grey whale", "killer whale",
        "dugong", "sea lion", "Chihuahua", "Japanese spaniel", "Maltese dog", "Pekinese", "Shih-Tzu",
        "Blenheim spaniel", "papillon", "toy terrier", "Rhodesian ridgeback", "Afghan hound", "basset", "beagle",
        "bloodhound", "bluetick", "black-and-tan coonhound", "Walker hound", "English foxhound", "redbone",
        "borzoi", "Irish wolfhound", "Italian greyhound", "whippet", "Ibizan hound", "Norwegian elkhound",
        "otterhound", "Saluki", "Scottish deerhound", "Weimaraner", "Staffordshire bullterrier",
        "American Staffordshire terrier", "Bedlington terrier", "Border terrier", "Kerry blue terrier",
        "Irish terrier", "Norfolk terrier", "Norwich terrier", "Yorkshire terrier", "wire-haired fox terrier",
        "Lakeland terrier", "Sealyham terrier", "Airedale", "cairn", "Australian terrier", "Dandie Dinmont",
        "Boston bull", "miniature schnauzer", "giant schnauzer", "standard schnauzer", "Scotch terrier",
        "Tibetan terrier", "silky terrier", "soft-coated wheaten terrier", "West Highland white terrier",
        "Lhasa", "flat-coated retriever", "curly-coated retriever", "golden retriever", "Labrador retriever",
        "Chesapeake Bay retriever", "German short-haired pointer", "vizsla", "English setter", "Irish setter",
        "Gordon setter", "Brittany spaniel", "clumber", "English springer", "Welsh springer spaniel",
        "cocker spaniel", "Sussex spaniel", "Irish water spaniel", "kuvasz", "schipperke", "groenendael",
        "malinois", "briard", "kelpie", "komondor", "Old English sheepdog", "Shetland sheepdog", "collie",
        "Border collie", "Bouvier des Flandres", "Rottweiler", "German shepherd", "Doberman",
        "miniature pinscher", "Greater Swiss Mountain dog", "Bernese mountain dog", "Appenzeller", "EntleBucher",
        "boxer", "bull mastiff", "Tibetan mastiff", "French bulldog", "Great Dane", "Saint Bernard",
        "Eskimo dog", "malamute", "Siberian husky", "dalmatian", "affenpinscher", "basenji", "pug", "Leonberg",
        "Newfoundland", "Great Pyrenees", "Samoyed", "Pomeranian", "chow", "keeshond", "Brabancon griffon",
        "Pembroke", "Cardigan", "toy poodle", "miniature poodle", "standard poodle", "Mexican hairless",
        "timber wolf", "white wolf", "red wolf", "coyote", "dingo", "dhole", "African hunting dog", "hyena",
        "red fox", "kit fox", "Arctic fox", "grey fox", "tabby", "tiger cat", "Persian cat", "Siamese cat",
        "Egyptian cat", "cougar", "lynx", "leopard", "snow leopard", "jaguar", "lion", "tiger", "cheetah",
        "brown bear", "American black bear", "ice bear", "sloth bear", "mongoose", "meerkat", "tiger beetle",
        "ladybug", "ground beetle", "long-horned beetle", "leaf beetle", "dung beetle", "rhinoceros beetle",
        "weevil", "fly", "bee", "ant", "grasshopper", "cricket", "walking stick", "cockroach", "mantis",
        "cicada", "leafhopper", "lacewing", "dragonfly", "damselfly", "admiral", "ringlet", "monarch",
        "cabbage butterfly", "sulphur butterfly", "lycaenid", "starfish", "sea urchin", "sea cucumber",
        "wood rabbit", "hare", "Angora", "hamster", "porcupine", "fox squirrel", "marmot", "beaver",
        "guinea pig", "sorrel", "zebra", "hog", "wild boar", "warthog", "hippopotamus", "ox", "water buffalo",
        "bison", "ram", "bighorn", "ibex", "hartebeest", "impala", "gazelle", "Arabian camel", "llama",
        "weasel", "mink", "polecat", "black-footed ferret", "otter", "skunk", "badger", "armadillo",
        "three-toed sloth", "orangutan", "gorilla", "chimpanzee", "gibbon", "siamang", "guenon", "patas",
        "baboon", "macaque", "langur", "colobus", "proboscis monkey", "marmoset", "capuchin", "howler monkey",
        "titi", "spider monkey", "squirrel monkey", "Madagascar cat", "indri", "Indian elephant",
        "African elephant", "lesser panda", "giant panda", "barracouta", "eel", "coho", "rock beauty",
        "anemone fish", "sturgeon", "gar", "lionfish", "puffer", "abacus", "abaya", "academic gown",
        "accordion", "acoustic guitar", "aircraft carrier", "airliner", "airship", "altar", "ambulance",
        "amphibian", "analog clock", "apiary", "apron", "ashcan", "assault rifle", "backpack", "bakery",
        "balance beam", "balloon", "ballpoint", "Band Aid", "banjo", "bannister", "barbell", "barber chair",
        "barbershop", "barn", "barometer", "barrel", "barrow", "baseball", "basketball", "bassinet", "bassoon",
        "bathing cap", "bath towel", "bathtub", "beach wagon", "beacon", "beaker", "bearskin", "beer bottle",
        "beer glass", "bell cote", "bib", "bicycle-built-for-two", "bikini", "binder", "binoculars",
        "birdhouse", "boathouse", "bobsled", "bolo tie", "bonnet", "bookcase", "bookshop", "bottlecap", "bow",
        "bow tie", "brass", "brassiere", "breakwater", "breastplate", "broom", "bucket", "buckle",
        "bulletproof vest", "bullet train", "butcher shop", "cab", "caldron", "candle", "cannon", "canoe",
        "can opener", "cardigan", "car mirror", "carousel", "carpenter's kit", "carton", "car wheel",
        "cash machine", "cassette", "cassette player", "castle", "catamaran", "CD player", "cello",
        "cellular telephone", "chain", "chainlink fence", "chain mail", "chain saw", "chest", "chiffonier",
        "chime", "china cabinet", "Christmas stocking", "church", "cinema", "cleaver", "cliff dwelling",
        "cloak", "clog", "cocktail shaker", "coffee mug", "coffeepot", "coil", "combination lock",
        "computer keyboard", "confectionery", "container ship", "convertible", "corkscrew", "cornet",
        "cowboy boot", "cowboy hat", "cradle", "crane (machine)", "crash helmet", "crate", "crib",
        "Crock Pot", "croquet ball", "crutch", "cuirass", "dam", "desk", "desktop computer", "dial telephone",
        "diaper", "digital clock", "digital watch", "dining table", "dishrag", "dishwasher", "disk brake",
        "dock", "dogsled", "dome", "doormat", "drilling platform", "drum", "drumstick", "dumbbell",
        "Dutch oven", "electric fan", "electric guitar", "electric locomotive", "entertainment center",
        "envelope", "espresso maker", "face powder", "feather boa", "file", "fireboat", "fire engine",
        "fire screen", "flagpole", "flute", "folding chair", "football helmet", "forklift", "fountain",
        "fountain pen", "four-poster", "freight car", "French horn", "frying pan", "fur coat", "garbage truck",
        "gasmask", "gas pump", "goblet", "go-kart", "golf ball", "golfcart", "gondola", "gong", "gown",
        "grand piano", "greenhouse", "grille", "grocery store", "guillotine", "hair slide", "hair spray",
        "half track", "hammer", "hamper", "hand blower", "hand-held computer", "handkerchief", "hard disc",
        "harmonica", "harp", "harvester", "hatchet", "holster", "home theater", "honeycomb", "hook",
        "hoopskirt", "horizontal bar", "horse cart", "hourglass", "iPod", "iron", "jack-o'-lantern", "jean",
        "jeep", "jersey", "jigsaw puzzle", "jinrikisha", "joystick", "kimono", "knee pad", "knot", "lab coat",
        "ladle", "lampshade", "laptop", "lawn mower", "lens cap", "letter opener", "library", "lifeboat",
        "lighter", "limousine", "liner", "lipstick", "Loafer", "lotion", "loudspeaker", "loupe", "lumbermill",
        "magnetic compass", "mailbag", "mailbox", "maillot (tights)", "maillot (tank suit)", "manhole cover",
        "maraca", "marimba", "mask", "matchstick", "maypole", "maze", "measuring cup", "medicine chest",
        "megalith", "microphone", "microwave", "military uniform", "milk can", "minibus", "miniskirt",
        "minivan", "missile", "mitten", "mixing bowl", "mobile home", "Model T", "modem", "monastery",
        "monitor", "moped", "mortar", "mortarboard", "mosque", "mosquito net", "motor scooter", "mountain bike",
        "mountain tent", "mouse", "mousetrap", "moving van", "muzzle", "nail", "neck brace", "necklace",
        "nipple", "notebook", "obelisk", "oboe", "ocarina", "odometer", "oil filter", "organ", "oscilloscope",
        "overskirt", "oxcart", "oxygen mask", "packet", "paddle", "paddlewheel", "padlock", "paintbrush",
        "pajama", "palace", "panpipe", "paper towel", "parachute", "parallel bars", "park bench",
        "parking meter", "passenger car", "patio", "pay-phone", "pedestal", "pencil box", "pencil sharpener",
        "perfume", "Petri dish", "photocopier", "pick", "pickelhaube", "picket fence", "pickup", "pier",
        "piggy bank", "pill bottle", "pillow", "ping-pong ball", "pinwheel", "pirate", "pitcher", "plane",
        "planetarium", "plastic bag", "plate rack", "plow", "plunger", "Polaroid camera", "pole",
        "police van", "poncho", "pool table", "pop bottle", "pot", "potter's wheel", "power drill",
        "prayer rug", "printer", "prison", "projectile", "projector", "puck", "punching bag", "purse",
        "quill", "quilt", "racer", "racket", "radiator", "radio", "radio telescope", "rain barrel",
        "recreational vehicle", "reel", "reflex camera", "refrigerator", "remote control", "restaurant",
        "revolver", "rifle", "rocking chair", "rotisserie", "rubber eraser", "rugby ball", "rule",
        "running shoe", "safe", "safety pin", "saltshaker", "sandal", "sarong", "sax", "scabbard", "scale",
        "school bus", "schooner", "scoreboard", "screen", "screw", "screwdriver", "seat belt", "sewing machine",
        "shield", "shoe shop", "shoji", "shopping basket", "shopping cart", "shovel", "shower cap",
        "shower curtain", "ski", "ski mask", "sleeping bag", "slide rule", "sliding door", "slot", "snorkel",
        "snowmobile", "snowplow", "soap dispenser", "soccer ball", "sock", "solar dish", "sombrero",
        "soup bowl", "space bar", "space heater", "space shuttle", "spatula", "speedboat", "spider web",
        "spindle", "sports car", "spotlight", "stage", "steam locomotive", "steel arch bridge", "steel drum",
        "stethoscope", "stole", "stone wall", "stopwatch", "stove", "strainer", "streetcar", "stretcher",
        "studio couch", "stupa", "submarine", "suit", "sundial", "sunglass", "sunglasses", "sunscreen",
        "suspension bridge", "swab", "sweatshirt", "swimming trunks", "swing", "switch", "syringe",
        "table lamp", "tank", "tape player", "teapot", "teddy", "television", "tennis ball", "thatch",
        "theater curtain", "thimble", "thresher", "throne", "tile roof", "toaster", "tobacco shop",
        "toilet seat", "torch", "totem pole", "tow truck", "toyshop", "tractor", "trailer truck", "tray",
        "trench coat", "tricycle", "trimaran", "tripod", "triumphal arch", "trolleybus", "trombone", "tub",
        "turnstile", "typewriter keyboard", "umbrella", "unicycle", "upright", "vacuum", "vase", "vault",
        "velvet", "vending machine", "vestment", "viaduct", "violin", "volleyball", "waffle iron", "wall clock",
        "wallet", "wardrobe", "warplane", "washbasin", "washer", "water bottle", "water jug", "water tower",
        "whiskey jug", "whistle", "wig", "window screen", "window shade", "Windsor tie", "wine bottle", "wing",
        "wok", "wooden spoon", "wool", "worm fence", "wreck", "yawl", "yurt", "web site", "comic book",
        "crossword puzzle", "street sign", "traffic light", "book jacket", "menu", "plate", "guacamole",
        "consomme", "hot pot", "trifle", "ice cream", "ice lolly", "French loaf", "bagel", "pretzel",
        "cheeseburger", "hotdog", "mashed potato", "head cabbage", "broccoli", "cauliflower", "zucchini",
        "spaghetti squash", "acorn squash", "butternut squash", "cucumber", "artichoke", "bell pepper",
        "cardoon", "mushroom", "Granny Smith", "strawberry", "orange", "lemon", "fig", "pineapple", "banana",
        "jackfruit", "custard apple", "pomegranate", "hay", "carbonara", "chocolate sauce", "dough",
        "meat loaf", "pizza", "potpie", "burrito", "red wine", "espresso", "cup", "eggnog", "alp", "bubble",
        "cliff", "coral reef", "geyser", "lakeside", "promontory", "sandbar", "seashore", "valley", "volcano",
        "ballplayer", "groom", "scuba diver", "rapeseed", "daisy", "yellow lady's slipper", "corn", "acorn",
        "hip", "buckeye", "coral fungus", "agaric", "gyromitra", "stinkhorn", "earthstar", "hen-of-the-woods",
        "bolete", "ear", "toilet tissue"
    };

    // derive the table size from the table itself
    const int num_classes = (int)(sizeof(class_names) / sizeof(class_names[0]));

    cv::Mat image = bgr.clone();

    // top edge of the next text row
    int y_offset = 0;
    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        fprintf(stderr, "%d = %.5f\n", obj.label, obj.prob);

        // guard the table lookup against labels the table does not cover
        const char* class_name = "unknown";
        if (obj.label >= 0 && obj.label < num_classes)
            class_name = class_names[obj.label];

        char text[256];
        sprintf(text, "%4.1f%% %s", obj.prob * 100, class_name);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        int x = 0;
        int y = y_offset;

        // white background sized to the text including the part below the baseline
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));

        // advance by the full row height; advancing by label_size.height only
        // lets the next row's background paint over this row's descenders
        y_offset += label_size.height + baseLine;
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

// Entry point: expects exactly one argument, the path of the image to
// classify. Returns 0 after the results have been displayed, -1 on a usage
// error, when the image cannot be read, or when classification fails.
int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    // do not display anything when the classifier reports an error
    if (detect_yolov8_cls(m, objects) != 0)
    {
        fprintf(stderr, "detect_yolov8_cls failed\n");
        return -1;
    }

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/yolov8_obb.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 1. install
//      pip3 install -U ultralytics pnnx ncnn
// 2. export yolov8-obb torchscript
//      yolo export model=yolov8n-obb.pt format=torchscript
// 3. convert torchscript with static shape
//      pnnx yolov8n-obb.torchscript
// 4. modify yolov8n_obb_pnnx.py for dynamic shape inference
//      A. modify reshape to support dynamic image sizes
//      B. permute tensor before concat and adjust concat axis
//      C. drop post-process part
//      before:
//          v_137 = v_136.view(1, 1, 16384)
//          v_143 = v_142.view(1, 1, 4096)
//          v_149 = v_148.view(1, 1, 1024)
//          v_150 = torch.cat((v_137, v_143, v_149), dim=2)
//          ...
//          v_186 = v_163.view(1, 79, 16384)
//          v_187 = v_174.view(1, 79, 4096)
//          v_188 = v_185.view(1, 79, 1024)
//          v_189 = torch.cat((v_186, v_187, v_188), dim=2)
//          ...
//      after:
//          v_137 = v_136.view(1, 1, -1).transpose(1, 2)
//          v_143 = v_142.view(1, 1, -1).transpose(1, 2)
//          v_149 = v_148.view(1, 1, -1).transpose(1, 2)
//          v_150 = torch.cat((v_137, v_143, v_149), dim=1)
//          ...
//          v_186 = v_163.view(1, 79, -1).transpose(1, 2)
//          v_187 = v_174.view(1, 79, -1).transpose(1, 2)
//          v_188 = v_185.view(1, 79, -1).transpose(1, 2)
//          v_189 = torch.cat((v_186, v_187, v_188), dim=1)
//          return v_189, v_150
// 5. re-export yolov8-obb torchscript
//      python3 -c 'import yolov8n_obb_pnnx; yolov8n_obb_pnnx.export_torchscript()'
// 6. convert new torchscript with dynamic shape
//      pnnx yolov8n_obb_pnnx.py.pt inputshape=[1,3,1024,1024] inputshape2=[1,3,512,512]
// 7. now you get ncnn model files
//      mv yolov8n_obb_pnnx.py.ncnn.param yolov8n_obb.ncnn.param
//      mv yolov8n_obb_pnnx.py.ncnn.bin yolov8n_obb.ncnn.bin

// the first out blob (v_189 above) would be a 2-dim tensor with w=79 h=21504
//
//        | bbox-reg 16 x 4       |score(15)|
//        +-----+-----+-----+-----+---------+
//        | dx0 | dy0 | dx1 | dy1 | 0.1 ... |
//   all /|     |     |     |     |     ... |
//  boxes |  .. |  .. |  .. |  .. | 0.0 ... |
// (21504)|     |     |     |     |  .  ... |
//       \|     |     |     |     |  .  ... |
//        +-----+-----+-----+-----+---------+
//

// the out1 blob (angle predictions) would be a 2-dim tensor with w=1 h=21504
//
//        | degree(1)|
//        +----------+
//        |    0.1   |
//   all /|          |
//  boxes |    0.0   |
// (21504)|     .    |
//       \|     .    |
//        +----------+
//

#include "layer.h"
#include "net.h"

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

#include <float.h>
#include <math.h>
#include <stdio.h>
#include <vector>

// One oriented detection produced by the yolov8-obb post-processing.
struct Object
{
    // rotated bounding box: center, size and angle (angle stored in degrees)
    cv::RotatedRect rrect;
    // index of the best-scoring class
    int label;
    // sigmoid of the best class logit
    float prob;
};

static inline float intersection_area(const Object& a, const Object& b)
{
    std::vector<cv::Point2f> intersection;
    cv::rotatedRectangleIntersection(a.rrect, b.rrect, intersection);
    if (intersection.empty())
        return 0.f;

    return cv::contourArea(intersection);
}

// Recursive quicksort of objects[left..right] (inclusive) into descending
// order of prob. The pivot is the score of the middle element.
static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
{
    const float pivot = objects[(left + right) / 2].prob;

    int lo = left;
    int hi = right;

    while (lo <= hi)
    {
        // skip elements that are already on the correct side of the pivot
        for (; objects[lo].prob > pivot; ++lo) {}
        for (; objects[hi].prob < pivot; --hi) {}

        if (lo > hi)
            break;

        std::swap(objects[lo], objects[hi]);
        ++lo;
        --hi;
    }

    // sort the two partitions; left partition first, then the right one
    if (left < hi)
        qsort_descent_inplace(objects, left, hi);

    if (lo < right)
        qsort_descent_inplace(objects, lo, right);
}

// Sort all objects by descending prob; a no-op for an empty vector.
static void qsort_descent_inplace(std::vector<Object>& objects)
{
    if (!objects.empty())
    {
        // inclusive index range [0, size - 1]
        qsort_descent_inplace(objects, 0, objects.size() - 1);
    }
}

// Greedy non-maximum suppression over rotated boxes.
//
// objects        candidates, expected to be sorted by descending prob
// picked         output: indices into objects of the boxes that are kept
// nms_threshold  a candidate is dropped when its IoU with an already kept box
//                exceeds this value
// agnostic       when false only boxes with the same label suppress each other
static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = objects.size();

    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = objects[i].rrect.size.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = objects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const int kept_index = picked[j];
            const Object& b = objects[kept_index];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[kept_index] - inter_area;
            // float IoU = inter_area / union_area;
            if (inter_area / union_area > nms_threshold)
            {
                keep = 0;
                // one suppressing box is enough; skip the remaining
                // (expensive) rotated intersection computations
                break;
            }
        }

        if (keep)
            picked.push_back(i);
    }
}

// Logistic function 1 / (1 + e^-x), evaluated in single precision.
static inline float sigmoid(float x)
{
    const float neg_exp = expf(-x);
    return 1.0f / (1.0f + neg_exp);
}

// Decode the predictions of one detection scale into oriented-box proposals.
//
// pred            one row per grid cell (row index = y * num_grid_x + x); each row
//                 holds reg_max_1 * 4 box-distribution logits followed by the
//                 per-class logits
// pred_angle      one row per grid cell; element 0 is the raw angle logit
// stride          downsample factor of this scale
// in_pad          padded network input; only its w/h are read
// prob_threshold  minimum sigmoid score for a cell to yield a proposal
// objects         proposals are appended here, in padded-input pixel coordinates
static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_angle, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    const int w = in_pad.w;
    const int h = in_pad.h;

    const int num_grid_x = w / stride;
    const int num_grid_y = h / stride;

    const int reg_max_1 = 16;
    const int num_class = pred.w - reg_max_1 * 4; // number of classes. 15 for DOTAv1

    // The softmax layer is identical for every cell, so build it once instead
    // of creating and destroying it for each candidate.
    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_packing_layout = false;

    ncnn::Layer* softmax = ncnn::create_layer("Softmax");
    if (!softmax)
    {
        fprintf(stderr, "create_layer Softmax failed\n");
        return;
    }

    {
        ncnn::ParamDict pd;
        pd.set(0, 1); // axis
        pd.set(1, 1); // NOTE(review): presumably the Softmax "fixbug0" switch - confirm
        softmax->load_param(pd);
    }

    softmax->create_pipeline(opt);

    for (int y = 0; y < num_grid_y; y++)
    {
        for (int x = 0; x < num_grid_x; x++)
        {
            const int grid_index = y * num_grid_x + x;
            const ncnn::Mat pred_grid = pred.row_range(grid_index, 1);

            // find label with max score
            int label = -1;
            float score = -FLT_MAX;
            {
                const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class);

                for (int k = 0; k < num_class; k++)
                {
                    float s = pred_score[k];
                    if (s > score)
                    {
                        label = k;
                        score = s;
                    }
                }

                // sigmoid is monotonic, so it is applied to the winner only
                score = sigmoid(score);
            }

            if (score < prob_threshold)
                continue;

            // view the 64 box logits as 4 rows (l, t, r, b) of reg_max_1 bins;
            // clone because softmax runs in place
            ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone();

            softmax->forward_inplace(pred_bbox, opt);

            // expected bin index of each side distribution, scaled to pixels
            float pred_ltrb[4];
            for (int k = 0; k < 4; k++)
            {
                float dis = 0.f;
                const float* dis_after_sm = pred_bbox.row(k);
                for (int l = 0; l < reg_max_1; l++)
                {
                    dis += l * dis_after_sm[l];
                }

                pred_ltrb[k] = dis * stride;
            }

            // center of the grid cell in padded-input pixels
            float pb_cx = (x + 0.5f) * stride;
            float pb_cy = (y + 0.5f) * stride;

            // angle as a fraction of pi (sigmoid output shifted by -0.25)
            const float angle = sigmoid(pred_angle.row(grid_index)[0]) - 0.25f;

            const float angle_rad = angle * 3.14159265358979323846f;
            const float angle_degree = angle * 180.f;

            const float cos_a = cosf(angle_rad);
            const float sin_a = sinf(angle_rad);

            // offset of the box center from the cell center, rotated by the angle
            float xx = (pred_ltrb[2] - pred_ltrb[0]) * 0.5f;
            float yy = (pred_ltrb[3] - pred_ltrb[1]) * 0.5f;
            float xr = xx * cos_a - yy * sin_a;
            float yr = xx * sin_a + yy * cos_a;
            const float cx = pb_cx + xr;
            const float cy = pb_cy + yr;
            const float ww = pred_ltrb[2] + pred_ltrb[0];
            const float hh = pred_ltrb[3] + pred_ltrb[1];

            Object obj;
            obj.rrect = cv::RotatedRect(cv::Point2f(cx, cy), cv::Size_<float>(ww, hh), angle_degree);
            obj.label = label;
            obj.prob = score;

            objects.push_back(obj);
        }
    }

    softmax->destroy_pipeline(opt);

    delete softmax;
}

// Decode every detection scale. The rows of pred / pred_angle hold the scales
// back to back in the order given by strides, so each scale is handed a row
// window of (w / stride) * (h / stride) cells.
static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_angle, const std::vector<int>& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    int row_begin = 0;

    for (std::vector<int>::const_iterator it = strides.begin(); it != strides.end(); ++it)
    {
        const int stride = *it;

        // number of grid cells of this scale
        const int grid_count = (in_pad.w / stride) * (in_pad.h / stride);

        const ncnn::Mat level_pred = pred.row_range(row_begin, grid_count);
        const ncnn::Mat level_angle = pred_angle.row_range(row_begin, grid_count);

        generate_proposals(level_pred, level_angle, stride, in_pad, prob_threshold, objects);

        row_begin += grid_count;
    }
}

// Run yolov8-obb on a BGR image and return the oriented detections.
//
// bgr      input image (BGR, 8-bit, as loaded by cv::imread)
// objects  output detections in original image coordinates; cleared first
//
// Returns 0 on success (including "nothing detected") and -1 when the model
// files cannot be loaded or inference fails.
static int detect_yolov8_obb(const cv::Mat& bgr, std::vector<Object>& objects)
{
    // never hand back stale results from a previous call
    objects.clear();

    ncnn::Net yolov8;

    yolov8.opt.use_vulkan_compute = true;
    // yolov8.opt.use_bf16_storage = true;

    // https://github.com/nihui/ncnn-android-yolov8/tree/master/app/src/main/assets
    if (yolov8.load_param("yolov8n_obb.ncnn.param") != 0 || yolov8.load_model("yolov8n_obb.ncnn.bin") != 0)
    {
        fprintf(stderr, "failed to load yolov8n_obb.ncnn.param / yolov8n_obb.ncnn.bin\n");
        return -1;
    }
    // yolov8.load_param("yolov8s_obb.ncnn.param");
    // yolov8.load_model("yolov8s_obb.ncnn.bin");
    // yolov8.load_param("yolov8m_obb.ncnn.param");
    // yolov8.load_model("yolov8m_obb.ncnn.bin");

    const int target_size = 1024;
    const float prob_threshold = 0.25f;
    const float nms_threshold = 0.45f;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // ultralytics/cfg/models/v8/yolov8.yaml
    std::vector<int> strides(3);
    strides[0] = 8;
    strides[1] = 16;
    strides[2] = 32;
    const int max_stride = 32;

    // resize so that the longer side equals target_size, keeping aspect ratio
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // letterbox pad each side up to the next multiple of max_stride,
    // splitting the padding evenly between both borders
    int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
    int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    // scale pixel values to [0, 1]
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolov8.create_extractor();

    ex.input("in0", in_pad);

    ncnn::Mat out;
    ncnn::Mat out_angle;
    if (ex.extract("out0", out) != 0 || ex.extract("out1", out_angle) != 0)
    {
        fprintf(stderr, "yolov8 obb inference failed\n");
        return -1;
    }

    std::vector<Object> proposals;
    generate_proposals(out, out_angle, strides, in_pad, prob_threshold, proposals);

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    int count = picked.size();
    if (count == 0)
        return 0;

    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        Object obj = proposals[picked[i]];

        // undo the letterbox padding and the resize to get original image coordinates
        obj.rrect.center.x = (obj.rrect.center.x - (wpad / 2)) / scale;
        obj.rrect.center.y = (obj.rrect.center.y - (hpad / 2)) / scale;
        obj.rrect.size.width = (obj.rrect.size.width) / scale;
        obj.rrect.size.height = (obj.rrect.size.height) / scale;

        objects[i] = obj;
    }

    return 0;
}

// Draw the oriented boxes and their labels on a copy of the image, print one
// line per detection to stderr and show the result in a window until a key
// is pressed.
//
// Labels outside the built-in class table are drawn as "unknown" and colors
// wrap around, so models with a different class count do not read out of bounds.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "plane", "ship", "storage tank", "baseball diamond", "tennis court",
        "basketball court", "ground track field", "harbor", "bridge", "large vehicle",
        "small vehicle", "helicopter", "roundabout", "soccer ball field", "swimming pool"
    };

    static const cv::Scalar colors[] = {
        cv::Scalar(156, 39, 176),
        cv::Scalar(103, 58, 183),
        cv::Scalar(63, 81, 181),
        cv::Scalar(33, 150, 243),
        cv::Scalar(3, 169, 244),
        cv::Scalar(0, 188, 212),
        cv::Scalar(0, 150, 136),
        cv::Scalar(76, 175, 80),
        cv::Scalar(139, 195, 74),
        cv::Scalar(205, 220, 57),
        cv::Scalar(255, 235, 59),
        cv::Scalar(255, 193, 7),
        cv::Scalar(255, 152, 0),
        cv::Scalar(255, 87, 34),
        cv::Scalar(121, 85, 72),
        cv::Scalar(158, 158, 158),
        cv::Scalar(96, 125, 139)
    };

    const int num_class_names = sizeof(class_names) / sizeof(class_names[0]);
    const int num_colors = sizeof(colors) / sizeof(colors[0]);

    cv::Mat image = bgr.clone();

    // first pass: box outlines, so that labels drawn later stay on top
    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        // wrap the label into the color table (also safe for negative labels)
        const cv::Scalar& color = colors[((obj.label % num_colors) + num_colors) % num_colors];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f  @ %.2f\n", obj.label, obj.prob,
                obj.rrect.center.x, obj.rrect.center.y, obj.rrect.size.width, obj.rrect.size.height, obj.rrect.angle);

        cv::Point2f corners[4];
        obj.rrect.points(corners);
        cv::line(image, corners[0], corners[1], color);
        cv::line(image, corners[1], corners[2], color);
        cv::line(image, corners[2], corners[3], color);
        cv::line(image, corners[3], corners[0], color);
    }

    // second pass: text labels centered on each box
    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        const char* class_name = "unknown";
        if (obj.label >= 0 && obj.label < num_class_names)
            class_name = class_names[obj.label];

        char text[256];
        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // keep the label rectangle inside the image
        int x = obj.rrect.center.x - label_size.width / 2;
        int y = obj.rrect.center.y - label_size.height / 2 - baseLine;
        if (y < 0)
            y = 0;
        if (y + label_size.height > image.rows)
            y = image.rows - label_size.height;
        if (x < 0)
            x = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_yolov8_obb(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/yolov8_pose.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 1. install
//      pip3 install -U ultralytics pnnx ncnn
// 2. export yolov8-pose torchscript
//      yolo export model=yolov8n-pose.pt format=torchscript
// 3. convert torchscript with static shape
//      pnnx yolov8n-pose.torchscript
// 4. modify yolov8n_pose_pnnx.py for dynamic shape inference
//      A. modify reshape to support dynamic image sizes
//      B. permute tensor before concat and adjust concat axis
//      C. drop post-process part
//      before:
//          v_137 = v_136.view(1, 51, 6400)
//          v_143 = v_142.view(1, 51, 1600)
//          v_149 = v_148.view(1, 51, 400)
//          v_150 = torch.cat((v_137, v_143, v_149), dim=-1)
//          ...
//          v_184 = v_161.view(1, 65, 6400)
//          v_185 = v_172.view(1, 65, 1600)
//          v_186 = v_183.view(1, 65, 400)
//          v_187 = torch.cat((v_184, v_185, v_186), dim=2)
//          ...
//      after:
//          v_137 = v_136.view(1, 51, -1).transpose(1, 2)
//          v_143 = v_142.view(1, 51, -1).transpose(1, 2)
//          v_149 = v_148.view(1, 51, -1).transpose(1, 2)
//          v_150 = torch.cat((v_137, v_143, v_149), dim=1)
//          ...
//          v_184 = v_161.view(1, 65, -1).transpose(1, 2)
//          v_185 = v_172.view(1, 65, -1).transpose(1, 2)
//          v_186 = v_183.view(1, 65, -1).transpose(1, 2)
//          v_187 = torch.cat((v_184, v_185, v_186), dim=1)
//          return v_187, v_150
// 5. re-export yolov8-pose torchscript
//      python3 -c 'import yolov8n_pose_pnnx; yolov8n_pose_pnnx.export_torchscript()'
// 6. convert new torchscript with dynamic shape
//      pnnx yolov8n_pose_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320]
// 7. now you get ncnn model files
//      mv yolov8n_pose_pnnx.py.ncnn.param yolov8n_pose.ncnn.param
//      mv yolov8n_pose_pnnx.py.ncnn.bin yolov8n_pose.ncnn.bin

// the out blob would be a 2-dim tensor with w=65 h=8400
//
//        | bbox-reg 16 x 4       |score(1)|
//        +-----+-----+-----+-----+--------+
//        | dx0 | dy0 | dx1 | dy1 |   0.1  |
//   all /|     |     |     |     |        |
//  boxes |  .. |  .. |  .. |  .. |   0.0  |
//  (8400)|     |     |     |     |   .    |
//       \|     |     |     |     |   .    |
//        +-----+-----+-----+-----+--------+
//

//
//        | pose (51) |
//        +-----------+
//        |0.1........|
//   all /|           |
//  boxes |0.0........|
//  (8400)|     .     |
//       \|     .     |
//        +-----------+
//

#include "layer.h"
#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>

// One pose keypoint of a detected object.
struct KeyPoint
{
    // keypoint position in image pixels
    cv::Point2f p;
    // sigmoid of the keypoint confidence logit
    float prob;
};

// One detection produced by the yolov8-pose post-processing.
struct Object
{
    // axis-aligned bounding box (x, y, width, height)
    cv::Rect_<float> rect;
    // class index; generate_proposals in this file always sets it to 0
    int label;
    // sigmoid of the class score logit
    float prob;
    // decoded keypoints of this detection
    std::vector<KeyPoint> keypoints;
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Recursive quicksort of objects[left..right] (inclusive) into descending
// order of prob. The pivot is the score of the middle element.
static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
{
    const float pivot = objects[(left + right) / 2].prob;

    int lo = left;
    int hi = right;

    while (lo <= hi)
    {
        // skip elements that are already on the correct side of the pivot
        for (; objects[lo].prob > pivot; ++lo) {}
        for (; objects[hi].prob < pivot; --hi) {}

        if (lo > hi)
            break;

        std::swap(objects[lo], objects[hi]);
        ++lo;
        --hi;
    }

    // sort the two partitions; left partition first, then the right one
    if (left < hi)
        qsort_descent_inplace(objects, left, hi);

    if (lo < right)
        qsort_descent_inplace(objects, lo, right);
}

// Sort all objects by descending prob; a no-op for an empty vector.
static void qsort_descent_inplace(std::vector<Object>& objects)
{
    if (!objects.empty())
    {
        // inclusive index range [0, size - 1]
        qsort_descent_inplace(objects, 0, objects.size() - 1);
    }
}

// Greedy non-maximum suppression over axis-aligned boxes.
//
// objects        candidates, expected to be sorted by descending prob
// picked         output: indices into objects of the boxes that are kept
// nms_threshold  a candidate is dropped when its IoU with an already kept box
//                exceeds this value
// agnostic       when false only boxes with the same label suppress each other
static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = objects.size();

    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = objects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = objects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const int kept_index = picked[j];
            const Object& b = objects[kept_index];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[kept_index] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
            {
                keep = 0;
                // one suppressing box is enough; no need to test the rest
                break;
            }
        }

        if (keep)
            picked.push_back(i);
    }
}

// Logistic function 1 / (1 + e^-x), evaluated in single precision.
static inline float sigmoid(float x)
{
    const float neg_exp = expf(-x);
    return 1.0f / (1.0f + neg_exp);
}

// Decode the predictions of one detection scale into pose proposals.
//
// pred            one row per grid cell (row index = y * num_grid_x + x); each row
//                 holds reg_max_1 * 4 box-distribution logits followed by the
//                 score logit
// pred_points     one row per grid cell with 3 values (x, y, confidence logit)
//                 per keypoint
// stride          downsample factor of this scale
// in_pad          padded network input; only its w/h are read
// prob_threshold  minimum sigmoid score for a cell to yield a proposal
// objects         proposals are appended here, in padded-input pixel coordinates
static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_points, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    const int w = in_pad.w;
    const int h = in_pad.h;

    const int num_grid_x = w / stride;
    const int num_grid_y = h / stride;

    const int reg_max_1 = 16;
    const int num_points = pred_points.w / 3;

    // The softmax layer is identical for every cell, so build it once instead
    // of creating and destroying it for each candidate.
    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_packing_layout = false;

    ncnn::Layer* softmax = ncnn::create_layer("Softmax");
    if (!softmax)
    {
        fprintf(stderr, "create_layer Softmax failed\n");
        return;
    }

    {
        ncnn::ParamDict pd;
        pd.set(0, 1); // axis
        pd.set(1, 1); // NOTE(review): presumably the Softmax "fixbug0" switch - confirm
        softmax->load_param(pd);
    }

    softmax->create_pipeline(opt);

    for (int y = 0; y < num_grid_y; y++)
    {
        for (int x = 0; x < num_grid_x; x++)
        {
            const int grid_index = y * num_grid_x + x;
            const ncnn::Mat pred_grid = pred.row_range(grid_index, 1);

            // single class model: the label is always 0
            int label = 0;
            float score = sigmoid(pred_grid[reg_max_1 * 4]);

            if (score < prob_threshold)
                continue;

            // view the 64 box logits as 4 rows (l, t, r, b) of reg_max_1 bins;
            // clone because softmax runs in place
            ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone();

            softmax->forward_inplace(pred_bbox, opt);

            // expected bin index of each side distribution, scaled to pixels
            float pred_ltrb[4];
            for (int k = 0; k < 4; k++)
            {
                float dis = 0.f;
                const float* dis_after_sm = pred_bbox.row(k);
                for (int l = 0; l < reg_max_1; l++)
                {
                    dis += l * dis_after_sm[l];
                }

                pred_ltrb[k] = dis * stride;
            }

            // center of the grid cell in padded-input pixels
            float pb_cx = (x + 0.5f) * stride;
            float pb_cy = (y + 0.5f) * stride;

            float x0 = pb_cx - pred_ltrb[0];
            float y0 = pb_cy - pred_ltrb[1];
            float x1 = pb_cx + pred_ltrb[2];
            float y1 = pb_cy + pred_ltrb[3];

            // one row of (x, y, confidence logit) per keypoint; only built for
            // cells that passed the score threshold
            const ncnn::Mat pred_points_grid = pred_points.row_range(grid_index, 1).reshape(3, num_points);

            std::vector<KeyPoint> keypoints;
            for (int k = 0; k < num_points; k++)
            {
                const float* point = pred_points_grid.row(k);

                KeyPoint keypoint;
                keypoint.p.x = (x + point[0] * 2) * stride;
                keypoint.p.y = (y + point[1] * 2) * stride;
                keypoint.prob = sigmoid(point[2]);
                keypoints.push_back(keypoint);
            }

            Object obj;
            obj.rect.x = x0;
            obj.rect.y = y0;
            obj.rect.width = x1 - x0;
            obj.rect.height = y1 - y0;
            obj.label = label;
            obj.prob = score;
            obj.keypoints = keypoints;

            objects.push_back(obj);
        }
    }

    softmax->destroy_pipeline(opt);

    delete softmax;
}

// Decode every detection scale. The rows of pred / pred_points hold the scales
// back to back in the order given by strides, so each scale is handed a row
// window of (w / stride) * (h / stride) cells.
static void generate_proposals(const ncnn::Mat& pred, const ncnn::Mat& pred_points, const std::vector<int>& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    int row_begin = 0;

    for (std::vector<int>::const_iterator it = strides.begin(); it != strides.end(); ++it)
    {
        const int stride = *it;

        // number of grid cells of this scale
        const int grid_count = (in_pad.w / stride) * (in_pad.h / stride);

        const ncnn::Mat level_pred = pred.row_range(row_begin, grid_count);
        const ncnn::Mat level_points = pred_points.row_range(row_begin, grid_count);

        generate_proposals(level_pred, level_points, stride, in_pad, prob_threshold, objects);

        row_begin += grid_count;
    }
}

// Run yolov8-pose on a BGR image and return the detections with keypoints.
//
// bgr      input image (BGR, 8-bit, as loaded by cv::imread)
// objects  output detections in original image coordinates; cleared first
//
// Returns 0 on success (including "nothing detected") and -1 when the model
// files cannot be loaded or inference fails.
static int detect_yolov8_pose(const cv::Mat& bgr, std::vector<Object>& objects)
{
    // never hand back stale results from a previous call
    objects.clear();

    ncnn::Net yolov8;

    yolov8.opt.use_vulkan_compute = true;
    // yolov8.opt.use_bf16_storage = true;

    // https://github.com/nihui/ncnn-android-yolov8/tree/master/app/src/main/assets
    if (yolov8.load_param("yolov8n_pose.ncnn.param") != 0 || yolov8.load_model("yolov8n_pose.ncnn.bin") != 0)
    {
        fprintf(stderr, "failed to load yolov8n_pose.ncnn.param / yolov8n_pose.ncnn.bin\n");
        return -1;
    }
    // yolov8.load_param("yolov8s_pose.ncnn.param");
    // yolov8.load_model("yolov8s_pose.ncnn.bin");
    // yolov8.load_param("yolov8m_pose.ncnn.param");
    // yolov8.load_model("yolov8m_pose.ncnn.bin");

    const int target_size = 640;
    const float prob_threshold = 0.25f;
    const float nms_threshold = 0.45f;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // ultralytics/cfg/models/v8/yolov8.yaml
    std::vector<int> strides(3);
    strides[0] = 8;
    strides[1] = 16;
    strides[2] = 32;
    const int max_stride = 32;

    // resize so that the longer side equals target_size, keeping aspect ratio
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // letterbox pad each side up to the next multiple of max_stride,
    // splitting the padding evenly between both borders
    int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
    int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    // scale pixel values to [0, 1]
    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolov8.create_extractor();

    ex.input("in0", in_pad);

    ncnn::Mat out;
    ncnn::Mat out_points;
    if (ex.extract("out0", out) != 0 || ex.extract("out1", out_points) != 0)
    {
        // an empty blob would be dereferenced by generate_proposals
        fprintf(stderr, "yolov8 pose inference failed\n");
        return -1;
    }

    std::vector<Object> proposals;
    generate_proposals(out, out_points, strides, in_pad, prob_threshold, proposals);

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    int count = picked.size();
    if (count == 0)
        return 0;

    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];

        // undo the letterbox padding and the resize to get original image coordinates
        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;

        std::vector<KeyPoint>& keypoints = objects[i].keypoints;
        for (size_t j = 0; j < keypoints.size(); j++)
        {
            keypoints[j].p.x = (keypoints[j].p.x - (wpad / 2)) / scale;
            keypoints[j].p.y = (keypoints[j].p.y - (hpad / 2)) / scale;
        }

        // clip the box to the image; keypoints are left unclipped
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);

        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;
    }

    return 0;
}

// Draw skeletons, keypoints, boxes and labels on a copy of the image, print
// the detections to stderr and show the result in a window until a key is
// pressed.
//
// Bones whose keypoint index is not present in a detection are skipped, so
// models with fewer keypoints than the built-in skeleton do not read out of
// bounds.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {"person"};

    static const cv::Scalar colors[] = {
        cv::Scalar(244, 67, 54),
        cv::Scalar(233, 30, 99),
        cv::Scalar(156, 39, 176),
        cv::Scalar(103, 58, 183),
        cv::Scalar(63, 81, 181),
        cv::Scalar(33, 150, 243),
        cv::Scalar(3, 169, 244),
        cv::Scalar(0, 188, 212),
        cv::Scalar(0, 150, 136),
        cv::Scalar(76, 175, 80),
        cv::Scalar(139, 195, 74),
        cv::Scalar(205, 220, 57),
        cv::Scalar(255, 235, 59),
        cv::Scalar(255, 193, 7),
        cv::Scalar(255, 152, 0),
        cv::Scalar(255, 87, 34),
        cv::Scalar(121, 85, 72),
        cv::Scalar(158, 158, 158),
        cv::Scalar(96, 125, 139)
    };

    // keypoint index pairs that form the bones of the skeleton
    static const int joint_pairs[16][2] = {
        {0, 1}, {1, 3}, {0, 2}, {2, 4}, {5, 6}, {5, 7}, {7, 9}, {6, 8}, {8, 10}, {5, 11}, {6, 12}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}
    };
    // one color per entry of joint_pairs
    static const cv::Scalar bone_colors[] = {
        cv::Scalar(0, 255, 0),
        cv::Scalar(0, 255, 0),
        cv::Scalar(0, 255, 0),
        cv::Scalar(0, 255, 0),
        cv::Scalar(255, 128, 0),
        cv::Scalar(255, 128, 0),
        cv::Scalar(255, 128, 0),
        cv::Scalar(255, 128, 0),
        cv::Scalar(255, 128, 0),
        cv::Scalar(255, 51, 255),
        cv::Scalar(255, 51, 255),
        cv::Scalar(255, 51, 255),
        cv::Scalar(51, 153, 255),
        cv::Scalar(51, 153, 255),
        cv::Scalar(51, 153, 255),
        cv::Scalar(51, 153, 255),
    };

    const int num_class_names = sizeof(class_names) / sizeof(class_names[0]);
    const size_t num_colors = sizeof(colors) / sizeof(colors[0]);
    const int num_bones = sizeof(joint_pairs) / sizeof(joint_pairs[0]);

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        // cycle through the palette, one color per detection
        const cv::Scalar& color = colors[i % num_colors];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        // draw bone
        const size_t num_keypoints = obj.keypoints.size();
        for (int j = 0; j < num_bones; j++)
        {
            const size_t ia = joint_pairs[j][0];
            const size_t ib = joint_pairs[j][1];

            // skip bones that reference a keypoint this detection does not have
            if (ia >= num_keypoints || ib >= num_keypoints)
                continue;

            const KeyPoint& p1 = obj.keypoints[ia];
            const KeyPoint& p2 = obj.keypoints[ib];

            if (p1.prob < 0.2f || p2.prob < 0.2f)
                continue;

            cv::line(image, p1.p, p2.p, bone_colors[j], 2);
        }

        // draw joint
        for (size_t j = 0; j < num_keypoints; j++)
        {
            const KeyPoint& keypoint = obj.keypoints[j];

            fprintf(stderr, "%.2f %.2f = %.5f\n", keypoint.p.x, keypoint.p.y, keypoint.prob);

            if (keypoint.prob < 0.2f)
                continue;

            cv::circle(image, keypoint.p, 3, color, -1);
        }

        cv::rectangle(image, obj.rect, color);

        const char* class_name = "unknown";
        if (obj.label >= 0 && obj.label < num_class_names)
            class_name = class_names[obj.label];

        char text[256];
        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // place the label above the box, kept inside the image
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_yolov8_pose(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/yolov8_seg.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 1. install
//      pip3 install -U ultralytics pnnx ncnn
// 2. export yolov8-seg torchscript
//      yolo export model=yolov8n-seg.pt format=torchscript
// 3. convert torchscript with static shape
//      pnnx yolov8n-seg.torchscript
// 4. modify yolov8n_seg_pnnx.py for dynamic shape inference
//      A. modify reshape to support dynamic image sizes
//      B. permute tensor before concat and adjust concat axis
//      C. drop post-process part
//      before:
//          v_144 = v_143.view(1, 32, 6400)
//          v_150 = v_149.view(1, 32, 1600)
//          v_156 = v_155.view(1, 32, 400)
//          v_157 = torch.cat((v_144, v_150, v_156), dim=2)
//          ...
//          v_191 = v_168.view(1, 144, 6400)
//          v_192 = v_179.view(1, 144, 1600)
//          v_193 = v_190.view(1, 144, 400)
//          v_194 = torch.cat((v_191, v_192, v_193), dim=2)
//          ...
//          v_215 = (v_214, v_138, )
//          return v_215
//      after:
//          v_144 = v_143.view(1, 32, -1).transpose(1, 2)
//          v_150 = v_149.view(1, 32, -1).transpose(1, 2)
//          v_156 = v_155.view(1, 32, -1).transpose(1, 2)
//          v_157 = torch.cat((v_144, v_150, v_156), dim=1)
//          ...
//          v_191 = v_168.view(1, 144, -1).transpose(1, 2)
//          v_192 = v_179.view(1, 144, -1).transpose(1, 2)
//          v_193 = v_190.view(1, 144, -1).transpose(1, 2)
//          v_194 = torch.cat((v_191, v_192, v_193), dim=1)
//          return v_194, v_157, v_138
// 5. re-export yolov8-seg torchscript
//      python3 -c 'import yolov8n_seg_pnnx; yolov8n_seg_pnnx.export_torchscript()'
// 6. convert new torchscript with dynamic shape
//      pnnx yolov8n_seg_pnnx.py.pt inputshape=[1,3,640,640] inputshape2=[1,3,320,320]
// 7. now you get ncnn model files
//      mv yolov8n_seg_pnnx.py.ncnn.param yolov8n_seg.ncnn.param
//      mv yolov8n_seg_pnnx.py.ncnn.bin yolov8n_seg.ncnn.bin

// the out0 blob would be a 2-dim tensor with w=144 h=8400 (box + class scores),
// and the out1 blob a 2-dim tensor with w=32 h=8400 (mask coefficients)
//
//        | bbox-reg 16 x 4       | per-class scores(80) |
//        +-----+-----+-----+-----+----------------------+
//        | dx0 | dy0 | dx1 | dy1 |0.1 0.0 0.0 0.5 ......|
//   all /|     |     |     |     |           .          |
//  boxes |  .. |  .. |  .. |  .. |0.0 0.9 0.0 0.0 ......|
//  (8400)|     |     |     |     |           .          |
//       \|     |     |     |     |           .          |
//        +-----+-----+-----+-----+----------------------+
//

//
//        | mask (32) |
//        +-----------+
//        |0.1........|
//   all /|           |
//  boxes |0.0........|
//  (8400)|     .     |
//       \|     .     |
//        +-----------+
//

#include "layer.h"
#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>

// One detected instance: bounding box, class, confidence and segmentation mask.
struct Object
{
    // bounding box; generate_proposals() fills it in padded network-input pixels,
    // detect_yolov8_seg() then rescales and clips it to the source image
    cv::Rect_<float> rect;
    // index of the best-scoring class for this box
    int label;
    // sigmoid of the best class score
    float prob;
    // row of this candidate in the prediction blob, used to fetch its mask coefficients
    int gindex;
    // CV_8UC1 image of (int)rect.height x (int)rect.width, 1 inside the instance and 0 outside
    cv::Mat mask;
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Recursive quicksort of objects[left..right] (both ends inclusive) by
// descending prob. The pivot is the score of the middle element.
static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
{
    int lo = left;
    int hi = right;
    const float pivot = objects[(left + right) / 2].prob;

    while (lo <= hi)
    {
        // skip entries that already sit on the correct side of the pivot
        for (; objects[lo].prob > pivot; ++lo)
        {
        }
        for (; objects[hi].prob < pivot; --hi)
        {
        }

        if (lo > hi)
            break;

        std::swap(objects[lo], objects[hi]);
        ++lo;
        --hi;
    }

    // the two halves are independent; they are sorted one after the other here
    if (left < hi)
        qsort_descent_inplace(objects, left, hi);

    if (lo < right)
        qsort_descent_inplace(objects, lo, right);
}

// Sort the whole vector by descending prob; an empty vector is left untouched.
static void qsort_descent_inplace(std::vector<Object>& objects)
{
    if (!objects.empty())
        qsort_descent_inplace(objects, 0, objects.size() - 1);
}

static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = objects.size();

    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = objects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = objects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = objects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
                keep = 0;
        }

        if (keep)
            picked.push_back(i);
    }
}

// Logistic function 1 / (1 + e^-x), single precision.
static inline float sigmoid(float x)
{
    const float decay = expf(-x);
    return 1.0f / (decay + 1.0f);
}

// Decode the predictions of one stride level into candidate boxes.
//
// pred            2-dim blob, one row per grid cell in row-major grid order;
//                 each row holds 4 x 16 box-distribution logits followed by
//                 the per-class scores
// stride          pixel stride of this level
// in_pad          padded network input, only its w/h are read to size the grid
// prob_threshold  minimum sigmoid(best class score) for a cell to be kept
// objects         output, candidates are appended; gindex is the cell's row
//                 index within pred, rect is in padded-input pixels
static void generate_proposals(const ncnn::Mat& pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    const int w = in_pad.w;
    const int h = in_pad.h;

    const int num_grid_x = w / stride;
    const int num_grid_y = h / stride;

    const int reg_max_1 = 16;
    const int num_class = pred.w - reg_max_1 * 4; // number of classes. 80 for COCO

    // The softmax layer does not depend on the grid cell, so it is built once
    // for the whole level instead of once per accepted cell.
    ncnn::Option opt;
    opt.num_threads = 1;
    opt.use_packing_layout = false;

    ncnn::Layer* softmax = ncnn::create_layer("Softmax");
    if (!softmax)
    {
        fprintf(stderr, "create_layer Softmax failed\n");
        return;
    }

    {
        ncnn::ParamDict pd;
        pd.set(0, 1); // axis
        pd.set(1, 1); // NOTE(review): presumably the Softmax compatibility flag - confirm against the ncnn operator docs
        softmax->load_param(pd);
    }

    softmax->create_pipeline(opt);

    for (int y = 0; y < num_grid_y; y++)
    {
        for (int x = 0; x < num_grid_x; x++)
        {
            const ncnn::Mat pred_grid = pred.row_range(y * num_grid_x + x, 1);

            // find label with max score
            int label = -1;
            float score = -FLT_MAX;
            {
                const ncnn::Mat pred_score = pred_grid.range(reg_max_1 * 4, num_class);

                for (int k = 0; k < num_class; k++)
                {
                    float s = pred_score[k];
                    if (s > score)
                    {
                        label = k;
                        score = s;
                    }
                }

                // only the winner needs the sigmoid, it is monotonic
                score = sigmoid(score);
            }

            if (score >= prob_threshold)
            {
                // 4 rows (left, top, right, bottom) of 16 logits each; cloned
                // because the softmax below runs in place
                ncnn::Mat pred_bbox = pred_grid.range(0, reg_max_1 * 4).reshape(reg_max_1, 4).clone();

                softmax->forward_inplace(pred_bbox, opt);

                // expected value of each distribution, scaled to pixels
                float pred_ltrb[4];
                for (int k = 0; k < 4; k++)
                {
                    float dis = 0.f;
                    const float* dis_after_sm = pred_bbox.row(k);
                    for (int l = 0; l < reg_max_1; l++)
                    {
                        dis += l * dis_after_sm[l];
                    }

                    pred_ltrb[k] = dis * stride;
                }

                // the distances are measured from the centre of the grid cell
                float pb_cx = (x + 0.5f) * stride;
                float pb_cy = (y + 0.5f) * stride;

                float x0 = pb_cx - pred_ltrb[0];
                float y0 = pb_cy - pred_ltrb[1];
                float x1 = pb_cx + pred_ltrb[2];
                float y1 = pb_cy + pred_ltrb[3];

                Object obj;
                obj.rect.x = x0;
                obj.rect.y = y0;
                obj.rect.width = x1 - x0;
                obj.rect.height = y1 - y0;
                obj.label = label;
                obj.prob = score;
                obj.gindex = y * num_grid_x + x;

                objects.push_back(obj);
            }
        }
    }

    softmax->destroy_pipeline(opt);

    delete softmax;
}

// Decode every stride level of the concatenated prediction blob.
// The rows of pred are laid out level after level in the order of strides;
// each level contributes (in_pad.w / stride) * (in_pad.h / stride) rows.
// Candidates are appended to objects with gindex rebased to the full blob.
static void generate_proposals(const ncnn::Mat& pred, const std::vector<int>& strides, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
{
    int row_base = 0;

    for (size_t s = 0; s < strides.size(); s++)
    {
        const int stride = strides[s];
        const int level_rows = (in_pad.w / stride) * (in_pad.h / stride);

        std::vector<Object> level_objects;
        generate_proposals(pred.row_range(row_base, level_rows), stride, in_pad, prob_threshold, level_objects);

        for (size_t k = 0; k < level_objects.size(); k++)
        {
            // turn the level-local row index into a row of the whole blob
            level_objects[k].gindex += row_base;
            objects.push_back(level_objects[k]);
        }

        row_base += level_rows;
    }
}

// Run YOLOv8-seg on an image and return the detections that survive NMS,
// each with a binary instance mask cropped to its bounding box.
//
// bgr      source image, read as packed BGR pixels
// objects  output; cleared on entry, boxes are in source-image pixels
// returns  0 on success (also when nothing is detected), -1 when the model
//          files cannot be loaded or a network output cannot be extracted
static int detect_yolov8_seg(const cv::Mat& bgr, std::vector<Object>& objects)
{
    // never hand back stale results, whichever way this function is left
    objects.clear();

    ncnn::Net yolov8;

    yolov8.opt.use_vulkan_compute = true;
    // yolov8.opt.use_bf16_storage = true;

    // https://github.com/nihui/ncnn-android-yolov8/tree/master/app/src/main/assets
    if (yolov8.load_param("yolov8n_seg.ncnn.param") != 0)
    {
        fprintf(stderr, "load_param yolov8n_seg.ncnn.param failed\n");
        return -1;
    }
    if (yolov8.load_model("yolov8n_seg.ncnn.bin") != 0)
    {
        fprintf(stderr, "load_model yolov8n_seg.ncnn.bin failed\n");
        return -1;
    }
    // yolov8.load_param("yolov8s_seg.ncnn.param");
    // yolov8.load_model("yolov8s_seg.ncnn.bin");
    // yolov8.load_param("yolov8m_seg.ncnn.param");
    // yolov8.load_model("yolov8m_seg.ncnn.bin");

    const int target_size = 640;
    const float prob_threshold = 0.25f;
    const float nms_threshold = 0.45f;
    const float mask_threshold = 0.5f;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // ultralytics/cfg/models/v8/yolov8.yaml
    std::vector<int> strides(3);
    strides[0] = 8;
    strides[1] = 16;
    strides[2] = 32;
    const int max_stride = 32;

    // scale the longer side to target_size, keeping the aspect ratio
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // letterbox pad both sides up to the next multiple of max_stride
    int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
    int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yolov8.create_extractor();

    ex.input("in0", in_pad);

    ncnn::Mat out;
    if (ex.extract("out0", out) != 0)
    {
        fprintf(stderr, "extract out0 failed\n");
        return -1;
    }

    std::vector<Object> proposals;
    generate_proposals(out, strides, in_pad, prob_threshold, proposals);

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    int count = picked.size();
    if (count == 0)
        return 0;

    // per-box mask coefficients, one row per grid cell
    ncnn::Mat mask_feat;
    if (ex.extract("out1", mask_feat) != 0)
    {
        fprintf(stderr, "extract out1 failed\n");
        return -1;
    }

    // mask prototypes shared by all boxes
    ncnn::Mat mask_protos;
    if (ex.extract("out2", mask_protos) != 0)
    {
        fprintf(stderr, "extract out2 failed\n");
        return -1;
    }

    ncnn::Mat objects_mask_feat(mask_feat.w, 1, count);

    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];

        // adjust offset to original unpadded
        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;

        // clip
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);

        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;

        // pick mask feat
        memcpy(objects_mask_feat.channel(i), mask_feat.row(objects[i].gindex), mask_feat.w * sizeof(float));
    }

    // process mask: coefficients (count x K) times prototypes (K x W*H)
    ncnn::Mat objects_mask;
    {
        ncnn::Layer* gemm = ncnn::create_layer("Gemm");

        ncnn::ParamDict pd;
        pd.set(6, 1);                             // constantC
        pd.set(7, count);                         // constantM
        pd.set(8, mask_protos.w * mask_protos.h); // constantN
        pd.set(9, mask_feat.w);                   // constantK
        pd.set(10, -1);                           // constant_broadcast_type_C
        pd.set(11, 1);                            // output_N1M
        gemm->load_param(pd);

        ncnn::Option opt;
        opt.num_threads = 1;
        opt.use_packing_layout = false;

        gemm->create_pipeline(opt);

        std::vector<ncnn::Mat> gemm_inputs(2);
        gemm_inputs[0] = objects_mask_feat;
        gemm_inputs[1] = mask_protos.reshape(mask_protos.w * mask_protos.h, 1, mask_protos.c);
        std::vector<ncnn::Mat> gemm_outputs(1);
        gemm->forward(gemm_inputs, gemm_outputs, opt);
        objects_mask = gemm_outputs[0].reshape(mask_protos.w, mask_protos.h, count);

        gemm->destroy_pipeline(opt);

        delete gemm;
    }
    {
        ncnn::Layer* sigmoid_layer = ncnn::create_layer("Sigmoid");

        ncnn::Option opt;
        opt.num_threads = 1;
        opt.use_packing_layout = false;

        sigmoid_layer->create_pipeline(opt);

        sigmoid_layer->forward_inplace(objects_mask, opt);

        sigmoid_layer->destroy_pipeline(opt);

        delete sigmoid_layer;
    }

    // resize mask map to the padded input expressed in source-image scale
    {
        ncnn::Mat objects_mask_resized;
        ncnn::resize_bilinear(objects_mask, objects_mask_resized, in_pad.w / scale, in_pad.h / scale);
        objects_mask = objects_mask_resized;
    }

    // create per-object mask
    for (int i = 0; i < count; i++)
    {
        Object& obj = objects[i];

        const ncnn::Mat mm = objects_mask.channel(i);

        obj.mask = cv::Mat((int)obj.rect.height, (int)obj.rect.width, CV_8UC1);

        // first mask column covered by the box
        const int mx0 = (int)(wpad / 2 / scale + obj.rect.x);

        // adjust offset to original unpadded and clip inside object box
        for (int y = 0; y < (int)obj.rect.height; y++)
        {
            // Boxes are clipped to the source image, but the resized mask is
            // derived from the integer-truncated letterbox size and can be a
            // few pixels smaller, so clamp instead of reading past its edge.
            const int my = std::min((int)(hpad / 2 / scale + obj.rect.y + y), mm.h - 1);

            const float* pmm = mm.row(my);
            uchar* pmask = obj.mask.ptr<uchar>(y);
            for (int x = 0; x < (int)obj.rect.width; x++)
            {
                const int mx = std::min(mx0 + x, mm.w - 1);
                pmask[x] = pmm[mx] > mask_threshold ? 1 : 0;
            }
        }
    }

    return 0;
}

// Draw masks, boxes and "<class> <score>%" captions on a copy of the image,
// log every detection to stderr and show the result in a window until a key
// is pressed.
//
// bgr      image the detections refer to; it is not modified
// objects  detections with rect in image pixels and a mask of
//          (int)rect.height x (int)rect.width
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };
    const int num_class_names = sizeof(class_names) / sizeof(class_names[0]);

    static cv::Scalar colors[] = {
        cv::Scalar(244, 67, 54),
        cv::Scalar(233, 30, 99),
        cv::Scalar(156, 39, 176),
        cv::Scalar(103, 58, 183),
        cv::Scalar(63, 81, 181),
        cv::Scalar(33, 150, 243),
        cv::Scalar(3, 169, 244),
        cv::Scalar(0, 188, 212),
        cv::Scalar(0, 150, 136),
        cv::Scalar(76, 175, 80),
        cv::Scalar(139, 195, 74),
        cv::Scalar(205, 220, 57),
        cv::Scalar(255, 235, 59),
        cv::Scalar(255, 193, 7),
        cv::Scalar(255, 152, 0),
        cv::Scalar(255, 87, 34),
        cv::Scalar(121, 85, 72),
        cv::Scalar(158, 158, 158),
        cv::Scalar(96, 125, 139)
    };
    const size_t num_colors = sizeof(colors) / sizeof(colors[0]);

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        // cycle through the palette, one colour per detection
        const cv::Scalar& color = colors[i % num_colors];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        // blend the colour 50/50 into every pixel the mask marks as foreground
        for (int y = 0; y < (int)obj.rect.height; y++)
        {
            const uchar* maskptr = obj.mask.ptr<const uchar>(y);
            uchar* bgrptr = image.ptr<uchar>((int)obj.rect.y + y) + (int)obj.rect.x * 3;
            for (int x = 0; x < (int)obj.rect.width; x++)
            {
                if (maskptr[x])
                {
                    bgrptr[0] = bgrptr[0] * 0.5 + color[0] * 0.5;
                    bgrptr[1] = bgrptr[1] * 0.5 + color[1] * 0.5;
                    bgrptr[2] = bgrptr[2] * 0.5 + color[2] * 0.5;
                }
                bgrptr += 3;
            }
        }

        cv::rectangle(image, obj.rect, color);

        // The class count comes from the model, not from the table above, so
        // a label may fall outside it; fall back to the numeric id then.
        char text[256];
        if (obj.label >= 0 && obj.label < num_class_names)
            sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
        else
            sprintf(text, "class %d %.1f%%", obj.label, obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // caption goes above the box, pushed back inside the image if needed
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

// Usage: <program> imagepath
// Loads the image, runs YOLOv8-seg on it and displays the annotated result.
// Returns 0 on success and -1 on bad arguments, an unreadable image or a
// failed detection.
int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    // do not display anything when the detector reports a failure
    if (detect_yolov8_seg(m, objects) != 0)
    {
        fprintf(stderr, "detect_yolov8_seg failed\n");
        return -1;
    }

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/yoloworld.cpp
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 1. install
//      pip3 install -U ultralytics pnnx ncnn
// 2. export yoloworld torchscript
//      yolo export model=yolov8s-world.pt format=torchscript
//      yolo export model=yolov8m-world.pt format=torchscript
//      yolo export model=yolov8l-world.pt format=torchscript
//      yolo export model=yolov8x-world.pt format=torchscript
//      yolo export model=yolov8s-worldv2.pt format=torchscript
//      yolo export model=yolov8m-worldv2.pt format=torchscript
//      yolo export model=yolov8l-worldv2.pt format=torchscript
//      yolo export model=yolov8x-worldv2.pt format=torchscript
// 3. convert torchscript with static shape
//      pnnx yolov8s-world.torchscript
//      pnnx yolov8m-world.torchscript
//      pnnx yolov8l-world.torchscript
//      pnnx yolov8x-world.torchscript
//      pnnx yolov8s-worldv2.torchscript
//      pnnx yolov8m-worldv2.torchscript
//      pnnx yolov8l-worldv2.torchscript
//      pnnx yolov8x-worldv2.torchscript

// the out blob would be a 2-dim tensor with w=8400 h=84
//
//        |    all boxes (8400)     |
//        +-------------------------+
//        | center-x   .            |
//  bbox  | center-y   .            |
//        |   w        .            |
//        |   h        .            |
//        +-------------------------+
//        | 0.1        .            |
//   per  | 0.0        .            |
//  class | 0.5        .            |
// scores |  .         .            |
//  (80)  |  .         .            |
//        +-------------------------+

#include "layer.h"
#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>

// One detected object: bounding box, class and confidence.
struct Object
{
    // bounding box; generate_proposals() fills it in network-input pixels,
    // detect_yoloworld() then rescales and clips it to the source image
    cv::Rect_<float> rect;
    // index of the best-scoring class for this box
    int label;
    // best class score, taken from the output blob as is
    float prob;
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Recursive quicksort of objects[left..right] (both ends inclusive) by
// descending prob, partitioning around the score of the middle element.
static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
{
    int head = left;
    int tail = right;
    const float pivot = objects[(left + right) / 2].prob;

    while (head <= tail)
    {
        // advance past entries already on the correct side of the pivot
        while (objects[head].prob > pivot)
            ++head;

        while (objects[tail].prob < pivot)
            --tail;

        if (head > tail)
            break;

        std::swap(objects[head], objects[tail]);
        ++head;
        --tail;
    }

    // sort the two partitions, left one first
    if (left < tail)
        qsort_descent_inplace(objects, left, tail);

    if (head < right)
        qsort_descent_inplace(objects, head, right);
}

// Sort the whole vector by descending prob; nothing to do for an empty one.
static void qsort_descent_inplace(std::vector<Object>& objects)
{
    if (!objects.empty())
    {
        qsort_descent_inplace(objects, 0, objects.size() - 1);
    }
}

static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = objects.size();

    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = objects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = objects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = objects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
                keep = 0;
        }

        if (keep)
            picked.push_back(i);
    }
}

// Turn the raw output blob into candidate boxes.
//
// pred            2-dim blob with one column per box; rows 0..3 hold
//                 center-x, center-y, width, height and the remaining rows
//                 hold one score per class
// prob_threshold  minimum best-class score for a box to be kept
// objects         output, candidates are appended in column order
static void generate_proposals(const ncnn::Mat& pred, float prob_threshold, std::vector<Object>& objects)
{
    const int num_boxes = pred.w;
    const int num_class = pred.h - 4;

    const ncnn::Mat pred_bbox = pred.row_range(0, 4);
    const ncnn::Mat pred_score = pred.row_range(4, num_class);

    for (int i = 0; i < num_boxes; i++)
    {
        // best class for column i
        int best_label = 0;
        float best_score = -9999.f;
        for (int j = 0; j < num_class; j++)
        {
            const float s = pred_score.row(j)[i];
            if (s > best_score)
            {
                best_label = j;
                best_score = s;
            }
        }

        if (!(best_score >= prob_threshold))
            continue;

        const float box_w = pred_bbox.row(2)[i];
        const float box_h = pred_bbox.row(3)[i];

        // the blob stores the box centre, Object stores the top-left corner
        Object candidate;
        candidate.rect.x = pred_bbox.row(0)[i] - box_w / 2;
        candidate.rect.y = pred_bbox.row(1)[i] - box_h / 2;
        candidate.rect.width = box_w;
        candidate.rect.height = box_h;
        candidate.label = best_label;
        candidate.prob = best_score;

        objects.push_back(candidate);
    }
}

// Run YOLO-World on an image and return the detections that survive NMS.
//
// bgr      source image, read as packed BGR pixels
// objects  output; resized to the number of detections, boxes are in
//          source-image pixels
// returns  0 on success (also when nothing is detected), -1 when the model
//          files cannot be loaded or the output cannot be extracted
static int detect_yoloworld(const cv::Mat& bgr, std::vector<Object>& objects)
{
    // never hand back stale results on an error path
    objects.clear();

    ncnn::Net yoloworld;

    yoloworld.opt.use_vulkan_compute = true;
    // yoloworld.opt.use_bf16_storage = true;

    // https://github.com/nihui/ncnn-assets/tree/master/models
    // yoloworld.load_param("yolov8s_world.ncnn.param");
    // yoloworld.load_model("yolov8s_world.ncnn.bin");
    // yoloworld.load_param("yolov8m_world.ncnn.param");
    // yoloworld.load_model("yolov8m_world.ncnn.bin");
    // yoloworld.load_param("yolov8l_world.ncnn.param");
    // yoloworld.load_model("yolov8l_world.ncnn.bin");
    // yoloworld.load_param("yolov8x_world.ncnn.param");
    // yoloworld.load_model("yolov8x_world.ncnn.bin");
    if (yoloworld.load_param("yolov8s_worldv2.ncnn.param") != 0)
    {
        fprintf(stderr, "load_param yolov8s_worldv2.ncnn.param failed\n");
        return -1;
    }
    if (yoloworld.load_model("yolov8s_worldv2.ncnn.bin") != 0)
    {
        fprintf(stderr, "load_model yolov8s_worldv2.ncnn.bin failed\n");
        return -1;
    }
    // yoloworld.load_param("yolov8m_worldv2.ncnn.param");
    // yoloworld.load_model("yolov8m_worldv2.ncnn.bin");
    // yoloworld.load_param("yolov8l_worldv2.ncnn.param");
    // yoloworld.load_model("yolov8l_worldv2.ncnn.bin");
    // yoloworld.load_param("yolov8x_worldv2.ncnn.param");
    // yoloworld.load_model("yolov8x_worldv2.ncnn.bin");

    const int target_size = 640;
    const float prob_threshold = 0.25f;
    const float nms_threshold = 0.45f;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // scale the longer side to target_size, keeping the aspect ratio
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // letterbox pad to target_size rectangle
    int wpad = target_size - w;
    int hpad = target_size - h;
    ncnn::Mat in_pad;
    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);

    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
    in_pad.substract_mean_normalize(0, norm_vals);

    ncnn::Extractor ex = yoloworld.create_extractor();

    ex.input("in0", in_pad);

    ncnn::Mat out;
    if (ex.extract("out0", out) != 0)
    {
        fprintf(stderr, "extract out0 failed\n");
        return -1;
    }

    std::vector<Object> proposals;
    generate_proposals(out, prob_threshold, proposals);

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    int count = picked.size();

    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];

        // adjust offset to original unpadded
        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;

        // clip
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);

        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;
    }

    return 0;
}

// Draw boxes and "<class> <score>%" captions on a copy of the image, log
// every detection to stderr and show the result in a window until a key is
// pressed.
//
// bgr      image the detections refer to; it is not modified
// objects  detections with rect in image pixels
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };
    const int num_class_names = sizeof(class_names) / sizeof(class_names[0]);

    static cv::Scalar colors[] = {
        cv::Scalar(244, 67, 54),
        cv::Scalar(233, 30, 99),
        cv::Scalar(156, 39, 176),
        cv::Scalar(103, 58, 183),
        cv::Scalar(63, 81, 181),
        cv::Scalar(33, 150, 243),
        cv::Scalar(3, 169, 244),
        cv::Scalar(0, 188, 212),
        cv::Scalar(0, 150, 136),
        cv::Scalar(76, 175, 80),
        cv::Scalar(139, 195, 74),
        cv::Scalar(205, 220, 57),
        cv::Scalar(255, 235, 59),
        cv::Scalar(255, 193, 7),
        cv::Scalar(255, 152, 0),
        cv::Scalar(255, 87, 34),
        cv::Scalar(121, 85, 72),
        cv::Scalar(158, 158, 158),
        cv::Scalar(96, 125, 139)
    };
    const size_t num_colors = sizeof(colors) / sizeof(colors[0]);

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        // cycle through the palette, one colour per detection
        const cv::Scalar& color = colors[i % num_colors];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, color);

        // The class count comes from the model, not from the table above, so
        // a label may fall outside it; fall back to the numeric id then.
        char text[256];
        if (obj.label >= 0 && obj.label < num_class_names)
            sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
        else
            sprintf(text, "class %d %.1f%%", obj.label, obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // caption goes above the box, pushed back inside the image if needed
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

// Usage: <program> imagepath
// Loads the image, runs YOLO-World on it and displays the annotated result.
// Returns 0 on success and -1 on bad arguments, an unreadable image or a
// failed detection.
int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    // do not display anything when the detector reports a failure
    if (detect_yoloworld(m, objects) != 0)
    {
        fprintf(stderr, "detect_yoloworld failed\n");
        return -1;
    }

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: examples/yolox.cpp
================================================
// Copyright 2020 Tencent
// Copyright 2020-2021 Megvii Inc.
// SPDX-License-Identifier: BSD-3-Clause

#include "layer.h"
#include "net.h"

#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>

#define YOLOX_NMS_THRESH  0.45 // nms threshold
#define YOLOX_CONF_THRESH 0.25 // threshold of bounding box prob
#define YOLOX_TARGET_SIZE 640  // target image size after resize, might use 416 for small model

// YOLOX uses the same Focus layer as yolov5
// Custom layer that halves width and height and quadruples the channel count
// by sampling every second pixel. Output channel p reads input channel
// p % channels, starting at row (p / channels) % 2 and column
// (p / channels) / 2.
class YoloV5Focus : public ncnn::Layer
{
public:
    YoloV5Focus()
    {
        one_blob_only = true;
    }

    // Returns 0 on success, -100 when the output blob cannot be allocated.
    virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int channels = bottom_blob.c;

        int outw = w / 2;
        int outh = h / 2;
        int outc = channels * 4;

        top_blob.create(outw, outh, outc, 4u, 1, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // After outw samples the source pointer has moved 2 * outw elements
        // along its row. Skip what is left of that row plus one whole row to
        // land on the same column two rows further down. This equals w for
        // even widths and w + 1 for odd ones, where a plain "+= w" would
        // drift by one element per output row.
        const int row_advance = (w - outw * 2) + w;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < outc; p++)
        {
            const float* ptr = bottom_blob.channel(p % channels).row((p / channels) % 2) + ((p / channels) / 2);
            float* outptr = top_blob.channel(p);

            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    *outptr = *ptr;

                    outptr += 1;
                    ptr += 2;
                }

                ptr += row_advance;
            }
        }

        return 0;
    }
};

DEFINE_LAYER_CREATOR(YoloV5Focus)

// One detected object: bounding box, class and confidence.
struct Object
{
    // bounding box; its area and intersections drive nms_sorted_bboxes()
    cv::Rect_<float> rect;
    // class index; compared by nms_sorted_bboxes() unless it runs class-agnostic
    int label;
    // confidence score; sort key of qsort_descent_inplace()
    float prob;
};

// One entry of the list filled by generate_grids_and_stride().
// NOTE(review): field meanings below are inferred from the names only, the
// code that writes and reads them is outside this view - confirm.
struct GridAndStride
{
    // presumably the grid cell index along x
    int grid0;
    // presumably the grid cell index along y
    int grid1;
    // presumably the pixel stride of the level this cell belongs to
    int stride;
};

static inline float intersection_area(const Object& a, const Object& b)
{
    cv::Rect_<float> inter = a.rect & b.rect;
    return inter.area();
}

// Quicksort of faceobjects[left..right] (both inclusive) by descending prob.
static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
{
    int lo = left;
    int hi = right;
    // pivot is the score of the middle element
    const float pivot = faceobjects[(left + right) / 2].prob;

    while (lo <= hi)
    {
        // skip elements already on the correct side of the pivot
        while (faceobjects[lo].prob > pivot)
            lo++;

        while (faceobjects[hi].prob < pivot)
            hi--;

        if (lo > hi)
            break;

        std::swap(faceobjects[lo], faceobjects[hi]);
        lo++;
        hi--;
    }

    // the two remaining partitions do not overlap and are sorted recursively
    #pragma omp parallel sections
    {
        #pragma omp section
        {
            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
        }
        #pragma omp section
        {
            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
        }
    }
}

// Sorts all objects by descending prob; an empty vector is left untouched.
static void qsort_descent_inplace(std::vector<Object>& objects)
{
    if (!objects.empty())
        qsort_descent_inplace(objects, 0, objects.size() - 1);
}

// Non-maximum suppression over boxes that are already sorted by descending score.
//
// faceobjects    candidate boxes, best score first
// picked         output: indices into faceobjects of the boxes that are kept (cleared first)
// nms_threshold  a box is dropped when its IoU with an already kept box exceeds this value
// agnostic       when false, only boxes with the same label can suppress each other
static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
    picked.clear();

    const int n = faceobjects.size();

    // box areas are reused for every pair, so compute them once
    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
    {
        areas[i] = faceobjects[i].rect.area();
    }

    for (int i = 0; i < n; i++)
    {
        const Object& a = faceobjects[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const Object& b = faceobjects[picked[j]];

            if (!agnostic && a.label != b.label)
                continue;

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            // float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
            {
                keep = 0;
                // one overlapping kept box is enough; the remaining comparisons cannot change the outcome
                break;
            }
        }

        if (keep)
            picked.push_back(i);
    }
}

static void generate_grids_and_stride(const int target_w, const int target_h, std::vector<int>& strides, std::vector<GridAndStride>& grid_strides)
{
    for (int i = 0; i < (int)strides.size(); i++)
    {
        int stride = strides[i];
        int num_grid_w = target_w / stride;
        int num_grid_h = target_h / stride;
        for (int g1 = 0; g1 < num_grid_h; g1++)
        {
            for (int g0 = 0; g0 < num_grid_w; g0++)
            {
                GridAndStride gs;
                gs.grid0 = g0;
                gs.grid1 = g1;
                gs.stride = stride;
                grid_strides.push_back(gs);
            }
        }
    }
}

// Decodes the raw YOLOX head output into candidate boxes.
//
// grid_strides    one entry per anchor point, in the same order as the rows of feat_blob
// feat_blob       one row per anchor: [x, y, w, h, objectness, class scores...]
// prob_threshold  a (box, class) pair is kept when objectness * class score exceeds this
// objects         output: decoded proposals are appended
//
// Only as many anchors as feat_blob has rows are decoded, so a grid list that is
// longer than the network output cannot cause reads past the end of the blob.
static void generate_yolox_proposals(const std::vector<GridAndStride>& grid_strides, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
{
    // every row needs at least the 4 box values and the objectness
    if (feat_blob.w < 5)
        return;

    const int num_grid = feat_blob.h;
    const int num_class = feat_blob.w - 5;
    const int num_grid_strides = (int)grid_strides.size();
    const int num_anchors = num_grid_strides < num_grid ? num_grid_strides : num_grid;

    const float* feat_ptr = feat_blob.channel(0);
    for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
    {
        const int grid0 = grid_strides[anchor_idx].grid0;
        const int grid1 = grid_strides[anchor_idx].grid1;
        const int stride = grid_strides[anchor_idx].stride;

        // yolox/models/yolo_head.py decode logic
        //  outputs[..., :2] = (outputs[..., :2] + grids) * strides
        //  outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
        float x_center = (feat_ptr[0] + grid0) * stride;
        float y_center = (feat_ptr[1] + grid1) * stride;
        float w = exp(feat_ptr[2]) * stride;
        float h = exp(feat_ptr[3]) * stride;
        // convert from center to top-left corner
        float x0 = x_center - w * 0.5f;
        float y0 = y_center - h * 0.5f;

        float box_objectness = feat_ptr[4];
        for (int class_idx = 0; class_idx < num_class; class_idx++)
        {
            float box_cls_score = feat_ptr[5 + class_idx];
            float box_prob = box_objectness * box_cls_score;
            if (box_prob > prob_threshold)
            {
                Object obj;
                obj.rect.x = x0;
                obj.rect.y = y0;
                obj.rect.width = w;
                obj.rect.height = h;
                obj.label = class_idx;
                obj.prob = box_prob;

                objects.push_back(obj);
            }

        } // class loop

        // advance to the row of the next anchor
        feat_ptr += feat_blob.w;

    } // point anchor loop
}

// Runs YOLOX on a BGR image and fills `objects` with the detections that
// survive NMS, expressed in the coordinates of the original image.
// Loads "yolox.param" / "yolox.bin" from the working directory on every call
// and terminates the process with exit(-1) when either load fails.
// Always returns 0 otherwise.
static int detect_yolox(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net yolox;

    yolox.opt.use_vulkan_compute = true;
    // yolox.opt.use_bf16_storage = true;

    // Focus in yolov5
    yolox.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator);

    // original pretrained model from https://github.com/Megvii-BaseDetection/YOLOX
    // ncnn model param: https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_s_ncnn.tar.gz
    // NOTE that newest version YOLOX remove normalization of model (minus mean and then div by std),
    // which might cause your model outputs becoming a total mess, plz check carefully.
    if (yolox.load_param("yolox.param"))
        exit(-1);
    if (yolox.load_model("yolox.bin"))
        exit(-1);

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // scale the longer side to YOLOX_TARGET_SIZE and the shorter side by the
    // same factor (the float product is truncated when stored back into the int)
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)YOLOX_TARGET_SIZE / w;
        w = YOLOX_TARGET_SIZE;
        h = h * scale;
    }
    else
    {
        scale = (float)YOLOX_TARGET_SIZE / h;
        h = YOLOX_TARGET_SIZE;
        w = w * scale;
    }
    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, w, h);

    // pad width and height up to the next multiple of 32
    int wpad = (w + 31) / 32 * 32 - w;
    int hpad = (h + 31) / 32 * 32 - h;
    ncnn::Mat in_pad;
    // different from yolov5, yolox only pad on bottom and right side,
    // which means users don't need to extra padding info to decode boxes coordinate.
    ncnn::copy_make_border(in, in_pad, 0, hpad, 0, wpad, ncnn::BORDER_CONSTANT, 114.f);

    ncnn::Extractor ex = yolox.create_extractor();

    ex.input("images", in_pad);

    std::vector<Object> proposals;

    {
        ncnn::Mat out;
        // NOTE(review): the return value of extract() is not checked here
        ex.extract("output", out);

        static const int stride_arr[] = {8, 16, 32}; // might have stride=64 in YOLOX
        std::vector<int> strides(stride_arr, stride_arr + sizeof(stride_arr) / sizeof(stride_arr[0]));
        std::vector<GridAndStride> grid_strides;
        // the grid is derived from the padded input size, not from the original image
        generate_grids_and_stride(in_pad.w, in_pad.h, strides, grid_strides);
        generate_yolox_proposals(grid_strides, out, YOLOX_CONF_THRESH, proposals);
    }

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, YOLOX_NMS_THRESH);

    int count = picked.size();

    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];

        // map back to the original image: padding is only on the bottom/right,
        // so dividing by the resize scale is sufficient
        float x0 = (objects[i].rect.x) / scale;
        float y0 = (objects[i].rect.y) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;

        // clip both corners to the image bounds
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);

        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;
    }

    return 0;
}

// Prints every detection to stderr, draws its box and a "<class> <score>%" label
// on a copy of the image, then shows the result in a window and waits for a key.
//
// Labels outside the built-in 80-entry class table (possible when the loaded
// model was trained with a different class count) are shown as "unknown"
// instead of indexing past the end of the table.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };
    static const int num_class_names = sizeof(class_names) / sizeof(class_names[0]);

    // draw on a copy so the caller's image stays unmodified
    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));

        const char* class_name = "unknown";
        if (obj.label >= 0 && obj.label < num_class_names)
            class_name = class_names[obj.label];

        char text[256];
        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // place the label above the box, shifted to stay inside the top and right edges
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        // filled white background behind the text
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}

int main(int argc, char** argv)
{
    if (argc != 2)
    {
        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
        return -1;
    }

    const char* imagepath = argv[1];

    cv::Mat m = cv::imread(imagepath, 1);
    if (m.empty())
    {
        fprintf(stderr, "cv::imread %s failed\n", imagepath);
        return -1;
    }

    std::vector<Object> objects;
    detect_yolox(m, objects);

    draw_objects(m, objects);

    return 0;
}


================================================
FILE: package.sh
================================================
#!/usr/bin/env bash
# bash is looked up through PATH: the iOS steps below need macOS tools
# (lipo, libtool -static), and macOS ships bash as /bin/bash, not /usr/bin/bash.

# Base name shared by the packages produced below.
NAME=ncnn

##### package android lib
# Gather the static libraries of the four Android ABIs plus the headers of the
# aarch64 install tree into ${NAME}-android-lib and zip the result.
ANDROIDPKGNAME="${NAME}-android-lib"
rm -rf "${ANDROIDPKGNAME}"
mkdir -p \
    "${ANDROIDPKGNAME}" \
    "${ANDROIDPKGNAME}/armeabi-v7a" \
    "${ANDROIDPKGNAME}/arm64-v8a" \
    "${ANDROIDPKGNAME}/x86" \
    "${ANDROIDPKGNAME}/x86_64" \
    "${ANDROIDPKGNAME}/include"
cp build-android-armv7/install/lib/lib*.a "${ANDROIDPKGNAME}/armeabi-v7a/"
cp build-android-aarch64/install/lib/lib*.a "${ANDROIDPKGNAME}/arm64-v8a/"
cp build-android-x86/install/lib/lib*.a "${ANDROIDPKGNAME}/x86/"
cp build-android-x86_64/install/lib/lib*.a "${ANDROIDPKGNAME}/x86_64/"
cp -r build-android-aarch64/install/include/* "${ANDROIDPKGNAME}/include/"
rm -f "${ANDROIDPKGNAME}.zip"
zip -9 -r "${ANDROIDPKGNAME}.zip" "${ANDROIDPKGNAME}"

##### package ios framework
# Build a versioned ${NAME}.framework bundle: device and simulator static
# libraries are merged with lipo, headers and Info.plist are copied in, and the
# bundle is zipped with -y so the symlinks are stored as links.
IOSPKGNAME="${NAME}.framework"
rm -rf "${IOSPKGNAME}"
mkdir -p "${IOSPKGNAME}/Versions/A/Headers" "${IOSPKGNAME}/Versions/A/Resources"
ln -s A "${IOSPKGNAME}/Versions/Current"
ln -s Versions/Current/Headers "${IOSPKGNAME}/Headers"
ln -s Versions/Current/Resources "${IOSPKGNAME}/Resources"
ln -s "Versions/Current/${NAME}" "${IOSPKGNAME}/${NAME}"
lipo -create \
    "build-ios/install/lib/lib${NAME}.a" \
    "build-ios-sim/install/lib/lib${NAME}.a" \
    -o "${IOSPKGNAME}/Versions/A/${NAME}"
cp -r build-ios/install/include/* "${IOSPKGNAME}/Versions/A/Headers/"
cp Info.plist "${IOSPKGNAME}/Versions/A/Resources/"
rm -f "${IOSPKGNAME}.zip"
zip -9 -y -r "${IOSPKGNAME}.zip" "${IOSPKGNAME}"

##### package ios framework bitcode
# Same bundle layout as the plain iOS framework, built from the *-bitcode
# install trees and zipped as ${NAME}.framework-bitcode.zip.
# NOTE(review): this copies include/ncnn as a directory whereas the plain iOS
# section copies include/*; confirm which header layout is intended.
IOSPKGNAME="${NAME}.framework"
rm -rf "${IOSPKGNAME}"
mkdir -p "${IOSPKGNAME}/Versions/A/Headers" "${IOSPKGNAME}/Versions/A/Resources"
ln -s A "${IOSPKGNAME}/Versions/Current"
ln -s Versions/Current/Headers "${IOSPKGNAME}/Headers"
ln -s Versions/Current/Resources "${IOSPKGNAME}/Resources"
ln -s "Versions/Current/${NAME}" "${IOSPKGNAME}/${NAME}"
lipo -create \
    "build-ios-bitcode/install/lib/lib${NAME}.a" \
    "build-ios-sim-bitcode/install/lib/lib${NAME}.a" \
    -o "${IOSPKGNAME}/Versions/A/${NAME}"
cp -r build-ios-bitcode/install/include/ncnn "${IOSPKGNAME}/Versions/A/Headers/"
cp Info.plist "${IOSPKGNAME}/Versions/A/Resources/"
rm -f "${IOSPKGNAME}-bitcode.zip"
zip -9 -y -r "${IOSPKGNAME}-bitcode.zip" "${IOSPKGNAME}"


##### package android lib vulkan
# Vulkan-enabled counterpart of the Android package: same layout, sourced from
# the build-android-*-vulkan install trees.
ANDROIDPKGNAME="${NAME}-android-vulkan-lib"
rm -rf "${ANDROIDPKGNAME}"
mkdir -p \
    "${ANDROIDPKGNAME}" \
    "${ANDROIDPKGNAME}/armeabi-v7a" \
    "${ANDROIDPKGNAME}/arm64-v8a" \
    "${ANDROIDPKGNAME}/x86" \
    "${ANDROIDPKGNAME}/x86_64" \
    "${ANDROIDPKGNAME}/include"
cp build-android-armv7-vulkan/install/lib/lib*.a "${ANDROIDPKGNAME}/armeabi-v7a/"
cp build-android-aarch64-vulkan/install/lib/lib*.a "${ANDROIDPKGNAME}/arm64-v8a/"
cp build-android-x86-vulkan/install/lib/lib*.a "${ANDROIDPKGNAME}/x86/"
cp build-android-x86_64-vulkan/install/lib/lib*.a "${ANDROIDPKGNAME}/x86_64/"
cp -r build-android-aarch64-vulkan/install/include/* "${ANDROIDPKGNAME}/include/"
rm -f "${ANDROIDPKGNAME}.zip"
zip -9 -r "${ANDROIDPKGNAME}.zip" "${ANDROIDPKGNAME}"

##### package ios framework vulkan
# Vulkan-enabled iOS framework, built from build-ios-vulkan and
# build-ios-sim-vulkan and zipped as ${NAME}.framework-vulkan.zip.
IOSPKGNAME="${NAME}.framework"
rm -rf "${IOSPKGNAME}"
mkdir -p "${IOSPKGNAME}/Versions/A/Headers" "${IOSPKGNAME}/Versions/A/Resources"
ln -s A "${IOSPKGNAME}/Versions/Current"
ln -s Versions/Current/Headers "${IOSPKGNAME}/Headers"
ln -s Versions/Current/Resources "${IOSPKGNAME}/Resources"
ln -s "Versions/Current/${NAME}" "${IOSPKGNAME}/${NAME}"
lipo -create \
    "build-ios-vulkan/install/lib/lib${NAME}.a" \
    "build-ios-sim-vulkan/install/lib/lib${NAME}.a" \
    -o "${IOSPKGNAME}/Versions/A/${NAME}"
cp -r build-ios-vulkan/install/include/ncnn "${IOSPKGNAME}/Versions/A/Headers/"
cp Info.plist "${IOSPKGNAME}/Versions/A/Resources/"
rm -f "${IOSPKGNAME}-vulkan.zip"
zip -9 -y -r "${IOSPKGNAME}-vulkan.zip" "${IOSPKGNAME}"

##### package ios framework vulkan bitcode
# Vulkan + bitcode iOS framework, built from the *-vulkan-bitcode install trees
# and zipped as ${NAME}.framework-vulkan-bitcode.zip.
IOSPKGNAME="${NAME}.framework"
rm -rf "${IOSPKGNAME}"
mkdir -p "${IOSPKGNAME}/Versions/A/Headers" "${IOSPKGNAME}/Versions/A/Resources"
ln -s A "${IOSPKGNAME}/Versions/Current"
ln -s Versions/Current/Headers "${IOSPKGNAME}/Headers"
ln -s Versions/Current/Resources "${IOSPKGNAME}/Resources"
ln -s "Versions/Current/${NAME}" "${IOSPKGNAME}/${NAME}"
lipo -create \
    "build-ios-vulkan-bitcode/install/lib/lib${NAME}.a" \
    "build-ios-sim-vulkan-bitcode/install/lib/lib${NAME}.a" \
    -o "${IOSPKGNAME}/Versions/A/${NAME}"
cp -r build-ios-vulkan-bitcode/install/include/ncnn "${IOSPKGNAME}/Versions/A/Headers/"
cp Info.plist "${IOSPKGNAME}/Versions/A/Resources/"
rm -f "${IOSPKGNAME}-vulkan-bitcode.zip"
zip -9 -y -r "${IOSPKGNAME}-vulkan-bitcode.zip" "${IOSPKGNAME}"


##### package ios framework glslang
# Package glslang as its own framework. The four glslang static libraries of
# each install tree are first combined into libglslang_combined.a with libtool,
# then the device and simulator archives are merged with lipo.
IOSPKGNAME="glslang.framework"
rm -rf "${IOSPKGNAME}"
mkdir -p "${IOSPKGNAME}/Versions/A/Headers" "${IOSPKGNAME}/Versions/A/Resources"
ln -s A "${IOSPKGNAME}/Versions/Current"
ln -s Versions/Current/Headers "${IOSPKGNAME}/Headers"
ln -s Versions/Current/Resources "${IOSPKGNAME}/Resources"
ln -s Versions/Current/glslang "${IOSPKGNAME}/glslang"
libtool -static \
    build-ios-vulkan/install/lib/libglslang.a \
    build-ios-vulkan/install/lib/libSPIRV.a \
    build-ios-vulkan/install/lib/libOGLCompiler.a \
    build-ios-vulkan/install/lib/libOSDependent.a \
    -o build-ios-vulkan/install/lib/libglslang_combined.a
libtool -static \
    build-ios-sim-vulkan/install/lib/libglslang.a \
    build-ios-sim-vulkan/install/lib/libSPIRV.a \
    build-ios-sim-vulkan/install/lib/libOGLCompiler.a \
    build-ios-sim-vulkan/install/lib/libOSDependent.a \
    -o build-ios-sim-vulkan/install/lib/libglslang_combined.a
lipo -create \
    build-ios-vulkan/install/lib/libglslang_combined.a \
    build-ios-sim-vulkan/install/lib/libglslang_combined.a \
    -o "${IOSPKGNAME}/Versions/A/glslang"
cp -r build-ios-vulkan/install/include/glslang "${IOSPKGNAME}/Versions/A/Headers/"
cp Info.plist "${IOSPKGNAME}/Versions/A/Resources/"
rm -f "${IOSPKGNAME}.zip"
zip -9 -y -r "${IOSPKGNAME}.zip" "${IOSPKGNAME}"

##### package ios framework glslang bitcode
# Bitcode variant of the glslang framework, built from the *-vulkan-bitcode
# install trees and zipped as glslang.framework-bitcode.zip.
IOSPKGNAME="glslang.framework"
rm -rf "${IOSPKGNAME}"
mkdir -p "${IOSPKGNAME}/Versions/A/Headers" "${IOSPKGNAME}/Versions/A/Resources"
ln -s A "${IOSPKGNAME}/Versions/Current"
ln -s Versions/Current/Headers "${IOSPKGNAME}/Headers"
ln -s Versions/Current/Resources "${IOSPKGNAME}/Resources"
ln -s Versions/Current/glslang "${IOSPKGNAME}/glslang"
libtool -static \
    build-ios-vulkan-bitcode/install/lib/libglslang.a \
    build-ios-vulkan-bitcode/install/lib/libSPIRV.a \
    build-ios-vulkan-bitcode/install/lib/libOGLCompiler.a \
    build-ios-vulkan-bitcode/install/lib/libOSDependent.a \
    -o build-ios-vulkan-bitcode/install/lib/libglslang_combined.a
libtool -static \
    build-ios-sim-vulkan-bitcode/install/lib/libglslang.a \
    build-ios-sim-vulkan-bitcode/install/lib/libSPIRV.a \
    build-ios-sim-vulkan-bitcode/install/lib/libOGLCompiler.a \
    build-ios-sim-vulkan-bitcode/install/lib/libOSDependent.a \
    -o build-ios-sim-vulkan-bitcode/install/lib/libglslang_combined.a
lipo -create \
    build-ios-vulkan-bitcode/install/lib/libglslang_combined.a \
    build-ios-sim-vulkan-bitcode/install/lib/libglslang_combined.a \
    -o "${IOSPKGNAME}/Versions/A/glslang"
cp -r build-ios-vulkan-bitcode/install/include/glslang "${IOSPKGNAME}/Versions/A/Headers/"
cp Info.plist "${IOSPKGNAME}/Versions/A/Resources/"
rm -f "${IOSPKGNAME}-bitcode.zip"
zip -9 -y -r "${IOSPKGNAME}-bitcode.zip" "${IOSPKGNAME}"


================================================
FILE: pyproject.toml
================================================
# Build configuration read by Python packaging front-ends.
[build-system]
# Packages that must be available before the build backend is invoked.
requires = [
    "setuptools>=42",
    "wheel",
    "importlib-metadata",
]
# The build is delegated to the setuptools backend.
build-backend = "setuptools.build_meta"


================================================
FILE: python/CMakeLists.txt
================================================
# Build script for the pybind11-based Python extension module of ncnn.
cmake_minimum_required(VERSION 3.4...3.10)

project(pyncnn)

# Expose the ncnn version string to the C++ sources as VERSION_INFO.
set(PACKAGE_VERSION ${NCNN_VERSION_STRING})
add_definitions(-DVERSION_INFO="${PACKAGE_VERSION}")

set( CMAKE_CXX_STANDARD 11 )
set( CMAKE_CXX_STANDARD_REQUIRED ON )

option(NCNN_SYSTEM_PYBIND11 "use system pybind11" OFF)

if(CMAKE_CXX_COMPILER_ARCHITECTURE_ID MATCHES "ARM64")
    option(PYBIND11_PYTHONLIBS_OVERWRITE "" OFF)

    set(PYTHON_PREFIX "$ENV{LOCALAPPDATA}/pypa/cibuildwheel/Cache/nuget-cpython/pythonarm64.$ENV{PYTHON_VERSION}/tools")
    # Warn only when the CIBUILDWHEEL environment variable is unset or empty.
    # if(DEFINED $ENV{...}) would test a CMake variable named after the *value*
    # of the environment variable, and if(DEFINED ENV{...}) needs CMake >= 3.14,
    # so the value is compared against the empty string instead.
    if("$ENV{CIBUILDWHEEL}" STREQUAL "")
        message(WARNING
            " This is hack for cibuildwheel on github action\n"
            " Use the right way to cross-compile python module for windows arm64 like follows\n"
            " set(PYTHON_PREFIX \"<your-pythonarm64-root-path>\")\n"
        )
    endif()
endif()

# Prefer the system pybind11 when requested, falling back to the bundled copy.
if(NCNN_SYSTEM_PYBIND11)
    find_package(pybind11)
    if(NOT pybind11_FOUND)
        message(WARNING "pybind11 package not found! NCNN_SYSTEM_PYBIND11 will be turned off.")
        set(NCNN_SYSTEM_PYBIND11 OFF)
    endif()
endif()

if(NOT NCNN_SYSTEM_PYBIND11)
    if(NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/pybind11/CMakeLists.txt")
        message(FATAL_ERROR "The submodules were not downloaded! Please update submodules with \"git submodule update --init\" and try again.")
    else()
        add_subdirectory(pybind11)
    endif()
endif()

# With MSVC or the Xcode generator, send Debug and Release libraries to the
# same ncnn/ directory unless the caller already chose an output directory.
if("${CMAKE_LIBRARY_OUTPUT_DIRECTORY}" STREQUAL "")
    if(MSVC OR CMAKE_GENERATOR STREQUAL "Xcode")
        set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${CMAKE_CURRENT_BINARY_DIR}/ncnn/)
        set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE ${CMAKE_CURRENT_BINARY_DIR}/ncnn/)
    endif(MSVC OR CMAKE_GENERATOR STREQUAL "Xcode")
endif("${CMAKE_LIBRARY_OUTPUT_DIRECTORY}" STREQUAL "")

# enable global link time optimization
cmake_policy(SET CMP0069 NEW)
set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)
include(CheckIPOSupported)
check_ipo_supported(RESULT ipo_supported OUTPUT ipo_supported_output)
if(ipo_supported)
    set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
endif()

# The extension target is called pyncnn but the produced module is named "ncnn".
include_directories(${pybind11_INCLUDE_DIR} ${PYTHON_INCLUDE_DIRS})
pybind11_add_module(pyncnn src/main.cpp)
set_target_properties(pyncnn PROPERTIES OUTPUT_NAME "ncnn")
target_link_libraries(pyncnn PUBLIC ncnn)
set_target_properties(pyncnn PROPERTIES PREFIX "" LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/ncnn")
set_property(TARGET pyncnn PROPERTY FOLDER "python")
# After the build, copy the module next to the Python package sources.
if("${CMAKE_LIBRARY_OUTPUT_DIRECTORY}" STREQUAL "")
    add_custom_command(TARGET pyncnn POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/ncnn/ncnn${PYTHON_MODULE_PREFIX}${PYTHON_MODULE_EXTENSION}
        ${PROJECT_SOURCE_DIR}/ncnn/ncnn${PYTHON_MODULE_PREFIX}${PYTHON_MODULE_EXTENSION})
endif("${CMAKE_LIBRARY_OUTPUT_DIRECTORY}" STREQUAL "")

# Generate setup.py from its template.
configure_file(setup.py.i ${PROJECT_SOURCE_DIR}/setup.py)


================================================
FILE: python/README.md
================================================
# ncnn
Python wrapper of ncnn built with [pybind11](https://github.com/pybind/pybind11). Only Python 3.x is supported.


Install from pip
==================

ncnn is available as wheel packages for macOS, Windows and Linux distributions, you can install with pip:

```
python -m pip install -U pip
python -m pip install -U ncnn
```

# Build from source

If you want to build ncnn with some options not as default, or just like to build everything yourself, it is not difficult to build ncnn from source.

## Prerequisites

**On Unix (Linux, OS X)**

* A compiler with C++11 support
* CMake >= 3.4

**On Mac**

* A compiler with C++11 support
* CMake >= 3.4

**On Windows**

* Visual Studio 2015 or higher
* CMake >= 3.4

##  Build & Install

1. clone ncnn and init submodule.

```bash
cd /pathto/ncnn
git submodule init && git submodule update
```

2. build and install.

```
python setup.py install
```

If you want to use a custom toolchain, you can install with the `CMAKE_TOOLCHAIN_FILE` environment variable, like this:

```
CMAKE_TOOLCHAIN_FILE="../../toolchains/power9le-linux-gnu-vsx.clang.toolchain.cmake" python setup.py install
```

If you want to enable Vulkan support, install as follows:

```
python setup.py install --vulkan=on
```

> **Attention:**
>
> To enable Vulkan support, you must first install the Vulkan SDK.
>
> **For Windows or Linux Users:**
>
> Ensure that the `VULKAN_SDK` environment variable is set to the path of the Vulkan SDK.
>
> **For MacOS Users:**
>
> On MacOS, you will need to specify additional environment variables. For guidance on setting these variables, please refer to lines 279-286 in the following file: [ncnn/.github/workflows/release-python.yml at master · Tencent/ncnn](https://github.com/Tencent/ncnn/blob/master/.github/workflows/release-python.yml).

## Custom-build & Install

1. clone ncnn and init submodule.
```bash
cd /pathto/ncnn
git submodule init && git submodule update
```
2. build.
```bash
mkdir build
cd build
cmake -DNCNN_PYTHON=ON ..
make
```

To use the pybind11 package provided by your system, set the CMake variable `NCNN_SYSTEM_PYBIND11` to `ON` during the build process, like this:

```bash
mkdir build
cd build
cmake -DNCNN_PYTHON=ON -DNCNN_SYSTEM_PYBIND11=ON ..
make
```

3. install

```bash
cd /pathto/ncnn
pip install .
```

If you use conda or miniconda, you can also install as follows:
```bash
cd /pathto/ncnn
python3 setup.py install
```

## Tests

**test**
```bash
cd /pathto/ncnn/python
python3 tests/test.py
```

**benchmark**

```bash
cd /pathto/ncnn/python
python3 tests/benchmark.py
```

## Numpy
**ncnn.Mat -> numpy.array, with no memory copy**

```python
mat = ncnn.Mat(...)
mat_np = np.array(mat)
```

**numpy.array -> ncnn.Mat, with no memory copy**
```python
mat_np = np.array(...)
mat = ncnn.Mat(mat_np)
```

# Model Zoo
Install the requirements:
```bash
pip install -r requirements.txt
```
Then you can import `ncnn.model_zoo` and get the model list as follows:
```python
import ncnn
import ncnn.model_zoo as model_zoo

print(model_zoo.get_model_list())
```
The models currently in the model zoo are listed below:
```bash
mobilenet_yolov2
mobilenetv2_yolov3
yolov4_tiny
yolov4
yolov5s
yolact
mobilenet_ssd
squeezenet_ssd
mobilenetv2_ssdlite
mobilenetv3_ssdlite
squeezenet
faster_rcnn
peleenet_ssd
retinaface
rfcn
shufflenetv2
simplepose
nanodet
```
Every model in the model zoo has an example in the ncnn/python/examples folder.

# Custom Layer

A custom layer demo can be found in ncnn/python/ncnn/model_zoo/yolov5.py:23.


================================================
FILE: python/examples/fasterrcnn.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
from ncnn.model_zoo import get_model
from ncnn.utils import draw_detection_objects

if __name__ == "__main__":
    # Exactly one argument is expected: the path of the image to process.
    if len(sys.argv) != 2:
        print("Usage: %s [imagepath]\n" % (sys.argv[0]))
        # non-zero status so shells and CI can detect the failed invocation
        sys.exit(1)

    imagepath = sys.argv[1]

    # a None result from cv2.imread is treated as a load failure
    m = cv2.imread(imagepath)
    if m is None:
        print("cv2.imread %s failed\n" % (imagepath))
        sys.exit(1)

    # Load the "faster_rcnn" model from the model zoo.
    net = get_model("faster_rcnn", num_threads=4, use_gpu=True)

    objects = net(m)

    draw_detection_objects(m, net.class_names, objects)


================================================
FILE: python/examples/mobilenetssd.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
from ncnn.model_zoo import get_model
from ncnn.utils import draw_detection_objects

if __name__ == "__main__":
    # Exactly one argument is expected: the path of the image to process.
    if len(sys.argv) != 2:
        print("Usage: %s [imagepath]\n" % (sys.argv[0]))
        # non-zero status so shells and CI can detect the failed invocation
        sys.exit(1)

    imagepath = sys.argv[1]

    # a None result from cv2.imread is treated as a load failure
    m = cv2.imread(imagepath)
    if m is None:
        print("cv2.imread %s failed\n" % (imagepath))
        sys.exit(1)

    # Load the "mobilenet_ssd" model from the model zoo.
    net = get_model("mobilenet_ssd", num_threads=4, use_gpu=True)

    objects = net(m)

    draw_detection_objects(m, net.class_names, objects)


================================================
FILE: python/examples/mobilenetv2ssdlite.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
from ncnn.model_zoo import get_model
from ncnn.utils import draw_detection_objects

if __name__ == "__main__":
    # Exactly one argument is expected: the path of the image to process.
    if len(sys.argv) != 2:
        print("Usage: %s [imagepath]\n" % (sys.argv[0]))
        # non-zero status so shells and CI can detect the failed invocation
        sys.exit(1)

    imagepath = sys.argv[1]

    # a None result from cv2.imread is treated as a load failure
    m = cv2.imread(imagepath)
    if m is None:
        print("cv2.imread %s failed\n" % (imagepath))
        sys.exit(1)

    # Load the "mobilenetv2_ssdlite" model from the model zoo.
    net = get_model("mobilenetv2_ssdlite", num_threads=4, use_gpu=True)

    objects = net(m)

    draw_detection_objects(m, net.class_names, objects)


================================================
FILE: python/examples/mobilenetv3ssdlite.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
from ncnn.model_zoo import get_model
from ncnn.utils import draw_detection_objects

if __name__ == "__main__":
    # Exactly one argument is expected: the path of the image to process.
    if len(sys.argv) != 2:
        print("Usage: %s [imagepath]\n" % (sys.argv[0]))
        # non-zero status so shells and CI can detect the failed invocation
        sys.exit(1)

    imagepath = sys.argv[1]

    # a None result from cv2.imread is treated as a load failure
    m = cv2.imread(imagepath)
    if m is None:
        print("cv2.imread %s failed\n" % (imagepath))
        sys.exit(1)

    # Load the "mobilenetv3_ssdlite" model from the model zoo.
    net = get_model("mobilenetv3_ssdlite", num_threads=4, use_gpu=True)

    objects = net(m)

    # NOTE(review): the extra 0.6 is presumably a minimum score for drawing;
    # confirm against ncnn.utils.draw_detection_objects.
    draw_detection_objects(m, net.class_names, objects, 0.6)


================================================
FILE: python/examples/model_zoo.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

from ncnn.model_zoo import get_model_list

if __name__ == "__main__":
    # Print whatever get_model_list() returns.
    available_models = get_model_list()
    print(available_models)


================================================
FILE: python/examples/nanodet.py
================================================
# Copyright 2021 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
from ncnn.model_zoo import get_model
from ncnn.utils import draw_detection_objects

if __name__ == "__main__":
    # Exactly one argument is expected: the path of the image to process.
    if len(sys.argv) != 2:
        print("Usage: %s [imagepath]\n" % (sys.argv[0]))
        # non-zero status so shells and CI can detect the failed invocation
        sys.exit(1)

    imagepath = sys.argv[1]

    # a None result from cv2.imread is treated as a load failure
    m = cv2.imread(imagepath)
    if m is None:
        print("cv2.imread %s failed\n" % (imagepath))
        sys.exit(1)

    # Load the "nanodet" model with explicit input size and thresholds.
    net = get_model(
        "nanodet",
        target_size=320,
        prob_threshold=0.4,
        nms_threshold=0.5,
        num_threads=4,
        use_gpu=True,
    )

    objects = net(m)

    draw_detection_objects(m, net.class_names, objects)


================================================
FILE: python/examples/peleenetssd.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
import numpy as np
from ncnn.model_zoo import get_model


def draw_detection_objects_seg(image, class_names, objects, mat_map):
    """Draw detections and a segmentation overlay on ``image``, then display it.

    Each object is printed, outlined with a rectangle and labelled with
    "<class> <score>%". Afterwards every pixel whose best ``mat_map`` channel
    score exceeds 0.45 is blended 50/50 with that channel's color, provided
    the channel has an entry in the local color table.

    Args:
        image: image to draw on; it is modified in place.
            NOTE(review): assumed to be a 3-channel BGR array with at least
            ``mat_map.h`` rows and ``mat_map.w`` columns -- confirm with the caller.
        class_names: sequence indexed by ``int(obj.label)``.
        objects: iterable of detections exposing ``label``, ``prob`` and
            ``rect`` (with ``x``, ``y``, ``w``, ``h``).
        mat_map: segmentation scores exposing ``w``, ``h``, ``c`` and
            convertible with ``np.array``.
    """
    # flat list of (b, g, r) triples, one triple per segmentation channel
    color = [128, 255, 128, 244, 35, 232]
    color_count = len(color)

    for obj in objects:
        print(
            "%d = %.5f at %.2f %.2f %.2f x %.2f\n"
            % (obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.w, obj.rect.h)
        )

        cv2.rectangle(
            image,
            (int(obj.rect.x), int(obj.rect.y)),
            (int(obj.rect.x + obj.rect.w), int(obj.rect.y + obj.rect.h)),
            (255, 0, 0),
        )

        text = "%s %.1f%%" % (class_names[int(obj.label)], obj.prob * 100)

        label_size, baseLine = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)

        # place the label above the box, shifted to stay inside the top/right edges
        x = obj.rect.x
        y = obj.rect.y - label_size[1] - baseLine
        if y < 0:
            y = 0
        if x + label_size[0] > image.shape[1]:
            x = image.shape[1] - label_size[0]

        cv2.rectangle(
            image,
            (int(x), int(y)),
            (int(x + label_size[0]), int(y + label_size[1] + baseLine)),
            (255, 255, 255),
            -1,
        )

        cv2.putText(
            image,
            text,
            (int(x), int(y + label_size[1])),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 0, 0),
        )

    width = mat_map.w
    height = mat_map.h
    size = mat_map.c
    img_index2 = 0
    threshold = 0.45
    ptr2 = np.array(mat_map)
    # Flatten every channel once up front; doing it per pixel copied the whole
    # channel for each (pixel, channel) pair. Skipped when there is no pixel to
    # visit, so ptr2 is then never indexed.
    if height > 0 and width > 0:
        flat_channels = [ptr2[c].flatten() for c in range(size)]
    else:
        flat_channels = []
    for i in range(height):
        ptr1 = image[i].flatten()
        img_index1 = 0
        for j in range(width):
            # pick the channel with the highest score above the threshold
            maxima = threshold
            index = -1
            for c in range(size):
                score = flat_channels[c][img_index2]
                if score > maxima:
                    maxima = score
                    index = c

            if index > -1:
                color_index = (index) * 3
                if color_index < color_count:
                    b = color[color_index]
                    g = color[color_index + 1]
                    r = color[color_index + 2]
                    ptr1[img_index1] = b / 2 + ptr1[img_index1] / 2
                    ptr1[img_index1 + 1] = g / 2 + ptr1[img_index1 + 1] / 2
                    ptr1[img_index1 + 2] = r / 2 + ptr1[img_index1 + 2] / 2

            # img_index1 walks the interleaved image row, img_index2 the score map
            img_index1 += 3
            img_index2 += 1

        image[i] = ptr1.reshape(image[i].shape)

    cv2.imshow("image", image)
    cv2.waitKey(0)


if __name__ == "__main__":
    # Exactly one argument is expected: the path of the image to process.
    if len(sys.argv) != 2:
        print("Usage: %s [imagepath]\n" % (sys.argv[0]))
        # non-zero status so shells and CI can detect the failed invocation
        sys.exit(1)

    imagepath = sys.argv[1]

    # a None result from cv2.imread is treated as a load failure
    m = cv2.imread(imagepath)
    if m is None:
        print("cv2.imread %s failed\n" % (imagepath))
        sys.exit(1)

    # Load the "peleenet_ssd" model from the model zoo.
    net = get_model("peleenet_ssd", num_threads=4, use_gpu=True)

    # this model returns the detections together with a segmentation map
    objects, seg_out = net(m)

    draw_detection_objects_seg(m, net.class_names, objects, seg_out)


================================================
FILE: python/examples/retinaface.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
from ncnn.model_zoo import get_model
from ncnn.utils import draw_faceobjects

if __name__ == "__main__":
    # Expect exactly one command-line argument: the image path.
    args = sys.argv
    if len(args) != 2:
        print("Usage: %s [imagepath]\n" % (args[0]))
        sys.exit(0)

    image_path = args[1]

    # A None result from cv2.imread is treated as a load failure.
    image = cv2.imread(image_path)
    if image is None:
        print("cv2.imread %s failed\n" % (image_path))
        sys.exit(0)

    face_detector = get_model("retinaface", num_threads=4, use_gpu=True)

    # Run the detector and draw its output on the input image.
    detected_faces = face_detector(image)
    draw_faceobjects(image, detected_faces)


================================================
FILE: python/examples/rfcn.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
from ncnn.model_zoo import get_model
from ncnn.utils import draw_detection_objects

if __name__ == "__main__":
    # Expect exactly one command-line argument: the image path.
    args = sys.argv
    if len(args) != 2:
        print("Usage: %s [imagepath]\n" % (args[0]))
        sys.exit(0)

    image_path = args[1]

    # A None result from cv2.imread is treated as a load failure.
    image = cv2.imread(image_path)
    if image is None:
        print("cv2.imread %s failed\n" % (image_path))
        sys.exit(0)

    detector = get_model("rfcn", num_threads=4, use_gpu=True)

    # Detect objects, then draw them using the model's own class names.
    detections = detector(image)
    draw_detection_objects(image, detector.class_names, detections)


================================================
FILE: python/examples/shufflenetv2.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
from ncnn.model_zoo import get_model
from ncnn.utils import print_topk

if __name__ == "__main__":
    # Expect exactly one command-line argument: the image path.
    args = sys.argv
    if len(args) != 2:
        print("Usage: %s [imagepath]\n" % (args[0]))
        sys.exit(0)

    image_path = args[1]

    # A None result from cv2.imread is treated as a load failure.
    image = cv2.imread(image_path)
    if image is None:
        print("cv2.imread %s failed\n" % (image_path))
        sys.exit(0)

    classifier = get_model("shufflenetv2", num_threads=4, use_gpu=True)

    # Classify the image and print the top 3 entries of the score vector.
    scores = classifier(image)
    print_topk(scores, 3)


================================================
FILE: python/examples/simplepose.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
from ncnn.model_zoo import get_model
from ncnn.utils import draw_pose

if __name__ == "__main__":
    # Expect exactly one command-line argument: the image path.
    args = sys.argv
    if len(args) != 2:
        print("Usage: %s [imagepath]\n" % (args[0]))
        sys.exit(0)

    image_path = args[1]

    # A None result from cv2.imread is treated as a load failure.
    image = cv2.imread(image_path)
    if image is None:
        print("cv2.imread %s failed\n" % (image_path))
        sys.exit(0)

    pose_net = get_model("simplepose", num_threads=4, use_gpu=True)

    # Run the model and draw the returned keypoints on the image.
    pose_keypoints = pose_net(image)
    draw_pose(image, pose_keypoints)


================================================
FILE: python/examples/squeezenet.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
from ncnn.model_zoo import get_model
from ncnn.utils import print_topk

if __name__ == "__main__":
    # Expect exactly one command-line argument: the image path.
    args = sys.argv
    if len(args) != 2:
        print("Usage: %s [imagepath]\n" % (args[0]))
        sys.exit(0)

    image_path = args[1]

    # A None result from cv2.imread is treated as a load failure.
    image = cv2.imread(image_path)
    if image is None:
        print("cv2.imread %s failed\n" % (image_path))
        sys.exit(0)

    classifier = get_model("squeezenet", num_threads=4, use_gpu=True)

    # Classify the image and print the top 5 entries of the score vector.
    scores = classifier(image)
    print_topk(scores, 5)


================================================
FILE: python/examples/squeezenetssd.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
from ncnn.model_zoo import get_model
from ncnn.utils import draw_detection_objects

if __name__ == "__main__":
    # Expect exactly one command-line argument: the image path.
    args = sys.argv
    if len(args) != 2:
        print("Usage: %s [imagepath]\n" % (args[0]))
        sys.exit(0)

    image_path = args[1]

    # A None result from cv2.imread is treated as a load failure.
    image = cv2.imread(image_path)
    if image is None:
        print("cv2.imread %s failed\n" % (image_path))
        sys.exit(0)

    detector = get_model("squeezenet_ssd", num_threads=4, use_gpu=True)

    # Detect objects, then draw them using the model's own class names.
    detections = detector(image)
    draw_detection_objects(image, detector.class_names, detections)


================================================
FILE: python/examples/yolact.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
import numpy as np
from ncnn.model_zoo import get_model
from ncnn.utils import draw_detection_objects


def draw_result(image, class_names, boxes, masks, classes, scores):
    """Draw YOLACT detections on ``image`` and show the result in a window.

    For every detection whose score is at least 0.15 this prints one summary
    line, draws the bounding box and a text label, and blends a palette
    color into the pixels selected by the detection's mask. ``image`` is
    modified in place. The function then blocks in ``cv2.waitKey(0)``.

    Args:
        image: image array to draw on; ``image.shape[1]`` is used as its width.
        class_names: sequence mapping ``int(label)`` to a display name.
        boxes: per-detection boxes, indexed as ``[x, y, w, h]``.
        masks: per-detection index objects applied as ``image[mask]``
            (presumably boolean arrays with the image's height and width;
            confirm against the model's output).
        classes: per-detection class labels.
        scores: per-detection confidence scores.
    """
    colors = [
        [56, 0, 255],
        [226, 255, 0],
        [0, 94, 255],
        [0, 37, 255],
        [0, 255, 94],
        [255, 226, 0],
        [0, 18, 255],
        [255, 151, 0],
        [170, 0, 255],
        [0, 255, 56],
        [255, 0, 75],
        [0, 75, 255],
        [0, 255, 169],
        [255, 0, 207],
        [75, 255, 0],
        [207, 0, 255],
        [37, 0, 255],
        [0, 207, 255],
        [94, 0, 255],
        [0, 255, 113],
        [255, 18, 0],
        [255, 0, 56],
        [18, 0, 255],
        [0, 255, 226],
        [170, 255, 0],
        [255, 0, 245],
        [151, 255, 0],
        [132, 255, 0],
        [75, 0, 255],
        [151, 0, 255],
        [0, 151, 255],
        [132, 0, 255],
        [0, 255, 245],
        [255, 132, 0],
        [226, 0, 255],
        [255, 37, 0],
        [207, 255, 0],
        [0, 255, 207],
        [94, 255, 0],
        [0, 226, 255],
        [56, 255, 0],
        [255, 94, 0],
        [255, 113, 0],
        [0, 132, 255],
        [255, 0, 132],
        [255, 170, 0],
        [255, 0, 188],
        [113, 255, 0],
        [245, 0, 255],
        [113, 0, 255],
        [255, 188, 0],
        [0, 113, 255],
        [255, 0, 0],
        [0, 56, 255],
        [255, 0, 113],
        [0, 255, 188],
        [255, 0, 94],
        [255, 0, 18],
        [18, 255, 0],
        [0, 255, 132],
        [0, 188, 255],
        [0, 245, 255],
        [0, 169, 255],
        [37, 255, 0],
        [255, 0, 151],
        [188, 0, 255],
        [0, 255, 37],
        [0, 255, 0],
        [255, 0, 170],
        [255, 0, 37],
        [255, 75, 0],
        [0, 0, 255],
        [255, 207, 0],
        [255, 0, 226],
        [255, 245, 0],
        [188, 255, 0],
        [0, 255, 18],
        [0, 255, 75],
        [0, 255, 151],
        [255, 56, 0],
        [245, 255, 0],
    ]

    color_index = 0

    for box, mask, label, score in zip(boxes, masks, classes, scores):
        # Low-confidence detections are not drawn at all.
        if score < 0.15:
            continue

        print(
            "%s = %.5f at %.2f %.2f %.2f x %.2f\n"
            % (label, score, box[0], box[1], box[2], box[3])
        )

        cv2.rectangle(
            image,
            (int(box[0]), int(box[1])),
            (int(box[0] + box[2]), int(box[1] + box[3])),
            (255, 0, 0),
        )

        text = "%s %.1f%%" % (class_names[int(label)], score * 100)

        label_size, baseLine = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)

        # Place the label above the box, pushed back inside the image when it
        # would stick out at the top or the right edge.
        x = box[0]
        y = box[1] - label_size[1] - baseLine
        if y < 0:
            y = 0
        if x + label_size[0] > image.shape[1]:
            x = image.shape[1] - label_size[0]

        # Filled white background behind the label text.
        cv2.rectangle(
            image,
            (int(x), int(y)),
            (int(x + label_size[0]), int(y + label_size[1] + baseLine)),
            (255, 255, 255),
            -1,
        )

        cv2.putText(
            image,
            text,
            (int(x), int(y + label_size[1])),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 0, 0),
        )

        # Wrap around the palette: there can be more drawn detections than
        # palette entries, and a plain index would then raise IndexError.
        color = colors[color_index % len(colors)]
        image[mask] = image[mask] * 0.5 + np.array(color) * 0.5
        color_index += 1

    cv2.imshow("image", image)
    cv2.waitKey(0)


if __name__ == "__main__":
    # Expect exactly one command-line argument: the image path.
    args = sys.argv
    if len(args) != 2:
        print("Usage: %s [imagepath]" % (args[0]))
        sys.exit(0)

    image_path = args[1]

    # A None result from cv2.imread is treated as a load failure.
    image = cv2.imread(image_path)
    if image is None:
        print("cv2.imread %s failed\n" % (image_path))
        sys.exit(0)

    # Model configuration, kept in one place and passed as keyword arguments.
    model_options = dict(
        target_size=550,
        confidence_threshold=0.05,
        nms_threshold=0.5,
        keep_top_k=200,
        num_threads=4,
        use_gpu=True,
    )
    segmenter = get_model("yolact", **model_options)

    # The model returns four parallel sequences, one entry per detection.
    boxes, masks, classes, scores = segmenter(image)
    draw_result(image, segmenter.class_names, boxes, masks, classes, scores)


================================================
FILE: python/examples/yolov2.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
from ncnn.model_zoo import get_model
from ncnn.utils import draw_detection_objects

if __name__ == "__main__":
    # Expect exactly one command-line argument: the image path.
    args = sys.argv
    if len(args) != 2:
        print("Usage: %s [imagepath]\n" % (args[0]))
        sys.exit(0)

    image_path = args[1]

    # A None result from cv2.imread is treated as a load failure.
    image = cv2.imread(image_path)
    if image is None:
        print("cv2.imread %s failed\n" % (image_path))
        sys.exit(0)

    detector = get_model("mobilenet_yolov2", num_threads=4, use_gpu=True)

    # Detect objects, then draw them using the model's own class names.
    detections = detector(image)
    draw_detection_objects(image, detector.class_names, detections)


================================================
FILE: python/examples/yolov3.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
from ncnn.model_zoo import get_model
from ncnn.utils import draw_detection_objects

if __name__ == "__main__":
    # Expect exactly one command-line argument: the image path.
    args = sys.argv
    if len(args) != 2:
        print("Usage: %s [imagepath]\n" % (args[0]))
        sys.exit(0)

    image_path = args[1]

    # A None result from cv2.imread is treated as a load failure.
    image = cv2.imread(image_path)
    if image is None:
        print("cv2.imread %s failed\n" % (image_path))
        sys.exit(0)

    detector = get_model("mobilenetv2_yolov3", num_threads=4, use_gpu=True)

    # Detect objects, then draw them using the model's own class names.
    detections = detector(image)
    draw_detection_objects(image, detector.class_names, detections)


================================================
FILE: python/examples/yolov4.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
from ncnn.model_zoo import get_model
from ncnn.utils import draw_detection_objects

if __name__ == "__main__":
    # One argument is required: either an image path or a capture device
    # path containing "/dev/video".
    if len(sys.argv) != 2:
        print("Usage: %s [v4l input device or image]\n" % (sys.argv[0]))
        sys.exit(0)

    devicepath = sys.argv[1]

    net = get_model("yolov4_tiny", num_threads=4, use_gpu=True)
    # net = get_model("yolov4", num_threads=4, use_gpu=True)

    if devicepath.find("/dev/video") == -1:
        # Single-image mode.
        m = cv2.imread(devicepath)
        if m is None:
            print("cv2.imread %s failed\n" % (devicepath))
            sys.exit(0)

        objects = net(m)

        draw_detection_objects(m, net.class_names, objects)
    else:
        # Capture-device mode: run detection on every frame until a read fails.
        cap = cv2.VideoCapture(devicepath)

        if not cap.isOpened():
            print("Failed to open %s" % (devicepath))
            sys.exit(0)

        try:
            while True:
                ret, frame = cap.read()

                # A failed read yields no usable frame; stop instead of
                # handing None to the network.
                if not ret or frame is None:
                    print("Failed to read frame from %s" % (devicepath))
                    break

                objects = net(frame)

                draw_detection_objects(frame, net.class_names, objects)
        finally:
            # Always give the device back, including on Ctrl-C or an error.
            cap.release()


================================================
FILE: python/examples/yolov5.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
from ncnn.model_zoo import get_model
from ncnn.utils import draw_detection_objects

if __name__ == "__main__":
    # Expect exactly one command-line argument: the image path.
    args = sys.argv
    if len(args) != 2:
        print("Usage: %s [imagepath]\n" % (args[0]))
        sys.exit(0)

    image_path = args[1]

    # A None result from cv2.imread is treated as a load failure.
    image = cv2.imread(image_path)
    if image is None:
        print("cv2.imread %s failed\n" % (image_path))
        sys.exit(0)

    # Model configuration, kept in one place and passed as keyword arguments.
    model_options = dict(
        target_size=640,
        prob_threshold=0.25,
        nms_threshold=0.45,
        num_threads=4,
        use_gpu=True,
    )
    detector = get_model("yolov5s", **model_options)

    # Detect objects, then draw them using the model's own class names.
    detections = detector(image)
    draw_detection_objects(image, detector.class_names, detections)


================================================
FILE: python/examples/yolov8.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import cv2
from ncnn.model_zoo import get_model
from ncnn.utils import draw_detection_objects

if __name__ == "__main__":
    # Expect exactly one command-line argument: the image path.
    args = sys.argv
    if len(args) != 2:
        print("Usage: %s [imagepath]\n" % (args[0]))
        sys.exit(0)

    image_path = args[1]

    # A None result from cv2.imread is treated as a load failure.
    image = cv2.imread(image_path)
    if image is None:
        print("cv2.imread %s failed\n" % (image_path))
        sys.exit(0)

    # Model configuration, kept in one place and passed as keyword arguments.
    model_options = dict(
        target_size=640,
        prob_threshold=0.25,
        nms_threshold=0.45,
        num_threads=4,
        use_gpu=True,
    )
    detector = get_model("yolov8s", **model_options)

    # Detect objects, then draw them using the model's own class names.
    detections = detector(image)
    draw_detection_objects(image, detector.class_names, detections)


================================================
FILE: python/ncnn/__init__.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

# Re-export the public names of the `.ncnn` submodule at package level.
from .ncnn import *

# NOTE(review): `ncnn` is not imported by name in this file. It presumably
# resolves to the `.ncnn` submodule, which the import above binds on this
# package as a side effect (or to a name pulled in by the star import).
# Confirm before reordering or replacing the import above.
__version__ = ncnn.__version__


================================================
FILE: python/ncnn/model_zoo/__init__.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

# coding: utf-8
"""Predefined and pretrained models."""

from . import model_store

from .model_zoo import get_model, get_model_list


================================================
FILE: python/ncnn/model_zoo/fasterrcnn.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
import ncnn
from .model_store import get_model_file
from ..utils.objects import Detect_Object


class Faster_RCNN:
    """Faster R-CNN object detector (ZF model, PASCAL VOC classes) on ncnn.

    Calling an instance with a BGR image returns a list of ``Detect_Object``
    sorted by descending score and truncated to ``max_per_image`` entries.

    Args:
        img_width: target size used when the image is narrower than tall.
        img_height: target size used otherwise.
        num_threads: value assigned to ``net.opt.num_threads``.
        use_gpu: value assigned to ``net.opt.use_vulkan_compute``.
        max_per_image: maximum number of detections returned per call.
        confidence_thresh: candidates scoring at or below this are dropped.
        nms_threshold: IoU above which a lower-scored box is suppressed.
    """

    def __init__(
        self,
        img_width=600,
        img_height=600,
        num_threads=1,
        use_gpu=False,
        max_per_image=100,
        confidence_thresh=0.05,
        nms_threshold=0.3,
    ):
        self.img_width = img_width
        self.img_height = img_height
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # Only mean subtraction is applied; no per-channel scaling.
        self.mean_vals = [102.9801, 115.9465, 122.7717]
        self.norm_vals = []

        self.net = ncnn.Net()
        self.net.opt.num_threads = self.num_threads
        self.net.opt.use_vulkan_compute = self.use_gpu

        # original pretrained model from https://github.com/rbgirshick/py-faster-rcnn
        # py-faster-rcnn/models/pascal_voc/ZF/faster_rcnn_alt_opt/faster_rcnn_test.pt
        # https://dl.dropboxusercontent.com/s/o6ii098bu51d139/faster_rcnn_models.tgz?dl=0
        # ZF_faster_rcnn_final.caffemodel
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("ZF_faster_rcnn_final.param"))
        self.net.load_model(get_model_file("ZF_faster_rcnn_final.bin"))

        self.max_per_image = max_per_image
        self.confidence_thresh = confidence_thresh
        self.nms_threshold = nms_threshold

        # Index 0 is the background class, which is never reported.
        self.class_names = [
            "background",
            "aeroplane",
            "bicycle",
            "bird",
            "boat",
            "bottle",
            "bus",
            "car",
            "cat",
            "chair",
            "cow",
            "diningtable",
            "dog",
            "horse",
            "motorbike",
            "person",
            "pottedplant",
            "sheep",
            "sofa",
            "train",
            "tvmonitor",
        ]

    def __del__(self):
        # Drop the reference to the network when the wrapper is collected.
        self.net = None

    def __call__(self, img):
        """Detect objects in ``img`` and return them best-first.

        Args:
            img: image array read as BGR pixels; ``img.shape[0]`` and
                ``img.shape[1]`` are used as its height and width.

        Returns:
            list of ``Detect_Object`` in original-image coordinates, sorted
            by descending ``prob``, at most ``max_per_image`` long.
        """
        # scale to target detect size: the shorter side is resized to the
        # configured size and the other side follows the same scale factor
        h = img.shape[0]
        w = img.shape[1]
        if w < h:
            scale = float(self.img_width) / w
            w = self.img_width
            h = int(h * scale)
        else:
            scale = float(self.img_height) / h
            h = self.img_height
            w = int(w * scale)

        mat_in = ncnn.Mat.from_pixels_resize(
            img, ncnn.Mat.PixelType.PIXEL_BGR, img.shape[1], img.shape[0], w, h
        )
        mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)

        # method 1 use numpy to Mat interface
        # im_info = ncnn.Mat(np.array([h, w, scale], dtype=np.float32))

        # method 2 use ncnn.Mat interface
        im_info = ncnn.Mat(3)
        im_info[0] = h
        im_info[1] = w
        im_info[2] = scale

        # step 1: extract the shared feature map and the region proposals
        ex1 = self.net.create_extractor()

        ex1.input("data", mat_in)
        ex1.input("im_info", im_info)

        _, conv5_relu5 = ex1.extract("conv5_relu5")
        _, rois = ex1.extract("rois")

        # step 2: classify and refine every proposal; class_candidates[k]
        # collects the candidate objects predicted as class k
        class_candidates = []
        for i in range(rois.c):
            ex2 = self.net.create_extractor()

            roi = rois.channel(i)  # get single roi
            ex2.input("conv5_relu5", conv5_relu5)
            ex2.input("rois", roi)

            _, bbox_pred = ex2.extract("bbox_pred")
            _, cls_prob = ex2.extract("cls_prob")

            num_class = cls_prob.w
            while len(class_candidates) < num_class:
                class_candidates.append([])

            # find class id with highest score
            label = 0
            score = 0.0
            for j in range(num_class):
                class_score = cls_prob[j]
                if class_score > score:
                    label = j
                    score = class_score

            # ignore background or low score
            if label == 0 or score <= self.confidence_thresh:
                continue

            # unscale to image size
            x1 = roi[0] / scale
            y1 = roi[1] / scale
            x2 = roi[2] / scale
            y2 = roi[3] / scale

            pb_w = x2 - x1 + 1
            pb_h = y2 - y1 + 1

            # apply bbox regression: four deltas per class, center offsets
            # relative to the proposal size and exponentiated size factors
            dx = bbox_pred[label * 4]
            dy = bbox_pred[label * 4 + 1]
            dw = bbox_pred[label * 4 + 2]
            dh = bbox_pred[label * 4 + 3]

            cx = x1 + pb_w * 0.5
            cy = y1 + pb_h * 0.5

            obj_cx = cx + pb_w * dx
            obj_cy = cy + pb_h * dy

            obj_w = pb_w * np.exp(dw)
            obj_h = pb_h * np.exp(dh)

            obj_x1 = obj_cx - obj_w * 0.5
            obj_y1 = obj_cy - obj_h * 0.5
            obj_x2 = obj_cx + obj_w * 0.5
            obj_y2 = obj_cy + obj_h * 0.5

            # clip to the original image
            obj_x1 = np.maximum(np.minimum(obj_x1, float(img.shape[1] - 1)), 0.0)
            obj_y1 = np.maximum(np.minimum(obj_y1, float(img.shape[0] - 1)), 0.0)
            obj_x2 = np.maximum(np.minimum(obj_x2, float(img.shape[1] - 1)), 0.0)
            obj_y2 = np.maximum(np.minimum(obj_y2, float(img.shape[0] - 1)), 0.0)

            # append object
            obj = Detect_Object()
            obj.rect.x = obj_x1
            obj.rect.y = obj_y1
            obj.rect.w = obj_x2 - obj_x1 + 1
            obj.rect.h = obj_y2 - obj_y1 + 1
            obj.label = label
            obj.prob = score

            class_candidates[label].append(obj)

        # post process: per-class NMS, then a global sort and truncation
        objects = []
        for candidates in class_candidates:
            if len(candidates) == 0:
                continue

            candidates.sort(key=lambda obj: obj.prob, reverse=True)

            picked = self.nms_sorted_bboxes(candidates, self.nms_threshold)

            objects.extend(candidates[z] for z in picked)

        objects.sort(key=lambda obj: obj.prob, reverse=True)

        objects = objects[: self.max_per_image]

        return objects

    def nms_sorted_bboxes(self, objects, nms_threshold):
        """Greedy non-maximum suppression over boxes already sorted best-first.

        Args:
            objects: sequence of items exposing ``rect.area()`` and
                ``rect.intersection_area(other_rect)``.
            nms_threshold: a box is dropped when its IoU with an already
                picked box is greater than this value.

        Returns:
            list of indices into ``objects`` that survive, in input order.
        """
        n = len(objects)

        areas = np.zeros((n,), dtype=np.float32)
        for i in range(n):
            areas[i] = objects[i].rect.area()

        picked = []
        for i in range(n):
            a = objects[i]

            keep = True
            for j in picked:
                b = objects[j]

                # intersection over union
                inter_area = a.rect.intersection_area(b.rect)
                union_area = areas[i] + areas[j] - inter_area
                # A zero union (degenerate boxes) cannot overlap anything;
                # skipping it also avoids a 0/0 division warning.
                if union_area > 0 and inter_area / union_area > nms_threshold:
                    keep = False
                    # One overlap is enough; no need to check the rest.
                    break

            if keep:
                picked.append(i)

        return picked


================================================
FILE: python/ncnn/model_zoo/mobilenetssd.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import ncnn
from .model_store import get_model_file
from ..utils.objects import Detect_Object


class MobileNet_SSD:
    """MobileNet-SSD object detector (PASCAL VOC classes) running on ncnn.

    Calling an instance with a BGR image returns one ``Detect_Object`` per
    row of the network's ``detection_out`` blob.
    """

    def __init__(self, target_size=300, num_threads=1, use_gpu=False):
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # Per-channel mean and scale applied to the resized input.
        self.mean_vals = [127.5, 127.5, 127.5]
        self.norm_vals = [0.007843, 0.007843, 0.007843]

        network = ncnn.Net()
        network.opt.num_threads = self.num_threads
        network.opt.use_vulkan_compute = self.use_gpu
        self.net = network

        # model is converted from https://github.com/chuanqi305/MobileNet-SSD
        # and can be downloaded from https://drive.google.com/open?id=0ByaKLD9QaPtucWk0Y0dha1VVY0U
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("mobilenet_ssd_voc_ncnn.param"))
        self.net.load_model(get_model_file("mobilenet_ssd_voc_ncnn.bin"))

        # fmt: off
        self.class_names = [
            "background", "aeroplane", "bicycle", "bird", "boat", "bottle",
            "bus", "car", "cat", "chair", "cow", "diningtable", "dog",
            "horse", "motorbike", "person", "pottedplant", "sheep", "sofa",
            "train", "tvmonitor",
        ]
        # fmt: on

    def __del__(self):
        # Drop the reference to the network when the wrapper is collected.
        self.net = None

    def __call__(self, img):
        """Run detection on ``img`` and return a list of ``Detect_Object``."""
        img_h, img_w = img.shape[0], img.shape[1]

        # Resize to a square network input, then normalize.
        side = self.target_size
        mat_in = ncnn.Mat.from_pixels_resize(
            img, ncnn.Mat.PixelType.PIXEL_BGR, img_w, img_h, side, side
        )
        mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)

        extractor = self.net.create_extractor()
        extractor.input("data", mat_in)
        _, mat_out = extractor.extract("detection_out")

        objects = []

        # printf("%d %d %d\n", mat_out.w, mat_out.h, mat_out.c)

        # method 1, use ncnn.Mat.row to get the result, no memory copy
        # Row layout as read here: [label, prob, x1, y1, x2, y2]; the four
        # coordinates are multiplied by the image width/height.
        for row_index in range(mat_out.h):
            values = mat_out.row(row_index)

            detection = Detect_Object()
            detection.label = values[0]
            detection.prob = values[1]
            detection.rect.x = values[2] * img_w
            detection.rect.y = values[3] * img_h
            detection.rect.w = values[4] * img_w - detection.rect.x
            detection.rect.h = values[5] * img_h - detection.rect.y

            objects.append(detection)

        """
        #method 2, use ncnn.Mat->numpy.array to get the result, no memory copy too
        out = np.array(mat_out)
        for i in range(len(out)):
            values = out[i]
            obj = Detect_Object()
            obj.label = values[0]
            obj.prob = values[1]
            obj.rect.x = values[2] * img_w
            obj.rect.y = values[3] * img_h
            obj.rect.w = values[4] * img_w - obj.rect.x
            obj.rect.h = values[5] * img_h - obj.rect.y
            objects.append(obj)
        """

        return objects


================================================
FILE: python/ncnn/model_zoo/mobilenetv2ssdlite.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import ncnn
from .model_store import get_model_file
from ..utils.objects import Detect_Object


class Noop(ncnn.Layer):
    """Layer subclass that adds nothing on top of ``ncnn.Layer``."""


def Noop_layer_creator():
    """Create and return a new ``Noop`` layer instance."""
    layer = Noop()
    return layer


class MobileNetV2_SSDLite:
    """MobileNetV2-SSDLite object detector (PASCAL VOC classes) on ncnn.

    Calling an instance with a BGR image returns one ``Detect_Object`` per
    row of the network's ``detection_out`` blob.
    """

    def __init__(self, target_size=300, num_threads=1, use_gpu=False):
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # Per-channel mean and scale applied to the resized input.
        self.mean_vals = [127.5, 127.5, 127.5]
        self.norm_vals = [0.007843, 0.007843, 0.007843]

        network = ncnn.Net()
        network.opt.num_threads = self.num_threads
        network.opt.use_vulkan_compute = self.use_gpu
        self.net = network
        # self.net.register_custom_layer("Silence", Noop_layer_creator)

        # original pretrained model from https://github.com/chuanqi305/MobileNetv2-SSDLite
        # https://github.com/chuanqi305/MobileNetv2-SSDLite/blob/master/ssdlite/voc/deploy.prototxt
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("mobilenetv2_ssdlite_voc.param"))
        self.net.load_model(get_model_file("mobilenetv2_ssdlite_voc.bin"))

        # fmt: off
        self.class_names = [
            "background", "aeroplane", "bicycle", "bird", "boat", "bottle",
            "bus", "car", "cat", "chair", "cow", "diningtable", "dog",
            "horse", "motorbike", "person", "pottedplant", "sheep", "sofa",
            "train", "tvmonitor",
        ]
        # fmt: on

    def __del__(self):
        # Drop the reference to the network when the wrapper is collected.
        self.net = None

    def __call__(self, img):
        """Run detection on ``img`` and return a list of ``Detect_Object``."""
        img_h, img_w = img.shape[0], img.shape[1]

        # Resize to a square network input, then normalize.
        side = self.target_size
        mat_in = ncnn.Mat.from_pixels_resize(
            img, ncnn.Mat.PixelType.PIXEL_BGR, img_w, img_h, side, side
        )
        mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)

        extractor = self.net.create_extractor()
        extractor.input("data", mat_in)
        _, mat_out = extractor.extract("detection_out")

        objects = []

        # printf("%d %d %d\n", mat_out.w, mat_out.h, mat_out.c)

        # method 1, use ncnn.Mat.row to get the result, no memory copy
        # Row layout as read here: [label, prob, x1, y1, x2, y2]; the four
        # coordinates are multiplied by the image width/height.
        for row_index in range(mat_out.h):
            values = mat_out.row(row_index)

            detection = Detect_Object()
            detection.label = values[0]
            detection.prob = values[1]
            detection.rect.x = values[2] * img_w
            detection.rect.y = values[3] * img_h
            detection.rect.w = values[4] * img_w - detection.rect.x
            detection.rect.h = values[5] * img_h - detection.rect.y

            objects.append(detection)

        """
        #method 2, use ncnn.Mat->numpy.array to get the result, no memory copy too
        out = np.array(mat_out)
        for i in range(len(out)):
            values = out[i]
            obj = Detect_Object()
            obj.label = values[0]
            obj.prob = values[1]
            obj.rect.x = values[2] * img_w
            obj.rect.y = values[3] * img_h
            obj.rect.w = values[4] * img_w - obj.rect.x
            obj.rect.h = values[5] * img_h - obj.rect.y
            objects.append(obj)
        """

        return objects


================================================
FILE: python/ncnn/model_zoo/mobilenetv3ssdlite.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
import ncnn
from .model_store import get_model_file
from ..utils.objects import Detect_Object


def clamp(v, lo, hi):
    """Limit ``v`` to the closed range ``[lo, hi]``.

    Returns ``lo`` when ``v < lo``, ``hi`` when ``hi < v``, and ``v`` itself
    otherwise (which includes values that compare false both ways, e.g. NaN).
    """
    return lo if v < lo else (hi if hi < v else v)


class MobileNetV3_SSDLite:
    """MobileNetV3-SSDLite object detector (PASCAL VOC classes) on ncnn.

    Calling an instance with a BGR image returns one ``Detect_Object`` per
    row of the network's ``detection_out`` blob, skipping rows whose box
    coordinates are NaN.
    """

    def __init__(self, target_size=300, num_threads=1, use_gpu=False):
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        self.mean_vals = [123.675, 116.28, 103.53]
        self.norm_vals = [1.0, 1.0, 1.0]

        network = ncnn.Net()
        network.opt.num_threads = self.num_threads
        network.opt.use_vulkan_compute = self.use_gpu
        self.net = network

        # converted ncnn model from https://github.com/ujsyehao/mobilenetv3-ssd
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("mobilenetv3_ssdlite_voc.param"))
        self.net.load_model(get_model_file("mobilenetv3_ssdlite_voc.bin"))

        # fmt: off
        self.class_names = [
            "background", "aeroplane", "bicycle", "bird", "boat", "bottle",
            "bus", "car", "cat", "chair", "cow", "diningtable", "dog",
            "horse", "motorbike", "person", "pottedplant", "sheep", "sofa",
            "train", "tvmonitor",
        ]
        # fmt: on

    def __del__(self):
        # Drop the reference to the network when the wrapper is collected.
        self.net = None

    def __call__(self, img):
        """Run detection on ``img`` and return a list of ``Detect_Object``."""
        img_h, img_w = img.shape[0], img.shape[1]

        # Resize to a square network input, converting BGR to RGB on the way.
        mat_in = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR2RGB,
            img_w,
            img_h,
            self.target_size,
            self.target_size,
        )
        # Two separate passes: scaling first, mean subtraction second.
        mat_in.substract_mean_normalize([], self.norm_vals)
        mat_in.substract_mean_normalize(self.mean_vals, [])

        extractor = self.net.create_extractor()
        extractor.input("input", mat_in)
        _, mat_out = extractor.extract("detection_out")

        def to_image_coord(value, image_extent):
            # Scale to network pixels, clamp into the network input, then
            # rescale to the original image extent.
            clamped = clamp(
                value * self.target_size, 0.0, float(self.target_size - 1)
            )
            return clamped / self.target_size * image_extent

        objects = []

        # printf("%d %d %d\n", mat_out.w, mat_out.h, mat_out.c)

        # method 1, use ncnn.Mat.row to get the result, no memory copy
        for row_index in range(mat_out.h):
            values = mat_out.row(row_index)

            detection = Detect_Object()
            detection.label = values[0]
            detection.prob = values[1]

            x1 = to_image_coord(values[2], img_w)
            y1 = to_image_coord(values[3], img_h)
            x2 = to_image_coord(values[4], img_w)
            y2 = to_image_coord(values[5], img_h)

            # NaN passes through clamp unchanged; such rows are discarded.
            if np.isnan(x1) or np.isnan(y1) or np.isnan(x2) or np.isnan(y2):
                continue

            detection.rect.x = x1
            detection.rect.y = y1
            detection.rect.w = x2 - x1
            detection.rect.h = y2 - y1

            objects.append(detection)

        """
        #method 2, use ncnn.Mat->numpy.array to get the result, no memory copy too
        out = np.array(mat_out)
        for i in range(len(out)):
            values = out[i]
            obj = Detect_Object()
            obj.label = values[0]
            obj.prob = values[1]

            x1 = clamp(values[2] * self.img_width, 0.0, float(self.img_width - 1)) / self.img_width * img_w
            y1 = clamp(values[3] * self.img_height, 0.0, float(self.img_height - 1)) / self.img_height * img_h
            x2 = clamp(values[4] * self.img_width, 0.0, float(self.img_width - 1)) / self.img_width * img_w
            y2 = clamp(values[5] * self.img_height, 0.0, float(self.img_height - 1)) / self.img_height * img_h

            obj.rect.x = x1
            obj.rect.y = y1
            obj.rect.w = x2 - x1
            obj.rect.h = y2 - y1

            objects.append(obj)
        """

        return objects


================================================
FILE: python/ncnn/model_zoo/model_store.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

"""Model store which provides pretrained models."""
from __future__ import print_function

__all__ = ["get_model_file", "purge"]

import os
import zipfile
import logging
import portalocker

from ..utils import download, check_sha1

# Expected SHA-1 digest of every downloadable model file, keyed by file name.
# The table is written as (checksum, file name) pairs and inverted into a
# {file name: checksum} dict by the comprehension below.
_model_sha1 = {
    name: checksum
    for checksum, name in [
        ("4ff279e78cdb0b8bbc9363181df6f094ad46dc36", "mobilenet_yolo.param"),
        ("1528cf08b9823fc01aaebfc932ec8c8d4a3b1613", "mobilenet_yolo.bin"),
        ("3f5b78b0c982f8bdf3a2c3a27e6136d4d2680e96", "mobilenetv2_yolov3.param"),
        ("0705b0f8fe5a77718561b9b7d6ed4f33fcd3d455", "mobilenetv2_yolov3.bin"),
        ("de59186323ebad5650631e12a6cc66b526ec7df4", "yolov4-tiny-opt.param"),
        ("1765c3b251c041dd6ac59d2ec3ddf7b983fe9ee9", "yolov4-tiny-opt.bin"),
        ("e92d3a3a8ac5e6a6c08c433aa2252b0680124328", "yolov4-opt.param"),
        ("69d128b42b70fb790e9d3ccabcf1b6e8cc2859fe", "yolov4-opt.bin"),
        ("6fa8ccc8cabc0f5633ab3c6ffa268e6042b8888f", "yolov5s.param"),
        ("0cbab3664deb090480ea748c1305f6fe850b9ac4", "yolov5s.bin"),
        ("35ab0c1ce2864e0759d5794aa818df2de3013ab3", "yolov7-tiny.param"),
        ("c0454f072b41997aa230c3fe1c1d504566574b6c", "yolov7-tiny.bin"),
        ("e9de3c929d1c93f7dc94ed0f125795ac16ecc120", "yolov8s.param"),
        ("90f4eb9e90086e2ec3af4c7837f00757e710b9c6", "yolov8s.bin"),
        ("e65bae7052d9e9b9d45e1214a8d1b5fe6f64e8af", "yolact.param"),
        ("9bda99f50b1c14c98c5c6bbc08d4f782eed66548", "yolact.bin"),
        ("3723ce3e312db6a102cff1a5c39dae80e1de658e", "mobilenet_ssd_voc_ncnn.param"),
        ("8e2d2139550dcbee1ce5e200b7697b25aab29656", "mobilenet_ssd_voc_ncnn.bin"),
        ("52c669821dc32ef5b7ab30749fa71a3bc27786b8", "squeezenet_ssd_voc.param"),
        ("347e31d1cbe469259fa8305860a7c24a95039202", "squeezenet_ssd_voc.bin"),
        ("52dab628ecac8137e61ce3aea1a912f9c5a0a638", "mobilenetv2_ssdlite_voc.param"),
        ("9fea06f74f7c60d753cf703ea992f92e50a986d4", "mobilenetv2_ssdlite_voc.bin"),
        ("f36661eff1eda1e36185e7f2f28fc722ad8b66bb", "mobilenetv3_ssdlite_voc.param"),
        ("908f63ca9bff0061a499512664b9c533a0b7f485", "mobilenetv3_ssdlite_voc.bin"),
        ("a63d779a1f789af976bc4e2eae86fdd9b0bb6c2c", "squeezenet_v1.1.param"),
        ("262f0e33e37aeac69021b5a3556664be65fc0aeb", "squeezenet_v1.1.bin"),
        ("3ba57cccd1d4a583f6eb76eae25a2dbda7ce7f74", "ZF_faster_rcnn_final.param"),
        ("1095fbb5f846a1f311b40941add5fef691acaf8d", "ZF_faster_rcnn_final.bin"),
        ("3586ec3d663b1cc8ec8c662768caa9c7fbcf4fdc", "pelee.param"),
        ("2442ad483dc546940271591b86db0d9c8b1c7118", "pelee.bin"),
        ("6cfeda08d5494a1274199089fda77c421be1ecac", "mnet.25-opt.param"),
        ("3ff9a51dc81cdf506a87543dbf752071ffc50b8d", "mnet.25-opt.bin"),
        ("50acebff393c91468a73a7b7c604ef231429d068", "rfcn_end2end.param"),
        ("9a68cd937959b4dda9c5bf9c99181cb0e40f266b", "rfcn_end2end.bin"),
        ("d6b289cda068e9a9d8a171fb909352a05a39a494", "shufflenet_v2_x0.5.param"),
        ("2ccd631d04a1b7e05483cd8a8def76bca7d330a8", "shufflenet_v2_x0.5.bin"),
        ("7c8f8d72c60aab6802985423686b36c61be2f68c", "pose.param"),
        ("7f691540972715298c611a3e595b20c59c2147ce", "pose.bin"),
        ("979d09942881cf1207a93cbfa9853005a434469b", "nanodet_m.param"),
        ("51d868905361e4ba9c45bd12e8a5608e7aadd1bd", "nanodet_m.bin"),
    ]
}


# Model files that are stored on the server as several "<name>.partNN" pieces,
# mapped to their number of pieces. get_model_file() downloads parts
# 01..N and concatenates them into the final file.
_split_model_bins = {
    "ZF_faster_rcnn_final.bin": 3,
    "rfcn_end2end.bin": 2,
    "yolov4-opt.bin": 7,
}


# Default base URL of the model repository; get_model_file() reads the
# NCNN_REPO environment variable first and falls back to this value.
github_repo_url = "https://github.com/nihui/ncnn-assets/raw/master/models/"
# Template used to build the download URL of one file from the base URL.
_url_format = "{repo_url}{file_name}"


def merge_file(root, files_in, file_out, remove=True):
    """Concatenate several files located in ``root`` into ``file_out``.

    Parameters
    ----------
    root : str
        Directory that contains the input files.
    files_in : iterable of str
        Input file names relative to ``root``, in the order they are appended.
    file_out : str
        Path of the merged output file; it is created or truncated.
    remove : bool, default True
        Delete each input file once its content has been copied.
    """
    chunk_size = 1024 * 1024
    with open(file_out, "wb") as fd_out:
        for file_in in files_in:
            file = os.path.join(root, file_in)
            with open(file, "rb") as fd_in:
                # Copy in bounded chunks so that large model parts are never
                # held in memory as a single bytes object.
                for chunk in iter(lambda: fd_in.read(chunk_size), b""):
                    fd_out.write(chunk)
            if remove:
                os.remove(file)


def short_hash(name):
    """Return the first eight characters of the SHA-1 registered for ``name``.

    Raises
    ------
    ValueError
        If ``name`` has no entry in the checksum table.
    """
    checksum = _model_sha1.get(name)
    if checksum is None:
        message = "Pretrained model for {name} is not available.".format(name=name)
        raise ValueError(message)
    return checksum[:8]


def get_model_file(name, tag=None, root=os.path.join("~", ".ncnn", "models")):
    r"""Return location for the pretrained on local file system.

    This function will download from online model zoo when model cannot be found or has mismatch.
    The root directory will be created if it doesn't exist.

    Parameters
    ----------
    name : str
        Name of the model file, e.g. ``"yolov5s.param"``.
    tag : str, optional
        When given, the file is looked up / downloaded as ``"<name>-<tag>"``
        and ``tag`` itself is passed to ``check_sha1`` as the expected hash.
        When omitted, the expected hash comes from the built-in checksum table.
    root : str, default '~/.ncnn/models'
        Location for keeping the model parameters. Overridden by
        ``$NCNN_HOME/models`` when the ``NCNN_HOME`` environment variable is set.

    Returns
    -------
    file_path
        Path to the requested pretrained model file.

    Raises
    ------
    KeyError
        If ``tag`` is not given and ``name`` is not in the checksum table.
    ValueError
        If the freshly downloaded file still fails the hash check.

    Notes
    -----
    ``NCNN_REPO`` overrides the download base URL and
    ``NCNN_MODEL_LOCK_TIMEOUT`` (seconds, default 300) bounds the wait for the
    per-file lock.
    """
    if "NCNN_HOME" in os.environ:
        root = os.path.join(os.environ["NCNN_HOME"], "models")
    root = os.path.expanduser(root)

    if isinstance(tag, str):
        file_name = "{name}-{short_hash}".format(name=name, short_hash=tag)
        sha1_hash = tag
    else:
        file_name = "{name}".format(name=name)
        sha1_hash = _model_sha1[name]

    params_path = os.path.join(root, file_name)
    lockfile = os.path.join(root, file_name + ".lock")

    # The directory is created before the lock can be taken, so several
    # processes may get here at once; exist_ok avoids the check-then-create race.
    os.makedirs(root, exist_ok=True)

    # Resolve the repository base URL once; endswith() also copes with an
    # empty NCNN_REPO value, where indexing [-1] would raise IndexError.
    repo_url = os.environ.get("NCNN_REPO", github_repo_url)
    if not repo_url.endswith("/"):
        repo_url = repo_url + "/"

    with portalocker.Lock(
        lockfile, timeout=int(os.environ.get("NCNN_MODEL_LOCK_TIMEOUT", 300))
    ):
        if os.path.exists(params_path):
            if check_sha1(params_path, sha1_hash):
                return params_path
            logging.warning(
                "Hash mismatch in the content of model file '%s' detected. "
                "Downloading again.",
                params_path,
            )
        else:
            logging.info("Model file not found. Downloading.")

        zip_file_path = os.path.join(root, file_name)
        if file_name in _split_model_bins:
            # Large files are published as numbered parts: fetch each part,
            # then concatenate them (merge_file deletes the parts afterwards).
            file_name_parts = [
                "%s.part%02d" % (file_name, i + 1)
                for i in range(_split_model_bins[file_name])
            ]
            for file_name_part in file_name_parts:
                download(
                    _url_format.format(repo_url=repo_url, file_name=file_name_part),
                    path=os.path.join(root, file_name_part),
                    overwrite=True,
                )
            merge_file(root, file_name_parts, zip_file_path)
        else:
            download(
                _url_format.format(repo_url=repo_url, file_name=file_name),
                path=zip_file_path,
                overwrite=True,
            )

        if zip_file_path.endswith(".zip"):
            with zipfile.ZipFile(zip_file_path) as zf:
                zf.extractall(root)
            os.remove(zip_file_path)

        # Make sure we write the model file on networked filesystems
        try:
            os.sync()
        except AttributeError:
            # os.sync is not available on every platform; best effort only.
            pass

        if check_sha1(params_path, sha1_hash):
            return params_path
        raise ValueError("Downloaded file has different hash. Please try again.")


def purge(root=os.path.join("~", ".ncnn", "models")):
    r"""Purge all pretrained model files in local file store.

    Model files in this store are ncnn ``*.param`` / ``*.bin`` files, so those
    are the suffixes that get removed (``*.params`` is still accepted for
    backward compatibility). Other files, such as lock files, are left alone.
    A store directory that does not exist is treated as already empty.

    Parameters
    ----------
    root : str, default '~/.ncnn/models'
        Location for keeping the model parameters.
    """
    root = os.path.expanduser(root)
    if not os.path.isdir(root):
        return

    model_suffixes = (".param", ".bin", ".params")
    for f in os.listdir(root):
        path = os.path.join(root, f)
        if f.endswith(model_suffixes) and os.path.isfile(path):
            os.remove(path)


================================================
FILE: python/ncnn/model_zoo/model_zoo.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

from .yolov2 import MobileNet_YoloV2
from .yolov3 import MobileNetV2_YoloV3
from .yolov4 import YoloV4_Tiny, YoloV4
from .yolov5 import YoloV5s
from .yolov7 import YoloV7_Tiny
from .yolov8 import YoloV8s
from .yolact import Yolact
from .mobilenetssd import MobileNet_SSD
from .squeezenetssd import SqueezeNet_SSD
from .mobilenetv2ssdlite import MobileNetV2_SSDLite
from .mobilenetv3ssdlite import MobileNetV3_SSDLite
from .squeezenet import SqueezeNet
from .fasterrcnn import Faster_RCNN
from .peleenetssd import PeleeNet_SSD
from .retinaface import RetinaFace
from .rfcn import RFCN
from .shufflenetv2 import ShuffleNetV2
from .simplepose import SimplePose
from .nanodet import NanoDet

__all__ = ["get_model", "get_model_list"]

# Registry of available models: lower-case public model name -> model class.
# get_model() instantiates from it and get_model_list() lists its keys.
_models = {
    "mobilenet_yolov2": MobileNet_YoloV2,
    "mobilenetv2_yolov3": MobileNetV2_YoloV3,
    "yolov4_tiny": YoloV4_Tiny,
    "yolov4": YoloV4,
    "yolov5s": YoloV5s,
    "yolov7_tiny": YoloV7_Tiny,
    "yolov8s": YoloV8s,
    "yolact": Yolact,
    "mobilenet_ssd": MobileNet_SSD,
    "squeezenet_ssd": SqueezeNet_SSD,
    "mobilenetv2_ssdlite": MobileNetV2_SSDLite,
    "mobilenetv3_ssdlite": MobileNetV3_SSDLite,
    "squeezenet": SqueezeNet,
    "faster_rcnn": Faster_RCNN,
    "peleenet_ssd": PeleeNet_SSD,
    "retinaface": RetinaFace,
    "rfcn": RFCN,
    "shufflenetv2": ShuffleNetV2,
    "simplepose": SimplePose,
    "nanodet": NanoDet,
}


def get_model(name, **kwargs):
    """Build the model registered under ``name``.

    The lookup is case-insensitive; ``kwargs`` are forwarded unchanged to the
    model class constructor. A ``ValueError`` listing every known model name is
    raised when ``name`` is not registered.
    """
    key = name.lower()
    if key in _models:
        model_cls = _models[key]
        return model_cls(**kwargs)

    known_names = "\n\t".join(sorted(_models.keys()))
    message = '"%s" is not among the following model list:\n\t' % (key)
    message += "%s" % (known_names)
    raise ValueError(message)


def get_model_list():
    """Return a new list holding the name of every registered model."""
    return [model_name for model_name in _models]


================================================
FILE: python/ncnn/model_zoo/nanodet.py
================================================
# Copyright 2021 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
import ncnn
from .model_store import get_model_file
from ..utils.objects import Detect_Object
from ..utils.functional import *


class NanoDet:
    """NanoDet-m object detector running on an ncnn network.

    Calling an instance with a BGR image returns a list of ``Detect_Object``
    whose rectangles are expressed in the coordinates of the input image.

    Parameters
    ----------
    target_size : int, default 320
        Length the longer image side is resized to before inference.
    prob_threshold : float, default 0.4
        Minimum class score for a candidate box to be kept.
    nms_threshold : float, default 0.3
        IoU threshold handed to the per-class non-maximum suppression.
    num_threads : int, default 1
        Value assigned to the network's ``opt.num_threads``.
    use_gpu : bool, default False
        Value assigned to the network's ``opt.use_vulkan_compute``.
    """

    def __init__(
        self,
        target_size=320,
        prob_threshold=0.4,
        nms_threshold=0.3,
        num_threads=1,
        use_gpu=False,
    ):
        self.target_size = target_size
        self.prob_threshold = prob_threshold
        self.nms_threshold = nms_threshold
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        self.mean_vals = [103.53, 116.28, 123.675]
        self.norm_vals = [0.017429, 0.017507, 0.017125]

        self.net = ncnn.Net()
        self.net.opt.use_vulkan_compute = self.use_gpu
        self.net.opt.num_threads = self.num_threads

        # original pretrained model from https://github.com/RangiLyu/nanodet
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("nanodet_m.param"))
        self.net.load_model(get_model_file("nanodet_m.bin"))

        # each box side is predicted as a distribution over reg_max + 1 bins
        self.reg_max = 7
        # down-sampling factor of the three output heads
        self.strides = [8, 16, 32]
        # number of best-scoring cells kept per head before NMS
        self.num_candidate = 1000
        # forwarded to nms() as top_k
        self.top_k = -1

        self.class_names = [
            "person",
            "bicycle",
            "car",
            "motorcycle",
            "airplane",
            "bus",
            "train",
            "truck",
            "boat",
            "traffic light",
            "fire hydrant",
            "stop sign",
            "parking meter",
            "bench",
            "bird",
            "cat",
            "dog",
            "horse",
            "sheep",
            "cow",
            "elephant",
            "bear",
            "zebra",
            "giraffe",
            "backpack",
            "umbrella",
            "handbag",
            "tie",
            "suitcase",
            "frisbee",
            "skis",
            "snowboard",
            "sports ball",
            "kite",
            "baseball bat",
            "baseball glove",
            "skateboard",
            "surfboard",
            "tennis racket",
            "bottle",
            "wine glass",
            "cup",
            "fork",
            "knife",
            "spoon",
            "bowl",
            "banana",
            "apple",
            "sandwich",
            "orange",
            "broccoli",
            "carrot",
            "hot dog",
            "pizza",
            "donut",
            "cake",
            "chair",
            "couch",
            "potted plant",
            "bed",
            "dining table",
            "toilet",
            "tv",
            "laptop",
            "mouse",
            "remote",
            "keyboard",
            "cell phone",
            "microwave",
            "oven",
            "toaster",
            "sink",
            "refrigerator",
            "book",
            "clock",
            "vase",
            "scissors",
            "teddy bear",
            "hair drier",
            "toothbrush",
        ]

    def __del__(self):
        self.net = None

    def __call__(self, img):
        """Detect objects in ``img``.

        Parameters
        ----------
        img : array
            Image passed to ncnn as BGR pixels; ``shape[0]`` is read as the
            height and ``shape[1]`` as the width.

        Returns
        -------
        list of Detect_Object
            Detections with boxes mapped back to, and clipped to, the input
            image. Empty when no candidate passes ``prob_threshold``.
        """
        img_w = img.shape[1]
        img_h = img.shape[0]

        # resize so that the longer side equals target_size, keeping the ratio
        w = img_w
        h = img_h
        if w > h:
            scale = float(self.target_size) / w
            w = self.target_size
            h = int(h * scale)
        else:
            scale = float(self.target_size) / h
            h = self.target_size
            w = int(w * scale)

        mat_in = ncnn.Mat.from_pixels_resize(
            img, ncnn.Mat.PixelType.PIXEL_BGR, img_w, img_h, w, h
        )

        # pad both sides up to the next multiple of 32
        wpad = (w + 31) // 32 * 32 - w
        hpad = (h + 31) // 32 * 32 - h
        pad_left = wpad // 2
        pad_top = hpad // 2
        mat_in_pad = ncnn.copy_make_border(
            mat_in,
            pad_top,
            hpad - pad_top,
            pad_left,
            wpad - pad_left,
            ncnn.BorderType.BORDER_CONSTANT,
            0,
        )

        mat_in_pad.substract_mean_normalize(self.mean_vals, self.norm_vals)

        ex = self.net.create_extractor()
        ex.input("input.1", mat_in_pad)

        # one (cells, 80) score matrix per stride
        score_out_name = ["792", "814", "836"]
        scores = [ex.extract(x)[1] for x in score_out_name]
        scores = [np.reshape(x, (-1, 80)) for x in scores]

        # one (cells, 4 * (reg_max + 1)) box-distribution matrix per stride
        boxes_out_name = ["795", "817", "839"]
        raw_boxes = [ex.extract(x)[1] for x in boxes_out_name]
        raw_boxes = [np.reshape(x, (-1, 32)) for x in raw_boxes]

        # generate centers
        decode_boxes = []
        select_scores = []
        for stride, box_distribute, score in zip(self.strides, raw_boxes, scores):
            # centers: score has one row per feature-map cell, so the number
            # of cells is score.shape[0] (shape[1] is the class count).
            if mat_in_pad.w > mat_in_pad.h:
                fm_w = mat_in_pad.w // stride
                fm_h = score.shape[0] // fm_w
            else:
                fm_h = mat_in_pad.h // stride
                fm_w = score.shape[0] // fm_h
            h_range = np.arange(fm_h)
            w_range = np.arange(fm_w)
            ww, hh = np.meshgrid(w_range, h_range)
            ct_row = (hh.flatten() + 0.5) * stride
            ct_col = (ww.flatten() + 0.5) * stride
            center = np.stack((ct_col, ct_row, ct_col, ct_row), axis=1)

            # box distribution to distance: expected bin index times stride
            reg_range = np.arange(self.reg_max + 1)
            box_distance = box_distribute.reshape((-1, self.reg_max + 1))
            box_distance = softmax(box_distance)
            box_distance = box_distance * np.expand_dims(reg_range, axis=0)
            box_distance = np.sum(box_distance, axis=1).reshape((-1, 4))
            box_distance = box_distance * stride

            # top K candidate, ranked by the best class score of each cell
            topk_idx = np.argsort(score.max(axis=1))[::-1]
            topk_idx = topk_idx[: self.num_candidate]
            center = center[topk_idx]
            score = score[topk_idx]
            box_distance = box_distance[topk_idx]

            # decode box: (left, top, right, bottom) distances around center
            decode_box = center + [-1, -1, 1, 1] * box_distance

            select_scores.append(score)
            decode_boxes.append(decode_box)

        # nms, run independently for every class
        bboxes = np.concatenate(decode_boxes, axis=0)
        confidences = np.concatenate(select_scores, axis=0)
        picked_box = []
        picked_probs = []
        picked_labels = []
        for class_index in range(0, confidences.shape[1]):
            probs = confidences[:, class_index]
            mask = probs > self.prob_threshold
            probs = probs[mask]
            if probs.shape[0] == 0:
                continue
            subset_boxes = bboxes[mask, :]
            picked = nms(
                subset_boxes,
                probs,
                iou_threshold=self.nms_threshold,
                top_k=self.top_k,
            )
            picked_box.append(subset_boxes[picked])
            picked_probs.append(probs[picked])
            picked_labels.extend([class_index] * len(picked))

        if not picked_box:
            return []

        picked_box = np.concatenate(picked_box)
        picked_probs = np.concatenate(picked_probs)

        # Map the corners back to the original image: remove exactly the
        # border that was added (integer pad_left / pad_top), undo the resize,
        # then clip both corners so width/height stay consistent with x/y.
        x0 = np.clip((picked_box[:, 0] - pad_left) / scale, 0.0, img_w - 1.0)
        y0 = np.clip((picked_box[:, 1] - pad_top) / scale, 0.0, img_h - 1.0)
        x1 = np.clip((picked_box[:, 2] - pad_left) / scale, 0.0, img_w - 1.0)
        y1 = np.clip((picked_box[:, 3] - pad_top) / scale, 0.0, img_h - 1.0)

        objects = [
            Detect_Object(label, score, left, top, right - left, bottom - top)
            for label, score, left, top, right, bottom in zip(
                picked_labels, picked_probs, x0, y0, x1, y1
            )
        ]

        return objects


================================================
FILE: python/ncnn/model_zoo/peleenetssd.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import ncnn
from .model_store import get_model_file
from ..utils.objects import Detect_Object


class PeleeNet_SSD:
    """PeleeNet-SSD detector that also returns a segmentation map.

    Calling an instance with a BGR image returns ``(objects, seg_map)``:
    a list of ``Detect_Object`` scaled to the input image, and the network's
    "sigmoid" output resized to the input image size.
    """

    def __init__(self, target_size=304, num_threads=1, use_gpu=False):
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # per-channel mean / scale applied to the resized input
        self.mean_vals = [103.9, 116.7, 123.6]
        self.norm_vals = [0.017, 0.017, 0.017]

        net = ncnn.Net()
        self.net = net
        net.opt.num_threads = self.num_threads
        net.opt.use_vulkan_compute = self.use_gpu

        # model is converted from https://github.com/eric612/MobileNet-YOLO
        # and can be downloaded from https://drive.google.com/open?id=1Wt6jKv13sBRMHgrGAJYlOlRF-o80pC0g
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        net.load_param(get_model_file("pelee.param"))
        net.load_model(get_model_file("pelee.bin"))

        self.class_names = [
            "background",
            "person",
            "rider",
            "car",
            "bus",
            "truck",
            "bike",
            "motor",
            "traffic light",
            "traffic sign",
            "train",
        ]

    def __del__(self):
        self.net = None

    def _row_to_object(self, row, img_w, img_h):
        # A detection row is read as (label, prob, x1, y1, x2, y2); the box
        # values are multiplied by the image size to get pixel coordinates.
        det = Detect_Object()
        det.label = row[0]
        det.prob = row[1]
        det.rect.x = row[2] * img_w
        det.rect.y = row[3] * img_h
        det.rect.w = row[4] * img_w - det.rect.x
        det.rect.h = row[5] * img_h - det.rect.y
        return det

    def __call__(self, img):
        img_h, img_w = img.shape[0], img.shape[1]

        # squash the image to target_size x target_size and normalize it
        mat_in = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR,
            img_w,
            img_h,
            self.target_size,
            self.target_size,
        )
        mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)

        extractor = self.net.create_extractor()
        extractor.input("data", mat_in)

        _, detections = extractor.extract("detection_out")

        # Rows are read one by one through ncnn.Mat.row. An alternative is to
        # convert the whole Mat with numpy (np.array(detections)) and iterate
        # over that array instead.
        objects = []
        for row_index in range(detections.h):
            objects.append(
                self._row_to_object(detections.row(row_index), img_w, img_h)
            )

        # segmentation output, resized back to the input image size
        _, seg_out = extractor.extract("sigmoid")
        resized = ncnn.Mat()
        ncnn.resize_bilinear(seg_out, resized, img_w, img_h)

        return objects, resized


================================================
FILE: python/ncnn/model_zoo/retinaface.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
import ncnn
from .model_store import get_model_file
from ..utils.objects import Point, Face_Object


class RetinaFace:
    """RetinaFace (mnet.25) face detector running on an ncnn network.

    Calling an instance with a BGR image returns a list of ``Face_Object``
    (box, five landmarks, score) clipped to the image.

    Parameters
    ----------
    prob_threshold : float, default 0.8
        Minimum face score for a proposal to be generated.
    nms_threshold : float, default 0.4
        IoU above which a lower-scored proposal is suppressed.
    num_threads : int, default 1
        Value assigned to the network's ``opt.num_threads``.
    use_gpu : bool, default False
        Value assigned to the network's ``opt.use_vulkan_compute``.
    """

    def __init__(
        self, prob_threshold=0.8, nms_threshold=0.4, num_threads=1, use_gpu=False
    ):
        self.prob_threshold = prob_threshold
        self.nms_threshold = nms_threshold
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        self.net = ncnn.Net()
        self.net.opt.num_threads = self.num_threads
        self.net.opt.use_vulkan_compute = self.use_gpu

        # model is converted from
        # https://github.com/deepinsight/insightface/tree/master/RetinaFace#retinaface-pretrained-models
        # https://github.com/deepinsight/insightface/issues/669
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("mnet.25-opt.param"))
        self.net.load_model(get_model_file("mnet.25-opt.bin"))

    def __del__(self):
        self.net = None

    def __call__(self, img):
        """Detect faces in ``img`` and return them sorted by descending score."""
        img_h = img.shape[0]
        img_w = img.shape[1]

        mat_in = ncnn.Mat.from_pixels(
            img, ncnn.Mat.PixelType.PIXEL_BGR2RGB, img_w, img_h
        )

        ex = self.net.create_extractor()

        ex.input("data", mat_in)

        faceobjects32 = self.detect_stride32(ex)
        faceobjects16 = self.detect_stride16(ex)
        faceobjects8 = self.detect_stride8(ex)

        faceproposals = [*faceobjects32, *faceobjects16, *faceobjects8]

        # sort all proposals by score from highest to lowest
        faceproposals.sort(key=lambda obj: obj.prob, reverse=True)

        # apply nms with nms_threshold
        picked = self.nms_sorted_bboxes(faceproposals, self.nms_threshold)

        faceobjects = []
        for index in picked:
            face = faceproposals[index]

            # clip to image size
            x0 = face.rect.x
            y0 = face.rect.y
            x1 = x0 + face.rect.w
            y1 = y0 + face.rect.h

            x0 = np.maximum(np.minimum(x0, float(img_w) - 1), 0.0)
            y0 = np.maximum(np.minimum(y0, float(img_h) - 1), 0.0)
            x1 = np.maximum(np.minimum(x1, float(img_w) - 1), 0.0)
            y1 = np.maximum(np.minimum(y1, float(img_h) - 1), 0.0)

            face.rect.x = x0
            face.rect.y = y0
            face.rect.w = x1 - x0
            face.rect.h = y1 - y0

            faceobjects.append(face)

        return faceobjects

    def _detect_stride(self, ex, feat_stride, anchor_scales):
        # Shared body of detect_stride32/16/8: extract the three blobs of one
        # stride, build its anchors and turn them into face proposals.
        suffix = "stride%d" % feat_stride
        _, score_blob = ex.extract("face_rpn_cls_prob_reshape_" + suffix)
        _, bbox_blob = ex.extract("face_rpn_bbox_pred_" + suffix)
        _, landmark_blob = ex.extract("face_rpn_landmark_pred_" + suffix)

        base_size = 16
        ratios = ncnn.Mat(1)
        ratios[0] = 1.0
        scales = ncnn.Mat(len(anchor_scales))
        for i, anchor_scale in enumerate(anchor_scales):
            scales[i] = anchor_scale
        anchors = self.generate_anchors(base_size, ratios, scales)

        return self.generate_proposals(
            anchors,
            feat_stride,
            score_blob,
            bbox_blob,
            landmark_blob,
            self.prob_threshold,
        )

    def detect_stride32(self, ex):
        """Return the face proposals of the stride-32 head (anchor scales 32, 16)."""
        return self._detect_stride(ex, 32, (32.0, 16.0))

    def detect_stride16(self, ex):
        """Return the face proposals of the stride-16 head (anchor scales 8, 4)."""
        return self._detect_stride(ex, 16, (8.0, 4.0))

    def detect_stride8(self, ex):
        """Return the face proposals of the stride-8 head (anchor scales 2, 1)."""
        return self._detect_stride(ex, 8, (2.0, 1.0))

    def generate_anchors(self, base_size, ratios, scales):
        """Build the base anchors for every (ratio, scale) combination.

        Parameters
        ----------
        base_size : number
            Side length of the reference square the anchors are centred on.
        ratios, scales : ncnn.Mat
            1-D mats; their ``w`` gives the number of ratios / scales.

        Returns
        -------
        ncnn.Mat
            Mat built from a ``(num_ratio * num_scale, 4)`` float32 array whose
            rows are ``(x0, y0, x1, y1)``.
        """
        num_ratio = ratios.w
        num_scale = scales.w

        # one row per (ratio, scale) pair instead of a fixed two rows
        anchors_np = np.zeros((num_ratio * num_scale, 4), dtype=np.float32)

        cx = base_size * 0.5
        cy = base_size * 0.5

        for i in range(num_ratio):
            ar = ratios[i]

            r_w = np.round(base_size / np.sqrt(ar))
            r_h = np.round(r_w * ar)  # round(base_size * np.sqrt(ar))

            for j in range(num_scale):
                scale = scales[j]

                rs_w = r_w * scale
                rs_h = r_h * scale

                anchor = anchors_np[i * num_scale + j]

                anchor[0] = cx - rs_w * 0.5
                anchor[1] = cy - rs_h * 0.5
                anchor[2] = cx + rs_w * 0.5
                anchor[3] = cy + rs_h * 0.5

        anchors = ncnn.Mat(anchors_np)
        return anchors

    def generate_proposals(
        self, anchors, feat_stride, score_blob, bbox_blob, landmark_blob, prob_threshold
    ):
        """Decode one stride's blobs into ``Face_Object`` proposals.

        Every anchor is shifted over the ``score_blob.h`` x ``score_blob.w``
        grid in steps of ``feat_stride``; cells whose score reaches
        ``prob_threshold`` are decoded into a box and five landmarks.
        """
        faceobjects = []

        w = score_blob.w
        h = score_blob.h

        # generate face proposal from bbox deltas and shifted anchors
        num_anchors = anchors.h

        for q in range(num_anchors):
            anchor = anchors.row(q)

            # NOTE(review): the score of anchor q is read from channel
            # q + num_anchors; presumably the first num_anchors channels hold
            # the background scores - confirm against the model.
            score = score_blob.channel(q + num_anchors)
            bbox = bbox_blob.channel_range(q * 4, 4)
            landmark = landmark_blob.channel_range(q * 10, 10)

            # Channel views are created once per anchor rather than once per
            # grid cell inside the inner loop.
            dx_ch = bbox.channel(0)
            dy_ch = bbox.channel(1)
            dw_ch = bbox.channel(2)
            dh_ch = bbox.channel(3)
            landmark_ch = [landmark.channel(c) for c in range(10)]

            # shifted anchor
            anchor_y = anchor[1]

            anchor_w = anchor[2] - anchor[0]
            anchor_h = anchor[3] - anchor[1]

            for i in range(h):
                anchor_x = anchor[0]

                for j in range(w):
                    index = i * w + j

                    prob = score[index]

                    if prob >= prob_threshold:
                        # apply center size
                        dx = dx_ch[index]
                        dy = dy_ch[index]
                        dw = dw_ch[index]
                        dh = dh_ch[index]

                        cx = anchor_x + anchor_w * 0.5
                        cy = anchor_y + anchor_h * 0.5

                        pb_cx = cx + anchor_w * dx
                        pb_cy = cy + anchor_h * dy

                        pb_w = anchor_w * np.exp(dw)
                        pb_h = anchor_h * np.exp(dh)

                        x0 = pb_cx - pb_w * 0.5
                        y0 = pb_cy - pb_h * 0.5
                        x1 = pb_cx + pb_w * 0.5
                        y1 = pb_cy + pb_h * 0.5

                        obj = Face_Object()
                        obj.rect.x = x0
                        obj.rect.y = y0
                        obj.rect.w = x1 - x0 + 1
                        obj.rect.h = y1 - y0 + 1
                        obj.landmark = [Point(), Point(), Point(), Point(), Point()]
                        # landmark channels are interleaved: x0, y0, x1, y1, ...
                        for k in range(5):
                            obj.landmark[k].x = (
                                cx + (anchor_w + 1) * landmark_ch[2 * k][index]
                            )
                            obj.landmark[k].y = (
                                cy + (anchor_h + 1) * landmark_ch[2 * k + 1][index]
                            )
                        obj.prob = prob

                        faceobjects.append(obj)

                    anchor_x += feat_stride

                anchor_y += feat_stride

        return faceobjects

    def nms_sorted_bboxes(self, faceobjects, nms_threshold):
        """Greedy non-maximum suppression over score-sorted proposals.

        Parameters
        ----------
        faceobjects : list
            Proposals sorted by descending score; each needs a ``rect`` with
            ``area()`` and ``intersection_area(other_rect)``.
        nms_threshold : float
            A proposal is dropped when its IoU with an already picked one is
            strictly greater than this value.

        Returns
        -------
        list of int
            Indices into ``faceobjects`` of the kept proposals, in order.
        """
        picked = []

        areas = [obj.rect.area() for obj in faceobjects]

        for i, a in enumerate(faceobjects):
            keep = True
            for j in picked:
                b = faceobjects[j]

                # intersection over union
                inter_area = a.rect.intersection_area(b.rect)
                union_area = areas[i] + areas[j] - inter_area
                # A zero union (two degenerate boxes) cannot overlap; the
                # guard avoids dividing by zero.
                if union_area > 0 and inter_area / union_area > nms_threshold:
                    keep = False
                    # already suppressed, no need to test the other picks
                    break

            if keep:
                picked.append(i)

        return picked


================================================
FILE: python/ncnn/model_zoo/rfcn.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
import ncnn
from .model_store import get_model_file
from ..utils.objects import Detect_Object


class RFCN:
    """R-FCN object detector running on an ncnn network.

    Calling an instance with an image returns a list of ``Detect_Object``
    sorted by descending probability and truncated to ``max_per_image``.
    ``class_names`` maps the integer ``label`` of a detection to a name
    (index 0 is the background class, which is never reported).
    """

    def __init__(
        self,
        target_size=224,
        max_per_image=100,
        confidence_thresh=0.6,
        nms_threshold=0.3,
        num_threads=1,
        use_gpu=False,
    ):
        """Create the network and load the ``rfcn_end2end`` model files.

        Args:
            target_size: length the shorter image side is resized to.
            max_per_image: maximum number of detections returned per call.
            confidence_thresh: a roi is kept only if its best class score is
                strictly greater than this value.
            nms_threshold: IoU above which a lower-scored box of the same
                class is suppressed.
            num_threads: value assigned to ``net.opt.num_threads``.
            use_gpu: value assigned to ``net.opt.use_vulkan_compute``.
        """
        self.target_size = target_size
        self.max_per_image = max_per_image
        self.confidence_thresh = confidence_thresh
        self.nms_threshold = nms_threshold
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # per-channel means subtracted from the input; no scaling is applied
        self.mean_vals = [102.9801, 115.9465, 122.7717]
        self.norm_vals = []

        self.net = ncnn.Net()
        self.net.opt.num_threads = self.num_threads
        self.net.opt.use_vulkan_compute = self.use_gpu

        # original pretrained model from https://github.com/YuwenXiong/py-R-FCN
        # https://github.com/YuwenXiong/py-R-FCN/blob/master/models/pascal_voc/ResNet-50/rfcn_end2end/test_agnostic.prototxt
        # https://1drv.ms/u/s!AoN7vygOjLIQqUWHpY67oaC7mopf
        # resnet50_rfcn_final.caffemodel
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("rfcn_end2end.param"))
        self.net.load_model(get_model_file("rfcn_end2end.bin"))

        self.class_names = [
            "background",
            "aeroplane",
            "bicycle",
            "bird",
            "boat",
            "bottle",
            "bus",
            "car",
            "cat",
            "chair",
            "cow",
            "diningtable",
            "dog",
            "horse",
            "motorbike",
            "person",
            "pottedplant",
            "sheep",
            "sofa",
            "train",
            "tvmonitor",
        ]

    def __del__(self):
        # drop the reference to the network when the wrapper is collected
        self.net = None

    def __call__(self, img):
        """Run detection on one image.

        Args:
            img: image array indexed as ``(rows, cols, ...)``; it is handed to
                ncnn as ``PIXEL_BGR`` data.

        Returns:
            list of ``Detect_Object`` in original-image pixel coordinates,
            sorted by descending ``prob``, at most ``max_per_image`` entries.
        """
        img_h = img.shape[0]
        img_w = img.shape[1]

        # Resize so that the shorter side equals target_size. The sizes are
        # truncated to integers once, so that the blob handed to the network
        # and the dimensions reported through im_info agree.
        if img_w < img_h:
            scale = float(self.target_size) / img_w
            in_w = int(self.target_size)
            in_h = int(img_h * scale)
        else:
            scale = float(self.target_size) / img_h
            in_h = int(self.target_size)
            in_w = int(img_w * scale)

        mat_in = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR,
            img_w,
            img_h,
            in_w,
            in_h,
        )
        mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)

        # im_info = (input height, input width, resize scale)
        im_info = ncnn.Mat(3)
        im_info[0] = in_h
        im_info[1] = in_w
        im_info[2] = scale

        # step1, extract feature and all rois
        ex1 = self.net.create_extractor()

        ex1.input("data", mat_in)
        ex1.input("im_info", im_info)

        ret1, rfcn_cls = ex1.extract("rfcn_cls")
        ret2, rfcn_bbox = ex1.extract("rfcn_bbox")
        ret3, rois = ex1.extract("rois")  # all rois

        # step2, extract bbox and score for each roi
        class_candidates = []
        for i in range(rois.c):
            ex2 = self.net.create_extractor()

            roi = rois.channel(i)  # get single roi
            ex2.input("rfcn_cls", rfcn_cls)
            ex2.input("rfcn_bbox", rfcn_bbox)
            ex2.input("rois", roi)

            ret1, bbox_pred = ex2.extract("bbox_pred")
            ret2, cls_prob = ex2.extract("cls_prob")

            # one candidate bucket per class, grown lazily
            num_class = cls_prob.w
            while len(class_candidates) < num_class:
                class_candidates.append([])

            # find class id with highest score
            label = 0
            score = 0.0
            for j in range(num_class):
                class_score = cls_prob[j]
                if class_score > score:
                    label = j
                    score = class_score

            # ignore background or low score
            if label == 0 or score <= self.confidence_thresh:
                continue

            # unscale to image size
            x1 = roi[0] / scale
            y1 = roi[1] / scale
            x2 = roi[2] / scale
            y2 = roi[3] / scale

            pb_w = x2 - x1 + 1
            pb_h = y2 - y1 + 1

            # apply bbox regression; the deltas are always read from entries
            # 4..7 of bbox_pred regardless of the winning label
            dx = bbox_pred[4]
            dy = bbox_pred[4 + 1]
            dw = bbox_pred[4 + 2]
            dh = bbox_pred[4 + 3]

            cx = x1 + pb_w * 0.5
            cy = y1 + pb_h * 0.5

            obj_cx = cx + pb_w * dx
            obj_cy = cy + pb_h * dy

            obj_w = pb_w * np.exp(dw)
            obj_h = pb_h * np.exp(dh)

            obj_x1 = obj_cx - obj_w * 0.5
            obj_y1 = obj_cy - obj_h * 0.5
            obj_x2 = obj_cx + obj_w * 0.5
            obj_y2 = obj_cy + obj_h * 0.5

            # clip to the original image
            obj_x1 = np.maximum(np.minimum(obj_x1, float(img_w - 1)), 0.0)
            obj_y1 = np.maximum(np.minimum(obj_y1, float(img_h - 1)), 0.0)
            obj_x2 = np.maximum(np.minimum(obj_x2, float(img_w - 1)), 0.0)
            obj_y2 = np.maximum(np.minimum(obj_y2, float(img_h - 1)), 0.0)

            # append object
            obj = Detect_Object()
            obj.rect.x = obj_x1
            obj.rect.y = obj_y1
            obj.rect.w = obj_x2 - obj_x1 + 1
            obj.rect.h = obj_y2 - obj_y1 + 1
            obj.label = label
            obj.prob = score

            class_candidates[label].append(obj)

        # post process: per-class NMS, then a global sort and truncation
        objects = []
        for candidates in class_candidates:
            if len(candidates) == 0:
                continue

            candidates.sort(key=lambda obj: obj.prob, reverse=True)

            picked = self.nms_sorted_bboxes(candidates, self.nms_threshold)
            objects.extend(candidates[z] for z in picked)

        objects.sort(key=lambda obj: obj.prob, reverse=True)

        return objects[: self.max_per_image]

    def nms_sorted_bboxes(self, objects, nms_threshold):
        """Greedy non-maximum suppression over an already ordered list.

        Objects are visited in the given order (``__call__`` passes them
        sorted by descending ``prob``); an object is kept only if its IoU with
        every previously kept object does not exceed ``nms_threshold``.

        Args:
            objects: sequence of objects whose ``rect`` attribute provides
                ``area()`` and ``intersection_area(other_rect)``.
            nms_threshold: IoU value above which an object is suppressed.

        Returns:
            list[int]: indices into ``objects`` of the kept entries.
        """
        n = len(objects)

        areas = np.zeros((n,), dtype=np.float32)
        for i in range(n):
            areas[i] = objects[i].rect.area()

        picked = []
        for i in range(n):
            a = objects[i]

            keep = True
            for j in picked:
                # intersection over union
                inter_area = a.rect.intersection_area(objects[j].rect)
                union_area = areas[i] + areas[j] - inter_area
                # A zero union (two degenerate boxes) has no defined IoU;
                # skip the division instead of producing NaN and a warning.
                if union_area > 0 and inter_area / union_area > nms_threshold:
                    keep = False
                    # one overlapping kept box is enough to suppress
                    break

            if keep:
                picked.append(i)

        return picked


================================================
FILE: python/ncnn/model_zoo/shufflenetv2.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
import ncnn
from .model_store import get_model_file


class ShuffleNetV2:
    """Image classifier wrapping the ``shufflenet_v2_x0.5`` ncnn model.

    Calling an instance with an image returns a 1-D numpy array holding the
    softmax of the network's ``fc`` output.
    """

    def __init__(self, target_size=224, num_threads=1, use_gpu=False):
        """Build the ncnn network and load the model files.

        Args:
            target_size: side length the input image is resized to (square).
            num_threads: value assigned to ``net.opt.num_threads``.
            use_gpu: value assigned to ``net.opt.use_vulkan_compute``.
        """
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # no mean subtraction, pixel values are only scaled by 1/255
        self.mean_vals = []
        self.norm_vals = [1 / 255.0] * 3

        net = ncnn.Net()
        self.net = net
        net.opt.num_threads = self.num_threads
        net.opt.use_vulkan_compute = self.use_gpu

        # https://github.com/miaow1988/ShuffleNet_V2_pytorch_caffe
        # models can be downloaded from https://github.com/miaow1988/ShuffleNet_V2_pytorch_caffe/releases
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        net.load_param(get_model_file("shufflenet_v2_x0.5.param"))
        net.load_model(get_model_file("shufflenet_v2_x0.5.bin"))

    def __del__(self):
        # release the reference to the network
        self.net = None

    def __call__(self, img):
        """Classify one image.

        Args:
            img: image array indexed as ``(rows, cols, ...)``; handed to ncnn
                as ``PIXEL_BGR`` data.

        Returns:
            numpy array of class scores after softmax, flattened to 1-D.
        """
        rows = img.shape[0]
        cols = img.shape[1]

        blob = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR,
            cols,
            rows,
            self.target_size,
            self.target_size,
        )
        blob.substract_mean_normalize(self.mean_vals, self.norm_vals)

        extractor = self.net.create_extractor()
        extractor.input("data", blob)
        _status, logits = extractor.extract("fc")

        # manually call softmax on the fc output
        # convert result into probability
        # skip if your model already has softmax operation
        softmax_layer = ncnn.create_layer("Softmax")
        softmax_layer.load_param(ncnn.ParamDict())
        softmax_layer.forward_inplace(logits, self.net.opt)

        # flatten whatever shape the layer produced into a single row
        total = logits.w * logits.h * logits.c
        flat = logits.reshape(total)

        return np.array(flat)


================================================
FILE: python/ncnn/model_zoo/simplepose.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import ncnn
from .model_store import get_model_file
from ..utils.objects import KeyPoint


class SimplePose:
    """Single-person pose estimator wrapping the ``pose`` ncnn model.

    Calling an instance with an image returns one ``KeyPoint`` per channel of
    the network's ``conv3_fwd`` output.
    """

    def __init__(
        self, target_width=192, target_height=256, num_threads=1, use_gpu=False
    ):
        """Build the ncnn network and load the model files.

        Args:
            target_width: width the input image is resized to.
            target_height: height the input image is resized to.
            num_threads: value assigned to ``net.opt.num_threads``.
            use_gpu: value assigned to ``net.opt.use_vulkan_compute``.
        """
        self.target_width = target_width
        self.target_height = target_height
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # per-channel mean / std expressed on the 0..255 pixel scale
        self.mean_vals = [m * 255.0 for m in (0.485, 0.456, 0.406)]
        self.norm_vals = [1 / s / 255.0 for s in (0.229, 0.224, 0.225)]

        net = ncnn.Net()
        self.net = net
        net.opt.num_threads = self.num_threads
        net.opt.use_vulkan_compute = self.use_gpu

        # the simple baseline human pose estimation from gluon-cv
        # https://gluon-cv.mxnet.io/build/examples_pose/demo_simple_pose.html
        # mxnet model exported via
        #      pose_net.hybridize()
        #      pose_net.export('pose')
        # then mxnet2ncnn
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        net.load_param(get_model_file("pose.param"))
        net.load_model(get_model_file("pose.bin"))

    def __del__(self):
        # release the reference to the network
        self.net = None

    def __call__(self, img):
        """Estimate keypoints for one image.

        Args:
            img: image array indexed as ``(rows, cols, ...)``; handed to ncnn
                with the ``PIXEL_BGR2RGB`` conversion.

        Returns:
            list of ``KeyPoint``; for each output channel, ``p`` is the
            location of the largest positive response rescaled to the input
            image size and ``prob`` is that response. If no value in a channel
            is greater than 0, the keypoint is (0, 0) with ``prob`` 0.0.
        """
        img_rows = img.shape[0]
        img_cols = img.shape[1]

        blob = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR2RGB,
            img_cols,
            img_rows,
            self.target_width,
            self.target_height,
        )
        blob.substract_mean_normalize(self.mean_vals, self.norm_vals)

        extractor = self.net.create_extractor()
        extractor.input("data", blob)
        _status, heatmaps = extractor.extract("conv3_fwd")

        keypoints = []
        for channel_idx in range(heatmaps.c):
            plane = heatmaps.channel(channel_idx)

            # scan the whole plane for its strongest response
            best_prob, best_x, best_y = 0.0, 0, 0
            for row_idx in range(heatmaps.h):
                row = plane.row(row_idx)
                for col_idx in range(heatmaps.w):
                    value = row[col_idx]
                    if value > best_prob:
                        best_prob, best_x, best_y = value, col_idx, row_idx

            # map heatmap coordinates back onto the input image
            keypoint = KeyPoint()
            keypoint.p.x = best_x * img_cols / float(heatmaps.w)
            keypoint.p.y = best_y * img_rows / float(heatmaps.h)
            keypoint.prob = best_prob
            keypoints.append(keypoint)

        return keypoints


================================================
FILE: python/ncnn/model_zoo/squeezenet.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
import ncnn
from .model_store import get_model_file


class SqueezeNet:
    """Image classifier wrapping the ``squeezenet_v1.1`` ncnn model.

    Calling an instance with an image returns the network's ``prob`` output
    converted to a numpy array.
    """

    def __init__(self, target_size=227, num_threads=1, use_gpu=False):
        """Build the ncnn network and load the model files.

        Args:
            target_size: side length the input image is resized to (square).
            num_threads: value assigned to ``net.opt.num_threads``.
            use_gpu: value assigned to ``net.opt.use_vulkan_compute``.
        """
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # per-channel means are subtracted; no scaling is applied
        self.mean_vals = [104.0, 117.0, 123.0]
        self.norm_vals = []

        net = ncnn.Net()
        self.net = net
        net.opt.num_threads = self.num_threads
        net.opt.use_vulkan_compute = self.use_gpu

        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        net.load_param(get_model_file("squeezenet_v1.1.param"))
        net.load_model(get_model_file("squeezenet_v1.1.bin"))

    def __del__(self):
        # release the reference to the network
        self.net = None

    def __call__(self, img):
        """Classify one image.

        Args:
            img: image array indexed as ``(rows, cols, ...)``; handed to ncnn
                as ``PIXEL_BGR`` data.

        Returns:
            numpy array built from the ``prob`` output blob.
        """
        rows = img.shape[0]
        cols = img.shape[1]

        blob = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR,
            cols,
            rows,
            self.target_size,
            self.target_size,
        )
        blob.substract_mean_normalize(self.mean_vals, self.norm_vals)

        extractor = self.net.create_extractor()
        extractor.input("data", blob)
        _status, probabilities = extractor.extract("prob")

        return np.array(probabilities)


================================================
FILE: python/ncnn/model_zoo/squeezenetssd.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import ncnn
from .model_store import get_model_file
from ..utils.objects import Detect_Object


class SqueezeNet_SSD:
    """SSD object detector wrapping the ``squeezenet_ssd_voc`` ncnn model.

    Calling an instance with an image returns one ``Detect_Object`` per row
    of the network's ``detection_out`` blob. ``class_names`` lists the label
    names, with ``"background"`` at index 0.
    """

    def __init__(self, target_size=300, num_threads=1, use_gpu=False):
        """Build the ncnn network and load the model files.

        Args:
            target_size: side length the input image is resized to (square).
            num_threads: value assigned to ``net.opt.num_threads``.
            use_gpu: value assigned to ``net.opt.use_vulkan_compute``.
        """
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # per-channel means are subtracted; no scaling is applied
        self.mean_vals = [104.0, 117.0, 123.0]
        self.norm_vals = []

        net = ncnn.Net()
        self.net = net
        net.opt.num_threads = self.num_threads
        net.opt.use_vulkan_compute = self.use_gpu

        # original pretrained model from https://github.com/chuanqi305/SqueezeNet-SSD
        # squeezenet_ssd_voc_deploy.prototxt
        # https://drive.google.com/open?id=0B3gersZ2cHIxdGpyZlZnbEQ5Snc
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        net.load_param(get_model_file("squeezenet_ssd_voc.param"))
        net.load_model(get_model_file("squeezenet_ssd_voc.bin"))

        self.class_names = [
            "background", "aeroplane", "bicycle", "bird", "boat",
            "bottle", "bus", "car", "cat", "chair",
            "cow", "diningtable", "dog", "horse", "motorbike",
            "person", "pottedplant", "sheep", "sofa", "train",
            "tvmonitor",
        ]

    def __del__(self):
        # release the reference to the network
        self.net = None

    def __call__(self, img):
        """Detect objects in one image.

        Args:
            img: image array indexed as ``(rows, cols, ...)``; handed to ncnn
                as ``PIXEL_BGR`` data.

        Returns:
            list of ``Detect_Object``. Each output row is read as
            ``[label, prob, c2, c3, c4, c5]``: ``c2``/``c4`` are multiplied by
            the image width and ``c3``/``c5`` by the image height
            (presumably normalized corner coordinates - confirm against the
            model), giving ``rect.x``/``rect.y`` and, after subtracting
            those, ``rect.w``/``rect.h``.
        """
        frame_h = img.shape[0]
        frame_w = img.shape[1]

        blob = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR,
            frame_w,
            frame_h,
            self.target_size,
            self.target_size,
        )
        blob.substract_mean_normalize(self.mean_vals, self.norm_vals)

        extractor = self.net.create_extractor()
        extractor.input("data", blob)
        _status, detections = extractor.extract("detection_out")

        # The rows are read through ncnn.Mat.row (the original author notes
        # this involves no memory copy). An equivalent alternative mentioned
        # by the original author is to convert the blob with
        # numpy.array(mat_out) and index the resulting rows the same way;
        # numpy is not imported in this module.
        objects = []
        for row_idx in range(detections.h):
            fields = detections.row(row_idx)

            det = Detect_Object()
            det.label = fields[0]
            det.prob = fields[1]
            det.rect.x = fields[2] * frame_w
            det.rect.y = fields[3] * frame_h
            # width/height are derived from the stored top-left corner
            det.rect.w = fields[4] * frame_w - det.rect.x
            det.rect.h = fields[5] * frame_h - det.rect.y
            objects.append(det)

        return objects


================================================
FILE: python/ncnn/model_zoo/yolact.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

from math import sqrt
import numpy as np
import cv2
import ncnn
from .model_store import get_model_file
from ..utils.functional import sigmoid, nms


class Yolact:
    """YOLACT instance segmentation running on an ncnn network.

    Calling an instance with an image returns ``(boxes, masks, classes,
    scores)``; see ``__call__``. ``class_names`` maps the returned class ids
    to names, with ``"background"`` at index 0.
    """

    def __init__(
        self,
        target_size=550,
        confidence_threshold=0.05,
        nms_threshold=0.5,
        keep_top_k=200,
        num_threads=1,
        use_gpu=False,
    ):
        """Build the ncnn network, load the model files and create the priors.

        Args:
            target_size: side length the input image is resized to (square).
            confidence_threshold: a prior is kept only if its best
                non-background score is strictly greater than this value.
            nms_threshold: IoU threshold passed to ``nms``.
            keep_top_k: maximum number of detections per class (passed to
                ``nms``) and in total.
            num_threads: value assigned to ``net.opt.num_threads``.
            use_gpu: value assigned to ``net.opt.use_vulkan_compute``.
        """
        self.target_size = target_size
        self.confidence_threshold = confidence_threshold
        self.nms_threshold = nms_threshold
        self.keep_top_k = keep_top_k
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        self.mean_vals = [123.68, 116.78, 103.94]
        self.norm_vals = [1.0 / 58.40, 1.0 / 57.12, 1.0 / 57.38]

        self.net = ncnn.Net()
        self.net.opt.use_vulkan_compute = self.use_gpu
        self.net.opt.num_threads = self.num_threads

        # original model converted from https://github.com/dbolya/yolact
        # yolact_resnet50_54_800000.pth
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("yolact.param"))
        self.net.load_model(get_model_file("yolact.bin"))

        # feature map sizes, anchor aspect ratios and anchor scales used to
        # generate the priors; one (conv_w, conv_h, scale) triple per level
        self.conv_ws = [69, 35, 18, 9, 5]
        self.conv_hs = [69, 35, 18, 9, 5]
        self.aspect_ratios = [1, 0.5, 2]
        self.scales = [24, 48, 96, 192, 384]

        # prior cache, filled by make_priors()
        self.priors = None
        self.last_img_size = None

        self.make_priors()

        self.class_names = [
            "background",
            "person",
            "bicycle",
            "car",
            "motorcycle",
            "airplane",
            "bus",
            "train",
            "truck",
            "boat",
            "traffic light",
            "fire hydrant",
            "stop sign",
            "parking meter",
            "bench",
            "bird",
            "cat",
            "dog",
            "horse",
            "sheep",
            "cow",
            "elephant",
            "bear",
            "zebra",
            "giraffe",
            "backpack",
            "umbrella",
            "handbag",
            "tie",
            "suitcase",
            "frisbee",
            "skis",
            "snowboard",
            "sports ball",
            "kite",
            "baseball bat",
            "baseball glove",
            "skateboard",
            "surfboard",
            "tennis racket",
            "bottle",
            "wine glass",
            "cup",
            "fork",
            "knife",
            "spoon",
            "bowl",
            "banana",
            "apple",
            "sandwich",
            "orange",
            "broccoli",
            "carrot",
            "hot dog",
            "pizza",
            "donut",
            "cake",
            "chair",
            "couch",
            "potted plant",
            "bed",
            "dining table",
            "toilet",
            "tv",
            "laptop",
            "mouse",
            "remote",
            "keyboard",
            "cell phone",
            "microwave",
            "oven",
            "toaster",
            "sink",
            "refrigerator",
            "book",
            "clock",
            "vase",
            "scissors",
            "teddy bear",
            "hair drier",
            "toothbrush",
        ]

    def __del__(self):
        # drop the reference to the network when the wrapper is collected
        self.net = None

    def __call__(self, img):
        """Run instance segmentation on one image.

        Args:
            img: image array indexed as ``(rows, cols, ...)``; handed to ncnn
                with the ``PIXEL_BGR2RGB`` conversion.

        Returns:
            tuple ``(boxes, masks, classes, scores)``:
            ``boxes`` is ``[num_dets, 4]`` as pixel ``(x, y, w, h)``,
            ``masks`` is a boolean array ``[num_dets, img_h, img_w]``,
            ``classes`` holds indices into ``class_names`` and ``scores`` the
            matching confidences. With no detections all four are empty.
        """
        img_h = img.shape[0]
        img_w = img.shape[1]

        mat_in = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR2RGB,
            img_w,
            img_h,
            self.target_size,
            self.target_size,
        )
        mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)

        ex = self.net.create_extractor()
        ex.input("input.1", mat_in)

        ret1, proto_data = ex.extract("619")  # 138x138 x 32
        ret2, loc_data = ex.extract("816")  # 4 x 19248
        ret3, mask_data = ex.extract("818")  # maskdim 32 x 19248
        ret4, conf_data = ex.extract("820")  # 81 x 19248

        proto_data = np.array(proto_data)
        loc_data = np.array(loc_data)
        mask_data = np.array(mask_data)
        conf_data = np.array(conf_data)
        prior_data = self.make_priors()

        boxes, masks, classes, scores = self.detect(
            conf_data, loc_data, prior_data, mask_data, img_w, img_h
        )

        # Nothing survived thresholding/NMS: there are no masks to assemble,
        # and resizing a zero-channel image is not meaningful.
        if len(scores) == 0:
            return boxes, np.zeros((0, img_h, img_w), dtype=bool), classes, scores

        # generate mask: linear combination of prototypes per detection
        masks = proto_data.transpose(1, 2, 0) @ masks.T
        masks = sigmoid(masks)

        # Scale masks up to the full image
        masks = cv2.resize(masks, (img_w, img_h), interpolation=cv2.INTER_LINEAR)

        # cv2.resize drops the channel axis when there is a single channel
        # (exactly one detection); restore it so the transpose below works.
        if masks.ndim == 2:
            masks = masks[:, :, np.newaxis]

        # transpose into the correct output shape [num_dets, img_h, img_w]
        masks = masks.transpose(2, 0, 1)

        masks = masks > 0.5

        return boxes, masks, classes, scores

    def make_priors(self):
        """Return the prior boxes, building and caching them when needed.

        Priors are ``[cx, cy, width, height]`` rows where ``(cx, cy)`` is the
        box center, expressed relative to the input size. The result is
        cached and only rebuilt when ``target_size`` differs from the value
        the cache was built for.

        Returns:
            numpy array of shape ``[num_priors, 4]``.
        """
        current_size = (self.target_size, self.target_size)
        if self.last_img_size != current_size:
            prior_data = []

            for conv_w, conv_h, scale in zip(self.conv_ws, self.conv_hs, self.scales):
                for i in range(conv_h):
                    for j in range(conv_w):
                        # +0.5 because priors are in center-size notation
                        cx = (j + 0.5) / conv_w
                        cy = (i + 0.5) / conv_h

                        for ar in self.aspect_ratios:
                            w = scale * sqrt(ar) / self.target_size

                            # Kept for backward compatibility with a bug in
                            # the original implementation that made every
                            # prior square: height is set equal to width.
                            h = w

                            prior_data += [cx, cy, w, h]

            self.priors = np.array(prior_data).reshape(-1, 4)
            self.last_img_size = current_size

        return self.priors

    def decode(self, loc, priors, img_w, img_h):
        """Decode predicted box offsets into pixel boxes.

        The center/size decoding is::

            center = prior_center + loc_xy * 0.1 * prior_size
            size   = prior_size * exp(loc_wh * 0.2)

        The resulting box is clipped to the unit square (i.e. to the image)
        and then scaled to pixels.

        Args:
            loc: predicted offsets of shape ``[num_priors, 4]``.
            priors: prior boxes ``[cx, cy, w, h]`` of shape
                ``[num_priors, 4]``, relative to the input size.
            img_w: width of the original image in pixels.
            img_h: height of the original image in pixels.

        Returns:
            numpy array of shape ``[num_priors, 4]`` holding
            ``(x, y, w, h)`` in pixels, where ``(x, y)`` is the top-left
            corner and ``w``/``h`` are the relative extent scaled to pixels
            plus 1.
        """
        variances = [0.1, 0.2]

        centers = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
        sizes = priors[:, 2:] * np.exp(loc[:, 2:] * variances[1])

        # Crop in corner form so that clamping one edge does not move the
        # opposite one, then go back to (top-left, size).
        top_left = np.clip(centers - sizes / 2, 0.0, 1.0)
        bottom_right = np.clip(centers + sizes / 2, 0.0, 1.0)
        boxes = np.concatenate((top_left, bottom_right - top_left), 1)

        # decode to img size
        boxes[:, 0] *= img_w
        boxes[:, 1] *= img_h
        boxes[:, 2] = boxes[:, 2] * img_w + 1
        boxes[:, 3] = boxes[:, 3] * img_h + 1

        return boxes

    def detect(self, conf_preds, loc_data, prior_data, mask_data, img_w, img_h):
        """Select detections: threshold, decode, per-class NMS, global top-k.

        Only the best-scoring non-background class of each prior is
        considered (column 0 of ``conf_preds`` is the background class).

        Args:
            conf_preds: class scores, ``[num_priors, num_classes + 1]``.
            loc_data: box offsets, ``[num_priors, 4]``.
            prior_data: priors, ``[num_priors, 4]``.
            mask_data: mask coefficients, ``[num_priors, mask_dim]``.
            img_w: width of the original image in pixels.
            img_h: height of the original image in pixels.

        Returns:
            tuple ``(boxes, masks, classes, scores)`` of numpy arrays with
            shapes ``[n, 4]``, ``[n, mask_dim]``, ``[n]`` and ``[n]``;
            ``classes`` are offset by +1 so they index ``class_names``.
            At most ``keep_top_k`` detections are returned; when truncation
            happens the highest-scoring ones are kept, ordered by descending
            score.
        """
        cur_scores = conf_preds[:, 1:]
        num_class = cur_scores.shape[1]

        classes = np.argmax(cur_scores, axis=1)
        conf_scores = cur_scores[np.arange(cur_scores.shape[0]), classes]

        # filter by confidence_threshold
        keep = conf_scores > self.confidence_threshold
        conf_scores = conf_scores[keep]
        classes = classes[keep]
        loc_data = loc_data[keep, :]
        prior_data = prior_data[keep, :]
        masks = mask_data[keep, :]

        # decode x, y, w, h
        boxes = self.decode(loc_data, prior_data, img_w, img_h)

        # nms for every class
        boxes_result = []
        masks_result = []
        classes_result = []
        conf_scores_result = []
        for i in range(num_class):
            # np.where returns a 1-tuple; test the index array itself
            where = np.where(classes == i)[0]
            if where.size == 0:
                continue

            boxes_tmp = boxes[where]
            masks_tmp = masks[where]
            classes_tmp = classes[where]
            conf_scores_tmp = conf_scores[where]

            indexes = nms(
                boxes_tmp,
                conf_scores_tmp,
                iou_threshold=self.nms_threshold,
                top_k=self.keep_top_k,
            )

            for index in indexes:
                boxes_result.append(boxes_tmp[index])
                masks_result.append(masks_tmp[index])
                # +1 restores the offset removed by dropping the background column
                classes_result.append(classes_tmp[index] + 1)
                conf_scores_result.append(conf_scores_tmp[index])

        if not conf_scores_result:
            # keep the documented ranks even when nothing was detected
            return (
                np.zeros((0, 4), dtype=boxes.dtype),
                np.zeros((0, masks.shape[1]), dtype=masks.dtype),
                np.zeros((0,), dtype=classes.dtype),
                np.zeros((0,), dtype=conf_scores.dtype),
            )

        boxes_result = np.array(boxes_result)
        masks_result = np.array(masks_result)
        classes_result = np.array(classes_result)
        conf_scores_result = np.array(conf_scores_result)

        # keep top k (highest scores first)
        if len(conf_scores_result) > self.keep_top_k:
            indexes = np.argsort(conf_scores_result)[::-1][: self.keep_top_k]

            boxes_result = boxes_result[indexes]
            masks_result = masks_result[indexes]
            classes_result = classes_result[indexes]
            conf_scores_result = conf_scores_result[indexes]

        return (
            boxes_result,
            masks_result,
            classes_result,
            conf_scores_result,
        )


================================================
FILE: python/ncnn/model_zoo/yolov2.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import ncnn
from .model_store import get_model_file
from ..utils.objects import Detect_Object


class MobileNet_YoloV2:
    """YOLOv2 object detector wrapping the ``mobilenet_yolo`` ncnn model.

    Calling an instance with an image returns one ``Detect_Object`` per row
    of the network's ``detection_out`` blob. ``class_names`` lists the label
    names, with ``"background"`` at index 0.
    """

    def __init__(self, target_size=416, num_threads=1, use_gpu=False):
        """Build the ncnn network and load the model files.

        Args:
            target_size: side length the input image is resized to (square).
            num_threads: value assigned to ``net.opt.num_threads``.
            use_gpu: value assigned to ``net.opt.use_vulkan_compute``.
        """
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        self.mean_vals = [1.0] * 3
        self.norm_vals = [0.007843] * 3

        net = ncnn.Net()
        self.net = net
        net.opt.num_threads = self.num_threads
        net.opt.use_vulkan_compute = self.use_gpu

        # original pretrained model from https://github.com/eric612/MobileNet-YOLO
        # https://github.com/eric612/MobileNet-YOLO/blob/master/models/yolov2/mobilenet_yolo_deploy.prototxt
        # https://github.com/eric612/MobileNet-YOLO/blob/master/models/yolov2/mobilenet_yolo_deploy_iter_80000.caffemodel
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        net.load_param(get_model_file("mobilenet_yolo.param"))
        net.load_model(get_model_file("mobilenet_yolo.bin"))

        self.class_names = [
            "background", "aeroplane", "bicycle", "bird", "boat",
            "bottle", "bus", "car", "cat", "chair",
            "cow", "diningtable", "dog", "horse", "motorbike",
            "person", "pottedplant", "sheep", "sofa", "train",
            "tvmonitor",
        ]

    def __del__(self):
        # release the reference to the network
        self.net = None

    def __call__(self, img):
        """Detect objects in one image.

        Args:
            img: image array indexed as ``(rows, cols, ...)``; handed to ncnn
                as ``PIXEL_BGR`` data.

        Returns:
            list of ``Detect_Object``. Each output row is read as
            ``[label, prob, c2, c3, c4, c5]``: ``c2``/``c4`` are multiplied by
            the image width and ``c3``/``c5`` by the image height
            (presumably normalized corner coordinates - confirm against the
            model), giving ``rect.x``/``rect.y`` and, after subtracting
            those, ``rect.w``/``rect.h``.
        """
        frame_h = img.shape[0]
        frame_w = img.shape[1]

        blob = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR,
            frame_w,
            frame_h,
            self.target_size,
            self.target_size,
        )
        # Two passes, in this order: the first call receives only norm_vals,
        # the second only mean_vals. Assuming an empty list disables that half
        # of the operation, the result is x * norm - mean rather than
        # (x - mean) * norm - TODO confirm against the ncnn API.
        blob.substract_mean_normalize([], self.norm_vals)
        blob.substract_mean_normalize(self.mean_vals, [])

        extractor = self.net.create_extractor()
        extractor.input("data", blob)
        _status, detections = extractor.extract("detection_out")

        # The rows are read through ncnn.Mat.row (the original author notes
        # this involves no memory copy). An equivalent alternative mentioned
        # by the original author is to convert the blob with
        # numpy.array(mat_out) and index the resulting rows the same way;
        # numpy is not imported in this module.
        objects = []
        for row_idx in range(detections.h):
            fields = detections.row(row_idx)

            det = Detect_Object()
            det.label = fields[0]
            det.prob = fields[1]
            det.rect.x = fields[2] * frame_w
            det.rect.y = fields[3] * frame_h
            # width/height are derived from the stored top-left corner
            det.rect.w = fields[4] * frame_w - det.rect.x
            det.rect.h = fields[5] * frame_h - det.rect.y
            objects.append(det)

        return objects


================================================
FILE: python/ncnn/model_zoo/yolov3.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import ncnn
from .model_store import get_model_file
from ..utils.objects import Detect_Object


class MobileNetV2_YoloV3:
    """MobileNetV2-YOLOv3 detector wrapped around an ``ncnn.Net``.

    Calling an instance with an image returns a list of ``Detect_Object``.
    ``class_names`` holds 21 names with "background" at index 0.
    """

    def __init__(self, target_size=352, num_threads=1, use_gpu=False):
        """Create the network and load its param/bin files.

        Args:
            target_size: side length of the square the input is resized to.
            num_threads: copied to ``net.opt.num_threads``.
            use_gpu: copied to ``net.opt.use_vulkan_compute``.
        """
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        self.mean_vals = [127.5, 127.5, 127.5]
        self.norm_vals = [0.007843, 0.007843, 0.007843]

        self.net = ncnn.Net()
        self.net.opt.num_threads = self.num_threads
        self.net.opt.use_vulkan_compute = self.use_gpu

        # original pretrained model from https://github.com/eric612/MobileNet-YOLO
        # param : https://drive.google.com/open?id=1V9oKHP6G6XvXZqhZbzNKL6FI_clRWdC-
        # bin : https://drive.google.com/open?id=1DBcuFCr-856z3FRQznWL_S5h-Aj3RawA
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("mobilenetv2_yolov3.param"))
        self.net.load_model(get_model_file("mobilenetv2_yolov3.bin"))

        self.class_names = [
            "background",
            "aeroplane",
            "bicycle",
            "bird",
            "boat",
            "bottle",
            "bus",
            "car",
            "cat",
            "chair",
            "cow",
            "diningtable",
            "dog",
            "horse",
            "motorbike",
            "person",
            "pottedplant",
            "sheep",
            "sofa",
            "train",
            "tvmonitor",
        ]

    def __del__(self):
        # drop the reference to the network when the wrapper is collected
        self.net = None

    def __call__(self, img):
        """Detect objects in ``img`` and return them as a list of Detect_Object.

        ``img`` is read as ``shape[0]`` = height, ``shape[1]`` = width and is
        passed to ncnn as BGR pixels.  For every output row, column 0 becomes
        ``label``, column 1 ``prob`` and columns 2..5 are multiplied by the
        source width/height to fill ``rect`` (x, y, w, h).
        """
        img_h = img.shape[0]
        img_w = img.shape[1]

        mat_in = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR,
            img_w,
            img_h,
            self.target_size,
            self.target_size,
        )
        mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)

        ex = self.net.create_extractor()
        ex.input("data", mat_in)

        ret, mat_out = ex.extract("detection_out")

        objects = []

        # Read the result row by row through ncnn.Mat.row.
        for i in range(mat_out.h):
            values = mat_out.row(i)

            obj = Detect_Object()
            obj.label = values[0]
            obj.prob = values[1]
            # x / y first: w / h are computed relative to them
            obj.rect.x = values[2] * img_w
            obj.rect.y = values[3] * img_h
            obj.rect.w = values[4] * img_w - obj.rect.x
            obj.rect.h = values[5] * img_h - obj.rect.y

            objects.append(obj)

        # Alternative: convert the blob once with numpy and iterate over its
        # rows (needs ``import numpy as np``, which this module does not have):
        #
        #     out = np.array(mat_out)
        #     for values in out:
        #         obj = Detect_Object()
        #         obj.label = values[0]
        #         obj.prob = values[1]
        #         obj.rect.x = values[2] * img_w
        #         obj.rect.y = values[3] * img_h
        #         obj.rect.w = values[4] * img_w - obj.rect.x
        #         obj.rect.h = values[5] * img_h - obj.rect.y
        #         objects.append(obj)

        return objects


================================================
FILE: python/ncnn/model_zoo/yolov4.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import ncnn
from .model_store import get_model_file
from ..utils.objects import Detect_Object


class YoloV4_Base:
    """Shared implementation of the YOLOv4 / YOLOv4-tiny detectors.

    Calling an instance with an image returns a list of ``Detect_Object``.
    ``class_names`` holds 81 names with "background" at index 0.
    """

    def __init__(self, tiny, target_size, num_threads=1, use_gpu=False):
        """Create the network and load the selected model files.

        Args:
            tiny: truthy to load the "yolov4-tiny-opt" files, otherwise the
                "yolov4-opt" files are loaded.
            target_size: side length of the square the input is resized to.
            num_threads: copied to ``net.opt.num_threads``.
            use_gpu: copied to ``net.opt.use_vulkan_compute``.
        """
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        self.mean_vals = []
        self.norm_vals = [1 / 255.0, 1 / 255.0, 1 / 255.0]

        self.net = ncnn.Net()
        self.net.opt.use_vulkan_compute = self.use_gpu
        self.net.opt.num_threads = self.num_threads

        # original pretrained model from https://github.com/AlexeyAB/darknet
        # the ncnn model https://drive.google.com/drive/folders/1YzILvh0SKQPS_lrb33dmGNq7aVTKPWS0?usp=sharing
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        model_stem = "yolov4-tiny-opt" if tiny else "yolov4-opt"
        self.net.load_param(get_model_file(model_stem + ".param"))
        self.net.load_model(get_model_file(model_stem + ".bin"))

        self.class_names = [
            "background",
            "person",
            "bicycle",
            "car",
            "motorbike",
            "aeroplane",
            "bus",
            "train",
            "truck",
            "boat",
            "traffic light",
            "fire hydrant",
            "stop sign",
            "parking meter",
            "bench",
            "bird",
            "cat",
            "dog",
            "horse",
            "sheep",
            "cow",
            "elephant",
            "bear",
            "zebra",
            "giraffe",
            "backpack",
            "umbrella",
            "handbag",
            "tie",
            "suitcase",
            "frisbee",
            "skis",
            "snowboard",
            "sports ball",
            "kite",
            "baseball bat",
            "baseball glove",
            "skateboard",
            "surfboard",
            "tennis racket",
            "bottle",
            "wine glass",
            "cup",
            "fork",
            "knife",
            "spoon",
            "bowl",
            "banana",
            "apple",
            "sandwich",
            "orange",
            "broccoli",
            "carrot",
            "hot dog",
            "pizza",
            "donut",
            "cake",
            "chair",
            "sofa",
            "pottedplant",
            "bed",
            "diningtable",
            "toilet",
            "tvmonitor",
            "laptop",
            "mouse",
            "remote",
            "keyboard",
            "cell phone",
            "microwave",
            "oven",
            "toaster",
            "sink",
            "refrigerator",
            "book",
            "clock",
            "vase",
            "scissors",
            "teddy bear",
            "hair drier",
            "toothbrush",
        ]

    def __del__(self):
        # drop the reference to the network when the wrapper is collected
        self.net = None

    def __call__(self, img):
        """Detect objects in ``img`` and return them as a list of Detect_Object.

        ``img`` is read as ``shape[0]`` = height, ``shape[1]`` = width and is
        converted BGR -> RGB while being resized.  For every output row,
        column 0 becomes ``label``, column 1 ``prob`` and columns 2..5 are
        multiplied by the source width/height to fill ``rect`` (x, y, w, h).
        """
        img_h = img.shape[0]
        img_w = img.shape[1]

        mat_in = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR2RGB,
            img_w,
            img_h,
            self.target_size,
            self.target_size,
        )
        mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)

        ex = self.net.create_extractor()
        ex.input("data", mat_in)

        ret, mat_out = ex.extract("output")

        objects = []

        # Read the result row by row through ncnn.Mat.row.
        for i in range(mat_out.h):
            values = mat_out.row(i)

            obj = Detect_Object()
            obj.label = values[0]
            obj.prob = values[1]
            # x / y first: w / h are computed relative to them
            obj.rect.x = values[2] * img_w
            obj.rect.y = values[3] * img_h
            obj.rect.w = values[4] * img_w - obj.rect.x
            obj.rect.h = values[5] * img_h - obj.rect.y

            objects.append(obj)

        # Alternative: convert the blob once with numpy and iterate over its
        # rows (needs ``import numpy as np``, which this module does not have):
        #
        #     out = np.array(mat_out)
        #     for values in out:
        #         obj = Detect_Object()
        #         obj.label = values[0]
        #         obj.prob = values[1]
        #         obj.rect.x = values[2] * img_w
        #         obj.rect.y = values[3] * img_h
        #         obj.rect.w = values[4] * img_w - obj.rect.x
        #         obj.rect.h = values[5] * img_h - obj.rect.y
        #         objects.append(obj)

        return objects


class YoloV4_Tiny(YoloV4_Base):
    """YoloV4_Base preset: tiny model files, 416 target size."""

    def __init__(self, **kwargs):
        # remaining keyword arguments (num_threads, use_gpu) pass straight through
        super().__init__(True, 416, **kwargs)


class YoloV4(YoloV4_Base):
    """YoloV4_Base preset: full-size model files, 608 target size."""

    def __init__(self, **kwargs):
        # remaining keyword arguments (num_threads, use_gpu) pass straight through
        super().__init__(False, 608, **kwargs)


================================================
FILE: python/ncnn/model_zoo/yolov5.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import time
import numpy as np
import ncnn
from .model_store import get_model_file
from ..utils.objects import Detect_Object
from ..utils.functional import *


class YoloV5Focus(ncnn.Layer):
    """Custom ncnn layer that stacks four stride-2 sub-samplings of its input.

    The input is sliced on its last two axes at offsets (0,0), (1,0), (0,1)
    and (1,1) with step 2, and the four slices are concatenated along the
    first axis.
    """

    # Every constructed layer is appended here; YoloV5Focus_layer_destroyer
    # removes it again.
    yolov5FocusLayers = []

    def __init__(self):
        ncnn.Layer.__init__(self)
        self.one_blob_only = True

        self.yolov5FocusLayers.append(self)

    def forward(self, bottom_blob, top_blob, opt):
        """Write the stacked slices into ``top_blob``.

        Returns 0 on success and -100 when ``top_blob`` is empty after the
        copy.
        """
        source = np.array(bottom_blob)
        # (start on second-to-last axis, start on last axis), in stacking order
        starts = ((0, 0), (1, 0), (0, 1), (1, 1))
        slices = [source[..., a::2, b::2] for a, b in starts]
        stacked = np.concatenate(slices)

        top_blob.clone_from(ncnn.Mat(stacked), opt.blob_allocator)
        return -100 if top_blob.empty() else 0


def YoloV5Focus_layer_creator():
    """Factory passed to ``register_custom_layer``: builds a new YoloV5Focus."""
    layer = YoloV5Focus()
    return layer


def YoloV5Focus_layer_destroyer(layer):
    """Remove the first entry equal to ``layer`` from the YoloV5Focus registry.

    Does nothing when no entry compares equal.
    """
    registry = YoloV5Focus.yolov5FocusLayers
    for position, kept in enumerate(registry):
        if kept == layer:
            del registry[position]
            return


class YoloV5s:
    """YOLOv5s detector wrapped around an ``ncnn.Net``.

    Calling an instance with an image returns a list of ``Detect_Object``
    (empty when nothing passes the thresholds).
    """

    def __init__(
        self,
        target_size=640,
        prob_threshold=0.25,
        nms_threshold=0.45,
        num_threads=1,
        use_gpu=False,
    ):
        """Create the network, register the Focus layer and load the model.

        Args:
            target_size: length the longer image side is resized to.
            prob_threshold: confidence threshold passed to NMS.
            nms_threshold: IoU threshold passed to NMS.
            num_threads: copied to ``net.opt.num_threads``.
            use_gpu: copied to ``net.opt.use_vulkan_compute``.
        """
        self.target_size = target_size
        self.prob_threshold = prob_threshold
        self.nms_threshold = nms_threshold
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        self.mean_vals = []
        self.norm_vals = [1 / 255.0, 1 / 255.0, 1 / 255.0]

        self.net = ncnn.Net()
        self.net.opt.use_vulkan_compute = self.use_gpu
        self.net.opt.num_threads = self.num_threads

        self.net.register_custom_layer(
            "YoloV5Focus", YoloV5Focus_layer_creator, YoloV5Focus_layer_destroyer
        )

        # original pretrained model from https://github.com/ultralytics/yolov5
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("yolov5s.param"))
        self.net.load_model(get_model_file("yolov5s.bin"))

        # one grid / stride / anchor row per output, ordered stride 32, 16, 8;
        # grids are rebuilt in __call__ when their size does not match
        self.grid = [make_grid(10, 6), make_grid(20, 12), make_grid(40, 24)]
        self.stride = np.array([32, 16, 8])
        self.anchor_grid = np.array(
            [
                [116, 90, 156, 198, 373, 326],
                [30, 61, 62, 45, 59, 119],
                [10, 13, 16, 30, 33, 23],
            ]
        ).reshape((3, 1, 3, 1, 1, 2))

        self.class_names = [
            "person",
            "bicycle",
            "car",
            "motorcycle",
            "airplane",
            "bus",
            "train",
            "truck",
            "boat",
            "traffic light",
            "fire hydrant",
            "stop sign",
            "parking meter",
            "bench",
            "bird",
            "cat",
            "dog",
            "horse",
            "sheep",
            "cow",
            "elephant",
            "bear",
            "zebra",
            "giraffe",
            "backpack",
            "umbrella",
            "handbag",
            "tie",
            "suitcase",
            "frisbee",
            "skis",
            "snowboard",
            "sports ball",
            "kite",
            "baseball bat",
            "baseball glove",
            "skateboard",
            "surfboard",
            "tennis racket",
            "bottle",
            "wine glass",
            "cup",
            "fork",
            "knife",
            "spoon",
            "bowl",
            "banana",
            "apple",
            "sandwich",
            "orange",
            "broccoli",
            "carrot",
            "hot dog",
            "pizza",
            "donut",
            "cake",
            "chair",
            "couch",
            "potted plant",
            "bed",
            "dining table",
            "toilet",
            "tv",
            "laptop",
            "mouse",
            "remote",
            "keyboard",
            "cell phone",
            "microwave",
            "oven",
            "toaster",
            "sink",
            "refrigerator",
            "book",
            "clock",
            "vase",
            "scissors",
            "teddy bear",
            "hair drier",
            "toothbrush",
        ]

    def __del__(self):
        # drop the reference to the network when the wrapper is collected
        self.net = None

    def __call__(self, img):
        """Detect objects in ``img`` and return them as a list of Detect_Object.

        The image (``shape[0]`` = height, ``shape[1]`` = width, BGR pixels) is
        resized so its longer side equals ``target_size``, padded up to a
        multiple of 32 on both axes, run through the network, decoded and
        filtered with ``non_max_suppression``.  Boxes are mapped back to the
        source image by removing the padding offset and dividing by the
        resize scale.  Returns an empty list when no detection survives.
        """
        img_w = img.shape[1]
        img_h = img.shape[0]

        w = img_w
        h = img_h
        if w > h:
            scale = float(self.target_size) / w
            w = self.target_size
            h = int(h * scale)
        else:
            scale = float(self.target_size) / h
            h = self.target_size
            w = int(w * scale)

        mat_in = ncnn.Mat.from_pixels_resize(
            img, ncnn.Mat.PixelType.PIXEL_BGR2RGB, img_w, img_h, w, h
        )
        # pad to target_size rectangle
        # yolov5/utils/datasets.py letterbox
        wpad = (w + 31) // 32 * 32 - w
        hpad = (h + 31) // 32 * 32 - h
        # the border actually added on the top / left side; the same integer
        # offsets are removed again when mapping boxes back
        pad_top = hpad // 2
        pad_left = wpad // 2
        mat_in_pad = ncnn.copy_make_border(
            mat_in,
            pad_top,
            hpad - pad_top,
            pad_left,
            wpad - pad_left,
            ncnn.BorderType.BORDER_CONSTANT,
            114.0,
        )

        mat_in_pad.substract_mean_normalize(self.mean_vals, self.norm_vals)

        ex = self.net.create_extractor()
        ex.input("images", mat_in_pad)

        # anchor setting from yolov5/models/yolov5s.yaml
        ret1, mat_out1 = ex.extract("output")  # stride 8
        ret2, mat_out2 = ex.extract("781")  # stride 16
        ret3, mat_out3 = ex.extract("801")  # stride 32

        # same order as self.stride / self.anchor_grid: 32, 16, 8
        pred = [np.array(mat_out3), np.array(mat_out2), np.array(mat_out1)]
        z = []
        for i in range(len(pred)):
            num_grid = pred[i].shape[1]
            if mat_in_pad.w > mat_in_pad.h:
                num_grid_x = mat_in_pad.w // self.stride[i]
                num_grid_y = num_grid // num_grid_x
            else:
                num_grid_y = mat_in_pad.h // self.stride[i]
                num_grid_x = num_grid // num_grid_y
            if (
                self.grid[i].shape[0] != num_grid_x
                or self.grid[i].shape[1] != num_grid_y
            ):
                self.grid[i] = make_grid(num_grid_x, num_grid_y)

            y = sigmoid(pred[i])
            y = y.reshape(pred[i].shape[0], num_grid_y, num_grid_x, pred[i].shape[2])
            y[..., 0:2] = (y[..., 0:2] * 2.0 - 0.5 + self.grid[i]) * self.stride[
                i
            ]  # xy
            y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
            z.append(y.reshape(1, -1, y.shape[-1]))
        pred = np.concatenate(z, 1)

        result = self.non_max_suppression(
            pred, self.prob_threshold, self.nms_threshold
        )[0]

        # non_max_suppression leaves None for an image without detections
        if result is None:
            return []

        objects = [
            Detect_Object(
                obj[5],
                obj[4],
                (obj[0] - pad_left) / scale,
                (obj[1] - pad_top) / scale,
                (obj[2] - obj[0]) / scale,
                (obj[3] - obj[1]) / scale,
            )
            for obj in result
        ]

        return objects

    @staticmethod
    def _pairwise_iou(boxes1, boxes2):
        """Return the (len(boxes1), len(boxes2)) IoU matrix of xyxy boxes."""
        area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
        area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
        top_left = np.maximum(boxes1[:, None, :2], boxes2[None, :, :2])
        bottom_right = np.minimum(boxes1[:, None, 2:4], boxes2[None, :, 2:4])
        inter = np.clip(bottom_right - top_left, 0, None).prod(2)
        return inter / (area1[:, None] + area2[None, :] - inter)

    def non_max_suppression(
        self,
        prediction,
        conf_thres=0.1,
        iou_thres=0.6,
        merge=False,
        classes=None,
        agnostic=False,
    ):
        """Performs Non-Maximum Suppression (NMS) on inference results

        Args:
            prediction: array indexed [image, box, (cx, cy, w, h, obj_conf,
                class scores...)].
            conf_thres: minimum objectness and minimum final confidence.
            iou_thres: IoU threshold handed to ``nms`` (and used for merging).
            merge: when True, replace each kept box by the score-weighted mean
                of all boxes overlapping it by more than ``iou_thres``.
            classes: optional collection of class ids to keep.
            agnostic: when True, boxes of different classes suppress each
                other.

        Returns:
            a list with one entry per image: detections with shape nx6
            (x1, y1, x2, y2, conf, cls), or None when the image has none.
        """
        nc = prediction[0].shape[1] - 5  # number of classes
        xc = prediction[..., 4] > conf_thres  # candidates

        # Settings
        max_wh = 4096  # (pixels) per-class offset that keeps classes apart in NMS
        max_det = 300  # maximum number of detections per image
        time_limit = 10.0  # seconds to quit after
        redundant = True  # require redundant detections
        multi_label = nc > 1  # multiple labels per box (adds 0.5ms/img)

        t = time.time()
        output = [None] * prediction.shape[0]
        for xi, x in enumerate(prediction):  # image index, image inference
            x = x[xc[xi]]  # confidence (boolean mask -> copy, input untouched)

            # If none remain process next image
            if not x.shape[0]:
                continue

            # Compute conf
            x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

            # Box (center x, center y, width, height) to (x1, y1, x2, y2)
            box = xywh2xyxy(x[:, :4])

            # Detections matrix nx6 (xyxy, conf, cls)
            if multi_label:
                i, j = (x[:, 5:] > conf_thres).nonzero()
                x = np.concatenate(
                    (box[i], x[i, j + 5, None], j[:, None].astype(np.float32)), axis=1
                )
            else:  # best class only
                cls_conf = x[:, 5:]
                conf = cls_conf.max(1, keepdims=True)
                j = cls_conf.argmax(1)[:, None]
                x = np.concatenate((box, conf, j.astype(np.float32)), axis=1)[
                    conf.reshape(-1) > conf_thres
                ]

            # Filter by class
            if classes:
                x = x[(x[:, 5:6] == np.array(classes)).any(1)]

            # If none remain process next image
            n = x.shape[0]  # number of boxes
            if not n:
                continue

            # Batched NMS
            c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
            boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
            i = nms(boxes, scores, iou_threshold=iou_thres)
            if len(i) > max_det:  # limit detections
                i = i[:max_det]
            if merge and (1 < n < 3e3):  # Merge NMS (boxes merged using weighted mean)
                # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
                i = np.asarray(i)
                iou = self._pairwise_iou(boxes[i], boxes) > iou_thres  # iou matrix
                weights = iou * scores[None]  # box weights
                x[i, :4] = np.matmul(weights, x[:, :4]) / weights.sum(
                    1, keepdims=True
                )  # merged boxes
                if redundant:
                    i = i[iou.sum(1) > 1]  # require redundancy

            output[xi] = x[i]
            if (time.time() - t) > time_limit:
                break  # time limit exceeded

        return output


================================================
FILE: python/ncnn/model_zoo/yolov7.py
================================================
# Copyright 2020 Tencent
# Copyright 2023 Kenny Bradley
# SPDX-License-Identifier: BSD-3-Clause

# Ported yolov7-tiny to python based on:
#   - https://github.com/Qengineering/YoloV7-ncnn-Raspberry-Pi-4/blob/main/yolo.cpp
# Format based on the ncnn yolov4 implementation

import ncnn
from .model_store import get_model_file
from ..utils.objects import Detect_Object
import numpy as np


#def sigmoid_binned(val)
#   this could use a much faster binned lookup table instead of np.exp and floating division

def sigmoid(val):
    """Logistic function 1 / (1 + e^-val), evaluated with numpy."""
    denominator = np.exp(-val) + 1.0
    return 1.0 / denominator

#IOU functions:
#find the overlap width given ([x1,x2], [x3,x4]) or ([y1,y2], [y3,y4])
def calcOverlap(r1, r2):
    """Return the length of the overlap of two 1-D intervals.

    Args:
        r1: ``[start, end]`` of the first interval.
        r2: ``[start, end]`` of the second interval.

    Returns:
        ``min(end) - max(start)`` when that is positive, otherwise 0.  The
        result is never negative, including for disjoint, touching or
        inverted (start > end) intervals.
    """
    return max(0, min(r1[1], r2[1]) - max(r1[0], r2[0]))

#find X and Y overlaps and return intersection area
def calcIntersection(r1 : Detect_Object, r2 : Detect_Object):
    """Return the intersection area of the ``rect`` of two detections.

    The area is the product of the overlaps computed by ``calcOverlap`` on
    the x extents ``[x, x + w]`` and the y extents ``[y, y + h]``.
    """
    horizontal = calcOverlap(
        [r1.rect.x, r1.rect.x + r1.rect.w],
        [r2.rect.x, r2.rect.x + r2.rect.w],
    )
    vertical = calcOverlap(
        [r1.rect.y, r1.rect.y + r1.rect.h],
        [r2.rect.y, r2.rect.y + r2.rect.h],
    )
    return horizontal * vertical


#return the IOU of two Detect_Objects, computed from each object's rect (x, y, w, h)
def IOU(r1 : Detect_Object, r2 : Detect_Object):
    """Return intersection-over-union of the ``rect`` of two detections.

    Returns 0 when the union area is 0.
    """
    shared_area = calcIntersection(r1, r2)
    first_area = r1.rect.w * r1.rect.h
    second_area = r2.rect.w * r2.rect.h
    # the shared area is counted in both rectangles, so remove it once
    total_area = first_area + second_area - shared_area
    return shared_area / total_area if total_area != 0 else 0

#NMS
#detections are pre-sorted in ascending confidence order
#detections are a list of Detect_Objects with : label, prob, rect
def NMS(detections, iou_thresh=0.45):
    """Filter overlapping detections class by class.

    ``detections`` is a list of objects with ``label`` and ``rect``; the
    caller is expected to pass them sorted by ascending confidence.  Within
    each label group a detection is dropped when any detection later in the
    list has an ``IOU`` with it above ``iou_thresh``.  The survivors are
    returned grouped by label, in order of each label's first appearance.
    """
    grouped = {}
    for det in detections:
        # det.label is the class
        grouped.setdefault(det.label, []).append(det)

    survivors = []
    for same_label in grouped.values():
        for position, candidate in enumerate(same_label):
            later = same_label[position + 1:]
            # any() stops at the first later detection that overlaps too much
            suppressed = any(IOU(candidate, other) > iou_thresh for other in later)
            if not suppressed:
                survivors.append(candidate)

    return survivors
    
class YoloV7_Base:
    """YOLOv7-tiny detector wrapped around an ``ncnn.Net``.

    Calling an instance with an image returns a list of ``Detect_Object``
    whose ``label`` is the class name string.
    """

    def __init__(self, target_size, num_threads=1, use_gpu=False, use_strides=[8,16,32]):
        """Create the network and load the model files.

        Args:
            target_size: side length of the square the input is resized to.
            num_threads: copied to ``net.opt.num_threads``.
            use_gpu: copied to ``net.opt.use_vulkan_compute``.
            use_strides: which of the strides 8, 16 and 32 to extract and
                decode in ``__call__``.
        """
        self.target_size = target_size
        self.num_threads = num_threads
        self.use_gpu = use_gpu
        # copy, so that instances never share (or mutate) the default list
        self.use_strides = list(use_strides)

        self.mean_vals = []
        self.norm_vals = [1 / 255.0, 1 / 255.0, 1 / 255.0]

        self.net = ncnn.Net()
        self.net.opt.use_vulkan_compute = self.use_gpu
        self.net.opt.num_threads = self.num_threads

        # original pretrained model from https://github.com/AlexeyAB/darknet
        # the ncnn model https://drive.google.com/drive/folders/1YzILvh0SKQPS_lrb33dmGNq7aVTKPWS0?usp=sharing
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("yolov7-tiny.param"))
        self.net.load_model(get_model_file("yolov7-tiny.bin"))

        self.class_names = [
            "person",
            "bicycle",
            "car",
            "motorbike",
            "aeroplane",
            "bus",
            "train",
            "truck",
            "boat",
            "traffic light",
            "fire hydrant",
            "stop sign",
            "parking meter",
            "bench",
            "bird",
            "cat",
            "dog",
            "horse",
            "sheep",
            "cow",
            "elephant",
            "bear",
            "zebra",
            "giraffe",
            "backpack",
            "umbrella",
            "handbag",
            "tie",
            "suitcase",
            "frisbee",
            "skis",
            "snowboard",
            "sports ball",
            "kite",
            "baseball bat",
            "baseball glove",
            "skateboard",
            "surfboard",
            "tennis racket",
            "bottle",
            "wine glass",
            "cup",
            "fork",
            "knife",
            "spoon",
            "bowl",
            "banana",
            "apple",
            "sandwich",
            "orange",
            "broccoli",
            "carrot",
            "hot dog",
            "pizza",
            "donut",
            "cake",
            "chair",
            "sofa",
            "pottedplant",
            "bed",
            "diningtable",
            "toilet",
            "tvmonitor",
            "laptop",
            "mouse",
            "remote",
            "keyboard",
            "cell phone",
            "microwave",
            "oven",
            "toaster",
            "sink",
            "refrigerator",
            "book",
            "clock",
            "vase",
            "scissors",
            "teddy bear",
            "hair drier",
            "toothbrush"
        ]

    def __del__(self):
        # drop the reference to the network when the wrapper is collected
        self.net = None

    def __call__(self, img):
        """Detect objects in ``img`` and return them as a list of Detect_Object.

        The image (``shape[0]`` = height, ``shape[1]`` = width, BGR pixels) is
        resized to a ``target_size`` square and converted to RGB.  Each
        selected stride output is decoded cell by cell, candidates with a
        confidence above 0.25 are collected, sorted by ascending confidence,
        filtered with ``NMS`` and finally rescaled to the source image size.
        """
        img_h = img.shape[0]
        img_w = img.shape[1]

        mat_in = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR2RGB,
            img_w,
            img_h,
            self.target_size,
            self.target_size,
        )
        mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)

        ex = self.net.create_extractor()
        ex.input("images", mat_in)

        #           P3/8,                  P4/16,                  P5/32
        anchors = [[12,16, 19,36, 40,28], [36,75, 76,55, 72,146], [142,110, 192,243, 459,401]]
        strides = [8,16,32]
        #output blob name for each stride
        blob_names = {8: "output", 16: "288", 32: "302"}

        #one slot per stride, None for strides that were not requested
        outValues = []
        for stride in strides:
            if stride in self.use_strides:
                ret, out = ex.extract(blob_names[stride])
                outValues.append(out)
            else:
                outValues.append(None)

        objects = []
        #pre-sigmoid value of the 0.25 threshold:
        #  sigmoid(-1.098612) = 1 / (1 + e^(1.098612)) = 0.25
        threshNonSigmoid = -1.098612
        for strideCount, mat_out in enumerate(outValues):
            if mat_out is None:
                continue

            stride = strides[strideCount]
            for c in range(3):
                mat = mat_out.channel(c)

                #yolo should always be square, it is expected to be 52x52
                #    but sqrt() guarantees the correct size for side
                side = int(np.sqrt(mat.h))

                anchorW = anchors[strideCount][c*2]
                anchorH = anchors[strideCount][c*2+1]
                index = 0
                for i in range(side):
                    for j in range(side):
                        #fetch the row once; advance now so every exit path moves on
                        row = mat.row(index)
                        index += 1

                        #values 5-84 are class data
                        classData = row[5:]
                        maxLabel = max(classData)

                        #optimization
                        #if either the objectness or max class score resolve to < 0.25 we can skip this
                        #  but the values are pre-sigmoid so compare to threshNonSigmoid.
                        if row[4] < threshNonSigmoid or maxLabel < threshNonSigmoid:
                            continue

                        #values 0-3 are coordinate data
                        locData = row[0:4]
                        #value 4 is the box confidence score
                        box_score = sigmoid(row[4])
                        #get the highest scoring class for this detection to multiply by the box_score
                        label = np.argmax(classData)
                        class_score = sigmoid(row[label+5])

                        conf = box_score * class_score
                        if conf > 0.25:
                            obj = Detect_Object()
                            obj.label = self.class_names[label]
                            obj.prob = conf
                            #convert from raw yolo output to W,H and X,Y
                            obj.rect.w = ((sigmoid(locData[2]) * 2) ** 2) * anchorW
                            obj.rect.h = ((sigmoid(locData[3]) * 2) ** 2) * anchorH
                            obj.rect.x = ((sigmoid(locData[0]) * 2) - 0.5 + j) * stride - (obj.rect.w/2)
                            obj.rect.y = ((sigmoid(locData[1]) * 2) - 0.5 + i) * stride - (obj.rect.h/2)
                            objects.append(obj)

        #sort based on probability in ascending order
        objects.sort(key = lambda x: x.prob)
        filtered_objects = NMS(objects)

        #rescale to input image size
        XscaleAdj = img_w / self.target_size
        YscaleAdj = img_h / self.target_size
        for det in filtered_objects:
            det.rect.x *= XscaleAdj
            det.rect.w *= XscaleAdj
            det.rect.y *= YscaleAdj
            det.rect.h *= YscaleAdj

        return filtered_objects


class YoloV7_Tiny(YoloV7_Base):
    """YoloV7_Base preset with a 416 target size."""

    def __init__(self, **kwargs):
        # remaining keyword arguments (num_threads, use_gpu, use_strides) pass through
        super().__init__(416, **kwargs)


================================================
FILE: python/ncnn/model_zoo/yolov8.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import time
import numpy as np
import ncnn
from .model_store import get_model_file
from ..utils.objects import Detect_Object
from ..utils.functional import *
from typing import Iterable

class YoloV8s:
    def __init__(
        self,
        target_size=640,
        prob_threshold=0.25,
        nms_threshold=0.45,
        num_threads=1,
        use_gpu=False,
    ):
        """Store the detector settings and load the yolov8s network.

        Args:
            target_size: length the longer image side is resized to in __call__.
            prob_threshold: confidence threshold passed to non_max_suppression.
            nms_threshold: IoU threshold passed to non_max_suppression.
            num_threads: copied into the net's ``opt.num_threads``.
            use_gpu: copied into the net's ``opt.use_vulkan_compute``.
        """
        # User-facing settings.
        self.target_size = target_size
        self.prob_threshold = prob_threshold
        self.nms_threshold = nms_threshold
        self.num_threads = num_threads
        self.use_gpu = use_gpu

        # Number of bins per box side in the distribution head; __call__
        # reshapes the box channels to (..., 4, reg_max).
        self.reg_max = 16
        # No mean subtraction, only a 1/255 scale per channel.
        self.mean_vals = []
        inv_255 = 1 / 255.0
        self.norm_vals = [inv_255, inv_255, inv_255]

        # Options are set before the param/bin files are loaded.
        self.net = ncnn.Net()
        self.net.opt.use_vulkan_compute = self.use_gpu
        self.net.opt.num_threads = self.num_threads

        # original pretrained model from https://github.com/ultralytics/ultralytics
        # the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("yolov8s.param"))
        self.net.load_model(get_model_file("yolov8s.bin"))

        # Grids pre-built for a 640x640 input; __call__ rebuilds an entry
        # whenever the padded input yields a different grid size.
        self.grid = [make_grid(side, side) for side in (20, 40, 80)]
        self.stride = np.array([32, 16, 8])

        # Class index -> label; the list length is used as the class count.
        self.class_names = [
            "person", "bicycle", "car", "motorcycle", "airplane",
            "bus", "train", "truck", "boat", "traffic light",
            "fire hydrant", "stop sign", "parking meter", "bench", "bird",
            "cat", "dog", "horse", "sheep", "cow",
            "elephant", "bear", "zebra", "giraffe", "backpack",
            "umbrella", "handbag", "tie", "suitcase", "frisbee",
            "skis", "snowboard", "sports ball", "kite", "baseball bat",
            "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
            "wine glass", "cup", "fork", "knife", "spoon",
            "bowl", "banana", "apple", "sandwich", "orange",
            "broccoli", "carrot", "hot dog", "pizza", "donut",
            "cake", "chair", "couch", "potted plant", "bed",
            "dining table", "toilet", "tv", "laptop", "mouse",
            "remote", "keyboard", "cell phone", "microwave", "oven",
            "toaster", "sink", "refrigerator", "book", "clock",
            "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
        ]

    def __del__(self):
        self.net = None

    def __call__(self, img):
        """Detect objects in one image.

        Args:
            img: image array indexed as (height, width, ...). It is handed to
                ncnn as BGR pixels and converted to RGB.

        Returns:
            list of Detect_Object. Each rect is (x, y, w, h) mapped back to
            the coordinate system of ``img``. The list is empty when no box
            survives non-max suppression.
        """
        img_w = img.shape[1]
        img_h = img.shape[0]

        # Resize so the longer side becomes target_size, keeping aspect ratio.
        if img_w > img_h:
            scale = float(self.target_size) / img_w
            w = self.target_size
            h = int(img_h * scale)
        else:
            scale = float(self.target_size) / img_h
            h = self.target_size
            w = int(img_w * scale)

        mat_in = ncnn.Mat.from_pixels_resize(
            img, ncnn.Mat.PixelType.PIXEL_BGR2RGB, img_w, img_h, w, h
        )

        # pad to target_size rectangle
        # yolov5/utils/datasets.py letterbox
        # Each dimension is padded up to the next multiple of 32.
        wpad = (w + 31) // 32 * 32 - w
        hpad = (h + 31) // 32 * 32 - h
        # The integer top/left pads are kept so the boxes can be shifted back
        # by exactly the amount that was added (the previous code subtracted
        # wpad / 2, which is half a pixel off whenever the pad is odd).
        pad_top = hpad // 2
        pad_left = wpad // 2
        mat_in_pad = ncnn.copy_make_border(
            mat_in,
            pad_top,
            hpad - pad_top,
            pad_left,
            wpad - pad_left,
            ncnn.BorderType.BORDER_CONSTANT,
            114.0,
        )

        mat_in_pad.substract_mean_normalize(self.mean_vals, self.norm_vals)

        ex = self.net.create_extractor()
        ex.input("in0", mat_in_pad)

        ret1, mat_out1 = ex.extract("out0")  # stride 8
        ret2, mat_out2 = ex.extract("out1")  # stride 16
        ret3, mat_out3 = ex.extract("out2")  # stride 32

        # Ordered to match self.stride / self.grid (32, 16, 8).
        pred = [np.array(mat_out3), np.array(mat_out2), np.array(mat_out1)]
        num_classes = len(self.class_names)
        z = []
        for i in range(len(pred)):
            num_grid_x = mat_in_pad.w // self.stride[i]
            num_grid_y = mat_in_pad.h // self.stride[i]
            # Rebuild the cached grid when the padded input size changed.
            if (
                self.grid[i].shape[1] != num_grid_y
                or self.grid[i].shape[2] != num_grid_x
            ):
                self.grid[i] = make_grid(num_grid_x, num_grid_y)

            # Move channels last, then split into class scores and box bins.
            cls, box = np.split(pred[i].transpose((1, 2, 0)), [num_classes], -1)
            # Softmax over the reg_max bins of every box side, then take the
            # expectation to get one distance per side (in grid units).
            box = softmax(box.reshape(-1, self.reg_max))
            box = box.reshape(num_grid_y, num_grid_x, 4, self.reg_max)
            box = box @ np.arange(0, self.reg_max, dtype=np.float32)
            cls = sigmoid(cls)
            conf = cls.max(-1, keepdims=True)
            # Distances are measured from the cell centre (grid + 0.5).
            x1y1 = (self.grid[i][0] + 0.5 - box[..., :2]) * self.stride[i]
            x2y2 = (self.grid[i][0] + 0.5 + box[..., 2:]) * self.stride[i]
            res = np.concatenate([x1y1, x2y2, conf, cls], -1)
            z.append(res.reshape((1, -1, num_classes + 5)))
        pred = np.concatenate(z, 1)

        result = self.non_max_suppression(
            pred, self.prob_threshold, self.nms_threshold
        )[0]

        # non_max_suppression leaves None for an image without detections.
        if result is None:
            return []

        # Rows are (x1, y1, x2, y2, conf, cls) in padded-input coordinates;
        # remove the padding offset and undo the resize.
        return [
            Detect_Object(
                obj[5],
                obj[4],
                (obj[0] - pad_left) / scale,
                (obj[1] - pad_top) / scale,
                (obj[2] - obj[0]) / scale,
                (obj[3] - obj[1]) / scale,
            )
            for obj in result
        ]

    def non_max_suppression(
        self,
        prediction,
        conf_thres=0.1,
        iou_thres=0.6,
        merge=False,
        classes=None,
        agnostic=False,
    ):
        """Performs Non-Maximum Suppression (NMS) on inference results

        Returns:
            detections with shape: nx6 (x1, y1, x2, y2, conf, cls)
        """
        nc = prediction[0].shape[1] - 5  # number of classes
        xc = prediction[..., 4] > conf_thres  # candidates

        # Settings
        min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
        max_det = 300  # maximum number of detections per image
        time_limit = 10.0  # seconds to quit after
        redundant = True  # require redundant detections
        multi_label = nc > 1  # multiple labels per box (adds 0.5ms/img)

        t = time.time()
        output = [None] * prediction.shape[0]
        for xi, x in enumerate(prediction):  # image index, image inference
            # Apply constraints
            # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
            x = x[xc[xi]]  # confidence

            # If none remain process next image
            if not x.shape[0]:
                continue

            box = x[:, :4]

            # Detections matrix nx6 (xyxy, conf, cls)
            if multi_label:
                i, j = (x[:, 5:] > conf_thres).nonzero()
                x = np.concatenate(
                    (box[i], x[i, j + 5, None], j[:, None].astype(np.float32)), axis=1
                )
            else:  # best class only
                conf, j = x[:, 5:].max(1, keepdim=True)
                x = np.concatenate((box, conf, j.float()), axis=1)[
                    conf.view(-1) > conf_thres
                ]

            # Filter by class
            if classes:
                x = x[(x[:, 5:6] == np.array(classes)).any(1)]

            # Apply finite constraint
            # if not torch.isfinite(x).all():
            #     x = x[torch.isfinite(x).all(1)]

            # If none remain process next image
            n = x.shape[0]  # number of boxes
            if not n:
                continue

            # Sort by confidence
            # x = x[x[:, 4].argsort(descending=True)]

            # Batched NMS
            c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
            boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
            i = nms(boxes, scores, iou_threshold=iou_thres)
            if len(i) > max_det:  # limit detections
                i = i[:max_det]
            if merge and (1 < n < 3e3):  # Merge NMS (boxes merged using weighted mean)
                try:  # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
                    iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
                    weights = iou * scores[None]  # box weights
                    x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(
                        1, keepdim=True
                    )  # merged boxes
                    if redundant:
                        i = i[iou.sum(1) > 1]  # require redundancy
                except:  # possible CUDA error https://github.com/ultralytics/yolov3/issues/1139
                    print(x, i, x.shape, i.shape)
                    pass

            output[xi] = x[i]
            if (time.time() - t) > time_limit:
                break  # time limit exceeded

        return output


================================================
FILE: python/ncnn/utils/__init__.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

from .download import download, check_sha1
from .visual import *
from .objects import *


================================================
FILE: python/ncnn/utils/download.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

"""Download files with progress bar."""

import os
import hashlib
import requests
from tqdm import tqdm


def check_sha1(filename, sha1_hash):
    """Tell whether a file's SHA-1 digest agrees with an expected hex string.

    Only the common prefix of the two hex strings is compared, so a shortened
    expected hash is accepted.

    Parameters
    ----------
    filename : str
        Path to the file.
    sha1_hash : str
        Expected sha1 hash in hexadecimal digits.
    Returns
    -------
    bool
        Whether the file content matches the expected hash.
    """
    digest = hashlib.sha1()
    with open(filename, "rb") as stream:
        # Feed the file in 1 MiB pieces until read() returns an empty chunk.
        for piece in iter(lambda: stream.read(1048576), b""):
            digest.update(piece)

    actual = digest.hexdigest()
    shared = min(len(actual), len(sha1_hash))
    return actual[:shared] == sha1_hash[:shared]


def download(url, path=None, overwrite=False, sha1_hash=None):
    """Download a given URL
    Parameters
    ----------
    url : str
        URL to download
    path : str, optional
        Destination path to store downloaded file. By default stores to the
        current directory with same name as in url.
    overwrite : bool, optional
        Whether to overwrite destination file if already exists.
    sha1_hash : str, optional
        Expected sha1 hash in hexadecimal digits. Will ignore existing file when hash is specified
        but doesn't match.
    Returns
    -------
    str
        The file path of the downloaded file.
    Raises
    ------
    RuntimeError
        If the server answers with a status code other than 200.
    UserWarning
        If ``sha1_hash`` is given and the downloaded content does not match it.
    """
    url_basename = url.split("/")[-1]
    if path is None:
        fname = url_basename
    else:
        path = os.path.expanduser(path)
        # A directory keeps the URL's file name; anything else is the target.
        fname = os.path.join(path, url_basename) if os.path.isdir(path) else path

    needs_download = (
        overwrite
        or not os.path.exists(fname)
        or (sha1_hash and not check_sha1(fname, sha1_hash))
    )
    if not needs_download:
        return fname

    dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname)))
    # exist_ok avoids a failure when another caller creates the directory
    # between the check and the creation.
    os.makedirs(dirname, exist_ok=True)

    print("Downloading %s from %s..." % (fname, url))
    r = requests.get(url, stream=True)
    try:
        if r.status_code != 200:
            raise RuntimeError("Failed downloading url %s" % url)
        total_length = r.headers.get("content-length")
        with open(fname, "wb") as f:
            if total_length is None:  # no content length header
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
            else:
                total_length = int(total_length)
                for chunk in tqdm(
                    r.iter_content(chunk_size=1024),
                    total=int(total_length / 1024.0 + 0.5),
                    unit="KB",
                    unit_scale=False,
                    dynamic_ncols=True,
                ):
                    f.write(chunk)
    finally:
        # A streamed response holds its connection until it is closed; close
        # it on every path, including the error ones.
        r.close()

    if sha1_hash and not check_sha1(fname, sha1_hash):
        raise UserWarning(
            "File {} is downloaded but the content hash does not match. "
            "The repo may be outdated or download may be incomplete. "
            'If the "repo_url" is overridden, consider switching to '
            "the default repo.".format(fname)
        )

    return fname


================================================
FILE: python/ncnn/utils/functional.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np


def xywh2xyxy(x):
    """Convert nx4 boxes from centre form [x, y, w, h] to corner form [x1, y1, x2, y2].

    xy1 is the top-left corner and xy2 the bottom-right corner. The result has
    the same shape and dtype as ``x`` (it is allocated with ``np.zeros_like``).
    """
    corners = np.zeros_like(x)
    half_w = x[:, 2] / 2
    half_h = x[:, 3] / 2
    corners[:, 0] = x[:, 0] - half_w  # top left x
    corners[:, 1] = x[:, 1] - half_h  # top left y
    corners[:, 2] = x[:, 0] + half_w  # bottom right x
    corners[:, 3] = x[:, 1] + half_h  # bottom right y
    return corners


def xyxy2xywh(x):
    """Convert nx4 boxes from corner form [x1, y1, x2, y2] to centre form [x, y, w, h].

    xy1 is the top-left corner and xy2 the bottom-right corner. The result has
    the same shape and dtype as ``x`` (it is allocated with ``np.zeros_like``).
    """
    left, top = x[:, 0], x[:, 1]
    right, bottom = x[:, 2], x[:, 3]

    centred = np.zeros_like(x)
    centred[:, 0] = (left + right) / 2  # x center
    centred[:, 1] = (top + bottom) / 2  # y center
    centred[:, 2] = right - left  # width
    centred[:, 3] = bottom - top  # height
    return centred


def make_grid(nx=20, ny=20):
    """Build a float32 array of shape (1, ny, nx, 2) holding each cell's (x, y) index."""
    cols, rows = np.meshgrid(np.arange(nx), np.arange(ny))
    # Last axis is (column index, row index).
    cells = np.stack((cols, rows), 2)
    batched = cells.reshape((1, ny, nx, 2))
    return batched.astype(np.float32)


def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise by NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator


def softmax(x):
    """Softmax over the last axis.

    Args:
        x: array-like of any rank >= 1. It is not modified: the previous
            implementation shifted the caller's array in place and only
            accepted 2-D input.

    Returns:
        ndarray with the same shape as ``x`` whose last-axis slices sum to 1.
    """
    x = np.asarray(x)
    # Subtracting the per-row maximum keeps exp() from overflowing and does
    # not change the result.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)


def iou_of(boxes0, boxes1, eps=1e-5):
    """Intersection-over-union (Jaccard index) of corner-form boxes.

    The last axis of each input is (x1, y1, x2, y2); the two inputs are
    combined with NumPy broadcasting.

    Args:
        boxes0 (N, 4): ground truth boxes.
        boxes1 (N or 1, 4): predicted boxes.
        eps: a small number to avoid 0 as denominator.
    Returns:
        iou (N): IoU values.
    """
    # Corners of the overlap rectangle; area_of clamps negative extents to 0.
    inter_top_left = np.maximum(boxes0[..., :2], boxes1[..., :2])
    inter_bottom_right = np.minimum(boxes0[..., 2:], boxes1[..., 2:])
    intersection = area_of(inter_top_left, inter_bottom_right)

    first_area = area_of(boxes0[..., :2], boxes0[..., 2:])
    second_area = area_of(boxes1[..., :2], boxes1[..., 2:])
    union = first_area + second_area - intersection + eps
    return intersection / union


def area_of(left_top, right_bottom):
    """Area of rectangles described by two opposite corners.

    Negative extents (right/bottom before left/top) are clamped to zero, so
    such rectangles have area 0.

    Args:
        left_top (N, 2): left top corner.
        right_bottom (N, 2): right bottom corner.

    Returns:
        area (N): return the area.
    """
    extent = right_bottom - left_top
    extent = np.clip(extent, 0.0, None)
    first_side = extent[..., 0]
    second_side = extent[..., 1]
    return first_side * second_side


def nms(boxes, scores, iou_threshold, top_k=-1, candidate_size=200):
    """Greedy non-maximum suppression.

    Args:
        boxes (N, 4): boxes in corner-form (x1, y1, x2, y2).
        scores (N): score of each box.
        iou_threshold: intersection over union threshold.
        top_k: keep top_k results. If k <= 0, keep all the results.
        candidate_size: only consider the candidates with the highest scores.
    Returns:
         picked: a list of indexes of the kept boxes
    """
    kept = []
    # Ascending by score; only the candidate_size best are considered.
    order = np.argsort(scores)[-candidate_size:]
    while len(order) > 0:
        best = order[-1]
        kept.append(best)

        reached_top_k = top_k > 0 and top_k == len(kept)
        if reached_top_k or len(order) == 1:
            break

        # Drop the box just kept, then every remaining box that overlaps it
        # by more than the threshold.
        order = order[:-1]
        overlaps = iou_of(
            boxes[order, :],
            np.expand_dims(boxes[best, :], axis=0),
        )
        order = order[overlaps <= iou_threshold]

    return kept


================================================
FILE: python/ncnn/utils/objects.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np


class Point:
    """2-D point with float ``x`` and ``y``, created at the origin."""

    def __init__(self):
        self.x, self.y = 0.0, 0.0


class Rect(object):
    """Axis-aligned rectangle stored as top-left corner (x, y) plus size (w, h)."""

    def __init__(self, x=0, y=0, w=0, h=0):
        self.x = x
        self.y = y
        self.w = w
        self.h = h

    def area(self):
        """Return width times height."""
        return self.w * self.h

    def intersection_area(self, b):
        """Return the area shared with rectangle ``b``.

        Rectangles that do not overlap (or only touch) share an area of 0.
        The previous implementation multiplied absolute differences, which
        reported a positive area for disjoint rectangles.
        """
        x1 = np.maximum(self.x, b.x)
        y1 = np.maximum(self.y, b.y)
        x2 = np.minimum(self.x + self.w, b.x + b.w)
        y2 = np.minimum(self.y + self.h, b.y + b.h)
        # A negative extent on either axis means there is no overlap at all.
        overlap_w = np.maximum(x2 - x1, 0)
        overlap_h = np.maximum(y2 - y1, 0)
        return overlap_w * overlap_h


class Detect_Object:
    """One detection: a label, a probability and a bounding Rect."""

    def __init__(self, label=0, prob=0, x=0, y=0, w=0, h=0):
        self.label = label
        self.prob = prob
        # x, y, w, h are only used to build the rectangle.
        self.rect = Rect(x=x, y=y, w=w, h=h)


class Face_Object:
    """One face detection: probability, bounding Rect and a list of landmarks."""

    def __init__(self):
        self.prob = 0.0
        self.rect = Rect()
        # Starts empty; callers append the landmark points.
        self.landmark = list()


class KeyPoint:
    """A Point ``p`` paired with a probability ``prob``."""

    def __init__(self):
        self.p, self.prob = Point(), 0.0


================================================
FILE: python/ncnn/utils/visual.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
import cv2


def draw_detection_objects(image, class_names, objects, min_prob=0.0):
    """Print and draw every detection, then show the image.

    For each object whose ``prob`` is at least ``min_prob`` this prints one
    line, draws its rectangle and a "<class name> <percent>%" caption on
    ``image`` (modified in place). Finally it calls ``cv2.imshow`` and
    ``cv2.waitKey(0)``.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    img_width = image.shape[1]

    for obj in objects:
        if obj.prob < min_prob:
            continue

        rect = obj.rect
        print(
            "%d = %.5f at %.2f %.2f %.2f x %.2f\n"
            % (obj.label, obj.prob, rect.x, rect.y, rect.w, rect.h)
        )

        top_left = (int(rect.x), int(rect.y))
        bottom_right = (int(rect.x + rect.w), int(rect.y + rect.h))
        cv2.rectangle(image, top_left, bottom_right, (255, 0, 0))

        text = "%s %.1f%%" % (class_names[int(obj.label)], obj.prob * 100)
        label_size, baseLine = cv2.getTextSize(text, font, 0.5, 1)
        text_w, text_h = label_size[0], label_size[1]

        # Caption sits above the box, pulled back inside the image at the
        # top and right edges.
        x = rect.x
        y = rect.y - text_h - baseLine
        if y < 0:
            y = 0
        if x + text_w > img_width:
            x = img_width - text_w

        # White filled background, then black text on top of it.
        cv2.rectangle(
            image,
            (int(x), int(y)),
            (int(x + text_w), int(y + text_h + baseLine)),
            (255, 255, 255),
            -1,
        )
        cv2.putText(image, text, (int(x), int(y + text_h)), font, 0.5, (0, 0, 0))

    cv2.imshow("image", image)
    cv2.waitKey(0)


def print_topk(cls_scores, topk):
    """Print the ``topk`` highest scores as "<index>=<score>" lines, best first."""
    # argsort is ascending, so reverse it before taking the first topk.
    ranked = np.argsort(cls_scores)[::-1][:topk]
    top_scores = cls_scores[ranked]

    for position in range(len(ranked)):
        print("%d=%f" % (ranked[position], top_scores[position]))


def draw_faceobjects(image, faceobjects):
    """Print and draw every face detection, then show the image.

    For each object this prints one line, then draws its rectangle, its first
    five landmarks and a "<percent>%" caption on ``image`` (modified in
    place). Finally it calls ``cv2.imshow`` and ``cv2.waitKey(0)``.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    img_width = image.shape[1]

    for obj in faceobjects:
        rect = obj.rect
        print(
            "%.5f at %.2f %.2f %.2f x %.2f"
            % (obj.prob, rect.x, rect.y, rect.w, rect.h)
        )

        top_left = (int(rect.x), int(rect.y))
        bottom_right = (int(rect.x + rect.w), int(rect.y + rect.h))
        cv2.rectangle(image, top_left, bottom_right, (255, 0, 0))

        # Landmarks 0..4, each as a filled circle of radius 2.
        for landmark_index in range(5):
            centre = (
                int(obj.landmark[landmark_index].x),
                int(obj.landmark[landmark_index].y),
            )
            cv2.circle(image, centre, 2, (0, 255, 255), -1)

        text = "%.1f%%" % (obj.prob * 100)
        label_size, baseLine = cv2.getTextSize(text, font, 0.5, 1)
        text_w, text_h = label_size[0], label_size[1]

        # Caption sits above the box, pulled back inside the image at the
        # top and right edges.
        x = rect.x
        y = rect.y - text_h - baseLine
        if y < 0:
            y = 0
        if x + text_w > img_width:
            x = img_width - text_w

        # White filled background, then black text on top of it.
        cv2.rectangle(
            image,
            (int(x), int(y)),
            (int(x + text_w), int(y + text_h + baseLine)),
            (255, 255, 255),
            -1,
        )
        cv2.putText(image, text, (int(x), int(y + text_h)), font, 0.5, (0, 0, 0))

    cv2.imshow("image", image)
    cv2.waitKey(0)


def draw_pose(image, keypoints):
    """Draw a skeleton on ``image`` (in place), print every keypoint, then show it.

    Bones and joints whose keypoint probability is below 0.2 are not drawn.
    Finally it calls ``cv2.imshow`` and ``cv2.waitKey(0)``.
    """
    min_prob = 0.2

    # draw bone
    # Each pair holds the indexes into ``keypoints`` of a bone's two ends.
    joint_pairs = [
        (0, 1), (1, 3), (0, 2), (2, 4),
        (5, 6), (5, 7), (7, 9), (6, 8),
        (8, 10), (5, 11), (6, 12), (11, 12),
        (11, 13), (12, 14), (13, 15), (14, 16),
    ]

    for start_index, end_index in joint_pairs:
        p1 = keypoints[start_index]
        p2 = keypoints[end_index]

        if p1.prob < min_prob or p2.prob < min_prob:
            continue

        start = (int(p1.p.x), int(p1.p.y))
        end = (int(p2.p.x), int(p2.p.y))
        cv2.line(image, start, end, (255, 0, 0), 2)

    # draw joint
    for keypoint in keypoints:
        # Every keypoint is printed, including the ones that are not drawn.
        print("%.2f %.2f = %.5f" % (keypoint.p.x, keypoint.p.y, keypoint.prob))

        if keypoint.prob < min_prob:
            continue

        centre = (int(keypoint.p.x), int(keypoint.p.y))
        cv2.circle(image, centre, 3, (0, 255, 0), -1)

    cv2.imshow("image", image)
    cv2.waitKey(0)


================================================
FILE: python/requirements.txt
================================================
numpy
tqdm
requests
portalocker
opencv-python

================================================
FILE: python/setup.py.i
================================================
# Packaging script template for the ncnn Python bindings.
# NOTE(review): the ${...} placeholders below are presumably substituted by the
# build system before this file is used as setup.py -- confirm.
import sys
from setuptools import setup, find_packages

# Optional bdist_wheel override: only defined when the `wheel` package can be
# imported; otherwise the name is bound to None.
try:
    from wheel.bdist_wheel import bdist_wheel as _bdist_wheel

    class bdist_wheel(_bdist_wheel):
        def finalize_options(self):
            _bdist_wheel.finalize_options(self)
            # Mark the wheel as not pure-Python; package_data below ships the
            # built extension module.
            self.root_is_pure = False


except ImportError:
    bdist_wheel = None

# Refuse to run under Python 2.
if sys.version_info < (3, 0):
    sys.exit("Sorry, Python < 3.0 is not supported")

# Runtime dependencies handed to install_requires.
requirements = ["numpy", "tqdm", "requests", "portalocker", "opencv-python"]

setup(
    name="ncnn",
    version="${PACKAGE_VERSION}",
    author="nihui",
    author_email="nihuini@tencent.com",
    maintainer="caishanli",
    maintainer_email="caishanli25@gmail.com",
    description="ncnn is a high-performance neural network inference framework optimized for the mobile platform",
    url="https://github.com/Tencent/ncnn",
    classifiers=[
        "Programming Language :: C++",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: 3.13",
        "Programming Language :: Python :: 3.14",
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    license="BSD-3",
    python_requires=">=3.5",
    packages=find_packages(),
    package_dir={"": "."},
    # The extension module file is bundled inside the ncnn package.
    package_data={"ncnn": ["ncnn${PYTHON_MODULE_PREFIX}${PYTHON_MODULE_EXTENSION}"]},
    install_requires=requirements,
    # bdist_wheel is None here when the `wheel` import above failed.
    cmdclass={"bdist_wheel": bdist_wheel},
)


================================================
FILE: python/src/main.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/numpy.h>
#include <pybind11/functional.h>

#include <cpu.h>
#include <gpu.h>
#include <net.h>
#include <option.h>
#include <blob.h>
#include <paramdict.h>

#include "pybind11_mat.h"
#include "pybind11_datareader.h"
#include "pybind11_allocator.h"
#include "pybind11_modelbin.h"
#include "pybind11_layer.h"
using namespace ncnn;

namespace py = pybind11;

// DataReaderFromMemory variant whose reference() always returns 0.
// NOTE(review): presumably this makes consumers fall back to the copying
// read() path instead of borrowing pointers into the caller's buffer --
// confirm against ncnn's DataReader contract.
class DataReaderFromMemoryCopy : public DataReaderFromMemory
{
public:
    // The pointer reference is forwarded unchanged to the base class.
    explicit DataReaderFromMemoryCopy(const unsigned char*& mem)
        : DataReaderFromMemory(mem)
    {
    }

    // Both arguments are ignored; nothing is ever handed out by reference.
    virtual size_t reference(size_t /*size*/, const void** /*buf*/) const
    {
        return 0;
    }
};

// One slot of the custom-layer factory table g_layer_factroys.
// Member order matters: the table below is brace-initialised positionally.
struct LayerFactory
{
    // "" in every initial table entry.
    // NOTE(review): presumably the registered layer type name -- confirm.
    std::string name;
    // -1 in every initial table entry.
    // NOTE(review): presumably the registered layer type index -- confirm.
    int index;
    // Callable invoked by this slot's LayerCreator<n> trampoline; null until set.
    std::function<Layer*()> creator;
    // Callable invoked by this slot's LayerDestroyer<n> trampoline; null until set.
    std::function<void(Layer*)> destroyer;
    // Plain function pointers to this slot's static trampolines.
    layer_creator_func creator_func;
    layer_destroyer_func destroyer_func;
};

// Forward-declares the two static trampolines of factory slot n,
// LayerCreator<n> and LayerDestroyer<n>, so the table below can take their
// addresses before LayerFactoryDefine provides the bodies.
#define LayerFactoryDeclear(n)                  \
    static ncnn::Layer* LayerCreator##n(void*); \
    static void LayerDestroyer##n(ncnn::Layer*, void*);

// Ten slots (0-9) are declared, matching the ten table entries.
LayerFactoryDeclear(0);
LayerFactoryDeclear(1);
LayerFactoryDeclear(2);
LayerFactoryDeclear(3);
LayerFactoryDeclear(4);
LayerFactoryDeclear(5);
LayerFactoryDeclear(6);
LayerFactoryDeclear(7);
LayerFactoryDeclear(8);
LayerFactoryDeclear(9);

// Fixed table of ten factory slots. Every slot starts with an empty name,
// index -1 and null std::function callbacks, and is wired to the trampoline
// pair carrying the same number as its position.
std::vector<LayerFactory> g_layer_factroys = {
    {"", -1, nullptr, nullptr, LayerCreator0, LayerDestroyer0},
    {"", -1, nullptr, nullptr, LayerCreator1, LayerDestroyer1},
    {"", -1, nullptr, nullptr, LayerCreator2, LayerDestroyer2},
    {"", -1, nullptr, nullptr, LayerCreator3, LayerDestroyer3},
    {"", -1, nullptr, nullptr, LayerCreator4, LayerDestroyer4},
    {"", -1, nullptr, nullptr, LayerCreator5, LayerDestroyer5},
    {"", -1, nullptr, nullptr, LayerCreator6, LayerDestroyer6},
    {"", -1, nullptr, nullptr, LayerCreator7, LayerDestroyer7},
    {"", -1, nullptr, nullptr, LayerCreator8, LayerDestroyer8},
    {"", -1, nullptr, nullptr, LayerCreator9, LayerDestroyer9},
};
// NOTE(review): presumably the next free slot in the table; its use is
// outside this view -- confirm.
int g_layer_factroy_index = 0;

// Defines the two trampolines of factory slot n.
// LayerCreator<n> returns the result of the slot's `creator` callback, or
// nullptr when no callback is set. LayerDestroyer<n> passes the layer to the
// slot's `destroyer` callback and does nothing when none is set. The void*
// argument is unused in both.
#define LayerFactoryDefine(n)                                  \
    static ncnn::Layer* LayerCreator##n(void* p)               \
    {                                                          \
        if (g_layer_factroys[n].creator != nullptr)            \
        {                                                      \
            return g_layer_factroys[n].creator();              \
        }                                                      \
        return nullptr;                                        \
    }                                                          \
    static void LayerDestroyer##n(ncnn::Layer* layer, void* p) \
    {                                                          \
        if (g_layer_factroys[n].destroyer)                     \
        {                                                      \
            g_layer_factroys[n].destroyer(layer);              \
        }                                                      \
    }

// Bodies for the ten slots declared above.
LayerFactoryDefine(0);
LayerFactoryDefine(1);
LayerFactoryDefine(2);
LayerFactoryDefine(3);
LayerFactoryDefine(4);
LayerFactoryDefine(5);
LayerFactoryDefine(6);
LayerFactoryDefine(7);
LayerFactoryDefine(8);
LayerFactoryDefine(9);

PYBIND11_MODULE(ncnn, m)
{
    auto atexit = py::module_::import("atexit");
    atexit.attr("register")(py::cpp_function([]() {
        for (int i = 0; i < g_layer_factroys.size(); i++)
        {
            g_layer_factroys[i].creator = nullptr;
            g_layer_factroys[i].destroyer = nullptr;
        }
    }));

    py::class_<Allocator, PyAllocator<> >(m, "Allocator");
    py::class_<PoolAllocator, Allocator, PyAllocatorOther<PoolAllocator> >(m, "PoolAllocator")
    .def(py::init<>())
    .def("set_size_compare_ratio", &PoolAllocator::set_size_compare_ratio, py::arg("src"))
    .def("clear", &PoolAllocator::clear)
    .def("fastMalloc", &PoolAllocator::fastMalloc, py::arg("size"))
    .def("fastFree", &PoolAllocator::fastFree, py::arg("ptr"));
    py::class_<UnlockedPoolAllocator, Allocator, PyAllocatorOther<UnlockedPoolAllocator> >(m, "UnlockedPoolAllocator")
    .def(py::init<>())
    .def("set_size_compare_ratio", &UnlockedPoolAllocator::set_size_compare_ratio, py::arg("src"))
    .def("clear", &UnlockedPoolAllocator::clear)
    .def("fastMalloc", &UnlockedPoolAllocator::fastMalloc, py::arg("size"))
    .def("fastFree", &UnlockedPoolAllocator::fastFree, py::arg("ptr"));

    py::class_<DataReader, PyDataReader<> >(m, "DataReader")
    .def(py::init<>())
#if NCNN_STRING
    .def("scan", &DataReader::scan, py::arg("format"), py::arg("p"))
#endif // NCNN_STRING
    .def("read", &DataReader::read, py::arg("buf"), py::arg("size"));
    py::class_<DataReaderFromEmpty, DataReader, PyDataReaderOther<DataReaderFromEmpty> >(m, "DataReaderFromEmpty")
    .def(py::init<>())
#if NCNN_STRING
    .def("scan", &DataReaderFromEmpty::scan, py::arg("format"), py::arg("p"))
#endif // NCNN_STRING
    .def("read", &DataReaderFromEmpty::read, py::arg("buf"), py::arg("size"));

    py::class_<Blob>(m, "Blob")
    .def(py::init<>())
#if NCNN_STRING
    .def_readwrite("name", &Blob::name)
#endif // NCNN_STRING
    .def_readwrite("producer", &Blob::producer)
    .def_readwrite("consumer", &Blob::consumer)
    .def_readwrite("shape", &Blob::shape);

    py::class_<ModelBin, PyModelBin<> >(m, "ModelBin")
    .def(py::init<>())
    .def("load", (Mat(ModelBin::*)(int, int) const) & ModelBin::load, py::arg("w"), py::arg("type"))
    .def("load", (Mat(ModelBin::*)(int, int, int) const) & ModelBin::load, py::arg("w"), py::arg("h"), py::arg("type"))
    .def("load", (Mat(ModelBin::*)(int, int, int, int) const) & ModelBin::load, py::arg("w"), py::arg("h"), py::arg("c"), py::arg("type"))
    .def("load", (Mat(ModelBin::*)(int, int, int, int, int) const) & ModelBin::load, py::arg("w"), py::arg("h"), py::arg("d"), py::arg("c"), py::arg("type"));
    py::class_<ModelBinFromDataReader, ModelBin, PyModelBinOther<ModelBinFromDataReader> >(m, "ModelBinFromDataReader")
    .def(py::init<const DataReader&>(), py::arg("dr"))
    .def("load", &ModelBinFromDataReader::load, py::arg("w"), py::arg("type"));
    py::class_<ModelBinFromMatArray, ModelBin, PyModelBinOther<ModelBinFromMatArray> >(m, "ModelBinFromMatArray")
    .def(py::init<const Mat*>(), py::arg("weights"))
    .def("load", &ModelBinFromMatArray::load, py::arg("w"), py::arg("type"));

    py::class_<ParamDict>(m, "ParamDict")
    .def(py::init<>())
    .def("type", &ParamDict::type, py::arg("id"))
    .def("get", (int (ParamDict::*)(int, int) const) & ParamDict::get, py::arg("id"), py::arg("def"))
    .def("get", (float (ParamDict::*)(int, float) const) & ParamDict::get, py::arg("id"), py::arg("def"))
    .def("get", (Mat(ParamDict::*)(int, const Mat&) const) & ParamDict::get, py::arg("id"), py::arg("def"))
    .def("set", (void (ParamDict::*)(int, int)) & ParamDict::set, py::arg("id"), py::arg("i"))
    .def("set", (void (ParamDict::*)(int, float)) & ParamDict::set, py::arg("id"), py::arg("f"))
    .def("set", (void (ParamDict::*)(int, const Mat&)) & ParamDict::set, py::arg("id"), py::arg("v"));

    // Python class "Option": default-constructible; every member listed below is exposed
    // as a read/write attribute under the same name as the C++ field.
    py::class_<Option>(m, "Option")
    .def(py::init<>())
    .def_readwrite("lightmode", &Option::lightmode)
    .def_readwrite("num_threads", &Option::num_threads)
    .def_readwrite("blob_allocator", &Option::blob_allocator)
    .def_readwrite("workspace_allocator", &Option::workspace_allocator)
    // the Vulkan allocator attributes only exist when the module is compiled with NCNN_VULKAN
#if NCNN_VULKAN
    .def_readwrite("blob_vkallocator", &Option::blob_vkallocator)
    .def_readwrite("workspace_vkallocator", &Option::workspace_vkallocator)
    .def_readwrite("staging_vkallocator", &Option::staging_vkallocator)
    // pipeline_cache is intentionally left unexposed (binding kept commented out)
    //.def_readwrite("pipeline_cache", &Option::pipeline_cache)
#endif // NCNN_VULKAN
    .def_readwrite("openmp_blocktime", &Option::openmp_blocktime)
    // convolution algorithm switches
    .def_readwrite("use_winograd_convolution", &Option::use_winograd_convolution)
    .def_readwrite("use_winograd23_convolution", &Option::use_winograd23_convolution)
    .def_readwrite("use_winograd43_convolution", &Option::use_winograd43_convolution)
    .def_readwrite("use_winograd63_convolution", &Option::use_winograd63_convolution)
    .def_readwrite("use_sgemm_convolution", &Option::use_sgemm_convolution)
    .def_readwrite("use_int8_inference", &Option::use_int8_inference)
    // note: use_vulkan_compute is bound unconditionally, outside the NCNN_VULKAN guard above
    .def_readwrite("use_vulkan_compute", &Option::use_vulkan_compute)
    // bf16 / fp16 / int8 storage, packing and arithmetic switches
    .def_readwrite("use_bf16_packed", &Option::use_bf16_packed)
    .def_readwrite("use_bf16_storage", &Option::use_bf16_storage)
    .def_readwrite("use_fp16_packed", &Option::use_fp16_packed)
    .def_readwrite("use_fp16_storage", &Option::use_fp16_storage)
    .def_readwrite("use_fp16_arithmetic", &Option::use_fp16_arithmetic)
    .def_readwrite("use_int8_packed", &Option::use_int8_packed)
    .def_readwrite("use_int8_storage", &Option::use_int8_storage)
    .def_readwrite("use_int8_arithmetic", &Option::use_int8_arithmetic)
    .def_readwrite("use_packing_layout", &Option::use_packing_layout)
    .def_readwrite("use_subgroup_ops", &Option::use_subgroup_ops)
    .def_readwrite("use_tensor_storage", &Option::use_tensor_storage);

    // Python class "Mat". py::buffer_protocol() is requested here so that the def_buffer
    // hook registered later in this chain can take effect.
    py::class_<Mat> mat(m, "Mat", py::buffer_protocol());
    mat.def(py::init<>())
    // Mat(shape, *, elemsize=4, elempack=1, allocator=None)
    // shape is a tuple of 1 to 4 ints that is forwarded, in tuple order, to the Mat
    // constructor of the matching arity. Any other tuple length is rejected.
    .def(py::init(
    [](py::tuple shape, size_t elemsize, int elempack, Allocator* allocator) -> Mat* {
        const size_t rank = shape.size();
        if (rank == 1)
        {
            return new Mat(shape[0].cast<int>(), elemsize, elempack, allocator);
        }
        if (rank == 2)
        {
            return new Mat(shape[0].cast<int>(), shape[1].cast<int>(), elemsize, elempack, allocator);
        }
        if (rank == 3)
        {
            return new Mat(shape[0].cast<int>(), shape[1].cast<int>(), shape[2].cast<int>(), elemsize, elempack, allocator);
        }
        if (rank == 4)
        {
            return new Mat(shape[0].cast<int>(), shape[1].cast<int>(), shape[2].cast<int>(), shape[3].cast<int>(), elemsize, elempack, allocator);
        }

        std::stringstream ss;
        ss << "shape must be 1, 2, 3 or 4 dims, not " << rank;
        pybind11::pybind11_fail(ss.str());
        // not reached: pybind11_fail throws
        return nullptr;
    }),
    py::arg("shape"), py::kw_only(),
    py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr)
    // Fixed-rank constructors, one per dimensionality. The dimension sizes come first;
    // elemsize, elempack and allocator are declared after py::kw_only() and are therefore
    // keyword-only on the Python side, with defaults 4, 1 and None.
    // Mat(w, *, elemsize=4, elempack=1, allocator=None)
    .def(py::init<int, size_t, int, Allocator*>(),
         py::arg("w"), py::kw_only(),
         py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr)
    // Mat(w, h, *, ...)
    .def(py::init<int, int, size_t, int, Allocator*>(),
         py::arg("w"), py::arg("h"), py::kw_only(),
         py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr)
    // Mat(w, h, c, *, ...)
    .def(py::init<int, int, int, size_t, int, Allocator*>(),
         py::arg("w"), py::arg("h"), py::arg("c"), py::kw_only(),
         py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr)
    // Mat(w, h, d, c, *, ...)
    .def(py::init<int, int, int, int, size_t, int, Allocator*>(),
         py::arg("w"), py::arg("h"), py::arg("d"), py::arg("c"), py::kw_only(),
         py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr)

    // Mat(m): bound to the Mat(const Mat&) constructor
    .def(py::init<const Mat&>(), py::arg("m"))

    // Mat(array): wrap an object that exports the Python buffer protocol (e.g. numpy.ndarray).
    //
    // The Mat is built on top of info.ptr, i.e. it points at the exporter's memory instead of
    // owning a copy. py::keep_alive<1, 2>() therefore ties the lifetime of `array` to the new
    // Mat, so that passing a temporary array no longer leaves the Mat with a dangling pointer.
    //
    // The buffer shape is given outermost-first, so it is reversed when handed to the Mat
    // constructors (last buffer dimension becomes w). Buffers with more than 4 dims are
    // rejected. For a 0-dim buffer no Mat is created and a null pointer is returned to
    // pybind11. Strides are not inspected -- NOTE(review): the buffer is assumed to be
    // C-contiguous; confirm whether non-contiguous input should be rejected.
    .def(py::init([](py::buffer const b) {
        py::buffer_info info = b.request();
        if (info.ndim > 4)
        {
            std::stringstream ss;
            ss << "convert numpy.ndarray to ncnn.Mat only dims <=4 support now, but given " << info.ndim;
            pybind11::pybind11_fail(ss.str());
        }

        const size_t elemsize = (size_t)info.itemsize;

        std::unique_ptr<Mat> v;
        if (info.ndim == 1)
        {
            v.reset(new Mat((int)info.shape[0], info.ptr, elemsize));
        }
        else if (info.ndim == 2)
        {
            v.reset(new Mat((int)info.shape[1], (int)info.shape[0], info.ptr, elemsize));
        }
        else if (info.ndim == 3)
        {
            v.reset(new Mat((int)info.shape[2], (int)info.shape[1], (int)info.shape[0], info.ptr, elemsize));

            // in ncnn, buffer to construct ncnn::Mat need align to ncnn::alignSize
            // with (w * h * elemsize, 16) / elemsize, but the buffer from numpy not
            // so we set the cstep as numpy's cstep
            // (computed in size_t so that large planes cannot overflow int)
            v->cstep = (size_t)info.shape[2] * (size_t)info.shape[1];
        }
        else if (info.ndim == 4)
        {
            v.reset(new Mat((int)info.shape[3], (int)info.shape[2], (int)info.shape[1], (int)info.shape[0], info.ptr, elemsize));

            // in ncnn, buffer to construct ncnn::Mat need align to ncnn::alignSize
            // with (w * h * d * elemsize, 16) / elemsize, but the buffer from numpy not
            // so we set the cstep as numpy's cstep
            // (computed in size_t so that large volumes cannot overflow int)
            v->cstep = (size_t)info.shape[3] * (size_t)info.shape[2] * (size_t)info.shape[1];
        }
        return v;
    }),
    py::arg("array"), py::keep_alive<1, 2>())
    // Buffer protocol export: the description of the Mat's memory is produced by
    // to_buffer_info(), which is defined outside this section.
    .def_buffer([](Mat& m) -> py::buffer_info {
        return to_buffer_info(m);
    })
    // numpy(format=""): builds a py::array from to_buffer_info(mat, format) and hands the
    // Mat's own Python object to py::array as the base object.
    // The accepted format codes are listed in the docstring at the end of this .def.
    .def(
    "numpy", [](py::object obj, const std::string& format = "") -> py::array {
        auto* m = obj.cast<Mat*>();
        return py::array(to_buffer_info(*m, format), obj);
    },
    py::arg("format") = "", "i for int32, f for float32, d for double")
    // fill(v): only the float overload of Mat::fill is exposed; the int overload is kept
    // commented out.
    //.def("fill", (void (Mat::*)(int))(&Mat::fill), py::arg("v"))
    .def("fill", (void (Mat::*)(float))(&Mat::fill), py::arg("v"))
    // clone(allocator=None) / clone_from(mat, allocator=None): bound directly to the
    // Mat member functions of the same name.
    .def("clone", &Mat::clone, py::arg("allocator") = nullptr)
    .def("clone_from", &Mat::clone_from, py::arg("mat"), py::arg("allocator") = nullptr)
    // reshape(shape, *, allocator=None)
    // shape is a tuple of 1 to 4 ints, forwarded in tuple order to the Mat::reshape overload
    // of the matching arity; the Mat returned by that overload is handed back to Python.
    // Any other tuple length is rejected.
    .def(
    "reshape", [](Mat& mat, py::tuple shape, Allocator* allocator) -> Mat {
        const size_t rank = shape.size();
        if (rank == 1)
        {
            return mat.reshape(shape[0].cast<int>(), allocator);
        }
        if (rank == 2)
        {
            return mat.reshape(shape[0].cast<int>(), shape[1].cast<int>(), allocator);
        }
        if (rank == 3)
        {
            return mat.reshape(shape[0].cast<int>(), shape[1].cast<int>(), shape[2].cast<int>(), allocator);
        }
        if (rank == 4)
        {
            return mat.reshape(shape[0].cast<int>(), shape[1].cast<int>(), shape[2].cast<int>(), shape[3].cast<int>(), allocator);
        }

        std::stringstream ss;
        ss << "shape must be 1, 2, 3 or 4 dims, not " << rank;
        pybind11::pybind11_fail(ss.str());
        // not reached: pybind11_fail throws
        return Mat();
    },
    py::arg("shape"), py::kw_only(), py::arg("allocator") = nullptr)
    // reshape(w[, h[, c]] / w, h, d, c, *, allocator=None): fixed-arity overloads bound
    // directly to the const Mat::reshape member functions. allocator is keyword-only
    // because it is declared after py::kw_only().
    .def("reshape", (Mat(Mat::*)(int, Allocator*) const) & Mat::reshape, py::arg("w"), py::kw_only(), py::arg("allocator") = nullptr)
    .def("reshape", (Mat(Mat::*)(int, int, Allocator*) const) & Mat::reshape, py::arg("w"), py::arg("h"), py::kw_only(), py::arg("allocator") = nullptr)
    .def("reshape", (Mat(Mat::*)(int, int, int, Allocator*) const) & Mat::reshape, py::arg("w"), py::arg("h"), py::arg("c"), py::kw_only(), py::arg("allocator") = nullptr)
    .def("reshape", (Mat(Mat::*)(int, int, int, int, Allocator*) const) & Mat::reshape, py::arg("w"), py::arg("h"), py::arg("d"), py::arg("c"), py::kw_only(), py::arg("allocator") = nullptr)

    // create(shape, *, elemsize=4, elempack=1, allocator=None)
    // shape is a tuple of 1 to 4 ints, forwarded in tuple order to the Mat::create overload
    // of the matching arity. Returns None. Any other tuple length is rejected.
    .def(
    "create", [](Mat& mat, py::tuple shape, size_t elemsize, int elempack, Allocator* allocator) {
        const size_t rank = shape.size();
        if (rank == 1)
        {
            mat.create(shape[0].cast<int>(), elemsize, elempack, allocator);
        }
        else if (rank == 2)
        {
            mat.create(shape[0].cast<int>(), shape[1].cast<int>(), elemsize, elempack, allocator);
        }
        else if (rank == 3)
        {
            mat.create(shape[0].cast<int>(), shape[1].cast<int>(), shape[2].cast<int>(), elemsize, elempack, allocator);
        }
        else if (rank == 4)
        {
            mat.create(shape[0].cast<int>(), shape[1].cast<int>(), shape[2].cast<int>(), shape[3].cast<int>(), elemsize, elempack, allocator);
        }
        else
        {
            std::stringstream ss;
            ss << "shape must be 1, 2, 3 or 4 dims, not " << rank;
            pybind11::pybind11_fail(ss.str());
        }
    },
    py::arg("shape"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr)
    // create(w[, h[, c]] / w, h, d, c, *, elemsize=4, elempack=1, allocator=None):
    // fixed-arity overloads bound directly to the Mat::create member functions; everything
    // after py::kw_only() is keyword-only.
    .def("create", (void (Mat::*)(int, size_t, int, Allocator*)) & Mat::create, py::arg("w"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr)
    .def("create", (void (Mat::*)(int, int, size_t, int, Allocator*)) & Mat::create, py::arg("w"), py::arg("h"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr)
    .def("create", (void (Mat::*)(int, int, int, size_t, int, Allocator*)) & Mat::create, py::arg("w"), py::arg("h"), py::arg("c"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr)
    .def("create", (void (Mat::*)(int, int, int, int, size_t, int, Allocator*)) & Mat::create, py::arg("w"), py::arg("h"), py::arg("d"), py::arg("c"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr)
    // create_like(m, allocator=None): only the (const Mat&, Allocator*) overload is bound
    .def("create_like", (void (Mat::*)(const Mat&, Allocator*)) & Mat::create_like, py::arg("m"), py::arg("allocator") = nullptr)
    // argument-less member functions, exposed under their C++ names
    .def("addref", &Mat::addref)
    .def("release", &Mat::release)
    .def("empty", &Mat::empty)
    .def("total", &Mat::total)
    .def("elembits", &Mat::elembits)
    .def("shape", &Mat::shape)
    // channel(c) / depth(z): the non-const overloads are bound; the const overloads are
    // kept below as commented-out alternatives.
    .def("channel", (Mat(Mat::*)(int)) & Mat::channel, py::arg("c"))
    //.def("channel", (const Mat (Mat::*)(int) const) & Mat::channel, py::arg("c"))
    .def("depth", (Mat(Mat::*)(int)) & Mat::depth, py::arg("z"))
    //.def("depth", (const Mat (Mat::*)(int) const) & Mat::depth, py::arg("z"))
    // row(y): memoryview over row y of the Mat, m.w elements long.
    //   * only elempack 1 is supported
    //   * elemsize 1 yields an int8 view, elemsize 4 a float view; other sizes are rejected
    //   * y must satisfy 0 <= y < m.h, otherwise IndexError is raised (the view would
    //     otherwise point outside the rows described by the Mat)
    // The view refers to the Mat's memory; nothing is copied.
    .def(
    "row", [](Mat& m, int y) -> py::memoryview {
        if (m.elempack != 1)
        {
            std::stringstream ss;
            ss << "get ncnn.Mat row only elempack 1 support now, but given " << m.elempack;
            pybind11::pybind11_fail(ss.str());
        }

        if (y < 0 || y >= m.h)
        {
            std::stringstream ss;
            ss << "ncnn.Mat row index " << y << " out of range, h is " << m.h;
            throw py::index_error(ss.str());
        }

        switch (m.elemsize)
        {
        case 1:
            return py::memoryview::from_buffer(m.row<int8_t>(y), {m.w}, {sizeof(int8_t)});
        //case 2:
        //    return py::memoryview::from_buffer(m.row<short>(y), {m.w}, {sizeof(short)});
        case 4:
            return py::memoryview::from_buffer(m.row<float>(y), {m.w}, {sizeof(float)});
        default:
            std::stringstream ss;
            ss << "ncnn.Mat row elemsize " << m.elemsize << " not support now";
            pybind11::pybind11_fail(ss.str());
        }
        // not reached: pybind11_fail throws; kept so that every path returns a value
        return py::memoryview::from_buffer(m.row<float>(y), {m.w}, {sizeof(float)});
    },
    py::arg("y"))
    // Range accessors. Each takes a start position and a count and is bound to the
    // non-const Mat member function of the same name; the const overloads are kept below
    // as commented-out alternatives.
    // channel_range(c, channels)
    .def("channel_range", (Mat(Mat::*)(int, int)) & Mat::channel_range, py::arg("c"), py::arg("channels"))
    //.def("channel_range", (const Mat (Mat::*)(int, int) const) & Mat::channel_range, py::arg("c"), py::arg("channels"))
    // depth_range(z, depths)
    .def("depth_range", (Mat(Mat::*)(int, int)) & Mat::depth_range, py::arg("z"), py::arg("depths"))
    //.def("depth_range", (const Mat (Mat::*)(int, int) const) & Mat::depth_range, py::arg("z"), py::arg("depths"))
    // row_range(y, rows)
    .def("row_range", (Mat(Mat::*)(int, int)) & Mat::row_range, py::arg("y"), py::arg("rows"))
    //.def("row_range", (const Mat (Mat::*)(int, int) const) & Mat::row_range, py::arg("y"), py::arg("rows"))
    // range(x, n)
    .def("range", (Mat(Mat::*)(int, int)) & Mat::range, py::arg("x"), py::arg("n"))
    //.def("range", (const Mat (Mat::*)(int, int) const) & Mat::range, py::arg("x"), py::arg("n"))
    // __getitem__(i) / __setitem__(i, v): flat element access through Mat::operator[].
    //
    // Both raise IndexError once i reaches the number of floats that fit into the
    // cstep * c * elemsize bytes described by the Mat. Without that, Python's fallback
    // iteration protocol (`for x in mat`, `list(mat)`), which stops only on IndexError,
    // would run past the end of the data.
    // NOTE(review): Mat::operator[] is declared outside this section; the bound assumes it
    // indexes the data as a flat float array -- confirm.
    .def(
    "__getitem__", [](const Mat& m, size_t i) -> float {
        const size_t float_count = m.cstep * (size_t)m.c * m.elemsize / sizeof(float);
        if (i >= float_count)
        {
            throw py::index_error("ncnn.Mat index out of range");
        }
        return m[i];
    },
    py::arg("i"))
    .def(
    "__setitem__", [](Mat& m, size_t i, float v) {
        const size_t float_count = m.cstep * (size_t)m.c * m.elemsize / sizeof(float);
        if (i >= float_count)
        {
            throw py::index_error("ncnn.Mat index out of range");
        }
        m[i] = v;
    },
    py::arg("i"), py::arg("v"))
    // __len__: reports the width (m.w) only, independent of the other dimensions
    .def("__len__", [](Mat& m) {
        return m.w;
    })

    // Convenience constructors from pixel data.
    // Every from_pixels* binding below requests the buffer of `array` and forwards its start
    // address, reinterpreted as const unsigned char*, together with the remaining arguments
    // to the static Mat function of the same name. The buffer's size is not checked here.

    // from_pixels(array, type, w, h, allocator=None)
    .def_static(
    "from_pixels", [](py::buffer const b, int type, int w, int h, Allocator* allocator) {
        py::buffer_info info = b.request();
        const unsigned char* pixels = static_cast<const unsigned char*>(info.ptr);
        return Mat::from_pixels(pixels, type, w, h, allocator);
    },
    py::arg("array"), py::arg("type"), py::arg("w"), py::arg("h"), py::arg("allocator") = nullptr)
    // from_pixels(array, type, w, h, stride, allocator=None)
    .def_static(
    "from_pixels", [](py::buffer const b, int type, int w, int h, int stride, Allocator* allocator) {
        py::buffer_info info = b.request();
        const unsigned char* pixels = static_cast<const unsigned char*>(info.ptr);
        return Mat::from_pixels(pixels, type, w, h, stride, allocator);
    },
    py::arg("array"), py::arg("type"), py::arg("w"), py::arg("h"), py::arg("stride"), py::arg("allocator") = nullptr)
    // from_pixels_resize(array, type, w, h, target_width, target_height, allocator=None)
    // Forwards the start address of the buffer exported by `array` to Mat::from_pixels_resize.
    .def_static(
    "from_pixels_resize", [](py::buffer const b, int type, int w, int h, int target_width, int target_height, Allocator* allocator) {
        py::buffer_info info = b.request();
        const unsigned char* pixels = static_cast<const unsigned char*>(info.ptr);
        return Mat::from_pixels_resize(pixels, type, w, h, target_width, target_height, allocator);
    },
    py::arg("array"), py::arg("type"), py::arg("w"), py::arg("h"), py::arg("target_width"), py::arg("target_height"), py::arg("allocator") = nullptr)
    // from_pixels_resize(array, type, w, h, stride, target_width, target_height, allocator=None)
    .def_static(
    "from_pixels_resize", [](py::buffer const b, int type, int w, int h, int stride, int target_width, int target_height, Allocator* allocator) {
        py::buffer_info info = b.request();
        const unsigned char* pixels = static_cast<const unsigned char*>(info.ptr);
        return Mat::from_pixels_resize(pixels, type, w, h, stride, target_width, target_height, allocator);
    },
    py::arg("array"), py::arg("type"), py::arg("w"), py::arg("h"), py::arg("stride"), py::arg("target_width"), py::arg("target_height"), py::arg("allocator") = nullptr)
    // from_pixels_roi(array, type, w, h, roix, roiy, roiw, roih, allocator=None)
    // Forwards the start address of the buffer exported by `array` to Mat::from_pixels_roi.
    .def_static(
    "from_pixels_roi", [](py::buffer const b, int type, int w, int h, int roix, int roiy, int roiw, int roih, Allocator* allocator) {
        py::buffer_info info = b.request();
        const unsigned char* pixels = static_cast<const unsigned char*>(info.ptr);
        return Mat::from_pixels_roi(pixels, type, w, h, roix, roiy, roiw, roih, allocator);
    },
    py::arg("array"), py::arg("type"), py::arg("w"), py::arg("h"), py::arg("roix"), py::arg("roiy"), py::arg("roiw"), py::arg("roih"), py::arg("allocator") = nullptr)
    // from_pixels_roi(array, type, w, h, stride, roix, roiy, roiw, roih, allocator=None)
    .def_static(
    "from_pixels_roi", [](py::buffer const b, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, Allocator* allocator) {
        py::buffer_info info = b.request();
        const unsigned char* pixels = static_cast<const unsigned char*>(info.ptr);
        return Mat::from_pixels_roi(pixels, type, w, h, stride, roix, roiy, roiw, roih, allocator);
    },
    py::arg("array"), py::arg("type"), py::arg("w"), py::arg("h"), py::arg("stride"), py::arg("roix"), py::arg("roiy"), py::arg("roiw"), py::arg("roih"), py::arg("allocator") = nullptr)
    // from_pixels_roi_resize(array, type, w, h, roix, roiy, roiw, roih,
    //                        target_width, target_height, allocator=None)
    // Forwards the start address of the buffer exported by `array` to
    // Mat::from_pixels_roi_resize.
    .def_static(
    "from_pixels_roi_resize", [](py::buffer const b, int type, int w, int h, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator) {
        py::buffer_info info = b.request();
        const unsigned char* pixels = static_cast<const unsigned char*>(info.ptr);
        return Mat::from_pixels_roi_resize(pixels, type, w, h, roix, roiy, roiw, roih, target_width, target_height, allocator);
    },
    py::arg("array"), py::arg("type"), py::arg("w"), py::arg("h"), py::arg("roix"), py::arg("roiy"), py::arg("roiw"), py::arg("roih"), py::arg("target_width"), py::arg("target_height"), py::arg("allocator") = nullptr)
    // from_pixels_roi_resize(array, type, w, h, stride, roix, roiy, roiw, roih,
    //                        target_width, target_height, allocator=None)
    .def_static(
    "from_pixels_roi_resize", [](py::buffer const b, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, Allocator* allocator) {
        py::buffer_info info = b.request();
        const unsigned char* pixels = static_cast<const unsigned char*>(info.ptr);
        return Mat::from_pixels_roi_resize(pixels, type, w, h, stride, roix, roiy, roiw, roih, target_width, target_height, allocator);
    },
    py::arg("array"), py::arg("type"), py::arg("w"), py::arg("h"), py::arg("stride"), py::arg("roix"), py::arg("roiy"), py::arg("roiw"), py::arg("roih"), py::arg("target_width"), py::arg("target_height"), py::arg("allocator") = nullptr)
    // substract_mean_normalize(mean, norm)
    // mean and norm are Python sequences of floats. An empty sequence is passed on as a null
    // pointer; a non-empty one is passed as a pointer to its first element.
    // A non-empty sequence must hold at least mat.c values, otherwise ValueError is raised:
    // only a raw pointer reaches Mat::substract_mean_normalize, so a short list could not be
    // detected there. NOTE(review): assumes one value per channel is consumed -- confirm
    // against Mat::substract_mean_normalize.
    .def(
    "substract_mean_normalize", [](Mat& mat, std::vector<float>& mean, std::vector<float>& norm) {
        const size_t channels = mat.c > 0 ? (size_t)mat.c : 0;
        if (!mean.empty() && mean.size() < channels)
        {
            std::stringstream ss;
            ss << "mean must be empty or hold at least " << channels << " values, but given " << mean.size();
            throw py::value_error(ss.str());
        }
        if (!norm.empty() && norm.size() < channels)
        {
            std::stringstream ss;
            ss << "norm must be empty or hold at least " << channels << " values, but given " << norm.size();
            throw py::value_error(ss.str());
        }
        return mat.substract_mean_normalize(mean.empty() ? 0 : &mean[0], norm.empty() ? 0 : &norm[0]);
    },
    py::arg("mean"), py::arg("norm"))
    // Data members of Mat, exposed as read/write attributes under their C++ names.
    // Writing them from Python changes the fields directly and performs no other work.
    .def_readwrite("refcount", &Mat::refcount)
    .def_readwrite("elemsize", &Mat::elemsize)
    .def_readwrite("elempack", &Mat::elempack)
    .def_readwrite("allocator", &Mat::allocator)
    .def_readwrite("dims", &Mat::dims)
    .def_readwrite("w", &Mat::w)
    .def_readwrite("h", &Mat::h)
    .def_readwrite("d", &Mat::d)
    .def_readwrite("c", &Mat::c)
    .def_readwrite("cstep", &Mat::cstep)
    // __repr__: a header with the Mat's geometry and bookkeeping fields, followed by an
    // abbreviated, bracketed dump of the elements for dims 1 to 4.
    //   * elements are printed only for elemsize 1 (shown as int8 values) and elemsize 4
    //     (shown as floats); for any other elemsize the row brackets stay empty
    //   * along every axis only the first and the last max_count / 2 entries are printed
    //     (max_count is 10 for a 1-d Mat and 6 otherwise); the skipped middle part is
    //     marked once with "..."
    // The element/row/plane loops are shared through three local helpers instead of being
    // repeated for every dimensionality and element type.
    .def("__repr__", [](const Mat& m) {
        std::stringstream ss;
        ss << "<ncnn.Mat w=" << m.w << " h=" << m.h << " d=" << m.d << " c=" << m.c << " dims=" << m.dims
           << " cstep=" << m.cstep << " elemsize=" << m.elemsize << " elempack=" << m.elempack << "\n\t"
           << "refcount=" << (m.refcount ? *m.refcount : 0) << " data=0x" << static_cast<const void*>(m.data)
           << " allocator=0x" << static_cast<const void*>(m.allocator) << ">\n";

        const int max_count = m.dims == 1 ? 10 : 6;
        const int edge = max_count / 2;
        const size_t elemsize = m.elemsize;

        // true when entry i of an axis with n entries belongs to the printed head or tail
        auto visible = [edge](int i, int n) -> bool {
            return i < edge || i >= n - edge;
        };

        // writes the comma separated elements of row y of src (without brackets)
        auto write_row = [&ss, &visible, elemsize](const Mat& src, int y) {
            if (elemsize != 1 && elemsize != 4)
            {
                // other element types are not rendered
                return;
            }

            bool dot_printed_w = false;
            for (int i = 0; i < src.w; i++)
            {
                if (visible(i, src.w))
                {
                    if (i > 0)
                    {
                        ss << ", ";
                    }
                    if (elemsize == 1)
                    {
                        // cast so that the value is printed as a number, not as a character
                        ss << static_cast<int>(src.row<int8_t>(y)[i]);
                    }
                    else
                    {
                        ss << src.row<float>(y)[i];
                    }
                }
                else if (!dot_printed_w)
                {
                    dot_printed_w = true;
                    ss << ", ...";
                }
            }
        };

        // writes the bracketed rows 0..rows-1 of plane; every printed row after the first
        // is prefixed with indent, the skipped middle rows are replaced by ellipsis
        auto write_plane = [&ss, &visible, &write_row](const Mat& plane, int rows, const char* indent, const char* ellipsis) {
            bool dot_printed_h = false;
            for (int j = 0; j < rows; j++)
            {
                if (visible(j, rows))
                {
                    if (j > 0)
                    {
                        ss << indent;
                    }
                    ss << "[";
                    write_row(plane, j);
                    ss << "]";
                    if (j < rows - 1)
                    {
                        ss << "\n";
                    }
                }
                else if (!dot_printed_h)
                {
                    dot_printed_h = true;
                    ss << ellipsis;
                }
            }
        };

        if (m.dims == 1)
        {
            ss << "[";
            write_row(m, 0);
            ss << "]";
        }
        else if (m.dims == 2)
        {
            ss << "[";
            write_plane(m, m.h, "", "...\n");
            ss << "]\n";
        }
        else if (m.dims == 3)
        {
            bool dot_printed_c = false;
            ss << "[";
            for (int k = 0; k < m.c; k++)
            {
                if (visible(k, m.c))
                {
                    Mat channel = m.channel(k);
                    if (k > 0)
                    {
                        ss << " ";
                    }
                    ss << "[";
                    write_plane(channel, channel.h, "  ", "  ...\n");
                    ss << "]";
                    if (k < m.c - 1)
                    {
                        ss << "\n\n";
                    }
                }
                else if (!dot_printed_c)
                {
                    dot_printed_c = true;
                    ss << " ...\n";
                }
            } // for k
            ss << "]\n";
        }
        else if (m.dims == 4)
        {
            bool dot_printed_c = false;
            ss << "[";
            for (int k = 0; k < m.c; k++)
            {
                if (visible(k, m.c))
                {
                    Mat channel = m.channel(k);
                    if (k > 0)
                    {
                        ss << " ";
                    }
                    ss << "[";
                    bool dot_printed_d = false;
                    for (int z = 0; z < channel.d; z++)
                    {
                        if (visible(z, channel.d))
                        {
                            if (z > 0)
                            {
                                ss << "  ";
                            }
                            ss << "[";
                            write_plane(channel.depth(z), channel.h, "  ", "  ...\n");
                            ss << "]";
                            if (z < channel.d - 1)
                            {
                                ss << "\n";
                            }
                        }
                        else if (!dot_printed_d)
                        {
                            dot_printed_d = true;
                            ss << " ...\n";
                        }
                    } // for z
                    ss << "]";
                    if (k < m.c - 1)
                    {
                        ss << "\n\n";
                    }
                }
                else if (!dot_printed_c)
                {
                    dot_printed_c = true;
                    ss << " ...\n";
                }
            } // for k
            ss << "]\n";
        }
        return ss.str();
    });

    // Enum "PixelType", registered in the scope of the Mat class object (`mat`), so it is
    // reachable from Python as an attribute of Mat. Every entry mirrors the
    // ncnn::Mat::PixelType enumerator of the same name.
    py::enum_<ncnn::Mat::PixelType>(mat, "PixelType")
    // shift / mask constants
    .value("PIXEL_CONVERT_SHIFT", ncnn::Mat::PixelType::PIXEL_CONVERT_SHIFT)
    .value("PIXEL_FORMAT_MASK", ncnn::Mat::PixelType::PIXEL_FORMAT_MASK)
    .value("PIXEL_CONVERT_MASK", ncnn::Mat::PixelType::PIXEL_CONVERT_MASK)

    // plain pixel formats
    .value("PIXEL_RGB", ncnn::Mat::PixelType::PIXEL_RGB)
    .value("PIXEL_BGR", ncnn::Mat::PixelType::PIXEL_BGR)
    .value("PIXEL_GRAY", ncnn::Mat::PixelType::PIXEL_GRAY)
    .value("PIXEL_RGBA", ncnn::Mat::PixelType::PIXEL_RGBA)
    .value("PIXEL_BGRA", ncnn::Mat::PixelType::PIXEL_BGRA)

    // conversions, named <source>2<destination>: from RGB
    .value("PIXEL_RGB2BGR", ncnn::Mat::PixelType::PIXEL_RGB2BGR)
    .value("PIXEL_RGB2GRAY", ncnn::Mat::PixelType::PIXEL_RGB2GRAY)
    .value("PIXEL_RGB2RGBA", ncnn::Mat::PixelType::PIXEL_RGB2RGBA)
    .value("PIXEL_RGB2BGRA", ncnn::Mat::PixelType::PIXEL_RGB2BGRA)

    // from BGR
    .value("PIXEL_BGR2RGB", ncnn::Mat::PixelType::PIXEL_BGR2RGB)
    .value("PIXEL_BGR2GRAY", ncnn::Mat::PixelType::PIXEL_BGR2GRAY)
    .value("PIXEL_BGR2RGBA", ncnn::Mat::PixelType::PIXEL_BGR2RGBA)
    .value("PIXEL_BGR2BGRA", ncnn::Mat::PixelType::PIXEL_BGR2BGRA)

    // from GRAY
    .value("PIXEL_GRAY2RGB", ncnn::Mat::PixelType::PIXEL_GRAY2RGB)
    .value("PIXEL_GRAY2BGR", ncnn::Mat::PixelType::PIXEL_GRAY2BGR)
    .value("PIXEL_GRAY2RGBA", ncnn::Mat::PixelType::PIXEL_GRAY2RGBA)
    .value("PIXEL_GRAY2BGRA", ncnn::Mat::PixelType::PIXEL_GRAY2BGRA)

    // from RGBA
    .value("PIXEL_RGBA2RGB", ncnn::Mat::PixelType::PIXEL_RGBA2RGB)
    .value("PIXEL_RGBA2BGR", ncnn::Mat::PixelType::PIXEL_RGBA2BGR)
    .value("PIXEL_RGBA2GRAY", ncnn::Mat::PixelType::PIXEL_RGBA2GRAY)
    .value("PIXEL_RGBA2BGRA", ncnn::Mat::PixelType::PIXEL_RGBA2BGRA)

    // from BGRA
    .value("PIXEL_BGRA2RGB", ncnn::Mat::PixelType::PIXEL_BGRA2RGB)
    .value("PIXEL_BGRA2BGR", ncnn::Mat::PixelType::PIXEL_BGRA2BGR)
    .value("PIXEL_BGRA2GRAY", ncnn::Mat::PixelType::PIXEL_BGRA2GRAY)
    .value("PIXEL_BGRA2RGBA", ncnn::Mat::PixelType::PIXEL_BGRA2RGBA);

    // Python class "Extractor". No constructor is bound, so instances cannot be created
    // directly from Python.
    py::class_<Extractor>(m, "Extractor")
    // context-manager protocol: __enter__ hands back the extractor itself, __exit__ calls
    // clear() and ignores the exception arguments it receives
    .def("__enter__", [](Extractor& ex) -> Extractor& { return ex; })
    .def("__exit__", [](Extractor& ex, pybind11::args) {
        ex.clear();
    })
    .def("clear", &Extractor::clear)
    .def("set_light_mode", &Extractor::set_light_mode, py::arg("enable"))
    .def("set_blob_allocator", &Extractor::set_blob_allocator, py::arg("allocator"))
    .def("set_workspace_allocator", &Extractor::set_workspace_allocator, py::arg("allocator"))
    // the name-based overloads only exist when the module is compiled with NCNN_STRING
#if NCNN_STRING
    // input(blob_name, in) -> int
    .def("input", (int (Extractor::*)(const char*, const Mat&)) & Extractor::input, py::arg("blob_name"), py::arg("in"))
    // extract(blob_name, feat, type=0) -> int: writes into the caller supplied Mat
    .def("extract", (int (Extractor::*)(const char*, Mat&, int)) & Extractor::extract, py::arg("blob_name"), py::arg("feat"), py::arg("type") = 0)
    // extract(blob_name, type=0) -> (ret, Mat): extracts into a local Mat and returns the
    // status code together with a clone() of the result
    .def(
    "extract", [](Extractor& ex, const char* blob_name, int type) {
        ncnn::Mat feat;
        int ret = ex.extract(blob_name, feat, type);
        return py::make_tuple(ret, feat.clone());
    },
    py::arg("blob_name"), py::arg("type") = 0)
#endif
    // input(blob_index, in) -> int
    // the arguments are named like those of the sibling overloads, so keyword calls and the
    // generated signature are consistent; positional calls are unaffected
    .def("input", (int (Extractor::*)(int, const Mat&)) & Extractor::input, py::arg("blob_index"), py::arg("in"))
    // extract(blob_index, feat, type=0) -> int: writes into the caller supplied Mat
    .def("extract", (int (Extractor::*)(int, Mat&, int)) & Extractor::extract, py::arg("blob_index"), py::arg("feat"), py::arg("type") = 0)
    // extract(blob_index, type=0) -> (ret, Mat): extracts into a local Mat and returns the
    // status code together with a clone() of the result
    .def(
    "extract", [](Extractor& ex, int blob_index, int type) {
        ncnn::Mat feat;
        int ret = ex.extract(blob_index, feat, type);
        return py::make_tuple(ret, feat.clone());
    },
    py::arg("blob_index"), py::arg("type") = 0);

    py::class_<Layer, PyLayer>(m, "Layer")
    .def(py::init<>())
    .def("load_param", &Layer::load_param, py::arg("pd"))
    .def("load_model", &Layer::load_model, py::arg("mb"))
    .def("create_pipeline", &Layer::create_pipeline, py::arg("opt"))
    .def("destroy_pipeline", &Layer::destroy_pipeline, py::arg("opt"))
    .def_readwrite("one_blob_only", &Layer::one_blob_only)
    .def_readwrite("support_inplace", &Layer::support_inplace)
    .def_readwrite("support_vulkan", &Layer::support_vulkan)
    .def_readwrite("support_packing", &Layer::support_packing)
    .def_readwrite("support_bf16_storage", &Layer::support_bf16_storage)
    .def_readwrite("support_fp16_storage", &Layer::support_fp16_storage)
    .def_readwrite("support_vulkan_packing", &Layer::support_vulkan_packing)
    .def_readwrite("support_any_packing", &Layer::support_any_packing)
    .def_readwrite("support_vulkan_any_packing", &Layer::support_vulkan_any_packing)
    .def("forward", (int (Layer::*)(const std::vector<Mat>&, std::vector<Mat>&, const Option&) const) & Layer::forward,
         py::arg("bottom_blobs"), py::arg("top_blobs"), py::arg("opt"))
    .def("forward", (int (Layer::*)(const Mat&, Mat&, const Option&) const) & Layer::forward,
         py::arg("bottom_blob"), py::arg("top_blob"), py::arg("opt"))
    .def("forward_inplace", (int (Layer::*)(std::vector<Mat>&, const Option&) const) & Layer::forward_inplace,
         py::arg("bottom_top_blobs"), py::arg("opt"))
    .def("forward_inplace", (int (Layer::*)(Mat&, const Option&) const) & Layer::forward_inplace,
         py::arg("bottom_top_blob"), py::arg("opt"))
    .def_readwrite("typeindex", &Layer::typeindex)
#if NCNN_STRING
    .def_readwrite("type", &Layer::type)
    .def_readwrite("name", &Layer::name)
#endif // NCNN_STRING
    .def_readwrite("bottoms", &Layer::bottoms)
    .def_readwrite("tops", &Layer::tops)
    .def_readwrite("bottom_shapes", &Layer::bottom_shapes)
    .def_readwrite("top_shapes", &Layer::top_shapes);

    py::class_<Net>(m, "Net")
    .def(py::init<>())
    .def_readwrite("opt", &Net::opt)
    .def("__enter__", [](Net& net) -> Net& { return net; })
    .def("__exit__", [](Net& net, pybind11::args) {
        net.clear();
    })

#if NCNN_VULKAN
    .def("set_vulkan_device", (void (Net::*)(int)) & Net::set_vulkan_device, py::arg("device_index"))
    .def("set_vulkan_device", (void (Net::*)(const VulkanDevice*)) & Net::set_vulkan_device, py::arg("vkdev"))
    .def("vulkan_device", &Net::vulkan_device, py::return_value_policy::reference_internal)
#endif // NCNN_VULKAN

#if NCNN_STRING
    .def(
    "register_custom_layer", [](Net& net, const char* type, const std::function<ncnn::Layer*()>& creator, const std::function<void(ncnn::Layer*)>& destroyer) {
        if (g_layer_factroy_index == g_layer_factroys.size())
        {
            std::stringstream ss;
            ss << "python version only support " << g_layer_factroys.size() << " custom layers now";
            pybind11::pybind11_fail(ss.str());
        }
        LayerFactory& lf = g_layer_factroys[g_layer_factroy_index++];
        lf.name = type;
        lf.creator = creator;
        lf.destroyer = destroyer;
        return net.register_custom_layer(lf.name.c_str(), lf.creator_func, lf.destroyer_func);
    },
    py::arg("type"), py::arg("creator"), py::arg("destroyer"))
#endif //NCNN_STRING
    .def(
    "register_custom_layer", [](Net& net, int index, const std::function<ncnn::Layer*()>& creator, const std::function<void(ncnn::Layer*)>& destroyer) {
        if (g_layer_factroy_index == g_layer_factroys.size())
        {
            std::stringstream ss;
            ss << "python version only support " << g_layer_factroys.size() << " custom layers now";
            pybind11::pybind11_fail(ss.str());
        }
        LayerFactory& lf = g_layer_factroys[g_layer_factroy_index++];
        lf.index = index;
        lf.creator = creator;
        lf.destroyer = destroyer;
        return net.register_custom_layer(index, lf.creator_func, lf.destroyer_func);
    },
    py::arg("index"), py::arg("creator"), py::arg("destroyer"))
#if NCNN_STRING
    .def("load_param", (int (Net::*)(const DataReader&)) & Net::load_param, py::arg("dr"))
#endif // NCNN_STRING
    .def("load_param_bin", (int (Net::*)(const DataReader&)) & Net::load_param_bin, py::arg("dr"))
    .def("load_model", (int (Net::*)(const DataReader&)) & Net::load_model, py::arg("dr"))

#if NCNN_STDIO
#if NCNN_STRING
#if _WIN32
    .def(
    "load_param", [](Net& self, const std::wstring& path) {
        return self.load_param(path.c_str());
    },
    py::arg("protopath"))
#else
    .def("load_param", (int (Net::*)(const char*)) & Net::load_param, py::arg("protopath"))
#endif
    .def("load_param_mem", (int (Net::*)(const char*)) & Net::load_param_mem, py::arg("mem"))
#endif // NCNN_STRING
#if _WIN32
    .def(
    "load_param_bin", [](Net& self, const std::wstring& path) {
        return self.load_param_bin(path.c_str());
    },
    py::arg("protopath"))
    .def(
    "load_model", [](Net& self, const std::wstring& path) {
        return self.load_model(path.c_str());
    },
    py::arg("modelpath"))
#else
    .def("load_param_bin", (int (Net::*)(const char*)) & Net::load_param_bin, py::arg("protopath"))
    .def("load_model", (int (Net::*)(const char*)) & Net::load_model, py::arg("modelpath"))
#endif
    .def(
    "load_model_mem", [](Net& net, const char* mem) {
        // Load model weights from an in-memory buffer.
        // The buffer is wrapped in a DataReaderFromMemoryCopy and handed to
        // Net::load_model(const DataReader&). The status code of that call is
        // returned to Python so callers can detect a failed load instead of
        // silently continuing with an unloaded network.
        const unsigned char* _mem = (const unsigned char*)mem;
        DataReaderFromMemoryCopy dr(_mem);
        return net.load_model(dr);
    },
    py::arg("mem"))
#endif // NCNN_STDIO

    .def("clear", &Net::clear)
    .def("create_extractor", &Net::create_extractor, py::keep_alive<0, 1>()) //net should be kept alive until retuned ex is freed by gc

    .def("input_indexes", &Net::input_indexes, py::return_value_policy::reference)
    .def("output_indexes", &Net::output_indexes, py::return_value_policy::reference)
#if NCNN_STRING
    .def("input_names", &Net::input_names, py::return_value_policy::reference)
    .def("output_names", &Net::output_names, py::return_value_policy::reference)
#endif // NCNN_STRING

    .def("blobs", &Net::blobs, py::return_value_policy::reference_internal)
    .def("layers", &Net::layers, py::return_value_policy::reference_internal);

    py::enum_<ncnn::BorderType>(m, "BorderType")
    .value("BORDER_CONSTANT", ncnn::BorderType::BORDER_CONSTANT)
    .value("BORDER_REPLICATE", ncnn::BorderType::BORDER_REPLICATE);

    m.def("cpu_support_arm_neon", &cpu_support_arm_neon);
    m.def("cpu_support_arm_vfpv4", &cpu_support_arm_vfpv4);
    m.def("cpu_support_arm_asimdhp", &cpu_support_arm_asimdhp);
    m.def("cpu_support_x86_avx2", &cpu_support_x86_avx2);
    m.def("cpu_support_x86_avx", &cpu_support_x86_avx);
    m.def("get_cpu_count", &get_cpu_count);
    m.def("get_little_cpu_count", &get_little_cpu_count);
    m.def("get_big_cpu_count", &get_big_cpu_count);
    m.def("get_physical_cpu_count", &get_physical_cpu_count);
    m.def("get_physical_little_cpu_count", &get_physical_little_cpu_count);
    m.def("get_physical_big_cpu_count", &get_physical_big_cpu_count);
    m.def("get_cpu_powersave", &get_cpu_powersave);
    m.def("set_cpu_powersave", &set_cpu_powersave, py::arg("powersave"));
    m.def("get_omp_num_threads", &get_omp_num_threads);
    m.def("set_omp_num_threads", &set_omp_num_threads, py::arg("num_threads"));
    m.def("get_omp_dynamic", &get_omp_dynamic);
    m.def("set_omp_dynamic", &set_omp_dynamic, py::arg("dynamic"));
    m.def("get_omp_thread_num", &get_omp_thread_num);
    m.def("get_kmp_blocktime", &get_kmp_blocktime);
    m.def("set_kmp_blocktime", &set_kmp_blocktime, py::arg("time_ms"));

    m.def("copy_make_border", &copy_make_border,
          py::arg("src"), py::arg("dst"),
          py::arg("top"), py::arg("bottom"), py::arg("left"), py::arg("right"),
          py::arg("type"), py::arg("v"), py::arg("opt") = Option());
    m.def(
        "copy_make_border",
    [](const Mat& src, int top, int bottom, int left, int right, int type, float v, const Option& opt) {
        Mat dst;
        copy_make_border(src, dst, top, bottom, left, right, type, v, opt);
        return dst;
    },
    py::arg("src"),
    py::arg("top"), py::arg("bottom"), py::arg("left"), py::arg("right"),
    py::arg("type"), py::arg("v"), py::arg("opt") = Option());

    m.def("copy_make_border_3d", &copy_make_border_3d,
          py::arg("src"), py::arg("dst"),
          py::arg("top"), py::arg("bottom"), py::arg("left"), py::arg("right"), py::arg("front"), py::arg("behind"),
          py::arg("type"), py::arg("v"), py::arg("opt") = Option());
    m.def(
        "copy_make_border_3d",
    [](const Mat& src, int top, int bottom, int left, int right, int front, int behind, int type, float v, const Option& opt) {
        Mat dst;
        copy_make_border_3d(src, dst, top, bottom, left, right, front, behind, type, v, opt);
        return dst;
    },
    py::arg("src"),
    py::arg("top"), py::arg("bottom"), py::arg("left"), py::arg("right"), py::arg("front"), py::arg("behind"),
    py::arg("type"), py::arg("v"), py::arg("opt") = Option());

    m.def("copy_cut_border", &copy_cut_border,
          py::arg("src"), py::arg("dst"),
          py::arg("top"), py::arg("bottom"), py::arg("left"), py::arg("right"),
          py::arg("opt") = Option());
    m.def(
        "copy_cut_border",
    [](const Mat& src, int top, int bottom, int left, int right, const Option& opt) {
        Mat dst;
        copy_cut_border(src, dst, top, bottom, left, right, opt);
        return dst;
    },
    py::arg("src"),
    py::arg("top"), py::arg("bottom"), py::arg("left"), py::arg("right"),
    py::arg("opt") = Option());

    m.def("resize_nearest", &resize_nearest,
          py::arg("src"), py::arg("dst"),
          py::arg("w"), py::arg("h"),
          py::arg("opt") = Option());
    // Value-returning overload of resize_nearest: allocates the destination
    // Mat internally and returns it. The caller-supplied Option is forwarded
    // to the underlying call, matching the resize_bilinear / resize_bicubic
    // wrappers (previously `opt` was accepted but ignored).
    m.def(
        "resize_nearest",
    [](const Mat& src, int w, int h, const Option& opt) {
        Mat dst;
        resize_nearest(src, dst, w, h, opt);
        return dst;
    },
    py::arg("src"),
    py::arg("w"), py::arg("h"),
    py::arg("opt") = Option());

    m.def("resize_bilinear", &resize_bilinear,
          py::arg("src"), py::arg("dst"),
          py::arg("w"), py::arg("h"),
          py::arg("opt") = Option());
    m.def(
        "resize_bilinear",
    [](const Mat& src, int w, int h, const Option& opt) {
        Mat dst;
        resize_bilinear(src, dst, w, h, opt);
        return dst;
    },
    py::arg("src"),
    py::arg("w"), py::arg("h"),
    py::arg("opt") = Option());

    m.def("resize_bicubic", &resize_bicubic,
          py::arg("src"), py::arg("dst"),
          py::arg("w"), py::arg("h"),
          py::arg("opt") = Option());
    m.def(
        "resize_bicubic",
    [](const Mat& src, int w, int h, const Option& opt) {
        Mat dst;
        resize_bicubic(src, dst, w, h, opt);
        return dst;
    },
    py::arg("src"),
    py::arg("w"), py::arg("h"),
    py::arg("opt") = Option());

    m.def("convert_packing", &convert_packing,
          py::arg("src"), py::arg("dst"),
          py::arg("elempack"),
          py::arg("opt") = Option());
    m.def(
        "convert_packing",
    [](const Mat& src, int elempack, const Option& opt) {
        Mat dst;
        convert_packing(src, dst, elempack, opt);
        return dst;
    },
    py::arg("src"),
    py::arg("elempack"),
    py::arg("opt") = Option());

    m.def("flatten", &flatten,
          py::arg("src"), py::arg("dst"),
          py::arg("opt") = Option());
    m.def(
        "flatten",
    [](const Mat& src, const Option& opt) {
        Mat dst;
        flatten(src, dst, opt);
        return dst;
    },
    py::arg("src"),
    py::arg("opt") = Option());

    m.def("cast_float32_to_float16", &cast_float32_to_float16,
          py::arg("src"), py::arg("dst"),
          py::arg("opt") = Option());
    m.def(
        "cast_float32_to_float16",
    [](const Mat& src, const Option& opt) {
        Mat dst;
        cast_float32_to_float16(src, dst, opt);
        return dst;
    },
    py::arg("src"),
    py::arg("opt") = Option());

    m.def("cast_float16_to_float32", &cast_float16_to_float32,
          py::arg("src"), py::arg("dst"),
          py::arg("opt") = Option());
    m.def(
        "cast_float16_to_float32",
    [](const Mat& src, const Option& opt) {
        Mat dst;
        cast_float16_to_float32(src, dst, opt);
        return dst;
    },
    py::arg("src"),
    py::arg("opt") = Option());

    m.def("cast_int8_to_float32", &cast_int8_to_float32,
          py::arg("src"), py::arg("dst"),
          py::arg("opt") = Option());
    m.def(
        "cast_int8_to_float32",
    [](const Mat& src, const Option& opt) {
        Mat dst;
        cast_int8_to_float32(src, dst, opt);
        return dst;
    },
    py::arg("src"),
    py::arg("opt") = Option());

    m.def("cast_float32_to_bfloat16", &cast_float32_to_bfloat16,
          py::arg("src"), py::arg("dst"),
          py::arg("opt") = Option());
    m.def(
        "cast_float32_to_bfloat16",
    [](const Mat& src, const Option& opt) {
        Mat dst;
        cast_float32_to_bfloat16(src, dst, opt);
        return dst;
    },
    py::arg("src"),
    py::arg("opt") = Option());

    m.def("cast_bfloat16_to_float32", &cast_bfloat16_to_float32,
          py::arg("src"), py::arg("dst"),
          py::arg("opt") = Option());
    m.def(
        "cast_bfloat16_to_float32",
    [](const Mat& src, const Option& opt) {
        Mat dst;
        cast_bfloat16_to_float32(src, dst, opt);
        return dst;
    },
    py::arg("src"),
    py::arg("opt") = Option());

    m.def("quantize_to_int8", &quantize_to_int8,
          py::arg("src"), py::arg("dst"),
          py::arg("scale_data"),
          py::arg("opt") = Option());
    m.def(
        "quantize_to_int8",
    [](const Mat& src, const Mat& scale_data, const Option& opt) {
        Mat dst;
        quantize_to_int8(src, dst, scale_data, opt);
        return dst;
    },
    py::arg("src"),
    py::arg("scale_data"),
    py::arg("opt") = Option());

#if NCNN_STRING
    m.def("layer_to_index", &layer_to_index, py::arg("type"));
    m.def(
        "create_layer",
    [](const char* type) {
        return static_cast<Layer*>(create_layer(type));
    },
    py::arg("type"));
    m.def(
        "create_layer",
    [](int index) {
        return static_cast<Layer*>(create_layer(index));
    },
    py::arg("index"));
#endif //NCNN_STRING

#if NCNN_VULKAN
    m.def("create_gpu_instance", &create_gpu_instance, py::arg("driver_path") = ((const char*)0));
    m.def("destroy_gpu_instance", &destroy_gpu_instance);
    m.def("get_gpu_count", &get_gpu_count);
    m.def("get_default_gpu_index", &get_default_gpu_index);
    m.def("get_gpu_info", &get_gpu_info, py::arg("device_index") = 0, py::return_value_policy::reference);
    m.def("get_gpu_device", &get_gpu_device, py::arg("device_index") = 0, py::return_value_policy::reference);

    py::class_<VkBufferMemory>(m, "VkBufferMemory")
    .def_readwrite("offset", &VkBufferMemory::offset)
    .def_readwrite("capacity", &VkBufferMemory::capacity)
    .def_readwrite("refcount", &VkBufferMemory::refcount);

    py::class_<VkImageMemory>(m, "VkImageMemory")
    .def_readwrite("width", &VkImageMemory::width)
    .def_readwrite("height", &VkImageMemory::height)
    .def_readwrite("depth", &VkImageMemory::depth)
    .def_readwrite("refcount", &VkImageMemory::refcount);

    py::class_<VkAllocator, PyVkAllocator<> >(m, "VkAllocator")
    .def_readonly("vkdev", &VkAllocator::vkdev)
    .def_readwrite("buffer_memory_type_index", &VkAllocator::buffer_memory_type_index)
    .def_readwrite("image_memory_type_index", &VkAllocator::image_memory_type_index)
    .def_readwrite("mappable", &VkAllocator::mappable)
    .def_readwrite("coherent", &VkAllocator::coherent);

    py::class_<VkBlobAllocator, VkAllocator, PyVkAllocatorOther<VkBlobAllocator> >(m, "VkBlobAllocator")
    .def(py::init<const VulkanDevice*>())
    .def("clear", &VkBlobAllocator::clear)
    .def("fastMalloc", (VkBufferMemory * (VkBlobAllocator::*)(size_t size)) & VkBlobAllocator::fastMalloc, py::return_value_policy::reference_internal)
    .def("fastFree", (void (VkBlobAllocator::*)(VkBufferMemory * ptr)) & VkBlobAllocator::fastFree)
    .def("fastMalloc", (VkImageMemory * (VkBlobAllocator::*)(int, int, int, size_t, int)) & VkBlobAllocator::fastMalloc, py::return_value_policy::reference_internal)
    .def("fastFree", (void (VkBlobAllocator::*)(VkImageMemory * ptr)) & VkBlobAllocator::fastFree);

    py::class_<VkWeightAllocator, VkAllocator, PyVkAllocatorOther<VkWeightAllocator> >(m, "VkWeightAllocator")
    .def(py::init<const VulkanDevice*>())
    .def("clear", &VkWeightAllocator::clear)
    .def("fastMalloc", (VkBufferMemory * (VkWeightAllocator::*)(size_t size)) & VkWeightAllocator::fastMalloc, py::return_value_policy::reference_internal)
    .def("fastFree", (void (VkWeightAllocator::*)(VkBufferMemory * ptr)) & VkWeightAllocator::fastFree)
    .def("fastMalloc", (VkImageMemory * (VkWeightAllocator::*)(int, int, int, size_t, int)) & VkWeightAllocator::fastMalloc, py::return_value_policy::reference_internal)
    .def("fastFree", (void (VkWeightAllocator::*)(VkImageMemory * ptr)) & VkWeightAllocator::fastFree);

    py::class_<VkStagingAllocator, VkAllocator, PyVkAllocatorOther<VkStagingAllocator> >(m, "VkStagingAllocator")
    .def(py::init<const VulkanDevice*>())
    .def("set_size_compare_ratio", &VkStagingAllocator::set_size_compare_ratio)
    .def("clear", &VkStagingAllocator::clear)
    .def("fastMalloc", (VkBufferMemory * (VkStagingAllocator::*)(size_t size)) & VkStagingAllocator::fastMalloc, py::return_value_policy::reference_internal)
    .def("fastFree", (void (VkStagingAllocator::*)(VkBufferMemory * ptr)) & VkStagingAllocator::fastFree)
    .def("fastMalloc", (VkImageMemory * (VkStagingAllocator::*)(int, int, int, size_t, int)) & VkStagingAllocator::fastMalloc, py::return_value_policy::reference_internal)
    .def("fastFree", (void (VkStagingAllocator::*)(VkImageMemory * ptr)) & VkStagingAllocator::fastFree);

    py::class_<VkWeightStagingAllocator, VkAllocator, PyVkAllocatorOther<VkWeightStagingAllocator> >(m, "VkWeightStagingAllocator")
    .def(py::init<const VulkanDevice*>())
    .def("fastMalloc", (VkBufferMemory * (VkWeightStagingAllocator::*)(size_t size)) & VkWeightStagingAllocator::fastMalloc, py::return_value_policy::reference_internal)
    .def("fastFree", (void (VkWeightStagingAllocator::*)(VkBufferMemory * ptr)) & VkWeightStagingAllocator::fastFree)
    .def("fastMalloc", (VkImageMemory * (VkWeightStagingAllocator::*)(int, int, int, size_t, int)) & VkWeightStagingAllocator::fastMalloc, py::return_value_policy::reference_internal)
    .def("fastFree", (void (VkWeightStagingAllocator::*)(VkImageMemory * ptr)) & VkWeightStagingAllocator::fastFree);

    py::class_<GpuInfo>(m, "GpuInfo")
    .def(py::init<>())
    .def("api_version", &GpuInfo::api_version)
    .def("driver_version", &GpuInfo::driver_version)
    .def("vendor_id", &GpuInfo::vendor_id)
    .def("device_id", &GpuInfo::device_id)
    .def("pipeline_cache_uuid", [](GpuInfo& gpuinfo) {
        // Expose the UUID as a 1-D memoryview of VK_UUID_SIZE bytes.
        // Strides are expressed in bytes between adjacent elements, so a
        // contiguous byte array needs a stride of sizeof(uint8_t); using
        // VK_UUID_SIZE bytes as the stride would index past the buffer.
        return py::memoryview::from_buffer(gpuinfo.pipeline_cache_uuid(), {VK_UUID_SIZE}, {sizeof(uint8_t)});
    })
    .def("type", &GpuInfo::type)
    .def("device_name", &GpuInfo::device_name);

    py::class_<VulkanDevice>(m, "VulkanDevice")
    .def(py::init<int>(), py::arg("device_index") = 0)
    .def(
    "info", [](VulkanDevice& dev) {
        return &dev.info;
    },
    py::return_value_policy::reference_internal)
    .def("acquire_blob_allocator", &VulkanDevice::acquire_blob_allocator)
    .def("reclaim_blob_allocator", &VulkanDevice::reclaim_blob_allocator, py::arg("vkallocator"))
    .def("acquire_staging_allocator", &VulkanDevice::acquire_staging_allocator)
    .def("reclaim_staging_allocator", &VulkanDevice::reclaim_staging_allocator, py::arg("vkallocator"))
    .def("get_heap_budget", &VulkanDevice::get_heap_budget);
#endif // NCNN_VULKAN

    m.doc() = R"pbdoc(
        ncnn python wrapper
        -----------------------
        .. currentmodule:: pyncnn
        .. autosummary::
           :toctree: _generate
    )pbdoc";

#ifdef VERSION_INFO
    m.attr("__version__") = VERSION_INFO;
#else
    m.attr("__version__") = "dev";
#endif
}


================================================
FILE: python/src/pybind11_allocator.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef PYBIND11_NCNN_ALLOCATOR_H
#define PYBIND11_NCNN_ALLOCATOR_H

#include <allocator.h>

// pybind11 trampoline for allocator classes (default: ncnn::Allocator).
// Each virtual hook is forwarded to a Python override of the same name.
// Both hooks use PYBIND11_OVERRIDE_PURE, i.e. there is no C++ fallback here:
// if the Python subclass does not implement the method, pybind11 reports a
// "pure virtual function" error at call time.
template<class Base = ncnn::Allocator>
class PyAllocator : public Base
{
public:
    using Base::Base; // Inherit constructors
    // Forward fastMalloc(size) to the Python override.
    void* fastMalloc(size_t size) override
    {
        PYBIND11_OVERRIDE_PURE(void*, Base, fastMalloc, size);
    }
    // Forward fastFree(ptr) to the Python override.
    void fastFree(void* ptr) override
    {
        PYBIND11_OVERRIDE_PURE(void, Base, fastFree, ptr);
    }
};

// Trampoline for allocator classes derived from the base allocator.
// Unlike PyAllocator, the hooks use PYBIND11_OVERRIDE (non-pure): when no
// Python override exists, the call falls back to Other's C++ implementation.
template<class Other>
class PyAllocatorOther : public PyAllocator<Other>
{
public:
    using PyAllocator<Other>::PyAllocator;
    // Python override if present, otherwise Other::fastMalloc(size).
    void* fastMalloc(size_t size) override
    {
        PYBIND11_OVERRIDE(void*, Other, fastMalloc, size);
    }
    // Python override if present, otherwise Other::fastFree(ptr).
    void fastFree(void* ptr) override
    {
        PYBIND11_OVERRIDE(void, Other, fastFree, ptr);
    }
};

#if NCNN_VULKAN
// pybind11 trampoline for Vulkan allocator classes (default: ncnn::VkAllocator).
// clear/flush/invalidate fall back to Base's C++ implementation when Python
// does not override them (PYBIND11_OVERRIDE); the buffer fastMalloc/fastFree
// hooks are treated as pure (PYBIND11_OVERRIDE_PURE) and must be supplied by
// the Python subclass.
template<class Base = ncnn::VkAllocator>
class PyVkAllocator : public Base
{
public:
    using Base::Base; // Inherit constructors
    // Python override if present, otherwise Base::clear().
    void clear() override
    {
        PYBIND11_OVERRIDE(void, Base, clear, );
    }
    // Buffer allocation: must be implemented on the Python side.
    ncnn::VkBufferMemory* fastMalloc(size_t size) override
    {
        PYBIND11_OVERRIDE_PURE(ncnn::VkBufferMemory*, Base, fastMalloc, size);
    }
    // Buffer release: must be implemented on the Python side.
    void fastFree(ncnn::VkBufferMemory* ptr) override
    {
        PYBIND11_OVERRIDE_PURE(void, Base, fastFree, ptr);
    }
    // Python override if present, otherwise Base::flush(ptr).
    int flush(ncnn::VkBufferMemory* ptr) override
    {
        PYBIND11_OVERRIDE(int, Base, flush, ptr);
    }
    // Python override if present, otherwise Base::invalidate(ptr).
    int invalidate(ncnn::VkBufferMemory* ptr) override
    {
        PYBIND11_OVERRIDE(int, Base, invalidate, ptr);
    }
};

// Trampoline for concrete Vulkan allocators derived from VkAllocator.
// All hooks use PYBIND11_OVERRIDE, so a missing Python override falls back
// to Other's own C++ implementation instead of raising.
template<class Other>
class PyVkAllocatorOther : public PyVkAllocator<Other>
{
public:
    using PyVkAllocator<Other>::PyVkAllocator;
    // Python override if present, otherwise Other::clear().
    void clear() override
    {
        PYBIND11_OVERRIDE(void, Other, clear, );
    }
    // Python override if present, otherwise Other::fastMalloc(size).
    ncnn::VkBufferMemory* fastMalloc(size_t size) override
    {
        PYBIND11_OVERRIDE(ncnn::VkBufferMemory*, Other, fastMalloc, size);
    }
    // Python override if present, otherwise Other::fastFree(ptr).
    void fastFree(ncnn::VkBufferMemory* ptr) override
    {
        PYBIND11_OVERRIDE(void, Other, fastFree, ptr);
    }
};

// Trampoline template for the image-memory side of a blob allocator
// (default Base: ncnn::VkBlobAllocator). clear() falls back to Base when not
// overridden in Python; the image fastMalloc/fastFree hooks are declared pure
// via PYBIND11_OVERRIDE_PURE.
// NOTE(review): the VkBlobAllocator binding in main.cpp casts the image
// fastMalloc overload to (int, int, int, size_t, int), whereas this class
// declares (int, int, VkFormat). This template may be stale and would fail
// the `override` check if instantiated - confirm against allocator.h.
template<class Base = ncnn::VkBlobAllocator>
class PyVkBlobAllocator : public Base
{
public:
    using Base::Base; // Inherit constructors
    // Python override if present, otherwise Base::clear().
    void clear() override
    {
        PYBIND11_OVERRIDE(void, Base, clear, );
    }
    // Image allocation: forwarded to Python with no C++ fallback.
    ncnn::VkImageMemory* fastMalloc(int width, int height, VkFormat format) override
    {
        PYBIND11_OVERRIDE_PURE(ncnn::VkImageMemory*, Base, fastMalloc, width, height, format);
    }
    // Image release: forwarded to Python with no C++ fallback.
    void fastFree(ncnn::VkImageMemory* ptr) override
    {
        PYBIND11_OVERRIDE_PURE(void, Base, fastFree, ptr);
    }
};

//template<class Other>
//class PyVkImageAllocatorOther : public PyVkImageAllocator<Other>
//{
//public:
//    using PyVkImageAllocator<Other>::PyVkImageAllocator;
//    ncnn::VkImageMemory* fastMalloc(int width, int height,
//                                    VkFormat format) override
//    {
//        PYBIND11_OVERRIDE(ncnn::VkImageMemory*, Other, fastMalloc, width, height, format);
//    }
//    void fastFree(ncnn::VkImageMemory* ptr) override
//    {
//        PYBIND11_OVERRIDE(void, Other, fastFree, ptr);
//    }
//};
#endif // NCNN_VULKAN

#endif


================================================
FILE: python/src/pybind11_bind.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef PYBIND11_NCNN_BIND_H
#define PYBIND11_NCNN_BIND_H

#include <pybind11/functional.h>

///////////////////////////////////////////////////////////////////////////////////////////////////////////////
// virtual function pass by reference by https://github.com/pybind/pybind11/issues/2033
// PYBIND11_OVERRIDE_REFERENCE_IMPL(ret_type, cname, name, ...)
// Acquires the GIL, looks up a Python override called `name` on `this`
// (viewed as `const cname*`) and, if one exists, calls it with the arguments
// converted using return_value_policy::reference, so Python receives
// references to the C++ arguments rather than copies. The Python result is
// then cast to `ret_type` and returned from the enclosing function. When no
// override is found the macro falls through without returning.
#define PYBIND11_OVERRIDE_REFERENCE_IMPL(ret_type, cname, name, ...)                                 \
    do                                                                                               \
    {                                                                                                \
        pybind11::gil_scoped_acquire gil;                                                            \
        pybind11::function override = pybind11::get_override(static_cast<const cname*>(this), name); \
        if (override)                                                                                \
        {                                                                                            \
            auto o = override.operator()<pybind11::return_value_policy::reference>(__VA_ARGS__);     \
            if (pybind11::detail::cast_is_temporary_value_reference<ret_type>::value)                \
            {                                                                                        \
                static pybind11::detail::override_caster_t<ret_type> caster;                         \
                return pybind11::detail::cast_ref<ret_type>(std::move(o), caster);                   \
            }                                                                                        \
            else                                                                                     \
                return pybind11::detail::cast_safe<ret_type>(std::move(o));                          \
        }                                                                                            \
    } while (false)

// PYBIND11_OVERRIDE_REFERENCE_NAME(ret_type, cname, name, fn, ...)
// Tries the Python override registered under the string `name`; if none is
// found, falls back to the C++ implementation cname::fn(...).
#define PYBIND11_OVERRIDE_REFERENCE_NAME(ret_type, cname, name, fn, ...)                                    \
    do                                                                                                      \
    {                                                                                                       \
        PYBIND11_OVERRIDE_REFERENCE_IMPL(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), name, __VA_ARGS__); \
        return cname::fn(__VA_ARGS__);                                                                      \
    } while (false)

// PYBIND11_OVERRIDE_REFERENCE(ret_type, cname, fn, ...)
// Convenience form where the Python method name is the stringified C++
// function name `fn`.
#define PYBIND11_OVERRIDE_REFERENCE(ret_type, cname, fn, ...) \
    PYBIND11_OVERRIDE_REFERENCE_NAME(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), #fn, fn, __VA_ARGS__)
///////////////////////////////////////////////////////////////////////////////////////////////////////////////

#endif


================================================
FILE: python/src/pybind11_datareader.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef PYBIND11_NCNN_DATAREADER_H
#define PYBIND11_NCNN_DATAREADER_H

#include <datareader.h>

// DataReader that is not backed by any real source: scan() ignores its
// arguments and returns 0, and read() zero-fills the destination buffer and
// reports the full requested size as read.
class DataReaderFromEmpty : public ncnn::DataReader
{
public:
#if NCNN_STRING
    // Nothing is parsed; both arguments are unused.
    virtual int scan(const char* /*format*/, void* /*p*/) const
    {
        return 0;
    }
#endif // NCNN_STRING

    // Fill `size` bytes at `buf` with zeros and return `size`.
    virtual size_t read(void* buf, size_t size) const
    {
        const size_t filled = size;
        memset(buf, 0, filled);
        return filled;
    }
};

// pybind11 trampoline for data reader classes (default: ncnn::DataReader).
// Each hook dispatches to a Python override of the same name when one exists
// and otherwise falls back to Base's C++ implementation (PYBIND11_OVERRIDE).
template<class Base = ncnn::DataReader>
class PyDataReader : public Base
{
public:
    using Base::Base; // Inherit constructors
#if NCNN_STRING
    // Python override if present, otherwise Base::scan(format, p).
    int scan(const char* format, void* p) const override
    {
        PYBIND11_OVERRIDE(int, Base, scan, format, p);
    }
#endif // NCNN_STRING
    // Python override if present, otherwise Base::read(buf, size).
    size_t read(void* buf, size_t size) const override
    {
        PYBIND11_OVERRIDE(size_t, Base, read, buf, size);
    }
};

// Trampoline for concrete readers derived from the base data reader.
// A missing Python override falls back to Other's own implementation.
template<class Other>
class PyDataReaderOther : public PyDataReader<Other>
{
public:
    using PyDataReader<Other>::PyDataReader;
#if NCNN_STRING
    // Python override if present, otherwise Other::scan(format, p).
    int scan(const char* format, void* p) const override
    {
        PYBIND11_OVERRIDE(int, Other, scan, format, p);
    }
#endif // NCNN_STRING
    // Python override if present, otherwise Other::read(buf, size).
    size_t read(void* buf, size_t size) const override
    {
        PYBIND11_OVERRIDE(size_t, Other, read, buf, size);
    }
};

#endif


================================================
FILE: python/src/pybind11_layer.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef PYBIND11_NCNN_LAYER_H
#define PYBIND11_NCNN_LAYER_H

#include <layer.h>
#include "pybind11_bind.h"

// ncnn::Layer subclass used by the Python binding. Every virtual method below
// has a body consisting of a single PYBIND11_OVERRIDE_REFERENCE invocation that
// names the return type (int), the parent class (ncnn::Layer), the method name
// and the arguments to forward.
// NOTE(review): PYBIND11_OVERRIDE_REFERENCE is not defined in this header; it
// presumably comes from pybind11_bind.h and dispatches to a Python-side
// override while passing the arguments by reference - confirm there.
class PyLayer : public ncnn::Layer
{
public:
    // Forwards load_param(pd).
    virtual int load_param(const ncnn::ParamDict& pd)
    {
        PYBIND11_OVERRIDE_REFERENCE(
            int,
            ncnn::Layer,
            load_param,
            pd);
    }

    // Forwards load_model(mb).
    virtual int load_model(const ncnn::ModelBin& mb)
    {
        PYBIND11_OVERRIDE_REFERENCE(
            int,
            ncnn::Layer,
            load_model,
            mb);
    }

    // Forwards create_pipeline(opt).
    virtual int create_pipeline(const ncnn::Option& opt)
    {
        PYBIND11_OVERRIDE_REFERENCE(
            int,
            ncnn::Layer,
            create_pipeline,
            opt);
    }

    // Forwards destroy_pipeline(opt).
    virtual int destroy_pipeline(const ncnn::Option& opt)
    {
        PYBIND11_OVERRIDE_REFERENCE(
            int,
            ncnn::Layer,
            destroy_pipeline,
            opt);
    }

public:
    // Forwards the multi-blob forward overload (separate input and output vectors).
    virtual int forward(const std::vector<ncnn::Mat>& bottom_blobs, std::vector<ncnn::Mat>& top_blobs, const ncnn::Option& opt) const
    {
        PYBIND11_OVERRIDE_REFERENCE(
            int,
            ncnn::Layer,
            forward,
            bottom_blobs,
            top_blobs,
            opt);
    }
    // Forwards the single-blob forward overload (one input Mat, one output Mat).
    virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const
    {
        PYBIND11_OVERRIDE_REFERENCE(
            int,
            ncnn::Layer,
            forward,
            bottom_blob,
            top_blob,
            opt);
    }

    // Forwards the multi-blob in-place overload (one vector used as input and output).
    virtual int forward_inplace(std::vector<ncnn::Mat>& bottom_top_blobs, const ncnn::Option& opt) const
    {
        PYBIND11_OVERRIDE_REFERENCE(
            int,
            ncnn::Layer,
            forward_inplace,
            bottom_top_blobs,
            opt);
    }
    // Forwards the single-blob in-place overload (one Mat used as input and output).
    virtual int forward_inplace(ncnn::Mat& bottom_top_blob, const ncnn::Option& opt) const
    {
        PYBIND11_OVERRIDE_REFERENCE(
            int,
            ncnn::Layer,
            forward_inplace,
            bottom_top_blob,
            opt);
    }

    // The remaining overrides exist only when NCNN_VULKAN is enabled; they take
    // VkMat blobs plus a VkTransfer / VkCompute command object.
#if NCNN_VULKAN
public:
    // Forwards upload_model(cmd, opt).
    virtual int upload_model(ncnn::VkTransfer& cmd, const ncnn::Option& opt)
    {
        PYBIND11_OVERRIDE_REFERENCE(
            int,
            ncnn::Layer,
            upload_model,
            cmd,
            opt);
    }

public:
    // Forwards the multi-blob VkMat forward overload.
    virtual int forward(const std::vector<ncnn::VkMat>& bottom_blobs, std::vector<ncnn::VkMat>& top_blobs, ncnn::VkCompute& cmd, const ncnn::Option& opt) const
    {
        PYBIND11_OVERRIDE_REFERENCE(
            int,
            ncnn::Layer,
            forward,
            bottom_blobs,
            top_blobs,
            cmd,
            opt);
    }
    // Forwards the single-blob VkMat forward overload.
    virtual int forward(const ncnn::VkMat& bottom_blob, ncnn::VkMat& top_blob, ncnn::VkCompute& cmd, const ncnn::Option& opt) const
    {
        PYBIND11_OVERRIDE_REFERENCE(
            int,
            ncnn::Layer,
            forward,
            bottom_blob,
            top_blob,
            cmd,
            opt);
    }

    // Forwards the multi-blob VkMat in-place overload.
    virtual int forward_inplace(std::vector<ncnn::VkMat>& bottom_top_blobs, ncnn::VkCompute& cmd, const ncnn::Option& opt) const
    {
        PYBIND11_OVERRIDE_REFERENCE(
            int,
            ncnn::Layer,
            forward_inplace,
            bottom_top_blobs,
            cmd,
            opt);
    }
    // Forwards the single-blob VkMat in-place overload.
    virtual int forward_inplace(ncnn::VkMat& bottom_top_blob, ncnn::VkCompute& cmd, const ncnn::Option& opt) const
    {
        PYBIND11_OVERRIDE_REFERENCE(
            int,
            ncnn::Layer,
            forward_inplace,
            bottom_top_blob,
            cmd,
            opt);
    }
#endif // NCNN_VULKAN
};

#endif


================================================
FILE: python/src/pybind11_mat.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef PYBIND11_NCNN_MAT_H
#define PYBIND11_NCNN_MAT_H

#include <string>

#include <pybind11/pybind11.h>

#include <mat.h>

namespace py = pybind11;

// Map the element size of an ncnn::Mat to a Python struct-style format string:
//   4 bytes -> the pybind11 format descriptor for float
//   2 bytes -> "e" (half-precision float)
//   1 byte  -> the pybind11 format descriptor for int8_t
// Any other element size yields an empty string.
//
// Declared inline because the definition lives in a header: without it, every
// translation unit including this header would emit its own external
// definition and the link would fail with duplicate symbols.
inline std::string get_mat_format(const ncnn::Mat& m)
{
    switch (m.elemsize)
    {
    case 4:
        return pybind11::format_descriptor<float>::format();
    case 2:
        // see https://docs.python.org/3/library/struct.html#format-characters
        return "e";
    case 1:
        return pybind11::format_descriptor<int8_t>::format();
    default:
        return std::string();
    }
}

// Describe the memory of an ncnn::Mat as a pybind11 buffer. The returned
// buffer_info points at m.data; no copy is made here.
//
// possible values for format:
// i (int32_t)
// f (float)
// d (double)
// leave it to empty to use get_mat_format
//
// The shape is listed outermost axis first: (w), (h, w), (c, h, w) or
// (c, d, h, w). The channel stride uses m.cstep, the others are derived from
// w, h and elemsize.
//
// Reports an error through py::pybind11_fail when m.elemsize is not 1, 2 or 4,
// or when m.elempack is not 1.
//
// Declared inline because the definition lives in a header: without it, every
// translation unit including this header would emit its own external
// definition and the link would fail with duplicate symbols.
inline py::buffer_info to_buffer_info(ncnn::Mat& m, const std::string& format = "")
{
    if (m.elemsize != 1 && m.elemsize != 2 && m.elemsize != 4)
    {
        std::ostringstream ss;
        ss << "Convert ncnn.Mat to numpy.ndarray. Support only elemsize 1, 2, 4; but given "
           << m.elemsize;
        py::pybind11_fail(ss.str());
    }
    if (m.elempack != 1)
    {
        std::ostringstream ss;
        ss << "Convert ncnn.Mat to numpy.ndarray. Support only elempack == 1, but given "
           << m.elempack;
        py::pybind11_fail(ss.str());
    }

    // An explicit format wins; otherwise fall back to the elemsize-based default.
    const std::string _format = format.empty() ? get_mat_format(m) : format;

    // Compute strides in py::ssize_t from the start: the previous form
    // `m.w * m.h * m.elemsize` multiplied w * h as int before widening.
    const py::ssize_t elem_stride = static_cast<py::ssize_t>(m.elemsize);
    const py::ssize_t row_stride = static_cast<py::ssize_t>(m.w) * elem_stride;
    const py::ssize_t plane_stride = row_stride * m.h;
    const py::ssize_t channel_stride = static_cast<py::ssize_t>(m.cstep) * elem_stride;

    std::vector<py::ssize_t> shape;
    std::vector<py::ssize_t> strides;
    switch (m.dims)
    {
    case 1:
        shape.push_back(m.w);
        strides.push_back(elem_stride);
        break;
    case 2:
        shape.push_back(m.h);
        shape.push_back(m.w);
        strides.push_back(row_stride);
        strides.push_back(elem_stride);
        break;
    case 3:
        shape.push_back(m.c);
        shape.push_back(m.h);
        shape.push_back(m.w);
        strides.push_back(channel_stride);
        strides.push_back(row_stride);
        strides.push_back(elem_stride);
        break;
    case 4:
        shape.push_back(m.c);
        shape.push_back(m.d);
        shape.push_back(m.h);
        shape.push_back(m.w);
        strides.push_back(channel_stride);
        strides.push_back(plane_stride);
        strides.push_back(row_stride);
        strides.push_back(elem_stride);
        break;
    default:
        // Any other dims value leaves shape and strides empty, as before.
        break;
    }

    return py::buffer_info(m.data,     /* Pointer to buffer */
                           m.elemsize, /* Size of one scalar */
                           _format,    /* Python struct-style format descriptor */
                           m.dims,     /* Number of dimensions */
                           shape,      /* Buffer dimensions */
                           strides     /* Strides (in bytes) for each index */
                          );
}

#endif


================================================
FILE: python/src/pybind11_modelbin.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef PYBIND11_NCNN_MODELBIN_H
#define PYBIND11_NCNN_MODELBIN_H

#include <modelbin.h>

// pybind11 trampoline for ncnn::ModelBin, or for a class derived from it when
// another Base is supplied. Only the load(w, type) overload is routed through
// PYBIND11_OVERRIDE, with Base named as the parent class; the two- and
// three-dimensional overloads are left commented out below.
template<class Base = ncnn::ModelBin>
class PyModelBin : public Base
{
public:
    using Base::Base; // Inherit constructors
    // Dispatches load(w, type) through PYBIND11_OVERRIDE.
    ncnn::Mat load(int w, int type) const override
    {
        PYBIND11_OVERRIDE(ncnn::Mat, Base, load, w, type);
    }
    //ncnn::Mat load(int w, int h, int type) const override {
    //	PYBIND11_OVERRIDE(ncnn::Mat, Base, load, w, h, type);
    //}
    //ncnn::Mat load(int w, int h, int c, int type) const override {
    //	PYBIND11_OVERRIDE(ncnn::Mat, Base, load, w, h, c, type);
    //}
};

// Trampoline layered on PyModelBin<Other>: it inherits that class's
// constructors and overrides load(w, type) again, this time naming Other as the
// parent class in PYBIND11_OVERRIDE.
template<class Other>
class PyModelBinOther : public PyModelBin<Other>
{
public:
    using PyModelBin<Other>::PyModelBin; // Inherit constructors
    // Dispatches load(w, type) through PYBIND11_OVERRIDE with Other as parent.
    ncnn::Mat load(int w, int type) const override
    {
        PYBIND11_OVERRIDE(ncnn::Mat, Other, load, w, type);
    }
};

#endif


================================================
FILE: python/tests/benchmark.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import time
import ncnn

# Location of the benchmark .param files, relative to the working directory.
param_root = "../../benchmark"

# Run configuration read by benchmark(); the __main__ section of this script
# reassigns these from the command line before any model is benchmarked.
g_warmup_loop_count = 8
g_loop_count = 4
g_enable_cooling_down = True

# Vulkan device and allocators; they stay None unless the __main__ section
# selects a GPU device.
g_vkdev = None
g_blob_vkallocator = None
g_staging_vkallocator = None

# CPU pool allocators shared by every benchmark run; benchmark() clears them
# at the start of each run.
g_blob_pool_allocator = ncnn.UnlockedPoolAllocator()
g_workspace_pool_allocator = ncnn.PoolAllocator()


def benchmark(comment, _in, opt):
    """Time inference of one benchmark model and print min/max/avg in milliseconds.

    Args:
        comment: model name; "<comment>.param" is loaded from ``param_root`` and
            the name is also used as the label of the printed result line.
        _in: input ncnn.Mat; it is filled with 0.01 before use.
        opt: ncnn.Option assigned to the network.

    The model weights come from ``ncnn.DataReaderFromEmpty``, so only the
    .param file has to exist. The run performs ``g_warmup_loop_count`` untimed
    iterations followed by ``g_loop_count`` timed ones.
    """
    # Local import so this function does not depend on the module's import list.
    import os

    _in.fill(0.01)

    g_blob_pool_allocator.clear()
    g_workspace_pool_allocator.clear()

    if opt.use_vulkan_compute:
        g_blob_vkallocator.clear()
        g_staging_vkallocator.clear()

    net = ncnn.Net()
    net.opt = opt

    if net.opt.use_vulkan_compute:
        net.set_vulkan_device(g_vkdev)

    # Join with a path separator: plain string concatenation glued the model
    # name straight onto the directory name ("../../benchmarksqueezenet.param")
    # whenever param_root had no trailing slash.
    net.load_param(os.path.join(param_root, comment + ".param"))

    dr = ncnn.DataReaderFromEmpty()
    net.load_model(dr)

    input_names = net.input_names()
    output_names = net.output_names()

    if g_enable_cooling_down:
        time.sleep(10)

    # warm up
    for i in range(g_warmup_loop_count):
        # test with statement
        with net.create_extractor() as ex:
            ex.input(input_names[0], _in)
            ex.extract(output_names[0])

    time_min = sys.float_info.max
    time_max = -sys.float_info.max
    time_avg = 0.0

    for i in range(g_loop_count):
        start = time.time()

        # test net keep alive until ex freed
        ex = net.create_extractor()
        ex.input(input_names[0], _in)
        ex.extract(output_names[0])

        end = time.time()

        timespan = end - start

        time_min = timespan if timespan < time_min else time_min
        time_max = timespan if timespan > time_max else time_max
        time_avg += timespan

    time_avg /= g_loop_count

    # Timings are collected in seconds and reported in milliseconds.
    print(
        "%20s  min = %7.2f  max = %7.2f  avg = %7.2f"
        % (comment, time_min * 1000, time_max * 1000, time_avg * 1000)
    )


if __name__ == "__main__":
    # Positional command-line arguments, in order:
    #   loop_count, num_threads, powersave, gpu_device, cooling_down
    # Arguments that are not supplied keep the defaults listed here.
    defaults = [4, ncnn.get_cpu_count(), 0, -1, 1]
    supplied = [int(arg) for arg in sys.argv[1:6]]
    loop_count, num_threads, powersave, gpu_device, cooling_down = (
        supplied + defaults[len(supplied):]
    )

    # A gpu_device of -1 means CPU only.
    use_vulkan_compute = gpu_device != -1

    g_enable_cooling_down = cooling_down != 0
    g_loop_count = loop_count

    g_blob_pool_allocator.set_size_compare_ratio(0.0)
    g_workspace_pool_allocator.set_size_compare_ratio(0.5)

    if use_vulkan_compute:
        g_warmup_loop_count = 10

        g_vkdev = ncnn.get_gpu_device(gpu_device)

        g_blob_vkallocator = ncnn.VkBlobAllocator(g_vkdev)
        g_staging_vkallocator = ncnn.VkStagingAllocator(g_vkdev)

    opt = ncnn.Option()
    opt.lightmode = True
    opt.num_threads = num_threads
    opt.blob_allocator = g_blob_pool_allocator
    opt.workspace_allocator = g_workspace_pool_allocator
    if use_vulkan_compute:
        opt.blob_vkallocator = g_blob_vkallocator
        opt.workspace_vkallocator = g_blob_vkallocator
        opt.staging_vkallocator = g_staging_vkallocator

    # Remaining option switches, applied in this order.
    option_switches = [
        ("use_winograd_convolution", True),
        ("use_sgemm_convolution", True),
        ("use_int8_inference", True),
        ("use_vulkan_compute", use_vulkan_compute),
        ("use_fp16_packed", True),
        ("use_fp16_storage", True),
        ("use_fp16_arithmetic", True),
        ("use_int8_storage", True),
        ("use_int8_arithmetic", True),
        ("use_packing_layout", True),
    ]
    for switch_name, switch_value in option_switches:
        setattr(opt, switch_name, switch_value)

    ncnn.set_cpu_powersave(powersave)
    ncnn.set_omp_dynamic(0)
    ncnn.set_omp_num_threads(num_threads)

    settings_report = [
        ("loop_count =", loop_count),
        ("num_threads =", num_threads),
        ("powersave =", ncnn.get_cpu_powersave()),
        ("gpu_device =", gpu_device),
        ("cooling_down =", g_enable_cooling_down),
    ]
    for label, setting in settings_report:
        print(label, setting)

    # (model name, input shape) pairs, benchmarked in this order.
    models = [
        ("squeezenet", (227, 227, 3)),
        ("squeezenet_int8", (227, 227, 3)),
        ("mobilenet", (224, 224, 3)),
        ("mobilenet_int8", (224, 224, 3)),
        ("mobilenet_v2", (224, 224, 3)),
        # ("mobilenet_v2_int8", (224, 224, 3)),  # disabled
        ("mobilenet_v3", (224, 224, 3)),
        ("shufflenet", (224, 224, 3)),
        ("shufflenet_v2", (224, 224, 3)),
        ("mnasnet", (224, 224, 3)),
        ("proxylessnasnet", (224, 224, 3)),
        ("efficientnet_b0", (224, 224, 3)),
        ("regnety_400m", (224, 224, 3)),
        ("blazeface", (128, 128, 3)),
        ("googlenet", (224, 224, 3)),
        ("googlenet_int8", (224, 224, 3)),
        ("resnet18", (224, 224, 3)),
        ("resnet18_int8", (224, 224, 3)),
        ("alexnet", (227, 227, 3)),
        ("vgg16", (224, 224, 3)),
        ("vgg16_int8", (224, 224, 3)),
        ("resnet50", (224, 224, 3)),
        ("resnet50_int8", (224, 224, 3)),
        ("squeezenet_ssd", (300, 300, 3)),
        ("squeezenet_ssd_int8", (300, 300, 3)),
        ("mobilenet_ssd", (300, 300, 3)),
        ("mobilenet_ssd_int8", (300, 300, 3)),
        ("mobilenet_yolo", (416, 416, 3)),
        ("mobilenetv2_yolov3", (352, 352, 3)),
        ("yolov4-tiny", (416, 416, 3)),
    ]
    for model_name, input_shape in models:
        benchmark(model_name, ncnn.Mat(input_shape), opt)


================================================
FILE: python/tests/custom_layer.param
================================================
7767517
2 2
Input            data                             0 1 data
CustomLayer      cl_fwd                           1 1 data output


================================================
FILE: python/tests/test.param
================================================
7767517
3 3
Input            data                             0 1 data
Convolution      conv0_fwd                        1 1 data conv0_fwd 0=3 1=3 11=3 2=1 12=1 3=1 13=1 4=0 14=0 5=1 6=81
InnerProduct     dense0_fwd                       1 1 conv0_fwd output 0=1 1=1 2=151875


================================================
FILE: python/tests/test_allocator.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import pytest

import ncnn


def test_pool_allocator():
    """Walk a PoolAllocator through configure, allocate, free and clear."""
    allocator = ncnn.PoolAllocator()
    assert allocator is not None

    allocator.set_size_compare_ratio(0.5)

    # Request 10 KiB and hand it straight back.
    block = allocator.fastMalloc(10 * 1024)
    assert block is not None
    allocator.fastFree(block)

    allocator.clear()


def test_unlocked_pool_allocator():
    """Walk an UnlockedPoolAllocator through configure, allocate, free and clear."""
    allocator = ncnn.UnlockedPoolAllocator()
    assert allocator is not None

    allocator.set_size_compare_ratio(0.5)

    # Request 10 KiB and hand it straight back.
    block = allocator.fastMalloc(10 * 1024)
    assert block is not None
    allocator.fastFree(block)

    allocator.clear()


================================================
FILE: python/tests/test_blob.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import pytest

import ncnn


def test_blob():
    """Values assigned to Blob attributes can be read back unchanged."""
    blob = ncnn.Blob()

    # Plain attributes: write, then read back.
    for attr, value in (("name", "myblob"), ("producer", 0), ("consumer", 0)):
        setattr(blob, attr, value)
        assert getattr(blob, attr) == value

    # The shape attribute holds a Mat.
    blob.shape = ncnn.Mat(1)
    assert blob.shape.dims == 1
    assert blob.shape.w == 1


================================================
FILE: python/tests/test_extractor.py
================================================
# Copyright 2021 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import pytest

import ncnn

# Pool allocator shared by the tests in this module; it is used as both the
# blob allocator and the workspace allocator. The spelling "alloctor" is kept
# because the test functions refer to this exact name.
alloctor = ncnn.PoolAllocator()


def test_extractor():
    """Run tests/test.param through an extractor, addressing blobs by name."""
    # Constructing an Extractor directly is expected to raise TypeError.
    with pytest.raises(TypeError, match="No constructor"):
        ex = ncnn.Extractor()

    reader = ncnn.DataReaderFromEmpty()

    net = ncnn.Net()
    net.load_param("tests/test.param")
    net.load_model(reader)

    image = ncnn.Mat((227, 227, 3))
    with net.create_extractor() as ex:
        ex.set_light_mode(True)

        ex.set_blob_allocator(alloctor)
        ex.set_workspace_allocator(alloctor)

        ex.input("data", image)

        # Intermediate blob produced by the convolution layer.
        status, conv_out = ex.extract("conv0_fwd")
        assert status == 0
        assert conv_out.dims == 3
        assert conv_out.w == 225
        assert conv_out.h == 225
        assert conv_out.c == 3

        # Final blob produced by the inner-product layer.
        status, final_out = ex.extract("output")
        assert status == 0
        assert final_out.dims == 1
        assert final_out.w == 1


def test_extractor_index():
    """Run tests/test.param through an extractor, addressing blobs by index."""
    # Constructing an Extractor directly is expected to raise TypeError.
    with pytest.raises(TypeError, match="No constructor"):
        ex = ncnn.Extractor()

    reader = ncnn.DataReaderFromEmpty()

    net = ncnn.Net()
    net.load_param("tests/test.param")
    net.load_model(reader)

    image = ncnn.Mat((227, 227, 3))
    ex = net.create_extractor()
    ex.set_light_mode(True)

    ex.set_blob_allocator(alloctor)
    ex.set_workspace_allocator(alloctor)

    ex.input(0, image)

    # Blob index 1.
    status, conv_out = ex.extract(1)
    assert status == 0
    assert conv_out.dims == 3
    assert conv_out.w == 225
    assert conv_out.h == 225
    assert conv_out.c == 3

    # Blob index 2.
    status, final_out = ex.extract(2)
    assert status == 0
    assert final_out.dims == 1
    assert final_out.w == 1

    # No with statement here, so clear() is called by hand to make sure the
    # extractor is torn down before the net.
    ex.clear()


================================================
FILE: python/tests/test_mat.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import sys
import numpy as np
import pytest

import ncnn


def test_mat_dims1():
    """1-D Mat construction from an int width and from a 1-tuple shape."""

    def from_ints(*shape, **kwargs):
        return ncnn.Mat(*shape, **kwargs)

    def from_tuple(*shape, **kwargs):
        return ncnn.Mat(shape, **kwargs)

    # Same four checks for both ways of spelling the shape.
    for make in (from_ints, from_tuple):
        mat = make(1)
        assert (mat.dims, mat.w) == (1, 1)

        mat = make(2, elemsize=4)
        assert (mat.dims, mat.w, mat.elemsize) == (1, 2, 4)

        mat = make(3, elemsize=4, elempack=1)
        assert (mat.dims, mat.w, mat.elemsize, mat.elempack) == (1, 3, 4, 1)

        mat = make(4, elemsize=4, elempack=1, allocator=None)
        assert (mat.dims, mat.w, mat.elemsize, mat.elempack) == (1, 4, 4, 1)
        assert mat.allocator == None


def test_mat_dims2():
    """2-D Mat construction from (w, h) ints and from a 2-tuple shape."""

    def from_ints(*shape, **kwargs):
        return ncnn.Mat(*shape, **kwargs)

    def from_tuple(*shape, **kwargs):
        return ncnn.Mat(shape, **kwargs)

    # Same four checks for both ways of spelling the shape.
    for make in (from_ints, from_tuple):
        mat = make(1, 2)
        assert (mat.dims, mat.w, mat.h) == (2, 1, 2)

        mat = make(3, 4, elemsize=4)
        assert (mat.dims, mat.w, mat.h, mat.elemsize) == (2, 3, 4, 4)

        mat = make(5, 6, elemsize=4, elempack=1)
        assert (mat.dims, mat.w, mat.h, mat.elemsize, mat.elempack) == (2, 5, 6, 4, 1)

        mat = make(7, 8, elemsize=4, elempack=1, allocator=None)
        assert (mat.dims, mat.w, mat.h, mat.elemsize, mat.elempack) == (2, 7, 8, 4, 1)
        assert mat.allocator == None


def test_mat_dims3():
    """3-D Mat construction from (w, h, c) ints and from a 3-tuple shape."""

    def from_ints(*shape, **kwargs):
        return ncnn.Mat(*shape, **kwargs)

    def from_tuple(*shape, **kwargs):
        return ncnn.Mat(shape, **kwargs)

    # Same four checks for both ways of spelling the shape.
    for make in (from_ints, from_tuple):
        mat = make(1, 2, 3)
        assert (mat.dims, mat.w, mat.h, mat.c) == (3, 1, 2, 3)

        mat = make(4, 5, 6, elemsize=4)
        assert (mat.dims, mat.w, mat.h, mat.c, mat.elemsize) == (3, 4, 5, 6, 4)

        mat = make(7, 8, 9, elemsize=4, elempack=1)
        assert (mat.dims, mat.w, mat.h, mat.c) == (3, 7, 8, 9)
        assert (mat.elemsize, mat.elempack) == (4, 1)

        mat = make(10, 11, 12, elemsize=4, elempack=1, allocator=None)
        assert (mat.dims, mat.w, mat.h, mat.c) == (3, 10, 11, 12)
        assert (mat.elemsize, mat.elempack) == (4, 1)
        assert mat.allocator == None


def test_mat_dims4():
    """4-D Mat construction from (w, h, d, c) ints and from a 4-tuple shape."""

    def from_ints(*shape, **kwargs):
        return ncnn.Mat(*shape, **kwargs)

    def from_tuple(*shape, **kwargs):
        return ncnn.Mat(shape, **kwargs)

    # Same four checks for both ways of spelling the shape.
    for make in (from_ints, from_tuple):
        mat = make(1, 2, 3, 4)
        assert (mat.dims, mat.w, mat.h, mat.d, mat.c) == (4, 1, 2, 3, 4)

        mat = make(4, 5, 6, 7, elemsize=4)
        assert (mat.dims, mat.w, mat.h, mat.d, mat.c) == (4, 4, 5, 6, 7)
        assert mat.elemsize == 4

        mat = make(7, 8, 9, 10, elemsize=4, elempack=1)
        assert (mat.dims, mat.w, mat.h, mat.d, mat.c) == (4, 7, 8, 9, 10)
        assert (mat.elemsize, mat.elempack) == (4, 1)

        mat = make(10, 11, 12, 13, elemsize=4, elempack=1, allocator=None)
        assert (mat.dims, mat.w, mat.h, mat.d, mat.c) == (4, 10, 11, 12, 13)
        assert (mat.elemsize, mat.elempack) == (4, 1)
        assert mat.allocator == None


def test_numpy():
    """Check Mat <-> numpy conversion: shape order, dtype mapping, shared memory."""
    # Shape mapping: numpy lists the axes in the reverse order of the Mat
    # constructor arguments, i.e. (w), (h, w), (c, h, w), (c, d, h, w).
    mat = ncnn.Mat(1)
    array = mat.numpy()
    assert mat.dims == array.ndim and mat.w == array.shape[0]
    mat = ncnn.Mat(2, 3)
    array = mat.numpy()
    assert array.dtype == np.float32
    assert (
        mat.dims == array.ndim and mat.w == array.shape[1] and mat.h == array.shape[0]
    )
    mat = ncnn.Mat(4, 5, 6)
    array = np.array(mat)
    assert (
        mat.dims == array.ndim
        and mat.w == array.shape[2]
        and mat.h == array.shape[1]
        and mat.c == array.shape[0]
    )
    mat = ncnn.Mat(7, 8, 9, 10)
    array = np.array(mat)
    assert (
        mat.dims == array.ndim
        and mat.w == array.shape[3]
        and mat.h == array.shape[2]
        and mat.d == array.shape[1]
        and mat.c == array.shape[0]
    )

    # dtype mapping by elemsize: 1 -> int8, 2 -> float16, 4 -> float32.
    mat = ncnn.Mat(1, elemsize=1)
    array = mat.numpy()
    assert array.dtype == np.int8
    mat = ncnn.Mat(1, elemsize=2)
    array = mat.numpy()
    assert array.dtype == np.float16
    # pybind11 def_buffer throw bug
    # with pytest.raises(RuntimeError) as execinfo:
    #     mat = ncnn.Mat(1, elemsize=3)
    #     array = np.array(mat)
    #     assert "convert ncnn.Mat to numpy.ndarray only elemsize 1, 2, 4 support now, but given 3" in str(
    #         execinfo.value
    #     )
    # NOTE(review): with the block above disabled, this repeats the float16
    # assertion made just before it.
    assert array.dtype == np.float16
    mat = ncnn.Mat(1, elemsize=4)
    array = mat.numpy()
    assert array.dtype == np.float32

    # NOTE(review): in the eight checks below `mat` is a numpy array and
    # `array` is a numpy copy of it; no ncnn.Mat is constructed, so they do not
    # exercise the binding - confirm whether ncnn.Mat(...) was intended.
    mat = np.random.randint(0, 128, size=(12,)).astype(np.uint8)
    array = np.array(mat)
    assert (mat == array).all()
    mat = np.random.rand(12).astype(np.float32)
    array = np.array(mat)
    assert (mat == array).all()
    mat = np.random.randint(0, 128, size=(12, 11)).astype(np.uint8)
    array = np.array(mat)
    assert (mat == array).all()
    mat = np.random.rand(12, 11).astype(np.float32)
    array = np.array(mat)
    assert (mat == array).all()
    mat = np.random.randint(0, 256, size=(12, 11, 3)).astype(np.uint8)
    array = np.array(mat)
    assert (mat == array).all()
    mat = np.random.rand(12, 11, 3).astype(np.float32)
    array = np.array(mat)
    assert (mat == array).all()
    mat = np.random.randint(0, 256, size=(12, 11, 7, 3)).astype(np.uint8)
    array = np.array(mat)
    assert (mat == array).all()
    mat = np.random.rand(12, 11, 7, 3).astype(np.float32)
    array = np.array(mat)
    assert (mat == array).all()

    # Memory sharing, numpy -> Mat -> numpy with an explicit int32 format:
    # a write to the source array must be visible through the second view.
    array = np.array([1, 2, 3], dtype=np.int32)
    mat = ncnn.Mat(array)
    array2 = mat.numpy(format='i')
    assert array2.dtype == np.int32
    array[0] = 10
    assert array2[0] == 10

    # Same in the other direction with float32: a write through the second
    # view must be visible in the source array.
    array = np.array([1, 2, 3], dtype=np.float32)
    mat = ncnn.Mat(array)
    array2 = mat.numpy(format='f')
    assert array2.dtype == np.float32
    array2[0] = 100
    assert array[0] == 100

def test_fill():
    """fill(1.0) on a one-element Mat is visible through np.array."""
    mat = ncnn.Mat(1)
    mat.fill(1.0)

    first = np.array(mat)[0]
    tolerance = np.finfo(np.float32).eps
    assert np.abs(first - 1.0) < tolerance


def test_clone():
    """clone() keeps dims and the extents relevant to the rank, for 1-D to 4-D Mats."""
    # Attributes compared for each rank.
    compared = {
        1: ("dims", "w"),
        2: ("dims", "w", "h"),
        3: ("dims", "w", "h", "c"),
        4: ("dims", "w", "h", "d", "c"),
    }
    shapes = [(1,), (2, 3), (4, 5, 6), (7, 8, 9, 10)]

    def from_ints(shape):
        return ncnn.Mat(*shape)

    def from_tuple(shape):
        return ncnn.Mat(shape)

    for make in (from_ints, from_tuple):
        for shape in shapes:
            source = make(shape)
            copy = source.clone()
            for attr in compared[len(shape)]:
                assert getattr(source, attr) == getattr(copy, attr)


def test_clone_from():
    """clone_from() makes one reused target Mat match each source, 1-D to 4-D."""
    # Attributes compared for each rank.
    compared = {
        1: ("dims", "w"),
        2: ("dims", "w", "h"),
        3: ("dims", "w", "h", "c"),
        4: ("dims", "w", "h", "d", "c"),
    }
    shapes = [(1,), (2, 3), (4, 5, 6), (7, 8, 9, 10)]

    def from_ints(shape):
        return ncnn.Mat(*shape)

    def from_tuple(shape):
        return ncnn.Mat(shape)

    # The same target object is overwritten by every clone_from call.
    target = ncnn.Mat()

    for make in (from_ints, from_tuple):
        for shape in shapes:
            source = make(shape)
            target.clone_from(source)
            for attr in compared[len(shape)]:
                assert getattr(source, attr) == getattr(target, attr)


def test_reshape():
    """reshape() with int arguments and with tuple shapes, plus the 5-dim error."""

    def expect(mat, **attrs):
        # Check each named attribute, in the order given, against its value.
        for attr, value in attrs.items():
            assert getattr(mat, attr) == value

    # Reshaping an empty Mat gives back a 0-dim Mat whatever the target shape.
    empty = ncnn.Mat()
    expect(empty.reshape(1), dims=0)
    expect(empty.reshape(1, 1), dims=0)
    expect(empty.reshape(1, 1, 1), dims=0)
    expect(empty.reshape(1, 1, 1, 1), dims=0)

    # Target shapes given as separate ints.
    source = ncnn.Mat(1)
    expect(source.reshape(1, 1), dims=2, w=1, h=1)
    expect(source.reshape(1, 1, 1), dims=3, w=1, h=1, c=1)
    expect(source.reshape(1, 1, 1, 1), dims=4, w=1, h=1, d=1, c=1)

    source = ncnn.Mat(1, 2)
    expect(source.reshape(2), dims=1, w=2)
    expect(source.reshape(2, 1), dims=2, w=2, h=1)
    expect(source.reshape(2, 1, 1), dims=3, w=2, h=1, c=1)
    expect(source.reshape(2, 1, 1, 1), dims=4, w=2, h=1, d=1, c=1)

    source = ncnn.Mat(1, 2, 3)
    expect(source.reshape(6), dims=1, w=6)
    expect(source.reshape(2, 3), dims=2, w=2, h=3)
    expect(source.reshape(2, 3, 1), dims=3, w=2, h=3, c=1)
    expect(source.reshape(2, 1, 3, 1), dims=4, w=2, h=1, d=3, c=1)

    # Target shapes given as a single tuple.
    source = ncnn.Mat((1,))
    expect(source.reshape((1, 1)), dims=2, w=1, h=1)
    expect(source.reshape((1, 1, 1)), dims=3, w=1, h=1, c=1)
    expect(source.reshape((1, 1, 1, 1)), dims=4, w=1, h=1, d=1, c=1)

    source = ncnn.Mat((1, 2))
    expect(source.reshape((2,)), dims=1, w=2)
    expect(source.reshape((2, 1)), dims=2, w=2, h=1)
    expect(source.reshape((2, 1, 1)), dims=3, w=2, h=1, c=1)
    expect(source.reshape((2, 1, 1, 1)), dims=4, w=2, h=1, d=1, c=1)

    source = ncnn.Mat((1, 2, 3))
    expect(source.reshape((6,)), dims=1, w=6)
    expect(source.reshape((2, 3)), dims=2, w=2, h=3, c=1)
    expect(source.reshape((2, 3, 1)), dims=3, w=2, h=3, c=1)
    expect(source.reshape((2, 1, 3, 1)), dims=4, w=2, h=1, d=3, c=1)

    # A 5-element shape is rejected.
    with pytest.raises(RuntimeError) as execinfo:
        source.reshape((1, 1, 1, 1, 1))
    assert "shape must be 1, 2, 3 or 4 dims, not 5" in str(execinfo.value)


def test_create():
    """create() re-dimensions one Mat, with int arguments and with tuple shapes."""
    # Extent attributes matching the shape entries, per rank.
    axes = {
        1: ("w",),
        2: ("w", "h"),
        3: ("w", "h", "c"),
        4: ("w", "h", "d", "c"),
    }
    shapes = [(1,), (2, 3), (4, 5, 6), (7, 8, 9, 10)]

    # The same Mat object is re-created for every shape.
    mat = ncnn.Mat()

    def with_ints(shape):
        mat.create(*shape)

    def with_tuple(shape):
        mat.create(shape)

    for create in (with_ints, with_tuple):
        for shape in shapes:
            create(shape)
            assert mat.dims == len(shape)
            for axis, extent in zip(axes[len(shape)], shape):
                assert getattr(mat, axis) == extent


def test_create_like():
    """create_like() gives one reused Mat the dims and extents of each template."""
    # Attributes compared for each rank.
    compared = {
        1: ("dims", "w"),
        2: ("dims", "w", "h"),
        3: ("dims", "w", "h", "c"),
        4: ("dims", "w", "h", "d", "c"),
    }

    # The same target object is re-created for every template.
    target = ncnn.Mat()

    for shape in [(1,), (2, 3), (4, 5, 6), (7, 8, 9, 10)]:
        template = ncnn.Mat(*shape)
        target.create_like(template)
        for attr in compared[len(shape)]:
            assert getattr(template, attr) == getattr(target, attr)


def test_addref_release():
    """refcount reads 1 after construction, 2 after addref(), None after release()."""
    mat = ncnn.Mat(1)
    count_after_create = mat.refcount
    assert count_after_create == 1

    mat.addref()
    count_after_addref = mat.refcount
    assert count_after_addref == 2

    mat.release()
    count_after_release = mat.refcount
    assert count_after_release == None


def test_empty():
    """empty() is True for a default-constructed Mat and False for an allocated one."""
    default_constructed = ncnn.Mat()
    assert default_constructed.empty() == True

    allocated = ncnn.Mat(1)
    assert allocated.empty() == False


def test_total():
    """total() for 1-D to 4-D Mats against the expected element counts."""
    expectations = [
        ((1,), 4),  # 1 aligned
        ((2, 3), 8),  # 2 * 3 aligned
        ((4, 5, 6), 4 * 5 * 6),
        ((7, 8, 9, 10), 7 * 8 * 9 * 10),
    ]
    for shape, expected_total in expectations:
        mat = ncnn.Mat(*shape)
        assert mat.total() == expected_total


def test_elembits():
    """elembits() reports 8, 16 and 32 for elemsize 1, 2 and 4 at elempack 1."""
    # (width, elemsize in bytes, expected bits)
    for width, elemsize, bits in ((1, 1, 8), (2, 2, 16), (3, 4, 32)):
        mat = ncnn.Mat(width, elemsize=elemsize, elempack=1)
        assert mat.elembits() == bits


def test_shape():
    """shape() must report the dims and extents the Mat was constructed with."""
    # (constructor arguments, expected (attribute, value) pairs on the shape)
    cases = (
        ((1,), (("dims", 1), ("w", 1))),
        ((2, 3), (("dims", 2), ("w", 2), ("h", 3))),
        ((4, 5, 6), (("dims", 3), ("w", 4), ("h", 5), ("c", 6))),
        ((7, 8, 9, 10), (("dims", 4), ("w", 7), ("h", 8), ("d", 9), ("c", 10))),
    )
    for ctor_args, expected in cases:
        shape = ncnn.Mat(*ctor_args).shape()
        assert all(getattr(shape, name) == value for name, value in expected)


def test_channel_depth_row():
    """Walk a 4-D Mat down through channel() -> depth() -> row() and check each view."""
    volume = ncnn.Mat(2, 3, 4, 5)
    volume.fill(6.0)

    chan = volume.channel(1)
    assert chan.dims == 3
    assert (chan.w, chan.h, chan.c) == (2, 3, 4)

    plane = chan.depth(1)
    assert plane.dims == 2
    assert (plane.w, plane.h) == (2, 3)

    line = plane.row(1)
    assert len(line) == 2
    # the row must still hold the fill value
    assert np.abs(line[0] - 6.0) < sys.float_info.min


def test_channel_row():
    """Take channel() then row() of a filled 3-D Mat and check both views."""
    cube = ncnn.Mat(2, 3, 4)
    cube.fill(4.0)

    chan = cube.channel(1)
    assert chan.dims == 2
    assert (chan.w, chan.h, chan.c) == (2, 3, 1)

    line = chan.row(1)
    assert len(line) == 2
    # the row must still hold the fill value
    assert np.abs(line[0] - 4.0) < sys.float_info.min


def test_channel_range():
    """channel_range(0, 2) of a 1x2x3 Mat must be a 3-D view with two channels."""
    sub = ncnn.Mat(1, 2, 3).channel_range(0, 2)
    assert sub.dims == 3
    assert (sub.w, sub.h, sub.c) == (1, 2, 2)


def test_depth_range():
    """depth_range(1, 2) on one channel of a 4-D Mat must be a 3-D view with c == 2."""
    volume = ncnn.Mat(1, 2, 3, 4)
    sub = volume.channel(1).depth_range(1, 2)
    assert sub.dims == 3
    assert (sub.w, sub.h, sub.c) == (1, 2, 2)


def test_row_range():
    """row_range(0, 2) of a 1x2 Mat must be a 2-D view of the same extents."""
    sub = ncnn.Mat(1, 2).row_range(0, 2)
    assert sub.dims == 2
    assert (sub.w, sub.h) == (1, 2)


def test_range():
    """range(0, 2) of a 2-element Mat must be a 1-D view of width 2."""
    sub = ncnn.Mat(2).range(0, 2)
    assert sub.dims == 1
    assert sub.w == 2


def test_getitem_setitem():
    """Element reads via mat[i] reflect fill(), and a write via mat[i] = v touches only i."""
    eps = sys.float_info.min

    vec = ncnn.Mat(2)
    vec.fill(1)
    assert np.abs(vec[0] - 1.0) < eps
    assert np.abs(vec[1] - 1.0) < eps

    vec[0] = 2.0
    assert np.abs(vec[0] - 2.0) < eps
    # the neighbouring element keeps its old value
    assert np.abs(vec[1] - 1.0) < eps


def test_from_pixels():
    """from_pixels() on an RGB buffer, with and without an explicit row stride."""
    rgb = ncnn.Mat.PixelType.PIXEL_RGB
    # (row, column, channel) positions compared between the input and the Mat
    probes = ((0, 0, 0), (200, 150, 1), (299, 399, 2))

    def check(mat, pixels):
        assert mat.dims == 3 and mat.w == 400 and mat.h == 300 and mat.c == 3
        for y, x, ch in probes:
            assert pixels[y, x, ch] == mat.channel(ch).row(y)[x]

    # buffer exactly as wide as the image
    tight = np.random.randint(0, 256, size=(300, 400, 3)).astype(np.uint8)  # hwc
    check(ncnn.Mat.from_pixels(tight, rgb, 400, 300), tight)  # chw

    # buffer wider than the image, row length given through stride
    wide = np.random.randint(0, 256, size=(300, 500, 3)).astype(np.uint8)  # hwc
    check(ncnn.Mat.from_pixels(wide, rgb, 400, 300, stride=500 * 3), wide)  # chw


def test_from_pixels_resize():
    """from_pixels_resize() with BGR2RGB conversion, with and without a stride argument.

    When the target size equals the source size, individual pixels are compared
    as well; the channel index is mirrored (2 - ch) because of the BGR2RGB swap.
    """
    bgr2rgb = ncnn.Mat.PixelType.PIXEL_BGR2RGB
    probes = ((0, 0, 0), (200, 150, 1), (299, 399, 2))  # (row, column, channel)

    # (buffer width, trailing positional args after w/h, compare pixels?)
    # the last two trailing args are always the target width and height
    cases = (
        (400, (200, 150), False),
        (400, (400, 300), True),
        (500, (500 * 3, 200, 150), False),
        (500, (500 * 3, 400, 300), True),
    )
    for buffer_w, tail_args, compare_pixels in cases:
        pixels = np.random.randint(0, 256, size=(300, buffer_w, 3)).astype(np.uint8)  # hwc
        mat = ncnn.Mat.from_pixels_resize(pixels, bgr2rgb, 400, 300, *tail_args)  # chw

        out_w, out_h = tail_args[-2:]
        assert mat.dims == 3 and mat.w == out_w and mat.h == out_h and mat.c == 3

        if compare_pixels:
            for y, x, ch in probes:
                assert pixels[y, x, ch] == mat.channel(2 - ch).row(y)[x]


def test_from_pixels_roi():
    """from_pixels_roi() must copy the 200x150 region whose origin is x=100, y=75."""
    rgb = ncnn.Mat.PixelType.PIXEL_RGB
    roi_x, roi_y = 100, 75
    # (row, column, channel) positions inside the extracted region
    probes = ((0, 0, 0), (75, 100, 1), (149, 199, 2))

    def check(mat, pixels):
        assert mat.dims == 3 and mat.w == 200 and mat.h == 150 and mat.c == 3
        for y, x, ch in probes:
            assert pixels[y + roi_y, x + roi_x, ch] == mat.channel(ch).row(y)[x]

    tight = np.random.randint(0, 256, size=(300, 400, 3)).astype(np.uint8)  # hwc
    check(
        ncnn.Mat.from_pixels_roi(tight, rgb, 400, 300, 100, 75, 200, 150),
        tight,
    )  # chw

    # same region, but the buffer rows are longer and the stride is passed explicitly
    wide = np.random.randint(0, 256, size=(300, 500, 3)).astype(np.uint8)  # hwc
    check(
        ncnn.Mat.from_pixels_roi(wide, rgb, 400, 300, 500 * 3, 100, 75, 200, 150),
        wide,
    )  # chw


def test_from_pixels_roi_resize():
    """from_pixels_roi_resize() must return a 100x75 3-channel Mat, with or without stride."""
    rgb = ncnn.Mat.PixelType.PIXEL_RGB
    roi_then_target = (100, 75, 200, 150, 100, 75)

    def check(mat):
        assert mat.dims == 3 and mat.w == 100 and mat.h == 75 and mat.c == 3

    tight = np.random.randint(0, 256, size=(300, 400, 3)).astype(np.uint8)  # hwc
    check(ncnn.Mat.from_pixels_roi_resize(tight, rgb, 400, 300, *roi_then_target))  # chw

    # wider buffer: the stride is inserted right after the image width/height
    wide = np.random.randint(0, 256, size=(300, 500, 3)).astype(np.uint8)  # hwc
    check(
        ncnn.Mat.from_pixels_roi_resize(wide, rgb, 400, 300, 500 * 3, *roi_then_target)
    )  # chw


def test_substract_mean_normalize():
    """substract_mean_normalize() with only norm values, only mean values, and both."""
    pixels = np.random.randint(0, 256, size=(300, 400, 3)).astype(np.uint8)  # hwc
    mean_vals = [127.5, 127.5, 127.5]
    norm_vals = [0.007843, 0.007843, 0.007843]

    def load():
        # every case starts from a fresh, unmodified Mat
        return ncnn.Mat.from_pixels(pixels, ncnn.Mat.PixelType.PIXEL_RGB, 400, 300)  # chw

    def first_value(mat):
        return mat.channel(0).row(0)[0]

    only_norm = load()
    only_norm.substract_mean_normalize([], norm_vals)
    assert np.abs(pixels[0, 0, 0] * 0.007843 - first_value(only_norm)) < 1e-5

    only_mean = load()
    only_mean.substract_mean_normalize(mean_vals, [])
    assert np.abs((pixels[0, 0, 0] - 127.5) - first_value(only_mean)) < 1e-5

    both = load()
    both.substract_mean_normalize(mean_vals, norm_vals)
    assert np.abs((pixels[0, 0, 0] - 127.5) * 0.007843 - first_value(both)) < 1e-5


================================================
FILE: python/tests/test_net.py
================================================
# Copyright 2021 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import numpy as np
import pytest

import ncnn


def test_net():
    """Load tests/test.param with an empty data reader, run one extraction, then clear."""
    reader = ncnn.DataReaderFromEmpty()

    with ncnn.Net() as net:
        load_status = net.load_param("tests/test.param")
        net.load_model(reader)
        assert load_status == 0
        assert len(net.blobs()) == 3
        assert len(net.layers()) == 3

        assert len(net.input_names()) > 0
        assert len(net.output_names()) > 0

        net_input = ncnn.Mat((227, 227, 3))

        with net.create_extractor() as ex:
            ex.input("data", net_input)
            status, result = ex.extract("output")

        assert status == 0
        assert result.dims == 1 and result.w == 1

        # clear() must drop every blob and layer
        net.clear()
        assert len(net.blobs()) == 0
        assert len(net.layers()) == 0


def test_net_mem():
    """Same flow as the file-less net test, but weights come from an in-memory buffer."""
    # zero-filled buffer with the same 4-byte tag written at offsets 0 and 180
    tag = (71, 107, 48, 1)
    weights = bytearray(303940)
    for offset in (0, 180):
        weights[offset:offset + 4] = tag

    with ncnn.Net() as net:
        load_status = net.load_param("tests/test.param")
        net.load_model_mem(bytes(weights))
        assert load_status == 0
        assert len(net.blobs()) == 3
        assert len(net.layers()) == 3

        assert len(net.input_names()) > 0
        assert len(net.output_names()) > 0

        net_input = ncnn.Mat((227, 227, 3))

        with net.create_extractor() as ex:
            ex.input("data", net_input)
            status, result = ex.extract("output")

        assert status == 0
        assert result.dims == 1 and result.w == 1

        net.clear()
        assert len(net.blobs()) == 0
        assert len(net.layers()) == 0


def test_net_vulkan():
    """Run the basic net flow with opt.use_vulkan_compute enabled.

    Returns immediately when the ncnn module exposes no get_gpu_count attribute.
    """
    if not hasattr(ncnn, "get_gpu_count"):
        return

    reader = ncnn.DataReaderFromEmpty()

    net = ncnn.Net()
    net.opt.use_vulkan_compute = True
    load_status = net.load_param("tests/test.param")
    net.load_model(reader)
    assert load_status == 0
    assert len(net.blobs()) == 3
    assert len(net.layers()) == 3

    net_input = ncnn.Mat((227, 227, 3))

    extractor = net.create_extractor()
    extractor.input("data", net_input)
    status, result = extractor.extract("output")

    assert status == 0
    assert result.dims == 1 and result.w == 1

    # release the extractor before clearing the net
    extractor.clear()

    net.clear()
    assert len(net.blobs()) == 0
    assert len(net.layers()) == 0


def test_custom_layer():
    """Register a Python-defined layer that adds 1 to its input and run it through a net."""

    class CustomLayer(ncnn.Layer):
        # every created instance is recorded here; the destroyer callback removes it again
        customLayers = []

        def __init__(self):
            ncnn.Layer.__init__(self)
            self.one_blob_only = True

            self.customLayers.append(self)

        def forward(self, bottom_blob, top_blob, opt):
            values = np.array(bottom_blob)
            values += 1

            top_blob.clone_from(ncnn.Mat(values), opt.blob_allocator)
            # non-zero return value when the output could not be produced
            return -100 if top_blob.empty() else 0

    def create_layer():
        return CustomLayer()

    def destroy_layer(layer):
        for index, candidate in enumerate(CustomLayer.customLayers):
            if candidate == layer:
                del CustomLayer.customLayers[index]
                break

    reader = ncnn.DataReaderFromEmpty()

    net = ncnn.Net()
    net.register_custom_layer("CustomLayer", create_layer, destroy_layer)
    load_status = net.load_param("tests/custom_layer.param")
    net.load_model(reader)
    assert load_status == 0
    assert len(net.blobs()) == 2
    assert len(net.layers()) == 2

    net_input = ncnn.Mat(1)
    net_input.fill(1.0)

    extractor = net.create_extractor()
    extractor.input("data", net_input)
    status, result = extractor.extract("output")
    assert status == 0
    assert result.dims == 1 and result.w == 1
    # input 1.0 plus the layer's +1
    assert result[0] == 2.0

    extractor.clear()

    net.clear()
    assert len(net.blobs()) == 0
    assert len(net.layers()) == 0


def test_vulkan_device_index():
    """A new Net has no vulkan device; set_vulkan_device(0) by index assigns one.

    Returns immediately when the ncnn module exposes no get_gpu_count attribute.
    """
    if not hasattr(ncnn, "get_gpu_count"):
        return

    net = ncnn.Net()
    before = net.vulkan_device()
    assert before is None

    net.set_vulkan_device(0)
    after = net.vulkan_device()
    assert after is not None


def test_vulkan_device_vkdev():
    """A new Net has no vulkan device; set_vulkan_device() with a device object assigns one.

    Returns immediately when the ncnn module exposes no get_gpu_count attribute.
    """
    if not hasattr(ncnn, "get_gpu_count"):
        return

    net = ncnn.Net()
    before = net.vulkan_device()
    assert before is None

    net.set_vulkan_device(ncnn.get_gpu_device(0))
    after = net.vulkan_device()
    assert after is not None


================================================
FILE: python/tests/test_option.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import pytest

import ncnn


def test_option():
    """Check the defaults asserted below and round-trip every exercised Option attribute."""
    allocator = ncnn.PoolAllocator()
    opt = ncnn.Option()

    def toggle(flag):
        # write True then False and read each value back
        for value in (True, False):
            setattr(opt, flag, value)
            assert getattr(opt, flag) == value

    toggle("lightmode")

    assert opt.num_threads == ncnn.get_physical_big_cpu_count()
    opt.num_threads = 1
    assert opt.num_threads == 1

    # both allocator slots start unset and accept the same allocator object
    for slot in ("blob_allocator", "workspace_allocator"):
        assert getattr(opt, slot) is None
        setattr(opt, slot, allocator)
        assert getattr(opt, slot) == allocator

    assert opt.openmp_blocktime == 20
    opt.openmp_blocktime = 40
    assert opt.openmp_blocktime == 40

    boolean_flags = (
        "use_winograd_convolution",
        "use_sgemm_convolution",
        "use_int8_inference",
        "use_vulkan_compute",
        "use_bf16_packed",
        "use_bf16_storage",
        "use_fp16_packed",
        "use_fp16_storage",
        "use_fp16_arithmetic",
        "use_int8_packed",
        "use_int8_storage",
        "use_int8_arithmetic",
        "use_packing_layout",
        "use_subgroup_ops",
        "use_tensor_storage",
    )
    for flag in boolean_flags:
        toggle(flag)


================================================
FILE: python/tests/test_paramdict.py
================================================
# Copyright 2020 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import pytest

import ncnn


def test_paramdict():
    """Store an int, a float and a Mat in a ParamDict and check type() / get() for each."""
    params = ncnn.ParamDict()

    # an id that was never set: type 0, and get() falls back to the supplied default
    assert params.type(0) == 0
    assert params.get(0, -1) == -1

    params.set(1, 1)
    assert params.type(1) == 2
    assert params.get(1, -1) == 1

    params.set(2, 2.0)
    assert params.type(2) == 3
    assert params.get(2, -2.0) == 2.0

    stored = ncnn.Mat(1)
    params.set(3, stored)
    assert params.type(3) == 4
    assert params.get(3, ncnn.Mat()).dims == stored.dims


================================================
FILE: python/tests/test_vulkan_allocator.py
================================================
# Copyright 2021 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import pytest

import ncnn


def test_vk_blob_allocator():
    """Exercise VkBlobAllocator: memory type indices, flag round-trips, buffer and image allocation.

    Returns immediately when the ncnn module exposes no get_gpu_count attribute.
    """
    if not hasattr(ncnn, "get_gpu_count"):
        return

    device = ncnn.get_gpu_device(0)
    assert device is not None
    allocator = ncnn.VkBlobAllocator(device)
    assert allocator.buffer_memory_type_index >= 0
    assert allocator.image_memory_type_index >= 0

    # flip each boolean flag and read the new value back
    for flag in ("mappable", "coherent"):
        flipped = not getattr(allocator, flag)
        setattr(allocator, flag, flipped)
        assert getattr(allocator, flag) == flipped

    buffer_memory = allocator.fastMalloc(10 * 1024)
    assert buffer_memory is not None
    allocator.fastFree(buffer_memory)

    image_memory = allocator.fastMalloc(4, 4, 3, 4, 1)
    assert image_memory is not None
    allocator.fastFree(image_memory)


def test_vk_weight_allocator():
    """Exercise VkWeightAllocator: memory type indices, flag round-trips, buffer and image allocation.

    Returns immediately when the ncnn module exposes no get_gpu_count attribute.
    """
    if not hasattr(ncnn, "get_gpu_count"):
        return

    device = ncnn.get_gpu_device(0)
    assert device is not None
    allocator = ncnn.VkWeightAllocator(device)
    assert allocator.buffer_memory_type_index >= 0
    assert allocator.image_memory_type_index >= 0

    # flip each boolean flag and read the new value back
    for flag in ("mappable", "coherent"):
        flipped = not getattr(allocator, flag)
        setattr(allocator, flag, flipped)
        assert getattr(allocator, flag) == flipped

    buffer_memory = allocator.fastMalloc(10 * 1024)
    assert buffer_memory is not None
    allocator.fastFree(buffer_memory)

    image_memory = allocator.fastMalloc(4, 4, 3, 4, 1)
    assert image_memory is not None
    allocator.fastFree(image_memory)


def test_vk_staging_allocator():
    """Exercise VkStagingAllocator: memory type indices, flag round-trips, buffer and image allocation.

    Returns immediately when the ncnn module exposes no get_gpu_count attribute.
    """
    if not hasattr(ncnn, "get_gpu_count"):
        return

    device = ncnn.get_gpu_device(0)
    assert device is not None
    allocator = ncnn.VkStagingAllocator(device)
    assert allocator.buffer_memory_type_index >= 0
    assert allocator.image_memory_type_index >= 0

    # flip each boolean flag and read the new value back
    for flag in ("mappable", "coherent"):
        flipped = not getattr(allocator, flag)
        setattr(allocator, flag, flipped)
        assert getattr(allocator, flag) == flipped

    buffer_memory = allocator.fastMalloc(10 * 1024)
    assert buffer_memory is not None
    allocator.fastFree(buffer_memory)

    image_memory = allocator.fastMalloc(4, 4, 3, 4, 1)
    assert image_memory is not None
    allocator.fastFree(image_memory)


def test_vk_weight_staging_allocator():
    """Exercise VkWeightStagingAllocator; unlike its siblings, the image allocation must yield None.

    Returns immediately when the ncnn module exposes no get_gpu_count attribute.
    """
    if not hasattr(ncnn, "get_gpu_count"):
        return

    device = ncnn.get_gpu_device(0)
    assert device is not None
    allocator = ncnn.VkWeightStagingAllocator(device)
    assert allocator.buffer_memory_type_index >= 0
    assert allocator.image_memory_type_index >= 0

    # flip each boolean flag and read the new value back
    for flag in ("mappable", "coherent"):
        flipped = not getattr(allocator, flag)
        setattr(allocator, flag, flipped)
        assert getattr(allocator, flag) == flipped

    buffer_memory = allocator.fastMalloc(10 * 1024)
    assert buffer_memory is not None
    allocator.fastFree(buffer_memory)

    # the five-argument (image) allocation is expected to return nothing here
    image_memory = allocator.fastMalloc(4, 4, 3, 4, 1)
    assert image_memory is None


================================================
FILE: python/tests/test_vulkan_device.py
================================================
# Copyright 2021 Tencent
# SPDX-License-Identifier: BSD-3-Clause

import pytest

import ncnn


def check_gpuinfo(gpuinfo):
    """Assert that a GPU info object reports plausible identification values.

    The four version/id getters must return something greater than zero, the
    pipeline cache UUID must not be None, and type() must be non-negative.
    """
    for getter in ("api_version", "driver_version", "vendor_id", "device_id"):
        assert getattr(gpuinfo, getter)() > 0

    assert gpuinfo.pipeline_cache_uuid() is not None
    assert gpuinfo.type() >= 0


def test_gpu_api():
    """Create the GPU instance, validate the info of device 0 via both access paths, then destroy it.

    Returns immediately when the ncnn module exposes no get_gpu_count attribute.
    """
    if not hasattr(ncnn, "get_gpu_count"):
        return

    assert ncnn.create_gpu_instance() == 0
    assert ncnn.get_gpu_count() > 0
    assert ncnn.get_default_gpu_index() >= 0

    # info fetched directly by index
    check_gpuinfo(ncnn.get_gpu_info(0))

    # info fetched through the device object
    device = ncnn.get_gpu_device(0)
    assert device is not None
    check_gpuinfo(device.info())

    ncnn.destroy_gpu_instance()


def test_vulkan_device():
    """Construct VulkanDevice(0) directly and validate the info it reports.

    Returns immediately when the ncnn module exposes no get_gpu_count attribute.
    """
    if not hasattr(ncnn, "get_gpu_count"):
        return

    device = ncnn.VulkanDevice(0)
    assert device is not None
    check_gpuinfo(device.info())


================================================
FILE: setup.py
================================================
import io
import os
import sys
import time
import re
import shutil
import subprocess

from setuptools import setup, find_packages, Extension
from setuptools.command.build_ext import build_ext
from setuptools.command.install import install


def find_version():
    """Build the package version string ``<major>.<minor>.<YYYYMMDD>``.

    The major and minor numbers are read from ``CMakeLists.txt`` in the current
    working directory (first occurrence of ``NCNN_VERSION_MAJOR <n>`` and
    ``NCNN_VERSION_MINOR <n>``); the last component is today's local date.

    Returns:
        str: the assembled version string.

    Raises:
        RuntimeError: if either number cannot be found in ``CMakeLists.txt``.
    """
    with io.open("CMakeLists.txt", encoding="utf8") as f:
        version_file = f.read()

    # Capture the whole run of digits. A trailing non-greedy "(.+?)" would stop
    # after a single character and truncate multi-digit versions ("12" -> "1").
    version_major = re.search(r"NCNN_VERSION_MAJOR\s+(\d+)", version_file)
    version_minor = re.search(r"NCNN_VERSION_MINOR\s+(\d+)", version_file)

    if version_major and version_minor:
        ncnn_version = time.strftime("%Y%m%d", time.localtime())

        return version_major.group(1) + "." + version_minor.group(1) + "." + ncnn_version
    raise RuntimeError("Unable to find version string.")

# Parse environment variables
# Each value falls back to "" when the variable is unset. CMakeBuild.build_extension
# forwards only the non-empty ones to CMake as a -D<NAME>=<value> definition.
Vulkan_LIBRARY = os.environ.get("Vulkan_LIBRARY", "")
CMAKE_TOOLCHAIN_FILE = os.environ.get("CMAKE_TOOLCHAIN_FILE", "")
PLATFORM = os.environ.get("PLATFORM", "")
ARCHS = os.environ.get("ARCHS", "")
DEPLOYMENT_TARGET = os.environ.get("DEPLOYMENT_TARGET", "")
OpenMP_C_FLAGS = os.environ.get("OpenMP_C_FLAGS", "")
OpenMP_CXX_FLAGS = os.environ.get("OpenMP_CXX_FLAGS", "")
OpenMP_C_LIB_NAMES = os.environ.get("OpenMP_C_LIB_NAMES", "")
OpenMP_CXX_LIB_NAMES = os.environ.get("OpenMP_CXX_LIB_NAMES", "")
OpenMP_libomp_LIBRARY = os.environ.get("OpenMP_libomp_LIBRARY", "")
ENABLE_BITCODE = os.environ.get("ENABLE_BITCODE", "")
ENABLE_ARC = os.environ.get("ENABLE_ARC", "")
ENABLE_VISIBILITY = os.environ.get("ENABLE_VISIBILITY", "")
# Extra CMake arguments appended verbatim to the configure command line.
# The value is split on whitespace only; no shell-style quoting is interpreted.
EXTRA_CMAKE_ARGS = os.getenv("EXTRA_CMAKE_ARGS", "").split()

# Parse variables from command line with setup.py install
class InstallCommand(install):
    """``install`` command that additionally accepts a ``--vulkan=`` option.

    The option value is only stored on the command object as ``self.vulkan``;
    nothing in this class acts on it.
    """

    user_options = install.user_options + [
        ('vulkan=', None, 'Enable the usage of Vulkan.'),
    ]

    def initialize_options(self):
        super().initialize_options()
        # default for the extra option declared above
        self.vulkan = None

    def finalize_options(self):
        super().finalize_options()

    def run(self):
        super().run()

# Convert distutils Windows platform specifiers to CMake -A arguments
# Looked up with the build command's plat_name in CMakeBuild.build_extension,
# and only for MSVC builds whose generator is neither single-config nor
# already carrying an architecture in its name.
PLAT_TO_CMAKE = {
    "win32": "Win32",
    "win-amd64": "x64",
    "win-arm32": "ARM",
    "win-arm64": "ARM64",
}

# A CMakeExtension needs a sourcedir instead of a file list.
# The name must be the _single_ output extension from the CMake build.
# If you need multiple extensions, see scikit-build.
class CMakeExtension(Extension):
    """Extension placeholder that carries a CMake source directory and no source files."""

    def __init__(self, name, sourcedir=""):
        # the source list stays empty: building is delegated entirely to CMake
        super().__init__(name, sources=[])
        # stored as an absolute path
        self.sourcedir = os.path.abspath(sourcedir)


class CMakeBuild(build_ext):
    """``build_ext`` command that configures and builds the extension through CMake."""

    def build_extension(self, ext):
        """Run ``cmake <sourcedir>`` and ``cmake --build .`` inside ``self.build_temp``.

        Args:
            ext: the extension to build; its ``name`` and ``sourcedir``
                attributes are used.

        Raises:
            subprocess.CalledProcessError: if either CMake invocation fails.
        """
        # The built library goes into an "ncnn" directory next to the extension path.
        ext_parent = os.path.dirname(self.get_ext_fullpath(ext.name))
        extdir = os.path.join(os.path.abspath(ext_parent), "ncnn")

        # required for auto-detection of auxiliary "native" libs
        if not extdir.endswith(os.path.sep):
            extdir += os.path.sep

        cfg = "Release"
        if self.debug:
            cfg = "Debug"

        # CMake lets you override the generator - we need to check this.
        # Can be set with Conda-Build, for example.
        cmake_generator = os.environ.get("CMAKE_GENERATOR", "")

        # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
        cmake_args = [
            "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}".format(extdir),
            "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE={}".format(extdir),
            "-DPYTHON_EXECUTABLE={}".format(sys.executable),
            "-DCMAKE_BUILD_TYPE={}".format(cfg),  # not used on MSVC, but no harm
            "-DNCNN_PYTHON=ON",
            "-DNCNN_VULKAN=ON",
            "-DNCNN_DISABLE_RTTI=OFF",
            "-DNCNN_DISABLE_EXCEPTION=OFF",
            "-DNCNN_BUILD_BENCHMARK=OFF",
            "-DNCNN_BUILD_EXAMPLES=OFF",
            "-DNCNN_BUILD_TOOLS=OFF",
        ]

        # Definitions taken from the environment; each is passed only when non-empty.
        optional_defines = (
            ("-DVulkan_LIBRARY=", Vulkan_LIBRARY),
            ("-DCMAKE_TOOLCHAIN_FILE=", CMAKE_TOOLCHAIN_FILE),
            ("-DPLATFORM=", PLATFORM),
            ("-DARCHS=", ARCHS),
            ("-DDEPLOYMENT_TARGET=", DEPLOYMENT_TARGET),
            ("-DOpenMP_C_FLAGS=", OpenMP_C_FLAGS),
            ("-DOpenMP_CXX_FLAGS=", OpenMP_CXX_FLAGS),
            ("-DOpenMP_C_LIB_NAMES=", OpenMP_C_LIB_NAMES),
            ("-DOpenMP_CXX_LIB_NAMES=", OpenMP_CXX_LIB_NAMES),
            ("-DOpenMP_libomp_LIBRARY=", OpenMP_libomp_LIBRARY),
            ("-DENABLE_BITCODE=", ENABLE_BITCODE),
            ("-DENABLE_ARC=", ENABLE_ARC),
            ("-DENABLE_VISIBILITY=", ENABLE_VISIBILITY),
        )
        for prefix, value in optional_defines:
            if value != "":
                cmake_args.append(prefix + value)

        cmake_args.extend(EXTRA_CMAKE_ARGS)

        build_args = []

        if self.compiler.compiler_type == "msvc":
            # Single config generators are handled "normally"
            is_single_config = "NMake" in cmake_generator or "Ninja" in cmake_generator

            # CMake allows an arch-in-generator style for backward compatibility
            arch_in_generator = "ARM" in cmake_generator or "Win64" in cmake_generator

            if not is_single_config:
                # Specify the arch if using MSVC generator, but only if it doesn't
                # contain a backward-compatibility arch spec already in the
                # generator name.
                if not arch_in_generator:
                    cmake_args.extend(["-A", PLAT_TO_CMAKE[self.plat_name]])

                # Multi-config generators have a different way to specify configs
                cmake_args.append(
                    "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format(cfg.upper(), extdir)
                )
                build_args.extend(["--config", cfg])

        # Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
        # across all generators.
        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
            # self.parallel is a Python 3 only way to set parallel jobs by hand
            # using -j in the build_ext call, not supported by pip or PyPA-build.
            # Without it, fall back to the CPU core count (or 1 if unknown).
            # The -j option needs CMake 3.12+.
            jobs = getattr(self, "parallel", None) or os.cpu_count() or 1
            build_args.append("-j{}".format(jobs))

        if not os.path.exists(self.build_temp):
            os.makedirs(self.build_temp)

        configure_cmd = ["cmake", ext.sourcedir] + cmake_args
        build_cmd = ["cmake", "--build", "."] + build_args
        subprocess.check_call(configure_cmd, cwd=self.build_temp)
        subprocess.check_call(build_cmd, cwd=self.build_temp)


# Refuse to run under Python 2.
if sys.version_info < (3, 0):
    sys.exit("Sorry, Python < 3.0 is not supported")

# Runtime dependencies of the installed package.
requirements = ["numpy", "tqdm", "requests", "portalocker", "opencv-python"]
# Build-time dependencies: the cmake / ninja PyPI packages are requested only
# when no such executable is found on PATH (ninja never on Windows).
setup_requires = []
if shutil.which("cmake") is None:
    setup_requires += ["cmake>=3.12"]
if shutil.which("ninja") is None:
    setup_requires += ["ninja; sys_platform != 'win32'"]

# The repository README doubles as the long description (declared as markdown below).
with io.open("README.md", encoding="utf-8") as h:
    long_description = h.read()

# NOTE(review): python_requires allows 3.5 while the classifiers start at 3.6 — confirm
# which lower bound is intended.
setup(
    name="ncnn",
    version=find_version(),
    author="nihui",
    author_email="nihuini@tencent.com",
    maintainer="caishanli",
    maintainer_email="caishanli25@gmail.com",
    description="ncnn is a high-performance neural network inference framework optimized for the mobile platform",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/Tencent/ncnn",
    classifiers=[
        "Programming Language :: C++",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: 3.13",
        "Programming Language :: Python :: 3.14",
        "License :: OSI Approved :: BSD License",
        "Operating System :: OS Independent",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    license="BSD-3",
    python_requires=">=3.5",
    # Python sources live under python/; the native module is built by CMakeBuild.
    packages=find_packages("python"),
    package_dir={"": "python"},
    setup_requires=setup_requires,
    install_requires=requirements,
    ext_modules=[CMakeExtension("ncnn")],
    cmdclass={'install': InstallCommand, "build_ext": CMakeBuild},
)


================================================
FILE: src/CMakeLists.txt
================================================

##############################################

configure_file(platform.h.in ${CMAKE_CURRENT_BINARY_DIR}/platform.h)

# Put every file of a source list into an IDE source group ("visual folder").
#   ncnn_src_string - NAME of a list variable holding the source files; the
#                     call in this file passes ncnn_SRCS by name, not its
#                     expanded contents. Several names may be given separated
#                     by spaces.
#   folder          - source group path; "/" separators are rewritten to
#                     backslashes before being handed to source_group().
function(ncnn_src_group ncnn_src_string folder)
    # Turn a space-separated string of variable names into a CMake list.
    string(REPLACE " " ";" _ncnn_src_list ${ncnn_src_string})

    string(REGEX REPLACE "/" "\\\\" _target_folder "${folder}")

    # IN LISTS receives the expanded value of _ncnn_src_list, so the loop walks
    # the contents of the list variable(s) named by the first argument.
    foreach(_file IN LISTS ${_ncnn_src_list})
        source_group ("${_target_folder}" FILES "${_file}")
    endforeach ()
endfunction()

set(ncnn_SRCS
    allocator.cpp
    benchmark.cpp
    blob.cpp
    c_api.cpp
    command.cpp
    cpu.cpp
    datareader.cpp
    expression.cpp
    gpu.cpp
    layer.cpp
    mat.cpp
    mat_pixel.cpp
    mat_pixel_affine.cpp
    mat_pixel_drawing.cpp
    mat_pixel_resize.cpp
    mat_pixel_rotate.cpp
    modelbin.cpp
    net.cpp
    option.cpp
    paramdict.cpp
    pipeline.cpp
    pipelinecache.cpp
    simpleocv.cpp
    simpleomp.cpp
    simplestl.cpp
    simplemath.cpp
    simplevk.cpp
)

if(ANDROID)
    list(APPEND ncnn_SRCS mat_pixel_android.cpp)
endif()

ncnn_src_group(ncnn_SRCS "sources")

include_directories("${CMAKE_CURRENT_SOURCE_DIR}/layer/${NCNN_TARGET_ARCH}")

# ncnn macro
include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_add_shader.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/../cmake/ncnn_add_layer.cmake)

# look for vulkan compute shader and compile
set(NCNN_SHADER_SPV_HEX_FILES)

set(__LAYER_TYPE_ENUM_INDEX 0)
set(__LAYER_SHADER_TYPE_ENUM_INDEX 0)

# layer implementation
# Every built-in layer is registered through the ncnn_add_layer macro from
# cmake/ncnn_add_layer.cmake. ArgMax and SPP pass an extra OFF argument;
# presumably that is the default value of the layer's enable switch - confirm in the macro.
# NOTE(review): __LAYER_TYPE_ENUM_INDEX is reset to 0 right before this list, so the
# registration order presumably determines the generated layer type enum values.
# Append new layers at the end instead of reordering - TODO confirm against the macro.
ncnn_add_layer(AbsVal)
ncnn_add_layer(ArgMax OFF)
ncnn_add_layer(BatchNorm)
ncnn_add_layer(Bias)
ncnn_add_layer(BNLL)
ncnn_add_layer(Concat)
ncnn_add_layer(Convolution)
ncnn_add_layer(Crop)
ncnn_add_layer(Deconvolution)
ncnn_add_layer(Dropout)
ncnn_add_layer(Eltwise)
ncnn_add_layer(ELU)
ncnn_add_layer(Embed)
ncnn_add_layer(Exp)
ncnn_add_layer(Flatten)
ncnn_add_layer(InnerProduct)
ncnn_add_layer(Input)
ncnn_add_layer(Log)
ncnn_add_layer(LRN)
ncnn_add_layer(MemoryData)
ncnn_add_layer(MVN)
ncnn_add_layer(Pooling)
ncnn_add_layer(Power)
ncnn_add_layer(PReLU)
ncnn_add_layer(Proposal)
ncnn_add_layer(Reduction)
ncnn_add_layer(ReLU)
ncnn_add_layer(Reshape)
ncnn_add_layer(ROIPooling)
ncnn_add_layer(Scale)
ncnn_add_layer(Sigmoid)
ncnn_add_layer(Slice)
ncnn_add_layer(Softmax)
ncnn_add_layer(Split)
ncnn_add_layer(SPP OFF)
ncnn_add_layer(TanH)
ncnn_add_layer(Threshold)
ncnn_add_layer(Tile)
ncnn_add_layer(RNN)
ncnn_add_layer(LSTM)
ncnn_add_layer(BinaryOp)
ncnn_add_layer(UnaryOp)
ncnn_add_layer(ConvolutionDepthWise)
ncnn_add_layer(Padding)
ncnn_add_layer(Squeeze)
ncnn_add_layer(ExpandDims)
ncnn_add_layer(Normalize)
ncnn_add_layer(Permute)
ncnn_add_layer(PriorBox)
ncnn_add_layer(DetectionOutput)
ncnn_add_layer(Interp)
ncnn_add_layer(DeconvolutionDepthWise)
ncnn_add_layer(ShuffleChannel)
ncnn_add_layer(InstanceNorm)
ncnn_add_layer(Clip)
ncnn_add_layer(Reorg)
ncnn_add_layer(YoloDetectionOutput)
ncnn_add_layer(Quantize)
ncnn_add_layer(Dequantize)
ncnn_add_layer(Yolov3DetectionOutput)
ncnn_add_layer(PSROIPooling)
ncnn_add_layer(ROIAlign)
ncnn_add_layer(Packing)
ncnn_add_layer(Requantize)
ncnn_add_layer(Cast)
ncnn_add_layer(HardSigmoid)
ncnn_add_layer(SELU)
ncnn_add_layer(HardSwish)
ncnn_add_layer(Noop)
ncnn_add_layer(PixelShuffle)
ncnn_add_layer(DeepCopy)
ncnn_add_layer(Mish)
ncnn_add_layer(StatisticsPooling)
ncnn_add_layer(Swish)
ncnn_add_layer(Gemm)
ncnn_add_layer(GroupNorm)
ncnn_add_layer(LayerNorm)
ncnn_add_layer(Softplus)
ncnn_add_layer(GRU)
ncnn_add_layer(MultiHeadAttention)
ncnn_add_layer(GELU)
ncnn_add_layer(Convolution1D)
ncnn_add_layer(Pooling1D)
ncnn_add_layer(ConvolutionDepthWise1D)
ncnn_add_layer(Convolution3D)
ncnn_add_layer(ConvolutionDepthWise3D)
ncnn_add_layer(Pooling3D)
ncnn_add_layer(MatMul)
ncnn_add_layer(Deconvolution1D)
ncnn_add_layer(DeconvolutionDepthWise1D)
ncnn_add_layer(Deconvolution3D)
ncnn_add_layer(DeconvolutionDepthWise3D)
ncnn_add_layer(Einsum)
ncnn_add_layer(DeformableConv2D)
ncnn_add_layer(GLU)
ncnn_add_layer(Fold)
ncnn_add_layer(Unfold)
ncnn_add_layer(GridSample)
ncnn_add_layer(CumulativeSum)
ncnn_add_layer(CopyTo)
ncnn_add_layer(Erf)
ncnn_add_layer(Diag)
ncnn_add_layer(CELU)
ncnn_add_layer(Shrink)
ncnn_add_layer(RMSNorm)
ncnn_add_layer(Spectrogram)
ncnn_add_layer(InverseSpectrogram)
ncnn_add_layer(Flip)
ncnn_add_layer(SDPA)
ncnn_add_layer(RotaryEmbed)

# Two shaders are registered directly here (not through a layer) for Vulkan builds.
if(NCNN_VULKAN)
    ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp)
    ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/layer/vulkan/shader/vulkan_activation.comp)
endif()

# Custom target depending on every file listed in NCNN_SHADER_SPV_HEX_FILES
# (presumably filled in by ncnn_add_shader - confirm in cmake/ncnn_add_shader.cmake).
add_custom_target(ncnn-generate-spirv DEPENDS ${NCNN_SHADER_SPV_HEX_FILES})

# create new
# Instantiate the layer / shader registry headers from their .in templates into the build directory.
configure_file(layer_declaration.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_declaration.h)
configure_file(layer_registry.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_registry.h)
configure_file(layer_type_enum.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_type_enum.h)
configure_file(layer_shader_registry.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_shader_registry.h)
configure_file(layer_shader_spv_data.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_shader_spv_data.h)
configure_file(layer_shader_type_enum.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_shader_type_enum.h)

# Create the ncnn library target; NCNN_SHARED_LIB selects shared or static linkage.
if(NCNN_SHARED_LIB)
    set(ncnn_LIBRARY_TYPE SHARED)
else()
    set(ncnn_LIBRARY_TYPE STATIC)
endif()
add_library(ncnn ${ncnn_LIBRARY_TYPE} ${ncnn_SRCS})

# Debug configurations get a "d" postfix on the library name.
set_target_properties(ncnn PROPERTIES DEBUG_POSTFIX "d")

# Work out the version string to stamp onto the target.
if(APPLE OR IOS)
    # macos / ios only accept a.b.c.d.e where a is 24 bit and b/c/d/e are 10 bit each,
    # so the date is split into two-digit pieces: 20201228 becomes 20.12.28
    string(SUBSTRING ${NCNN_VERSION} 2 2 NCNN_VERSION_YEAR)
    string(SUBSTRING ${NCNN_VERSION} 4 2 NCNN_VERSION_MONTH)
    string(SUBSTRING ${NCNN_VERSION} 6 2 NCNN_VERSION_DAY)
    set(NCNN_VERSION_APPLE_STRING ${NCNN_VERSION_MAJOR}.${NCNN_VERSION_MINOR}.${NCNN_VERSION_YEAR}.${NCNN_VERSION_MONTH}.${NCNN_VERSION_DAY})
    set(ncnn_TARGET_VERSION ${NCNN_VERSION_APPLE_STRING})
else()
    set(ncnn_TARGET_VERSION ${NCNN_VERSION_STRING})
endif()
set_target_properties(ncnn PROPERTIES VERSION ${ncnn_TARGET_VERSION} SOVERSION ${NCNN_VERSION_MAJOR})

# Generate the export header for the ncnn target.
include(GenerateExportHeader)
generate_export_header(ncnn)

# Static builds compile with NCNN_STATIC_DEFINE.
if(NOT NCNN_SHARED_LIB)
    set_target_properties(ncnn PROPERTIES COMPILE_FLAGS -DNCNN_STATIC_DEFINE)
endif()

# With simplestl enabled but simplemath disabled, libm is linked explicitly.
if(NCNN_SIMPLESTL AND NOT NCNN_SIMPLEMATH)
    # link math lib explicitly
    target_link_libraries(ncnn PUBLIC m)
endif()

# Public include paths: the source and binary directories while building, include/ncnn once installed.
# The layer/ subdirectory is private to the library build.
target_include_directories(ncnn
    PUBLIC
        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
        $<INSTALL_INTERFACE:include/ncnn>
        $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
    PRIVATE
        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/layer>)

# OpenMP setup. With NCNN_SIMPLEOMP no OpenMP package is searched for.
if(NCNN_OPENMP)
    if(NOT NCNN_SIMPLEOMP)
        find_package(OpenMP)
        # OpenMP was found but the imported target OpenMP::OpenMP_CXX does not exist:
        # pass the compile flags by hand.
        if(NOT TARGET OpenMP::OpenMP_CXX AND (OpenMP_CXX_FOUND OR OPENMP_FOUND))
            target_compile_options(ncnn PRIVATE ${OpenMP_CXX_FLAGS})
        endif()
    endif()

    if(NCNN_SIMPLEOMP OR OpenMP_CXX_FOUND OR OPENMP_FOUND)
        if(NCNN_CMAKE_VERBOSE)
            message("Building with OpenMP")
        endif()

        if(NCNN_SIMPLEOMP)
            # simpleomp: only compile options are added in this branch, nothing is linked
            if(IOS OR APPLE)
                target_compile_options(ncnn PRIVATE -Xpreprocessor -fopenmp)
            else()
                target_compile_options(ncnn PRIVATE -fopenmp)
            endif()
        elseif(ANDROID_NDK_MAJOR AND (ANDROID_NDK_MAJOR GREATER 20))
            # NDK newer than r20: link with -static-openmp
            target_compile_options(ncnn PRIVATE -fopenmp)
            target_link_libraries(ncnn PUBLIC -fopenmp -static-openmp)
        elseif(OpenMP_CXX_FOUND)
            target_link_libraries(ncnn PUBLIC OpenMP::OpenMP_CXX)
        else()
            # only the legacy OPENMP_FOUND result is available: link using the raw flags
            target_link_libraries(ncnn PRIVATE "${OpenMP_CXX_FLAGS}")
        endif()
    endif()
endif()

# Thread library setup: pthread is preferred and a thread library is required.
if(NCNN_THREADS)
    set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
    set(THREADS_PREFER_PTHREAD_FLAG TRUE)
    find_package(Threads REQUIRED)

    if(TARGET Threads::Threads)
        target_link_libraries(ncnn PUBLIC Threads::Threads)
    endif()
    # simpleomp / simplestl builds additionally link pthread by name
    if(NCNN_SIMPLEOMP OR NCNN_SIMPLESTL)
        target_link_libraries(ncnn PUBLIC pthread)
    endif()
endif()

# Vulkan linkage. With NCNN_SIMPLEVK no Vulkan package is searched for; otherwise
# find_package(Vulkan) must succeed and Vulkan::Vulkan is linked publicly.
if(NCNN_VULKAN)
    if(NCNN_SIMPLEVK)
        if(APPLE)
            # simplevk uses static vulkan linkage on apple platforms as a fallback
            if(DEFINED Vulkan_LIBRARY)
                message(STATUS "simplevk static vulkan linkage as fallback enabled on APPLE platforms")
                target_link_libraries(ncnn PUBLIC ${Vulkan_LIBRARY})

                # https://github.com/KhronosGroup/MoltenVK/blob/main/Docs/MoltenVK_Runtime_UserGuide.md#optionally-link-to-required-system-libraries
                # Only a static ncnn links the system frameworks itself.
                if(NOT NCNN_SHARED_LIB)
                    find_library(Metal NAMES Metal)
                    find_library(Foundation NAMES Foundation)
                    find_library(QuartzCore NAMES QuartzCore)
                    find_library(CoreGraphics NAMES CoreGraphics)
                    find_library(IOSurface NAMES IOSurface)
                    list(APPEND vulkan_dependent_LINK_LIBRARIES ${Metal} ${Foundation} ${QuartzCore} ${CoreGraphics} ${IOSurface})
                    if(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
                        if(NOT IOS)
                            find_library(AppKit NAMES AppKit)
                            list(APPEND vulkan_dependent_LINK_LIBRARIES ${AppKit})
                        endif()
                        find_library(IOKit NAMES IOKit)
                        list(APPEND vulkan_dependent_LINK_LIBRARIES ${IOKit})
                    endif()
                    if(IOS OR CMAKE_SYSTEM_NAME STREQUAL "iOS" OR CMAKE_SYSTEM_NAME STREQUAL "tvOS")
                        find_library(UIKit NAMES UIKit)
                        list(APPEND vulkan_dependent_LINK_LIBRARIES ${UIKit})
                    endif()
                    target_link_libraries(ncnn PRIVATE ${vulkan_dependent_LINK_LIBRARIES})
                endif()

            else()
                message(WARNING "Vulkan_LIBRARY shall be defined for simplevk static linkage as fallback on APPLE platforms")

                # link simplevk stub (weakly, through the linker's -weak_library option)
                set(SIMPLEVK_TBD "${CMAKE_CURRENT_SOURCE_DIR}/simplevk.tbd")
                target_link_libraries(ncnn PRIVATE "-Wl,-weak_library,${SIMPLEVK_TBD}")
            endif()
        endif()
        # every simplevk build links CMAKE_DL_LIBS
        target_link_libraries(ncnn PRIVATE ${CMAKE_DL_LIBS})
    else()
        # First try quietly; on failure extend CMAKE_MODULE_PATH from VULKAN_SDK and retry as REQUIRED.
        find_package(Vulkan QUIET)
        if(NOT Vulkan_FOUND)
            if(DEFINED ENV{VULKAN_SDK})
                if(CMAKE_SYSTEM_NAME MATCHES "Linux")
                    list(APPEND CMAKE_MODULE_PATH "$ENV{VULKAN_SDK}/../source/VulkanTools/cmake")
                elseif(CMAKE_SYSTEM_NAME MATCHES "Windows")
                    list(APPEND CMAKE_MODULE_PATH "$ENV{VULKAN_SDK}/Samples/cmake")
                elseif(CMAKE_SYSTEM_NAME MATCHES "Darwin")
                    message(WARNING "Failed to find vulkan since cmake is too old\n"
                        "cmake >= 3.7 required. Consider `brew upgrade cmake`")
                endif()
            else()
                message(FATAL_ERROR "Error: CMake didn't find Vulkan. Please set VULKAN_SDK env var, e.g.:\n"
                    "Linux: export VULKAN_SDK=~/soft/vulkansdk/1.2.148.0/x86_64\n"
                    "Windows: set VULKAN_SDK=E:/lib/VulkanSDK/1.2.148.0\n"
                    "MacOS: export VULKAN_SDK=~/soft/vulkansdk/1.2.148.0/macOS\n"
                )
            endif()
            find_package(Vulkan REQUIRED)
        endif()

        target_link_libraries(ncnn PUBLIC Vulkan::Vulkan)
    endif()

    # link in-house glslang (the parent of this directory is added as a private include path for it)
    target_include_directories(ncnn PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../>)
    target_link_libraries(ncnn PRIVATE glslang SPIRV)
endif()

# Android platform API support links the android, jnigraphics and log system libraries.
if(NCNN_PLATFORM_API AND ANDROID)
    target_link_libraries(ncnn PUBLIC android jnigraphics log)
endif()

# NOMINMAX is a PUBLIC definition on Windows, so it also reaches targets that link ncnn.
if(WIN32)
    target_compile_definitions(ncnn PUBLIC NOMINMAX)
endif()

# General compiler options. The first branch covers MSVC and Clang with the MSVC-style
# front end (clang-cl); the else branch covers every other compiler.
if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
    target_compile_definitions(ncnn PRIVATE _SCL_SECURE_NO_WARNINGS _CRT_SECURE_NO_DEPRECATE)

    # fast floating point model for release builds only
    if(CMAKE_BUILD_TYPE MATCHES "(Release|RELEASE|release)")
        target_compile_options(ncnn PRIVATE /fp:fast)
    endif()

    if(NCNN_TARGET_ARCH STREQUAL "arm")
        # disable msvc svml optimization on arm target as it produces wrong results
        target_compile_options(ncnn PRIVATE /d2Qvec-mathlib-)
    endif()

    if(NCNN_SHARED_LIB)
        # msvc complains about stl string and vector uses in exported functions
        target_compile_options(ncnn PRIVATE /wd4251)
    endif()
else()
    target_compile_options(ncnn PRIVATE -Wall -Wextra -Wno-unused-function)

    # position independent code is on unless NCNN_DISABLE_PIC is set
    if(NOT NCNN_DISABLE_PIC)
        set_target_properties(ncnn PROPERTIES POSITION_INDEPENDENT_CODE ON INTERFACE_POSITION_INDEPENDENT_CODE ON)
    endif()

    if(CMAKE_BUILD_TYPE MATCHES "(Release|RELEASE|release)")
        # -Ofast is skipped for Emscripten, GCC older than 4.6 and any Clang
        if(NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT (CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.6) AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang")
            target_compile_options(ncnn PRIVATE -Ofast)
        endif()

        target_compile_options(ncnn PRIVATE -ffast-math)
    endif()

    # target_compile_options(ncnn PRIVATE -march=native)
    # symbols are hidden by default
    target_compile_options(ncnn PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
    # link time optimization only applies to the shared library build
    if(NCNN_SHARED_LIB AND NCNN_ENABLE_LTO)
        set_target_properties(ncnn PROPERTIES INTERPROCEDURAL_OPTIMIZATION ON)
    endif()
endif()

# Disable RTTI. The option is PUBLIC, so targets linking ncnn are compiled with it as well.
if(NCNN_DISABLE_RTTI)
    if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
        target_compile_options(ncnn PUBLIC /GR-)
    else()
        target_compile_options(ncnn PUBLIC -fno-rtti)
    endif()
endif()

# Disable C++ exceptions. Also PUBLIC, so it propagates to targets linking ncnn.
if(NCNN_DISABLE_EXCEPTION)
    if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
        target_compile_options(ncnn PUBLIC /EHsc /D_HAS_EXCEPTIONS=0)
    else()
        target_compile_options(ncnn PUBLIC -fno-exceptions)
    endif()
endif()

# x86 instruction set flags.
# SSE2 is handled on its own. The AVX512 / FMA / AVX tiers form one if/elseif chain, so
# only the first enabled tier contributes flags, and only when NCNN_RUNTIME_CPU is off.
# Each tier has three variants: MSVC, clang-cl (Clang with the MSVC front end), and the rest.
# The MSVC-style variants define the __XXX__ feature macros by hand with /D
# (presumably because that compiler does not predefine them - confirm).
if(NCNN_TARGET_ARCH STREQUAL "x86")
    if(NCNN_SSE2)
        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
            # /arch:SSE2 is only passed for 32-bit builds
            if(CMAKE_SIZEOF_VOID_P EQUAL 4)
                target_compile_options(ncnn PRIVATE /arch:SSE2)
            endif()
            target_compile_options(ncnn PRIVATE /D__SSE2__)
        else()
            if(NOT CMAKE_SYSTEM_NAME MATCHES "WASI")
                target_compile_options(ncnn PRIVATE -msse2 -msse)
            endif ()
            # WebAssembly targets enable simd128
            if(CMAKE_SYSTEM_NAME MATCHES "Emscripten|WASI")
                target_compile_options(ncnn PRIVATE -msimd128)
            endif()
        endif()

        if(NCNN_COMPILER_SUPPORT_X86_RECIP_NONE)
            # recip optimization causes precision loss
            target_compile_options(ncnn PRIVATE -mrecip=none)
        endif()
    endif()

    # tier 1: AVX512 (plus optional VNNI / BF16 / FP16 extensions)
    if(NOT NCNN_RUNTIME_CPU AND NCNN_AVX512)
        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
            target_compile_options(ncnn PRIVATE /arch:AVX512 /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__)
            if(NCNN_AVX512VNNI)
                target_compile_options(ncnn PRIVATE /D__AVX512VNNI__)
            endif()
        elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")
            target_compile_options(ncnn PRIVATE /arch:AVX512 -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c /D__SSSE3__ /D__SSE4_1__ /D__FMA__ /D__F16C__)
            if(NCNN_AVX512VNNI)
                target_compile_options(ncnn PRIVATE -mavx512vnni /D__AVX512VNNI__)
            endif()
            if(NCNN_AVX512BF16)
                target_compile_options(ncnn PRIVATE -mavx512bf16 /D__AVX512BF16__)
            endif()
            if(NCNN_AVX512FP16)
                target_compile_options(ncnn PRIVATE -mavx512fp16 /D__AVX512FP16__)
            endif()
        else()
            target_compile_options(ncnn PRIVATE -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl -mfma -mf16c)
            if(NCNN_AVX512VNNI)
                target_compile_options(ncnn PRIVATE -mavx512vnni)
            endif()
            if(NCNN_AVX512BF16)
                target_compile_options(ncnn PRIVATE -mavx512bf16)
            endif()
            if(NCNN_AVX512FP16)
                target_compile_options(ncnn PRIVATE -mavx512fp16)
            endif()
        endif()
    # tier 2: FMA with AVX2 or AVX (plus optional AVX-VNNI family / XOP / F16C extensions)
    elseif(NOT NCNN_RUNTIME_CPU AND NCNN_FMA)
        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
            if(NCNN_AVX2)
                target_compile_options(ncnn PRIVATE /arch:AVX2 /D__SSSE3__ /D__SSE4_1__ /D__FMA__)
            else()
                target_compile_options(ncnn PRIVATE /arch:AVX /D__SSSE3__ /D__SSE4_1__ /D__FMA__)
            endif()
            if(NCNN_AVXVNNIINT8)
                target_compile_options(ncnn PRIVATE /D__AVXVNNIINT8__)
            endif()
            if(NCNN_AVXVNNIINT16)
                target_compile_options(ncnn PRIVATE /D__AVXVNNIINT16__)
            endif()
            if(NCNN_AVXNECONVERT)
                target_compile_options(ncnn PRIVATE /D__AVXNECONVERT__)
            endif()
            # AVX-VNNI takes precedence over XOP; the two are never enabled together here
            if(NCNN_AVXVNNI)
                target_compile_options(ncnn PRIVATE /D__AVXVNNI__)
            elseif(NCNN_XOP)
                target_compile_options(ncnn PRIVATE /D__XOP__)
            endif()
            if(NCNN_F16C)
                target_compile_options(ncnn PRIVATE /D__F16C__)
            endif()
        elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")
            if(NCNN_AVX2)
                target_compile_options(ncnn PRIVATE /arch:AVX2 -mfma /D__SSSE3__ /D__SSE4_1__ /D__FMA__)
            else()
                target_compile_options(ncnn PRIVATE /arch:AVX -mfma /D__SSSE3__ /D__SSE4_1__ /D__FMA__)
            endif()
            if(NCNN_AVXVNNIINT8)
                target_compile_options(ncnn PRIVATE -mavxvnniint8 /D__AVXVNNIINT8__)
            endif()
            if(NCNN_AVXVNNIINT16)
                target_compile_options(ncnn PRIVATE -mavxvnniint16 /D__AVXVNNIINT16__)
            endif()
            if(NCNN_AVXNECONVERT)
                target_compile_options(ncnn PRIVATE -mavxneconvert /D__AVXNECONVERT__)
            endif()
            if(NCNN_AVXVNNI)
                target_compile_options(ncnn PRIVATE -mavxvnni /D__AVXVNNI__)
            elseif(NCNN_XOP)
                target_compile_options(ncnn PRIVATE -mxop /D__XOP__)
            endif()
            if(NCNN_F16C)
                target_compile_options(ncnn PRIVATE -mf16c /D__F16C__)
            endif()
        else()
            if(NCNN_AVX2)
                target_compile_options(ncnn PRIVATE -mavx2 -mfma)
            else()
                target_compile_options(ncnn PRIVATE -mavx -mfma)
            endif()
            if(NCNN_AVXVNNIINT8)
                target_compile_options(ncnn PRIVATE -mavxvnniint8)
            endif()
            if(NCNN_AVXVNNIINT16)
                target_compile_options(ncnn PRIVATE -mavxvnniint16)
            endif()
            if(NCNN_AVXNECONVERT)
                target_compile_options(ncnn PRIVATE -mavxneconvert)
            endif()
            if(NCNN_AVXVNNI)
                target_compile_options(ncnn PRIVATE -mavxvnni)
            elseif(NCNN_XOP)
                target_compile_options(ncnn PRIVATE -mxop)
            endif()
            if(NCNN_F16C)
                target_compile_options(ncnn PRIVATE -mf16c)
            endif()
        endif()
    # tier 3: plain AVX (plus optional XOP / F16C extensions)
    elseif(NOT NCNN_RUNTIME_CPU AND NCNN_AVX)
        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
            target_compile_options(ncnn PRIVATE /arch:AVX /D__SSSE3__ /D__SSE4_1__)
            if(NCNN_XOP)
                target_compile_options(ncnn PRIVATE /D__XOP__)
            endif()
            if(NCNN_F16C)
                target_compile_options(ncnn PRIVATE /D__F16C__)
            endif()
        elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")
            target_compile_options(ncnn PRIVATE /arch:AVX /D__SSSE3__ /D__SSE4_1__)
            if(NCNN_XOP)
                target_compile_options(ncnn PRIVATE -mxop /D__XOP__)
            endif()
            if(NCNN_F16C)
                target_compile_options(ncnn PRIVATE -mf16c /D__F16C__)
            endif()
        else()
            target_compile_options(ncnn PRIVATE -mavx)
            if(NCNN_XOP)
                target_compile_options(ncnn PRIVATE -mxop)
            endif()
            if(NCNN_F16C)
                target_compile_options(ncnn PRIVATE -mf16c)
            endif()
        endif()
    endif()
endif()

# arm flags for builds with 4-byte pointers that are not ILP32 (the ILP32 case is handled
# together with the 8-byte pointer case in the next section).
if(NCNN_TARGET_ARCH STREQUAL "arm" AND (CMAKE_SIZEOF_VOID_P EQUAL 4 AND NOT NCNN_TARGET_ILP32))
    if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
        # always enable neon for msvc arm
        target_compile_options(ncnn PRIVATE /D__arm__ /D__ARM_NEON)
    endif()

    if(NOT NCNN_RUNTIME_CPU AND NCNN_VFPV4)
        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
            target_compile_options(ncnn PRIVATE /arch:VFPv4 /D__ARM_FP=0x0E)
        else()
            # use whichever flag combination the compiler check reported as supported
            if(NCNN_COMPILER_SUPPORT_ARM_VFPV4)
                target_compile_options(ncnn PRIVATE -mfpu=neon-vfpv4)
            elseif(NCNN_COMPILER_SUPPORT_ARM_VFPV4_FP16)
                target_compile_options(ncnn PRIVATE -mfpu=neon-vfpv4 -mfp16-format=ieee)
            endif()
        endif()
    elseif(NOT NCNN_RUNTIME_CPU)
        # without vfpv4 the MSVC-style compilers get a different __ARM_FP value (0x0C instead of 0x0E)
        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
            target_compile_options(ncnn PRIVATE /D__ARM_FP=0x0C)
        endif()
    endif()
endif()

# arm flags for builds with 8-byte pointers or ILP32.
# The armv8.6+sve / armv8.4 / armv8.2 tiers form one if/elseif chain, so only the first
# enabled tier applies, and only when NCNN_RUNTIME_CPU is off. In each tier the MSVC-style
# compilers get /arch plus hand-written feature macros, while every compiler whose id does
# not match "MSVC" builds up a -march string in ARM_MARCH_FLAG.
# NOTE(review): clang-cl satisfies both conditions (its compiler id is Clang), so it gets
# the /arch options and the -march flag - confirm this is intended.
if(NCNN_TARGET_ARCH STREQUAL "arm" AND (CMAKE_SIZEOF_VOID_P EQUAL 8 OR NCNN_TARGET_ILP32))
    if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
        # always enable neon and vfpv4 for msvc arm64
        target_compile_options(ncnn PRIVATE /D__arm__ /D__aarch64__ /D__ARM_NEON /D__ARM_FP=0x0E)
    endif()

    if(NOT NCNN_RUNTIME_CPU AND NCNN_ARM86SVE)
        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
            # TODO add support for sve family
            target_compile_options(ncnn PRIVATE /arch:armv8.6 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD /D__ARM_FEATURE_FP16_FML /D__ARM_FEATURE_BF16_VECTOR_ARITHMETIC /D__ARM_FEATURE_MATMUL_INT8)
            # the empty blocks below are placeholders for the sve options covered by the TODO
            if(NCNN_ARM86SVE2)
            endif()
            if(NCNN_ARM86SVEBF16)
            endif()
            if(NCNN_ARM86SVEI8MM)
            endif()
            if(NCNN_ARM86SVEF32MM)
            endif()
        endif()
        if(NOT CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
            set(ARM_MARCH_FLAG "-march=armv8.6-a+fp16+dotprod+sve")
            if(NCNN_ARM86SVE2)
                set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+sve2")
            endif()
            if(NCNN_ARM86SVEBF16)
                set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+bf16")
            endif()
            if(NCNN_ARM86SVEI8MM)
                set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+i8mm")
            endif()
            if(NCNN_ARM86SVEF32MM)
                set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+f32mm")
            endif()
        endif()
    elseif(NOT NCNN_RUNTIME_CPU AND (NCNN_ARM84BF16 OR NCNN_ARM84I8MM))
        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
            target_compile_options(ncnn PRIVATE /arch:armv8.4 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC /D__ARM_FEATURE_DOTPROD /D__ARM_FEATURE_FP16_FML)
            if(NCNN_ARM84BF16)
                target_compile_options(ncnn PRIVATE /D__ARM_FEATURE_BF16_VECTOR_ARITHMETIC)
            endif()
            if(NCNN_ARM84I8MM)
                target_compile_options(ncnn PRIVATE /D__ARM_FEATURE_MATMUL_INT8)
            endif()
        endif()
        if(NOT CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
            set(ARM_MARCH_FLAG "-march=armv8.4-a+fp16+dotprod")
            if(NCNN_ARM84BF16)
                set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+bf16")
            endif()
            if(NCNN_ARM84I8MM)
                set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+i8mm")
            endif()
        endif()
    elseif(NOT NCNN_RUNTIME_CPU AND NCNN_ARM82)
        if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
            target_compile_options(ncnn PRIVATE /arch:armv8.2 /D__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
            if(NCNN_ARM82DOT)
                target_compile_options(ncnn PRIVATE /D__ARM_FEATURE_DOTPROD)
            endif()
            if(NCNN_ARM82FP16FML)
                target_compile_options(ncnn PRIVATE /D__ARM_FEATURE_FP16_FML)
            endif()
        endif()
        if(NOT CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
            set(ARM_MARCH_FLAG "-march=armv8.2-a+fp16")
            if(NCNN_ARM82DOT)
                set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+dotprod")
            endif()
            if(NCNN_ARM82FP16FML)
                set(ARM_MARCH_FLAG "${ARM_MARCH_FLAG}+fp16fml")
                # clang 9.0.9 shipped with android ndk-r21 is missing __ARM_FEATURE_FP16_FML macro for asimdfhm target
                target_compile_options(ncnn PRIVATE -D__ARM_FEATURE_FP16_FML)
            endif()
        endif()
    endif()
    # ARM_MARCH_FLAG is only assigned in the branches above; when none of them ran it
    # expands to whatever value it had before this section (nothing if it was never set).
    target_compile_options(ncnn PRIVATE ${ARM_MARCH_FLAG})

    if(ANDROID_NDK_MAJOR AND (ANDROID_NDK_MAJOR GREATER_EQUAL 23))
        # llvm 12 in ndk-23 enables out-of-line atomics by default
        # disable this feature to fix atomic builtin link errors with old ndk
        target_compile_options(ncnn PRIVATE -mno-outline-atomics)
    endif()
endif()

# mips: MSA and Loongson MMI are enabled independently, and only when NCNN_RUNTIME_CPU is off.
if(NCNN_TARGET_ARCH STREQUAL "mips")
    if(NOT NCNN_RUNTIME_CPU AND NCNN_MSA)
        target_compile_options(ncnn PRIVATE -mmsa)
    endif()
    if(NOT NCNN_RUNTIME_CPU AND NCNN_MMI)
        target_compile_options(ncnn PRIVATE -mloongson-mmi)
    endif()
endif()

# loongarch: LASX is only considered when LSX is enabled, and only when NCNN_RUNTIME_CPU is off.
if(NCNN_TARGET_ARCH STREQUAL "loongarch")
    if(NOT NCNN_RUNTIME_CPU AND NCNN_LSX)
        target_compile_options(ncnn PRIVATE -mlsx)
        if(NCNN_LASX)
            target_compile_options(ncnn PRIVATE -mlasx)
        endif()
    endif()
endif()

# riscv -march selection (skipped for C906).
# The 64-bit and 32-bit cases differ only in the rv64 / rv32 prefix, so the prefix is
# picked first and a single copy of the extension logic follows. Pointer sizes other
# than 8 or 4 add no options at all.
if(NCNN_TARGET_ARCH STREQUAL "riscv" AND NOT C906)
    set(_ncnn_riscv_isa "")
    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
        set(_ncnn_riscv_isa "rv64")
    elseif(CMAKE_SIZEOF_VOID_P EQUAL 4)
        set(_ncnn_riscv_isa "rv32")
    endif()

    if(NOT _ncnn_riscv_isa STREQUAL "")
        if(NOT NCNN_RUNTIME_CPU AND NCNN_RVV)
            # standard vector extension, optionally with scalar / vector half precision
            set(RISCV_MARCH_FLAG "-march=${_ncnn_riscv_isa}gcv")
            if(NCNN_ZFH)
                set(RISCV_MARCH_FLAG "${RISCV_MARCH_FLAG}_zfh")
                target_compile_options(ncnn PRIVATE -D__fp16=_Float16)
            endif()
            if(NCNN_ZVFH)
                set(RISCV_MARCH_FLAG "${RISCV_MARCH_FLAG}_zvfh")
            endif()
        elseif(NOT NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR)
            # xtheadvector extension; with zfh the __riscv_zvfh macro is defined by hand
            set(RISCV_MARCH_FLAG "-march=${_ncnn_riscv_isa}gc_xtheadvector")
            if(NCNN_ZFH)
                set(RISCV_MARCH_FLAG "${RISCV_MARCH_FLAG}_zfh")
                target_compile_options(ncnn PRIVATE -D__riscv_zvfh=1 -D__fp16=_Float16)
            endif()
        endif()

        # RISCV_MARCH_FLAG keeps any earlier value when neither branch above assigned it
        target_compile_options(ncnn PRIVATE ${RISCV_MARCH_FLAG})
    endif()
endif()

# ppc64le VSX: define the x86 feature macros by hand and silence the x86 intrinsics warning
# (NO_WARN_X86_INTRINSICS).
if(NCNN_PPC64LE_VSX)
    # Auto-translate SSE2 to VSX if compiler is new enough.
    if(NCNN_VSX_SSE2)
        target_compile_options(ncnn PRIVATE -DNO_WARN_X86_INTRINSICS -D__SSE2__)
    endif()

    # Auto-translate SSE4.1 to VSX if compiler is new enough.
    if(NCNN_VSX_SSE41)
        target_compile_options(ncnn PRIVATE -DNO_WARN_X86_INTRINSICS -D__SSE4_1__)
    endif()
endif()

# Coverage instrumentation; PUBLIC, so targets linking ncnn are instrumented too.
if(NCNN_COVERAGE)
    target_compile_options(ncnn PUBLIC -coverage -fprofile-arcs -ftest-coverage)
    target_link_libraries(ncnn PUBLIC -coverage -lgcov)
endif()

# AddressSanitizer; PUBLIC, so targets linking ncnn are compiled and linked with it too.
if(NCNN_ASAN)
    target_compile_options(ncnn PUBLIC -fsanitize=address)
    target_link_libraries(ncnn PUBLIC -fsanitize=address)
endif()

# The ncnn-generate-spirv target must be built before the library itself.
add_dependencies(ncnn ncnn-generate-spirv)

# SDK installation: library, public headers, CMake package files and pkg-config file.
if(NCNN_INSTALL_SDK)
    include(GNUInstallDirs)

    # Package version file; AnyNewerVersion accepts any installed version >= the requested one.
    include(CMakePackageConfigHelpers)
    write_basic_package_version_file(
        ${CMAKE_CURRENT_BINARY_DIR}/ncnnConfigVersion.cmake
        VERSION ${NCNN_VERSION}
        COMPATIBILITY AnyNewerVersion
    )

    install(TARGETS ncnn EXPORT ncnn
        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
    )
    # Public headers: hand-written ones from the source directory followed by the
    # generated ones from the build directory.
    install(FILES
        allocator.h
        benchmark.h
        blob.h
        c_api.h
        command.h
        cpu.h
        datareader.h
        expression.h
        gpu.h
        layer.h
        layer_shader_type.h
        layer_type.h
        mat.h
        modelbin.h
        net.h
        option.h
        paramdict.h
        pipeline.h
        pipelinecache.h
        simpleocv.h
        simpleomp.h
        simplestl.h
        simplemath.h
        simplevk.h
        vulkan_header_fix.h
        ${CMAKE_CURRENT_BINARY_DIR}/ncnn_export.h
        ${CMAKE_CURRENT_BINARY_DIR}/layer_shader_type_enum.h
        ${CMAKE_CURRENT_BINARY_DIR}/layer_type_enum.h
        ${CMAKE_CURRENT_BINARY_DIR}/platform.h
        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ncnn
    )
    # CMake package: exported targets plus the config and version files.
    install(EXPORT ncnn DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ncnn)
    configure_file(${CMAKE_CURRENT_LIST_DIR}/../cmake/ncnnConfig.cmake.in ncnnConfig.cmake @ONLY)
    install(FILES
        ${CMAKE_CURRENT_BINARY_DIR}/ncnnConfig.cmake
        ${CMAKE_CURRENT_BINARY_DIR}/ncnnConfigVersion.cmake
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ncnn)
    # pkgconfig
    configure_file(ncnn.pc.in ${CMAKE_CURRENT_BINARY_DIR}/ncnn.pc @ONLY)
    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ncnn.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
endif()

# add ncnn and generate-spirv to a virtual project group
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
set_property(TARGET ncnn PROPERTY FOLDER "libncnn")
set_property(TARGET ncnn-generate-spirv PROPERTY FOLDER "libncnn")


================================================
FILE: src/allocator.cpp
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "allocator.h"

#include "gpu.h"
#include "pipeline.h"

#if __ANDROID_API__ >= 26
#include <android/hardware_buffer.h>
#endif // __ANDROID_API__ >= 26

namespace ncnn {

// Out-of-line destructor of the allocator base class; it has nothing to release.
Allocator::~Allocator()
{
}

// Private state of PoolAllocator. The two lists are protected by separate mutexes.
class PoolAllocatorPrivate
{
public:
    // guards budgets
    Mutex budgets_lock;
    // guards payouts
    Mutex payouts_lock;
    // reuse ratio in 1/256 units, set from a float in [0,1] by set_size_compare_ratio()
    unsigned int size_compare_ratio; // 0~256
    // once budgets holds at least this many chunks, a failed lookup may drop one of them
    size_t size_drop_threshold;
    // (size, pointer) of chunks that were given back and can be reused
    std::list<std::pair<size_t, void*> > budgets;
    // (size, pointer) of chunks currently handed out by fastMalloc()
    std::list<std::pair<size_t, void*> > payouts;
};

// Create an empty pool with the default tuning values:
// compare ratio 0 (any cached chunk that is large enough may be reused) and
// a drop threshold of 10 cached chunks.
PoolAllocator::PoolAllocator()
    : Allocator(), d(new PoolAllocatorPrivate)
{
    PoolAllocatorPrivate& state = *d;

    state.size_drop_threshold = 10;
    state.size_compare_ratio = 0;
}

// Release all cached chunks and the private state.
// Chunks that are still handed out at this point are not freed here; they are only
// reported, because their owners still hold the pointers.
PoolAllocator::~PoolAllocator()
{
    clear();

    const bool has_outstanding_chunks = !d->payouts.empty();
    if (has_outstanding_chunks)
    {
        NCNN_LOGE("FATAL ERROR! pool allocator destroyed too early");
#if NCNN_STDIO
        // list every pointer that was never returned through fastFree()
        typedef std::list<std::pair<size_t, void*> >::iterator payout_iterator;
        payout_iterator leaked = d->payouts.begin();
        while (leaked != d->payouts.end())
        {
            void* ptr = leaked->second;
            NCNN_LOGE("%p still in use", ptr);
            ++leaked;
        }
#endif
    }

    delete d;
}

// Copy constructor stub: nothing is copied and the private pointer is left null.
// NOTE(review): presumably declared inaccessible in allocator.h so that pools cannot be
// copied - confirm; an object built this way would dereference a null d in its destructor.
PoolAllocator::PoolAllocator(const PoolAllocator&)
    : d(0)
{
}

// Assignment stub: the right-hand side is ignored and this object is left unchanged.
// NOTE(review): presumably declared inaccessible in allocator.h - confirm.
PoolAllocator& PoolAllocator::operator=(const PoolAllocator&)
{
    return *this;
}

// Free every cached (returned) chunk and empty the cache.
// Chunks that are currently handed out are not touched.
void PoolAllocator::clear()
{
    typedef std::list<std::pair<size_t, void*> > chunk_list;

    d->budgets_lock.lock();

    chunk_list& cached = d->budgets;
    chunk_list::iterator chunk = cached.begin();
    while (chunk != cached.end())
    {
        ncnn::fastFree(chunk->second);
        ++chunk;
    }
    cached.clear();

    d->budgets_lock.unlock();
}

// Set how closely a cached chunk must match a request before it is reused.
// scr must lie in [0, 1]; it is stored as a fixed-point value in 1/256 units.
// Values outside that range, including NaN, are rejected with an error log and the
// current setting is kept.
void PoolAllocator::set_size_compare_ratio(float scr)
{
    // written as a negated range test so that NaN, for which every comparison is false,
    // is rejected too instead of reaching the float -> unsigned conversion below
    if (!(scr >= 0.f && scr <= 1.f))
    {
        NCNN_LOGE("invalid size compare ratio %f", scr);
        return;
    }

    d->size_compare_ratio = (unsigned int)(scr * 256);
}

// Store the number of cached chunks at which fastMalloc() starts dropping a cached chunk
// when no cached chunk fits a request. The value is stored without validation.
void PoolAllocator::set_size_drop_threshold(size_t threshold)
{
    d->size_drop_threshold = threshold;
}

// Hand out a chunk of at least size bytes.
// A cached chunk is reused when it is large enough and, scaled by size_compare_ratio/256,
// not larger than the request. Otherwise a new chunk is allocated with ncnn::fastMalloc;
// if the cache already holds size_drop_threshold chunks, one cached chunk may be released
// first. Every returned chunk is recorded in payouts until it comes back via fastFree().
void* PoolAllocator::fastMalloc(size_t size)
{
    d->budgets_lock.lock();

    // find free budget
    std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin(), it_max = d->budgets.begin(), it_min = d->budgets.begin();
    for (; it != d->budgets.end(); ++it)
    {
        size_t bs = it->first;

        // floor(bs * size_compare_ratio / 256), computed from the high and low parts of bs
        // so that the product cannot overflow size_t for large chunks on 32-bit targets
        size_t bs_scaled = (bs >> 8) * d->size_compare_ratio + (((bs & 255) * d->size_compare_ratio) >> 8);

        // size_compare_ratio ~ 100%
        if (bs >= size && bs_scaled <= size)
        {
            void* ptr = it->second;

            d->budgets.erase(it);

            d->budgets_lock.unlock();

            d->payouts_lock.lock();

            d->payouts.push_back(std::make_pair(bs, ptr));

            d->payouts_lock.unlock();

            return ptr;
        }

        // remember the smallest and the largest cached chunk seen so far
        if (bs < it_min->first)
        {
            it_min = it;
        }
        if (bs > it_max->first)
        {
            it_max = it;
        }
    }

    // it_min / it_max equal end() while the cache is empty and must not be dereferenced;
    // the size test alone would still pass in that state when the threshold is 0
    if (!d->budgets.empty() && d->budgets.size() >= d->size_drop_threshold)
    {
        // All chunks in pool are not chosen. Then try to drop some outdated
        // chunks and return them to OS.
        if (it_max->first < size)
        {
            // Current query is asking for a chunk larger than any cached chunks.
            // Then remove the smallest one.
            ncnn::fastFree(it_min->second);
            d->budgets.erase(it_min);
        }
        else if (it_min->first > size)
        {
            // Current query is asking for a chunk smaller than any cached chunks.
            // Then remove the largest one.
            ncnn::fastFree(it_max->second);
            d->budgets.erase(it_max);
        }
    }

    d->budgets_lock.unlock();

    // new
    void* ptr = ncnn::fastMalloc(size);

    d->payouts_lock.lock();

    d->payouts.push_back(std::make_pair(size, ptr));

    d->payouts_lock.unlock();

    return ptr;
}

// Give a chunk obtained from fastMalloc() back to the pool cache.
//
// The chunk is looked up in payouts to recover its recorded size and is then
// appended to budgets. A pointer that is not found in payouts is reported and
// released directly with ncnn::fastFree.
void PoolAllocator::fastFree(void* ptr)
{
    typedef std::list<std::pair<size_t, void*> >::iterator chunk_iterator;

    bool found = false;
    size_t chunk_size = 0;

    // take the record out of payouts
    d->payouts_lock.lock();
    for (chunk_iterator entry = d->payouts.begin(); entry != d->payouts.end(); ++entry)
    {
        if (entry->second != ptr)
            continue;

        chunk_size = entry->first;
        d->payouts.erase(entry);
        found = true;
        break;
    }
    d->payouts_lock.unlock();

    if (!found)
    {
        NCNN_LOGE("FATAL ERROR! pool allocator get wild %p", ptr);
        ncnn::fastFree(ptr);
        return;
    }

    // return to budgets
    d->budgets_lock.lock();
    d->budgets.push_back(std::make_pair(chunk_size, ptr));
    d->budgets_lock.unlock();
}

// Private state of UnlockedPoolAllocator. It holds no lock members; the
// allocator methods access it without any synchronization.
class UnlockedPoolAllocatorPrivate
{
public:
    // reuse ratio scaled by 256, written by set_size_compare_ratio()
    unsigned int size_compare_ratio; // 0~256
    // cached chunk count at which fastMalloc() starts dropping a cached chunk
    size_t size_drop_threshold;
    // cached free chunks as (size in bytes, pointer)
    std::list<std::pair<size_t, void*> > budgets;
    // chunks currently handed out as (size in bytes, pointer)
    std::list<std::pair<size_t, void*> > payouts;
};

// Create an empty pool with the default tuning: cache trimming starts at 10
// cached chunks, and the compare ratio is 0 so any cached chunk that is large
// enough may be reused.
UnlockedPoolAllocator::UnlockedPoolAllocator()
    : Allocator(), d(new UnlockedPoolAllocatorPrivate)
{
    d->size_drop_threshold = 10;
    d->size_compare_ratio = 0;
}

// Release all cached chunks and the private state. Chunks that are still
// handed out at this point are reported but not freed.
UnlockedPoolAllocator::~UnlockedPoolAllocator()
{
    // free everything sitting in the cache
    clear();

    const bool has_outstanding = !d->payouts.empty();
    if (has_outstanding)
    {
        NCNN_LOGE("FATAL ERROR! unlocked pool allocator destroyed too early");
#if NCNN_STDIO
        typedef std::list<std::pair<size_t, void*> >::iterator chunk_iterator;
        for (chunk_iterator entry = d->payouts.begin(); entry != d->payouts.end(); ++entry)
        {
            void* leaked = entry->second;
            NCNN_LOGE("%p still in use", leaked);
        }
#endif
    }

    delete d;
}

// Copy constructor: copies nothing from the source and leaves d null.
// NOTE(review): presumably declared inaccessible to forbid copying - confirm in
// the header; an object built this way would dereference null d in clear().
UnlockedPoolAllocator::UnlockedPoolAllocator(const UnlockedPoolAllocator&)
    : d(0)
{
}

// Assignment is a no-op: the right-hand side is ignored and *this is returned
// unchanged.
UnlockedPoolAllocator& UnlockedPoolAllocator::operator=(const UnlockedPoolAllocator&)
{
    return *this;
}

// Release every cached (free) chunk with ncnn::fastFree and empty the cache.
// Chunks that are currently handed out are not touched.
void UnlockedPoolAllocator::clear()
{
    while (!d->budgets.empty())
    {
        ncnn::fastFree(d->budgets.front().second);
        d->budgets.pop_front();
    }
}

// Set the reuse ratio used by fastMalloc(): a cached chunk of bs bytes is only
// reused for a request of `size` bytes when bs * scr <= size.
//
// scr must lie in [0, 1]; it is stored scaled by 256. Any other value,
// including NaN, is reported and ignored.
void UnlockedPoolAllocator::set_size_compare_ratio(float scr)
{
    // written as a negated in-range test so that NaN is rejected as well;
    // converting NaN to unsigned int would be undefined behaviour
    if (!(scr >= 0.f && scr <= 1.f))
    {
        NCNN_LOGE("invalid size compare ratio %f", scr);
        return;
    }

    d->size_compare_ratio = (unsigned int)(scr * 256);
}

// Set the number of cached (free) chunks at which fastMalloc() starts to
// release one cached chunk whenever a request cannot be served from the cache.
void UnlockedPoolAllocator::set_size_drop_threshold(size_t threshold)
{
    d->size_drop_threshold = threshold;
}

// Return a chunk of at least `size` bytes, reusing a cached chunk if possible.
//
// A cached chunk of bs bytes is reused when bs >= size and
// floor(bs * size_compare_ratio / 256) <= size. When no cached chunk matches and
// the cache already holds size_drop_threshold chunks, one cached chunk is
// released (the smallest if the request is larger than every cached chunk, the
// largest if the request is smaller than every cached chunk). A fresh chunk is
// then obtained from ncnn::fastMalloc.
//
// Every chunk handed out is recorded in payouts so fastFree() can find its size.
// No locking is performed.
void* UnlockedPoolAllocator::fastMalloc(size_t size)
{
    // find free budget
    std::list<std::pair<size_t, void*> >::iterator it = d->budgets.begin(), it_max = d->budgets.begin(), it_min = d->budgets.begin();
    for (; it != d->budgets.end(); ++it)
    {
        size_t bs = it->first;

        // floor(bs * ratio / 256), evaluated piecewise so the product cannot
        // wrap around size_t for large chunks (>= 16MiB when size_t is 32-bit)
        const size_t scaled_bs = (bs >> 8) * d->size_compare_ratio + (((bs & 255) * d->size_compare_ratio) >> 8);

        // size_compare_ratio ~ 100%
        if (bs >= size && scaled_bs <= size)
        {
            void* ptr = it->second;

            d->budgets.erase(it);

            d->payouts.push_back(std::make_pair(bs, ptr));

            return ptr;
        }

        if (bs > it_max->first)
        {
            it_max = it;
        }
        if (bs < it_min->first)
        {
            it_min = it;
        }
    }

    // it_min / it_max equal end() for an empty cache and must not be
    // dereferenced then (reachable when the threshold is set to 0)
    if (!d->budgets.empty() && d->budgets.size() >= d->size_drop_threshold)
    {
        if (it_max->first < size)
        {
            // request is larger than every cached chunk, drop the smallest one
            ncnn::fastFree(it_min->second);
            d->budgets.erase(it_min);
        }
        else if (it_min->first > size)
        {
            // request is smaller than every cached chunk, drop the largest one
            ncnn::fastFree(it_max->second);
            d->budgets.erase(it_max);
        }
    }

    // new
    void* ptr = ncnn::fastMalloc(size);

    d->payouts.push_back(std::make_pair(size, ptr));

    return ptr;
}

// Give a chunk obtained from fastMalloc() back to the pool cache.
//
// The chunk is looked up in payouts to recover its recorded size and is then
// appended to budgets. A pointer that is not found in payouts is reported and
// released directly with ncnn::fastFree.
void UnlockedPoolAllocator::fastFree(void* ptr)
{
    // locate the payout record of this pointer
    std::list<std::pair<size_t, void*> >::iterator entry = d->payouts.begin();
    while (entry != d->payouts.end() && entry->second != ptr)
    {
        ++entry;
    }

    if (entry == d->payouts.end())
    {
        NCNN_LOGE("FATAL ERROR! unlocked pool allocator get wild %p", ptr);
        ncnn::fastFree(ptr);
        return;
    }

    // return to budgets
    const size_t chunk_size = entry->first;
    d->payouts.erase(entry);
    d->budgets.push_back(std::make_pair(chunk_size, ptr));
}

#if NCNN_VULKAN
// Bind the allocator to a device. All memory type indices start out as
// (uint32_t)-1, meaning "not chosen yet", and the memory is assumed to be
// neither mappable nor coherent until a memory type is picked.
VkAllocator::VkAllocator(const VulkanDevice* _vkdev)
    : vkdev(_vkdev)
{
    const uint32_t not_chosen = (uint32_t)-1;

    buffer_memory_type_index = not_chosen;
    image_memory_type_index = not_chosen;
    reserved_type_index = not_chosen;

    coherent = false;
    mappable = false;
}

// Destructor: calls clear().
// NOTE(review): if clear() is virtual, the call made from this destructor
// resolves to VkAllocator::clear(), not to an override in a derived class.
VkAllocator::~VkAllocator()
{
    clear();
}

// The base allocator owns no blocks, so there is nothing to release here.
void VkAllocator::clear()
{
}

// Smallest multiple of `multiple` that is not less than n.
// `multiple` must be non-zero.
static inline size_t round_up(size_t n, size_t multiple)
{
    const size_t bumped = n + multiple - 1;
    return bumped - bumped % multiple;
}

// Largest multiple of `multiple` that is not greater than n.
// `multiple` must be non-zero.
static inline size_t round_down(size_t n, size_t multiple)
{
    const size_t remainder = n % multiple;
    return n - remainder;
}

// Flush host writes to the mapped range backing `ptr`.
//
// Does nothing for coherent memory. Otherwise the range [offset, offset +
// capacity) is widened to non_coherent_atom_size boundaries and passed to
// vkFlushMappedMemoryRanges. Returns 0 on success and -1 on failure.
int VkAllocator::flush(VkBufferMemory* ptr)
{
    if (coherent)
        return 0;

    const size_t atom_size = vkdev->info.non_coherent_atom_size();

    VkMappedMemoryRange range;
    range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
    range.pNext = 0;
    range.memory = ptr->memory;
    // start is rounded down and end is rounded up to the atom size
    range.offset = round_down(ptr->offset, atom_size);
    range.size = round_up(ptr->offset + ptr->capacity, atom_size) - range.offset;

    const VkResult status = vkFlushMappedMemoryRanges(vkdev->vkdevice(), 1, &range);
    if (status == VK_SUCCESS)
        return 0;

    NCNN_LOGE("vkFlushMappedMemoryRanges failed %d", status);
    return -1;
}

// Invalidate the mapped range backing `ptr` so device writes become visible
// to the host.
//
// Does nothing for coherent memory. Otherwise the range [offset, offset +
// capacity) is widened to non_coherent_atom_size boundaries and passed to
// vkInvalidateMappedMemoryRanges. Returns 0 on success and -1 on failure.
int VkAllocator::invalidate(VkBufferMemory* ptr)
{
    if (coherent)
        return 0;

    const size_t atom_size = vkdev->info.non_coherent_atom_size();

    VkMappedMemoryRange range;
    range.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
    range.pNext = 0;
    range.memory = ptr->memory;
    // start is rounded down and end is rounded up to the atom size
    range.offset = round_down(ptr->offset, atom_size);
    range.size = round_up(ptr->offset + ptr->capacity, atom_size) - range.offset;

    const VkResult status = vkInvalidateMappedMemoryRanges(vkdev->vkdevice(), 1, &range);
    if (status == VK_SUCCESS)
        return 0;

    NCNN_LOGE("vkInvalidateMappedMemoryRanges failed %d", status);
    return -1;
}

// Create an exclusive-sharing VkBuffer of `size` bytes with the given usage.
// No memory is bound here. Returns 0 when vkCreateBuffer fails.
VkBuffer VkAllocator::create_buffer(size_t size, VkBufferUsageFlags usage)
{
    VkBufferCreateInfo info;
    info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
    info.pNext = 0;
    info.flags = 0;
    info.size = size;
    info.usage = usage;
    // exclusive sharing, so no queue family list is supplied
    info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
    info.queueFamilyIndexCount = 0;
    info.pQueueFamilyIndices = 0;

    VkBuffer handle = 0;
    const VkResult status = vkCreateBuffer(vkdev->vkdevice(), &info, 0, &handle);
    if (status == VK_SUCCESS)
        return handle;

    NCNN_LOGE("vkCreateBuffer failed %d", status);
    return 0;
}

// Allocate `size` bytes of device memory from the given memory type.
// Returns 0 when vkAllocateMemory fails.
VkDeviceMemory VkAllocator::allocate_memory(size_t size, uint32_t memory_type_index)
{
    VkMemoryAllocateInfo info;
    info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    info.pNext = 0;
    info.allocationSize = size;
    info.memoryTypeIndex = memory_type_index;

    VkDeviceMemory handle = 0;
    const VkResult status = vkAllocateMemory(vkdev->vkdevice(), &info, 0, &handle);
    if (status == VK_SUCCESS)
        return handle;

    NCNN_LOGE("vkAllocateMemory failed %d", status);
    return 0;
}

// Allocate `size` bytes of device memory dedicated to the given image or
// buffer by chaining VkMemoryDedicatedAllocateInfoKHR into the allocation.
// Returns 0 when vkAllocateMemory fails.
VkDeviceMemory VkAllocator::allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer)
{
    // extension struct naming the resource the allocation is dedicated to
    VkMemoryDedicatedAllocateInfoKHR dedicated;
    dedicated.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR;
    dedicated.pNext = 0;
    dedicated.image = image;
    dedicated.buffer = buffer;

    VkMemoryAllocateInfo info;
    info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    info.pNext = &dedicated;
    info.allocationSize = size;
    info.memoryTypeIndex = memory_type_index;

    VkDeviceMemory handle = 0;
    const VkResult status = vkAllocateMemory(vkdev->vkdevice(), &info, 0, &handle);
    if (status == VK_SUCCESS)
        return handle;

    NCNN_LOGE("vkAllocateMemory failed %d", status);
    return 0;
}

// Create a device memory object of `size` bytes that imports the host
// allocation at host_ptr by chaining VkImportMemoryHostPointerInfoEXT into the
// allocation. Returns 0 when vkAllocateMemory fails.
VkDeviceMemory VkAllocator::allocate_import_host_memory(size_t size, uint32_t memory_type_index, void* host_ptr)
{
    // extension struct describing the host pointer to import
    VkImportMemoryHostPointerInfoEXT import_info;
    import_info.sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_HOST_POINTER_INFO_EXT;
    import_info.pNext = 0;
    import_info.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT;
    import_info.pHostPointer = host_ptr;

    VkMemoryAllocateInfo info;
    info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    info.pNext = &import_info;
    info.allocationSize = size;
    info.memoryTypeIndex = memory_type_index;

    VkDeviceMemory handle = 0;
    const VkResult status = vkAllocateMemory(vkdev->vkdevice(), &info, 0, &handle);
    if (status == VK_SUCCESS)
        return handle;

    NCNN_LOGE("vkAllocateMemory failed %d", status);
    return 0;
}

// Create a 3D, single-mip, single-layer, single-sample VkImage with exclusive
// sharing and undefined initial layout. No memory is bound here.
// Returns 0 when vkCreateImage fails.
VkImage VkAllocator::create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage)
{
    VkImageCreateInfo imageCreateInfo;
    imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
    imageCreateInfo.pNext = 0;
    imageCreateInfo.flags = 0;
    imageCreateInfo.imageType = VK_IMAGE_TYPE_3D;
    imageCreateInfo.format = format;
    imageCreateInfo.extent.width = width;
    imageCreateInfo.extent.height = height;
    imageCreateInfo.extent.depth = depth;
    imageCreateInfo.mipLevels = 1;
    imageCreateInfo.arrayLayers = 1;
    imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT;
    imageCreateInfo.tiling = tiling;
    imageCreateInfo.usage = usage;
    // exclusive sharing, so no queue family list is supplied
    imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
    imageCreateInfo.queueFamilyIndexCount = 0;
    imageCreateInfo.pQueueFamilyIndices = 0;
    imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;

    // start from a null handle, same as create_buffer does
    VkImage image = 0;
    VkResult ret = vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateImage failed %d %d %d %d %d %d %d", ret, width, height, depth, format, tiling, usage);
        return 0;
    }

    return image;
}

// Create a 3D colour image view over `image` with identity swizzle, covering
// mip level 0 and array layer 0. Returns 0 when vkCreateImageView fails.
VkImageView VkAllocator::create_imageview(VkImage image, VkFormat format)
{
    VkImageViewCreateInfo info;
    info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
    info.pNext = 0;
    info.flags = 0;
    info.image = image;
    info.viewType = VK_IMAGE_VIEW_TYPE_3D;
    info.format = format;

    // no channel remapping
    info.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
    info.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
    info.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
    info.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;

    // the single mip level and array layer of the colour aspect
    info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
    info.subresourceRange.baseMipLevel = 0;
    info.subresourceRange.levelCount = 1;
    info.subresourceRange.baseArrayLayer = 0;
    info.subresourceRange.layerCount = 1;

    VkImageView view;
    const VkResult status = vkCreateImageView(vkdev->vkdevice(), &info, 0, &view);
    if (status == VK_SUCCESS)
        return view;

    NCNN_LOGE("vkCreateImageView failed %d", status);
    return 0;
}

// Least common multiple of a and b, used to combine alignment requirements.
//
// A zero operand is treated as "no requirement": the other operand is returned
// (so the result is 0 only when both are 0). The previous implementation
// divided by zero in that case.
static inline size_t least_common_multiple(size_t a, size_t b)
{
    if (a == 0 || b == 0)
        return a > b ? a : b;

    // greatest common divisor by Euclid's algorithm
    size_t gcd = a;
    size_t rest = b;
    while (rest != 0)
    {
        const size_t next = gcd % rest;
        gcd = rest;
        rest = next;
    }

    // divide before multiplying to keep the intermediate value small
    return a / gcd * b;
}

// Private state of VkBlobAllocator.
class VkBlobAllocatorPrivate
{
public:
    // minimum size of a newly created block, aligned to buffer_offset_alignment
    size_t block_size;
    // alignment applied to every buffer sub-allocation size
    size_t buffer_offset_alignment;
    // lower bound for the alignment of image memory bind offsets
    size_t bind_memory_offset_alignment;
    // per buffer block: free ranges as (offset, size), parallel to buffer_blocks
    std::vector<std::list<std::pair<size_t, size_t> > > buffer_budgets;
    // buffer blocks owned by the allocator
    std::vector<VkBufferMemory*> buffer_blocks;
    // per image memory block: free ranges as (offset, size), parallel to image_memory_blocks
    std::vector<std::list<std::pair<size_t, size_t> > > image_memory_budgets;
    // device memory blocks used for image binding
    std::vector<VkDeviceMemory> image_memory_blocks;
};

// Create a blob allocator for the device.
//
// The buffer offset alignment starts from the device's buffer offset alignment
// and is widened (by least common multiple) with the memory map alignment and
// non-coherent atom size on integrated gpus, and with the robust storage buffer
// access size alignment when a robustness2 extension is supported. The block
// size is preferred_block_size rounded to that alignment.
VkBlobAllocator::VkBlobAllocator(const VulkanDevice* _vkdev, size_t preferred_block_size)
    : VkAllocator(_vkdev), d(new VkBlobAllocatorPrivate)
{
    size_t offset_alignment = vkdev->info.buffer_offset_alignment();
    d->bind_memory_offset_alignment = vkdev->info.buffer_image_granularity();

    const bool is_integrated_gpu = vkdev->info.type() == 1;
    if (is_integrated_gpu)
    {
        // on integrated gpu, there may be device local only memory too, eg. AMD APU
        // a larger alignment is assumed to be always safe

        // combine with memory_map_alignment and non_coherent_atom_size
        offset_alignment = least_common_multiple(offset_alignment, vkdev->info.memory_map_alignment());
        offset_alignment = least_common_multiple(offset_alignment, vkdev->info.non_coherent_atom_size());
    }

    if (vkdev->info.support_VK_KHR_robustness2() || vkdev->info.support_VK_EXT_robustness2())
    {
        const size_t robust_alignment = vkdev->info.queryRobustness2Properties().robustStorageBufferAccessSizeAlignment;
        offset_alignment = least_common_multiple(offset_alignment, robust_alignment);
    }

    d->buffer_offset_alignment = offset_alignment;
    d->block_size = alignSize(preferred_block_size, offset_alignment);
}

// Release every block owned by the allocator, then the private state.
VkBlobAllocator::~VkBlobAllocator()
{
    clear();

    delete d;
}

// Copy constructor: copies nothing from the source; both the device pointer
// and d are left null.
// NOTE(review): presumably declared inaccessible to forbid copying - confirm in
// the header; an object built this way would dereference null d in clear().
VkBlobAllocator::VkBlobAllocator(const VkBlobAllocator&)
    : VkAllocator(0), d(0)
{
}

// Assignment is a no-op: the right-hand side is ignored and *this is returned
// unchanged.
VkBlobAllocator& VkBlobAllocator::operator=(const VkBlobAllocator&)
{
    return *this;
}

void VkBlobAllocator::clear()
{
    //     NCNN_LOGE("VkBlobAllocator %lu", buffer_blocks.size());

    for (size_t i = 0; i < d->buffer_blocks.size(); i++)
    {
        VkBufferMemory* ptr = d->buffer_blocks[i];

        //         std::list< std::pair<size_t, size_t> >::iterator it = buffer_budgets[i].begin();
        //         while (it != buffer_budgets[i].end())
        //         {
        //             NCNN_LOGE("VkBlobAllocator budget %p %lu %lu", ptr->buffer, it->first, it->second);
        //             it++;
        //         }

        if (mappable)
            vkUnmapMemory(vkdev->vkdevice(), ptr->memory);

        vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
        vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);

        delete ptr;
    }
    d->buffer_blocks.clear();

    d->buffer_budgets.clear();

    for (size_t i = 0; i < d->image_memory_blocks.size(); i++)
    {
        VkDeviceMemory memory = d->image_memory_blocks[i];

        //         std::list< std::pair<size_t, size_t> >::iterator it = d->image_memory_budgets[i].begin();
        //         while (it != d->image_memory_budgets[i].end())
        //         {
        //             NCNN_LOGE("VkBlobAllocator budget %p %lu %lu", memory, it->first, it->second);
        //             it++;
        //         }

        vkFreeMemory(vkdev->vkdevice(), memory, 0);
    }
    d->image_memory_blocks.clear();

    d->image_memory_budgets.clear();
}

// Sub-allocate `size` bytes (rounded up to buffer_offset_alignment) of buffer
// storage.
//
// The first free range that is large enough in any existing block is used.
// Otherwise a new block of max(block_size, aligned size) bytes is created,
// bound to freshly allocated memory at offset 0, mapped when the memory type is
// mappable, and the request is carved from its start.
//
// Returns a new VkBufferMemory describing the sub-range (to be released with
// fastFree), or 0 when buffer creation, memory allocation, binding or mapping
// fails.
VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size)
{
    size_t aligned_size = alignSize(size, d->buffer_offset_alignment);

    const int buffer_block_count = d->buffer_blocks.size();

    // find first spare space in buffer_blocks
    for (int i = 0; i < buffer_block_count; i++)
    {
        std::list<std::pair<size_t, size_t> >::iterator it = d->buffer_budgets[i].begin();
        while (it != d->buffer_budgets[i].end())
        {
            size_t budget_size = it->second;
            if (budget_size < aligned_size)
            {
                it++;
                continue;
            }

            // return sub buffer
            VkBufferMemory* ptr = new VkBufferMemory;

            ptr->buffer = d->buffer_blocks[i]->buffer;
            ptr->offset = it->first;
            ptr->memory = d->buffer_blocks[i]->memory;
            ptr->capacity = aligned_size;
            ptr->mapped_ptr = d->buffer_blocks[i]->mapped_ptr;
            ptr->memory_type_index = d->buffer_blocks[i]->memory_type_index;
            ptr->access_flags = 0;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

            // adjust buffer_budgets
            if (budget_size == aligned_size)
            {
                // the free range is consumed entirely
                d->buffer_budgets[i].erase(it);
            }
            else
            {
                // shrink the free range from its front
                it->first += aligned_size;
                it->second -= aligned_size;
            }

            //             NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);

            return ptr;
        }
    }

    size_t new_block_size = std::max(d->block_size, aligned_size);

    // create new block
    VkBufferMemory* block = new VkBufferMemory;

    block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    if (!block->buffer)
    {
        // create_buffer has already reported the failure; a null handle must
        // not reach vkGetBufferMemoryRequirements
        delete block;
        return 0;
    }
    block->offset = 0;

    // TODO respect VK_KHR_dedicated_allocation ?

    VkMemoryRequirements memoryRequirements;
    vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements);

    // setup memory type and alignment
    if (buffer_memory_type_index == (uint32_t)-1)
    {
        if (vkdev->info.type() == 1)
        {
            // integrated gpu, prefer unified memory
            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

            // on amd integrated gpu, there is a faster and larger device-only heap
            uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
            const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physicalDeviceMemoryProperties();
            uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
            uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
            if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
            {
                buffer_memory_type_index = device_local_memory_type_index;
            }
        }
        else
        {
            // discrete gpu, device local
            buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
        }

        mappable = vkdev->is_mappable(buffer_memory_type_index);
        coherent = vkdev->is_coherent(buffer_memory_type_index);
    }

    block->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);
    if (!block->memory)
    {
        vkDestroyBuffer(vkdev->vkdevice(), block->buffer, 0);
        delete block;
        return 0;
    }

    // ignore memoryRequirements.alignment as we always bind at zero offset
    VkResult ret = vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkBindBufferMemory failed %d", ret);
        vkDestroyBuffer(vkdev->vkdevice(), block->buffer, 0);
        vkFreeMemory(vkdev->vkdevice(), block->memory, 0);
        delete block;
        return 0;
    }

    block->mapped_ptr = 0;
    if (mappable)
    {
        ret = vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);
        if (ret != VK_SUCCESS)
        {
            // the block is not registered yet, so release it here; clear()
            // would otherwise unmap memory that was never mapped
            NCNN_LOGE("vkMapMemory failed %d", ret);
            vkDestroyBuffer(vkdev->vkdevice(), block->buffer, 0);
            vkFreeMemory(vkdev->vkdevice(), block->memory, 0);
            delete block;
            return 0;
        }
    }

    block->memory_type_index = buffer_memory_type_index;

    d->buffer_blocks.push_back(block);

    // return sub buffer
    VkBufferMemory* ptr = new VkBufferMemory;

    ptr->buffer = block->buffer;
    ptr->offset = 0;
    ptr->memory = block->memory;
    ptr->capacity = aligned_size;
    ptr->mapped_ptr = block->mapped_ptr;
    ptr->memory_type_index = block->memory_type_index;
    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    // adjust buffer_budgets
    // the remainder of the new block (if any) becomes its only free range
    std::list<std::pair<size_t, size_t> > budget;
    if (new_block_size > aligned_size)
    {
        budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size));
    }
    d->buffer_budgets.push_back(budget);

    //     NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);

    return ptr;
}

// Return a buffer sub-allocation to its block and delete the descriptor.
//
// The owning block is identified by matching both the buffer and the memory
// handle. The freed range is merged with a free range ending exactly at its
// start and/or starting exactly at its end; without such a neighbour it is
// added as a new free range (at the front of the list when it starts at offset
// 0, at the back otherwise). A descriptor that belongs to no block is reported
// and deleted.
void VkBlobAllocator::fastFree(VkBufferMemory* ptr)
{
    //     NCNN_LOGE("VkBlobAllocator F %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity);

    typedef std::list<std::pair<size_t, size_t> > budget_list;

    // locate the block this sub buffer was carved from
    int block_index = -1;
    const int buffer_block_count = d->buffer_blocks.size();
    for (int i = 0; i < buffer_block_count && block_index == -1; i++)
    {
        const VkBufferMemory* block = d->buffer_blocks[i];
        if (block->buffer == ptr->buffer && block->memory == ptr->memory)
        {
            block_index = i;
        }
    }

    if (block_index == -1)
    {
        NCNN_LOGE("FATAL ERROR! unlocked VkBlobAllocator get wild %p", ptr->buffer);

        delete ptr;

        return;
    }

    budget_list& budgets = d->buffer_budgets[block_index];

    // look for free ranges directly adjacent to the freed one
    budget_list::iterator left = budgets.end();
    budget_list::iterator right = budgets.end();
    for (budget_list::iterator it = budgets.begin(); it != budgets.end(); ++it)
    {
        if (it->first + it->second == ptr->offset)
        {
            left = it;
        }
        else if (ptr->offset + ptr->capacity == it->first)
        {
            right = it;
        }
    }

    const bool has_left = left != budgets.end();
    const bool has_right = right != budgets.end();

    if (has_left && has_right)
    {
        // bridge both neighbours into the left one
        left->second = right->first + right->second - left->first;
        budgets.erase(right);
    }
    else if (has_left)
    {
        // extend the left neighbour over the freed range
        left->second = ptr->offset + ptr->capacity - left->first;
    }
    else if (has_right)
    {
        // pull the right neighbour back to the freed offset
        right->second = right->first + right->second - ptr->offset;
        right->first = ptr->offset;
    }
    else if (ptr->offset == 0)
    {
        // chain leading block
        budgets.push_front(std::make_pair(ptr->offset, ptr->capacity));
    }
    else
    {
        budgets.push_back(std::make_pair(ptr->offset, ptr->capacity));
    }

    delete ptr;
}

// Create a 3D storage image of w x h x c texels and bind it to pooled device
// memory.
//
// elempack must be 1, 4 or 8 and elemsize / elempack must be 4 (fp32), 2
// (fp16) or 1 (int8). elempack 8 uses the 4-channel format with the image
// width doubled.
//
// The image is bound into the first free range of an existing image memory
// block that can hold it at a suitably aligned offset; otherwise a new block of
// max(block_size, aligned size) bytes is allocated and the image is bound at
// offset 0. On Apple platforms existing blocks are never reused and each image
// gets its own exactly-sized block.
//
// Returns a new VkImageMemory (to be released with fastFree), or 0 when the
// arguments are unsupported, the image is too large for the device, or image
// creation / memory allocation fails.
VkImageMemory* VkBlobAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int elempack)
{
    if (elempack != 1 && elempack != 4 && elempack != 8)
    {
        NCNN_LOGE("elempack must be 1 4 8");
        return 0;
    }

    // resolve format
    VkFormat format = VK_FORMAT_UNDEFINED;

    if (elemsize / elempack == 4)
    {
        // fp32
        if (elempack == 1) format = VK_FORMAT_R32_SFLOAT;
        if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT;
    }
    if (elemsize / elempack == 2)
    {
        // fp16
        if (elempack == 1) format = VK_FORMAT_R16_SFLOAT;
        if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT;
    }
    if (elemsize / elempack == 1)
    {
        // int8
        if (elempack == 1) format = VK_FORMAT_R8_SINT;
        if (elempack == 4) format = VK_FORMAT_R8G8B8A8_SINT;
        if (elempack == 8) format = VK_FORMAT_R8G8B8A8_SINT;
    }

    if (format == VK_FORMAT_UNDEFINED)
    {
        // no image format matches this element size, do not hand an undefined
        // format to vkCreateImage
        NCNN_LOGE("unsupported elemsize %d for elempack %d", (int)elemsize, elempack);
        return 0;
    }

    // resolve image width height depth
    int width = w;
    int height = h;
    int depth = c;

    // large elempack spills on image w
    if (elempack == 8) width *= 2;

    if (width > (int)vkdev->info.max_image_dimension_3d() || height > (int)vkdev->info.max_image_dimension_3d() || depth > (int)vkdev->info.max_image_dimension_3d())
    {
        NCNN_LOGE("image dimension too large %d %d %d > %d", width, height, depth, (int)vkdev->info.max_image_dimension_3d());
        return 0;
    }

    VkImageMemory* ptr = new VkImageMemory;

    ptr->image = create_image(width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);
    if (!ptr->image)
    {
        // create_image has already reported the failure; a null handle must
        // not reach vkGetImageMemoryRequirements
        delete ptr;
        return 0;
    }

    ptr->width = width;
    ptr->height = height;
    ptr->depth = depth;
    ptr->format = format;

    // TODO respect VK_KHR_dedicated_allocation ?
    VkMemoryRequirements memoryRequirements;
    vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements);

    const size_t size = memoryRequirements.size;
    const size_t alignment = std::max((size_t)memoryRequirements.alignment, d->bind_memory_offset_alignment);

    size_t aligned_size = alignSize(size, alignment);

    const int image_memory_block_count = d->image_memory_blocks.size();

    // find first spare space in image_memory_blocks
    for (int i = 0; i < image_memory_block_count; i++)
    {
#if __APPLE__
        // HACK moltenvk v1.2.3 is unhappy for image binding with offset  :(
        break;
#endif

        std::list<std::pair<size_t, size_t> >::iterator it = d->image_memory_budgets[i].begin();
        while (it != d->image_memory_budgets[i].end())
        {
            // we cannot use it->first directly for base offset alignment
            size_t bind_base_offset = it->first;
            size_t bind_offset = alignSize(bind_base_offset, alignment);
            size_t budget_size = it->second;
            // the range must hold the image plus the padding up to bind_offset
            if (budget_size < aligned_size + (bind_offset - bind_base_offset))
            {
                it++;
                continue;
            }

            // bind at memory offset
            ptr->memory = d->image_memory_blocks[i];
            ptr->bind_offset = bind_offset;
            ptr->bind_capacity = aligned_size;

            vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

            // do not allow host access to optimal tiling image
            ptr->mapped_ptr = 0;
            ptr->memory_type_index = image_memory_type_index;

            ptr->imageview = create_imageview(ptr->image, format);

            ptr->access_flags = 0;
            ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
            ptr->command_refcount = 0;

            if (bind_base_offset != bind_offset)
            {
                // NOTE there is small offset inside bind_base_offset and bind_offset
                // adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory
                // so that memory management could be easier
                aligned_size += (bind_offset - bind_base_offset);

                ptr->bind_offset = bind_base_offset;
                ptr->bind_capacity = aligned_size;
            }

            // adjust image_memory_budgets
            if (budget_size == aligned_size)
            {
                // the free range is consumed entirely
                d->image_memory_budgets[i].erase(it);
            }
            else
            {
                // shrink the free range from its front
                it->first += aligned_size;
                it->second -= aligned_size;
            }

            //             NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);

            return ptr;
        }
    }

    // setup memory type and alignment
    if (image_memory_type_index == (uint32_t)-1)
    {
        if (vkdev->info.type() == 1)
        {
            // integrated gpu, prefer unified memory
            image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

            // on amd integrated gpu, there is a faster and larger device-only heap
            uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
            const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physicalDeviceMemoryProperties();
            uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
            uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
            if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
            {
                image_memory_type_index = device_local_memory_type_index;
            }
        }
        else
        {
            // discrete gpu, device local
            image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
        }

        // NOTE(review): this overwrites the mappable / coherent flags that the
        // buffer path also sets and reads - confirm both memory types agree
        mappable = vkdev->is_mappable(image_memory_type_index);
        coherent = vkdev->is_coherent(image_memory_type_index);
    }

    // create new block
    size_t new_block_size = std::max(d->block_size, aligned_size);

#if __APPLE__
    // HACK moltenvk v1.2.3 is unhappy for image binding with offset
    // always ignore block size for smaller memory footprint :(
    new_block_size = aligned_size;
#endif

    // bind at memory offset
    ptr->memory = allocate_memory(new_block_size, image_memory_type_index);
    if (!ptr->memory)
    {
        vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);
        delete ptr;
        return 0;
    }
    ptr->bind_offset = 0;
    ptr->bind_capacity = aligned_size;

    // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
    vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

    // do not allow host access to optimal tiling image
    ptr->mapped_ptr = 0;
    ptr->memory_type_index = image_memory_type_index;

    ptr->imageview = create_imageview(ptr->image, format);

    ptr->access_flags = 0;
    ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
    ptr->command_refcount = 0;

    // adjust image_memory_budgets
    d->image_memory_blocks.push_back(ptr->memory);

    // the remainder of the new block (if any) becomes its only free range
    std::list<std::pair<size_t, size_t> > budget;
    if (new_block_size > aligned_size)
    {
        budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size));
    }
    d->image_memory_budgets.push_back(budget);

    //     NCNN_LOGE("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);

    return ptr;
}

// Return the memory range occupied by an image to its block's free list and
// destroy the image objects.
//
// The freed range [bind_offset, bind_offset + bind_capacity) is merged with a
// free range ending exactly where it starts and/or one starting exactly where
// it ends; otherwise it is recorded as a new free range.
// The image view, the image and the descriptor itself are destroyed only
// when ptr->command_refcount is zero.
void VkBlobAllocator::fastFree(VkImageMemory* ptr)
{
    //     NCNN_LOGE("VkBlobAllocator F %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity);

    typedef std::list<std::pair<size_t, size_t> > BudgetList;

    // look up the memory block this image was bound to
    const int block_count = d->image_memory_blocks.size();
    int owner = 0;
    while (owner < block_count && d->image_memory_blocks[owner] != ptr->memory)
    {
        owner++;
    }

    if (owner == block_count)
    {
        // the memory does not belong to this allocator, nothing to give back
        NCNN_LOGE("FATAL ERROR! unlocked VkBlobAllocator get wild %p", ptr->memory);
    }
    else
    {
        BudgetList& budget = d->image_memory_budgets[owner];

        const size_t range_begin = ptr->bind_offset;
        const size_t range_end = ptr->bind_offset + ptr->bind_capacity;

        // search for free ranges touching the released one on either side
        BudgetList::iterator left = budget.end();
        BudgetList::iterator right = budget.end();
        for (BudgetList::iterator it = budget.begin(); it != budget.end(); ++it)
        {
            if (it->first + it->second == range_begin)
            {
                left = it;
            }
            else if (range_end == it->first)
            {
                right = it;
            }
        }

        const bool has_left = left != budget.end();
        const bool has_right = right != budget.end();

        if (has_left && has_right)
        {
            // bridge both neighbours into one range that starts at the left one
            left->second = right->first + right->second - left->first;
            budget.erase(right);
        }
        else if (has_left)
        {
            // extend the left neighbour up to the end of the released range
            left->second = range_end - left->first;
        }
        else if (has_right)
        {
            // pull the start of the right neighbour back to the released offset
            right->second = right->first + right->second - range_begin;
            right->first = range_begin;
        }
        else if (range_begin == 0)
        {
            // chain leading block
            budget.push_front(std::make_pair(ptr->bind_offset, ptr->bind_capacity));
        }
        else
        {
            budget.push_back(std::make_pair(ptr->bind_offset, ptr->bind_capacity));
        }
    }

    // NOTE(review): when command_refcount is non-zero the objects are left
    // alive here; presumably a pending command releases them later - confirm
    if (!ptr->command_refcount)
    {
        vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
        vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);

        delete ptr;
    }
}

// Private state of VkWeightAllocator, reached through the allocator's d pointer.
class VkWeightAllocatorPrivate
{
public:
    // size of a regular shared block, rounded up to buffer_offset_alignment by the constructor
    size_t block_size;
    // every buffer request size is rounded up to a multiple of this value
    size_t buffer_offset_alignment;
    // lower bound for the image bind offset alignment, read from the device's buffer_image_granularity
    size_t bind_memory_offset_alignment;
    // free bytes left at the tail of each shared buffer block, same indexing as buffer_blocks
    std::vector<size_t> buffer_block_free_spaces;
    // shared buffer blocks that sub-allocations are carved from
    std::vector<VkBufferMemory*> buffer_blocks;
    // buffer blocks backed by a dedicated allocation, one per request
    std::vector<VkBufferMemory*> dedicated_buffer_blocks;
    // free bytes left at the tail of each shared image memory block, same indexing as image_memory_blocks
    std::vector<size_t> image_memory_block_free_spaces;
    // shared device memory blocks that images are bound into
    std::vector<VkDeviceMemory> image_memory_blocks;
    // device memory objects backed by a dedicated allocation, one per image
    std::vector<VkDeviceMemory> dedicated_image_memory_blocks;

    // try host memory first; switched off permanently after a host allocation failure
    bool prefer_host_memory;
#if !defined(_WIN32)
    // host allocations imported into Vulkan memory, released in VkWeightAllocator::clear()
    std::vector<void*> host_ptrs;
#endif
};

// Set up the weight allocator for a device.
//
// _prefer_host_memory   try host memory before device memory for new blocks
// preferred_block_size  requested size of a shared block, rounded up to the
//                       resolved buffer offset alignment
VkWeightAllocator::VkWeightAllocator(const VulkanDevice* _vkdev, bool _prefer_host_memory, size_t preferred_block_size)
    : VkAllocator(_vkdev), d(new VkWeightAllocatorPrivate)
{
    // start from the plain buffer offset alignment and widen it with every
    // additional constraint that applies, the final value is stored once below
    size_t offset_alignment = vkdev->info.buffer_offset_alignment();

    d->bind_memory_offset_alignment = vkdev->info.buffer_image_granularity();

    const bool integrated_gpu = vkdev->info.type() == 1;
    if (integrated_gpu)
    {
        // on integrated gpu, there may be device local only memory too, eg. AMD APU
        // assuming larger alignment always keeps us safe :)

        // fold memory_map_alignment and non_coherent_atom_size into the alignment
        offset_alignment = least_common_multiple(offset_alignment, vkdev->info.memory_map_alignment());
        offset_alignment = least_common_multiple(offset_alignment, vkdev->info.non_coherent_atom_size());
    }

    const bool has_robustness2 = vkdev->info.support_VK_KHR_robustness2() || vkdev->info.support_VK_EXT_robustness2();
    if (has_robustness2)
    {
        const size_t robust_access_alignment = vkdev->info.queryRobustness2Properties().robustStorageBufferAccessSizeAlignment;
        offset_alignment = least_common_multiple(offset_alignment, robust_access_alignment);
    }

    // imported host pointers come with their own minimum alignment
    if (_prefer_host_memory && vkdev->info.support_VK_EXT_external_memory_host())
    {
        const size_t host_pointer_alignment = vkdev->info.queryExternalMemoryHostProperties().minImportedHostPointerAlignment;
        offset_alignment = least_common_multiple(offset_alignment, host_pointer_alignment);
    }

    d->buffer_offset_alignment = offset_alignment;
    d->block_size = alignSize(preferred_block_size, d->buffer_offset_alignment);
    d->prefer_host_memory = _prefer_host_memory;
}

// Release every block owned by the allocator, then the private state itself.
VkWeightAllocator::~VkWeightAllocator()
{
    // clear() reads the private state, so it has to run before d is deleted
    clear();

    delete d;
}

// Copy construction copies nothing: the base is built with a null device and
// d is left null.
// NOTE(review): presumably only defined to make the class non-copyable -
// confirm against the declaration. An object built this way must not be
// destroyed, the destructor would dereference the null d inside clear().
VkWeightAllocator::VkWeightAllocator(const VkWeightAllocator&)
    : VkAllocator(0), d(0)
{
}

// Assignment is a no-op, the right-hand side is ignored and *this is left untouched.
// NOTE(review): presumably only defined to make the class non-copyable - confirm against the declaration.
VkWeightAllocator& VkWeightAllocator::operator=(const VkWeightAllocator&)
{
    return *this;
}

// Release everything the allocator owns: shared and dedicated buffer blocks,
// shared and dedicated image memory blocks and, where host memory import is
// compiled in, the imported host allocations.
//
// Descriptors previously returned by fastMalloc() are not tracked here; they
// refer to released Vulkan objects once this function returns.
void VkWeightAllocator::clear()
{
    //     NCNN_LOGE("VkWeightAllocator %lu %lu", d->buffer_blocks.size(), d->dedicated_buffer_blocks.size());

    d->buffer_block_free_spaces.clear();

    for (size_t i = 0; i < d->buffer_blocks.size(); i++)
    {
        VkBufferMemory* ptr = d->buffer_blocks[i];

        // decide per block: the allocator-wide mappable flag is rewritten
        // whenever a memory type is (re)selected, by the image path as well,
        // so it may not describe this block. mapped_ptr is zero unless the
        // block was mapped when it was created, and vkUnmapMemory must only
        // be called on memory that is currently mapped.
        if (ptr->mapped_ptr)
            vkUnmapMemory(vkdev->vkdevice(), ptr->memory);

        vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
        vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);

        delete ptr;
    }
    d->buffer_blocks.clear();

    for (size_t i = 0; i < d->dedicated_buffer_blocks.size(); i++)
    {
        VkBufferMemory* ptr = d->dedicated_buffer_blocks[i];

        // same per-block check as for the shared blocks above
        if (ptr->mapped_ptr)
            vkUnmapMemory(vkdev->vkdevice(), ptr->memory);

        vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
        vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);

        delete ptr;
    }
    d->dedicated_buffer_blocks.clear();

    d->image_memory_block_free_spaces.clear();

    // image memory is never mapped by this allocator, freeing is enough
    for (size_t i = 0; i < d->image_memory_blocks.size(); i++)
    {
        VkDeviceMemory memory = d->image_memory_blocks[i];

        vkFreeMemory(vkdev->vkdevice(), memory, 0);
    }
    d->image_memory_blocks.clear();

    for (size_t i = 0; i < d->dedicated_image_memory_blocks.size(); i++)
    {
        VkDeviceMemory memory = d->dedicated_image_memory_blocks[i];

        vkFreeMemory(vkdev->vkdevice(), memory, 0);
    }
    d->dedicated_image_memory_blocks.clear();

#if !defined(_WIN32)
    // the imported host allocations are released last, after the Vulkan
    // memory objects that import them have been freed
    for (size_t i = 0; i < d->host_ptrs.size(); i++)
    {
        void* host_ptr = d->host_ptrs[i];

        ncnn::fastFree(host_ptr);
    }
    d->host_ptrs.clear();
#endif
}

// fastMalloc() with alignment parameter and no malloc overread
// Returns a block of at least size bytes aligned to alignment, or 0 when the
// underlying allocation fails.
static void* fastMalloc_with_alignment(size_t size, size_t alignment)
{
#if _MSC_VER
    return _aligned_malloc(size, alignment);
#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
    void* aligned = 0;
    const int err = posix_memalign(&aligned, alignment, size);
    // any non-zero status is reported to the caller as a null pointer
    return err == 0 ? aligned : 0;
#elif __ANDROID__ && __ANDROID_API__ < 17
    return memalign(alignment, size);
#else
    // over-allocate so that there is room for the alignment padding and for
    // one pointer slot in front of the aligned address
    unsigned char* raw = (unsigned char*)malloc(size + sizeof(void*) + alignment);
    if (raw == 0)
    {
        return 0;
    }

    unsigned char** slot = alignPtr((unsigned char**)raw + 1, alignment);
    // keep the address returned by malloc right before the aligned block
    slot[-1] = raw;
    return slot;
#endif
}

// Allocate a weight buffer of at least size bytes.
//
// The request is rounded up to d->buffer_offset_alignment and then served by
// the first of the following that applies
//   1. the tail of the first shared block with enough free space
//   2. a new dedicated allocation, when host memory is not preferred, the
//      required extensions are available and the driver requires or prefers one
//   3. a new shared block of max(d->block_size, aligned size) bytes, taken
//      from host memory when d->prefer_host_memory is set and from device
//      memory otherwise
// A failed host memory allocation switches the allocator to device memory
// for this and all later requests and resets both cached memory type indices.
//
// Returns a newly allocated descriptor that refers to the block's buffer and
// memory, or 0 when the backing memory could not be obtained.
VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size)
{
    //     NCNN_LOGE("VkWeightAllocator fastMalloc %lu", size);

    size_t aligned_size = alignSize(size, d->buffer_offset_alignment);

    const int buffer_block_count = d->buffer_blocks.size();

    // find first spare space in buffer_blocks
    for (int i = 0; i < buffer_block_count; i++)
    {
        size_t free_size = d->buffer_block_free_spaces[i];
        if (free_size >= aligned_size)
        {
            // blocks are filled front to back, the used part is everything before the free tail
            // a block larger than d->block_size is only created for a request that fills it
            // completely, so its recorded free space is zero
            size_t block_offset = d->block_size - free_size;

            // return sub buffer
            VkBufferMemory* ptr = new VkBufferMemory;

            // the descriptor shares the buffer and memory handles of the block
            ptr->buffer = d->buffer_blocks[i]->buffer;
            ptr->offset = block_offset;
            ptr->memory = d->buffer_blocks[i]->memory;
            ptr->capacity = aligned_size;
            ptr->mapped_ptr = d->buffer_blocks[i]->mapped_ptr;
            ptr->memory_type_index = d->buffer_blocks[i]->memory_type_index;
            ptr->access_flags = 0;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

            d->buffer_block_free_spaces[i] -= aligned_size;

            return ptr;
        }
    }

    // an oversized request gets a block of exactly its own aligned size
    size_t new_block_size = std::max(d->block_size, aligned_size);

    // create new block
    VkBufferMemory* block = new VkBufferMemory;

    block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    block->offset = 0;

    // dedicated allocation is only considered when host memory is not preferred
    if (vkdev->info.support_VK_KHR_get_memory_requirements2() && vkdev->info.support_VK_KHR_dedicated_allocation() && !d->prefer_host_memory)
    {
        VkBufferMemoryRequirementsInfo2KHR bufferMemoryRequirementsInfo2;
        bufferMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR;
        bufferMemoryRequirementsInfo2.pNext = 0;
        bufferMemoryRequirementsInfo2.buffer = block->buffer;

        VkMemoryRequirements2KHR memoryRequirements2;
        memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR;
        memoryRequirements2.pNext = 0;

        // chained into memoryRequirements2 so that the query below fills it as well
        VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements;
        memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR;
        memoryDedicatedRequirements.pNext = 0;
        memoryRequirements2.pNext = &memoryDedicatedRequirements;

        vkdev->vkGetBufferMemoryRequirements2KHR(vkdev->vkdevice(), &bufferMemoryRequirementsInfo2, &memoryRequirements2);

        bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation;

        if (dedicatedAllocation)
        {
            // setup memory type and alignment
            // the memory type is resolved once and reused until it is reset to (uint32_t)-1
            if (buffer_memory_type_index == (uint32_t)-1)
            {
                if (vkdev->info.type() == 1)
                {
                    // integrated gpu, prefer unified memory
                    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

                    // on amd integrated gpu, there is a faster and larger device-only heap
                    uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                    const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physicalDeviceMemoryProperties();
                    uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
                    uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
                    // switch only when the device-only heap has a lower index and is strictly larger
                    if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
                    {
                        buffer_memory_type_index = device_local_memory_type_index;
                    }
                }
                else
                {
                    // discrete gpu, device local
                    if (vkdev->info.resizable_bar_enabled())
                    {
                        buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
                    }
                    else
                    {
                        buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                    }
                }

                // NOTE(review): mappable and coherent are allocator-wide and are also
                // written when the image memory type is resolved - confirm that the two
                // memory types always agree on these properties
                mappable = vkdev->is_mappable(buffer_memory_type_index);
                coherent = vkdev->is_coherent(buffer_memory_type_index);
            }

            block->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, buffer_memory_type_index, 0, block->buffer);
            if (!block->memory)
            {
                vkDestroyBuffer(vkdev->vkdevice(), block->buffer, 0);
                delete block;
                return 0;
            }

            // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
            vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);

            block->mapped_ptr = 0;
            if (mappable)
            {
                vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);
            }

            block->memory_type_index = buffer_memory_type_index;

            // dedicated blocks are tracked separately and carry no free space entry,
            // so they are never sub-allocated by later requests
            d->dedicated_buffer_blocks.push_back(block);

            // return sub buffer
            VkBufferMemory* ptr = new VkBufferMemory;

            ptr->buffer = block->buffer;
            ptr->offset = 0;
            ptr->memory = block->memory;
            // the descriptor spans the whole dedicated block
            ptr->capacity = new_block_size;
            ptr->mapped_ptr = block->mapped_ptr;
            ptr->memory_type_index = block->memory_type_index;
            ptr->access_flags = 0;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

            return ptr;
        }
    }

    VkMemoryRequirements memoryRequirements;
    vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements);

    // host memory attempt, every failure in here clears d->prefer_host_memory
    // so that the device memory branch further down takes over
    if (d->prefer_host_memory)
    {
#if !defined(_WIN32)
        if (vkdev->info.support_VK_EXT_external_memory_host())
        {
            // allocate on the host and import the pointer as the block's memory
            void* host_ptr = fastMalloc_with_alignment(memoryRequirements.size, d->buffer_offset_alignment);

            if (host_ptr)
            {
                VkMemoryHostPointerPropertiesEXT pointerProperties;
                pointerProperties.sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT;
                pointerProperties.pNext = 0;
                VkResult ret = vkdev->vkGetMemoryHostPointerPropertiesEXT(vkdev->vkdevice(), VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT, host_ptr, &pointerProperties);
                if (ret != VK_SUCCESS)
                {
                    // unlike an out of memory condition, this failure aborts the request
                    // without falling back to device memory
                    NCNN_LOGE("vkGetMemoryHostPointerPropertiesEXT failed %d", ret);
                    ncnn::fastFree(host_ptr);
                    vkDestroyBuffer(vkdev->vkdevice(), block->buffer, 0);
                    delete block;
                    return 0;
                }

                // setup memory type and alignment
                if (buffer_memory_type_index == (uint32_t)-1)
                {
                    // the candidate types come from the host pointer properties, not from the buffer requirements
                    buffer_memory_type_index = vkdev->find_memory_index(pointerProperties.memoryTypeBits, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

                    mappable = vkdev->is_mappable(buffer_memory_type_index);
                    coherent = vkdev->is_coherent(buffer_memory_type_index);
                }

                block->memory = allocate_import_host_memory(memoryRequirements.size, buffer_memory_type_index, host_ptr);
                if (!block->memory)
                {
                    // oom
                    ncnn::fastFree(host_ptr);
                    d->prefer_host_memory = false;
                }
                else
                {
                    // keep the host allocation alive until clear() releases it
                    d->host_ptrs.push_back(host_ptr);
                }
            }
            else
            {
                // oom
                d->prefer_host_memory = false;
            }
        }
        else
#endif // !defined(_WIN32)
        {
            // no host pointer import, ask Vulkan for a host visible memory type instead
            // setup memory type and alignment
            if (buffer_memory_type_index == (uint32_t)-1)
            {
                buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

                mappable = vkdev->is_mappable(buffer_memory_type_index);
                coherent = vkdev->is_coherent(buffer_memory_type_index);
            }

            block->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);
            if (!block->memory)
            {
                // oom
                d->prefer_host_memory = false;
            }
        }

        if (!d->prefer_host_memory)
        {
            // forget the host memory types so that device memory types get resolved from scratch
            NCNN_LOGE("weight allocator fallback to device memory");
            buffer_memory_type_index = (uint32_t)-1;
            image_memory_type_index = (uint32_t)-1;
        }
    }
    // not an else branch on purpose, the host path above may have just cleared the flag
    if (!d->prefer_host_memory)
    {
        // setup memory type and alignment
        if (buffer_memory_type_index == (uint32_t)-1)
        {
            if (vkdev->info.type() == 1)
            {
                // integrated gpu, prefer unified memory
                buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

                // on amd integrated gpu, there is a faster and larger device-only heap
                uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physicalDeviceMemoryProperties();
                uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;
                uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
                // switch only when the device-only heap has a lower index and is strictly larger
                if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
                {
                    buffer_memory_type_index = device_local_memory_type_index;
                }
            }
            else
            {
                // discrete gpu, device local
                if (vkdev->info.resizable_bar_enabled())
                {
                    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
                }
                else
                {
                    buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                }
            }

            mappable = vkdev->is_mappable(buffer_memory_type_index);
            coherent = vkdev->is_coherent(buffer_memory_type_index);
        }

        block->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);
    }
    // covers a failed device memory allocation, with or without a preceding host attempt
    if (!block->memory)
    {
        vkDestroyBuffer(vkdev->vkdevice(), block->buffer, 0);
        delete block;
        return 0;
    }

    // ignore memoryRequirements.alignment as we always bind at zero offset
    vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0);

    //     NCNN_LOGE("VkWeightAllocator M %p", block->buffer);

    block->mapped_ptr = 0;
    if (mappable)
    {
        vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr);
    }

    block->memory_type_index = buffer_memory_type_index;

    d->buffer_blocks.push_back(block);

    // the request occupies the head of the block, the rest stays available for later requests
    d->buffer_block_free_spaces.push_back(new_block_size - aligned_size);

    // return sub buffer
    VkBufferMemory* ptr = new VkBufferMemory;

    ptr->buffer = block->buffer;
    ptr->offset = 0;
    ptr->memory = block->memory;
    ptr->capacity = aligned_size;
    ptr->mapped_ptr = block->mapped_ptr;
    ptr->memory_type_index = block->memory_type_index;
    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    return ptr;
}

// Dispose of a buffer descriptor returned by fastMalloc(size_t).
// Only the descriptor object is deleted here. The block it refers to and the
// block's free space counter are left untouched, the blocks themselves are
// released in clear().
void VkWeightAllocator::fastFree(VkBufferMemory* ptr)
{
    //     NCNN_LOGE("VkWeightAllocator F %p", ptr->buffer);

    delete ptr;
}

VkImageMemory* VkWeightAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int elempack)
{
    if (elempack != 1 && elempack != 4 && elempack != 8 && elempack != 16 && elempack != 32 && elempack != 64)
    {
        NCNN_LOGE("elempack must be 1 4 8 16 32 64");
        return 0;
    }

    // resolve format
    VkFormat format = VK_FORMAT_UNDEFINED;

    if (elemsize / elempack == 4)
    {
        // fp32
        if (elempack == 1) format = VK_FORMAT_R32_SFLOAT;
        if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 16) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 32) format = VK_FORMAT_R32G32B32A32_SFLOAT;
        if (elempack == 64) format = VK_FORMAT_R32G32B32A32_SFLOAT;
    }
    if (elemsize / elempack == 2)
    {
        // fp16
        if (elempack == 1) format = VK_FORMAT_R16_SFLOAT;
        if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 16) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 32) format = VK_FORMAT_R16G16B16A16_SFLOAT;
        if (elempack == 64) format = VK_FORMAT_R16G16B16A16_SFLOAT;
    }
    if (elemsize / elempack == 1)
    {
        // int8
        if (elempack == 1) format = VK_FORMAT_R8_SINT;
        if (elempack == 4) format = VK_FORMAT_R8G8B8A8_SINT;
        if (elempack == 8) format = VK_FORMAT_R8G8B8A8_SINT;
        if (elempack == 16) format = VK_FORMAT_R8G8B8A8_SINT;
        if (elempack == 32) format = VK_FORMAT_R8G8B8A8_SINT;
        if (elempack == 64) format = VK_FORMAT_R8G8B8A8_SINT;
    }

    // resolve image width height depth
    int width = w;
    int height = h;
    int depth = c;

    // large elempack spills on image w
    if (elempack == 8) width *= 2;
    if (elempack == 16) width *= 4;
    if (elempack == 32) width *= 8;
    if (elempack == 64) width *= 16;

    if (width > (int)vkdev->info.max_image_dimension_3d() || height > (int)vkdev->info.max_image_dimension_3d() || depth > (int)vkdev->info.max_image_dimension_3d())
    {
        NCNN_LOGE("image dimension too large %d %d %d > %d", width, height, depth, (int)vkdev->info.max_image_dimension_3d());
        return 0;
    }

    VkImageMemory* ptr = new VkImageMemory;

    ptr->image = create_image(width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);

    ptr->width = width;
    ptr->height = height;
    ptr->depth = depth;
    ptr->format = format;

    if (vkdev->info.support_VK_KHR_get_memory_requirements2() && vkdev->info.support_VK_KHR_dedicated_allocation() && !d->prefer_host_memory)
    {
        VkImageMemoryRequirementsInfo2KHR imageMemoryRequirementsInfo2;
        imageMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR;
        imageMemoryRequirementsInfo2.pNext = 0;
        imageMemoryRequirementsInfo2.image = ptr->image;

        VkMemoryRequirements2KHR memoryRequirements2;
        memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR;
        memoryRequirements2.pNext = 0;

        VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements;
        memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR;
        memoryDedicatedRequirements.pNext = 0;
        memoryRequirements2.pNext = &memoryDedicatedRequirements;

        vkdev->vkGetImageMemoryRequirements2KHR(vkdev->vkdevice(), &imageMemoryRequirementsInfo2, &memoryRequirements2);

        bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation;

        if (dedicatedAllocation)
        {
            // setup memory type and alignment
            if (image_memory_type_index == (uint32_t)-1)
            {
                if (vkdev->info.type() == 1)
                {
                    // integrated gpu, prefer unified memory
                    image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

                    // on amd integrated gpu, there is a faster and larger device-only heap
                    uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                    const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physicalDeviceMemoryProperties();
                    uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
                    uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
                    if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
                    {
                        image_memory_type_index = device_local_memory_type_index;
                    }
                }
                else
                {
                    // discrete gpu, device local
                    if (vkdev->info.resizable_bar_enabled())
                    {
                        image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
                    }
                    else
                    {
                        image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                    }
                }

                mappable = vkdev->is_mappable(image_memory_type_index);
                coherent = vkdev->is_coherent(image_memory_type_index);
            }

            // bind memory
            ptr->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, image_memory_type_index, ptr->image, 0);
            if (!ptr->memory)
            {
                vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);
                delete ptr;
                return 0;
            }
            ptr->bind_offset = 0;
            ptr->bind_capacity = memoryRequirements2.memoryRequirements.size;

            // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
            vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

            // do not allow host access to optimal tiling image
            ptr->mapped_ptr = 0;
            ptr->memory_type_index = image_memory_type_index;

            ptr->imageview = create_imageview(ptr->image, format);

            ptr->access_flags = 0;
            ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
            ptr->command_refcount = 0;

            d->dedicated_image_memory_blocks.push_back(ptr->memory);

            return ptr;
        }
    }

    VkMemoryRequirements memoryRequirements;
    vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements);

    const size_t size = memoryRequirements.size;
    const size_t alignment = std::max((size_t)memoryRequirements.alignment, d->bind_memory_offset_alignment);

    size_t aligned_size = alignSize(size, alignment);

    const int image_memory_block_count = d->image_memory_blocks.size();

    // find first spare space in image_memory_blocks
    for (int i = 0; i < image_memory_block_count; i++)
    {
        // we cannot use image_memory_block_free_spaces[i] directly for base offset alignment
        size_t bind_base_offset = d->block_size - d->image_memory_block_free_spaces[i];
        size_t bind_offset = alignSize(bind_base_offset, alignment);
        if (d->image_memory_block_free_spaces[i] >= aligned_size + (bind_offset - bind_base_offset))
        {
            // bind at memory offset
            ptr->memory = d->image_memory_blocks[i];
            ptr->bind_offset = bind_offset;
            ptr->bind_capacity = aligned_size;

            vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

            // do not allow host access to optimal tiling image
            ptr->mapped_ptr = 0;
            ptr->memory_type_index = image_memory_type_index;

            ptr->imageview = create_imageview(ptr->image, format);

            ptr->access_flags = 0;
            ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
            ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
            ptr->command_refcount = 0;

            if (bind_base_offset != bind_offset)
            {
                // NOTE there is small offset inside bind_base_offset and bind_offset
                // adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory
                // so that memory management could be easier
                aligned_size += (bind_offset - bind_base_offset);

                ptr->bind_offset = bind_base_offset;
                ptr->bind_capacity = aligned_size;
            }

            d->image_memory_block_free_spaces[i] -= aligned_size;

            return ptr;
        }
    }

    // create new block
    size_t new_block_size = std::max(d->block_size, aligned_size);

    if (d->prefer_host_memory)
    {
#if !defined(_WIN32)
        if (vkdev->info.support_VK_EXT_external_memory_host())
        {
            void* host_ptr = fastMalloc_with_alignment(new_block_size, d->buffer_offset_alignment);

            if (host_ptr)
            {
                VkMemoryHostPointerPropertiesEXT pointerProperties;
                pointerProperties.sType = VK_STRUCTURE_TYPE_MEMORY_HOST_POINTER_PROPERTIES_EXT;
                pointerProperties.pNext = 0;
                VkResult ret = vkdev->vkGetMemoryHostPointerPropertiesEXT(vkdev->vkdevice(), VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT, host_ptr, &pointerProperties);
                if (ret != VK_SUCCESS)
                {
                    NCNN_LOGE("vkGetMemoryHostPointerPropertiesEXT failed %d", ret);
                    ncnn::fastFree(host_ptr);
                    vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);
                    delete ptr;
                    return 0;
                }

                // setup memory type and alignment
                if (image_memory_type_index == (uint32_t)-1)
                {
                    image_memory_type_index = vkdev->find_memory_index(pointerProperties.memoryTypeBits, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

                    mappable = vkdev->is_mappable(image_memory_type_index);
                    coherent = vkdev->is_coherent(image_memory_type_index);
                }

                ptr->memory = allocate_import_host_memory(new_block_size, image_memory_type_index, host_ptr);
                if (!ptr->memory)
                {
                    // oom
                    ncnn::fastFree(host_ptr);
                    d->prefer_host_memory = false;
                }
                else
                {
                    d->host_ptrs.push_back(host_ptr);
                }
            }
            else
            {
                // oom
                d->prefer_host_memory = false;
            }
        }
        else
#endif // !defined(_WIN32)
        {
            // setup memory type and alignment
            if (image_memory_type_index == (uint32_t)-1)
            {
                image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

                mappable = vkdev->is_mappable(image_memory_type_index);
                coherent = vkdev->is_coherent(image_memory_type_index);
            }

            // bind at memory offset
            ptr->memory = allocate_memory(new_block_size, image_memory_type_index);
            if (!ptr->memory)
            {
                // oom
                d->prefer_host_memory = false;
            }
        }

        if (!d->prefer_host_memory)
        {
            NCNN_LOGE("weight allocator fallback to device memory");
            buffer_memory_type_index = (uint32_t)-1;
            image_memory_type_index = (uint32_t)-1;
        }
    }
    if (!d->prefer_host_memory)
    {
        // setup memory type and alignment
        if (image_memory_type_index == (uint32_t)-1)
        {
            if (vkdev->info.type() == 1)
            {
                // integrated gpu, prefer unified memory
                image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);

                // on amd integrated gpu, there is a faster and larger device-only heap
                uint32_t device_local_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                const VkPhysicalDeviceMemoryProperties& memory_properties = vkdev->info.physicalDeviceMemoryProperties();
                uint32_t buffer_heap_index = memory_properties.memoryTypes[image_memory_type_index].heapIndex;
                uint32_t device_local_heap_index = memory_properties.memoryTypes[device_local_memory_type_index].heapIndex;
                if (device_local_heap_index < buffer_heap_index && memory_properties.memoryHeaps[device_local_heap_index].size > memory_properties.memoryHeaps[buffer_heap_index].size)
                {
                    image_memory_type_index = device_local_memory_type_index;
                }
            }
            else
            {
                // discrete gpu, device local
                if (vkdev->info.resizable_bar_enabled())
                {
                    image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
                }
                else
                {
                    image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
                }
            }

            mappable = vkdev->is_mappable(image_memory_type_index);
            coherent = vkdev->is_coherent(image_memory_type_index);
        }

        // bind at memory offset
        ptr->memory = allocate_memory(new_block_size, image_memory_type_index);
    }
    if (!ptr->memory)
    {
        vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);
        delete ptr;
        return 0;
    }
    ptr->bind_offset = 0;
    ptr->bind_capacity = aligned_size;

    // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset
    vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset);

    // do not allow host access to optimal tiling image
    ptr->mapped_ptr = 0;
    ptr->memory_type_index = image_memory_type_index;

    ptr->imageview = create_imageview(ptr->image, format);

    ptr->access_flags = 0;
    ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
    ptr->command_refcount = 0;

    d->image_memory_blocks.push_back(ptr->memory);
    d->image_memory_block_free_spaces.push_back(new_block_size - aligned_size);

    return ptr;
}

// Release an image record handed out by this allocator.
// The image view, the image and the record itself are destroyed only when
// command_refcount is zero; otherwise the call does nothing.
// ptr->memory is not freed in this function.
void VkWeightAllocator::fastFree(VkImageMemory* ptr)
{
    //     NCNN_LOGE("VkWeightAllocator F %p", ptr->memory);

    // a non-zero command_refcount leaves the record untouched
    if (ptr->command_refcount)
        return;

    VkDevice device = vkdev->vkdevice();

    vkDestroyImageView(device, ptr->imageview, 0);
    vkDestroyImage(device, ptr->image, 0);

    delete ptr;
}

// Private state of VkStagingAllocator, kept behind the d pointer.
class VkStagingAllocatorPrivate
{
public:
    // reuse threshold as a fixed-point fraction, 0~256 where 256 means 100%
    // a cached buffer is reused when capacity >= size and
    // (capacity * size_compare_ratio) >> 8 <= size
    unsigned int size_compare_ratio; // 0~256
    // buffers returned through fastFree, kept mapped and ready for reuse
    std::list<VkBufferMemory*> buffer_budgets;
};

// Create a staging allocator bound to _vkdev.
// The allocator is flagged mappable and coherent, and the reuse ratio
// starts at 192/256 (0.75).
VkStagingAllocator::VkStagingAllocator(const VulkanDevice* _vkdev)
    : VkAllocator(_vkdev), d(new VkStagingAllocatorPrivate)
{
    mappable = true;
    coherent = true;

    d->size_compare_ratio = 192; // 0.75f * 256
}

// Release every cached buffer, then the private state.
VkStagingAllocator::~VkStagingAllocator()
{
    // clear() walks d->buffer_budgets, so it must run before d is deleted
    clear();

    delete d;
}

// Copy constructor, declared private in the class declaration.
// Nothing is copied from the source: vkdev and d are both initialized to 0.
VkStagingAllocator::VkStagingAllocator(const VkStagingAllocator&)
    : VkAllocator(0), d(0)
{
}

// Assignment operator, declared private in the class declaration.
// It copies nothing and returns *this unchanged.
VkStagingAllocator& VkStagingAllocator::operator=(const VkStagingAllocator&)
{
    return *this;
}

// Set the reuse threshold used by fastMalloc(size_t).
// scr  ratio in the range 0 ~ 1, stored internally as scr * 256
// Values outside the range, including NaN, are logged and ignored, leaving
// the previous ratio in place.
void VkStagingAllocator::set_size_compare_ratio(float scr)
{
    // written as a negated in-range test so that NaN, which fails every
    // ordered comparison, is rejected instead of reaching the integer cast
    if (!(scr >= 0.f && scr <= 1.f))
    {
        NCNN_LOGE("invalid size compare ratio %f", scr);
        return;
    }

    d->size_compare_ratio = (unsigned int)(scr * 256);
}

void VkStagingAllocator::clear()
{
    //     NCNN_LOGE("VkStagingAllocator %lu", buffer_budgets.size());

    for (std::list<VkBufferMemory*>::iterator it = d->buffer_budgets.begin(); it != d->buffer_budgets.end(); it++)
    {
        VkBufferMemory* ptr = *it;

        //         NCNN_LOGE("VkStagingAllocator F %p", ptr->buffer);

        vkUnmapMemory(vkdev->vkdevice(), ptr->memory);
        vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
        vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);

        delete ptr;
    }
    d->buffer_budgets.clear();
}

// Allocate a host-mapped staging buffer of at least size bytes.
// A cached buffer from buffer_budgets is reused when its capacity is large
// enough and not too much larger than size (see size_compare_ratio).
// Otherwise a new buffer is created, bound at offset zero and mapped.
// Returns 0 when memory allocation, binding or mapping fails; everything
// created up to that point is released first.
VkBufferMemory* VkStagingAllocator::fastMalloc(size_t size)
{
    // find free budget
    std::list<VkBufferMemory*>::iterator it = d->buffer_budgets.begin();
    for (; it != d->buffer_budgets.end(); ++it)
    {
        VkBufferMemory* ptr = *it;

        size_t capacity = ptr->capacity;

        // size_compare_ratio ~ 100%
        if (capacity >= size && ((capacity * d->size_compare_ratio) >> 8) <= size)
        {
            d->buffer_budgets.erase(it);

            //             NCNN_LOGE("VkStagingAllocator M %p %lu reused %lu", ptr->buffer, size, capacity);

            return ptr;
        }
    }

    VkBufferMemory* ptr = new VkBufferMemory;

    ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    ptr->offset = 0;

    VkMemoryRequirements memoryRequirements;
    vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements);

    // setup memory type, chosen once and cached on the allocator
    if (buffer_memory_type_index == (uint32_t)-1)
    {
        buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
    }

    ptr->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);
    if (!ptr->memory)
    {
        vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
        delete ptr;
        return 0;
    }

    // ignore memoryRequirements.alignment as we always bind at zero offset
    VkResult ret = vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkBindBufferMemory failed %d", ret);
        vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
        vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
        delete ptr;
        return 0;
    }

    ptr->capacity = size;

    // a failed map must not leave an indeterminate host pointer behind
    ptr->mapped_ptr = 0;
    ret = vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkMapMemory failed %d", ret);
        vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
        vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
        delete ptr;
        return 0;
    }

    ptr->memory_type_index = buffer_memory_type_index;

    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    //     NCNN_LOGE("VkStagingAllocator M %p %lu", ptr->buffer, size);

    return ptr;
}

// Hand a staging buffer back to the allocator.
// Nothing is destroyed here: the buffer stays mapped and is appended to
// buffer_budgets so a later fastMalloc can reuse it; clear() releases it.
void VkStagingAllocator::fastFree(VkBufferMemory* ptr)
{
    //     NCNN_LOGE("VkStagingAllocator F %p", ptr->buffer);

    // return to buffer_budgets
    d->buffer_budgets.push_back(ptr);
}

// Allocate a "fake" staging image backed by plain host memory.
// w, h, c    image extent, recorded as width / height / depth
// elemsize   bytes per element
// No Vulkan image, view or device memory is created: image, imageview and
// memory are 0 and mapped_ptr points at a malloc'ed block of
// w * h * c * elemsize bytes.
// Returns 0 when the host allocation fails.
VkImageMemory* VkStagingAllocator::fastMalloc(int w, int h, int c, size_t elemsize, int /* elempack */)
{
    // staging image is mainly used for storing small piece of dynamic parameters
    // we allocate host memory as a fake image, it's simple and good

    // widen before multiplying so w * h * c cannot overflow int
    const size_t size = (size_t)w * h * c * elemsize;

    void* host_ptr = malloc(size);
    if (!host_ptr && size > 0)
    {
        // oom; malloc(0) is allowed to return 0 and is not treated as a failure
        return 0;
    }

    VkImageMemory* ptr = new VkImageMemory;

    ptr->image = 0;
    ptr->width = w;
    ptr->height = h;
    ptr->depth = c;
    ptr->format = VK_FORMAT_UNDEFINED;
    ptr->memory = 0;
    ptr->bind_offset = 0;
    ptr->bind_capacity = size;

    ptr->mapped_ptr = host_ptr;
    ptr->memory_type_index = (uint32_t)-1;

    ptr->imageview = 0;

    ptr->access_flags = 0;
    ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    ptr->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;
    ptr->command_refcount = 0;

    //     NCNN_LOGE("VkStagingAllocator M %p %d %d %d %d %d", ptr->image, dims, width, height, depth, format);

    return ptr;
}

// Release a fake staging image created by fastMalloc(w, h, c, ...).
// The host block behind mapped_ptr is freed, then the record itself.
void VkStagingAllocator::fastFree(VkImageMemory* ptr)
{
    //     NCNN_LOGE("VkStagingAllocator F %p", ptr->image);

    // mapped_ptr came from malloc in the matching fastMalloc
    free(ptr->mapped_ptr);

    delete ptr;
}

// Private state of VkWeightStagingAllocator; currently has no members.
class VkWeightStagingAllocatorPrivate
{
public:
};

// Create a weight staging allocator bound to _vkdev.
// The allocator is flagged mappable and coherent.
VkWeightStagingAllocator::VkWeightStagingAllocator(const VulkanDevice* _vkdev)
    : VkAllocator(_vkdev), d(new VkWeightStagingAllocatorPrivate)
{
    mappable = true;
    coherent = true;
}

// Release the private state. This allocator keeps no buffer cache, so
// there is nothing else to free here.
VkWeightStagingAllocator::~VkWeightStagingAllocator()
{
    delete d;
}

// Copy constructor, declared private in the class declaration.
// Nothing is copied from the source: vkdev and d are both initialized to 0.
VkWeightStagingAllocator::VkWeightStagingAllocator(const VkWeightStagingAllocator&)
    : VkAllocator(0), d(0)
{
}

// Assignment operator, declared private in the class declaration.
// It copies nothing and returns *this unchanged.
VkWeightStagingAllocator& VkWeightStagingAllocator::operator=(const VkWeightStagingAllocator&)
{
    return *this;
}

// Allocate a host-mapped transfer buffer of size bytes.
// A fresh buffer is created on every call (there is no cache), bound at
// offset zero and mapped.
// Returns 0 when memory allocation, binding or mapping fails; everything
// created up to that point is released first.
VkBufferMemory* VkWeightStagingAllocator::fastMalloc(size_t size)
{
    VkBufferMemory* ptr = new VkBufferMemory;

    ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT);
    ptr->offset = 0;

    VkMemoryRequirements memoryRequirements;
    vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements);

    // setup memory type, chosen once and cached on the allocator
    if (buffer_memory_type_index == (uint32_t)-1)
    {
        buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
    }

    ptr->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index);
    if (!ptr->memory)
    {
        vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
        delete ptr;
        return 0;
    }

    // ignore memoryRequirements.alignment as we always bind at zero offset
    VkResult ret = vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkBindBufferMemory failed %d", ret);
        vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
        vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
        delete ptr;
        return 0;
    }

    ptr->capacity = size;

    // a failed map must not leave an indeterminate host pointer behind
    ptr->mapped_ptr = 0;
    ret = vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkMapMemory failed %d", ret);
        vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0);
        vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0);
        delete ptr;
        return 0;
    }

    ptr->memory_type_index = buffer_memory_type_index;

    ptr->access_flags = 0;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;

    //     NCNN_LOGE("VkWeightStagingAllocator M %p %lu", ptr->buffer, size);

    return ptr;
}

// Destroy a buffer created by fastMalloc(size_t).
// The memory is unmapped, the buffer destroyed, the memory freed and the
// record deleted; nothing is cached for reuse.
void VkWeightStagingAllocator::fastFree(VkBufferMemory* ptr)
{
    //     NCNN_LOGE("VkWeightStagingAllocator F %p", ptr->buffer);

    VkDevice device = vkdev->vkdevice();
    VkDeviceMemory backing = ptr->memory;

    vkUnmapMemory(device, backing);
    vkDestroyBuffer(device, ptr->buffer, 0);
    vkFreeMemory(device, backing, 0);

    delete ptr;
}

// Image allocation is not provided by this allocator.
// All arguments are ignored and 0 is always returned.
VkImageMemory* VkWeightStagingAllocator::fastMalloc(int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/)
{
    return 0;
}

// No-op counterpart of the image fastMalloc, which never hands out images.
void VkWeightStagingAllocator::fastFree(VkImageMemory* /*ptr*/)
{
}

#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 26
// Create an allocator that imports the hardware buffer _hb on _vkdev.
// The buffer description, its Vulkan properties and the sampler ycbcr
// conversion are resolved right away through init().
VkAndroidHardwareBufferImageAllocator::VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb)
    : VkAllocator(_vkdev), hb(_hb)
{
    // start from a null handle so the destructor can tell whether init() created one
    samplerYcbcrConversion = 0;

    // NOTE: the return value of init() is not checked here
    init();
}

// Destroy the sampler ycbcr conversion if one was created.
VkAndroidHardwareBufferImageAllocator::~VkAndroidHardwareBufferImageAllocator()
{
    // nothing to release when the handle was never created
    if (!samplerYcbcrConversion)
        return;

    vkdev->vkDestroySamplerYcbcrConversionKHR(vkdev->vkdevice(), samplerYcbcrConversion, 0);
    samplerYcbcrConversion = 0;
}

// Copy constructor, declared private in the class declaration.
// Nothing is copied from the source; the base is built with a null device
// and the remaining members are left uninitialized.
VkAndroidHardwareBufferImageAllocator::VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&)
    : VkAllocator(0)
{
}

// Assignment operator, declared private in the class declaration.
// It copies nothing and returns *this unchanged.
VkAndroidHardwareBufferImageAllocator& VkAndroidHardwareBufferImageAllocator::operator=(const VkAndroidHardwareBufferImageAllocator&)
{
    return *this;
}

// Buffer allocation is not provided by this allocator.
// The size argument is ignored and 0 is always returned.
VkBufferMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(size_t /*size*/)
{
    return 0;
}

// No-op counterpart of the buffer fastMalloc, which never hands out buffers.
void VkAndroidHardwareBufferImageAllocator::fastFree(VkBufferMemory* /*ptr*/)
{
}

// Import the hardware buffer as a sampled 2D image.
// All arguments are ignored; the extent comes from bufferDesc and the
// format from the external format resolved in init().
// The image is created with an external format, the hardware buffer is
// imported as a dedicated allocation, bound at offset zero, and an image
// view carrying the sampler ycbcr conversion is created.
// Returns 0 on any Vulkan failure; the image and memory created up to that
// point are released first.
VkImageMemory* VkAndroidHardwareBufferImageAllocator::fastMalloc(int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/)
{
    VkResult ret;

    VkExternalFormatANDROID externalFormat;
    externalFormat.sType = VK_STRUCTURE_TYPE_EXTERNAL_FORMAT_ANDROID;
    externalFormat.pNext = 0;
    externalFormat.externalFormat = bufferFormatProperties.externalFormat;

    VkExternalMemoryImageCreateInfo externalMemoryImageCreateInfo;
    externalMemoryImageCreateInfo.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO;
    externalMemoryImageCreateInfo.pNext = &externalFormat;
    externalMemoryImageCreateInfo.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_ANDROID_HARDWARE_BUFFER_BIT_ANDROID;

    VkImageCreateInfo imageCreateInfo;
    imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO;
    imageCreateInfo.pNext = &externalMemoryImageCreateInfo;
    imageCreateInfo.flags = 0;
    imageCreateInfo.imageType = VK_IMAGE_TYPE_2D;
    imageCreateInfo.format = VK_FORMAT_UNDEFINED;
    imageCreateInfo.extent.width = bufferDesc.width;
    imageCreateInfo.extent.height = bufferDesc.height;
    imageCreateInfo.extent.depth = 1;
    imageCreateInfo.mipLevels = 1;
    imageCreateInfo.arrayLayers = 1;
    imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT;
    imageCreateInfo.tiling = VK_IMAGE_TILING_OPTIMAL;
    imageCreateInfo.usage = VK_IMAGE_USAGE_SAMPLED_BIT;
    imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
    imageCreateInfo.queueFamilyIndexCount = 0;
    imageCreateInfo.pQueueFamilyIndices = 0;
    imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;

    VkImage image = 0;
    ret = vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateImage failed %d", ret);
        return 0;
    }

    // setup memory type, chosen once and cached on the allocator
    if (image_memory_type_index == (uint32_t)-1)
    {
        image_memory_type_index = vkdev->find_memory_index(bufferProperties.memoryTypeBits, 0, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT);
    }

    VkImportAndroidHardwareBufferInfoANDROID importAndroidHardwareBufferInfo;
    importAndroidHardwareBufferInfo.sType = VK_STRUCTURE_TYPE_IMPORT_ANDROID_HARDWARE_BUFFER_INFO_ANDROID;
    importAndroidHardwareBufferInfo.pNext = 0;
    importAndroidHardwareBufferInfo.buffer = hb;

    VkMemoryDedicatedAllocateInfo memoryDedicatedAllocateInfo;
    memoryDedicatedAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO;
    memoryDedicatedAllocateInfo.pNext = &importAndroidHardwareBufferInfo;
    memoryDedicatedAllocateInfo.image = image;
    memoryDedicatedAllocateInfo.buffer = VK_NULL_HANDLE;

    VkMemoryAllocateInfo memoryAllocateInfo;
    memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
    memoryAllocateInfo.pNext = &memoryDedicatedAllocateInfo;
    memoryAllocateInfo.allocationSize = bufferProperties.allocationSize;
    memoryAllocateInfo.memoryTypeIndex = image_memory_type_index;

    VkDeviceMemory memory = 0;
    ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkAllocateMemory failed %d", ret);
        // the image created above would otherwise leak
        vkDestroyImage(vkdev->vkdevice(), image, 0);
        return 0;
    }

    VkBindImageMemoryInfo bindImageMemoryInfo;
    bindImageMemoryInfo.sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO;
    bindImageMemoryInfo.pNext = 0;
    bindImageMemoryInfo.image = image;
    bindImageMemoryInfo.memory = memory;
    bindImageMemoryInfo.memoryOffset = 0;
    ret = vkdev->vkBindImageMemory2KHR(vkdev->vkdevice(), 1, &bindImageMemoryInfo);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkBindImageMemory2KHR failed %d", ret);
        vkDestroyImage(vkdev->vkdevice(), image, 0);
        // the imported memory would otherwise leak
        vkFreeMemory(vkdev->vkdevice(), memory, 0);
        return 0;
    }

    // NOTE(review): externalFormat is chained here as in the original code;
    // confirm against the Vulkan spec whether this pNext is expected to be null
    VkSamplerYcbcrConversionInfoKHR samplerYcbcrConversionInfo;
    samplerYcbcrConversionInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_INFO_KHR;
    samplerYcbcrConversionInfo.pNext = &externalFormat;
    samplerYcbcrConversionInfo.conversion = samplerYcbcrConversion;

    VkImageViewCreateInfo imageViewCreateInfo;
    imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO;
    imageViewCreateInfo.pNext = &samplerYcbcrConversionInfo;
    imageViewCreateInfo.flags = 0;
    imageViewCreateInfo.image = image;
    imageViewCreateInfo.viewType = VK_IMAGE_VIEW_TYPE_2D;
    imageViewCreateInfo.format = VK_FORMAT_UNDEFINED;
    imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY;
    imageViewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
    imageViewCreateInfo.subresourceRange.baseMipLevel = 0;
    imageViewCreateInfo.subresourceRange.levelCount = 1;
    imageViewCreateInfo.subresourceRange.baseArrayLayer = 0;
    imageViewCreateInfo.subresourceRange.layerCount = 1;

    VkImageView imageview = 0;
    ret = vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateImageView failed %d", ret);
        vkDestroyImage(vkdev->vkdevice(), image, 0);
        vkFreeMemory(vkdev->vkdevice(), memory, 0);
        return 0;
    }

    VkImageMemory* ptr = new VkImageMemory;
    ptr->image = image;
    ptr->imageview = imageview;

    // describe the imported image instead of leaving these fields indeterminate
    ptr->width = (int)bufferDesc.width;
    ptr->height = (int)bufferDesc.height;
    ptr->depth = 1;
    ptr->format = VK_FORMAT_UNDEFINED;

    ptr->memory = memory;
    ptr->mapped_ptr = 0;
    ptr->memory_type_index = (uint32_t)-1;

    ptr->bind_offset = 0;
    ptr->bind_capacity = (size_t)bufferProperties.allocationSize;

    ptr->access_flags = 0;
    ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED;
    ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
    ptr->command_refcount = 0;

    return ptr;
}

// Release an image imported by fastMalloc: the view, the image and the
// imported device memory are destroyed, then the record is deleted.
void VkAndroidHardwareBufferImageAllocator::fastFree(VkImageMemory* ptr)
{
    VkDevice device = vkdev->vkdevice();

    vkDestroyImageView(device, ptr->imageview, 0);
    vkDestroyImage(device, ptr->image, 0);
    vkFreeMemory(device, ptr->memory, 0);

    delete ptr;
}

int VkAndroidHardwareBufferImageAllocator::init()
{
    AHardwareBuffer_describe(hb, &bufferDesc);

    VkResult ret;

    // resolve externalFormat
    bufferFormatProperties.sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_FORMAT_PROPERTIES_ANDROID;
    bufferFormatProperties.pNext = 0;

    bufferProperties.sType = VK_STRUCTURE_TYPE_ANDROID_HARDWARE_BUFFER_PROPERTIES_ANDROID;
    bufferProperties.pNext = &bufferFormatProperties;

    ret = vkdev->vkGetAndroidHardwareBufferPropertiesANDROID(vkdev->vkdevice(), hb, &bufferProperties);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkGetAndroidHardwareBufferPropertiesANDROID failed %d", ret);
        return -1;
    }

    // setup samplerYcbcrConversion
    VkExternalFormatANDROID externalFormat;
    externalFormat.sType = VK_STRUCTURE_TYPE_EXTERNAL_FORMAT_ANDROID;
    externalFormat.pNext = 0;
    externalFormat.externalFormat = bufferFormatProperties.externalFormat;

    VkSamplerYcbcrConversionCreateInfoKHR samplerYcbcrConversionCreateInfo;
    samplerYcbcrConversionCreateInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_YCBCR_CONVERSION_CREATE_INFO_KHR;
    samplerYcbcrConversionCreateInfo.pNext = &externalFormat;
    samplerYcbcrConversionCreateInfo.format = VK_FORMAT_UNDEFINED;
    samplerYcbcrConversionCreateInfo.ycbcrModel = bufferFormatProperties.suggestedYcbcrModel;
    samplerYcbcrConversionCreateInfo.ycbcrRange = bufferFormatProperties.suggestedYcbcrRange;
    samplerYcbcrConversionCreateInfo.components = bufferFormatProperties.samplerYcbcrConversionComponents;
    samplerYcbcrConversionCreateInfo.xChromaOffset = bufferFormatProperties.suggestedXChromaOffset;
    samplerYcbcrConversionCreateInfo.yChromaOffset = bufferFormatProperties.suggestedYChromaOffset;
    samplerYcbcrConversionCreateInfo.chromaFilter = VK_FILTER_NEAREST;
    samplerYcbcrConversionCreateInfo.forceExplicitReconstruction = VK_FALSE;

    ret = vkdev->vkCreateSamplerYcbcrConversionKHR(vkdev->vkdevice(), &samplerYcbcrConversionCreateInfo, 0, &samplerYcbcrConversion);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateSamplerYcbcrConversionKHR failed %d", ret);
        return -1;
    }

    return 0;
}

// Width of the hardware buffer, as stored in bufferDesc by init().
int VkAndroidHardwareBufferImageAllocator::width() const
{
    return bufferDesc.width;
}

// Height of the hardware buffer, as stored in bufferDesc by init().
int VkAndroidHardwareBufferImageAllocator::height() const
{
    return bufferDesc.height;
}

// External format value stored in bufferFormatProperties by the query in init().
uint64_t VkAndroidHardwareBufferImageAllocator::external_format() const
{
    return bufferFormatProperties.externalFormat;
}
#endif // __ANDROID_API__ >= 26
#endif // NCNN_PLATFORM_API

#endif // NCNN_VULKAN

} // namespace ncnn


================================================
FILE: src/allocator.h
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef NCNN_ALLOCATOR_H
#define NCNN_ALLOCATOR_H

#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif

#include "platform.h"

#include <stdlib.h>

#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 26
#include <android/hardware_buffer.h>
#endif // __ANDROID_API__ >= 26
#endif // NCNN_PLATFORM_API

namespace ncnn {

// the alignment, in bytes, of all the buffers returned by fastMalloc
// 64 when NCNN_AVX512 is set, 32 when NCNN_AVX is set, 16 otherwise
#if NCNN_AVX512
#define NCNN_MALLOC_ALIGN 64
#elif NCNN_AVX
#define NCNN_MALLOC_ALIGN 32
#else
#define NCNN_MALLOC_ALIGN 16
#endif

// we have some optimized kernels that may overread the buffer a bit in a loop
// it is common to interleave the next iteration's data load with arithmetic instructions
// fastMalloc allocates this many extra bytes to keep us safe from SEGV_ACCERR failures
#define NCNN_MALLOC_OVERREAD 64

// Rounds a pointer up to the next multiple of n bytes
// ptr Pointer to align
// n Alignment size that must be a power of two, defaults to sizeof(_Tp)
// The returned address is >= ptr and is unchanged when ptr is already aligned
template<typename _Tp>
static NCNN_FORCEINLINE _Tp* alignPtr(_Tp* ptr, int n = (int)sizeof(_Tp))
{
    // for a power-of-two n, -n has every bit set except the low log2(n) bits
    const size_t keep_mask = (size_t)(-n);
    const size_t bumped = (size_t)ptr + n - 1;
    return (_Tp*)(bumped & keep_mask);
}

// Rounds a buffer size up to a multiple of n bytes
// Returns the smallest value that is >= sz and divisible by n
// sz Buffer size to align
// n Alignment size that must be a power of two
static NCNN_FORCEINLINE size_t alignSize(size_t sz, int n)
{
    // for a power-of-two n, -n has every bit set except the low log2(n) bits
    const size_t keep_mask = (size_t)(-n);
    const size_t padded = sz + n - 1;
    return padded & keep_mask;
}

// Allocates size bytes plus NCNN_MALLOC_OVERREAD bytes of padding, aligned
// to NCNN_MALLOC_ALIGN, using whichever aligned allocation call the platform offers
// The result must be released with the matching fastFree(void*)
// Returns 0 on failure in the posix_memalign and plain malloc branches
static NCNN_FORCEINLINE void* fastMalloc(size_t size)
{
#if _MSC_VER
    return _aligned_malloc(size + NCNN_MALLOC_OVERREAD, NCNN_MALLOC_ALIGN);
#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
    void* ptr = 0;
    // posix_memalign reports failure through its return value, normalize that to a null pointer
    if (posix_memalign(&ptr, NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD))
        ptr = 0;
    return ptr;
#elif __ANDROID__ && __ANDROID_API__ < 17
    return memalign(NCNN_MALLOC_ALIGN, size + NCNN_MALLOC_OVERREAD);
#else
    // generic fallback: over-allocate with malloc, align by hand, and stash the
    // raw malloc pointer in the slot just before the aligned address for fastFree
    unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + NCNN_MALLOC_ALIGN + NCNN_MALLOC_OVERREAD);
    if (!udata)
        return 0;
    unsigned char** adata = alignPtr((unsigned char**)udata + 1, NCNN_MALLOC_ALIGN);
    adata[-1] = udata;
    return adata;
#endif
}

// Releases a block obtained from fastMalloc(size_t)
// A null ptr is ignored
// Each branch mirrors the branch of fastMalloc selected by the same condition
static NCNN_FORCEINLINE void fastFree(void* ptr)
{
    if (ptr)
    {
#if _MSC_VER
        _aligned_free(ptr);
#elif (defined(__unix__) || defined(__APPLE__)) && _POSIX_C_SOURCE >= 200112L || (__ANDROID__ && __ANDROID_API__ >= 17)
        free(ptr);
#elif __ANDROID__ && __ANDROID_API__ < 17
        free(ptr);
#else
        // generic fallback: the raw malloc pointer was stored just before the aligned address
        unsigned char* udata = ((unsigned char**)ptr)[-1];
        free(udata);
#endif
    }
}

// NCNN_XADD(addr, delta) adds delta to *addr and yields the value *addr held
// before the addition
// With NCNN_THREADS the compiler-specific atomic fetch-add is used where one
// is selected below; the plain function fallbacks are not atomic
#if NCNN_THREADS
// exchange-add operation for atomic operations on reference counters
#if defined __riscv && !defined __riscv_atomic
// riscv target without A extension
// plain read-modify-write, not atomic
static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
{
    int tmp = *addr;
    *addr += delta;
    return tmp;
}
#elif defined __INTEL_COMPILER && !(defined WIN32 || defined _WIN32)
// atomic increment on the linux version of the Intel(tm) compiler
#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd(const_cast<void*>(reinterpret_cast<volatile void*>(addr)), delta)
#elif defined __GNUC__
#if defined __clang__ && __clang_major__ >= 3 && !defined __ANDROID__ && !defined __EMSCRIPTEN__ && !defined(__CUDACC__)
// clang, excluding android, emscripten and cuda builds
#ifdef __ATOMIC_ACQ_REL
#define NCNN_XADD(addr, delta) __c11_atomic_fetch_add((_Atomic(int)*)(addr), delta, __ATOMIC_ACQ_REL)
#else
#define NCNN_XADD(addr, delta) __atomic_fetch_add((_Atomic(int)*)(addr), delta, 4)
#endif
#else
#if defined __ATOMIC_ACQ_REL && !defined __clang__
// version for gcc >= 4.7
#define NCNN_XADD(addr, delta) (int)__atomic_fetch_add((unsigned*)(addr), (unsigned)(delta), __ATOMIC_ACQ_REL)
#else
// remaining GNU-compatible compilers use the __sync builtin
#define NCNN_XADD(addr, delta) (int)__sync_fetch_and_add((unsigned*)(addr), (unsigned)(delta))
#endif
#endif
#elif defined _MSC_VER && !defined RC_INVOKED
#define NCNN_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
#else
// thread-unsafe branch
static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
{
    int tmp = *addr;
    *addr += delta;
    return tmp;
}
#endif
#else  // NCNN_THREADS
// threads disabled: a plain read-modify-write is used
static NCNN_FORCEINLINE int NCNN_XADD(int* addr, int delta)
{
    int tmp = *addr;
    *addr += delta;
    return tmp;
}
#endif // NCNN_THREADS

// Abstract interface for host memory allocators.
// Implementations provide a matching fastMalloc / fastFree pair.
class NCNN_EXPORT Allocator
{
public:
    virtual ~Allocator();
    // allocate size bytes
    virtual void* fastMalloc(size_t size) = 0;
    // release a block obtained from fastMalloc of the same allocator
    virtual void fastFree(void* ptr) = 0;
};

class PoolAllocatorPrivate;
// Allocator implementation that keeps budgets of memory and exposes tuning
// knobs for them; its state lives behind the d pointer.
// NOTE(review): judging by the sibling class name this is the locked variant
// of UnlockedPoolAllocator - confirm against the implementation.
class NCNN_EXPORT PoolAllocator : public Allocator
{
public:
    PoolAllocator();
    ~PoolAllocator();

    // ratio range 0 ~ 1
    // default cr = 0
    void set_size_compare_ratio(float scr);

    // budget drop threshold
    // default threshold = 10
    void set_size_drop_threshold(size_t);

    // release all budgets immediately
    void clear();

    virtual void* fastMalloc(size_t size);
    virtual void fastFree(void* ptr);

private:
    // copying is disabled: both are declared private
    PoolAllocator(const PoolAllocator&);
    PoolAllocator& operator=(const PoolAllocator&);

private:
    // private implementation state
    PoolAllocatorPrivate* const d;
};

class UnlockedPoolAllocatorPrivate;
// Allocator implementation with the same public interface as PoolAllocator;
// its state lives behind the d pointer.
// NOTE(review): the name suggests it performs no locking - confirm against
// the implementation before sharing an instance between threads.
class NCNN_EXPORT UnlockedPoolAllocator : public Allocator
{
public:
    UnlockedPoolAllocator();
    ~UnlockedPoolAllocator();

    // ratio range 0 ~ 1
    // default cr = 0
    void set_size_compare_ratio(float scr);

    // budget drop threshold
    // default threshold = 10
    void set_size_drop_threshold(size_t);

    // release all budgets immediately
    void clear();

    virtual void* fastMalloc(size_t size);
    virtual void fastFree(void* ptr);

private:
    // copying is disabled: both are declared private
    UnlockedPoolAllocator(const UnlockedPoolAllocator&);
    UnlockedPoolAllocator& operator=(const UnlockedPoolAllocator&);

private:
    // private implementation state
    UnlockedPoolAllocatorPrivate* const d;
};

#if NCNN_VULKAN

class VulkanDevice;

// Record describing a Vulkan buffer and the device memory behind it.
// Instances are created by VkAllocator::fastMalloc(size_t) and released
// through the matching fastFree.
class NCNN_EXPORT VkBufferMemory
{
public:
    VkBuffer buffer;

    // the base offset assigned by allocator
    size_t offset;
    // usable size in bytes as recorded by the allocator
    size_t capacity;

    VkDeviceMemory memory;
    // host address of the mapping; the staging allocators fill it via vkMapMemory
    void* mapped_ptr;

    // memory type index the memory was allocated from
    uint32_t memory_type_index;

    // buffer state, modified by command functions internally
    mutable VkAccessFlags access_flags;
    mutable VkPipelineStageFlags stage_flags;

    // initialize and modified by mat
    int refcount;
};

// Record describing a Vulkan image, its view and the device memory behind it.
// Instances are created by VkAllocator::fastMalloc(w, h, c, ...) and released
// through the matching fastFree.
class NCNN_EXPORT VkImageMemory
{
public:
    // both are 0 for the host-only fake images of VkStagingAllocator
    VkImage image;
    VkImageView imageview;

    // underlying info assigned by allocator
    int width;
    int height;
    int depth;
    VkFormat format;

    VkDeviceMemory memory;
    // host address when the image is host backed, 0 otherwise
    void* mapped_ptr;

    // memory type index the memory was allocated from, (uint32_t)-1 when not applicable
    uint32_t memory_type_index;

    // the base offset assigned by allocator
    size_t bind_offset;
    size_t bind_capacity;

    // image state, modified by command functions internally
    mutable VkAccessFlags access_flags;
    mutable VkImageLayout image_layout;
    mutable VkPipelineStageFlags stage_flags;

    // in-execution state, modified by command functions internally
    mutable int command_refcount;

    // initialize and modified by mat
    int refcount;
};

// Abstract base of all Vulkan memory allocators.
// Subclasses implement a buffer and an image fastMalloc / fastFree pair; the
// protected helpers wrap the creation of buffers, images, views and memory.
class NCNN_EXPORT VkAllocator
{
public:
    explicit VkAllocator(const VulkanDevice* _vkdev);
    virtual ~VkAllocator();

    virtual void clear();

    // buffer allocation; 0 is returned on failure by the implementations in allocator.cpp
    virtual VkBufferMemory* fastMalloc(size_t size) = 0;
    virtual void fastFree(VkBufferMemory* ptr) = 0;
    // NOTE(review): presumably flush / invalidate the mapped range of ptr for
    // non-coherent memory - confirm against the implementation
    virtual int flush(VkBufferMemory* ptr);
    virtual int invalidate(VkBufferMemory* ptr);

    // image allocation; 0 is returned on failure by the implementations in allocator.cpp
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack) = 0;
    virtual void fastFree(VkImageMemory* ptr) = 0;

public:
    // device all Vulkan calls of this allocator are issued on
    const VulkanDevice* vkdev;
    // cached memory type choices; (uint32_t)-1 means not chosen yet
    uint32_t buffer_memory_type_index;
    uint32_t image_memory_type_index;
    uint32_t reserved_type_index;
    // properties of the memory handed out, set by the subclasses
    bool mappable;
    bool coherent;

protected:
    VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage);
    VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index);
    VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer);
    VkDeviceMemory allocate_import_host_memory(size_t size, uint32_t memory_type_index, void* host_ptr);

    VkImage create_image(int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage);
    VkImageView create_imageview(VkImage image, VkFormat format);
};

class VkBlobAllocatorPrivate;
// Vulkan allocator constructed with a preferred block size (16M by default);
// its state lives behind the d pointer.
// NOTE(review): presumably sub-allocates blobs out of blocks of that size -
// confirm against the implementation.
class NCNN_EXPORT VkBlobAllocator : public VkAllocator
{
public:
    explicit VkBlobAllocator(const VulkanDevice* vkdev, size_t preferred_block_size = 16 * 1024 * 1024); // 16M
    virtual ~VkBlobAllocator();

public:
    // release all budgets immediately
    virtual void clear();

    virtual VkBufferMemory* fastMalloc(size_t size);
    virtual void fastFree(VkBufferMemory* ptr);
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
    virtual void fastFree(VkImageMemory* ptr);

private:
    // copying is disabled: both are declared private
    VkBlobAllocator(const VkBlobAllocator&);
    VkBlobAllocator& operator=(const VkBlobAllocator&);

private:
    // private implementation state
    VkBlobAllocatorPrivate* const d;
};

class VkWeightAllocatorPrivate;
// Vulkan allocator for weight data, constructed with a preferred block size
// (8M by default) and an optional preference for host memory; its state
// lives behind the d pointer.
class NCNN_EXPORT VkWeightAllocator : public VkAllocator
{
public:
    // prefer_host_memory asks the allocator to try host memory first; the
    // implementation falls back to device memory when that fails
    explicit VkWeightAllocator(const VulkanDevice* vkdev, bool prefer_host_memory = false, size_t preferred_block_size = 8 * 1024 * 1024); // 8M
    // convenience overload: same as above with prefer_host_memory = false
    explicit VkWeightAllocator(const VulkanDevice* vkdev, size_t preferred_block_size)
        : VkWeightAllocator(vkdev, false, preferred_block_size)
    {
    }
    virtual ~VkWeightAllocator();

public:
    // release all blocks immediately
    virtual void clear();

public:
    virtual VkBufferMemory* fastMalloc(size_t size);
    virtual void fastFree(VkBufferMemory* ptr);
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
    virtual void fastFree(VkImageMemory* ptr);

private:
    // copying is disabled: both are declared private
    VkWeightAllocator(const VkWeightAllocator&);
    VkWeightAllocator& operator=(const VkWeightAllocator&);

private:
    // private implementation state
    VkWeightAllocatorPrivate* const d;
};

class VkStagingAllocatorPrivate;
// Vulkan allocator for host-mapped staging buffers.
// Freed buffers are kept in a budget list and reused by later allocations of
// a similar size; images are plain host memory without a Vulkan image.
class NCNN_EXPORT VkStagingAllocator : public VkAllocator
{
public:
    explicit VkStagingAllocator(const VulkanDevice* vkdev);
    virtual ~VkStagingAllocator();

public:
    // ratio range 0 ~ 1
    // default cr = 0.75
    void set_size_compare_ratio(float scr);

    // release all budgets immediately
    virtual void clear();

    // returns a mapped buffer, reusing a cached one when the size fits
    virtual VkBufferMemory* fastMalloc(size_t size);
    // returns the buffer to the budget list without destroying it
    virtual void fastFree(VkBufferMemory* ptr);
    // returns a host-only fake image of w * h * c * elemsize bytes
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
    virtual void fastFree(VkImageMemory* ptr);

private:
    // copying is disabled: both are declared private
    VkStagingAllocator(const VkStagingAllocator&);
    VkStagingAllocator& operator=(const VkStagingAllocator&);

private:
    // private implementation state
    VkStagingAllocatorPrivate* const d;
};

// opaque implementation type, only referenced through the d pointer below
class VkWeightStagingAllocatorPrivate;
// VkAllocator subclass that overrides the buffer and image fastMalloc/fastFree pairs
// NOTE(review): presumably the staging counterpart used while uploading weights - confirm against allocator.cpp
class NCNN_EXPORT VkWeightStagingAllocator : public VkAllocator
{
public:
    explicit VkWeightStagingAllocator(const VulkanDevice* vkdev);
    virtual ~VkWeightStagingAllocator();

public:
    // buffer allocation of size bytes, and its matching release
    virtual VkBufferMemory* fastMalloc(size_t size);
    virtual void fastFree(VkBufferMemory* ptr);
    // image allocation described by w/h/c, elemsize and elempack, and its matching release
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
    virtual void fastFree(VkImageMemory* ptr);

private:
    // copy constructor and copy assignment are declared private
    VkWeightStagingAllocator(const VkWeightStagingAllocator&);
    VkWeightStagingAllocator& operator=(const VkWeightStagingAllocator&);

private:
    // pointer to the private implementation object
    VkWeightStagingAllocatorPrivate* const d;
};

#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 26
// VkAllocator subclass constructed from an AHardwareBuffer
// only declared when NCNN_PLATFORM_API is enabled and __ANDROID_API__ >= 26
class NCNN_EXPORT VkAndroidHardwareBufferImageAllocator : public VkAllocator
{
public:
    // _vkdev  device the allocator is created for
    // _hb     android hardware buffer this allocator refers to
    VkAndroidHardwareBufferImageAllocator(const VulkanDevice* _vkdev, AHardwareBuffer* _hb);
    virtual ~VkAndroidHardwareBufferImageAllocator();

public:
    // buffer allocation of size bytes, and its matching release
    virtual VkBufferMemory* fastMalloc(size_t size);
    virtual void fastFree(VkBufferMemory* ptr);
    // image allocation described by w/h/c, elemsize and elempack, and its matching release
    virtual VkImageMemory* fastMalloc(int w, int h, int c, size_t elemsize, int elempack);
    virtual void fastFree(VkImageMemory* ptr);

private:
    // copy constructor and copy assignment are declared private
    VkAndroidHardwareBufferImageAllocator(const VkAndroidHardwareBufferImageAllocator&);
    VkAndroidHardwareBufferImageAllocator& operator=(const VkAndroidHardwareBufferImageAllocator&);

public:
    // NOTE(review): presumably fills the descriptor/property members below and returns 0 on success - confirm against allocator.cpp
    int init();

    // accessors for the hardware buffer geometry and external format
    int width() const;
    int height() const;
    uint64_t external_format() const;

public:
    // the hardware buffer handle
    AHardwareBuffer* hb;
    // descriptor of the hardware buffer
    AHardwareBuffer_Desc bufferDesc;
    // vulkan format properties of the hardware buffer
    VkAndroidHardwareBufferFormatPropertiesANDROID bufferFormatProperties;
    // vulkan properties of the hardware buffer
    VkAndroidHardwareBufferPropertiesANDROID bufferProperties;
    // ycbcr conversion object associated with the buffer
    VkSamplerYcbcrConversionKHR samplerYcbcrConversion;
};
#endif // __ANDROID_API__ >= 26
#endif // NCNN_PLATFORM_API

#endif // NCNN_VULKAN

} // namespace ncnn

#endif // NCNN_ALLOCATOR_H


================================================
FILE: src/benchmark.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "benchmark.h"

// USE_CXX11_CLOCK selects the <chrono> based implementation below.
// It is 1 when the compiler reports C++11 or newer (through __cplusplus or _MSVC_LANG),
// the target is not RISC-V (__riscv undefined) and NCNN_SIMPLESTL is off; otherwise it is 0
// and the platform APIs (QueryPerformanceCounter / gettimeofday) are used.
#if (__cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L)) && !defined(__riscv) && !NCNN_SIMPLESTL
#define USE_CXX11_CLOCK 1
#else
#define USE_CXX11_CLOCK 0
#endif

#if USE_CXX11_CLOCK
#include <chrono>
#if NCNN_THREADS
#include <thread>
#endif
#include <numeric>
#include <algorithm>
#endif // USE_CXX11_CLOCK

#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#else                 // _WIN32
#include <sys/time.h> //gettimeofday()
#include <unistd.h>   // sleep()
#endif                // _WIN32

#if NCNN_BENCHMARK
#include "layer/convolution.h"
#include "layer/convolutiondepthwise.h"
#include "layer/deconvolution.h"
#include "layer/deconvolutiondepthwise.h"
#include "layer/convolution3d.h"
#include "layer/convolutiondepthwise3d.h"
#include "layer/deconvolution3d.h"
#include "layer/deconvolutiondepthwise3d.h"

#include <stdio.h>
#endif // NCNN_BENCHMARK

namespace ncnn {

// Return the current timestamp in milliseconds as a double.
// The clock source depends on the build: std::chrono::high_resolution_clock when
// USE_CXX11_CLOCK is set, QueryPerformanceCounter on Windows, gettimeofday elsewhere.
double get_current_time()
{
#if USE_CXX11_CLOCK
    typedef std::chrono::high_resolution_clock clock_type;

    // microsecond resolution, reported as fractional milliseconds
    const clock_type::time_point tp = clock_type::now();
    const std::chrono::microseconds elapsed_us = std::chrono::duration_cast<std::chrono::microseconds>(tp.time_since_epoch());
    return elapsed_us.count() / 1000.0;
#elif defined(_WIN32)
    LARGE_INTEGER frequency;
    LARGE_INTEGER counter;
    QueryPerformanceFrequency(&frequency);
    QueryPerformanceCounter(&counter);

    // ticks * 1000 / ticks-per-second -> milliseconds
    return counter.QuadPart * 1000.0 / frequency.QuadPart;
#else
    struct timeval now;
    gettimeofday(&now, NULL);

    // seconds and microseconds are both scaled to milliseconds
    return now.tv_sec * 1000.0 + now.tv_usec / 1000.0;
#endif
}

// Suspend the calling thread for the given number of milliseconds.
// Backend selection, in order: std::this_thread::sleep_for (C++11 with threads),
// Sleep (Windows), usleep (unix / apple), nanosleep (_POSIX_TIMERS).
// On platforms matching none of these the call returns immediately.
void sleep(unsigned long long int milliseconds)
{
#if USE_CXX11_CLOCK && NCNN_THREADS
    std::this_thread::sleep_for(std::chrono::milliseconds(milliseconds));
#else
#ifdef _WIN32
    Sleep(milliseconds);
#elif defined(__unix__) || defined(__APPLE__)
    usleep(milliseconds * 1000);
#elif _POSIX_TIMERS
    // split into whole seconds plus the nanosecond remainder,
    // so that sub-second requests are not truncated to zero
    struct timespec ts;
    ts.tv_sec = milliseconds / 1000;
    ts.tv_nsec = (milliseconds % 1000) * 1000000;
    nanosleep(&ts, &ts);
#else
    // TODO How to handle it ?
    (void)milliseconds;
#endif
#endif
}

#if NCNN_BENCHMARK

// Print one benchmark line for a layer to stderr: type, name and elapsed time (end - start),
// followed by an empty shape column.
void benchmark(const Layer* layer, double start, double end)
{
    const char* layer_type = layer->type.c_str();
    const char* layer_name = layer->name.c_str();
    const double elapsed = end - start;

    fprintf(stderr, "%-24s %-30s %8.2lfms", layer_type, layer_name, elapsed);
    fprintf(stderr, "    |");
    fprintf(stderr, "\n");
}

void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end)
{
    fprintf(stderr, "%-24s %-30s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start);

    char in_shape_str[64] = {'\0'};
    char out_shape_str[64] = {'\0'};

    if (bottom_blob.dims == 1)
    {
        sprintf(in_shape_str, "[%3d *%d]", bottom_blob.w, bottom_blob.elempack);
    }
    if (bottom_blob.dims == 2)
    {
        sprintf(in_shape_str, "[%3d, %3d *%d]", bottom_blob.w, bottom_blob.h, bottom_blob.elempack);
    }
    if (bottom_blob.dims == 3)
    {
        sprintf(in_shape_str, "[%3d, %3d, %3d *%d]", bottom_blob.w, bottom_blob.h, bottom_blob.c, bottom_blob.elempack);
    }
    if (bottom_blob.dims == 4)
    {
        sprintf(in_shape_str, "[%3d, %3d, %3d, %3d *%d]", bottom_blob.w, bottom_blob.h, bottom_blob.d, bottom_blob.c, bottom_blob.elempack);
    }

    if (top_blob.dims == 1)
    {
        sprintf(out_shape_str, "[%3d *%d]", top_blob.w, top_blob.elempack);
    }
    if (top_blob.dims == 2)
    {
        sprintf(out_shape_str, "[%3d, %3d *%d]", top_blob.w, top_blob.h, top_blob.elempack);
    }
    if (top_blob.dims == 3)
    {
        sprintf(out_shape_str, "[%3d, %3d, %3d *%d]", top_blob.w, top_blob.h, top_blob.c, top_blob.elempack);
    }
    if (top_blob.dims == 4)
    {
        sprintf(out_shape_str, "[%3d, %3d, %3d, %3d *%d]", top_blob.w, top_blob.h, top_blob.d, top_blob.c, top_blob.elempack);
    }

    fprintf(stderr, "    | %22s -> %-22s", in_shape_str, out_shape_str);

    if (layer->type == "Convolution")
    {
        fprintf(stderr, "     kernel: %1d x %1d     stride: %1d x %1d",
                ((Convolution*)layer)->kernel_w,
                ((Convolution*)layer)->kernel_h,
                ((Convolution*)layer)->stride_w,
                ((Convolution*)layer)->stride_h);
    }
    else if (layer->type == "ConvolutionDepthWise")
    {
        fprintf(stderr, "     kernel: %1d x %1d     stride: %1d x %1d",
                ((ConvolutionDepthWise*)layer)->kernel_w,
                ((ConvolutionDepthWise*)layer)->kernel_h,
                ((ConvolutionDepthWise*)layer)->stride_w,
                ((ConvolutionDepthWise*)layer)->stride_h);
    }
    else if (layer->type == "Deconvolution")
    {
        fprintf(stderr, "     kernel: %1d x %1d     stride: %1d x %1d",
                ((Deconvolution*)layer)->kernel_w,
                ((Deconvolution*)layer)->kernel_h,
                ((Deconvolution*)layer)->stride_w,
                ((Deconvolution*)layer)->stride_h);
    }
    else if (layer->type == "DeconvolutionDepthWise")
    {
        fprintf(stderr, "     kernel: %1d x %1d     stride: %1d x %1d",
                ((DeconvolutionDepthWise*)layer)->kernel_w,
                ((DeconvolutionDepthWise*)layer)->kernel_h,
                ((DeconvolutionDepthWise*)layer)->stride_w,
                ((DeconvolutionDepthWise*)layer)->stride_h);
    }
    else if (layer->type == "Convolution3D")
    {
        fprintf(stderr, "     kernel: %1d x %1d x %1d    stride: %1d x %1d x %1d",
                ((Convolution3D*)layer)->kernel_w,
                ((Convolution3D*)layer)->kernel_h,
                ((Convolution3D*)layer)->kernel_d,
                ((Convolution3D*)layer)->stride_w,
                ((Convolution3D*)layer)->stride_h,
                ((Convolution3D*)layer)->stride_d);
    }
    else if (layer->type == "ConvolutionDepthWise3D")
    {
        fprintf(stderr, "     kernel: %1d x %1d x %1d    stride: %1d x %1d x %1d",
                ((ConvolutionDepthWise3D*)layer)->kernel_w,
                ((ConvolutionDepthWise3D*)layer)->kernel_h,
                ((ConvolutionDepthWise3D*)layer)->kernel_d,
                ((ConvolutionDepthWise3D*)layer)->stride_w,
                ((ConvolutionDepthWise3D*)layer)->stride_h,
                ((ConvolutionDepthWise3D*)layer)->stride_d);
    }
    else if (layer->type == "Deconvolution3D")
    {
        fprintf(stderr, "     kernel: %1d x %1d x %1d    stride: %1d x %1d x %1d",
                ((Deconvolution3D*)layer)->kernel_w,
                ((Deconvolution3D*)layer)->kernel_h,
                ((Deconvolution3D*)layer)->kernel_d,
                ((Deconvolution3D*)layer)->stride_w,
                ((Deconvolution3D*)layer)->stride_h,
                ((Deconvolution3D*)layer)->stride_d);
    }
    else if (layer->type == "DeconvolutionDepthWise3D")
    {
        fprintf(stderr, "     kernel: %1d x %1d x %1d    stride: %1d x %1d x %1d",
                ((DeconvolutionDepthWise3D*)layer)->kernel_w,
                ((DeconvolutionDepthWise3D*)layer)->kernel_h,
                ((DeconvolutionDepthWise3D*)layer)->kernel_d,
                ((DeconvolutionDepthWise3D*)layer)->stride_w,
                ((DeconvolutionDepthWise3D*)layer)->stride_h,
                ((DeconvolutionDepthWise3D*)layer)->stride_d);
    }
    fprintf(stderr, "\n");
}

#endif // NCNN_BENCHMARK

} // namespace ncnn


================================================
FILE: src/benchmark.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef NCNN_BENCHMARK_H
#define NCNN_BENCHMARK_H

#include "layer.h"
#include "mat.h"
#include "platform.h"

namespace ncnn {

// get the current timestamp in milliseconds
// intended for measuring intervals by subtracting two results
NCNN_EXPORT double get_current_time();

// suspend the calling thread for the given number of milliseconds (default 1000)
NCNN_EXPORT void sleep(unsigned long long int milliseconds = 1000);

#if NCNN_BENCHMARK

// report one layer timing; start and end are timestamps in milliseconds
NCNN_EXPORT void benchmark(const Layer* layer, double start, double end);
// same as above, additionally taking the input and output blobs of the layer
NCNN_EXPORT void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end);

#endif // NCNN_BENCHMARK

} // namespace ncnn

#endif // NCNN_BENCHMARK_H


================================================
FILE: src/blob.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "blob.h"

namespace ncnn {

// a fresh blob has neither producer nor consumer layer, marked by index -1
Blob::Blob()
    : producer(-1), consumer(-1)
{
}

} // namespace ncnn


================================================
FILE: src/blob.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef NCNN_BLOB_H
#define NCNN_BLOB_H

#include "mat.h"
#include "platform.h"

namespace ncnn {

// Blob describes one named data edge of the network graph:
// which layer writes it, which layer reads it, and an optional shape hint.
class NCNN_EXPORT Blob
{
public:
    // empty
    // producer and consumer start as -1
    Blob();

public:
#if NCNN_STRING
    // blob name
    std::string name;
#endif // NCNN_STRING
    // layer index which produce this blob as output
    int producer;
    // layer index which need this blob as input
    int consumer;
    // shape hint
    Mat shape;
};

} // namespace ncnn

#endif // NCNN_BLOB_H


================================================
FILE: src/c_api.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "platform.h"

#if NCNN_C_API

#include "c_api.h"

#include <stdlib.h>

#include "allocator.h"
#include "blob.h"
#include "datareader.h"
#include "layer.h"
#include "mat.h"
#include "modelbin.h"
#include "net.h"
#include "option.h"
#include "paramdict.h"

using ncnn::Allocator;
using ncnn::Blob;
using ncnn::DataReader;
using ncnn::Extractor;
using ncnn::Layer;
using ncnn::Mat;
using ncnn::ModelBin;
using ncnn::Net;
using ncnn::Option;
using ncnn::ParamDict;

#ifdef __cplusplus
extern "C" {
#endif

/* return the library version as a string, taken from NCNN_VERSION_STRING */
const char* ncnn_version()
{
    return NCNN_VERSION_STRING;
}

/* return the library version as an integer, taken from NCNN_VERSION_NUMBER */
int ncnn_version_number()
{
    return NCNN_VERSION_NUMBER;
}

/* allocator api */
/* PoolAllocator whose virtual entry points are routed through the function */
/* table of the owning C handle, so that C code may replace them */
class PoolAllocator_c_api : public ncnn::PoolAllocator
{
public:
    PoolAllocator_c_api(ncnn_allocator_t _allocator)
        : ncnn::PoolAllocator(), allocator(_allocator)
    {
    }

    /* dispatch to the fast_malloc entry of the C handle */
    virtual void* fastMalloc(size_t size)
    {
        void* ptr = allocator->fast_malloc(allocator, size);
        return ptr;
    }

    /* dispatch to the fast_free entry of the C handle */
    virtual void fastFree(void* ptr)
    {
        allocator->fast_free(allocator, ptr);
    }

public:
    /* the C handle that owns this object */
    ncnn_allocator_t allocator;
};

/* default fast_malloc entry: calls the PoolAllocator implementation non-virtually */
static void* __ncnn_PoolAllocator_fast_malloc(ncnn_allocator_t allocator, size_t size)
{
    ncnn::PoolAllocator* self = (ncnn::PoolAllocator*)allocator->pthis;
    return self->ncnn::PoolAllocator::fastMalloc(size);
}

/* default fast_free entry: calls the PoolAllocator implementation non-virtually */
static void __ncnn_PoolAllocator_fast_free(ncnn_allocator_t allocator, void* ptr)
{
    ncnn::PoolAllocator* self = (ncnn::PoolAllocator*)allocator->pthis;
    self->ncnn::PoolAllocator::fastFree(ptr);
}

/* UnlockedPoolAllocator whose virtual entry points are routed through the function */
/* table of the owning C handle, so that C code may replace them */
class UnlockedPoolAllocator_c_api : public ncnn::UnlockedPoolAllocator
{
public:
    UnlockedPoolAllocator_c_api(ncnn_allocator_t _allocator)
        : ncnn::UnlockedPoolAllocator(), allocator(_allocator)
    {
    }

    /* dispatch to the fast_malloc entry of the C handle */
    virtual void* fastMalloc(size_t size)
    {
        void* ptr = allocator->fast_malloc(allocator, size);
        return ptr;
    }

    /* dispatch to the fast_free entry of the C handle */
    virtual void fastFree(void* ptr)
    {
        allocator->fast_free(allocator, ptr);
    }

public:
    /* the C handle that owns this object */
    ncnn_allocator_t allocator;
};

/* default fast_malloc entry: calls the UnlockedPoolAllocator implementation non-virtually */
static void* __ncnn_UnlockedPoolAllocator_fast_malloc(ncnn_allocator_t allocator, size_t size)
{
    ncnn::UnlockedPoolAllocator* self = (ncnn::UnlockedPoolAllocator*)allocator->pthis;
    return self->ncnn::UnlockedPoolAllocator::fastMalloc(size);
}

/* default fast_free entry: calls the UnlockedPoolAllocator implementation non-virtually */
static void __ncnn_UnlockedPoolAllocator_fast_free(ncnn_allocator_t allocator, void* ptr)
{
    ncnn::UnlockedPoolAllocator* self = (ncnn::UnlockedPoolAllocator*)allocator->pthis;
    self->ncnn::UnlockedPoolAllocator::fastFree(ptr);
}

/* create a C allocator handle backed by a PoolAllocator */
/* returns NULL when the handle itself cannot be allocated */
/* release the result with ncnn_allocator_destroy */
ncnn_allocator_t ncnn_allocator_create_pool_allocator()
{
    ncnn_allocator_t allocator = (ncnn_allocator_t)malloc(sizeof(struct __ncnn_allocator_t));
    if (!allocator)
        return NULL;

    allocator->pthis = (void*)(new PoolAllocator_c_api(allocator));
    allocator->fast_malloc = __ncnn_PoolAllocator_fast_malloc;
    allocator->fast_free = __ncnn_PoolAllocator_fast_free;
    return allocator;
}

/* create a C allocator handle backed by an UnlockedPoolAllocator */
/* returns NULL when the handle itself cannot be allocated */
/* release the result with ncnn_allocator_destroy */
ncnn_allocator_t ncnn_allocator_create_unlocked_pool_allocator()
{
    ncnn_allocator_t allocator = (ncnn_allocator_t)malloc(sizeof(struct __ncnn_allocator_t));
    if (!allocator)
        return NULL;

    allocator->pthis = (void*)(new UnlockedPoolAllocator_c_api(allocator));
    allocator->fast_malloc = __ncnn_UnlockedPoolAllocator_fast_malloc;
    allocator->fast_free = __ncnn_UnlockedPoolAllocator_fast_free;
    return allocator;
}

/* destroy the C++ allocator object and the handle; a NULL handle is ignored */
void ncnn_allocator_destroy(ncnn_allocator_t allocator)
{
    if (!allocator)
        return;

    Allocator* impl = (Allocator*)allocator->pthis;
    delete impl;
    free(allocator);
}

/* option api */
/* create a default-constructed Option and return it as an opaque handle */
ncnn_option_t ncnn_option_create()
{
    Option* option = new Option();
    return (ncnn_option_t)(option);
}

/* destroy an option handle */
void ncnn_option_destroy(ncnn_option_t opt)
{
    Option* option = (Option*)opt;
    delete option;
}

int ncnn_option_get_num_threads(const ncnn_option_t opt)
{
    return ((const Option*)opt)->num_threads;
}

void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads)
{
    ((Option*)opt)->num_threads = num_threads;
}

/* set Option::blob_allocator from a C handle; a NULL handle stores NULL */
void ncnn_option_set_blob_allocator(ncnn_option_t opt, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    Option* option = (Option*)opt;
    option->blob_allocator = impl;
}

/* set Option::workspace_allocator from a C handle; a NULL handle stores NULL */
void ncnn_option_set_workspace_allocator(ncnn_option_t opt, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    Option* option = (Option*)opt;
    option->workspace_allocator = impl;
}

int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt)
{
#if NCNN_VULKAN
    return ((const Option*)opt)->use_vulkan_compute;
#else
    (void)opt;
    return 0;
#endif
}

int ncnn_option_get_use_local_pool_allocator(const ncnn_option_t opt)
{
    return ((const Option*)opt)->use_local_pool_allocator;
}

int ncnn_option_get_use_winograd_convolution(const ncnn_option_t opt)
{
    return ((const Option*)opt)->use_winograd_convolution;
}

int ncnn_option_get_use_sgemm_convolution(const ncnn_option_t opt)
{
    return ((const Option*)opt)->use_sgemm_convolution;
}

int ncnn_option_get_use_packing_layout(const ncnn_option_t opt)
{
    return ((const Option*)opt)->use_packing_layout;
}

int ncnn_option_get_use_fp16_packed(const ncnn_option_t opt)
{
    return ((const Option*)opt)->use_fp16_packed;
}

int ncnn_option_get_use_fp16_storage(const ncnn_option_t opt)
{
    return ((const Option*)opt)->use_fp16_storage;
}

int ncnn_option_get_use_fp16_arithmetic(const ncnn_option_t opt)
{
    return ((const Option*)opt)->use_fp16_arithmetic;
}

int ncnn_option_get_use_int8_packed(const ncnn_option_t opt)
{
    return ((const Option*)opt)->use_int8_packed;
}

int ncnn_option_get_use_int8_storage(const ncnn_option_t opt)
{
    return ((const Option*)opt)->use_int8_storage;
}

int ncnn_option_get_use_int8_arithmetic(const ncnn_option_t opt)
{
    return ((const Option*)opt)->use_int8_arithmetic;
}

int ncnn_option_get_use_bf16_packed(const ncnn_option_t opt)
{
    return ((const Option*)opt)->use_bf16_packed;
}

int ncnn_option_get_use_bf16_storage(const ncnn_option_t opt)
{
    return ((const Option*)opt)->use_bf16_storage;
}

int ncnn_option_get_use_shader_local_memory(const ncnn_option_t opt)
{
#if NCNN_VULKAN
    return ((const Option*)opt)->use_shader_local_memory;
#else
    (void)opt;
    return 0;
#endif
}

int ncnn_option_get_use_cooperative_matrix(const ncnn_option_t opt)
{
#if NCNN_VULKAN
    return ((const Option*)opt)->use_cooperative_matrix;
#else
    (void)opt;
    return 0;
#endif
}

/* option flag setters: each stores enable into the like-named Option member */
/* the vulkan related setters do nothing when NCNN_VULKAN is disabled */
void ncnn_option_set_use_vulkan_compute(ncnn_option_t opt, int enable)
{
#if NCNN_VULKAN
    Option* option = (Option*)opt;
    option->use_vulkan_compute = enable;
#else
    (void)opt;
    (void)enable;
#endif
}

void ncnn_option_set_use_local_pool_allocator(ncnn_option_t opt, int enable)
{
    Option* option = (Option*)opt;
    option->use_local_pool_allocator = enable;
}

void ncnn_option_set_use_winograd_convolution(ncnn_option_t opt, int enable)
{
    Option* option = (Option*)opt;
    option->use_winograd_convolution = enable;
}

void ncnn_option_set_use_sgemm_convolution(ncnn_option_t opt, int enable)
{
    Option* option = (Option*)opt;
    option->use_sgemm_convolution = enable;
}

void ncnn_option_set_use_packing_layout(ncnn_option_t opt, int enable)
{
    Option* option = (Option*)opt;
    option->use_packing_layout = enable;
}

void ncnn_option_set_use_fp16_packed(ncnn_option_t opt, int enable)
{
    Option* option = (Option*)opt;
    option->use_fp16_packed = enable;
}

void ncnn_option_set_use_fp16_storage(ncnn_option_t opt, int enable)
{
    Option* option = (Option*)opt;
    option->use_fp16_storage = enable;
}

void ncnn_option_set_use_fp16_arithmetic(ncnn_option_t opt, int enable)
{
    Option* option = (Option*)opt;
    option->use_fp16_arithmetic = enable;
}

void ncnn_option_set_use_int8_packed(ncnn_option_t opt, int enable)
{
    Option* option = (Option*)opt;
    option->use_int8_packed = enable;
}

void ncnn_option_set_use_int8_storage(ncnn_option_t opt, int enable)
{
    Option* option = (Option*)opt;
    option->use_int8_storage = enable;
}

void ncnn_option_set_use_int8_arithmetic(ncnn_option_t opt, int enable)
{
    Option* option = (Option*)opt;
    option->use_int8_arithmetic = enable;
}

void ncnn_option_set_use_bf16_packed(ncnn_option_t opt, int enable)
{
    Option* option = (Option*)opt;
    option->use_bf16_packed = enable;
}

void ncnn_option_set_use_bf16_storage(ncnn_option_t opt, int enable)
{
    Option* option = (Option*)opt;
    option->use_bf16_storage = enable;
}

void ncnn_option_set_use_shader_local_memory(ncnn_option_t opt, int enable)
{
#if NCNN_VULKAN
    Option* option = (Option*)opt;
    option->use_shader_local_memory = enable;
#else
    (void)opt;
    (void)enable;
#endif
}

void ncnn_option_set_use_cooperative_matrix(ncnn_option_t opt, int enable)
{
#if NCNN_VULKAN
    Option* option = (Option*)opt;
    option->use_cooperative_matrix = enable;
#else
    (void)opt;
    (void)enable;
#endif
}

/* mat api */
/* create an empty mat */
ncnn_mat_t ncnn_mat_create()
{
    Mat* m = new Mat();
    return (ncnn_mat_t)(m);
}

/* create mats with element size 4; a NULL allocator handle passes NULL to Mat */
ncnn_mat_t ncnn_mat_create_1d(int w, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(w, (size_t)4u, impl));
}

ncnn_mat_t ncnn_mat_create_2d(int w, int h, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(w, h, (size_t)4u, impl));
}

ncnn_mat_t ncnn_mat_create_3d(int w, int h, int c, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(w, h, c, (size_t)4u, impl));
}

ncnn_mat_t ncnn_mat_create_4d(int w, int h, int d, int c, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(w, h, d, c, (size_t)4u, impl));
}

/* create mats over caller provided data with element size 4 */
/* a NULL allocator handle passes NULL to Mat */
ncnn_mat_t ncnn_mat_create_external_1d(int w, void* data, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(w, data, (size_t)4u, impl));
}

ncnn_mat_t ncnn_mat_create_external_2d(int w, int h, void* data, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(w, h, data, (size_t)4u, impl));
}

ncnn_mat_t ncnn_mat_create_external_3d(int w, int h, int c, void* data, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(w, h, c, data, (size_t)4u, impl));
}

ncnn_mat_t ncnn_mat_create_external_4d(int w, int h, int d, int c, void* data, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(w, h, d, c, data, (size_t)4u, impl));
}

/* create mats with explicit elemsize and elempack */
/* a NULL allocator handle passes NULL to Mat */
ncnn_mat_t ncnn_mat_create_1d_elem(int w, size_t elemsize, int elempack, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(w, elemsize, elempack, impl));
}

ncnn_mat_t ncnn_mat_create_2d_elem(int w, int h, size_t elemsize, int elempack, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(w, h, elemsize, elempack, impl));
}

ncnn_mat_t ncnn_mat_create_3d_elem(int w, int h, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(w, h, c, elemsize, elempack, impl));
}

ncnn_mat_t ncnn_mat_create_4d_elem(int w, int h, int d, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(w, h, d, c, elemsize, elempack, impl));
}

/* create mats over caller provided data with explicit elemsize and elempack */
/* a NULL allocator handle passes NULL to Mat */
ncnn_mat_t ncnn_mat_create_external_1d_elem(int w, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(w, data, elemsize, elempack, impl));
}

ncnn_mat_t ncnn_mat_create_external_2d_elem(int w, int h, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(w, h, data, elemsize, elempack, impl));
}

ncnn_mat_t ncnn_mat_create_external_3d_elem(int w, int h, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(w, h, c, data, elemsize, elempack, impl));
}

ncnn_mat_t ncnn_mat_create_external_4d_elem(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator)
{
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(w, h, d, c, data, elemsize, elempack, impl));
}

/* destroy a mat handle */
void ncnn_mat_destroy(ncnn_mat_t mat)
{
    Mat* m = (Mat*)mat;
    delete m;
}

/* fill the mat with the float value v */
void ncnn_mat_fill_float(ncnn_mat_t mat, float v)
{
    Mat* m = (Mat*)mat;
    m->fill(v);
}

/* clone / reshape: each returns a newly created mat handle holding the result */
/* a NULL allocator handle passes NULL to Mat */
ncnn_mat_t ncnn_mat_clone(const ncnn_mat_t mat, ncnn_allocator_t allocator)
{
    const Mat* src = (const Mat*)mat;
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(src->clone(impl)));
}

ncnn_mat_t ncnn_mat_reshape_1d(const ncnn_mat_t mat, int w, ncnn_allocator_t allocator)
{
    const Mat* src = (const Mat*)mat;
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(src->reshape(w, impl)));
}

ncnn_mat_t ncnn_mat_reshape_2d(const ncnn_mat_t mat, int w, int h, ncnn_allocator_t allocator)
{
    const Mat* src = (const Mat*)mat;
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(src->reshape(w, h, impl)));
}

ncnn_mat_t ncnn_mat_reshape_3d(const ncnn_mat_t mat, int w, int h, int c, ncnn_allocator_t allocator)
{
    const Mat* src = (const Mat*)mat;
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(src->reshape(w, h, c, impl)));
}

ncnn_mat_t ncnn_mat_reshape_4d(const ncnn_mat_t mat, int w, int h, int d, int c, ncnn_allocator_t allocator)
{
    const Mat* src = (const Mat*)mat;
    Allocator* impl = allocator ? (Allocator*)allocator->pthis : NULL;
    return (ncnn_mat_t)(new Mat(src->reshape(w, h, d, c, impl)));
}

int ncnn_mat_get_dims(const ncnn_mat_t mat)
{
    return ((const Mat*)mat)->dims;
}

int ncnn_mat_get_w(const ncnn_mat_t mat)
{
    return ((const Mat*)mat)->w;
}

int ncnn_mat_get_h(const ncnn_mat_t mat)
{
    return ((const Mat*)mat)->h;
}

int ncnn_mat_get_d(const ncnn_mat_t mat)
{
    return ((const Mat*)mat)->d;
}

int ncnn_mat_get_c(const ncnn_mat_t mat)
{
    return ((const Mat*)mat)->c;
}

size_t ncnn_mat_get_elemsize(const ncnn_mat_t mat)
{
    return ((const Mat*)mat)->elemsize;
}

int ncnn_mat_get_elempack(const ncnn_mat_t mat)
{
    return ((const Mat*)mat)->elempack;
}

size_t ncnn_mat_get_cstep(const ncnn_mat_t mat)
{
    return ((const Mat*)mat)->cstep;
}

void* ncnn_mat_get_data(const ncnn_mat_t mat)
{
    return ((const Mat*)mat)->data;
}

void* ncnn_mat_get_channel_data(const ncnn_mat_t mat, int c)
{
    return ((const Mat*)mat)->channel(c).data;
}

#if NCNN_PIXEL

/* mat pixel api */
ncnn_mat_t ncnn_mat_from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, ncnn_allocator_t allocator)
{
    return (ncnn_mat_t)(new Mat(Mat::from_pixels(pixels, type, w, h, stride, allocator ? (Allocator*)allocator->pthis : NULL)));
}

ncnn_mat_t ncnn_mat_from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, ncnn_allocator_t allocator)
{
    return (ncnn_mat_t)(new Mat(Mat::from_pixels_resize(pixels, type, w, h, stride, target_width, target_height, allocator ? (Allocator*)allocator->pthis : NULL)));
}

ncnn_mat_t ncnn_mat_from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, ncnn_allocator_t allocator)
{
    return (ncnn_mat_t)(new Mat(Mat::from_pixels_roi(pixels, type, w, h, stride, roix, roiy, roiw, roih, allocator ? (Allocator*)allocator->pthis : NULL)));
}

ncnn_mat_t ncnn_mat_from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, ncnn_allocator_t allocator)
{
    return (ncnn_mat_t)(new Mat(Mat::from_pixels_roi_resize(pixels, type, w, h, stride, roix, roiy, roiw, roih, target_width, target_height, allocator ? (Allocator*)allocator->pthis : NULL)));
}

void ncnn_mat_to_pixels(const ncnn_mat_t mat, unsigned char* pixels, int type, int stride)
{
    ((const Mat*)mat)->to_pixels(pixels, type, stride);
}

void ncnn_mat_to_pixels_resize(const ncnn_mat_t mat, unsigned char* pixels, int type, int target_width, int target_height, int target_stride)
{
    ((const Mat*)mat)->to_pixels_resize(pixels, type, target_width, target_height, target_stride);
}

#endif /* NCNN_PIXEL */

/* forward to Mat::substract_mean_normalize with the given mean and norm arrays */
void ncnn_mat_substract_mean_normalize(ncnn_mat_t mat, const float* mean_vals, const float* norm_vals)
{
    Mat* m = (Mat*)mat;
    m->substract_mean_normalize(mean_vals, norm_vals);
}

void ncnn_convert_packing(const ncnn_mat_t src, ncnn_mat_t* dst, int elempack, const ncnn_option_t opt)
{
    Mat _dst;
    ncnn::convert_packing(*(const Mat*)src, _dst, elempack, *(Option*)opt);
    *dst = (ncnn_mat_t)(new Mat(_dst));
}

void ncnn_flatten(const ncnn_mat_t src, ncnn_mat_t* dst, const ncnn_option_t opt)
{
    Mat _dst;
    ncnn::flatten(*(const Mat*)src, _dst, *(Option*)opt);
    *dst = (ncnn_mat_t)(new Mat(_dst));
}

/* blob api */
#if NCNN_STRING
const char* ncnn_blob_get_name(const ncnn_blob_t blob)
{
    return ((const Blob*)blob)->name.c_str();
}
#endif /* NCNN_STRING */

int ncnn_blob_get_producer(const ncnn_blob_t blob)
{
    return ((const Blob*)blob)->producer;
}

int ncnn_blob_get_consumer(const ncnn_blob_t blob)
{
    return ((const Blob*)blob)->consumer;
}

void ncnn_blob_get_shape(const ncnn_blob_t blob, int* dims, int* w, int* h, int* c)
{
    const Mat& shape = ((const Blob*)blob)->shape;
    *dims = shape.dims;
    *w = shape.w;
    *h = shape.h;
    *c = shape.c;
}

/* paramdict api */
ncnn_paramdict_t ncnn_paramdict_create()
{
    return (ncnn_paramdict_t)(new ParamDict());
}

void ncnn_paramdict_destroy(ncnn_paramdict_t pd)
{
    delete (ParamDict*)pd;
}

int ncnn_paramdict_get_type(const ncnn_paramdict_t pd, int id)
{
    return ((const ParamDict*)pd)->type(id);
}

int ncnn_paramdict_get_int(const ncnn_paramdict_t pd, int id, int def)
{
    return ((const ParamDict*)pd)->get(id, def);
}

float ncnn_paramdict_get_float(const ncnn_paramdict_t pd, int id, float def)
{
    return ((const ParamDict*)pd)->get(id, def);
}

ncnn_mat_t ncnn_paramdict_get_array(ncnn_paramdict_t pd, int id, const ncnn_mat_t def)
{
    return (ncnn_mat_t)(new Mat(((const ParamDict*)pd)->get(id, *(const Mat*)def)));
}

void ncnn_paramdict_set_int(ncnn_paramdict_t pd, int id, int i)
{
    return ((ParamDict*)pd)->set(id, i);
}

void ncnn_paramdict_set_float(ncnn_paramdict_t pd, int id, float f)
{
    return ((ParamDict*)pd)->set(id, f);
}

void ncnn_paramdict_set_array(ncnn_paramdict_t pd, int id, ncnn_mat_t v)
{
    return ((ParamDict*)pd)->set(id, *(const Mat*)v);
}

/* datareader api */
/* DataReader whose virtual entry points are routed through the function */
/* table of the owning C handle, so that C code may replace them */
class DataReader_c_api : public ncnn::DataReader
{
public:
    DataReader_c_api(ncnn_datareader_t _dr)
        : ncnn::DataReader(), dr(_dr)
    {
    }

#if NCNN_STRING
    /* dispatch to the scan entry of the C handle */
    virtual int scan(const char* format, void* p) const
    {
        const int ret = dr->scan(dr, format, p);
        return ret;
    }
#endif /* NCNN_STRING */

    /* dispatch to the read entry of the C handle */
    virtual size_t read(void* buf, size_t size) const
    {
        const size_t nread = dr->read(dr, buf, size);
        return nread;
    }

public:
    /* the C handle that owns this object */
    ncnn_datareader_t dr;
};

#if NCNN_STRING
/* default scan entry: calls the DataReader implementation non-virtually */
static int __ncnn_DataReader_scan(ncnn_datareader_t dr, const char* format, void* p)
{
    ncnn::DataReader* self = (ncnn::DataReader*)dr->pthis;
    return self->ncnn::DataReader::scan(format, p);
}
#endif /* NCNN_STRING */

/* default read entry: calls the DataReader implementation non-virtually */
static size_t __ncnn_DataReader_read(ncnn_datareader_t dr, void* buf, size_t size)
{
    ncnn::DataReader* self = (ncnn::DataReader*)dr->pthis;
    return self->ncnn::DataReader::read(buf, size);
}

#if NCNN_STDIO
/* DataReaderFromStdio whose virtual entry points are routed through the function */
/* table of the owning C handle, so that C code may replace them */
class DataReaderFromStdio_c_api : public ncnn::DataReaderFromStdio
{
public:
    DataReaderFromStdio_c_api(FILE* fp, ncnn_datareader_t _dr)
        : ncnn::DataReaderFromStdio(fp), dr(_dr)
    {
    }

#if NCNN_STRING
    /* dispatch to the scan entry of the C handle */
    virtual int scan(const char* format, void* p) const
    {
        const int ret = dr->scan(dr, format, p);
        return ret;
    }
#endif /* NCNN_STRING */

    /* dispatch to the read entry of the C handle */
    virtual size_t read(void* buf, size_t size) const
    {
        const size_t nread = dr->read(dr, buf, size);
        return nread;
    }

public:
    /* the C handle that owns this object */
    ncnn_datareader_t dr;
};

#if NCNN_STRING
/* default scan entry: calls the DataReaderFromStdio implementation non-virtually */
static int __ncnn_DataReaderFromStdio_scan(ncnn_datareader_t dr, const char* format, void* p)
{
    ncnn::DataReaderFromStdio* self = (ncnn::DataReaderFromStdio*)dr->pthis;
    return self->ncnn::DataReaderFromStdio::scan(format, p);
}
#endif /* NCNN_STRING */

/* default read entry: calls the DataReaderFromStdio implementation non-virtually */
static size_t __ncnn_DataReaderFromStdio_read(ncnn_datareader_t dr, void* buf, size_t size)
{
    ncnn::DataReaderFromStdio* self = (ncnn::DataReaderFromStdio*)dr->pthis;
    return self->ncnn::DataReaderFromStdio::read(buf, size);
}
#endif /* NCNN_STDIO */

/* DataReaderFromMemory whose virtual entry points are routed through the function */
/* table of the owning C handle, so that C code may replace them */
class DataReaderFromMemory_c_api : public ncnn::DataReaderFromMemory
{
public:
    /* mem is taken by reference and handed to the base class unchanged */
    DataReaderFromMemory_c_api(const unsigned char*& mem, ncnn_datareader_t _dr)
        : ncnn::DataReaderFromMemory(mem), dr(_dr)
    {
    }

#if NCNN_STRING
    /* dispatch to the scan entry of the C handle */
    virtual int scan(const char* format, void* p) const
    {
        const int ret = dr->scan(dr, format, p);
        return ret;
    }
#endif /* NCNN_STRING */

    /* dispatch to the read entry of the C handle */
    virtual size_t read(void* buf, size_t size) const
    {
        const size_t nread = dr->read(dr, buf, size);
        return nread;
    }

public:
    /* the C handle that owns this object */
    ncnn_datareader_t dr;
};

#if NCNN_STRING
/* default scan entry: calls the DataReaderFromMemory implementation non-virtually */
static int __ncnn_DataReaderFromMemory_scan(ncnn_datareader_t dr, const char* format, void* p)
{
    ncnn::DataReaderFromMemory* self = (ncnn::DataReaderFromMemory*)dr->pthis;
    return self->ncnn::DataReaderFromMemory::scan(format, p);
}
#endif /* NCNN_STRING */

/* default read entry: calls the DataReaderFromMemory implementation non-virtually */
static size_t __ncnn_DataReaderFromMemory_read(ncnn_datareader_t dr, void* buf, size_t size)
{
    ncnn::DataReaderFromMemory* self = (ncnn::DataReaderFromMemory*)dr->pthis;
    return self->ncnn::DataReaderFromMemory::read(buf, size);
}

/* create a C datareader handle backed by a plain DataReader */
/* returns NULL when the handle itself cannot be allocated */
/* release the result with ncnn_datareader_destroy */
ncnn_datareader_t ncnn_datareader_create()
{
    ncnn_datareader_t dr = (ncnn_datareader_t)malloc(sizeof(struct __ncnn_datareader_t));
    if (!dr)
        return NULL;

    dr->pthis = (void*)(new DataReader_c_api(dr));
#if NCNN_STRING
    dr->scan = __ncnn_DataReader_scan;
#endif /* NCNN_STRING */
    dr->read = __ncnn_DataReader_read;
    return dr;
}

#if NCNN_STDIO
/* Allocate a data reader handle that reads from the given FILE* and install
 * the stdio default callbacks. */
ncnn_datareader_t ncnn_datareader_create_from_stdio(FILE* fp)
{
    ncnn_datareader_t handle = (ncnn_datareader_t)malloc(sizeof(*handle));
    DataReaderFromStdio_c_api* impl = new DataReaderFromStdio_c_api(fp, handle);
    handle->pthis = (void*)impl;
    handle->read = __ncnn_DataReaderFromStdio_read;
#if NCNN_STRING
    handle->scan = __ncnn_DataReaderFromStdio_scan;
#endif /* NCNN_STRING */
    return handle;
}
#endif /* NCNN_STDIO */

/* Allocate a data reader handle that reads through *mem and install the
 * memory default callbacks. The reader is bound to *mem by reference. */
ncnn_datareader_t ncnn_datareader_create_from_memory(const unsigned char** mem)
{
    ncnn_datareader_t handle = (ncnn_datareader_t)malloc(sizeof(*handle));
    DataReaderFromMemory_c_api* impl = new DataReaderFromMemory_c_api(*mem, handle);
    handle->pthis = (void*)impl;
    handle->read = __ncnn_DataReaderFromMemory_read;
#if NCNN_STRING
    handle->scan = __ncnn_DataReaderFromMemory_scan;
#endif /* NCNN_STRING */
    return handle;
}

/* Destroy the C++ reader held in pthis, then release the handle itself. */
void ncnn_datareader_destroy(ncnn_datareader_t dr)
{
    DataReader* impl = (DataReader*)dr->pthis;
    delete impl;
    free(dr);
}

/* modelbin api */
class ModelBinFromDataReader_c_api : public ncnn::ModelBinFromDataReader
{
public:
    ModelBinFromDataReader_c_api(ncnn_modelbin_t _mb, const DataReader& dr)
        : ncnn::ModelBinFromDataReader(dr)
    {
        mb = _mb;
    }

    virtual Mat load(int w, int type) const
    {
        ncnn_mat_t m = mb->load_1d(mb, w, type);
        Mat m2 = *(Mat*)m;
        ncnn_mat_destroy(m);
        return m2;
    }

    virtual Mat load(int w, int h, int type) const
    {
        ncnn_mat_t m = mb->load_2d(mb, w, h, type);
        Mat m2 = *(Mat*)m;
        ncnn_mat_destroy(m);
        return m2;
    }

    virtual Mat load(int w, int h, int c, int type) const
    {
        ncnn_mat_t m = mb->load_3d(mb, w, h, c, type);
        Mat m2 = *(Mat*)m;
        ncnn_mat_destroy(m);
        return m2;
    }

public:
    ncnn_modelbin_t mb;
};

/* Default load_1d callback: calls ModelBinFromDataReader::load explicitly and returns a newly allocated Mat handle. */
static ncnn_mat_t __ncnn_ModelBinFromDataReader_load_1d(const ncnn_modelbin_t mb, int w, int type)
{
    const ncnn::ModelBinFromDataReader* impl = (const ncnn::ModelBinFromDataReader*)mb->pthis;
    Mat* loaded = new Mat(impl->ncnn::ModelBinFromDataReader::load(w, type));
    return (ncnn_mat_t)loaded;
}

/* Default load_2d callback: calls ModelBin::load(w, h, type) explicitly and returns a newly allocated Mat handle. */
static ncnn_mat_t __ncnn_ModelBinFromDataReader_load_2d(const ncnn_modelbin_t mb, int w, int h, int type)
{
    const ncnn::ModelBinFromDataReader* impl = (const ncnn::ModelBinFromDataReader*)mb->pthis;
    Mat* loaded = new Mat(impl->ncnn::ModelBin::load(w, h, type));
    return (ncnn_mat_t)loaded;
}

/* Default load_3d callback: calls ModelBin::load(w, h, c, type) explicitly and returns a newly allocated Mat handle. */
static ncnn_mat_t __ncnn_ModelBinFromDataReader_load_3d(const ncnn_modelbin_t mb, int w, int h, int c, int type)
{
    const ncnn::ModelBinFromDataReader* impl = (const ncnn::ModelBinFromDataReader*)mb->pthis;
    Mat* loaded = new Mat(impl->ncnn::ModelBin::load(w, h, c, type));
    return (ncnn_mat_t)loaded;
}

class ModelBinFromMatArray_c_api : public ncnn::ModelBinFromMatArray
{
public:
    ModelBinFromMatArray_c_api(ncnn_modelbin_t _mb, const Mat* weights)
        : ncnn::ModelBinFromMatArray(weights)
    {
        mb = _mb;
    }

    virtual Mat load(int w, int type) const
    {
        ncnn_mat_t m = mb->load_1d(mb, w, type);
        Mat m2 = *(Mat*)m;
        ncnn_mat_destroy(m);
        return m2;
    }

    virtual Mat load(int w, int h, int type) const
    {
        ncnn_mat_t m = mb->load_2d(mb, w, h, type);
        Mat m2 = *(Mat*)m;
        ncnn_mat_destroy(m);
        return m2;
    }

    virtual Mat load(int w, int h, int c, int type) const
    {
        ncnn_mat_t m = mb->load_3d(mb, w, h, c, type);
        Mat m2 = *(Mat*)m;
        ncnn_mat_destroy(m);
        return m2;
    }

public:
    ncnn_modelbin_t mb;
};

/* Default load_1d callback: calls ModelBinFromMatArray::load explicitly and returns a newly allocated Mat handle. */
static ncnn_mat_t __ncnn_ModelBinFromMatArray_load_1d(const ncnn_modelbin_t mb, int w, int type)
{
    const ncnn::ModelBinFromMatArray* impl = (const ncnn::ModelBinFromMatArray*)mb->pthis;
    Mat* loaded = new Mat(impl->ncnn::ModelBinFromMatArray::load(w, type));
    return (ncnn_mat_t)loaded;
}

/* Default load_2d callback: calls ModelBin::load(w, h, type) explicitly and returns a newly allocated Mat handle. */
static ncnn_mat_t __ncnn_ModelBinFromMatArray_load_2d(const ncnn_modelbin_t mb, int w, int h, int type)
{
    const ncnn::ModelBinFromMatArray* impl = (const ncnn::ModelBinFromMatArray*)mb->pthis;
    Mat* loaded = new Mat(impl->ncnn::ModelBin::load(w, h, type));
    return (ncnn_mat_t)loaded;
}

/* Default load_3d callback: calls ModelBin::load(w, h, c, type) explicitly and returns a newly allocated Mat handle. */
static ncnn_mat_t __ncnn_ModelBinFromMatArray_load_3d(const ncnn_modelbin_t mb, int w, int h, int c, int type)
{
    const ncnn::ModelBinFromMatArray* impl = (const ncnn::ModelBinFromMatArray*)mb->pthis;
    Mat* loaded = new Mat(impl->ncnn::ModelBin::load(w, h, c, type));
    return (ncnn_mat_t)loaded;
}

/* Allocate a model bin handle that loads weights through the C++ DataReader
 * held by dr, and install the default load callbacks. */
ncnn_modelbin_t ncnn_modelbin_create_from_datareader(const ncnn_datareader_t dr)
{
    const DataReader* reader = (const DataReader*)dr->pthis;

    ncnn_modelbin_t handle = (ncnn_modelbin_t)malloc(sizeof(*handle));
    handle->pthis = (void*)(new ModelBinFromDataReader_c_api(handle, *reader));
    handle->load_1d = __ncnn_ModelBinFromDataReader_load_1d;
    handle->load_2d = __ncnn_ModelBinFromDataReader_load_2d;
    handle->load_3d = __ncnn_ModelBinFromDataReader_load_3d;
    return handle;
}

ncnn_modelbin_t ncnn_modelbin_create_from_mat_array(const ncnn_mat_t* weights, int n)
{
    std::vector<Mat> matarray(n);
    for (int i = 0; i < n; i++)
    {
        matarray[i] = *(const Mat*)weights[i];
    }
    ncnn_modelbin_t mb = (ncnn_modelbin_t)malloc(sizeof(struct __ncnn_modelbin_t));
    mb->pthis = (void*)(new ModelBinFromMatArray_c_api(mb, n ? &matarray[0] : NULL));
    mb->load_1d = __ncnn_ModelBinFromMatArray_load_1d;
    mb->load_2d = __ncnn_ModelBinFromMatArray_load_2d;
    mb->load_3d = __ncnn_ModelBinFromMatArray_load_3d;
    return mb;
}

/* Destroy the C++ ModelBin held in pthis, then release the handle itself. */
void ncnn_modelbin_destroy(ncnn_modelbin_t mb)
{
    ModelBin* impl = (ModelBin*)mb->pthis;
    delete impl;
    free(mb);
}

/* load_1d callback for borrowed ModelBin objects: virtual load(w, type), result returned as a new Mat handle. */
static ncnn_mat_t __ncnn_modelbin_load_1d(const ncnn_modelbin_t mb, int w, int type)
{
    const ncnn::ModelBin* impl = (const ncnn::ModelBin*)mb->pthis;
    Mat* loaded = new Mat(impl->load(w, type));
    return (ncnn_mat_t)loaded;
}

/* load_2d callback for borrowed ModelBin objects: virtual load(w, h, type), result returned as a new Mat handle. */
static ncnn_mat_t __ncnn_modelbin_load_2d(const ncnn_modelbin_t mb, int w, int h, int type)
{
    const ncnn::ModelBin* impl = (const ncnn::ModelBin*)mb->pthis;
    Mat* loaded = new Mat(impl->load(w, h, type));
    return (ncnn_mat_t)loaded;
}

/* load_3d callback for borrowed ModelBin objects: virtual load(w, h, c, type), result returned as a new Mat handle. */
static ncnn_mat_t __ncnn_modelbin_load_3d(const ncnn_modelbin_t mb, int w, int h, int c, int type)
{
    const ncnn::ModelBin* impl = (const ncnn::ModelBin*)mb->pthis;
    Mat* loaded = new Mat(impl->load(w, h, c, type));
    return (ncnn_mat_t)loaded;
}

/* layer api */
/* Layer subclass that routes every virtual call through the function pointers
 * stored in a C ncnn_layer_t handle, so user code can override layer behavior
 * by replacing those pointers. */
class Layer_c_api : public Layer
{
public:
    Layer_c_api(ncnn_layer_t _layer)
        : Layer()
    {
        layer = _layer;
    }

    virtual int load_param(const ParamDict& pd)
    {
        return layer->load_param(layer, (ncnn_paramdict_t)&pd);
    }

    virtual int load_model(const ModelBin& mb)
    {
        /* wrap the borrowed ModelBin in a temporary stack handle for the callback */
        struct __ncnn_modelbin_t mb0;
        mb0.pthis = (void*)&mb;
        mb0.load_1d = __ncnn_modelbin_load_1d;
        mb0.load_2d = __ncnn_modelbin_load_2d;
        mb0.load_3d = __ncnn_modelbin_load_3d;
        return layer->load_model(layer, &mb0);
    }

    virtual int create_pipeline(const Option& opt)
    {
        return layer->create_pipeline(layer, (ncnn_option_t)&opt);
    }

    virtual int destroy_pipeline(const Option& opt)
    {
        return layer->destroy_pipeline(layer, (ncnn_option_t)&opt);
    }

    /* Multi-blob forward. The callback fills one newly allocated Mat handle
     * per top blob; each handle is copied into top_blobs and destroyed.
     * A handle left null by the callback is skipped (its top blob is left
     * untouched) and turns a zero return code into -1 instead of being
     * dereferenced. */
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
    {
        const int n = (int)bottom_blobs.size();
        const int n2 = (int)top_blobs.size();
        std::vector<ncnn_mat_t> bottom_blobs0(n);
        for (int i = 0; i < n; i++)
        {
            bottom_blobs0[i] = (ncnn_mat_t)&bottom_blobs[i];
        }
        std::vector<ncnn_mat_t> top_blobs0(n2, (ncnn_mat_t)0);
        /* never form &v[0] on an empty vector */
        int ret = layer->forward_n(layer, n > 0 ? &bottom_blobs0[0] : 0, n, n2 > 0 ? &top_blobs0[0] : 0, n2, (ncnn_option_t)&opt);
        for (int i = 0; i < n2; i++)
        {
            if (!top_blobs0[i])
            {
                if (ret == 0)
                    ret = -1;
                continue;
            }
            top_blobs[i] = *(Mat*)top_blobs0[i];
            ncnn_mat_destroy(top_blobs0[i]);
        }
        return ret;
    }

    /* Single-blob forward. If the callback leaves the output handle null,
     * top_blob is left untouched and a non-zero code is returned (the
     * callback's own code, or -1 when it reported success). */
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
    {
        ncnn_mat_t top_blob0 = 0;
        int ret = layer->forward_1(layer, (ncnn_mat_t)&bottom_blob, &top_blob0, (ncnn_option_t)&opt);
        if (!top_blob0)
        {
            return ret != 0 ? ret : -1;
        }
        top_blob = *(Mat*)top_blob0;
        ncnn_mat_destroy(top_blob0);
        return ret;
    }

    /* In-place multi-blob forward: the callback receives pointers to the
     * caller's Mats, so changes are made directly on them. */
    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const
    {
        const int n = (int)bottom_top_blobs.size();
        std::vector<ncnn_mat_t> bottom_top_blobs0(n);
        for (int i = 0; i < n; i++)
        {
            bottom_top_blobs0[i] = (ncnn_mat_t)&bottom_top_blobs[i];
        }
        /* never form &v[0] on an empty vector */
        return layer->forward_inplace_n(layer, n > 0 ? &bottom_top_blobs0[0] : 0, n, (ncnn_option_t)&opt);
    }

    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const
    {
        return layer->forward_inplace_1(layer, (ncnn_mat_t)&bottom_top_blob, (ncnn_option_t)&opt);
    }

public:
    /* C handle that owns this object via its pthis field */
    ncnn_layer_t layer;
};

/* Default load_param callback for ncnn_layer_create(): calls the base Layer implementation explicitly. */
static int __ncnn_Layer_load_param(ncnn_layer_t layer, const ncnn_paramdict_t pd)
{
    Layer* impl = (Layer*)layer->pthis;
    const ParamDict* params = (const ParamDict*)pd;
    return impl->Layer::load_param(*params);
}

/* Default load_model callback for ncnn_layer_create(): calls the base Layer
 * implementation explicitly.
 *
 * mb is a C handle struct; the C++ ModelBin lives in mb->pthis (see how the
 * handle is filled in Layer_c_api::load_model and the ncnn_modelbin_create_*
 * functions). Casting the handle itself to ModelBin, as was done before,
 * produced a reference to the wrong object. */
static int __ncnn_Layer_load_model(ncnn_layer_t layer, const ncnn_modelbin_t mb)
{
    return ((Layer*)layer->pthis)->Layer::load_model(*(const ModelBin*)mb->pthis);
}

/* Default create_pipeline callback for ncnn_layer_create(): calls the base Layer implementation explicitly. */
static int __ncnn_Layer_create_pipeline(ncnn_layer_t layer, const ncnn_option_t opt)
{
    Layer* impl = (Layer*)layer->pthis;
    const Option* options = (const Option*)opt;
    return impl->Layer::create_pipeline(*options);
}

/* Default destroy_pipeline callback for ncnn_layer_create(): calls the base Layer implementation explicitly. */
static int __ncnn_Layer_destroy_pipeline(ncnn_layer_t layer, const ncnn_option_t opt)
{
    Layer* impl = (Layer*)layer->pthis;
    const Option* options = (const Option*)opt;
    return impl->Layer::destroy_pipeline(*options);
}

static int __ncnn_Layer_forward_1(const ncnn_layer_t layer, const ncnn_mat_t bottom_blob, ncnn_mat_t* top_blob, const ncnn_option_t opt)
{
    Mat _top_blob;
    int ret = ((const Layer*)layer->pthis)->Layer::forward(*(const Mat*)bottom_blob, _top_blob, *(const Option*)opt);
    *top_blob = (ncnn_mat_t)(new Mat(_top_blob));
    return ret;
}

static int __ncnn_Layer_forward_n(const ncnn_layer_t layer, const ncnn_mat_t* bottom_blobs, int n, ncnn_mat_t* top_blobs, int n2, const ncnn_option_t opt)
{
    std::vector<Mat> _bottom_blobs(n);
    std::vector<Mat> _top_blobs(n2);
    for (int i = 0; i < n; i++)
    {
        _bottom_blobs[i] = *(Mat*)bottom_blobs[i];
    }
    int ret = ((const Layer*)layer->pthis)->Layer::forward(_bottom_blobs, _top_blobs, *(const Option*)opt);
    for (int i = 0; i < n2; i++)
    {
        top_blobs[i] = (ncnn_mat_t)(new Mat(_top_blobs[i]));
    }
    return ret;
}

static int __ncnn_Layer_forward_inplace_1(const ncnn_layer_t layer, ncnn_mat_t bottom_top_blob, const ncnn_option_t opt)
{
    return ((const Layer*)layer->pthis)->Layer::forward_inplace(*(Mat*)bottom_top_blob, *(const Option*)opt);
}

/* Default forward_inplace_n callback for ncnn_layer_create(): runs the base
 * Layer multi-blob in-place forward.
 *
 * The call operates on a local vector of Mat copies, so afterwards each
 * element is assigned back to the caller's Mat. Without the write-back, a Mat
 * that the forward call replaced (rather than modified through shared data)
 * would never reach the caller, unlike the single-blob variant, which works
 * directly on the caller's Mat. */
static int __ncnn_Layer_forward_inplace_n(const ncnn_layer_t layer, ncnn_mat_t* bottom_top_blobs, int n, const ncnn_option_t opt)
{
    std::vector<Mat> _bottom_top_blobs(n);
    for (int i = 0; i < n; i++)
    {
        _bottom_top_blobs[i] = *(Mat*)bottom_top_blobs[i];
    }
    int ret = ((const Layer*)layer->pthis)->Layer::forward_inplace(_bottom_top_blobs, *(const Option*)opt);
    for (int i = 0; i < n; i++)
    {
        *(Mat*)bottom_top_blobs[i] = _bottom_top_blobs[i];
    }
    return ret;
}

/* load_param callback for built-in layers: virtual dispatch to the wrapped Layer. */
static int __ncnn_layer_load_param(ncnn_layer_t layer, const ncnn_paramdict_t pd)
{
    Layer* impl = (Layer*)layer->pthis;
    const ParamDict* params = (const ParamDict*)pd;
    return impl->load_param(*params);
}

/* load_model callback for built-in layers: virtual dispatch to the wrapped
 * Layer.
 *
 * mb is a C handle struct; the C++ ModelBin lives in mb->pthis (see how the
 * handle is filled in Layer_c_api::load_model and the ncnn_modelbin_create_*
 * functions). Casting the handle itself to ModelBin, as was done before,
 * handed the layer an object that is not a ModelBin. */
static int __ncnn_layer_load_model(ncnn_layer_t layer, const ncnn_modelbin_t mb)
{
    return ((Layer*)layer->pthis)->load_model(*(const ModelBin*)mb->pthis);
}

/* create_pipeline callback for built-in layers: virtual dispatch to the wrapped Layer. */
static int __ncnn_layer_create_pipeline(ncnn_layer_t layer, const ncnn_option_t opt)
{
    Layer* impl = (Layer*)layer->pthis;
    const Option* options = (const Option*)opt;
    return impl->create_pipeline(*options);
}

/* destroy_pipeline callback for built-in layers: virtual dispatch to the wrapped Layer. */
static int __ncnn_layer_destroy_pipeline(ncnn_layer_t layer, const ncnn_option_t opt)
{
    Layer* impl = (Layer*)layer->pthis;
    const Option* options = (const Option*)opt;
    return impl->destroy_pipeline(*options);
}

static int __ncnn_layer_forward_1(const ncnn_layer_t layer, const ncnn_mat_t bottom_blob, ncnn_mat_t* top_blob, const ncnn_option_t opt)
{
    Mat _top_blob;
    int ret = ((const Layer*)layer->pthis)->forward(*(const Mat*)bottom_blob, _top_blob, *(const Option*)opt);
    *top_blob = (ncnn_mat_t)(new Mat(_top_blob));
    return ret;
}

static int __ncnn_layer_forward_n(const ncnn_layer_t layer, const ncnn_mat_t* bottom_blobs, int n, ncnn_mat_t* top_blobs, int n2, const ncnn_option_t opt)
{
    std::vector<Mat> _bottom_blobs(n);
    std::vector<Mat> _top_blobs(n2);
    for (int i = 0; i < n; i++)
    {
        _bottom_blobs[i] = *(Mat*)bottom_blobs[i];
    }
    int ret = ((const Layer*)layer->pthis)->forward(_bottom_blobs, _top_blobs, *(const Option*)opt);
    for (int i = 0; i < n2; i++)
    {
        top_blobs[i] = (ncnn_mat_t)(new Mat(_top_blobs[i]));
    }
    return ret;
}

static int __ncnn_layer_forward_inplace_1(const ncnn_layer_t layer, ncnn_mat_t bottom_top_blob, const ncnn_option_t opt)
{
    return ((const Layer*)layer->pthis)->forward_inplace(*(Mat*)bottom_top_blob, *(const Option*)opt);
}

/* forward_inplace_n callback for built-in layers: runs the wrapped Layer's
 * virtual multi-blob in-place forward.
 *
 * The call operates on a local vector of Mat copies, so afterwards each
 * element is assigned back to the caller's Mat. Without the write-back, a Mat
 * that the layer replaced (rather than modified through shared data) would
 * never reach the caller, unlike the single-blob variant, which works
 * directly on the caller's Mat. */
static int __ncnn_layer_forward_inplace_n(const ncnn_layer_t layer, ncnn_mat_t* bottom_top_blobs, int n, const ncnn_option_t opt)
{
    std::vector<Mat> _bottom_top_blobs(n);
    for (int i = 0; i < n; i++)
    {
        _bottom_top_blobs[i] = *(Mat*)bottom_top_blobs[i];
    }
    int ret = ((const Layer*)layer->pthis)->forward_inplace(_bottom_top_blobs, *(const Option*)opt);
    for (int i = 0; i < n; i++)
    {
        *(Mat*)bottom_top_blobs[i] = _bottom_top_blobs[i];
    }
    return ret;
}

/* Allocate a layer handle backed by a Layer_c_api object and install the
 * default callbacks, which call the base Layer implementations. */
ncnn_layer_t ncnn_layer_create()
{
    ncnn_layer_t handle = (ncnn_layer_t)malloc(sizeof(*handle));
    handle->pthis = (void*)(new Layer_c_api(handle));

    /* setup */
    handle->load_param = __ncnn_Layer_load_param;
    handle->load_model = __ncnn_Layer_load_model;
    handle->create_pipeline = __ncnn_Layer_create_pipeline;
    handle->destroy_pipeline = __ncnn_Layer_destroy_pipeline;

    /* inference */
    handle->forward_1 = __ncnn_Layer_forward_1;
    handle->forward_n = __ncnn_Layer_forward_n;
    handle->forward_inplace_1 = __ncnn_Layer_forward_inplace_1;
    handle->forward_inplace_n = __ncnn_Layer_forward_inplace_n;
    return handle;
}

/* Create a built-in layer by type index and wrap it in a C handle whose
 * callbacks dispatch virtually to it. Returns 0 when ncnn::create_layer
 * yields no layer. */
ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex)
{
    void* impl = (void*)(ncnn::create_layer(typeindex));
    if (impl == 0)
        return 0;

    ncnn_layer_t handle = (ncnn_layer_t)malloc(sizeof(*handle));
    handle->pthis = impl;

    /* setup */
    handle->load_param = __ncnn_layer_load_param;
    handle->load_model = __ncnn_layer_load_model;
    handle->create_pipeline = __ncnn_layer_create_pipeline;
    handle->destroy_pipeline = __ncnn_layer_destroy_pipeline;

    /* inference */
    handle->forward_1 = __ncnn_layer_forward_1;
    handle->forward_n = __ncnn_layer_forward_n;
    handle->forward_inplace_1 = __ncnn_layer_forward_inplace_1;
    handle->forward_inplace_n = __ncnn_layer_forward_inplace_n;
    return handle;
}

#if NCNN_STRING
/* Create a built-in layer by type name and wrap it in a C handle whose
 * callbacks dispatch virtually to it. Returns 0 when ncnn::create_layer
 * yields no layer. */
ncnn_layer_t ncnn_layer_create_by_type(const char* type)
{
    void* impl = (void*)(ncnn::create_layer(type));
    if (impl == 0)
        return 0;

    ncnn_layer_t handle = (ncnn_layer_t)malloc(sizeof(*handle));
    handle->pthis = impl;

    /* setup */
    handle->load_param = __ncnn_layer_load_param;
    handle->load_model = __ncnn_layer_load_model;
    handle->create_pipeline = __ncnn_layer_create_pipeline;
    handle->destroy_pipeline = __ncnn_layer_destroy_pipeline;

    /* inference */
    handle->forward_1 = __ncnn_layer_forward_1;
    handle->forward_n = __ncnn_layer_forward_n;
    handle->forward_inplace_1 = __ncnn_layer_forward_inplace_1;
    handle->forward_inplace_n = __ncnn_layer_forward_inplace_n;
    return handle;
}

/* Look up the type index for a layer type name via ncnn::layer_to_index. */
int ncnn_layer_type_to_index(const char* type)
{
    const int index = ncnn::layer_to_index(type);
    return index;
}
#endif /* NCNN_STRING */

/* Destroy the C++ Layer held in pthis, then release the handle itself. */
void ncnn_layer_destroy(ncnn_layer_t layer)
{
    Layer* impl = (Layer*)layer->pthis;
    delete impl;
    free(layer);
}

#if NCNN_STRING
const char* ncnn_layer_get_name(const ncnn_layer_t layer)
{
    return ((const Layer*)layer->pthis)->name.c_str();
}
#endif /* NCNN_STRING */

int ncnn_layer_get_typeindex(const ncnn_layer_t layer)
{
    return ((const Layer*)layer->pthis)->typeindex;
}

#if NCNN_STRING
const char* ncnn_layer_get_type(const ncnn_layer_t layer)
{
    return ((const Layer*)layer->pthis)->type.c_str();
}
#endif /* NCNN_STRING */

int ncnn_layer_get_one_blob_only(const ncnn_layer_t layer)
{
    return ((const Layer*)layer->pthis)->one_blob_only;
}

int ncnn_layer_get_support_inplace(const ncnn_layer_t layer)
{
    return ((const Layer*)layer->pthis)->support_inplace;
}

/* Return the layer's support_vulkan flag; always 0 when built without NCNN_VULKAN. */
int ncnn_layer_get_support_vulkan(const ncnn_layer_t layer)
{
#if NCNN_VULKAN
    const Layer* impl = (const Layer*)layer->pthis;
    return impl->support_vulkan;
#else
    (void)layer;
    return 0;
#endif
}

int ncnn_layer_get_support_packing(const ncnn_layer_t layer)
{
    return ((const Layer*)layer->pthis)->support_packing;
}

int ncnn_layer_get_support_bf16_storage(const ncnn_layer_t layer)
{
    return ((const Layer*)layer->pthis)->support_bf16_storage;
}

int ncnn_layer_get_support_fp16_storage(const ncnn_layer_t layer)
{
    return ((const Layer*)layer->pthis)->support_fp16_storage;
}

/* Return the layer's support_vulkan_packing flag; always 0 when built without NCNN_VULKAN. */
int ncnn_layer_get_support_vulkan_packing(const ncnn_layer_t layer)
{
#if NCNN_VULKAN
    const Layer* impl = (const Layer*)layer->pthis;
    return impl->support_vulkan_packing;
#else
    (void)layer;
    return 0;
#endif
}

int ncnn_layer_get_support_any_packing(const ncnn_layer_t layer)
{
    return ((const Layer*)layer->pthis)->support_any_packing;
}

/* Return the layer's support_vulkan_any_packing flag; always 0 when built without NCNN_VULKAN. */
int ncnn_layer_get_support_vulkan_any_packing(const ncnn_layer_t layer)
{
#if NCNN_VULKAN
    const Layer* impl = (const Layer*)layer->pthis;
    return impl->support_vulkan_any_packing;
#else
    (void)layer;
    return 0;
#endif
}

/* Set the layer's one_blob_only flag from enable. */
void ncnn_layer_set_one_blob_only(ncnn_layer_t layer, int enable)
{
    Layer* impl = (Layer*)layer->pthis;
    impl->one_blob_only = enable;
}

/* Set the layer's support_inplace flag from enable. */
void ncnn_layer_set_support_inplace(ncnn_layer_t layer, int enable)
{
    Layer* impl = (Layer*)layer->pthis;
    impl->support_inplace = enable;
}

/* Set the layer's support_vulkan flag from enable; a no-op when built without NCNN_VULKAN. */
void ncnn_layer_set_support_vulkan(ncnn_layer_t layer, int enable)
{
#if NCNN_VULKAN
    Layer* impl = (Layer*)layer->pthis;
    impl->support_vulkan = enable;
#else
    (void)enable;
    (void)layer;
#endif
}

/* Set the layer's support_packing flag from enable. */
void ncnn_layer_set_support_packing(ncnn_layer_t layer, int enable)
{
    Layer* impl = (Layer*)layer->pthis;
    impl->support_packing = enable;
}

/* Set the layer's support_bf16_storage flag from enable. */
void ncnn_layer_set_support_bf16_storage(ncnn_layer_t layer, int enable)
{
    Layer* impl = (Layer*)layer->pthis;
    impl->support_bf16_storage = enable;
}

/* Set the layer's support_fp16_storage flag from enable. */
void ncnn_layer_set_support_fp16_storage(ncnn_layer_t layer, int enable)
{
    Layer* impl = (Layer*)layer->pthis;
    impl->support_fp16_storage = enable;
}

/* Set the layer's support_vulkan_packing flag from enable; a no-op when built without NCNN_VULKAN. */
void ncnn_layer_set_support_vulkan_packing(ncnn_layer_t layer, int enable)
{
#if NCNN_VULKAN
    Layer* impl = (Layer*)layer->pthis;
    impl->support_vulkan_packing = enable;
#else
    (void)enable;
    (void)layer;
#endif
}

/* Set the layer's support_any_packing flag from enable. */
void ncnn_layer_set_support_any_packing(ncnn_layer_t layer, int enable)
{
    Layer* impl = (Layer*)layer->pthis;
    impl->support_any_packing = enable;
}

/* Set the layer's support_vulkan_any_packing flag from enable; a no-op when built without NCNN_VULKAN. */
void ncnn_layer_set_support_vulkan_any_packing(ncnn_layer_t layer, int enable)
{
#if NCNN_VULKAN
    Layer* impl = (Layer*)layer->pthis;
    impl->support_vulkan_any_packing = enable;
#else
    (void)enable;
    (void)layer;
#endif
}

int ncnn_layer_get_bottom_count(const ncnn_layer_t layer)
{
    return (int)((const Layer*)layer->pthis)->bottoms.size();
}

int ncnn_layer_get_bottom(const ncnn_layer_t layer, int i)
{
    return ((const Layer*)layer->pthis)->bottoms[i];
}

int ncnn_layer_get_top_count(const ncnn_layer_t layer)
{
    return (int)((const Layer*)layer->pthis)->tops.size();
}

int ncnn_layer_get_top(const ncnn_layer_t layer, int i)
{
    return ((const Layer*)layer->pthis)->tops[i];
}

void ncnn_blob_get_bottom_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c)
{
    const Mat& shape = ((const Layer*)layer->pthis)->bottom_shapes[i];
    *dims = shape.dims;
    *w = shape.w;
    *h = shape.h;
    *c = shape.c;
}

void ncnn_blob_get_top_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c)
{
    const Mat& shape = ((const Layer*)layer->pthis)->top_shapes[i];
    *dims = shape.dims;
    *w = shape.w;
    *h = shape.h;
    *c = shape.c;
}

/* net api */
/* Allocate a net handle wrapping a new Net, with an empty custom layer factory list. */
ncnn_net_t ncnn_net_create()
{
    ncnn_net_t handle = (ncnn_net_t)malloc(sizeof(*handle));
    Net* impl = new Net();
    handle->pthis = (void*)impl;
    handle->custom_layer_factory = 0;
    return handle;
}

/* Destroy the wrapped Net, free every node of the custom layer factory list, then free the handle. */
void ncnn_net_destroy(ncnn_net_t net)
{
    delete (Net*)net->pthis;

    for (ncnn_net_custom_layer_factory_t node = net->custom_layer_factory; node;)
    {
        /* fetch the successor before the node is released */
        ncnn_net_custom_layer_factory_t following = node->next;
        free(node);
        node = following;
    }

    free(net);
}

/* Return a handle pointing at the net's own opt member (not a copy). */
ncnn_option_t ncnn_net_get_option(ncnn_net_t net)
{
    Net* impl = (Net*)(net->pthis);
    return (ncnn_option_t)(&impl->opt);
}

/* Copy the given option object into the net's opt member. */
void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt)
{
    Net* impl = (Net*)net->pthis;
    const Option* source = (Option*)opt;
    impl->opt = *source;
}

#if NCNN_VULKAN
/* Forward device_index to Net::set_vulkan_device. */
void ncnn_net_set_vulkan_device(ncnn_net_t net, int device_index)
{
    Net* impl = (Net*)net->pthis;
    impl->set_vulkan_device(device_index);
}
#endif

/* Creator trampoline registered with Net for custom layers.
 *
 * userdata is the factory record holding the user's C creator. The creator's
 * handle is stored in the returned Layer's userdata field so the destroyer
 * trampoline can hand it back to the user's destroyer.
 *
 * Returns 0 when the user creator yields no handle. The ncnn_layer_create_by_*
 * functions in this file return 0 on failure, so a creator built on them can
 * legitimately return null, which was previously dereferenced. */
static ::ncnn::Layer* __Layer_c_api_layer_creator(void* userdata)
{
    ncnn_net_custom_layer_factory_t ud = (ncnn_net_custom_layer_factory_t)userdata;

    ncnn_layer_t layer0 = ud->creator(ud->userdata);
    if (!layer0)
    {
        return 0;
    }

    ::ncnn::Layer* layer = (::ncnn::Layer*)layer0->pthis;

    layer->userdata = (void*)layer0;

    /* the getters read from layer0->pthis, i.e. from this same Layer object */
    layer->one_blob_only = ncnn_layer_get_one_blob_only(layer0);
    layer->support_inplace = ncnn_layer_get_support_inplace(layer0);
    layer->support_vulkan = ncnn_layer_get_support_vulkan(layer0);
    layer->support_packing = ncnn_layer_get_support_packing(layer0);

    layer->support_bf16_storage = ncnn_layer_get_support_bf16_storage(layer0);
    layer->support_fp16_storage = ncnn_layer_get_support_fp16_storage(layer0);

    return layer;
}

/* Destroyer trampoline registered with Net for custom layers: recovers the C
 * handle stored in the Layer's userdata field and passes it to the user's
 * destroyer together with the user's own userdata. */
static void __Layer_c_api_layer_destroyer(::ncnn::Layer* layer, void* userdata)
{
    ncnn_net_custom_layer_factory_t factory = (ncnn_net_custom_layer_factory_t)userdata;
    ncnn_layer_t handle = (ncnn_layer_t)layer->userdata;
    factory->destroyer(handle, factory->userdata);
}

#if NCNN_STRING
/* Register a custom layer by type name. A factory record holding the user's
 * callbacks is allocated, pushed onto the head of the net's factory list
 * (freed in ncnn_net_destroy), and passed to Net as trampoline userdata. */
void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata)
{
    ncnn_net_custom_layer_factory_t factory = (ncnn_net_custom_layer_factory_t)malloc(sizeof(*factory));
    factory->creator = creator;
    factory->destroyer = destroyer;
    factory->userdata = userdata;

    /* link in at the head of the list */
    factory->next = net->custom_layer_factory;
    net->custom_layer_factory = factory;

    Net* impl = (Net*)net->pthis;
    impl->register_custom_layer(type, __Layer_c_api_layer_creator, __Layer_c_api_layer_destroyer, (void*)factory);
}
#endif /* NCNN_STRING */

/* Register a custom layer by type index. A factory record holding the user's
 * callbacks is allocated, pushed onto the head of the net's factory list
 * (freed in ncnn_net_destroy), and passed to Net as trampoline userdata. */
void ncnn_net_register_custom_layer_by_typeindex(ncnn_net_t net, int typeindex, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata)
{
    ncnn_net_custom_layer_factory_t factory = (ncnn_net_custom_layer_factory_t)malloc(sizeof(*factory));
    factory->creator = creator;
    factory->destroyer = destroyer;
    factory->userdata = userdata;

    /* link in at the head of the list */
    factory->next = net->custom_layer_factory;
    net->custom_layer_factory = factory;

    Net* impl = (Net*)net->pthis;
    impl->register_custom_layer(typeindex, __Layer_c_api_layer_creator, __Layer_c_api_layer_destroyer, (void*)factory);
}

#if NCNN_STDIO
#if NCNN_STRING
/* Forward to Net::load_param(path) and return its result. */
int ncnn_net_load_param(ncnn_net_t net, const char* path)
{
    Net* impl = (Net*)net->pthis;
    return impl->load_param(path);
}
#endif /* NCNN_STRING */

/* Forward to Net::load_param_bin(path) and return its result. */
int ncnn_net_load_param_bin(ncnn_net_t net, const char* path)
{
    Net* impl = (Net*)net->pthis;
    return impl->load_param_bin(path);
}

/* Forward to Net::load_model(path) and return its result. */
int ncnn_net_load_model(ncnn_net_t net, const char* path)
{
    Net* impl = (Net*)net->pthis;
    return impl->load_model(path);
}

#if _WIN32
#if NCNN_STRING
/* Wide-character path variant: forward to Net::load_param(path). */
int ncnn_net_load_param_w(ncnn_net_t net, const wchar_t* path)
{
    Net* impl = (Net*)net->pthis;
    return impl->load_param(path);
}
#endif /* NCNN_STRING */

/* Wide-character path variant: forward to Net::load_param_bin(path). */
int ncnn_net_load_param_bin_w(ncnn_net_t net, const wchar_t* path)
{
    Net* impl = (Net*)net->pthis;
    return impl->load_param_bin(path);
}

/* Wide-character path variant: forward to Net::load_model(path). */
int ncnn_net_load_model_w(ncnn_net_t net, const wchar_t* path)
{
    Net* impl = (Net*)net->pthis;
    return impl->load_model(path);
}
#endif /* _WIN32 */
#endif /* NCNN_STDIO */

#if NCNN_STDIO
#if NCNN_STRING
/* Forward to Net::load_param_mem(mem) and return its result. */
int ncnn_net_load_param_memory(ncnn_net_t net, const char* mem)
{
    Net* impl = (Net*)net->pthis;
    return impl->load_param_mem(mem);
}
#endif /* NCNN_STRING */
#endif /* NCNN_STDIO */

/* Forward to Net::load_param(mem); the result is converted to size_t on return. */
size_t ncnn_net_load_param_bin_memory(ncnn_net_t net, const unsigned char* mem)
{
    Net* impl = (Net*)net->pthis;
    return impl->load_param(mem);
}

/* Forward to Net::load_model(mem); the result is converted to size_t on return. */
size_t ncnn_net_load_model_memory(ncnn_net_t net, const unsigned char* mem)
{
    Net* impl = (Net*)net->pthis;
    return impl->load_model(mem);
}

#if NCNN_STRING
/* Load the param through the C++ DataReader held in dr->pthis via Net::load_param. */
int ncnn_net_load_param_datareader(ncnn_net_t net, const ncnn_datareader_t dr)
{
    Net* impl = (Net*)net->pthis;
    const DataReader* reader = (const DataReader*)dr->pthis;
    return impl->load_param(*reader);
}
#endif /* NCNN_STRING */

/* Load the binary param through the C++ DataReader held in dr->pthis via Net::load_param_bin. */
int ncnn_net_load_param_bin_datareader(ncnn_net_t net, const ncnn_datareader_t dr)
{
    Net* impl = (Net*)net->pthis;
    const DataReader* reader = (const DataReader*)dr->pthis;
    return impl->load_param_bin(*reader);
}

/* Load the model through the C++ DataReader held in dr->pthis via Net::load_model. */
int ncnn_net_load_model_datareader(ncnn_net_t net, const ncnn_datareader_t dr)
{
    Net* impl = (Net*)net->pthis;
    const DataReader* reader = (const DataReader*)dr->pthis;
    return impl->load_model(*reader);
}

/* Forward to Net::clear(). */
void ncnn_net_clear(ncnn_net_t net)
{
    Net* impl = (Net*)net->pthis;
    impl->clear();
}

int ncnn_net_get_input_count(const ncnn_net_t net)
{
    return (int)((Net*)net->pthis)->input_indexes().size();
}

int ncnn_net_get_output_count(const ncnn_net_t net)
{
    return (int)((Net*)net->pthis)->output_indexes().size();
}

#if NCNN_STRING
const char* ncnn_net_get_input_name(const ncnn_net_t net, int i)
{
    return ((Net*)net->pthis)->input_names()[i];
}

const char* ncnn_net_get_output_name(const ncnn_net_t net, int i)
{
    return ((Net*)net->pthis)->output_names()[i];
}
#endif /* NCNN_STRING */

int ncnn_net_get_input_index(const ncnn_net_t net, int i)
{
    return ((Net*)net->pthis)->input_indexes()[i];
}

int ncnn_net_get_output_index(const ncnn_net_t net, int i)
{
    return ((Net*)net->pthis)->output_indexes()[i];
}

/* extractor api */
/* Create a heap-allocated Extractor from Net::create_extractor() and return it as an opaque handle. */
ncnn_extractor_t ncnn_extractor_create(ncnn_net_t net)
{
    Net* impl = (Net*)net->pthis;
    Extractor* extractor = new Extractor(impl->create_extractor());
    return (ncnn_extractor_t)extractor;
}

/* Delete the Extractor behind the handle. */
void ncnn_extractor_destroy(ncnn_extractor_t ex)
{
    Extractor* extractor = (Extractor*)ex;
    delete extractor;
}

/* Intentionally does nothing: both arguments are ignored. */
void ncnn_extractor_set_option(ncnn_extractor_t ex, const ncnn_option_t opt)
{
    (void)opt;
    (void)ex;
}

#if NCNN_STRING
/* Feed mat to the blob called name via Extractor::input and return its result. */
int ncnn_extractor_input(ncnn_extractor_t ex, const char* name, const ncnn_mat_t mat)
{
    Extractor* extractor = (Extractor*)ex;
    const Mat* input = (const Mat*)mat;
    return extractor->input(name, *input);
}

/* Extract the blob called name. *mat always receives a newly allocated Mat
 * handle (whatever the return code); the return value is Extractor::extract's. */
int ncnn_extractor_extract(ncnn_extractor_t ex, const char* name, ncnn_mat_t* mat)
{
    Extractor* extractor = (Extractor*)ex;

    Mat result;
    const int status = extractor->extract(name, result);
    *mat = (ncnn_mat_t)(new Mat(result));
    return status;
}
#endif /* NCNN_STRING */

/* Feed mat to the blob at index via Extractor::input and return its result. */
int ncnn_extractor_input_index(ncnn_extractor_t ex, int index, const ncnn_mat_t mat)
{
    Extractor* extractor = (Extractor*)ex;
    const Mat* input = (const Mat*)mat;
    return extractor->input(index, *input);
}

/* Extract the blob at index. *mat always receives a newly allocated Mat
 * handle (whatever the return code); the return value is Extractor::extract's. */
int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat)
{
    Extractor* extractor = (Extractor*)ex;

    Mat result;
    const int status = extractor->extract(index, result);
    *mat = (ncnn_mat_t)(new Mat(result));
    return status;
}

void ncnn_copy_make_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int type, float v, const ncnn_option_t opt)
{
    const Option _opt = opt ? *((const Option*)opt) : Option();
    copy_make_border(*(const Mat*)src, *(Mat*)dst, top, bottom, left, right, type, v, _opt);
}

void ncnn_copy_make_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const ncnn_option_t opt)
{
    const Option _opt = opt ? *((const Option*)opt) : Option();
    copy_make_border_3d(*(const Mat*)src, *(Mat*)dst, top, bottom, left, right, front, behind, type, v, _opt);
}

void ncnn_copy_cut_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, const ncnn_option_t opt)
{
    const Option _opt = opt ? *((const Option*)opt) : Option();
    copy_cut_border(*(const Mat*)src, *(Mat*)dst, top, bottom, left, right, _opt);
}

void ncnn_copy_cut_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, const ncnn_option_t opt)
{
    const Option _opt = opt ? *((const Option*)opt) : Option();
    copy_cut_border_3d(*(const Mat*)src, *(Mat*)dst, top, bottom, left, right, front, behind, _opt);
}

#if NCNN_PIXEL_DRAWING
/* Forward to ncnn::draw_rectangle_c1, passing w as its fourth argument. */
void ncnn_draw_rectangle_c1(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness)
{
    const int fourth = w;
    ncnn::draw_rectangle_c1(pixels, w, h, fourth, rx, ry, rw, rh, color, thickness);
}

/* Forward to ncnn::draw_rectangle_c2, passing w * 2 as its fourth argument. */
void ncnn_draw_rectangle_c2(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness)
{
    const int fourth = w * 2;
    ncnn::draw_rectangle_c2(pixels, w, h, fourth, rx, ry, rw, rh, color, thickness);
}

/* Forward to ncnn::draw_rectangle_c3, passing w * 3 as its fourth argument. */
void ncnn_draw_rectangle_c3(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness)
{
    const int fourth = w * 3;
    ncnn::draw_rectangle_c3(pixels, w, h, fourth, rx, ry, rw, rh, color, thickness);
}

/* Forward to ncnn::draw_rectangle_c4, passing w * 4 as its fourth argument. */
void ncnn_draw_rectangle_c4(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness)
{
    const int fourth = w * 4;
    ncnn::draw_rectangle_c4(pixels, w, h, fourth, rx, ry, rw, rh, color, thickness);
}

/* Forward to ncnn::draw_text_c1, passing w as its fourth argument. */
void ncnn_draw_text_c1(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color)
{
    const int fourth = w;
    ncnn::draw_text_c1(pixels, w, h, fourth, text, x, y, fontpixelsize, color);
}

/* Forward to ncnn::draw_text_c2, passing w * 2 as its fourth argument. */
void ncnn_draw_text_c2(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color)
{
    const int fourth = w * 2;
    ncnn::draw_text_c2(pixels, w, h, fourth, text, x, y, fontpixelsize, color);
}

/* Forward to ncnn::draw_text_c3, passing w * 3 as its fourth argument. */
void ncnn_draw_text_c3(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color)
{
    const int fourth = w * 3;
    ncnn::draw_text_c3(pixels, w, h, fourth, text, x, y, fontpixelsize, color);
}

/* Forward to ncnn::draw_text_c4, passing w * 4 as its fourth argument. */
void ncnn_draw_text_c4(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color)
{
    const int fourth = w * 4;
    ncnn::draw_text_c4(pixels, w, h, fourth, text, x, y, fontpixelsize, color);
}

/* Forward to ncnn::draw_circle_c1, passing w as its fourth argument. */
void ncnn_draw_circle_c1(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness)
{
    const int fourth = w;
    ncnn::draw_circle_c1(pixels, w, h, fourth, cx, cy, radius, color, thickness);
}

/* Forward to ncnn::draw_circle_c2, passing w * 2 as its fourth argument. */
void ncnn_draw_circle_c2(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness)
{
    const int fourth = w * 2;
    ncnn::draw_circle_c2(pixels, w, h, fourth, cx, cy, radius, color, thickness);
}

/* Forward to ncnn::draw_circle_c3, passing w * 3 as its fourth argument. */
void ncnn_draw_circle_c3(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness)
{
    const int fourth = w * 3;
    ncnn::draw_circle_c3(pixels, w, h, fourth, cx, cy, radius, color, thickness);
}

/* Forward to ncnn::draw_circle_c4, passing w * 4 as its fourth argument. */
void ncnn_draw_circle_c4(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness)
{
    const int fourth = w * 4;
    ncnn::draw_circle_c4(pixels, w, h, fourth, cx, cy, radius, color, thickness);
}

/* Forward to ncnn::draw_line_c1, passing w as its fourth argument. */
void ncnn_draw_line_c1(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness)
{
    const int fourth = w;
    ncnn::draw_line_c1(pixels, w, h, fourth, x0, y0, x1, y1, color, thickness);
}

/* Forward to ncnn::draw_line_c2, passing w * 2 as its fourth argument. */
void ncnn_draw_line_c2(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness)
{
    const int fourth = w * 2;
    ncnn::draw_line_c2(pixels, w, h, fourth, x0, y0, x1, y1, color, thickness);
}

/* Forward to ncnn::draw_line_c3, passing w * 3 as its fourth argument. */
void ncnn_draw_line_c3(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness)
{
    const int fourth = w * 3;
    ncnn::draw_line_c3(pixels, w, h, fourth, x0, y0, x1, y1, color, thickness);
}

/* Forward to ncnn::draw_line_c4, passing w * 4 as its fourth argument. */
void ncnn_draw_line_c4(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness)
{
    const int fourth = w * 4;
    ncnn::draw_line_c4(pixels, w, h, fourth, x0, y0, x1, y1, color, thickness);
}
#endif /* NCNN_PIXEL_DRAWING */

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* NCNN_C_API */


================================================
FILE: src/c_api.h
================================================
/* Copyright 2020 Tencent
 * SPDX-License-Identifier: BSD-3-Clause
 */

#ifndef NCNN_C_API_H
#define NCNN_C_API_H

#include "platform.h"

#if NCNN_C_API

#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

NCNN_EXPORT const char* ncnn_version(void);
NCNN_EXPORT int ncnn_version_number(void);

/* allocator api */
/* ncnn_allocator_t is a pointer to the struct defined right below, so its
 * members are visible to (and callable from) C code. */
typedef struct __ncnn_allocator_t* ncnn_allocator_t;
struct NCNN_EXPORT __ncnn_allocator_t
{
    /* untyped pointer to the implementation object behind this handle */
    void* pthis;

    /* allocation callbacks; each takes the allocator handle as its first argument */
    void* (*fast_malloc)(ncnn_allocator_t allocator, size_t size);
    void (*fast_free)(ncnn_allocator_t allocator, void* ptr);
};

/* constructors for the two pool allocator flavours, and the matching destructor */
NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_pool_allocator(void);
NCNN_EXPORT ncnn_allocator_t ncnn_allocator_create_unlocked_pool_allocator(void);
NCNN_EXPORT void ncnn_allocator_destroy(ncnn_allocator_t allocator);

/* option api */
/* Opaque handle: struct __ncnn_option_t is only forward-declared in this header,
 * so options are manipulated exclusively through the accessors below. */
typedef struct __ncnn_option_t* ncnn_option_t;

/* lifetime */
NCNN_EXPORT ncnn_option_t ncnn_option_create(void);
NCNN_EXPORT void ncnn_option_destroy(ncnn_option_t opt);

/* thread count */
NCNN_EXPORT int ncnn_option_get_num_threads(const ncnn_option_t opt);
NCNN_EXPORT void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads);

/* attach allocators (see the allocator api) for blob and workspace memory */
NCNN_EXPORT void ncnn_option_set_blob_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);
NCNN_EXPORT void ncnn_option_set_workspace_allocator(ncnn_option_t opt, ncnn_allocator_t allocator);

/* feature flag getters; every flag is reported as an int */
NCNN_EXPORT int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt);
NCNN_EXPORT int ncnn_option_get_use_local_pool_allocator(const ncnn_option_t opt);
NCNN_EXPORT int ncnn_option_get_use_winograd_convolution(const ncnn_option_t opt);
NCNN_EXPORT int ncnn_option_get_use_sgemm_convolution(const ncnn_option_t opt);
NCNN_EXPORT int ncnn_option_get_use_packing_layout(const ncnn_option_t opt);
NCNN_EXPORT int ncnn_option_get_use_fp16_packed(const ncnn_option_t opt);
NCNN_EXPORT int ncnn_option_get_use_fp16_storage(const ncnn_option_t opt);
NCNN_EXPORT int ncnn_option_get_use_fp16_arithmetic(const ncnn_option_t opt);
NCNN_EXPORT int ncnn_option_get_use_int8_packed(const ncnn_option_t opt);
NCNN_EXPORT int ncnn_option_get_use_int8_storage(const ncnn_option_t opt);
NCNN_EXPORT int ncnn_option_get_use_int8_arithmetic(const ncnn_option_t opt);
NCNN_EXPORT int ncnn_option_get_use_bf16_packed(const ncnn_option_t opt);
NCNN_EXPORT int ncnn_option_get_use_bf16_storage(const ncnn_option_t opt);
NCNN_EXPORT int ncnn_option_get_use_shader_local_memory(const ncnn_option_t opt);
NCNN_EXPORT int ncnn_option_get_use_cooperative_matrix(const ncnn_option_t opt);

/* feature flag setters, one per getter above; `enable` carries the new value */
NCNN_EXPORT void ncnn_option_set_use_vulkan_compute(ncnn_option_t opt, int enable);
NCNN_EXPORT void ncnn_option_set_use_local_pool_allocator(ncnn_option_t opt, int enable);
NCNN_EXPORT void ncnn_option_set_use_winograd_convolution(ncnn_option_t opt, int enable);
NCNN_EXPORT void ncnn_option_set_use_sgemm_convolution(ncnn_option_t opt, int enable);
NCNN_EXPORT void ncnn_option_set_use_packing_layout(ncnn_option_t opt, int enable);
NCNN_EXPORT void ncnn_option_set_use_fp16_packed(ncnn_option_t opt, int enable);
NCNN_EXPORT void ncnn_option_set_use_fp16_storage(ncnn_option_t opt, int enable);
NCNN_EXPORT void ncnn_option_set_use_fp16_arithmetic(ncnn_option_t opt, int enable);
NCNN_EXPORT void ncnn_option_set_use_int8_packed(ncnn_option_t opt, int enable);
NCNN_EXPORT void ncnn_option_set_use_int8_storage(ncnn_option_t opt, int enable);
NCNN_EXPORT void ncnn_option_set_use_int8_arithmetic(ncnn_option_t opt, int enable);
NCNN_EXPORT void ncnn_option_set_use_bf16_packed(ncnn_option_t opt, int enable);
NCNN_EXPORT void ncnn_option_set_use_bf16_storage(ncnn_option_t opt, int enable);
NCNN_EXPORT void ncnn_option_set_use_shader_local_memory(ncnn_option_t opt, int enable);
NCNN_EXPORT void ncnn_option_set_use_cooperative_matrix(ncnn_option_t opt, int enable);

/* mat api */
/* Opaque handle: struct __ncnn_mat_t is only forward-declared in this header. */
typedef struct __ncnn_mat_t* ncnn_mat_t;

/* Constructors. Naming scheme of the variants:
 *   _1d/_2d/_3d/_4d : how many extents are given (w / w,h / w,h,c / w,h,d,c)
 *   _external       : additionally takes a caller-supplied `data` pointer
 *                     (presumably wrapped without copying -- confirm in c_api.cpp)
 *   _elem           : additionally takes `elemsize` and `elempack`
 * Every variant except ncnn_mat_create() takes an allocator handle. */
NCNN_EXPORT ncnn_mat_t ncnn_mat_create(void);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d(int w, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d(int w, int h, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d(int w, int h, int c, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d(int w, int h, int d, int c, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d(int w, void* data, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d(int w, int h, void* data, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d(int w, int h, int c, void* data, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d(int w, int h, int d, int c, void* data, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_1d_elem(int w, size_t elemsize, int elempack, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_2d_elem(int w, int h, size_t elemsize, int elempack, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_3d_elem(int w, int h, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_4d_elem(int w, int h, int d, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_1d_elem(int w, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_2d_elem(int w, int h, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_3d_elem(int w, int h, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_create_external_4d_elem(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator);
/* destructor for handles obtained from the functions in this section */
NCNN_EXPORT void ncnn_mat_destroy(ncnn_mat_t mat);

/* fill with a float value */
NCNN_EXPORT void ncnn_mat_fill_float(ncnn_mat_t mat, float v);

/* clone / reshape: take a const source handle and return an ncnn_mat_t */
NCNN_EXPORT ncnn_mat_t ncnn_mat_clone(const ncnn_mat_t mat, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_1d(const ncnn_mat_t mat, int w, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_2d(const ncnn_mat_t mat, int w, int h, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_3d(const ncnn_mat_t mat, int w, int h, int c, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_reshape_4d(const ncnn_mat_t mat, int w, int h, int d, int c, ncnn_allocator_t allocator);

/* read-only property accessors */
NCNN_EXPORT int ncnn_mat_get_dims(const ncnn_mat_t mat);
NCNN_EXPORT int ncnn_mat_get_w(const ncnn_mat_t mat);
NCNN_EXPORT int ncnn_mat_get_h(const ncnn_mat_t mat);
NCNN_EXPORT int ncnn_mat_get_d(const ncnn_mat_t mat);
NCNN_EXPORT int ncnn_mat_get_c(const ncnn_mat_t mat);
NCNN_EXPORT size_t ncnn_mat_get_elemsize(const ncnn_mat_t mat);
NCNN_EXPORT int ncnn_mat_get_elempack(const ncnn_mat_t mat);
NCNN_EXPORT size_t ncnn_mat_get_cstep(const ncnn_mat_t mat);
NCNN_EXPORT void* ncnn_mat_get_data(const ncnn_mat_t mat);

/* untyped pointer for channel index c */
NCNN_EXPORT void* ncnn_mat_get_channel_data(const ncnn_mat_t mat, int c);

#if NCNN_PIXEL

/* mat pixel api */
/* pixel format identifiers, passed as the `type` argument of the pixel functions */
#define NCNN_MAT_PIXEL_RGB       1
#define NCNN_MAT_PIXEL_BGR       2
#define NCNN_MAT_PIXEL_GRAY      3
#define NCNN_MAT_PIXEL_RGBA      4
#define NCNN_MAT_PIXEL_BGRA      5
/* Pack two pixel format ids into a single int: X stays in the low bits and Y is
 * shifted left by 16. Both arguments are parenthesized so that expression
 * arguments (e.g. `a | b`) expand with the intended precedence. */
#define NCNN_MAT_PIXEL_X2Y(X, Y) ((X) | ((Y) << 16))
/* Build a mat from a pixel buffer. `type` is one of the NCNN_MAT_PIXEL_* ids or an
 * NCNN_MAT_PIXEL_X2Y(...) combination. The _resize variants add a target size, the
 * _roi variants add a rectangle (roix, roiy, roiw, roih) inside the w x h image.
 * NOTE(review): `stride` is presumably the source row pitch in bytes -- confirm. */
NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, ncnn_allocator_t allocator);
NCNN_EXPORT ncnn_mat_t ncnn_mat_from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, ncnn_allocator_t allocator);
/* Write a mat out to a caller-provided pixel buffer; the _resize variant also takes
 * the target size. */
NCNN_EXPORT void ncnn_mat_to_pixels(const ncnn_mat_t mat, unsigned char* pixels, int type, int stride);
NCNN_EXPORT void ncnn_mat_to_pixels_resize(const ncnn_mat_t mat, unsigned char* pixels, int type, int target_width, int target_height, int target_stride);

#endif /* NCNN_PIXEL */

/* Takes per-mat arrays of mean and norm values and modifies `mat` (non-const handle).
 * NOTE: "substract" is the spelling of the exported symbol; renaming it would break
 * existing callers. */
NCNN_EXPORT void ncnn_mat_substract_mean_normalize(ncnn_mat_t mat, const float* mean_vals, const float* norm_vals);

/* Layout helpers: the result is delivered through the `dst` out-pointer. */
NCNN_EXPORT void ncnn_convert_packing(const ncnn_mat_t src, ncnn_mat_t* dst, int elempack, const ncnn_option_t opt);
NCNN_EXPORT void ncnn_flatten(const ncnn_mat_t src, ncnn_mat_t* dst, const ncnn_option_t opt);

/* blob api */
/* Opaque handle: struct __ncnn_blob_t is only forward-declared in this header.
 * No create/destroy function for blobs is declared here. */
typedef struct __ncnn_blob_t* ncnn_blob_t;

#if NCNN_STRING
/* name lookup exists only in builds with NCNN_STRING enabled */
NCNN_EXPORT const char* ncnn_blob_get_name(const ncnn_blob_t blob);
#endif /* NCNN_STRING */

/* producer / consumer, each reported as an int */
NCNN_EXPORT int ncnn_blob_get_producer(const ncnn_blob_t blob);
NCNN_EXPORT int ncnn_blob_get_consumer(const ncnn_blob_t blob);

/* shape is returned through the four out-pointers */
NCNN_EXPORT void ncnn_blob_get_shape(const ncnn_blob_t blob, int* dims, int* w, int* h, int* c);

/* paramdict api */
/* Opaque handle: struct __ncnn_paramdict_t is only forward-declared in this header.
 * Entries are addressed by an integer `id`. */
typedef struct __ncnn_paramdict_t* ncnn_paramdict_t;

/* lifetime */
NCNN_EXPORT ncnn_paramdict_t ncnn_paramdict_create(void);
NCNN_EXPORT void ncnn_paramdict_destroy(ncnn_paramdict_t pd);

/* type code of entry `id` */
NCNN_EXPORT int ncnn_paramdict_get_type(const ncnn_paramdict_t pd, int id);

/* typed getters; `def` is a caller-supplied default value */
NCNN_EXPORT int ncnn_paramdict_get_int(const ncnn_paramdict_t pd, int id, int def);
NCNN_EXPORT float ncnn_paramdict_get_float(const ncnn_paramdict_t pd, int id, float def);
NCNN_EXPORT ncnn_mat_t ncnn_paramdict_get_array(const ncnn_paramdict_t pd, int id, const ncnn_mat_t def);

/* typed setters */
NCNN_EXPORT void ncnn_paramdict_set_int(ncnn_paramdict_t pd, int id, int i);
NCNN_EXPORT void ncnn_paramdict_set_float(ncnn_paramdict_t pd, int id, float f);
NCNN_EXPORT void ncnn_paramdict_set_array(ncnn_paramdict_t pd, int id, const ncnn_mat_t v);

/* datareader api */
/* ncnn_datareader_t points to the struct defined right below, so C code can
 * read and assign its callback members. */
typedef struct __ncnn_datareader_t* ncnn_datareader_t;
struct NCNN_EXPORT __ncnn_datareader_t
{
    /* untyped pointer to the implementation object behind this handle */
    void* pthis;

#if NCNN_STRING
    /* formatted read callback; the member exists only when NCNN_STRING is enabled,
     * so the struct layout depends on that build option */
    int (*scan)(ncnn_datareader_t dr, const char* format, void* p);
#endif /* NCNN_STRING */
    /* raw read callback: `buf` and `size` in, a size_t result out */
    size_t (*read)(ncnn_datareader_t dr, void* buf, size_t size);
};

NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create(void);
#if NCNN_STDIO
/* NOTE(review): FILE needs <stdio.h>, but this header includes only platform.h and
 * <stddef.h> directly -- presumably platform.h provides it; confirm for C callers. */
NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_stdio(FILE* fp);
#endif /* NCNN_STDIO */
/* takes a pointer to the caller's memory cursor (pointer-to-pointer) */
NCNN_EXPORT ncnn_datareader_t ncnn_datareader_create_from_memory(const unsigned char** mem);
NCNN_EXPORT void ncnn_datareader_destroy(ncnn_datareader_t dr);

/* modelbin api */
/* ncnn_modelbin_t points to the struct defined right below, so C code can
 * call its load callbacks directly. */
typedef struct __ncnn_modelbin_t* ncnn_modelbin_t;
struct NCNN_EXPORT __ncnn_modelbin_t
{
    /* untyped pointer to the implementation object behind this handle */
    void* pthis;

    /* load callbacks for 1, 2 and 3 extents; each takes an int `type` and returns a mat handle */
    ncnn_mat_t (*load_1d)(const ncnn_modelbin_t mb, int w, int type);
    ncnn_mat_t (*load_2d)(const ncnn_modelbin_t mb, int w, int h, int type);
    ncnn_mat_t (*load_3d)(const ncnn_modelbin_t mb, int w, int h, int c, int type);
};

/* construct from a datareader, or from an array of `n` mat handles */
NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_datareader(const ncnn_datareader_t dr);
NCNN_EXPORT ncnn_modelbin_t ncnn_modelbin_create_from_mat_array(const ncnn_mat_t* weights, int n);
NCNN_EXPORT void ncnn_modelbin_destroy(ncnn_modelbin_t mb);

/* layer api */
/* ncnn_layer_t points to the struct defined right below; its members are
 * function pointers that all take the layer handle as first argument. */
typedef struct __ncnn_layer_t* ncnn_layer_t;
struct NCNN_EXPORT __ncnn_layer_t
{
    /* untyped pointer to the implementation object behind this handle */
    void* pthis;

    /* parameter and weight loading */
    int (*load_param)(ncnn_layer_t layer, const ncnn_paramdict_t pd);
    int (*load_model)(ncnn_layer_t layer, const ncnn_modelbin_t mb);

    /* pipeline setup / teardown */
    int (*create_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);
    int (*destroy_pipeline)(ncnn_layer_t layer, const ncnn_option_t opt);

    /* forward with separate inputs and outputs: _1 takes a single blob pair,
     * _n takes arrays with explicit counts (n bottoms, n2 tops) */
    int (*forward_1)(const ncnn_layer_t layer, const ncnn_mat_t bottom_blob, ncnn_mat_t* top_blob, const ncnn_option_t opt);
    int (*forward_n)(const ncnn_layer_t layer, const ncnn_mat_t* bottom_blobs, int n, ncnn_mat_t* top_blobs, int n2, const ncnn_option_t opt);

    /* in-place forward: the same blob(s) serve as input and output */
    int (*forward_inplace_1)(const ncnn_layer_t layer, ncnn_mat_t bottom_top_blob, const ncnn_option_t opt);
    int (*forward_inplace_n)(const ncnn_layer_t layer, ncnn_mat_t* bottom_top_blobs, int n, const ncnn_option_t opt);
};

/* creation by default, by integer type index, or (NCNN_STRING builds only) by type name */
NCNN_EXPORT ncnn_layer_t ncnn_layer_create(void);
NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex);
#if NCNN_STRING
NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_type(const char* type);
NCNN_EXPORT int ncnn_layer_type_to_index(const char* type);
#endif /* NCNN_STRING */
NCNN_EXPORT void ncnn_layer_destroy(ncnn_layer_t layer);

#if NCNN_STRING
NCNN_EXPORT const char* ncnn_layer_get_name(const ncnn_layer_t layer);
#endif /* NCNN_STRING */

NCNN_EXPORT int ncnn_layer_get_typeindex(const ncnn_layer_t layer);
#if NCNN_STRING
NCNN_EXPORT const char* ncnn_layer_get_type(const ncnn_layer_t layer);
#endif /* NCNN_STRING */

/* capability flag getters; every flag is reported as an int */
NCNN_EXPORT int ncnn_layer_get_one_blob_only(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_support_inplace(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_support_vulkan(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_support_packing(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_support_bf16_storage(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_support_fp16_storage(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_support_vulkan_packing(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_support_any_packing(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_support_vulkan_any_packing(const ncnn_layer_t layer);

/* capability flag setters, one per getter above */
NCNN_EXPORT void ncnn_layer_set_one_blob_only(ncnn_layer_t layer, int enable);
NCNN_EXPORT void ncnn_layer_set_support_inplace(ncnn_layer_t layer, int enable);
NCNN_EXPORT void ncnn_layer_set_support_vulkan(ncnn_layer_t layer, int enable);
NCNN_EXPORT void ncnn_layer_set_support_packing(ncnn_layer_t layer, int enable);
NCNN_EXPORT void ncnn_layer_set_support_bf16_storage(ncnn_layer_t layer, int enable);
NCNN_EXPORT void ncnn_layer_set_support_fp16_storage(ncnn_layer_t layer, int enable);
NCNN_EXPORT void ncnn_layer_set_support_vulkan_packing(ncnn_layer_t layer, int enable);
NCNN_EXPORT void ncnn_layer_set_support_any_packing(ncnn_layer_t layer, int enable);
NCNN_EXPORT void ncnn_layer_set_support_vulkan_any_packing(ncnn_layer_t layer, int enable);

/* bottom / top lists: a count plus an int per position i */
NCNN_EXPORT int ncnn_layer_get_bottom_count(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_bottom(const ncnn_layer_t layer, int i);
NCNN_EXPORT int ncnn_layer_get_top_count(const ncnn_layer_t layer);
NCNN_EXPORT int ncnn_layer_get_top(const ncnn_layer_t layer, int i);

/* shape of bottom / top i, returned through the out-pointers.
 * Note: these carry the ncnn_blob_ prefix although they take a layer handle. */
NCNN_EXPORT void ncnn_blob_get_bottom_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);
NCNN_EXPORT void ncnn_blob_get_top_shape(const ncnn_layer_t layer, int i, int* dims, int* w, int* h, int* c);

/* layer factory function */
/* callback types supplied when registering a custom layer; both receive the
 * `userdata` pointer given at registration */
typedef ncnn_layer_t (*ncnn_layer_creator_t)(void* userdata);
typedef void (*ncnn_layer_destroyer_t)(ncnn_layer_t layer, void* userdata);

/* One registered custom layer factory. `next` links to another node of the same
 * type, forming a singly linked list. Unlike the structs above, this one is
 * declared without NCNN_EXPORT. */
typedef struct __ncnn_net_custom_layer_factory_t* ncnn_net_custom_layer_factory_t;
struct __ncnn_net_custom_layer_factory_t
{
    ncnn_layer_creator_t creator;
    ncnn_layer_destroyer_t destroyer;
    void* userdata;
    ncnn_net_custom_layer_factory_t next;
};

/* net api */
/* ncnn_net_t points to the struct defined right below (declared without NCNN_EXPORT). */
typedef struct __ncnn_net_t* ncnn_net_t;
struct __ncnn_net_t
{
    /* untyped pointer to the implementation object behind this handle */
    void* pthis;

    /* head of the list of custom layer factories attached to this net */
    ncnn_net_custom_layer_factory_t custom_layer_factory;
};

/* lifetime */
NCNN_EXPORT ncnn_net_t ncnn_net_create(void);
NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net);

/* option access */
NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net);
NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt);

#if NCNN_VULKAN
NCNN_EXPORT void ncnn_net_set_vulkan_device(ncnn_net_t net, int device_index);
#endif

/* custom layer registration, keyed by type name (NCNN_STRING builds) or by type index */
#if NCNN_STRING
NCNN_EXPORT void ncnn_net_register_custom_layer_by_type(ncnn_net_t net, const char* type, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);
#endif /* NCNN_STRING */
NCNN_EXPORT void ncnn_net_register_custom_layer_by_typeindex(ncnn_net_t net, int typeindex, ncnn_layer_creator_t creator, ncnn_layer_destroyer_t destroyer, void* userdata);

/* loading from a file path (NCNN_STDIO builds); the _w variants take wide-character
 * paths and exist only on _WIN32 */
#if NCNN_STDIO
#if NCNN_STRING
NCNN_EXPORT int ncnn_net_load_param(ncnn_net_t net, const char* path);
#endif /* NCNN_STRING */
NCNN_EXPORT int ncnn_net_load_param_bin(ncnn_net_t net, const char* path);
NCNN_EXPORT int ncnn_net_load_model(ncnn_net_t net, const char* path);
#if _WIN32
#if NCNN_STRING
NCNN_EXPORT int ncnn_net_load_param_w(ncnn_net_t net, const wchar_t* path);
#endif /* NCNN_STRING */
NCNN_EXPORT int ncnn_net_load_param_bin_w(ncnn_net_t net, const wchar_t* path);
NCNN_EXPORT int ncnn_net_load_model_w(ncnn_net_t net, const wchar_t* path);
#endif /* _WIN32 */
#endif /* NCNN_STDIO */

/* loading from memory; the text-param variant requires both NCNN_STDIO and
 * NCNN_STRING, and returns int, while the binary variants return size_t */
#if NCNN_STDIO
#if NCNN_STRING
NCNN_EXPORT int ncnn_net_load_param_memory(ncnn_net_t net, const char* mem);
#endif /* NCNN_STRING */
#endif /* NCNN_STDIO */
NCNN_EXPORT size_t ncnn_net_load_param_bin_memory(ncnn_net_t net, const unsigned char* mem);
NCNN_EXPORT size_t ncnn_net_load_model_memory(ncnn_net_t net, const unsigned char* mem);

/* loading through a datareader handle */
#if NCNN_STRING
NCNN_EXPORT int ncnn_net_load_param_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
#endif /* NCNN_STRING */
NCNN_EXPORT int ncnn_net_load_param_bin_datareader(ncnn_net_t net, const ncnn_datareader_t dr);
NCNN_EXPORT int ncnn_net_load_model_datareader(ncnn_net_t net, const ncnn_datareader_t dr);

NCNN_EXPORT void ncnn_net_clear(ncnn_net_t net);

/* input / output enumeration: counts, then name (NCNN_STRING builds) or index for position i */
NCNN_EXPORT int ncnn_net_get_input_count(const ncnn_net_t net);
NCNN_EXPORT int ncnn_net_get_output_count(const ncnn_net_t net);
#if NCNN_STRING
NCNN_EXPORT const char* ncnn_net_get_input_name(const ncnn_net_t net, int i);
NCNN_EXPORT const char* ncnn_net_get_output_name(const ncnn_net_t net, int i);
#endif /* NCNN_STRING */
NCNN_EXPORT int ncnn_net_get_input_index(const ncnn_net_t net, int i);
NCNN_EXPORT int ncnn_net_get_output_index(const ncnn_net_t net, int i);

/* extractor api */
/* Opaque handle: struct __ncnn_extractor_t is only forward-declared in this header. */
typedef struct __ncnn_extractor_t* ncnn_extractor_t;

/* an extractor is created from a net handle */
NCNN_EXPORT ncnn_extractor_t ncnn_extractor_create(ncnn_net_t net);
NCNN_EXPORT void ncnn_extractor_destroy(ncnn_extractor_t ex);

NCNN_EXPORT void ncnn_extractor_set_option(ncnn_extractor_t ex, const ncnn_option_t opt);

/* feed an input mat / fetch an output mat (through the out-pointer), addressed
 * by name (NCNN_STRING builds only) or by integer index */
#if NCNN_STRING
NCNN_EXPORT int ncnn_extractor_input(ncnn_extractor_t ex, const char* name, const ncnn_mat_t mat);
NCNN_EXPORT int ncnn_extractor_extract(ncnn_extractor_t ex, const char* name, ncnn_mat_t* mat);
#endif /* NCNN_STRING */
NCNN_EXPORT int ncnn_extractor_input_index(ncnn_extractor_t ex, int index, const ncnn_mat_t mat);
NCNN_EXPORT int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat);

/* mat process api */
/* border type codes for the `type` argument of the make_border functions */
#define NCNN_BORDER_CONSTANT    0
#define NCNN_BORDER_REPLICATE   1
#define NCNN_BORDER_REFLECT     2
#define NCNN_BORDER_TRANSPARENT -233
/* Pad (make_border) or crop (cut_border) by the given amount on each side; the _3d
 * variants add `front` and `behind`. Note that `dst` is passed by value here
 * (ncnn_mat_t, not ncnn_mat_t*), unlike ncnn_convert_packing / ncnn_flatten.
 * NOTE(review): `v` is presumably the fill value for NCNN_BORDER_CONSTANT -- confirm. */
NCNN_EXPORT void ncnn_copy_make_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int type, float v, const ncnn_option_t opt);
NCNN_EXPORT void ncnn_copy_make_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const ncnn_option_t opt);
NCNN_EXPORT void ncnn_copy_cut_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, const ncnn_option_t opt);
NCNN_EXPORT void ncnn_copy_cut_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, const ncnn_option_t opt);

#if NCNN_PIXEL_DRAWING
/* mat pixel drawing api */
/* Drawing primitives on raw pixel buffers. The _c1.._c4 suffix is the number of
 * channels per pixel. None of these take a stride argument; the circle and line
 * wrappers in c_api.cpp pass w * channels as the stride (rectangle and text
 * presumably do the same -- confirm), so rows are treated as tightly packed. */
NCNN_EXPORT void ncnn_draw_rectangle_c1(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
NCNN_EXPORT void ncnn_draw_rectangle_c2(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
NCNN_EXPORT void ncnn_draw_rectangle_c3(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);
NCNN_EXPORT void ncnn_draw_rectangle_c4(unsigned char* pixels, int w, int h, int rx, int ry, int rw, int rh, unsigned int color, int thickness);

/* text drawing takes a font size in pixels and has no thickness argument */
NCNN_EXPORT void ncnn_draw_text_c1(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
NCNN_EXPORT void ncnn_draw_text_c2(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
NCNN_EXPORT void ncnn_draw_text_c3(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);
NCNN_EXPORT void ncnn_draw_text_c4(unsigned char* pixels, int w, int h, const char* text, int x, int y, int fontpixelsize, unsigned int color);

/* circle given by centre (cx, cy) and radius */
NCNN_EXPORT void ncnn_draw_circle_c1(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
NCNN_EXPORT void ncnn_draw_circle_c2(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
NCNN_EXPORT void ncnn_draw_circle_c3(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);
NCNN_EXPORT void ncnn_draw_circle_c4(unsigned char* pixels, int w, int h, int cx, int cy, int radius, unsigned int color, int thickness);

/* line from (x0, y0) to (x1, y1) */
NCNN_EXPORT void ncnn_draw_line_c1(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
NCNN_EXPORT void ncnn_draw_line_c2(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
NCNN_EXPORT void ncnn_draw_line_c3(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
NCNN_EXPORT void ncnn_draw_line_c4(unsigned char* pixels, int w, int h, int x0, int y0, int x1, int y1, unsigned int color, int thickness);
#endif /* NCNN_PIXEL_DRAWING */

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* NCNN_C_API */

#endif /* NCNN_C_API_H */


================================================
FILE: src/command.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "command.h"

#if NCNN_VULKAN

#include "option.h"
#include "pipeline.h"

namespace ncnn {

// Private implementation state of VkCompute: the Vulkan command pool, command
// buffer and fence, plus the bookkeeping containers declared below.
// The constructor zeroes the handles and calls init(); the destructor releases them.
class VkComputePrivate
{
public:
    VkComputePrivate(const VulkanDevice* _vkdev);
    ~VkComputePrivate();

    // create the command pool, command buffer and fence; returns 0 or -1
    int init();
    // thin wrappers over vkBeginCommandBuffer / vkEndCommandBuffer; return 0 or -1
    int begin_command_buffer();
    int end_command_buffer();

    // device all Vulkan objects here are created on (not owned)
    const VulkanDevice* vkdev;

    VkCommandPool compute_command_pool;

    VkCommandBuffer compute_command_buffer;

    VkFence compute_command_fence;

    // buffers and mats stashed while recording; the post_* records below refer to
    // entries of these vectors by offset
    std::vector<VkMat> upload_staging_buffers;
    std::vector<VkMat> download_post_buffers;
    std::vector<Mat> download_post_mats_fp16;
    std::vector<Mat> download_post_mats;

    // image blocks whose command-side reference is dropped in the destructor
    std::vector<VkImageMemory*> image_blocks_to_destroy;

    // the good-old path for device without VK_KHR_push_descriptor
    // (the destructor frees descriptorsets[i] from descriptor_pools[i], so the two
    // vectors are indexed in parallel)
    std::vector<VkDescriptorPool> descriptor_pools;
    std::vector<VkDescriptorSet> descriptorsets;

    // One recorded operation. `type` holds one of the TYPE_* values; presumably it
    // selects which member of the anonymous union is meaningful -- confirm at the
    // use sites. Note the "barrers" spelling is part of the identifiers used by
    // the rest of this file.
    struct record
    {
        enum
        {
            TYPE_copy_buffer,
            TYPE_copy_image,
            TYPE_copy_buffer_to_image,
            TYPE_copy_image_to_buffer,
            TYPE_bind_pipeline,
            TYPE_bind_descriptorsets,
            TYPE_push_constants,
            TYPE_dispatch,
            TYPE_memory_barrers,
            TYPE_buffer_barrers,
            TYPE_image_barrers,

#if NCNN_BENCHMARK
            TYPE_write_timestamp,
#endif // NCNN_BENCHMARK

            TYPE_post_download,
            TYPE_post_cast_float16_to_float32,
            TYPE_post_cast_bfloat16_to_float32,
        };

        int type;
        VkCommandBuffer command_buffer;

        // per-type payload; pointer members (regions, values, barriers) are stored
        // as raw pointers and are not owned by the record
        union
        {
            struct
            {
                VkBuffer src;
                VkBuffer dst;
                uint32_t region_count;
                const VkBufferCopy* regions;
            } copy_buffer;
            struct
            {
                VkImage src;
                VkImageLayout src_layout;
                VkImage dst;
                VkImageLayout dst_layout;
                uint32_t region_count;
                const VkImageCopy* regions;
            } copy_image;
            struct
            {
                VkBuffer src;
                VkImage dst;
                VkImageLayout layout;
                uint32_t region_count;
                const VkBufferImageCopy* regions;
            } copy_buffer_to_image;
            struct
            {
                VkImage src;
                VkImageLayout layout;
                VkBuffer dst;
                uint32_t region_count;
                const VkBufferImageCopy* regions;
            } copy_image_to_buffer;

            struct
            {
                VkPipelineBindPoint bind_point;
                VkPipeline pipeline;
            } bind_pipeline;
            struct
            {
                VkPipelineBindPoint bind_point;
                VkPipelineLayout pipeline_layout;
                uint32_t descriptorset_count;
                uint32_t descriptorset_offset;
            } bind_descriptorsets;
            struct
            {
                VkPipelineLayout pipeline_layout;
                VkShaderStageFlags stage_flags;
                uint32_t size;
                const void* values;
            } push_constants;

            struct
            {
                uint32_t group_count_x;
                uint32_t group_count_y;
                uint32_t group_count_z;
            } dispatch;

            struct
            {
                VkPipelineStageFlags src_stage;
                VkPipelineStageFlags dst_stage;
                uint32_t barrier_count;
                const VkMemoryBarrier* barriers;
            } memory_barrers;
            struct
            {
                VkPipelineStageFlags src_stage;
                VkPipelineStageFlags dst_stage;
                uint32_t barrier_count;
                const VkBufferMemoryBarrier* barriers;
            } buffer_barrers;
            struct
            {
                VkPipelineStageFlags src_stage;
                VkPipelineStageFlags dst_stage;
                uint32_t barrier_count;
                const VkImageMemoryBarrier* barriers;
            } image_barrers;

#if NCNN_BENCHMARK
            struct
            {
                uint32_t query;
            } write_timestamp;
#endif // NCNN_BENCHMARK

            // host-side post-processing steps: the fields are offsets into the
            // download_post_* vectors of the enclosing class
            struct
            {
                uint32_t download_post_buffer_mat_offset;
                uint32_t download_post_mat_fp16_offset;
            } post_download;
            struct
            {
                uint32_t download_post_mat_fp16_offset;
                uint32_t download_post_mat_offset;
                int num_threads;
            } post_cast_float16_to_float32;
            struct
            {
                uint32_t download_post_mat_bf16_offset;
                uint32_t download_post_mat_offset;
                int num_threads;
            } post_cast_bfloat16_to_float32;
        };
    };

    // records accumulated for later processing (consumers are outside this class body)
    std::vector<record> delayed_records;

    // running counter, zeroed in the constructor
    uint64_t pending_dispatch_total;

#if NCNN_BENCHMARK
    // timestamp query pool state, only present in benchmark builds
    uint32_t query_count;
    VkQueryPool query_pool;
#endif // NCNN_BENCHMARK
};

// Start with every Vulkan handle and counter cleared, then create the command
// pool, command buffer and fence through init().
VkComputePrivate::VkComputePrivate(const VulkanDevice* _vkdev)
    : vkdev(_vkdev),
      compute_command_pool(0),
      compute_command_buffer(0),
      compute_command_fence(0),
      pending_dispatch_total(0)
{
#if NCNN_BENCHMARK
    query_count = 0;
    query_pool = 0;
#endif // NCNN_BENCHMARK

    // the status returned by init() is not inspected here; failures are only logged
    init();
}

// Release everything this object holds: command-side references on stashed image
// blocks, the per-set descriptor pools of the non-push-descriptor path, the
// benchmark query pool, and finally the fence, command buffer and command pool.
VkComputePrivate::~VkComputePrivate()
{
    for (size_t i = 0; i < image_blocks_to_destroy.size(); i++)
    {
        VkImageMemory* ptr = image_blocks_to_destroy[i];

        // atomically drop our command reference and look at the value it had before
        int old_command_refcount = NCNN_XADD(&ptr->command_refcount, -1);
        if (ptr->refcount == 0 && old_command_refcount == 1)
        {
            // no userspace reference and we are the last command reference
            vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
            vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);

            delete ptr;
        }
        else
        {
            // reference exists in user code or other command
        }
    }
    image_blocks_to_destroy.clear();

    if (!vkdev->info.support_VK_KHR_push_descriptor())
    {
        // descriptorsets[i] was allocated from descriptor_pools[i]
        for (size_t i = 0; i < descriptorsets.size(); i++)
        {
            vkFreeDescriptorSets(vkdev->vkdevice(), descriptor_pools[i], 1, &descriptorsets[i]);
            vkDestroyDescriptorPool(vkdev->vkdevice(), descriptor_pools[i], 0);
        }
    }

#if NCNN_BENCHMARK
    if (query_pool)
    {
        // all submitted commands that refer to queryPool must have completed execution
        vkResetCommandBuffer(compute_command_buffer, 0);

        vkDestroyQueryPool(vkdev->vkdevice(), query_pool, 0);
    }
#endif // NCNN_BENCHMARK

    // destroying a null fence is permitted by the Vulkan spec
    vkDestroyFence(vkdev->vkdevice(), compute_command_fence, 0);

    // init() may have failed before the pool was created (the constructor does not
    // check its result); vkFreeCommandBuffers requires a valid pool handle, so
    // skip it when the pool is still null
    if (compute_command_pool)
    {
        vkFreeCommandBuffers(vkdev->vkdevice(), compute_command_pool, 1, &compute_command_buffer);
    }
    vkDestroyCommandPool(vkdev->vkdevice(), compute_command_pool, 0);
}

// Create the command pool, the primary command buffer and the fence.
// On devices with VK_KHR_push_descriptor the command buffer is additionally put
// into the recording state right away.
// Returns 0 on success, -1 as soon as any step fails (the failing call is logged).
int VkComputePrivate::init()
{
    // compute_command_pool
    {
        VkCommandPoolCreateInfo commandPoolCreateInfo;
        commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
        commandPoolCreateInfo.pNext = 0;
        // lets the command buffer allocated below be reset individually
        commandPoolCreateInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;
        commandPoolCreateInfo.queueFamilyIndex = vkdev->info.compute_queue_family_index();

        VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &compute_command_pool);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkCreateCommandPool failed %d", ret);
            return -1;
        }
    }

    // compute_command_buffer
    {
        VkCommandBufferAllocateInfo commandBufferAllocateInfo;
        commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
        commandBufferAllocateInfo.pNext = 0;
        commandBufferAllocateInfo.commandPool = compute_command_pool;
        commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
        commandBufferAllocateInfo.commandBufferCount = 1;

        VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &compute_command_buffer);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkAllocateCommandBuffers failed %d", ret);
            return -1;
        }
    }

    // compute_command_fence
    {
        VkFenceCreateInfo fenceCreateInfo;
        fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
        fenceCreateInfo.pNext = 0;
        fenceCreateInfo.flags = 0; // created unsignaled

        VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &compute_command_fence);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkCreateFence failed %d", ret);
            return -1;
        }
    }

    if (vkdev->info.support_VK_KHR_push_descriptor())
    {
        // propagate the failure instead of reporting success with a command
        // buffer that never entered the recording state
        if (begin_command_buffer() != 0)
            return -1;

#if NCNN_BENCHMARK
        if (query_pool)
            vkCmdResetQueryPool(compute_command_buffer, query_pool, 0, query_count);
#endif // NCNN_BENCHMARK
    }

    return 0;
}

int VkComputePrivate::begin_command_buffer()
{
    VkCommandBufferBeginInfo commandBufferBeginInfo;
    commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
    commandBufferBeginInfo.pNext = 0;
    commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
    commandBufferBeginInfo.pInheritanceInfo = 0;

    VkResult ret = vkBeginCommandBuffer(compute_command_buffer, &commandBufferBeginInfo);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkBeginCommandBuffer failed %d", ret);
        return -1;
    }

    return 0;
}

int VkComputePrivate::end_command_buffer()
{
    VkResult ret = vkEndCommandBuffer(compute_command_buffer);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkEndCommandBuffer failed %d", ret);
        return -1;
    }

    return 0;
}

// Keep the device pointer and allocate the private implementation object, whose
// constructor creates the Vulkan command pool, command buffer and fence.
VkCompute::VkCompute(const VulkanDevice* _vkdev)
    : vkdev(_vkdev), d(new VkComputePrivate(_vkdev))
{
}

// Destroy the private implementation object allocated in the constructor; its
// destructor releases the Vulkan objects.
VkCompute::~VkCompute()
{
    delete d;
}

void VkCompute::record_upload(const Mat& src, VkMat& dst, const Option& opt)
{
    // NCNN_LOGE("record_upload buffer");

    Mat src_fp16;
    if (src.elemsize == src.elempack * 4u)
    {
        // cpu cast to fp16 (discrete gpu)
        if (vkdev->info.type() == 0 && (opt.use_bf16_storage || opt.use_bf16_packed))
        {
            ncnn::cast_float32_to_bfloat16(src, src_fp16, opt);
        }
        else if (vkdev->info.type() == 0 && (opt.use_fp16_storage || opt.use_fp16_packed))
        {
            ncnn::cast_float32_to_float16(src, src_fp16, opt);
        }
        else
        {
            src_fp16 = src;
        }
    }
    else
    {
        src_fp16 = src;
    }

    // vkdev->convert_packing only handles elempack=1/4
    if (src_fp16.elempack > 4)
    {
        Mat src_fp16_pack4;
        ncnn::convert_packing(src_fp16, src_fp16_pack4, 4, opt);
        src_fp16 = src_fp16_pack4;
    }

    // upload
    VkMat dst_staging;
    dst_staging.create_like(src_fp16, opt.staging_vkallocator);
    if (dst_staging.empty())
        return;

    // stash staging
    d->upload_staging_buffers.push_back(dst_staging);

    //     NCNN_LOGE("upload_staging_buffer %p  ->   %p +%d ~%d", src_fp16.data, dst_staging.buffer(), dst_staging.buffer_offset(), dst_staging.buffer_capacity());

    // memcpy src to device
    memcpy(dst_staging.mapped_ptr(), src_fp16.data, src_fp16.total() * src_fp16.elemsize);
    dst_staging.allocator->flush(dst_staging.data);

    // mark device host-write @ null
    dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT;
    dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;

    // resolve dst_elempack
    int dims = src_fp16.dims;
    int elemcount = 0;
    if (dims == 1) elemcount = src_fp16.elempack * src_fp16.w;
    if (dims == 2) elemcount = src_fp16.elempack * src_fp16.h;
    if (dims == 3 || dims == 4) elemcount = src_fp16.elempack * src_fp16.c;

    int dst_elempack = elemcount % 4 == 0 ? 4 : 1;

    // gpu cast to fp16 on the fly (integrated gpu)
    int cast_type_to = 0;
    if (vkdev->info.type() != 0)
    {
        if (opt.use_bf16_storage || opt.use_bf16_packed)
            cast_type_to = 5;
        else if (opt.use_fp16_storage || opt.use_fp16_packed)
            cast_type_to = 2;
        else
            cast_type_to = 1;
    }
    vkdev->convert_packing(dst_staging, dst, dst_elempack, cast_type_to, *this, opt);
}

// Record the download of a device VkMat into a host Mat.
//
// src is converted by vkdev->convert_packing into a staging VkMat that lives in
// a mappable allocator (opt.blob_vkallocator when it is mappable, otherwise
// opt.staging_vkallocator). A barrier towards host-read is recorded for the
// staging buffer when its tracked state requires one, and delayed "post"
// records are queued for the staging-to-host copy and, for 2-byte-per-lane
// data on a type() == 0 device with bf16/fp16 options set, the cast to 4-byte
// elements.
//
// dst is only created/assigned here. NOTE(review): its contents are presumably
// produced when the queued post records are processed elsewhere - confirm.
//
// Returns silently, leaving dst untouched, when the staging buffer or the
// intermediate host Mat could not be created.
void VkCompute::record_download(const VkMat& src, Mat& dst, const Option& opt)
{
    // NCNN_LOGE("record_download buffer");

    // resolve dst_elempack
    int elemcount = 0;
    if (src.dims == 1) elemcount = src.elempack * src.w;
    if (src.dims == 2) elemcount = src.elempack * src.h;
    if (src.dims == 3 || src.dims == 4) elemcount = src.elempack * src.c;

    int dst_elempack = 1;
    if (opt.use_packing_layout)
        dst_elempack = elemcount % 4 == 0 ? 4 : 1;

    // the converted result must land in mappable memory so the host can read it
    Option opt_staging = opt;
    if (!opt_staging.blob_vkallocator->mappable)
    {
        opt_staging.blob_vkallocator = opt.staging_vkallocator;
    }

    // gpu cast to fp32 on the fly (integrated gpu)
    int cast_type_to = 0;
    if (vkdev->info.type() != 0)
    {
        cast_type_to = 1;
    }

    // 1-byte-per-lane sources always request cast type 4
    if (src.elemsize == src.elempack * 1u)
    {
        cast_type_to = 4;
    }

    VkMat dst_staging;
    vkdev->convert_packing(src, dst_staging, dst_elempack, cast_type_to, *this, opt_staging);

    // convert_packing may not have produced a staging buffer (e.g. allocation
    // failure); dst_staging.data would be null below, so stop here
    if (dst_staging.empty())
        return;

    // barrier device any @ compute to host-read @ compute
    if (dst_staging.data->access_flags & VK_ACCESS_HOST_WRITE_BIT || dst_staging.data->stage_flags != VK_PIPELINE_STAGE_HOST_BIT)
    {
        VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = dst_staging.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_HOST_READ_BIT;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].buffer = dst_staging.buffer();
        barriers[0].offset = dst_staging.buffer_offset();
        barriers[0].size = dst_staging.buffer_capacity();

        VkPipelineStageFlags src_stage = dst_staging.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // recorded immediately, the barrier array is no longer needed
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
            delete[] barriers;
        }
        else
        {
            // queued for later; the record keeps the barrier array pointer
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_buffer_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.buffer_barrers.src_stage = src_stage;
            r.buffer_barrers.dst_stage = dst_stage;
            r.buffer_barrers.barrier_count = 1;
            r.buffer_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark device host-read @ any
        dst_staging.data->access_flags = VK_ACCESS_HOST_READ_BIT;
        dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;
    }

    // create dst
    Mat dst_fp16;
    dst_fp16.create_like(dst_staging, opt.blob_allocator);
    if (dst_fp16.empty())
        return;

    // download
    d->download_post_buffers.push_back(dst_staging);
    d->download_post_mats_fp16.push_back(dst_fp16);

    // post memcpy device to dst
    {
        VkComputePrivate::record r;
        r.type = VkComputePrivate::record::TYPE_post_download;
        r.command_buffer = 0;
        r.post_download.download_post_buffer_mat_offset = d->download_post_buffers.size() - 1;
        r.post_download.download_post_mat_fp16_offset = d->download_post_mats_fp16.size() - 1;
        d->delayed_records.push_back(r);
    }

    // cast to fp32 (discrete gpu)
    if (dst_fp16.elemsize == dst_fp16.elempack * 2u)
    {
        if (vkdev->info.type() == 0 && (opt.use_bf16_storage || opt.use_bf16_packed))
        {
            // same shape and packing as dst_fp16, 4 bytes per lane
            if (dst_fp16.dims == 1)
                dst.create(dst_fp16.w, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator);
            if (dst_fp16.dims == 2)
                dst.create(dst_fp16.w, dst_fp16.h, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator);
            if (dst_fp16.dims == 3)
                dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator);
            if (dst_fp16.dims == 4)
                dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.d, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator);

            d->download_post_mats.push_back(dst);

            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_post_cast_bfloat16_to_float32;
            r.command_buffer = 0;
            r.post_cast_bfloat16_to_float32.download_post_mat_bf16_offset = d->download_post_mats_fp16.size() - 1;
            r.post_cast_bfloat16_to_float32.download_post_mat_offset = d->download_post_mats.size() - 1;
            r.post_cast_bfloat16_to_float32.num_threads = opt.num_threads;
            d->delayed_records.push_back(r);
        }
        else if (vkdev->info.type() == 0 && (opt.use_fp16_storage || opt.use_fp16_packed))
        {
            // same shape and packing as dst_fp16, 4 bytes per lane
            if (dst_fp16.dims == 1)
                dst.create(dst_fp16.w, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator);
            if (dst_fp16.dims == 2)
                dst.create(dst_fp16.w, dst_fp16.h, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator);
            if (dst_fp16.dims == 3)
                dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator);
            if (dst_fp16.dims == 4)
                dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.d, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator);

            d->download_post_mats.push_back(dst);

            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_post_cast_float16_to_float32;
            r.command_buffer = 0;
            r.post_cast_float16_to_float32.download_post_mat_fp16_offset = d->download_post_mats_fp16.size() - 1;
            r.post_cast_float16_to_float32.download_post_mat_offset = d->download_post_mats.size() - 1;
            r.post_cast_float16_to_float32.num_threads = opt.num_threads;
            d->delayed_records.push_back(r);
        }
        else
        {
            dst = dst_fp16;
        }
    }
    else
    {
        dst = dst_fp16;
    }
}

void VkCompute::record_clone(const Mat& src, VkMat& dst, const Option& opt)
{
    //     NCNN_LOGE("record_clone host to buffer");

    // host to staging
    VkMat dst_staging;
    dst_staging.create_like(src, opt.staging_vkallocator);
    if (dst_staging.empty())
        return;

    // memcpy src to device
    memcpy(dst_staging.mapped_ptr(), src.data, src.total() * src.elemsize);
    dst_staging.allocator->flush(dst_staging.data);

    // mark device host-write @ null
    dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT;
    dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;

    // staging to device
    record_clone(dst_staging, dst, opt);

    // stash staging
    d->upload_staging_buffers.push_back(dst_staging);
}

void VkCompute::record_clone(const Mat& src, VkImageMat& dst, const Option& opt)
{
    //     NCNN_LOGE("record_clone host to image");

    // host to staging
    VkMat dst_staging;
    Option opt_staging = opt;
    opt_staging.blob_vkallocator = opt.staging_vkallocator;
    record_clone(src, dst_staging, opt_staging);

    // staging to image
    record_clone(dst_staging, dst, opt);

    // stash staging
    d->upload_staging_buffers.push_back(dst_staging);
}

void VkCompute::record_clone(const VkMat& src, Mat& dst, const Option& opt)
{
    //     NCNN_LOGE("record_clone buffer to host");

    if (!src.allocator->mappable)
    {
        // device to staging
        VkMat src_staging;
        Option opt_staging = opt;
        opt_staging.blob_vkallocator = opt.staging_vkallocator;
        record_clone(src, src_staging, opt_staging);

        // staging to host
        record_clone(src_staging, dst, opt);

        return;
    }

    // create dst
    dst.create_like(src, opt.blob_allocator);
    if (dst.empty())
        return;

    // barrier device any @ compute to host-read @ compute
    if (src.data->access_flags & VK_ACCESS_HOST_WRITE_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_HOST_BIT)
    {
        VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = src.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_HOST_READ_BIT;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].buffer = src.buffer();
        barriers[0].offset = src.buffer_offset();
        barriers[0].size = src.buffer_capacity();

        VkPipelineStageFlags src_stage = src.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
            delete[] barriers;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_buffer_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.buffer_barrers.src_stage = src_stage;
            r.buffer_barrers.dst_stage = dst_stage;
            r.buffer_barrers.barrier_count = 1;
            r.buffer_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark device host-read @ any
        src.data->access_flags = VK_ACCESS_HOST_READ_BIT;
        src.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;
    }

    // stash download post buffer and mat
    d->download_post_buffers.push_back(src);
    d->download_post_mats_fp16.push_back(dst);

    // post memcpy device to dst
    {
        VkComputePrivate::record r;
        r.type = VkComputePrivate::record::TYPE_post_download;
        r.command_buffer = 0;
        r.post_download.download_post_buffer_mat_offset = d->download_post_buffers.size() - 1;
        r.post_download.download_post_mat_fp16_offset = d->download_post_mats_fp16.size() - 1;
        d->delayed_records.push_back(r);
    }
}

// Record a clone of a device image into a host Mat.
// The image is first cloned into a buffer from opt.staging_vkallocator (via
// the image-to-buffer overload), and that buffer is then cloned to the host
// with the buffer-to-host overload.
// Returns silently, leaving dst untouched, when the staging buffer could not
// be produced.
void VkCompute::record_clone(const VkImageMat& src, Mat& dst, const Option& opt)
{
    //     NCNN_LOGE("record_clone image to host");

    // image to staging
    VkMat src_staging;
    Option opt_staging = opt;
    opt_staging.blob_vkallocator = opt.staging_vkallocator;
    record_clone(src, src_staging, opt_staging);

    // the image-to-buffer clone returns early when it cannot create its
    // destination; do not forward an empty staging buffer, whose
    // allocator/data would be dereferenced by the buffer-to-host clone
    if (src_staging.empty())
        return;

    // staging to host
    record_clone(src_staging, dst, opt);
}

// Record a buffer-to-buffer clone of src into dst.
//
// dst is created like src from opt.blob_vkallocator; nothing is recorded when
// that fails. A buffer barrier towards transfer-read is recorded for src when
// its tracked access/stage state requires one, then a single-region buffer
// copy is recorded.
//
// When VK_KHR_push_descriptor is supported the commands are written directly
// into d->compute_command_buffer and the temporary arrays are freed here.
// Otherwise they are queued in d->delayed_records and the record keeps the
// array pointer (NOTE(review): presumably freed when the record is replayed -
// confirm).
void VkCompute::record_clone(const VkMat& src, VkMat& dst, const Option& opt)
{
    //     NCNN_LOGE("record_clone buffer to buffer");

    // create dst
    dst.create_like(src, opt.blob_vkallocator);
    if (dst.empty())
        return;

    // a barrier is needed unless src is already at the transfer stage without a pending transfer write
    if (src.data->access_flags & VK_ACCESS_TRANSFER_WRITE_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT)
    {
        // barrier device any @ compute to transfer-read @ compute
        VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = src.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].buffer = src.buffer();
        barriers[0].offset = src.buffer_offset();
        barriers[0].size = src.buffer_capacity();

        VkPipelineStageFlags src_stage = src.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // recorded immediately, the barrier array is no longer needed
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
            delete[] barriers;
        }
        else
        {
            // queued for later; the record keeps the barrier array pointer
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_buffer_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.buffer_barrers.src_stage = src_stage;
            r.buffer_barrers.dst_stage = dst_stage;
            r.buffer_barrers.barrier_count = 1;
            r.buffer_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark device transfer-read @ transfer
        src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT;
        src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    {
        // barrier device any @ null to transfer-write @ compute
        // (no barrier command is recorded for dst here; only its tracked state is updated)

        // mark device transfer-write @ transfer
        dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT;
        dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    // record the buffer copy src -> dst
    {
        VkBufferCopy* regions = new VkBufferCopy[1];
        regions[0].srcOffset = src.buffer_offset();
        regions[0].dstOffset = dst.buffer_offset();
        // copy no more than the smaller of the two buffer capacities
        regions[0].size = std::min(src.buffer_capacity(), dst.buffer_capacity());

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // recorded immediately, the region array is no longer needed
            vkCmdCopyBuffer(d->compute_command_buffer, src.buffer(), dst.buffer(), 1, regions);
            delete[] regions;
        }
        else
        {
            // queued for later; the record keeps the region array pointer
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_copy_buffer;
            r.command_buffer = d->compute_command_buffer;
            r.copy_buffer.src = src.buffer();
            r.copy_buffer.dst = dst.buffer();
            r.copy_buffer.region_count = 1;
            r.copy_buffer.regions = regions;
            d->delayed_records.push_back(r);
        }
    }
}

// Record an image-to-image clone of src into dst.
//
// dst is created like src from opt.blob_vkallocator; nothing is recorded when
// that fails. src is transitioned to TRANSFER_SRC_OPTIMAL when its tracked
// state requires it, dst is always transitioned from UNDEFINED to
// TRANSFER_DST_OPTIMAL, and a single-region image copy covering
// src.data->width/height/depth is recorded.
//
// When VK_KHR_push_descriptor is supported the commands are written directly
// into d->compute_command_buffer and the temporary arrays are freed here.
// Otherwise they are queued in d->delayed_records and the record keeps the
// array pointer (NOTE(review): presumably freed when the record is replayed -
// confirm).
//
// Both images get their command_refcount incremented and are appended to
// d->image_blocks_to_destroy.
void VkCompute::record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt)
{
    //     NCNN_LOGE("record_clone image to image");

    // create dst
    dst.create_like(src, opt.blob_vkallocator);
    if (dst.empty())
        return;

    // image layout transform any @ any to transfer-src-optimal @ compute
    // skipped only when src is already transfer-src-optimal at the transfer stage without a pending transfer write
    if (src.data->access_flags & VK_ACCESS_TRANSFER_WRITE_BIT || src.data->image_layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT)
    {
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = src.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
        barriers[0].oldLayout = src.data->image_layout;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = src.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = src.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // recorded immediately, the barrier array is no longer needed
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            delete[] barriers;
        }
        else
        {
            // queued for later; the record keeps the barrier array pointer
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark image transfer-src-optimal @ compute
        src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT;
        src.data->image_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
        src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    // image layout transform undefined @ null to transfer-dst-optimal @ compute
    // (unconditional: dst was created above, so it is transitioned from the undefined layout)
    {
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = 0;
        barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
        barriers[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = dst.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // recorded immediately, the barrier array is no longer needed
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            delete[] barriers;
        }
        else
        {
            // queued for later; the record keeps the barrier array pointer
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark image transfer-dst-optimal @ compute
        dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT;
        dst.data->image_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
        dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    // record the image copy src -> dst (one region, whole extent of src)
    {
        VkImageCopy* regions = new VkImageCopy[1];
        regions[0].srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        regions[0].srcSubresource.mipLevel = 0;
        regions[0].srcSubresource.baseArrayLayer = 0;
        regions[0].srcSubresource.layerCount = 1;
        regions[0].srcOffset.x = 0;
        regions[0].srcOffset.y = 0;
        regions[0].srcOffset.z = 0;
        regions[0].dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        regions[0].dstSubresource.mipLevel = 0;
        regions[0].dstSubresource.baseArrayLayer = 0;
        regions[0].dstSubresource.layerCount = 1;
        regions[0].dstOffset.x = 0;
        regions[0].dstOffset.y = 0;
        regions[0].dstOffset.z = 0;
        regions[0].extent.width = src.data->width;
        regions[0].extent.height = src.data->height;
        regions[0].extent.depth = src.data->depth;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // recorded immediately, the region array is no longer needed
            vkCmdCopyImage(d->compute_command_buffer, src.image(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst.image(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, 1, regions);
            delete[] regions;
        }
        else
        {
            // queued for later; the record keeps the region array pointer
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_copy_image;
            r.command_buffer = d->compute_command_buffer;
            r.copy_image.src = src.image();
            r.copy_image.src_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
            r.copy_image.dst = dst.image();
            r.copy_image.dst_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
            r.copy_image.region_count = 1;
            r.copy_image.regions = regions;
            d->delayed_records.push_back(r);
        }
    }

    // image and imageview can not be destroyed until command execution ends
    // so take a command reference on both and remember them for later release
    NCNN_XADD(&src.data->command_refcount, 1);
    NCNN_XADD(&dst.data->command_refcount, 1);
    d->image_blocks_to_destroy.push_back(src.data);
    d->image_blocks_to_destroy.push_back(dst.data);
}

// Record a buffer-to-image clone of src into dst.
//
// dst is created like src from opt.blob_vkallocator; nothing is recorded when
// that fails. A buffer barrier towards transfer-read is recorded for src when
// its tracked state requires one, dst is always transitioned from UNDEFINED to
// TRANSFER_DST_OPTIMAL, and a buffer-to-image copy is recorded using either a
// single region or one region per channel (see below).
//
// When VK_KHR_push_descriptor is supported the commands are written directly
// into d->compute_command_buffer and the temporary arrays are freed here.
// Otherwise they are queued in d->delayed_records and the record keeps the
// array pointer (NOTE(review): presumably freed when the record is replayed -
// confirm).
//
// dst gets its command_refcount incremented and is appended to
// d->image_blocks_to_destroy.
void VkCompute::record_clone(const VkMat& src, VkImageMat& dst, const Option& opt)
{
    //     NCNN_LOGE("record_clone buffer to image");

    // create dst
    dst.create_like(src, opt.blob_vkallocator);
    if (dst.empty())
        return;

    // barrier device any @ any to transfer-read @ compute
    // skipped only when src is at the compute shader stage without shader-write access
    // NOTE(review): the buffer-to-buffer overload tests TRANSFER_WRITE / TRANSFER stage instead - confirm the difference is intentional
    if (src.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || src.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
    {
        VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = src.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].buffer = src.buffer();
        barriers[0].offset = src.buffer_offset();
        barriers[0].size = src.buffer_capacity();

        VkPipelineStageFlags src_stage = src.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // recorded immediately, the barrier array is no longer needed
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
            delete[] barriers;
        }
        else
        {
            // queued for later; the record keeps the barrier array pointer
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_buffer_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.buffer_barrers.src_stage = src_stage;
            r.buffer_barrers.dst_stage = dst_stage;
            r.buffer_barrers.barrier_count = 1;
            r.buffer_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark device transfer-read @ compute
        src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT;
        src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    // image layout transform undefined @ null to transfer-dst-optimal @ compute
    // (unconditional: dst was created above, so it is transitioned from the undefined layout)
    {
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = 0;
        barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
        barriers[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = dst.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // recorded immediately, the barrier array is no longer needed
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            delete[] barriers;
        }
        else
        {
            // queued for later; the record keeps the barrier array pointer
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark image transfer-dst-optimal @ compute
        dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT;
        dst.data->image_layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
        dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    // record device to image
    {
        int region_count;
        VkBufferImageCopy* regions;
        // one w*h plane is a multiple of 16 bytes: copy the whole image with a single region
        // NOTE(review): presumably because cstep pads each channel plane to a 16-byte multiple,
        // so planes are only contiguous in the buffer in this case - confirm against the cstep alignment
        if (dst.elemsize * dst.w * dst.h % 16 == 0)
        {
            region_count = 1;
            regions = new VkBufferImageCopy[1];
            regions[0].bufferOffset = src.buffer_offset();
            regions[0].bufferRowLength = 0;
            regions[0].bufferImageHeight = 0;
            regions[0].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
            regions[0].imageSubresource.mipLevel = 0;
            regions[0].imageSubresource.baseArrayLayer = 0;
            regions[0].imageSubresource.layerCount = 1;
            regions[0].imageOffset.x = 0;
            regions[0].imageOffset.y = 0;
            regions[0].imageOffset.z = 0;
            regions[0].imageExtent.width = dst.data->width;
            regions[0].imageExtent.height = dst.data->height;
            regions[0].imageExtent.depth = dst.data->depth;
        }
        else
        {
            // otherwise copy one depth slice per channel, stepping through the buffer by src.cstep elements
            region_count = dst.c;
            regions = new VkBufferImageCopy[region_count];
            for (int i = 0; i < region_count; i++)
            {
                regions[i].bufferOffset = src.buffer_offset() + src.cstep * src.elemsize * i;
                regions[i].bufferRowLength = 0;
                regions[i].bufferImageHeight = 0;
                regions[i].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
                regions[i].imageSubresource.mipLevel = 0;
                regions[i].imageSubresource.baseArrayLayer = 0;
                regions[i].imageSubresource.layerCount = 1;
                regions[i].imageOffset.x = 0;
                regions[i].imageOffset.y = 0;
                regions[i].imageOffset.z = i;
                regions[i].imageExtent.width = dst.data->width;
                regions[i].imageExtent.height = dst.data->height;
                regions[i].imageExtent.depth = 1;
            }
        }

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            // recorded immediately, the region array is no longer needed
            vkCmdCopyBufferToImage(d->compute_command_buffer, src.buffer(), dst.image(), VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, region_count, regions);
            delete[] regions;
        }
        else
        {
            // queued for later; the record keeps the region array pointer
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_copy_buffer_to_image;
            r.command_buffer = d->compute_command_buffer;
            r.copy_buffer_to_image.src = src.buffer();
            r.copy_buffer_to_image.dst = dst.image();
            r.copy_buffer_to_image.layout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
            r.copy_buffer_to_image.region_count = region_count;
            r.copy_buffer_to_image.regions = regions;
            d->delayed_records.push_back(r);
        }
    }

    // image and imageview can not be destroyed until command execution ends
    // so take a command reference on dst and remember it for later release
    NCNN_XADD(&dst.data->command_refcount, 1);
    d->image_blocks_to_destroy.push_back(dst.data);
}

void VkCompute::record_clone(const VkImageMat& src, VkMat& dst, const Option& opt)
{
    //     NCNN_LOGE("record_clone image to buffer");

    // create dst
    dst.create_like(src, opt.blob_vkallocator);
    if (dst.empty())
        return;

    // image layout transform any @ any to transfer-src-optimal @ compute
    if (src.data->access_flags & VK_ACCESS_TRANSFER_WRITE_BIT || src.data->image_layout != VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL || src.data->stage_flags != VK_PIPELINE_STAGE_TRANSFER_BIT)
    {
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = src.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
        barriers[0].oldLayout = src.data->image_layout;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = src.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = src.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            delete[] barriers;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark image transfer-src-optimal @ compute
        src.data->access_flags = VK_ACCESS_TRANSFER_READ_BIT;
        src.data->image_layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
        src.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    {
        // barrier device any @ null to transfer-write @ compute

        // mark device transfer-write @ transfer
        dst.data->access_flags = VK_ACCESS_TRANSFER_WRITE_BIT;
        dst.data->stage_flags = VK_PIPELINE_STAGE_TRANSFER_BIT;
    }

    // record image to device
    {
        int region_count;
        VkBufferImageCopy* regions;
        if (src.elemsize * src.w * src.h % 16 == 0)
        {
            region_count = 1;
            regions = new VkBufferImageCopy[1];
            regions[0].bufferOffset = dst.buffer_offset();
            regions[0].bufferRowLength = 0;
            regions[0].bufferImageHeight = 0;
            regions[0].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
            regions[0].imageSubresource.mipLevel = 0;
            regions[0].imageSubresource.baseArrayLayer = 0;
            regions[0].imageSubresource.layerCount = 1;
            regions[0].imageOffset.x = 0;
            regions[0].imageOffset.y = 0;
            regions[0].imageOffset.z = 0;
            regions[0].imageExtent.width = src.data->width;
            regions[0].imageExtent.height = src.data->height;
            regions[0].imageExtent.depth = src.data->depth;
        }
        else
        {
            region_count = src.c;
            regions = new VkBufferImageCopy[region_count];
            for (int i = 0; i < region_count; i++)
            {
                regions[i].bufferOffset = dst.buffer_offset() + dst.cstep * dst.elemsize * i;
                regions[i].bufferRowLength = 0;
                regions[i].bufferImageHeight = 0;
                regions[i].imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
                regions[i].imageSubresource.mipLevel = 0;
                regions[i].imageSubresource.baseArrayLayer = 0;
                regions[i].imageSubresource.layerCount = 1;
                regions[i].imageOffset.x = 0;
                regions[i].imageOffset.y = 0;
                regions[i].imageOffset.z = i;
                regions[i].imageExtent.width = src.data->width;
                regions[i].imageExtent.height = src.data->height;
                regions[i].imageExtent.depth = 1;
            }
        }

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdCopyImageToBuffer(d->compute_command_buffer, src.image(), VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst.buffer(), region_count, regions);
            delete[] regions;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_copy_image_to_buffer;
            r.command_buffer = d->compute_command_buffer;
            r.copy_image_to_buffer.src = src.image();
            r.copy_image_to_buffer.layout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
            r.copy_image_to_buffer.dst = dst.buffer();
            r.copy_image_to_buffer.region_count = region_count;
            r.copy_image_to_buffer.regions = regions;
            d->delayed_records.push_back(r);
        }
    }

    // image and imageview can not be destroyed until command execution ends
    NCNN_XADD(&src.data->command_refcount, 1);
    d->image_blocks_to_destroy.push_back(src.data);
}

void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher)
{
    record_pipeline(pipeline, bindings, std::vector<VkImageMat>(), constants, dispatcher);
}

void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkImageMat>& bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher)
{
    record_pipeline(pipeline, std::vector<VkMat>(), bindings, constants, dispatcher);
}

void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher)
{
    Mat dispatcher_mat(dispatcher.w, dispatcher.h, dispatcher.d, dispatcher.c, (void*)0);

    record_pipeline(pipeline, buffer_bindings, image_bindings, constants, dispatcher_mat);
}

void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher)
{
    Mat dispatcher_mat(dispatcher.w, dispatcher.h, dispatcher.d, dispatcher.c, (void*)0);

    record_pipeline(pipeline, buffer_bindings, image_bindings, constants, dispatcher_mat);
}

// Record one compute dispatch of `pipeline`.
//
// pipeline         compute pipeline; its shader_info() supplies the per-binding
//                  type table (1 = storage buffer, 2 = storage image,
//                  otherwise combined image sampler)
// buffer_bindings  buffers consumed, in order, by the type-1 bindings
// image_bindings   images consumed, in order, by the image / sampler bindings
// constants        push constant values, uploaded when non-empty
// dispatcher       only w / h / d / c are read, to derive the work group counts
//
// Empty bindings are replaced by the device's dummy buffer / image. A mismatch
// between the supplied binding or constant count and shader_info() is logged
// and recording continues. With VK_KHR_push_descriptor everything is written
// straight into the compute command buffer; otherwise a descriptor pool and set
// are created now and the commands are appended to the delayed record list.
// If descriptor pool creation or descriptor set allocation fails, the error is
// logged and the function returns without recording the dispatch.
void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher)
{
    //     NCNN_LOGE("record_pipeline %p", pipeline);

    const int buffer_binding_count = (int)buffer_bindings.size();
    const int image_binding_count = (int)image_bindings.size();
    const int constant_count = (int)constants.size();

    const int binding_count = buffer_binding_count + image_binding_count;
    const ShaderInfo& shader_info = pipeline->shader_info();

    if (binding_count != shader_info.binding_count)
    {
        NCNN_LOGE("binding_count not match, expect %d but got %d + %d", shader_info.binding_count, buffer_binding_count, image_binding_count);
    }

    if (constant_count != shader_info.push_constant_count)
    {
        NCNN_LOGE("push_constant_count not match, expect %d but got %d", shader_info.push_constant_count, constant_count);
    }

    // insert the barriers / layout transitions every binding needs before the dispatch
    int buffer_index = 0;
    int image_index = 0;
    for (int i = 0; i < binding_count; i++)
    {
        int binding_type = shader_info.binding_types[i];

        if (binding_type == 1)
        {
            const VkMat& binding = buffer_bindings[buffer_index].empty() ? vkdev->get_dummy_buffer() : buffer_bindings[buffer_index];
            buffer_index++;

            //             NCNN_LOGE("binding #%d buffer = %d %d %d %d @ %lu %d = %p +%ld ~%ld", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.buffer(), binding.buffer_offset(), binding.buffer_capacity());

            barrier_readwrite(binding);
        }
        else if (binding_type == 2)
        {
            const VkImageMat& binding = image_bindings[image_index].empty() ? vkdev->get_dummy_image() : image_bindings[image_index];
            image_index++;

            //             NCNN_LOGE("binding #%d image = %d %d %d %d @ %lu %d = %p +%ld ~%ld %p", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.image(), binding.data->bind_offset, binding.data->bind_capacity, binding.imageview());

            barrier_readwrite(binding);

            // image and imageview can not be destroyed until command execution ends
            NCNN_XADD(&binding.data->command_refcount, 1);
            d->image_blocks_to_destroy.push_back(binding.data);
        }
        else // if (binding_type == 3)
        {
            const VkImageMat& binding = image_bindings[image_index].empty() ? vkdev->get_dummy_image_readonly() : image_bindings[image_index];
            image_index++;

            //             NCNN_LOGE("binding #%d sampler = %d %d %d %d @ %lu %d = %p +%ld ~%ld %p", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.image(), binding.data->bind_offset, binding.data->bind_capacity, binding.imageview());

            // if the same image used for both storage image and combined image sampler
            // only apply image layout transition to general
            //
            // binding_types is indexed by the overall binding number while
            // image_bindings only holds the image / sampler bindings, so walk every
            // binding and advance a separate image slot for each non-buffer one
            bool image_read_write = false;
            int image_slot = 0;
            for (int j = 0; j < binding_count && image_slot < image_binding_count; j++)
            {
                const int other_type = shader_info.binding_types[j];
                if (other_type == 1)
                    continue; // buffers do not occupy an image_bindings slot

                if (other_type == 2 && binding.data == image_bindings[image_slot].data)
                {
                    // the same image is used as storage image, skip it
                    image_read_write = true;
                    break;
                }

                image_slot++;
            }
            if (image_read_write)
                continue;

            barrier_readonly(binding);

            // image and imageview can not be destroyed until command execution ends
            NCNN_XADD(&binding.data->command_refcount, 1);
            d->image_blocks_to_destroy.push_back(binding.data);
        }
    }

    // record bind pipeline
    {
        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdBindPipeline(d->compute_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline());
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_bind_pipeline;
            r.command_buffer = d->compute_command_buffer;
            r.bind_pipeline.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE;
            r.bind_pipeline.pipeline = pipeline->pipeline();
            d->delayed_records.push_back(r);
        }
    }

    // record update bindings
    if (binding_count > 0)
    {
        // descriptor infos packed back to back in binding order, which is the
        // layout consumed by the descriptor update template and by the manual
        // VkWriteDescriptorSet fallback below
        std::vector<unsigned char> descriptorInfos;
        {
            descriptorInfos.resize(sizeof(VkDescriptorBufferInfo) * buffer_binding_count + sizeof(VkDescriptorImageInfo) * image_binding_count);

            unsigned char* p_descriptorInfos = descriptorInfos.data();
            int descriptorBufferInfo_index = 0;
            int descriptorImageInfo_index = 0;
            for (int i = 0; i < binding_count; i++)
            {
                int binding_type = shader_info.binding_types[i];

                if (binding_type == 1)
                {
                    const VkMat& binding = buffer_bindings[descriptorBufferInfo_index].empty() ? vkdev->get_dummy_buffer() : buffer_bindings[descriptorBufferInfo_index];
                    descriptorBufferInfo_index++;

                    VkDescriptorBufferInfo descriptorBufferInfo;
                    descriptorBufferInfo.buffer = binding.buffer();
                    descriptorBufferInfo.offset = binding.buffer_offset();
                    descriptorBufferInfo.range = binding.total() * binding.elemsize;

                    memcpy(p_descriptorInfos, &descriptorBufferInfo, sizeof(VkDescriptorBufferInfo));
                    p_descriptorInfos += sizeof(VkDescriptorBufferInfo);
                }
                else //if (binding_type == 2 || binding_type == 3)
                {
                    // an empty binding must resolve to the same placeholder image that
                    // received the barrier above: the storage dummy for storage images,
                    // the read-only dummy for samplers
                    const VkImageMat& binding = image_bindings[descriptorImageInfo_index].empty()
                                                ? (binding_type == 2 ? vkdev->get_dummy_image() : vkdev->get_dummy_image_readonly())
                                                : image_bindings[descriptorImageInfo_index];
                    descriptorImageInfo_index++;

                    // we always use immutable nearest sampler set in descriptor layout during pipeline creation
                    VkDescriptorImageInfo descriptorImageInfo;
                    descriptorImageInfo.sampler = 0;
                    descriptorImageInfo.imageView = binding.imageview();
                    descriptorImageInfo.imageLayout = binding.data->image_layout;

                    memcpy(p_descriptorInfos, &descriptorImageInfo, sizeof(VkDescriptorImageInfo));
                    p_descriptorInfos += sizeof(VkDescriptorImageInfo);
                }
            }
        }

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkdev->vkCmdPushDescriptorSetWithTemplateKHR(d->compute_command_buffer, pipeline->descriptor_update_template(), pipeline->pipeline_layout(), 0, descriptorInfos.data());
        }
        else
        {
            // create new descriptor_pool and descriptorset
            VkDescriptorPool descriptor_pool;
            {
                // count image descriptors per type; storage buffers are sized from
                // buffer_binding_count and must not be counted as samplers
                int storage_image_count = 0;
                int sampler_count = 0;
                for (int i = 0; i < binding_count; i++)
                {
                    int binding_type = shader_info.binding_types[i];

                    if (binding_type == 2)
                        storage_image_count++;
                    else if (binding_type != 1) // binding_type == 3
                        sampler_count++;
                }

                // VkDescriptorPoolSize::descriptorCount must be greater than zero,
                // so only descriptor types that are actually used are listed
                VkDescriptorPoolSize poolSizes[3];
                uint32_t poolSizeCount = 0;
                if (buffer_binding_count > 0)
                {
                    poolSizes[poolSizeCount].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                    poolSizes[poolSizeCount].descriptorCount = buffer_binding_count;
                    poolSizeCount++;
                }
                if (storage_image_count > 0)
                {
                    poolSizes[poolSizeCount].type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
                    poolSizes[poolSizeCount].descriptorCount = storage_image_count;
                    poolSizeCount++;
                }
                if (sampler_count > 0)
                {
                    poolSizes[poolSizeCount].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
                    poolSizes[poolSizeCount].descriptorCount = sampler_count;
                    poolSizeCount++;
                }

                VkDescriptorPoolCreateInfo descriptorPoolCreateInfo;
                descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
                descriptorPoolCreateInfo.pNext = 0;
                descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT;
                descriptorPoolCreateInfo.maxSets = 1;
                descriptorPoolCreateInfo.poolSizeCount = poolSizeCount;
                descriptorPoolCreateInfo.pPoolSizes = poolSizes;

                VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool);
                if (ret != VK_SUCCESS)
                {
                    NCNN_LOGE("vkCreateDescriptorPool failed %d", ret);
                    return;
                }
            }
            d->descriptor_pools.push_back(descriptor_pool);

            VkDescriptorSet descriptorset;
            {
                VkDescriptorSetLayout descriptorset_layout = pipeline->descriptorset_layout();

                VkDescriptorSetAllocateInfo descriptorSetAllocateInfo;
                descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
                descriptorSetAllocateInfo.pNext = 0;
                descriptorSetAllocateInfo.descriptorPool = descriptor_pool;
                descriptorSetAllocateInfo.descriptorSetCount = 1;
                descriptorSetAllocateInfo.pSetLayouts = &descriptorset_layout;

                VkResult ret = vkAllocateDescriptorSets(vkdev->vkdevice(), &descriptorSetAllocateInfo, &descriptorset);
                if (ret != VK_SUCCESS)
                {
                    NCNN_LOGE("vkAllocateDescriptorSets failed %d", ret);
                    return;
                }
            }
            d->descriptorsets.push_back(descriptorset);

            if (vkdev->info.support_VK_KHR_descriptor_update_template())
            {
                vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template(), descriptorInfos.data());
            }
            else
            {
                // no update template: describe each binding explicitly, pointing into
                // the packed descriptorInfos table built above
                std::vector<VkWriteDescriptorSet> writeDescriptorSets(binding_count);
                {
                    const unsigned char* p_descriptorInfos = descriptorInfos.data();
                    for (int i = 0; i < binding_count; i++)
                    {
                        int binding_type = shader_info.binding_types[i];

                        writeDescriptorSets[i].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                        writeDescriptorSets[i].pNext = 0;
                        writeDescriptorSets[i].dstSet = descriptorset;
                        writeDescriptorSets[i].dstBinding = i;
                        writeDescriptorSets[i].dstArrayElement = 0;
                        writeDescriptorSets[i].descriptorCount = 1;
                        writeDescriptorSets[i].pTexelBufferView = 0;

                        if (binding_type == 1)
                        {
                            writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                            writeDescriptorSets[i].pImageInfo = 0;
                            writeDescriptorSets[i].pBufferInfo = (const VkDescriptorBufferInfo*)p_descriptorInfos;

                            p_descriptorInfos += sizeof(VkDescriptorBufferInfo);
                        }
                        else if (binding_type == 2)
                        {
                            writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
                            writeDescriptorSets[i].pImageInfo = (const VkDescriptorImageInfo*)p_descriptorInfos;
                            writeDescriptorSets[i].pBufferInfo = 0;

                            p_descriptorInfos += sizeof(VkDescriptorImageInfo);
                        }
                        else // if (binding_type == 3)
                        {
                            writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
                            writeDescriptorSets[i].pImageInfo = (const VkDescriptorImageInfo*)p_descriptorInfos;
                            writeDescriptorSets[i].pBufferInfo = 0;

                            p_descriptorInfos += sizeof(VkDescriptorImageInfo);
                        }
                    }
                }

                vkUpdateDescriptorSets(vkdev->vkdevice(), binding_count, writeDescriptorSets.data(), 0, 0);
            }

            // the delayed record refers to the set by its index in d->descriptorsets
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_bind_descriptorsets;
            r.command_buffer = d->compute_command_buffer;
            r.bind_descriptorsets.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE;
            r.bind_descriptorsets.pipeline_layout = pipeline->pipeline_layout();
            r.bind_descriptorsets.descriptorset_count = 1;
            r.bind_descriptorsets.descriptorset_offset = d->descriptorsets.size() - 1;
            d->delayed_records.push_back(r);
        }
    }

    // record push constants
    if (constant_count > 0)
    {
        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPushConstants(d->compute_command_buffer, pipeline->pipeline_layout(), VK_SHADER_STAGE_COMPUTE_BIT, 0, constant_count * sizeof(vk_constant_type), constants.data());
        }
        else
        {
            // the caller's vector may be gone by replay time, keep a private copy
            uint32_t size = constant_count * sizeof(vk_constant_type);
            unsigned char* constant_values = new unsigned char[size];
            memcpy(constant_values, constants.data(), size);

            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_push_constants;
            r.command_buffer = d->compute_command_buffer;
            r.push_constants.pipeline_layout = pipeline->pipeline_layout();
            r.push_constants.stage_flags = VK_SHADER_STAGE_COMPUTE_BIT;
            r.push_constants.size = size;
            r.push_constants.values = constant_values;
            d->delayed_records.push_back(r);
        }
    }

    // record dispatch
    {
        // ceil-divide the dispatcher extents by the local work group size;
        // h and d (when non-zero) are folded together onto the y axis
        uint32_t group_count_x = (dispatcher.w + pipeline->local_size_x() - 1) / pipeline->local_size_x();
        uint32_t group_count_y = (dispatcher.h * (dispatcher.d ? dispatcher.d : 1) + pipeline->local_size_y() - 1) / pipeline->local_size_y();
        uint32_t group_count_z = (dispatcher.c + pipeline->local_size_z() - 1) / pipeline->local_size_z();

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdDispatch(d->compute_command_buffer, group_count_x, group_count_y, group_count_z);
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_dispatch;
            r.command_buffer = d->compute_command_buffer;
            r.dispatch.group_count_x = group_count_x;
            r.dispatch.group_count_y = group_count_y;
            r.dispatch.group_count_z = group_count_z;
            d->delayed_records.push_back(r);
        }

        d->pending_dispatch_total += group_count_x * group_count_y * group_count_z;
    }
}

#if NCNN_BENCHMARK
void VkCompute::record_write_timestamp(uint32_t query)
{
    if (vkdev->info.support_VK_KHR_push_descriptor())
    {
        if (d->query_pool)
            vkCmdWriteTimestamp(d->compute_command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, d->query_pool, query);
    }
    else
    {
        VkComputePrivate::record r;
        r.type = VkComputePrivate::record::TYPE_write_timestamp;
        r.command_buffer = d->compute_command_buffer;
        r.write_timestamp.query = query;
        d->delayed_records.push_back(r);
    }
}
#endif // NCNN_BENCHMARK

#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 26
void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst)
{
    // image layout transform undefined @ null to general @ compute
    {
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = 0;
        barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
        barriers[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = src.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            delete[] barriers;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }
    }

    // record bind pipeline
    {
        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdBindPipeline(d->compute_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline());
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_bind_pipeline;
            r.command_buffer = d->compute_command_buffer;
            r.bind_pipeline.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE;
            r.bind_pipeline.pipeline = pipeline->pipeline();
            d->delayed_records.push_back(r);
        }
    }

    // record update bindings
    {
        VkDescriptorImageInfo descriptorImageInfo;
        descriptorImageInfo.sampler = pipeline->sampler;
        descriptorImageInfo.imageView = src.imageview();
        descriptorImageInfo.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;

        VkDescriptorBufferInfo descriptorBufferInfo;
        descriptorBufferInfo.buffer = dst.buffer();
        descriptorBufferInfo.offset = dst.buffer_offset();
        descriptorBufferInfo.range = dst.total() * dst.elemsize;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            struct ImportAndroidHardwareBufferDescriptorInfo
            {
                VkDescriptorImageInfo imageInfo;
                VkDescriptorBufferInfo bufferInfo;
                VkDescriptorBufferInfo buffer4Info;
            };

            ImportAndroidHardwareBufferDescriptorInfo info;
            info.imageInfo = descriptorImageInfo;
            info.bufferInfo = descriptorBufferInfo;
            info.buffer4Info = descriptorBufferInfo;

            vkdev->vkCmdPushDescriptorSetWithTemplateKHR(d->compute_command_buffer, pipeline->descriptor_update_template(), pipeline->pipeline_layout(), 0, &info);
        }
        else
        {
            // create new descriptor_pool and descriptorset
            VkDescriptorPool descriptor_pool;
            {
                VkDescriptorPoolSize poolSizes[2];
                poolSizes[0].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
                poolSizes[0].descriptorCount = 1;
                poolSizes[1].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                poolSizes[1].descriptorCount = 2;

                VkDescriptorPoolCreateInfo descriptorPoolCreateInfo;
                descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO;
                descriptorPoolCreateInfo.pNext = 0;
                descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT;
                descriptorPoolCreateInfo.maxSets = 1;
                descriptorPoolCreateInfo.poolSizeCount = 2;
                descriptorPoolCreateInfo.pPoolSizes = poolSizes;

                VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool);
                if (ret != VK_SUCCESS)
                {
                    NCNN_LOGE("vkCreateDescriptorPool failed %d", ret);
                    return;
                }
            }
            d->descriptor_pools.push_back(descriptor_pool);

            VkDescriptorSet descriptorset;
            {
                VkDescriptorSetLayout descriptorset_layout = pipeline->descriptorset_layout();

                VkDescriptorSetAllocateInfo descriptorSetAllocateInfo;
                descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO;
                descriptorSetAllocateInfo.pNext = 0;
                descriptorSetAllocateInfo.descriptorPool = descriptor_pool;
                descriptorSetAllocateInfo.descriptorSetCount = 1;
                descriptorSetAllocateInfo.pSetLayouts = &descriptorset_layout;

                VkResult ret = vkAllocateDescriptorSets(vkdev->vkdevice(), &descriptorSetAllocateInfo, &descriptorset);
                if (ret != VK_SUCCESS)
                {
                    NCNN_LOGE("vkAllocateDescriptorSets failed %d", ret);
                    return;
                }
            }
            d->descriptorsets.push_back(descriptorset);

            if (vkdev->info.support_VK_KHR_descriptor_update_template())
            {
                struct ImportAndroidHardwareBufferDescriptorInfo
                {
                    VkDescriptorImageInfo imageInfo;
                    VkDescriptorBufferInfo bufferInfo;
                    VkDescriptorBufferInfo buffer4Info;
                };

                ImportAndroidHardwareBufferDescriptorInfo info;
                info.imageInfo = descriptorImageInfo;
                info.bufferInfo = descriptorBufferInfo;
                info.buffer4Info = descriptorBufferInfo;

                vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template(), &info);
            }
            else
            {
                VkWriteDescriptorSet writeDescriptorSets[3];
                writeDescriptorSets[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSets[0].pNext = 0;
                writeDescriptorSets[0].dstSet = descriptorset;
                writeDescriptorSets[0].dstBinding = 0;
                writeDescriptorSets[0].dstArrayElement = 0;
                writeDescriptorSets[0].descriptorCount = 1;
                writeDescriptorSets[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
                writeDescriptorSets[0].pImageInfo = &descriptorImageInfo;
                writeDescriptorSets[0].pBufferInfo = 0;
                writeDescriptorSets[0].pTexelBufferView = 0;
                writeDescriptorSets[1].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSets[1].pNext = 0;
                writeDescriptorSets[1].dstSet = descriptorset;
                writeDescriptorSets[1].dstBinding = 1;
                writeDescriptorSets[1].dstArrayElement = 0;
                writeDescriptorSets[1].descriptorCount = 1;
                writeDescriptorSets[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                writeDescriptorSets[1].pImageInfo = 0;
                writeDescriptorSets[1].pBufferInfo = &descriptorBufferInfo;
                writeDescriptorSets[1].pTexelBufferView = 0;
                writeDescriptorSets[2].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
                writeDescriptorSets[2].pNext = 0;
                writeDescriptorSets[2].dstSet = descriptorset;
                writeDescriptorSets[2].dstBinding = 2;
                writeDescriptorSets[2].dstArrayElement = 0;
                writeDescriptorSets[2].descriptorCount = 1;
                writeDescriptorSets[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
                writeDescriptorSets[2].pImageInfo = 0;
                writeDescriptorSets[2].pBufferInfo = &descriptorBufferInfo;
                writeDescriptorSets[2].pTexelBufferView = 0;

                vkUpdateDescriptorSets(vkdev->vkdevice(), 3, writeDescriptorSets, 0, 0);
            }

            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_bind_descriptorsets;
            r.command_buffer = d->compute_command_buffer;
            r.bind_descriptorsets.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE;
            r.bind_descriptorsets.pipeline_layout = pipeline->pipeline_layout();
            r.bind_descriptorsets.descriptorset_count = 1;
            r.bind_descriptorsets.descriptorset_offset = d->descriptorsets.size() - 1;
            d->delayed_records.push_back(r);
        }
    }

    // record dispatch
    {
        uint32_t group_count_x = (dst.w + pipeline->local_size_x() - 1) / pipeline->local_size_x();
        uint32_t group_count_y = (dst.h + pipeline->local_size_y() - 1) / pipeline->local_size_y();
        uint32_t group_count_z = (dst.c + pipeline->local_size_z() - 1) / pipeline->local_size_z();

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdDispatch(d->compute_command_buffer, group_count_x, group_count_y, group_count_z);
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_dispatch;
            r.command_buffer = d->compute_command_buffer;
            r.dispatch.group_count_x = group_count_x;
            r.dispatch.group_count_y = group_count_y;
            r.dispatch.group_count_z = group_count_z;
            d->delayed_records.push_back(r);
        }
    }
}
#endif // __ANDROID_API__ >= 26
#endif // NCNN_PLATFORM_API

// Finish recording, submit the compute command buffer and block until the GPU
// has completed it, then run the host-side post-processing steps.
//
// When VK_KHR_push_descriptor is not supported, commands are not recorded
// directly but stored in d->delayed_records; they are replayed into the
// command buffer here, right before submission.
//
// Returns 0 on success, -1 when no compute queue could be acquired, or when
// vkQueueSubmit / vkWaitForFences fail. On those early-return paths
// d->delayed_records and d->pending_dispatch_total are left untouched.
int VkCompute::submit_and_wait()
{
    //     NCNN_LOGE("submit_and_wait");

    if (!vkdev->info.support_VK_KHR_push_descriptor())
    {
        // NOTE the return value of begin_command_buffer() is not checked here
        d->begin_command_buffer();

#if NCNN_BENCHMARK
        if (d->query_pool)
            vkCmdResetQueryPool(d->compute_command_buffer, d->query_pool, 0, d->query_count);
#endif // NCNN_BENCHMARK

        const size_t record_count = d->delayed_records.size();

        // handle delayed records
        // each record is replayed as the matching vkCmd* call; heap arrays
        // attached to a record are freed as soon as the command is recorded
        for (size_t i = 0; i < record_count; i++)
        {
            const VkComputePrivate::record& r = d->delayed_records[i];

            switch (r.type)
            {
            case VkComputePrivate::record::TYPE_copy_buffer:
            {
                vkCmdCopyBuffer(r.command_buffer, r.copy_buffer.src, r.copy_buffer.dst, r.copy_buffer.region_count, r.copy_buffer.regions);
                delete[] r.copy_buffer.regions;
                break;
            }
            case VkComputePrivate::record::TYPE_copy_image:
            {
                vkCmdCopyImage(r.command_buffer, r.copy_image.src, r.copy_image.src_layout, r.copy_image.dst, r.copy_image.dst_layout, r.copy_image.region_count, r.copy_image.regions);
                delete[] r.copy_image.regions;
                break;
            }
            case VkComputePrivate::record::TYPE_copy_buffer_to_image:
            {
                vkCmdCopyBufferToImage(r.command_buffer, r.copy_buffer_to_image.src, r.copy_buffer_to_image.dst, r.copy_buffer_to_image.layout, r.copy_buffer_to_image.region_count, r.copy_buffer_to_image.regions);
                delete[] r.copy_buffer_to_image.regions;
                break;
            }
            case VkComputePrivate::record::TYPE_copy_image_to_buffer:
            {
                vkCmdCopyImageToBuffer(r.command_buffer, r.copy_image_to_buffer.src, r.copy_image_to_buffer.layout, r.copy_image_to_buffer.dst, r.copy_image_to_buffer.region_count, r.copy_image_to_buffer.regions);
                delete[] r.copy_image_to_buffer.regions;
                break;
            }
            case VkComputePrivate::record::TYPE_bind_pipeline:
            {
                vkCmdBindPipeline(r.command_buffer, r.bind_pipeline.bind_point, r.bind_pipeline.pipeline);
                break;
            }
            case VkComputePrivate::record::TYPE_bind_descriptorsets:
            {
                // the record stores an index into d->descriptorsets rather than a handle
                vkCmdBindDescriptorSets(r.command_buffer, r.bind_descriptorsets.bind_point, r.bind_descriptorsets.pipeline_layout, 0, r.bind_descriptorsets.descriptorset_count, &d->descriptorsets[r.bind_descriptorsets.descriptorset_offset], 0, 0);
                break;
            }
            case VkComputePrivate::record::TYPE_push_constants:
            {
                vkCmdPushConstants(r.command_buffer, r.push_constants.pipeline_layout, r.push_constants.stage_flags, 0, r.push_constants.size, r.push_constants.values);
                // values is released as an unsigned char array
                delete[](unsigned char*) r.push_constants.values;
                break;
            }
            case VkComputePrivate::record::TYPE_dispatch:
            {
                vkCmdDispatch(r.command_buffer, r.dispatch.group_count_x, r.dispatch.group_count_y, r.dispatch.group_count_z);
                break;
            }
            case VkComputePrivate::record::TYPE_memory_barrers:
            {
                vkCmdPipelineBarrier(r.command_buffer, r.memory_barrers.src_stage, r.memory_barrers.dst_stage, 0, r.memory_barrers.barrier_count, r.memory_barrers.barriers, 0, 0, 0, 0);
                delete[] r.memory_barrers.barriers;
                break;
            }
            case VkComputePrivate::record::TYPE_buffer_barrers:
            {
                vkCmdPipelineBarrier(r.command_buffer, r.buffer_barrers.src_stage, r.buffer_barrers.dst_stage, 0, 0, 0, r.buffer_barrers.barrier_count, r.buffer_barrers.barriers, 0, 0);
                delete[] r.buffer_barrers.barriers;
                break;
            }
            case VkComputePrivate::record::TYPE_image_barrers:
            {
                vkCmdPipelineBarrier(r.command_buffer, r.image_barrers.src_stage, r.image_barrers.dst_stage, 0, 0, 0, 0, 0, r.image_barrers.barrier_count, r.image_barrers.barriers);
                delete[] r.image_barrers.barriers;
                break;
            }
#if NCNN_BENCHMARK
            case VkComputePrivate::record::TYPE_write_timestamp:
            {
                if (d->query_pool)
                    vkCmdWriteTimestamp(r.command_buffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, d->query_pool, r.write_timestamp.query);
                break;
            }
#endif // NCNN_BENCHMARK
            // post records are host-side work, handled after the fence wait below
            case VkComputePrivate::record::TYPE_post_download:
            case VkComputePrivate::record::TYPE_post_cast_float16_to_float32:
            case VkComputePrivate::record::TYPE_post_cast_bfloat16_to_float32:
            default:
                break;
            }
        }
    }

    // end command buffer
    {
        // NOTE the return value of end_command_buffer() is not checked here
        d->end_command_buffer();
    }

    // acquire queue and reclaim on return
    VkQueue compute_queue = vkdev->acquire_queue(vkdev->info.compute_queue_family_index());
    if (compute_queue == 0)
    {
        NCNN_LOGE("out of compute queue");
        return -1;
    }

    // submit compute
    {
        VkSubmitInfo submitInfo;
        submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
        submitInfo.pNext = 0;
        submitInfo.waitSemaphoreCount = 0;
        submitInfo.pWaitSemaphores = 0;
        submitInfo.pWaitDstStageMask = 0;
        submitInfo.commandBufferCount = 1;
        submitInfo.pCommandBuffers = &d->compute_command_buffer;
        submitInfo.signalSemaphoreCount = 0;
        submitInfo.pSignalSemaphores = 0;

        // completion is signalled through compute_command_fence
        VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, d->compute_command_fence);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkQueueSubmit failed %d", ret);
            vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
            return -1;
        }
    }

    // the queue is handed back before waiting on the fence
    vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);

    // wait
    {
        // (uint64_t)-1 is the maximum timeout value, i.e. wait without a practical limit
        VkResult ret = vkWaitForFences(vkdev->vkdevice(), 1, &d->compute_command_fence, VK_TRUE, (uint64_t)-1);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkWaitForFences failed %d", ret);
            return -1;
        }
    }

    // handle delayed post records
    // these run on the host after the fence wait, in the order they were recorded
    for (size_t i = 0; i < d->delayed_records.size(); i++)
    {
        const VkComputePrivate::record& r = d->delayed_records[i];

        switch (r.type)
        {
        case VkComputePrivate::record::TYPE_post_download:
        {
            const VkMat& src = d->download_post_buffers[r.post_download.download_post_buffer_mat_offset];
            Mat& dst = d->download_post_mats_fp16[r.post_download.download_post_mat_fp16_offset];

            // NCNN_LOGE("post_download  %p +%d ~%d  -> %p", src.buffer(), src.buffer_offset(), src.buffer_capacity(), dst.data);

            // invalidate the mapped range before the host reads it, then copy into the host Mat
            src.allocator->invalidate(src.data);
            memcpy(dst.data, src.mapped_ptr(), dst.total() * dst.elemsize);
            break;
        }
        case VkComputePrivate::record::TYPE_post_cast_float16_to_float32:
        {
            // NCNN_LOGE("post_cast_float16_to_float32");

            const Mat& src = d->download_post_mats_fp16[r.post_cast_float16_to_float32.download_post_mat_fp16_offset];
            Mat& dst = d->download_post_mats[r.post_cast_float16_to_float32.download_post_mat_offset];

            Option opt;
            opt.num_threads = r.post_cast_float16_to_float32.num_threads;
            opt.blob_allocator = dst.allocator;
            ncnn::cast_float16_to_float32(src, dst, opt);
            break;
        }
        case VkComputePrivate::record::TYPE_post_cast_bfloat16_to_float32:
        {
            // NCNN_LOGE("post_cast_bfloat16_to_float32");

            // the bf16 source is also taken from download_post_mats_fp16
            const Mat& src = d->download_post_mats_fp16[r.post_cast_bfloat16_to_float32.download_post_mat_bf16_offset];
            Mat& dst = d->download_post_mats[r.post_cast_bfloat16_to_float32.download_post_mat_offset];

            Option opt;
            opt.num_threads = r.post_cast_bfloat16_to_float32.num_threads;
            opt.blob_allocator = dst.allocator;
            ncnn::cast_bfloat16_to_float32(src, dst, opt);
            break;
        }
        default:
            break;
        }
    }

    d->delayed_records.clear();

    d->pending_dispatch_total = 0;

    return 0;
}

int VkCompute::reset()
{
    d->upload_staging_buffers.clear();
    d->download_post_buffers.clear();
    d->download_post_mats_fp16.clear();
    d->download_post_mats.clear();

    for (size_t i = 0; i < d->image_blocks_to_destroy.size(); i++)
    {
        VkImageMemory* ptr = d->image_blocks_to_destroy[i];

        int old_command_refcount = NCNN_XADD(&ptr->command_refcount, -1);
        if (ptr->refcount == 0 && old_command_refcount == 1)
        {
            // no userspace reference and we are the last command reference
            vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0);
            vkDestroyImage(vkdev->vkdevice(), ptr->image, 0);

            delete ptr;
        }
        else
        {
            // reference exists in user code or other command
        }
    }
    d->image_blocks_to_destroy.clear();

    if (!vkdev->info.support_VK_KHR_push_descriptor())
    {
        for (size_t i = 0; i < d->descriptorsets.size(); i++)
        {
            vkFreeDescriptorSets(vkdev->vkdevice(), d->descriptor_pools[i], 1, &d->descriptorsets[i]);
            vkDestroyDescriptorPool(vkdev->vkdevice(), d->descriptor_pools[i], 0);
        }
        d->descriptor_pools.clear();
        d->descriptorsets.clear();
    }

    d->delayed_records.clear();

    d->pending_dispatch_total = 0;

    // reset command buffer and fence
    {
        VkResult ret = vkResetCommandBuffer(d->compute_command_buffer, 0);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkResetCommandBuffer failed %d", ret);
            return -1;
        }
    }
    {
        VkResult ret = vkResetFences(vkdev->vkdevice(), 1, &d->compute_command_fence);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkResetFences failed %d", ret);
            return -1;
        }
    }

    if (vkdev->info.support_VK_KHR_push_descriptor())
    {
        d->begin_command_buffer();

#if NCNN_BENCHMARK
        if (d->query_pool)
            vkCmdResetQueryPool(d->compute_command_buffer, d->query_pool, 0, d->query_count);
#endif // NCNN_BENCHMARK
    }

    return 0;
}

// Return the current value of the pending dispatch counter.
// The counter is set back to 0 by submit_and_wait() and reset().
uint64_t VkCompute::pending_dispatch_total() const
{
    return d->pending_dispatch_total;
}

#if NCNN_BENCHMARK
int VkCompute::create_query_pool(uint32_t _query_count)
{
    d->query_count = _query_count;

    VkQueryPoolCreateInfo queryPoolCreateInfo;
    queryPoolCreateInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
    queryPoolCreateInfo.pNext = 0;
    queryPoolCreateInfo.flags = 0;
    queryPoolCreateInfo.queryType = VK_QUERY_TYPE_TIMESTAMP;
    queryPoolCreateInfo.queryCount = d->query_count;
    queryPoolCreateInfo.pipelineStatistics = 0;

    VkResult ret = vkCreateQueryPool(vkdev->vkdevice(), &queryPoolCreateInfo, 0, &d->query_pool);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateQueryPool failed %d", ret);
        return -1;
    }

    if (vkdev->info.support_VK_KHR_push_descriptor())
    {
        if (d->query_pool)
            vkCmdResetQueryPool(d->compute_command_buffer, d->query_pool, 0, d->query_count);
    }

    return 0;
}

int VkCompute::get_query_pool_results(uint32_t first_query, uint32_t query_count, std::vector<uint64_t>& results)
{
    if (results.size() < first_query + query_count)
    {
        NCNN_LOGE("results not large enough");
        return -1;
    }

    VkResult ret = vkGetQueryPoolResults(vkdev->vkdevice(), d->query_pool, first_query, query_count,
                                         query_count * sizeof(uint64_t), results.data() + first_query, sizeof(uint64_t), VK_QUERY_RESULT_64_BIT);
    if (ret != VK_SUCCESS && ret != VK_NOT_READY)
    {
        NCNN_LOGE("vkGetQueryPoolResults failed %d", ret);
        return -1;
    }

    return 0;
}
#endif // NCNN_BENCHMARK

void VkCompute::barrier_readwrite(const VkMat& binding)
{
    if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
    {
        // barrier device any @ compute/null to shader-readwrite @ compute
        VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = binding.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].buffer = binding.buffer();
        barriers[0].offset = binding.buffer_offset();
        barriers[0].size = binding.buffer_capacity();

        VkPipelineStageFlags src_stage = binding.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0);
            delete[] barriers;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_buffer_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.buffer_barrers.src_stage = src_stage;
            r.buffer_barrers.dst_stage = dst_stage;
            r.buffer_barrers.barrier_count = 1;
            r.buffer_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark device shader-readwrite @ compute
        binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
        binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
    }
}

void VkCompute::barrier_readwrite(const VkImageMat& binding)
{
    if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->image_layout != VK_IMAGE_LAYOUT_GENERAL || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
    {
        // image layout transform any @ any to shader-write @ compute
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = binding.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
        barriers[0].oldLayout = binding.data->image_layout;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_GENERAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = binding.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = binding.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            delete[] barriers;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark image shader-write @ compute
        binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT;
        binding.data->image_layout = VK_IMAGE_LAYOUT_GENERAL;
        binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
    }
}

void VkCompute::barrier_readonly(const VkImageMat& binding)
{
    if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->image_layout != VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT)
    {
        // image layout transform any @ any to shader-readonly-optimal @ compute
        VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1];
        barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
        barriers[0].pNext = 0;
        barriers[0].srcAccessMask = binding.data->access_flags;
        barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
        barriers[0].oldLayout = binding.data->image_layout;
        barriers[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
        barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barriers[0].image = binding.image();
        barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
        barriers[0].subresourceRange.baseMipLevel = 0;
        barriers[0].subresourceRange.levelCount = 1;
        barriers[0].subresourceRange.baseArrayLayer = 0;
        barriers[0].subresourceRange.layerCount = 1;

        VkPipelineStageFlags src_stage = binding.data->stage_flags;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

        if (vkdev->info.support_VK_KHR_push_descriptor())
        {
            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers);
            delete[] barriers;
        }
        else
        {
            VkComputePrivate::record r;
            r.type = VkComputePrivate::record::TYPE_image_barrers;
            r.command_buffer = d->compute_command_buffer;
            r.image_barrers.src_stage = src_stage;
            r.image_barrers.dst_stage = dst_stage;
            r.image_barrers.barrier_count = 1;
            r.image_barrers.barriers = barriers;
            d->delayed_records.push_back(r);
        }

        // mark image shader-readonly-optimal @ compute
        binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT;
        binding.data->image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
        binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
    }
}

// Private state of VkTransfer: the Vulkan command pools, command buffers and
// synchronization objects used to record and submit uploads.
//
// The transfer_* / upload_* objects are only created when the device does not
// report a unified compute/transfer queue (see init()).
class VkTransferPrivate
{
public:
    // stores the device, zeroes all handles and calls init()
    VkTransferPrivate(const VulkanDevice* _vkdev);
    ~VkTransferPrivate();

    // create pools, command buffers, fences and the semaphore, then start recording
    // returns 0 on success, -1 on failure
    int init();
    // put the command buffer(s) into recording state; returns 0 on success, -1 on failure
    int begin_command_buffer();
    // finish recording of the command buffer(s); returns 0 on success, -1 on failure
    int end_command_buffer();

    const VulkanDevice* vkdev;

    // bytes accumulated by VkTransfer::record_upload
    uint64_t pending_upload_total;

    // pool on the compute queue family
    VkCommandPool compute_command_pool;
    // pool on the transfer queue family, only created without a unified queue
    VkCommandPool transfer_command_pool;

    // allocated from transfer_command_pool, only without a unified queue
    VkCommandBuffer upload_command_buffer;
    // allocated from compute_command_pool
    VkCommandBuffer compute_command_buffer;

    // only created without a unified queue
    // NOTE(review): presumably orders the upload submission before the compute submission - confirm in submit_and_wait
    VkSemaphore upload_compute_semaphore;

    // only created without a unified queue
    VkFence upload_command_fence;
    VkFence compute_command_fence;

    // staging buffers stashed by record_upload for the copies recorded in the command buffer
    std::vector<VkMat> upload_staging_buffers;
};

// Start with every counter and Vulkan handle zeroed, then create the actual
// objects through init(). The result of init() is not reported to the caller.
VkTransferPrivate::VkTransferPrivate(const VulkanDevice* _vkdev)
    : vkdev(_vkdev),
      pending_upload_total(0),
      compute_command_pool(0),
      transfer_command_pool(0),
      upload_command_buffer(0),
      compute_command_buffer(0),
      upload_compute_semaphore(0),
      upload_command_fence(0),
      compute_command_fence(0)
{
    init();
}

// Destroy the Vulkan objects created by init().
//
// init() may have stopped half way, leaving some handles at their initial
// zero value. vkFreeCommandBuffers requires a valid command pool, so command
// buffers are only freed when their pool was actually created. The destroy
// calls are issued with whatever handle value is stored, as before.
VkTransferPrivate::~VkTransferPrivate()
{
    vkDestroyFence(vkdev->vkdevice(), compute_command_fence, 0);

    if (compute_command_pool)
    {
        vkFreeCommandBuffers(vkdev->vkdevice(), compute_command_pool, 1, &compute_command_buffer);
        vkDestroyCommandPool(vkdev->vkdevice(), compute_command_pool, 0);
    }

    // the upload objects only exist when transfer uses its own queue family
    if (!vkdev->info.unified_compute_transfer_queue())
    {
        vkDestroyFence(vkdev->vkdevice(), upload_command_fence, 0);

        vkDestroySemaphore(vkdev->vkdevice(), upload_compute_semaphore, 0);

        if (transfer_command_pool)
        {
            vkFreeCommandBuffers(vkdev->vkdevice(), transfer_command_pool, 1, &upload_command_buffer);
            vkDestroyCommandPool(vkdev->vkdevice(), transfer_command_pool, 0);
        }
    }
}

int VkTransferPrivate::init()
{
    // compute_command_pool
    {
        VkCommandPoolCreateInfo commandPoolCreateInfo;
        commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
        commandPoolCreateInfo.pNext = 0;
        commandPoolCreateInfo.flags = 0;
        commandPoolCreateInfo.queueFamilyIndex = vkdev->info.compute_queue_family_index();

        VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &compute_command_pool);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkCreateCommandPool failed %d", ret);
            return -1;
        }
    }

    // compute_command_buffer
    {
        VkCommandBufferAllocateInfo commandBufferAllocateInfo;
        commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
        commandBufferAllocateInfo.pNext = 0;
        commandBufferAllocateInfo.commandPool = compute_command_pool;
        commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
        commandBufferAllocateInfo.commandBufferCount = 1;

        VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &compute_command_buffer);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkAllocateCommandBuffers failed %d", ret);
            return -1;
        }
    }

    // compute_command_fence
    {
        VkFenceCreateInfo fenceCreateInfo;
        fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
        fenceCreateInfo.pNext = 0;
        fenceCreateInfo.flags = 0;

        VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &compute_command_fence);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkCreateFence failed %d", ret);
            return -1;
        }
    }

    if (!vkdev->info.unified_compute_transfer_queue())
    {
        // transfer_command_pool
        {
            VkCommandPoolCreateInfo commandPoolCreateInfo;
            commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
            commandPoolCreateInfo.pNext = 0;
            commandPoolCreateInfo.flags = 0;
            commandPoolCreateInfo.queueFamilyIndex = vkdev->info.transfer_queue_family_index();

            VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &transfer_command_pool);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkCreateCommandPool failed %d", ret);
                return -1;
            }
        }

        // upload_command_buffer
        {
            VkCommandBufferAllocateInfo commandBufferAllocateInfo;
            commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
            commandBufferAllocateInfo.pNext = 0;
            commandBufferAllocateInfo.commandPool = transfer_command_pool;
            commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
            commandBufferAllocateInfo.commandBufferCount = 1;

            VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &upload_command_buffer);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkAllocateCommandBuffers failed %d", ret);
                return -1;
            }
        }

        // upload_compute_semaphore
        {
            VkSemaphoreCreateInfo semaphoreCreateInfo;
            semaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
            semaphoreCreateInfo.pNext = 0;
            semaphoreCreateInfo.flags = 0;

            VkResult ret = vkCreateSemaphore(vkdev->vkdevice(), &semaphoreCreateInfo, 0, &upload_compute_semaphore);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkCreateSemaphore failed %d", ret);
                return -1;
            }
        }

        // upload_command_fence
        {
            VkFenceCreateInfo fenceCreateInfo;
            fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
            fenceCreateInfo.pNext = 0;
            fenceCreateInfo.flags = 0;

            VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &upload_command_fence);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkCreateFence failed %d", ret);
                return -1;
            }
        }
    }

    begin_command_buffer();

    return 0;
}

int VkTransferPrivate::begin_command_buffer()
{
    {
        VkCommandBufferBeginInfo commandBufferBeginInfo;
        commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
        commandBufferBeginInfo.pNext = 0;
        commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
        commandBufferBeginInfo.pInheritanceInfo = 0;

        VkResult ret = vkBeginCommandBuffer(compute_command_buffer, &commandBufferBeginInfo);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkBeginCommandBuffer failed %d", ret);
            return -1;
        }
    }

    if (!vkdev->info.unified_compute_transfer_queue())
    {
        {
            VkCommandBufferBeginInfo commandBufferBeginInfo;
            commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
            commandBufferBeginInfo.pNext = 0;
            commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
            commandBufferBeginInfo.pInheritanceInfo = 0;

            VkResult ret = vkBeginCommandBuffer(upload_command_buffer, &commandBufferBeginInfo);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkBeginCommandBuffer failed %d", ret);
                return -1;
            }
        }
    }

    return 0;
}

int VkTransferPrivate::end_command_buffer()
{
    {
        VkResult ret = vkEndCommandBuffer(compute_command_buffer);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkEndCommandBuffer failed %d", ret);
            return -1;
        }
    }

    if (!vkdev->info.unified_compute_transfer_queue())
    {
        {
            VkResult ret = vkEndCommandBuffer(upload_command_buffer);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkEndCommandBuffer failed %d", ret);
                return -1;
            }
        }
    }

    return 0;
}

// Create a transfer command recorder for the given device.
// All Vulkan objects live in the private implementation object d, whose
// constructor creates them and starts command recording.
VkTransfer::VkTransfer(const VulkanDevice* _vkdev)
    : vkdev(_vkdev), d(new VkTransferPrivate(_vkdev))
{
}

// Destroy the private implementation object created by the constructor.
VkTransfer::~VkTransfer()
{
    delete d;
}

void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten)
{
    //     NCNN_LOGE("record_upload src = %d | %d %d %d @ %d", src.dims, src.w, src.h, src.c, src.elempack);

    // NOTE keep the hack here ?
    if (src.elembits() == 32)
    {
        if (opt.use_bf16_storage || opt.use_bf16_packed)
        {
            Mat src_bf16;
            cast_float32_to_bfloat16(src, src_bf16, opt);

            record_upload(src_bf16, dst, opt, flatten);

            return;
        }
        else if (opt.use_fp16_storage || opt.use_fp16_packed)
        {
            Mat src_fp16;
            cast_float32_to_float16(src, src_fp16, opt);

            record_upload(src_fp16, dst, opt, flatten);

            return;
        }
    }

    Mat src_flattened = flatten ? src.reshape(src.w * src.h * src.c) : src;

    // create dst
    dst.create_like(src_flattened, opt.blob_vkallocator);

    if (dst.empty())
    {
        return;
    }

    d->pending_upload_total += dst.total() * dst.elemsize;

    if (dst.allocator->mappable)
    {
        // memcpy src_flattened to device
        memcpy(dst.mapped_ptr(), src_flattened.data, src_flattened.total() * src_flattened.elemsize);
        dst.allocator->flush(dst.data);

        // barrier device host-write @ null to shader-read @ compute
        {
            VkBufferMemoryBarrier barrier;
            barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
            barrier.pNext = 0;
            barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT;
            barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
            barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            barrier.buffer = dst.buffer();
            barrier.offset = dst.buffer_offset();
            barrier.size = dst.buffer_capacity();

            VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT;
            VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0);
        }

        // mark device shader-readwrite @ compute
        dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT;
        dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

        return;
    }

    // create staging
    VkMat dst_staging;
    dst_staging.create_like(src_flattened, opt.staging_vkallocator);

    // memcpy src_flattened to staging
    memcpy(dst_staging.mapped_ptr(), src_flattened.data, src_flattened.total() * src_flattened.elemsize);
    dst_staging.allocator->flush(dst_staging.data);

    VkCommandBuffer command_buffer;
    if (vkdev->info.unified_compute_transfer_queue())
    {
        command_buffer = d->compute_command_buffer;
    }
    else
    {
        command_buffer = d->upload_command_buffer;
    }

    // barrier staging host-write @ null to transfer-read @ queue
    {
        VkBufferMemoryBarrier barrier;
        barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        barrier.pNext = 0;
        barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT;
        barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
        barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        barrier.buffer = dst_staging.buffer();
        barrier.offset = dst_staging.buffer_offset();
        barrier.size = dst_staging.buffer_capacity();

        VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT;
        VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;

        vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0);
    }

    // record staging to device
    {
        VkBufferCopy region;
        region.srcOffset = dst_staging.buffer_offset();
        region.dstOffset = dst.buffer_offset();
        region.size = std::min(dst_staging.buffer_capacity(), dst.buffer_capacity());

        vkCmdCopyBuffer(command_buffer, dst_staging.buffer(), dst.buffer(), 1, &region);
    }

    if (vkdev->info.unified_compute_transfer_queue())
    {
        // barrier device transfer-write @ compute to shader-read @ compute
        {
            VkBufferMemoryBarrier barrier;
            barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
            barrier.pNext = 0;
            barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
            barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
            barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
            barrier.buffer = dst.buffer();
            barrier.offset = dst.buffer_offset();
            barrier.size = dst.buffer_capacity();

            VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;
            VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

            vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0);
        }
    }
    else
    {
        // queue ownership transfer transfer-write @ transfer to shader-read @ compute

        // release
        {
            VkBufferMemoryBarrier barrier;
            barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
            barrier.pNext = 0;
            barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
            barrier.dstAccessMask = 0;
            barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index();
            barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index();
            barrier.buffer = dst.buffer();
            barrier.offset = dst.buffer_offset();
            barrier.size = dst.buffer_capacity();

            VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT;
            VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;

            vkCmdPipelineBarrier(d->upload_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0);
        }

        // acquire
        {
            VkBufferMemoryBarrier barrier;
            barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
            barrier.pNext = 0;
            barrier.srcAccessMask = 0;
            barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
            barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index();
            barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index();
            barrier.buffer = dst.buffer();
            barrier.offset = dst.buffer_offset();
            barrier.size = dst.buffer_capacity();

            VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
            VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

            vkCmdPipelineBarrier(d->compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0);
        }
    }

    // mark device shader-readwrite @ compute
    dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT;
    dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;

    // stash staging
    d->upload_staging_buffers.push_back(dst_staging);
}

int VkTransfer::submit_and_wait()
{
    //     NCNN_LOGE("submit_and_wait");

    // end command buffer
    {
        d->end_command_buffer();
    }

    VkQueue compute_queue = vkdev->acquire_queue(vkdev->info.compute_queue_family_index());
    if (compute_queue == 0)
    {
        NCNN_LOGE("out of compute queue");
        return -1;
    }

    if (vkdev->info.unified_compute_transfer_queue())
    {
        // submit compute
        {
            VkSubmitInfo submitInfo;
            submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
            submitInfo.pNext = 0;
            submitInfo.waitSemaphoreCount = 0;
            submitInfo.pWaitSemaphores = 0;
            submitInfo.pWaitDstStageMask = 0;
            submitInfo.commandBufferCount = 1;
            submitInfo.pCommandBuffers = &d->compute_command_buffer;
            submitInfo.signalSemaphoreCount = 0;
            submitInfo.pSignalSemaphores = 0;

            VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, d->compute_command_fence);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkQueueSubmit failed %d", ret);
                vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
                return -1;
            }
        }
    }
    else
    {
        VkQueue transfer_queue = vkdev->acquire_queue(vkdev->info.transfer_queue_family_index());
        if (transfer_queue == 0)
        {
            NCNN_LOGE("out of transfer queue");
            vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
            return -1;
        }

        // submit upload compute
        {
            VkSubmitInfo submitInfo;
            submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
            submitInfo.pNext = 0;
            submitInfo.waitSemaphoreCount = 0;
            submitInfo.pWaitSemaphores = 0;
            submitInfo.pWaitDstStageMask = 0;
            submitInfo.commandBufferCount = 1;
            submitInfo.pCommandBuffers = &d->upload_command_buffer;
            submitInfo.signalSemaphoreCount = 1;
            submitInfo.pSignalSemaphores = &d->upload_compute_semaphore;

            VkResult ret = vkQueueSubmit(transfer_queue, 1, &submitInfo, d->upload_command_fence);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkQueueSubmit failed %d", ret);
                vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index(), transfer_queue);
                vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
                return -1;
            }
        }
        {
            VkPipelineStageFlags wait_dst_stage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; // FIXME

            VkSubmitInfo submitInfo;
            submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
            submitInfo.pNext = 0;
            submitInfo.waitSemaphoreCount = 1;
            submitInfo.pWaitSemaphores = &d->upload_compute_semaphore;
            submitInfo.pWaitDstStageMask = &wait_dst_stage;
            submitInfo.commandBufferCount = 1;
            submitInfo.pCommandBuffers = &d->compute_command_buffer;
            submitInfo.signalSemaphoreCount = 0;
            submitInfo.pSignalSemaphores = 0;

            VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, d->compute_command_fence);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkQueueSubmit failed %d", ret);
                vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index(), transfer_queue);
                vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);
                return -1;
            }
        }

        vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index(), transfer_queue);
    }

    vkdev->reclaim_queue(vkdev->info.compute_queue_family_index(), compute_queue);

    // wait
    if (vkdev->info.unified_compute_transfer_queue())
    {
        VkResult ret = vkWaitForFences(vkdev->vkdevice(), 1, &d->compute_command_fence, VK_TRUE, (uint64_t)-1);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkWaitForFences failed %d", ret);
            return -1;
        }
    }
    else
    {
        VkFence fences[2] = {d->upload_command_fence, d->compute_command_fence};

        VkResult ret = vkWaitForFences(vkdev->vkdevice(), 2, fences, VK_TRUE, (uint64_t)-1);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkWaitForFences failed %d", ret);
            return -1;
        }
    }

    d->pending_upload_total = 0;

    return 0;
}

int VkTransfer::reset()
{
    d->upload_staging_buffers.clear();

    d->pending_upload_total = 0;

    // reset command buffer and fence
    {
        VkResult ret = vkResetCommandBuffer(d->compute_command_buffer, 0);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkResetCommandBuffer failed %d", ret);
            return -1;
        }
    }
    {
        VkResult ret = vkResetFences(vkdev->vkdevice(), 1, &d->compute_command_fence);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkResetFences failed %d", ret);
            return -1;
        }
    }

    if (!vkdev->info.unified_compute_transfer_queue())
    {
        {
            VkResult ret = vkResetCommandBuffer(d->upload_command_buffer, 0);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkResetCommandBuffer failed %d", ret);
                return -1;
            }
        }
        {
            VkResult ret = vkResetFences(vkdev->vkdevice(), 1, &d->upload_command_fence);
            if (ret != VK_SUCCESS)
            {
                NCNN_LOGE("vkResetFences failed %d", ret);
                return -1;
            }
        }
    }

    d->begin_command_buffer();

    return 0;
}

// Current value of the pending-upload counter kept in the private state.
// submit_and_wait() and reset() both set it back to zero.
uint64_t VkTransfer::pending_upload_total() const
{
    const uint64_t total = d->pending_upload_total;
    return total;
}

} // namespace ncnn

#endif // NCNN_VULKAN


================================================
FILE: src/command.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef NCNN_COMMAND_H
#define NCNN_COMMAND_H

#include "platform.h"

#if NCNN_VULKAN

#include "mat.h"

namespace ncnn {

class Pipeline;
#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 26
class ImportAndroidHardwareBufferPipeline;
#endif // __ANDROID_API__ >= 26
#endif // NCNN_PLATFORM_API
class VkComputePrivate;
// Recorder for compute-side Vulkan commands.
//
// Only the interface is declared here; all state lives behind the opaque
// VkComputePrivate pointer and the member implementations are not part of
// this header. The descriptions below are therefore limited to what the
// signatures show - NOTE(review): confirm details against command.cpp.
class NCNN_EXPORT VkCompute
{
public:
    // vkdev is stored in the protected member of the same name
    explicit VkCompute(const VulkanDevice* vkdev);
    virtual ~VkCompute();

public:
    // host Mat -> device VkMat
    void record_upload(const Mat& src, VkMat& dst, const Option& opt);

    // device VkMat -> host Mat
    void record_download(const VkMat& src, Mat& dst, const Option& opt);

    // record_clone overloads cover every src/dst combination of
    // Mat, VkMat and VkImageMat (except Mat -> Mat)
    void record_clone(const Mat& src, VkMat& dst, const Option& opt);

    void record_clone(const Mat& src, VkImageMat& dst, const Option& opt);

    void record_clone(const VkMat& src, Mat& dst, const Option& opt);

    void record_clone(const VkImageMat& src, Mat& dst, const Option& opt);

    void record_clone(const VkMat& src, VkMat& dst, const Option& opt);

    void record_clone(const VkImageMat& src, VkImageMat& dst, const Option& opt);

    void record_clone(const VkMat& src, VkImageMat& dst, const Option& opt);

    void record_clone(const VkImageMat& src, VkMat& dst, const Option& opt);

    // pipeline with buffer bindings only
    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);

    // pipeline with image bindings only
    void record_pipeline(const Pipeline* pipeline, const std::vector<VkImageMat>& bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);

    // pipeline with both buffer and image bindings; the overloads differ only
    // in the type of the dispatcher argument
    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkMat& dispatcher);
    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const VkImageMat& dispatcher);
    void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& buffer_bindings, const std::vector<VkImageMat>& image_bindings, const std::vector<vk_constant_type>& constants, const Mat& dispatcher);

#if NCNN_BENCHMARK
    // only available in benchmark builds
    void record_write_timestamp(uint32_t query);
#endif // NCNN_BENCHMARK

#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 26
    // only available with the platform api on Android API level 26 or newer
    void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& src, const VkMat& dst);
#endif // __ANDROID_API__ >= 26
#endif // NCNN_PLATFORM_API

    // presumably mirrors VkTransfer::submit_and_wait (0 on success) - TODO confirm
    int submit_and_wait();

    // presumably mirrors VkTransfer::reset (0 on success) - TODO confirm
    int reset();

    uint64_t pending_dispatch_total() const;

#if NCNN_BENCHMARK
    // query pool helpers, only available in benchmark builds
    int create_query_pool(uint32_t query_count);

    int get_query_pool_results(uint32_t first_query, uint32_t query_count, std::vector<uint64_t>& results);
#endif // NCNN_BENCHMARK

protected:
    // device passed to the constructor
    const VulkanDevice* vkdev;

    // barrier helpers for a single binding
    void barrier_readwrite(const VkMat& binding);
    void barrier_readwrite(const VkImageMat& binding);
    void barrier_readonly(const VkImageMat& binding);

private:
    // opaque implementation state, declared outside this header
    VkComputePrivate* const d;
};

class VkTransferPrivate;
// Recorder for host -> device uploads.
//
// Uploads are recorded with record_upload(), executed with submit_and_wait()
// and the object is made reusable with reset(). All state lives behind the
// opaque VkTransferPrivate pointer.
class NCNN_EXPORT VkTransfer
{
public:
    // vkdev is stored in the protected member of the same name
    explicit VkTransfer(const VulkanDevice* vkdev);
    virtual ~VkTransfer();

public:
    // Record an upload of src into dst.
    // flatten defaults to true - NOTE(review): its exact effect is not visible
    // from this header, confirm against the implementation.
    void record_upload(const Mat& src, VkMat& dst, const Option& opt, bool flatten = true);

    // Submit the recorded commands and block until the fences are signaled.
    // Returns 0 on success, -1 on failure.
    int submit_and_wait();

    // Clear stashed staging buffers, reset command buffers and fences and
    // begin recording again. Returns 0 on success, -1 on failure.
    int reset();

    // Pending-upload counter; set back to zero by submit_and_wait() and reset().
    uint64_t pending_upload_total() const;

protected:
    // device passed to the constructor
    const VulkanDevice* vkdev;

private:
    // opaque implementation state, declared outside this header
    VkTransferPrivate* const d;
};

} // namespace ncnn

#endif // NCNN_VULKAN

#endif // NCNN_COMMAND_H


================================================
FILE: src/convert_ycbcr.comp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#version 450

// specialization constants
// w / h        source size, only used to scale the sample position when need_resize == 1
// outw / outh  output size, also the dispatch bound checked in main()
// type_to      output layout: 1 RGB, 2 BGR, 3 GRAY, 4 RGBA, 5 BGRA
// rotate_from  1..8, selects how the output coordinate maps to the sample position
// need_resize  1 enables the w/h based scaling of the sample position
layout (constant_id = 0) const int w = 0;
layout (constant_id = 1) const int h = 0;
layout (constant_id = 2) const int outw = 0;
layout (constant_id = 3) const int outh = 0;
layout (constant_id = 4) const int type_to = 0;
layout (constant_id = 5) const int rotate_from = 0;
layout (constant_id = 6) const int need_resize = 0;

// binding 0: sampled source image
// binding 1: scalar output, written for type_to 1, 2 and 3
// binding 2: 4-component output, written for type_to 4 and 5
layout (binding = 0) uniform sampler2D android_hardware_buffer_image;
layout (binding = 1) writeonly buffer vkmat_blob { sfp vkmat_blob_data[]; };
layout (binding = 2) writeonly buffer vkmat_pack4_blob { sfpvec4 vkmat_pack4_blob_data[]; };

// One invocation converts one output pixel (gx, gy):
//   1. map the output coordinate to a sample position according to rotate_from
//   2. optionally scale that position from output size to source size
//   3. sample the image, scale to 0..255 and store in the layout chosen by type_to
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= outw || gy >= outh || gz >= 1)
        return;

    // default to the identity mapping so that pos is never read undefined
    // when rotate_from is outside 1..8
    vec2 pos = vec2(gx, gy);

    if (rotate_from == 1)
    {
        pos = vec2(gx, gy);
    }

    if (rotate_from == 2)
    {
        pos = vec2(outw - 1 - gx, gy);
    }

    if (rotate_from == 3)
    {
        pos = vec2(outw - 1 - gx, outh - 1 - gy);
    }

    if (rotate_from == 4)
    {
        pos = vec2(gx, outh - 1 - gy);
    }

    // 5..8 additionally swap the two axes
    if (rotate_from == 5)
    {
        pos = vec2(gy, gx);
    }

    if (rotate_from == 6)
    {
        pos = vec2(gy, outw - 1 - gx);
    }

    if (rotate_from == 7)
    {
        pos = vec2(outh - 1 - gy, outw - 1 - gx);
    }

    if (rotate_from == 8)
    {
        pos = vec2(outh - 1 - gy, gx);
    }

    if (need_resize == 1)
    {
        if (rotate_from < 5) // 1 2 3 4
        {
            pos.x = pos.x * (float(w) / outw);
            pos.y = pos.y * (float(h) / outh);
        }
        else // 5 6 7 8
        {
            // axes are swapped, so x spans outh and y spans outw
            pos.x = pos.x * (float(w) / outh);
            pos.y = pos.y * (float(h) / outw);
        }
    }

    vec3 rgb = texture(android_hardware_buffer_image, pos).rgb * 255.f;

    // channel stride of the planar output, rounded UP to a multiple of 4
    // elements. Rounding down would place the second and third plane inside
    // the previous one whenever outw * outh is not a multiple of 4.
    // NOTE(review): presumably this has to equal the cstep of the destination
    // VkMat - confirm for fp16 storage.
    const int outcstep = (outw * outh + 3) / 4 * 4;

    if (type_to == 1) // PIXEL_RGB
    {
        ivec3 v_offset = (gy * outw + gx) + ivec3(0, 1, 2) * outcstep;

        buffer_st1(vkmat_blob_data, v_offset.r, afp(rgb.r));
        buffer_st1(vkmat_blob_data, v_offset.g, afp(rgb.g));
        buffer_st1(vkmat_blob_data, v_offset.b, afp(rgb.b));
    }

    if (type_to == 2) // PIXEL_BGR
    {
        ivec3 v_offset = (gy * outw + gx) + ivec3(0, 1, 2) * outcstep;

        buffer_st1(vkmat_blob_data, v_offset.r, afp(rgb.b));
        buffer_st1(vkmat_blob_data, v_offset.g, afp(rgb.g));
        buffer_st1(vkmat_blob_data, v_offset.b, afp(rgb.r));
    }

    if (type_to == 3) // PIXEL_GRAY
    {
        // coeffs for r g b = 0.299f, 0.587f, 0.114f
        float v = clamp(rgb.r * 0.299f + rgb.g * 0.587f + rgb.b * 0.114f, 0.f, 255.f);

        int v_offset = gy * outw + gx;

        buffer_st1(vkmat_blob_data, v_offset, afp(v));
    }

    if (type_to == 4) // PIXEL_RGBA
    {
        // alpha is always fully opaque
        vec4 rgba;
        rgba.rgb = rgb;
        rgba.a = 255.f;

        int v_offset = gy * outw + gx;

        buffer_st4(vkmat_pack4_blob_data, v_offset, afpvec4(rgba));
    }

    if (type_to == 5) // PIXEL_BGRA
    {
        // swizzled store swaps red and blue
        vec4 rgba;
        rgba.bgr = rgb;
        rgba.a = 255.f;

        int v_offset = gy * outw + gx;

        buffer_st4(vkmat_pack4_blob_data, v_offset, afpvec4(rgba));
    }
}


================================================
FILE: src/cpu.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cpu.h"

#include "platform.h"

#include <limits.h>
#ifndef __wasi__
#include <setjmp.h>
#include <signal.h>
#endif // __wasi__
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _OPENMP
#if NCNN_SIMPLEOMP
#include "simpleomp.h"
#else
#include <omp.h>
#endif
#endif

#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
#ifdef _MSC_VER
#include <intrin.h>    // __cpuid()
#include <immintrin.h> // _xgetbv()
#endif
#if defined(__clang__) || defined(__GNUC__)
#include <cpuid.h> // __get_cpuid() and __cpuid_count()
#endif
#endif

#ifdef __EMSCRIPTEN__
#include <emscripten/threading.h>
#endif

#if defined _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif

#if defined __ANDROID__ || defined __OHOS__ || __linux__
#if defined __ANDROID__
#if __ANDROID_API__ >= 18
#include <sys/auxv.h> // getauxval()
#endif
#include <sys/system_properties.h> // __system_property_get()
#include <dlfcn.h>
#endif
#if defined __OHOS__
#include <sys/auxv.h> // getauxval()
#endif
#include <ctype.h>
#include <stdint.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif

#if __APPLE__
#include <mach/mach.h>
#include <mach/machine.h>
#include <mach/thread_act.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#include <unistd.h>
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#define __IOS__ 1
#endif
// define missing cpu model for old sdk
#ifndef CPUFAMILY_ARM_HURRICANE
#define CPUFAMILY_ARM_HURRICANE 0x67ceee93
#endif
// A11
#ifndef CPUFAMILY_ARM_MONSOON_MISTRAL
#define CPUFAMILY_ARM_MONSOON_MISTRAL 0xe81e7ef6
#endif
// A12
#ifndef CPUFAMILY_ARM_VORTEX_TEMPEST
#define CPUFAMILY_ARM_VORTEX_TEMPEST 0x07d34b9f
#endif
// A13
#ifndef CPUFAMILY_ARM_LIGHTNING_THUNDER
#define CPUFAMILY_ARM_LIGHTNING_THUNDER 0x462504d2
#endif
// A14 / M1
#ifndef CPUFAMILY_ARM_FIRESTORM_ICESTORM
#define CPUFAMILY_ARM_FIRESTORM_ICESTORM 0x1b588bb3
#endif
// A15 / M2
#ifndef CPUFAMILY_ARM_AVALANCHE_BLIZZARD
#define CPUFAMILY_ARM_AVALANCHE_BLIZZARD 0xda33d83d
#endif
// A16
#ifndef CPUFAMILY_ARM_EVEREST_SAWTOOTH
#define CPUFAMILY_ARM_EVEREST_SAWTOOTH 0x8765edea
#endif
// A17
#ifndef CPUFAMILY_ARM_COLL
#define CPUFAMILY_ARM_COLL 0x2876f5b5
#endif
// A18
#ifndef CPUFAMILY_ARM_TUPAI
#define CPUFAMILY_ARM_TUPAI 0x204526d0
#endif
// A18 Pro
#ifndef CPUFAMILY_ARM_TAHITI
#define CPUFAMILY_ARM_TAHITI 0x75d4acb9
#endif
// M3
#ifndef CPUFAMILY_ARM_IBIZA
#define CPUFAMILY_ARM_IBIZA 0xfa33415e
#endif
// M3 Pro
#ifndef CPUFAMILY_ARM_LOBOS
#define CPUFAMILY_ARM_LOBOS 0x5f4dea93
#endif
// M3 Max
#ifndef CPUFAMILY_ARM_PALMA
#define CPUFAMILY_ARM_PALMA 0x72015832
#endif
// M4
#ifndef CPUFAMILY_ARM_DONAN
#define CPUFAMILY_ARM_DONAN 0x6f5129ac
#endif
// M4 Pro / M4 Max
#ifndef CPUFAMILY_ARM_BRAVA
#define CPUFAMILY_ARM_BRAVA 0x17d5b93a
#endif
#endif // __APPLE__

#if defined(__SSE3__)
#include <immintrin.h>
#endif

#if (defined _WIN32 && (__aarch64__ || __arm__)) || ((defined __ANDROID__ || defined __linux__) && __riscv)
#define RUAPU_IMPLEMENTATION
#include "ruapu.h"
#endif

#if defined(_OPENMP) && (__clang__ || defined(_OPENMP_LLVM_RUNTIME))
__attribute__((constructor)) void ncnn_kmp_env_initializer()
{
    // runs as a constructor so that the variables are in place before any
    // openmp routine gets touched
    //
    // KMP_AFFINITY=disabled
    //   the internal affinity routines in llvm openmp call abort when
    //   __NR_sched_getaffinity / __NR_sched_setaffinity fail
    //   ref KMPNativeAffinity::get_system_affinity/set_system_affinity in openmp/runtime/src/kmp_affinity.h
    //   cpu cores go offline in powersave mode on android, which triggers that abort
    //   thread affinity for openmp threads is handled by ncnn instead
    //
    // KMP_DUPLICATE_LIB_OK=1
    //   openmp initialization aborts when another openmp runtime is detected
    //   ref __kmp_register_library_startup in openmp/runtime/src/kmp_runtime.cpp
    //   this happens when several loaded libraries each link openmp statically
    //   continuing works well in most cases and avoids the unexpected crash
#if defined _WIN32
#if _WIN32_WINNT >= 0x0600
    _putenv_s("KMP_AFFINITY", "disabled");
    _putenv_s("KMP_DUPLICATE_LIB_OK", "1");
#else
    _putenv("KMP_AFFINITY=disabled");
    _putenv("KMP_DUPLICATE_LIB_OK=1");
#endif
#else
    // last argument 1: overwrite a value that is already set
    setenv("KMP_AFFINITY", "disabled", 1);
    setenv("KMP_DUPLICATE_LIB_OK", "1", 1);
#endif
}
#endif

// File-local cached cpu information.
// All of these have static storage duration; the code that fills them in is
// not part of this section of the file.

// topology info
static int g_cpucount;
static int g_physical_cpucount;
static int g_powersave;
static ncnn::CpuSet g_cpu_affinity_mask_all;
static ncnn::CpuSet g_cpu_affinity_mask_little;
static ncnn::CpuSet g_cpu_affinity_mask_big;

// isa info
// which set of variables exists depends on the target platform
#if defined _WIN32
#if __aarch64__
// windows on arm64: one flag per feature
static int g_cpu_support_arm_asimdhp;
static int g_cpu_support_arm_cpuid;
static int g_cpu_support_arm_asimddp;
static int g_cpu_support_arm_asimdfhm;
static int g_cpu_support_arm_bf16;
static int g_cpu_support_arm_i8mm;
static int g_cpu_support_arm_sve;
static int g_cpu_support_arm_sve2;
static int g_cpu_support_arm_svebf16;
static int g_cpu_support_arm_svei8mm;
static int g_cpu_support_arm_svef32mm;
#elif __arm__
// windows on 32-bit arm
static int g_cpu_support_arm_edsp;
static int g_cpu_support_arm_neon;
static int g_cpu_support_arm_vfpv4;
#endif // __aarch64__ || __arm__
#elif defined __ANDROID__ || defined __linux__
// android / linux: raw hwcap bitmaps - presumably the AT_HWCAP / AT_HWCAP2
// values returned by get_elf_hwcap(), TODO confirm at the assignment site
static unsigned int g_hwcaps;
static unsigned int g_hwcaps2;
#elif __APPLE__
// apple: values of the matching hw.* sysctl queries (see get_hw_cpufamily() etc.)
static unsigned int g_hw_cpufamily;
static cpu_type_t g_hw_cputype;
static cpu_subtype_t g_hw_cpusubtype;
#if __aarch64__
static int g_hw_optional_arm_FEAT_FP16;
static int g_hw_optional_arm_FEAT_DotProd;
static int g_hw_optional_arm_FEAT_FHM;
static int g_hw_optional_arm_FEAT_BF16;
static int g_hw_optional_arm_FEAT_I8MM;
#endif // __aarch64__
#endif

#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
// x86: one flag per instruction set extension, names match the
// get_cpu_support_x86_*() detection functions
static int g_cpu_support_x86_avx;
static int g_cpu_support_x86_fma;
static int g_cpu_support_x86_xop;
static int g_cpu_support_x86_f16c;
static int g_cpu_support_x86_avx2;
static int g_cpu_support_x86_avx_vnni;
static int g_cpu_support_x86_avx_vnni_int8;
static int g_cpu_support_x86_avx_vnni_int16;
static int g_cpu_support_x86_avx_ne_convert;
static int g_cpu_support_x86_avx512;
static int g_cpu_support_x86_avx512_vnni;
static int g_cpu_support_x86_avx512_bf16;
static int g_cpu_support_x86_avx512_fp16;
#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)

#if defined __ANDROID__ || defined __linux__
#if __riscv
// risc-v extensions on android / linux
static int g_cpu_support_riscv_zfh;
static int g_cpu_support_riscv_zvfh;
static int g_cpu_support_riscv_xtheadvector;
#endif // __riscv
#endif // defined __ANDROID__ || defined __linux__

// cache sizes - NOTE(review): unit is not visible here, presumably bytes
static int g_cpu_level2_cachesize;
static int g_cpu_level3_cachesize;

// misc info
#if defined __ANDROID__ || defined __linux__
#if __aarch64__
static int g_cpu_is_arm_a53_a55;
#endif // __aarch64__
#endif // defined __ANDROID__ || defined __linux__

// Tell whether a debugger / tracer is currently attached to this process.
//   windows          IsDebuggerPresent()
//   android / linux  non-zero TracerPid in /proc/self/status
//   apple            P_TRACED flag of the own kinfo_proc entry
// Any failure to obtain the information yields false.
static bool is_being_debugged()
{
#if defined _WIN32
    return IsDebuggerPresent();
#elif defined __ANDROID__ || defined __linux__
    // https://stackoverflow.com/questions/3596781/how-to-detect-if-the-current-process-is-being-run-by-gdb
    int fd = open("/proc/self/status", O_RDONLY);
    if (fd == -1)
        return false;

    // one read is enough, the buffer keeps room for the terminator
    char status[4096];
    const ssize_t nbytes = read(fd, status, sizeof(status) - 1);
    close(fd);

    if (nbytes <= 0)
        return false;

    status[nbytes] = '\0';

    const char key[] = "TracerPid:";
    const char* cursor = strstr(status, key);
    if (!cursor)
        return false;

    // step over the key itself and the whitespace that follows it
    const char* const last = status + nbytes;
    cursor += sizeof(key) - 1;
    while (cursor <= last && isspace(*cursor))
        ++cursor;

    if (cursor > last)
        return false;

    // a pid never starts with '0', so a leading '0' means "no tracer"
    return isdigit(*cursor) != 0 && *cursor != '0';
#elif defined __APPLE__
    // https://stackoverflow.com/questions/2200277/detecting-debugger-on-mac-os-x
    struct kinfo_proc proc_info;
    proc_info.kp_proc.p_flag = 0;

    // kern.proc.pid.<own pid>
    int name[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PID, getpid()};

    size_t proc_info_size = sizeof(proc_info);
    sysctl(name, sizeof(name) / sizeof(*name), &proc_info, &proc_info_size, NULL, 0);

    return ((proc_info.kp_proc.p_flag & P_TRACED) != 0);
#else
    // unknown platform :(
    fprintf(stderr, "unknown platform!\n");
    return false;
#endif
}

#if defined __ANDROID__ || defined __OHOS__ || defined __linux__

// auxiliary vector entry tags, passed as `type` to the get_elf_hwcap*() helpers
#define AT_HWCAP  16
#define AT_HWCAP2 26

// feature bits inside the hwcap bitmaps; HWCAP2_* bits belong to the
// AT_HWCAP2 value, all others to AT_HWCAP
#if __aarch64__
// from arch/arm64/include/uapi/asm/hwcap.h
#define HWCAP_ASIMD     (1 << 1)
#define HWCAP_ASIMDHP   (1 << 10)
#define HWCAP_CPUID     (1 << 11)
#define HWCAP_ASIMDDP   (1 << 20)
#define HWCAP_SVE       (1 << 22)
#define HWCAP_ASIMDFHM  (1 << 23)
#define HWCAP2_SVE2     (1 << 1)
#define HWCAP2_SVEI8MM  (1 << 9)
#define HWCAP2_SVEF32MM (1 << 10)
#define HWCAP2_SVEBF16  (1 << 12)
#define HWCAP2_I8MM     (1 << 13)
#define HWCAP2_BF16     (1 << 14)
#else
// note: this branch is taken on every non-aarch64 target, not only on 32-bit arm
// from arch/arm/include/uapi/asm/hwcap.h
#define HWCAP_EDSP  (1 << 7)
#define HWCAP_NEON  (1 << 12)
#define HWCAP_VFPv4 (1 << 16)
#endif

#if __mips__
// from arch/mips/include/uapi/asm/hwcap.h
#define HWCAP_MIPS_MSA     (1 << 1)
#define HWCAP_LOONGSON_MMI (1 << 11)
#endif

#if __loongarch64
// from arch/loongarch/include/uapi/asm/hwcap.h
#define HWCAP_LOONGARCH_LSX  (1 << 4)
#define HWCAP_LOONGARCH_LASX (1 << 5)
#endif

#if __riscv
// from arch/riscv/include/uapi/asm/hwcap.h
// one bit per single-letter extension, indexed by its distance from 'A'
#define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A'))
#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))
#endif

#if defined __ANDROID__ || defined __OHOS__
// Probe the system's C library for a 'getauxval' function and call it if
// it exits, or return 0 for failure. This function is available since API
// level 18.
//
// HarmonyOS NEXT support `getauxval` directly.
//
// Note that getauxval() can't really be re-implemented here, because
// its implementation does not parse /proc/self/auxv. Instead it depends
// on values  that are passed by the kernel at process-init time to the
// C runtime initialization layer.
// Query auxiliary vector entry `type` through the C library.
// Returns the entry value, or 0 when it could not be obtained.
static unsigned int get_elf_hwcap_from_getauxval(unsigned int type)
{
#if defined __OHOS__
    return getauxval(type);
#else
#if __ANDROID_API__ >= 18
    // getauxval() can be called directly at this api level;
    // a zero result still falls through to the dynamic lookup below
    {
        const unsigned int direct = getauxval(type);
        if (direct)
            return direct;
    }
#endif

    typedef unsigned long (*getauxval_fn)(unsigned long);

    // clear any stale error before dlopen
    dlerror();
    void* libc = dlopen("libc.so", RTLD_NOW);
    if (!libc)
    {
        NCNN_LOGE("dlopen libc.so failed %s", dlerror());
        return 0;
    }

    unsigned int hwcap = 0;

    getauxval_fn lookup = (getauxval_fn)dlsym(libc, "getauxval");
    if (lookup)
    {
        // Note: getauxval() returns 0 on failure. Doesn't touch errno.
        hwcap = (unsigned int)lookup(type);
    }
    else
    {
        NCNN_LOGE("dlsym getauxval failed");
    }

    dlclose(libc);

    return hwcap;
#endif
}
#endif // defined __ANDROID__ || defined __OHOS__

// extract the ELF HW capabilities bitmap from /proc/self/auxv
// Scan /proc/self/auxv for the entry whose tag equals `type`.
// Returns its value, or 0 when the file cannot be opened or no such entry
// is found before the terminating all-zero entry.
static unsigned int get_elf_hwcap_from_proc_self_auxv(unsigned int type)
{
    FILE* auxv = fopen("/proc/self/auxv", "rb");
    if (!auxv)
    {
        NCNN_LOGE("fopen /proc/self/auxv failed");
        return 0;
    }

    // width of one auxv field depends on the target
#if __aarch64__ || __mips64 || __riscv_xlen == 64 || __loongarch64
    typedef uint64_t auxv_field_t;
#else
    typedef unsigned int auxv_field_t;
#endif

    struct
    {
        auxv_field_t tag;
        auxv_field_t value;
    } record;

    unsigned int hwcap = 0;

    // stop on a short read, on the terminator or on the first match
    while (fread((char*)&record, sizeof(record), 1, auxv) == 1)
    {
        const bool is_terminator = record.tag == 0 && record.value == 0;
        if (is_terminator)
            break;

        if (record.tag == type)
        {
            hwcap = record.value;
            break;
        }
    }

    fclose(auxv);

    return hwcap;
}

// Obtain the hwcap bitmap for auxv entry `type`.
// getauxval is tried first on android / ohos; /proc/self/auxv is the fallback
// whenever the result so far is zero.
static unsigned int get_elf_hwcap(unsigned int type)
{
    unsigned int caps = 0;

#if defined __ANDROID__ || defined __OHOS__
    caps = get_elf_hwcap_from_getauxval(type);
#endif

    if (caps == 0)
        caps = get_elf_hwcap_from_proc_self_auxv(type);

#if defined __ANDROID__
#if __aarch64__
    if (type == AT_HWCAP)
    {
        // samsung exynos9810 on android pre-9 incorrectly reports armv8.2
        // for little cores, but big cores only support armv8.0
        // drop all armv8.2 features used by ncnn for preventing SIGILLs
        // ref https://reviews.llvm.org/D114523
        char soc_name[PROP_VALUE_MAX];
        const int soc_name_len = __system_property_get("ro.arch", soc_name);

        const bool is_exynos9810 = soc_name_len > 0 && strncmp(soc_name, "exynos9810", 10) == 0;
        if (is_exynos9810)
            caps &= ~(HWCAP_ASIMDHP | HWCAP_ASIMDDP);
    }
#endif // __aarch64__
#endif // defined __ANDROID__

    return caps;
}
#endif // defined __ANDROID__ || defined __OHOS__ || defined __linux__

#if __APPLE__
// Value of the "hw.cpufamily" sysctl, 0 if the query leaves it untouched.
static unsigned int get_hw_cpufamily()
{
    unsigned int family = 0;
    size_t family_size = sizeof(family);

    // return code is ignored, the zero default covers failure
    sysctlbyname("hw.cpufamily", &family, &family_size, NULL, 0);

    return family;
}

// Value of the "hw.cputype" sysctl, 0 if the query leaves it untouched.
static cpu_type_t get_hw_cputype()
{
    cpu_type_t type = 0;
    size_t type_size = sizeof(type);

    // return code is ignored, the zero default covers failure
    sysctlbyname("hw.cputype", &type, &type_size, NULL, 0);

    return type;
}

// Value of the "hw.cpusubtype" sysctl, 0 if the query leaves it untouched.
static cpu_subtype_t get_hw_cpusubtype()
{
    cpu_subtype_t subtype = 0;
    size_t subtype_size = sizeof(subtype);

    // return code is ignored, the zero default covers failure
    sysctlbyname("hw.cpusubtype", &subtype, &subtype_size, NULL, 0);

    return subtype;
}

// Read the sysctl named `cap` into a 64-bit buffer and return it as int.
// The result is 0 if the query leaves the buffer untouched.
static int get_hw_capability(const char* cap)
{
    int64_t capability = 0;
    size_t capability_size = sizeof(capability);

    // return code is ignored, the zero default covers failure
    sysctlbyname(cap, &capability, &capability_size, NULL, 0);

    return (int)capability;
}
#endif // __APPLE__

#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
// Execute cpuid for leaf `level`.
// out[0..3] receive eax, ebx, ecx, edx in that order.
static inline void x86_cpuid(int level, unsigned int out[4])
{
#if defined(_MSC_VER) && !defined(__clang__)
    // msvc intrinsic, clang in msvc mode takes the next branch instead
    __cpuid((int*)out, level);
#elif defined(__clang__) || defined(__GNUC__)
    // the return value of __get_cpuid is ignored here
    // NOTE(review): presumably `out` stays untouched when the leaf is not
    // supported, which is why callers zero-initialize it - confirm in cpuid.h
    __get_cpuid(level, out, out + 1, out + 2, out + 3);
#else
    // no known way to run cpuid, report all-zero registers
    NCNN_LOGE("x86_cpuid is unknown for current compiler");
    out[0] = 0;
    out[1] = 0;
    out[2] = 0;
    out[3] = 0;
#endif
}

// Execute cpuid for leaf `level` with sub-leaf `sublevel`.
// out[0..3] receive eax, ebx, ecx, edx in that order.
static inline void x86_cpuid_sublevel(int level, int sublevel, unsigned int out[4])
{
#if defined(_MSC_VER)
    // unlike x86_cpuid, this branch is also taken by clang in msvc mode
    __cpuidex((int*)out, level, sublevel);
#elif defined(__clang__) || defined(__GNUC__)
    __cpuid_count(level, sublevel, out[0], out[1], out[2], out[3]);
#else
    // no known way to run cpuid, report all-zero registers
    NCNN_LOGE("x86_cpuid_sublevel is unknown for current compiler");
    out[0] = 0;
    out[1] = 0;
    out[2] = 0;
    out[3] = 0;
#endif
}

// Read extended control register 0 and return it truncated to int.
// Callers test the OSXSAVE cpuid bit before calling this.
static inline int x86_get_xcr0()
{
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
    return _xgetbv(0);
#elif defined(__i386__) || defined(__x86_64__)
    int xcr0 = 0;
    // raw opcode bytes of xgetbv so that old assemblers accept it
    // input ecx = 0 selects xcr0, low half comes back in eax, edx is clobbered
    asm(".byte 0x0f, 0x01, 0xd0"
        : "=a"(xcr0)
        : "c"(0)
        : "%edx");
    return xcr0;
#else
    NCNN_LOGE("x86_get_xcr0 is unknown for current compiler");
    return 0xffffffff; // assume it will work
#endif
}

// Returns 1 when AVX is usable: the cpu reports AVX, XSAVE and OSXSAVE and
// the OS has enabled the xmm/ymm state in xcr0. Returns 0 otherwise.
static int get_cpu_support_x86_avx()
{
    unsigned int regs[4] = {0};

    // leaf 0: eax holds the highest supported basic leaf
    x86_cpuid(0, regs);
    const int max_basic_leaf = regs[0];
    if (max_basic_leaf < 1)
        return 0;

    // leaf 1 ecx: bit 26 XSAVE, bit 27 OSXSAVE, bit 28 AVX - all mandatory
    x86_cpuid(1, regs);
    const unsigned int avx_xsave_bits = (1u << 26) | (1u << 27) | (1u << 28);
    if ((regs[2] & avx_xsave_bits) != avx_xsave_bits)
        return 0;

    // xcr0 bits 1 and 2 must both be enabled by the kernel
    if ((x86_get_xcr0() & 6) != 6)
        return 0;

    return 1;
}

// Returns non-zero when FMA is usable: AVX prerequisites are met and
// cpuid leaf 1 ecx bit 12 is set. Returns 0 otherwise.
static int get_cpu_support_x86_fma()
{
    unsigned int regs[4] = {0};

    // leaf 0: eax holds the highest supported basic leaf
    x86_cpuid(0, regs);
    const int max_basic_leaf = regs[0];
    if (max_basic_leaf < 7)
        return 0;

    // leaf 1 ecx: bit 26 XSAVE, bit 27 OSXSAVE, bit 28 AVX - all mandatory
    x86_cpuid(1, regs);
    const unsigned int leaf1_ecx = regs[2];
    const unsigned int avx_xsave_bits = (1u << 26) | (1u << 27) | (1u << 28);
    if ((leaf1_ecx & avx_xsave_bits) != avx_xsave_bits)
        return 0;

    // xcr0 bits 1 and 2 must both be enabled by the kernel
    if ((x86_get_xcr0() & 6) != 6)
        return 0;

    return leaf1_ecx & (1u << 12);
}

// Returns non-zero when extended cpuid leaf 0x80000001 ecx bit 11 is set,
// 0 when that leaf is not available or the bit is clear.
static int get_cpu_support_x86_xop()
{
    unsigned int regs[4] = {0};

    // leaf 0x80000000: eax holds the highest supported extended leaf
    x86_cpuid(0x80000000, regs);
    const unsigned int max_extended_leaf = regs[0];
    if (max_extended_leaf < 0x80000001)
        return 0;

    x86_cpuid(0x80000001, regs);
    const unsigned int xop_bit = 1u << 11;

    return regs[2] & xop_bit;
}

static int get_cpu_support_x86_f16c()
{
    unsigned int cpu_info[4] = {0};
    x86_cpuid(0, cpu_info);

    int nIds = cpu_info[0];
    if (nIds < 1)
        return 0;

    x86_cpuid(1, cpu_info);

    return cpu_info[2] & (1u << 29);
}

// Returns non-zero when AVX2 is usable: AVX prerequisites are met and
// cpuid leaf 7 sub-leaf 0 ebx bit 5 is set. Returns 0 otherwise.
static int get_cpu_support_x86_avx2()
{
    unsigned int regs[4] = {0};

    // leaf 0: eax holds the highest supported basic leaf
    x86_cpuid(0, regs);
    const int max_basic_leaf = regs[0];
    if (max_basic_leaf < 7)
        return 0;

    // leaf 1 ecx: bit 26 XSAVE, bit 27 OSXSAVE, bit 28 AVX - all mandatory
    x86_cpuid(1, regs);
    const unsigned int avx_xsave_bits = (1u << 26) | (1u << 27) | (1u << 28);
    if ((regs[2] & avx_xsave_bits) != avx_xsave_bits)
        return 0;

    // xcr0 bits 1 and 2 must both be enabled by the kernel
    if ((x86_get_xcr0() & 6) != 6)
        return 0;

    x86_cpuid_sublevel(7, 0, regs);
    const unsigned int avx2_bit = 1u << 5;

    return regs[1] & avx2_bit;
}

// Returns non-zero when AVX-VNNI is usable: AVX prerequisites are met and
// cpuid leaf 7 sub-leaf 1 eax bit 4 is set. Returns 0 otherwise.
static int get_cpu_support_x86_avx_vnni()
{
    unsigned int regs[4] = {0};

    // leaf 0: eax holds the highest supported basic leaf
    x86_cpuid(0, regs);
    const int max_basic_leaf = regs[0];
    if (max_basic_leaf < 7)
        return 0;

    // leaf 1 ecx: bit 26 XSAVE, bit 27 OSXSAVE, bit 28 AVX - all mandatory
    x86_cpuid(1, regs);
    const unsigned int avx_xsave_bits = (1u << 26) | (1u << 27) | (1u << 28);
    if ((regs[2] & avx_xsave_bits) != avx_xsave_bits)
        return 0;

    // xcr0 bits 1 and 2 must both be enabled by the kernel
    if ((x86_get_xcr0() & 6) != 6)
        return 0;

    x86_cpuid_sublevel(7, 1, regs);
    const unsigned int avx_vnni_bit = 1u << 4;

    return regs[0] & avx_vnni_bit;
}

// Returns non-zero when AVX-VNNI-INT8 is usable: AVX prerequisites are met
// and cpuid leaf 7 sub-leaf 1 edx bit 4 is set. Returns 0 otherwise.
static int get_cpu_support_x86_avx_vnni_int8()
{
    unsigned int regs[4] = {0};

    // leaf 0: eax holds the highest supported basic leaf
    x86_cpuid(0, regs);
    const int max_basic_leaf = regs[0];
    if (max_basic_leaf < 7)
        return 0;

    // leaf 1 ecx: bit 26 XSAVE, bit 27 OSXSAVE, bit 28 AVX - all mandatory
    x86_cpuid(1, regs);
    const unsigned int avx_xsave_bits = (1u << 26) | (1u << 27) | (1u << 28);
    if ((regs[2] & avx_xsave_bits) != avx_xsave_bits)
        return 0;

    // xcr0 bits 1 and 2 must both be enabled by the kernel
    if ((x86_get_xcr0() & 6) != 6)
        return 0;

    x86_cpuid_sublevel(7, 1, regs);
    const unsigned int avx_vnni_int8_bit = 1u << 4;

    return regs[3] & avx_vnni_int8_bit;
}

// Returns non-zero when AVX-VNNI-INT16 is usable: AVX prerequisites are met
// and cpuid leaf 7 sub-leaf 1 edx bit 10 is set. Returns 0 otherwise.
static int get_cpu_support_x86_avx_vnni_int16()
{
    unsigned int regs[4] = {0};

    // leaf 0: eax holds the highest supported basic leaf
    x86_cpuid(0, regs);
    const int max_basic_leaf = regs[0];
    if (max_basic_leaf < 7)
        return 0;

    // leaf 1 ecx: bit 26 XSAVE, bit 27 OSXSAVE, bit 28 AVX - all mandatory
    x86_cpuid(1, regs);
    const unsigned int avx_xsave_bits = (1u << 26) | (1u << 27) | (1u << 28);
    if ((regs[2] & avx_xsave_bits) != avx_xsave_bits)
        return 0;

    // xcr0 bits 1 and 2 must both be enabled by the kernel
    if ((x86_get_xcr0() & 6) != 6)
        return 0;

    x86_cpuid_sublevel(7, 1, regs);
    const unsigned int avx_vnni_int16_bit = 1u << 10;

    return regs[3] & avx_vnni_int16_bit;
}

// Returns non-zero when AVX-NE-CONVERT is usable: AVX prerequisites are met
// and cpuid leaf 7 sub-leaf 1 edx bit 5 is set. Returns 0 otherwise.
static int get_cpu_support_x86_avx_ne_convert()
{
    unsigned int regs[4] = {0};

    // leaf 0: eax holds the highest supported basic leaf
    x86_cpuid(0, regs);
    const int max_basic_leaf = regs[0];
    if (max_basic_leaf < 7)
        return 0;

    // leaf 1 ecx: bit 26 XSAVE, bit 27 OSXSAVE, bit 28 AVX - all mandatory
    x86_cpuid(1, regs);
    const unsigned int avx_xsave_bits = (1u << 26) | (1u << 27) | (1u << 28);
    if ((regs[2] & avx_xsave_bits) != avx_xsave_bits)
        return 0;

    // xcr0 bits 1 and 2 must both be enabled by the kernel
    if ((x86_get_xcr0() & 6) != 6)
        return 0;

    x86_cpuid_sublevel(7, 1, regs);
    const unsigned int avx_ne_convert_bit = 1u << 5;

    return regs[3] & avx_ne_convert_bit;
}

// Probe for the AVX-512 baseline used by ncnn.
// On Apple the avx512f/bw/cd/dq/vl sysctl capabilities must all be present; elsewhere the
// equivalent five cpuid bits must all be set and the OS must have enabled the extended state.
// Returns non-zero when usable, 0 otherwise.
static int get_cpu_support_x86_avx512()
{
#if __APPLE__
    return get_hw_capability("hw.optional.avx512f")
           && get_hw_capability("hw.optional.avx512bw")
           && get_hw_capability("hw.optional.avx512cd")
           && get_hw_capability("hw.optional.avx512dq")
           && get_hw_capability("hw.optional.avx512vl");
#else
    unsigned int regs[4] = {0};

    // leaf 7 must exist
    x86_cpuid(0, regs);
    const int max_leaf = regs[0];
    if (max_leaf < 7)
        return 0;

    // leaf 1, third word: AVX (28), XSAVE (26), OSXSAVE (27)
    x86_cpuid(1, regs);
    const unsigned int avx_prereq = (1u << 28) | (1u << 26) | (1u << 27);
    if ((regs[2] & avx_prereq) != avx_prereq)
        return 0;

    // XSAVE enabled by kernel
    if ((x86_get_xcr0() & 6) != 6)
        return 0;

    // avx512 XSAVE state enabled by kernel
    if ((x86_get_xcr0() & 0xe0) != 0xe0)
        return 0;

    // leaf 7 sub-leaf 0, second word: bits 16, 17, 28, 30 and 31 are all required
    x86_cpuid_sublevel(7, 0, regs);
    const unsigned int avx512_baseline = (1u << 16) | (1u << 17) | (1u << 28) | (1u << 30) | (1u << 31);
    return (regs[1] & avx512_baseline) == avx512_baseline;
#endif
}

// Probe for AVX512-VNNI.
// Apple builds ask sysctl; other builds test the cpuid feature bit after verifying the
// AVX and AVX-512 state prerequisites. Returns non-zero when usable, 0 otherwise.
static int get_cpu_support_x86_avx512_vnni()
{
#if __APPLE__
    return get_hw_capability("hw.optional.avx512vnni");
#else
    unsigned int regs[4] = {0};

    // leaf 7 must exist
    x86_cpuid(0, regs);
    const int max_leaf = regs[0];
    if (max_leaf < 7)
        return 0;

    // leaf 1, third word: AVX (28), XSAVE (26), OSXSAVE (27)
    x86_cpuid(1, regs);
    const unsigned int avx_prereq = (1u << 28) | (1u << 26) | (1u << 27);
    if ((regs[2] & avx_prereq) != avx_prereq)
        return 0;

    // XSAVE enabled by kernel
    if ((x86_get_xcr0() & 6) != 6)
        return 0;

    // avx512 XSAVE state enabled by kernel
    if ((x86_get_xcr0() & 0xe0) != 0xe0)
        return 0;

    // feature flag: leaf 7 sub-leaf 0, third word, bit 11
    x86_cpuid_sublevel(7, 0, regs);
    return regs[2] & (1u << 11);
#endif
}

// Probe for AVX512-BF16.
// Apple builds ask sysctl. Other builds require, in order:
//   - cpuid leaf 7 to exist
//   - AVX, XSAVE and OSXSAVE reported in leaf 1
//   - the kernel to have enabled the SSE/AVX state (xcr0 bits 1-2)
//   - the kernel to have enabled the AVX-512 opmask/ZMM state (xcr0 bits 5-7)
//   - the feature bit itself (leaf 7 sub-leaf 1, first word, bit 5)
// Returns non-zero when usable, 0 otherwise.
static int get_cpu_support_x86_avx512_bf16()
{
#if __APPLE__
    return get_hw_capability("hw.optional.avx512bf16");
#else
    unsigned int cpu_info[4] = {0};
    x86_cpuid(0, cpu_info);

    int nIds = cpu_info[0];
    if (nIds < 7)
        return 0;

    x86_cpuid(1, cpu_info);
    // check AVX XSAVE OSXSAVE
    if (!(cpu_info[2] & (1u << 28)) || !(cpu_info[2] & (1u << 26)) || !(cpu_info[2] & (1u << 27)))
        return 0;

    // check XSAVE enabled by kernel
    if ((x86_get_xcr0() & 6) != 6)
        return 0;

    // check avx512 XSAVE enabled by kernel
    // the other avx512 probes in this file perform this test; without it the feature could be
    // reported on a system whose OS does not save/restore the AVX-512 register state
    if ((x86_get_xcr0() & 0xe0) != 0xe0)
        return 0;

    x86_cpuid_sublevel(7, 1, cpu_info);
    return cpu_info[0] & (1u << 5);
#endif
}

// Probe for AVX512-FP16.
// Apple builds ask sysctl; other builds test the cpuid feature bit after verifying the
// AVX and AVX-512 state prerequisites. Returns non-zero when usable, 0 otherwise.
static int get_cpu_support_x86_avx512_fp16()
{
#if __APPLE__
    return get_hw_capability("hw.optional.avx512fp16");
#else
    unsigned int regs[4] = {0};

    // leaf 7 must exist
    x86_cpuid(0, regs);
    const int max_leaf = regs[0];
    if (max_leaf < 7)
        return 0;

    // leaf 1, third word: AVX (28), XSAVE (26), OSXSAVE (27)
    x86_cpuid(1, regs);
    const bool has_avx = (regs[2] & (1u << 28)) != 0;
    const bool has_xsave = (regs[2] & (1u << 26)) != 0;
    const bool has_osxsave = (regs[2] & (1u << 27)) != 0;
    if (!(has_avx && has_xsave && has_osxsave))
        return 0;

    // XSAVE enabled by kernel
    if ((x86_get_xcr0() & 6) != 6)
        return 0;

    // avx512 XSAVE state enabled by kernel
    if ((x86_get_xcr0() & 0xe0) != 0xe0)
        return 0;

    // feature flag: leaf 7 sub-leaf 0, fourth word, bit 23
    x86_cpuid_sublevel(7, 0, regs);
    return regs[3] & (1u << 23);
#endif
}
#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)

// Count the logical processors visible to this process.
// The source of the number depends on the platform (emscripten runtime, GetSystemInfo,
// /proc/cpuinfo, sysctl hw.ncpu, or OpenMP). The result is never smaller than 1.
static int get_cpucount()
{
    int ncpu = 0;
#ifdef __EMSCRIPTEN__
    // without threading support only a single core is usable
    ncpu = emscripten_has_threading_support() ? emscripten_num_logical_cores() : 1;
#elif defined _WIN32
    SYSTEM_INFO si;
    GetSystemInfo(&si);
    ncpu = si.dwNumberOfProcessors;
#elif defined __ANDROID__ || defined __linux__
    // every cpu contributes one line starting with "processor" in /proc/cpuinfo
    FILE* cpuinfo = fopen("/proc/cpuinfo", "rb");
    if (!cpuinfo)
        return 1;

    char text[1024];
    while (!feof(cpuinfo) && fgets(text, 1024, cpuinfo))
    {
        if (memcmp(text, "processor", 9) == 0)
            ncpu++;
    }

    fclose(cpuinfo);
#elif __APPLE__
    size_t ncpu_size = sizeof(ncpu);
    sysctlbyname("hw.ncpu", &ncpu, &ncpu_size, NULL, 0);
#else
#ifdef _OPENMP
    ncpu = omp_get_max_threads();
#else
    ncpu = 1;
#endif // _OPENMP
#endif

    // never report fewer than one cpu
    return ncpu < 1 ? 1 : ncpu;
}

#if defined __ANDROID__ || defined __linux__
// Build a bitmask of the hardware threads that share a core with cpu `cpuid`.
//
// The mask is parsed from the human-readable sysfs file
//   /sys/devices/system/cpu/cpu<cpuid>/topology/thread_siblings_list
// which holds comma separated ids and/or "first-last" ranges.
// (The hexadecimal thread_siblings file was already disabled in the previous
// implementation; that unreachable code path has been removed.)
//
// Returns the mask, or -1 when the file cannot be opened or does not start with a number.
// The result is only 32 bits wide, so cpu ids are folded modulo 32 before setting a bit.
// This keeps every shift well defined (the old `1 << id` was undefined for id >= 31),
// but ids that are 32 apart alias to the same bit, and a mask with all 32 bits set is
// indistinguishable from the -1 error value.
static int get_thread_siblings(int cpuid)
{
    char path[256];
    sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpuid);

    FILE* fp = fopen(path, "rb");
    if (!fp)
        return -1;

    int id0;
    if (fscanf(fp, "%d", &id0) != 1)
    {
        // empty or malformed file
        fclose(fp);
        return -1;
    }

    unsigned int siblings = 0;
    if (id0 >= 0)
        siblings |= 1u << (id0 & 31);

    char sep;
    int id1;
    while (fscanf(fp, "%c%d", &sep, &id1) == 2)
    {
        if (sep == ',' && id1 >= 0)
        {
            // standalone id
            siblings |= 1u << (id1 & 31);
        }
        if (sep == '-' && id0 < id1)
        {
            // inclusive range id0-id1, id0 itself has already been recorded
            for (int i = id0 + 1; i <= id1; i++)
            {
                if (i >= 0)
                    siblings |= 1u << (i & 31);
            }
        }

        id0 = id1;
    }

    fclose(fp);

    return (int)siblings;
}
#endif // defined __ANDROID__ || defined __linux__

// Count physical cores (as opposed to logical cpus / hardware threads).
//   Windows        - number of RelationProcessorCore records from GetLogicalProcessorInformation
//   Linux/Android  - number of distinct thread-sibling masks over all logical cpus
//   Apple          - sysctl hw.physicalcpu_max
//   other          - same as the logical cpu count
// The result is always within [1, g_cpucount] as long as g_cpucount is at least 1: whenever
// the platform query fails or yields nothing, the logical cpu count is returned instead.
static int get_physical_cpucount()
{
    int count = 0;
#if defined _WIN32
    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
    if (glpi == NULL)
    {
        NCNN_LOGE("GetLogicalProcessorInformation is not supported");
        return g_cpucount;
    }

    // first call only queries the required buffer size
    DWORD return_length = 0;
    glpi(NULL, &return_length);

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
    if (buffer == NULL || !glpi(buffer, &return_length))
    {
        // do not walk a missing or unfilled buffer
        NCNN_LOGE("GetLogicalProcessorInformation failed");
        free(buffer);
        return g_cpucount;
    }

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
    DWORD byte_offset = 0;
    while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
    {
        if (ptr->Relationship == RelationProcessorCore)
        {
            count++;
        }

        byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
        ptr++;
    }

    free(buffer);
#elif defined __ANDROID__ || defined __linux__
    // each distinct sibling mask corresponds to one physical core
    std::vector<int> thread_set;
    for (int i = 0; i < g_cpucount; i++)
    {
        int thread_siblings = get_thread_siblings(i);
        if (thread_siblings == -1)
        {
            // ignore malformed one
            continue;
        }

        bool thread_siblings_exists = false;
        for (size_t j = 0; j < thread_set.size(); j++)
        {
            if (thread_set[j] == thread_siblings)
            {
                thread_siblings_exists = true;
                break;
            }
        }

        if (!thread_siblings_exists)
        {
            thread_set.push_back(thread_siblings);
            count++;
        }
    }
#elif __APPLE__
    size_t len = sizeof(count);
    sysctlbyname("hw.physicalcpu_max", &count, &len, NULL, 0);
#else
    count = g_cpucount;
#endif

    // cannot resolve physical cores (count < 1) or implausible value, fallback to all cpu count
    if (count < 1 || count > g_cpucount)
        count = g_cpucount;

    return count;
}

#if defined __ANDROID__ || defined __linux__
// Query sysfs for the data (or unified) cache of the given level that belongs to cpu `cpuid`
// and return the share of it available to one physical core, in bytes.
//
//   cpuid - logical cpu index used in the /sys/devices/system/cpu/cpu<N>/cache paths
//   level - cache level to look for (compared against the "level" file)
//
// Returns 0 when no matching sysfs entry exists or one of the files cannot be read/parsed.
// When the cache is private to one logical cpu its full size is returned. Otherwise the size
// is divided by the number of physical cores sharing it and rounded up to a multiple of 4K.
static int get_data_cache_size(int cpuid, int level)
{
    char path[256];

    // discover sysfs cache entry: first index whose level matches and whose type is Data/Unified
    int indexid = -1;
    for (int i = 0;; i++)
    {
        // check level
        {
            sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpuid, i);
            FILE* fp = fopen(path, "rb");
            if (!fp)
                break; // ran past the last index

            int cache_level = -1;
            int nscan = fscanf(fp, "%d", &cache_level);
            fclose(fp);
            if (nscan != 1 || cache_level != level)
                continue;
        }

        // check type
        {
            sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/type", cpuid, i);
            FILE* fp = fopen(path, "rb");
            if (!fp)
                break;

            char type[32];
            int nscan = fscanf(fp, "%31s", type);
            fclose(fp);
            if (nscan != 1 || (strcmp(type, "Data") != 0 && strcmp(type, "Unified") != 0))
                continue;
        }

        indexid = i;
        break;
    }

    if (indexid == -1)
    {
        // no sysfs entry
        return 0;
    }

    // get size, the file holds a number followed by 'K'
    int cache_size_K = 0;
    {
        sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpuid, indexid);
        FILE* fp = fopen(path, "rb");
        if (!fp)
            return 0;

        int nscan = fscanf(fp, "%dK", &cache_size_K);
        fclose(fp);
        if (nscan != 1)
        {
            NCNN_LOGE("fscanf cache_size_K error %d", nscan);
            return 0;
        }
    }

    // parse shared_cpu_map mask
    ncnn::CpuSet shared_cpu_map;
    {
        sprintf(path, "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_map", cpuid, indexid);
        FILE* fp = fopen(path, "rb");
        if (!fp)
            return 0;

        char shared_cpu_map_str[256];
        int nscan = fscanf(fp, "%255s", shared_cpu_map_str);
        fclose(fp);
        if (nscan != 1)
        {
            NCNN_LOGE("fscanf shared_cpu_map error %d", nscan);
            return 0;
        }

        const char* digits = shared_cpu_map_str;
        int len = (int)strlen(digits);

        if (len >= 2 && digits[0] == '0' && (digits[1] == 'x' || digits[1] == 'X'))
        {
            // skip leading 0x by moving the start, not by shortening the tail
            digits += 2;
            len -= 2;
        }

        // walk the hex string from its least significant (last) digit, 4 cpus per digit
        int ci = 0;
        for (int i = len - 1; i >= 0; i--)
        {
            const char c = digits[i];

            int x;
            if (c >= '0' && c <= '9')
                x = c - '0';
            else if (c >= 'a' && c <= 'f')
                x = c - 'a' + 10;
            else if (c >= 'A' && c <= 'F')
                x = c - 'A' + 10;
            else
                continue; // group separator such as ',' carries no bits

            if (x & 1) shared_cpu_map.enable(ci + 0);
            if (x & 2) shared_cpu_map.enable(ci + 1);
            if (x & 4) shared_cpu_map.enable(ci + 2);
            if (x & 8) shared_cpu_map.enable(ci + 3);

            ci += 4;
        }
    }

    if (shared_cpu_map.num_enabled() == 1)
        return cache_size_K * 1024;

    // resolve physical cpu count in the shared_cpu_map
    int shared_logical_cpu_count = 0;
    int shared_physical_cpu_count = 0;
    {
        std::vector<int> thread_set;
        for (int i = 0; i < g_cpucount; i++)
        {
            if (!shared_cpu_map.is_enabled(i))
                continue;

            shared_logical_cpu_count++;

            int thread_siblings = get_thread_siblings(i);
            if (thread_siblings == -1)
            {
                // ignore malformed one
                continue;
            }

            bool thread_siblings_exists = false;
            for (size_t j = 0; j < thread_set.size(); j++)
            {
                if (thread_set[j] == thread_siblings)
                {
                    thread_siblings_exists = true;
                    break;
                }
            }

            if (!thread_siblings_exists)
            {
                thread_set.push_back(thread_siblings);
                shared_physical_cpu_count++;
            }
        }
    }

    if (shared_physical_cpu_count == 0)
    {
        // sibling information unavailable: treat every sharing logical cpu as a core,
        // and never divide by zero
        shared_physical_cpu_count = shared_logical_cpu_count > 0 ? shared_logical_cpu_count : 1;
    }

    // return per-physical-core cache size with 4K aligned
    cache_size_K = (cache_size_K / shared_physical_cpu_count + 3) / 4 * 4;

    return cache_size_K * 1024;
}

// Return the data cache size of the requested level as seen by a "big" cpu.
// The first cpu enabled in g_cpu_affinity_mask_big is probed; cpu0 is used when the big
// mask is empty (smp cpu) or, unexpectedly, has no enabled cpu below g_cpucount.
static int get_big_cpu_data_cache_size(int level)
{
    int probe_cpu = 0;

    if (g_cpu_affinity_mask_big.num_enabled() != 0)
    {
        for (int cpu = 0; cpu < g_cpucount; cpu++)
        {
            if (!g_cpu_affinity_mask_big.is_enabled(cpu))
                continue;

            probe_cpu = cpu;
            break;
        }
    }

    return get_data_cache_size(probe_cpu, level);
}
#endif // defined __ANDROID__ || defined __linux__

// Determine the level-2 cache size in bytes.
//   Windows        - largest level-2 entry reported by GetLogicalProcessorInformation
//   Linux/Android  - sysfs value for a big cpu, then sysconf(_SC_LEVEL2_CACHE_SIZE) if available
//   Apple          - hw.perflevel0.l2cachesize divided by hw.perflevel0.cpusperl2 when > 1
// When nothing usable is found (size <= 0) an architecture dependent default is returned,
// so the result is always positive.
static int get_cpu_level2_cachesize()
{
    int size = 0;
#if defined _WIN32
    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
    if (glpi != NULL)
    {
        // first call only queries the required buffer size
        DWORD return_length = 0;
        glpi(NULL, &return_length);

        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);

        // only walk the buffer when it was allocated and actually filled
        if (buffer != NULL && glpi(buffer, &return_length))
        {
            PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
            DWORD byte_offset = 0;
            while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
            {
                if (ptr->Relationship == RelationCache)
                {
                    PCACHE_DESCRIPTOR Cache = &ptr->Cache;
                    if (Cache->Level == 2)
                    {
                        size = std::max(size, (int)Cache->Size);
                    }
                }

                byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
                ptr++;
            }
        }

        free(buffer);
    }
#elif defined __ANDROID__ || defined __linux__
    size = get_big_cpu_data_cache_size(2);
#if defined(_SC_LEVEL2_CACHE_SIZE)
    if (size <= 0)
        size = sysconf(_SC_LEVEL2_CACHE_SIZE);
#endif
#elif __APPLE__
    // perflevel 0 is the higher performance cluster
    int cpusperl2 = get_hw_capability("hw.perflevel0.cpusperl2");
    int l2cachesize = get_hw_capability("hw.perflevel0.l2cachesize");
    size = cpusperl2 > 1 ? l2cachesize / cpusperl2 : l2cachesize;
#endif

    // fallback to a common value
    if (size <= 0)
    {
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
        // later checks override earlier ones, so the most capable instruction set wins
        size = 64 * 1024;
        if (g_cpu_support_x86_avx)
            size = 128 * 1024;
        if (g_cpu_support_x86_avx2)
            size = 256 * 1024;
        if (g_cpu_support_x86_avx512)
            size = 1024 * 1024;
#elif __aarch64__
        size = 256 * 1024;
#elif __arm__
        size = 128 * 1024;
#else
        // is 64k still too large here ?
        size = 64 * 1024;
#endif
    }

    return size;
}

// Determine the level-3 cache size in bytes.
//   Windows        - largest level-3 entry reported by GetLogicalProcessorInformation
//   Linux/Android  - sysfs value for a big cpu, then sysconf(_SC_LEVEL3_CACHE_SIZE) if available
//   Apple          - hw.perflevel0.l3cachesize (the size shared among all cpus)
// Returns 0 when the machine has no level-3 cache or the size cannot be determined;
// the result is never negative.
static int get_cpu_level3_cachesize()
{
    int size = 0;
#if defined _WIN32
    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
    if (glpi != NULL)
    {
        // first call only queries the required buffer size
        DWORD return_length = 0;
        glpi(NULL, &return_length);

        PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);

        // only walk the buffer when it was allocated and actually filled
        if (buffer != NULL && glpi(buffer, &return_length))
        {
            PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
            DWORD byte_offset = 0;
            while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
            {
                if (ptr->Relationship == RelationCache)
                {
                    PCACHE_DESCRIPTOR Cache = &ptr->Cache;
                    if (Cache->Level == 3)
                    {
                        size = std::max(size, (int)Cache->Size);
                    }
                }

                byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
                ptr++;
            }
        }

        free(buffer);
    }
#elif defined __ANDROID__ || defined __linux__
    size = get_big_cpu_data_cache_size(3);
#if defined(_SC_LEVEL3_CACHE_SIZE)
    if (size <= 0)
        size = sysconf(_SC_LEVEL3_CACHE_SIZE);
#endif
#elif __APPLE__
    // perflevel 0 is the higher performance cluster
    // get the size shared among all cpus
    size = get_hw_capability("hw.perflevel0.l3cachesize");
#endif

    // l3 cache size can be zero, but an error value such as -1 from sysconf must not leak out
    if (size < 0)
        size = 0;

    return size;
}

#if defined _WIN32
// Collect the logical processors that belong to a core exposing more than one hardware thread.
// Returns an empty set when GetLogicalProcessorInformation is unavailable or fails.
static ncnn::CpuSet get_smt_cpu_mask()
{
    ncnn::CpuSet smt_cpu_mask;

    typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
    LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
    if (glpi == NULL)
    {
        NCNN_LOGE("GetLogicalProcessorInformation is not supported");
        return smt_cpu_mask;
    }

    // first call only queries the required buffer size
    DWORD return_length = 0;
    glpi(NULL, &return_length);

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length);
    if (buffer == NULL || !glpi(buffer, &return_length))
    {
        // do not walk a missing or unfilled buffer
        NCNN_LOGE("GetLogicalProcessorInformation failed");
        free(buffer);
        return smt_cpu_mask;
    }

    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer;
    DWORD byte_offset = 0;
    while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length)
    {
        if (ptr->Relationship == RelationProcessorCore)
        {
            // ProcessorMask lists the logical processors of this core
            ncnn::CpuSet smt_set;
            smt_set.mask = ptr->ProcessorMask;
            if (smt_set.num_enabled() > 1)
            {
                // this core is smt
                smt_cpu_mask.mask |= smt_set.mask;
            }
        }

        byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
        ptr++;
    }

    free(buffer);

    return smt_cpu_mask;
}

static std::vector<int> get_max_freq_mhz()
{
    typedef struct _PROCESSOR_POWER_INFORMATION
    {
        ULONG Number;
        ULONG MaxMhz;
        ULONG CurrentMhz;
        ULONG MhzLimit;
        ULONG MaxIdleState;
        ULONG CurrentIdleState;
    } PROCESSOR_POWER_INFORMATION, *PPROCESSOR_POWER_INFORMATION;

    HMODULE powrprof = LoadLibrary(TEXT("powrprof.dll"));

    typedef LONG(WINAPI * LPFN_CNPI)(POWER_INFORMATION_LEVEL, PVOID, ULONG, PVOID, ULONG);
    LPFN_CNPI cnpi = (LPFN_CNPI)GetProcAddress(powrprof, "CallNtPowerInformation");
    if (cnpi == NULL)
    {
        NCNN_LOGE("CallNtPowerInformation is not supported");
        FreeLibrary(powrprof);
        return std::vector<int>(g_cpucount, 0);
    }

    DWORD return_length = sizeof(PROCESSOR_POWER_INFORMATION) * g_cpucount;
    PPROCESSOR_POWER_INFORMATION buffer = (PPROCESSOR_POWER_INFORMATION)malloc(return_length);

    cnpi(ProcessorInformation, NULL, 0, buffer, return_length);

    std::vector<int> ret;
    for (int i = 0; i < g_cpucount; i++)
    {
        ULONG max_mhz = buffer[i].MaxMhz;
        ret.push_back(max_mhz);
    }

    free(buffer);
    FreeLibrary(powrprof);
    return ret;
}

// Pin the calling thread to the cpus in thread_affinity_mask.
// Returns 0 on success, -1 (after logging) when SetThreadAffinityMask reports failure.
static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
{
    // the call hands back the previous mask; zero is treated as failure
    const DWORD_PTR previous_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask);
    if (previous_mask != 0)
        return 0;

    NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError());
    return -1;
}
#endif // defined _WIN32

#if defined __ANDROID__ || defined __linux__
// Scan an opened time_in_state file ("<freq_khz> <time>" pairs) and return the largest
// frequency seen, or 0 when no pair could be parsed. The stream is left open.
static int scan_time_in_state_max_khz(FILE* fp)
{
    int best_khz = 0;
    for (;;)
    {
        if (feof(fp))
            break;

        int khz = 0;
        if (fscanf(fp, "%d %*d", &khz) != 1)
            break;

        if (khz > best_khz)
            best_khz = khz;
    }

    return best_khz;
}

// Return the maximum frequency of cpu `cpuid` in kHz.
// Sources are tried in this order:
//   1. cpufreq/stats/cpu<N>/time_in_state (all possible cpus); its result is final, even 0
//   2. cpu<N>/cpufreq/stats/time_in_state (online cpus); used only when non-zero
//   3. cpu<N>/cpufreq/cpuinfo_max_freq
// Returns -1 when the last source cannot be opened or parsed.
static int get_max_freq_khz(int cpuid)
{
    char path[256];
    FILE* fp;

    // first try, for all possible cpu
    sprintf(path, "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
    fp = fopen(path, "rb");
    if (fp)
    {
        const int khz = scan_time_in_state_max_khz(fp);
        fclose(fp);
        return khz;
    }

    // second try, for online cpu
    sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid);
    fp = fopen(path, "rb");
    if (fp)
    {
        const int khz = scan_time_in_state_max_khz(fp);
        fclose(fp);

        if (khz != 0)
            return khz;
    }

    // third try, for online cpu
    sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
    fp = fopen(path, "rb");
    if (!fp)
        return -1;

    int max_freq_khz = -1;
    const int nscan = fscanf(fp, "%d", &max_freq_khz);
    if (nscan != 1)
    {
        NCNN_LOGE("fscanf cpuinfo_max_freq error %d", nscan);
    }
    fclose(fp);

    return max_freq_khz;
}

// Tell whether cpu `cpuid` shares its core with another hardware thread.
// The core's cpu list is read from topology/core_cpus_list, falling back to
// topology/thread_siblings_list; a ',' or '-' anywhere in it means more than one cpu is listed.
// Returns false when neither file can be opened.
static bool is_smt_cpu(int cpuid)
{
    // https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-72
    char path[256];
    sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/core_cpus_list", cpuid);

    FILE* fp = fopen(path, "rb");
    if (!fp)
    {
        sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpuid);
        fp = fopen(path, "rb");
    }

    if (!fp)
        return false;

    // stop at the first list separator or at end of file
    bool found_separator = false;
    while (!found_separator && !feof(fp))
    {
        const char ch = fgetc(fp);
        found_separator = (ch == ',' || ch == '-');
    }

    fclose(fp);

    return found_separator;
}

// Pin the calling thread to the cpus in thread_affinity_mask via the raw
// sched_setaffinity syscall. Returns 0 on success, -1 (after logging) otherwise.
static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
{
    // the syscall wants the kernel thread id of the caller
#if defined(__BIONIC__) && !defined(__OHOS__)
    pid_t tid = gettid();
#else
    pid_t tid = syscall(SYS_gettid);
#endif

    const int rc = syscall(__NR_sched_setaffinity, tid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
    if (rc == 0)
        return 0;

    NCNN_LOGE("syscall error %d", rc);
    return -1;
}
#endif // defined __ANDROID__ || defined __linux__

#if __APPLE__
// Express the requested affinity as a mach thread affinity tag for the calling thread.
// The tag is (index of the first enabled cpu + 1), or THREAD_AFFINITY_TAG_NULL when the
// mask is empty. Returns 0 on success or when the policy is not supported, -1 otherwise.
static int set_sched_affinity(const ncnn::CpuSet& thread_affinity_mask)
{
    // https://developer.apple.com/library/archive/releasenotes/Performance/RN-AffinityAPI/index.html
    // http://www.hybridkernel.com/2015/01/18/binding_threads_to_cores_osx.html
    // https://gist.github.com/Coneko/4234842

    // This is a quite outdated document. Apple will not allow developers to set CPU affinity.
    // In OS X 10.5 it worked, later it became a suggestion to OS X, then in 10.10 or so (as well in later ones), macOS will ignore any affinity settings.
    // see https://github.com/Tencent/ncnn/pull/2335#discussion_r528233919   --- AmeAkio

    thread_affinity_policy_data_t policy_data;
    policy_data.affinity_tag = THREAD_AFFINITY_TAG_NULL;

    // pick the first enabled cpu, one bit per possible cpu in the policy word
    const int cpu_bits = (int)sizeof(thread_affinity_mask.policy) * 8;
    for (int cpu = 0; cpu < cpu_bits; cpu++)
    {
        if (!thread_affinity_mask.is_enabled(cpu))
            continue;

        policy_data.affinity_tag = cpu + 1;
        break;
    }

    mach_port_t self_thread = pthread_mach_thread_np(pthread_self());

    int kr = thread_policy_set(self_thread, THREAD_AFFINITY_POLICY, (thread_policy_t)&policy_data, THREAD_AFFINITY_POLICY_COUNT);
    if (kr == 0 || kr == KERN_NOT_SUPPORTED)
        return 0;

    NCNN_LOGE("thread_policy_set error %d", kr);
    return -1;
}
#endif // __APPLE__

// Fill the three affinity masks for the current machine.
//   mask_all    - reset, then every cpu index in [0, g_cpucount) is enabled
//   mask_little - cpus classified as little / efficiency cores
//   mask_big    - cpus classified as big / performance cores
// Classification per platform:
//   Windows (SDK >= Win7) - EfficiencyClass from GetLogicalProcessorInformationEx; when that
//                           entry point is missing (or the SDK is older) max frequency plus
//                           SMT detection is used instead
//   Linux/Android         - max frequency from sysfs plus SMT detection
//   Apple                 - hw.nperflevels / hw.perflevel0.logicalcpu_max
//   other                 - everything is a big core
// NOTE(review): only the "all cores are equal" paths clear mask_little and assign mask_big;
// the other paths just enable bits, so they presumably rely on the caller passing empty
// masks - confirm against the call site.
// NOTE(review): on the Windows path the handle returned by LoadLibrary is never released with
// FreeLibrary, and the two early returns leave mask_little / mask_big untouched.
static void initialize_cpu_thread_affinity_mask(ncnn::CpuSet& mask_all, ncnn::CpuSet& mask_little, ncnn::CpuSet& mask_big)
{
    mask_all.disable_all();
    for (int i = 0; i < g_cpucount; i++)
    {
        mask_all.enable(i);
    }

#if defined _WIN32
// Check SDK >= Win7
#if _WIN32_WINNT >= _WIN32_WINNT_WIN7 // win7

    // Load GetLogicalProcessorInformationEx
    HMODULE kernel32 = LoadLibrary(TEXT("kernel32.dll"));
    if (!kernel32)
    {
        NCNN_LOGE("LoadLibrary kernel32.dll failed");
        return;
    }

    typedef BOOL(WINAPI * LPFN_GLPIE)(LOGICAL_PROCESSOR_RELATIONSHIP, PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, PDWORD);
    LPFN_GLPIE glpie = (LPFN_GLPIE)GetProcAddress(kernel32, "GetLogicalProcessorInformationEx");

    if (glpie != NULL)
    {
        // first call only queries the required buffer size
        DWORD bufferSize = 0;
        glpie(RelationProcessorCore, nullptr, &bufferSize);
        std::vector<BYTE> buffer(bufferSize);
        if (!glpie(RelationProcessorCore, (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)(buffer.data()), &bufferSize))
        {
            NCNN_LOGE("GetLogicalProcessorInformationEx failed");
            return;
        }

        // A map from processor number to whether it is an E core
        std::vector<std::pair<DWORD, bool> > processorCoreType;
        BYTE maxEfficiencyClass = 0; // In a system without E cores, all cores EfficiencyClass is 0

        // records are variable sized, each one carries its own Size
        BYTE* ptr = buffer.data();
        while (ptr < buffer.data() + bufferSize)
        {
            SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX* info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX*)ptr;
            if (info->Relationship == RelationProcessorCore)
            {
                // Mingw and some old MSVC do not have EfficiencyClass in PROCESSOR_RELATIONSHIP
                // So we should redefine PROCESSOR_RELATIONSHIP
                // Because ncnn need to support c++98, so we can't use some new features in c++11
                // So there is a ugly implementation

                // read the second byte of the Processor record, presumably where EfficiencyClass lives
                BYTE efficiencyClass = ((BYTE*)&info->Processor)[1];

                // class 0 is classified as an E core, provided some other core has a higher class
                bool isECore = (efficiencyClass == 0);
                maxEfficiencyClass = (std::max)(maxEfficiencyClass, efficiencyClass);

                for (WORD g = 0; g < info->Processor.GroupCount; ++g)
                {
                    const GROUP_AFFINITY& ga = info->Processor.GroupMask[g];
                    KAFFINITY mask = ga.Mask;
                    WORD group = ga.Group;
                    for (int bit = 0; bit < 64; ++bit)
                    {   // for each bit in the mask
                        if (mask & (static_cast<KAFFINITY>(1) << bit))
                        {
                            // flatten (group, bit) into one processor number, 64 per group
                            DWORD processorNumber = group * 64 + bit;
                            processorCoreType.push_back(std::pair<DWORD, bool>(processorNumber, isECore));
                        }
                    }
                }
            }
            ptr += info->Size;
        }

        if (maxEfficiencyClass == 0)
        {
            // All cores are P cores
            mask_little.disable_all();
            mask_big = mask_all;
        }
        else
        {
            for (int i = 0; i < g_cpucount; i++)
            {
                // processors missing from the table default to P core
                bool isECore = false;
                for (int j = 0; j < processorCoreType.size(); j++)
                {
                    std::pair<DWORD, bool> p = processorCoreType[j];
                    if (p.first == i)
                    {
                        isECore = p.second;
                        break;
                    }
                }
                // fprintf(stderr, "processor %d is %s\n", i, isECore ? "E" : "P");

                if (isECore)
                {
                    mask_little.enable(i);
                }
                else
                {
                    mask_big.enable(i);
                }
            }
        }
    }
    else
#endif
    {
        // fallback: this block is the else-branch when the SDK check above is compiled in,
        // and the only Windows path otherwise

        // get max freq mhz for all cores
        int max_freq_mhz_min = INT_MAX;
        int max_freq_mhz_max = 0;
        std::vector<int> cpu_max_freq_mhz = get_max_freq_mhz();
        for (int i = 0; i < g_cpucount; i++)
        {
            int max_freq_mhz = cpu_max_freq_mhz[i];

            // NCNN_LOGE("%d max freq = %d khz", i, max_freq_mhz);

            if (max_freq_mhz > max_freq_mhz_max)
                max_freq_mhz_max = max_freq_mhz;
            if (max_freq_mhz < max_freq_mhz_min)
                max_freq_mhz_min = max_freq_mhz;
        }

        // midpoint between slowest and fastest core; equal to the max when min and max coincide
        int max_freq_mhz_medium = (max_freq_mhz_min + max_freq_mhz_max) / 2;
        if (max_freq_mhz_medium == max_freq_mhz_max)
        {
            mask_little.disable_all();
            mask_big = mask_all;
            return;
        }

        ncnn::CpuSet smt_cpu_mask = get_smt_cpu_mask();

        for (int i = 0; i < g_cpucount; i++)
        {
            if (smt_cpu_mask.is_enabled(i))
            {
                // always treat smt core as big core
                mask_big.enable(i);
                continue;
            }

            // below the midpoint is little, everything else is big
            if (cpu_max_freq_mhz[i] < max_freq_mhz_medium)
                mask_little.enable(i);
            else
                mask_big.enable(i);
        }
    }
#elif defined __ANDROID__ || defined __linux__
    // get max freq khz for all cores
    int max_freq_khz_min = INT_MAX;
    int max_freq_khz_max = 0;
    std::vector<int> cpu_max_freq_khz(g_cpucount);
    for (int i = 0; i < g_cpucount; i++)
    {
        int max_freq_khz = get_max_freq_khz(i);

        // NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz);

        cpu_max_freq_khz[i] = max_freq_khz;

        if (max_freq_khz > max_freq_khz_max)
            max_freq_khz_max = max_freq_khz;
        if (max_freq_khz < max_freq_khz_min)
            max_freq_khz_min = max_freq_khz;
    }

    // midpoint between slowest and fastest core; equal to the max when min and max coincide
    int max_freq_khz_medium = (max_freq_khz_min + max_freq_khz_max) / 2;
    if (max_freq_khz_medium == max_freq_khz_max)
    {
        mask_little.disable_all();
        mask_big = mask_all;
        return;
    }

    for (int i = 0; i < g_cpucount; i++)
    {
        if (is_smt_cpu(i))
        {
            // always treat smt core as big core
            mask_big.enable(i);
            continue;
        }

        // below the midpoint is little, everything else is big
        if (cpu_max_freq_khz[i] < max_freq_khz_medium)
            mask_little.enable(i);
        else
            mask_big.enable(i);
    }
#elif __APPLE__
    int nperflevels = get_hw_capability("hw.nperflevels");
    if (nperflevels == 1)
    {
        // smp models
        mask_little.disable_all();
        mask_big = mask_all;
    }
    else
    {
        // two or more clusters, level0 is the high-performance cluster
        // NOTE(review): any value other than 1 lands here, including whatever
        // get_hw_capability yields for a missing key - confirm that is intended
        int perflevel0_logicalcpu = get_hw_capability("hw.perflevel0.logicalcpu_max");
        // the first perflevel0_logicalcpu indices are treated as big, the rest as little
        for (int i = 0; i < perflevel0_logicalcpu; i++)
        {
            mask_big.enable(i);
        }
        for (int i = perflevel0_logicalcpu; i < g_cpucount; i++)
        {
            mask_little.enable(i);
        }
    }
#else
    // TODO implement me for other platforms
    mask_little.disable_all();
    mask_big = mask_all;
#endif
}

#if defined __ANDROID__ || defined __linux__
#if __aarch64__
// View of a 32-bit MIDR value either as a whole word (midr) or as its individual fields.
// The field widths add up to 32 bits. The mapping of fields to bit positions follows the
// compiler's bit-field allocation order, presumably least significant bits first on the
// targeted toolchains - TODO confirm.
union midr_info_t
{
    struct __attribute__((packed))
    {
        unsigned int revision : 4;     // 4-bit revision field
        unsigned int part : 12;        // 12-bit part number field
        unsigned int architecture : 4; // 4-bit architecture field
        unsigned int variant : 4;      // 4-bit variant field
        unsigned int implementer : 8;  // 8-bit implementer code field
    };
    unsigned int midr; // the raw register value

    // construct from a raw value, the fields then alias its bits
    midr_info_t(unsigned int _midr)
        : midr(_midr)
    {
    }
};

// Read the MIDR_EL1 value of cpu `cpuid` from sysfs.
// The file content is parsed as a hexadecimal number.
// Returns 0 when the file cannot be opened or does not parse.
static unsigned int get_midr_from_sysfs(int cpuid)
{
    char path[256];
    sprintf(path, "/sys/devices/system/cpu/cpu%d/regs/identification/midr_el1", cpuid);

    unsigned int midr_value = 0;

    FILE* fp = fopen(path, "rb");
    if (fp)
    {
        if (fscanf(fp, "%x", &midr_value) != 1)
        {
            // ignore, the value keeps its initial 0
        }

        fclose(fp);
    }

    return midr_value;
}

// Store the midr collected for one "processor" section of /proc/cpuinfo.
// A zero midr marks the cpu as waiting for a midr that is reported later
// (stored as (unsigned int)-1); a non-zero midr is stored for the cpu and
// also handed to every cpu that is still waiting.
// cpuid must be a valid index into midrs.
static void store_cpuinfo_midr(std::vector<unsigned int>& midrs, int cpuid, unsigned int midr)
{
    const unsigned int shared_midr = (unsigned int)-1;

    if (midr == 0)
    {
        // shared midr
        midrs[cpuid] = shared_midr;
        return;
    }

    midrs[cpuid] = midr;
    for (size_t i = 0; i < midrs.size(); i++)
    {
        if (midrs[i] == shared_midr)
            midrs[i] = midr;
    }
}

// Collect one midr per cpu by parsing /proc/cpuinfo.
// midrs is resized to g_cpucount entries; an entry stays 0 when nothing is
// known for that cpu.
// Returns 0 on success and -1 when /proc/cpuinfo cannot be opened.
static int get_midr_from_proc_cpuinfo(std::vector<unsigned int>& midrs)
{
    FILE* fp = fopen("/proc/cpuinfo", "rb");
    if (!fp)
        return -1;

    midrs.resize(g_cpucount, 0);

    int cpuid = -1;
    midr_info_t midr_info(0);

    // the most recent non-zero midr that was parsed, kept because midr_info
    // is cleared after every processor section
    unsigned int last_known_midr = 0;

    char line[1024];
    while (fgets(line, sizeof(line), fp))
    {
        // strncmp stops at the terminator, so short lines never compare
        // against stale bytes left in the buffer
        if (strncmp(line, "processor", 9) == 0)
        {
            // processor       : 4
            int id = -1;
            int nscan = sscanf(line, "%*[^:]: %d", &id);
            if (nscan != 1)
                continue;

            // a new section starts, flush what was gathered for the previous one
            if (cpuid >= 0 && cpuid < g_cpucount)
            {
                if (midr_info.midr != 0)
                    last_known_midr = midr_info.midr;

                store_cpuinfo_midr(midrs, cpuid, midr_info.midr);

                midr_info.midr = 0;
            }

            cpuid = id;
        }

        // fields seen before the first processor line are ignored
        if (cpuid == -1)
            continue;

        if (strncmp(line, "CPU implementer", 15) == 0)
        {
            // CPU implementer : 0x51
            unsigned int id = 0;
            int nscan = sscanf(line, "%*[^:]: %x", &id);
            if (nscan != 1)
                continue;

            midr_info.implementer = id;
        }
        else if (strncmp(line, "CPU architecture", 16) == 0)
        {
            // CPU architecture: 8
            int id = 0;
            int nscan = sscanf(line, "%*[^:]: %d", &id);
            if (nscan != 1)
                continue;

            midr_info.architecture = id;
        }
        else if (strncmp(line, "CPU variant", 11) == 0)
        {
            // CPU variant     : 0xd
            unsigned int id = 0;
            int nscan = sscanf(line, "%*[^:]: %x", &id);
            if (nscan != 1)
                continue;

            midr_info.variant = id;
        }
        else if (strncmp(line, "CPU part", 8) == 0)
        {
            // CPU part        : 0x804
            unsigned int id = 0;
            int nscan = sscanf(line, "%*[^:]: %x", &id);
            if (nscan != 1)
                continue;

            midr_info.part = id;
        }
        else if (strncmp(line, "CPU revision", 12) == 0)
        {
            // CPU revision    : 14
            int id = 0;
            int nscan = sscanf(line, "%*[^:]: %d", &id);
            if (nscan != 1)
                continue;

            midr_info.revision = id;
        }
    }

    fclose(fp);

    // flush the last processor section
    if (cpuid >= 0 && cpuid < g_cpucount)
    {
        if (midr_info.midr != 0)
            last_known_midr = midr_info.midr;

        store_cpuinfo_midr(midrs, cpuid, midr_info.midr);

        midr_info.midr = 0;
    }

    // /proc/cpuinfo may only report little/online cores on old kernel
    if (g_cpu_affinity_mask_big.num_enabled() == g_cpucount)
    {
        // assign the remaining unknown midrs for smp cpu
        // midr_info has already been cleared here, so the remembered value is used
        for (int i = 0; i < g_cpucount; i++)
        {
            if (midrs[i] == 0)
                midrs[i] = last_known_midr;
        }
    }
    else
    {
        // clear the big core midrs for hmp cpu if they are the same as little cores
        unsigned int little_midr = 0;
        for (int i = 0; i < g_cpucount; i++)
        {
            if (g_cpu_affinity_mask_little.is_enabled(i))
            {
                little_midr = midrs[i];
                break;
            }
        }

        for (int i = 0; i < g_cpucount; i++)
        {
            if (g_cpu_affinity_mask_big.is_enabled(i))
            {
                if (midrs[i] == little_midr)
                {
                    midrs[i] = 0;
                }
            }
        }
    }

    return 0;
}

// return midr for the current running core
// Reads MIDR_EL1 with an mrs instruction into a 64-bit temporary and returns
// the low 32 bits.
// NOTE(review): the only visible caller checks cpu_support_arm_cpuid() before
// calling this - presumably the read is not safe otherwise; confirm.
static unsigned int get_midr_from_register()
{
    uint64_t midr;
    asm volatile("mrs   %0, MIDR_EL1"
                 : "=r"(midr));

    return (unsigned int)midr;
}

// Query the scheduler affinity of the calling thread.
// thread_affinity_mask is cleared first and then filled by the kernel.
// Returns 0 on success and -1 when the syscall fails.
static int get_sched_affinity(ncnn::CpuSet& thread_affinity_mask)
{
    // get affinity for thread
#if defined(__BIONIC__) && !defined(__OHOS__)
    pid_t pid = gettid();
#else
    pid_t pid = syscall(SYS_gettid);
#endif

    thread_affinity_mask.disable_all();

    // The raw sched_getaffinity syscall does not return 0 on success: it
    // returns the number of bytes written into the mask. Only a negative
    // result signals an error.
    long syscallret = syscall(__NR_sched_getaffinity, pid, sizeof(cpu_set_t), &thread_affinity_mask.cpu_set);
    if (syscallret < 0)
    {
        // handle get error silently
        return -1;
    }

    return 0;
}

static int midr_is_a53_a55(unsigned int midr)
{
    // 0x 41 ? f d03 ? = arm cortex-a53
    // 0x 51 ? f 801 ? = qcom kryo200 a53
    // 0x 41 ? f d04 ? = arm cortex-a35
    // 0x 41 ? f d05 ? = arm cortex-a55
    // 0x 51 ? f 803 ? = qcom kryo300 a55
    // 0x 51 ? f 805 ? = qcom kryo400 a55

    midr_info_t midr_info(midr);

    return (midr_info.implementer == 0x41 && midr_info.part == 0xd03)
           || (midr_info.implementer == 0x51 && midr_info.part == 0x801)
           || (midr_info.implementer == 0x41 && midr_info.part == 0xd04)
           || (midr_info.implementer == 0x41 && midr_info.part == 0xd05)
           || (midr_info.implementer == 0x51 && midr_info.part == 0x803)
           || (midr_info.implementer == 0x51 && midr_info.part == 0x805);
}

// Classify the cpus of this machine by counting a53/a55 class cores.
// Three sources are tried in order: the per-cpu sysfs midr files,
// /proc/cpuinfo, and finally the little core affinity mask.
// Returns 0 when no cpu matches, 1 when every cpu matches, and 2 otherwise.
static int detect_cpu_is_arm_a53_a55()
{
    int a53_a55_cpu_count = 0;

    // first try, iterate /sys/devices/system/cpu/cpuX/regs/identification/midr_el1
    bool sysfs_midr = true;
    for (int i = 0; i < g_cpucount; i++)
    {
        unsigned int midr = 0;

        // for kernel 4.7+
        midr = get_midr_from_sysfs(i);
        if (midr == 0)
        {
            sysfs_midr = false;
            break;
        }

        if (midr_is_a53_a55(midr))
        {
            a53_a55_cpu_count++;
        }
    }

    if (!sysfs_midr)
    {
        // the sysfs pass may have counted some cpus before it gave up,
        // start over so that those cpus are not counted twice
        a53_a55_cpu_count = 0;

        // second try, collect midr from /proc/cpuinfo
        std::vector<unsigned int> midrs;
        int ret = get_midr_from_proc_cpuinfo(midrs);
        if (ret == 0 && (int)midrs.size() == g_cpucount)
        {
            for (int i = 0; i < g_cpucount; i++)
            {
                if (midr_is_a53_a55(midrs[i]))
                {
                    a53_a55_cpu_count++;
                }
            }
        }
        else
        {
            // third try, assume all aarch64 little cores are a53/a55
            a53_a55_cpu_count = g_cpu_affinity_mask_little.num_enabled();
        }
    }

    if (a53_a55_cpu_count == 0)
        return 0; // all non a53/a55

    if (a53_a55_cpu_count == g_cpucount)
        return 1; // all a53/a55

    // little cores are a53/a55
    return 2;
}
#endif // __aarch64__
#endif // defined __ANDROID__ || defined __linux__

// the initialization
// Fills every file-level g_* cpu variable: cpu counts, affinity masks,
// per-platform feature flags and cache sizes. The visible caller,
// try_initialize_global_cpu_info, runs it once.
static void initialize_global_cpu_info()
{
#if defined(_OPENMP) && (__clang__ || defined(_OPENMP_LLVM_RUNTIME))
    ncnn_kmp_env_initializer();
#endif

    // cpu counts and masks come first, the a53/a55 detection at the end of
    // this function reads g_cpucount and the little/big masks
    g_cpucount = get_cpucount();
    g_physical_cpucount = get_physical_cpucount();
    g_powersave = 0;
    initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big);

#if (defined _WIN32 && (__aarch64__ || __arm__)) || ((defined __ANDROID__ || defined __linux__) && __riscv)
    // ruapu is only initialized when no debugger is attached
    if (!is_being_debugged())
    {
        ruapu_init();
    }
#endif

#if defined _WIN32
#if __aarch64__
    // each flag is set when either ruapu or IsProcessorFeaturePresent reports it
    g_cpu_support_arm_cpuid = ruapu_supports("cpuid");
    g_cpu_support_arm_asimdhp = ruapu_supports("asimdhp") || IsProcessorFeaturePresent(43) || IsProcessorFeaturePresent(67);   // dp implies hp, 67 is PF_ARM_V82_FP16_INSTRUCTIONS_AVAILABLE
    g_cpu_support_arm_asimddp = ruapu_supports("asimddp") || IsProcessorFeaturePresent(43);                                    // 43 is PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
    g_cpu_support_arm_asimdfhm = ruapu_supports("asimdfhm") || IsProcessorFeaturePresent(66) || IsProcessorFeaturePresent(68); // bf16 or i8mm implies fhm
    g_cpu_support_arm_bf16 = ruapu_supports("bf16") || IsProcessorFeaturePresent(68);                                          // 68 is PF_ARM_V86_BF16_INSTRUCTIONS_AVAILABLE
    g_cpu_support_arm_i8mm = ruapu_supports("i8mm") || IsProcessorFeaturePresent(66);                                          // 66 is PF_ARM_V82_I8MM_INSTRUCTIONS_AVAILABLE
    g_cpu_support_arm_sve = ruapu_supports("sve") || IsProcessorFeaturePresent(46);                                            // 46 is PF_ARM_SVE_INSTRUCTIONS_AVAILABLE
    g_cpu_support_arm_sve2 = ruapu_supports("sve2") || IsProcessorFeaturePresent(47);                                          // 47 is PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE
    g_cpu_support_arm_svebf16 = ruapu_supports("svebf16") || IsProcessorFeaturePresent(52);                                    // 52 is PF_ARM_SVE_BF16_INSTRUCTIONS_AVAILABLE
    g_cpu_support_arm_svei8mm = ruapu_supports("svei8mm") || IsProcessorFeaturePresent(57);                                    // 57 is PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE
    g_cpu_support_arm_svef32mm = ruapu_supports("svef32mm") || IsProcessorFeaturePresent(58);                                  // 58 is PF_ARM_SVE_F32MM_INSTRUCTIONS_AVAILABLE
#elif __arm__
    g_cpu_support_arm_edsp = ruapu_supports("edsp");
    g_cpu_support_arm_neon = 1; // all modern windows arm devices have neon
    g_cpu_support_arm_vfpv4 = ruapu_supports("vfpv4");
#endif // __aarch64__ || __arm__
#elif defined __ANDROID__ || defined __linux__
    // the cpu_support_* functions test individual bits of these two words
    g_hwcaps = get_elf_hwcap(AT_HWCAP);
    g_hwcaps2 = get_elf_hwcap(AT_HWCAP2);
#elif __APPLE__
    g_hw_cpufamily = get_hw_cpufamily();
    g_hw_cputype = get_hw_cputype();
    g_hw_cpusubtype = get_hw_cpusubtype();
#if __aarch64__
    g_hw_optional_arm_FEAT_FP16 = get_hw_capability("hw.optional.arm.FEAT_FP16");
    g_hw_optional_arm_FEAT_DotProd = get_hw_capability("hw.optional.arm.FEAT_DotProd");
    g_hw_optional_arm_FEAT_FHM = get_hw_capability("hw.optional.arm.FEAT_FHM");
    g_hw_optional_arm_FEAT_BF16 = get_hw_capability("hw.optional.arm.FEAT_BF16");
    g_hw_optional_arm_FEAT_I8MM = get_hw_capability("hw.optional.arm.FEAT_I8MM");

    // Force the flags on for known cpu families regardless of what the
    // capability queries above returned. The cases fall through on purpose:
    // a family listed higher up also receives every flag set further down.
    switch (g_hw_cpufamily)
    {
    case CPUFAMILY_ARM_TUPAI:
    case CPUFAMILY_ARM_TAHITI:
    case CPUFAMILY_ARM_DONAN:
    case CPUFAMILY_ARM_BRAVA:
    // TODO check sve sme
    case CPUFAMILY_ARM_AVALANCHE_BLIZZARD:
    case CPUFAMILY_ARM_EVEREST_SAWTOOTH:
    case CPUFAMILY_ARM_COLL:
    case CPUFAMILY_ARM_IBIZA:
    case CPUFAMILY_ARM_LOBOS:
    case CPUFAMILY_ARM_PALMA:
        g_hw_optional_arm_FEAT_BF16 = 1;
        g_hw_optional_arm_FEAT_I8MM = 1;
    case CPUFAMILY_ARM_LIGHTNING_THUNDER:
    case CPUFAMILY_ARM_FIRESTORM_ICESTORM:
        g_hw_optional_arm_FEAT_DotProd = 1;
        g_hw_optional_arm_FEAT_FHM = 1;
    case CPUFAMILY_ARM_MONSOON_MISTRAL:
    case CPUFAMILY_ARM_VORTEX_TEMPEST:
        g_hw_optional_arm_FEAT_FP16 = 1;
    default:
        break;
    }
#endif // __aarch64__
#endif

#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
    // x86 features are cached here once for every operating system
    g_cpu_support_x86_avx = get_cpu_support_x86_avx();
    g_cpu_support_x86_fma = get_cpu_support_x86_fma();
    g_cpu_support_x86_xop = get_cpu_support_x86_xop();
    g_cpu_support_x86_f16c = get_cpu_support_x86_f16c();
    g_cpu_support_x86_avx2 = get_cpu_support_x86_avx2();
    g_cpu_support_x86_avx_vnni = get_cpu_support_x86_avx_vnni();
    g_cpu_support_x86_avx_vnni_int8 = get_cpu_support_x86_avx_vnni_int8();
    g_cpu_support_x86_avx_vnni_int16 = get_cpu_support_x86_avx_vnni_int16();
    g_cpu_support_x86_avx_ne_convert = get_cpu_support_x86_avx_ne_convert();
    g_cpu_support_x86_avx512 = get_cpu_support_x86_avx512();
    g_cpu_support_x86_avx512_vnni = get_cpu_support_x86_avx512_vnni();
    g_cpu_support_x86_avx512_bf16 = get_cpu_support_x86_avx512_bf16();
    g_cpu_support_x86_avx512_fp16 = get_cpu_support_x86_avx512_fp16();
#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)

#if defined __ANDROID__ || defined __linux__
#if __riscv
    g_cpu_support_riscv_zfh = ruapu_supports("zfh") || ruapu_supports("xtheadvector");   // xtheadvector implies zfh
    g_cpu_support_riscv_zvfh = ruapu_supports("zvfh") || ruapu_supports("xtheadvector"); // xtheadvector implies zvfh
    g_cpu_support_riscv_xtheadvector = ruapu_supports("xtheadvector");
#endif // __riscv
#endif // defined __ANDROID__ || defined __linux__

    g_cpu_level2_cachesize = get_cpu_level2_cachesize();
    g_cpu_level3_cachesize = get_cpu_level3_cachesize();

#if defined __ANDROID__ || defined __linux__
#if __aarch64__
    // stores the 0 / 1 / 2 classification returned by detect_cpu_is_arm_a53_a55
    g_cpu_is_arm_a53_a55 = detect_cpu_is_arm_a53_a55();
#endif // __aarch64__
#endif // defined __ANDROID__ || defined __linux__
}

// becomes 1 after initialize_global_cpu_info() has returned
static int g_cpu_info_initialized = 0;

// Run the global cpu detection on the first call; later calls return at once.
static inline void try_initialize_global_cpu_info()
{
    if (g_cpu_info_initialized)
        return;

    initialize_global_cpu_info();
    g_cpu_info_initialized = 1;
}

namespace ncnn {

#if defined _WIN32
CpuSet::CpuSet()
{
    disable_all();
}

void CpuSet::enable(int cpu)
{
    mask |= ((ULONG_PTR)1 << cpu);
}

void CpuSet::disable(int cpu)
{
    mask &= ~((ULONG_PTR)1 << cpu);
}

void CpuSet::disable_all()
{
    mask = 0;
}

bool CpuSet::is_enabled(int cpu) const
{
    return mask & ((ULONG_PTR)1 << cpu);
}

int CpuSet::num_enabled() const
{
    int num_enabled = 0;
    for (int i = 0; i < (int)sizeof(mask) * 8; i++)
    {
        if (is_enabled(i))
            num_enabled++;
    }

    return num_enabled;
}
#elif defined __ANDROID__ || defined __linux__
// Android / Linux implementation on top of cpu_set_t and the CPU_* macros.

// Start with an empty set.
CpuSet::CpuSet()
{
    CPU_ZERO(&cpu_set);
}

// Add cpu to the set.
void CpuSet::enable(int cpu)
{
    CPU_SET(cpu, &cpu_set);
}

// Remove cpu from the set.
void CpuSet::disable(int cpu)
{
    CPU_CLR(cpu, &cpu_set);
}

// Remove every cpu from the set.
void CpuSet::disable_all()
{
    CPU_ZERO(&cpu_set);
}

// Return true when cpu is in the set.
bool CpuSet::is_enabled(int cpu) const
{
    const bool present = CPU_ISSET(cpu, &cpu_set);
    return present;
}

// Count the cpus in the set by probing every bit of cpu_set_t.
int CpuSet::num_enabled() const
{
    const int total_bits = (int)sizeof(cpu_set_t) * 8;

    int count = 0;
    for (int cpu = 0; cpu < total_bits; cpu++)
    {
        count += is_enabled(cpu) ? 1 : 0;
    }

    return count;
}
#elif __APPLE__
// Apple implementation: the set is a bit mask with one bit per cpu index.
// Only indices below sizeof(policy) * 8 can be represented; shifting by a
// larger or negative amount is undefined, so such indices are rejected.
CpuSet::CpuSet()
{
    disable_all();
}

// Add cpu to the set. Indices that do not fit into policy are ignored.
void CpuSet::enable(int cpu)
{
    if (cpu < 0 || cpu >= (int)sizeof(policy) * 8)
        return;

    policy |= ((unsigned int)1 << cpu);
}

// Remove cpu from the set. Indices that do not fit into policy are ignored.
void CpuSet::disable(int cpu)
{
    if (cpu < 0 || cpu >= (int)sizeof(policy) * 8)
        return;

    policy &= ~((unsigned int)1 << cpu);
}

// Remove every cpu from the set.
void CpuSet::disable_all()
{
    policy = 0;
}

// Return true when cpu is in the set. Indices that do not fit into policy
// are never in the set.
bool CpuSet::is_enabled(int cpu) const
{
    if (cpu < 0 || cpu >= (int)sizeof(policy) * 8)
        return false;

    return (policy & ((unsigned int)1 << cpu)) != 0;
}

// Count the cpus in the set.
int CpuSet::num_enabled() const
{
    int num_enabled = 0;
    for (int i = 0; i < (int)sizeof(policy) * 8; i++)
    {
        if (is_enabled(i))
            num_enabled++;
    }

    return num_enabled;
}
#else
// Fallback implementation for platforms without a cpu set representation:
// nothing is stored, every cpu is reported as enabled.
CpuSet::CpuSet()
{
}

// no-op, there is no state to change
void CpuSet::enable(int /* cpu */)
{
}

// no-op, there is no state to change
void CpuSet::disable(int /* cpu */)
{
}

// no-op, there is no state to change
void CpuSet::disable_all()
{
}

// every cpu index is reported as enabled
bool CpuSet::is_enabled(int /* cpu */) const
{
    return true;
}

// reports the value of get_cpu_count()
int CpuSet::num_enabled() const
{
    return get_cpu_count();
}
#endif

// Report armv7 edsp support; non-zero means supported.
// Always 0 unless built for 32-bit arm.
int cpu_support_arm_edsp()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if __arm__ && !__aarch64__
#if defined _WIN32
    supported = g_cpu_support_arm_edsp;
#elif defined __ANDROID__ || defined __linux__
    supported = g_hwcaps & HWCAP_EDSP;
#elif __APPLE__
    supported = g_hw_cputype == CPU_TYPE_ARM;
#endif
#endif
    return supported;
}

// Report neon support; non-zero means supported.
// aarch64 builds always report 1, non-arm builds always report 0.
int cpu_support_arm_neon()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if __aarch64__
    supported = 1;
#elif __arm__
#if defined _WIN32
    supported = g_cpu_support_arm_neon;
#elif defined __ANDROID__ || defined __linux__
    supported = g_hwcaps & HWCAP_NEON;
#elif __APPLE__
    supported = g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7;
#endif
#endif
    return supported;
}

// Report vfpv4 support; non-zero means supported.
// aarch64 builds always report 1, non-arm builds always report 0.
int cpu_support_arm_vfpv4()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if __aarch64__
    supported = 1;
#elif __arm__
#if defined _WIN32
    supported = g_cpu_support_arm_vfpv4;
#elif defined __ANDROID__ || defined __linux__
    supported = g_hwcaps & HWCAP_VFPv4;
#elif __APPLE__
    supported = g_hw_cputype == CPU_TYPE_ARM && g_hw_cpusubtype > CPU_SUBTYPE_ARM_V7S;
#endif
#endif
    return supported;
}

// Report aarch64 asimd half precision support; non-zero means supported.
// Always 0 on non-aarch64 builds.
int cpu_support_arm_asimdhp()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if __aarch64__
#if defined _WIN32
    supported = g_cpu_support_arm_asimdhp;
#elif defined __ANDROID__ || defined __linux__
    supported = g_hwcaps & HWCAP_ASIMDHP;
#elif __APPLE__
    supported = g_hw_optional_arm_FEAT_FP16;
#endif
#endif
    return supported;
}

// Report aarch64 cpuid support; non-zero means supported.
// Always 0 on Apple and on non-aarch64 builds.
int cpu_support_arm_cpuid()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if __aarch64__
#if defined _WIN32
    supported = g_cpu_support_arm_cpuid;
#elif defined __ANDROID__ || defined __linux__
    supported = g_hwcaps & HWCAP_CPUID;
#endif
#endif
    return supported;
}

// Report aarch64 asimddp (dot product) support; non-zero means supported.
// Always 0 on non-aarch64 builds.
int cpu_support_arm_asimddp()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if __aarch64__
#if defined _WIN32
    supported = g_cpu_support_arm_asimddp;
#elif defined __ANDROID__ || defined __linux__
    supported = g_hwcaps & HWCAP_ASIMDDP;
#elif __APPLE__
    supported = g_hw_optional_arm_FEAT_DotProd;
#endif
#endif
    return supported;
}

// Report aarch64 asimdfhm support; non-zero means supported.
// Always 0 on non-aarch64 builds.
int cpu_support_arm_asimdfhm()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if __aarch64__
#if defined _WIN32
    supported = g_cpu_support_arm_asimdfhm;
#elif defined __ANDROID__ || defined __linux__
    supported = g_hwcaps & HWCAP_ASIMDFHM;
#elif __APPLE__
    supported = g_hw_optional_arm_FEAT_FHM;
#endif
#endif
    return supported;
}

// Report aarch64 bf16 support; non-zero means supported.
// Always 0 on non-aarch64 builds.
int cpu_support_arm_bf16()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if __aarch64__
#if defined _WIN32
    supported = g_cpu_support_arm_bf16;
#elif defined __ANDROID__ || defined __linux__
    supported = g_hwcaps2 & HWCAP2_BF16;
#elif __APPLE__
    supported = g_hw_optional_arm_FEAT_BF16;
#endif
#endif
    return supported;
}

// Report aarch64 i8mm support; non-zero means supported.
// Always 0 on non-aarch64 builds.
int cpu_support_arm_i8mm()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if __aarch64__
#if defined _WIN32
    supported = g_cpu_support_arm_i8mm;
#elif defined __ANDROID__ || defined __linux__
    supported = g_hwcaps2 & HWCAP2_I8MM;
#elif __APPLE__
    supported = g_hw_optional_arm_FEAT_I8MM;
#endif
#endif
    return supported;
}

// Report aarch64 sve support; non-zero means supported.
// Always 0 on Apple and on non-aarch64 builds.
int cpu_support_arm_sve()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if __aarch64__
#if defined _WIN32
    supported = g_cpu_support_arm_sve;
#elif defined __ANDROID__ || defined __linux__
    supported = g_hwcaps & HWCAP_SVE;
#elif __APPLE__
    supported = 0; // no known apple cpu support armv8.6 sve
#endif
#endif
    return supported;
}

// Report aarch64 sve2 support; non-zero means supported.
// Always 0 on Apple and on non-aarch64 builds.
int cpu_support_arm_sve2()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if __aarch64__
#if defined _WIN32
    supported = g_cpu_support_arm_sve2;
#elif defined __ANDROID__ || defined __linux__
    supported = g_hwcaps2 & HWCAP2_SVE2;
#elif __APPLE__
    supported = 0; // no known apple cpu support armv8.6 sve2
#endif
#endif
    return supported;
}

// Report aarch64 svebf16 support; non-zero means supported.
// Always 0 on Apple and on non-aarch64 builds.
int cpu_support_arm_svebf16()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if __aarch64__
#if defined _WIN32
    supported = g_cpu_support_arm_svebf16;
#elif defined __ANDROID__ || defined __linux__
    supported = g_hwcaps2 & HWCAP2_SVEBF16;
#elif __APPLE__
    supported = 0; // no known apple cpu support armv8.6 svebf16
#endif
#endif
    return supported;
}

// Report aarch64 svei8mm support; non-zero means supported.
// Always 0 on Apple and on non-aarch64 builds.
int cpu_support_arm_svei8mm()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if __aarch64__
#if defined _WIN32
    supported = g_cpu_support_arm_svei8mm;
#elif defined __ANDROID__ || defined __linux__
    supported = g_hwcaps2 & HWCAP2_SVEI8MM;
#elif __APPLE__
    supported = 0; // no known apple cpu support armv8.6 svei8mm
#endif
#endif
    return supported;
}

// Report aarch64 svef32mm support; non-zero means supported.
// Always 0 on Apple and on non-aarch64 builds.
int cpu_support_arm_svef32mm()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if __aarch64__
#if defined _WIN32
    supported = g_cpu_support_arm_svef32mm;
#elif defined __ANDROID__ || defined __linux__
    supported = g_hwcaps2 & HWCAP2_SVEF32MM;
#elif __APPLE__
    supported = 0; // no known apple cpu support armv8.6 svef32mm
#endif
#endif
    return supported;
}

// Report the cached x86 avx flag; always 0 on non-x86 builds.
int cpu_support_x86_avx()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
    supported = g_cpu_support_x86_avx;
#endif
    return supported;
}

// Report the cached x86 fma flag; always 0 on non-x86 builds.
int cpu_support_x86_fma()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
    supported = g_cpu_support_x86_fma;
#endif
    return supported;
}

// Report the cached x86 xop flag; always 0 on non-x86 builds.
int cpu_support_x86_xop()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
    supported = g_cpu_support_x86_xop;
#endif
    return supported;
}

// Report the cached x86 f16c flag; always 0 on non-x86 builds.
int cpu_support_x86_f16c()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
    supported = g_cpu_support_x86_f16c;
#endif
    return supported;
}

// Report the cached x86 avx2 flag; always 0 on non-x86 builds.
int cpu_support_x86_avx2()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
    supported = g_cpu_support_x86_avx2;
#endif
    return supported;
}

// Report the cached x86 avx_vnni flag; always 0 on non-x86 builds.
int cpu_support_x86_avx_vnni()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
    supported = g_cpu_support_x86_avx_vnni;
#endif
    return supported;
}

// Report the cached x86 avx_vnni_int8 flag; always 0 on non-x86 builds.
int cpu_support_x86_avx_vnni_int8()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
    supported = g_cpu_support_x86_avx_vnni_int8;
#endif
    return supported;
}

// Report the cached x86 avx_vnni_int16 flag; always 0 on non-x86 builds.
int cpu_support_x86_avx_vnni_int16()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
    supported = g_cpu_support_x86_avx_vnni_int16;
#endif
    return supported;
}

// Report the cached x86 avx_ne_convert flag; always 0 on non-x86 builds.
int cpu_support_x86_avx_ne_convert()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
    supported = g_cpu_support_x86_avx_ne_convert;
#endif
    return supported;
}

// Report the cached x86 avx512 flag; always 0 on non-x86 builds.
int cpu_support_x86_avx512()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
    supported = g_cpu_support_x86_avx512;
#endif
    return supported;
}

// Report the cached x86 avx512_vnni flag; always 0 on non-x86 builds.
int cpu_support_x86_avx512_vnni()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
    supported = g_cpu_support_x86_avx512_vnni;
#endif
    return supported;
}

// Report the cached x86 avx512_bf16 flag; always 0 on non-x86 builds.
int cpu_support_x86_avx512_bf16()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
    supported = g_cpu_support_x86_avx512_bf16;
#endif
    return supported;
}

// Report the cached x86 avx512_fp16 flag; always 0 on non-x86 builds.
int cpu_support_x86_avx512_fp16()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
    supported = g_cpu_support_x86_avx512_fp16;
#endif
    return supported;
}

// Report mips msa support from the hwcap word; non-zero means supported.
// Always 0 unless built for mips on Android / Linux.
int cpu_support_mips_msa()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if (defined __ANDROID__ || defined __linux__) && __mips__
    supported = g_hwcaps & HWCAP_MIPS_MSA;
#endif
    return supported;
}

// Report loongarch lsx support from the hwcap word; non-zero means supported.
// Always 0 unless built for loongarch64 on Android / Linux.
int cpu_support_loongarch_lsx()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if (defined __ANDROID__ || defined __linux__) && __loongarch64
    supported = g_hwcaps & HWCAP_LOONGARCH_LSX;
#endif
    return supported;
}

// Report loongarch lasx support from the hwcap word; non-zero means supported.
// Always 0 unless built for loongarch64 on Android / Linux.
int cpu_support_loongarch_lasx()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if (defined __ANDROID__ || defined __linux__) && __loongarch64
    supported = g_hwcaps & HWCAP_LOONGARCH_LASX;
#endif
    return supported;
}

// Report loongson mmi support from the hwcap word; non-zero means supported.
// Always 0 unless built for mips on Android / Linux.
int cpu_support_loongson_mmi()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if (defined __ANDROID__ || defined __linux__) && __mips__
    supported = g_hwcaps & HWCAP_LOONGSON_MMI;
#endif
    return supported;
}

// Report riscv v support from the hwcap word; non-zero means supported.
// Always 0 unless built for riscv on Android / Linux.
int cpu_support_riscv_v()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if (defined __ANDROID__ || defined __linux__) && __riscv
    supported = g_hwcaps & COMPAT_HWCAP_ISA_V;
#endif
    return supported;
}

// Report the cached riscv zfh flag.
// Always 0 unless built for riscv on Android / Linux.
int cpu_support_riscv_zfh()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if (defined __ANDROID__ || defined __linux__) && __riscv
    supported = g_cpu_support_riscv_zfh;
#endif
    return supported;
}

// Report the cached riscv zvfh flag.
// Always 0 unless built for riscv on Android / Linux.
int cpu_support_riscv_zvfh()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if (defined __ANDROID__ || defined __linux__) && __riscv
    supported = g_cpu_support_riscv_zvfh;
#endif
    return supported;
}

// Report the cached riscv xtheadvector flag.
// Always 0 unless built for riscv on Android / Linux.
int cpu_support_riscv_xtheadvector()
{
    try_initialize_global_cpu_info();

    int supported = 0;
#if (defined __ANDROID__ || defined __linux__) && __riscv
    supported = g_cpu_support_riscv_xtheadvector;
#endif
    return supported;
}

// Return the riscv vector register length in bytes.
// Returns 0 on non-riscv builds and when cpu_support_riscv_v() reports no
// vector support. C906 builds return a fixed 16 without probing.
int cpu_riscv_vlenb()
{
#if C906
    // FIXME xuantie qemu reports all zero auxv flags
    // everything below this return is unreachable in C906 builds
    return 16;
#endif
    try_initialize_global_cpu_info();
#if __riscv
    if (!cpu_support_riscv_v())
        return 0;

    // the csr read is emitted as a raw instruction word and goes through a3,
    // which is therefore listed as clobbered
    int a = 0;
    asm volatile(
        ".word  0xc22026f3  \n" // csrr  a3, vlenb
        "mv     %0, a3      \n"
        : "=r"(a)
        :
        : "memory", "a3");
    return a;
#else
    return 0;
#endif
}

int get_cpu_count()
{
    try_initialize_global_cpu_info();
    return g_cpucount;
}

int get_little_cpu_count()
{
    try_initialize_global_cpu_info();
    return get_cpu_thread_affinity_mask(1).num_enabled();
}

int get_big_cpu_count()
{
    try_initialize_global_cpu_info();
    int big_cpu_count = get_cpu_thread_affinity_mask(2).num_enabled();
    return big_cpu_count ? big_cpu_count : g_cpucount;
}

// Return the physical cpu count detected during initialization.
int get_physical_cpu_count()
{
    try_initialize_global_cpu_info();

    const int physical_count = g_physical_cpucount;
    return physical_count;
}

// Return the physical little cpu count.
// When the physical and logical counts agree this is get_little_cpu_count();
// otherwise it is computed as g_physical_cpucount * 2 - g_cpucount.
int get_physical_little_cpu_count()
{
    try_initialize_global_cpu_info();

    const bool counts_match = g_physical_cpucount == g_cpucount;
    return counts_match ? get_little_cpu_count() : g_physical_cpucount * 2 - g_cpucount;
}

// Return the physical big cpu count.
// When the physical and logical counts agree this is get_big_cpu_count();
// otherwise it is computed as g_cpucount - g_physical_cpucount.
int get_physical_big_cpu_count()
{
    try_initialize_global_cpu_info();

    const bool counts_match = g_physical_cpucount == g_cpucount;
    return counts_match ? get_big_cpu_count() : g_cpucount - g_physical_cpucount;
}

// Return the level 2 cache size stored during initialization.
int get_cpu_level2_cache_size()
{
    try_initialize_global_cpu_info();

    const int cache_size = g_cpu_level2_cachesize;
    return cache_size;
}

// Return the level 3 cache size stored during initialization.
int get_cpu_level3_cache_size()
{
    try_initialize_global_cpu_info();

    const int cache_size = g_cpu_level3_cachesize;
    return cache_size;
}

// Return the powersave mode last stored by set_cpu_powersave (0 initially).
int get_cpu_powersave()
{
    try_initialize_global_cpu_info();

    const int current_mode = g_powersave;
    return current_mode;
}

int set_cpu_powersave(int powersave)
{
    try_initialize_global_cpu_info();
    if (powersave < 0 || powersave > 2)
    {
        NCNN_LOGE("powersave %d not supported", powersave);
        return -1;
    }

    const CpuSet& thread_affinity_mask = get_cpu_thread_affinity_mask(powersave);

    int ret = set_cpu_thread_affinity(thread_affinity_mask);
    if (ret != 0)
        return ret;

    g_powersave = powersave;

    return 0;
}

// Return the affinity mask that belongs to a powersave mode.
// powersave: 0 = all cores, 1 = little cores, 2 = big cores.
// Any other value is logged and answered with the all-cores mask.
const CpuSet& get_cpu_thread_affinity_mask(int powersave)
{
    try_initialize_global_cpu_info();

    switch (powersave)
    {
    case 0:
        return g_cpu_affinity_mask_all;
    case 1:
        return g_cpu_affinity_mask_little;
    case 2:
        return g_cpu_affinity_mask_big;
    default:
        break;
    }

    NCNN_LOGE("powersave %d not supported", powersave);

    // fallback to all cores anyway
    return g_cpu_affinity_mask_all;
}

// Bind the worker threads to the cpus enabled in thread_affinity_mask.
// With OpenMP the thread count is first set to the number of enabled cpus
// and set_sched_affinity is then called from inside a parallel loop with that
// many iterations; without OpenMP only the calling thread is bound.
// Returns 0 on success, -1 when any set_sched_affinity call fails or when the
// platform is not handled.
// NOTE(review): an empty mask gives num_threads == 0 in the OpenMP paths -
// the resulting behavior depends on the OpenMP runtime; confirm.
int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask)
{
    try_initialize_global_cpu_info();
#if defined __ANDROID__ || defined __linux__ || defined _WIN32
#ifdef _OPENMP
    int num_threads = thread_affinity_mask.num_enabled();

    // set affinity for each thread
    set_omp_num_threads(num_threads);
    // one result slot per loop iteration, checked after the parallel region
    std::vector<int> ssarets(num_threads, 0);
    #pragma omp parallel for num_threads(num_threads)
    for (int i = 0; i < num_threads; i++)
    {
        // every iteration applies the same, complete mask
        ssarets[i] = set_sched_affinity(thread_affinity_mask);
    }
    for (int i = 0; i < num_threads; i++)
    {
        if (ssarets[i] != 0)
            return -1;
    }
#else
    int ssaret = set_sched_affinity(thread_affinity_mask);
    if (ssaret != 0)
        return -1;
#endif

    return 0;
#elif __APPLE__

#ifdef _OPENMP
    int num_threads = thread_affinity_mask.num_enabled();

    // set affinity for each thread
    set_omp_num_threads(num_threads);
    std::vector<int> ssarets(num_threads, 0);
    #pragma omp parallel for num_threads(num_threads)
    for (int i = 0; i < num_threads; i++)
    {
        // assign one core for each thread
        // core starts at -1 - i and is incremented for every enabled cpu that
        // is skipped, so it reaches -1 exactly at the i-th enabled cpu
        int core = -1 - i;
        for (int j = 0; j < (int)sizeof(thread_affinity_mask.policy) * 8; j++)
        {
            if (thread_affinity_mask.is_enabled(j))
            {
                if (core == -1)
                {
                    core = j;
                    break;
                }
                else
                {
                    core++;
                }
            }
        }
        CpuSet this_thread_affinity_mask;
        // core still equal to its start value means no cpu was picked,
        // the per-thread mask is then left empty
        if (core != -1 - i)
        {
            this_thread_affinity_mask.enable(core);
        }

        ssarets[i] = set_sched_affinity(this_thread_affinity_mask);
    }
    for (int i = 0; i < num_threads; i++)
    {
        if (ssarets[i] != 0)
            return -1;
    }
#else
    int ssaret = set_sched_affinity(thread_affinity_mask);
    if (ssaret != 0)
        return -1;
#endif

    return 0;
#else
    // TODO
    (void)thread_affinity_mask;
    return -1;
#endif
}

// Tell whether the calling thread runs on an a53/a55 class core.
// Returns 1 for yes and 0 for no; always 0 unless built for aarch64 on
// Android / Linux.
int is_current_thread_running_on_a53_a55()
{
    try_initialize_global_cpu_info();
#if (defined __ANDROID__ || defined __linux__) && __aarch64__
    // 0 = all non a53/a55, 1 = all a53/a55: the answer is the same for every thread
    if (g_cpu_is_arm_a53_a55 == 0 || g_cpu_is_arm_a53_a55 == 1)
        return g_cpu_is_arm_a53_a55;

    // little cores are a53/a55, the selected cluster decides
    if (g_powersave == 2)
        return 0; // big clusters

    if (g_powersave == 1)
        return 1; // little clusters

    // use cpuid for retrieving midr since kernel 4.7+
    if (cpu_support_arm_cpuid())
    {
        const unsigned int current_midr = get_midr_from_register();
        if (current_midr != 0)
            return midr_is_a53_a55(current_midr);
    }

    // check if affinity cpuid is in the little ones
    CpuSet current_affinity;
    if (get_sched_affinity(current_affinity) != 0)
    {
        // no affinity capability
        return 0;
    }

    const CpuSet& little_cores = get_cpu_thread_affinity_mask(1);
    for (int cpu = 0; cpu < g_cpucount; cpu++)
    {
        // one allowed cpu outside the little cluster is enough to answer no
        if (current_affinity.is_enabled(cpu) && !little_cores.is_enabled(cpu))
            return 0;
    }

    // all affinity cpuids are little core
    return 1;
#else
    return 0;
#endif // (defined __ANDROID__ || defined __linux__) && __aarch64__
}

// Return omp_get_num_threads(), or 1 when built without OpenMP.
int get_omp_num_threads()
{
    int num_threads = 1;
#ifdef _OPENMP
    num_threads = omp_get_num_threads();
#endif
    return num_threads;
}

// Forward num_threads to omp_set_num_threads(); does nothing without OpenMP.
void set_omp_num_threads(int num_threads)
{
#ifndef _OPENMP
    (void)num_threads;
#else
    omp_set_num_threads(num_threads);
#endif
}

// Return omp_get_dynamic(), or 0 when built without OpenMP.
int get_omp_dynamic()
{
    int dynamic = 0;
#ifdef _OPENMP
    dynamic = omp_get_dynamic();
#endif
    return dynamic;
}

// Forward dynamic to omp_set_dynamic(); does nothing without OpenMP.
void set_omp_dynamic(int dynamic)
{
#ifndef _OPENMP
    (void)dynamic;
#else
    omp_set_dynamic(dynamic);
#endif
}

// Return omp_get_thread_num(), or 0 when built without OpenMP.
int get_omp_thread_num()
{
    int thread_num = 0;
#ifdef _OPENMP
    thread_num = omp_get_thread_num();
#endif
    return thread_num;
}

// Return kmp_get_blocktime() when built with OpenMP on clang or the LLVM
// runtime; 0 otherwise.
int get_kmp_blocktime()
{
    int blocktime = 0;
#if defined(_OPENMP) && (__clang__ || defined(_OPENMP_LLVM_RUNTIME))
    blocktime = kmp_get_blocktime();
#endif
    return blocktime;
}

// Forward time_ms to kmp_set_blocktime() when built with OpenMP on clang or
// the LLVM runtime; does nothing otherwise.
void set_kmp_blocktime(int time_ms)
{
#if !(defined(_OPENMP) && (__clang__ || defined(_OPENMP_LLVM_RUNTIME)))
    (void)time_ms;
#else
    kmp_set_blocktime(time_ms);
#endif
}

// holds the mode last stored by set_flush_denormals, kept as a pointer-sized integer
static ncnn::ThreadLocalStorage tls_flush_denormals;

// Return the flush denormals mode stored in tls_flush_denormals.
// Always 0 when built without SSE3.
int get_flush_denormals()
{
    int mode = 0;
#if defined(__SSE3__)
    mode = (int)reinterpret_cast<size_t>(tls_flush_denormals.get());
#endif
    return mode;
}

// Select how denormal numbers are handled.
// flush_denormals is a two bit value: bit 0 switches denormals-are-zero,
// bit 1 switches flush-to-zero (0 = both off ... 3 = both on).
// Returns -1 for values outside 0..3 and 0 otherwise; without SSE3 valid
// values are accepted but nothing is changed.
int set_flush_denormals(int flush_denormals)
{
    const bool mode_supported = flush_denormals >= 0 && flush_denormals <= 3;
    if (!mode_supported)
    {
        NCNN_LOGE("denormals_zero %d not supported", flush_denormals);
        return -1;
    }

#if defined(__SSE3__)
    // the denormals-zero mode is always written before the flush-zero mode
    if (flush_denormals & 1)
        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
    else
        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);

    if (flush_denormals & 2)
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    else
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);

    // remember the mode so that get_flush_denormals can report it
    tls_flush_denormals.set(reinterpret_cast<void*>((size_t)flush_denormals));
#endif

    return 0;
}

} // namespace ncnn


================================================
FILE: src/cpu.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef NCNN_CPU_H
#define NCNN_CPU_H

#include <stddef.h>

#if defined _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
#if defined __ANDROID__ || defined __linux__
#include <sched.h> // cpu_set_t
#endif

#include "platform.h"

namespace ncnn {

// A set of cpu indices, stored in the representation of the host platform.
// NOTE(review): the member functions are defined outside this header; the
// notes on them below are inferred from their names and should be confirmed.
class NCNN_EXPORT CpuSet
{
public:
    CpuSet();
    // presumably add / remove one cpu index
    void enable(int cpu);
    void disable(int cpu);
    // presumably remove every cpu index
    void disable_all();
    // query helpers, neither modifies the set (both are const)
    bool is_enabled(int cpu) const;
    int num_enabled() const;

public:
    // platform specific storage, selected at compile time
#if defined _WIN32
    ULONG_PTR mask;
#endif
#if defined __ANDROID__ || defined __linux__
    cpu_set_t cpu_set;
#endif
#if __APPLE__
    unsigned int policy;
#endif
};

// test optional cpu features
// edsp = armv7 edsp
NCNN_EXPORT int cpu_support_arm_edsp();
// neon = armv7 neon or aarch64 asimd
NCNN_EXPORT int cpu_support_arm_neon();
// vfpv4 = armv7 fp16 + fma
NCNN_EXPORT int cpu_support_arm_vfpv4();
// asimdhp = aarch64 asimd half precision
NCNN_EXPORT int cpu_support_arm_asimdhp();
// cpuid = aarch64 cpuid info
NCNN_EXPORT int cpu_support_arm_cpuid();
// asimddp = aarch64 asimd dot product
NCNN_EXPORT int cpu_support_arm_asimddp();
// asimdfhm = aarch64 asimd fhm
NCNN_EXPORT int cpu_support_arm_asimdfhm();
// bf16 = aarch64 bf16
NCNN_EXPORT int cpu_support_arm_bf16();
// i8mm = aarch64 i8mm
NCNN_EXPORT int cpu_support_arm_i8mm();
// sve = aarch64 sve
NCNN_EXPORT int cpu_support_arm_sve();
// sve2 = aarch64 sve2
NCNN_EXPORT int cpu_support_arm_sve2();
// svebf16 = aarch64 svebf16
NCNN_EXPORT int cpu_support_arm_svebf16();
// svei8mm = aarch64 svei8mm
NCNN_EXPORT int cpu_support_arm_svei8mm();
// svef32mm = aarch64 svef32mm
NCNN_EXPORT int cpu_support_arm_svef32mm();

// avx = x86 avx
NCNN_EXPORT int cpu_support_x86_avx();
// fma = x86 fma
NCNN_EXPORT int cpu_support_x86_fma();
// xop = x86 xop
NCNN_EXPORT int cpu_support_x86_xop();
// f16c = x86 f16c
NCNN_EXPORT int cpu_support_x86_f16c();
// avx2 = x86 avx2 + fma + f16c
NCNN_EXPORT int cpu_support_x86_avx2();
// avx_vnni = x86 avx vnni
NCNN_EXPORT int cpu_support_x86_avx_vnni();
// avx_vnni_int8 = x86 avx vnni int8
NCNN_EXPORT int cpu_support_x86_avx_vnni_int8();
// avx_vnni_int16 = x86 avx vnni int16
NCNN_EXPORT int cpu_support_x86_avx_vnni_int16();
// avx_ne_convert = x86 avx ne convert
NCNN_EXPORT int cpu_support_x86_avx_ne_convert();
// avx512 = x86 avx512f + avx512cd + avx512bw + avx512dq + avx512vl
NCNN_EXPORT int cpu_support_x86_avx512();
// avx512_vnni = x86 avx512 vnni
NCNN_EXPORT int cpu_support_x86_avx512_vnni();
// avx512_bf16 = x86 avx512 bf16
NCNN_EXPORT int cpu_support_x86_avx512_bf16();
// avx512_fp16 = x86 avx512 fp16
NCNN_EXPORT int cpu_support_x86_avx512_fp16();

// lsx = loongarch lsx
NCNN_EXPORT int cpu_support_loongarch_lsx();
// lasx = loongarch lasx
NCNN_EXPORT int cpu_support_loongarch_lasx();

// msa = mips msa
NCNN_EXPORT int cpu_support_mips_msa();
// mmi = loongson mmi
NCNN_EXPORT int cpu_support_loongson_mmi();

// v = riscv vector
NCNN_EXPORT int cpu_support_riscv_v();
// zfh = riscv half-precision float
NCNN_EXPORT int cpu_support_riscv_zfh();
// zvfh = riscv vector half-precision float
NCNN_EXPORT int cpu_support_riscv_zvfh();
// xtheadvector = riscv xtheadvector
NCNN_EXPORT int cpu_support_riscv_xtheadvector();
// vlenb = riscv vector length in bytes
NCNN_EXPORT int cpu_riscv_vlenb();

// cpu info
NCNN_EXPORT int get_cpu_count();
NCNN_EXPORT int get_little_cpu_count();
NCNN_EXPORT int get_big_cpu_count();

NCNN_EXPORT int get_physical_cpu_count();
NCNN_EXPORT int get_physical_little_cpu_count();
NCNN_EXPORT int get_physical_big_cpu_count();

// cpu l2 varies from 64k to 1M, but l3 can be zero
NCNN_EXPORT int get_cpu_level2_cache_size();
NCNN_EXPORT int get_cpu_level3_cache_size();

// bind all threads on little clusters if powersave enabled
// affects HMP arch cpu like ARM big.LITTLE
// only implemented on android at the moment
// switching powersave is expensive and not thread-safe
// 0 = all cores enabled(default)
// 1 = only little clusters enabled
// 2 = only big clusters enabled
// return 0 if success for setter function
NCNN_EXPORT int get_cpu_powersave();
NCNN_EXPORT int set_cpu_powersave(int powersave);

// convenient wrapper
NCNN_EXPORT const CpuSet& get_cpu_thread_affinity_mask(int powersave);

// set explicit thread affinity
NCNN_EXPORT int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask);

// runtime thread affinity info
NCNN_EXPORT int is_current_thread_running_on_a53_a55();

// misc function wrapper for openmp routines
NCNN_EXPORT int get_omp_num_threads();
NCNN_EXPORT void set_omp_num_threads(int num_threads);

NCNN_EXPORT int get_omp_dynamic();
NCNN_EXPORT void set_omp_dynamic(int dynamic);

NCNN_EXPORT int get_omp_thread_num();

NCNN_EXPORT int get_kmp_blocktime();
NCNN_EXPORT void set_kmp_blocktime(int time_ms);

// need to flush denormals on Intel Chipset.
// Other architectures such as ARM can be added as needed.
// 0 = DAZ OFF, FTZ OFF
// 1 = DAZ ON , FTZ OFF
// 2 = DAZ OFF, FTZ ON
// 3 = DAZ ON,  FTZ ON
NCNN_EXPORT int get_flush_denormals();
NCNN_EXPORT int set_flush_denormals(int flush_denormals);

} // namespace ncnn

#endif // NCNN_CPU_H


================================================
FILE: src/datareader.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "datareader.h"

#include <string.h>

namespace ncnn {

// the base reader has no state to set up
DataReader::DataReader()
{
}

// the base reader has nothing to release
DataReader::~DataReader()
{
}

#if NCNN_STRING
// Default text scan: consumes nothing and reports zero scanned items.
int DataReader::scan(const char* format, void* p) const
{
    (void)format;
    (void)p;
    return 0;
}
#endif // NCNN_STRING

// Default binary read: copies nothing and reports zero bytes read.
size_t DataReader::read(void* buf, size_t size) const
{
    (void)buf;
    (void)size;
    return 0;
}

// Default zero-copy access: leaves *buf untouched and reports zero bytes referenced.
size_t DataReader::reference(size_t size, const void** buf) const
{
    (void)size;
    (void)buf;
    return 0;
}

#if NCNN_STDIO
// Private state of DataReaderFromStdio: just the stream pointer it was given.
class DataReaderFromStdioPrivate
{
public:
    FILE* fp;

    DataReaderFromStdioPrivate(FILE* stream)
    {
        fp = stream;
    }
};

// Wrap a stdio stream; the private holder only stores the pointer.
DataReaderFromStdio::DataReaderFromStdio(FILE* _fp)
    : DataReader(), d(new DataReaderFromStdioPrivate(_fp))
{
}

// Frees the private holder only; the stream itself is not closed here.
DataReaderFromStdio::~DataReaderFromStdio()
{
    delete d;
}

// Copy construction is declared private in the header; this stub leaves d null.
DataReaderFromStdio::DataReaderFromStdio(const DataReaderFromStdio&)
    : d(0)
{
}

// Assignment is declared private in the header; this stub copies nothing.
DataReaderFromStdio& DataReaderFromStdio::operator=(const DataReaderFromStdio&)
{
    return *this;
}

#if NCNN_STRING
// Parse one item from the wrapped stream with fscanf and pass its result through.
int DataReaderFromStdio::scan(const char* format, void* p) const
{
    FILE* const stream = d->fp;
    const int nscan = fscanf(stream, format, p);
    return nscan;
}
#endif // NCNN_STRING

// Read up to size bytes from the wrapped stream; returns the byte count fread reports.
size_t DataReaderFromStdio::read(void* buf, size_t size) const
{
    FILE* const stream = d->fp;
    const size_t nread = fread(buf, 1, size, stream);
    return nread;
}
#endif // NCNN_STDIO

// Private state of DataReaderFromMemory.
// mem aliases the caller's pointer, so advancing it moves the caller's cursor too.
class DataReaderFromMemoryPrivate
{
public:
    const unsigned char*& mem;

    DataReaderFromMemoryPrivate(const unsigned char*& cursor)
        : mem(cursor)
    {
    }
};

// Bind to the caller's pointer by reference; reads advance that same pointer.
DataReaderFromMemory::DataReaderFromMemory(const unsigned char*& _mem)
    : DataReader(), d(new DataReaderFromMemoryPrivate(_mem))
{
}

// Frees the private holder only; the memory pointed to is not owned.
DataReaderFromMemory::~DataReaderFromMemory()
{
    delete d;
}

// Copy construction is declared private in the header; this stub leaves d null.
DataReaderFromMemory::DataReaderFromMemory(const DataReaderFromMemory&)
    : d(0)
{
}

// Assignment is declared private in the header; this stub copies nothing.
DataReaderFromMemory& DataReaderFromMemory::operator=(const DataReaderFromMemory&)
{
    return *this;
}

#if NCNN_STRING
// Parse one item from the text at the current cursor and advance past it.
// "%n" is appended to the caller's format so sscanf also reports how many
// characters it consumed. Returns the sscanf result, or 0 when nothing was
// consumed.
int DataReaderFromMemory::scan(const char* format, void* p) const
{
    // assemble "<format>%n" by hand
    const size_t fmtlen = strlen(format);
    const size_t buflen = fmtlen + 4;
    char* fmt_n = new char[buflen];
    memcpy(fmt_n, format, fmtlen);
    fmt_n[fmtlen] = '%';
    fmt_n[fmtlen + 1] = 'n';
    fmt_n[fmtlen + 2] = '\0';

    int consumed = 0;
    const int nscan = sscanf((const char*)d->mem, fmt_n, p, &consumed);
    delete[] fmt_n;

    d->mem += consumed;

    if (consumed > 0)
        return nscan;

    return 0;
}
#endif // NCNN_STRING

// Copy size bytes from the cursor into buf and advance the cursor.
// No bound is known here, so the caller must guarantee size bytes are available.
size_t DataReaderFromMemory::read(void* buf, size_t size) const
{
    const unsigned char*& cursor = d->mem;
    memcpy(buf, cursor, size);
    cursor += size;
    return size;
}

// Hand out the current cursor without copying, then skip size bytes.
size_t DataReaderFromMemory::reference(size_t size, const void** buf) const
{
    const unsigned char*& cursor = d->mem;
    *buf = cursor;
    cursor += size;
    return size;
}

#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 9
// Private state of DataReaderFromAndroidAsset.
class DataReaderFromAndroidAssetPrivate
{
public:
    AAsset* asset;
    // text-scan cursor into the asset buffer; stays null until scan() first sets it
    mutable const unsigned char* mem;

    DataReaderFromAndroidAssetPrivate(AAsset* _asset)
        : asset(_asset), mem(0)
    {
    }
};

// Wrap an asset handle; the private holder stores it and starts with no text cursor.
DataReaderFromAndroidAsset::DataReaderFromAndroidAsset(AAsset* _asset)
    : DataReader(), d(new DataReaderFromAndroidAssetPrivate(_asset))
{
}

// Frees the private holder only; the asset itself is not closed here.
DataReaderFromAndroidAsset::~DataReaderFromAndroidAsset()
{
    delete d;
}

// Copy construction is declared private in the header; this stub leaves d null.
DataReaderFromAndroidAsset::DataReaderFromAndroidAsset(const DataReaderFromAndroidAsset&)
    : d(0)
{
}

// Assignment is declared private in the header; this stub copies nothing.
DataReaderFromAndroidAsset& DataReaderFromAndroidAsset::operator=(const DataReaderFromAndroidAsset&)
{
    return *this;
}

#if NCNN_STRING
// Parse one item from the asset text at the current position.
//   format  scanf-style format for a single item
//   p       destination for the parsed item
// Returns the sscanf result, or 0 when nothing was consumed or the asset
// buffer could not be obtained.
// The asset position and the internal text cursor are both advanced by the
// number of characters consumed.
int DataReaderFromAndroidAsset::scan(const char* format, void* p) const
{
    if (!d->mem)
    {
        off_t pos = AAsset_seek(d->asset, 0, SEEK_CUR);
        const unsigned char* buffer = (const unsigned char*)AAsset_getBuffer(d->asset);
        if (!buffer || pos < 0)
        {
            // no buffer or unknown position, do not build a cursor from it
            return 0;
        }

        d->mem = buffer + pos;
    }

    // asset internal buffer may not be NUL-terminated
    // create a NUL-terminated string for sscanf
    std::string line;
    {
        off64_t remain_length = AAsset_getRemainingLength64(d->asset);
        const char* newline_pos;
        if (remain_length > 1 && ((const char*)d->mem)[0] == '\n')
        {
            // skip the leading newline
            // however, it is fine to create "\nXYZ 123 abc" as sscanf will skip the leading newline silently
            newline_pos = (const char*)memchr((const char*)d->mem + 1, '\n', remain_length - 1);
        }
        else if (remain_length > 2 && ((const char*)d->mem)[0] == '\r' && ((const char*)d->mem)[1] == '\n')
        {
            // skip the leading newline
            // however, it is fine to create "\r\nXYZ 123 abc" as sscanf will skip the leading newline silently
            newline_pos = (const char*)memchr((const char*)d->mem + 2, '\n', remain_length - 2);
        }
        else
        {
            newline_pos = (const char*)memchr((const char*)d->mem, '\n', remain_length);
        }

        size_t line_length = newline_pos ? newline_pos - (const char*)d->mem : (size_t)remain_length;
        line = std::string((const char*)d->mem, line_length);
    }

    // append "%n" so sscanf reports how many characters it consumed
    // same bounded construction as DataReaderFromMemory::scan
    const size_t fmtlen = strlen(format);

    const size_t nlen = fmtlen + 4;
    char* format_with_n = new char[nlen];
    snprintf(format_with_n, nlen, "%s%%n", format);

    int nconsumed = 0;
    int nscan = sscanf(line.c_str(), format_with_n, p, &nconsumed);
    d->mem += nconsumed;

    delete[] format_with_n;

    if (nconsumed == 0)
        return 0;

    // keep the asset read position in step with the text cursor
    AAsset_seek(d->asset, nconsumed, SEEK_CUR);

    return nscan;
}
#endif // NCNN_STRING

// Read up to size bytes from the asset into buf.
// Returns the number of bytes read, or 0 when AAsset_read reports an error.
size_t DataReaderFromAndroidAsset::read(void* buf, size_t size) const
{
    const int nread = AAsset_read(d->asset, buf, size);
    if (nread >= 0)
    {
        // keep the text cursor, when scan() has created one, in step with the asset position
        if (d->mem)
            d->mem += nread;

        return nread;
    }

    return 0;
}
#endif // __ANDROID_API__ >= 9
#endif // NCNN_PLATFORM_API

} // namespace ncnn


================================================
FILE: src/datareader.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef NCNN_DATAREADER_H
#define NCNN_DATAREADER_H

#include "platform.h"
#if NCNN_STDIO
#include <stdio.h>
#endif

#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 9
#include <android/asset_manager.h>
#endif
#endif // NCNN_PLATFORM_API

namespace ncnn {

// data read wrapper
// Base interface for the sources param and model data are read from.
// The default implementations in datareader.cpp consume nothing and return 0,
// so a subclass overrides whichever of scan/read/reference it supports.
class NCNN_EXPORT DataReader
{
public:
    DataReader();
    virtual ~DataReader();

#if NCNN_STRING
    // parse plain param text
    // return 1 if scan success
    virtual int scan(const char* format, void* p) const;
#endif // NCNN_STRING

    // read binary param and model data
    // return bytes read
    virtual size_t read(void* buf, size_t size) const;

    // get model data reference
    // return bytes referenced
    virtual size_t reference(size_t size, const void** buf) const;
};

#if NCNN_STDIO
class DataReaderFromStdioPrivate;
// Reader backed by a stdio stream: scan() goes through fscanf, read() through fread.
// reference() is not overridden, so the base version returning 0 applies.
// The destructor does not close fp.
class NCNN_EXPORT DataReaderFromStdio : public DataReader
{
public:
    explicit DataReaderFromStdio(FILE* fp);
    virtual ~DataReaderFromStdio();

#if NCNN_STRING
    virtual int scan(const char* format, void* p) const;
#endif // NCNN_STRING
    virtual size_t read(void* buf, size_t size) const;

private:
    // copying is disabled by keeping these private
    DataReaderFromStdio(const DataReaderFromStdio&);
    DataReaderFromStdio& operator=(const DataReaderFromStdio&);

private:
    // private implementation, created in the constructor and deleted in the destructor
    DataReaderFromStdioPrivate* const d;
};
#endif // NCNN_STDIO

class DataReaderFromMemoryPrivate;
// Reader backed by a raw memory pointer.
// mem is taken by reference and every scan/read/reference call advances the
// caller's pointer by the amount consumed.
// scan() runs sscanf directly on the memory, so text data must be NUL-terminated.
// No buffer length is tracked; the caller guarantees enough bytes remain.
class NCNN_EXPORT DataReaderFromMemory : public DataReader
{
public:
    explicit DataReaderFromMemory(const unsigned char*& mem);
    virtual ~DataReaderFromMemory();

#if NCNN_STRING
    virtual int scan(const char* format, void* p) const;
#endif // NCNN_STRING
    virtual size_t read(void* buf, size_t size) const;
    virtual size_t reference(size_t size, const void** buf) const;

private:
    // copying is disabled by keeping these private
    DataReaderFromMemory(const DataReaderFromMemory&);
    DataReaderFromMemory& operator=(const DataReaderFromMemory&);

private:
    // private implementation, created in the constructor and deleted in the destructor
    DataReaderFromMemoryPrivate* const d;
};

#if NCNN_PLATFORM_API
#if __ANDROID_API__ >= 9
class DataReaderFromAndroidAssetPrivate;
// Reader backed by an Android AAsset.
// scan() parses text out of the asset buffer, read() goes through AAsset_read.
// reference() is not overridden, so the base version returning 0 applies.
// The destructor does not close the asset.
class NCNN_EXPORT DataReaderFromAndroidAsset : public DataReader
{
public:
    explicit DataReaderFromAndroidAsset(AAsset* asset);
    virtual ~DataReaderFromAndroidAsset();

#if NCNN_STRING
    virtual int scan(const char* format, void* p) const;
#endif // NCNN_STRING
    virtual size_t read(void* buf, size_t size) const;

private:
    // copying is disabled by keeping these private
    DataReaderFromAndroidAsset(const DataReaderFromAndroidAsset&);
    DataReaderFromAndroidAsset& operator=(const DataReaderFromAndroidAsset&);

private:
    // private implementation, created in the constructor and deleted in the destructor
    DataReaderFromAndroidAssetPrivate* const d;
};
#endif // __ANDROID_API__ >= 9
#endif // NCNN_PLATFORM_API

} // namespace ncnn

#endif // NCNN_DATAREADER_H


================================================
FILE: src/expression.cpp
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "expression.h"

#include <stdio.h> // sscanf

namespace ncnn {

// Count how many input blobs an expression refers to.
// Tokens are separated by '(' ')' ','. A blob reference is a two character
// token made of one digit followed by w, h, d or c. The result is the
// highest referenced digit plus one, or 0 when there is no reference.
int count_expression_blobs(const std::string& expr)
{
    int blob_count = 0;

    // only the length and the first two characters of a token decide
    // whether it is a blob reference, so the token text is not kept
    size_t token_len = 0;
    char first = 0;
    char second = 0;

    const size_t n = expr.size();
    for (size_t pos = 0; pos <= n; pos++)
    {
        // the end of the string closes the last token like a delimiter does
        const bool at_end = pos == n;
        const char ch = at_end ? ',' : expr[pos];

        const bool is_delimiter = ch == '(' || ch == ')' || ch == ',';
        if (!is_delimiter)
        {
            if (token_len == 0)
                first = ch;
            else if (token_len == 1)
                second = ch;

            token_len++;
            continue;
        }

        if (token_len == 2)
        {
            const bool is_digit = first >= '0' && first <= '9';
            const bool is_dim = second == 'w' || second == 'h' || second == 'd' || second == 'c';
            if (is_digit && is_dim)
            {
                const int blob_index = first - '0';
                if (blob_index + 1 > blob_count)
                    blob_count = blob_index + 1;
            }
        }

        token_len = 0;
    }

    return blob_count;
}

// An expression operand that is either an int or a float, tagged by type.
struct typed_value
{
    int type; // 0=i 1=f
    union
    {
        int i;
        float f;
    };

    // defaults to integer zero
    typed_value()
    {
        type = 0;
        i = 0;
    }
    typed_value(int _i)
    {
        type = 0;
        i = _i;
    }
    typed_value(float _f)
    {
        type = 1;
        f = _f;
    }

    // integer view of the value, floats are truncated
    int to_int()
    {
        return type == 0 ? i : (int)f;
    }
};

int eval_list_expression(const std::string& expr, const std::vector<Mat>& blobs, std::vector<int>& outlist)
{
    // /(0w,2),*(0h,2),0c

    // split by , ( )
    //
    //     /
    //         0w
    //         2
    // -------------------
    //     *
    //         0h
    //         2
    // -------------------
    //     0c
    // -------------------

    // split by , ( )

    // split into tokens
    std::vector<std::string> tokens;
    {
        std::string t;
        for (size_t i = 0; i < expr.size(); i++)
        {
            char ch = expr[i];

            if (ch == '(' || ch == ')' || ch == ',')
            {
                if (!t.empty())
                {
                    tokens.push_back(t);
                    t.clear();
                }
            }
            else
            {
#if NCNN_SIMPLESTL
                t.resize(t.size() + 1);
                t[t.size() - 1] = ch;
#else
                t += ch;
#endif
            }
        }

        if (!t.empty())
        {
            tokens.push_back(t);
        }
    }

    //      / 0w 2 * 0h 2 0c

    // scan and stack
    std::stack<typed_value> exprstack;
    for (int i = (int)tokens.size() - 1; i >= 0; i--)
    {
        const std::string& t = tokens[i];

        // NCNN_LOGE("t = %s", t.c_str());

        // + - * / 0w 0h 0d 0c 12345

        if (t.size() == 2 && (t[0] >= '0' && t[0] <= '9') && (t[1] == 'w' || t[1] == 'h' || t[1] == 'd' || t[1] == 'c'))
        {
            size_t blob_index = t[0] - '0';
            if (blob_index >= blobs.size())
            {
                NCNN_LOGE("shape expression blob index %d out of bound!", (int)blob_index);
                return -1;
            }

            const Mat& blob = blobs[blob_index].shape();
            int size;
            if (t[1] == 'w')
                size = blob.w;
            else if (t[1] == 'h')
                size = blob.h;
            else if (t[1] == 'd')
                size = blob.d;
            else // if (t[1] == 'c')
                size = blob.c;

            // NCNN_LOGE("t = %s  =>  %d", t.c_str(), size);

            exprstack.push(size);
        }
        else if (t == "+" || t == "-" || t == "*" || t == "//" || t == "max" || t == "min")
        {
            typed_value ta = exprstack.top();
            exprstack.pop();
            typed_value tb = exprstack.top();
            exprstack.pop();

            if (ta.type == 0 && tb.type == 0)
            {
                const int a = ta.i;
                const int b = tb.i;

                int r = 0;
                if (t == "+")
                {
                    r = a + b;
                }
                else if (t == "-")
                {
                    r = a - b;
                }
                else if (t == "*")
                {
                    r = a * b;
                }
                else if (t == "//")
                {
                    if (b == 0)
                    {
                        NCNN_LOGE("expr divide by zero");
                        return -1;
                    }
                    else
                    {
                        r = a / b;
                    }
                }
                else if (t == "max")
                {
                    r = std::max(a, b);
                }
                else // if (t == "min")
                {
                    r = std::min(a, b);
                }
                exprstack.push(r);
            }
            else
            {
                const float a = ta.type == 0 ? ta.i : ta.f;
                const float b = tb.type == 0 ? tb.i : tb.f;

                float r = 0.f;
                if (t == "+")
                {
                    r = a + b;
                }
                else if (t == "-")
                {
                    r = a - b;
                }
                else if (t == "*")
                {
                    r = a * b;
                }
                else if (t == "//")
                {
                    r = floorf(a / b);
                }
                else if (t == "max")
                {
                    r = std::max(a, b);
                }
                else // if (t == "min")
                {
                    r = std::min(a, b);
                }
                exprstack.push(r);
            }
        }
        else if (t == "abs" || t == "neg" || t == "sign" || t == "square")
        {
            typed_value ta = exprstack.top();
            exprstack.pop();

            if (ta.type == 0)
            {
                const int a = ta.i;

                int r = 0;
                if (t == "abs")
                {
                    r = a > 0 ? a : -a;
                }
                else if (t == "neg")
                {
                    r = -a;
                }
                else if (t == "sign")
                {
                    r = a > 0 ? 1 : (a == 0 ? 0 : -1);
                }
                else // if (t == "square")
                {
                    r = a * a;
                }
                exprstack.push(r);
            }
            else
            {
                const float a = ta.f;

                float r = 0;
                if (t == "abs")
                {
                    r = fabsf(a);
                }
                else if (t == "neg")
                {
                    r = -a;
                }
                else if (t == "sign")
                {
                    r = a > 0.f ? 1 : (a == 0.f ? 0 : -1);
                }
                else // if (t == "square")
                {
                    r = a * a;
                }
                exprstack.push(r);
            }
        }
        else if (t == "trunc" || t == "ceil" || t == "floor" || t == "round")
        {
            typed_value ta = exprstack.top();
            exprstack.pop();

            if (ta.type == 0)
            {
                const int a = ta.i;
                exprstack.push(a);
            }
            else
            {
                const float a = ta.f;

                int r = 0;
                if (t == "trunc")
                {
                    r = (int)a;
                }
                else if (t == "ceil")
                {
                    r = (int)ceil(a);
                }
                else if (t == "floor")
                {
                    r = (int)floor(a);
                }
                else // if (t == "round")
                {
                    r = (int)round(a);
                }
                exprstack.push(r);
            }
        }
        else if (t == "acos"
                 || t == "acosh"
                 || t == "asin"
                 || t == "asinh"
                 || t == "atan"
                 || t == "atanh"
                 || t == "cos"
                 || t == "cosh"
                 || t == "erf"
                 || t == "exp"
                 || t == "log"
                 || t == "log10"
                 || t == "reciprocal"
                 || t == "rsqrt"
                 || t == "sin"
                 || t == "sinh"
                 || t == "sqrt"
                 || t == "tan"
                 || t == "tanh")
        {
            typed_value ta = exprstack.top();
            exprstack.pop();

            const float a = ta.type == 0 ? ta.i : ta.f;

            float r = 0;
            if (t == "acos")
            {
                r = acosf(a);
            }
            else if (t == "acosh")
            {
                r = acoshf(a);
            }
            else if (t == "asin")
            {
                r = asinf(a);
            }
            else if (t == "asinh")
            {
                r = asinhf(a);
            }
            else if (t == "atan")
            {
                r = atanf(a);
            }
            else if (t == "atanh")
            {
                r = atanhf(a);
            }
            else if (t == "cos")
            {
                r = cosf(a);
            }
            else if (t == "cosh")
            {
                r = coshf(a);
            }
            else if (t == "erf")
            {
                r = erff(a);
            }
            else if (t == "exp")
            {
                r = expf(a);
            }
            else if (t == "log")
            {
                r = logf(a);
            }
            else if (t == "log10")
            {
                r = log10f(a);
            }
            else if (t == "reciprocal")
            {
                r = 1.f / a;
            }
            else if (t == "rsqrt")
            {
                r = 1.f / sqrtf(a);
            }
            else if (t == "sin")
            {
                r = sinf(a);
            }
            else if (t == "sinh")
            {
                r = sinhf(a);
            }
            else if (t == "sqrt")
            {
                r = sqrtf(a);
            }
            else if (t == "tan")
            {
                r = tanf(a);
            }
            else // if (t == "tanh")
            {
                r = tanhf(a);
            }
            exprstack.push(r);
        }
        else if (t == "/"
                 || t == "atan2"
                 || t == "fmod"
                 || t == "pow"
                 || t == "remainder"
                 || t == "logaddexp")
        {
            typed_value ta = exprstack.top();
            exprstack.pop();
            typed_value tb = exprstack.top();
            exprstack.pop();

            const float a = ta.type == 0 ? ta.i : ta.f;
            const float b = tb.type == 0 ? tb.i : tb.f;

            float r = 0.f;
            if (t == "/")
            {
                r = a / b;
            }
            else if (t == "atan2")
            {
                r = atan2f(a, b);
            }
            else if (t == "fmod")
            {
                r = fmodf(a, b);
            }
            else if (t == "pow")
            {
                r = powf(a, b);
            }
            else if (t == "remainder")
            {
                r = fmodf(a, b);
                if (a * b < 0)
                    r += b;
            }
            else // if (t == "logaddexp")
            {
                r = logf(expf(a) + expf(b));
            }
            exprstack.push(r);
        }
        else if (t == "and" || t == "or" || t == "xor" || t == "lshift" || t == "rshift")
        {
            typed_value ta = exprstack.top();
            exprstack.pop();
            typed_value tb = exprstack.top();
            exprstack.pop();

            // assert ta.type == 0 && tb.type == 0

            const int a = ta.i;
            const int b = tb.i;

            int r = 0;
            if (t == "and")
            {
                r = a & b;
            }
            else if (t == "or")
            {
                r = a | b;
            }
            else if (t == "xor")
            {
                r = a ^ b;
            }
            else if (t == "lshift")
            {
                r = a << b;
            }
            else // if (t == "rshift")
            {
                r = a >> b;
            }
            exprstack.push(r);
        }
        else
        {
            // literal
            int vi;
            float vf;
            int nscani = sscanf(t.c_str(), "%d", &vi);
            int nscanf = sscanf(t.c_str(), "%f", &vf);
            if (nscani == 1 && nscanf == 1 && vi == vf)
            {
                exprstack.push(vi);
            }
            else if (nscanf == 1)
            {
                exprstack.push(vf);
            }
            else
            {
                NCNN_LOGE("malformed literal token %s", t.c_str());
                return -1;
            }
        }
    }

    int size = exprstack.top().to_int();
    exprstack.pop();
    outlist.push_back(size);
    while (!exprstack.empty())
    {
        size = exprstack.top().to_int();
        exprstack.pop();
        outlist.push_back(size);
    }

    // NCNN_LOGE("shape %s = %d %d", expr.c_str(), list[0], list[1]);

    return 0;
}

} // namespace ncnn


================================================
FILE: src/expression.h
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "mat.h"

namespace ncnn {

// count how many blobs are referenced inside expression
NCNN_EXPORT int count_expression_blobs(const std::string& expr);

// resolve reshape shape from expression and input blobs
// resolve slice indices(starts, ends) from expression and input blobs
// see docs/developer-guide/expression.md
// return 0 if success
NCNN_EXPORT int eval_list_expression(const std::string& expr, const std::vector<Mat>& blobs, std::vector<int>& outlist);

} // namespace ncnn


================================================
FILE: src/gpu.cpp
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "gpu.h"

#if NCNN_VULKAN

#include <float.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>

#include "glslang/SPIRV/GlslangToSpv.h"
#if NCNN_SYSTEM_GLSLANG
#include "glslang/Public/ShaderLang.h"
#else
#include "glslang/glslang/Public/ShaderLang.h"
#endif

#include "layer/vulkan/shader/vulkan_activation.comp.hex.h"

#include "command.h"
#include "layer.h"
#include "layer_type.h"
#include "mat.h"
#include "pipelinecache.h"

// There is known issue that vkDestroyDebugUtilsMessengerEXT crash on exit when vulkan validation layer enabled
// upstream fix https://github.com/KhronosGroup/Vulkan-Loader/pull/539
#define ENABLE_VALIDATION_LAYER 0

namespace ncnn {

// global
static Mutex g_instance_lock;

class __ncnn_vulkan_instance_holder
{
public:
    __ncnn_vulkan_instance_holder()
    {
        instance = 0;
        instance_api_version = 0;
        created = 0;
        glslang_initialized = false;

#if NCNN_VULKAN_LOADER
        libvulkan = 0;
#if defined __ANDROID__
        hvkdi = 0;
#endif
#endif // NCNN_VULKAN_LOADER

#if ENABLE_VALIDATION_LAYER
        callback = 0;
#endif
    }

    ~__ncnn_vulkan_instance_holder()
    {
        destroy_gpu_instance();
    }

    operator VkInstance()
    {
        return instance;
    }

    VkInstance instance;
    uint32_t instance_api_version;
    int created;
    bool glslang_initialized;

#if ENABLE_VALIDATION_LAYER
    VkDebugUtilsMessengerEXT callback;
#endif
};
static __ncnn_vulkan_instance_holder g_instance;

// gpu bookkeeping, starts as "no gpu, no default chosen"
// NOTE(review): where these are filled in is outside this view
static int g_gpu_count = 0;
static int g_default_gpu_index = -1;

// NOTE 32 is large enough i think ...
// fixed capacity of the per-gpu tables below
#define NCNN_MAX_GPU_COUNT 32
static GpuInfo* g_gpu_infos[NCNN_MAX_GPU_COUNT] = {0};

// default vulkan device
// one slot per gpu table entry, all null initially
static Mutex g_default_vkdev_lock;
static VulkanDevice* g_default_vkdev[NCNN_MAX_GPU_COUNT] = {0};

// One shader registry record: a data pointer and its size.
// NOTE(review): comp_data_size is presumably in bytes -- confirm against the generated headers
struct layer_shader_registry_entry
{
    const char* comp_data;
    int comp_data_size;
};

// brings in the data the registry entries refer to
#include "layer_shader_spv_data.h"

// the table body comes from an included header
static const layer_shader_registry_entry layer_shader_registry[] = {
#include "layer_shader_registry.h"
};

// number of table entries, derived from the array size
static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry);

// vulkan core
// Definitions of the global function pointers for the vulkan core entry points.
// They have external linkage and all start out null.
// NOTE(review): presumably resolved while the instance is created, that code is
// outside this part of the file - confirm
PFN_vkAllocateCommandBuffers vkAllocateCommandBuffers = 0;
PFN_vkAllocateDescriptorSets vkAllocateDescriptorSets = 0;
PFN_vkAllocateMemory vkAllocateMemory = 0;
PFN_vkBeginCommandBuffer vkBeginCommandBuffer = 0;
PFN_vkBindBufferMemory vkBindBufferMemory = 0;
PFN_vkBindImageMemory vkBindImageMemory = 0;
PFN_vkCmdBeginQuery vkCmdBeginQuery = 0;
PFN_vkCmdBindDescriptorSets vkCmdBindDescriptorSets = 0;
PFN_vkCmdBindIndexBuffer vkCmdBindIndexBuffer = 0;
PFN_vkCmdBindPipeline vkCmdBindPipeline = 0;
PFN_vkCmdCopyBuffer vkCmdCopyBuffer = 0;
PFN_vkCmdCopyBufferToImage vkCmdCopyBufferToImage = 0;
PFN_vkCmdCopyImage vkCmdCopyImage = 0;
PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer = 0;
PFN_vkCmdCopyQueryPoolResults vkCmdCopyQueryPoolResults = 0;
PFN_vkCmdDispatch vkCmdDispatch = 0;
PFN_vkCmdDispatchIndirect vkCmdDispatchIndirect = 0;
PFN_vkCmdEndQuery vkCmdEndQuery = 0;
PFN_vkCmdExecuteCommands vkCmdExecuteCommands = 0;
PFN_vkCmdFillBuffer vkCmdFillBuffer = 0;
PFN_vkCmdPipelineBarrier vkCmdPipelineBarrier = 0;
PFN_vkCmdPushConstants vkCmdPushConstants = 0;
PFN_vkCmdResetQueryPool vkCmdResetQueryPool = 0;
PFN_vkCmdResolveImage vkCmdResolveImage = 0;
PFN_vkCmdUpdateBuffer vkCmdUpdateBuffer = 0;
PFN_vkCmdWriteTimestamp vkCmdWriteTimestamp = 0;
PFN_vkCreateBuffer vkCreateBuffer = 0;
PFN_vkCreateBufferView vkCreateBufferView = 0;
PFN_vkCreateCommandPool vkCreateCommandPool = 0;
PFN_vkCreateComputePipelines vkCreateComputePipelines = 0;
PFN_vkCreateDescriptorPool vkCreateDescriptorPool = 0;
PFN_vkCreateDescriptorSetLayout vkCreateDescriptorSetLayout = 0;
PFN_vkCreateDevice vkCreateDevice = 0;
PFN_vkCreateFence vkCreateFence = 0;
PFN_vkCreateImage vkCreateImage = 0;
PFN_vkCreateImageView vkCreateImageView = 0;
PFN_vkCreatePipelineCache vkCreatePipelineCache = 0;
PFN_vkCreatePipelineLayout vkCreatePipelineLayout = 0;
PFN_vkCreateQueryPool vkCreateQueryPool = 0;
PFN_vkCreateSampler vkCreateSampler = 0;
PFN_vkCreateSemaphore vkCreateSemaphore = 0;
PFN_vkCreateShaderModule vkCreateShaderModule = 0;
PFN_vkDestroyBuffer vkDestroyBuffer = 0;
PFN_vkDestroyBufferView vkDestroyBufferView = 0;
PFN_vkDestroyCommandPool vkDestroyCommandPool = 0;
PFN_vkDestroyDescriptorPool vkDestroyDescriptorPool = 0;
PFN_vkDestroyDescriptorSetLayout vkDestroyDescriptorSetLayout = 0;
PFN_vkDestroyDevice vkDestroyDevice = 0;
PFN_vkDestroyFence vkDestroyFence = 0;
PFN_vkDestroyImage vkDestroyImage = 0;
PFN_vkDestroyImageView vkDestroyImageView = 0;
PFN_vkDestroyInstance vkDestroyInstance = 0;
PFN_vkDestroyPipeline vkDestroyPipeline = 0;
PFN_vkDestroyPipelineCache vkDestroyPipelineCache = 0;
PFN_vkDestroyPipelineLayout vkDestroyPipelineLayout = 0;
PFN_vkDestroyQueryPool vkDestroyQueryPool = 0;
PFN_vkDestroySampler vkDestroySampler = 0;
PFN_vkDestroySemaphore vkDestroySemaphore = 0;
PFN_vkDestroyShaderModule vkDestroyShaderModule = 0;
PFN_vkDeviceWaitIdle vkDeviceWaitIdle = 0;
PFN_vkEndCommandBuffer vkEndCommandBuffer = 0;
PFN_vkEnumerateDeviceExtensionProperties vkEnumerateDeviceExtensionProperties = 0;
PFN_vkEnumerateDeviceLayerProperties vkEnumerateDeviceLayerProperties = 0;
PFN_vkEnumeratePhysicalDevices vkEnumeratePhysicalDevices = 0;
PFN_vkFlushMappedMemoryRanges vkFlushMappedMemoryRanges = 0;
PFN_vkFreeCommandBuffers vkFreeCommandBuffers = 0;
PFN_vkFreeDescriptorSets vkFreeDescriptorSets = 0;
PFN_vkFreeMemory vkFreeMemory = 0;
PFN_vkGetBufferMemoryRequirements vkGetBufferMemoryRequirements = 0;
PFN_vkGetDeviceMemoryCommitment vkGetDeviceMemoryCommitment = 0;
PFN_vkGetDeviceProcAddr vkGetDeviceProcAddr = 0;
PFN_vkGetDeviceQueue vkGetDeviceQueue = 0;
PFN_vkGetFenceStatus vkGetFenceStatus = 0;
PFN_vkGetImageMemoryRequirements vkGetImageMemoryRequirements = 0;
PFN_vkGetImageSubresourceLayout vkGetImageSubresourceLayout = 0;
PFN_vkGetPhysicalDeviceFeatures vkGetPhysicalDeviceFeatures = 0;
PFN_vkGetPhysicalDeviceFormatProperties vkGetPhysicalDeviceFormatProperties = 0;
PFN_vkGetPhysicalDeviceImageFormatProperties vkGetPhysicalDeviceImageFormatProperties = 0;
PFN_vkGetPhysicalDeviceMemoryProperties vkGetPhysicalDeviceMemoryProperties = 0;
PFN_vkGetPhysicalDeviceProperties vkGetPhysicalDeviceProperties = 0;
PFN_vkGetPhysicalDeviceQueueFamilyProperties vkGetPhysicalDeviceQueueFamilyProperties = 0;
PFN_vkGetPipelineCacheData vkGetPipelineCacheData = 0;
PFN_vkGetQueryPoolResults vkGetQueryPoolResults = 0;
PFN_vkInvalidateMappedMemoryRanges vkInvalidateMappedMemoryRanges = 0;
PFN_vkMapMemory vkMapMemory = 0;
PFN_vkMergePipelineCaches vkMergePipelineCaches = 0;
PFN_vkQueueSubmit vkQueueSubmit = 0;
PFN_vkQueueWaitIdle vkQueueWaitIdle = 0;
PFN_vkResetCommandBuffer vkResetCommandBuffer = 0;
PFN_vkResetCommandPool vkResetCommandPool = 0;
PFN_vkResetDescriptorPool vkResetDescriptorPool = 0;
PFN_vkResetFences vkResetFences = 0;
PFN_vkUnmapMemory vkUnmapMemory = 0;
PFN_vkUpdateDescriptorSets vkUpdateDescriptorSets = 0;
PFN_vkWaitForFences vkWaitForFences = 0;

// Global flags for extensions that are not tracked per device, all zero initially.
// NOTE(review): presumably set to the extension specVersion while the instance
// extensions are enumerated, that code is outside this part of the file - confirm
int support_VK_KHR_external_memory_capabilities = 0;
int support_VK_KHR_get_physical_device_properties2 = 0;
int support_VK_KHR_get_surface_capabilities2 = 0;
int support_VK_KHR_portability_enumeration = 0;
int support_VK_KHR_surface = 0;
int support_VK_EXT_debug_utils = 0;
int support_VK_EXT_validation_features = 0;
int support_VK_EXT_validation_flags = 0;
#if __ANDROID_API__ >= 26
int support_VK_KHR_android_surface = 0;
#endif // __ANDROID_API__ >= 26

// Definitions of the global function pointers for extension entry points,
// grouped by the extension that provides them. All of them start out null.

// VK_KHR_cooperative_matrix
PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR = 0;

// VK_KHR_external_memory_capabilities
PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR = 0;

// VK_KHR_get_physical_device_properties2
PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR = 0;
PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR = 0;
PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR = 0;
PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR = 0;
PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR = 0;
PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR = 0;

// VK_KHR_get_surface_capabilities2
PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR = 0;
PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR = 0;

// VK_KHR_surface
PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR = 0;
PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR = 0;
PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR = 0;
PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR = 0;
PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR = 0;

#if __ANDROID_API__ >= 26
// VK_KHR_android_surface
PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR = 0;
#endif // __ANDROID_API__ >= 26

// VK_NV_cooperative_matrix
PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV vkGetPhysicalDeviceCooperativeMatrixPropertiesNV = 0;

// VK_NV_cooperative_matrix2
PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV = 0;

// VK_NV_cooperative_vector
PFN_vkGetPhysicalDeviceCooperativeVectorPropertiesNV vkGetPhysicalDeviceCooperativeVectorPropertiesNV = 0;

// Everything that is queried from one VkPhysicalDevice, plus the values derived from it.
// The query_* methods fill the fields below, none of the fields is initialized by a constructor.
// NOTE(review): presumably the private implementation behind GpuInfo - confirm
class GpuInfoPrivate
{
public:
    // fills physicalDevicefeatures
    void query_features();
    // fills physicalDeviceProperties, type and the bug_* flags
    void query_properties();
    // fills the queue family indexes, queue counts and unified_compute_transfer_queue
    void query_queue_properties();
    // fills physicalDeviceMemoryProperties and resizable_bar_enabled, reads type
    void query_memory_properties();
    // fills deviceExtensionProperties and the support_VK_* fields
    // returns 0 on success, -1 when the extension enumeration fails
    int query_extensions();
    void query_extension_features();
    void query_extension_properties();
    void evaluate_rough_score();

public:
    int device_index;

    // physical device
    VkPhysicalDevice physicalDevice;

    // features
    VkPhysicalDeviceFeatures physicalDevicefeatures;

    // properties
    VkPhysicalDeviceProperties physicalDeviceProperties;

    // memory properties
    VkPhysicalDeviceMemoryProperties physicalDeviceMemoryProperties;

    // extension properties
    std::vector<VkExtensionProperties> deviceExtensionProperties;

    // 0 = discrete gpu
    // 1 = integrated gpu
    // 2 = virtual gpu
    // 3 = cpu
    // -1 = any other device type reported by the driver
    int type;

    uint32_t rough_score;

    // runtime
    // (uint32_t)-1 when no suitable queue family was found
    uint32_t compute_queue_family_index;
    uint32_t transfer_queue_family_index;

    uint32_t compute_queue_count;
    uint32_t transfer_queue_count;

    // property
    // true when compute and transfer use the same queue family index
    bool unified_compute_transfer_queue;
    bool resizable_bar_enabled;

    // bug is not feature
    bool bug_storage_buffer_no_l1;
    bool bug_corrupted_online_pipeline_cache;
    bool bug_buffer_image_load_zero;

    // but sometimes bug is a feature
    bool bug_implicit_fp16_arithmetic;

    // cooperative matrix
    bool support_cooperative_matrix_8_8_16;
    bool support_cooperative_matrix_16_8_8;
    bool support_cooperative_matrix_16_8_16;
    bool support_cooperative_matrix_16_16_16;

    // bf16 cooperative matrix feature
    bool support_bf16_cooperative_matrix;

    // extension capability
    // each field holds the specVersion reported for the extension, 0 when it is not reported
    // the EXT / NV variants of buffer_device_address, cooperative_matrix and robustness2
    // are forced back to 0 by query_extensions() when the KHR variant is present
    int support_VK_KHR_8bit_storage;
    int support_VK_KHR_16bit_storage;
    int support_VK_KHR_bind_memory2;
    int support_VK_KHR_buffer_device_address;
    int support_VK_KHR_create_renderpass2;
    int support_VK_KHR_cooperative_matrix;
    int support_VK_KHR_dedicated_allocation;
    int support_VK_KHR_descriptor_update_template;
    int support_VK_KHR_driver_properties;
    int support_VK_KHR_external_memory;
    int support_VK_KHR_get_memory_requirements2;
    int support_VK_KHR_maintenance1;
    int support_VK_KHR_maintenance2;
    int support_VK_KHR_maintenance3;
    int support_VK_KHR_multiview;
    int support_VK_KHR_portability_subset;
    int support_VK_KHR_push_descriptor;
    int support_VK_KHR_robustness2;
    int support_VK_KHR_sampler_ycbcr_conversion;
    int support_VK_KHR_shader_bfloat16;
    int support_VK_KHR_shader_float16_int8;
    int support_VK_KHR_shader_float_controls;
    int support_VK_KHR_shader_float_controls2;
    int support_VK_KHR_shader_integer_dot_product;
    int support_VK_KHR_shader_non_semantic_info;
    int support_VK_KHR_shader_subgroup_extended_types;
    int support_VK_KHR_shader_subgroup_rotate;
    int support_VK_KHR_storage_buffer_storage_class;
    int support_VK_KHR_swapchain;
    int support_VK_KHR_vulkan_memory_model;
    int support_VK_KHR_zero_initialize_workgroup_memory;
    int support_VK_EXT_buffer_device_address;
    int support_VK_EXT_descriptor_indexing;
    int support_VK_EXT_external_memory_host;
    int support_VK_EXT_memory_budget;
    int support_VK_EXT_memory_priority;
    int support_VK_EXT_queue_family_foreign;
    int support_VK_EXT_robustness2;
    int support_VK_EXT_shader_atomic_float;
    int support_VK_EXT_shader_atomic_float2;
    int support_VK_EXT_shader_float8;
    int support_VK_EXT_subgroup_size_control;
    int support_VK_AMD_device_coherent_memory;
#if __ANDROID_API__ >= 26
    int support_VK_ANDROID_external_memory_android_hardware_buffer;
#endif // __ANDROID_API__ >= 26
    int support_VK_NV_cooperative_matrix;
    int support_VK_NV_cooperative_matrix2;
    int support_VK_NV_cooperative_vector;

    // extension features
    // queryExtensionFeatures is the head of the pNext chain that query_extension_features()
    // builds from the structs below, only structs of supported extensions are linked in
    void* queryExtensionFeatures;
    VkPhysicalDevice8BitStorageFeaturesKHR query8BitStorageFeatures;
    VkPhysicalDevice16BitStorageFeaturesKHR query16BitStorageFeatures;
    VkPhysicalDeviceFloat16Int8FeaturesKHR queryFloat16Int8Features;
    VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures;
    VkPhysicalDeviceCooperativeMatrixFeaturesKHR queryCooperativeMatrixFeatures;
    VkPhysicalDeviceCooperativeMatrixFeaturesNV queryCooperativeMatrixFeaturesNV;
    VkPhysicalDeviceCooperativeMatrix2FeaturesNV queryCooperativeMatrix2FeaturesNV;
    VkPhysicalDeviceCooperativeVectorFeaturesNV queryCooperativeVectorFeaturesNV;
    VkPhysicalDeviceRobustness2FeaturesKHR queryRobustness2Features;
    VkPhysicalDeviceShaderBfloat16FeaturesKHR queryShaderBfloat16Features;
    VkPhysicalDeviceShaderFloat8FeaturesEXT queryShaderFloat8Features;
    VkPhysicalDeviceShaderFloatControls2FeaturesKHR queryShaderFloatControls2Features;
    VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR queryShaderIntegerDotProductFeatures;
    VkPhysicalDeviceSubgroupSizeControlFeaturesEXT querySubgroupSizeControlFeatures;
    VkPhysicalDeviceShaderSubgroupRotateFeaturesKHR queryShaderSubgroupRotateFeatures;
    VkPhysicalDeviceShaderAtomicFloatFeaturesEXT queryShaderAtomicFloatFeatures;
    VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT queryShaderAtomicFloat2Features;
    VkPhysicalDeviceVulkanMemoryModelFeaturesKHR queryVulkanMemoryModelFeatures;

    // extension properties
    // NOTE(review): queryExtensionProperties presumably plays the same chain head role
    // for the property structs below - confirm in query_extension_properties()
    void* queryExtensionProperties;
    VkPhysicalDeviceFloatControlsPropertiesKHR queryFloatControlsProperties;
    VkPhysicalDeviceRobustness2PropertiesKHR queryRobustness2Properties;
    VkPhysicalDeviceShaderIntegerDotProductProperties queryShaderIntegerDotProductProperties;
    VkPhysicalDeviceSubgroupProperties querySubgroupProperties;
    VkPhysicalDeviceDriverPropertiesKHR queryDriverProperties;
    VkPhysicalDeviceSubgroupSizeControlPropertiesEXT querySubgroupSizeControlProperties;
    VkPhysicalDeviceExternalMemoryHostPropertiesEXT queryExternalMemoryHostProperties;
    VkPhysicalDeviceCooperativeMatrix2PropertiesNV queryCooperativeMatrix2PropertiesNV;
    VkPhysicalDeviceCooperativeVectorPropertiesNV queryCooperativeVectorPropertiesNV;

    // extension sub properties
    std::vector<VkCooperativeMatrixPropertiesKHR> queryCooperativeMatrixSubProperties;
    std::vector<VkCooperativeMatrixPropertiesNV> queryCooperativeMatrixSubPropertiesNV;
    std::vector<VkCooperativeMatrixFlexibleDimensionsPropertiesNV> queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV;
    std::vector<VkCooperativeVectorPropertiesNV> queryCooperativeVectorSubPropertiesNV;
};

// Fetch the core VkPhysicalDeviceFeatures of physicalDevice into physicalDevicefeatures.
// physicalDevice must have been assigned before this is called.
void GpuInfoPrivate::query_features()
{
    vkGetPhysicalDeviceFeatures(physicalDevice, &physicalDevicefeatures);
}

void GpuInfoPrivate::query_properties()
{
    vkGetPhysicalDeviceProperties(physicalDevice, &physicalDeviceProperties);

    // NCNN_LOGE("[%u] apiVersion = %u.%u.%u", i, VK_VERSION_MAJOR(physicalDeviceProperties.apiVersion),
    //     VK_VERSION_MINOR(physicalDeviceProperties.apiVersion), VK_VERSION_PATCH(physicalDeviceProperties.apiVersion));
    // NCNN_LOGE("[%u] driverVersion = %u.%u.%u", i, VK_VERSION_MAJOR(physicalDeviceProperties.driverVersion),
    //     VK_VERSION_MINOR(physicalDeviceProperties.driverVersion), VK_VERSION_PATCH(physicalDeviceProperties.driverVersion));
    // NCNN_LOGE("[%u] vendorID = %x", i, physicalDeviceProperties.vendorID);
    // NCNN_LOGE("[%u] deviceID = %x", i, physicalDeviceProperties.deviceID);
    // NCNN_LOGE("[%u] deviceType = %x", i, physicalDeviceProperties.deviceType);
    // NCNN_LOGE("[%u] deviceName = %s", i, physicalDeviceProperties.deviceName);
    // NCNN_LOGE("[%u] pipelineCacheUUID = %u", i, physicalDeviceProperties.pipelineCacheUUID);

    // device type
    {
        type = -1;
        if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU)
            type = 0;
        if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU)
            type = 1;
        if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU)
            type = 2;
        if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_CPU)
            type = 3;
    }

    // mali
    // t760 = 0x13b5 0x7500001 / 0x7501000
    // t860 = 0x13b5 0x8602000
    // t880 = 0x13b5 0x8800020
    // g31  = 0x13b5 0x70930000
    // g51  = 0x13b5 0x70901010
    // g52  = 0x13b5 0x74021000 / 0x72120000
    // g71  = 0x13b5 0x60a00002
    // g72  = 0x13b5 0x62210001
    // g76  = 0x13b5 0x72110000
    // g77  = 0x13b5 0x90800011

    // adreno
    // 506 = 0x5143 0x5000600
    // 510 = 0x5143 0x5010000
    // 512 = 0x5143 0x5010200
    // 530 = 0x5143 0x5030004
    // 540 = 0x5143 0x5040001
    // 616 = 0x5143 0x6010600
    // 630 = 0x5143 0x6030001
    // 640 = 0x5143 0x6040001
    // 650 = 0x5143 0x6050002

    bug_storage_buffer_no_l1 = false;
    bug_corrupted_online_pipeline_cache = false;
    bug_implicit_fp16_arithmetic = false;
    bug_buffer_image_load_zero = false;

    if (physicalDeviceProperties.vendorID == 0x5143 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 0, 66))
    {
        // qcom adreno with old buggy driver cannot share created pipeline properly
        bug_corrupted_online_pipeline_cache = true;
    }

    if (physicalDeviceProperties.vendorID == 0x5143 && !(physicalDeviceProperties.deviceID == 0x6040001 || physicalDeviceProperties.deviceID == 0x6050002))
    {
        // NOTE but qcom855/qcom855plus/qcom865 are known exceptions
        // qcom adreno storage buffer without L1 cache
        bug_storage_buffer_no_l1 = true;
    }

    if (physicalDeviceProperties.vendorID == 0x5143 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 1, 87))
    {
        // HACK buffer2image before image-read dependency does not work properly
        // even promised with full image memory barrier on old adreno driver
        // TODO figure out a proper workaround without hurt speed too much
        // TODO only for old drivers
        bug_buffer_image_load_zero = true;
    }

    if (physicalDeviceProperties.vendorID == 0x13b5
            && (physicalDeviceProperties.deviceID == 0x7500001
                || physicalDeviceProperties.deviceID == 0x7501000
                || physicalDeviceProperties.deviceID == 0x8602000
                || physicalDeviceProperties.deviceID == 0x8800020
                || physicalDeviceProperties.deviceID == 0x70930000
                || physicalDeviceProperties.deviceID == 0x70901010
                || physicalDeviceProperties.deviceID == 0x72120000
                || physicalDeviceProperties.deviceID == 0x74021000
                || physicalDeviceProperties.deviceID == 0x60a00002
                || physicalDeviceProperties.deviceID == 0x62210001))
    {
        // NOTE rk3288/rk3399/t880/g31/g51/g52/g71/g72
        // however, g76/g77 has explicit fp16 arithmetic
        // arm mali driver accept spirv with fp16 arithmetic
        bug_implicit_fp16_arithmetic = true;
    }

    if (physicalDeviceProperties.vendorID == 0x5143
            && (physicalDeviceProperties.deviceID == 0x6030001
                || physicalDeviceProperties.deviceID == 0x6040001
                || physicalDeviceProperties.deviceID == 0x6050002))
    {
        // TODO enable devices other than qcom845/qcom855/qcom855plus/qcom865
        // qcom adreno driver accept spirv with fp16 arithmetic
        bug_implicit_fp16_arithmetic = true;
    }
}

static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
{
    // first try, compute only queue
    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
    {
        const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

        if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
                && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
        {
            return i;
        }
    }

    // second try, any queue with compute and graphics
    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
    {
        const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

        if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
                && (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
        {
            return i;
        }
    }

    // third try, any queue with compute
    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
    {
        const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

        if (queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
        {
            return i;
        }
    }

    //     NCNN_LOGE("no compute queue");
    return -1;
}

static uint32_t find_device_transfer_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties)
{
    // first try, transfer only queue
    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
    {
        const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

        if ((queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT)
                && !(queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
                && !(queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
        {
            return i;
        }
    }

    // second try, any queue with transfer
    for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
    {
        const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];

        if (queueFamilyProperty.queueFlags & VK_QUEUE_TRANSFER_BIT)
        {
            return i;
        }
    }

    // third try, use compute queue
    uint32_t compute_queue_index = find_device_compute_queue(queueFamilyProperties);
    if (compute_queue_index != (uint32_t)-1)
    {
        return compute_queue_index;
    }

    //     NCNN_LOGE("no transfer queue");
    return -1;
}

void GpuInfoPrivate::query_queue_properties()
{
    // find compute queue
    uint32_t queueFamilyPropertiesCount;
    vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, 0);

    std::vector<VkQueueFamilyProperties> queueFamilyProperties(queueFamilyPropertiesCount);
    vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, queueFamilyProperties.data());

    compute_queue_family_index = find_device_compute_queue(queueFamilyProperties);
    transfer_queue_family_index = find_device_transfer_queue(queueFamilyProperties);

    compute_queue_count = queueFamilyProperties[compute_queue_family_index].queueCount;
    transfer_queue_count = queueFamilyProperties[transfer_queue_family_index].queueCount;

    unified_compute_transfer_queue = compute_queue_family_index == transfer_queue_family_index;
}

void GpuInfoPrivate::query_memory_properties()
{
    // cache memory properties
    vkGetPhysicalDeviceMemoryProperties(physicalDevice, &physicalDeviceMemoryProperties);

    if (type == 0)
    {
        // discrete gpu
        resizable_bar_enabled = false;

        // find heap that is device local and host visible and not host cached
        for (uint32_t i = 0; i < physicalDeviceMemoryProperties.memoryHeapCount; i++)
        {
            const VkMemoryHeap& memoryHeap = physicalDeviceMemoryProperties.memoryHeaps[i];
            if (memoryHeap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT)
            {
                VkFlags required = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
                VkFlags disallowed = VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
                for (uint32_t j = 0; j < physicalDeviceMemoryProperties.memoryTypeCount; j++)
                {
                    const VkMemoryType& memoryType = physicalDeviceMemoryProperties.memoryTypes[j];
                    if (memoryType.heapIndex != i)
                        continue;

                    if ((memoryType.propertyFlags & disallowed) != 0)
                    {
                        // some driver treats a portion of host memory as device local heap, do not select this option
                        resizable_bar_enabled = false;
                        break;
                    }

                    if ((memoryType.propertyFlags & required) == required)
                    {
                        resizable_bar_enabled = true;
                    }
                }

                // subsequent device local heap is no longer considered
                // amd may declare a small device local + host visible heap for uploading
                // resizable bar feature is for the main device heap anyway
                break;
            }
        }
    }
    else
    {
        // integrated gpu
        resizable_bar_enabled = true;
    }
}

// Enumerate the device extensions of physicalDevice into deviceExtensionProperties
// and record the reported specVersion of every extension we care about in the
// matching support_VK_* field. Fields of extensions that are not reported are 0.
// Returns 0 on success and -1 when the enumeration fails. The support_VK_* fields
// are left untouched on failure.
int GpuInfoPrivate::query_extensions()
{
    // get device extension, count first and the records second
    uint32_t deviceExtensionPropertyCount = 0;
    VkResult ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, NULL);
    if (ret == VK_SUCCESS)
    {
        deviceExtensionProperties.resize(deviceExtensionPropertyCount);
        ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, deviceExtensionProperties.data());
    }
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkEnumerateDeviceExtensionProperties failed %d", ret);
        return -1;
    }

    // extension name -> field that receives its specVersion
    struct known_extension
    {
        const char* name;
        int* spec_version;
    };

    const known_extension known_extensions[] = {
        {"VK_KHR_8bit_storage", &support_VK_KHR_8bit_storage},
        {"VK_KHR_16bit_storage", &support_VK_KHR_16bit_storage},
        {"VK_KHR_bind_memory2", &support_VK_KHR_bind_memory2},
        {"VK_KHR_buffer_device_address", &support_VK_KHR_buffer_device_address},
        {"VK_KHR_create_renderpass2", &support_VK_KHR_create_renderpass2},
        {"VK_KHR_cooperative_matrix", &support_VK_KHR_cooperative_matrix},
        {"VK_KHR_dedicated_allocation", &support_VK_KHR_dedicated_allocation},
        {"VK_KHR_descriptor_update_template", &support_VK_KHR_descriptor_update_template},
        {"VK_KHR_driver_properties", &support_VK_KHR_driver_properties},
        {"VK_KHR_external_memory", &support_VK_KHR_external_memory},
        {"VK_KHR_get_memory_requirements2", &support_VK_KHR_get_memory_requirements2},
        {"VK_KHR_maintenance1", &support_VK_KHR_maintenance1},
        {"VK_KHR_maintenance2", &support_VK_KHR_maintenance2},
        {"VK_KHR_maintenance3", &support_VK_KHR_maintenance3},
        {"VK_KHR_multiview", &support_VK_KHR_multiview},
        {"VK_KHR_portability_subset", &support_VK_KHR_portability_subset},
        {"VK_KHR_push_descriptor", &support_VK_KHR_push_descriptor},
        {"VK_KHR_robustness2", &support_VK_KHR_robustness2},
        {"VK_KHR_sampler_ycbcr_conversion", &support_VK_KHR_sampler_ycbcr_conversion},
        {"VK_KHR_shader_bfloat16", &support_VK_KHR_shader_bfloat16},
        {"VK_KHR_shader_float16_int8", &support_VK_KHR_shader_float16_int8},
        {"VK_KHR_shader_float_controls", &support_VK_KHR_shader_float_controls},
        {"VK_KHR_shader_float_controls2", &support_VK_KHR_shader_float_controls2},
        {"VK_KHR_shader_integer_dot_product", &support_VK_KHR_shader_integer_dot_product},
        {"VK_KHR_shader_non_semantic_info", &support_VK_KHR_shader_non_semantic_info},
        {"VK_KHR_shader_subgroup_extended_types", &support_VK_KHR_shader_subgroup_extended_types},
        {"VK_KHR_shader_subgroup_rotate", &support_VK_KHR_shader_subgroup_rotate},
        {"VK_KHR_storage_buffer_storage_class", &support_VK_KHR_storage_buffer_storage_class},
        {"VK_KHR_swapchain", &support_VK_KHR_swapchain},
        {"VK_KHR_vulkan_memory_model", &support_VK_KHR_vulkan_memory_model},
        {"VK_KHR_zero_initialize_workgroup_memory", &support_VK_KHR_zero_initialize_workgroup_memory},
        {"VK_EXT_buffer_device_address", &support_VK_EXT_buffer_device_address},
        {"VK_EXT_descriptor_indexing", &support_VK_EXT_descriptor_indexing},
        {"VK_EXT_external_memory_host", &support_VK_EXT_external_memory_host},
        {"VK_EXT_memory_budget", &support_VK_EXT_memory_budget},
        {"VK_EXT_memory_priority", &support_VK_EXT_memory_priority},
        {"VK_EXT_queue_family_foreign", &support_VK_EXT_queue_family_foreign},
        {"VK_EXT_robustness2", &support_VK_EXT_robustness2},
        {"VK_EXT_shader_atomic_float", &support_VK_EXT_shader_atomic_float},
        {"VK_EXT_shader_atomic_float2", &support_VK_EXT_shader_atomic_float2},
        {"VK_EXT_shader_float8", &support_VK_EXT_shader_float8},
        {"VK_EXT_subgroup_size_control", &support_VK_EXT_subgroup_size_control},
        {"VK_AMD_device_coherent_memory", &support_VK_AMD_device_coherent_memory},
#if __ANDROID_API__ >= 26
        {"VK_ANDROID_external_memory_android_hardware_buffer", &support_VK_ANDROID_external_memory_android_hardware_buffer},
#endif // __ANDROID_API__ >= 26
        {"VK_NV_cooperative_matrix", &support_VK_NV_cooperative_matrix},
        {"VK_NV_cooperative_matrix2", &support_VK_NV_cooperative_matrix2},
        {"VK_NV_cooperative_vector", &support_VK_NV_cooperative_vector}
    };
    const int known_extension_count = sizeof(known_extensions) / sizeof(known_extensions[0]);

    // extension capability, everything is unsupported until the driver says otherwise
    for (int k = 0; k < known_extension_count; k++)
    {
        *known_extensions[k].spec_version = 0;
    }

    for (uint32_t j = 0; j < deviceExtensionPropertyCount; j++)
    {
        const VkExtensionProperties& exp = deviceExtensionProperties[j];
        // NCNN_LOGE("device extension %s = %u", exp.extensionName, exp.specVersion);

        // the names in the table are unique, so the first hit is the only hit
        for (int k = 0; k < known_extension_count; k++)
        {
            if (strcmp(exp.extensionName, known_extensions[k].name) != 0)
                continue;

            *known_extensions[k].spec_version = exp.specVersion;
            break;
        }
    }

    // we prefer khr extension over the ext / nv one when both are present
    if (support_VK_KHR_buffer_device_address)
        support_VK_EXT_buffer_device_address = 0;

    if (support_VK_KHR_cooperative_matrix)
        support_VK_NV_cooperative_matrix = 0;

    if (support_VK_KHR_robustness2)
        support_VK_EXT_robustness2 = 0;

    return 0;
}

// Query the optional (extension) feature structures of physicalDevice.
//
// Every query*Features member is zeroed and tagged with its sType first, so a
// structure that is never linked into the chain keeps all feature flags at zero.
// A structure is pushed onto the front of the queryExtensionFeatures pNext chain
// only when the matching support_VK_* flag is set. The whole chain is then
// filled by one vkGetPhysicalDeviceFeatures2KHR call, which is only made when
// support_VK_KHR_get_physical_device_properties2 is set. Finally a set of
// vendor/driver specific overrides is applied on top of the reported values.
void GpuInfoPrivate::query_extension_features()
{
    // head of the pNext chain, null means no extension structure is linked yet
    queryExtensionFeatures = 0;

    // query int8 storage
    memset(&query8BitStorageFeatures, 0, sizeof(query8BitStorageFeatures));
    query8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR;
    query8BitStorageFeatures.pNext = 0;
    if (support_VK_KHR_8bit_storage)
    {
        // prepend to the chain, the same pattern repeats for every structure below
        query8BitStorageFeatures.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &query8BitStorageFeatures;
    }

    // query fp16/int16 storage
    memset(&query16BitStorageFeatures, 0, sizeof(query16BitStorageFeatures));
    query16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR;
    query16BitStorageFeatures.pNext = 0;
    if (support_VK_KHR_16bit_storage)
    {
        query16BitStorageFeatures.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &query16BitStorageFeatures;
    }

    // query fp16/int8 arithmetic
    memset(&queryFloat16Int8Features, 0, sizeof(queryFloat16Int8Features));
    queryFloat16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR;
    queryFloat16Int8Features.pNext = 0;
    if (support_VK_KHR_shader_float16_int8)
    {
        queryFloat16Int8Features.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &queryFloat16Int8Features;
    }

    // query ycbcr_conversion
    memset(&querySamplerYcbcrConversionFeatures, 0, sizeof(querySamplerYcbcrConversionFeatures));
    querySamplerYcbcrConversionFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES_KHR;
    querySamplerYcbcrConversionFeatures.pNext = 0;
    if (support_VK_KHR_sampler_ycbcr_conversion)
    {
        querySamplerYcbcrConversionFeatures.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &querySamplerYcbcrConversionFeatures;
    }

    // query cooperative_matrix
    memset(&queryCooperativeMatrixFeatures, 0, sizeof(queryCooperativeMatrixFeatures));
    queryCooperativeMatrixFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
    queryCooperativeMatrixFeatures.pNext = 0;
    if (support_VK_KHR_cooperative_matrix)
    {
        queryCooperativeMatrixFeatures.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &queryCooperativeMatrixFeatures;
    }

    // query nv cooperative matrix
    memset(&queryCooperativeMatrixFeaturesNV, 0, sizeof(queryCooperativeMatrixFeaturesNV));
    queryCooperativeMatrixFeaturesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV;
    queryCooperativeMatrixFeaturesNV.pNext = 0;
    if (support_VK_NV_cooperative_matrix)
    {
        queryCooperativeMatrixFeaturesNV.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &queryCooperativeMatrixFeaturesNV;
    }

    // query nv cooperative matrix2
    memset(&queryCooperativeMatrix2FeaturesNV, 0, sizeof(queryCooperativeMatrix2FeaturesNV));
    queryCooperativeMatrix2FeaturesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_FEATURES_NV;
    queryCooperativeMatrix2FeaturesNV.pNext = 0;
    if (support_VK_NV_cooperative_matrix2)
    {
        queryCooperativeMatrix2FeaturesNV.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &queryCooperativeMatrix2FeaturesNV;
    }

    // query nv cooperative vector
    memset(&queryCooperativeVectorFeaturesNV, 0, sizeof(queryCooperativeVectorFeaturesNV));
    queryCooperativeVectorFeaturesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_VECTOR_FEATURES_NV;
    queryCooperativeVectorFeaturesNV.pNext = 0;
    if (support_VK_NV_cooperative_vector)
    {
        queryCooperativeVectorFeaturesNV.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &queryCooperativeVectorFeaturesNV;
    }

    // query robustness2
    // one structure serves both the KHR and the EXT flavour of the extension
    memset(&queryRobustness2Features, 0, sizeof(queryRobustness2Features));
    queryRobustness2Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_KHR;
    queryRobustness2Features.pNext = 0;
    if (support_VK_KHR_robustness2 || support_VK_EXT_robustness2)
    {
        queryRobustness2Features.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &queryRobustness2Features;
    }

    // query bfloat16
    memset(&queryShaderBfloat16Features, 0, sizeof(queryShaderBfloat16Features));
    queryShaderBfloat16Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_BFLOAT16_FEATURES_KHR;
    queryShaderBfloat16Features.pNext = 0;
    if (support_VK_KHR_shader_bfloat16)
    {
        queryShaderBfloat16Features.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &queryShaderBfloat16Features;
    }

    // query float8
    memset(&queryShaderFloat8Features, 0, sizeof(queryShaderFloat8Features));
    queryShaderFloat8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT8_FEATURES_EXT;
    queryShaderFloat8Features.pNext = 0;
    if (support_VK_EXT_shader_float8)
    {
        queryShaderFloat8Features.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &queryShaderFloat8Features;
    }

    // query float controls 2
    memset(&queryShaderFloatControls2Features, 0, sizeof(queryShaderFloatControls2Features));
    queryShaderFloatControls2Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_FLOAT_CONTROLS_2_FEATURES_KHR;
    queryShaderFloatControls2Features.pNext = 0;
    if (support_VK_KHR_shader_float_controls2)
    {
        queryShaderFloatControls2Features.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &queryShaderFloatControls2Features;
    }

    // query integer dot product
    memset(&queryShaderIntegerDotProductFeatures, 0, sizeof(queryShaderIntegerDotProductFeatures));
    queryShaderIntegerDotProductFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_FEATURES_KHR;
    queryShaderIntegerDotProductFeatures.pNext = 0;
    if (support_VK_KHR_shader_integer_dot_product)
    {
        queryShaderIntegerDotProductFeatures.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &queryShaderIntegerDotProductFeatures;
    }

    // query subgroup size control
    // NOTE(review): only linked when the reported extension specVersion is at least 2,
    // presumably because earlier revisions lack this feature structure - confirm
    memset(&querySubgroupSizeControlFeatures, 0, sizeof(querySubgroupSizeControlFeatures));
    querySubgroupSizeControlFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT;
    querySubgroupSizeControlFeatures.pNext = 0;
    if (support_VK_EXT_subgroup_size_control >= 2)
    {
        querySubgroupSizeControlFeatures.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &querySubgroupSizeControlFeatures;
    }

    // query subgroup rotate
    memset(&queryShaderSubgroupRotateFeatures, 0, sizeof(queryShaderSubgroupRotateFeatures));
    queryShaderSubgroupRotateFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_SUBGROUP_ROTATE_FEATURES_KHR;
    queryShaderSubgroupRotateFeatures.pNext = 0;
    if (support_VK_KHR_shader_subgroup_rotate)
    {
        queryShaderSubgroupRotateFeatures.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &queryShaderSubgroupRotateFeatures;
    }

    // query atomic float
    memset(&queryShaderAtomicFloatFeatures, 0, sizeof(queryShaderAtomicFloatFeatures));
    queryShaderAtomicFloatFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT_FEATURES_EXT;
    queryShaderAtomicFloatFeatures.pNext = 0;
    if (support_VK_EXT_shader_atomic_float)
    {
        queryShaderAtomicFloatFeatures.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &queryShaderAtomicFloatFeatures;
    }

    // query atomic float2
    memset(&queryShaderAtomicFloat2Features, 0, sizeof(queryShaderAtomicFloat2Features));
    queryShaderAtomicFloat2Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_ATOMIC_FLOAT_2_FEATURES_EXT;
    queryShaderAtomicFloat2Features.pNext = 0;
    if (support_VK_EXT_shader_atomic_float2)
    {
        queryShaderAtomicFloat2Features.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &queryShaderAtomicFloat2Features;
    }

    // query vulkan memory model
    memset(&queryVulkanMemoryModelFeatures, 0, sizeof(queryVulkanMemoryModelFeatures));
    queryVulkanMemoryModelFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_MEMORY_MODEL_FEATURES_KHR;
    queryVulkanMemoryModelFeatures.pNext = 0;
    if (support_VK_KHR_vulkan_memory_model)
    {
        queryVulkanMemoryModelFeatures.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &queryVulkanMemoryModelFeatures;
    }

    // fill every linked structure in one call
    // without this extension nothing is queried and all flags stay zero
    if (support_VK_KHR_get_physical_device_properties2)
    {
        VkPhysicalDeviceFeatures2KHR queryFeatures;
        queryFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR;
        queryFeatures.pNext = queryExtensionFeatures;

        vkGetPhysicalDeviceFeatures2KHR(physicalDevice, &queryFeatures);
    }

    // apply known blacklist
    // the overrides below run in order, later ones read values written by earlier ones
    if (physicalDeviceProperties.vendorID == 0x13b5 && physicalDeviceProperties.apiVersion < VK_MAKE_VERSION(1, 0, 82))
    {
        // the 16bit_storage implementation of arm mali driver is buggy :[
        query16BitStorageFeatures.storageBuffer16BitAccess = VK_FALSE;
    }

    if (physicalDeviceProperties.vendorID == 0x10002 && physicalDeviceProperties.deviceID == 0x70006214 && physicalDeviceProperties.apiVersion == VK_MAKE_VERSION(1, 1, 82))
    {
        // the 16bit_storage implementation of vivante gc1700 driver is buggy :[
        query16BitStorageFeatures.storageBuffer16BitAccess = VK_FALSE;
    }

    if (bug_implicit_fp16_arithmetic)
    {
        // force capability on as long as the driver accept spirv with fp16 arithmetic :D
        queryFloat16Int8Features.shaderFloat16 = VK_TRUE;
    }

    // this check comes after the forced enable above, so it wins when both conditions hold
    if (physicalDeviceProperties.vendorID == 0x5143 && !query16BitStorageFeatures.storageBuffer16BitAccess)
    {
        // fp16 arithmetic yields wrong result on old adreno drivers :(
        queryFloat16Int8Features.shaderFloat16 = VK_FALSE;
    }

    if (physicalDeviceProperties.vendorID == 0x1002)
    {
        // emulated cooperative matrix on amd rdna2 is slow
        // both the KHR and the NV cooperative matrix flags are cleared for the listed device ids
        switch (physicalDeviceProperties.deviceID)
        {
        case 0x73a1: // V620
        case 0x73a2: // W6900X
        case 0x73a3: // W6800
        case 0x73a4: // NAVI21-USB
        case 0x73a5: // 6950XT
        case 0x73ab: // W6800X/W6800X-DUO
        case 0x73ae: // V620-MX
        case 0x73af: // 6900XT
        case 0x73bf: // 6800/6800XT/6900XT
        case 0x73c3: // NAVI22-?
        case 0x73c4: // NAVI22-USB
        case 0x73df: // 6700/6700XT/6750XT/6750GRE-12G/6800M/6850MXT
        case 0x73e0: // NAVI23-?
        case 0x73e1: // W6600M
        case 0x73e3: // W6600
        case 0x73e4: // NAVI23-USB
        case 0x73ef: // 6650XT/6700S/6800S
        case 0x73ff: // 6600/6600XT/6750GRE-10G/6600M
        case 0x7421: // W6500M
        case 0x7422: // W6400
        case 0x7423: // W6300/W6300M
        case 0x7424: // 6300
        case 0x743f: // 6400/6500XT/6500M
        {
            queryCooperativeMatrixFeatures.cooperativeMatrix = VK_FALSE;
            queryCooperativeMatrixFeaturesNV.cooperativeMatrix = VK_FALSE;
            break;
        }
        default:
            break;
        }
    }
}

void GpuInfoPrivate::evaluate_rough_score()
{
    rough_score = 0;

    // device type score
    if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU)
        rough_score += 50;
    if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU)
        rough_score += 5;
    if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU)
        rough_score += 4;

    // simd width score
    rough_score += querySubgroupProperties.subgroupSize / 32;

    // extension score
    for (size_t i = 0; i < deviceExtensionProperties.size(); i++)
    {
        const VkExtensionProperties& exp = deviceExtensionProperties[i];

        if (strcmp(exp.extensionName, "VK_KHR_cooperative_matrix") == 0)
            rough_score += 10;
        else if (strcmp(exp.extensionName, "VK_KHR_shader_bfloat16") == 0)
            rough_score += 2;
        else if (strcmp(exp.extensionName, "VK_KHR_shader_integer_dot_product") == 0)
            rough_score += 2;
        else if (strcmp(exp.extensionName, "VK_KHR_shader_float16_int8") == 0)
            rough_score += 2;
        else if (strcmp(exp.extensionName, "VK_EXT_shader_float8") == 0)
            rough_score += 2;
    }

    // device local heap size score
    if (physicalDeviceProperties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU)
    {
        VkDeviceSize max_device_local = 0;
        for (uint32_t i = 0; i < physicalDeviceMemoryProperties.memoryHeapCount; i++)
        {
            const VkMemoryHeap& memoryHeap = physicalDeviceMemoryProperties.memoryHeaps[i];
            if (memoryHeap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT)
            {
                max_device_local = std::max(max_device_local, memoryHeap.size);
            }
        }
        uint32_t mem_gb = max_device_local / (1024 * 1024 * 1024);
        rough_score += mem_gb;
    }
}

// Fallback subgroup size for a PCI vendor id, used when the driver does not
// report a usable value. Unknown vendors get 64.
static int get_vendor_default_subgroup_size(uint32_t vendorID)
{
    switch (vendorID)
    {
    case 0x5143: // qcom adreno
        return 128;
    case 0x13b5: // arm mali
        return 16;
    case 0x1010: // imgtec powervr
    case 0x10de: // nvidia
    case 0x8086: // intel
        return 32;
    case 0x1002: // amd
    default: // everything else defaults to 64
        return 64;
    }
}

// Query the optional (extension) property structures of physicalDevice.
//
// Every query*Properties member is zeroed and tagged with its sType first. A
// structure is pushed onto the front of the queryExtensionProperties pNext
// chain only when the matching extension (or instance version, for subgroup
// properties) is available, and the chain is filled by a single
// vkGetPhysicalDeviceProperties2KHR call when
// support_VK_KHR_get_physical_device_properties2 is set.
//
// Afterwards the subgroup size is sanitized, subgroup size control limits are
// synthesized when the extension is missing, and the cooperative matrix /
// cooperative vector capability lists are enumerated.
//
// This reads queryShaderSubgroupRotateFeatures, queryCooperativeMatrixFeatures,
// queryCooperativeMatrixFeaturesNV, queryCooperativeMatrix2FeaturesNV and
// queryCooperativeVectorFeaturesNV, which are filled by query_extension_features().
void GpuInfoPrivate::query_extension_properties()
{
    // head of the pNext chain, null means no extension structure is linked yet
    queryExtensionProperties = 0;

    // query float controls
    memset(&queryFloatControlsProperties, 0, sizeof(queryFloatControlsProperties));
    queryFloatControlsProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT_CONTROLS_PROPERTIES;
    queryFloatControlsProperties.pNext = 0;
    if (support_VK_KHR_shader_float_controls)
    {
        queryFloatControlsProperties.pNext = queryExtensionProperties;
        queryExtensionProperties = &queryFloatControlsProperties;
    }

    // query integer dot product
    memset(&queryShaderIntegerDotProductProperties, 0, sizeof(queryShaderIntegerDotProductProperties));
    queryShaderIntegerDotProductProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SHADER_INTEGER_DOT_PRODUCT_PROPERTIES_KHR;
    queryShaderIntegerDotProductProperties.pNext = 0;
    if (support_VK_KHR_shader_integer_dot_product)
    {
        queryShaderIntegerDotProductProperties.pNext = queryExtensionProperties;
        queryExtensionProperties = &queryShaderIntegerDotProductProperties;
    }

    // query subgroup
    memset(&querySubgroupProperties, 0, sizeof(querySubgroupProperties));
    querySubgroupProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES;
    querySubgroupProperties.pNext = 0;
    if (VK_VERSION_MAJOR(g_instance.instance_api_version) >= 1 && VK_VERSION_MINOR(g_instance.instance_api_version) >= 1)
    {
        querySubgroupProperties.pNext = queryExtensionProperties;
        queryExtensionProperties = &querySubgroupProperties;
    }
    else
    {
        // subgroup properties can not be queried here, guess from the vendor
        querySubgroupProperties.subgroupSize = get_vendor_default_subgroup_size(physicalDeviceProperties.vendorID);
    }

    // query driver properties
    memset(&queryDriverProperties, 0, sizeof(queryDriverProperties));
    queryDriverProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR;
    queryDriverProperties.pNext = 0;
    if (support_VK_KHR_driver_properties)
    {
        queryDriverProperties.pNext = queryExtensionProperties;
        queryExtensionProperties = &queryDriverProperties;
    }

    // query robustness2
    memset(&queryRobustness2Properties, 0, sizeof(queryRobustness2Properties));
    queryRobustness2Properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_PROPERTIES_KHR;
    queryRobustness2Properties.pNext = 0;
    if (support_VK_KHR_robustness2 || support_VK_EXT_robustness2)
    {
        queryRobustness2Properties.pNext = queryExtensionProperties;
        queryExtensionProperties = &queryRobustness2Properties;
    }

    // query subgroup size control
    memset(&querySubgroupSizeControlProperties, 0, sizeof(querySubgroupSizeControlProperties));
    querySubgroupSizeControlProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES_EXT;
    querySubgroupSizeControlProperties.pNext = 0;
    if (support_VK_EXT_subgroup_size_control)
    {
        querySubgroupSizeControlProperties.pNext = queryExtensionProperties;
        queryExtensionProperties = &querySubgroupSizeControlProperties;
    }

    // query external memory host
    memset(&queryExternalMemoryHostProperties, 0, sizeof(queryExternalMemoryHostProperties));
    queryExternalMemoryHostProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_MEMORY_HOST_PROPERTIES_EXT;
    queryExternalMemoryHostProperties.pNext = 0;
    if (support_VK_EXT_external_memory_host)
    {
        queryExternalMemoryHostProperties.pNext = queryExtensionProperties;
        queryExtensionProperties = &queryExternalMemoryHostProperties;
    }

    // query nv cooperative matrix2
    memset(&queryCooperativeMatrix2PropertiesNV, 0, sizeof(queryCooperativeMatrix2PropertiesNV));
    queryCooperativeMatrix2PropertiesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_PROPERTIES_NV;
    queryCooperativeMatrix2PropertiesNV.pNext = 0;
    if (support_VK_NV_cooperative_matrix2)
    {
        queryCooperativeMatrix2PropertiesNV.pNext = queryExtensionProperties;
        queryExtensionProperties = &queryCooperativeMatrix2PropertiesNV;
    }

    // query nv cooperative vector
    memset(&queryCooperativeVectorPropertiesNV, 0, sizeof(queryCooperativeVectorPropertiesNV));
    queryCooperativeVectorPropertiesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_VECTOR_PROPERTIES_NV;
    queryCooperativeVectorPropertiesNV.pNext = 0;
    if (support_VK_NV_cooperative_vector)
    {
        queryCooperativeVectorPropertiesNV.pNext = queryExtensionProperties;
        queryExtensionProperties = &queryCooperativeVectorPropertiesNV;
    }

    // fill every linked structure in one call
    if (support_VK_KHR_get_physical_device_properties2)
    {
        VkPhysicalDeviceProperties2KHR queryProperties;
        queryProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR;
        queryProperties.pNext = queryExtensionProperties;

        vkGetPhysicalDeviceProperties2KHR(physicalDevice, &queryProperties);

        // append subgroup rotate
        if (support_VK_KHR_shader_subgroup_rotate)
        {
            if (queryShaderSubgroupRotateFeatures.shaderSubgroupRotate)
                querySubgroupProperties.supportedOperations |= VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR;
            if (queryShaderSubgroupRotateFeatures.shaderSubgroupRotateClustered)
                querySubgroupProperties.supportedOperations |= VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR;
        }
    }

    // Avoid invalid subgroup size
    // This check must run even when the properties2 query above was skipped:
    // in that case a chained querySubgroupProperties still holds the zero from
    // memset, and subgroupSize is used as a divisor right below.
    {
        const uint32_t subgroup_size = querySubgroupProperties.subgroupSize;
        bool is_subgroup_size_valid = (subgroup_size > 0) && ((subgroup_size & (subgroup_size - 1)) == 0);
        if (!is_subgroup_size_valid)
        {
            querySubgroupProperties.subgroupSize = get_vendor_default_subgroup_size(physicalDeviceProperties.vendorID);
        }
    }

    if (!support_VK_EXT_subgroup_size_control)
    {
        // no size control, the only usable subgroup size is the default one
        querySubgroupSizeControlProperties.minSubgroupSize = querySubgroupProperties.subgroupSize;
        querySubgroupSizeControlProperties.maxSubgroupSize = querySubgroupProperties.subgroupSize;
        querySubgroupSizeControlProperties.maxComputeWorkgroupSubgroups = std::max(physicalDeviceProperties.limits.maxComputeWorkGroupInvocations / querySubgroupProperties.subgroupSize, 1u);
    }

    // query supported cooperative matrix types and operations
    queryCooperativeMatrixSubProperties.clear();
    queryCooperativeMatrixSubPropertiesNV.clear();
    support_cooperative_matrix_8_8_16 = false;
    support_cooperative_matrix_16_8_8 = false;
    support_cooperative_matrix_16_8_16 = false;
    support_cooperative_matrix_16_16_16 = false;
    support_bf16_cooperative_matrix = false;
    if (support_VK_KHR_cooperative_matrix && queryCooperativeMatrixFeatures.cooperativeMatrix)
    {
        // first call retrieves the count, second call fills the array
        uint32_t propertyCount = 0;
        VkResult ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(physicalDevice, &propertyCount, 0);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR failed %d", ret);
        }

        queryCooperativeMatrixSubProperties.resize(propertyCount);
        for (uint32_t j = 0; j < propertyCount; j++)
        {
            memset(&queryCooperativeMatrixSubProperties[j], 0, sizeof(queryCooperativeMatrixSubProperties[j]));
            queryCooperativeMatrixSubProperties[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR;
            queryCooperativeMatrixSubProperties[j].pNext = 0;
        }
        ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(physicalDevice, &propertyCount, queryCooperativeMatrixSubProperties.data());
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR failed %d", ret);
        }

        // look for fp16 x fp16 -> fp32 subgroup scope shapes, and any bf16 x bf16 -> fp32 shape
        for (uint32_t j = 0; j < propertyCount; j++)
        {
            const VkCooperativeMatrixPropertiesKHR& cmp = queryCooperativeMatrixSubProperties[j];
            // NCNN_LOGE("cpm %2d %2d %2d  %d %d %d %d  %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope);

            if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
                    && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
            {
                support_cooperative_matrix_8_8_16 = true;
            }
            if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
                    && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
            {
                support_cooperative_matrix_16_8_8 = true;
            }
            if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16
                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
                    && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
            {
                support_cooperative_matrix_16_8_16 = true;
            }
            if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16
                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
                    && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
            {
                support_cooperative_matrix_16_16_16 = true;
            }

            if (cmp.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR
                    && cmp.scope == VK_SCOPE_SUBGROUP_KHR)
            {
                support_bf16_cooperative_matrix = true;
            }
        }
    }
    else if (support_VK_NV_cooperative_matrix && queryCooperativeMatrixFeaturesNV.cooperativeMatrix)
    {
        // nv flavour is only consulted when the khr one is unavailable or disabled
        uint32_t propertyCount = 0;
        VkResult ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, 0);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret);
        }

        queryCooperativeMatrixSubPropertiesNV.resize(propertyCount);
        for (uint32_t j = 0; j < propertyCount; j++)
        {
            memset(&queryCooperativeMatrixSubPropertiesNV[j], 0, sizeof(queryCooperativeMatrixSubPropertiesNV[j]));
            queryCooperativeMatrixSubPropertiesNV[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV;
            queryCooperativeMatrixSubPropertiesNV[j].pNext = 0;
        }
        ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, queryCooperativeMatrixSubPropertiesNV.data());
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret);
        }

        for (uint32_t j = 0; j < propertyCount; j++)
        {
            const VkCooperativeMatrixPropertiesNV& cmp = queryCooperativeMatrixSubPropertiesNV[j];
            // NCNN_LOGE("cpm %2d %2d %2d  %d %d %d %d  %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope);

            if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
                    && cmp.scope == VK_SCOPE_SUBGROUP_NV)
            {
                support_cooperative_matrix_8_8_16 = true;
            }
            if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8
                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
                    && cmp.scope == VK_SCOPE_SUBGROUP_NV)
            {
                support_cooperative_matrix_16_8_8 = true;
            }
            if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16
                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
                    && cmp.scope == VK_SCOPE_SUBGROUP_NV)
            {
                support_cooperative_matrix_16_8_16 = true;
            }
            if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16
                    && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV
                    && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV
                    && cmp.scope == VK_SCOPE_SUBGROUP_NV)
            {
                support_cooperative_matrix_16_16_16 = true;
            }
        }
    }

    // query supported cooperative matrix2 types and operations
    queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV.clear();
    if (support_VK_NV_cooperative_matrix2 && queryCooperativeMatrix2FeaturesNV.cooperativeMatrixFlexibleDimensions)
    {
        uint32_t propertyCount = 0;
        VkResult ret = vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(physicalDevice, &propertyCount, 0);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV failed %d", ret);
        }

        queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV.resize(propertyCount);
        for (uint32_t j = 0; j < propertyCount; j++)
        {
            memset(&queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV[j], 0, sizeof(queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV[j]));
            queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_FLEXIBLE_DIMENSIONS_PROPERTIES_NV;
            queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV[j].pNext = 0;
        }
        ret = vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV(physicalDevice, &propertyCount, queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV.data());
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV failed %d", ret);
        }

        // the entries are only stored, this loop exists for the debug log below
        for (uint32_t j = 0; j < propertyCount; j++)
        {
            const VkCooperativeMatrixFlexibleDimensionsPropertiesNV& cmfdp = queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV[j];
            // NCNN_LOGE("cmfdp %2d %2d %2d  %d %d %d %d  %d %d %d", cmfdp.MGranularity, cmfdp.NGranularity, cmfdp.KGranularity, cmfdp.AType, cmfdp.BType, cmfdp.CType, cmfdp.ResultType, cmfdp.saturatingAccumulation, cmfdp.scope, cmfdp.workgroupInvocations);
        }
    }

    // query supported cooperative vector types and operations
    queryCooperativeVectorSubPropertiesNV.clear();
    if (support_VK_NV_cooperative_vector && queryCooperativeVectorFeaturesNV.cooperativeVector)
    {
        uint32_t propertyCount = 0;
        VkResult ret = vkGetPhysicalDeviceCooperativeVectorPropertiesNV(physicalDevice, &propertyCount, 0);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkGetPhysicalDeviceCooperativeVectorPropertiesNV failed %d", ret);
        }

        queryCooperativeVectorSubPropertiesNV.resize(propertyCount);
        for (uint32_t j = 0; j < propertyCount; j++)
        {
            memset(&queryCooperativeVectorSubPropertiesNV[j], 0, sizeof(queryCooperativeVectorSubPropertiesNV[j]));
            queryCooperativeVectorSubPropertiesNV[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_VECTOR_PROPERTIES_NV;
            queryCooperativeVectorSubPropertiesNV[j].pNext = 0;
        }
        ret = vkGetPhysicalDeviceCooperativeVectorPropertiesNV(physicalDevice, &propertyCount, queryCooperativeVectorSubPropertiesNV.data());
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkGetPhysicalDeviceCooperativeVectorPropertiesNV failed %d", ret);
        }

        // the entries are only stored, this loop exists for the debug log below
        for (uint32_t j = 0; j < propertyCount; j++)
        {
            const VkCooperativeVectorPropertiesNV& cvp = queryCooperativeVectorSubPropertiesNV[j];
            // NCNN_LOGE("cvp %d %d %d %d %d  %d", cvp.inputType, cvp.inputInterpretation, cvp.matrixInterpretation, cvp.biasInterpretation, cvp.resultType, cvp.transpose);
        }
    }

    // driver specific override, it needs queryDriverProperties which is only known here
    if (queryDriverProperties.driverID == VK_DRIVER_ID_MESA_TURNIP)
    {
        // turnip crash when compiling large shader with full subgroup
        querySubgroupSizeControlFeatures.computeFullSubgroups = VK_FALSE;
    }
}

// Default constructor: allocates the private data object that the accessors read through d.
GpuInfo::GpuInfo()
    : d(new GpuInfoPrivate)
{
}

// Destructor: releases the private data object. Deleting a null d is a no-op.
GpuInfo::~GpuInfo()
{
    delete d;
}

// Copy constructor: copies nothing from the source object and leaves d null.
// NOTE(review): presumably copying is meant to be disabled (e.g. declared
// private in the header) - confirm. Any accessor called on such a copy would
// dereference a null pointer.
GpuInfo::GpuInfo(const GpuInfo&)
    : d(0)
{
}

// Copy assignment: a no-op that ignores the right-hand side, leaves the
// private data untouched and returns *this.
// NOTE(review): presumably assignment is meant to be disabled - confirm.
GpuInfo& GpuInfo::operator=(const GpuInfo&)
{
    return *this;
}

// Returns the device index stored in the private data.
int GpuInfo::device_index() const
{
    return d->device_index;
}

// Returns the VkPhysicalDevice handle stored in the private data.
VkPhysicalDevice GpuInfo::physicalDevice() const
{
    return d->physicalDevice;
}

// Snake-case spelling of physicalDevice(): returns the same stored VkPhysicalDevice handle.
VkPhysicalDevice GpuInfo::physical_device() const
{
    return d->physicalDevice;
}

// Returns a reference to the stored VkPhysicalDeviceFeatures; valid only while this object is alive.
const VkPhysicalDeviceFeatures& GpuInfo::physicalDevicefeatures() const
{
    return d->physicalDevicefeatures;
}

// Returns a reference to the stored VkPhysicalDeviceProperties; valid only while this object is alive.
const VkPhysicalDeviceProperties& GpuInfo::physicalDeviceProperties() const
{
    return d->physicalDeviceProperties;
}

// Returns a reference to the stored VkPhysicalDeviceMemoryProperties; valid only while this object is alive.
const VkPhysicalDeviceMemoryProperties& GpuInfo::physicalDeviceMemoryProperties() const
{
    return d->physicalDeviceMemoryProperties;
}

// Snake-case spelling of physicalDeviceMemoryProperties(): returns the same stored structure.
const VkPhysicalDeviceMemoryProperties& GpuInfo::physical_device_memory_properties() const
{
    return d->physicalDeviceMemoryProperties;
}

// Returns a reference to the stored list of VkExtensionProperties entries.
const std::vector<VkExtensionProperties>& GpuInfo::deviceExtensionProperties() const
{
    return d->deviceExtensionProperties;
}

// Returns physicalDeviceProperties.apiVersion.
uint32_t GpuInfo::api_version() const
{
    return d->physicalDeviceProperties.apiVersion;
}

// Returns physicalDeviceProperties.driverVersion.
uint32_t GpuInfo::driver_version() const
{
    return d->physicalDeviceProperties.driverVersion;
}

// Returns physicalDeviceProperties.vendorID.
uint32_t GpuInfo::vendor_id() const
{
    return d->physicalDeviceProperties.vendorID;
}

// Returns physicalDeviceProperties.deviceID.
uint32_t GpuInfo::device_id() const
{
    return d->physicalDeviceProperties.deviceID;
}

// Returns a pointer to physicalDeviceProperties.deviceName; the storage is owned by this object.
const char* GpuInfo::device_name() const
{
    return d->physicalDeviceProperties.deviceName;
}

// Returns a pointer to physicalDeviceProperties.pipelineCacheUUID.
// The pointer is non-const and aliases the stored array, so writes through it modify this object's data.
uint8_t* GpuInfo::pipeline_cache_uuid() const
{
    return d->physicalDeviceProperties.pipelineCacheUUID;
}

// Returns queryDriverProperties.driverID as a plain uint32_t.
uint32_t GpuInfo::driver_id() const
{
    return d->queryDriverProperties.driverID;
}

// Returns a pointer to queryDriverProperties.driverName; the storage is owned by this object.
const char* GpuInfo::driver_name() const
{
    return d->queryDriverProperties.driverName;
}

// Returns the device type value stored in the private data.
// NOTE(review): the meaning of each value is defined where d->type is assigned (not visible here).
int GpuInfo::type() const
{
    return d->type;
}

// Returns the rough score stored in the private data.
uint32_t GpuInfo::rough_score() const
{
    return d->rough_score;
}

// Returns the device limit maxComputeSharedMemorySize.
uint32_t GpuInfo::max_shared_memory_size() const
{
    return d->physicalDeviceProperties.limits.maxComputeSharedMemorySize;
}

// Returns the device limit maxComputeWorkGroupCount[0] (x dimension).
uint32_t GpuInfo::max_workgroup_count_x() const
{
    return d->physicalDeviceProperties.limits.maxComputeWorkGroupCount[0];
}

// Returns the device limit maxComputeWorkGroupCount[1] (y dimension).
uint32_t GpuInfo::max_workgroup_count_y() const
{
    return d->physicalDeviceProperties.limits.maxComputeWorkGroupCount[1];
}

// Returns the device limit maxComputeWorkGroupCount[2] (z dimension).
uint32_t GpuInfo::max_workgroup_count_z() const
{
    return d->physicalDeviceProperties.limits.maxComputeWorkGroupCount[2];
}

// Returns the device limit maxComputeWorkGroupInvocations.
uint32_t GpuInfo::max_workgroup_invocations() const
{
    return d->physicalDeviceProperties.limits.maxComputeWorkGroupInvocations;
}

// Returns the device limit maxComputeWorkGroupSize[0] (x dimension).
uint32_t GpuInfo::max_workgroup_size_x() const
{
    return d->physicalDeviceProperties.limits.maxComputeWorkGroupSize[0];
}

// Returns the device limit maxComputeWorkGroupSize[1] (y dimension).
uint32_t GpuInfo::max_workgroup_size_y() const
{
    return d->physicalDeviceProperties.limits.maxComputeWorkGroupSize[1];
}

// Returns the device limit maxComputeWorkGroupSize[2] (z dimension).
uint32_t GpuInfo::max_workgroup_size_z() const
{
    return d->physicalDeviceProperties.limits.maxComputeWorkGroupSize[2];
}

// Returns the device limit minMemoryMapAlignment.
size_t GpuInfo::memory_map_alignment() const
{
    return d->physicalDeviceProperties.limits.minMemoryMapAlignment;
}

// Returns the device limit minStorageBufferOffsetAlignment, converted to size_t.
size_t GpuInfo::buffer_offset_alignment() const
{
    return d->physicalDeviceProperties.limits.minStorageBufferOffsetAlignment;
}

// Returns the device limit nonCoherentAtomSize, converted to size_t.
size_t GpuInfo::non_coherent_atom_size() const
{
    return d->physicalDeviceProperties.limits.nonCoherentAtomSize;
}

// Returns the device limit bufferImageGranularity, converted to size_t.
size_t GpuInfo::buffer_image_granularity() const
{
    return d->physicalDeviceProperties.limits.bufferImageGranularity;
}

// Returns the device limit maxImageDimension1D.
uint32_t GpuInfo::max_image_dimension_1d() const
{
    return d->physicalDeviceProperties.limits.maxImageDimension1D;
}

// Returns the device limit maxImageDimension2D.
uint32_t GpuInfo::max_image_dimension_2d() const
{
    return d->physicalDeviceProperties.limits.maxImageDimension2D;
}

// Returns the device limit maxImageDimension3D.
uint32_t GpuInfo::max_image_dimension_3d() const
{
    return d->physicalDeviceProperties.limits.maxImageDimension3D;
}

// Returns the device limit timestampPeriod.
float GpuInfo::timestamp_period() const
{
    return d->physicalDeviceProperties.limits.timestampPeriod;
}

// Returns the stored compute queue family index.
uint32_t GpuInfo::compute_queue_family_index() const
{
    return d->compute_queue_family_index;
}

// Returns the stored transfer queue family index.
uint32_t GpuInfo::transfer_queue_family_index() const
{
    return d->transfer_queue_family_index;
}

// Returns the stored compute queue count.
uint32_t GpuInfo::compute_queue_count() const
{
    return d->compute_queue_count;
}

// Returns the stored transfer queue count.
uint32_t GpuInfo::transfer_queue_count() const
{
    return d->transfer_queue_count;
}

// Returns the stored unified_compute_transfer_queue flag.
bool GpuInfo::unified_compute_transfer_queue() const
{
    return d->unified_compute_transfer_queue;
}

// Returns the stored resizable_bar_enabled flag.
bool GpuInfo::resizable_bar_enabled() const
{
    return d->resizable_bar_enabled;
}

// Returns querySubgroupProperties.subgroupSize.
uint32_t GpuInfo::subgroup_size() const
{
    return d->querySubgroupProperties.subgroupSize;
}

// Returns querySubgroupSizeControlProperties.minSubgroupSize.
uint32_t GpuInfo::min_subgroup_size() const
{
    return d->querySubgroupSizeControlProperties.minSubgroupSize;
}

// Returns querySubgroupSizeControlProperties.maxSubgroupSize.
uint32_t GpuInfo::max_subgroup_size() const
{
    return d->querySubgroupSizeControlProperties.maxSubgroupSize;
}

// Returns querySubgroupSizeControlProperties.maxComputeWorkgroupSubgroups.
uint32_t GpuInfo::max_compute_workgroup_subgroups() const
{
    return d->querySubgroupSizeControlProperties.maxComputeWorkgroupSubgroups;
}

// Returns the subgroupSizeControl feature bit from querySubgroupSizeControlFeatures.
bool GpuInfo::support_subgroup_size_control() const
{
    return d->querySubgroupSizeControlFeatures.subgroupSizeControl;
}

// Returns the computeFullSubgroups feature bit from querySubgroupSizeControlFeatures.
bool GpuInfo::support_compute_full_subgroups() const
{
    return d->querySubgroupSizeControlFeatures.computeFullSubgroups;
}

// Returns querySubgroupProperties.supportedOperations as a raw uint32_t value.
uint32_t GpuInfo::support_subgroup_ops() const
{
    return d->querySubgroupProperties.supportedOperations;
}

// Returns the stored bug_storage_buffer_no_l1 workaround flag.
bool GpuInfo::bug_storage_buffer_no_l1() const
{
    return d->bug_storage_buffer_no_l1;
}

// Returns the stored bug_corrupted_online_pipeline_cache workaround flag.
bool GpuInfo::bug_corrupted_online_pipeline_cache() const
{
    return d->bug_corrupted_online_pipeline_cache;
}

// Returns the stored bug_buffer_image_load_zero workaround flag.
bool GpuInfo::bug_buffer_image_load_zero() const
{
    return d->bug_buffer_image_load_zero;
}

// Returns the stored bug_implicit_fp16_arithmetic workaround flag.
bool GpuInfo::bug_implicit_fp16_arithmetic() const
{
    return d->bug_implicit_fp16_arithmetic;
}

// Always returns true; no device query is consulted.
bool GpuInfo::support_fp16_packed() const
{
    return true;
}

// Returns the storageBuffer16BitAccess feature bit from query16BitStorageFeatures.
bool GpuInfo::support_fp16_storage() const
{
    return d->query16BitStorageFeatures.storageBuffer16BitAccess;
}

// Returns the uniformAndStorageBuffer16BitAccess feature bit from query16BitStorageFeatures.
bool GpuInfo::support_fp16_uniform() const
{
    return d->query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess;
}

// Returns the shaderFloat16 feature bit from queryFloat16Int8Features.
bool GpuInfo::support_fp16_arithmetic() const
{
    return d->queryFloat16Int8Features.shaderFloat16;
}

// Always returns true; no device query is consulted.
bool GpuInfo::support_int8_packed() const
{
    return true;
}

// Returns the storageBuffer8BitAccess feature bit from query8BitStorageFeatures.
bool GpuInfo::support_int8_storage() const
{
    return d->query8BitStorageFeatures.storageBuffer8BitAccess;
}

// Returns the uniformAndStorageBuffer8BitAccess feature bit from query8BitStorageFeatures.
bool GpuInfo::support_int8_uniform() const
{
    return d->query8BitStorageFeatures.uniformAndStorageBuffer8BitAccess;
}

// Returns the shaderInt8 feature bit from queryFloat16Int8Features.
bool GpuInfo::support_int8_arithmetic() const
{
    return d->queryFloat16Int8Features.shaderInt8;
}

// Always returns true; no device query is consulted.
bool GpuInfo::support_bf16_packed() const
{
    return true;
}

// Returns the shaderBFloat16Type feature bit from queryShaderBfloat16Features.
// Note that bf16 "storage" is reported from the shader type feature, not from a separate storage feature.
bool GpuInfo::support_bf16_storage() const
{
    return d->queryShaderBfloat16Features.shaderBFloat16Type;
}

// Returns the shaderStorageImageExtendedFormats core feature bit (the same bit support_int8_image() reports).
bool GpuInfo::support_fp16_image() const
{
    return d->physicalDevicefeatures.shaderStorageImageExtendedFormats;
}

// Returns the shaderStorageImageExtendedFormats core feature bit (the same bit support_fp16_image() reports).
bool GpuInfo::support_int8_image() const
{
    return d->physicalDevicefeatures.shaderStorageImageExtendedFormats;
}

// Returns the shaderFloatControls2 feature bit from queryShaderFloatControls2Features.
bool GpuInfo::support_fp_fast_math() const
{
    return d->queryShaderFloatControls2Features.shaderFloatControls2;
}

// Returns the samplerYcbcrConversion feature bit from querySamplerYcbcrConversionFeatures.
bool GpuInfo::support_ycbcr_conversion() const
{
    return d->querySamplerYcbcrConversionFeatures.samplerYcbcrConversion;
}

// Returns true when either the KHR or the NV cooperativeMatrix feature bit is set.
bool GpuInfo::support_cooperative_matrix() const
{
    return d->queryCooperativeMatrixFeatures.cooperativeMatrix || d->queryCooperativeMatrixFeaturesNV.cooperativeMatrix;
}

// Returns the stored support_cooperative_matrix_8_8_16 flag.
bool GpuInfo::support_cooperative_matrix_8_8_16() const
{
    return d->support_cooperative_matrix_8_8_16;
}

// Returns the stored support_cooperative_matrix_16_8_8 flag.
bool GpuInfo::support_cooperative_matrix_16_8_8() const
{
    return d->support_cooperative_matrix_16_8_8;
}

// Returns the stored support_cooperative_matrix_16_8_16 flag.
bool GpuInfo::support_cooperative_matrix_16_8_16() const
{
    return d->support_cooperative_matrix_16_8_16;
}

// Returns the stored support_cooperative_matrix_16_16_16 flag.
bool GpuInfo::support_cooperative_matrix_16_16_16() const
{
    return d->support_cooperative_matrix_16_16_16;
}

// Returns the stored support_bf16_cooperative_matrix flag.
bool GpuInfo::support_bf16_cooperative_matrix() const
{
    return d->support_bf16_cooperative_matrix;
}

// Returns the int support value recorded for VK_KHR_8bit_storage.
int GpuInfo::support_VK_KHR_8bit_storage() const
{
    return d->support_VK_KHR_8bit_storage;
}

// Returns the int support value recorded for VK_KHR_16bit_storage.
int GpuInfo::support_VK_KHR_16bit_storage() const
{
    return d->support_VK_KHR_16bit_storage;
}

// Returns the int support value recorded for VK_KHR_bind_memory2.
int GpuInfo::support_VK_KHR_bind_memory2() const
{
    return d->support_VK_KHR_bind_memory2;
}

// Returns the int support value recorded for VK_KHR_buffer_device_address.
int GpuInfo::support_VK_KHR_buffer_device_address() const
{
    return d->support_VK_KHR_buffer_device_address;
}

// Returns the int support value recorded for VK_KHR_create_renderpass2.
int GpuInfo::support_VK_KHR_create_renderpass2() const
{
    return d->support_VK_KHR_create_renderpass2;
}

// Returns the int support value recorded for VK_KHR_cooperative_matrix.
int GpuInfo::support_VK_KHR_cooperative_matrix() const
{
    return d->support_VK_KHR_cooperative_matrix;
}

// Returns the int support value recorded for VK_KHR_dedicated_allocation.
int GpuInfo::support_VK_KHR_dedicated_allocation() const
{
    return d->support_VK_KHR_dedicated_allocation;
}

// Returns the int support value recorded for VK_KHR_descriptor_update_template.
int GpuInfo::support_VK_KHR_descriptor_update_template() const
{
    return d->support_VK_KHR_descriptor_update_template;
}

// Returns the int support value recorded for VK_KHR_driver_properties.
int GpuInfo::support_VK_KHR_driver_properties() const
{
    return d->support_VK_KHR_driver_properties;
}

// Returns the int support value recorded for VK_KHR_external_memory.
int GpuInfo::support_VK_KHR_external_memory() const
{
    return d->support_VK_KHR_external_memory;
}

// Returns the int support value recorded for VK_KHR_get_memory_requirements2.
int GpuInfo::support_VK_KHR_get_memory_requirements2() const
{
    return d->support_VK_KHR_get_memory_requirements2;
}

// Returns the int support value recorded for VK_KHR_maintenance1.
int GpuInfo::support_VK_KHR_maintenance1() const
{
    return d->support_VK_KHR_maintenance1;
}

// Returns the int support value recorded for VK_KHR_maintenance2.
int GpuInfo::support_VK_KHR_maintenance2() const
{
    return d->support_VK_KHR_maintenance2;
}

// Returns the int support value recorded for VK_KHR_maintenance3.
int GpuInfo::support_VK_KHR_maintenance3() const
{
    return d->support_VK_KHR_maintenance3;
}

// Returns the int support value recorded for VK_KHR_multiview.
int GpuInfo::support_VK_KHR_multiview() const
{
    return d->support_VK_KHR_multiview;
}

// Returns the int support value recorded for VK_KHR_portability_subset.
int GpuInfo::support_VK_KHR_portability_subset() const
{
    return d->support_VK_KHR_portability_subset;
}

// Returns the int support value recorded for VK_KHR_push_descriptor.
int GpuInfo::support_VK_KHR_push_descriptor() const
{
    return d->support_VK_KHR_push_descriptor;
}

// Returns the int support value recorded for VK_KHR_robustness2.
int GpuInfo::support_VK_KHR_robustness2() const
{
    return d->support_VK_KHR_robustness2;
}

// Returns the int support value recorded for VK_KHR_sampler_ycbcr_conversion.
int GpuInfo::support_VK_KHR_sampler_ycbcr_conversion() const
{
    return d->support_VK_KHR_sampler_ycbcr_conversion;
}

// Returns the int support value recorded for VK_KHR_shader_bfloat16.
int GpuInfo::support_VK_KHR_shader_bfloat16() const
{
    return d->support_VK_KHR_shader_bfloat16;
}

// Returns the int support value recorded for VK_KHR_shader_float16_int8.
int GpuInfo::support_VK_KHR_shader_float16_int8() const
{
    return d->support_VK_KHR_shader_float16_int8;
}

// Returns the int support value recorded for VK_KHR_shader_float_controls.
int GpuInfo::support_VK_KHR_shader_float_controls() const
{
    return d->support_VK_KHR_shader_float_controls;
}

// Returns the int support value recorded for VK_KHR_shader_float_controls2.
int GpuInfo::support_VK_KHR_shader_float_controls2() const
{
    return d->support_VK_KHR_shader_float_controls2;
}

// Returns the int support value recorded for VK_KHR_shader_integer_dot_product.
int GpuInfo::support_VK_KHR_shader_integer_dot_product() const
{
    return d->support_VK_KHR_shader_integer_dot_product;
}

// Returns the int support value recorded for VK_KHR_shader_non_semantic_info.
int GpuInfo::support_VK_KHR_shader_non_semantic_info() const
{
    return d->support_VK_KHR_shader_non_semantic_info;
}

// Returns the int support value recorded for VK_KHR_shader_subgroup_extended_types.
int GpuInfo::support_VK_KHR_shader_subgroup_extended_types() const
{
    return d->support_VK_KHR_shader_subgroup_extended_types;
}

// Returns the int support value recorded for VK_KHR_shader_subgroup_rotate.
int GpuInfo::support_VK_KHR_shader_subgroup_rotate() const
{
    return d->support_VK_KHR_shader_subgroup_rotate;
}

// Returns the int support value recorded for VK_KHR_storage_buffer_storage_class.
int GpuInfo::support_VK_KHR_storage_buffer_storage_class() const
{
    return d->support_VK_KHR_storage_buffer_storage_class;
}

// Returns the int support value recorded for VK_KHR_swapchain.
int GpuInfo::support_VK_KHR_swapchain() const
{
    return d->support_VK_KHR_swapchain;
}

// Returns the int support value recorded for VK_KHR_vulkan_memory_model.
int GpuInfo::support_VK_KHR_vulkan_memory_model() const
{
    return d->support_VK_KHR_vulkan_memory_model;
}

// Returns the int support value recorded for VK_KHR_zero_initialize_workgroup_memory.
int GpuInfo::support_VK_KHR_zero_initialize_workgroup_memory() const
{
    return d->support_VK_KHR_zero_initialize_workgroup_memory;
}

// Returns the int support value recorded for VK_EXT_buffer_device_address.
int GpuInfo::support_VK_EXT_buffer_device_address() const
{
    return d->support_VK_EXT_buffer_device_address;
}

// Returns the int support value recorded for VK_EXT_descriptor_indexing.
int GpuInfo::support_VK_EXT_descriptor_indexing() const
{
    return d->support_VK_EXT_descriptor_indexing;
}

// Returns the int support value recorded for VK_EXT_external_memory_host.
int GpuInfo::support_VK_EXT_external_memory_host() const
{
    return d->support_VK_EXT_external_memory_host;
}

// Returns the int support value recorded for VK_EXT_memory_budget.
int GpuInfo::support_VK_EXT_memory_budget() const
{
    return d->support_VK_EXT_memory_budget;
}

// Returns the int support value recorded for VK_EXT_memory_priority.
int GpuInfo::support_VK_EXT_memory_priority() const
{
    return d->support_VK_EXT_memory_priority;
}

// Returns the int support value recorded for VK_EXT_queue_family_foreign.
int GpuInfo::support_VK_EXT_queue_family_foreign() const
{
    return d->support_VK_EXT_queue_family_foreign;
}

// Returns the int support value recorded for VK_EXT_robustness2.
int GpuInfo::support_VK_EXT_robustness2() const
{
    return d->support_VK_EXT_robustness2;
}

// Returns the int support value recorded for VK_EXT_shader_atomic_float.
int GpuInfo::support_VK_EXT_shader_atomic_float() const
{
    return d->support_VK_EXT_shader_atomic_float;
}

// Returns the int support value recorded for VK_EXT_shader_atomic_float2.
int GpuInfo::support_VK_EXT_shader_atomic_float2() const
{
    return d->support_VK_EXT_shader_atomic_float2;
}

// Returns the int support value recorded for VK_EXT_shader_float8.
int GpuInfo::support_VK_EXT_shader_float8() const
{
    return d->support_VK_EXT_shader_float8;
}

// Returns the int support value recorded for VK_EXT_subgroup_size_control.
int GpuInfo::support_VK_EXT_subgroup_size_control() const
{
    return d->support_VK_EXT_subgroup_size_control;
}

// Returns the int support value recorded for VK_AMD_device_coherent_memory.
int GpuInfo::support_VK_AMD_device_coherent_memory() const
{
    return d->support_VK_AMD_device_coherent_memory;
}

#if __ANDROID_API__ >= 26
// Returns the int support value recorded for VK_ANDROID_external_memory_android_hardware_buffer.
// Only compiled when __ANDROID_API__ is at least 26.
int GpuInfo::support_VK_ANDROID_external_memory_android_hardware_buffer() const
{
    return d->support_VK_ANDROID_external_memory_android_hardware_buffer;
}
#endif // __ANDROID_API__ >= 26

// Returns the int support value recorded for VK_NV_cooperative_matrix.
int GpuInfo::support_VK_NV_cooperative_matrix() const
{
    return d->support_VK_NV_cooperative_matrix;
}

// Returns the int support value recorded for VK_NV_cooperative_matrix2.
int GpuInfo::support_VK_NV_cooperative_matrix2() const
{
    return d->support_VK_NV_cooperative_matrix2;
}

// Returns the int support value recorded for VK_NV_cooperative_vector.
int GpuInfo::support_VK_NV_cooperative_vector() const
{
    return d->support_VK_NV_cooperative_vector;
}

// Returns the stored queryExtensionFeatures pointer as an untyped const void*.
// NOTE(review): presumably the head of the feature-structure pNext chain - confirm where it is assigned.
const void* GpuInfo::queryExtensionFeatures() const
{
    return d->queryExtensionFeatures;
}

// Returns a reference to the stored 8-bit storage feature structure.
const VkPhysicalDevice8BitStorageFeaturesKHR& GpuInfo::query8BitStorageFeatures() const
{
    return d->query8BitStorageFeatures;
}

// Returns a reference to the stored 16-bit storage feature structure.
const VkPhysicalDevice16BitStorageFeaturesKHR& GpuInfo::query16BitStorageFeatures() const
{
    return d->query16BitStorageFeatures;
}

// Returns a reference to the stored float16/int8 shader feature structure.
const VkPhysicalDeviceFloat16Int8FeaturesKHR& GpuInfo::queryFloat16Int8Features() const
{
    return d->queryFloat16Int8Features;
}

// Returns a reference to the stored sampler YCbCr conversion feature structure.
const VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR& GpuInfo::querySamplerYcbcrConversionFeatures() const
{
    return d->querySamplerYcbcrConversionFeatures;
}

// Returns a reference to the stored KHR cooperative matrix feature structure.
const VkPhysicalDeviceCooperativeMatrixFeaturesKHR& GpuInfo::queryCooperativeMatrixFeatures() const
{
    return d->queryCooperativeMatrixFeatures;
}

// Returns a reference to the stored NV cooperative matrix feature structure.
const VkPhysicalDeviceCooperativeMatrixFeaturesNV& GpuInfo::queryCooperativeMatrixFeaturesNV() const
{
    return d->queryCooperativeMatrixFeaturesNV;
}

// Returns a reference to the stored NV cooperative matrix 2 feature structure.
const VkPhysicalDeviceCooperativeMatrix2FeaturesNV& GpuInfo::queryCooperativeMatrix2FeaturesNV() const
{
    return d->queryCooperativeMatrix2FeaturesNV;
}

// Returns a reference to the stored NV cooperative vector feature structure.
const VkPhysicalDeviceCooperativeVectorFeaturesNV& GpuInfo::queryCooperativeVectorFeaturesNV() const
{
    return d->queryCooperativeVectorFeaturesNV;
}

// Returns a reference to the stored robustness2 feature structure.
const VkPhysicalDeviceRobustness2FeaturesKHR& GpuInfo::queryRobustness2Features() const
{
    return d->queryRobustness2Features;
}

// Returns a reference to the stored subgroup size control feature structure.
const VkPhysicalDeviceSubgroupSizeControlFeaturesEXT& GpuInfo::querySubgroupSizeControlFeatures() const
{
    return d->querySubgroupSizeControlFeatures;
}

// Returns a reference to the stored shader bfloat16 feature structure.
const VkPhysicalDeviceShaderBfloat16FeaturesKHR& GpuInfo::queryShaderBfloat16Features() const
{
    return d->queryShaderBfloat16Features;
}

// Returns a reference to the stored shader float8 feature structure.
const VkPhysicalDeviceShaderFloat8FeaturesEXT& GpuInfo::queryShaderFloat8Features() const
{
    return d->queryShaderFloat8Features;
}

// Returns a reference to the stored shader float controls 2 feature structure.
const VkPhysicalDeviceShaderFloatControls2FeaturesKHR& GpuInfo::queryShaderFloatControls2Features() const
{
    return d->queryShaderFloatControls2Features;
}

// Returns a reference to the stored shader integer dot product feature structure.
const VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR& GpuInfo::queryShaderIntegerDotProductFeatures() const
{
    return d->queryShaderIntegerDotProductFeatures;
}

// Returns a reference to the stored shader subgroup rotate feature structure.
const VkPhysicalDeviceShaderSubgroupRotateFeaturesKHR& GpuInfo::queryShaderSubgroupRotateFeatures() const
{
    return d->queryShaderSubgroupRotateFeatures;
}

// Returns a reference to the stored shader atomic float feature structure.
const VkPhysicalDeviceShaderAtomicFloatFeaturesEXT& GpuInfo::queryShaderAtomicFloatFeatures() const
{
    return d->queryShaderAtomicFloatFeatures;
}

// Returns a reference to the stored shader atomic float 2 feature structure.
const VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT& GpuInfo::queryShaderAtomicFloat2Features() const
{
    return d->queryShaderAtomicFloat2Features;
}

// Returns a reference to the stored Vulkan memory model feature structure.
const VkPhysicalDeviceVulkanMemoryModelFeaturesKHR& GpuInfo::queryVulkanMemoryModelFeatures() const
{
    return d->queryVulkanMemoryModelFeatures;
}

// Returns the stored queryExtensionProperties pointer as an untyped const void*.
// NOTE(review): presumably the head of the property-structure pNext chain - confirm where it is assigned.
const void* GpuInfo::queryExtensionProperties() const
{
    return d->queryExtensionProperties;
}

// Returns a reference to the stored NV cooperative matrix 2 property structure.
const VkPhysicalDeviceCooperativeMatrix2PropertiesNV& GpuInfo::queryCooperativeMatrix2PropertiesNV() const
{
    return d->queryCooperativeMatrix2PropertiesNV;
}

// Returns a reference to the stored NV cooperative vector property structure.
const VkPhysicalDeviceCooperativeVectorPropertiesNV& GpuInfo::queryCooperativeVectorPropertiesNV() const
{
    return d->queryCooperativeVectorPropertiesNV;
}

// Returns a reference to the stored driver property structure.
const VkPhysicalDeviceDriverPropertiesKHR& GpuInfo::queryDriverProperties() const
{
    return d->queryDriverProperties;
}

// Returns a reference to the stored float controls property structure.
const VkPhysicalDeviceFloatControlsPropertiesKHR& GpuInfo::queryFloatControlsProperties() const
{
    return d->queryFloatControlsProperties;
}

// Returns a reference to the stored robustness2 property structure.
const VkPhysicalDeviceRobustness2PropertiesKHR& GpuInfo::queryRobustness2Properties() const
{
    return d->queryRobustness2Properties;
}

// Returns a reference to the stored shader integer dot product property structure.
const VkPhysicalDeviceShaderIntegerDotProductProperties& GpuInfo::queryShaderIntegerDotProductProperties() const
{
    return d->queryShaderIntegerDotProductProperties;
}

// Returns a reference to the stored subgroup property structure.
const VkPhysicalDeviceSubgroupProperties& GpuInfo::querySubgroupProperties() const
{
    return d->querySubgroupProperties;
}

// Returns a reference to the stored subgroup size control property structure.
const VkPhysicalDeviceSubgroupSizeControlPropertiesEXT& GpuInfo::querySubgroupSizeControlProperties() const
{
    return d->querySubgroupSizeControlProperties;
}

// Returns a reference to the stored external memory host property structure.
const VkPhysicalDeviceExternalMemoryHostPropertiesEXT& GpuInfo::queryExternalMemoryHostProperties() const
{
    return d->queryExternalMemoryHostProperties;
}

// Returns a reference to the stored list of KHR cooperative matrix configurations.
const std::vector<VkCooperativeMatrixPropertiesKHR>& GpuInfo::queryCooperativeMatrixSubProperties() const
{
    return d->queryCooperativeMatrixSubProperties;
}

// Returns a reference to the stored list of NV cooperative matrix configurations.
const std::vector<VkCooperativeMatrixPropertiesNV>& GpuInfo::queryCooperativeMatrixSubPropertiesNV() const
{
    return d->queryCooperativeMatrixSubPropertiesNV;
}

// Returns a reference to the stored list of NV flexible-dimensions cooperative matrix configurations.
const std::vector<VkCooperativeMatrixFlexibleDimensionsPropertiesNV>& GpuInfo::queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV() const
{
    return d->queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV;
}

// Returns a reference to the stored list of NV cooperative vector configurations.
const std::vector<VkCooperativeVectorPropertiesNV>& GpuInfo::queryCooperativeVectorSubPropertiesNV() const
{
    return d->queryCooperativeVectorSubPropertiesNV;
}

// Choose the cooperative matrix tile shape that adds the least padding to an M x N x K problem.
//
// M, N, K    problem dimensions
// type       component type required for both the A and B operands
// acctype    component type required for the C operand and the result
// scope      required cooperative matrix scope
// coopmat_M, coopmat_N, coopmat_K
//            receive the chosen tile shape, or stay 0 when no reported configuration matches
// coopmat_subgroup_size
//            receives the device subgroup size
//
// The KHR configuration list is searched when VK_KHR_cooperative_matrix and its feature bit are
// available, otherwise the NV list. When nothing matches an fp16 or bf16 accumulator, the search
// is repeated once with an fp32 accumulator.
void GpuInfo::get_optimal_cooperative_matrix_mnk(int M, int N, int K, VkComponentTypeKHR type, VkComponentTypeKHR acctype, VkScopeKHR scope, int& coopmat_M, int& coopmat_N, int& coopmat_K, int& coopmat_subgroup_size) const
{
    coopmat_M = 0;
    coopmat_N = 0;
    coopmat_K = 0;
    coopmat_subgroup_size = d->querySubgroupProperties.subgroupSize;

    // collect mnk candidates
    std::vector<VkCooperativeMatrixPropertiesKHR> mnk_properties;

    if (d->support_VK_KHR_cooperative_matrix && d->queryCooperativeMatrixFeatures.cooperativeMatrix)
    {
        for (size_t i = 0; i < d->queryCooperativeMatrixSubProperties.size(); i++)
        {
            const VkCooperativeMatrixPropertiesKHR& cmp = d->queryCooperativeMatrixSubProperties[i];

            if (cmp.AType == type && cmp.BType == type
                    && cmp.CType == acctype && cmp.ResultType == acctype
                    && cmp.scope == scope)
            {
                mnk_properties.push_back(cmp);
            }
        }
    }
    else if (d->support_VK_NV_cooperative_matrix && d->queryCooperativeMatrixFeaturesNV.cooperativeMatrix)
    {
        for (size_t i = 0; i < d->queryCooperativeMatrixSubPropertiesNV.size(); i++)
        {
            const VkCooperativeMatrixPropertiesNV& cmp = d->queryCooperativeMatrixSubPropertiesNV[i];

            if (cmp.AType == (VkComponentTypeNV)type && cmp.BType == (VkComponentTypeNV)type
                    && cmp.CType == (VkComponentTypeNV)acctype && cmp.DType == (VkComponentTypeNV)acctype
                    && cmp.scope == (VkScopeNV)scope)
            {
                // only the tile sizes are consumed below; zero the rest so that
                // no indeterminate fields get copied into the candidate list
                VkCooperativeMatrixPropertiesKHR cmp_khr;
                memset(&cmp_khr, 0, sizeof(cmp_khr));
                cmp_khr.MSize = cmp.MSize;
                cmp_khr.NSize = cmp.NSize;
                cmp_khr.KSize = cmp.KSize;

                mnk_properties.push_back(cmp_khr);
            }
        }
    }

    if (mnk_properties.empty() && (acctype == VK_COMPONENT_TYPE_FLOAT16_KHR || acctype == VK_COMPONENT_TYPE_BFLOAT16_KHR))
    {
        // try acctype fp32
        return get_optimal_cooperative_matrix_mnk(M, N, K, type, VK_COMPONENT_TYPE_FLOAT32_KHR, scope, coopmat_M, coopmat_N, coopmat_K, coopmat_subgroup_size);
    }

    if (mnk_properties.empty())
        return;

    // find the optimal, prefer the first mnk tuple with same cost
    double min_cost = DBL_MAX;
    for (size_t i = 0; i < mnk_properties.size(); i++)
    {
        const VkCooperativeMatrixPropertiesKHR& cmp = mnk_properties[i];

        // a tile with a zero dimension is unusable and would divide by zero below
        if (cmp.MSize == 0 || cmp.NSize == 0 || cmp.KSize == 0)
            continue;

        // round each dimension up to a whole number of tiles
        const int M_pad = (M + cmp.MSize - 1) / cmp.MSize * cmp.MSize;
        const int N_pad = (N + cmp.NSize - 1) / cmp.NSize * cmp.NSize;
        const int K_pad = (K + cmp.KSize - 1) / cmp.KSize * cmp.KSize;

        // cost is the padded volume minus the real volume; evaluate it in double
        // because the three-way int product overflows for large problems
        const double cost = (double)M_pad * N_pad * K_pad - (double)M * N * K;
        if (cost < min_cost)
        {
            min_cost = cost;
            coopmat_M = cmp.MSize;
            coopmat_N = cmp.NSize;
            coopmat_K = cmp.KSize;
        }
    }
}

// Look up every core Vulkan entry point listed below through vkGetInstanceProcAddr on g_instance
// and store each result in the function pointer of the same name declared outside this function.
// The lookup results are not checked here; the function always returns 0.
static int init_instance_core()
{
    // expands to: name = (PFN_name)vkGetInstanceProcAddr(g_instance, "name")
#define NCNN_LOAD_INSTANCE_PROC(name) name = (PFN_##name)vkGetInstanceProcAddr(g_instance, #name)

    NCNN_LOAD_INSTANCE_PROC(vkAllocateCommandBuffers);
    NCNN_LOAD_INSTANCE_PROC(vkAllocateDescriptorSets);
    NCNN_LOAD_INSTANCE_PROC(vkAllocateMemory);
    NCNN_LOAD_INSTANCE_PROC(vkBeginCommandBuffer);
    NCNN_LOAD_INSTANCE_PROC(vkBindBufferMemory);
    NCNN_LOAD_INSTANCE_PROC(vkBindImageMemory);
    NCNN_LOAD_INSTANCE_PROC(vkCmdBeginQuery);
    NCNN_LOAD_INSTANCE_PROC(vkCmdBindDescriptorSets);
    NCNN_LOAD_INSTANCE_PROC(vkCmdBindIndexBuffer);
    NCNN_LOAD_INSTANCE_PROC(vkCmdBindPipeline);
    NCNN_LOAD_INSTANCE_PROC(vkCmdCopyBuffer);
    NCNN_LOAD_INSTANCE_PROC(vkCmdCopyBufferToImage);
    NCNN_LOAD_INSTANCE_PROC(vkCmdCopyImage);
    NCNN_LOAD_INSTANCE_PROC(vkCmdCopyImageToBuffer);
    NCNN_LOAD_INSTANCE_PROC(vkCmdCopyQueryPoolResults);
    NCNN_LOAD_INSTANCE_PROC(vkCmdDispatch);
    NCNN_LOAD_INSTANCE_PROC(vkCmdDispatchIndirect);
    NCNN_LOAD_INSTANCE_PROC(vkCmdEndQuery);
    NCNN_LOAD_INSTANCE_PROC(vkCmdExecuteCommands);
    NCNN_LOAD_INSTANCE_PROC(vkCmdFillBuffer);
    NCNN_LOAD_INSTANCE_PROC(vkCmdPipelineBarrier);
    NCNN_LOAD_INSTANCE_PROC(vkCmdPushConstants);
    NCNN_LOAD_INSTANCE_PROC(vkCmdResetQueryPool);
    NCNN_LOAD_INSTANCE_PROC(vkCmdResolveImage);
    NCNN_LOAD_INSTANCE_PROC(vkCmdUpdateBuffer);
    NCNN_LOAD_INSTANCE_PROC(vkCmdWriteTimestamp);
    NCNN_LOAD_INSTANCE_PROC(vkCreateBuffer);
    NCNN_LOAD_INSTANCE_PROC(vkCreateBufferView);
    NCNN_LOAD_INSTANCE_PROC(vkCreateCommandPool);
    NCNN_LOAD_INSTANCE_PROC(vkCreateComputePipelines);
    NCNN_LOAD_INSTANCE_PROC(vkCreateDescriptorPool);
    NCNN_LOAD_INSTANCE_PROC(vkCreateDescriptorSetLayout);
    NCNN_LOAD_INSTANCE_PROC(vkCreateDevice);
    NCNN_LOAD_INSTANCE_PROC(vkCreateFence);
    NCNN_LOAD_INSTANCE_PROC(vkCreateImage);
    NCNN_LOAD_INSTANCE_PROC(vkCreateImageView);
    NCNN_LOAD_INSTANCE_PROC(vkCreatePipelineCache);
    NCNN_LOAD_INSTANCE_PROC(vkCreatePipelineLayout);
    NCNN_LOAD_INSTANCE_PROC(vkCreateQueryPool);
    NCNN_LOAD_INSTANCE_PROC(vkCreateSampler);
    NCNN_LOAD_INSTANCE_PROC(vkCreateSemaphore);
    NCNN_LOAD_INSTANCE_PROC(vkCreateShaderModule);
    NCNN_LOAD_INSTANCE_PROC(vkDestroyBuffer);
    NCNN_LOAD_INSTANCE_PROC(vkDestroyBufferView);
    NCNN_LOAD_INSTANCE_PROC(vkDestroyCommandPool);
    NCNN_LOAD_INSTANCE_PROC(vkDestroyDescriptorPool);
    NCNN_LOAD_INSTANCE_PROC(vkDestroyDescriptorSetLayout);
    NCNN_LOAD_INSTANCE_PROC(vkDestroyDevice);
    NCNN_LOAD_INSTANCE_PROC(vkDestroyFence);
    NCNN_LOAD_INSTANCE_PROC(vkDestroyImage);
    NCNN_LOAD_INSTANCE_PROC(vkDestroyImageView);
    NCNN_LOAD_INSTANCE_PROC(vkDestroyInstance);
    NCNN_LOAD_INSTANCE_PROC(vkDestroyPipeline);
    NCNN_LOAD_INSTANCE_PROC(vkDestroyPipelineCache);
    NCNN_LOAD_INSTANCE_PROC(vkDestroyPipelineLayout);
    NCNN_LOAD_INSTANCE_PROC(vkDestroyQueryPool);
    NCNN_LOAD_INSTANCE_PROC(vkDestroySampler);
    NCNN_LOAD_INSTANCE_PROC(vkDestroySemaphore);
    NCNN_LOAD_INSTANCE_PROC(vkDestroyShaderModule);
    NCNN_LOAD_INSTANCE_PROC(vkDeviceWaitIdle);
    NCNN_LOAD_INSTANCE_PROC(vkEndCommandBuffer);
    NCNN_LOAD_INSTANCE_PROC(vkEnumerateDeviceExtensionProperties);
    NCNN_LOAD_INSTANCE_PROC(vkEnumerateDeviceLayerProperties);
    NCNN_LOAD_INSTANCE_PROC(vkEnumeratePhysicalDevices);
    NCNN_LOAD_INSTANCE_PROC(vkFlushMappedMemoryRanges);
    NCNN_LOAD_INSTANCE_PROC(vkFreeCommandBuffers);
    NCNN_LOAD_INSTANCE_PROC(vkFreeDescriptorSets);
    NCNN_LOAD_INSTANCE_PROC(vkFreeMemory);
    NCNN_LOAD_INSTANCE_PROC(vkGetBufferMemoryRequirements);
    NCNN_LOAD_INSTANCE_PROC(vkGetDeviceMemoryCommitment);
    NCNN_LOAD_INSTANCE_PROC(vkGetDeviceProcAddr);
    NCNN_LOAD_INSTANCE_PROC(vkGetDeviceQueue);
    NCNN_LOAD_INSTANCE_PROC(vkGetFenceStatus);
    NCNN_LOAD_INSTANCE_PROC(vkGetImageMemoryRequirements);
    NCNN_LOAD_INSTANCE_PROC(vkGetImageSubresourceLayout);
    NCNN_LOAD_INSTANCE_PROC(vkGetPhysicalDeviceFeatures);
    NCNN_LOAD_INSTANCE_PROC(vkGetPhysicalDeviceFormatProperties);
    NCNN_LOAD_INSTANCE_PROC(vkGetPhysicalDeviceImageFormatProperties);
    NCNN_LOAD_INSTANCE_PROC(vkGetPhysicalDeviceMemoryProperties);
    NCNN_LOAD_INSTANCE_PROC(vkGetPhysicalDeviceProperties);
    NCNN_LOAD_INSTANCE_PROC(vkGetPhysicalDeviceQueueFamilyProperties);
    NCNN_LOAD_INSTANCE_PROC(vkGetPipelineCacheData);
    NCNN_LOAD_INSTANCE_PROC(vkGetQueryPoolResults);
    NCNN_LOAD_INSTANCE_PROC(vkInvalidateMappedMemoryRanges);
    NCNN_LOAD_INSTANCE_PROC(vkMapMemory);
    NCNN_LOAD_INSTANCE_PROC(vkMergePipelineCaches);
    NCNN_LOAD_INSTANCE_PROC(vkQueueSubmit);
    NCNN_LOAD_INSTANCE_PROC(vkQueueWaitIdle);
    NCNN_LOAD_INSTANCE_PROC(vkResetCommandBuffer);
    NCNN_LOAD_INSTANCE_PROC(vkResetCommandPool);
    NCNN_LOAD_INSTANCE_PROC(vkResetDescriptorPool);
    NCNN_LOAD_INSTANCE_PROC(vkResetFences);
    NCNN_LOAD_INSTANCE_PROC(vkUnmapMemory);
    NCNN_LOAD_INSTANCE_PROC(vkUpdateDescriptorSets);
    NCNN_LOAD_INSTANCE_PROC(vkWaitForFences);

#undef NCNN_LOAD_INSTANCE_PROC

    return 0;
}

// Resolve one instance-level entry point through vkGetInstanceProcAddr and
// store it in the function pointer variable that carries the same name.
#define NCNN_RESOLVE_INSTANCE_PROC(name) name = (PFN_##name)vkGetInstanceProcAddr(g_instance, #name)

// Load the entry points that belong to instance extensions.
// Extension groups guarded by a support_* flag are only resolved when that
// flag is non-zero; the cooperative matrix/vector queries are always looked up.
// Always returns 0.
static int init_instance_extension()
{
    if (support_VK_KHR_external_memory_capabilities)
    {
        NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceExternalBufferPropertiesKHR);
    }

    if (support_VK_KHR_get_physical_device_properties2)
    {
        NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceFeatures2KHR);
        NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceProperties2KHR);
        NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceFormatProperties2KHR);
        NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceImageFormatProperties2KHR);
        NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceQueueFamilyProperties2KHR);
        NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceMemoryProperties2KHR);
    }

    if (support_VK_KHR_get_surface_capabilities2)
    {
        NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceSurfaceCapabilities2KHR);
        NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceSurfaceFormats2KHR);
    }

    if (support_VK_KHR_surface)
    {
        NCNN_RESOLVE_INSTANCE_PROC(vkDestroySurfaceKHR);
        NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceSurfaceSupportKHR);
        NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceSurfaceCapabilitiesKHR);
        NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceSurfaceFormatsKHR);
        NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceSurfacePresentModesKHR);
    }

#if __ANDROID_API__ >= 26
    if (support_VK_KHR_android_surface)
    {
        NCNN_RESOLVE_INSTANCE_PROC(vkCreateAndroidSurfaceKHR);
    }
#endif // __ANDROID_API__ >= 26

    // the lookups below are unconditional, there is no instance-level flag for them here

    // VK_KHR_cooperative_matrix
    NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR);

    // VK_NV_cooperative_matrix
    NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceCooperativeMatrixPropertiesNV);

    // VK_NV_cooperative_matrix2
    NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV);

    // VK_NV_cooperative_vector
    NCNN_RESOLVE_INSTANCE_PROC(vkGetPhysicalDeviceCooperativeVectorPropertiesNV);

    return 0;
}

#undef NCNN_RESOLVE_INSTANCE_PROC

#if ENABLE_VALIDATION_LAYER
// Debug messenger callback: forwards the message text of every report to the
// ncnn error log. Severity, type and user data are ignored.
// Always returns VK_FALSE.
static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(
    VkDebugUtilsMessageSeverityFlagBitsEXT /*messageSeverity*/,
    VkDebugUtilsMessageTypeFlagsEXT /*messageType*/,
    const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData,
    void* /*pUserData*/)
{
    const char* message = pCallbackData->pMessage;

    NCNN_LOGE("validation layer: %s", message);

    return VK_FALSE;
}

// Look up vkCreateDebugUtilsMessengerEXT on the given instance and call it.
// Returns the result of that call, or VK_ERROR_EXTENSION_NOT_PRESENT when the
// entry point cannot be resolved.
static VkResult CreateDebugUtilsMessengerEXT(VkInstance instance, const VkDebugUtilsMessengerCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDebugUtilsMessengerEXT* pCallback)
{
    PFN_vkCreateDebugUtilsMessengerEXT create_messenger = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkCreateDebugUtilsMessengerEXT");

    // entry point not exported by this instance
    if (!create_messenger)
        return VK_ERROR_EXTENSION_NOT_PRESENT;

    return create_messenger(instance, pCreateInfo, pAllocator, pCallback);
}

// Look up vkDestroyDebugUtilsMessengerEXT on the given instance and call it.
// Does nothing when the entry point cannot be resolved.
static void DestroyDebugUtilsMessengerEXT(VkInstance instance, VkDebugUtilsMessengerEXT callback, const VkAllocationCallbacks* pAllocator)
{
    PFN_vkDestroyDebugUtilsMessengerEXT destroy_messenger = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkDestroyDebugUtilsMessengerEXT");

    if (!destroy_messenger)
        return;

    destroy_messenger(instance, callback, pAllocator);
}
#endif // ENABLE_VALIDATION_LAYER

// Pick the index of the device used by default.
// Preference order: the first device whose type() is 0 (discrete gpu), then
// the first whose type() is 1 (integrated gpu), then simply device 0.
// Logs an error and returns -1 when no device has been probed.
static int find_default_vulkan_device_index()
{
    // walk the preferred device types in order, discrete (0) before integrated (1)
    for (int preferred_type = 0; preferred_type <= 1; preferred_type++)
    {
        for (int index = 0; index < g_gpu_count; index++)
        {
            if (g_gpu_infos[index]->type() == preferred_type)
                return index;
        }
    }

    if (g_gpu_count <= 0)
    {
        NCNN_LOGE("no vulkan device");
        return -1;
    }

    // neither discrete nor integrated, fall back to any probed device
    return 0;
}

int create_gpu_instance(const char* driver_path)
{
    MutexLockGuard lock(g_instance_lock);

    if (g_instance.created != 0)
        return g_instance.instance ? 0 : -1;

    g_instance.created = 1;

    // NCNN_LOGE("create_gpu_instance");

#if NCNN_SIMPLEVK
    // load vulkan driver
    {
        int ret = load_vulkan_driver(driver_path);
        if (ret != 0)
        {
            NCNN_LOGE("load vulkan driver failed");
            return -1;
        }
    }
#else
    if (driver_path)
    {
        NCNN_LOGE("custom vulkan driver is not supported when NCNN_SIMPLEVK is off");
        NCNN_LOGE("will always use the system vulkan driver");
    }
#endif // NCNN_SIMPLEVK

    VkResult ret;

    std::vector<const char*> enabledLayers;

#if ENABLE_VALIDATION_LAYER
    uint32_t instanceLayerPropertyCount;
    ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, NULL);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkEnumerateInstanceLayerProperties failed %d", ret);
        return -1;
    }

    std::vector<VkLayerProperties> instanceLayerProperties(instanceLayerPropertyCount);
    ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, instanceLayerProperties.data());
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkEnumerateInstanceLayerProperties failed %d", ret);
        return -1;
    }

    for (uint32_t i = 0; i < instanceLayerPropertyCount; i++)
    {
        const VkLayerProperties& lp = instanceLayerProperties[i];
        //         NCNN_LOGE("instance layer %s = %u", lp.layerName, lp.implementationVersion);

        if (strcmp(lp.layerName, "VK_LAYER_LUNARG_standard_validation") == 0)
        {
            enabledLayers.push_back("VK_LAYER_LUNARG_standard_validation");
        }
        if (strcmp(lp.layerName, "VK_LAYER_LUNARG_parameter_validation") == 0)
        {
            enabledLayers.push_back("VK_LAYER_LUNARG_parameter_validation");
        }
        if (strcmp(lp.layerName, "VK_LAYER_KHRONOS_validation") == 0)
        {
            enabledLayers.push_back("VK_LAYER_KHRONOS_validation");
        }
    }
#endif // ENABLE_VALIDATION_LAYER

    std::vector<const char*> enabledExtensions;

    uint32_t instanceExtensionPropertyCount;
    ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, NULL);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkEnumerateInstanceExtensionProperties failed %d", ret);
        return -1;
    }

    std::vector<VkExtensionProperties> instanceExtensionProperties(instanceExtensionPropertyCount);
    ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, instanceExtensionProperties.data());
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkEnumerateInstanceExtensionProperties failed %d", ret);
        return -1;
    }

    support_VK_KHR_get_physical_device_properties2 = 0;
    support_VK_KHR_get_surface_capabilities2 = 0;
    support_VK_KHR_portability_enumeration = 0;
    support_VK_KHR_surface = 0;
    support_VK_EXT_debug_utils = 0;
    support_VK_EXT_validation_features = 0;
    support_VK_EXT_validation_flags = 0;
#if __ANDROID_API__ >= 26
    support_VK_KHR_android_surface = 0;
#endif // __ANDROID_API__ >= 26
    for (uint32_t j = 0; j < instanceExtensionPropertyCount; j++)
    {
        const VkExtensionProperties& exp = instanceExtensionProperties[j];
        //         NCNN_LOGE("instance extension %s = %u", exp.extensionName, exp.specVersion);

        if (strcmp(exp.extensionName, "VK_KHR_external_memory_capabilities") == 0)
            support_VK_KHR_external_memory_capabilities = exp.specVersion;
        else if (strcmp(exp.extensionName, "VK_KHR_get_physical_device_properties2") == 0)
            support_VK_KHR_get_physical_device_properties2 = exp.specVersion;
        else if (strcmp(exp.extensionName, "VK_KHR_get_surface_capabilities2") == 0)
            support_VK_KHR_get_surface_capabilities2 = exp.specVersion;
        else if (strcmp(exp.extensionName, "VK_KHR_portability_enumeration") == 0)
            support_VK_KHR_portability_enumeration = exp.specVersion;
        else if (strcmp(exp.extensionName, "VK_KHR_surface") == 0)
            support_VK_KHR_surface = exp.specVersion;
        else if (strcmp(exp.extensionName, "VK_EXT_debug_utils") == 0)
            support_VK_EXT_debug_utils = exp.specVersion;
        else if (strcmp(exp.extensionName, "VK_EXT_validation_features") == 0)
            support_VK_EXT_validation_features = exp.specVersion;
        else if (strcmp(exp.extensionName, "VK_EXT_validation_flags") == 0)
            support_VK_EXT_validation_flags = exp.specVersion;
#if __ANDROID_API__ >= 26
        else if (strcmp(exp.extensionName, "VK_KHR_android_surface") == 0)
            support_VK_KHR_android_surface = exp.specVersion;
#endif // __ANDROID_API__ >= 26
    }

    if (support_VK_EXT_validation_features)
    {
        // we prefer the modern one
        support_VK_EXT_validation_flags = 0;
    }

    if (support_VK_KHR_external_memory_capabilities)
        enabledExtensions.push_back("VK_KHR_external_memory_capabilities");
    if (support_VK_KHR_get_physical_device_properties2)
        enabledExtensions.push_back("VK_KHR_get_physical_device_properties2");
    if (support_VK_KHR_get_surface_capabilities2)
        enabledExtensions.push_back("VK_KHR_get_surface_capabilities2");
    if (support_VK_KHR_portability_enumeration)
        enabledExtensions.push_back("VK_KHR_portability_enumeration");
    if (support_VK_KHR_surface)
        enabledExtensions.push_back("VK_KHR_surface");
#if ENABLE_VALIDATION_LAYER
    if (support_VK_EXT_debug_utils)
        enabledExtensions.push_back("VK_EXT_debug_utils");
    if (support_VK_EXT_validation_features)
        enabledExtensions.push_back("VK_EXT_validation_features");
    if (support_VK_EXT_validation_flags)
        enabledExtensions.push_back("VK_EXT_validation_flags");
#endif // ENABLE_VALIDATION_LAYER
#if __ANDROID_API__ >= 26
    if (support_VK_KHR_android_surface)
        enabledExtensions.push_back("VK_KHR_android_surface");
#endif // __ANDROID_API__ >= 26

    uint32_t instance_api_version = VK_MAKE_VERSION(1, 0, 0);
    typedef VkResult(VKAPI_PTR * PFN_vkEnumerateInstanceVersion)(uint32_t * pApiVersion);
    PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion = (PFN_vkEnumerateInstanceVersion)vkGetInstanceProcAddr(0, "vkEnumerateInstanceVersion");
    if (vkEnumerateInstanceVersion)
    {
        ret = vkEnumerateInstanceVersion(&instance_api_version);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkEnumerateInstanceVersion failed %d", ret);
            return -1;
        }
    }

    // NCNN_LOGE("instance apiVersion = %u.%u.%u", VK_VERSION_MAJOR(instance_api_version), VK_VERSION_MINOR(instance_api_version), VK_VERSION_PATCH(instance_api_version));

    VkApplicationInfo applicationInfo;
    applicationInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
    applicationInfo.pNext = 0;
    applicationInfo.pApplicationName = "ncnn";
    applicationInfo.applicationVersion = 0;
    applicationInfo.pEngineName = "ncnn";
    applicationInfo.engineVersion = NCNN_VERSION_NUMBER;
    applicationInfo.apiVersion = instance_api_version;

    void* enabledExtensionFeatures = 0;

#if ENABLE_VALIDATION_LAYER
    std::vector<VkValidationFeatureEnableEXT> enabledValidationFeature;
    enabledValidationFeature.push_back(VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_EXT);
    enabledValidationFeature.push_back(VK_VALIDATION_FEATURE_ENABLE_GPU_ASSISTED_RESERVE_BINDING_SLOT_EXT);
    enabledValidationFeature.push_back(VK_VALIDATION_FEATURE_ENABLE_BEST_PRACTICES_EXT);
    enabledValidationFeature.push_back(VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT);
    enabledValidationFeature.push_back(VK_VALIDATION_FEATURE_ENABLE_SYNCHRONIZATION_VALIDATION_EXT);

    VkValidationFeaturesEXT validationFeatures;
    validationFeatures.sType = VK_STRUCTURE_TYPE_VALIDATION_FEATURES_EXT;
    validationFeatures.pNext = 0;
    validationFeatures.enabledValidationFeatureCount = enabledValidationFeature.size();
    validationFeatures.pEnabledValidationFeatures = enabledValidationFeature.data();
    validationFeatures.disabledValidationFeatureCount = 0;
    validationFeatures.pDisabledValidationFeatures = 0;
    if (support_VK_EXT_validation_features)
    {
        validationFeatures.pNext = enabledExtensionFeatures;
        enabledExtensionFeatures = &validationFeatures;
    }

    VkValidationFlagsEXT validationFlags;
    validationFlags.sType = VK_STRUCTURE_TYPE_VALIDATION_FLAGS_EXT;
    validationFlags.pNext = 0;
    validationFlags.disabledValidationCheckCount = 0;
    validationFlags.pDisabledValidationChecks = 0;
    if (support_VK_EXT_validation_flags)
    {
        validationFlags.pNext = enabledExtensionFeatures;
        enabledExtensionFeatures = &validationFlags;
    }
#endif // ENABLE_VALIDATION_LAYER

    VkInstanceCreateInfo instanceCreateInfo;
    instanceCreateInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
    instanceCreateInfo.pNext = enabledExtensionFeatures;
    instanceCreateInfo.flags = 0;
    if (support_VK_KHR_portability_enumeration)
        instanceCreateInfo.flags |= VK_INSTANCE_CREATE_ENUMERATE_PORTABILITY_BIT_KHR;
    instanceCreateInfo.pApplicationInfo = &applicationInfo;
    instanceCreateInfo.enabledLayerCount = enabledLayers.size();
    instanceCreateInfo.ppEnabledLayerNames = enabledLayers.data();
    instanceCreateInfo.enabledExtensionCount = enabledExtensions.size();
    instanceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data();

    VkInstance instance = 0;
    ret = vkCreateInstance(&instanceCreateInfo, 0, &instance);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateInstance failed %d", ret);
        return -1;
    }

    g_instance.instance = instance;
    g_instance.instance_api_version = instance_api_version;

    init_instance_core();

#if ENABLE_VALIDATION_LAYER
    if (support_VK_EXT_debug_utils)
    {
        VkDebugUtilsMessengerCreateInfoEXT createInfo = {};
        createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT;
        createInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT;
        createInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT;
        createInfo.pfnUserCallback = debugCallback;
        createInfo.pUserData = 0;
        ret = CreateDebugUtilsMessengerEXT(g_instance, &createInfo, NULL, &g_instance.callback);
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("CreateDebugUtilsMessengerEXT failed %d", ret);
            return -1;
        }
    }
#endif // ENABLE_VALIDATION_LAYER

    init_instance_extension();

    uint32_t physicalDeviceCount = 0;
    ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, 0);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkEnumeratePhysicalDevices failed %d", ret);
        return -1;
    }

    if (physicalDeviceCount > NCNN_MAX_GPU_COUNT)
        physicalDeviceCount = NCNN_MAX_GPU_COUNT;

    std::vector<VkPhysicalDevice> physicalDevices(physicalDeviceCount);

    ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, physicalDevices.data());
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkEnumeratePhysicalDevices failed %d", ret);
        return -1;
    }

    // find proper device and queue
    int gpu_info_index = 0;
    for (uint32_t i = 0; i < physicalDeviceCount; i++)
    {
        const VkPhysicalDevice& physicalDevice = physicalDevices[i];
        delete g_gpu_infos[gpu_info_index];
        g_gpu_infos[gpu_info_index] = new GpuInfo;

        GpuInfo& gpu_info = *g_gpu_infos[gpu_info_index];

        gpu_info.d->device_index = gpu_info_index;

        gpu_info.d->physicalDevice = physicalDevice;

        gpu_info.d->query_features();
        gpu_info.d->query_properties();

        // device type

        // info
        // NCNN_LOGE("[%u] max_shared_memory_size = %u", i, gpu_info.max_shared_memory_size);
        // NCNN_LOGE("[%u] max_workgroup_count = %u %u %u", i, gpu_info.max_workgroup_count[0], gpu_info.max_workgroup_count[1], gpu_info.max_workgroup_count[2]);
        // NCNN_LOGE("[%u] max_workgroup_invocations = %u", i, gpu_info.max_workgroup_invocations);
        // NCNN_LOGE("[%u] max_workgroup_size = %u %u %u", i, gpu_info.max_workgroup_size[0], gpu_info.max_workgroup_size[1], gpu_info.max_workgroup_size[2]);
        // NCNN_LOGE("[%u] memory_map_alignment = %lu", i, gpu_info.memory_map_alignment);
        // NCNN_LOGE("[%u] buffer_offset_alignment = %lu", i, gpu_info.buffer_offset_alignment);

        gpu_info.d->query_queue_properties();

        gpu_info.d->query_memory_properties();

        int rqde = gpu_info.d->query_extensions();
        if (rqde != 0)
        {
            return -1;
        }

        gpu_info.d->query_extension_features();
        gpu_info.d->query_extension_properties();

        gpu_info.d->evaluate_rough_score();

        NCNN_LOGE("[%u %s]  queueC=%u[%u]  queueT=%u[%u]  rebar=%d  r-score=%u", i, gpu_info.device_name(),
                  gpu_info.compute_queue_family_index(), gpu_info.compute_queue_count(),
                  gpu_info.transfer_queue_family_index(), gpu_info.transfer_queue_count(), gpu_info.resizable_bar_enabled(), gpu_info.rough_score());

        NCNN_LOGE("[%u %s]  fp16-p/s/u/a=%d/%d/%d/%d  int8-p/s/u/a=%d/%d/%d/%d  bf16-p/s=%d/%d", i, gpu_info.device_name(),
                  gpu_info.support_fp16_packed(), gpu_info.support_fp16_storage(), gpu_info.support_fp16_uniform(), gpu_info.support_fp16_arithmetic(),
                  gpu_info.support_int8_packed(), gpu_info.support_int8_storage(), gpu_info.support_int8_uniform(), gpu_info.support_int8_arithmetic(),
                  gpu_info.support_bf16_packed(), gpu_info.support_bf16_storage());

        NCNN_LOGE("[%u %s]  subgroup=%u(%u~%u)  ops=%d/%d/%d/%d/%d/%d/%d/%d/%d/%d", i, gpu_info.device_name(),
                  gpu_info.subgroup_size(), gpu_info.min_subgroup_size(), gpu_info.max_subgroup_size(),
                  (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_BASIC_BIT) != 0,
                  (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_VOTE_BIT) != 0,
                  (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) != 0,
                  (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_BALLOT_BIT) != 0,
                  (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_SHUFFLE_BIT) != 0,
                  (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) != 0,
                  (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_CLUSTERED_BIT) != 0,
                  (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_QUAD_BIT) != 0,
                  (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_ROTATE_BIT_KHR) != 0,
                  (gpu_info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT_KHR) != 0);

        // collect matrix mnk
        std::vector<VkCooperativeMatrixPropertiesKHR> fp16_matrix_properties;
        std::vector<VkCooperativeMatrixPropertiesKHR> int8_matrix_properties;
        std::vector<VkCooperativeMatrixPropertiesKHR> bf16_matrix_properties;
        std::vector<VkCooperativeMatrixPropertiesKHR> fp8_matrix_properties;
        if (gpu_info.support_VK_KHR_cooperative_matrix())
        {
            const std::vector<VkCooperativeMatrixPropertiesKHR>& properties = gpu_info.queryCooperativeMatrixSubProperties();
            for (uint32_t j = 0; j < properties.size(); j++)
            {
                const VkCooperativeMatrixPropertiesKHR& cmp = properties[j];

                if (cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR)
                {
                    bool mnk_hit = false;
                    for (size_t k = 0; k < fp16_matrix_properties.size(); k++)
                    {
                        const VkCooperativeMatrixPropertiesKHR& cmp0 = fp16_matrix_properties[k];
                        if (cmp.MSize == cmp0.MSize && cmp.NSize == cmp0.NSize && cmp.KSize == cmp0.KSize)
                        {
                            mnk_hit = true;
                            break;
                        }
                    }
                    if (!mnk_hit)
                        fp16_matrix_properties.push_back(cmp);
                }
                if ((cmp.AType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.AType == VK_COMPONENT_TYPE_SINT8_PACKED_NV)
                        && (cmp.BType == VK_COMPONENT_TYPE_SINT8_KHR || cmp.BType == VK_COMPONENT_TYPE_SINT8_PACKED_NV))
                {
                    bool mnk_hit = false;
                    for (size_t k = 0; k < int8_matrix_properties.size(); k++)
                    {
                        const VkCooperativeMatrixPropertiesKHR& cmp0 = int8_matrix_properties[k];
                        if (cmp.MSize == cmp0.MSize && cmp.NSize == cmp0.NSize && cmp.KSize == cmp0.KSize)
                        {
                            mnk_hit = true;
                            break;
                        }
                    }
                    if (!mnk_hit)
                        int8_matrix_properties.push_back(cmp);
                }
                if (cmp.AType == VK_COMPONENT_TYPE_BFLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_BFLOAT16_KHR)
                {
                    bool mnk_hit = false;
                    for (size_t k = 0; k < bf16_matrix_properties.size(); k++)
                    {
                        const VkCooperativeMatrixPropertiesKHR& cmp0 = bf16_matrix_properties[k];
                        if (cmp.MSize == cmp0.MSize && cmp.NSize == cmp0.NSize && cmp.KSize == cmp0.KSize)
                        {
                            mnk_hit = true;
                            break;
                        }
                    }
                    if (!mnk_hit)
                        bf16_matrix_properties.push_back(cmp);
                }
                if ((cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.AType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT
                        || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.AType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV)
                        && (cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E4M3_EXT || cmp.BType == VK_COMPONENT_TYPE_FLOAT8_E5M2_EXT
                            || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E4M3_NV || cmp.BType == VK_COMPONENT_TYPE_FLOAT_E5M2_NV))
                {
                    bool mnk_hit = false;
                    for (size_t k = 0; k < fp8_matrix_properties.size(); k++)
                    {
                        const VkCooperativeMatrixPropertiesKHR& cmp0 = fp8_matrix_properties[k];
                        if (cmp.MSize == cmp0.MSize && cmp.NSize == cmp0.NSize && cmp.KSize == cmp0.KSize)
                        {
                            mnk_hit = true;
                            break;
                        }
                    }
                    if (!mnk_hit)
                        fp8_matrix_properties.push_back(cmp);
                }
            }
        }
        else if (gpu_info.support_VK_NV_cooperative_matrix())
        {
            const std::vector<VkCooperativeMatrixPropertiesNV>& properties = gpu_info.queryCooperativeMatrixSubPropertiesNV();
            for (uint32_t j = 0; j < properties.size(); j++)
            {
                const VkCooperativeMatrixPropertiesNV& cmp = properties[j];

                if (cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV)
                {
                    bool mnk_hit = false;
                    for (size_t k = 0; k < fp16_matrix_properties.size(); k++)
                    {
                        const VkCooperativeMatrixPropertiesKHR& cmp0 = fp16_matrix_properties[k];
                        if (cmp.MSize == cmp0.MSize && cmp.NSize == cmp0.NSize && cmp.KSize == cmp0.KSize)
                        {
                            mnk_hit = true;
                            break;
                        }
                    }
                    if (!mnk_hit)
                    {
                        VkCooperativeMatrixPropertiesKHR cmp_khr;
                        cmp_khr.MSize = cmp.MSize;
                        cmp_khr.NSize = cmp.NSize;
                        cmp_khr.KSize = cmp.KSize;
                        fp16_matrix_properties.push_back(cmp_khr);
                    }
                }
                if (cmp.AType == VK_COMPONENT_TYPE_SINT8_NV && cmp.BType == VK_COMPONENT_TYPE_SINT8_NV)
                {
                    bool mnk_hit = false;
                    for (size_t k = 0; k < int8_matrix_properties.size(); k++)
                    {
                        const VkCooperativeMatrixPropertiesKHR& cmp0 = int8_matrix_properties[k];
                        if (cmp.MSize == cmp0.MSize && cmp.NSize == cmp0.NSize && cmp.KSize == cmp0.KSize)
                        {
                            mnk_hit = true;
                            break;
                        }
                    }
                    if (!mnk_hit)
                    {
                        VkCooperativeMatrixPropertiesKHR cmp_khr;
                        cmp_khr.MSize = cmp.MSize;
                        cmp_khr.NSize = cmp.NSize;
                        cmp_khr.KSize = cmp.KSize;
                        int8_matrix_properties.push_back(cmp_khr);
                    }
                }
            }
        }

        std::string fp16_matrix_info_str;
        std::string int8_matrix_info_str;
        std::string bf16_matrix_info_str;
        std::string fp8_matrix_info_str;
        {
            for (uint32_t j = 0; j < fp16_matrix_properties.size(); j++)
            {
                const VkCooperativeMatrixPropertiesKHR& cmp = fp16_matrix_properties[j];
                char tmp[64];
                sprintf(tmp, j > 0 ? "/%ux%ux%u" : "%ux%ux%u", cmp.MSize, cmp.NSize, cmp.KSize);
                fp16_matrix_info_str += tmp;
            }
            for (uint32_t j = 0; j < int8_matrix_properties.size(); j++)
            {
                const VkCooperativeMatrixPropertiesKHR& cmp = int8_matrix_properties[j];
                char tmp[64];
                sprintf(tmp, j > 0 ? "/%ux%ux%u" : "%ux%ux%u", cmp.MSize, cmp.NSize, cmp.KSize);
                int8_matrix_info_str += tmp;
            }
            for (uint32_t j = 0; j < bf16_matrix_properties.size(); j++)
            {
                const VkCooperativeMatrixPropertiesKHR& cmp = bf16_matrix_properties[j];
                char tmp[64];
                sprintf(tmp, j > 0 ? "/%ux%ux%u" : "%ux%ux%u", cmp.MSize, cmp.NSize, cmp.KSize);
                bf16_matrix_info_str += tmp;
            }
            for (uint32_t j = 0; j < fp8_matrix_properties.size(); j++)
            {
                const VkCooperativeMatrixPropertiesKHR& cmp = fp8_matrix_properties[j];
                char tmp[64];
                sprintf(tmp, j > 0 ? "/%ux%ux%u" : "%ux%ux%u", cmp.MSize, cmp.NSize, cmp.KSize);
                fp8_matrix_info_str += tmp;
            }

            if (fp16_matrix_info_str.empty())
                fp16_matrix_info_str = "0";
            if (int8_matrix_info_str.empty())
                int8_matrix_info_str = "0";
            if (bf16_matrix_info_str.empty())
                bf16_matrix_info_str = "0";
            if (fp8_matrix_info_str.empty())
                fp8_matrix_info_str = "0";
        }

        NCNN_LOGE("[%u %s]  fp16-cm=%s  int8-cm=%s  bf16-cm=%s  fp8-cm=%s", i, gpu_info.device_name(),
                  fp16_matrix_info_str.c_str(), int8_matrix_info_str.c_str(), bf16_matrix_info_str.c_str(), fp8_matrix_info_str.c_str());

        gpu_info_index++;
    }

    g_gpu_count = gpu_info_index;

    // the default gpu device
    g_default_gpu_index = find_default_vulkan_device_index();

    g_instance.glslang_initialized = glslang::InitializeProcess();

    // the global __ncnn_vulkan_instance_holder destructor will call destroy_gpu_instance() on exit
    // but it seems to be too late for nvidia driver :(
    // driver's internal data structure has been destroyed when called, causing segfault
    // atexit() seems to be helpful for calling it earlier    --- nihui
    static int destroy_gpu_instance_atexit_registered = 0;
    if (!destroy_gpu_instance_atexit_registered)
    {
        atexit(destroy_gpu_instance);
        destroy_gpu_instance_atexit_registered = 1;
    }

    return 0;
}

// Return the global instance holder converted to a raw VkInstance handle.
// Does not create the instance.
VkInstance get_gpu_instance()
{
    return static_cast<VkInstance>(g_instance);
}

void destroy_gpu_instance()
{
    MutexLockGuard lock(g_instance_lock);

    if (g_instance.created == 0)
        return;

    for (int i = 0; i < NCNN_MAX_GPU_COUNT; i++)
    {
        VulkanDevice* vulkan_device = g_default_vkdev[i];
        if (vulkan_device)
        {
            VkDevice vkdev = g_default_vkdev[i]->vkdevice();
            if (vkdev)
            {
                vkDeviceWaitIdle(vkdev);
            }
        }
    }

    // NCNN_LOGE("destroy_gpu_instance");

    if (g_instance.glslang_initialized)
    {
        glslang::FinalizeProcess();
        g_instance.glslang_initialized = false;
    }

    for (int i = 0; i < NCNN_MAX_GPU_COUNT; i++)
    {
        delete g_default_vkdev[i];
        g_default_vkdev[i] = 0;

        delete g_gpu_infos[i];
        g_gpu_infos[i] = 0;
    }

#if ENABLE_VALIDATION_LAYER
    if (support_VK_EXT_debug_utils && g_instance.callback)
    {
        DestroyDebugUtilsMessengerEXT(g_instance, g_instance.callback, NULL);
        g_instance.callback = 0;
    }
#endif // ENABLE_VALIDATION_LAYER

    if (vkDestroyInstance)
    {
        vkDestroyInstance(g_instance, 0);
        vkDestroyInstance = 0;
    }

    g_instance.instance = 0;

#if NCNN_SIMPLEVK
    unload_vulkan_driver();
#endif

    g_instance.created = 0;
}

// Create the gpu instance on first use.
// The created flag is read under g_instance_lock, and the lock is released
// again before create_gpu_instance() is called because that function takes
// the same lock itself.
static void try_create_gpu_instance()
{
    bool already_created = false;

    {
        MutexLockGuard lock(g_instance_lock);

        already_created = (g_instance.created != 0);
    }

    if (!already_created)
        create_gpu_instance();
}

// Return the global g_gpu_count value.
// The gpu instance is created first if it does not exist yet.
int get_gpu_count()
{
    try_create_gpu_instance();

    return g_gpu_count;
}

// Return the global g_default_gpu_index value.
// The gpu instance is created first if it does not exist yet.
int get_default_gpu_index()
{
    try_create_gpu_instance();

    return g_default_gpu_index;
}

// Return a reference to the GpuInfo stored in g_gpu_infos[device_index].
// The gpu instance is created first if it does not exist yet.
// NOTE: device_index is neither range-checked nor is the stored pointer
// checked for null before it is dereferenced; the caller must pass the index
// of an existing device.
const GpuInfo& get_gpu_info(int device_index)
{
    try_create_gpu_instance();

    return *g_gpu_infos[device_index];
}

// Blob allocator used only for the dummy buffer/image of a device.
// It forwards to VkBlobAllocator with 16 * 1024 as the second constructor
// argument (presumably the preferred block size in bytes — confirm against
// the VkBlobAllocator declaration).
class VkDummyAllocator : public VkBlobAllocator
{
public:
    // NOTE 16k is large enough I think ...
    VkDummyAllocator(const VulkanDevice* _vkdev)
        : VkBlobAllocator(_vkdev, 16 * 1024)
    {
    }
};

// Thin VkCompute subclass that exposes the barrier helpers through public
// record_dummy* methods, so the dummy buffer and images can have a barrier
// recorded for them without any real workload.
// NOTE(review): barrier_readwrite / barrier_readonly are presumably not
// publicly accessible on VkCompute, hence this wrapper — confirm in command.h.
class VkDummyCompute : public VkCompute
{
public:
    VkDummyCompute(const VulkanDevice* _vkdev)
        : VkCompute(_vkdev)
    {
    }

    // record a read-write barrier for a buffer
    void record_dummy(const VkMat& buffer)
    {
        barrier_readwrite(buffer);
    }

    // record a read-write barrier for an image
    void record_dummy(const VkImageMat& image)
    {
        barrier_readwrite(image);
    }

    // record a read-only barrier for an image
    void record_dummy_readonly(const VkImageMat& image)
    {
        barrier_readonly(image);
    }
};

// Private implementation state of a VulkanDevice.
// Holds the logical device handle, the hardware queues and their bookkeeping,
// the default per-queue allocators, the texelfetch sampler, the dummy
// resources, the pipeline cache and the lazily created packing operators.
class VulkanDevicePrivate
{
public:
    VulkanDevicePrivate(VulkanDevice* _vkdev);
    // back pointer to the owning public object
    VulkanDevice* const vkdev;

    // dummy buffer and image
    // create_dummy_buffer_image() returns the result of the submit-and-wait call
    int create_dummy_buffer_image();
    void destroy_dummy_buffer_image();

    // utility operator
    // returns a cached packing layer for the given cast/packing combination,
    // creating it on first request
    const ncnn::Layer* get_utility_operator(int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const;
    // destroys and clears every cached packing layer
    void destroy_utility_operator();

    // logical device handle, 0 until vkCreateDevice succeeds
    VkDevice device;

    // hardware queue
    // members are mutable so that const methods can hand out queues
    mutable std::vector<VkQueue> compute_queues;
    mutable std::vector<VkQueue> transfer_queues;
    mutable int free_compute_queue_count;
    mutable int free_transfer_queue_count;
    mutable Mutex compute_queue_lock;
    mutable Mutex transfer_queue_lock;
    mutable ConditionVariable compute_queue_condition;
    mutable ConditionVariable transfer_queue_condition;

    // default blob allocator for each queue
    mutable std::vector<VkAllocator*> blob_allocators;
    mutable Mutex blob_allocator_lock;

    // default staging allocator for each queue
    mutable std::vector<VkAllocator*> staging_allocators;
    mutable Mutex staging_allocator_lock;

    // nearest sampler for texelfetch
    VkSampler texelfetch_sampler;

    // dummy buffer and image
    VkAllocator* dummy_allocator;
    VkMat dummy_buffer;
    VkImageMat dummy_image;
    VkImageMat dummy_image_readonly;

    // device-wide pipeline cache
    PipelineCache* pipeline_cache;

    // utility operator
    // from fp32 | fp16
    // to fp32 | fp16
    // to pack1 | pack4
    mutable ncnn::Layer* uop_packing[2][2][2];
    // from int8
    // to int8
    // to pack1 | pack4
    mutable ncnn::Layer* uop_packing_int8[2];
    // from fp32 to bf16 / from bf16 to fp32 / bf16
    // to pack1 | pack4
    mutable ncnn::Layer* uop_packing_bf16[3][2];
    // guards the three uop_packing* caches above
    mutable Mutex uop_lock;

    // device is valid and successfully initialized
    bool valid;
};

// Construct the private state in its "nothing created yet" form.
//
// Every handle and pointer is nulled, the operator caches are zeroed and the
// device is marked invalid. The free queue counters are zeroed here as well:
// the VulkanDevice constructor only assigns them after vkCreateDevice has
// succeeded, so without this they would hold indeterminate values on the
// early-return failure path.
VulkanDevicePrivate::VulkanDevicePrivate(VulkanDevice* _vkdev)
    : vkdev(_vkdev)
{
    device = 0;
    free_compute_queue_count = 0;
    free_transfer_queue_count = 0;
    texelfetch_sampler = 0;
    dummy_allocator = 0;
    pipeline_cache = 0;
    valid = false;
    memset(uop_packing, 0, sizeof(uop_packing));
    memset(uop_packing_int8, 0, sizeof(uop_packing_int8));
    memset(uop_packing_bf16, 0, sizeof(uop_packing_bf16));
}

int VulkanDevicePrivate::create_dummy_buffer_image()
{
    dummy_allocator = new VkDummyAllocator(vkdev);

    dummy_buffer.create(1, 4u, dummy_allocator);
    dummy_image.create(1, 4u, dummy_allocator);
#if __APPLE__
    if (vkdev->info.type() == 0)
        dummy_image_readonly.create(1, 4u, dummy_allocator);
#else
    dummy_image_readonly.create(1, 4u, dummy_allocator);
#endif

    VkDummyCompute cmd(vkdev);

    cmd.record_dummy(dummy_buffer);
    cmd.record_dummy(dummy_image);
#if __APPLE__
    if (vkdev->info.type() == 0)
        cmd.record_dummy_readonly(dummy_image_readonly);
#else
    cmd.record_dummy_readonly(dummy_image_readonly);
#endif

    return cmd.submit_and_wait();
}

// Release the dummy buffer and images and then the allocator that backs them.
// On Apple builds the read-only image is only released when the device type
// is 0, mirroring the condition under which it is created.
void VulkanDevicePrivate::destroy_dummy_buffer_image()
{
#if __APPLE__
    const bool with_readonly_image = (vkdev->info.type() == 0);
#else
    const bool with_readonly_image = true;
#endif

    dummy_buffer.release();
    dummy_image.release();
    if (with_readonly_image)
        dummy_image_readonly.release();

    // deleting a null pointer is a no-op, so no explicit null test is needed
    delete dummy_allocator;
    dummy_allocator = 0;
}

// Return the packing layer used to convert between storage types and pack
// sizes, creating and caching it on first request.
//
// cast_type_from_index / cast_type_to_index: 0=fp32 1=fp16 3=int8 4=bf16
//   (the value passed to the layer is index + 1)
// packing_type_to_index: 0 selects out_elempack 1, anything else selects 4
//
// The whole lookup-or-create sequence runs under uop_lock. int8 takes
// precedence over bf16 when selecting the cache, and both take precedence
// over the fp32/fp16 table.
const ncnn::Layer* VulkanDevicePrivate::get_utility_operator(int cast_type_from_index, int cast_type_to_index, int packing_type_to_index) const
{
    const bool involves_fp16 = (cast_type_from_index == 1 || cast_type_to_index == 1);
    const bool involves_int8 = (cast_type_from_index == 3 || cast_type_to_index == 3);
    const bool involves_bf16 = (cast_type_from_index == 4 || cast_type_to_index == 4);

    MutexLockGuard guard(uop_lock);

    // resolve the cache slot once, it is used for both lookup and store
    ncnn::Layer** slot = 0;
    if (involves_int8)
    {
        slot = &uop_packing_int8[packing_type_to_index];
    }
    else if (involves_bf16)
    {
        // 0 = from bf16, 1 = to bf16, 2 = bf16 on both sides
        int bf16_kind = 0;
        if (cast_type_to_index == 4)
            bf16_kind = (cast_type_from_index == 4) ? 2 : 1;

        slot = &uop_packing_bf16[bf16_kind][packing_type_to_index];
    }
    else
    {
        slot = &uop_packing[cast_type_from_index][cast_type_to_index][packing_type_to_index];
    }

    if (*slot)
        return *slot;

    // create uop
    Option opt;
    opt.use_fp16_packed = involves_fp16; // fp16p is always supported
    opt.use_fp16_storage = involves_fp16 && vkdev->info.support_fp16_storage();
    opt.use_int8_packed = involves_int8; // int8p is always supported
    opt.use_int8_storage = involves_int8 && vkdev->info.support_int8_storage();
    opt.use_bf16_packed = involves_bf16; // bf16p is always supported
    opt.use_bf16_storage = involves_bf16 && vkdev->info.support_bf16_storage();

    // fp16/int8 arithmetic are not necessary for packing
    // and may conflict with storage options
    opt.use_fp16_arithmetic = false;
    opt.use_int8_arithmetic = false;

    // do not enable spirv-1.3 from cooperative matrix
    opt.use_cooperative_matrix = false;

    opt.use_vulkan_compute = true;

    // cache uop pipeline as device member explicitly
    opt.pipeline_cache = 0;

    opt.vulkan_device_index = vkdev->info.device_index();

    ncnn::Layer* packing_op = ncnn::create_layer_vulkan(LayerType::Packing);
    packing_op->vkdev = vkdev;

    const int out_elempack = (packing_type_to_index == 0) ? 1 : 4;

    ncnn::ParamDict pd;
    pd.set(0, out_elempack);
    pd.set(2, cast_type_from_index + 1); // 0=auto 1=fp32 2=fp16 3=int32 4=int8 5=bf16
    pd.set(3, cast_type_to_index + 1);

    packing_op->load_param(pd);

    packing_op->create_pipeline(opt);

    // remember the operator for subsequent requests
    *slot = packing_op;

    return packing_op;
}

void VulkanDevicePrivate::destroy_utility_operator()
{
    Option opt;
    opt.use_vulkan_compute = true;
    opt.use_fp16_arithmetic = false;
    opt.use_int8_arithmetic = false;
    opt.use_cooperative_matrix = false;
    opt.pipeline_cache = 0;
    opt.vulkan_device_index = vkdev->info.device_index();

    // from fp32 | fp16
    for (int j0 = 0; j0 < 2; j0++)
    {
        // to fp32 | fp16
        for (int j1 = 0; j1 < 2; j1++)
        {
            bool use_fp16 = (j0 == 1 || j1 == 1);

            opt.use_fp16_packed = use_fp16;
            opt.use_fp16_storage = use_fp16 && vkdev->info.support_fp16_storage();
            opt.use_int8_packed = false;
            opt.use_int8_storage = false;
            opt.use_bf16_packed = false;
            opt.use_bf16_storage = false;

            // to pack1 | pack4
            for (int k = 0; k < 2; k++)
            {
                ncnn::Layer* uop = uop_packing[j0][j1][k];
                if (!uop)
                    continue;

                uop->destroy_pipeline(opt);

                delete uop;

                uop_packing[j0][j1][k] = 0;
            }
        }
    }

    // int8
    {
        bool use_int8 = true;

        opt.use_fp16_packed = false;
        opt.use_fp16_storage = false;
        opt.use_int8_packed = use_int8;
        opt.use_int8_storage = use_int8 && vkdev->info.support_int8_storage();
        opt.use_bf16_packed = false;
        opt.use_bf16_storage = false;

        // to pack1 | pack4
        for (int k = 0; k < 2; k++)
        {
            ncnn::Layer* uop = uop_packing_int8[k];
            if (!uop)
                continue;

            uop->destroy_pipeline(opt);

            delete uop;

            uop_packing_int8[k] = 0;
        }
    }

    // from fp32 to bf16
    // from bf16 to fp32
    // bf16
    for (int j = 0; j < 3; j++)
    {
        bool use_bf16 = true;

        opt.use_fp16_packed = false;
        opt.use_fp16_storage = false;
        opt.use_int8_packed = false;
        opt.use_int8_storage = false;
        opt.use_bf16_packed = use_bf16;
        opt.use_bf16_storage = use_bf16 && vkdev->info.support_bf16_storage();

        // to pack1 | pack4
        for (int k = 0; k < 2; k++)
        {
            ncnn::Layer* uop = uop_packing_bf16[j][k];
            if (!uop)
                continue;

            uop->destroy_pipeline(opt);

            delete uop;

            uop_packing_bf16[j][k] = 0;
        }
    }
}

// Create the logical Vulkan device for the gpu at device_index.
//
// Steps, in order:
//   1. collect the name of every device extension that info reports as supported
//   2. describe the compute and transfer queues and call vkCreateDevice
//   3. initialize device extension entry points
//   4. fetch the queue handles and create one blob and one staging allocator
//      per compute queue
//   5. create the immutable nearest sampler used for texelfetch
//   6. create the dummy buffer/images
//   7. create the device-wide pipeline cache and mark the device valid
//
// The constructor cannot report errors directly: when vkCreateDevice or the
// dummy resource creation fails it logs and returns early, leaving d->valid
// false. Callers are expected to check is_valid().
VulkanDevice::VulkanDevice(int device_index)
    : info(get_gpu_info(device_index)), d(new VulkanDevicePrivate(this))
{
    // get_gpu_info() in the initializer list already performs this check,
    // so this call is a second, redundant one
    try_create_gpu_instance();

    // enable every extension the physical device advertises
    std::vector<const char*> enabledExtensions;
    if (info.support_VK_KHR_8bit_storage())
        enabledExtensions.push_back("VK_KHR_8bit_storage");
    if (info.support_VK_KHR_16bit_storage())
        enabledExtensions.push_back("VK_KHR_16bit_storage");
    if (info.support_VK_KHR_bind_memory2())
        enabledExtensions.push_back("VK_KHR_bind_memory2");
    if (info.support_VK_KHR_buffer_device_address())
        enabledExtensions.push_back("VK_KHR_buffer_device_address");
    if (info.support_VK_KHR_create_renderpass2())
        enabledExtensions.push_back("VK_KHR_create_renderpass2");
    if (info.support_VK_KHR_cooperative_matrix())
        enabledExtensions.push_back("VK_KHR_cooperative_matrix");
    if (info.support_VK_KHR_dedicated_allocation())
        enabledExtensions.push_back("VK_KHR_dedicated_allocation");
    if (info.support_VK_KHR_descriptor_update_template())
        enabledExtensions.push_back("VK_KHR_descriptor_update_template");
    if (info.support_VK_KHR_driver_properties())
        enabledExtensions.push_back("VK_KHR_driver_properties");
    if (info.support_VK_KHR_external_memory())
        enabledExtensions.push_back("VK_KHR_external_memory");
    if (info.support_VK_KHR_get_memory_requirements2())
        enabledExtensions.push_back("VK_KHR_get_memory_requirements2");
    if (info.support_VK_KHR_maintenance1())
        enabledExtensions.push_back("VK_KHR_maintenance1");
    if (info.support_VK_KHR_maintenance2())
        enabledExtensions.push_back("VK_KHR_maintenance2");
    if (info.support_VK_KHR_maintenance3())
        enabledExtensions.push_back("VK_KHR_maintenance3");
    if (info.support_VK_KHR_multiview())
        enabledExtensions.push_back("VK_KHR_multiview");
    if (info.support_VK_KHR_portability_subset())
        enabledExtensions.push_back("VK_KHR_portability_subset");
    if (info.support_VK_KHR_push_descriptor())
        enabledExtensions.push_back("VK_KHR_push_descriptor");
    if (info.support_VK_KHR_robustness2())
        enabledExtensions.push_back("VK_KHR_robustness2");
    if (info.support_VK_KHR_sampler_ycbcr_conversion())
        enabledExtensions.push_back("VK_KHR_sampler_ycbcr_conversion");
    if (info.support_VK_KHR_shader_bfloat16())
        enabledExtensions.push_back("VK_KHR_shader_bfloat16");
    if (info.support_VK_KHR_shader_float16_int8())
        enabledExtensions.push_back("VK_KHR_shader_float16_int8");
    if (info.support_VK_KHR_shader_float_controls())
        enabledExtensions.push_back("VK_KHR_shader_float_controls");
    if (info.support_VK_KHR_shader_float_controls2())
        enabledExtensions.push_back("VK_KHR_shader_float_controls2");
    if (info.support_VK_KHR_shader_integer_dot_product())
        enabledExtensions.push_back("VK_KHR_shader_integer_dot_product");
    if (info.support_VK_KHR_shader_non_semantic_info())
        enabledExtensions.push_back("VK_KHR_shader_non_semantic_info");
    if (info.support_VK_KHR_shader_subgroup_extended_types())
        enabledExtensions.push_back("VK_KHR_shader_subgroup_extended_types");
    if (info.support_VK_KHR_shader_subgroup_rotate())
        enabledExtensions.push_back("VK_KHR_shader_subgroup_rotate");
    if (info.support_VK_KHR_storage_buffer_storage_class())
        enabledExtensions.push_back("VK_KHR_storage_buffer_storage_class");
    if (info.support_VK_KHR_swapchain())
        enabledExtensions.push_back("VK_KHR_swapchain");
    if (info.support_VK_KHR_vulkan_memory_model())
        enabledExtensions.push_back("VK_KHR_vulkan_memory_model");
    if (info.support_VK_KHR_zero_initialize_workgroup_memory())
        enabledExtensions.push_back("VK_KHR_zero_initialize_workgroup_memory");
    if (info.support_VK_EXT_buffer_device_address())
        enabledExtensions.push_back("VK_EXT_buffer_device_address");
    if (info.support_VK_EXT_descriptor_indexing())
        enabledExtensions.push_back("VK_EXT_descriptor_indexing");
    if (info.support_VK_EXT_external_memory_host())
        enabledExtensions.push_back("VK_EXT_external_memory_host");
    if (info.support_VK_EXT_memory_budget())
        enabledExtensions.push_back("VK_EXT_memory_budget");
    if (info.support_VK_EXT_memory_priority())
        enabledExtensions.push_back("VK_EXT_memory_priority");
    if (info.support_VK_EXT_queue_family_foreign())
        enabledExtensions.push_back("VK_EXT_queue_family_foreign");
    if (info.support_VK_EXT_robustness2())
        enabledExtensions.push_back("VK_EXT_robustness2");
    if (info.support_VK_EXT_shader_atomic_float())
        enabledExtensions.push_back("VK_EXT_shader_atomic_float");
    if (info.support_VK_EXT_shader_atomic_float2())
        enabledExtensions.push_back("VK_EXT_shader_atomic_float2");
    if (info.support_VK_EXT_shader_float8())
        enabledExtensions.push_back("VK_EXT_shader_float8");
    if (info.support_VK_EXT_subgroup_size_control())
        enabledExtensions.push_back("VK_EXT_subgroup_size_control");
    if (info.support_VK_AMD_device_coherent_memory())
        enabledExtensions.push_back("VK_AMD_device_coherent_memory");
#if __ANDROID_API__ >= 26
    if (info.support_VK_ANDROID_external_memory_android_hardware_buffer())
        enabledExtensions.push_back("VK_ANDROID_external_memory_android_hardware_buffer");
#endif // __ANDROID_API__ >= 26
    if (info.support_VK_NV_cooperative_matrix())
        enabledExtensions.push_back("VK_NV_cooperative_matrix");
    if (info.support_VK_NV_cooperative_matrix2())
        enabledExtensions.push_back("VK_NV_cooperative_matrix2");
    if (info.support_VK_NV_cooperative_vector())
        enabledExtensions.push_back("VK_NV_cooperative_vector");

    // feature structure chain supplied by GpuInfo, passed as pNext of the device create info
    const void* enabledExtensionFeatures = info.queryExtensionFeatures();

    // every queue is requested with the same priority 1.f
    std::vector<float> compute_queue_priorities(info.compute_queue_count(), 1.f);   // 0.f ~ 1.f
    std::vector<float> transfer_queue_priorities(info.transfer_queue_count(), 1.f); // 0.f ~ 1.f

    // only the first one or two entries are filled in below
    VkDeviceQueueCreateInfo deviceQueueCreateInfos[3];

    VkDeviceQueueCreateInfo deviceComputeQueueCreateInfo;
    deviceComputeQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
    deviceComputeQueueCreateInfo.pNext = 0;
    deviceComputeQueueCreateInfo.flags = 0;
    deviceComputeQueueCreateInfo.queueFamilyIndex = info.compute_queue_family_index();
    deviceComputeQueueCreateInfo.queueCount = info.compute_queue_count();
    deviceComputeQueueCreateInfo.pQueuePriorities = compute_queue_priorities.data();

    VkDeviceQueueCreateInfo deviceTransferQueueCreateInfo;
    deviceTransferQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
    deviceTransferQueueCreateInfo.pNext = 0;
    deviceTransferQueueCreateInfo.flags = 0;
    deviceTransferQueueCreateInfo.queueFamilyIndex = info.transfer_queue_family_index();
    deviceTransferQueueCreateInfo.queueCount = info.transfer_queue_count();
    deviceTransferQueueCreateInfo.pQueuePriorities = transfer_queue_priorities.data();

    VkDeviceCreateInfo deviceCreateInfo;
    deviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
    deviceCreateInfo.pNext = enabledExtensionFeatures;
    deviceCreateInfo.flags = 0;
    // a queue family may appear only once in the create info, so the transfer
    // entry is only added when it uses a different family than compute
    if (info.compute_queue_family_index() == info.transfer_queue_family_index())
    {
        deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo;
        deviceCreateInfo.queueCreateInfoCount = 1;
    }
    else // if (info.compute_queue_family_index() != info.transfer_queue_family_index())
    {
        deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo;
        deviceQueueCreateInfos[1] = deviceTransferQueueCreateInfo;
        deviceCreateInfo.queueCreateInfoCount = 2;
    }

    deviceCreateInfo.pQueueCreateInfos = deviceQueueCreateInfos;
    deviceCreateInfo.enabledLayerCount = 0;
    deviceCreateInfo.ppEnabledLayerNames = 0;
    deviceCreateInfo.enabledExtensionCount = enabledExtensions.size();
    deviceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data();
    deviceCreateInfo.pEnabledFeatures = 0; // VkPhysicalDeviceFeatures pointer

    VkResult ret = vkCreateDevice(info.physicalDevice(), &deviceCreateInfo, 0, &d->device);
    if (ret != VK_SUCCESS)
    {
        // d->valid stays false, the object is unusable
        NCNN_LOGE("vkCreateDevice failed %d", ret);
        return;
    }

    init_device_extension();

    d->free_compute_queue_count = 0;
    d->free_transfer_queue_count = 0;

    // all compute queues start out free; each one gets its own default allocators
    d->free_compute_queue_count = info.compute_queue_count();
    d->compute_queues.resize(info.compute_queue_count());
    d->blob_allocators.resize(info.compute_queue_count());
    d->staging_allocators.resize(info.compute_queue_count());
    for (uint32_t i = 0; i < info.compute_queue_count(); i++)
    {
        vkGetDeviceQueue(d->device, info.compute_queue_family_index(), i, &d->compute_queues[i]);
        d->blob_allocators[i] = new VkBlobAllocator(this);
        d->staging_allocators[i] = new VkStagingAllocator(this);
    }
    // dedicated transfer queues exist only when the family differs from compute
    if (info.compute_queue_family_index() != info.transfer_queue_family_index())
    {
        d->free_transfer_queue_count = info.transfer_queue_count();
        d->transfer_queues.resize(info.transfer_queue_count());
        for (uint32_t i = 0; i < info.transfer_queue_count(); i++)
        {
            vkGetDeviceQueue(d->device, info.transfer_queue_family_index(), i, &d->transfer_queues[i]);
        }
    }

    // prepare immutable texelfetch sampler
    {
        VkSamplerCreateInfo samplerCreateInfo;
        samplerCreateInfo.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO;
        samplerCreateInfo.pNext = 0;
        samplerCreateInfo.flags = 0;
        samplerCreateInfo.magFilter = VK_FILTER_NEAREST;
        samplerCreateInfo.minFilter = VK_FILTER_NEAREST;
        samplerCreateInfo.mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST;
        samplerCreateInfo.addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
        samplerCreateInfo.addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
        samplerCreateInfo.addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE;
        samplerCreateInfo.mipLodBias = 0.0f;
        samplerCreateInfo.anisotropyEnable = VK_FALSE;
        samplerCreateInfo.maxAnisotropy = 1;
        samplerCreateInfo.compareEnable = VK_FALSE;
        samplerCreateInfo.compareOp = VK_COMPARE_OP_NEVER;
        samplerCreateInfo.minLod = 0.0f;
        samplerCreateInfo.maxLod = 0.0f;
        samplerCreateInfo.borderColor = VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK;
        samplerCreateInfo.unnormalizedCoordinates = VK_TRUE;

        ret = vkCreateSampler(d->device, &samplerCreateInfo, 0, &d->texelfetch_sampler);
        if (ret != VK_SUCCESS)
        {
            // failure is only logged, construction continues without a sampler
            NCNN_LOGE("vkCreateSampler failed %d", ret);
        }
    }

    int cret = d->create_dummy_buffer_image();
    if (cret != 0)
    {
        // d->valid stays false; resources created so far are released by the destructor
        NCNN_LOGE("VulkanDevice create_dummy_buffer_image failed %d", cret);
        return;
    }

    d->pipeline_cache = new PipelineCache(this);

    d->valid = true;
}

// Release everything the constructor and later lazy creation set up.
// Objects that live on the logical device are destroyed before the device
// itself, and the private state is deleted last.
VulkanDevice::~VulkanDevice()
{
    d->destroy_utility_operator();

    d->destroy_dummy_buffer_image();

    if (d->texelfetch_sampler)
        vkDestroySampler(d->device, d->texelfetch_sampler, 0);

    // default blob allocators, one per compute queue
    const size_t blob_allocator_count = d->blob_allocators.size();
    for (size_t q = 0; q < blob_allocator_count; q++)
    {
        delete d->blob_allocators[q];
    }
    d->blob_allocators.clear();

    // default staging allocators, one per compute queue
    const size_t staging_allocator_count = d->staging_allocators.size();
    for (size_t q = 0; q < staging_allocator_count; q++)
    {
        delete d->staging_allocators[q];
    }
    d->staging_allocators.clear();

    // deleting a null pointer is a no-op
    delete d->pipeline_cache;

    if (d->device)
        vkDestroyDevice(d->device, 0);

    delete d;
}

// Copy constructor stub: nothing is copied from the source object.
// The result is bound to the info of gpu 0 and has a null private pointer,
// so it is not a usable device.
// NOTE(review): presumably declared private in the header to forbid copying — confirm.
VulkanDevice::VulkanDevice(const VulkanDevice&)
    : info(get_gpu_info(0)), d(0)
{
}

// Assignment stub: nothing is assigned, *this is returned unchanged.
// NOTE(review): presumably declared private in the header to forbid assignment — confirm.
VulkanDevice& VulkanDevice::operator=(const VulkanDevice&)
{
    return *this;
}

// Return the logical device handle; it is 0 when vkCreateDevice did not succeed.
VkDevice VulkanDevice::vkdevice() const
{
    return d->device;
}

// Return true only when the constructor ran to completion
// (the valid flag is set as its very last step).
bool VulkanDevice::is_valid() const
{
    return d->valid;
}

// Create a shader module from a SPIR-V binary.
//
// spv_data:      pointer to the SPIR-V words
// spv_data_size: size of the binary in bytes
// Returns the created module, or 0 after logging when vkCreateShaderModule fails.
VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const
{
    VkShaderModuleCreateInfo create_info;
    create_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
    create_info.pNext = 0;
    create_info.flags = 0;
    create_info.codeSize = spv_data_size;
    create_info.pCode = spv_data;

    VkShaderModule created_module;
    const VkResult status = vkCreateShaderModule(d->device, &create_info, 0, &created_module);
    if (status == VK_SUCCESS)
        return created_module;

    NCNN_LOGE("vkCreateShaderModule failed %d", status);
    return 0;
}

// Copy a SPIR-V module while hard-coding its compute workgroup size.
//
// The following instructions are rewritten or dropped, everything else is
// copied verbatim:
//   - OpExecutionMode ... LocalSize: kept, with x/y/z replaced by
//     local_size_x / local_size_y / local_size_z
//   - OpDecorate ... SpecId 233|234|235: dropped, the decorated ids are
//     remembered as the local size spec constants
//   - OpDecorate ... BuiltIn WorkgroupSize: dropped, the decorated id is
//     remembered as gl_WorkGroupSize
//   - OpSpecConstant defining one of the remembered local size ids: dropped
//   - OpSpecConstantComposite defining gl_WorkGroupSize from them: dropped
//
// code / size:  source module and its size in bytes
// dstcode:      destination buffer, must hold at least size bytes
// dstsize:      receives the number of bytes written to dstcode
//
// Malformed input is handled defensively: a module shorter than the 5-word
// header yields *dstsize == 0, and processing stops at the first instruction
// whose word count is zero or extends past the end of the module (the output
// then contains everything emitted up to that point). Operand words are only
// inspected when the instruction is long enough to contain them.
static void inject_local_size_xyz(const uint32_t* code, size_t size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t* dstcode, size_t* dstsize)
{
    uint32_t local_size_x_id = -1;
    uint32_t local_size_y_id = -1;
    uint32_t local_size_z_id = -1;
    uint32_t gl_WorkGroupSize_id = -1;

    const size_t total_words = size / sizeof(uint32_t);

    // the header alone takes 5 words, anything shorter is not a module
    if (total_words < 5)
    {
        *dstsize = 0;
        return;
    }

    const uint32_t* p = code;
    const uint32_t* const end = code + total_words;
    uint32_t* dp = dstcode;

    // skip magic version generator bound schema
    memcpy(dp, p, 5 * sizeof(uint32_t));
    p += 5;
    dp += 5;

    // foreach op
    while (p < end)
    {
        uint32_t opcode = p[0];

        uint16_t wordcount = opcode >> 16;
        uint16_t op = opcode & 0xffff;

        // a zero word count would never advance, a truncated instruction
        // would be read past the end of the input
        if (wordcount == 0 || (size_t)wordcount > (size_t)(end - p))
            break;

        if (op == 16 && wordcount >= 3) // OpExecutionMode
        {
            uint32_t mode = p[2];
            if (mode == 17 && wordcount >= 6) // LocalSize
            {
                memcpy(dp, p, wordcount * sizeof(uint32_t));

                // set local_size_xyz
                dp[3] = local_size_x;
                dp[4] = local_size_y;
                dp[5] = local_size_z;

                p += wordcount;
                dp += wordcount;
                continue;
            }
        }
        else if (op == 50 && wordcount >= 3) // OpSpecConstant
        {
            uint32_t id = p[2];
            if (id == local_size_x_id || id == local_size_y_id || id == local_size_z_id)
            {
                p += wordcount;
                continue;
            }
        }
        else if (op == 51 && wordcount >= 3) // OpSpecConstantComposite
        {
            uint32_t id = p[2];
            if (id == gl_WorkGroupSize_id)
            {
                if (wordcount == 6 && (p[3] == local_size_x_id || p[4] == local_size_y_id || p[5] == local_size_z_id))
                {
                    p += wordcount;
                    continue;
                }
            }
        }
        else if (op == 71 && wordcount >= 3) // OpDecorate
        {
            uint32_t id = p[1];
            uint32_t decoration = p[2];
            if (decoration == 1 && wordcount >= 4) // SpecId
            {
                uint32_t specid = p[3];
                if (specid == 233) local_size_x_id = id;
                if (specid == 234) local_size_y_id = id;
                if (specid == 235) local_size_z_id = id;
                if (specid == 233 || specid == 234 || specid == 235)
                {
                    p += wordcount;
                    continue;
                }
            }
            else if (decoration == 11 && wordcount >= 4) // BuiltIn
            {
                uint32_t builtin = p[3];
                if (builtin == 25) // WorkgroupSize
                {
                    gl_WorkGroupSize_id = id;
                    p += wordcount;
                    continue;
                }
            }
        }

        // any other instruction is copied unchanged
        memcpy(dp, p, wordcount * sizeof(uint32_t));
        p += wordcount;
        dp += wordcount;
    }

    *dstsize = (unsigned char*)dp - (unsigned char*)dstcode;
}

// Create a shader module from a SPIR-V binary after hard-coding its compute
// workgroup size to (local_size_x, local_size_y, local_size_z).
//
// spv_data / spv_data_size: source SPIR-V words and their size in bytes
// Returns the created module, or 0 after logging when the input is too short
// to be a SPIR-V module, the scratch buffer cannot be allocated, or module
// creation fails.
VkShaderModule VulkanDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const
{
    // the injector copies the 5-word header unconditionally, so a shorter
    // input would overrun both the source and the scratch buffer
    if (spv_data_size < 5 * sizeof(uint32_t))
    {
        NCNN_LOGE("compile_shader_module spv data too short");
        return 0;
    }

    // the rewritten module is never larger than the source
    uint32_t* spv_data_modified = (uint32_t*)malloc(spv_data_size);
    if (!spv_data_modified)
    {
        NCNN_LOGE("compile_shader_module malloc failed");
        return 0;
    }

    size_t spv_data_size_modified = spv_data_size;
    inject_local_size_xyz(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z, spv_data_modified, &spv_data_size_modified);

    VkShaderModule shader_module = compile_shader_module(spv_data_modified, spv_data_size_modified);

    free(spv_data_modified);

    return shader_module;
}

int VulkanDevice::create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const
{
    if (binding_count == 0)
    {
        *descriptorset_layout = 0;
        return 0;
    }

    std::vector<VkDescriptorSetLayoutBinding> descriptorSetLayoutBindings(binding_count);
    for (int i = 0; i < binding_count; i++)
    {
        int binding_type = binding_types[i];

        descriptorSetLayoutBindings[i].binding = i;
        descriptorSetLayoutBindings[i].descriptorCount = 1;
        descriptorSetLayoutBindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;

        if (binding_type == 1)
        {
            descriptorSetLayoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
            descriptorSetLayoutBindings[i].pImmutableSamplers = 0;
        }
        else if (binding_type == 2)
        {
            descriptorSetLayoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
            descriptorSetLayoutBindings[i].pImmutableSamplers = 0;
        }
        else // if (binding_type == 3)
        {
            descriptorSetLayoutBindings[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
            descriptorSetLayoutBindings[i].pImmutableSamplers = immutable_texelfetch_sampler(); // we always use texelfetch
        }
    }

    VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo;
    descriptorSetLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
    descriptorSetLayoutCreateInfo.pNext = 0;
    descriptorSetLayoutCreateInfo.flags = 0;
    descriptorSetLayoutCreateInfo.bindingCount = binding_count;
    descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings.data();

    if (info.support_VK_KHR_push_descriptor())
    {
        descriptorSetLayoutCreateInfo.flags |= VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR;
    }

    VkResult ret = vkCreateDescriptorSetLayout(d->device, &descriptorSetLayoutCreateInfo, 0, descriptorset_layout);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateDescriptorSetLayout failed %d", ret);
        return -1;
    }

    return 0;
}

int VulkanDevice::create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const
{
    VkPushConstantRange pushConstantRange;
    pushConstantRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
    pushConstantRange.offset = 0;
    pushConstantRange.size = sizeof(vk_constant_type) * push_constant_count;

    VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo;
    pipelineLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
    pipelineLayoutCreateInfo.pNext = 0;
    pipelineLayoutCreateInfo.flags = 0;

    if (descriptorset_layout)
    {
        pipelineLayoutCreateInfo.setLayoutCount = 1;
        pipelineLayoutCreateInfo.pSetLayouts = &descriptorset_layout;
    }
    else
    {
        pipelineLayoutCreateInfo.setLayoutCount = 0;
        pipelineLayoutCreateInfo.pSetLayouts = 0;
    }

    if (push_constant_count > 0)
    {
        pipelineLayoutCreateInfo.pushConstantRangeCount = 1;
        pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange;
    }
    else
    {
        pipelineLayoutCreateInfo.pushConstantRangeCount = 0;
        pipelineLayoutCreateInfo.pPushConstantRanges = 0;
    }

    VkResult ret = vkCreatePipelineLayout(d->device, &pipelineLayoutCreateInfo, 0, pipeline_layout);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreatePipelineLayout failed %d", ret);
        return -1;
    }

    return 0;
}

// Create a compute pipeline for the "main" entry point of shader_module.
//
// specializations: values for specialization constants; entry i is bound to
//                  constant id i
// subgroup_size:   required subgroup size, only attached to the shader stage
//                  when the device supports subgroup size control
// pipeline:        receives the created pipeline
// Returns 0 on success, -1 after logging when pipeline creation fails.
int VulkanDevice::create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline) const
{
    const int specialization_count = specializations.size();

    // constant id i lives at slot i of the tightly packed specialization data
    std::vector<VkSpecializationMapEntry> map_entries(specialization_count);
    for (int id = 0; id < specialization_count; id++)
    {
        VkSpecializationMapEntry& entry = map_entries[id];
        entry.constantID = id;
        entry.offset = id * sizeof(vk_specialization_type);
        entry.size = sizeof(vk_specialization_type);
    }

    VkSpecializationInfo specialization_info;
    specialization_info.mapEntryCount = map_entries.size();
    specialization_info.pMapEntries = map_entries.data();
    specialization_info.dataSize = specializations.size() * sizeof(vk_specialization_type);
    specialization_info.pData = specializations.data();

    // subgroup size control
    VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT required_subgroup_size;
    required_subgroup_size.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT;
    required_subgroup_size.pNext = 0;
    required_subgroup_size.requiredSubgroupSize = subgroup_size;

    VkPipelineShaderStageCreateInfo stage_info;
    stage_info.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
    // the required subgroup size is chained in only when the device supports it
    // pipelineShaderStageCreateInfo.flags |= VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT;
    stage_info.pNext = info.support_subgroup_size_control() ? &required_subgroup_size : 0;
    stage_info.flags = 0;
    stage_info.stage = VK_SHADER_STAGE_COMPUTE_BIT;
    stage_info.module = shader_module;
    stage_info.pName = "main";
    stage_info.pSpecializationInfo = &specialization_info;

    // but full subgroup bits enforce local_size_x be multiple of subgroup size
    // if (info.support_compute_full_subgroups())
    // {
    //     pipelineShaderStageCreateInfo.flags |= VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT;
    // }

    VkComputePipelineCreateInfo pipeline_info;
    pipeline_info.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
    pipeline_info.pNext = 0;
    pipeline_info.flags = 0;
    pipeline_info.stage = stage_info;
    pipeline_info.layout = pipeline_layout;
    pipeline_info.basePipelineHandle = 0;
    pipeline_info.basePipelineIndex = 0;

    const VkResult status = vkCreateComputePipelines(d->device, 0, 1, &pipeline_info, 0, pipeline);
    if (status == VK_SUCCESS)
        return 0;

    NCNN_LOGE("vkCreateComputePipelines failed %d", status);
    return -1;
}

int VulkanDevice::create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const
{
    if (binding_count == 0)
    {
        *descriptor_update_template = 0;
        return 0;
    }

    std::vector<VkDescriptorUpdateTemplateEntryKHR> descriptorUpdateTemplateEntries(binding_count);
    size_t offset = 0;
    for (int i = 0; i < binding_count; i++) // TODO do not update weights
    {
        int binding_type = binding_types[i];

        descriptorUpdateTemplateEntries[i].dstBinding = i;
        descriptorUpdateTemplateEntries[i].dstArrayElement = 0;
        descriptorUpdateTemplateEntries[i].descriptorCount = 1;
        descriptorUpdateTemplateEntries[i].offset = offset;

        if (binding_type == 1)
        {
            descriptorUpdateTemplateEntries[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
            descriptorUpdateTemplateEntries[i].stride = sizeof(VkDescriptorBufferInfo);
        }
        else if (binding_type == 2)
        {
            descriptorUpdateTemplateEntries[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE;
            descriptorUpdateTemplateEntries[i].stride = sizeof(VkDescriptorImageInfo);
        }
        else // if (binding_type == 3)
        {
            descriptorUpdateTemplateEntries[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER;
            descriptorUpdateTemplateEntries[i].stride = sizeof(VkDescriptorImageInfo);
        }

        offset += descriptorUpdateTemplateEntries[i].stride;
    }

    VkDescriptorUpdateTemplateCreateInfoKHR descriptorUpdateTemplateCreateInfo;
    descriptorUpdateTemplateCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR;
    descriptorUpdateTemplateCreateInfo.pNext = 0;
    descriptorUpdateTemplateCreateInfo.flags = 0;
    descriptorUpdateTemplateCreateInfo.descriptorUpdateEntryCount = binding_count; // TODO do not update weights
    descriptorUpdateTemplateCreateInfo.pDescriptorUpdateEntries = descriptorUpdateTemplateEntries.data();
    if (info.support_VK_KHR_push_descriptor())
    {
        descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR;
    }
    else
    {
        descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR;
    }
    // descriptorSetLayout should be ignored if VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR
    // FIXME HACK WARNING TODO NOTE but crash on radv if set NULL  :(
    descriptorUpdateTemplateCreateInfo.descriptorSetLayout = descriptorset_layout;
    descriptorUpdateTemplateCreateInfo.pipelineBindPoint = VK_PIPELINE_BIND_POINT_COMPUTE;
    descriptorUpdateTemplateCreateInfo.pipelineLayout = pipeline_layout;
    descriptorUpdateTemplateCreateInfo.set = 0;

    VkResult ret = vkCreateDescriptorUpdateTemplateKHR(d->device, &descriptorUpdateTemplateCreateInfo, 0, descriptor_update_template);
    if (ret != VK_SUCCESS)
    {
        NCNN_LOGE("vkCreateDescriptorUpdateTemplateKHR failed %d", ret);
        return -1;
    }

    return 0;
}

// Pick a memory type index for an allocation.
//
// memory_type_bits  bitmask of acceptable memory type indices (bit i set -> type i is allowed)
// required          property flags the memory type must have (all of them)
// preferred         property flags that are nice to have (any of them); 0 disables the preference
// preferred_not     property flags that are better avoided (none of them); 0 disables the preference
//
// Candidates are searched in four passes with decreasing strictness:
//   1. required + preferred + without preferred_not
//   2. required + preferred
//   3. required + without preferred_not
//   4. required only
// A pass that depends on a preference is skipped entirely when that preference is 0.
//
// Returns the memory type index, or (uint32_t)-1 when nothing matches.
uint32_t VulkanDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const
{
    const VkPhysicalDeviceMemoryProperties& memory_properties = info.physicalDeviceMemoryProperties();

    // NOTE the mask is built with an unsigned literal on purpose,
    // i can reach 31 and shifting a signed 1 into the sign bit is not portable

    // first try, find required and with preferred and without preferred_not
    for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
    {
        bool is_required = (1u << i) & memory_type_bits;
        if (is_required)
        {
            const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
            if ((memoryType.propertyFlags & required) == required
                    && (preferred && (memoryType.propertyFlags & preferred))
                    && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
            {
                return i;
            }
        }
    }

    // second try, find required and with preferred
    for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
    {
        bool is_required = (1u << i) & memory_type_bits;
        if (is_required)
        {
            const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
            if ((memoryType.propertyFlags & required) == required
                    && (preferred && (memoryType.propertyFlags & preferred)))
            {
                return i;
            }
        }
    }

    // third try, find required and without preferred_not
    for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
    {
        bool is_required = (1u << i) & memory_type_bits;
        if (is_required)
        {
            const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
            if ((memoryType.propertyFlags & required) == required
                    && (preferred_not && !(memoryType.propertyFlags & preferred_not)))
            {
                return i;
            }
        }
    }

    // fourth try, find any required
    for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
    {
        bool is_required = (1u << i) & memory_type_bits;
        if (is_required)
        {
            const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
            if ((memoryType.propertyFlags & required) == required)
            {
                return i;
            }
        }
    }

    if (info.driver_id() == VK_DRIVER_ID_GOOGLE_SWIFTSHADER)
    {
        // buggy swiftshader may set memory property flags in memory_type_bits field
        for (uint32_t i = 0; i < memory_properties.memoryTypeCount; i++)
        {
            const VkMemoryType& memoryType = memory_properties.memoryTypes[i];
            if ((memoryType.propertyFlags & (required | memory_type_bits)) == (required | memory_type_bits))
            {
                return i;
            }
        }
    }

    NCNN_LOGE("no such memory type %u %u %u %u", memory_type_bits, required, preferred, preferred_not);
    return (uint32_t)-1;
}

bool VulkanDevice::is_mappable(uint32_t memory_type_index) const
{
    const VkMemoryType& memoryType = info.physicalDeviceMemoryProperties().memoryTypes[memory_type_index];

    return memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
}

bool VulkanDevice::is_coherent(uint32_t memory_type_index) const
{
    const VkMemoryType& memoryType = info.physicalDeviceMemoryProperties().memoryTypes[memory_type_index];

    return memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
}

bool VulkanDevice::is_device_local(uint32_t memory_type_index) const
{
    const VkMemoryType& memoryType = info.physicalDeviceMemoryProperties().memoryTypes[memory_type_index];

    return memoryType.propertyFlags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
}

// Take one free queue of the given family out of the pool.
// Only the compute and the transfer queue family are accepted, any other index logs an error and returns 0.
// Blocks on the family's condition variable while the free queue counter is 0.
// The returned queue must be handed back with reclaim_queue().
VkQueue VulkanDevice::acquire_queue(uint32_t queue_family_index) const
{
    const bool use_compute = queue_family_index == info.compute_queue_family_index();

    if (!use_compute && queue_family_index != info.transfer_queue_family_index())
    {
        NCNN_LOGE("invalid queue_family_index %u", queue_family_index);
        return 0;
    }

    // select the per-family bookkeeping
    Mutex& family_lock = use_compute ? d->compute_queue_lock : d->transfer_queue_lock;
    ConditionVariable& family_condition = use_compute ? d->compute_queue_condition : d->transfer_queue_condition;
    int& free_count = use_compute ? d->free_compute_queue_count : d->free_transfer_queue_count;
    std::vector<VkQueue>& pool = use_compute ? d->compute_queues : d->transfer_queues;

    family_lock.lock();

    // no free queues, wait for reclaims from other threads
    while (free_count == 0)
    {
        family_condition.wait(family_lock);
    }

    // a slot holding 0 means that queue is currently handed out
    VkQueue acquired = 0;
    for (size_t slot = 0; slot < pool.size(); slot++)
    {
        if (!pool[slot])
            continue;

        acquired = pool[slot];
        pool[slot] = 0;
        break;
    }

    if (!acquired)
    {
        NCNN_LOGE("FATAL ERROR! out of hardware queue %u", queue_family_index);
    }

    free_count -= 1;

    family_lock.unlock();

    family_condition.signal();

    return acquired;
}

// Hand a queue obtained from acquire_queue() back to the pool of its family.
// Only the compute and the transfer queue family are accepted, any other index logs an error and returns.
// The queue is stored into the first empty pool slot and the free queue counter is raised.
// When no empty slot exists (the queue did not come from this pool) an error is logged
// and the counter is left untouched, so it never exceeds the number of queues actually available.
void VulkanDevice::reclaim_queue(uint32_t queue_family_index, VkQueue queue) const
{
    if (queue_family_index != info.compute_queue_family_index() && queue_family_index != info.transfer_queue_family_index())
    {
        NCNN_LOGE("invalid queue_family_index %u", queue_family_index);
        return;
    }

    Mutex& queue_lock = queue_family_index == info.compute_queue_family_index() ? d->compute_queue_lock : d->transfer_queue_lock;

    queue_lock.lock();

    ConditionVariable& queue_condition = queue_family_index == info.compute_queue_family_index() ? d->compute_queue_condition : d->transfer_queue_condition;

    int& free_queue_count = queue_family_index == info.compute_queue_family_index() ? d->free_compute_queue_count : d->free_transfer_queue_count;

    std::vector<VkQueue>& queues = queue_family_index == info.compute_queue_family_index() ? d->compute_queues : d->transfer_queues;

    // a slot holding 0 is a queue that is currently handed out
    bool stored = false;
    for (size_t i = 0; i < queues.size(); i++)
    {
        if (!queues[i])
        {
            queues[i] = queue;
            stored = true;
            break;
        }
    }

    if (stored)
    {
        free_queue_count += 1;
    }
    else
    {
        // do not count a queue that could not be put back,
        // otherwise acquire_queue() would later find no queue despite a positive counter
        NCNN_LOGE("FATAL ERROR! reclaim_queue get wild queue %u %p", queue_family_index, queue);
    }

    queue_lock.unlock();

    queue_condition.signal();
}

// Borrow a blob allocator from the device pool, guarded by blob_allocator_lock.
// A pool slot holding 0 means that allocator is currently borrowed.
// When every pooled allocator is in use a new VkBlobAllocator is created and the pool grows by one slot.
// Return the allocator with reclaim_blob_allocator().
VkAllocator* VulkanDevice::acquire_blob_allocator() const
{
    MutexLockGuard guard(d->blob_allocator_lock);

    const int pool_size = (int)d->blob_allocators.size();
    for (int slot = 0; slot < pool_size; slot++)
    {
        VkAllocator* pooled = d->blob_allocators[slot];
        if (!pooled)
            continue;

        // mark the slot as borrowed
        d->blob_allocators[slot] = 0;
        return pooled;
    }

    // pre-allocated allocator exhausted, create new
    VkAllocator* created = new VkBlobAllocator(this);

    // grow the pool, the new slot starts out as borrowed
    d->blob_allocators.push_back(created);
    d->blob_allocators[d->blob_allocators.size() - 1] = 0;

    return created;
}

// Give a blob allocator back to the device pool, guarded by blob_allocator_lock.
// The allocator is stored into the first empty (0) slot.
// When the pool has no empty slot an error is logged and the allocator is not stored.
void VulkanDevice::reclaim_blob_allocator(VkAllocator* allocator) const
{
    MutexLockGuard guard(d->blob_allocator_lock);

    const int pool_size = (int)d->blob_allocators.size();

    // look for the first borrowed slot
    int slot = 0;
    while (slot < pool_size && d->blob_allocators[slot])
    {
        slot++;
    }

    if (slot < pool_size)
    {
        d->blob_allocators[slot] = allocator;
        return;
    }

    NCNN_LOGE("FATAL ERROR! reclaim_blob_allocator get wild allocator %p", allocator);
}

// Borrow a staging allocator from the device pool, guarded by staging_allocator_lock.
// A pool slot holding 0 means that allocator is currently borrowed.
// When every pooled allocator is in use a new VkStagingAllocator is created and the pool grows by one slot.
// Return the allocator with reclaim_staging_allocator().
VkAllocator* VulkanDevice::acquire_staging_allocator() const
{
    MutexLockGuard guard(d->staging_allocator_lock);

    const int pool_size = (int)d->staging_allocators.size();
    for (int slot = 0; slot < pool_size; slot++)
    {
        VkAllocator* pooled = d->staging_allocators[slot];
        if (!pooled)
            continue;

        // mark the slot as borrowed
        d->staging_allocators[slot] = 0;
        return pooled;
    }

    // pre-allocated allocator exhausted, create new
    VkAllocator* created = new VkStagingAllocator(this);

    // grow the pool, the new slot starts out as borrowed
    d->staging_allocators.push_back(created);
    d->staging_allocators[d->staging_allocators.size() - 1] = 0;

    return created;
}

// Give a staging allocator back to the device pool, guarded by staging_allocator_lock.
// The allocator is stored into the first empty (0) slot.
// When the pool has no empty slot an error is logged and the allocator is not stored.
void VulkanDevice::reclaim_staging_allocator(VkAllocator* allocator) const
{
    MutexLockGuard guard(d->staging_allocator_lock);

    const int pool_size = (int)d->staging_allocators.size();

    // look for the first borrowed slot
    int slot = 0;
    while (slot < pool_size && d->staging_allocators[slot])
    {
        slot++;
    }

    if (slot < pool_size)
    {
        d->staging_allocators[slot] = allocator;
        return;
    }

    NCNN_LOGE("FATAL ERROR! reclaim_staging_allocator get wild allocator %p", allocator);
}

// Pointer to the device-owned texelfetch sampler handle.
const VkSampler* VulkanDevice::immutable_texelfetch_sampler() const
{
    const VkSampler* sampler = &d->texelfetch_sampler;
    return sampler;
}

// Copy of the device-owned dummy buffer.
VkMat VulkanDevice::get_dummy_buffer() const
{
    const VkMat& dummy = d->dummy_buffer;
    return dummy;
}

// Copy of the device-owned dummy image.
VkImageMat VulkanDevice::get_dummy_image() const
{
    const VkImageMat& dummy = d->dummy_image;
    return dummy;
}

// Copy of the device-owned read-only dummy image.
// On Apple platforms the regular dummy image is returned instead when info.type() is not 0.
VkImageMat VulkanDevice::get_dummy_image_readonly() const
{
#if __APPLE__
    const bool use_regular_image = info.type() != 0;
    if (use_regular_image)
    {
        return d->dummy_image;
    }
#endif

    return d->dummy_image_readonly;
}

// Pipeline cache held by this device.
const PipelineCache* VulkanDevice::get_pipeline_cache() const
{
    const PipelineCache* cache = d->pipeline_cache;
    return cache;
}

// Check whether a blob of the given shape fits into the image dimension limits of this device.
// shape.dims selects the limit: 1 -> 1d limit, 2 -> 2d limit, anything else -> 3d limit
// (w, h and c are checked against it).
bool VulkanDevice::shape_support_image_storage(const Mat& shape) const
{
    // large elempack spills on image w
    int image_w = shape.w;
    switch (shape.elempack)
    {
    case 8:
        image_w *= 2;
        break;
    case 16:
        image_w *= 4;
        break;
    case 32:
        image_w *= 8;
        break;
    case 64:
        image_w *= 16;
        break;
    default:
        break;
    }

    if (shape.dims == 1)
    {
        return image_w <= (int)info.max_image_dimension_1d();
    }

    if (shape.dims == 2)
    {
        const int limit_2d = (int)info.max_image_dimension_2d();
        return image_w <= limit_2d && shape.h <= limit_2d;
    }

    // dims == 3
    const int limit_3d = (int)info.max_image_dimension_3d();
    return image_w <= limit_3d && shape.h <= limit_3d && shape.c <= limit_3d;
}

uint32_t VulkanDevice::get_heap_budget() const
{
    const VkPhysicalDeviceMemoryProperties& memory_properties = info.physicalDeviceMemoryProperties();

    uint32_t buffer_memory_type_index = d->dummy_allocator->buffer_memory_type_index;
    uint32_t buffer_heap_index = memory_properties.memoryTypes[buffer_memory_type_index].heapIndex;

    if (!info.support_VK_EXT_memory_budget())
    {
        //         NCNN_LOGE("heap budget from assumption\n");
        uint32_t device_local_heap_size = memory_properties.memoryHeaps[buffer_heap_index].size / 1024 / 1024;

        // we usually cannot use all heap
        // 70% for 4G+
        // 50% for 4G-
        return device_local_heap_size >= 4000 ? device_local_heap_size * 0.7 : device_local_heap_size * 0.5;
    }

    VkPhysicalDeviceMemoryBudgetPropertiesEXT memoryBudgetProperties;
    memoryBudgetProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_BUDGET_PROPERTIES_EXT;
    memoryBudgetProperties.pNext = 0;

    VkPhysicalDeviceMemoryProperties2KHR memoryProperties;
    memoryProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PROPERTIES_2_KHR;
    memoryProperties.pNext = &memoryBudgetProperties;

    vkGetPhysicalDeviceMemoryProperties2KHR(info.physicalDevice(), &memoryProperties);

    return memoryBudgetProperties.heapBudget[buffer_heap_index] / 1024 / 1024;
}

void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const
{
    convert_packing(src, dst, dst_elempack, 0, cmd, opt);
}

// Repack src into dst with dst_elempack, optionally casting the storage type on the way.
//
// cast_type_to  0 keeps the storage type of src, otherwise (cast_type_to - 1) is the target type index
//
// Type indices as used by the checks below: 0 fp32, 1 fp16, 2 int32, 3 int8, 4 bf16.
// The source index is derived from src.elembits(): 32 -> 0, 16 -> 4 when opt enables bf16
// storage or bf16 packed and 1 otherwise, anything else -> 3.
//
// Unsupported conversions (float <-> integer, fp16 <-> bf16) log an error and leave dst untouched.
void VulkanDevice::convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, int cast_type_to, VkCompute& cmd, const Option& opt) const
{
    // 0 for elempack 1, 1 for elempack 4, 2 for everything else
    int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 1 : 2;

    int cast_type_from_index;
    if (src.elembits() == 32)
    {
        cast_type_from_index = 0;
    }
    else if (src.elembits() == 16)
    {
        if (opt.use_bf16_storage || opt.use_bf16_packed)
            cast_type_from_index = 4;
        else
            cast_type_from_index = 1;
    }
    else // if (src.elembits() == 8)
    {
        cast_type_from_index = 3;
    }

    int cast_type_to_index = cast_type_to ? cast_type_to - 1 : cast_type_from_index;

    // NCNN_LOGE("convert_packing b2b %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);

    if ((cast_type_from_index == 0 || cast_type_from_index == 1 || cast_type_from_index == 4) && (cast_type_to_index == 2 || cast_type_to_index == 3))
    {
        NCNN_LOGE("convert_packing from fp32/fp16/bf16 to int32/int8 is not supported");
        return;
    }
    if ((cast_type_from_index == 2 || cast_type_from_index == 3) && (cast_type_to_index == 0 || cast_type_to_index == 1 || cast_type_to_index == 4))
    {
        NCNN_LOGE("convert_packing from int32/int8 to fp32/fp16/bf16 is not supported");
        return;
    }
    if (cast_type_from_index == 1 && cast_type_to_index == 4)
    {
        NCNN_LOGE("convert_packing from fp16 to bf16 is not supported");
        return;
    }
    if (cast_type_from_index == 4 && cast_type_to_index == 1)
    {
        NCNN_LOGE("convert_packing from bf16 to fp16 is not supported");
        return;
    }

    // the utility operator only sees the storage options that match the involved types
    Option opt2 = opt;
    opt2.use_fp16_packed = (cast_type_from_index == 1 || cast_type_to_index == 1);
    opt2.use_fp16_storage = (cast_type_from_index == 1 || cast_type_to_index == 1) && info.support_fp16_storage();
    opt2.use_int8_packed = (cast_type_from_index == 3 || cast_type_to_index == 3);
    opt2.use_int8_storage = (cast_type_from_index == 3 || cast_type_to_index == 3) && info.support_int8_storage();
    opt2.use_bf16_packed = (cast_type_from_index == 4 || cast_type_to_index == 4);
    opt2.use_bf16_storage = (cast_type_from_index == 4 || cast_type_to_index == 4) && info.support_bf16_storage();

    const ncnn::Layer* uop = d->get_utility_operator(cast_type_from_index, cast_type_to_index, packing_type_to_index);
    if (!uop)
    {
        // NOTE(review): guards against a missing operator instead of dereferencing null,
        // confirm whether get_utility_operator can actually return 0
        NCNN_LOGE("convert_packing utility operator is not available %d %d %d", cast_type_from_index, cast_type_to_index, packing_type_to_index);
        return;
    }

    uop->forward(src, dst, cmd, opt2);
}

int VulkanDevice::init_device_extension()
{
    if (info.support_VK_KHR_bind_memory2())
    {
        vkBindBufferMemory2KHR = (PFN_vkBindBufferMemory2KHR)vkGetDeviceProcAddr(d->device, "vkBindBufferMemory2KHR");
        vkBindImageMemory2KHR = (PFN_vkBindImageMemory2KHR)vkGetDeviceProcAddr(d->device, "vkBindImageMemory2KHR");
    }

    if (info.support_VK_KHR_buffer_device_address())
    {
        vkGetBufferDeviceAddressKHR = (PFN_vkGetBufferDeviceAddressKHR)vkGetDeviceProcAddr(d->device, "vkGetBufferDeviceAddressKHR");
        vkGetBufferOpaqueCaptureAddressKHR = (PFN_vkGetBufferOpaqueCaptureAddressKHR)vkGetDeviceProcAddr(d->device, "vkGetBufferOpaqueCaptureAddressKHR");
        vkGetDeviceMemoryOpaqueCaptureAddressKHR = (PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR)vkGetDeviceProcAddr(d->device, "vkGetDeviceMemoryOpaqueCaptureAddressKHR");
    }

    if (info.support_VK_KHR_descriptor_update_template())
    {
        vkCreateDescriptorUpdateTemplateKHR = (PFN_vkCreateDescriptorUpdateTemplateKHR)vkGetDeviceProcAddr(d->device, "vkCreateDescriptorUpdateTemplateKHR");
        vkDestroyDescriptorUpdateTemplateKHR = (PFN_vkDestroyDescriptorUpdateTemplateKHR)vkGetDeviceProcAddr(d->device, "vkDestroyDescriptorUpdateTemplateKHR");
        vkUpdateDescriptorSetWithTemplateKHR = (PFN_vkUpdateDescriptorSetWithTemplateKHR)vkGetDeviceProcAddr(d->device, "vkUpdateDescriptorSetWithTemplateKHR");
    }

    if (info.support_VK_KHR_get_memory_requirements2())
    {
        vkGetImageMemoryRequirements2KHR = (PFN_vkGetImageMemoryRequirements2KHR)vkGetDeviceProcAddr(d->device, "vkGetImageMemoryRequirements2KHR");
        vkGetBufferMemoryRequirements2KHR = (PFN_vkGetBufferMemoryRequirements2KHR)vkGetDeviceProcAddr(d->device, "vkGetBufferMemoryRequirements2KHR");
    }

    if (info.support_VK_KHR_maintenance1())
    {
        vkTrimCommandPoolKHR = (PFN_vkTrimCommandPoolKHR)vkGetDeviceProcAddr(d->device, "vkTrimCommandPoolKHR");
    }

    if (info.support_VK_KHR_maintenance3())
    {
        vkGetDescriptorSetLayoutSupportKHR = (PFN_vkGetDescriptorSetLayoutSupportKHR)vkGetDeviceProcAddr(d->device, "vkGetDescriptorSetLayoutSupportKHR");
    }

    if (info.support_VK_KHR_push_descriptor())
    {
        if (info.support_VK_KHR_descriptor_update_template())
        {
            vkCmdPushDescriptorSetWithTemplateKHR = (PFN_vkCmdPushDescriptorSetWithTemplateKHR)vkGetDeviceProcAddr(d->device, "vkCmdPushDescriptorSetWithTemplateKHR");
        }

        vkCmdPushDescriptorSetKHR = (PFN_vkCmdPushDescriptorSetKHR)vkGetDeviceProcAddr(d->device, "vkCmdPushDescriptorSetKHR");
    }

    if (info.support_VK_KHR_sampler_ycbcr_conversion())
    {
        vkCreateSamplerYcbcrConversionKHR = (PFN_vkCreateSamplerYcbcrConversionKHR)vkGetDeviceProcAddr(d->device, "vkCreateSamplerYcbcrConversionKHR");
        vkDestroySamplerYcbcrConversionKHR = (PFN_vkDestroySamplerYcbcrConversionKHR)vkGetDeviceProcAddr(d->device, "vkDestroySamplerYcbcrConversionKHR");
    }

    if (info.support_VK_KHR_swapchain())
    {
        vkCreateSwapchainKHR = (PFN_vkCreateSwapchainKHR)vkGetDeviceProcAddr(d->device, "vkCreateSwapchainKHR");
        vkDestroySwapchainKHR = (PFN_vkDestroySwapchainKHR)vkGetDeviceProcAddr(d->device, "vkDestroySwapchainKHR");
        vkGetSwapchainImagesKHR = (PFN_vkGetSwapchainImagesKHR)vkGetDeviceProcAddr(d->device, "vkGetSwapchainImagesKHR");
        vkAcquireNextImageKHR = (PFN_vkAcquireNextImageKHR)vkGetDeviceProcAddr(d->device, "vkAcquireNextImageKHR");
        vkQueuePresentKHR = (PFN_vkQueuePresentKHR)vkGetDeviceProcAddr(d->device, "vkQueuePresentKHR");
    }

    if (info.support_VK_EXT_buffer_device_address())
    {
        vkGetBufferDeviceAddressEXT = (PFN_vkGetBufferDeviceAddressEXT)vkGetDeviceProcAddr(d->device, "vkGetBufferDeviceAddressEXT");
    }

    if (info.support_VK_EXT_external_memory_host())
    {
        vkGetMemoryHostPointerPropertiesEXT = (PFN_vkGetMemoryHostPointerPropertiesEXT)vkGetDeviceProcAddr(d->device, "vkGetMemoryHostPointerPropertiesEXT");
    }

#if __ANDROID_API__ >= 26
    if (info.support_VK_ANDROID_external_memory_android_hardware_buffer())
    {
        vkGetAndroidHardwareBufferPropertiesANDROID = (PFN_vkGetAndroidHardwareBufferPropertiesANDROID)vkGetDeviceProcAddr(d->device, "vkGetAndroidHardwareBufferPropertiesANDROID");
        vkGetMemoryAndroidHardwareBufferANDROID = (PFN_vkGetMemoryAndroidHardwareBufferANDROID)vkGetDeviceProcAddr(d->device, "vkGetMemoryAndroidHardwareBufferANDROID");
    }
#endif // __ANDROID_API__ >= 26

    if (info.support_VK_NV_cooperative_vector())
    {
        vkCmdConvertCooperativeVectorMatrixNV = (PFN_vkCmdConvertCooperativeVectorMatrixNV)vkGetDeviceProcAddr(d->device, "vkCmdConvertCooperativeVectorMatrixNV");
        vkConvertCooperativeVectorMatrixNV = (PFN_vkConvertCooperativeVectorMatrixNV)vkGetDeviceProcAddr(d->device, "vkConvertCooperativeVectorMatrixNV");
    }

    return 0;
}

// Shared VulkanDevice for the given gpu index, created on first use.
// Makes sure the gpu instance exists first, returns 0 when device_index is out of range.
// Creation and lookup of the cached device happen under g_default_vkdev_lock.
VulkanDevice* get_gpu_device(int device_index)
{
    try_create_gpu_instance();

    const bool index_in_range = device_index >= 0 && device_index < g_gpu_count;
    if (!index_in_range)
        return 0;

    MutexLockGuard guard(g_default_vkdev_lock);

    VulkanDevice* vkdev = g_default_vkdev[device_index];
    if (!vkdev)
    {
        vkdev = new VulkanDevice(device_index);
        g_default_vkdev[device_index] = vkdev;
    }

    return vkdev;
}

// Default glslang resource limits used when compiling shaders.
// The structure is zero-initialized first, so limits that are not assigned below
// (for example members that only exist in newer glslang versions) are 0 instead of indeterminate.
static TBuiltInResource get_default_TBuiltInResource()
{
    TBuiltInResource resource = TBuiltInResource();

    resource.maxLights = 32;
    resource.maxClipPlanes = 6;
    resource.maxTextureUnits = 32;
    resource.maxTextureCoords = 32;
    resource.maxVertexAttribs = 64;
    resource.maxVertexUniformComponents = 4096;
    resource.maxVaryingFloats = 64;
    resource.maxVertexTextureImageUnits = 32;
    resource.maxCombinedTextureImageUnits = 80;
    resource.maxTextureImageUnits = 32;
    resource.maxFragmentUniformComponents = 4096;
    resource.maxDrawBuffers = 32;
    resource.maxVertexUniformVectors = 128;
    resource.maxVaryingVectors = 8;
    resource.maxFragmentUniformVectors = 16;
    resource.maxVertexOutputVectors = 16;
    resource.maxFragmentInputVectors = 15;
    resource.minProgramTexelOffset = -8;
    resource.maxProgramTexelOffset = 7;
    resource.maxClipDistances = 8;
    resource.maxComputeWorkGroupCountX = 65535;
    resource.maxComputeWorkGroupCountY = 65535;
    resource.maxComputeWorkGroupCountZ = 65535;
    resource.maxComputeWorkGroupSizeX = 1024;
    resource.maxComputeWorkGroupSizeY = 1024;
    resource.maxComputeWorkGroupSizeZ = 64;
    resource.maxComputeUniformComponents = 1024;
    resource.maxComputeTextureImageUnits = 16;
    resource.maxComputeImageUniforms = 8;
    resource.maxComputeAtomicCounters = 8;
    resource.maxComputeAtomicCounterBuffers = 1;
    resource.maxVaryingComponents = 60;
    resource.maxVertexOutputComponents = 64;
    resource.maxGeometryInputComponents = 64;
    resource.maxGeometryOutputComponents = 128;
    resource.maxFragmentInputComponents = 128;
    resource.maxImageUnits = 8;
    resource.maxCombinedImageUnitsAndFragmentOutputs = 8;
    resource.maxCombinedShaderOutputResources = 8;
    resource.maxImageSamples = 0;
    resource.maxVertexImageUniforms = 0;
    resource.maxTessControlImageUniforms = 0;
    resource.maxTessEvaluationImageUniforms = 0;
    resource.maxGeometryImageUniforms = 0;
    resource.maxFragmentImageUniforms = 8;
    resource.maxCombinedImageUniforms = 8;
    resource.maxGeometryTextureImageUnits = 16;
    resource.maxGeometryOutputVertices = 256;
    resource.maxGeometryTotalOutputComponents = 1024;
    resource.maxGeometryUniformComponents = 1024;
    resource.maxGeometryVaryingComponents = 64;
    resource.maxTessControlInputComponents = 128;
    resource.maxTessControlOutputComponents = 128;
    resource.maxTessControlTextureImageUnits = 16;
    resource.maxTessControlUniformComponents = 1024;
    resource.maxTessControlTotalOutputComponents = 4096;
    resource.maxTessEvaluationInputComponents = 128;
    resource.maxTessEvaluationOutputComponents = 128;
    resource.maxTessEvaluationTextureImageUnits = 16;
    resource.maxTessEvaluationUniformComponents = 1024;
    resource.maxTessPatchComponents = 120;
    resource.maxPatchVertices = 32;
    resource.maxTessGenLevel = 64;
    resource.maxViewports = 16;
    resource.maxVertexAtomicCounters = 0;
    resource.maxTessControlAtomicCounters = 0;
    resource.maxTessEvaluationAtomicCounters = 0;
    resource.maxGeometryAtomicCounters = 0;
    resource.maxFragmentAtomicCounters = 8;
    resource.maxCombinedAtomicCounters = 8;
    resource.maxAtomicCounterBindings = 1;
    resource.maxVertexAtomicCounterBuffers = 0;
    resource.maxTessControlAtomicCounterBuffers = 0;
    resource.maxTessEvaluationAtomicCounterBuffers = 0;
    resource.maxGeometryAtomicCounterBuffers = 0;
    resource.maxFragmentAtomicCounterBuffers = 1;
    resource.maxCombinedAtomicCounterBuffers = 1;
    resource.maxAtomicCounterBufferSize = 16384;
    resource.maxTransformFeedbackBuffers = 4;
    resource.maxTransformFeedbackInterleavedComponents = 64;
    resource.maxCullDistances = 8;
    resource.maxCombinedClipAndCullDistances = 8;
    resource.maxSamples = 4;
    resource.maxMeshOutputVerticesNV = 256;
    resource.maxMeshOutputPrimitivesNV = 512;
    resource.maxMeshWorkGroupSizeX_NV = 32;
    resource.maxMeshWorkGroupSizeY_NV = 1;
    resource.maxMeshWorkGroupSizeZ_NV = 1;
    resource.maxTaskWorkGroupSizeX_NV = 32;
    resource.maxTaskWorkGroupSizeY_NV = 1;
    resource.maxTaskWorkGroupSizeZ_NV = 1;
    resource.maxMeshViewCountNV = 4;

    // TODO compile-time glslang version check
    // resource.maxDualSourceDrawBuffersEXT = 1;

    resource.limits.nonInductiveForLoops = 1;
    resource.limits.whileLoops = 1;
    resource.limits.doWhileLoops = 1;
    resource.limits.generalUniformIndexing = 1;
    resource.limits.generalAttributeMatrixVectorIndexing = 1;
    resource.limits.generalVaryingIndexing = 1;
    resource.limits.generalSamplerIndexing = 1;
    resource.limits.generalVariableIndexing = 1;
    resource.limits.generalConstantMatrixVectorIndexing = 1;

    return resource;
}

// glslang include handler that serves the embedded shader headers.
// Only the local include "vulkan_activation.comp" is known, it resolves to vulkan_activation_comp_data.
class VulkanShaderIncluder : public glslang::TShader::Includer
{
public:
    // Resolve a local include, returns 0 for any header other than "vulkan_activation.comp".
    // The returned result is heap allocated and is freed by releaseInclude().
    virtual glslang::TShader::Includer::IncludeResult* includeLocal(const char* headerName, const char* /*includerName*/, size_t /*inclusionDepth*/)
    {
        typedef glslang::TShader::Includer::IncludeResult IncludeResult;

        if (strcmp(headerName, "vulkan_activation.comp") != 0)
            return 0;

        // the result refers to the embedded data, no copy is made here
        return new IncludeResult(headerName, vulkan_activation_comp_data, sizeof(vulkan_activation_comp_data), 0);
    }

    // Free a result previously returned by includeLocal().
    virtual void releaseInclude(glslang::TShader::Includer::IncludeResult* r)
    {
        delete r;
    }
};

// Ordered list of (key, value) definitions where the value keeps track of its own type.
class DefinitionCollector
{
public:
    // Record one definition, the value type is picked by the matching typed_value constructor.
    // Neither key nor a string value is copied, only the pointers are stored.
    template<typename T>
    void append(const char* key, T def)
    {
        const typed_value value(def);
        definitions.push_back(std::pair<const char*, typed_value>(key, value));
    }

public:
    // Tagged value, `type` tells which union member is valid:
    //   0 -> s, 1 -> u8, 2 -> u32, 3 -> i32, 4 -> u64, 5 -> f32
    struct typed_value
    {
        typed_value(const char* _s)
        {
            type = 0;
            s = _s;
        }
        typed_value(uint8_t _u8)
        {
            type = 1;
            u8 = _u8;
        }
        typed_value(uint32_t _u32)
        {
            type = 2;
            u32 = _u32;
        }
        typed_value(int32_t _i32)
        {
            type = 3;
            i32 = _i32;
        }
        typed_value(uint64_t _u64)
        {
            type = 4;
            u64 = _u64;
        }
        typed_value(float _f32)
        {
            type = 5;
            f32 = _f32;
        }

        int type;
        union
        {
            const char* s;
            uint8_t u8;
            uint32_t u32;
            int32_t i32;
            uint64_t u64;
            float f32;
        };
    };

    // definitions in insertion order
    std::vector<std::pair<const char*, typed_value> > definitions;
};

// Compile a null-terminated GLSL compute shader source into SPIR-V.
// Forwards to the sized overload with the exact source length and returns its result.
int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv)
{
    // strlen() does not count the tail '\0', so this is exactly the number of source bytes
    // subtracting one more would cut off the last character of the shader
    const int length = (int)strlen(comp_string);
    return compile_spirv_module(comp_string, length, opt, spirv);
}

int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv)
{
    DefinitionCollector custom_defines;
    DefinitionCollector device_defines;

    int device_index = opt.vulkan_device_index;
    if (device_index < 0 || device_index >= get_gpu_count())
        device_index = get_default_gpu_index();

    const GpuInfo& info = get_gpu_info(device_index);
    const bool support_fp16_storage = info.support_fp16_storage();
    const bool support_fp16_uniform = info.support_fp16_uniform();

    if (opt.use_bf16_storage)
    {
        custom_defines.append("sfp", "bfloat16_t");
        custom_defines.append("sfpvec2", "bf16vec2");
        custom_defines.append("sfpvec4", "bf16vec4");

        // define pack and unpack macro for bf16s
        custom_defines.append("unpackBFloat2x16(v)", "vec2(uintBitsToBFloat16EXT(unpackUint2x16(v)))");
        custom_defines.append("packBFloat2x16(v)", "packUint2x16(bfloat16BitsToUintEXT(bf16vec2(v)))");
    }
    else if (opt.use_bf16_packed)
    {
        if (support_fp16_storage)
        {
            custom_defines.append("sfp", "uint16_t");
        }
        else
        {
            custom_defines.append("sfp", "uint");
        }
        custom_defines.append("sfpvec2", "uint");
        custom_defines.append("sfpvec4", "uvec2");

        // define pack and unpack macro for bf16p
        custom_defines.append("unpackBFloat2x16(v)", "vec2(uintBitsToFloat(v<<16),uintBitsToFloat(v&0xffff0000u))");
        custom_defines.append("packBFloat2x16(v)", "uint((floatBitsToUint(v.x)>>16)|(floatBitsToUint(v.y)&0xffff0000u))");
    }
    else if (opt.use_fp16_storage)
    {
        custom_defines.append("sfp", "float16_t");
        custom_defines.append("sfpvec2", "f16vec2");
        custom_defines.append("sfpvec4", "f16vec4");

        if (opt.use_fp16_arithmetic)
        {
            custom_defines.append("sfpmat4", "f16mat4");
        }
    }
    else if (opt.use_fp16_packed)
    {
        custom_defines.append("sfp", "uint");
        custom_defines.append("sfpvec2", "uint");
        custom_defines.append("sfpvec4", "uvec2");
    }
    else
    {
        custom_defines.append("sfp", "float");
        custom_defines.append("sfpvec2", "vec2");
        custom_defines.append("sfpvec4", "vec4");
        custom_defines.append("sfpmat4", "mat4");
    }

    if (opt.use_bf16_storage || opt.use_bf16_packed)
    {
        // bf16 conflicts with fp16a
        custom_defines.append("afp", "float");
        custom_defines.append("afpvec2", "vec2");
        custom_defines.append("afpvec4", "vec4");
        custom_defines.append("afpmat4", "mat4");
    }
    else if (opt.use_fp16_arithmetic)
    {
        custom_defines.append("afp", "float16_t");
        custom_defines.append("afpvec2", "f16vec2");
        custom_defines.append("afpvec4", "f16vec4");
        custom_defines.append("afpmat4", "f16mat4");
    }
    else
    {
        custom_defines.append("afp", "float");
        custom_defines.append("afpvec2", "vec2");
        custom_defines.append("afpvec4", "vec4");
        custom_defines.append("afpmat4", "mat4");
    }

    if (opt.use_bf16_storage)
    {
        // bf16s implies 16bit uniform
        custom_defines.append("lfp", "bfloat16_t");
        custom_defines.append("lfpvec4", "bf16vec4");
    }
    else if (opt.use_bf16_packed)
    {
        if (support_fp16_uniform)
        {
            custom_defines.append("lfp", "uint16_t");
        }
        else
        {
            custom_defines.append("lfp", "float");
        }
        custom_defines.append("lfpvec4", "uvec2");
    }
    else if (opt.use_fp16_storage && opt.use_fp16_uniform && opt.use_fp16_arithmetic)
    {
        custom_defines.append("lfp", "float16_t");
        custom_defines.append("lfpvec4", "f16vec4");
    }
    else if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
    {
        custom_defines.append("lfp", "float");
        custom_defines.append("lfpvec4", "uint64_t");
    }
    else if (opt.use_fp16_storage || opt.use_fp16_packed)
    {
        custom_defines.append("lfp", "float");
        custom_defines.append("lfpvec4", "uvec2");
    }
    else
    {
        custom_defines.append("lfp", "float");
        custom_defines.append("lfpvec4", "vec4");
    }

    if (opt.use_bf16_storage)
    {
        custom_defines.append("buffer_sm1(buf,i)", "buf[i]");
        custom_defines.append("buffer_sm4(buf,i)", "buf[i]");

        custom_defines.append("lfp2afp(v)", "float(v)");
        custom_defines.append("afp2lfp(v)", "bfloat16_t(v)");
        custom_defines.append("lfp2afpvec4(v)", "vec4(v)");
        custom_defines.append("afp2lfpvec4(v)", "bf16vec4(v)");
    }
    else if (opt.use_bf16_packed)
    {
        if (support_fp16_uniform)
        {
            custom_defines.append("buffer_sm1(buf,i)", "buf[i]");
        }
        else if (support_fp16_storage)
        {
            custom_defines.append("buffer_sm1(buf,i)", "uintBitsToFloat(uint(buf[i])<<16)");
        }
        else
        {
            custom_defines.append("buffer_sm1(buf,i)", "unpackBFloat2x16(buf[(i)/2])[(i)%2]");
        }
        custom_defines.append("buffer_sm4(buf,i)", "buf[i]");

        if (support_fp16_uniform)
        {
            custom_defines.append("lfp2afp(v)", "uintBitsToFloat(uint(v)<<16)");
            custom_defines.append("afp2lfp(v)", "uint16_t(floatBitsToUint(v)>>16)");
        }
        else
        {
            custom_defines.append("lfp2afp(v)", "v");
            custom_defines.append("afp2lfp(v)", "v");
        }
        custom_defines.append("lfp2afpvec4(v)", "vec4(unpackBFloat2x16(v.x),unpackBFloat2x16(v.y))");
        custom_defines.append("afp2lfpvec4(v)", "uvec2(packBFloat2x16(v.rg),packBFloat2x16(v.ba))");
    }
    else if (opt.use_fp16_storage && opt.use_fp16_uniform && opt.use_fp16_arithmetic)
    {
        custom_defines.append("buffer_sm1(buf,i)", "buf[i]");
        custom_defines.append("buffer_sm4(buf,i)", "buf[i]");

        custom_defines.append("lfp2afp(v)", "v");
        custom_defines.append("afp2lfp(v)", "v");
        custom_defines.append("lfp2afpvec4(v)", "v");
        custom_defines.append("afp2lfpvec4(v)", "v");
    }
    else if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
    {
        custom_defines.append("buffer_sm1(buf,i)", "float(buf[i])");
        custom_defines.append("buffer_sm4(buf,i)", "pack64(halfBitsToUint16(buf[i]))");

        custom_defines.append("lfp2afp(v)", "float16_t(v)");
        custom_defines.append("afp2lfp(v)", "float(v)");
        custom_defines.append("lfp2afpvec4(v)", "uint16BitsToHalf(unpack16(v))");
        custom_defines.append("afp2lfpvec4(v)", "pack64(halfBitsToUint16(v))");
    }
    else if (opt.use_fp16_packed && opt.use_fp16_arithmetic)
    {
        custom_defines.append("buffer_sm1(buf,i)", "unpackHalf2x16(buf[(i)/2])[(i)%2]");
        custom_defines.append("buffer_sm4(buf,i)", "buf[i]");

        custom_defines.append("lfp2afp(v)", "float16_t(v)");
        custom_defines.append("afp2lfp(v)", "float(v)");
        custom_defines.append("lfp2afpvec4(v)", "f16vec4(unpackFloat2x16(v.x),unpackFloat2x16(v.y))");
        custom_defines.append("afp2lfpvec4(v)", "uvec2(packFloat2x16(v.rg),packFloat2x16(v.ba))");
    }
    else if (opt.use_fp16_storage)
    {
        custom_defines.append("buffer_sm1(buf,i)", "float(buf[i])");
        custom_defines.append("buffer_sm4(buf,i)", "uvec2(packHalf2x16(vec4(buf[i]).rg),packHalf2x16(vec4(buf[i]).ba))");

        custom_defines.append("lfp2afp(v)", "v");
        custom_defines.append("afp2lfp(v)", "float(v)");
        custom_defines.append("lfp2afpvec4(v)", "vec4(unpackHalf2x16(v.x),unpackHalf2x16(v.y))");
        custom_defines.append("afp2lfpvec4(v)", "uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba))");
    }
    else if (opt.use_fp16_packed)
    {
        custom_defines.append("buffer_sm1(buf,i)", "unpackHalf2x16(buf[(i)/2])[(i)%2]");
        custom_defines.append("buffer_sm4(buf,i)", "buf[i]");

        custom_defines.append("lfp2afp(v)", "v");
        custom_defines.append("afp2lfp(v)", "v");
        custom_defines.append("lfp2afpvec4(v)", "vec4(unpackHalf2x16(v.x),unpackHalf2x16(v.y))");
        custom_defines.append("afp2lfpvec4(v)", "uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba))");
    }
    else
    {
        custom_defines.append("buffer_sm1(buf,i)", "buf[i]");
        custom_defines.append("buffer_sm4(buf,i)", "buf[i]");

        custom_defines.append("lfp2afp(v)", "v");
        custom_defines.append("afp2lfp(v)", "v");
        custom_defines.append("lfp2afpvec4(v)", "v");
        custom_defines.append("afp2lfpvec4(v)", "v");
    }

    if (opt.use_bf16_storage)
    {
        custom_defines.append("buffer_ld1(buf,i)", "float(buf[i])");
        custom_defines.append("buffer_st1(buf,i,v)", "{buf[i]=bfloat16_t(v);}");
        custom_defines.append("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
        custom_defines.append("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}");
        custom_defines.append("buffer_ld2(buf,i)", "vec2(buf[i])");
        custom_defines.append("buffer_st2(buf,i,v)", "{buf[i]=bf16vec2(v);}");
        custom_defines.append("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
        custom_defines.append("buffer_ld4(buf,i)", "vec4(buf[i])");
        custom_defines.append("buffer_st4(buf,i,v)", "{buf[i]=bf16vec4(v);}");
        custom_defines.append("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
        custom_defines.append("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}");
    }
    else if (opt.use_bf16_packed)
    {
        if (support_fp16_storage)
        {
            custom_defines.append("buffer_ld1(buf,i)", "uintBitsToFloat(uint(buf[i])<<16)");
            custom_defines.append("buffer_st1(buf,i,v)", "{buf[i]=uint16_t(floatBitsToUint(v)>>16);}");
            custom_defines.append("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");

            custom_defines.append("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=uvec2(pack32(u16vec2(sbuf[si4.r],sbuf[si4.g])),pack32(u16vec2(sbuf[si4.b],sbuf[si4.a])));}");
            custom_defines.append("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=unpack16(sbuf[si].x).x;buf[i4.g]=unpack16(sbuf[si].x).y;buf[i4.b]=unpack16(sbuf[si].y).x;buf[i4.a]=unpack16(sbuf[si].y).y;}");
        }
        else
        {
            custom_defines.append("buffer_ld1(buf,i)", "unpackBFloat2x16(buf[(i)/2])[(i)%2]");
            custom_defines.append("buffer_st1(buf,i,v)", "{uint _i=uint(i);uint _id2=_i/2;uint _im2=_i%2;float _vs=float(v);uint _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id2],0,0);vec2 _v=unpackBFloat2x16(_old_v);_v[_im2]=_vs;_new_v=packBFloat2x16(_v);} while(atomicCompSwap(buf[_id2],_old_v,_new_v)!=_old_v);}");
            custom_defines.append("buffer_cp1(buf,i,sbuf,si)", "{uint _i=uint(i);uint _id2=_i/2;uint _im2=_i%2;uint _si=uint(si);uint _sid2=_si/2;uint _sim2=_si%2;float v=unpackBFloat2x16(sbuf[_sid2])[_sim2];uint _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id2],0,0);vec2 _v=unpackBFloat2x16(_old_v);_v[_im2]=v;_new_v=packBFloat2x16(_v);} while(atomicCompSwap(buf[_id2],_old_v,_new_v)!=_old_v);}");

            custom_defines.append("buffer_cp1to4(buf,i,sbuf,si4)", "{uvec4 _si4d2=uvec4(si4)/2;uvec4 _si4m2=uvec4(si4)%2; buf[i]=uvec2(packBFloat2x16(vec2(unpackBFloat2x16(sbuf[_si4d2.r])[_si4m2.r],unpackBFloat2x16(sbuf[_si4d2.g])[_si4m2.g])),packBFloat2x16(vec2(unpackBFloat2x16(sbuf[_si4d2.b])[_si4m2.b],unpackBFloat2x16(sbuf[_si4d2.a])[_si4m2.a])));}");
            custom_defines.append("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si];vec2 _v0=unpackBFloat2x16(_v.x);vec2 _v1=unpackBFloat2x16(_v.y);buffer_st1(buf,i4.r,_v0.r);buffer_st1(buf,i4.g,_v0.g);buffer_st1(buf,i4.b,_v1.r);buffer_st1(buf,i4.a,_v1.g);}");
        }

        custom_defines.append("buffer_ld2(buf,i)", "unpackBFloat2x16(buf[i])");
        custom_defines.append("buffer_st2(buf,i,v)", "{buf[i]=packBFloat2x16(v);}");
        custom_defines.append("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
        custom_defines.append("buffer_ld4(buf,i)", "vec4(unpackBFloat2x16(buf[i].x),unpackBFloat2x16(buf[i].y))");
        custom_defines.append("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packBFloat2x16(v.rg),packBFloat2x16(v.ba));}");
        custom_defines.append("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
    }
    else if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
    {
        custom_defines.append("buffer_ld1(buf,i)", "buf[i]");
        custom_defines.append("buffer_st1(buf,i,v)", "{buf[i]=v;}");
        custom_defines.append("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
        custom_defines.append("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=f16vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}");
        custom_defines.append("buffer_ld2(buf,i)", "buf[i]");
        custom_defines.append("buffer_st2(buf,i,v)", "{buf[i]=v;}");
        custom_defines.append("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
        custom_defines.append("buffer_ld4(buf,i)", "buf[i]");
        custom_defines.append("buffer_st4(buf,i,v)", "{buf[i]=v;}");
        custom_defines.append("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
        custom_defines.append("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}");
        custom_defines.append("sfp2afpmat4(v)", "v");
        custom_defines.append("afp2sfpmat4(v)", "v");
    }
    else if (opt.use_fp16_packed && opt.use_fp16_arithmetic)
    {
        custom_defines.append("buffer_ld1(buf,i)", "float16_t(unpackHalf2x16(buf[(i)/2])[(i)%2])");
        custom_defines.append("buffer_st1(buf,i,v)", "{uint _i=uint(i);uint _id2=_i/2;uint _im2=_i%2;float _vs=float(v);uint _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id2],0,0);vec2 _v=unpackHalf2x16(_old_v);_v[_im2]=_vs;_new_v=packHalf2x16(_v);} while(atomicCompSwap(buf[_id2],_old_v,_new_v)!=_old_v);}");
        custom_defines.append("buffer_cp1(buf,i,sbuf,si)", "{uint _i=uint(i);uint _id2=_i/2;uint _im2=_i%2;uint _si=uint(si);uint _sid2=_si/2;uint _sim2=_si%2;float v=unpackHalf2x16(sbuf[_sid2])[_sim2];uint _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id2],0,0);vec2 _v=unpackHalf2x16(_old_v);_v[_im2]=v;_new_v=packHalf2x16(_v);} while(atomicCompSwap(buf[_id2],_old_v,_new_v)!=_old_v);}");

        custom_defines.append("buffer_cp1to4(buf,i,sbuf,si4)", "{uvec4 _si4d2=uvec4(si4)/2;uvec4 _si4m2=uvec4(si4)%2; buf[i]=uvec2(packHalf2x16(vec2(unpackHalf2x16(sbuf[_si4d2.r])[_si4m2.r],unpackHalf2x16(sbuf[_si4d2.g])[_si4m2.g])),packHalf2x16(vec2(unpackHalf2x16(sbuf[_si4d2.b])[_si4m2.b],unpackHalf2x16(sbuf[_si4d2.a])[_si4m2.a])));}");

        custom_defines.append("buffer_ld2(buf,i)", "unpackFloat2x16(buf[i])");
        custom_defines.append("buffer_st2(buf,i,v)", "{buf[i]=packFloat2x16(v)}");
        custom_defines.append("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
        custom_defines.append("buffer_ld4(buf,i)", "f16vec4(unpackFloat2x16(buf[i].x),unpackFloat2x16(buf[i].y))");
        custom_defines.append("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packFloat2x16(v.rg),packFloat2x16(v.ba));}");
        custom_defines.append("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");

        custom_defines.append("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si];vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y);buffer_st1(buf,i4.r,_v0.r);buffer_st1(buf,i4.g,_v0.g);buffer_st1(buf,i4.b,_v1.r);buffer_st1(buf,i4.a,_v1.g);}");
    }
    else if (opt.use_fp16_storage)
    {
        custom_defines.append("buffer_ld1(buf,i)", "float(buf[i])");
        custom_defines.append("buffer_st1(buf,i,v)", "{buf[i]=float16_t(v);}");
        custom_defines.append("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
        custom_defines.append("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i].r=sbuf[si4.r];buf[i].g=sbuf[si4.g];buf[i].b=sbuf[si4.b];buf[i].a=sbuf[si4.a];}");
        custom_defines.append("buffer_ld2(buf,i)", "vec2(buf[i])");
        custom_defines.append("buffer_st2(buf,i,v)", "{buf[i]=f16vec2(v);}");
        custom_defines.append("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
        custom_defines.append("buffer_ld4(buf,i)", "vec4(buf[i])");
        custom_defines.append("buffer_st4(buf,i,v)", "{buf[i]=f16vec4(v);}");
        custom_defines.append("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
        custom_defines.append("buffer_cp4to1(buf,i4,sbuf,si)", "{buf[i4.r]=sbuf[si].r;buf[i4.g]=sbuf[si].g;buf[i4.b]=sbuf[si].b;buf[i4.a]=sbuf[si].a;}");
    }
    else if (opt.use_fp16_packed)
    {
        custom_defines.append("buffer_ld1(buf,i)", "unpackHalf2x16(buf[(i)/2])[(i)%2]");
        custom_defines.append("buffer_st1(buf,i,v)", "{uint _i=uint(i);uint _id2=_i/2;uint _im2=_i%2;float _vs=float(v);uint _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id2],0,0);vec2 _v=unpackHalf2x16(_old_v);_v[_im2]=_vs;_new_v=packHalf2x16(_v);} while(atomicCompSwap(buf[_id2],_old_v,_new_v)!=_old_v);}");
        custom_defines.append("buffer_cp1(buf,i,sbuf,si)", "{uint _i=uint(i);uint _id2=_i/2;uint _im2=_i%2;uint _si=uint(si);uint _sid2=_si/2;uint _sim2=_si%2;float v=unpackHalf2x16(sbuf[_sid2])[_sim2];uint _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id2],0,0);vec2 _v=unpackHalf2x16(_old_v);_v[_im2]=v;_new_v=packHalf2x16(_v);} while(atomicCompSwap(buf[_id2],_old_v,_new_v)!=_old_v);}");

        custom_defines.append("buffer_cp1to4(buf,i,sbuf,si4)", "{uvec4 _si4d2=uvec4(si4)/2;uvec4 _si4m2=uvec4(si4)%2; buf[i]=uvec2(packHalf2x16(vec2(unpackHalf2x16(sbuf[_si4d2.r])[_si4m2.r],unpackHalf2x16(sbuf[_si4d2.g])[_si4m2.g])),packHalf2x16(vec2(unpackHalf2x16(sbuf[_si4d2.b])[_si4m2.b],unpackHalf2x16(sbuf[_si4d2.a])[_si4m2.a])));}");

        custom_defines.append("buffer_ld2(buf,i)", "unpackHalf2x16(buf[i])");
        custom_defines.append("buffer_st2(buf,i,v)", "{buf[i]=packHalf2x16(v);}");
        custom_defines.append("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
        custom_defines.append("buffer_ld4(buf,i)", "vec4(unpackHalf2x16(buf[i].x),unpackHalf2x16(buf[i].y))");
        custom_defines.append("buffer_st4(buf,i,v)", "{buf[i]=uvec2(packHalf2x16(v.rg),packHalf2x16(v.ba));}");
        custom_defines.append("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");

        custom_defines.append("buffer_cp4to1(buf,i4,sbuf,si)", "{uvec2 _v=sbuf[si];vec2 _v0=unpackHalf2x16(_v.x);vec2 _v1=unpackHalf2x16(_v.y);buffer_st1(buf,i4.r,_v0.r);buffer_st1(buf,i4.g,_v0.g);buffer_st1(buf,i4.b,_v1.r);buffer_st1(buf,i4.a,_v1.g);}");
    }
    else
    {
        custom_defines.append("buffer_ld1(buf,i)", "buf[i]");
        custom_defines.append("buffer_st1(buf,i,v)", "{buf[i]=v;}");
        custom_defines.append("buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
        custom_defines.append("buffer_cp1to4(buf,i,sbuf,si4)", "{buf[i]=vec4(sbuf[si4.r],sbuf[si4.g],sbuf[si4.b],sbuf[si4.a]);}");
        custom_defines.append("buffer_ld2(buf,i)", "buf[i]");
        custom_defines.append("buffer_st2(buf,i,v)", "{buf[i]=v;}");
        custom_defines.append("buffer_cp2(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
        custom_defines.append("buffer_ld4(buf,i)", "buf[i]");
        custom_defines.append("buffer_st4(buf,i,v)", "{buf[i]=v;}");
        custom_defines.append("buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
        custom_defines.append("buffer_cp4to1(buf,i4,sbuf,si)", "{vec4 _v=sbuf[si]; buf[i4.r]=_v.r;buf[i4.g]=_v.g;buf[i4.b]=_v.b;buf[i4.a]=_v.a;}");
        custom_defines.append("sfp2afpmat4(v)", "v");
        custom_defines.append("afp2sfpmat4(v)", "v");
    }

    if (opt.use_int8_storage)
    {
        custom_defines.append("sint8", "int8_t");
    }
    else if (opt.use_int8_packed)
    {
        custom_defines.append("sint8", "int");
    }
    else
    {
        custom_defines.append("sint8", "int");
    }

    custom_defines.append("sint8vec4", "int");

    custom_defines.append("aint8", "int");
    custom_defines.append("aint8vec4", "ivec4");

    custom_defines.append("unpackInt4x8(v)", "ivec4((v<<24)>>24,(v<<16)>>24,(v<<8)>>24,v>>24)");
    custom_defines.append("packInt4x8(v)", "int((uint(v.r)&0xFFu)|((uint(v.g)&0xFFu)<<8)|((uint(v.b)&0xFFu)<<16)|((uint(v.a)&0xFFu)<<24))");

    if (opt.use_int8_storage)
    {
        custom_defines.append("i8buffer_ld1(buf,i)", "int(buf[i])");
        custom_defines.append("i8buffer_st1(buf,i,v)", "{buf[i]=int8_t(v);}");
        custom_defines.append("i8buffer_cp1(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");
    }
    else
    {
        custom_defines.append("i8buffer_ld1(buf,i)", "int(((buf[(i)/4])<<(24-((i)%4)*8))>>24)");
        custom_defines.append("i8buffer_st1(buf,i,v)", "{uint _i=uint(i);uint _id4=_i/4;uint _im4=_i%4;int _vs=int(v);int _old_v, _new_v;do{_old_v=atomicCompSwap(buf[_id4],0,0);ivec4 _v=unpackInt4x8(_old_v);_v[_im4]=_vs;_new_v=packInt4x8(_v);} while(atomicCompSwap(buf[_id4],_old_v,_new_v)!=_old_v);}");
        custom_defines.append("i8buffer_cp1(buf,i,sbuf,si)", "{int _v=i8buffer_ld1(sbuf,si);i8buffer_st1(buf,i,_v);}");
    }

    custom_defines.append("i8buffer_ld4(buf,i)", "unpackInt4x8(buf[i])");
    custom_defines.append("i8buffer_st4(buf,i,v)", "{buf[i]=packInt4x8(v);}");
    custom_defines.append("i8buffer_cp4(buf,i,sbuf,si)", "{buf[i]=sbuf[si];}");

    custom_defines.append("psc(x)", "(x==0?p.x:x)");

    if (opt.use_bf16_storage)
    {
        custom_defines.append("NCNN_bf16_storage", 1);
    }
    else if (opt.use_bf16_packed)
    {
        custom_defines.append("NCNN_bf16_packed", 1);
    }
    else if (opt.use_fp16_storage)
    {
        custom_defines.append("NCNN_fp16_storage", 1);
    }
    else if (opt.use_fp16_packed)
    {
        custom_defines.append("NCNN_fp16_packed", 1);
    }

    if (opt.use_fp16_uniform)
    {
        custom_defines.append("NCNN_fp16_uniform", 1);
    }

    if (opt.use_fp16_arithmetic)
    {
        custom_defines.append("NCNN_fp16_arithmetic", 1);
    }

    if (opt.use_int8_storage)
    {
        custom_defines.append("NCNN_int8_storage", 1);
    }
    else if (opt.use_int8_packed)
    {
        custom_defines.append("NCNN_int8_packed", 1);
    }

    if (opt.use_int8_uniform)
    {
        custom_defines.append("NCNN_int8_uniform", 1);
    }

    if (opt.use_int8_arithmetic)
    {
        custom_defines.append("NCNN_int8_arithmetic", 1);
    }

    if (opt.use_shader_local_memory)
    {
        custom_defines.append("NCNN_shader_local_memory", 1);
    }

#if __APPLE__
    custom_defines.append("NCNN_moltenvk", 1);
#endif

    custom_defines.append("ncnn_glsl_version", 1);

    const bool support_shader_int64 = info.physicalDevicefeatures().shaderInt64;
    const bool support_shader_int16 = info.physicalDevicefeatures().shaderInt16;

    // fill device macros
    {
        // pull in device extensions
        {
            const std::vector<VkExtensionProperties>& properties = info.deviceExtensionProperties();

            for (size_t i = 0; i < properties.size(); i++)
            {
                const VkExtensionProperties& exp = properties[i];
                device_defines.append(exp.extensionName, exp.specVersion);
            }
        }

#define DD_APPEND_FEATURE(X) device_defines.append(#X, features.X ? 1 : 0);

        // pull in device features macros
        {
            const VkPhysicalDeviceFeatures& features = info.physicalDevicefeatures();
            DD_APPEND_FEATURE(robustBufferAccess)
            DD_APPEND_FEATURE(fullDrawIndexUint32)
            DD_APPEND_FEATURE(imageCubeArray)
            DD_APPEND_FEATURE(independentBlend)
            DD_APPEND_FEATURE(geometryShader)
            DD_APPEND_FEATURE(tessellationShader)
            DD_APPEND_FEATURE(sampleRateShading)
            DD_APPEND_FEATURE(dualSrcBlend)
            DD_APPEND_FEATURE(logicOp)
            DD_APPEND_FEATURE(multiDrawIndirect)
            DD_APPEND_FEATURE(drawIndirectFirstInstance)
            DD_APPEND_FEATURE(depthClamp)
            DD_APPEND_FEATURE(depthBiasClamp)
            DD_APPEND_FEATURE(fillModeNonSolid)
            DD_APPEND_FEATURE(depthBounds)
            DD_APPEND_FEATURE(wideLines)
            DD_APPEND_FEATURE(largePoints)
            DD_APPEND_FEATURE(alphaToOne)
            DD_APPEND_FEATURE(multiViewport)
            DD_APPEND_FEATURE(samplerAnisotropy)
            DD_APPEND_FEATURE(textureCompressionETC2)
            DD_APPEND_FEATURE(textureCompressionASTC_LDR)
            DD_APPEND_FEATURE(textureCompressionBC)
            DD_APPEND_FEATURE(occlusionQueryPrecise)
            DD_APPEND_FEATURE(pipelineStatisticsQuery)
            DD_APPEND_FEATURE(vertexPipelineStoresAndAtomics)
            DD_APPEND_FEATURE(fragmentStoresAndAtomics)
            DD_APPEND_FEATURE(shaderTessellationAndGeometryPointSize)
            DD_APPEND_FEATURE(shaderImageGatherExtended)
            DD_APPEND_FEATURE(shaderStorageImageExtendedFormats)
            DD_APPEND_FEATURE(shaderStorageImageMultisample)
            DD_APPEND_FEATURE(shaderStorageImageReadWithoutFormat)
            DD_APPEND_FEATURE(shaderStorageImageWriteWithoutFormat)
            DD_APPEND_FEATURE(shaderUniformBufferArrayDynamicIndexing)
            DD_APPEND_FEATURE(shaderSampledImageArrayDynamicIndexing)
            DD_APPEND_FEATURE(shaderStorageBufferArrayDynamicIndexing)
            DD_APPEND_FEATURE(shaderStorageImageArrayDynamicIndexing)
            DD_APPEND_FEATURE(shaderClipDistance)
            DD_APPEND_FEATURE(shaderCullDistance)
            DD_APPEND_FEATURE(shaderFloat64)
            DD_APPEND_FEATURE(shaderInt64)
            DD_APPEND_FEATURE(shaderInt16)
            DD_APPEND_FEATURE(shaderResourceResidency)
            DD_APPEND_FEATURE(shaderResourceMinLod)
            DD_APPEND_FEATURE(sparseBinding)
            DD_APPEND_FEATURE(sparseResidencyBuffer)
            DD_APPEND_FEATURE(sparseResidencyImage2D)
            DD_APPEND_FEATURE(sparseResidencyImage3D)
            DD_APPEND_FEATURE(sparseResidency2Samples)
            DD_APPEND_FEATURE(sparseResidency4Samples)
            DD_APPEND_FEATURE(sparseResidency8Samples)
            DD_APPEND_FEATURE(sparseResidency16Samples)
            DD_APPEND_FEATURE(sparseResidencyAliased)
            DD_APPEND_FEATURE(variableMultisampleRate)
            DD_APPEND_FEATURE(inheritedQueries)
        }
        if (info.support_VK_KHR_8bit_storage())
        {
            const VkPhysicalDevice8BitStorageFeaturesKHR& features = info.query8BitStorageFeatures();
            DD_APPEND_FEATURE(storageBuffer8BitAccess)
            DD_APPEND_FEATURE(uniformAndStorageBuffer8BitAccess)
            DD_APPEND_FEATURE(storagePushConstant8)
        }
        if (info.support_VK_KHR_16bit_storage())
        {
            const VkPhysicalDevice16BitStorageFeaturesKHR& features = info.query16BitStorageFeatures();
            DD_APPEND_FEATURE(storageBuffer16BitAccess)
            DD_APPEND_FEATURE(uniformAndStorageBuffer16BitAccess)
            DD_APPEND_FEATURE(storagePushConstant16)
            DD_APPEND_FEATURE(storageInputOutput16)
        }
        if (info.support_VK_KHR_robustness2() || info.support_VK_EXT_robustness2())
        {
            const VkPhysicalDeviceRobustness2FeaturesKHR& features = info.queryRobustness2Features();
            DD_APPEND_FEATURE(robustBufferAccess2)
            DD_APPEND_FEATURE(robustImageAccess2)
            DD_APPEND_FEATURE(nullDescriptor)
        }
        if (info.support_VK_KHR_shader_float16_int8())
        {
            const VkPhysicalDeviceFloat16Int8FeaturesKHR& features = info.queryFloat16Int8Features();
            DD_APPEND_FEATURE(shaderFloat16)
            DD_APPEND_FEATURE(shaderInt8)
        }
        if (info.support_VK_KHR_sampler_ycbcr_conversion())
        {
            const VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR& features = info.querySamplerYcbcrConversionFeatures();
            DD_APPEND_FEATURE(samplerYcbcrConversion)
        }
        if (info.support_VK_KHR_cooperative_matrix())
        {
            const VkPhysicalDeviceCooperativeMatrixFeaturesKHR& features = info.queryCooperativeMatrixFeatures();
            DD_APPEND_FEATURE(cooperativeMatrix)
            DD_APPEND_FEATURE(cooperativeMatrixRobustBufferAccess)
        }
        else if (info.support_VK_NV_cooperative_matrix())
        {
            const VkPhysicalDeviceCooperativeMatrixFeaturesNV& features = info.queryCooperativeMatrixFeaturesNV();
            DD_APPEND_FEATURE(cooperativeMatrix)
            DD_APPEND_FEATURE(cooperativeMatrixRobustBufferAccess)
        }
        if (info.support_VK_NV_cooperative_matrix2())
        {
            const VkPhysicalDeviceCooperativeMatrix2FeaturesNV& features = info.queryCooperativeMatrix2FeaturesNV();
            DD_APPEND_FEATURE(cooperativeMatrixWorkgroupScope)
            DD_APPEND_FEATURE(cooperativeMatrixFlexibleDimensions)
            DD_APPEND_FEATURE(cooperativeMatrixReductions)
            DD_APPEND_FEATURE(cooperativeMatrixConversions)
            DD_APPEND_FEATURE(cooperativeMatrixPerElementOperations)
            DD_APPEND_FEATURE(cooperativeMatrixTensorAddressing)
            DD_APPEND_FEATURE(cooperativeMatrixBlockLoads)
        }
        if (info.support_VK_NV_cooperative_vector())
        {
            const VkPhysicalDeviceCooperativeVectorFeaturesNV& features = info.queryCooperativeVectorFeaturesNV();
            DD_APPEND_FEATURE(cooperativeVector)
            DD_APPEND_FEATURE(cooperativeVectorTraining)
        }
        if (info.support_VK_EXT_subgroup_size_control())
        {
            const VkPhysicalDeviceSubgroupSizeControlFeaturesEXT& features = info.querySubgroupSizeControlFeatures();
            DD_APPEND_FEATURE(subgroupSizeControl)
            DD_APPEND_FEATURE(computeFullSubgroups)
        }
        if (info.support_VK_KHR_shader_bfloat16())
        {
            const VkPhysicalDeviceShaderBfloat16FeaturesKHR& features = info.queryShaderBfloat16Features();
            DD_APPEND_FEATURE(shaderBFloat16Type)
            DD_APPEND_FEATURE(shaderBFloat16DotProduct)
            DD_APPEND_FEATURE(shaderBFloat16CooperativeMatrix)
        }
        if (info.support_VK_EXT_shader_float8())
        {
            const VkPhysicalDeviceShaderFloat8FeaturesEXT& features = info.queryShaderFloat8Features();
            DD_APPEND_FEATURE(shaderFloat8)
            DD_APPEND_FEATURE(shaderFloat8CooperativeMatrix)
        }
        if (info.support_VK_KHR_shader_float_controls2())
        {
            const VkPhysicalDeviceShaderFloatControls2FeaturesKHR& features = info.queryShaderFloatControls2Features();
            DD_APPEND_FEATURE(shaderFloatControls2)
        }
        if (info.support_VK_KHR_shader_integer_dot_product())
        {
            const VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR& features = info.queryShaderIntegerDotProductFeatures();
            DD_APPEND_FEATURE(shaderIntegerDotProduct)
        }
        if (info.support_VK_KHR_shader_subgroup_rotate())
        {
            const VkPhysicalDeviceShaderSubgroupRotateFeaturesKHR& features = info.queryShaderSubgroupRotateFeatures();
            DD_APPEND_FEATURE(shaderSubgroupRotate)
            DD_APPEND_FEATURE(shaderSubgroupRotateClustered)
        }
        if (info.support_VK_EXT_shader_atomic_float())
        {
            const VkPhysicalDeviceShaderAtomicFloatFeaturesEXT& features = info.queryShaderAtomicFloatFeatures();
            DD_APPEND_FEATURE(shaderBufferFloat32Atomics)
            DD_APPEND_FEATURE(shaderBufferFloat32AtomicAdd)
            DD_APPEND_FEATURE(shaderBufferFloat64Atomics)
            DD_APPEND_FEATURE(shaderBufferFloat64AtomicAdd)
            DD_APPEND_FEATURE(shaderSharedFloat32Atomics)
            DD_APPEND_FEATURE(shaderSharedFloat32AtomicAdd)
            DD_APPEND_FEATURE(shaderSharedFloat64Atomics)
            DD_APPEND_FEATURE(shaderSharedFloat64AtomicAdd)
            DD_APPEND_FEATURE(shaderImageFloat32Atomics)
            DD_APPEND_FEATURE(shaderImageFloat32AtomicAdd)
            DD_APPEND_FEATURE(sparseImageFloat32Atomics)
            DD_APPEND_FEATURE(sparseImageFloat32AtomicAdd)
        }
        if (info.support_VK_EXT_shader_atomic_float2())
        {
            const VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT& features = info.queryShaderAtomicFloat2Features();
            DD_APPEND_FEATURE(shaderBufferFloat16Atomics)
            DD_APPEND_FEATURE(shaderBufferFloat16AtomicAdd)
            DD_APPEND_FEATURE(shaderBufferFloat16AtomicMinMax)
            DD_APPEND_FEATURE(shaderBufferFloat32AtomicMinMax)
            DD_APPEND_FEATURE(shaderBufferFloat64AtomicMinMax)
            DD_APPEND_FEATURE(shaderSharedFloat16Atomics)
            DD_APPEND_FEATURE(shaderSharedFloat16AtomicAdd)
            DD_APPEND_FEATURE(shaderSharedFloat16AtomicMinMax)
            DD_APPEND_FEATURE(shaderSharedFloat32AtomicMinMax)
            DD_APPEND_FEATURE(shaderSharedFloat64AtomicMinMax)
            DD_APPEND_FEATURE(shaderImageFloat32AtomicMinMax)
            DD_APPEND_FEATURE(sparseImageFloat32AtomicMinMax)
        }
        if (info.support_VK_KHR_vulkan_memory_model())
        {
            const VkPhysicalDeviceVulkanMemoryModelFeaturesKHR& features = info.queryVulkanMemoryModelFeatures();
            DD_APPEND_FEATURE(vulkanMemoryModel)
            DD_APPEND_FEATURE(vulkanMemoryModelDeviceScope)
            DD_APPEND_FEATURE(vulkanMemoryModelAvailabilityVisibilityChains)
        }

#undef DD_APPEND_FEATURE

#define DD_APPEND_PROPERTY(X) device_defines.append(#X, properties.X);

        // pull in device properties macros
        {
            const VkPhysicalDeviceProperties& properties = info.physicalDeviceProperties();
            DD_APPEND_PROPERTY(apiVersion)
            DD_APPEND_PROPERTY(driverVersion)
            DD_APPEND_PROPERTY(vendorID)
            DD_APPEND_PROPERTY(deviceID)
            DD_APPEND_PROPERTY(deviceType)
            // DD_APPEND_PROPERTY(deviceName)

            // DD_APPEND_PROPERTY(pipelineCacheUUID)

#define DD_APPEND_PROPERTY_LIMIT(X) device_defines.append(#X, properties.limits.X);
#define DD_APPEND_PROPERTY_LIMIT_2(X)                       \
    device_defines.append(#X "_0", properties.limits.X[0]); \
    device_defines.append(#X "_1", properties.limits.X[1]);
#define DD_APPEND_PROPERTY_LIMIT_3(X)                       \
    device_defines.append(#X "_0", properties.limits.X[0]); \
    device_defines.append(#X "_1", properties.limits.X[1]); \
    device_defines.append(#X "_2", properties.limits.X[2]);

            DD_APPEND_PROPERTY_LIMIT(maxImageDimension1D)
            DD_APPEND_PROPERTY_LIMIT(maxImageDimension2D)
            DD_APPEND_PROPERTY_LIMIT(maxImageDimension3D)
            DD_APPEND_PROPERTY_LIMIT(maxImageDimensionCube)
            DD_APPEND_PROPERTY_LIMIT(maxImageArrayLayers)
            DD_APPEND_PROPERTY_LIMIT(maxTexelBufferElements)
            DD_APPEND_PROPERTY_LIMIT(maxUniformBufferRange)
            DD_APPEND_PROPERTY_LIMIT(maxStorageBufferRange)
            DD_APPEND_PROPERTY_LIMIT(maxPushConstantsSize)
            DD_APPEND_PROPERTY_LIMIT(maxMemoryAllocationCount)
            DD_APPEND_PROPERTY_LIMIT(maxSamplerAllocationCount)
            DD_APPEND_PROPERTY_LIMIT(bufferImageGranularity)
            DD_APPEND_PROPERTY_LIMIT(sparseAddressSpaceSize)
            DD_APPEND_PROPERTY_LIMIT(maxBoundDescriptorSets)
            DD_APPEND_PROPERTY_LIMIT(maxPerStageDescriptorSamplers)
            DD_APPEND_PROPERTY_LIMIT(maxPerStageDescriptorUniformBuffers)
            DD_APPEND_PROPERTY_LIMIT(maxPerStageDescriptorStorageBuffers)
            DD_APPEND_PROPERTY_LIMIT(maxPerStageDescriptorSampledImages)
            DD_APPEND_PROPERTY_LIMIT(maxPerStageDescriptorStorageImages)
            DD_APPEND_PROPERTY_LIMIT(maxPerStageDescriptorInputAttachments)
            DD_APPEND_PROPERTY_LIMIT(maxPerStageResources)
            DD_APPEND_PROPERTY_LIMIT(maxDescriptorSetSamplers)
            DD_APPEND_PROPERTY_LIMIT(maxDescriptorSetUniformBuffers)
            DD_APPEND_PROPERTY_LIMIT(maxDescriptorSetUniformBuffersDynamic)
            DD_APPEND_PROPERTY_LIMIT(maxDescriptorSetStorageBuffers)
            DD_APPEND_PROPERTY_LIMIT(maxDescriptorSetStorageBuffersDynamic)
            DD_APPEND_PROPERTY_LIMIT(maxDescriptorSetSampledImages)
            DD_APPEND_PROPERTY_LIMIT(maxDescriptorSetStorageImages)
            DD_APPEND_PROPERTY_LIMIT(maxDescriptorSetInputAttachments)
            DD_APPEND_PROPERTY_LIMIT(maxVertexInputAttributes)
            DD_APPEND_PROPERTY_LIMIT(maxVertexInputBindings)
            DD_APPEND_PROPERTY_LIMIT(maxVertexInputAttributeOffset)
            DD_APPEND_PROPERTY_LIMIT(maxVertexInputBindingStride)
            DD_APPEND_PROPERTY_LIMIT(maxVertexOutputComponents)
            DD_APPEND_PROPERTY_LIMIT(maxTessellationGenerationLevel)
            DD_APPEND_PROPERTY_LIMIT(maxTessellationPatchSize)
            DD_APPEND_PROPERTY_LIMIT(maxTessellationControlPerVertexInputComponents)
            DD_APPEND_PROPERTY_LIMIT(maxTessellationControlPerVertexOutputComponents)
            DD_APPEND_PROPERTY_LIMIT(maxTessellationControlPerPatchOutputComponents)
            DD_APPEND_PROPERTY_LIMIT(maxTessellationControlTotalOutputComponents)
            DD_APPEND_PROPERTY_LIMIT(maxTessellationEvaluationInputComponents)
            DD_APPEND_PROPERTY_LIMIT(maxTessellationEvaluationOutputComponents)
            DD_APPEND_PROPERTY_LIMIT(maxGeometryShaderInvocations)
            DD_APPEND_PROPERTY_LIMIT(maxGeometryInputComponents)
            DD_APPEND_PROPERTY_LIMIT(maxGeometryOutputComponents)
            DD_APPEND_PROPERTY_LIMIT(maxGeometryOutputVertices)
            DD_APPEND_PROPERTY_LIMIT(maxGeometryTotalOutputComponents)
            DD_APPEND_PROPERTY_LIMIT(maxFragmentInputComponents)
            DD_APPEND_PROPERTY_LIMIT(maxFragmentOutputAttachments)
            DD_APPEND_PROPERTY_LIMIT(maxFragmentDualSrcAttachments)
            DD_APPEND_PROPERTY_LIMIT(maxFragmentCombinedOutputResources)
            DD_APPEND_PROPERTY_LIMIT(maxComputeSharedMemorySize)
            DD_APPEND_PROPERTY_LIMIT_3(maxComputeWorkGroupCount)
            DD_APPEND_PROPERTY_LIMIT(maxComputeWorkGroupInvocations)
            DD_APPEND_PROPERTY_LIMIT_3(maxComputeWorkGroupSize)
            DD_APPEND_PROPERTY_LIMIT(subPixelPrecisionBits)
            DD_APPEND_PROPERTY_LIMIT(subTexelPrecisionBits)
            DD_APPEND_PROPERTY_LIMIT(mipmapPrecisionBits)
            DD_APPEND_PROPERTY_LIMIT(maxDrawIndexedIndexValue)
            DD_APPEND_PROPERTY_LIMIT(maxDrawIndirectCount)
            DD_APPEND_PROPERTY_LIMIT(maxSamplerLodBias)
            DD_APPEND_PROPERTY_LIMIT(maxSamplerAnisotropy)
            DD_APPEND_PROPERTY_LIMIT(maxViewports)
            DD_APPEND_PROPERTY_LIMIT_2(maxViewportDimensions)
            DD_APPEND_PROPERTY_LIMIT_2(viewportBoundsRange)
            DD_APPEND_PROPERTY_LIMIT(viewportSubPixelBits)
            device_defines.append("minMemoryMapAlignment", (uint32_t)properties.limits.minMemoryMapAlignment);
            DD_APPEND_PROPERTY_LIMIT(minTexelBufferOffsetAlignment)
            DD_APPEND_PROPERTY_LIMIT(minUniformBufferOffsetAlignment)
            DD_APPEND_PROPERTY_LIMIT(minStorageBufferOffsetAlignment)
            DD_APPEND_PROPERTY_LIMIT(minTexelOffset)
            DD_APPEND_PROPERTY_LIMIT(maxTexelOffset)
            DD_APPEND_PROPERTY_LIMIT(minTexelGatherOffset)
            DD_APPEND_PROPERTY_LIMIT(maxTexelGatherOffset)
            DD_APPEND_PROPERTY_LIMIT(minInterpolationOffset)
            DD_APPEND_PROPERTY_LIMIT(maxInterpolationOffset)
            DD_APPEND_PROPERTY_LIMIT(subPixelInterpolationOffsetBits)
            DD_APPEND_PROPERTY_LIMIT(maxFramebufferWidth)
            DD_APPEND_PROPERTY_LIMIT(maxFramebufferHeight)
            DD_APPEND_PROPERTY_LIMIT(maxFramebufferLayers)
            DD_APPEND_PROPERTY_LIMIT(framebufferColorSampleCounts)
            DD_APPEND_PROPERTY_LIMIT(framebufferDepthSampleCounts)
            DD_APPEND_PROPERTY_LIMIT(framebufferStencilSampleCounts)
            DD_APPEND_PROPERTY_LIMIT(framebufferNoAttachmentsSampleCounts)
            DD_APPEND_PROPERTY_LIMIT(maxColorAttachments)
            DD_APPEND_PROPERTY_LIMIT(sampledImageColorSampleCounts)
            DD_APPEND_PROPERTY_LIMIT(sampledImageIntegerSampleCounts)
            DD_APPEND_PROPERTY_LIMIT(sampledImageDepthSampleCounts)
            DD_APPEND_PROPERTY_LIMIT(sampledImageStencilSampleCounts)
            DD_APPEND_PROPERTY_LIMIT(storageImageSampleCounts)
            DD_APPEND_PROPERTY_LIMIT(maxSampleMaskWords)
            DD_APPEND_PROPERTY_LIMIT(timestampComputeAndGraphics)
            DD_APPEND_PROPERTY_LIMIT(timestampPeriod)
            DD_APPEND_PROPERTY_LIMIT(maxClipDistances)
            DD_APPEND_PROPERTY_LIMIT(maxCullDistances)
            DD_APPEND_PROPERTY_LIMIT(maxCombinedClipAndCullDistances)
            DD_APPEND_PROPERTY_LIMIT(discreteQueuePriorities)
            DD_APPEND_PROPERTY_LIMIT_2(pointSizeRange)
            DD_APPEND_PROPERTY_LIMIT_2(lineWidthRange)
            DD_APPEND_PROPERTY_LIMIT(pointSizeGranularity)
            DD_APPEND_PROPERTY_LIMIT(lineWidthGranularity)
            DD_APPEND_PROPERTY_LIMIT(strictLines)
            DD_APPEND_PROPERTY_LIMIT(standardSampleLocations)
            DD_APPEND_PROPERTY_LIMIT(optimalBufferCopyOffsetAlignment)
            DD_APPEND_PROPERTY_LIMIT(optimalBufferCopyRowPitchAlignment)
            DD_APPEND_PROPERTY_LIMIT(nonCoherentAtomSize)

#undef DD_APPEND_PROPERTY_LIMIT
#undef DD_APPEND_PROPERTY_LIMIT_2
#undef DD_APPEND_PROPERTY_LIMIT_3

#define DD_APPEND_PROPERTY_SPARSE(X) device_defines.append(#X, properties.sparseProperties.X);

            DD_APPEND_PROPERTY_SPARSE(residencyStandard2DBlockShape)
            DD_APPEND_PROPERTY_SPARSE(residencyStandard2DMultisampleBlockShape)
            DD_APPEND_PROPERTY_SPARSE(residencyStandard3DBlockShape)
            DD_APPEND_PROPERTY_SPARSE(residencyAlignedMipSize)
            DD_APPEND_PROPERTY_SPARSE(residencyNonResidentStrict)

#undef DD_APPEND_PROPERTY_SPARSE
        }
        {
            const VkPhysicalDeviceSubgroupProperties& properties = info.querySubgroupProperties();
            DD_APPEND_PROPERTY(subgroupSize)
            DD_APPEND_PROPERTY(supportedStages)
            DD_APPEND_PROPERTY(supportedOperations)
            DD_APPEND_PROPERTY(quadOperationsInAllStages)

            // append subgroup ops
            device_defines.append("subgroup_basic", (properties.supportedOperations & VK_SUBGROUP_FEATURE_BASIC_BIT) ? 1 : 0);
            device_defines.append("subgroup_vote", (properties.supportedOperations & VK_SUBGROUP_FEATURE_VOTE_BIT) ? 1 : 0);
            device_defines.append("subgroup_arithmetic", (properties.supportedOperations & VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) ? 1 : 0);
            device_defines.append("subgroup_ballot", (properties.supportedOperations & VK_SUBGROUP_FEATURE_BALLOT_BIT) ? 1 : 0);
            device_defines.append("subgroup_shuffle", (properties.supportedOperations & VK_SUBGROUP_FEATURE_SHUFFLE_BIT) ? 1 : 0);
            device_defines.append("subgroup_shuffle_relative", (properties.supportedOperations & VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) ? 1 : 0);
            device_defines.append("subgroup_clustered", (properties.supportedOperations & VK_SUBGROUP_FEATURE_CLUSTERED_BIT) ? 1 : 0);
            device_defines.append("subgroup_quad", (properties.supportedOperations & VK_SUBGROUP_FEATURE_QUAD_BIT) ? 1 : 0);
            device_defines.append("subgroup_rotate", (properties.supportedOperations & VK_SUBGROUP_FEATURE_ROTATE_BIT) ? 1 : 0);
            device_defines.append("subgroup_rotate_relative", (properties.supportedOperations & VK_SUBGROUP_FEATURE_ROTATE_CLUSTERED_BIT) ? 1 : 0);
            device_defines.append("subgroup_partitioned", (properties.supportedOperations & VK_SUBGROUP_FEATURE_PARTITIONED_BIT_NV) ? 1 : 0);
        }
        if (info.support_VK_NV_cooperative_matrix2())
        {
            const VkPhysicalDeviceCooperativeMatrix2PropertiesNV& properties = info.queryCooperativeMatrix2PropertiesNV();
            DD_APPEND_PROPERTY(cooperativeMatrixWorkgroupScopeMaxWorkgroupSize)
            DD_APPEND_PROPERTY(cooperativeMatrixFlexibleDimensionsMaxDimension)
            DD_APPEND_PROPERTY(cooperativeMatrixWorkgroupScopeReservedSharedMemory)
        }
        if (info.support_VK_NV_cooperative_vector())
        {
            const VkPhysicalDeviceCooperativeVectorPropertiesNV& properties = info.queryCooperativeVectorPropertiesNV();
            DD_APPEND_PROPERTY(cooperativeVectorSupportedStages)
            DD_APPEND_PROPERTY(cooperativeVectorTrainingFloat16Accumulation)
            DD_APPEND_PROPERTY(cooperativeVectorTrainingFloat32Accumulation)
            DD_APPEND_PROPERTY(maxCooperativeVectorComponents)
        }
        if (info.support_VK_KHR_driver_properties())
        {
            const VkPhysicalDeviceDriverPropertiesKHR& properties = info.queryDriverProperties();
            DD_APPEND_PROPERTY(driverID)
            // DD_APPEND_PROPERTY(driverName)
            // DD_APPEND_PROPERTY(driverInfo)
            device_defines.append("conformanceVersion_major", properties.conformanceVersion.major);
            device_defines.append("conformanceVersion_minor", properties.conformanceVersion.minor);
            device_defines.append("conformanceVersion_subminor", properties.conformanceVersion.subminor);
            device_defines.append("conformanceVersion_patch", properties.conformanceVersion.patch);
        }
        if (info.support_VK_KHR_robustness2() || info.support_VK_EXT_robustness2())
        {
            const VkPhysicalDeviceRobustness2PropertiesKHR& properties = info.queryRobustness2Properties();
            DD_APPEND_PROPERTY(robustStorageBufferAccessSizeAlignment)
            DD_APPEND_PROPERTY(robustUniformBufferAccessSizeAlignment)
        }
        if (info.support_VK_KHR_shader_integer_dot_product())
        {
            const VkPhysicalDeviceShaderIntegerDotProductProperties& properties = info.queryShaderIntegerDotProductProperties();
            DD_APPEND_PROPERTY(integerDotProduct8BitUnsignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProduct8BitSignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProduct8BitMixedSignednessAccelerated)
            DD_APPEND_PROPERTY(integerDotProduct4x8BitPackedUnsignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProduct4x8BitPackedSignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProduct4x8BitPackedMixedSignednessAccelerated)
            DD_APPEND_PROPERTY(integerDotProduct16BitUnsignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProduct16BitSignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProduct16BitMixedSignednessAccelerated)
            DD_APPEND_PROPERTY(integerDotProduct32BitUnsignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProduct32BitSignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProduct32BitMixedSignednessAccelerated)
            DD_APPEND_PROPERTY(integerDotProduct64BitUnsignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProduct64BitSignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProduct64BitMixedSignednessAccelerated)
            DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating8BitUnsignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating8BitSignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating8BitMixedSignednessAccelerated)
            DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating4x8BitPackedUnsignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating4x8BitPackedSignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating4x8BitPackedMixedSignednessAccelerated)
            DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating16BitUnsignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating16BitSignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating16BitMixedSignednessAccelerated)
            DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating32BitUnsignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating32BitSignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating32BitMixedSignednessAccelerated)
            DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating64BitUnsignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating64BitSignedAccelerated)
            DD_APPEND_PROPERTY(integerDotProductAccumulatingSaturating64BitMixedSignednessAccelerated)
        }
        if (info.support_VK_EXT_subgroup_size_control())
        {
            const VkPhysicalDeviceSubgroupSizeControlPropertiesEXT& properties = info.querySubgroupSizeControlProperties();
            DD_APPEND_PROPERTY(minSubgroupSize)
            DD_APPEND_PROPERTY(maxSubgroupSize)
            DD_APPEND_PROPERTY(maxComputeWorkgroupSubgroups)
            DD_APPEND_PROPERTY(requiredSubgroupSizeStages)
        }

#if ENABLE_VALIDATION_LAYER
        if (info.support_VK_KHR_shader_non_semantic_info())
        {
            device_defines.append("enable_validation_layer", VK_TRUE);
            custom_defines.append("NCNN_LOGE", "debugPrintfEXT");
        }
#endif

#undef DD_APPEND_PROPERTY
    }

    std::string define_macro_data;

    for (size_t i = 0; i < custom_defines.definitions.size(); i++)
    {
        const char* key = custom_defines.definitions[i].first;
        const DefinitionCollector::typed_value& def = custom_defines.definitions[i].second;

        if (def.type == 0)
        {
            define_macro_data += std::string("#define ") + key + " " + def.s + "\n";
        }
        else
        {
            char defstr[256];
            if (def.type == 1)
            {
                sprintf(defstr, "%u", def.u8);
            }
            if (def.type == 2)
            {
                sprintf(defstr, "%u", def.u32);
            }
            if (def.type == 3)
            {
                sprintf(defstr, "%d", def.i32);
            }
            if (def.type == 4)
            {
                if (support_shader_int64)
                {
                    sprintf(defstr, "%luull", def.u64);
                }
                else
                {
                    uint32_t u32 = def.u64 > UINT_MAX ? UINT_MAX : (uint32_t)def.u64;
                    sprintf(defstr, "%u", u32);
                }
            }
            if (def.type == 5)
            {
                sprintf(defstr, "%e", def.f32);
            }

            define_macro_data += std::string("#define ") + key + " " + defstr + "\n";
        }
    }
    for (size_t i = 0; i < device_defines.definitions.size(); i++)
    {
        const char* key = device_defines.definitions[i].first;
        const DefinitionCollector::typed_value& def = device_defines.definitions[i].second;

        if (def.type == 0)
        {
            define_macro_data += std::string("#define ncnn_") + key + " \"" + def.s + "\"\n";
        }
        else
        {
            char defstr[256];
            if (def.type == 1)
            {
                sprintf(defstr, "%u", def.u8);
            }
            if (def.type == 2)
            {
                sprintf(defstr, "%u", def.u32);
            }
            if (def.type == 3)
            {
                sprintf(defstr, "%d", def.i32);
            }
            if (def.type == 4)
            {
                if (support_shader_int64)
                {
                    sprintf(defstr, "%luull", def.u64);
                }
                else
                {
                    uint32_t u32 = def.u64 > UINT_MAX ? UINT_MAX : (uint32_t)def.u64;
                    sprintf(defstr, "%u", u32);
                }
            }
            if (def.type == 5)
            {
                sprintf(defstr, "%e", def.f32);
            }

            define_macro_data += std::string("#define ncnn_") + key + " " + defstr + "\n";
        }
    }

    // enable extensions
    std::string custom_exts;
    if (support_shader_int64)
    {
        custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_int64: require\n";
    }
    if (support_shader_int16)
    {
        custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_int16: require\n";
    }
    if (opt.use_bf16_storage)
    {
        custom_exts += "#extension GL_EXT_bfloat16: require\n";
    }
    if (opt.use_fp16_storage || opt.use_bf16_storage)
    {
        custom_exts += "#extension GL_EXT_shader_16bit_storage: require\n";
    }
    if (opt.use_fp16_arithmetic)
    {
        custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_float16: require\n";
    }
    if (opt.use_int8_storage)
    {
        custom_exts += "#extension GL_EXT_shader_8bit_storage: require\n";
    }
    if (opt.use_int8_arithmetic)
    {
        custom_exts += "#extension GL_EXT_shader_explicit_arithmetic_types_int8: require\n";
    }
#if ENABLE_VALIDATION_LAYER
    {
        custom_exts += "#extension GL_EXT_debug_printf : require\n";
    }
#endif

    // debug
    // NCNN_LOGE("%s", define_macro_data.c_str());

    bool compile_success = true;

    {
        glslang::TShader s(EShLangCompute);

        // split shader source by token "#version 450\n"
        int version_end_pos = -1;
        {
            for (int i = 0; i < comp_data_size - 8; i++)
            {
                if (strncmp(comp_data + i, "#version", 8) != 0)
                    continue;

                // #version shall be the very beginning or after newline
                if (i != 0 && comp_data[i - 1] != '\n')
                    continue;

                int nversion = 0;
                sscanf(comp_data + i, "#version %*d\n%n", &nversion);
                if (nversion == 0)
                    continue;

                version_end_pos = i + nversion;
                break;
            }

            if (version_end_pos == -1)
            {
                NCNN_LOGE("shader source has no #version token");
                return -1;
            }

            // NCNN_LOGE("version_end_pos = %d", version_end_pos);
        }

        const char* comp_data_2 = comp_data + version_end_pos;
        int comp_data_size_1 = version_end_pos;
        int comp_data_size_2 = comp_data_size - comp_data_size_1;

        const char* comp_datas[4] = {comp_data, custom_exts.c_str(), define_macro_data.c_str(), comp_data_2};
        const int comp_data_sizes[4] = {comp_data_size_1, (int)custom_exts.size(), (int)define_macro_data.size(), comp_data_size_2};

        s.setStringsWithLengths(comp_datas, comp_data_sizes, 4);

        s.setEntryPoint("main");
        s.setSourceEntryPoint("main");

        s.setEnvInput(glslang::EShSourceGlsl, EShLangCompute, glslang::EShClientVulkan, 1);

        if (opt.use_subgroup_ops || opt.use_cooperative_matrix)
        {
            // subgroup / cooperative_matrix need vulkan-1.1 and spirv-1.3
            s.setEnvClient(glslang::EShClientVulkan, glslang::EShTargetVulkan_1_1);
            s.setEnvTarget(glslang::EshTargetSpv, glslang::EShTargetSpv_1_3);
        }
        else
        {
            s.setEnvClient(glslang::EShClientVulkan, glslang::EShTargetVulkan_1_0);
            s.setEnvTarget(glslang::EshTargetSpv, glslang::EShTargetSpv_1_0);
        }

        TBuiltInResource resources = get_default_TBuiltInResource();

        VulkanShaderIncluder includer;

        bool pr = s.parse(&resources, 100, ENoProfile, false, false, EShMsgDefault, includer);
        if (!pr)
        {
            NCNN_LOGE("compile spir-v module failed");
            NCNN_LOGE("%s", s.getInfoLog());
            NCNN_LOGE("%s", s.getInfoDebugLog());

            // print as line_number: code
            {
                const char* p = comp_datas[3];
                const char* line_end;
                int line_number = 1;

                while ((line_end = strchr(p, '\n')) != NULL)
                {
                    NCNN_LOGE("%d:\t%.*s", line_number++, (int)(line_end - p), p);
                    p = line_end + 1;
                }

                if (*p != '\0')
                {
                    NCNN_LOGE("%d:\t%s", line_number, p);
                }
            }

            compile_success = false;
        }
        else
        {
            glslang::TIntermediate* ir = s.getIntermediate();
            glslang::GlslangToSpv(*ir, spirv);
        }
    }

    return compile_success ? 0 : -1;
}

// Compile the built-in shader registered at shader_type_index into SPIR-V.
//
// shader_type_index  index into layer_shader_registry
// opt                options forwarded unchanged to the source-based overload
// spirv              output, filled by the source-based overload
//
// Returns -1 when the index is outside the registry, otherwise whatever the
// source-based compile_spirv_module() overload returns.
int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv)
{
    const bool index_in_registry = shader_type_index >= 0 && shader_type_index < layer_shader_registry_entry_count;
    if (!index_in_registry)
    {
        NCNN_LOGE("no such shader module %d", shader_type_index);
        return -1;
    }

    // hand the registered shader text and its length to the source-based overload
    return compile_spirv_module(layer_shader_registry[shader_type_index].comp_data,
                                layer_shader_registry[shader_type_index].comp_data_size,
                                opt, spirv);
}

// Scan a SPIR-V binary and fill shader_info with
//   - specialization_count : number of OpDecorate ... SpecId instructions
//   - binding_count        : highest Binding decoration value + 1
//   - push_constant_count  : number of OpMemberName entries that target the id
//                            whose OpName is "parameter"
//   - binding_types[i]     : type tag of the id decorated with Binding i
//                            (0 = nothing recorded, 1 = StorageBuffer pointer / BufferBlock,
//                             2 = OpTypeImage, 3 = OpTypeSampledImage)
//
// spv_data       pointer to the SPIR-V words
// spv_data_size  size of the SPIR-V data in bytes
//
// Returns 0 on success. Returns -1, leaving the three counts at 0, when the
// data is shorter than the 5-word header, an instruction has a zero word count
// or runs past the end of the data, an id used as a table index is not below
// the header's id bound, or a binding number is 16 or larger.
int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info)
{
    shader_info.specialization_count = 0;
    shader_info.binding_count = 0;
    shader_info.push_constant_count = 0;

    // bindings at or above this number are rejected
    const uint32_t max_binding_count = 16;

    // the header alone is 5 words: magic version generator bound schema
    const size_t word_total = spv_data_size / sizeof(uint32_t);
    if (!spv_data || word_total < 5)
    {
        NCNN_LOGE("spir-v data too short %d", (int)spv_data_size);
        return -1;
    }

    // stays at this placeholder value until an OpName "parameter" is found
    uint32_t parameter_id = -233;

    int specialization_count = 0;
    int binding_count = 0;
    int push_constant_count = 0;

    // every id used as an index below must be smaller than the header's bound
    const uint32_t bound = spv_data[3];

    // id -> binding_type
    std::vector<int> id_types;
    id_types.resize(bound);

    // binding_id -> id carrying that Binding decoration
    std::vector<int> binding_types;

    const uint32_t* const end = spv_data + word_total;

    // skip magic version generator bound schema
    const uint32_t* p = spv_data + 5;

    // foreach op
    while (p < end)
    {
        const uint32_t opcode = p[0];

        const uint16_t wordcount = opcode >> 16;
        const uint16_t op = opcode & 0xffff;

        // a zero word count would never advance p, and an oversized one would
        // make the operand reads below run off the end of the data
        if (wordcount == 0 || wordcount > (size_t)(end - p))
        {
            NCNN_LOGE("malformed spir-v instruction op %d wordcount %d", (int)op, (int)wordcount);
            return -1;
        }

        // cleared when an instruction names an id that does not fit id_types
        bool id_ok = true;

        if (op == 5) // OpName
        {
            // "parameter" and its terminator occupy 3 words after the opcode and target id,
            // so a shorter instruction can not carry that name
            if (wordcount >= 5 && strcmp((const char*)&p[2], "parameter") == 0)
            {
                parameter_id = p[1];
            }
        }
        else if (op == 6) // OpMemberName
        {
            if (wordcount >= 2 && p[1] == parameter_id)
            {
                push_constant_count++;
            }
        }
        else if (op == 25 || op == 27) // OpTypeImage / OpTypeSampledImage
        {
            if (wordcount >= 2)
            {
                const uint32_t id = p[1];
                id_ok = id < bound;
                if (id_ok)
                {
                    id_types[id] = (op == 25) ? 2 : 3;
                }
            }
        }
        else if (op == 32) // OpTypePointer
        {
            if (wordcount >= 4)
            {
                const uint32_t id = p[1];
                const uint32_t storage_class = p[2];
                const uint32_t type = p[3];

                // UniformConstant, Uniform, StorageBuffer
                if (storage_class == 0 || storage_class == 2 || storage_class == 12)
                {
                    id_ok = id < bound && type < bound;
                    if (id_ok)
                    {
                        if (storage_class == 12)
                        {
                            // the pointee of a StorageBuffer pointer is a storage buffer
                            id_types[type] = 1;
                        }
                        id_types[id] = id_types[type];
                    }
                }
            }
        }
        else if (op == 59) // OpVariable
        {
            if (wordcount >= 4)
            {
                const uint32_t type_id = p[1];
                const uint32_t var_id = p[2];
                const uint32_t storage_class = p[3];

                // UniformConstant, Uniform, StorageBuffer
                if (storage_class == 0 || storage_class == 2 || storage_class == 12)
                {
                    id_ok = type_id < bound && var_id < bound;
                    if (id_ok)
                    {
                        // the variable inherits the tag of its pointer type
                        id_types[var_id] = id_types[type_id];
                    }
                }
            }
        }
        else if (op == 71) // OpDecorate
        {
            if (wordcount >= 3)
            {
                const uint32_t id = p[1];
                const uint32_t decoration = p[2];

                if (decoration == 1) // SpecId
                {
                    specialization_count++;
                }
                if (decoration == 3) // BufferBlock
                {
                    id_ok = id < bound;
                    if (id_ok)
                    {
                        id_types[id] = 1;
                    }
                }
                else if (decoration == 33 && wordcount >= 4) // Binding
                {
                    // the binding number is only present as the 4th word
                    const uint32_t binding_id = p[3];
                    if (binding_id >= max_binding_count)
                    {
                        // reject before growing binding_types by an arbitrary amount
                        NCNN_LOGE("too many binding %d", (int)(binding_id + 1u));
                        return -1;
                    }

                    id_ok = id < bound;
                    if (id_ok)
                    {
                        binding_count = std::max(binding_count, (int)binding_id + 1);

                        binding_types.resize(binding_count);
                        binding_types[binding_id] = id;
                    }
                }
            }
        }

        if (!id_ok)
        {
            NCNN_LOGE("spir-v id exceeds bound %u", bound);
            return -1;
        }

        p += wordcount;
    }

    shader_info.specialization_count = specialization_count;
    shader_info.binding_count = binding_count;
    shader_info.push_constant_count = push_constant_count;

    // resolve binding_types
    // binding numbers never decorated keep id 0, which is a valid index here
    // because storing any binding required some id below bound
    for (int i = 0; i < binding_count; i++)
    {
        shader_info.binding_types[i] = id_types[binding_types[i]];
    }

    return 0;
}

} // namespace ncnn

#endif // NCNN_VULKAN


================================================
FILE: src/gpu.h
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef NCNN_GPU_H
#define NCNN_GPU_H

#include "platform.h"

#if NCNN_VULKAN

#include "mat.h"

namespace ncnn {

// instance

// Create the global VkInstance and initialize the objects needed for GPU computation.
// This creates the VkInstance object, checks which extensions the Vulkan instance supports,
// sets up the Vulkan validation layers (if ENABLE_VALIDATION_LAYER is enabled),
// iterates over all supported physical devices, etc.
NCNN_EXPORT int create_gpu_instance(const char* driver_path = 0);

// Get global VkInstance variable
// Must be called after create_gpu_instance() and before destroy_gpu_instance()
NCNN_EXPORT VkInstance get_gpu_instance();

// Destroy the VkInstance object and free the memory of its associated objects
// Usually called when the main program exits
// The function internally ensures that all vulkan devices are idle before proceeding with destruction.
NCNN_EXPORT void destroy_gpu_instance();

// vulkan core
extern PFN_vkAllocateCommandBuffers vkAllocateCommandBuffers;
extern PFN_vkAllocateDescriptorSets vkAllocateDescriptorSets;
extern PFN_vkAllocateMemory vkAllocateMemory;
extern PFN_vkBeginCommandBuffer vkBeginCommandBuffer;
extern PFN_vkBindBufferMemory vkBindBufferMemory;
extern PFN_vkBindImageMemory vkBindImageMemory;
extern PFN_vkCmdBeginQuery vkCmdBeginQuery;
extern PFN_vkCmdBindDescriptorSets vkCmdBindDescriptorSets;
extern PFN_vkCmdBindIndexBuffer vkCmdBindIndexBuffer;
extern PFN_vkCmdBindPipeline vkCmdBindPipeline;
extern PFN_vkCmdCopyBuffer vkCmdCopyBuffer;
extern PFN_vkCmdCopyBufferToImage vkCmdCopyBufferToImage;
extern PFN_vkCmdCopyImage vkCmdCopyImage;
extern PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer;
extern PFN_vkCmdCopyQueryPoolResults vkCmdCopyQueryPoolResults;
extern PFN_vkCmdDispatch vkCmdDispatch;
extern PFN_vkCmdDispatchIndirect vkCmdDispatchIndirect;
extern PFN_vkCmdEndQuery vkCmdEndQuery;
extern PFN_vkCmdExecuteCommands vkCmdExecuteCommands;
extern PFN_vkCmdFillBuffer vkCmdFillBuffer;
extern PFN_vkCmdPipelineBarrier vkCmdPipelineBarrier;
extern PFN_vkCmdPushConstants vkCmdPushConstants;
extern PFN_vkCmdResetQueryPool vkCmdResetQueryPool;
extern PFN_vkCmdResolveImage vkCmdResolveImage;
extern PFN_vkCmdUpdateBuffer vkCmdUpdateBuffer;
extern PFN_vkCmdWriteTimestamp vkCmdWriteTimestamp;
extern PFN_vkCreateBuffer vkCreateBuffer;
extern PFN_vkCreateBufferView vkCreateBufferView;
extern PFN_vkCreateCommandPool vkCreateCommandPool;
extern PFN_vkCreateComputePipelines vkCreateComputePipelines;
extern PFN_vkCreateDescriptorPool vkCreateDescriptorPool;
extern PFN_vkCreateDescriptorSetLayout vkCreateDescriptorSetLayout;
extern PFN_vkCreateDevice vkCreateDevice;
extern PFN_vkCreateFence vkCreateFence;
extern PFN_vkCreateImage vkCreateImage;
extern PFN_vkCreateImageView vkCreateImageView;
extern PFN_vkCreatePipelineCache vkCreatePipelineCache;
extern PFN_vkCreatePipelineLayout vkCreatePipelineLayout;
extern PFN_vkCreateQueryPool vkCreateQueryPool;
extern PFN_vkCreateSampler vkCreateSampler;
extern PFN_vkCreateSemaphore vkCreateSemaphore;
extern PFN_vkCreateShaderModule vkCreateShaderModule;
extern PFN_vkDestroyBuffer vkDestroyBuffer;
extern PFN_vkDestroyBufferView vkDestroyBufferView;
extern PFN_vkDestroyCommandPool vkDestroyCommandPool;
extern PFN_vkDestroyDescriptorPool vkDestroyDescriptorPool;
extern PFN_vkDestroyDescriptorSetLayout vkDestroyDescriptorSetLayout;
extern PFN_vkDestroyDevice vkDestroyDevice;
extern PFN_vkDestroyFence vkDestroyFence;
extern PFN_vkDestroyImage vkDestroyImage;
extern PFN_vkDestroyImageView vkDestroyImageView;
extern PFN_vkDestroyInstance vkDestroyInstance;
extern PFN_vkDestroyPipeline vkDestroyPipeline;
extern PFN_vkDestroyPipelineCache vkDestroyPipelineCache;
extern PFN_vkDestroyPipelineLayout vkDestroyPipelineLayout;
extern PFN_vkDestroyQueryPool vkDestroyQueryPool;
extern PFN_vkDestroySampler vkDestroySampler;
extern PFN_vkDestroySemaphore vkDestroySemaphore;
extern PFN_vkDestroyShaderModule vkDestroyShaderModule;
extern PFN_vkDeviceWaitIdle vkDeviceWaitIdle;
extern PFN_vkEndCommandBuffer vkEndCommandBuffer;
extern PFN_vkEnumerateDeviceExtensionProperties vkEnumerateDeviceExtensionProperties;
extern PFN_vkEnumerateDeviceLayerProperties vkEnumerateDeviceLayerProperties;
extern PFN_vkEnumeratePhysicalDevices vkEnumeratePhysicalDevices;
extern PFN_vkFlushMappedMemoryRanges vkFlushMappedMemoryRanges;
extern PFN_vkFreeCommandBuffers vkFreeCommandBuffers;
extern PFN_vkFreeDescriptorSets vkFreeDescriptorSets;
extern PFN_vkFreeMemory vkFreeMemory;
extern PFN_vkGetBufferMemoryRequirements vkGetBufferMemoryRequirements;
extern PFN_vkGetDeviceMemoryCommitment vkGetDeviceMemoryCommitment;
extern PFN_vkGetDeviceProcAddr vkGetDeviceProcAddr;
extern PFN_vkGetDeviceQueue vkGetDeviceQueue;
extern PFN_vkGetFenceStatus vkGetFenceStatus;
extern PFN_vkGetImageMemoryRequirements vkGetImageMemoryRequirements;
extern PFN_vkGetImageSubresourceLayout vkGetImageSubresourceLayout;
extern PFN_vkGetPhysicalDeviceFeatures vkGetPhysicalDeviceFeatures;
extern PFN_vkGetPhysicalDeviceFormatProperties vkGetPhysicalDeviceFormatProperties;
extern PFN_vkGetPhysicalDeviceImageFormatProperties vkGetPhysicalDeviceImageFormatProperties;
extern PFN_vkGetPhysicalDeviceMemoryProperties vkGetPhysicalDeviceMemoryProperties;
extern PFN_vkGetPhysicalDeviceProperties vkGetPhysicalDeviceProperties;
extern PFN_vkGetPhysicalDeviceQueueFamilyProperties vkGetPhysicalDeviceQueueFamilyProperties;
extern PFN_vkGetPipelineCacheData vkGetPipelineCacheData;
extern PFN_vkGetQueryPoolResults vkGetQueryPoolResults;
extern PFN_vkInvalidateMappedMemoryRanges vkInvalidateMappedMemoryRanges;
extern PFN_vkMapMemory vkMapMemory;
extern PFN_vkMergePipelineCaches vkMergePipelineCaches;
extern PFN_vkQueueSubmit vkQueueSubmit;
extern PFN_vkQueueWaitIdle vkQueueWaitIdle;
extern PFN_vkResetCommandBuffer vkResetCommandBuffer;
extern PFN_vkResetCommandPool vkResetCommandPool;
extern PFN_vkResetDescriptorPool vkResetDescriptorPool;
extern PFN_vkResetFences vkResetFences;
extern PFN_vkUnmapMemory vkUnmapMemory;
extern PFN_vkUpdateDescriptorSets vkUpdateDescriptorSets;
extern PFN_vkWaitForFences vkWaitForFences;

// instance extension capability
// One int flag per instance-level extension; only declared here (extern), defined elsewhere.
// NOTE(review): presumably non-zero when the extension is available on the created
// instance -- confirm against the definitions.
extern int support_VK_KHR_external_memory_capabilities;
extern int support_VK_KHR_get_physical_device_properties2;
extern int support_VK_KHR_get_surface_capabilities2;
extern int support_VK_KHR_surface;
extern int support_VK_EXT_debug_utils;
extern int support_VK_EXT_validation_features;
extern int support_VK_EXT_validation_flags;
#if __ANDROID_API__ >= 26
extern int support_VK_KHR_android_surface;
#endif // __ANDROID_API__ >= 26

// The remaining declarations are global function pointers grouped by the extension
// that provides the entry point.
// NOTE(review): presumably each pointer is only valid when the matching extension is
// supported -- confirm where they are resolved before calling through them.

// VK_KHR_cooperative_matrix
extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR;

// VK_KHR_external_memory_capabilities
extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR;

// VK_KHR_get_physical_device_properties2
extern PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR;
extern PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR;
extern PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR;
extern PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR;
extern PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR;
extern PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR;

// VK_KHR_get_surface_capabilities2
extern PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR;
extern PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR;

// VK_KHR_surface
extern PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR;
extern PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR;
extern PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR;
extern PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR;
extern PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR;

#if __ANDROID_API__ >= 26
// VK_KHR_android_surface
extern PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR;
#endif // __ANDROID_API__ >= 26

// VK_NV_cooperative_matrix
extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV vkGetPhysicalDeviceCooperativeMatrixPropertiesNV;

// VK_NV_cooperative_matrix2
extern PFN_vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV vkGetPhysicalDeviceCooperativeMatrixFlexibleDimensionsPropertiesNV;

// VK_NV_cooperative_vector
extern PFN_vkGetPhysicalDeviceCooperativeVectorPropertiesNV vkGetPhysicalDeviceCooperativeVectorPropertiesNV;

// get info
// Number of GPUs as an int.
// NOTE(review): presumably the count of usable Vulkan physical devices -- confirm.
NCNN_EXPORT int get_gpu_count();
// Index of the default GPU. This is the value used as the default device_index
// argument of get_gpu_info(), the VulkanDevice constructor and get_gpu_device().
NCNN_EXPORT int get_default_gpu_index();

class GpuInfoPrivate;
// Read-only description of a single GPU: the Vulkan physical device handle, its core
// features / properties / memory properties, hardware limits, queue family layout,
// capability and driver-bug flags, per-extension support values and the extension
// feature / property structures.
//
// Every accessor is a const member function. The copy constructor and the copy
// assignment operator are declared private, so user code cannot copy a GpuInfo.
// All state sits behind the opaque GpuInfoPrivate pointer `d`.
// NOTE(review): create_gpu_instance() is a friend; presumably it is what fills in the
// private state -- confirm in the implementation.
class NCNN_EXPORT GpuInfo
{
public:
    explicit GpuInfo();
    virtual ~GpuInfo();

    // NOTE(review): presumably the same index that get_gpu_info() accepts -- confirm
    int device_index() const;

    // vulkan physical device
    // both accessors return a VkPhysicalDevice; the snake_case one is kept for api compatibility
    VkPhysicalDevice physicalDevice() const;
    VkPhysicalDevice physical_device() const; // api compatibility

    // features
    const VkPhysicalDeviceFeatures& physicalDevicefeatures() const;

    // properties
    const VkPhysicalDeviceProperties& physicalDeviceProperties() const;

    // memory properties
    const VkPhysicalDeviceMemoryProperties& physicalDeviceMemoryProperties() const;
    const VkPhysicalDeviceMemoryProperties& physical_device_memory_properties() const; // api compatibility

    // extension properties
    const std::vector<VkExtensionProperties>& deviceExtensionProperties() const;

    // info
    uint32_t api_version() const;
    uint32_t driver_version() const;
    uint32_t vendor_id() const;
    uint32_t device_id() const;
    const char* device_name() const;
    // returns a mutable byte pointer from a const accessor
    // NOTE(review): presumably VK_UUID_SIZE bytes owned by this object -- confirm
    uint8_t* pipeline_cache_uuid() const;

    // driver properties
    uint32_t driver_id() const;
    const char* driver_name() const;

    // 0 = discrete gpu
    // 1 = integrated gpu
    // 2 = virtual gpu
    // 3 = cpu
    int type() const;

    // performance score roughly evaluated based on parameters such as device type,
    // supported extensions, video memory size etc.
    // high-end device scores over 75
    // low-end device scores below 10
    uint32_t rough_score() const;

    // hardware limit
    uint32_t max_shared_memory_size() const;
    uint32_t max_workgroup_count_x() const;
    uint32_t max_workgroup_count_y() const;
    uint32_t max_workgroup_count_z() const;
    uint32_t max_workgroup_invocations() const;
    uint32_t max_workgroup_size_x() const;
    uint32_t max_workgroup_size_y() const;
    uint32_t max_workgroup_size_z() const;
    size_t memory_map_alignment() const;
    size_t buffer_offset_alignment() const;
    size_t non_coherent_atom_size() const;
    size_t buffer_image_granularity() const;
    uint32_t max_image_dimension_1d() const;
    uint32_t max_image_dimension_2d() const;
    uint32_t max_image_dimension_3d() const;
    float timestamp_period() const;

    // runtime
    // queue family indices and the number of queues in each family
    uint32_t compute_queue_family_index() const;
    uint32_t transfer_queue_family_index() const;

    uint32_t compute_queue_count() const;
    uint32_t transfer_queue_count() const;

    // property
    bool unified_compute_transfer_queue() const;
    bool resizable_bar_enabled() const;

    // subgroup
    uint32_t subgroup_size() const;
    uint32_t min_subgroup_size() const;
    uint32_t max_subgroup_size() const;
    uint32_t max_compute_workgroup_subgroups() const;
    bool support_subgroup_size_control() const;
    bool support_compute_full_subgroups() const;
    // NOTE(review): returns uint32_t rather than bool; presumably a bitmask of
    // supported subgroup operations -- confirm
    uint32_t support_subgroup_ops() const;

    // bug is not feature
    bool bug_storage_buffer_no_l1() const;
    bool bug_corrupted_online_pipeline_cache() const;
    bool bug_buffer_image_load_zero() const;

    // but sometimes bug is a feature
    bool bug_implicit_fp16_arithmetic() const;

    // fp16 and int8 feature
    bool support_fp16_packed() const;
    bool support_fp16_storage() const;
    bool support_fp16_uniform() const;
    bool support_fp16_arithmetic() const;
    bool support_int8_packed() const;
    bool support_int8_storage() const;
    bool support_int8_uniform() const;
    bool support_int8_arithmetic() const;

    // bf16 feature
    bool support_bf16_packed() const;
    bool support_bf16_storage() const; // bf16s implies bf16u

    // r16f and r8s format in storage image
    bool support_fp16_image() const;
    bool support_int8_image() const;

    // shader float controls2
    bool support_fp_fast_math() const;

    // ycbcr conversion feature
    bool support_ycbcr_conversion() const;

    // cooperative matrix feature
    bool support_cooperative_matrix() const;
    bool support_cooperative_matrix_8_8_16() const;
    bool support_cooperative_matrix_16_8_8() const;
    bool support_cooperative_matrix_16_8_16() const;
    bool support_cooperative_matrix_16_16_16() const;

    // bf16 cooperative matrix feature
    bool support_bf16_cooperative_matrix() const;

    // extension capability
    // one int-returning accessor per device extension
    // NOTE(review): presumably 0 means the device does not expose the extension -- confirm
    int support_VK_KHR_8bit_storage() const;
    int support_VK_KHR_16bit_storage() const;
    int support_VK_KHR_bind_memory2() const;
    int support_VK_KHR_buffer_device_address() const;
    int support_VK_KHR_create_renderpass2() const;
    int support_VK_KHR_cooperative_matrix() const;
    int support_VK_KHR_dedicated_allocation() const;
    int support_VK_KHR_descriptor_update_template() const;
    int support_VK_KHR_driver_properties() const;
    int support_VK_KHR_external_memory() const;
    int support_VK_KHR_get_memory_requirements2() const;
    int support_VK_KHR_maintenance1() const;
    int support_VK_KHR_maintenance2() const;
    int support_VK_KHR_maintenance3() const;
    int support_VK_KHR_multiview() const;
    int support_VK_KHR_portability_subset() const;
    int support_VK_KHR_push_descriptor() const;
    int support_VK_KHR_robustness2() const;
    int support_VK_KHR_sampler_ycbcr_conversion() const;
    int support_VK_KHR_shader_bfloat16() const;
    int support_VK_KHR_shader_float16_int8() const;
    int support_VK_KHR_shader_float_controls() const;
    int support_VK_KHR_shader_float_controls2() const;
    int support_VK_KHR_shader_integer_dot_product() const;
    int support_VK_KHR_shader_non_semantic_info() const;
    int support_VK_KHR_shader_subgroup_extended_types() const;
    int support_VK_KHR_shader_subgroup_rotate() const;
    int support_VK_KHR_storage_buffer_storage_class() const;
    int support_VK_KHR_swapchain() const;
    int support_VK_KHR_vulkan_memory_model() const;
    int support_VK_KHR_zero_initialize_workgroup_memory() const;
    int support_VK_EXT_buffer_device_address() const;
    int support_VK_EXT_descriptor_indexing() const;
    int support_VK_EXT_external_memory_host() const;
    int support_VK_EXT_memory_budget() const;
    int support_VK_EXT_memory_priority() const;
    int support_VK_EXT_queue_family_foreign() const;
    int support_VK_EXT_robustness2() const;
    int support_VK_EXT_shader_atomic_float() const;
    int support_VK_EXT_shader_atomic_float2() const;
    int support_VK_EXT_shader_float8() const;
    int support_VK_EXT_subgroup_size_control() const;
    int support_VK_AMD_device_coherent_memory() const;
#if __ANDROID_API__ >= 26
    int support_VK_ANDROID_external_memory_android_hardware_buffer() const;
#endif // __ANDROID_API__ >= 26
    int support_VK_NV_cooperative_matrix() const;
    int support_VK_NV_cooperative_matrix2() const;
    int support_VK_NV_cooperative_vector() const;

    // extension features
    // NOTE(review): queryExtensionFeatures() returns an untyped pointer; presumably the
    // head of the chained feature structures listed below -- confirm
    const void* queryExtensionFeatures() const;
    const VkPhysicalDevice8BitStorageFeaturesKHR& query8BitStorageFeatures() const;
    const VkPhysicalDevice16BitStorageFeaturesKHR& query16BitStorageFeatures() const;
    const VkPhysicalDeviceFloat16Int8FeaturesKHR& queryFloat16Int8Features() const;
    const VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR& querySamplerYcbcrConversionFeatures() const;
    const VkPhysicalDeviceCooperativeMatrixFeaturesKHR& queryCooperativeMatrixFeatures() const;
    const VkPhysicalDeviceCooperativeMatrixFeaturesNV& queryCooperativeMatrixFeaturesNV() const;
    const VkPhysicalDeviceCooperativeMatrix2FeaturesNV& queryCooperativeMatrix2FeaturesNV() const;
    const VkPhysicalDeviceCooperativeVectorFeaturesNV& queryCooperativeVectorFeaturesNV() const;
    const VkPhysicalDeviceRobustness2FeaturesKHR& queryRobustness2Features() const;
    const VkPhysicalDeviceSubgroupSizeControlFeaturesEXT& querySubgroupSizeControlFeatures() const;
    const VkPhysicalDeviceShaderBfloat16FeaturesKHR& queryShaderBfloat16Features() const;
    const VkPhysicalDeviceShaderFloat8FeaturesEXT& queryShaderFloat8Features() const;
    const VkPhysicalDeviceShaderFloatControls2FeaturesKHR& queryShaderFloatControls2Features() const;
    const VkPhysicalDeviceShaderIntegerDotProductFeaturesKHR& queryShaderIntegerDotProductFeatures() const;
    const VkPhysicalDeviceShaderSubgroupRotateFeaturesKHR& queryShaderSubgroupRotateFeatures() const;
    const VkPhysicalDeviceShaderAtomicFloatFeaturesEXT& queryShaderAtomicFloatFeatures() const;
    const VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT& queryShaderAtomicFloat2Features() const;
    const VkPhysicalDeviceVulkanMemoryModelFeaturesKHR& queryVulkanMemoryModelFeatures() const;

    // extension properties
    // NOTE(review): queryExtensionProperties() returns an untyped pointer; presumably the
    // head of the chained property structures listed below -- confirm
    const void* queryExtensionProperties() const;
    const VkPhysicalDeviceCooperativeMatrix2PropertiesNV& queryCooperativeMatrix2PropertiesNV() const;
    const VkPhysicalDeviceCooperativeVectorPropertiesNV& queryCooperativeVectorPropertiesNV() const;
    const VkPhysicalDeviceDriverPropertiesKHR& queryDriverProperties() const;
    const VkPhysicalDeviceFloatControlsPropertiesKHR& queryFloatControlsProperties() const;
    const VkPhysicalDeviceRobustness2PropertiesKHR& queryRobustness2Properties() const;
    const VkPhysicalDeviceShaderIntegerDotProductProperties& queryShaderIntegerDotProductProperties() const;
    const VkPhysicalDeviceSubgroupProperties& querySubgroupProperties() const;
    const VkPhysicalDeviceSubgroupSizeControlPropertiesEXT& querySubgroupSizeControlProperties() const;
    const VkPhysicalDeviceExternalMemoryHostPropertiesEXT& queryExternalMemoryHostProperties() const;

    // extension sub properties
    // per-extension lists, returned by const reference
    const std::vector<VkCooperativeMatrixPropertiesKHR>& queryCooperativeMatrixSubProperties() const;
    const std::vector<VkCooperativeMatrixPropertiesNV>& queryCooperativeMatrixSubPropertiesNV() const;
    const std::vector<VkCooperativeMatrixFlexibleDimensionsPropertiesNV>& queryCooperativeMatrixFlexibleDimensionsSubPropertiesNV() const;
    const std::vector<VkCooperativeVectorPropertiesNV>& queryCooperativeVectorSubPropertiesNV() const;

    // some utility functions
    // inputs: problem size M/N/K, component type, accumulator type and scope
    // outputs: written through the coopmat_M / coopmat_N / coopmat_K / coopmat_subgroup_size references
    // NOTE(review): the values produced when no configuration matches are not visible here -- confirm
    void get_optimal_cooperative_matrix_mnk(int M, int N, int K, VkComponentTypeKHR type, VkComponentTypeKHR acctype, VkScopeKHR scope, int& coopmat_M, int& coopmat_N, int& coopmat_K, int& coopmat_subgroup_size) const;

private:
    // copying is disabled: both members are declared private
    GpuInfo(const GpuInfo&);
    GpuInfo& operator=(const GpuInfo&);

private:
    friend int create_gpu_instance(const char* driver_path);
    // opaque implementation state; the pointer itself is const
    GpuInfoPrivate* const d;
};

// Returns the GpuInfo for device_index by const reference.
// When the argument is omitted, get_default_gpu_index() is evaluated at the call site.
// NOTE(review): behaviour for an out-of-range device_index is not visible here -- confirm.
NCNN_EXPORT const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index());

class VkAllocator;
class VkCompute;
class Option;
class PipelineCache;
class VulkanDevicePrivate;
// Per-GPU device object: exposes the VkDevice handle, helpers for building shader
// modules / layouts / pipelines, memory-type queries, queue and allocator
// acquire/reclaim pairs, dummy resources, a pipeline cache and packing conversion.
// It also carries the device-level extension entry points as public function-pointer
// members.
//
// The copy constructor and copy assignment operator are declared private, so the
// object cannot be copied by user code. All state sits behind the opaque
// VulkanDevicePrivate pointer `d`.
class NCNN_EXPORT VulkanDevice
{
public:
    // device_index defaults to get_default_gpu_index(), evaluated at the call site
    VulkanDevice(int device_index = get_default_gpu_index());
    ~VulkanDevice();

    // NOTE(review): presumably bound to get_gpu_info(device_index) -- confirm in the constructor
    const GpuInfo& info;

    VkDevice vkdevice() const;
    // NOTE(review): presumably false when device creation failed -- confirm
    bool is_valid() const;

    // NOTE(review): the unit of spv_data_size (bytes vs. 32-bit words) is not visible here -- confirm
    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const;

    // with fixed workgroup size
    VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const;

    // helper for creating pipeline
    // each helper returns an int status and writes the created handle through its last pointer argument
    // NOTE(review): presumably 0 on success, matching the rest of the code base -- confirm
    int create_descriptorset_layout(int binding_count, const int* binding_types, VkDescriptorSetLayout* descriptorset_layout) const;
    int create_pipeline_layout(int push_constant_count, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout* pipeline_layout) const;
    int create_pipeline(VkShaderModule shader_module, VkPipelineLayout pipeline_layout, const std::vector<vk_specialization_type>& specializations, uint32_t subgroup_size, VkPipeline* pipeline) const;
    int create_descriptor_update_template(int binding_count, const int* binding_types, VkDescriptorSetLayout descriptorset_layout, VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR* descriptor_update_template) const;

    // memory type selection and per-memory-type queries
    // NOTE(review): the value returned when no memory type satisfies `required` is not visible here -- confirm
    uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const;
    bool is_mappable(uint32_t memory_type_index) const;
    bool is_coherent(uint32_t memory_type_index) const;
    bool is_device_local(uint32_t memory_type_index) const;

    // queues are handed out with acquire_queue() and given back with reclaim_queue()
    VkQueue acquire_queue(uint32_t queue_family_index) const;
    void reclaim_queue(uint32_t queue_family_index, VkQueue queue) const;

    // allocator on this device
    // acquire_* / reclaim_* are matching pairs
    VkAllocator* acquire_blob_allocator() const;
    void reclaim_blob_allocator(VkAllocator* allocator) const;

    VkAllocator* acquire_staging_allocator() const;
    void reclaim_staging_allocator(VkAllocator* allocator) const;

    // immutable sampler for texelfetch
    const VkSampler* immutable_texelfetch_sampler() const;

    // dummy buffer image
    VkMat get_dummy_buffer() const;
    VkImageMat get_dummy_image() const;
    VkImageMat get_dummy_image_readonly() const;

    // pipeline cache on this device
    const PipelineCache* get_pipeline_cache() const;

    // test image allocation
    bool shape_support_image_storage(const Mat& shape) const;

    // current gpu heap memory budget in MB
    uint32_t get_heap_budget() const;

    // utility operator
    // converts src to dst_elempack packing, writing the result to dst
    // NOTE(review): presumably the work is recorded into cmd rather than executed immediately -- confirm
    void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const;
    // cast_type_to   0=auto(same as src)  1=fp32  2=fp16  3=int32  4=int8  5=bf16
    void convert_packing(const VkMat& src, VkMat& dst, int dst_elempack, int cast_type_to, VkCompute& cmd, const Option& opt) const;

    // Device-level extension entry points, grouped by extension.
    // NOTE(review): presumably filled in by init_device_extension() and only valid when
    // the corresponding extension is supported -- confirm before calling through them.

    // VK_KHR_bind_memory2
    PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR;
    PFN_vkBindImageMemory2KHR vkBindImageMemory2KHR;

    // VK_KHR_buffer_device_address
    PFN_vkGetBufferDeviceAddressKHR vkGetBufferDeviceAddressKHR;
    PFN_vkGetBufferOpaqueCaptureAddressKHR vkGetBufferOpaqueCaptureAddressKHR;
    PFN_vkGetDeviceMemoryOpaqueCaptureAddressKHR vkGetDeviceMemoryOpaqueCaptureAddressKHR;

    // VK_KHR_descriptor_update_template
    PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR;
    PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR;
    PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR;

    // VK_KHR_get_memory_requirements2
    PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR;
    PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR;

    // VK_KHR_maintenance1
    PFN_vkTrimCommandPoolKHR vkTrimCommandPoolKHR;

    // VK_KHR_maintenance3
    PFN_vkGetDescriptorSetLayoutSupportKHR vkGetDescriptorSetLayoutSupportKHR;

    // VK_KHR_push_descriptor
    PFN_vkCmdPushDescriptorSetWithTemplateKHR vkCmdPushDescriptorSetWithTemplateKHR;
    PFN_vkCmdPushDescriptorSetKHR vkCmdPushDescriptorSetKHR;

    // VK_KHR_sampler_ycbcr_conversion
    PFN_vkCreateSamplerYcbcrConversionKHR vkCreateSamplerYcbcrConversionKHR;
    PFN_vkDestroySamplerYcbcrConversionKHR vkDestroySamplerYcbcrConversionKHR;

    // VK_KHR_swapchain
    PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR;
    PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR;
    PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR;
    PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR;
    PFN_vkQueuePresentKHR vkQueuePresentKHR;

    // VK_EXT_buffer_device_address
    PFN_vkGetBufferDeviceAddressEXT vkGetBufferDeviceAddressEXT;

    // VK_EXT_external_memory_host
    PFN_vkGetMemoryHostPointerPropertiesEXT vkGetMemoryHostPointerPropertiesEXT;

#if __ANDROID_API__ >= 26
    // VK_ANDROID_external_memory_android_hardware_buffer
    PFN_vkGetAndroidHardwareBufferPropertiesANDROID vkGetAndroidHardwareBufferPropertiesANDROID;
    PFN_vkGetMemoryAndroidHardwareBufferANDROID vkGetMemoryAndroidHardwareBufferANDROID;
#endif // __ANDROID_API__ >= 26

    // VK_NV_cooperative_vector
    PFN_vkCmdConvertCooperativeVectorMatrixNV vkCmdConvertCooperativeVectorMatrixNV;
    PFN_vkConvertCooperativeVectorMatrixNV vkConvertCooperativeVectorMatrixNV;

protected:
    // device extension
    // returns an int status; NOTE(review): presumably 0 on success -- confirm
    int init_device_extension();

private:
    // copying is disabled: both members are declared private
    VulkanDevice(const VulkanDevice&);
    VulkanDevice& operator=(const VulkanDevice&);

private:
    // opaque implementation state; the pointer itself is const
    VulkanDevicePrivate* const d;
};

// Returns a pointer to the VulkanDevice for device_index.
// When the argument is omitted, get_default_gpu_index() is evaluated at the call site.
// NOTE(review): ownership of the returned pointer and the result for an invalid index
// (possibly a null pointer) are not visible here -- confirm before dereferencing or deleting.
NCNN_EXPORT VulkanDevice* get_gpu_device(int device_index = get_default_gpu_index());

// online spirv compilation
// Three overloads that differ only in how the shader source is named; each writes the
// resulting SPIR-V words into `spirv` and returns an int status.
// NOTE(review): presumably 0 on success -- confirm.
// source given as a C string
NCNN_EXPORT int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv);
// source given as a buffer plus explicit length
NCNN_EXPORT int compile_spirv_module(const char* comp_data, int comp_data_size, const Option& opt, std::vector<uint32_t>& spirv);
// source selected by an integer shader type index
// NOTE(review): presumably an index into the built-in shader table -- confirm
NCNN_EXPORT int compile_spirv_module(int shader_type_index, const Option& opt, std::vector<uint32_t>& spirv);

// info from spirv
// Plain aggregate of public int fields describing a shader's interface; it is the
// output parameter type of resolve_shader_info().
// It declares no constructor, so the fields hold indeterminate values unless the
// object is value-initialized or filled in by the caller.
class NCNN_EXPORT ShaderInfo
{
public:
    // counts of specialization constants, descriptor bindings and push constants
    int specialization_count;
    int binding_count;
    int push_constant_count;

    // 0 = null
    // 1 = storage buffer
    // 2 = storage image
    // 3 = combined image sampler
    // fixed capacity of 16 entries
    // NOTE(review): presumably only the first binding_count entries are meaningful -- confirm
    int binding_types[16]; // 16 is large enough I think ...

    // reserved fields, not given a meaning in this header
    int reserved_0;
    int reserved_1;
    int reserved_2;
    int reserved_3;
};

// Inspects the SPIR-V module at spv_data and writes the result into shader_info.
// Returns an int status; NOTE(review): presumably 0 on success -- confirm.
// NOTE(review): the unit of spv_data_size (bytes vs. 32-bit words) is not visible here -- confirm.
NCNN_EXPORT int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info);

} // namespace ncnn

#endif // NCNN_VULKAN

#endif // NCNN_GPU_H


================================================
FILE: src/layer/absval.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "absval.h"

namespace ncnn {

// Enables the one_blob_only and support_inplace layer flags.
AbsVal::AbsVal()
{
    this->support_inplace = true;
    this->one_blob_only = true;
}

// Replaces every negative element of bottom_top_blob with its negation, in place.
//
// Each channel is processed as a flat run of w * h * d floats, so blobs that carry a
// depth dimension are covered completely (the ARM specialization computes its
// per-channel size the same way). Channels are distributed over opt.num_threads
// OpenMP threads.
//
// Always returns 0.
int AbsVal::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;

    // include depth, otherwise only the first w * h elements of each channel are touched
    int size = w * h * d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i = 0; i < size; i++)
        {
            if (ptr[i] < 0)
                ptr[i] = -ptr[i];
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/absval.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_ABSVAL_H
#define LAYER_ABSVAL_H

#include "layer.h"

namespace ncnn {

// Absolute-value layer: a Layer subclass that overrides the in-place forward pass.
class AbsVal : public Layer
{
public:
    AbsVal();

    // Rewrites bottom_top_blob in place, turning negative elements into their negation.
    // Returns an int status code.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_ABSVAL_H


================================================
FILE: src/layer/argmax.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "argmax.h"

#include <functional>

namespace ncnn {

// Enables the one_blob_only layer flag.
ArgMax::ArgMax()
{
    this->one_blob_only = true;
}

int ArgMax::load_param(const ParamDict& pd)
{
    out_max_val = pd.get(0, 0);
    topk = pd.get(1, 1);

    return 0;
}

// Selects the topk largest elements of bottom_blob.
//
// The input is read as a flat float array of bottom_blob.total() elements and the
// reported indices are positions in that flat array.
// NOTE(review): if total() includes per-channel padding for multi-channel blobs, the
// padding takes part in the ranking -- confirm against Mat::total().
//
// Output layout (all values stored as float):
//   out_max_val == 0 : top_blob is created as topk x 1 and holds the indices
//   out_max_val != 0 : top_blob is created as topk x 2; the first topk floats hold the
//                      values and the next topk floats hold the matching indices
// Ties are ordered by std::greater on (value, index) pairs, i.e. the larger index first.
//
// Returns 0 on success, -1 when topk is outside [1, element count], -100 when the
// output blob could not be created.
int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int size = bottom_blob.total();

    // std::partial_sort requires its middle iterator to lie inside [first, last];
    // a topk larger than the element count would also read past the end of vec below
    if (topk < 1 || topk > size)
        return -1;

    if (out_max_val)
        top_blob.create(topk, 2, 4u, opt.blob_allocator);
    else
        top_blob.create(topk, 1, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const float* ptr = bottom_blob;

    // partial sort topk with index
    // optional value
    std::vector<std::pair<float, int> > vec;
    vec.resize(size);
    for (int i = 0; i < size; i++)
    {
        vec[i] = std::make_pair(ptr[i], i);
    }

    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
                      std::greater<std::pair<float, int> >());

    float* outptr = top_blob;
    if (out_max_val)
    {
        // values first, indices in the topk floats that follow
        float* indexptr = outptr + topk;
        for (int i = 0; i < topk; i++)
        {
            outptr[i] = vec[i].first;
            indexptr[i] = vec[i].second;
        }
    }
    else
    {
        for (int i = 0; i < topk; i++)
        {
            outptr[i] = vec[i].second;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/argmax.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_ARGMAX_H
#define LAYER_ARGMAX_H

#include "layer.h"

namespace ncnn {

// Top-k selection layer: reports the indices (and optionally the values) of the
// topk largest elements of its input blob.
class ArgMax : public Layer
{
public:
    ArgMax();

    // Loads out_max_val (param id 0, default 0) and topk (param id 1, default 1).
    virtual int load_param(const ParamDict& pd);

    // Writes the selection result for bottom_blob into top_blob; returns an int status code.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
    // non-zero: output the values in addition to the indices
    int out_max_val;
    // number of largest elements to report
    int topk;
};

} // namespace ncnn

#endif // LAYER_ARGMAX_H


================================================
FILE: src/layer/arm/absval_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "absval_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

// Enables the support_packing layer flag, but only in builds where NEON is available.
AbsVal_arm::AbsVal_arm()
{
#if __ARM_NEON
    this->support_packing = true;
#endif // __ARM_NEON
}

// In-place absolute value over every channel of bottom_top_blob.
//
// Each channel is treated as a flat run of w * h * d * elempack floats, so packed
// layouts are handled without any per-lane logic. Channels are distributed over
// opt.num_threads OpenMP threads. With NEON the run is consumed in steps of 16, 8 and
// 4 floats, and a scalar loop finishes whatever is left.
//
// Always returns 0.
int AbsVal_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    // number of floats per channel, counting every packed lane
    int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        // i counts processed floats; ptr is advanced separately by each path below
        int i = 0;
#if __ARM_NEON
        // 16 floats per iteration
        for (; i + 15 < size; i += 16)
        {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            // %0 is ptr, tied as both input and output; the post-indexed st1 (#64)
            // moves it forward by 16 floats, so no explicit ptr += 16 is needed here
            asm volatile(
                "prfm   pldl1keep, [%0, #512]   \n"
                "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
                "fabs   v0.4s, v0.4s            \n"
                "fabs   v1.4s, v1.4s            \n"
                "fabs   v2.4s, v2.4s            \n"
                "fabs   v3.4s, v3.4s            \n"
                "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                : "=r"(ptr) // %0
                : "0"(ptr)
                : "memory", "v0", "v1", "v2", "v3");
#else  // __aarch64__
            // 32-bit variant: vldm loads without writeback, vstm with "%0!" writes the
            // advanced address back into ptr
            asm volatile(
                "pld        [%0, #512]      \n"
                "vldm       %0, {d0-d7}     \n"
                "vabs.f32   q0, q0          \n"
                "vabs.f32   q1, q1          \n"
                "vabs.f32   q2, q2          \n"
                "vabs.f32   q3, q3          \n"
                "vstm       %0!, {d0-d7}    \n"
                : "=r"(ptr) // %0
                : "0"(ptr)
                : "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            // intrinsics fallback: four 4-lane vectors, then advance ptr by hand
            float32x4_t _p0 = vld1q_f32(ptr);
            float32x4_t _p1 = vld1q_f32(ptr + 4);
            float32x4_t _p2 = vld1q_f32(ptr + 8);
            float32x4_t _p3 = vld1q_f32(ptr + 12);
            _p0 = vabsq_f32(_p0);
            _p1 = vabsq_f32(_p1);
            _p2 = vabsq_f32(_p2);
            _p3 = vabsq_f32(_p3);
            vst1q_f32(ptr, _p0);
            vst1q_f32(ptr + 4, _p1);
            vst1q_f32(ptr + 8, _p2);
            vst1q_f32(ptr + 12, _p3);
            ptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 8 floats per iteration
        for (; i + 7 < size; i += 8)
        {
            float32x4_t _p0 = vld1q_f32(ptr);
            float32x4_t _p1 = vld1q_f32(ptr + 4);
            _p0 = vabsq_f32(_p0);
            _p1 = vabsq_f32(_p1);
            vst1q_f32(ptr, _p0);
            vst1q_f32(ptr + 4, _p1);
            ptr += 8;
        }
        // 4 floats per iteration
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = vld1q_f32(ptr);
            _p = vabsq_f32(_p);
            vst1q_f32(ptr, _p);
            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar tail (and the whole channel when NEON is unavailable)
        // NOTE(review): the `> 0` test sends +0.0f through the negation branch, giving
        // -0.0f, which differs from the vector paths -- confirm whether that matters.
        for (; i < size; i++)
        {
            *ptr = *ptr > 0 ? *ptr : -*ptr;

            ptr++;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/absval_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_ABSVAL_ARM_H
#define LAYER_ABSVAL_ARM_H

#include "absval.h"

namespace ncnn {

// ARM specialization of AbsVal; overrides the in-place forward pass.
class AbsVal_arm : public AbsVal
{
public:
    AbsVal_arm();

    // Rewrites bottom_top_blob in place with absolute values; returns an int status code.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_ABSVAL_ARM_H


================================================
FILE: src/layer/arm/arm_activation.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef ARM_ACTIVATION_H
#define ARM_ACTIVATION_H

#include "fused_activation.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"

// Applies a fused activation to four packed fp32 lanes.
//
// activation_type selects the function; activation_params supplies its constants:
//   1  max(v, 0)
//   2  v <= 0 ? v * params[0] : v
//   3  min(max(v, params[0]), params[1])
//   4  sigmoid(v)
//   5  v * tanh(log(exp(v) + 1))
//   6  v * min(max(params[0] * v + params[1], 0), 1)
// Any other type returns _v unchanged.
static inline float32x4_t activation_ps(float32x4_t _v, int activation_type, const ncnn::Mat& activation_params)
{
    switch (activation_type)
    {
    case 1:
    {
        _v = vmaxq_f32(_v, vdupq_n_f32(0.f));
        break;
    }
    case 2:
    {
        const float32x4_t _slope = vdupq_n_f32(activation_params[0]);
        const float32x4_t _scaled = vmulq_f32(_v, _slope);
        // lanes that are <= 0 take the scaled value, the others keep v
        const uint32x4_t _nonpositive = vcleq_f32(_v, vdupq_n_f32(0.f));
        _v = vbslq_f32(_nonpositive, _scaled, _v);
        break;
    }
    case 3:
    {
        const float32x4_t _lower = vdupq_n_f32(activation_params[0]);
        const float32x4_t _upper = vdupq_n_f32(activation_params[1]);
        _v = vminq_f32(vmaxq_f32(_v, _lower), _upper);
        break;
    }
    case 4:
    {
        _v = sigmoid_ps(_v);
        break;
    }
    case 5:
    {
        const float32x4_t _softplus = log_ps(vaddq_f32(exp_ps(_v), vdupq_n_f32(1.f)));
        _v = vmulq_f32(_v, tanh_ps(_softplus));
        break;
    }
    case 6:
    {
        const float alpha = activation_params[0];
        const float beta = activation_params[1];
        // gate = clamp(beta + v * alpha, 0, 1)
        float32x4_t _gate = vmlaq_n_f32(vdupq_n_f32(beta), _v, alpha);
        _gate = vmaxq_f32(_gate, vdupq_n_f32(0.f));
        _gate = vminq_f32(_gate, vdupq_n_f32(1.f));
        _v = vmulq_f32(_gate, _v);
        break;
    }
    default:
        break;
    }

    return _v;
}

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "arm_usability.h"
#include "neon_mathfun_fp16s.h"

// Scalar fp16 counterpart of the fused activations.
//
// activation_type selects the function; activation_params supplies its constants
// (converted to __fp16 before use):
//   1  max(v, 0)
//   2  v > 0 ? v : v * params[0]
//   3  clamp v to [params[0], params[1]]
//   4  1 / (1 + exp(-v))
//   5  v * tanh(log(exp(v) + 1))
//   6  with alpha = params[0], beta = params[1]:
//        v < -beta/alpha             -> 0
//        v > 1/alpha + (-beta/alpha) -> v unchanged
//        otherwise                   -> v * (v * alpha + beta)
// Any other type returns v unchanged.
static inline __fp16 activation_ss_f16(__fp16 v, int activation_type, const ncnn::Mat& activation_params)
{
    switch (activation_type)
    {
    case 1:
    {
        v = std::max(v, (__fp16)0.f);
        break;
    }
    case 2:
    {
        const __fp16 slope = (__fp16)(activation_params[0]);
        if (!(v > 0.f))
            v = v * slope;
        break;
    }
    case 3:
    {
        const __fp16 lo = (__fp16)(activation_params[0]);
        const __fp16 hi = (__fp16)(activation_params[1]);
        if (v < lo)
            v = lo;
        if (v > hi)
            v = hi;
        break;
    }
    case 4:
    {
        v = (__fp16)1.f / ((__fp16)1.f + (__fp16)expf(-v));
        break;
    }
    case 5:
    {
        v = v * (__fp16)tanhf(logf(expf((float)v) + 1.f));
        break;
    }
    case 6:
    {
        const __fp16 alpha = (__fp16)(activation_params[0]);
        const __fp16 beta = (__fp16)(activation_params[1]);
        const __fp16 lower = -beta / alpha;
        const __fp16 upper = ((__fp16)1.f / alpha) + lower;
        if (v < lower)
        {
            v = (__fp16)0.f;
        }
        else if (!(v > upper))
        {
            // inside [lower, upper]; values above upper pass through untouched
            v = v * (v * alpha + beta);
        }
        break;
    }
    default:
        break;
    }

    return v;
}

// Applies a fused activation to four packed fp16 lanes.
//
// activation_type selects the function; activation_params supplies its constants:
//   1  max(v, 0)
//   2  v <= 0 ? v * params[0] : v
//   3  min(max(v, params[0]), params[1])
//   4  sigmoid(v)
//   5  v * tanh(log(exp(v) + 1))
//   6  v * min(max(params[0] * v + params[1], 0), 1)
// Any other type returns _v unchanged.
static inline float16x4_t activation_ps_f16(float16x4_t _v, int activation_type, const ncnn::Mat& activation_params)
{
    switch (activation_type)
    {
    case 1:
    {
        _v = vmax_f16(_v, vdup_n_f16(0.f));
        break;
    }
    case 2:
    {
        // MSVC (non-clang) builds the slope through an fp32 vector conversion
#if defined(_MSC_VER) && !defined(__clang__)
        const float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(activation_params[0]));
#else
        const float16x4_t _slope = vdup_n_f16((__fp16)activation_params[0]);
#endif
        const float16x4_t _scaled = vmul_f16(_v, _slope);
        // lanes that are <= 0 take the scaled value, the others keep v
        const uint16x4_t _nonpositive = vcle_f16(_v, vdup_n_f16(0.f));
        _v = vbsl_f16(_nonpositive, _scaled, _v);
        break;
    }
    case 3:
    {
        const float16x4_t _lower = vdup_n_f16((__fp16)activation_params[0]);
        const float16x4_t _upper = vdup_n_f16((__fp16)activation_params[1]);
        _v = vmin_f16(vmax_f16(_v, _lower), _upper);
        break;
    }
    case 4:
    {
        _v = sigmoid_ps_f16(_v);
        break;
    }
    case 5:
    {
        const float16x4_t _softplus = log_ps_f16(vadd_f16(exp_ps_f16(_v), vdup_n_f16(1.f)));
        _v = vmul_f16(_v, tanh_ps_f16(_softplus));
        break;
    }
    case 6:
    {
        const __fp16 alpha = (__fp16)activation_params[0];
        const __fp16 beta = (__fp16)activation_params[1];
        // gate = clamp(beta + v * alpha, 0, 1)
        float16x4_t _gate = vfma_n_f16(vdup_n_f16(beta), _v, alpha);
        _gate = vmax_f16(_gate, vdup_n_f16(0.f));
        _gate = vmin_f16(_gate, vdup_n_f16(1.f));
        _v = vmul_f16(_gate, _v);
        break;
    }
    default:
        break;
    }

    return _v;
}

// Applies a fused activation to eight packed fp16 lanes.
//
// activation_type selects the function; activation_params supplies its constants:
//   1  max(v, 0)
//   2  v <= 0 ? v * params[0] : v
//   3  min(max(v, params[0]), params[1])
//   4  sigmoid(v)
//   5  v * tanh(log(exp(v) + 1))
//   6  v * min(max(params[0] * v + params[1], 0), 1)
// Any other type returns _v unchanged.
static inline float16x8_t activation_ps_f16(float16x8_t _v, int activation_type, const ncnn::Mat& activation_params)
{
    switch (activation_type)
    {
    case 1:
    {
        _v = vmaxq_f16(_v, vdupq_n_f16(0.f));
        break;
    }
    case 2:
    {
        // MSVC (non-clang) builds the slope through an fp32 vector conversion
#if defined(_MSC_VER) && !defined(__clang__)
        const float16x4_t _slope_half = vcvt_f16_f32(vdupq_n_f32(activation_params[0]));
        const float16x8_t _slope = vcombine_f16(_slope_half, _slope_half);
#else
        const float16x8_t _slope = vdupq_n_f16((__fp16)activation_params[0]);
#endif
        const float16x8_t _scaled = vmulq_f16(_v, _slope);
        // lanes that are <= 0 take the scaled value, the others keep v
        const uint16x8_t _nonpositive = vcleq_f16(_v, vdupq_n_f16(0.f));
        _v = vbslq_f16(_nonpositive, _scaled, _v);
        break;
    }
    case 3:
    {
        const float16x8_t _lower = vdupq_n_f16((__fp16)activation_params[0]);
        const float16x8_t _upper = vdupq_n_f16((__fp16)activation_params[1]);
        _v = vminq_f16(vmaxq_f16(_v, _lower), _upper);
        break;
    }
    case 4:
    {
        _v = sigmoid_ps_f16(_v);
        break;
    }
    case 5:
    {
        const float16x8_t _softplus = log_ps_f16(vaddq_f16(exp_ps_f16(_v), vdupq_n_f16(1.f)));
        _v = vmulq_f16(_v, tanh_ps_f16(_softplus));
        break;
    }
    case 6:
    {
        const __fp16 alpha_fp16 = (__fp16)activation_params[0];
        const __fp16 beta_fp16 = (__fp16)activation_params[1];
        // gate = clamp(beta + v * alpha, 0, 1)
        float16x8_t _gate = vfmaq_n_f16(vdupq_n_f16(beta_fp16), _v, alpha_fp16);
        _gate = vmaxq_f16(_gate, vdupq_n_f16(0.f));
        _gate = vminq_f16(_gate, vdupq_n_f16(1.f));
        _v = vmulq_f16(_gate, _v);
        break;
    }
    default:
        break;
    }

    return _v;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#endif // __ARM_NEON

#endif // ARM_ACTIVATION_H


================================================
FILE: src/layer/arm/arm_usability.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef ARM_USABILITY_H
#define ARM_USABILITY_H

// Quantize one float to int8: round to nearest (ties away from zero, as
// roundf does) and saturate to the symmetric range [-127, 127].
//
// The range check is done on the float value *before* the conversion to int.
// Converting a float whose rounded value does not fit in int (very large
// magnitudes, +/-inf, NaN) is undefined behaviour in C++, so the original
// "(int)roundf(v)" followed by an integer clamp could return anything for
// such inputs. NaN maps to 0, which is what the saturating Arm float-to-int
// conversion instructions produce.
static inline signed char float2int8(float v)
{
    // NaN is the only value that compares unequal to itself
    if (v != v) return 0;
    if (v >= 127.f) return 127;
    if (v <= -127.f) return -127;

    // here -127 < v < 127, so roundf(v) is within [-127, 127]
    return (signed char)(int)roundf(v);
}

#if __ARM_NEON
#include <arm_neon.h>

// Pack four fp32 lanes to bfloat16 by keeping the upper 16 bits of each
// 32-bit pattern (the low 16 bits are dropped, no rounding is applied).
static inline uint16x4_t float2bfloat(float32x4_t _v)
{
    const uint32x4_t _bits = vreinterpretq_u32_f32(_v);
    return vshrn_n_u32(_bits, 16);
}
// Expand four bfloat16 lanes to fp32: each 16-bit pattern becomes the upper
// half of a 32-bit word whose lower half is zero.
static inline float32x4_t bfloat2float(uint16x4_t _v)
{
    const uint32x4_t _bits = vshll_n_u16(_v, 16);
    return vreinterpretq_f32_u32(_bits);
}

// Quantize eight fp32 values to int8. _vlow supplies result lanes 0-3 and
// _vhigh lanes 4-7. Values are rounded to nearest, narrowed with saturation
// and finally floored at -127.
static inline int8x8_t float2int8(float32x4_t _vlow, float32x4_t _vhigh)
{
#if __aarch64__
    const int32x4_t _lo_s32 = vcvtaq_s32_f32(_vlow);
    const int32x4_t _hi_s32 = vcvtaq_s32_f32(_vhigh);
#else
    // vcvtq_s32_f32 rounds toward zero, so emulate round-to-nearest by adding
    // 0.5 carrying the sign of each lane before converting
    const int32x4_t _half_bits = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
    const int32x4_t _sign_bit = vdupq_n_s32(1 << 31);
    const float32x4_t _bias_lo = vreinterpretq_f32_s32(vorrq_s32(_half_bits, vandq_s32(vreinterpretq_s32_f32(_vlow), _sign_bit)));
    const float32x4_t _bias_hi = vreinterpretq_f32_s32(vorrq_s32(_half_bits, vandq_s32(vreinterpretq_s32_f32(_vhigh), _sign_bit)));
    const int32x4_t _lo_s32 = vcvtq_s32_f32(vaddq_f32(_vlow, _bias_lo));
    const int32x4_t _hi_s32 = vcvtq_s32_f32(vaddq_f32(_vhigh, _bias_hi));
#endif
    // saturating narrow 32 -> 16 -> 8 bits
    const int16x8_t _s16 = vcombine_s16(vqmovn_s32(_lo_s32), vqmovn_s32(_hi_s32));
    const int8x8_t _s8 = vqmovn_s16(_s16);

    // keep the range symmetric: -128 becomes -127
    return vmax_s8(_s8, vdup_n_s8(-127));
}

// Quantize eight fp32 values to int8 with a fused ReLU. _vlow supplies result
// lanes 0-3 and _vhigh lanes 4-7. Values are rounded to nearest, narrowed
// with saturation and finally floored at 0.
static inline int8x8_t float2int8relu(float32x4_t _vlow, float32x4_t _vhigh)
{
#if __aarch64__
    const int32x4_t _lo_s32 = vcvtaq_s32_f32(_vlow);
    const int32x4_t _hi_s32 = vcvtaq_s32_f32(_vhigh);
#else
    // vcvtq_s32_f32 rounds toward zero, so emulate round-to-nearest by adding
    // 0.5 carrying the sign of each lane before converting
    const int32x4_t _half_bits = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
    const int32x4_t _sign_bit = vdupq_n_s32(1 << 31);
    const float32x4_t _bias_lo = vreinterpretq_f32_s32(vorrq_s32(_half_bits, vandq_s32(vreinterpretq_s32_f32(_vlow), _sign_bit)));
    const float32x4_t _bias_hi = vreinterpretq_f32_s32(vorrq_s32(_half_bits, vandq_s32(vreinterpretq_s32_f32(_vhigh), _sign_bit)));
    const int32x4_t _lo_s32 = vcvtq_s32_f32(vaddq_f32(_vlow, _bias_lo));
    const int32x4_t _hi_s32 = vcvtq_s32_f32(vaddq_f32(_vhigh, _bias_hi));
#endif
    // saturating narrow 32 -> 16 -> 8 bits
    const int16x8_t _s16 = vcombine_s16(vqmovn_s32(_lo_s32), vqmovn_s32(_hi_s32));
    const int8x8_t _s8 = vqmovn_s16(_s16);

    // relu on the quantized values
    return vmax_s8(_s8, vdup_n_s8(0));
}

// Quantize eight fp32 values to int8 with a fused leaky ReLU. Both v and
// v * _slope are rounded to nearest and narrowed with saturation; the result
// is the lane-wise maximum of the two int8 vectors. _vlow supplies result
// lanes 0-3 and _vhigh lanes 4-7.
// NOTE(review): taking the maximum selects the leaky branch for negative
// inputs only while the slope is below 1 - confirm against callers.
static inline int8x8_t float2int8leakyrelu(float32x4_t _vlow, float32x4_t _vhigh, float32x4_t _slope)
{
    const float32x4_t _lo_scaled = vmulq_f32(_vlow, _slope);
    const float32x4_t _hi_scaled = vmulq_f32(_vhigh, _slope);
#if __aarch64__
    const int32x4_t _lo_s32 = vcvtaq_s32_f32(_vlow);
    const int32x4_t _hi_s32 = vcvtaq_s32_f32(_vhigh);
    const int32x4_t _lo_scaled_s32 = vcvtaq_s32_f32(_lo_scaled);
    const int32x4_t _hi_scaled_s32 = vcvtaq_s32_f32(_hi_scaled);
#else
    // vcvtq_s32_f32 rounds toward zero, so emulate round-to-nearest by adding
    // 0.5 carrying the sign of each lane before converting
    const int32x4_t _half_bits = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));
    const int32x4_t _sign_bit = vdupq_n_s32(1 << 31);

    const float32x4_t _bias_lo = vreinterpretq_f32_s32(vorrq_s32(_half_bits, vandq_s32(vreinterpretq_s32_f32(_vlow), _sign_bit)));
    const float32x4_t _bias_hi = vreinterpretq_f32_s32(vorrq_s32(_half_bits, vandq_s32(vreinterpretq_s32_f32(_vhigh), _sign_bit)));
    const int32x4_t _lo_s32 = vcvtq_s32_f32(vaddq_f32(_vlow, _bias_lo));
    const int32x4_t _hi_s32 = vcvtq_s32_f32(vaddq_f32(_vhigh, _bias_hi));

    const float32x4_t _bias_lo_scaled = vreinterpretq_f32_s32(vorrq_s32(_half_bits, vandq_s32(vreinterpretq_s32_f32(_lo_scaled), _sign_bit)));
    const float32x4_t _bias_hi_scaled = vreinterpretq_f32_s32(vorrq_s32(_half_bits, vandq_s32(vreinterpretq_s32_f32(_hi_scaled), _sign_bit)));
    const int32x4_t _lo_scaled_s32 = vcvtq_s32_f32(vaddq_f32(_lo_scaled, _bias_lo_scaled));
    const int32x4_t _hi_scaled_s32 = vcvtq_s32_f32(vaddq_f32(_hi_scaled, _bias_hi_scaled));
#endif
    // saturating narrow 32 -> 16 -> 8 bits for both candidates
    const int8x8_t _plain_s8 = vqmovn_s16(vcombine_s16(vqmovn_s32(_lo_s32), vqmovn_s32(_hi_s32)));
    const int8x8_t _scaled_s8 = vqmovn_s16(vcombine_s16(vqmovn_s32(_lo_scaled_s32), vqmovn_s32(_hi_scaled_s32)));

    return vmax_s8(_plain_s8, _scaled_s8);
}

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#if defined(_MSC_VER) && !defined(__clang__)
// Scalar half-precision stand-in, defined only when building with MSVC
// (non-clang). The value is stored as its raw 16-bit pattern in _u16; every
// arithmetic operation converts to float, computes there, and converts the
// result back to half precision through the NEON fp16 <-> fp32 conversions.
struct __fp16
{
    // zero bit pattern
    __fp16()
        : _u16(0)
    {
    }

    // convert a float to half precision
    __fp16(float f32)
        : _u16(vget_lane_u16(vreinterpretq_u16_f16(vcvt_f16_f32(vdupq_n_f32(f32))), 0))
    {
    }

    // take the first 16-bit element of an __n16 value as the bit pattern
    __fp16(__n16 n16)
        : _u16(n16.n16_u16[0])
    {
    }

    // widen the stored half to float
    operator const float() const
    {
        return vgetq_lane_f32(vcvt_f32_f16(vreinterpretq_f16_u16(vdup_n_u16(_u16))), 0);
    }

    __fp16& operator+=(const __fp16& b)
    {
        _u16 = __fp16((float)*this + (float)b)._u16;
        return *this;
    }

    __fp16& operator-=(const __fp16& b)
    {
        _u16 = __fp16((float)*this - (float)b)._u16;
        return *this;
    }

    __fp16& operator*=(const __fp16& b)
    {
        _u16 = __fp16((float)*this * (float)b)._u16;
        return *this;
    }

    __fp16& operator/=(const __fp16& b)
    {
        _u16 = __fp16((float)*this / (float)b)._u16;
        return *this;
    }

    // raw half-precision bit pattern
    unsigned short _u16;
};

// Unary minus: negate in float, then convert back to half precision.
static inline __fp16 operator-(const __fp16& a)
{
    const float negated = -(float)a;
    return __fp16(negated);
}
// Add two halves in float, then convert the sum back to half precision.
static inline __fp16 operator+(const __fp16& a, const __fp16& b)
{
    const float lhs = (float)a;
    const float rhs = (float)b;
    return __fp16(lhs + rhs);
}
// Subtract two halves in float, then convert the difference back to half precision.
static inline __fp16 operator-(const __fp16& a, const __fp16& b)
{
    const float lhs = (float)a;
    const float rhs = (float)b;
    return __fp16(lhs - rhs);
}
// Multiply two halves in float, then convert the product back to half precision.
static inline __fp16 operator*(const __fp16& a, const __fp16& b)
{
    const float lhs = (float)a;
    const float rhs = (float)b;
    return __fp16(lhs * rhs);
}
// Divide two halves in float, then convert the quotient back to half precision.
static inline __fp16 operator/(const __fp16& a, const __fp16& b)
{
    const float lhs = (float)a;
    const float rhs = (float)b;
    return __fp16(lhs / rhs);
}

// Broadcast the bit pattern of f16 to all four lanes of a float16x4_t.
static inline float16x4_t vdup_n_f16(const __fp16& f16)
{
    const uint16x4_t _bits = vdup_n_u16(f16._u16);
    return vreinterpret_f16_u16(_bits);
}

// Broadcast the bit pattern of f16 to all eight lanes of a float16x8_t.
static inline float16x8_t vdupq_n_f16(const __fp16& f16)
{
    const uint16x8_t _bits = vdupq_n_u16(f16._u16);
    return vreinterpretq_f16_u16(_bits);
}

// Horizontal maximum of four fp16 lanes: widen to fp32, reduce there, and
// convert the result back to half precision.
static inline __fp16 vmaxv_f16(float16x4_t a)
{
    const float32x4_t _wide = vcvt_f32_f16(a);
    const float hmax = vmaxvq_f32(_wide);
    return __fp16(hmax);
}

// Horizontal maximum of eight fp16 lanes: each half is widened to fp32 and
// reduced on its own, then the larger of the two partial maxima is converted
// back to half precision.
static inline __fp16 vmaxvq_f16(float16x8_t a)
{
    const float low_max = vmaxvq_f32(vcvt_f32_f16(vget_low_f16(a)));
    const float high_max = vmaxvq_f32(vcvt_f32_f16(vget_high_f16(a)));
    if (low_max > high_max)
        return __fp16(low_max);
    return __fp16(high_max);
}

// fp16 load/store intrinsics are mapped onto the u16 forms of the same width
#define vld1q_f16 vld1q_u16
#define vst1q_f16 vst1q_u16

#define vld2_f16 vld2_u16
#define vst2_f16 vst2_u16

#define vld2q_f16 vld2q_u16
#define vst2q_f16 vst2q_u16

#define vld4_f16 vld4_u16
#define vst4_f16 vst4_u16

#define vld4q_f16 vld4q_u16
#define vst4q_f16 vst4q_u16

#define vld1q_dup_f16 vld1q_dup_u16

// Insert the bit pattern of the __fp16 value x into lane i of v.
// The macro parameters are parenthesized so that an argument which is itself
// an expression (for example -a or *p) is evaluated as a whole before ._u16
// or the cast is applied.
#define vset_lane_f16(x, v, i)  vset_lane_u16((x)._u16, (uint16x4_t)(v), i)
#define vsetq_lane_f16(x, v, i) vsetq_lane_u16((x)._u16, (uint16x8_t)(v), i)

// va + vb * x, with the scalar x broadcast to every lane first
#define vfma_n_f16(va, vb, x)  vfma_f16(va, vb, vdup_n_f16(x))
#define vfmaq_n_f16(va, vb, x) vfmaq_f16(va, vb, vdupq_n_f16(x))

#endif

// Quantize one half-precision value to int8: round to nearest (ties away
// from zero, as roundf does) and saturate to the symmetric range [-127, 127].
//
// The range check is done on the float value *before* the conversion to int.
// Converting +/-inf or NaN to int is undefined behaviour in C++, so the
// original "(int)roundf(v)" followed by an integer clamp had no defined
// result for those inputs. NaN maps to 0, which is what the saturating Arm
// float-to-int conversion instructions produce.
static inline signed char float2int8(__fp16 v)
{
    const float f = (float)v;

    // NaN is the only value that compares unequal to itself
    if (f != f) return 0;
    if (f >= 127.f) return 127;
    if (f <= -127.f) return -127;

    // here -127 < f < 127, so roundf(f) is within [-127, 127]
    return (signed char)(int)roundf(f);
}

// Quantize eight fp16 lanes to int8: convert to int16 with vcvtaq_s16_f16,
// narrow to 8 bits with saturation, then floor the result at -127.
static inline int8x8_t float2int8(float16x8_t _v)
{
    const int8x8_t _s8 = vqmovn_s16(vcvtaq_s16_f16(_v));
    const int8x8_t _floor = vdup_n_s8(-127);
    return vmax_s8(_s8, _floor);
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

// Transpose a 4x4 block of 16-bit values in place.
// On return _rK holds lane K of the original _r0, _r1, _r2, _r3 (in that order).
static inline void transpose4x4_u16(uint16x4_t& _r0, uint16x4_t& _r1, uint16x4_t& _r2, uint16x4_t& _r3)
{
    // step 1: interleave 16-bit lanes of neighbouring rows
    const uint16x4x2_t _z01 = vzip_u16(_r0, _r1);
    const uint16x4x2_t _z23 = vzip_u16(_r2, _r3);

    // view each (row0,row1) / (row2,row3) pair as one 32-bit element
    const uint32x2_t _p01_lane01 = vreinterpret_u32_u16(_z01.val[0]);
    const uint32x2_t _p01_lane23 = vreinterpret_u32_u16(_z01.val[1]);
    const uint32x2_t _p23_lane01 = vreinterpret_u32_u16(_z23.val[0]);
    const uint32x2_t _p23_lane23 = vreinterpret_u32_u16(_z23.val[1]);

    // step 2: interleave the pairs so every vector collects one lane of all four rows
    const uint32x2x2_t _lane01 = vzip_u32(_p01_lane01, _p23_lane01);
    const uint32x2x2_t _lane23 = vzip_u32(_p01_lane23, _p23_lane23);

    _r0 = vreinterpret_u16_u32(_lane01.val[0]);
    _r1 = vreinterpret_u16_u32(_lane01.val[1]);
    _r2 = vreinterpret_u16_u32(_lane23.val[0]);
    _r3 = vreinterpret_u16_u32(_lane23.val[1]);
}

// Transpose eight 4-lane vectors of 16-bit values in place.
// Lane K of the inputs ends up in two consecutive outputs:
// _r(2K) holds lane K of the original _r0.._r3 and _r(2K+1) holds lane K of
// the original _r4.._r7.
static inline void transpose4x8_u16(uint16x4_t& _r0, uint16x4_t& _r1, uint16x4_t& _r2, uint16x4_t& _r3, uint16x4_t& _r4, uint16x4_t& _r5, uint16x4_t& _r6, uint16x4_t& _r7)
{
    // step 1: interleave 16-bit lanes of neighbouring rows
    const uint16x4x2_t _z01 = vzip_u16(_r0, _r1);
    const uint16x4x2_t _z23 = vzip_u16(_r2, _r3);
    const uint16x4x2_t _z45 = vzip_u16(_r4, _r5);
    const uint16x4x2_t _z67 = vzip_u16(_r6, _r7);

    // step 2: interleave 32-bit pairs; suffix a = rows 0-3, suffix b = rows 4-7
    const uint32x2x2_t _lane01_a = vzip_u32(vreinterpret_u32_u16(_z01.val[0]), vreinterpret_u32_u16(_z23.val[0]));
    const uint32x2x2_t _lane23_a = vzip_u32(vreinterpret_u32_u16(_z01.val[1]), vreinterpret_u32_u16(_z23.val[1]));
    const uint32x2x2_t _lane01_b = vzip_u32(vreinterpret_u32_u16(_z45.val[0]), vreinterpret_u32_u16(_z67.val[0]));
    const uint32x2x2_t _lane23_b = vzip_u32(vreinterpret_u32_u16(_z45.val[1]), vreinterpret_u32_u16(_z67.val[1]));

    // lane 0
    _r0 = vreinterpret_u16_u32(_lane01_a.val[0]);
    _r1 = vreinterpret_u16_u32(_lane01_b.val[0]);
    // lane 1
    _r2 = vreinterpret_u16_u32(_lane01_a.val[1]);
    _r3 = vreinterpret_u16_u32(_lane01_b.val[1]);
    // lane 2
    _r4 = vreinterpret_u16_u32(_lane23_a.val[0]);
    _r5 = vreinterpret_u16_u32(_lane23_b.val[0]);
    // lane 3
    _r6 = vreinterpret_u16_u32(_lane23_a.val[1]);
    _r7 = vreinterpret_u16_u32(_lane23_b.val[1]);
}

// Transpose twelve 4-lane vectors of 16-bit values in place.
// Lane K of the inputs ends up in three consecutive outputs (counting
// _r0.._rb as outputs 0..11): output 3K holds lane K of the original
// _r0.._r3, output 3K+1 lane K of _r4.._r7 and output 3K+2 lane K of _r8.._rb.
static inline void transpose4x12_u16(uint16x4_t& _r0, uint16x4_t& _r1, uint16x4_t& _r2, uint16x4_t& _r3, uint16x4_t& _r4, uint16x4_t& _r5, uint16x4_t& _r6, uint16x4_t& _r7, uint16x4_t& _r8, uint16x4_t& _r9, uint16x4_t& _ra, uint16x4_t& _rb)
{
    // step 1: interleave 16-bit lanes of neighbouring rows
    uint16x4x2_t _r01z = vzip_u16(_r0, _r1);
    uint16x4x2_t _r23z = vzip_u16(_r2, _r3);
    uint16x4x2_t _r45z = vzip_u16(_r4, _r5);
    uint16x4x2_t _r67z = vzip_u16(_r6, _r7);
    uint16x4x2_t _r89z = vzip_u16(_r8, _r9);
    uint16x4x2_t _rabz = vzip_u16(_ra, _rb);
    // step 2: interleave 32-bit pairs; _r01_N carries lanes 0/1 and _r23_N
    // lanes 2/3 of row group N (0 = rows 0-3, 1 = rows 4-7, 2 = rows 8-11)
    uint32x2x2_t _r01_0 = vzip_u32(vreinterpret_u32_u16(_r01z.val[0]), vreinterpret_u32_u16(_r23z.val[0]));
    uint32x2x2_t _r23_0 = vzip_u32(vreinterpret_u32_u16(_r01z.val[1]), vreinterpret_u32_u16(_r23z.val[1]));
    uint32x2x2_t _r01_1 = vzip_u32(vreinterpret_u32_u16(_r45z.val[0]), vreinterpret_u32_u16(_r67z.val[0]));
    uint32x2x2_t _r23_1 = vzip_u32(vreinterpret_u32_u16(_r45z.val[1]), vreinterpret_u32_u16(_r67z.val[1]));
    uint32x2x2_t _r01_2 = vzip_u32(vreinterpret_u32_u16(_r89z.val[0]), vreinterpret_u32_u16(_rabz.val[0]));
    uint32x2x2_t _r23_2 = vzip_u32(vreinterpret_u32_u16(_r89z.val[1]), vreinterpret_u32_u16(_rabz.val[1]));
    // lane 0
    _r0 = vreinterpret_u16_u32(_r01_0.val[0]);
    _r1 = vreinterpret_u16_u32(_r01_1.val[0]);
    _r2 = vreinterpret_u16_u32(_r01_2.val[0]);
    // lane 1
    _r3 = vreinterpret_u16_u32(_r01_0.val[1]);
    _r4 = vreinterpret_u16_u32(_r01_1.val[1]);
    _r5 = vreinterpret_u16_u32(_r01_2.val[1]);
    // lane 2
    _r6 = vreinterpret_u16_u32(_r23_0.val[0]);
    _r7 = vreinterpret_u16_u32(_r23_1.val[0]);
    _r8 = vreinterpret_u16_u32(_r23_2.val[0]);
    // lane 3
    _r9 = vreinterpret_u16_u32(_r23_0.val[1]);
    _ra = vreinterpret_u16_u32(_r23_1.val[1]);
    _rb = vreinterpret_u16_u32(_r23_2.val[1]);
}

// Transpose four 8-lane vectors of 16-bit values in place.
// Each output packs two input lanes back to back: _rK holds lane 2K of the
// original _r0.._r3 in its low half and lane 2K+1 in its high half.
static inline void transpose8x4_u16(uint16x8_t& _r0, uint16x8_t& _r1, uint16x8_t& _r2, uint16x8_t& _r3)
{
    // step 1: interleave 16-bit lanes of neighbouring rows
    const uint16x8x2_t _z01 = vzipq_u16(_r0, _r1);
    const uint16x8x2_t _z23 = vzipq_u16(_r2, _r3);

    // view each (row0,row1) / (row2,row3) pair as one 32-bit element
    const uint32x4_t _p01_lo = vreinterpretq_u32_u16(_z01.val[0]);
    const uint32x4_t _p01_hi = vreinterpretq_u32_u16(_z01.val[1]);
    const uint32x4_t _p23_lo = vreinterpretq_u32_u16(_z23.val[0]);
    const uint32x4_t _p23_hi = vreinterpretq_u32_u16(_z23.val[1]);

    // step 2: interleave the pairs; _lanes0123 covers input lanes 0-3, _lanes4567 lanes 4-7
    const uint32x4x2_t _lanes0123 = vzipq_u32(_p01_lo, _p23_lo);
    const uint32x4x2_t _lanes4567 = vzipq_u32(_p01_hi, _p23_hi);

    _r0 = vreinterpretq_u16_u32(_lanes0123.val[0]);
    _r1 = vreinterpretq_u16_u32(_lanes0123.val[1]);
    _r2 = vreinterpretq_u16_u32(_lanes4567.val[0]);
    _r3 = vreinterpretq_u16_u32(_lanes4567.val[1]);
}

// Transpose an 8x8 block of 16-bit values in place.
// On return _rK holds lane K of the original _r0.._r7 (in that order).
static inline void transpose8x8_u16(uint16x8_t& _r0, uint16x8_t& _r1, uint16x8_t& _r2, uint16x8_t& _r3, uint16x8_t& _r4, uint16x8_t& _r5, uint16x8_t& _r6, uint16x8_t& _r7)
{
    // step 1: interleave 16-bit lanes of neighbouring rows
    uint16x8x2_t _r01t = vzipq_u16(_r0, _r1);
    uint16x8x2_t _r23t = vzipq_u16(_r2, _r3);
    uint16x8x2_t _r45t = vzipq_u16(_r4, _r5);
    uint16x8x2_t _r67t = vzipq_u16(_r6, _r7);
    // step 2: interleave 32-bit pairs; _r01_N carries lanes 0-3 and _r23_N
    // lanes 4-7 of row group N (0 = rows 0-3, 1 = rows 4-7)
    uint32x4x2_t _r01_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[0]), vreinterpretq_u32_u16(_r23t.val[0]));
    uint32x4x2_t _r23_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[1]), vreinterpretq_u32_u16(_r23t.val[1]));
    uint32x4x2_t _r01_1 = vzipq_u32(vreinterpretq_u32_u16(_r45t.val[0]), vreinterpretq_u32_u16(_r67t.val[0]));
    uint32x4x2_t _r23_1 = vzipq_u32(vreinterpretq_u32_u16(_r45t.val[1]), vreinterpretq_u32_u16(_r67t.val[1]));
    // step 3: join the 64-bit halves of both row groups that belong to the same lane
    _r0 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r01_0.val[0]), vget_low_u32(_r01_1.val[0])));
    _r1 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_r01_0.val[0]), vget_high_u32(_r01_1.val[0])));
    _r2 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r01_0.val[1]), vget_low_u32(_r01_1.val[1])));
    _r3 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_r01_0.val[1]), vget_high_u32(_r01_1.val[1])));
    _r4 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r23_0.val[0]), vget_low_u32(_r23_1.val[0])));
    _r5 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_r23_0.val[0]), vget_high_u32(_r23_1.val[0])));
    _r6 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r23_0.val[1]), vget_low_u32(_r23_1.val[1])));
    _r7 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_r23_0.val[1]), vget_high_u32(_r23_1.val[1])));
}

// Transpose twelve 8-lane vectors of 16-bit values in place.
// Reading _r0.._rb back to back as 96 consecutive values, lane K of the
// twelve inputs occupies values 12K .. 12K+11 (in input order), so every
// group of three outputs carries two input lanes.
static inline void transpose8x12_u16(uint16x8_t& _r0, uint16x8_t& _r1, uint16x8_t& _r2, uint16x8_t& _r3, uint16x8_t& _r4, uint16x8_t& _r5, uint16x8_t& _r6, uint16x8_t& _r7, uint16x8_t& _r8, uint16x8_t& _r9, uint16x8_t& _ra, uint16x8_t& _rb)
{
    // step 1: interleave 16-bit lanes of neighbouring rows
    uint16x8x2_t _r01t = vzipq_u16(_r0, _r1);
    uint16x8x2_t _r23t = vzipq_u16(_r2, _r3);
    uint16x8x2_t _r45t = vzipq_u16(_r4, _r5);
    uint16x8x2_t _r67t = vzipq_u16(_r6, _r7);
    uint16x8x2_t _r89t = vzipq_u16(_r8, _r9);
    uint16x8x2_t _rabt = vzipq_u16(_ra, _rb);
    // step 2: interleave 32-bit pairs; _r01_N carries lanes 0-3 and _r23_N
    // lanes 4-7 of row group N (0 = rows 0-3, 1 = rows 4-7, 2 = rows 8-11)
    uint32x4x2_t _r01_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[0]), vreinterpretq_u32_u16(_r23t.val[0]));
    uint32x4x2_t _r23_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[1]), vreinterpretq_u32_u16(_r23t.val[1]));
    uint32x4x2_t _r01_1 = vzipq_u32(vreinterpretq_u32_u16(_r45t.val[0]), vreinterpretq_u32_u16(_r67t.val[0]));
    uint32x4x2_t _r23_1 = vzipq_u32(vreinterpretq_u32_u16(_r45t.val[1]), vreinterpretq_u32_u16(_r67t.val[1]));
    uint32x4x2_t _r01_2 = vzipq_u32(vreinterpretq_u32_u16(_r89t.val[0]), vreinterpretq_u32_u16(_rabt.val[0]));
    uint32x4x2_t _r23_2 = vzipq_u32(vreinterpretq_u32_u16(_r89t.val[1]), vreinterpretq_u32_u16(_rabt.val[1]));
    // step 3: stitch 64-bit halves together
    // lanes 0 and 1
    _r0 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r01_0.val[0]), vget_low_u32(_r01_1.val[0])));
    _r1 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r01_2.val[0]), vget_high_u32(_r01_0.val[0])));
    _r2 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_r01_1.val[0]), vget_high_u32(_r01_2.val[0])));
    // lanes 2 and 3
    _r3 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r01_0.val[1]), vget_low_u32(_r01_1.val[1])));
    _r4 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r01_2.val[1]), vget_high_u32(_r01_0.val[1])));
    _r5 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_r01_1.val[1]), vget_high_u32(_r01_2.val[1])));
    // lanes 4 and 5
    _r6 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r23_0.val[0]), vget_low_u32(_r23_1.val[0])));
    _r7 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r23_2.val[0]), vget_high_u32(_r23_0.val[0])));
    _r8 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_r23_1.val[0]), vget_high_u32(_r23_2.val[0])));
    // lanes 6 and 7
    _r9 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r23_0.val[1]), vget_low_u32(_r23_1.val[1])));
    _ra = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r23_2.val[1]), vget_high_u32(_r23_0.val[1])));
    _rb = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_r23_1.val[1]), vget_high_u32(_r23_2.val[1])));
}

// Transpose a 4x4 block of fp32 values in place.
// On return _rK holds lane K of the original _r0, _r1, _r2, _r3 (in that order).
static inline void transpose4x4_ps(float32x4_t& _r0, float32x4_t& _r1, float32x4_t& _r2, float32x4_t& _r3)
{
    // interleave neighbouring rows: val[0] covers lanes 0/1, val[1] lanes 2/3
    const float32x4x2_t _z01 = vzipq_f32(_r0, _r1);
    const float32x4x2_t _z23 = vzipq_f32(_r2, _r3);

    const float32x4_t _rows01_lanes01 = _z01.val[0];
    const float32x4_t _rows01_lanes23 = _z01.val[1];
    const float32x4_t _rows23_lanes01 = _z23.val[0];
    const float32x4_t _rows23_lanes23 = _z23.val[1];

    // join the 64-bit halves that belong to the same lane
    _r0 = vcombine_f32(vget_low_f32(_rows01_lanes01), vget_low_f32(_rows23_lanes01));
    _r1 = vcombine_f32(vget_high_f32(_rows01_lanes01), vget_high_f32(_rows23_lanes01));
    _r2 = vcombine_f32(vget_low_f32(_rows01_lanes23), vget_low_f32(_rows23_lanes23));
    _r3 = vcombine_f32(vget_high_f32(_rows01_lanes23), vget_high_f32(_rows23_lanes23));
}

// Transpose eight 4-lane fp32 vectors in place.
// Lane K of the inputs ends up in two consecutive outputs:
// _r(2K) holds lane K of the original _r0.._r3 and _r(2K+1) holds lane K of
// the original _r4.._r7.
static inline void transpose4x8_ps(float32x4_t& _r0, float32x4_t& _r1, float32x4_t& _r2, float32x4_t& _r3, float32x4_t& _r4, float32x4_t& _r5, float32x4_t& _r6, float32x4_t& _r7)
{
    // interleave neighbouring rows: val[0] covers lanes 0/1, val[1] lanes 2/3
    const float32x4x2_t _z01 = vzipq_f32(_r0, _r1);
    const float32x4x2_t _z23 = vzipq_f32(_r2, _r3);
    const float32x4x2_t _z45 = vzipq_f32(_r4, _r5);
    const float32x4x2_t _z67 = vzipq_f32(_r6, _r7);

    // lane 0
    _r0 = vcombine_f32(vget_low_f32(_z01.val[0]), vget_low_f32(_z23.val[0]));
    _r1 = vcombine_f32(vget_low_f32(_z45.val[0]), vget_low_f32(_z67.val[0]));
    // lane 1
    _r2 = vcombine_f32(vget_high_f32(_z01.val[0]), vget_high_f32(_z23.val[0]));
    _r3 = vcombine_f32(vget_high_f32(_z45.val[0]), vget_high_f32(_z67.val[0]));
    // lane 2
    _r4 = vcombine_f32(vget_low_f32(_z01.val[1]), vget_low_f32(_z23.val[1]));
    _r5 = vcombine_f32(vget_low_f32(_z45.val[1]), vget_low_f32(_z67.val[1]));
    // lane 3
    _r6 = vcombine_f32(vget_high_f32(_z01.val[1]), vget_high_f32(_z23.val[1]));
    _r7 = vcombine_f32(vget_high_f32(_z45.val[1]), vget_high_f32(_z67.val[1]));
}

// Transpose twelve 4-lane fp32 vectors in place.
// Lane K of the inputs ends up in three consecutive outputs (counting
// _r0.._rb as outputs 0..11): output 3K holds lane K of the original
// _r0.._r3, output 3K+1 lane K of _r4.._r7 and output 3K+2 lane K of _r8.._rb.
static inline void transpose4x12_ps(float32x4_t& _r0, float32x4_t& _r1, float32x4_t& _r2, float32x4_t& _r3, float32x4_t& _r4, float32x4_t& _r5, float32x4_t& _r6, float32x4_t& _r7, float32x4_t& _r8, float32x4_t& _r9, float32x4_t& _ra, float32x4_t& _rb)
{
    // interleave neighbouring rows: val[0] covers lanes 0/1, val[1] lanes 2/3
    float32x4x2_t _r01z = vzipq_f32(_r0, _r1);
    float32x4x2_t _r23z = vzipq_f32(_r2, _r3);
    float32x4x2_t _r45z = vzipq_f32(_r4, _r5);
    float32x4x2_t _r67z = vzipq_f32(_r6, _r7);
    float32x4x2_t _r89z = vzipq_f32(_r8, _r9);
    float32x4x2_t _rabz = vzipq_f32(_ra, _rb);
    // lane 0
    _r0 = vcombine_f32(vget_low_f32(_r01z.val[0]), vget_low_f32(_r23z.val[0]));
    _r1 = vcombine_f32(vget_low_f32(_r45z.val[0]), vget_low_f32(_r67z.val[0]));
    _r2 = vcombine_f32(vget_low_f32(_r89z.val[0]), vget_low_f32(_rabz.val[0]));
    // lane 1
    _r3 = vcombine_f32(vget_high_f32(_r01z.val[0]), vget_high_f32(_r23z.val[0]));
    _r4 = vcombine_f32(vget_high_f32(_r45z.val[0]), vget_high_f32(_r67z.val[0]));
    _r5 = vcombine_f32(vget_high_f32(_r89z.val[0]), vget_high_f32(_rabz.val[0]));
    // lane 2
    _r6 = vcombine_f32(vget_low_f32(_r01z.val[1]), vget_low_f32(_r23z.val[1]));
    _r7 = vcombine_f32(vget_low_f32(_r45z.val[1]), vget_low_f32(_r67z.val[1]));
    _r8 = vcombine_f32(vget_low_f32(_r89z.val[1]), vget_low_f32(_rabz.val[1]));
    // lane 3
    _r9 = vcombine_f32(vget_high_f32(_r01z.val[1]), vget_high_f32(_r23z.val[1]));
    _ra = vcombine_f32(vget_high_f32(_r45z.val[1]), vget_high_f32(_r67z.val[1]));
    _rb = vcombine_f32(vget_high_f32(_r89z.val[1]), vget_high_f32(_rabz.val[1]));
}

// Transpose four rows of eight fp32 values in place. Row i is the pair
// (_ril, _rih), covering elements 0-3 and 4-7 of that row.
// In parameter order (_r0l, _r0h, _r1l, ... _r3h) the outputs hold columns
// 0, 1, ... 7 of the input, each column being four values (rows 0-3).
static inline void transpose8x4_ps(float32x4_t& _r0l, float32x4_t& _r0h,
                                   float32x4_t& _r1l, float32x4_t& _r1h,
                                   float32x4_t& _r2l, float32x4_t& _r2h,
                                   float32x4_t& _r3l, float32x4_t& _r3h)
{
    // interleave neighbouring rows, separately for the low and high row halves
    const float32x4x2_t _zl01 = vzipq_f32(_r0l, _r1l);
    const float32x4x2_t _zl23 = vzipq_f32(_r2l, _r3l);
    const float32x4x2_t _zh01 = vzipq_f32(_r0h, _r1h);
    const float32x4x2_t _zh23 = vzipq_f32(_r2h, _r3h);

    // columns 0-3 come from the low halves
    _r0l = vcombine_f32(vget_low_f32(_zl01.val[0]), vget_low_f32(_zl23.val[0]));
    _r0h = vcombine_f32(vget_high_f32(_zl01.val[0]), vget_high_f32(_zl23.val[0]));
    _r1l = vcombine_f32(vget_low_f32(_zl01.val[1]), vget_low_f32(_zl23.val[1]));
    _r1h = vcombine_f32(vget_high_f32(_zl01.val[1]), vget_high_f32(_zl23.val[1]));

    // columns 4-7 come from the high halves
    _r2l = vcombine_f32(vget_low_f32(_zh01.val[0]), vget_low_f32(_zh23.val[0]));
    _r2h = vcombine_f32(vget_high_f32(_zh01.val[0]), vget_high_f32(_zh23.val[0]));
    _r3l = vcombine_f32(vget_low_f32(_zh01.val[1]), vget_low_f32(_zh23.val[1]));
    _r3h = vcombine_f32(vget_high_f32(_zh01.val[1]), vget_high_f32(_zh23.val[1]));
}

// Transpose four rows of twelve fp32 values in place. Row i is the triple
// (_ril, _rim, _rih), covering elements 0-3, 4-7 and 8-11 of that row.
// In parameter order (_r0l, _r0m, _r0h, _r1l, ... _r3h) the outputs hold
// columns 0, 1, ... 11 of the input, each column being four values (rows 0-3).
static inline void transpose12x4_ps(float32x4_t& _r0l, float32x4_t& _r0m, float32x4_t& _r0h,
                                    float32x4_t& _r1l, float32x4_t& _r1m, float32x4_t& _r1h,
                                    float32x4_t& _r2l, float32x4_t& _r2m, float32x4_t& _r2h,
                                    float32x4_t& _r3l, float32x4_t& _r3m, float32x4_t& _r3h)
{
    // interleave neighbouring rows, separately for the low, middle and high row thirds
    float32x4x2_t _r01lz = vzipq_f32(_r0l, _r1l);
    float32x4x2_t _r23lz = vzipq_f32(_r2l, _r3l);
    float32x4x2_t _r01mz = vzipq_f32(_r0m, _r1m);
    float32x4x2_t _r23mz = vzipq_f32(_r2m, _r3m);
    float32x4x2_t _r01hz = vzipq_f32(_r0h, _r1h);
    float32x4x2_t _r23hz = vzipq_f32(_r2h, _r3h);
    // columns 0-3 (from the low thirds)
    _r0l = vcombine_f32(vget_low_f32(_r01lz.val[0]), vget_low_f32(_r23lz.val[0]));
    _r0m = vcombine_f32(vget_high_f32(_r01lz.val[0]), vget_high_f32(_r23lz.val[0]));
    _r0h = vcombine_f32(vget_low_f32(_r01lz.val[1]), vget_low_f32(_r23lz.val[1]));
    _r1l = vcombine_f32(vget_high_f32(_r01lz.val[1]), vget_high_f32(_r23lz.val[1]));
    // columns 4-7 (from the middle thirds)
    _r1m = vcombine_f32(vget_low_f32(_r01mz.val[0]), vget_low_f32(_r23mz.val[0]));
    _r1h = vcombine_f32(vget_high_f32(_r01mz.val[0]), vget_high_f32(_r23mz.val[0]));
    _r2l = vcombine_f32(vget_low_f32(_r01mz.val[1]), vget_low_f32(_r23mz.val[1]));
    _r2m = vcombine_f32(vget_high_f32(_r01mz.val[1]), vget_high_f32(_r23mz.val[1]));
    // columns 8-11 (from the high thirds)
    _r2h = vcombine_f32(vget_low_f32(_r01hz.val[0]), vget_low_f32(_r23hz.val[0]));
    _r3l = vcombine_f32(vget_high_f32(_r01hz.val[0]), vget_high_f32(_r23hz.val[0]));
    _r3m = vcombine_f32(vget_low_f32(_r01hz.val[1]), vget_low_f32(_r23hz.val[1]));
    _r3h = vcombine_f32(vget_high_f32(_r01hz.val[1]), vget_high_f32(_r23hz.val[1]));
}

#if __aarch64__
// Transpose an 8x8 block of fp32 values in place. Row i is the pair
// (_ril, _rih), covering elements 0-3 and 4-7 of that row.
// On return the pair (_rKl, _rKh) holds column K of the input: _rKl the
// values from rows 0-3 and _rKh the values from rows 4-7.
static inline void transpose8x8_ps(float32x4_t& _r0l, float32x4_t& _r0h,
                                   float32x4_t& _r1l, float32x4_t& _r1h,
                                   float32x4_t& _r2l, float32x4_t& _r2h,
                                   float32x4_t& _r3l, float32x4_t& _r3h,
                                   float32x4_t& _r4l, float32x4_t& _r4h,
                                   float32x4_t& _r5l, float32x4_t& _r5h,
                                   float32x4_t& _r6l, float32x4_t& _r6h,
                                   float32x4_t& _r7l, float32x4_t& _r7h)
{
    // interleave neighbouring rows, separately for the low and high row halves
    float32x4x2_t _r01lz = vzipq_f32(_r0l, _r1l);
    float32x4x2_t _r23lz = vzipq_f32(_r2l, _r3l);
    float32x4x2_t _r01hz = vzipq_f32(_r0h, _r1h);
    float32x4x2_t _r23hz = vzipq_f32(_r2h, _r3h);
    float32x4x2_t _r45lz = vzipq_f32(_r4l, _r5l);
    float32x4x2_t _r67lz = vzipq_f32(_r6l, _r7l);
    float32x4x2_t _r45hz = vzipq_f32(_r4h, _r5h);
    float32x4x2_t _r67hz = vzipq_f32(_r6h, _r7h);
    // columns 0-3 (from the low halves)
    _r0l = vcombine_f32(vget_low_f32(_r01lz.val[0]), vget_low_f32(_r23lz.val[0]));
    _r0h = vcombine_f32(vget_low_f32(_r45lz.val[0]), vget_low_f32(_r67lz.val[0]));
    _r1l = vcombine_f32(vget_high_f32(_r01lz.val[0]), vget_high_f32(_r23lz.val[0]));
    _r1h = vcombine_f32(vget_high_f32(_r45lz.val[0]), vget_high_f32(_r67lz.val[0]));
    _r2l = vcombine_f32(vget_low_f32(_r01lz.val[1]), vget_low_f32(_r23lz.val[1]));
    _r2h = vcombine_f32(vget_low_f32(_r45lz.val[1]), vget_low_f32(_r67lz.val[1]));
    _r3l = vcombine_f32(vget_high_f32(_r01lz.val[1]), vget_high_f32(_r23lz.val[1]));
    _r3h = vcombine_f32(vget_high_f32(_r45lz.val[1]), vget_high_f32(_r67lz.val[1]));
    // columns 4-7 (from the high halves)
    _r4l = vcombine_f32(vget_low_f32(_r01hz.val[0]), vget_low_f32(_r23hz.val[0]));
    _r4h = vcombine_f32(vget_low_f32(_r45hz.val[0]), vget_low_f32(_r67hz.val[0]));
    _r5l = vcombine_f32(vget_high_f32(_r01hz.val[0]), vget_high_f32(_r23hz.val[0]));
    _r5h = vcombine_f32(vget_high_f32(_r45hz.val[0]), vget_high_f32(_r67hz.val[0]));
    _r6l = vcombine_f32(vget_low_f32(_r01hz.val[1]), vget_low_f32(_r23hz.val[1]));
    _r6h = vcombine_f32(vget_low_f32(_r45hz.val[1]), vget_low_f32(_r67hz.val[1]));
    _r7l = vcombine_f32(vget_high_f32(_r01hz.val[1]), vget_high_f32(_r23hz.val[1]));
    _r7h = vcombine_f32(vget_high_f32(_r45hz.val[1]), vget_high_f32(_r67hz.val[1]));
}

// Transpose twelve rows of eight fp32 values in place. Row i is the pair
// (_ril, _rih), covering elements 0-3 and 4-7 of that row (rows 10 and 11
// are named a and b).
// In parameter order (_r0l, _r0h, _r1l, ... _rbh) the 24 outputs hold the
// eight input columns back to back: column K occupies three consecutive
// vectors carrying its values from rows 0-3, rows 4-7 and rows 8-11.
static inline void transpose8x12_ps(float32x4_t& _r0l, float32x4_t& _r0h,
                                    float32x4_t& _r1l, float32x4_t& _r1h,
                                    float32x4_t& _r2l, float32x4_t& _r2h,
                                    float32x4_t& _r3l, float32x4_t& _r3h,
                                    float32x4_t& _r4l, float32x4_t& _r4h,
                                    float32x4_t& _r5l, float32x4_t& _r5h,
                                    float32x4_t& _r6l, float32x4_t& _r6h,
                                    float32x4_t& _r7l, float32x4_t& _r7h,
                                    float32x4_t& _r8l, float32x4_t& _r8h,
                                    float32x4_t& _r9l, float32x4_t& _r9h,
                                    float32x4_t& _ral, float32x4_t& _rah,
                                    float32x4_t& _rbl, float32x4_t& _rbh)
{
    // interleave neighbouring rows, separately for the low and high row halves
    float32x4x2_t _r01lz = vzipq_f32(_r0l, _r1l);
    float32x4x2_t _r23lz = vzipq_f32(_r2l, _r3l);
    float32x4x2_t _r01hz = vzipq_f32(_r0h, _r1h);
    float32x4x2_t _r23hz = vzipq_f32(_r2h, _r3h);
    float32x4x2_t _r45lz = vzipq_f32(_r4l, _r5l);
    float32x4x2_t _r67lz = vzipq_f32(_r6l, _r7l);
    float32x4x2_t _r45hz = vzipq_f32(_r4h, _r5h);
    float32x4x2_t _r67hz = vzipq_f32(_r6h, _r7h);
    float32x4x2_t _r89lz = vzipq_f32(_r8l, _r9l);
    float32x4x2_t _rablz = vzipq_f32(_ral, _rbl);
    float32x4x2_t _r89hz = vzipq_f32(_r8h, _r9h);
    float32x4x2_t _rabhz = vzipq_f32(_rah, _rbh);
    // column 0
    _r0l = vcombine_f32(vget_low_f32(_r01lz.val[0]), vget_low_f32(_r23lz.val[0]));
    _r0h = vcombine_f32(vget_low_f32(_r45lz.val[0]), vget_low_f32(_r67lz.val[0]));
    _r1l = vcombine_f32(vget_low_f32(_r89lz.val[0]), vget_low_f32(_rablz.val[0]));
    // column 1
    _r1h = vcombine_f32(vget_high_f32(_r01lz.val[0]), vget_high_f32(_r23lz.val[0]));
    _r2l = vcombine_f32(vget_high_f32(_r45lz.val[0]), vget_high_f32(_r67lz.val[0]));
    _r2h = vcombine_f32(vget_high_f32(_r89lz.val[0]), vget_high_f32(_rablz.val[0]));
    // column 2
    _r3l = vcombine_f32(vget_low_f32(_r01lz.val[1]), vget_low_f32(_r23lz.val[1]));
    _r3h = vcombine_f32(vget_low_f32(_r45lz.val[1]), vget_low_f32(_r67lz.val[1]));
    _r4l = vcombine_f32(vget_low_f32(_r89lz.val[1]), vget_low_f32(_rablz.val[1]));
    // column 3
    _r4h = vcombine_f32(vget_high_f32(_r01lz.val[1]), vget_high_f32(_r23lz.val[1]));
    _r5l = vcombine_f32(vget_high_f32(_r45lz.val[1]), vget_high_f32(_r67lz.val[1]));
    _r5h = vcombine_f32(vget_high_f32(_r89lz.val[1]), vget_high_f32(_rablz.val[1]));
    // column 4
    _r6l = vcombine_f32(vget_low_f32(_r01hz.val[0]), vget_low_f32(_r23hz.val[0]));
    _r6h = vcombine_f32(vget_low_f32(_r45hz.val[0]), vget_low_f32(_r67hz.val[0]));
    _r7l = vcombine_f32(vget_low_f32(_r89hz.val[0]), vget_low_f32(_rabhz.val[0]));
    // column 5
    _r7h = vcombine_f32(vget_high_f32(_r01hz.val[0]), vget_high_f32(_r23hz.val[0]));
    _r8l = vcombine_f32(vget_high_f32(_r45hz.val[0]), vget_high_f32(_r67hz.val[0]));
    _r8h = vcombine_f32(vget_high_f32(_r89hz.val[0]), vget_high_f32(_rabhz.val[0]));
    // column 6
    _r9l = vcombine_f32(vget_low_f32(_r01hz.val[1]), vget_low_f32(_r23hz.val[1]));
    _r9h = vcombine_f32(vget_low_f32(_r45hz.val[1]), vget_low_f32(_r67hz.val[1]));
    _ral = vcombine_f32(vget_low_f32(_r89hz.val[1]), vget_low_f32(_rabhz.val[1]));
    // column 7
    _rah = vcombine_f32(vget_high_f32(_r01hz.val[1]), vget_high_f32(_r23hz.val[1]));
    _rbl = vcombine_f32(vget_high_f32(_r45hz.val[1]), vget_high_f32(_r67hz.val[1]));
    _rbh = vcombine_f32(vget_high_f32(_r89hz.val[1]), vget_high_f32(_rabhz.val[1]));
}

// Transpose eight rows of twelve fp32 values in place. Row i is the triple
// (_ril, _rim, _rih), covering elements 0-3, 4-7 and 8-11 of that row.
// In parameter order (_r0l, _r0m, _r0h, _r1l, ... _r7h) the 24 outputs hold
// the twelve input columns back to back: column K occupies two consecutive
// vectors carrying its values from rows 0-3 and rows 4-7.
static inline void transpose12x8_ps(float32x4_t& _r0l, float32x4_t& _r0m, float32x4_t& _r0h,
                                    float32x4_t& _r1l, float32x4_t& _r1m, float32x4_t& _r1h,
                                    float32x4_t& _r2l, float32x4_t& _r2m, float32x4_t& _r2h,
                                    float32x4_t& _r3l, float32x4_t& _r3m, float32x4_t& _r3h,
                                    float32x4_t& _r4l, float32x4_t& _r4m, float32x4_t& _r4h,
                                    float32x4_t& _r5l, float32x4_t& _r5m, float32x4_t& _r5h,
                                    float32x4_t& _r6l, float32x4_t& _r6m, float32x4_t& _r6h,
                                    float32x4_t& _r7l, float32x4_t& _r7m, float32x4_t& _r7h)
{
    // interleave neighbouring rows, separately for the low, middle and high row thirds
    float32x4x2_t _r01lz = vzipq_f32(_r0l, _r1l);
    float32x4x2_t _r23lz = vzipq_f32(_r2l, _r3l);
    float32x4x2_t _r01mz = vzipq_f32(_r0m, _r1m);
    float32x4x2_t _r23mz = vzipq_f32(_r2m, _r3m);
    float32x4x2_t _r01hz = vzipq_f32(_r0h, _r1h);
    float32x4x2_t _r23hz = vzipq_f32(_r2h, _r3h);
    float32x4x2_t _r45lz = vzipq_f32(_r4l, _r5l);
    float32x4x2_t _r67lz = vzipq_f32(_r6l, _r7l);
    float32x4x2_t _r45mz = vzipq_f32(_r4m, _r5m);
    float32x4x2_t _r67mz = vzipq_f32(_r6m, _r7m);
    float32x4x2_t _r45hz = vzipq_f32(_r4h, _r5h);
    float32x4x2_t _r67hz = vzipq_f32(_r6h, _r7h);
    // columns 0-3 (from the low thirds), two vectors per column
    _r0l = vcombine_f32(vget_low_f32(_r01lz.val[0]), vget_low_f32(_r23lz.val[0]));
    _r0m = vcombine_f32(vget_low_f32(_r45lz.val[0]), vget_low_f32(_r67lz.val[0]));
    _r0h = vcombine_f32(vget_high_f32(_r01lz.val[0]), vget_high_f32(_r23lz.val[0]));
    _r1l = vcombine_f32(vget_high_f32(_r45lz.val[0]), vget_high_f32(_r67lz.val[0]));
    _r1m = vcombine_f32(vget_low_f32(_r01lz.val[1]), vget_low_f32(_r23lz.val[1]));
    _r1h = vcombine_f32(vget_low_f32(_r45lz.val[1]), vget_low_f32(_r67lz.val[1]));
    _r2l = vcombine_f32(vget_high_f32(_r01lz.val[1]), vget_high_f32(_r23lz.val[1]));
    _r2m = vcombine_f32(vget_high_f32(_r45lz.val[1]), vget_high_f32(_r67lz.val[1]));
    // columns 4-7 (from the middle thirds)
    _r2h = vcombine_f32(vget_low_f32(_r01mz.val[0]), vget_low_f32(_r23mz.val[0]));
    _r3l = vcombine_f32(vget_low_f32(_r45mz.val[0]), vget_low_f32(_r67mz.val[0]));
    _r3m = vcombine_f32(vget_high_f32(_r01mz.val[0]), vget_high_f32(_r23mz.val[0]));
    _r3h = vcombine_f32(vget_high_f32(_r45mz.val[0]), vget_high_f32(_r67mz.val[0]));
    _r4l = vcombine_f32(vget_low_f32(_r01mz.val[1]), vget_low_f32(_r23mz.val[1]));
    _r4m = vcombine_f32(vget_low_f32(_r45mz.val[1]), vget_low_f32(_r67mz.val[1]));
    _r4h = vcombine_f32(vget_high_f32(_r01mz.val[1]), vget_high_f32(_r23mz.val[1]));
    _r5l = vcombine_f32(vget_high_f32(_r45mz.val[1]), vget_high_f32(_r67mz.val[1]));
    // columns 8-11 (from the high thirds)
    _r5m = vcombine_f32(vget_low_f32(_r01hz.val[0]), vget_low_f32(_r23hz.val[0]));
    _r5h = vcombine_f32(vget_low_f32(_r45hz.val[0]), vget_low_f32(_r67hz.val[0]));
    _r6l = vcombine_f32(vget_high_f32(_r01hz.val[0]), vget_high_f32(_r23hz.val[0]));
    _r6m = vcombine_f32(vget_high_f32(_r45hz.val[0]), vget_high_f32(_r67hz.val[0]));
    _r6h = vcombine_f32(vget_low_f32(_r01hz.val[1]), vget_low_f32(_r23hz.val[1]));
    _r7l = vcombine_f32(vget_low_f32(_r45hz.val[1]), vget_low_f32(_r67hz.val[1]));
    _r7m = vcombine_f32(vget_high_f32(_r01hz.val[1]), vget_high_f32(_r23hz.val[1]));
    _r7h = vcombine_f32(vget_high_f32(_r45hz.val[1]), vget_high_f32(_r67hz.val[1]));
}

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// In-place transpose of a 4x4 fp16 tile held in four 4-lane registers.
// On return, _rN contains lane N of the original _r0, _r1, _r2, _r3 (in that order).
static inline void transpose4x4_ph(float16x4_t& _r0, float16x4_t& _r1, float16x4_t& _r2, float16x4_t& _r3)
{
    // the shuffles operate on raw bit patterns, so relabel the lanes as u16 first
    const uint16x4_t _u0 = vreinterpret_u16_f16(_r0);
    const uint16x4_t _u1 = vreinterpret_u16_f16(_r1);
    const uint16x4_t _u2 = vreinterpret_u16_f16(_r2);
    const uint16x4_t _u3 = vreinterpret_u16_f16(_r3);

    // stage 1: interleave the 16-bit lanes of each register pair
    const uint16x4x2_t _zip01 = vzip_u16(_u0, _u1);
    const uint16x4x2_t _zip23 = vzip_u16(_u2, _u3);

    // stage 2: interleave 32-bit lane pairs, which completes each output row
    const uint32x2x2_t _lanes01 = vzip_u32(vreinterpret_u32_u16(_zip01.val[0]), vreinterpret_u32_u16(_zip23.val[0]));
    const uint32x2x2_t _lanes23 = vzip_u32(vreinterpret_u32_u16(_zip01.val[1]), vreinterpret_u32_u16(_zip23.val[1]));

    _r0 = vreinterpret_f16_u32(_lanes01.val[0]);
    _r1 = vreinterpret_f16_u32(_lanes01.val[1]);
    _r2 = vreinterpret_f16_u32(_lanes23.val[0]);
    _r3 = vreinterpret_f16_u32(_lanes23.val[1]);
}

// In-place transpose of eight 4-lane fp16 registers.
// On return, each lane N of the inputs is spread over two registers:
//   (_r0,_r1) = lane 0 of the original _r0.._r3, then lane 0 of the original _r4.._r7
//   (_r2,_r3) = lane 1, (_r4,_r5) = lane 2, (_r6,_r7) = lane 3, same split.
static inline void transpose4x8_ph(float16x4_t& _r0, float16x4_t& _r1, float16x4_t& _r2, float16x4_t& _r3, float16x4_t& _r4, float16x4_t& _r5, float16x4_t& _r6, float16x4_t& _r7)
{
    // stage 1: interleave the 16-bit lanes of adjacent register pairs
    const uint16x4x2_t _zip01 = vzip_u16(vreinterpret_u16_f16(_r0), vreinterpret_u16_f16(_r1));
    const uint16x4x2_t _zip23 = vzip_u16(vreinterpret_u16_f16(_r2), vreinterpret_u16_f16(_r3));
    const uint16x4x2_t _zip45 = vzip_u16(vreinterpret_u16_f16(_r4), vreinterpret_u16_f16(_r5));
    const uint16x4x2_t _zip67 = vzip_u16(vreinterpret_u16_f16(_r6), vreinterpret_u16_f16(_r7));

    // stage 2: interleave 32-bit lane pairs; "front" covers inputs 0-3, "back" covers inputs 4-7
    const uint32x2x2_t _front_lanes01 = vzip_u32(vreinterpret_u32_u16(_zip01.val[0]), vreinterpret_u32_u16(_zip23.val[0]));
    const uint32x2x2_t _front_lanes23 = vzip_u32(vreinterpret_u32_u16(_zip01.val[1]), vreinterpret_u32_u16(_zip23.val[1]));
    const uint32x2x2_t _back_lanes01 = vzip_u32(vreinterpret_u32_u16(_zip45.val[0]), vreinterpret_u32_u16(_zip67.val[0]));
    const uint32x2x2_t _back_lanes23 = vzip_u32(vreinterpret_u32_u16(_zip45.val[1]), vreinterpret_u32_u16(_zip67.val[1]));

    // lane 0
    _r0 = vreinterpret_f16_u32(_front_lanes01.val[0]);
    _r1 = vreinterpret_f16_u32(_back_lanes01.val[0]);
    // lane 1
    _r2 = vreinterpret_f16_u32(_front_lanes01.val[1]);
    _r3 = vreinterpret_f16_u32(_back_lanes01.val[1]);
    // lane 2
    _r4 = vreinterpret_f16_u32(_front_lanes23.val[0]);
    _r5 = vreinterpret_f16_u32(_back_lanes23.val[0]);
    // lane 3
    _r6 = vreinterpret_f16_u32(_front_lanes23.val[1]);
    _r7 = vreinterpret_f16_u32(_back_lanes23.val[1]);
}

// In-place transpose of twelve 4-lane fp16 registers.
// On return, each lane N of the inputs is spread over three consecutive registers:
//   (_r0,_r1,_r2) = lane 0 of the original _r0.._r3, _r4.._r7, _r8.._rb
//   (_r3,_r4,_r5) = lane 1, (_r6,_r7,_r8) = lane 2, (_r9,_ra,_rb) = lane 3, same split.
static inline void transpose4x12_ph(float16x4_t& _r0, float16x4_t& _r1, float16x4_t& _r2, float16x4_t& _r3, float16x4_t& _r4, float16x4_t& _r5, float16x4_t& _r6, float16x4_t& _r7, float16x4_t& _r8, float16x4_t& _r9, float16x4_t& _ra, float16x4_t& _rb)
{
    // stage 1: interleave the 16-bit lanes of adjacent register pairs
    const uint16x4x2_t _zip01 = vzip_u16(vreinterpret_u16_f16(_r0), vreinterpret_u16_f16(_r1));
    const uint16x4x2_t _zip23 = vzip_u16(vreinterpret_u16_f16(_r2), vreinterpret_u16_f16(_r3));
    const uint16x4x2_t _zip45 = vzip_u16(vreinterpret_u16_f16(_r4), vreinterpret_u16_f16(_r5));
    const uint16x4x2_t _zip67 = vzip_u16(vreinterpret_u16_f16(_r6), vreinterpret_u16_f16(_r7));
    const uint16x4x2_t _zip89 = vzip_u16(vreinterpret_u16_f16(_r8), vreinterpret_u16_f16(_r9));
    const uint16x4x2_t _zipab = vzip_u16(vreinterpret_u16_f16(_ra), vreinterpret_u16_f16(_rb));

    // stage 2: interleave 32-bit lane pairs; g0 = inputs 0-3, g1 = inputs 4-7, g2 = inputs 8-b
    const uint32x2x2_t _g0_lanes01 = vzip_u32(vreinterpret_u32_u16(_zip01.val[0]), vreinterpret_u32_u16(_zip23.val[0]));
    const uint32x2x2_t _g0_lanes23 = vzip_u32(vreinterpret_u32_u16(_zip01.val[1]), vreinterpret_u32_u16(_zip23.val[1]));
    const uint32x2x2_t _g1_lanes01 = vzip_u32(vreinterpret_u32_u16(_zip45.val[0]), vreinterpret_u32_u16(_zip67.val[0]));
    const uint32x2x2_t _g1_lanes23 = vzip_u32(vreinterpret_u32_u16(_zip45.val[1]), vreinterpret_u32_u16(_zip67.val[1]));
    const uint32x2x2_t _g2_lanes01 = vzip_u32(vreinterpret_u32_u16(_zip89.val[0]), vreinterpret_u32_u16(_zipab.val[0]));
    const uint32x2x2_t _g2_lanes23 = vzip_u32(vreinterpret_u32_u16(_zip89.val[1]), vreinterpret_u32_u16(_zipab.val[1]));

    // lane 0
    _r0 = vreinterpret_f16_u32(_g0_lanes01.val[0]);
    _r1 = vreinterpret_f16_u32(_g1_lanes01.val[0]);
    _r2 = vreinterpret_f16_u32(_g2_lanes01.val[0]);
    // lane 1
    _r3 = vreinterpret_f16_u32(_g0_lanes01.val[1]);
    _r4 = vreinterpret_f16_u32(_g1_lanes01.val[1]);
    _r5 = vreinterpret_f16_u32(_g2_lanes01.val[1]);
    // lane 2
    _r6 = vreinterpret_f16_u32(_g0_lanes23.val[0]);
    _r7 = vreinterpret_f16_u32(_g1_lanes23.val[0]);
    _r8 = vreinterpret_f16_u32(_g2_lanes23.val[0]);
    // lane 3
    _r9 = vreinterpret_f16_u32(_g0_lanes23.val[1]);
    _ra = vreinterpret_f16_u32(_g1_lanes23.val[1]);
    _rb = vreinterpret_f16_u32(_g2_lanes23.val[1]);
}

// In-place transpose of four 8-lane fp16 registers.
// On return, each register packs two input lanes, four values apiece:
//   _r0 = lane 0 then lane 1 of the original _r0.._r3
//   _r1 = lanes 2 and 3, _r2 = lanes 4 and 5, _r3 = lanes 6 and 7.
static inline void transpose8x4_ph(float16x8_t& _r0, float16x8_t& _r1, float16x8_t& _r2, float16x8_t& _r3)
{
    // the shuffles operate on raw bit patterns, so relabel the lanes as u16 first
    const uint16x8_t _u0 = vreinterpretq_u16_f16(_r0);
    const uint16x8_t _u1 = vreinterpretq_u16_f16(_r1);
    const uint16x8_t _u2 = vreinterpretq_u16_f16(_r2);
    const uint16x8_t _u3 = vreinterpretq_u16_f16(_r3);

    // stage 1: interleave the 16-bit lanes of each register pair
    const uint16x8x2_t _zip01 = vzipq_u16(_u0, _u1);
    const uint16x8x2_t _zip23 = vzipq_u16(_u2, _u3);

    // stage 2: interleave 32-bit lane pairs; val[0]/val[1] of each result are final rows
    const uint32x4x2_t _lanes0123 = vzipq_u32(vreinterpretq_u32_u16(_zip01.val[0]), vreinterpretq_u32_u16(_zip23.val[0]));
    const uint32x4x2_t _lanes4567 = vzipq_u32(vreinterpretq_u32_u16(_zip01.val[1]), vreinterpretq_u32_u16(_zip23.val[1]));

    _r0 = vreinterpretq_f16_u32(_lanes0123.val[0]);
    _r1 = vreinterpretq_f16_u32(_lanes0123.val[1]);
    _r2 = vreinterpretq_f16_u32(_lanes4567.val[0]);
    _r3 = vreinterpretq_f16_u32(_lanes4567.val[1]);
}

// In-place transpose of an 8x8 fp16 tile held in eight 8-lane registers.
// On return, _rN contains lane N of the original _r0.._r7 (in that order).
static inline void transpose8x8_ph(float16x8_t& _r0, float16x8_t& _r1, float16x8_t& _r2, float16x8_t& _r3, float16x8_t& _r4, float16x8_t& _r5, float16x8_t& _r6, float16x8_t& _r7)
{
    // stage 1: interleave the 16-bit lanes of adjacent register pairs
    uint16x8x2_t _r01t = vzipq_u16(vreinterpretq_u16_f16(_r0), vreinterpretq_u16_f16(_r1));
    uint16x8x2_t _r23t = vzipq_u16(vreinterpretq_u16_f16(_r2), vreinterpretq_u16_f16(_r3));
    uint16x8x2_t _r45t = vzipq_u16(vreinterpretq_u16_f16(_r4), vreinterpretq_u16_f16(_r5));
    uint16x8x2_t _r67t = vzipq_u16(vreinterpretq_u16_f16(_r6), vreinterpretq_u16_f16(_r7));
    // stage 2: interleave 32-bit lane pairs.
    // suffix _0 covers inputs 0-3, suffix _1 covers inputs 4-7;
    // _r01_x holds lanes 0,1 (val[0]) and 2,3 (val[1]); _r23_x holds lanes 4,5 and 6,7.
    // within each val[], the low half is the even lane and the high half the odd lane.
    uint32x4x2_t _r01_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[0]), vreinterpretq_u32_u16(_r23t.val[0]));
    uint32x4x2_t _r23_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[1]), vreinterpretq_u32_u16(_r23t.val[1]));
    uint32x4x2_t _r01_1 = vzipq_u32(vreinterpretq_u32_u16(_r45t.val[0]), vreinterpretq_u32_u16(_r67t.val[0]));
    uint32x4x2_t _r23_1 = vzipq_u32(vreinterpretq_u32_u16(_r45t.val[1]), vreinterpretq_u32_u16(_r67t.val[1]));
    // stage 3: glue the matching 64-bit halves of the two groups into one full output row
    _r0 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r01_0.val[0]), vget_low_u32(_r01_1.val[0])));
    _r1 = vreinterpretq_f16_u32(vcombine_u32(vget_high_u32(_r01_0.val[0]), vget_high_u32(_r01_1.val[0])));
    _r2 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r01_0.val[1]), vget_low_u32(_r01_1.val[1])));
    _r3 = vreinterpretq_f16_u32(vcombine_u32(vget_high_u32(_r01_0.val[1]), vget_high_u32(_r01_1.val[1])));
    _r4 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r23_0.val[0]), vget_low_u32(_r23_1.val[0])));
    _r5 = vreinterpretq_f16_u32(vcombine_u32(vget_high_u32(_r23_0.val[0]), vget_high_u32(_r23_1.val[0])));
    _r6 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r23_0.val[1]), vget_low_u32(_r23_1.val[1])));
    _r7 = vreinterpretq_f16_u32(vcombine_u32(vget_high_u32(_r23_0.val[1]), vget_high_u32(_r23_1.val[1])));
}

// In-place transpose of twelve 8-lane fp16 registers.
// The result is 8 rows of 12 values (row N = lane N of the original _r0.._rb), written
// back-to-back across the twelve registers, so every row occupies one and a half registers:
//   _r0 = row 0 [0..7], _r1 = row 0 [8..11] + row 1 [0..3], _r2 = row 1 [4..11], and so on.
static inline void transpose8x12_ph(float16x8_t& _r0, float16x8_t& _r1, float16x8_t& _r2, float16x8_t& _r3, float16x8_t& _r4, float16x8_t& _r5, float16x8_t& _r6, float16x8_t& _r7, float16x8_t& _r8, float16x8_t& _r9, float16x8_t& _ra, float16x8_t& _rb)
{
    // stage 1: interleave the 16-bit lanes of adjacent register pairs
    uint16x8x2_t _r01t = vzipq_u16(vreinterpretq_u16_f16(_r0), vreinterpretq_u16_f16(_r1));
    uint16x8x2_t _r23t = vzipq_u16(vreinterpretq_u16_f16(_r2), vreinterpretq_u16_f16(_r3));
    uint16x8x2_t _r45t = vzipq_u16(vreinterpretq_u16_f16(_r4), vreinterpretq_u16_f16(_r5));
    uint16x8x2_t _r67t = vzipq_u16(vreinterpretq_u16_f16(_r6), vreinterpretq_u16_f16(_r7));
    uint16x8x2_t _r89t = vzipq_u16(vreinterpretq_u16_f16(_r8), vreinterpretq_u16_f16(_r9));
    uint16x8x2_t _rabt = vzipq_u16(vreinterpretq_u16_f16(_ra), vreinterpretq_u16_f16(_rb));
    // stage 2: interleave 32-bit lane pairs.
    // suffix _0 covers inputs 0-3, _1 covers inputs 4-7, _2 covers inputs 8-b;
    // _r01_x holds lanes 0,1 (val[0]) and 2,3 (val[1]); _r23_x holds lanes 4,5 and 6,7.
    // within each val[], the low half is the even lane and the high half the odd lane.
    uint32x4x2_t _r01_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[0]), vreinterpretq_u32_u16(_r23t.val[0]));
    uint32x4x2_t _r23_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[1]), vreinterpretq_u32_u16(_r23t.val[1]));
    uint32x4x2_t _r01_1 = vzipq_u32(vreinterpretq_u32_u16(_r45t.val[0]), vreinterpretq_u32_u16(_r67t.val[0]));
    uint32x4x2_t _r23_1 = vzipq_u32(vreinterpretq_u32_u16(_r45t.val[1]), vreinterpretq_u32_u16(_r67t.val[1]));
    uint32x4x2_t _r01_2 = vzipq_u32(vreinterpretq_u32_u16(_r89t.val[0]), vreinterpretq_u32_u16(_rabt.val[0]));
    uint32x4x2_t _r23_2 = vzipq_u32(vreinterpretq_u32_u16(_r89t.val[1]), vreinterpretq_u32_u16(_rabt.val[1]));
    // stage 3: stitch 64-bit halves together; each group of three registers carries two rows
    // rows 0 and 1
    _r0 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r01_0.val[0]), vget_low_u32(_r01_1.val[0])));
    _r1 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r01_2.val[0]), vget_high_u32(_r01_0.val[0])));
    _r2 = vreinterpretq_f16_u32(vcombine_u32(vget_high_u32(_r01_1.val[0]), vget_high_u32(_r01_2.val[0])));
    // rows 2 and 3
    _r3 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r01_0.val[1]), vget_low_u32(_r01_1.val[1])));
    _r4 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r01_2.val[1]), vget_high_u32(_r01_0.val[1])));
    _r5 = vreinterpretq_f16_u32(vcombine_u32(vget_high_u32(_r01_1.val[1]), vget_high_u32(_r01_2.val[1])));
    // rows 4 and 5
    _r6 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r23_0.val[0]), vget_low_u32(_r23_1.val[0])));
    _r7 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r23_2.val[0]), vget_high_u32(_r23_0.val[0])));
    _r8 = vreinterpretq_f16_u32(vcombine_u32(vget_high_u32(_r23_1.val[0]), vget_high_u32(_r23_2.val[0])));
    // rows 6 and 7
    _r9 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r23_0.val[1]), vget_low_u32(_r23_1.val[1])));
    _ra = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r23_2.val[1]), vget_high_u32(_r23_0.val[1])));
    _rb = vreinterpretq_f16_u32(vcombine_u32(vget_high_u32(_r23_1.val[1]), vget_high_u32(_r23_2.val[1])));
}

// In-place transpose of four 12-value fp16 rows, each row split over three 4-lane
// registers (l = values 0-3, m = values 4-7, h = values 8-11).
// The result is twelve 4-value columns written in order to
// _r0l, _r0m, _r0h, _r1l, _r1m, _r1h, _r2l, _r2m, _r2h, _r3l, _r3m, _r3h;
// column K holds value K of the original rows 0, 1, 2, 3.
static inline void transpose12x4_ph(float16x4_t& _r0l, float16x4_t& _r0m, float16x4_t& _r0h,
                                    float16x4_t& _r1l, float16x4_t& _r1m, float16x4_t& _r1h,
                                    float16x4_t& _r2l, float16x4_t& _r2m, float16x4_t& _r2h,
                                    float16x4_t& _r3l, float16x4_t& _r3m, float16x4_t& _r3h)
{
    // stage 1: interleave the 16-bit lanes of rows (0,1) and rows (2,3), per 4-value segment
    const uint16x4x2_t _zl01 = vzip_u16(vreinterpret_u16_f16(_r0l), vreinterpret_u16_f16(_r1l));
    const uint16x4x2_t _zl23 = vzip_u16(vreinterpret_u16_f16(_r2l), vreinterpret_u16_f16(_r3l));
    const uint16x4x2_t _zm01 = vzip_u16(vreinterpret_u16_f16(_r0m), vreinterpret_u16_f16(_r1m));
    const uint16x4x2_t _zm23 = vzip_u16(vreinterpret_u16_f16(_r2m), vreinterpret_u16_f16(_r3m));
    const uint16x4x2_t _zh01 = vzip_u16(vreinterpret_u16_f16(_r0h), vreinterpret_u16_f16(_r1h));
    const uint16x4x2_t _zh23 = vzip_u16(vreinterpret_u16_f16(_r2h), vreinterpret_u16_f16(_r3h));

    // stage 2: interleave 32-bit lane pairs; every val[] is one finished column
    const uint32x2x2_t _cols01 = vzip_u32(vreinterpret_u32_u16(_zl01.val[0]), vreinterpret_u32_u16(_zl23.val[0]));
    const uint32x2x2_t _cols23 = vzip_u32(vreinterpret_u32_u16(_zl01.val[1]), vreinterpret_u32_u16(_zl23.val[1]));
    const uint32x2x2_t _cols45 = vzip_u32(vreinterpret_u32_u16(_zm01.val[0]), vreinterpret_u32_u16(_zm23.val[0]));
    const uint32x2x2_t _cols67 = vzip_u32(vreinterpret_u32_u16(_zm01.val[1]), vreinterpret_u32_u16(_zm23.val[1]));
    const uint32x2x2_t _cols89 = vzip_u32(vreinterpret_u32_u16(_zh01.val[0]), vreinterpret_u32_u16(_zh23.val[0]));
    const uint32x2x2_t _colsab = vzip_u32(vreinterpret_u32_u16(_zh01.val[1]), vreinterpret_u32_u16(_zh23.val[1]));

    // columns 0-3 (from the l segment)
    _r0l = vreinterpret_f16_u32(_cols01.val[0]);
    _r0m = vreinterpret_f16_u32(_cols01.val[1]);
    _r0h = vreinterpret_f16_u32(_cols23.val[0]);
    _r1l = vreinterpret_f16_u32(_cols23.val[1]);
    // columns 4-7 (from the m segment)
    _r1m = vreinterpret_f16_u32(_cols45.val[0]);
    _r1h = vreinterpret_f16_u32(_cols45.val[1]);
    _r2l = vreinterpret_f16_u32(_cols67.val[0]);
    _r2m = vreinterpret_f16_u32(_cols67.val[1]);
    // columns 8-11 (from the h segment)
    _r2h = vreinterpret_f16_u32(_cols89.val[0]);
    _r3l = vreinterpret_f16_u32(_cols89.val[1]);
    _r3m = vreinterpret_f16_u32(_colsab.val[0]);
    _r3h = vreinterpret_f16_u32(_colsab.val[1]);
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#endif // __aarch64__
#endif // __ARM_NEON

#endif // ARM_USABILITY_H


================================================
FILE: src/layer/arm/batchnorm_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "batchnorm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

// Advertise which storage layouts this ARM implementation can consume,
// based on the features the build was configured with.
BatchNorm_arm::BatchNorm_arm()
{
#if NCNN_BF16
    // bf16 storage is offered whenever bf16 support is compiled in, NEON or not
    support_bf16_storage = true;
#endif // NCNN_BF16

#if __ARM_NEON
    // packed (elempack > 1) layouts are only handled by the NEON kernels
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage additionally depends on what cpu_support_arm_asimdhp() reports at runtime
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82
}

// Apply the affine transform x = b_data[k] * x + a_data[k] to every element of the blob, in place.
// k is the element index for 1-D blobs, the row index for 2-D blobs and the channel index
// for 3-D / 4-D blobs. Blobs with 16-bit elements are handed to the fp16 / bf16 variants
// when those are compiled in and enabled through opt. The fp32 path always returns 0.
int BatchNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int elembits = bottom_top_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        // fp16 storage: use the fp16-arithmetic kernel only when the option requests it
        if (!opt.use_fp16_arithmetic)
            return forward_inplace_fp16s(bottom_top_blob, opt);

        return forward_inplace_fp16sa(bottom_top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (elembits == 16 && opt.use_bf16_storage)
        return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

    const int dims = bottom_top_blob.dims;
    const int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
    if (elempack == 4)
    {
        // packed layout: each index k owns four consecutive coefficients in a_data / b_data
        if (dims == 1)
        {
            const int count = bottom_top_blob.w;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int k = 0; k < count; k++)
            {
                const int offset = k * 4;
                float* outptr = (float*)bottom_top_blob + offset;

                const float32x4_t _shift = vld1q_f32((const float*)a_data + offset);
                const float32x4_t _scale = vld1q_f32((const float*)b_data + offset);

                vst1q_f32(outptr, vmlaq_f32(_shift, vld1q_f32(outptr), _scale));
            }
        }
        else if (dims == 2)
        {
            const int cols = bottom_top_blob.w;
            const int rows = bottom_top_blob.h;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < rows; y++)
            {
                const float32x4_t _shift = vld1q_f32((const float*)a_data + y * 4);
                const float32x4_t _scale = vld1q_f32((const float*)b_data + y * 4);

                float* rowptr = bottom_top_blob.row(y);

                for (int x = 0; x < cols; x++)
                {
                    vst1q_f32(rowptr, vmlaq_f32(_shift, vld1q_f32(rowptr), _scale));
                    rowptr += 4;
                }
            }
        }
        else if (dims == 3 || dims == 4)
        {
            const int channels = bottom_top_blob.c;
            const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float32x4_t _shift = vld1q_f32((const float*)a_data + q * 4);
                const float32x4_t _scale = vld1q_f32((const float*)b_data + q * 4);

                float* chptr = bottom_top_blob.channel(q);

                for (int k = 0; k < size; k++)
                {
                    vst1q_f32(chptr, vmlaq_f32(_shift, vld1q_f32(chptr), _scale));
                    chptr += 4;
                }
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    // unpacked layout: one coefficient pair per index
    if (dims == 1)
    {
        const int count = bottom_top_blob.w;

        float* data = bottom_top_blob;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int k = 0; k < count; k++)
        {
            data[k] = b_data[k] * data[k] + a_data[k];
        }
    }
    else if (dims == 2)
    {
        const int cols = bottom_top_blob.w;
        const int rows = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < rows; y++)
        {
            float* rowptr = bottom_top_blob.row(y);

            const float shift = a_data[y];
            const float scale = b_data[y];

            int x = 0;
#if __ARM_NEON
            const float32x4_t _shift = vdupq_n_f32(shift);
            const float32x4_t _scale = vdupq_n_f32(scale);

            // four elements per step; the scalar loop below mops up the tail
            for (; x + 3 < cols; x += 4)
            {
                vst1q_f32(rowptr, vmlaq_f32(_shift, vld1q_f32(rowptr), _scale));
                rowptr += 4;
            }
#endif // __ARM_NEON
            for (; x < cols; x++)
            {
                *rowptr = scale * *rowptr + shift;
                rowptr++;
            }
        }
    }
    else if (dims == 3 || dims == 4)
    {
        const int channels = bottom_top_blob.c;
        const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* chptr = bottom_top_blob.channel(q);

            const float shift = a_data[q];
            const float scale = b_data[q];

            int k = 0;
#if __ARM_NEON
            const float32x4_t _shift = vdupq_n_f32(shift);
            const float32x4_t _scale = vdupq_n_f32(scale);

            // 16 elements per step
            for (; k + 15 < size; k += 16)
            {
                const float32x4_t _v0 = vmlaq_f32(_shift, vld1q_f32(chptr), _scale);
                const float32x4_t _v1 = vmlaq_f32(_shift, vld1q_f32(chptr + 4), _scale);
                const float32x4_t _v2 = vmlaq_f32(_shift, vld1q_f32(chptr + 8), _scale);
                const float32x4_t _v3 = vmlaq_f32(_shift, vld1q_f32(chptr + 12), _scale);
                vst1q_f32(chptr, _v0);
                vst1q_f32(chptr + 4, _v1);
                vst1q_f32(chptr + 8, _v2);
                vst1q_f32(chptr + 12, _v3);
                chptr += 16;
            }
            // 8 elements per step
            for (; k + 7 < size; k += 8)
            {
                const float32x4_t _v0 = vmlaq_f32(_shift, vld1q_f32(chptr), _scale);
                const float32x4_t _v1 = vmlaq_f32(_shift, vld1q_f32(chptr + 4), _scale);
                vst1q_f32(chptr, _v0);
                vst1q_f32(chptr + 4, _v1);
                chptr += 8;
            }
            // 4 elements per step
            for (; k + 3 < size; k += 4)
            {
                vst1q_f32(chptr, vmlaq_f32(_shift, vld1q_f32(chptr), _scale));
                chptr += 4;
            }
#endif // __ARM_NEON
            // scalar tail (and the whole channel when NEON is unavailable)
            for (; k < size; k++)
            {
                *chptr = scale * *chptr + shift;
                chptr++;
            }
        }
    }

    return 0;
}

#if NCNN_BF16
// bf16-storage variant of the in-place transform x = b_data[k] * x + a_data[k].
// Elements are stored as 16-bit bfloat values (unsigned short); each one is widened to
// fp32, transformed with the fp32 coefficients and narrowed back. Always returns 0.
int BatchNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int dims = bottom_top_blob.dims;
    const int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
    if (elempack == 4)
    {
        // packed layout: each index k owns four consecutive coefficients in a_data / b_data
        if (dims == 1)
        {
            const int count = bottom_top_blob.w;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int k = 0; k < count; k++)
            {
                const int offset = k * 4;
                unsigned short* outptr = (unsigned short*)bottom_top_blob + offset;

                const float32x4_t _shift = vld1q_f32((const float*)a_data + offset);
                const float32x4_t _scale = vld1q_f32((const float*)b_data + offset);

                const float32x4_t _v = vmlaq_f32(_shift, bfloat2float(vld1_u16(outptr)), _scale);
                vst1_u16(outptr, float2bfloat(_v));
            }
        }
        else if (dims == 2)
        {
            const int cols = bottom_top_blob.w;
            const int rows = bottom_top_blob.h;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < rows; y++)
            {
                const float32x4_t _shift = vld1q_f32((const float*)a_data + y * 4);
                const float32x4_t _scale = vld1q_f32((const float*)b_data + y * 4);

                unsigned short* rowptr = bottom_top_blob.row<unsigned short>(y);

                for (int x = 0; x < cols; x++)
                {
                    const float32x4_t _v = vmlaq_f32(_shift, bfloat2float(vld1_u16(rowptr)), _scale);
                    vst1_u16(rowptr, float2bfloat(_v));
                    rowptr += 4;
                }
            }
        }
        else if (dims == 3 || dims == 4)
        {
            const int channels = bottom_top_blob.c;
            const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float32x4_t _shift = vld1q_f32((const float*)a_data + q * 4);
                const float32x4_t _scale = vld1q_f32((const float*)b_data + q * 4);

                unsigned short* chptr = bottom_top_blob.channel(q);

                for (int k = 0; k < size; k++)
                {
                    const float32x4_t _v = vmlaq_f32(_shift, bfloat2float(vld1_u16(chptr)), _scale);
                    vst1_u16(chptr, float2bfloat(_v));
                    chptr += 4;
                }
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    // unpacked layout: one coefficient pair per index
    if (dims == 1)
    {
        const int count = bottom_top_blob.w;

        unsigned short* data = bottom_top_blob;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int k = 0; k < count; k++)
        {
            data[k] = float32_to_bfloat16(b_data[k] * bfloat16_to_float32(data[k]) + a_data[k]);
        }
    }
    else if (dims == 2)
    {
        const int cols = bottom_top_blob.w;
        const int rows = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < rows; y++)
        {
            unsigned short* rowptr = bottom_top_blob.row<unsigned short>(y);

            const float shift = a_data[y];
            const float scale = b_data[y];

            int x = 0;
#if __ARM_NEON
            const float32x4_t _shift = vdupq_n_f32(shift);
            const float32x4_t _scale = vdupq_n_f32(scale);

            // four elements per step; the scalar loop below mops up the tail
            for (; x + 3 < cols; x += 4)
            {
                const float32x4_t _v = vmlaq_f32(_shift, bfloat2float(vld1_u16(rowptr)), _scale);
                vst1_u16(rowptr, float2bfloat(_v));
                rowptr += 4;
            }
#endif // __ARM_NEON
            for (; x < cols; x++)
            {
                *rowptr = float32_to_bfloat16(scale * bfloat16_to_float32(*rowptr) + shift);
                rowptr++;
            }
        }
    }
    else if (dims == 3 || dims == 4)
    {
        const int channels = bottom_top_blob.c;
        const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned short* chptr = bottom_top_blob.channel(q);

            const float shift = a_data[q];
            const float scale = b_data[q];

            int k = 0;
#if __ARM_NEON
            const float32x4_t _shift = vdupq_n_f32(shift);
            const float32x4_t _scale = vdupq_n_f32(scale);

            // four elements per step; the scalar loop below mops up the tail
            for (; k + 3 < size; k += 4)
            {
                const float32x4_t _v = vmlaq_f32(_shift, bfloat2float(vld1_u16(chptr)), _scale);
                vst1_u16(chptr, float2bfloat(_v));
                chptr += 4;
            }
#endif // __ARM_NEON
            for (; k < size; k++)
            {
                *chptr = float32_to_bfloat16(scale * bfloat16_to_float32(*chptr) + shift);
                chptr++;
            }
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/batchnorm_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_BATCHNORM_ARM_H
#define LAYER_BATCHNORM_ARM_H

#include "batchnorm.h"

namespace ncnn {

// ARM-specific BatchNorm layer. Overrides forward_inplace and declares the
// reduced-precision kernels, each of which exists only when its build option is enabled.
class BatchNorm_arm : public BatchNorm
{
public:
    // Sets the support_* capability flags according to the compiled-in features.
    BatchNorm_arm();

    // Applies x = b_data[k] * x + a_data[k] in place; dispatches to the
    // fp16 / bf16 variants below for blobs with 16-bit elements.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 storage, arithmetic carried out in fp32
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    // fp16 storage, arithmetic carried out in fp16
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 storage, arithmetic carried out in fp32
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_BATCHNORM_ARM_H


================================================
FILE: src/layer/arm/batchnorm_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "batchnorm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// fp16-storage / fp32-arithmetic variant of the in-place transform x = b_data[k] * x + a_data[k].
// Every element is widened from __fp16 to fp32, transformed with the fp32 coefficients
// and narrowed back to __fp16. Handles elempack 4 and elempack 1. Always returns 0.
int BatchNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int dims = bottom_top_blob.dims;
    const int elempack = bottom_top_blob.elempack;

    if (elempack == 4)
    {
        // packed layout: each index k owns four consecutive coefficients in a_data / b_data
        if (dims == 1)
        {
            const int count = bottom_top_blob.w;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int k = 0; k < count; k++)
            {
                const int offset = k * 4;
                __fp16* outptr = (__fp16*)bottom_top_blob + offset;

                const float32x4_t _shift = vld1q_f32((const float*)a_data + offset);
                const float32x4_t _scale = vld1q_f32((const float*)b_data + offset);

                const float32x4_t _v = vfmaq_f32(_shift, vcvt_f32_f16(vld1_f16(outptr)), _scale);
                vst1_f16(outptr, vcvt_f16_f32(_v));
            }
        }
        else if (dims == 2)
        {
            const int cols = bottom_top_blob.w;
            const int rows = bottom_top_blob.h;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < rows; y++)
            {
                const float32x4_t _shift = vld1q_f32((const float*)a_data + y * 4);
                const float32x4_t _scale = vld1q_f32((const float*)b_data + y * 4);

                __fp16* rowptr = bottom_top_blob.row<__fp16>(y);

                for (int x = 0; x < cols; x++)
                {
                    const float32x4_t _v = vfmaq_f32(_shift, vcvt_f32_f16(vld1_f16(rowptr)), _scale);
                    vst1_f16(rowptr, vcvt_f16_f32(_v));
                    rowptr += 4;
                }
            }
        }
        else if (dims == 3 || dims == 4)
        {
            const int channels = bottom_top_blob.c;
            const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float32x4_t _shift = vld1q_f32((const float*)a_data + q * 4);
                const float32x4_t _scale = vld1q_f32((const float*)b_data + q * 4);

                __fp16* chptr = bottom_top_blob.channel(q);

                for (int k = 0; k < size; k++)
                {
                    const float32x4_t _v = vfmaq_f32(_shift, vcvt_f32_f16(vld1_f16(chptr)), _scale);
                    vst1_f16(chptr, vcvt_f16_f32(_v));
                    chptr += 4;
                }
            }
        }

        return 0;
    }

    // unpacked layout: one coefficient pair per index
    if (dims == 1)
    {
        const int count = bottom_top_blob.w;

        __fp16* data = bottom_top_blob;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int k = 0; k < count; k++)
        {
            data[k] = b_data[k] * (float)data[k] + a_data[k];
        }
    }
    else if (dims == 2)
    {
        const int cols = bottom_top_blob.w;
        const int rows = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < rows; y++)
        {
            __fp16* rowptr = bottom_top_blob.row<__fp16>(y);

            const float shift = a_data[y];
            const float scale = b_data[y];

            const float32x4_t _shift = vdupq_n_f32(shift);
            const float32x4_t _scale = vdupq_n_f32(scale);

            // four elements per step; the scalar loop below mops up the tail
            int x = 0;
            for (; x + 3 < cols; x += 4)
            {
                const float32x4_t _v = vfmaq_f32(_shift, vcvt_f32_f16(vld1_f16(rowptr)), _scale);
                vst1_f16(rowptr, vcvt_f16_f32(_v));
                rowptr += 4;
            }
            for (; x < cols; x++)
            {
                *rowptr = scale * (float)*rowptr + shift;
                rowptr++;
            }
        }
    }
    else if (dims == 3 || dims == 4)
    {
        const int channels = bottom_top_blob.c;
        const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* chptr = bottom_top_blob.channel(q);

            const float shift = a_data[q];
            const float scale = b_data[q];

            const float32x4_t _shift = vdupq_n_f32(shift);
            const float32x4_t _scale = vdupq_n_f32(scale);

            // four elements per step; the scalar loop below mops up the tail
            int k = 0;
            for (; k + 3 < size; k += 4)
            {
                const float32x4_t _v = vfmaq_f32(_shift, vcvt_f32_f16(vld1_f16(chptr)), _scale);
                vst1_f16(chptr, vcvt_f16_f32(_v));
                chptr += 4;
            }
            for (; k < size; k++)
            {
                *chptr = scale * (float)*chptr + shift;
                chptr++;
            }
        }
    }

    return 0;
}

// fp16-storage / fp16-arithmetic variant of the in-place transform x = b_data[k] * x + a_data[k].
// The fp32 coefficients are narrowed to fp16 first and the fused multiply-add is carried
// out on fp16 vectors. Handles elempack 8, 4 and 1. Always returns 0.
int BatchNorm_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
{
    const int dims = bottom_top_blob.dims;
    const int elempack = bottom_top_blob.elempack;

    if (elempack == 8)
    {
        // each index k owns eight consecutive fp32 coefficients, narrowed four at a time
        if (dims == 1)
        {
            const int count = bottom_top_blob.w;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int k = 0; k < count; k++)
            {
                __fp16* outptr = (__fp16*)bottom_top_blob + k * 8;

                const float* shiftptr = (const float*)a_data + k * 8;
                const float* scaleptr = (const float*)b_data + k * 8;

                const float16x8_t _shift = vcombine_f16(vcvt_f16_f32(vld1q_f32(shiftptr)), vcvt_f16_f32(vld1q_f32(shiftptr + 4)));
                const float16x8_t _scale = vcombine_f16(vcvt_f16_f32(vld1q_f32(scaleptr)), vcvt_f16_f32(vld1q_f32(scaleptr + 4)));

                vst1q_f16(outptr, vfmaq_f16(_shift, vld1q_f16(outptr), _scale));
            }
        }
        else if (dims == 2)
        {
            const int cols = bottom_top_blob.w;
            const int rows = bottom_top_blob.h;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < rows; y++)
            {
                const float* shiftptr = (const float*)a_data + y * 8;
                const float* scaleptr = (const float*)b_data + y * 8;

                const float16x8_t _shift = vcombine_f16(vcvt_f16_f32(vld1q_f32(shiftptr)), vcvt_f16_f32(vld1q_f32(shiftptr + 4)));
                const float16x8_t _scale = vcombine_f16(vcvt_f16_f32(vld1q_f32(scaleptr)), vcvt_f16_f32(vld1q_f32(scaleptr + 4)));

                __fp16* rowptr = bottom_top_blob.row<__fp16>(y);

                for (int x = 0; x < cols; x++)
                {
                    vst1q_f16(rowptr, vfmaq_f16(_shift, vld1q_f16(rowptr), _scale));
                    rowptr += 8;
                }
            }
        }
        else if (dims == 3 || dims == 4)
        {
            const int channels = bottom_top_blob.c;
            const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* shiftptr = (const float*)a_data + q * 8;
                const float* scaleptr = (const float*)b_data + q * 8;

                const float16x8_t _shift = vcombine_f16(vcvt_f16_f32(vld1q_f32(shiftptr)), vcvt_f16_f32(vld1q_f32(shiftptr + 4)));
                const float16x8_t _scale = vcombine_f16(vcvt_f16_f32(vld1q_f32(scaleptr)), vcvt_f16_f32(vld1q_f32(scaleptr + 4)));

                __fp16* chptr = bottom_top_blob.channel(q);

                for (int k = 0; k < size; k++)
                {
                    vst1q_f16(chptr, vfmaq_f16(_shift, vld1q_f16(chptr), _scale));
                    chptr += 8;
                }
            }
        }

        return 0;
    }

    if (elempack == 4)
    {
        // each index k owns four consecutive fp32 coefficients
        if (dims == 1)
        {
            const int count = bottom_top_blob.w;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int k = 0; k < count; k++)
            {
                const int offset = k * 4;
                __fp16* outptr = (__fp16*)bottom_top_blob + offset;

                const float16x4_t _shift = vcvt_f16_f32(vld1q_f32((const float*)a_data + offset));
                const float16x4_t _scale = vcvt_f16_f32(vld1q_f32((const float*)b_data + offset));

                vst1_f16(outptr, vfma_f16(_shift, vld1_f16(outptr), _scale));
            }
        }
        else if (dims == 2)
        {
            const int cols = bottom_top_blob.w;
            const int rows = bottom_top_blob.h;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < rows; y++)
            {
                const float16x4_t _shift = vcvt_f16_f32(vld1q_f32((const float*)a_data + y * 4));
                const float16x4_t _scale = vcvt_f16_f32(vld1q_f32((const float*)b_data + y * 4));

                __fp16* rowptr = bottom_top_blob.row<__fp16>(y);

                for (int x = 0; x < cols; x++)
                {
                    vst1_f16(rowptr, vfma_f16(_shift, vld1_f16(rowptr), _scale));
                    rowptr += 4;
                }
            }
        }
        else if (dims == 3 || dims == 4)
        {
            const int channels = bottom_top_blob.c;
            const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float16x4_t _shift = vcvt_f16_f32(vld1q_f32((const float*)a_data + q * 4));
                const float16x4_t _scale = vcvt_f16_f32(vld1q_f32((const float*)b_data + q * 4));

                __fp16* chptr = bottom_top_blob.channel(q);

                for (int k = 0; k < size; k++)
                {
                    vst1_f16(chptr, vfma_f16(_shift, vld1_f16(chptr), _scale));
                    chptr += 4;
                }
            }
        }

        return 0;
    }

    // unpacked layout: one coefficient pair per index
    if (dims == 1)
    {
        const int count = bottom_top_blob.w;

        __fp16* data = bottom_top_blob;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int k = 0; k < count; k++)
        {
            data[k] = (__fp16)b_data[k] * data[k] + (__fp16)a_data[k];
        }
    }
    else if (dims == 2)
    {
        const int cols = bottom_top_blob.w;
        const int rows = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < rows; y++)
        {
            __fp16* rowptr = bottom_top_blob.row<__fp16>(y);

            __fp16 shift = (__fp16)a_data[y];
            __fp16 scale = (__fp16)b_data[y];

            float16x4_t _shift = vdup_n_f16(shift);
            // NOTE(review): MSVC (non-clang) builds broadcast the scale via fp32 dup + convert
            // instead of vdup_n_f16; the reason is not visible here - keep both routes as they are
#if defined(_MSC_VER) && !defined(__clang__)
            float16x4_t _scale = vcvt_f16_f32(vdupq_n_f32(b_data[y]));
#else
            float16x4_t _scale = vdup_n_f16(scale);
#endif

            // four elements per step; the scalar loop below mops up the tail
            int x = 0;
            for (; x + 3 < cols; x += 4)
            {
                vst1_f16(rowptr, vfma_f16(_shift, vld1_f16(rowptr), _scale));
                rowptr += 4;
            }
            for (; x < cols; x++)
            {
                *rowptr = scale * *rowptr + shift;
                rowptr++;
            }
        }
    }
    else if (dims == 3 || dims == 4)
    {
        const int channels = bottom_top_blob.c;
        const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* chptr = bottom_top_blob.channel(q);

            __fp16 shift = (__fp16)a_data[q];
            __fp16 scale = (__fp16)b_data[q];

            float16x4_t _shift = vdup_n_f16(shift);
            // NOTE(review): same MSVC-only broadcast route as in the 2-D branch
#if defined(_MSC_VER) && !defined(__clang__)
            float16x4_t _scale = vcvt_f16_f32(vdupq_n_f32(b_data[q]));
#else
            float16x4_t _scale = vdup_n_f16(scale);
#endif

            // four elements per step; the scalar loop below mops up the tail
            int k = 0;
            for (; k + 3 < size; k += 4)
            {
                vst1_f16(chptr, vfma_f16(_shift, vld1_f16(chptr), _scale));
                chptr += 4;
            }
            for (; k < size; k++)
            {
                *chptr = scale * *chptr + shift;
                chptr++;
            }
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/bias_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "bias_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

// Adds the per-channel value bias_data[q] to every element of channel q of
// bottom_top_blob, in place. Always returns 0.
int Bias_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

    const float* bias_ptr = bias_data;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
        const float bias = bias_ptr[q];

        int i = 0;
#if __ARM_NEON
        // four floats per step while a full vector remains
        const float32x4_t _bias = vdupq_n_f32(bias);
        for (; i + 3 < size; i += 4)
        {
            vst1q_f32(ptr, vaddq_f32(vld1q_f32(ptr), _bias));
            ptr += 4;
        }
#endif // __ARM_NEON

        // leftover elements (everything, when NEON is unavailable)
        for (; i < size; i++)
        {
            *ptr += bias;
            ptr++;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/bias_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_BIAS_ARM_H
#define LAYER_BIAS_ARM_H

#include "bias.h"

namespace ncnn {

// ARM build of the Bias layer. Derives from Bias and declares its own
// forward_inplace; nothing else is added here.
class Bias_arm : public Bias
{
public:
    // Processes bottom_top_blob in place using the given options and returns
    // an int result code.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_BIAS_ARM_H


================================================
FILE: src/layer/arm/binaryop_arm.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "binaryop_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

// Sets the capability flags of the ARM BinaryOp implementation according to
// the build configuration and, for fp16, a runtime CPU query.
BinaryOp_arm::BinaryOp_arm()
{
#if __ARM_NEON
    // packed layouts are only enabled for NEON builds
    support_packing = true;
#if NCNN_ARM82
    // fp16 storage follows what cpu_support_arm_asimdhp() reports at runtime
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif
#endif // __ARM_NEON

#if NCNN_BF16
    // bf16 storage is enabled whenever NCNN_BF16 is set, with or without NEON
    support_bf16_storage = true;
#endif
}

// Element-wise kernel: outptr[i] = Op(ptr[i], ptr1[i]) for i in [0, size).
template<typename Op>
static void binary_op_vector_no_broadcast(const float* ptr, const float* ptr1, float* outptr, int size)
{
    const Op op;

    int k = 0;
#if __ARM_NEON
    // four lanes per step while a full vector is left
    while (k + 3 < size)
    {
        const float32x4_t _lhs = vld1q_f32(ptr + k);
        const float32x4_t _rhs = vld1q_f32(ptr1 + k);
        vst1q_f32(outptr + k, op(_lhs, _rhs));
        k += 4;
    }
#endif // __ARM_NEON
    // scalar tail
    while (k < size)
    {
        outptr[k] = op(ptr[k], ptr1[k]);
        k++;
    }
}

// Kernel with a broadcast right-hand side: outptr[i] = Op(ptr[i], b).
// The scalar path always uses the first value at ptr1; the NEON path uses the
// four values at ptr1 when elempack == 4 and a splat of the first otherwise.
template<typename Op>
static void binary_op_vector_broadcast_b(const float* ptr, const float* ptr1, float* outptr, int size, int elempack)
{
    const Op op;

    const float b = *ptr1;

    int k = 0;
#if __ARM_NEON
    float32x4_t _rhs;
    if (elempack == 4)
        _rhs = vld1q_f32(ptr1);
    else
        _rhs = vdupq_n_f32(b);

    while (k + 3 < size)
    {
        const float32x4_t _lhs = vld1q_f32(ptr + k);
        vst1q_f32(outptr + k, op(_lhs, _rhs));
        k += 4;
    }
#endif // __ARM_NEON
    // scalar tail
    while (k < size)
    {
        outptr[k] = op(ptr[k], b);
        k++;
    }
}

// Kernel with a broadcast left-hand side: outptr[i] = Op(a, ptr1[i]).
// The scalar path always uses the first value at ptr; the NEON path uses the
// four values at ptr when elempack == 4 and a splat of the first otherwise.
template<typename Op>
static void binary_op_vector_broadcast_a(const float* ptr, const float* ptr1, float* outptr, int size, int elempack)
{
    const Op op;

    const float a = *ptr;

    int k = 0;
#if __ARM_NEON
    float32x4_t _lhs;
    if (elempack == 4)
        _lhs = vld1q_f32(ptr);
    else
        _lhs = vdupq_n_f32(a);

    while (k + 3 < size)
    {
        const float32x4_t _rhs = vld1q_f32(ptr1 + k);
        vst1q_f32(outptr + k, op(_lhs, _rhs));
        k += 4;
    }
#endif // __ARM_NEON
    // scalar tail
    while (k < size)
    {
        outptr[k] = op(a, ptr1[k]);
        k++;
    }
}

// Packed a against unpacked b: for each of the w positions, the four floats of
// a at that position are combined with the single float of b at the same
// position (splatted to all lanes). Only does work in NEON builds with
// elempack == 4.
template<typename Op>
static void binary_op_vector_broadcast_pb(const float* ptr, const float* ptr1, float* outptr, int w, int elempack)
{
    const Op op;

#if __ARM_NEON
    if (elempack != 4)
        return;

    for (int x = 0; x < w; x++)
    {
        const float32x4_t _p = vld1q_f32(ptr + x * 4);
        const float32x4_t _b = vdupq_n_f32(ptr1[x]);
        vst1q_f32(outptr + x * 4, op(_p, _b));
    }
#endif // __ARM_NEON
}

// Packed a against a single unpacked b value: the first float at ptr1 is
// splatted and combined with w * elempack floats of a, four at a time.
// There is no scalar tail, and nothing is computed in non-NEON builds.
template<typename Op>
static void binary_op_vector_broadcast_pb_b(const float* ptr, const float* ptr1, float* outptr, int w, int elempack)
{
    const Op op;

    const int size = w * elempack;

    int k = 0;
#if __ARM_NEON
    const float32x4_t _rhs = vdupq_n_f32(*ptr1);
    while (k + 3 < size)
    {
        const float32x4_t _lhs = vld1q_f32(ptr + k);
        vst1q_f32(outptr + k, op(_lhs, _rhs));
        k += 4;
    }
#endif // __ARM_NEON
}

// A single packed a (four floats at ptr) against w unpacked b values: every
// output position gets Op(a, splat(ptr1[x])). Only does work in NEON builds
// with elempack == 4.
template<typename Op>
static void binary_op_vector_broadcast_pb_a(const float* ptr, const float* ptr1, float* outptr, int w, int elempack)
{
    const Op op;

#if __ARM_NEON
    if (elempack != 4)
        return;

    // the packed left operand is loaded once and reused for every position
    const float32x4_t _p = vld1q_f32(ptr);
    for (int x = 0; x < w; x++)
    {
        const float32x4_t _b = vdupq_n_f32(ptr1[x]);
        vst1q_f32(outptr + x * 4, op(_p, _b));
    }
#endif // __ARM_NEON
}

// Picks the row kernel matching the widths (aw, bw) and packings (ap, bp) of
// the two operands. Equal-packing cases are tested first, then the cases
// where b is unpacked (bp == 1). Any other combination does nothing.
template<typename Op>
static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr, int aw, int bw, int ap, int bp)
{
    const int w = std::max(aw, bw);
    const int elempack = std::max(ap, bp);
    const int size = w * elempack;

    const bool same_pack = (ap == bp);
    const bool same_width = (aw == bw);
    const bool b_unpacked = (bp == 1);

    if (same_pack && same_width)
    {
        // no broadcast
        binary_op_vector_no_broadcast<Op>(ptr, ptr1, outptr, size);
        return;
    }

    if (same_pack && bw == 1)
    {
        // broadcast single b
        binary_op_vector_broadcast_b<Op>(ptr, ptr1, outptr, size, elempack);
        return;
    }

    if (same_pack && aw == 1)
    {
        // broadcast single a
        binary_op_vector_broadcast_a<Op>(ptr, ptr1, outptr, size, elempack);
        return;
    }

    if (b_unpacked && same_width)
    {
        // broadcast pack1 b
        binary_op_vector_broadcast_pb<Op>(ptr, ptr1, outptr, w, elempack);
        return;
    }

    if (b_unpacked && bw == 1)
    {
        // broadcast pack1 single b
        binary_op_vector_broadcast_pb_b<Op>(ptr, ptr1, outptr, w, elempack);
        return;
    }

    if (b_unpacked && aw == 1)
    {
        // broadcast single a and pack1 b
        binary_op_vector_broadcast_pb_a<Op>(ptr, ptr1, outptr, w, elempack);
        return;
    }

    // shall never reach here
}

// Stateless functors, one per binary operation, used as the Op template
// argument of the binary_op_vector kernels.
namespace BinaryOp_arm_functor {

// MAKE_FUNCTION(NAME, IMPL, IMPL4) defines struct NAME with a scalar
// operator()(x, y) that returns IMPL. In NEON builds it also gets a
// float32x4_t overload that returns IMPL4; without NEON, IMPL4 is dropped.
#if __ARM_NEON
#define MAKE_FUNCTION(NAME, IMPL, IMPL4)                                         \
    struct NAME                                                                  \
    {                                                                            \
        float operator()(const float& x, const float& y) const                   \
        {                                                                        \
            return IMPL;                                                         \
        }                                                                        \
        float32x4_t operator()(const float32x4_t& x, const float32x4_t& y) const \
        {                                                                        \
            return IMPL4;                                                        \
        }                                                                        \
    };
#else
#define MAKE_FUNCTION(NAME, IMPL, IMPL4)                       \
    struct NAME                                                \
    {                                                          \
        float operator()(const float& x, const float& y) const \
        {                                                      \
            return IMPL;                                       \
        }                                                      \
    };
#endif

// The functors whose name starts with "r" (rsub, rdiv, rpow, ratan2, rfmod,
// rfloor_divide, rremainder) apply the same operation with x and y swapped.
// Vector division uses vdivq_f32 on aarch64 and div_ps elsewhere.
// clang-format off
// *INDENT-OFF*
MAKE_FUNCTION(binary_op_add, x + y, vaddq_f32(x, y))
MAKE_FUNCTION(binary_op_sub, x - y, vsubq_f32(x, y))
MAKE_FUNCTION(binary_op_mul, x * y, vmulq_f32(x, y))
#if __aarch64__
MAKE_FUNCTION(binary_op_div, x / y, vdivq_f32(x, y))
#else
MAKE_FUNCTION(binary_op_div, x / y, div_ps(x, y))
#endif
MAKE_FUNCTION(binary_op_max, std::max(x, y), vmaxq_f32(x, y))
MAKE_FUNCTION(binary_op_min, std::min(x, y), vminq_f32(x, y))
MAKE_FUNCTION(binary_op_pow, (float)powf(x, y), pow_ps(x, y))
MAKE_FUNCTION(binary_op_rsub, y - x, vsubq_f32(y, x))
#if __aarch64__
MAKE_FUNCTION(binary_op_rdiv, y / x, vdivq_f32(y, x))
#else
MAKE_FUNCTION(binary_op_rdiv, y / x, div_ps(y, x))
#endif
MAKE_FUNCTION(binary_op_rpow, (float)powf(y, x), pow_ps(y, x))
MAKE_FUNCTION(binary_op_atan2, atan2f(x, y), atan2_ps(x, y))
MAKE_FUNCTION(binary_op_ratan2, atan2f(y, x), atan2_ps(y, x))
MAKE_FUNCTION(binary_op_fmod, (float)fmodf(x, y), fmod_ps(x, y))
MAKE_FUNCTION(binary_op_rfmod, (float)fmodf(y, x), fmod_ps(y, x))
MAKE_FUNCTION(binary_op_logaddexp, (float)(std::max(x, y) + log1pf(expf(std::min(x, y) - std::max(x, y)))), logaddexp_ps(x, y))
MAKE_FUNCTION(binary_op_floor_divide, (float)floorf(x / y), floor_divide_ps(x, y))
MAKE_FUNCTION(binary_op_rfloor_divide, (float)floorf(y / x), floor_divide_ps(y, x))
MAKE_FUNCTION(binary_op_remainder, (float)remainderf(x, y), remainder_ps(x, y))
MAKE_FUNCTION(binary_op_rremainder, (float)remainderf(y, x), remainder_ps(y, x))
// *INDENT-ON*
// clang-format on

// the generator macro is local to this namespace block
#undef MAKE_FUNCTION

} // namespace BinaryOp_arm_functor

static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr, int aw, int bw, int ap, int bp, int op_type)
{
    using namespace BinaryOp_arm_functor;

    if (op_type == BinaryOp::Operation_ADD) return binary_op_vector<binary_op_add>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_SUB) return binary_op_vector<binary_op_sub>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_MUL) return binary_op_vector<binary_op_mul>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_DIV) return binary_op_vector<binary_op_div>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_MAX) return binary_op_vector<binary_op_max>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_MIN) return binary_op_vector<binary_op_min>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_POW) return binary_op_vector<binary_op_pow>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RSUB) return binary_op_vector<binary_op_rsub>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RDIV) return binary_op_vector<binary_op_rdiv>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector<binary_op_rpow>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector<binary_op_atan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector<binary_op_ratan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_FMOD) return binary_op_vector<binary_op_fmod>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RFMOD) return binary_op_vector<binary_op_rfmod>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_LOGADDEXP) return binary_op_vector<binary_op_logaddexp>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_FLOOR_DIVIDE) return binary_op_vector<binary_op_floor_divide>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RFLOOR_DIVIDE) return binary_op_vector<binary_op_rfloor_divide>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector<binary_op_remainder>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RREMAINDER) return binary_op_vector<binary_op_rremainder>(ptr, ptr1, outptr, aw, bw, ap, bp);

    // should never reach here
}

// c = a (op) b for a single float b. Each channel of a is handled as one flat
// run of w * h * d * elempack floats with a broadcast right-hand side.
static void binary_op_scalar(const Mat& a, float b, Mat& c, int op_type, const Option& opt)
{
    const int channels = a.c;
    const int size = a.w * a.h * a.d * a.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        const float* src = a.channel(q);
        float* dst = c.channel(q);

        // bw == 1 with both packings given as 1 selects the broadcast-b kernel
        binary_op_vector(src, &b, dst, size, 1, 1, 1, op_type);
    }
}

// c = a (op) b, channel by channel, with no broadcasting. Each channel is
// handled as one flat run of w * h * d * elempack floats, sized from a.
static void binary_op_no_broadcast(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt)
{
    const int channels = a.c;
    const int size = a.w * a.h * a.d * a.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        const float* lhs = a.channel(q);
        const float* rhs = b.channel(q);
        float* dst = c.channel(q);

        // equal widths and equal packings select the element-wise kernel
        binary_op_vector(lhs, rhs, dst, size, size, 1, 1, op_type);
    }
}

static void binary_op_broadcast(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt)
{
    if (b.w * b.h * b.d * b.c * b.elempack == 1)
    {
        return binary_op_scalar(a, b[0], c, op_type, opt);
    }

    if (a.dims == b.dims && a.w == b.w && a.h == b.h && a.d == b.d && a.c == b.c && a.elempack == b.elempack)
    {
        return binary_op_no_broadcast(a, b, c, op_type, opt);
    }

    const int dims = c.dims;

    if (dims == 2)
    {
        const int h = c.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            const int y0 = std::min(y, a.h - 1);
            const int y1 = std::min(y, b.h - 1);

            const float* ptr = a.row(y0);
            const float* ptr1 = b.row(y1);
            float* outptr = c.row(y);

            binary_op_vector(ptr, ptr1, outptr, a.w, b.w, a.elempack, b.elempack, op_type);
        }
    }

    if (dims == 3 || dims == 4)
    {
        const int channels = c.c;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const int q0 = std::min(q, a.c - 1);
            const int q1 = std::min(q, b.c - 1);

            if (b.d * b.h * b.w == 1)
            {
                const float* ptr = a.channel(q0);
                const float* ptr1 = b.channel(q1);
                float* outptr = c.channel(q);

                binary_op_vector(ptr, ptr1, outptr, a.w * a.h * a.d, 1, a.elempack, b.elempack, op_type);
                continue;
            }

            if (b.h * b.w == 1)
            {
                for (int z = 0; z < c.d; z++)
                {
                    const int z0 = std::min(z, a.d - 1);
                    const int z1 = std::min(z, b.d - 1);

                    const float* ptr = a.channel(q0).depth(z0);
                    const float* ptr1 = b.channel(q1).depth(z1);
                    float* outptr = c.channel(q).depth(z);

                    binary_op_vector(ptr, ptr1, outptr, a.w * a.h, 1, a.elempack, b.elempack, op_type);
                }
                continue;
            }

            for (int z = 0; z < c.d; z++)
            {
                const int z0 = std::min(z, a.d - 1);
                const int z1 = std::min(z, b.d - 1);

                for (int y = 0; y < c.h; y++)
                {
                    const int y0 = std::min(y, a.h - 1);
                    const int y1 = std::min(y, b.h - 1);

                    const float* ptr = a.channel(q0).depth(z0).row(y0);
                    const float* ptr1 = b.channel(q1).depth(z1).row(y1);
                    float* outptr = c.channel(q).depth(z).row(y);

                    binary_op_vector(ptr, ptr1, outptr, a.w, b.w, a.elempack, b.elempack, op_type);
                }
            }
        }
    }
}

// a = a (op) b for a single float b, written back into a. Each channel is
// handled as one flat run of w * h * d * elempack floats.
static void binary_op_scalar_inplace(Mat& a, float b, int op_type, const Option& opt)
{
    const int channels = a.c;
    const int size = a.w * a.h * a.d * a.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* data = a.channel(q);

        // source and destination are the same buffer
        binary_op_vector(data, &b, data, size, 1, 1, 1, op_type);
    }
}

// Maps an operation to the one that gives the same result with its operands
// swapped (SUB <-> RSUB, DIV <-> RDIV, ...). Operations without a reversed
// counterpart in the table are returned unchanged.
static int get_reverse_op_type(int op_type)
{
    switch (op_type)
    {
    // forward -> reversed
    case BinaryOp::Operation_SUB:
        return BinaryOp::Operation_RSUB;
    case BinaryOp::Operation_DIV:
        return BinaryOp::Operation_RDIV;
    case BinaryOp::Operation_POW:
        return BinaryOp::Operation_RPOW;
    case BinaryOp::Operation_ATAN2:
        return BinaryOp::Operation_RATAN2;
    case BinaryOp::Operation_FMOD:
        return BinaryOp::Operation_RFMOD;
    case BinaryOp::Operation_FLOOR_DIVIDE:
        return BinaryOp::Operation_RFLOOR_DIVIDE;
    case BinaryOp::Operation_REMAINDER:
        return BinaryOp::Operation_RREMAINDER;

    // reversed -> forward
    case BinaryOp::Operation_RSUB:
        return BinaryOp::Operation_SUB;
    case BinaryOp::Operation_RDIV:
        return BinaryOp::Operation_DIV;
    case BinaryOp::Operation_RPOW:
        return BinaryOp::Operation_POW;
    case BinaryOp::Operation_RATAN2:
        return BinaryOp::Operation_ATAN2;
    case BinaryOp::Operation_RFMOD:
        return BinaryOp::Operation_FMOD;
    case BinaryOp::Operation_RFLOOR_DIVIDE:
        return BinaryOp::Operation_FLOOR_DIVIDE;
    case BinaryOp::Operation_RREMAINDER:
        return BinaryOp::Operation_REMAINDER;

    default:
        return op_type;
    }
}

// Two-input binary operation: top_blobs[0] = bottom_blobs[0] (op) bottom_blobs[1].
//
// Steps:
//   1. hand 16-bit element inputs over to the fp16 / bf16 variants when those
//      are compiled in and enabled;
//   2. give the lower-rank input the same number of dims as the higher-rank
//      one (A2 / B2);
//   3. create the output with, per axis, the larger extent of A2 and B2;
//   4. run binary_op_broadcast.
//
// Returns 0 on success and -100 when the output blob is empty after creation.
int BinaryOp_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    int elembits = std::max(bottom_blobs[0].elembits(), bottom_blobs[1].elembits());

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_fp16s(bottom_blobs, top_blobs, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blobs, top_blobs, opt);
#endif

    const Mat& A = bottom_blobs[0];
    const Mat& B = bottom_blobs[1];
    const int outdims = std::max(A.dims, B.dims);

    // A2 / B2 start as copies of the inputs and are only modified when the
    // corresponding input has fewer dims than the output.
    Mat A2 = A;
    Mat B2 = B;
    if (A.dims < outdims)
    {
        // expand inner axes
        if (outdims == 2)
        {
            // A's element count equals B's (unpacked) h: place A along h with w = 1
            if (A.w * A.elempack == B.h * B.elempack)
                A2 = A.reshape(1, A.w, opt.workspace_allocator);
            else // if (A.w == B.w)
            {
                // otherwise keep A along w and describe it as unpacked:
                // w and cstep grow by elempack, elemsize shrinks by elempack
                A2.dims = 2;
                A2.w = A.w * A.elempack;
                A2.elempack = 1;
                A2.elemsize = A.elemsize / A.elempack;
                A2.cstep = A.cstep * A.elempack;
            }
        }
        if (outdims == 3 && A.dims == 1)
        {
            // A's element count equals B's (unpacked) c: place A along c
            if (A.w * A.elempack == B.c * B.elempack)
                A2 = A.reshape(1, 1, A.w, opt.workspace_allocator);
            else // if (A.w == B.w)
            {
                // same unpacked-along-w description as in the 2-D case
                A2.dims = 3;
                A2.w = A.w * A.elempack;
                A2.elempack = 1;
                A2.elemsize = A.elemsize / A.elempack;
                A2.cstep = A.cstep * A.elempack;
            }
        }
        if (outdims == 3 && A.dims == 2)
            A2 = A.reshape(1, A.w, A.h, opt.workspace_allocator);
        if (outdims == 4 && A.dims == 1)
        {
            // A's element count equals B's (unpacked) c: place A along c
            if (A.w * A.elempack == B.c * B.elempack)
                A2 = A.reshape(1, 1, 1, A.w, opt.workspace_allocator);
            else // if (A.w == B.w)
            {
                // same unpacked-along-w description as in the 2-D case
                A2.dims = 4;
                A2.w = A.w * A.elempack;
                A2.elempack = 1;
                A2.elemsize = A.elemsize / A.elempack;
                A2.cstep = A.cstep * A.elempack;
            }
        }
        if (outdims == 4 && A.dims == 2)
            A2 = A.reshape(1, 1, A.w, A.h, opt.workspace_allocator);
        if (outdims == 4 && A.dims == 3)
            A2 = A.reshape(1, A.w, A.h, A.c, opt.workspace_allocator);
    }
    // mirror image of the block above with the roles of A and B swapped
    if (B.dims < outdims)
    {
        // expand inner axes
        if (outdims == 2)
        {
            if (B.w * B.elempack == A.h * A.elempack)
                B2 = B.reshape(1, B.w, opt.workspace_allocator);
            else // if (B.w == A.w)
            {
                B2.dims = 2;
                B2.w = B.w * B.elempack;
                B2.elempack = 1;
                B2.elemsize = B.elemsize / B.elempack;
                B2.cstep = B.cstep * B.elempack;
            }
        }
        if (outdims == 3 && B.dims == 1)
        {
            if (B.w * B.elempack == A.c * A.elempack)
                B2 = B.reshape(1, 1, B.w, opt.workspace_allocator);
            else // if (B.w == A.w)
            {
                B2.dims = 3;
                B2.w = B.w * B.elempack;
                B2.elempack = 1;
                B2.elemsize = B.elemsize / B.elempack;
                B2.cstep = B.cstep * B.elempack;
            }
        }
        if (outdims == 3 && B.dims == 2)
            B2 = B.reshape(1, B.w, B.h, opt.workspace_allocator);
        if (outdims == 4 && B.dims == 1)
        {
            if (B.w * B.elempack == A.c * A.elempack)
                B2 = B.reshape(1, 1, 1, B.w, opt.workspace_allocator);
            else // if (B.w == A.w)
            {
                B2.dims = 4;
                B2.w = B.w * B.elempack;
                B2.elempack = 1;
                B2.elemsize = B.elemsize / B.elempack;
                B2.cstep = B.cstep * B.elempack;
            }
        }
        if (outdims == 4 && B.dims == 2)
            B2 = B.reshape(1, 1, B.w, B.h, opt.workspace_allocator);
        if (outdims == 4 && B.dims == 3)
            B2 = B.reshape(1, B.w, B.h, B.c, opt.workspace_allocator);
    }

    // output geometry: per-axis maximum of the two expanded operands
    const int outw = std::max(A2.w, B2.w);
    const int outh = std::max(A2.h, B2.h);
    const int outd = std::max(A2.d, B2.d);
    const int outc = std::max(A2.c, B2.c);
    const size_t out_elemsize = std::max(A2.elemsize, B2.elemsize);
    const int out_elempack = std::max(A2.elempack, B2.elempack);

    Mat& top_blob = top_blobs[0];
    if (outdims == 1)
    {
        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (outdims == 2)
    {
        top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (outdims == 3)
    {
        top_blob.create(outw, outh, outc, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (outdims == 4)
    {
        top_blob.create(outw, outh, outd, outc, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    // binary_op_broadcast is called with the operand of larger elempack first,
    // or, for equal elempack, the one whose element count is not smaller.
    // When that means swapping A2 and B2, the reversed operation is used.
    const bool a_pack_is_lower = A2.elempack < B2.elempack;
    const bool a_pack_is_equal = A2.elempack == B2.elempack;
    const bool a_size_is_lower = A2.w * A2.h * A2.d * A2.c * A2.elempack < B2.w * B2.h * B2.d * B2.c * B2.elempack;
    if (a_pack_is_lower || (a_pack_is_equal && a_size_is_lower))
    {
        binary_op_broadcast(B2, A2, top_blob, get_reverse_op_type(op_type), opt);
    }
    else
    {
        binary_op_broadcast(A2, B2, top_blob, op_type, opt);
    }

    return 0;
}

// In-place form: bottom_top_blob = bottom_top_blob (op) b, with b and op_type
// taken from the layer. 16-bit element blobs are handed to the fp16 / bf16
// variants when those are compiled in and enabled. The fp32 path returns 0.
int BinaryOp_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int elembits = bottom_top_blob.elembits();

#if NCNN_ARM82
    const bool take_fp16s_path = support_fp16_storage && opt.use_fp16_storage && elembits == 16;
    if (take_fp16s_path)
    {
        return forward_inplace_fp16s(bottom_top_blob, opt);
    }
#endif

#if NCNN_BF16
    const bool take_bf16s_path = opt.use_bf16_storage && elembits == 16;
    if (take_bf16s_path)
    {
        return forward_inplace_bf16s(bottom_top_blob, opt);
    }
#endif

    // plain fp32 path
    binary_op_scalar_inplace(bottom_top_blob, b, op_type, opt);

    return 0;
}

#if NCNN_BF16
// bf16 element-wise kernel: values are widened to float, combined with Op and
// narrowed back, outptr[i] = Op(ptr[i], ptr1[i]) for i in [0, size).
template<typename Op>
static void binary_op_vector_no_broadcast_bf16s(const unsigned short* ptr, const unsigned short* ptr1, unsigned short* outptr, int size)
{
    const Op op;

    int k = 0;
#if __ARM_NEON
    // four values per step while a full vector is left
    while (k + 3 < size)
    {
        const float32x4_t _lhs = bfloat2float(vld1_u16(ptr + k));
        const float32x4_t _rhs = bfloat2float(vld1_u16(ptr1 + k));
        vst1_u16(outptr + k, float2bfloat(op(_lhs, _rhs)));
        k += 4;
    }
#endif // __ARM_NEON
    // scalar tail
    while (k < size)
    {
        const float lhs = bfloat16_to_float32(ptr[k]);
        const float rhs = bfloat16_to_float32(ptr1[k]);
        outptr[k] = float32_to_bfloat16(op(lhs, rhs));
        k++;
    }
}

// bf16 kernel with a broadcast right-hand side: outptr[i] = Op(ptr[i], b).
// The scalar path always uses the first value at ptr1; the NEON path uses the
// four values at ptr1 when elempack == 4 and a splat of the first otherwise.
template<typename Op>
static void binary_op_vector_broadcast_b_bf16s(const unsigned short* ptr, const unsigned short* ptr1, unsigned short* outptr, int size, int elempack)
{
    const Op op;

    const float b = bfloat16_to_float32(*ptr1);

    int k = 0;
#if __ARM_NEON
    float32x4_t _rhs;
    if (elempack == 4)
        _rhs = bfloat2float(vld1_u16(ptr1));
    else
        _rhs = vdupq_n_f32(b);

    while (k + 3 < size)
    {
        const float32x4_t _lhs = bfloat2float(vld1_u16(ptr + k));
        vst1_u16(outptr + k, float2bfloat(op(_lhs, _rhs)));
        k += 4;
    }
#endif // __ARM_NEON
    // scalar tail
    while (k < size)
    {
        const float lhs = bfloat16_to_float32(ptr[k]);
        outptr[k] = float32_to_bfloat16(op(lhs, b));
        k++;
    }
}

// bf16 kernel with a broadcast left-hand side: outptr[i] = Op(a, ptr1[i]).
// The scalar path always uses the first value at ptr; the NEON path uses the
// four values at ptr when elempack == 4 and a splat of the first otherwise.
template<typename Op>
static void binary_op_vector_broadcast_a_bf16s(const unsigned short* ptr, const unsigned short* ptr1, unsigned short* outptr, int size, int elempack)
{
    const Op op;

    const float a = bfloat16_to_float32(*ptr);

    int k = 0;
#if __ARM_NEON
    float32x4_t _lhs;
    if (elempack == 4)
        _lhs = bfloat2float(vld1_u16(ptr));
    else
        _lhs = vdupq_n_f32(a);

    while (k + 3 < size)
    {
        const float32x4_t _rhs = bfloat2float(vld1_u16(ptr1 + k));
        vst1_u16(outptr + k, float2bfloat(op(_lhs, _rhs)));
        k += 4;
    }
#endif // __ARM_NEON
    // scalar tail
    while (k < size)
    {
        const float rhs = bfloat16_to_float32(ptr1[k]);
        outptr[k] = float32_to_bfloat16(op(a, rhs));
        k++;
    }
}

// bf16, packed a against unpacked b: for each of the w positions, the four
// values of a at that position are combined with the single value of b at the
// same position (replicated to all lanes). Only does work in NEON builds with
// elempack == 4.
template<typename Op>
static void binary_op_vector_broadcast_pb_bf16s(const unsigned short* ptr, const unsigned short* ptr1, unsigned short* outptr, int w, int elempack)
{
    const Op op;

#if __ARM_NEON
    if (elempack != 4)
        return;

    for (int x = 0; x < w; x++)
    {
        const float32x4_t _p = bfloat2float(vld1_u16(ptr + x * 4));
        const float32x4_t _b = bfloat2float(vdup_n_u16(ptr1[x]));
        vst1_u16(outptr + x * 4, float2bfloat(op(_p, _b)));
    }
#endif // __ARM_NEON
}

// bf16, packed a against a single unpacked b value: the first value at ptr1 is
// replicated and combined with w * elempack values of a, four at a time.
// There is no scalar tail, and nothing is computed in non-NEON builds.
template<typename Op>
static void binary_op_vector_broadcast_pb_b_bf16s(const unsigned short* ptr, const unsigned short* ptr1, unsigned short* outptr, int w, int elempack)
{
    const Op op;

    const int size = w * elempack;

    int k = 0;
#if __ARM_NEON
    const float32x4_t _rhs = bfloat2float(vdup_n_u16(*ptr1));
    while (k + 3 < size)
    {
        const float32x4_t _lhs = bfloat2float(vld1_u16(ptr + k));
        vst1_u16(outptr + k, float2bfloat(op(_lhs, _rhs)));
        k += 4;
    }
#endif // __ARM_NEON
}

// bf16, a single packed a (four values at ptr) against w unpacked b values:
// every output position gets Op(a, replicate(ptr1[x])). Only does work in NEON
// builds with elempack == 4.
template<typename Op>
static void binary_op_vector_broadcast_pb_a_bf16s(const unsigned short* ptr, const unsigned short* ptr1, unsigned short* outptr, int w, int elempack)
{
    const Op op;

#if __ARM_NEON
    if (elempack != 4)
        return;

    // the packed left operand is converted once and reused for every position
    const float32x4_t _p = bfloat2float(vld1_u16(ptr));
    for (int x = 0; x < w; x++)
    {
        const float32x4_t _b = bfloat2float(vdup_n_u16(ptr1[x]));
        vst1_u16(outptr + x * 4, float2bfloat(op(_p, _b)));
    }
#endif // __ARM_NEON
}

// bf16 counterpart of the row-kernel selector: picks the kernel matching the
// widths (aw, bw) and packings (ap, bp) of the two operands. Equal-packing
// cases are tested first, then the cases where b is unpacked (bp == 1). Any
// other combination does nothing.
template<typename Op>
static void binary_op_vector_bf16s(const unsigned short* ptr, const unsigned short* ptr1, unsigned short* outptr, int aw, int bw, int ap, int bp)
{
    const int w = std::max(aw, bw);
    const int elempack = std::max(ap, bp);
    const int size = w * elempack;

    const bool same_pack = (ap == bp);
    const bool same_width = (aw == bw);
    const bool b_unpacked = (bp == 1);

    if (same_pack && same_width)
    {
        // no broadcast
        binary_op_vector_no_broadcast_bf16s<Op>(ptr, ptr1, outptr, size);
        return;
    }

    if (same_pack && bw == 1)
    {
        // broadcast single b
        binary_op_vector_broadcast_b_bf16s<Op>(ptr, ptr1, outptr, size, elempack);
        return;
    }

    if (same_pack && aw == 1)
    {
        // broadcast single a
        binary_op_vector_broadcast_a_bf16s<Op>(ptr, ptr1, outptr, size, elempack);
        return;
    }

    if (b_unpacked && same_width)
    {
        // broadcast pack1 b
        binary_op_vector_broadcast_pb_bf16s<Op>(ptr, ptr1, outptr, w, elempack);
        return;
    }

    if (b_unpacked && bw == 1)
    {
        // broadcast pack1 single b
        binary_op_vector_broadcast_pb_b_bf16s<Op>(ptr, ptr1, outptr, w, elempack);
        return;
    }

    if (b_unpacked && aw == 1)
    {
        // broadcast single a and pack1 b
        binary_op_vector_broadcast_pb_a_bf16s<Op>(ptr, ptr1, outptr, w, elempack);
        return;
    }

    // shall never reach here
}

static void binary_op_vector_bf16s(const unsigned short* ptr, const unsigned short* ptr1, unsigned short* outptr, int aw, int bw, int ap, int bp, int op_type)
{
    using namespace BinaryOp_arm_functor;

    if (op_type == BinaryOp::Operation_ADD) return binary_op_vector_bf16s<binary_op_add>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_SUB) return binary_op_vector_bf16s<binary_op_sub>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_MUL) return binary_op_vector_bf16s<binary_op_mul>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_DIV) return binary_op_vector_bf16s<binary_op_div>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_MAX) return binary_op_vector_bf16s<binary_op_max>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_MIN) return binary_op_vector_bf16s<binary_op_min>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_POW) return binary_op_vector_bf16s<binary_op_pow>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RSUB) return binary_op_vector_bf16s<binary_op_rsub>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RDIV) return binary_op_vector_bf16s<binary_op_rdiv>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector_bf16s<binary_op_rpow>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector_bf16s<binary_op_atan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector_bf16s<binary_op_ratan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_FMOD) return binary_op_vector_bf16s<binary_op_fmod>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RFMOD) return binary_op_vector_bf16s<binary_op_rfmod>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_LOGADDEXP) return binary_op_vector_bf16s<binary_op_logaddexp>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_FLOOR_DIVIDE) return binary_op_vector_bf16s<binary_op_floor_divide>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RFLOOR_DIVIDE) return binary_op_vector_bf16s<binary_op_rfloor_divide>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector_bf16s<binary_op_remainder>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RREMAINDER) return binary_op_vector_bf16s<binary_op_rremainder>(ptr, ptr1, outptr, aw, bw, ap, bp);

    // should never reach here
}

// Computes outptr[i] = Op(ptr[i], b) for `size` bf16 values, where b is an
// fp32 scalar. Each input is widened to fp32, combined with b, and the result
// is narrowed back to bf16. ptr and outptr may be the same buffer: every
// element (or group of four) is read before it is written.
template<typename Op>
static void binary_op_vector_scalar_b_bf16s(const unsigned short* ptr, float b, unsigned short* outptr, int size)
{
    const Op op;

    int remain = size;
#if __ARM_NEON
    // vector part: four values per step against the duplicated scalar
    const float32x4_t _bv = vdupq_n_f32(b);
    while (remain >= 4)
    {
        const float32x4_t _x = bfloat2float(vld1_u16(ptr));
        const float32x4_t _res = op(_x, _bv);
        vst1_u16(outptr, float2bfloat(_res));
        ptr += 4;
        outptr += 4;
        remain -= 4;
    }
#endif // __ARM_NEON
    // scalar tail (the whole range when NEON is not compiled in)
    while (remain > 0)
    {
        *outptr = float32_to_bfloat16(op(bfloat16_to_float32(*ptr), b));
        ptr++;
        outptr++;
        remain--;
    }
}

static void binary_op_vector_scalar_b_bf16s(const unsigned short* ptr, float b, unsigned short* outptr, int size, int op_type)
{
    using namespace BinaryOp_arm_functor;

    if (op_type == BinaryOp::Operation_ADD) return binary_op_vector_scalar_b_bf16s<binary_op_add>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_SUB) return binary_op_vector_scalar_b_bf16s<binary_op_sub>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_MUL) return binary_op_vector_scalar_b_bf16s<binary_op_mul>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_DIV) return binary_op_vector_scalar_b_bf16s<binary_op_div>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_MAX) return binary_op_vector_scalar_b_bf16s<binary_op_max>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_MIN) return binary_op_vector_scalar_b_bf16s<binary_op_min>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_POW) return binary_op_vector_scalar_b_bf16s<binary_op_pow>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_RSUB) return binary_op_vector_scalar_b_bf16s<binary_op_rsub>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_RDIV) return binary_op_vector_scalar_b_bf16s<binary_op_rdiv>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector_scalar_b_bf16s<binary_op_rpow>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector_scalar_b_bf16s<binary_op_atan2>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector_scalar_b_bf16s<binary_op_ratan2>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_FMOD) return binary_op_vector_scalar_b_bf16s<binary_op_fmod>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_RFMOD) return binary_op_vector_scalar_b_bf16s<binary_op_rfmod>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_LOGADDEXP) return binary_op_vector_scalar_b_bf16s<binary_op_logaddexp>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_FLOOR_DIVIDE) return binary_op_vector_scalar_b_bf16s<binary_op_floor_divide>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_RFLOOR_DIVIDE) return binary_op_vector_scalar_b_bf16s<binary_op_rfloor_divide>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector_scalar_b_bf16s<binary_op_remainder>(ptr, b, outptr, size);
    if (op_type == BinaryOp::Operation_RREMAINDER) return binary_op_vector_scalar_b_bf16s<binary_op_rremainder>(ptr, b, outptr, size);

    // should never reach here
}

// c = a (op_type) b for a bf16 blob a and an fp32 scalar b. Channels are
// processed independently (in parallel when OpenMP is enabled); the extents
// of a decide how many channels and values are visited, c is only written.
static void binary_op_scalar_bf16s(const Mat& a, float b, Mat& c, int op_type, const Option& opt)
{
    // number of bf16 values per channel, packed lanes included
    const int values_per_channel = a.w * a.h * a.d * a.elempack;
    const int nchannels = a.c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int ch = 0; ch < nchannels; ch++)
    {
        const unsigned short* src = a.channel(ch);
        unsigned short* dst = c.channel(ch);

        binary_op_vector_scalar_b_bf16s(src, b, dst, values_per_channel, op_type);
    }
}

// c = a (op_type) b, element by element, with no broadcasting. Each channel
// is handed to the row dispatcher as a single pack-1 row of
// w * h * d * elempack values; the extents are taken from a.
static void binary_op_no_broadcast_bf16s(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt)
{
    const int values_per_channel = a.w * a.h * a.d * a.elempack;
    const int nchannels = a.c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int ch = 0; ch < nchannels; ch++)
    {
        const unsigned short* lhs = a.channel(ch);
        const unsigned short* rhs = b.channel(ch);
        unsigned short* dst = c.channel(ch);

        // equal widths and packs of 1 select the no-broadcast kernel
        binary_op_vector_bf16s(lhs, rhs, dst, values_per_channel, values_per_channel, 1, 1, op_type);
    }
}

// Applies op_type to the bf16 blobs a and b with broadcasting and writes the
// result into c. c is not allocated here; its dims/h/d/c fields drive the
// iteration, so the caller must have created it with the output shape.
//
// Dispatch order:
//   1. b holds exactly one value in total  -> scalar kernel
//   2. a and b have identical shape/pack   -> no-broadcast kernel
//   3. c.dims == 2                         -> one row at a time
//   4. c.dims == 3 or 4                    -> one channel at a time
// NOTE(review): there is no branch for c.dims == 1; a 1-D input pair that is
// neither case 1 nor case 2 falls through without writing c.
static void binary_op_broadcast_bf16s(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt)
{
    // single-value b: read it once as fp32 and use the vector-scalar path
    if (b.w * b.h * b.d * b.c * b.elempack == 1)
    {
        return binary_op_scalar_bf16s(a, bfloat16_to_float32(((const unsigned short*)b)[0]), c, op_type, opt);
    }

    // same rank, extents and elempack: plain element-wise operation
    if (a.dims == b.dims && a.w == b.w && a.h == b.h && a.d == b.d && a.c == b.c && a.elempack == b.elempack)
    {
        return binary_op_no_broadcast_bf16s(a, b, c, op_type, opt);
    }

    const int dims = c.dims;

    if (dims == 2)
    {
        const int h = c.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            // clamp the row index so an operand with fewer rows keeps
            // supplying its last row (row 0 when it has a single row)
            const int y0 = std::min(y, a.h - 1);
            const int y1 = std::min(y, b.h - 1);

            const unsigned short* ptr = a.row<const unsigned short>(y0);
            const unsigned short* ptr1 = b.row<const unsigned short>(y1);
            unsigned short* outptr = c.row<unsigned short>(y);

            // width / elempack differences within the row are resolved by the row dispatcher
            binary_op_vector_bf16s(ptr, ptr1, outptr, a.w, b.w, a.elempack, b.elempack, op_type);
        }
    }

    if (dims == 3 || dims == 4)
    {
        const int channels = c.c;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            // same clamping as for rows, applied to the channel index
            const int q0 = std::min(q, a.c - 1);
            const int q1 = std::min(q, b.c - 1);

            // b has a single (possibly packed) element per channel:
            // treat the whole channel of a as one row of w*h*d elements
            if (b.d * b.h * b.w == 1)
            {
                const unsigned short* ptr = a.channel(q0);
                const unsigned short* ptr1 = b.channel(q1);
                unsigned short* outptr = c.channel(q);

                binary_op_vector_bf16s(ptr, ptr1, outptr, a.w * a.h * a.d, 1, a.elempack, b.elempack, op_type);
                continue;
            }

            // b has a single element per depth slice:
            // treat each depth slice of a as one row of w*h elements
            if (b.h * b.w == 1)
            {
                for (int z = 0; z < c.d; z++)
                {
                    const int z0 = std::min(z, a.d - 1);
                    const int z1 = std::min(z, b.d - 1);

                    const unsigned short* ptr = a.channel(q0).depth(z0);
                    const unsigned short* ptr1 = b.channel(q1).depth(z1);
                    unsigned short* outptr = c.channel(q).depth(z);

                    binary_op_vector_bf16s(ptr, ptr1, outptr, a.w * a.h, 1, a.elempack, b.elempack, op_type);
                }
                continue;
            }

            // general case: walk depth and rows, clamping each index per operand
            for (int z = 0; z < c.d; z++)
            {
                const int z0 = std::min(z, a.d - 1);
                const int z1 = std::min(z, b.d - 1);

                for (int y = 0; y < c.h; y++)
                {
                    const int y0 = std::min(y, a.h - 1);
                    const int y1 = std::min(y, b.h - 1);

                    const unsigned short* ptr = a.channel(q0).depth(z0).row<const unsigned short>(y0);
                    const unsigned short* ptr1 = b.channel(q1).depth(z1).row<const unsigned short>(y1);
                    unsigned short* outptr = c.channel(q).depth(z).row<unsigned short>(y);

                    binary_op_vector_bf16s(ptr, ptr1, outptr, a.w, b.w, a.elempack, b.elempack, op_type);
                }
            }
        }
    }
}

// In-place variant of the scalar operation: every bf16 value of a is replaced
// by a (op_type) b, one channel at a time (in parallel under OpenMP).
static void binary_op_scalar_inplace_bf16s(Mat& a, float b, int op_type, const Option& opt)
{
    const int values_per_channel = a.w * a.h * a.d * a.elempack;
    const int nchannels = a.c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int ch = 0; ch < nchannels; ch++)
    {
        unsigned short* data = a.channel(ch);

        // source and destination are the same buffer
        binary_op_vector_scalar_b_bf16s(data, b, data, values_per_channel, op_type);
    }
}

// bf16-storage forward pass for two input blobs.
//
// bottom_blobs[0] (A) and bottom_blobs[1] (B) are the operands; the result is
// created in top_blobs[0] using opt.blob_allocator.
// Returns 0 on success, or -100 when the output blob is empty after create().
//
// Steps:
//   1. bring the lower-rank operand up to the output rank (A2 / B2)
//   2. allocate the output with the per-axis maximum extents
//   3. run the broadcast kernel, swapping operands when A2 is the "smaller" one
int BinaryOp_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& A = bottom_blobs[0];
    const Mat& B = bottom_blobs[1];
    // the output rank is the larger of the two input ranks
    const int outdims = std::max(A.dims, B.dims);

    // A2 / B2 start as copies of the inputs and are only changed when the
    // corresponding input has a lower rank than the output.
    // NOTE(review): presumably copying a Mat shares the underlying data — confirm against Mat.
    Mat A2 = A;
    Mat B2 = B;
    if (A.dims < outdims)
    {
        // expand inner axes
        if (outdims == 2)
        {
            // 1-D A whose unpacked length equals B's unpacked row count: make it a column (w = 1)
            if (A.w * A.elempack == B.h * B.elempack)
                A2 = A.reshape(1, A.w, opt.workspace_allocator);
            else // if (A.w == B.w)
            {
                // otherwise only the header fields are edited: A2 becomes a
                // pack-1 view with w = A.w * A.elempack and the new rank
                A2.dims = 2;
                A2.w = A.w * A.elempack;
                A2.elempack = 1;
                A2.elemsize = A.elemsize / A.elempack;
                A2.cstep = A.cstep * A.elempack;
            }
        }
        if (outdims == 3 && A.dims == 1)
        {
            // 1-D A whose unpacked length equals B's unpacked channel count: one value per channel
            if (A.w * A.elempack == B.c * B.elempack)
                A2 = A.reshape(1, 1, A.w, opt.workspace_allocator);
            else // if (A.w == B.w)
            {
                // header-only pack-1 view, as in the 2-D case
                A2.dims = 3;
                A2.w = A.w * A.elempack;
                A2.elempack = 1;
                A2.elemsize = A.elemsize / A.elempack;
                A2.cstep = A.cstep * A.elempack;
            }
        }
        // 2-D A into 3-D: prepend an innermost axis of extent 1
        if (outdims == 3 && A.dims == 2)
            A2 = A.reshape(1, A.w, A.h, opt.workspace_allocator);
        if (outdims == 4 && A.dims == 1)
        {
            if (A.w * A.elempack == B.c * B.elempack)
                A2 = A.reshape(1, 1, 1, A.w, opt.workspace_allocator);
            else // if (A.w == B.w)
            {
                // header-only pack-1 view, as in the 2-D case
                A2.dims = 4;
                A2.w = A.w * A.elempack;
                A2.elempack = 1;
                A2.elemsize = A.elemsize / A.elempack;
                A2.cstep = A.cstep * A.elempack;
            }
        }
        // 2-D / 3-D A into 4-D: prepend innermost axes of extent 1
        if (outdims == 4 && A.dims == 2)
            A2 = A.reshape(1, 1, A.w, A.h, opt.workspace_allocator);
        if (outdims == 4 && A.dims == 3)
            A2 = A.reshape(1, A.w, A.h, A.c, opt.workspace_allocator);
    }
    // mirror image of the block above with the roles of A and B exchanged
    if (B.dims < outdims)
    {
        // expand inner axes
        if (outdims == 2)
        {
            if (B.w * B.elempack == A.h * A.elempack)
                B2 = B.reshape(1, B.w, opt.workspace_allocator);
            else // if (B.w == A.w)
            {
                B2.dims = 2;
                B2.w = B.w * B.elempack;
                B2.elempack = 1;
                B2.elemsize = B.elemsize / B.elempack;
                B2.cstep = B.cstep * B.elempack;
            }
        }
        if (outdims == 3 && B.dims == 1)
        {
            if (B.w * B.elempack == A.c * A.elempack)
                B2 = B.reshape(1, 1, B.w, opt.workspace_allocator);
            else // if (B.w == A.w)
            {
                B2.dims = 3;
                B2.w = B.w * B.elempack;
                B2.elempack = 1;
                B2.elemsize = B.elemsize / B.elempack;
                B2.cstep = B.cstep * B.elempack;
            }
        }
        if (outdims == 3 && B.dims == 2)
            B2 = B.reshape(1, B.w, B.h, opt.workspace_allocator);
        if (outdims == 4 && B.dims == 1)
        {
            if (B.w * B.elempack == A.c * A.elempack)
                B2 = B.reshape(1, 1, 1, B.w, opt.workspace_allocator);
            else // if (B.w == A.w)
            {
                B2.dims = 4;
                B2.w = B.w * B.elempack;
                B2.elempack = 1;
                B2.elemsize = B.elemsize / B.elempack;
                B2.cstep = B.cstep * B.elempack;
            }
        }
        if (outdims == 4 && B.dims == 2)
            B2 = B.reshape(1, 1, B.w, B.h, opt.workspace_allocator);
        if (outdims == 4 && B.dims == 3)
            B2 = B.reshape(1, B.w, B.h, B.c, opt.workspace_allocator);
    }

    // every output extent (and the packing) is the larger of the two operands'
    const int outw = std::max(A2.w, B2.w);
    const int outh = std::max(A2.h, B2.h);
    const int outd = std::max(A2.d, B2.d);
    const int outc = std::max(A2.c, B2.c);
    const size_t out_elemsize = std::max(A2.elemsize, B2.elemsize);
    const int out_elempack = std::max(A2.elempack, B2.elempack);

    Mat& top_blob = top_blobs[0];
    if (outdims == 1)
    {
        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (outdims == 2)
    {
        top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (outdims == 3)
    {
        top_blob.create(outw, outh, outc, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (outdims == 4)
    {
        top_blob.create(outw, outh, outd, outc, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    // The broadcast kernel is called with the operand of larger elempack
    // first (at equal elempack: the one that does not have fewer elements).
    // When that requires swapping A2 and B2, op_type is passed through
    // get_reverse_op_type.
    // NOTE(review): presumably this maps e.g. SUB to RSUB so the result is unchanged — confirm.
    const bool a_pack_is_lower = A2.elempack < B2.elempack;
    const bool a_pack_is_equal = A2.elempack == B2.elempack;
    const bool a_size_is_lower = A2.w * A2.h * A2.d * A2.c * A2.elempack < B2.w * B2.h * B2.d * B2.c * B2.elempack;
    if (a_pack_is_lower || (a_pack_is_equal && a_size_is_lower))
    {
        binary_op_broadcast_bf16s(B2, A2, top_blob, get_reverse_op_type(op_type), opt);
    }
    else
    {
        binary_op_broadcast_bf16s(A2, B2, top_blob, op_type, opt);
    }

    return 0;
}

// bf16-storage in-place forward pass: applies op_type between every value of
// bottom_top_blob and the layer's scalar operand b, overwriting the blob.
// Always returns 0.
int BinaryOp_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    // b and op_type are members that are not declared in BinaryOp_arm itself
    binary_op_scalar_inplace_bf16s(bottom_top_blob, b, op_type, opt);

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/binaryop_arm.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_BINARYOP_ARM_H
#define LAYER_BINARYOP_ARM_H

#include "binaryop.h"

namespace ncnn {

// ARM-specific variant of the BinaryOp layer. It overrides the two forward
// entry points and declares reduced-precision helpers that are only compiled
// in when the corresponding build option is enabled.
class BinaryOp_arm : public BinaryOp
{
public:
    BinaryOp_arm();

    // Two-input form: operands come from bottom_blobs, results go to top_blobs.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

    // Single-blob form: bottom_top_blob is modified in place.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16-storage counterparts of forward / forward_inplace
    // NOTE(review): presumably selected by the public entry points based on the blob element type — confirm.
    int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16-storage counterparts of forward / forward_inplace
    int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_BINARYOP_ARM_H


================================================
FILE: src/layer/arm/binaryop_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "binaryop_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#include "arm_usability.h"
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Applies fmod_ps to four half-precision lanes: both operands are widened to
// fp32, combined there, and the result is narrowed back to fp16.
static inline float16x4_t fmod_f16(const float16x4_t& x, const float16x4_t& y)
{
    return vcvt_f16_f32(fmod_ps(vcvt_f32_f16(x), vcvt_f32_f16(y)));
}

// Eight-lane form of fmod_f16: the low and high halves are processed
// separately and recombined in the same order.
static inline float16x8_t fmodq_f16(const float16x8_t& x, const float16x8_t& y)
{
    const float16x4_t lo = fmod_f16(vget_low_f16(x), vget_low_f16(y));
    const float16x4_t hi = fmod_f16(vget_high_f16(x), vget_high_f16(y));
    return vcombine_f16(lo, hi);
}

// Applies round_ps to four half-precision lanes via an fp32 round trip.
static inline float16x4_t round_f16(const float16x4_t& x)
{
    const float32x4_t wide = vcvt_f32_f16(x);
    const float32x4_t rounded = round_ps(wide);
    return vcvt_f16_f32(rounded);
}

// Eight-lane form of round_f16: each half is rounded on its own and the two
// results are joined back together (low half first).
static inline float16x8_t roundq_f16(const float16x8_t& x)
{
    const float16x4_t lo = round_f16(vget_low_f16(x));
    const float16x4_t hi = round_f16(vget_high_f16(x));
    return vcombine_f16(lo, hi);
}

// Applies logaddexp_ps to four half-precision lanes via an fp32 round trip.
static inline float16x4_t logaddexp_f16(const float16x4_t& x, const float16x4_t& y)
{
    const float32x4_t fx = vcvt_f32_f16(x);
    const float32x4_t fy = vcvt_f32_f16(y);
    return vcvt_f16_f32(logaddexp_ps(fx, fy));
}

// Eight-lane form of logaddexp_f16, evaluated as two four-lane halves.
static inline float16x8_t logaddexpq_f16(const float16x8_t& x, const float16x8_t& y)
{
    const float16x4_t lo = logaddexp_f16(vget_low_f16(x), vget_low_f16(y));
    const float16x4_t hi = logaddexp_f16(vget_high_f16(x), vget_high_f16(y));
    return vcombine_f16(lo, hi);
}

// Applies floor_divide_ps to four half-precision lanes via an fp32 round trip.
static inline float16x4_t floor_divide_f16(const float16x4_t& x, const float16x4_t& y)
{
    const float32x4_t fx = vcvt_f32_f16(x);
    const float32x4_t fy = vcvt_f32_f16(y);
    return vcvt_f16_f32(floor_divide_ps(fx, fy));
}

// Eight-lane form of floor_divide_f16, evaluated as two four-lane halves.
static inline float16x8_t floor_divideq_f16(const float16x8_t& x, const float16x8_t& y)
{
    const float16x4_t lo = floor_divide_f16(vget_low_f16(x), vget_low_f16(y));
    const float16x4_t hi = floor_divide_f16(vget_high_f16(x), vget_high_f16(y));
    return vcombine_f16(lo, hi);
}

// Applies remainder_ps to four half-precision lanes via an fp32 round trip.
static inline float16x4_t remainder_f16(const float16x4_t& x, const float16x4_t& y)
{
    const float32x4_t fx = vcvt_f32_f16(x);
    const float32x4_t fy = vcvt_f32_f16(y);
    return vcvt_f16_f32(remainder_ps(fx, fy));
}

// Eight-lane form of remainder_f16, evaluated as two four-lane halves.
static inline float16x8_t remainderq_f16(const float16x8_t& x, const float16x8_t& y)
{
    const float16x4_t lo = remainder_f16(vget_low_f16(x), vget_low_f16(y));
    const float16x4_t hi = remainder_f16(vget_high_f16(x), vget_high_f16(y));
    return vcombine_f16(lo, hi);
}

template<typename Op>
static void binary_op_vector_no_broadcast_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int size)
{
    const Op op;

    int i = 0;
    for (; i + 7 < size; i += 8)
    {
        float16x8_t _p = vld1q_f16(ptr);
        float16x8_t _b = vld1q_f16(ptr1);
        float16x8_t _outp = op(_p, _b);
        vst1q_f16(outptr, _outp);
        ptr += 8;
        ptr1 += 8;
        outptr += 8;
    }
    for (; i + 3 < size; i += 4)
    {
        float16x4_t _p = vld1_f16(ptr);
        float16x4_t _b = vld1_f16(ptr1);
        float16x4_t _outp = op(_p, _b);
        vst1_f16(outptr, _outp);
        ptr += 4;
        ptr1 += 4;
        outptr += 4;
    }
    for (; i < size; i++)
    {
        *outptr = op(*ptr, *ptr1);
        ptr += 1;
        ptr1 += 1;
        outptr += 1;
    }
}

template<typename Op>
static void binary_op_vector_broadcast_b_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int size, int elempack)
{
    const Op op;

    const __fp16 b = *ptr1;

    int i = 0;
    float16x4_t _b_128 = (elempack == 4) ? vld1_f16(ptr1) : vdup_n_f16(b);
    float16x8_t _b_256 = (elempack == 8) ? vld1q_f16(ptr1) : vcombine_f16(_b_128, _b_128);
    for (; i + 7 < size; i += 8)
    {
        float16x8_t _p = vld1q_f16(ptr);
        float16x8_t _outp = op(_p, _b_256);
        vst1q_f16(outptr, _outp);
        ptr += 8;
        outptr += 8;
    }
    for (; i + 3 < size; i += 4)
    {
        float16x4_t _p = vld1_f16(ptr);
        float16x4_t _outp = op(_p, _b_128);
        vst1_f16(outptr, _outp);
        ptr += 4;
        outptr += 4;
    }
    for (; i < size; i++)
    {
        *outptr = op(*ptr, b);
        ptr += 1;
        outptr += 1;
    }
}

template<typename Op>
static void binary_op_vector_broadcast_a_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int size, int elempack)
{
    const Op op;

    const __fp16 a = *ptr;

    int i = 0;
    float16x4_t _a_128 = (elempack == 4) ? vld1_f16(ptr) : vdup_n_f16(a);
    float16x8_t _a_256 = (elempack == 8) ? vld1q_f16(ptr) : vcombine_f16(_a_128, _a_128);
    for (; i + 7 < size; i += 8)
    {
        float16x8_t _b = vld1q_f16(ptr1);
        float16x8_t _outp = op(_a_256, _b);
        vst1q_f16(outptr, _outp);
        ptr1 += 8;
        outptr += 8;
    }
    for (; i + 3 < size; i += 4)
    {
        float16x4_t _b = vld1_f16(ptr1);
        float16x4_t _outp = op(_a_128, _b);
        vst1_f16(outptr, _outp);
        ptr1 += 4;
        outptr += 4;
    }
    for (; i < size; i++)
    {
        *outptr = op(a, *ptr1);
        ptr1 += 1;
        outptr += 1;
    }
}

template<typename Op>
static void binary_op_vector_broadcast_pb_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int w, int elempack)
{
    const Op op;

    if (elempack == 8)
    {
        int i = 0;
        for (; i < w; i++)
        {
            float16x8_t _p = vld1q_f16(ptr);
            float16x8_t _b = vdupq_n_f16(*ptr1);
            float16x8_t _outp = op(_p, _b);
            vst1q_f16(outptr, _outp);
            ptr += 8;
            ptr1 += 1;
            outptr += 8;
        }
    }
    if (elempack == 4)
    {
        int i = 0;
        for (; i + 1 < w; i += 2)
        {
            float16x8_t _p = vld1q_f16(ptr);
            float16x4_t _b0 = vdup_n_f16(ptr1[0]);
            float16x4_t _b1 = vdup_n_f16(ptr1[1]);
            float16x8_t _b = vcombine_f16(_b0, _b1);
            float16x8_t _outp = op(_p, _b);
            vst1q_f16(outptr, _outp);
            ptr += 8;
            ptr1 += 2;
            outptr += 8;
        }
        for (; i < w; i++)
        {
            float16x4_t _p = vld1_f16(ptr);
            float16x4_t _b = vdup_n_f16(*ptr1);
            float16x4_t _outp = op(_p, _b);
            vst1_f16(outptr, _outp);
            ptr += 4;
            ptr1 += 1;
            outptr += 4;
        }
    }
}

template<typename Op>
static void binary_op_vector_broadcast_pb_b_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int w, int elempack)
{
    const Op op;

    const int size = w * elempack;

    int i = 0;
    float16x8_t _b = vdupq_n_f16(*ptr1);
    for (; i + 7 < size; i += 8)
    {
        float16x8_t _p = vld1q_f16(ptr);
        float16x8_t _outp = op(_p, _b);
        vst1q_f16(outptr, _outp);
        ptr += 8;
        outptr += 8;
    }
    for (; i + 3 < size; i += 4)
    {
        float16x4_t _p = vld1_f16(ptr);
        float16x4_t _outp = op(_p, vget_low_f16(_b));
        vst1_f16(outptr, _outp);
        ptr += 4;
        outptr += 4;
    }
}

template<typename Op>
static void binary_op_vector_broadcast_pb_a_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int w, int elempack)
{
    const Op op;

    if (elempack == 8)
    {
        int i = 0;
        float16x8_t _p = vld1q_f16(ptr);
        for (; i < w; i++)
        {
            float16x8_t _b = vdupq_n_f16(*ptr1);
            float16x8_t _outp = op(_p, _b);
            vst1q_f16(outptr, _outp);
            ptr1 += 1;
            outptr += 8;
        }
    }
    if (elempack == 4)
    {
        int i = 0;
        float16x4_t _p0 = vld1_f16(ptr);
        float16x8_t _p = vcombine_f16(_p0, _p0);
        for (; i + 1 < w; i += 2)
        {
            float16x4_t _b0 = vdup_n_f16(ptr1[0]);
            float16x4_t _b1 = vdup_n_f16(ptr1[1]);
            float16x8_t _b = vcombine_f16(_b0, _b1);
            float16x8_t _outp = op(_p, _b);
            vst1q_f16(outptr, _outp);
            ptr1 += 2;
            outptr += 8;
        }
        for (; i < w; i++)
        {
            float16x4_t _b = vdup_n_f16(*ptr1);
            float16x4_t _outp = op(_p0, _b);
            vst1_f16(outptr, _outp);
            ptr1 += 1;
            outptr += 4;
        }
    }
}

// Selects the fp16 row kernel matching the broadcast pattern of one row pair.
//   aw / bw : width of the a / b row
//   ap / bp : elempack of a / b
// Equal packs are tried first (no broadcast, single b, single a); afterwards
// the pack-1 b variants are tried. Any other combination does nothing.
template<typename Op>
static void binary_op_vector_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int aw, int bw, int ap, int bp)
{
    const int elempack = std::max(ap, bp);
    const int w = std::max(aw, bw);
    const int size = w * elempack;

    const bool same_width = (aw == bw);
    const bool single_b = (bw == 1);
    const bool single_a = (aw == 1);

    if (ap == bp)
    {
        if (same_width)
        {
            // no broadcast
            binary_op_vector_no_broadcast_fp16s<Op>(ptr, ptr1, outptr, size);
            return;
        }
        else if (single_b)
        {
            // broadcast single b
            binary_op_vector_broadcast_b_fp16s<Op>(ptr, ptr1, outptr, size, elempack);
            return;
        }
        else if (single_a)
        {
            // broadcast single a
            binary_op_vector_broadcast_a_fp16s<Op>(ptr, ptr1, outptr, size, elempack);
            return;
        }
    }

    if (bp == 1)
    {
        if (same_width)
        {
            // broadcast pack1 b
            binary_op_vector_broadcast_pb_fp16s<Op>(ptr, ptr1, outptr, w, elempack);
            return;
        }
        else if (single_b)
        {
            // broadcast pack1 single b
            binary_op_vector_broadcast_pb_b_fp16s<Op>(ptr, ptr1, outptr, w, elempack);
            return;
        }
        else if (single_a)
        {
            // broadcast single a and pack1 b
            binary_op_vector_broadcast_pb_a_fp16s<Op>(ptr, ptr1, outptr, w, elempack);
            return;
        }
    }

    // shall never reach here
}

// Functor types for the fp16 binary kernels. Every functor exposes three
// operator() overloads with the same meaning at different widths:
// scalar __fp16, four lanes (float16x4_t) and eight lanes (float16x8_t).
namespace BinaryOp_arm_functor {

// MAKE_FUNCTION(NAME, IMPL, IMPL4, IMPL8) defines struct NAME whose overloads
// return IMPL (scalar), IMPL4 (4-lane) and IMPL8 (8-lane). The expressions
// refer to the operator() parameters, which are always named x and y.
#define MAKE_FUNCTION(NAME, IMPL, IMPL4, IMPL8)                                  \
    struct NAME                                                                  \
    {                                                                            \
        __fp16 operator()(const __fp16& x, const __fp16& y) const                \
        {                                                                        \
            return IMPL;                                                         \
        }                                                                        \
        float16x4_t operator()(const float16x4_t& x, const float16x4_t& y) const \
        {                                                                        \
            return IMPL4;                                                        \
        }                                                                        \
        float16x8_t operator()(const float16x8_t& x, const float16x8_t& y) const \
        {                                                                        \
            return IMPL8;                                                        \
        }                                                                        \
    };

// add/sub/mul/div/max/min (and rsub/rdiv) use native fp16 arithmetic; the
// remaining vector forms go through fp32 (directly via the *_ps helpers or
// via the *_f16 wrappers). The r-prefixed functors evaluate the same
// operation with x and y exchanged.
// clang-format off
// *INDENT-OFF*
MAKE_FUNCTION(binary_op_add_fp16s, x + y, vadd_f16(x, y), vaddq_f16(x, y))
MAKE_FUNCTION(binary_op_sub_fp16s, x - y, vsub_f16(x, y), vsubq_f16(x, y))
MAKE_FUNCTION(binary_op_mul_fp16s, x * y, vmul_f16(x, y), vmulq_f16(x, y))
MAKE_FUNCTION(binary_op_div_fp16s, x / y, vdiv_f16(x, y), vdivq_f16(x, y))
MAKE_FUNCTION(binary_op_max_fp16s, std::max(x, y), vmax_f16(x, y), vmaxq_f16(x, y))
MAKE_FUNCTION(binary_op_min_fp16s, std::min(x, y), vmin_f16(x, y), vminq_f16(x, y))
MAKE_FUNCTION(binary_op_pow_fp16s, (__fp16)powf(x, y), vcvt_f16_f32(pow_ps(vcvt_f32_f16(x), vcvt_f32_f16(y))), vcombine_f16(vcvt_f16_f32(pow_ps(vcvt_f32_f16(vget_low_f16(x)), vcvt_f32_f16(vget_low_f16(y)))), vcvt_f16_f32(pow_ps(vcvt_f32_f16(vget_high_f16(x)), vcvt_f32_f16(vget_high_f16(y))))))
MAKE_FUNCTION(binary_op_rsub_fp16s, y - x, vsub_f16(y, x), vsubq_f16(y, x))
MAKE_FUNCTION(binary_op_rdiv_fp16s, y / x, vdiv_f16(y, x), vdivq_f16(y, x))
MAKE_FUNCTION(binary_op_rpow_fp16s, (__fp16)powf(y, x), vcvt_f16_f32(pow_ps(vcvt_f32_f16(y), vcvt_f32_f16(x))), vcombine_f16(vcvt_f16_f32(pow_ps(vcvt_f32_f16(vget_low_f16(y)), vcvt_f32_f16(vget_low_f16(x)))), vcvt_f16_f32(pow_ps(vcvt_f32_f16(vget_high_f16(y)), vcvt_f32_f16(vget_high_f16(x))))))
MAKE_FUNCTION(binary_op_atan2_fp16s, (__fp16)atan2f(x, y), vcvt_f16_f32(atan2_ps(vcvt_f32_f16(x), vcvt_f32_f16(y))), vcombine_f16(vcvt_f16_f32(atan2_ps(vcvt_f32_f16(vget_low_f16(x)), vcvt_f32_f16(vget_low_f16(y)))), vcvt_f16_f32(atan2_ps(vcvt_f32_f16(vget_high_f16(x)), vcvt_f32_f16(vget_high_f16(y))))))
MAKE_FUNCTION(binary_op_ratan2_fp16s, (__fp16)atan2f(y, x), vcvt_f16_f32(atan2_ps(vcvt_f32_f16(y), vcvt_f32_f16(x))), vcombine_f16(vcvt_f16_f32(atan2_ps(vcvt_f32_f16(vget_low_f16(y)), vcvt_f32_f16(vget_low_f16(x)))), vcvt_f16_f32(atan2_ps(vcvt_f32_f16(vget_high_f16(y)), vcvt_f32_f16(vget_high_f16(x))))))
MAKE_FUNCTION(binary_op_fmod_fp16s, (__fp16)fmodf((float)x, (float)y), fmod_f16(x, y), fmodq_f16(x, y))
MAKE_FUNCTION(binary_op_rfmod_fp16s, (__fp16)fmodf((float)y, (float)x), fmod_f16(y, x), fmodq_f16(y, x))
MAKE_FUNCTION(binary_op_logaddexp_fp16s, (__fp16)(std::max((float)x, (float)y) + log1pf(expf(std::min((float)x, (float)y) - std::max((float)x, (float)y)))), logaddexp_f16(x, y), logaddexpq_f16(x, y))
MAKE_FUNCTION(binary_op_floor_divide_fp16s, (__fp16)floorf((float)x / (float)y), floor_divide_f16(x, y), floor_divideq_f16(x, y))
MAKE_FUNCTION(binary_op_rfloor_divide_fp16s, (__fp16)floorf((float)y / (float)x), floor_divide_f16(y, x), floor_divideq_f16(y, x))
MAKE_FUNCTION(binary_op_remainder_fp16s, (__fp16)remainderf((float)x, (float)y), remainder_f16(x, y), remainderq_f16(x, y))
MAKE_FUNCTION(binary_op_rremainder_fp16s, (__fp16)remainderf((float)y, (float)x), remainder_f16(y, x), remainderq_f16(y, x))
// *INDENT-ON*
// clang-format on

// the helper macro is local to this namespace block
#undef MAKE_FUNCTION

} // namespace BinaryOp_arm_functor

static void binary_op_vector_fp16s(const __fp16* ptr, const __fp16* ptr1, __fp16* outptr, int aw, int bw, int ap, int bp, int op_type)
{
    using namespace BinaryOp_arm_functor;

    if (op_type == BinaryOp::Operation_ADD) return binary_op_vector_fp16s<binary_op_add_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_SUB) return binary_op_vector_fp16s<binary_op_sub_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_MUL) return binary_op_vector_fp16s<binary_op_mul_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_DIV) return binary_op_vector_fp16s<binary_op_div_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_MAX) return binary_op_vector_fp16s<binary_op_max_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_MIN) return binary_op_vector_fp16s<binary_op_min_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_POW) return binary_op_vector_fp16s<binary_op_pow_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RSUB) return binary_op_vector_fp16s<binary_op_rsub_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RDIV) return binary_op_vector_fp16s<binary_op_rdiv_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector_fp16s<binary_op_rpow_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector_fp16s<binary_op_atan2_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector_fp16s<binary_op_ratan2_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_FMOD) return binary_op_vector_fp16s<binary_op_fmod_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RFMOD) return binary_op_vector_fp16s<binary_op_rfmod_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_LOGADDEXP) return binary_op_vector_fp16s<binary_op_logaddexp_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_FLOOR_DIVIDE) return binary_op_vector_fp16s<binary_op_floor_divide_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RFLOOR_DIVIDE) return binary_op_vector_fp16s<binary_op_rfloor_divide_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector_fp16s<binary_op_remainder_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RREMAINDER) return binary_op_vector_fp16s<binary_op_rremainder_fp16s>(ptr, ptr1, outptr, aw, bw, ap, bp);

    // should never reach here
}

static void binary_op_scalar_fp16s(const Mat& a, __fp16 b, Mat& c, int op_type, const Option& opt)
{
    const int channels = a.c;
    const int size = a.w * a.h * a.d * a.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        const __fp16* ptr = a.channel(q);
        __fp16* outptr = c.channel(q);

        binary_op_vector_fp16s(ptr, &b, outptr, size, 1, 1, 1, op_type);
    }
}

static void binary_op_no_broadcast_fp16s(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt)
{
    const int channels = a.c;
    const int size = a.w * a.h * a.d * a.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        const __fp16* ptr = a.channel(q);
        const __fp16* ptr1 = b.channel(q);
        __fp16* outptr = c.channel(q);

        binary_op_vector_fp16s(ptr, ptr1, outptr, size, size, 1, 1, op_type);
    }
}

// Broadcasting front-end for the fp16 binary op.
//
// Fast paths:
//   * `b` holds exactly one scalar          -> binary_op_scalar_fp16s
//   * `a` and `b` have identical shape/pack -> binary_op_no_broadcast_fp16s
// Otherwise the output `c` is walked row by row (2-D) or channel by channel
// (3-D / 4-D). Source indices are clamped with std::min(i, extent - 1), so an
// operand whose extent along an axis is 1 is reused for every output index.
static void binary_op_broadcast_fp16s(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt)
{
    const int b_total = b.w * b.h * b.d * b.c * b.elempack;
    if (b_total == 1)
    {
        binary_op_scalar_fp16s(a, ((const __fp16*)b)[0], c, op_type, opt);
        return;
    }

    const bool same_layout = a.dims == b.dims && a.w == b.w && a.h == b.h && a.d == b.d && a.c == b.c && a.elempack == b.elempack;
    if (same_layout)
    {
        binary_op_no_broadcast_fp16s(a, b, c, op_type, opt);
        return;
    }

    const int out_dims = c.dims;

    if (out_dims == 2)
    {
        const int rows = c.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int row = 0; row < rows; row++)
        {
            const int row_a = std::min(row, a.h - 1);
            const int row_b = std::min(row, b.h - 1);

            const __fp16* src0 = a.row<const __fp16>(row_a);
            const __fp16* src1 = b.row<const __fp16>(row_b);
            __fp16* dst = c.row<__fp16>(row);

            binary_op_vector_fp16s(src0, src1, dst, a.w, b.w, a.elempack, b.elempack, op_type);
        }
    }

    if (out_dims == 3 || out_dims == 4)
    {
        const int nchannels = c.c;

        // shape tests on b do not depend on the channel index
        const bool b_single_per_channel = b.d * b.h * b.w == 1;
        const bool b_single_per_depth = b.h * b.w == 1;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ch = 0; ch < nchannels; ch++)
        {
            const int ch_a = std::min(ch, a.c - 1);
            const int ch_b = std::min(ch, b.c - 1);

            if (b_single_per_channel)
            {
                // one b element per channel: process the whole channel of a at once
                const __fp16* src0 = a.channel(ch_a);
                const __fp16* src1 = b.channel(ch_b);
                __fp16* dst = c.channel(ch);

                binary_op_vector_fp16s(src0, src1, dst, a.w * a.h * a.d, 1, a.elempack, b.elempack, op_type);
            }
            else if (b_single_per_depth)
            {
                // one b element per depth slice: process one w*h plane of a per call
                for (int dz = 0; dz < c.d; dz++)
                {
                    const int dz_a = std::min(dz, a.d - 1);
                    const int dz_b = std::min(dz, b.d - 1);

                    const __fp16* src0 = a.channel(ch_a).depth(dz_a);
                    const __fp16* src1 = b.channel(ch_b).depth(dz_b);
                    __fp16* dst = c.channel(ch).depth(dz);

                    binary_op_vector_fp16s(src0, src1, dst, a.w * a.h, 1, a.elempack, b.elempack, op_type);
                }
            }
            else
            {
                // general case: one row per call
                for (int dz = 0; dz < c.d; dz++)
                {
                    const int dz_a = std::min(dz, a.d - 1);
                    const int dz_b = std::min(dz, b.d - 1);

                    for (int row = 0; row < c.h; row++)
                    {
                        const int row_a = std::min(row, a.h - 1);
                        const int row_b = std::min(row, b.h - 1);

                        const __fp16* src0 = a.channel(ch_a).depth(dz_a).row<const __fp16>(row_a);
                        const __fp16* src1 = b.channel(ch_b).depth(dz_b).row<const __fp16>(row_b);
                        __fp16* dst = c.channel(ch).depth(dz).row<__fp16>(row);

                        binary_op_vector_fp16s(src0, src1, dst, a.w, b.w, a.elempack, b.elempack, op_type);
                    }
                }
            }
        }
    }
}

// In-place variant of the scalar op: every channel of `a` is both the first
// operand and the destination, combined with the single scalar `b`.
static void binary_op_scalar_inplace_fp16s(Mat& a, __fp16 b, int op_type, const Option& opt)
{
    const int per_channel = a.w * a.h * a.d * a.elempack;
    const int nchannels = a.c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int ch = 0; ch < nchannels; ch++)
    {
        __fp16* data = a.channel(ch);

        binary_op_vector_fp16s(data, &b, data, per_channel, 1, 1, 1, op_type);
    }
}

// Map an operation onto the one that yields the same result when the two
// operands are exchanged (SUB <-> RSUB, DIV <-> RDIV, ...). Operations that
// are not listed are returned unchanged.
static int get_reverse_op_type(int op_type)
{
    switch (op_type)
    {
    // forward -> reversed
    case BinaryOp::Operation_SUB:
        return BinaryOp::Operation_RSUB;
    case BinaryOp::Operation_DIV:
        return BinaryOp::Operation_RDIV;
    case BinaryOp::Operation_POW:
        return BinaryOp::Operation_RPOW;
    case BinaryOp::Operation_ATAN2:
        return BinaryOp::Operation_RATAN2;
    case BinaryOp::Operation_FMOD:
        return BinaryOp::Operation_RFMOD;
    case BinaryOp::Operation_FLOOR_DIVIDE:
        return BinaryOp::Operation_RFLOOR_DIVIDE;
    case BinaryOp::Operation_REMAINDER:
        return BinaryOp::Operation_RREMAINDER;

    // reversed -> forward
    case BinaryOp::Operation_RSUB:
        return BinaryOp::Operation_SUB;
    case BinaryOp::Operation_RDIV:
        return BinaryOp::Operation_DIV;
    case BinaryOp::Operation_RPOW:
        return BinaryOp::Operation_POW;
    case BinaryOp::Operation_RATAN2:
        return BinaryOp::Operation_ATAN2;
    case BinaryOp::Operation_RFMOD:
        return BinaryOp::Operation_FMOD;
    case BinaryOp::Operation_RFLOOR_DIVIDE:
        return BinaryOp::Operation_FLOOR_DIVIDE;
    case BinaryOp::Operation_RREMAINDER:
        return BinaryOp::Operation_REMAINDER;

    default:
        break;
    }

    return op_type;
}

int BinaryOp_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& A = bottom_blobs[0];
    const Mat& B = bottom_blobs[1];
    const int outdims = std::max(A.dims, B.dims);

    Mat A2 = A;
    Mat B2 = B;
    if (A.dims < outdims)
    {
        // expand inner axes
        if (outdims == 2)
        {
            if (A.w * A.elempack == B.h * B.elempack)
                A2 = A.reshape(1, A.w, opt.workspace_allocator);
            else // if (A.w == B.w)
            {
                A2.dims = 2;
                A2.w = A.w * A.elempack;
                A2.elempack = 1;
                A2.elemsize = A.elemsize / A.elempack;
                A2.cstep = A.cstep * A.elempack;
            }
        }
        if (outdims == 3 && A.dims == 1)
        {
            if (A.w * A.elempack == B.c * B.elempack)
                A2 = A.reshape(1, 1, A.w, opt.workspace_allocator);
            else // if (A.w == B.w)
            {
                A2.dims = 3;
                A2.w = A.w * A.elempack;
                A2.elempack = 1;
                A2.elemsize = A.elemsize / A.elempack;
                A2.cstep = A.cstep * A.elempack;
            }
        }
        if (outdims == 3 && A.dims == 2)
            A2 = A.reshape(1, A.w, A.h, opt.workspace_allocator);
        if (outdims == 4 && A.dims == 1)
        {
            if (A.w * A.elempack == B.c * B.elempack)
                A2 = A.reshape(1, 1, 1, A.w, opt.workspace_allocator);
            else // if (A.w == B.w)
            {
                A2.dims = 4;
                A2.w = A.w * A.elempack;
                A2.elempack = 1;
                A2.elemsize = A.elemsize / A.elempack;
                A2.cstep = A.cstep * A.elempack;
            }
        }
        if (outdims == 4 && A.dims == 2)
            A2 = A.reshape(1, 1, A.w, A.h, opt.workspace_allocator);
        if (outdims == 4 && A.dims == 3)
            A2 = A.reshape(1, A.w, A.h, A.c, opt.workspace_allocator);
    }
    if (B.dims < outdims)
    {
        // expand inner axes
        if (outdims == 2)
        {
            if (B.w * B.elempack == A.h * A.elempack)
                B2 = B.reshape(1, B.w, opt.workspace_allocator);
            else // if (B.w == A.w)
            {
                B2.dims = 2;
                B2.w = B.w * B.elempack;
                B2.elempack = 1;
                B2.elemsize = B.elemsize / B.elempack;
                B2.cstep = B.cstep * B.elempack;
            }
        }
        if (outdims == 3 && B.dims == 1)
        {
            if (B.w * B.elempack == A.c * A.elempack)
                B2 = B.reshape(1, 1, B.w, opt.workspace_allocator);
            else // if (B.w == A.w)
            {
                B2.dims = 3;
                B2.w = B.w * B.elempack;
                B2.elempack = 1;
                B2.elemsize = B.elemsize / B.elempack;
                B2.cstep = B.cstep * B.elempack;
            }
        }
        if (outdims == 3 && B.dims == 2)
            B2 = B.reshape(1, B.w, B.h, opt.workspace_allocator);
        if (outdims == 4 && B.dims == 1)
        {
            if (B.w * B.elempack == A.c * A.elempack)
                B2 = B.reshape(1, 1, 1, B.w, opt.workspace_allocator);
            else // if (B.w == A.w)
            {
                B2.dims = 4;
                B2.w = B.w * B.elempack;
                B2.elempack = 1;
                B2.elemsize = B.elemsize / B.elempack;
                B2.cstep = B.cstep * B.elempack;
            }
        }
        if (outdims == 4 && B.dims == 2)
            B2 = B.reshape(1, 1, B.w, B.h, opt.workspace_allocator);
        if (outdims == 4 && B.dims == 3)
            B2 = B.reshape(1, B.w, B.h, B.c, opt.workspace_allocator);
    }

    const int outw = std::max(A2.w, B2.w);
    const int outh = std::max(A2.h, B2.h);
    const int outd = std::max(A2.d, B2.d);
    const int outc = std::max(A2.c, B2.c);
    const size_t out_elemsize = std::max(A2.elemsize, B2.elemsize);
    const int out_elempack = std::max(A2.elempack, B2.elempack);

    Mat& top_blob = top_blobs[0];
    if (outdims == 1)
    {
        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (outdims == 2)
    {
        top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (outdims == 3)
    {
        top_blob.create(outw, outh, outc, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (outdims == 4)
    {
        top_blob.create(outw, outh, outd, outc, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    const bool a_pack_is_lower = A2.elempack < B2.elempack;
    const bool a_pack_is_equal = A2.elempack == B2.elempack;
    const bool a_size_is_lower = A2.w * A2.h * A2.d * A2.c * A2.elempack < B2.w * B2.h * B2.d * B2.c * B2.elempack;
    if (a_pack_is_lower || (a_pack_is_equal && a_size_is_lower))
    {
        binary_op_broadcast_fp16s(B2, A2, top_blob, get_reverse_op_type(op_type), opt);
    }
    else
    {
        binary_op_broadcast_fp16s(A2, B2, top_blob, op_type, opt);
    }

    return 0;
}

// Single-input fp16 path: combines every element of bottom_top_blob with the
// layer member `b` using op_type, overwriting the blob in place.
// Always returns 0.
int BinaryOp_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    // `b` is passed where the helper takes an __fp16 parameter
    binary_op_scalar_inplace_fp16s(bottom_top_blob, b, op_type, opt);

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/cast_arm.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cast_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

#include "cast_bf16.h"
#include "cast_fp16.h"

// Advertise the storage/layout capabilities of the ARM cast layer:
// packing and bf16 storage are always enabled; fp16 storage is enabled
// only in NCNN_ARM82 builds and only when the CPU reports asimdhp support.
Cast_arm::Cast_arm()
{
    support_bf16_storage = true;
    support_packing = true;

#if NCNN_ARM82
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif
}

// Convert bottom_blob from storage type `type_from` to `type_to` and store
// the result in top_blob (same shape and elempack, element size adjusted).
//
// Type codes as used below: 1 = float32, 2 = float16, 3 = int8, 4 = bfloat16.
//
// Handled conversions: 1->2, 2->1, 3->1, 1->4, 4->1.
// NOTE(review): any other (type_from, type_to) pair allocates top_blob and
// returns 0 without writing to it - confirm callers never request those.
//
// Returns 0 on success, -100 if the output blob is empty after create().
int Cast_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    if (type_from == type_to)
    {
        // nothing to convert, hand the input through
        top_blob = bottom_blob;
        return 0;
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // The int8 -> float32 case used to call Cast::forward() here, ignore its
    // return value and fall through; the blob was then converted a second
    // time by the loop further down. The conversion is now done exactly once,
    // by that loop, and allocation failure is reported through the -100 path.
    size_t out_elemsize = elemsize;
    if (type_to == 1)
    {
        // float32
        out_elemsize = 4 * elempack;
    }
    else if (type_to == 2)
    {
        // float16
        out_elemsize = 2 * elempack;
    }
    else if (type_to == 3)
    {
        // int8
        out_elemsize = elempack;
    }
    else if (type_to == 4)
    {
        // bfloat16
        out_elemsize = 2 * elempack;
    }

    if (dims == 1)
    {
        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
    }
    else if (dims == 2)
    {
        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
    }
    else if (dims == 3)
    {
        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
    }
    else if (dims == 4)
    {
        top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    // scalar count per channel, used by the int8 loop below
    int size = w * h * d * elempack;

    if (type_from == 1 && type_to == 2)
    {
        cast_fp32_to_fp16_neon(bottom_blob, top_blob, opt);
    }

    if (type_from == 2 && type_to == 1)
    {
        cast_fp16_to_fp32_neon(bottom_blob, top_blob, opt);
    }

    if (type_from == 3 && type_to == 1)
    {
        // int8 -> float32: plain per-element widening, one channel per thread
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const signed char* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                outptr[i] = (float)ptr[i];
            }
        }
    }

    if (type_from == 1 && type_to == 4)
    {
        cast_fp32_to_bf16_neon(bottom_blob, top_blob, opt);
    }

    if (type_from == 4 && type_to == 1)
    {
        cast_bf16_to_fp32_neon(bottom_blob, top_blob, opt);
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/cast_arm.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CAST_ARM_H
#define LAYER_CAST_ARM_H

#include "cast.h"

namespace ncnn {

// ARM-specific specialization of the Cast layer.
class Cast_arm : public Cast
{
public:
    // Sets the layer's support_* capability flags (see cast_arm.cpp).
    Cast_arm();

    // Converts bottom_blob into top_blob according to the layer's
    // type_from / type_to settings; returns 0 on success.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_CAST_ARM_H


================================================
FILE: src/layer/arm/cast_arm_bf16.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cpu.h"
#include "mat.h"

namespace ncnn {

#include "cast_bf16.h"

// Externally visible entry point that forwards to the static
// cast_fp32_to_bf16_neon() pulled in from cast_bf16.h above.
// NOTE(review): presumably this translation unit is compiled with BF16
// enabled so the included implementation takes its BF16 code paths and does
// not dispatch back here - confirm against the build configuration.
void cast_fp32_to_bf16_neon_bf16(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
    cast_fp32_to_bf16_neon(bottom_blob, top_blob, opt);
}

// Externally visible entry point that forwards to the static
// cast_bf16_to_fp32_neon() pulled in from cast_bf16.h above.
// NOTE(review): presumably this translation unit is compiled with BF16
// enabled so the included implementation does not dispatch back here -
// confirm against the build configuration.
void cast_bf16_to_fp32_neon_bf16(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
    cast_bf16_to_fp32_neon(bottom_blob, top_blob, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/arm/cast_arm_vfpv4.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cpu.h"
#include "mat.h"

namespace ncnn {

#include "cast_fp16.h"

// Externally visible entry point that forwards to the static
// cast_fp32_to_fp16_neon() pulled in from cast_fp16.h above.
// NOTE(review): presumably this translation unit is compiled with VFPv4
// (fp16 conversion) enabled so the included implementation does not dispatch
// back here - confirm against the build configuration.
void cast_fp32_to_fp16_neon_vfpv4(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
    cast_fp32_to_fp16_neon(bottom_blob, top_blob, opt);
}

// Externally visible entry point that forwards to the static
// cast_fp16_to_fp32_neon() pulled in from cast_fp16.h above.
// NOTE(review): presumably this translation unit is compiled with VFPv4
// (fp16 conversion) enabled so the included implementation does not dispatch
// back here - confirm against the build configuration.
void cast_fp16_to_fp32_neon_vfpv4(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
    cast_fp16_to_fp32_neon(bottom_blob, top_blob, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/arm/cast_bf16.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#if NCNN_RUNTIME_CPU && NCNN_ARM84BF16 && __aarch64__ && !__ARM_FEATURE_BF16_VECTOR_ARITHMETIC
void cast_fp32_to_bf16_neon_bf16(const Mat& bottom_blob, Mat& top_blob, const Option& opt);
void cast_bf16_to_fp32_neon_bf16(const Mat& bottom_blob, Mat& top_blob, const Option& opt);
#endif

// Convert every float32 element of bottom_blob to bfloat16 in top_blob.
//
// top_blob must already be allocated by the caller; each channel is processed
// as one flat run of w*h*d*elempack scalars, in blocks of 16, 8 and 4 with a
// scalar tail. When runtime CPU dispatch is compiled in and the CPU reports
// BF16 support, the work is delegated to cast_fp32_to_bf16_neon_bf16().
//
// Without __ARM_FEATURE_BF16_VECTOR_ARITHMETIC the inline-asm paths narrow
// with a 16-bit right shift (the upper half of each fp32 word is kept); with
// it, the bfcvt* instructions / vcvt_bf16_f32 intrinsics are used instead.
static void cast_fp32_to_bf16_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84BF16 && __aarch64__ && !__ARM_FEATURE_BF16_VECTOR_ARITHMETIC
    if (ncnn::cpu_support_arm_bf16())
    {
        cast_fp32_to_bf16_neon_bf16(bottom_blob, top_blob, opt);
        return;
    }
#endif

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;

    const int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
        __bf16* outptr = top_blob.channel(q);
#else
        unsigned short* outptr = top_blob.channel(q);
#endif

        int i = 0;
#if __ARM_NEON
        // 16 elements per iteration; the asm variants advance ptr/outptr
        // themselves through post-indexed addressing
        for (; i + 15 < size; i += 16)
        {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
            // ld1 writes v0-v3, so all four registers must be declared as
            // clobbered even though only v0/v1 hold the converted result
            asm volatile(
                "prfm   pldl1keep, [%0, #512]   \n"
                "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                "bfcvtn v0.4h, v0.4s            \n"
                "bfcvtn2 v0.8h, v1.4s           \n"
                "bfcvtn v1.4h, v2.4s            \n"
                "bfcvtn2 v1.8h, v3.4s           \n"
                "st1    {v0.8h, v1.8h}, [%1], #32 \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "v0", "v1", "v2", "v3");
#else  // __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
            asm volatile(
                "prfm   pldl1keep, [%0, #512]   \n"
                "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                "shrn   v0.4h, v0.4s, #16       \n"
                "shrn   v1.4h, v1.4s, #16       \n"
                "shrn   v2.4h, v2.4s, #16       \n"
                "shrn   v3.4h, v3.4s, #16       \n"
                "st1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "v0", "v1", "v2", "v3");
#endif // __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
#else  // __aarch64__
            asm volatile(
                "pld        [%0, #512]      \n"
                "vldm       %0!, {d0-d7}    \n"
                "vshrn.u32  d0, q0, #16     \n"
                "vshrn.u32  d1, q1, #16     \n"
                "vshrn.u32  d2, q2, #16     \n"
                "vshrn.u32  d3, q3, #16     \n"
                "vst1.u16   {d0-d3}, [%1]!  \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            float32x4_t _p0_fp32 = vld1q_f32(ptr);
            float32x4_t _p1_fp32 = vld1q_f32(ptr + 4);
            float32x4_t _p2_fp32 = vld1q_f32(ptr + 8);
            float32x4_t _p3_fp32 = vld1q_f32(ptr + 12);
#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
            bfloat16x4_t _p0_bf16 = vcvt_bf16_f32(_p0_fp32);
            bfloat16x4_t _p1_bf16 = vcvt_bf16_f32(_p1_fp32);
            bfloat16x4_t _p2_bf16 = vcvt_bf16_f32(_p2_fp32);
            bfloat16x4_t _p3_bf16 = vcvt_bf16_f32(_p3_fp32);
            bfloat16x8_t _p_bf16 = vcombine_bf16(_p0_bf16, _p1_bf16);
            bfloat16x8_t _q_bf16 = vcombine_bf16(_p2_bf16, _p3_bf16);
            vst1q_bf16(outptr, _p_bf16);
            vst1q_bf16(outptr + 8, _q_bf16);
#else
            uint16x4_t _p0_bf16 = float2bfloat(_p0_fp32);
            uint16x4_t _p1_bf16 = float2bfloat(_p1_fp32);
            uint16x4_t _p2_bf16 = float2bfloat(_p2_fp32);
            uint16x4_t _p3_bf16 = float2bfloat(_p3_fp32);
            uint16x8_t _p_bf16 = vcombine_u16(_p0_bf16, _p1_bf16);
            uint16x8_t _q_bf16 = vcombine_u16(_p2_bf16, _p3_bf16);
            vst1q_u16(outptr, _p_bf16);
            vst1q_u16(outptr + 8, _q_bf16);
#endif
            ptr += 16;
            outptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 8 elements per iteration
        for (; i + 7 < size; i += 8)
        {
            float32x4_t _p0_fp32 = vld1q_f32(ptr);
            float32x4_t _p1_fp32 = vld1q_f32(ptr + 4);
#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
            bfloat16x4_t _p0_bf16 = vcvt_bf16_f32(_p0_fp32);
            bfloat16x4_t _p1_bf16 = vcvt_bf16_f32(_p1_fp32);
            bfloat16x8_t _p_bf16 = vcombine_bf16(_p0_bf16, _p1_bf16);
            vst1q_bf16(outptr, _p_bf16);
#else
            uint16x4_t _p0_bf16 = float2bfloat(_p0_fp32);
            uint16x4_t _p1_bf16 = float2bfloat(_p1_fp32);
            uint16x8_t _p_bf16 = vcombine_u16(_p0_bf16, _p1_bf16);
            vst1q_u16(outptr, _p_bf16);
#endif
            ptr += 8;
            outptr += 8;
        }
        // 4 elements per iteration
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p_fp32 = vld1q_f32(ptr);
#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
            bfloat16x4_t _p_bf16 = vcvt_bf16_f32(_p_fp32);
            vst1_bf16(outptr, _p_bf16);
#else
            uint16x4_t _p_bf16 = float2bfloat(_p_fp32);
            vst1_u16(outptr, _p_bf16);
#endif
            ptr += 4;
            outptr += 4;
        }
#endif
        // scalar tail
        for (; i < size; i++)
        {
#if NCNN_GNU_INLINE_ASM && __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
            asm volatile(
                "ldr    s0, [%0], #4    \n"
                "bfcvt  h0, s0          \n"
                "str    h0, [%1], #2    \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "s0");
            // because intrinsic cause ndk clang crash
            // *outptr++ = vcvth_bf16_f32(*ptr++);
#else
#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
            // outptr is __bf16* here; store the 16-bit pattern through an
            // unsigned short view
            *(unsigned short*)outptr = float32_to_bfloat16(*ptr++);
            outptr++;
#else
            *outptr++ = float32_to_bfloat16(*ptr++);
#endif
#endif
        }
    }
}

// Convert every bfloat16 element of bottom_blob to float32 in top_blob.
//
// top_blob must already be allocated by the caller; each channel is processed
// as one flat run of w*h*d*elempack scalars, in blocks of 16, 8 and 4 with a
// scalar tail. When runtime CPU dispatch is compiled in and the CPU reports
// BF16 support, the work is delegated to cast_bf16_to_fp32_neon_bf16().
//
// The inline-asm paths widen each 16-bit value with a 16-bit left shift, i.e.
// the bf16 bits become the upper half of the fp32 word.
static void cast_bf16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84BF16 && __aarch64__ && !__ARM_FEATURE_BF16_VECTOR_ARITHMETIC
    if (ncnn::cpu_support_arm_bf16())
    {
        cast_bf16_to_fp32_neon_bf16(bottom_blob, top_blob, opt);
        return;
    }
#endif

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;

    const int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
        const __bf16* ptr = bottom_blob.channel(q);
#else
        const unsigned short* ptr = bottom_blob.channel(q);
#endif
        float* outptr = top_blob.channel(q);

        int i = 0;
#if __ARM_NEON
        // 16 elements per iteration; the asm variants advance ptr/outptr
        // themselves through post-indexed addressing
        for (; i + 15 < size; i += 16)
        {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                "prfm   pldl1keep, [%0, #256]   \n"
                "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%0], #32 \n"
                "shll   v0.4s, v0.4h, #16       \n"
                "shll   v1.4s, v1.4h, #16       \n"
                "shll   v2.4s, v2.4h, #16       \n"
                "shll   v3.4s, v3.4h, #16       \n"
                "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "v0", "v1", "v2", "v3");
#else  // __aarch64__
            // input is loaded into d4-d7 (= q2,q3) and widened into q0..q3 in
            // ascending order, so each source d register is read before the
            // q register that overlaps it is written
            asm volatile(
                "pld        [%0, #256]      \n"
                "vld1.u16   {d4-d7}, [%0]!  \n"
                "vshll.u16  q0, d4, #16     \n"
                "vshll.u16  q1, d5, #16     \n"
                "vshll.u16  q2, d6, #16     \n"
                "vshll.u16  q3, d7, #16     \n"
                "vstm       %1!, {d0-d7}    \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
            bfloat16x8_t _p_bf16 = vld1q_bf16(ptr);
            bfloat16x8_t _q_bf16 = vld1q_bf16(ptr + 8);
            float32x4_t _p0_fp32 = vcvt_f32_bf16(vget_low_bf16(_p_bf16));
            float32x4_t _p1_fp32 = vcvt_f32_bf16(vget_high_bf16(_p_bf16));
            float32x4_t _p2_fp32 = vcvt_f32_bf16(vget_low_bf16(_q_bf16));
            float32x4_t _p3_fp32 = vcvt_f32_bf16(vget_high_bf16(_q_bf16));
#else
            uint16x8_t _p_bf16 = vld1q_u16(ptr);
            uint16x8_t _q_bf16 = vld1q_u16(ptr + 8);
            float32x4_t _p0_fp32 = bfloat2float(vget_low_u16(_p_bf16));
            float32x4_t _p1_fp32 = bfloat2float(vget_high_u16(_p_bf16));
            float32x4_t _p2_fp32 = bfloat2float(vget_low_u16(_q_bf16));
            float32x4_t _p3_fp32 = bfloat2float(vget_high_u16(_q_bf16));
#endif
            vst1q_f32(outptr, _p0_fp32);
            vst1q_f32(outptr + 4, _p1_fp32);
            vst1q_f32(outptr + 8, _p2_fp32);
            vst1q_f32(outptr + 12, _p3_fp32);
            ptr += 16;
            outptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 8 elements per iteration
        for (; i + 7 < size; i += 8)
        {
#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
            bfloat16x8_t _p_bf16 = vld1q_bf16(ptr);
            float32x4_t _p0_fp32 = vcvt_f32_bf16(vget_low_bf16(_p_bf16));
            float32x4_t _p1_fp32 = vcvt_f32_bf16(vget_high_bf16(_p_bf16));
#else
            uint16x8_t _p_bf16 = vld1q_u16(ptr);
            float32x4_t _p0_fp32 = bfloat2float(vget_low_u16(_p_bf16));
            float32x4_t _p1_fp32 = bfloat2float(vget_high_u16(_p_bf16));
#endif
            vst1q_f32(outptr, _p0_fp32);
            vst1q_f32(outptr + 4, _p1_fp32);
            ptr += 8;
            outptr += 8;
        }
        // 4 elements per iteration
        for (; i + 3 < size; i += 4)
        {
#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
            bfloat16x4_t _p_bf16 = vld1_bf16(ptr);
            float32x4_t _p_fp32 = vcvt_f32_bf16(_p_bf16);
#else
            uint16x4_t _p_bf16 = vld1_u16(ptr);
            float32x4_t _p_fp32 = bfloat2float(_p_bf16);
#endif
            vst1q_f32(outptr, _p_fp32);
            ptr += 4;
            outptr += 4;
        }
#endif
        // scalar tail
        for (; i < size; i++)
        {
#if __ARM_FEATURE_BF16_VECTOR_ARITHMETIC
            *outptr++ = vcvtah_f32_bf16(*ptr++);
#else
            *outptr++ = bfloat16_to_float32(*ptr++);
#endif
        }
    }
}


================================================
FILE: src/layer/arm/cast_fp16.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#if NCNN_RUNTIME_CPU && NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2)
void cast_fp32_to_fp16_neon_vfpv4(const Mat& bottom_blob, Mat& top_blob, const Option& opt);
void cast_fp16_to_fp32_neon_vfpv4(const Mat& bottom_blob, Mat& top_blob, const Option& opt);
#endif

// Convert an fp32 blob into fp16 bit patterns, one channel at a time.
//
// bottom_blob  source blob, read through a const float* per channel
// top_blob     destination blob, written through an unsigned short* per channel.
//              It is not allocated here: this function writes w*h*d*elempack values
//              into each of bottom_blob.c channels of whatever top_blob already holds.
// opt          only opt.num_threads is read, to size the OpenMP channel loop
static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
    // When this translation unit is built without hardware half-precision support
    // (ACLE: bit 1 of __ARM_FP is the half-precision flag) but runtime cpu dispatch is
    // enabled, hand over to the separately compiled vfpv4 variant if the cpu reports it.
#if NCNN_RUNTIME_CPU && NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2)
    if (ncnn::cpu_support_arm_vfpv4())
    {
        cast_fp32_to_fp16_neon_vfpv4(bottom_blob, top_blob, opt);
        return;
    }
#endif

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;

    // number of scalar values in one channel; packed lanes are treated as a flat array
    const int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        const float* ptr = bottom_blob.channel(q);
        unsigned short* outptr = top_blob.channel(q);

        // i only counts processed elements; ptr/outptr are advanced separately,
        // either by the post-increment addressing in the asm or explicitly in the
        // intrinsics fallback
        int i = 0;
#if (__ARM_FP & 2)
        // 16 values per iteration: 64 bytes of fp32 in, 32 bytes of fp16 out
        for (; i + 15 < size; i += 16)
        {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                "prfm   pldl1keep, [%0, #512]       \n"
                "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                "fcvtn  v0.4h, v0.4s                \n"
                "fcvtn  v1.4h, v1.4s                \n"
                "fcvtn  v2.4h, v2.4s                \n"
                "fcvtn  v3.4h, v3.4s                \n"
                "st1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "v0", "v1", "v2", "v3");
#else  // __aarch64__
            // each vcvt writes a d register whose previous content has already been
            // consumed (d1 after q0, d2 after q1, d3 after q1), so the results end up
            // packed in d0-d3
            asm volatile(
                "pld        [%0, #512]      \n"
                "vldm       %0!, {d0-d7}    \n"
                "vcvt.f16.f32 d0, q0        \n"
                "vcvt.f16.f32 d1, q1        \n"
                "vcvt.f16.f32 d2, q2        \n"
                "vcvt.f16.f32 d3, q3        \n"
                "vst1.u16   {d0-d3}, [%1]!  \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            // intrinsics fallback: the fp16 vectors are reinterpreted as uint16 so they
            // can be stored through the unsigned short pointer
            float32x4_t _p0_fp32 = vld1q_f32(ptr);
            float32x4_t _p1_fp32 = vld1q_f32(ptr + 4);
            float32x4_t _p2_fp32 = vld1q_f32(ptr + 8);
            float32x4_t _p3_fp32 = vld1q_f32(ptr + 12);
            uint16x4_t _p0_fp16 = (uint16x4_t)vcvt_f16_f32(_p0_fp32);
            uint16x4_t _p1_fp16 = (uint16x4_t)vcvt_f16_f32(_p1_fp32);
            uint16x4_t _p2_fp16 = (uint16x4_t)vcvt_f16_f32(_p2_fp32);
            uint16x4_t _p3_fp16 = (uint16x4_t)vcvt_f16_f32(_p3_fp32);
            uint16x8_t _p_fp16 = vcombine_u16(_p0_fp16, _p1_fp16);
            uint16x8_t _q_fp16 = vcombine_u16(_p2_fp16, _p3_fp16);
            vst1q_u16(outptr, _p_fp16);
            vst1q_u16(outptr + 8, _q_fp16);
            ptr += 16;
            outptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 8 values per iteration
        for (; i + 7 < size; i += 8)
        {
#if NCNN_GNU_INLINE_ASM
            // This is originally implemented with neon fp16 intrinsics.
            // In the new version of gcc, __ARM_FP16_FORMAT_IEEE or __ARM_FP16_FORMAT_ALTERNATIVE needs to be defined to use the float16x4_t type.
            // That leads to compiler error when compiled with -mfpu=neon-vfpv4 but without -mfp16-format=ieee flag.
            // We could add more macro conditions to differentiate between old and new versions, but that's pretty ugly!
            // Just use all inline assembly here ~
            //          --- nihui
#if __aarch64__
            asm volatile(
                "ld1    {v0.4s, v1.4s}, [%0], #32   \n"
                "fcvtn  v0.4h, v0.4s                \n"
                "fcvtn  v1.4h, v1.4s                \n"
                "st1    {v0.4h, v1.4h}, [%1], #16   \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "v0", "v1");
#else  // __aarch64__
            asm volatile(
                "vld1.f32   {d0-d3}, [%0]!  \n"
                "vcvt.f16.f32 d0, q0        \n"
                "vcvt.f16.f32 d1, q1        \n"
                "vst1.u16   {d0-d1}, [%1]!  \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "q0", "q1");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            float32x4_t _p0_fp32 = vld1q_f32(ptr);
            float32x4_t _p1_fp32 = vld1q_f32(ptr + 4);
            uint16x4_t _p0_fp16 = (uint16x4_t)vcvt_f16_f32(_p0_fp32);
            uint16x4_t _p1_fp16 = (uint16x4_t)vcvt_f16_f32(_p1_fp32);
            uint16x8_t _p_fp16 = vcombine_u16(_p0_fp16, _p1_fp16);
            vst1q_u16(outptr, _p_fp16);
            ptr += 8;
            outptr += 8;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 4 values per iteration
        for (; i + 3 < size; i += 4)
        {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                "ld1    {v0.4s}, [%0], #16  \n"
                "fcvtn  v0.4h, v0.4s        \n"
                "st1    {v0.4h}, [%1], #8   \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "v0");
#else  // __aarch64__
            asm volatile(
                "vld1.f32   {d0-d1}, [%0]!  \n"
                "vcvt.f16.f32 d0, q0        \n"
                "vst1.u16   {d0}, [%1]!     \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "q0");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            float32x4_t _p_fp32 = vld1q_f32(ptr);
            uint16x4_t _p_fp16 = (uint16x4_t)vcvt_f16_f32(_p_fp32);
            vst1_u16(outptr, _p_fp16);
            ptr += 4;
            outptr += 4;
#endif // NCNN_GNU_INLINE_ASM
        }
#endif // (__ARM_FP & 2)
        // scalar tail; also the only path when (__ARM_FP & 2) is not set and the
        // runtime dispatch above did not return
        for (; i < size; i++)
        {
            *outptr++ = float32_to_float16(*ptr++);
        }
    }
}

// Convert a blob of fp16 bit patterns into fp32, one channel at a time.
//
// bottom_blob  source blob, read through a const unsigned short* per channel
// top_blob     destination blob, written through a float* per channel.
//              It is not allocated here: this function writes w*h*d*elempack values
//              into each of bottom_blob.c channels of whatever top_blob already holds.
// opt          only opt.num_threads is read, to size the OpenMP channel loop
static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
    // When this translation unit is built without hardware half-precision support
    // (ACLE: bit 1 of __ARM_FP is the half-precision flag) but runtime cpu dispatch is
    // enabled, hand over to the separately compiled vfpv4 variant if the cpu reports it.
#if NCNN_RUNTIME_CPU && NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2)
    if (ncnn::cpu_support_arm_vfpv4())
    {
        cast_fp16_to_fp32_neon_vfpv4(bottom_blob, top_blob, opt);
        return;
    }
#endif

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;

    // number of scalar values in one channel; packed lanes are treated as a flat array
    const int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        const unsigned short* ptr = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);

        // i only counts processed elements; ptr/outptr are advanced separately,
        // either by the post-increment addressing in the asm or explicitly in the
        // intrinsics fallback
        int i = 0;
#if (__ARM_FP & 2)
        // 16 values per iteration: 32 bytes of fp16 in, 64 bytes of fp32 out
        for (; i + 15 < size; i += 16)
        {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                "prfm   pldl1keep, [%0, #256]       \n"
                "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%0], #32 \n"
                "fcvtl  v0.4s, v0.4h                \n"
                "fcvtl  v1.4s, v1.4h                \n"
                "fcvtl  v2.4s, v2.4h                \n"
                "fcvtl  v3.4s, v3.4h                \n"
                "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "v0", "v1", "v2", "v3");
#else  // __aarch64__
            // the fp16 input is loaded into d4-d7 (= q2, q3) and widened in order, so
            // d4/d5 have been consumed before q2 is overwritten and d6 before q3 is
            asm volatile(
                "pld        [%0, #256]      \n"
                "vld1.u16   {d4-d7}, [%0]!  \n"
                "vcvt.f32.f16 q0, d4        \n"
                "vcvt.f32.f16 q1, d5        \n"
                "vcvt.f32.f16 q2, d6        \n"
                "vcvt.f32.f16 q3, d7        \n"
                "vstm       %1!, {d0-d7}    \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            // intrinsics fallback: load as uint16 and reinterpret as fp16 vectors
            // before widening
            uint16x8_t _p_fp16 = vld1q_u16(ptr);
            uint16x8_t _q_fp16 = vld1q_u16(ptr + 8);
            float32x4_t _p0_fp32 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p_fp16));
            float32x4_t _p1_fp32 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p_fp16));
            float32x4_t _p2_fp32 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q_fp16));
            float32x4_t _p3_fp32 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q_fp16));
            vst1q_f32(outptr, _p0_fp32);
            vst1q_f32(outptr + 4, _p1_fp32);
            vst1q_f32(outptr + 8, _p2_fp32);
            vst1q_f32(outptr + 12, _p3_fp32);
            ptr += 16;
            outptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 8 values per iteration
        for (; i + 7 < size; i += 8)
        {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                "ld1    {v0.4h, v1.4h}, [%0], #16   \n"
                "fcvtl  v0.4s, v0.4h                \n"
                "fcvtl  v1.4s, v1.4h                \n"
                "st1    {v0.4s, v1.4s}, [%1], #32   \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "v0", "v1");
#else  // __aarch64__
            // input lives in d4-d5 (= q2), hence q2 in the clobber list
            asm volatile(
                "vld1.u16   {d4-d5}, [%0]!  \n"
                "vcvt.f32.f16 q0, d4        \n"
                "vcvt.f32.f16 q1, d5        \n"
                "vst1.f32   {d0-d3}, [%1]!  \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "q0", "q1", "q2");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            uint16x8_t _p_fp16 = vld1q_u16(ptr);
            float32x4_t _p0_fp32 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p_fp16));
            float32x4_t _p1_fp32 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p_fp16));
            vst1q_f32(outptr, _p0_fp32);
            vst1q_f32(outptr + 4, _p1_fp32);
            ptr += 8;
            outptr += 8;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 4 values per iteration
        for (; i + 3 < size; i += 4)
        {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                "ld1    {v0.4h}, [%0], #8   \n"
                "fcvtl  v0.4s, v0.4h        \n"
                "st1    {v0.4s}, [%1], #16  \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "v0");
#else  // __aarch64__
            // input lives in d2 (low half of q1), hence q1 in the clobber list
            asm volatile(
                "vld1.u16   {d2}, [%0]!     \n"
                "vcvt.f32.f16 q0, d2        \n"
                "vst1.f32   {d0-d1}, [%1]!  \n"
                : "=r"(ptr),   // %0
                "=r"(outptr) // %1
                : "0"(ptr),
                "1"(outptr)
                : "memory", "q0", "q1");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            uint16x4_t _p_fp16 = vld1_u16(ptr);
            float32x4_t _p_fp32 = vcvt_f32_f16((float16x4_t)_p_fp16);
            vst1q_f32(outptr, _p_fp32);
            ptr += 4;
            outptr += 4;
#endif // NCNN_GNU_INLINE_ASM
        }
#endif // (__ARM_FP & 2)
        // scalar tail; also the only path when (__ARM_FP & 2) is not set and the
        // runtime dispatch above did not return
        for (; i < size; i++)
        {
            *outptr++ = float16_to_float32(*ptr++);
        }
    }
}


================================================
FILE: src/layer/arm/clip_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "clip_arm.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

// Set the capability flags for the ARM clip implementation according to the
// build configuration.
Clip_arm::Clip_arm()
{
#if NCNN_BF16
    // bf16 storage is enabled whenever bf16 support is compiled in
    support_bf16_storage = true;
#endif

#if __ARM_NEON
    // NEON builds accept packed blobs
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage is enabled only if cpu_support_arm_asimdhp() reports support
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82
}

// Clamp every value of bottom_top_blob to [min, max] in place.
//
// 16-bit blobs are forwarded to the fp16 or bf16 variant depending on the build
// flags and options; everything else is processed here as fp32.
// min and max are not declared in this block (presumably members inherited from
// Clip - confirm against clip.h).
// Always returns 0 on the fp32 path; otherwise returns whatever the delegated
// variant returns.
int Clip_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int elembits = bottom_top_blob.elembits();

    // fp16 storage takes precedence over bf16 when both could apply
#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_inplace_fp16s(bottom_top_blob, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    // number of scalar floats in one channel; packed lanes are treated as a flat array
    int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        // i only counts processed elements; ptr is advanced separately
        int i = 0;
#if __ARM_NEON
        float32x4_t _min = vdupq_n_f32(min);
        float32x4_t _max = vdupq_n_f32(max);
        // 16 floats per iteration
        for (; i + 15 < size; i += 16)
        {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            // load without write-back, clamp, then store with post-increment so that
            // ptr advances by 64 bytes exactly once
            asm volatile(
                "prfm   pldl1keep, [%0, #512]   \n"
                "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
                "fmax   v0.4s, v0.4s, %2.4s     \n"
                "fmax   v1.4s, v1.4s, %2.4s     \n"
                "fmax   v2.4s, v2.4s, %2.4s     \n"
                "fmax   v3.4s, v3.4s, %2.4s     \n"
                "fmin   v0.4s, v0.4s, %3.4s     \n"
                "fmin   v1.4s, v1.4s, %3.4s     \n"
                "fmin   v2.4s, v2.4s, %3.4s     \n"
                "fmin   v3.4s, v3.4s, %3.4s     \n"
                "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_min), // %2
                "w"(_max)  // %3
                : "memory", "v0", "v1", "v2", "v3");
#else  // __aarch64__
            // same scheme: vldm without write-back, vstm with write-back
            asm volatile(
                "pld        [%0, #512]      \n"
                "vldm       %0, {d0-d7}     \n"
                "vmax.f32   q0, q0, %q2     \n"
                "vmax.f32   q1, q1, %q2     \n"
                "vmax.f32   q2, q2, %q2     \n"
                "vmax.f32   q3, q3, %q2     \n"
                "vmin.f32   q0, q0, %q3     \n"
                "vmin.f32   q1, q1, %q3     \n"
                "vmin.f32   q2, q2, %q3     \n"
                "vmin.f32   q3, q3, %q3     \n"
                "vstm       %0!, {d0-d7}    \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_min), // %2
                "w"(_max)  // %3
                : "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            float32x4_t _p0 = vld1q_f32(ptr);
            float32x4_t _p1 = vld1q_f32(ptr + 4);
            float32x4_t _p2 = vld1q_f32(ptr + 8);
            float32x4_t _p3 = vld1q_f32(ptr + 12);
            _p0 = vmaxq_f32(_p0, _min);
            _p1 = vmaxq_f32(_p1, _min);
            _p2 = vmaxq_f32(_p2, _min);
            _p3 = vmaxq_f32(_p3, _min);
            _p0 = vminq_f32(_p0, _max);
            _p1 = vminq_f32(_p1, _max);
            _p2 = vminq_f32(_p2, _max);
            _p3 = vminq_f32(_p3, _max);
            vst1q_f32(ptr, _p0);
            vst1q_f32(ptr + 4, _p1);
            vst1q_f32(ptr + 8, _p2);
            vst1q_f32(ptr + 12, _p3);
            ptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 8 floats per iteration
        for (; i + 7 < size; i += 8)
        {
            float32x4_t _p0 = vld1q_f32(ptr);
            float32x4_t _p1 = vld1q_f32(ptr + 4);
            _p0 = vmaxq_f32(_p0, _min);
            _p1 = vmaxq_f32(_p1, _min);
            _p0 = vminq_f32(_p0, _max);
            _p1 = vminq_f32(_p1, _max);
            vst1q_f32(ptr, _p0);
            vst1q_f32(ptr + 4, _p1);
            ptr += 8;
        }
        // 4 floats per iteration
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = vld1q_f32(ptr);
            _p = vmaxq_f32(_p, _min);
            _p = vminq_f32(_p, _max);
            vst1q_f32(ptr, _p);
            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar tail: lower bound first, then upper bound, same order as the vector code
        for (; i < size; i++)
        {
            if (*ptr < min)
                *ptr = min;

            if (*ptr > max)
                *ptr = max;

            ptr++;
        }
    }

    return 0;
}

#if NCNN_BF16
// Clamp a blob stored as 16-bit bfloat16 values to [min, max] in place.
//
// Each value is widened to fp32, clamped with the fp32 bounds, and narrowed back
// to 16 bits before being written to the same location.
// min and max are not declared in this block (presumably members inherited from
// Clip - confirm against clip.h).
// Always returns 0.
int Clip_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    // number of 16-bit values in one channel; packed lanes are treated as a flat array
    int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        unsigned short* ptr = bottom_top_blob.channel(q);

        // i only counts processed elements; ptr is advanced separately
        int i = 0;
#if __ARM_NEON
        float32x4_t _min = vdupq_n_f32(min);
        float32x4_t _max = vdupq_n_f32(max);
        // 16 values per iteration (32 bytes)
        for (; i + 15 < size; i += 16)
        {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            // shll #16 places the 16 stored bits in the upper half of each 32-bit lane,
            // shrn #16 keeps only that upper half again after clamping
            asm volatile(
                "prfm   pldl1keep, [%0, #256]   \n"
                "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%0] \n"
                "shll   v0.4s, v0.4h, #16       \n"
                "shll   v1.4s, v1.4h, #16       \n"
                "shll   v2.4s, v2.4h, #16       \n"
                "shll   v3.4s, v3.4h, #16       \n"
                "fmax   v0.4s, v0.4s, %2.4s     \n"
                "fmax   v1.4s, v1.4s, %2.4s     \n"
                "fmax   v2.4s, v2.4s, %2.4s     \n"
                "fmax   v3.4s, v3.4s, %2.4s     \n"
                "fmin   v0.4s, v0.4s, %3.4s     \n"
                "fmin   v1.4s, v1.4s, %3.4s     \n"
                "fmin   v2.4s, v2.4s, %3.4s     \n"
                "fmin   v3.4s, v3.4s, %3.4s     \n"
                "shrn   v0.4h, v0.4s, #16       \n"
                "shrn   v1.4h, v1.4s, #16       \n"
                "shrn   v2.4h, v2.4s, #16       \n"
                "shrn   v3.4h, v3.4s, #16       \n"
                "st1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%0], #32 \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_min), // %2
                "w"(_max)  // %3
                : "memory", "v0", "v1", "v2", "v3");
#else  // __aarch64__
            // input is loaded into d4-d7 (= q2, q3) and widened in order, so each d
            // register has been consumed before the q register containing it is
            // overwritten; the narrowed results are packed into d0-d3
            asm volatile(
                "pld        [%0, #256]      \n"
                "vld1.u16   {d4-d7}, [%0]   \n"
                "vshll.u16  q0, d4, #16     \n"
                "vshll.u16  q1, d5, #16     \n"
                "vshll.u16  q2, d6, #16     \n"
                "vshll.u16  q3, d7, #16     \n"
                "vmax.f32   q0, q0, %q2     \n"
                "vmax.f32   q1, q1, %q2     \n"
                "vmax.f32   q2, q2, %q2     \n"
                "vmax.f32   q3, q3, %q2     \n"
                "vmin.f32   q0, q0, %q3     \n"
                "vmin.f32   q1, q1, %q3     \n"
                "vmin.f32   q2, q2, %q3     \n"
                "vmin.f32   q3, q3, %q3     \n"
                "vshrn.u32  d0, q0, #16     \n"
                "vshrn.u32  d1, q1, #16     \n"
                "vshrn.u32  d2, q2, #16     \n"
                "vshrn.u32  d3, q3, #16     \n"
                "vst1.u16   {d0-d3}, [%0]!  \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_min), // %2
                "w"(_max)  // %3
                : "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            // intrinsics fallback using the bfloat2float / float2bfloat helpers
            // (defined outside this block)
            uint16x8_t _p = vld1q_u16(ptr);
            uint16x8_t _q = vld1q_u16(ptr + 8);
            float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
            float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
            float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
            float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
            _p0 = vmaxq_f32(_p0, _min);
            _p1 = vmaxq_f32(_p1, _min);
            _p2 = vmaxq_f32(_p2, _min);
            _p3 = vmaxq_f32(_p3, _min);
            _p0 = vminq_f32(_p0, _max);
            _p1 = vminq_f32(_p1, _max);
            _p2 = vminq_f32(_p2, _max);
            _p3 = vminq_f32(_p3, _max);
            _p = vcombine_u16(float2bfloat(_p0), float2bfloat(_p1));
            _q = vcombine_u16(float2bfloat(_p2), float2bfloat(_p3));
            vst1q_u16(ptr, _p);
            vst1q_u16(ptr + 8, _q);
            ptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 8 values per iteration
        for (; i + 7 < size; i += 8)
        {
            uint16x8_t _p = vld1q_u16(ptr);
            float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
            float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
            _p0 = vmaxq_f32(_p0, _min);
            _p1 = vmaxq_f32(_p1, _min);
            _p0 = vminq_f32(_p0, _max);
            _p1 = vminq_f32(_p1, _max);
            _p = vcombine_u16(float2bfloat(_p0), float2bfloat(_p1));
            vst1q_u16(ptr, _p);
            ptr += 8;
        }
        // 4 values per iteration
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = bfloat2float(vld1_u16(ptr));
            _p = vmaxq_f32(_p, _min);
            _p = vminq_f32(_p, _max);
            vst1_u16(ptr, float2bfloat(_p));
            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar tail: widen, clamp (lower bound first), narrow, store back
        for (; i < size; i++)
        {
            float v = bfloat16_to_float32(*ptr);
            if (v < min)
                v = min;

            if (v > max)
                v = max;

            *ptr = float32_to_bfloat16(v);
            ptr++;
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/clip_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CLIP_ARM_H
#define LAYER_CLIP_ARM_H

#include "clip.h"

namespace ncnn {

// ARM implementation of the Clip layer.
// Derives from Clip and declares an in-place forward pass plus storage-specific
// helpers that exist only when the matching build option is enabled.
class Clip_arm : public Clip
{
public:
    // Sets the layer capability flags (see clip_arm.cpp).
    Clip_arm();

    // Clamp bottom_top_blob in place; entry point that dispatches on element width.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 storage variant; declared only when NCNN_ARM82 is enabled
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 storage variant; declared only when NCNN_BF16 is enabled
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_CLIP_ARM_H


================================================
FILE: src/layer/arm/clip_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "clip_arm.h"

#ifdef __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Clamp a blob stored as __fp16 values to [min, max] in place, doing the
// comparison directly in half precision.
//
// The fp32 bounds are converted to __fp16 before use, so the effective bounds are
// the half-precision conversions of min and max.
// min and max are not declared in this block (presumably members inherited from
// Clip - confirm against clip.h).
// Always returns 0.
int Clip_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    // number of __fp16 values in one channel; packed lanes are treated as a flat array
    int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

        // scalar bounds for the tail loop, vector bounds for the NEON loops
        __fp16 min_fp16 = min;
        __fp16 max_fp16 = max;

        float16x8_t _min = vdupq_n_f16(min_fp16);
        float16x8_t _max = vdupq_n_f16(max_fp16);

        // i only counts processed elements; ptr is advanced separately
        int i = 0;
        // 32 values per iteration (64 bytes)
        for (; i + 31 < size; i += 32)
        {
#if NCNN_GNU_INLINE_ASM
            // written in A64 syntax only; load without write-back, clamp, then store
            // with post-increment so that ptr advances by 64 bytes exactly once
            asm volatile(
                "prfm   pldl1keep, [%0, #512]   \n"
                "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0] \n"
                "fmax   v0.8h, v0.8h, %2.8h     \n"
                "fmax   v1.8h, v1.8h, %2.8h     \n"
                "fmax   v2.8h, v2.8h, %2.8h     \n"
                "fmax   v3.8h, v3.8h, %2.8h     \n"
                "fmin   v0.8h, v0.8h, %3.8h     \n"
                "fmin   v1.8h, v1.8h, %3.8h     \n"
                "fmin   v2.8h, v2.8h, %3.8h     \n"
                "fmin   v3.8h, v3.8h, %3.8h     \n"
                "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_min), // %2
                "w"(_max)  // %3
                : "memory", "v0", "v1", "v2", "v3");
#else  // NCNN_GNU_INLINE_ASM
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            float16x8_t _p2 = vld1q_f16(ptr + 16);
            float16x8_t _p3 = vld1q_f16(ptr + 24);
            _p0 = vmaxq_f16(_p0, _min);
            _p1 = vmaxq_f16(_p1, _min);
            _p2 = vmaxq_f16(_p2, _min);
            _p3 = vmaxq_f16(_p3, _min);
            _p0 = vminq_f16(_p0, _max);
            _p1 = vminq_f16(_p1, _max);
            _p2 = vminq_f16(_p2, _max);
            _p3 = vminq_f16(_p3, _max);
            vst1q_f16(ptr, _p0);
            vst1q_f16(ptr + 8, _p1);
            vst1q_f16(ptr + 16, _p2);
            vst1q_f16(ptr + 24, _p3);
            ptr += 32;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 16 values per iteration
        for (; i + 15 < size; i += 16)
        {
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            _p0 = vmaxq_f16(_p0, _min);
            _p1 = vmaxq_f16(_p1, _min);
            _p0 = vminq_f16(_p0, _max);
            _p1 = vminq_f16(_p1, _max);
            vst1q_f16(ptr, _p0);
            vst1q_f16(ptr + 8, _p1);
            ptr += 16;
        }
        // 8 values per iteration
        for (; i + 7 < size; i += 8)
        {
            float16x8_t _p = vld1q_f16(ptr);
            _p = vmaxq_f16(_p, _min);
            _p = vminq_f16(_p, _max);
            vst1q_f16(ptr, _p);
            ptr += 8;
        }
        // 4 values per iteration, reusing the low half of the 8-lane bounds
        for (; i + 3 < size; i += 4)
        {
            float16x4_t _p = vld1_f16(ptr);
            _p = vmax_f16(_p, vget_low_f16(_min));
            _p = vmin_f16(_p, vget_low_f16(_max));
            vst1_f16(ptr, _p);
            ptr += 4;
        }
        // scalar tail: lower bound first, then upper bound
        for (; i < size; i++)
        {
            __fp16 v = *ptr;
            if (v < min_fp16)
                v = min_fp16;

            if (v > max_fp16)
                v = max_fp16;

            *ptr = v;
            ptr++;
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/concat_arm.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "concat_arm.h"

#include "cpu.h"

namespace ncnn {

// Set the capability flags for the ARM concat implementation according to the
// build configuration.
Concat_arm::Concat_arm()
{
#if NCNN_BF16
    // bf16 storage is enabled whenever bf16 support is compiled in
    support_bf16_storage = true;
#endif

#if __ARM_NEON
    // NEON builds accept packed blobs
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage is enabled only if cpu_support_arm_asimdhp() reports support
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82
}

int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    int elembits = bottom_blobs[0].elembits();

#if NCNN_ARM82
    if (support_packing && opt.use_fp16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
#endif

    int dims = bottom_blobs[0].dims;
    int positive_axis = axis < 0 ? dims + axis : axis;

    if (dims == 1) // positive_axis == 0
    {
        // concat vector
        // total length
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;
        int top_w = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w * bottom_blob.elempack;
        }

        int out_elempack = opt.use_packing_layout && top_w % 4 == 0 ? 4 : 1;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        float* outptr = top_blob;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            const float* ptr = bottom_blob;
            memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);

            outptr += bottom_blob.w * bottom_blob.elempack;
        }
    }

    if (dims == 2 && positive_axis == 0)
    {
        // concat image
        int w = bottom_blobs[0].w;

        // total height
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;
        int top_h = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            elemsize = std::min(elemsize, bottom_blob.elemsize);
            elempack = std::min(elempack, bottom_blob.elempack);
            top_h += bottom_blob.h * bottom_blob.elempack;
        }

        int out_elempack = opt.use_packing_layout && top_h % 4 == 0 ? 4 : 1;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        Mat top_blob_unpacked = top_blob;
        if (elempack < out_elempack)
        {
            top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
            if (top_blob_unpacked.empty())
                return -100;
        }

        float* outptr = top_blob_unpacked;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            if (bottom_blob.elempack == 4 && elempack == 1)
            {
                for (int i = 0; i < bottom_blob.h; i++)
                {
                    const float* r0 = bottom_blob.row(i);

                    float* outptr0 = outptr;
                    float* outptr1 = outptr + w;
                    float* outptr2 = outptr + w * 2;
                    float* outptr3 = outptr + w * 3;

                    for (int j = 0; j < w; j++)
                    {
                        *outptr0++ = r0[0];
                        *outptr1++ = r0[1];
                        *outptr2++ = r0[2];
                        *outptr3++ = r0[3];

                        r0 += 4;
                    }

                    outptr += w * 4;
                }
            }
            else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4)
            {
                int size = w * bottom_blob.h;

                const float* ptr = bottom_blob;
                memcpy(outptr, ptr, size * bottom_blob.elemsize);

                outptr += size * bottom_blob.elempack;
            }
        }

        // packing
        if (elempack < out_elempack)
        {
            convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
        }
    }

    if (dims == 2 && positive_axis == 1)
    {
        // interleave image row
        int h = bottom_blobs[0].h;
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;

        // total width
        int top_w = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            float* outptr = top_blob.row(i);
            for (size_t b = 0; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob = bottom_blobs[b];

                const float* ptr = bottom_blob.row(i);
                memcpy(outptr, ptr, bottom_blob.w * elemsize);

                outptr += bottom_blob.w * elempack;
            }
        }
    }

    if ((dims == 3 || dims == 4) && positive_axis == 0)
    {
        // concat dim
        int w = bottom_blobs[0].w;
        int h = bottom_blobs[0].h;
        int d = bottom_blobs[0].d;

        // total channels
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;
        int top_channels = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            elemsize = std::min(elemsize, bottom_blob.elemsize);
            elempack = std::min(elempack, bottom_blob.elempack);
            top_channels += bottom_blob.c * bottom_blob.elempack;
        }

        int out_elempack = opt.use_packing_layout && top_channels % 4 == 0 ? 4 : 1;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, h, d, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        top_blob.dims = dims;

        Mat top_blob_unpacked = top_blob;
        if (elempack < out_elempack)
        {
            top_blob_unpacked.create(w, h, d, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
            if (top_blob_unpacked.empty())
                return -100;

            top_blob_unpacked.dims = dims;
        }

        int p = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            if (bottom_blob.elempack == 4 && elempack == 1)
            {
                int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;

                for (int q = 0; q < bottom_blob.c; q++)
                {
                    const float* r0 = bottom_blob.channel(q);

                    float* outptr0 = top_blob_unpacked.channel(p);
                    float* outptr1 = top_blob_unpacked.channel(p + 1);
                    float* outptr2 = top_blob_unpacked.channel(p + 2);
                    float* outptr3 = top_blob_unpacked.channel(p + 3);

                    for (int i = 0; i < size; i++)
                    {
                        *outptr0++ = r0[0];
                        *outptr1++ = r0[1];
                        *outptr2++ = r0[2];
                        *outptr3++ = r0[3];

                        r0 += 4;
                    }

                    p += 4;
                }
            }
            else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4)
            {
                int size = bottom_blob.total();

                const float* ptr = bottom_blob;
                float* outptr = top_blob_unpacked.channel(p);
                memcpy(outptr, ptr, size * bottom_blob.elemsize);

                p += bottom_blob.c;
            }
        }

        // packing
        if (elempack < out_elempack)
        {
            convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
        }
    }

    if ((dims == 3 && positive_axis == 1) || (dims == 4 && positive_axis == 2))
    {
        // interleave dim height
        int w = bottom_blobs[0].w;
        int d = bottom_blobs[0].d;
        int channels = bottom_blobs[0].c;
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;

        // total height
        int top_h = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_h += bottom_blob.h;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h, d, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        top_blob.dims = dims;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < d; i++)
            {
                for (size_t b = 0; b < bottom_blobs.size(); b++)
                {
                    const Mat& bottom_blob = bottom_blobs[b];

                    int size = bottom_blob.w * bottom_blob.h;

                    const float* ptr = bottom_blob.channel(q).depth(i);
                    memcpy(outptr, ptr, size * elemsize);

                    outptr += size * elempack;
                }
            }
        }
    }

    if ((dims == 3 && positive_axis == 2) || (dims == 4 && positive_axis == 3))
    {
        // interleave dim width
        int h = bottom_blobs[0].h;
        int d = bottom_blobs[0].d;
        int channels = bottom_blobs[0].c;
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;

        // total height
        int top_w = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, d, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        top_blob.dims = dims;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < d; i++)
            {
                for (int j = 0; j < h; j++)
                {
                    for (size_t b = 0; b < bottom_blobs.size(); b++)
                    {
                        const Mat& bottom_blob = bottom_blobs[b];

                        const float* ptr = bottom_blob.channel(q).depth(i).row(j);
                        memcpy(outptr, ptr, bottom_blob.w * elemsize);

                        outptr += bottom_blob.w * elempack;
                    }
                }
            }
        }
    }

    if (dims == 4 && positive_axis == 1)
    {
        // interleave dim depth
        int w = bottom_blobs[0].w;
        int h = bottom_blobs[0].h;
        int channels = bottom_blobs[0].c;
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;

        // total depth
        int top_d = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_d += bottom_blob.d;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, h, top_d, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* outptr = top_blob.channel(q);

            for (size_t b = 0; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob = bottom_blobs[b];

                int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;

                const float* ptr = bottom_blob.channel(q);
                memcpy(outptr, ptr, size * elemsize);

                outptr += size * elempack;
            }
        }
    }

    return 0;
}

// Concatenate the input blobs along `axis`, treating every element as an
// opaque 16-bit value (all data is moved through unsigned short pointers,
// no arithmetic is performed on it).
//
// bottom_blobs  inputs; dims is taken from bottom_blobs[0]
// top_blobs     top_blobs[0] receives the concatenated result
// opt           supplies the allocators, the thread count and the packing switches
//
// A negative axis is resolved as dims + axis.
// Returns 0 on success and -100 when an output or workspace blob ends up empty
// after create().
int Concat_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    int dims = bottom_blobs[0].dims;
    int positive_axis = axis < 0 ? dims + axis : axis;

    if (dims == 1) // positive_axis == 0
    {
        // concat vector
        // total length
        // NOTE: elemsize / elempack come from the first input only in this branch
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;
        int top_w = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            // length is accumulated in scalar elements, not in packs
            top_w += bottom_blob.w * bottom_blob.elempack;
        }

        // pick the widest packing the total length allows; pack8 is only
        // considered in NCNN_ARM82 builds with fp16 storage + fp16 arithmetic
        int out_elempack = 1;
        if (opt.use_packing_layout)
        {
#if NCNN_ARM82
            out_elempack = support_fp16_storage && opt.use_fp16_arithmetic && top_w % 8 == 0 ? 8 : top_w % 4 == 0 ? 4 : 1;
#else
            out_elempack = top_w % 4 == 0 ? 4 : 1;
#endif
        }
        size_t out_elemsize = elemsize / elempack * out_elempack;

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // every input is copied verbatim, one after another
        unsigned short* outptr = top_blob;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            const unsigned short* ptr = bottom_blob;
            memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);

            // advance by the number of 16-bit scalars just written
            outptr += bottom_blob.w * bottom_blob.elempack;
        }
    }

    if (dims == 2 && positive_axis == 0)
    {
        // concat image
        int w = bottom_blobs[0].w;

        // total height
        // elemsize / elempack become the minimum over all inputs; inputs with a
        // larger elempack are unpacked down to that common value below
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;
        int top_h = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            elemsize = std::min(elemsize, bottom_blob.elemsize);
            elempack = std::min(elempack, bottom_blob.elempack);
            top_h += bottom_blob.h * bottom_blob.elempack;
        }

        int out_elempack = 1;
        if (opt.use_packing_layout)
        {
#if NCNN_ARM82
            out_elempack = support_fp16_storage && opt.use_fp16_arithmetic && top_h % 8 == 0 ? 8 : top_h % 4 == 0 ? 4 : 1;
#else
            out_elempack = top_h % 4 == 0 ? 4 : 1;
#endif
        }
        size_t out_elemsize = elemsize / elempack * out_elempack;

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // The concat writes into top_blob_unpacked. It starts as a copy of
        // top_blob (presumably sharing its storage, as no copy-back happens when
        // elempack >= out_elempack - confirm against Mat) and is replaced by a
        // separate workspace blob when the result must be repacked afterwards.
        Mat top_blob_unpacked = top_blob;
        if (elempack < out_elempack)
        {
            top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
            if (top_blob_unpacked.empty())
                return -100;
        }

        unsigned short* outptr = top_blob_unpacked;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            // The conditions below are mutually exclusive, so exactly one copy
            // routine runs per input even though they are not chained with else.
#if NCNN_ARM82
            if (bottom_blob.elempack == 8 && elempack == 4)
            {
                // split every pack8 row into two consecutive pack4 rows
                for (int i = 0; i < bottom_blob.h; i++)
                {
                    const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);

                    unsigned short* outptr0 = outptr;
                    unsigned short* outptr1 = outptr + w * 4;

                    for (int j = 0; j < w; j++)
                    {
                        outptr0[0] = r0[0];
                        outptr0[1] = r0[1];
                        outptr0[2] = r0[2];
                        outptr0[3] = r0[3];
                        outptr1[0] = r0[4];
                        outptr1[1] = r0[5];
                        outptr1[2] = r0[6];
                        outptr1[3] = r0[7];

                        outptr0 += 4;
                        outptr1 += 4;
                        r0 += 8;
                    }

                    outptr += w * 8;
                }
            }
            if (bottom_blob.elempack == 8 && elempack == 1)
            {
                // scatter every pack8 row into eight consecutive pack1 rows
                for (int i = 0; i < bottom_blob.h; i++)
                {
                    const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);

                    unsigned short* outptr0 = outptr;
                    unsigned short* outptr1 = outptr + w;
                    unsigned short* outptr2 = outptr + w * 2;
                    unsigned short* outptr3 = outptr + w * 3;
                    unsigned short* outptr4 = outptr + w * 4;
                    unsigned short* outptr5 = outptr + w * 5;
                    unsigned short* outptr6 = outptr + w * 6;
                    unsigned short* outptr7 = outptr + w * 7;

                    for (int j = 0; j < w; j++)
                    {
                        *outptr0++ = r0[0];
                        *outptr1++ = r0[1];
                        *outptr2++ = r0[2];
                        *outptr3++ = r0[3];
                        *outptr4++ = r0[4];
                        *outptr5++ = r0[5];
                        *outptr6++ = r0[6];
                        *outptr7++ = r0[7];

                        r0 += 8;
                    }

                    outptr += w * 8;
                }
            }
#endif // NCNN_ARM82
            if (bottom_blob.elempack == 4 && elempack == 1)
            {
                // scatter every pack4 row into four consecutive pack1 rows
                for (int i = 0; i < bottom_blob.h; i++)
                {
                    const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);

                    unsigned short* outptr0 = outptr;
                    unsigned short* outptr1 = outptr + w;
                    unsigned short* outptr2 = outptr + w * 2;
                    unsigned short* outptr3 = outptr + w * 3;

                    for (int j = 0; j < w; j++)
                    {
                        *outptr0++ = r0[0];
                        *outptr1++ = r0[1];
                        *outptr2++ = r0[2];
                        *outptr3++ = r0[3];

                        r0 += 4;
                    }

                    outptr += w * 4;
                }
            }
            if (bottom_blob.elempack == elempack) // 1-1 4-4 8-8
            {
                // same packing on both sides: a single bulk copy is enough
                int size = w * bottom_blob.h;

                const unsigned short* ptr = bottom_blob;
                memcpy(outptr, ptr, size * bottom_blob.elemsize);

                outptr += size * bottom_blob.elempack;
            }
        }

        // packing
        if (elempack < out_elempack)
        {
            convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
        }
    }

    if (dims == 2 && positive_axis == 1)
    {
        // interleave image row
        int h = bottom_blobs[0].h;
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;

        // total width
        int top_w = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }

        // output keeps the elemsize / elempack of the first input
        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // each output row is row i of every input, placed side by side
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            unsigned short* outptr = top_blob.row<unsigned short>(i);
            for (size_t b = 0; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob = bottom_blobs[b];

                const unsigned short* ptr = bottom_blob.row<unsigned short>(i);
                memcpy(outptr, ptr, bottom_blob.w * elemsize);

                outptr += bottom_blob.w * elempack;
            }
        }
    }

    if ((dims == 3 || dims == 4) && positive_axis == 0)
    {
        // concat dim
        int w = bottom_blobs[0].w;
        int h = bottom_blobs[0].h;
        int d = bottom_blobs[0].d;

        // total channels
        // elemsize / elempack become the minimum over all inputs; inputs with a
        // larger elempack are unpacked down to that common value below
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;
        int top_channels = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            elemsize = std::min(elemsize, bottom_blob.elemsize);
            elempack = std::min(elempack, bottom_blob.elempack);
            top_channels += bottom_blob.c * bottom_blob.elempack;
        }

        int out_elempack = 1;
        if (opt.use_packing_layout)
        {
#if NCNN_ARM82
            out_elempack = support_fp16_storage && opt.use_fp16_arithmetic && top_channels % 8 == 0 ? 8 : top_channels % 4 == 0 ? 4 : 1;
#else
            out_elempack = top_channels % 4 == 0 ? 4 : 1;
#endif
        }
        size_t out_elemsize = elemsize / elempack * out_elempack;

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, h, d, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // the blob was created through the (w, h, d, c) overload; restore the
        // input rank so a 3-D concat still reports dims == 3
        top_blob.dims = dims;

        // same scheme as the 2-D case: write into top_blob_unpacked, which is a
        // separate workspace blob only when repacking is required afterwards
        Mat top_blob_unpacked = top_blob;
        if (elempack < out_elempack)
        {
            top_blob_unpacked.create(w, h, d, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
            if (top_blob_unpacked.empty())
                return -100;

            top_blob_unpacked.dims = dims;
        }

        // p is the next output channel index, counted in units of `elempack`
        int p = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            // The conditions below are mutually exclusive, so exactly one copy
            // routine runs per input even though they are not chained with else.
#if NCNN_ARM82
            if (bottom_blob.elempack == 8 && elempack == 4)
            {
                int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;

                // one pack8 channel becomes two pack4 channels
                for (int q = 0; q < bottom_blob.c; q++)
                {
                    const unsigned short* r0 = bottom_blob.channel(q);

                    unsigned short* outptr0 = top_blob_unpacked.channel(p);
                    unsigned short* outptr1 = top_blob_unpacked.channel(p + 1);

                    for (int i = 0; i < size; i++)
                    {
                        outptr0[0] = r0[0];
                        outptr0[1] = r0[1];
                        outptr0[2] = r0[2];
                        outptr0[3] = r0[3];
                        outptr1[0] = r0[4];
                        outptr1[1] = r0[5];
                        outptr1[2] = r0[6];
                        outptr1[3] = r0[7];

                        outptr0 += 4;
                        outptr1 += 4;
                        r0 += 8;
                    }

                    p += 2;
                }
            }
            if (bottom_blob.elempack == 8 && elempack == 1)
            {
                int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;

                // one pack8 channel becomes eight pack1 channels
                for (int q = 0; q < bottom_blob.c; q++)
                {
                    const unsigned short* r0 = bottom_blob.channel(q);

                    unsigned short* outptr0 = top_blob_unpacked.channel(p);
                    unsigned short* outptr1 = top_blob_unpacked.channel(p + 1);
                    unsigned short* outptr2 = top_blob_unpacked.channel(p + 2);
                    unsigned short* outptr3 = top_blob_unpacked.channel(p + 3);
                    unsigned short* outptr4 = top_blob_unpacked.channel(p + 4);
                    unsigned short* outptr5 = top_blob_unpacked.channel(p + 5);
                    unsigned short* outptr6 = top_blob_unpacked.channel(p + 6);
                    unsigned short* outptr7 = top_blob_unpacked.channel(p + 7);

                    for (int i = 0; i < size; i++)
                    {
                        *outptr0++ = r0[0];
                        *outptr1++ = r0[1];
                        *outptr2++ = r0[2];
                        *outptr3++ = r0[3];
                        *outptr4++ = r0[4];
                        *outptr5++ = r0[5];
                        *outptr6++ = r0[6];
                        *outptr7++ = r0[7];

                        r0 += 8;
                    }

                    p += 8;
                }
            }
#endif // NCNN_ARM82
            if (bottom_blob.elempack == 4 && elempack == 1)
            {
                int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;

                // one pack4 channel becomes four pack1 channels
                for (int q = 0; q < bottom_blob.c; q++)
                {
                    const unsigned short* r0 = bottom_blob.channel(q);

                    unsigned short* outptr0 = top_blob_unpacked.channel(p);
                    unsigned short* outptr1 = top_blob_unpacked.channel(p + 1);
                    unsigned short* outptr2 = top_blob_unpacked.channel(p + 2);
                    unsigned short* outptr3 = top_blob_unpacked.channel(p + 3);

                    for (int i = 0; i < size; i++)
                    {
                        *outptr0++ = r0[0];
                        *outptr1++ = r0[1];
                        *outptr2++ = r0[2];
                        *outptr3++ = r0[3];

                        r0 += 4;
                    }

                    p += 4;
                }
            }
            if (bottom_blob.elempack == elempack) // 1-1 4-4 8-8
            {
                // same packing on both sides: copy the whole input in one go,
                // starting at output channel p
                int size = bottom_blob.total();

                const unsigned short* ptr = bottom_blob;
                unsigned short* outptr = top_blob_unpacked.channel(p);
                memcpy(outptr, ptr, size * bottom_blob.elemsize);

                p += bottom_blob.c;
            }
        }

        // packing
        if (elempack < out_elempack)
        {
            convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
        }
    }

    if ((dims == 3 && positive_axis == 1) || (dims == 4 && positive_axis == 2))
    {
        // interleave dim height
        int w = bottom_blobs[0].w;
        int d = bottom_blobs[0].d;
        int channels = bottom_blobs[0].c;
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;

        // total height
        int top_h = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_h += bottom_blob.h;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h, d, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // created through the (w, h, d, c) overload; restore the input rank
        top_blob.dims = dims;

        // per channel and depth slice, the w*h planes of all inputs are
        // written one after another
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned short* outptr = top_blob.channel(q);

            for (int i = 0; i < d; i++)
            {
                for (size_t b = 0; b < bottom_blobs.size(); b++)
                {
                    const Mat& bottom_blob = bottom_blobs[b];

                    int size = bottom_blob.w * bottom_blob.h;

                    const unsigned short* ptr = bottom_blob.channel(q).depth(i);
                    memcpy(outptr, ptr, size * elemsize);

                    outptr += size * elempack;
                }
            }
        }
    }

    if ((dims == 3 && positive_axis == 2) || (dims == 4 && positive_axis == 3))
    {
        // interleave dim width
        int h = bottom_blobs[0].h;
        int d = bottom_blobs[0].d;
        int channels = bottom_blobs[0].c;
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;

        // total width
        int top_w = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, d, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // created through the (w, h, d, c) overload; restore the input rank
        top_blob.dims = dims;

        // per channel, depth slice and row, row j of every input is written
        // side by side
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned short* outptr = top_blob.channel(q);

            for (int i = 0; i < d; i++)
            {
                for (int j = 0; j < h; j++)
                {
                    for (size_t b = 0; b < bottom_blobs.size(); b++)
                    {
                        const Mat& bottom_blob = bottom_blobs[b];

                        const unsigned short* ptr = bottom_blob.channel(q).depth(i).row<const unsigned short>(j);
                        memcpy(outptr, ptr, bottom_blob.w * elemsize);

                        outptr += bottom_blob.w * elempack;
                    }
                }
            }
        }
    }

    if (dims == 4 && positive_axis == 1)
    {
        // interleave dim depth
        int w = bottom_blobs[0].w;
        int h = bottom_blobs[0].h;
        int channels = bottom_blobs[0].c;
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;

        // total depth
        int top_d = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_d += bottom_blob.d;
        }

        // unlike the branches above, top_blob.dims is not reassigned after
        // create() here; this branch only runs for dims == 4
        Mat& top_blob = top_blobs[0];
        top_blob.create(w, h, top_d, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // per channel, the complete w*h*d volume of every input is appended
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned short* outptr = top_blob.channel(q);

            for (size_t b = 0; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob = bottom_blobs[b];

                int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;

                const unsigned short* ptr = bottom_blob.channel(q);
                memcpy(outptr, ptr, size * elemsize);

                outptr += size * elempack;
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/concat_arm.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONCAT_ARM_H
#define LAYER_CONCAT_ARM_H

#include "concat.h"

namespace ncnn {

// ARM-specific variant of the Concat layer.
class Concat_arm : public Concat
{
public:
    Concat_arm();

    // Concatenate bottom_blobs into top_blobs[0]; overrides the base class entry point.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // Concat implementation that moves the data as 16-bit elements
    // (bf16 / fp16 storage, per the name).
    int forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_CONCAT_ARM_H


================================================
FILE: src/layer/arm/convolution1d_arm.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "convolution1d_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"
#include "arm_activation.h"

#include "cpu.h"
#include "layer_type.h"

namespace ncnn {

#include "convolution1d_packed.h"
#if NCNN_BF16
#include "convolution1d_packed_bf16s.h"
#endif // NCNN_BF16

// Advertise which storage layouts this ARM implementation can handle.
Convolution1D_arm::Convolution1D_arm()
{
    // packed layouts are only available in NEON builds
#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON

    // fp16 storage additionally needs an ARMv8.2 build and runtime support
#if __ARM_NEON && NCNN_ARM82
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82

    // bf16 storage is enabled whenever it is compiled in
#if NCNN_BF16
    support_bf16_storage = true;
#endif // NCNN_BF16
}

int Convolution1D_arm::create_pipeline(const Option& opt)
{
    if (dynamic_weight)
        return 0;

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage)
    {
        return create_pipeline_fp16s(opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        return create_pipeline_bf16s(opt);
    }
#endif

    const int num_input = weight_data_size / kernel_w / num_output;

    convolution1d_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w);

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

// Nothing to tear down explicitly; always reports success.
int Convolution1D_arm::destroy_pipeline(const Option&)
{
    const int status = 0;
    return status;
}

int Convolution1D_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);
        else
            return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
#endif

    int w = bottom_blob.w;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;

    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif
    size_t out_elemsize = elemsize / elempack * out_elempack;

    const int outw = (w - kernel_extent_w) / stride_w + 1;
    const int outh = num_output / out_elempack;

    top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    convolution1d_packed(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, dilation_w, stride_w, activation_type, activation_params, opt);

    return 0;
}

int Convolution1D_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _kernel_w = _weight_data.w;
    const int _num_output = _weight_data.c * _weight_data.elempack;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

#if NCNN_ARM82
    if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && weight_data_flattened.elembits() == 16)
    {
        Mat weight_data_flattened_fp32;
        cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
        weight_data_flattened = weight_data_flattened_fp32;
    }
#endif // NCNN_ARM82
#if NCNN_BF16
    if (opt.use_bf16_storage && weight_data_flattened.elembits() == 16)
    {
        Mat weight_data_flattened_fp32;
        cast_bfloat16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
        weight_data_flattened = weight_data_flattened_fp32;
    }
#endif // NCNN_BF16

    // weight_data_flattened as pack1
    weight_data_flattened.w *= weight_data_flattened.elempack;
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
    weight_data_flattened.elempack = 1;

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;

#if NCNN_ARM82
        if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && bias_data_flattened.elembits() == 16)
        {
            Mat bias_data_flattened_fp32;
            cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
            bias_data_flattened = bias_data_flattened_fp32;
        }
#endif // NCNN_ARM82
#if NCNN_BF16
        if (opt.use_bf16_storage && bias_data_flattened.elembits() == 16)
        {
            Mat bias_data_flattened_fp32;
            cast_bfloat16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
            bias_data_flattened = bias_data_flattened_fp32;
        }
#endif // NCNN_BF16

        // bias_data_flattened as pack1
        bias_data_flattened.w *= bias_data_flattened.elempack;
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
        bias_data_flattened.elempack = 1;
    }

    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);

    ncnn::ParamDict pd;
    pd.set(0, _num_output);
    pd.set(1, _kernel_w);
    pd.set(2, dilation_w);
    pd.set(3, stride_w);
    pd.set(4, pad_left);
    pd.set(15, pad_right);
    pd.set(18, pad_value);
    pd.set(5, bias_term);
    pd.set(6, weight_data_flattened.w);
    pd.set(9, activation_type);
    pd.set(10, activation_params);

    op->load_param(pd);

    ncnn::Mat weights[2];
    weights[0] = weight_data_flattened;
    weights[1] = bias_data_flattened;

    op->load_model(ncnn::ModelBinFromMatArray(weights));

    op->create_pipeline(opt);

    op->forward(bottom_blob, top_blob, opt);

    op->destroy_pipeline(opt);

    delete op;

    return 0;
}

#if NCNN_BF16
int Convolution1D_arm::create_pipeline_bf16s(const Option& opt)
{
    const int num_input = weight_data_size / kernel_w / num_output;

    convolution1d_transform_kernel_packed_bf16s(weight_data, weight_data_tm, num_input, num_output, kernel_w);

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int Convolution1D_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;

    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif
    size_t out_elemsize = elemsize / elempack * out_elempack;

    const int outw = (w - kernel_extent_w) / stride_w + 1;
    const int outh = num_output / out_elempack;

    top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    convolution1d_packed_bf16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, dilation_w, stride_w, activation_type, activation_params, opt);

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/convolution1d_arm.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONVOLUTION1D_ARM_H
#define LAYER_CONVOLUTION1D_ARM_H

#include "convolution1d.h"

namespace ncnn {

// ARM-specific variant of the Convolution1D layer.
class Convolution1D_arm : public Convolution1D
{
public:
    Convolution1D_arm();

    // Prepare / release the per-layer state used by forward().
    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // Forward pass using the weights held by the layer.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Forward pass whose weights (and optional bias) are passed in as extra input blobs.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 variants: fp16s = fp16 storage, fp16sa = chosen when opt.use_fp16_arithmetic is set
    int create_pipeline_fp16s(const Option& opt);
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 storage variants
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif

public:
    // kernel weights after the packed-kernel transform done in create_pipeline*()
    Mat weight_data_tm;

    // fp16
    // bias_data converted to fp16 by create_pipeline_fp16s()
    Mat bias_data_fp16;
};

} // namespace ncnn

#endif // LAYER_CONVOLUTION1D_ARM_H


================================================
FILE: src/layer/arm/convolution1d_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "convolution1d_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

#include "cpu.h"

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "convolution1d_packed_fp16s.h"

int Convolution1D_arm::create_pipeline_fp16s(const Option& opt)
{
    const int num_input = weight_data_size / kernel_w / num_output;

    convolution1d_transform_kernel_packed_fp16s(weight_data, weight_data_tm, num_input, num_output, kernel_w);

    ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int Convolution1D_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;

    int out_elempack = (opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    const int outw = (w - kernel_extent_w) / stride_w + 1;
    const int outh = num_output / out_elempack;

    top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    convolution1d_packed_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, dilation_w, stride_w, activation_type, activation_params, opt);

    return 0;
}

int Convolution1D_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;

    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    }
    size_t out_elemsize = elemsize / elempack * out_elempack;

    const int outw = (w - kernel_extent_w) / stride_w + 1;
    const int outh = num_output / out_elempack;

    top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    convolution1d_packed_fp16sa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, dilation_w, stride_w, activation_type, activation_params, opt);

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/convolution1d_packed.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Repack fp32 1D-convolution weights into the tiled layout stored in kernel_tm.
//
//   kernel    source weights, read as a flat float array indexed as
//             (out * inh + in) * kernel_w + k
//   kernel_tm destination, (re)created here
//   inh       number of input rows (channels)
//   outh      number of output rows (channels)
//   kernel_w  kernel width
//
// Outputs are consumed in groups of 8 (aarch64 only), 4 (NEON only), 2 and 1;
// inputs are consumed in groups of 8 (aarch64 only), 4 (NEON only), 2 and 1.
// Each output group gets its own kernel_tm channel and the floats of all its
// input groups are written back to back through a running pointer.
static void convolution1d_transform_kernel_packed(const Mat& kernel, Mat& kernel_tm, int inh, int outh, int kernel_w)
{
    // src = kw-inh-outh
    // dst = pb-pa-kw-inh/pa-outh/pb

    // Allocation: the first create() argument is (largest out tile) * (largest in tile) * kernel_w,
    // the second is the number of input groups the fill loops will emit and the third is the
    // number of output groups (the value later passed to channel()).

    // clang-format off
    // *INDENT-OFF*
#if __ARM_NEON
#if __aarch64__
    if (outh >= 8)
    {
        if (inh >= 8)
            kernel_tm.create(8 * 8 * kernel_w, inh / 8 + (inh % 8) / 4 + (inh % 4) / 2 + inh % 2, outh / 8 + (outh % 8) / 4 + (outh % 4) / 2 + outh % 2);
        else if (inh >= 4)
            kernel_tm.create(8 * 4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh / 8 + (outh % 8) / 4 + (outh % 4) / 2 + outh % 2);
        else if (inh >= 2)
            kernel_tm.create(8 * 2 * kernel_w, inh / 2 + inh % 2, outh / 8 + (outh % 8) / 4 + (outh % 4) / 2 + outh % 2);
        else
            kernel_tm.create(8 * kernel_w, inh, outh / 8 + (outh % 8) / 4 + (outh % 4) / 2 + outh % 2);
    }
    else
#endif // __aarch64__
    if (outh >= 4)
    {
#if __aarch64__
        if (inh >= 8)
            kernel_tm.create(4 * 8 * kernel_w, inh / 8 + (inh % 8) / 4 + (inh % 4) / 2 + inh % 2, outh / 4 + (outh % 4) / 2 + outh % 2);
        else
#endif // __aarch64__
        if (inh >= 4)
            kernel_tm.create(4 * 4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh / 4 + (outh % 4) / 2 + outh % 2);
        else if (inh >= 2)
            kernel_tm.create(4 * 2 * kernel_w, inh / 2 + inh % 2, outh / 4 + (outh % 4) / 2 + outh % 2);
        else
            kernel_tm.create(4 * kernel_w, inh, outh / 4 + (outh % 4) / 2 + outh % 2);
    }
    else
#endif // __ARM_NEON
    if (outh >= 2)
    {
#if __ARM_NEON
#if __aarch64__
        if (inh >= 8)
            kernel_tm.create(2 * 8 * kernel_w, inh / 8 + (inh % 8) / 4 + (inh % 4) / 2 + inh % 2, outh / 2 + outh % 2);
        else
#endif // __aarch64__
        if (inh >= 4)
            kernel_tm.create(2 * 4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh / 2 + outh % 2);
        else
#endif // __ARM_NEON
        if (inh >= 2)
            kernel_tm.create(2 * 2 * kernel_w, inh / 2 + inh % 2, outh / 2 + outh % 2);
        else
            kernel_tm.create(2 * kernel_w, inh, outh / 2 + outh % 2);
    }
    else
    {
#if __ARM_NEON
#if __aarch64__
        if (inh >= 8)
            kernel_tm.create(8 * kernel_w, inh / 8 + (inh % 8) / 4 + (inh % 4) / 2 + inh % 2, outh);
        else
#endif // __aarch64__
        if (inh >= 4)
            kernel_tm.create(4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh);
        else
#endif // __ARM_NEON
        if (inh >= 2)
            kernel_tm.create(2 * kernel_w, inh / 2 + inh % 2, outh);
        else
            kernel_tm.create(kernel_w, inh, outh);
    }
    // *INDENT-ON*
    // clang-format on

    // q walks the output rows; it is shared by all tile loops below so each
    // smaller tile size picks up where the larger one stopped
    int q = 0;
#if __ARM_NEON
#if __aarch64__
    // ---- output tile of 8 ----
    for (; q + 7 < outh; q += 8)
    {
        // start of the weights of each of the 8 outputs in the source
        const float* kptr0 = (const float*)kernel + q * inh * kernel_w;
        const float* kptr1 = (const float*)kernel + (q + 1) * inh * kernel_w;
        const float* kptr2 = (const float*)kernel + (q + 2) * inh * kernel_w;
        const float* kptr3 = (const float*)kernel + (q + 3) * inh * kernel_w;
        const float* kptr4 = (const float*)kernel + (q + 4) * inh * kernel_w;
        const float* kptr5 = (const float*)kernel + (q + 5) * inh * kernel_w;
        const float* kptr6 = (const float*)kernel + (q + 6) * inh * kernel_w;
        const float* kptr7 = (const float*)kernel + (q + 7) * inh * kernel_w;

        // destination cursor, advanced continuously through every input group
        float* g00 = kernel_tm.channel(q / 8);

        int p = 0;
        // 8 inputs per group: for each tap k, for each input i, the 8 outputs are stored contiguously
        for (; p + 7 < inh; p += 8)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;
                const float* k4 = kptr4 + p * kernel_w;
                const float* k5 = kptr5 + p * kernel_w;
                const float* k6 = kptr6 + p * kernel_w;
                const float* k7 = kptr7 + p * kernel_w;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = k0[k];
                    g00[1] = k1[k];
                    g00[2] = k2[k];
                    g00[3] = k3[k];
                    g00[4] = k4[k];
                    g00[5] = k5[k];
                    g00[6] = k6[k];
                    g00[7] = k7[k];
                    // step to the same tap of the next input row
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    k4 += kernel_w;
                    k5 += kernel_w;
                    k6 += kernel_w;
                    k7 += kernel_w;
                    g00 += 8;
                }
            }
        }
        // 4 inputs per group, same ordering
        for (; p + 3 < inh; p += 4)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;
                const float* k4 = kptr4 + p * kernel_w;
                const float* k5 = kptr5 + p * kernel_w;
                const float* k6 = kptr6 + p * kernel_w;
                const float* k7 = kptr7 + p * kernel_w;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = k0[k];
                    g00[1] = k1[k];
                    g00[2] = k2[k];
                    g00[3] = k3[k];
                    g00[4] = k4[k];
                    g00[5] = k5[k];
                    g00[6] = k6[k];
                    g00[7] = k7[k];
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    k4 += kernel_w;
                    k5 += kernel_w;
                    k6 += kernel_w;
                    k7 += kernel_w;
                    g00 += 8;
                }
            }
        }
        // 2 inputs per group, same ordering
        for (; p + 1 < inh; p += 2)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;
                const float* k4 = kptr4 + p * kernel_w;
                const float* k5 = kptr5 + p * kernel_w;
                const float* k6 = kptr6 + p * kernel_w;
                const float* k7 = kptr7 + p * kernel_w;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = k0[k];
                    g00[1] = k1[k];
                    g00[2] = k2[k];
                    g00[3] = k3[k];
                    g00[4] = k4[k];
                    g00[5] = k5[k];
                    g00[6] = k6[k];
                    g00[7] = k7[k];
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    k4 += kernel_w;
                    k5 += kernel_w;
                    k6 += kernel_w;
                    k7 += kernel_w;
                    g00 += 8;
                }
            }
        }
        // leftover single input: 8 outputs per tap
        for (; p < inh; p++)
        {
            const float* k0 = kptr0 + p * kernel_w;
            const float* k1 = kptr1 + p * kernel_w;
            const float* k2 = kptr2 + p * kernel_w;
            const float* k3 = kptr3 + p * kernel_w;
            const float* k4 = kptr4 + p * kernel_w;
            const float* k5 = kptr5 + p * kernel_w;
            const float* k6 = kptr6 + p * kernel_w;
            const float* k7 = kptr7 + p * kernel_w;

            for (int k = 0; k < kernel_w; k++)
            {
                g00[0] = k0[k];
                g00[1] = k1[k];
                g00[2] = k2[k];
                g00[3] = k3[k];
                g00[4] = k4[k];
                g00[5] = k5[k];
                g00[6] = k6[k];
                g00[7] = k7[k];
                g00 += 8;
            }
        }
    }
#endif // __aarch64__
    // ---- output tile of 4 ----
    for (; q + 3 < outh; q += 4)
    {
        const float* kptr0 = (const float*)kernel + q * inh * kernel_w;
        const float* kptr1 = (const float*)kernel + (q + 1) * inh * kernel_w;
        const float* kptr2 = (const float*)kernel + (q + 2) * inh * kernel_w;
        const float* kptr3 = (const float*)kernel + (q + 3) * inh * kernel_w;

        // channel index = number of output groups emitted before this one
#if __aarch64__
        float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4);
#else
        float* g00 = kernel_tm.channel(q / 4);
#endif

        int p = 0;
#if __aarch64__
        // 8 inputs per group: for each tap k, for each input i, the 4 outputs are stored contiguously
        for (; p + 7 < inh; p += 8)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = k0[k];
                    g00[1] = k1[k];
                    g00[2] = k2[k];
                    g00[3] = k3[k];
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    g00 += 4;
                }
            }
        }
#endif // __aarch64__
        // 4 inputs per group, same ordering
        for (; p + 3 < inh; p += 4)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = k0[k];
                    g00[1] = k1[k];
                    g00[2] = k2[k];
                    g00[3] = k3[k];
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    g00 += 4;
                }
            }
        }
        // 2 inputs per group, same ordering
        for (; p + 1 < inh; p += 2)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = k0[k];
                    g00[1] = k1[k];
                    g00[2] = k2[k];
                    g00[3] = k3[k];
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    g00 += 4;
                }
            }
        }
        // leftover single input: 4 outputs per tap
        for (; p < inh; p++)
        {
            const float* k0 = kptr0 + p * kernel_w;
            const float* k1 = kptr1 + p * kernel_w;
            const float* k2 = kptr2 + p * kernel_w;
            const float* k3 = kptr3 + p * kernel_w;

            for (int k = 0; k < kernel_w; k++)
            {
                g00[0] = k0[k];
                g00[1] = k1[k];
                g00[2] = k2[k];
                g00[3] = k3[k];
                g00 += 4;
            }
        }
    }
#endif // __ARM_NEON
    // ---- output tile of 2 ----
    for (; q + 1 < outh; q += 2)
    {
        const float* kptr0 = (const float*)kernel + q * inh * kernel_w;
        const float* kptr1 = (const float*)kernel + (q + 1) * inh * kernel_w;

        // channel index = number of output groups emitted before this one
#if __aarch64__
        float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2);
#elif __ARM_NEON
        float* g00 = kernel_tm.channel(q / 4 + (q % 4) / 2);
#else
        float* g00 = kernel_tm.channel(q / 2);
#endif

        int p = 0;
#if __ARM_NEON
#if __aarch64__
        // 8 inputs per group. Unlike the wider output tiles, the ordering per tap is
        // input-contiguous: 8 inputs of output q first, then 8 inputs of output q + 1.
        for (; p + 7 < inh; p += 8)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                // k0 / k1 point at tap k of input p; consecutive inputs are kernel_w apart
                const float* k0 = kptr0 + p * kernel_w + k;
                const float* k1 = kptr1 + p * kernel_w + k;

                g00[0] = k0[0];
                g00[1] = k0[kernel_w];
                g00[2] = k0[kernel_w * 2];
                g00[3] = k0[kernel_w * 3];
                g00[4] = k0[kernel_w * 4];
                g00[5] = k0[kernel_w * 5];
                g00[6] = k0[kernel_w * 6];
                g00[7] = k0[kernel_w * 7];
                g00[8] = k1[0];
                g00[9] = k1[kernel_w];
                g00[10] = k1[kernel_w * 2];
                g00[11] = k1[kernel_w * 3];
                g00[12] = k1[kernel_w * 4];
                g00[13] = k1[kernel_w * 5];
                g00[14] = k1[kernel_w * 6];
                g00[15] = k1[kernel_w * 7];
                g00 += 16;
            }
        }
#endif // __aarch64__
        // 4 inputs per group, input-contiguous: 4 inputs of output q, then 4 of output q + 1
        for (; p + 3 < inh; p += 4)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w + k;
                const float* k1 = kptr1 + p * kernel_w + k;

                g00[0] = k0[0];
                g00[1] = k0[kernel_w];
                g00[2] = k0[kernel_w * 2];
                g00[3] = k0[kernel_w * 3];
                g00[4] = k1[0];
                g00[5] = k1[kernel_w];
                g00[6] = k1[kernel_w * 2];
                g00[7] = k1[kernel_w * 3];
                g00 += 8;
            }
        }
#endif // __ARM_NEON
        // 2 inputs per group: here the two outputs are interleaved per input again
        for (; p + 1 < inh; p += 2)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = k0[k];
                    g00[1] = k1[k];
                    k0 += kernel_w;
                    k1 += kernel_w;
                    g00 += 2;
                }
            }
        }
        // leftover single input: 2 outputs per tap
        for (; p < inh; p++)
        {
            const float* k0 = kptr0 + p * kernel_w;
            const float* k1 = kptr1 + p * kernel_w;

            for (int k = 0; k < kernel_w; k++)
            {
                g00[0] = k0[k];
                g00[1] = k1[k];
                g00 += 2;
            }
        }
    }
    // ---- leftover single outputs ----
    for (; q < outh; q++)
    {
        const float* kptr = (const float*)kernel + q * inh * kernel_w;

        // channel index = number of output groups emitted before this one
#if __aarch64__
        float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2 + q % 2);
#elif __ARM_NEON
        float* g00 = kernel_tm.channel(q / 4 + (q % 4) / 2 + q % 2);
#else
        float* g00 = kernel_tm.channel(q / 2 + q % 2);
#endif

        int p = 0;
#if __ARM_NEON
#if __aarch64__
        // 8 inputs per group: for each tap k the 8 inputs are stored contiguously
        for (; p + 7 < inh; p += 8)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr + p * kernel_w;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = k0[k];
                    k0 += kernel_w;
                    g00 += 1;
                }
            }
        }
#endif // __aarch64__
        // 4 inputs per group, same ordering
        for (; p + 3 < inh; p += 4)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr + p * kernel_w;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = k0[k];
                    k0 += kernel_w;
                    g00 += 1;
                }
            }
        }
#endif // __ARM_NEON
        // 2 inputs per group, same ordering
        for (; p + 1 < inh; p += 2)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr + p * kernel_w;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = k0[k];
                    k0 += kernel_w;
                    g00 += 1;
                }
            }
        }
        // leftover single input: taps copied in source order
        for (; p < inh; p++)
        {
            const float* k0 = kptr + p * kernel_w;

            for (int k = 0; k < kernel_w; k++)
            {
                g00[0] = k0[k];
                g00++;
            }
        }
    }
}

static void convolution1d_packed(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int kernel_w, int dilation_w, int stride_w, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int elempack = bottom_blob.elempack;
    const int inh = bottom_blob.h * elempack;

    const int N = bottom_blob.w * elempack;

    const int outw = top_blob.w;
    const int out_elempack = top_blob.elempack;
    const int outh = top_blob.h * out_elempack;

    const int M = top_blob.w * out_elempack;

    const float* bias_data_ptr = bias_data;

    int nn_outh = 0;
    int remain_outh_start = 0;
#if __ARM_NEON
#if __aarch64__
    nn_outh = (outh - remain_outh_start) / 8;
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outh; pp++)
    {
        const int p = remain_outh_start + pp * 8;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inh = bottom_blob.h * elempack;
        const int outw = top_blob.w;
        const int out_elempack = top_blob.elempack;

        float* outptr = top_blob.row(p / out_elempack);

        for (int j = 0; j < outw; j++)
        {
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);
            float32x4_t _sum4 = vdupq_n_f32(0.f);
            float32x4_t _sum5 = vdupq_n_f32(0.f);
            float32x4_t _sum6 = vdupq_n_f32(0.f);
            float32x4_t _sum7 = vdupq_n_f32(0.f);

            if (bias_data_ptr)
            {
                _sum0 = vld1q_f32(bias_data_ptr + p);
                _sum1 = vld1q_f32(bias_data_ptr + p + 4);
            }

            const float* kptr = weight_data_tm.channel(p / 8);

            int q = 0;
            for (; q + 7 < inh; q += 8)
            {
                const float* r0 = bottom_blob.row(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    float32x4_t _r1;
                    if (elempack == 4)
                    {
                        _r0 = vld1q_f32(r0);
                        _r1 = vld1q_f32(r0 + N);
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        _r0 = float32x4_t();
                        _r1 = float32x4_t();
                        _r0 = vsetq_lane_f32(r0[0], _r0, 0);
                        _r0 = vsetq_lane_f32(r0[N], _r0, 1);
                        _r0 = vsetq_lane_f32(r0[N * 2], _r0, 2);
                        _r0 = vsetq_lane_f32(r0[N * 3], _r0, 3);
                        _r1 = vsetq_lane_f32(r0[N * 4], _r1, 0);
                        _r1 = vsetq_lane_f32(r0[N * 5], _r1, 1);
                        _r1 = vsetq_lane_f32(r0[N * 6], _r1, 2);
                        _r1 = vsetq_lane_f32(r0[N * 7], _r1, 3);
                        r0 += dilation_w;
                    }

                    float32x4_t _w0 = vld1q_f32(kptr);
                    float32x4_t _w1 = vld1q_f32(kptr + 4);
                    float32x4_t _w2 = vld1q_f32(kptr + 4 * 2);
                    float32x4_t _w3 = vld1q_f32(kptr + 4 * 3);
                    float32x4_t _w4 = vld1q_f32(kptr + 4 * 4);
                    float32x4_t _w5 = vld1q_f32(kptr + 4 * 5);
                    float32x4_t _w6 = vld1q_f32(kptr + 4 * 6);
                    float32x4_t _w7 = vld1q_f32(kptr + 4 * 7);
                    float32x4_t _w8 = vld1q_f32(kptr + 4 * 8);
                    float32x4_t _w9 = vld1q_f32(kptr + 4 * 9);
                    float32x4_t _wa = vld1q_f32(kptr + 4 * 10);
                    float32x4_t _wb = vld1q_f32(kptr + 4 * 11);
                    float32x4_t _wc = vld1q_f32(kptr + 4 * 12);
                    float32x4_t _wd = vld1q_f32(kptr + 4 * 13);
                    float32x4_t _we = vld1q_f32(kptr + 4 * 14);
                    float32x4_t _wf = vld1q_f32(kptr + 4 * 15);
                    _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 0);
                    _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 1);
                    _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 1);
                    _sum4 = vfmaq_laneq_f32(_sum4, _w4, _r0, 2);
                    _sum5 = vfmaq_laneq_f32(_sum5, _w5, _r0, 2);
                    _sum6 = vfmaq_laneq_f32(_sum6, _w6, _r0, 3);
                    _sum7 = vfmaq_laneq_f32(_sum7, _w7, _r0, 3);
                    _sum0 = vfmaq_laneq_f32(_sum0, _w8, _r1, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w9, _r1, 0);
                    _sum2 = vfmaq_laneq_f32(_sum2, _wa, _r1, 1);
                    _sum3 = vfmaq_laneq_f32(_sum3, _wb, _r1, 1);
                    _sum4 = vfmaq_laneq_f32(_sum4, _wc, _r1, 2);
                    _sum5 = vfmaq_laneq_f32(_sum5, _wd, _r1, 2);
                    _sum6 = vfmaq_laneq_f32(_sum6, _we, _r1, 3);
                    _sum7 = vfmaq_laneq_f32(_sum7, _wf, _r1, 3);

                    kptr += 64;
                }
            }
            for (; q + 3 < inh; q += 4)
            {
                const float* r0 = bottom_blob.row(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    if (elempack == 4)
                    {
                        _r0 = vld1q_f32(r0);
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        _r0 = float32x4_t();
                        _r0 = vsetq_lane_f32(r0[0], _r0, 0);
                        _r0 = vsetq_lane_f32(r0[N], _r0, 1);
                        _r0 = vsetq_lane_f32(r0[N * 2], _r0, 2);
                        _r0 = vsetq_lane_f32(r0[N * 3], _r0, 3);
                        r0 += dilation_w;
                    }

                    float32x4_t _w0 = vld1q_f32(kptr);
                    float32x4_t _w1 = vld1q_f32(kptr + 4);
                    float32x4_t _w2 = vld1q_f32(kptr + 4 * 2);
                    float32x4_t _w3 = vld1q_f32(kptr + 4 * 3);
                    float32x4_t _w4 = vld1q_f32(kptr + 4 * 4);
                    float32x4_t _w5 = vld1q_f32(kptr + 4 * 5);
                    float32x4_t _w6 = vld1q_f32(kptr + 4 * 6);
                    float32x4_t _w7 = vld1q_f32(kptr + 4 * 7);
                    _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 0);
                    _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 1);
                    _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 1);
                    _sum4 = vfmaq_laneq_f32(_sum4, _w4, _r0, 2);
                    _sum5 = vfmaq_laneq_f32(_sum5, _w5, _r0, 2);
                    _sum6 = vfmaq_laneq_f32(_sum6, _w6, _r0, 3);
                    _sum7 = vfmaq_laneq_f32(_sum7, _w7, _r0, 3);

                    kptr += 32;
                }
            }
            for (; q + 1 < inh; q += 2)
            {
                const float* r0 = bottom_blob.row(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val0;
                    float val1;
                    // if (elempack == 1)
                    {
                        val0 = r0[0];
                        val1 = r0[N];
                        r0 += dilation_w;
                    }

                    float32x4_t _w0 = vld1q_f32(kptr);
                    float32x4_t _w1 = vld1q_f32(kptr + 4);
                    float32x4_t _w2 = vld1q_f32(kptr + 8);
                    float32x4_t _w3 = vld1q_f32(kptr + 12);
                    _sum0 = vfmaq_n_f32(_sum0, _w0, val0);
                    _sum1 = vfmaq_n_f32(_sum1, _w1, val0);
                    _sum2 = vfmaq_n_f32(_sum2, _w2, val1);
                    _sum3 = vfmaq_n_f32(_sum3, _w3, val1);

                    kptr += 16;
                }
            }
            for (; q < inh; q++)
            {
                const float* r0 = bottom_blob.row(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _val;
                    // if (elempack == 1)
                    {
                        _val = vdupq_n_f32(r0[0]);
                        r0 += dilation_w;
                    }

                    float32x4_t _w0 = vld1q_f32(kptr);
                    float32x4_t _w1 = vld1q_f32(kptr + 4);
                    _sum0 = vfmaq_f32(_sum0, _w0, _val);
                    _sum1 = vfmaq_f32(_sum1, _w1, _val);

                    kptr += 8;
                }
            }

            _sum0 = vaddq_f32(_sum0, _sum2);
            _sum1 = vaddq_f32(_sum1, _sum3);
            _sum4 = vaddq_f32(_sum4, _sum6);
            _sum5 = vaddq_f32(_sum5, _sum7);
            _sum0 = vaddq_f32(_sum0, _sum4);
            _sum1 = vaddq_f32(_sum1, _sum5);

            _sum0 = activation_ps(_sum0, activation_type, activation_params);
            _sum1 = activation_ps(_sum1, activation_type, activation_params);

            if (out_elempack == 4)
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + M, _sum1);
                outptr += 4;
            }
            else // if (out_elempack == 1)
            {
                outptr[0] = vgetq_lane_f32(_sum0, 0);
                outptr[M] = vgetq_lane_f32(_sum0, 1);
                outptr[M * 2] = vgetq_lane_f32(_sum0, 2);
                outptr[M * 3] = vgetq_lane_f32(_sum0, 3);
                outptr[M * 4] = vgetq_lane_f32(_sum1, 0);
                outptr[M * 5] = vgetq_lane_f32(_sum1, 1);
                outptr[M * 6] = vgetq_lane_f32(_sum1, 2);
                outptr[M * 7] = vgetq_lane_f32(_sum1, 3);
                outptr += 1;
            }
        }
    }
    remain_outh_start += nn_outh * 8;
    nn_outh = (outh - remain_outh_start) / 4;
#else // __aarch64__
    nn_outh = (outh - remain_outh_start) / 4;
    #pragma omp parallel for num_threads(opt.num_threads)
#endif // __aarch64__
    for (int pp = 0; pp < nn_outh; pp++)
    {
        const int p = remain_outh_start + pp * 4;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inh = bottom_blob.h * elempack;
        const int outw = top_blob.w;
        const int out_elempack = top_blob.elempack;

        float* outptr = top_blob.row(p / out_elempack);

        for (int j = 0; j < outw; j++)
        {
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);

            if (bias_data_ptr)
            {
                _sum0 = vld1q_f32(bias_data_ptr + p);
            }

#if __aarch64__
            const float* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4);
#else
            const float* kptr = weight_data_tm.channel(p / 4);
#endif

            int q = 0;
#if __aarch64__
            for (; q + 7 < inh; q += 8)
            {
                const float* r0 = bottom_blob.row(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    float32x4_t _r1;
                    if (elempack == 4)
                    {
                        _r0 = vld1q_f32(r0);
                        _r1 = vld1q_f32(r0 + N);
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        _r0 = float32x4_t();
                        _r1 = float32x4_t();
                        _r0 = vsetq_lane_f32(r0[0], _r0, 0);
                        _r0 = vsetq_lane_f32(r0[N], _r0, 1);
                        _r0 = vsetq_lane_f32(r0[N * 2], _r0, 2);
                        _r0 = vsetq_lane_f32(r0[N * 3], _r0, 3);
                        _r1 = vsetq_lane_f32(r0[N * 4], _r1, 0);
                        _r1 = vsetq_lane_f32(r0[N * 5], _r1, 1);
                        _r1 = vsetq_lane_f32(r0[N * 6], _r1, 2);
                        _r1 = vsetq_lane_f32(r0[N * 7], _r1, 3);
                        r0 += dilation_w;
                    }

                    float32x4_t _w0 = vld1q_f32(kptr);
                    float32x4_t _w1 = vld1q_f32(kptr + 4);
                    float32x4_t _w2 = vld1q_f32(kptr + 8);
                    float32x4_t _w3 = vld1q_f32(kptr + 12);
                    float32x4_t _w4 = vld1q_f32(kptr + 16);
                    float32x4_t _w5 = vld1q_f32(kptr + 20);
                    float32x4_t _w6 = vld1q_f32(kptr + 24);
                    float32x4_t _w7 = vld1q_f32(kptr + 28);
                    _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 2);
                    _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 3);
                    _sum0 = vfmaq_laneq_f32(_sum0, _w4, _r1, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w5, _r1, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _w6, _r1, 2);
                    _sum3 = vfmaq_laneq_f32(_sum3, _w7, _r1, 3);

                    kptr += 32;
                }
            }
#endif // __aarch64__
            for (; q + 3 < inh; q += 4)
            {
                const float* r0 = bottom_blob.row(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    if (elempack == 4)
                    {
                        _r0 = vld1q_f32(r0);
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        _r0 = float32x4_t();
                        _r0 = vsetq_lane_f32(r0[0], _r0, 0);
                        _r0 = vsetq_lane_f32(r0[N], _r0, 1);
                        _r0 = vsetq_lane_f32(r0[N * 2], _r0, 2);
                        _r0 = vsetq_lane_f32(r0[N * 3], _r0, 3);
                        r0 += dilation_w;
                    }

                    float32x4_t _w0 = vld1q_f32(kptr);
                    float32x4_t _w1 = vld1q_f32(kptr + 4);
                    float32x4_t _w2 = vld1q_f32(kptr + 8);
                    float32x4_t _w3 = vld1q_f32(kptr + 12);
#if __aarch64__
                    _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 2);
                    _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 3);
#else
                    _sum0 = vmlaq_lane_f32(_sum0, _w0, vget_low_f32(_r0), 0);
                    _sum1 = vmlaq_lane_f32(_sum1, _w1, vget_low_f32(_r0), 1);
                    _sum2 = vmlaq_lane_f32(_sum2, _w2, vget_high_f32(_r0), 0);
                    _sum3 = vmlaq_lane_f32(_sum3, _w3, vget_high_f32(_r0), 1);
#endif

                    kptr += 16;
                }
            }
            for (; q + 1 < inh; q += 2)
            {
                const float* r0 = bottom_blob.row(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val0;
                    float val1;
                    // if (elempack == 1)
                    {
                        val0 = r0[0];
                        val1 = r0[N];
                        r0 += dilation_w;
                    }

                    float32x4_t _w0 = vld1q_f32(kptr);
                    float32x4_t _w1 = vld1q_f32(kptr + 4);
#if __aarch64__
                    _sum0 = vfmaq_n_f32(_sum0, _w0, val0);
                    _sum1 = vfmaq_n_f32(_sum1, _w1, val1);
#else
                    _sum0 = vmlaq_n_f32(_sum0, _w0, val0);
                    _sum1 = vmlaq_n_f32(_sum1, _w1, val1);
#endif

                    kptr += 8;
                }
            }
            for (; q < inh; q++)
            {
                const float* r0 = bottom_blob.row(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _val;
                    // if (elempack == 1)
                    {
                        _val = vdupq_n_f32(r0[0]);
                        r0 += dilation_w;
                    }

                    float32x4_t _w = vld1q_f32(kptr);
#if __aarch64__
                    _sum0 = vfmaq_f32(_sum0, _val, _w);
#else
                    _sum0 = vmlaq_f32(_sum0, _val, _w);
#endif

                    kptr += 4;
                }
            }

            _sum0 = vaddq_f32(_sum0, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _sum0 = vaddq_f32(_sum0, _sum2);

            _sum0 = activation_ps(_sum0, activation_type, activation_params);

            if (out_elempack == 4)
            {
                vst1q_f32(outptr, _sum0);
                outptr += 4;
            }
            else // if (out_elempack == 1)
            {
                outptr[0] = vgetq_lane_f32(_sum0, 0);
                outptr[M] = vgetq_lane_f32(_sum0, 1);
                outptr[M * 2] = vgetq_lane_f32(_sum0, 2);
                outptr[M * 3] = vgetq_lane_f32(_sum0, 3);
                outptr += 1;
            }
        }
    }
    remain_outh_start += nn_outh * 4;
    nn_outh = (outh - remain_outh_start) / 2;
#else // __ARM_NEON
    nn_outh = (outh - remain_outh_start) / 2;
    #pragma omp parallel for num_threads(opt.num_threads)
#endif // __ARM_NEON
    for (int pp = 0; pp < nn_outh; pp++)
    {
        const int p = remain_outh_start + pp * 2;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inh = bottom_blob.h * elempack;
        const int outw = top_blob.w;

        float* outptr0 = top_blob.row(p);
        float* outptr1 = top_blob.row(p + 1);

        for (int j = 0; j < outw; j++)
        {
            float sum0 = 0.f;
            float sum1 = 0.f;

            if (bias_data_ptr)
            {
                sum0 = bias_data_ptr[p];
                sum1 = bias_data_ptr[p + 1];
            }

#if __aarch64__
            const float* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2);
#elif __ARM_NEON
            const float* kptr = weight_data_tm.channel(p / 4 + (p % 4) / 2);
#else
            const float* kptr = weight_data_tm.channel(p / 2);
#endif

            int q = 0;
#if __ARM_NEON
#if __aarch64__
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);
            for (; q + 7 < inh; q += 8)
            {
                const float* r0 = bottom_blob.row(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    float32x4_t _r1;
                    if (elempack == 4)
                    {
                        _r0 = vld1q_f32(r0);
                        _r1 = vld1q_f32(r0 + N);
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        _r0 = float32x4_t();
                        _r1 = float32x4_t();
                        _r0 = vsetq_lane_f32(r0[0], _r0, 0);
                        _r0 = vsetq_lane_f32(r0[N], _r0, 1);
                        _r0 = vsetq_lane_f32(r0[N * 2], _r0, 2);
                        _r0 = vsetq_lane_f32(r0[N * 3], _r0, 3);
                        _r1 = vsetq_lane_f32(r0[N * 4], _r1, 0);
                        _r1 = vsetq_lane_f32(r0[N * 5], _r1, 1);
                        _r1 = vsetq_lane_f32(r0[N * 6], _r1, 2);
                        _r1 = vsetq_lane_f32(r0[N * 7], _r1, 3);
                        r0 += dilation_w;
                    }

                    float32x4_t _w0 = vld1q_f32(kptr);
                    float32x4_t _w1 = vld1q_f32(kptr + 4);
                    float32x4_t _w2 = vld1q_f32(kptr + 8);
                    float32x4_t _w3 = vld1q_f32(kptr + 12);
                    _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                    _sum1 = vfmaq_f32(_sum1, _r1, _w1);
                    _sum2 = vfmaq_f32(_sum2, _r0, _w2);
                    _sum3 = vfmaq_f32(_sum3, _r1, _w3);

                    kptr += 16;
                }
            }
            _sum0 = vaddq_f32(_sum0, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            sum0 += vaddvq_f32(_sum0);
            sum1 += vaddvq_f32(_sum2);
            _sum0 = vdupq_n_f32(0.f);
            _sum1 = vdupq_n_f32(0.f);
#else  // __aarch64__
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
#endif // __aarch64__
            for (; q + 3 < inh; q += 4)
            {
                const float* r0 = bottom_blob.row(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    if (elempack == 4)
                    {
                        _r0 = vld1q_f32(r0);
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        _r0 = float32x4_t();
                        _r0 = vsetq_lane_f32(r0[0], _r0, 0);
                        _r0 = vsetq_lane_f32(r0[N], _r0, 1);
                        _r0 = vsetq_lane_f32(r0[N * 2], _r0, 2);
                        _r0 = vsetq_lane_f32(r0[N * 3], _r0, 3);
                        r0 += dilation_w;
                    }

                    float32x4_t _w0 = vld1q_f32(kptr);
                    float32x4_t _w1 = vld1q_f32(kptr + 4);
#if __aarch64__
                    _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                    _sum1 = vfmaq_f32(_sum1, _r0, _w1);
#else
                    _sum0 = vmlaq_f32(_sum0, _r0, _w0);
                    _sum1 = vmlaq_f32(_sum1, _r0, _w1);
#endif

                    kptr += 8;
                }
            }
#if __aarch64__
            sum0 += vaddvq_f32(_sum0);
            sum1 += vaddvq_f32(_sum1);
#else
            float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
            float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
            float32x2_t _ss = vpadd_f32(_ss0, _ss1);
            sum0 += vget_lane_f32(_ss, 0);
            sum1 += vget_lane_f32(_ss, 1);
#endif
#endif // __ARM_NEON
            for (; q + 1 < inh; q += 2)
            {
                const float* r0 = bottom_blob.row(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val0;
                    float val1;
                    // if (elempack == 1)
                    {
                        val0 = r0[0];
                        val1 = r0[N];
                        r0 += dilation_w;
                    }

                    sum0 += val0 * kptr[0];
                    sum1 += val0 * kptr[1];
                    sum0 += val1 * kptr[2];
                    sum1 += val1 * kptr[3];

                    kptr += 4;
                }
            }
            for (; q < inh; q++)
            {
                const float* r0 = bottom_blob.row(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val;
                    // if (elempack == 1)
                    {
                        val = r0[0];
                        r0 += dilation_w;
                    }

                    sum0 += val * kptr[0];
                    sum1 += val * kptr[1];

                    kptr += 2;
                }
            }

            sum0 = activation_ss(sum0, activation_type, activation_params);
            sum1 = activation_ss(sum1, activation_type, activation_params);

            outptr0[0] = sum0;
            outptr1[0] = sum1;
            outptr0 += 1;
            outptr1 += 1;
        }
    }
    remain_outh_start += nn_outh * 2;
    for (int p = remain_outh_start; p < outh; p++)
    {
        float* outptr = top_blob.row(p);

        for (int j = 0; j < outw; j++)
        {
            float sum = 0.f;

            if (bias_data_ptr)
            {
                sum = bias_data_ptr[p];
            }

#if __aarch64__
            const float* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2);
#elif __ARM_NEON
            const float* kptr = weight_data_tm.channel(p / 4 + (p % 4) / 2 + p % 2);
#else
            const float* kptr = weight_data_tm.channel(p / 2 + p % 2);
#endif

            int q = 0;
#if __ARM_NEON
#if __aarch64__
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            for (; q + 7 < inh; q += 8)
            {
                const float* r0 = bottom_blob.row(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    float32x4_t _r1;
                    if (elempack == 4)
                    {
                        _r0 = vld1q_f32(r0);
                        _r1 = vld1q_f32(r0 + N);
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        _r0 = float32x4_t();
                        _r1 = float32x4_t();
                        _r0 = vsetq_lane_f32(r0[0], _r0, 0);
                        _r0 = vsetq_lane_f32(r0[N], _r0, 1);
                        _r0 = vsetq_lane_f32(r0[N * 2], _r0, 2);
                        _r0 = vsetq_lane_f32(r0[N * 3], _r0, 3);
                        _r1 = vsetq_lane_f32(r0[N * 4], _r1, 0);
                        _r1 = vsetq_lane_f32(r0[N * 5], _r1, 1);
                        _r1 = vsetq_lane_f32(r0[N * 6], _r1, 2);
                        _r1 = vsetq_lane_f32(r0[N * 7], _r1, 3);
                        r0 += dilation_w;
                    }

                    float32x4_t _w0 = vld1q_f32(kptr);
                    float32x4_t _w1 = vld1q_f32(kptr + 4);
                    _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                    _sum1 = vfmaq_f32(_sum1, _r1, _w1);

                    kptr += 8;
                }
            }
            _sum0 = vaddq_f32(_sum0, _sum1);
            sum += vaddvq_f32(_sum0);
#endif // __aarch64__
            float32x4_t _sum = vdupq_n_f32(0.f);
            for (; q + 3 < inh; q += 4)
            {
                const float* r0 = bottom_blob.row(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    if (elempack == 4)
                    {
                        _r0 = vld1q_f32(r0);
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        _r0 = float32x4_t();
                        _r0 = vsetq_lane_f32(r0[0], _r0, 0);
                        _r0 = vsetq_lane_f32(r0[N], _r0, 1);
                        _r0 = vsetq_lane_f32(r0[N * 2], _r0, 2);
                        _r0 = vsetq_lane_f32(r0[N * 3], _r0, 3);
                        r0 += dilation_w;
                    }

                    float32x4_t _w = vld1q_f32(kptr);
#if __aarch64__
                    _sum = vfmaq_f32(_sum, _r0, _w);
#else
                    _sum = vmlaq_f32(_sum, _r0, _w);
#endif

                    kptr += 4;
                }
            }
#if __aarch64__
            sum += vaddvq_f32(_sum);
#else
            float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
            _ss = vpadd_f32(_ss, _ss);
            sum += vget_lane_f32(_ss, 0);
#endif
#endif // __ARM_NEON
            for (; q + 1 < inh; q += 2)
            {
                const float* r0 = bottom_blob.row(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val0;
                    float val1;
                    // if (elempack == 1)
                    {
                        val0 = r0[0];
                        val1 = r0[N];
                        r0 += dilation_w;
                    }

                    sum += val0 * kptr[0];
                    sum += val1 * kptr[1];

                    kptr += 2;
                }
            }
            for (; q < inh; q++)
            {
                const float* r0 = bottom_blob.row(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val;
                    // if (elempack == 1)
                    {
                        val = r0[0];
                        r0 += dilation_w;
                    }

                    sum += val * kptr[0];

                    kptr += 1;
                }
            }

            sum = activation_ss(sum, activation_type, activation_params);

            outptr[0] = sum;
            outptr += 1;
        }
    }
}


================================================
FILE: src/layer/arm/convolution1d_packed_bf16s.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convolution1d_transform_kernel_packed_bf16s(const Mat& kernel, Mat& kernel_tm, int inh, int outh, int kernel_w)
{
    // src = kw-inh-outh
    // dst = pb-pa-kw-inh/pa-outh/pb

    // clang-format off
    // *INDENT-OFF*
#if __ARM_NEON
#if __aarch64__
    if (outh >= 8)
    {
        if (inh >= 8)
            kernel_tm.create(8 * 8 * kernel_w, inh / 8 + (inh % 8) / 4 + (inh % 4) / 2 + inh % 2, outh / 8 + (outh % 8) / 4 + (outh % 4) / 2 + outh % 2, (size_t)2u);
        else if (inh >= 4)
            kernel_tm.create(8 * 4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh / 8 + (outh % 8) / 4 + (outh % 4) / 2 + outh % 2, (size_t)2u);
        else if (inh >= 2)
            kernel_tm.create(8 * 2 * kernel_w, inh / 2 + inh % 2, outh / 8 + (outh % 8) / 4 + (outh % 4) / 2 + outh % 2, (size_t)2u);
        else
            kernel_tm.create(8 * kernel_w, inh, outh / 8 + (outh % 8) / 4 + (outh % 4) / 2 + outh % 2, (size_t)2u);
    }
    else
#endif // __aarch64__
    if (outh >= 4)
    {
#if __aarch64__
        if (inh >= 8)
            kernel_tm.create(4 * 8 * kernel_w, inh / 8 + (inh % 8) / 4 + (inh % 4) / 2 + inh % 2, outh / 4 + (outh % 4) / 2 + outh % 2, (size_t)2u);
        else
#endif // __aarch64__
        if (inh >= 4)
            kernel_tm.create(4 * 4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh / 4 + (outh % 4) / 2 + outh % 2, (size_t)2u);
        else if (inh >= 2)
            kernel_tm.create(4 * 2 * kernel_w, inh / 2 + inh % 2, outh / 4 + (outh % 4) / 2 + outh % 2, (size_t)2u);
        else
            kernel_tm.create(4 * kernel_w, inh, outh / 4 + (outh % 4) / 2 + outh % 2, (size_t)2u);
    }
    else
#endif // __ARM_NEON
    if (outh >= 2)
    {
#if __ARM_NEON
#if __aarch64__
        if (inh >= 8)
            kernel_tm.create(2 * 8 * kernel_w, inh / 8 + (inh % 8) / 4 + (inh % 4) / 2 + inh % 2, outh / 2 + outh % 2, (size_t)2u);
        else
#endif // __aarch64__
        if (inh >= 4)
            kernel_tm.create(2 * 4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh / 2 + outh % 2, (size_t)2u);
        else
#endif // __ARM_NEON
        if (inh >= 2)
            kernel_tm.create(2 * 2 * kernel_w, inh / 2 + inh % 2, outh / 2 + outh % 2, (size_t)2u);
        else
            kernel_tm.create(2 * kernel_w, inh, outh / 2 + outh % 2, (size_t)2u);
    }
    else
    {
#if __ARM_NEON
#if __aarch64__
        if (inh >= 8)
            kernel_tm.create(8 * kernel_w, inh / 8 + (inh % 8) / 4 + (inh % 4) / 2 + inh % 2, outh, (size_t)2u);
        else
#endif // __aarch64__
        if (inh >= 4)
            kernel_tm.create(4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh, (size_t)2u);
        else
#endif // __ARM_NEON
        if (inh >= 2)
            kernel_tm.create(2 * kernel_w, inh / 2 + inh % 2, outh, (size_t)2u);
        else
            kernel_tm.create(kernel_w, inh, outh, (size_t)2u);
    }
    // *INDENT-ON*
    // clang-format on

    int q = 0;
#if __ARM_NEON
#if __aarch64__
    for (; q + 7 < outh; q += 8)
    {
        const float* kptr0 = (const float*)kernel + q * inh * kernel_w;
        const float* kptr1 = (const float*)kernel + (q + 1) * inh * kernel_w;
        const float* kptr2 = (const float*)kernel + (q + 2) * inh * kernel_w;
        const float* kptr3 = (const float*)kernel + (q + 3) * inh * kernel_w;
        const float* kptr4 = (const float*)kernel + (q + 4) * inh * kernel_w;
        const float* kptr5 = (const float*)kernel + (q + 5) * inh * kernel_w;
        const float* kptr6 = (const float*)kernel + (q + 6) * inh * kernel_w;
        const float* kptr7 = (const float*)kernel + (q + 7) * inh * kernel_w;

        unsigned short* g00 = kernel_tm.channel(q / 8);

        int p = 0;
        for (; p + 7 < inh; p += 8)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;
                const float* k4 = kptr4 + p * kernel_w;
                const float* k5 = kptr5 + p * kernel_w;
                const float* k6 = kptr6 + p * kernel_w;
                const float* k7 = kptr7 + p * kernel_w;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    g00[1] = float32_to_bfloat16(k1[k]);
                    g00[2] = float32_to_bfloat16(k2[k]);
                    g00[3] = float32_to_bfloat16(k3[k]);
                    g00[4] = float32_to_bfloat16(k4[k]);
                    g00[5] = float32_to_bfloat16(k5[k]);
                    g00[6] = float32_to_bfloat16(k6[k]);
                    g00[7] = float32_to_bfloat16(k7[k]);
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    k4 += kernel_w;
                    k5 += kernel_w;
                    k6 += kernel_w;
                    k7 += kernel_w;
                    g00 += 8;
                }
            }
        }
        for (; p + 3 < inh; p += 4)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;
                const float* k4 = kptr4 + p * kernel_w;
                const float* k5 = kptr5 + p * kernel_w;
                const float* k6 = kptr6 + p * kernel_w;
                const float* k7 = kptr7 + p * kernel_w;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    g00[1] = float32_to_bfloat16(k1[k]);
                    g00[2] = float32_to_bfloat16(k2[k]);
                    g00[3] = float32_to_bfloat16(k3[k]);
                    g00[4] = float32_to_bfloat16(k4[k]);
                    g00[5] = float32_to_bfloat16(k5[k]);
                    g00[6] = float32_to_bfloat16(k6[k]);
                    g00[7] = float32_to_bfloat16(k7[k]);
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    k4 += kernel_w;
                    k5 += kernel_w;
                    k6 += kernel_w;
                    k7 += kernel_w;
                    g00 += 8;
                }
            }
        }
        for (; p + 1 < inh; p += 2)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;
                const float* k4 = kptr4 + p * kernel_w;
                const float* k5 = kptr5 + p * kernel_w;
                const float* k6 = kptr6 + p * kernel_w;
                const float* k7 = kptr7 + p * kernel_w;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    g00[1] = float32_to_bfloat16(k1[k]);
                    g00[2] = float32_to_bfloat16(k2[k]);
                    g00[3] = float32_to_bfloat16(k3[k]);
                    g00[4] = float32_to_bfloat16(k4[k]);
                    g00[5] = float32_to_bfloat16(k5[k]);
                    g00[6] = float32_to_bfloat16(k6[k]);
                    g00[7] = float32_to_bfloat16(k7[k]);
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    k4 += kernel_w;
                    k5 += kernel_w;
                    k6 += kernel_w;
                    k7 += kernel_w;
                    g00 += 8;
                }
            }
        }
        for (; p < inh; p++)
        {
            const float* k0 = kptr0 + p * kernel_w;
            const float* k1 = kptr1 + p * kernel_w;
            const float* k2 = kptr2 + p * kernel_w;
            const float* k3 = kptr3 + p * kernel_w;
            const float* k4 = kptr4 + p * kernel_w;
            const float* k5 = kptr5 + p * kernel_w;
            const float* k6 = kptr6 + p * kernel_w;
            const float* k7 = kptr7 + p * kernel_w;

            for (int k = 0; k < kernel_w; k++)
            {
                g00[0] = float32_to_bfloat16(k0[k]);
                g00[1] = float32_to_bfloat16(k1[k]);
                g00[2] = float32_to_bfloat16(k2[k]);
                g00[3] = float32_to_bfloat16(k3[k]);
                g00[4] = float32_to_bfloat16(k4[k]);
                g00[5] = float32_to_bfloat16(k5[k]);
                g00[6] = float32_to_bfloat16(k6[k]);
                g00[7] = float32_to_bfloat16(k7[k]);
                g00 += 8;
            }
        }
    }
#endif // __aarch64__
    for (; q + 3 < outh; q += 4)
    {
        const float* kptr0 = (const float*)kernel + q * inh * kernel_w;
        const float* kptr1 = (const float*)kernel + (q + 1) * inh * kernel_w;
        const float* kptr2 = (const float*)kernel + (q + 2) * inh * kernel_w;
        const float* kptr3 = (const float*)kernel + (q + 3) * inh * kernel_w;

#if __aarch64__
        unsigned short* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4);
#else
        unsigned short* g00 = kernel_tm.channel(q / 4);
#endif

        int p = 0;
#if __aarch64__
        for (; p + 7 < inh; p += 8)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    g00[1] = float32_to_bfloat16(k1[k]);
                    g00[2] = float32_to_bfloat16(k2[k]);
                    g00[3] = float32_to_bfloat16(k3[k]);
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    g00 += 4;
                }
            }
        }
#endif // __aarch64__
        for (; p + 3 < inh; p += 4)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    g00[1] = float32_to_bfloat16(k1[k]);
                    g00[2] = float32_to_bfloat16(k2[k]);
                    g00[3] = float32_to_bfloat16(k3[k]);
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    g00 += 4;
                }
            }
        }
        for (; p + 1 < inh; p += 2)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    g00[1] = float32_to_bfloat16(k1[k]);
                    g00[2] = float32_to_bfloat16(k2[k]);
                    g00[3] = float32_to_bfloat16(k3[k]);
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    g00 += 4;
                }
            }
        }
        for (; p < inh; p++)
        {
            const float* k0 = kptr0 + p * kernel_w;
            const float* k1 = kptr1 + p * kernel_w;
            const float* k2 = kptr2 + p * kernel_w;
            const float* k3 = kptr3 + p * kernel_w;

            for (int k = 0; k < kernel_w; k++)
            {
                g00[0] = float32_to_bfloat16(k0[k]);
                g00[1] = float32_to_bfloat16(k1[k]);
                g00[2] = float32_to_bfloat16(k2[k]);
                g00[3] = float32_to_bfloat16(k3[k]);
                g00 += 4;
            }
        }
    }
#endif // __ARM_NEON
    for (; q + 1 < outh; q += 2)
    {
        const float* kptr0 = (const float*)kernel + q * inh * kernel_w;
        const float* kptr1 = (const float*)kernel + (q + 1) * inh * kernel_w;

#if __aarch64__
        unsigned short* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2);
#elif __ARM_NEON
        unsigned short* g00 = kernel_tm.channel(q / 4 + (q % 4) / 2);
#else
        unsigned short* g00 = kernel_tm.channel(q / 2);
#endif

        int p = 0;
#if __ARM_NEON
#if __aarch64__
        for (; p + 7 < inh; p += 8)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w + k;
                const float* k1 = kptr1 + p * kernel_w + k;

                g00[0] = float32_to_bfloat16(k0[0]);
                g00[1] = float32_to_bfloat16(k0[kernel_w]);
                g00[2] = float32_to_bfloat16(k0[kernel_w * 2]);
                g00[3] = float32_to_bfloat16(k0[kernel_w * 3]);
                g00[4] = float32_to_bfloat16(k0[kernel_w * 4]);
                g00[5] = float32_to_bfloat16(k0[kernel_w * 5]);
                g00[6] = float32_to_bfloat16(k0[kernel_w * 6]);
                g00[7] = float32_to_bfloat16(k0[kernel_w * 7]);
                g00[8] = float32_to_bfloat16(k1[0]);
                g00[9] = float32_to_bfloat16(k1[kernel_w]);
                g00[10] = float32_to_bfloat16(k1[kernel_w * 2]);
                g00[11] = float32_to_bfloat16(k1[kernel_w * 3]);
                g00[12] = float32_to_bfloat16(k1[kernel_w * 4]);
                g00[13] = float32_to_bfloat16(k1[kernel_w * 5]);
                g00[14] = float32_to_bfloat16(k1[kernel_w * 6]);
                g00[15] = float32_to_bfloat16(k1[kernel_w * 7]);
                g00 += 16;
            }
        }
#endif // __aarch64__
        for (; p + 3 < inh; p += 4)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w + k;
                const float* k1 = kptr1 + p * kernel_w + k;

                g00[0] = float32_to_bfloat16(k0[0]);
                g00[1] = float32_to_bfloat16(k0[kernel_w]);
                g00[2] = float32_to_bfloat16(k0[kernel_w * 2]);
                g00[3] = float32_to_bfloat16(k0[kernel_w * 3]);
                g00[4] = float32_to_bfloat16(k1[0]);
                g00[5] = float32_to_bfloat16(k1[kernel_w]);
                g00[6] = float32_to_bfloat16(k1[kernel_w * 2]);
                g00[7] = float32_to_bfloat16(k1[kernel_w * 3]);
                g00 += 8;
            }
        }
#endif // __ARM_NEON
        for (; p + 1 < inh; p += 2)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    g00[1] = float32_to_bfloat16(k1[k]);
                    k0 += kernel_w;
                    k1 += kernel_w;
                    g00 += 2;
                }
            }
        }
        for (; p < inh; p++)
        {
            const float* k0 = kptr0 + p * kernel_w;
            const float* k1 = kptr1 + p * kernel_w;

            for (int k = 0; k < kernel_w; k++)
            {
                g00[0] = float32_to_bfloat16(k0[k]);
                g00[1] = float32_to_bfloat16(k1[k]);
                g00 += 2;
            }
        }
    }
    for (; q < outh; q++)
    {
        const float* kptr = (const float*)kernel + q * inh * kernel_w;

#if __aarch64__
        unsigned short* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2 + q % 2);
#elif __ARM_NEON
        unsigned short* g00 = kernel_tm.channel(q / 4 + (q % 4) / 2 + q % 2);
#else
        unsigned short* g00 = kernel_tm.channel(q / 2 + q % 2);
#endif

        int p = 0;
#if __ARM_NEON
#if __aarch64__
        for (; p + 7 < inh; p += 8)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr + p * kernel_w;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    k0 += kernel_w;
                    g00 += 1;
                }
            }
        }
#endif // __aarch64__
        for (; p + 3 < inh; p += 4)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr + p * kernel_w;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    k0 += kernel_w;
                    g00 += 1;
                }
            }
        }
#endif // __ARM_NEON
        for (; p + 1 < inh; p += 2)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr + p * kernel_w;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    k0 += kernel_w;
                    g00 += 1;
                }
            }
        }
        for (; p < inh; p++)
        {
            const float* k0 = kptr + p * kernel_w;

            for (int k = 0; k < kernel_w; k++)
            {
                g00[0] = float32_to_bfloat16(k0[k]);
                g00++;
            }
        }
    }
}

static void convolution1d_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int kernel_w, int dilation_w, int stride_w, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int elempack = bottom_blob.elempack;
    const int inh = bottom_blob.h * elempack;

    const int N = bottom_blob.w * elempack;

    const int outw = top_blob.w;
    const int out_elempack = top_blob.elempack;
    const int outh = top_blob.h * out_elempack;

    const int M = top_blob.w * out_elempack;

    const float* bias_data_ptr = bias_data;

    int nn_outh = 0;
    int remain_outh_start = 0;
#if __ARM_NEON
#if __aarch64__
    nn_outh = (outh - remain_outh_start) / 8;
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outh; pp++)
    {
        const int p = remain_outh_start + pp * 8;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inh = bottom_blob.h * elempack;
        const int outw = top_blob.w;
        const int out_elempack = top_blob.elempack;

        unsigned short* outptr = top_blob.row<unsigned short>(p / out_elempack);

        for (int j = 0; j < outw; j++)
        {
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);
            float32x4_t _sum4 = vdupq_n_f32(0.f);
            float32x4_t _sum5 = vdupq_n_f32(0.f);
            float32x4_t _sum6 = vdupq_n_f32(0.f);
            float32x4_t _sum7 = vdupq_n_f32(0.f);

            if (bias_data_ptr)
            {
                _sum0 = vld1q_f32(bias_data_ptr + p);
                _sum1 = vld1q_f32(bias_data_ptr + p + 4);
            }

            const unsigned short* kptr = weight_data_tm.channel(p / 8);

            int q = 0;
            for (; q + 7 < inh; q += 8)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    float32x4_t _r1;
                    if (elempack == 4)
                    {
                        _r0 = bfloat2float(vld1_u16(r0));
                        _r1 = bfloat2float(vld1_u16(r0 + N));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        uint16x8_t _r_u16 = uint16x8_t();
                        _r_u16 = vsetq_lane_u16(r0[0], _r_u16, 0);
                        _r_u16 = vsetq_lane_u16(r0[N], _r_u16, 1);
                        _r_u16 = vsetq_lane_u16(r0[N * 2], _r_u16, 2);
                        _r_u16 = vsetq_lane_u16(r0[N * 3], _r_u16, 3);
                        _r_u16 = vsetq_lane_u16(r0[N * 4], _r_u16, 4);
                        _r_u16 = vsetq_lane_u16(r0[N * 5], _r_u16, 5);
                        _r_u16 = vsetq_lane_u16(r0[N * 6], _r_u16, 6);
                        _r_u16 = vsetq_lane_u16(r0[N * 7], _r_u16, 7);
                        _r0 = bfloat2float(vget_low_u16(_r_u16));
                        _r1 = bfloat2float(vget_high_u16(_r_u16));
                        r0 += dilation_w;
                    }

                    uint16x8_t _w01 = vld1q_u16(kptr);
                    uint16x8_t _w23 = vld1q_u16(kptr + 8);
                    uint16x8_t _w45 = vld1q_u16(kptr + 16);
                    uint16x8_t _w67 = vld1q_u16(kptr + 24);
                    uint16x8_t _w89 = vld1q_u16(kptr + 32);
                    uint16x8_t _wab = vld1q_u16(kptr + 40);
                    uint16x8_t _wcd = vld1q_u16(kptr + 48);
                    uint16x8_t _wef = vld1q_u16(kptr + 56);
                    float32x4_t _w0 = bfloat2float(vget_low_u16(_w01));
                    float32x4_t _w1 = bfloat2float(vget_high_u16(_w01));
                    float32x4_t _w2 = bfloat2float(vget_low_u16(_w23));
                    float32x4_t _w3 = bfloat2float(vget_high_u16(_w23));
                    float32x4_t _w4 = bfloat2float(vget_low_u16(_w45));
                    float32x4_t _w5 = bfloat2float(vget_high_u16(_w45));
                    float32x4_t _w6 = bfloat2float(vget_low_u16(_w67));
                    float32x4_t _w7 = bfloat2float(vget_high_u16(_w67));
                    float32x4_t _w8 = bfloat2float(vget_low_u16(_w89));
                    float32x4_t _w9 = bfloat2float(vget_high_u16(_w89));
                    float32x4_t _wa = bfloat2float(vget_low_u16(_wab));
                    float32x4_t _wb = bfloat2float(vget_high_u16(_wab));
                    float32x4_t _wc = bfloat2float(vget_low_u16(_wcd));
                    float32x4_t _wd = bfloat2float(vget_high_u16(_wcd));
                    float32x4_t _we = bfloat2float(vget_low_u16(_wef));
                    float32x4_t _wf = bfloat2float(vget_high_u16(_wef));
                    _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 0);
                    _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 1);
                    _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 1);
                    _sum4 = vfmaq_laneq_f32(_sum4, _w4, _r0, 2);
                    _sum5 = vfmaq_laneq_f32(_sum5, _w5, _r0, 2);
                    _sum6 = vfmaq_laneq_f32(_sum6, _w6, _r0, 3);
                    _sum7 = vfmaq_laneq_f32(_sum7, _w7, _r0, 3);
                    _sum0 = vfmaq_laneq_f32(_sum0, _w8, _r1, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w9, _r1, 0);
                    _sum2 = vfmaq_laneq_f32(_sum2, _wa, _r1, 1);
                    _sum3 = vfmaq_laneq_f32(_sum3, _wb, _r1, 1);
                    _sum4 = vfmaq_laneq_f32(_sum4, _wc, _r1, 2);
                    _sum5 = vfmaq_laneq_f32(_sum5, _wd, _r1, 2);
                    _sum6 = vfmaq_laneq_f32(_sum6, _we, _r1, 3);
                    _sum7 = vfmaq_laneq_f32(_sum7, _wf, _r1, 3);

                    kptr += 64;
                }
            }
            for (; q + 3 < inh; q += 4)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    if (elempack == 4)
                    {
                        _r0 = bfloat2float(vld1_u16(r0));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        uint16x4_t _r_u16 = uint16x4_t();
                        _r_u16 = vset_lane_u16(r0[0], _r_u16, 0);
                        _r_u16 = vset_lane_u16(r0[N], _r_u16, 1);
                        _r_u16 = vset_lane_u16(r0[N * 2], _r_u16, 2);
                        _r_u16 = vset_lane_u16(r0[N * 3], _r_u16, 3);
                        _r0 = bfloat2float(_r_u16);
                        r0 += dilation_w;
                    }

                    uint16x8_t _w01 = vld1q_u16(kptr);
                    uint16x8_t _w23 = vld1q_u16(kptr + 8);
                    uint16x8_t _w45 = vld1q_u16(kptr + 16);
                    uint16x8_t _w67 = vld1q_u16(kptr + 24);
                    float32x4_t _w0 = bfloat2float(vget_low_u16(_w01));
                    float32x4_t _w1 = bfloat2float(vget_high_u16(_w01));
                    float32x4_t _w2 = bfloat2float(vget_low_u16(_w23));
                    float32x4_t _w3 = bfloat2float(vget_high_u16(_w23));
                    float32x4_t _w4 = bfloat2float(vget_low_u16(_w45));
                    float32x4_t _w5 = bfloat2float(vget_high_u16(_w45));
                    float32x4_t _w6 = bfloat2float(vget_low_u16(_w67));
                    float32x4_t _w7 = bfloat2float(vget_high_u16(_w67));
                    _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 0);
                    _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 1);
                    _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 1);
                    _sum4 = vfmaq_laneq_f32(_sum4, _w4, _r0, 2);
                    _sum5 = vfmaq_laneq_f32(_sum5, _w5, _r0, 2);
                    _sum6 = vfmaq_laneq_f32(_sum6, _w6, _r0, 3);
                    _sum7 = vfmaq_laneq_f32(_sum7, _w7, _r0, 3);

                    kptr += 32;
                }
            }
            for (; q + 1 < inh; q += 2)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val0;
                    float val1;
                    // if (elempack == 1)
                    {
                        val0 = bfloat16_to_float32(r0[0]);
                        val1 = bfloat16_to_float32(r0[N]);
                        r0 += dilation_w;
                    }

                    uint16x8_t _w01 = vld1q_u16(kptr);
                    uint16x8_t _w23 = vld1q_u16(kptr + 8);
                    float32x4_t _w0 = bfloat2float(vget_low_u16(_w01));
                    float32x4_t _w1 = bfloat2float(vget_high_u16(_w01));
                    float32x4_t _w2 = bfloat2float(vget_low_u16(_w23));
                    float32x4_t _w3 = bfloat2float(vget_high_u16(_w23));
                    _sum0 = vfmaq_n_f32(_sum0, _w0, val0);
                    _sum1 = vfmaq_n_f32(_sum1, _w1, val0);
                    _sum2 = vfmaq_n_f32(_sum2, _w2, val1);
                    _sum3 = vfmaq_n_f32(_sum3, _w3, val1);

                    kptr += 16;
                }
            }
            for (; q < inh; q++)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _val;
                    // if (elempack == 1)
                    {
                        _val = bfloat2float(vdup_n_u16(r0[0]));
                        r0 += dilation_w;
                    }

                    uint16x8_t _w = vld1q_u16(kptr);
                    float32x4_t _w0 = bfloat2float(vget_low_u16(_w));
                    float32x4_t _w1 = bfloat2float(vget_high_u16(_w));
                    _sum0 = vfmaq_f32(_sum0, _w0, _val);
                    _sum1 = vfmaq_f32(_sum1, _w1, _val);

                    kptr += 8;
                }
            }

            _sum0 = vaddq_f32(_sum0, _sum2);
            _sum1 = vaddq_f32(_sum1, _sum3);
            _sum4 = vaddq_f32(_sum4, _sum6);
            _sum5 = vaddq_f32(_sum5, _sum7);
            _sum0 = vaddq_f32(_sum0, _sum4);
            _sum1 = vaddq_f32(_sum1, _sum5);

            _sum0 = activation_ps(_sum0, activation_type, activation_params);
            _sum1 = activation_ps(_sum1, activation_type, activation_params);

            if (out_elempack == 4)
            {
                vst1_u16(outptr, float2bfloat(_sum0));
                vst1_u16(outptr + M, float2bfloat(_sum1));
                outptr += 4;
            }
            else // if (out_elempack == 1)
            {
                uint16x4_t _sum0_u16 = float2bfloat(_sum0);
                uint16x4_t _sum1_u16 = float2bfloat(_sum1);
                outptr[0] = vget_lane_u16(_sum0_u16, 0);
                outptr[M] = vget_lane_u16(_sum0_u16, 1);
                outptr[M * 2] = vget_lane_u16(_sum0_u16, 2);
                outptr[M * 3] = vget_lane_u16(_sum0_u16, 3);
                outptr[M * 4] = vget_lane_u16(_sum1_u16, 0);
                outptr[M * 5] = vget_lane_u16(_sum1_u16, 1);
                outptr[M * 6] = vget_lane_u16(_sum1_u16, 2);
                outptr[M * 7] = vget_lane_u16(_sum1_u16, 3);
                outptr += 1;
            }
        }
    }
    remain_outh_start += nn_outh * 8;
    nn_outh = (outh - remain_outh_start) / 4;
#else // __aarch64__
    nn_outh = (outh - remain_outh_start) / 4;
    #pragma omp parallel for num_threads(opt.num_threads)
#endif // __aarch64__
    for (int pp = 0; pp < nn_outh; pp++)
    {
        const int p = remain_outh_start + pp * 4;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inh = bottom_blob.h * elempack;
        const int outw = top_blob.w;
        const int out_elempack = top_blob.elempack;

        unsigned short* outptr = top_blob.row<unsigned short>(p / out_elempack);

        for (int j = 0; j < outw; j++)
        {
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);

            if (bias_data_ptr)
            {
                _sum0 = vld1q_f32(bias_data_ptr + p);
            }

#if __aarch64__
            const unsigned short* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4);
#else
            const unsigned short* kptr = weight_data_tm.channel(p / 4);
#endif

            int q = 0;
#if __aarch64__
            for (; q + 7 < inh; q += 8)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    float32x4_t _r1;
                    if (elempack == 4)
                    {
                        _r0 = bfloat2float(vld1_u16(r0));
                        _r1 = bfloat2float(vld1_u16(r0 + N));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        uint16x8_t _r_u16 = uint16x8_t();
                        _r_u16 = vsetq_lane_u16(r0[0], _r_u16, 0);
                        _r_u16 = vsetq_lane_u16(r0[N], _r_u16, 1);
                        _r_u16 = vsetq_lane_u16(r0[N * 2], _r_u16, 2);
                        _r_u16 = vsetq_lane_u16(r0[N * 3], _r_u16, 3);
                        _r_u16 = vsetq_lane_u16(r0[N * 4], _r_u16, 4);
                        _r_u16 = vsetq_lane_u16(r0[N * 5], _r_u16, 5);
                        _r_u16 = vsetq_lane_u16(r0[N * 6], _r_u16, 6);
                        _r_u16 = vsetq_lane_u16(r0[N * 7], _r_u16, 7);
                        _r0 = bfloat2float(vget_low_u16(_r_u16));
                        _r1 = bfloat2float(vget_high_u16(_r_u16));
                        r0 += dilation_w;
                    }

                    uint16x8_t _w01 = vld1q_u16(kptr);
                    uint16x8_t _w23 = vld1q_u16(kptr + 8);
                    uint16x8_t _w45 = vld1q_u16(kptr + 16);
                    uint16x8_t _w67 = vld1q_u16(kptr + 24);
                    float32x4_t _w0 = bfloat2float(vget_low_u16(_w01));
                    float32x4_t _w1 = bfloat2float(vget_high_u16(_w01));
                    float32x4_t _w2 = bfloat2float(vget_low_u16(_w23));
                    float32x4_t _w3 = bfloat2float(vget_high_u16(_w23));
                    float32x4_t _w4 = bfloat2float(vget_low_u16(_w45));
                    float32x4_t _w5 = bfloat2float(vget_high_u16(_w45));
                    float32x4_t _w6 = bfloat2float(vget_low_u16(_w67));
                    float32x4_t _w7 = bfloat2float(vget_high_u16(_w67));
                    _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 2);
                    _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 3);
                    _sum0 = vfmaq_laneq_f32(_sum0, _w4, _r1, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w5, _r1, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _w6, _r1, 2);
                    _sum3 = vfmaq_laneq_f32(_sum3, _w7, _r1, 3);

                    kptr += 32;
                }
            }
#endif // __aarch64__
            for (; q + 3 < inh; q += 4)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    if (elempack == 4)
                    {
                        _r0 = bfloat2float(vld1_u16(r0));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        uint16x4_t _r_u16 = uint16x4_t();
                        _r_u16 = vset_lane_u16(r0[0], _r_u16, 0);
                        _r_u16 = vset_lane_u16(r0[N], _r_u16, 1);
                        _r_u16 = vset_lane_u16(r0[N * 2], _r_u16, 2);
                        _r_u16 = vset_lane_u16(r0[N * 3], _r_u16, 3);
                        _r0 = bfloat2float(_r_u16);
                        r0 += dilation_w;
                    }

                    uint16x8_t _w01 = vld1q_u16(kptr);
                    uint16x8_t _w23 = vld1q_u16(kptr + 8);
                    float32x4_t _w0 = bfloat2float(vget_low_u16(_w01));
                    float32x4_t _w1 = bfloat2float(vget_high_u16(_w01));
                    float32x4_t _w2 = bfloat2float(vget_low_u16(_w23));
                    float32x4_t _w3 = bfloat2float(vget_high_u16(_w23));
#if __aarch64__
                    _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 2);
                    _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 3);
#else
                    _sum0 = vmlaq_lane_f32(_sum0, _w0, vget_low_f32(_r0), 0);
                    _sum1 = vmlaq_lane_f32(_sum1, _w1, vget_low_f32(_r0), 1);
                    _sum2 = vmlaq_lane_f32(_sum2, _w2, vget_high_f32(_r0), 0);
                    _sum3 = vmlaq_lane_f32(_sum3, _w3, vget_high_f32(_r0), 1);
#endif

                    kptr += 16;
                }
            }
            for (; q + 1 < inh; q += 2)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val0;
                    float val1;
                    // if (elempack == 1)
                    {
                        val0 = bfloat16_to_float32(r0[0]);
                        val1 = bfloat16_to_float32(r0[N]);
                        r0 += dilation_w;
                    }

                    uint16x8_t _w = vld1q_u16(kptr);
                    float32x4_t _w0 = bfloat2float(vget_low_u16(_w));
                    float32x4_t _w1 = bfloat2float(vget_high_u16(_w));
#if __aarch64__
                    _sum0 = vfmaq_n_f32(_sum0, _w0, val0);
                    _sum1 = vfmaq_n_f32(_sum1, _w1, val1);
#else
                    _sum0 = vmlaq_n_f32(_sum0, _w0, val0);
                    _sum1 = vmlaq_n_f32(_sum1, _w1, val1);
#endif

                    kptr += 8;
                }
            }
            for (; q < inh; q++)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _val;
                    // if (elempack == 1)
                    {
                        _val = bfloat2float(vdup_n_u16(r0[0]));
                        r0 += dilation_w;
                    }

                    float32x4_t _w = bfloat2float(vld1_u16(kptr));
#if __aarch64__
                    _sum0 = vfmaq_f32(_sum0, _val, _w);
#else
                    _sum0 = vmlaq_f32(_sum0, _val, _w);
#endif

                    kptr += 4;
                }
            }

            _sum0 = vaddq_f32(_sum0, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _sum0 = vaddq_f32(_sum0, _sum2);

            _sum0 = activation_ps(_sum0, activation_type, activation_params);

            if (out_elempack == 4)
            {
                vst1_u16(outptr, float2bfloat(_sum0));
                outptr += 4;
            }
            else // if (out_elempack == 1)
            {
                uint16x4_t _sum0_u16 = float2bfloat(_sum0);
                outptr[0] = vget_lane_u16(_sum0_u16, 0);
                outptr[M] = vget_lane_u16(_sum0_u16, 1);
                outptr[M * 2] = vget_lane_u16(_sum0_u16, 2);
                outptr[M * 3] = vget_lane_u16(_sum0_u16, 3);
                outptr += 1;
            }
        }
    }
    remain_outh_start += nn_outh * 4;
    nn_outh = (outh - remain_outh_start) / 2;
#else // __ARM_NEON
    nn_outh = (outh - remain_outh_start) / 2;
    #pragma omp parallel for num_threads(opt.num_threads)
#endif // __ARM_NEON
    for (int pp = 0; pp < nn_outh; pp++)
    {
        const int p = remain_outh_start + pp * 2;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inh = bottom_blob.h * elempack;
        const int outw = top_blob.w;

        unsigned short* outptr0 = top_blob.row<unsigned short>(p);
        unsigned short* outptr1 = top_blob.row<unsigned short>(p + 1);

        for (int j = 0; j < outw; j++)
        {
            float sum0 = 0.f;
            float sum1 = 0.f;

            if (bias_data_ptr)
            {
                sum0 = bias_data_ptr[p];
                sum1 = bias_data_ptr[p + 1];
            }

#if __aarch64__
            const unsigned short* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2);
#elif __ARM_NEON
            const unsigned short* kptr = weight_data_tm.channel(p / 4 + (p % 4) / 2);
#else
            const unsigned short* kptr = weight_data_tm.channel(p / 2);
#endif

            int q = 0;
#if __ARM_NEON
#if __aarch64__
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);
            for (; q + 7 < inh; q += 8)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    float32x4_t _r1;
                    if (elempack == 4)
                    {
                        _r0 = bfloat2float(vld1_u16(r0));
                        _r1 = bfloat2float(vld1_u16(r0 + N));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        uint16x8_t _r01_u16 = uint16x8_t();
                        _r01_u16 = vsetq_lane_u16(r0[0], _r01_u16, 0);
                        _r01_u16 = vsetq_lane_u16(r0[N], _r01_u16, 1);
                        _r01_u16 = vsetq_lane_u16(r0[N * 2], _r01_u16, 2);
                        _r01_u16 = vsetq_lane_u16(r0[N * 3], _r01_u16, 3);
                        _r01_u16 = vsetq_lane_u16(r0[N * 4], _r01_u16, 4);
                        _r01_u16 = vsetq_lane_u16(r0[N * 5], _r01_u16, 5);
                        _r01_u16 = vsetq_lane_u16(r0[N * 6], _r01_u16, 6);
                        _r01_u16 = vsetq_lane_u16(r0[N * 7], _r01_u16, 7);
                        _r0 = bfloat2float(vget_low_u16(_r01_u16));
                        _r1 = bfloat2float(vget_high_u16(_r01_u16));
                        r0 += dilation_w;
                    }

                    uint16x8_t _w01 = vld1q_u16(kptr);
                    uint16x8_t _w23 = vld1q_u16(kptr + 8);
                    float32x4_t _w0 = bfloat2float(vget_low_u16(_w01));
                    float32x4_t _w1 = bfloat2float(vget_high_u16(_w01));
                    float32x4_t _w2 = bfloat2float(vget_low_u16(_w23));
                    float32x4_t _w3 = bfloat2float(vget_high_u16(_w23));
                    _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                    _sum1 = vfmaq_f32(_sum1, _r1, _w1);
                    _sum2 = vfmaq_f32(_sum2, _r0, _w2);
                    _sum3 = vfmaq_f32(_sum3, _r1, _w3);

                    kptr += 16;
                }
            }
            _sum0 = vaddq_f32(_sum0, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            sum0 += vaddvq_f32(_sum0);
            sum1 += vaddvq_f32(_sum2);
            _sum0 = vdupq_n_f32(0.f);
            _sum1 = vdupq_n_f32(0.f);
#else  // __aarch64__
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
#endif // __aarch64__
            for (; q + 3 < inh; q += 4)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    if (elempack == 4)
                    {
                        _r0 = bfloat2float(vld1_u16(r0));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        uint16x4_t _r0_u16 = uint16x4_t();
                        _r0_u16 = vset_lane_u16(r0[0], _r0_u16, 0);
                        _r0_u16 = vset_lane_u16(r0[N], _r0_u16, 1);
                        _r0_u16 = vset_lane_u16(r0[N * 2], _r0_u16, 2);
                        _r0_u16 = vset_lane_u16(r0[N * 3], _r0_u16, 3);
                        _r0 = bfloat2float(_r0_u16);
                        r0 += dilation_w;
                    }

                    uint16x8_t _w = vld1q_u16(kptr);
                    float32x4_t _w0 = bfloat2float(vget_low_u16(_w));
                    float32x4_t _w1 = bfloat2float(vget_high_u16(_w));
#if __aarch64__
                    _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                    _sum1 = vfmaq_f32(_sum1, _r0, _w1);
#else
                    _sum0 = vmlaq_f32(_sum0, _r0, _w0);
                    _sum1 = vmlaq_f32(_sum1, _r0, _w1);
#endif

                    kptr += 8;
                }
            }
#if __aarch64__
            sum0 += vaddvq_f32(_sum0);
            sum1 += vaddvq_f32(_sum1);
#else
            float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
            float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
            float32x2_t _ss = vpadd_f32(_ss0, _ss1);
            sum0 += vget_lane_f32(_ss, 0);
            sum1 += vget_lane_f32(_ss, 1);
#endif
#endif // __ARM_NEON
            for (; q + 1 < inh; q += 2)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val0;
                    float val1;
                    // if (elempack == 1)
                    {
                        val0 = bfloat16_to_float32(r0[0]);
                        val1 = bfloat16_to_float32(r0[N]);
                        r0 += dilation_w;
                    }

                    sum0 += val0 * bfloat16_to_float32(kptr[0]);
                    sum1 += val0 * bfloat16_to_float32(kptr[1]);
                    sum0 += val1 * bfloat16_to_float32(kptr[2]);
                    sum1 += val1 * bfloat16_to_float32(kptr[3]);

                    kptr += 4;
                }
            }
            for (; q < inh; q++)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val;
                    // if (elempack == 1)
                    {
                        val = bfloat16_to_float32(r0[0]);
                        r0 += dilation_w;
                    }

                    sum0 += val * bfloat16_to_float32(kptr[0]);
                    sum1 += val * bfloat16_to_float32(kptr[1]);

                    kptr += 2;
                }
            }

            sum0 = activation_ss(sum0, activation_type, activation_params);
            sum1 = activation_ss(sum1, activation_type, activation_params);

            outptr0[0] = float32_to_bfloat16(sum0);
            outptr1[0] = float32_to_bfloat16(sum1);
            outptr0 += 1;
            outptr1 += 1;
        }
    }
    remain_outh_start += nn_outh * 2;
    for (int p = remain_outh_start; p < outh; p++)
    {
        unsigned short* outptr = top_blob.row<unsigned short>(p);

        for (int j = 0; j < outw; j++)
        {
            float sum = 0.f;

            if (bias_data_ptr)
            {
                sum = bias_data_ptr[p];
            }

#if __aarch64__
            const unsigned short* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2);
#elif __ARM_NEON
            const unsigned short* kptr = weight_data_tm.channel(p / 4 + (p % 4) / 2 + p % 2);
#else
            const unsigned short* kptr = weight_data_tm.channel(p / 2 + p % 2);
#endif

            int q = 0;
#if __ARM_NEON
#if __aarch64__
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            for (; q + 7 < inh; q += 8)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    float32x4_t _r1;
                    if (elempack == 4)
                    {
                        _r0 = bfloat2float(vld1_u16(r0));
                        _r1 = bfloat2float(vld1_u16(r0 + N));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        uint16x8_t _r01_u16 = uint16x8_t();
                        _r01_u16 = vsetq_lane_u16(r0[0], _r01_u16, 0);
                        _r01_u16 = vsetq_lane_u16(r0[N], _r01_u16, 1);
                        _r01_u16 = vsetq_lane_u16(r0[N * 2], _r01_u16, 2);
                        _r01_u16 = vsetq_lane_u16(r0[N * 3], _r01_u16, 3);
                        _r01_u16 = vsetq_lane_u16(r0[N * 4], _r01_u16, 4);
                        _r01_u16 = vsetq_lane_u16(r0[N * 5], _r01_u16, 5);
                        _r01_u16 = vsetq_lane_u16(r0[N * 6], _r01_u16, 6);
                        _r01_u16 = vsetq_lane_u16(r0[N * 7], _r01_u16, 7);
                        _r0 = bfloat2float(vget_low_u16(_r01_u16));
                        _r1 = bfloat2float(vget_high_u16(_r01_u16));
                        r0 += dilation_w;
                    }

                    uint16x8_t _w = vld1q_u16(kptr);
                    float32x4_t _w0 = bfloat2float(vget_low_u16(_w));
                    float32x4_t _w1 = bfloat2float(vget_high_u16(_w));
                    _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                    _sum1 = vfmaq_f32(_sum1, _r1, _w1);

                    kptr += 8;
                }
            }
            _sum0 = vaddq_f32(_sum0, _sum1);
            sum += vaddvq_f32(_sum0);
#endif // __aarch64__
            float32x4_t _sum = vdupq_n_f32(0.f);
            for (; q + 3 < inh; q += 4)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    if (elempack == 4)
                    {
                        _r0 = bfloat2float(vld1_u16(r0));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        uint16x4_t _r0_u16 = uint16x4_t();
                        _r0_u16 = vset_lane_u16(r0[0], _r0_u16, 0);
                        _r0_u16 = vset_lane_u16(r0[N], _r0_u16, 1);
                        _r0_u16 = vset_lane_u16(r0[N * 2], _r0_u16, 2);
                        _r0_u16 = vset_lane_u16(r0[N * 3], _r0_u16, 3);
                        _r0 = bfloat2float(_r0_u16);
                        r0 += dilation_w;
                    }

                    float32x4_t _w = bfloat2float(vld1_u16(kptr));
#if __aarch64__
                    _sum = vfmaq_f32(_sum, _r0, _w);
#else
                    _sum = vmlaq_f32(_sum, _r0, _w);
#endif

                    kptr += 4;
                }
            }
#if __aarch64__
            sum += vaddvq_f32(_sum);
#else
            float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
            _ss = vpadd_f32(_ss, _ss);
            sum += vget_lane_f32(_ss, 0);
#endif
#endif // __ARM_NEON
            for (; q + 1 < inh; q += 2)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val0;
                    float val1;
                    // if (elempack == 1)
                    {
                        val0 = bfloat16_to_float32(r0[0]);
                        val1 = bfloat16_to_float32(r0[N]);
                        r0 += dilation_w;
                    }

                    sum += val0 * bfloat16_to_float32(kptr[0]);
                    sum += val1 * bfloat16_to_float32(kptr[1]);

                    kptr += 2;
                }
            }
            for (; q < inh; q++)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val;
                    // if (elempack == 1)
                    {
                        val = bfloat16_to_float32(r0[0]);
                        r0 += dilation_w;
                    }

                    sum += val * bfloat16_to_float32(kptr[0]);

                    kptr += 1;
                }
            }

            sum = activation_ss(sum, activation_type, activation_params);

            outptr[0] = float32_to_bfloat16(sum);
            outptr += 1;
        }
    }
}


================================================
FILE: src/layer/arm/convolution1d_packed_fp16s.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convolution1d_transform_kernel_packed_fp16s(const Mat& kernel, Mat& kernel_tm, int inh, int outh, int kernel_w)
{
    // src = kw-inh-outh
    // dst = pb-pa-kw-inh/pa-outh/pb

    // clang-format off
    // *INDENT-OFF*
    if (outh >= 8)
    {
        if (inh >= 8)
            kernel_tm.create(8 * 8 * kernel_w, inh / 8 + (inh % 8) / 4 + (inh % 4) / 2 + inh % 2, outh / 8 + (outh % 8) / 4 + (outh % 4) / 2 + outh % 2, (size_t)2u);
        else if (inh >= 4)
            kernel_tm.create(8 * 4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh / 8 + (outh % 8) / 4 + (outh % 4) / 2 + outh % 2, (size_t)2u);
        else if (inh >= 2)
            kernel_tm.create(8 * 2 * kernel_w, inh / 2 + inh % 2, outh / 8 + (outh % 8) / 4 + (outh % 4) / 2 + outh % 2, (size_t)2u);
        else
            kernel_tm.create(8 * kernel_w, inh, outh / 8 + (outh % 8) / 4 + (outh % 4) / 2 + outh % 2, (size_t)2u);
    }
    else if (outh >= 4)
    {
        if (inh >= 8)
            kernel_tm.create(4 * 8 * kernel_w, inh / 8 + (inh % 8) / 4 + (inh % 4) / 2 + inh % 2, outh / 4 + (outh % 4) / 2 + outh % 2, (size_t)2u);
        else if (inh >= 4)
            kernel_tm.create(4 * 4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh / 4 + (outh % 4) / 2 + outh % 2, (size_t)2u);
        else if (inh >= 2)
            kernel_tm.create(4 * 2 * kernel_w, inh / 2 + inh % 2, outh / 4 + (outh % 4) / 2 + outh % 2, (size_t)2u);
        else
            kernel_tm.create(4 * kernel_w, inh, outh / 4 + (outh % 4) / 2 + outh % 2, (size_t)2u);
    }
    else if (outh >= 2)
    {
        if (inh >= 8)
            kernel_tm.create(2 * 8 * kernel_w, inh / 8 + (inh % 8) / 4 + (inh % 4) / 2 + inh % 2, outh / 2 + outh % 2, (size_t)2u);
        else if (inh >= 4)
            kernel_tm.create(2 * 4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh / 2 + outh % 2, (size_t)2u);
        else if (inh >= 2)
            kernel_tm.create(2 * 2 * kernel_w, inh / 2 + inh % 2, outh / 2 + outh % 2, (size_t)2u);
        else
            kernel_tm.create(2 * kernel_w, inh, outh / 2 + outh % 2, (size_t)2u);
    }
    else
    {
        if (inh >= 8)
            kernel_tm.create(8 * kernel_w, inh / 8 + (inh % 8) / 4 + (inh % 4) / 2 + inh % 2, outh, (size_t)2u);
        else if (inh >= 4)
            kernel_tm.create(4 * kernel_w, inh / 4 + (inh % 4) / 2 + inh % 2, outh, (size_t)2u);
        else if (inh >= 2)
            kernel_tm.create(2 * kernel_w, inh / 2 + inh % 2, outh, (size_t)2u);
        else
            kernel_tm.create(kernel_w, inh, outh, (size_t)2u);
    }
    // *INDENT-ON*
    // clang-format on

    int q = 0;
    for (; q + 7 < outh; q += 8)
    {
        const float* kptr0 = (const float*)kernel + q * inh * kernel_w;
        const float* kptr1 = (const float*)kernel + (q + 1) * inh * kernel_w;
        const float* kptr2 = (const float*)kernel + (q + 2) * inh * kernel_w;
        const float* kptr3 = (const float*)kernel + (q + 3) * inh * kernel_w;
        const float* kptr4 = (const float*)kernel + (q + 4) * inh * kernel_w;
        const float* kptr5 = (const float*)kernel + (q + 5) * inh * kernel_w;
        const float* kptr6 = (const float*)kernel + (q + 6) * inh * kernel_w;
        const float* kptr7 = (const float*)kernel + (q + 7) * inh * kernel_w;

        __fp16* g00 = kernel_tm.channel(q / 8);

        int p = 0;
        for (; p + 7 < inh; p += 8)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;
                const float* k4 = kptr4 + p * kernel_w;
                const float* k5 = kptr5 + p * kernel_w;
                const float* k6 = kptr6 + p * kernel_w;
                const float* k7 = kptr7 + p * kernel_w;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    g00[1] = (__fp16)k1[k];
                    g00[2] = (__fp16)k2[k];
                    g00[3] = (__fp16)k3[k];
                    g00[4] = (__fp16)k4[k];
                    g00[5] = (__fp16)k5[k];
                    g00[6] = (__fp16)k6[k];
                    g00[7] = (__fp16)k7[k];
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    k4 += kernel_w;
                    k5 += kernel_w;
                    k6 += kernel_w;
                    k7 += kernel_w;
                    g00 += 8;
                }
            }
        }
        for (; p + 3 < inh; p += 4)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;
                const float* k4 = kptr4 + p * kernel_w;
                const float* k5 = kptr5 + p * kernel_w;
                const float* k6 = kptr6 + p * kernel_w;
                const float* k7 = kptr7 + p * kernel_w;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    g00[1] = (__fp16)k1[k];
                    g00[2] = (__fp16)k2[k];
                    g00[3] = (__fp16)k3[k];
                    g00[4] = (__fp16)k4[k];
                    g00[5] = (__fp16)k5[k];
                    g00[6] = (__fp16)k6[k];
                    g00[7] = (__fp16)k7[k];
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    k4 += kernel_w;
                    k5 += kernel_w;
                    k6 += kernel_w;
                    k7 += kernel_w;
                    g00 += 8;
                }
            }
        }
        for (; p + 1 < inh; p += 2)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;
                const float* k4 = kptr4 + p * kernel_w;
                const float* k5 = kptr5 + p * kernel_w;
                const float* k6 = kptr6 + p * kernel_w;
                const float* k7 = kptr7 + p * kernel_w;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    g00[1] = (__fp16)k1[k];
                    g00[2] = (__fp16)k2[k];
                    g00[3] = (__fp16)k3[k];
                    g00[4] = (__fp16)k4[k];
                    g00[5] = (__fp16)k5[k];
                    g00[6] = (__fp16)k6[k];
                    g00[7] = (__fp16)k7[k];
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    k4 += kernel_w;
                    k5 += kernel_w;
                    k6 += kernel_w;
                    k7 += kernel_w;
                    g00 += 8;
                }
            }
        }
        for (; p < inh; p++)
        {
            const float* k0 = kptr0 + p * kernel_w;
            const float* k1 = kptr1 + p * kernel_w;
            const float* k2 = kptr2 + p * kernel_w;
            const float* k3 = kptr3 + p * kernel_w;
            const float* k4 = kptr4 + p * kernel_w;
            const float* k5 = kptr5 + p * kernel_w;
            const float* k6 = kptr6 + p * kernel_w;
            const float* k7 = kptr7 + p * kernel_w;

            for (int k = 0; k < kernel_w; k++)
            {
                g00[0] = (__fp16)k0[k];
                g00[1] = (__fp16)k1[k];
                g00[2] = (__fp16)k2[k];
                g00[3] = (__fp16)k3[k];
                g00[4] = (__fp16)k4[k];
                g00[5] = (__fp16)k5[k];
                g00[6] = (__fp16)k6[k];
                g00[7] = (__fp16)k7[k];
                g00 += 8;
            }
        }
    }
    for (; q + 3 < outh; q += 4)
    {
        const float* kptr0 = (const float*)kernel + q * inh * kernel_w;
        const float* kptr1 = (const float*)kernel + (q + 1) * inh * kernel_w;
        const float* kptr2 = (const float*)kernel + (q + 2) * inh * kernel_w;
        const float* kptr3 = (const float*)kernel + (q + 3) * inh * kernel_w;

        __fp16* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4);

        int p = 0;
        for (; p + 7 < inh; p += 8)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    g00[1] = (__fp16)k1[k];
                    g00[2] = (__fp16)k2[k];
                    g00[3] = (__fp16)k3[k];
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    g00 += 4;
                }
            }
        }
        for (; p + 3 < inh; p += 4)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    g00[1] = (__fp16)k1[k];
                    g00[2] = (__fp16)k2[k];
                    g00[3] = (__fp16)k3[k];
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    g00 += 4;
                }
            }
        }
        for (; p + 1 < inh; p += 2)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;
                const float* k2 = kptr2 + p * kernel_w;
                const float* k3 = kptr3 + p * kernel_w;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    g00[1] = (__fp16)k1[k];
                    g00[2] = (__fp16)k2[k];
                    g00[3] = (__fp16)k3[k];
                    k0 += kernel_w;
                    k1 += kernel_w;
                    k2 += kernel_w;
                    k3 += kernel_w;
                    g00 += 4;
                }
            }
        }
        for (; p < inh; p++)
        {
            const float* k0 = kptr0 + p * kernel_w;
            const float* k1 = kptr1 + p * kernel_w;
            const float* k2 = kptr2 + p * kernel_w;
            const float* k3 = kptr3 + p * kernel_w;

            for (int k = 0; k < kernel_w; k++)
            {
                g00[0] = (__fp16)k0[k];
                g00[1] = (__fp16)k1[k];
                g00[2] = (__fp16)k2[k];
                g00[3] = (__fp16)k3[k];
                g00 += 4;
            }
        }
    }
    for (; q + 1 < outh; q += 2)
    {
        const float* kptr0 = (const float*)kernel + q * inh * kernel_w;
        const float* kptr1 = (const float*)kernel + (q + 1) * inh * kernel_w;

        __fp16* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2);

        int p = 0;
        for (; p + 7 < inh; p += 8)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w + k;
                const float* k1 = kptr1 + p * kernel_w + k;

                g00[0] = (__fp16)k0[0];
                g00[1] = (__fp16)k0[kernel_w];
                g00[2] = (__fp16)k0[kernel_w * 2];
                g00[3] = (__fp16)k0[kernel_w * 3];
                g00[4] = (__fp16)k0[kernel_w * 4];
                g00[5] = (__fp16)k0[kernel_w * 5];
                g00[6] = (__fp16)k0[kernel_w * 6];
                g00[7] = (__fp16)k0[kernel_w * 7];
                g00[8] = (__fp16)k1[0];
                g00[9] = (__fp16)k1[kernel_w];
                g00[10] = (__fp16)k1[kernel_w * 2];
                g00[11] = (__fp16)k1[kernel_w * 3];
                g00[12] = (__fp16)k1[kernel_w * 4];
                g00[13] = (__fp16)k1[kernel_w * 5];
                g00[14] = (__fp16)k1[kernel_w * 6];
                g00[15] = (__fp16)k1[kernel_w * 7];
                g00 += 16;
            }
        }
        for (; p + 3 < inh; p += 4)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w + k;
                const float* k1 = kptr1 + p * kernel_w + k;

                g00[0] = (__fp16)k0[0];
                g00[1] = (__fp16)k0[kernel_w];
                g00[2] = (__fp16)k0[kernel_w * 2];
                g00[3] = (__fp16)k0[kernel_w * 3];
                g00[4] = (__fp16)k1[0];
                g00[5] = (__fp16)k1[kernel_w];
                g00[6] = (__fp16)k1[kernel_w * 2];
                g00[7] = (__fp16)k1[kernel_w * 3];
                g00 += 8;
            }
        }
        for (; p + 1 < inh; p += 2)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr0 + p * kernel_w;
                const float* k1 = kptr1 + p * kernel_w;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    g00[1] = (__fp16)k1[k];
                    k0 += kernel_w;
                    k1 += kernel_w;
                    g00 += 2;
                }
            }
        }
        for (; p < inh; p++)
        {
            const float* k0 = kptr0 + p * kernel_w;
            const float* k1 = kptr1 + p * kernel_w;

            for (int k = 0; k < kernel_w; k++)
            {
                g00[0] = (__fp16)k0[k];
                g00[1] = (__fp16)k1[k];
                g00 += 2;
            }
        }
    }
    for (; q < outh; q++)
    {
        const float* kptr = (const float*)kernel + q * inh * kernel_w;

        __fp16* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2 + q % 2);

        int p = 0;
        for (; p + 7 < inh; p += 8)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr + p * kernel_w;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    k0 += kernel_w;
                    g00 += 1;
                }
            }
        }
        for (; p + 3 < inh; p += 4)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr + p * kernel_w;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    k0 += kernel_w;
                    g00 += 1;
                }
            }
        }
        for (; p + 1 < inh; p += 2)
        {
            for (int k = 0; k < kernel_w; k++)
            {
                const float* k0 = kptr + p * kernel_w;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    k0 += kernel_w;
                    g00 += 1;
                }
            }
        }
        for (; p < inh; p++)
        {
            const float* k0 = kptr + p * kernel_w;

            for (int k = 0; k < kernel_w; k++)
            {
                g00[0] = (__fp16)k0[k];
                g00++;
            }
        }
    }
}

// 1-D convolution on fp16-stored blobs with fp32 accumulation.
//
// Activations are read from bottom_blob as __fp16 and weights from
// weight_data_tm as __fp16; both are widened to fp32, accumulated with fp32
// FMAs, passed through the activation, and narrowed back to __fp16 when stored
// into top_blob.
//
//   bottom_blob      input; rows are read as __fp16. The code branches on
//                    elempack == 4 and otherwise treats the data as elempack 1.
//   top_blob         output; written as __fp16. Its w, h and elempack are read
//                    here and nothing is allocated, so it is presumably already
//                    created by the caller -- confirm at the call site.
//   weight_data_tm   packed fp16 weights. One channel is consumed per block of
//                    8 / 4 / 2 / 1 output rows (see the channel() index
//                    expressions below). NOTE(review): the layout is presumably
//                    produced by the matching weight-transform routine.
//   bias_data        fp32 bias indexed by output row; ignored when it converts
//                    to a null pointer.
//   kernel_w         number of kernel taps per input row.
//   dilation_w       distance (in pixels) between consecutive taps.
//   stride_w         distance (in pixels) between consecutive output positions.
//   activation_type / activation_params
//                    forwarded unchanged to activation_ps() / activation_ss().
//   opt              only opt.num_threads is used (OpenMP thread count).
//
// Output rows are processed in blocks of 8, then 4, then 2, then 1. Inside each
// block the input rows are consumed in groups of 8, 4, 2 and 1, and kptr walks
// the packed weights linearly across all of those groups.
static void convolution1d_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int kernel_w, int dilation_w, int stride_w, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int elempack = bottom_blob.elempack;
    // total number of scalar input rows (packed rows * lanes per row)
    const int inh = bottom_blob.h * elempack;

    // offset, in __fp16 elements, used below to step from one bottom_blob row to the next
    // NOTE(review): assumes rows are stored back to back with no padding -- confirm against Mat::row()
    const int N = bottom_blob.w * elempack;

    const int outw = top_blob.w;
    const int out_elempack = top_blob.elempack;
    // total number of scalar output rows
    const int outh = top_blob.h * out_elempack;

    // offset, in __fp16 elements, used below to step from one top_blob row to the next
    const int M = top_blob.w * out_elempack;

    const float* bias_data_ptr = bias_data;

    // ---- blocks of 8 output rows ----
    int nn_outh = 0;
    int remain_outh_start = 0;
    nn_outh = (outh - remain_outh_start) / 8;
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outh; pp++)
    {
        const int p = remain_outh_start + pp * 8;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inh = bottom_blob.h * elempack;
        const int outw = top_blob.w;
        const int out_elempack = top_blob.elempack;

        __fp16* outptr = top_blob.row<__fp16>(p / out_elempack);

        for (int j = 0; j < outw; j++)
        {
            // Even-numbered accumulators hold output rows p..p+3, odd-numbered
            // ones hold p+4..p+7; the four pairs are partial sums that are
            // added together after the input loops.
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);
            float32x4_t _sum4 = vdupq_n_f32(0.f);
            float32x4_t _sum5 = vdupq_n_f32(0.f);
            float32x4_t _sum6 = vdupq_n_f32(0.f);
            float32x4_t _sum7 = vdupq_n_f32(0.f);

            // bias seeds only the first accumulator pair so it is counted once
            if (bias_data_ptr)
            {
                _sum0 = vld1q_f32(bias_data_ptr + p);
                _sum1 = vld1q_f32(bias_data_ptr + p + 4);
            }

            const __fp16* kptr = weight_data_tm.channel(p / 8);

            int q = 0;
            // 8 input rows per step
            for (; q + 7 < inh; q += 8)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    // _r0 = inputs from rows q..q+3, _r1 = rows q+4..q+7, at the current tap
                    float32x4_t _r0;
                    float32x4_t _r1;
                    if (elempack == 4)
                    {
                        // two packed rows, N elements apart, give the 8 input values
                        _r0 = vcvt_f32_f16(vld1_f16(r0));
                        _r1 = vcvt_f32_f16(vld1_f16(r0 + N));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        // gather one scalar from each of 8 consecutive rows
                        float16x8_t _r_f16 = float16x8_t();
                        _r_f16 = vsetq_lane_f16(r0[0], _r_f16, 0);
                        _r_f16 = vsetq_lane_f16(r0[N], _r_f16, 1);
                        _r_f16 = vsetq_lane_f16(r0[N * 2], _r_f16, 2);
                        _r_f16 = vsetq_lane_f16(r0[N * 3], _r_f16, 3);
                        _r_f16 = vsetq_lane_f16(r0[N * 4], _r_f16, 4);
                        _r_f16 = vsetq_lane_f16(r0[N * 5], _r_f16, 5);
                        _r_f16 = vsetq_lane_f16(r0[N * 6], _r_f16, 6);
                        _r_f16 = vsetq_lane_f16(r0[N * 7], _r_f16, 7);
                        _r0 = vcvt_f32_f16(vget_low_f16(_r_f16));
                        _r1 = vcvt_f32_f16(vget_high_f16(_r_f16));
                        r0 += dilation_w;
                    }

                    // 64 weights per tap: for each of the 8 input rows, 8 values
                    // (one per output row p..p+7), split into a low and a high half
                    float16x8_t _w01 = vld1q_f16(kptr);
                    float16x8_t _w23 = vld1q_f16(kptr + 8);
                    float16x8_t _w45 = vld1q_f16(kptr + 16);
                    float16x8_t _w67 = vld1q_f16(kptr + 24);
                    float16x8_t _w89 = vld1q_f16(kptr + 32);
                    float16x8_t _wab = vld1q_f16(kptr + 40);
                    float16x8_t _wcd = vld1q_f16(kptr + 48);
                    float16x8_t _wef = vld1q_f16(kptr + 56);
                    float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w01));
                    float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w01));
                    float32x4_t _w2 = vcvt_f32_f16(vget_low_f16(_w23));
                    float32x4_t _w3 = vcvt_f32_f16(vget_high_f16(_w23));
                    float32x4_t _w4 = vcvt_f32_f16(vget_low_f16(_w45));
                    float32x4_t _w5 = vcvt_f32_f16(vget_high_f16(_w45));
                    float32x4_t _w6 = vcvt_f32_f16(vget_low_f16(_w67));
                    float32x4_t _w7 = vcvt_f32_f16(vget_high_f16(_w67));
                    float32x4_t _w8 = vcvt_f32_f16(vget_low_f16(_w89));
                    float32x4_t _w9 = vcvt_f32_f16(vget_high_f16(_w89));
                    float32x4_t _wa = vcvt_f32_f16(vget_low_f16(_wab));
                    float32x4_t _wb = vcvt_f32_f16(vget_high_f16(_wab));
                    float32x4_t _wc = vcvt_f32_f16(vget_low_f16(_wcd));
                    float32x4_t _wd = vcvt_f32_f16(vget_high_f16(_wcd));
                    float32x4_t _we = vcvt_f32_f16(vget_low_f16(_wef));
                    float32x4_t _wf = vcvt_f32_f16(vget_high_f16(_wef));
                    // each input value (one lane of _r0/_r1) scales its own 8-wide weight vector
                    _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 0);
                    _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 1);
                    _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 1);
                    _sum4 = vfmaq_laneq_f32(_sum4, _w4, _r0, 2);
                    _sum5 = vfmaq_laneq_f32(_sum5, _w5, _r0, 2);
                    _sum6 = vfmaq_laneq_f32(_sum6, _w6, _r0, 3);
                    _sum7 = vfmaq_laneq_f32(_sum7, _w7, _r0, 3);
                    _sum0 = vfmaq_laneq_f32(_sum0, _w8, _r1, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w9, _r1, 0);
                    _sum2 = vfmaq_laneq_f32(_sum2, _wa, _r1, 1);
                    _sum3 = vfmaq_laneq_f32(_sum3, _wb, _r1, 1);
                    _sum4 = vfmaq_laneq_f32(_sum4, _wc, _r1, 2);
                    _sum5 = vfmaq_laneq_f32(_sum5, _wd, _r1, 2);
                    _sum6 = vfmaq_laneq_f32(_sum6, _we, _r1, 3);
                    _sum7 = vfmaq_laneq_f32(_sum7, _wf, _r1, 3);

                    kptr += 64;
                }
            }
            // 4 input rows per step (32 weights per tap)
            for (; q + 3 < inh; q += 4)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    if (elempack == 4)
                    {
                        _r0 = vcvt_f32_f16(vld1_f16(r0));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        float16x4_t _r_f16 = float16x4_t();
                        _r_f16 = vset_lane_f16(r0[0], _r_f16, 0);
                        _r_f16 = vset_lane_f16(r0[N], _r_f16, 1);
                        _r_f16 = vset_lane_f16(r0[N * 2], _r_f16, 2);
                        _r_f16 = vset_lane_f16(r0[N * 3], _r_f16, 3);
                        _r0 = vcvt_f32_f16(_r_f16);
                        r0 += dilation_w;
                    }

                    float16x8_t _w01 = vld1q_f16(kptr);
                    float16x8_t _w23 = vld1q_f16(kptr + 8);
                    float16x8_t _w45 = vld1q_f16(kptr + 16);
                    float16x8_t _w67 = vld1q_f16(kptr + 24);
                    float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w01));
                    float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w01));
                    float32x4_t _w2 = vcvt_f32_f16(vget_low_f16(_w23));
                    float32x4_t _w3 = vcvt_f32_f16(vget_high_f16(_w23));
                    float32x4_t _w4 = vcvt_f32_f16(vget_low_f16(_w45));
                    float32x4_t _w5 = vcvt_f32_f16(vget_high_f16(_w45));
                    float32x4_t _w6 = vcvt_f32_f16(vget_low_f16(_w67));
                    float32x4_t _w7 = vcvt_f32_f16(vget_high_f16(_w67));
                    _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 0);
                    _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 1);
                    _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 1);
                    _sum4 = vfmaq_laneq_f32(_sum4, _w4, _r0, 2);
                    _sum5 = vfmaq_laneq_f32(_sum5, _w5, _r0, 2);
                    _sum6 = vfmaq_laneq_f32(_sum6, _w6, _r0, 3);
                    _sum7 = vfmaq_laneq_f32(_sum7, _w7, _r0, 3);

                    kptr += 32;
                }
            }
            // 2 input rows per step (16 weights per tap); rows are addressed
            // without dividing by elempack, i.e. treated as unpacked
            for (; q + 1 < inh; q += 2)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val0;
                    float val1;
                    // if (elempack == 1)
                    {
                        val0 = (float)(r0[0]);
                        val1 = (float)(r0[N]);
                        r0 += dilation_w;
                    }

                    float16x8_t _w01 = vld1q_f16(kptr);
                    float16x8_t _w23 = vld1q_f16(kptr + 8);
                    float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w01));
                    float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w01));
                    float32x4_t _w2 = vcvt_f32_f16(vget_low_f16(_w23));
                    float32x4_t _w3 = vcvt_f32_f16(vget_high_f16(_w23));
                    _sum0 = vfmaq_n_f32(_sum0, _w0, val0);
                    _sum1 = vfmaq_n_f32(_sum1, _w1, val0);
                    _sum2 = vfmaq_n_f32(_sum2, _w2, val1);
                    _sum3 = vfmaq_n_f32(_sum3, _w3, val1);

                    kptr += 16;
                }
            }
            // last single input row (8 weights per tap)
            for (; q < inh; q++)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _val;
                    // if (elempack == 1)
                    {
                        // broadcast the scalar input to all four lanes
                        _val = vcvt_f32_f16(vdup_n_f16(r0[0]));
                        r0 += dilation_w;
                    }

                    float16x8_t _w = vld1q_f16(kptr);
                    float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w));
                    float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w));
                    _sum0 = vfmaq_f32(_sum0, _w0, _val);
                    _sum1 = vfmaq_f32(_sum1, _w1, _val);

                    kptr += 8;
                }
            }

            // fold the four partial-sum pairs: _sum0 -> rows p..p+3, _sum1 -> rows p+4..p+7
            _sum0 = vaddq_f32(_sum0, _sum2);
            _sum1 = vaddq_f32(_sum1, _sum3);
            _sum4 = vaddq_f32(_sum4, _sum6);
            _sum5 = vaddq_f32(_sum5, _sum7);
            _sum0 = vaddq_f32(_sum0, _sum4);
            _sum1 = vaddq_f32(_sum1, _sum5);

            _sum0 = activation_ps(_sum0, activation_type, activation_params);
            _sum1 = activation_ps(_sum1, activation_type, activation_params);

            if (out_elempack == 4)
            {
                // two packed output rows, M elements apart
                vst1_f16(outptr, vcvt_f16_f32(_sum0));
                vst1_f16(outptr + M, vcvt_f16_f32(_sum1));
                outptr += 4;
            }
            else // if (out_elempack == 1)
            {
                // scatter the 8 results to 8 separate output rows
                float16x4_t _sum0_f16 = vcvt_f16_f32(_sum0);
                float16x4_t _sum1_f16 = vcvt_f16_f32(_sum1);
                outptr[0] = vget_lane_f16(_sum0_f16, 0);
                outptr[M] = vget_lane_f16(_sum0_f16, 1);
                outptr[M * 2] = vget_lane_f16(_sum0_f16, 2);
                outptr[M * 3] = vget_lane_f16(_sum0_f16, 3);
                outptr[M * 4] = vget_lane_f16(_sum1_f16, 0);
                outptr[M * 5] = vget_lane_f16(_sum1_f16, 1);
                outptr[M * 6] = vget_lane_f16(_sum1_f16, 2);
                outptr[M * 7] = vget_lane_f16(_sum1_f16, 3);
                outptr += 1;
            }
        }
    }
    // ---- block of 4 output rows ----
    // Fewer than 8 rows remain after the loop above, so nn_outh is 0 or 1 here;
    // this loop carries no OpenMP pragma.
    remain_outh_start += nn_outh * 8;
    nn_outh = (outh - remain_outh_start) / 4;
    for (int pp = 0; pp < nn_outh; pp++)
    {
        const int p = remain_outh_start + pp * 4;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inh = bottom_blob.h * elempack;
        const int outw = top_blob.w;
        const int out_elempack = top_blob.elempack;

        __fp16* outptr = top_blob.row<__fp16>(p / out_elempack);

        for (int j = 0; j < outw; j++)
        {
            // four partial sums, each holding output rows p..p+3; summed after the input loops
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);

            if (bias_data_ptr)
            {
                _sum0 = vld1q_f32(bias_data_ptr + p);
            }

            // weight channel index: all 8-row blocks first, then the 4-row block
            const __fp16* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4);

            int q = 0;
            // 8 input rows per step (32 weights per tap: 4 per input row)
            for (; q + 7 < inh; q += 8)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    float32x4_t _r1;
                    if (elempack == 4)
                    {
                        _r0 = vcvt_f32_f16(vld1_f16(r0));
                        _r1 = vcvt_f32_f16(vld1_f16(r0 + N));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        float16x8_t _r_f16 = float16x8_t();
                        _r_f16 = vsetq_lane_f16(r0[0], _r_f16, 0);
                        _r_f16 = vsetq_lane_f16(r0[N], _r_f16, 1);
                        _r_f16 = vsetq_lane_f16(r0[N * 2], _r_f16, 2);
                        _r_f16 = vsetq_lane_f16(r0[N * 3], _r_f16, 3);
                        _r_f16 = vsetq_lane_f16(r0[N * 4], _r_f16, 4);
                        _r_f16 = vsetq_lane_f16(r0[N * 5], _r_f16, 5);
                        _r_f16 = vsetq_lane_f16(r0[N * 6], _r_f16, 6);
                        _r_f16 = vsetq_lane_f16(r0[N * 7], _r_f16, 7);
                        _r0 = vcvt_f32_f16(vget_low_f16(_r_f16));
                        _r1 = vcvt_f32_f16(vget_high_f16(_r_f16));
                        r0 += dilation_w;
                    }

                    float16x8_t _w01 = vld1q_f16(kptr);
                    float16x8_t _w23 = vld1q_f16(kptr + 8);
                    float16x8_t _w45 = vld1q_f16(kptr + 16);
                    float16x8_t _w67 = vld1q_f16(kptr + 24);
                    float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w01));
                    float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w01));
                    float32x4_t _w2 = vcvt_f32_f16(vget_low_f16(_w23));
                    float32x4_t _w3 = vcvt_f32_f16(vget_high_f16(_w23));
                    float32x4_t _w4 = vcvt_f32_f16(vget_low_f16(_w45));
                    float32x4_t _w5 = vcvt_f32_f16(vget_high_f16(_w45));
                    float32x4_t _w6 = vcvt_f32_f16(vget_low_f16(_w67));
                    float32x4_t _w7 = vcvt_f32_f16(vget_high_f16(_w67));
                    _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 2);
                    _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 3);
                    _sum0 = vfmaq_laneq_f32(_sum0, _w4, _r1, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w5, _r1, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _w6, _r1, 2);
                    _sum3 = vfmaq_laneq_f32(_sum3, _w7, _r1, 3);

                    kptr += 32;
                }
            }
            // 4 input rows per step (16 weights per tap)
            for (; q + 3 < inh; q += 4)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    if (elempack == 4)
                    {
                        _r0 = vcvt_f32_f16(vld1_f16(r0));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        float16x4_t _r_f16 = float16x4_t();
                        _r_f16 = vset_lane_f16(r0[0], _r_f16, 0);
                        _r_f16 = vset_lane_f16(r0[N], _r_f16, 1);
                        _r_f16 = vset_lane_f16(r0[N * 2], _r_f16, 2);
                        _r_f16 = vset_lane_f16(r0[N * 3], _r_f16, 3);
                        _r0 = vcvt_f32_f16(_r_f16);
                        r0 += dilation_w;
                    }

                    float16x8_t _w01 = vld1q_f16(kptr);
                    float16x8_t _w23 = vld1q_f16(kptr + 8);
                    float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w01));
                    float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w01));
                    float32x4_t _w2 = vcvt_f32_f16(vget_low_f16(_w23));
                    float32x4_t _w3 = vcvt_f32_f16(vget_high_f16(_w23));
                    _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 2);
                    _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 3);

                    kptr += 16;
                }
            }
            // 2 input rows per step (8 weights per tap)
            for (; q + 1 < inh; q += 2)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val0;
                    float val1;
                    // if (elempack == 1)
                    {
                        val0 = (float)(r0[0]);
                        val1 = (float)(r0[N]);
                        r0 += dilation_w;
                    }

                    float16x8_t _w = vld1q_f16(kptr);
                    float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w));
                    float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w));
                    _sum0 = vfmaq_n_f32(_sum0, _w0, val0);
                    _sum1 = vfmaq_n_f32(_sum1, _w1, val1);

                    kptr += 8;
                }
            }
            // last single input row (4 weights per tap)
            for (; q < inh; q++)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _val;
                    // if (elempack == 1)
                    {
                        _val = vcvt_f32_f16(vdup_n_f16(r0[0]));
                        r0 += dilation_w;
                    }

                    float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
                    _sum0 = vfmaq_f32(_sum0, _val, _w);

                    kptr += 4;
                }
            }

            // fold the four partial sums into _sum0
            _sum0 = vaddq_f32(_sum0, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _sum0 = vaddq_f32(_sum0, _sum2);

            _sum0 = activation_ps(_sum0, activation_type, activation_params);

            if (out_elempack == 4)
            {
                vst1_f16(outptr, vcvt_f16_f32(_sum0));
                outptr += 4;
            }
            else // if (out_elempack == 1)
            {
                float16x4_t _sum0_f16 = vcvt_f16_f32(_sum0);
                outptr[0] = vget_lane_f16(_sum0_f16, 0);
                outptr[M] = vget_lane_f16(_sum0_f16, 1);
                outptr[M * 2] = vget_lane_f16(_sum0_f16, 2);
                outptr[M * 3] = vget_lane_f16(_sum0_f16, 3);
                outptr += 1;
            }
        }
    }
    // ---- block of 2 output rows ----
    // Fewer than 4 rows remain, so nn_outh is again 0 or 1. Output rows are
    // addressed directly by p, i.e. top_blob is treated as unpacked here.
    remain_outh_start += nn_outh * 4;
    nn_outh = (outh - remain_outh_start) / 2;
    for (int pp = 0; pp < nn_outh; pp++)
    {
        const int p = remain_outh_start + pp * 2;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inh = bottom_blob.h * elempack;
        const int outw = top_blob.w;

        __fp16* outptr0 = top_blob.row<__fp16>(p);
        __fp16* outptr1 = top_blob.row<__fp16>(p + 1);

        for (int j = 0; j < outw; j++)
        {
            // scalar accumulators for output rows p and p + 1
            float sum0 = 0.f;
            float sum1 = 0.f;

            if (bias_data_ptr)
            {
                sum0 = bias_data_ptr[p];
                sum1 = bias_data_ptr[p + 1];
            }

            const __fp16* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2);

            int q = 0;
            // Here the vector lanes run over input rows (not output rows), so the
            // vector sums are reduced horizontally into sum0 / sum1 afterwards.
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);
            // 8 input rows per step (16 weights per tap: 8 for row p, then 8 for row p + 1)
            for (; q + 7 < inh; q += 8)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    float32x4_t _r1;
                    if (elempack == 4)
                    {
                        _r0 = vcvt_f32_f16(vld1_f16(r0));
                        _r1 = vcvt_f32_f16(vld1_f16(r0 + N));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        float16x8_t _r01_f16 = float16x8_t();
                        _r01_f16 = vsetq_lane_f16(r0[0], _r01_f16, 0);
                        _r01_f16 = vsetq_lane_f16(r0[N], _r01_f16, 1);
                        _r01_f16 = vsetq_lane_f16(r0[N * 2], _r01_f16, 2);
                        _r01_f16 = vsetq_lane_f16(r0[N * 3], _r01_f16, 3);
                        _r01_f16 = vsetq_lane_f16(r0[N * 4], _r01_f16, 4);
                        _r01_f16 = vsetq_lane_f16(r0[N * 5], _r01_f16, 5);
                        _r01_f16 = vsetq_lane_f16(r0[N * 6], _r01_f16, 6);
                        _r01_f16 = vsetq_lane_f16(r0[N * 7], _r01_f16, 7);
                        _r0 = vcvt_f32_f16(vget_low_f16(_r01_f16));
                        _r1 = vcvt_f32_f16(vget_high_f16(_r01_f16));
                        r0 += dilation_w;
                    }

                    float16x8_t _w01 = vld1q_f16(kptr);
                    float16x8_t _w23 = vld1q_f16(kptr + 8);
                    float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w01));
                    float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w01));
                    float32x4_t _w2 = vcvt_f32_f16(vget_low_f16(_w23));
                    float32x4_t _w3 = vcvt_f32_f16(vget_high_f16(_w23));
                    // _sum0/_sum1 feed output row p, _sum2/_sum3 feed output row p + 1
                    _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                    _sum1 = vfmaq_f32(_sum1, _r1, _w1);
                    _sum2 = vfmaq_f32(_sum2, _r0, _w2);
                    _sum3 = vfmaq_f32(_sum3, _r1, _w3);

                    kptr += 16;
                }
            }
            // horizontal reduction of the 8-row stage, then reuse _sum0/_sum1 for the 4-row stage
            _sum0 = vaddq_f32(_sum0, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            sum0 += vaddvq_f32(_sum0);
            sum1 += vaddvq_f32(_sum2);
            _sum0 = vdupq_n_f32(0.f);
            _sum1 = vdupq_n_f32(0.f);
            // 4 input rows per step (8 weights per tap: 4 for row p, then 4 for row p + 1)
            for (; q + 3 < inh; q += 4)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    if (elempack == 4)
                    {
                        _r0 = vcvt_f32_f16(vld1_f16(r0));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        float16x4_t _r0_f16 = float16x4_t();
                        _r0_f16 = vset_lane_f16(r0[0], _r0_f16, 0);
                        _r0_f16 = vset_lane_f16(r0[N], _r0_f16, 1);
                        _r0_f16 = vset_lane_f16(r0[N * 2], _r0_f16, 2);
                        _r0_f16 = vset_lane_f16(r0[N * 3], _r0_f16, 3);
                        _r0 = vcvt_f32_f16(_r0_f16);
                        r0 += dilation_w;
                    }

                    float16x8_t _w = vld1q_f16(kptr);
                    float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w));
                    float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w));
                    _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                    _sum1 = vfmaq_f32(_sum1, _r0, _w1);

                    kptr += 8;
                }
            }
            sum0 += vaddvq_f32(_sum0);
            sum1 += vaddvq_f32(_sum1);
            // 2 input rows per step; weights interleaved as
            // [in0->p, in0->p+1, in1->p, in1->p+1]
            for (; q + 1 < inh; q += 2)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val0;
                    float val1;
                    // if (elempack == 1)
                    {
                        val0 = (float)(r0[0]);
                        val1 = (float)(r0[N]);
                        r0 += dilation_w;
                    }

                    sum0 += val0 * (float)(kptr[0]);
                    sum1 += val0 * (float)(kptr[1]);
                    sum0 += val1 * (float)(kptr[2]);
                    sum1 += val1 * (float)(kptr[3]);

                    kptr += 4;
                }
            }
            // last single input row (2 weights per tap)
            for (; q < inh; q++)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val;
                    // if (elempack == 1)
                    {
                        val = (float)(r0[0]);
                        r0 += dilation_w;
                    }

                    sum0 += val * (float)(kptr[0]);
                    sum1 += val * (float)(kptr[1]);

                    kptr += 2;
                }
            }

            sum0 = activation_ss(sum0, activation_type, activation_params);
            sum1 = activation_ss(sum1, activation_type, activation_params);

            outptr0[0] = (__fp16)(sum0);
            outptr1[0] = (__fp16)(sum1);
            outptr0 += 1;
            outptr1 += 1;
        }
    }
    // ---- last single output row ----
    // Fewer than 2 rows remain, so this loop runs at most once. It uses the
    // function-level elempack / inh / outw rather than shadowed copies.
    remain_outh_start += nn_outh * 2;
    for (int p = remain_outh_start; p < outh; p++)
    {
        __fp16* outptr = top_blob.row<__fp16>(p);

        for (int j = 0; j < outw; j++)
        {
            float sum = 0.f;

            if (bias_data_ptr)
            {
                sum = bias_data_ptr[p];
            }

            const __fp16* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2);

            int q = 0;
            // vector lanes run over input rows; reduced horizontally into sum below
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            // 8 input rows per step (8 weights per tap)
            for (; q + 7 < inh; q += 8)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    float32x4_t _r1;
                    if (elempack == 4)
                    {
                        _r0 = vcvt_f32_f16(vld1_f16(r0));
                        _r1 = vcvt_f32_f16(vld1_f16(r0 + N));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        float16x8_t _r01_f16 = float16x8_t();
                        _r01_f16 = vsetq_lane_f16(r0[0], _r01_f16, 0);
                        _r01_f16 = vsetq_lane_f16(r0[N], _r01_f16, 1);
                        _r01_f16 = vsetq_lane_f16(r0[N * 2], _r01_f16, 2);
                        _r01_f16 = vsetq_lane_f16(r0[N * 3], _r01_f16, 3);
                        _r01_f16 = vsetq_lane_f16(r0[N * 4], _r01_f16, 4);
                        _r01_f16 = vsetq_lane_f16(r0[N * 5], _r01_f16, 5);
                        _r01_f16 = vsetq_lane_f16(r0[N * 6], _r01_f16, 6);
                        _r01_f16 = vsetq_lane_f16(r0[N * 7], _r01_f16, 7);
                        _r0 = vcvt_f32_f16(vget_low_f16(_r01_f16));
                        _r1 = vcvt_f32_f16(vget_high_f16(_r01_f16));
                        r0 += dilation_w;
                    }

                    float16x8_t _w = vld1q_f16(kptr);
                    float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w));
                    float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w));
                    _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                    _sum1 = vfmaq_f32(_sum1, _r1, _w1);

                    kptr += 8;
                }
            }
            _sum0 = vaddq_f32(_sum0, _sum1);
            sum += vaddvq_f32(_sum0);
            // separate accumulator for the 4-row stage
            float32x4_t _sum = vdupq_n_f32(0.f);
            // 4 input rows per step (4 weights per tap)
            for (; q + 3 < inh; q += 4)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float32x4_t _r0;
                    if (elempack == 4)
                    {
                        _r0 = vcvt_f32_f16(vld1_f16(r0));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        float16x4_t _r0_f16 = float16x4_t();
                        _r0_f16 = vset_lane_f16(r0[0], _r0_f16, 0);
                        _r0_f16 = vset_lane_f16(r0[N], _r0_f16, 1);
                        _r0_f16 = vset_lane_f16(r0[N * 2], _r0_f16, 2);
                        _r0_f16 = vset_lane_f16(r0[N * 3], _r0_f16, 3);
                        _r0 = vcvt_f32_f16(_r0_f16);
                        r0 += dilation_w;
                    }

                    float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
                    _sum = vfmaq_f32(_sum, _r0, _w);

                    kptr += 4;
                }
            }
            sum += vaddvq_f32(_sum);
            // 2 input rows per step (2 weights per tap)
            for (; q + 1 < inh; q += 2)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val0;
                    float val1;
                    // if (elempack == 1)
                    {
                        val0 = (float)(r0[0]);
                        val1 = (float)(r0[N]);
                        r0 += dilation_w;
                    }

                    sum += val0 * (float)(kptr[0]);
                    sum += val1 * (float)(kptr[1]);

                    kptr += 2;
                }
            }
            // last single input row (1 weight per tap)
            for (; q < inh; q++)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val;
                    // if (elempack == 1)
                    {
                        val = (float)(r0[0]);
                        r0 += dilation_w;
                    }

                    sum += val * (float)(kptr[0]);

                    kptr += 1;
                }
            }

            sum = activation_ss(sum, activation_type, activation_params);

            outptr[0] = (__fp16)(sum);
            outptr += 1;
        }
    }
}

// 1D convolution kernel working on __fp16 data with __fp16 accumulators
// (every partial sum below is a float16x8_t / float16x4_t / __fp16).
//
// bottom_blob       input, read row by row as __fp16; elempack values handled here are 8, 4 and 1
// top_blob          output, written row by row as __fp16; must already be allocated by the caller
//                   (only w, h, elempack and row<>() are used here)
// weight_data_tm    pre-arranged weights, consumed strictly sequentially through kptr.
//                   NOTE(review): the layout is produced by a transform that is not visible in
//                   this function; the kptr strides below must stay in sync with it.
// bias_data         optional per-output-row bias; a null data pointer means "no bias"
// kernel_w          number of kernel taps
// dilation_w        distance (in unpacked elements) between two taps
// stride_w          distance (in unpacked elements) between two output positions
// activation_type / activation_params   forwarded untouched to activation_ps_f16 / activation_ss_f16
// opt               only opt.num_threads is read (for the OpenMP pragma)
//
// Structure: output rows are processed in tiers of 8, 4, 2 and 1. Inside each tier the
// input rows are consumed in tiers of 8, 4, 2 and 1 as well.
static void convolution1d_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int kernel_w, int dilation_w, int stride_w, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int elempack = bottom_blob.elempack;
    // number of input rows counted in unpacked (scalar) rows
    const int inh = bottom_blob.h * elempack;

    // element offset used below to step from one stored input row to the next (r0 + N)
    // NOTE(review): this assumes rows are stored back to back without padding - confirm against Mat
    const int N = bottom_blob.w * elempack;

    const int outw = top_blob.w;
    const int out_elempack = top_blob.elempack;
    // number of output rows counted in unpacked (scalar) rows
    const int outh = top_blob.h * out_elempack;

    // element offset used below to step from one stored output row to the next (outptr + M)
    const int M = top_blob.w * out_elempack;

    // null when the layer has no bias; checked before every bias load
    const __fp16* bias_data_ptr = bias_data;

    // ---- tier 1: blocks of 8 output rows, distributed over OpenMP threads ----
    int nn_outh = 0;
    int remain_outh_start = 0;
    nn_outh = (outh - remain_outh_start) / 8;
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outh; pp++)
    {
        const int p = remain_outh_start + pp * 8;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inh = bottom_blob.h * elempack;
        const int outw = top_blob.w;
        const int out_elempack = top_blob.elempack;

        // p counts unpacked rows, top_blob rows are packed by out_elempack
        __fp16* outptr = top_blob.row<__fp16>(p / out_elempack);

        for (int j = 0; j < outw; j++)
        {
            // four independent accumulators, each holding the 8 output rows of this block;
            // they are summed together after all input rows have been consumed
            float16x8_t _sum0 = vdupq_n_f16(0.f);
            float16x8_t _sum1 = vdupq_n_f16(0.f);
            float16x8_t _sum2 = vdupq_n_f16(0.f);
            float16x8_t _sum3 = vdupq_n_f16(0.f);

            if (bias_data_ptr)
            {
                // bias is added exactly once by seeding only the first accumulator
                _sum0 = vld1q_f16(bias_data_ptr + p);
            }

            // one weight channel per block of 8 output rows
            const __fp16* kptr = weight_data_tm.channel(p / 8);

            int q = 0;
            // 8 input rows per step
            for (; q + 7 < inh; q += 8)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    // gather one value from each of the 8 input rows into the lanes of _r0
                    float16x8_t _r0;
                    if (elempack == 8)
                    {
                        // the 8 rows are interleaved in memory, one contiguous load
                        _r0 = vld1q_f16(r0);
                        r0 += dilation_w * 8;
                    }
                    else if (elempack == 4)
                    {
                        // rows q..q+3 come from this packed row, q+4..q+7 from the next one
                        _r0 = vcombine_f16(vld1_f16(r0), vld1_f16(r0 + N));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        // unpacked input: pick the same column from 8 consecutive rows
                        _r0 = float16x8_t();
                        _r0 = vsetq_lane_f16(r0[0], _r0, 0);
                        _r0 = vsetq_lane_f16(r0[N], _r0, 1);
                        _r0 = vsetq_lane_f16(r0[N * 2], _r0, 2);
                        _r0 = vsetq_lane_f16(r0[N * 3], _r0, 3);
                        _r0 = vsetq_lane_f16(r0[N * 4], _r0, 4);
                        _r0 = vsetq_lane_f16(r0[N * 5], _r0, 5);
                        _r0 = vsetq_lane_f16(r0[N * 6], _r0, 6);
                        _r0 = vsetq_lane_f16(r0[N * 7], _r0, 7);
                        r0 += dilation_w;
                    }

                    // _wI holds 8 weights (one per output row) that are multiplied by input lane I
                    float16x8_t _w0 = vld1q_f16(kptr);
                    float16x8_t _w1 = vld1q_f16(kptr + 8);
                    float16x8_t _w2 = vld1q_f16(kptr + 8 * 2);
                    float16x8_t _w3 = vld1q_f16(kptr + 8 * 3);
                    float16x8_t _w4 = vld1q_f16(kptr + 8 * 4);
                    float16x8_t _w5 = vld1q_f16(kptr + 8 * 5);
                    float16x8_t _w6 = vld1q_f16(kptr + 8 * 6);
                    float16x8_t _w7 = vld1q_f16(kptr + 8 * 7);
                    // the 8 multiply-accumulates are spread round-robin over the 4 accumulators
                    _sum0 = vfmaq_laneq_f16(_sum0, _w0, _r0, 0);
                    _sum1 = vfmaq_laneq_f16(_sum1, _w1, _r0, 1);
                    _sum2 = vfmaq_laneq_f16(_sum2, _w2, _r0, 2);
                    _sum3 = vfmaq_laneq_f16(_sum3, _w3, _r0, 3);
                    _sum0 = vfmaq_laneq_f16(_sum0, _w4, _r0, 4);
                    _sum1 = vfmaq_laneq_f16(_sum1, _w5, _r0, 5);
                    _sum2 = vfmaq_laneq_f16(_sum2, _w6, _r0, 6);
                    _sum3 = vfmaq_laneq_f16(_sum3, _w7, _r0, 7);

                    // 8 input rows x 8 output rows consumed
                    kptr += 64;
                }
            }
            // 4 input rows per step
            for (; q + 3 < inh; q += 4)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float16x4_t _r0;
                    if (elempack == 4)
                    {
                        _r0 = vld1_f16(r0);
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        _r0 = float16x4_t();
                        _r0 = vset_lane_f16(r0[0], _r0, 0);
                        _r0 = vset_lane_f16(r0[N], _r0, 1);
                        _r0 = vset_lane_f16(r0[N * 2], _r0, 2);
                        _r0 = vset_lane_f16(r0[N * 3], _r0, 3);
                        r0 += dilation_w;
                    }

                    float16x8_t _w0 = vld1q_f16(kptr);
                    float16x8_t _w1 = vld1q_f16(kptr + 8);
                    float16x8_t _w2 = vld1q_f16(kptr + 8 * 2);
                    float16x8_t _w3 = vld1q_f16(kptr + 8 * 3);
                    _sum0 = vfmaq_lane_f16(_sum0, _w0, _r0, 0);
                    _sum1 = vfmaq_lane_f16(_sum1, _w1, _r0, 1);
                    _sum2 = vfmaq_lane_f16(_sum2, _w2, _r0, 2);
                    _sum3 = vfmaq_lane_f16(_sum3, _w3, _r0, 3);

                    // 4 input rows x 8 output rows consumed
                    kptr += 32;
                }
            }
            // 2 input rows per step; rows are addressed as row(q) directly, i.e. unpacked input only.
            // inh is a multiple of elempack, so a leftover of fewer than 4 rows implies elempack == 1
            // (given the 8 / 4 / 1 packings handled above).
            for (; q + 1 < inh; q += 2)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    __fp16 val0;
                    __fp16 val1;
                    // if (elempack == 1)
                    {
                        val0 = r0[0];
                        val1 = r0[N];
                        r0 += dilation_w;
                    }

                    float16x8_t _w0 = vld1q_f16(kptr);
                    float16x8_t _w1 = vld1q_f16(kptr + 8);
                    _sum0 = vfmaq_n_f16(_sum0, _w0, val0);
                    _sum1 = vfmaq_n_f16(_sum1, _w1, val1);

                    kptr += 16;
                }
            }
            // last single input row (unpacked input only, see above)
            for (; q < inh; q++)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float16x8_t _val;
                    // if (elempack == 1)
                    {
                        // broadcast the single input value to all 8 output rows
                        _val = vdupq_n_f16(r0[0]);
                        r0 += dilation_w;
                    }

                    float16x8_t _w0 = vld1q_f16(kptr);
                    _sum0 = vfmaq_f16(_sum0, _w0, _val);

                    kptr += 8;
                }
            }

            // fold the four partial accumulators: (0 + 1) + (2 + 3)
            _sum0 = vaddq_f16(_sum0, _sum1);
            _sum2 = vaddq_f16(_sum2, _sum3);
            _sum0 = vaddq_f16(_sum0, _sum2);

            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);

            // scatter the 8 results according to the packing of the output
            if (out_elempack == 8)
            {
                vst1q_f16(outptr, _sum0);
                outptr += 8;
            }
            else if (out_elempack == 4)
            {
                // low half goes to this packed row, high half to the next one
                vst1_f16(outptr, vget_low_f16(_sum0));
                vst1_f16(outptr + M, vget_high_f16(_sum0));
                outptr += 4;
            }
            else // if (out_elempack == 1)
            {
                // one lane per output row, rows are M elements apart
                outptr[0] = vgetq_lane_f16(_sum0, 0);
                outptr[M] = vgetq_lane_f16(_sum0, 1);
                outptr[M * 2] = vgetq_lane_f16(_sum0, 2);
                outptr[M * 3] = vgetq_lane_f16(_sum0, 3);
                outptr[M * 4] = vgetq_lane_f16(_sum0, 4);
                outptr[M * 5] = vgetq_lane_f16(_sum0, 5);
                outptr[M * 6] = vgetq_lane_f16(_sum0, 6);
                outptr[M * 7] = vgetq_lane_f16(_sum0, 7);
                outptr += 1;
            }
        }
    }
    // ---- tier 2: blocks of 4 output rows ----
    // Fewer than 8 rows remain after tier 1, so nn_outh is 0 or 1 here and the loop
    // carries no OpenMP pragma.
    remain_outh_start += nn_outh * 8;
    nn_outh = (outh - remain_outh_start) / 4;
    for (int pp = 0; pp < nn_outh; pp++)
    {
        const int p = remain_outh_start + pp * 4;

        // shadowed variable for less openmp task args
        // (kept for symmetry with tier 1; this loop itself is not parallelized)
        const int elempack = bottom_blob.elempack;
        const int inh = bottom_blob.h * elempack;
        const int outw = top_blob.w;
        const int out_elempack = top_blob.elempack;

        __fp16* outptr = top_blob.row<__fp16>(p / out_elempack);

        for (int j = 0; j < outw; j++)
        {
            // four accumulators, each holding the 4 output rows of this block
            float16x4_t _sum0 = vdup_n_f16(0.f);
            float16x4_t _sum1 = vdup_n_f16(0.f);
            float16x4_t _sum2 = vdup_n_f16(0.f);
            float16x4_t _sum3 = vdup_n_f16(0.f);

            if (bias_data_ptr)
            {
                _sum0 = vld1_f16(bias_data_ptr + p);
            }

            // weight channels are numbered: all 8-row blocks first, then the 4-row block
            const __fp16* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4);

            int q = 0;
            // 8 input rows per step, kept as two 4-lane halves _r0 / _r1
            for (; q + 7 < inh; q += 8)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float16x4_t _r0;
                    float16x4_t _r1;
                    if (elempack == 8)
                    {
                        float16x8_t _r01 = vld1q_f16(r0);
                        _r0 = vget_low_f16(_r01);
                        _r1 = vget_high_f16(_r01);
                        r0 += dilation_w * 8;
                    }
                    else if (elempack == 4)
                    {
                        _r0 = vld1_f16(r0);
                        _r1 = vld1_f16(r0 + N);
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        _r0 = float16x4_t();
                        _r1 = float16x4_t();
                        _r0 = vset_lane_f16(r0[0], _r0, 0);
                        _r0 = vset_lane_f16(r0[N], _r0, 1);
                        _r0 = vset_lane_f16(r0[N * 2], _r0, 2);
                        _r0 = vset_lane_f16(r0[N * 3], _r0, 3);
                        _r1 = vset_lane_f16(r0[N * 4], _r1, 0);
                        _r1 = vset_lane_f16(r0[N * 5], _r1, 1);
                        _r1 = vset_lane_f16(r0[N * 6], _r1, 2);
                        _r1 = vset_lane_f16(r0[N * 7], _r1, 3);
                        r0 += dilation_w;
                    }

                    // _wI holds 4 weights (one per output row) for input row q + I
                    float16x4_t _w0 = vld1_f16(kptr);
                    float16x4_t _w1 = vld1_f16(kptr + 4);
                    float16x4_t _w2 = vld1_f16(kptr + 8);
                    float16x4_t _w3 = vld1_f16(kptr + 12);
                    float16x4_t _w4 = vld1_f16(kptr + 16);
                    float16x4_t _w5 = vld1_f16(kptr + 20);
                    float16x4_t _w6 = vld1_f16(kptr + 24);
                    float16x4_t _w7 = vld1_f16(kptr + 28);
                    _sum0 = vfma_lane_f16(_sum0, _w0, _r0, 0);
                    _sum1 = vfma_lane_f16(_sum1, _w1, _r0, 1);
                    _sum2 = vfma_lane_f16(_sum2, _w2, _r0, 2);
                    _sum3 = vfma_lane_f16(_sum3, _w3, _r0, 3);
                    _sum0 = vfma_lane_f16(_sum0, _w4, _r1, 0);
                    _sum1 = vfma_lane_f16(_sum1, _w5, _r1, 1);
                    _sum2 = vfma_lane_f16(_sum2, _w6, _r1, 2);
                    _sum3 = vfma_lane_f16(_sum3, _w7, _r1, 3);

                    // 8 input rows x 4 output rows consumed
                    kptr += 32;
                }
            }
            // 4 input rows per step
            for (; q + 3 < inh; q += 4)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float16x4_t _r0;
                    if (elempack == 4)
                    {
                        _r0 = vld1_f16(r0);
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        _r0 = float16x4_t();
                        _r0 = vset_lane_f16(r0[0], _r0, 0);
                        _r0 = vset_lane_f16(r0[N], _r0, 1);
                        _r0 = vset_lane_f16(r0[N * 2], _r0, 2);
                        _r0 = vset_lane_f16(r0[N * 3], _r0, 3);
                        r0 += dilation_w;
                    }

                    float16x4_t _w0 = vld1_f16(kptr);
                    float16x4_t _w1 = vld1_f16(kptr + 4);
                    float16x4_t _w2 = vld1_f16(kptr + 8);
                    float16x4_t _w3 = vld1_f16(kptr + 12);
                    _sum0 = vfma_lane_f16(_sum0, _w0, _r0, 0);
                    _sum1 = vfma_lane_f16(_sum1, _w1, _r0, 1);
                    _sum2 = vfma_lane_f16(_sum2, _w2, _r0, 2);
                    _sum3 = vfma_lane_f16(_sum3, _w3, _r0, 3);

                    kptr += 16;
                }
            }
            // 2 input rows per step (unpacked input only, rows addressed as row(q))
            for (; q + 1 < inh; q += 2)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    __fp16 val0;
                    __fp16 val1;
                    // if (elempack == 1)
                    {
                        val0 = r0[0];
                        val1 = r0[N];
                        r0 += dilation_w;
                    }

                    float16x4_t _w0 = vld1_f16(kptr);
                    float16x4_t _w1 = vld1_f16(kptr + 4);
                    _sum0 = vfma_n_f16(_sum0, _w0, val0);
                    _sum1 = vfma_n_f16(_sum1, _w1, val1);

                    kptr += 8;
                }
            }
            // last single input row
            for (; q < inh; q++)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float16x4_t _val;
                    // if (elempack == 1)
                    {
                        _val = vdup_n_f16(r0[0]);
                        r0 += dilation_w;
                    }

                    float16x4_t _w = vld1_f16(kptr);
                    _sum0 = vfma_f16(_sum0, _val, _w);

                    kptr += 4;
                }
            }

            // fold the four partial accumulators: (0 + 1) + (2 + 3)
            _sum0 = vadd_f16(_sum0, _sum1);
            _sum2 = vadd_f16(_sum2, _sum3);
            _sum0 = vadd_f16(_sum0, _sum2);

            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);

            if (out_elempack == 4)
            {
                vst1_f16(outptr, _sum0);
                outptr += 4;
            }
            else // if (out_elempack == 1)
            {
                outptr[0] = vget_lane_f16(_sum0, 0);
                outptr[M] = vget_lane_f16(_sum0, 1);
                outptr[M * 2] = vget_lane_f16(_sum0, 2);
                outptr[M * 3] = vget_lane_f16(_sum0, 3);
                outptr += 1;
            }
        }
    }
    // ---- tier 3: blocks of 2 output rows ----
    // Fewer than 4 rows remain, so this loop also runs at most once. Output rows are
    // addressed as row(p) / row(p + 1) directly, i.e. the output is treated as unpacked here.
    remain_outh_start += nn_outh * 4;
    nn_outh = (outh - remain_outh_start) / 2;
    for (int pp = 0; pp < nn_outh; pp++)
    {
        const int p = remain_outh_start + pp * 2;

        // shadowed variable for less openmp task args
        // (kept for symmetry with tier 1; this loop itself is not parallelized)
        const int elempack = bottom_blob.elempack;
        const int inh = bottom_blob.h * elempack;
        const int outw = top_blob.w;

        __fp16* outptr0 = top_blob.row<__fp16>(p);
        __fp16* outptr1 = top_blob.row<__fp16>(p + 1);

        for (int j = 0; j < outw; j++)
        {
            // scalar accumulators: seeded with the bias, fed by the 2- and 1-row input tiers,
            // and finally joined with the horizontally reduced vector accumulators
            __fp16 sum0 = 0.f;
            __fp16 sum1 = 0.f;

            if (bias_data_ptr)
            {
                sum0 = bias_data_ptr[p];
                sum1 = bias_data_ptr[p + 1];
            }

            const __fp16* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2);

            int q = 0;
            // vector accumulators: here the lanes run over input rows (not output rows),
            // _sum0 belongs to output row p, _sum1 to output row p + 1
            float16x8_t _sum0 = vdupq_n_f16(0.f);
            float16x8_t _sum1 = vdupq_n_f16(0.f);
            // 8 input rows per step
            for (; q + 7 < inh; q += 8)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float16x8_t _r0;
                    if (elempack == 8)
                    {
                        _r0 = vld1q_f16(r0);
                        r0 += dilation_w * 8;
                    }
                    else if (elempack == 4)
                    {
                        _r0 = vcombine_f16(vld1_f16(r0), vld1_f16(r0 + N));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        _r0 = float16x8_t();
                        _r0 = vsetq_lane_f16(r0[0], _r0, 0);
                        _r0 = vsetq_lane_f16(r0[N], _r0, 1);
                        _r0 = vsetq_lane_f16(r0[N * 2], _r0, 2);
                        _r0 = vsetq_lane_f16(r0[N * 3], _r0, 3);
                        _r0 = vsetq_lane_f16(r0[N * 4], _r0, 4);
                        _r0 = vsetq_lane_f16(r0[N * 5], _r0, 5);
                        _r0 = vsetq_lane_f16(r0[N * 6], _r0, 6);
                        _r0 = vsetq_lane_f16(r0[N * 7], _r0, 7);
                        r0 += dilation_w;
                    }

                    // _w0: 8 input-row weights for output row p, _w1: same for output row p + 1
                    float16x8_t _w0 = vld1q_f16(kptr);
                    float16x8_t _w1 = vld1q_f16(kptr + 8);
                    _sum0 = vfmaq_f16(_sum0, _r0, _w0);
                    _sum1 = vfmaq_f16(_sum1, _r0, _w1);

                    kptr += 16;
                }
            }
            // 4 input rows per step; only the low half of each vector accumulator is updated
            for (; q + 3 < inh; q += 4)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float16x4_t _r0;
                    if (elempack == 4)
                    {
                        _r0 = vld1_f16(r0);
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        _r0 = float16x4_t();
                        _r0 = vset_lane_f16(r0[0], _r0, 0);
                        _r0 = vset_lane_f16(r0[N], _r0, 1);
                        _r0 = vset_lane_f16(r0[N * 2], _r0, 2);
                        _r0 = vset_lane_f16(r0[N * 3], _r0, 3);
                        r0 += dilation_w;
                    }

                    float16x4_t _w0 = vld1_f16(kptr);
                    float16x4_t _w1 = vld1_f16(kptr + 4);
                    _sum0 = vcombine_f16(vfma_f16(vget_low_f16(_sum0), _r0, _w0), vget_high_f16(_sum0));
                    _sum1 = vcombine_f16(vfma_f16(vget_low_f16(_sum1), _r0, _w1), vget_high_f16(_sum1));

                    kptr += 8;
                }
            }
            // 2 input rows per step, scalar arithmetic (unpacked input only)
            for (; q + 1 < inh; q += 2)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    __fp16 val0;
                    __fp16 val1;
                    // if (elempack == 1)
                    {
                        val0 = r0[0];
                        val1 = r0[N];
                        r0 += dilation_w;
                    }

                    // weights are read as [row q -> out p, row q -> out p+1, row q+1 -> out p, row q+1 -> out p+1]
                    sum0 += val0 * kptr[0];
                    sum1 += val0 * kptr[1];
                    sum0 += val1 * kptr[2];
                    sum1 += val1 * kptr[3];

                    kptr += 4;
                }
            }
            // last single input row
            for (; q < inh; q++)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    __fp16 val;
                    // if (elempack == 1)
                    {
                        val = r0[0];
                        r0 += dilation_w;
                    }

                    sum0 += val * kptr[0];
                    sum1 += val * kptr[1];

                    kptr += 2;
                }
            }

            // horizontal reduction of both vector accumulators at once:
            // 8 lanes -> 4 lanes each, then two pairwise adds leave the total of _sum0 in
            // lane 0 and the total of _sum1 in lane 1
            float16x4_t _ss0 = vadd_f16(vget_low_f16(_sum0), vget_high_f16(_sum0));
            float16x4_t _ss1 = vadd_f16(vget_low_f16(_sum1), vget_high_f16(_sum1));
            float16x4_t _ss = vpadd_f16(_ss0, _ss1);
            _ss = vpadd_f16(_ss, _ss);
            sum0 += vget_lane_f16(_ss, 0);
            sum1 += vget_lane_f16(_ss, 1);

            sum0 = activation_ss_f16(sum0, activation_type, activation_params);
            sum1 = activation_ss_f16(sum1, activation_type, activation_params);

            outptr0[0] = sum0;
            outptr1[0] = sum1;
            outptr0 += 1;
            outptr1 += 1;
        }
    }
    // ---- tier 4: remaining single output row (at most one, since fewer than 2 rows remain) ----
    // Uses the function-level elempack / inh / outw; output row addressed as row(p) directly.
    remain_outh_start += nn_outh * 2;
    for (int p = remain_outh_start; p < outh; p++)
    {
        __fp16* outptr = top_blob.row<__fp16>(p);

        for (int j = 0; j < outw; j++)
        {
            // scalar accumulator, seeded with the bias
            __fp16 sum = 0.f;

            if (bias_data_ptr)
            {
                sum = bias_data_ptr[p];
            }

            const __fp16* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2);

            int q = 0;
            // vector accumulator whose lanes run over input rows
            float16x8_t _sum = vdupq_n_f16(0.f);
            // 8 input rows per step
            for (; q + 7 < inh; q += 8)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float16x8_t _r0;
                    if (elempack == 8)
                    {
                        _r0 = vld1q_f16(r0);
                        r0 += dilation_w * 8;
                    }
                    else if (elempack == 4)
                    {
                        _r0 = vcombine_f16(vld1_f16(r0), vld1_f16(r0 + N));
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        _r0 = float16x8_t();
                        _r0 = vsetq_lane_f16(r0[0], _r0, 0);
                        _r0 = vsetq_lane_f16(r0[N], _r0, 1);
                        _r0 = vsetq_lane_f16(r0[N * 2], _r0, 2);
                        _r0 = vsetq_lane_f16(r0[N * 3], _r0, 3);
                        _r0 = vsetq_lane_f16(r0[N * 4], _r0, 4);
                        _r0 = vsetq_lane_f16(r0[N * 5], _r0, 5);
                        _r0 = vsetq_lane_f16(r0[N * 6], _r0, 6);
                        _r0 = vsetq_lane_f16(r0[N * 7], _r0, 7);
                        r0 += dilation_w;
                    }

                    float16x8_t _w0 = vld1q_f16(kptr);
                    _sum = vfmaq_f16(_sum, _r0, _w0);

                    kptr += 8;
                }
            }
            // 4 input rows per step; only the low half of _sum is updated
            for (; q + 3 < inh; q += 4)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q / elempack) + j * stride_w * elempack;

                for (int k = 0; k < kernel_w; k++)
                {
                    float16x4_t _r0;
                    if (elempack == 4)
                    {
                        _r0 = vld1_f16(r0);
                        r0 += dilation_w * 4;
                    }
                    else // if (elempack == 1)
                    {
                        _r0 = float16x4_t();
                        _r0 = vset_lane_f16(r0[0], _r0, 0);
                        _r0 = vset_lane_f16(r0[N], _r0, 1);
                        _r0 = vset_lane_f16(r0[N * 2], _r0, 2);
                        _r0 = vset_lane_f16(r0[N * 3], _r0, 3);
                        r0 += dilation_w;
                    }

                    float16x4_t _w = vld1_f16(kptr);
                    _sum = vcombine_f16(vfma_f16(vget_low_f16(_sum), _r0, _w), vget_high_f16(_sum));

                    kptr += 4;
                }
            }
            // 2 input rows per step, scalar arithmetic (unpacked input only)
            for (; q + 1 < inh; q += 2)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    __fp16 val0;
                    __fp16 val1;
                    // if (elempack == 1)
                    {
                        val0 = r0[0];
                        val1 = r0[N];
                        r0 += dilation_w;
                    }

                    sum += val0 * kptr[0];
                    sum += val1 * kptr[1];

                    kptr += 2;
                }
            }
            // last single input row
            for (; q < inh; q++)
            {
                const __fp16* r0 = bottom_blob.row<const __fp16>(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    __fp16 val;
                    // if (elempack == 1)
                    {
                        val = r0[0];
                        r0 += dilation_w;
                    }

                    sum += val * kptr[0];

                    kptr += 1;
                }
            }

            // horizontal reduction: 8 lanes -> 4 lanes, then two pairwise adds leave the total in lane 0
            float16x4_t _ss = vadd_f16(vget_low_f16(_sum), vget_high_f16(_sum));
            _ss = vpadd_f16(_ss, _ss);
            _ss = vpadd_f16(_ss, _ss);
            sum += vget_lane_f16(_ss, 0);

            sum = activation_ss_f16(sum, activation_type, activation_params);

            outptr[0] = sum;
            outptr += 1;
        }
    }
}


================================================
FILE: src/layer/arm/convolution_1x1.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv1x1s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    int nn_outch = 0;
    int remain_outch_start = 0;

#if __ARM_NEON && __aarch64__

    nn_outch = outch >> 3;
    remain_outch_start = nn_outch << 3;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 8;

        Mat out0 = top_blob.channel(p);
        Mat out1 = top_blob.channel(p + 1);
        Mat out2 = top_blob.channel(p + 2);
        Mat out3 = top_blob.channel(p + 3);
        Mat out4 = top_blob.channel(p + 4);
        Mat out5 = top_blob.channel(p + 5);
        Mat out6 = top_blob.channel(p + 6);
        Mat out7 = top_blob.channel(p + 7);

        const float bias0 = bias ? bias[p] : 0.f;
        const float bias1 = bias ? bias[p + 1] : 0.f;
        const float bias2 = bias ? bias[p + 2] : 0.f;
        const float bias3 = bias ? bias[p + 3] : 0.f;
        const float bias4 = bias ? bias[p + 4] : 0.f;
        const float bias5 = bias ? bias[p + 5] : 0.f;
        const float bias6 = bias ? bias[p + 6] : 0.f;
        const float bias7 = bias ? bias[p + 7] : 0.f;

        out0.fill(bias0);
        out1.fill(bias1);
        out2.fill(bias2);
        out3.fill(bias3);
        out4.fill(bias4);
        out5.fill(bias5);
        out6.fill(bias6);
        out7.fill(bias7);

        int q = 0;

        for (; q + 7 < inch; q += 8)
        {
            float* outptr0 = out0;
            float* outptr1 = out1;
            float* outptr2 = out2;
            float* outptr3 = out3;
            float* outptr4 = out4;
            float* outptr5 = out5;
            float* outptr6 = out6;
            float* outptr7 = out7;

            const float* img0 = bottom_blob.channel(q);
            const float* img1 = bottom_blob.channel(q + 1);
            const float* img2 = bottom_blob.channel(q + 2);
            const float* img3 = bottom_blob.channel(q + 3);
            const float* img4 = bottom_blob.channel(q + 4);
            const float* img5 = bottom_blob.channel(q + 5);
            const float* img6 = bottom_blob.channel(q + 6);
            const float* img7 = bottom_blob.channel(q + 7);

            const float* kernel0 = kernel + p * inch + q;
            const float* kernel1 = kernel + (p + 1) * inch + q;
            const float* kernel2 = kernel + (p + 2) * inch + q;
            const float* kernel3 = kernel + (p + 3) * inch + q;
            const float* kernel4 = kernel + (p + 4) * inch + q;
            const float* kernel5 = kernel + (p + 5) * inch + q;
            const float* kernel6 = kernel + (p + 6) * inch + q;
            const float* kernel7 = kernel + (p + 7) * inch + q;

            const float* r0 = img0;
            const float* r1 = img1;
            const float* r2 = img2;
            const float* r3 = img3;
            const float* r4 = img4;
            const float* r5 = img5;
            const float* r6 = img6;
            const float* r7 = img7;

            int size = outw * outh;

            int nn = size >> 2;
            int remain = size & 3;

            float32x4_t _k0 = vld1q_f32(kernel0);
            float32x4_t _k1 = vld1q_f32(kernel1);
            float32x4_t _k2 = vld1q_f32(kernel2);
            float32x4_t _k3 = vld1q_f32(kernel3);
            float32x4_t _k4 = vld1q_f32(kernel4);
            float32x4_t _k5 = vld1q_f32(kernel5);
            float32x4_t _k6 = vld1q_f32(kernel6);
            float32x4_t _k7 = vld1q_f32(kernel7);

            float32x4_t _k0n = vld1q_f32(kernel0 + 4);
            float32x4_t _k1n = vld1q_f32(kernel1 + 4);
            float32x4_t _k2n = vld1q_f32(kernel2 + 4);
            float32x4_t _k3n = vld1q_f32(kernel3 + 4);
            float32x4_t _k4n = vld1q_f32(kernel4 + 4);
            float32x4_t _k5n = vld1q_f32(kernel5 + 4);
            float32x4_t _k6n = vld1q_f32(kernel6 + 4);
            float32x4_t _k7n = vld1q_f32(kernel7 + 4);

#ifdef __clang__
            // gcc rejects asm statements with more than 30 operands :(
            if (nn > 0)
            {
                asm volatile(
                    "prfm   pldl1keep, [%9, #128]       \n"
                    "ld1    {v17.4s}, [%9], #16         \n"

                    "prfm   pldl1keep, [%1, #128]       \n"
                    "ld1    {v18.4s}, [%1]              \n"

                    "prfm   pldl1keep, [%2, #128]       \n"
                    "ld1    {v19.4s}, [%2]              \n"

                    "0:                                 \n"

                    "fmla   v18.4s, v17.4s, %34.s[0]    \n"

                    "prfm   pldl1keep, [%3, #128]       \n"
                    "ld1    {v20.4s}, [%3]              \n"

                    "fmla   v19.4s, v17.4s, %35.s[0]    \n"

                    "prfm   pldl1keep, [%4, #128]       \n"
                    "ld1    {v21.4s}, [%4]              \n"

                    "fmla   v20.4s, v17.4s, %36.s[0]    \n"

                    "prfm   pldl1keep, [%5, #128]       \n"
                    "ld1    {v22.4s}, [%5]              \n"

                    "fmla   v21.4s, v17.4s, %37.s[0]    \n"

                    "prfm   pldl1keep, [%6, #128]       \n"
                    "ld1    {v23.4s}, [%6]              \n"

                    "fmla   v22.4s, v17.4s, %38.s[0]    \n"

                    "prfm   pldl1keep, [%10, #128]      \n"
                    "ld1    {v16.4s}, [%10], #16        \n"

                    "fmla   v23.4s, v17.4s, %39.s[0]    \n"

                    "prfm   pldl1keep, [%7, #128]       \n"
                    "ld1    {v24.4s}, [%7]              \n"

                    "fmla   v18.4s, v16.4s, %34.s[1]    \n"
                    "fmla   v19.4s, v16.4s, %35.s[1]    \n"

                    "prfm   pldl1keep, [%8, #128]       \n"
                    "ld1    {v25.4s}, [%8]              \n"

                    "fmla   v24.4s, v17.4s, %40.s[0]    \n"
                    "fmla   v25.4s, v17.4s, %41.s[0]    \n"

                    "fmla   v20.4s, v16.4s, %36.s[1]    \n"
                    "fmla   v21.4s, v16.4s, %37.s[1]    \n"

                    "prfm   pldl1keep, [%11, #128]      \n"
                    "ld1    {v17.4s}, [%11], #16        \n"

                    "fmla   v22.4s, v16.4s, %38.s[1]    \n"
                    "fmla   v23.4s, v16.4s, %39.s[1]    \n"

                    "fmla   v18.4s, v17.4s, %34.s[2]    \n"
                    "fmla   v19.4s, v17.4s, %35.s[2]    \n"

                    "fmla   v24.4s, v16.4s, %40.s[1]    \n"
                    "fmla   v25.4s, v16.4s, %41.s[1]    \n"

                    "fmla   v20.4s, v17.4s, %36.s[2]    \n"
                    "fmla   v21.4s, v17.4s, %37.s[2]    \n"

                    "prfm   pldl1keep, [%12, #128]      \n"
                    "ld1    {v16.4s}, [%12], #16        \n"

                    "fmla   v22.4s, v17.4s, %38.s[2]    \n"
                    "fmla   v23.4s, v17.4s, %39.s[2]    \n"

                    "fmla   v18.4s, v16.4s, %34.s[3]    \n"
                    "fmla   v19.4s, v16.4s, %35.s[3]    \n"

                    "fmla   v24.4s, v17.4s, %40.s[2]    \n"
                    "fmla   v25.4s, v17.4s, %41.s[2]    \n"

                    "fmla   v20.4s, v16.4s, %36.s[3]    \n"
                    "fmla   v21.4s, v16.4s, %37.s[3]    \n"

                    "prfm   pldl1keep, [%13, #128]      \n"
                    "ld1    {v17.4s}, [%13], #16        \n"

                    "fmla   v22.4s, v16.4s, %38.s[3]    \n"
                    "fmla   v23.4s, v16.4s, %39.s[3]    \n"

                    "fmla   v18.4s, v17.4s, %42.s[0]    \n"
                    "fmla   v19.4s, v17.4s, %43.s[0]    \n"

                    "fmla   v24.4s, v16.4s, %40.s[3]    \n"
                    "fmla   v25.4s, v16.4s, %41.s[3]    \n"

                    "fmla   v20.4s, v17.4s, %44.s[0]    \n"
                    "fmla   v21.4s, v17.4s, %45.s[0]    \n"

                    "prfm   pldl1keep, [%14, #128]      \n"
                    "ld1    {v16.4s}, [%14], #16        \n"

                    "fmla   v22.4s, v17.4s, %46.s[0]    \n"
                    "fmla   v23.4s, v17.4s, %47.s[0]    \n"

                    "fmla   v18.4s, v16.4s, %42.s[1]    \n"
                    "fmla   v19.4s, v16.4s, %43.s[1]    \n"

                    "fmla   v24.4s, v17.4s, %48.s[0]    \n"
                    "fmla   v25.4s, v17.4s, %49.s[0]    \n"

                    "fmla   v20.4s, v16.4s, %44.s[1]    \n"
                    "fmla   v21.4s, v16.4s, %45.s[1]    \n"

                    "prfm   pldl1keep, [%15, #128]      \n"
                    "ld1    {v17.4s}, [%15], #16        \n"

                    "fmla   v22.4s, v16.4s, %46.s[1]    \n"
                    "fmla   v23.4s, v16.4s, %47.s[1]    \n"

                    "fmla   v18.4s, v17.4s, %42.s[2]    \n"
                    "fmla   v19.4s, v17.4s, %43.s[2]    \n"

                    "fmla   v24.4s, v16.4s, %48.s[1]    \n"
                    "fmla   v25.4s, v16.4s, %49.s[1]    \n"

                    "fmla   v20.4s, v17.4s, %44.s[2]    \n"
                    "fmla   v21.4s, v17.4s, %45.s[2]    \n"

                    "prfm   pldl1keep, [%16, #128]      \n"
                    "ld1    {v16.4s}, [%16], #16        \n"

                    "fmla   v22.4s, v17.4s, %46.s[2]    \n"
                    "fmla   v23.4s, v17.4s, %47.s[2]    \n"

                    "fmla   v18.4s, v16.4s, %42.s[3]    \n"
                    "fmla   v19.4s, v16.4s, %43.s[3]    \n"

                    "fmla   v24.4s, v17.4s, %48.s[2]    \n"
                    "fmla   v25.4s, v17.4s, %49.s[2]    \n"

                    "fmla   v20.4s, v16.4s, %44.s[3]    \n"
                    "fmla   v21.4s, v16.4s, %45.s[3]    \n"

                    "st1    {v18.4s}, [%1], #16         \n"

                    "fmla   v22.4s, v16.4s, %46.s[3]    \n"

                    "st1    {v19.4s}, [%2], #16         \n"

                    "fmla   v23.4s, v16.4s, %47.s[3]    \n"

                    "st1    {v20.4s}, [%3], #16         \n"

                    "prfm   pldl1keep, [%9, #128]       \n"
                    "ld1    {v17.4s}, [%9], #16         \n"

                    "fmla   v24.4s, v16.4s, %48.s[3]    \n"

                    "st1    {v21.4s}, [%4], #16         \n"

                    "fmla   v25.4s, v16.4s, %49.s[3]    \n"

                    "st1    {v22.4s}, [%5], #16         \n"

                    "prfm   pldl1keep, [%1, #128]       \n"
                    "ld1    {v18.4s}, [%1]              \n"

                    "st1    {v23.4s}, [%6], #16         \n"

                    "prfm   pldl1keep, [%2, #128]       \n"
                    "ld1    {v19.4s}, [%2]              \n"

                    "st1    {v24.4s}, [%7], #16         \n"

                    "subs   %w0, %w0, #1                \n"

                    "st1    {v25.4s}, [%8], #16         \n"

                    "bne    0b                          \n"
                    "sub    %9, %9, #16                 \n"
                    : "=r"(nn),      // %0
                    "=r"(outptr0), // %1
                    "=r"(outptr1), // %2
                    "=r"(outptr2), // %3
                    "=r"(outptr3), // %4
                    "=r"(outptr4), // %5
                    "=r"(outptr5), // %6
                    "=r"(outptr6), // %7
                    "=r"(outptr7), // %8
                    "=r"(r0),      // %9
                    "=r"(r1),      // %10
                    "=r"(r2),      // %11
                    "=r"(r3),      // %12
                    "=r"(r4),      // %13
                    "=r"(r5),      // %14
                    "=r"(r6),      // %15
                    "=r"(r7)       // %16
                    : "0"(nn),
                    "1"(outptr0),
                    "2"(outptr1),
                    "3"(outptr2),
                    "4"(outptr3),
                    "5"(outptr4),
                    "6"(outptr5),
                    "7"(outptr6),
                    "8"(outptr7),
                    "9"(r0),
                    "10"(r1),
                    "11"(r2),
                    "12"(r3),
                    "13"(r4),
                    "14"(r5),
                    "15"(r6),
                    "16"(r7),
                    "w"(_k0),                                                                            // %34
                    "w"(_k1),                                                                            // %35
                    "w"(_k2),                                                                            // %36
                    "w"(_k3),                                                                            // %37
                    "w"(_k4),                                                                            // %38
                    "w"(_k5),                                                                            // %39
                    "w"(_k6),                                                                            // %40
                    "w"(_k7),                                                                            // %41
                    "w"(_k0n),                                                                           // %42
                    "w"(_k1n),                                                                           // %43
                    "w"(_k2n),                                                                           // %44
                    "w"(_k3n),                                                                           // %45
                    "w"(_k4n),                                                                           // %46
                    "w"(_k5n),                                                                           // %47
                    "w"(_k6n),                                                                           // %48
                    "w"(_k7n)                                                                            // %49
                    : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25" //, "v26", "v27", "v28", "v29", "v30", "v31"
                );
            }
#else
            for (; nn > 0; nn--)
            {
                float32x4_t _p = vld1q_f32(r0);

                float32x4_t _out0p = vld1q_f32(outptr0);
                float32x4_t _out1p = vld1q_f32(outptr1);
                float32x4_t _out2p = vld1q_f32(outptr2);
                float32x4_t _out3p = vld1q_f32(outptr3);
                float32x4_t _out4p = vld1q_f32(outptr4);
                float32x4_t _out5p = vld1q_f32(outptr5);
                float32x4_t _out6p = vld1q_f32(outptr6);
                float32x4_t _out7p = vld1q_f32(outptr7);

                _out0p = vfmaq_laneq_f32(_out0p, _p, _k0, 0);
                _out1p = vfmaq_laneq_f32(_out1p, _p, _k1, 0);
                _out2p = vfmaq_laneq_f32(_out2p, _p, _k2, 0);
                _out3p = vfmaq_laneq_f32(_out3p, _p, _k3, 0);
                _out4p = vfmaq_laneq_f32(_out4p, _p, _k4, 0);
                _out5p = vfmaq_laneq_f32(_out5p, _p, _k5, 0);
                _out6p = vfmaq_laneq_f32(_out6p, _p, _k6, 0);
                _out7p = vfmaq_laneq_f32(_out7p, _p, _k7, 0);

                float32x4_t _p1 = vld1q_f32(r1);

                _out0p = vfmaq_laneq_f32(_out0p, _p1, _k0, 1);
                _out1p = vfmaq_laneq_f32(_out1p, _p1, _k1, 1);
                _out2p = vfmaq_laneq_f32(_out2p, _p1, _k2, 1);
                _out3p = vfmaq_laneq_f32(_out3p, _p1, _k3, 1);
                _out4p = vfmaq_laneq_f32(_out4p, _p1, _k4, 1);
                _out5p = vfmaq_laneq_f32(_out5p, _p1, _k5, 1);
                _out6p = vfmaq_laneq_f32(_out6p, _p1, _k6, 1);
                _out7p = vfmaq_laneq_f32(_out7p, _p1, _k7, 1);

                float32x4_t _p2 = vld1q_f32(r2);

                _out0p = vfmaq_laneq_f32(_out0p, _p2, _k0, 2);
                _out1p = vfmaq_laneq_f32(_out1p, _p2, _k1, 2);
                _out2p = vfmaq_laneq_f32(_out2p, _p2, _k2, 2);
                _out3p = vfmaq_laneq_f32(_out3p, _p2, _k3, 2);
                _out4p = vfmaq_laneq_f32(_out4p, _p2, _k4, 2);
                _out5p = vfmaq_laneq_f32(_out5p, _p2, _k5, 2);
                _out6p = vfmaq_laneq_f32(_out6p, _p2, _k6, 2);
                _out7p = vfmaq_laneq_f32(_out7p, _p2, _k7, 2);

                float32x4_t _p3 = vld1q_f32(r3);

                _out0p = vfmaq_laneq_f32(_out0p, _p3, _k0, 3);
                _out1p = vfmaq_laneq_f32(_out1p, _p3, _k1, 3);
                _out2p = vfmaq_laneq_f32(_out2p, _p3, _k2, 3);
                _out3p = vfmaq_laneq_f32(_out3p, _p3, _k3, 3);
                _out4p = vfmaq_laneq_f32(_out4p, _p3, _k4, 3);
                _out5p = vfmaq_laneq_f32(_out5p, _p3, _k5, 3);
                _out6p = vfmaq_laneq_f32(_out6p, _p3, _k6, 3);
                _out7p = vfmaq_laneq_f32(_out7p, _p3, _k7, 3);

                float32x4_t _p4 = vld1q_f32(r4);

                _out0p = vfmaq_laneq_f32(_out0p, _p4, _k0n, 0);
                _out1p = vfmaq_laneq_f32(_out1p, _p4, _k1n, 0);
                _out2p = vfmaq_laneq_f32(_out2p, _p4, _k2n, 0);
                _out3p = vfmaq_laneq_f32(_out3p, _p4, _k3n, 0);
                _out4p = vfmaq_laneq_f32(_out4p, _p4, _k4n, 0);
                _out5p = vfmaq_laneq_f32(_out5p, _p4, _k5n, 0);
                _out6p = vfmaq_laneq_f32(_out6p, _p4, _k6n, 0);
                _out7p = vfmaq_laneq_f32(_out7p, _p4, _k7n, 0);

                float32x4_t _p5 = vld1q_f32(r5);

                _out0p = vfmaq_laneq_f32(_out0p, _p5, _k0n, 1);
                _out1p = vfmaq_laneq_f32(_out1p, _p5, _k1n, 1);
                _out2p = vfmaq_laneq_f32(_out2p, _p5, _k2n, 1);
                _out3p = vfmaq_laneq_f32(_out3p, _p5, _k3n, 1);
                _out4p = vfmaq_laneq_f32(_out4p, _p5, _k4n, 1);
                _out5p = vfmaq_laneq_f32(_out5p, _p5, _k5n, 1);
                _out6p = vfmaq_laneq_f32(_out6p, _p5, _k6n, 1);
                _out7p = vfmaq_laneq_f32(_out7p, _p5, _k7n, 1);

                float32x4_t _p6 = vld1q_f32(r6);

                _out0p = vfmaq_laneq_f32(_out0p, _p6, _k0n, 2);
                _out1p = vfmaq_laneq_f32(_out1p, _p6, _k1n, 2);
                _out2p = vfmaq_laneq_f32(_out2p, _p6, _k2n, 2);
                _out3p = vfmaq_laneq_f32(_out3p, _p6, _k3n, 2);
                _out4p = vfmaq_laneq_f32(_out4p, _p6, _k4n, 2);
                _out5p = vfmaq_laneq_f32(_out5p, _p6, _k5n, 2);
                _out6p = vfmaq_laneq_f32(_out6p, _p6, _k6n, 2);
                _out7p = vfmaq_laneq_f32(_out7p, _p6, _k7n, 2);

                float32x4_t _p7 = vld1q_f32(r7);

                _out0p = vfmaq_laneq_f32(_out0p, _p7, _k0n, 3);
                _out1p = vfmaq_laneq_f32(_out1p, _p7, _k1n, 3);
                _out2p = vfmaq_laneq_f32(_out2p, _p7, _k2n, 3);
                _out3p = vfmaq_laneq_f32(_out3p, _p7, _k3n, 3);
                _out4p = vfmaq_laneq_f32(_out4p, _p7, _k4n, 3);
                _out5p = vfmaq_laneq_f32(_out5p, _p7, _k5n, 3);
                _out6p = vfmaq_laneq_f32(_out6p, _p7, _k6n, 3);
                _out7p = vfmaq_laneq_f32(_out7p, _p7, _k7n, 3);

                vst1q_f32(outptr0, _out0p);
                vst1q_f32(outptr1, _out1p);
                vst1q_f32(outptr2, _out2p);
                vst1q_f32(outptr3, _out3p);
                vst1q_f32(outptr4, _out4p);
                vst1q_f32(outptr5, _out5p);
                vst1q_f32(outptr6, _out6p);
                vst1q_f32(outptr7, _out7p);

                r0 += 4;
                r1 += 4;
                r2 += 4;
                r3 += 4;
                r4 += 4;
                r5 += 4;
                r6 += 4;
                r7 += 4;
                outptr0 += 4;
                outptr1 += 4;
                outptr2 += 4;
                outptr3 += 4;
                outptr4 += 4;
                outptr5 += 4;
                outptr6 += 4;
                outptr7 += 4;
            }
#endif
            for (; remain > 0; remain--)
            {
                // TODO neon optimize
                float sum0 = *r0 * kernel0[0] + *r1 * kernel0[1] + *r2 * kernel0[2] + *r3 * kernel0[3] + *r4 * kernel0[4] + *r5 * kernel0[5] + *r6 * kernel0[6] + *r7 * kernel0[7];
                float sum1 = *r0 * kernel1[0] + *r1 * kernel1[1] + *r2 * kernel1[2] + *r3 * kernel1[3] + *r4 * kernel1[4] + *r5 * kernel1[5] + *r6 * kernel1[6] + *r7 * kernel1[7];
                float sum2 = *r0 * kernel2[0] + *r1 * kernel2[1] + *r2 * kernel2[2] + *r3 * kernel2[3] + *r4 * kernel2[4] + *r5 * kernel2[5] + *r6 * kernel2[6] + *r7 * kernel2[7];
                float sum3 = *r0 * kernel3[0] + *r1 * kernel3[1] + *r2 * kernel3[2] + *r3 * kernel3[3] + *r4 * kernel3[4] + *r5 * kernel3[5] + *r6 * kernel3[6] + *r7 * kernel3[7];
                float sum4 = *r0 * kernel4[0] + *r1 * kernel4[1] + *r2 * kernel4[2] + *r3 * kernel4[3] + *r4 * kernel4[4] + *r5 * kernel4[5] + *r6 * kernel4[6] + *r7 * kernel4[7];
                float sum5 = *r0 * kernel5[0] + *r1 * kernel5[1] + *r2 * kernel5[2] + *r3 * kernel5[3] + *r4 * kernel5[4] + *r5 * kernel5[5] + *r6 * kernel5[6] + *r7 * kernel5[7];
                float sum6 = *r0 * kernel6[0] + *r1 * kernel6[1] + *r2 * kernel6[2] + *r3 * kernel6[3] + *r4 * kernel6[4] + *r5 * kernel6[5] + *r6 * kernel6[6] + *r7 * kernel6[7];
                float sum7 = *r0 * kernel7[0] + *r1 * kernel7[1] + *r2 * kernel7[2] + *r3 * kernel7[3] + *r4 * kernel7[4] + *r5 * kernel7[5] + *r6 * kernel7[6] + *r7 * kernel7[7];

                *outptr0 += sum0;
                *outptr1 += sum1;
                *outptr2 += sum2;
                *outptr3 += sum3;
                *outptr4 += sum4;
                *outptr5 += sum5;
                *outptr6 += sum6;
                *outptr7 += sum7;

                r0++;
                r1++;
                r2++;
                r3++;
                r4++;
                r5++;
                r6++;
                r7++;
                outptr0++;
                outptr1++;
                outptr2++;
                outptr3++;
                outptr4++;
                outptr5++;
                outptr6++;
                outptr7++;
            }
        }

        for (; q < inch; q++)
        {
            float* outptr0 = out0;
            float* outptr1 = out1;
            float* outptr2 = out2;
            float* outptr3 = out3;
            float* outptr4 = out4;
            float* outptr5 = out5;
            float* outptr6 = out6;
            float* outptr7 = out7;

            const float* img0 = bottom_blob.channel(q);

            const float* kernel0 = kernel + p * inch + q;
            const float* kernel1 = kernel + (p + 1) * inch + q;
            const float* kernel2 = kernel + (p + 2) * inch + q;
            const float* kernel3 = kernel + (p + 3) * inch + q;
            const float* kernel4 = kernel + (p + 4) * inch + q;
            const float* kernel5 = kernel + (p + 5) * inch + q;
            const float* kernel6 = kernel + (p + 6) * inch + q;
            const float* kernel7 = kernel + (p + 7) * inch + q;

            const float k0 = kernel0[0];
            const float k1 = kernel1[0];
            const float k2 = kernel2[0];
            const float k3 = kernel3[0];
            const float k4 = kernel4[0];
            const float k5 = kernel5[0];
            const float k6 = kernel6[0];
            const float k7 = kernel7[0];

            const float* r0 = img0;

            int size = outw * outh;

            int nn = size >> 2;
            int remain = size & 3;

            float32x4_t _k0 = vdupq_n_f32(k0);
            float32x4_t _k1 = vdupq_n_f32(k1);
            float32x4_t _k2 = vdupq_n_f32(k2);
            float32x4_t _k3 = vdupq_n_f32(k3);
            float32x4_t _k4 = vdupq_n_f32(k4);
            float32x4_t _k5 = vdupq_n_f32(k5);
            float32x4_t _k6 = vdupq_n_f32(k6);
            float32x4_t _k7 = vdupq_n_f32(k7);

            for (; nn > 0; nn--)
            {
                float32x4_t _p = vld1q_f32(r0);

                float32x4_t _out0p = vld1q_f32(outptr0);
                float32x4_t _out1p = vld1q_f32(outptr1);
                float32x4_t _out2p = vld1q_f32(outptr2);
                float32x4_t _out3p = vld1q_f32(outptr3);
                float32x4_t _out4p = vld1q_f32(outptr4);
                float32x4_t _out5p = vld1q_f32(outptr5);
                float32x4_t _out6p = vld1q_f32(outptr6);
                float32x4_t _out7p = vld1q_f32(outptr7);

                _out0p = vfmaq_f32(_out0p, _p, _k0);
                _out1p = vfmaq_f32(_out1p, _p, _k1);
                _out2p = vfmaq_f32(_out2p, _p, _k2);
                _out3p = vfmaq_f32(_out3p, _p, _k3);
                _out4p = vfmaq_f32(_out4p, _p, _k4);
                _out5p = vfmaq_f32(_out5p, _p, _k5);
                _out6p = vfmaq_f32(_out6p, _p, _k6);
                _out7p = vfmaq_f32(_out7p, _p, _k7);

                vst1q_f32(outptr0, _out0p);
                vst1q_f32(outptr1, _out1p);
                vst1q_f32(outptr2, _out2p);
                vst1q_f32(outptr3, _out3p);
                vst1q_f32(outptr4, _out4p);
                vst1q_f32(outptr5, _out5p);
                vst1q_f32(outptr6, _out6p);
                vst1q_f32(outptr7, _out7p);

                r0 += 4;
                outptr0 += 4;
                outptr1 += 4;
                outptr2 += 4;
                outptr3 += 4;
                outptr4 += 4;
                outptr5 += 4;
                outptr6 += 4;
                outptr7 += 4;
            }
            for (; remain > 0; remain--)
            {
                // TODO neon optimize
                float sum0 = *r0 * k0;
                float sum1 = *r0 * k1;
                float sum2 = *r0 * k2;
                float sum3 = *r0 * k3;
                float sum4 = *r0 * k4;
                float sum5 = *r0 * k5;
                float sum6 = *r0 * k6;
                float sum7 = *r0 * k7;

                *outptr0 += sum0;
                *outptr1 += sum1;
                *outptr2 += sum2;
                *outptr3 += sum3;
                *outptr4 += sum4;
                *outptr5 += sum5;
                *outptr6 += sum6;
                *outptr7 += sum7;

                r0++;
                outptr0++;
                outptr1++;
                outptr2++;
                outptr3++;
                outptr4++;
                outptr5++;
                outptr6++;
                outptr7++;
            }
        }
    }

#else

    nn_outch = outch / 6;
    remain_outch_start = nn_outch * 6;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 6;

        Mat out0 = top_blob.channel(p);
        Mat out1 = top_blob.channel(p + 1);
        Mat out2 = top_blob.channel(p + 2);
        Mat out3 = top_blob.channel(p + 3);
        Mat out4 = top_blob.channel(p + 4);
        Mat out5 = top_blob.channel(p + 5);

        const float bias0 = bias ? bias[p] : 0.f;
        const float bias1 = bias ? bias[p + 1] : 0.f;
        const float bias2 = bias ? bias[p + 2] : 0.f;
        const float bias3 = bias ? bias[p + 3] : 0.f;
        const float bias4 = bias ? bias[p + 4] : 0.f;
        const float bias5 = bias ? bias[p + 5] : 0.f;

        out0.fill(bias0);
        out1.fill(bias1);
        out2.fill(bias2);
        out3.fill(bias3);
        out4.fill(bias4);
        out5.fill(bias5);

        int q = 0;

        for (; q + 3 < inch; q += 4)
        {
            float* outptr0 = out0;
            float* outptr1 = out1;
            float* outptr2 = out2;
            float* outptr3 = out3;
            float* outptr4 = out4;
            float* outptr5 = out5;

            const float* img0 = bottom_blob.channel(q);
            const float* img1 = bottom_blob.channel(q + 1);
            const float* img2 = bottom_blob.channel(q + 2);
            const float* img3 = bottom_blob.channel(q + 3);

            const float* kernel0 = kernel + p * inch + q;
            const float* kernel1 = kernel + (p + 1) * inch + q;
            const float* kernel2 = kernel + (p + 2) * inch + q;
            const float* kernel3 = kernel + (p + 3) * inch + q;
            const float* kernel4 = kernel + (p + 4) * inch + q;
            const float* kernel5 = kernel + (p + 5) * inch + q;

            const float* r0 = img0;
            const float* r1 = img1;
            const float* r2 = img2;
            const float* r3 = img3;

            int size = outw * outh;

#if __ARM_NEON
            int nn = size >> 2;
            int remain = size & 3;
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _k0 = vld1q_f32(kernel0);
            float32x4_t _k1 = vld1q_f32(kernel1);
            float32x4_t _k2 = vld1q_f32(kernel2);
            float32x4_t _k3 = vld1q_f32(kernel3);
            float32x4_t _k4 = vld1q_f32(kernel4);
            float32x4_t _k5 = vld1q_f32(kernel5);

            for (; nn > 0; nn--)
            {
                asm volatile(
                    "pld        [%6, #128]              \n"
                    "vld1.f32   {d24-d25}, [%6 :128]!   \n" // q12 = r0

                    "pld        [%0, #128]              \n"
                    "vld1.f32   {d12-d13}, [%0 :128]    \n" // q6 = outptr0

                    "pld        [%1, #128]              \n"
                    "vld1.f32   {d14-d15}, [%1 :128]    \n" // q7 = outptr1

                    "vmla.f32   q6, q12, %e20[0]        \n"

                    "pld        [%2, #128]              \n"
                    "vld1.f32   {d16-d17}, [%2 :128]    \n" // q8 = outptr2

                    "vmla.f32   q7, q12, %e21[0]        \n"

                    "pld        [%3, #128]              \n"
                    "vld1.f32   {d18-d19}, [%3 :128]    \n" // q9 = outptr3

                    "vmla.f32   q8, q12, %e22[0]        \n"

                    "pld        [%7, #128]              \n"
                    "vld1.f32   {d26-d27}, [%7 :128]!   \n" // q13 = r1

                    "vmla.f32   q9, q12, %e23[0]        \n"

                    "pld        [%4, #128]              \n"
                    "vld1.f32   {d20-d21}, [%4 :128]    \n" // q10 = outptr4

                    "vmla.f32   q6, q13, %e20[1]        \n"
                    "vmla.f32   q7, q13, %e21[1]        \n"

                    "pld        [%5, #128]              \n"
                    "vld1.f32   {d22-d23}, [%5 :128]    \n" // q11 = outptr5

                    "vmla.f32   q10, q12, %e24[0]       \n"
                    "vmla.f32   q11, q12, %e25[0]       \n"

                    "vmla.f32   q8, q13, %e22[1]        \n"
                    "vmla.f32   q9, q13, %e23[1]        \n"

                    "pld        [%8, #128]              \n"
                    "vld1.f32   {d28-d29}, [%8 :128]!   \n" // q14 = r2

                    "vmla.f32   q10, q13, %e24[1]       \n"
                    "vmla.f32   q11, q13, %e25[1]       \n"

                    "vmla.f32   q6, q14, %f20[0]        \n"
                    "vmla.f32   q7, q14, %f21[0]        \n"
                    "vmla.f32   q8, q14, %f22[0]        \n"
                    "vmla.f32   q9, q14, %f23[0]        \n"

                    "pld        [%9, #128]             \n"
                    "vld1.f32   {d30-d31}, [%9 :128]!  \n" // q15 = r3

                    "vmla.f32   q10, q14, %f24[0]       \n"
                    "vmla.f32   q11, q14, %f25[0]       \n"

                    "vmla.f32   q6, q15, %f20[1]        \n"
                    "vmla.f32   q7, q15, %f21[1]        \n"
                    "vmla.f32   q8, q15, %f22[1]        \n"
                    "vmla.f32   q9, q15, %f23[1]        \n"

                    "vmla.f32   q10, q15, %f24[1]       \n"
                    "vmla.f32   q11, q15, %f25[1]       \n"

                    "vst1.f32   {d12-d13}, [%0 :128]!   \n"
                    "vst1.f32   {d14-d15}, [%1 :128]!   \n"
                    "vst1.f32   {d16-d17}, [%2 :128]!   \n"
                    "vst1.f32   {d18-d19}, [%3 :128]!   \n"
                    "vst1.f32   {d20-d21}, [%4 :128]!   \n"
                    "vst1.f32   {d22-d23}, [%5 :128]!   \n"

                    : "=r"(outptr0), // %0
                    "=r"(outptr1), // %1
                    "=r"(outptr2), // %2
                    "=r"(outptr3), // %3
                    "=r"(outptr4), // %4
                    "=r"(outptr5), // %5
                    "=r"(r0),      // %6
                    "=r"(r1),      // %7
                    "=r"(r2),      // %8
                    "=r"(r3)       // %9
                    : "0"(outptr0),
                    "1"(outptr1),
                    "2"(outptr2),
                    "3"(outptr3),
                    "4"(outptr4),
                    "5"(outptr5),
                    "6"(r0),
                    "7"(r1),
                    "8"(r2),
                    "9"(r3),
                    "w"(_k0), // %20
                    "w"(_k1), // %21
                    "w"(_k2), // %22
                    "w"(_k3), // %23
                    "w"(_k4), // %24
                    "w"(_k5)  // %25
                    : "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
            }
#endif // __ARM_NEON

            for (; remain > 0; remain--)
            {
                // TODO neon optimize
                float sum0 = *r0 * kernel0[0] + *r1 * kernel0[1] + *r2 * kernel0[2] + *r3 * kernel0[3];
                float sum1 = *r0 * kernel1[0] + *r1 * kernel1[1] + *r2 * kernel1[2] + *r3 * kernel1[3];
                float sum2 = *r0 * kernel2[0] + *r1 * kernel2[1] + *r2 * kernel2[2] + *r3 * kernel2[3];
                float sum3 = *r0 * kernel3[0] + *r1 * kernel3[1] + *r2 * kernel3[2] + *r3 * kernel3[3];
                float sum4 = *r0 * kernel4[0] + *r1 * kernel4[1] + *r2 * kernel4[2] + *r3 * kernel4[3];
                float sum5 = *r0 * kernel5[0] + *r1 * kernel5[1] + *r2 * kernel5[2] + *r3 * kernel5[3];

                *outptr0 += sum0;
                *outptr1 += sum1;
                *outptr2 += sum2;
                *outptr3 += sum3;
                *outptr4 += sum4;
                *outptr5 += sum5;

                r0++;
                r1++;
                r2++;
                r3++;
                outptr0++;
                outptr1++;
                outptr2++;
                outptr3++;
                outptr4++;
                outptr5++;
            }
        }

        for (; q < inch; q++)
        {
            float* outptr0 = out0;
            float* outptr1 = out1;
            float* outptr2 = out2;
            float* outptr3 = out3;
            float* outptr4 = out4;
            float* outptr5 = out5;

            const float* img0 = bottom_blob.channel(q);

            const float* kernel0 = kernel + p * inch + q;
            const float* kernel1 = kernel + (p + 1) * inch + q;
            const float* kernel2 = kernel + (p + 2) * inch + q;
            const float* kernel3 = kernel + (p + 3) * inch + q;
            const float* kernel4 = kernel + (p + 4) * inch + q;
            const float* kernel5 = kernel + (p + 5) * inch + q;

            const float k0 = kernel0[0];
            const float k1 = kernel1[0];
            const float k2 = kernel2[0];
            const float k3 = kernel3[0];
            const float k4 = kernel4[0];
            const float k5 = kernel5[0];

            const float* r0 = img0;

            int size = outw * outh;

#if __ARM_NEON
            int nn = size >> 2;
            int remain = size & 3;
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _k0 = vdupq_n_f32(k0);
            float32x4_t _k1 = vdupq_n_f32(k1);
            float32x4_t _k2 = vdupq_n_f32(k2);
            float32x4_t _k3 = vdupq_n_f32(k3);
            float32x4_t _k4 = vdupq_n_f32(k4);
            float32x4_t _k5 = vdupq_n_f32(k5);

            if (nn > 0)
            {
                asm volatile(
                    "pld        [%7, #128]              \n"
                    "vld1.f32   {d24-d25}, [%7 :128]!   \n" // q12 = r0

                    "pld        [%1, #128]              \n"
                    "vld1.f32   {d12-d13}, [%1 :128]    \n" // q6 = outptr0

                    "0:                                 \n"

                    "pld        [%2, #128]              \n"
                    "vld1.f32   {d14-d15}, [%2 :128]    \n" // q7 = outptr1

                    "vmla.f32   q6, q12, %q16           \n"

                    "pld        [%3, #128]              \n"
                    "vld1.f32   {d16-d17}, [%3 :128]    \n" // q8 = outptr2

                    "vmla.f32   q7, q12, %q17           \n"

                    "pld        [%4, #128]              \n"
                    "vld1.f32   {d18-d19}, [%4 :128]    \n" // q9 = outptr3

                    "vmla.f32   q8, q12, %q18           \n"

                    "pld        [%5, #128]              \n"
                    "vld1.f32   {d20-d21}, [%5 :128]    \n" // q10 = outptr4

                    "vmla.f32   q9, q12, %q19           \n"

                    "pld        [%6, #128]              \n"
                    "vld1.f32   {d22-d23}, [%6 :128]    \n" // q11 = outptr5

                    "vmla.f32   q10, q12, %q20          \n"
                    "vmla.f32   q11, q12, %q21          \n"

                    "pld        [%7, #128]              \n"
                    "vld1.f32   {d24-d25}, [%7 :128]!   \n" // q12 = r0

                    "vst1.f32   {d12-d13}, [%1 :128]!   \n"
                    "vst1.f32   {d14-d15}, [%2 :128]!   \n"

                    "pld        [%1, #128]              \n"
                    "vld1.f32   {d12-d13}, [%1 :128]    \n" // q6 = outptr0

                    "vst1.f32   {d16-d17}, [%3 :128]!   \n"
                    "vst1.f32   {d18-d19}, [%4 :128]!   \n"

                    "subs       %0, #1                  \n"

                    "vst1.f32   {d20-d21}, [%5 :128]!   \n"
                    "vst1.f32   {d22-d23}, [%6 :128]!   \n"

                    "bne        0b                      \n"

                    "sub        %7, #16                 \n"

                    : "=r"(nn),      // %0
                    "=r"(outptr0), // %1
                    "=r"(outptr1), // %2
                    "=r"(outptr2), // %3
                    "=r"(outptr3), // %4
                    "=r"(outptr4), // %5
                    "=r"(outptr5), // %6
                    "=r"(r0)       // %7
                    : "0"(nn),
                    "1"(outptr0),
                    "2"(outptr1),
                    "3"(outptr2),
                    "4"(outptr3),
                    "5"(outptr4),
                    "6"(outptr5),
                    "7"(r0),
                    "w"(_k0), // %16
                    "w"(_k1), // %17
                    "w"(_k2), // %18
                    "w"(_k3), // %19
                    "w"(_k4), // %20
                    "w"(_k5)  // %21
                    : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12");
            }
#endif // __ARM_NEON

            for (; remain > 0; remain--)
            {
                // TODO neon optimize
                float sum0 = *r0 * k0;
                float sum1 = *r0 * k1;
                float sum2 = *r0 * k2;
                float sum3 = *r0 * k3;
                float sum4 = *r0 * k4;
                float sum5 = *r0 * k5;

                *outptr0 += sum0;
                *outptr1 += sum1;
                *outptr2 += sum2;
                *outptr3 += sum3;
                *outptr4 += sum4;
                *outptr5 += sum5;

                r0++;
                outptr0++;
                outptr1++;
                outptr2++;
                outptr3++;
                outptr4++;
                outptr5++;
            }
        }
    }
#endif // __ARM_NEON && __aarch64__

    nn_outch = (outch - remain_outch_start) >> 2;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = remain_outch_start + pp * 4;

        Mat out0 = top_blob.channel(p);
        Mat out1 = top_blob.channel(p + 1);
        Mat out2 = top_blob.channel(p + 2);
        Mat out3 = top_blob.channel(p + 3);

        const float bias0 = bias ? bias[p] : 0.f;
        const float bias1 = bias ? bias[p + 1] : 0.f;
        const float bias2 = bias ? bias[p + 2] : 0.f;
        const float bias3 = bias ? bias[p + 3] : 0.f;

        out0.fill(bias0);
        out1.fill(bias1);
        out2.fill(bias2);
        out3.fill(bias3);

        int q = 0;

        for (; q + 3 < inch; q += 4)
        {
            float* outptr0 = out0;
            float* outptr1 = out1;
            float* outptr2 = out2;
            float* outptr3 = out3;

            const float* img0 = bottom_blob.channel(q);
            const float* img1 = bottom_blob.channel(q + 1);
            const float* img2 = bottom_blob.channel(q + 2);
            const float* img3 = bottom_blob.channel(q + 3);

            const float* kernel0 = kernel + p * inch + q;
            const float* kernel1 = kernel + (p + 1) * inch + q;
            const float* kernel2 = kernel + (p + 2) * inch + q;
            const float* kernel3 = kernel + (p + 3) * inch + q;

            const float* r0 = img0;
            const float* r1 = img1;
            const float* r2 = img2;
            const float* r3 = img3;

            int size = outw * outh;

#if __ARM_NEON
            int nn = size >> 3;
            int remain = size & 7;
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _k0 = vld1q_f32(kernel0);
            float32x4_t _k1 = vld1q_f32(kernel1);
            float32x4_t _k2 = vld1q_f32(kernel2);
            float32x4_t _k3 = vld1q_f32(kernel3);

#if __aarch64__
            if (nn > 0)
            {
                asm volatile(
                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v6.4s, v7.4s}, [%5], #32   \n"

                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v8.4s, v9.4s}, [%1]        \n"

                    "0:                                 \n"

                    "fmla   v8.4s, v6.4s, %18.s[0]      \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v10.4s, v11.4s}, [%2]      \n"

                    "fmla   v9.4s, v7.4s, %18.s[0]      \n"

                    "fmla   v10.4s, v6.4s, %19.s[0]     \n"

                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v12.4s, v13.4s}, [%3]      \n"

                    "fmla   v11.4s, v7.4s, %19.s[0]     \n"

                    "fmla   v12.4s, v6.4s, %20.s[0]     \n"

                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v14.4s, v15.4s}, [%4]      \n"

                    "fmla   v13.4s, v7.4s, %20.s[0]     \n"

                    "prfm   pldl1keep, [%6, #256]       \n"
                    "ld1    {v4.4s, v5.4s}, [%6], #32   \n"

                    "fmla   v14.4s, v6.4s, %21.s[0]     \n"
                    "fmla   v15.4s, v7.4s, %21.s[0]     \n"

                    "fmla   v8.4s, v4.4s, %18.s[1]      \n"
                    "fmla   v9.4s, v5.4s, %18.s[1]      \n"

                    "fmla   v10.4s, v4.4s, %19.s[1]     \n"
                    "fmla   v11.4s, v5.4s, %19.s[1]     \n"

                    "fmla   v12.4s, v4.4s, %20.s[1]     \n"
                    "fmla   v13.4s, v5.4s, %20.s[1]     \n"

                    "prfm   pldl1keep, [%7, #256]       \n"
                    "ld1    {v6.4s, v7.4s}, [%7], #32   \n"

                    "fmla   v14.4s, v4.4s, %21.s[1]     \n"
                    "fmla   v15.4s, v5.4s, %21.s[1]     \n"

                    "fmla   v8.4s, v6.4s, %18.s[2]      \n"
                    "fmla   v9.4s, v7.4s, %18.s[2]      \n"

                    "fmla   v10.4s, v6.4s, %19.s[2]     \n"
                    "fmla   v11.4s, v7.4s, %19.s[2]     \n"

                    "fmla   v12.4s, v6.4s, %20.s[2]     \n"
                    "fmla   v13.4s, v7.4s, %20.s[2]     \n"

                    "prfm   pldl1keep, [%8, #256]       \n"
                    "ld1    {v4.4s, v5.4s}, [%8], #32   \n"

                    "fmla   v14.4s, v6.4s, %21.s[2]     \n"
                    "fmla   v15.4s, v7.4s, %21.s[2]     \n"

                    "fmla   v8.4s, v4.4s, %18.s[3]      \n"
                    "fmla   v9.4s, v5.4s, %18.s[3]      \n"

                    "fmla   v10.4s, v4.4s, %19.s[3]     \n"
                    "fmla   v11.4s, v5.4s, %19.s[3]     \n"

                    "st1    {v8.4s, v9.4s}, [%1], #32   \n"

                    "fmla   v12.4s, v4.4s, %20.s[3]     \n"
                    "fmla   v13.4s, v5.4s, %20.s[3]     \n"

                    "st1    {v10.4s, v11.4s}, [%2], #32 \n"

                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v6.4s, v7.4s}, [%5], #32   \n"

                    "fmla   v14.4s, v4.4s, %21.s[3]     \n"
                    "fmla   v15.4s, v5.4s, %21.s[3]     \n"

                    "st1    {v12.4s, v13.4s}, [%3], #32 \n"

                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v8.4s, v9.4s}, [%1]        \n"

                    "subs   %w0, %w0, #1                \n"

                    "st1    {v14.4s, v15.4s}, [%4], #32 \n"

                    "bne    0b                          \n"
                    "sub    %5, %5, #32                 \n"
                    : "=r"(nn),      // %0
                    "=r"(outptr0), // %1
                    "=r"(outptr1), // %2
                    "=r"(outptr2), // %3
                    "=r"(outptr3), // %4
                    "=r"(r0),      // %5
                    "=r"(r1),      // %6
                    "=r"(r2),      // %7
                    "=r"(r3)       // %8
                    : "0"(nn),
                    "1"(outptr0),
                    "2"(outptr1),
                    "3"(outptr2),
                    "4"(outptr3),
                    "5"(r0),
                    "6"(r1),
                    "7"(r2),
                    "8"(r3),
                    "w"(_k0), // %18
                    "w"(_k1), // %19
                    "w"(_k2), // %20
                    "w"(_k3)  // %21
                    : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
            }
#else
            if (nn > 0)
            {
                asm volatile(
                    "pld        [%5, #256]              \n"
                    "vld1.f32   {d12-d15}, [%5 :128]!   \n"
                    "pld        [%1, #256]              \n"
                    "vld1.f32   {d16-d19}, [%1 :128]    \n"
                    "0:                                 \n"

                    "vmla.f32   q8, q6, %e18[0]         \n"

                    "pld        [%2, #256]              \n"
                    "vld1.f32   {d20-d23}, [%2 :128]    \n"
                    "vmla.f32   q9, q7, %e18[0]         \n"

                    "vmla.f32   q10, q6, %e19[0]        \n"

                    "pld        [%3, #256]              \n"
                    "vld1.f32   {d24-d27}, [%3 :128]    \n"
                    "vmla.f32   q11, q7, %e19[0]        \n"

                    "vmla.f32   q12, q6, %e20[0]        \n"

                    "pld        [%4, #256]              \n"
                    "vld1.f32   {d28-d31}, [%4 :128]    \n"
                    "vmla.f32   q13, q7, %e20[0]        \n"

                    "pld        [%6, #256]              \n"
                    "vld1.f32   {d8-d11}, [%6 :128]!    \n"

                    "vmla.f32   q14, q6, %e21[0]        \n"
                    "vmla.f32   q15, q7, %e21[0]        \n"

                    "vmla.f32   q8, q4, %e18[1]         \n"
                    "vmla.f32   q9, q5, %e18[1]         \n"

                    "vmla.f32   q10, q4, %e19[1]        \n"
                    "vmla.f32   q11, q5, %e19[1]        \n"

                    "vmla.f32   q12, q4, %e20[1]        \n"
                    "vmla.f32   q13, q5, %e20[1]        \n"

                    "pld        [%7, #256]              \n"
                    "vld1.f32   {d12-d15}, [%7 :128]!   \n"

                    "vmla.f32   q14, q4, %e21[1]        \n"
                    "vmla.f32   q15, q5, %e21[1]        \n"

                    "vmla.f32   q8, q6, %f18[0]         \n"
                    "vmla.f32   q9, q7, %f18[0]         \n"

                    "vmla.f32   q10, q6, %f19[0]        \n"
                    "vmla.f32   q11, q7, %f19[0]        \n"

                    "vmla.f32   q12, q6, %f20[0]        \n"
                    "vmla.f32   q13, q7, %f20[0]        \n"

                    "pld        [%8, #256]              \n"
                    "vld1.f32   {d8-d11}, [%8 :128]!    \n"

                    "vmla.f32   q14, q6, %f21[0]        \n"
                    "vmla.f32   q15, q7, %f21[0]        \n"

                    "vmla.f32   q8, q4, %f18[1]         \n"
                    "vmla.f32   q9, q5, %f18[1]         \n"

                    "vmla.f32   q10, q4, %f19[1]        \n"
                    "vmla.f32   q11, q5, %f19[1]        \n"

                    "vmla.f32   q12, q4, %f20[1]        \n"
                    "vst1.f32   {d16-d19}, [%1 :128]!   \n"

                    "vmla.f32   q13, q5, %f20[1]        \n"

                    "vst1.f32   {d20-d23}, [%2 :128]!   \n"

                    "vmla.f32   q14, q4, %f21[1]        \n"
                    "pld        [%5, #256]              \n"
                    "vld1.f32   {d12-d15}, [%5 :128]!   \n"

                    "vmla.f32   q15, q5, %f21[1]        \n"

                    "vst1.f32   {d24-d27}, [%3 :128]!   \n"

                    "pld        [%1, #256]              \n"
                    "vld1.f32   {d16-d19}, [%1 :128]    \n"

                    "subs       %0, #1                  \n"
                    "vst1.f32   {d28-d31}, [%4 :128]!   \n"

                    "bne        0b                      \n"
                    "sub        %5, #32                 \n"
                    : "=r"(nn),      // %0
                    "=r"(outptr0), // %1
                    "=r"(outptr1), // %2
                    "=r"(outptr2), // %3
                    "=r"(outptr3), // %4
                    "=r"(r0),      // %5
                    "=r"(r1),      // %6
                    "=r"(r2),      // %7
                    "=r"(r3)       // %8
                    : "0"(nn),
                    "1"(outptr0),
                    "2"(outptr1),
                    "3"(outptr2),
                    "4"(outptr3),
                    "5"(r0),
                    "6"(r1),
                    "7"(r2),
                    "8"(r3),
                    "w"(_k0), // %18
                    "w"(_k1), // %19
                    "w"(_k2), // %20
                    "w"(_k3)  // %21
                    : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
                // TODO neon optimize
                float sum0 = *r0 * kernel0[0] + *r1 * kernel0[1] + *r2 * kernel0[2] + *r3 * kernel0[3];
                float sum1 = *r0 * kernel1[0] + *r1 * kernel1[1] + *r2 * kernel1[2] + *r3 * kernel1[3];
                float sum2 = *r0 * kernel2[0] + *r1 * kernel2[1] + *r2 * kernel2[2] + *r3 * kernel2[3];
                float sum3 = *r0 * kernel3[0] + *r1 * kernel3[1] + *r2 * kernel3[2] + *r3 * kernel3[3];

                *outptr0 += sum0;
                *outptr1 += sum1;
                *outptr2 += sum2;
                *outptr3 += sum3;

                r0++;
                r1++;
                r2++;
                r3++;
                outptr0++;
                outptr1++;
                outptr2++;
                outptr3++;
            }
        }

        for (; q < inch; q++)
        {
            float* outptr0 = out0;
            float* outptr1 = out1;
            float* outptr2 = out2;
            float* outptr3 = out3;

            const float* img0 = bottom_blob.channel(q);

            const float* kernel0 = kernel + p * inch + q;
            const float* kernel1 = kernel + (p + 1) * inch + q;
            const float* kernel2 = kernel + (p + 2) * inch + q;
            const float* kernel3 = kernel + (p + 3) * inch + q;

            const float k0 = kernel0[0];
            const float k1 = kernel1[0];
            const float k2 = kernel2[0];
            const float k3 = kernel3[0];

            const float* r0 = img0;

            int size = outw * outh;

#if __ARM_NEON
            int nn = size >> 3;
            int remain = size & 7;
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _k0 = vdupq_n_f32(k0);
            float32x4_t _k1 = vdupq_n_f32(k1);
            float32x4_t _k2 = vdupq_n_f32(k2);
            float32x4_t _k3 = vdupq_n_f32(k3);
#if __aarch64__
            if (nn > 0)
            {
                asm volatile(
                    "prfm       pldl1keep, [%5, #256]          \n"
                    "ld1        {v6.4s, v7.4s}, [%5], #32      \n"
                    "0:                                        \n"
                    "prfm       pldl1keep, [%1, #256]          \n"
                    "ld1        {v8.4s, v9.4s}, [%1]           \n"
                    "fmla       v8.4s, v6.4s, %12.4s           \n"
                    "fmla       v9.4s, v7.4s, %12.4s           \n"

                    "prfm       pldl1keep, [%2, #256]          \n"
                    "ld1        {v10.4s, v11.4s}, [%2]         \n"
                    "fmla       v10.4s, v6.4s, %13.4s          \n"
                    "fmla       v11.4s, v7.4s, %13.4s          \n"

                    "st1        {v8.4s, v9.4s}, [%1], #32      \n"

                    "prfm       pldl1keep, [%3, #256]          \n"
                    "ld1        {v12.4s, v13.4s}, [%3]         \n"
                    "fmla       v12.4s, v6.4s, %14.4s          \n"
                    "fmla       v13.4s, v7.4s, %14.4s          \n"

                    "st1        {v10.4s, v11.4s}, [%2], #32    \n"

                    "prfm       pldl1keep, [%4, #256]          \n"
                    "ld1        {v14.4s, v15.4s}, [%4]         \n"
                    "fmla       v14.4s, v6.4s, %15.4s          \n"
                    "fmla       v15.4s, v7.4s, %15.4s          \n"

                    "st1        {v12.4s, v13.4s}, [%3], #32    \n"

                    "prfm       pldl1keep, [%5, #256]          \n"
                    "ld1        {v6.4s, v7.4s}, [%5], #32      \n"
                    "subs       %w0, %w0, #1                   \n"
                    "st1        {v14.4s, v15.4s}, [%4], #32    \n"
                    "bne        0b                             \n"
                    "sub        %5, %5, #32                    \n"
                    : "=r"(nn),      // %0
                    "=r"(outptr0), // %1
                    "=r"(outptr1), // %2
                    "=r"(outptr2), // %3
                    "=r"(outptr3), // %4
                    "=r"(r0)       // %5
                    : "0"(nn),
                    "1"(outptr0),
                    "2"(outptr1),
                    "3"(outptr2),
                    "4"(outptr3),
                    "5"(r0),
                    "w"(_k0), // %12
                    "w"(_k1), // %13
                    "w"(_k2), // %14
                    "w"(_k3)  // %15
                    : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
            }
#else
            if (nn > 0)
            {
                asm volatile(
                    "pld        [%5, #256]              \n"
                    "vld1.f32   {d12-d15}, [%5 :128]!   \n"
                    "0:                                 \n"
                    "pld        [%1, #256]              \n"
                    "vld1.f32   {d16-d19}, [%1 :128]    \n"
                    "vmla.f32   q8, q6, %q12            \n"
                    "vmla.f32   q9, q7, %q12            \n"

                    "pld        [%2, #256]              \n"
                    "vld1.f32   {d20-d23}, [%2 :128]    \n"
                    "vmla.f32   q10, q6, %q13           \n"
                    "vmla.f32   q11, q7, %q13           \n"

                    "vst1.f32   {d16-d19}, [%1 :128]!   \n"

                    "pld        [%3, #256]              \n"
                    "vld1.f32   {d24-d27}, [%3 :128]    \n"
                    "vmla.f32   q12, q6, %q14           \n"
                    "vmla.f32   q13, q7, %q14           \n"

                    "vst1.f32   {d20-d23}, [%2 :128]!   \n"

                    "pld        [%4, #256]              \n"
                    "vld1.f32   {d28-d31}, [%4 :128]    \n"
                    "vmla.f32   q14, q6, %q15           \n"
                    "vmla.f32   q15, q7, %q15           \n"

                    "vst1.f32   {d24-d27}, [%3 :128]!   \n"

                    "pld        [%5, #256]              \n"
                    "vld1.f32   {d12-d15}, [%5 :128]!   \n"
                    "subs       %0, #1                  \n"
                    "vst1.f32   {d28-d31}, [%4 :128]!   \n"
                    "bne        0b                      \n"
                    "sub        %5, #32                 \n"
                    : "=r"(nn),      // %0
                    "=r"(outptr0), // %1
                    "=r"(outptr1), // %2
                    "=r"(outptr2), // %3
                    "=r"(outptr3), // %4
                    "=r"(r0)       // %5
                    : "0"(nn),
                    "1"(outptr0),
                    "2"(outptr1),
                    "3"(outptr2),
                    "4"(outptr3),
                    "5"(r0),
                    "w"(_k0), // %12
                    "w"(_k1), // %13
                    "w"(_k2), // %14
                    "w"(_k3)  // %15
                    : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
                // TODO neon optimize
                float sum0 = *r0 * k0;
                float sum1 = *r0 * k1;
                float sum2 = *r0 * k2;
                float sum3 = *r0 * k3;

                *outptr0 += sum0;
                *outptr1 += sum1;
                *outptr2 += sum2;
                *outptr3 += sum3;

                r0++;
                outptr0++;
                outptr1++;
                outptr2++;
                outptr3++;
            }
        }
    }

    remain_outch_start += nn_outch << 2;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        out.fill(bias0);

        int q = 0;

        for (; q + 3 < inch; q += 4)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);
            const float* img1 = bottom_blob.channel(q + 1);
            const float* img2 = bottom_blob.channel(q + 2);
            const float* img3 = bottom_blob.channel(q + 3);

            const float* kernel0 = kernel + p * inch + q;
            const float k0 = kernel0[0];
            const float k1 = kernel0[1];
            const float k2 = kernel0[2];
            const float k3 = kernel0[3];

            const float* r0 = img0;
            const float* r1 = img1;
            const float* r2 = img2;
            const float* r3 = img3;

            int size = outw * outh;

#if __ARM_NEON
            int nn = size >> 3;
            int remain = size & 7;
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _k0 = vdupq_n_f32(k0);
            float32x4_t _k1 = vdupq_n_f32(k1);
            float32x4_t _k2 = vdupq_n_f32(k2);
            float32x4_t _k3 = vdupq_n_f32(k3);
#if __aarch64__
            if (nn > 0)
            {
                asm volatile(
                    "prfm       pldl1keep, [%2, #256]          \n"
                    "ld1        {v2.4s, v3.4s}, [%2], #32      \n"
                    "0:                                        \n"
                    "prfm       pldl1keep, [%1, #256]          \n"
                    "ld1        {v0.4s, v1.4s}, [%1]           \n"
                    "fmla       v0.4s, v2.4s, %12.4s           \n"
                    "fmla       v1.4s, v3.4s, %12.4s           \n"

                    "prfm       pldl1keep, [%3, #256]          \n"
                    "ld1        {v2.4s, v3.4s}, [%3], #32      \n"
                    "fmla       v0.4s, v2.4s, %13.4s           \n"
                    "fmla       v1.4s, v3.4s, %13.4s           \n"

                    "prfm       pldl1keep, [%4, #256]          \n"
                    "ld1        {v2.4s, v3.4s}, [%4], #32      \n"
                    "fmla       v0.4s, v2.4s, %14.4s           \n"
                    "fmla       v1.4s, v3.4s, %14.4s           \n"

                    "prfm       pldl1keep, [%5, #256]          \n"
                    "ld1        {v2.4s, v3.4s}, [%5], #32      \n"
                    "fmla       v0.4s, v2.4s, %15.4s           \n"
                    "fmla       v1.4s, v3.4s, %15.4s           \n"

                    "prfm       pldl1keep, [%2, #256]          \n"
                    "ld1        {v2.4s, v3.4s}, [%2], #32      \n"
                    "subs       %w0, %w0, #1                   \n"
                    "st1        {v0.4s, v1.4s}, [%1], #32      \n"
                    "bne        0b                             \n"
                    "sub        %2, %2, #32                    \n"
                    : "=r"(nn),     // %0
                    "=r"(outptr), // %1
                    "=r"(r0),     // %2
                    "=r"(r1),     // %3
                    "=r"(r2),     // %4
                    "=r"(r3)      // %5
                    : "0"(nn),
                    "1"(outptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "w"(_k0), // %12
                    "w"(_k1), // %13
                    "w"(_k2), // %14
                    "w"(_k3)  // %15
                    : "cc", "memory", "v0", "v1", "v2", "v3");
            }
#else
            if (nn > 0)
            {
                asm volatile(
                    "pld        [%2, #256]          \n"
                    "vld1.f32   {d4-d7}, [%2 :128]! \n"
                    "0:                             \n"
                    "pld        [%1, #256]          \n"
                    "vld1.f32   {d0-d3}, [%1 :128]  \n"
                    "vmla.f32   q0, q2, %q12        \n"
                    "vmla.f32   q1, q3, %q12        \n"
                    "pld        [%3, #256]          \n"
                    "vld1.f32   {d4-d7}, [%3 :128]! \n"
                    "vmla.f32   q0, q2, %q13        \n"
                    "vmla.f32   q1, q3, %q13        \n"
                    "pld        [%4, #256]          \n"
                    "vld1.f32   {d4-d7}, [%4 :128]! \n"
                    "vmla.f32   q0, q2, %q14        \n"
                    "vmla.f32   q1, q3, %q14        \n"
                    "pld        [%5, #256]          \n"
                    "vld1.f32   {d4-d7}, [%5 :128]! \n"
                    "vmla.f32   q0, q2, %q15        \n"
                    "vmla.f32   q1, q3, %q15        \n"
                    "pld        [%2, #256]          \n"
                    "vld1.f32   {d4-d7}, [%2 :128]! \n"
                    "subs       %0, #1              \n"
                    "vst1.f32   {d0-d3}, [%1 :128]! \n"
                    "bne        0b                  \n"
                    "sub        %2, #32             \n"
                    : "=r"(nn),     // %0
                    "=r"(outptr), // %1
                    "=r"(r0),     // %2
                    "=r"(r1),     // %3
                    "=r"(r2),     // %4
                    "=r"(r3)      // %5
                    : "0"(nn),
                    "1"(outptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "w"(_k0), // %12
                    "w"(_k1), // %13
                    "w"(_k2), // %14
                    "w"(_k3)  // %15
                    : "cc", "memory", "q0", "q1", "q2", "q3");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
                float sum = *r0 * k0;
                float sum1 = *r1 * k1;
                float sum2 = *r2 * k2;
                float sum3 = *r3 * k3;

                *outptr += sum + sum1 + sum2 + sum3;

                r0++;
                r1++;
                r2++;
                r3++;
                outptr++;
            }
        }

        for (; q < inch; q++)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);

            const float* kernel0 = kernel + p * inch + q;
            const float k0 = kernel0[0];

            const float* r0 = img0;

            int size = outw * outh;

#if __ARM_NEON
            int nn = size >> 3;
            int remain = size & 7;
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _k0 = vdupq_n_f32(k0);
#if __aarch64__
            if (nn > 0)
            {
                asm volatile(
                    "prfm       pldl1keep, [%2, #256]          \n"
                    "ld1        {v2.4s, v3.4s}, [%2], #32      \n"
                    "0:                                        \n"
                    "prfm       pldl1keep, [%1, #256]          \n"
                    "ld1        {v0.4s, v1.4s}, [%1]           \n"
                    "fmla       v0.4s, v2.4s, %6.4s            \n"
                    "fmla       v1.4s, v3.4s, %6.4s            \n"
                    "prfm       pldl1keep, [%2, #256]          \n"
                    "ld1        {v2.4s, v3.4s}, [%2], #32      \n"
                    "subs       %w0, %w0, #1                   \n"
                    "st1        {v0.4s, v1.4s}, [%1], #32      \n"
                    "bne        0b                             \n"
                    "sub        %2, %2, #32                    \n"
                    : "=r"(nn),     // %0
                    "=r"(outptr), // %1
                    "=r"(r0)      // %2
                    : "0"(nn),
                    "1"(outptr),
                    "2"(r0),
                    "w"(_k0) // %6
                    : "cc", "memory", "v0", "v1", "v2", "v3");
            }
#else
            if (nn > 0)
            {
                asm volatile(
                    "pld        [%2, #256]          \n"
                    "vld1.f32   {d4-d7}, [%2 :128]! \n"
                    "0:                             \n"
                    "pld        [%1, #256]          \n"
                    "vld1.f32   {d0-d3}, [%1 :128]  \n"
                    "vmla.f32   q0, q2, %q6         \n"
                    "vmla.f32   q1, q3, %q6         \n"
                    "pld        [%2, #256]          \n"
                    "vld1.f32   {d4-d7}, [%2 :128]! \n"
                    "subs       %0, #1              \n"
                    "vst1.f32   {d0-d3}, [%1 :128]! \n"
                    "bne        0b                  \n"
                    "sub        %2, #32             \n"
                    : "=r"(nn),     // %0
                    "=r"(outptr), // %1
                    "=r"(r0)      // %2
                    : "0"(nn),
                    "1"(outptr),
                    "2"(r0),
                    "w"(_k0) // %6
                    : "cc", "memory", "q0", "q1", "q2", "q3");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
                float sum = *r0 * k0;

                *outptr += sum;

                r0++;
                outptr++;
            }
        }
    }
}

// 1x1 convolution with stride 2, accumulated directly into top_blob.
//
// Every output channel p is first filled with its bias (0 when `bias` is null);
// then each input channel q adds  in[q][2*i][2*j] * kernel[p * inch + q]
// to out[p][i][j].
//
//   bottom_blob  input; w and c are read, pixels are sampled with stride 2
//   top_blob     output; w/h/c are read here and the channels are written in
//                place, so the caller must have sized it already
//   _kernel      weights, indexed as kernel[p * inch + q]
//   _bias        per-output-channel bias, or null for "no bias"
//   opt          only opt.num_threads is used (OpenMP thread count)
//
// Work is tiled as 4 output channels x 4 input channels, with leftover loops
// for the remaining input channels and the remaining output channels.
// With NEON, 8 output pixels are produced per asm iteration and the last
// (outw & 7) pixels of each row are handled by the scalar loops.
static void conv1x1s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // After one output row the input pointer has advanced 2 * outw floats.
    // Skip the rest of the current input row plus the entire next row
    // (vertical stride 2): (w - 2 * outw) + w.
    const int tailstep = w - 2 * outw + w;

    const float* kernel = _kernel;
    const float* bias = _bias;

    // number of complete groups of 4 output channels, and the first channel
    // that does not belong to a complete group
    int nn_outch = outch >> 2;
    int remain_outch_start = nn_outch << 2;

    // ---- output channels, 4 at a time ----
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 4;

        Mat out0 = top_blob.channel(p);
        Mat out1 = top_blob.channel(p + 1);
        Mat out2 = top_blob.channel(p + 2);
        Mat out3 = top_blob.channel(p + 3);

        const float bias0 = bias ? bias[p] : 0.f;
        const float bias1 = bias ? bias[p + 1] : 0.f;
        const float bias2 = bias ? bias[p + 2] : 0.f;
        const float bias3 = bias ? bias[p + 3] : 0.f;

        // start from the bias; the loops below only ever accumulate
        out0.fill(bias0);
        out1.fill(bias1);
        out2.fill(bias2);
        out3.fill(bias3);

        int q = 0;

        // input channels, 4 at a time
        for (; q + 3 < inch; q += 4)
        {
            float* outptr0 = out0;
            float* outptr1 = out1;
            float* outptr2 = out2;
            float* outptr3 = out3;

            const float* img0 = bottom_blob.channel(q);
            const float* img1 = bottom_blob.channel(q + 1);
            const float* img2 = bottom_blob.channel(q + 2);
            const float* img3 = bottom_blob.channel(q + 3);

            // kernelN[0..3] are the weights of output channel p+N for input
            // channels q..q+3
            const float* kernel0 = kernel + p * inch + q;
            const float* kernel1 = kernel + (p + 1) * inch + q;
            const float* kernel2 = kernel + (p + 2) * inch + q;
            const float* kernel3 = kernel + (p + 3) * inch + q;

            const float* r0 = img0;
            const float* r1 = img1;
            const float* r2 = img2;
            const float* r3 = img3;

            for (int i = 0; i < outh; i++)
            {
                int size = outw;

#if __ARM_NEON
                // nn: asm iterations of 8 output pixels; remain: scalar tail
                int nn = size >> 3;
                int remain = size & 7;
#else
                int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
                // lane j of _kN is the weight of input channel q+j for output
                // channel p+N
                float32x4_t _k0 = vld1q_f32(kernel0);
                float32x4_t _k1 = vld1q_f32(kernel1);
                float32x4_t _k2 = vld1q_f32(kernel2);
                float32x4_t _k3 = vld1q_f32(kernel3);
#if __aarch64__
                if (nn > 0)
                {
                    // Per iteration: for each input row pointer (%5..%8) two
                    // ld2 loads de-interleave 16 floats; the even-indexed
                    // halves are gathered into a register pair (the "and"
                    // acts as a register move) giving the 8 stride-2 samples.
                    // Those are multiplied by lane 0..3 of each kernel vector
                    // and accumulated into the 8 outputs loaded from %1..%4.
                    // %18..%21 are the kernel vectors because the 9 outputs
                    // and their 9 matching inputs are numbered first.
                    asm volatile(
                        "0:                                        \n"

                        "prfm       pldl1keep, [%5, #512]          \n"
                        "ld2        {v4.4s, v5.4s}, [%5], #32      \n"
                        "ld2        {v6.4s, v7.4s}, [%5], #32      \n"
                        "and        v5.16b, v6.16b, v6.16b         \n" // v4 v5

                        "prfm       pldl1keep, [%1, #256]          \n"
                        "ld1        {v8.4s, v9.4s}, [%1]           \n"

                        "fmla       v8.4s, v4.4s, %18.s[0]         \n"
                        "fmla       v9.4s, v5.4s, %18.s[0]         \n"

                        "prfm       pldl1keep, [%2, #256]          \n"
                        "ld1        {v10.4s, v11.4s}, [%2]         \n"

                        "fmla       v10.4s, v4.4s, %19.s[0]        \n"
                        "fmla       v11.4s, v5.4s, %19.s[0]        \n"

                        "prfm       pldl1keep, [%3, #256]          \n"
                        "ld1        {v12.4s, v13.4s}, [%3]         \n"

                        "fmla       v12.4s, v4.4s, %20.s[0]        \n"
                        "fmla       v13.4s, v5.4s, %20.s[0]        \n"

                        "prfm       pldl1keep, [%4, #256]          \n"
                        "ld1        {v14.4s, v15.4s}, [%4]         \n"

                        "prfm       pldl1keep, [%6, #512]          \n"
                        "ld2        {v6.4s, v7.4s}, [%6], #32      \n"

                        "fmla       v14.4s, v4.4s, %21.s[0]        \n"
                        "fmla       v15.4s, v5.4s, %21.s[0]        \n"

                        "ld2        {v4.4s, v5.4s}, [%6], #32      \n"
                        "and        v7.16b, v4.16b, v4.16b         \n" // v6 v7

                        "fmla       v8.4s, v6.4s, %18.s[1]         \n"
                        "fmla       v9.4s, v7.4s, %18.s[1]         \n"

                        "fmla       v10.4s, v6.4s, %19.s[1]        \n"
                        "fmla       v11.4s, v7.4s, %19.s[1]        \n"

                        "fmla       v12.4s, v6.4s, %20.s[1]        \n"
                        "fmla       v13.4s, v7.4s, %20.s[1]        \n"

                        "prfm       pldl1keep, [%7, #512]          \n"
                        "ld2        {v4.4s, v5.4s}, [%7], #32      \n"

                        "fmla       v14.4s, v6.4s, %21.s[1]        \n"
                        "fmla       v15.4s, v7.4s, %21.s[1]        \n"

                        "ld2        {v6.4s, v7.4s}, [%7], #32      \n"
                        "and        v5.16b, v6.16b, v6.16b         \n" // v4 v5

                        "fmla       v8.4s, v4.4s, %18.s[2]         \n"
                        "fmla       v9.4s, v5.4s, %18.s[2]         \n"

                        "fmla       v10.4s, v4.4s, %19.s[2]        \n"
                        "fmla       v11.4s, v5.4s, %19.s[2]        \n"

                        "fmla       v12.4s, v4.4s, %20.s[2]        \n"
                        "fmla       v13.4s, v5.4s, %20.s[2]        \n"

                        "prfm       pldl1keep, [%8, #512]          \n"
                        "ld2        {v6.4s, v7.4s}, [%8], #32      \n"

                        "fmla       v14.4s, v4.4s, %21.s[2]        \n"
                        "fmla       v15.4s, v5.4s, %21.s[2]        \n"

                        "ld2        {v4.4s, v5.4s}, [%8], #32      \n"
                        "and        v7.16b, v4.16b, v4.16b         \n" // v6 v7

                        "fmla       v8.4s, v6.4s, %18.s[3]         \n"
                        "fmla       v9.4s, v7.4s, %18.s[3]         \n"

                        "fmla       v10.4s, v6.4s, %19.s[3]        \n"
                        "fmla       v11.4s, v7.4s, %19.s[3]        \n"

                        "st1        {v8.4s, v9.4s}, [%1], #32      \n"

                        "fmla       v12.4s, v6.4s, %20.s[3]        \n"
                        "fmla       v13.4s, v7.4s, %20.s[3]        \n"

                        "st1        {v10.4s, v11.4s}, [%2], #32    \n"

                        "fmla       v14.4s, v6.4s, %21.s[3]        \n"
                        "fmla       v15.4s, v7.4s, %21.s[3]        \n"

                        "st1        {v12.4s, v13.4s}, [%3], #32    \n"

                        "subs       %w0, %w0, #1                   \n"
                        "st1        {v14.4s, v15.4s}, [%4], #32    \n"

                        "bne        0b                             \n"
                        : "=r"(nn),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1), // %2
                        "=r"(outptr2), // %3
                        "=r"(outptr3), // %4
                        "=r"(r0),      // %5
                        "=r"(r1),      // %6
                        "=r"(r2),      // %7
                        "=r"(r3)       // %8
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr2),
                        "4"(outptr3),
                        "5"(r0),
                        "6"(r1),
                        "7"(r2),
                        "8"(r3),
                        "w"(_k0), // %18
                        "w"(_k1), // %19
                        "w"(_k2), // %20
                        "w"(_k3)  // %21
                        : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                if (nn > 0)
                {
                    // armv7 counterpart of the aarch64 loop above: vld2
                    // de-interleaves, vand moves the second even-indexed half
                    // next to the first, and %eN / %fN pick the low / high
                    // d-register of kernel vector N so that [0]/[1] address
                    // its four lanes.
                    asm volatile(
                        "0:                             \n"

                        "pld        [%5, #512]          \n"
                        "vld2.f32   {d8-d11}, [%5]!     \n"
                        "vld2.f32   {d12-d15}, [%5]!    \n"
                        "vand       q5, q6, q6          \n" // q4 q5

                        "pld        [%1, #256]          \n"
                        "vld1.f32   {d16-d19}, [%1]     \n"

                        "vmla.f32   q8, q4, %e18[0]     \n"
                        "vmla.f32   q9, q5, %e18[0]     \n"

                        "pld        [%2, #256]          \n"
                        "vld1.f32   {d20-d23}, [%2]     \n"

                        "vmla.f32   q10, q4, %e19[0]    \n"
                        "vmla.f32   q11, q5, %e19[0]    \n"

                        "pld        [%3, #256]          \n"
                        "vld1.f32   {d24-d27}, [%3]     \n"

                        "vmla.f32   q12, q4, %e20[0]    \n"
                        "vmla.f32   q13, q5, %e20[0]    \n"

                        "pld        [%4, #256]          \n"
                        "vld1.f32   {d28-d31}, [%4]     \n"

                        "pld        [%6, #512]          \n"
                        "vld2.f32   {d12-d15}, [%6]!    \n"

                        "vmla.f32   q14, q4, %e21[0]    \n"
                        "vmla.f32   q15, q5, %e21[0]    \n"

                        "vld2.f32   {d8-d11}, [%6]!     \n"
                        "vand       q7, q4, q4          \n" // q6 q7

                        "vmla.f32   q8, q6, %e18[1]     \n"
                        "vmla.f32   q9, q7, %e18[1]     \n"

                        "vmla.f32   q10, q6, %e19[1]    \n"
                        "vmla.f32   q11, q7, %e19[1]    \n"

                        "vmla.f32   q12, q6, %e20[1]    \n"
                        "vmla.f32   q13, q7, %e20[1]    \n"

                        "pld        [%7, #512]          \n"
                        "vld2.f32   {d8-d11}, [%7]!     \n"

                        "vmla.f32   q14, q6, %e21[1]    \n"
                        "vmla.f32   q15, q7, %e21[1]    \n"

                        "vld2.f32   {d12-d15}, [%7]!    \n"
                        "vand       q5, q6, q6          \n" // q4 q5

                        "vmla.f32   q8, q4, %f18[0]     \n"
                        "vmla.f32   q9, q5, %f18[0]     \n"

                        "vmla.f32   q10, q4, %f19[0]    \n"
                        "vmla.f32   q11, q5, %f19[0]    \n"

                        "vmla.f32   q12, q4, %f20[0]    \n"
                        "vmla.f32   q13, q5, %f20[0]    \n"

                        "pld        [%8, #512]          \n"
                        "vld2.f32   {d12-d15}, [%8]!    \n"

                        "vmla.f32   q14, q4, %f21[0]    \n"
                        "vmla.f32   q15, q5, %f21[0]    \n"

                        "vld2.f32   {d8-d11}, [%8]!     \n"
                        "vand       q7, q4, q4          \n" // q6 q7

                        "vmla.f32   q8, q6, %f18[1]     \n"
                        "vmla.f32   q9, q7, %f18[1]     \n"

                        "vmla.f32   q10, q6, %f19[1]    \n"
                        "vmla.f32   q11, q7, %f19[1]    \n"

                        "vst1.f32   {d16-d19}, [%1]!    \n"

                        "vmla.f32   q12, q6, %f20[1]    \n"
                        "vmla.f32   q13, q7, %f20[1]    \n"

                        "vst1.f32   {d20-d23}, [%2]!    \n"

                        "vmla.f32   q14, q6, %f21[1]    \n"
                        "vmla.f32   q15, q7, %f21[1]    \n"

                        "vst1.f32   {d24-d27}, [%3]!    \n"

                        "subs       %0, #1              \n"
                        "vst1.f32   {d28-d31}, [%4]!    \n"

                        "bne        0b                  \n"
                        : "=r"(nn),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1), // %2
                        "=r"(outptr2), // %3
                        "=r"(outptr3), // %4
                        "=r"(r0),      // %5
                        "=r"(r1),      // %6
                        "=r"(r2),      // %7
                        "=r"(r3)       // %8
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr2),
                        "4"(outptr3),
                        "5"(r0),
                        "6"(r1),
                        "7"(r2),
                        "8"(r3),
                        "w"(_k0), // %18
                        "w"(_k1), // %19
                        "w"(_k2), // %20
                        "w"(_k3)  // %21
                        : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                // scalar tail (or the whole row without NEON): one output
                // pixel per iteration, input advances by the stride of 2
                for (; remain > 0; remain--)
                {
                    // TODO neon optimize
                    float sum0 = *r0 * kernel0[0] + *r1 * kernel0[1] + *r2 * kernel0[2] + *r3 * kernel0[3];
                    float sum1 = *r0 * kernel1[0] + *r1 * kernel1[1] + *r2 * kernel1[2] + *r3 * kernel1[3];
                    float sum2 = *r0 * kernel2[0] + *r1 * kernel2[1] + *r2 * kernel2[2] + *r3 * kernel2[3];
                    float sum3 = *r0 * kernel3[0] + *r1 * kernel3[1] + *r2 * kernel3[2] + *r3 * kernel3[3];

                    *outptr0 += sum0;
                    *outptr1 += sum1;
                    *outptr2 += sum2;
                    *outptr3 += sum3;

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    r3 += 2;
                    outptr0++;
                    outptr1++;
                    outptr2++;
                    outptr3++;
                }

                // jump to the start of the input row two rows further down
                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
            }
        }

        // leftover input channels (inch % 4), one at a time, still feeding
        // all 4 output channels of this group
        for (; q < inch; q++)
        {
            float* outptr0 = out0;
            float* outptr1 = out1;
            float* outptr2 = out2;
            float* outptr3 = out3;

            const float* img0 = bottom_blob.channel(q);

            const float* kernel0 = kernel + p * inch + q;
            const float* kernel1 = kernel + (p + 1) * inch + q;
            const float* kernel2 = kernel + (p + 2) * inch + q;
            const float* kernel3 = kernel + (p + 3) * inch + q;

            // one weight per output channel for this single input channel
            const float k0 = kernel0[0];
            const float k1 = kernel1[0];
            const float k2 = kernel2[0];
            const float k3 = kernel3[0];

            const float* r0 = img0;

            for (int i = 0; i < outh; i++)
            {
                int size = outw;

#if __ARM_NEON
                int nn = size >> 3;
                int remain = size & 7;
#else
                int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
                // each weight broadcast to all four lanes
                float32x4_t _k0 = vdupq_n_f32(k0);
                float32x4_t _k1 = vdupq_n_f32(k1);
                float32x4_t _k2 = vdupq_n_f32(k2);
                float32x4_t _k3 = vdupq_n_f32(k3);
#if __aarch64__
                if (nn > 0)
                {
                    // Per iteration: gather 8 stride-2 samples from %5 into
                    // v4/v5, then multiply-accumulate them into 8 outputs of
                    // each of the four output channels (%1..%4).
                    // %12..%15 are the kernel vectors (6 outputs + 6 matching
                    // inputs are numbered first).
                    asm volatile(
                        "0:                                        \n"

                        "prfm       pldl1keep, [%5, #512]          \n"
                        "ld2        {v4.4s, v5.4s}, [%5], #32      \n"
                        "ld2        {v6.4s, v7.4s}, [%5], #32      \n"
                        "and        v5.16b, v6.16b, v6.16b         \n"

                        "prfm       pldl1keep, [%1, #256]          \n"
                        "ld1        {v8.4s, v9.4s}, [%1]           \n"

                        "fmla       v8.4s, v4.4s, %12.4s           \n"
                        "fmla       v9.4s, v5.4s, %12.4s           \n"

                        "prfm       pldl1keep, [%2, #256]          \n"
                        "ld1        {v10.4s, v11.4s}, [%2]         \n"

                        "fmla       v10.4s, v4.4s, %13.4s          \n"
                        "fmla       v11.4s, v5.4s, %13.4s          \n"

                        "prfm       pldl1keep, [%3, #256]          \n"
                        "ld1        {v12.4s, v13.4s}, [%3]         \n"

                        "st1        {v8.4s, v9.4s}, [%1], #32      \n"

                        "fmla       v12.4s, v4.4s, %14.4s          \n"
                        "fmla       v13.4s, v5.4s, %14.4s          \n"

                        "prfm       pldl1keep, [%4, #256]          \n"
                        "ld1        {v14.4s, v15.4s}, [%4]         \n"

                        "st1        {v10.4s, v11.4s}, [%2], #32    \n"

                        "fmla       v14.4s, v4.4s, %15.4s          \n"
                        "fmla       v15.4s, v5.4s, %15.4s          \n"

                        "st1        {v12.4s, v13.4s}, [%3], #32    \n"
                        "subs       %w0, %w0, #1                   \n"

                        "st1        {v14.4s, v15.4s}, [%4], #32    \n"
                        "bne        0b                             \n"
                        : "=r"(nn),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1), // %2
                        "=r"(outptr2), // %3
                        "=r"(outptr3), // %4
                        "=r"(r0)       // %5
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr2),
                        "4"(outptr3),
                        "5"(r0),
                        "w"(_k0), // %12
                        "w"(_k1), // %13
                        "w"(_k2), // %14
                        "w"(_k3)  // %15
                        : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                if (nn > 0)
                {
                    // armv7 counterpart of the aarch64 loop above
                    asm volatile(
                        "0:                             \n"

                        "pld        [%5, #512]          \n"
                        "vld2.f32   {d8-d11}, [%5]!     \n"
                        "vld2.f32   {d12-d15}, [%5]!    \n"
                        "vand       q5, q6, q6          \n" // q4 q5

                        "pld        [%1, #256]          \n"
                        "vld1.f32   {d16-d19}, [%1]     \n"

                        "vmla.f32   q8, q4, %q12        \n"
                        "vmla.f32   q9, q5, %q12        \n"

                        "pld        [%2, #256]          \n"
                        "vld1.f32   {d20-d23}, [%2]     \n"

                        "vmla.f32   q10, q4, %q13       \n"
                        "vmla.f32   q11, q5, %q13       \n"

                        "pld        [%3, #256]          \n"
                        "vld1.f32   {d24-d27}, [%3]     \n"

                        "vst1.f32   {d16-d19}, [%1]!    \n"

                        "vmla.f32   q12, q4, %q14       \n"
                        "vmla.f32   q13, q5, %q14       \n"

                        "pld        [%4, #256]          \n"
                        "vld1.f32   {d28-d31}, [%4]     \n"

                        "vst1.f32   {d20-d23}, [%2]!    \n"

                        "vmla.f32   q14, q4, %q15       \n"
                        "vmla.f32   q15, q5, %q15       \n"

                        "vst1.f32   {d24-d27}, [%3]!    \n"
                        "subs       %0, #1              \n"
                        "vst1.f32   {d28-d31}, [%4]!    \n"
                        "bne        0b                  \n"
                        : "=r"(nn),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1), // %2
                        "=r"(outptr2), // %3
                        "=r"(outptr3), // %4
                        "=r"(r0)       // %5
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr2),
                        "4"(outptr3),
                        "5"(r0),
                        "w"(_k0), // %12
                        "w"(_k1), // %13
                        "w"(_k2), // %14
                        "w"(_k3)  // %15
                        : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                // scalar tail: one input sample feeds four output channels
                for (; remain > 0; remain--)
                {
                    // TODO neon optimize
                    float sum0 = *r0 * k0;
                    float sum1 = *r0 * k1;
                    float sum2 = *r0 * k2;
                    float sum3 = *r0 * k3;

                    *outptr0 += sum0;
                    *outptr1 += sum1;
                    *outptr2 += sum2;
                    *outptr3 += sum3;

                    r0 += 2;
                    outptr0++;
                    outptr1++;
                    outptr2++;
                    outptr3++;
                }

                // jump to the start of the input row two rows further down
                r0 += tailstep;
            }
        }
    }

    // ---- leftover output channels (outch % 4), one at a time ----
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        // start from the bias; the loops below only ever accumulate
        out.fill(bias0);

        int q = 0;

        // input channels, 4 at a time
        for (; q + 3 < inch; q += 4)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);
            const float* img1 = bottom_blob.channel(q + 1);
            const float* img2 = bottom_blob.channel(q + 2);
            const float* img3 = bottom_blob.channel(q + 3);

            // weights of output channel p for input channels q..q+3
            const float* kernel0 = kernel + p * inch + q;
            const float k0 = kernel0[0];
            const float k1 = kernel0[1];
            const float k2 = kernel0[2];
            const float k3 = kernel0[3];

            const float* r0 = img0;
            const float* r1 = img1;
            const float* r2 = img2;
            const float* r3 = img3;

            for (int i = 0; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 3;
                int remain = outw & 7;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
                float32x4_t _k0 = vdupq_n_f32(k0);
                float32x4_t _k1 = vdupq_n_f32(k1);
                float32x4_t _k2 = vdupq_n_f32(k2);
                float32x4_t _k3 = vdupq_n_f32(k3);
#if __aarch64__
                if (nn > 0)
                {
                    // The 16 floats of %2 (r0) for the first iteration are
                    // loaded before the loop, and each iteration ends by
                    // loading r0 for the next one; the even-indexed halves
                    // (v2, v8) are the 8 stride-2 samples. The trailing
                    // "sub %2, %2, #64" rewinds r0 over the look-ahead load
                    // issued by the final iteration.
                    // NOTE(review): that final look-ahead load reads 16 floats
                    // past the samples actually consumed; presumably the
                    // input allocation tolerates this - confirm.
                    asm volatile(
                        "prfm       pldl1keep, [%2, #512]          \n"
                        "ld2        {v2.4s, v3.4s}, [%2], #32      \n"
                        "ld2        {v8.4s, v9.4s}, [%2], #32      \n"
                        "0:                                        \n"

                        "prfm       pldl1keep, [%1, #256]          \n"
                        "ld1        {v0.4s, v1.4s}, [%1]           \n"
                        "fmla       v0.4s, v2.4s, %12.4s           \n"
                        "fmla       v1.4s, v8.4s, %12.4s           \n"

                        "prfm       pldl1keep, [%3, #512]          \n"
                        "ld2        {v2.4s, v3.4s}, [%3], #32      \n"
                        "ld2        {v8.4s, v9.4s}, [%3], #32      \n"
                        "fmla       v0.4s, v2.4s, %13.4s           \n"
                        "fmla       v1.4s, v8.4s, %13.4s           \n"

                        "prfm       pldl1keep, [%4, #512]          \n"
                        "ld2        {v2.4s, v3.4s}, [%4], #32      \n"
                        "ld2        {v8.4s, v9.4s}, [%4], #32      \n"
                        "fmla       v0.4s, v2.4s, %14.4s           \n"
                        "fmla       v1.4s, v8.4s, %14.4s           \n"

                        "prfm       pldl1keep, [%5, #512]          \n"
                        "ld2        {v2.4s, v3.4s}, [%5], #32      \n"
                        "ld2        {v8.4s, v9.4s}, [%5], #32      \n"
                        "fmla       v0.4s, v2.4s, %15.4s           \n"
                        "fmla       v1.4s, v8.4s, %15.4s           \n"

                        "prfm       pldl1keep, [%2, #512]          \n"
                        "ld2        {v2.4s, v3.4s}, [%2], #32      \n"
                        "ld2        {v8.4s, v9.4s}, [%2], #32      \n"

                        "subs       %w0, %w0, #1                   \n"
                        "st1        {v0.4s, v1.4s}, [%1], #32      \n"
                        "bne        0b                             \n"
                        "sub        %2, %2, #64                    \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2),     // %4
                        "=r"(r3)      // %5
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "w"(_k0), // %12
                        "w"(_k1), // %13
                        "w"(_k2), // %14
                        "w"(_k3)  // %15
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9");
                }
#else
                if (nn > 0)
                {
                    // armv7 counterpart of the aarch64 loop above, including
                    // the r0 look-ahead load and the final 64-byte rewind
                    asm volatile(
                        "pld        [%2, #512]          \n"
                        "vld2.f32   {d4-d7}, [%2]!      \n"
                        "vld2.f32   {d16-d19}, [%2]!    \n"
                        "0:                             \n"
                        "pld        [%1, #256]          \n"
                        "vld1.f32   {d0-d3}, [%1]       \n"
                        "vmla.f32   q0, q2, %q12        \n"
                        "vmla.f32   q1, q8, %q12        \n"
                        "pld        [%3, #512]          \n"
                        "vld2.f32   {d4-d7}, [%3]!      \n"
                        "vld2.f32   {d16-d19}, [%3]!    \n"
                        "vmla.f32   q0, q2, %q13        \n"
                        "vmla.f32   q1, q8, %q13        \n"
                        "pld        [%4, #512]          \n"
                        "vld2.f32   {d4-d7}, [%4]!      \n"
                        "vld2.f32   {d16-d19}, [%4]!    \n"
                        "vmla.f32   q0, q2, %q14        \n"
                        "vmla.f32   q1, q8, %q14        \n"
                        "pld        [%5, #512]          \n"
                        "vld2.f32   {d4-d7}, [%5]!      \n"
                        "vld2.f32   {d16-d19}, [%5]!    \n"
                        "vmla.f32   q0, q2, %q15        \n"
                        "vmla.f32   q1, q8, %q15        \n"
                        "pld        [%2, #512]          \n"
                        "vld2.f32   {d4-d7}, [%2]!      \n"
                        "vld2.f32   {d16-d19}, [%2]!    \n"
                        "subs       %0, #1              \n"
                        "vst1.f32   {d0-d3}, [%1]!      \n"
                        "bne        0b                  \n"
                        "sub        %2, #64             \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2),     // %4
                        "=r"(r3)      // %5
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "w"(_k0), // %12
                        "w"(_k1), // %13
                        "w"(_k2), // %14
                        "w"(_k3)  // %15
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                // scalar tail: four input channels contribute to one output
                for (; remain > 0; remain--)
                {
                    float sum = *r0 * k0;
                    float sum1 = *r1 * k1;
                    float sum2 = *r2 * k2;
                    float sum3 = *r3 * k3;

                    *outptr += sum + sum1 + sum2 + sum3;

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    r3 += 2;
                    outptr++;
                }

                // jump to the start of the input row two rows further down
                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
            }
        }

        // leftover input channels (inch % 4), one at a time
        for (; q < inch; q++)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);

            const float* kernel0 = kernel + p * inch + q;
            const float k0 = kernel0[0];

            const float* r0 = img0;

            for (int i = 0; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 3;
                int remain = outw & 7;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
                float32x4_t _k0 = vdupq_n_f32(k0);
#if __aarch64__
                if (nn > 0)
                {
                    // Same look-ahead scheme as the 4-input-channel loop:
                    // r0 is loaded one iteration ahead and rewound by 64
                    // bytes after the loop. %6 is the kernel vector
                    // (3 outputs + 3 matching inputs are numbered first).
                    // NOTE(review): the final look-ahead load reads 16 floats
                    // past the samples actually consumed - confirm the input
                    // allocation tolerates this.
                    asm volatile(
                        "prfm       pldl1keep, [%2, #512]          \n"
                        "ld2        {v2.4s, v3.4s}, [%2], #32      \n"
                        "ld2        {v8.4s, v9.4s}, [%2], #32      \n"

                        "0:                                        \n"

                        "prfm       pldl1keep, [%1, #256]          \n"
                        "ld1        {v0.4s, v1.4s}, [%1]           \n"
                        "fmla       v0.4s, v2.4s, %6.4s            \n"
                        "fmla       v1.4s, v8.4s, %6.4s            \n"

                        "prfm       pldl1keep, [%2, #512]          \n"
                        "ld2        {v2.4s, v3.4s}, [%2], #32      \n"
                        "ld2        {v8.4s, v9.4s}, [%2], #32      \n"

                        "subs       %w0, %w0, #1                   \n"
                        "st1        {v0.4s, v1.4s}, [%1], #32      \n"
                        "bne        0b                             \n"
                        "sub        %2, %2, #64                    \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0)      // %2
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "w"(_k0) // %6
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9");
                }
#else
                if (nn > 0)
                {
                    // armv7 counterpart of the aarch64 loop above
                    asm volatile(
                        "pld        [%2, #512]          \n"
                        "vld2.f32   {d4-d7}, [%2]!      \n"
                        "vld2.f32   {d16-d19}, [%2]!    \n"
                        "0:                             \n"
                        "pld        [%1, #256]          \n"
                        "vld1.f32   {d0-d3}, [%1]       \n"
                        "vmla.f32   q0, q2, %q6         \n"
                        "vmla.f32   q1, q8, %q6         \n"
                        "pld        [%2, #512]          \n"
                        "vld2.f32   {d4-d7}, [%2]!      \n"
                        "vld2.f32   {d16-d19}, [%2]!    \n"
                        "subs       %0, #1              \n"
                        "vst1.f32   {d0-d3}, [%1]!      \n"
                        "bne        0b                  \n"
                        "sub        %2, #64             \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0)      // %2
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "w"(_k0) // %6
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                // scalar tail: one input sample, one output pixel
                for (; remain > 0; remain--)
                {
                    float sum = *r0 * k0;

                    *outptr += sum;

                    r0 += 2;
                    outptr++;
                }

                // jump to the start of the input row two rows further down
                r0 += tailstep;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_2x2.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// conv2x2s1_neon: fp32 2x2 convolution with stride 1.
//
// For every output channel p the output plane is first filled with bias[p]
// (0.f when the bias pointer is null) and then, for every input channel q,
// the weighted sum of the 2x2 input window is accumulated into it.
//
// Weights for the (p, q) pair are read from kernel + p * inch * 4 + q * 4 as
// four consecutive floats: [0],[1] multiply the upper input row and [2],[3]
// multiply the lower input row.
//
//   bottom_blob  input; w and c are read, channel(q) yields the input plane
//   top_blob     output; w, h and c are read, every plane is overwritten
//   _kernel      weights, accessed as a flat float array
//   _bias        optional bias, accessed as a flat float array
//   opt          only opt.num_threads is used (OpenMP thread count)
//
// Input channels are consumed two at a time where possible, then one at a
// time for the leftover channel. With NEON, each output row is processed four
// outputs per iteration in inline assembly and the last (outw & 3) outputs
// with intrinsics; without NEON everything goes through the scalar code.
//
// NOTE(review): the row stepping (one extra input element skipped after each
// output row, next row at img + w) implies bottom_blob.w == outw + 1 and
// contiguous rows - presumably guaranteed by the caller; verify.
// NOTE(review): the assembly loops always load the next 4 input floats before
// testing the loop counter, so when outw is a multiple of 4 they read up to
// 3 floats past the end of an input row - presumably covered by allocation
// padding; confirm.
static void conv2x2s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    // each output channel is independent, so they are distributed over threads
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        // start from the bias; every input channel below accumulates on top
        out.fill(bias0);

        int q = 0;

        // ---- two input channels per pass ----
        for (; q + 1 < inch; q += 2)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);
            const float* img1 = bottom_blob.channel(q + 1);

            const float* kernel0 = kernel + p * inch * 4 + q * 4;
            const float* kernel1 = kernel0 + 4;

            // upper / lower input row of channel q
            const float* r00 = img0;
            const float* r01 = img0 + w;

            // upper / lower input row of channel q + 1
            const float* r10 = img1;
            const float* r11 = img1 + w;

#if __ARM_NEON
            // all four weights of each channel in one vector; the assembly
            // selects individual lanes, the tail loop multiplies element-wise
            float32x4_t _k0 = vld1q_f32(kernel0);
            float32x4_t _k1 = vld1q_f32(kernel1);
#endif // __ARM_NEON

            for (int i = 0; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 2;    // groups of 4 outputs handled in assembly
                int remain = outw & 3; // leftover outputs handled one by one
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    // Operands: %0 = nn, %1..%4 = r00, r01, r10, r11, %5 = outptr,
                    // %6..%11 are the tied inputs, %12 = _k0, %13 = _k1.
                    // v0/v2/v12/v14 hold the current 4 floats of each input row,
                    // v1/v3/v13/v15 the following 4; ext #4 builds the window
                    // shifted by one element. Two partial sums (v8, v9) are kept
                    // and added before the store.
                    asm volatile(
                        "prfm       pldl1keep, [%1, #128]          \n"
                        "ld1        {v0.4s}, [%1], #16             \n"
                        "prfm       pldl1keep, [%2, #128]          \n"
                        "ld1        {v2.4s}, [%2], #16             \n"
                        "prfm       pldl1keep, [%3, #128]          \n"
                        "ld1        {v12.4s}, [%3], #16            \n"
                        "prfm       pldl1keep, [%4, #128]          \n"
                        "ld1        {v14.4s}, [%4], #16            \n"

                        "0:                                        \n"
                        "prfm       pldl1keep, [%5, #128]          \n"
                        "ld1        {v9.4s}, [%5]                  \n" // v9 = current output values

                        "fmul       v8.4s, v0.4s, %12.s[0]         \n"
                        "fmla       v9.4s, v2.4s, %12.s[2]         \n"

                        "prfm       pldl1keep, [%1, #128]          \n"
                        "ld1        {v1.4s}, [%1], #16             \n"

                        "prfm       pldl1keep, [%2, #128]          \n"
                        "ld1        {v3.4s}, [%2], #16             \n"

                        "ext        v10.16b, v0.16b, v1.16b, #4    \n"
                        "ext        v11.16b, v2.16b, v3.16b, #4    \n"

                        "fmla       v8.4s, v12.4s, %13.s[0]        \n"
                        "fmla       v9.4s, v14.4s, %13.s[2]        \n"

                        "prfm       pldl1keep, [%3, #128]          \n"
                        "ld1        {v13.4s}, [%3], #16            \n"

                        "prfm       pldl1keep, [%4, #128]          \n"
                        "ld1        {v15.4s}, [%4], #16            \n"

                        "fmla       v8.4s, v10.4s, %12.s[1]        \n"
                        "fmla       v9.4s, v11.4s, %12.s[3]        \n"

                        "ext        v10.16b, v12.16b, v13.16b, #4  \n"
                        "ext        v11.16b, v14.16b, v15.16b, #4  \n"

                        "fmla       v8.4s, v10.4s, %13.s[1]        \n"
                        "fmla       v9.4s, v11.4s, %13.s[3]        \n"

                        // the "next" vectors become the "current" ones
                        "orr        v0.16b, v1.16b, v1.16b         \n"
                        "orr        v2.16b, v3.16b, v3.16b         \n"

                        "fadd       v8.4s, v8.4s, v9.4s            \n"

                        "orr        v12.16b, v13.16b, v13.16b      \n"
                        "orr        v14.16b, v15.16b, v15.16b      \n"

                        "subs       %w0, %w0, #1                   \n"
                        "st1        {v8.4s}, [%5], #16             \n"
                        "bne        0b                             \n"
                        // undo the look-ahead load so the pointers address the
                        // first element not yet consumed
                        "sub        %1, %1, #16                    \n"
                        "sub        %2, %2, #16                    \n"
                        "sub        %3, %3, #16                    \n"
                        "sub        %4, %4, #16                    \n"
                        : "=r"(nn),    // %0
                        "=r"(r00),   // %1
                        "=r"(r01),   // %2
                        "=r"(r10),   // %3
                        "=r"(r11),   // %4
                        "=r"(outptr) // %5
                        : "0"(nn),
                        "1"(r00),
                        "2"(r01),
                        "3"(r10),
                        "4"(r11),
                        "5"(outptr),
                        "w"(_k0), // %12
                        "w"(_k1)  // %13
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                if (nn > 0)
                {
                    // armv7 version of the loop above; same operand numbering.
                    // %eN / %fN select the low / high d register of operand N.
                    asm volatile(
                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d0-d1}, [%1]!      \n"
                        "pld        [%2, #128]          \n"
                        "vld1.f32   {d4-d5}, [%2]!      \n"

                        "pld        [%3, #128]          \n"
                        "vld1.f32   {d24-d25}, [%3]!    \n"
                        "pld        [%4, #128]          \n"
                        "vld1.f32   {d28-d29}, [%4]!    \n"

                        "0:                             \n"
                        "pld        [%5, #128]          \n"
                        "vld1.f32   {d18-d19}, [%5]     \n" // q9 = sum

                        "vmul.f32   q8, q0, %e12[0]     \n"
                        "vmla.f32   q9, q2, %f12[0]     \n"

                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d2-d3}, [%1]!      \n"

                        "pld        [%2, #128]          \n"
                        "vld1.f32   {d6-d7}, [%2]!      \n"

                        "vext.f32   q10, q0, q1, #1     \n"
                        "vext.f32   q11, q2, q3, #1     \n"

                        "vmla.f32   q8, q12, %e13[0]    \n"
                        "vmla.f32   q9, q14, %f13[0]    \n"

                        "pld        [%3, #128]          \n"
                        "vld1.f32   {d26-d27}, [%3]!    \n"

                        "pld        [%4, #128]          \n"
                        "vld1.f32   {d30-d31}, [%4]!    \n"

                        "vmla.f32   q8, q10, %e12[1]    \n"
                        "vmla.f32   q9, q11, %f12[1]    \n"

                        "vext.f32   q10, q12, q13, #1   \n"
                        "vext.f32   q11, q14, q15, #1   \n"

                        "vmla.f32   q8, q10, %e13[1]    \n"
                        "vmla.f32   q9, q11, %f13[1]    \n"

                        "vorr       q0, q1, q1          \n"
                        "vorr       q2, q3, q3          \n"

                        "vadd.f32   q8, q8, q9          \n"

                        "vorr       q12, q13, q13       \n"
                        "vorr       q14, q15, q15       \n"

                        "subs       %0, #1              \n"

                        "vst1.f32   {d16-d17}, [%5]!    \n"

                        "bne        0b                  \n"
                        "sub        %1, #16             \n"
                        "sub        %2, #16             \n"
                        "sub        %3, #16             \n"
                        "sub        %4, #16             \n"
                        : "=r"(nn),    // %0
                        "=r"(r00),   // %1
                        "=r"(r01),   // %2
                        "=r"(r10),   // %3
                        "=r"(r11),   // %4
                        "=r"(outptr) // %5
                        : "0"(nn),
                        "1"(r00),
                        "2"(r01),
                        "3"(r10),
                        "4"(r11),
                        "5"(outptr),
                        "w"(_k0), // %12
                        "w"(_k1)  // %13
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON

                // leftover outputs of this row, one at a time
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
                    // pack the 2x2 window as {top0, top1, bottom0, bottom1} so it
                    // lines up with the weight vector, then reduce horizontally
                    float32x2_t _r00 = vld1_f32(r00);
                    float32x2_t _r01 = vld1_f32(r01);
                    float32x4_t _r00r1 = vcombine_f32(_r00, _r01);
                    float32x4_t _s0s1 = vmulq_f32(_r00r1, _k0);

                    float32x2_t _r10 = vld1_f32(r10);
                    float32x2_t _r11 = vld1_f32(r11);
                    float32x4_t _r10r1 = vcombine_f32(_r10, _r11);
                    _s0s1 = vmlaq_f32(_s0s1, _r10r1, _k1);

                    float32x2_t _s = vadd_f32(vget_low_f32(_s0s1), vget_high_f32(_s0s1));
                    _s = vpadd_f32(_s, _s);
                    *outptr += vget_lane_f32(_s, 0);
#else
                    float sum = 0.f;

                    sum += r00[0] * kernel0[0];
                    sum += r00[1] * kernel0[1];
                    sum += r01[0] * kernel0[2];
                    sum += r01[1] * kernel0[3];

                    sum += r10[0] * kernel1[0];
                    sum += r10[1] * kernel1[1];
                    sum += r11[0] * kernel1[2];
                    sum += r11[1] * kernel1[3];

                    *outptr += sum;
#endif // __ARM_NEON

                    r00 += 1;
                    r01 += 1;
                    r10 += 1;
                    r11 += 1;
                    outptr++;
                }

                // skip the one extra input column to reach the next input row
                r00 += 1;
                r01 += 1;
                r10 += 1;
                r11 += 1;
            }
        }

        // ---- leftover single input channel ----
        for (; q < inch; q++)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);

            const float* kernel0 = kernel + p * inch * 4 + q * 4;

            const float* r0 = img0;
            const float* r1 = img0 + w;

#if __ARM_NEON
            // each weight broadcast to all lanes, for the assembly loop
            float32x4_t _k0 = vdupq_n_f32(kernel0[0]);
            float32x4_t _k1 = vdupq_n_f32(kernel0[1]);
            float32x4_t _k2 = vdupq_n_f32(kernel0[2]);
            float32x4_t _k3 = vdupq_n_f32(kernel0[3]);

            // the four weights side by side, for the per-element tail loop.
            // Loaded once per channel: the value does not depend on the row, and
            // placed after the asm statement (which has a "memory" clobber) the
            // load would have to be re-issued for every output row.
            float32x4_t _k0123 = vld1q_f32(kernel0);
#endif // __ARM_NEON

            for (int i = 0; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 2;    // groups of 4 outputs handled in assembly
                int remain = outw & 3; // leftover outputs handled one by one
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    // Operands: %0 = nn, %1 = r0, %2 = r1, %3 = outptr,
                    // %4..%7 are the tied inputs, %8..%11 = _k0.._k3.
                    asm volatile(
                        "prfm       pldl1keep, [%1, #128]          \n"
                        "ld1        {v0.4s}, [%1], #16             \n"
                        "prfm       pldl1keep, [%2, #128]          \n"
                        "ld1        {v2.4s}, [%2], #16             \n"

                        "0:                                        \n"
                        "prfm       pldl1keep, [%3, #128]          \n"
                        "ld1        {v9.4s}, [%3]                  \n" // v9 = current output values

                        "fmul       v8.4s, v0.4s, %8.4s            \n"
                        "fmla       v9.4s, v2.4s, %10.4s           \n"

                        "prfm       pldl1keep, [%1, #128]          \n"
                        "ld1        {v1.4s}, [%1], #16             \n"
                        "ext        v10.16b, v0.16b, v1.16b, #4    \n"

                        "fmla       v8.4s, v10.4s, %9.4s           \n"

                        "prfm       pldl1keep, [%2, #128]          \n"
                        "ld1        {v3.4s}, [%2], #16             \n"
                        "ext        v11.16b, v2.16b, v3.16b, #4    \n"

                        "fmla       v9.4s, v11.4s, %11.4s          \n"

                        "orr        v0.16b, v1.16b, v1.16b         \n"
                        "fadd       v8.4s, v8.4s, v9.4s            \n"
                        "orr        v2.16b, v3.16b, v3.16b         \n"

                        "subs       %w0, %w0, #1                   \n"
                        "st1        {v8.4s}, [%3], #16             \n"
                        "bne        0b                             \n"
                        // undo the look-ahead load
                        "sub        %1, %1, #16                    \n"
                        "sub        %2, %2, #16                    \n"
                        : "=r"(nn),    // %0
                        "=r"(r0),    // %1
                        "=r"(r1),    // %2
                        "=r"(outptr) // %3
                        : "0"(nn),
                        "1"(r0),
                        "2"(r1),
                        "3"(outptr),
                        "w"(_k0), // %8
                        "w"(_k1), // %9
                        "w"(_k2), // %10
                        "w"(_k3)  // %11
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11");
                }
#else
                if (nn > 0)
                {
                    // armv7 version of the loop above; same operand numbering
                    asm volatile(
                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d0-d1}, [%1]!      \n"
                        "pld        [%2, #128]          \n"
                        "vld1.f32   {d4-d5}, [%2]!      \n"

                        "0:                             \n"
                        "pld        [%3, #128]          \n"
                        "vld1.f32   {d18-d19}, [%3]     \n" // q9 = sum

                        "vmul.f32   q8, q0, %q8         \n"
                        "vmla.f32   q9, q2, %q10        \n"

                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d2-d3}, [%1]!      \n"
                        "vext.f32   q10, q0, q1, #1     \n"

                        "vmla.f32   q8, q10, %q9        \n"

                        "pld        [%2, #128]          \n"
                        "vld1.f32   {d6-d7}, [%2]!      \n"
                        "vext.f32   q11, q2, q3, #1     \n"

                        "vmla.f32   q9, q11, %q11       \n"

                        "vorr       q0, q1, q1          \n"
                        "vadd.f32   q8, q8, q9          \n"
                        "vorr       q2, q3, q3          \n"

                        "subs       %0, #1              \n"
                        "vst1.f32   {d16-d17}, [%3]!    \n"
                        "bne        0b                  \n"
                        "sub        %1, #16             \n"
                        "sub        %2, #16             \n"
                        : "=r"(nn),    // %0
                        "=r"(r0),    // %1
                        "=r"(r1),    // %2
                        "=r"(outptr) // %3
                        : "0"(nn),
                        "1"(r0),
                        "2"(r1),
                        "3"(outptr),
                        "w"(_k0), // %8
                        "w"(_k1), // %9
                        "w"(_k2), // %10
                        "w"(_k3)  // %11
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
                }
#endif // __aarch64__
#endif // __ARM_NEON

                // leftover outputs of this row, one at a time
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
                    // {top0, top1, bottom0, bottom1} * {k0, k1, k2, k3}, then
                    // horizontal sum of the four products
                    float32x2_t _r0 = vld1_f32(r0);
                    float32x2_t _r1 = vld1_f32(r1);
                    float32x4_t _r0r1 = vcombine_f32(_r0, _r1);
                    float32x4_t _s0s1 = vmulq_f32(_r0r1, _k0123);
                    float32x2_t _s = vadd_f32(vget_low_f32(_s0s1), vget_high_f32(_s0s1));
                    _s = vpadd_f32(_s, _s);
                    *outptr += vget_lane_f32(_s, 0);
#else
                    float sum = 0.f;
                    sum += r0[0] * kernel0[0];
                    sum += r0[1] * kernel0[1];
                    sum += r1[0] * kernel0[2];
                    sum += r1[1] * kernel0[3];
                    *outptr += sum;
#endif

                    r0 += 1;
                    r1 += 1;
                    outptr++;
                }

                // skip the one extra input column to reach the next input row
                r0 += 1;
                r1 += 1;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_3x3.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    int nn_outch = outch >> 1;
    int remain_outch_start = nn_outch << 1;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 2;

        Mat out0 = top_blob.channel(p);
        Mat out1 = top_blob.channel(p + 1);

        const float bias0 = bias ? bias[p] : 0.f;
        const float bias1 = bias ? bias[p + 1] : 0.f;

        out0.fill(bias0);
        out1.fill(bias1);

        const float* k0 = kernel + p * inch * 9;
        const float* k1 = kernel + (p + 1) * inch * 9;

        for (int q = 0; q < inch; q++)
        {
            float* outptr0 = out0;
            float* outptr1 = out1;
            float* outptr0n = outptr0 + outw;
            float* outptr1n = outptr1 + outw;

            const float* img0 = bottom_blob.channel(q);

            const float* r0 = img0;
            const float* r1 = img0 + w;
            const float* r2 = img0 + w * 2;
            const float* r3 = img0 + w * 3;

#if __ARM_NEON
            float32x4_t _k00 = vld1q_f32(k0);
            float32x4_t _k03 = vld1q_f32(k0 + 3);
            float32x4_t _k06 = vld1q_f32(k0 + 6);

            float32x4_t _k10 = vld1q_f32(k1);
            float32x4_t _k13 = vld1q_f32(k1 + 3);
            float32x4_t _k16 = vld1q_f32(k1 + 6);
#endif // __ARM_NEON

            int i = 0;

            for (; i + 1 < outh; i += 2)
            {
#if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw & 3;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%5]        \n" // r0
                        "add    %5, %5, #16                 \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v14.4s, v15.4s}, [%8]      \n" // r3
                        "add    %8, %8, #16                 \n"

                        "ext    v10.16b, v8.16b, v9.16b, #4 \n"
                        "ext    v11.16b, v14.16b, v15.16b, #8 \n"

                        "0:                                 \n"

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v6.4s}, [%1]               \n" // _sum0

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v7.4s}, [%2]               \n" // _sum1

                        "fmla   v6.4s, v8.4s, %18.s[0]      \n"
                        "fmla   v7.4s, v8.4s, %21.s[0]      \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v12.4s}, [%3]              \n" // _sum0n

                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v13.4s}, [%4]              \n" // _sum1n

                        "fmla   v12.4s, v14.4s, %20.s[0]    \n"
                        "fmla   v13.4s, v14.4s, %23.s[0]    \n"

                        "ext    v8.16b, v8.16b, v9.16b, #8  \n"
                        "ext    v9.16b, v14.16b, v15.16b, #4 \n"

                        "fmla   v6.4s, v10.4s, %18.s[1]     \n"
                        "fmla   v7.4s, v10.4s, %21.s[1]     \n"
                        "fmla   v12.4s, v11.4s, %20.s[2]    \n"
                        "fmla   v13.4s, v11.4s, %23.s[2]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v14.4s, v15.4s}, [%6]      \n" // r1
                        "add    %6, %6, #16                 \n"

                        "fmla   v6.4s, v8.4s, %18.s[2]      \n"
                        "fmla   v7.4s, v8.4s, %21.s[2]      \n"
                        "fmla   v12.4s, v9.4s, %20.s[1]     \n"
                        "fmla   v13.4s, v9.4s, %23.s[1]     \n"

                        "ext    v10.16b, v14.16b, v15.16b, #4 \n"

                        "fmla   v6.4s, v14.4s, %19.s[0]     \n"
                        "fmla   v7.4s, v14.4s, %22.s[0]     \n"
                        "fmla   v12.4s, v14.4s, %18.s[0]    \n"
                        "fmla   v13.4s, v14.4s, %21.s[0]    \n"

                        "ext    v11.16b, v14.16b, v15.16b, #8 \n"

                        "fmla   v6.4s, v10.4s, %19.s[1]     \n"
                        "fmla   v7.4s, v10.4s, %22.s[1]     \n"
                        "fmla   v12.4s, v10.4s, %18.s[1]    \n"
                        "fmla   v13.4s, v10.4s, %21.s[1]    \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%7]        \n" // r2
                        "add    %7, %7, #16                 \n"

                        "fmla   v6.4s, v11.4s, %19.s[2]     \n"
                        "fmla   v7.4s, v11.4s, %22.s[2]     \n"
                        "fmla   v12.4s, v11.4s, %18.s[2]    \n"
                        "fmla   v13.4s, v11.4s, %21.s[2]    \n"

                        "ext    v10.16b, v8.16b, v9.16b, #4 \n"

                        "fmla   v6.4s, v8.4s, %20.s[0]      \n"
                        "fmla   v7.4s, v8.4s, %23.s[0]      \n"
                        "fmla   v12.4s, v8.4s, %19.s[0]     \n"
                        "fmla   v13.4s, v8.4s, %22.s[0]     \n"

                        "ext    v11.16b, v8.16b, v9.16b, #8 \n"

                        "fmla   v6.4s, v10.4s, %20.s[1]     \n"
                        "fmla   v7.4s, v10.4s, %23.s[1]     \n"
                        "fmla   v12.4s, v10.4s, %19.s[1]    \n"
                        "fmla   v13.4s, v10.4s, %22.s[1]    \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%5]        \n" // r0
                        "add    %5, %5, #16                 \n"

                        "fmla   v6.4s, v11.4s, %20.s[2]     \n"
                        "fmla   v7.4s, v11.4s, %23.s[2]     \n"
                        "fmla   v12.4s, v11.4s, %19.s[2]    \n"
                        "fmla   v13.4s, v11.4s, %22.s[2]    \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v14.4s, v15.4s}, [%8]      \n" // r3
                        "add    %8, %8, #16                 \n"

                        "ext    v10.16b, v8.16b, v9.16b, #4 \n"

                        "st1    {v6.4s}, [%1], #16          \n"
                        "st1    {v7.4s}, [%2], #16          \n"

                        "ext    v11.16b, v14.16b, v15.16b, #8 \n"

                        "st1    {v12.4s}, [%3], #16         \n"
                        "st1    {v13.4s}, [%4], #16         \n"

                        "subs   %w0, %w0, #1                \n"
                        "bne    0b                          \n"

                        "sub    %5, %5, #16                 \n"
                        "sub    %8, %8, #16                 \n"
                        : "=r"(nn),       // %0
                        "=r"(outptr0),  // %1
                        "=r"(outptr1),  // %2
                        "=r"(outptr0n), // %3
                        "=r"(outptr1n), // %4
                        "=r"(r0),       // %5
                        "=r"(r1),       // %6
                        "=r"(r2),       // %7
                        "=r"(r3)        // %8
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr0n),
                        "4"(outptr1n),
                        "5"(r0),
                        "6"(r1),
                        "7"(r2),
                        "8"(r3),
                        "w"(_k00), // %18
                        "w"(_k03), // %19
                        "w"(_k06), // %20
                        "w"(_k10), // %21
                        "w"(_k13), // %22
                        "w"(_k16)  // %23
                        : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                if (nn > 0)
                {
                    asm volatile(

                        "pld        [%5, #192]          \n"
                        "vld1.f32   {d16-d18}, [%5 :64] \n" // r0
                        "add        %5, #16             \n"

                        "pld        [%8, #192]          \n"
                        "vld1.f32   {d28-d30}, [%8]     \n" // r3
                        "add        %8, #16             \n"

                        "vext.32    q10, q8, q9, #1     \n"
                        "vext.32    q11, q14, q15, #2   \n"

                        "0:                             \n"

                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d12-d13}, [%1 :64] \n" // _sum0

                        "pld        [%2, #128]          \n"
                        "vld1.f32   {d14-d15}, [%2 :64] \n" // _sum1

                        "vmla.f32   q6, q8, %e18[0]     \n"
                        "vmla.f32   q7, q8, %e21[0]     \n"

                        "pld        [%3, #128]          \n"
                        "vld1.f32   {d24-d25}, [%3]     \n" // _sum0n

                        "pld        [%4, #128]          \n"
                        "vld1.f32   {d26-d27}, [%4]     \n" // _sum1n

                        "vmla.f32   q12, q14, %e20[0]   \n"
                        "vmla.f32   q13, q14, %e23[0]   \n"

                        "vext.32    q8, q8, q9, #2      \n"
                        "vext.32    q9, q14, q15, #1    \n"

                        "vmla.f32   q6, q10, %e18[1]    \n"
                        "vmla.f32   q7, q10, %e21[1]    \n"
                        "vmla.f32   q12, q11, %f20[0]   \n"
                        "vmla.f32   q13, q11, %f23[0]   \n"

                        "pld        [%6, #192]          \n"
                        "vld1.f32   {d28-d30}, [%6]     \n" // r1
                        "add        %6, #16             \n"

                        "vmla.f32   q6, q8, %f18[0]     \n"
                        "vmla.f32   q7, q8, %f21[0]     \n"
                        "vmla.f32   q12, q9, %e20[1]    \n"
                        "vmla.f32   q13, q9, %e23[1]    \n"

                        "vext.32    q10, q14, q15, #1   \n"

                        "vmla.f32   q6, q14, %e19[0]    \n"
                        "vmla.f32   q7, q14, %e22[0]    \n"
                        "vmla.f32   q12, q14, %e18[0]   \n"
                        "vmla.f32   q13, q14, %e21[0]   \n"

                        "vext.32    q11, q14, q15, #2   \n"

                        "vmla.f32   q6, q10, %e19[1]    \n"
                        "vmla.f32   q7, q10, %e22[1]    \n"
                        "vmla.f32   q12, q10, %e18[1]   \n"
                        "vmla.f32   q13, q10, %e21[1]   \n"

                        "pld        [%7, #192]          \n"
                        "vld1.f32   {d16-d18}, [%7 :64] \n" // r2
                        "add        %7, #16             \n"

                        "vmla.f32   q6, q11, %f19[0]    \n"
                        "vmla.f32   q7, q11, %f22[0]    \n"
                        "vmla.f32   q12, q11, %f18[0]   \n"
                        "vmla.f32   q13, q11, %f21[0]   \n"

                        "vext.32    q10, q8, q9, #1     \n"

                        "vmla.f32   q6, q8, %e20[0]     \n"
                        "vmla.f32   q7, q8, %e23[0]     \n"
                        "vmla.f32   q12, q8, %e19[0]    \n"
                        "vmla.f32   q13, q8, %e22[0]    \n"

                        "vext.32    q11, q8, q9, #2     \n"

                        "vmla.f32   q6, q10, %e20[1]    \n"
                        "vmla.f32   q7, q10, %e23[1]    \n"
                        "vmla.f32   q12, q10, %e19[1]   \n"
                        "vmla.f32   q13, q10, %e22[1]   \n"

                        "pld        [%5, #192]          \n"
                        "vld1.f32   {d16-d18}, [%5 :64] \n" // r0
                        "add        %5, #16             \n"

                        "vmla.f32   q6, q11, %f20[0]    \n"
                        "vmla.f32   q7, q11, %f23[0]    \n"
                        "vmla.f32   q12, q11, %f19[0]   \n"
                        "vmla.f32   q13, q11, %f22[0]   \n"

                        "pld        [%8, #192]          \n"
                        "vld1.f32   {d28-d30}, [%8]     \n" // r3
                        "add        %8, #16             \n"

                        "vext.32    q10, q8, q9, #1     \n"

                        "vst1.f32   {d12-d13}, [%1 : 64]!\n"
                        "vst1.f32   {d14-d15}, [%2 : 64]!\n"

                        "vext.32    q11, q14, q15, #2   \n"

                        "vst1.f32   {d24-d25}, [%3]!    \n"
                        "vst1.f32   {d26-d27}, [%4]!    \n"

                        "subs       %0, #1              \n"
                        "bne        0b                  \n"

                        "sub        %5, #16             \n"
                        "sub        %8, #16             \n"
                        : "=r"(nn),       // %0
                        "=r"(outptr0),  // %1
                        "=r"(outptr1),  // %2
                        "=r"(outptr0n), // %3
                        "=r"(outptr1n), // %4
                        "=r"(r0),       // %5
                        "=r"(r1),       // %6
                        "=r"(r2),       // %7
                        "=r"(r3)        // %8
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr0n),
                        "4"(outptr1n),
                        "5"(r0),
                        "6"(r1),
                        "7"(r2),
                        "8"(r3),
                        "w"(_k00), // %18
                        "w"(_k03), // %19
                        "w"(_k06), // %20
                        "w"(_k10), // %21
                        "w"(_k13), // %22
                        "w"(_k16)  // %23
                        : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r20 = vld1q_f32(r2);
                    float32x4_t _r30 = vld1q_f32(r3);

                    float32x4_t _sum0 = vmulq_f32(_r00, _k00);
                    float32x4_t _sum1 = vmulq_f32(_r00, _k10);
                    _sum0 = vmlaq_f32(_sum0, _r10, _k03);
                    _sum1 = vmlaq_f32(_sum1, _r10, _k13);
                    _sum0 = vmlaq_f32(_sum0, _r20, _k06);
                    _sum1 = vmlaq_f32(_sum1, _r20, _k16);

                    float32x4_t _sum0n = vmulq_f32(_r10, _k00);
                    float32x4_t _sum1n = vmulq_f32(_r10, _k10);
                    _sum0n = vmlaq_f32(_sum0n, _r20, _k03);
                    _sum1n = vmlaq_f32(_sum1n, _r20, _k13);
                    _sum0n = vmlaq_f32(_sum0n, _r30, _k06);
                    _sum1n = vmlaq_f32(_sum1n, _r30, _k16);

                    _sum0 = vsetq_lane_f32(*outptr0, _sum0, 3);
                    _sum1 = vsetq_lane_f32(*outptr1, _sum1, 3);
                    _sum0n = vsetq_lane_f32(*outptr0n, _sum0n, 3);
                    _sum1n = vsetq_lane_f32(*outptr1n, _sum1n, 3);
#if __aarch64__
                    *outptr0 = vaddvq_f32(_sum0);
                    *outptr1 = vaddvq_f32(_sum1);
                    *outptr0n = vaddvq_f32(_sum0n);
                    *outptr1n = vaddvq_f32(_sum1n);
#else
                    float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
                    float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
                    float32x2_t _ss0n = vadd_f32(vget_low_f32(_sum0n), vget_high_f32(_sum0n));
                    float32x2_t _ss1n = vadd_f32(vget_low_f32(_sum1n), vget_high_f32(_sum1n));

                    float32x2_t _ss01 = vpadd_f32(_ss0, _ss1);
                    float32x2_t _ss01n = vpadd_f32(_ss0n, _ss1n);

                    *outptr0 = vget_lane_f32(_ss01, 0);
                    *outptr1 = vget_lane_f32(_ss01, 1);
                    *outptr0n = vget_lane_f32(_ss01n, 0);
                    *outptr1n = vget_lane_f32(_ss01n, 1);
#endif // __aarch64__
#else
                    float sum0 = 0.f;
                    float sum0n = 0.f;
                    float sum1 = 0.f;
                    float sum1n = 0.f;

                    sum0 += r0[0] * k0[0];
                    sum0 += r0[1] * k0[1];
                    sum0 += r0[2] * k0[2];
                    sum0 += r1[0] * k0[3];
                    sum0 += r1[1] * k0[4];
                    sum0 += r1[2] * k0[5];
                    sum0 += r2[0] * k0[6];
                    sum0 += r2[1] * k0[7];
                    sum0 += r2[2] * k0[8];

                    sum1 += r0[0] * k1[0];
                    sum1 += r0[1] * k1[1];
                    sum1 += r0[2] * k1[2];
                    sum1 += r1[0] * k1[3];
                    sum1 += r1[1] * k1[4];
                    sum1 += r1[2] * k1[5];
                    sum1 += r2[0] * k1[6];
                    sum1 += r2[1] * k1[7];
                    sum1 += r2[2] * k1[8];

                    sum0n += r1[0] * k0[0];
                    sum0n += r1[1] * k0[1];
                    sum0n += r1[2] * k0[2];
                    sum0n += r2[0] * k0[3];
                    sum0n += r2[1] * k0[4];
                    sum0n += r2[2] * k0[5];
                    sum0n += r3[0] * k0[6];
                    sum0n += r3[1] * k0[7];
                    sum0n += r3[2] * k0[8];

                    sum1n += r1[0] * k1[0];
                    sum1n += r1[1] * k1[1];
                    sum1n += r1[2] * k1[2];
                    sum1n += r2[0] * k1[3];
                    sum1n += r2[1] * k1[4];
                    sum1n += r2[2] * k1[5];
                    sum1n += r3[0] * k1[6];
                    sum1n += r3[1] * k1[7];
                    sum1n += r3[2] * k1[8];

                    *outptr0 += sum0;
                    *outptr1 += sum1;
                    *outptr0n += sum0n;
                    *outptr1n += sum1n;
#endif // __ARM_NEON
                    r0++;
                    r1++;
                    r2++;
                    r3++;
                    outptr0++;
                    outptr1++;
                    outptr0n++;
                    outptr1n++;
                }

                r0 += 2 + w;
                r1 += 2 + w;
                r2 += 2 + w;
                r3 += 2 + w;

                outptr0 += outw;
                outptr1 += outw;
                outptr0n += outw;
                outptr1n += outw;
            }

            for (; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw & 3;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    asm volatile(
                        "0:                                 \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%3]        \n" // r0
                        "add    %3, %3, #16                 \n"

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v6.4s}, [%1]               \n" // _sum0

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v7.4s}, [%2]               \n" // _sum1

                        "fmul   v14.4s, v8.4s, %12.s[0]     \n"
                        "fmul   v15.4s, v8.4s, %15.s[0]     \n"

                        "ext    v10.16b, v8.16b, v9.16b, #4 \n"
                        "ext    v11.16b, v8.16b, v9.16b, #8 \n"

                        "fmla   v6.4s, v10.4s, %12.s[1]     \n"
                        "fmla   v7.4s, v10.4s, %15.s[1]     \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%4]        \n" // r1
                        "add    %4, %4, #16                 \n"

                        "fmla   v14.4s, v11.4s, %12.s[2]    \n"
                        "fmla   v15.4s, v11.4s, %15.s[2]    \n"

                        "fmla   v6.4s, v8.4s, %13.s[0]      \n"
                        "fmla   v7.4s, v8.4s, %16.s[0]      \n"

                        "ext    v10.16b, v8.16b, v9.16b, #4 \n"
                        "ext    v11.16b, v8.16b, v9.16b, #8 \n"

                        "fmla   v14.4s, v10.4s, %13.s[1]    \n"
                        "fmla   v15.4s, v10.4s, %16.s[1]    \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%5]        \n" // r2
                        "add    %5, %5, #16                 \n"

                        "fmla   v6.4s, v11.4s, %13.s[2]     \n"
                        "fmla   v7.4s, v11.4s, %16.s[2]     \n"

                        "fmla   v14.4s, v8.4s, %14.s[0]     \n"
                        "fmla   v15.4s, v8.4s, %17.s[0]     \n"

                        "ext    v10.16b, v8.16b, v9.16b, #4 \n"
                        "ext    v11.16b, v8.16b, v9.16b, #8 \n"

                        "fmla   v6.4s, v10.4s, %14.s[1]     \n"
                        "fmla   v7.4s, v10.4s, %17.s[1]     \n"

                        "fmla   v14.4s, v11.4s, %14.s[2]    \n"
                        "fmla   v15.4s, v11.4s, %17.s[2]    \n"

                        "fadd   v6.4s, v6.4s, v14.4s        \n"
                        "fadd   v7.4s, v7.4s, v15.4s        \n"

                        "st1    {v6.4s}, [%1], #16          \n"
                        "st1    {v7.4s}, [%2], #16          \n"

                        "subs   %w0, %w0, #1                \n"
                        "bne    0b                          \n"

                        : "=r"(nn),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1), // %2
                        "=r"(r0),      // %3
                        "=r"(r1),      // %4
                        "=r"(r2)       // %5
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "w"(_k00), // %12
                        "w"(_k03), // %13
                        "w"(_k06), // %14
                        "w"(_k10), // %15
                        "w"(_k13), // %16
                        "w"(_k16)  // %17
                        : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                if (nn > 0)
                {
                    asm volatile(
                        "0:                             \n"

                        "pld        [%3, #192]          \n"
                        "vld1.f32   {d16-d18}, [%3]     \n" // r0
                        "add        %3, #16             \n"

                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d12-d13}, [%1]     \n" // _sum0

                        "pld        [%2, #128]          \n"
                        "vld1.f32   {d14-d15}, [%2]     \n" // _sum1

                        "vmul.f32   q14, q8, %e12[0]    \n"
                        "vmul.f32   q15, q8, %e15[0]    \n"

                        "vext.32    q10, q8, q9, #1     \n"
                        "vext.32    q11, q8, q9, #2     \n"

                        "vmla.f32   q6, q10, %e12[1]    \n"
                        "vmla.f32   q7, q10, %e15[1]    \n"

                        "pld        [%4, #192]          \n"
                        "vld1.f32   {d16-d18}, [%4]     \n" // r1
                        "add        %4, #16             \n"

                        "vmla.f32   q14, q11, %f12[0]   \n"
                        "vmla.f32   q15, q11, %f15[0]   \n"

                        "vmla.f32   q6, q8, %e13[0]     \n"
                        "vmla.f32   q7, q8, %e16[0]     \n"

                        "vext.32    q10, q8, q9, #1     \n"
                        "vext.32    q11, q8, q9, #2     \n"

                        "vmla.f32   q14, q10, %e13[1]   \n"
                        "vmla.f32   q15, q10, %e16[1]   \n"

                        "pld        [%5, #192]          \n"
                        "vld1.f32   {d16-d18}, [%5]     \n" // r2
                        "add        %5, #16             \n"

                        "vmla.f32   q6, q11, %f13[0]    \n"
                        "vmla.f32   q7, q11, %f16[0]    \n"

                        "vmla.f32   q14, q8, %e14[0]    \n"
                        "vmla.f32   q15, q8, %e17[0]    \n"

                        "vext.32    q10, q8, q9, #1     \n"
                        "vext.32    q11, q8, q9, #2     \n"

                        "vmla.f32   q6, q10, %e14[1]    \n"
                        "vmla.f32   q7, q10, %e17[1]    \n"

                        "vmla.f32   q14, q11, %f14[0]   \n"
                        "vmla.f32   q15, q11, %f17[0]   \n"

                        "vadd.f32   q6, q6, q14         \n"
                        "vadd.f32   q7, q7, q15         \n"

                        "vst1.f32   {d12-d13}, [%1]!    \n"

                        "vst1.f32   {d14-d15}, [%2]!    \n"

                        "subs       %0, #1              \n"
                        "bne        0b                  \n"

                        : "=r"(nn),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1), // %2
                        "=r"(r0),      // %3
                        "=r"(r1),      // %4
                        "=r"(r2)       // %5
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "w"(_k00), // %12
                        "w"(_k03), // %13
                        "w"(_k06), // %14
                        "w"(_k10), // %15
                        "w"(_k13), // %16
                        "w"(_k16)  // %17
                        : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r20 = vld1q_f32(r2);

                    float32x4_t _sum0 = vmulq_f32(_r00, _k00);
                    float32x4_t _sum1 = vmulq_f32(_r00, _k10);
                    _sum0 = vmlaq_f32(_sum0, _r10, _k03);
                    _sum1 = vmlaq_f32(_sum1, _r10, _k13);
                    _sum0 = vmlaq_f32(_sum0, _r20, _k06);
                    _sum1 = vmlaq_f32(_sum1, _r20, _k16);

                    _sum0 = vsetq_lane_f32(*outptr0, _sum0, 3);
                    _sum1 = vsetq_lane_f32(*outptr1, _sum1, 3);
#if __aarch64__
                    *outptr0 = vaddvq_f32(_sum0);
                    *outptr1 = vaddvq_f32(_sum1);
#else
                    float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
                    float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
                    float32x2_t _ss01 = vpadd_f32(_ss0, _ss1);

                    *outptr0 = vget_lane_f32(_ss01, 0);
                    *outptr1 = vget_lane_f32(_ss01, 1);
#endif // __aarch64__
#else
                    float sum0 = 0.f;
                    float sum1 = 0.f;

                    sum0 += r0[0] * k0[0];
                    sum0 += r0[1] * k0[1];
                    sum0 += r0[2] * k0[2];
                    sum0 += r1[0] * k0[3];
                    sum0 += r1[1] * k0[4];
                    sum0 += r1[2] * k0[5];
                    sum0 += r2[0] * k0[6];
                    sum0 += r2[1] * k0[7];
                    sum0 += r2[2] * k0[8];

                    sum1 += r0[0] * k1[0];
                    sum1 += r0[1] * k1[1];
                    sum1 += r0[2] * k1[2];
                    sum1 += r1[0] * k1[3];
                    sum1 += r1[1] * k1[4];
                    sum1 += r1[2] * k1[5];
                    sum1 += r2[0] * k1[6];
                    sum1 += r2[1] * k1[7];
                    sum1 += r2[2] * k1[8];

                    *outptr0 += sum0;
                    *outptr1 += sum1;
#endif // __ARM_NEON
                    r0++;
                    r1++;
                    r2++;
                    outptr0++;
                    outptr1++;
                }

                r0 += 2;
                r1 += 2;
                r2 += 2;
            }

            k0 += 9;
            k1 += 9;
        }
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        out.fill(bias0);

        const float* kernel0 = kernel + p * inch * 9;

        for (int q = 0; q < inch; q++)
        {
            float* outptr = out;
            float* outptr2 = outptr + outw;

            const float* img0 = bottom_blob.channel(q);

            const float* r0 = img0;
            const float* r1 = img0 + w;
            const float* r2 = img0 + w * 2;
            const float* r3 = img0 + w * 3;

#if __ARM_NEON
            float32x4_t _k0123 = vld1q_f32(kernel0);
            float32x4_t _k3456 = vld1q_f32(kernel0 + 3);
            float32x4_t _k6789 = vld1q_f32(kernel0 + 6);
#else
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;
#endif // __ARM_NEON

            int i = 0;

            for (; i + 1 < outh; i += 2)
            {
#if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw & 3;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v9.4s, v10.4s}, [%3]       \n" // r0
                        "add    %3, %3, #16                 \n"

                        "ext    v11.16b, v9.16b, v10.16b, #4 \n"
                        "ext    v12.16b, v9.16b, v10.16b, #8 \n"

                        "0:                                 \n"

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v7.4s}, [%1]               \n" // _sum

                        "fmla   v7.4s, v9.4s, %14.s[0]      \n"
                        "fmul   v6.4s, v11.4s, %14.s[1]     \n"
                        "fmul   v13.4s, v12.4s, %14.s[2]    \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v9.4s, v10.4s}, [%4]       \n" // r1
                        "add    %4, %4, #16                 \n"

                        "fmla   v7.4s, v9.4s, %15.s[0]      \n"

                        "ext    v11.16b, v9.16b, v10.16b, #4 \n"
                        "ext    v12.16b, v9.16b, v10.16b, #8 \n"

                        "fmla   v6.4s, v11.4s, %15.s[1]     \n"
                        "fmla   v13.4s, v12.4s, %15.s[2]    \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v8.4s}, [%2]               \n" // _sum2

                        "fmla   v8.4s, v9.4s, %14.s[0]      \n"
                        "fmul   v14.4s, v11.4s, %14.s[1]    \n"
                        "fmul   v15.4s, v12.4s, %14.s[2]    \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v9.4s, v10.4s}, [%5]       \n" // r2
                        "add    %5, %5, #16                 \n"

                        "fmla   v7.4s, v9.4s, %16.s[0]      \n"

                        "ext    v11.16b, v9.16b, v10.16b, #4 \n"
                        "ext    v12.16b, v9.16b, v10.16b, #8 \n"

                        "fmla   v6.4s, v11.4s, %16.s[1]     \n"
                        "fmla   v13.4s, v12.4s, %16.s[2]    \n"

                        "fmla   v8.4s, v9.4s, %15.s[0]      \n"
                        "fmla   v14.4s, v11.4s, %15.s[1]    \n"
                        "fmla   v15.4s, v12.4s, %15.s[2]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v9.4s, v10.4s}, [%6]       \n" // r3
                        "add    %6, %6, #16                 \n"

                        "fmla   v8.4s, v9.4s, %16.s[0]      \n"

                        "ext    v11.16b, v9.16b, v10.16b, #4 \n"
                        "ext    v12.16b, v9.16b, v10.16b, #8 \n"

                        "fmla   v14.4s, v11.4s, %16.s[1]    \n"
                        "fmla   v15.4s, v12.4s, %16.s[2]    \n"

                        "fadd   v7.4s, v7.4s, v6.4s         \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v9.4s, v10.4s}, [%3]       \n" // r0

                        "fadd   v8.4s, v8.4s, v14.4s        \n"
                        "fadd   v7.4s, v7.4s, v13.4s        \n"
                        "fadd   v8.4s, v8.4s, v15.4s        \n"

                        "ext    v11.16b, v9.16b, v10.16b, #4 \n"
                        "ext    v12.16b, v9.16b, v10.16b, #8 \n"

                        "add    %3, %3, #16                 \n"

                        "st1    {v7.4s}, [%1], #16          \n"
                        "st1    {v8.4s}, [%2], #16          \n"

                        "subs   %w0, %w0, #1                \n"
                        "bne    0b                          \n"

                        "sub    %3, %3, #16                 \n"
                        : "=r"(nn),      // %0
                        "=r"(outptr),  // %1
                        "=r"(outptr2), // %2
                        "=r"(r0),      // %3
                        "=r"(r1),      // %4
                        "=r"(r2),      // %5
                        "=r"(r3)       // %6
                        : "0"(nn),
                        "1"(outptr),
                        "2"(outptr2),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "6"(r3),
                        "w"(_k0123), // %14
                        "w"(_k3456), // %15
                        "w"(_k6789)  // %16
                        : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                if (nn > 0)
                {
                    asm volatile(
                        "pld        [%3, #192]          \n"
                        "vld1.f32   {d18-d20}, [%3 :64] \n" // r0
                        "add        %3, #16             \n"

                        "vext.32    q11, q9, q10, #1    \n"
                        "vext.32    q12, q9, q10, #2    \n"

                        "0:                             \n"

                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d14-d15}, [%1 :64] \n" // _sum

                        "vmla.f32   q7, q9, %e14[0]     \n"
                        "vmul.f32   q6, q11, %e14[1]    \n"
                        "vmul.f32   q13, q12, %f14[0]   \n"

                        "pld        [%4, #192]          \n"
                        "vld1.f32   {d18-d20}, [%4]     \n" // r1
                        "add        %4, #16             \n"

                        "vmla.f32   q7, q9, %e15[0]     \n"

                        "vext.32    q11, q9, q10, #1    \n"
                        "vext.32    q12, q9, q10, #2    \n"

                        "vmla.f32   q6, q11, %e15[1]    \n"
                        "vmla.f32   q13, q12, %f15[0]   \n"

                        "pld        [%2, #128]          \n"
                        "vld1.f32   {d16-d17}, [%2]     \n" // _sum2

                        "vmla.f32   q8, q9, %e14[0]     \n"
                        "vmul.f32   q14, q11, %e14[1]   \n"
                        "vmul.f32   q15, q12, %f14[0]   \n"

                        "pld        [%5, #192]          \n"
                        "vld1.f32   {d18-d20}, [%5 :64] \n" // r2
                        "add        %5, #16             \n"

                        "vmla.f32   q7, q9, %e16[0]     \n"

                        "vext.32    q11, q9, q10, #1    \n"
                        "vext.32    q12, q9, q10, #2    \n"

                        "vmla.f32   q6, q11, %e16[1]    \n"
                        "vmla.f32   q13, q12, %f16[0]   \n"

                        "vmla.f32   q8, q9, %e15[0]     \n"
                        "vmla.f32   q14, q11, %e15[1]   \n"
                        "vmla.f32   q15, q12, %f15[0]   \n"

                        "pld        [%6, #192]          \n"
                        "vld1.f32   {d18-d20}, [%6]     \n" // r3
                        "add        %6, #16             \n"

                        "vmla.f32   q8, q9, %e16[0]     \n"

                        "vext.32    q11, q9, q10, #1    \n"
                        "vext.32    q12, q9, q10, #2    \n"

                        "vmla.f32   q14, q11, %e16[1]   \n"
                        "vmla.f32   q15, q12, %f16[0]   \n"

                        "vadd.f32   q7, q7, q6          \n"

                        "pld        [%3, #192]          \n"
                        "vld1.f32   {d18-d20}, [%3 :64] \n" // r0

                        "vadd.f32   q8, q8, q14         \n"
                        "vadd.f32   q7, q7, q13         \n"
                        "vadd.f32   q8, q8, q15         \n"

                        "vext.32    q11, q9, q10, #1    \n"
                        "vext.32    q12, q9, q10, #2    \n"

                        "add        %3, #16             \n"

                        "vst1.f32   {d14-d15}, [%1]!    \n"
                        "vst1.f32   {d16-d17}, [%2]!    \n"

                        "subs       %0, #1              \n"
                        "bne        0b                  \n"

                        "sub        %3, #16             \n"
                        : "=r"(nn),      // %0
                        "=r"(outptr),  // %1
                        "=r"(outptr2), // %2
                        "=r"(r0),      // %3
                        "=r"(r1),      // %4
                        "=r"(r2),      // %5
                        "=r"(r3)       // %6
                        : "0"(nn),
                        "1"(outptr),
                        "2"(outptr2),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "6"(r3),
                        "w"(_k0123), // %14
                        "w"(_k3456), // %15
                        "w"(_k6789)  // %16
                        : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r20 = vld1q_f32(r2);
                    float32x4_t _r30 = vld1q_f32(r3);

                    float32x4_t _sum = vmulq_f32(_r00, _k0123);
                    _sum = vmlaq_f32(_sum, _r10, _k3456);
                    _sum = vmlaq_f32(_sum, _r20, _k6789);

                    float32x4_t _sum2 = vmulq_f32(_r10, _k0123);
                    _sum2 = vmlaq_f32(_sum2, _r20, _k3456);
                    _sum2 = vmlaq_f32(_sum2, _r30, _k6789);

                    _sum = vsetq_lane_f32(*outptr, _sum, 3);
                    _sum2 = vsetq_lane_f32(*outptr2, _sum2, 3);

#if __aarch64__
                    *outptr = vaddvq_f32(_sum);
                    *outptr2 = vaddvq_f32(_sum2);
#else
                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                    float32x2_t _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));

                    float32x2_t _sss2 = vpadd_f32(_ss, _ss2);

                    *outptr = vget_lane_f32(_sss2, 0);
                    *outptr2 = vget_lane_f32(_sss2, 1);
#endif // __aarch64__
#else
                    float sum = 0;
                    float sum2 = 0;

                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];

                    sum2 += r1[0] * k0[0];
                    sum2 += r1[1] * k0[1];
                    sum2 += r1[2] * k0[2];
                    sum2 += r2[0] * k1[0];
                    sum2 += r2[1] * k1[1];
                    sum2 += r2[2] * k1[2];
                    sum2 += r3[0] * k2[0];
                    sum2 += r3[1] * k2[1];
                    sum2 += r3[2] * k2[2];

                    *outptr += sum;
                    *outptr2 += sum2;
#endif
                    r0++;
                    r1++;
                    r2++;
                    r3++;
                    outptr++;
                    outptr2++;
                }

                r0 += 2 + w;
                r1 += 2 + w;
                r2 += 2 + w;
                r3 += 2 + w;

                outptr += outw;
                outptr2 += outw;
            }

            for (; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw & 3;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%2]        \n" // r0
                        "add    %2, %2, #16                 \n"

                        "ext    v10.16b, v8.16b, v9.16b, #4 \n"
                        "ext    v11.16b, v8.16b, v9.16b, #8 \n"

                        "0:                                 \n"

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v7.4s}, [%1]               \n" // _sum

                        "fmla   v7.4s, v8.4s, %10.s[0]      \n"
                        "fmul   v13.4s, v10.4s, %10.s[1]    \n"
                        "fmul   v14.4s, v11.4s, %10.s[2]    \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%3]        \n" // r1
                        "add    %3, %3, #16                 \n"

                        "fmla   v7.4s, v8.4s, %11.s[0]      \n"

                        "ext    v10.16b, v8.16b, v9.16b, #4 \n"
                        "ext    v11.16b, v8.16b, v9.16b, #8 \n"

                        "fmla   v13.4s, v10.4s, %11.s[1]    \n"
                        "fmla   v14.4s, v11.4s, %11.s[2]    \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%4]        \n" // r2
                        "add    %4, %4, #16                 \n"

                        "fmla   v7.4s, v8.4s, %12.s[0]      \n"

                        "ext    v10.16b, v8.16b, v9.16b, #4 \n"
                        "ext    v11.16b, v8.16b, v9.16b, #8 \n"

                        "fmla   v13.4s, v10.4s, %12.s[1]    \n"
                        "fmla   v14.4s, v11.4s, %12.s[2]    \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%2]        \n" // r0
                        "add    %2, %2, #16                 \n"

                        "fadd   v7.4s, v7.4s, v13.4s        \n"
                        "fadd   v7.4s, v7.4s, v14.4s        \n"

                        "ext    v10.16b, v8.16b, v9.16b, #4 \n"
                        "ext    v11.16b, v8.16b, v9.16b, #8 \n"

                        "st1    {v7.4s}, [%1], #16          \n"

                        "subs   %w0, %w0, #1                \n"
                        "bne    0b                          \n"

                        "sub    %2, %2, #16                 \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2)      // %4
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k0123), // %10
                        "w"(_k3456), // %11
                        "w"(_k6789)  // %12
                        : "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                if (nn > 0)
                {
                    asm volatile(
                        "pld        [%2, #192]          \n"
                        "vld1.f32   {d16-d18}, [%2]     \n" // r0
                        "add        %2, #16             \n"

                        "vext.32    q10, q8, q9, #1     \n"
                        "vext.32    q11, q8, q9, #2     \n"

                        "0:                             \n"

                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d14-d15}, [%1]     \n" // _sum

                        "vmla.f32   q7, q8, %e10[0]     \n"
                        "vmul.f32   q13, q10, %e10[1]   \n"
                        "vmul.f32   q14, q11, %f10[0]   \n"

                        "pld        [%3, #192]          \n"
                        "vld1.f32   {d16-d18}, [%3]     \n" // r1
                        "add        %3, #16             \n"

                        "vmla.f32   q7, q8, %e11[0]     \n"

                        "vext.32    q10, q8, q9, #1     \n"
                        "vext.32    q11, q8, q9, #2     \n"

                        "vmla.f32   q13, q10, %e11[1]   \n"
                        "vmla.f32   q14, q11, %f11[0]   \n"

                        "pld        [%4, #192]          \n"
                        "vld1.f32   {d16-d18}, [%4]     \n" // r2
                        "add        %4, #16             \n"

                        "vmla.f32   q7, q8, %e12[0]     \n"

                        "vext.32    q10, q8, q9, #1     \n"
                        "vext.32    q11, q8, q9, #2     \n"

                        "vmla.f32   q13, q10, %e12[1]   \n"
                        "vmla.f32   q14, q11, %f12[0]   \n"

                        "pld        [%2, #192]          \n"
                        "vld1.f32   {d16-d18}, [%2]     \n" // r0
                        "add        %2, #16             \n"

                        "vadd.f32   q7, q7, q13         \n"
                        "vadd.f32   q7, q7, q14         \n"

                        "vext.32    q10, q8, q9, #1     \n"
                        "vext.32    q11, q8, q9, #2     \n"

                        "vst1.f32   {d14-d15}, [%1]!    \n"

                        "subs       %0, #1              \n"
                        "bne        0b                  \n"

                        "sub        %2, #16             \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2)      // %4
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k0123), // %10
                        "w"(_k3456), // %11
                        "w"(_k6789)  // %12
                        : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r20 = vld1q_f32(r2);

                    float32x4_t _sum = vmulq_f32(_r00, _k0123);
                    _sum = vmlaq_f32(_sum, _r10, _k3456);
                    _sum = vmlaq_f32(_sum, _r20, _k6789);

                    _sum = vsetq_lane_f32(*outptr, _sum, 3);

#if __aarch64__
                    *outptr = vaddvq_f32(_sum);
#else
                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                    _ss = vpadd_f32(_ss, _ss);

                    *outptr = vget_lane_f32(_ss, 0);
#endif // __aarch64__
#else
                    float sum = 0;

                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];

                    *outptr += sum;
#endif
                    r0++;
                    r1++;
                    r2++;
                    outptr++;
                }

                r0 += 2;
                r1 += 2;
                r2 += 2;
            }

            kernel0 += 9;
        }
    }
}

// 3x3 convolution with stride 2 on float data.
//
// Every output channel of top_blob is first filled with its bias value (0.f when the
// pointer obtained from _bias is null), then the contribution of each input channel of
// bottom_blob is accumulated on top of it.
// Output channels are processed in pairs (sharing the loaded input rows); an odd
// leftover channel is handled by a second, single-channel pass.
//
//   bottom_blob  input feature map; only w and c are read here, rows are addressed as img + w * k
//   top_blob     output feature map; its w/h/c drive all loop bounds
//                NOTE(review): presumably already allocated by the caller - confirm
//   _kernel      weights, read as 9 consecutive floats per (output channel, input channel) pair
//   _bias        per-output-channel bias, may convert to a null pointer
//   opt          only opt.num_threads is used (OpenMP thread count)
//
// NOTE(review): the NEON paths load 4 floats where only 3 are needed, both for kernel rows
// (e.g. k0 + 6 reads k0[6..9]) and for input rows in the scalar-tail loop. The extra lane is
// never used in the result, but the load itself touches one float past the data; this looks
// like it relies on the buffers being padded - verify against Mat allocation.
static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // One output row advances the input row pointers by 2 * outw floats.
    // tailstep skips the unread rest of that input row (w - 2 * outw) plus one
    // whole row (w), so the pointers land two input rows further down.
    const int tailstep = w - 2 * outw + w;

    const float* kernel = _kernel;
    const float* bias = _bias;

    // number of output-channel pairs, and index of the first channel not covered by a pair
    int nn_outch = outch >> 1;
    int remain_outch_start = nn_outch << 1;

    // ---- pass 1: two output channels (p, p + 1) per iteration ----
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 2;

        Mat out0 = top_blob.channel(p);
        Mat out1 = top_blob.channel(p + 1);

        const float bias0 = bias ? bias[p] : 0.f;
        const float bias1 = bias ? bias[p + 1] : 0.f;

        // outputs start at the bias; the per-input-channel loop below only accumulates
        out0.fill(bias0);
        out1.fill(bias1);

        // 3x3 weights for the current input channel of output channels p and p + 1
        const float* k0 = kernel + p * inch * 9;
        const float* k1 = kernel + (p + 1) * inch * 9;

        for (int q = 0; q < inch; q++)
        {
            float* outptr0 = out0;
            float* outptr1 = out1;

            const float* img0 = bottom_blob.channel(q);

            // three consecutive input rows covered by the 3x3 window
            const float* r0 = img0;
            const float* r1 = img0 + w;
            const float* r2 = img0 + w * 2;

#if __ARM_NEON
            // One vector per kernel row; only lanes 0..2 are meaningful, lane 3 is
            // whatever follows in memory (see the NOTE(review) in the function header).
            float32x4_t _k00 = vld1q_f32(k0);
            float32x4_t _k03 = vld1q_f32(k0 + 3);
            float32x4_t _k06 = vld1q_f32(k0 + 6);

            float32x4_t _k10 = vld1q_f32(k1);
            float32x4_t _k13 = vld1q_f32(k1 + 3);
            float32x4_t _k16 = vld1q_f32(k1 + 6);
#endif // __ARM_NEON

            int i = 0;

            for (; i < outh; i++)
            {
#if __ARM_NEON
                // nn blocks of 4 output pixels go through the assembly loop,
                // the remaining 0..3 pixels through the tail loop below
                int nn = outw >> 2;
                int remain = outw & 3;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    // Operands: %0 = nn, %1/%2 = outptr0/outptr1, %3..%5 = r0..r2,
                    // %6..%11 are the matching-constraint inputs tied to %0..%5,
                    // %12..%14 = kernel rows for channel p, %15..%17 = kernel rows for channel p + 1.
                    //
                    // Per row: ld2 de-interleaves 8 floats into even-indexed (v8) and
                    // odd-indexed (v9) elements, i.e. window columns 0 and 1 for 4 outputs.
                    // A second, non-advancing ld2 fetches the next even elements (v10) so that
                    // ext can build window column 2 (v14 = v8 shifted by one element).
                    // v6/v7 accumulate on top of the stored outputs, v12/v13 hold partial
                    // sums that are folded in at the end of each iteration.
                    asm volatile(
                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld2    {v8.4s, v9.4s}, [%3], #32   \n" // v8 v9 = r0

                        "0:                                 \n"

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v6.4s}, [%1]               \n" // v6 = _sum0

                        "fmul   v12.4s, v8.4s, %12.s[0]     \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v7.4s}, [%2]               \n" // v7 = _sum1

                        "fmul   v13.4s, v8.4s, %15.s[0]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld2    {v10.4s, v11.4s}, [%3]      \n" // v10 = next even elements of r0 (no pointer advance)

                        "fmla   v6.4s, v9.4s, %12.s[1]      \n"

                        "ext    v14.16b, v8.16b, v10.16b, #4\n" // v14 = window column 2 of r0

                        "fmla   v7.4s, v9.4s, %15.s[1]      \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld2    {v8.4s, v9.4s}, [%4], #32   \n" // r1

                        "fmla   v12.4s, v14.4s, %12.s[2]    \n"
                        "fmla   v13.4s, v14.4s, %15.s[2]    \n"

                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld2    {v10.4s, v11.4s}, [%4]      \n" // next even elements of r1

                        "fmla   v6.4s, v8.4s, %13.s[0]      \n"
                        "fmla   v7.4s, v8.4s, %16.s[0]      \n"

                        "ext    v14.16b, v8.16b, v10.16b, #4\n" // window column 2 of r1

                        "fmla   v12.4s, v9.4s, %13.s[1]     \n"
                        "fmla   v13.4s, v9.4s, %16.s[1]     \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld2    {v8.4s, v9.4s}, [%5], #32   \n" // r2

                        "fmla   v6.4s, v14.4s, %13.s[2]     \n"
                        "fmla   v7.4s, v14.4s, %16.s[2]     \n"

                        "prfm   pldl1keep, [%5, #128]       \n"
                        "ld2    {v10.4s, v11.4s}, [%5]      \n" // next even elements of r2

                        "fmla   v12.4s, v8.4s, %14.s[0]     \n"
                        "fmla   v13.4s, v8.4s, %17.s[0]     \n"

                        "ext    v14.16b, v8.16b, v10.16b, #4\n" // window column 2 of r2

                        "fmla   v6.4s, v9.4s, %14.s[1]      \n"
                        "fmla   v7.4s, v9.4s, %17.s[1]      \n"

                        "fmla   v12.4s, v14.4s, %14.s[2]    \n"
                        "fmla   v13.4s, v14.4s, %17.s[2]    \n"

                        // preload r0 for the next iteration (undone after the loop)
                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld2    {v8.4s, v9.4s}, [%3], #32   \n" // v8 v9 = r0

                        "fadd   v6.4s, v6.4s, v12.4s        \n"
                        "fadd   v7.4s, v7.4s, v13.4s        \n"

                        "subs   %w0, %w0, #1                \n"

                        "st1    {v6.4s}, [%1], #16          \n"
                        "st1    {v7.4s}, [%2], #16          \n"

                        "bne    0b                          \n"
                        // rewind the r0 preload done by the last iteration
                        "sub    %3, %3, #32                 \n"

                        : "=r"(nn),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1), // %2
                        "=r"(r0),      // %3
                        "=r"(r1),      // %4
                        "=r"(r2)       // %5
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "w"(_k00), // %12
                        "w"(_k03), // %13
                        "w"(_k06), // %14
                        "w"(_k10), // %15
                        "w"(_k13), // %16
                        "w"(_k16)  // %17
                        : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                if (nn > 0)
                {
                    // ARMv7 version of the loop above, same operand numbering.
                    // q8/q9 = even/odd elements of the row (vld2), q10 = following elements,
                    // q11 = window column 2 built with vext, q6/q7 = accumulators loaded from
                    // the outputs, q12/q13 = partial sums.
                    // %eN / %fN select the low / high D register of the Q operand N, so
                    // %eN[0], %eN[1], %fN[0] are lanes 0, 1, 2 of a kernel row.
                    asm volatile(
                        "pld        [%3, #256]          \n"
                        "vld2.f32   {d16-d19}, [%3]!    \n" // q8 q9 = r0

                        "0:                             \n"

                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d12-d13}, [%1]     \n" // q6 = _sum0

                        "vmul.f32   q12, q8, %e12[0]    \n"

                        "pld        [%2, #128]          \n"
                        "vld1.f32   {d14-d15}, [%2]     \n" // q7 = _sum1

                        "vmul.f32   q13, q8, %e15[0]    \n"

                        "pld        [%3, #128]          \n"
                        "vld2.f32   {d20-d21}, [%3]     \n" // q10

                        "vmla.f32   q6, q9, %e12[1]     \n"

                        "vext.32    q11, q8, q10, #1    \n" // q11 = window column 2 of r0

                        "vmla.f32   q7, q9, %e15[1]     \n"

                        "pld        [%4, #256]          \n"
                        "vld2.f32   {d16-d19}, [%4]!    \n" // r1

                        "vmla.f32   q12, q11, %f12[0]   \n"
                        "vmla.f32   q13, q11, %f15[0]   \n"

                        "pld        [%4, #128]          \n"
                        "vld2.f32   {d20-d21}, [%4]     \n"

                        "vmla.f32   q6, q8, %e13[0]     \n"
                        "vmla.f32   q7, q8, %e16[0]     \n"

                        "vext.32    q11, q8, q10, #1    \n" // window column 2 of r1

                        "vmla.f32   q12, q9, %e13[1]    \n"
                        "vmla.f32   q13, q9, %e16[1]    \n"

                        "pld        [%5, #256]          \n"
                        "vld2.f32   {d16-d19}, [%5]!    \n" // r2

                        "vmla.f32   q6, q11, %f13[0]    \n"
                        "vmla.f32   q7, q11, %f16[0]    \n"

                        "pld        [%5, #128]          \n"
                        "vld2.f32   {d20-d21}, [%5]     \n"

                        "vmla.f32   q12, q8, %e14[0]    \n"
                        "vmla.f32   q13, q8, %e17[0]    \n"

                        "vext.32    q11, q8, q10, #1    \n" // window column 2 of r2

                        "vmla.f32   q6, q9, %e14[1]     \n"
                        "vmla.f32   q7, q9, %e17[1]     \n"

                        "vmla.f32   q12, q11, %f14[0]   \n"
                        "vmla.f32   q13, q11, %f17[0]   \n"

                        // preload r0 for the next iteration (undone after the loop)
                        "pld        [%3, #256]          \n"
                        "vld2.f32   {d16-d19}, [%3]!    \n" // q8 q9 = r0

                        "vadd.f32   q6, q6, q12         \n"
                        "vadd.f32   q7, q7, q13         \n"

                        "subs       %0, #1              \n"

                        "vst1.f32   {d12-d13}, [%1]!    \n"
                        "vst1.f32   {d14-d15}, [%2]!    \n"

                        "bne        0b                  \n"
                        // rewind the r0 preload done by the last iteration
                        "sub        %3, #32             \n"

                        : "=r"(nn),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1), // %2
                        "=r"(r0),      // %3
                        "=r"(r1),      // %4
                        "=r"(r2)       // %5
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "w"(_k00), // %12
                        "w"(_k03), // %13
                        "w"(_k06), // %14
                        "w"(_k10), // %15
                        "w"(_k13), // %16
                        "w"(_k16)  // %17
                        : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                // tail: one output pixel per iteration
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
                    // loads 4 floats per row, only the first 3 belong to the window
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r20 = vld1q_f32(r2);

                    float32x4_t _sum0 = vmulq_f32(_r00, _k00);
                    float32x4_t _sum1 = vmulq_f32(_r00, _k10);
                    _sum0 = vmlaq_f32(_sum0, _r10, _k03);
                    _sum1 = vmlaq_f32(_sum1, _r10, _k13);
                    _sum0 = vmlaq_f32(_sum0, _r20, _k06);
                    _sum1 = vmlaq_f32(_sum1, _r20, _k16);

                    // lane 3 holds a product outside the 3x3 window; overwrite it with the
                    // current output so the horizontal add yields "old output + 9 products"
                    _sum0 = vsetq_lane_f32(*outptr0, _sum0, 3);
                    _sum1 = vsetq_lane_f32(*outptr1, _sum1, 3);
#if __aarch64__
                    *outptr0 = vaddvq_f32(_sum0);
                    *outptr1 = vaddvq_f32(_sum1);
#else
                    // horizontal add of both vectors at once: low + high halves, then pairwise add
                    float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
                    float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
                    float32x2_t _ss01 = vpadd_f32(_ss0, _ss1);

                    *outptr0 = vget_lane_f32(_ss01, 0);
                    *outptr1 = vget_lane_f32(_ss01, 1);
#endif // __aarch64__
#else
                    // portable scalar fallback
                    float sum0 = 0.f;
                    float sum1 = 0.f;

                    sum0 += r0[0] * k0[0];
                    sum0 += r0[1] * k0[1];
                    sum0 += r0[2] * k0[2];
                    sum0 += r1[0] * k0[3];
                    sum0 += r1[1] * k0[4];
                    sum0 += r1[2] * k0[5];
                    sum0 += r2[0] * k0[6];
                    sum0 += r2[1] * k0[7];
                    sum0 += r2[2] * k0[8];

                    sum1 += r0[0] * k1[0];
                    sum1 += r0[1] * k1[1];
                    sum1 += r0[2] * k1[2];
                    sum1 += r1[0] * k1[3];
                    sum1 += r1[1] * k1[4];
                    sum1 += r1[2] * k1[5];
                    sum1 += r2[0] * k1[6];
                    sum1 += r2[1] * k1[7];
                    sum1 += r2[2] * k1[8];

                    *outptr0 += sum0;
                    *outptr1 += sum1;
#endif // __ARM_NEON

                    // stride 2: the window moves two input columns per output pixel
                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    outptr0++;
                    outptr1++;
                }

                // move to the input rows of the next output row (see tailstep above)
                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            // next input channel's 3x3 weights
            k0 += 9;
            k1 += 9;
        }
    }

    // ---- pass 2: leftover output channel(s), one at a time ----
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        out.fill(bias0);

        const float* kernel0 = kernel + p * inch * 9;

        for (int q = 0; q < inch; q++)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);

            const float* r0 = img0;
            const float* r1 = img0 + w;
            const float* r2 = img0 + w * 2;

            // the three rows of the current 3x3 kernel
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;

#if __ARM_NEON
            // Despite the names, each vector holds 4 consecutive floats starting at a
            // kernel row (offsets 0, 3, 6); only lanes 0..2 are used as weights.
            float32x4_t _k0123 = vld1q_f32(k0);
            float32x4_t _k3456 = vld1q_f32(k1);
            float32x4_t _k6789 = vld1q_f32(k2);
#endif // __ARM_NEON

            int i = 0;

            for (; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw & 3;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    // Operands: %0 = nn, %1 = outptr, %2..%4 = r0..r2,
                    // %5..%9 are the matching-constraint inputs tied to %0..%4,
                    // %10..%12 = the three kernel rows.
                    // v2/v3 = even/odd elements of a row (ld2), v8 = following even elements,
                    // v1 = window column 2 (ext), v0 = accumulator loaded from the output,
                    // v10/v11 = partial sums added to v0 at the end of each iteration.
                    asm volatile(
                        "prfm       pldl1keep, [%2, #256]          \n"
                        "ld2        {v2.4s, v3.4s}, [%2], #32      \n"
                        "0:                                        \n"

                        "prfm       pldl1keep, [%1, #128]          \n"
                        "ld1        {v0.4s}, [%1]                  \n"

                        "fmla       v0.4s,  v2.4s, %10.s[0]        \n"
                        "fmul       v10.4s, v3.4s, %10.s[1]        \n"

                        "prfm       pldl1keep, [%2, #256]          \n"
                        "ld2        {v8.4s, v9.4s}, [%2]           \n"
                        "ext        v1.16b, v2.16b, v8.16b, #4     \n"

                        "fmul       v11.4s, v1.4s, %10.s[2]        \n"

                        "prfm       pldl1keep, [%3, #256]          \n"
                        "ld2        {v2.4s, v3.4s}, [%3], #32      \n"

                        "fmla       v0.4s,  v2.4s, %11.s[0]        \n"
                        "fmla       v10.4s, v3.4s, %11.s[1]        \n"

                        "prfm       pldl1keep, [%3, #256]          \n"
                        "ld2        {v8.4s, v9.4s}, [%3]           \n"
                        "ext        v1.16b, v2.16b, v8.16b, #4     \n"

                        "fmla       v11.4s, v1.4s, %11.s[2]        \n"

                        "prfm       pldl1keep, [%4, #256]          \n"
                        "ld2        {v2.4s, v3.4s}, [%4], #32      \n"

                        "fmla       v0.4s,  v2.4s, %12.s[0]        \n"
                        "fmla       v10.4s, v3.4s, %12.s[1]        \n"

                        "prfm       pldl1keep, [%4, #256]          \n"
                        "ld2        {v8.4s, v9.4s}, [%4]           \n"
                        "ext        v1.16b, v2.16b, v8.16b, #4     \n"

                        "fmla       v11.4s, v1.4s, %12.s[2]        \n"

                        // preload r0 for the next iteration (undone after the loop)
                        "prfm       pldl1keep, [%2, #256]          \n"
                        "ld2        {v2.4s, v3.4s}, [%2], #32      \n"

                        "fadd       v0.4s, v0.4s, v10.4s           \n"
                        "fadd       v0.4s, v0.4s, v11.4s           \n"

                        "subs       %w0, %w0, #1                   \n"
                        "st1        {v0.4s}, [%1], #16             \n"
                        "bne        0b                             \n"
                        // rewind the r0 preload done by the last iteration
                        "sub        %2, %2, #32                    \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2)      // %4
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k0123), // %10
                        "w"(_k3456), // %11
                        "w"(_k6789)  // %12
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                if (nn > 0)
                {
                    // ARMv7 version of the loop above, same operand numbering.
                    // q2/q3 = even/odd elements of a row (vld2), q8 = following elements,
                    // q1 = window column 2 (vext), q0 = accumulator, q10/q11 = partial sums.
                    // %eN[0], %eN[1], %fN[0] are lanes 0, 1, 2 of kernel-row operand N.
                    asm volatile(
                        "pld        [%2, #256]          \n"
                        "vld2.f32   {d4-d7}, [%2]!      \n"

                        "0:                             \n"
                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d0-d1}, [%1]       \n"

                        "vmla.f32   q0, q2, %e10[0]     \n"
                        "vmul.f32   q10, q3, %e10[1]    \n"

                        "pld        [%2, #128]          \n"
                        "vld2.f32   {d16-d17}, [%2]     \n"
                        "vext.32    q1, q2, q8, #1      \n"

                        "vmul.f32   q11, q1, %f10[0]    \n"

                        "pld        [%3, #256]          \n"
                        "vld2.f32   {d4-d7}, [%3]!      \n"

                        "vmla.f32   q0, q2, %e11[0]     \n"
                        "vmla.f32   q10, q3, %e11[1]    \n"

                        "pld        [%3, #128]          \n"
                        "vld2.f32   {d16-d17}, [%3]     \n"
                        "vext.32    q1, q2, q8, #1      \n"

                        "vmla.f32   q11, q1, %f11[0]    \n"

                        "pld        [%4, #256]          \n"
                        "vld2.f32   {d4-d7}, [%4]!      \n"

                        "vmla.f32   q0, q2, %e12[0]     \n"
                        "vmla.f32   q10, q3, %e12[1]    \n"

                        "pld        [%4, #128]          \n"
                        "vld2.f32   {d16-d17}, [%4]     \n"
                        "vext.32    q1, q2, q8, #1      \n"

                        "vmla.f32   q11, q1, %f12[0]    \n"

                        // preload r0 for the next iteration (undone after the loop)
                        "pld        [%2, #256]          \n"
                        "vld2.f32   {d4-d7}, [%2]!      \n"

                        "vadd.f32   q0, q0, q10         \n"
                        "vadd.f32   q0, q0, q11         \n"

                        "subs       %0, #1              \n"
                        "vst1.f32   {d0-d1}, [%1]!      \n"
                        "bne        0b                  \n"
                        // rewind the r0 preload done by the last iteration
                        "sub        %2, #32             \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2)      // %4
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k0123), // %10
                        "w"(_k3456), // %11
                        "w"(_k6789)  // %12
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                // tail: one output pixel per iteration
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
                    // loads 4 floats per row, only the first 3 belong to the window
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r20 = vld1q_f32(r2);

                    float32x4_t _sum = vmulq_f32(_r00, _k0123);
                    _sum = vmlaq_f32(_sum, _r10, _k3456);
                    _sum = vmlaq_f32(_sum, _r20, _k6789);

                    // replace the out-of-window lane with the current output so the
                    // horizontal add below produces "old output + 9 products"
                    _sum = vsetq_lane_f32(*outptr, _sum, 3);

#if __aarch64__
                    *outptr = vaddvq_f32(_sum);
#else
                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                    _ss = vpadd_f32(_ss, _ss);

                    *outptr = vget_lane_f32(_ss, 0);
#endif // __aarch64__
#else
                    // portable scalar fallback
                    float sum = 0;

                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];

                    *outptr += sum;
#endif // __ARM_NEON

                    // stride 2: the window moves two input columns per output pixel
                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    outptr++;
                }

                // move to the input rows of the next output row (see tailstep above)
                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            // next input channel's 3x3 weights
            kernel0 += 9;
        }
    }
}

// Repack 3x3 convolution weights into kernel_tm.
//
// _kernel is read as 9 consecutive floats per (output channel, input channel) pair,
// output channel major. kernel_tm is (re)created with width 8 * 9, height inch and
// outch / 8 + outch % 8 channels:
//   - every complete group of 8 output channels shares one channel (index p / 8); inside it,
//     for each input channel and each of the 9 taps, the 8 output channels' weights for
//     that tap are stored next to each other;
//   - every leftover output channel gets a channel of its own (index p / 8 + p % 8) holding
//     its inch * 9 weights in the original order, written from the start of that channel.
static void conv3x3s2_transform_kernel_neon(const Mat& _kernel, Mat& kernel_tm, int inch, int outch)
{
    kernel_tm.create(8 * 9, inch, outch / 8 + outch % 8);

    const float* kernel = _kernel;

    int p = 0;

    // complete groups of 8 output channels: interleave tap by tap
    while (p + 7 < outch)
    {
        const float* src[8];
        for (int j = 0; j < 8; j++)
        {
            src[j] = kernel + (p + j) * inch * 9;
        }

        float* dst = kernel_tm.channel(p / 8);

        for (int q = 0; q < inch; q++)
        {
            for (int k = 0; k < 9; k++)
            {
                // offset of tap k of input channel q inside one output channel's weights
                const int tap = q * 9 + k;

                for (int j = 0; j < 8; j++)
                {
                    dst[j] = src[j][tap];
                }
                dst += 8;
            }
        }

        p += 8;
    }

    // leftover output channels: straight copy, one destination channel each
    while (p < outch)
    {
        const float* src = kernel + p * inch * 9;

        float* dst = kernel_tm.channel(p / 8 + p % 8);

        for (int q = 0; q < inch; q++, src += 9, dst += 9)
        {
            for (int k = 0; k < 9; k++)
            {
                dst[k] = src[k];
            }
        }

        p++;
    }
}

static void conv3x3s2_packed_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int tailstep = w - 2 * outw + w;

    //     const float* kernel = _kernel;
    const float* bias = _bias;

    int nn_outch = outch >> 3;
    int remain_outch_start = nn_outch << 3;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 8;

        Mat out0 = top_blob.channel(p + 0);
        Mat out1 = top_blob.channel(p + 1);
        Mat out2 = top_blob.channel(p + 2);
        Mat out3 = top_blob.channel(p + 3);
        Mat out4 = top_blob.channel(p + 4);
        Mat out5 = top_blob.channel(p + 5);
        Mat out6 = top_blob.channel(p + 6);
        Mat out7 = top_blob.channel(p + 7);

        const float bias0 = bias ? bias[p + 0] : 0.f;
        const float bias1 = bias ? bias[p + 1] : 0.f;
        const float bias2 = bias ? bias[p + 2] : 0.f;
        const float bias3 = bias ? bias[p + 3] : 0.f;
        const float bias4 = bias ? bias[p + 4] : 0.f;
        const float bias5 = bias ? bias[p + 5] : 0.f;
        const float bias6 = bias ? bias[p + 6] : 0.f;
        const float bias7 = bias ? bias[p + 7] : 0.f;

        out0.fill(bias0);
        out1.fill(bias1);
        out2.fill(bias2);
        out3.fill(bias3);
        out4.fill(bias4);
        out5.fill(bias5);
        out6.fill(bias6);
        out7.fill(bias7);

        const float* ktmp = _kernel.channel(p / 8);

        for (int q = 0; q < inch; q++)
        {
            float* outptr0 = out0;
            float* outptr1 = out1;
            float* outptr2 = out2;
            float* outptr3 = out3;
            float* outptr4 = out4;
            float* outptr5 = out5;
            float* outptr6 = out6;
            float* outptr7 = out7;

            const float* img0 = bottom_blob.channel(q);

            const float* r0 = img0;
            const float* r1 = img0 + w;
            const float* r2 = img0 + w * 2;

            int i = 0;

            for (; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw & 3;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                // AArch64 vector path: every pass through label 0 updates four adjacent
                // output columns of all eight output channels handled by this iteration.
                //
                // Operand roles (see the constraint list at the bottom):
                //   %0       nn, number of 4-column groups left (decremented by subs)
                //   %1..%8   outptr0..outptr7, read-modify-write, advanced 16 bytes per pass
                //   %9..%11  r0, r1, r2, the three input rows, advanced 32 bytes per pass
                //   %12      ktmp, advanced while the 9 taps are consumed, then rewound
                //
                // Register use inside the loop:
                //   v8..v15  running sums for outptr0..outptr7
                //   v0..v3   kernel weights, eight floats per tap (one lane per output channel)
                //   v4/v5    even/odd de-interleaved input columns, v6 the "+2" column
                if (nn > 0)
                {
                    asm volatile(
                        "0:                                 \n"

                        // fetch the current values of output channels 0..3 (pointers not advanced yet)
                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v8.4s}, [%1]               \n"
                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v9.4s}, [%2]               \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v10.4s}, [%3]              \n"
                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v11.4s}, [%4]              \n"

                        /// row r0
                        // ld2 splits 8 consecutive floats into even positions (v4) and odd positions (v5)
                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld2    {v4.4s, v5.4s}, [%9], #32   \n" // v4=00 v5=01

                        // weights of the first tap for the eight output channels
                        "ld1    {v0.4s, v1.4s}, [%12], #32  \n"

                        "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                        "fmla   v9.4s, v4.4s, v0.s[1]       \n"

                        // fetch the current values of output channels 4..7, interleaved with the math
                        "prfm   pldl1keep, [%5, #128]       \n"
                        "ld1    {v12.4s}, [%5]              \n"
                        "prfm   pldl1keep, [%6, #128]       \n"
                        "ld1    {v13.4s}, [%6]              \n"

                        "fmla   v10.4s, v4.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v4.4s, v0.s[3]      \n"

                        "prfm   pldl1keep, [%7, #128]       \n"
                        "ld1    {v14.4s}, [%7]              \n"
                        "prfm   pldl1keep, [%8, #128]       \n"
                        "ld1    {v15.4s}, [%8]              \n"

                        "ld1    {v2.4s, v3.4s}, [%12], #32  \n"

                        "fmla   v12.4s, v4.4s, v1.s[0]      \n"
                        "fmla   v13.4s, v4.4s, v1.s[1]      \n"
                        "fmla   v14.4s, v4.4s, v1.s[2]      \n"
                        "fmla   v15.4s, v4.4s, v1.s[3]      \n"

                        // peek at the following 8 floats without advancing; only lane 0 of v6 is
                        // used (by the ext below), v7 is loaded but never read
                        // NOTE(review): this reads up to 8 floats past the columns consumed by this
                        // pass - confirm the input rows are padded/allocated accordingly
                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld2    {v6.4s, v7.4s}, [%9]        \n" // v6

                        "fmla   v8.4s, v5.4s, v2.s[0]       \n"
                        "fmla   v9.4s, v5.4s, v2.s[1]       \n"
                        "fmla   v10.4s, v5.4s, v2.s[2]      \n"
                        "fmla   v11.4s, v5.4s, v2.s[3]      \n"

                        // v6 = { v4[1], v4[2], v4[3], v6[0] }: the even columns shifted by one
                        "ext    v6.16b, v4.16b, v6.16b, #4  \n" // v6=02

                        "ld1    {v0.4s, v1.4s}, [%12], #32  \n"

                        "fmla   v12.4s, v5.4s, v3.s[0]      \n"
                        "fmla   v13.4s, v5.4s, v3.s[1]      \n"
                        "fmla   v14.4s, v5.4s, v3.s[2]      \n"
                        "fmla   v15.4s, v5.4s, v3.s[3]      \n"

                        /// row r1, same even/odd/shifted scheme
                        "prfm   pldl1keep, [%10, #256]      \n"
                        "ld2    {v4.4s, v5.4s}, [%10], #32  \n" // v4=10 v5=11

                        "fmla   v8.4s, v6.4s, v0.s[0]       \n"
                        "fmla   v9.4s, v6.4s, v0.s[1]       \n"
                        "fmla   v10.4s, v6.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v6.4s, v0.s[3]      \n"

                        "ld1    {v2.4s, v3.4s}, [%12], #32  \n"

                        "fmla   v12.4s, v6.4s, v1.s[0]      \n"
                        "fmla   v13.4s, v6.4s, v1.s[1]      \n"
                        "fmla   v14.4s, v6.4s, v1.s[2]      \n"
                        "fmla   v15.4s, v6.4s, v1.s[3]      \n"

                        "fmla   v8.4s, v4.4s, v2.s[0]       \n"
                        "fmla   v9.4s, v4.4s, v2.s[1]       \n"
                        "fmla   v10.4s, v4.4s, v2.s[2]      \n"
                        "fmla   v11.4s, v4.4s, v2.s[3]      \n"

                        "ld1    {v0.4s, v1.4s}, [%12], #32  \n"

                        "fmla   v12.4s, v4.4s, v3.s[0]      \n"
                        "fmla   v13.4s, v4.4s, v3.s[1]      \n"
                        "fmla   v14.4s, v4.4s, v3.s[2]      \n"
                        "fmla   v15.4s, v4.4s, v3.s[3]      \n"

                        "prfm   pldl1keep, [%10, #256]      \n"
                        "ld2    {v6.4s, v7.4s}, [%10]       \n" // v6

                        "fmla   v8.4s, v5.4s, v0.s[0]       \n"
                        "fmla   v9.4s, v5.4s, v0.s[1]       \n"
                        "fmla   v10.4s, v5.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v5.4s, v0.s[3]      \n"

                        "ld1    {v2.4s, v3.4s}, [%12], #32  \n"

                        "ext    v6.16b, v4.16b, v6.16b, #4  \n" // v6=12

                        "fmla   v12.4s, v5.4s, v1.s[0]      \n"
                        "fmla   v13.4s, v5.4s, v1.s[1]      \n"
                        "fmla   v14.4s, v5.4s, v1.s[2]      \n"
                        "fmla   v15.4s, v5.4s, v1.s[3]      \n"

                        /// row r2, same even/odd/shifted scheme
                        "prfm   pldl1keep, [%11, #256]      \n"
                        "ld2    {v4.4s, v5.4s}, [%11], #32  \n" // v4=20 v5=21

                        "fmla   v8.4s, v6.4s, v2.s[0]       \n"
                        "fmla   v9.4s, v6.4s, v2.s[1]       \n"
                        "fmla   v10.4s, v6.4s, v2.s[2]      \n"
                        "fmla   v11.4s, v6.4s, v2.s[3]      \n"

                        "ld1    {v0.4s, v1.4s}, [%12], #32  \n"

                        "fmla   v12.4s, v6.4s, v3.s[0]      \n"
                        "fmla   v13.4s, v6.4s, v3.s[1]      \n"
                        "fmla   v14.4s, v6.4s, v3.s[2]      \n"
                        "fmla   v15.4s, v6.4s, v3.s[3]      \n"

                        "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                        "fmla   v9.4s, v4.4s, v0.s[1]       \n"
                        "fmla   v10.4s, v4.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v4.4s, v0.s[3]      \n"

                        "ld1    {v2.4s, v3.4s}, [%12], #32  \n"

                        "fmla   v12.4s, v4.4s, v1.s[0]      \n"
                        "fmla   v13.4s, v4.4s, v1.s[1]      \n"
                        "fmla   v14.4s, v4.4s, v1.s[2]      \n"
                        "fmla   v15.4s, v4.4s, v1.s[3]      \n"

                        "prfm   pldl1keep, [%11, #256]      \n"
                        "ld2    {v6.4s, v7.4s}, [%11]       \n" // v6

                        "fmla   v8.4s, v5.4s, v2.s[0]       \n"
                        "fmla   v9.4s, v5.4s, v2.s[1]       \n"
                        "fmla   v10.4s, v5.4s, v2.s[2]      \n"
                        "fmla   v11.4s, v5.4s, v2.s[3]      \n"

                        "ext    v6.16b, v4.16b, v6.16b, #4  \n" // v6=22

                        // ninth and last weight block of this input channel
                        "ld1    {v0.4s, v1.4s}, [%12], #32  \n"

                        "fmla   v12.4s, v5.4s, v3.s[0]      \n"
                        "fmla   v13.4s, v5.4s, v3.s[1]      \n"
                        "fmla   v14.4s, v5.4s, v3.s[2]      \n"
                        "fmla   v15.4s, v5.4s, v3.s[3]      \n"

                        "fmla   v8.4s, v6.4s, v0.s[0]       \n"
                        "fmla   v9.4s, v6.4s, v0.s[1]       \n"
                        "fmla   v10.4s, v6.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v6.4s, v0.s[3]      \n"

                        "fmla   v12.4s, v6.4s, v1.s[0]      \n"
                        "fmla   v13.4s, v6.4s, v1.s[1]      \n"

                        // write the finished sums back; stores are interleaved with the last fmla ops
                        "st1    {v8.4s}, [%1], #16          \n"
                        "st1    {v9.4s}, [%2], #16          \n"

                        "fmla   v14.4s, v6.4s, v1.s[2]      \n"
                        "fmla   v15.4s, v6.4s, v1.s[3]      \n"

                        "st1    {v10.4s}, [%3], #16         \n"
                        "st1    {v11.4s}, [%4], #16         \n"

                        // rewind ktmp: 9 taps * 8 channels * 4 bytes = 288 bytes were consumed
                        "sub    %12, %12, #288              \n"

                        "st1    {v12.4s}, [%5], #16         \n"
                        "st1    {v13.4s}, [%6], #16         \n"

                        // loop counter; st1 does not touch the flags, so bne still sees this result
                        "subs   %w0, %w0, #1                \n"

                        "st1    {v14.4s}, [%7], #16         \n"
                        "st1    {v15.4s}, [%8], #16         \n"

                        "bne    0b                          \n"
                        // every operand is an output tied to an identically numbered input below,
                        // i.e. each one is read and then updated by the asm
                        : "=r"(nn),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1), // %2
                        "=r"(outptr2), // %3
                        "=r"(outptr3), // %4
                        "=r"(outptr4), // %5
                        "=r"(outptr5), // %6
                        "=r"(outptr6), // %7
                        "=r"(outptr7), // %8
                        "=r"(r0),      // %9
                        "=r"(r1),      // %10
                        "=r"(r2),      // %11
                        "=r"(ktmp)     // %12
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr2),
                        "4"(outptr3),
                        "5"(outptr4),
                        "6"(outptr5),
                        "7"(outptr6),
                        "8"(outptr7),
                        "9"(r0),
                        "10"(r1),
                        "11"(r2),
                        "12"(ktmp)
                        // "cc" because of subs, "memory" because output/input buffers are accessed through pointers
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else  // __aarch64__
                // 32-bit ARM NEON path: same computation as the AArch64 branch, but the
                // loop over 4-column groups is driven from C and the asm body runs once
                // per group.
                //
                // Operand roles (see the constraint list at the bottom):
                //   %0..%7   outptr0..outptr7, read-modify-write, post-incremented by the stores
                //   %8..%10  r0, r1, r2, the three input rows, advanced 32 bytes per pass
                //   %11      ktmp, advanced while the 9 taps are consumed, then rewound
                //
                // Register use:
                //   q8..q15  running sums for outptr0..outptr7
                //   q0..q3   kernel weights, eight floats per tap (one lane per output channel)
                //   q4/q5    even/odd de-interleaved input columns, q6 the "+2" column
                for (; nn > 0; nn--)
                {
                    asm volatile(
                        // fetch the current values of output channels 0..3 (no writeback yet)
                        "pld        [%0, #128]          \n"
                        "vld1.f32   {d16-d17}, [%0]     \n"
                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d18-d19}, [%1]     \n"

                        "pld        [%2, #128]          \n"
                        "vld1.f32   {d20-d21}, [%2]     \n"
                        "pld        [%3, #128]          \n"
                        "vld1.f32   {d22-d23}, [%3]     \n"

                        /// row r0
                        // vld2 splits 8 consecutive floats into even positions (q4) and odd positions (q5)
                        "pld        [%8, #256]          \n"
                        "vld2.f32   {d8-d11}, [%8]!     \n" // q4=00 q5=01

                        // weights of the first tap; ":128" is an alignment qualifier on the kernel pointer
                        "vld1.f32   {d0-d3}, [%11 :128]! \n"

                        "vmla.f32   q8, q4, d0[0]       \n"
                        "vmla.f32   q9, q4, d0[1]       \n"

                        // fetch the current values of output channels 4..7, interleaved with the math
                        "pld        [%4, #128]          \n"
                        "vld1.f32   {d24-d25}, [%4]     \n"
                        "pld        [%5, #128]          \n"
                        "vld1.f32   {d26-d27}, [%5]     \n"

                        "vmla.f32   q10, q4, d1[0]      \n"
                        "vmla.f32   q11, q4, d1[1]      \n"

                        "pld        [%6, #128]          \n"
                        "vld1.f32   {d28-d29}, [%6]     \n"
                        "pld        [%7, #128]          \n"
                        "vld1.f32   {d30-d31}, [%7]     \n"

                        "vld1.f32   {d4-d7}, [%11 :128]! \n"

                        "vmla.f32   q12, q4, d2[0]      \n"
                        "vmla.f32   q13, q4, d2[1]      \n"
                        "vmla.f32   q14, q4, d3[0]      \n"
                        "vmla.f32   q15, q4, d3[1]      \n"

                        // peek at the following 4 floats without advancing; only d12[0] (the next
                        // even column) is used by the vext below
                        // NOTE(review): this reads up to 4 floats past the columns consumed by this
                        // pass - confirm the input rows are padded/allocated accordingly
                        "pld        [%8, #128]          \n"
                        "vld2.f32   {d12-d13}, [%8]     \n" // q6

                        "vmla.f32   q8, q5, d4[0]       \n"
                        "vmla.f32   q9, q5, d4[1]       \n"
                        "vmla.f32   q10, q5, d5[0]      \n"
                        "vmla.f32   q11, q5, d5[1]      \n"

                        // q6 = { q4[1], q4[2], q4[3], q6[0] }: the even columns shifted by one
                        "vext.f32   q6, q4, q6, #1      \n" // q6=02

                        "vld1.f32   {d0-d3}, [%11 :128]! \n"

                        "vmla.f32   q12, q5, d6[0]      \n"
                        "vmla.f32   q13, q5, d6[1]      \n"
                        "vmla.f32   q14, q5, d7[0]      \n"
                        "vmla.f32   q15, q5, d7[1]      \n"

                        /// row r1, same even/odd/shifted scheme
                        "pld        [%9, #256]          \n"
                        "vld2.f32   {d8-d11}, [%9]!     \n" // q4=10 q5=11

                        "vmla.f32   q8, q6, d0[0]       \n"
                        "vmla.f32   q9, q6, d0[1]       \n"
                        "vmla.f32   q10, q6, d1[0]      \n"
                        "vmla.f32   q11, q6, d1[1]      \n"

                        "vld1.f32   {d4-d7}, [%11 :128]! \n"

                        "vmla.f32   q12, q6, d2[0]      \n"
                        "vmla.f32   q13, q6, d2[1]      \n"
                        "vmla.f32   q14, q6, d3[0]      \n"
                        "vmla.f32   q15, q6, d3[1]      \n"

                        "vmla.f32   q8, q4, d4[0]       \n"
                        "vmla.f32   q9, q4, d4[1]       \n"
                        "vmla.f32   q10, q4, d5[0]      \n"
                        "vmla.f32   q11, q4, d5[1]      \n"

                        "vld1.f32   {d0-d3}, [%11 :128]! \n"

                        "vmla.f32   q12, q4, d6[0]      \n"
                        "vmla.f32   q13, q4, d6[1]      \n"
                        "vmla.f32   q14, q4, d7[0]      \n"
                        "vmla.f32   q15, q4, d7[1]      \n"

                        "pld        [%9, #128]          \n"
                        "vld2.f32   {d12-d13}, [%9]     \n" // q6

                        "vmla.f32   q8, q5, d0[0]       \n"
                        "vmla.f32   q9, q5, d0[1]       \n"
                        "vmla.f32   q10, q5, d1[0]      \n"
                        "vmla.f32   q11, q5, d1[1]      \n"

                        "vld1.f32   {d4-d7}, [%11 :128]! \n"

                        "vext.f32   q6, q4, q6, #1      \n" // q6=12

                        "vmla.f32   q12, q5, d2[0]      \n"
                        "vmla.f32   q13, q5, d2[1]      \n"
                        "vmla.f32   q14, q5, d3[0]      \n"
                        "vmla.f32   q15, q5, d3[1]      \n"

                        /// row r2, same even/odd/shifted scheme
                        "pld        [%10, #256]         \n"
                        "vld2.f32   {d8-d11}, [%10]!    \n" // q4=20 q5=21

                        "vmla.f32   q8, q6, d4[0]       \n"
                        "vmla.f32   q9, q6, d4[1]       \n"
                        "vmla.f32   q10, q6, d5[0]      \n"
                        "vmla.f32   q11, q6, d5[1]      \n"

                        "vld1.f32   {d0-d3}, [%11 :128]! \n"

                        "vmla.f32   q12, q6, d6[0]      \n"
                        "vmla.f32   q13, q6, d6[1]      \n"
                        "vmla.f32   q14, q6, d7[0]      \n"
                        "vmla.f32   q15, q6, d7[1]      \n"

                        "vmla.f32   q8, q4, d0[0]       \n"
                        "vmla.f32   q9, q4, d0[1]       \n"
                        "vmla.f32   q10, q4, d1[0]      \n"
                        "vmla.f32   q11, q4, d1[1]      \n"

                        "vld1.f32   {d4-d7}, [%11 :128]! \n"

                        "vmla.f32   q12, q4, d2[0]      \n"
                        "vmla.f32   q13, q4, d2[1]      \n"
                        "vmla.f32   q14, q4, d3[0]      \n"
                        "vmla.f32   q15, q4, d3[1]      \n"

                        "pld        [%10, #128]         \n"
                        "vld2.f32   {d12-d13}, [%10]    \n" // q6

                        "vmla.f32   q8, q5, d4[0]       \n"
                        "vmla.f32   q9, q5, d4[1]       \n"
                        "vmla.f32   q10, q5, d5[0]      \n"
                        "vmla.f32   q11, q5, d5[1]      \n"

                        "vext.f32   q6, q4, q6, #1      \n" // q6=22

                        // ninth and last weight block of this input channel
                        "vld1.f32   {d0-d3}, [%11 :128]! \n"

                        "vmla.f32   q12, q5, d6[0]      \n"
                        "vmla.f32   q13, q5, d6[1]      \n"
                        "vmla.f32   q14, q5, d7[0]      \n"
                        "vmla.f32   q15, q5, d7[1]      \n"

                        "vmla.f32   q8, q6, d0[0]       \n"
                        "vmla.f32   q9, q6, d0[1]       \n"
                        "vmla.f32   q10, q6, d1[0]      \n"
                        "vmla.f32   q11, q6, d1[1]      \n"

                        "vmla.f32   q12, q6, d2[0]      \n"
                        "vmla.f32   q13, q6, d2[1]      \n"

                        // write the finished sums back; stores are interleaved with the last vmla ops
                        "vst1.f32   {d16-d17}, [%0]!    \n"
                        "vst1.f32   {d18-d19}, [%1]!    \n"

                        "vmla.f32   q14, q6, d3[0]      \n"
                        "vmla.f32   q15, q6, d3[1]      \n"

                        "vst1.f32   {d20-d21}, [%2]!    \n"
                        "vst1.f32   {d22-d23}, [%3]!    \n"

                        // rewind ktmp: 9 taps * 8 channels * 4 bytes = 288 bytes were consumed
                        "sub        %11, %11, #288      \n"

                        "vst1.f32   {d24-d25}, [%4]!    \n"
                        "vst1.f32   {d26-d27}, [%5]!    \n"
                        "vst1.f32   {d28-d29}, [%6]!    \n"
                        "vst1.f32   {d30-d31}, [%7]!    \n"
                        // every operand is an output tied to an identically numbered input below,
                        // i.e. each one is read and then updated by the asm
                        : "=r"(outptr0), // %0
                        "=r"(outptr1), // %1
                        "=r"(outptr2), // %2
                        "=r"(outptr3), // %3
                        "=r"(outptr4), // %4
                        "=r"(outptr5), // %5
                        "=r"(outptr6), // %6
                        "=r"(outptr7), // %7
                        "=r"(r0),      // %8
                        "=r"(r1),      // %9
                        "=r"(r2),      // %10
                        "=r"(ktmp)     // %11
                        : "0"(outptr0),
                        "1"(outptr1),
                        "2"(outptr2),
                        "3"(outptr3),
                        "4"(outptr4),
                        "5"(outptr5),
                        "6"(outptr6),
                        "7"(outptr7),
                        "8"(r0),
                        "9"(r1),
                        "10"(r2),
                        "11"(ktmp)
                        // q7 is listed although no instruction above names d14/d15
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
#if __aarch64__
                    // AArch64 tail path: one output column for the eight output channels.
                    //
                    // Operand roles (see the constraint list at the bottom):
                    //   %0..%7   outptr0..outptr7, one float read and written each, advanced 4 bytes
                    //   %8..%10  r0, r1, r2; read only here, the asm does not advance them
                    //   %11      ktmp, advanced while the 9 taps are consumed, then rewound
                    //
                    // Register use:
                    //   v8 / v9     current output values of channels 0..3 / 4..7, gathered lane by lane
                    //   v14 / v15   second accumulator pair, started with fmul; taps alternate
                    //               between the two pairs and they are added together at the end
                    //   v10..v13    weights, eight floats per tap (v10/v12: channels 0..3,
                    //               v11/v13: channels 4..7)
                    //   v0 / v1     input pixels of the current row, lanes 0..2 are used
                    // NOTE(review): each row load fetches four floats while only three are
                    // consumed - confirm the input rows tolerate the one-float over-read
                    asm volatile(
                        "ld1    {v10.4s, v11.4s}, [%11], #32    \n"

                        // row r0
                        "prfm   pldl1keep, [%8, #128]   \n"
                        "ld1    {v0.4s}, [%8]           \n"

                        "ld1    {v12.4s, v13.4s}, [%11], #32    \n"

                        // gather the current outputs of channels 0..3 into v8
                        "ld1    {v8.s}[0], [%0]         \n"
                        "ld1    {v8.s}[1], [%1]         \n"
                        "ld1    {v8.s}[2], [%2]         \n"
                        "ld1    {v8.s}[3], [%3]         \n"

                        "fmul   v14.4s, v10.4s, v0.s[0] \n"
                        "fmul   v15.4s, v11.4s, v0.s[0] \n"

                        // gather the current outputs of channels 4..7 into v9
                        "ld1    {v9.s}[0], [%4]         \n"
                        "ld1    {v9.s}[1], [%5]         \n"
                        "ld1    {v9.s}[2], [%6]         \n"
                        "ld1    {v9.s}[3], [%7]         \n"

                        "ld1    {v10.4s, v11.4s}, [%11], #32    \n"

                        "fmla   v8.4s, v12.4s, v0.s[1]  \n"
                        "fmla   v9.4s, v13.4s, v0.s[1]  \n"

                        "ld1    {v12.4s, v13.4s}, [%11], #32    \n"

                        "fmla   v14.4s, v10.4s, v0.s[2] \n"
                        "fmla   v15.4s, v11.4s, v0.s[2] \n"

                        // row r1
                        "prfm   pldl1keep, [%9, #128]   \n"
                        "ld1    {v1.4s}, [%9]           \n"

                        "ld1    {v10.4s, v11.4s}, [%11], #32    \n"

                        "fmla   v8.4s, v12.4s, v1.s[0]  \n"
                        "fmla   v9.4s, v13.4s, v1.s[0]  \n"

                        "ld1    {v12.4s, v13.4s}, [%11], #32    \n"

                        "fmla   v14.4s, v10.4s, v1.s[1] \n"
                        "fmla   v15.4s, v11.4s, v1.s[1] \n"

                        "ld1    {v10.4s, v11.4s}, [%11], #32    \n"

                        "fmla   v8.4s, v12.4s, v1.s[2]  \n"
                        "fmla   v9.4s, v13.4s, v1.s[2]  \n"

                        // row r2 (v0 is reused)
                        "prfm   pldl1keep, [%10, #128]  \n"
                        "ld1    {v0.4s}, [%10]          \n"

                        "ld1    {v12.4s, v13.4s}, [%11], #32    \n"

                        "fmla   v14.4s, v10.4s, v0.s[0] \n"
                        "fmla   v15.4s, v11.4s, v0.s[0] \n"

                        "ld1    {v10.4s, v11.4s}, [%11], #32    \n"

                        "fmla   v8.4s, v12.4s, v0.s[1]  \n"
                        "fmla   v9.4s, v13.4s, v0.s[1]  \n"

                        "fmla   v14.4s, v10.4s, v0.s[2] \n"
                        "fmla   v15.4s, v11.4s, v0.s[2] \n"

                        // merge the two accumulator pairs
                        "fadd   v8.4s, v8.4s, v14.4s    \n"
                        "fadd   v9.4s, v9.4s, v15.4s    \n"

                        // rewind ktmp: 9 taps * 8 channels * 4 bytes = 288 bytes were consumed
                        "sub    %11, %11, #288          \n"

                        // scatter one float back to each output channel and step every pointer
                        "st1    {v8.s}[0], [%0], #4     \n"
                        "st1    {v8.s}[1], [%1], #4     \n"
                        "st1    {v8.s}[2], [%2], #4     \n"
                        "st1    {v8.s}[3], [%3], #4     \n"

                        "st1    {v9.s}[0], [%4], #4     \n"
                        "st1    {v9.s}[1], [%5], #4     \n"
                        "st1    {v9.s}[2], [%6], #4     \n"
                        "st1    {v9.s}[3], [%7], #4     \n"

                        // every operand is an output tied to an identically numbered input below
                        : "=r"(outptr0), // %0
                        "=r"(outptr1), // %1
                        "=r"(outptr2), // %2
                        "=r"(outptr3), // %3
                        "=r"(outptr4), // %4
                        "=r"(outptr5), // %5
                        "=r"(outptr6), // %6
                        "=r"(outptr7), // %7
                        "=r"(r0),      // %8
                        "=r"(r1),      // %9
                        "=r"(r2),      // %10
                        "=r"(ktmp)     // %11
                        : "0"(outptr0),
                        "1"(outptr1),
                        "2"(outptr2),
                        "3"(outptr3),
                        "4"(outptr4),
                        "5"(outptr5),
                        "6"(outptr6),
                        "7"(outptr7),
                        "8"(r0),
                        "9"(r1),
                        "10"(r2),
                        "11"(ktmp)
                        : "memory", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
#else  // __aarch64__
                    // 32-bit ARM NEON tail path: one output column for the eight output channels.
                    //
                    // Operand roles (see the constraint list at the bottom):
                    //   %0..%7   outptr0..outptr7, one float read and written each, post-incremented
                    //   %8..%10  r0, r1, r2; read only here, the asm does not advance them
                    //   %11      ktmp, advanced while the 9 taps are consumed, then rewound
                    //
                    // Register use:
                    //   q8 / q9     current output values of channels 0..3 / 4..7, gathered lane by lane
                    //   q14 / q15   second accumulator pair, started with vmul; taps alternate
                    //               between the two pairs and they are added together at the end
                    //   q10..q13    weights, eight floats per tap (q10/q12: channels 0..3,
                    //               q11/q13: channels 4..7)
                    //   q0 / q1     input pixels of the current row, the first three lanes are used
                    // NOTE(review): each row load fetches four floats while only three are
                    // consumed - confirm the input rows tolerate the one-float over-read
                    asm volatile(
                        // ":128" is an alignment qualifier on the kernel pointer
                        "vld1.f32   {d20-d23}, [%11 :128]! \n"

                        // row r0
                        "pld        [%8, #128]      \n"
                        "vld1.f32   {d0-d1}, [%8]   \n"

                        "vld1.f32   {d24-d27}, [%11 :128]! \n"

                        // gather the current outputs of channels 0..3 into q8
                        "vld1.f32   {d16[0]}, [%0]  \n"
                        "vld1.f32   {d16[1]}, [%1]  \n"
                        "vld1.f32   {d17[0]}, [%2]  \n"
                        "vld1.f32   {d17[1]}, [%3]  \n"

                        "vmul.f32   q14, q10, d0[0] \n"
                        "vmul.f32   q15, q11, d0[0] \n"

                        // gather the current outputs of channels 4..7 into q9
                        "vld1.f32   {d18[0]}, [%4]  \n"
                        "vld1.f32   {d18[1]}, [%5]  \n"
                        "vld1.f32   {d19[0]}, [%6]  \n"
                        "vld1.f32   {d19[1]}, [%7]  \n"

                        "vld1.f32   {d20-d23}, [%11 :128]! \n"

                        "vmla.f32   q8, q12, d0[1]  \n"
                        "vmla.f32   q9, q13, d0[1]  \n"

                        "vld1.f32   {d24-d27}, [%11 :128]! \n"

                        "vmla.f32   q14, q10, d1[0] \n"
                        "vmla.f32   q15, q11, d1[0] \n"

                        // row r1
                        "pld        [%9, #128]      \n"
                        "vld1.f32   {d2-d3}, [%9]   \n"

                        "vld1.f32   {d20-d23}, [%11 :128]! \n"

                        "vmla.f32   q8, q12, d2[0]  \n"
                        "vmla.f32   q9, q13, d2[0]  \n"

                        "vld1.f32   {d24-d27}, [%11 :128]! \n"

                        "vmla.f32   q14, q10, d2[1] \n"
                        "vmla.f32   q15, q11, d2[1] \n"

                        "vld1.f32   {d20-d23}, [%11 :128]! \n"

                        "vmla.f32   q8, q12, d3[0]  \n"
                        "vmla.f32   q9, q13, d3[0]  \n"

                        // row r2 (q0 is reused)
                        "pld        [%10, #128]     \n"
                        "vld1.f32   {d0-d1}, [%10]  \n"

                        "vld1.f32   {d24-d27}, [%11 :128]! \n"

                        "vmla.f32   q14, q10, d0[0] \n"
                        "vmla.f32   q15, q11, d0[0] \n"

                        "vld1.f32   {d20-d23}, [%11 :128]! \n"

                        "vmla.f32   q8, q12, d0[1]  \n"
                        "vmla.f32   q9, q13, d0[1]  \n"

                        "vmla.f32   q14, q10, d1[0] \n"
                        "vmla.f32   q15, q11, d1[0] \n"

                        // merge the two accumulator pairs
                        "vadd.f32   q8, q8, q14     \n"
                        "vadd.f32   q9, q9, q15     \n"

                        // rewind ktmp: 9 taps * 8 channels * 4 bytes = 288 bytes were consumed
                        "sub        %11, %11, #288  \n"

                        // scatter one float back to each output channel and step every pointer
                        "vst1.f32   {d16[0]}, [%0]! \n"
                        "vst1.f32   {d16[1]}, [%1]! \n"
                        "vst1.f32   {d17[0]}, [%2]! \n"
                        "vst1.f32   {d17[1]}, [%3]! \n"

                        "vst1.f32   {d18[0]}, [%4]! \n"
                        "vst1.f32   {d18[1]}, [%5]! \n"
                        "vst1.f32   {d19[0]}, [%6]! \n"
                        "vst1.f32   {d19[1]}, [%7]! \n"

                        // every operand is an output tied to an identically numbered input below
                        : "=r"(outptr0), // %0
                        "=r"(outptr1), // %1
                        "=r"(outptr2), // %2
                        "=r"(outptr3), // %3
                        "=r"(outptr4), // %4
                        "=r"(outptr5), // %5
                        "=r"(outptr6), // %6
                        "=r"(outptr7), // %7
                        "=r"(r0),      // %8
                        "=r"(r1),      // %9
                        "=r"(r2),      // %10
                        "=r"(ktmp)     // %11
                        : "0"(outptr0),
                        "1"(outptr1),
                        "2"(outptr2),
                        "3"(outptr3),
                        "4"(outptr4),
                        "5"(outptr5),
                        "6"(outptr6),
                        "7"(outptr7),
                        "8"(r0),
                        "9"(r1),
                        "10"(r2),
                        "11"(ktmp)
                        : "memory", "q0", "q1", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // __ARM_NEON
                    // Portable fallback: one output column for the eight output channels.
                    // ktmp is read as nine consecutive groups of eight floats; group t holds
                    // the weight of tap t for output channels 0..7, taps ordered row by row
                    // (r0[0..2], r1[0..2], r2[0..2]).
                    float sum0 = 0.f;
                    float sum1 = 0.f;
                    float sum2 = 0.f;
                    float sum3 = 0.f;
                    float sum4 = 0.f;
                    float sum5 = 0.f;
                    float sum6 = 0.f;
                    float sum7 = 0.f;

                    // row r0, column 0
                    sum0 += r0[0] * ktmp[0];
                    sum1 += r0[0] * ktmp[1];
                    sum2 += r0[0] * ktmp[2];
                    sum3 += r0[0] * ktmp[3];
                    sum4 += r0[0] * ktmp[4];
                    sum5 += r0[0] * ktmp[5];
                    sum6 += r0[0] * ktmp[6];
                    sum7 += r0[0] * ktmp[7];
                    ktmp += 8;

                    // row r0, column 1
                    sum0 += r0[1] * ktmp[0];
                    sum1 += r0[1] * ktmp[1];
                    sum2 += r0[1] * ktmp[2];
                    sum3 += r0[1] * ktmp[3];
                    sum4 += r0[1] * ktmp[4];
                    sum5 += r0[1] * ktmp[5];
                    sum6 += r0[1] * ktmp[6];
                    sum7 += r0[1] * ktmp[7];
                    ktmp += 8;

                    // row r0, column 2
                    sum0 += r0[2] * ktmp[0];
                    sum1 += r0[2] * ktmp[1];
                    sum2 += r0[2] * ktmp[2];
                    sum3 += r0[2] * ktmp[3];
                    sum4 += r0[2] * ktmp[4];
                    sum5 += r0[2] * ktmp[5];
                    sum6 += r0[2] * ktmp[6];
                    sum7 += r0[2] * ktmp[7];
                    ktmp += 8;

                    // row r1, column 0
                    sum0 += r1[0] * ktmp[0];
                    sum1 += r1[0] * ktmp[1];
                    sum2 += r1[0] * ktmp[2];
                    sum3 += r1[0] * ktmp[3];
                    sum4 += r1[0] * ktmp[4];
                    sum5 += r1[0] * ktmp[5];
                    sum6 += r1[0] * ktmp[6];
                    sum7 += r1[0] * ktmp[7];
                    ktmp += 8;

                    // row r1, column 1
                    sum0 += r1[1] * ktmp[0];
                    sum1 += r1[1] * ktmp[1];
                    sum2 += r1[1] * ktmp[2];
                    sum3 += r1[1] * ktmp[3];
                    sum4 += r1[1] * ktmp[4];
                    sum5 += r1[1] * ktmp[5];
                    sum6 += r1[1] * ktmp[6];
                    sum7 += r1[1] * ktmp[7];
                    ktmp += 8;

                    // row r1, column 2
                    sum0 += r1[2] * ktmp[0];
                    sum1 += r1[2] * ktmp[1];
                    sum2 += r1[2] * ktmp[2];
                    sum3 += r1[2] * ktmp[3];
                    sum4 += r1[2] * ktmp[4];
                    sum5 += r1[2] * ktmp[5];
                    sum6 += r1[2] * ktmp[6];
                    sum7 += r1[2] * ktmp[7];
                    ktmp += 8;

                    // row r2, column 0
                    sum0 += r2[0] * ktmp[0];
                    sum1 += r2[0] * ktmp[1];
                    sum2 += r2[0] * ktmp[2];
                    sum3 += r2[0] * ktmp[3];
                    sum4 += r2[0] * ktmp[4];
                    sum5 += r2[0] * ktmp[5];
                    sum6 += r2[0] * ktmp[6];
                    sum7 += r2[0] * ktmp[7];
                    ktmp += 8;

                    // row r2, column 1
                    sum0 += r2[1] * ktmp[0];
                    sum1 += r2[1] * ktmp[1];
                    sum2 += r2[1] * ktmp[2];
                    sum3 += r2[1] * ktmp[3];
                    sum4 += r2[1] * ktmp[4];
                    sum5 += r2[1] * ktmp[5];
                    sum6 += r2[1] * ktmp[6];
                    sum7 += r2[1] * ktmp[7];
                    ktmp += 8;

                    // row r2, column 2
                    sum0 += r2[2] * ktmp[0];
                    sum1 += r2[2] * ktmp[1];
                    sum2 += r2[2] * ktmp[2];
                    sum3 += r2[2] * ktmp[3];
                    sum4 += r2[2] * ktmp[4];
                    sum5 += r2[2] * ktmp[5];
                    sum6 += r2[2] * ktmp[6];
                    sum7 += r2[2] * ktmp[7];
                    ktmp += 8;

                    // add this input channel's contribution to what is already stored
                    // in the output
                    *outptr0 += sum0;
                    *outptr1 += sum1;
                    *outptr2 += sum2;
                    *outptr3 += sum3;
                    *outptr4 += sum4;
                    *outptr5 += sum5;
                    *outptr6 += sum6;
                    *outptr7 += sum7;

                    // rewind to the first tap so the next column reuses the same weights
                    ktmp -= 8 * 9;

                    // step to the next output column in every channel
                    outptr0++;
                    outptr1++;
                    outptr2++;
                    outptr3++;
                    outptr4++;
                    outptr5++;
                    outptr6++;
                    outptr7++;
#endif // __ARM_NEON
                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            ktmp += 8 * 9;
        }
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        out.fill(bias0);

        const float* ktmp = _kernel.channel(p / 8 + p % 8);

        for (int q = 0; q < inch; q++)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);

            const float* r0 = img0;
            const float* r1 = img0 + w;
            const float* r2 = img0 + w * 2;

            const float* k0 = ktmp;
            const float* k1 = ktmp + 3;
            const float* k2 = ktmp + 6;

#if __ARM_NEON
            float32x4_t _k0123 = vld1q_f32(k0);
            float32x4_t _k3456 = vld1q_f32(k1);
            float32x4_t _k6789 = vld1q_f32(k2);
#endif // __ARM_NEON

            int i = 0;

            for (; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw & 3;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    asm volatile(
                        "prfm       pldl1keep, [%2, #256]          \n"
                        "ld2        {v2.4s, v3.4s}, [%2], #32      \n"
                        "0:                                        \n"

                        "prfm       pldl1keep, [%1, #128]          \n"
                        "ld1        {v0.4s}, [%1]                  \n"

                        "fmla       v0.4s,  v2.4s, %10.s[0]        \n"
                        "fmul       v10.4s, v3.4s, %10.s[1]        \n"

                        "prfm       pldl1keep, [%2, #256]          \n"
                        "ld2        {v8.4s, v9.4s}, [%2]           \n"
                        "ext        v1.16b, v2.16b, v8.16b, #4     \n"

                        "fmul       v11.4s, v1.4s, %10.s[2]        \n"

                        "prfm       pldl1keep, [%3, #256]          \n"
                        "ld2        {v2.4s, v3.4s}, [%3], #32      \n"

                        "fmla       v0.4s,  v2.4s, %11.s[0]        \n"
                        "fmla       v10.4s, v3.4s, %11.s[1]        \n"

                        "prfm       pldl1keep, [%3, #256]          \n"
                        "ld2        {v8.4s, v9.4s}, [%3]           \n"
                        "ext        v1.16b, v2.16b, v8.16b, #4     \n"

                        "fmla       v11.4s, v1.4s, %11.s[2]        \n"

                        "prfm       pldl1keep, [%4, #256]          \n"
                        "ld2        {v2.4s, v3.4s}, [%4], #32      \n"

                        "fmla       v0.4s,  v2.4s, %12.s[0]        \n"
                        "fmla       v10.4s, v3.4s, %12.s[1]        \n"

                        "prfm       pldl1keep, [%4, #256]          \n"
                        "ld2        {v8.4s, v9.4s}, [%4]           \n"
                        "ext        v1.16b, v2.16b, v8.16b, #4     \n"

                        "fmla       v11.4s, v1.4s, %12.s[2]        \n"

                        "prfm       pldl1keep, [%2, #256]          \n"
                        "ld2        {v2.4s, v3.4s}, [%2], #32      \n"

                        "fadd       v0.4s, v0.4s, v10.4s           \n"
                        "fadd       v0.4s, v0.4s, v11.4s           \n"

                        "subs       %w0, %w0, #1                   \n"
                        "st1        {v0.4s}, [%1], #16             \n"
                        "bne        0b                             \n"
                        "sub        %2, %2, #32                    \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2)      // %4
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k0123), // %10
                        "w"(_k3456), // %11
                        "w"(_k6789)  // %12
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                if (nn > 0)
                {
                    asm volatile(
                        "pld        [%2, #256]          \n"
                        "vld2.f32   {d4-d7}, [%2]!      \n"

                        "0:                             \n"
                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d0-d1}, [%1]       \n"

                        "vmla.f32   q0, q2, %e10[0]     \n"
                        "vmul.f32   q10, q3, %e10[1]    \n"

                        "pld        [%2, #128]          \n"
                        "vld2.f32   {d16-d17}, [%2]     \n"
                        "vext.32    q1, q2, q8, #1      \n"

                        "vmul.f32   q11, q1, %f10[0]    \n"

                        "pld        [%3, #256]          \n"
                        "vld2.f32   {d4-d7}, [%3]!      \n"

                        "vmla.f32   q0, q2, %e11[0]     \n"
                        "vmla.f32   q10, q3, %e11[1]    \n"

                        "pld        [%3, #128]          \n"
                        "vld2.f32   {d16-d17}, [%3]     \n"
                        "vext.32    q1, q2, q8, #1      \n"

                        "vmla.f32   q11, q1, %f11[0]    \n"

                        "pld        [%4, #256]          \n"
                        "vld2.f32   {d4-d7}, [%4]!      \n"

                        "vmla.f32   q0, q2, %e12[0]     \n"
                        "vmla.f32   q10, q3, %e12[1]    \n"

                        "pld        [%4, #128]          \n"
                        "vld2.f32   {d16-d17}, [%4]     \n"
                        "vext.32    q1, q2, q8, #1      \n"

                        "vmla.f32   q11, q1, %f12[0]    \n"

                        "pld        [%2, #256]          \n"
                        "vld2.f32   {d4-d7}, [%2]!      \n"

                        "vadd.f32   q0, q0, q10         \n"
                        "vadd.f32   q0, q0, q11         \n"

                        "subs       %0, #1              \n"
                        "vst1.f32   {d0-d1}, [%1]!      \n"
                        "bne        0b                  \n"
                        "sub        %2, #32             \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2)      // %4
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k0123), // %10
                        "w"(_k3456), // %11
                        "w"(_k6789)  // %12
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r20 = vld1q_f32(r2);

                    float32x4_t _sum = vmulq_f32(_r00, _k0123);
                    _sum = vmlaq_f32(_sum, _r10, _k3456);
                    _sum = vmlaq_f32(_sum, _r20, _k6789);

                    _sum = vsetq_lane_f32(*outptr, _sum, 3);

#if __aarch64__
                    *outptr = vaddvq_f32(_sum);
#else
                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                    _ss = vpadd_f32(_ss, _ss);

                    *outptr = vget_lane_f32(_ss, 0);
#endif // __aarch64__
#else
                    float sum = 0;

                    sum += r0[0] * ktmp[0];
                    sum += r0[1] * ktmp[1];
                    sum += r0[2] * ktmp[2];
                    sum += r1[0] * ktmp[3];
                    sum += r1[1] * ktmp[4];
                    sum += r1[2] * ktmp[5];
                    sum += r2[0] * ktmp[6];
                    sum += r2[1] * ktmp[7];
                    sum += r2[2] * ktmp[8];

                    *outptr += sum;
#endif // __ARM_NEON

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    outptr++;
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            ktmp += 9;
        }
    }
}


================================================
FILE: src/layer/arm/convolution_3x3_int8.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Re-lays int8 3x3 convolution weights into the order consumed by the
// packed stride-2 kernel.
//
// _kernel   : source weights, read as a flat signed char array indexed as
//             [outch][inch][9].
// kernel_tm : destination, (re)created here with arguments
//             (8 * 9, inch, outch / 8 + outch % 8, 1)
//             -- presumably (w, h, c, elemsize); confirm against Mat::create.
// inch      : number of input channels.
// outch     : number of output channels.
//
// Destination layout:
//   * Each full group of 8 output channels goes to channel (p / 8). For every
//     input channel and every one of the 9 taps, the 8 output channels'
//     weights for that tap are stored next to each other (8 bytes per tap,
//     72 bytes per input channel).
//   * Each leftover output channel p goes to its own channel
//     (p / 8 + p % 8), holding its inch * 9 weights back-to-back in the
//     original order.
static void conv3x3s2_transform_kernel_int8_neon(const Mat& _kernel, Mat& kernel_tm, int inch, int outch)
{
    kernel_tm.create(8 * 9, inch, outch / 8 + outch % 8, (size_t)1u);

    const signed char* weights = _kernel;

    int oc = 0;

    // Full groups of 8 output channels: interleave tap-by-tap.
    while (oc + 7 < outch)
    {
        // Start of the weights for each of the 8 output channels in the group.
        const signed char* src[8];
        for (int j = 0; j < 8; j++)
        {
            src[j] = weights + (oc + j) * inch * 9;
        }

        signed char* dst = kernel_tm.channel(oc / 8);

        for (int q = 0; q < inch; q++)
        {
            const int base = q * 9;

            for (int k = 0; k < 9; k++)
            {
                // One tap from each of the 8 output channels, side by side.
                for (int j = 0; j < 8; j++)
                {
                    dst[j] = src[j][base + k];
                }
                dst += 8;
            }
        }

        oc += 8;
    }

    // Leftover output channels: straight copy, one destination channel each.
    while (oc < outch)
    {
        const signed char* src = weights + (oc + 0) * inch * 9;

        signed char* dst = kernel_tm.channel(oc / 8 + oc % 8);

        // The original advanced both pointers by 9 per input channel, which
        // amounts to one contiguous run of inch * 9 bytes.
        for (int q = 0; q < inch; q++)
        {
            for (int k = 0; k < 9; k++)
            {
                dst[q * 9 + k] = src[q * 9 + k];
            }
        }

        oc++;
    }
}

static void conv3x3s2_packed_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int tailstep = w - 2 * outw + w;

    int nn_outch = outch >> 3;
    int remain_outch_start = nn_outch << 3;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 8;

        Mat out0 = top_blob.channel(p + 0);
        Mat out1 = top_blob.channel(p + 1);
        Mat out2 = top_blob.channel(p + 2);
        Mat out3 = top_blob.channel(p + 3);
        Mat out4 = top_blob.channel(p + 4);
        Mat out5 = top_blob.channel(p + 5);
        Mat out6 = top_blob.channel(p + 6);
        Mat out7 = top_blob.channel(p + 7);

        out0.fill(0);
        out1.fill(0);
        out2.fill(0);
        out3.fill(0);
        out4.fill(0);
        out5.fill(0);
        out6.fill(0);
        out7.fill(0);

        const signed char* ktmp = _kernel.channel(p / 8);

        for (int q = 0; q < inch; q++)
        {
            int* outptr0 = out0;
            int* outptr1 = out1;
            int* outptr2 = out2;
            int* outptr3 = out3;
            int* outptr4 = out4;
            int* outptr5 = out5;
            int* outptr6 = out6;
            int* outptr7 = out7;

            const signed char* img0 = bottom_blob.channel(q);

            const signed char* r0 = img0;
            const signed char* r1 = img0 + w;
            const signed char* r2 = img0 + w * 2;

            int i = 0;

            for (; i < outh; i++)
            {
#if __ARM_NEON
#if __aarch64__
                int nn = outw >> 3;
                int remain = outw & 7;
#else
                int nn = outw >> 2;
                int remain = outw & 3;
#endif // __aarch64__
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    asm volatile(
                        "0:                                   \n"

                        "ld1    {v0.8b, v1.8b, v2.8b}, [%12], #24  \n" //ktmp
                        "ld2    {v3.8b, v4.8b}, [%9], #16     \n"      //r0-r2
                        "ld2    {v5.8b, v6.8b}, [%9]          \n"

                        "ld1    {v8.4s, v9.4s}, [%1]          \n" //out0
                        "ld1    {v10.4s, v11.4s}, [%2]        \n" //out1
                        "ld1    {v12.4s, v13.4s}, [%3]        \n" //out2
                        "ld1    {v14.4s, v15.4s}, [%4]        \n" //out3
                        "ld1    {v16.4s, v17.4s}, [%5]        \n" //out4
                        "ld1    {v18.4s, v19.4s}, [%6]        \n" //out5
                        "ld1    {v20.4s, v21.4s}, [%7]        \n" //out6
                        "ld1    {v22.4s, v23.4s}, [%8]        \n" //out7

                        "ext    v7.8b, v3.8b, v5.8b, #1       \n"

                        "sshll  v0.8h, v0.8b, #0              \n" //(k00-k70)
                        "sshll  v1.8h, v1.8b, #0              \n" //(k01-k71)
                        "sshll  v2.8h, v2.8b, #0              \n" //(k02-k72)
                        "sshll  v3.8h, v3.8b, #0              \n" // r0
                        "sshll  v4.8h, v4.8b, #0              \n" // r1
                        "sshll  v7.8h, v7.8b, #0              \n" // r2

                        // r0
                        "smlal  v8.4s, v3.4h, v0.h[0]         \n" // out0 += (r00-r07)*k00
                        "smlal2  v9.4s, v3.8h, v0.h[0]        \n"
                        "smlal  v10.4s, v3.4h, v0.h[1]        \n" // out1 += (r00-r07)*k10
                        "smlal2  v11.4s, v3.8h, v0.h[1]       \n"
                        "smlal  v12.4s, v3.4h, v0.h[2]        \n" // out2 += (r00-r07)*k20
                        "smlal2  v13.4s, v3.8h, v0.h[2]       \n"
                        "smlal  v14.4s, v3.4h, v0.h[3]        \n" // out3 += (r00-r07)*k30
                        "smlal2  v15.4s, v3.8h, v0.h[3]       \n"
                        "smlal  v16.4s, v3.4h, v0.h[4]        \n" // out4 += (r00-r07)*k40
                        "smlal2  v17.4s, v3.8h, v0.h[4]       \n"
                        "smlal  v18.4s, v3.4h, v0.h[5]        \n" // out5 += (r00-r07)*k50
                        "smlal2  v19.4s, v3.8h, v0.h[5]       \n"
                        "smlal  v20.4s, v3.4h, v0.h[6]        \n" // out6 += (r00-r07)*k60
                        "smlal2  v21.4s, v3.8h, v0.h[6]       \n"
                        "smlal  v22.4s, v3.4h, v0.h[7]        \n" // out7 += (r00-r07)*k70
                        "smlal2  v23.4s, v3.8h, v0.h[7]       \n"
                        // r1
                        "smlal  v8.4s, v4.4h, v1.h[0]         \n" // out0 += (r10-r17)*k01
                        "smlal2  v9.4s, v4.8h, v1.h[0]        \n"
                        "smlal  v10.4s, v4.4h, v1.h[1]        \n" // out1 += (r10-r17)*k11
                        "smlal2  v11.4s, v4.8h, v1.h[1]       \n"
                        "smlal  v12.4s, v4.4h, v1.h[2]        \n" // out2 += (r10-r17)*k21
                        "smlal2  v13.4s, v4.8h, v1.h[2]       \n"
                        "smlal  v14.4s, v4.4h, v1.h[3]        \n" // out3 += (r10-r17)*k31
                        "smlal2  v15.4s, v4.8h, v1.h[3]       \n"
                        "smlal  v16.4s, v4.4h, v1.h[4]        \n" // out4 += (r10-r17)*k41
                        "smlal2  v17.4s, v4.8h, v1.h[4]       \n"
                        "smlal  v18.4s, v4.4h, v1.h[5]        \n" // out5 += (r10-r17)*k51
                        "smlal2  v19.4s, v4.8h, v1.h[5]       \n"
                        "smlal  v20.4s, v4.4h, v1.h[6]        \n" // out6 += (r10-r17)*k61
                        "smlal2  v21.4s, v4.8h, v1.h[6]       \n"
                        "smlal  v22.4s, v4.4h, v1.h[7]        \n" // out7 += (r10-r17)*k71
                        "smlal2  v23.4s, v4.8h, v1.h[7]       \n"
                        // r2
                        "smlal  v8.4s, v7.4h, v2.h[0]         \n" // out0 += (r20-r27)*k02
                        "smlal2  v9.4s, v7.8h, v2.h[0]        \n"
                        "smlal  v10.4s, v7.4h, v2.h[1]        \n" // out1 += (r20-r27)*k12
                        "smlal2  v11.4s, v7.8h, v2.h[1]       \n"
                        "smlal  v12.4s, v7.4h, v2.h[2]        \n" // out2 += (r20-r27)*k22
                        "smlal2  v13.4s, v7.8h, v2.h[2]       \n"
                        "smlal  v14.4s, v7.4h, v2.h[3]        \n" // out3 += (r20-r27)*k32
                        "smlal2  v15.4s, v7.8h, v2.h[3]       \n"
                        "smlal  v16.4s, v7.4h, v2.h[4]        \n" // out4 += (r20-r27)*k42
                        "smlal2  v17.4s, v7.8h, v2.h[4]       \n"
                        "smlal  v18.4s, v7.4h, v2.h[5]        \n" // out5 += (r20-r27)*k52
                        "smlal2  v19.4s, v7.8h, v2.h[5]       \n"
                        "smlal  v20.4s, v7.4h, v2.h[6]        \n" // out6 += (r20-r27)*k62
                        "smlal2  v21.4s, v7.8h, v2.h[6]       \n"
                        "smlal  v22.4s, v7.4h, v2.h[7]        \n" // out7 += (r20-r27)*k72
                        "smlal2  v23.4s, v7.8h, v2.h[7]       \n"

                        "ld1    {v0.8b, v1.8b, v2.8b}, [%12], #24  \n" //ktmp
                        "ld2    {v3.8b, v4.8b}, [%10], #16    \n"      //r3-r5
                        "ld2    {v5.8b, v6.8b}, [%10]         \n"

                        "ext    v7.8b, v3.8b, v5.8b, #1       \n"

                        "sshll  v0.8h, v0.8b, #0              \n" //(k03-k73)
                        "sshll  v1.8h, v1.8b, #0              \n" //(k04-k74)
                        "sshll  v2.8h, v2.8b, #0              \n" //(k05-k75)
                        "sshll  v3.8h, v3.8b, #0              \n" // r3
                        "sshll  v4.8h, v4.8b, #0              \n" // r4
                        "sshll  v7.8h, v7.8b, #0              \n" // r5

                        // r3
                        "smlal  v8.4s, v3.4h, v0.h[0]         \n" // out0 += (r30-r37)*k03
                        "smlal2  v9.4s, v3.8h, v0.h[0]        \n"
                        "smlal  v10.4s, v3.4h, v0.h[1]        \n" // out1 += (r30-r37)*k13
                        "smlal2  v11.4s, v3.8h, v0.h[1]       \n"
                        "smlal  v12.4s, v3.4h, v0.h[2]        \n" // out2 += (r30-r37)*k23
                        "smlal2  v13.4s, v3.8h, v0.h[2]       \n"
                        "smlal  v14.4s, v3.4h, v0.h[3]        \n" // out3 += (r30-r37)*k33
                        "smlal2  v15.4s, v3.8h, v0.h[3]       \n"
                        "smlal  v16.4s, v3.4h, v0.h[4]        \n" // out4 += (r30-r37)*k43
                        "smlal2  v17.4s, v3.8h, v0.h[4]       \n"
                        "smlal  v18.4s, v3.4h, v0.h[5]        \n" // out5 += (r30-r37)*k53
                        "smlal2  v19.4s, v3.8h, v0.h[5]       \n"
                        "smlal  v20.4s, v3.4h, v0.h[6]        \n" // out6 += (r30-r37)*k63
                        "smlal2  v21.4s, v3.8h, v0.h[6]       \n"
                        "smlal  v22.4s, v3.4h, v0.h[7]        \n" // out7 += (r30-r37)*k73
                        "smlal2  v23.4s, v3.8h, v0.h[7]       \n"
                        // r4
                        "smlal  v8.4s, v4.4h, v1.h[0]         \n" // out0 += (r40-r47)*k04
                        "smlal2  v9.4s, v4.8h, v1.h[0]        \n"
                        "smlal  v10.4s, v4.4h, v1.h[1]        \n" // out1 += (r40-r47)*k14
                        "smlal2  v11.4s, v4.8h, v1.h[1]       \n"
                        "smlal  v12.4s, v4.4h, v1.h[2]        \n" // out2 += (r40-r47)*k24
                        "smlal2  v13.4s, v4.8h, v1.h[2]       \n"
                        "smlal  v14.4s, v4.4h, v1.h[3]        \n" // out3 += (r40-r47)*k34
                        "smlal2  v15.4s, v4.8h, v1.h[3]       \n"
                        "smlal  v16.4s, v4.4h, v1.h[4]        \n" // out4 += (r40-r47)*k44
                        "smlal2  v17.4s, v4.8h, v1.h[4]       \n"
                        "smlal  v18.4s, v4.4h, v1.h[5]        \n" // out5 += (r40-r47)*k54
                        "smlal2  v19.4s, v4.8h, v1.h[5]       \n"
                        "smlal  v20.4s, v4.4h, v1.h[6]        \n" // out6 += (r40-r47)*k64
                        "smlal2  v21.4s, v4.8h, v1.h[6]       \n"
                        "smlal  v22.4s, v4.4h, v1.h[7]        \n" // out7 += (r40-r47)*k74
                        "smlal2  v23.4s, v4.8h, v1.h[7]       \n"
                        // r5
                        "smlal  v8.4s, v7.4h, v2.h[0]         \n" // out0 += (r50-r57)*k05
                        "smlal2  v9.4s, v7.8h, v2.h[0]        \n"
                        "smlal  v10.4s, v7.4h, v2.h[1]        \n" // out1 += (r50-r57)*k15
                        "smlal2  v11.4s, v7.8h, v2.h[1]       \n"
                        "smlal  v12.4s, v7.4h, v2.h[2]        \n" // out2 += (r50-r57)*k25
                        "smlal2  v13.4s, v7.8h, v2.h[2]       \n"
                        "smlal  v14.4s, v7.4h, v2.h[3]        \n" // out3 += (r50-r57)*k35
                        "smlal2  v15.4s, v7.8h, v2.h[3]       \n"
                        "smlal  v16.4s, v7.4h, v2.h[4]        \n" // out4 += (r50-r57)*k45
                        "smlal2  v17.4s, v7.8h, v2.h[4]       \n"
                        "smlal  v18.4s, v7.4h, v2.h[5]        \n" // out5 += (r50-r57)*k55
                        "smlal2  v19.4s, v7.8h, v2.h[5]       \n"
                        "smlal  v20.4s, v7.4h, v2.h[6]        \n" // out6 += (r50-r57)*k65
                        "smlal2  v21.4s, v7.8h, v2.h[6]       \n"
                        "smlal  v22.4s, v7.4h, v2.h[7]        \n" // out7 += (r50-r57)*k75
                        "smlal2  v23.4s, v7.8h, v2.h[7]       \n"

                        "ld1    {v0.8b, v1.8b, v2.8b}, [%12], #24  \n" //ktmp
                        "ld2    {v3.8b, v4.8b}, [%11], #16    \n"      //r6-r8
                        "ld2    {v5.8b, v6.8b}, [%11]         \n"

                        "ext    v7.8b, v3.8b, v5.8b, #1       \n"

                        "sshll  v0.8h, v0.8b, #0              \n" //(k06-k76)
                        "sshll  v1.8h, v1.8b, #0              \n" //(k07-k77)
                        "sshll  v2.8h, v2.8b, #0              \n" //(k08-k78)
                        "sshll  v3.8h, v3.8b, #0              \n" // r6
                        "sshll  v4.8h, v4.8b, #0              \n" // r7
                        "sshll  v7.8h, v7.8b, #0              \n" // r8

                        // r6
                        "smlal  v8.4s, v3.4h, v0.h[0]         \n" // out0 += (r60-r67)*k06
                        "smlal2  v9.4s, v3.8h, v0.h[0]        \n"
                        "smlal  v10.4s, v3.4h, v0.h[1]        \n" // out1 += (r60-r67)*k16
                        "smlal2  v11.4s, v3.8h, v0.h[1]       \n"
                        "smlal  v12.4s, v3.4h, v0.h[2]        \n" // out2 += (r60-r67)*k26
                        "smlal2  v13.4s, v3.8h, v0.h[2]       \n"
                        "smlal  v14.4s, v3.4h, v0.h[3]        \n" // out3 += (r60-r67)*k36
                        "smlal2  v15.4s, v3.8h, v0.h[3]       \n"
                        "smlal  v16.4s, v3.4h, v0.h[4]        \n" // out4 += (r60-r67)*k46
                        "smlal2  v17.4s, v3.8h, v0.h[4]       \n"
                        "smlal  v18.4s, v3.4h, v0.h[5]        \n" // out5 += (r60-r67)*k56
                        "smlal2  v19.4s, v3.8h, v0.h[5]       \n"
                        "smlal  v20.4s, v3.4h, v0.h[6]        \n" // out6 += (r60-r67)*k66
                        "smlal2  v21.4s, v3.8h, v0.h[6]       \n"
                        "smlal  v22.4s, v3.4h, v0.h[7]        \n" // out7 += (r60-r67)*k76
                        "smlal2  v23.4s, v3.8h, v0.h[7]       \n"
                        // r7
                        "smlal  v8.4s, v4.4h, v1.h[0]         \n" // out0 += (r70-r77)*k07
                        "smlal2  v9.4s, v4.8h, v1.h[0]        \n"
                        "smlal  v10.4s, v4.4h, v1.h[1]        \n" // out1 += (r70-r77)*k17
                        "smlal2  v11.4s, v4.8h, v1.h[1]       \n"
                        "smlal  v12.4s, v4.4h, v1.h[2]        \n" // out2 += (r70-r77)*k27
                        "smlal2  v13.4s, v4.8h, v1.h[2]       \n"
                        "smlal  v14.4s, v4.4h, v1.h[3]        \n" // out3 += (r70-r77)*k37
                        "smlal2  v15.4s, v4.8h, v1.h[3]       \n"
                        "smlal  v16.4s, v4.4h, v1.h[4]        \n" // out4 += (r70-r77)*k47
                        "smlal2  v17.4s, v4.8h, v1.h[4]       \n"
                        "smlal  v18.4s, v4.4h, v1.h[5]        \n" // out5 += (r70-r77)*k57
                        "smlal2  v19.4s, v4.8h, v1.h[5]       \n"
                        "smlal  v20.4s, v4.4h, v1.h[6]        \n" // out6 += (r70-r77)*k67
                        "smlal2  v21.4s, v4.8h, v1.h[6]       \n"
                        "smlal  v22.4s, v4.4h, v1.h[7]        \n" // out7 += (r70-r77)*k77
                        "smlal2  v23.4s, v4.8h, v1.h[7]       \n"
                        // r8
                        "smlal  v8.4s, v7.4h, v2.h[0]         \n" // out0 += (r80-r87)*k08
                        "smlal2  v9.4s, v7.8h, v2.h[0]        \n"
                        "smlal  v10.4s, v7.4h, v2.h[1]        \n" // out1 += (r80-r87)*k18
                        "smlal2  v11.4s, v7.8h, v2.h[1]       \n"
                        "smlal  v12.4s, v7.4h, v2.h[2]        \n" // out2 += (r80-r87)*k28
                        "smlal2  v13.4s, v7.8h, v2.h[2]       \n"
                        "smlal  v14.4s, v7.4h, v2.h[3]        \n" // out3 += (r80-r87)*k38
                        "smlal2  v15.4s, v7.8h, v2.h[3]       \n"
                        "smlal  v16.4s, v7.4h, v2.h[4]        \n" // out4 += (r80-r87)*k48
                        "smlal2  v17.4s, v7.8h, v2.h[4]       \n"
                        "smlal  v18.4s, v7.4h, v2.h[5]        \n" // out5 += (r80-r87)*k58
                        "smlal2  v19.4s, v7.8h, v2.h[5]       \n"
                        "smlal  v20.4s, v7.4h, v2.h[6]        \n" // out6 += (r80-r87)*k68
                        "smlal2  v21.4s, v7.8h, v2.h[6]       \n"
                        "smlal  v22.4s, v7.4h, v2.h[7]        \n" // out7 += (r80-r87)*k78
                        "smlal2  v23.4s, v7.8h, v2.h[7]       \n"

                        "st1    {v8.4s, v9.4s}, [%1], #32     \n"
                        "st1    {v10.4s, v11.4s}, [%2], #32   \n"
                        "st1    {v12.4s, v13.4s}, [%3], #32   \n"
                        "st1    {v14.4s, v15.4s}, [%4], #32   \n"
                        "st1    {v16.4s, v17.4s}, [%5], #32   \n"
                        "st1    {v18.4s, v19.4s}, [%6], #32   \n"
                        "st1    {v20.4s, v21.4s}, [%7], #32   \n"
                        "st1    {v22.4s, v23.4s}, [%8], #32   \n"

                        "subs   %w0, %w0, #1                  \n"
                        "sub    %12, %12, #72                 \n" // reset ktmp

                        "bne    0b                            \n"

                        : "=r"(nn),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1), // %2
                        "=r"(outptr2), // %3
                        "=r"(outptr3), // %4
                        "=r"(outptr4), // %5
                        "=r"(outptr5), // %6
                        "=r"(outptr6), // %7
                        "=r"(outptr7), // %8
                        "=r"(r0),      // %9
                        "=r"(r1),      // %10
                        "=r"(r2),      // %11
                        "=r"(ktmp)     // %12
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr2),
                        "4"(outptr3),
                        "5"(outptr4),
                        "6"(outptr5),
                        "7"(outptr6),
                        "8"(outptr7),
                        "9"(r0),
                        "10"(r1),
                        "11"(r2),
                        "12"(ktmp)
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
                }
#else  // __aarch64__
                if (nn > 0)
                {
                    asm volatile(
                        "0:                             \n"
                        "pld        [%1, #128]          \n"
                        "vld1.s32   {d16-d17}, [%1]     \n" // out0
                        "pld        [%2, #128]          \n"
                        "vld1.s32   {d18-d19}, [%2]     \n" // out1
                        "pld        [%3, #128]          \n"
                        "vld1.s32   {d20-d21}, [%3]     \n" // out2
                        "pld        [%4, #128]          \n"
                        "vld1.s32   {d22-d23}, [%4]     \n" // out3

                        // r0
                        "pld        [%9, #64]          \n"
                        "vld2.s8    {d8-d9}, [%9]       \n" // d8(a00 a02 a04 a06 a08 a010 a012 a014), d9(a01 a03 a05 a07 a09 a011 a013 a015)
                        "add        %9, #8              \n"
                        "pld        [%12, #64]         \n"
                        "vld1.s8    {d0-d2}, [%12]!     \n" // d0(k00-k70) d1(k01-k71) d2(k02-k72)

                        "pld        [%5, #128]          \n"
                        "vld1.s32   {d24-d25}, [%5]     \n" // out4
                        "pld        [%6, #128]          \n"
                        "vld1.s32   {d26-d27}, [%6]     \n" // out5

                        "vmovl.s8   q2, d2              \n" // q2(k02-k72)
                        "vmovl.s8   q1, d1              \n" // q1(k01-k71)
                        "vmovl.s8   q0, d0              \n" // q0(k00-k70)
                        "vext.s8    d12, d8, d8, #1     \n" // d12(a02 a04 a06 a08 x x x x)

                        "pld        [%7, #128]          \n"
                        "vld1.s32   {d28-d29}, [%7]     \n" // out6

                        "vmovl.s8   q5, d9              \n" // q5(a01 a03 a05 a07 a09 a011 a013 a015) d11
                        "vmovl.s8   q4, d8              \n" // q4(a00 a02 a04 a06 a08 a010 a012 a014) d9
                        "vmovl.s8   q6, d12             \n" // q6(a02 a04 a06 a08 a010 a012 a014 a016) d13

                        "pld        [%8, #128]          \n"
                        "vld1.s32   {d30-d31}, [%8]     \n" // out7

                        "vmlal.s16  q8, d8, d0[0]       \n" // sum0 += (a00 a02 a04 a06) * k00
                        "vmlal.s16  q9, d8, d0[1]       \n" // sum1 += (a00 a02 a04 a06) * k10
                        "vmlal.s16  q10, d8, d0[2]      \n" // sum2 += (a00 a02 a04 a06) * k20
                        "vmlal.s16  q11, d8, d0[3]      \n" // sum3 += (a00 a02 a04 a06) * k30
                        "vmlal.s16  q12, d8, d1[0]      \n" // sum4 += (a00 a02 a04 a06) * k40
                        "vmlal.s16  q13, d8, d1[1]      \n" // sum5 += (a00 a02 a04 a06) * k50
                        "vmlal.s16  q14, d8, d1[2]      \n" // sum6 += (a00 a02 a04 a06) * k60
                        "vmlal.s16  q15, d8, d1[3]      \n" // sum7 += (a00 a02 a04 a06) * k70

                        "vmlal.s16  q8, d10, d2[0]      \n" // sum0 += (a01-a07) * k01
                        "vmlal.s16  q9, d10, d2[1]      \n" // sum1 += (a01-a07) * k11
                        "vmlal.s16  q10, d10, d2[2]     \n" // sum2 += (a01-a07) * k21
                        "vmlal.s16  q11, d10, d2[3]     \n" // sum3 += (a01-a07) * k31
                        "vmlal.s16  q12, d10, d3[0]     \n" // sum4 += (a01-a07) * k41
                        "vmlal.s16  q13, d10, d3[1]     \n" // sum5 += (a01-a07) * k51
                        "vmlal.s16  q14, d10, d3[2]     \n" // sum6 += (a01-a07) * k61
                        "vmlal.s16  q15, d10, d3[3]     \n" // sum7 += (a01-a07) * k71

                        "pld        [%10, #64]         \n"
                        "vld2.s8    {d8-d9}, [%10]      \n" // d8(a10 a12 a14 a16 a18 a110 a112 a114), d9(a11 a13 a15 a17 a19 a111 a113 a115)
                        "add        %10, #8             \n"

                        "vmlal.s16  q8, d12, d4[0]      \n" // sum0 += (a02-a08) * k02
                        "vmlal.s16  q9, d12, d4[1]      \n" // sum1 += (a02-a08) * k12
                        "vmlal.s16  q10, d12, d4[2]     \n" // sum2 += (a02-a08) * k22
                        "vmlal.s16  q11, d12, d4[3]     \n" // sum3 += (a02-a08) * k32

                        "pld        [%12, #64]         \n"
                        "vld1.s8    {d0-d2}, [%12]!     \n" // d0(k03-k73) d1(k04-k74) d2(k05-k75)

                        "vmlal.s16  q12, d12, d5[0]     \n" // sum4 += (a02-a08) * k42
                        "vmlal.s16  q13, d12, d5[1]     \n" // sum5 += (a02-a08) * k52
                        "vmlal.s16  q14, d12, d5[2]     \n" // sum6 += (a02-a08) * k62
                        "vmlal.s16  q15, d12, d5[3]     \n" // sum7 += (a02-a08) * k72

                        // r1
                        "vext.s8    d12, d8, d8, #1     \n" // d12(a12 a14 a16 a18 x x x x)

                        "vmovl.s8   q2, d2              \n" // q2(k05-k75)
                        "vmovl.s8   q1, d1              \n" // q1(k04-k74)
                        "vmovl.s8   q0, d0              \n" // q0(k03-k73)
                        "vmovl.s8   q5, d9              \n" // q5(a11-a115)
                        "vmovl.s8   q4, d8              \n" // q4(a10-a114)
                        "vmovl.s8   q6, d12             \n" // q6(a12-a116)

                        "vmlal.s16  q8, d8, d0[0]       \n" // sum0 += (a10-a16) * k03
                        "vmlal.s16  q9, d8, d0[1]       \n" // sum1 += (a10-a16) * k13
                        "vmlal.s16  q10, d8, d0[2]      \n" // sum2 += (a10-a16) * k23
                        "vmlal.s16  q11, d8, d0[3]      \n" // sum3 += (a10-a16) * k33
                        "vmlal.s16  q12, d8, d1[0]      \n" // sum4 += (a10-a16) * k43
                        "vmlal.s16  q13, d8, d1[1]      \n" // sum5 += (a10-a16) * k53
                        "vmlal.s16  q14, d8, d1[2]      \n" // sum6 += (a10-a16) * k63
                        "vmlal.s16  q15, d8, d1[3]      \n" // sum7 += (a10-a16) * k73

                        "vmlal.s16  q8, d10, d2[0]      \n" // sum0 += (a11-a17) * k04
                        "vmlal.s16  q9, d10, d2[1]      \n" // sum1 += (a11-a17) * k14
                        "vmlal.s16  q10, d10, d2[2]     \n" // sum2 += (a11-a17) * k24
                        "vmlal.s16  q11, d10, d2[3]     \n" // sum3 += (a11-a17) * k34
                        "vmlal.s16  q12, d10, d3[0]     \n" // sum4 += (a11-a17) * k44
                        "vmlal.s16  q13, d10, d3[1]     \n" // sum5 += (a11-a17) * k54
                        "vmlal.s16  q14, d10, d3[2]     \n" // sum6 += (a11-a17) * k64
                        "vmlal.s16  q15, d10, d3[3]     \n" // sum7 += (a11-a17) * k74

                        "pld        [%11, #64]         \n"
                        "vld2.s8    {d8-d9}, [%11]      \n" // d8(a20 a22 a24 a26 a28 a210 a212 a214), d9(a21 a23 a25 a27 a29 a211 a213 a215)
                        "add        %11, #8             \n"

                        "vmlal.s16  q8, d12, d4[0]      \n" // sum0 += (a12-a18) * k05
                        "vmlal.s16  q9, d12, d4[1]      \n" // sum1 += (a12-a18) * k15
                        "vmlal.s16  q10, d12, d4[2]     \n" // sum2 += (a12-a18) * k25
                        "vmlal.s16  q11, d12, d4[3]     \n" // sum3 += (a12-a18) * k35

                        "pld        [%12, #64]         \n"
                        "vld1.s8    {d0-d2}, [%12]!     \n" // d0(k06-k76) d1(k07-k77) d2(k08-k78)

                        "vmlal.s16  q12, d12, d5[0]     \n" // sum4 += (a12-a18) * k45
                        "vmlal.s16  q13, d12, d5[1]     \n" // sum5 += (a12-a18) * k55
                        "vmlal.s16  q14, d12, d5[2]     \n" // sum6 += (a12-a18) * k65
                        "vmlal.s16  q15, d12, d5[3]     \n" // sum7 += (a12-a18) * k75

                        // r2
                        "vext.s8    d12, d8, d8, #1     \n" // d12(a22 a24 a26 a28 x x x x)

                        "vmovl.s8   q2, d2              \n" // q2(k08-k78)
                        "vmovl.s8   q1, d1              \n" // q1(k07-k77)
                        "vmovl.s8   q0, d0              \n" // q0(k06-k76)
                        "vmovl.s8   q5, d9              \n" // q5(a21-a215)
                        "vmovl.s8   q4, d8              \n" // q4(a20-a214)
                        "vmovl.s8   q6, d12             \n" // q6(a22-a216)

                        "vmlal.s16  q8, d8, d0[0]       \n" // sum0 += (a20-a26) * k06
                        "vmlal.s16  q9, d8, d0[1]       \n" // sum1 += (a20-a26) * k16
                        "vmlal.s16  q10, d8, d0[2]      \n" // sum2 += (a20-a26) * k26
                        "vmlal.s16  q11, d8, d0[3]      \n" // sum3 += (a20-a26) * k36
                        "vmlal.s16  q12, d8, d1[0]      \n" // sum4 += (a20-a26) * k46
                        "vmlal.s16  q13, d8, d1[1]      \n" // sum5 += (a20-a26) * k56
                        "vmlal.s16  q14, d8, d1[2]      \n" // sum6 += (a20-a26) * k66
                        "vmlal.s16  q15, d8, d1[3]      \n" // sum7 += (a20-a26) * k76

                        "vmlal.s16  q8, d10, d2[0]      \n" // sum0 += (a21-a27) * k07
                        "vmlal.s16  q9, d10, d2[1]      \n" // sum1 += (a21-a27) * k17
                        "vmlal.s16  q10, d10, d2[2]     \n" // sum2 += (a21-a27) * k27
                        "vmlal.s16  q11, d10, d2[3]     \n" // sum3 += (a21-a27) * k37
                        "vmlal.s16  q12, d10, d3[0]     \n" // sum4 += (a21-a27) * k47
                        "vmlal.s16  q13, d10, d3[1]     \n" // sum5 += (a21-a27) * k57
                        "vmlal.s16  q14, d10, d3[2]     \n" // sum6 += (a21-a27) * k67
                        "vmlal.s16  q15, d10, d3[3]     \n" // sum7 += (a21-a27) * k77

                        "vmlal.s16  q8, d12, d4[0]      \n" // sum0 += (a22-a28) * k08
                        "vmlal.s16  q9, d12, d4[1]      \n" // sum1 += (a22-a28) * k18
                        "vmlal.s16  q10, d12, d4[2]     \n" // sum2 += (a22-a28) * k28
                        "vmlal.s16  q11, d12, d4[3]     \n" // sum3 += (a22-a28) * k38
                        "vmlal.s16  q12, d12, d5[0]     \n" // sum4 += (a22-a28) * k48
                        "vmlal.s16  q13, d12, d5[1]     \n" // sum5 += (a22-a28) * k58
                        "vmlal.s16  q14, d12, d5[2]     \n" // sum6 += (a22-a28) * k68
                        "vmlal.s16  q15, d12, d5[3]     \n" // sum7 += (a22-a28) * k78

                        // save s32 to memory
                        "sub        %12, %12, #72       \n"
                        "vst1.s32   {d16-d17}, [%1]!    \n" // out0
                        "vst1.s32   {d18-d19}, [%2]!    \n" // out1
                        "vst1.s32   {d20-d21}, [%3]!    \n" // out2
                        "vst1.s32   {d22-d23}, [%4]!    \n" // out3
                        "subs       %0, #1              \n"
                        "vst1.s32   {d24-d25}, [%5]!    \n" // out4
                        "vst1.s32   {d26-d27}, [%6]!    \n" // out5
                        "vst1.s32   {d28-d29}, [%7]!    \n" // out6
                        "vst1.s32   {d30-d31}, [%8]!    \n" // out7

                        "bne        0b                  \n"
                        : "=r"(nn),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1), // %2
                        "=r"(outptr2), // %3
                        "=r"(outptr3), // %4
                        "=r"(outptr4), // %5
                        "=r"(outptr5), // %6
                        "=r"(outptr6), // %7
                        "=r"(outptr7), // %8
                        "=r"(r0),      // %9
                        "=r"(r1),      // %10
                        "=r"(r2),      // %11
                        "=r"(ktmp)     // %12
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr2),
                        "4"(outptr3),
                        "5"(outptr4),
                        "6"(outptr5),
                        "7"(outptr6),
                        "8"(outptr7),
                        "9"(r0),
                        "10"(r1),
                        "11"(r2),
                        "12"(ktmp)
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
#if __aarch64__
                    int8x8_t _r0_s8 = vld1_s8(r0); // (a00 a01 a02 ....)
                    int8x8_t _r1_s8 = vld1_s8(r1); // (a10 a11 a12 ....)
                    int8x8_t _r2_s8 = vld1_s8(r2); // (a20 a21 a22 ....)

                    int16x8_t _r0 = vmovl_s8(_r0_s8);
                    int16x8_t _r1 = vmovl_s8(_r1_s8);
                    int16x8_t _r2 = vmovl_s8(_r2_s8);

                    int32x4_t _sum03 = {};
                    int32x4_t _sum47 = {};

                    _sum03 = vld1q_lane_s32(outptr0, _sum03, 0); // out0
                    _sum03 = vld1q_lane_s32(outptr1, _sum03, 1); // out1
                    _sum03 = vld1q_lane_s32(outptr2, _sum03, 2); // out2
                    _sum03 = vld1q_lane_s32(outptr3, _sum03, 3); // out3
                    _sum47 = vld1q_lane_s32(outptr4, _sum47, 0); // out4
                    _sum47 = vld1q_lane_s32(outptr5, _sum47, 1); // out5
                    _sum47 = vld1q_lane_s32(outptr6, _sum47, 2); // out6
                    _sum47 = vld1q_lane_s32(outptr7, _sum47, 3); // out7

                    // k0 - k2
                    int8x8_t _k0_8 = vld1_s8(ktmp);      //(k00-k70)
                    int8x8_t _k1_8 = vld1_s8(ktmp + 8);  //(k01-k71)
                    int8x8_t _k2_8 = vld1_s8(ktmp + 16); //(k02-k72)

                    int16x8_t _k0 = vmovl_s8(_k0_8);
                    int16x8_t _k1 = vmovl_s8(_k1_8);
                    int16x8_t _k2 = vmovl_s8(_k2_8);

                    int32x4_t _sum0 = vmull_laneq_s16(vget_low_s16(_k0), _r0, 0);
                    int32x4_t _sum0n = vmull_laneq_s16(vget_high_s16(_k0), _r0, 0);
                    int32x4_t _sum1 = vmull_laneq_s16(vget_low_s16(_k1), _r0, 1);
                    int32x4_t _sum1n = vmull_laneq_s16(vget_high_s16(_k1), _r0, 1);
                    _sum03 = vmlal_laneq_s16(_sum03, vget_low_s16(_k2), _r0, 2);
                    _sum47 = vmlal_laneq_s16(_sum47, vget_high_s16(_k2), _r0, 2);

                    // k3 - k5
                    _k0_8 = vld1_s8(ktmp + 24); //(k03-k73)
                    _k1_8 = vld1_s8(ktmp + 32); //(k04-k74)
                    _k2_8 = vld1_s8(ktmp + 40); //(k05-k75)

                    _k0 = vmovl_s8(_k0_8);
                    _k1 = vmovl_s8(_k1_8);
                    _k2 = vmovl_s8(_k2_8);

                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_k0), _r1, 0);
                    _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_k0), _r1, 0);
                    _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_k1), _r1, 1);
                    _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r1, 1);
                    _sum03 = vmlal_laneq_s16(_sum03, vget_low_s16(_k2), _r1, 2);
                    _sum47 = vmlal_laneq_s16(_sum47, vget_high_s16(_k2), _r1, 2);

                    // k6 - k8
                    _k0_8 = vld1_s8(ktmp + 48); //(k06-k76)
                    _k1_8 = vld1_s8(ktmp + 56); //(k07-k77)
                    _k2_8 = vld1_s8(ktmp + 64); //(k08-k78)

                    _k0 = vmovl_s8(_k0_8);
                    _k1 = vmovl_s8(_k1_8);
                    _k2 = vmovl_s8(_k2_8);

                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_k0), _r2, 0);
                    _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_k0), _r2, 0);
                    _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_k1), _r2, 1);
                    _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r2, 1);
                    _sum03 = vmlal_laneq_s16(_sum03, vget_low_s16(_k2), _r2, 2);
                    _sum47 = vmlal_laneq_s16(_sum47, vget_high_s16(_k2), _r2, 2);

                    _sum0 = vaddq_s32(_sum0, _sum1);
                    _sum0n = vaddq_s32(_sum0n, _sum1n);
                    _sum03 = vaddq_s32(_sum03, _sum0);
                    _sum47 = vaddq_s32(_sum47, _sum0n);

                    vst1q_lane_s32(outptr0, _sum03, 0);
                    vst1q_lane_s32(outptr1, _sum03, 1);
                    vst1q_lane_s32(outptr2, _sum03, 2);
                    vst1q_lane_s32(outptr3, _sum03, 3);
                    vst1q_lane_s32(outptr4, _sum47, 0);
                    vst1q_lane_s32(outptr5, _sum47, 1);
                    vst1q_lane_s32(outptr6, _sum47, 2);
                    vst1q_lane_s32(outptr7, _sum47, 3);

                    outptr0++;
                    outptr1++;
                    outptr2++;
                    outptr3++;
                    outptr4++;
                    outptr5++;
                    outptr6++;
                    outptr7++;
#else  // __aarch64__
                    asm volatile(
                        "pld        [%8, #64]          \n"
                        "vld1.s8    {d0}, [%8]         \n" // d0(a00 a01 a02 ....)
                        "pld        [%9, #64]          \n"
                        "vld1.s8    {d2}, [%9]         \n" // d2(a10 a11 a12 ....)
                        "pld        [%10, #64]         \n"
                        "vld1.s8    {d4}, [%10]        \n" // d4(a20 a21 a22 ....)

                        "pld        [%11, #64]         \n"
                        "vld1.s8    {d6-d8}, [%11]!    \n" // d6(k00-k70) d7(k01-k71) d8(k02-k72)

                        "vmovl.s8   q0, d0             \n" // d0(a00 a01 a02 x)
                        "vmovl.s8   q1, d2             \n" // d2(a10 a11 a12 x)
                        "vmovl.s8   q2, d4             \n" // d4(a20 a21 a22 x)

                        "vmovl.s8   q5, d8             \n" // d10(k02-k32) d11(k42-k72)
                        "vmovl.s8   q4, d7             \n" // d8(k01-k31) d9(k41-k71)
                        "vmovl.s8   q3, d6             \n" // d6(k00-k30) d7(k40-k70)

                        "vld1.s32   {d20[0]}, [%0]     \n" // out0 q10
                        "vld1.s32   {d20[1]}, [%1]     \n" // out1
                        "vld1.s32   {d21[0]}, [%2]     \n" // out2
                        "vld1.s32   {d21[1]}, [%3]     \n" // out3

                        "pld        [%11, #64]         \n"
                        "vld1.s8    {d24-d26}, [%11]!  \n"
                        "vmovl.s8   q14, d26           \n" // d28(k05-k35) d29(k45-k75)
                        "vmovl.s8   q13, d25           \n" // d26(k04-k34) d27(k44-k74)
                        "vmovl.s8   q12, d24           \n" // d24(k03-k33) d25(k43-k73)

                        "vld1.s32   {d22[0]}, [%4]     \n" // out4 q11
                        "vld1.s32   {d22[1]}, [%5]     \n" // out5
                        "vld1.s32   {d23[0]}, [%6]     \n" // out6
                        "vld1.s32   {d23[1]}, [%7]     \n" // out7

                        "vmull.s16  q6, d6, d0[0]      \n" // a00 x (k00-k30)
                        "vmull.s16  q7, d7, d0[0]      \n" // a00 x (k40-k70)
                        "vmull.s16  q8, d8, d0[1]      \n" // a01 x (k01-k31)
                        "vmull.s16  q9, d9, d0[1]      \n" // a01 x (k41-k71)
                        "vmlal.s16  q10, d10, d0[2]    \n" // a02 x (k02-k32)
                        "vmlal.s16  q11, d11, d0[2]    \n" // a02 x (k42-k72)

                        "pld        [%11, #64]         \n"
                        "vld1.s8    {d6-d8}, [%11]!    \n"
                        "vmovl.s8   q5, d8             \n" // d10(k08-k38) d11(k48-k78)
                        "vmovl.s8   q4, d7             \n" // d8(k07-k37) d9(k47-k77)
                        "vmovl.s8   q3, d6             \n" // d6(k06-k36) d7(k46-k76)

                        "vmlal.s16  q6, d24, d2[0]     \n" // a10 x (k03-k33)
                        "vmlal.s16  q7, d25, d2[0]     \n" // a10 x (k43-k73)
                        "vmlal.s16  q8, d26, d2[1]     \n" // a11 x (k04-k34)
                        "vmlal.s16  q9, d27, d2[1]     \n" // a11 x (k44-k74)
                        "vmlal.s16  q10, d28, d2[2]    \n" // a12 x (k05-k35)
                        "vmlal.s16  q11, d29, d2[2]    \n" // a12 x (k45-k75)

                        "vmlal.s16  q6, d6, d4[0]      \n" // a20 x (k06-k36)
                        "vmlal.s16  q7, d7, d4[0]      \n" // a20 x (k46-k76)
                        "vmlal.s16  q8, d8, d4[1]      \n" // a21 x (k07-k37)
                        "vmlal.s16  q9, d9, d4[1]      \n" // a21 x (k47-k77)
                        "vmlal.s16  q10, d10, d4[2]    \n" // a22 x (k08-k38)
                        "vmlal.s16  q11, d11, d4[2]    \n" // a22 x (k48-k78)

                        "vadd.s32   q8, q8, q6         \n"
                        "vadd.s32   q9, q9, q7         \n"

                        "sub        %11, %11, #72      \n"

                        "vadd.s32   q10, q10, q8       \n"
                        "vadd.s32   q11, q11, q9       \n"

                        "vst1.s32   {d20[0]}, [%0]!    \n" // out0
                        "vst1.s32   {d20[1]}, [%1]!    \n" // out1
                        "vst1.s32   {d21[0]}, [%2]!    \n" // out2
                        "vst1.s32   {d21[1]}, [%3]!    \n" // out3
                        "vst1.s32   {d22[0]}, [%4]!    \n" // out4
                        "vst1.s32   {d22[1]}, [%5]!    \n" // out5
                        "vst1.s32   {d23[0]}, [%6]!    \n" // out6
                        "vst1.s32   {d23[1]}, [%7]!    \n" // out7

                        : "=r"(outptr0), // %0
                        "=r"(outptr1), // %1
                        "=r"(outptr2), // %2
                        "=r"(outptr3), // %3
                        "=r"(outptr4), // %4
                        "=r"(outptr5), // %5
                        "=r"(outptr6), // %6
                        "=r"(outptr7), // %7
                        "=r"(r0),      // %8
                        "=r"(r1),      // %9
                        "=r"(r2),      // %10
                        "=r"(ktmp)     // %11
                        : "0"(outptr0),
                        "1"(outptr1),
                        "2"(outptr2),
                        "3"(outptr3),
                        "4"(outptr4),
                        "5"(outptr5),
                        "6"(outptr6),
                        "7"(outptr7),
                        "8"(r0),
                        "9"(r1),
                        "10"(r2),
                        "11"(ktmp)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // __ARM_NEON
                    int sum0 = 0;
                    int sum1 = 0;
                    int sum2 = 0;
                    int sum3 = 0;
                    int sum4 = 0;
                    int sum5 = 0;
                    int sum6 = 0;
                    int sum7 = 0;

                    sum0 += (int)r0[0] * ktmp[0];
                    sum1 += (int)r0[0] * ktmp[1];
                    sum2 += (int)r0[0] * ktmp[2];
                    sum3 += (int)r0[0] * ktmp[3];
                    sum4 += (int)r0[0] * ktmp[4];
                    sum5 += (int)r0[0] * ktmp[5];
                    sum6 += (int)r0[0] * ktmp[6];
                    sum7 += (int)r0[0] * ktmp[7];
                    ktmp += 8;

                    sum0 += (int)r0[1] * ktmp[0];
                    sum1 += (int)r0[1] * ktmp[1];
                    sum2 += (int)r0[1] * ktmp[2];
                    sum3 += (int)r0[1] * ktmp[3];
                    sum4 += (int)r0[1] * ktmp[4];
                    sum5 += (int)r0[1] * ktmp[5];
                    sum6 += (int)r0[1] * ktmp[6];
                    sum7 += (int)r0[1] * ktmp[7];
                    ktmp += 8;

                    sum0 += (int)r0[2] * ktmp[0];
                    sum1 += (int)r0[2] * ktmp[1];
                    sum2 += (int)r0[2] * ktmp[2];
                    sum3 += (int)r0[2] * ktmp[3];
                    sum4 += (int)r0[2] * ktmp[4];
                    sum5 += (int)r0[2] * ktmp[5];
                    sum6 += (int)r0[2] * ktmp[6];
                    sum7 += (int)r0[2] * ktmp[7];
                    ktmp += 8;

                    sum0 += (int)r1[0] * ktmp[0];
                    sum1 += (int)r1[0] * ktmp[1];
                    sum2 += (int)r1[0] * ktmp[2];
                    sum3 += (int)r1[0] * ktmp[3];
                    sum4 += (int)r1[0] * ktmp[4];
                    sum5 += (int)r1[0] * ktmp[5];
                    sum6 += (int)r1[0] * ktmp[6];
                    sum7 += (int)r1[0] * ktmp[7];
                    ktmp += 8;

                    sum0 += (int)r1[1] * ktmp[0];
                    sum1 += (int)r1[1] * ktmp[1];
                    sum2 += (int)r1[1] * ktmp[2];
                    sum3 += (int)r1[1] * ktmp[3];
                    sum4 += (int)r1[1] * ktmp[4];
                    sum5 += (int)r1[1] * ktmp[5];
                    sum6 += (int)r1[1] * ktmp[6];
                    sum7 += (int)r1[1] * ktmp[7];
                    ktmp += 8;

                    sum0 += (int)r1[2] * ktmp[0];
                    sum1 += (int)r1[2] * ktmp[1];
                    sum2 += (int)r1[2] * ktmp[2];
                    sum3 += (int)r1[2] * ktmp[3];
                    sum4 += (int)r1[2] * ktmp[4];
                    sum5 += (int)r1[2] * ktmp[5];
                    sum6 += (int)r1[2] * ktmp[6];
                    sum7 += (int)r1[2] * ktmp[7];
                    ktmp += 8;

                    sum0 += (int)r2[0] * ktmp[0];
                    sum1 += (int)r2[0] * ktmp[1];
                    sum2 += (int)r2[0] * ktmp[2];
                    sum3 += (int)r2[0] * ktmp[3];
                    sum4 += (int)r2[0] * ktmp[4];
                    sum5 += (int)r2[0] * ktmp[5];
                    sum6 += (int)r2[0] * ktmp[6];
                    sum7 += (int)r2[0] * ktmp[7];
                    ktmp += 8;

                    sum0 += (int)r2[1] * ktmp[0];
                    sum1 += (int)r2[1] * ktmp[1];
                    sum2 += (int)r2[1] * ktmp[2];
                    sum3 += (int)r2[1] * ktmp[3];
                    sum4 += (int)r2[1] * ktmp[4];
                    sum5 += (int)r2[1] * ktmp[5];
                    sum6 += (int)r2[1] * ktmp[6];
                    sum7 += (int)r2[1] * ktmp[7];
                    ktmp += 8;

                    sum0 += (int)r2[2] * ktmp[0];
                    sum1 += (int)r2[2] * ktmp[1];
                    sum2 += (int)r2[2] * ktmp[2];
                    sum3 += (int)r2[2] * ktmp[3];
                    sum4 += (int)r2[2] * ktmp[4];
                    sum5 += (int)r2[2] * ktmp[5];
                    sum6 += (int)r2[2] * ktmp[6];
                    sum7 += (int)r2[2] * ktmp[7];
                    ktmp += 8;

                    *outptr0 += sum0;
                    *outptr1 += sum1;
                    *outptr2 += sum2;
                    *outptr3 += sum3;
                    *outptr4 += sum4;
                    *outptr5 += sum5;
                    *outptr6 += sum6;
                    *outptr7 += sum7;

                    ktmp -= 8 * 9;

                    outptr0++;
                    outptr1++;
                    outptr2++;
                    outptr3++;
                    outptr4++;
                    outptr5++;
                    outptr6++;
                    outptr7++;
#endif // __ARM_NEON
                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            ktmp += 8 * 9;
        }
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        out.fill(0);

        const signed char* ktmp = _kernel.channel(p / 8 + p % 8);

        for (int q = 0; q < inch; q++)
        {
            int* outptr = out;

            const signed char* img0 = bottom_blob.channel(q);

            const signed char* r0 = img0;
            const signed char* r1 = img0 + w;
            const signed char* r2 = img0 + w * 2;

            int i = 0;

            for (; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 3;
                int remain = outw & 7;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                // AArch64 vector body for one output channel: every pass of the asm loop
                // produces 8 int32 results at outptr and advances r0/r1/r2 by 16 bytes.
                // The values already stored at outptr are loaded into v14/v15 and added to
                // the freshly computed 3x3 sums, so contributions from successive input
                // channels accumulate instead of replacing each other.
                if (nn > 0)
                {
                    asm volatile(
                        "0:                                   \n"

                        "ld1    {v0.8b, v1.8b}, [%5]          \n" // ktmp: 16 bytes read, only lanes k0..k8 are used below
                        "ld2    {v2.8b, v3.8b}, [%2], #16     \n" // row0: v2 = even bytes, v3 = odd bytes, r0 += 16
                        "ld2    {v4.8b, v5.8b}, [%2]          \n" // row0: next 16 bytes, feeds the shifted tap

                        "ld2    {v6.8b, v7.8b}, [%3], #16     \n" // row1: v6 = even bytes, v7 = odd bytes, r1 += 16
                        "ld2    {v8.8b, v9.8b}, [%3]          \n" // row1: next 16 bytes, feeds the shifted tap

                        "ld2    {v10.8b, v11.8b}, [%4], #16   \n" // row2: v10 = even bytes, v11 = odd bytes, r2 += 16
                        "ld2    {v12.8b, v13.8b}, [%4]        \n" // row2: next 16 bytes, feeds the shifted tap

                        "ld1    {v14.4s, v15.4s}, [%1]        \n" // current 8 int32 values at outptr (running sums)

                        // third tap of each row: even bytes shifted left by one element
                        "ext    v4.8b, v2.8b, v4.8b, #1       \n"
                        "ext    v8.8b, v6.8b, v8.8b, #1       \n"
                        "ext    v12.8b, v10.8b, v12.8b, #1    \n"

                        // widen int8 -> int16
                        "sshll  v0.8h, v0.8b, #0              \n" //(k0-k7)
                        "sshll  v1.8h, v1.8b, #0              \n" //(k8)
                        "sshll  v2.8h, v2.8b, #0              \n" // r0
                        "sshll  v3.8h, v3.8b, #0              \n" // r1
                        "sshll  v4.8h, v4.8b, #0              \n" // r2
                        "sshll  v6.8h, v6.8b, #0              \n" // r3
                        "sshll  v7.8h, v7.8b, #0              \n" // r4
                        "sshll  v8.8h, v8.8b, #0              \n" // r5
                        "sshll  v10.8h, v10.8b, #0            \n" // r6
                        "sshll  v11.8h, v11.8b, #0            \n" // r7
                        "sshll  v12.8h, v12.8b, #0            \n" // r8

                        // two independent partial sums: v16/v17 ("out") and v18/v19 ("outn")
                        "smull  v16.4s, v2.4h, v0.h[0]        \n" // out = r0*k0
                        "smull2  v17.4s, v2.8h, v0.h[0]       \n"
                        "smull  v18.4s, v3.4h, v0.h[1]        \n" // outn = r1*k1
                        "smull2  v19.4s, v3.8h, v0.h[1]       \n"
                        "smlal  v16.4s, v4.4h, v0.h[2]        \n" // out += r2*k2
                        "smlal2  v17.4s, v4.8h, v0.h[2]       \n"
                        "smlal  v18.4s, v6.4h, v0.h[3]        \n" // outn += r3*k3
                        "smlal2  v19.4s, v6.8h, v0.h[3]       \n"
                        "smlal  v16.4s, v7.4h, v0.h[4]        \n" // out += r4*k4
                        "smlal2  v17.4s, v7.8h, v0.h[4]       \n"
                        "smlal  v18.4s, v8.4h, v0.h[5]        \n" // outn += r5*k5
                        "smlal2  v19.4s, v8.8h, v0.h[5]       \n"
                        "smlal  v16.4s, v10.4h, v0.h[6]       \n" // out += r6*k6
                        "smlal2  v17.4s, v10.8h, v0.h[6]      \n"
                        "smlal  v18.4s, v11.4h, v0.h[7]       \n" // outn += r7*k7
                        "smlal2  v19.4s, v11.8h, v0.h[7]      \n"
                        "smlal  v16.4s, v12.4h, v1.h[0]       \n" // out += r8*k8
                        "smlal2  v17.4s, v12.8h, v1.h[0]      \n"

                        "add    v8.4s, v16.4s, v18.4s         \n" // out + outn (v8/v9 are free: r5 already consumed)
                        "add    v9.4s, v17.4s, v19.4s         \n"

                        // fold in the values previously stored at outptr so that the
                        // result accumulates across input channels
                        "add    v8.4s, v8.4s, v14.4s          \n"
                        "add    v9.4s, v9.4s, v15.4s          \n"

                        "st1    {v8.4s, v9.4s}, [%1], #32     \n" // write 8 int32 back, outptr += 8

                        "subs   %w0, %w0, #1                  \n"

                        "bne    0b                            \n"

                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2),     // %4
                        "=r"(ktmp)    // %5
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(ktmp)
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
                }
#else
                if (nn > 0)
                {
                    asm volatile(
                        "vld1.s8    {d0-d1}, [%5]       \n" // d0(k0 - k7) d1(k8 ...)
                        "vmovl.s8   q1, d1              \n" // d2(k8 ...)
                        "vmovl.s8   q0, d0              \n" // d0(k0 - k3) d1(k4 - k7)
                        "0:                             \n"
                        "pld        [%2, #192]          \n"
                        "vld2.s8    {d4-d5}, [%2]!      \n" // r0 d4(a00 a02 ... a014) d5(a01 a03 ... a015)
                        "vld2.s8    {d8-d9}, [%2]       \n" //    d8(a016 ....)
                        "vld2.s8    {d10-d11}, [%3]!    \n" // r1 d10(a10 a12 ... a114) d11(a11 a13 ... a115)
                        "vld2.s8    {d14-d15}, [%3]     \n" //    d14(a116 ....)
                        "vld2.s8    {d16-d17}, [%4]!    \n" // r2 d16(a20 a22 ... a214) d17(a21 a23 ... a215)
                        "vld2.s8    {d20-d21}, [%4]     \n" //    d20(a216 ....)
                        "vld1.s32   {d22-d25}, [%1]     \n" // q11(out0 - out3) q12(out4 - out7)

                        "vext.s8    d8, d4, d8, #1      \n" //  d8(a02 a04 ... a016)
                        "vext.s8    d14, d10, d14, #1   \n" // d14(a12 a14 ... a116)
                        "vext.s8    d20, d16, d20, #1   \n" // d20(a22 a24 ... a216)

                        "vmovl.s8   q3, d5              \n" // q3(a01 a03 ... a015)
                        "vmovl.s8   q2, d4              \n" // q2(a00 a02 ... a014)
                        "vmovl.s8   q4, d8              \n" // q4(a02 a04 ... a016)

                        "vmovl.s8   q6, d11             \n" // q6(a11 a13 ... a115)
                        "vmovl.s8   q5, d10             \n" // q5(a10 a12 ... a114)
                        "vmovl.s8   q7, d14             \n" // q7(a12 a14 ... a116)

                        "vmovl.s8   q9, d17             \n" // q9(a21 a23 ... a215)
                        "vmovl.s8   q8, d16             \n" // q8(a20 a22 ... a214)
                        "vmovl.s8   q10, d20            \n" // q10(a22 a24 ... a216)

                        "vmlal.s16  q11, d4, d0[0]      \n" // k0
                        "vmlal.s16  q12, d5, d0[0]      \n"
                        "vmull.s16  q13, d6, d0[1]      \n" // k1
                        "vmull.s16  q14, d7, d0[1]      \n"
                        "vmlal.s16  q11, d8, d0[2]      \n" // k2
                        "vmlal.s16  q12, d9, d0[2]      \n"

                        "vmlal.s16  q13, d12, d1[0]     \n" // k4
                        "vmlal.s16  q14, d13, d1[0]     \n"
                        "vmlal.s16  q11, d10, d0[3]     \n" // k3
                        "vmlal.s16  q12, d11, d0[3]     \n"
                        "vmlal.s16  q13, d14, d1[1]     \n" // k5
                        "vmlal.s16  q14, d15, d1[1]     \n"

                        "vmlal.s16  q11, d16, d1[2]     \n" // k6
                        "vmlal.s16  q12, d17, d1[2]     \n"
                        "vmlal.s16  q13, d18, d1[3]     \n" // k7
                        "vmlal.s16  q14, d19, d1[3]     \n"
                        "vmlal.s16  q11, d20, d2[0]     \n" // k8
                        "vmlal.s16  q12, d21, d2[0]     \n"

                        "vadd.s32   q11, q11, q13       \n"
                        "vadd.s32   q12, q12, q14       \n"

                        "vst1.32    {d22-d25}, [%1]!    \n"

                        "subs       %0, #1              \n"
                        "bne        0b                  \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2),     // %4
                        "=r"(ktmp)    // %5
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(ktmp)
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                if (remain > 0)
                {
#if __ARM_NEON
                    int8x8_t _k01234567s8 = vld1_s8(ktmp);
                    int8x8_t _k8xxxxxxxs8 = vld1_s8(ktmp + 8);
                    int8x8_t _k34567xxxs8 = vext_s8(_k01234567s8, _k01234567s8, 3);
                    int8x8_t _k678xxxxxs8 = vext_s8(_k01234567s8, _k8xxxxxxxs8, 6);
                    int16x8_t _k0123_s16 = vmovl_s8(_k01234567s8);
                    int16x8_t _k3456_s16 = vmovl_s8(_k34567xxxs8);
                    int16x8_t _k678x_s16 = vmovl_s8(_k678xxxxxs8);
#endif
                    for (; remain > 0; remain--)
                    {
#if __ARM_NEON
                        int8x8_t _r00s8 = vld1_s8(r0);
                        int8x8_t _r10s8 = vld1_s8(r1);
                        int8x8_t _r20s8 = vld1_s8(r2);

                        int16x8_t _r00s16 = vmovl_s8(_r00s8);
                        int16x8_t _r10s16 = vmovl_s8(_r10s8);
                        int16x8_t _r20s16 = vmovl_s8(_r20s8);

                        int32x4_t _sum = vmull_s16(vget_low_s16(_r00s16), vget_low_s16(_k0123_s16));
                        _sum = vmlal_s16(_sum, vget_low_s16(_r10s16), vget_low_s16(_k3456_s16));
                        _sum = vmlal_s16(_sum, vget_low_s16(_r20s16), vget_low_s16(_k678x_s16));

                        _sum = vsetq_lane_s32(*outptr, _sum, 3);

#if __aarch64__
                        *outptr = vaddvq_s32(_sum);
#else
                        int32x2_t _ss = vadd_s32(vget_low_s32(_sum), vget_high_s32(_sum));
                        _ss = vpadd_s32(_ss, _ss);

                        *outptr = vget_lane_s32(_ss, 0);
#endif // __aarch64__
#else
                        int sum = 0;

                        sum += (int)r0[0] * ktmp[0];
                        sum += (int)r0[1] * ktmp[1];
                        sum += (int)r0[2] * ktmp[2];
                        sum += (int)r1[0] * ktmp[3];
                        sum += (int)r1[1] * ktmp[4];
                        sum += (int)r1[2] * ktmp[5];
                        sum += (int)r2[0] * ktmp[6];
                        sum += (int)r2[1] * ktmp[7];
                        sum += (int)r2[2] * ktmp[8];

                        *outptr += sum;
#endif // __ARM_NEON
                        r0 += 2;
                        r1 += 2;
                        r2 += 2;
                        outptr++;
                    }
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            ktmp += 9;
        }
    }
}


================================================
FILE: src/layer/arm/convolution_3x3_pack1to4.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_pack1to4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float* bias = _bias;

    int remain_outch_start = 0;

#if __ARM_NEON && __aarch64__
    int nn_outch = 0;
    nn_outch = outch >> 1;
    remain_outch_start = nn_outch << 1;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 2;

        Mat out0 = top_blob.channel(p);
        Mat out1 = top_blob.channel(p + 1);

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
        float32x4_t _bias1 = bias ? vld1q_f32((const float*)bias + (p + 1) * 4) : vdupq_n_f32(0.f);
        out0.fill(_bias0);
        out1.fill(_bias1);

        const float* k0 = kernel.channel(p);
        const float* k1 = kernel.channel(p + 1);

        for (int q = 0; q < inch; q++)
        {
            float* outptr0 = out0;
            float* outptr1 = out1;

            const Mat img0 = bottom_blob.channel(q);

            const float* r0 = img0.row(0);
            const float* r1 = img0.row(1);
            const float* r2 = img0.row(2);

            float32x4_t _k00_0 = vld1q_f32(k0);
            float32x4_t _k01_0 = vld1q_f32(k0 + 4);
            float32x4_t _k02_0 = vld1q_f32(k0 + 8);
            float32x4_t _k10_0 = vld1q_f32(k0 + 12);
            float32x4_t _k11_0 = vld1q_f32(k0 + 16);
            float32x4_t _k12_0 = vld1q_f32(k0 + 20);
            float32x4_t _k20_0 = vld1q_f32(k0 + 24);
            float32x4_t _k21_0 = vld1q_f32(k0 + 28);
            float32x4_t _k22_0 = vld1q_f32(k0 + 32);

            float32x4_t _k00_1 = vld1q_f32(k1);
            float32x4_t _k01_1 = vld1q_f32(k1 + 4);
            float32x4_t _k02_1 = vld1q_f32(k1 + 8);
            float32x4_t _k10_1 = vld1q_f32(k1 + 12);
            float32x4_t _k11_1 = vld1q_f32(k1 + 16);
            float32x4_t _k12_1 = vld1q_f32(k1 + 20);
            float32x4_t _k20_1 = vld1q_f32(k1 + 24);
            float32x4_t _k21_1 = vld1q_f32(k1 + 28);
            float32x4_t _k22_1 = vld1q_f32(k1 + 32);

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;

                for (; j + 3 < outw; j += 4)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0] \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%1] \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v0.4s}, [%2], #16          \n"

                        "ld1    {v1.2s}, [%2]               \n"

                        "fmla   v24.4s, %10.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %10.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %10.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %10.4s, v0.s[3]     \n"
                        "fmla   v28.4s, %19.4s, v0.s[0]     \n"
                        "fmla   v29.4s, %19.4s, v0.s[1]     \n"
                        "fmla   v30.4s, %19.4s, v0.s[2]     \n"
                        "fmla   v31.4s, %19.4s, v0.s[3]     \n"

                        "fmla   v24.4s, %11.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %11.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %11.4s, v0.s[3]     \n"
                        "fmla   v27.4s, %11.4s, v1.s[0]     \n"
                        "fmla   v28.4s, %20.4s, v0.s[1]     \n"
                        "fmla   v29.4s, %20.4s, v0.s[2]     \n"
                        "fmla   v30.4s, %20.4s, v0.s[3]     \n"
                        "fmla   v31.4s, %20.4s, v1.s[0]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v2.4s}, [%3], #16          \n"

                        "ld1    {v3.2s}, [%3]               \n"

                        "fmla   v24.4s, %12.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %12.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %12.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %12.4s, v1.s[1]     \n"
                        "fmla   v28.4s, %21.4s, v0.s[2]     \n"
                        "fmla   v29.4s, %21.4s, v0.s[3]     \n"
                        "fmla   v30.4s, %21.4s, v1.s[0]     \n"
                        "fmla   v31.4s, %21.4s, v1.s[1]     \n"

                        "fmla   v24.4s, %13.4s, v2.s[0]     \n"
                        "fmla   v25.4s, %13.4s, v2.s[1]     \n"
                        "fmla   v26.4s, %13.4s, v2.s[2]     \n"
                        "fmla   v27.4s, %13.4s, v2.s[3]     \n"
                        "fmla   v28.4s, %22.4s, v2.s[0]     \n"
                        "fmla   v29.4s, %22.4s, v2.s[1]     \n"
                        "fmla   v30.4s, %22.4s, v2.s[2]     \n"
                        "fmla   v31.4s, %22.4s, v2.s[3]     \n"

                        "fmla   v24.4s, %14.4s, v2.s[1]     \n"
                        "fmla   v25.4s, %14.4s, v2.s[2]     \n"
                        "fmla   v26.4s, %14.4s, v2.s[3]     \n"
                        "fmla   v27.4s, %14.4s, v3.s[0]     \n"
                        "fmla   v28.4s, %23.4s, v2.s[1]     \n"
                        "fmla   v29.4s, %23.4s, v2.s[2]     \n"
                        "fmla   v30.4s, %23.4s, v2.s[3]     \n"
                        "fmla   v31.4s, %23.4s, v3.s[0]     \n"

                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v0.4s}, [%4], #16          \n"

                        "ld1    {v1.2s}, [%4]               \n"

                        "fmla   v24.4s, %15.4s, v2.s[2]     \n"
                        "fmla   v25.4s, %15.4s, v2.s[3]     \n"
                        "fmla   v26.4s, %15.4s, v3.s[0]     \n"
                        "fmla   v27.4s, %15.4s, v3.s[1]     \n"
                        "fmla   v28.4s, %24.4s, v2.s[2]     \n"
                        "fmla   v29.4s, %24.4s, v2.s[3]     \n"
                        "fmla   v30.4s, %24.4s, v3.s[0]     \n"
                        "fmla   v31.4s, %24.4s, v3.s[1]     \n"

                        "fmla   v24.4s, %16.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %16.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %16.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %16.4s, v0.s[3]     \n"
                        "fmla   v28.4s, %25.4s, v0.s[0]     \n"
                        "fmla   v29.4s, %25.4s, v0.s[1]     \n"
                        "fmla   v30.4s, %25.4s, v0.s[2]     \n"
                        "fmla   v31.4s, %25.4s, v0.s[3]     \n"

                        "fmla   v24.4s, %17.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %17.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %17.4s, v0.s[3]     \n"
                        "fmla   v27.4s, %17.4s, v1.s[0]     \n"
                        "fmla   v28.4s, %26.4s, v0.s[1]     \n"
                        "fmla   v29.4s, %26.4s, v0.s[2]     \n"
                        "fmla   v30.4s, %26.4s, v0.s[3]     \n"
                        "fmla   v31.4s, %26.4s, v1.s[0]     \n"

                        "fmla   v24.4s, %18.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %18.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %18.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %18.4s, v1.s[1]     \n"
                        "fmla   v28.4s, %27.4s, v0.s[2]     \n"
                        "fmla   v29.4s, %27.4s, v0.s[3]     \n"
                        "fmla   v30.4s, %27.4s, v1.s[0]     \n"
                        "fmla   v31.4s, %27.4s, v1.s[1]     \n"

                        "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                        "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%1], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(outptr1), // %1
                        "=r"(r0),      // %2
                        "=r"(r1),      // %3
                        "=r"(r2)       // %4
                        : "0"(outptr0),
                        "1"(outptr1),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00_0), // %10
                        "w"(_k01_0), // %11
                        "w"(_k02_0), // %12
                        "w"(_k10_0), // %13
                        "w"(_k11_0), // %14
                        "w"(_k12_0), // %15
                        "w"(_k20_0), // %16
                        "w"(_k21_0), // %17
                        "w"(_k22_0), // %18
                        "w"(_k00_1), // %19
                        "w"(_k01_1), // %20
                        "w"(_k02_1), // %21
                        "w"(_k10_1), // %22
                        "w"(_k11_1), // %23
                        "w"(_k12_1), // %24
                        "w"(_k20_1), // %25
                        "w"(_k21_1), // %26
                        "w"(_k22_1)  // %27
                        : "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
                for (; j + 1 < outw; j += 2)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v24.4s, v25.4s}, [%0]      \n"

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v26.4s, v27.4s}, [%1]      \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v0.4s}, [%2]               \n"
                        "add    %2, %2, #8                  \n"

                        "fmla   v24.4s, %10.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %10.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %19.4s, v0.s[0]     \n"
                        "fmla   v27.4s, %19.4s, v0.s[1]     \n"

                        "fmla   v24.4s, %11.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %11.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %20.4s, v0.s[1]     \n"
                        "fmla   v27.4s, %20.4s, v0.s[2]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v1.4s}, [%3]               \n"

                        "fmla   v24.4s, %12.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %12.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %21.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %21.4s, v0.s[3]     \n"

                        "add    %3, %3, #8                  \n"

                        "fmla   v24.4s, %13.4s, v1.s[0]     \n"
                        "fmla   v25.4s, %13.4s, v1.s[1]     \n"
                        "fmla   v26.4s, %22.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %22.4s, v1.s[1]     \n"

                        "fmla   v24.4s, %14.4s, v1.s[1]     \n"
                        "fmla   v25.4s, %14.4s, v1.s[2]     \n"
                        "fmla   v26.4s, %23.4s, v1.s[1]     \n"
                        "fmla   v27.4s, %23.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v0.4s}, [%4]               \n"

                        "fmla   v24.4s, %15.4s, v1.s[2]     \n"
                        "fmla   v25.4s, %15.4s, v1.s[3]     \n"
                        "fmla   v26.4s, %24.4s, v1.s[2]     \n"
                        "fmla   v27.4s, %24.4s, v1.s[3]     \n"

                        "add    %4, %4, #8                  \n"

                        "fmla   v24.4s, %16.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %16.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %25.4s, v0.s[0]     \n"
                        "fmla   v27.4s, %25.4s, v0.s[1]     \n"

                        "fmla   v24.4s, %17.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %17.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %26.4s, v0.s[1]     \n"
                        "fmla   v27.4s, %26.4s, v0.s[2]     \n"

                        "fmla   v24.4s, %18.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %18.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %27.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %27.4s, v0.s[3]     \n"

                        "st1    {v24.4s, v25.4s}, [%0], #32 \n"
                        "st1    {v26.4s, v27.4s}, [%1], #32 \n"

                        : "=r"(outptr0), // %0
                        "=r"(outptr1), // %1
                        "=r"(r0),      // %2
                        "=r"(r1),      // %3
                        "=r"(r2)       // %4
                        : "0"(outptr0),
                        "1"(outptr1),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00_0), // %10
                        "w"(_k01_0), // %11
                        "w"(_k02_0), // %12
                        "w"(_k10_0), // %13
                        "w"(_k11_0), // %14
                        "w"(_k12_0), // %15
                        "w"(_k20_0), // %16
                        "w"(_k21_0), // %17
                        "w"(_k22_0), // %18
                        "w"(_k00_1), // %19
                        "w"(_k01_1), // %20
                        "w"(_k02_1), // %21
                        "w"(_k10_1), // %22
                        "w"(_k11_1), // %23
                        "w"(_k12_1), // %24
                        "w"(_k20_1), // %25
                        "w"(_k21_1), // %26
                        "w"(_k22_1)  // %27
                        : "memory", "v0", "v1", "v24", "v25", "v26", "v27");
                }
                for (; j < outw; j++)
                {
                    float32x4_t _sum00 = vld1q_f32(outptr0);
                    float32x4_t _sum10 = vld1q_f32(outptr1);

                    float32x4_t _r0 = vld1q_f32(r0);
                    float32x4_t _r1 = vld1q_f32(r1);
                    float32x4_t _r2 = vld1q_f32(r2);

                    _sum00 = vfmaq_laneq_f32(_sum00, _k00_0, _r0, 0);
                    _sum00 = vfmaq_laneq_f32(_sum00, _k01_0, _r0, 1);
                    _sum00 = vfmaq_laneq_f32(_sum00, _k02_0, _r0, 2);
                    _sum00 = vfmaq_laneq_f32(_sum00, _k10_0, _r1, 0);
                    _sum00 = vfmaq_laneq_f32(_sum00, _k11_0, _r1, 1);
                    _sum00 = vfmaq_laneq_f32(_sum00, _k12_0, _r1, 2);
                    _sum00 = vfmaq_laneq_f32(_sum00, _k20_0, _r2, 0);
                    _sum00 = vfmaq_laneq_f32(_sum00, _k21_0, _r2, 1);
                    _sum00 = vfmaq_laneq_f32(_sum00, _k22_0, _r2, 2);

                    _sum10 = vfmaq_laneq_f32(_sum10, _k00_1, _r0, 0);
                    _sum10 = vfmaq_laneq_f32(_sum10, _k01_1, _r0, 1);
                    _sum10 = vfmaq_laneq_f32(_sum10, _k02_1, _r0, 2);
                    _sum10 = vfmaq_laneq_f32(_sum10, _k10_1, _r1, 0);
                    _sum10 = vfmaq_laneq_f32(_sum10, _k11_1, _r1, 1);
                    _sum10 = vfmaq_laneq_f32(_sum10, _k12_1, _r1, 2);
                    _sum10 = vfmaq_laneq_f32(_sum10, _k20_1, _r2, 0);
                    _sum10 = vfmaq_laneq_f32(_sum10, _k21_1, _r2, 1);
                    _sum10 = vfmaq_laneq_f32(_sum10, _k22_1, _r2, 2);

                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr1, _sum10);

                    r0 += 1;
                    r1 += 1;
                    r2 += 1;
                    outptr0 += 4;
                    outptr1 += 4;
                }

                r0 += 2;
                r1 += 2;
                r2 += 2;
            }

            k0 += 9 * 4;
            k1 += 9 * 4;
        }
    }
#endif // __ARM_NEON && __aarch64__

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
        out0.fill(_bias0);

        const float* k0 = kernel.channel(p);

        for (int q = 0; q < inch; q++)
        {
            float* outptr0 = out0.row(0);

            const Mat img0 = bottom_blob.channel(q);

            const float* r0 = img0.row(0);
            const float* r1 = img0.row(1);
            const float* r2 = img0.row(2);

            float32x4_t _k00 = vld1q_f32(k0);
            float32x4_t _k01 = vld1q_f32(k0 + 4);
            float32x4_t _k02 = vld1q_f32(k0 + 8);
            float32x4_t _k10 = vld1q_f32(k0 + 12);
            float32x4_t _k11 = vld1q_f32(k0 + 16);
            float32x4_t _k12 = vld1q_f32(k0 + 20);
            float32x4_t _k20 = vld1q_f32(k0 + 24);
            float32x4_t _k21 = vld1q_f32(k0 + 28);
            float32x4_t _k22 = vld1q_f32(k0 + 32);

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;

#if __aarch64__
                for (; j + 7 < outw; j += 8)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%1], #32   \n"

                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0] \n"

                        "fmla   v24.4s, %8.4s, v0.s[0]      \n"
                        "fmla   v25.4s, %8.4s, v0.s[1]      \n"
                        "fmla   v26.4s, %8.4s, v0.s[2]      \n"
                        "fmla   v27.4s, %8.4s, v0.s[3]      \n"
                        "fmla   v28.4s, %8.4s, v1.s[0]      \n"
                        "fmla   v29.4s, %8.4s, v1.s[1]      \n"
                        "fmla   v30.4s, %8.4s, v1.s[2]      \n"
                        "fmla   v31.4s, %8.4s, v1.s[3]      \n"

                        "ld1    {v2.2s}, [%1]               \n"

                        "fmla   v24.4s, %9.4s, v0.s[1]      \n"
                        "fmla   v25.4s, %9.4s, v0.s[2]      \n"
                        "fmla   v26.4s, %9.4s, v0.s[3]      \n"
                        "fmla   v27.4s, %9.4s, v1.s[0]      \n"
                        "fmla   v28.4s, %9.4s, v1.s[1]      \n"
                        "fmla   v29.4s, %9.4s, v1.s[2]      \n"
                        "fmla   v30.4s, %9.4s, v1.s[3]      \n"
                        "fmla   v31.4s, %9.4s, v2.s[0]      \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v4.4s, v5.4s}, [%2], #32   \n"

                        "fmla   v24.4s, %10.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %10.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %10.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %10.4s, v1.s[1]     \n"
                        "fmla   v28.4s, %10.4s, v1.s[2]     \n"
                        "fmla   v29.4s, %10.4s, v1.s[3]     \n"
                        "fmla   v30.4s, %10.4s, v2.s[0]     \n"
                        "fmla   v31.4s, %10.4s, v2.s[1]     \n"

                        "ld1    {v2.2s}, [%2]               \n"

                        "fmla   v24.4s, %11.4s, v4.s[0]     \n"
                        "fmla   v25.4s, %11.4s, v4.s[1]     \n"
                        "fmla   v26.4s, %11.4s, v4.s[2]     \n"
                        "fmla   v27.4s, %11.4s, v4.s[3]     \n"
                        "fmla   v28.4s, %11.4s, v5.s[0]     \n"
                        "fmla   v29.4s, %11.4s, v5.s[1]     \n"
                        "fmla   v30.4s, %11.4s, v5.s[2]     \n"
                        "fmla   v31.4s, %11.4s, v5.s[3]     \n"

                        "fmla   v24.4s, %12.4s, v4.s[1]     \n"
                        "fmla   v25.4s, %12.4s, v4.s[2]     \n"
                        "fmla   v26.4s, %12.4s, v4.s[3]     \n"
                        "fmla   v27.4s, %12.4s, v5.s[0]     \n"
                        "fmla   v28.4s, %12.4s, v5.s[1]     \n"
                        "fmla   v29.4s, %12.4s, v5.s[2]     \n"
                        "fmla   v30.4s, %12.4s, v5.s[3]     \n"
                        "fmla   v31.4s, %12.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%3], #32   \n"

                        "fmla   v24.4s, %13.4s, v4.s[2]     \n"
                        "fmla   v25.4s, %13.4s, v4.s[3]     \n"
                        "fmla   v26.4s, %13.4s, v5.s[0]     \n"
                        "fmla   v27.4s, %13.4s, v5.s[1]     \n"
                        "fmla   v28.4s, %13.4s, v5.s[2]     \n"
                        "fmla   v29.4s, %13.4s, v5.s[3]     \n"
                        "fmla   v30.4s, %13.4s, v2.s[0]     \n"
                        "fmla   v31.4s, %13.4s, v2.s[1]     \n"

                        "ld1    {v2.2s}, [%3]               \n"

                        "fmla   v24.4s, %14.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %14.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %14.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %14.4s, v0.s[3]     \n"
                        "fmla   v28.4s, %14.4s, v1.s[0]     \n"
                        "fmla   v29.4s, %14.4s, v1.s[1]     \n"
                        "fmla   v30.4s, %14.4s, v1.s[2]     \n"
                        "fmla   v31.4s, %14.4s, v1.s[3]     \n"

                        "fmla   v24.4s, %15.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %15.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %15.4s, v0.s[3]     \n"
                        "fmla   v27.4s, %15.4s, v1.s[0]     \n"
                        "fmla   v28.4s, %15.4s, v1.s[1]     \n"
                        "fmla   v29.4s, %15.4s, v1.s[2]     \n"
                        "fmla   v30.4s, %15.4s, v1.s[3]     \n"
                        "fmla   v31.4s, %15.4s, v2.s[0]     \n"

                        "sub    %0, %0, #64                 \n"

                        "fmla   v24.4s, %16.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %16.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %16.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %16.4s, v1.s[1]     \n"
                        "fmla   v28.4s, %16.4s, v1.s[2]     \n"
                        "fmla   v29.4s, %16.4s, v1.s[3]     \n"
                        "fmla   v30.4s, %16.4s, v2.s[0]     \n"
                        "fmla   v31.4s, %16.4s, v2.s[1]     \n"

                        "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                        "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "v0", "v1", "v2", "v4", "v5", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
#endif // __aarch64__
                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0] \n"

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v0.4s}, [%1], #16          \n"

                        "fmla   v24.4s, %8.4s, v0.s[0]      \n"
                        "fmla   v25.4s, %8.4s, v0.s[1]      \n"
                        "fmla   v26.4s, %8.4s, v0.s[2]      \n"
                        "fmla   v27.4s, %8.4s, v0.s[3]      \n"

                        "ld1    {v1.2s}, [%1]               \n"

                        "fmla   v24.4s, %9.4s, v0.s[1]      \n"
                        "fmla   v25.4s, %9.4s, v0.s[2]      \n"
                        "fmla   v26.4s, %9.4s, v0.s[3]      \n"
                        "fmla   v27.4s, %9.4s, v1.s[0]      \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v2.4s}, [%2], #16          \n"

                        "fmla   v24.4s, %10.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %10.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %10.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %10.4s, v1.s[1]     \n"

                        "ld1    {v3.2s}, [%2]               \n"

                        "fmla   v24.4s, %11.4s, v2.s[0]     \n"
                        "fmla   v25.4s, %11.4s, v2.s[1]     \n"
                        "fmla   v26.4s, %11.4s, v2.s[2]     \n"
                        "fmla   v27.4s, %11.4s, v2.s[3]     \n"

                        "fmla   v24.4s, %12.4s, v2.s[1]     \n"
                        "fmla   v25.4s, %12.4s, v2.s[2]     \n"
                        "fmla   v26.4s, %12.4s, v2.s[3]     \n"
                        "fmla   v27.4s, %12.4s, v3.s[0]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v0.4s}, [%3], #16          \n"

                        "fmla   v24.4s, %13.4s, v2.s[2]     \n"
                        "fmla   v25.4s, %13.4s, v2.s[3]     \n"
                        "fmla   v26.4s, %13.4s, v3.s[0]     \n"
                        "fmla   v27.4s, %13.4s, v3.s[1]     \n"

                        "ld1    {v1.2s}, [%3]               \n"

                        "fmla   v24.4s, %14.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %14.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %14.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %14.4s, v0.s[3]     \n"

                        "fmla   v24.4s, %15.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %15.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %15.4s, v0.s[3]     \n"
                        "fmla   v27.4s, %15.4s, v1.s[0]     \n"

                        "fmla   v24.4s, %16.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %16.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %16.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %16.4s, v1.s[1]     \n"

                        "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%0, #512]          \n"
                        "vldm       %0, {d24-d31}       \n"

                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d0-d1}, [%1]!      \n"

                        "vmla.f32   q12, %q8, d0[0]     \n"
                        "vmla.f32   q13, %q8, d0[1]     \n"
                        "vmla.f32   q14, %q8, d1[0]     \n"
                        "vmla.f32   q15, %q8, d1[1]     \n"

                        "vld1.f32   {d2}, [%1]          \n"

                        "vmla.f32   q12, %q9, d0[1]     \n"
                        "vmla.f32   q13, %q9, d1[0]     \n"
                        "vmla.f32   q14, %q9, d1[1]     \n"
                        "vmla.f32   q15, %q9, d2[0]     \n"

                        "pld        [%2, #128]          \n"
                        "vld1.f32   {d4-d5}, [%2]!      \n"

                        "vmla.f32   q12, %q10, d1[0]    \n"
                        "vmla.f32   q13, %q10, d1[1]    \n"
                        "vmla.f32   q14, %q10, d2[0]    \n"
                        "vmla.f32   q15, %q10, d2[1]    \n"

                        "vmla.f32   q12, %q11, d4[0]    \n"
                        "vmla.f32   q13, %q11, d4[1]    \n"
                        "vmla.f32   q14, %q11, d5[0]    \n"
                        "vmla.f32   q15, %q11, d5[1]    \n"

                        "vld1.f32   {d3}, [%2]          \n"

                        "vmla.f32   q12, %q12, d4[1]    \n"
                        "vmla.f32   q13, %q12, d5[0]    \n"
                        "vmla.f32   q14, %q12, d5[1]    \n"
                        "vmla.f32   q15, %q12, d3[0]    \n"

                        "pld        [%3, #128]          \n"
                        "vld1.f32   {d0-d1}, [%3]!      \n"

                        "vmla.f32   q12, %q13, d5[0]    \n"
                        "vmla.f32   q13, %q13, d5[1]    \n"
                        "vmla.f32   q14, %q13, d3[0]    \n"
                        "vmla.f32   q15, %q13, d3[1]    \n"

                        "vmla.f32   q12, %q14, d0[0]    \n"
                        "vmla.f32   q13, %q14, d0[1]    \n"
                        "vmla.f32   q14, %q14, d1[0]    \n"
                        "vmla.f32   q15, %q14, d1[1]    \n"

                        "vld1.f32   {d2}, [%3]          \n"

                        "vmla.f32   q12, %q15, d0[1]    \n"
                        "vmla.f32   q13, %q15, d1[0]    \n"
                        "vmla.f32   q14, %q15, d1[1]    \n"
                        "vmla.f32   q15, %q15, d2[0]    \n"

                        "vmla.f32   q12, %q16, d1[0]    \n"
                        "vmla.f32   q13, %q16, d1[1]    \n"
                        "vmla.f32   q14, %q16, d2[0]    \n"
                        "vmla.f32   q15, %q16, d2[1]    \n"

                        "vstm       %0!, {d24-d31}      \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "q0", "q1", "q2", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v24.4s, v25.4s}, [%0]      \n"

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v0.4s}, [%1]               \n"

                        "fmul   v26.4s, %8.4s, v0.s[0]      \n"
                        "fmul   v27.4s, %8.4s, v0.s[1]      \n"
                        "fmla   v24.4s, %9.4s, v0.s[1]      \n"
                        "fmla   v25.4s, %9.4s, v0.s[2]      \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v1.4s}, [%2]               \n"

                        "fmla   v26.4s, %10.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %10.4s, v0.s[3]     \n"
                        "fmla   v24.4s, %11.4s, v1.s[0]     \n"
                        "fmla   v25.4s, %11.4s, v1.s[1]     \n"

                        "add    %1, %1, #8                  \n"

                        "fmla   v26.4s, %12.4s, v1.s[1]     \n"
                        "fmla   v27.4s, %12.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v0.4s}, [%3]               \n"

                        "fmla   v24.4s, %13.4s, v1.s[2]     \n"
                        "fmla   v25.4s, %13.4s, v1.s[3]     \n"
                        "fmla   v26.4s, %14.4s, v0.s[0]     \n"
                        "fmla   v27.4s, %14.4s, v0.s[1]     \n"

                        "add    %2, %2, #8                  \n"

                        "fmla   v24.4s, %15.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %15.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %16.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %16.4s, v0.s[3]     \n"

                        "add    %3, %3, #8                  \n"

                        "fadd   v24.4s, v24.4s, v26.4s      \n"
                        "fadd   v25.4s, v25.4s, v27.4s      \n"

                        "st1    {v24.4s, v25.4s}, [%0], #32 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "v0", "v1", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%0, #256]          \n"
                        "vld1.f32   {d24-d27}, [%0 :128] \n"

                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d0-d1}, [%1]       \n"

                        "vmul.f32   q14, %q8, d0[0]     \n"
                        "vmul.f32   q15, %q8, d0[1]     \n"
                        "vmla.f32   q12, %q9, d0[1]     \n"
                        "vmla.f32   q13, %q9, d1[0]     \n"

                        "pld        [%2, #128]          \n"
                        "vld1.f32   {d2-d3}, [%2]       \n"

                        "vmla.f32   q14, %q10, d1[0]    \n"
                        "vmla.f32   q15, %q10, d1[1]    \n"
                        "vmla.f32   q12, %q11, d2[0]    \n"
                        "vmla.f32   q13, %q11, d2[1]    \n"

                        "add        %1, %1, #8          \n"

                        "vmla.f32   q14, %q12, d2[1]    \n"
                        "vmla.f32   q15, %q12, d3[0]    \n"

                        "pld        [%3, #128]          \n"
                        "vld1.f32   {d0-d1}, [%3]       \n"

                        "vmla.f32   q12, %q13, d3[0]    \n"
                        "vmla.f32   q13, %q13, d3[1]    \n"
                        "vmla.f32   q14, %q14, d0[0]    \n"
                        "vmla.f32   q15, %q14, d0[1]    \n"

                        "add        %2, %2, #8          \n"

                        "vmla.f32   q12, %q15, d0[1]    \n"
                        "vmla.f32   q13, %q15, d1[0]    \n"
                        "vmla.f32   q14, %q16, d1[0]    \n"
                        "vmla.f32   q15, %q16, d1[1]    \n"

                        "add        %3, %3, #8          \n"

                        "vadd.f32   q12, q12, q14       \n"
                        "vadd.f32   q13, q13, q15       \n"

                        "vst1.f32   {d24-d27}, [%0 :128]! \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "q0", "q1", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
                    float32x4_t _sum0 = vld1q_f32(outptr0);

                    float32x4_t _r0 = vld1q_f32(r0);
                    float32x4_t _r1 = vld1q_f32(r1);
                    float32x4_t _r2 = vld1q_f32(r2);

#if __aarch64__
                    _sum0 = vfmaq_laneq_f32(_sum0, _k00, _r0, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k01, _r0, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k02, _r0, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k10, _r1, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k11, _r1, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k12, _r1, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k20, _r2, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k21, _r2, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k22, _r2, 2);
#else
                    _sum0 = vmlaq_lane_f32(_sum0, _k00, vget_low_f32(_r0), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k01, vget_low_f32(_r0), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k02, vget_high_f32(_r0), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k10, vget_low_f32(_r1), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k11, vget_low_f32(_r1), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k12, vget_high_f32(_r1), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k20, vget_low_f32(_r2), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k21, vget_low_f32(_r2), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k22, vget_high_f32(_r2), 0);
#endif

                    vst1q_f32(outptr0, _sum0);

                    r0 += 1;
                    r1 += 1;
                    r2 += 1;
                    outptr0 += 4;
                }

                r0 += 2;
                r1 += 2;
                r2 += 2;
            }

            k0 += 9 * 4;
        }
    }
}

static void conv3x3s2_pack1to4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int tailstep = w - 2 * outw + w;

    const float* bias = _bias;

    int remain_outch_start = 0;

#if __ARM_NEON && __aarch64__
    int nn_outch = 0;
    nn_outch = outch >> 1;
    remain_outch_start = nn_outch << 1;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 2;

        Mat out0 = top_blob.channel(p);
        Mat out1 = top_blob.channel(p + 1);

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
        float32x4_t _bias1 = bias ? vld1q_f32((const float*)bias + (p + 1) * 4) : vdupq_n_f32(0.f);
        out0.fill(_bias0);
        out1.fill(_bias1);

        const float* k0 = kernel.channel(p);
        const float* k1 = kernel.channel(p + 1);

        for (int q = 0; q < inch; q++)
        {
            float* outptr0 = out0;
            float* outptr1 = out1;

            const Mat img0 = bottom_blob.channel(q);

            const float* r0 = img0.row(0);
            const float* r1 = img0.row(1);
            const float* r2 = img0.row(2);

            float32x4_t _k00_0 = vld1q_f32(k0);
            float32x4_t _k01_0 = vld1q_f32(k0 + 4);
            float32x4_t _k02_0 = vld1q_f32(k0 + 8);
            float32x4_t _k10_0 = vld1q_f32(k0 + 12);
            float32x4_t _k11_0 = vld1q_f32(k0 + 16);
            float32x4_t _k12_0 = vld1q_f32(k0 + 20);
            float32x4_t _k20_0 = vld1q_f32(k0 + 24);
            float32x4_t _k21_0 = vld1q_f32(k0 + 28);
            float32x4_t _k22_0 = vld1q_f32(k0 + 32);

            float32x4_t _k00_1 = vld1q_f32(k1);
            float32x4_t _k01_1 = vld1q_f32(k1 + 4);
            float32x4_t _k02_1 = vld1q_f32(k1 + 8);
            float32x4_t _k10_1 = vld1q_f32(k1 + 12);
            float32x4_t _k11_1 = vld1q_f32(k1 + 16);
            float32x4_t _k12_1 = vld1q_f32(k1 + 20);
            float32x4_t _k20_1 = vld1q_f32(k1 + 24);
            float32x4_t _k21_1 = vld1q_f32(k1 + 28);
            float32x4_t _k22_1 = vld1q_f32(k1 + 32);

            int i = 0;

            for (; i < outh; i++)
            {
                int nn = outw >> 2;
                int remain = outw & 3;

                if (nn > 0)
                {
                    asm volatile(
                        "0:                                 \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v6.4s, v7.4s, v8.4s, v9.4s}, [%1] \n" // sum0

                        // r0
                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%3], #32   \n"
                        "ld1r   {v4.4s}, [%3]               \n"

                        "fmla   v6.4s, %12.4s, v0.s[0]      \n"
                        "fmla   v7.4s, %12.4s, v0.s[2]      \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%2] \n" // sum1

                        "fmla   v8.4s, %12.4s, v1.s[0]      \n"
                        "fmla   v9.4s, %12.4s, v1.s[2]      \n"

                        "fmla   v10.4s, %21.4s, v0.s[0]     \n"
                        "fmla   v11.4s, %21.4s, v0.s[2]     \n"
                        "fmla   v12.4s, %21.4s, v1.s[0]     \n"
                        "fmla   v13.4s, %21.4s, v1.s[2]     \n"

                        "fmla   v6.4s, %13.4s, v0.s[1]      \n"
                        "fmla   v7.4s, %13.4s, v0.s[3]      \n"
                        "fmla   v8.4s, %13.4s, v1.s[1]      \n"
                        "fmla   v9.4s, %13.4s, v1.s[3]      \n"
                        "fmla   v10.4s, %22.4s, v0.s[1]     \n"
                        "fmla   v11.4s, %22.4s, v0.s[3]     \n"
                        "fmla   v12.4s, %22.4s, v1.s[1]     \n"
                        "fmla   v13.4s, %22.4s, v1.s[3]     \n"

                        // r1
                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v2.4s, v3.4s}, [%4], #32   \n"
                        "ld1r   {v5.4s}, [%4]               \n"

                        "fmla   v6.4s, %14.4s, v0.s[2]      \n"
                        "fmla   v7.4s, %14.4s, v1.s[0]      \n"
                        "fmla   v8.4s, %14.4s, v1.s[2]      \n"
                        "fmla   v9.4s, %14.4s, v4.s[0]      \n"
                        "fmla   v10.4s, %23.4s, v0.s[2]     \n"
                        "fmla   v11.4s, %23.4s, v1.s[0]     \n"
                        "fmla   v12.4s, %23.4s, v1.s[2]     \n"
                        "fmla   v13.4s, %23.4s, v4.s[0]     \n"

                        "fmla   v6.4s, %15.4s, v2.s[0]      \n"
                        "fmla   v7.4s, %15.4s, v2.s[2]      \n"
                        "fmla   v8.4s, %15.4s, v3.s[0]      \n"
                        "fmla   v9.4s, %15.4s, v3.s[2]      \n"
                        "fmla   v10.4s, %24.4s, v2.s[0]     \n"
                        "fmla   v11.4s, %24.4s, v2.s[2]     \n"
                        "fmla   v12.4s, %24.4s, v3.s[0]     \n"
                        "fmla   v13.4s, %24.4s, v3.s[2]     \n"

                        "fmla   v6.4s, %16.4s, v2.s[1]      \n"
                        "fmla   v7.4s, %16.4s, v2.s[3]      \n"
                        "fmla   v8.4s, %16.4s, v3.s[1]      \n"
                        "fmla   v9.4s, %16.4s, v3.s[3]      \n"
                        "fmla   v10.4s, %25.4s, v2.s[1]     \n"
                        "fmla   v11.4s, %25.4s, v2.s[3]     \n"
                        "fmla   v12.4s, %25.4s, v3.s[1]     \n"
                        "fmla   v13.4s, %25.4s, v3.s[3]     \n"

                        // r2
                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%5], #32   \n"
                        "ld1r   {v4.4s}, [%5]               \n"

                        "fmla   v6.4s, %17.4s, v2.s[2]      \n"
                        "fmla   v7.4s, %17.4s, v3.s[0]      \n"
                        "fmla   v8.4s, %17.4s, v3.s[2]      \n"
                        "fmla   v9.4s, %17.4s, v5.s[0]      \n"
                        "fmla   v10.4s, %26.4s, v2.s[2]     \n"
                        "fmla   v11.4s, %26.4s, v3.s[0]     \n"
                        "fmla   v12.4s, %26.4s, v3.s[2]     \n"
                        "fmla   v13.4s, %26.4s, v5.s[0]     \n"

                        "fmla   v6.4s, %18.4s, v0.s[0]      \n"
                        "fmla   v7.4s, %18.4s, v0.s[2]      \n"
                        "fmla   v8.4s, %18.4s, v1.s[0]      \n"
                        "fmla   v9.4s, %18.4s, v1.s[2]      \n"
                        "fmla   v10.4s, %27.4s, v0.s[0]     \n"
                        "fmla   v11.4s, %27.4s, v0.s[2]     \n"
                        "fmla   v12.4s, %27.4s, v1.s[0]     \n"
                        "fmla   v13.4s, %27.4s, v1.s[2]     \n"

                        "fmla   v6.4s, %19.4s, v0.s[1]      \n"
                        "fmla   v7.4s, %19.4s, v0.s[3]      \n"
                        "fmla   v8.4s, %19.4s, v1.s[1]      \n"
                        "fmla   v9.4s, %19.4s, v1.s[3]      \n"
                        "fmla   v10.4s, %28.4s, v0.s[1]     \n"
                        "fmla   v11.4s, %28.4s, v0.s[3]     \n"
                        "fmla   v12.4s, %28.4s, v1.s[1]     \n"
                        "fmla   v13.4s, %28.4s, v1.s[3]     \n"

                        "fmla   v6.4s, %20.4s, v0.s[2]      \n"
                        "fmla   v7.4s, %20.4s, v1.s[0]      \n"
                        "fmla   v8.4s, %20.4s, v1.s[2]      \n"
                        "fmla   v9.4s, %20.4s, v4.s[0]      \n"
                        "fmla   v10.4s, %29.4s, v0.s[2]     \n"
                        "fmla   v11.4s, %29.4s, v1.s[0]     \n"
                        "fmla   v12.4s, %29.4s, v1.s[2]     \n"
                        "fmla   v13.4s, %29.4s, v4.s[0]     \n"

                        "subs   %w0, %w0, #1                \n"

                        "st1    {v6.4s, v7.4s, v8.4s, v9.4s}, [%1], #64 \n"
                        "st1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%2], #64 \n"

                        "bne    0b                          \n"

                        : "=r"(nn),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1), // %2
                        "=r"(r0),      // %3
                        "=r"(r1),      // %4
                        "=r"(r2)       // %5
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "w"(_k00_0), // %12
                        "w"(_k01_0), // %13
                        "w"(_k02_0), // %14
                        "w"(_k10_0), // %15
                        "w"(_k11_0), // %16
                        "w"(_k12_0), // %17
                        "w"(_k20_0), // %18
                        "w"(_k21_0), // %19
                        "w"(_k22_0), // %20
                        "w"(_k00_1), // %21
                        "w"(_k01_1), // %22
                        "w"(_k02_1), // %23
                        "w"(_k10_1), // %24
                        "w"(_k11_1), // %25
                        "w"(_k12_1), // %26
                        "w"(_k20_1), // %27
                        "w"(_k21_1), // %28
                        "w"(_k22_1)  // %29
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13");
                }

                for (; remain > 0; remain--)
                {
                    float32x4_t _sum0 = vld1q_f32(outptr0);
                    float32x4_t _sum1 = vld1q_f32(outptr1);

                    float32x4_t _r0 = vld1q_f32(r0);
                    float32x4_t _r1 = vld1q_f32(r1);
                    float32x4_t _r2 = vld1q_f32(r2);

                    _sum0 = vfmaq_laneq_f32(_sum0, _k00_0, _r0, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k01_0, _r0, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k02_0, _r0, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k10_0, _r1, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k11_0, _r1, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k12_0, _r1, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k20_0, _r2, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k21_0, _r2, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k22_0, _r2, 2);

                    _sum1 = vfmaq_laneq_f32(_sum1, _k00_1, _r0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k01_1, _r0, 1);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k02_1, _r0, 2);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k10_1, _r1, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k11_1, _r1, 1);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k12_1, _r1, 2);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k20_1, _r2, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k21_1, _r2, 1);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k22_1, _r2, 2);

                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr1, _sum1);

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    outptr0 += 4;
                    outptr1 += 4;
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            k0 += 9 * 4;
            k1 += 9 * 4;
        }
    }
#endif // __ARM_NEON && __aarch64__

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
        out0.fill(_bias0);

        const float* k0 = kernel.channel(p);

        for (int q = 0; q < inch; q++)
        {
            float* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            const float* r0 = img0.row(0);
            const float* r1 = img0.row(1);
            const float* r2 = img0.row(2);

            float32x4_t _k00 = vld1q_f32(k0);
            float32x4_t _k01 = vld1q_f32(k0 + 4);
            float32x4_t _k02 = vld1q_f32(k0 + 8);
            float32x4_t _k10 = vld1q_f32(k0 + 12);
            float32x4_t _k11 = vld1q_f32(k0 + 16);
            float32x4_t _k12 = vld1q_f32(k0 + 20);
            float32x4_t _k20 = vld1q_f32(k0 + 24);
            float32x4_t _k21 = vld1q_f32(k0 + 28);
            float32x4_t _k22 = vld1q_f32(k0 + 32);

            int i = 0;

            for (; i < outh; i++)
            {
                int nn = outw >> 2;
                int remain = outw & 3;

#if __aarch64__
                if (nn > 0)
                {
                    asm volatile(
                        "0:                                 \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v6.4s, v7.4s, v8.4s, v9.4s}, [%1] \n" // sum0

                        // r0
                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%2], #32   \n"
                        "ld1r   {v4.4s}, [%2]               \n"

                        "fmla   v6.4s, %10.4s, v0.s[0]      \n"
                        "fmla   v7.4s, %10.4s, v0.s[2]      \n"
                        "fmla   v8.4s, %10.4s, v1.s[0]      \n"
                        "fmla   v9.4s, %10.4s, v1.s[2]      \n"

                        "fmla   v6.4s, %11.4s, v0.s[1]      \n"
                        "fmla   v7.4s, %11.4s, v0.s[3]      \n"
                        "fmla   v8.4s, %11.4s, v1.s[1]      \n"
                        "fmla   v9.4s, %11.4s, v1.s[3]      \n"

                        // r1
                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v2.4s, v3.4s}, [%3], #32   \n"
                        "ld1r   {v5.4s}, [%3]               \n"

                        "fmla   v6.4s, %12.4s, v0.s[2]      \n"
                        "fmla   v7.4s, %12.4s, v1.s[0]      \n"
                        "fmla   v8.4s, %12.4s, v1.s[2]      \n"
                        "fmla   v9.4s, %12.4s, v4.s[0]      \n"

                        "fmla   v6.4s, %13.4s, v2.s[0]      \n"
                        "fmla   v7.4s, %13.4s, v2.s[2]      \n"
                        "fmla   v8.4s, %13.4s, v3.s[0]      \n"
                        "fmla   v9.4s, %13.4s, v3.s[2]      \n"

                        "fmla   v6.4s, %14.4s, v2.s[1]      \n"
                        "fmla   v7.4s, %14.4s, v2.s[3]      \n"
                        "fmla   v8.4s, %14.4s, v3.s[1]      \n"
                        "fmla   v9.4s, %14.4s, v3.s[3]      \n"

                        // r2
                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%4], #32   \n"
                        "ld1r   {v4.4s}, [%4]               \n"

                        "fmla   v6.4s, %15.4s, v2.s[2]      \n"
                        "fmla   v7.4s, %15.4s, v3.s[0]      \n"
                        "fmla   v8.4s, %15.4s, v3.s[2]      \n"
                        "fmla   v9.4s, %15.4s, v5.s[0]      \n"

                        "fmla   v6.4s, %16.4s, v0.s[0]      \n"
                        "fmla   v7.4s, %16.4s, v0.s[2]      \n"
                        "fmla   v8.4s, %16.4s, v1.s[0]      \n"
                        "fmla   v9.4s, %16.4s, v1.s[2]      \n"

                        "fmla   v6.4s, %17.4s, v0.s[1]      \n"
                        "fmla   v7.4s, %17.4s, v0.s[3]      \n"
                        "fmla   v8.4s, %17.4s, v1.s[1]      \n"
                        "fmla   v9.4s, %17.4s, v1.s[3]      \n"

                        "fmla   v6.4s, %18.4s, v0.s[2]      \n"
                        "fmla   v7.4s, %18.4s, v1.s[0]      \n"
                        "fmla   v8.4s, %18.4s, v1.s[2]      \n"
                        "fmla   v9.4s, %18.4s, v4.s[0]      \n"

                        "subs   %w0, %w0, #1                \n"

                        "st1    {v6.4s, v7.4s, v8.4s, v9.4s}, [%1], #64 \n"

                        "bne    0b                          \n"

                        : "=r"(nn),      // %0
                        "=r"(outptr0), // %1
                        "=r"(r0),      // %2
                        "=r"(r1),      // %3
                        "=r"(r2)       // %4
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00), // %10
                        "w"(_k01), // %11
                        "w"(_k02), // %12
                        "w"(_k10), // %13
                        "w"(_k11), // %14
                        "w"(_k12), // %15
                        "w"(_k20), // %16
                        "w"(_k21), // %17
                        "w"(_k22)  // %18
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9");
                }
#else  // __aarch64__
                if (nn > 0)
                {
                    asm volatile(
                        "0:                             \n"

                        "pld        [%1, #512]          \n"
                        "vldm       %1, {d0-d7}         \n" // sum0

                        // r0
                        "pld        [%2, #256]          \n"
                        "vld1.f32   {d8-d11}, [%2]!     \n"
                        "vld1.f32   {d12[]}, [%2]       \n"

                        "vmla.f32   q0, %q10, d8[0]     \n"
                        "vmla.f32   q1, %q10, d9[0]     \n"
                        "vmla.f32   q2, %q10, d10[0]    \n"
                        "vmla.f32   q3, %q10, d11[0]    \n"

                        "vmla.f32   q0, %q11, d8[1]     \n"
                        "vmla.f32   q1, %q11, d9[1]     \n"
                        "vmla.f32   q2, %q11, d10[1]    \n"
                        "vmla.f32   q3, %q11, d11[1]    \n"

                        "vmla.f32   q0, %q12, d9[0]     \n"
                        "vmla.f32   q1, %q12, d10[0]    \n"
                        "vmla.f32   q2, %q12, d11[0]    \n"

                        // r1
                        "pld        [%3, #256]          \n"
                        "vld1.f32   {d8-d11}, [%3]!     \n"
                        "vld1.f32   {d13[]}, [%3]       \n"

                        "vmla.f32   q3, %q12, d12[0]    \n"

                        "vmla.f32   q0, %q13, d8[0]     \n"
                        "vmla.f32   q1, %q13, d9[0]     \n"
                        "vmla.f32   q2, %q13, d10[0]    \n"
                        "vmla.f32   q3, %q13, d11[0]    \n"

                        "vmla.f32   q0, %q14, d8[1]     \n"
                        "vmla.f32   q1, %q14, d9[1]     \n"
                        "vmla.f32   q2, %q14, d10[1]    \n"
                        "vmla.f32   q3, %q14, d11[1]    \n"

                        "vmla.f32   q0, %q15, d9[0]     \n"
                        "vmla.f32   q1, %q15, d10[0]    \n"
                        "vmla.f32   q2, %q15, d11[0]    \n"

                        // r2
                        "pld        [%4, #256]          \n"
                        "vld1.f32   {d8-d11}, [%4]!     \n"
                        "vld1.f32   {d12[]}, [%4]       \n"

                        "vmla.f32   q3, %q15, d13[0]    \n"

                        "vmla.f32   q0, %q16, d8[0]     \n"
                        "vmla.f32   q1, %q16, d9[0]     \n"
                        "vmla.f32   q2, %q16, d10[0]    \n"
                        "vmla.f32   q3, %q16, d11[0]    \n"

                        "vmla.f32   q0, %q17, d8[1]     \n"
                        "vmla.f32   q1, %q17, d9[1]     \n"
                        "vmla.f32   q2, %q17, d10[1]    \n"
                        "vmla.f32   q3, %q17, d11[1]    \n"

                        "vmla.f32   q0, %q18, d9[0]     \n"
                        "vmla.f32   q1, %q18, d10[0]    \n"
                        "vmla.f32   q2, %q18, d11[0]    \n"
                        "vmla.f32   q3, %q18, d12[0]    \n"

                        "subs       %0, %0, #1          \n"

                        "vstm       %1!, {d0-d7}        \n"

                        "bne        0b                  \n"

                        : "=r"(nn),      // %0
                        "=r"(outptr0), // %1
                        "=r"(r0),      // %2
                        "=r"(r1),      // %3
                        "=r"(r2)       // %4
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00), // %10
                        "w"(_k01), // %11
                        "w"(_k02), // %12
                        "w"(_k10), // %13
                        "w"(_k11), // %14
                        "w"(_k12), // %15
                        "w"(_k20), // %16
                        "w"(_k21), // %17
                        "w"(_k22)  // %18
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
                }
#endif // __aarch64__

                for (; remain > 0; remain--)
                {
                    float32x4_t _sum0 = vld1q_f32(outptr0);

                    float32x4_t _r0 = vld1q_f32(r0);
                    float32x4_t _r1 = vld1q_f32(r1);
                    float32x4_t _r2 = vld1q_f32(r2);

#if __aarch64__
                    _sum0 = vfmaq_laneq_f32(_sum0, _k00, _r0, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k01, _r0, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k02, _r0, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k10, _r1, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k11, _r1, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k12, _r1, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k20, _r2, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k21, _r2, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k22, _r2, 2);
#else
                    _sum0 = vmlaq_lane_f32(_sum0, _k00, vget_low_f32(_r0), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k01, vget_low_f32(_r0), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k02, vget_high_f32(_r0), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k10, vget_low_f32(_r1), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k11, vget_low_f32(_r1), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k12, vget_high_f32(_r1), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k20, vget_low_f32(_r2), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k21, vget_low_f32(_r2), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k22, vget_high_f32(_r2), 0);
#endif

                    vst1q_f32(outptr0, _sum0);

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    outptr0 += 4;
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            k0 += 9 * 4;
        }
    }
}


================================================
FILE: src/layer/arm/convolution_3x3_pack1to4_bf16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_pack1to4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

#if __ARM_NEON && __aarch64__
    Mat top_blob_fp32(outw, outh, opt.num_threads, (size_t)4u * 4 * 2, 4 * 2, opt.workspace_allocator);
#else
    Mat top_blob_fp32(outw, outh, opt.num_threads, (size_t)4u * 4, 4, opt.workspace_allocator);
#endif

    const float* bias = _bias;

    int remain_outch_start = 0;

#if __ARM_NEON && __aarch64__
    int nn_outch = 0;
    nn_outch = outch >> 1;
    remain_outch_start = nn_outch << 1;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 2;

        Mat out0 = top_blob_fp32.channel(get_omp_thread_num());

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
        float32x4_t _bias1 = bias ? vld1q_f32((const float*)bias + (p + 1) * 4) : vdupq_n_f32(0.f);
        {
            float* ptr = (float*)out0;

            for (int i = 0; i < outh; i++)
            {
                int j = 0;

                for (; j + 3 < outw; j += 4)
                {
                    vst1q_f32(ptr, _bias0);
                    vst1q_f32(ptr + 4, _bias0);
                    vst1q_f32(ptr + 8, _bias0);
                    vst1q_f32(ptr + 12, _bias0);
                    vst1q_f32(ptr + 16, _bias1);
                    vst1q_f32(ptr + 20, _bias1);
                    vst1q_f32(ptr + 24, _bias1);
                    vst1q_f32(ptr + 28, _bias1);
                    ptr += 32;
                }
                for (; j + 1 < outw; j += 2)
                {
                    vst1q_f32(ptr, _bias0);
                    vst1q_f32(ptr + 4, _bias0);
                    vst1q_f32(ptr + 8, _bias1);
                    vst1q_f32(ptr + 12, _bias1);
                    ptr += 16;
                }
                for (; j < outw; j++)
                {
                    vst1q_f32(ptr, _bias0);
                    vst1q_f32(ptr + 4, _bias1);
                    ptr += 8;
                }
            }
        }

        const unsigned short* k0 = kernel.channel(p);
        const unsigned short* k1 = kernel.channel(p + 1);

        int q = 0;
        for (; q < inch - 1; q++)
        {
            float* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            const unsigned short* r0 = img0.row<const unsigned short>(0);
            const unsigned short* r1 = img0.row<const unsigned short>(1);
            const unsigned short* r2 = img0.row<const unsigned short>(2);

            float32x4_t _k00_0 = bfloat2float(vld1_u16(k0));
            float32x4_t _k01_0 = bfloat2float(vld1_u16(k0 + 4));
            float32x4_t _k02_0 = bfloat2float(vld1_u16(k0 + 8));
            float32x4_t _k10_0 = bfloat2float(vld1_u16(k0 + 12));
            float32x4_t _k11_0 = bfloat2float(vld1_u16(k0 + 16));
            float32x4_t _k12_0 = bfloat2float(vld1_u16(k0 + 20));
            float32x4_t _k20_0 = bfloat2float(vld1_u16(k0 + 24));
            float32x4_t _k21_0 = bfloat2float(vld1_u16(k0 + 28));
            float32x4_t _k22_0 = bfloat2float(vld1_u16(k0 + 32));

            float32x4_t _k00_1 = bfloat2float(vld1_u16(k1));
            float32x4_t _k01_1 = bfloat2float(vld1_u16(k1 + 4));
            float32x4_t _k02_1 = bfloat2float(vld1_u16(k1 + 8));
            float32x4_t _k10_1 = bfloat2float(vld1_u16(k1 + 12));
            float32x4_t _k11_1 = bfloat2float(vld1_u16(k1 + 16));
            float32x4_t _k12_1 = bfloat2float(vld1_u16(k1 + 20));
            float32x4_t _k20_1 = bfloat2float(vld1_u16(k1 + 24));
            float32x4_t _k21_1 = bfloat2float(vld1_u16(k1 + 28));
            float32x4_t _k22_1 = bfloat2float(vld1_u16(k1 + 32));

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;

                for (; j + 3 < outw; j += 4)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ld1    {v0.4h}, [%1], #8           \n"
                        "ld1    {v1.s}[0], [%1]             \n"

                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0] \n"

                        "fmla   v24.4s, %8.4s, v0.s[0]      \n"
                        "fmla   v25.4s, %8.4s, v0.s[1]      \n"
                        "fmla   v26.4s, %8.4s, v0.s[2]      \n"
                        "fmla   v27.4s, %8.4s, v0.s[3]      \n"
                        "fmla   v28.4s, %17.4s, v0.s[0]     \n"
                        "fmla   v29.4s, %17.4s, v0.s[1]     \n"
                        "fmla   v30.4s, %17.4s, v0.s[2]     \n"
                        "fmla   v31.4s, %17.4s, v0.s[3]     \n"

                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v2.4h}, [%2], #8           \n"
                        "ld1    {v3.s}[0], [%2]             \n"

                        "fmla   v24.4s, %9.4s, v0.s[1]      \n"
                        "fmla   v25.4s, %9.4s, v0.s[2]      \n"
                        "fmla   v26.4s, %9.4s, v0.s[3]      \n"
                        "fmla   v27.4s, %9.4s, v1.s[0]      \n"
                        "fmla   v28.4s, %18.4s, v0.s[1]     \n"
                        "fmla   v29.4s, %18.4s, v0.s[2]     \n"
                        "fmla   v30.4s, %18.4s, v0.s[3]     \n"
                        "fmla   v31.4s, %18.4s, v1.s[0]     \n"

                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v24.4s, %10.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %10.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %10.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %10.4s, v1.s[1]     \n"
                        "fmla   v28.4s, %19.4s, v0.s[2]     \n"
                        "fmla   v29.4s, %19.4s, v0.s[3]     \n"
                        "fmla   v30.4s, %19.4s, v1.s[0]     \n"
                        "fmla   v31.4s, %19.4s, v1.s[1]     \n"

                        "fmla   v24.4s, %11.4s, v2.s[0]     \n"
                        "fmla   v25.4s, %11.4s, v2.s[1]     \n"
                        "fmla   v26.4s, %11.4s, v2.s[2]     \n"
                        "fmla   v27.4s, %11.4s, v2.s[3]     \n"
                        "fmla   v28.4s, %20.4s, v2.s[0]     \n"
                        "fmla   v29.4s, %20.4s, v2.s[1]     \n"
                        "fmla   v30.4s, %20.4s, v2.s[2]     \n"
                        "fmla   v31.4s, %20.4s, v2.s[3]     \n"

                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v0.4h}, [%3], #8           \n"
                        "ld1    {v1.s}[0], [%3]             \n"

                        "fmla   v24.4s, %12.4s, v2.s[1]     \n"
                        "fmla   v25.4s, %12.4s, v2.s[2]     \n"
                        "fmla   v26.4s, %12.4s, v2.s[3]     \n"
                        "fmla   v27.4s, %12.4s, v3.s[0]     \n"
                        "fmla   v28.4s, %21.4s, v2.s[1]     \n"
                        "fmla   v29.4s, %21.4s, v2.s[2]     \n"
                        "fmla   v30.4s, %21.4s, v2.s[3]     \n"
                        "fmla   v31.4s, %21.4s, v3.s[0]     \n"

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v24.4s, %13.4s, v2.s[2]     \n"
                        "fmla   v25.4s, %13.4s, v2.s[3]     \n"
                        "fmla   v26.4s, %13.4s, v3.s[0]     \n"
                        "fmla   v27.4s, %13.4s, v3.s[1]     \n"
                        "fmla   v28.4s, %22.4s, v2.s[2]     \n"
                        "fmla   v29.4s, %22.4s, v2.s[3]     \n"
                        "fmla   v30.4s, %22.4s, v3.s[0]     \n"
                        "fmla   v31.4s, %22.4s, v3.s[1]     \n"

                        "fmla   v24.4s, %14.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %14.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %14.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %14.4s, v0.s[3]     \n"
                        "fmla   v28.4s, %23.4s, v0.s[0]     \n"
                        "fmla   v29.4s, %23.4s, v0.s[1]     \n"
                        "fmla   v30.4s, %23.4s, v0.s[2]     \n"
                        "fmla   v31.4s, %23.4s, v0.s[3]     \n"

                        "fmla   v24.4s, %15.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %15.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %15.4s, v0.s[3]     \n"
                        "fmla   v27.4s, %15.4s, v1.s[0]     \n"
                        "fmla   v28.4s, %24.4s, v0.s[1]     \n"
                        "fmla   v29.4s, %24.4s, v0.s[2]     \n"
                        "fmla   v30.4s, %24.4s, v0.s[3]     \n"
                        "fmla   v31.4s, %24.4s, v1.s[0]     \n"

                        "sub    %0, %0, #64                 \n"

                        "fmla   v24.4s, %16.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %16.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %16.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %16.4s, v1.s[1]     \n"
                        "fmla   v28.4s, %25.4s, v0.s[2]     \n"
                        "fmla   v29.4s, %25.4s, v0.s[3]     \n"
                        "fmla   v30.4s, %25.4s, v1.s[0]     \n"
                        "fmla   v31.4s, %25.4s, v1.s[1]     \n"

                        "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                        "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00_0), // %8
                        "w"(_k01_0), // %9
                        "w"(_k02_0), // %10
                        "w"(_k10_0), // %11
                        "w"(_k11_0), // %12
                        "w"(_k12_0), // %13
                        "w"(_k20_0), // %14
                        "w"(_k21_0), // %15
                        "w"(_k22_0), // %16
                        "w"(_k00_1), // %17
                        "w"(_k01_1), // %18
                        "w"(_k02_1), // %19
                        "w"(_k10_1), // %20
                        "w"(_k11_1), // %21
                        "w"(_k12_1), // %22
                        "w"(_k20_1), // %23
                        "w"(_k21_1), // %24
                        "w"(_k22_1)  // %25
                        : "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
                for (; j + 1 < outw; j += 2)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ld1    {v0.4h}, [%1]               \n"

                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0] \n"

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmla   v24.4s, %8.4s, v0.s[0]      \n"
                        "fmla   v25.4s, %8.4s, v0.s[1]      \n"
                        "fmla   v26.4s, %17.4s, v0.s[0]     \n"
                        "fmla   v27.4s, %17.4s, v0.s[1]     \n"

                        "prfm   pldl1keep, [%2, #64]       \n"
                        "ld1    {v1.4h}, [%2]               \n"

                        "fmla   v24.4s, %9.4s, v0.s[1]      \n"
                        "fmla   v25.4s, %9.4s, v0.s[2]      \n"
                        "fmla   v26.4s, %18.4s, v0.s[1]     \n"
                        "fmla   v27.4s, %18.4s, v0.s[2]     \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v24.4s, %10.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %10.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %19.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %19.4s, v0.s[3]     \n"

                        "fmla   v24.4s, %11.4s, v1.s[0]     \n"
                        "fmla   v25.4s, %11.4s, v1.s[1]     \n"
                        "fmla   v26.4s, %20.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %20.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%3, #64]       \n"
                        "ld1    {v0.4h}, [%3]               \n"

                        "fmla   v24.4s, %12.4s, v1.s[1]     \n"
                        "fmla   v25.4s, %12.4s, v1.s[2]     \n"
                        "fmla   v26.4s, %21.4s, v1.s[1]     \n"
                        "fmla   v27.4s, %21.4s, v1.s[2]     \n"

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmla   v24.4s, %13.4s, v1.s[2]     \n"
                        "fmla   v25.4s, %13.4s, v1.s[3]     \n"
                        "fmla   v26.4s, %22.4s, v1.s[2]     \n"
                        "fmla   v27.4s, %22.4s, v1.s[3]     \n"

                        "fmla   v24.4s, %14.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %14.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %23.4s, v0.s[0]     \n"
                        "fmla   v27.4s, %23.4s, v0.s[1]     \n"

                        "add    %1, %1, #4                  \n"

                        "fmla   v24.4s, %15.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %15.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %24.4s, v0.s[1]     \n"
                        "fmla   v27.4s, %24.4s, v0.s[2]     \n"

                        "add    %2, %2, #4                  \n"

                        "fmla   v24.4s, %16.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %16.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %25.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %25.4s, v0.s[3]     \n"

                        "add    %3, %3, #4                  \n"

                        "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00_0), // %8
                        "w"(_k01_0), // %9
                        "w"(_k02_0), // %10
                        "w"(_k10_0), // %11
                        "w"(_k11_0), // %12
                        "w"(_k12_0), // %13
                        "w"(_k20_0), // %14
                        "w"(_k21_0), // %15
                        "w"(_k22_0), // %16
                        "w"(_k00_1), // %17
                        "w"(_k01_1), // %18
                        "w"(_k02_1), // %19
                        "w"(_k10_1), // %20
                        "w"(_k11_1), // %21
                        "w"(_k12_1), // %22
                        "w"(_k20_1), // %23
                        "w"(_k21_1), // %24
                        "w"(_k22_1)  // %25
                        : "memory", "v0", "v1", "v24", "v25", "v26", "v27");
                }
                for (; j < outw; j++)
                {
                    float32x4_t _sum00 = vld1q_f32(outptr0);
                    float32x4_t _sum10 = vld1q_f32(outptr0 + 4);

                    float32x4_t _r0 = bfloat2float(vld1_u16(r0));
                    float32x4_t _r1 = bfloat2float(vld1_u16(r1));
                    float32x4_t _r2 = bfloat2float(vld1_u16(r2));

                    _sum00 = vfmaq_laneq_f32(_sum00, _k00_0, _r0, 0);
                    _sum00 = vfmaq_laneq_f32(_sum00, _k01_0, _r0, 1);
                    _sum00 = vfmaq_laneq_f32(_sum00, _k02_0, _r0, 2);
                    _sum00 = vfmaq_laneq_f32(_sum00, _k10_0, _r1, 0);
                    _sum00 = vfmaq_laneq_f32(_sum00, _k11_0, _r1, 1);
                    _sum00 = vfmaq_laneq_f32(_sum00, _k12_0, _r1, 2);
                    _sum00 = vfmaq_laneq_f32(_sum00, _k20_0, _r2, 0);
                    _sum00 = vfmaq_laneq_f32(_sum00, _k21_0, _r2, 1);
                    _sum00 = vfmaq_laneq_f32(_sum00, _k22_0, _r2, 2);

                    _sum10 = vfmaq_laneq_f32(_sum10, _k00_1, _r0, 0);
                    _sum10 = vfmaq_laneq_f32(_sum10, _k01_1, _r0, 1);
                    _sum10 = vfmaq_laneq_f32(_sum10, _k02_1, _r0, 2);
                    _sum10 = vfmaq_laneq_f32(_sum10, _k10_1, _r1, 0);
                    _sum10 = vfmaq_laneq_f32(_sum10, _k11_1, _r1, 1);
                    _sum10 = vfmaq_laneq_f32(_sum10, _k12_1, _r1, 2);
                    _sum10 = vfmaq_laneq_f32(_sum10, _k20_1, _r2, 0);
                    _sum10 = vfmaq_laneq_f32(_sum10, _k21_1, _r2, 1);
                    _sum10 = vfmaq_laneq_f32(_sum10, _k22_1, _r2, 2);

                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + 4, _sum10);

                    r0 += 1;
                    r1 += 1;
                    r2 += 1;
                    outptr0 += 8;
                }

                r0 += 2;
                r1 += 2;
                r2 += 2;
            }

            k0 += 9 * 4;
            k1 += 9 * 4;
        }
        for (; q < inch; q++)
        {
            unsigned short* outptr0_bf16 = top_blob.channel(p);
            unsigned short* outptr1_bf16 = top_blob.channel(p + 1);

            const float* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            const unsigned short* r0 = img0.row<const unsigned short>(0);
            const unsigned short* r1 = img0.row<const unsigned short>(1);
            const unsigned short* r2 = img0.row<const unsigned short>(2);

            float32x4_t _k00_0 = bfloat2float(vld1_u16(k0));
            float32x4_t _k01_0 = bfloat2float(vld1_u16(k0 + 4));
            float32x4_t _k02_0 = bfloat2float(vld1_u16(k0 + 8));
            float32x4_t _k10_0 = bfloat2float(vld1_u16(k0 + 12));
            float32x4_t _k11_0 = bfloat2float(vld1_u16(k0 + 16));
            float32x4_t _k12_0 = bfloat2float(vld1_u16(k0 + 20));
            float32x4_t _k20_0 = bfloat2float(vld1_u16(k0 + 24));
            float32x4_t _k21_0 = bfloat2float(vld1_u16(k0 + 28));
            float32x4_t _k22_0 = bfloat2float(vld1_u16(k0 + 32));

            float32x4_t _k00_1 = bfloat2float(vld1_u16(k1));
            float32x4_t _k01_1 = bfloat2float(vld1_u16(k1 + 4));
            float32x4_t _k02_1 = bfloat2float(vld1_u16(k1 + 8));
            float32x4_t _k10_1 = bfloat2float(vld1_u16(k1 + 12));
            float32x4_t _k11_1 = bfloat2float(vld1_u16(k1 + 16));
            float32x4_t _k12_1 = bfloat2float(vld1_u16(k1 + 20));
            float32x4_t _k20_1 = bfloat2float(vld1_u16(k1 + 24));
            float32x4_t _k21_1 = bfloat2float(vld1_u16(k1 + 28));
            float32x4_t _k22_1 = bfloat2float(vld1_u16(k1 + 32));

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;

                for (; j + 3 < outw; j += 4)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v0.4h}, [%3], #8           \n"
                        "ld1    {v1.s}[0], [%3]             \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%2], #64 \n"

                        "fmla   v24.4s, %12.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %12.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %12.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %12.4s, v0.s[3]     \n"
                        "fmla   v28.4s, %21.4s, v0.s[0]     \n"
                        "fmla   v29.4s, %21.4s, v0.s[1]     \n"
                        "fmla   v30.4s, %21.4s, v0.s[2]     \n"
                        "fmla   v31.4s, %21.4s, v0.s[3]     \n"

                        "fmla   v24.4s, %13.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %13.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %13.4s, v0.s[3]     \n"
                        "fmla   v27.4s, %13.4s, v1.s[0]     \n"
                        "fmla   v28.4s, %22.4s, v0.s[1]     \n"
                        "fmla   v29.4s, %22.4s, v0.s[2]     \n"
                        "fmla   v30.4s, %22.4s, v0.s[3]     \n"
                        "fmla   v31.4s, %22.4s, v1.s[0]     \n"

                        "prfm   pldl1keep, [%4, #64]        \n"
                        "ld1    {v2.4h}, [%4], #8           \n"
                        "ld1    {v3.s}[0], [%4]             \n"

                        "fmla   v24.4s, %14.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %14.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %14.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %14.4s, v1.s[1]     \n"
                        "fmla   v28.4s, %23.4s, v0.s[2]     \n"
                        "fmla   v29.4s, %23.4s, v0.s[3]     \n"
                        "fmla   v30.4s, %23.4s, v1.s[0]     \n"
                        "fmla   v31.4s, %23.4s, v1.s[1]     \n"

                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v24.4s, %15.4s, v2.s[0]     \n"
                        "fmla   v25.4s, %15.4s, v2.s[1]     \n"
                        "fmla   v26.4s, %15.4s, v2.s[2]     \n"
                        "fmla   v27.4s, %15.4s, v2.s[3]     \n"
                        "fmla   v28.4s, %24.4s, v2.s[0]     \n"
                        "fmla   v29.4s, %24.4s, v2.s[1]     \n"
                        "fmla   v30.4s, %24.4s, v2.s[2]     \n"
                        "fmla   v31.4s, %24.4s, v2.s[3]     \n"

                        "fmla   v24.4s, %16.4s, v2.s[1]     \n"
                        "fmla   v25.4s, %16.4s, v2.s[2]     \n"
                        "fmla   v26.4s, %16.4s, v2.s[3]     \n"
                        "fmla   v27.4s, %16.4s, v3.s[0]     \n"
                        "fmla   v28.4s, %25.4s, v2.s[1]     \n"
                        "fmla   v29.4s, %25.4s, v2.s[2]     \n"
                        "fmla   v30.4s, %25.4s, v2.s[3]     \n"
                        "fmla   v31.4s, %25.4s, v3.s[0]     \n"

                        "prfm   pldl1keep, [%5, #64]        \n"
                        "ld1    {v0.4h}, [%5], #8           \n"
                        "ld1    {v1.s}[0], [%5]             \n"

                        "fmla   v24.4s, %17.4s, v2.s[2]     \n"
                        "fmla   v25.4s, %17.4s, v2.s[3]     \n"
                        "fmla   v26.4s, %17.4s, v3.s[0]     \n"
                        "fmla   v27.4s, %17.4s, v3.s[1]     \n"
                        "fmla   v28.4s, %26.4s, v2.s[2]     \n"
                        "fmla   v29.4s, %26.4s, v2.s[3]     \n"
                        "fmla   v30.4s, %26.4s, v3.s[0]     \n"
                        "fmla   v31.4s, %26.4s, v3.s[1]     \n"

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v24.4s, %18.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %18.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %18.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %18.4s, v0.s[3]     \n"
                        "fmla   v28.4s, %27.4s, v0.s[0]     \n"
                        "fmla   v29.4s, %27.4s, v0.s[1]     \n"
                        "fmla   v30.4s, %27.4s, v0.s[2]     \n"
                        "fmla   v31.4s, %27.4s, v0.s[3]     \n"

                        "fmla   v24.4s, %19.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %19.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %19.4s, v0.s[3]     \n"
                        "fmla   v27.4s, %19.4s, v1.s[0]     \n"
                        "fmla   v28.4s, %28.4s, v0.s[1]     \n"
                        "fmla   v29.4s, %28.4s, v0.s[2]     \n"
                        "fmla   v30.4s, %28.4s, v0.s[3]     \n"
                        "fmla   v31.4s, %28.4s, v1.s[0]     \n"

                        "fmla   v24.4s, %20.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %20.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %20.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %20.4s, v1.s[1]     \n"
                        "fmla   v28.4s, %29.4s, v0.s[2]     \n"
                        "fmla   v29.4s, %29.4s, v0.s[3]     \n"
                        "fmla   v30.4s, %29.4s, v1.s[0]     \n"
                        "fmla   v31.4s, %29.4s, v1.s[1]     \n"

                        "shrn   v24.4h, v24.4s, #16         \n"
                        "shrn   v25.4h, v25.4s, #16         \n"
                        "shrn   v26.4h, v26.4s, #16         \n"
                        "shrn   v27.4h, v27.4s, #16         \n"
                        "shrn   v28.4h, v28.4s, #16         \n"
                        "shrn   v29.4h, v29.4s, #16         \n"
                        "shrn   v30.4h, v30.4s, #16         \n"
                        "shrn   v31.4h, v31.4s, #16         \n"

                        "st1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%0], #32 \n"
                        "st1    {v28.4h, v29.4h, v30.4h, v31.4h}, [%1], #32 \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr1_bf16), // %1
                        "=r"(outptr0),      // %2
                        "=r"(r0),           // %3
                        "=r"(r1),           // %4
                        "=r"(r2)            // %5
                        : "0"(outptr0_bf16),
                        "1"(outptr1_bf16),
                        "2"(outptr0),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "w"(_k00_0), // %12
                        "w"(_k01_0), // %13
                        "w"(_k02_0), // %14
                        "w"(_k10_0), // %15
                        "w"(_k11_0), // %16
                        "w"(_k12_0), // %17
                        "w"(_k20_0), // %18
                        "w"(_k21_0), // %19
                        "w"(_k22_0), // %20
                        "w"(_k00_1), // %21
                        "w"(_k01_1), // %22
                        "w"(_k02_1), // %23
                        "w"(_k10_1), // %24
                        "w"(_k11_1), // %25
                        "w"(_k12_1), // %26
                        "w"(_k20_1), // %27
                        "w"(_k21_1), // %28
                        "w"(_k22_1)  // %29
                        : "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
                // Two output pixels per pass for a pair of output-channel groups: fp32 partial
                // sums are read from outptr0, the nine taps of this input channel are added, and
                // the results are narrowed to 16 bits and written to outptr0_bf16 / outptr1_bf16.
                // NOTE(review): presumably this is the final input-channel pass, since nothing is
                // written back to outptr0 -- confirm against the enclosing loop.
                for (; j + 1 < outw; j += 2)
                {
                    asm volatile(
                        // v0 = four 16-bit samples r0[0..3]; r0 itself is bumped later by an explicit add
                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v0.4h}, [%3]               \n"

                        // v24..v27 = 16 fp32 partial sums; outptr0 moves forward by 64 bytes
                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"

                        // widen: each 16-bit value becomes the upper half of a 32-bit lane
                        "shll   v0.4s, v0.4h, #16           \n"

                        // v24/v25 use the *_0 kernels (pixels j, j+1); v26/v27 use the *_1 kernels
                        // row 0, tap 0
                        "fmla   v24.4s, %12.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %12.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %21.4s, v0.s[0]     \n"
                        "fmla   v27.4s, %21.4s, v0.s[1]     \n"

                        // v1 = r1[0..3], fetched early and widened a few instructions later
                        "prfm   pldl1keep, [%4, #64]        \n"
                        "ld1    {v1.4h}, [%4]               \n"

                        // row 0, tap 1
                        "fmla   v24.4s, %13.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %13.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %22.4s, v0.s[1]     \n"
                        "fmla   v27.4s, %22.4s, v0.s[2]     \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        // row 0, tap 2
                        "fmla   v24.4s, %14.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %14.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %23.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %23.4s, v0.s[3]     \n"

                        // row 1, tap 0
                        "fmla   v24.4s, %15.4s, v1.s[0]     \n"
                        "fmla   v25.4s, %15.4s, v1.s[1]     \n"
                        "fmla   v26.4s, %24.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %24.4s, v1.s[1]     \n"

                        // v0 is reused for r2[0..3]; row 0 no longer needs it
                        "prfm   pldl1keep, [%5, #64]        \n"
                        "ld1    {v0.4h}, [%5]               \n"

                        // row 1, tap 1
                        "fmla   v24.4s, %16.4s, v1.s[1]     \n"
                        "fmla   v25.4s, %16.4s, v1.s[2]     \n"
                        "fmla   v26.4s, %25.4s, v1.s[1]     \n"
                        "fmla   v27.4s, %25.4s, v1.s[2]     \n"

                        "shll   v0.4s, v0.4h, #16           \n"

                        // row 1, tap 2
                        "fmla   v24.4s, %17.4s, v1.s[2]     \n"
                        "fmla   v25.4s, %17.4s, v1.s[3]     \n"
                        "fmla   v26.4s, %26.4s, v1.s[2]     \n"
                        "fmla   v27.4s, %26.4s, v1.s[3]     \n"

                        // row 2, tap 0
                        "fmla   v24.4s, %18.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %18.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %27.4s, v0.s[0]     \n"
                        "fmla   v27.4s, %27.4s, v0.s[1]     \n"

                        // row 2, tap 1
                        "fmla   v24.4s, %19.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %19.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %28.4s, v0.s[1]     \n"
                        "fmla   v27.4s, %28.4s, v0.s[2]     \n"

                        // advance r0 by 4 bytes, i.e. two 16-bit samples
                        "add    %3, %3, #4                  \n"

                        // row 2, tap 2
                        "fmla   v24.4s, %20.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %20.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %29.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %29.4s, v0.s[3]     \n"

                        // advance r1 by two samples
                        "add    %4, %4, #4                  \n"

                        // narrow: keep only the upper 16 bits of every fp32 lane (truncation)
                        "shrn   v24.4h, v24.4s, #16         \n"
                        "shrn   v25.4h, v25.4s, #16         \n"
                        "shrn   v26.4h, v26.4s, #16         \n"
                        "shrn   v27.4h, v27.4s, #16         \n"

                        // advance r2 by two samples
                        "add    %5, %5, #4                  \n"

                        // 2 pixels x 4 lanes x 16 bit = 16 bytes per destination pointer
                        "st1    {v24.4h, v25.4h}, [%0], #16 \n"
                        "st1    {v26.4h, v27.4h}, [%1], #16 \n"

                        // every pointer is both read and updated, hence the matching "0".."5" inputs
                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr1_bf16), // %1
                        "=r"(outptr0),      // %2
                        "=r"(r0),           // %3
                        "=r"(r1),           // %4
                        "=r"(r2)            // %5
                        : "0"(outptr0_bf16),
                        "1"(outptr1_bf16),
                        "2"(outptr0),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "w"(_k00_0), // %12
                        "w"(_k01_0), // %13
                        "w"(_k02_0), // %14
                        "w"(_k10_0), // %15
                        "w"(_k11_0), // %16
                        "w"(_k12_0), // %17
                        "w"(_k20_0), // %18
                        "w"(_k21_0), // %19
                        "w"(_k22_0), // %20
                        "w"(_k00_1), // %21
                        "w"(_k01_1), // %22
                        "w"(_k02_1), // %23
                        "w"(_k10_1), // %24
                        "w"(_k11_1), // %25
                        "w"(_k12_1), // %26
                        "w"(_k20_1), // %27
                        "w"(_k21_1), // %28
                        "w"(_k22_1)  // %29
                        : "memory", "v0", "v1", "v24", "v25", "v26", "v27");
                }
                // Remaining output pixels, one at a time, for both output-channel groups.
                for (; j < outw; j++)
                {
                    // fp32 partial sums: first group at outptr0, second group four floats later
                    float32x4_t _acc0 = vld1q_f32(outptr0);
                    float32x4_t _acc1 = vld1q_f32(outptr0 + 4);

                    // top input row (four samples are loaded, lanes 0..2 are consumed)
                    const float32x4_t _top = bfloat2float(vld1_u16(r0));
                    _acc0 = vfmaq_laneq_f32(_acc0, _k00_0, _top, 0);
                    _acc1 = vfmaq_laneq_f32(_acc1, _k00_1, _top, 0);
                    _acc0 = vfmaq_laneq_f32(_acc0, _k01_0, _top, 1);
                    _acc1 = vfmaq_laneq_f32(_acc1, _k01_1, _top, 1);
                    _acc0 = vfmaq_laneq_f32(_acc0, _k02_0, _top, 2);
                    _acc1 = vfmaq_laneq_f32(_acc1, _k02_1, _top, 2);

                    // middle input row
                    const float32x4_t _mid = bfloat2float(vld1_u16(r1));
                    _acc0 = vfmaq_laneq_f32(_acc0, _k10_0, _mid, 0);
                    _acc1 = vfmaq_laneq_f32(_acc1, _k10_1, _mid, 0);
                    _acc0 = vfmaq_laneq_f32(_acc0, _k11_0, _mid, 1);
                    _acc1 = vfmaq_laneq_f32(_acc1, _k11_1, _mid, 1);
                    _acc0 = vfmaq_laneq_f32(_acc0, _k12_0, _mid, 2);
                    _acc1 = vfmaq_laneq_f32(_acc1, _k12_1, _mid, 2);

                    // bottom input row
                    const float32x4_t _bot = bfloat2float(vld1_u16(r2));
                    _acc0 = vfmaq_laneq_f32(_acc0, _k20_0, _bot, 0);
                    _acc1 = vfmaq_laneq_f32(_acc1, _k20_1, _bot, 0);
                    _acc0 = vfmaq_laneq_f32(_acc0, _k21_0, _bot, 1);
                    _acc1 = vfmaq_laneq_f32(_acc1, _k21_1, _bot, 1);
                    _acc0 = vfmaq_laneq_f32(_acc0, _k22_0, _bot, 2);
                    _acc1 = vfmaq_laneq_f32(_acc1, _k22_1, _bot, 2);

                    // convert back to bf16 and emit one packed pixel per destination
                    vst1_u16(outptr0_bf16, float2bfloat(_acc0));
                    vst1_u16(outptr1_bf16, float2bfloat(_acc1));

                    // step one input column, eight scratch floats, four bf16 lanes per output
                    r0++;
                    r1++;
                    r2++;
                    outptr0 += 8;
                    outptr0_bf16 += 4;
                    outptr1_bf16 += 4;
                }

                r0 += 2;
                r1 += 2;
                r2 += 2;
            }

            k0 += 9 * 4;
            k1 += 9 * 4;
        }
    }
#endif // __ARM_NEON && __aarch64__

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        Mat out0 = top_blob_fp32.channel(get_omp_thread_num());

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
        out0.fill(_bias0);

        const unsigned short* k0 = kernel.channel(p);

        int q = 0;
        for (; q < inch - 1; q++)
        {
            float* outptr0 = out0.row(0);

            const Mat img0 = bottom_blob.channel(q);

            const unsigned short* r0 = img0.row<unsigned short>(0);
            const unsigned short* r1 = img0.row<unsigned short>(1);
            const unsigned short* r2 = img0.row<unsigned short>(2);

            float32x4_t _k00 = bfloat2float(vld1_u16(k0));
            float32x4_t _k01 = bfloat2float(vld1_u16(k0 + 4));
            float32x4_t _k02 = bfloat2float(vld1_u16(k0 + 8));
            float32x4_t _k10 = bfloat2float(vld1_u16(k0 + 12));
            float32x4_t _k11 = bfloat2float(vld1_u16(k0 + 16));
            float32x4_t _k12 = bfloat2float(vld1_u16(k0 + 20));
            float32x4_t _k20 = bfloat2float(vld1_u16(k0 + 24));
            float32x4_t _k21 = bfloat2float(vld1_u16(k0 + 28));
            float32x4_t _k22 = bfloat2float(vld1_u16(k0 + 32));

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;

#if __aarch64__
                // Eight output pixels per pass: v24..v31 each hold the four fp32 lanes of one
                // pixel, receive the nine taps of this input channel, and go back to outptr0.
                for (; j + 7 < outw; j += 8)
                {
                    asm volatile(
                        // first four accumulators; outptr0 is post-incremented by 64 bytes
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"

                        // last four accumulators; no post-increment, outptr0 is rewound before the store
                        //                         "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0] \n"

                        // row 0: v0/v1 = r0[0..7] (r0 advances by 16 bytes), v2 lane 0 = r0[8..9]
                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%1], #16   \n"
                        "ld1    {v2.s}[0], [%1]             \n"

                        // widen 16-bit samples to fp32 by a 16-bit left shift; after this only
                        // v2.s[0] and v2.s[1] carry loaded data
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        // row 0, tap 0 (_k00): pixel i uses sample i
                        "fmla   v24.4s, %8.4s, v0.s[0]      \n"
                        "fmla   v25.4s, %8.4s, v0.s[1]      \n"
                        "fmla   v26.4s, %8.4s, v0.s[2]      \n"
                        "fmla   v27.4s, %8.4s, v0.s[3]      \n"
                        "fmla   v28.4s, %8.4s, v1.s[0]      \n"
                        "fmla   v29.4s, %8.4s, v1.s[1]      \n"
                        "fmla   v30.4s, %8.4s, v1.s[2]      \n"
                        "fmla   v31.4s, %8.4s, v1.s[3]      \n"

                        // row 0, tap 1 (_k01): pixel i uses sample i + 1
                        "fmla   v24.4s, %9.4s, v0.s[1]      \n"
                        "fmla   v25.4s, %9.4s, v0.s[2]      \n"
                        "fmla   v26.4s, %9.4s, v0.s[3]      \n"
                        "fmla   v27.4s, %9.4s, v1.s[0]      \n"
                        "fmla   v28.4s, %9.4s, v1.s[1]      \n"
                        "fmla   v29.4s, %9.4s, v1.s[2]      \n"
                        "fmla   v30.4s, %9.4s, v1.s[3]      \n"
                        "fmla   v31.4s, %9.4s, v2.s[0]      \n"

                        // row 0, tap 2 (_k02): pixel i uses sample i + 2
                        "fmla   v24.4s, %10.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %10.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %10.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %10.4s, v1.s[1]     \n"
                        "fmla   v28.4s, %10.4s, v1.s[2]     \n"
                        "fmla   v29.4s, %10.4s, v1.s[3]     \n"
                        "fmla   v30.4s, %10.4s, v2.s[0]     \n"
                        "fmla   v31.4s, %10.4s, v2.s[1]     \n"

                        // row 1: v4/v5 = r1[0..7] (r1 advances by 16 bytes), v2 lane 0 = r1[8..9]
                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v4.4h, v5.4h}, [%2], #16   \n"
                        "ld1    {v2.s}[0], [%2]             \n"

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        // row 1, tap 0 (_k10)
                        "fmla   v24.4s, %11.4s, v4.s[0]     \n"
                        "fmla   v25.4s, %11.4s, v4.s[1]     \n"
                        "fmla   v26.4s, %11.4s, v4.s[2]     \n"
                        "fmla   v27.4s, %11.4s, v4.s[3]     \n"
                        "fmla   v28.4s, %11.4s, v5.s[0]     \n"
                        "fmla   v29.4s, %11.4s, v5.s[1]     \n"
                        "fmla   v30.4s, %11.4s, v5.s[2]     \n"
                        "fmla   v31.4s, %11.4s, v5.s[3]     \n"

                        // row 1, tap 1 (_k11)
                        "fmla   v24.4s, %12.4s, v4.s[1]     \n"
                        "fmla   v25.4s, %12.4s, v4.s[2]     \n"
                        "fmla   v26.4s, %12.4s, v4.s[3]     \n"
                        "fmla   v27.4s, %12.4s, v5.s[0]     \n"
                        "fmla   v28.4s, %12.4s, v5.s[1]     \n"
                        "fmla   v29.4s, %12.4s, v5.s[2]     \n"
                        "fmla   v30.4s, %12.4s, v5.s[3]     \n"
                        "fmla   v31.4s, %12.4s, v2.s[0]     \n"

                        // row 1, tap 2 (_k12)
                        "fmla   v24.4s, %13.4s, v4.s[2]     \n"
                        "fmla   v25.4s, %13.4s, v4.s[3]     \n"
                        "fmla   v26.4s, %13.4s, v5.s[0]     \n"
                        "fmla   v27.4s, %13.4s, v5.s[1]     \n"
                        "fmla   v28.4s, %13.4s, v5.s[2]     \n"
                        "fmla   v29.4s, %13.4s, v5.s[3]     \n"
                        "fmla   v30.4s, %13.4s, v2.s[0]     \n"
                        "fmla   v31.4s, %13.4s, v2.s[1]     \n"

                        // row 2: v0/v1 are reused for r2[0..7] (r2 advances by 16 bytes), v2 lane 0 = r2[8..9]
                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%3], #16   \n"
                        "ld1    {v2.s}[0], [%3]             \n"

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        // row 2, tap 0 (_k20)
                        "fmla   v24.4s, %14.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %14.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %14.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %14.4s, v0.s[3]     \n"
                        "fmla   v28.4s, %14.4s, v1.s[0]     \n"
                        "fmla   v29.4s, %14.4s, v1.s[1]     \n"
                        "fmla   v30.4s, %14.4s, v1.s[2]     \n"
                        "fmla   v31.4s, %14.4s, v1.s[3]     \n"

                        // row 2, tap 1 (_k21)
                        "fmla   v24.4s, %15.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %15.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %15.4s, v0.s[3]     \n"
                        "fmla   v27.4s, %15.4s, v1.s[0]     \n"
                        "fmla   v28.4s, %15.4s, v1.s[1]     \n"
                        "fmla   v29.4s, %15.4s, v1.s[2]     \n"
                        "fmla   v30.4s, %15.4s, v1.s[3]     \n"
                        "fmla   v31.4s, %15.4s, v2.s[0]     \n"

                        // undo the 64-byte post-increment of the first accumulator load
                        "sub    %0, %0, #64                 \n"

                        // row 2, tap 2 (_k22)
                        "fmla   v24.4s, %16.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %16.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %16.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %16.4s, v1.s[1]     \n"
                        "fmla   v28.4s, %16.4s, v1.s[2]     \n"
                        "fmla   v29.4s, %16.4s, v1.s[3]     \n"
                        "fmla   v30.4s, %16.4s, v2.s[0]     \n"
                        "fmla   v31.4s, %16.4s, v2.s[1]     \n"

                        // write all eight accumulators back; outptr0 ends 128 bytes past its start
                        "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                        "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                        // pointers are read-write: outputs %0..%3 are tied to inputs "0".."3"
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "v0", "v1", "v2", "v4", "v5", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
#endif // __aarch64__
                // Four output pixels per pass, accumulated in fp32 and written back to outptr0.
                // Both variants consume six 16-bit samples per input row (four via a
                // post-incrementing load, two more via a 32-bit lane load).
                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    asm volatile(
                        // v0 = r0[0..3]; r0 advances by 8 bytes
                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ld1    {v0.4h}, [%1], #8           \n"

                        // v24..v27 = accumulators of the four pixels (outptr0 advanced only by the final store)
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0] \n"

                        // widen 16-bit samples to fp32 by a 16-bit left shift
                        "shll   v0.4s, v0.4h, #16           \n"

                        // v1 lane 0 = r0[4..5], widened further down
                        "ld1    {v1.s}[0], [%1]             \n"

                        // row 0, tap 0 (_k00)
                        "fmla   v24.4s, %8.4s, v0.s[0]      \n"
                        "fmla   v25.4s, %8.4s, v0.s[1]      \n"

                        "fmla   v26.4s, %8.4s, v0.s[2]      \n"
                        "fmla   v27.4s, %8.4s, v0.s[3]      \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        // row 0, tap 1 (_k01), interleaved with the row-1 loads
                        "fmla   v24.4s, %9.4s, v0.s[1]      \n"
                        "fmla   v25.4s, %9.4s, v0.s[2]      \n"

                        // v2 = r1[0..3]; r1 advances by 8 bytes
                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v2.4h}, [%2], #8           \n"

                        "fmla   v26.4s, %9.4s, v0.s[3]      \n"
                        "fmla   v27.4s, %9.4s, v1.s[0]      \n"

                        // v3 lane 0 = r1[4..5]
                        "ld1    {v3.s}[0], [%2]             \n"

                        // row 0, tap 2 (_k02)
                        "fmla   v24.4s, %10.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %10.4s, v0.s[3]     \n"

                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v26.4s, %10.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %10.4s, v1.s[1]     \n"

                        // row 1, tap 0 (_k10)
                        "fmla   v24.4s, %11.4s, v2.s[0]     \n"
                        "fmla   v25.4s, %11.4s, v2.s[1]     \n"

                        "fmla   v26.4s, %11.4s, v2.s[2]     \n"
                        "fmla   v27.4s, %11.4s, v2.s[3]     \n"

                        "shll   v3.4s, v3.4h, #16           \n"

                        // row 1, tap 1 (_k11), interleaved with the row-2 loads
                        "fmla   v24.4s, %12.4s, v2.s[1]     \n"
                        "fmla   v25.4s, %12.4s, v2.s[2]     \n"

                        // v0 is reused for r2[0..3]; r2 advances by 8 bytes
                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v0.4h}, [%3], #8           \n"

                        "fmla   v26.4s, %12.4s, v2.s[3]     \n"
                        "fmla   v27.4s, %12.4s, v3.s[0]     \n"

                        // v1 lane 0 is reused for r2[4..5]
                        "ld1    {v1.s}[0], [%3]             \n"

                        // row 1, tap 2 (_k12)
                        "fmla   v24.4s, %13.4s, v2.s[2]     \n"
                        "fmla   v25.4s, %13.4s, v2.s[3]     \n"

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmla   v26.4s, %13.4s, v3.s[0]     \n"
                        "fmla   v27.4s, %13.4s, v3.s[1]     \n"

                        // row 2, tap 0 (_k20)
                        "fmla   v24.4s, %14.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %14.4s, v0.s[1]     \n"

                        "fmla   v26.4s, %14.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %14.4s, v0.s[3]     \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        // row 2, tap 1 (_k21)
                        "fmla   v24.4s, %15.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %15.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %15.4s, v0.s[3]     \n"
                        "fmla   v27.4s, %15.4s, v1.s[0]     \n"

                        // row 2, tap 2 (_k22)
                        "fmla   v24.4s, %16.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %16.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %16.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %16.4s, v1.s[1]     \n"

                        // write the four accumulators back; outptr0 advances by 64 bytes
                        "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"

                        // pointers are read-write: outputs %0..%3 are tied to inputs "0".."3"
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        // q12..q15 = accumulators of the four pixels; this load has no writeback
                        "pld        [%0, #512]          \n"
                        "vldm       %0, {d24-d31}       \n"

                        // d1 = r0[0..3] (r0 advances by 8 bytes), low half of d2 = r0[4..5]
                        "pld        [%1, #64]           \n"
                        "vld1.u16   {d1}, [%1]!         \n"
                        "vld1.u32   {d2[0]}, [%1]       \n"

                        // widen by a 16-bit left shift: q0 = r0[0..3], d2[0]/d2[1] = r0[4]/r0[5]
                        "vshll.u16  q0, d1, #16         \n"
                        "vshll.u16  q1, d2, #16         \n"

                        // row 0, tap 0 (_k00)
                        "vmla.f32   q12, %q8, d0[0]     \n"
                        "vmla.f32   q13, %q8, d0[1]     \n"
                        "vmla.f32   q14, %q8, d1[0]     \n"
                        "vmla.f32   q15, %q8, d1[1]     \n"

                        // row 0, tap 1 (_k01)
                        "vmla.f32   q12, %q9, d0[1]     \n"
                        "vmla.f32   q13, %q9, d1[0]     \n"
                        "vmla.f32   q14, %q9, d1[1]     \n"
                        "vmla.f32   q15, %q9, d2[0]     \n"

                        // row 0, tap 2 (_k02)
                        "vmla.f32   q12, %q10, d1[0]    \n"
                        "vmla.f32   q13, %q10, d1[1]    \n"
                        "vmla.f32   q14, %q10, d2[0]    \n"
                        "vmla.f32   q15, %q10, d2[1]    \n"

                        // d5 = r1[0..3] (r1 advances by 8 bytes), low half of d3 = r1[4..5]
                        "pld        [%2, #64]           \n"
                        "vld1.u16   {d5}, [%2]!         \n"
                        "vld1.u32   {d3[0]}, [%2]       \n"

                        // q2 = r1[0..3]; widening d3 into q1 leaves r1[4]/r1[5] in d2[0]/d2[1]
                        "vshll.u16  q2, d5, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        // row 1, tap 0 (_k10)
                        "vmla.f32   q12, %q11, d4[0]    \n"
                        "vmla.f32   q13, %q11, d4[1]    \n"
                        "vmla.f32   q14, %q11, d5[0]    \n"
                        "vmla.f32   q15, %q11, d5[1]    \n"

                        // row 1, tap 1 (_k11)
                        "vmla.f32   q12, %q12, d4[1]    \n"
                        "vmla.f32   q13, %q12, d5[0]    \n"
                        "vmla.f32   q14, %q12, d5[1]    \n"
                        "vmla.f32   q15, %q12, d2[0]    \n"

                        // row 1, tap 2 (_k12)
                        "vmla.f32   q12, %q13, d5[0]    \n"
                        "vmla.f32   q13, %q13, d5[1]    \n"
                        "vmla.f32   q14, %q13, d2[0]    \n"
                        "vmla.f32   q15, %q13, d2[1]    \n"

                        // d1 = r2[0..3] (r2 advances by 8 bytes), low half of d2 = r2[4..5]
                        "pld        [%3, #64]           \n"
                        "vld1.u16   {d1}, [%3]!         \n"
                        "vld1.u32   {d2[0]}, [%3]       \n"

                        "vshll.u16  q0, d1, #16         \n"
                        "vshll.u16  q1, d2, #16         \n"

                        // row 2, tap 0 (_k20)
                        "vmla.f32   q12, %q14, d0[0]    \n"
                        "vmla.f32   q13, %q14, d0[1]    \n"
                        "vmla.f32   q14, %q14, d1[0]    \n"
                        "vmla.f32   q15, %q14, d1[1]    \n"

                        // row 2, tap 1 (_k21)
                        "vmla.f32   q12, %q15, d0[1]    \n"
                        "vmla.f32   q13, %q15, d1[0]    \n"
                        "vmla.f32   q14, %q15, d1[1]    \n"
                        "vmla.f32   q15, %q15, d2[0]    \n"

                        // row 2, tap 2 (_k22)
                        "vmla.f32   q12, %q16, d1[0]    \n"
                        "vmla.f32   q13, %q16, d1[1]    \n"
                        "vmla.f32   q14, %q16, d2[0]    \n"
                        "vmla.f32   q15, %q16, d2[1]    \n"

                        // store with writeback: outptr0 advances past the four accumulators
                        "vstm       %0!, {d24-d31}      \n"

                        // pointers are read-write: outputs %0..%3 are tied to inputs "0".."3"
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "q0", "q1", "q2", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                // Two output pixels per pass, accumulated in fp32 and written back to outptr0.
                // Each row contributes four 16-bit samples; the row pointers are advanced by
                // explicit adds of 4 bytes (two samples) instead of post-incrementing loads.
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    asm volatile(
                        // v0 = r0[0..3]
                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ld1    {v0.4h}, [%1]               \n"

                        // v28/v29 = stored accumulators of the two pixels
                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v28.4s, v29.4s}, [%0]      \n"

                        // widen 16-bit samples to fp32 by a 16-bit left shift
                        "shll   v0.4s, v0.4h, #16           \n"

                        // Three independent chains per pixel: v24/v25 start from the column-0 taps,
                        // v26/v27 from the column-1 taps, v28/v29 collect the column-2 taps.
                        "fmul   v24.4s, %8.4s, v0.s[0]      \n"
                        "fmul   v25.4s, %8.4s, v0.s[1]      \n"

                        // v1 = r1[0..3]
                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v1.4h}, [%2]               \n"

                        "fmul   v26.4s, %9.4s, v0.s[1]      \n"
                        "fmul   v27.4s, %9.4s, v0.s[2]      \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v28.4s, %10.4s, v0.s[2]     \n"
                        "fmla   v29.4s, %10.4s, v0.s[3]     \n"

                        // row 1 taps
                        "fmla   v24.4s, %11.4s, v1.s[0]     \n"
                        "fmla   v25.4s, %11.4s, v1.s[1]     \n"

                        // v0 is reused for r2[0..3]
                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v0.4h}, [%3]               \n"

                        "fmla   v26.4s, %12.4s, v1.s[1]     \n"
                        "fmla   v27.4s, %12.4s, v1.s[2]     \n"

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmla   v28.4s, %13.4s, v1.s[2]     \n"
                        "fmla   v29.4s, %13.4s, v1.s[3]     \n"

                        // row 2 taps
                        "fmla   v24.4s, %14.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %14.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %15.4s, v0.s[1]     \n"
                        "fmla   v27.4s, %15.4s, v0.s[2]     \n"

                        "fmla   v28.4s, %16.4s, v0.s[2]     \n"
                        "fmla   v29.4s, %16.4s, v0.s[3]     \n"

                        // advance r0 by two samples
                        "add    %1, %1, #4                  \n"

                        // fold the chains together: (v24 + v26) first, then into v28/v29
                        "fadd   v24.4s, v24.4s, v26.4s      \n"
                        "fadd   v25.4s, v25.4s, v27.4s      \n"

                        // advance r1 by two samples
                        "add    %2, %2, #4                  \n"

                        "fadd   v28.4s, v28.4s, v24.4s      \n"
                        "fadd   v29.4s, v29.4s, v25.4s      \n"

                        // advance r2 by two samples
                        "add    %3, %3, #4                  \n"

                        // write both accumulators back; outptr0 advances by 32 bytes
                        "st1    {v28.4s, v29.4s}, [%0], #32 \n"

                        // pointers are read-write: outputs %0..%3 are tied to inputs "0".."3"
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "v0", "v1", "v24", "v25", "v26", "v27", "v28", "v29");
#else  // __aarch64__
                    asm volatile(
                        // d1 = r0[0..3]
                        "pld        [%1, #64]           \n"
                        "vld1.u16   {d1}, [%1]          \n"

                        // q12/q13 = stored accumulators; the address carries a 128-bit alignment qualifier
                        "pld        [%0, #256]          \n"
                        "vld1.f32   {d24-d27}, [%0 :128] \n"

                        // widen 16-bit samples to fp32 by a 16-bit left shift
                        "vshll.u16  q0, d1, #16         \n"

                        // Two chains per pixel: q14/q15 start fresh with vmul, q12/q13 continue
                        // from the stored accumulators; successive taps alternate between them.
                        "vmul.f32   q14, %q8, d0[0]     \n"
                        "vmul.f32   q15, %q8, d0[1]     \n"
                        "vmla.f32   q12, %q9, d0[1]     \n"
                        "vmla.f32   q13, %q9, d1[0]     \n"

                        // d3 = r1[0..3]
                        "pld        [%2, #64]           \n"
                        "vld1.u16   {d3}, [%2]          \n"

                        "vmla.f32   q14, %q10, d1[0]    \n"
                        "vmla.f32   q15, %q10, d1[1]    \n"

                        "vshll.u16  q1, d3, #16         \n"

                        // row 1 taps
                        "vmla.f32   q12, %q11, d2[0]    \n"
                        "vmla.f32   q13, %q11, d2[1]    \n"

                        "vmla.f32   q14, %q12, d2[1]    \n"
                        "vmla.f32   q15, %q12, d3[0]    \n"

                        // d1 is reused for r2[0..3]
                        "pld        [%3, #64]           \n"
                        "vld1.u16   {d1}, [%3]          \n"

                        "vmla.f32   q12, %q13, d3[0]    \n"
                        "vmla.f32   q13, %q13, d3[1]    \n"

                        "vshll.u16  q0, d1, #16         \n"

                        // row 2 taps
                        "vmla.f32   q14, %q14, d0[0]    \n"
                        "vmla.f32   q15, %q14, d0[1]    \n"

                        "vmla.f32   q12, %q15, d0[1]    \n"
                        "vmla.f32   q13, %q15, d1[0]    \n"

                        // advance r0 by two samples
                        "add        %1, %1, #4          \n"

                        "vmla.f32   q14, %q16, d1[0]    \n"
                        "vmla.f32   q15, %q16, d1[1]    \n"

                        // advance r1 by two samples
                        "add        %2, %2, #4          \n"

                        // merge the two chains of each pixel
                        "vadd.f32   q12, q12, q14       \n"
                        "vadd.f32   q13, q13, q15       \n"

                        // advance r2 by two samples
                        "add        %3, %3, #4          \n"

                        // write both accumulators back with writeback on outptr0
                        "vst1.f32   {d24-d27}, [%0 :128]! \n"

                        // pointers are read-write: outputs %0..%3 are tied to inputs "0".."3"
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "q0", "q1", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
                    float32x4_t _sum0 = vld1q_f32(outptr0);

                    float32x4_t _r0 = bfloat2float(vld1_u16(r0));
                    float32x4_t _r1 = bfloat2float(vld1_u16(r1));
                    float32x4_t _r2 = bfloat2float(vld1_u16(r2));

#if __aarch64__
                    _sum0 = vfmaq_laneq_f32(_sum0, _k00, _r0, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k01, _r0, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k02, _r0, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k10, _r1, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k11, _r1, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k12, _r1, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k20, _r2, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k21, _r2, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k22, _r2, 2);
#else
                    _sum0 = vmlaq_lane_f32(_sum0, _k00, vget_low_f32(_r0), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k01, vget_low_f32(_r0), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k02, vget_high_f32(_r0), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k10, vget_low_f32(_r1), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k11, vget_low_f32(_r1), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k12, vget_high_f32(_r1), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k20, vget_low_f32(_r2), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k21, vget_low_f32(_r2), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k22, vget_high_f32(_r2), 0);
#endif

                    vst1q_f32(outptr0, _sum0);

                    r0 += 1;
                    r1 += 1;
                    r2 += 1;
                    outptr0 += 4;
                }

                r0 += 2;
                r1 += 2;
                r2 += 2;
            }

            k0 += 9 * 4;
        }
        for (; q < inch; q++)
        {
            unsigned short* outptr0_bf16 = top_blob.channel(p);

            const float* outptr0 = out0.row(0);

            const Mat img0 = bottom_blob.channel(q);

            const unsigned short* r0 = img0.row<unsigned short>(0);
            const unsigned short* r1 = img0.row<unsigned short>(1);
            const unsigned short* r2 = img0.row<unsigned short>(2);

            float32x4_t _k00 = bfloat2float(vld1_u16(k0));
            float32x4_t _k01 = bfloat2float(vld1_u16(k0 + 4));
            float32x4_t _k02 = bfloat2float(vld1_u16(k0 + 8));
            float32x4_t _k10 = bfloat2float(vld1_u16(k0 + 12));
            float32x4_t _k11 = bfloat2float(vld1_u16(k0 + 16));
            float32x4_t _k12 = bfloat2float(vld1_u16(k0 + 20));
            float32x4_t _k20 = bfloat2float(vld1_u16(k0 + 24));
            float32x4_t _k21 = bfloat2float(vld1_u16(k0 + 28));
            float32x4_t _k22 = bfloat2float(vld1_u16(k0 + 32));

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;

#if __aarch64__
                for (; j + 7 < outw; j += 8)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%1], #64 \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%1], #64 \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%2], #16   \n"
                        "ld1    {v2.s}[0], [%2]             \n"

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v24.4s, %10.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %10.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %10.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %10.4s, v0.s[3]     \n"
                        "fmla   v28.4s, %10.4s, v1.s[0]     \n"
                        "fmla   v29.4s, %10.4s, v1.s[1]     \n"
                        "fmla   v30.4s, %10.4s, v1.s[2]     \n"
                        "fmla   v31.4s, %10.4s, v1.s[3]     \n"

                        "fmla   v24.4s, %11.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %11.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %11.4s, v0.s[3]     \n"
                        "fmla   v27.4s, %11.4s, v1.s[0]     \n"
                        "fmla   v28.4s, %11.4s, v1.s[1]     \n"
                        "fmla   v29.4s, %11.4s, v1.s[2]     \n"
                        "fmla   v30.4s, %11.4s, v1.s[3]     \n"
                        "fmla   v31.4s, %11.4s, v2.s[0]     \n"

                        "fmla   v24.4s, %12.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %12.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %12.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %12.4s, v1.s[1]     \n"
                        "fmla   v28.4s, %12.4s, v1.s[2]     \n"
                        "fmla   v29.4s, %12.4s, v1.s[3]     \n"
                        "fmla   v30.4s, %12.4s, v2.s[0]     \n"
                        "fmla   v31.4s, %12.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v4.4h, v5.4h}, [%3], #16   \n"
                        "ld1    {v2.s}[0], [%3]             \n"

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v24.4s, %13.4s, v4.s[0]     \n"
                        "fmla   v25.4s, %13.4s, v4.s[1]     \n"
                        "fmla   v26.4s, %13.4s, v4.s[2]     \n"
                        "fmla   v27.4s, %13.4s, v4.s[3]     \n"
                        "fmla   v28.4s, %13.4s, v5.s[0]     \n"
                        "fmla   v29.4s, %13.4s, v5.s[1]     \n"
                        "fmla   v30.4s, %13.4s, v5.s[2]     \n"
                        "fmla   v31.4s, %13.4s, v5.s[3]     \n"

                        "fmla   v24.4s, %14.4s, v4.s[1]     \n"
                        "fmla   v25.4s, %14.4s, v4.s[2]     \n"
                        "fmla   v26.4s, %14.4s, v4.s[3]     \n"
                        "fmla   v27.4s, %14.4s, v5.s[0]     \n"
                        "fmla   v28.4s, %14.4s, v5.s[1]     \n"
                        "fmla   v29.4s, %14.4s, v5.s[2]     \n"
                        "fmla   v30.4s, %14.4s, v5.s[3]     \n"
                        "fmla   v31.4s, %14.4s, v2.s[0]     \n"

                        "fmla   v24.4s, %15.4s, v4.s[2]     \n"
                        "fmla   v25.4s, %15.4s, v4.s[3]     \n"
                        "fmla   v26.4s, %15.4s, v5.s[0]     \n"
                        "fmla   v27.4s, %15.4s, v5.s[1]     \n"
                        "fmla   v28.4s, %15.4s, v5.s[2]     \n"
                        "fmla   v29.4s, %15.4s, v5.s[3]     \n"
                        "fmla   v30.4s, %15.4s, v2.s[0]     \n"
                        "fmla   v31.4s, %15.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%4], #16   \n"
                        "ld1    {v2.s}[0], [%4]             \n"

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v24.4s, %16.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %16.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %16.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %16.4s, v0.s[3]     \n"
                        "fmla   v28.4s, %16.4s, v1.s[0]     \n"
                        "fmla   v29.4s, %16.4s, v1.s[1]     \n"
                        "fmla   v30.4s, %16.4s, v1.s[2]     \n"
                        "fmla   v31.4s, %16.4s, v1.s[3]     \n"

                        "fmla   v24.4s, %17.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %17.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %17.4s, v0.s[3]     \n"
                        "fmla   v27.4s, %17.4s, v1.s[0]     \n"
                        "fmla   v28.4s, %17.4s, v1.s[1]     \n"
                        "fmla   v29.4s, %17.4s, v1.s[2]     \n"
                        "fmla   v30.4s, %17.4s, v1.s[3]     \n"
                        "fmla   v31.4s, %17.4s, v2.s[0]     \n"

                        "fmla   v24.4s, %18.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %18.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %18.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %18.4s, v1.s[1]     \n"
                        "fmla   v28.4s, %18.4s, v1.s[2]     \n"
                        "fmla   v29.4s, %18.4s, v1.s[3]     \n"
                        "fmla   v30.4s, %18.4s, v2.s[0]     \n"
                        "fmla   v31.4s, %18.4s, v2.s[1]     \n"

                        "shrn   v24.4h, v24.4s, #16         \n"
                        "shrn   v25.4h, v25.4s, #16         \n"
                        "shrn   v26.4h, v26.4s, #16         \n"
                        "shrn   v27.4h, v27.4s, #16         \n"
                        "shrn   v28.4h, v28.4s, #16         \n"
                        "shrn   v29.4h, v29.4s, #16         \n"
                        "shrn   v30.4h, v30.4s, #16         \n"
                        "shrn   v31.4h, v31.4s, #16         \n"

                        "st1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%0], #32 \n"
                        "st1    {v28.4h, v29.4h, v30.4h, v31.4h}, [%0], #32 \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2)            // %4
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00), // %10
                        "w"(_k01), // %11
                        "w"(_k02), // %12
                        "w"(_k10), // %13
                        "w"(_k11), // %14
                        "w"(_k12), // %15
                        "w"(_k20), // %16
                        "w"(_k21), // %17
                        "w"(_k22)  // %18
                        : "memory", "v0", "v1", "v2", "v4", "v5", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
#endif // __aarch64__
                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v0.4h}, [%2], #8           \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%1], #64 \n"

                        "shll   v0.4s, v0.4h, #16           \n"

                        "ld1    {v1.s}[0], [%2]             \n"

                        "fmla   v24.4s, %10.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %10.4s, v0.s[1]     \n"

                        "fmla   v26.4s, %10.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %10.4s, v0.s[3]     \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v24.4s, %11.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %11.4s, v0.s[2]     \n"

                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v2.4h}, [%3], #8           \n"

                        "fmla   v26.4s, %11.4s, v0.s[3]     \n"
                        "fmla   v27.4s, %11.4s, v1.s[0]     \n"

                        "ld1    {v3.s}[0], [%3]             \n"

                        "fmla   v24.4s, %12.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %12.4s, v0.s[3]     \n"

                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v26.4s, %12.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %12.4s, v1.s[1]     \n"

                        "fmla   v24.4s, %13.4s, v2.s[0]     \n"
                        "fmla   v25.4s, %13.4s, v2.s[1]     \n"

                        "fmla   v26.4s, %13.4s, v2.s[2]     \n"
                        "fmla   v27.4s, %13.4s, v2.s[3]     \n"

                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v24.4s, %14.4s, v2.s[1]     \n"
                        "fmla   v25.4s, %14.4s, v2.s[2]     \n"

                        "prfm   pldl1keep, [%4, #64]        \n"
                        "ld1    {v0.4h}, [%4], #8           \n"

                        "fmla   v26.4s, %14.4s, v2.s[3]     \n"
                        "fmla   v27.4s, %14.4s, v3.s[0]     \n"

                        "ld1    {v1.s}[0], [%4]             \n"

                        "fmla   v24.4s, %15.4s, v2.s[2]     \n"
                        "fmla   v25.4s, %15.4s, v2.s[3]     \n"

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmla   v26.4s, %15.4s, v3.s[0]     \n"
                        "fmla   v27.4s, %15.4s, v3.s[1]     \n"

                        "fmla   v24.4s, %16.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %16.4s, v0.s[1]     \n"

                        "fmla   v26.4s, %16.4s, v0.s[2]     \n"
                        "fmla   v27.4s, %16.4s, v0.s[3]     \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v24.4s, %17.4s, v0.s[1]     \n"
                        "fmla   v25.4s, %17.4s, v0.s[2]     \n"
                        "fmla   v26.4s, %17.4s, v0.s[3]     \n"
                        "fmla   v27.4s, %17.4s, v1.s[0]     \n"

                        "fmla   v24.4s, %18.4s, v0.s[2]     \n"
                        "fmla   v25.4s, %18.4s, v0.s[3]     \n"
                        "fmla   v26.4s, %18.4s, v1.s[0]     \n"
                        "fmla   v27.4s, %18.4s, v1.s[1]     \n"

                        "shrn   v24.4h, v24.4s, #16         \n"
                        "shrn   v25.4h, v25.4s, #16         \n"
                        "shrn   v26.4h, v26.4s, #16         \n"
                        "shrn   v27.4h, v27.4s, #16         \n"

                        "st1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%0], #32 \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2)            // %4
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00), // %10
                        "w"(_k01), // %11
                        "w"(_k02), // %12
                        "w"(_k10), // %13
                        "w"(_k11), // %14
                        "w"(_k12), // %15
                        "w"(_k20), // %16
                        "w"(_k21), // %17
                        "w"(_k22)  // %18
                        : "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%1, #512]          \n"
                        "vldm       %1!, {d24-d31}      \n"

                        "pld        [%2, #64]           \n"
                        "vld1.u16   {d1}, [%2]!         \n"
                        "vld1.u32   {d2[0]}, [%2]       \n"

                        "vshll.u16  q0, d1, #16         \n"
                        "vshll.u16  q1, d2, #16         \n"

                        "vmla.f32   q12, %q10, d0[0]    \n"
                        "vmla.f32   q13, %q10, d0[1]    \n"
                        "vmla.f32   q14, %q10, d1[0]    \n"
                        "vmla.f32   q15, %q10, d1[1]    \n"

                        "vmla.f32   q12, %q11, d0[1]    \n"
                        "vmla.f32   q13, %q11, d1[0]    \n"
                        "vmla.f32   q14, %q11, d1[1]    \n"
                        "vmla.f32   q15, %q11, d2[0]    \n"

                        "vmla.f32   q12, %q12, d1[0]    \n"
                        "vmla.f32   q13, %q12, d1[1]    \n"
                        "vmla.f32   q14, %q12, d2[0]    \n"
                        "vmla.f32   q15, %q12, d2[1]    \n"

                        "pld        [%3, #64]           \n"
                        "vld1.u16   {d5}, [%3]!         \n"
                        "vld1.u32   {d3[0]}, [%3]       \n"

                        "vshll.u16  q2, d5, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q12, %q13, d4[0]    \n"
                        "vmla.f32   q13, %q13, d4[1]    \n"
                        "vmla.f32   q14, %q13, d5[0]    \n"
                        "vmla.f32   q15, %q13, d5[1]    \n"

                        "vmla.f32   q12, %q14, d4[1]    \n"
                        "vmla.f32   q13, %q14, d5[0]    \n"
                        "vmla.f32   q14, %q14, d5[1]    \n"
                        "vmla.f32   q15, %q14, d2[0]    \n"

                        "vmla.f32   q12, %q15, d5[0]    \n"
                        "vmla.f32   q13, %q15, d5[1]    \n"
                        "vmla.f32   q14, %q15, d2[0]    \n"
                        "vmla.f32   q15, %q15, d2[1]    \n"

                        "pld        [%4, #64]           \n"
                        "vld1.u16   {d1}, [%4]!         \n"
                        "vld1.u32   {d2[0]}, [%4]       \n"

                        "vshll.u16  q0, d1, #16         \n"
                        "vshll.u16  q1, d2, #16         \n"

                        "vmla.f32   q12, %q16, d0[0]    \n"
                        "vmla.f32   q13, %q16, d0[1]    \n"
                        "vmla.f32   q14, %q16, d1[0]    \n"
                        "vmla.f32   q15, %q16, d1[1]    \n"

                        "vmla.f32   q12, %q17, d0[1]    \n"
                        "vmla.f32   q13, %q17, d1[0]    \n"
                        "vmla.f32   q14, %q17, d1[1]    \n"
                        "vmla.f32   q15, %q17, d2[0]    \n"

                        "vmla.f32   q12, %q18, d1[0]    \n"
                        "vmla.f32   q13, %q18, d1[1]    \n"
                        "vmla.f32   q14, %q18, d2[0]    \n"
                        "vmla.f32   q15, %q18, d2[1]    \n"

                        "vshrn.s32  d24, q12, #16       \n"
                        "vshrn.s32  d25, q13, #16       \n"
                        "vshrn.s32  d26, q14, #16       \n"
                        "vshrn.s32  d27, q15, #16       \n"

                        "vst1.u16   {d24-d27}, [%0 :64]! \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2)            // %4
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00), // %10
                        "w"(_k01), // %11
                        "w"(_k02), // %12
                        "w"(_k10), // %13
                        "w"(_k11), // %14
                        "w"(_k12), // %15
                        "w"(_k20), // %16
                        "w"(_k21), // %17
                        "w"(_k22)  // %18
                        : "memory", "q0", "q1", "q2", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v0.4h}, [%2]               \n"

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v28.4s, v29.4s}, [%1], #32 \n"

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmul   v24.4s, %10.4s, v0.s[0]     \n"
                        "fmul   v25.4s, %10.4s, v0.s[1]     \n"

                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v1.4h}, [%3]               \n"

                        "fmul   v26.4s, %11.4s, v0.s[1]     \n"
                        "fmul   v27.4s, %11.4s, v0.s[2]     \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v28.4s, %12.4s, v0.s[2]     \n"
                        "fmla   v29.4s, %12.4s, v0.s[3]     \n"

                        "fmla   v24.4s, %13.4s, v1.s[0]     \n"
                        "fmla   v25.4s, %13.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%4, #64]        \n"
                        "ld1    {v0.4h}, [%4]               \n"

                        "fmla   v26.4s, %14.4s, v1.s[1]     \n"
                        "fmla   v27.4s, %14.4s, v1.s[2]     \n"

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmla   v28.4s, %15.4s, v1.s[2]     \n"
                        "fmla   v29.4s, %15.4s, v1.s[3]     \n"

                        "fmla   v24.4s, %16.4s, v0.s[0]     \n"
                        "fmla   v25.4s, %16.4s, v0.s[1]     \n"
                        "fmla   v26.4s, %17.4s, v0.s[1]     \n"
                        "fmla   v27.4s, %17.4s, v0.s[2]     \n"

                        "fmla   v28.4s, %18.4s, v0.s[2]     \n"
                        "fmla   v29.4s, %18.4s, v0.s[3]     \n"

                        "add    %2, %2, #4                  \n"

                        "fadd   v24.4s, v24.4s, v26.4s      \n"
                        "fadd   v25.4s, v25.4s, v27.4s      \n"

                        "add    %3, %3, #4                  \n"

                        "fadd   v28.4s, v28.4s, v24.4s      \n"
                        "fadd   v29.4s, v29.4s, v25.4s      \n"

                        "add    %4, %4, #4                  \n"

                        "shrn   v28.4h, v28.4s, #16         \n"
                        "shrn   v29.4h, v29.4s, #16         \n"

                        "st1    {v28.4h, v29.4h}, [%0], #16 \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2)            // %4
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00), // %10
                        "w"(_k01), // %11
                        "w"(_k02), // %12
                        "w"(_k10), // %13
                        "w"(_k11), // %14
                        "w"(_k12), // %15
                        "w"(_k20), // %16
                        "w"(_k21), // %17
                        "w"(_k22)  // %18
                        : "memory", "v0", "v1", "v24", "v25", "v26", "v27", "v28", "v29");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%2, #64]           \n"
                        "vld1.u16   {d1}, [%2]          \n"

                        "pld        [%1, #256]          \n"
                        "vld1.f32   {d24-d27}, [%1 :128]! \n"

                        "vshll.u16  q0, d1, #16         \n"

                        "vmul.f32   q14, %q10, d0[0]    \n"
                        "vmul.f32   q15, %q10, d0[1]    \n"
                        "vmla.f32   q12, %q11, d0[1]    \n"
                        "vmla.f32   q13, %q11, d1[0]    \n"

                        "pld        [%3, #64]           \n"
                        "vld1.u16   {d3}, [%3]          \n"

                        "vmla.f32   q14, %q12, d1[0]    \n"
                        "vmla.f32   q15, %q12, d1[1]    \n"

                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q12, %q13, d2[0]    \n"
                        "vmla.f32   q13, %q13, d2[1]    \n"

                        "vmla.f32   q14, %q14, d2[1]    \n"
                        "vmla.f32   q15, %q14, d3[0]    \n"

                        "pld        [%4, #64]           \n"
                        "vld1.u16   {d1}, [%4]          \n"

                        "vmla.f32   q12, %q15, d3[0]    \n"
                        "vmla.f32   q13, %q15, d3[1]    \n"

                        "vshll.u16  q0, d1, #16         \n"

                        "vmla.f32   q14, %q16, d0[0]    \n"
                        "vmla.f32   q15, %q16, d0[1]    \n"

                        "vmla.f32   q12, %q17, d0[1]    \n"
                        "vmla.f32   q13, %q17, d1[0]    \n"

                        "add        %2, %2, #4          \n"

                        "vmla.f32   q14, %q18, d1[0]    \n"
                        "vmla.f32   q15, %q18, d1[1]    \n"

                        "add        %3, %3, #4          \n"

                        "vadd.f32   q12, q12, q14       \n"
                        "vadd.f32   q13, q13, q15       \n"

                        "add        %4, %4, #4          \n"

                        "vshrn.s32  d24, q12, #16       \n"
                        "vshrn.s32  d25, q13, #16       \n"

                        "vst1.f32   {d24-d25}, [%0 :64]! \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2)            // %4
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00), // %10
                        "w"(_k01), // %11
                        "w"(_k02), // %12
                        "w"(_k10), // %13
                        "w"(_k11), // %14
                        "w"(_k12), // %15
                        "w"(_k20), // %16
                        "w"(_k21), // %17
                        "w"(_k22)  // %18
                        : "memory", "q0", "q1", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
                    float32x4_t _sum0 = vld1q_f32(outptr0);

                    float32x4_t _r0 = bfloat2float(vld1_u16(r0));
                    float32x4_t _r1 = bfloat2float(vld1_u16(r1));
                    float32x4_t _r2 = bfloat2float(vld1_u16(r2));

#if __aarch64__
                    _sum0 = vfmaq_laneq_f32(_sum0, _k00, _r0, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k01, _r0, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k02, _r0, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k10, _r1, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k11, _r1, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k12, _r1, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k20, _r2, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k21, _r2, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k22, _r2, 2);
#else
                    _sum0 = vmlaq_lane_f32(_sum0, _k00, vget_low_f32(_r0), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k01, vget_low_f32(_r0), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k02, vget_high_f32(_r0), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k10, vget_low_f32(_r1), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k11, vget_low_f32(_r1), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k12, vget_high_f32(_r1), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k20, vget_low_f32(_r2), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k21, vget_low_f32(_r2), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k22, vget_high_f32(_r2), 0);
#endif

                    vst1_u16(outptr0_bf16, float2bfloat(_sum0));

                    r0 += 1;
                    r1 += 1;
                    r2 += 1;
                    outptr0 += 4;
                    outptr0_bf16 += 4;
                }

                r0 += 2;
                r1 += 2;
                r2 += 2;
            }

            k0 += 9 * 4;
        }
    }
}

static void conv3x3s2_pack1to4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

#if __ARM_NEON && __aarch64__
    Mat top_blob_fp32(outw, outh, opt.num_threads, (size_t)4u * 4 * 2, 4 * 2, opt.workspace_allocator);
#else
    Mat top_blob_fp32(outw, outh, opt.num_threads, (size_t)4u * 4, 4, opt.workspace_allocator);
#endif

    const int tailstep = w - 2 * outw + w;

    const float* bias = _bias;

    int remain_outch_start = 0;

#if __ARM_NEON && __aarch64__
    int nn_outch = 0;
    nn_outch = outch >> 1;
    remain_outch_start = nn_outch << 1;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 2;

        Mat out0 = top_blob_fp32.channel(get_omp_thread_num());

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
        float32x4_t _bias1 = bias ? vld1q_f32((const float*)bias + (p + 1) * 4) : vdupq_n_f32(0.f);
        {
            float* ptr = (float*)out0;

            for (int i = 0; i < outh; i++)
            {
                int j = 0;

                for (; j + 3 < outw; j += 4)
                {
                    vst1q_f32(ptr, _bias0);
                    vst1q_f32(ptr + 4, _bias0);
                    vst1q_f32(ptr + 8, _bias0);
                    vst1q_f32(ptr + 12, _bias0);
                    vst1q_f32(ptr + 16, _bias1);
                    vst1q_f32(ptr + 20, _bias1);
                    vst1q_f32(ptr + 24, _bias1);
                    vst1q_f32(ptr + 28, _bias1);
                    ptr += 32;
                }
                for (; j + 1 < outw; j += 2)
                {
                    vst1q_f32(ptr, _bias0);
                    vst1q_f32(ptr + 4, _bias0);
                    vst1q_f32(ptr + 8, _bias1);
                    vst1q_f32(ptr + 12, _bias1);
                    ptr += 16;
                }
                for (; j < outw; j++)
                {
                    vst1q_f32(ptr, _bias0);
                    vst1q_f32(ptr + 4, _bias1);
                    ptr += 8;
                }
            }
        }

        const unsigned short* k0 = kernel.channel(p);
        const unsigned short* k1 = kernel.channel(p + 1);

        int q = 0;
        for (; q < inch - 1; q++)
        {
            float* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            const unsigned short* r0 = img0.row<const unsigned short>(0);
            const unsigned short* r1 = img0.row<const unsigned short>(1);
            const unsigned short* r2 = img0.row<const unsigned short>(2);

            float32x4_t _k00_0 = bfloat2float(vld1_u16(k0));
            float32x4_t _k01_0 = bfloat2float(vld1_u16(k0 + 4));
            float32x4_t _k02_0 = bfloat2float(vld1_u16(k0 + 8));
            float32x4_t _k10_0 = bfloat2float(vld1_u16(k0 + 12));
            float32x4_t _k11_0 = bfloat2float(vld1_u16(k0 + 16));
            float32x4_t _k12_0 = bfloat2float(vld1_u16(k0 + 20));
            float32x4_t _k20_0 = bfloat2float(vld1_u16(k0 + 24));
            float32x4_t _k21_0 = bfloat2float(vld1_u16(k0 + 28));
            float32x4_t _k22_0 = bfloat2float(vld1_u16(k0 + 32));

            float32x4_t _k00_1 = bfloat2float(vld1_u16(k1));
            float32x4_t _k01_1 = bfloat2float(vld1_u16(k1 + 4));
            float32x4_t _k02_1 = bfloat2float(vld1_u16(k1 + 8));
            float32x4_t _k10_1 = bfloat2float(vld1_u16(k1 + 12));
            float32x4_t _k11_1 = bfloat2float(vld1_u16(k1 + 16));
            float32x4_t _k12_1 = bfloat2float(vld1_u16(k1 + 20));
            float32x4_t _k20_1 = bfloat2float(vld1_u16(k1 + 24));
            float32x4_t _k21_1 = bfloat2float(vld1_u16(k1 + 28));
            float32x4_t _k22_1 = bfloat2float(vld1_u16(k1 + 32));

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;

                for (; j + 3 < outw; j += 4)
                {
                    asm volatile(
                        // r0
                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%1], #16   \n"

                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v6.4s, v7.4s, v8.4s, v9.4s}, [%0], #64 \n" // sum0

                        "shll   v0.4s, v0.4h, #16           \n"

                        //                         "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%0] \n" // sum1

                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v6.4s, %8.4s, v0.s[0]       \n"
                        "fmla   v7.4s, %8.4s, v0.s[2]       \n"
                        "fmla   v8.4s, %8.4s, v1.s[0]       \n"
                        "fmla   v9.4s, %8.4s, v1.s[2]       \n"
                        "fmla   v10.4s, %17.4s, v0.s[0]     \n"
                        "fmla   v11.4s, %17.4s, v0.s[2]     \n"
                        "fmla   v12.4s, %17.4s, v1.s[0]     \n"
                        "fmla   v13.4s, %17.4s, v1.s[2]     \n"

                        "ld1    {v4.h}[0], [%1]             \n"

                        "fmla   v6.4s, %9.4s, v0.s[1]       \n"
                        "fmla   v7.4s, %9.4s, v0.s[3]       \n"
                        "fmla   v8.4s, %9.4s, v1.s[1]       \n"
                        "fmla   v9.4s, %9.4s, v1.s[3]       \n"
                        "fmla   v10.4s, %18.4s, v0.s[1]     \n"
                        "fmla   v11.4s, %18.4s, v0.s[3]     \n"
                        "fmla   v12.4s, %18.4s, v1.s[1]     \n"
                        "fmla   v13.4s, %18.4s, v1.s[3]     \n"

                        "shll   v4.4s, v4.4h, #16           \n"

                        // r1
                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v2.4h, v3.4h}, [%2], #16   \n"

                        "fmla   v6.4s, %10.4s, v0.s[2]      \n"
                        "fmla   v7.4s, %10.4s, v1.s[0]      \n"
                        "fmla   v8.4s, %10.4s, v1.s[2]      \n"
                        "fmla   v9.4s, %10.4s, v4.s[0]      \n"

                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v10.4s, %19.4s, v0.s[2]     \n"
                        "fmla   v11.4s, %19.4s, v1.s[0]     \n"
                        "fmla   v12.4s, %19.4s, v1.s[2]     \n"
                        "fmla   v13.4s, %19.4s, v4.s[0]     \n"

                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v6.4s, %11.4s, v2.s[0]      \n"
                        "fmla   v7.4s, %11.4s, v2.s[2]      \n"
                        "fmla   v8.4s, %11.4s, v3.s[0]      \n"
                        "fmla   v9.4s, %11.4s, v3.s[2]      \n"
                        "fmla   v10.4s, %20.4s, v2.s[0]     \n"
                        "fmla   v11.4s, %20.4s, v2.s[2]     \n"
                        "fmla   v12.4s, %20.4s, v3.s[0]     \n"
                        "fmla   v13.4s, %20.4s, v3.s[2]     \n"

                        "ld1    {v5.h}[0], [%2]             \n"

                        "fmla   v6.4s, %12.4s, v2.s[1]      \n"
                        "fmla   v7.4s, %12.4s, v2.s[3]      \n"
                        "fmla   v8.4s, %12.4s, v3.s[1]      \n"
                        "fmla   v9.4s, %12.4s, v3.s[3]      \n"

                        "shll   v5.4s, v5.4h, #16           \n"

                        "fmla   v10.4s, %21.4s, v2.s[1]     \n"
                        "fmla   v11.4s, %21.4s, v2.s[3]     \n"
                        "fmla   v12.4s, %21.4s, v3.s[1]     \n"
                        "fmla   v13.4s, %21.4s, v3.s[3]     \n"

                        // r2
                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%3], #16   \n"

                        "fmla   v6.4s, %13.4s, v2.s[2]      \n"
                        "fmla   v7.4s, %13.4s, v3.s[0]      \n"
                        "fmla   v8.4s, %13.4s, v3.s[2]      \n"
                        "fmla   v9.4s, %13.4s, v5.s[0]      \n"

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmla   v10.4s, %22.4s, v2.s[2]     \n"
                        "fmla   v11.4s, %22.4s, v3.s[0]     \n"
                        "fmla   v12.4s, %22.4s, v3.s[2]     \n"
                        "fmla   v13.4s, %22.4s, v5.s[0]     \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v6.4s, %14.4s, v0.s[0]      \n"
                        "fmla   v7.4s, %14.4s, v0.s[2]      \n"
                        "fmla   v8.4s, %14.4s, v1.s[0]      \n"
                        "fmla   v9.4s, %14.4s, v1.s[2]      \n"
                        "fmla   v10.4s, %23.4s, v0.s[0]     \n"
                        "fmla   v11.4s, %23.4s, v0.s[2]     \n"
                        "fmla   v12.4s, %23.4s, v1.s[0]     \n"
                        "fmla   v13.4s, %23.4s, v1.s[2]     \n"

                        "ld1    {v4.h}[0], [%3]             \n"

                        "fmla   v6.4s, %15.4s, v0.s[1]      \n"
                        "fmla   v7.4s, %15.4s, v0.s[3]      \n"
                        "fmla   v8.4s, %15.4s, v1.s[1]      \n"
                        "fmla   v9.4s, %15.4s, v1.s[3]      \n"
                        "fmla   v10.4s, %24.4s, v0.s[1]     \n"
                        "fmla   v11.4s, %24.4s, v0.s[3]     \n"
                        "fmla   v12.4s, %24.4s, v1.s[1]     \n"
                        "fmla   v13.4s, %24.4s, v1.s[3]     \n"

                        "shll   v4.4s, v4.4h, #16           \n"

                        "fmla   v6.4s, %16.4s, v0.s[2]      \n"
                        "fmla   v7.4s, %16.4s, v1.s[0]      \n"
                        "fmla   v8.4s, %16.4s, v1.s[2]      \n"
                        "fmla   v9.4s, %16.4s, v4.s[0]      \n"

                        "sub    %0, %0, #64                 \n"

                        "fmla   v10.4s, %25.4s, v0.s[2]     \n"
                        "fmla   v11.4s, %25.4s, v1.s[0]     \n"
                        "fmla   v12.4s, %25.4s, v1.s[2]     \n"
                        "fmla   v13.4s, %25.4s, v4.s[0]     \n"

                        "st1    {v6.4s, v7.4s, v8.4s, v9.4s}, [%0], #64 \n"
                        "st1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00_0), // %8
                        "w"(_k01_0), // %9
                        "w"(_k02_0), // %10
                        "w"(_k10_0), // %11
                        "w"(_k11_0), // %12
                        "w"(_k12_0), // %13
                        "w"(_k20_0), // %14
                        "w"(_k21_0), // %15
                        "w"(_k22_0), // %16
                        "w"(_k00_1), // %17
                        "w"(_k01_1), // %18
                        "w"(_k02_1), // %19
                        "w"(_k10_1), // %20
                        "w"(_k11_1), // %21
                        "w"(_k12_1), // %22
                        "w"(_k20_1), // %23
                        "w"(_k21_1), // %24
                        "w"(_k22_1)  // %25
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13");
                }
                for (; j + 1 < outw; j += 2)
                {
                    asm volatile(
                        // r0
                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ld1    {v0.4h}, [%1], #8           \n"

                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%0] \n" // sum0 sum1

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmla   v10.4s, %8.4s, v0.s[0]      \n"
                        "fmla   v11.4s, %8.4s, v0.s[2]      \n"
                        "fmla   v12.4s, %17.4s, v0.s[0]     \n"
                        "fmla   v13.4s, %17.4s, v0.s[2]     \n"

                        "ld1    {v1.h}[0], [%1]             \n"

                        "fmla   v10.4s, %9.4s, v0.s[1]      \n"
                        "fmla   v11.4s, %9.4s, v0.s[3]      \n"
                        "fmla   v12.4s, %18.4s, v0.s[1]     \n"
                        "fmla   v13.4s, %18.4s, v0.s[3]     \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        // r1
                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v2.4h}, [%2], #8           \n"

                        "fmla   v10.4s, %10.4s, v0.s[2]     \n"
                        "fmla   v11.4s, %10.4s, v1.s[0]     \n"
                        "fmla   v12.4s, %19.4s, v0.s[2]     \n"
                        "fmla   v13.4s, %19.4s, v1.s[0]     \n"

                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v10.4s, %11.4s, v2.s[0]     \n"
                        "fmla   v11.4s, %11.4s, v2.s[2]     \n"
                        "fmla   v12.4s, %20.4s, v2.s[0]     \n"
                        "fmla   v13.4s, %20.4s, v2.s[2]     \n"

                        "ld1    {v3.h}[0], [%2]             \n"

                        "fmla   v10.4s, %12.4s, v2.s[1]     \n"
                        "fmla   v11.4s, %12.4s, v2.s[3]     \n"
                        "fmla   v12.4s, %21.4s, v2.s[1]     \n"
                        "fmla   v13.4s, %21.4s, v2.s[3]     \n"

                        "shll   v3.4s, v3.4h, #16           \n"

                        // r2
                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v0.4h}, [%3], #8           \n"

                        "fmla   v10.4s, %13.4s, v2.s[2]     \n"
                        "fmla   v11.4s, %13.4s, v3.s[0]     \n"
                        "fmla   v12.4s, %22.4s, v2.s[2]     \n"
                        "fmla   v13.4s, %22.4s, v3.s[0]     \n"

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmla   v10.4s, %14.4s, v0.s[0]     \n"
                        "fmla   v11.4s, %14.4s, v0.s[2]     \n"
                        "fmla   v12.4s, %23.4s, v0.s[0]     \n"
                        "fmla   v13.4s, %23.4s, v0.s[2]     \n"

                        "ld1    {v1.h}[0], [%3]             \n"

                        "fmla   v10.4s, %15.4s, v0.s[1]     \n"
                        "fmla   v11.4s, %15.4s, v0.s[3]     \n"
                        "fmla   v12.4s, %24.4s, v0.s[1]     \n"
                        "fmla   v13.4s, %24.4s, v0.s[3]     \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v10.4s, %16.4s, v0.s[2]     \n"
                        "fmla   v11.4s, %16.4s, v1.s[0]     \n"
                        "fmla   v12.4s, %25.4s, v0.s[2]     \n"
                        "fmla   v13.4s, %25.4s, v1.s[0]     \n"

                        "st1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00_0), // %8
                        "w"(_k01_0), // %9
                        "w"(_k02_0), // %10
                        "w"(_k10_0), // %11
                        "w"(_k11_0), // %12
                        "w"(_k12_0), // %13
                        "w"(_k20_0), // %14
                        "w"(_k21_0), // %15
                        "w"(_k22_0), // %16
                        "w"(_k00_1), // %17
                        "w"(_k01_1), // %18
                        "w"(_k02_1), // %19
                        "w"(_k10_1), // %20
                        "w"(_k11_1), // %21
                        "w"(_k12_1), // %22
                        "w"(_k20_1), // %23
                        "w"(_k21_1), // %24
                        "w"(_k22_1)  // %25
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13");
                }
                for (; j < outw; j++)
                {
                    float32x4_t _sum0 = vld1q_f32(outptr0);
                    float32x4_t _sum1 = vld1q_f32(outptr0 + 4);

                    float32x4_t _r0 = bfloat2float(vld1_u16(r0));
                    float32x4_t _r1 = bfloat2float(vld1_u16(r1));
                    float32x4_t _r2 = bfloat2float(vld1_u16(r2));

                    _sum0 = vfmaq_laneq_f32(_sum0, _k00_0, _r0, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k01_0, _r0, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k02_0, _r0, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k10_0, _r1, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k11_0, _r1, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k12_0, _r1, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k20_0, _r2, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k21_0, _r2, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k22_0, _r2, 2);

                    _sum1 = vfmaq_laneq_f32(_sum1, _k00_1, _r0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k01_1, _r0, 1);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k02_1, _r0, 2);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k10_1, _r1, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k11_1, _r1, 1);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k12_1, _r1, 2);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k20_1, _r2, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k21_1, _r2, 1);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k22_1, _r2, 2);

                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    outptr0 += 8;
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            k0 += 9 * 4;
            k1 += 9 * 4;
        }
        for (; q < inch; q++)
        {
            unsigned short* outptr0_bf16 = top_blob.channel(p);
            unsigned short* outptr1_bf16 = top_blob.channel(p + 1);

            const float* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            const unsigned short* r0 = img0.row<const unsigned short>(0);
            const unsigned short* r1 = img0.row<const unsigned short>(1);
            const unsigned short* r2 = img0.row<const unsigned short>(2);

            float32x4_t _k00_0 = bfloat2float(vld1_u16(k0));
            float32x4_t _k01_0 = bfloat2float(vld1_u16(k0 + 4));
            float32x4_t _k02_0 = bfloat2float(vld1_u16(k0 + 8));
            float32x4_t _k10_0 = bfloat2float(vld1_u16(k0 + 12));
            float32x4_t _k11_0 = bfloat2float(vld1_u16(k0 + 16));
            float32x4_t _k12_0 = bfloat2float(vld1_u16(k0 + 20));
            float32x4_t _k20_0 = bfloat2float(vld1_u16(k0 + 24));
            float32x4_t _k21_0 = bfloat2float(vld1_u16(k0 + 28));
            float32x4_t _k22_0 = bfloat2float(vld1_u16(k0 + 32));

            float32x4_t _k00_1 = bfloat2float(vld1_u16(k1));
            float32x4_t _k01_1 = bfloat2float(vld1_u16(k1 + 4));
            float32x4_t _k02_1 = bfloat2float(vld1_u16(k1 + 8));
            float32x4_t _k10_1 = bfloat2float(vld1_u16(k1 + 12));
            float32x4_t _k11_1 = bfloat2float(vld1_u16(k1 + 16));
            float32x4_t _k12_1 = bfloat2float(vld1_u16(k1 + 20));
            float32x4_t _k20_1 = bfloat2float(vld1_u16(k1 + 24));
            float32x4_t _k21_1 = bfloat2float(vld1_u16(k1 + 28));
            float32x4_t _k22_1 = bfloat2float(vld1_u16(k1 + 32));

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;

                for (; j + 3 < outw; j += 4)
                {
                    asm volatile(
                        // r0
                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%3], #16   \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v6.4s, v7.4s, v8.4s, v9.4s}, [%2], #64 \n" // sum0

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%2], #64 \n" // sum1

                        "fmla   v6.4s, %12.4s, v0.s[0]      \n"
                        "fmla   v7.4s, %12.4s, v0.s[2]      \n"
                        "fmla   v8.4s, %12.4s, v1.s[0]      \n"
                        "fmla   v9.4s, %12.4s, v1.s[2]      \n"
                        "fmla   v10.4s, %21.4s, v0.s[0]     \n"
                        "fmla   v11.4s, %21.4s, v0.s[2]     \n"
                        "fmla   v12.4s, %21.4s, v1.s[0]     \n"
                        "fmla   v13.4s, %21.4s, v1.s[2]     \n"

                        "ld1    {v4.h}[0], [%3]             \n"

                        "fmla   v6.4s, %13.4s, v0.s[1]      \n"
                        "fmla   v7.4s, %13.4s, v0.s[3]      \n"
                        "fmla   v8.4s, %13.4s, v1.s[1]      \n"
                        "fmla   v9.4s, %13.4s, v1.s[3]      \n"

                        "shll   v4.4s, v4.4h, #16           \n"

                        "fmla   v10.4s, %22.4s, v0.s[1]     \n"
                        "fmla   v11.4s, %22.4s, v0.s[3]     \n"
                        "fmla   v12.4s, %22.4s, v1.s[1]     \n"
                        "fmla   v13.4s, %22.4s, v1.s[3]     \n"

                        // r1
                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v2.4h, v3.4h}, [%4], #16   \n"

                        "fmla   v6.4s, %14.4s, v0.s[2]      \n"
                        "fmla   v7.4s, %14.4s, v1.s[0]      \n"
                        "fmla   v8.4s, %14.4s, v1.s[2]      \n"
                        "fmla   v9.4s, %14.4s, v4.s[0]      \n"

                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v10.4s, %23.4s, v0.s[2]     \n"
                        "fmla   v11.4s, %23.4s, v1.s[0]     \n"
                        "fmla   v12.4s, %23.4s, v1.s[2]     \n"
                        "fmla   v13.4s, %23.4s, v4.s[0]     \n"

                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v6.4s, %15.4s, v2.s[0]      \n"
                        "fmla   v7.4s, %15.4s, v2.s[2]      \n"
                        "fmla   v8.4s, %15.4s, v3.s[0]      \n"
                        "fmla   v9.4s, %15.4s, v3.s[2]      \n"
                        "fmla   v10.4s, %24.4s, v2.s[0]     \n"
                        "fmla   v11.4s, %24.4s, v2.s[2]     \n"
                        "fmla   v12.4s, %24.4s, v3.s[0]     \n"
                        "fmla   v13.4s, %24.4s, v3.s[2]     \n"

                        "ld1    {v5.h}[0], [%4]             \n"

                        "fmla   v6.4s, %16.4s, v2.s[1]      \n"
                        "fmla   v7.4s, %16.4s, v2.s[3]      \n"
                        "fmla   v8.4s, %16.4s, v3.s[1]      \n"
                        "fmla   v9.4s, %16.4s, v3.s[3]      \n"

                        "shll   v5.4s, v5.4h, #16           \n"

                        "fmla   v10.4s, %25.4s, v2.s[1]     \n"
                        "fmla   v11.4s, %25.4s, v2.s[3]     \n"
                        "fmla   v12.4s, %25.4s, v3.s[1]     \n"
                        "fmla   v13.4s, %25.4s, v3.s[3]     \n"

                        // r2
                        "prfm   pldl1keep, [%5, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%5], #16   \n"

                        "fmla   v6.4s, %17.4s, v2.s[2]      \n"
                        "fmla   v7.4s, %17.4s, v3.s[0]      \n"
                        "fmla   v8.4s, %17.4s, v3.s[2]      \n"
                        "fmla   v9.4s, %17.4s, v5.s[0]      \n"

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmla   v10.4s, %26.4s, v2.s[2]     \n"
                        "fmla   v11.4s, %26.4s, v3.s[0]     \n"
                        "fmla   v12.4s, %26.4s, v3.s[2]     \n"
                        "fmla   v13.4s, %26.4s, v5.s[0]     \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v6.4s, %18.4s, v0.s[0]      \n"
                        "fmla   v7.4s, %18.4s, v0.s[2]      \n"
                        "fmla   v8.4s, %18.4s, v1.s[0]      \n"
                        "fmla   v9.4s, %18.4s, v1.s[2]      \n"
                        "fmla   v10.4s, %27.4s, v0.s[0]     \n"
                        "fmla   v11.4s, %27.4s, v0.s[2]     \n"
                        "fmla   v12.4s, %27.4s, v1.s[0]     \n"
                        "fmla   v13.4s, %27.4s, v1.s[2]     \n"

                        "ld1    {v4.h}[0], [%5]             \n"

                        "fmla   v6.4s, %19.4s, v0.s[1]      \n"
                        "fmla   v7.4s, %19.4s, v0.s[3]      \n"
                        "fmla   v8.4s, %19.4s, v1.s[1]      \n"
                        "fmla   v9.4s, %19.4s, v1.s[3]      \n"
                        "fmla   v10.4s, %28.4s, v0.s[1]     \n"
                        "fmla   v11.4s, %28.4s, v0.s[3]     \n"
                        "fmla   v12.4s, %28.4s, v1.s[1]     \n"
                        "fmla   v13.4s, %28.4s, v1.s[3]     \n"

                        "shll   v4.4s, v4.4h, #16           \n"

                        "fmla   v6.4s, %20.4s, v0.s[2]      \n"
                        "fmla   v7.4s, %20.4s, v1.s[0]      \n"
                        "fmla   v8.4s, %20.4s, v1.s[2]      \n"
                        "fmla   v9.4s, %20.4s, v4.s[0]      \n"
                        "fmla   v10.4s, %29.4s, v0.s[2]     \n"
                        "fmla   v11.4s, %29.4s, v1.s[0]     \n"
                        "fmla   v12.4s, %29.4s, v1.s[2]     \n"
                        "fmla   v13.4s, %29.4s, v4.s[0]     \n"

                        "shrn   v6.4h, v6.4s, #16           \n"
                        "shrn   v7.4h, v7.4s, #16           \n"
                        "shrn   v8.4h, v8.4s, #16           \n"
                        "shrn   v9.4h, v9.4s, #16           \n"
                        "shrn   v10.4h, v10.4s, #16         \n"
                        "shrn   v11.4h, v11.4s, #16         \n"

                        "st1    {v6.4h, v7.4h, v8.4h, v9.4h}, [%0], #32 \n"

                        "shrn   v12.4h, v12.4s, #16         \n"
                        "shrn   v13.4h, v13.4s, #16         \n"

                        "st1    {v10.4h, v11.4h, v12.4h, v13.4h}, [%1], #32 \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr1_bf16), // %1
                        "=r"(outptr0),      // %2
                        "=r"(r0),           // %3
                        "=r"(r1),           // %4
                        "=r"(r2)            // %5
                        : "0"(outptr0_bf16),
                        "1"(outptr1_bf16),
                        "2"(outptr0),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "w"(_k00_0), // %12
                        "w"(_k01_0), // %13
                        "w"(_k02_0), // %14
                        "w"(_k10_0), // %15
                        "w"(_k11_0), // %16
                        "w"(_k12_0), // %17
                        "w"(_k20_0), // %18
                        "w"(_k21_0), // %19
                        "w"(_k22_0), // %20
                        "w"(_k00_1), // %21
                        "w"(_k01_1), // %22
                        "w"(_k02_1), // %23
                        "w"(_k10_1), // %24
                        "w"(_k11_1), // %25
                        "w"(_k12_1), // %26
                        "w"(_k20_1), // %27
                        "w"(_k21_1), // %28
                        "w"(_k22_1)  // %29
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13");
                }
                for (; j + 1 < outw; j += 2)
                {
                    asm volatile(
                        // r0
                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v0.4h}, [%3], #8           \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%2], #64 \n" // sum0 sum1

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmla   v10.4s, %12.4s, v0.s[0]     \n"
                        "fmla   v11.4s, %12.4s, v0.s[2]     \n"
                        "fmla   v12.4s, %21.4s, v0.s[0]     \n"
                        "fmla   v13.4s, %21.4s, v0.s[2]     \n"

                        "ld1    {v1.h}[0], [%3]             \n"

                        "fmla   v10.4s, %13.4s, v0.s[1]     \n"
                        "fmla   v11.4s, %13.4s, v0.s[3]     \n"
                        "fmla   v12.4s, %22.4s, v0.s[1]     \n"
                        "fmla   v13.4s, %22.4s, v0.s[3]     \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        // r1
                        "prfm   pldl1keep, [%4, #64]        \n"
                        "ld1    {v2.4h}, [%4], #8           \n"

                        "fmla   v10.4s, %14.4s, v0.s[2]     \n"
                        "fmla   v11.4s, %14.4s, v1.s[0]     \n"
                        "fmla   v12.4s, %23.4s, v0.s[2]     \n"
                        "fmla   v13.4s, %23.4s, v1.s[0]     \n"

                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v10.4s, %15.4s, v2.s[0]     \n"
                        "fmla   v11.4s, %15.4s, v2.s[2]     \n"
                        "fmla   v12.4s, %24.4s, v2.s[0]     \n"
                        "fmla   v13.4s, %24.4s, v2.s[2]     \n"

                        "ld1    {v3.h}[0], [%4]             \n"

                        "fmla   v10.4s, %16.4s, v2.s[1]     \n"
                        "fmla   v11.4s, %16.4s, v2.s[3]     \n"
                        "fmla   v12.4s, %25.4s, v2.s[1]     \n"
                        "fmla   v13.4s, %25.4s, v2.s[3]     \n"

                        "shll   v3.4s, v3.4h, #16           \n"

                        // r2
                        "prfm   pldl1keep, [%5, #64]        \n"
                        "ld1    {v0.4h}, [%5], #8           \n"

                        "fmla   v10.4s, %17.4s, v2.s[2]     \n"
                        "fmla   v11.4s, %17.4s, v3.s[0]     \n"
                        "fmla   v12.4s, %26.4s, v2.s[2]     \n"
                        "fmla   v13.4s, %26.4s, v3.s[0]     \n"

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmla   v10.4s, %18.4s, v0.s[0]     \n"
                        "fmla   v11.4s, %18.4s, v0.s[2]     \n"
                        "fmla   v12.4s, %27.4s, v0.s[0]     \n"
                        "fmla   v13.4s, %27.4s, v0.s[2]     \n"

                        "ld1    {v1.h}[0], [%5]             \n"

                        "fmla   v10.4s, %19.4s, v0.s[1]     \n"
                        "fmla   v11.4s, %19.4s, v0.s[3]     \n"
                        "fmla   v12.4s, %28.4s, v0.s[1]     \n"
                        "fmla   v13.4s, %28.4s, v0.s[3]     \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v10.4s, %20.4s, v0.s[2]     \n"
                        "fmla   v11.4s, %20.4s, v1.s[0]     \n"
                        "fmla   v12.4s, %29.4s, v0.s[2]     \n"
                        "fmla   v13.4s, %29.4s, v1.s[0]     \n"

                        "shrn   v10.4h, v10.4s, #16         \n"
                        "shrn   v11.4h, v11.4s, #16         \n"
                        "shrn   v12.4h, v12.4s, #16         \n"
                        "shrn   v13.4h, v13.4s, #16         \n"

                        "st1    {v10.4h, v11.4h}, [%0], #16 \n"
                        "st1    {v12.4h, v13.4h}, [%1], #16 \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr1_bf16), // %1
                        "=r"(outptr0),      // %2
                        "=r"(r0),           // %3
                        "=r"(r1),           // %4
                        "=r"(r2)            // %5
                        : "0"(outptr0_bf16),
                        "1"(outptr1_bf16),
                        "2"(outptr0),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "w"(_k00_0), // %12
                        "w"(_k01_0), // %13
                        "w"(_k02_0), // %14
                        "w"(_k10_0), // %15
                        "w"(_k11_0), // %16
                        "w"(_k12_0), // %17
                        "w"(_k20_0), // %18
                        "w"(_k21_0), // %19
                        "w"(_k22_0), // %20
                        "w"(_k00_1), // %21
                        "w"(_k01_1), // %22
                        "w"(_k02_1), // %23
                        "w"(_k10_1), // %24
                        "w"(_k11_1), // %25
                        "w"(_k12_1), // %26
                        "w"(_k20_1), // %27
                        "w"(_k21_1), // %28
                        "w"(_k22_1)  // %29
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13");
                }
                for (; j < outw; j++)
                {
                    float32x4_t _sum0 = vld1q_f32(outptr0);
                    float32x4_t _sum1 = vld1q_f32(outptr0 + 4);

                    float32x4_t _r0 = bfloat2float(vld1_u16(r0));
                    float32x4_t _r1 = bfloat2float(vld1_u16(r1));
                    float32x4_t _r2 = bfloat2float(vld1_u16(r2));

                    _sum0 = vfmaq_laneq_f32(_sum0, _k00_0, _r0, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k01_0, _r0, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k02_0, _r0, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k10_0, _r1, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k11_0, _r1, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k12_0, _r1, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k20_0, _r2, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k21_0, _r2, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k22_0, _r2, 2);

                    _sum1 = vfmaq_laneq_f32(_sum1, _k00_1, _r0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k01_1, _r0, 1);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k02_1, _r0, 2);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k10_1, _r1, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k11_1, _r1, 1);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k12_1, _r1, 2);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k20_1, _r2, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k21_1, _r2, 1);
                    _sum1 = vfmaq_laneq_f32(_sum1, _k22_1, _r2, 2);

                    vst1_u16(outptr0_bf16, float2bfloat(_sum0));
                    vst1_u16(outptr1_bf16, float2bfloat(_sum1));

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    outptr0 += 8;
                    outptr0_bf16 += 4;
                    outptr1_bf16 += 4;
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            k0 += 9 * 4;
            k1 += 9 * 4;
        }
    }
#endif // __ARM_NEON && __aarch64__

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        Mat out0 = top_blob_fp32.channel(get_omp_thread_num());

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
        out0.fill(_bias0);

        const unsigned short* k0 = kernel.channel(p);

        int q = 0;
        for (; q < inch - 1; q++)
        {
            float* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            const unsigned short* r0 = img0.row<const unsigned short>(0);
            const unsigned short* r1 = img0.row<const unsigned short>(1);
            const unsigned short* r2 = img0.row<const unsigned short>(2);

            float32x4_t _k00 = bfloat2float(vld1_u16(k0));
            float32x4_t _k01 = bfloat2float(vld1_u16(k0 + 4));
            float32x4_t _k02 = bfloat2float(vld1_u16(k0 + 8));
            float32x4_t _k10 = bfloat2float(vld1_u16(k0 + 12));
            float32x4_t _k11 = bfloat2float(vld1_u16(k0 + 16));
            float32x4_t _k12 = bfloat2float(vld1_u16(k0 + 20));
            float32x4_t _k20 = bfloat2float(vld1_u16(k0 + 24));
            float32x4_t _k21 = bfloat2float(vld1_u16(k0 + 28));
            float32x4_t _k22 = bfloat2float(vld1_u16(k0 + 32));

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;

                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    asm volatile(
                        // r0
                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%1], #16   \n"

                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v6.4s, v7.4s, v8.4s, v9.4s}, [%0] \n" // sum0

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v6.4s, %8.4s, v0.s[0]       \n"
                        "fmla   v7.4s, %8.4s, v0.s[2]       \n"
                        "fmla   v8.4s, %8.4s, v1.s[0]       \n"
                        "fmla   v9.4s, %8.4s, v1.s[2]       \n"

                        "ld1    {v4.h}[0], [%1]             \n"

                        "fmla   v6.4s, %9.4s, v0.s[1]       \n"
                        "fmla   v7.4s, %9.4s, v0.s[3]       \n"
                        "fmla   v8.4s, %9.4s, v1.s[1]       \n"
                        "fmla   v9.4s, %9.4s, v1.s[3]       \n"

                        "shll   v4.4s, v4.4h, #16           \n"

                        // r1
                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v2.4h, v3.4h}, [%2], #16   \n"

                        "fmla   v6.4s, %10.4s, v0.s[2]      \n"
                        "fmla   v7.4s, %10.4s, v1.s[0]      \n"
                        "fmla   v8.4s, %10.4s, v1.s[2]      \n"
                        "fmla   v9.4s, %10.4s, v4.s[0]      \n"

                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v6.4s, %11.4s, v2.s[0]      \n"
                        "fmla   v7.4s, %11.4s, v2.s[2]      \n"
                        "fmla   v8.4s, %11.4s, v3.s[0]      \n"
                        "fmla   v9.4s, %11.4s, v3.s[2]      \n"

                        "ld1    {v5.h}[0], [%2]             \n"

                        "fmla   v6.4s, %12.4s, v2.s[1]      \n"
                        "fmla   v7.4s, %12.4s, v2.s[3]      \n"
                        "fmla   v8.4s, %12.4s, v3.s[1]      \n"
                        "fmla   v9.4s, %12.4s, v3.s[3]      \n"

                        "shll   v5.4s, v5.4h, #16           \n"

                        // r2
                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%3], #16   \n"

                        "fmla   v6.4s, %13.4s, v2.s[2]      \n"
                        "fmla   v7.4s, %13.4s, v3.s[0]      \n"
                        "fmla   v8.4s, %13.4s, v3.s[2]      \n"
                        "fmla   v9.4s, %13.4s, v5.s[0]      \n"

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v6.4s, %14.4s, v0.s[0]      \n"
                        "fmla   v7.4s, %14.4s, v0.s[2]      \n"
                        "fmla   v8.4s, %14.4s, v1.s[0]      \n"
                        "fmla   v9.4s, %14.4s, v1.s[2]      \n"

                        "ld1    {v4.h}[0], [%3]             \n"

                        "fmla   v6.4s, %15.4s, v0.s[1]      \n"
                        "fmla   v7.4s, %15.4s, v0.s[3]      \n"
                        "fmla   v8.4s, %15.4s, v1.s[1]      \n"
                        "fmla   v9.4s, %15.4s, v1.s[3]      \n"

                        "shll   v4.4s, v4.4h, #16           \n"

                        "fmla   v6.4s, %16.4s, v0.s[2]      \n"
                        "fmla   v7.4s, %16.4s, v1.s[0]      \n"
                        "fmla   v8.4s, %16.4s, v1.s[2]      \n"
                        "fmla   v9.4s, %16.4s, v4.s[0]      \n"

                        "st1    {v6.4s, v7.4s, v8.4s, v9.4s}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9");
#else  // __aarch64__
                    asm volatile(
                        // r0
                        "pld        [%1, #128]          \n"
                        "vld1.u16   {d12-d13}, [%1]!    \n"

                        "pld        [%0, #512]          \n"
                        "vldm       %0, {d0-d7}         \n" // sum0

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"

                        "vld1.u16   {d12[0]}, [%1]      \n"

                        "vmla.f32   q0, %q8, d8[0]      \n"
                        "vmla.f32   q1, %q8, d9[0]      \n"

                        "vmla.f32   q2, %q8, d10[0]     \n"
                        "vmla.f32   q3, %q8, d11[0]     \n"

                        "vmla.f32   q0, %q9, d8[1]      \n"
                        "vmla.f32   q1, %q9, d9[1]      \n"

                        "vshl.u32   d8, d12, #16        \n"

                        "vmla.f32   q2, %q9, d10[1]     \n"
                        "vmla.f32   q3, %q9, d11[1]     \n"

                        // r1
                        "pld        [%2, #128]          \n"
                        "vld1.u16   {d12-d13}, [%2]!    \n"

                        "vmla.f32   q0, %q10, d9[0]     \n"
                        "vmla.f32   q1, %q10, d10[0]    \n"
                        "vmla.f32   q2, %q10, d11[0]    \n"
                        "vmla.f32   q3, %q10, d8[0]     \n"

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"

                        "vld1.u16   {d12[0]}, [%2]      \n"

                        "vmla.f32   q0, %q11, d8[0]     \n"
                        "vmla.f32   q1, %q11, d9[0]     \n"

                        "vmla.f32   q2, %q11, d10[0]    \n"
                        "vmla.f32   q3, %q11, d11[0]    \n"

                        "vmla.f32   q0, %q12, d8[1]     \n"
                        "vmla.f32   q1, %q12, d9[1]     \n"

                        "vshl.u32   d8, d12, #16        \n"

                        "vmla.f32   q2, %q12, d10[1]    \n"
                        "vmla.f32   q3, %q12, d11[1]    \n"

                        // r2
                        "pld        [%3, #128]          \n"
                        "vld1.u16   {d12-d13}, [%3]!    \n"

                        "vmla.f32   q0, %q13, d9[0]     \n"
                        "vmla.f32   q1, %q13, d10[0]    \n"
                        "vmla.f32   q2, %q13, d11[0]    \n"
                        "vmla.f32   q3, %q13, d8[0]     \n"

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"

                        "vld1.u16   {d12[0]}, [%3]      \n"

                        "vmla.f32   q0, %q14, d8[0]     \n"
                        "vmla.f32   q1, %q14, d9[0]     \n"

                        "vmla.f32   q2, %q14, d10[0]    \n"
                        "vmla.f32   q3, %q14, d11[0]    \n"

                        "vmla.f32   q0, %q15, d8[1]     \n"
                        "vmla.f32   q1, %q15, d9[1]     \n"

                        "vshl.u32   d8, d12, #16        \n"

                        "vmla.f32   q2, %q15, d10[1]    \n"
                        "vmla.f32   q3, %q15, d11[1]    \n"

                        "vmla.f32   q0, %q16, d9[0]     \n"
                        "vmla.f32   q1, %q16, d10[0]    \n"
                        "vmla.f32   q2, %q16, d11[0]    \n"
                        "vmla.f32   q3, %q16, d8[0]     \n"

                        "vstm       %0!, {d0-d7}        \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
#endif // __aarch64__
                }
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    asm volatile(
                        // r0
                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ld1    {v0.4h}, [%1], #8           \n"

                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%0]        \n" // sum0

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmul   v6.4s, %8.4s, v0.s[0]       \n"
                        "fmul   v7.4s, %8.4s, v0.s[2]       \n"

                        "ld1    {v1.h}[0], [%1]             \n"

                        "fmla   v8.4s, %9.4s, v0.s[1]       \n"
                        "fmla   v9.4s, %9.4s, v0.s[3]       \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        // r1
                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v2.4h}, [%2], #8           \n"

                        "fmla   v6.4s, %10.4s, v0.s[2]      \n"
                        "fmla   v7.4s, %10.4s, v1.s[0]      \n"

                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v8.4s, %11.4s, v2.s[0]      \n"
                        "fmla   v9.4s, %11.4s, v2.s[2]      \n"

                        "ld1    {v3.h}[0], [%2]             \n"

                        "fmla   v6.4s, %12.4s, v2.s[1]      \n"
                        "fmla   v7.4s, %12.4s, v2.s[3]      \n"

                        "shll   v3.4s, v3.4h, #16           \n"

                        // r2
                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v0.4h}, [%3], #8           \n"

                        "fmla   v8.4s, %13.4s, v2.s[2]      \n"
                        "fmla   v9.4s, %13.4s, v3.s[0]      \n"

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmla   v6.4s, %14.4s, v0.s[0]      \n"
                        "fmla   v7.4s, %14.4s, v0.s[2]      \n"

                        "ld1    {v1.h}[0], [%3]             \n"

                        "fmla   v8.4s, %15.4s, v0.s[1]      \n"
                        "fmla   v9.4s, %15.4s, v0.s[3]      \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v6.4s, %16.4s, v0.s[2]      \n"
                        "fmla   v7.4s, %16.4s, v1.s[0]      \n"

                        "fadd   v8.4s, v8.4s, v6.4s         \n"
                        "fadd   v9.4s, v9.4s, v7.4s         \n"

                        "st1    {v8.4s, v9.4s}, [%0], #32   \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9");
#else  // __aarch64__
                    asm volatile(
                        // r0
                        "pld        [%1, #64]           \n"
                        "vld1.u16   {d9}, [%1]!         \n"

                        "pld        [%0, #256]          \n"
                        "vld1.f32   {d4-d7}, [%0]       \n" // sum0

                        "vshll.u16  q4, d9, #16         \n"

                        "vmul.f32   q0, %q8, d8[0]      \n"
                        "vmul.f32   q1, %q8, d9[0]      \n"

                        "vld1.u16   {d11[]}, [%1]       \n"

                        "vmla.f32   q2, %q9, d8[1]      \n"
                        "vmla.f32   q3, %q9, d9[1]      \n"

                        "vshll.u16  q5, d11, #16        \n"

                        // r1
                        "pld        [%2, #64]           \n"
                        "vld1.u16   {d13}, [%2]!        \n"

                        "vmla.f32   q0, %q10, d9[0]     \n"
                        "vmla.f32   q1, %q10, d10[0]    \n"

                        "vshll.u16  q6, d13, #16        \n"

                        "vmla.f32   q2, %q11, d12[0]    \n"
                        "vmla.f32   q3, %q11, d13[0]    \n"

                        "vld1.u16   {d9[]}, [%2]        \n"

                        "vmla.f32   q0, %q12, d12[1]    \n"
                        "vmla.f32   q1, %q12, d13[1]    \n"

                        "vshll.u16  q4, d9, #16         \n"

                        // r2
                        "pld        [%3, #64]           \n"
                        "vld1.u16   {d11}, [%3]!        \n"

                        "vmla.f32   q2, %q13, d13[0]    \n"
                        "vmla.f32   q3, %q13, d8[0]     \n"

                        "vshll.u16  q5, d11, #16        \n"

                        "vmla.f32   q0, %q14, d10[0]    \n"
                        "vmla.f32   q1, %q14, d11[0]    \n"

                        "vld1.u16   {d13[]}, [%3]       \n"

                        "vmla.f32   q2, %q15, d10[1]    \n"
                        "vmla.f32   q3, %q15, d11[1]    \n"

                        "vshll.u16  q6, d13, #16        \n"

                        "vmla.f32   q0, %q16, d11[0]    \n"
                        "vmla.f32   q1, %q16, d12[0]    \n"

                        "vadd.f32   q2, q2, q0          \n"
                        "vadd.f32   q3, q3, q1          \n"

                        "vst1.f32   {d4-d7}, [%0]!      \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
                    float32x4_t _sum0 = vld1q_f32(outptr0);

                    float32x4_t _r0 = bfloat2float(vld1_u16(r0));
                    float32x4_t _r1 = bfloat2float(vld1_u16(r1));
                    float32x4_t _r2 = bfloat2float(vld1_u16(r2));

#if __aarch64__
                    _sum0 = vfmaq_laneq_f32(_sum0, _k00, _r0, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k01, _r0, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k02, _r0, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k10, _r1, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k11, _r1, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k12, _r1, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k20, _r2, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k21, _r2, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k22, _r2, 2);
#else
                    _sum0 = vmlaq_lane_f32(_sum0, _k00, vget_low_f32(_r0), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k01, vget_low_f32(_r0), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k02, vget_high_f32(_r0), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k10, vget_low_f32(_r1), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k11, vget_low_f32(_r1), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k12, vget_high_f32(_r1), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k20, vget_low_f32(_r2), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k21, vget_low_f32(_r2), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k22, vget_high_f32(_r2), 0);
#endif

                    vst1q_f32(outptr0, _sum0);

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    outptr0 += 4;
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            k0 += 9 * 4;
        }
        for (; q < inch; q++)
        {
            unsigned short* outptr0_bf16 = top_blob.channel(p);

            const float* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            const unsigned short* r0 = img0.row<const unsigned short>(0);
            const unsigned short* r1 = img0.row<const unsigned short>(1);
            const unsigned short* r2 = img0.row<const unsigned short>(2);

            float32x4_t _k00 = bfloat2float(vld1_u16(k0));
            float32x4_t _k01 = bfloat2float(vld1_u16(k0 + 4));
            float32x4_t _k02 = bfloat2float(vld1_u16(k0 + 8));
            float32x4_t _k10 = bfloat2float(vld1_u16(k0 + 12));
            float32x4_t _k11 = bfloat2float(vld1_u16(k0 + 16));
            float32x4_t _k12 = bfloat2float(vld1_u16(k0 + 20));
            float32x4_t _k20 = bfloat2float(vld1_u16(k0 + 24));
            float32x4_t _k21 = bfloat2float(vld1_u16(k0 + 28));
            float32x4_t _k22 = bfloat2float(vld1_u16(k0 + 32));

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;

                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    asm volatile(
                        // r0
                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%2], #16   \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v6.4s, v7.4s, v8.4s, v9.4s}, [%1], #64 \n" // sum0

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v6.4s, %10.4s, v0.s[0]      \n"
                        "fmla   v7.4s, %10.4s, v0.s[2]      \n"
                        "fmla   v8.4s, %10.4s, v1.s[0]      \n"
                        "fmla   v9.4s, %10.4s, v1.s[2]      \n"

                        "ld1    {v4.h}[0], [%2]             \n"

                        "fmla   v6.4s, %11.4s, v0.s[1]      \n"
                        "fmla   v7.4s, %11.4s, v0.s[3]      \n"
                        "fmla   v8.4s, %11.4s, v1.s[1]      \n"
                        "fmla   v9.4s, %11.4s, v1.s[3]      \n"

                        "shll   v4.4s, v4.4h, #16           \n"

                        // r1
                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v2.4h, v3.4h}, [%3], #16   \n"

                        "fmla   v6.4s, %12.4s, v0.s[2]      \n"
                        "fmla   v7.4s, %12.4s, v1.s[0]      \n"
                        "fmla   v8.4s, %12.4s, v1.s[2]      \n"
                        "fmla   v9.4s, %12.4s, v4.s[0]      \n"

                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v6.4s, %13.4s, v2.s[0]      \n"
                        "fmla   v7.4s, %13.4s, v2.s[2]      \n"
                        "fmla   v8.4s, %13.4s, v3.s[0]      \n"
                        "fmla   v9.4s, %13.4s, v3.s[2]      \n"

                        "ld1    {v5.h}[0], [%3]             \n"

                        "fmla   v6.4s, %14.4s, v2.s[1]      \n"
                        "fmla   v7.4s, %14.4s, v2.s[3]      \n"
                        "fmla   v8.4s, %14.4s, v3.s[1]      \n"
                        "fmla   v9.4s, %14.4s, v3.s[3]      \n"

                        "shll   v5.4s, v5.4h, #16           \n"

                        // r2
                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%4], #16   \n"

                        "fmla   v6.4s, %15.4s, v2.s[2]      \n"
                        "fmla   v7.4s, %15.4s, v3.s[0]      \n"
                        "fmla   v8.4s, %15.4s, v3.s[2]      \n"
                        "fmla   v9.4s, %15.4s, v5.s[0]      \n"

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v6.4s, %16.4s, v0.s[0]      \n"
                        "fmla   v7.4s, %16.4s, v0.s[2]      \n"
                        "fmla   v8.4s, %16.4s, v1.s[0]      \n"
                        "fmla   v9.4s, %16.4s, v1.s[2]      \n"

                        "ld1    {v4.h}[0], [%4]             \n"

                        "fmla   v6.4s, %17.4s, v0.s[1]      \n"
                        "fmla   v7.4s, %17.4s, v0.s[3]      \n"
                        "fmla   v8.4s, %17.4s, v1.s[1]      \n"
                        "fmla   v9.4s, %17.4s, v1.s[3]      \n"

                        "shll   v4.4s, v4.4h, #16           \n"

                        "fmla   v6.4s, %18.4s, v0.s[2]      \n"
                        "fmla   v7.4s, %18.4s, v1.s[0]      \n"
                        "fmla   v8.4s, %18.4s, v1.s[2]      \n"
                        "fmla   v9.4s, %18.4s, v4.s[0]      \n"

                        "shrn   v6.4h, v6.4s, #16           \n"
                        "shrn   v7.4h, v7.4s, #16           \n"
                        "shrn   v8.4h, v8.4s, #16           \n"
                        "shrn   v9.4h, v9.4s, #16           \n"

                        "st1    {v6.4h, v7.4h, v8.4h, v9.4h}, [%0], #32 \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2)            // %4
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00), // %10
                        "w"(_k01), // %11
                        "w"(_k02), // %12
                        "w"(_k10), // %13
                        "w"(_k11), // %14
                        "w"(_k12), // %15
                        "w"(_k20), // %16
                        "w"(_k21), // %17
                        "w"(_k22)  // %18
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9");
#else  // __aarch64__
                    asm volatile(
                        // r0
                        "pld        [%2, #128]          \n"
                        "vld1.u16   {d12-d13}, [%2]!    \n"

                        "pld        [%1, #512]          \n"
                        "vldm       %1!, {d0-d7}        \n" // sum0

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"

                        "vld1.u16   {d12[0]}, [%2]      \n"

                        "vmla.f32   q0, %q10, d8[0]     \n"
                        "vmla.f32   q1, %q10, d9[0]     \n"

                        "vmla.f32   q2, %q10, d10[0]    \n"
                        "vmla.f32   q3, %q10, d11[0]    \n"

                        "vmla.f32   q0, %q11, d8[1]     \n"
                        "vmla.f32   q1, %q11, d9[1]     \n"

                        "vshl.u32   d8, d12, #16        \n"

                        "vmla.f32   q2, %q11, d10[1]    \n"
                        "vmla.f32   q3, %q11, d11[1]    \n"

                        // r1
                        "pld        [%3, #128]          \n"
                        "vld1.u16   {d12-d13}, [%3]!    \n"

                        "vmla.f32   q0, %q12, d9[0]     \n"
                        "vmla.f32   q1, %q12, d10[0]    \n"
                        "vmla.f32   q2, %q12, d11[0]    \n"
                        "vmla.f32   q3, %q12, d8[0]     \n"

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"

                        "vld1.u16   {d12[0]}, [%3]      \n"

                        "vmla.f32   q0, %q13, d8[0]     \n"
                        "vmla.f32   q1, %q13, d9[0]     \n"

                        "vmla.f32   q2, %q13, d10[0]    \n"
                        "vmla.f32   q3, %q13, d11[0]    \n"

                        "vmla.f32   q0, %q14, d8[1]     \n"
                        "vmla.f32   q1, %q14, d9[1]     \n"

                        "vshl.u32   d8, d12, #16        \n"

                        "vmla.f32   q2, %q14, d10[1]    \n"
                        "vmla.f32   q3, %q14, d11[1]    \n"

                        // r2
                        "pld        [%4, #128]          \n"
                        "vld1.u16   {d12-d13}, [%4]!    \n"

                        "vmla.f32   q0, %q15, d9[0]     \n"
                        "vmla.f32   q1, %q15, d10[0]    \n"
                        "vmla.f32   q2, %q15, d11[0]    \n"
                        "vmla.f32   q3, %q15, d8[0]     \n"

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"

                        "vld1.u16   {d12[0]}, [%4]      \n"

                        "vmla.f32   q0, %q16, d8[0]     \n"
                        "vmla.f32   q1, %q16, d9[0]     \n"

                        "vmla.f32   q2, %q16, d10[0]    \n"
                        "vmla.f32   q3, %q16, d11[0]    \n"

                        "vmla.f32   q0, %q17, d8[1]     \n"
                        "vmla.f32   q1, %q17, d9[1]     \n"

                        "vshl.u32   d8, d12, #16        \n"

                        "vmla.f32   q2, %q17, d10[1]    \n"
                        "vmla.f32   q3, %q17, d11[1]    \n"

                        "vmla.f32   q0, %q18, d9[0]     \n"
                        "vmla.f32   q1, %q18, d10[0]    \n"
                        "vmla.f32   q2, %q18, d11[0]    \n"
                        "vmla.f32   q3, %q18, d8[0]     \n"

                        "vshrn.u32  d0, q0, #16         \n"
                        "vshrn.u32  d1, q1, #16         \n"
                        "vshrn.u32  d2, q2, #16         \n"
                        "vshrn.u32  d3, q3, #16         \n"

                        "vst1.u16   {d0-d3}, [%0 :64]!  \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2)            // %4
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00), // %10
                        "w"(_k01), // %11
                        "w"(_k02), // %12
                        "w"(_k10), // %13
                        "w"(_k11), // %14
                        "w"(_k12), // %15
                        "w"(_k20), // %16
                        "w"(_k21), // %17
                        "w"(_k22)  // %18
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
#endif // __aarch64__
                }
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    asm volatile(
                        // r0
                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v0.4h}, [%2], #8           \n"

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%1], #32   \n" // sum0

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmul   v6.4s, %10.4s, v0.s[0]      \n"
                        "fmul   v7.4s, %10.4s, v0.s[2]      \n"

                        "ld1    {v1.h}[0], [%2]             \n"

                        "fmla   v8.4s, %11.4s, v0.s[1]      \n"
                        "fmla   v9.4s, %11.4s, v0.s[3]      \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        // r1
                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v2.4h}, [%3], #8           \n"

                        "fmla   v6.4s, %12.4s, v0.s[2]      \n"
                        "fmla   v7.4s, %12.4s, v1.s[0]      \n"

                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v8.4s, %13.4s, v2.s[0]      \n"
                        "fmla   v9.4s, %13.4s, v2.s[2]      \n"

                        "ld1    {v3.h}[0], [%3]             \n"

                        "fmla   v6.4s, %14.4s, v2.s[1]      \n"
                        "fmla   v7.4s, %14.4s, v2.s[3]      \n"

                        "shll   v3.4s, v3.4h, #16           \n"

                        // r2
                        "prfm   pldl1keep, [%4, #64]        \n"
                        "ld1    {v0.4h}, [%4], #8           \n"

                        "fmla   v8.4s, %15.4s, v2.s[2]      \n"
                        "fmla   v9.4s, %15.4s, v3.s[0]      \n"

                        "shll   v0.4s, v0.4h, #16           \n"

                        "fmla   v6.4s, %16.4s, v0.s[0]      \n"
                        "fmla   v7.4s, %16.4s, v0.s[2]      \n"

                        "ld1    {v1.h}[0], [%4]             \n"

                        "fmla   v8.4s, %17.4s, v0.s[1]      \n"
                        "fmla   v9.4s, %17.4s, v0.s[3]      \n"

                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v6.4s, %18.4s, v0.s[2]      \n"
                        "fmla   v7.4s, %18.4s, v1.s[0]      \n"

                        "fadd   v8.4s, v8.4s, v6.4s         \n"
                        "fadd   v9.4s, v9.4s, v7.4s         \n"

                        "shrn   v8.4h, v8.4s, #16           \n"
                        "shrn   v9.4h, v9.4s, #16           \n"

                        "st1    {v8.4h, v9.4h}, [%0], #16   \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2)            // %4
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00), // %10
                        "w"(_k01), // %11
                        "w"(_k02), // %12
                        "w"(_k10), // %13
                        "w"(_k11), // %14
                        "w"(_k12), // %15
                        "w"(_k20), // %16
                        "w"(_k21), // %17
                        "w"(_k22)  // %18
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9");
#else  // __aarch64__
                    asm volatile(
                        // r0
                        "pld        [%2, #64]           \n"
                        "vld1.u16   {d9}, [%2]!         \n"

                        "pld        [%1, #256]          \n"
                        "vld1.f32   {d4-d7}, [%1]!      \n" // sum0

                        "vshll.u16  q4, d9, #16         \n"

                        "vmul.f32   q0, %q10, d8[0]     \n"
                        "vmul.f32   q1, %q10, d9[0]     \n"

                        "vld1.u16   {d11[]}, [%2]       \n"

                        "vmla.f32   q2, %q11, d8[1]     \n"
                        "vmla.f32   q3, %q11, d9[1]     \n"

                        "vshll.u16  q5, d11, #16        \n"

                        // r1
                        "pld        [%3, #64]           \n"
                        "vld1.u16   {d13}, [%3]!        \n"

                        "vmla.f32   q0, %q12, d9[0]     \n"
                        "vmla.f32   q1, %q12, d10[0]    \n"

                        "vshll.u16  q6, d13, #16        \n"

                        "vmla.f32   q2, %q13, d12[0]    \n"
                        "vmla.f32   q3, %q13, d13[0]    \n"

                        "vld1.u16   {d9[]}, [%3]        \n"

                        "vmla.f32   q0, %q14, d12[1]    \n"
                        "vmla.f32   q1, %q14, d13[1]    \n"

                        "vshll.u16  q4, d9, #16         \n"

                        // r2
                        "pld        [%4, #64]           \n"
                        "vld1.u16   {d11}, [%4]!        \n"

                        "vmla.f32   q2, %q15, d13[0]    \n"
                        "vmla.f32   q3, %q15, d8[0]     \n"

                        "vshll.u16  q5, d11, #16        \n"

                        "vmla.f32   q0, %q16, d10[0]    \n"
                        "vmla.f32   q1, %q16, d11[0]    \n"

                        "vld1.u16   {d13[]}, [%4]       \n"

                        "vmla.f32   q2, %q17, d10[1]    \n"
                        "vmla.f32   q3, %q17, d11[1]    \n"

                        "vshll.u16  q6, d13, #16        \n"

                        "vmla.f32   q0, %q18, d11[0]    \n"
                        "vmla.f32   q1, %q18, d12[0]    \n"

                        "vadd.f32   q2, q2, q0          \n"
                        "vadd.f32   q3, q3, q1          \n"

                        "vshrn.u32  d2, q2, #16         \n"
                        "vshrn.u32  d3, q3, #16         \n"

                        "vst1.u16   {d2-d3}, [%0 :64]!  \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2)            // %4
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00), // %10
                        "w"(_k01), // %11
                        "w"(_k02), // %12
                        "w"(_k10), // %13
                        "w"(_k11), // %14
                        "w"(_k12), // %15
                        "w"(_k20), // %16
                        "w"(_k21), // %17
                        "w"(_k22)  // %18
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
                    float32x4_t _sum0 = vld1q_f32(outptr0);

                    float32x4_t _r0 = bfloat2float(vld1_u16(r0));
                    float32x4_t _r1 = bfloat2float(vld1_u16(r1));
                    float32x4_t _r2 = bfloat2float(vld1_u16(r2));

#if __aarch64__
                    _sum0 = vfmaq_laneq_f32(_sum0, _k00, _r0, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k01, _r0, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k02, _r0, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k10, _r1, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k11, _r1, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k12, _r1, 2);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k20, _r2, 0);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k21, _r2, 1);
                    _sum0 = vfmaq_laneq_f32(_sum0, _k22, _r2, 2);
#else
                    _sum0 = vmlaq_lane_f32(_sum0, _k00, vget_low_f32(_r0), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k01, vget_low_f32(_r0), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k02, vget_high_f32(_r0), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k10, vget_low_f32(_r1), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k11, vget_low_f32(_r1), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k12, vget_high_f32(_r1), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k20, vget_low_f32(_r2), 0);
                    _sum0 = vmlaq_lane_f32(_sum0, _k21, vget_low_f32(_r2), 1);
                    _sum0 = vmlaq_lane_f32(_sum0, _k22, vget_high_f32(_r2), 0);
#endif

                    vst1_u16(outptr0_bf16, float2bfloat(_sum0));

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    outptr0 += 4;
                    outptr0_bf16 += 4;
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            k0 += 9 * 4;
        }
    }
}


================================================
FILE: src/layer/arm/convolution_3x3_pack1to4_fp16s.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 3x3, stride-1 convolution kernel using half-precision (fp16) storage and
// arithmetic, written with AArch64 NEON inline assembly.
//
// Every input sample is a single __fp16 scalar, while every output pixel and
// every kernel tap is a vector of 4 __fp16 lanes ("pack1to4"): each tap vector
// is multiplied by one broadcast input scalar and accumulated into the 4-lane
// output pixel.
//
// Parameters:
//   bottom_blob  input; channel q is read through row pointers 0, 1 and 2.
//   top_blob     output; its w/h/c are read here and every channel is
//                overwritten (first filled with the bias, then accumulated).
//   kernel       weights; channel(p) is read as 9 * 4 __fp16 per input
//                channel, in tap order k00, k01, k02, k10, ..., k22.
//   _bias        optional bias, 4 __fp16 per output channel; zero is used when
//                the pointer is null.
//   opt          only opt.num_threads is used (OpenMP thread count).
//
// NOTE(review): the row pointers are taken once per input channel and then
// advanced by outw + 2 elements per output row, so this presumably requires
// the input width to be exactly outw + 2 with contiguous rows - confirm
// against the caller.
static void conv3x3s1_pack1to4_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const __fp16* bias = _bias;

    // Output channels are independent, so they are distributed across threads.
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        // Seed the whole output channel with its bias vector (or zeros); the
        // loops below accumulate on top of it, one input channel at a time.
        float16x4_t _bias0 = bias ? vld1_f16(bias + p * 4) : vdup_n_f16((__fp16)0.f);
        out0.fill(_bias0);

        const __fp16* k0 = kernel.channel(p);

        int q = 0;
        for (; q < inch; q++)
        {
            // Restart at the top-left of the output channel for every input channel.
            __fp16* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            // Three consecutive input rows covered by the 3x3 window.
            const __fp16* r0 = img0.row<const __fp16>(0);
            const __fp16* r1 = img0.row<const __fp16>(1);
            const __fp16* r2 = img0.row<const __fp16>(2);

            // The nine kernel taps for this (p, q) pair, 4 lanes each.
            // _kRC is the tap at kernel row R, column C.
            float16x4_t _k00 = vld1_f16(k0);
            float16x4_t _k01 = vld1_f16(k0 + 4);
            float16x4_t _k02 = vld1_f16(k0 + 8);
            float16x4_t _k10 = vld1_f16(k0 + 12);
            float16x4_t _k11 = vld1_f16(k0 + 16);
            float16x4_t _k12 = vld1_f16(k0 + 20);
            float16x4_t _k20 = vld1_f16(k0 + 24);
            float16x4_t _k21 = vld1_f16(k0 + 28);
            float16x4_t _k22 = vld1_f16(k0 + 32);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                // Main loop: 8 output pixels per iteration.
                for (; j + 7 < outw; j += 8)
                {
                    asm volatile(
                        // Load the 8 running sums into v24-v31. The first load
                        // post-increments %0 by 32 bytes and the "sub" undoes it,
                        // so the stores at the end rewrite the same 8 pixels.
                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%0], #32 \n" // sum0 sum1 sum2 sum3

                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h, v31.4h}, [%0] \n" // sum4 sum5 sum6 sum7

                        "sub    %0, %0, #32                 \n"

                        // Input row 0: 8 samples (pointer advances by 8 halfs)
                        // plus 4 more loaded without advancing; only lanes 0-1
                        // of v1 are used, for the window overhang of pixels 6-7.
                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v0.8h}, [%1], #16          \n" // r0
                        "ld1    {v1.4h}, [%1]               \n"

                        // k00 * r0[x + 0]
                        "fmla   v24.4h, %8.4h, v0.h[0]      \n"
                        "fmla   v25.4h, %8.4h, v0.h[1]      \n"
                        "fmla   v26.4h, %8.4h, v0.h[2]      \n"
                        "fmla   v27.4h, %8.4h, v0.h[3]      \n"
                        "fmla   v28.4h, %8.4h, v0.h[4]      \n"
                        "fmla   v29.4h, %8.4h, v0.h[5]      \n"
                        "fmla   v30.4h, %8.4h, v0.h[6]      \n"
                        "fmla   v31.4h, %8.4h, v0.h[7]      \n"

                        // k01 * r0[x + 1]
                        "fmla   v24.4h, %9.4h, v0.h[1]      \n"
                        "fmla   v25.4h, %9.4h, v0.h[2]      \n"
                        "fmla   v26.4h, %9.4h, v0.h[3]      \n"
                        "fmla   v27.4h, %9.4h, v0.h[4]      \n"
                        "fmla   v28.4h, %9.4h, v0.h[5]      \n"
                        "fmla   v29.4h, %9.4h, v0.h[6]      \n"
                        "fmla   v30.4h, %9.4h, v0.h[7]      \n"
                        "fmla   v31.4h, %9.4h, v1.h[0]      \n"

                        // k02 * r0[x + 2]
                        "fmla   v24.4h, %10.4h, v0.h[2]     \n"
                        "fmla   v25.4h, %10.4h, v0.h[3]     \n"
                        "fmla   v26.4h, %10.4h, v0.h[4]     \n"
                        "fmla   v27.4h, %10.4h, v0.h[5]     \n"
                        "fmla   v28.4h, %10.4h, v0.h[6]     \n"
                        "fmla   v29.4h, %10.4h, v0.h[7]     \n"
                        "fmla   v30.4h, %10.4h, v1.h[0]     \n"
                        "fmla   v31.4h, %10.4h, v1.h[1]     \n"

                        // Input row 1, same load pattern as row 0.
                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v2.8h}, [%2], #16          \n" // r1
                        "ld1    {v3.4h}, [%2]               \n"

                        // k10 * r1[x + 0]
                        "fmla   v24.4h, %11.4h, v2.h[0]     \n"
                        "fmla   v25.4h, %11.4h, v2.h[1]     \n"
                        "fmla   v26.4h, %11.4h, v2.h[2]     \n"
                        "fmla   v27.4h, %11.4h, v2.h[3]     \n"
                        "fmla   v28.4h, %11.4h, v2.h[4]     \n"
                        "fmla   v29.4h, %11.4h, v2.h[5]     \n"
                        "fmla   v30.4h, %11.4h, v2.h[6]     \n"
                        "fmla   v31.4h, %11.4h, v2.h[7]     \n"

                        // k11 * r1[x + 1]
                        "fmla   v24.4h, %12.4h, v2.h[1]     \n"
                        "fmla   v25.4h, %12.4h, v2.h[2]     \n"
                        "fmla   v26.4h, %12.4h, v2.h[3]     \n"
                        "fmla   v27.4h, %12.4h, v2.h[4]     \n"
                        "fmla   v28.4h, %12.4h, v2.h[5]     \n"
                        "fmla   v29.4h, %12.4h, v2.h[6]     \n"
                        "fmla   v30.4h, %12.4h, v2.h[7]     \n"
                        "fmla   v31.4h, %12.4h, v3.h[0]     \n"

                        // k12 * r1[x + 2]
                        "fmla   v24.4h, %13.4h, v2.h[2]     \n"
                        "fmla   v25.4h, %13.4h, v2.h[3]     \n"
                        "fmla   v26.4h, %13.4h, v2.h[4]     \n"
                        "fmla   v27.4h, %13.4h, v2.h[5]     \n"
                        "fmla   v28.4h, %13.4h, v2.h[6]     \n"
                        "fmla   v29.4h, %13.4h, v2.h[7]     \n"
                        "fmla   v30.4h, %13.4h, v3.h[0]     \n"
                        "fmla   v31.4h, %13.4h, v3.h[1]     \n"

                        // Input row 2, same load pattern as row 0.
                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v4.8h}, [%3], #16          \n" // r2
                        "ld1    {v5.4h}, [%3]               \n"

                        // k20 * r2[x + 0]
                        "fmla   v24.4h, %14.4h, v4.h[0]     \n"
                        "fmla   v25.4h, %14.4h, v4.h[1]     \n"
                        "fmla   v26.4h, %14.4h, v4.h[2]     \n"
                        "fmla   v27.4h, %14.4h, v4.h[3]     \n"
                        "fmla   v28.4h, %14.4h, v4.h[4]     \n"
                        "fmla   v29.4h, %14.4h, v4.h[5]     \n"
                        "fmla   v30.4h, %14.4h, v4.h[6]     \n"
                        "fmla   v31.4h, %14.4h, v4.h[7]     \n"

                        // k21 * r2[x + 1]
                        "fmla   v24.4h, %15.4h, v4.h[1]     \n"
                        "fmla   v25.4h, %15.4h, v4.h[2]     \n"
                        "fmla   v26.4h, %15.4h, v4.h[3]     \n"
                        "fmla   v27.4h, %15.4h, v4.h[4]     \n"
                        "fmla   v28.4h, %15.4h, v4.h[5]     \n"
                        "fmla   v29.4h, %15.4h, v4.h[6]     \n"
                        "fmla   v30.4h, %15.4h, v4.h[7]     \n"
                        "fmla   v31.4h, %15.4h, v5.h[0]     \n"

                        // k22 * r2[x + 2]
                        "fmla   v24.4h, %16.4h, v4.h[2]     \n"
                        "fmla   v25.4h, %16.4h, v4.h[3]     \n"
                        "fmla   v26.4h, %16.4h, v4.h[4]     \n"
                        "fmla   v27.4h, %16.4h, v4.h[5]     \n"
                        "fmla   v28.4h, %16.4h, v4.h[6]     \n"
                        "fmla   v29.4h, %16.4h, v4.h[7]     \n"
                        "fmla   v30.4h, %16.4h, v5.h[0]     \n"
                        "fmla   v31.4h, %16.4h, v5.h[1]     \n"

                        // Write back all 8 pixels; %0 ends 64 bytes further on.
                        "st1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%0], #32 \n"
                        "st1    {v28.4h, v29.4h, v30.4h, v31.4h}, [%0], #32 \n"

                        // Operands %4-%7 are the inputs tied to outputs %0-%3,
                        // which is why the kernel taps start at %8.
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
                // Remainder: 4 output pixels per iteration.
                for (; j + 3 < outw; j += 4)
                {
                    asm volatile(
                        // Sums are loaded without post-increment; the final
                        // store advances %0 by 32 bytes (4 pixels).
                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h, v31.4h}, [%0] \n" // sum0 sum1 sum2 sum3

                        // 8 halfs are loaded per row but only lanes 0-5 are used.
                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v0.8h}, [%1]               \n" // r0

                        // k00 * r0[x + 0]
                        "fmla   v28.4h, %8.4h, v0.h[0]      \n"
                        "fmla   v29.4h, %8.4h, v0.h[1]      \n"
                        "fmla   v30.4h, %8.4h, v0.h[2]      \n"
                        "fmla   v31.4h, %8.4h, v0.h[3]      \n"

                        // k01 * r0[x + 1]
                        "fmla   v28.4h, %9.4h, v0.h[1]      \n"
                        "fmla   v29.4h, %9.4h, v0.h[2]      \n"
                        "fmla   v30.4h, %9.4h, v0.h[3]      \n"
                        "fmla   v31.4h, %9.4h, v0.h[4]      \n"

                        // k02 * r0[x + 2]
                        "fmla   v28.4h, %10.4h, v0.h[2]     \n"
                        "fmla   v29.4h, %10.4h, v0.h[3]     \n"
                        "fmla   v30.4h, %10.4h, v0.h[4]     \n"
                        "fmla   v31.4h, %10.4h, v0.h[5]     \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v1.8h}, [%2]               \n" // r1

                        // k10 * r1[x + 0]
                        "fmla   v28.4h, %11.4h, v1.h[0]     \n"
                        "fmla   v29.4h, %11.4h, v1.h[1]     \n"
                        "fmla   v30.4h, %11.4h, v1.h[2]     \n"
                        "fmla   v31.4h, %11.4h, v1.h[3]     \n"

                        // k11 * r1[x + 1]
                        "fmla   v28.4h, %12.4h, v1.h[1]     \n"
                        "fmla   v29.4h, %12.4h, v1.h[2]     \n"
                        "fmla   v30.4h, %12.4h, v1.h[3]     \n"
                        "fmla   v31.4h, %12.4h, v1.h[4]     \n"

                        // k12 * r1[x + 2]
                        "fmla   v28.4h, %13.4h, v1.h[2]     \n"
                        "fmla   v29.4h, %13.4h, v1.h[3]     \n"
                        "fmla   v30.4h, %13.4h, v1.h[4]     \n"
                        "fmla   v31.4h, %13.4h, v1.h[5]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v2.8h}, [%3]               \n" // r2

                        // k20 * r2[x + 0]
                        "fmla   v28.4h, %14.4h, v2.h[0]     \n"
                        "fmla   v29.4h, %14.4h, v2.h[1]     \n"
                        "fmla   v30.4h, %14.4h, v2.h[2]     \n"
                        "fmla   v31.4h, %14.4h, v2.h[3]     \n"

                        // k21 * r2[x + 1]
                        "fmla   v28.4h, %15.4h, v2.h[1]     \n"
                        "fmla   v29.4h, %15.4h, v2.h[2]     \n"
                        "fmla   v30.4h, %15.4h, v2.h[3]     \n"
                        "fmla   v31.4h, %15.4h, v2.h[4]     \n"

                        // k22 * r2[x + 2]
                        "fmla   v28.4h, %16.4h, v2.h[2]     \n"
                        "fmla   v29.4h, %16.4h, v2.h[3]     \n"
                        "fmla   v30.4h, %16.4h, v2.h[4]     \n"
                        "fmla   v31.4h, %16.4h, v2.h[5]     \n"

                        // Advance each input row by 4 halfs (8 bytes).
                        "add    %1, %1, #8                  \n"
                        "add    %2, %2, #8                  \n"
                        "add    %3, %3, #8                  \n"

                        "st1    {v28.4h, v29.4h, v30.4h, v31.4h}, [%0], #32 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "v0", "v1", "v2", "v28", "v29", "v30", "v31");
                }
                // Remainder: 2 output pixels per iteration.
                for (; j + 1 < outw; j += 2)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v30.4h, v31.4h}, [%0]      \n" // sum0 sum1

                        // 4 input halfs per row cover both 3-wide windows.
                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ld1    {v0.4h}, [%1]               \n" // r0

                        // Kernel row 0 (k00, k01, k02), interleaved over the two pixels.
                        "fmla   v30.4h, %8.4h, v0.h[0]      \n"
                        "fmla   v31.4h, %8.4h, v0.h[1]      \n"
                        "fmla   v30.4h, %9.4h, v0.h[1]      \n"
                        "fmla   v31.4h, %9.4h, v0.h[2]      \n"
                        "fmla   v30.4h, %10.4h, v0.h[2]     \n"
                        "fmla   v31.4h, %10.4h, v0.h[3]     \n"

                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v1.4h}, [%2]               \n" // r1

                        // Kernel row 1 (k10, k11, k12).
                        "fmla   v30.4h, %11.4h, v1.h[0]     \n"
                        "fmla   v31.4h, %11.4h, v1.h[1]     \n"
                        "fmla   v30.4h, %12.4h, v1.h[1]     \n"
                        "fmla   v31.4h, %12.4h, v1.h[2]     \n"
                        "fmla   v30.4h, %13.4h, v1.h[2]     \n"
                        "fmla   v31.4h, %13.4h, v1.h[3]     \n"

                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v2.4h}, [%3]               \n" // r2

                        // Kernel row 2 (k20, k21, k22).
                        "fmla   v30.4h, %14.4h, v2.h[0]     \n"
                        "fmla   v31.4h, %14.4h, v2.h[1]     \n"
                        "fmla   v30.4h, %15.4h, v2.h[1]     \n"
                        "fmla   v31.4h, %15.4h, v2.h[2]     \n"
                        "fmla   v30.4h, %16.4h, v2.h[2]     \n"
                        "fmla   v31.4h, %16.4h, v2.h[3]     \n"

                        // Advance each input row by 2 halfs (4 bytes).
                        "add    %1, %1, #4                  \n"
                        "add    %2, %2, #4                  \n"
                        "add    %3, %3, #4                  \n"

                        "st1    {v30.4h, v31.4h}, [%0], #16 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "v0", "v1", "v2", "v30", "v31");
                }
                // Tail: one output pixel per iteration.
                for (; j < outw; j++)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #64]        \n"
                        "ld1    {v30.4h}, [%0]              \n" // sum0

                        // 4 halfs are loaded per row but only lanes 0-2 are used.
                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ld1    {v0.4h}, [%1]               \n" // r0

                        "fmla   v30.4h, %8.4h, v0.h[0]      \n"
                        "fmla   v30.4h, %9.4h, v0.h[1]      \n"
                        "fmla   v30.4h, %10.4h, v0.h[2]     \n"

                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v1.4h}, [%2]               \n" // r1

                        "fmla   v30.4h, %11.4h, v1.h[0]     \n"
                        "fmla   v30.4h, %12.4h, v1.h[1]     \n"
                        "fmla   v30.4h, %13.4h, v1.h[2]     \n"

                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v2.4h}, [%3]               \n" // r2

                        "fmla   v30.4h, %14.4h, v2.h[0]     \n"
                        "fmla   v30.4h, %15.4h, v2.h[1]     \n"
                        "fmla   v30.4h, %16.4h, v2.h[2]     \n"

                        // Advance each input row by 1 half (2 bytes).
                        "add    %1, %1, #2                  \n"
                        "add    %2, %2, #2                  \n"
                        "add    %3, %3, #2                  \n"

                        "st1    {v30.4h}, [%0], #8          \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "v0", "v1", "v2", "v30");
                }

                // The column loops consumed outw samples; skip the 2 extra
                // samples of the 3-wide window to reach the next row start.
                r0 += 2;
                r1 += 2;
                r2 += 2;
            }

            // Move on to the 9 four-lane taps of the next input channel.
            k0 += 9 * 4;
        }
    }
}

// 3x3 convolution with a horizontal/vertical step of 2 input elements per output,
// using fp16 storage and fp16 arithmetic (AArch64 inline assembly).
//
// Input pixels are read as single __fp16 scalars, output pixels are written as
// groups of 4 __fp16 values (one float16x4_t per output pixel).
//
//   bottom_blob  input feature map; rows 0..2 of each channel are the first window
//   top_blob     output feature map; its w/h/c drive the loops, it is filled with
//                the bias and then accumulated into, once per input channel
//   kernel       weights; for output channel p, kernel.channel(p) is read as
//                inch consecutive groups of 9 taps x 4 lanes of __fp16
//   _bias        optional bias, read as 4 __fp16 values per output channel
//   opt          only opt.num_threads is used here (OpenMP thread count)
//
// NOTE(review): the row pointer arithmetic assumes consecutive rows of a channel
// are exactly `w` elements apart and that the input is wide/tall enough for every
// 3x3 window -- confirm against the caller.
static void conv3x3s2_pack1to4_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // After one output row the input pointers have advanced by 2 * outw elements.
    // Adding (2 * w - 2 * outw) moves them to the start of the row two rows below.
    const int tailstep = w - 2 * outw + w;

    const __fp16* bias = _bias;

    // Each iteration handles one output channel p and writes only to top_blob.channel(p).
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        // Start the accumulation from the bias vector (all zeros when no bias is given).
        float16x4_t _bias0 = bias ? vld1_f16(bias + p * 4) : vdup_n_f16((__fp16)0.f);
        out0.fill(_bias0);

        const __fp16* k0 = kernel.channel(p);

        // Accumulate the contribution of every input channel into out0.
        int q = 0;
        for (; q < inch; q++)
        {
            __fp16* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            // Three input rows covered by the 3x3 window.
            const __fp16* r0 = img0.row<const __fp16>(0);
            const __fp16* r1 = img0.row<const __fp16>(1);
            const __fp16* r2 = img0.row<const __fp16>(2);

            // The 9 kernel taps (_kRC = row R, column C), 4 output lanes each.
            float16x4_t _k00 = vld1_f16(k0);
            float16x4_t _k01 = vld1_f16(k0 + 4);
            float16x4_t _k02 = vld1_f16(k0 + 8);
            float16x4_t _k10 = vld1_f16(k0 + 12);
            float16x4_t _k11 = vld1_f16(k0 + 16);
            float16x4_t _k12 = vld1_f16(k0 + 20);
            float16x4_t _k20 = vld1_f16(k0 + 24);
            float16x4_t _k21 = vld1_f16(k0 + 28);
            float16x4_t _k22 = vld1_f16(k0 + 32);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                // 4 output pixels per iteration.
                // Output x uses input columns 2x, 2x+1, 2x+2 of each row, so 9 input
                // values per row are needed: 8 from the vector load plus 1 extra lane.
                for (; j + 3 < outw; j += 4)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h, v31.4h}, [%0] \n" // sum0 sum1 sum2 sum3

                        // r0: 8 values (pointer advances 16 bytes) + the 9th value into v1.h[0]
                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v0.8h}, [%1], #16          \n" // r0
                        "ld1    {v1.h}[0], [%1]             \n"

                        "fmla   v28.4h, %8.4h, v0.h[0]      \n"
                        "fmla   v29.4h, %8.4h, v0.h[2]      \n"
                        "fmla   v30.4h, %8.4h, v0.h[4]      \n"
                        "fmla   v31.4h, %8.4h, v0.h[6]      \n"

                        "fmla   v28.4h, %9.4h, v0.h[1]      \n"
                        "fmla   v29.4h, %9.4h, v0.h[3]      \n"
                        "fmla   v30.4h, %9.4h, v0.h[5]      \n"
                        "fmla   v31.4h, %9.4h, v0.h[7]      \n"

                        "fmla   v28.4h, %10.4h, v0.h[2]     \n"
                        "fmla   v29.4h, %10.4h, v0.h[4]     \n"
                        "fmla   v30.4h, %10.4h, v0.h[6]     \n"
                        "fmla   v31.4h, %10.4h, v1.h[0]     \n"

                        // r1: same access pattern as r0, using v2 / v3
                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v2.8h}, [%2], #16          \n" // r1
                        "ld1    {v3.h}[0], [%2]             \n"

                        "fmla   v28.4h, %11.4h, v2.h[0]     \n"
                        "fmla   v29.4h, %11.4h, v2.h[2]     \n"
                        "fmla   v30.4h, %11.4h, v2.h[4]     \n"
                        "fmla   v31.4h, %11.4h, v2.h[6]     \n"

                        "fmla   v28.4h, %12.4h, v2.h[1]     \n"
                        "fmla   v29.4h, %12.4h, v2.h[3]     \n"
                        "fmla   v30.4h, %12.4h, v2.h[5]     \n"
                        "fmla   v31.4h, %12.4h, v2.h[7]     \n"

                        "fmla   v28.4h, %13.4h, v2.h[2]     \n"
                        "fmla   v29.4h, %13.4h, v2.h[4]     \n"
                        "fmla   v30.4h, %13.4h, v2.h[6]     \n"
                        "fmla   v31.4h, %13.4h, v3.h[0]     \n"

                        // r2: same access pattern as r0, using v4 / v5
                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v4.8h}, [%3], #16          \n" // r2
                        "ld1    {v5.h}[0], [%3]             \n"

                        "fmla   v28.4h, %14.4h, v4.h[0]     \n"
                        "fmla   v29.4h, %14.4h, v4.h[2]     \n"
                        "fmla   v30.4h, %14.4h, v4.h[4]     \n"
                        "fmla   v31.4h, %14.4h, v4.h[6]     \n"

                        "fmla   v28.4h, %15.4h, v4.h[1]     \n"
                        "fmla   v29.4h, %15.4h, v4.h[3]     \n"
                        "fmla   v30.4h, %15.4h, v4.h[5]     \n"
                        "fmla   v31.4h, %15.4h, v4.h[7]     \n"

                        "fmla   v28.4h, %16.4h, v4.h[2]     \n"
                        "fmla   v29.4h, %16.4h, v4.h[4]     \n"
                        "fmla   v30.4h, %16.4h, v4.h[6]     \n"
                        "fmla   v31.4h, %16.4h, v5.h[0]     \n"

                        // write back the 4 updated sums and advance the output pointer by 32 bytes
                        "st1    {v28.4h, v29.4h, v30.4h, v31.4h}, [%0], #32 \n"

                        // operands %4..%7 are the inputs tied to the outputs %0..%3
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31");
                }
                // 2 output pixels per iteration: 5 input values per row
                // (4 from the vector load, pointer advances 8 bytes, plus 1 extra lane).
                for (; j + 1 < outw; j += 2)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v30.4h, v31.4h}, [%0]      \n" // sum0 sum1

                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ld1    {v0.4h}, [%1], #8           \n" // r0
                        "ld1    {v1.h}[0], [%1]             \n"

                        "fmla   v30.4h, %8.4h, v0.h[0]      \n"
                        "fmla   v31.4h, %8.4h, v0.h[2]      \n"
                        "fmla   v30.4h, %9.4h, v0.h[1]      \n"
                        "fmla   v31.4h, %9.4h, v0.h[3]      \n"
                        "fmla   v30.4h, %10.4h, v0.h[2]     \n"
                        "fmla   v31.4h, %10.4h, v1.h[0]     \n"

                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v2.4h}, [%2], #8           \n" // r1
                        "ld1    {v3.h}[0], [%2]             \n"

                        "fmla   v30.4h, %11.4h, v2.h[0]     \n"
                        "fmla   v31.4h, %11.4h, v2.h[2]     \n"
                        "fmla   v30.4h, %12.4h, v2.h[1]     \n"
                        "fmla   v31.4h, %12.4h, v2.h[3]     \n"
                        "fmla   v30.4h, %13.4h, v2.h[2]     \n"
                        "fmla   v31.4h, %13.4h, v3.h[0]     \n"

                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v4.4h}, [%3], #8           \n" // r2
                        "ld1    {v5.h}[0], [%3]             \n"

                        "fmla   v30.4h, %14.4h, v4.h[0]     \n"
                        "fmla   v31.4h, %14.4h, v4.h[2]     \n"
                        "fmla   v30.4h, %15.4h, v4.h[1]     \n"
                        "fmla   v31.4h, %15.4h, v4.h[3]     \n"
                        "fmla   v30.4h, %16.4h, v4.h[2]     \n"
                        "fmla   v31.4h, %16.4h, v5.h[0]     \n"

                        "st1    {v30.4h, v31.4h}, [%0], #16 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v30", "v31");
                }
                // Remaining single output pixel.
                // Each row load fetches 4 values but only lanes 0..2 are used.
                for (; j < outw; j++)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #64]        \n"
                        "ld1    {v30.4h}, [%0]              \n" // sum0

                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ld1    {v0.4h}, [%1]               \n" // r0

                        "fmla   v30.4h, %8.4h, v0.h[0]      \n"
                        "fmla   v30.4h, %9.4h, v0.h[1]      \n"
                        "fmla   v30.4h, %10.4h, v0.h[2]     \n"

                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v1.4h}, [%2]               \n" // r1

                        "fmla   v30.4h, %11.4h, v1.h[0]     \n"
                        "fmla   v30.4h, %12.4h, v1.h[1]     \n"
                        "fmla   v30.4h, %13.4h, v1.h[2]     \n"

                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v2.4h}, [%3]               \n" // r2

                        "fmla   v30.4h, %14.4h, v2.h[0]     \n"
                        "fmla   v30.4h, %15.4h, v2.h[1]     \n"
                        "fmla   v30.4h, %16.4h, v2.h[2]     \n"

                        // advance each input row by 4 bytes = 2 fp16 elements
                        "add    %1, %1, #4                  \n"
                        "add    %2, %2, #4                  \n"
                        "add    %3, %3, #4                  \n"

                        "st1    {v30.4h}, [%0], #8          \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "v0", "v1", "v2", "v30");
                }

                // Jump to the input rows used by the next output row (see tailstep above).
                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            // Next input channel: 9 taps x 4 lanes of weights.
            k0 += 9 * 4;
        }
    }
}


================================================
FILE: src/layer/arm/convolution_3x3_pack1to8_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 3x3 convolution advancing 1 input element per output, using fp16 storage and
// fp16 arithmetic (AArch64 inline assembly).
//
// Input pixels are read as single __fp16 scalars, output pixels are written as
// groups of 8 __fp16 values (one float16x8_t per output pixel).
//
//   bottom_blob  input feature map; rows 0..2 of each channel are the first window
//   top_blob     output feature map; its w/h/c drive the loops, it is filled with
//                the bias and then accumulated into, once per input channel
//   kernel       weights; for output channel p, kernel.channel(p) is read as
//                inch consecutive groups of 9 taps x 8 lanes of __fp16
//   _bias        optional bias, read as 8 __fp16 values per output channel
//   opt          only opt.num_threads is used here (OpenMP thread count)
//
// NOTE(review): the end-of-row step of 2 elements assumes consecutive input rows
// are exactly outw + 2 elements apart -- confirm against the caller.
static void conv3x3s1_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const __fp16* bias = _bias;

    // Each iteration handles one output channel p and writes only to top_blob.channel(p).
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        // Start the accumulation from the bias vector (all zeros when no bias is given).
        float16x8_t _bias0 = bias ? vld1q_f16(bias + p * 8) : vdupq_n_f16((__fp16)0.f);
        out0.fill(_bias0);

        const __fp16* k0 = kernel.channel(p);

        // Accumulate the contribution of every input channel into out0.
        int q = 0;
        for (; q < inch; q++)
        {
            __fp16* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            // Three input rows covered by the 3x3 window.
            const __fp16* r0 = img0.row<const __fp16>(0);
            const __fp16* r1 = img0.row<const __fp16>(1);
            const __fp16* r2 = img0.row<const __fp16>(2);

            // The 9 kernel taps (_kRC = row R, column C), 8 output lanes each.
            float16x8_t _k00 = vld1q_f16(k0);
            float16x8_t _k01 = vld1q_f16(k0 + 8);
            float16x8_t _k02 = vld1q_f16(k0 + 16);
            float16x8_t _k10 = vld1q_f16(k0 + 24);
            float16x8_t _k11 = vld1q_f16(k0 + 32);
            float16x8_t _k12 = vld1q_f16(k0 + 40);
            float16x8_t _k20 = vld1q_f16(k0 + 48);
            float16x8_t _k21 = vld1q_f16(k0 + 56);
            float16x8_t _k22 = vld1q_f16(k0 + 64);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                // 8 output pixels per iteration.
                // Output x uses input columns x, x+1, x+2 of each row, so 10 input
                // values per row are needed: 8 from the q load plus 2 from the s load.
                for (; j + 7 < outw; j += 8)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n" // sum0 sum1 sum2 sum3

                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum4 sum5 sum6 sum7

                        // rewind the output pointer to sum0; the stores at the end re-advance it
                        "sub    %0, %0, #64                 \n"

                        // r0: 8 values (pointer advances 16 bytes) + 2 more values into v1.h[0..1]
                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ldr    q0, [%1], #16               \n" // r0
                        "ldr    s1, [%1]                    \n"

                        "fmla   v24.8h, %8.8h, v0.h[0]      \n"
                        "fmla   v25.8h, %8.8h, v0.h[1]      \n"
                        "fmla   v26.8h, %8.8h, v0.h[2]      \n"
                        "fmla   v27.8h, %8.8h, v0.h[3]      \n"
                        "fmla   v28.8h, %8.8h, v0.h[4]      \n"
                        "fmla   v29.8h, %8.8h, v0.h[5]      \n"
                        "fmla   v30.8h, %8.8h, v0.h[6]      \n"
                        "fmla   v31.8h, %8.8h, v0.h[7]      \n"

                        "fmla   v24.8h, %9.8h, v0.h[1]      \n"
                        "fmla   v25.8h, %9.8h, v0.h[2]      \n"
                        "fmla   v26.8h, %9.8h, v0.h[3]      \n"
                        "fmla   v27.8h, %9.8h, v0.h[4]      \n"
                        "fmla   v28.8h, %9.8h, v0.h[5]      \n"
                        "fmla   v29.8h, %9.8h, v0.h[6]      \n"
                        "fmla   v30.8h, %9.8h, v0.h[7]      \n"
                        "fmla   v31.8h, %9.8h, v1.h[0]      \n"

                        "fmla   v24.8h, %10.8h, v0.h[2]     \n"
                        "fmla   v25.8h, %10.8h, v0.h[3]     \n"
                        "fmla   v26.8h, %10.8h, v0.h[4]     \n"
                        "fmla   v27.8h, %10.8h, v0.h[5]     \n"
                        "fmla   v28.8h, %10.8h, v0.h[6]     \n"
                        "fmla   v29.8h, %10.8h, v0.h[7]     \n"
                        "fmla   v30.8h, %10.8h, v1.h[0]     \n"
                        "fmla   v31.8h, %10.8h, v1.h[1]     \n"

                        // r1: same access pattern as r0, using v2 / v3
                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ldr    q2, [%2], #16               \n" // r1
                        "ldr    s3, [%2]                    \n"

                        "fmla   v24.8h, %11.8h, v2.h[0]     \n"
                        "fmla   v25.8h, %11.8h, v2.h[1]     \n"
                        "fmla   v26.8h, %11.8h, v2.h[2]     \n"
                        "fmla   v27.8h, %11.8h, v2.h[3]     \n"
                        "fmla   v28.8h, %11.8h, v2.h[4]     \n"
                        "fmla   v29.8h, %11.8h, v2.h[5]     \n"
                        "fmla   v30.8h, %11.8h, v2.h[6]     \n"
                        "fmla   v31.8h, %11.8h, v2.h[7]     \n"

                        "fmla   v24.8h, %12.8h, v2.h[1]     \n"
                        "fmla   v25.8h, %12.8h, v2.h[2]     \n"
                        "fmla   v26.8h, %12.8h, v2.h[3]     \n"
                        "fmla   v27.8h, %12.8h, v2.h[4]     \n"
                        "fmla   v28.8h, %12.8h, v2.h[5]     \n"
                        "fmla   v29.8h, %12.8h, v2.h[6]     \n"
                        "fmla   v30.8h, %12.8h, v2.h[7]     \n"
                        "fmla   v31.8h, %12.8h, v3.h[0]     \n"

                        "fmla   v24.8h, %13.8h, v2.h[2]     \n"
                        "fmla   v25.8h, %13.8h, v2.h[3]     \n"
                        "fmla   v26.8h, %13.8h, v2.h[4]     \n"
                        "fmla   v27.8h, %13.8h, v2.h[5]     \n"
                        "fmla   v28.8h, %13.8h, v2.h[6]     \n"
                        "fmla   v29.8h, %13.8h, v2.h[7]     \n"
                        "fmla   v30.8h, %13.8h, v3.h[0]     \n"
                        "fmla   v31.8h, %13.8h, v3.h[1]     \n"

                        // r2: same access pattern as r0, using v4 / v5
                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ldr    q4, [%3], #16               \n" // r2
                        "ldr    s5, [%3]                    \n"

                        "fmla   v24.8h, %14.8h, v4.h[0]     \n"
                        "fmla   v25.8h, %14.8h, v4.h[1]     \n"
                        "fmla   v26.8h, %14.8h, v4.h[2]     \n"
                        "fmla   v27.8h, %14.8h, v4.h[3]     \n"
                        "fmla   v28.8h, %14.8h, v4.h[4]     \n"
                        "fmla   v29.8h, %14.8h, v4.h[5]     \n"
                        "fmla   v30.8h, %14.8h, v4.h[6]     \n"
                        "fmla   v31.8h, %14.8h, v4.h[7]     \n"

                        "fmla   v24.8h, %15.8h, v4.h[1]     \n"
                        "fmla   v25.8h, %15.8h, v4.h[2]     \n"
                        "fmla   v26.8h, %15.8h, v4.h[3]     \n"
                        "fmla   v27.8h, %15.8h, v4.h[4]     \n"
                        "fmla   v28.8h, %15.8h, v4.h[5]     \n"
                        "fmla   v29.8h, %15.8h, v4.h[6]     \n"
                        "fmla   v30.8h, %15.8h, v4.h[7]     \n"
                        "fmla   v31.8h, %15.8h, v5.h[0]     \n"

                        "fmla   v24.8h, %16.8h, v4.h[2]     \n"
                        "fmla   v25.8h, %16.8h, v4.h[3]     \n"
                        "fmla   v26.8h, %16.8h, v4.h[4]     \n"
                        "fmla   v27.8h, %16.8h, v4.h[5]     \n"
                        "fmla   v28.8h, %16.8h, v4.h[6]     \n"
                        "fmla   v29.8h, %16.8h, v4.h[7]     \n"
                        "fmla   v30.8h, %16.8h, v5.h[0]     \n"
                        "fmla   v31.8h, %16.8h, v5.h[1]     \n"

                        // write back the 8 updated sums; the output pointer advances 128 bytes in total
                        "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                        "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                        // operands %4..%7 are the inputs tied to the outputs %0..%3
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
                // 4 output pixels per iteration.
                // Each row load fetches 8 values but only lanes 0..5 are used;
                // the row pointers are advanced explicitly by 8 bytes = 4 fp16 elements.
                for (; j + 3 < outw; j += 4)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0 sum1 sum2 sum3

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ldr    q0, [%1]                    \n" // r0

                        "fmla   v28.8h, %8.8h, v0.h[0]      \n"
                        "fmla   v29.8h, %8.8h, v0.h[1]      \n"
                        "fmla   v30.8h, %8.8h, v0.h[2]      \n"
                        "fmla   v31.8h, %8.8h, v0.h[3]      \n"

                        "fmla   v28.8h, %9.8h, v0.h[1]      \n"
                        "fmla   v29.8h, %9.8h, v0.h[2]      \n"
                        "fmla   v30.8h, %9.8h, v0.h[3]      \n"
                        "fmla   v31.8h, %9.8h, v0.h[4]      \n"

                        "fmla   v28.8h, %10.8h, v0.h[2]     \n"
                        "fmla   v29.8h, %10.8h, v0.h[3]     \n"
                        "fmla   v30.8h, %10.8h, v0.h[4]     \n"
                        "fmla   v31.8h, %10.8h, v0.h[5]     \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ldr    q1, [%2]                    \n" // r1

                        "fmla   v28.8h, %11.8h, v1.h[0]     \n"
                        "fmla   v29.8h, %11.8h, v1.h[1]     \n"
                        "fmla   v30.8h, %11.8h, v1.h[2]     \n"
                        "fmla   v31.8h, %11.8h, v1.h[3]     \n"

                        "fmla   v28.8h, %12.8h, v1.h[1]     \n"
                        "fmla   v29.8h, %12.8h, v1.h[2]     \n"
                        "fmla   v30.8h, %12.8h, v1.h[3]     \n"
                        "fmla   v31.8h, %12.8h, v1.h[4]     \n"

                        "fmla   v28.8h, %13.8h, v1.h[2]     \n"
                        "fmla   v29.8h, %13.8h, v1.h[3]     \n"
                        "fmla   v30.8h, %13.8h, v1.h[4]     \n"
                        "fmla   v31.8h, %13.8h, v1.h[5]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ldr    q2, [%3]                    \n" // r2

                        "fmla   v28.8h, %14.8h, v2.h[0]     \n"
                        "fmla   v29.8h, %14.8h, v2.h[1]     \n"
                        "fmla   v30.8h, %14.8h, v2.h[2]     \n"
                        "fmla   v31.8h, %14.8h, v2.h[3]     \n"

                        "fmla   v28.8h, %15.8h, v2.h[1]     \n"
                        "fmla   v29.8h, %15.8h, v2.h[2]     \n"
                        "fmla   v30.8h, %15.8h, v2.h[3]     \n"
                        "fmla   v31.8h, %15.8h, v2.h[4]     \n"

                        "fmla   v28.8h, %16.8h, v2.h[2]     \n"
                        "fmla   v29.8h, %16.8h, v2.h[3]     \n"
                        "fmla   v30.8h, %16.8h, v2.h[4]     \n"
                        "fmla   v31.8h, %16.8h, v2.h[5]     \n"

                        "add    %1, %1, #8                  \n"
                        "add    %2, %2, #8                  \n"
                        "add    %3, %3, #8                  \n"

                        "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "v0", "v1", "v2", "v28", "v29", "v30", "v31");
                }
                // 2 output pixels per iteration.
                // Each row load fetches exactly the 4 values needed (lanes 0..3);
                // the row pointers advance by 4 bytes = 2 fp16 elements.
                for (; j + 1 < outw; j += 2)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v30.8h, v31.8h}, [%0]      \n" // sum0 sum1

                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ldr    d0, [%1]                    \n" // r0

                        "fmla   v30.8h, %8.8h, v0.h[0]      \n"
                        "fmla   v31.8h, %8.8h, v0.h[1]      \n"
                        "fmla   v30.8h, %9.8h, v0.h[1]      \n"
                        "fmla   v31.8h, %9.8h, v0.h[2]      \n"
                        "fmla   v30.8h, %10.8h, v0.h[2]     \n"
                        "fmla   v31.8h, %10.8h, v0.h[3]     \n"

                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ldr    d1, [%2]                    \n" // r1

                        "fmla   v30.8h, %11.8h, v1.h[0]     \n"
                        "fmla   v31.8h, %11.8h, v1.h[1]     \n"
                        "fmla   v30.8h, %12.8h, v1.h[1]     \n"
                        "fmla   v31.8h, %12.8h, v1.h[2]     \n"
                        "fmla   v30.8h, %13.8h, v1.h[2]     \n"
                        "fmla   v31.8h, %13.8h, v1.h[3]     \n"

                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ldr    d2, [%3]                    \n" // r2

                        "fmla   v30.8h, %14.8h, v2.h[0]     \n"
                        "fmla   v31.8h, %14.8h, v2.h[1]     \n"
                        "fmla   v30.8h, %15.8h, v2.h[1]     \n"
                        "fmla   v31.8h, %15.8h, v2.h[2]     \n"
                        "fmla   v30.8h, %16.8h, v2.h[2]     \n"
                        "fmla   v31.8h, %16.8h, v2.h[3]     \n"

                        "add    %1, %1, #4                  \n"
                        "add    %2, %2, #4                  \n"
                        "add    %3, %3, #4                  \n"

                        "st1    {v30.8h, v31.8h}, [%0], #32 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "v0", "v1", "v2", "v30", "v31");
                }
                // Remaining single output pixel.
                // Each row load fetches 4 values but only lanes 0..2 are used;
                // the row pointers advance by 2 bytes = 1 fp16 element.
                for (; j < outw; j++)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ldr    q30, [%0]                   \n" // sum0

                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ldr    d0, [%1]                    \n" // r0

                        "fmla   v30.8h, %8.8h, v0.h[0]      \n"
                        "fmla   v30.8h, %9.8h, v0.h[1]      \n"
                        "fmla   v30.8h, %10.8h, v0.h[2]     \n"

                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ldr    d1, [%2]                    \n" // r1

                        "fmla   v30.8h, %11.8h, v1.h[0]     \n"
                        "fmla   v30.8h, %12.8h, v1.h[1]     \n"
                        "fmla   v30.8h, %13.8h, v1.h[2]     \n"

                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ldr    d2, [%3]                    \n" // r2

                        "fmla   v30.8h, %14.8h, v2.h[0]     \n"
                        "fmla   v30.8h, %15.8h, v2.h[1]     \n"
                        "fmla   v30.8h, %16.8h, v2.h[2]     \n"

                        "add    %1, %1, #2                  \n"
                        "add    %2, %2, #2                  \n"
                        "add    %3, %3, #2                  \n"

                        "str    q30, [%0], #16              \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "v0", "v1", "v2", "v30");
                }

                // The row pointers have advanced outw elements; skip 2 more to reach
                // the next row -- presumably the 2 trailing columns of a (outw + 2)-wide row.
                r0 += 2;
                r1 += 2;
                r2 += 2;
            }

            // Next input channel: 9 taps x 8 lanes of weights.
            k0 += 9 * 8;
        }
    }
}

// 3x3 convolution, stride 2, for AArch64 NEON with fp16 storage and fp16 arithmetic.
// The input is read as one __fp16 per pixel and the output is written as 8 __fp16 per
// pixel (one 128-bit vector), accumulated in place in top_blob.
//
// bottom_blob  input; for each channel, rows 0, 1 and 2 are the first 3x3 window rows.
// top_blob     output; its w/h/c give the number of output columns/rows/channels.
// kernel       weights; kernel.channel(p) holds, for every input channel in turn,
//              9 taps (k00 k01 k02 k10 ... k22) of 8 __fp16 each (k0 advances by 9 * 8).
// _bias        optional bias; when non-null, 8 __fp16 per output channel at bias + p * 8.
// opt          only opt.num_threads is used, for the OpenMP loop over output channels.
//
// NOTE(review): nothing here checks bounds or applies padding; the caller presumably
// provides an input large enough for every 3x3 window and pre-sized top_blob - confirm.
static void conv3x3s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // Each output row advances the input row pointers by 2 * outw elements (stride 2).
    // tailstep = 2 * w - 2 * outw then moves them to the start of the row two rows below
    // (assumes consecutive rows are w elements apart).
    const int tailstep = w - 2 * outw + w;

    const __fp16* bias = _bias;

    // One output channel per iteration; iterations write disjoint output channels.
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        // Start the accumulators from the bias vector, or from zero when no bias is given.
        float16x8_t _bias0 = bias ? vld1q_f16(bias + p * 8) : vdupq_n_f16((__fp16)0.f);
        out0.fill(_bias0);

        const __fp16* k0 = kernel.channel(p);

        // Accumulate the contribution of every input channel onto the same output channel.
        int q = 0;
        for (; q < inch; q++)
        {
            __fp16* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            // Three consecutive input rows covered by the 3x3 window.
            const __fp16* r0 = img0.row<const __fp16>(0);
            const __fp16* r1 = img0.row<const __fp16>(1);
            const __fp16* r2 = img0.row<const __fp16>(2);

            // The nine kernel taps for this (p, q) pair, each a vector of 8 output lanes.
            float16x8_t _k00 = vld1q_f16(k0);
            float16x8_t _k01 = vld1q_f16(k0 + 8);
            float16x8_t _k02 = vld1q_f16(k0 + 16);
            float16x8_t _k10 = vld1q_f16(k0 + 24);
            float16x8_t _k11 = vld1q_f16(k0 + 32);
            float16x8_t _k12 = vld1q_f16(k0 + 40);
            float16x8_t _k20 = vld1q_f16(k0 + 48);
            float16x8_t _k21 = vld1q_f16(k0 + 56);
            float16x8_t _k22 = vld1q_f16(k0 + 64);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                // Four output pixels per iteration.
                // Operands %0-%3 are read-write pointers (tied to inputs via "0".."3"),
                // so the kernel vectors are numbered %8-%16.
                for (; j + 3 < outw; j += 4)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0 sum1 sum2 sum3

                        // Load 8 input halfs and advance r0 by 16 bytes, then peek a 9th
                        // half (needed by the last tap of the 4th output) without advancing.
                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ldr    q0, [%1], #16               \n" // r0
                        "ldr    h1, [%1]                    \n"

                        // Output n uses input lanes 2n, 2n+1 and 2n+2 of the row.
                        "fmla   v28.8h, %8.8h, v0.h[0]      \n"
                        "fmla   v29.8h, %8.8h, v0.h[2]      \n"
                        "fmla   v30.8h, %8.8h, v0.h[4]      \n"
                        "fmla   v31.8h, %8.8h, v0.h[6]      \n"

                        "fmla   v28.8h, %9.8h, v0.h[1]      \n"
                        "fmla   v29.8h, %9.8h, v0.h[3]      \n"
                        "fmla   v30.8h, %9.8h, v0.h[5]      \n"
                        "fmla   v31.8h, %9.8h, v0.h[7]      \n"

                        "fmla   v28.8h, %10.8h, v0.h[2]     \n"
                        "fmla   v29.8h, %10.8h, v0.h[4]     \n"
                        "fmla   v30.8h, %10.8h, v0.h[6]     \n"
                        "fmla   v31.8h, %10.8h, v1.h[0]     \n"

                        // Same pattern for the second input row.
                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ldr    q2, [%2], #16               \n" // r1
                        "ldr    h3, [%2]                    \n"

                        "fmla   v28.8h, %11.8h, v2.h[0]     \n"
                        "fmla   v29.8h, %11.8h, v2.h[2]     \n"
                        "fmla   v30.8h, %11.8h, v2.h[4]     \n"
                        "fmla   v31.8h, %11.8h, v2.h[6]     \n"

                        "fmla   v28.8h, %12.8h, v2.h[1]     \n"
                        "fmla   v29.8h, %12.8h, v2.h[3]     \n"
                        "fmla   v30.8h, %12.8h, v2.h[5]     \n"
                        "fmla   v31.8h, %12.8h, v2.h[7]     \n"

                        "fmla   v28.8h, %13.8h, v2.h[2]     \n"
                        "fmla   v29.8h, %13.8h, v2.h[4]     \n"
                        "fmla   v30.8h, %13.8h, v2.h[6]     \n"
                        "fmla   v31.8h, %13.8h, v3.h[0]     \n"

                        // Same pattern for the third input row.
                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ldr    q4, [%3], #16               \n" // r2
                        "ldr    h5, [%3]                    \n"

                        "fmla   v28.8h, %14.8h, v4.h[0]     \n"
                        "fmla   v29.8h, %14.8h, v4.h[2]     \n"
                        "fmla   v30.8h, %14.8h, v4.h[4]     \n"
                        "fmla   v31.8h, %14.8h, v4.h[6]     \n"

                        "fmla   v28.8h, %15.8h, v4.h[1]     \n"
                        "fmla   v29.8h, %15.8h, v4.h[3]     \n"
                        "fmla   v30.8h, %15.8h, v4.h[5]     \n"
                        "fmla   v31.8h, %15.8h, v4.h[7]     \n"

                        "fmla   v28.8h, %16.8h, v4.h[2]     \n"
                        "fmla   v29.8h, %16.8h, v4.h[4]     \n"
                        "fmla   v30.8h, %16.8h, v4.h[6]     \n"
                        "fmla   v31.8h, %16.8h, v5.h[0]     \n"

                        // Write the four updated sums back and advance outptr0 by 64 bytes.
                        "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v28", "v29", "v30", "v31");
                }
                // Two output pixels per iteration: 4 input halfs per row plus one peeked half.
                for (; j + 1 < outw; j += 2)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v30.8h, v31.8h}, [%0]      \n" // sum0 sum1

                        // Load 4 halfs and advance r0 by 8 bytes, then peek the 5th half.
                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ldr    d0, [%1], #8                \n" // r0
                        "ldr    h1, [%1]                    \n"

                        "fmla   v30.8h, %8.8h, v0.h[0]      \n"
                        "fmla   v31.8h, %8.8h, v0.h[2]      \n"
                        "fmla   v30.8h, %9.8h, v0.h[1]      \n"
                        "fmla   v31.8h, %9.8h, v0.h[3]      \n"
                        "fmla   v30.8h, %10.8h, v0.h[2]     \n"
                        "fmla   v31.8h, %10.8h, v1.h[0]     \n"

                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ldr    d2, [%2], #8                \n" // r1
                        "ldr    h3, [%2]                    \n"

                        "fmla   v30.8h, %11.8h, v2.h[0]     \n"
                        "fmla   v31.8h, %11.8h, v2.h[2]     \n"
                        "fmla   v30.8h, %12.8h, v2.h[1]     \n"
                        "fmla   v31.8h, %12.8h, v2.h[3]     \n"
                        "fmla   v30.8h, %13.8h, v2.h[2]     \n"
                        "fmla   v31.8h, %13.8h, v3.h[0]     \n"

                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ldr    d4, [%3], #8                \n" // r2
                        "ldr    h5, [%3]                    \n"

                        "fmla   v30.8h, %14.8h, v4.h[0]     \n"
                        "fmla   v31.8h, %14.8h, v4.h[2]     \n"
                        "fmla   v30.8h, %15.8h, v4.h[1]     \n"
                        "fmla   v31.8h, %15.8h, v4.h[3]     \n"
                        "fmla   v30.8h, %16.8h, v4.h[2]     \n"
                        "fmla   v31.8h, %16.8h, v5.h[0]     \n"

                        // Write the two updated sums back and advance outptr0 by 32 bytes.
                        "st1    {v30.8h, v31.8h}, [%0], #32 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v30", "v31");
                }
                // Remaining single output pixel(s).
                for (; j < outw; j++)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ldr    q30, [%0]                   \n" // sum0

                        // NOTE(review): each "ldr d" reads 4 halfs but only lanes 0..2 are
                        // used, so one extra half is read per row - confirm the input
                        // buffer always has room for that.
                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ldr    d0, [%1]                    \n" // r0

                        "fmla   v30.8h, %8.8h, v0.h[0]      \n"
                        "fmla   v30.8h, %9.8h, v0.h[1]      \n"
                        "fmla   v30.8h, %10.8h, v0.h[2]     \n"

                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ldr    d1, [%2]                    \n" // r1

                        "fmla   v30.8h, %11.8h, v1.h[0]     \n"
                        "fmla   v30.8h, %12.8h, v1.h[1]     \n"
                        "fmla   v30.8h, %13.8h, v1.h[2]     \n"

                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ldr    d2, [%3]                    \n" // r2

                        "fmla   v30.8h, %14.8h, v2.h[0]     \n"
                        "fmla   v30.8h, %15.8h, v2.h[1]     \n"
                        "fmla   v30.8h, %16.8h, v2.h[2]     \n"

                        // Advance each row pointer by 4 bytes, i.e. 2 halfs (stride 2).
                        "add    %1, %1, #4                  \n"
                        "add    %2, %2, #4                  \n"
                        "add    %3, %3, #4                  \n"

                        // Write the updated sum back and advance outptr0 by 16 bytes.
                        "str    q30, [%0], #16              \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "cc", "memory", "v0", "v1", "v2", "v30");
                }

                // Skip the unread remainder of this row plus one whole row (vertical stride 2).
                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            // Move on to the 9 x 8 weights of the next input channel.
            k0 += 9 * 8;
        }
    }
}


================================================
FILE: src/layer/arm/convolution_3x3_pack4.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s2_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int tailstep = (w - 2 * outw + w) * 4;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
        out0.fill(_bias0);

        for (int q = 0; q < inch; q++)
        {
            float* outptr0 = out0.row(0);

            const Mat img0 = bottom_blob.channel(q);

            const float* r0 = img0.row(0);
            const float* r1 = img0.row(1);
            const float* r2 = img0.row(2);

            const float* kptr = (const float*)kernel.channel(p).row(q);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0] \n" // sum0 sum1 sum2 sum3

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n" // r00 r01 r02 r03

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n" // r04 r05 r06 r07

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v28.4s}, [%1]              \n" // r08

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%2], #64 \n" // r10 r11 r12 r13

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v28.s[3]    \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%2], #64 \n" // r14 r15 r16 r17

                        "fmla   v20.4s, v24.4s, v8.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v10.s[0]    \n"
                        "fmla   v22.4s, v24.4s, v12.s[0]    \n"
                        "fmla   v23.4s, v24.4s, v14.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v8.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v10.s[1]    \n"
                        "fmla   v22.4s, v25.4s, v12.s[1]    \n"
                        "fmla   v23.4s, v25.4s, v14.s[1]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"

                        "fmla   v20.4s, v26.4s, v8.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v10.s[2]    \n"
                        "fmla   v22.4s, v26.4s, v12.s[2]    \n"
                        "fmla   v23.4s, v26.4s, v14.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v8.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v10.s[3]    \n"
                        "fmla   v22.4s, v27.4s, v12.s[3]    \n"
                        "fmla   v23.4s, v27.4s, v14.s[3]    \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v28.4s}, [%2]              \n" // r18

                        "fmla   v20.4s, v16.4s, v9.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v11.s[0]    \n"
                        "fmla   v22.4s, v16.4s, v13.s[0]    \n"
                        "fmla   v23.4s, v16.4s, v15.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v9.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v11.s[1]    \n"
                        "fmla   v22.4s, v17.4s, v13.s[1]    \n"
                        "fmla   v23.4s, v17.4s, v15.s[1]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"

                        "fmla   v20.4s, v18.4s, v9.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v11.s[2]    \n"
                        "fmla   v22.4s, v18.4s, v13.s[2]    \n"
                        "fmla   v23.4s, v18.4s, v15.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v9.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v11.s[3]    \n"
                        "fmla   v22.4s, v19.4s, v13.s[3]    \n"
                        "fmla   v23.4s, v19.4s, v15.s[3]    \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r20 r21 r22 r23

                        "fmla   v20.4s, v24.4s, v10.s[0]    \n"
                        "fmla   v21.4s, v24.4s, v12.s[0]    \n"
                        "fmla   v22.4s, v24.4s, v14.s[0]    \n"
                        "fmla   v23.4s, v24.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v10.s[1]    \n"
                        "fmla   v21.4s, v25.4s, v12.s[1]    \n"
                        "fmla   v22.4s, v25.4s, v14.s[1]    \n"
                        "fmla   v23.4s, v25.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"

                        "fmla   v20.4s, v26.4s, v10.s[2]    \n"
                        "fmla   v21.4s, v26.4s, v12.s[2]    \n"
                        "fmla   v22.4s, v26.4s, v14.s[2]    \n"
                        "fmla   v23.4s, v26.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v10.s[3]    \n"
                        "fmla   v21.4s, v27.4s, v12.s[3]    \n"
                        "fmla   v22.4s, v27.4s, v14.s[3]    \n"
                        "fmla   v23.4s, v27.4s, v28.s[3]    \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // r24 r25 r26 r27

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v28.4s}, [%3]              \n" // r28

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        //                         "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4] \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v28.s[1]    \n"
                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v28.s[3]    \n"

                        "sub    %4, %4, #512                \n" // kptr -= 8 * 16;

                        "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(kptr)     // %4
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%0, #512]          \n"
                        "vldm       %0, {d24-d31}       \n" // sum0 sum1 sum2 sum3

                        "pld        [%1, #512]          \n"
                        "vldm       %1!, {d0-d7}        \n" // r00 r01 r02 r03

                        "pld        [%1, #512]          \n"
                        "vldm       %1!, {d8-d15}       \n" // r04 r05 r06 r07

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d0-d1}, [%1 :128]  \n" // r08

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"
                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "pld        [%2, #512]          \n"
                        "vldm       %2!, {d8-d15}       \n" // r10 r11 r12 r13

                        "pld        [%2, #512]          \n"
                        "vldm       %2!, {d0-d7}        \n" // r14 r15 r16 r17

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d12[0]     \n"
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d13[0]    \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"
                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "pld        [%2, #128]          \n"
                        "vld1.f32   {d8-d9}, [%2 :128]  \n" // r18

                        "vmla.f32   q12, q8, d10[0]     \n"
                        "vmla.f32   q13, q8, d14[0]     \n"
                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d10[1]     \n"
                        "vmla.f32   q13, q9, d14[1]     \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d11[0]    \n"
                        "vmla.f32   q13, q10, d15[0]    \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d11[1]    \n"
                        "vmla.f32   q13, q11, d15[1]    \n"
                        "vmla.f32   q14, q11, d3[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d12[0]     \n"
                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d12[1]     \n"
                        "vmla.f32   q13, q9, d0[1]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"
                        "vmla.f32   q12, q10, d13[0]    \n"
                        "vmla.f32   q13, q10, d1[0]     \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d13[1]    \n"
                        "vmla.f32   q13, q11, d1[1]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        "pld        [%3, #512]          \n"
                        "vldm       %3!, {d0-d7}        \n" // r20 r21 r22 r23

                        "pld        [%3, #512]          \n"
                        "vldm       %3!, {d8-d15}       \n" // r24 r25 r26 r27

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "pld        [%3, #128]          \n"
                        "vld1.f32   {d0-d1}, [%3 :128]  \n" // r28

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"
                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        //                         "pld        [%4, #512]          \n"
                        "vldm       %4, {d16-d23}       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "sub        %4, %4, #512        \n" // kptr -= 8 * 16;

                        "vstm       %0!, {d24-d31}      \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(kptr)     // %4
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v20.4s, v21.4s}, [%0]      \n" // sum0 sum1

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n" // r00 r01 r02 r03

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"

                        "fmul   v22.4s, v16.4s, v0.s[0]     \n"
                        "fmul   v23.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"

                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v4.4s}, [%1]               \n" // r04

                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"

                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"

                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r10 r11 r12 r13

                        "fmla   v22.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"

                        "fmla   v22.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v4.4s}, [%2]               \n" // r14

                        "fmla   v22.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"

                        "fmla   v22.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"

                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"

                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r20 r21 r22 r23

                        "fmla   v22.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"

                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v4.4s}, [%3]               \n" // r24

                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"

                        //                         "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4] \n"

                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"

                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"

                        "fadd   v20.4s, v20.4s, v22.4s      \n"
                        "fadd   v21.4s, v21.4s, v23.4s      \n"

                        "sub    %4, %4, #512                \n" // kptr -= 8 * 16;

                        "st1    {v20.4s, v21.4s}, [%0], #32 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(kptr)     // %4
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%0, #256]          \n"
                        "vld1.f32   {d24-d27}, [%0 :128] \n" // sum0 sum1

                        "pld        [%1, #512]          \n"
                        "vldm       %1!, {d0-d7}        \n" // r00 r01 r02 r03

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmul.f32   q14, q8, d0[0]      \n"
                        "vmul.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d8-d9}, [%1 :128]  \n" // r04

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"

                        "pld        [%2, #512]          \n"
                        "vldm       %2!, {d0-d7}        \n" // r10 r11 r12 r13

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "pld        [%2, #128]          \n"
                        "vld1.f32   {d8-d9}, [%2 :128]  \n" // r14

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"

                        "pld        [%3, #512]          \n"
                        "vldm       %3!, {d0-d7}        \n" // r20 r21 r22 r23

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "pld        [%3, #128]          \n"
                        "vld1.f32   {d8-d9}, [%3 :128]  \n" // r24

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"

                        //                         "pld        [%4, #512]          \n"
                        "vldm       %4, {d16-d23}       \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"

                        "vadd.f32   q12, q12, q14       \n"
                        "vadd.f32   q13, q13, q15       \n"

                        "sub        %4, %4, #512        \n" // kptr -= 8 * 16;

                        "vst1.f32   {d24-d27}, [%0 :128]! \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(kptr)     // %4
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v20.4s}, [%0]              \n" // sum0

                        "prfm   pldl1keep, [%1, #384]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s}, [%1] \n" // r00 r01 r02

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"

                        "fmul   v21.4s, v16.4s, v0.s[0]     \n"
                        "fmul   v22.4s, v17.4s, v0.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"

                        "fmul   v23.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"

                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"

                        "prfm   pldl1keep, [%2, #384]       \n"
                        "ld1    {v3.4s, v4.4s, v5.4s}, [%2] \n" // r10 r11 r12

                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"

                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"

                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"

                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"

                        "prfm   pldl1keep, [%3, #384]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s}, [%3] \n" // r20 r21 r22

                        "fmla   v21.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"

                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v5.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v0.s[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"

                        "fmla   v23.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"

                        //                         "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4] \n"

                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"

                        "add    %1, %1, #32                 \n"

                        "fadd   v22.4s, v21.4s, v22.4s      \n"

                        "add    %2, %2, #32                 \n"

                        "fadd   v23.4s, v23.4s, v22.4s      \n"

                        "add    %3, %3, #32                 \n"

                        "fadd   v20.4s, v20.4s, v23.4s      \n"

                        "sub    %4, %4, #512                \n" // kptr -= 8 * 16;

                        "st1    {v20.4s}, [%0], #16         \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(kptr)     // %4
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%0, #128]          \n"
                        "vld1.f32   {d24-d25}, [%0 :128] \n" // sum0

                        "pld        [%1, #384]          \n"
                        "vldm       %1, {d0-d5}         \n" // r00 r01 r02

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmul.f32   q13, q8, d0[0]      \n"
                        "vmul.f32   q14, q9, d0[1]      \n"
                        "vmul.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"

                        "pld        [%2, #384]          \n"
                        "vldm       %2, {d0-d5}         \n" // r10 r11 r12

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"

                        "pld        [%3, #384]          \n"
                        "vldm       %3, {d0-d5}         \n" // r20 r21 r22

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"

                        //                         "pld        [%4, #512]          \n"
                        "vldm       %4, {d16-d23}       \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"

                        "vadd.f32   q14, q14, q13       \n"

                        "add        %1, %1, #32         \n"

                        "vadd.f32   q15, q15, q14       \n"

                        "add        %2, %2, #32         \n"

                        "vadd.f32   q12, q12, q15       \n"

                        "add        %3, %3, #32         \n"

                        "sub        %4, %4, #512        \n" // kptr -= 8 * 16;

                        "vst1.f32   {d24-d25}, [%0 :128]! \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(kptr)     // %4
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_3x3_pack4_bf16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s2_pack4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    Mat top_blob_fp32(outw, outh, opt.num_threads, (size_t)4u * 4, 4, opt.workspace_allocator);

    const int tailstep = (w - 2 * outw + w) * 4;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob_fp32.channel(get_omp_thread_num());

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
        out0.fill(_bias0);

        int q = 0;
        for (; q < inch - 1; q++)
        {
            float* outptr0 = out0.row(0);

            const Mat img0 = bottom_blob.channel(q);

            const unsigned short* r0 = img0.row<const unsigned short>(0);
            const unsigned short* r1 = img0.row<const unsigned short>(1);
            const unsigned short* r2 = img0.row<const unsigned short>(2);

            const unsigned short* kptr = (const unsigned short*)kernel.channel(p).row<const unsigned short>(q);

#if __aarch64__
            // 16 * 9
            uint16x8_t _k00_01 = vld1q_u16(kptr);
            uint16x8_t _k00_23 = vld1q_u16(kptr + 8);
            uint16x8_t _k01_01 = vld1q_u16(kptr + 16);
            uint16x8_t _k01_23 = vld1q_u16(kptr + 24);
            uint16x8_t _k02_01 = vld1q_u16(kptr + 32);
            uint16x8_t _k02_23 = vld1q_u16(kptr + 40);
            uint16x8_t _k10_01 = vld1q_u16(kptr + 48);
            uint16x8_t _k10_23 = vld1q_u16(kptr + 56);
            uint16x8_t _k11_01 = vld1q_u16(kptr + 64);
            uint16x8_t _k11_23 = vld1q_u16(kptr + 72);
            uint16x8_t _k12_01 = vld1q_u16(kptr + 80);
            uint16x8_t _k12_23 = vld1q_u16(kptr + 88);
            uint16x8_t _k20_01 = vld1q_u16(kptr + 96);
            uint16x8_t _k20_23 = vld1q_u16(kptr + 104);
            uint16x8_t _k21_01 = vld1q_u16(kptr + 112);
            uint16x8_t _k21_23 = vld1q_u16(kptr + 120);
            uint16x8_t _k22_01 = vld1q_u16(kptr + 128);
            uint16x8_t _k22_23 = vld1q_u16(kptr + 136);
#endif // __aarch64__

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%0] \n" // sum0 sum1 sum2 sum3

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%1], #64 \n" // r00 r01 r02 r03

                        "shll   v0.4s, v4.4h, #16           \n"
                        "shll2  v1.4s, v4.8h, #16           \n"
                        "shll   v2.4s, v5.4h, #16           \n"
                        "shll2  v3.4s, v5.8h, #16           \n"

                        "shll   v4.4s, v6.4h, #16           \n"
                        "shll2  v5.4s, v6.8h, #16           \n"
                        "shll   v6.4s, v7.4h, #16           \n"
                        "shll2  v7.4s, v7.8h, #16           \n"

                        "shll   v8.4s, %8.4h, #16           \n"
                        "shll2  v9.4s, %8.8h, #16           \n"

                        "fmla   v10.4s, v8.4s, v0.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v4.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v6.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v0.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v2.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v4.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v6.s[1]      \n"

                        "shll   v8.4s, %9.4h, #16           \n"
                        "shll2  v9.4s, %9.8h, #16           \n"

                        "fmla   v10.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v6.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v0.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v4.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v6.s[3]      \n"

                        "shll   v8.4s, %10.4h, #16          \n"
                        "shll2  v9.4s, %10.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v1.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v5.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v7.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v1.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v3.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v5.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v7.s[1]      \n"

                        "shll   v8.4s, %11.4h, #16          \n"
                        "shll2  v9.4s, %11.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v5.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v7.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v1.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v3.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v5.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v7.s[3]      \n"

                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ld1    {v0.4h}, [%1]               \n" // r08

                        "shll   v0.4s, v0.4h, #16           \n"

                        "shll   v8.4s, %12.4h, #16          \n"
                        "shll2  v9.4s, %12.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v2.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v6.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v0.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v2.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v4.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v6.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v0.s[1]      \n"

                        "shll   v8.4s, %13.4h, #16          \n"
                        "shll2  v9.4s, %13.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v6.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v4.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v6.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v0.s[3]      \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%2], #64 \n" // r10 r11 r12 r13

                        "shll   v0.4s, v4.4h, #16           \n"
                        "shll2  v1.4s, v4.8h, #16           \n"
                        "shll   v2.4s, v5.4h, #16           \n"
                        "shll2  v3.4s, v5.8h, #16           \n"

                        "shll   v4.4s, v6.4h, #16           \n"
                        "shll2  v5.4s, v6.8h, #16           \n"
                        "shll   v6.4s, v7.4h, #16           \n"
                        "shll2  v7.4s, v7.8h, #16           \n"

                        "shll   v8.4s, %14.4h, #16          \n"
                        "shll2  v9.4s, %14.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v0.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v4.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v6.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v0.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v2.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v4.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v6.s[1]      \n"

                        "shll   v8.4s, %15.4h, #16          \n"
                        "shll2  v9.4s, %15.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v6.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v0.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v4.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v6.s[3]      \n"

                        "shll   v8.4s, %16.4h, #16          \n"
                        "shll2  v9.4s, %16.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v1.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v5.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v7.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v1.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v3.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v5.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v7.s[1]      \n"

                        "shll   v8.4s, %17.4h, #16          \n"
                        "shll2  v9.4s, %17.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v5.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v7.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v1.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v3.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v5.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v7.s[3]      \n"

                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v0.4h}, [%2]               \n" // r18

                        "shll   v0.4s, v0.4h, #16           \n"

                        "shll   v8.4s, %18.4h, #16          \n"
                        "shll2  v9.4s, %18.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v2.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v6.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v0.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v2.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v4.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v6.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v0.s[1]      \n"

                        "shll   v8.4s, %19.4h, #16          \n"
                        "shll2  v9.4s, %19.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v6.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v4.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v6.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v0.s[3]      \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%3], #64 \n" // r20 r21 r22 r23

                        "shll   v0.4s, v4.4h, #16           \n"
                        "shll2  v1.4s, v4.8h, #16           \n"
                        "shll   v2.4s, v5.4h, #16           \n"
                        "shll2  v3.4s, v5.8h, #16           \n"

                        "shll   v4.4s, v6.4h, #16           \n"
                        "shll2  v5.4s, v6.8h, #16           \n"
                        "shll   v6.4s, v7.4h, #16           \n"
                        "shll2  v7.4s, v7.8h, #16           \n"

                        "shll   v8.4s, %20.4h, #16          \n"
                        "shll2  v9.4s, %20.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v0.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v4.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v6.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v0.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v2.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v4.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v6.s[1]      \n"

                        "shll   v8.4s, %21.4h, #16          \n"
                        "shll2  v9.4s, %21.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v6.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v0.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v4.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v6.s[3]      \n"

                        "shll   v8.4s, %22.4h, #16          \n"
                        "shll2  v9.4s, %22.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v1.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v5.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v7.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v1.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v3.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v5.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v7.s[1]      \n"

                        "shll   v8.4s, %23.4h, #16          \n"
                        "shll2  v9.4s, %23.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v5.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v7.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v1.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v3.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v5.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v7.s[3]      \n"

                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v0.4h}, [%3]               \n" // r28

                        "shll   v0.4s, v0.4h, #16           \n"

                        "shll   v8.4s, %24.4h, #16          \n"
                        "shll2  v9.4s, %24.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v2.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v6.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v0.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v2.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v4.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v6.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v0.s[1]      \n"

                        "shll   v8.4s, %25.4h, #16          \n"
                        "shll2  v9.4s, %25.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v6.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v4.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v6.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v0.s[3]      \n"

                        "st1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00_01), // %8
                        "w"(_k00_23), // %9
                        "w"(_k01_01), // %10
                        "w"(_k01_23), // %11
                        "w"(_k02_01), // %12
                        "w"(_k02_23), // %13
                        "w"(_k10_01), // %14
                        "w"(_k10_23), // %15
                        "w"(_k11_01), // %16
                        "w"(_k11_23), // %17
                        "w"(_k12_01), // %18
                        "w"(_k12_23), // %19
                        "w"(_k20_01), // %20
                        "w"(_k20_23), // %21
                        "w"(_k21_01), // %22
                        "w"(_k21_23), // %23
                        "w"(_k22_01), // %24
                        "w"(_k22_23)  // %25
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%0, #512]          \n"
                        "vldm       %0, {d24-d31}       \n" // sum0 sum1 sum2 sum3

                        "pld        [%1, #512]          \n"
                        "vldm       %1!, {d8-d15}       \n" // r00 r01 r02 r03 r04 r05 r06 r07

                        "vshll.u16  q0, d8, #16         \n"
                        "vshll.u16  q1, d9, #16         \n"
                        "vshll.u16  q2, d10, #16        \n"
                        "vshll.u16  q3, d11, #16        \n"

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"

                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%1, #64]           \n"
                        "vld1.f32   {d1}, [%1 :64]      \n" // r08

                        "vshll.u16  q0, d1, #16         \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"

                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"

                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "pld        [%2, #512]          \n"
                        "vldm       %2!, {d8-d15}       \n" // r10 r11 r12 r13 r14 r15 r16 r17

                        "vshll.u16  q0, d8, #16         \n"
                        "vshll.u16  q1, d9, #16         \n"
                        "vshll.u16  q2, d10, #16        \n"
                        "vshll.u16  q3, d11, #16        \n"

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"

                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%2, #64]           \n"
                        "vld1.f32   {d1}, [%2 :64]      \n" // r18

                        "vshll.u16  q0, d1, #16         \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"

                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"

                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "pld        [%3, #256]          \n"
                        "vldm       %3!, {d8-d15}       \n" // r20 r21 r22 r23 r24 r25 r26 r27

                        "vshll.u16  q0, d8, #16         \n"
                        "vshll.u16  q1, d9, #16         \n"
                        "vshll.u16  q2, d10, #16        \n"
                        "vshll.u16  q3, d11, #16        \n"

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"

                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%3, #64]           \n"
                        "vld1.f32   {d1}, [%3 :64]      \n" // r28

                        "vshll.u16  q0, d1, #16         \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"

                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        //                         "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128] \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"

                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "sub        %4, %4, #256        \n" // kptr -= 8 * 16;

                        "vstm       %0!, {d24-d31}      \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(kptr)     // %4
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v12.4s, v13.4s}, [%0]      \n" // sum0 sum1

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n" // r00 r01 r02 r03

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "shll   v6.4s, %8.4h, #16           \n"
                        "shll2  v7.4s, %8.8h, #16           \n"
                        "shll   v8.4s, %9.4h, #16           \n"
                        "shll2  v9.4s, %9.8h, #16           \n"

                        "fmul   v10.4s, v6.4s, v0.s[0]      \n"
                        "fmul   v11.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v0.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v2.s[1]      \n"

                        "fmla   v10.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v0.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v2.s[3]      \n"

                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ld1    {v4.4h}, [%1]               \n" // r04

                        "shll   v4.4s, v4.4h, #16           \n"

                        "shll   v6.4s, %10.4h, #16          \n"
                        "shll2  v7.4s, %10.8h, #16          \n"
                        "shll   v8.4s, %11.4h, #16          \n"
                        "shll2  v9.4s, %11.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v1.s[0]      \n"
                        "fmla   v11.4s, v6.4s, v3.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v1.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v3.s[1]      \n"

                        "fmla   v10.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v1.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v3.s[3]      \n"

                        "shll   v6.4s, %12.4h, #16          \n"
                        "shll2  v7.4s, %12.8h, #16          \n"
                        "shll   v8.4s, %13.4h, #16          \n"
                        "shll2  v9.4s, %13.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v11.4s, v6.4s, v4.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v2.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v4.s[1]      \n"

                        "fmla   v10.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v4.s[3]      \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2], #32 \n" // r10 r11 r12 r13

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "shll   v6.4s, %14.4h, #16          \n"
                        "shll2  v7.4s, %14.8h, #16          \n"
                        "shll   v8.4s, %15.4h, #16          \n"
                        "shll2  v9.4s, %15.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v0.s[0]      \n"
                        "fmla   v11.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v0.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v2.s[1]      \n"

                        "fmla   v10.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v0.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v2.s[3]      \n"

                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v4.4h}, [%2]               \n" // r14

                        "shll   v4.4s, v4.4h, #16           \n"

                        "shll   v6.4s, %16.4h, #16          \n"
                        "shll2  v7.4s, %16.8h, #16          \n"
                        "shll   v8.4s, %17.4h, #16          \n"
                        "shll2  v9.4s, %17.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v1.s[0]      \n"
                        "fmla   v11.4s, v6.4s, v3.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v1.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v3.s[1]      \n"

                        "fmla   v10.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v1.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v3.s[3]      \n"

                        "shll   v6.4s, %18.4h, #16          \n"
                        "shll2  v7.4s, %18.8h, #16          \n"
                        "shll   v8.4s, %19.4h, #16          \n"
                        "shll2  v9.4s, %19.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v11.4s, v6.4s, v4.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v2.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v4.s[1]      \n"

                        "fmla   v10.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v4.s[3]      \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%3], #32 \n" // r20 r21 r22 r23

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "shll   v6.4s, %20.4h, #16          \n"
                        "shll2  v7.4s, %20.8h, #16          \n"
                        "shll   v8.4s, %21.4h, #16          \n"
                        "shll2  v9.4s, %21.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v0.s[0]      \n"
                        "fmla   v11.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v0.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v2.s[1]      \n"

                        "fmla   v10.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v0.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v2.s[3]      \n"

                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v4.4h}, [%3]               \n" // r24

                        "shll   v4.4s, v4.4h, #16           \n"

                        "shll   v6.4s, %22.4h, #16          \n"
                        "shll2  v7.4s, %22.8h, #16          \n"
                        "shll   v8.4s, %23.4h, #16          \n"
                        "shll2  v9.4s, %23.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v1.s[0]      \n"
                        "fmla   v11.4s, v6.4s, v3.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v1.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v3.s[1]      \n"

                        "fmla   v10.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v1.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v3.s[3]      \n"

                        "shll   v6.4s, %24.4h, #16          \n"
                        "shll2  v7.4s, %24.8h, #16          \n"
                        "shll   v8.4s, %25.4h, #16          \n"
                        "shll2  v9.4s, %25.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v11.4s, v6.4s, v4.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v2.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v4.s[1]      \n"

                        "fmla   v10.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v4.s[3]      \n"

                        "fadd   v12.4s, v10.4s, v12.4s      \n"
                        "fadd   v13.4s, v11.4s, v13.4s      \n"

                        "st1    {v12.4s, v13.4s}, [%0], #32 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00_01), // %8
                        "w"(_k00_23), // %9
                        "w"(_k01_01), // %10
                        "w"(_k01_23), // %11
                        "w"(_k02_01), // %12
                        "w"(_k02_23), // %13
                        "w"(_k10_01), // %14
                        "w"(_k10_23), // %15
                        "w"(_k11_01), // %16
                        "w"(_k11_23), // %17
                        "w"(_k12_01), // %18
                        "w"(_k12_23), // %19
                        "w"(_k20_01), // %20
                        "w"(_k20_23), // %21
                        "w"(_k21_01), // %22
                        "w"(_k21_23), // %23
                        "w"(_k22_01), // %24
                        "w"(_k22_23)  // %25
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%0, #256]          \n"
                        "vld1.f32   {d28-d31}, [%0 :128] \n" // sum0 sum1

                        "pld        [%1, #256]          \n"
                        "vld1.u16   {d4-d7}, [%1 :64]!  \n" // r00 r01 r02 r03

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmul.f32   q12, q8, d0[0]      \n"
                        "vmul.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"

                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%1, #64]           \n"
                        "vld1.f32   {d9}, [%1 :64]      \n" // r04

                        "vshll.u16  q4, d9, #16         \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"

                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q11, d3[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"

                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        "pld        [%2, #256]          \n"
                        "vld1.u16   {d4-d7}, [%2 :64]!  \n" // r10 r11 r12 r13

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"

                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%2, #64]           \n"
                        "vld1.f32   {d9}, [%2 :64]      \n" // r14

                        "vshll.u16  q4, d9, #16         \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"

                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q11, d3[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"

                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        "pld        [%3, #256]          \n"
                        "vld1.u16   {d4-d7}, [%3 :64]!  \n" // r20 r21 r22 r23

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"

                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%3, #64]           \n"
                        "vld1.f32   {d9}, [%3 :64]      \n" // r24

                        "vshll.u16  q4, d9, #16         \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"

                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q11, d3[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        //                         "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128] \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"

                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        "vadd.f32   q14, q12, q14       \n"
                        "vadd.f32   q15, q13, q15       \n"

                        "sub        %4, %4, #256        \n" // kptr -= 8 * 16;

                        "vst1.f32   {d28-d31}, [%0 :128]! \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(kptr)     // %4
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v13.4s}, [%0]              \n" // sum0

                        "prfm   pldl1keep, [%1, #192]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h}, [%1] \n" // r00 r01 r02

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        "shll   v6.4s, %8.4h, #16           \n"
                        "shll2  v7.4s, %8.8h, #16           \n"

                        "fmul   v10.4s, v6.4s, v0.s[0]      \n"
                        "fmul   v11.4s, v7.4s, v0.s[1]      \n"

                        "shll   v8.4s, %9.4h, #16           \n"
                        "shll2  v9.4s, %9.8h, #16           \n"

                        "fmul   v12.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v0.s[3]      \n"

                        "shll   v6.4s, %10.4h, #16          \n"
                        "shll2  v7.4s, %10.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v1.s[0]      \n"
                        "fmla   v11.4s, v7.4s, v1.s[1]      \n"

                        "shll   v8.4s, %11.4h, #16          \n"
                        "shll2  v9.4s, %11.8h, #16          \n"

                        "fmla   v12.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v1.s[3]      \n"

                        "shll   v6.4s, %12.4h, #16          \n"
                        "shll2  v7.4s, %12.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v11.4s, v7.4s, v2.s[1]      \n"

                        "shll   v8.4s, %13.4h, #16          \n"
                        "shll2  v9.4s, %13.8h, #16          \n"

                        "fmla   v12.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v2.s[3]      \n"

                        "prfm   pldl1keep, [%2, #192]       \n"
                        "ld1    {v3.4h, v4.4h, v5.4h}, [%2] \n" // r10 r11 r12

                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"

                        "shll   v6.4s, %14.4h, #16          \n"
                        "shll2  v7.4s, %14.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v3.s[0]      \n"
                        "fmla   v11.4s, v7.4s, v3.s[1]      \n"

                        "shll   v8.4s, %15.4h, #16          \n"
                        "shll2  v9.4s, %15.8h, #16          \n"

                        "fmla   v12.4s, v8.4s, v3.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v3.s[3]      \n"

                        "shll   v6.4s, %16.4h, #16          \n"
                        "shll2  v7.4s, %16.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v4.s[0]      \n"
                        "fmla   v11.4s, v7.4s, v4.s[1]      \n"

                        "shll   v8.4s, %17.4h, #16          \n"
                        "shll2  v9.4s, %17.8h, #16          \n"

                        "fmla   v12.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v4.s[3]      \n"

                        "shll   v6.4s, %18.4h, #16          \n"
                        "shll2  v7.4s, %18.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v5.s[0]      \n"
                        "fmla   v11.4s, v7.4s, v5.s[1]      \n"

                        "shll   v8.4s, %19.4h, #16          \n"
                        "shll2  v9.4s, %19.8h, #16          \n"

                        "fmla   v12.4s, v8.4s, v5.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v5.s[3]      \n"

                        "prfm   pldl1keep, [%3, #192]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h}, [%3] \n" // r20 r21 r22

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        "shll   v6.4s, %20.4h, #16          \n"
                        "shll2  v7.4s, %20.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v0.s[0]      \n"
                        "fmla   v11.4s, v7.4s, v0.s[1]      \n"

                        "shll   v8.4s, %21.4h, #16          \n"
                        "shll2  v9.4s, %21.8h, #16          \n"

                        "fmla   v12.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v0.s[3]      \n"

                        "shll   v6.4s, %22.4h, #16          \n"
                        "shll2  v7.4s, %22.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v1.s[0]      \n"
                        "fmla   v11.4s, v7.4s, v1.s[1]      \n"

                        "shll   v8.4s, %23.4h, #16          \n"
                        "shll2  v9.4s, %23.8h, #16          \n"

                        "fmla   v12.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v1.s[3]      \n"

                        "shll   v6.4s, %24.4h, #16          \n"
                        "shll2  v7.4s, %24.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v11.4s, v7.4s, v2.s[1]      \n"

                        "shll   v8.4s, %25.4h, #16          \n"
                        "shll2  v9.4s, %25.8h, #16          \n"

                        "fmla   v12.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v2.s[3]      \n"

                        "fadd   v11.4s, v10.4s, v11.4s      \n"

                        "add    %1, %1, #16                 \n"
                        "fadd   v13.4s, v12.4s, v13.4s      \n"

                        "add    %2, %2, #16                 \n"
                        "fadd   v13.4s, v11.4s, v13.4s      \n"

                        "add    %3, %3, #16                 \n"

                        "st1    {v13.4s}, [%0], #16         \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00_01), // %8
                        "w"(_k00_23), // %9
                        "w"(_k01_01), // %10
                        "w"(_k01_23), // %11
                        "w"(_k02_01), // %12
                        "w"(_k02_23), // %13
                        "w"(_k10_01), // %14
                        "w"(_k10_23), // %15
                        "w"(_k11_01), // %16
                        "w"(_k11_23), // %17
                        "w"(_k12_01), // %18
                        "w"(_k12_23), // %19
                        "w"(_k20_01), // %20
                        "w"(_k20_23), // %21
                        "w"(_k21_01), // %22
                        "w"(_k21_23), // %23
                        "w"(_k22_01), // %24
                        "w"(_k22_23)  // %25
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13");
#else  // __aarch64__
                    // ARMv7 NEON path: computes one group of 4 f32 outputs.
                    //   %0 = outptr0 : f32 accumulator, loaded at the start and stored (post-incremented) at the end
                    //   %1/%2/%3 = r0/r1/r2 : three rows of 16-bit input, 3 groups of 4 values read per row
                    //   %4 = kptr : 16-bit weights, 9 loads of 16 values (one load per 3x3 tap)
                    // Every 16-bit value is widened with "vshll.u16 ..., #16", which places it in the
                    // upper half of a 32-bit lane so it can be consumed as an f32 operand.
                    // Four independent accumulators (q12..q15) are used and summed at the end.
                    asm volatile(
                        "pld        [%0, #128]          \n"
                        "vld1.f32   {d30-d31}, [%0 :128] \n" // sum0 -> q15

                        "pld        [%1, #192]          \n"
                        "vld1.u16   {d2-d4}, [%1 :64]   \n" // r00 r01 r02

                        // Widen in ascending order: q0 = d0:d1, q1 = d2:d3, q2 = d4:d5, so each
                        // source d register is read before the q register aliasing it is written.
                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vshll.u16  q2, d4, #16         \n"

                        // tap (row 0, col 0): weights x r00 lanes
                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // vmul (not vmla) initialises q12/q13/q14; q15 already holds sum0
                        "vmul.f32   q12, q8, d0[0]      \n"
                        "vmul.f32   q13, q9, d0[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmul.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        // tap (row 0, col 1): weights x r01 lanes
                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q11, d3[1]     \n"

                        // tap (row 0, col 2): weights x r02 lanes
                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        // second input row
                        "pld        [%2, #192]          \n"
                        "vld1.u16   {d2-d4}, [%2 :64]   \n" // r10 r11 r12

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vshll.u16  q2, d4, #16         \n"

                        // tap (row 1, col 0): weights x r10 lanes
                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q9, d0[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        // tap (row 1, col 1): weights x r11 lanes
                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q11, d3[1]     \n"

                        // tap (row 1, col 2): weights x r12 lanes
                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        // third input row
                        "pld        [%3, #192]          \n"
                        "vld1.u16   {d2-d4}, [%3 :64]   \n" // r20 r21 r22

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vshll.u16  q2, d4, #16         \n"

                        // tap (row 2, col 0): weights x r20 lanes
                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q9, d0[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        // tap (row 2, col 1): weights x r21 lanes
                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q11, d3[1]     \n"

                        // tap (row 2, col 2): weights x r22 lanes.
                        // This last weight load has no "!" writeback, so kptr has advanced
                        // by 8 loads x 32 bytes = 256 bytes in total at this point.
                        //                         "pld        [%4, #256]          \n"
                        "vld1.u16   {d20-d23}, [%4 :128] \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        // Advance each input row pointer by 16 bytes (8 16-bit values) while
                        // reducing the four partial accumulators: q15 = (q12+q13) + (q14+q15).
                        "add        %1, %1, #16         \n"
                        "vadd.f32   q13, q12, q13       \n"

                        "add        %2, %2, #16         \n"
                        "vadd.f32   q15, q14, q15       \n"

                        "add        %3, %3, #16         \n"
                        "vadd.f32   q15, q13, q15       \n"

                        // Rewind kptr to the first tap so the next iteration reuses the same weights.
                        "sub        %4, %4, #256        \n" // kptr -= 8 * 16 * 2;

                        "vst1.f32   {d30-d31}, [%0 :128]! \n" // store the f32 sum and step outptr0 by 16 bytes

                        // All pointers are read-write: each "=r" output is tied to the
                        // matching-numbered input below.
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(kptr)     // %4
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }
        }
        for (; q < inch; q++)
        {
            // Destination for this pass: 16-bit output in channel p of top_blob.
            unsigned short* outptr0_bf16 = top_blob.channel(p);

            // f32 values read back from out0 and used as the initial sums below.
            // NOTE(review): presumably the partial sums accumulated by the preceding
            // input-channel loop - confirm against the part of the function above.
            const float* outptr0 = out0.row(0);

            const Mat img0 = bottom_blob.channel(q);

            // First three rows of input channel q, read as raw 16-bit values.
            const unsigned short* r0 = img0.row<const unsigned short>(0);
            const unsigned short* r1 = img0.row<const unsigned short>(1);
            const unsigned short* r2 = img0.row<const unsigned short>(2);

            // Weights for (output channel p, input channel q): row q of kernel channel p.
            const unsigned short* kptr = (const unsigned short*)kernel.channel(p).row<const unsigned short>(q);

#if __aarch64__
            // Preload all 9 taps x 16 weights (144 16-bit values) into 18 vector registers.
            // _kRC_01 holds the first 8 values of tap (row R, col C), _kRC_23 the last 8.
            // These are passed to the inline asm below as "w" operands; the non-aarch64
            // build does not declare them.
            uint16x8_t _k00_01 = vld1q_u16(kptr);
            uint16x8_t _k00_23 = vld1q_u16(kptr + 8);
            uint16x8_t _k01_01 = vld1q_u16(kptr + 16);
            uint16x8_t _k01_23 = vld1q_u16(kptr + 24);
            uint16x8_t _k02_01 = vld1q_u16(kptr + 32);
            uint16x8_t _k02_23 = vld1q_u16(kptr + 40);
            uint16x8_t _k10_01 = vld1q_u16(kptr + 48);
            uint16x8_t _k10_23 = vld1q_u16(kptr + 56);
            uint16x8_t _k11_01 = vld1q_u16(kptr + 64);
            uint16x8_t _k11_23 = vld1q_u16(kptr + 72);
            uint16x8_t _k12_01 = vld1q_u16(kptr + 80);
            uint16x8_t _k12_23 = vld1q_u16(kptr + 88);
            uint16x8_t _k20_01 = vld1q_u16(kptr + 96);
            uint16x8_t _k20_23 = vld1q_u16(kptr + 104);
            uint16x8_t _k21_01 = vld1q_u16(kptr + 112);
            uint16x8_t _k21_23 = vld1q_u16(kptr + 120);
            uint16x8_t _k22_01 = vld1q_u16(kptr + 128);
            uint16x8_t _k22_23 = vld1q_u16(kptr + 136);
#endif // __aarch64__

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    // AArch64 NEON path: computes four groups of 4 outputs per iteration.
                    //   %0 = outptr0_bf16 : 16-bit destination (4 x 4 values stored, post-incremented)
                    //   %1 = outptr0      : f32 initial sums (4 x 4 values loaded, post-incremented)
                    //   %2/%3/%4 = r0/r1/r2 : three rows of 16-bit input
                    //   %10..%27 : preloaded weights, two 8-lane registers per 3x3 tap
                    // 16-bit values are widened with shll/shll2 #16 (upper half of a 32-bit lane)
                    // before use as f32 operands. For each tap, v8/v9 hold the widened low/high
                    // halves of one weight register and are multiplied by one input lane each.
                    // The four accumulators v10..v13 read input groups 0/2/4/6 (col 0),
                    // 1/3/5/7 (col 1) and 2/4/6/8 (col 2) of each row.
                    asm volatile(
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%1], #64 \n" // sum0 sum1 sum2 sum3

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%2], #64 \n" // 32 values: groups r00..r07, two per register

                        // Widen into v0..v7 = r00..r07. v4..v7 are both source and destination,
                        // so each one is consumed before it is overwritten.
                        "shll   v0.4s, v4.4h, #16           \n"
                        "shll2  v1.4s, v4.8h, #16           \n"
                        "shll   v2.4s, v5.4h, #16           \n"
                        "shll2  v3.4s, v5.8h, #16           \n"

                        "shll   v4.4s, v6.4h, #16           \n"
                        "shll2  v5.4s, v6.8h, #16           \n"
                        "shll   v6.4s, v7.4h, #16           \n"
                        "shll2  v7.4s, v7.8h, #16           \n"

                        // tap k00, first half (input lanes 0,1)
                        "shll   v8.4s, %10.4h, #16          \n"
                        "shll2  v9.4s, %10.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v0.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v4.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v6.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v0.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v2.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v4.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v6.s[1]      \n"

                        // tap k00, second half (input lanes 2,3)
                        "shll   v8.4s, %11.4h, #16          \n"
                        "shll2  v9.4s, %11.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v6.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v0.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v4.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v6.s[3]      \n"

                        // tap k01, first half
                        "shll   v8.4s, %12.4h, #16          \n"
                        "shll2  v9.4s, %12.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v1.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v5.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v7.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v1.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v3.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v5.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v7.s[1]      \n"

                        // tap k01, second half
                        "shll   v8.4s, %13.4h, #16          \n"
                        "shll2  v9.4s, %13.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v5.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v7.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v1.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v3.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v5.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v7.s[3]      \n"

                        // Peek at the 9th group without advancing r0; v0 (r00) is no longer
                        // needed for this row, so it is reused to hold r08.
                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v0.4h}, [%2]               \n" // r08

                        "shll   v0.4s, v0.4h, #16           \n"

                        // tap k02, first half
                        "shll   v8.4s, %14.4h, #16          \n"
                        "shll2  v9.4s, %14.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v2.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v6.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v0.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v2.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v4.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v6.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v0.s[1]      \n"

                        // tap k02, second half
                        "shll   v8.4s, %15.4h, #16          \n"
                        "shll2  v9.4s, %15.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v6.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v4.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v6.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v0.s[3]      \n"

                        // second input row, same pattern as the first
                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%3], #64 \n" // 32 values: groups r10..r17, two per register

                        "shll   v0.4s, v4.4h, #16           \n"
                        "shll2  v1.4s, v4.8h, #16           \n"
                        "shll   v2.4s, v5.4h, #16           \n"
                        "shll2  v3.4s, v5.8h, #16           \n"

                        "shll   v4.4s, v6.4h, #16           \n"
                        "shll2  v5.4s, v6.8h, #16           \n"
                        "shll   v6.4s, v7.4h, #16           \n"
                        "shll2  v7.4s, v7.8h, #16           \n"

                        // tap k10, first half
                        "shll   v8.4s, %16.4h, #16          \n"
                        "shll2  v9.4s, %16.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v0.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v4.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v6.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v0.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v2.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v4.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v6.s[1]      \n"

                        // tap k10, second half
                        "shll   v8.4s, %17.4h, #16          \n"
                        "shll2  v9.4s, %17.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v6.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v0.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v4.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v6.s[3]      \n"

                        // tap k11, first half
                        "shll   v8.4s, %18.4h, #16          \n"
                        "shll2  v9.4s, %18.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v1.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v5.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v7.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v1.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v3.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v5.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v7.s[1]      \n"

                        // tap k11, second half
                        "shll   v8.4s, %19.4h, #16          \n"
                        "shll2  v9.4s, %19.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v5.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v7.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v1.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v3.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v5.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v7.s[3]      \n"

                        // 9th group of row 1, loaded without advancing r1; reuses v0
                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v0.4h}, [%3]               \n" // r18

                        "shll   v0.4s, v0.4h, #16           \n"

                        // tap k12, first half
                        "shll   v8.4s, %20.4h, #16          \n"
                        "shll2  v9.4s, %20.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v2.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v6.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v0.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v2.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v4.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v6.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v0.s[1]      \n"

                        // tap k12, second half
                        "shll   v8.4s, %21.4h, #16          \n"
                        "shll2  v9.4s, %21.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v6.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v4.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v6.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v0.s[3]      \n"

                        // third input row, same pattern again
                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%4], #64 \n" // 32 values: groups r20..r27, two per register

                        "shll   v0.4s, v4.4h, #16           \n"
                        "shll2  v1.4s, v4.8h, #16           \n"
                        "shll   v2.4s, v5.4h, #16           \n"
                        "shll2  v3.4s, v5.8h, #16           \n"

                        "shll   v4.4s, v6.4h, #16           \n"
                        "shll2  v5.4s, v6.8h, #16           \n"
                        "shll   v6.4s, v7.4h, #16           \n"
                        "shll2  v7.4s, v7.8h, #16           \n"

                        // tap k20, first half
                        "shll   v8.4s, %22.4h, #16          \n"
                        "shll2  v9.4s, %22.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v0.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v4.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v6.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v0.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v2.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v4.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v6.s[1]      \n"

                        // tap k20, second half
                        "shll   v8.4s, %23.4h, #16          \n"
                        "shll2  v9.4s, %23.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v6.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v0.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v4.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v6.s[3]      \n"

                        // tap k21, first half
                        "shll   v8.4s, %24.4h, #16          \n"
                        "shll2  v9.4s, %24.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v1.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v5.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v7.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v1.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v3.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v5.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v7.s[1]      \n"

                        // tap k21, second half
                        "shll   v8.4s, %25.4h, #16          \n"
                        "shll2  v9.4s, %25.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v5.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v7.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v1.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v3.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v5.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v7.s[3]      \n"

                        // 9th group of row 2, loaded without advancing r2; reuses v0
                        "prfm   pldl1keep, [%4, #64]        \n"
                        "ld1    {v0.4h}, [%4]               \n" // r28

                        "shll   v0.4s, v0.4h, #16           \n"

                        // tap k22, first half
                        "shll   v8.4s, %26.4h, #16          \n"
                        "shll2  v9.4s, %26.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v2.s[0]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[0]      \n"
                        "fmla   v12.4s, v8.4s, v6.s[0]      \n"
                        "fmla   v13.4s, v8.4s, v0.s[0]      \n"
                        "fmla   v10.4s, v9.4s, v2.s[1]      \n"
                        "fmla   v11.4s, v9.4s, v4.s[1]      \n"
                        "fmla   v12.4s, v9.4s, v6.s[1]      \n"
                        "fmla   v13.4s, v9.4s, v0.s[1]      \n"

                        // tap k22, second half
                        "shll   v8.4s, %27.4h, #16          \n"
                        "shll2  v9.4s, %27.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v12.4s, v8.4s, v6.s[2]      \n"
                        "fmla   v13.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v10.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v11.4s, v9.4s, v4.s[3]      \n"
                        "fmla   v12.4s, v9.4s, v6.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v0.s[3]      \n"

                        // Narrow f32 -> 16-bit by keeping the upper 16 bits of every lane
                        // (shrn is a plain shift, so the low bits are truncated, not rounded).
                        "shrn   v10.4h, v10.4s, #16         \n"
                        "shrn   v11.4h, v11.4s, #16         \n"
                        "shrn   v12.4h, v12.4s, #16         \n"
                        "shrn   v13.4h, v13.4s, #16         \n"

                        "st1    {v10.4h, v11.4h, v12.4h, v13.4h}, [%0], #32 \n"

                        // Pointers are read-write ("=r" outputs tied to the matching-numbered
                        // inputs %5..%9); weight registers are read-only "w" inputs.
                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2)            // %4
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00_01), // %10
                        "w"(_k00_23), // %11
                        "w"(_k01_01), // %12
                        "w"(_k01_23), // %13
                        "w"(_k02_01), // %14
                        "w"(_k02_23), // %15
                        "w"(_k10_01), // %16
                        "w"(_k10_23), // %17
                        "w"(_k11_01), // %18
                        "w"(_k11_23), // %19
                        "w"(_k12_01), // %20
                        "w"(_k12_23), // %21
                        "w"(_k20_01), // %22
                        "w"(_k20_23), // %23
                        "w"(_k21_01), // %24
                        "w"(_k21_23), // %25
                        "w"(_k22_01), // %26
                        "w"(_k22_23)  // %27
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13");
#else  // __aarch64__
                    // ARMv7 NEON path: updates four horizontally adjacent outputs of a 3x3 kernel.
                    //
                    // Operands (see the constraint list at the bottom):
                    //   %0 outptr0_bf16  destination for the narrowed 16-bit results (32 bytes stored, post-incremented)
                    //   %1 outptr0       fp32 partial sums, 4 vectors of 4 lanes (64 bytes read, post-incremented)
                    //   %2 %3 %4         r0 / r1 / r2: the three input rows, 16-bit elements, 4 per pixel vector
                    //   %5 kptr          kernel data, 16 halfwords (32 bytes) per tap, 9 taps
                    //
                    // Data flow grounded in the instructions below:
                    //   * every 16-bit value is widened with "vshll.u16 #16", i.e. placed in the upper half of a
                    //     32-bit lane and then used as f32 (presumably bfloat16 -> float, going by outptr0_bf16);
                    //   * output n (q12..q15) consumes input pixels 2n, 2n+1 and 2n+2 of each row, so neighbouring
                    //     outputs are two input pixels apart;
                    //   * each row pointer advances by 64 bytes (eight pixel vectors); the ninth pixel vector is
                    //     read without writeback;
                    //   * kptr is post-incremented by eight of the nine tap loads (8 * 32 = 256 bytes) and rewound
                    //     by the final "sub", so it leaves the statement with its entry value;
                    //   * results are narrowed with "vshrn.u32 #16", which keeps the high 16 bits of each lane
                    //     (plain truncation, no rounding).
                    asm volatile(
                        "pld        [%1, #512]          \n"
                        "vldm       %1!, {d24-d31}      \n" // sum0 sum1 sum2 sum3

                        "pld        [%2, #512]          \n"
                        "vldm       %2!, {d8-d15}       \n" // r00 r01 r02 r03 r04 r05 r06 r07

                        // widen r00..r03 into q0..q3
                        "vshll.u16  q0, d8, #16         \n"
                        "vshll.u16  q1, d9, #16         \n"
                        "vshll.u16  q2, d10, #16        \n"
                        "vshll.u16  q3, d11, #16        \n"

                        // widen r04..r07 into q4..q7; each source d-register is read before the
                        // q-register that aliases it is overwritten
                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        // row 0, first tap: 16 halfwords -> four fp32 vectors q8..q11
                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        // outputs 0..3 use r00 (q0), r02 (q2), r04 (q4), r06 (q6)
                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"

                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        // row 0, second tap
                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        // r00 in q0 is no longer needed, so q0 is reused for the ninth pixel (no writeback on %2)
                        "pld        [%2, #64]           \n"
                        "vld1.f32   {d1}, [%2 :64]      \n" // r08

                        "vshll.u16  q0, d1, #16         \n"

                        // outputs 0..3 use r01 (q1), r03 (q3), r05 (q5), r07 (q7)
                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"

                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        // row 0, third tap
                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        // outputs 0..3 use r02 (q2), r04 (q4), r06 (q6), r08 (q0)
                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"

                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        // ---- second input row (%3): same three-tap sequence as above ----
                        "pld        [%3, #512]          \n"
                        "vldm       %3!, {d8-d15}       \n" // r10 r11 r12 r13 r14 r15 r16 r17

                        "vshll.u16  q0, d8, #16         \n"
                        "vshll.u16  q1, d9, #16         \n"
                        "vshll.u16  q2, d10, #16        \n"
                        "vshll.u16  q3, d11, #16        \n"

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        // row 1, first tap
                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"

                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        // row 1, second tap
                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%3, #64]           \n"
                        "vld1.f32   {d1}, [%3 :64]      \n" // r18

                        "vshll.u16  q0, d1, #16         \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"

                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        // row 1, third tap
                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"

                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        // ---- third input row (%4): same three-tap sequence as above ----
                        // NOTE(review): the prefetch offset here is #256 while the matching loads for
                        // %2 and %3 use #512; it is only a hint, so results are unaffected -- confirm
                        // whether the difference is intentional.
                        "pld        [%4, #256]          \n"
                        "vldm       %4!, {d8-d15}       \n" // r20 r21 r22 r23 r24 r25 r26 r27

                        "vshll.u16  q0, d8, #16         \n"
                        "vshll.u16  q1, d9, #16         \n"
                        "vshll.u16  q2, d10, #16        \n"
                        "vshll.u16  q3, d11, #16        \n"

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        // row 2, first tap
                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"

                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        // row 2, second tap (eighth and last kptr load with writeback)
                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%4, #64]           \n"
                        "vld1.f32   {d1}, [%4 :64]      \n" // r28

                        "vshll.u16  q0, d1, #16         \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"

                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        // row 2, third tap: ninth kptr load, deliberately without writeback
                        //                         "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128] \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"

                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        // undo the eight 32-byte post-increments so kptr is unchanged on exit
                        "sub        %5, %5, #256        \n" // kptr -= 8 * 16;

                        // keep the high 16 bits of every fp32 lane (truncating narrow)
                        "vshrn.u32  d24, q12, #16       \n"
                        "vshrn.u32  d25, q13, #16       \n"
                        "vshrn.u32  d26, q14, #16       \n"
                        "vshrn.u32  d27, q15, #16       \n"

                        // store 4 outputs x 4 halfwords = 32 bytes
                        "vst1.f32   {d24-d27}, [%0 :64]! \n"

                        // each output is tied to the same-numbered input below ("0".."5"),
                        // so every pointer is read-write
                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(kptr)          // %5
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    // AArch64 NEON path: updates two horizontally adjacent outputs of a 3x3 kernel.
                    //
                    // Operands (see the constraint list at the bottom):
                    //   %0 outptr0_bf16  destination for the narrowed 16-bit results (16 bytes stored, post-incremented)
                    //   %1 outptr0       fp32 partial sums, 2 vectors of 4 lanes (32 bytes read, post-incremented)
                    //   %2 %3 %4         r0 / r1 / r2: the three input rows, 16-bit elements, 4 per pixel vector
                    //   %10..%27         kernel registers; each holds 8 halfwords. "_kRC_01" supplies the vectors
                    //                    multiplied by input lanes 0 and 1, "_kRC_23" those for lanes 2 and 3.
                    //   (%5..%9 are the tied inputs for %0..%4, which is why the kernel operands start at %10.)
                    //
                    // Data flow grounded in the instructions below:
                    //   * 16-bit values are widened with shll/shll2 #16, i.e. placed in the upper half of a
                    //     32-bit lane and then used as fp32 (presumably bfloat16 -> float, going by outptr0_bf16);
                    //   * output 0 consumes pixels 0, 1, 2 of each row and output 1 consumes pixels 2, 3, 4,
                    //     so the two outputs are two input pixels apart;
                    //   * two accumulator chains run per output: v10/v11 start from the first fmul, v12/v13 start
                    //     from the loaded partial sums; the final fadd merges them;
                    //   * each row pointer advances by 32 bytes (four pixel vectors); the fifth pixel vector is
                    //     read without post-increment;
                    //   * results are narrowed with "shrn #16", which keeps the high 16 bits of each lane
                    //     (plain truncation, no rounding).
                    asm volatile(
                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v12.4s, v13.4s}, [%1], #32 \n" // sum0 sum1

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2], #32 \n" // r00 r01 r02 r03

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        // _k00: applied to r00 (output 0) and r02 (output 1)
                        "shll   v6.4s, %10.4h, #16          \n"
                        "shll2  v7.4s, %10.8h, #16          \n"

                        // fmul (not fmla) initialises the second accumulator pair v10/v11
                        "fmul   v10.4s, v6.4s, v0.s[0]      \n"
                        "fmul   v11.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v0.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v2.s[1]      \n"

                        "shll   v8.4s, %11.4h, #16          \n"
                        "shll2  v9.4s, %11.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v0.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v2.s[3]      \n"

                        // fifth pixel of the row, read without advancing %2
                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v4.4h}, [%2]               \n" // r04

                        "shll   v4.4s, v4.4h, #16           \n"

                        // _k01: applied to r01 (output 0) and r03 (output 1)
                        "shll   v6.4s, %12.4h, #16          \n"
                        "shll2  v7.4s, %12.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v1.s[0]      \n"
                        "fmla   v11.4s, v6.4s, v3.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v1.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v3.s[1]      \n"

                        "shll   v8.4s, %13.4h, #16          \n"
                        "shll2  v9.4s, %13.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v1.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v3.s[3]      \n"

                        // _k02: applied to r02 (output 0) and r04 (output 1)
                        "shll   v6.4s, %14.4h, #16          \n"
                        "shll2  v7.4s, %14.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v11.4s, v6.4s, v4.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v2.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v4.s[1]      \n"

                        "shll   v8.4s, %15.4h, #16          \n"
                        "shll2  v9.4s, %15.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v4.s[3]      \n"

                        // ---- second input row (%3) with _k10 / _k11 / _k12 ----
                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%3], #32 \n" // r10 r11 r12 r13

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        // _k10: r10 / r12
                        "shll   v6.4s, %16.4h, #16          \n"
                        "shll2  v7.4s, %16.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v0.s[0]      \n"
                        "fmla   v11.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v0.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v2.s[1]      \n"

                        "shll   v8.4s, %17.4h, #16          \n"
                        "shll2  v9.4s, %17.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v0.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v2.s[3]      \n"

                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v4.4h}, [%3]               \n" // r14

                        "shll   v4.4s, v4.4h, #16           \n"

                        // _k11: r11 / r13
                        "shll   v6.4s, %18.4h, #16          \n"
                        "shll2  v7.4s, %18.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v1.s[0]      \n"
                        "fmla   v11.4s, v6.4s, v3.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v1.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v3.s[1]      \n"

                        "shll   v8.4s, %19.4h, #16          \n"
                        "shll2  v9.4s, %19.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v1.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v3.s[3]      \n"

                        // _k12: r12 / r14
                        "shll   v6.4s, %20.4h, #16          \n"
                        "shll2  v7.4s, %20.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v11.4s, v6.4s, v4.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v2.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v4.s[1]      \n"

                        "shll   v8.4s, %21.4h, #16          \n"
                        "shll2  v9.4s, %21.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v4.s[3]      \n"

                        // ---- third input row (%4) with _k20 / _k21 / _k22 ----
                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%4], #32 \n" // r20 r21 r22 r23

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        // _k20: r20 / r22
                        "shll   v6.4s, %22.4h, #16          \n"
                        "shll2  v7.4s, %22.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v0.s[0]      \n"
                        "fmla   v11.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v0.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v2.s[1]      \n"

                        "shll   v8.4s, %23.4h, #16          \n"
                        "shll2  v9.4s, %23.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v0.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v2.s[3]      \n"

                        "prfm   pldl1keep, [%4, #64]        \n"
                        "ld1    {v4.4h}, [%4]               \n" // r24

                        "shll   v4.4s, v4.4h, #16           \n"

                        // _k21: r21 / r23
                        "shll   v6.4s, %24.4h, #16          \n"
                        "shll2  v7.4s, %24.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v1.s[0]      \n"
                        "fmla   v11.4s, v6.4s, v3.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v1.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v3.s[1]      \n"

                        "shll   v8.4s, %25.4h, #16          \n"
                        "shll2  v9.4s, %25.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v3.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v1.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v3.s[3]      \n"

                        // _k22: r22 / r24
                        "shll   v6.4s, %26.4h, #16          \n"
                        "shll2  v7.4s, %26.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v11.4s, v6.4s, v4.s[0]      \n"
                        "fmla   v12.4s, v7.4s, v2.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v4.s[1]      \n"

                        "shll   v8.4s, %27.4h, #16          \n"
                        "shll2  v9.4s, %27.8h, #16          \n"

                        "fmla   v10.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v11.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v12.4s, v9.4s, v2.s[3]      \n"
                        "fmla   v13.4s, v9.4s, v4.s[3]      \n"

                        // merge the two accumulator chains of each output
                        "fadd   v12.4s, v10.4s, v12.4s      \n"
                        "fadd   v13.4s, v11.4s, v13.4s      \n"

                        // keep the high 16 bits of every fp32 lane (truncating narrow)
                        "shrn   v12.4h, v12.4s, #16         \n"
                        "shrn   v13.4h, v13.4s, #16         \n"

                        // store 2 outputs x 4 halfwords = 16 bytes
                        "st1    {v12.4h, v13.4h}, [%0], #16 \n"

                        // each output is tied to the same-numbered input below ("0".."4"),
                        // so every pointer is read-write
                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2)            // %4
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00_01), // %10
                        "w"(_k00_23), // %11
                        "w"(_k01_01), // %12
                        "w"(_k01_23), // %13
                        "w"(_k02_01), // %14
                        "w"(_k02_23), // %15
                        "w"(_k10_01), // %16
                        "w"(_k10_23), // %17
                        "w"(_k11_01), // %18
                        "w"(_k11_23), // %19
                        "w"(_k12_01), // %20
                        "w"(_k12_23), // %21
                        "w"(_k20_01), // %22
                        "w"(_k20_23), // %23
                        "w"(_k21_01), // %24
                        "w"(_k21_23), // %25
                        "w"(_k22_01), // %26
                        "w"(_k22_23)  // %27
                        // v5 is listed as clobbered although this statement never writes it (harmless)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%1, #256]          \n"
                        "vld1.f32   {d28-d31}, [%1 :128]! \n" // sum0 sum1

                        "pld        [%2, #256]          \n"
                        "vld1.u16   {d4-d7}, [%2 :64]!  \n" // r00 r01 r02 r03

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmul.f32   q12, q8, d0[0]      \n"
                        "vmul.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"

                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%2, #64]           \n"
                        "vld1.f32   {d9}, [%2 :64]      \n" // r04

                        "vshll.u16  q4, d9, #16         \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"

                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q11, d3[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"

                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        "pld        [%3, #256]          \n"
                        "vld1.u16   {d4-d7}, [%3 :64]!  \n" // r10 r11 r12 r13

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"

                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%3, #64]           \n"
                        "vld1.f32   {d9}, [%3 :64]      \n" // r14

                        "vshll.u16  q4, d9, #16         \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"

                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q11, d3[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"

                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d4-d7}, [%4 :64]!  \n" // r20 r21 r22 r23

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"

                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%4, #64]           \n"
                        "vld1.f32   {d9}, [%4 :64]      \n" // r24

                        "vshll.u16  q4, d9, #16         \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"

                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q11, d3[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        //                         "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128] \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"

                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        "vadd.f32   q14, q12, q14       \n"
                        "vadd.f32   q15, q13, q15       \n"

                        "sub        %5, %5, #256        \n" // kptr -= 8 * 16;

                        "vshrn.u32  d28, q14, #16       \n"
                        "vshrn.u32  d29, q15, #16       \n"

                        "vst1.f32   {d28-d29}, [%0 :64]! \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(kptr)          // %5
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v13.4s}, [%1], #16         \n" // sum0

                        "prfm   pldl1keep, [%2, #192]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h}, [%2] \n" // r00 r01 r02

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        "shll   v6.4s, %10.4h, #16          \n"
                        "shll2  v7.4s, %10.8h, #16          \n"

                        "fmul   v10.4s, v6.4s, v0.s[0]      \n"
                        "fmul   v11.4s, v7.4s, v0.s[1]      \n"

                        "shll   v8.4s, %11.4h, #16          \n"
                        "shll2  v9.4s, %11.8h, #16          \n"

                        "fmul   v12.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v0.s[3]      \n"

                        "shll   v6.4s, %12.4h, #16          \n"
                        "shll2  v7.4s, %12.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v1.s[0]      \n"
                        "fmla   v11.4s, v7.4s, v1.s[1]      \n"

                        "shll   v8.4s, %13.4h, #16          \n"
                        "shll2  v9.4s, %13.8h, #16          \n"

                        "fmla   v12.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v1.s[3]      \n"

                        "shll   v6.4s, %14.4h, #16          \n"
                        "shll2  v7.4s, %14.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v11.4s, v7.4s, v2.s[1]      \n"

                        "shll   v8.4s, %15.4h, #16          \n"
                        "shll2  v9.4s, %15.8h, #16          \n"

                        "fmla   v12.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v2.s[3]      \n"

                        "prfm   pldl1keep, [%3, #192]       \n"
                        "ld1    {v3.4h, v4.4h, v5.4h}, [%3] \n" // r10 r11 r12

                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"

                        "shll   v6.4s, %16.4h, #16          \n"
                        "shll2  v7.4s, %16.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v3.s[0]      \n"
                        "fmla   v11.4s, v7.4s, v3.s[1]      \n"

                        "shll   v8.4s, %17.4h, #16          \n"
                        "shll2  v9.4s, %17.8h, #16          \n"

                        "fmla   v12.4s, v8.4s, v3.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v3.s[3]      \n"

                        "shll   v6.4s, %18.4h, #16          \n"
                        "shll2  v7.4s, %18.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v4.s[0]      \n"
                        "fmla   v11.4s, v7.4s, v4.s[1]      \n"

                        "shll   v8.4s, %19.4h, #16          \n"
                        "shll2  v9.4s, %19.8h, #16          \n"

                        "fmla   v12.4s, v8.4s, v4.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v4.s[3]      \n"

                        "shll   v6.4s, %20.4h, #16          \n"
                        "shll2  v7.4s, %20.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v5.s[0]      \n"
                        "fmla   v11.4s, v7.4s, v5.s[1]      \n"

                        "shll   v8.4s, %21.4h, #16          \n"
                        "shll2  v9.4s, %21.8h, #16          \n"

                        "fmla   v12.4s, v8.4s, v5.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v5.s[3]      \n"

                        "prfm   pldl1keep, [%4, #192]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h}, [%4] \n" // r20 r21 r22

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        "shll   v6.4s, %22.4h, #16          \n"
                        "shll2  v7.4s, %22.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v0.s[0]      \n"
                        "fmla   v11.4s, v7.4s, v0.s[1]      \n"

                        "shll   v8.4s, %23.4h, #16          \n"
                        "shll2  v9.4s, %23.8h, #16          \n"

                        "fmla   v12.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v0.s[3]      \n"

                        "shll   v6.4s, %24.4h, #16          \n"
                        "shll2  v7.4s, %24.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v1.s[0]      \n"
                        "fmla   v11.4s, v7.4s, v1.s[1]      \n"

                        "shll   v8.4s, %25.4h, #16          \n"
                        "shll2  v9.4s, %25.8h, #16          \n"

                        "fmla   v12.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v1.s[3]      \n"

                        "shll   v6.4s, %26.4h, #16          \n"
                        "shll2  v7.4s, %26.8h, #16          \n"

                        "fmla   v10.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v11.4s, v7.4s, v2.s[1]      \n"

                        "shll   v8.4s, %27.4h, #16          \n"
                        "shll2  v9.4s, %27.8h, #16          \n"

                        "fmla   v12.4s, v8.4s, v2.s[2]      \n"
                        "fmla   v13.4s, v9.4s, v2.s[3]      \n"

                        "fadd   v11.4s, v10.4s, v11.4s      \n"

                        "add    %2, %2, #16                 \n"
                        "fadd   v13.4s, v12.4s, v13.4s      \n"

                        "add    %3, %3, #16                 \n"
                        "fadd   v13.4s, v11.4s, v13.4s      \n"

                        "add    %4, %4, #16                 \n"
                        "shrn   v13.4h, v13.4s, #16         \n"

                        "st1    {v13.4h}, [%0], #8          \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2)            // %4
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00_01), // %10
                        "w"(_k00_23), // %11
                        "w"(_k01_01), // %12
                        "w"(_k01_23), // %13
                        "w"(_k02_01), // %14
                        "w"(_k02_23), // %15
                        "w"(_k10_01), // %16
                        "w"(_k10_23), // %17
                        "w"(_k11_01), // %18
                        "w"(_k11_23), // %19
                        "w"(_k12_01), // %20
                        "w"(_k12_23), // %21
                        "w"(_k20_01), // %22
                        "w"(_k20_23), // %23
                        "w"(_k21_01), // %24
                        "w"(_k21_23), // %25
                        "w"(_k22_01), // %26
                        "w"(_k22_23)  // %27
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d30-d31}, [%1 :128]! \n" // sum0

                        "pld        [%2, #192]          \n"
                        "vld1.u16   {d2-d4}, [%2 :64]   \n" // r00 r01 r02

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vshll.u16  q2, d4, #16         \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmul.f32   q12, q8, d0[0]      \n"
                        "vmul.f32   q13, q9, d0[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmul.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q11, d3[1]     \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%3, #192]          \n"
                        "vld1.u16   {d2-d4}, [%3 :64]   \n" // r10 r11 r12

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vshll.u16  q2, d4, #16         \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q9, d0[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q11, d3[1]     \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%4, #192]          \n"
                        "vld1.u16   {d2-d4}, [%4 :64]   \n" // r20 r21 r22

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vshll.u16  q2, d4, #16         \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q9, d0[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q11, d3[1]     \n"

                        //                         "pld        [%5, #256]          \n"
                        "vld1.u16   {d20-d23}, [%5 :128] \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"

                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "add        %2, %2, #16         \n"
                        "vadd.f32   q13, q12, q13       \n"

                        "add        %3, %3, #16         \n"
                        "vadd.f32   q15, q14, q15       \n"

                        "add        %4, %4, #16         \n"
                        "vadd.f32   q15, q13, q15       \n"

                        "sub        %5, %5, #256        \n" // kptr -= 8 * 16 * 2;

                        "vshrn.u32  d31, q15, #16       \n"

                        "vst1.u16   {d31}, [%0 :64]!    \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(kptr)          // %5
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_3x3_pack4_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_pack4_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const __fp16* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        float16x4_t _bias0 = bias ? vld1_f16(bias + p * 4) : vdup_n_f16((__fp16)0.f);
        out0.fill(_bias0);

        int q = 0;
        for (; q < inch; q++)
        {
            __fp16* outptr0 = out0.row<__fp16>(0);

            const Mat img0 = bottom_blob.channel(q);

            const __fp16* r0 = img0.row<const __fp16>(0);
            const __fp16* r1 = img0.row<const __fp16>(1);
            const __fp16* r2 = img0.row<const __fp16>(2);

            const __fp16* kptr = kernel.channel(p).row<const __fp16>(q);

            // 16 * 9
            float16x8_t _k00_01 = vld1q_f16(kptr);
            float16x8_t _k00_23 = vld1q_f16(kptr + 8);
            float16x8_t _k01_01 = vld1q_f16(kptr + 16);
            float16x8_t _k01_23 = vld1q_f16(kptr + 24);
            float16x8_t _k02_01 = vld1q_f16(kptr + 32);
            float16x8_t _k02_23 = vld1q_f16(kptr + 40);
            float16x8_t _k10_01 = vld1q_f16(kptr + 48);
            float16x8_t _k10_23 = vld1q_f16(kptr + 56);
            float16x8_t _k11_01 = vld1q_f16(kptr + 64);
            float16x8_t _k11_23 = vld1q_f16(kptr + 72);
            float16x8_t _k12_01 = vld1q_f16(kptr + 80);
            float16x8_t _k12_23 = vld1q_f16(kptr + 88);
            float16x8_t _k20_01 = vld1q_f16(kptr + 96);
            float16x8_t _k20_23 = vld1q_f16(kptr + 104);
            float16x8_t _k21_01 = vld1q_f16(kptr + 112);
            float16x8_t _k21_23 = vld1q_f16(kptr + 120);
            float16x8_t _k22_01 = vld1q_f16(kptr + 128);
            float16x8_t _k22_23 = vld1q_f16(kptr + 136);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 3 < outw; j += 4)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v10.4h, v11.4h, v12.4h, v13.4h}, [%0] \n" // sum0 sum1 sum2 sum3

                        "prfm   pldl1keep, [%1, #384]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h}, [%1] \n" // r00 r01 r02 r03 r04 r05

                        "ext    v6.16b, %8.16b, %8.16b, #8  \n"
                        "fmla   v10.4h, %8.4h, v0.h[0]      \n"
                        "fmla   v11.4h, %8.4h, v0.h[4]      \n"
                        "fmla   v12.4h, %8.4h, v1.h[0]      \n"
                        "fmla   v13.4h, %8.4h, v1.h[4]      \n"
                        "fmla   v10.4h, v6.4h, v0.h[1]      \n"
                        "fmla   v11.4h, v6.4h, v0.h[5]      \n"
                        "fmla   v12.4h, v6.4h, v1.h[1]      \n"
                        "fmla   v13.4h, v6.4h, v1.h[5]      \n"
                        "ext    v7.16b, %9.16b, %9.16b, #8  \n"
                        "fmla   v10.4h, %9.4h, v0.h[2]      \n"
                        "fmla   v11.4h, %9.4h, v0.h[6]      \n"
                        "fmla   v12.4h, %9.4h, v1.h[2]      \n"
                        "fmla   v13.4h, %9.4h, v1.h[6]      \n"
                        "fmla   v10.4h, v7.4h, v0.h[3]      \n"
                        "fmla   v11.4h, v7.4h, v0.h[7]      \n"
                        "fmla   v12.4h, v7.4h, v1.h[3]      \n"
                        "fmla   v13.4h, v7.4h, v1.h[7]      \n"

                        "ext    v8.16b, %10.16b, %10.16b, #8 \n"
                        "fmla   v10.4h, %10.4h, v0.h[4]     \n"
                        "fmla   v11.4h, %10.4h, v1.h[0]     \n"
                        "fmla   v12.4h, %10.4h, v1.h[4]     \n"
                        "fmla   v13.4h, %10.4h, v2.h[0]     \n"
                        "fmla   v10.4h, v8.4h, v0.h[5]      \n"
                        "fmla   v11.4h, v8.4h, v1.h[1]      \n"
                        "fmla   v12.4h, v8.4h, v1.h[5]      \n"
                        "fmla   v13.4h, v8.4h, v2.h[1]      \n"
                        "ext    v9.16b, %11.16b, %11.16b, #8 \n"
                        "fmla   v10.4h, %11.4h, v0.h[6]     \n"
                        "fmla   v11.4h, %11.4h, v1.h[2]     \n"
                        "fmla   v12.4h, %11.4h, v1.h[6]     \n"
                        "fmla   v13.4h, %11.4h, v2.h[2]     \n"
                        "fmla   v10.4h, v9.4h, v0.h[7]      \n"
                        "fmla   v11.4h, v9.4h, v1.h[3]      \n"
                        "fmla   v12.4h, v9.4h, v1.h[7]      \n"
                        "fmla   v13.4h, v9.4h, v2.h[3]      \n"

                        "prfm   pldl1keep, [%2, #384]       \n"
                        "ld1    {v3.8h, v4.8h, v5.8h}, [%2] \n" // r10 r11 r12 r13 r14 r15

                        "ext    v6.16b, %12.16b, %12.16b, #8 \n"
                        "fmla   v10.4h, %12.4h, v1.h[0]     \n"
                        "fmla   v11.4h, %12.4h, v1.h[4]     \n"
                        "fmla   v12.4h, %12.4h, v2.h[0]     \n"
                        "fmla   v13.4h, %12.4h, v2.h[4]     \n"
                        "fmla   v10.4h, v6.4h, v1.h[1]      \n"
                        "fmla   v11.4h, v6.4h, v1.h[5]      \n"
                        "fmla   v12.4h, v6.4h, v2.h[1]      \n"
                        "fmla   v13.4h, v6.4h, v2.h[5]      \n"
                        "ext    v7.16b, %13.16b, %13.16b, #8 \n"
                        "fmla   v10.4h, %13.4h, v1.h[2]     \n"
                        "fmla   v11.4h, %13.4h, v1.h[6]     \n"
                        "fmla   v12.4h, %13.4h, v2.h[2]     \n"
                        "fmla   v13.4h, %13.4h, v2.h[6]     \n"
                        "fmla   v10.4h, v7.4h, v1.h[3]      \n"
                        "fmla   v11.4h, v7.4h, v1.h[7]      \n"
                        "fmla   v12.4h, v7.4h, v2.h[3]      \n"
                        "fmla   v13.4h, v7.4h, v2.h[7]      \n"

                        "ext    v8.16b, %14.16b, %14.16b, #8 \n"
                        "fmla   v10.4h, %14.4h, v3.h[0]     \n"
                        "fmla   v11.4h, %14.4h, v3.h[4]     \n"
                        "fmla   v12.4h, %14.4h, v4.h[0]     \n"
                        "fmla   v13.4h, %14.4h, v4.h[4]     \n"
                        "fmla   v10.4h, v8.4h, v3.h[1]      \n"
                        "fmla   v11.4h, v8.4h, v3.h[5]      \n"
                        "fmla   v12.4h, v8.4h, v4.h[1]      \n"
                        "fmla   v13.4h, v8.4h, v4.h[5]      \n"
                        "ext    v9.16b, %15.16b, %15.16b, #8 \n"
                        "fmla   v10.4h, %15.4h, v3.h[2]     \n"
                        "fmla   v11.4h, %15.4h, v3.h[6]     \n"
                        "fmla   v12.4h, %15.4h, v4.h[2]     \n"
                        "fmla   v13.4h, %15.4h, v4.h[6]     \n"
                        "fmla   v10.4h, v9.4h, v3.h[3]      \n"
                        "fmla   v11.4h, v9.4h, v3.h[7]      \n"
                        "fmla   v12.4h, v9.4h, v4.h[3]      \n"
                        "fmla   v13.4h, v9.4h, v4.h[7]      \n"

                        "ext    v6.16b, %16.16b, %16.16b, #8 \n"
                        "fmla   v10.4h, %16.4h, v3.h[4]     \n"
                        "fmla   v11.4h, %16.4h, v4.h[0]     \n"
                        "fmla   v12.4h, %16.4h, v4.h[4]     \n"
                        "fmla   v13.4h, %16.4h, v5.h[0]     \n"
                        "fmla   v10.4h, v6.4h, v3.h[5]      \n"
                        "fmla   v11.4h, v6.4h, v4.h[1]      \n"
                        "fmla   v12.4h, v6.4h, v4.h[5]      \n"
                        "fmla   v13.4h, v6.4h, v5.h[1]      \n"
                        "ext    v7.16b, %17.16b, %17.16b, #8 \n"
                        "fmla   v10.4h, %17.4h, v3.h[6]     \n"
                        "fmla   v11.4h, %17.4h, v4.h[2]     \n"
                        "fmla   v12.4h, %17.4h, v4.h[6]     \n"
                        "fmla   v13.4h, %17.4h, v5.h[2]     \n"
                        "fmla   v10.4h, v7.4h, v3.h[7]      \n"
                        "fmla   v11.4h, v7.4h, v4.h[3]      \n"
                        "fmla   v12.4h, v7.4h, v4.h[7]      \n"
                        "fmla   v13.4h, v7.4h, v5.h[3]      \n"

                        "prfm   pldl1keep, [%3, #384]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h}, [%3] \n" // r20 r21 r22 r23 r24 r25

                        "ext    v8.16b, %18.16b, %18.16b, #8 \n"
                        "fmla   v10.4h, %18.4h, v4.h[0]     \n"
                        "fmla   v11.4h, %18.4h, v4.h[4]     \n"
                        "fmla   v12.4h, %18.4h, v5.h[0]     \n"
                        "fmla   v13.4h, %18.4h, v5.h[4]     \n"
                        "fmla   v10.4h, v8.4h, v4.h[1]      \n"
                        "fmla   v11.4h, v8.4h, v4.h[5]      \n"
                        "fmla   v12.4h, v8.4h, v5.h[1]      \n"
                        "fmla   v13.4h, v8.4h, v5.h[5]      \n"
                        "ext    v9.16b, %19.16b, %19.16b, #8 \n"
                        "fmla   v10.4h, %19.4h, v4.h[2]     \n"
                        "fmla   v11.4h, %19.4h, v4.h[6]     \n"
                        "fmla   v12.4h, %19.4h, v5.h[2]     \n"
                        "fmla   v13.4h, %19.4h, v5.h[6]     \n"
                        "fmla   v10.4h, v9.4h, v4.h[3]      \n"
                        "fmla   v11.4h, v9.4h, v4.h[7]      \n"
                        "fmla   v12.4h, v9.4h, v5.h[3]      \n"
                        "fmla   v13.4h, v9.4h, v5.h[7]      \n"

                        "ext    v6.16b, %20.16b, %20.16b, #8 \n"
                        "fmla   v10.4h, %20.4h, v0.h[0]     \n"
                        "fmla   v11.4h, %20.4h, v0.h[4]     \n"
                        "fmla   v12.4h, %20.4h, v1.h[0]     \n"
                        "fmla   v13.4h, %20.4h, v1.h[4]     \n"
                        "fmla   v10.4h, v6.4h, v0.h[1]      \n"
                        "fmla   v11.4h, v6.4h, v0.h[5]      \n"
                        "fmla   v12.4h, v6.4h, v1.h[1]      \n"
                        "fmla   v13.4h, v6.4h, v1.h[5]      \n"
                        "ext    v7.16b, %21.16b, %21.16b, #8 \n"
                        "fmla   v10.4h, %21.4h, v0.h[2]     \n"
                        "fmla   v11.4h, %21.4h, v0.h[6]     \n"
                        "fmla   v12.4h, %21.4h, v1.h[2]     \n"
                        "fmla   v13.4h, %21.4h, v1.h[6]     \n"
                        "fmla   v10.4h, v7.4h, v0.h[3]      \n"
                        "fmla   v11.4h, v7.4h, v0.h[7]      \n"
                        "fmla   v12.4h, v7.4h, v1.h[3]      \n"
                        "fmla   v13.4h, v7.4h, v1.h[7]      \n"

                        "ext    v8.16b, %22.16b, %22.16b, #8 \n"
                        "fmla   v10.4h, %22.4h, v0.h[4]     \n"
                        "fmla   v11.4h, %22.4h, v1.h[0]     \n"
                        "fmla   v12.4h, %22.4h, v1.h[4]     \n"
                        "fmla   v13.4h, %22.4h, v2.h[0]     \n"
                        "fmla   v10.4h, v8.4h, v0.h[5]      \n"
                        "fmla   v11.4h, v8.4h, v1.h[1]      \n"
                        "fmla   v12.4h, v8.4h, v1.h[5]      \n"
                        "fmla   v13.4h, v8.4h, v2.h[1]      \n"
                        "ext    v9.16b, %23.16b, %23.16b, #8 \n"
                        "fmla   v10.4h, %23.4h, v0.h[6]     \n"
                        "fmla   v11.4h, %23.4h, v1.h[2]     \n"
                        "fmla   v12.4h, %23.4h, v1.h[6]     \n"
                        "fmla   v13.4h, %23.4h, v2.h[2]     \n"
                        "fmla   v10.4h, v9.4h, v0.h[7]      \n"
                        "fmla   v11.4h, v9.4h, v1.h[3]      \n"
                        "fmla   v12.4h, v9.4h, v1.h[7]      \n"
                        "fmla   v13.4h, v9.4h, v2.h[3]      \n"

                        "ext    v6.16b, %24.16b, %24.16b, #8 \n"
                        "fmla   v10.4h, %24.4h, v1.h[0]     \n"
                        "fmla   v11.4h, %24.4h, v1.h[4]     \n"
                        "fmla   v12.4h, %24.4h, v2.h[0]     \n"
                        "fmla   v13.4h, %24.4h, v2.h[4]     \n"

                        "add    %1, %1, #32                 \n"

                        "fmla   v10.4h, v6.4h, v1.h[1]      \n"
                        "fmla   v11.4h, v6.4h, v1.h[5]      \n"
                        "fmla   v12.4h, v6.4h, v2.h[1]      \n"
                        "fmla   v13.4h, v6.4h, v2.h[5]      \n"
                        "ext    v7.16b, %25.16b, %25.16b, #8 \n"
                        "fmla   v10.4h, %25.4h, v1.h[2]     \n"
                        "fmla   v11.4h, %25.4h, v1.h[6]     \n"
                        "fmla   v12.4h, %25.4h, v2.h[2]     \n"
                        "fmla   v13.4h, %25.4h, v2.h[6]     \n"

                        "add    %2, %2, #32                 \n"

                        "fmla   v10.4h, v7.4h, v1.h[3]      \n"
                        "fmla   v11.4h, v7.4h, v1.h[7]      \n"
                        "fmla   v12.4h, v7.4h, v2.h[3]      \n"
                        "fmla   v13.4h, v7.4h, v2.h[7]      \n"

                        "add    %3, %3, #32                 \n"

                        "st1    {v10.4h, v11.4h, v12.4h, v13.4h}, [%0], #32 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00_01), // %8
                        "w"(_k00_23), // %9
                        "w"(_k01_01), // %10
                        "w"(_k01_23), // %11
                        "w"(_k02_01), // %12
                        "w"(_k02_23), // %13
                        "w"(_k10_01), // %14
                        "w"(_k10_23), // %15
                        "w"(_k11_01), // %16
                        "w"(_k11_23), // %17
                        "w"(_k12_01), // %18
                        "w"(_k12_23), // %19
                        "w"(_k20_01), // %20
                        "w"(_k20_23), // %21
                        "w"(_k21_01), // %22
                        "w"(_k21_23), // %23
                        "w"(_k22_01), // %24
                        "w"(_k22_23)  // %25
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13");
                }
                for (; j + 1 < outw; j += 2)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v0.8h, v1.8h}, [%1]        \n" // r00 r01 r02 r03

                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v12.4h, v13.4h}, [%0]      \n" // sum0 sum1

                        "ext    v4.16b, %8.16b, %8.16b, #8  \n"
                        "fmul   v10.4h, %8.4h, v0.h[0]      \n"
                        "fmul   v11.4h, %8.4h, v0.h[4]      \n"
                        "fmla   v12.4h, v4.4h, v0.h[1]      \n"
                        "fmla   v13.4h, v4.4h, v0.h[5]      \n"
                        "ext    v5.16b, %9.16b, %9.16b, #8  \n"
                        "fmla   v10.4h, %9.4h, v0.h[2]      \n"
                        "fmla   v11.4h, %9.4h, v0.h[6]      \n"
                        "fmla   v12.4h, v5.4h, v0.h[3]      \n"
                        "fmla   v13.4h, v5.4h, v0.h[7]      \n"

                        "ext    v6.16b, %10.16b, %10.16b, #8 \n"
                        "fmla   v10.4h, %10.4h, v0.h[4]     \n"
                        "fmla   v11.4h, %10.4h, v1.h[0]     \n"
                        "fmla   v12.4h, v6.4h, v0.h[5]      \n"
                        "fmla   v13.4h, v6.4h, v1.h[1]      \n"
                        "ext    v7.16b, %11.16b, %11.16b, #8 \n"
                        "fmla   v10.4h, %11.4h, v0.h[6]     \n"
                        "fmla   v11.4h, %11.4h, v1.h[2]     \n"
                        "fmla   v12.4h, v7.4h, v0.h[7]      \n"
                        "fmla   v13.4h, v7.4h, v1.h[3]      \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v2.8h, v3.8h}, [%2]        \n" // r10 r11 r12 r13

                        "ext    v8.16b, %12.16b, %12.16b, #8 \n"
                        "fmla   v10.4h, %12.4h, v1.h[0]     \n"
                        "fmla   v11.4h, %12.4h, v1.h[4]     \n"
                        "fmla   v12.4h, v8.4h, v1.h[1]      \n"
                        "fmla   v13.4h, v8.4h, v1.h[5]      \n"
                        "ext    v9.16b, %13.16b, %13.16b, #8 \n"
                        "fmla   v10.4h, %13.4h, v1.h[2]     \n"
                        "fmla   v11.4h, %13.4h, v1.h[6]     \n"
                        "fmla   v12.4h, v9.4h, v1.h[3]      \n"
                        "fmla   v13.4h, v9.4h, v1.h[7]      \n"

                        "ext    v4.16b, %14.16b, %14.16b, #8 \n"
                        "fmla   v10.4h, %14.4h, v2.h[0]     \n"
                        "fmla   v11.4h, %14.4h, v2.h[4]     \n"
                        "fmla   v12.4h, v4.4h, v2.h[1]      \n"
                        "fmla   v13.4h, v4.4h, v2.h[5]      \n"
                        "ext    v5.16b, %15.16b, %15.16b, #8 \n"
                        "fmla   v10.4h, %15.4h, v2.h[2]     \n"
                        "fmla   v11.4h, %15.4h, v2.h[6]     \n"
                        "fmla   v12.4h, v5.4h, v2.h[3]      \n"
                        "fmla   v13.4h, v5.4h, v2.h[7]      \n"

                        "ext    v6.16b, %16.16b, %16.16b, #8 \n"
                        "fmla   v10.4h, %16.4h, v2.h[4]     \n"
                        "fmla   v11.4h, %16.4h, v3.h[0]     \n"
                        "fmla   v12.4h, v6.4h, v2.h[5]      \n"
                        "fmla   v13.4h, v6.4h, v3.h[1]      \n"
                        "ext    v7.16b, %17.16b, %17.16b, #8 \n"
                        "fmla   v10.4h, %17.4h, v2.h[6]     \n"
                        "fmla   v11.4h, %17.4h, v3.h[2]     \n"
                        "fmla   v12.4h, v7.4h, v2.h[7]      \n"
                        "fmla   v13.4h, v7.4h, v3.h[3]      \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.8h, v1.8h}, [%3]        \n" // r20 r21 r22 r23

                        "ext    v8.16b, %18.16b, %18.16b, #8 \n"
                        "fmla   v10.4h, %18.4h, v3.h[0]     \n"
                        "fmla   v11.4h, %18.4h, v3.h[4]     \n"
                        "fmla   v12.4h, v8.4h, v3.h[1]      \n"
                        "fmla   v13.4h, v8.4h, v3.h[5]      \n"
                        "ext    v9.16b, %19.16b, %19.16b, #8 \n"
                        "fmla   v10.4h, %19.4h, v3.h[2]     \n"
                        "fmla   v11.4h, %19.4h, v3.h[6]     \n"
                        "fmla   v12.4h, v9.4h, v3.h[3]      \n"
                        "fmla   v13.4h, v9.4h, v3.h[7]      \n"

                        "ext    v4.16b, %20.16b, %20.16b, #8 \n"
                        "fmla   v10.4h, %20.4h, v0.h[0]     \n"
                        "fmla   v11.4h, %20.4h, v0.h[4]     \n"
                        "fmla   v12.4h, v4.4h, v0.h[1]      \n"
                        "fmla   v13.4h, v4.4h, v0.h[5]      \n"
                        "ext    v5.16b, %21.16b, %21.16b, #8 \n"
                        "fmla   v10.4h, %21.4h, v0.h[2]     \n"
                        "fmla   v11.4h, %21.4h, v0.h[6]     \n"
                        "fmla   v12.4h, v5.4h, v0.h[3]      \n"
                        "fmla   v13.4h, v5.4h, v0.h[7]      \n"

                        "ext    v6.16b, %22.16b, %22.16b, #8 \n"
                        "fmla   v10.4h, %22.4h, v0.h[4]     \n"
                        "fmla   v11.4h, %22.4h, v1.h[0]     \n"
                        "fmla   v12.4h, v6.4h, v0.h[5]      \n"
                        "fmla   v13.4h, v6.4h, v1.h[1]      \n"
                        "ext    v7.16b, %23.16b, %23.16b, #8 \n"
                        "fmla   v10.4h, %23.4h, v0.h[6]     \n"
                        "fmla   v11.4h, %23.4h, v1.h[2]     \n"
                        "fmla   v12.4h, v7.4h, v0.h[7]      \n"
                        "fmla   v13.4h, v7.4h, v1.h[3]      \n"

                        "ext    v8.16b, %24.16b, %24.16b, #8 \n"
                        "fmla   v10.4h, %24.4h, v1.h[0]     \n"
                        "fmla   v11.4h, %24.4h, v1.h[4]     \n"
                        "fmla   v12.4h, v8.4h, v1.h[1]      \n"
                        "fmla   v13.4h, v8.4h, v1.h[5]      \n"
                        "ext    v9.16b, %25.16b, %25.16b, #8 \n"
                        "fmla   v10.4h, %25.4h, v1.h[2]     \n"
                        "fmla   v11.4h, %25.4h, v1.h[6]     \n"
                        "fmla   v12.4h, v9.4h, v1.h[3]      \n"
                        "fmla   v13.4h, v9.4h, v1.h[7]      \n"

                        "add    %1, %1, #16                 \n"

                        "fadd   v10.4h, v10.4h, v12.4h      \n"

                        "add    %2, %2, #16                 \n"

                        "fadd   v11.4h, v11.4h, v13.4h      \n"

                        "add    %3, %3, #16                 \n"

                        "st1    {v10.4h, v11.4h}, [%0], #16 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00_01), // %8
                        "w"(_k00_23), // %9
                        "w"(_k01_01), // %10
                        "w"(_k01_23), // %11
                        "w"(_k02_01), // %12
                        "w"(_k02_23), // %13
                        "w"(_k10_01), // %14
                        "w"(_k10_23), // %15
                        "w"(_k11_01), // %16
                        "w"(_k11_23), // %17
                        "w"(_k12_01), // %18
                        "w"(_k12_23), // %19
                        "w"(_k20_01), // %20
                        "w"(_k20_23), // %21
                        "w"(_k21_01), // %22
                        "w"(_k21_23), // %23
                        "w"(_k22_01), // %24
                        "w"(_k22_23)  // %25
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13");
                }
                for (; j < outw; j++)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%1, #192]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h}, [%1] \n" // r00 r01 r02

                        "prfm   pldl1keep, [%0, #64]        \n"
                        "ld1    {v13.4h}, [%0]              \n" // sum0

                        "ext    v6.16b, %8.16b, %8.16b, #8  \n"
                        "fmul   v10.4h, %8.4h, v0.h[0]      \n"
                        "fmul   v11.4h, v6.4h, v0.h[1]      \n"
                        "ext    v7.16b, %9.16b, %9.16b, #8  \n"
                        "fmul   v12.4h, %9.4h, v0.h[2]      \n"
                        "fmla   v13.4h, v7.4h, v0.h[3]      \n"

                        "ext    v8.16b, %10.16b, %10.16b, #8 \n"
                        "fmla   v10.4h, %10.4h, v1.h[0]     \n"
                        "fmla   v11.4h, v8.4h, v1.h[1]      \n"
                        "ext    v9.16b, %11.16b, %11.16b, #8 \n"
                        "fmla   v12.4h, %11.4h, v1.h[2]     \n"
                        "fmla   v13.4h, v9.4h, v1.h[3]      \n"

                        "prfm   pldl1keep, [%2, #192]       \n"
                        "ld1    {v3.4h, v4.4h, v5.4h}, [%2] \n" // r10 r11 r12

                        "ext    v6.16b, %12.16b, %12.16b, #8 \n"
                        "fmla   v10.4h, %12.4h, v2.h[0]     \n"
                        "fmla   v11.4h, v6.4h, v2.h[1]      \n"
                        "ext    v7.16b, %13.16b, %13.16b, #8 \n"
                        "fmla   v12.4h, %13.4h, v2.h[2]     \n"
                        "fmla   v13.4h, v7.4h, v2.h[3]      \n"

                        "ext    v8.16b, %14.16b, %14.16b, #8 \n"
                        "fmla   v10.4h, %14.4h, v3.h[0]     \n"
                        "fmla   v11.4h, v8.4h, v3.h[1]      \n"
                        "ext    v9.16b, %15.16b, %15.16b, #8 \n"
                        "fmla   v12.4h, %15.4h, v3.h[2]     \n"
                        "fmla   v13.4h, v9.4h, v3.h[3]      \n"

                        "ext    v6.16b, %16.16b, %16.16b, #8 \n"
                        "fmla   v10.4h, %16.4h, v4.h[0]     \n"
                        "fmla   v11.4h, v6.4h, v4.h[1]      \n"
                        "ext    v7.16b, %17.16b, %17.16b, #8 \n"
                        "fmla   v12.4h, %17.4h, v4.h[2]     \n"
                        "fmla   v13.4h, v7.4h, v4.h[3]      \n"

                        "prfm   pldl1keep, [%3, #192]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h}, [%3] \n" // r20 r21 r22

                        "ext    v8.16b, %18.16b, %18.16b, #8 \n"
                        "fmla   v10.4h, %18.4h, v5.h[0]     \n"
                        "fmla   v11.4h, v8.4h, v5.h[1]      \n"
                        "ext    v9.16b, %19.16b, %19.16b, #8 \n"
                        "fmla   v12.4h, %19.4h, v5.h[2]     \n"
                        "fmla   v13.4h, v9.4h, v5.h[3]      \n"

                        "ext    v6.16b, %20.16b, %20.16b, #8 \n"
                        "fmla   v10.4h, %20.4h, v0.h[0]     \n"
                        "fmla   v11.4h, v6.4h, v0.h[1]      \n"
                        "ext    v7.16b, %21.16b, %21.16b, #8 \n"
                        "fmla   v12.4h, %21.4h, v0.h[2]     \n"
                        "fmla   v13.4h, v7.4h, v0.h[3]      \n"

                        "ext    v8.16b, %22.16b, %22.16b, #8 \n"
                        "fmla   v10.4h, %22.4h, v1.h[0]     \n"
                        "fmla   v11.4h, v8.4h, v1.h[1]      \n"
                        "ext    v9.16b, %23.16b, %23.16b, #8 \n"
                        "fmla   v12.4h, %23.4h, v1.h[2]     \n"
                        "fmla   v13.4h, v9.4h, v1.h[3]      \n"

                        "ext    v6.16b, %24.16b, %24.16b, #8 \n"
                        "fmla   v10.4h, %24.4h, v2.h[0]     \n"
                        "fmla   v11.4h, v6.4h, v2.h[1]      \n"
                        "ext    v7.16b, %25.16b, %25.16b, #8 \n"
                        "fmla   v12.4h, %25.4h, v2.h[2]     \n"
                        "fmla   v13.4h, v7.4h, v2.h[3]      \n"

                        "fadd   v10.4h, v10.4h, v11.4h      \n"

                        "add    %1, %1, #8                  \n"

                        "fadd   v12.4h, v12.4h, v13.4h      \n"

                        "add    %2, %2, #8                  \n"

                        "fadd   v10.4h, v10.4h, v12.4h      \n"

                        "add    %3, %3, #8                  \n"

                        "st1    {v10.4h}, [%0], #8          \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00_01), // %8
                        "w"(_k00_23), // %9
                        "w"(_k01_01), // %10
                        "w"(_k01_23), // %11
                        "w"(_k02_01), // %12
                        "w"(_k02_23), // %13
                        "w"(_k10_01), // %14
                        "w"(_k10_23), // %15
                        "w"(_k11_01), // %16
                        "w"(_k11_23), // %17
                        "w"(_k12_01), // %18
                        "w"(_k12_23), // %19
                        "w"(_k20_01), // %20
                        "w"(_k20_23), // %21
                        "w"(_k21_01), // %22
                        "w"(_k21_23), // %23
                        "w"(_k22_01), // %24
                        "w"(_k22_23)  // %25
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13");
                }

                r0 += 8;
                r1 += 8;
                r2 += 8;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_3x3_pack4to1.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_pack4to1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float* bias = _bias;

    int remain_outch_start = 0;

#if __ARM_NEON && __aarch64__
    int nn_outch = 0;
    nn_outch = outch >> 1;
    remain_outch_start = nn_outch << 1;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 2;

        Mat out0 = top_blob.channel(p);
        Mat out1 = top_blob.channel(p + 1);

        const float bias0 = bias ? bias[p] : 0.f;
        const float bias1 = bias ? bias[p + 1] : 0.f;
        out0.fill(bias0);
        out1.fill(bias1);

        const float* k0 = kernel.channel(p);
        const float* k1 = kernel.channel(p + 1);

        for (int q = 0; q < inch; q++)
        {
            float* outptr0 = out0;
            float* outptr1 = out1;

            const Mat img0 = bottom_blob.channel(q);

            const float* r0 = img0.row(0);
            const float* r1 = img0.row(1);
            const float* r2 = img0.row(2);

            float32x4_t _k00_0 = vld1q_f32(k0);
            float32x4_t _k01_0 = vld1q_f32(k0 + 4);
            float32x4_t _k02_0 = vld1q_f32(k0 + 8);
            float32x4_t _k10_0 = vld1q_f32(k0 + 12);
            float32x4_t _k11_0 = vld1q_f32(k0 + 16);
            float32x4_t _k12_0 = vld1q_f32(k0 + 20);
            float32x4_t _k20_0 = vld1q_f32(k0 + 24);
            float32x4_t _k21_0 = vld1q_f32(k0 + 28);
            float32x4_t _k22_0 = vld1q_f32(k0 + 32);

            float32x4_t _k00_1 = vld1q_f32(k1);
            float32x4_t _k01_1 = vld1q_f32(k1 + 4);
            float32x4_t _k02_1 = vld1q_f32(k1 + 8);
            float32x4_t _k10_1 = vld1q_f32(k1 + 12);
            float32x4_t _k11_1 = vld1q_f32(k1 + 16);
            float32x4_t _k12_1 = vld1q_f32(k1 + 20);
            float32x4_t _k20_1 = vld1q_f32(k1 + 24);
            float32x4_t _k21_1 = vld1q_f32(k1 + 28);
            float32x4_t _k22_1 = vld1q_f32(k1 + 32);

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;

                for (; j + 3 < outw; j += 4)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r00 r01 r02 r03

                        "fmul   v16.4s, %10.4s, v0.4s       \n"
                        "fmul   v17.4s, %19.4s, v0.4s       \n"
                        "fmul   v18.4s, %10.4s, v1.4s       \n"
                        "fmul   v19.4s, %19.4s, v1.4s       \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v4.4s, v5.4s}, [%2]        \n" // r04 r05

                        "fmul   v6.4s, %10.4s, v2.4s        \n"
                        "fmul   v7.4s, %19.4s, v2.4s        \n"
                        "fmul   v8.4s, %10.4s, v3.4s        \n"
                        "fmul   v9.4s, %19.4s, v3.4s        \n"

                        "fmla   v16.4s, %11.4s, v1.4s       \n"
                        "fmla   v17.4s, %20.4s, v1.4s       \n"
                        "fmla   v18.4s, %11.4s, v2.4s       \n"
                        "fmla   v19.4s, %20.4s, v2.4s       \n"
                        "fmla   v6.4s, %11.4s, v3.4s        \n"
                        "fmla   v7.4s, %20.4s, v3.4s        \n"
                        "fmla   v8.4s, %11.4s, v4.4s        \n"
                        "fmla   v9.4s, %20.4s, v4.4s        \n"

                        "fmla   v16.4s, %12.4s, v2.4s       \n"
                        "fmla   v17.4s, %21.4s, v2.4s       \n"
                        "fmla   v18.4s, %12.4s, v3.4s       \n"
                        "fmla   v19.4s, %21.4s, v3.4s       \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r10 r11 r12 r13

                        "fmla   v6.4s, %12.4s, v4.4s        \n"
                        "fmla   v7.4s, %21.4s, v4.4s        \n"
                        "fmla   v8.4s, %12.4s, v5.4s        \n"
                        "fmla   v9.4s, %21.4s, v5.4s        \n"

                        "fmla   v16.4s, %13.4s, v0.4s       \n"
                        "fmla   v17.4s, %22.4s, v0.4s       \n"
                        "fmla   v18.4s, %13.4s, v1.4s       \n"
                        "fmla   v19.4s, %22.4s, v1.4s       \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v4.4s, v5.4s}, [%3]        \n" // r14 r15

                        "fmla   v6.4s, %13.4s, v2.4s        \n"
                        "fmla   v7.4s, %22.4s, v2.4s        \n"
                        "fmla   v8.4s, %13.4s, v3.4s        \n"
                        "fmla   v9.4s, %22.4s, v3.4s        \n"

                        "fmla   v16.4s, %14.4s, v1.4s       \n"
                        "fmla   v17.4s, %23.4s, v1.4s       \n"
                        "fmla   v18.4s, %14.4s, v2.4s       \n"
                        "fmla   v19.4s, %23.4s, v2.4s       \n"
                        "fmla   v6.4s, %14.4s, v3.4s        \n"
                        "fmla   v7.4s, %23.4s, v3.4s        \n"
                        "fmla   v8.4s, %14.4s, v4.4s        \n"
                        "fmla   v9.4s, %23.4s, v4.4s        \n"

                        "fmla   v16.4s, %15.4s, v2.4s       \n"
                        "fmla   v17.4s, %24.4s, v2.4s       \n"
                        "fmla   v18.4s, %15.4s, v3.4s       \n"
                        "fmla   v19.4s, %24.4s, v3.4s       \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%4], #64 \n" // r20 r21 r22 r23

                        "fmla   v6.4s, %15.4s, v4.4s        \n"
                        "fmla   v7.4s, %24.4s, v4.4s        \n"
                        "fmla   v8.4s, %15.4s, v5.4s        \n"
                        "fmla   v9.4s, %24.4s, v5.4s        \n"

                        "fmla   v16.4s, %16.4s, v0.4s       \n"
                        "fmla   v17.4s, %25.4s, v0.4s       \n"
                        "fmla   v18.4s, %16.4s, v1.4s       \n"
                        "fmla   v19.4s, %25.4s, v1.4s       \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v4.4s, v5.4s}, [%4]        \n" // r24 r25

                        "fmla   v6.4s, %16.4s, v2.4s        \n"
                        "fmla   v7.4s, %25.4s, v2.4s        \n"
                        "fmla   v8.4s, %16.4s, v3.4s        \n"
                        "fmla   v9.4s, %25.4s, v3.4s        \n"

                        "fmla   v16.4s, %17.4s, v1.4s       \n"
                        "fmla   v17.4s, %26.4s, v1.4s       \n"
                        "fmla   v18.4s, %17.4s, v2.4s       \n"
                        "fmla   v19.4s, %26.4s, v2.4s       \n"
                        "fmla   v6.4s, %17.4s, v3.4s        \n"
                        "fmla   v7.4s, %26.4s, v3.4s        \n"
                        "fmla   v8.4s, %17.4s, v4.4s        \n"
                        "fmla   v9.4s, %26.4s, v4.4s        \n"

                        "fmla   v16.4s, %18.4s, v2.4s       \n"
                        "fmla   v17.4s, %27.4s, v2.4s       \n"
                        "fmla   v18.4s, %18.4s, v3.4s       \n"
                        "fmla   v19.4s, %27.4s, v3.4s       \n"
                        "fmla   v6.4s, %18.4s, v4.4s        \n"
                        "fmla   v7.4s, %27.4s, v4.4s        \n"
                        "fmla   v8.4s, %18.4s, v5.4s        \n"
                        "fmla   v9.4s, %27.4s, v5.4s        \n"

                        "ld1    {v0.4s}, [%0]               \n" // sum00 sum01 sum02 sum03
                        "ld1    {v1.4s}, [%1]               \n" // sum10 sum11 sum12 sum13

                        "faddp  v16.4s, v16.4s, v16.4s      \n"
                        "faddp  v17.4s, v17.4s, v17.4s      \n"
                        "faddp  v18.4s, v18.4s, v18.4s      \n"
                        "faddp  v19.4s, v19.4s, v19.4s      \n"
                        "faddp  v6.4s, v6.4s, v6.4s         \n"
                        "faddp  v7.4s, v7.4s, v7.4s         \n"
                        "faddp  v8.4s, v8.4s, v8.4s         \n"
                        "faddp  v9.4s, v9.4s, v9.4s         \n"

                        "faddp  v16.2s, v16.2s, v18.2s      \n"
                        "faddp  v17.2s, v17.2s, v19.2s      \n"
                        "faddp  v6.2s, v6.2s, v8.2s         \n"
                        "faddp  v7.2s, v7.2s, v9.2s         \n"

                        "trn1   v16.2d, v16.2d, v6.2d       \n"
                        "trn1   v17.2d, v17.2d, v7.2d       \n"

                        "fadd   v0.4s, v0.4s, v16.4s        \n"
                        "fadd   v1.4s, v1.4s, v17.4s        \n"

                        "st1    {v0.4s}, [%0], #16          \n"
                        "st1    {v1.4s}, [%1], #16          \n"

                        : "=r"(outptr0), // %0
                        "=r"(outptr1), // %1
                        "=r"(r0),      // %2
                        "=r"(r1),      // %3
                        "=r"(r2)       // %4
                        : "0"(outptr0),
                        "1"(outptr1),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00_0), // %10
                        "w"(_k01_0), // %11
                        "w"(_k02_0), // %12
                        "w"(_k10_0), // %13
                        "w"(_k11_0), // %14
                        "w"(_k12_0), // %15
                        "w"(_k20_0), // %16
                        "w"(_k21_0), // %17
                        "w"(_k22_0), // %18
                        "w"(_k00_1), // %19
                        "w"(_k01_1), // %20
                        "w"(_k02_1), // %21
                        "w"(_k10_1), // %22
                        "w"(_k11_1), // %23
                        "w"(_k12_1), // %24
                        "w"(_k20_1), // %25
                        "w"(_k21_1), // %26
                        "w"(_k22_1)  // %27
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19");
                }
                for (; j + 1 < outw; j += 2)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2] \n" // r00 r01 r02 r03

                        "fmul   v16.4s, %10.4s, v0.4s       \n"
                        "fmul   v17.4s, %19.4s, v0.4s       \n"
                        "fmul   v18.4s, %10.4s, v1.4s       \n"
                        "fmul   v19.4s, %19.4s, v1.4s       \n"
                        "fmla   v16.4s, %11.4s, v1.4s       \n"
                        "fmla   v17.4s, %20.4s, v1.4s       \n"
                        "fmla   v18.4s, %11.4s, v2.4s       \n"
                        "fmla   v19.4s, %20.4s, v2.4s       \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%3] \n" // r10 r11 r12 r13

                        "fmla   v16.4s, %12.4s, v2.4s       \n"
                        "fmla   v17.4s, %21.4s, v2.4s       \n"
                        "fmla   v18.4s, %12.4s, v3.4s       \n"
                        "fmla   v19.4s, %21.4s, v3.4s       \n"

                        "fmla   v16.4s, %13.4s, v4.4s       \n"
                        "fmla   v17.4s, %22.4s, v4.4s       \n"
                        "fmla   v18.4s, %13.4s, v5.4s       \n"
                        "fmla   v19.4s, %22.4s, v5.4s       \n"
                        "fmla   v16.4s, %14.4s, v5.4s       \n"
                        "fmla   v17.4s, %23.4s, v5.4s       \n"
                        "fmla   v18.4s, %14.4s, v6.4s       \n"
                        "fmla   v19.4s, %23.4s, v6.4s       \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%4] \n" // r20 r21 r22 r23

                        "fmla   v16.4s, %15.4s, v6.4s       \n"
                        "fmla   v17.4s, %24.4s, v6.4s       \n"
                        "fmla   v18.4s, %15.4s, v7.4s       \n"
                        "fmla   v19.4s, %24.4s, v7.4s       \n"

                        "fmla   v16.4s, %16.4s, v0.4s       \n"
                        "fmla   v17.4s, %25.4s, v0.4s       \n"
                        "fmla   v18.4s, %16.4s, v1.4s       \n"
                        "fmla   v19.4s, %25.4s, v1.4s       \n"
                        "fmla   v16.4s, %17.4s, v1.4s       \n"
                        "fmla   v17.4s, %26.4s, v1.4s       \n"
                        "fmla   v18.4s, %17.4s, v2.4s       \n"
                        "fmla   v19.4s, %26.4s, v2.4s       \n"
                        "fmla   v16.4s, %18.4s, v2.4s       \n"
                        "fmla   v17.4s, %27.4s, v2.4s       \n"
                        "fmla   v18.4s, %18.4s, v3.4s       \n"
                        "fmla   v19.4s, %27.4s, v3.4s       \n"

                        "ld1    {v4.2s}, [%0]               \n" // sum00 sum01
                        "ld1    {v5.2s}, [%1]               \n" // sum10 sum11

                        "faddp  v16.4s, v16.4s, v16.4s      \n"
                        "faddp  v17.4s, v17.4s, v17.4s      \n"
                        "faddp  v18.4s, v18.4s, v18.4s      \n"
                        "faddp  v19.4s, v19.4s, v19.4s      \n"

                        "add    %2, %2, #32                 \n"

                        "faddp  v16.2s, v16.2s, v18.2s      \n"
                        "faddp  v17.2s, v17.2s, v19.2s      \n"

                        "add    %3, %3, #32                 \n"

                        "fadd   v4.2s, v4.2s, v16.2s        \n"
                        "fadd   v5.2s, v5.2s, v17.2s        \n"

                        "add    %4, %4, #32                 \n"

                        "st1    {v4.2s}, [%0], #8           \n"
                        "st1    {v5.2s}, [%1], #8           \n"

                        : "=r"(outptr0), // %0
                        "=r"(outptr1), // %1
                        "=r"(r0),      // %2
                        "=r"(r1),      // %3
                        "=r"(r2)       // %4
                        : "0"(outptr0),
                        "1"(outptr1),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00_0), // %10
                        "w"(_k01_0), // %11
                        "w"(_k02_0), // %12
                        "w"(_k10_0), // %13
                        "w"(_k11_0), // %14
                        "w"(_k12_0), // %15
                        "w"(_k20_0), // %16
                        "w"(_k21_0), // %17
                        "w"(_k22_0), // %18
                        "w"(_k00_1), // %19
                        "w"(_k01_1), // %20
                        "w"(_k02_1), // %21
                        "w"(_k10_1), // %22
                        "w"(_k11_1), // %23
                        "w"(_k12_1), // %24
                        "w"(_k20_1), // %25
                        "w"(_k21_1), // %26
                        "w"(_k22_1)  // %27
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19");
                }
                // Column tail of the two-output-channel path: produces one output
                // float for outptr0 and one for outptr1 per iteration.
                for (; j < outw; j++)
                {
                    // Operand map:
                    //   %0, %1    output pointers (outptr0, outptr1)
                    //   %2..%4    input row pointers (r0, r1, r2)
                    //   %5..%9    matching-constraint inputs tied to %0..%4
                    //   %10..%18  nine kernel vectors of the first output channel
                    //   %19..%27  nine kernel vectors of the second output channel
                    // v16/v18 hold partial sums for the first channel, v17/v19 for
                    // the second; every loaded input vector is used for both.
                    asm volatile(
                        "prfm   pldl1keep, [%2, #384]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s}, [%2] \n" // r00 r01 r02

                        "fmul   v16.4s, %10.4s, v0.4s       \n"
                        "fmul   v17.4s, %19.4s, v0.4s       \n"
                        "fmul   v18.4s, %11.4s, v1.4s       \n"
                        "fmul   v19.4s, %20.4s, v1.4s       \n"

                        "prfm   pldl1keep, [%3, #384]       \n"
                        "ld1    {v3.4s, v4.4s, v5.4s}, [%3] \n" // r10 r11 r12

                        "fmla   v16.4s, %12.4s, v2.4s       \n"
                        "fmla   v17.4s, %21.4s, v2.4s       \n"

                        "fmla   v18.4s, %13.4s, v3.4s       \n"
                        "fmla   v19.4s, %22.4s, v3.4s       \n"
                        "fmla   v16.4s, %14.4s, v4.4s       \n"
                        "fmla   v17.4s, %23.4s, v4.4s       \n"

                        // v0-v2 are free again here, so the third row reuses them
                        "prfm   pldl1keep, [%4, #384]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s}, [%4] \n" // r20 r21 r22

                        "fmla   v18.4s, %15.4s, v5.4s       \n"
                        "fmla   v19.4s, %24.4s, v5.4s       \n"

                        "fmla   v16.4s, %16.4s, v0.4s       \n"
                        "fmla   v17.4s, %25.4s, v0.4s       \n"
                        "fmla   v18.4s, %17.4s, v1.4s       \n"
                        "fmla   v19.4s, %26.4s, v1.4s       \n"
                        "fmla   v16.4s, %18.4s, v2.4s       \n"
                        "fmla   v17.4s, %27.4s, v2.4s       \n"

                        // only lane 0 of v3/v4 is loaded and later stored; the other
                        // lanes still hold row-1 input data and their results are unused
                        "ld1    {v3.s}[0], [%0]             \n" // sum00
                        "ld1    {v4.s}[0], [%1]             \n" // sum10

                        // merge the two partial accumulators of each channel
                        "fadd   v16.4s, v16.4s, v18.4s      \n"
                        "fadd   v17.4s, v17.4s, v19.4s      \n"

                        "add    %2, %2, #16                 \n"

                        // two pairwise adds reduce the four lanes to one scalar in lane 0
                        "faddp  v16.4s, v16.4s, v16.4s      \n"
                        "faddp  v17.4s, v17.4s, v17.4s      \n"

                        "add    %3, %3, #16                 \n"

                        "faddp  v16.2s, v16.2s, v16.2s      \n"
                        "faddp  v17.2s, v17.2s, v17.2s      \n"

                        "add    %4, %4, #16                 \n"

                        // accumulate onto the value already stored in the output
                        "fadd   v3.2s, v3.2s, v16.2s        \n"
                        "fadd   v4.2s, v4.2s, v17.2s        \n"

                        "st1    {v3.s}[0], [%0], #4         \n"
                        "st1    {v4.s}[0], [%1], #4         \n"

                        : "=r"(outptr0), // %0
                        "=r"(outptr1), // %1
                        "=r"(r0),      // %2
                        "=r"(r1),      // %3
                        "=r"(r2)       // %4
                        : "0"(outptr0),
                        "1"(outptr1),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k00_0), // %10
                        "w"(_k01_0), // %11
                        "w"(_k02_0), // %12
                        "w"(_k10_0), // %13
                        "w"(_k11_0), // %14
                        "w"(_k12_0), // %15
                        "w"(_k20_0), // %16
                        "w"(_k21_0), // %17
                        "w"(_k22_0), // %18
                        "w"(_k00_1), // %19
                        "w"(_k01_1), // %20
                        "w"(_k02_1), // %21
                        "w"(_k10_1), // %22
                        "w"(_k11_1), // %23
                        "w"(_k12_1), // %24
                        "w"(_k20_1), // %25
                        "w"(_k21_1), // %26
                        "w"(_k22_1)  // %27
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19");
                }

                r0 += 2 * 4;
                r1 += 2 * 4;
                r2 += 2 * 4;
            }

            k0 += 9 * 4;
            k1 += 9 * 4;
        }
    }
#endif // __ARM_NEON && __aarch64__

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;
        out0.fill(bias0);

        const float* k0 = kernel.channel(p);

        for (int q = 0; q < inch; q++)
        {
            float* outptr0 = out0.row(0);

            const Mat img0 = bottom_blob.channel(q);

            const float* r0 = img0.row(0);
            const float* r1 = img0.row(1);
            const float* r2 = img0.row(2);

            float32x4_t _k00 = vld1q_f32(k0);
            float32x4_t _k01 = vld1q_f32(k0 + 4);
            float32x4_t _k02 = vld1q_f32(k0 + 8);
            float32x4_t _k10 = vld1q_f32(k0 + 12);
            float32x4_t _k11 = vld1q_f32(k0 + 16);
            float32x4_t _k12 = vld1q_f32(k0 + 20);
            float32x4_t _k20 = vld1q_f32(k0 + 24);
            float32x4_t _k21 = vld1q_f32(k0 + 28);
            float32x4_t _k22 = vld1q_f32(k0 + 32);

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;

#if __aarch64__
                // AArch64 only: eight output floats per iteration for one output
                // channel. Output n uses input vectors n, n+1, n+2 of each of the
                // three rows, so ten consecutive vectors are read per row.
                for (; j + 7 < outw; j += 8)
                {
                    // Operand map:
                    //   %0        output pointer (outptr0), advanced by 32 bytes
                    //   %1..%3    input row pointers (r0, r1, r2), each advanced by
                    //             128 bytes via the two post-incremented loads
                    //   %4..%7    matching-constraint inputs tied to %0..%3
                    //   %8..%16   the nine kernel vectors _k00.._k22
                    // v16..v23 are the per-output accumulators (four lanes each).
                    asm volatile(
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n" // r00 r01 r02 r03

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n" // r04 r05 r06 r07

                        "fmul   v16.4s, %8.4s, v0.4s        \n"
                        "fmul   v17.4s, %8.4s, v1.4s        \n"
                        "fmul   v18.4s, %8.4s, v2.4s        \n"
                        "fmul   v19.4s, %8.4s, v3.4s        \n"
                        "fmul   v20.4s, %8.4s, v4.4s        \n"
                        "fmul   v21.4s, %8.4s, v5.4s        \n"
                        "fmul   v22.4s, %8.4s, v6.4s        \n"
                        "fmul   v23.4s, %8.4s, v7.4s        \n"

                        // the two extra vectors are read without advancing %1, so the
                        // next iteration starts again at r08
                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%1]        \n" // r08 r09

                        "fmla   v16.4s, %9.4s, v1.4s        \n"
                        "fmla   v17.4s, %9.4s, v2.4s        \n"
                        "fmla   v18.4s, %9.4s, v3.4s        \n"
                        "fmla   v19.4s, %9.4s, v4.4s        \n"
                        "fmla   v20.4s, %9.4s, v5.4s        \n"
                        "fmla   v21.4s, %9.4s, v6.4s        \n"
                        "fmla   v22.4s, %9.4s, v7.4s        \n"
                        "fmla   v23.4s, %9.4s, v8.4s        \n"

                        "fmla   v16.4s, %10.4s, v2.4s       \n"
                        "fmla   v17.4s, %10.4s, v3.4s       \n"
                        "fmla   v18.4s, %10.4s, v4.4s       \n"
                        "fmla   v19.4s, %10.4s, v5.4s       \n"

                        // v0-v3 have had their last use for row 0; the next row's load
                        // is issued here, between the two halves of the %10 products
                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r10 r11 r12 r13

                        "fmla   v20.4s, %10.4s, v6.4s       \n"
                        "fmla   v21.4s, %10.4s, v7.4s       \n"
                        "fmla   v22.4s, %10.4s, v8.4s       \n"
                        "fmla   v23.4s, %10.4s, v9.4s       \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n" // r14 r15 r16 r17

                        "fmla   v16.4s, %11.4s, v0.4s       \n"
                        "fmla   v17.4s, %11.4s, v1.4s       \n"
                        "fmla   v18.4s, %11.4s, v2.4s       \n"
                        "fmla   v19.4s, %11.4s, v3.4s       \n"
                        "fmla   v20.4s, %11.4s, v4.4s       \n"
                        "fmla   v21.4s, %11.4s, v5.4s       \n"
                        "fmla   v22.4s, %11.4s, v6.4s       \n"
                        "fmla   v23.4s, %11.4s, v7.4s       \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%2]        \n" // r18 r19

                        "fmla   v16.4s, %12.4s, v1.4s       \n"
                        "fmla   v17.4s, %12.4s, v2.4s       \n"
                        "fmla   v18.4s, %12.4s, v3.4s       \n"
                        "fmla   v19.4s, %12.4s, v4.4s       \n"
                        "fmla   v20.4s, %12.4s, v5.4s       \n"
                        "fmla   v21.4s, %12.4s, v6.4s       \n"
                        "fmla   v22.4s, %12.4s, v7.4s       \n"
                        "fmla   v23.4s, %12.4s, v8.4s       \n"

                        "fmla   v16.4s, %13.4s, v2.4s       \n"
                        "fmla   v17.4s, %13.4s, v3.4s       \n"
                        "fmla   v18.4s, %13.4s, v4.4s       \n"
                        "fmla   v19.4s, %13.4s, v5.4s       \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r20 r21 r22 r23

                        "fmla   v20.4s, %13.4s, v6.4s       \n"
                        "fmla   v21.4s, %13.4s, v7.4s       \n"
                        "fmla   v22.4s, %13.4s, v8.4s       \n"
                        "fmla   v23.4s, %13.4s, v9.4s       \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // r24 r25 r26 r27

                        "fmla   v16.4s, %14.4s, v0.4s       \n"
                        "fmla   v17.4s, %14.4s, v1.4s       \n"
                        "fmla   v18.4s, %14.4s, v2.4s       \n"
                        "fmla   v19.4s, %14.4s, v3.4s       \n"
                        "fmla   v20.4s, %14.4s, v4.4s       \n"
                        "fmla   v21.4s, %14.4s, v5.4s       \n"
                        "fmla   v22.4s, %14.4s, v6.4s       \n"
                        "fmla   v23.4s, %14.4s, v7.4s       \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%3]        \n" // r28 r29

                        "fmla   v16.4s, %15.4s, v1.4s       \n"
                        "fmla   v17.4s, %15.4s, v2.4s       \n"
                        "fmla   v18.4s, %15.4s, v3.4s       \n"
                        "fmla   v19.4s, %15.4s, v4.4s       \n"
                        "fmla   v20.4s, %15.4s, v5.4s       \n"
                        "fmla   v21.4s, %15.4s, v6.4s       \n"
                        "fmla   v22.4s, %15.4s, v7.4s       \n"
                        "fmla   v23.4s, %15.4s, v8.4s       \n"

                        "fmla   v16.4s, %16.4s, v2.4s       \n"
                        "fmla   v17.4s, %16.4s, v3.4s       \n"
                        "fmla   v18.4s, %16.4s, v4.4s       \n"
                        "fmla   v19.4s, %16.4s, v5.4s       \n"
                        "fmla   v20.4s, %16.4s, v6.4s       \n"
                        "fmla   v21.4s, %16.4s, v7.4s       \n"
                        "fmla   v22.4s, %16.4s, v8.4s       \n"
                        "fmla   v23.4s, %16.4s, v9.4s       \n"

                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%0]        \n" // sum0 sum1 sum2 sum3 sum4 sum5 sum6 sum7

                        // two rounds of pairwise adds collapse each accumulator's four
                        // lanes to one scalar: v16 ends up with outputs 0-3, v20 with 4-7
                        "faddp  v16.4s, v16.4s, v17.4s      \n"
                        "faddp  v18.4s, v18.4s, v19.4s      \n"
                        "faddp  v20.4s, v20.4s, v21.4s      \n"
                        "faddp  v22.4s, v22.4s, v23.4s      \n"

                        "faddp  v16.4s, v16.4s, v18.4s      \n"
                        "faddp  v20.4s, v20.4s, v22.4s      \n"

                        // accumulate onto the values already stored in the output
                        "fadd   v0.4s, v0.4s, v16.4s        \n"
                        "fadd   v1.4s, v1.4s, v20.4s        \n"

                        "st1    {v0.4s, v1.4s}, [%0], #32   \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
                }
#endif // __aarch64__
                // Four output floats per iteration for one output channel, with an
                // AArch64 and an ARMv7 NEON variant. Output n uses input vectors
                // n, n+1, n+2 of each row, so six consecutive vectors are read per row.
                // In both variants %0 is outptr0, %1..%3 are r0/r1/r2, %4..%7 are the
                // matching-constraint inputs tied to %0..%3, and %8..%16 are the nine
                // kernel vectors _k00.._k22.
                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    // v16..v19 are the per-output accumulators. Each row pointer is
                    // advanced by 64 bytes by its post-incremented load; the two
                    // trailing vectors (v8, v9) are read without advancing.
                    asm volatile(
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n" // r00 r01 r02 r03

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%1]        \n" // r04 r05

                        "fmul   v16.4s, %8.4s, v0.4s        \n"
                        "fmul   v17.4s, %8.4s, v1.4s        \n"
                        "fmul   v18.4s, %8.4s, v2.4s        \n"
                        "fmul   v19.4s, %8.4s, v3.4s        \n"

                        "fmla   v16.4s, %9.4s, v1.4s        \n"
                        "fmla   v17.4s, %9.4s, v2.4s        \n"
                        "fmla   v18.4s, %9.4s, v3.4s        \n"
                        "fmla   v19.4s, %9.4s, v8.4s        \n"

                        // row 1 goes to v4-v7 so row 0 in v0-v3 stays live for %10
                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n" // r10 r11 r12 r13

                        "fmla   v16.4s, %10.4s, v2.4s       \n"
                        "fmla   v17.4s, %10.4s, v3.4s       \n"
                        "fmla   v18.4s, %10.4s, v8.4s       \n"
                        "fmla   v19.4s, %10.4s, v9.4s       \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%2]        \n" // r14 r15

                        "fmla   v16.4s, %11.4s, v4.4s       \n"
                        "fmla   v17.4s, %11.4s, v5.4s       \n"
                        "fmla   v18.4s, %11.4s, v6.4s       \n"
                        "fmla   v19.4s, %11.4s, v7.4s       \n"

                        "fmla   v16.4s, %12.4s, v5.4s       \n"
                        "fmla   v17.4s, %12.4s, v6.4s       \n"
                        "fmla   v18.4s, %12.4s, v7.4s       \n"
                        "fmla   v19.4s, %12.4s, v8.4s       \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r20 r21 r22 r23

                        "fmla   v16.4s, %13.4s, v6.4s       \n"
                        "fmla   v17.4s, %13.4s, v7.4s       \n"
                        "fmla   v18.4s, %13.4s, v8.4s       \n"
                        "fmla   v19.4s, %13.4s, v9.4s       \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v8.4s, v9.4s}, [%3]        \n" // r24 r25

                        "fmla   v16.4s, %14.4s, v0.4s       \n"
                        "fmla   v17.4s, %14.4s, v1.4s       \n"
                        "fmla   v18.4s, %14.4s, v2.4s       \n"
                        "fmla   v19.4s, %14.4s, v3.4s       \n"

                        "fmla   v16.4s, %15.4s, v1.4s       \n"
                        "fmla   v17.4s, %15.4s, v2.4s       \n"
                        "fmla   v18.4s, %15.4s, v3.4s       \n"
                        "fmla   v19.4s, %15.4s, v8.4s       \n"

                        "fmla   v16.4s, %16.4s, v2.4s       \n"
                        "fmla   v17.4s, %16.4s, v3.4s       \n"
                        "fmla   v18.4s, %16.4s, v8.4s       \n"
                        "fmla   v19.4s, %16.4s, v9.4s       \n"

                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v0.4s}, [%0]               \n" // sum0 sum1 sum2 sum3

                        // two rounds of pairwise adds leave the four scalar sums in v16
                        "faddp  v16.4s, v16.4s, v17.4s      \n"
                        "faddp  v18.4s, v18.4s, v19.4s      \n"

                        "faddp  v16.4s, v16.4s, v18.4s      \n"

                        // accumulate onto the values already stored in the output
                        "fadd   v0.4s, v0.4s, v16.4s        \n"

                        "st1    {v0.4s}, [%0], #16          \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19");
#else  // __aarch64__
                    // q3..q6 are the accumulators for outputs 0..3; q0-q2 are a
                    // rolling window of input vectors. Per row the writeback loads
                    // advance the pointer by 32+16+32 = 80 bytes (the sixth vector is
                    // read without writeback), and the `sub #16` near the end brings
                    // the net advance back to 64 bytes.
                    asm volatile(
                        "pld        [%1, #256]      \n"
                        "vld1.f32   {d0-d3}, [%1 :128]! \n" // r00 r01

                        "vmul.f32   q3, %q8, q0     \n"

                        "pld        [%1, #128]      \n"
                        "vld1.f32   {d4-d5}, [%1 :128]! \n" // r02

                        "vmul.f32   q4, %q8, q1     \n"
                        "vmla.f32   q3, %q9, q1     \n"

                        "pld        [%1, #256]      \n"
                        "vld1.f32   {d0-d3}, [%1 :128]! \n" // r03 r04

                        "vmul.f32   q5, %q8, q2     \n"
                        "vmla.f32   q4, %q9, q2     \n"
                        "vmla.f32   q3, %q10, q2    \n"

                        "vmul.f32   q6, %q8, q0     \n"
                        "vmla.f32   q5, %q9, q0     \n"
                        "vmla.f32   q4, %q10, q0    \n"

                        "pld        [%1, #128]      \n"
                        "vld1.f32   {d4-d5}, [%1 :128] \n" // r05

                        "vmla.f32   q6, %q9, q1     \n"
                        "vmla.f32   q5, %q10, q1    \n"

                        "pld        [%2, #256]      \n"
                        "vld1.f32   {d0-d3}, [%2 :128]! \n" // r10 r11

                        "vmla.f32   q6, %q10, q2    \n"

                        "vmla.f32   q3, %q11, q0    \n"

                        "pld        [%2, #128]      \n"
                        "vld1.f32   {d4-d5}, [%2 :128]! \n" // r12

                        "vmla.f32   q4, %q11, q1    \n"
                        "vmla.f32   q3, %q12, q1    \n"

                        "pld        [%2, #256]      \n"
                        "vld1.f32   {d0-d3}, [%2 :128]! \n" // r13 r14

                        "vmla.f32   q5, %q11, q2    \n"
                        "vmla.f32   q4, %q12, q2    \n"
                        "vmla.f32   q3, %q13, q2    \n"

                        "vmla.f32   q6, %q11, q0    \n"
                        "vmla.f32   q5, %q12, q0    \n"
                        "vmla.f32   q4, %q13, q0    \n"

                        "pld        [%2, #128]      \n"
                        "vld1.f32   {d4-d5}, [%2 :128] \n" // r15

                        "vmla.f32   q6, %q12, q1    \n"
                        "vmla.f32   q5, %q13, q1    \n"

                        "pld        [%3, #256]      \n"
                        "vld1.f32   {d0-d3}, [%3 :128]! \n" // r20 r21

                        "vmla.f32   q6, %q13, q2    \n"

                        "vmla.f32   q3, %q14, q0    \n"

                        "pld        [%3, #128]      \n"
                        "vld1.f32   {d4-d5}, [%3 :128]! \n" // r22

                        "vmla.f32   q4, %q14, q1    \n"
                        "vmla.f32   q3, %q15, q1    \n"

                        "pld        [%3, #256]      \n"
                        "vld1.f32   {d0-d3}, [%3 :128]! \n" // r23 r24

                        "vmla.f32   q5, %q14, q2    \n"
                        "vmla.f32   q4, %q15, q2    \n"
                        "vmla.f32   q3, %q16, q2    \n"

                        "vmla.f32   q6, %q14, q0    \n"
                        "vmla.f32   q5, %q15, q0    \n"
                        "vmla.f32   q4, %q16, q0    \n"

                        "pld        [%3, #128]      \n"
                        "vld1.f32   {d4-d5}, [%3 :128] \n" // r25

                        "vmla.f32   q6, %q15, q1    \n"
                        "vmla.f32   q5, %q16, q1    \n"

                        "vld1.f32   {d0-d1}, [%0]   \n" // sum0 sum1 sum2 sum3

                        "vmla.f32   q6, %q16, q2    \n"

                        // fold the high half of each accumulator onto its low half
                        "vadd.f32   d6, d6, d7      \n"
                        "vadd.f32   d8, d8, d9      \n"
                        "vadd.f32   d10, d10, d11   \n"
                        "vadd.f32   d12, d12, d13   \n"

                        "sub        %1, %1, #16     \n"

                        // pairwise adds finish the reduction; q3 (d6:d7) = four sums
                        "vpadd.f32  d6, d6, d8      \n"
                        "vpadd.f32  d7, d10, d12    \n"

                        "sub        %2, %2, #16     \n"

                        // accumulate onto the values already stored in the output
                        "vadd.f32   q0, q0, q3      \n"

                        "sub        %3, %3, #16     \n"

                        "vst1.f32   {d0-d1}, [%0]!  \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
#endif // __aarch64__
                }
                // Two output floats per iteration for one output channel, with an
                // AArch64 and an ARMv7 NEON variant. Four consecutive input vectors
                // are read per row; each row pointer ends up 32 bytes further on.
                // In both variants %0 is outptr0, %1..%3 are r0/r1/r2, %4..%7 are the
                // matching-constraint inputs tied to %0..%3, and %8..%16 are the nine
                // kernel vectors _k00.._k22.
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    // Each output has two accumulators: v16/v17 take the
                    // k00,k02,k11,k20,k22 products of output 0/1 and v18/v19 take the
                    // k01,k10,k12,k21 products. They are merged before the reduction.
                    asm volatile(
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n" // r00 r01 r02 r03

                        "fmul   v16.4s, %8.4s, v0.4s        \n"
                        "fmul   v17.4s, %8.4s, v1.4s        \n"
                        "fmul   v18.4s, %9.4s, v1.4s        \n"
                        "fmul   v19.4s, %9.4s, v2.4s        \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2] \n" // r10 r11 r12 r13

                        "fmla   v16.4s, %10.4s, v2.4s       \n"
                        "fmla   v17.4s, %10.4s, v3.4s       \n"

                        "fmla   v18.4s, %11.4s, v4.4s       \n"
                        "fmla   v19.4s, %11.4s, v5.4s       \n"
                        "fmla   v16.4s, %12.4s, v5.4s       \n"
                        "fmla   v17.4s, %12.4s, v6.4s       \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3] \n" // r20 r21 r22 r23

                        "fmla   v18.4s, %13.4s, v6.4s       \n"
                        "fmla   v19.4s, %13.4s, v7.4s       \n"

                        "fmla   v16.4s, %14.4s, v0.4s       \n"
                        "fmla   v17.4s, %14.4s, v1.4s       \n"
                        "fmla   v18.4s, %15.4s, v1.4s       \n"
                        "fmla   v19.4s, %15.4s, v2.4s       \n"
                        "fmla   v16.4s, %16.4s, v2.4s       \n"
                        "fmla   v17.4s, %16.4s, v3.4s       \n"

                        "ld1    {v0.2s}, [%0]               \n" // sum0 sum1

                        // merge the two partial accumulators of each output
                        "fadd   v16.4s, v16.4s, v18.4s      \n"
                        "fadd   v17.4s, v17.4s, v19.4s      \n"

                        // the loads above did not post-increment, so the row pointers
                        // are advanced explicitly, interleaved with the reduction
                        "add    %1, %1, #32                 \n"

                        "faddp  v16.4s, v16.4s, v17.4s      \n"

                        "add    %2, %2, #32                 \n"

                        // after this second pairwise add the low two lanes of v16 hold
                        // the two scalar sums
                        "faddp  v16.4s, v16.4s, v16.4s      \n"

                        "add    %3, %3, #32                 \n"

                        // accumulate onto the values already stored in the output
                        "fadd   v0.2s, v0.2s, v16.2s        \n"

                        "st1    {v0.2s}, [%0], #8           \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19");
#else  // __aarch64__
                    // q5/q6 take the k00,k02,k11,k20,k22 products of output 0/1 and
                    // q2/q3 take the k01,k10,k12,k21 products. q0/q1 are reloaded for
                    // every pair of input vectors. Per row only the first load uses
                    // writeback (32 bytes); the second reads in place, so the net
                    // pointer advance is 32 bytes.
                    asm volatile(
                        "pld        [%1, #256]      \n"
                        "vld1.f32   {d0-d3}, [%1 :128]! \n" // r00 r01

                        "vmul.f32   q5, %q8, q0     \n"
                        "vmul.f32   q6, %q8, q1     \n"
                        "vmul.f32   q2, %q9, q1     \n"

                        "pld        [%1, #256]      \n"
                        "vld1.f32   {d0-d3}, [%1 :128] \n" // r02 r03

                        "vmul.f32   q3, %q9, q0     \n"
                        "vmla.f32   q5, %q10, q0    \n"
                        "vmla.f32   q6, %q10, q1    \n"

                        "pld        [%2, #256]      \n"
                        "vld1.f32   {d0-d3}, [%2 :128]! \n" // r10 r11

                        "vmla.f32   q2, %q11, q0    \n"
                        "vmla.f32   q3, %q11, q1    \n"
                        "vmla.f32   q5, %q12, q1    \n"

                        "pld        [%2, #256]      \n"
                        "vld1.f32   {d0-d3}, [%2 :128] \n" // r12 r13

                        "vmla.f32   q6, %q12, q0    \n"
                        "vmla.f32   q2, %q13, q0    \n"
                        "vmla.f32   q3, %q13, q1    \n"

                        "pld        [%3, #256]      \n"
                        "vld1.f32   {d0-d3}, [%3 :128]! \n" // r20 r21

                        "vmla.f32   q5, %q14, q0    \n"
                        "vmla.f32   q6, %q14, q1    \n"
                        "vmla.f32   q2, %q15, q1    \n"

                        "pld        [%3, #256]      \n"
                        "vld1.f32   {d0-d3}, [%3 :128] \n" // r22 r23

                        "vmla.f32   q3, %q15, q0    \n"
                        "vmla.f32   q5, %q16, q0    \n"
                        "vmla.f32   q6, %q16, q1    \n"

                        "vld1.f32   {d8}, [%0]      \n" // sum0 sum1

                        // merge the two partial accumulators of each output
                        "vadd.f32   q5, q5, q2      \n"
                        "vadd.f32   q6, q6, q3      \n"

                        // fold high half onto low half, then pairwise add: d10 = two sums
                        "vadd.f32   d10, d10, d11   \n"
                        "vadd.f32   d12, d12, d13   \n"

                        "vpadd.f32  d10, d10, d12   \n"

                        // accumulate onto the values already stored in the output
                        "vadd.f32   d8, d8, d10     \n"

                        "vst1.f32   {d8}, [%0]!     \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%1, #384]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s}, [%1] \n" // r00 r01 r02

                        "eor    v16.16b, v16.16b, v16.16b   \n"
                        "ld1    {v16.s}[0], [%0]            \n" // sum0

                        "fmul   v17.4s, %8.4s, v0.4s        \n"
                        "fmul   v18.4s, %9.4s, v1.4s        \n"

                        "prfm   pldl1keep, [%2, #384]       \n"
                        "ld1    {v3.4s, v4.4s, v5.4s}, [%2] \n" // r10 r11 r12

                        "fmla   v16.4s, %10.4s, v2.4s       \n"

                        "fmla   v17.4s, %11.4s, v3.4s       \n"
                        "fmla   v18.4s, %12.4s, v4.4s       \n"

                        "prfm   pldl1keep, [%3, #384]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s}, [%3] \n" // r20 r21 r22

                        "fmla   v16.4s, %13.4s, v5.4s       \n"

                        "fmla   v17.4s, %14.4s, v0.4s       \n"
                        "fmla   v18.4s, %15.4s, v1.4s       \n"
                        "fmla   v16.4s, %16.4s, v2.4s       \n"

                        "fadd   v17.4s, v17.4s, v18.4s      \n"
                        "fadd   v16.4s, v16.4s, v17.4s      \n"

                        "add    %1, %1, #16                 \n"

                        "faddp  v16.4s, v16.4s, v16.4s      \n"

                        "add    %2, %2, #16                 \n"

                        "faddp  v16.2s, v16.2s, v16.2s      \n"

                        "add    %3, %3, #16                 \n"

                        "st1    {v16.s}[0], [%0], #4        \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%1, #384]      \n"
                        "vldm       %1, {d0-d5}     \n" // r00 r01 r02

                        "veor       q3, q3          \n"
                        "vld1.f32   {d6[0]}, [%0]   \n" // sum0

                        "vmul.f32   q4, %q8, q0     \n"
                        "vmul.f32   q5, %q9, q1     \n"
                        "vmla.f32   q3, %q10, q2    \n"

                        "pld        [%2, #384]      \n"
                        "vldm       %2, {d0-d5}     \n" // r10 r11 r12

                        "vmla.f32   q4, %q11, q0    \n"
                        "vmla.f32   q5, %q12, q1    \n"
                        "vmla.f32   q3, %q13, q2    \n"

                        "pld        [%3, #384]      \n"
                        "vldm       %3, {d0-d5}     \n" // r20 r21 r22

                        "vmla.f32   q4, %q14, q0    \n"
                        "vmla.f32   q5, %q15, q1    \n"
                        "vmla.f32   q3, %q16, q2    \n"

                        "vadd.f32   q4, q4, q5      \n"
                        "vadd.f32   q3, q3, q4      \n"

                        "add        %1, %1, #16     \n"

                        "vadd.f32   d6, d6, d7      \n"

                        "add        %2, %2, #16     \n"

                        "vpadd.f32  d6, d6, d6      \n"

                        "add        %3, %3, #16     \n"

                        "vst1.f32   {d6[0]}, [%0]!  \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2)       // %3
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "w"(_k00), // %8
                        "w"(_k01), // %9
                        "w"(_k02), // %10
                        "w"(_k10), // %11
                        "w"(_k11), // %12
                        "w"(_k12), // %13
                        "w"(_k20), // %14
                        "w"(_k21), // %15
                        "w"(_k22)  // %16
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5");
#endif // __aarch64__
                }

                r0 += 2 * 4;
                r1 += 2 * 4;
                r2 += 2 * 4;
            }

            k0 += 9 * 4;
        }
    }
}


================================================
FILE: src/layer/arm/convolution_3x3_pack8_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const __fp16* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        float16x8_t _bias0 = bias ? vld1q_f16(bias + p * 8) : vdupq_n_f16(0.f);
        out0.fill(_bias0);

        for (int q = 0; q < inch; q++)
        {
            __fp16* outptr0 = out0.row<__fp16>(0);

            const Mat img0 = bottom_blob.channel(q);

            const __fp16* r0 = img0.row<const __fp16>(0);
            const __fp16* r1 = img0.row<const __fp16>(1);
            const __fp16* r2 = img0.row<const __fp16>(2);

            const __fp16* kptr = kernel.channel(p).row<const __fp16>(q);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 3 < outw; j += 4)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n" // r00 r01 r02 r03

                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v4.8h, v5.8h}, [%1]        \n" // r04 r05

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v3.h[0]     \n"

                        "fmla   v28.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v3.h[2]     \n"

                        "fmla   v28.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v3.h[4]     \n"

                        "fmla   v28.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v3.h[6]     \n"

                        "fmla   v28.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v4.h[0]     \n"

                        "fmla   v28.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v4.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v4.h[2]     \n"

                        "fmla   v28.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v4.h[4]     \n"

                        "fmla   v28.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v4.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v4.h[6]     \n"

                        "fmla   v28.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v5.h[0]     \n"

                        "fmla   v28.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v5.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v5.h[2]     \n"

                        "fmla   v28.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v5.h[3]     \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v8.8h, v9.8h, v10.8h, v11.8h}, [%2], #64 \n" // r10 r11 r12 r13

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v5.h[4]     \n"

                        "fmla   v28.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v5.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v5.h[6]     \n"

                        "fmla   v28.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v5.h[7]     \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v12.8h, v13.8h}, [%2]      \n" // r14 r15

                        "fmla   v28.8h, v16.8h, v8.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v11.h[0]    \n"

                        "fmla   v28.8h, v17.8h, v8.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v9.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v11.h[1]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v8.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v11.h[2]    \n"

                        "fmla   v28.8h, v19.8h, v8.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v9.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v10.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v11.h[3]    \n"

                        "fmla   v28.8h, v20.8h, v8.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v11.h[4]    \n"

                        "fmla   v28.8h, v21.8h, v8.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v11.h[5]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v8.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v11.h[6]    \n"

                        "fmla   v28.8h, v23.8h, v8.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v9.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v10.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v11.h[7]    \n"

                        "fmla   v28.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v30.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v12.h[0]    \n"

                        "fmla   v28.8h, v17.8h, v9.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v30.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v12.h[1]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v30.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v12.h[2]    \n"

                        "fmla   v28.8h, v19.8h, v9.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v10.h[3]    \n"
                        "fmla   v30.8h, v19.8h, v11.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v12.h[3]    \n"

                        "fmla   v28.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v30.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v12.h[4]    \n"

                        "fmla   v28.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v30.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v12.h[5]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v30.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v12.h[6]    \n"

                        "fmla   v28.8h, v23.8h, v9.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v10.h[7]    \n"
                        "fmla   v30.8h, v23.8h, v11.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v12.h[7]    \n"

                        "fmla   v28.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v29.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v30.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v13.h[0]    \n"

                        "fmla   v28.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v29.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v30.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v13.h[1]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v29.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v30.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v13.h[2]    \n"

                        "fmla   v28.8h, v19.8h, v10.h[3]    \n"
                        "fmla   v29.8h, v19.8h, v11.h[3]    \n"
                        "fmla   v30.8h, v19.8h, v12.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v13.h[3]    \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" // r20 r21 r22 r23

                        "fmla   v28.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v29.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v30.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v13.h[4]    \n"

                        "fmla   v28.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v29.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v30.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v13.h[5]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v29.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v30.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v13.h[6]    \n"

                        "fmla   v28.8h, v23.8h, v10.h[7]    \n"
                        "fmla   v29.8h, v23.8h, v11.h[7]    \n"
                        "fmla   v30.8h, v23.8h, v12.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v13.h[7]    \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v4.8h, v5.8h}, [%3]        \n" // r24 r25

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v3.h[0]     \n"

                        "fmla   v28.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v3.h[2]     \n"

                        "fmla   v28.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v3.h[4]     \n"

                        "fmla   v28.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v3.h[6]     \n"

                        "fmla   v28.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v4.h[0]     \n"

                        "fmla   v28.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v4.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v4.h[2]     \n"

                        "fmla   v28.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v4.h[4]     \n"

                        "fmla   v28.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v4.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v4.h[6]     \n"

                        "fmla   v28.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v5.h[0]     \n"

                        "fmla   v28.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v5.h[1]     \n"

                        // "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4] \n"

                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v5.h[2]     \n"

                        "fmla   v28.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v5.h[3]     \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v5.h[4]     \n"

                        "fmla   v28.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v5.h[5]     \n"

                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v5.h[6]     \n"

                        "fmla   v28.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v5.h[7]     \n"

                        "sub    %4, %4, #1088               \n" // kptr -= 8.5 * 64;

                        "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(kptr)     // %4
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31");
                }
                for (; j + 1 < outw; j += 2)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%1] \n" // r00 r01 r02 r03

                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v30.8h, v31.8h}, [%0]      \n" // sum0

                        "fmul   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmul   v29.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v1.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v1.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v2.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v2.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%2] \n" // r10 r11 r12 r13

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v5.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v5.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v5.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v5.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v6.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v6.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v6.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v5.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v6.h[3]     \n"

                        "fmla   v28.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v6.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v6.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v6.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v5.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v6.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v6.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v7.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v6.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v7.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v6.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v7.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v6.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v7.h[3]     \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%3] \n" // r20 r21 r22 r23

                        "fmla   v28.8h, v20.8h, v6.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v7.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v6.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v7.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v6.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v7.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v6.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v7.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v1.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v1.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v2.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v2.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[1]     \n"

                        // "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4] \n"

                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "add    %1, %1, #32                 \n"

                        "add    %2, %2, #32                 \n"

                        "add    %3, %3, #32                 \n"

                        "fadd   v28.8h, v28.8h, v30.8h      \n"
                        "fadd   v29.8h, v29.8h, v31.8h      \n"

                        "sub    %4, %4, #1088               \n" // kptr -= 8.5 * 64;

                        "st1    {v28.8h, v29.8h}, [%0], #32 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(kptr)     // %4
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31");
                }
                for (; j < outw; j++)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "prfm   pldl1keep, [%1, #384]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h}, [%1] \n" // r00 r01 r02

                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v31.8h}, [%0]              \n" // sum0

                        "fmul   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmul   v29.8h, v17.8h, v0.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmul   v30.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v0.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "prfm   pldl1keep, [%2, #384]       \n"
                        "ld1    {v3.8h, v4.8h, v5.8h}, [%2] \n" // r10 r11 r12

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v3.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v3.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v4.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v4.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v5.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v5.h[3]     \n"

                        "prfm   pldl1keep, [%3, #384]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h}, [%3] \n" // r20 r21 r22

                        "fmla   v28.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v5.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v5.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v0.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v0.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"

                        // "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4] \n"

                        "fmla   v30.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"

                        "add    %1, %1, #16                 \n"

                        "fmla   v30.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "add    %2, %2, #16                 \n"

                        "fadd   v28.8h, v28.8h, v29.8h      \n"
                        "fadd   v30.8h, v30.8h, v31.8h      \n"

                        "add    %3, %3, #16                 \n"

                        "fadd   v28.8h, v28.8h, v30.8h      \n"

                        "sub    %4, %4, #1088               \n" // kptr -= 8.5 * 64;

                        "st1    {v28.8h}, [%0], #16         \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(kptr)     // %4
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31");
                }

                r0 += 16;
                r1 += 16;
                r2 += 16;
            }
        }
    }
}

static void conv3x3s2_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int tailstep = (w - 2 * outw + w) * 8;

    const __fp16* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        float16x8_t _bias0 = bias ? vld1q_f16(bias + p * 8) : vdupq_n_f16(0.f);
        out0.fill(_bias0);

        for (int q = 0; q < inch; q++)
        {
            __fp16* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            const __fp16* r0 = img0.row<const __fp16>(0);
            const __fp16* r1 = img0.row<const __fp16>(1);
            const __fp16* r2 = img0.row<const __fp16>(2);

            const __fp16* kptr = kernel.channel(p).row<const __fp16>(q);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 3 < outw; j += 4)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n" // r00 r01 r02 r03

                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%1], #64 \n" // r04 r05 r06 r07

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v6.h[0]     \n"

                        "fmla   v28.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v6.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v6.h[2]     \n"

                        "fmla   v28.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v6.h[3]     \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v6.h[4]     \n"

                        "fmla   v28.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v6.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v6.h[6]     \n"

                        "fmla   v28.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v6.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v7.h[0]     \n"

                        "fmla   v28.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v7.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v7.h[2]     \n"

                        "fmla   v28.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v5.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v7.h[3]     \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v7.h[4]     \n"

                        "fmla   v28.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v7.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v7.h[6]     \n"

                        "fmla   v28.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v5.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v7.h[7]     \n"

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v0.8h}, [%1]               \n" // r08

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v6.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v0.h[0]     \n"

                        "fmla   v28.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v6.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v0.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v6.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v0.h[2]     \n"

                        "fmla   v28.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v6.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[3]     \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v8.8h, v9.8h, v10.8h, v11.8h}, [%2], #64 \n" // r10 r11 r12 r13

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v6.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v0.h[4]     \n"

                        "fmla   v28.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v6.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v0.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v6.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v0.h[6]     \n"

                        "fmla   v28.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v6.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[7]     \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%2], #64 \n" // r14 r15 r16 r17

                        "fmla   v28.8h, v16.8h, v8.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v30.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v14.h[0]    \n"

                        "fmla   v28.8h, v17.8h, v8.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v30.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v14.h[1]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v8.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v30.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v14.h[2]    \n"

                        "fmla   v28.8h, v19.8h, v8.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v10.h[3]    \n"
                        "fmla   v30.8h, v19.8h, v12.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v14.h[3]    \n"

                        "fmla   v28.8h, v20.8h, v8.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v30.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v14.h[4]    \n"

                        "fmla   v28.8h, v21.8h, v8.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v30.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v14.h[5]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v8.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v30.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v14.h[6]    \n"

                        "fmla   v28.8h, v23.8h, v8.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v10.h[7]    \n"
                        "fmla   v30.8h, v23.8h, v12.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v14.h[7]    \n"

                        "fmla   v28.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v30.8h, v16.8h, v13.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v15.h[0]    \n"

                        "fmla   v28.8h, v17.8h, v9.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v30.8h, v17.8h, v13.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v15.h[1]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v30.8h, v18.8h, v13.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v15.h[2]    \n"

                        "fmla   v28.8h, v19.8h, v9.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v11.h[3]    \n"
                        "fmla   v30.8h, v19.8h, v13.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v15.h[3]    \n"

                        "fmla   v28.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v30.8h, v20.8h, v13.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v15.h[4]    \n"

                        "fmla   v28.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v30.8h, v21.8h, v13.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v15.h[5]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v30.8h, v22.8h, v13.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v15.h[6]    \n"

                        "fmla   v28.8h, v23.8h, v9.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v11.h[7]    \n"
                        "fmla   v30.8h, v23.8h, v13.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v15.h[7]    \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v8.8h}, [%2]               \n" // r18

                        "fmla   v28.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v29.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v30.8h, v16.8h, v14.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v8.h[0]     \n"

                        "fmla   v28.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v29.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v30.8h, v17.8h, v14.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v8.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v29.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v30.8h, v18.8h, v14.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v8.h[2]     \n"

                        "fmla   v28.8h, v19.8h, v10.h[3]    \n"
                        "fmla   v29.8h, v19.8h, v12.h[3]    \n"
                        "fmla   v30.8h, v19.8h, v14.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v8.h[3]     \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" // r20 r21 r22 r23

                        "fmla   v28.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v29.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v30.8h, v20.8h, v14.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v8.h[4]     \n"

                        "fmla   v28.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v29.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v30.8h, v21.8h, v14.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v8.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v29.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v30.8h, v22.8h, v14.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v8.h[6]     \n"

                        "fmla   v28.8h, v23.8h, v10.h[7]    \n"
                        "fmla   v29.8h, v23.8h, v12.h[7]    \n"
                        "fmla   v30.8h, v23.8h, v14.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v8.h[7]     \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%3], #64 \n" // r24 r25 r26 r27

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v6.h[0]     \n"

                        "fmla   v28.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v6.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v6.h[2]     \n"

                        "fmla   v28.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v6.h[3]     \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v6.h[4]     \n"

                        "fmla   v28.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v6.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v6.h[6]     \n"

                        "fmla   v28.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v6.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v7.h[0]     \n"

                        "fmla   v28.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v7.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v7.h[2]     \n"

                        "fmla   v28.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v5.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v7.h[3]     \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v7.h[4]     \n"

                        "fmla   v28.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v7.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v7.h[6]     \n"

                        "fmla   v28.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v5.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v7.h[7]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v0.8h}, [%3]               \n" // r28

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v6.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v0.h[0]     \n"

                        "fmla   v28.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v6.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v0.h[1]     \n"

                        // "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4] \n"

                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v6.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v0.h[2]     \n"

                        "fmla   v28.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v6.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[3]     \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v6.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v0.h[4]     \n"

                        "fmla   v28.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v6.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v0.h[5]     \n"

                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v6.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v0.h[6]     \n"

                        "fmla   v28.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v6.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[7]     \n"

                        "sub    %4, %4, #1088               \n" // kptr -= 8.5 * 64;

                        "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(kptr)     // %4
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31");
                }
                for (; j + 1 < outw; j += 2)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n" // r00 r01 r02 r03

                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v30.8h, v31.8h}, [%0]      \n" // sum0

                        "fmul   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmul   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v2.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v2.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v0.8h}, [%1]               \n" // r04

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v0.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[3]     \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%2], #64 \n" // r10 r11 r12 r13

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v0.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v6.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v6.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v6.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v6.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v6.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v6.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v6.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v6.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v7.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v7.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v7.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v5.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v7.h[3]     \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v4.8h}, [%2]               \n" // r14

                        "fmla   v28.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v7.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v7.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v7.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v5.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v7.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v6.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v6.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v4.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v6.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v6.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" // r20 r21 r22 r23

                        "fmla   v28.8h, v20.8h, v6.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v6.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v4.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v6.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v6.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v2.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v2.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v0.8h}, [%3]               \n" // r24

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v0.h[1]     \n"

                        // "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4] \n"

                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[7]     \n"

                        "fadd   v28.8h, v28.8h, v30.8h      \n"
                        "fadd   v29.8h, v29.8h, v31.8h      \n"

                        "sub    %4, %4, #1088               \n" // kptr -= 8.5 * 64;

                        "st1    {v28.8h, v29.8h}, [%0], #32 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(kptr)     // %4
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31");
                }
                for (; j < outw; j++)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "prfm   pldl1keep, [%1, #384]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h}, [%1] \n" // r00 r01 r02

                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v31.8h}, [%0]              \n" // sum0

                        "fmul   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmul   v29.8h, v17.8h, v0.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmul   v30.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v0.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "prfm   pldl1keep, [%2, #384]       \n"
                        "ld1    {v3.8h, v4.8h, v5.8h}, [%2] \n" // r10 r11 r12

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v3.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v3.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v4.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v4.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v5.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v5.h[3]     \n"

                        "prfm   pldl1keep, [%3, #384]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h}, [%3] \n" // r20 r21 r22

                        "fmla   v28.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v5.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v5.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v0.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v0.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[5]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n"

                        "fmla   v30.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"

                        // "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4] \n"

                        "fmla   v30.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"

                        "add    %1, %1, #32                 \n"

                        "fmla   v30.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "add    %2, %2, #32                 \n"

                        "fadd   v28.8h, v28.8h, v29.8h      \n"
                        "fadd   v30.8h, v30.8h, v31.8h      \n"

                        "add    %3, %3, #32                 \n"

                        "fadd   v28.8h, v28.8h, v30.8h      \n"

                        "sub    %4, %4, #1088               \n" // kptr -= 8.5 * 64;

                        "st1    {v28.8h}, [%0], #16         \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(kptr)     // %4
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31");
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_3x3_winograd.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Gather the A operand into per-batch rows of AT.
//
// For every b in [0, batch) the destination is AT.row(b). Rows of A (index ii) are
// consumed in groups of 8 (aarch64 only), 4 (NEON only), 2 and 1. Within a group the
// output is written kk by kk, and for each kk the values of the group's rows are
// stored next to each other.
//
// Source addressing: element (ii, kk, b) is read from float offset
// ii * (max_kk * batch) + kk * batch + b of A.
static void conv3x3s1_winograd_pack_A_tile(const Mat& A, Mat& AT, int batch, int max_ii, int max_kk)
{
    // float distance between two consecutive ii rows of A
    const int row_stride = max_kk * batch;
    const float* const src_base = (const float*)A;

    for (int b = 0; b < batch; ++b)
    {
        float* dst = AT.row(b);

        int ii = 0;
#if __ARM_NEON
#if __aarch64__
        // interleave eight rows at a time
        while (max_ii - ii >= 8)
        {
            const float* src = src_base + ii * row_stride + b;

            for (int kk = 0; kk < max_kk; ++kk)
            {
                for (int r = 0; r < 8; ++r)
                {
                    dst[r] = src[r * row_stride];
                }
                src += batch;
                dst += 8;
            }

            ii += 8;
        }
#endif // __aarch64__
        // interleave four rows at a time
        while (max_ii - ii >= 4)
        {
            const float* src = src_base + ii * row_stride + b;

            for (int kk = 0; kk < max_kk; ++kk)
            {
                for (int r = 0; r < 4; ++r)
                {
                    dst[r] = src[r * row_stride];
                }
                src += batch;
                dst += 4;
            }

            ii += 4;
        }
#endif // __ARM_NEON
        // interleave two rows at a time
        while (max_ii - ii >= 2)
        {
            const float* src = src_base + ii * row_stride + b;

            for (int kk = 0; kk < max_kk; ++kk)
            {
                dst[0] = src[0];
                dst[1] = src[row_stride];
                src += batch;
                dst += 2;
            }

            ii += 2;
        }
        // leftover single rows are copied with a plain strided gather
        for (; ii < max_ii; ++ii)
        {
            const float* src = src_base + ii * row_stride + b;

            for (int kk = 0; kk < max_kk; ++kk)
            {
                *dst++ = *src;
                src += batch;
            }
        }
    }
}

// Repack the B operand into per-batch rows of BT, transposing every tile so that the
// column index jj becomes the fastest-varying one in the output.
//
// For every b in [0, batch) the destination is BT.row(b). Columns are consumed in
// tiles of 12 (aarch64 only), 8 and 4 (NEON only), then 2 and 1. Inside a column tile
// the kk range is consumed in groups of 8 (aarch64 only), 4 (NEON only for the 12/8/4
// tiles and for the vector path of the 2/1 tiles), 2 and 1.
//
// Source addressing, as implied by the pointer arithmetic below: for a kk group of
// size P (8, 4, 2 or 1), the P values of column jj in batch b start at float offset
// (b * max_jj + jj) * P relative to the start of that group's region, and successive
// groups are max_jj * batch * P floats apart. The P values of one column are adjacent.
//
// Destination layout: for each kk, the values of all columns of the tile are stored
// contiguously, kk after kk.
//
// B       source, read through its conversion to const float*
// BT      destination, one row per batch index
// batch   number of BT rows to fill
// max_jj  number of columns handled by this call
// max_kk  number of kk elements handled by this call
// nT      thread count handed to the OpenMP parallel-for over b
static void conv3x3s1_winograd_transpose_pack_B_tile(const Mat& B, Mat& BT, int batch, int max_jj, int max_kk, int nT)
{
    #pragma omp parallel for num_threads(nT)
    for (int b = 0; b < batch; b++)
    {
        // write cursor for this batch row; it runs through all column tiles in order
        float* pp = BT.row(b);

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        // ---- 12-column tiles ----
        for (; jj + 11 < max_jj; jj += 12)
        {
            const float* p0 = B;

            int kk = 0;
            // position on column jj of batch b, 8 floats per column
            p0 += (b * max_jj + jj) * 8;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // transpose 8x12
#if NCNN_GNU_INLINE_ASM
                // ld4 de-interleaves with stride 4, uzp1/uzp2 then split even/odd lanes,
                // which yields one register per (kk, 4 columns).
                // The five post-incremented loads move %0 forward by 320 bytes and the
                // "sub" undoes that; the 24 post-incremented stores advance pp by 96 floats.
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0]  \n"

                    "uzp1   v24.4s, v0.4s, v4.4s        \n"
                    "uzp2   v25.4s, v0.4s, v4.4s        \n"
                    "uzp1   v26.4s, v1.4s, v5.4s        \n"
                    "uzp2   v27.4s, v1.4s, v5.4s        \n"
                    "uzp1   v28.4s, v2.4s, v6.4s        \n"
                    "uzp2   v29.4s, v2.4s, v6.4s        \n"
                    "uzp1   v30.4s, v3.4s, v7.4s        \n"
                    "uzp2   v31.4s, v3.4s, v7.4s        \n"

                    "uzp1   v0.4s, v8.4s, v12.4s        \n"
                    "uzp2   v1.4s, v8.4s, v12.4s        \n"
                    "uzp1   v2.4s, v9.4s, v13.4s        \n"
                    "uzp2   v3.4s, v9.4s, v13.4s        \n"
                    "uzp1   v4.4s, v10.4s, v14.4s       \n"
                    "uzp2   v5.4s, v10.4s, v14.4s       \n"
                    "uzp1   v6.4s, v11.4s, v15.4s       \n"
                    "uzp2   v7.4s, v11.4s, v15.4s       \n"

                    "sub    %0, %0, #320                \n"

                    "uzp1   v8.4s, v16.4s, v20.4s       \n"
                    "uzp2   v9.4s, v16.4s, v20.4s       \n"
                    "uzp1   v10.4s, v17.4s, v21.4s      \n"
                    "uzp2   v11.4s, v17.4s, v21.4s      \n"
                    "uzp1   v12.4s, v18.4s, v22.4s      \n"
                    "uzp2   v13.4s, v18.4s, v22.4s      \n"
                    "uzp1   v14.4s, v19.4s, v23.4s      \n"
                    "uzp2   v15.4s, v19.4s, v23.4s      \n"

                    "st1    {v24.4s}, [%1], #16         \n"
                    "st1    {v0.4s}, [%1], #16          \n"
                    "st1    {v8.4s}, [%1], #16          \n"
                    "st1    {v26.4s}, [%1], #16         \n"
                    "st1    {v2.4s}, [%1], #16          \n"
                    "st1    {v10.4s}, [%1], #16         \n"
                    "st1    {v28.4s}, [%1], #16         \n"
                    "st1    {v4.4s}, [%1], #16          \n"
                    "st1    {v12.4s}, [%1], #16         \n"
                    "st1    {v30.4s}, [%1], #16         \n"
                    "st1    {v6.4s}, [%1], #16          \n"
                    "st1    {v14.4s}, [%1], #16         \n"

                    "st1    {v25.4s}, [%1], #16         \n"
                    "st1    {v1.4s}, [%1], #16          \n"
                    "st1    {v9.4s}, [%1], #16          \n"
                    "st1    {v27.4s}, [%1], #16         \n"
                    "st1    {v3.4s}, [%1], #16          \n"
                    "st1    {v11.4s}, [%1], #16         \n"
                    "st1    {v29.4s}, [%1], #16         \n"
                    "st1    {v5.4s}, [%1], #16          \n"
                    "st1    {v13.4s}, [%1], #16         \n"
                    "st1    {v31.4s}, [%1], #16         \n"
                    "st1    {v7.4s}, [%1], #16          \n"
                    "st1    {v15.4s}, [%1], #16         \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                // step to the next group of 8 kk
                p0 += max_jj * batch * 8;
#else  // NCNN_GNU_INLINE_ASM
                // intrinsic equivalent of the asm above: l/m/h name columns 0-3, 4-7, 8-11
                float32x4x4_t _r0 = vld4q_f32(p0);
                float32x4x4_t _r1 = vld4q_f32(p0 + 16);
                float32x4x4_t _r2 = vld4q_f32(p0 + 32);
                float32x4x4_t _r3 = vld4q_f32(p0 + 48);
                float32x4x4_t _r4 = vld4q_f32(p0 + 64);
                float32x4x4_t _r5 = vld4q_f32(p0 + 80);
                float32x4x2_t _r04l = vuzpq_f32(_r0.val[0], _r1.val[0]);
                float32x4x2_t _r15l = vuzpq_f32(_r0.val[1], _r1.val[1]);
                float32x4x2_t _r26l = vuzpq_f32(_r0.val[2], _r1.val[2]);
                float32x4x2_t _r37l = vuzpq_f32(_r0.val[3], _r1.val[3]);
                float32x4x2_t _r04m = vuzpq_f32(_r2.val[0], _r3.val[0]);
                float32x4x2_t _r15m = vuzpq_f32(_r2.val[1], _r3.val[1]);
                float32x4x2_t _r26m = vuzpq_f32(_r2.val[2], _r3.val[2]);
                float32x4x2_t _r37m = vuzpq_f32(_r2.val[3], _r3.val[3]);
                float32x4x2_t _r04h = vuzpq_f32(_r4.val[0], _r5.val[0]);
                float32x4x2_t _r15h = vuzpq_f32(_r4.val[1], _r5.val[1]);
                float32x4x2_t _r26h = vuzpq_f32(_r4.val[2], _r5.val[2]);
                float32x4x2_t _r37h = vuzpq_f32(_r4.val[3], _r5.val[3]);
                // kk 0..3 come from the even lanes (val[0]) ...
                vst1q_f32(pp, _r04l.val[0]);
                vst1q_f32(pp + 4, _r04m.val[0]);
                vst1q_f32(pp + 4 * 2, _r04h.val[0]);
                vst1q_f32(pp + 4 * 3, _r15l.val[0]);
                vst1q_f32(pp + 4 * 4, _r15m.val[0]);
                vst1q_f32(pp + 4 * 5, _r15h.val[0]);
                vst1q_f32(pp + 4 * 6, _r26l.val[0]);
                vst1q_f32(pp + 4 * 7, _r26m.val[0]);
                vst1q_f32(pp + 4 * 8, _r26h.val[0]);
                vst1q_f32(pp + 4 * 9, _r37l.val[0]);
                vst1q_f32(pp + 4 * 10, _r37m.val[0]);
                vst1q_f32(pp + 4 * 11, _r37h.val[0]);
                // ... and kk 4..7 from the odd lanes (val[1])
                vst1q_f32(pp + 4 * 12, _r04l.val[1]);
                vst1q_f32(pp + 4 * 13, _r04m.val[1]);
                vst1q_f32(pp + 4 * 14, _r04h.val[1]);
                vst1q_f32(pp + 4 * 15, _r15l.val[1]);
                vst1q_f32(pp + 4 * 16, _r15m.val[1]);
                vst1q_f32(pp + 4 * 17, _r15h.val[1]);
                vst1q_f32(pp + 4 * 18, _r26l.val[1]);
                vst1q_f32(pp + 4 * 19, _r26m.val[1]);
                vst1q_f32(pp + 4 * 20, _r26h.val[1]);
                vst1q_f32(pp + 4 * 21, _r37l.val[1]);
                vst1q_f32(pp + 4 * 22, _r37m.val[1]);
                vst1q_f32(pp + 4 * 23, _r37h.val[1]);
                p0 += max_jj * batch * 8;
                pp += 96;
#endif // NCNN_GNU_INLINE_ASM
            }
            // re-base the column offset from 8 floats per column to 4 floats per column
            p0 -= (b * max_jj + jj) * 8;
            p0 += (b * max_jj + jj) * 4;
            for (; kk + 3 < max_kk; kk += 4)
            {
                // transpose 4x12
                float32x4x4_t _r0 = vld4q_f32(p0);
                float32x4x4_t _r1 = vld4q_f32(p0 + 16);
                float32x4x4_t _r2 = vld4q_f32(p0 + 32);
                vst1q_f32(pp, _r0.val[0]);
                vst1q_f32(pp + 4, _r1.val[0]);
                vst1q_f32(pp + 4 * 2, _r2.val[0]);
                vst1q_f32(pp + 4 * 3, _r0.val[1]);
                vst1q_f32(pp + 4 * 4, _r1.val[1]);
                vst1q_f32(pp + 4 * 5, _r2.val[1]);
                vst1q_f32(pp + 4 * 6, _r0.val[2]);
                vst1q_f32(pp + 4 * 7, _r1.val[2]);
                vst1q_f32(pp + 4 * 8, _r2.val[2]);
                vst1q_f32(pp + 4 * 9, _r0.val[3]);
                vst1q_f32(pp + 4 * 10, _r1.val[3]);
                vst1q_f32(pp + 4 * 11, _r2.val[3]);
                p0 += max_jj * batch * 4;
                pp += 48;
            }
            // 4 floats per column -> 2 floats per column (net effect of -4x then +2x)
            p0 -= (b * max_jj + jj) * 2;
            for (; kk + 1 < max_kk; kk += 2)
            {
                // transpose 2x12
                float32x4x2_t _r0 = vld2q_f32(p0);
                float32x4x2_t _r1 = vld2q_f32(p0 + 8);
                float32x4x2_t _r2 = vld2q_f32(p0 + 16);
                vst1q_f32(pp, _r0.val[0]);
                vst1q_f32(pp + 4, _r1.val[0]);
                vst1q_f32(pp + 4 * 2, _r2.val[0]);
                vst1q_f32(pp + 4 * 3, _r0.val[1]);
                vst1q_f32(pp + 4 * 4, _r1.val[1]);
                vst1q_f32(pp + 4 * 5, _r2.val[1]);
                p0 += max_jj * batch * 2;
                pp += 24;
            }
            // 2 floats per column -> 1 float per column (net effect of -2x then +1x)
            p0 -= (b * max_jj + jj);
            for (; kk < max_kk; kk++)
            {
                // single kk: the 12 column values are already contiguous, plain copy
                float32x4_t _r0 = vld1q_f32(p0);
                float32x4_t _r1 = vld1q_f32(p0 + 4);
                float32x4_t _r2 = vld1q_f32(p0 + 8);
                vst1q_f32(pp, _r0);
                vst1q_f32(pp + 4, _r1);
                vst1q_f32(pp + 8, _r2);
                p0 += max_jj * batch;
                pp += 12;
            }
        }
#endif // __aarch64__
        // ---- 8-column tiles ----
        for (; jj + 7 < max_jj; jj += 8)
        {
            const float* p0 = B;

            int kk = 0;
#if __aarch64__
            p0 += (b * max_jj + jj) * 8;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // transpose 8x8
#if NCNN_GNU_INLINE_ASM
                // three post-incremented loads (+192 bytes) are undone by the "sub";
                // the four 64-byte post-incremented stores advance pp by 64 floats
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0] \n"

                    "uzp1   v16.4s, v0.4s, v4.4s        \n"
                    "uzp2   v24.4s, v0.4s, v4.4s        \n"
                    "uzp1   v18.4s, v1.4s, v5.4s        \n"
                    "uzp2   v26.4s, v1.4s, v5.4s        \n"
                    "uzp1   v20.4s, v2.4s, v6.4s        \n"
                    "uzp2   v28.4s, v2.4s, v6.4s        \n"
                    "uzp1   v22.4s, v3.4s, v7.4s        \n"
                    "uzp2   v30.4s, v3.4s, v7.4s        \n"

                    "sub    %0, %0, #192                \n"

                    "uzp1   v17.4s, v8.4s, v12.4s       \n"
                    "uzp2   v25.4s, v8.4s, v12.4s       \n"
                    "uzp1   v19.4s, v9.4s, v13.4s       \n"
                    "uzp2   v27.4s, v9.4s, v13.4s       \n"
                    "uzp1   v21.4s, v10.4s, v14.4s      \n"
                    "uzp2   v29.4s, v10.4s, v14.4s      \n"
                    "uzp1   v23.4s, v11.4s, v15.4s      \n"
                    "uzp2   v31.4s, v11.4s, v15.4s      \n"

                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%1], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                p0 += max_jj * batch * 8;
#else  // NCNN_GNU_INLINE_ASM
                // intrinsic equivalent: l/h name columns 0-3 and 4-7
                float32x4x4_t _r0 = vld4q_f32(p0);
                float32x4x4_t _r1 = vld4q_f32(p0 + 16);
                float32x4x4_t _r2 = vld4q_f32(p0 + 32);
                float32x4x4_t _r3 = vld4q_f32(p0 + 48);
                float32x4x2_t _r04l = vuzpq_f32(_r0.val[0], _r1.val[0]);
                float32x4x2_t _r15l = vuzpq_f32(_r0.val[1], _r1.val[1]);
                float32x4x2_t _r26l = vuzpq_f32(_r0.val[2], _r1.val[2]);
                float32x4x2_t _r37l = vuzpq_f32(_r0.val[3], _r1.val[3]);
                float32x4x2_t _r04h = vuzpq_f32(_r2.val[0], _r3.val[0]);
                float32x4x2_t _r15h = vuzpq_f32(_r2.val[1], _r3.val[1]);
                float32x4x2_t _r26h = vuzpq_f32(_r2.val[2], _r3.val[2]);
                float32x4x2_t _r37h = vuzpq_f32(_r2.val[3], _r3.val[3]);
                vst1q_f32(pp, _r04l.val[0]);
                vst1q_f32(pp + 4, _r04h.val[0]);
                vst1q_f32(pp + 4 * 2, _r15l.val[0]);
                vst1q_f32(pp + 4 * 3, _r15h.val[0]);
                vst1q_f32(pp + 4 * 4, _r26l.val[0]);
                vst1q_f32(pp + 4 * 5, _r26h.val[0]);
                vst1q_f32(pp + 4 * 6, _r37l.val[0]);
                vst1q_f32(pp + 4 * 7, _r37h.val[0]);
                vst1q_f32(pp + 4 * 8, _r04l.val[1]);
                vst1q_f32(pp + 4 * 9, _r04h.val[1]);
                vst1q_f32(pp + 4 * 10, _r15l.val[1]);
                vst1q_f32(pp + 4 * 11, _r15h.val[1]);
                vst1q_f32(pp + 4 * 12, _r26l.val[1]);
                vst1q_f32(pp + 4 * 13, _r26h.val[1]);
                vst1q_f32(pp + 4 * 14, _r37l.val[1]);
                vst1q_f32(pp + 4 * 15, _r37h.val[1]);
                p0 += max_jj * batch * 8;
                pp += 64;
#endif // NCNN_GNU_INLINE_ASM
            }
            p0 -= (b * max_jj + jj) * 8;
#endif // __aarch64__
            // from here on the code is shared by aarch64 and 32-bit NEON builds
            p0 += (b * max_jj + jj) * 4;
            for (; kk + 3 < max_kk; kk += 4)
            {
                // transpose 4x8
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                // ld4 already yields one register per kk; stores alternate the two
                // column halves. pp advances by 32 floats through the post-increments.
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n"
                    "sub    %0, %0, #64                 \n"
                    "st1    {v0.4s}, [%1], #16          \n"
                    "st1    {v4.4s}, [%1], #16          \n"
                    "st1    {v1.4s}, [%1], #16          \n"
                    "st1    {v5.4s}, [%1], #16          \n"
                    "st1    {v2.4s}, [%1], #16          \n"
                    "st1    {v6.4s}, [%1], #16          \n"
                    "st1    {v3.4s}, [%1], #16          \n"
                    "st1    {v7.4s}, [%1], #16          \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else  // __aarch64__
                // 32-bit NEON: vtrn/vswp transpose the two 4x4 halves (q0-q3, q8-q11) in
                // place, then q1<->q8 and q3<->q10 are exchanged so that each pair of
                // consecutive q registers stored holds one kk for all 8 columns
                asm volatile(
                    "pld        [%0, #512]          \n"
                    "vldm       %0!, {d0-d7}        \n"
                    "pld        [%0, #512]          \n"
                    "vldm       %0, {d16-d23}       \n"

                    "vtrn.32    q0, q1              \n"
                    "vtrn.32    q2, q3              \n"
                    "vtrn.32    q8, q9              \n"
                    "vtrn.32    q10, q11            \n"
                    "vswp       d1, d4              \n"
                    "vswp       d3, d6              \n"
                    "vswp       d17, d20            \n"
                    "vswp       d19, d22            \n"
                    "vswp       q1, q8              \n"
                    "vswp       q3, q10             \n"

                    "vst1.f32   {d0-d3}, [%1 :128]! \n"
                    "vst1.f32   {d16-d19}, [%1 :128]! \n"
                    "sub        %0, %0, #64         \n"
                    "vst1.f32   {d4-d7}, [%1 :128]! \n"
                    "vst1.f32   {d20-d23}, [%1 :128]! \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
#endif // __aarch64__
                p0 += max_jj * batch * 4;
#else  // NCNN_GNU_INLINE_ASM
                float32x4x4_t _r0 = vld4q_f32(p0);
                float32x4x4_t _r1 = vld4q_f32(p0 + 16);
                vst1q_f32(pp, _r0.val[0]);
                vst1q_f32(pp + 4, _r1.val[0]);
                vst1q_f32(pp + 4 * 2, _r0.val[1]);
                vst1q_f32(pp + 4 * 3, _r1.val[1]);
                vst1q_f32(pp + 4 * 4, _r0.val[2]);
                vst1q_f32(pp + 4 * 5, _r1.val[2]);
                vst1q_f32(pp + 4 * 6, _r0.val[3]);
                vst1q_f32(pp + 4 * 7, _r1.val[3]);
                p0 += max_jj * batch * 4;
                pp += 32;
#endif // NCNN_GNU_INLINE_ASM
            }
            // 4 floats per column -> 2 floats per column
            p0 -= (b * max_jj + jj) * 4;
            p0 += (b * max_jj + jj) * 2;
            for (; kk + 1 < max_kk; kk += 2)
            {
                // transpose 2x8
                float32x4x2_t _r0 = vld2q_f32(p0);
                float32x4x2_t _r1 = vld2q_f32(p0 + 8);
                vst1q_f32(pp, _r0.val[0]);
                vst1q_f32(pp + 4, _r1.val[0]);
                vst1q_f32(pp + 4 * 2, _r0.val[1]);
                vst1q_f32(pp + 4 * 3, _r1.val[1]);
                p0 += max_jj * batch * 2;
                pp += 16;
            }
            // 2 floats per column -> 1 float per column
            p0 -= (b * max_jj + jj) * 2;
            p0 += (b * max_jj + jj);
            for (; kk < max_kk; kk++)
            {
                // single kk: plain copy of 8 contiguous column values
                float32x4_t _r0 = vld1q_f32(p0);
                float32x4_t _r1 = vld1q_f32(p0 + 4);
                vst1q_f32(pp, _r0);
                vst1q_f32(pp + 4, _r1);
                p0 += max_jj * batch;
                pp += 8;
            }
        }
        // ---- 4-column tiles ----
        for (; jj + 3 < max_jj; jj += 4)
        {
            const float* p0 = B;

            int kk = 0;
#if __aarch64__
            p0 += (b * max_jj + jj) * 8;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // transpose 8x4
#if NCNN_GNU_INLINE_ASM
                // one post-incremented load (+64 bytes) is undone by the "sub";
                // pp advances by 32 floats through the two 64-byte stores
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n"

                    "uzp1   v8.4s, v0.4s, v4.4s         \n"
                    "uzp2   v12.4s, v0.4s, v4.4s        \n"
                    "uzp1   v9.4s, v1.4s, v5.4s         \n"
                    "uzp2   v13.4s, v1.4s, v5.4s        \n"

                    "sub    %0, %0, #64                 \n"

                    "uzp1   v10.4s, v2.4s, v6.4s        \n"
                    "uzp2   v14.4s, v2.4s, v6.4s        \n"
                    "uzp1   v11.4s, v3.4s, v7.4s        \n"
                    "uzp2   v15.4s, v3.4s, v7.4s        \n"

                    "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
                    "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                p0 += max_jj * batch * 8;
#else  // NCNN_GNU_INLINE_ASM
                // _r0 collects kk 0..3 and _r1 kk 4..7 of each of the 4 columns;
                // the interleaving vst4q stores then perform the transpose
                float32x4x4_t _r0;
                float32x4x4_t _r1;
                _r0.val[0] = vld1q_f32(p0);
                _r1.val[0] = vld1q_f32(p0 + 4);
                _r0.val[1] = vld1q_f32(p0 + 8);
                _r1.val[1] = vld1q_f32(p0 + 12);
                _r0.val[2] = vld1q_f32(p0 + 16);
                _r1.val[2] = vld1q_f32(p0 + 20);
                _r0.val[3] = vld1q_f32(p0 + 24);
                _r1.val[3] = vld1q_f32(p0 + 28);
                vst4q_f32(pp, _r0);
                vst4q_f32(pp + 16, _r1);
                p0 += max_jj * batch * 8;
                pp += 32;
#endif // NCNN_GNU_INLINE_ASM
            }
            p0 -= (b * max_jj + jj) * 8;
#endif // __aarch64__
            p0 += (b * max_jj + jj) * 4;
            for (; kk + 3 < max_kk; kk += 4)
            {
                // transpose 4x4
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                // the load has no write-back, so p0 is left untouched by the asm;
                // the interleaving st4 performs the transpose and advances pp by 16 floats
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
                    "st4    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3");
#else  // __aarch64__
                // 32-bit NEON: in-register 4x4 transpose via vtrn/vswp, then a
                // write-back store that advances pp by 16 floats
                asm volatile(
                    "pld        [%0, #512]          \n"
                    "vldm       %0, {d0-d7}         \n"
                    "vtrn.32    q0, q1              \n"
                    "vtrn.32    q2, q3              \n"
                    "vswp       d1, d4              \n"
                    "vswp       d3, d6              \n"
                    "vstm       %1!, {d0-d7}        \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
                p0 += max_jj * batch * 4;
#else  // NCNN_GNU_INLINE_ASM
                float32x4x4_t _r0;
                _r0.val[0] = vld1q_f32(p0);
                _r0.val[1] = vld1q_f32(p0 + 4);
                _r0.val[2] = vld1q_f32(p0 + 8);
                _r0.val[3] = vld1q_f32(p0 + 12);
                vst4q_f32(pp, _r0);
                p0 += max_jj * batch * 4;
                pp += 16;
#endif // NCNN_GNU_INLINE_ASM
            }
            // 4 floats per column -> 2 floats per column
            p0 -= (b * max_jj + jj) * 4;
            p0 += (b * max_jj + jj) * 2;
            for (; kk + 1 < max_kk; kk += 2)
            {
                // transpose 2x4
                float32x4x2_t _r0 = vld2q_f32(p0);
                vst1q_f32(pp, _r0.val[0]);
                vst1q_f32(pp + 4, _r0.val[1]);
                p0 += max_jj * batch * 2;
                pp += 8;
            }
            // 2 floats per column -> 1 float per column
            p0 -= (b * max_jj + jj) * 2;
            p0 += (b * max_jj + jj);
            for (; kk < max_kk; kk++)
            {
                // single kk: plain copy of 4 contiguous column values
                float32x4_t _r0 = vld1q_f32(p0);
                vst1q_f32(pp, _r0);
                p0 += max_jj * batch;
                pp += 4;
            }
        }
#endif // __ARM_NEON
        // ---- 2-column tiles (vector paths for kk groups of 8/4, scalar otherwise) ----
        for (; jj + 1 < max_jj; jj += 2)
        {
            const float* p0 = B;

            int kk = 0;
#if __ARM_NEON
#if __aarch64__
            p0 += (b * max_jj + jj) * 8;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // transpose 8x2
#if NCNN_GNU_INLINE_ASM
                // v0/v1 hold kk 0..7 of the first column, v2/v3 of the second;
                // zip1/zip2 interleave them. p0 is not modified by the asm,
                // pp advances by 16 floats.
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"

                    "zip1   v4.4s, v0.4s, v2.4s         \n"
                    "zip2   v5.4s, v0.4s, v2.4s         \n"
                    "zip1   v6.4s, v1.4s, v3.4s         \n"
                    "zip2   v7.4s, v1.4s, v3.4s         \n"

                    "st1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
                p0 += max_jj * batch * 8;
#else  // NCNN_GNU_INLINE_ASM
                float32x4x2_t _r0;
                float32x4x2_t _r1;
                _r0.val[0] = vld1q_f32(p0);
                _r1.val[0] = vld1q_f32(p0 + 4);
                _r0.val[1] = vld1q_f32(p0 + 8);
                _r1.val[1] = vld1q_f32(p0 + 12);
                vst2q_f32(pp, _r0);
                vst2q_f32(pp + 8, _r1);
                p0 += max_jj * batch * 8;
                pp += 16;
#endif // NCNN_GNU_INLINE_ASM
            }
            p0 -= (b * max_jj + jj) * 8;
#endif // __aarch64__
            p0 += (b * max_jj + jj) * 4;
            for (; kk + 3 < max_kk; kk += 4)
            {
                // transpose 4x2
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                // interleaving st2 performs the transpose; pp advances by 8 floats
                asm volatile(
                    "prfm   pldl1keep, [%0, #256]       \n"
                    "ld1    {v0.4s, v1.4s}, [%0]        \n"
                    "st2    {v0.4s, v1.4s}, [%1], #32   \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1");
#else  // __aarch64__
                // note: both accesses carry a :128 alignment qualifier
                asm volatile(
                    "pld        [%0, #256]          \n"
                    "vld1.f32   {d0-d3}, [%0 :128]  \n"
                    "vst2.f32   {d0-d3}, [%1 :128]! \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "q0", "q1");
#endif // __aarch64__
                p0 += max_jj * batch * 4;
#else  // NCNN_GNU_INLINE_ASM
                float32x4x2_t _r0;
                _r0.val[0] = vld1q_f32(p0);
                _r0.val[1] = vld1q_f32(p0 + 4);
                vst2q_f32(pp, _r0);
                p0 += max_jj * batch * 4;
                pp += 8;
#endif // NCNN_GNU_INLINE_ASM
            }
            p0 -= (b * max_jj + jj) * 4;
#endif // __ARM_NEON
            p0 += (b * max_jj + jj) * 2;
            for (; kk + 1 < max_kk; kk += 2)
            {
                // scalar 2x2 transpose: p0[0..1] belong to the first column,
                // p0[2..3] to the second
                pp[0] = p0[0];
                pp[1] = p0[2];
                pp[2] = p0[1];
                pp[3] = p0[3];
                p0 += max_jj * batch * 2;
                pp += 4;
            }
            // 2 floats per column -> 1 float per column
            p0 -= (b * max_jj + jj) * 2;
            p0 += (b * max_jj + jj);
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p0[1];
                p0 += max_jj * batch;
                pp += 2;
            }
        }
        // ---- single leftover columns: no transpose needed, each kk group is copied as is ----
        for (; jj < max_jj; jj++)
        {
            const float* p0 = B;

            int kk = 0;
#if __ARM_NEON
#if __aarch64__
            p0 += (b * max_jj + jj) * 8;
            for (; kk + 7 < max_kk; kk += 8)
            {
#if NCNN_GNU_INLINE_ASM
                // copy 8 floats; only pp is advanced by the asm
                asm volatile(
                    "prfm   pldl1keep, [%0, #256]       \n"
                    "ld1    {v0.4s, v1.4s}, [%0]        \n"
                    "st1    {v0.4s, v1.4s}, [%1], #32   \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1");
                p0 += max_jj * batch * 8;
#else  // NCNN_GNU_INLINE_ASM
                float32x4_t _r0 = vld1q_f32(p0);
                float32x4_t _r1 = vld1q_f32(p0 + 4);
                vst1q_f32(pp, _r0);
                vst1q_f32(pp + 4, _r1);
                p0 += max_jj * batch * 8;
                pp += 8;
#endif // NCNN_GNU_INLINE_ASM
            }
            p0 -= (b * max_jj + jj) * 8;
#endif // __aarch64__
            p0 += (b * max_jj + jj) * 4;
            for (; kk + 3 < max_kk; kk += 4)
            {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                // copy 4 floats; only pp is advanced by the asm
                asm volatile(
                    "prfm   pldl1keep, [%0, #128]       \n"
                    "ld1    {v0.4s}, [%0]               \n"
                    "st1    {v0.4s}, [%1], #16          \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0");
#else  // __aarch64__
                asm volatile(
                    "pld        [%0, #128]          \n"
                    "vld1.f32   {d0-d1}, [%0]       \n"
                    "vst1.f32   {d0-d1}, [%1]!      \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "q0");
#endif // __aarch64__
                p0 += max_jj * batch * 4;
#else  // NCNN_GNU_INLINE_ASM
                float32x4_t _r0 = vld1q_f32(p0);
                vst1q_f32(pp, _r0);
                p0 += max_jj * batch * 4;
                pp += 4;
#endif // NCNN_GNU_INLINE_ASM
            }
            p0 -= (b * max_jj + jj) * 4;
#endif // __ARM_NEON
            p0 += (b * max_jj + jj) * 2;
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = p0[0];
                pp[1] = p0[1];
                p0 += max_jj * batch * 2;
                pp += 2;
            }
            // 2 floats per column -> 1 float per column
            p0 -= (b * max_jj + jj) * 2;
            p0 += (b * max_jj + jj);
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                p0 += max_jj * batch;
                pp += 1;
            }
        }
    }
}

static void conv3x3s1_winograd_gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, Mat& top_blob, int batch, int max_ii, int max_jj, int k, int max_kk, int use_a53_a55_optimized_kernel)
{
    // NCNN_LOGE("conv3x3s1_winograd_gemm_transB_packed_tile %d %d %d", max_ii, max_jj, max_kk);
    float* outptr = top_blob;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    for (; ii + 7 < max_ii; ii += 8)
    {
        for (int b = 0; b < batch; b++)
        {
            const float* pAT = AT_tile.row(b) + max_kk * ii;
            const float* pB = BT_tile.row(b);

            int jj = 0;
            for (; jj + 11 < max_jj; jj += 12)
            {
                const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
                if (use_a53_a55_optimized_kernel && cpu_support_arm_asimdhp())
                {
                    // a55
                    asm volatile(
                        "cbz    %w7, 0f                     \n"

                        "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                        "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                        "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                        "subs   %0, %0, #320                \n"
                        "b      1f                          \n"

                        "0:                                 \n"
                        "eor    v8.16b, v8.16b, v8.16b      \n"
                        "eor    v9.16b, v9.16b, v9.16b      \n"
                        "eor    v10.16b, v10.16b, v10.16b   \n"
                        "eor    v11.16b, v11.16b, v11.16b   \n"
                        "eor    v12.16b, v12.16b, v12.16b   \n"
                        "eor    v13.16b, v13.16b, v13.16b   \n"
                        "eor    v14.16b, v14.16b, v14.16b   \n"
                        "eor    v15.16b, v15.16b, v15.16b   \n"
                        "eor    v16.16b, v16.16b, v16.16b   \n"
                        "eor    v17.16b, v17.16b, v17.16b   \n"
                        "eor    v18.16b, v18.16b, v18.16b   \n"
                        "eor    v19.16b, v19.16b, v19.16b   \n"
                        "eor    v20.16b, v20.16b, v20.16b   \n"
                        "eor    v21.16b, v21.16b, v21.16b   \n"
                        "eor    v22.16b, v22.16b, v22.16b   \n"
                        "eor    v23.16b, v23.16b, v23.16b   \n"
                        "eor    v24.16b, v24.16b, v24.16b   \n"
                        "eor    v25.16b, v25.16b, v25.16b   \n"
                        "eor    v26.16b, v26.16b, v26.16b   \n"
                        "eor    v27.16b, v27.16b, v27.16b   \n"
                        "eor    v28.16b, v28.16b, v28.16b   \n"
                        "eor    v29.16b, v29.16b, v29.16b   \n"
                        "eor    v30.16b, v30.16b, v30.16b   \n"
                        "eor    v31.16b, v31.16b, v31.16b   \n"

                        "1:                                 \n"
                        "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                        "cmp    w4, #0                      \n"
                        "beq    3f                          \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v4.4s}, [%1], #16          \n"
                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.4s}, [%2], #16          \n"

                        "ldr    d5, [%1], #8                \n"
                        "ldr    x25, [%1], #8               \n"

                        ".align 4                           \n"
                        "2:                                 \n"
                        "ldr    d1, [%2], #8                \n"
                        "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                        "ldr    x21, [%2], #8               \n"
                        "fmla   v10.4s, v4.4s, v0.s[1]      \n"
                        "ins    v5.d[1], x25                \n"
                        "fmla   v12.4s, v4.4s, v0.s[2]      \n"
                        "ldr    d2, [%2], #8                \n"
                        "fmla   v14.4s, v4.4s, v0.s[3]      \n"
                        "ldr    x22, [%2], #8               \n"
                        "fmla   v9.4s, v5.4s, v0.s[0]       \n"
                        "ldr    d6, [%1], #8                \n"
                        "fmla   v11.4s, v5.4s, v0.s[1]      \n"
                        "ins    v1.d[1], x21                \n"
                        "fmla   v13.4s, v5.4s, v0.s[2]      \n"
                        "ldr    x26, [%1], #8               \n"
                        "fmla   v15.4s, v5.4s, v0.s[3]      \n"
                        "ldr    d3, [%2], #8                \n"
                        "fmla   v16.4s, v4.4s, v1.s[0]      \n"
                        "ldr    x23, [%2], #8               \n"
                        "fmla   v18.4s, v4.4s, v1.s[1]      \n"
                        "ldr    d7, [%1], #8                \n"
                        "fmla   v20.4s, v4.4s, v1.s[2]      \n"
                        "ldr    x27, [%1], #8               \n"
                        "fmla   v22.4s, v4.4s, v1.s[3]      \n"
                        "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                        "fmla   v17.4s, v5.4s, v1.s[0]      \n"
                        "ldr    d0, [%2], #8                \n"
                        "fmla   v19.4s, v5.4s, v1.s[1]      \n"
                        "ins    v2.d[1], x22                \n"
                        "fmla   v21.4s, v5.4s, v1.s[2]      \n"
                        "ldr    x20, [%2], #8               \n"
                        "fmla   v23.4s, v5.4s, v1.s[3]      \n"
                        "fmla   v24.4s, v4.4s, v2.s[0]      \n"
                        "ldr    d1, [%2], #8                \n"
                        "fmla   v26.4s, v4.4s, v2.s[1]      \n"
                        "ins    v6.d[1], x26                \n"
                        "fmla   v28.4s, v4.4s, v2.s[2]      \n"
                        "ldr    x21, [%2], #8               \n"
                        "fmla   v30.4s, v4.4s, v2.s[3]      \n"
                        "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                        "fmla   v25.4s, v5.4s, v2.s[0]      \n"
                        "ldr    d4, [%1], #8                \n"
                        "fmla   v27.4s, v5.4s, v2.s[1]      \n"
                        "ins    v3.d[1], x23                \n"
                        "fmla   v29.4s, v5.4s, v2.s[2]      \n"
                        "ldr    x24, [%1], #8               \n"
                        "fmla   v31.4s, v5.4s, v2.s[3]      \n"
                        "fmla   v8.4s, v6.4s, v3.s[0]       \n"
                        "ldr    d2, [%2], #8                \n"
                        "fmla   v10.4s, v6.4s, v3.s[1]      \n"
                        "ins    v7.d[1], x27                \n"
                        "fmla   v12.4s, v6.4s, v3.s[2]      \n"
                        "ldr    x22, [%2], #8               \n"
                        "fmla   v14.4s, v6.4s, v3.s[3]      \n"
                        "fmla   v9.4s, v7.4s, v3.s[0]       \n"
                        "ldr    d5, [%1], #8                \n"
                        "fmla   v11.4s, v7.4s, v3.s[1]      \n"
                        "ins    v0.d[1], x20                \n"
                        "fmla   v13.4s, v7.4s, v3.s[2]      \n"
                        "ldr    x25, [%1], #8               \n"
                        "fmla   v15.4s, v7.4s, v3.s[3]      \n"
                        "fmla   v16.4s, v6.4s, v0.s[0]      \n"
                        "ldr    d3, [%2], #8                \n"
                        "fmla   v18.4s, v6.4s, v0.s[1]      \n"
                        "ldr    x23, [%2], #8               \n"
                        "fmla   v20.4s, v6.4s, v0.s[2]      \n"
                        "fmla   v22.4s, v6.4s, v0.s[3]      \n"
                        "fmla   v17.4s, v7.4s, v0.s[0]      \n"
                        "fmla   v19.4s, v7.4s, v0.s[1]      \n"
                        "ins    v1.d[1], x21                \n"
                        "fmla   v21.4s, v7.4s, v0.s[2]      \n"
                        "fmla   v23.4s, v7.4s, v0.s[3]      \n"
                        "prfm   pldl1keep, [%2, #256]       \n" // NOTE PRELOAD
                        "fmla   v24.4s, v6.4s, v1.s[0]      \n"
                        "fmla   v26.4s, v6.4s, v1.s[1]      \n"
                        "ins    v4.d[1], x24                \n"
                        "fmla   v28.4s, v6.4s, v1.s[2]      \n"
                        "ldr    d0, [%2], #8                \n"
                        "fmla   v30.4s, v6.4s, v1.s[3]      \n"
                        "ldr    x20, [%2], #8               \n"
                        "fmla   v25.4s, v7.4s, v1.s[0]      \n"
                        "ldr    d6, [%1], #8                \n"
                        "fmla   v27.4s, v7.4s, v1.s[1]      \n"
                        "ins    v2.d[1], x22                \n"
                        "fmla   v29.4s, v7.4s, v1.s[2]      \n"
                        "fmla   v31.4s, v7.4s, v1.s[3]      \n"
                        "ldr    x26, [%1], #8               \n"
                        "fmla   v8.4s, v4.4s, v2.s[0]       \n"
                        "ldr    d1, [%2], #8                \n"
                        "fmla   v10.4s, v4.4s, v2.s[1]      \n"
                        "ins    v5.d[1], x25                \n"
                        "fmla   v12.4s, v4.4s, v2.s[2]      \n"
                        "ldr    x21, [%2], #8               \n"
                        "fmla   v14.4s, v4.4s, v2.s[3]      \n"
                        "ldr    d7, [%1], #8                \n"
                        "fmla   v9.4s, v5.4s, v2.s[0]       \n"
                        "ldr    x27, [%1], #8               \n"
                        "fmla   v11.4s, v5.4s, v2.s[1]      \n"
                        "ins    v3.d[1], x23                \n"
                        "fmla   v13.4s, v5.4s, v2.s[2]      \n"
                        "fmla   v15.4s, v5.4s, v2.s[3]      \n"
                        "fmla   v16.4s, v4.4s, v3.s[0]      \n"
                        "ldr    d2, [%2], #8                \n"
                        "fmla   v18.4s, v4.4s, v3.s[1]      \n"
                        "ldr    x22, [%2], #8               \n"
                        "fmla   v20.4s, v4.4s, v3.s[2]      \n"
                        "fmla   v22.4s, v4.4s, v3.s[3]      \n"
                        "fmla   v17.4s, v5.4s, v3.s[0]      \n"
                        "fmla   v19.4s, v5.4s, v3.s[1]      \n"
                        "ins    v0.d[1], x20                \n"
                        "fmla   v21.4s, v5.4s, v3.s[2]      \n"
                        "fmla   v23.4s, v5.4s, v3.s[3]      \n"
                        "fmla   v24.4s, v4.4s, v0.s[0]      \n"
                        "ldr    d3, [%2], #8                \n"
                        "fmla   v26.4s, v4.4s, v0.s[1]      \n"
                        "ldr    x23, [%2], #8               \n"
                        "fmla   v28.4s, v4.4s, v0.s[2]      \n"
                        "ins    v6.d[1], x26                \n"
                        "fmla   v30.4s, v4.4s, v0.s[3]      \n"
                        "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                        "fmla   v25.4s, v5.4s, v0.s[0]      \n"
                        "ldr    d4, [%1], #8                \n"
                        "fmla   v27.4s, v5.4s, v0.s[1]      \n"
                        "ins    v1.d[1], x21                \n"
                        "fmla   v29.4s, v5.4s, v0.s[2]      \n"
                        "ldr    x24, [%1], #8               \n"
                        "fmla   v31.4s, v5.4s, v0.s[3]      \n"
                        "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                        "fmla   v8.4s, v6.4s, v1.s[0]       \n"
                        "ldr    d0, [%2], #8                \n"
                        "fmla   v10.4s, v6.4s, v1.s[1]      \n"
                        "ins    v7.d[1], x27                \n"
                        "fmla   v12.4s, v6.4s, v1.s[2]      \n"
                        "ldr    x20, [%2], #8               \n"
                        "fmla   v14.4s, v6.4s, v1.s[3]      \n"
                        "ldr    d5, [%1], #8                \n"
                        "fmla   v9.4s, v7.4s, v1.s[0]       \n"
                        "ldr    x25, [%1], #8               \n"
                        "fmla   v11.4s, v7.4s, v1.s[1]      \n"
                        "ins    v2.d[1], x22                \n"
                        "fmla   v13.4s, v7.4s, v1.s[2]      \n"
                        "fmla   v15.4s, v7.4s, v1.s[3]      \n"
                        "fmla   v16.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v18.4s, v6.4s, v2.s[1]      \n"
                        "fmla   v20.4s, v6.4s, v2.s[2]      \n"
                        "fmla   v22.4s, v6.4s, v2.s[3]      \n"
                        "fmla   v17.4s, v7.4s, v2.s[0]      \n"
                        "fmla   v19.4s, v7.4s, v2.s[1]      \n"
                        "ins    v3.d[1], x23                \n"
                        "fmla   v21.4s, v7.4s, v2.s[2]      \n"
                        "fmla   v23.4s, v7.4s, v2.s[3]      \n"
                        "fmla   v24.4s, v6.4s, v3.s[0]      \n"
                        "fmla   v26.4s, v6.4s, v3.s[1]      \n"
                        "fmla   v28.4s, v6.4s, v3.s[2]      \n"
                        "ins    v4.d[1], x24                \n"
                        "fmla   v30.4s, v6.4s, v3.s[3]      \n"
                        "fmla   v25.4s, v7.4s, v3.s[0]      \n"
                        "subs   w4, w4, #1                  \n"
                        "fmla   v27.4s, v7.4s, v3.s[1]      \n"
                        "fmla   v29.4s, v7.4s, v3.s[2]      \n"
                        "ins    v0.d[1], x20                \n"
                        "fmla   v31.4s, v7.4s, v3.s[3]      \n"
                        "bne    2b                          \n"

                        "sub    %1, %1, #32                 \n"
                        "sub    %2, %2, #16                 \n"

                        "3:                                 \n"
                        "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                        "cmp    w4, #0                      \n"
                        "beq    5f                          \n"

                        "4:                                 \n"
                        "ld1    {v0.4s, v1.4s, v2.4s}, [%2], #48 \n"
                        "ld1    {v4.4s, v5.4s}, [%1], #32   \n"

                        "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                        "fmla   v10.4s, v4.4s, v0.s[1]      \n"
                        "fmla   v12.4s, v4.4s, v0.s[2]      \n"
                        "fmla   v14.4s, v4.4s, v0.s[3]      \n"
                        "fmla   v16.4s, v4.4s, v1.s[0]      \n"
                        "fmla   v18.4s, v4.4s, v1.s[1]      \n"
                        "fmla   v20.4s, v4.4s, v1.s[2]      \n"
                        "fmla   v22.4s, v4.4s, v1.s[3]      \n"
                        "fmla   v24.4s, v4.4s, v2.s[0]      \n"
                        "fmla   v26.4s, v4.4s, v2.s[1]      \n"
                        "fmla   v28.4s, v4.4s, v2.s[2]      \n"
                        "fmla   v30.4s, v4.4s, v2.s[3]      \n"

                        "subs   w4, w4, #1                  \n"

                        "fmla   v9.4s, v5.4s, v0.s[0]       \n"
                        "fmla   v11.4s, v5.4s, v0.s[1]      \n"
                        "fmla   v13.4s, v5.4s, v0.s[2]      \n"
                        "fmla   v15.4s, v5.4s, v0.s[3]      \n"
                        "fmla   v17.4s, v5.4s, v1.s[0]      \n"
                        "fmla   v19.4s, v5.4s, v1.s[1]      \n"
                        "fmla   v21.4s, v5.4s, v1.s[2]      \n"
                        "fmla   v23.4s, v5.4s, v1.s[3]      \n"
                        "fmla   v25.4s, v5.4s, v2.s[0]      \n"
                        "fmla   v27.4s, v5.4s, v2.s[1]      \n"
                        "fmla   v29.4s, v5.4s, v2.s[2]      \n"
                        "fmla   v31.4s, v5.4s, v2.s[3]      \n"

                        "bne    4b                          \n"

                        "5:                                 \n"
                        "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                        "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                        "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                        "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                        "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                        "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                        : "=r"(outptr), // %0
                        "=r"(pA),     // %1
                        "=r"(pB)      // %2
                        : "0"(outptr),
                        "1"(pA),
                        "2"(pB),
                        "r"(max_kk), // %6
                        "r"(k)       // %7
                        : "cc", "memory", "x4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
                else if (use_a53_a55_optimized_kernel && !cpu_support_arm_asimdhp())
                {
                    // a53
                    asm volatile(
                        "cbz    %w7, 0f                     \n"

                        "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                        "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                        "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                        "subs   %0, %0, #320                \n"
                        "b      1f                          \n"

                        "0:                                 \n"
                        "eor    v8.16b, v8.16b, v8.16b      \n"
                        "eor    v9.16b, v9.16b, v9.16b      \n"
                        "eor    v10.16b, v10.16b, v10.16b   \n"
                        "eor    v11.16b, v11.16b, v11.16b   \n"
                        "eor    v12.16b, v12.16b, v12.16b   \n"
                        "eor    v13.16b, v13.16b, v13.16b   \n"
                        "eor    v14.16b, v14.16b, v14.16b   \n"
                        "eor    v15.16b, v15.16b, v15.16b   \n"
                        "eor    v16.16b, v16.16b, v16.16b   \n"
                        "eor    v17.16b, v17.16b, v17.16b   \n"
                        "eor    v18.16b, v18.16b, v18.16b   \n"
                        "eor    v19.16b, v19.16b, v19.16b   \n"
                        "eor    v20.16b, v20.16b, v20.16b   \n"
                        "eor    v21.16b, v21.16b, v21.16b   \n"
                        "eor    v22.16b, v22.16b, v22.16b   \n"
                        "eor    v23.16b, v23.16b, v23.16b   \n"
                        "eor    v24.16b, v24.16b, v24.16b   \n"
                        "eor    v25.16b, v25.16b, v25.16b   \n"
                        "eor    v26.16b, v26.16b, v26.16b   \n"
                        "eor    v27.16b, v27.16b, v27.16b   \n"
                        "eor    v28.16b, v28.16b, v28.16b   \n"
                        "eor    v29.16b, v29.16b, v29.16b   \n"
                        "eor    v30.16b, v30.16b, v30.16b   \n"
                        "eor    v31.16b, v31.16b, v31.16b   \n"

                        "1:                                 \n"
                        "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                        "cmp    w4, #0                      \n"
                        "beq    3f                          \n"

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v4.4s}, [%1], #16          \n"

                        "prfm   pldl1keep, [%2, #384]       \n"
                        "ld1    {v0.4s}, [%2], #16          \n"

                        "ldr    d1, [%2]                    \n"
                        "ldr    x21, [%2, #8]               \n"
                        "ldr    d2, [%2, #16]               \n"
                        "ldr    x22, [%2, #24]              \n"
                        "add    %2, %2, #32                 \n"

                        ".align 4                           \n"
                        "2:                                 \n"

                        "ldr    d5, [%1]                    \n"
                        "ins    v1.d[1], x21                \n"
                        "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                        "ldr    x25, [%1, #8]               \n"
                        "fmla   v10.4s, v4.4s, v0.s[1]      \n"
                        "add    %1, %1, #16                 \n"
                        "fmla   v12.4s, v4.4s, v0.s[2]      \n"

                        "ldr    d6, [%1]                    \n"
                        "ins    v2.d[1], x22                \n"
                        "fmla   v14.4s, v4.4s, v0.s[3]      \n"
                        "ldr    x26, [%1, #8]               \n"
                        "fmla   v16.4s, v4.4s, v1.s[0]      \n"
                        "add    %1, %1, #16                 \n"
                        "fmla   v18.4s, v4.4s, v1.s[1]      \n"

                        "nop                                \n"
                        "prfm   pldl1keep, [%1, #256]       \n" // NOTE PRELOAD
                        "fmla   v20.4s, v4.4s, v1.s[2]      \n"
                        "nop                                \n"
                        "fmla   v22.4s, v4.4s, v1.s[3]      \n"
                        "nop                                \n"
                        "fmla   v24.4s, v4.4s, v2.s[0]      \n"

                        "ldr    d3, [%2]                    \n"
                        "ins    v5.d[1], x25                \n"
                        "fmla   v26.4s, v4.4s, v2.s[1]      \n"
                        "ldr    x23, [%2, #8]               \n"
                        "fmla   v28.4s, v4.4s, v2.s[2]      \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v30.4s, v4.4s, v2.s[3]      \n"

                        "nop                                \n"
                        "prfm   pldl1keep, [%2, #384]       \n" // NOTE PRELOAD
                        "fmla   v9.4s, v5.4s, v0.s[0]       \n"
                        "nop                                \n"
                        "fmla   v11.4s, v5.4s, v0.s[1]      \n"
                        "nop                                \n"
                        "fmla   v13.4s, v5.4s, v0.s[2]      \n"

                        "nop                                \n"
                        "nop                                \n"
                        "fmla   v15.4s, v5.4s, v0.s[3]      \n"
                        "nop                                \n"
                        "fmla   v17.4s, v5.4s, v1.s[0]      \n"
                        "nop                                \n"
                        "fmla   v19.4s, v5.4s, v1.s[1]      \n"

                        "ldr    d0, [%2]                    \n"
                        "ins    v6.d[1], x26                \n"
                        "fmla   v21.4s, v5.4s, v1.s[2]      \n"
                        "ldr    x20, [%2, #8]               \n"
                        "fmla   v23.4s, v5.4s, v1.s[3]      \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v25.4s, v5.4s, v2.s[0]      \n"

                        "ldr    d1, [%2]                    \n"
                        "ins    v3.d[1], x23                \n"
                        "fmla   v27.4s, v5.4s, v2.s[1]      \n"
                        "ldr    x21, [%2, #8]               \n"
                        "fmla   v29.4s, v5.4s, v2.s[2]      \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v31.4s, v5.4s, v2.s[3]      \n"

                        "ldr    d7, [%1]                    \n"
                        "ins    v0.d[1], x20                \n"
                        "fmla   v8.4s, v6.4s, v3.s[0]       \n"
                        "ldr    x27, [%1, #8]               \n"
                        "fmla   v10.4s, v6.4s, v3.s[1]      \n"
                        "add    %1, %1, #16                 \n"
                        "fmla   v12.4s, v6.4s, v3.s[2]      \n"

                        "ldr    d4, [%1]                    \n"
                        "ins    v1.d[1], x21                \n"
                        "fmla   v14.4s, v6.4s, v3.s[3]      \n"
                        "ldr    x24, [%1, #8]               \n"
                        "fmla   v16.4s, v6.4s, v0.s[0]      \n"
                        "add    %1, %1, #16                 \n"
                        "fmla   v18.4s, v6.4s, v0.s[1]      \n"

                        "nop                                \n"
                        "prfm   pldl1keep, [%1, #256]       \n" // NOTE PRELOAD
                        "fmla   v20.4s, v6.4s, v0.s[2]      \n"
                        "nop                                \n"
                        "fmla   v22.4s, v6.4s, v0.s[3]      \n"
                        "nop                                \n"
                        "fmla   v24.4s, v6.4s, v1.s[0]      \n"

                        "ldr    d2, [%2]                    \n"
                        "ins    v7.d[1], x27                \n"
                        "fmla   v26.4s, v6.4s, v1.s[1]      \n"
                        "ldr    x22, [%2, #8]               \n"
                        "fmla   v28.4s, v6.4s, v1.s[2]      \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v30.4s, v6.4s, v1.s[3]      \n"

                        "nop                                \n"
                        "prfm   pldl1keep, [%2, #384]       \n" // NOTE PRELOAD
                        "fmla   v9.4s, v7.4s, v3.s[0]       \n"
                        "nop                                \n"
                        "fmla   v11.4s, v7.4s, v3.s[1]      \n"
                        "nop                                \n"
                        "fmla   v13.4s, v7.4s, v3.s[2]      \n"

                        "nop                                \n"
                        "nop                                \n"
                        "fmla   v15.4s, v7.4s, v3.s[3]      \n"
                        "nop                                \n"
                        "fmla   v17.4s, v7.4s, v0.s[0]      \n"
                        "nop                                \n"
                        "fmla   v19.4s, v7.4s, v0.s[1]      \n"

                        "ldr    d3, [%2]                    \n"
                        "ins    v4.d[1], x24                \n"
                        "fmla   v21.4s, v7.4s, v0.s[2]      \n"
                        "ldr    x23, [%2, #8]               \n"
                        "fmla   v23.4s, v7.4s, v0.s[3]      \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v25.4s, v7.4s, v1.s[0]      \n"

                        "ldr    d0, [%2]                    \n"
                        "ins    v2.d[1], x22                \n"
                        "fmla   v27.4s, v7.4s, v1.s[1]      \n"
                        "ldr    x20, [%2, #8]               \n"
                        "fmla   v29.4s, v7.4s, v1.s[2]      \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v31.4s, v7.4s, v1.s[3]      \n"

                        "ldr    d5, [%1]                    \n"
                        "ins    v3.d[1], x23                \n"
                        "fmla   v8.4s, v4.4s, v2.s[0]       \n"
                        "ldr    x25, [%1, #8]               \n"
                        "fmla   v10.4s, v4.4s, v2.s[1]      \n"
                        "add    %1, %1, #16                 \n"
                        "fmla   v12.4s, v4.4s, v2.s[2]      \n"

                        "ldr    d6, [%1]                    \n"
                        "ins    v0.d[1], x20                \n"
                        "fmla   v14.4s, v4.4s, v2.s[3]      \n"
                        "ldr    x26, [%1, #8]               \n"
                        "fmla   v16.4s, v4.4s, v3.s[0]      \n"
                        "add    %1, %1, #16                 \n"
                        "fmla   v18.4s, v4.4s, v3.s[1]      \n"

                        "nop                                \n"
                        "prfm   pldl1keep, [%1, #256]       \n" // NOTE PRELOAD
                        "fmla   v20.4s, v4.4s, v3.s[2]      \n"
                        "nop                                \n"
                        "fmla   v22.4s, v4.4s, v3.s[3]      \n"
                        "nop                                \n"
                        "fmla   v24.4s, v4.4s, v0.s[0]      \n"

                        "ldr    d1, [%2]                    \n"
                        "ins    v5.d[1], x25                \n"
                        "fmla   v26.4s, v4.4s, v0.s[1]      \n"
                        "ldr    x21, [%2, #8]               \n"
                        "fmla   v28.4s, v4.4s, v0.s[2]      \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v30.4s, v4.4s, v0.s[3]      \n"

                        "nop                                \n"
                        "prfm   pldl1keep, [%2, #384]       \n" // NOTE PRELOAD
                        "fmla   v9.4s, v5.4s, v2.s[0]       \n"
                        "nop                                \n"
                        "fmla   v11.4s, v5.4s, v2.s[1]      \n"
                        "nop                                \n"
                        "fmla   v13.4s, v5.4s, v2.s[2]      \n"

                        "nop                                \n"
                        "nop                                \n"
                        "fmla   v15.4s, v5.4s, v2.s[3]      \n"
                        "nop                                \n"
                        "fmla   v17.4s, v5.4s, v3.s[0]      \n"
                        "nop                                \n"
                        "fmla   v19.4s, v5.4s, v3.s[1]      \n"

                        "ldr    d2, [%2]                    \n"
                        "ins    v6.d[1], x26                \n"
                        "fmla   v21.4s, v5.4s, v3.s[2]      \n"
                        "ldr    x22, [%2, #8]               \n"
                        "fmla   v23.4s, v5.4s, v3.s[3]      \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v25.4s, v5.4s, v0.s[0]      \n"

                        "ldr    d3, [%2]                    \n"
                        "ins    v1.d[1], x21                \n"
                        "fmla   v27.4s, v5.4s, v0.s[1]      \n"
                        "ldr    x23, [%2, #8]               \n"
                        "fmla   v29.4s, v5.4s, v0.s[2]      \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v31.4s, v5.4s, v0.s[3]      \n"

                        "ldr    d7, [%1]                    \n"
                        "ins    v2.d[1], x22                \n"
                        "fmla   v8.4s, v6.4s, v1.s[0]       \n"
                        "ldr    x27, [%1, #8]               \n"
                        "fmla   v10.4s, v6.4s, v1.s[1]      \n"
                        "add    %1, %1, #16                 \n"
                        "fmla   v12.4s, v6.4s, v1.s[2]      \n"

                        "ldr    d4, [%1]                    \n"
                        "ins    v3.d[1], x23                \n"
                        "fmla   v14.4s, v6.4s, v1.s[3]      \n"
                        "ldr    x24, [%1, #8]               \n"
                        "fmla   v16.4s, v6.4s, v2.s[0]      \n"
                        "add    %1, %1, #16                 \n"
                        "fmla   v18.4s, v6.4s, v2.s[1]      \n"

                        "nop                                \n"
                        "prfm   pldl1keep, [%1, #256]       \n" // NOTE PRELOAD
                        "fmla   v20.4s, v6.4s, v2.s[2]      \n"
                        "nop                                \n"
                        "fmla   v22.4s, v6.4s, v2.s[3]      \n"
                        "nop                                \n"
                        "fmla   v24.4s, v6.4s, v3.s[0]      \n"

                        "ldr    d0, [%2]                    \n"
                        "ins    v7.d[1], x27                \n"
                        "fmla   v26.4s, v6.4s, v3.s[1]      \n"
                        "ldr    x20, [%2, #8]               \n"
                        "fmla   v28.4s, v6.4s, v3.s[2]      \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v30.4s, v6.4s, v3.s[3]      \n"

                        "nop                                \n"
                        "prfm   pldl1keep, [%2, #384]       \n" // NOTE PRELOAD
                        "fmla   v9.4s, v7.4s, v1.s[0]       \n"
                        "nop                                \n"
                        "fmla   v11.4s, v7.4s, v1.s[1]      \n"
                        "nop                                \n"
                        "fmla   v13.4s, v7.4s, v1.s[2]      \n"

                        "nop                                \n"
                        "nop                                \n"
                        "fmla   v15.4s, v7.4s, v1.s[3]      \n"
                        "subs   w4, w4, #1                  \n"
                        "fmla   v17.4s, v7.4s, v2.s[0]      \n"
                        "nop                                \n"
                        "fmla   v19.4s, v7.4s, v2.s[1]      \n"

                        "ldr    d1, [%2]                    \n"
                        "ins    v4.d[1], x24                \n"
                        "fmla   v21.4s, v7.4s, v2.s[2]      \n"
                        "ldr    x21, [%2, #8]               \n"
                        "fmla   v23.4s, v7.4s, v2.s[3]      \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v25.4s, v7.4s, v3.s[0]      \n"

                        "ldr    d2, [%2]                    \n"
                        "ins    v0.d[1], x20                \n"
                        "fmla   v27.4s, v7.4s, v3.s[1]      \n"
                        "ldr    x22, [%2, #8]               \n"
                        "fmla   v29.4s, v7.4s, v3.s[2]      \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v31.4s, v7.4s, v3.s[3]      \n"

                        "bne    2b                          \n"

                        "sub    %1, %1, #16                 \n"
                        "sub    %2, %2, #48                 \n"

                        "3:                                 \n"
                        "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                        "cmp    w4, #0                      \n"
                        "beq    5f                          \n"

                        "4:                                 \n"
                        "ld1    {v0.4s, v1.4s, v2.4s}, [%2], #48 \n"
                        "ld1    {v4.4s, v5.4s}, [%1], #32   \n"

                        "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                        "fmla   v10.4s, v4.4s, v0.s[1]      \n"
                        "fmla   v12.4s, v4.4s, v0.s[2]      \n"
                        "fmla   v14.4s, v4.4s, v0.s[3]      \n"
                        "fmla   v16.4s, v4.4s, v1.s[0]      \n"
                        "fmla   v18.4s, v4.4s, v1.s[1]      \n"
                        "fmla   v20.4s, v4.4s, v1.s[2]      \n"
                        "fmla   v22.4s, v4.4s, v1.s[3]      \n"
                        "fmla   v24.4s, v4.4s, v2.s[0]      \n"
                        "fmla   v26.4s, v4.4s, v2.s[1]      \n"
                        "fmla   v28.4s, v4.4s, v2.s[2]      \n"
                        "fmla   v30.4s, v4.4s, v2.s[3]      \n"

                        "subs   w4, w4, #1                  \n"

                        "fmla   v9.4s, v5.4s, v0.s[0]       \n"
                        "fmla   v11.4s, v5.4s, v0.s[1]      \n"
                        "fmla   v13.4s, v5.4s, v0.s[2]      \n"
                        "fmla   v15.4s, v5.4s, v0.s[3]      \n"
                        "fmla   v17.4s, v5.4s, v1.s[0]      \n"
                        "fmla   v19.4s, v5.4s, v1.s[1]      \n"
                        "fmla   v21.4s, v5.4s, v1.s[2]      \n"
                        "fmla   v23.4s, v5.4s, v1.s[3]      \n"
                        "fmla   v25.4s, v5.4s, v2.s[0]      \n"
                        "fmla   v27.4s, v5.4s, v2.s[1]      \n"
                        "fmla   v29.4s, v5.4s, v2.s[2]      \n"
                        "fmla   v31.4s, v5.4s, v2.s[3]      \n"

                        "bne    4b                          \n"

                        "5:                                 \n"
                        "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                        "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                        "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                        "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                        "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                        "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                        : "=r"(outptr), // %0
                        "=r"(pA),     // %1
                        "=r"(pB)      // %2
                        : "0"(outptr),
                        "1"(pA),
                        "2"(pB),
                        "r"(max_kk), // %6
                        "r"(k)       // %7
                        : "cc", "memory", "x4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
                else
                {
                    // Plain ld1/fmla AArch64 kernel for one 8x12 accumulator block.
                    // This is the else-branch of a condition that lies outside this view
                    // (presumably the A53/A55 check used by the following jj loop - confirm).
                    //
                    // Operands: %0 = outptr, %1 = pA, %2 = pB, %w6 = max_kk, %w7 = k.
                    // Register use:
                    //   v4..v7   values loaded from pA (8 floats = 2 vectors per kk step)
                    //   v0..v3   values loaded from pB (12 floats = 3 vectors per kk step)
                    //   v8..v31  24 accumulators; even registers take the first pA vector of a
                    //            step, odd registers the second one. The pair (v8,v9) uses the
                    //            1st pB value, (v10,v11) the 2nd, ... (v30,v31) the 12th.
                    // This is the same layout as _sum00/_sum01 ... _sumb0/_sumb1 in the
                    // intrinsics fallback under #else.
                    asm volatile(
                        // k == 0: jump to 0: and start from zeroed accumulators,
                        // otherwise reload the 24 partial sums previously stored at [%0]
                        "cbz    %w7, 0f                     \n"

                        "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                        "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                        "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                        // five of the six loads post-incremented %0 by 64 bytes: rewind it to
                        // the block start. The flags written by subs are not consumed, the
                        // cmp at 1: overwrites them before the next conditional branch.
                        "subs   %0, %0, #320                \n"
                        "b      1f                          \n"

                        "0:                                 \n"
                        // x ^ x == 0: clear all 24 accumulators
                        "eor    v8.16b, v8.16b, v8.16b      \n"
                        "eor    v9.16b, v9.16b, v9.16b      \n"
                        "eor    v10.16b, v10.16b, v10.16b   \n"
                        "eor    v11.16b, v11.16b, v11.16b   \n"
                        "eor    v12.16b, v12.16b, v12.16b   \n"
                        "eor    v13.16b, v13.16b, v13.16b   \n"
                        "eor    v14.16b, v14.16b, v14.16b   \n"
                        "eor    v15.16b, v15.16b, v15.16b   \n"
                        "eor    v16.16b, v16.16b, v16.16b   \n"
                        "eor    v17.16b, v17.16b, v17.16b   \n"
                        "eor    v18.16b, v18.16b, v18.16b   \n"
                        "eor    v19.16b, v19.16b, v19.16b   \n"
                        "eor    v20.16b, v20.16b, v20.16b   \n"
                        "eor    v21.16b, v21.16b, v21.16b   \n"
                        "eor    v22.16b, v22.16b, v22.16b   \n"
                        "eor    v23.16b, v23.16b, v23.16b   \n"
                        "eor    v24.16b, v24.16b, v24.16b   \n"
                        "eor    v25.16b, v25.16b, v25.16b   \n"
                        "eor    v26.16b, v26.16b, v26.16b   \n"
                        "eor    v27.16b, v27.16b, v27.16b   \n"
                        "eor    v28.16b, v28.16b, v28.16b   \n"
                        "eor    v29.16b, v29.16b, v29.16b   \n"
                        "eor    v30.16b, v30.16b, v30.16b   \n"
                        "eor    v31.16b, v31.16b, v31.16b   \n"

                        "1:                                 \n"
                        "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                        "cmp    w4, #0                      \n"
                        "beq    3f                          \n"

                        // Main loop, unrolled over 4 kk steps per pass:
                        // 2 x 64 bytes are read from %1 and 3 x 64 bytes from %2.
                        "2:                                 \n"
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"

                        // step 0: A = v4/v5, B = v0, v1, v2
                        "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                        "fmla   v10.4s, v4.4s, v0.s[1]      \n"
                        "fmla   v12.4s, v4.4s, v0.s[2]      \n"
                        "fmla   v14.4s, v4.4s, v0.s[3]      \n"
                        "fmla   v9.4s, v5.4s, v0.s[0]       \n"
                        "fmla   v11.4s, v5.4s, v0.s[1]      \n"
                        "fmla   v13.4s, v5.4s, v0.s[2]      \n"
                        "fmla   v15.4s, v5.4s, v0.s[3]      \n"

                        "fmla   v16.4s, v4.4s, v1.s[0]      \n"
                        "fmla   v18.4s, v4.4s, v1.s[1]      \n"
                        "fmla   v20.4s, v4.4s, v1.s[2]      \n"
                        "fmla   v22.4s, v4.4s, v1.s[3]      \n"
                        "fmla   v17.4s, v5.4s, v1.s[0]      \n"
                        "fmla   v19.4s, v5.4s, v1.s[1]      \n"
                        "fmla   v21.4s, v5.4s, v1.s[2]      \n"
                        "fmla   v23.4s, v5.4s, v1.s[3]      \n"

                        "fmla   v24.4s, v4.4s, v2.s[0]      \n"
                        "fmla   v26.4s, v4.4s, v2.s[1]      \n"
                        "fmla   v28.4s, v4.4s, v2.s[2]      \n"
                        "fmla   v30.4s, v4.4s, v2.s[3]      \n"
                        "fmla   v25.4s, v5.4s, v2.s[0]      \n"
                        "fmla   v27.4s, v5.4s, v2.s[1]      \n"
                        "fmla   v29.4s, v5.4s, v2.s[2]      \n"
                        "fmla   v31.4s, v5.4s, v2.s[3]      \n"

                        // step 1: A = v6/v7, B = v3 followed by the reloaded v0, v1
                        "fmla   v8.4s, v6.4s, v3.s[0]       \n"
                        "fmla   v10.4s, v6.4s, v3.s[1]      \n"
                        "fmla   v12.4s, v6.4s, v3.s[2]      \n"
                        "fmla   v14.4s, v6.4s, v3.s[3]      \n"
                        "fmla   v9.4s, v7.4s, v3.s[0]       \n"
                        "fmla   v11.4s, v7.4s, v3.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v3.s[2]      \n"
                        "fmla   v15.4s, v7.4s, v3.s[3]      \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"

                        "fmla   v16.4s, v6.4s, v0.s[0]      \n"
                        "fmla   v18.4s, v6.4s, v0.s[1]      \n"
                        "fmla   v20.4s, v6.4s, v0.s[2]      \n"
                        "fmla   v22.4s, v6.4s, v0.s[3]      \n"
                        "fmla   v17.4s, v7.4s, v0.s[0]      \n"
                        "fmla   v19.4s, v7.4s, v0.s[1]      \n"
                        "fmla   v21.4s, v7.4s, v0.s[2]      \n"
                        "fmla   v23.4s, v7.4s, v0.s[3]      \n"

                        "fmla   v24.4s, v6.4s, v1.s[0]      \n"
                        "fmla   v26.4s, v6.4s, v1.s[1]      \n"
                        "fmla   v28.4s, v6.4s, v1.s[2]      \n"
                        "fmla   v30.4s, v6.4s, v1.s[3]      \n"
                        "fmla   v25.4s, v7.4s, v1.s[0]      \n"
                        "fmla   v27.4s, v7.4s, v1.s[1]      \n"
                        "fmla   v29.4s, v7.4s, v1.s[2]      \n"
                        "fmla   v31.4s, v7.4s, v1.s[3]      \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"

                        // step 2: A = reloaded v4/v5, B = v2, v3 followed by the reloaded v0
                        "fmla   v8.4s, v4.4s, v2.s[0]       \n"
                        "fmla   v10.4s, v4.4s, v2.s[1]      \n"
                        "fmla   v12.4s, v4.4s, v2.s[2]      \n"
                        "fmla   v14.4s, v4.4s, v2.s[3]      \n"
                        "fmla   v9.4s, v5.4s, v2.s[0]       \n"
                        "fmla   v11.4s, v5.4s, v2.s[1]      \n"
                        "fmla   v13.4s, v5.4s, v2.s[2]      \n"
                        "fmla   v15.4s, v5.4s, v2.s[3]      \n"

                        "fmla   v16.4s, v4.4s, v3.s[0]      \n"
                        "fmla   v18.4s, v4.4s, v3.s[1]      \n"
                        "fmla   v20.4s, v4.4s, v3.s[2]      \n"
                        "fmla   v22.4s, v4.4s, v3.s[3]      \n"
                        "fmla   v17.4s, v5.4s, v3.s[0]      \n"
                        "fmla   v19.4s, v5.4s, v3.s[1]      \n"
                        "fmla   v21.4s, v5.4s, v3.s[2]      \n"
                        "fmla   v23.4s, v5.4s, v3.s[3]      \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"

                        "fmla   v24.4s, v4.4s, v0.s[0]      \n"
                        "fmla   v26.4s, v4.4s, v0.s[1]      \n"
                        "fmla   v28.4s, v4.4s, v0.s[2]      \n"
                        "fmla   v30.4s, v4.4s, v0.s[3]      \n"
                        "fmla   v25.4s, v5.4s, v0.s[0]      \n"
                        "fmla   v27.4s, v5.4s, v0.s[1]      \n"
                        "fmla   v29.4s, v5.4s, v0.s[2]      \n"
                        "fmla   v31.4s, v5.4s, v0.s[3]      \n"

                        // step 3: A = v6/v7, B = v1, v2, v3
                        "fmla   v8.4s, v6.4s, v1.s[0]       \n"
                        "fmla   v10.4s, v6.4s, v1.s[1]      \n"
                        "fmla   v12.4s, v6.4s, v1.s[2]      \n"
                        "fmla   v14.4s, v6.4s, v1.s[3]      \n"
                        "fmla   v9.4s, v7.4s, v1.s[0]       \n"
                        "fmla   v11.4s, v7.4s, v1.s[1]      \n"
                        "fmla   v13.4s, v7.4s, v1.s[2]      \n"
                        "fmla   v15.4s, v7.4s, v1.s[3]      \n"

                        "fmla   v16.4s, v6.4s, v2.s[0]      \n"
                        "fmla   v18.4s, v6.4s, v2.s[1]      \n"
                        "fmla   v20.4s, v6.4s, v2.s[2]      \n"
                        "fmla   v22.4s, v6.4s, v2.s[3]      \n"
                        "fmla   v17.4s, v7.4s, v2.s[0]      \n"
                        "fmla   v19.4s, v7.4s, v2.s[1]      \n"
                        "fmla   v21.4s, v7.4s, v2.s[2]      \n"
                        "fmla   v23.4s, v7.4s, v2.s[3]      \n"

                        // counter is decremented ahead of the last fmla group; fmla leaves
                        // the condition flags alone, so the bne below tests this result
                        "subs   w4, w4, #1                  \n"

                        "fmla   v24.4s, v6.4s, v3.s[0]      \n"
                        "fmla   v26.4s, v6.4s, v3.s[1]      \n"
                        "fmla   v28.4s, v6.4s, v3.s[2]      \n"
                        "fmla   v30.4s, v6.4s, v3.s[3]      \n"
                        "fmla   v25.4s, v7.4s, v3.s[0]      \n"
                        "fmla   v27.4s, v7.4s, v3.s[1]      \n"
                        "fmla   v29.4s, v7.4s, v3.s[2]      \n"
                        "fmla   v31.4s, v7.4s, v3.s[3]      \n"

                        "bne    2b                          \n"

                        "3:                                 \n"
                        "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                        "cmp    w4, #0                      \n"
                        "beq    5f                          \n"

                        // Tail loop: one kk step per pass, reading 48 bytes from %2
                        // and 32 bytes from %1.
                        "4:                                 \n"
                        "ld1    {v0.4s, v1.4s, v2.4s}, [%2], #48 \n"
                        "ld1    {v4.4s, v5.4s}, [%1], #32   \n"

                        "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                        "fmla   v10.4s, v4.4s, v0.s[1]      \n"
                        "fmla   v12.4s, v4.4s, v0.s[2]      \n"
                        "fmla   v14.4s, v4.4s, v0.s[3]      \n"
                        "fmla   v16.4s, v4.4s, v1.s[0]      \n"
                        "fmla   v18.4s, v4.4s, v1.s[1]      \n"
                        "fmla   v20.4s, v4.4s, v1.s[2]      \n"
                        "fmla   v22.4s, v4.4s, v1.s[3]      \n"
                        "fmla   v24.4s, v4.4s, v2.s[0]      \n"
                        "fmla   v26.4s, v4.4s, v2.s[1]      \n"
                        "fmla   v28.4s, v4.4s, v2.s[2]      \n"
                        "fmla   v30.4s, v4.4s, v2.s[3]      \n"

                        "subs   w4, w4, #1                  \n"

                        "fmla   v9.4s, v5.4s, v0.s[0]       \n"
                        "fmla   v11.4s, v5.4s, v0.s[1]      \n"
                        "fmla   v13.4s, v5.4s, v0.s[2]      \n"
                        "fmla   v15.4s, v5.4s, v0.s[3]      \n"
                        "fmla   v17.4s, v5.4s, v1.s[0]      \n"
                        "fmla   v19.4s, v5.4s, v1.s[1]      \n"
                        "fmla   v21.4s, v5.4s, v1.s[2]      \n"
                        "fmla   v23.4s, v5.4s, v1.s[3]      \n"
                        "fmla   v25.4s, v5.4s, v2.s[0]      \n"
                        "fmla   v27.4s, v5.4s, v2.s[1]      \n"
                        "fmla   v29.4s, v5.4s, v2.s[2]      \n"
                        "fmla   v31.4s, v5.4s, v2.s[3]      \n"

                        "bne    4b                          \n"

                        // Write the 24 accumulators back in register order; the six
                        // post-incrementing stores leave %0 384 bytes past the block start.
                        "5:                                 \n"
                        "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                        "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                        "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                        "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                        "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                        "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                        // Outputs are tied to inputs "0"/"1"/"2", so the advanced pointer
                        // values are written back into outptr, pA and pB.
                        : "=r"(outptr), // %0
                        "=r"(pA),     // %1
                        "=r"(pB)      // %2
                        : "0"(outptr),
                        "1"(pA),
                        "2"(pB),
                        "r"(max_kk), // %6
                        "r"(k)       // %7
                        // x4 is listed because w4 serves as the loop counter
                        : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
#else  // NCNN_GNU_INLINE_ASM
                // NEON-intrinsics version of the 8x12 block, compiled when
                // NCNN_GNU_INLINE_ASM is off.
                //
                // Accumulator naming: _sumXY where X (0..9, a, b) selects which of the 12
                // pB values of a kk step is used as the scalar multiplier, and Y selects
                // the pA half (0 -> pA[0..3], 1 -> pA[4..7]).
                float32x4_t _sum00;
                float32x4_t _sum01;
                float32x4_t _sum10;
                float32x4_t _sum11;
                float32x4_t _sum20;
                float32x4_t _sum21;
                float32x4_t _sum30;
                float32x4_t _sum31;
                float32x4_t _sum40;
                float32x4_t _sum41;
                float32x4_t _sum50;
                float32x4_t _sum51;
                float32x4_t _sum60;
                float32x4_t _sum61;
                float32x4_t _sum70;
                float32x4_t _sum71;
                float32x4_t _sum80;
                float32x4_t _sum81;
                float32x4_t _sum90;
                float32x4_t _sum91;
                float32x4_t _suma0;
                float32x4_t _suma1;
                float32x4_t _sumb0;
                float32x4_t _sumb1;

                if (k == 0)
                {
                    // k == 0: nothing accumulated yet, start every sum at zero
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                    _sum20 = vdupq_n_f32(0.f);
                    _sum21 = vdupq_n_f32(0.f);
                    _sum30 = vdupq_n_f32(0.f);
                    _sum31 = vdupq_n_f32(0.f);
                    _sum40 = vdupq_n_f32(0.f);
                    _sum41 = vdupq_n_f32(0.f);
                    _sum50 = vdupq_n_f32(0.f);
                    _sum51 = vdupq_n_f32(0.f);
                    _sum60 = vdupq_n_f32(0.f);
                    _sum61 = vdupq_n_f32(0.f);
                    _sum70 = vdupq_n_f32(0.f);
                    _sum71 = vdupq_n_f32(0.f);
                    _sum80 = vdupq_n_f32(0.f);
                    _sum81 = vdupq_n_f32(0.f);
                    _sum90 = vdupq_n_f32(0.f);
                    _sum91 = vdupq_n_f32(0.f);
                    _suma0 = vdupq_n_f32(0.f);
                    _suma1 = vdupq_n_f32(0.f);
                    _sumb0 = vdupq_n_f32(0.f);
                    _sumb1 = vdupq_n_f32(0.f);
                }
                else
                {
                    // k != 0: continue from the partial sums stored at outptr, read back
                    // in the same order the store sequence below writes them
                    _sum00 = vld1q_f32(outptr);
                    _sum01 = vld1q_f32(outptr + 4 * 1);
                    _sum10 = vld1q_f32(outptr + 4 * 2);
                    _sum11 = vld1q_f32(outptr + 4 * 3);
                    _sum20 = vld1q_f32(outptr + 4 * 4);
                    _sum21 = vld1q_f32(outptr + 4 * 5);
                    _sum30 = vld1q_f32(outptr + 4 * 6);
                    _sum31 = vld1q_f32(outptr + 4 * 7);
                    _sum40 = vld1q_f32(outptr + 4 * 8);
                    _sum41 = vld1q_f32(outptr + 4 * 9);
                    _sum50 = vld1q_f32(outptr + 4 * 10);
                    _sum51 = vld1q_f32(outptr + 4 * 11);
                    _sum60 = vld1q_f32(outptr + 4 * 12);
                    _sum61 = vld1q_f32(outptr + 4 * 13);
                    _sum70 = vld1q_f32(outptr + 4 * 14);
                    _sum71 = vld1q_f32(outptr + 4 * 15);
                    _sum80 = vld1q_f32(outptr + 4 * 16);
                    _sum81 = vld1q_f32(outptr + 4 * 17);
                    _sum90 = vld1q_f32(outptr + 4 * 18);
                    _sum91 = vld1q_f32(outptr + 4 * 19);
                    _suma0 = vld1q_f32(outptr + 4 * 20);
                    _suma1 = vld1q_f32(outptr + 4 * 21);
                    _sumb0 = vld1q_f32(outptr + 4 * 22);
                    _sumb1 = vld1q_f32(outptr + 4 * 23);
                }

                // One pass per kk step: 8 floats from pA, 12 floats from pB.
                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float32x4_t _pA0 = vld1q_f32(pA);
                    float32x4_t _pA1 = vld1q_f32(pA + 4);

                    float32x4_t _pB0 = vld1q_f32(pB);
                    float32x4_t _pB1 = vld1q_f32(pB + 4);
                    float32x4_t _pB2 = vld1q_f32(pB + 8);

                    // every accumulator += (one pA half) * (one pB lane broadcast)
                    _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                    _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                    _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                    _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                    _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                    _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                    _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                    _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
                    _sum40 = vfmaq_laneq_f32(_sum40, _pA0, _pB1, 0);
                    _sum41 = vfmaq_laneq_f32(_sum41, _pA1, _pB1, 0);
                    _sum50 = vfmaq_laneq_f32(_sum50, _pA0, _pB1, 1);
                    _sum51 = vfmaq_laneq_f32(_sum51, _pA1, _pB1, 1);
                    _sum60 = vfmaq_laneq_f32(_sum60, _pA0, _pB1, 2);
                    _sum61 = vfmaq_laneq_f32(_sum61, _pA1, _pB1, 2);
                    _sum70 = vfmaq_laneq_f32(_sum70, _pA0, _pB1, 3);
                    _sum71 = vfmaq_laneq_f32(_sum71, _pA1, _pB1, 3);
                    _sum80 = vfmaq_laneq_f32(_sum80, _pA0, _pB2, 0);
                    _sum81 = vfmaq_laneq_f32(_sum81, _pA1, _pB2, 0);
                    _sum90 = vfmaq_laneq_f32(_sum90, _pA0, _pB2, 1);
                    _sum91 = vfmaq_laneq_f32(_sum91, _pA1, _pB2, 1);
                    _suma0 = vfmaq_laneq_f32(_suma0, _pA0, _pB2, 2);
                    _suma1 = vfmaq_laneq_f32(_suma1, _pA1, _pB2, 2);
                    _sumb0 = vfmaq_laneq_f32(_sumb0, _pA0, _pB2, 3);
                    _sumb1 = vfmaq_laneq_f32(_sumb1, _pA1, _pB2, 3);

                    pA += 8;
                    pB += 12;
                }

                // Write all 24 vectors (96 floats) back and move outptr past them.
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
                vst1q_f32(outptr + 4 * 8, _sum40);
                vst1q_f32(outptr + 4 * 9, _sum41);
                vst1q_f32(outptr + 4 * 10, _sum50);
                vst1q_f32(outptr + 4 * 11, _sum51);
                vst1q_f32(outptr + 4 * 12, _sum60);
                vst1q_f32(outptr + 4 * 13, _sum61);
                vst1q_f32(outptr + 4 * 14, _sum70);
                vst1q_f32(outptr + 4 * 15, _sum71);
                vst1q_f32(outptr + 4 * 16, _sum80);
                vst1q_f32(outptr + 4 * 17, _sum81);
                vst1q_f32(outptr + 4 * 18, _sum90);
                vst1q_f32(outptr + 4 * 19, _sum91);
                vst1q_f32(outptr + 4 * 20, _suma0);
                vst1q_f32(outptr + 4 * 21, _suma1);
                vst1q_f32(outptr + 4 * 22, _sumb0);
                vst1q_f32(outptr + 4 * 23, _sumb1);
                outptr += 8 * 12;
#endif // NCNN_GNU_INLINE_ASM
            }
            for (; jj + 7 < max_jj; jj += 8)
            {
                const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
                if (use_a53_a55_optimized_kernel && cpu_support_arm_asimdhp())
                {
                    // a55
                    //
                    // Hand-scheduled 8x8 fp32 tile: accumulates max_kk rank-1 updates
                    // (8 floats from pA times 8 floats from pB per k-step) into 16 vector
                    // accumulators and writes 64 floats to outptr.
                    //
                    // Register roles inside the asm:
                    //   v16..v31  accumulators; the pair (v16+2j, v17+2j) holds the 8 results
                    //             for pB lane j (j = 0..7), even reg = pA[0..3], odd = pA[4..7]
                    //   v8..v15   pA data for 4 consecutive k-steps (2 vectors per k-step)
                    //   v0..v7    pB data for 4 consecutive k-steps (2 vectors per k-step)
                    //   x20..x27  staging for the upper 64 bits of a vector (ldr x + ins)
                    //   w4        loop counter
                    //
                    // Most 128-bit loads in the main loop are split into "ldr dN" (low half),
                    // "ldr xM" (high half into a GPR) and a later "ins vN.d[1], xM",
                    // interleaved with the fmla stream -- presumably so the loads can issue
                    // alongside fmla on this in-order core (TODO confirm against the core's
                    // optimization guide).
                    asm volatile(
                        // k == 0: nothing accumulated yet, start from zero (label 0).
                        // k != 0: resume from the 64 partial sums already stored at outptr.
                        "cbz    %w7, 0f                     \n"

                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                        "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                        // undo the three post-increments so %0 points at the tile start again
                        // (the flags set by subs are overwritten by the cmp below before use)
                        "subs   %0, %0, #192                \n"
                        "b      1f                          \n"

                        "0:                                 \n"
                        "eor    v16.16b, v16.16b, v16.16b   \n"
                        "eor    v17.16b, v17.16b, v17.16b   \n"
                        "eor    v18.16b, v18.16b, v18.16b   \n"
                        "eor    v19.16b, v19.16b, v19.16b   \n"
                        "eor    v20.16b, v20.16b, v20.16b   \n"
                        "eor    v21.16b, v21.16b, v21.16b   \n"
                        "eor    v22.16b, v22.16b, v22.16b   \n"
                        "eor    v23.16b, v23.16b, v23.16b   \n"
                        "eor    v24.16b, v24.16b, v24.16b   \n"
                        "eor    v25.16b, v25.16b, v25.16b   \n"
                        "eor    v26.16b, v26.16b, v26.16b   \n"
                        "eor    v27.16b, v27.16b, v27.16b   \n"
                        "eor    v28.16b, v28.16b, v28.16b   \n"
                        "eor    v29.16b, v29.16b, v29.16b   \n"
                        "eor    v30.16b, v30.16b, v30.16b   \n"
                        "eor    v31.16b, v31.16b, v31.16b   \n"

                        "1:                                 \n"
                        "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                        "cmp    w4, #0                      \n"
                        "beq    3f                          \n" // fewer than 4 k-steps: go straight to the remainder loop

                        // Prime the software pipeline: v8 (first 4 pA floats), v0 (first 4 pB
                        // floats) and v1 (low half in d1, high half parked in x21).
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v8.4s}, [%1], #16          \n"
                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.4s}, [%2], #16          \n"

                        "ldr    d1, [%2], #8                \n"
                        "ldr    x21, [%2], #8               \n"

                        // Main loop: 4 k-steps (64 fmla) per iteration. Each iteration consumes
                        // v8..v15 / v0..v7 and, near its end, already loads v8, v0 and d1/x21
                        // for the following iteration.
                        ".align 4                           \n"
                        "2:                                 \n"
                        "ldr    d9, [%1], #8                \n"
                        "fmla   v16.4s, v8.4s, v0.s[0]      \n"
                        "ldr    x25, [%1], #8               \n"
                        "fmla   v18.4s, v8.4s, v0.s[1]      \n"
                        "ins    v1.d[1], x21                \n"
                        "fmla   v20.4s, v8.4s, v0.s[2]      \n"
                        "ldr    d10, [%1], #8               \n"
                        "fmla   v22.4s, v8.4s, v0.s[3]      \n"
                        "ldr    x26, [%1], #8               \n"
                        "fmla   v24.4s, v8.4s, v1.s[0]      \n"
                        "ldr    d2, [%2], #8                \n"
                        "fmla   v26.4s, v8.4s, v1.s[1]      \n"
                        "ins    v9.d[1], x25                \n"
                        "fmla   v28.4s, v8.4s, v1.s[2]      \n"
                        "ldr    x22, [%2], #8               \n"
                        "fmla   v30.4s, v8.4s, v1.s[3]      \n"
                        "ldr    d3, [%2], #8                \n"
                        "fmla   v17.4s, v9.4s, v0.s[0]      \n"
                        "ldr    x23, [%2], #8               \n"
                        "fmla   v19.4s, v9.4s, v0.s[1]      \n"
                        "ins    v10.d[1], x26               \n"
                        "fmla   v21.4s, v9.4s, v0.s[2]      \n"
                        "ldr    d11, [%1], #8               \n"
                        "fmla   v23.4s, v9.4s, v0.s[3]      \n"
                        "ldr    x27, [%1], #8               \n"
                        "fmla   v25.4s, v9.4s, v1.s[0]      \n"
                        "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                        "fmla   v27.4s, v9.4s, v1.s[1]      \n"
                        "ins    v2.d[1], x22                \n"
                        "fmla   v29.4s, v9.4s, v1.s[2]      \n"
                        "ldr    d12, [%1], #8               \n"
                        "fmla   v31.4s, v9.4s, v1.s[3]      \n"
                        "ldr    x24, [%1], #8               \n"
                        "fmla   v16.4s, v10.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                        "fmla   v18.4s, v10.4s, v2.s[1]     \n"
                        "ins    v3.d[1], x23                \n"
                        "fmla   v20.4s, v10.4s, v2.s[2]     \n"
                        "ldr    d4, [%2], #8                \n"
                        "fmla   v22.4s, v10.4s, v2.s[3]     \n"
                        "ldr    x20, [%2], #8               \n"
                        "fmla   v24.4s, v10.4s, v3.s[0]     \n"
                        "ldr    d5, [%2], #8                \n"
                        "fmla   v26.4s, v10.4s, v3.s[1]     \n"
                        "ins    v11.d[1], x27               \n"
                        "fmla   v28.4s, v10.4s, v3.s[2]     \n"
                        "ldr    x21, [%2], #8               \n"
                        "fmla   v30.4s, v10.4s, v3.s[3]     \n"
                        "ldr    d13, [%1], #8               \n"
                        "fmla   v17.4s, v11.4s, v2.s[0]     \n"
                        "ldr    x25, [%1], #8               \n"
                        "fmla   v19.4s, v11.4s, v2.s[1]     \n"
                        "ins    v12.d[1], x24               \n"
                        "fmla   v21.4s, v11.4s, v2.s[2]     \n"
                        "ldr    d14, [%1], #8               \n"
                        "fmla   v23.4s, v11.4s, v2.s[3]     \n"
                        "ldr    x26, [%1], #8               \n"
                        "fmla   v25.4s, v11.4s, v3.s[0]     \n"
                        "ldr    d6, [%2], #8                \n"
                        "fmla   v27.4s, v11.4s, v3.s[1]     \n"
                        "ins    v4.d[1], x20                \n"
                        "fmla   v29.4s, v11.4s, v3.s[2]     \n"
                        "ldr    x22, [%2], #8               \n"
                        "fmla   v31.4s, v11.4s, v3.s[3]     \n"
                        "ldr    d7, [%2], #8                \n"
                        "fmla   v16.4s, v12.4s, v4.s[0]     \n"
                        "ldr    x23, [%2], #8               \n"
                        "fmla   v18.4s, v12.4s, v4.s[1]     \n"
                        "ins    v5.d[1], x21                \n"
                        "fmla   v20.4s, v12.4s, v4.s[2]     \n"
                        "ldr    d15, [%1], #8               \n"
                        "fmla   v22.4s, v12.4s, v4.s[3]     \n"
                        "ldr    x27, [%1], #8               \n"
                        "fmla   v24.4s, v12.4s, v5.s[0]     \n"
                        "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                        "fmla   v26.4s, v12.4s, v5.s[1]     \n"
                        "ins    v13.d[1], x25               \n"
                        "fmla   v28.4s, v12.4s, v5.s[2]     \n"
                        // from here on: lookahead loads of v8 / v0 / v1 for the next iteration
                        "ldr    d8, [%1], #8                \n"
                        "fmla   v30.4s, v12.4s, v5.s[3]     \n"
                        "ldr    x24, [%1], #8               \n"
                        "fmla   v17.4s, v13.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                        "fmla   v19.4s, v13.4s, v4.s[1]     \n"
                        "ins    v14.d[1], x26               \n"
                        "fmla   v21.4s, v13.4s, v4.s[2]     \n"
                        "ldr    d0, [%2], #8                \n"
                        "fmla   v23.4s, v13.4s, v4.s[3]     \n"
                        "ldr    x20, [%2], #8               \n"
                        "fmla   v25.4s, v13.4s, v5.s[0]     \n"
                        "ldr    d1, [%2], #8                \n"
                        "fmla   v27.4s, v13.4s, v5.s[1]     \n"
                        "ins    v6.d[1], x22                \n"
                        "fmla   v29.4s, v13.4s, v5.s[2]     \n"
                        "ldr    x21, [%2], #8               \n"
                        "fmla   v31.4s, v13.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v14.4s, v6.s[0]     \n"
                        "fmla   v18.4s, v14.4s, v6.s[1]     \n"
                        "ins    v7.d[1], x23                \n"
                        "fmla   v20.4s, v14.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v14.4s, v6.s[3]     \n"
                        "fmla   v24.4s, v14.4s, v7.s[0]     \n"
                        "fmla   v26.4s, v14.4s, v7.s[1]     \n"
                        "ins    v15.d[1], x27               \n"
                        "fmla   v28.4s, v14.4s, v7.s[2]     \n"
                        "fmla   v30.4s, v14.4s, v7.s[3]     \n"
                        "fmla   v17.4s, v15.4s, v6.s[0]     \n"
                        "fmla   v19.4s, v15.4s, v6.s[1]     \n"
                        "ins    v8.d[1], x24                \n"
                        "fmla   v21.4s, v15.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v15.4s, v6.s[3]     \n"
                        "fmla   v25.4s, v15.4s, v7.s[0]     \n"
                        "subs   w4, w4, #1                  \n"
                        "fmla   v27.4s, v15.4s, v7.s[1]     \n"
                        "fmla   v29.4s, v15.4s, v7.s[2]     \n"
                        "ins    v0.d[1], x20                \n"
                        "fmla   v31.4s, v15.4s, v7.s[3]     \n"
                        "bne    2b                          \n"

                        // Drop the lookahead that the last iteration loaded but nobody consumed:
                        // 16 bytes of pA (v8) and 32 bytes of pB (v0 + d1/x21).
                        // NOTE(review): that final lookahead reads memory this loop never uses;
                        // presumably those addresses are always readable -- confirm against the
                        // packed-buffer allocation.
                        "sub    %1, %1, #16                 \n"
                        "sub    %2, %2, #32                 \n"

                        "3:                                 \n"
                        "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                        "cmp    w4, #0                      \n"
                        "beq    5f                          \n"

                        // Remainder loop: one k-step (8 pB floats, 8 pA floats, 16 fmla) per pass.
                        "4:                                 \n"
                        "ld1    {v0.4s, v1.4s}, [%2], #32   \n"
                        "ld1    {v4.4s, v5.4s}, [%1], #32   \n"
                        "fmla   v16.4s, v4.4s, v0.s[0]      \n"
                        "fmla   v18.4s, v4.4s, v0.s[1]      \n"
                        "fmla   v20.4s, v4.4s, v0.s[2]      \n"
                        "fmla   v22.4s, v4.4s, v0.s[3]      \n"
                        "fmla   v17.4s, v5.4s, v0.s[0]      \n"
                        "fmla   v19.4s, v5.4s, v0.s[1]      \n"
                        "fmla   v21.4s, v5.4s, v0.s[2]      \n"
                        "fmla   v23.4s, v5.4s, v0.s[3]      \n"
                        "subs   w4, w4, #1                  \n"
                        "fmla   v24.4s, v4.4s, v1.s[0]      \n"
                        "fmla   v26.4s, v4.4s, v1.s[1]      \n"
                        "fmla   v28.4s, v4.4s, v1.s[2]      \n"
                        "fmla   v30.4s, v4.4s, v1.s[3]      \n"
                        "fmla   v25.4s, v5.4s, v1.s[0]      \n"
                        "fmla   v27.4s, v5.4s, v1.s[1]      \n"
                        "fmla   v29.4s, v5.4s, v1.s[2]      \n"
                        "fmla   v31.4s, v5.4s, v1.s[3]      \n"
                        "bne    4b                          \n"

                        // Write the 64 accumulated floats back; outptr ends up 256 bytes further.
                        "5:                                 \n"
                        "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                        "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                        "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                        "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                        // The three pointers are read-write: each output is tied to the input
                        // with the matching-digit constraint, so the advanced values are visible
                        // to the C++ code after the asm.
                        : "=r"(outptr), // %0
                        "=r"(pA),     // %1
                        "=r"(pB)      // %2
                        : "0"(outptr), // %3, same register as %0
                        "1"(pA),     // %4, same register as %1
                        "2"(pB),     // %5, same register as %2
                        "r"(max_kk), // %6
                        "r"(k)       // %7
                        : "cc", "memory", "x4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
                else if (use_a53_a55_optimized_kernel && !cpu_support_arm_asimdhp())
                {
                    // a53
                    //
                    // Hand-scheduled 8x8 fp32 tile for the case where the a53/a55 flag is set
                    // but cpu_support_arm_asimdhp() is false. It accumulates max_kk rank-1
                    // updates (8 pA floats times 8 pB floats per k-step) into v16..v31 and
                    // writes 64 floats to outptr.
                    //
                    // Register roles inside the asm:
                    //   v16..v31  accumulators; the pair (v16+2j, v17+2j) holds the 8 results
                    //             for pB lane j (j = 0..7), even reg = pA[0..3], odd = pA[4..7]
                    //   v8..v15   pA data for 4 consecutive k-steps (2 vectors per k-step)
                    //   v0..v7    pB data for 4 consecutive k-steps (2 vectors per k-step)
                    //   x20..x27  staging for the upper 64 bits of a vector
                    //   w4        loop counter
                    //
                    // Every 128-bit load in the main loop is written as
                    //   ldr dN, [p] / ldr xM, [p, #8] / add p, p, #16 / (later) ins vN.d[1], xM
                    // and the stream is padded with nop so that each fmla is preceded by a
                    // non-NEON instruction -- presumably to get a regular dual-issue pattern on
                    // this in-order core (TODO confirm against the core's optimization guide).
                    asm volatile(
                        // k == 0: start from zero (label 0).
                        // k != 0: resume from the 64 partial sums already stored at outptr.
                        "cbz    %w7, 0f                     \n"

                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                        "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                        // undo the three post-increments so %0 points at the tile start again
                        // (the flags set by subs are overwritten by the cmp below before use)
                        "subs   %0, %0, #192                \n"
                        "b      1f                          \n"

                        "0:                                 \n"
                        "eor    v16.16b, v16.16b, v16.16b   \n"
                        "eor    v17.16b, v17.16b, v17.16b   \n"
                        "eor    v18.16b, v18.16b, v18.16b   \n"
                        "eor    v19.16b, v19.16b, v19.16b   \n"
                        "eor    v20.16b, v20.16b, v20.16b   \n"
                        "eor    v21.16b, v21.16b, v21.16b   \n"
                        "eor    v22.16b, v22.16b, v22.16b   \n"
                        "eor    v23.16b, v23.16b, v23.16b   \n"
                        "eor    v24.16b, v24.16b, v24.16b   \n"
                        "eor    v25.16b, v25.16b, v25.16b   \n"
                        "eor    v26.16b, v26.16b, v26.16b   \n"
                        "eor    v27.16b, v27.16b, v27.16b   \n"
                        "eor    v28.16b, v28.16b, v28.16b   \n"
                        "eor    v29.16b, v29.16b, v29.16b   \n"
                        "eor    v30.16b, v30.16b, v30.16b   \n"
                        "eor    v31.16b, v31.16b, v31.16b   \n"

                        "1:                                 \n"
                        "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                        "cmp    w4, #0                      \n"
                        "beq    3f                          \n" // fewer than 4 k-steps: go straight to the remainder loop

                        // Prime the software pipeline: v0 and v8 are loaded completely,
                        // v1 and v9 only have their low halves in place (high halves wait in
                        // x21 / x25 and are inserted at the top of the loop).
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ldr    d0, [%2]                    \n"
                        "ldr    x20, [%2, #8]               \n"
                        "ins    v0.d[1], x20                \n"
                        "add    %2, %2, #16                 \n"

                        "ldr    d8, [%1]                    \n"
                        "ldr    x24, [%1, #8]               \n"
                        "ins    v8.d[1], x24                \n"
                        "add    %1, %1, #16                 \n"

                        "ldr    d1, [%2]                    \n"
                        "ldr    x21, [%2, #8]               \n"
                        "add    %2, %2, #16                 \n"

                        "ldr    d9, [%1]                    \n"
                        "ldr    x25, [%1, #8]               \n"
                        "add    %1, %1, #16                 \n"

                        // Main loop: 4 k-steps (64 fmla) per iteration. The last four load
                        // groups of an iteration already fetch v0, v8, d1/x21 and d9/x25 for the
                        // following iteration.
                        ".align 4                           \n"
                        "2:                                 \n"

                        "ldr    d2, [%2]                    \n"
                        "ins    v1.d[1], x21                \n"
                        "fmla   v16.4s, v8.4s, v0.s[0]      \n"
                        "ldr    x22, [%2, #8]               \n"
                        "fmla   v18.4s, v8.4s, v0.s[1]      \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v20.4s, v8.4s, v0.s[2]      \n"

                        "ldr    d10, [%1]                   \n"
                        "ins    v9.d[1], x25                \n"
                        "fmla   v22.4s, v8.4s, v0.s[3]      \n"
                        "ldr    x26, [%1, #8]               \n"
                        "fmla   v24.4s, v8.4s, v1.s[0]      \n"
                        "add    %1, %1, #16                 \n"
                        "fmla   v26.4s, v8.4s, v1.s[1]      \n"

                        "ldr    d3, [%2]                    \n"
                        "ins    v2.d[1], x22                \n"
                        "fmla   v28.4s, v8.4s, v1.s[2]      \n"
                        "ldr    x23, [%2, #8]               \n"
                        "fmla   v30.4s, v8.4s, v1.s[3]      \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v17.4s, v9.4s, v0.s[0]      \n"

                        "nop                                \n"
                        "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                        "fmla   v19.4s, v9.4s, v0.s[1]      \n"
                        "nop                                \n"
                        "fmla   v21.4s, v9.4s, v0.s[2]      \n"
                        "nop                                \n"
                        "fmla   v23.4s, v9.4s, v0.s[3]      \n"

                        "ldr    d11, [%1]                   \n"
                        "ins    v10.d[1], x26               \n"
                        "fmla   v25.4s, v9.4s, v1.s[0]      \n"
                        "ldr    x27, [%1, #8]               \n"
                        "fmla   v27.4s, v9.4s, v1.s[1]      \n"
                        "add    %1, %1, #16                 \n"
                        "fmla   v29.4s, v9.4s, v1.s[2]      \n"

                        "nop                                \n"
                        "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                        "fmla   v31.4s, v9.4s, v1.s[3]      \n"
                        "nop                                \n"
                        "fmla   v16.4s, v10.4s, v2.s[0]     \n"
                        "nop                                \n"
                        "fmla   v18.4s, v10.4s, v2.s[1]     \n"

                        "ldr    d4, [%2]                    \n"
                        "ins    v3.d[1], x23                \n"
                        "fmla   v20.4s, v10.4s, v2.s[2]     \n"
                        "ldr    x20, [%2, #8]               \n"
                        "fmla   v22.4s, v10.4s, v2.s[3]     \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v24.4s, v10.4s, v3.s[0]     \n"

                        "ldr    d12, [%1]                   \n"
                        "ins    v11.d[1], x27               \n"
                        "fmla   v26.4s, v10.4s, v3.s[1]     \n"
                        "ldr    x24, [%1, #8]               \n"
                        "fmla   v28.4s, v10.4s, v3.s[2]     \n"
                        "add    %1, %1, #16                 \n"
                        "fmla   v30.4s, v10.4s, v3.s[3]     \n"

                        "ldr    d5, [%2]                    \n"
                        "ins    v4.d[1], x20                \n"
                        "fmla   v17.4s, v11.4s, v2.s[0]     \n"
                        "ldr    x21, [%2, #8]               \n"
                        "fmla   v19.4s, v11.4s, v2.s[1]     \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v21.4s, v11.4s, v2.s[2]     \n"

                        "ldr    d13, [%1]                   \n"
                        "ins    v12.d[1], x24               \n"
                        "fmla   v23.4s, v11.4s, v2.s[3]     \n"
                        "ldr    x25, [%1, #8]               \n"
                        "fmla   v25.4s, v11.4s, v3.s[0]     \n"
                        "add    %1, %1, #16                 \n"
                        "fmla   v27.4s, v11.4s, v3.s[1]     \n"

                        "ldr    d6, [%2]                    \n"
                        "ins    v5.d[1], x21                \n"
                        "fmla   v29.4s, v11.4s, v3.s[2]     \n"
                        "ldr    x22, [%2, #8]               \n"
                        "fmla   v31.4s, v11.4s, v3.s[3]     \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v16.4s, v12.4s, v4.s[0]     \n"

                        "ldr    d14, [%1]                   \n"
                        "ins    v13.d[1], x25               \n"
                        "fmla   v18.4s, v12.4s, v4.s[1]     \n"
                        "ldr    x26, [%1, #8]               \n"
                        "fmla   v20.4s, v12.4s, v4.s[2]     \n"
                        "add    %1, %1, #16                 \n"
                        "fmla   v22.4s, v12.4s, v4.s[3]     \n"

                        "ldr    d7, [%2]                    \n"
                        "ins    v6.d[1], x22                \n"
                        "fmla   v24.4s, v12.4s, v5.s[0]     \n"
                        "ldr    x23, [%2, #8]               \n"
                        "fmla   v26.4s, v12.4s, v5.s[1]     \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v28.4s, v12.4s, v5.s[2]     \n"

                        "nop                                \n"
                        "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                        "fmla   v30.4s, v12.4s, v5.s[3]     \n"
                        "nop                                \n"
                        "fmla   v17.4s, v13.4s, v4.s[0]     \n"
                        "nop                                \n"
                        "fmla   v19.4s, v13.4s, v4.s[1]     \n"

                        "ldr    d15, [%1]                   \n"
                        "ins    v14.d[1], x26               \n"
                        "fmla   v21.4s, v13.4s, v4.s[2]     \n"
                        "ldr    x27, [%1, #8]               \n"
                        "fmla   v23.4s, v13.4s, v4.s[3]     \n"
                        "add    %1, %1, #16                 \n"
                        "fmla   v25.4s, v13.4s, v5.s[0]     \n"

                        "nop                                \n"
                        "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                        "fmla   v27.4s, v13.4s, v5.s[1]     \n"
                        "nop                                \n"
                        "fmla   v29.4s, v13.4s, v5.s[2]     \n"
                        "nop                                \n"
                        "fmla   v31.4s, v13.4s, v5.s[3]     \n"

                        // from here on: lookahead loads of v0 / v8 / v1 / v9 for the next iteration
                        "ldr    d0, [%2]                    \n"
                        "ins    v7.d[1], x23                \n"
                        "fmla   v16.4s, v14.4s, v6.s[0]     \n"
                        "ldr    x20, [%2, #8]               \n"
                        "fmla   v18.4s, v14.4s, v6.s[1]     \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v20.4s, v14.4s, v6.s[2]     \n"

                        "ldr    d8, [%1]                    \n"
                        "ins    v15.d[1], x27               \n"
                        "fmla   v22.4s, v14.4s, v6.s[3]     \n"
                        "ldr    x24, [%1, #8]               \n"
                        "fmla   v24.4s, v14.4s, v7.s[0]     \n"
                        "add    %1, %1, #16                 \n"
                        "fmla   v26.4s, v14.4s, v7.s[1]     \n"

                        "ldr    d1, [%2]                    \n"
                        "ins    v0.d[1], x20                \n"
                        "fmla   v28.4s, v14.4s, v7.s[2]     \n"
                        "ldr    x21, [%2, #8]               \n"
                        "fmla   v30.4s, v14.4s, v7.s[3]     \n"
                        "add    %2, %2, #16                 \n"
                        "fmla   v17.4s, v15.4s, v6.s[0]     \n"

                        "ldr    d9, [%1]                    \n"
                        "ins    v8.d[1], x24                \n"
                        "fmla   v19.4s, v15.4s, v6.s[1]     \n"
                        "ldr    x25, [%1, #8]               \n"
                        "fmla   v21.4s, v15.4s, v6.s[2]     \n"
                        "add    %1, %1, #16                 \n"
                        "fmla   v23.4s, v15.4s, v6.s[3]     \n"

                        "nop                                \n"
                        "nop                                \n"
                        "fmla   v25.4s, v15.4s, v7.s[0]     \n"
                        "subs   w4, w4, #1                  \n"
                        "fmla   v27.4s, v15.4s, v7.s[1]     \n"
                        "nop                                \n"
                        "fmla   v29.4s, v15.4s, v7.s[2]     \n"

                        "nop                                \n"
                        "nop                                \n"
                        "fmla   v31.4s, v15.4s, v7.s[3]     \n"
                        "nop                                \n"
                        "nop                                \n"
                        "nop                                \n"
                        "nop                                \n"

                        "bne    2b                          \n"

                        // Drop the lookahead that the last iteration loaded but nobody consumed:
                        // 32 bytes of pA (v8 + d9/x25) and 32 bytes of pB (v0 + d1/x21).
                        // NOTE(review): that final lookahead reads memory this loop never uses;
                        // presumably those addresses are always readable -- confirm against the
                        // packed-buffer allocation.
                        "sub    %1, %1, #32                 \n"
                        "sub    %2, %2, #32                 \n"

                        "3:                                 \n"
                        "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                        "cmp    w4, #0                      \n"
                        "beq    5f                          \n"

                        // Remainder loop: one k-step (8 pB floats, 8 pA floats, 16 fmla) per pass.
                        "4:                                 \n"
                        "ld1    {v0.4s, v1.4s}, [%2], #32   \n"
                        "ld1    {v4.4s, v5.4s}, [%1], #32   \n"
                        "fmla   v16.4s, v4.4s, v0.s[0]      \n"
                        "fmla   v18.4s, v4.4s, v0.s[1]      \n"
                        "fmla   v20.4s, v4.4s, v0.s[2]      \n"
                        "fmla   v22.4s, v4.4s, v0.s[3]      \n"
                        "fmla   v17.4s, v5.4s, v0.s[0]      \n"
                        "fmla   v19.4s, v5.4s, v0.s[1]      \n"
                        "fmla   v21.4s, v5.4s, v0.s[2]      \n"
                        "fmla   v23.4s, v5.4s, v0.s[3]      \n"
                        "subs   w4, w4, #1                  \n"
                        "fmla   v24.4s, v4.4s, v1.s[0]      \n"
                        "fmla   v26.4s, v4.4s, v1.s[1]      \n"
                        "fmla   v28.4s, v4.4s, v1.s[2]      \n"
                        "fmla   v30.4s, v4.4s, v1.s[3]      \n"
                        "fmla   v25.4s, v5.4s, v1.s[0]      \n"
                        "fmla   v27.4s, v5.4s, v1.s[1]      \n"
                        "fmla   v29.4s, v5.4s, v1.s[2]      \n"
                        "fmla   v31.4s, v5.4s, v1.s[3]      \n"
                        "bne    4b                          \n"

                        // Write the 64 accumulated floats back; outptr ends up 256 bytes further.
                        "5:                                 \n"
                        "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                        "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                        "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                        "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                        // The three pointers are read-write: each output is tied to the input
                        // with the matching-digit constraint, so the advanced values are visible
                        // to the C++ code after the asm.
                        : "=r"(outptr), // %0
                        "=r"(pA),     // %1
                        "=r"(pB)      // %2
                        : "0"(outptr), // %3, same register as %0
                        "1"(pA),     // %4, same register as %1
                        "2"(pB),     // %5, same register as %2
                        "r"(max_kk), // %6
                        "r"(k)       // %7
                        : "cc", "memory", "x4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
                else
                {
                    asm volatile(
                        "cbz    %w7, 0f                     \n"

                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                        "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                        "subs   %0, %0, #192                \n"
                        "b      1f                          \n"

                        "0:                                 \n"
                        "eor    v16.16b, v16.16b, v16.16b   \n"
                        "eor    v17.16b, v17.16b, v17.16b   \n"
                        "eor    v18.16b, v18.16b, v18.16b   \n"
                        "eor    v19.16b, v19.16b, v19.16b   \n"
                        "eor    v20.16b, v20.16b, v20.16b   \n"
                        "eor    v21.16b, v21.16b, v21.16b   \n"
                        "eor    v22.16b, v22.16b, v22.16b   \n"
                        "eor    v23.16b, v23.16b, v23.16b   \n"
                        "eor    v24.16b, v24.16b, v24.16b   \n"
                        "eor    v25.16b, v25.16b, v25.16b   \n"
                        "eor    v26.16b, v26.16b, v26.16b   \n"
                        "eor    v27.16b, v27.16b, v27.16b   \n"
                        "eor    v28.16b, v28.16b, v28.16b   \n"
                        "eor    v29.16b, v29.16b, v29.16b   \n"
                        "eor    v30.16b, v30.16b, v30.16b   \n"
                        "eor    v31.16b, v31.16b, v31.16b   \n"

                        "1:                                 \n"
                        "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                        "cmp    w4, #0                      \n"
                        "beq    3f                          \n"

                        "2:                                 \n"
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
                        "fmla   v16.4s, v8.4s, v0.s[0]      \n"
                        "fmla   v18.4s, v8.4s, v0.s[1]      \n"
                        "fmla   v20.4s, v8.4s, v0.s[2]      \n"
                        "fmla   v22.4s, v8.4s, v0.s[3]      \n"
                        "fmla   v24.4s, v8.4s, v1.s[0]      \n"
                        "fmla   v26.4s, v8.4s, v1.s[1]      \n"
                        "fmla   v28.4s, v8.4s, v1.s[2]      \n"
                        "fmla   v30.4s, v8.4s, v1.s[3]      \n"
                        "fmla   v17.4s, v9.4s, v0.s[0]      \n"
                        "fmla   v19.4s, v9.4s, v0.s[1]      \n"
                        "fmla   v21.4s, v9.4s, v0.s[2]      \n"
                        "fmla   v23.4s, v9.4s, v0.s[3]      \n"
                        "fmla   v25.4s, v9.4s, v1.s[0]      \n"
                        "fmla   v27.4s, v9.4s, v1.s[1]      \n"
                        "fmla   v29.4s, v9.4s, v1.s[2]      \n"
                        "fmla   v31.4s, v9.4s, v1.s[3]      \n"
                        "fmla   v16.4s, v10.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v10.4s, v2.s[1]     \n"
                        "fmla   v20.4s, v10.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v10.4s, v2.s[3]     \n"
                        "fmla   v24.4s, v10.4s, v3.s[0]     \n"
                        "fmla   v26.4s, v10.4s, v3.s[1]     \n"
                        "fmla   v28.4s, v10.4s, v3.s[2]     \n"
                        "fmla   v30.4s, v10.4s, v3.s[3]     \n"
                        "fmla   v17.4s, v11.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v11.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v11.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v11.4s, v2.s[3]     \n"
                        "fmla   v25.4s, v11.4s, v3.s[0]     \n"
                        "fmla   v27.4s, v11.4s, v3.s[1]     \n"
                        "fmla   v29.4s, v11.4s, v3.s[2]     \n"
                        "fmla   v31.4s, v11.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n"
                        "fmla   v16.4s, v12.4s, v4.s[0]     \n"
                        "fmla   v18.4s, v12.4s, v4.s[1]     \n"
                        "fmla   v20.4s, v12.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v12.4s, v4.s[3]     \n"
                        "fmla   v24.4s, v12.4s, v5.s[0]     \n"
                        "fmla   v26.4s, v12.4s, v5.s[1]     \n"
                        "fmla   v28.4s, v12.4s, v5.s[2]     \n"
                        "fmla   v30.4s, v12.4s, v5.s[3]     \n"
                        "fmla   v17.4s, v13.4s, v4.s[0]     \n"
                        "fmla   v19.4s, v13.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v13.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v13.4s, v4.s[3]     \n"
                        "fmla   v25.4s, v13.4s, v5.s[0]     \n"
                        "fmla   v27.4s, v13.4s, v5.s[1]     \n"
                        "fmla   v29.4s, v13.4s, v5.s[2]     \n"
                        "fmla   v31.4s, v13.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v14.4s, v6.s[0]     \n"
                        "fmla   v18.4s, v14.4s, v6.s[1]     \n"
                        "fmla   v20.4s, v14.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v14.4s, v6.s[3]     \n"
                        "fmla   v24.4s, v14.4s, v7.s[0]     \n"
                        "fmla   v26.4s, v14.4s, v7.s[1]     \n"
                        "fmla   v28.4s, v14.4s, v7.s[2]     \n"
                        "fmla   v30.4s, v14.4s, v7.s[3]     \n"
                        "subs   w4, w4, #1                  \n"
                        "fmla   v17.4s, v15.4s, v6.s[0]     \n"
                        "fmla   v19.4s, v15.4s, v6.s[1]     \n"
                        "fmla   v21.4s, v15.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v15.4s, v6.s[3]     \n"
                        "fmla   v25.4s, v15.4s, v7.s[0]     \n"
                        "fmla   v27.4s, v15.4s, v7.s[1]     \n"
                        "fmla   v29.4s, v15.4s, v7.s[2]     \n"
                        "fmla   v31.4s, v15.4s, v7.s[3]     \n"
                        "bne    2b                          \n"

                        "3:                                 \n"
                        "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                        "cmp    w4, #0                      \n"
                        "beq    5f                          \n"

                        "4:                                 \n"
                        "ld1    {v0.4s, v1.4s}, [%2], #32   \n"
                        "ld1    {v4.4s, v5.4s}, [%1], #32   \n"
                        "fmla   v16.4s, v4.4s, v0.s[0]      \n"
                        "fmla   v18.4s, v4.4s, v0.s[1]      \n"
                        "fmla   v20.4s, v4.4s, v0.s[2]      \n"
                        "fmla   v22.4s, v4.4s, v0.s[3]      \n"
                        "fmla   v17.4s, v5.4s, v0.s[0]      \n"
                        "fmla   v19.4s, v5.4s, v0.s[1]      \n"
                        "fmla   v21.4s, v5.4s, v0.s[2]      \n"
                        "fmla   v23.4s, v5.4s, v0.s[3]      \n"
                        "subs   w4, w4, #1                  \n"
                        "fmla   v24.4s, v4.4s, v1.s[0]      \n"
                        "fmla   v26.4s, v4.4s, v1.s[1]      \n"
                        "fmla   v28.4s, v4.4s, v1.s[2]      \n"
                        "fmla   v30.4s, v4.4s, v1.s[3]      \n"
                        "fmla   v25.4s, v5.4s, v1.s[0]      \n"
                        "fmla   v27.4s, v5.4s, v1.s[1]      \n"
                        "fmla   v29.4s, v5.4s, v1.s[2]      \n"
                        "fmla   v31.4s, v5.4s, v1.s[3]      \n"
                        "bne    4b                          \n"

                        "5:                                 \n"
                        "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                        "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                        "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                        "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                        : "=r"(outptr), // %0
                        "=r"(pA),     // %1
                        "=r"(pB)      // %2
                        : "0"(outptr),
                        "1"(pA),
                        "2"(pB),
                        "r"(max_kk), // %6
                        "r"(k)       // %7
                        : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
#else  // NCNN_GNU_INLINE_ASM
                float32x4_t _sum00;
                float32x4_t _sum01;
                float32x4_t _sum10;
                float32x4_t _sum11;
                float32x4_t _sum20;
                float32x4_t _sum21;
                float32x4_t _sum30;
                float32x4_t _sum31;
                float32x4_t _sum40;
                float32x4_t _sum41;
                float32x4_t _sum50;
                float32x4_t _sum51;
                float32x4_t _sum60;
                float32x4_t _sum61;
                float32x4_t _sum70;
                float32x4_t _sum71;

                if (k == 0)
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                    _sum20 = vdupq_n_f32(0.f);
                    _sum21 = vdupq_n_f32(0.f);
                    _sum30 = vdupq_n_f32(0.f);
                    _sum31 = vdupq_n_f32(0.f);
                    _sum40 = vdupq_n_f32(0.f);
                    _sum41 = vdupq_n_f32(0.f);
                    _sum50 = vdupq_n_f32(0.f);
                    _sum51 = vdupq_n_f32(0.f);
                    _sum60 = vdupq_n_f32(0.f);
                    _sum61 = vdupq_n_f32(0.f);
                    _sum70 = vdupq_n_f32(0.f);
                    _sum71 = vdupq_n_f32(0.f);
                }
                else
                {
                    _sum00 = vld1q_f32(outptr);
                    _sum01 = vld1q_f32(outptr + 4 * 1);
                    _sum10 = vld1q_f32(outptr + 4 * 2);
                    _sum11 = vld1q_f32(outptr + 4 * 3);
                    _sum20 = vld1q_f32(outptr + 4 * 4);
                    _sum21 = vld1q_f32(outptr + 4 * 5);
                    _sum30 = vld1q_f32(outptr + 4 * 6);
                    _sum31 = vld1q_f32(outptr + 4 * 7);
                    _sum40 = vld1q_f32(outptr + 4 * 8);
                    _sum41 = vld1q_f32(outptr + 4 * 9);
                    _sum50 = vld1q_f32(outptr + 4 * 10);
                    _sum51 = vld1q_f32(outptr + 4 * 11);
                    _sum60 = vld1q_f32(outptr + 4 * 12);
                    _sum61 = vld1q_f32(outptr + 4 * 13);
                    _sum70 = vld1q_f32(outptr + 4 * 14);
                    _sum71 = vld1q_f32(outptr + 4 * 15);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float32x4_t _pA0 = vld1q_f32(pA);
                    float32x4_t _pA1 = vld1q_f32(pA + 4);

                    float32x4_t _pB0 = vld1q_f32(pB);
                    float32x4_t _pB1 = vld1q_f32(pB + 4);

                    _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                    _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                    _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                    _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                    _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                    _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                    _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                    _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
                    _sum40 = vfmaq_laneq_f32(_sum40, _pA0, _pB1, 0);
                    _sum41 = vfmaq_laneq_f32(_sum41, _pA1, _pB1, 0);
                    _sum50 = vfmaq_laneq_f32(_sum50, _pA0, _pB1, 1);
                    _sum51 = vfmaq_laneq_f32(_sum51, _pA1, _pB1, 1);
                    _sum60 = vfmaq_laneq_f32(_sum60, _pA0, _pB1, 2);
                    _sum61 = vfmaq_laneq_f32(_sum61, _pA1, _pB1, 2);
                    _sum70 = vfmaq_laneq_f32(_sum70, _pA0, _pB1, 3);
                    _sum71 = vfmaq_laneq_f32(_sum71, _pA1, _pB1, 3);

                    pA += 8;
                    pB += 8;
                }

                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
                vst1q_f32(outptr + 4 * 8, _sum40);
                vst1q_f32(outptr + 4 * 9, _sum41);
                vst1q_f32(outptr + 4 * 10, _sum50);
                vst1q_f32(outptr + 4 * 11, _sum51);
                vst1q_f32(outptr + 4 * 12, _sum60);
                vst1q_f32(outptr + 4 * 13, _sum61);
                vst1q_f32(outptr + 4 * 14, _sum70);
                vst1q_f32(outptr + 4 * 15, _sum71);
                outptr += 8 * 8;
#endif // NCNN_GNU_INLINE_ASM
            }
            // 8x4 tile: each kk step reads 8 floats from pA and 4 floats from pB and
            // accumulates sum[j][0..7] += A[0..7] * B[j] for j = 0..3.
            // The 32 results live at outptr as four consecutive groups of 8 floats
            // (one group per B element), and outptr advances by 32 floats per tile.
            // pA restarts from pAT for every tile; pB is not reset inside this loop,
            // so it keeps advancing from where the previous tile stopped.
            for (; jj + 3 < max_jj; jj += 4)
            {
                const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    // operands: %0 = outptr, %1 = pA, %2 = pB, %w6 = max_kk, %w7 = k
                    // accumulators: v24/v25 (B lane 0), v26/v27 (lane 1), v28/v29 (lane 2), v30/v31 (lane 3)
                    // k == 0 -> start from zeroed accumulators at label 0
                    "cbz    %w7, 0f                     \n"

                    // k != 0 -> reload the partial sums previously stored at outptr
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    // undo the 64-byte post-increment so %0 points at the tile start again
                    "subs   %0, %0, #64                 \n"
                    "b      1f                          \n"

                    "0:                                 \n"
                    "eor    v24.16b, v24.16b, v24.16b   \n"
                    "eor    v25.16b, v25.16b, v25.16b   \n"
                    "eor    v26.16b, v26.16b, v26.16b   \n"
                    "eor    v27.16b, v27.16b, v27.16b   \n"
                    "eor    v28.16b, v28.16b, v28.16b   \n"
                    "eor    v29.16b, v29.16b, v29.16b   \n"
                    "eor    v30.16b, v30.16b, v30.16b   \n"
                    "eor    v31.16b, v31.16b, v31.16b   \n"

                    "1:                                 \n"
                    "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    3f                          \n"

                    // main loop, 4 kk steps per iteration:
                    // v0..v3 hold the B values of kk+0..kk+3, v4/v5, v6/v7, v8/v9, v10/v11
                    // hold the 8 A values of kk+0, kk+1, kk+2, kk+3 respectively
                    "2:                                 \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
                    "fmla   v24.4s, v4.4s, v0.s[0]      \n"
                    "fmla   v26.4s, v4.4s, v0.s[1]      \n"
                    "fmla   v28.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v30.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v25.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v27.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v29.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v0.s[3]      \n"
                    "fmla   v24.4s, v6.4s, v1.s[0]      \n"
                    "fmla   v26.4s, v6.4s, v1.s[1]      \n"
                    "fmla   v28.4s, v6.4s, v1.s[2]      \n"
                    "fmla   v30.4s, v6.4s, v1.s[3]      \n"
                    "fmla   v25.4s, v7.4s, v1.s[0]      \n"
                    "fmla   v27.4s, v7.4s, v1.s[1]      \n"
                    "fmla   v29.4s, v7.4s, v1.s[2]      \n"
                    "fmla   v31.4s, v7.4s, v1.s[3]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
                    "fmla   v24.4s, v8.4s, v2.s[0]      \n"
                    "fmla   v26.4s, v8.4s, v2.s[1]      \n"
                    "fmla   v28.4s, v8.4s, v2.s[2]      \n"
                    "fmla   v30.4s, v8.4s, v2.s[3]      \n"
                    "fmla   v25.4s, v9.4s, v2.s[0]      \n"
                    "fmla   v27.4s, v9.4s, v2.s[1]      \n"
                    "fmla   v29.4s, v9.4s, v2.s[2]      \n"
                    "fmla   v31.4s, v9.4s, v2.s[3]      \n"
                    // loop counter is decremented early; fmla does not touch the flags,
                    // so the bne below still sees the result of this subs
                    "subs   w4, w4, #1                  \n"
                    "fmla   v24.4s, v10.4s, v3.s[0]     \n"
                    "fmla   v26.4s, v10.4s, v3.s[1]     \n"
                    "fmla   v28.4s, v10.4s, v3.s[2]     \n"
                    "fmla   v30.4s, v10.4s, v3.s[3]     \n"
                    "fmla   v25.4s, v11.4s, v3.s[0]     \n"
                    "fmla   v27.4s, v11.4s, v3.s[1]     \n"
                    "fmla   v29.4s, v11.4s, v3.s[2]     \n"
                    "fmla   v31.4s, v11.4s, v3.s[3]     \n"
                    "bne    2b                          \n"

                    "3:                                 \n"
                    "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // tail loop, one kk step per iteration: 4 B floats and 8 A floats
                    "4:                                 \n"
                    "ld1    {v0.4s}, [%2], #16          \n"
                    "ld1    {v4.4s, v5.4s}, [%1], #32   \n"
                    "fmla   v24.4s, v4.4s, v0.s[0]      \n"
                    "fmla   v26.4s, v4.4s, v0.s[1]      \n"
                    "fmla   v28.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v30.4s, v4.4s, v0.s[3]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v25.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v27.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v29.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v0.s[3]      \n"
                    "bne    4b                          \n"

                    // write the 32 accumulated floats back; the post-increments leave
                    // outptr pointing at the next tile (128 bytes further)
                    "5:                                 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
                // Intrinsics fallback. _sumJ0 / _sumJ1 hold the lower / upper 4 of the
                // 8 results that belong to B element J.
                float32x4_t _sum00;
                float32x4_t _sum01;
                float32x4_t _sum10;
                float32x4_t _sum11;
                float32x4_t _sum20;
                float32x4_t _sum21;
                float32x4_t _sum30;
                float32x4_t _sum31;

                // k == 0 starts a fresh accumulation; otherwise continue from the
                // partial sums already stored at outptr
                if (k == 0)
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                    _sum20 = vdupq_n_f32(0.f);
                    _sum21 = vdupq_n_f32(0.f);
                    _sum30 = vdupq_n_f32(0.f);
                    _sum31 = vdupq_n_f32(0.f);
                }
                else
                {
                    _sum00 = vld1q_f32(outptr);
                    _sum01 = vld1q_f32(outptr + 4 * 1);
                    _sum10 = vld1q_f32(outptr + 4 * 2);
                    _sum11 = vld1q_f32(outptr + 4 * 3);
                    _sum20 = vld1q_f32(outptr + 4 * 4);
                    _sum21 = vld1q_f32(outptr + 4 * 5);
                    _sum30 = vld1q_f32(outptr + 4 * 6);
                    _sum31 = vld1q_f32(outptr + 4 * 7);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    // 8 A values and 4 B values for this kk step
                    float32x4_t _pA0 = vld1q_f32(pA);
                    float32x4_t _pA1 = vld1q_f32(pA + 4);

                    float32x4_t _pB0 = vld1q_f32(pB);

                    // each accumulator += A vector * one broadcast lane of B
                    _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                    _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                    _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                    _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                    _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                    _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                    _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                    _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);

                    pA += 8;
                    pB += 4;
                }

                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
                outptr += 8 * 4;
#endif // NCNN_GNU_INLINE_ASM
            }
            // 8x2 tile: each kk step reads 8 floats from pA and 2 floats from pB and
            // accumulates sum[j][0..7] += A[0..7] * B[j] for j = 0..1.
            // The 16 results live at outptr as two consecutive groups of 8 floats,
            // and outptr advances by 16 floats per tile.
            // pA restarts from pAT for every tile; pB is not reset inside this loop.
            for (; jj + 1 < max_jj; jj += 2)
            {
                const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    // operands: %0 = outptr, %1 = pA, %2 = pB, %w6 = max_kk, %w7 = k
                    // accumulators: v28/v29 (first B element), v30/v31 (second B element)
                    // k == 0 -> start from zeroed accumulators at label 0
                    "cbz    %w7, 0f                     \n"

                    // k != 0 -> reload the partial sums; no post-increment, so %0 is unchanged
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    "b      1f                          \n"

                    "0:                                 \n"
                    "eor    v28.16b, v28.16b, v28.16b   \n"
                    "eor    v29.16b, v29.16b, v29.16b   \n"
                    "eor    v30.16b, v30.16b, v30.16b   \n"
                    "eor    v31.16b, v31.16b, v31.16b   \n"

                    "1:                                 \n"
                    "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    3f                          \n"

                    // main loop, 4 kk steps per iteration:
                    // v0 = B pairs of kk+0 (lanes 0,1) and kk+1 (lanes 2,3),
                    // v1 = B pairs of kk+2 (lanes 0,1) and kk+3 (lanes 2,3);
                    // v4/v5, v6/v7, v8/v9, v10/v11 = the 8 A values of kk+0..kk+3
                    "2:                                 \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"
                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v0.4s, v1.4s}, [%2], #32   \n"
                    "fmla   v28.4s, v4.4s, v0.s[0]      \n"
                    "fmla   v30.4s, v4.4s, v0.s[1]      \n"
                    "fmla   v29.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v31.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v28.4s, v6.4s, v0.s[2]      \n"
                    "fmla   v30.4s, v6.4s, v0.s[3]      \n"
                    "fmla   v29.4s, v7.4s, v0.s[2]      \n"
                    "fmla   v31.4s, v7.4s, v0.s[3]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
                    "fmla   v28.4s, v8.4s, v1.s[0]      \n"
                    "fmla   v30.4s, v8.4s, v1.s[1]      \n"
                    "fmla   v29.4s, v9.4s, v1.s[0]      \n"
                    "fmla   v31.4s, v9.4s, v1.s[1]      \n"
                    // loop counter is decremented early; fmla does not touch the flags,
                    // so the bne below still sees the result of this subs
                    "subs   w4, w4, #1                  \n"
                    "fmla   v28.4s, v10.4s, v1.s[2]     \n"
                    "fmla   v30.4s, v10.4s, v1.s[3]     \n"
                    "fmla   v29.4s, v11.4s, v1.s[2]     \n"
                    "fmla   v31.4s, v11.4s, v1.s[3]     \n"
                    "bne    2b                          \n"

                    "3:                                 \n"
                    "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // tail loop, one kk step per iteration: 2 B floats and 8 A floats
                    "4:                                 \n"
                    "ld1    {v0.2s}, [%2], #8           \n"
                    "ld1    {v4.4s, v5.4s}, [%1], #32   \n"
                    "fmla   v28.4s, v4.4s, v0.s[0]      \n"
                    "fmla   v30.4s, v4.4s, v0.s[1]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v29.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v31.4s, v5.4s, v0.s[1]      \n"
                    "bne    4b                          \n"

                    // write the 16 accumulated floats back; the post-increment leaves
                    // outptr pointing at the next tile (64 bytes further)
                    "5:                                 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "x4", "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
                // Intrinsics fallback. _sumJ0 / _sumJ1 hold the lower / upper 4 of the
                // 8 results that belong to B element J.
                float32x4_t _sum00;
                float32x4_t _sum01;
                float32x4_t _sum10;
                float32x4_t _sum11;

                // k == 0 starts a fresh accumulation; otherwise continue from the
                // partial sums already stored at outptr
                if (k == 0)
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                }
                else
                {
                    _sum00 = vld1q_f32(outptr);
                    _sum01 = vld1q_f32(outptr + 4 * 1);
                    _sum10 = vld1q_f32(outptr + 4 * 2);
                    _sum11 = vld1q_f32(outptr + 4 * 3);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    // 8 A values and 2 B values (64-bit vector) for this kk step
                    float32x4_t _pA0 = vld1q_f32(pA);
                    float32x4_t _pA1 = vld1q_f32(pA + 4);

                    float32x2_t _pB0 = vld1_f32(pB);

                    // each accumulator += A vector * one broadcast lane of B
                    _sum00 = vfmaq_lane_f32(_sum00, _pA0, _pB0, 0);
                    _sum01 = vfmaq_lane_f32(_sum01, _pA1, _pB0, 0);
                    _sum10 = vfmaq_lane_f32(_sum10, _pA0, _pB0, 1);
                    _sum11 = vfmaq_lane_f32(_sum11, _pA1, _pB0, 1);

                    pA += 8;
                    pB += 2;
                }

                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                outptr += 8 * 2;
#endif // NCNN_GNU_INLINE_ASM
            }
            // 8x1 tile: each kk step reads 8 floats from pA and a single float from pB
            // and accumulates sum[0..7] += A[0..7] * B.
            // The 8 results live at outptr, which advances by 8 floats per tile.
            // pA restarts from pAT for every tile; pB is not reset inside this loop.
            for (; jj < max_jj; jj++)
            {
                const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    // operands: %0 = outptr, %1 = pA, %2 = pB, %w6 = max_kk, %w7 = k
                    // final accumulators: v30 (lower 4 results), v31 (upper 4 results)
                    // k == 0 -> start from zeroed accumulators at label 0
                    "cbz    %w7, 0f                     \n"

                    // k != 0 -> reload the partial sums; no post-increment, so %0 is unchanged
                    "ld1    {v30.4s, v31.4s}, [%0]      \n"
                    "b      1f                          \n"

                    "0:                                 \n"
                    "eor    v30.16b, v30.16b, v30.16b   \n"
                    "eor    v31.16b, v31.16b, v31.16b   \n"

                    "1:                                 \n"
                    "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    3f                          \n"

                    // v28/v29 are a second, temporary accumulator pair used only by the
                    // unrolled loop, giving two independent fmla chains; they are folded
                    // into v30/v31 after the loop. As a consequence the floating-point
                    // summation order here differs from the intrinsics fallback below.
                    "eor    v28.16b, v28.16b, v28.16b   \n"
                    "eor    v29.16b, v29.16b, v29.16b   \n"
                    // main loop, 4 kk steps per iteration:
                    // v0 lanes 0..3 = B values of kk+0..kk+3;
                    // v4/v5, v6/v7, v8/v9, v10/v11 = the 8 A values of kk+0..kk+3;
                    // kk+0 and kk+2 go to v28/v29, kk+1 and kk+3 go to v30/v31
                    "2:                                 \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"
                    "prfm   pldl1keep, [%2, #128]       \n"
                    "ld1    {v0.4s}, [%2], #16          \n"
                    "fmla   v28.4s, v4.4s, v0.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v30.4s, v6.4s, v0.s[1]      \n"
                    "fmla   v31.4s, v7.4s, v0.s[1]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
                    "fmla   v28.4s, v8.4s, v0.s[2]      \n"
                    "fmla   v29.4s, v9.4s, v0.s[2]      \n"
                    // loop counter is decremented early; fmla does not touch the flags,
                    // so the bne below still sees the result of this subs
                    "subs   w4, w4, #1                  \n"
                    "fmla   v30.4s, v10.4s, v0.s[3]     \n"
                    "fmla   v31.4s, v11.4s, v0.s[3]     \n"
                    "bne    2b                          \n"
                    // merge the temporary accumulators into the final ones
                    "fadd   v30.4s, v30.4s, v28.4s      \n"
                    "fadd   v31.4s, v31.4s, v29.4s      \n"

                    "3:                                 \n"
                    "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // tail loop, one kk step per iteration: ld1r broadcasts the single
                    // B float to all 4 lanes of v0
                    "4:                                 \n"
                    "ld1r   {v0.4s}, [%2], #4           \n"
                    "ld1    {v4.4s, v5.4s}, [%1], #32   \n"
                    "fmla   v30.4s, v4.4s, v0.4s        \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v31.4s, v5.4s, v0.4s        \n"
                    "bne    4b                          \n"

                    // write the 8 accumulated floats back; the post-increment leaves
                    // outptr pointing at the next tile (32 bytes further)
                    "5:                                 \n"
                    "st1    {v30.4s, v31.4s}, [%0], #32 \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "x4", "v0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
                // Intrinsics fallback. _sum00 / _sum01 hold the lower / upper 4 results.
                float32x4_t _sum00;
                float32x4_t _sum01;

                // k == 0 starts a fresh accumulation; otherwise continue from the
                // partial sums already stored at outptr
                if (k == 0)
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                }
                else
                {
                    _sum00 = vld1q_f32(outptr);
                    _sum01 = vld1q_f32(outptr + 4);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    // 8 A values for this kk step
                    float32x4_t _pA0 = vld1q_f32(pA);
                    float32x4_t _pA1 = vld1q_f32(pA + 4);

                    // single B value broadcast to all 4 lanes
                    float32x4_t _pB = vld1q_dup_f32(pB);

                    _sum00 = vfmaq_f32(_sum00, _pA0, _pB);
                    _sum01 = vfmaq_f32(_sum01, _pA1, _pB);

                    pA += 8;
                    pB += 1;
                }

                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                outptr += 8;
#endif // NCNN_GNU_INLINE_ASM
            }
        }
    }
#endif // __aarch64__
    for (; ii + 3 < max_ii; ii += 4)
    {
        for (int b = 0; b < batch; b++)
        {
            const float* pAT = AT_tile.row(b) + max_kk * ii;
            const float* pB = BT_tile.row(b);

            int jj = 0;
#if __aarch64__
            // 4x12 tile step: for every kk, 4 floats are read from pA and 12 floats from pB,
            // and each of the 12 accumulator vectors receives pA * pB[j] (vector times scalar lane).
            // The 12 result vectors (48 floats) are written contiguously to outptr.
            // When k == 0 the accumulators start at zero, otherwise they are first read back from outptr.
            // NOTE(review): k presumably identifies the current chunk of the reduction dimension,
            // with k == 0 being the first chunk - confirm against the caller.
            for (; jj + 11 < max_jj; jj += 12)
            {
                // pA restarts from pAT on every jj step, while pB and outptr keep advancing
                const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
                // Register roles: v20-v31 = 12 accumulators, v16-v19 = pA data,
                // v0-v7 = pB data, w4 = loop counter.
                // Operands %3-%5 are the matching-constraint inputs for %0-%2,
                // which is why max_kk is %6 and k is %7.
                asm volatile(
                    // k == 0 -> jump to label 0 and clear the accumulators
                    "cbz    %w7, 0f                     \n"

                    // k != 0: reload the 48 partial sums; the first two loads post-increment %0
                    // by 64 bytes each, so %0 is moved back by 128 afterwards
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    "subs   %0, %0, #128                \n"
                    "b      1f                          \n"

                    "0:                                 \n"
                    "eor    v20.16b, v20.16b, v20.16b   \n"
                    "eor    v21.16b, v21.16b, v21.16b   \n"
                    "eor    v22.16b, v22.16b, v22.16b   \n"
                    "eor    v23.16b, v23.16b, v23.16b   \n"
                    "eor    v24.16b, v24.16b, v24.16b   \n"
                    "eor    v25.16b, v25.16b, v25.16b   \n"
                    "eor    v26.16b, v26.16b, v26.16b   \n"
                    "eor    v27.16b, v27.16b, v27.16b   \n"
                    "eor    v28.16b, v28.16b, v28.16b   \n"
                    "eor    v29.16b, v29.16b, v29.16b   \n"
                    "eor    v30.16b, v30.16b, v30.16b   \n"
                    "eor    v31.16b, v31.16b, v31.16b   \n"

                    "1:                                 \n"
                    "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    3f                          \n"

                    // main loop, 4 kk steps per iteration:
                    // 16 floats of pA (v16-v19) and 48 floats of pB (three 64-byte loads)
                    "2:                                 \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
                    // kk + 0: pA = v16, pB = v0 v1 v2
                    "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                    "fmla   v21.4s, v16.4s, v0.s[1]     \n"
                    "fmla   v22.4s, v16.4s, v0.s[2]     \n"
                    "fmla   v23.4s, v16.4s, v0.s[3]     \n"
                    "fmla   v24.4s, v16.4s, v1.s[0]     \n"
                    "fmla   v25.4s, v16.4s, v1.s[1]     \n"
                    "fmla   v26.4s, v16.4s, v1.s[2]     \n"
                    "fmla   v27.4s, v16.4s, v1.s[3]     \n"
                    "fmla   v28.4s, v16.4s, v2.s[0]     \n"
                    "fmla   v29.4s, v16.4s, v2.s[1]     \n"
                    "fmla   v30.4s, v16.4s, v2.s[2]     \n"
                    "fmla   v31.4s, v16.4s, v2.s[3]     \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n"
                    // kk + 1: pA = v17, pB = v3 v4 v5
                    "fmla   v20.4s, v17.4s, v3.s[0]     \n"
                    "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                    "fmla   v22.4s, v17.4s, v3.s[2]     \n"
                    "fmla   v23.4s, v17.4s, v3.s[3]     \n"
                    "fmla   v24.4s, v17.4s, v4.s[0]     \n"
                    "fmla   v25.4s, v17.4s, v4.s[1]     \n"
                    "fmla   v26.4s, v17.4s, v4.s[2]     \n"
                    "fmla   v27.4s, v17.4s, v4.s[3]     \n"
                    "fmla   v28.4s, v17.4s, v5.s[0]     \n"
                    "fmla   v29.4s, v17.4s, v5.s[1]     \n"
                    "fmla   v30.4s, v17.4s, v5.s[2]     \n"
                    "fmla   v31.4s, v17.4s, v5.s[3]     \n"
                    // v0-v3 are reused for the third pB load; v3 was last read in the kk + 1 group above
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
                    // kk + 2: pA = v18, pB = v6 v7 v0
                    "fmla   v20.4s, v18.4s, v6.s[0]     \n"
                    "fmla   v21.4s, v18.4s, v6.s[1]     \n"
                    "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                    "fmla   v23.4s, v18.4s, v6.s[3]     \n"
                    "fmla   v24.4s, v18.4s, v7.s[0]     \n"
                    "fmla   v25.4s, v18.4s, v7.s[1]     \n"
                    "fmla   v26.4s, v18.4s, v7.s[2]     \n"
                    "fmla   v27.4s, v18.4s, v7.s[3]     \n"
                    "fmla   v28.4s, v18.4s, v0.s[0]     \n"
                    "fmla   v29.4s, v18.4s, v0.s[1]     \n"
                    "fmla   v30.4s, v18.4s, v0.s[2]     \n"
                    "fmla   v31.4s, v18.4s, v0.s[3]     \n"
                    // counter update is issued early; the fmla instructions that follow
                    // do not write the condition flags, so bne below still tests this result
                    "subs   w4, w4, #1                  \n"
                    // kk + 3: pA = v19, pB = v1 v2 v3
                    "fmla   v20.4s, v19.4s, v1.s[0]     \n"
                    "fmla   v21.4s, v19.4s, v1.s[1]     \n"
                    "fmla   v22.4s, v19.4s, v1.s[2]     \n"
                    "fmla   v23.4s, v19.4s, v1.s[3]     \n"
                    "fmla   v24.4s, v19.4s, v2.s[0]     \n"
                    "fmla   v25.4s, v19.4s, v2.s[1]     \n"
                    "fmla   v26.4s, v19.4s, v2.s[2]     \n"
                    "fmla   v27.4s, v19.4s, v2.s[3]     \n"
                    "fmla   v28.4s, v19.4s, v3.s[0]     \n"
                    "fmla   v29.4s, v19.4s, v3.s[1]     \n"
                    "fmla   v30.4s, v19.4s, v3.s[2]     \n"
                    "fmla   v31.4s, v19.4s, v3.s[3]     \n"
                    "bne    2b                          \n"

                    "3:                                 \n"
                    "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // tail loop, one kk step per iteration: 12 floats of pB, 4 floats of pA
                    "4:                                 \n"
                    "ld1    {v0.4s, v1.4s, v2.4s}, [%2], #48 \n"
                    "ld1    {v16.4s}, [%1], #16         \n"
                    "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                    "fmla   v21.4s, v16.4s, v0.s[1]     \n"
                    "fmla   v22.4s, v16.4s, v0.s[2]     \n"
                    "fmla   v23.4s, v16.4s, v0.s[3]     \n"
                    "fmla   v24.4s, v16.4s, v1.s[0]     \n"
                    "fmla   v25.4s, v16.4s, v1.s[1]     \n"
                    "fmla   v26.4s, v16.4s, v1.s[2]     \n"
                    "fmla   v27.4s, v16.4s, v1.s[3]     \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v28.4s, v16.4s, v2.s[0]     \n"
                    "fmla   v29.4s, v16.4s, v2.s[1]     \n"
                    "fmla   v30.4s, v16.4s, v2.s[2]     \n"
                    "fmla   v31.4s, v16.4s, v2.s[3]     \n"
                    "bne    4b                          \n"

                    // write back all 12 accumulators; %0 (outptr) ends up 192 bytes further
                    "5:                                 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
                // intrinsics fallback: one accumulator vector per pB column (_sum0 .. _sumb)
                float32x4_t _sum0;
                float32x4_t _sum1;
                float32x4_t _sum2;
                float32x4_t _sum3;
                float32x4_t _sum4;
                float32x4_t _sum5;
                float32x4_t _sum6;
                float32x4_t _sum7;
                float32x4_t _sum8;
                float32x4_t _sum9;
                float32x4_t _suma;
                float32x4_t _sumb;

                if (k == 0)
                {
                    // start accumulation from zero
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                    _sum2 = vdupq_n_f32(0.f);
                    _sum3 = vdupq_n_f32(0.f);
                    _sum4 = vdupq_n_f32(0.f);
                    _sum5 = vdupq_n_f32(0.f);
                    _sum6 = vdupq_n_f32(0.f);
                    _sum7 = vdupq_n_f32(0.f);
                    _sum8 = vdupq_n_f32(0.f);
                    _sum9 = vdupq_n_f32(0.f);
                    _suma = vdupq_n_f32(0.f);
                    _sumb = vdupq_n_f32(0.f);
                }
                else
                {
                    // continue from the values already stored at outptr
                    _sum0 = vld1q_f32(outptr);
                    _sum1 = vld1q_f32(outptr + 4);
                    _sum2 = vld1q_f32(outptr + 8);
                    _sum3 = vld1q_f32(outptr + 12);
                    _sum4 = vld1q_f32(outptr + 16);
                    _sum5 = vld1q_f32(outptr + 20);
                    _sum6 = vld1q_f32(outptr + 24);
                    _sum7 = vld1q_f32(outptr + 28);
                    _sum8 = vld1q_f32(outptr + 32);
                    _sum9 = vld1q_f32(outptr + 36);
                    _suma = vld1q_f32(outptr + 40);
                    _sumb = vld1q_f32(outptr + 44);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    // 4 values from pA, 12 values from pB per step
                    float32x4_t _pA = vld1q_f32(pA);
                    float32x4_t _pB0 = vld1q_f32(pB);
                    float32x4_t _pB1 = vld1q_f32(pB + 4);
                    float32x4_t _pB2 = vld1q_f32(pB + 8);

                    // accumulator j += _pA * (j-th pB value)
                    _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB0, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB0, 2);
                    _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB0, 3);
                    _sum4 = vfmaq_laneq_f32(_sum4, _pA, _pB1, 0);
                    _sum5 = vfmaq_laneq_f32(_sum5, _pA, _pB1, 1);
                    _sum6 = vfmaq_laneq_f32(_sum6, _pA, _pB1, 2);
                    _sum7 = vfmaq_laneq_f32(_sum7, _pA, _pB1, 3);
                    _sum8 = vfmaq_laneq_f32(_sum8, _pA, _pB2, 0);
                    _sum9 = vfmaq_laneq_f32(_sum9, _pA, _pB2, 1);
                    _suma = vfmaq_laneq_f32(_suma, _pA, _pB2, 2);
                    _sumb = vfmaq_laneq_f32(_sumb, _pA, _pB2, 3);

                    pA += 4;
                    pB += 12;
                }

                // store the 12 vectors back to back and step outptr past them
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
                vst1q_f32(outptr + 4 * 4, _sum4);
                vst1q_f32(outptr + 4 * 5, _sum5);
                vst1q_f32(outptr + 4 * 6, _sum6);
                vst1q_f32(outptr + 4 * 7, _sum7);
                vst1q_f32(outptr + 4 * 8, _sum8);
                vst1q_f32(outptr + 4 * 9, _sum9);
                vst1q_f32(outptr + 4 * 10, _suma);
                vst1q_f32(outptr + 4 * 11, _sumb);
                outptr += 4 * 12;
#endif // NCNN_GNU_INLINE_ASM
            }
#endif // __aarch64__
            // 4x8 tile step: for every kk, 4 floats are read from pA and 8 floats from pB,
            // and each of the 8 accumulator vectors receives pA * pB[j] (vector times scalar lane).
            // The 8 result vectors (32 floats) are written contiguously to outptr.
            // When k == 0 the accumulators start at zero, otherwise they are first read back from outptr.
            // NOTE(review): k presumably identifies the current chunk of the reduction dimension,
            // with k == 0 being the first chunk - confirm against the caller.
            for (; jj + 7 < max_jj; jj += 8)
            {
                // pA restarts from pAT on every jj step, while pB and outptr keep advancing
                const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                // Register roles: v24-v31 = 8 accumulators, v16-v19 = pA data,
                // v0-v7 = pB data, w4 = loop counter.
                // Operands %3-%5 are the matching-constraint inputs for %0-%2,
                // which is why max_kk is %6 and k is %7.
                asm volatile(
                    // k == 0 -> jump to label 0 and clear the accumulators
                    "cbz    %w7, 0f                     \n"

                    // k != 0: reload the 32 partial sums; only the first load post-increments %0,
                    // so %0 is moved back by 64 afterwards
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    "subs   %0, %0, #64                 \n"
                    "b      1f                          \n"

                    "0:                                 \n"
                    "eor    v24.16b, v24.16b, v24.16b   \n"
                    "eor    v25.16b, v25.16b, v25.16b   \n"
                    "eor    v26.16b, v26.16b, v26.16b   \n"
                    "eor    v27.16b, v27.16b, v27.16b   \n"
                    "eor    v28.16b, v28.16b, v28.16b   \n"
                    "eor    v29.16b, v29.16b, v29.16b   \n"
                    "eor    v30.16b, v30.16b, v30.16b   \n"
                    "eor    v31.16b, v31.16b, v31.16b   \n"

                    "1:                                 \n"
                    "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    3f                          \n"

                    // main loop, 4 kk steps per iteration:
                    // 16 floats of pA (v16-v19) and 32 floats of pB (two 64-byte loads)
                    "2:                                 \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
                    // kk + 0: pA = v16, pB = v0 v1
                    "fmla   v24.4s, v16.4s, v0.s[0]     \n"
                    "fmla   v25.4s, v16.4s, v0.s[1]     \n"
                    "fmla   v26.4s, v16.4s, v0.s[2]     \n"
                    "fmla   v27.4s, v16.4s, v0.s[3]     \n"
                    "fmla   v28.4s, v16.4s, v1.s[0]     \n"
                    "fmla   v29.4s, v16.4s, v1.s[1]     \n"
                    "fmla   v30.4s, v16.4s, v1.s[2]     \n"
                    "fmla   v31.4s, v16.4s, v1.s[3]     \n"
                    // kk + 1: pA = v17, pB = v2 v3
                    "fmla   v24.4s, v17.4s, v2.s[0]     \n"
                    "fmla   v25.4s, v17.4s, v2.s[1]     \n"
                    "fmla   v26.4s, v17.4s, v2.s[2]     \n"
                    "fmla   v27.4s, v17.4s, v2.s[3]     \n"
                    "fmla   v28.4s, v17.4s, v3.s[0]     \n"
                    "fmla   v29.4s, v17.4s, v3.s[1]     \n"
                    "fmla   v30.4s, v17.4s, v3.s[2]     \n"
                    "fmla   v31.4s, v17.4s, v3.s[3]     \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n"
                    // kk + 2: pA = v18, pB = v4 v5
                    "fmla   v24.4s, v18.4s, v4.s[0]     \n"
                    "fmla   v25.4s, v18.4s, v4.s[1]     \n"
                    "fmla   v26.4s, v18.4s, v4.s[2]     \n"
                    "fmla   v27.4s, v18.4s, v4.s[3]     \n"
                    "fmla   v28.4s, v18.4s, v5.s[0]     \n"
                    "fmla   v29.4s, v18.4s, v5.s[1]     \n"
                    "fmla   v30.4s, v18.4s, v5.s[2]     \n"
                    "fmla   v31.4s, v18.4s, v5.s[3]     \n"
                    // counter update is issued early; the fmla instructions that follow
                    // do not write the condition flags, so bne below still tests this result
                    "subs   w4, w4, #1                  \n"
                    // kk + 3: pA = v19, pB = v6 v7
                    "fmla   v24.4s, v19.4s, v6.s[0]     \n"
                    "fmla   v25.4s, v19.4s, v6.s[1]     \n"
                    "fmla   v26.4s, v19.4s, v6.s[2]     \n"
                    "fmla   v27.4s, v19.4s, v6.s[3]     \n"
                    "fmla   v28.4s, v19.4s, v7.s[0]     \n"
                    "fmla   v29.4s, v19.4s, v7.s[1]     \n"
                    "fmla   v30.4s, v19.4s, v7.s[2]     \n"
                    "fmla   v31.4s, v19.4s, v7.s[3]     \n"
                    "bne    2b                          \n"

                    "3:                                 \n"
                    "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // tail loop, one kk step per iteration: 8 floats of pB, 4 floats of pA
                    "4:                                 \n"
                    "ld1    {v0.4s, v1.4s}, [%2], #32   \n"
                    "ld1    {v16.4s}, [%1], #16         \n"
                    "fmla   v24.4s, v16.4s, v0.s[0]     \n"
                    "fmla   v25.4s, v16.4s, v0.s[1]     \n"
                    "fmla   v26.4s, v16.4s, v0.s[2]     \n"
                    "fmla   v27.4s, v16.4s, v0.s[3]     \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v28.4s, v16.4s, v1.s[0]     \n"
                    "fmla   v29.4s, v16.4s, v1.s[1]     \n"
                    "fmla   v30.4s, v16.4s, v1.s[2]     \n"
                    "fmla   v31.4s, v16.4s, v1.s[3]     \n"
                    "bne    4b                          \n"

                    // write back all 8 accumulators; %0 (outptr) ends up 128 bytes further
                    "5:                                 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
                // armv7 variant of the same kernel.
                // Register roles: q8-q15 (d16-d31) = 8 accumulators, q4-q7 (d8-d15) = pA data,
                // q0-q3 (d0-d7) = pB data, r4 = loop counter.
                asm volatile(
                    // k == 0 -> jump to label 0 and clear the accumulators
                    "cmp        %7, #0              \n"
                    "beq        0f                  \n"

                    // k != 0: reload the 32 partial sums; the first vldm writes back %0 (+64 bytes),
                    // the second does not, so %0 is moved back by 64 afterwards
                    "vldm       %0!, {d16-d23}      \n"
                    "vldm       %0, {d24-d31}       \n"
                    "sub        %0, %0, #64         \n"
                    "b          1f                  \n"

                    "0:                             \n"
                    "veor       q8, q8              \n"
                    "veor       q9, q9              \n"
                    "veor       q10, q10            \n"
                    "veor       q11, q11            \n"
                    "veor       q12, q12            \n"
                    "veor       q13, q13            \n"
                    "veor       q14, q14            \n"
                    "veor       q15, q15            \n"

                    "1:                             \n"
                    "lsr        r4, %6, #2          \n" // r4 = max_kk >> 2
                    "cmp        r4, #0              \n"
                    "beq        3f                  \n"

                    // main loop, 4 kk steps per iteration:
                    // 16 floats of pA (q4-q7) and 32 floats of pB (q0-q3 loaded twice)
                    "2:                             \n"
                    "pld        [%1, #512]          \n"
                    "vldm       %1!, {d8-d15}       \n"
                    "pld        [%2, #512]          \n"
                    "vldm       %2!, {d0-d7}        \n"
                    // kk + 0: pA = q4, pB = d0-d3
                    "vmla.f32   q8, q4, d0[0]       \n"
                    "vmla.f32   q9, q4, d0[1]       \n"
                    "vmla.f32   q10, q4, d1[0]      \n"
                    "vmla.f32   q11, q4, d1[1]      \n"
                    "vmla.f32   q12, q4, d2[0]      \n"
                    "vmla.f32   q13, q4, d2[1]      \n"
                    "vmla.f32   q14, q4, d3[0]      \n"
                    "vmla.f32   q15, q4, d3[1]      \n"
                    // kk + 1: pA = q5, pB = d4-d7
                    "vmla.f32   q8, q5, d4[0]       \n"
                    "vmla.f32   q9, q5, d4[1]       \n"
                    "vmla.f32   q10, q5, d5[0]      \n"
                    "vmla.f32   q11, q5, d5[1]      \n"
                    "vmla.f32   q12, q5, d6[0]      \n"
                    "vmla.f32   q13, q5, d6[1]      \n"
                    "vmla.f32   q14, q5, d7[0]      \n"
                    "vmla.f32   q15, q5, d7[1]      \n"
                    "pld        [%2, #512]          \n"
                    "vldm       %2!, {d0-d7}        \n"
                    // kk + 2: pA = q6, pB = d0-d3 (second load)
                    "vmla.f32   q8, q6, d0[0]       \n"
                    "vmla.f32   q9, q6, d0[1]       \n"
                    "vmla.f32   q10, q6, d1[0]      \n"
                    "vmla.f32   q11, q6, d1[1]      \n"
                    "vmla.f32   q12, q6, d2[0]      \n"
                    "vmla.f32   q13, q6, d2[1]      \n"
                    "vmla.f32   q14, q6, d3[0]      \n"
                    "vmla.f32   q15, q6, d3[1]      \n"
                    // counter update is issued early; the vmla instructions that follow
                    // do not write the integer condition flags tested by bne below
                    "subs       r4, r4, #1          \n"
                    // kk + 3: pA = q7, pB = d4-d7 (second load)
                    "vmla.f32   q8, q7, d4[0]       \n"
                    "vmla.f32   q9, q7, d4[1]       \n"
                    "vmla.f32   q10, q7, d5[0]      \n"
                    "vmla.f32   q11, q7, d5[1]      \n"
                    "vmla.f32   q12, q7, d6[0]      \n"
                    "vmla.f32   q13, q7, d6[1]      \n"
                    "vmla.f32   q14, q7, d7[0]      \n"
                    "vmla.f32   q15, q7, d7[1]      \n"
                    "bne        2b                  \n"

                    "3:                             \n"
                    "and        r4, %6, #3          \n" // r4 = remain = max_kk & 3
                    "cmp        r4, #0              \n"
                    "beq        5f                  \n"

                    // tail loop, one kk step per iteration: 8 floats of pB, 4 floats of pA
                    // (the pA load carries a :128 alignment qualifier)
                    "4:                             \n"
                    "vldm       %2!, {d0-d3}        \n"
                    "vld1.f32   {d8-d9}, [%1 :128]! \n"
                    "vmla.f32   q8, q4, d0[0]       \n"
                    "vmla.f32   q9, q4, d0[1]       \n"
                    "vmla.f32   q10, q4, d1[0]      \n"
                    "vmla.f32   q11, q4, d1[1]      \n"
                    "subs       r4, r4, #1          \n"
                    "vmla.f32   q12, q4, d2[0]      \n"
                    "vmla.f32   q13, q4, d2[1]      \n"
                    "vmla.f32   q14, q4, d3[0]      \n"
                    "vmla.f32   q15, q4, d3[1]      \n"
                    "bne        4b                  \n"

                    // write back all 8 accumulators; %0 (outptr) ends up 128 bytes further
                    "5:                             \n"
                    "vstm       %0!, {d16-d23}      \n"
                    "vstm       %0!, {d24-d31}      \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                // intrinsics fallback: one accumulator vector per pB column (_sum0 .. _sum7)
                float32x4_t _sum0;
                float32x4_t _sum1;
                float32x4_t _sum2;
                float32x4_t _sum3;
                float32x4_t _sum4;
                float32x4_t _sum5;
                float32x4_t _sum6;
                float32x4_t _sum7;

                if (k == 0)
                {
                    // start accumulation from zero
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                    _sum2 = vdupq_n_f32(0.f);
                    _sum3 = vdupq_n_f32(0.f);
                    _sum4 = vdupq_n_f32(0.f);
                    _sum5 = vdupq_n_f32(0.f);
                    _sum6 = vdupq_n_f32(0.f);
                    _sum7 = vdupq_n_f32(0.f);
                }
                else
                {
                    // continue from the values already stored at outptr
                    _sum0 = vld1q_f32(outptr);
                    _sum1 = vld1q_f32(outptr + 4);
                    _sum2 = vld1q_f32(outptr + 8);
                    _sum3 = vld1q_f32(outptr + 12);
                    _sum4 = vld1q_f32(outptr + 16);
                    _sum5 = vld1q_f32(outptr + 20);
                    _sum6 = vld1q_f32(outptr + 24);
                    _sum7 = vld1q_f32(outptr + 28);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    // 4 values from pA, 8 values from pB per step
                    float32x4_t _pA = vld1q_f32(pA);
                    float32x4_t _pB0 = vld1q_f32(pB);
                    float32x4_t _pB1 = vld1q_f32(pB + 4);

                    // accumulator j += _pA * (j-th pB value)
#if __aarch64__
                    _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB0, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB0, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB0, 2);
                    _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB0, 3);
                    _sum4 = vfmaq_laneq_f32(_sum4, _pA, _pB1, 0);
                    _sum5 = vfmaq_laneq_f32(_sum5, _pA, _pB1, 1);
                    _sum6 = vfmaq_laneq_f32(_sum6, _pA, _pB1, 2);
                    _sum7 = vfmaq_laneq_f32(_sum7, _pA, _pB1, 3);
#else
                    // armv7: lane selection goes through the low/high 64-bit halves
                    _sum0 = vmlaq_lane_f32(_sum0, _pA, vget_low_f32(_pB0), 0);
                    _sum1 = vmlaq_lane_f32(_sum1, _pA, vget_low_f32(_pB0), 1);
                    _sum2 = vmlaq_lane_f32(_sum2, _pA, vget_high_f32(_pB0), 0);
                    _sum3 = vmlaq_lane_f32(_sum3, _pA, vget_high_f32(_pB0), 1);
                    _sum4 = vmlaq_lane_f32(_sum4, _pA, vget_low_f32(_pB1), 0);
                    _sum5 = vmlaq_lane_f32(_sum5, _pA, vget_low_f32(_pB1), 1);
                    _sum6 = vmlaq_lane_f32(_sum6, _pA, vget_high_f32(_pB1), 0);
                    _sum7 = vmlaq_lane_f32(_sum7, _pA, vget_high_f32(_pB1), 1);
#endif

                    pA += 4;
                    pB += 8;
                }

                // store the 8 vectors back to back and step outptr past them
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
                vst1q_f32(outptr + 4 * 4, _sum4);
                vst1q_f32(outptr + 4 * 5, _sum5);
                vst1q_f32(outptr + 4 * 6, _sum6);
                vst1q_f32(outptr + 4 * 7, _sum7);
                outptr += 4 * 8;
#endif // NCNN_GNU_INLINE_ASM
            }
            // 4x4 tile step: for every kk, 4 floats are read from pA and 4 floats from pB,
            // and each of the 4 accumulator vectors receives pA * pB[j] (vector times scalar lane).
            // The 4 result vectors (16 floats) are written contiguously to outptr.
            // When k == 0 the accumulators start at zero, otherwise they are first read back from outptr.
            // NOTE(review): k presumably identifies the current chunk of the reduction dimension,
            // with k == 0 being the first chunk - confirm against the caller.
            for (; jj + 3 < max_jj; jj += 4)
            {
                // pA restarts from pAT on every jj step, while pB and outptr keep advancing
                const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                // Register roles: v28-v31 = 4 accumulators, v16-v19 = pA data,
                // v0-v3 = pB data, w4 = loop counter.
                // Operands %3-%5 are the matching-constraint inputs for %0-%2,
                // which is why max_kk is %6 and k is %7.
                asm volatile(
                    // k == 0 -> jump to label 0 and clear the accumulators
                    "cbz    %w7, 0f                     \n"

                    // k != 0: reload the 16 partial sums; no post-increment, so %0 needs no rewind
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0] \n"
                    "b      1f                          \n"

                    "0:                                 \n"
                    "eor    v28.16b, v28.16b, v28.16b   \n"
                    "eor    v29.16b, v29.16b, v29.16b   \n"
                    "eor    v30.16b, v30.16b, v30.16b   \n"
                    "eor    v31.16b, v31.16b, v31.16b   \n"

                    "1:                                 \n"
                    "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    3f                          \n"

                    // main loop, 4 kk steps per iteration:
                    // 16 floats of pA (v16-v19) and 16 floats of pB (v0-v3)
                    "2:                                 \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
                    // kk + 0: pA = v16, pB = v0
                    "fmla   v28.4s, v16.4s, v0.s[0]     \n"
                    "fmla   v29.4s, v16.4s, v0.s[1]     \n"
                    "fmla   v30.4s, v16.4s, v0.s[2]     \n"
                    "fmla   v31.4s, v16.4s, v0.s[3]     \n"
                    // kk + 1: pA = v17, pB = v1
                    "fmla   v28.4s, v17.4s, v1.s[0]     \n"
                    "fmla   v29.4s, v17.4s, v1.s[1]     \n"
                    "fmla   v30.4s, v17.4s, v1.s[2]     \n"
                    "fmla   v31.4s, v17.4s, v1.s[3]     \n"
                    // counter update is issued early; the fmla instructions that follow
                    // do not write the condition flags, so bne below still tests this result
                    "subs   w4, w4, #1                  \n"
                    // kk + 2: pA = v18, pB = v2
                    "fmla   v28.4s, v18.4s, v2.s[0]     \n"
                    "fmla   v29.4s, v18.4s, v2.s[1]     \n"
                    "fmla   v30.4s, v18.4s, v2.s[2]     \n"
                    "fmla   v31.4s, v18.4s, v2.s[3]     \n"
                    // kk + 3: pA = v19, pB = v3
                    "fmla   v28.4s, v19.4s, v3.s[0]     \n"
                    "fmla   v29.4s, v19.4s, v3.s[1]     \n"
                    "fmla   v30.4s, v19.4s, v3.s[2]     \n"
                    "fmla   v31.4s, v19.4s, v3.s[3]     \n"
                    "bne    2b                          \n"

                    "3:                                 \n"
                    "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // tail loop, one kk step per iteration: 4 floats of pB, 4 floats of pA
                    "4:                                 \n"
                    "ld1    {v0.4s}, [%2], #16          \n"
                    "ld1    {v16.4s}, [%1], #16         \n"
                    "fmla   v28.4s, v16.4s, v0.s[0]     \n"
                    "fmla   v29.4s, v16.4s, v0.s[1]     \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v30.4s, v16.4s, v0.s[2]     \n"
                    "fmla   v31.4s, v16.4s, v0.s[3]     \n"
                    "bne    4b                          \n"

                    // write back the 4 accumulators; %0 (outptr) ends up 64 bytes further
                    "5:                                 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31");
#else  // __aarch64__
                // armv7 variant of the same kernel.
                // Register roles: q12-q15 (d24-d31) = 4 accumulators, q4-q7 (d8-d15) = pA data,
                // q0-q3 (d0-d7) = pB data, r4 = loop counter.
                asm volatile(
                    // k == 0 -> jump to label 0 and clear the accumulators
                    "cmp        %7, #0              \n"
                    "beq        0f                  \n"

                    // k != 0: reload the 16 partial sums; no writeback, so %0 needs no rewind
                    "vldm       %0, {d24-d31}       \n"
                    "b          1f                  \n"

                    "0:                             \n"
                    "veor       q12, q12            \n"
                    "veor       q13, q13            \n"
                    "veor       q14, q14            \n"
                    "veor       q15, q15            \n"

                    "1:                             \n"
                    "lsr        r4, %6, #2          \n" // r4 = max_kk >> 2
                    "cmp        r4, #0              \n"
                    "beq        3f                  \n"

                    // main loop, 4 kk steps per iteration:
                    // 16 floats of pA (q4-q7) and 16 floats of pB (q0-q3)
                    "2:                             \n"
                    "pld        [%1, #512]          \n"
                    "vldm       %1!, {d8-d15}       \n"
                    "pld        [%2, #512]          \n"
                    "vldm       %2!, {d0-d7}        \n"
                    // kk + 0: pA = q4, pB = d0 d1
                    "vmla.f32   q12, q4, d0[0]      \n"
                    "vmla.f32   q13, q4, d0[1]      \n"
                    "vmla.f32   q14, q4, d1[0]      \n"
                    "vmla.f32   q15, q4, d1[1]      \n"
                    // kk + 1: pA = q5, pB = d2 d3
                    "vmla.f32   q12, q5, d2[0]      \n"
                    "vmla.f32   q13, q5, d2[1]      \n"
                    "vmla.f32   q14, q5, d3[0]      \n"
                    "vmla.f32   q15, q5, d3[1]      \n"
                    // counter update is issued early; the vmla instructions that follow
                    // do not write the integer condition flags tested by bne below
                    "subs       r4, r4, #1          \n"
                    // kk + 2: pA = q6, pB = d4 d5
                    "vmla.f32   q12, q6, d4[0]      \n"
                    "vmla.f32   q13, q6, d4[1]      \n"
                    "vmla.f32   q14, q6, d5[0]      \n"
                    "vmla.f32   q15, q6, d5[1]      \n"
                    // kk + 3: pA = q7, pB = d6 d7
                    "vmla.f32   q12, q7, d6[0]      \n"
                    "vmla.f32   q13, q7, d6[1]      \n"
                    "vmla.f32   q14, q7, d7[0]      \n"
                    "vmla.f32   q15, q7, d7[1]      \n"
                    "bne        2b                  \n"

                    "3:                             \n"
                    "and        r4, %6, #3          \n" // r4 = remain = max_kk & 3
                    "cmp        r4, #0              \n"
                    "beq        5f                  \n"

                    // tail loop, one kk step per iteration: 4 floats of pB, 4 floats of pA
                    // (both loads carry a :128 alignment qualifier)
                    "4:                             \n"
                    "vld1.f32   {d0-d1}, [%2 :128]! \n"
                    "vld1.f32   {d8-d9}, [%1 :128]! \n"
                    "vmla.f32   q12, q4, d0[0]      \n"
                    "vmla.f32   q13, q4, d0[1]      \n"
                    "subs       r4, r4, #1          \n"
                    "vmla.f32   q14, q4, d1[0]      \n"
                    "vmla.f32   q15, q4, d1[1]      \n"
                    "bne        4b                  \n"

                    // write back the 4 accumulators; %0 (outptr) ends up 64 bytes further
                    "5:                             \n"
                    "vstm       %0!, {d24-d31}      \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                // intrinsics fallback: one accumulator vector per pB column (_sum0 .. _sum3)
                float32x4_t _sum0;
                float32x4_t _sum1;
                float32x4_t _sum2;
                float32x4_t _sum3;

                if (k == 0)
                {
                    // start accumulation from zero
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                    _sum2 = vdupq_n_f32(0.f);
                    _sum3 = vdupq_n_f32(0.f);
                }
                else
                {
                    // continue from the values already stored at outptr
                    _sum0 = vld1q_f32(outptr);
                    _sum1 = vld1q_f32(outptr + 4);
                    _sum2 = vld1q_f32(outptr + 8);
                    _sum3 = vld1q_f32(outptr + 12);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    // 4 values from pA, 4 values from pB per step
                    float32x4_t _pA = vld1q_f32(pA);
                    float32x4_t _pB = vld1q_f32(pB);

                    // accumulator j += _pA * (j-th pB value)
#if __aarch64__
                    _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB, 2);
                    _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB, 3);
#else
                    // armv7: lane selection goes through the low/high 64-bit halves
                    _sum0 = vmlaq_lane_f32(_sum0, _pA, vget_low_f32(_pB), 0);
                    _sum1 = vmlaq_lane_f32(_sum1, _pA, vget_low_f32(_pB), 1);
                    _sum2 = vmlaq_lane_f32(_sum2, _pA, vget_high_f32(_pB), 0);
                    _sum3 = vmlaq_lane_f32(_sum3, _pA, vget_high_f32(_pB), 1);
#endif

                    pA += 4;
                    pB += 4;
                }

                // store the 4 vectors back to back and step outptr past them
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
                outptr += 4 * 4;
#endif // NCNN_GNU_INLINE_ASM
            }
            for (; jj + 1 < max_jj; jj += 2)
            {
                const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                asm volatile(
                    "cbz    %w7, 0f                     \n"

                    "ld1    {v30.4s, v31.4s}, [%0]      \n"
                    "b      1f                          \n"

                    "0:                                 \n"
                    "eor    v30.16b, v30.16b, v30.16b   \n"
                    "eor    v31.16b, v31.16b, v31.16b   \n"

                    "1:                                 \n"
                    "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    3f                          \n"

                    "eor    v28.16b, v28.16b, v28.16b   \n"
                    "eor    v29.16b, v29.16b, v29.16b   \n"
                    "2:                                 \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v0.4s, v1.4s}, [%2], #32   \n"
                    "fmla   v28.4s, v16.4s, v0.s[0]     \n"
                    "fmla   v29.4s, v16.4s, v0.s[1]     \n"
                    "fmla   v30.4s, v17.4s, v0.s[2]     \n"
                    "fmla   v31.4s, v17.4s, v0.s[3]     \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v28.4s, v18.4s, v1.s[0]     \n"
                    "fmla   v29.4s, v18.4s, v1.s[1]     \n"
                    "fmla   v30.4s, v19.4s, v1.s[2]     \n"
                    "fmla   v31.4s, v19.4s, v1.s[3]     \n"
                    "bne    2b                          \n"
                    "fadd   v30.4s, v30.4s, v28.4s      \n"
                    "fadd   v31.4s, v31.4s, v29.4s      \n"

                    "3:                                 \n"
                    "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    "4:                                 \n"
                    "ld1    {v0.2s}, [%2], #8           \n"
                    "ld1    {v16.4s}, [%1], #16         \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v30.4s, v16.4s, v0.s[0]     \n"
                    "fmla   v31.4s, v16.4s, v0.s[1]     \n"
                    "bne    4b                          \n"

                    "5:                                 \n"
                    "st1    {v30.4s, v31.4s}, [%0], #32 \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31");
#else  // __aarch64__
                asm volatile(
                    "cmp        %7, #0              \n"
                    "beq        0f                  \n"

                    "vld1.f32   {d28-d31}, [%0 :128] \n"
                    "b          1f                  \n"

                    "0:                             \n"
                    "veor       q14, q14            \n"
                    "veor       q15, q15            \n"

                    "1:                             \n"
                    "lsr        r4, %6, #2          \n" // r4 = max_kk >> 2
                    "cmp        r4, #0              \n"
                    "beq        3f                  \n"

                    "veor       q12, q12            \n"
                    "veor       q13, q13            \n"
                    "2:                             \n"
                    "pld        [%1, #512]          \n"
                    "vldm       %1!, {d8-d15}       \n"
                    "pld        [%2, #256]          \n"
                    "vld1.f32   {d0-d3}, [%2 :128]! \n"
                    "vmla.f32   q12, q4, d0[0]      \n"
                    "vmla.f32   q13, q4, d0[1]      \n"
                    "vmla.f32   q14, q5, d1[0]      \n"
                    "vmla.f32   q15, q5, d1[1]      \n"
                    "subs       r4, r4, #1          \n"
                    "vmla.f32   q12, q6, d2[0]      \n"
                    "vmla.f32   q13, q6, d2[1]      \n"
                    "vmla.f32   q14, q7, d3[0]      \n"
                    "vmla.f32   q15, q7, d3[1]      \n"
                    "bne        2b                  \n"
                    "vadd.f32   q14, q14, q12       \n"
                    "vadd.f32   q15, q15, q13       \n"

                    "3:                             \n"
                    "and        r4, %6, #3          \n" // r4 = remain = max_kk & 3
                    "cmp        r4, #0              \n"
                    "beq        5f                  \n"

                    "4:                             \n"
                    "vld1.f32   {d0}, [%2 :64]!     \n"
                    "vld1.f32   {d8-d9}, [%1 :128]! \n"
                    "subs       r4, r4, #1          \n"
                    "vmla.f32   q14, q4, d0[0]      \n"
                    "vmla.f32   q15, q4, d0[1]      \n"
                    "bne        4b                  \n"

                    "5:                             \n"
                    "vst1.f32   {d28-d31}, [%0 :128]! \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                float32x4_t _sum0;
                float32x4_t _sum1;

                if (k == 0)
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                }
                else
                {
                    _sum0 = vld1q_f32(outptr);
                    _sum1 = vld1q_f32(outptr + 4);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float32x4_t _pA = vld1q_f32(pA);
                    float32x2_t _pB = vld1_f32(pB);

#if __aarch64__
                    _sum0 = vfmaq_lane_f32(_sum0, _pA, _pB, 0);
                    _sum1 = vfmaq_lane_f32(_sum1, _pA, _pB, 1);
#else
                    _sum0 = vmlaq_lane_f32(_sum0, _pA, _pB, 0);
                    _sum1 = vmlaq_lane_f32(_sum1, _pA, _pB, 1);
#endif

                    pA += 4;
                    pB += 2;
                }

                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                outptr += 4 * 2;
#endif // NCNN_GNU_INLINE_ASM
            }
            for (; jj < max_jj; jj++)
            {
                const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                asm volatile(
                    "cbz    %w7, 0f                     \n"

                    "ld1    {v31.4s}, [%0]              \n"
                    "b      1f                          \n"

                    "0:                                 \n"
                    "eor    v31.16b, v31.16b, v31.16b   \n"

                    "1:                                 \n"
                    "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    3f                          \n"

                    "eor    v28.16b, v28.16b, v28.16b   \n"
                    "eor    v29.16b, v29.16b, v29.16b   \n"
                    "eor    v30.16b, v30.16b, v30.16b   \n"
                    "2:                                 \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
                    "prfm   pldl1keep, [%2, #128]       \n"
                    "ld1    {v0.4s}, [%2], #16          \n"
                    "fmla   v28.4s, v16.4s, v0.s[0]     \n"
                    "fmla   v29.4s, v17.4s, v0.s[1]     \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v30.4s, v18.4s, v0.s[2]     \n"
                    "fmla   v31.4s, v19.4s, v0.s[3]     \n"
                    "bne    2b                          \n"
                    "fadd   v30.4s, v30.4s, v28.4s      \n"
                    "fadd   v31.4s, v31.4s, v29.4s      \n"
                    "fadd   v31.4s, v31.4s, v30.4s      \n"

                    "3:                                 \n"
                    "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    "4:                                 \n"
                    "ld1r   {v0.4s}, [%2], #4           \n"
                    "ld1    {v16.4s}, [%1], #16         \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v31.4s, v16.4s, v0.4s       \n"
                    "bne    4b                          \n"

                    "5:                                 \n"
                    "st1    {v31.4s}, [%0], #16         \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31");
#else  // __aarch64__
                asm volatile(
                    "cmp        %7, #0              \n"
                    "beq        0f                  \n"

                    "vld1.f32   {d30-d31}, [%0 :128] \n"
                    "b          1f                  \n"

                    "0:                             \n"
                    "veor       q15, q15            \n"

                    "1:                             \n"
                    "lsr        r4, %6, #2          \n" // r4 = max_kk >> 2
                    "cmp        r4, #0              \n"
                    "beq        3f                  \n"

                    "veor       q12, q12            \n"
                    "veor       q13, q13            \n"
                    "veor       q14, q14            \n"
                    "2:                             \n"
                    "pld        [%1, #512]          \n"
                    "vldm       %1!, {d8-d15}       \n"
                    "pld        [%2, #128]          \n"
                    "vld1.f32   {d0-d1}, [%2 :64]!  \n"
                    "vmla.f32   q12, q4, d0[0]      \n"
                    "vmla.f32   q13, q5, d0[1]      \n"
                    "subs       r4, r4, #1          \n"
                    "vmla.f32   q14, q6, d1[0]      \n"
                    "vmla.f32   q15, q7, d1[1]      \n"
                    "bne        2b                  \n"
                    "vadd.f32   q14, q14, q12       \n"
                    "vadd.f32   q15, q15, q13       \n"
                    "vadd.f32   q15, q15, q14       \n"

                    "3:                             \n"
                    "and        r4, %6, #3          \n" // r4 = remain = max_kk & 3
                    "cmp        r4, #0              \n"
                    "beq        5f                  \n"

                    "4:                             \n"
                    "vld1.f32   {d0[0]}, [%2]!      \n"
                    "vld1.f32   {d8-d9}, [%1 :128]! \n"
                    "subs       r4, r4, #1          \n"
                    "vmla.f32   q15, q4, d0[0]      \n"
                    "bne        4b                  \n"

                    "5:                             \n"
                    "vst1.f32   {d30-d31}, [%0 :128]! \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                float32x4_t _sum;

                if (k == 0)
                {
                    _sum = vdupq_n_f32(0.f);
                }
                else
                {
                    _sum = vld1q_f32(outptr);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float32x4_t _pA = vld1q_f32(pA);
                    float32x4_t _pB = vdupq_n_f32(pB[0]);

#if __aarch64__
                    _sum = vfmaq_f32(_sum, _pA, _pB);
#else
                    _sum = vmlaq_f32(_sum, _pA, _pB);
#endif

                    pA += 4;
                    pB += 1;
                }

                vst1q_f32(outptr, _sum);
                outptr += 4;
#endif // NCNN_GNU_INLINE_ASM
            }
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        for (int b = 0; b < batch; b++)
        {
            const float* pAT = AT_tile.row(b) + max_kk * ii;
            const float* pB = BT_tile.row(b);

            int jj = 0;
#if __ARM_NEON
#if __aarch64__
            for (; jj + 11 < max_jj; jj += 12)
            {
                const float* pA = pAT;

                float32x4_t _sum00;
                float32x4_t _sum01;
                float32x4_t _sum02;
                float32x4_t _sum10;
                float32x4_t _sum11;
                float32x4_t _sum12;

                if (k == 0)
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum02 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                    _sum12 = vdupq_n_f32(0.f);
                }
                else
                {
                    float32x4x2_t _tmp01 = vld2q_f32(outptr);
                    float32x4x2_t _tmp23 = vld2q_f32(outptr + 8);
                    float32x4x2_t _tmp45 = vld2q_f32(outptr + 16);
                    _sum00 = _tmp01.val[0];
                    _sum01 = _tmp23.val[0];
                    _sum02 = _tmp45.val[0];
                    _sum10 = _tmp01.val[1];
                    _sum11 = _tmp23.val[1];
                    _sum12 = _tmp45.val[1];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float32x4_t _pB0 = vld1q_f32(pB);
                    float32x4_t _pB1 = vld1q_f32(pB + 4);
                    float32x4_t _pB2 = vld1q_f32(pB + 8);

                    float32x2_t _pA = vld1_f32(pA);
#if __aarch64__
                    _sum00 = vfmaq_lane_f32(_sum00, _pB0, _pA, 0);
                    _sum01 = vfmaq_lane_f32(_sum01, _pB1, _pA, 0);
                    _sum02 = vfmaq_lane_f32(_sum02, _pB2, _pA, 0);
                    _sum10 = vfmaq_lane_f32(_sum10, _pB0, _pA, 1);
                    _sum11 = vfmaq_lane_f32(_sum11, _pB1, _pA, 1);
                    _sum12 = vfmaq_lane_f32(_sum12, _pB2, _pA, 1);
#else
                    _sum00 = vmlaq_lane_f32(_sum00, _pB0, _pA, 0);
                    _sum01 = vmlaq_lane_f32(_sum01, _pB1, _pA, 0);
                    _sum02 = vmlaq_lane_f32(_sum02, _pB2, _pA, 0);
                    _sum10 = vmlaq_lane_f32(_sum10, _pB0, _pA, 1);
                    _sum11 = vmlaq_lane_f32(_sum11, _pB1, _pA, 1);
                    _sum12 = vmlaq_lane_f32(_sum12, _pB2, _pA, 1);
#endif

                    pA += 2;
                    pB += 12;
                }

                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum00;
                _tmp01.val[1] = _sum10;
                float32x4x2_t _tmp23;
                _tmp23.val[0] = _sum01;
                _tmp23.val[1] = _sum11;
                float32x4x2_t _tmp45;
                _tmp45.val[0] = _sum02;
                _tmp45.val[1] = _sum12;
                vst2q_f32(outptr, _tmp01);
                vst2q_f32(outptr + 8, _tmp23);
                vst2q_f32(outptr + 16, _tmp45);
                outptr += 2 * 12;
            }
#endif // __aarch64__
            for (; jj + 7 < max_jj; jj += 8)
            {
                const float* pA = pAT;

                float32x4_t _sum00;
                float32x4_t _sum01;
                float32x4_t _sum10;
                float32x4_t _sum11;

                if (k == 0)
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                }
                else
                {
                    float32x4x2_t _tmp01 = vld2q_f32(outptr);
                    float32x4x2_t _tmp23 = vld2q_f32(outptr + 8);
                    _sum00 = _tmp01.val[0];
                    _sum01 = _tmp23.val[0];
                    _sum10 = _tmp01.val[1];
                    _sum11 = _tmp23.val[1];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float32x4_t _pB0 = vld1q_f32(pB);
                    float32x4_t _pB1 = vld1q_f32(pB + 4);

                    float32x2_t _pA = vld1_f32(pA);
#if __aarch64__
                    _sum00 = vfmaq_lane_f32(_sum00, _pB0, _pA, 0);
                    _sum01 = vfmaq_lane_f32(_sum01, _pB1, _pA, 0);
                    _sum10 = vfmaq_lane_f32(_sum10, _pB0, _pA, 1);
                    _sum11 = vfmaq_lane_f32(_sum11, _pB1, _pA, 1);
#else
                    _sum00 = vmlaq_lane_f32(_sum00, _pB0, _pA, 0);
                    _sum01 = vmlaq_lane_f32(_sum01, _pB1, _pA, 0);
                    _sum10 = vmlaq_lane_f32(_sum10, _pB0, _pA, 1);
                    _sum11 = vmlaq_lane_f32(_sum11, _pB1, _pA, 1);
#endif

                    pA += 2;
                    pB += 8;
                }

                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum00;
                _tmp01.val[1] = _sum10;
                float32x4x2_t _tmp23;
                _tmp23.val[0] = _sum01;
                _tmp23.val[1] = _sum11;
                vst2q_f32(outptr, _tmp01);
                vst2q_f32(outptr + 8, _tmp23);
                outptr += 2 * 8;
            }
            for (; jj + 3 < max_jj; jj += 4)
            {
                const float* pA = pAT;

                float32x4_t _sum0;
                float32x4_t _sum1;

                if (k == 0)
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                }
                else
                {
                    float32x4x2_t _tmp01 = vld2q_f32(outptr);
                    _sum0 = _tmp01.val[0];
                    _sum1 = _tmp01.val[1];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float32x4_t _pB = vld1q_f32(pB);

                    float32x2_t _pA = vld1_f32(pA);
#if __aarch64__
                    _sum0 = vfmaq_lane_f32(_sum0, _pB, _pA, 0);
                    _sum1 = vfmaq_lane_f32(_sum1, _pB, _pA, 1);
#else
                    _sum0 = vmlaq_lane_f32(_sum0, _pB, _pA, 0);
                    _sum1 = vmlaq_lane_f32(_sum1, _pB, _pA, 1);
#endif

                    pA += 2;
                    pB += 4;
                }

                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum0;
                _tmp01.val[1] = _sum1;
                vst2q_f32(outptr, _tmp01);
                outptr += 2 * 4;
            }
#endif // __ARM_NEON
            for (; jj + 1 < max_jj; jj += 2)
            {
                const float* pA = pAT;

                float sum00 = 0.f;
                float sum01 = 0.f;
                float sum10 = 0.f;
                float sum11 = 0.f;

                if (k == 0)
                {
                    sum00 = 0.f;
                    sum01 = 0.f;
                    sum10 = 0.f;
                    sum11 = 0.f;
                }
                else
                {
                    sum00 = outptr[0];
                    sum01 = outptr[1];
                    sum10 = outptr[2];
                    sum11 = outptr[3];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    sum00 += pA[0] * pB[0];
                    sum01 += pA[1] * pB[0];
                    sum10 += pA[0] * pB[1];
                    sum11 += pA[1] * pB[1];
                    pA += 2;
                    pB += 2;
                }

                outptr[0] = sum00;
                outptr[1] = sum01;
                outptr[2] = sum10;
                outptr[3] = sum11;
                outptr += 2 * 2;
            }
            for (; jj < max_jj; jj++)
            {
                const float* pA = pAT;

                float sum0 = 0.f;
                float sum1 = 0.f;

                if (k == 0)
                {
                    sum0 = 0.f;
                    sum1 = 0.f;
                }
                else
                {
                    sum0 = outptr[0];
                    sum1 = outptr[1];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    sum0 += pA[0] * pB[0];
                    sum1 += pA[1] * pB[0];
                    pA += 2;
                    pB += 1;
                }

                outptr[0] = sum0;
                outptr[1] = sum1;
                outptr += 2;
            }
        }
    }
    for (; ii < max_ii; ii++)
    {
        for (int b = 0; b < batch; b++)
        {
            const float* pAT = AT_tile.row(b) + max_kk * ii;
            const float* pB = BT_tile.row(b);

            int jj = 0;
#if __ARM_NEON
#if __aarch64__
            for (; jj + 11 < max_jj; jj += 12)
            {
                const float* pA = pAT;

                float32x4_t _sum0;
                float32x4_t _sum1;
                float32x4_t _sum2;

                if (k == 0)
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                    _sum2 = vdupq_n_f32(0.f);
                }
                else
                {
                    _sum0 = vld1q_f32(outptr);
                    _sum1 = vld1q_f32(outptr + 4);
                    _sum2 = vld1q_f32(outptr + 8);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float32x4_t _pB0 = vld1q_f32(pB);
                    float32x4_t _pB1 = vld1q_f32(pB + 4);
                    float32x4_t _pB2 = vld1q_f32(pB + 8);

                    float32x4_t _pA0 = vdupq_n_f32(pA[0]);
#if __aarch64__
                    _sum0 = vfmaq_f32(_sum0, _pA0, _pB0);
                    _sum1 = vfmaq_f32(_sum1, _pA0, _pB1);
                    _sum2 = vfmaq_f32(_sum2, _pA0, _pB2);
#else
                    _sum0 = vmlaq_f32(_sum0, _pA0, _pB0);
                    _sum1 = vmlaq_f32(_sum1, _pA0, _pB1);
                    _sum2 = vmlaq_f32(_sum2, _pA0, _pB2);
#endif

                    pA += 1;
                    pB += 12;
                }

                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 8, _sum2);
                outptr += 12;
            }
#endif // __aarch64__
            for (; jj + 7 < max_jj; jj += 8)
            {
                const float* pA = pAT;

                float32x4_t _sum0;
                float32x4_t _sum1;

                if (k == 0)
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                }
                else
                {
                    _sum0 = vld1q_f32(outptr);
                    _sum1 = vld1q_f32(outptr + 4);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float32x4_t _pB0 = vld1q_f32(pB);
                    float32x4_t _pB1 = vld1q_f32(pB + 4);

                    float32x4_t _pA0 = vdupq_n_f32(pA[0]);
#if __aarch64__
                    _sum0 = vfmaq_f32(_sum0, _pA0, _pB0);
                    _sum1 = vfmaq_f32(_sum1, _pA0, _pB1);
#else
                    _sum0 = vmlaq_f32(_sum0, _pA0, _pB0);
                    _sum1 = vmlaq_f32(_sum1, _pA0, _pB1);
#endif

                    pA += 1;
                    pB += 8;
                }

                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                outptr += 8;
            }
            for (; jj + 3 < max_jj; jj += 4)
            {
                const float* pA = pAT;

                float32x4_t _sum;

                if (k == 0)
                {
                    _sum = vdupq_n_f32(0.f);
                }
                else
                {
                    _sum = vld1q_f32(outptr);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float32x4_t _pB = vld1q_f32(pB);
                    float32x4_t _pA = vdupq_n_f32(pA[0]);

#if __aarch64__
                    _sum = vfmaq_f32(_sum, _pA, _pB);
#else
                    _sum = vmlaq_f32(_sum, _pA, _pB);
#endif

                    pA += 1;
                    pB += 4;
                }

                vst1q_f32(outptr, _sum);
                outptr += 4;
            }
#endif // __ARM_NEON
            for (; jj + 1 < max_jj; jj += 2)
            {
                const float* pA = pAT;

                float sum0 = 0.f;
                float sum1 = 0.f;

                if (k == 0)
                {
                    sum0 = 0.f;
                    sum1 = 0.f;
                }
                else
                {
                    sum0 = outptr[0];
                    sum1 = outptr[1];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    sum0 += pA[0] * pB[0];
                    sum1 += pA[0] * pB[1];
                    pA += 1;
                    pB += 2;
                }

                outptr[0] = sum0;
                outptr[1] = sum1;
                outptr += 2;
            }
            for (; jj < max_jj; jj++)
            {
                const float* pA = pAT;

                float sum = 0.f;

                if (k == 0)
                {
                    sum = 0.f;
                }
                else
                {
                    sum = outptr[0];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    sum += pA[0] * pB[0];
                    pA += 1;
                    pB += 1;
                }

                outptr[0] = sum;
                outptr += 1;
            }
        }
    }
}

// Choose the blocking sizes TILE_M / TILE_N / TILE_K for an M x N x K problem
// from the L2 cache capacity (counted in fp32 elements) and the thread count.
//
//   M, N, K   extents of the three tiled dimensions
//   B         accepted but currently ignored (see note in the body)
//   TILE_M    output, always written
//   TILE_N    output, written only when N > 0
//   TILE_K    output, always written
//   nT        thread count; 0 means "use get_physical_big_cpu_count()"
static void conv3x3s1_winograd_get_optimal_tile_mnk(int M, int N, int K, int B, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
    // per-target tuning constants
    //   k_reserve / k_divisor  convert the cache capacity into the initial K budget
    //   mk_step                granularity and lower bound shared by TILE_M and TILE_K
    //   n_step                 granularity and lower bound of TILE_N
#if __aarch64__
    const int k_reserve = 32;
    const int k_divisor = 12;
    const int mk_step = 8;
    const int n_step = 4;
#elif __ARM_NEON
    const int k_reserve = 16;
    const int k_divisor = 8;
    const int mk_step = 4;
    const int n_step = 4;
#else
    const int k_reserve = 2;
    const int k_divisor = 3;
    const int mk_step = 2;
    const int n_step = 1;
#endif

    // L2 cache capacity expressed as a number of fp32 elements
    const int cache_floats = (int)(get_cpu_level2_cache_size() / sizeof(float));

    if (nT == 0)
        nT = get_physical_big_cpu_count();

    // B would ideally feed into the sizing for batched gemm, but doing so
    // was observed to be slower on arm in practice (reason unknown), so it is unused.
    (void)B;

    // ---- TILE_K : try not to split K ----
    // start from the largest step-aligned K that fits the budget (at least one step)
    const int k_budget = (cache_floats - k_reserve) / k_divisor;
    TILE_K = std::max(mk_step, k_budget / mk_step * mk_step);

    // then shrink so that the resulting number of K chunks are of near-equal size,
    // each rounded up to a multiple of the step
    const int k_parts = (K + TILE_K - 1) / TILE_K;
    const int k_even = ((K + k_parts - 1) / k_parts + (mk_step - 1)) / mk_step * mk_step;
    TILE_K = std::min(TILE_K, k_even);

    // ---- TILE_M ----
    // one step per thread, with the thread count capped by the physical cpu count
    TILE_M = mk_step * std::min(nT, get_physical_cpu_count());

    // balance the M chunks the same way as for K
    const int m_parts = (M + TILE_M - 1) / TILE_M;
    const int m_even = ((M + m_parts - 1) / m_parts + (mk_step - 1)) / mk_step * mk_step;
    TILE_M = std::min(TILE_M, m_even);

    if (nT > 1)
    {
        // with several threads, cap the tile at a per-thread share (step-aligned, >= 1 before rounding)
        const int m_share = (std::max(1, TILE_M / nT) + (mk_step - 1)) / mk_step * mk_step;
        TILE_M = std::min(TILE_M, m_share);
    }

    // never go below a single step
    TILE_M = std::max(mk_step, TILE_M);

    // ---- TILE_N ----
    // nothing to solve without an N extent; TILE_N is left untouched
    if (N <= 0)
        return;

    // cache left after the M x K tile, divided by TILE_K when K is not split
    // and by TILE_M + TILE_K otherwise
    const int n_divisor = (TILE_K >= K) ? TILE_K : (TILE_M + TILE_K);
    const int n_budget = (cache_floats - TILE_M * TILE_K) / n_divisor;
    TILE_N = std::max(n_step, n_budget / n_step * n_step);

    // balance the N chunks, rounded up to a multiple of the step
    const int n_parts = (N + TILE_N - 1) / TILE_N;
    const int n_even = ((N + n_parts - 1) / n_parts + (n_step - 1)) / n_step * n_step;
    TILE_N = std::min(TILE_N, n_even);

    // never go below a single step
    TILE_N = std::max(n_step, TILE_N);
}

// Winograd F(2x2, 3x3) kernel transform for one block of output/input channels.
//
// For every output channel in [i, i + max_ii) and, inside it, every input
// channel in [k, k + max_kk), the 3x3 kernel is read from `kernel` (indexed as
// outch * inch * 9 + inch_index * 9) and the matrix
//
//     G = { {1,    0,    0  },
//           {1/2,  1/2,  1/2},
//           {1/2, -1/2,  1/2},
//           {0,    0,    1  } }
//
// is applied along each of the two kernel axes in turn. The 16 resulting
// floats of each kernel are written back-to-back into A, in visiting order.
static inline void conv3x3s1_winograd23_transform_kernel_tile(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
{
    float* outptr = A;

    for (int oc = i; oc < i + max_ii; oc++)
    {
        for (int ic = k; ic < k + max_kk; ic++)
        {
            const float* g = (const float*)kernel + oc * inch * 9 + ic * 9;

            // pass 1: combine the three taps of every kernel row,
            // half[n][row] holds output n of G for that row
            float half[4][3];
            for (int row = 0; row < 3; row++, g += 3)
            {
                const float a = g[0];
                const float b = g[1];
                const float c = g[2];

                half[0][row] = a;
                half[1][row] = a * 0.5f + b * 0.5f + c * 0.5f;
                half[2][row] = a * 0.5f - b * 0.5f + c * 0.5f;
                half[3][row] = c;
            }

            // pass 2: apply G again across the three pass-1 results of each n,
            // emitting four floats per n (16 per kernel)
            for (int n = 0; n < 4; n++, outptr += 4)
            {
                const float a = half[n][0];
                const float b = half[n][1];
                const float c = half[n][2];

                outptr[0] = a;
                outptr[1] = a * 0.5f + b * 0.5f + c * 0.5f;
                outptr[2] = a * 0.5f - b * 0.5f + c * 0.5f;
                outptr[3] = c;
            }
        }
    }
}

// Transforms and packs all 3x3 kernels for the Winograd F(2x2, 3x3) path.
//
// The work is cut into TILE_M x TILE_K blocks (output channels x input
// channels) using the sizes chosen by conv3x3s1_winograd_get_optimal_tile_mnk.
// Each block is transformed into a per-thread scratch tile and then packed
// into AT, which is created here with one channel per M block and one depth
// slice per K block.
static void conv3x3s1_winograd23_transform_kernel(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt)
{
    const int M = outch;
    const int K = inch;
    const int B = 16; // transformed coefficients per 3x3 kernel

    int TILE_M, TILE_N, TILE_K;
    conv3x3s1_winograd_get_optimal_tile_mnk(M, 0, K, B, TILE_M, TILE_N, TILE_K, opt.num_threads);

    // number of blocks along each dimension, rounded up
    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // one scratch tile per thread, selected by thread index inside the loop
    Mat A_tileX(B * TILE_M * TILE_K, 1, opt.num_threads);

    AT.create(TILE_K * TILE_M, B, nn_K, nn_M);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int tile_m = 0; tile_m < nn_M; tile_m++)
    {
        const int i = tile_m * TILE_M;
        const int max_ii = std::min((M - i), TILE_M);

        Mat A_tile = A_tileX.channel(get_omp_thread_num());

        for (int tile_k = 0; tile_k < nn_K; tile_k++)
        {
            const int k = tile_k * TILE_K;
            const int max_kk = std::min((K - k), TILE_K);

            conv3x3s1_winograd23_transform_kernel_tile(kernel, A_tile, inch, i, max_ii, k, max_kk);

            Mat AT_tile = AT.channel(tile_m).depth(tile_k);

            conv3x3s1_winograd_pack_A_tile(A_tile, AT_tile, B, max_ii, max_kk);
        }
    }
}

// Winograd F(2x2, 3x3) input transform for one block of tiles and channels.
//
// For each input channel in [k, k + max_kk) and each tile index in
// [j, j + max_jj), a 4x4 patch is read from bottom_blob starting at row
// ti * 2, column tj * 2 (ti = tile / w_tiles, tj = tile % w_tiles, so
// neighbouring patches overlap by two pixels). The matrix `itm` shown below
// is applied along both patch axes in two passes, yielding 16 coefficients
// per patch. Rows at or beyond h and columns at or beyond w count as zero.
//
// Layout written into B, as indexed by this function: a channel group that
// starts at kk and is P channels wide (P = 8, 4, 2 or 1 depending on the
// loop) begins at float offset kk * max_jj * 16. It consists of 16 planes of
// max_jj * P floats; within a plane, tile jj owns P consecutive floats, one
// per channel of the group.
//
// Channels are consumed in groups of 8 (aarch64 only), then 4 (NEON only),
// then 2, then one at a time; each loop picks up where the previous stopped
// via remain_max_kk_start.
//
// Parameters:
//   bottom_blob  source feature map (w, h, elempack and cstep are read)
//   B            destination buffer, accessed through a float pointer
//   j, max_jj    first tile index and number of tiles to process
//   k, max_kk    first input channel and number of channels to process
//   nT           thread count handed to the OpenMP pragmas
//
// NOTE(review): the 8- and 4-channel loops handle elempack 4 and 1; the 2- and
// 1-channel loops index channel(k + kk) directly and appear to assume
// elempack == 1 (see the commented-out checks) - confirm against callers.
static inline void conv3x3s1_winograd23_transform_input_tile(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
{
    // const float itm[4][4] = {
    //     {1.0f,  0.0f, -1.0f,  0.0f},
    //     {0.0f,  1.0f,  1.00f, 0.0f},
    //     {0.0f, -1.0f,  1.00f, 0.0f},
    //     {0.0f, -1.0f,  0.00f, 1.0f}
    // };

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int elempack = bottom_blob.elempack;
    // distance in floats between consecutive (packed) channels
    const size_t N = bottom_blob.cstep * elempack;

    // number of patches per row of tiles
    const int w_tiles = (w - 1) / 2;

    int nn_max_kk = 0;
    int remain_max_kk_start = 0;
#if __ARM_NEON
#if __aarch64__
    // ---- groups of 8 channels (aarch64 only), parallel over groups ----
    nn_max_kk = (max_kk - remain_max_kk_start) / 8;
    #pragma omp parallel for num_threads(nT)
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 8;

        // scratch for the first pass: tmp[pass-1 output][patch row][channel]
#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[4][4][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = bottom_blob.channel((k + kk) / elempack).row(ti * 2) + (tj * 2) * elempack;

            // pass 1: for every patch row m, combine its four column values
            for (int m = 0; m < 4; m++)
            {
                // _rC0 / _rC1: column C of this row for channels 0-3 / 4-7;
                // anything left unloaded stays zero
                float32x4_t _r00 = vdupq_n_f32(0.f);
                float32x4_t _r01 = vdupq_n_f32(0.f);
                float32x4_t _r10 = vdupq_n_f32(0.f);
                float32x4_t _r11 = vdupq_n_f32(0.f);
                float32x4_t _r20 = vdupq_n_f32(0.f);
                float32x4_t _r21 = vdupq_n_f32(0.f);
                float32x4_t _r30 = vdupq_n_f32(0.f);
                float32x4_t _r31 = vdupq_n_f32(0.f);

                if (ti * 2 + m < h)
                {
                    if (elempack == 4)
                    {
                        // second packed channel supplies channels 4-7
                        const float* r1 = r0 + N;

                        _r00 = vld1q_f32(r0);
                        _r01 = vld1q_f32(r1);
                        if (tj * 2 + 1 < w)
                        {
                            _r10 = vld1q_f32(r0 + 4);
                            _r11 = vld1q_f32(r1 + 4);
                        }
                        if (tj * 2 + 2 < w)
                        {
                            _r20 = vld1q_f32(r0 + 8);
                            _r21 = vld1q_f32(r1 + 8);
                        }
                        if (tj * 2 + 3 < w)
                        {
                            _r30 = vld1q_f32(r0 + 12);
                            _r31 = vld1q_f32(r1 + 12);
                        }
                    }
                    if (elempack == 1)
                    {
                        const float* r1 = r0 + N;
                        const float* r2 = r0 + N * 2;
                        const float* r3 = r0 + N * 3;
                        const float* r4 = r0 + N * 4;
                        const float* r5 = r0 + N * 5;
                        const float* r6 = r0 + N * 6;
                        const float* r7 = r0 + N * 7;

                        // NOTE(review): each load reads four floats regardless of
                        // the column bound; out-of-range columns are dropped by the
                        // checks below - confirm the storage is readable there.
                        float32x4_t _t0 = vld1q_f32(r0);
                        float32x4_t _t1 = vld1q_f32(r1);
                        float32x4_t _t2 = vld1q_f32(r2);
                        float32x4_t _t3 = vld1q_f32(r3);
                        float32x4_t _t4 = vld1q_f32(r4);
                        float32x4_t _t5 = vld1q_f32(r5);
                        float32x4_t _t6 = vld1q_f32(r6);
                        float32x4_t _t7 = vld1q_f32(r7);

                        // regroup from one register per channel to one per column
                        transpose4x4_ps(_t0, _t1, _t2, _t3);
                        transpose4x4_ps(_t4, _t5, _t6, _t7);

                        _r00 = _t0;
                        _r01 = _t4;
                        if (tj * 2 + 1 < w)
                        {
                            _r10 = _t1;
                            _r11 = _t5;
                        }
                        if (tj * 2 + 2 < w)
                        {
                            _r20 = _t2;
                            _r21 = _t6;
                        }
                        if (tj * 2 + 3 < w)
                        {
                            _r30 = _t3;
                            _r31 = _t7;
                        }
                    }
                }

                // the four rows of itm: c0 - c2, c1 + c2, c2 - c1, c3 - c1
                float32x4_t _tmp00 = vsubq_f32(_r00, _r20);
                float32x4_t _tmp01 = vsubq_f32(_r01, _r21);
                float32x4_t _tmp10 = vaddq_f32(_r10, _r20);
                float32x4_t _tmp11 = vaddq_f32(_r11, _r21);
                float32x4_t _tmp20 = vsubq_f32(_r20, _r10);
                float32x4_t _tmp21 = vsubq_f32(_r21, _r11);
                float32x4_t _tmp30 = vsubq_f32(_r30, _r10);
                float32x4_t _tmp31 = vsubq_f32(_r31, _r11);

                vst1q_f32(tmp[0][m], _tmp00);
                vst1q_f32(tmp[0][m] + 4, _tmp01);
                vst1q_f32(tmp[1][m], _tmp10);
                vst1q_f32(tmp[1][m] + 4, _tmp11);
                vst1q_f32(tmp[2][m], _tmp20);
                vst1q_f32(tmp[2][m] + 4, _tmp21);
                vst1q_f32(tmp[3][m], _tmp30);
                vst1q_f32(tmp[3][m] + 4, _tmp31);

                // next patch row
                r0 += w * elempack;
            }

            // p0..p3 address four consecutive coefficient planes for this tile
            float* p0 = (float*)B + kk * max_jj * 16 + jj * 8;
            float* p1 = p0 + max_jj * 8;
            float* p2 = p0 + max_jj * 8 * 2;
            float* p3 = p0 + max_jj * 8 * 3;

            // pass 2: apply itm across the four rows of each pass-1 output m
            for (int m = 0; m < 4; m++)
            {
                float32x4_t _r00 = vld1q_f32(tmp[m][0]);
                float32x4_t _r01 = vld1q_f32(tmp[m][0] + 4);
                float32x4_t _r10 = vld1q_f32(tmp[m][1]);
                float32x4_t _r11 = vld1q_f32(tmp[m][1] + 4);
                float32x4_t _r20 = vld1q_f32(tmp[m][2]);
                float32x4_t _r21 = vld1q_f32(tmp[m][2] + 4);
                float32x4_t _r30 = vld1q_f32(tmp[m][3]);
                float32x4_t _r31 = vld1q_f32(tmp[m][3] + 4);

                float32x4_t _tmp00 = vsubq_f32(_r00, _r20);
                float32x4_t _tmp01 = vsubq_f32(_r01, _r21);
                float32x4_t _tmp10 = vaddq_f32(_r10, _r20);
                float32x4_t _tmp11 = vaddq_f32(_r11, _r21);
                float32x4_t _tmp20 = vsubq_f32(_r20, _r10);
                float32x4_t _tmp21 = vsubq_f32(_r21, _r11);
                float32x4_t _tmp30 = vsubq_f32(_r30, _r10);
                float32x4_t _tmp31 = vsubq_f32(_r31, _r11);

                vst1q_f32(p0, _tmp00);
                vst1q_f32(p0 + 4, _tmp01);
                vst1q_f32(p1, _tmp10);
                vst1q_f32(p1 + 4, _tmp11);
                vst1q_f32(p2, _tmp20);
                vst1q_f32(p2 + 4, _tmp21);
                vst1q_f32(p3, _tmp30);
                vst1q_f32(p3 + 4, _tmp31);

                // skip ahead four planes
                p0 += max_jj * 4 * 8;
                p1 += max_jj * 4 * 8;
                p2 += max_jj * 4 * 8;
                p3 += max_jj * 4 * 8;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 8;
    // ---- groups of 4 channels (NEON) ----
    // On aarch64 the loop below carries no OpenMP pragma; only the
    // non-aarch64 branch attaches one to the shared `for`.
    nn_max_kk = (max_kk - remain_max_kk_start) / 4;
#else // __aarch64__
    nn_max_kk = (max_kk - remain_max_kk_start) / 4;
    #pragma omp parallel for num_threads(nT)
#endif // __aarch64__
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 4;

        // scratch for the first pass: tmp[pass-1 output][patch row][channel]
#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[4][4][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = bottom_blob.channel((k + kk) / elempack).row(ti * 2) + (tj * 2) * elempack;

            // pass 1: per patch row, combine the four column values
            for (int m = 0; m < 4; m++)
            {
                // one register per patch column, four channels each; zero by default
                float32x4_t _r0 = vdupq_n_f32(0.f);
                float32x4_t _r1 = vdupq_n_f32(0.f);
                float32x4_t _r2 = vdupq_n_f32(0.f);
                float32x4_t _r3 = vdupq_n_f32(0.f);

                if (ti * 2 + m < h)
                {
                    if (elempack == 4)
                    {
                        _r0 = vld1q_f32(r0);
                        if (tj * 2 + 1 < w) _r1 = vld1q_f32(r0 + 4);
                        if (tj * 2 + 2 < w) _r2 = vld1q_f32(r0 + 8);
                        if (tj * 2 + 3 < w) _r3 = vld1q_f32(r0 + 12);
                    }
                    if (elempack == 1)
                    {
                        const float* r1 = r0 + N;
                        const float* r2 = r0 + N * 2;
                        const float* r3 = r0 + N * 3;

                        // four floats are loaded per channel unconditionally;
                        // column bounds are applied after the transpose
                        float32x4_t _t0 = vld1q_f32(r0);
                        float32x4_t _t1 = vld1q_f32(r1);
                        float32x4_t _t2 = vld1q_f32(r2);
                        float32x4_t _t3 = vld1q_f32(r3);

                        transpose4x4_ps(_t0, _t1, _t2, _t3);

                        _r0 = _t0;
                        if (tj * 2 + 1 < w) _r1 = _t1;
                        if (tj * 2 + 2 < w) _r2 = _t2;
                        if (tj * 2 + 3 < w) _r3 = _t3;
                    }
                }

                float32x4_t _tmp0 = vsubq_f32(_r0, _r2);
                float32x4_t _tmp1 = vaddq_f32(_r1, _r2);
                float32x4_t _tmp2 = vsubq_f32(_r2, _r1);
                float32x4_t _tmp3 = vsubq_f32(_r3, _r1);

                vst1q_f32(tmp[0][m], _tmp0);
                vst1q_f32(tmp[1][m], _tmp1);
                vst1q_f32(tmp[2][m], _tmp2);
                vst1q_f32(tmp[3][m], _tmp3);

                r0 += w * elempack;
            }

            // four consecutive coefficient planes for this tile
            float* p0 = (float*)B + kk * max_jj * 16 + jj * 4;
            float* p1 = p0 + max_jj * 4;
            float* p2 = p0 + max_jj * 4 * 2;
            float* p3 = p0 + max_jj * 4 * 3;

            // pass 2: same combination across the four rows of each pass-1 output
            for (int m = 0; m < 4; m++)
            {
                float32x4_t _r0 = vld1q_f32(tmp[m][0]);
                float32x4_t _r1 = vld1q_f32(tmp[m][1]);
                float32x4_t _r2 = vld1q_f32(tmp[m][2]);
                float32x4_t _r3 = vld1q_f32(tmp[m][3]);

                float32x4_t _tmp0 = vsubq_f32(_r0, _r2);
                float32x4_t _tmp1 = vaddq_f32(_r1, _r2);
                float32x4_t _tmp2 = vsubq_f32(_r2, _r1);
                float32x4_t _tmp3 = vsubq_f32(_r3, _r1);

                vst1q_f32(p0, _tmp0);
                vst1q_f32(p1, _tmp1);
                vst1q_f32(p2, _tmp2);
                vst1q_f32(p3, _tmp3);

                p0 += max_jj * 4 * 4;
                p1 += max_jj * 4 * 4;
                p2 += max_jj * 4 * 4;
                p3 += max_jj * 4 * 4;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 4;
    // ---- groups of 2 channels ----
    // With NEON the loop below has no OpenMP pragma; the pragma exists only
    // in the non-NEON branch.
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
#else // __ARM_NEON
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
    #pragma omp parallel for num_threads(nT)
#endif // __ARM_NEON
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 2;

        // scratch for the first pass: tmp[pass-1 output][patch row][channel]
#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        float tmp[4][4][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = bottom_blob.channel(k + kk).row(ti * 2) + (tj * 2);

            // pass 1: per patch row, combine the four column values
            for (int m = 0; m < 4; m++)
            {
#if __ARM_NEON
                float32x2_t _r0 = vdup_n_f32(0.f);
                float32x2_t _r1 = vdup_n_f32(0.f);
                float32x2_t _r2 = vdup_n_f32(0.f);
                float32x2_t _r3 = vdup_n_f32(0.f);
#else
                // rCx: column C of this row, channel x of the pair
                float r00 = 0.f;
                float r01 = 0.f;
                float r10 = 0.f;
                float r11 = 0.f;
                float r20 = 0.f;
                float r21 = 0.f;
                float r30 = 0.f;
                float r31 = 0.f;
#endif

                if (ti * 2 + m < h)
                {
                    // if (elempack == 1)
                    {
                        // same position in the next channel
                        const float* r1 = r0 + N;

#if __ARM_NEON
                        // load four columns of both channels, then interleave so
                        // each 64-bit half holds one column for the channel pair
                        float32x4_t _t0 = vld1q_f32(r0);
                        float32x4_t _t1 = vld1q_f32(r1);
                        float32x4x2_t _t01 = vzipq_f32(_t0, _t1);

                        _r0 = vget_low_f32(_t01.val[0]);
                        if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
                        if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
                        if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
#else
                        r00 = r0[0];
                        r01 = r1[0];
                        if (tj * 2 + 1 < w)
                        {
                            r10 = r0[1];
                            r11 = r1[1];
                        }
                        if (tj * 2 + 2 < w)
                        {
                            r20 = r0[2];
                            r21 = r1[2];
                        }
                        if (tj * 2 + 3 < w)
                        {
                            r30 = r0[3];
                            r31 = r1[3];
                        }
#endif
                    }
                }

#if __ARM_NEON
                float32x2_t _tmp0 = vsub_f32(_r0, _r2);
                float32x2_t _tmp1 = vadd_f32(_r1, _r2);
                float32x2_t _tmp2 = vsub_f32(_r2, _r1);
                float32x2_t _tmp3 = vsub_f32(_r3, _r1);

                vst1_f32(tmp[0][m], _tmp0);
                vst1_f32(tmp[1][m], _tmp1);
                vst1_f32(tmp[2][m], _tmp2);
                vst1_f32(tmp[3][m], _tmp3);
#else
                tmp[0][m][0] = r00 - r20;
                tmp[0][m][1] = r01 - r21;
                tmp[1][m][0] = r10 + r20;
                tmp[1][m][1] = r11 + r21;
                tmp[2][m][0] = r20 - r10;
                tmp[2][m][1] = r21 - r11;
                tmp[3][m][0] = r30 - r10;
                tmp[3][m][1] = r31 - r11;
#endif

                r0 += w;
            }

            // four consecutive coefficient planes for this tile
            float* p0 = (float*)B + kk * max_jj * 16 + jj * 2;
            float* p1 = p0 + max_jj * 2;
            float* p2 = p0 + max_jj * 2 * 2;
            float* p3 = p0 + max_jj * 2 * 3;

            // pass 2: same combination across the four rows of each pass-1 output
            for (int m = 0; m < 4; m++)
            {
#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(tmp[m][0]);
                float32x2_t _r1 = vld1_f32(tmp[m][1]);
                float32x2_t _r2 = vld1_f32(tmp[m][2]);
                float32x2_t _r3 = vld1_f32(tmp[m][3]);

                float32x2_t _tmp0 = vsub_f32(_r0, _r2);
                float32x2_t _tmp1 = vadd_f32(_r1, _r2);
                float32x2_t _tmp2 = vsub_f32(_r2, _r1);
                float32x2_t _tmp3 = vsub_f32(_r3, _r1);

                vst1_f32(p0, _tmp0);
                vst1_f32(p1, _tmp1);
                vst1_f32(p2, _tmp2);
                vst1_f32(p3, _tmp3);
#else
                float r00 = tmp[m][0][0];
                float r01 = tmp[m][0][1];
                float r10 = tmp[m][1][0];
                float r11 = tmp[m][1][1];
                float r20 = tmp[m][2][0];
                float r21 = tmp[m][2][1];
                float r30 = tmp[m][3][0];
                float r31 = tmp[m][3][1];

                p0[0] = r00 - r20;
                p0[1] = r01 - r21;
                p1[0] = r10 + r20;
                p1[1] = r11 + r21;
                p2[0] = r20 - r10;
                p2[1] = r21 - r11;
                p3[0] = r30 - r10;
                p3[1] = r31 - r11;
#endif

                p0 += max_jj * 4 * 2;
                p1 += max_jj * 4 * 2;
                p2 += max_jj * 4 * 2;
                p3 += max_jj * 4 * 2;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 2;
    // ---- leftover single channels, plain scalar code, no OpenMP pragma ----
    for (int kk = remain_max_kk_start; kk < max_kk; kk++)
    {
        // scratch for the first pass: tmp[pass-1 output][patch row]
        float tmp[4][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0123 = bottom_blob.channel(k + kk).row(ti * 2) + (tj * 2);

            // pass 1: per patch row, combine the four column values
            for (int m = 0; m < 4; m++)
            {
                float r0 = 0.f;
                float r1 = 0.f;
                float r2 = 0.f;
                float r3 = 0.f;

                if (ti * 2 + m < h)
                {
                    // if (elempack == 1)
                    {
                        r0 = r0123[0];
                        if (tj * 2 + 1 < w) r1 = r0123[1];
                        if (tj * 2 + 2 < w) r2 = r0123[2];
                        if (tj * 2 + 3 < w) r3 = r0123[3];
                    }
                }

                tmp[0][m] = r0 - r2;
                tmp[1][m] = r1 + r2;
                tmp[2][m] = r2 - r1;
                tmp[3][m] = r3 - r1;

                r0123 += w;
            }

            // four consecutive coefficient planes for this tile
            float* p0 = (float*)B + kk * max_jj * 16 + jj;
            float* p1 = p0 + max_jj;
            float* p2 = p0 + max_jj * 2;
            float* p3 = p0 + max_jj * 3;

            // pass 2: same combination across the four rows of each pass-1 output
            for (int m = 0; m < 4; m++)
            {
                float r0 = tmp[m][0];
                float r1 = tmp[m][1];
                float r2 = tmp[m][2];
                float r3 = tmp[m][3];

                p0[0] = r0 - r2;
                p1[0] = r1 + r2;
                p2[0] = r2 - r1;
                p3[0] = r3 - r1;

                p0 += max_jj * 4;
                p1 += max_jj * 4;
                p2 += max_jj * 4;
                p3 += max_jj * 4;
            }
        }
    }
}

static inline void conv3x3s1_winograd23_transform_output_tile(const Mat& top_tile, Mat& top_blob, const Mat& bias, int i, int max_ii, int j, int max_jj)
{
    // const float otm[2][4] = {
    //     {1.0f,  1.0f,  1.0f,  0.0f},
    //     {0.0f,  1.0f, -1.0f,  1.0f}
    // };

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int out_elempack = top_blob.elempack;
    const size_t N = top_blob.cstep * out_elempack;

    const int w_tiles = (outw + 1) / 2;

    const float* biasptr = bias;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    for (; ii + 7 < max_ii; ii += 8)
    {
        float32x4_t _bias0 = biasptr ? vld1q_f32(biasptr + i + ii) : vdupq_n_f32(0.f);
        float32x4_t _bias1 = biasptr ? vld1q_f32(biasptr + i + ii + 4) : vdupq_n_f32(0.f);

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[2][4][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 16 + jj * 8;
            const float* r1 = r0 + max_jj * 8;
            const float* r2 = r0 + max_jj * 8 * 2;
            const float* r3 = r0 + max_jj * 8 * 3;

            for (int m = 0; m < 4; m++)
            {
                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r30 = vld1q_f32(r3);
                float32x4_t _r31 = vld1q_f32(r3 + 4);

                float32x4_t _tmp00 = vaddq_f32(vaddq_f32(_r00, _r10), _r20);
                float32x4_t _tmp01 = vaddq_f32(vaddq_f32(_r01, _r11), _r21);
                float32x4_t _tmp10 = vaddq_f32(vsubq_f32(_r10, _r20), _r30);
                float32x4_t _tmp11 = vaddq_f32(vsubq_f32(_r11, _r21), _r31);

                vst1q_f32(tmp[0][m], _tmp00);
                vst1q_f32(tmp[0][m] + 4, _tmp01);
                vst1q_f32(tmp[1][m], _tmp10);
                vst1q_f32(tmp[1][m] + 4, _tmp11);

                r0 += max_jj * 4 * 8;
                r1 += max_jj * 4 * 8;
                r2 += max_jj * 4 * 8;
                r3 += max_jj * 4 * 8;
            }

            float* outptr0 = top_blob.channel((i + ii) / out_elempack).row(ti * 2) + (tj * 2) * out_elempack;

            for (int m = 0; m < 2; m++)
            {
                if (ti * 2 + m >= outh)
                    continue;

                float32x4_t _r00 = vld1q_f32(tmp[m][0]);
                float32x4_t _r01 = vld1q_f32(tmp[m][0] + 4);
                float32x4_t _r10 = vld1q_f32(tmp[m][1]);
                float32x4_t _r11 = vld1q_f32(tmp[m][1] + 4);
                float32x4_t _r20 = vld1q_f32(tmp[m][2]);
                float32x4_t _r21 = vld1q_f32(tmp[m][2] + 4);
                float32x4_t _r30 = vld1q_f32(tmp[m][3]);
                float32x4_t _r31 = vld1q_f32(tmp[m][3] + 4);

                float32x4_t _tmp00 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_r00, _r10), _r20));
                float32x4_t _tmp01 = vaddq_f32(_bias1, vaddq_f32(vaddq_f32(_r01, _r11), _r21));
                float32x4_t _tmp10 = vaddq_f32(_bias0, vaddq_f32(vsubq_f32(_r10, _r20), _r30));
                float32x4_t _tmp11 = vaddq_f32(_bias1, vaddq_f32(vsubq_f32(_r11, _r21), _r31));

                if (out_elempack == 4)
                {
                    float* outptr1 = outptr0 + N;

                    vst1q_f32(outptr0, _tmp00);
                    vst1q_f32(outptr1, _tmp01);
                    if (tj * 2 + 1 < outw)
                    {
                        vst1q_f32(outptr0 + 4, _tmp10);
                        vst1q_f32(outptr1 + 4, _tmp11);
                    }
                }
                if (out_elempack == 1)
                {
                    float tmp0[8];
                    float tmp1[8];
                    vst1q_f32(tmp0, _tmp00);
                    vst1q_f32(tmp0 + 4, _tmp01);
                    vst1q_f32(tmp1, _tmp10);
                    vst1q_f32(tmp1 + 4, _tmp11);

                    float* outptr1 = outptr0 + N;
                    float* outptr2 = outptr0 + N * 2;
                    float* outptr3 = outptr0 + N * 3;
                    float* outptr4 = outptr0 + N * 4;
                    float* outptr5 = outptr0 + N * 5;
                    float* outptr6 = outptr0 + N * 6;
                    float* outptr7 = outptr0 + N * 7;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];
                    outptr4[0] = tmp0[4];
                    outptr5[0] = tmp0[5];
                    outptr6[0] = tmp0[6];
                    outptr7[0] = tmp0[7];

                    if (tj * 2 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                        outptr4[1] = tmp1[4];
                        outptr5[1] = tmp1[5];
                        outptr6[1] = tmp1[6];
                        outptr7[1] = tmp1[7];
                    }
                }

                outptr0 += outw * out_elempack;
            }
        }
    }
#endif // __aarch64__
    for (; ii + 3 < max_ii; ii += 4)
    {
        float32x4_t _bias0 = biasptr ? vld1q_f32(biasptr + i + ii) : vdupq_n_f32(0.f);

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[2][4][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 16 + jj * 4;
            const float* r1 = r0 + max_jj * 4;
            const float* r2 = r0 + max_jj * 4 * 2;
            const float* r3 = r0 + max_jj * 4 * 3;

            for (int m = 0; m < 4; m++)
            {
                float32x4_t _r0 = vld1q_f32(r0);
                float32x4_t _r1 = vld1q_f32(r1);
                float32x4_t _r2 = vld1q_f32(r2);
                float32x4_t _r3 = vld1q_f32(r3);

                float32x4_t _tmp0 = vaddq_f32(vaddq_f32(_r0, _r1), _r2);
                float32x4_t _tmp1 = vaddq_f32(vsubq_f32(_r1, _r2), _r3);

                vst1q_f32(tmp[0][m], _tmp0);
                vst1q_f32(tmp[1][m], _tmp1);

                r0 += max_jj * 4 * 4;
                r1 += max_jj * 4 * 4;
                r2 += max_jj * 4 * 4;
                r3 += max_jj * 4 * 4;
            }

            float* outptr0 = top_blob.channel((i + ii) / out_elempack).row(ti * 2) + (tj * 2) * out_elempack;

            for (int m = 0; m < 2; m++)
            {
                if (ti * 2 + m >= outh)
                    continue;

                float32x4_t _r0 = vld1q_f32(tmp[m][0]);
                float32x4_t _r1 = vld1q_f32(tmp[m][1]);
                float32x4_t _r2 = vld1q_f32(tmp[m][2]);
                float32x4_t _r3 = vld1q_f32(tmp[m][3]);

                float32x4_t _tmp0 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_r0, _r1), _r2));
                float32x4_t _tmp1 = vaddq_f32(_bias0, vaddq_f32(vsubq_f32(_r1, _r2), _r3));

                if (out_elempack == 4)
                {
                    vst1q_f32(outptr0, _tmp0);
                    if (tj * 2 + 1 < outw) vst1q_f32(outptr0 + 4, _tmp1);
                }
                if (out_elempack == 1)
                {
                    float tmp0[4];
                    float tmp1[4];
                    vst1q_f32(tmp0, _tmp0);
                    vst1q_f32(tmp1, _tmp1);

                    float* outptr1 = outptr0 + N;
                    float* outptr2 = outptr0 + N * 2;
                    float* outptr3 = outptr0 + N * 3;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];

                    if (tj * 2 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                    }
                }

                outptr0 += outw * out_elempack;
            }
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
#if __ARM_NEON
        float32x2_t _bias0 = biasptr ? vld1_f32(biasptr + i + ii) : vdup_n_f32(0.f);
#else
        float bias0 = biasptr ? biasptr[i + ii] : 0.f;
        float bias1 = biasptr ? biasptr[i + ii + 1] : 0.f;
#endif

#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        float tmp[2][4][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 16 + jj * 2;
            const float* r1 = r0 + max_jj * 2;
            const float* r2 = r0 + max_jj * 2 * 2;
            const float* r3 = r0 + max_jj * 2 * 3;

            for (int m = 0; m < 4; m++)
            {
#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(r0);
                float32x2_t _r1 = vld1_f32(r1);
                float32x2_t _r2 = vld1_f32(r2);
                float32x2_t _r3 = vld1_f32(r3);

                float32x2_t _tmp0 = vadd_f32(vadd_f32(_r0, _r1), _r2);
                float32x2_t _tmp1 = vadd_f32(vsub_f32(_r1, _r2), _r3);

                vst1_f32(tmp[0][m], _tmp0);
                vst1_f32(tmp[1][m], _tmp1);
#else
                tmp[0][m][0] = r0[0] + r1[0] + r2[0];
                tmp[0][m][1] = r0[1] + r1[1] + r2[1];
                tmp[1][m][0] = r1[0] - r2[0] + r3[0];
                tmp[1][m][1] = r1[1] - r2[1] + r3[1];
#endif

                r0 += max_jj * 4 * 2;
                r1 += max_jj * 4 * 2;
                r2 += max_jj * 4 * 2;
                r3 += max_jj * 4 * 2;
            }

            float* outptr0 = top_blob.channel(i + ii).row(ti * 2) + (tj * 2);

            for (int m = 0; m < 2; m++)
            {
                if (ti * 2 + m >= outh)
                    continue;

#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(tmp[m][0]);
                float32x2_t _r1 = vld1_f32(tmp[m][1]);
                float32x2_t _r2 = vld1_f32(tmp[m][2]);
                float32x2_t _r3 = vld1_f32(tmp[m][3]);

                float32x2_t _tmp0 = vadd_f32(_bias0, vadd_f32(vadd_f32(_r0, _r1), _r2));
                float32x2_t _tmp1 = vadd_f32(_bias0, vadd_f32(vsub_f32(_r1, _r2), _r3));
#else
                float r00 = tmp[m][0][0];
                float r01 = tmp[m][0][1];
                float r10 = tmp[m][1][0];
                float r11 = tmp[m][1][1];
                float r20 = tmp[m][2][0];
                float r21 = tmp[m][2][1];
                float r30 = tmp[m][3][0];
                float r31 = tmp[m][3][1];

                float tmp00 = bias0 + r00 + r10 + r20;
                float tmp01 = bias1 + r01 + r11 + r21;
                float tmp10 = bias0 + r10 - r20 + r30;
                float tmp11 = bias1 + r11 - r21 + r31;
#endif

                // if (out_elempack == 1)
                {
                    float* outptr1 = outptr0 + N;

#if __ARM_NEON
                    outptr0[0] = vget_lane_f32(_tmp0, 0);
                    outptr1[0] = vget_lane_f32(_tmp0, 1);
                    if (tj * 2 + 1 < outw)
                    {
                        outptr0[1] = vget_lane_f32(_tmp1, 0);
                        outptr1[1] = vget_lane_f32(_tmp1, 1);
                    }
#else
                    outptr0[0] = tmp00;
                    outptr1[0] = tmp01;
                    if (tj * 2 + 1 < outw)
                    {
                        outptr0[1] = tmp10;
                        outptr1[1] = tmp11;
                    }
#endif
                }

                outptr0 += outw;
            }
        }
    }
    for (; ii < max_ii; ii++)
    {
        float bias0 = biasptr ? biasptr[i + ii] : 0.f;

        float tmp[2][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 16 + jj;
            const float* r1 = r0 + max_jj;
            const float* r2 = r0 + max_jj * 2;
            const float* r3 = r0 + max_jj * 3;

            for (int m = 0; m < 4; m++)
            {
                tmp[0][m] = r0[0] + r1[0] + r2[0];
                tmp[1][m] = r1[0] - r2[0] + r3[0];

                r0 += max_jj * 4;
                r1 += max_jj * 4;
                r2 += max_jj * 4;
                r3 += max_jj * 4;
            }

            float* outptr0 = top_blob.channel(i + ii).row(ti * 2) + (tj * 2);

            for (int m = 0; m < 2; m++)
            {
                if (ti * 2 + m >= outh)
                    continue;

                float r0 = tmp[m][0];
                float r1 = tmp[m][1];
                float r2 = tmp[m][2];
                float r3 = tmp[m][3];

                float tmp0 = bias0 + r0 + r1 + r2;
                float tmp1 = bias0 + r1 - r2 + r3;

                // if (out_elempack == 1)
                {
                    outptr0[0] = tmp0;
                    if (tj * 2 + 1 < outw) outptr0[1] = tmp1;
                }

                outptr0 += outw;
            }
        }
    }
}

// Driver for the winograd23 3x3 stride-1 convolution.
//
// Three stages, all expressed over blocks of a M x N x K problem where
//   M = top_blob.c * top_blob.elempack
//   N = number of 2x2 output tiles
//   K = bottom_blob.c * bottom_blob.elempack
// 1. transform the input block by block and pack it into the BT workspace
// 2. for every M block, multiply the matching AT and BT blocks into a
//    per-thread top_tile scratch buffer
// 3. run the output transform on top_tile, which writes into top_blob
//
// AT is only read here; it is presumably the packed kernel produced by the
// matching transform_kernel routine (not visible in this function).
// nT is forwarded to the tiling heuristic and used as the OpenMP thread count.
//
// Returns 0 on success and -100 when any workspace Mat ends up empty.
static int conv3x3s1_winograd23(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
    // one tile per 2x2 output patch, rounded up so partial edge patches get a tile
    const int w_tiles = (top_blob.w + 1) / 2;
    const int h_tiles = (top_blob.h + 1) / 2;

    const int M = top_blob.c * top_blob.elempack;
    const int N = w_tiles * h_tiles;
    const int K = bottom_blob.c * bottom_blob.elempack;
    const int B = 16;

    int TILE_M, TILE_N, TILE_K;
    conv3x3s1_winograd_get_optimal_tile_mnk(M, N, K, B, TILE_M, TILE_N, TILE_K, nT);

    // block counts along each dimension (ceiling division)
    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;
    const int nn_NK = nn_N * nn_K;

    // packed transformed input, addressed as BT.channel(n block).depth(k block)
    Mat BT(TILE_K * TILE_N, B, nn_K, nn_N, 4u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const bool fewer_blocks_than_threads = nT > 1 && nn_NK < nT;

    if (fewer_blocks_than_threads)
    {
        // visit the blocks one after another with a single scratch buffer and
        // hand the full thread count nT down to the per-block helpers
        Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
        if (B_tile.empty())
            return -100;

        for (int blk_n = 0; blk_n < nn_N; blk_n++)
        {
            const int j = blk_n * TILE_N;
            const int max_jj = std::min((N - j), TILE_N);

            for (int blk_k = 0; blk_k < nn_K; blk_k++)
            {
                const int k = blk_k * TILE_K;
                const int max_kk = std::min((K - k), TILE_K);

                // transform input
                conv3x3s1_winograd23_transform_input_tile(bottom_blob, B_tile, j, max_jj, k, max_kk, nT);

                Mat BT_tile = BT.channel(blk_n).depth(blk_k);

                conv3x3s1_winograd_transpose_pack_B_tile(B_tile, BT_tile, B, max_jj, max_kk, nT);
            }
        }
    }
    else
    {
        // spread the blocks over the threads; every thread owns one channel of
        // B_tileX as scratch and the helpers are called with a thread count of 1
        Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
        if (B_tileX.empty())
            return -100;

        #pragma omp parallel for num_threads(nT)
        for (int blk = 0; blk < nn_NK; blk++)
        {
            Mat B_tile = B_tileX.channel(get_omp_thread_num());

            // unflatten the block index, k varies fastest
            const int blk_n = blk / nn_K;
            const int blk_k = blk % nn_K;

            const int j = blk_n * TILE_N;
            const int k = blk_k * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            // transform input
            conv3x3s1_winograd23_transform_input_tile(bottom_blob, B_tile, j, max_jj, k, max_kk, 1);

            Mat BT_tile = BT.channel(blk_n).depth(blk_k);

            conv3x3s1_winograd_transpose_pack_B_tile(B_tile, BT_tile, B, max_jj, max_kk, 1);
        }
    }

    // per-thread scratch for the product of one M block with one N block
    Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
    if (top_tileX.empty())
        return -100;

    #pragma omp parallel for num_threads(nT)
    for (int blk_m = 0; blk_m < nn_M; blk_m++)
    {
        Mat top_tile = top_tileX.channel(get_omp_thread_num());

        const int i = blk_m * TILE_M;
        const int max_ii = std::min((M - i), TILE_M);

        for (int blk_n = 0; blk_n < nn_N; blk_n++)
        {
            const int j = blk_n * TILE_N;
            const int max_jj = std::min((N - j), TILE_N);

            // walk the K blocks in order; the gemm helper receives the block offset k
            for (int blk_k = 0; blk_k < nn_K; blk_k++)
            {
                const int k = blk_k * TILE_K;
                const int max_kk = std::min((K - k), TILE_K);

                const Mat AT_tile = AT.channel(blk_m).depth(blk_k);
                const Mat BT_tile = BT.channel(blk_n).depth(blk_k);

                conv3x3s1_winograd_gemm_transB_packed_tile(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk, opt.use_a53_a55_optimized_kernel);
            }

            // transform output
            conv3x3s1_winograd23_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj);
        }
    }

    return 0;
}

// Expands a block of 3x3 filters into their 6x6 winograd43 coefficients.
//
// kernel is read as a flat float array with 9 values per filter, the filter for
// (output channel oc, input channel ic) starting at (oc * inch + ic) * 9.
// The block covers output channels [i, i + max_ii) and input channels
// [k, k + max_kk). Results are appended to A in visiting order (input channel
// fastest), 36 floats per filter.
static inline void conv3x3s1_winograd43_transform_kernel_tile(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
{
    // the 6x3 transform matrix applied on both sides of the filter:
    //     {  1,      0,       0   }
    //     { -2/3,  -sq2/3,  -1/3  }
    //     { -2/3,   sq2/3,  -1/3  }
    //     {  1/6,   sq2/6,   1/3  }
    //     {  1/6,  -sq2/6,   1/3  }
    //     {  0,      0,       1   }
    // only the magnitudes are kept as constants, signs live in the expressions
    const float sq2 = 1.41421356237f;
    const float c_2d3 = 2.0f / 3;
    const float c_s2d3 = sq2 / 3;
    const float c_1d3 = 1.0f / 3;
    const float c_1d6 = 1.0f / 6;
    const float c_s2d6 = sq2 / 6;

    const float* weights = (const float*)kernel;
    float* dst = A;

    for (int ii = 0; ii < max_ii; ii++)
    {
        for (int kk = 0; kk < max_kk; kk++)
        {
            const float* filt = weights + (i + ii) * inch * 9 + (k + kk) * 9;

            float g[6][3];

            // pass 1: transform each of the three filter rows, storing row m as column m of g
            for (int m = 0; m < 3; m++)
            {
                const float a = filt[m * 3];
                const float b = filt[m * 3 + 1];
                const float c = filt[m * 3 + 2];

                g[0][m] = a;
                g[1][m] = -a * c_2d3 - b * c_s2d3 - c * c_1d3;
                g[2][m] = -a * c_2d3 + b * c_s2d3 - c * c_1d3;
                g[3][m] = a * c_1d6 + b * c_s2d6 + c * c_1d3;
                g[4][m] = a * c_1d6 - b * c_s2d6 + c * c_1d3;
                g[5][m] = c;
            }

            // pass 2: apply the same transform to every row of g and emit 6 values per row
            for (int m = 0; m < 6; m++)
            {
                const float a = g[m][0];
                const float b = g[m][1];
                const float c = g[m][2];

                dst[0] = a;
                dst[1] = -a * c_2d3 - b * c_s2d3 - c * c_1d3;
                dst[2] = -a * c_2d3 + b * c_s2d3 - c * c_1d3;
                dst[3] = a * c_1d6 + b * c_s2d6 + c * c_1d3;
                dst[4] = a * c_1d6 - b * c_s2d6 + c * c_1d3;
                dst[5] = c;

                dst += 6;
            }
        }
    }
}

// Builds the packed winograd43 kernel AT from the raw 3x3 weights.
//
// The outch x inch filter grid is cut into TILE_M x TILE_K blocks. Each block
// is transformed into a per-thread scratch buffer and then packed into
// AT.channel(m block).depth(k block). The M blocks are distributed over
// opt.num_threads OpenMP threads.
static void conv3x3s1_winograd43_transform_kernel(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt)
{
    const int M = outch;
    const int K = inch;
    const int B = 36; // 6x6 coefficients per filter

    // N is passed as 0 here; TILE_N is returned but not needed for the kernel side
    int TILE_M, TILE_N, TILE_K;
    conv3x3s1_winograd_get_optimal_tile_mnk(M, 0, K, B, TILE_M, TILE_N, TILE_K, opt.num_threads);

    // block counts (ceiling division)
    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // one scratch channel per thread
    Mat A_tileX(B * TILE_M * TILE_K, 1, opt.num_threads);

    AT.create(TILE_K * TILE_M, B, nn_K, nn_M);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int blk_m = 0; blk_m < nn_M; blk_m++)
    {
        Mat A_tile = A_tileX.channel(get_omp_thread_num());

        const int i = blk_m * TILE_M;
        const int max_ii = std::min((M - i), TILE_M);

        for (int blk_k = 0; blk_k < nn_K; blk_k++)
        {
            const int k = blk_k * TILE_K;
            const int max_kk = std::min((K - k), TILE_K);

            conv3x3s1_winograd43_transform_kernel_tile(kernel, A_tile, inch, i, max_ii, k, max_kk);

            Mat AT_tile = AT.channel(blk_m).depth(blk_k);

            conv3x3s1_winograd_pack_A_tile(A_tile, AT_tile, B, max_ii, max_kk);
        }
    }
}

static inline void conv3x3s1_winograd43_transform_input_tile(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
{
    const float sq2 = 1.41421356237;
    const float sq2_d2 = 1.41421356237 / 2;

    // const float itm[6][6] = {
    //     {1.0f,  0.0f,  -2.5f,  0.0f,  1.0f, 0.0f},
    //     {0.0f, -sq2,   -2.0f,  sq2/2, 1.0f, 0.0f},
    //     {0.0f,  sq2,   -2.0f, -sq2/2, 1.0f, 0.0f},
    //     {0.0f, -sq2/2, -0.5f,  sq2,   1.0f, 0.0f},
    //     {0.0f,  sq2/2, -0.5f, -sq2,   1.0f, 0.0f},
    //     {0.0f,  1.0f,   0.0f,  -2.5f, 0.0f, 1.0f}
    // };

    // 0 =  r00 + r04 - 2.5f * r02
    // 1 = -(sq2 * r01 - sq2_d2 * r03) + (r04 - 2 * r02)
    // 2 =  (sq2 * r01 - sq2_d2 * r03) + (r04 - 2 * r02)
    // 3 =  (sq2 * r03 - sq2_d2 * r01) + (r04 - 0.5f * r02)
    // 4 = -(sq2 * r03 - sq2_d2 * r01) + (r04 - 0.5f * r02)
    // 5 =  r01 + r05 - 2.5f * r03

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int elempack = bottom_blob.elempack;
    const size_t N = bottom_blob.cstep * elempack;

    const int w_tiles = (w + 1) / 4;

    int nn_max_kk = 0;
    int remain_max_kk_start = 0;
#if __ARM_NEON
#if __aarch64__
    nn_max_kk = (max_kk - remain_max_kk_start) / 8;
    #pragma omp parallel for num_threads(nT)
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 8;

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[6][6][8];

        const float coeffs[4] = {sq2, -sq2_d2, -2.f, -0.5f};
        float32x4_t _coeffs = vld1q_f32(coeffs);
        float32x4_t _vm2_5 = vdupq_n_f32(-2.5f);

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = bottom_blob.channel((k + kk) / elempack).row(ti * 4) + (tj * 4) * elempack;

            for (int m = 0; m < 6; m++)
            {
                float32x4_t _r00 = vdupq_n_f32(0.f);
                float32x4_t _r01 = vdupq_n_f32(0.f);
                float32x4_t _r10 = vdupq_n_f32(0.f);
                float32x4_t _r11 = vdupq_n_f32(0.f);
                float32x4_t _r20 = vdupq_n_f32(0.f);
                float32x4_t _r21 = vdupq_n_f32(0.f);
                float32x4_t _r30 = vdupq_n_f32(0.f);
                float32x4_t _r31 = vdupq_n_f32(0.f);
                float32x4_t _r40 = vdupq_n_f32(0.f);
                float32x4_t _r41 = vdupq_n_f32(0.f);
                float32x4_t _r50 = vdupq_n_f32(0.f);
                float32x4_t _r51 = vdupq_n_f32(0.f);

                if (ti * 4 + m < h)
                {
                    if (elempack == 4)
                    {
                        const float* r1 = r0 + N;

                        _r00 = vld1q_f32(r0);
                        _r01 = vld1q_f32(r1);
                        if (tj * 4 + 1 < w)
                        {
                            _r10 = vld1q_f32(r0 + 4);
                            _r11 = vld1q_f32(r1 + 4);
                        }
                        if (tj * 4 + 2 < w)
                        {
                            _r20 = vld1q_f32(r0 + 8);
                            _r21 = vld1q_f32(r1 + 8);
                        }
                        if (tj * 4 + 3 < w)
                        {
                            _r30 = vld1q_f32(r0 + 12);
                            _r31 = vld1q_f32(r1 + 12);
                        }
                        if (tj * 4 + 4 < w)
                        {
                            _r40 = vld1q_f32(r0 + 16);
                            _r41 = vld1q_f32(r1 + 16);
                        }
                        if (tj * 4 + 5 < w)
                        {
                            _r50 = vld1q_f32(r0 + 20);
                            _r51 = vld1q_f32(r1 + 20);
                        }
                    }
                    if (elempack == 1)
                    {
                        const float* r1 = r0 + N;
                        const float* r2 = r0 + N * 2;
                        const float* r3 = r0 + N * 3;
                        const float* r4 = r0 + N * 4;
                        const float* r5 = r0 + N * 5;
                        const float* r6 = r0 + N * 6;
                        const float* r7 = r0 + N * 7;

                        float32x4_t _t0 = vld1q_f32(r0);
                        float32x4_t _t1 = vld1q_f32(r1);
                        float32x4_t _t2 = vld1q_f32(r2);
                        float32x4_t _t3 = vld1q_f32(r3);
                        float32x4_t _t4 = vld1q_f32(r4);
                        float32x4_t _t5 = vld1q_f32(r5);
                        float32x4_t _t6 = vld1q_f32(r6);
                        float32x4_t _t7 = vld1q_f32(r7);

                        transpose4x4_ps(_t0, _t1, _t2, _t3);
                        transpose4x4_ps(_t4, _t5, _t6, _t7);

                        _r00 = _t0;
                        _r01 = _t4;
                        if (tj * 4 + 1 < w)
                        {
                            _r10 = _t1;
                            _r11 = _t5;
                        }
                        if (tj * 4 + 2 < w)
                        {
                            _r20 = _t2;
                            _r21 = _t6;
                        }
                        if (tj * 4 + 3 < w)
                        {
                            _r30 = _t3;
                            _r31 = _t7;
                        }
                        if (tj * 4 + 4 < w)
                        {
                            float tmp[8] = {r0[4], r1[4], r2[4], r3[4], r4[4], r5[4], r6[4], r7[4]};
                            _r40 = vld1q_f32(tmp);
                            _r41 = vld1q_f32(tmp + 4);
                        }
                        if (tj * 4 + 5 < w)
                        {
                            float tmp[8] = {r0[5], r1[5], r2[5], r3[5], r4[5], r5[5], r6[5], r7[5]};
                            _r50 = vld1q_f32(tmp);
                            _r51 = vld1q_f32(tmp + 4);
                        }
                    }
                }

                float32x4_t _tmp12a0 = vfmaq_laneq_f32(vmulq_laneq_f32(_r10, _coeffs, 0), _r30, _coeffs, 1);
                float32x4_t _tmp12a1 = vfmaq_laneq_f32(vmulq_laneq_f32(_r11, _coeffs, 0), _r31, _coeffs, 1);
                float32x4_t _tmp12b0 = vfmaq_laneq_f32(_r40, _r20, _coeffs, 2);
                float32x4_t _tmp12b1 = vfmaq_laneq_f32(_r41, _r21, _coeffs, 2);
                float32x4_t _tmp34a0 = vfmaq_laneq_f32(vmulq_laneq_f32(_r30, _coeffs, 0), _r10, _coeffs, 1);
                float32x4_t _tmp34a1 = vfmaq_laneq_f32(vmulq_laneq_f32(_r31, _coeffs, 0), _r11, _coeffs, 1);
                float32x4_t _tmp34b0 = vfmaq_laneq_f32(_r40, _r20, _coeffs, 3);
                float32x4_t _tmp34b1 = vfmaq_laneq_f32(_r41, _r21, _coeffs, 3);

                float32x4_t _tmp00 = vfmaq_f32(vaddq_f32(_r00, _r40), _r20, _vm2_5);
                float32x4_t _tmp01 = vfmaq_f32(vaddq_f32(_r01, _r41), _r21, _vm2_5);
                float32x4_t _tmp10 = vsubq_f32(_tmp12b0, _tmp12a0);
                float32x4_t _tmp11 = vsubq_f32(_tmp12b1, _tmp12a1);
                float32x4_t _tmp20 = vaddq_f32(_tmp12b0, _tmp12a0);
                float32x4_t _tmp21 = vaddq_f32(_tmp12b1, _tmp12a1);
                float32x4_t _tmp30 = vaddq_f32(_tmp34b0, _tmp34a0);
                float32x4_t _tmp31 = vaddq_f32(_tmp34b1, _tmp34a1);
                float32x4_t _tmp40 = vsubq_f32(_tmp34b0, _tmp34a0);
                float32x4_t _tmp41 = vsubq_f32(_tmp34b1, _tmp34a1);
                float32x4_t _tmp50 = vfmaq_f32(vaddq_f32(_r10, _r50), _r30, _vm2_5);
                float32x4_t _tmp51 = vfmaq_f32(vaddq_f32(_r11, _r51), _r31, _vm2_5);

                vst1q_f32(tmp[0][m], _tmp00);
                vst1q_f32(tmp[0][m] + 4, _tmp01);
                vst1q_f32(tmp[1][m], _tmp10);
                vst1q_f32(tmp[1][m] + 4, _tmp11);
                vst1q_f32(tmp[2][m], _tmp20);
                vst1q_f32(tmp[2][m] + 4, _tmp21);
                vst1q_f32(tmp[3][m], _tmp30);
                vst1q_f32(tmp[3][m] + 4, _tmp31);
                vst1q_f32(tmp[4][m], _tmp40);
                vst1q_f32(tmp[4][m] + 4, _tmp41);
                vst1q_f32(tmp[5][m], _tmp50);
                vst1q_f32(tmp[5][m] + 4, _tmp51);

                r0 += w * elempack;
            }

            float* p0 = (float*)B + kk * max_jj * 36 + jj * 8;
            float* p1 = p0 + max_jj * 8;
            float* p2 = p0 + max_jj * 8 * 2;
            float* p3 = p0 + max_jj * 8 * 3;
            float* p4 = p0 + max_jj * 8 * 4;
            float* p5 = p0 + max_jj * 8 * 5;

            for (int m = 0; m < 6; m++)
            {
                float32x4_t _r00 = vld1q_f32(tmp[m][0]);
                float32x4_t _r01 = vld1q_f32(tmp[m][0] + 4);
                float32x4_t _r10 = vld1q_f32(tmp[m][1]);
                float32x4_t _r11 = vld1q_f32(tmp[m][1] + 4);
                float32x4_t _r20 = vld1q_f32(tmp[m][2]);
                float32x4_t _r21 = vld1q_f32(tmp[m][2] + 4);
                float32x4_t _r30 = vld1q_f32(tmp[m][3]);
                float32x4_t _r31 = vld1q_f32(tmp[m][3] + 4);
                float32x4_t _r40 = vld1q_f32(tmp[m][4]);
                float32x4_t _r41 = vld1q_f32(tmp[m][4] + 4);
                float32x4_t _r50 = vld1q_f32(tmp[m][5]);
                float32x4_t _r51 = vld1q_f32(tmp[m][5] + 4);

                float32x4_t _tmp12a0 = vfmaq_laneq_f32(vmulq_laneq_f32(_r10, _coeffs, 0), _r30, _coeffs, 1);
                float32x4_t _tmp12a1 = vfmaq_laneq_f32(vmulq_laneq_f32(_r11, _coeffs, 0), _r31, _coeffs, 1);
                float32x4_t _tmp12b0 = vfmaq_laneq_f32(_r40, _r20, _coeffs, 2);
                float32x4_t _tmp12b1 = vfmaq_laneq_f32(_r41, _r21, _coeffs, 2);
                float32x4_t _tmp34a0 = vfmaq_laneq_f32(vmulq_laneq_f32(_r30, _coeffs, 0), _r10, _coeffs, 1);
                float32x4_t _tmp34a1 = vfmaq_laneq_f32(vmulq_laneq_f32(_r31, _coeffs, 0), _r11, _coeffs, 1);
                float32x4_t _tmp34b0 = vfmaq_laneq_f32(_r40, _r20, _coeffs, 3);
                float32x4_t _tmp34b1 = vfmaq_laneq_f32(_r41, _r21, _coeffs, 3);

                float32x4_t _tmp00 = vfmaq_f32(vaddq_f32(_r00, _r40), _r20, _vm2_5);
                float32x4_t _tmp01 = vfmaq_f32(vaddq_f32(_r01, _r41), _r21, _vm2_5);
                float32x4_t _tmp10 = vsubq_f32(_tmp12b0, _tmp12a0);
                float32x4_t _tmp11 = vsubq_f32(_tmp12b1, _tmp12a1);
                float32x4_t _tmp20 = vaddq_f32(_tmp12b0, _tmp12a0);
                float32x4_t _tmp21 = vaddq_f32(_tmp12b1, _tmp12a1);
                float32x4_t _tmp30 = vaddq_f32(_tmp34b0, _tmp34a0);
                float32x4_t _tmp31 = vaddq_f32(_tmp34b1, _tmp34a1);
                float32x4_t _tmp40 = vsubq_f32(_tmp34b0, _tmp34a0);
                float32x4_t _tmp41 = vsubq_f32(_tmp34b1, _tmp34a1);
                float32x4_t _tmp50 = vfmaq_f32(vaddq_f32(_r10, _r50), _r30, _vm2_5);
                float32x4_t _tmp51 = vfmaq_f32(vaddq_f32(_r11, _r51), _r31, _vm2_5);

                vst1q_f32(p0, _tmp00);
                vst1q_f32(p0 + 4, _tmp01);
                vst1q_f32(p1, _tmp10);
                vst1q_f32(p1 + 4, _tmp11);
                vst1q_f32(p2, _tmp20);
                vst1q_f32(p2 + 4, _tmp21);
                vst1q_f32(p3, _tmp30);
                vst1q_f32(p3 + 4, _tmp31);
                vst1q_f32(p4, _tmp40);
                vst1q_f32(p4 + 4, _tmp41);
                vst1q_f32(p5, _tmp50);
                vst1q_f32(p5 + 4, _tmp51);

                p0 += max_jj * 6 * 8;
                p1 += max_jj * 6 * 8;
                p2 += max_jj * 6 * 8;
                p3 += max_jj * 6 * 8;
                p4 += max_jj * 6 * 8;
                p5 += max_jj * 6 * 8;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 8;
    nn_max_kk = (max_kk - remain_max_kk_start) / 4;
#else // __aarch64__
    nn_max_kk = (max_kk - remain_max_kk_start) / 4;
    #pragma omp parallel for num_threads(nT)
#endif // __aarch64__
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 4;

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[6][6][4];

        const float coeffs[4] = {sq2, -sq2_d2, -2.f, -0.5f};
        float32x4_t _coeffs = vld1q_f32(coeffs);
        float32x4_t _vm2_5 = vdupq_n_f32(-2.5f);

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = bottom_blob.channel((k + kk) / elempack).row(ti * 4) + (tj * 4) * elempack;

            for (int m = 0; m < 6; m++)
            {
                float32x4_t _r0 = vdupq_n_f32(0.f);
                float32x4_t _r1 = vdupq_n_f32(0.f);
                float32x4_t _r2 = vdupq_n_f32(0.f);
                float32x4_t _r3 = vdupq_n_f32(0.f);
                float32x4_t _r4 = vdupq_n_f32(0.f);
                float32x4_t _r5 = vdupq_n_f32(0.f);

                if (ti * 4 + m < h)
                {
                    if (elempack == 4)
                    {
                        _r0 = vld1q_f32(r0);
                        if (tj * 4 + 1 < w) _r1 = vld1q_f32(r0 + 4);
                        if (tj * 4 + 2 < w) _r2 = vld1q_f32(r0 + 8);
                        if (tj * 4 + 3 < w) _r3 = vld1q_f32(r0 + 12);
                        if (tj * 4 + 4 < w) _r4 = vld1q_f32(r0 + 16);
                        if (tj * 4 + 5 < w) _r5 = vld1q_f32(r0 + 20);
                    }
                    if (elempack == 1)
                    {
                        const float* r1 = r0 + N;
                        const float* r2 = r0 + N * 2;
                        const float* r3 = r0 + N * 3;

                        float32x4_t _t0 = vld1q_f32(r0);
                        float32x4_t _t1 = vld1q_f32(r1);
                        float32x4_t _t2 = vld1q_f32(r2);
                        float32x4_t _t3 = vld1q_f32(r3);

                        transpose4x4_ps(_t0, _t1, _t2, _t3);

                        _r0 = _t0;
                        if (tj * 4 + 1 < w) _r1 = _t1;
                        if (tj * 4 + 2 < w) _r2 = _t2;
                        if (tj * 4 + 3 < w) _r3 = _t3;
                        if (tj * 4 + 4 < w)
                        {
                            float tmp[4] = {r0[4], r1[4], r2[4], r3[4]};
                            _r4 = vld1q_f32(tmp);
                        }
                        if (tj * 4 + 5 < w)
                        {
                            float tmp[4] = {r0[5], r1[5], r2[5], r3[5]};
                            _r5 = vld1q_f32(tmp);
                        }
                    }
                }

#if __aarch64__
                float32x4_t _tmp12a = vfmaq_laneq_f32(vmulq_laneq_f32(_r1, _coeffs, 0), _r3, _coeffs, 1);
                float32x4_t _tmp12b = vfmaq_laneq_f32(_r4, _r2, _coeffs, 2);
                float32x4_t _tmp34a = vfmaq_laneq_f32(vmulq_laneq_f32(_r3, _coeffs, 0), _r1, _coeffs, 1);
                float32x4_t _tmp34b = vfmaq_laneq_f32(_r4, _r2, _coeffs, 3);
#else
                float32x4_t _tmp12a = vmlaq_lane_f32(vmulq_lane_f32(_r1, vget_low_f32(_coeffs), 0), _r3, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp12b = vmlaq_lane_f32(_r4, _r2, vget_high_f32(_coeffs), 0);
                float32x4_t _tmp34a = vmlaq_lane_f32(vmulq_lane_f32(_r3, vget_low_f32(_coeffs), 0), _r1, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp34b = vmlaq_lane_f32(_r4, _r2, vget_high_f32(_coeffs), 1);
#endif

#if __aarch64__
                float32x4_t _tmp0 = vfmaq_f32(vaddq_f32(_r0, _r4), _r2, _vm2_5);
#else
                float32x4_t _tmp0 = vmlaq_f32(vaddq_f32(_r0, _r4), _r2, _vm2_5);
#endif
                float32x4_t _tmp1 = vsubq_f32(_tmp12b, _tmp12a);
                float32x4_t _tmp2 = vaddq_f32(_tmp12b, _tmp12a);
                float32x4_t _tmp3 = vaddq_f32(_tmp34b, _tmp34a);
                float32x4_t _tmp4 = vsubq_f32(_tmp34b, _tmp34a);
#if __aarch64__
                float32x4_t _tmp5 = vfmaq_f32(vaddq_f32(_r1, _r5), _r3, _vm2_5);
#else
                float32x4_t _tmp5 = vmlaq_f32(vaddq_f32(_r1, _r5), _r3, _vm2_5);
#endif

                vst1q_f32(tmp[0][m], _tmp0);
                vst1q_f32(tmp[1][m], _tmp1);
                vst1q_f32(tmp[2][m], _tmp2);
                vst1q_f32(tmp[3][m], _tmp3);
                vst1q_f32(tmp[4][m], _tmp4);
                vst1q_f32(tmp[5][m], _tmp5);

                r0 += w * elempack;
            }

            float* p0 = (float*)B + kk * max_jj * 36 + jj * 4;
            float* p1 = p0 + max_jj * 4;
            float* p2 = p0 + max_jj * 4 * 2;
            float* p3 = p0 + max_jj * 4 * 3;
            float* p4 = p0 + max_jj * 4 * 4;
            float* p5 = p0 + max_jj * 4 * 5;

            for (int m = 0; m < 6; m++)
            {
                float32x4_t _r0 = vld1q_f32(tmp[m][0]);
                float32x4_t _r1 = vld1q_f32(tmp[m][1]);
                float32x4_t _r2 = vld1q_f32(tmp[m][2]);
                float32x4_t _r3 = vld1q_f32(tmp[m][3]);
                float32x4_t _r4 = vld1q_f32(tmp[m][4]);
                float32x4_t _r5 = vld1q_f32(tmp[m][5]);

#if __aarch64__
                float32x4_t _tmp12a = vfmaq_laneq_f32(vmulq_laneq_f32(_r1, _coeffs, 0), _r3, _coeffs, 1);
                float32x4_t _tmp12b = vfmaq_laneq_f32(_r4, _r2, _coeffs, 2);
                float32x4_t _tmp34a = vfmaq_laneq_f32(vmulq_laneq_f32(_r3, _coeffs, 0), _r1, _coeffs, 1);
                float32x4_t _tmp34b = vfmaq_laneq_f32(_r4, _r2, _coeffs, 3);
#else
                float32x4_t _tmp12a = vmlaq_lane_f32(vmulq_lane_f32(_r1, vget_low_f32(_coeffs), 0), _r3, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp12b = vmlaq_lane_f32(_r4, _r2, vget_high_f32(_coeffs), 0);
                float32x4_t _tmp34a = vmlaq_lane_f32(vmulq_lane_f32(_r3, vget_low_f32(_coeffs), 0), _r1, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp34b = vmlaq_lane_f32(_r4, _r2, vget_high_f32(_coeffs), 1);
#endif

#if __aarch64__
                float32x4_t _tmp0 = vfmaq_f32(vaddq_f32(_r0, _r4), _r2, _vm2_5);
#else
                float32x4_t _tmp0 = vmlaq_f32(vaddq_f32(_r0, _r4), _r2, _vm2_5);
#endif
                float32x4_t _tmp1 = vsubq_f32(_tmp12b, _tmp12a);
                float32x4_t _tmp2 = vaddq_f32(_tmp12b, _tmp12a);
                float32x4_t _tmp3 = vaddq_f32(_tmp34b, _tmp34a);
                float32x4_t _tmp4 = vsubq_f32(_tmp34b, _tmp34a);
#if __aarch64__
                float32x4_t _tmp5 = vfmaq_f32(vaddq_f32(_r1, _r5), _r3, _vm2_5);
#else
                float32x4_t _tmp5 = vmlaq_f32(vaddq_f32(_r1, _r5), _r3, _vm2_5);
#endif

                vst1q_f32(p0, _tmp0);
                vst1q_f32(p1, _tmp1);
                vst1q_f32(p2, _tmp2);
                vst1q_f32(p3, _tmp3);
                vst1q_f32(p4, _tmp4);
                vst1q_f32(p5, _tmp5);

                p0 += max_jj * 6 * 4;
                p1 += max_jj * 6 * 4;
                p2 += max_jj * 6 * 4;
                p3 += max_jj * 6 * 4;
                p4 += max_jj * 6 * 4;
                p5 += max_jj * 6 * 4;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 4;
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
#else // __ARM_NEON
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
    #pragma omp parallel for num_threads(nT)
#endif // __ARM_NEON
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 2;

#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        float tmp[6][6][2];

#if __ARM_NEON
        const float coeffs[4] = {sq2, -sq2_d2, -2.f, -0.5f};
        float32x4_t _coeffs = vld1q_f32(coeffs);
        float32x2_t _vm2_5 = vdup_n_f32(-2.5f);
#endif

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = bottom_blob.channel(k + kk).row(ti * 4) + (tj * 4);

            for (int m = 0; m < 6; m++)
            {
#if __ARM_NEON
                float32x2_t _r0 = vdup_n_f32(0.f);
                float32x2_t _r1 = vdup_n_f32(0.f);
                float32x2_t _r2 = vdup_n_f32(0.f);
                float32x2_t _r3 = vdup_n_f32(0.f);
                float32x2_t _r4 = vdup_n_f32(0.f);
                float32x2_t _r5 = vdup_n_f32(0.f);
#else
                float r00 = 0.f;
                float r01 = 0.f;
                float r10 = 0.f;
                float r11 = 0.f;
                float r20 = 0.f;
                float r21 = 0.f;
                float r30 = 0.f;
                float r31 = 0.f;
                float r40 = 0.f;
                float r41 = 0.f;
                float r50 = 0.f;
                float r51 = 0.f;
#endif

                if (ti * 4 + m < h)
                {
                    // if (elempack == 1)
                    {
                        const float* r1 = r0 + N;

#if __ARM_NEON
                        float32x4_t _t0 = vld1q_f32(r0);
                        float32x4_t _t1 = vld1q_f32(r1);
                        float32x4x2_t _t01 = vzipq_f32(_t0, _t1);

                        _r0 = vget_low_f32(_t01.val[0]);
                        if (tj * 4 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
                        if (tj * 4 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
                        if (tj * 4 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
                        if (tj * 4 + 4 < w)
                        {
                            float tmp[2] = {r0[4], r1[4]};
                            _r4 = vld1_f32(tmp);
                        }
                        if (tj * 4 + 5 < w)
                        {
                            float tmp[2] = {r0[5], r1[5]};
                            _r5 = vld1_f32(tmp);
                        }
#else
                        r00 = r0[0];
                        r01 = r1[0];
                        if (tj * 4 + 1 < w)
                        {
                            r10 = r0[1];
                            r11 = r1[1];
                        }
                        if (tj * 4 + 2 < w)
                        {
                            r20 = r0[2];
                            r21 = r1[2];
                        }
                        if (tj * 4 + 3 < w)
                        {
                            r30 = r0[3];
                            r31 = r1[3];
                        }
                        if (tj * 4 + 4 < w)
                        {
                            r40 = r0[4];
                            r41 = r1[4];
                        }
                        if (tj * 4 + 5 < w)
                        {
                            r50 = r0[5];
                            r51 = r1[5];
                        }
#endif
                    }
                }

#if __ARM_NEON
#if __aarch64__
                float32x2_t _tmp12a = vfma_laneq_f32(vmul_laneq_f32(_r1, _coeffs, 0), _r3, _coeffs, 1);
                float32x2_t _tmp12b = vfma_laneq_f32(_r4, _r2, _coeffs, 2);
                float32x2_t _tmp34a = vfma_laneq_f32(vmul_laneq_f32(_r3, _coeffs, 0), _r1, _coeffs, 1);
                float32x2_t _tmp34b = vfma_laneq_f32(_r4, _r2, _coeffs, 3);
#else
                float32x2_t _tmp12a = vmla_lane_f32(vmul_lane_f32(_r1, vget_low_f32(_coeffs), 0), _r3, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp12b = vmla_lane_f32(_r4, _r2, vget_high_f32(_coeffs), 0);
                float32x2_t _tmp34a = vmla_lane_f32(vmul_lane_f32(_r3, vget_low_f32(_coeffs), 0), _r1, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp34b = vmla_lane_f32(_r4, _r2, vget_high_f32(_coeffs), 1);
#endif

#if __aarch64__
                float32x2_t _tmp0 = vfma_f32(vadd_f32(_r0, _r4), _r2, _vm2_5);
#else
                float32x2_t _tmp0 = vmla_f32(vadd_f32(_r0, _r4), _r2, _vm2_5);
#endif
                float32x2_t _tmp1 = vsub_f32(_tmp12b, _tmp12a);
                float32x2_t _tmp2 = vadd_f32(_tmp12b, _tmp12a);
                float32x2_t _tmp3 = vadd_f32(_tmp34b, _tmp34a);
                float32x2_t _tmp4 = vsub_f32(_tmp34b, _tmp34a);
#if __aarch64__
                float32x2_t _tmp5 = vfma_f32(vadd_f32(_r1, _r5), _r3, _vm2_5);
#else
                float32x2_t _tmp5 = vmla_f32(vadd_f32(_r1, _r5), _r3, _vm2_5);
#endif

                vst1_f32(tmp[0][m], _tmp0);
                vst1_f32(tmp[1][m], _tmp1);
                vst1_f32(tmp[2][m], _tmp2);
                vst1_f32(tmp[3][m], _tmp3);
                vst1_f32(tmp[4][m], _tmp4);
                vst1_f32(tmp[5][m], _tmp5);
#else
                float tmp12a0 = sq2 * r10 - sq2_d2 * r30;
                float tmp12a1 = sq2 * r11 - sq2_d2 * r31;
                float tmp12b0 = r40 - 2 * r20;
                float tmp12b1 = r41 - 2 * r21;
                float tmp34a0 = sq2 * r30 - sq2_d2 * r10;
                float tmp34a1 = sq2 * r31 - sq2_d2 * r11;
                float tmp34b0 = r40 - 0.5f * r20;
                float tmp34b1 = r41 - 0.5f * r21;

                tmp[0][m][0] = r00 + r40 - 2.5f * r20;
                tmp[0][m][1] = r01 + r41 - 2.5f * r21;
                tmp[1][m][0] = tmp12b0 - tmp12a0;
                tmp[1][m][1] = tmp12b1 - tmp12a1;
                tmp[2][m][0] = tmp12b0 + tmp12a0;
                tmp[2][m][1] = tmp12b1 + tmp12a1;
                tmp[3][m][0] = tmp34b0 + tmp34a0;
                tmp[3][m][1] = tmp34b1 + tmp34a1;
                tmp[4][m][0] = tmp34b0 - tmp34a0;
                tmp[4][m][1] = tmp34b1 - tmp34a1;
                tmp[5][m][0] = r10 + r50 - 2.5f * r30;
                tmp[5][m][1] = r11 + r51 - 2.5f * r31;
#endif

                r0 += w;
            }

            float* p0 = (float*)B + kk * max_jj * 36 + jj * 2;
            float* p1 = p0 + max_jj * 2;
            float* p2 = p0 + max_jj * 2 * 2;
            float* p3 = p0 + max_jj * 2 * 3;
            float* p4 = p0 + max_jj * 2 * 4;
            float* p5 = p0 + max_jj * 2 * 5;

            for (int m = 0; m < 6; m++)
            {
#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(tmp[m][0]);
                float32x2_t _r1 = vld1_f32(tmp[m][1]);
                float32x2_t _r2 = vld1_f32(tmp[m][2]);
                float32x2_t _r3 = vld1_f32(tmp[m][3]);
                float32x2_t _r4 = vld1_f32(tmp[m][4]);
                float32x2_t _r5 = vld1_f32(tmp[m][5]);

#if __aarch64__
                float32x2_t _tmp12a = vfma_laneq_f32(vmul_laneq_f32(_r1, _coeffs, 0), _r3, _coeffs, 1);
                float32x2_t _tmp12b = vfma_laneq_f32(_r4, _r2, _coeffs, 2);
                float32x2_t _tmp34a = vfma_laneq_f32(vmul_laneq_f32(_r3, _coeffs, 0), _r1, _coeffs, 1);
                float32x2_t _tmp34b = vfma_laneq_f32(_r4, _r2, _coeffs, 3);
#else
                float32x2_t _tmp12a = vmla_lane_f32(vmul_lane_f32(_r1, vget_low_f32(_coeffs), 0), _r3, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp12b = vmla_lane_f32(_r4, _r2, vget_high_f32(_coeffs), 0);
                float32x2_t _tmp34a = vmla_lane_f32(vmul_lane_f32(_r3, vget_low_f32(_coeffs), 0), _r1, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp34b = vmla_lane_f32(_r4, _r2, vget_high_f32(_coeffs), 1);
#endif

#if __aarch64__
                float32x2_t _tmp0 = vfma_f32(vadd_f32(_r0, _r4), _r2, _vm2_5);
#else
                float32x2_t _tmp0 = vmla_f32(vadd_f32(_r0, _r4), _r2, _vm2_5);
#endif
                float32x2_t _tmp1 = vsub_f32(_tmp12b, _tmp12a);
                float32x2_t _tmp2 = vadd_f32(_tmp12b, _tmp12a);
                float32x2_t _tmp3 = vadd_f32(_tmp34b, _tmp34a);
                float32x2_t _tmp4 = vsub_f32(_tmp34b, _tmp34a);
#if __aarch64__
                float32x2_t _tmp5 = vfma_f32(vadd_f32(_r1, _r5), _r3, _vm2_5);
#else
                float32x2_t _tmp5 = vmla_f32(vadd_f32(_r1, _r5), _r3, _vm2_5);
#endif

                vst1_f32(p0, _tmp0);
                vst1_f32(p1, _tmp1);
                vst1_f32(p2, _tmp2);
                vst1_f32(p3, _tmp3);
                vst1_f32(p4, _tmp4);
                vst1_f32(p5, _tmp5);
#else
                float r00 = tmp[m][0][0];
                float r01 = tmp[m][0][1];
                float r10 = tmp[m][1][0];
                float r11 = tmp[m][1][1];
                float r20 = tmp[m][2][0];
                float r21 = tmp[m][2][1];
                float r30 = tmp[m][3][0];
                float r31 = tmp[m][3][1];
                float r40 = tmp[m][4][0];
                float r41 = tmp[m][4][1];
                float r50 = tmp[m][5][0];
                float r51 = tmp[m][5][1];

                float tmp12a0 = sq2 * r10 - sq2_d2 * r30;
                float tmp12a1 = sq2 * r11 - sq2_d2 * r31;
                float tmp12b0 = r40 - 2 * r20;
                float tmp12b1 = r41 - 2 * r21;
                float tmp34a0 = sq2 * r30 - sq2_d2 * r10;
                float tmp34a1 = sq2 * r31 - sq2_d2 * r11;
                float tmp34b0 = r40 - 0.5f * r20;
                float tmp34b1 = r41 - 0.5f * r21;

                p0[0] = r00 + r40 - 2.5f * r20;
                p0[1] = r01 + r41 - 2.5f * r21;
                p1[0] = tmp12b0 - tmp12a0;
                p1[1] = tmp12b1 - tmp12a1;
                p2[0] = tmp12b0 + tmp12a0;
                p2[1] = tmp12b1 + tmp12a1;
                p3[0] = tmp34b0 + tmp34a0;
                p3[1] = tmp34b1 + tmp34a1;
                p4[0] = tmp34b0 - tmp34a0;
                p4[1] = tmp34b1 - tmp34a1;
                p5[0] = r10 + r50 - 2.5f * r30;
                p5[1] = r11 + r51 - 2.5f * r31;
#endif

                p0 += max_jj * 6 * 2;
                p1 += max_jj * 6 * 2;
                p2 += max_jj * 6 * 2;
                p3 += max_jj * 6 * 2;
                p4 += max_jj * 6 * 2;
                p5 += max_jj * 6 * 2;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 2;
    for (int kk = remain_max_kk_start; kk < max_kk; kk++)
    {
        float tmp[6][6];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0123 = bottom_blob.channel(k + kk).row(ti * 4) + (tj * 4);

            for (int m = 0; m < 6; m++)
            {
                float r0 = 0.f;
                float r1 = 0.f;
                float r2 = 0.f;
                float r3 = 0.f;
                float r4 = 0.f;
                float r5 = 0.f;

                if (ti * 4 + m < h)
                {
                    // if (elempack == 1)
                    {
                        r0 = r0123[0];
                        if (tj * 4 + 1 < w) r1 = r0123[1];
                        if (tj * 4 + 2 < w) r2 = r0123[2];
                        if (tj * 4 + 3 < w) r3 = r0123[3];
                        if (tj * 4 + 4 < w) r4 = r0123[4];
                        if (tj * 4 + 5 < w) r5 = r0123[5];
                    }
                }

                float tmp12a = sq2 * r1 - sq2_d2 * r3;
                float tmp12b = r4 - 2 * r2;
                float tmp34a = sq2 * r3 - sq2_d2 * r1;
                float tmp34b = r4 - 0.5f * r2;

                tmp[0][m] = r0 + r4 - 2.5f * r2;
                tmp[1][m] = tmp12b - tmp12a;
                tmp[2][m] = tmp12b + tmp12a;
                tmp[3][m] = tmp34b + tmp34a;
                tmp[4][m] = tmp34b - tmp34a;
                tmp[5][m] = r1 + r5 - 2.5f * r3;

                r0123 += w;
            }

            float* p0 = (float*)B + kk * max_jj * 36 + jj;
            float* p1 = p0 + max_jj;
            float* p2 = p0 + max_jj * 2;
            float* p3 = p0 + max_jj * 3;
            float* p4 = p0 + max_jj * 4;
            float* p5 = p0 + max_jj * 5;

            for (int m = 0; m < 6; m++)
            {
                float r0 = tmp[m][0];
                float r1 = tmp[m][1];
                float r2 = tmp[m][2];
                float r3 = tmp[m][3];
                float r4 = tmp[m][4];
                float r5 = tmp[m][5];

                float tmp12a = sq2 * r1 - sq2_d2 * r3;
                float tmp12b = r4 - 2 * r2;
                float tmp34a = sq2 * r3 - sq2_d2 * r1;
                float tmp34b = r4 - 0.5f * r2;

                p0[0] = r0 + r4 - 2.5f * r2;
                p1[0] = tmp12b - tmp12a;
                p2[0] = tmp12b + tmp12a;
                p3[0] = tmp34b + tmp34a;
                p4[0] = tmp34b - tmp34a;
                p5[0] = r1 + r5 - 2.5f * r3;

                p0 += max_jj * 6;
                p1 += max_jj * 6;
                p2 += max_jj * 6;
                p3 += max_jj * 6;
                p4 += max_jj * 6;
                p5 += max_jj * 6;
            }
        }
    }
}

static inline void conv3x3s1_winograd43_transform_output_tile(const Mat& top_tile, Mat& top_blob, const Mat& bias, int i, int max_ii, int j, int max_jj)
{
    const float sq2 = 1.41421356237;
    const float sq2_m2 = 1.41421356237 * 2;
    const float sq2_d2 = 1.41421356237 / 2;
    const float sq2_d4 = 1.41421356237 / 4;

    // const float otm[4][6] = {
    //     {1.0f, 1.0f,   1.0f,  1.0f,  1.0f,   0.0f},
    //     {0.0f, sq2/2, -sq2/2, sq2,   -sq2,   0.0f},
    //     {0.0f, 0.5f,   0.5f,  2.0f,  2.0f,   0.0f},
    //     {0.0f, sq2/4, -sq2/4, sq2*2, -sq2*2, 1.0f}
    // };

    // 0 = r00 + (r01 + r02) + (r03 + r04)
    // 1 =       (r01 - r02) * sq2_d2 + (r03 - r04) * sq2
    // 2 =       (r01 + r02) * 0.5f + (r03 + r04) * 2
    // 3 = r05 + (r01 - r02) * sq2_d4 + (r03 - r04) * sq2_m2

#if __ARM_NEON
    const float coeffs[6] = {sq2, sq2_d2, sq2_d4, sq2_m2, 0.5f, 2.f};
    float32x4_t _coeffs = vld1q_f32(coeffs);
    float32x2_t _coeffs2 = vld1_f32(coeffs + 4);
#endif // __ARM_NEON

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int out_elempack = top_blob.elempack;
    const size_t N = top_blob.cstep * out_elempack;

    const int w_tiles = (outw + 3) / 4;

    const float* biasptr = bias;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    for (; ii + 7 < max_ii; ii += 8)
    {
        float32x4_t _bias0 = biasptr ? vld1q_f32(biasptr + i + ii) : vdupq_n_f32(0.f);
        float32x4_t _bias1 = biasptr ? vld1q_f32(biasptr + i + ii + 4) : vdupq_n_f32(0.f);

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[4][6][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 36 + jj * 8;
            const float* r1 = r0 + max_jj * 8;
            const float* r2 = r0 + max_jj * 8 * 2;
            const float* r3 = r0 + max_jj * 8 * 3;
            const float* r4 = r0 + max_jj * 8 * 4;
            const float* r5 = r0 + max_jj * 8 * 5;

            for (int m = 0; m < 6; m++)
            {
                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r30 = vld1q_f32(r3);
                float32x4_t _r31 = vld1q_f32(r3 + 4);
                float32x4_t _r40 = vld1q_f32(r4);
                float32x4_t _r41 = vld1q_f32(r4 + 4);
                float32x4_t _r50 = vld1q_f32(r5);
                float32x4_t _r51 = vld1q_f32(r5 + 4);

                float32x4_t _tmp02a0 = vaddq_f32(_r10, _r20);
                float32x4_t _tmp02a1 = vaddq_f32(_r11, _r21);
                float32x4_t _tmp02b0 = vaddq_f32(_r30, _r40);
                float32x4_t _tmp02b1 = vaddq_f32(_r31, _r41);
                float32x4_t _tmp13a0 = vsubq_f32(_r10, _r20);
                float32x4_t _tmp13a1 = vsubq_f32(_r11, _r21);
                float32x4_t _tmp13b0 = vsubq_f32(_r30, _r40);
                float32x4_t _tmp13b1 = vsubq_f32(_r31, _r41);

                float32x4_t _tmp00 = vaddq_f32(vaddq_f32(_r00, _tmp02a0), _tmp02b0);
                float32x4_t _tmp01 = vaddq_f32(vaddq_f32(_r01, _tmp02a1), _tmp02b1);
                float32x4_t _tmp10 = vfmaq_laneq_f32(vmulq_laneq_f32(_tmp13a0, _coeffs, 1), _tmp13b0, _coeffs, 0);
                float32x4_t _tmp11 = vfmaq_laneq_f32(vmulq_laneq_f32(_tmp13a1, _coeffs, 1), _tmp13b1, _coeffs, 0);
                float32x4_t _tmp20 = vfmaq_lane_f32(vmulq_lane_f32(_tmp02a0, _coeffs2, 0), _tmp02b0, _coeffs2, 1);
                float32x4_t _tmp21 = vfmaq_lane_f32(vmulq_lane_f32(_tmp02a1, _coeffs2, 0), _tmp02b1, _coeffs2, 1);
                float32x4_t _tmp30 = vfmaq_laneq_f32(vfmaq_laneq_f32(_r50, _tmp13a0, _coeffs, 2), _tmp13b0, _coeffs, 3);
                float32x4_t _tmp31 = vfmaq_laneq_f32(vfmaq_laneq_f32(_r51, _tmp13a1, _coeffs, 2), _tmp13b1, _coeffs, 3);

                vst1q_f32(tmp[0][m], _tmp00);
                vst1q_f32(tmp[0][m] + 4, _tmp01);
                vst1q_f32(tmp[1][m], _tmp10);
                vst1q_f32(tmp[1][m] + 4, _tmp11);
                vst1q_f32(tmp[2][m], _tmp20);
                vst1q_f32(tmp[2][m] + 4, _tmp21);
                vst1q_f32(tmp[3][m], _tmp30);
                vst1q_f32(tmp[3][m] + 4, _tmp31);

                r0 += max_jj * 6 * 8;
                r1 += max_jj * 6 * 8;
                r2 += max_jj * 6 * 8;
                r3 += max_jj * 6 * 8;
                r4 += max_jj * 6 * 8;
                r5 += max_jj * 6 * 8;
            }

            float* outptr0 = top_blob.channel((i + ii) / out_elempack).row(ti * 4) + (tj * 4) * out_elempack;

            for (int m = 0; m < 4; m++)
            {
                if (ti * 4 + m >= outh)
                    continue;

                float32x4_t _r00 = vld1q_f32(tmp[m][0]);
                float32x4_t _r01 = vld1q_f32(tmp[m][0] + 4);
                float32x4_t _r10 = vld1q_f32(tmp[m][1]);
                float32x4_t _r11 = vld1q_f32(tmp[m][1] + 4);
                float32x4_t _r20 = vld1q_f32(tmp[m][2]);
                float32x4_t _r21 = vld1q_f32(tmp[m][2] + 4);
                float32x4_t _r30 = vld1q_f32(tmp[m][3]);
                float32x4_t _r31 = vld1q_f32(tmp[m][3] + 4);
                float32x4_t _r40 = vld1q_f32(tmp[m][4]);
                float32x4_t _r41 = vld1q_f32(tmp[m][4] + 4);
                float32x4_t _r50 = vld1q_f32(tmp[m][5]);
                float32x4_t _r51 = vld1q_f32(tmp[m][5] + 4);

                float32x4_t _tmp02a0 = vaddq_f32(_r10, _r20);
                float32x4_t _tmp02a1 = vaddq_f32(_r11, _r21);
                float32x4_t _tmp02b0 = vaddq_f32(_r30, _r40);
                float32x4_t _tmp02b1 = vaddq_f32(_r31, _r41);
                float32x4_t _tmp13a0 = vsubq_f32(_r10, _r20);
                float32x4_t _tmp13a1 = vsubq_f32(_r11, _r21);
                float32x4_t _tmp13b0 = vsubq_f32(_r30, _r40);
                float32x4_t _tmp13b1 = vsubq_f32(_r31, _r41);

                float32x4_t _tmp00 = vaddq_f32(vaddq_f32(_r00, _tmp02a0), vaddq_f32(_tmp02b0, _bias0));
                float32x4_t _tmp01 = vaddq_f32(vaddq_f32(_r01, _tmp02a1), vaddq_f32(_tmp02b1, _bias1));
                float32x4_t _tmp10 = vfmaq_laneq_f32(vfmaq_laneq_f32(_bias0, _tmp13a0, _coeffs, 1), _tmp13b0, _coeffs, 0);
                float32x4_t _tmp11 = vfmaq_laneq_f32(vfmaq_laneq_f32(_bias1, _tmp13a1, _coeffs, 1), _tmp13b1, _coeffs, 0);
                float32x4_t _tmp20 = vfmaq_lane_f32(vfmaq_lane_f32(_bias0, _tmp02a0, _coeffs2, 0), _tmp02b0, _coeffs2, 1);
                float32x4_t _tmp21 = vfmaq_lane_f32(vfmaq_lane_f32(_bias1, _tmp02a1, _coeffs2, 0), _tmp02b1, _coeffs2, 1);
                float32x4_t _tmp30 = vfmaq_laneq_f32(vfmaq_laneq_f32(vaddq_f32(_r50, _bias0), _tmp13a0, _coeffs, 2), _tmp13b0, _coeffs, 3);
                float32x4_t _tmp31 = vfmaq_laneq_f32(vfmaq_laneq_f32(vaddq_f32(_r51, _bias1), _tmp13a1, _coeffs, 2), _tmp13b1, _coeffs, 3);

                if (out_elempack == 4)
                {
                    float* outptr1 = outptr0 + N;

                    vst1q_f32(outptr0, _tmp00);
                    vst1q_f32(outptr1, _tmp01);
                    if (tj * 4 + 1 < outw)
                    {
                        vst1q_f32(outptr0 + 4, _tmp10);
                        vst1q_f32(outptr1 + 4, _tmp11);
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        vst1q_f32(outptr0 + 8, _tmp20);
                        vst1q_f32(outptr1 + 8, _tmp21);
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        vst1q_f32(outptr0 + 12, _tmp30);
                        vst1q_f32(outptr1 + 12, _tmp31);
                    }
                }
                if (out_elempack == 1)
                {
                    float tmp0[8];
                    float tmp1[8];
                    float tmp2[8];
                    float tmp3[8];
                    vst1q_f32(tmp0, _tmp00);
                    vst1q_f32(tmp0 + 4, _tmp01);
                    vst1q_f32(tmp1, _tmp10);
                    vst1q_f32(tmp1 + 4, _tmp11);
                    vst1q_f32(tmp2, _tmp20);
                    vst1q_f32(tmp2 + 4, _tmp21);
                    vst1q_f32(tmp3, _tmp30);
                    vst1q_f32(tmp3 + 4, _tmp31);

                    float* outptr1 = outptr0 + N;
                    float* outptr2 = outptr0 + N * 2;
                    float* outptr3 = outptr0 + N * 3;
                    float* outptr4 = outptr0 + N * 4;
                    float* outptr5 = outptr0 + N * 5;
                    float* outptr6 = outptr0 + N * 6;
                    float* outptr7 = outptr0 + N * 7;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];
                    outptr4[0] = tmp0[4];
                    outptr5[0] = tmp0[5];
                    outptr6[0] = tmp0[6];
                    outptr7[0] = tmp0[7];
                    if (tj * 4 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                        outptr4[1] = tmp1[4];
                        outptr5[1] = tmp1[5];
                        outptr6[1] = tmp1[6];
                        outptr7[1] = tmp1[7];
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        outptr0[2] = tmp2[0];
                        outptr1[2] = tmp2[1];
                        outptr2[2] = tmp2[2];
                        outptr3[2] = tmp2[3];
                        outptr4[2] = tmp2[4];
                        outptr5[2] = tmp2[5];
                        outptr6[2] = tmp2[6];
                        outptr7[2] = tmp2[7];
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        outptr0[3] = tmp3[0];
                        outptr1[3] = tmp3[1];
                        outptr2[3] = tmp3[2];
                        outptr3[3] = tmp3[3];
                        outptr4[3] = tmp3[4];
                        outptr5[3] = tmp3[5];
                        outptr6[3] = tmp3[6];
                        outptr7[3] = tmp3[7];
                    }
                }

                outptr0 += outw * out_elempack;
            }
        }
    }
#endif // __aarch64__
    for (; ii + 3 < max_ii; ii += 4)
    {
        float32x4_t _bias0 = biasptr ? vld1q_f32(biasptr + i + ii) : vdupq_n_f32(0.f);

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[4][6][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 36 + jj * 4;
            const float* r1 = r0 + max_jj * 4;
            const float* r2 = r0 + max_jj * 4 * 2;
            const float* r3 = r0 + max_jj * 4 * 3;
            const float* r4 = r0 + max_jj * 4 * 4;
            const float* r5 = r0 + max_jj * 4 * 5;

            for (int m = 0; m < 6; m++)
            {
                float32x4_t _r0 = vld1q_f32(r0);
                float32x4_t _r1 = vld1q_f32(r1);
                float32x4_t _r2 = vld1q_f32(r2);
                float32x4_t _r3 = vld1q_f32(r3);
                float32x4_t _r4 = vld1q_f32(r4);
                float32x4_t _r5 = vld1q_f32(r5);

                float32x4_t _tmp02a = vaddq_f32(_r1, _r2);
                float32x4_t _tmp02b = vaddq_f32(_r3, _r4);
                float32x4_t _tmp13a = vsubq_f32(_r1, _r2);
                float32x4_t _tmp13b = vsubq_f32(_r3, _r4);

                float32x4_t _tmp0 = vaddq_f32(vaddq_f32(_r0, _tmp02a), _tmp02b);
#if __aarch64__
                float32x4_t _tmp1 = vfmaq_laneq_f32(vmulq_laneq_f32(_tmp13a, _coeffs, 1), _tmp13b, _coeffs, 0);
                float32x4_t _tmp2 = vfmaq_lane_f32(vmulq_lane_f32(_tmp02a, _coeffs2, 0), _tmp02b, _coeffs2, 1);
                float32x4_t _tmp3 = vfmaq_laneq_f32(vfmaq_laneq_f32(_r5, _tmp13a, _coeffs, 2), _tmp13b, _coeffs, 3);
#else
                float32x4_t _tmp1 = vmlaq_lane_f32(vmulq_lane_f32(_tmp13a, vget_low_f32(_coeffs), 1), _tmp13b, vget_low_f32(_coeffs), 0);
                float32x4_t _tmp2 = vmlaq_lane_f32(vmulq_lane_f32(_tmp02a, _coeffs2, 0), _tmp02b, _coeffs2, 1);
                float32x4_t _tmp3 = vmlaq_lane_f32(vmlaq_lane_f32(_r5, _tmp13a, vget_high_f32(_coeffs), 0), _tmp13b, vget_high_f32(_coeffs), 1);
#endif

                vst1q_f32(tmp[0][m], _tmp0);
                vst1q_f32(tmp[1][m], _tmp1);
                vst1q_f32(tmp[2][m], _tmp2);
                vst1q_f32(tmp[3][m], _tmp3);

                r0 += max_jj * 6 * 4;
                r1 += max_jj * 6 * 4;
                r2 += max_jj * 6 * 4;
                r3 += max_jj * 6 * 4;
                r4 += max_jj * 6 * 4;
                r5 += max_jj * 6 * 4;
            }

            float* outptr0 = top_blob.channel((i + ii) / out_elempack).row(ti * 4) + (tj * 4) * out_elempack;

            for (int m = 0; m < 4; m++)
            {
                if (ti * 4 + m >= outh)
                    continue;

                float32x4_t _r0 = vld1q_f32(tmp[m][0]);
                float32x4_t _r1 = vld1q_f32(tmp[m][1]);
                float32x4_t _r2 = vld1q_f32(tmp[m][2]);
                float32x4_t _r3 = vld1q_f32(tmp[m][3]);
                float32x4_t _r4 = vld1q_f32(tmp[m][4]);
                float32x4_t _r5 = vld1q_f32(tmp[m][5]);

                float32x4_t _tmp02a = vaddq_f32(_r1, _r2);
                float32x4_t _tmp02b = vaddq_f32(_r3, _r4);
                float32x4_t _tmp13a = vsubq_f32(_r1, _r2);
                float32x4_t _tmp13b = vsubq_f32(_r3, _r4);

                float32x4_t _tmp0 = vaddq_f32(vaddq_f32(_r0, _tmp02a), vaddq_f32(_tmp02b, _bias0));
#if __aarch64__
                float32x4_t _tmp1 = vfmaq_laneq_f32(vfmaq_laneq_f32(_bias0, _tmp13a, _coeffs, 1), _tmp13b, _coeffs, 0);
                float32x4_t _tmp2 = vfmaq_lane_f32(vfmaq_lane_f32(_bias0, _tmp02a, _coeffs2, 0), _tmp02b, _coeffs2, 1);
                float32x4_t _tmp3 = vfmaq_laneq_f32(vfmaq_laneq_f32(vaddq_f32(_r5, _bias0), _tmp13a, _coeffs, 2), _tmp13b, _coeffs, 3);
#else
                float32x4_t _tmp1 = vmlaq_lane_f32(vmlaq_lane_f32(_bias0, _tmp13a, vget_low_f32(_coeffs), 1), _tmp13b, vget_low_f32(_coeffs), 0);
                float32x4_t _tmp2 = vmlaq_lane_f32(vmlaq_lane_f32(_bias0, _tmp02a, _coeffs2, 0), _tmp02b, _coeffs2, 1);
                float32x4_t _tmp3 = vmlaq_lane_f32(vmlaq_lane_f32(vaddq_f32(_r5, _bias0), _tmp13a, vget_high_f32(_coeffs), 0), _tmp13b, vget_high_f32(_coeffs), 1);
#endif

                if (out_elempack == 4)
                {
                    vst1q_f32(outptr0, _tmp0);
                    if (tj * 4 + 1 < outw) vst1q_f32(outptr0 + 4, _tmp1);
                    if (tj * 4 + 2 < outw) vst1q_f32(outptr0 + 8, _tmp2);
                    if (tj * 4 + 3 < outw) vst1q_f32(outptr0 + 12, _tmp3);
                }
                if (out_elempack == 1)
                {
                    float tmp0[4];
                    float tmp1[4];
                    float tmp2[4];
                    float tmp3[4];
                    vst1q_f32(tmp0, _tmp0);
                    vst1q_f32(tmp1, _tmp1);
                    vst1q_f32(tmp2, _tmp2);
                    vst1q_f32(tmp3, _tmp3);

                    float* outptr1 = outptr0 + N;
                    float* outptr2 = outptr0 + N * 2;
                    float* outptr3 = outptr0 + N * 3;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];
                    if (tj * 4 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        outptr0[2] = tmp2[0];
                        outptr1[2] = tmp2[1];
                        outptr2[2] = tmp2[2];
                        outptr3[2] = tmp2[3];
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        outptr0[3] = tmp3[0];
                        outptr1[3] = tmp3[1];
                        outptr2[3] = tmp3[2];
                        outptr3[3] = tmp3[3];
                    }
                }

                outptr0 += outw * out_elempack;
            }
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
#if __ARM_NEON
        float32x2_t _bias0 = biasptr ? vld1_f32(biasptr + i + ii) : vdup_n_f32(0.f);
#else
        float bias0 = biasptr ? biasptr[i + ii] : 0.f;
        float bias1 = biasptr ? biasptr[i + ii + 1] : 0.f;
#endif

#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        float tmp[4][6][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 36 + jj * 2;
            const float* r1 = r0 + max_jj * 2;
            const float* r2 = r0 + max_jj * 2 * 2;
            const float* r3 = r0 + max_jj * 2 * 3;
            const float* r4 = r0 + max_jj * 2 * 4;
            const float* r5 = r0 + max_jj * 2 * 5;

            for (int m = 0; m < 6; m++)
            {
#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(r0);
                float32x2_t _r1 = vld1_f32(r1);
                float32x2_t _r2 = vld1_f32(r2);
                float32x2_t _r3 = vld1_f32(r3);
                float32x2_t _r4 = vld1_f32(r4);
                float32x2_t _r5 = vld1_f32(r5);

                float32x2_t _tmp02a = vadd_f32(_r1, _r2);
                float32x2_t _tmp02b = vadd_f32(_r3, _r4);
                float32x2_t _tmp13a = vsub_f32(_r1, _r2);
                float32x2_t _tmp13b = vsub_f32(_r3, _r4);

                float32x2_t _tmp0 = vadd_f32(vadd_f32(_r0, _tmp02a), _tmp02b);
#if __aarch64__
                float32x2_t _tmp1 = vfma_laneq_f32(vmul_laneq_f32(_tmp13a, _coeffs, 1), _tmp13b, _coeffs, 0);
                float32x2_t _tmp2 = vfma_lane_f32(vmul_lane_f32(_tmp02a, _coeffs2, 0), _tmp02b, _coeffs2, 1);
                float32x2_t _tmp3 = vfma_laneq_f32(vfma_laneq_f32(_r5, _tmp13a, _coeffs, 2), _tmp13b, _coeffs, 3);
#else
                float32x2_t _tmp1 = vmla_lane_f32(vmul_lane_f32(_tmp13a, vget_low_f32(_coeffs), 1), _tmp13b, vget_low_f32(_coeffs), 0);
                float32x2_t _tmp2 = vmla_lane_f32(vmul_lane_f32(_tmp02a, _coeffs2, 0), _tmp02b, _coeffs2, 1);
                float32x2_t _tmp3 = vmla_lane_f32(vmla_lane_f32(_r5, _tmp13a, vget_high_f32(_coeffs), 0), _tmp13b, vget_high_f32(_coeffs), 1);
#endif

                vst1_f32(tmp[0][m], _tmp0);
                vst1_f32(tmp[1][m], _tmp1);
                vst1_f32(tmp[2][m], _tmp2);
                vst1_f32(tmp[3][m], _tmp3);
#else
                float tmp02a0 = r1[0] + r2[0];
                float tmp02a1 = r1[1] + r2[1];
                float tmp02b0 = r3[0] + r4[0];
                float tmp02b1 = r3[1] + r4[1];
                float tmp13a0 = r1[0] - r2[0];
                float tmp13a1 = r1[1] - r2[1];
                float tmp13b0 = r3[0] - r4[0];
                float tmp13b1 = r3[1] - r4[1];

                tmp[0][m][0] = r0[0] + tmp02a0 + tmp02b0;
                tmp[0][m][1] = r0[1] + tmp02a1 + tmp02b1;
                tmp[1][m][0] = tmp13a0 * sq2_d2 + tmp13b0 * sq2;
                tmp[1][m][1] = tmp13a1 * sq2_d2 + tmp13b1 * sq2;
                tmp[2][m][0] = tmp02a0 * 0.5f + tmp02b0 * 2;
                tmp[2][m][1] = tmp02a1 * 0.5f + tmp02b1 * 2;
                tmp[3][m][0] = r5[0] + tmp13a0 * sq2_d4 + tmp13b0 * sq2_m2;
                tmp[3][m][1] = r5[1] + tmp13a1 * sq2_d4 + tmp13b1 * sq2_m2;
#endif

                r0 += max_jj * 6 * 2;
                r1 += max_jj * 6 * 2;
                r2 += max_jj * 6 * 2;
                r3 += max_jj * 6 * 2;
                r4 += max_jj * 6 * 2;
                r5 += max_jj * 6 * 2;
            }

            float* outptr0 = top_blob.channel(i + ii).row(ti * 4) + (tj * 4);

            for (int m = 0; m < 4; m++)
            {
                if (ti * 4 + m >= outh)
                    continue;

#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(tmp[m][0]);
                float32x2_t _r1 = vld1_f32(tmp[m][1]);
                float32x2_t _r2 = vld1_f32(tmp[m][2]);
                float32x2_t _r3 = vld1_f32(tmp[m][3]);
                float32x2_t _r4 = vld1_f32(tmp[m][4]);
                float32x2_t _r5 = vld1_f32(tmp[m][5]);

                float32x2_t _tmp02a = vadd_f32(_r1, _r2);
                float32x2_t _tmp02b = vadd_f32(_r3, _r4);
                float32x2_t _tmp13a = vsub_f32(_r1, _r2);
                float32x2_t _tmp13b = vsub_f32(_r3, _r4);

                float32x2_t _tmp0 = vadd_f32(vadd_f32(_r0, _tmp02a), vadd_f32(_tmp02b, _bias0));
#if __aarch64__
                float32x2_t _tmp1 = vfma_laneq_f32(vfma_laneq_f32(_bias0, _tmp13a, _coeffs, 1), _tmp13b, _coeffs, 0);
                float32x2_t _tmp2 = vfma_lane_f32(vfma_lane_f32(_bias0, _tmp02a, _coeffs2, 0), _tmp02b, _coeffs2, 1);
                float32x2_t _tmp3 = vfma_laneq_f32(vfma_laneq_f32(vadd_f32(_r5, _bias0), _tmp13a, _coeffs, 2), _tmp13b, _coeffs, 3);
#else
                float32x2_t _tmp1 = vmla_lane_f32(vmla_lane_f32(_bias0, _tmp13a, vget_low_f32(_coeffs), 1), _tmp13b, vget_low_f32(_coeffs), 0);
                float32x2_t _tmp2 = vmla_lane_f32(vmla_lane_f32(_bias0, _tmp02a, _coeffs2, 0), _tmp02b, _coeffs2, 1);
                float32x2_t _tmp3 = vmla_lane_f32(vmla_lane_f32(vadd_f32(_r5, _bias0), _tmp13a, vget_high_f32(_coeffs), 0), _tmp13b, vget_high_f32(_coeffs), 1);
#endif
#else
                float r00 = tmp[m][0][0];
                float r01 = tmp[m][0][1];
                float r10 = tmp[m][1][0];
                float r11 = tmp[m][1][1];
                float r20 = tmp[m][2][0];
                float r21 = tmp[m][2][1];
                float r30 = tmp[m][3][0];
                float r31 = tmp[m][3][1];
                float r40 = tmp[m][4][0];
                float r41 = tmp[m][4][1];
                float r50 = tmp[m][5][0];
                float r51 = tmp[m][5][1];

                float tmp02a0 = r10 + r20;
                float tmp02a1 = r11 + r21;
                float tmp02b0 = r30 + r40;
                float tmp02b1 = r31 + r41;
                float tmp13a0 = r10 - r20;
                float tmp13a1 = r11 - r21;
                float tmp13b0 = r30 - r40;
                float tmp13b1 = r31 - r41;

                float tmp00 = bias0 + r00 + tmp02a0 + tmp02b0;
                float tmp01 = bias1 + r01 + tmp02a1 + tmp02b1;
                float tmp10 = bias0 + tmp13a0 * sq2_d2 + tmp13b0 * sq2;
                float tmp11 = bias1 + tmp13a1 * sq2_d2 + tmp13b1 * sq2;
                float tmp20 = bias0 + tmp02a0 * 0.5f + tmp02b0 * 2;
                float tmp21 = bias1 + tmp02a1 * 0.5f + tmp02b1 * 2;
                float tmp30 = bias0 + r50 + tmp13a0 * sq2_d4 + tmp13b0 * sq2_m2;
                float tmp31 = bias1 + r51 + tmp13a1 * sq2_d4 + tmp13b1 * sq2_m2;
#endif

                // if (out_elempack == 1)
                {
                    float* outptr1 = outptr0 + N;

#if __ARM_NEON
                    outptr0[0] = vget_lane_f32(_tmp0, 0);
                    outptr1[0] = vget_lane_f32(_tmp0, 1);
                    if (tj * 4 + 1 < outw)
                    {
                        outptr0[1] = vget_lane_f32(_tmp1, 0);
                        outptr1[1] = vget_lane_f32(_tmp1, 1);
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        outptr0[2] = vget_lane_f32(_tmp2, 0);
                        outptr1[2] = vget_lane_f32(_tmp2, 1);
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        outptr0[3] = vget_lane_f32(_tmp3, 0);
                        outptr1[3] = vget_lane_f32(_tmp3, 1);
                    }
#else
                    outptr0[0] = tmp00;
                    outptr1[0] = tmp01;
                    if (tj * 4 + 1 < outw)
                    {
                        outptr0[1] = tmp10;
                        outptr1[1] = tmp11;
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        outptr0[2] = tmp20;
                        outptr1[2] = tmp21;
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        outptr0[3] = tmp30;
                        outptr1[3] = tmp31;
                    }
#endif
                }

                outptr0 += outw;
            }
        }
    }
    for (; ii < max_ii; ii++)
    {
        float bias0 = biasptr ? biasptr[i + ii] : 0.f;

        float tmp[4][6];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 36 + jj;
            const float* r1 = r0 + max_jj;
            const float* r2 = r0 + max_jj * 2;
            const float* r3 = r0 + max_jj * 3;
            const float* r4 = r0 + max_jj * 4;
            const float* r5 = r0 + max_jj * 5;

            for (int m = 0; m < 6; m++)
            {
                float tmp02a = r1[0] + r2[0];
                float tmp02b = r3[0] + r4[0];
                float tmp13a = r1[0] - r2[0];
                float tmp13b = r3[0] - r4[0];

                tmp[0][m] = r0[0] + tmp02a + tmp02b;
                tmp[1][m] = tmp13a * sq2_d2 + tmp13b * sq2;
                tmp[2][m] = tmp02a * 0.5f + tmp02b * 2;
                tmp[3][m] = r5[0] + tmp13a * sq2_d4 + tmp13b * sq2_m2;

                r0 += max_jj * 6;
                r1 += max_jj * 6;
                r2 += max_jj * 6;
                r3 += max_jj * 6;
                r4 += max_jj * 6;
                r5 += max_jj * 6;
            }

            float* outptr0 = top_blob.channel(i + ii).row(ti * 4) + (tj * 4);

            for (int m = 0; m < 4; m++)
            {
                if (ti * 4 + m >= outh)
                    continue;

                float r0 = tmp[m][0];
                float r1 = tmp[m][1];
                float r2 = tmp[m][2];
                float r3 = tmp[m][3];
                float r4 = tmp[m][4];
                float r5 = tmp[m][5];

                float tmp02a = r1 + r2;
                float tmp02b = r3 + r4;
                float tmp13a = r1 - r2;
                float tmp13b = r3 - r4;

                float tmp0 = bias0 + r0 + tmp02a + tmp02b;
                float tmp1 = bias0 + tmp13a * sq2_d2 + tmp13b * sq2;
                float tmp2 = bias0 + tmp02a * 0.5f + tmp02b * 2;
                float tmp3 = bias0 + r5 + tmp13a * sq2_d4 + tmp13b * sq2_m2;

                // if (out_elempack == 1)
                {
                    outptr0[0] = tmp0;
                    if (tj * 4 + 1 < outw) outptr0[1] = tmp1;
                    if (tj * 4 + 2 < outw) outptr0[2] = tmp2;
                    if (tj * 4 + 3 < outw) outptr0[3] = tmp3;
                }

                outptr0 += outw;
            }
        }
    }
}

static int conv3x3s1_winograd43(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
    int outw = top_blob.w;
    int outh = top_blob.h;

    // pad to 4n+2, winograd F(4,3)
    int w_tiles = (outw + 3) / 4;
    int h_tiles = (outh + 3) / 4;
    int tiles = w_tiles * h_tiles;

    const int M = top_blob.c * top_blob.elempack;
    const int N = tiles;
    const int K = bottom_blob.c * bottom_blob.elempack;
    const int B = 36;

    // NCNN_LOGE("conv3x3s1_winograd43 %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    conv3x3s1_winograd_get_optimal_tile_mnk(M, N, K, B, TILE_M, TILE_N, TILE_K, nT);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

    Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    if (nT > 1 && nn_NK < nT)
    {
        Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
        if (B_tile.empty())
            return -100;

        for (int ppjk = 0; ppjk < nn_NK; ppjk++)
        {
            const int ppj = ppjk / nn_K;
            const int ppk = ppjk % nn_K;

            const int j = ppj * TILE_N;
            const int k = ppk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            // transform input
            conv3x3s1_winograd43_transform_input_tile(bottom_blob, B_tile, j, max_jj, k, max_kk, nT);

            Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

            conv3x3s1_winograd_transpose_pack_B_tile(B_tile, BT_tile, B, max_jj, max_kk, nT);
        }
    }
    else
    {
        Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
        if (B_tileX.empty())
            return -100;

        #pragma omp parallel for num_threads(nT)
        for (int ppjk = 0; ppjk < nn_NK; ppjk++)
        {
            const int ppj = ppjk / nn_K;
            const int ppk = ppjk % nn_K;

            const int j = ppj * TILE_N;
            const int k = ppk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            Mat B_tile = B_tileX.channel(get_omp_thread_num());

            // transform input
            conv3x3s1_winograd43_transform_input_tile(bottom_blob, B_tile, j, max_jj, k, max_kk, 1);

            Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

            conv3x3s1_winograd_transpose_pack_B_tile(B_tile, BT_tile, B, max_jj, max_kk, 1);
        }
    }

    Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
    if (top_tileX.empty())
        return -100;

    #pragma omp parallel for num_threads(nT)
    for (int ppj = 0; ppj < nn_M; ppj++)
    {
        const int i = ppj * TILE_M;

        Mat top_tile = top_tileX.channel(get_omp_thread_num());

        const int max_ii = std::min((M - i), TILE_M);

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                const Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K);

                const Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

                conv3x3s1_winograd_gemm_transB_packed_tile(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk, opt.use_a53_a55_optimized_kernel);
            }

            // transform output
            conv3x3s1_winograd43_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj);
        }
    }

    return 0;
}

// Transforms a tile of 3x3 kernels for the winograd63 path.
//
// For every (ii, kk) pair in [0, max_ii) x [0, max_kk) the nine floats at
// kernel[(i + ii) * inch * 9 + (k + kk) * 9] are expanded to 8x8 = 64 floats
// by applying the 8x3 matrix below twice, and the result is appended to A.
//
// const float ktm[8][3] = {
//     {1.0f, 0.0f, 0.0f},
//     {-2.0f / 9, -2.0f / 9, -2.0f / 9},
//     {-2.0f / 9, 2.0f / 9, -2.0f / 9},
//     {1.0f / 90, 1.0f / 45, 2.0f / 45},
//     {1.0f / 90, -1.0f / 45, 2.0f / 45},
//     {1.0f / 45, 1.0f / 90, 1.0f / 180},
//     {1.0f / 45, -1.0f / 90, 1.0f / 180},
//     {0.0f, 0.0f, 1.0f}
// };
static inline void conv3x3s1_winograd63_transform_kernel_tile(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
{
    // the distinct magnitudes that occur in ktm
    const float c2_9 = 2.0f / 9;
    const float c1_45 = 1.0f / 45;
    const float c2_45 = 2.0f / 45;
    const float c1_90 = 1.0f / 90;
    const float c1_180 = 1.0f / 180;

    float* dst = A;

    for (int ii = 0; ii < max_ii; ii++)
    {
        for (int kk = 0; kk < max_kk; kk++)
        {
            const float* src = (const float*)kernel + (i + ii) * inch * 9 + (k + kk) * 9;

            float half[8][3];

            // first pass: apply ktm to each run of three consecutive kernel values
            for (int g = 0; g < 3; g++, src += 3)
            {
                const float a = src[0];
                const float b = src[1];
                const float c = src[2];

                half[0][g] = a;
                half[1][g] = -a * c2_9 - b * c2_9 - c * c2_9;
                half[2][g] = -a * c2_9 + b * c2_9 - c * c2_9;
                half[3][g] = a * c1_90 + b * c1_45 + c * c2_45;
                half[4][g] = a * c1_90 - b * c1_45 + c * c2_45;
                half[5][g] = a * c1_45 + b * c1_90 + c * c1_180;
                half[6][g] = a * c1_45 - b * c1_90 + c * c1_180;
                half[7][g] = c;
            }

            // second pass: apply ktm again across the three intermediate
            // values of every row and emit eight outputs per row
            for (int row = 0; row < 8; row++, dst += 8)
            {
                const float a = half[row][0];
                const float b = half[row][1];
                const float c = half[row][2];

                dst[0] = a;
                dst[1] = -a * c2_9 - b * c2_9 - c * c2_9;
                dst[2] = -a * c2_9 + b * c2_9 - c * c2_9;
                dst[3] = a * c1_90 + b * c1_45 + c * c2_45;
                dst[4] = a * c1_90 - b * c1_45 + c * c2_45;
                dst[5] = a * c1_45 + b * c1_90 + c * c1_180;
                dst[6] = a * c1_45 - b * c1_90 + c * c1_180;
                dst[7] = c;
            }
        }
    }
}

// Builds the packed, transformed kernel AT for the winograd63 path.
//
// The outch x inch kernel set is cut into TILE_M x TILE_K tiles (tile sizes
// come from conv3x3s1_winograd_get_optimal_tile_mnk). Each tile is
// transformed into a per-thread scratch buffer and then packed into
// AT.channel(M tile).depth(K tile).
static void conv3x3s1_winograd63_transform_kernel(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt)
{
    const int M = outch;
    const int K = inch;
    const int B = 64;

    int TILE_M, TILE_N, TILE_K;
    conv3x3s1_winograd_get_optimal_tile_mnk(M, 0, K, B, TILE_M, TILE_N, TILE_K, opt.num_threads);

    // tile counts, rounded up
    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // one scratch channel per thread
    Mat A_tileX(B * TILE_M * TILE_K, 1, opt.num_threads);

    AT.create(TILE_K * TILE_M, B, nn_K, nn_M);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int tm = 0; tm < nn_M; tm++)
    {
        const int i = tm * TILE_M;
        const int max_ii = std::min(M - i, TILE_M);

        Mat A_tile = A_tileX.channel(get_omp_thread_num());

        for (int tk = 0; tk < nn_K; tk++)
        {
            const int k = tk * TILE_K;
            const int max_kk = std::min(K - k, TILE_K);

            conv3x3s1_winograd63_transform_kernel_tile(kernel, A_tile, inch, i, max_ii, k, max_kk);

            Mat AT_tile = AT.channel(tm).depth(tk);

            conv3x3s1_winograd_pack_A_tile(A_tile, AT_tile, B, max_ii, max_kk);
        }
    }
}

static inline void conv3x3s1_winograd63_transform_input_tile(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
{
    // const float itm[8][8] = {
    //     {1.0f, 0.0f,-5.25f, 0.00f, 5.25f, 0.00f,-1.0f, 0.0f},
    //     {0.0f, 1.0f, 1.00f,-4.25f,-4.25f, 1.00f, 1.0f, 0.0f},
    //     {0.0f,-1.0f, 1.00f, 4.25f,-4.25f,-1.00f, 1.0f, 0.0f},
    //     {0.0f, 0.5f, 0.25f,-2.50f,-1.25f, 2.00f, 1.0f, 0.0f},
    //     {0.0f,-0.5f, 0.25f, 2.50f,-1.25f,-2.00f, 1.0f, 0.0f},
    //     {0.0f, 2.0f, 4.00f,-2.50f,-5.00f, 0.50f, 1.0f, 0.0f},
    //     {0.0f,-2.0f, 4.00f, 2.50f,-5.00f,-0.50f, 1.0f, 0.0f},
    //     {0.0f,-1.0f, 0.00f, 5.25f, 0.00f,-5.25f, 0.0f, 1.0f}
    // };

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int elempack = bottom_blob.elempack;
    const size_t N = bottom_blob.cstep * elempack;

    const int w_tiles = (w + 3) / 6;

    int nn_max_kk = 0;
    int remain_max_kk_start = 0;
#if __ARM_NEON
#if __aarch64__
    nn_max_kk = (max_kk - remain_max_kk_start) / 8;
    #pragma omp parallel for num_threads(nT)
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 8;

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[8][8][8];

        const float coeffs[8] = {5.25f, -4.25f, -1.25f, 0.25f, -2.5f, 0.5f, 2.f, 4.f};
        float32x4_t _coeffs = vld1q_f32(coeffs);
        float32x4_t _coeffs2 = vld1q_f32(coeffs + 4);

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = bottom_blob.channel((k + kk) / elempack).row(ti * 6) + (tj * 6) * elempack;

            for (int m = 0; m < 8; m++)
            {
                float32x4_t _r00 = vdupq_n_f32(0.f);
                float32x4_t _r01 = vdupq_n_f32(0.f);
                float32x4_t _r10 = vdupq_n_f32(0.f);
                float32x4_t _r11 = vdupq_n_f32(0.f);
                float32x4_t _r20 = vdupq_n_f32(0.f);
                float32x4_t _r21 = vdupq_n_f32(0.f);
                float32x4_t _r30 = vdupq_n_f32(0.f);
                float32x4_t _r31 = vdupq_n_f32(0.f);
                float32x4_t _r40 = vdupq_n_f32(0.f);
                float32x4_t _r41 = vdupq_n_f32(0.f);
                float32x4_t _r50 = vdupq_n_f32(0.f);
                float32x4_t _r51 = vdupq_n_f32(0.f);
                float32x4_t _r60 = vdupq_n_f32(0.f);
                float32x4_t _r61 = vdupq_n_f32(0.f);
                float32x4_t _r70 = vdupq_n_f32(0.f);
                float32x4_t _r71 = vdupq_n_f32(0.f);

                if (ti * 6 + m < h)
                {
                    if (elempack == 4)
                    {
                        const float* r1 = r0 + N;

                        _r00 = vld1q_f32(r0);
                        _r01 = vld1q_f32(r1);
                        if (tj * 6 + 1 < w)
                        {
                            _r10 = vld1q_f32(r0 + 4);
                            _r11 = vld1q_f32(r1 + 4);
                        }
                        if (tj * 6 + 2 < w)
                        {
                            _r20 = vld1q_f32(r0 + 8);
                            _r21 = vld1q_f32(r1 + 8);
                        }
                        if (tj * 6 + 3 < w)
                        {
                            _r30 = vld1q_f32(r0 + 12);
                            _r31 = vld1q_f32(r1 + 12);
                        }
                        if (tj * 6 + 4 < w)
                        {
                            _r40 = vld1q_f32(r0 + 16);
                            _r41 = vld1q_f32(r1 + 16);
                        }
                        if (tj * 6 + 5 < w)
                        {
                            _r50 = vld1q_f32(r0 + 20);
                            _r51 = vld1q_f32(r1 + 20);
                        }
                        if (tj * 6 + 6 < w)
                        {
                            _r60 = vld1q_f32(r0 + 24);
                            _r61 = vld1q_f32(r1 + 24);
                        }
                        if (tj * 6 + 7 < w)
                        {
                            _r70 = vld1q_f32(r0 + 28);
                            _r71 = vld1q_f32(r1 + 28);
                        }
                    }
                    if (elempack == 1)
                    {
                        const float* r1 = r0 + N;
                        const float* r2 = r0 + N * 2;
                        const float* r3 = r0 + N * 3;
                        const float* r4 = r0 + N * 4;
                        const float* r5 = r0 + N * 5;
                        const float* r6 = r0 + N * 6;
                        const float* r7 = r0 + N * 7;

                        float32x4_t _t0 = vld1q_f32(r0);
                        float32x4_t _t1 = vld1q_f32(r1);
                        float32x4_t _t2 = vld1q_f32(r2);
                        float32x4_t _t3 = vld1q_f32(r3);
                        float32x4_t _t4 = vld1q_f32(r4);
                        float32x4_t _t5 = vld1q_f32(r5);
                        float32x4_t _t6 = vld1q_f32(r6);
                        float32x4_t _t7 = vld1q_f32(r7);

                        transpose4x4_ps(_t0, _t1, _t2, _t3);
                        transpose4x4_ps(_t4, _t5, _t6, _t7);

                        _r00 = _t0;
                        _r01 = _t4;
                        if (tj * 6 + 1 < w)
                        {
                            _r10 = _t1;
                            _r11 = _t5;
                        }
                        if (tj * 6 + 2 < w)
                        {
                            _r20 = _t2;
                            _r21 = _t6;
                        }
                        if (tj * 6 + 3 < w)
                        {
                            _r30 = _t3;
                            _r31 = _t7;
                        }
                        if (tj * 6 + 4 < w)
                        {
                            _t0 = vld1q_f32(r0 + 4);
                            _t1 = vld1q_f32(r1 + 4);
                            _t2 = vld1q_f32(r2 + 4);
                            _t3 = vld1q_f32(r3 + 4);
                            _t4 = vld1q_f32(r4 + 4);
                            _t5 = vld1q_f32(r5 + 4);
                            _t6 = vld1q_f32(r6 + 4);
                            _t7 = vld1q_f32(r7 + 4);

                            transpose4x4_ps(_t0, _t1, _t2, _t3);
                            transpose4x4_ps(_t4, _t5, _t6, _t7);

                            _r40 = _t0;
                            _r41 = _t4;
                            if (tj * 6 + 5 < w)
                            {
                                _r50 = _t1;
                                _r51 = _t5;
                            }
                            if (tj * 6 + 6 < w)
                            {
                                _r60 = _t2;
                                _r61 = _t6;
                            }
                            if (tj * 6 + 7 < w)
                            {
                                _r70 = _t3;
                                _r71 = _t7;
                            }
                        }
                    }
                }

                float32x4_t _tmp12a0 = vfmaq_laneq_f32(vaddq_f32(_r20, _r60), _r40, _coeffs, 1);
                float32x4_t _tmp12a1 = vfmaq_laneq_f32(vaddq_f32(_r21, _r61), _r41, _coeffs, 1);
                float32x4_t _tmp12b0 = vfmaq_laneq_f32(vaddq_f32(_r10, _r50), _r30, _coeffs, 1);
                float32x4_t _tmp12b1 = vfmaq_laneq_f32(vaddq_f32(_r11, _r51), _r31, _coeffs, 1);
                float32x4_t _tmp34a0 = vfmaq_laneq_f32(vfmaq_laneq_f32(_r60, _r20, _coeffs, 3), _r40, _coeffs, 2);
                float32x4_t _tmp34a1 = vfmaq_laneq_f32(vfmaq_laneq_f32(_r61, _r21, _coeffs, 3), _r41, _coeffs, 2);
                float32x4_t _tmp34b0 = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r10, _coeffs2, 1), _r30, _coeffs2, 0), _r50, _coeffs2, 2);
                float32x4_t _tmp34b1 = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r11, _coeffs2, 1), _r31, _coeffs2, 0), _r51, _coeffs2, 2);
                float32x4_t _tmp56a0 = vfmaq_laneq_f32(_r60, vfmaq_laneq_f32(_r20, _r40, _coeffs, 2), _coeffs2, 3);
                float32x4_t _tmp56a1 = vfmaq_laneq_f32(_r61, vfmaq_laneq_f32(_r21, _r41, _coeffs, 2), _coeffs2, 3);
                float32x4_t _tmp56b0 = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r10, _coeffs2, 2), _r30, _coeffs2, 0), _r50, _coeffs2, 1);
                float32x4_t _tmp56b1 = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r11, _coeffs2, 2), _r31, _coeffs2, 0), _r51, _coeffs2, 1);

                float32x4_t _tmp00 = vfmaq_laneq_f32(vsubq_f32(_r00, _r60), vsubq_f32(_r40, _r20), _coeffs, 0);
                float32x4_t _tmp01 = vfmaq_laneq_f32(vsubq_f32(_r01, _r61), vsubq_f32(_r41, _r21), _coeffs, 0);
                float32x4_t _tmp10 = vaddq_f32(_tmp12a0, _tmp12b0);
                float32x4_t _tmp11 = vaddq_f32(_tmp12a1, _tmp12b1);
                float32x4_t _tmp20 = vsubq_f32(_tmp12a0, _tmp12b0);
                float32x4_t _tmp21 = vsubq_f32(_tmp12a1, _tmp12b1);
                float32x4_t _tmp30 = vaddq_f32(_tmp34a0, _tmp34b0);
                float32x4_t _tmp31 = vaddq_f32(_tmp34a1, _tmp34b1);
                float32x4_t _tmp40 = vsubq_f32(_tmp34a0, _tmp34b0);
                float32x4_t _tmp41 = vsubq_f32(_tmp34a1, _tmp34b1);
                float32x4_t _tmp50 = vaddq_f32(_tmp56a0, _tmp56b0);
                float32x4_t _tmp51 = vaddq_f32(_tmp56a1, _tmp56b1);
                float32x4_t _tmp60 = vsubq_f32(_tmp56a0, _tmp56b0);
                float32x4_t _tmp61 = vsubq_f32(_tmp56a1, _tmp56b1);
                float32x4_t _tmp70 = vfmaq_laneq_f32(vsubq_f32(_r70, _r10), vsubq_f32(_r30, _r50), _coeffs, 0);
                float32x4_t _tmp71 = vfmaq_laneq_f32(vsubq_f32(_r71, _r11), vsubq_f32(_r31, _r51), _coeffs, 0);

                vst1q_f32(tmp[0][m], _tmp00);
                vst1q_f32(tmp[0][m] + 4, _tmp01);
                vst1q_f32(tmp[1][m], _tmp10);
                vst1q_f32(tmp[1][m] + 4, _tmp11);
                vst1q_f32(tmp[2][m], _tmp20);
                vst1q_f32(tmp[2][m] + 4, _tmp21);
                vst1q_f32(tmp[3][m], _tmp30);
                vst1q_f32(tmp[3][m] + 4, _tmp31);
                vst1q_f32(tmp[4][m], _tmp40);
                vst1q_f32(tmp[4][m] + 4, _tmp41);
                vst1q_f32(tmp[5][m], _tmp50);
                vst1q_f32(tmp[5][m] + 4, _tmp51);
                vst1q_f32(tmp[6][m], _tmp60);
                vst1q_f32(tmp[6][m] + 4, _tmp61);
                vst1q_f32(tmp[7][m], _tmp70);
                vst1q_f32(tmp[7][m] + 4, _tmp71);

                r0 += w * elempack;
            }

            float* p0 = (float*)B + kk * max_jj * 64 + jj * 8;
            float* p1 = p0 + max_jj * 8;
            float* p2 = p0 + max_jj * 8 * 2;
            float* p3 = p0 + max_jj * 8 * 3;
            float* p4 = p0 + max_jj * 8 * 4;
            float* p5 = p0 + max_jj * 8 * 5;
            float* p6 = p0 + max_jj * 8 * 6;
            float* p7 = p0 + max_jj * 8 * 7;

            for (int m = 0; m < 8; m++)
            {
                float32x4_t _r00 = vld1q_f32(tmp[m][0]);
                float32x4_t _r01 = vld1q_f32(tmp[m][0] + 4);
                float32x4_t _r10 = vld1q_f32(tmp[m][1]);
                float32x4_t _r11 = vld1q_f32(tmp[m][1] + 4);
                float32x4_t _r20 = vld1q_f32(tmp[m][2]);
                float32x4_t _r21 = vld1q_f32(tmp[m][2] + 4);
                float32x4_t _r30 = vld1q_f32(tmp[m][3]);
                float32x4_t _r31 = vld1q_f32(tmp[m][3] + 4);
                float32x4_t _r40 = vld1q_f32(tmp[m][4]);
                float32x4_t _r41 = vld1q_f32(tmp[m][4] + 4);
                float32x4_t _r50 = vld1q_f32(tmp[m][5]);
                float32x4_t _r51 = vld1q_f32(tmp[m][5] + 4);
                float32x4_t _r60 = vld1q_f32(tmp[m][6]);
                float32x4_t _r61 = vld1q_f32(tmp[m][6] + 4);
                float32x4_t _r70 = vld1q_f32(tmp[m][7]);
                float32x4_t _r71 = vld1q_f32(tmp[m][7] + 4);

                float32x4_t _tmp12a0 = vfmaq_laneq_f32(vaddq_f32(_r20, _r60), _r40, _coeffs, 1);
                float32x4_t _tmp12a1 = vfmaq_laneq_f32(vaddq_f32(_r21, _r61), _r41, _coeffs, 1);
                float32x4_t _tmp12b0 = vfmaq_laneq_f32(vaddq_f32(_r10, _r50), _r30, _coeffs, 1);
                float32x4_t _tmp12b1 = vfmaq_laneq_f32(vaddq_f32(_r11, _r51), _r31, _coeffs, 1);
                float32x4_t _tmp34a0 = vfmaq_laneq_f32(vfmaq_laneq_f32(_r60, _r20, _coeffs, 3), _r40, _coeffs, 2);
                float32x4_t _tmp34a1 = vfmaq_laneq_f32(vfmaq_laneq_f32(_r61, _r21, _coeffs, 3), _r41, _coeffs, 2);
                float32x4_t _tmp34b0 = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r10, _coeffs2, 1), _r30, _coeffs2, 0), _r50, _coeffs2, 2);
                float32x4_t _tmp34b1 = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r11, _coeffs2, 1), _r31, _coeffs2, 0), _r51, _coeffs2, 2);
                float32x4_t _tmp56a0 = vfmaq_laneq_f32(_r60, vfmaq_laneq_f32(_r20, _r40, _coeffs, 2), _coeffs2, 3);
                float32x4_t _tmp56a1 = vfmaq_laneq_f32(_r61, vfmaq_laneq_f32(_r21, _r41, _coeffs, 2), _coeffs2, 3);
                float32x4_t _tmp56b0 = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r10, _coeffs2, 2), _r30, _coeffs2, 0), _r50, _coeffs2, 1);
                float32x4_t _tmp56b1 = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r11, _coeffs2, 2), _r31, _coeffs2, 0), _r51, _coeffs2, 1);

                float32x4_t _tmp00 = vfmaq_laneq_f32(vsubq_f32(_r00, _r60), vsubq_f32(_r40, _r20), _coeffs, 0);
                float32x4_t _tmp01 = vfmaq_laneq_f32(vsubq_f32(_r01, _r61), vsubq_f32(_r41, _r21), _coeffs, 0);
                float32x4_t _tmp10 = vaddq_f32(_tmp12a0, _tmp12b0);
                float32x4_t _tmp11 = vaddq_f32(_tmp12a1, _tmp12b1);
                float32x4_t _tmp20 = vsubq_f32(_tmp12a0, _tmp12b0);
                float32x4_t _tmp21 = vsubq_f32(_tmp12a1, _tmp12b1);
                float32x4_t _tmp30 = vaddq_f32(_tmp34a0, _tmp34b0);
                float32x4_t _tmp31 = vaddq_f32(_tmp34a1, _tmp34b1);
                float32x4_t _tmp40 = vsubq_f32(_tmp34a0, _tmp34b0);
                float32x4_t _tmp41 = vsubq_f32(_tmp34a1, _tmp34b1);
                float32x4_t _tmp50 = vaddq_f32(_tmp56a0, _tmp56b0);
                float32x4_t _tmp51 = vaddq_f32(_tmp56a1, _tmp56b1);
                float32x4_t _tmp60 = vsubq_f32(_tmp56a0, _tmp56b0);
                float32x4_t _tmp61 = vsubq_f32(_tmp56a1, _tmp56b1);
                float32x4_t _tmp70 = vfmaq_laneq_f32(vsubq_f32(_r70, _r10), vsubq_f32(_r30, _r50), _coeffs, 0);
                float32x4_t _tmp71 = vfmaq_laneq_f32(vsubq_f32(_r71, _r11), vsubq_f32(_r31, _r51), _coeffs, 0);

                vst1q_f32(p0, _tmp00);
                vst1q_f32(p0 + 4, _tmp01);
                vst1q_f32(p1, _tmp10);
                vst1q_f32(p1 + 4, _tmp11);
                vst1q_f32(p2, _tmp20);
                vst1q_f32(p2 + 4, _tmp21);
                vst1q_f32(p3, _tmp30);
                vst1q_f32(p3 + 4, _tmp31);
                vst1q_f32(p4, _tmp40);
                vst1q_f32(p4 + 4, _tmp41);
                vst1q_f32(p5, _tmp50);
                vst1q_f32(p5 + 4, _tmp51);
                vst1q_f32(p6, _tmp60);
                vst1q_f32(p6 + 4, _tmp61);
                vst1q_f32(p7, _tmp70);
                vst1q_f32(p7 + 4, _tmp71);

                p0 += max_jj * 8 * 8;
                p1 += max_jj * 8 * 8;
                p2 += max_jj * 8 * 8;
                p3 += max_jj * 8 * 8;
                p4 += max_jj * 8 * 8;
                p5 += max_jj * 8 * 8;
                p6 += max_jj * 8 * 8;
                p7 += max_jj * 8 * 8;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 8;
    nn_max_kk = (max_kk - remain_max_kk_start) / 4;
#else // __aarch64__
    nn_max_kk = (max_kk - remain_max_kk_start) / 4;
    #pragma omp parallel for num_threads(nT)
#endif // __aarch64__
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 4;

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[8][8][4];

        const float coeffs[8] = {5.25f, -4.25f, -1.25f, 0.25f, -2.5f, 0.5f, 2.f, 4.f};
        float32x4_t _coeffs = vld1q_f32(coeffs);
        float32x4_t _coeffs2 = vld1q_f32(coeffs + 4);

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = bottom_blob.channel((k + kk) / elempack).row(ti * 6) + (tj * 6) * elempack;

            for (int m = 0; m < 8; m++)
            {
                float32x4_t _r0 = vdupq_n_f32(0.f);
                float32x4_t _r1 = vdupq_n_f32(0.f);
                float32x4_t _r2 = vdupq_n_f32(0.f);
                float32x4_t _r3 = vdupq_n_f32(0.f);
                float32x4_t _r4 = vdupq_n_f32(0.f);
                float32x4_t _r5 = vdupq_n_f32(0.f);
                float32x4_t _r6 = vdupq_n_f32(0.f);
                float32x4_t _r7 = vdupq_n_f32(0.f);

                if (ti * 6 + m < h)
                {
                    if (elempack == 4)
                    {
                        _r0 = vld1q_f32(r0);
                        if (tj * 6 + 1 < w) _r1 = vld1q_f32(r0 + 4);
                        if (tj * 6 + 2 < w) _r2 = vld1q_f32(r0 + 8);
                        if (tj * 6 + 3 < w) _r3 = vld1q_f32(r0 + 12);
                        if (tj * 6 + 4 < w) _r4 = vld1q_f32(r0 + 16);
                        if (tj * 6 + 5 < w) _r5 = vld1q_f32(r0 + 20);
                        if (tj * 6 + 6 < w) _r6 = vld1q_f32(r0 + 24);
                        if (tj * 6 + 7 < w) _r7 = vld1q_f32(r0 + 28);
                    }
                    if (elempack == 1)
                    {
                        const float* r1 = r0 + N;
                        const float* r2 = r0 + N * 2;
                        const float* r3 = r0 + N * 3;

                        float32x4_t _t0 = vld1q_f32(r0);
                        float32x4_t _t1 = vld1q_f32(r1);
                        float32x4_t _t2 = vld1q_f32(r2);
                        float32x4_t _t3 = vld1q_f32(r3);

                        transpose4x4_ps(_t0, _t1, _t2, _t3);

                        _r0 = _t0;
                        if (tj * 6 + 1 < w) _r1 = _t1;
                        if (tj * 6 + 2 < w) _r2 = _t2;
                        if (tj * 6 + 3 < w) _r3 = _t3;
                        if (tj * 6 + 4 < w)
                        {
                            _t0 = vld1q_f32(r0 + 4);
                            _t1 = vld1q_f32(r1 + 4);
                            _t2 = vld1q_f32(r2 + 4);
                            _t3 = vld1q_f32(r3 + 4);

                            transpose4x4_ps(_t0, _t1, _t2, _t3);

                            _r4 = _t0;
                            if (tj * 6 + 5 < w) _r5 = _t1;
                            if (tj * 6 + 6 < w) _r6 = _t2;
                            if (tj * 6 + 7 < w) _r7 = _t3;
                        }
                    }
                }

#if __aarch64__
                float32x4_t _tmp12a = vfmaq_laneq_f32(vaddq_f32(_r2, _r6), _r4, _coeffs, 1);
                float32x4_t _tmp12b = vfmaq_laneq_f32(vaddq_f32(_r1, _r5), _r3, _coeffs, 1);
                float32x4_t _tmp34a = vfmaq_laneq_f32(vfmaq_laneq_f32(_r6, _r2, _coeffs, 3), _r4, _coeffs, 2);
                float32x4_t _tmp34b = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r1, _coeffs2, 1), _r3, _coeffs2, 0), _r5, _coeffs2, 2);
                float32x4_t _tmp56a = vfmaq_laneq_f32(_r6, vfmaq_laneq_f32(_r2, _r4, _coeffs, 2), _coeffs2, 3);
                float32x4_t _tmp56b = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r1, _coeffs2, 2), _r3, _coeffs2, 0), _r5, _coeffs2, 1);
#else
                float32x4_t _tmp12a = vmlaq_lane_f32(vaddq_f32(_r2, _r6), _r4, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp12b = vmlaq_lane_f32(vaddq_f32(_r1, _r5), _r3, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp34a = vmlaq_lane_f32(vmlaq_lane_f32(_r6, _r2, vget_high_f32(_coeffs), 1), _r4, vget_high_f32(_coeffs), 0);
                float32x4_t _tmp34b = vmlaq_lane_f32(vmlaq_lane_f32(vmulq_lane_f32(_r1, vget_low_f32(_coeffs2), 1), _r3, vget_low_f32(_coeffs2), 0), _r5, vget_high_f32(_coeffs2), 0);
                float32x4_t _tmp56a = vmlaq_lane_f32(_r6, vmlaq_lane_f32(_r2, _r4, vget_high_f32(_coeffs), 0), vget_high_f32(_coeffs2), 1);
                float32x4_t _tmp56b = vmlaq_lane_f32(vmlaq_lane_f32(vmulq_lane_f32(_r1, vget_high_f32(_coeffs2), 0), _r3, vget_low_f32(_coeffs2), 0), _r5, vget_low_f32(_coeffs2), 1);
#endif

#if __aarch64__
                float32x4_t _tmp0 = vfmaq_laneq_f32(vsubq_f32(_r0, _r6), vsubq_f32(_r4, _r2), _coeffs, 0);
#else
                float32x4_t _tmp0 = vmlaq_lane_f32(vsubq_f32(_r0, _r6), vsubq_f32(_r4, _r2), vget_low_f32(_coeffs), 0);
#endif
                float32x4_t _tmp1 = vaddq_f32(_tmp12a, _tmp12b);
                float32x4_t _tmp2 = vsubq_f32(_tmp12a, _tmp12b);
                float32x4_t _tmp3 = vaddq_f32(_tmp34a, _tmp34b);
                float32x4_t _tmp4 = vsubq_f32(_tmp34a, _tmp34b);
                float32x4_t _tmp5 = vaddq_f32(_tmp56a, _tmp56b);
                float32x4_t _tmp6 = vsubq_f32(_tmp56a, _tmp56b);
#if __aarch64__
                float32x4_t _tmp7 = vfmaq_laneq_f32(vsubq_f32(_r7, _r1), vsubq_f32(_r3, _r5), _coeffs, 0);
#else
                float32x4_t _tmp7 = vmlaq_lane_f32(vsubq_f32(_r7, _r1), vsubq_f32(_r3, _r5), vget_low_f32(_coeffs), 0);
#endif

                vst1q_f32(tmp[0][m], _tmp0);
                vst1q_f32(tmp[1][m], _tmp1);
                vst1q_f32(tmp[2][m], _tmp2);
                vst1q_f32(tmp[3][m], _tmp3);
                vst1q_f32(tmp[4][m], _tmp4);
                vst1q_f32(tmp[5][m], _tmp5);
                vst1q_f32(tmp[6][m], _tmp6);
                vst1q_f32(tmp[7][m], _tmp7);

                r0 += w * elempack;
            }

            float* p0 = (float*)B + kk * max_jj * 64 + jj * 4;
            float* p1 = p0 + max_jj * 4;
            float* p2 = p0 + max_jj * 4 * 2;
            float* p3 = p0 + max_jj * 4 * 3;
            float* p4 = p0 + max_jj * 4 * 4;
            float* p5 = p0 + max_jj * 4 * 5;
            float* p6 = p0 + max_jj * 4 * 6;
            float* p7 = p0 + max_jj * 4 * 7;

            for (int m = 0; m < 8; m++)
            {
                float32x4_t _r0 = vld1q_f32(tmp[m][0]);
                float32x4_t _r1 = vld1q_f32(tmp[m][1]);
                float32x4_t _r2 = vld1q_f32(tmp[m][2]);
                float32x4_t _r3 = vld1q_f32(tmp[m][3]);
                float32x4_t _r4 = vld1q_f32(tmp[m][4]);
                float32x4_t _r5 = vld1q_f32(tmp[m][5]);
                float32x4_t _r6 = vld1q_f32(tmp[m][6]);
                float32x4_t _r7 = vld1q_f32(tmp[m][7]);

#if __aarch64__
                float32x4_t _tmp12a = vfmaq_laneq_f32(vaddq_f32(_r2, _r6), _r4, _coeffs, 1);
                float32x4_t _tmp12b = vfmaq_laneq_f32(vaddq_f32(_r1, _r5), _r3, _coeffs, 1);
                float32x4_t _tmp34a = vfmaq_laneq_f32(vfmaq_laneq_f32(_r6, _r2, _coeffs, 3), _r4, _coeffs, 2);
                float32x4_t _tmp34b = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r1, _coeffs2, 1), _r3, _coeffs2, 0), _r5, _coeffs2, 2);
                float32x4_t _tmp56a = vfmaq_laneq_f32(_r6, vfmaq_laneq_f32(_r2, _r4, _coeffs, 2), _coeffs2, 3);
                float32x4_t _tmp56b = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r1, _coeffs2, 2), _r3, _coeffs2, 0), _r5, _coeffs2, 1);
#else
                float32x4_t _tmp12a = vmlaq_lane_f32(vaddq_f32(_r2, _r6), _r4, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp12b = vmlaq_lane_f32(vaddq_f32(_r1, _r5), _r3, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp34a = vmlaq_lane_f32(vmlaq_lane_f32(_r6, _r2, vget_high_f32(_coeffs), 1), _r4, vget_high_f32(_coeffs), 0);
                float32x4_t _tmp34b = vmlaq_lane_f32(vmlaq_lane_f32(vmulq_lane_f32(_r1, vget_low_f32(_coeffs2), 1), _r3, vget_low_f32(_coeffs2), 0), _r5, vget_high_f32(_coeffs2), 0);
                float32x4_t _tmp56a = vmlaq_lane_f32(_r6, vmlaq_lane_f32(_r2, _r4, vget_high_f32(_coeffs), 0), vget_high_f32(_coeffs2), 1);
                float32x4_t _tmp56b = vmlaq_lane_f32(vmlaq_lane_f32(vmulq_lane_f32(_r1, vget_high_f32(_coeffs2), 0), _r3, vget_low_f32(_coeffs2), 0), _r5, vget_low_f32(_coeffs2), 1);
#endif

#if __aarch64__
                float32x4_t _tmp0 = vfmaq_laneq_f32(vsubq_f32(_r0, _r6), vsubq_f32(_r4, _r2), _coeffs, 0);
#else
                float32x4_t _tmp0 = vmlaq_lane_f32(vsubq_f32(_r0, _r6), vsubq_f32(_r4, _r2), vget_low_f32(_coeffs), 0);
#endif
                float32x4_t _tmp1 = vaddq_f32(_tmp12a, _tmp12b);
                float32x4_t _tmp2 = vsubq_f32(_tmp12a, _tmp12b);
                float32x4_t _tmp3 = vaddq_f32(_tmp34a, _tmp34b);
                float32x4_t _tmp4 = vsubq_f32(_tmp34a, _tmp34b);
                float32x4_t _tmp5 = vaddq_f32(_tmp56a, _tmp56b);
                float32x4_t _tmp6 = vsubq_f32(_tmp56a, _tmp56b);
#if __aarch64__
                float32x4_t _tmp7 = vfmaq_laneq_f32(vsubq_f32(_r7, _r1), vsubq_f32(_r3, _r5), _coeffs, 0);
#else
                float32x4_t _tmp7 = vmlaq_lane_f32(vsubq_f32(_r7, _r1), vsubq_f32(_r3, _r5), vget_low_f32(_coeffs), 0);
#endif

                vst1q_f32(p0, _tmp0);
                vst1q_f32(p1, _tmp1);
                vst1q_f32(p2, _tmp2);
                vst1q_f32(p3, _tmp3);
                vst1q_f32(p4, _tmp4);
                vst1q_f32(p5, _tmp5);
                vst1q_f32(p6, _tmp6);
                vst1q_f32(p7, _tmp7);

                p0 += max_jj * 8 * 4;
                p1 += max_jj * 8 * 4;
                p2 += max_jj * 8 * 4;
                p3 += max_jj * 8 * 4;
                p4 += max_jj * 8 * 4;
                p5 += max_jj * 8 * 4;
                p6 += max_jj * 8 * 4;
                p7 += max_jj * 8 * 4;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 4;
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
#else // __ARM_NEON
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
    #pragma omp parallel for num_threads(nT)
#endif // __ARM_NEON
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 2;

#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        float tmp[8][8][2];

#if __ARM_NEON
        const float coeffs[8] = {5.25f, -4.25f, -1.25f, 0.25f, -2.5f, 0.5f, 2.f, 4.f};
        float32x4_t _coeffs = vld1q_f32(coeffs);
        float32x4_t _coeffs2 = vld1q_f32(coeffs + 4);
#endif

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = bottom_blob.channel(k + kk).row(ti * 6) + (tj * 6);

            for (int m = 0; m < 8; m++)
            {
#if __ARM_NEON
                float32x2_t _r0 = vdup_n_f32(0.f);
                float32x2_t _r1 = vdup_n_f32(0.f);
                float32x2_t _r2 = vdup_n_f32(0.f);
                float32x2_t _r3 = vdup_n_f32(0.f);
                float32x2_t _r4 = vdup_n_f32(0.f);
                float32x2_t _r5 = vdup_n_f32(0.f);
                float32x2_t _r6 = vdup_n_f32(0.f);
                float32x2_t _r7 = vdup_n_f32(0.f);
#else
                float r00 = 0.f;
                float r01 = 0.f;
                float r10 = 0.f;
                float r11 = 0.f;
                float r20 = 0.f;
                float r21 = 0.f;
                float r30 = 0.f;
                float r31 = 0.f;
                float r40 = 0.f;
                float r41 = 0.f;
                float r50 = 0.f;
                float r51 = 0.f;
                float r60 = 0.f;
                float r61 = 0.f;
                float r70 = 0.f;
                float r71 = 0.f;
#endif

                if (ti * 6 + m < h)
                {
                    // if (elempack == 1)
                    {
                        const float* r1 = r0 + N;

#if __ARM_NEON
                        float32x4_t _t0 = vld1q_f32(r0);
                        float32x4_t _t1 = vld1q_f32(r1);
                        float32x4x2_t _t01 = vzipq_f32(_t0, _t1);

                        _r0 = vget_low_f32(_t01.val[0]);
                        if (tj * 6 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
                        if (tj * 6 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
                        if (tj * 6 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
                        if (tj * 6 + 4 < w)
                        {
                            _t0 = vld1q_f32(r0 + 4);
                            _t1 = vld1q_f32(r1 + 4);
                            _t01 = vzipq_f32(_t0, _t1);

                            _r4 = vget_low_f32(_t01.val[0]);
                            if (tj * 6 + 5 < w) _r5 = vget_high_f32(_t01.val[0]);
                            if (tj * 6 + 6 < w) _r6 = vget_low_f32(_t01.val[1]);
                            if (tj * 6 + 7 < w) _r7 = vget_high_f32(_t01.val[1]);
                        }
#else
                        r00 = r0[0];
                        r01 = r1[0];
                        if (tj * 6 + 1 < w)
                        {
                            r10 = r0[1];
                            r11 = r1[1];
                        }
                        if (tj * 6 + 2 < w)
                        {
                            r20 = r0[2];
                            r21 = r1[2];
                        }
                        if (tj * 6 + 3 < w)
                        {
                            r30 = r0[3];
                            r31 = r1[3];
                        }
                        if (tj * 6 + 4 < w)
                        {
                            r40 = r0[4];
                            r41 = r1[4];
                        }
                        if (tj * 6 + 5 < w)
                        {
                            r50 = r0[5];
                            r51 = r1[5];
                        }
                        if (tj * 6 + 6 < w)
                        {
                            r60 = r0[6];
                            r61 = r1[6];
                        }
                        if (tj * 6 + 7 < w)
                        {
                            r70 = r0[7];
                            r71 = r1[7];
                        }
#endif
                    }
                }

#if __ARM_NEON
#if __aarch64__
                float32x2_t _tmp12a = vfma_laneq_f32(vadd_f32(_r2, _r6), _r4, _coeffs, 1);
                float32x2_t _tmp12b = vfma_laneq_f32(vadd_f32(_r1, _r5), _r3, _coeffs, 1);
                float32x2_t _tmp34a = vfma_laneq_f32(vfma_laneq_f32(_r6, _r2, _coeffs, 3), _r4, _coeffs, 2);
                float32x2_t _tmp34b = vfma_laneq_f32(vfma_laneq_f32(vmul_laneq_f32(_r1, _coeffs2, 1), _r3, _coeffs2, 0), _r5, _coeffs2, 2);
                float32x2_t _tmp56a = vfma_laneq_f32(_r6, vfma_laneq_f32(_r2, _r4, _coeffs, 2), _coeffs2, 3);
                float32x2_t _tmp56b = vfma_laneq_f32(vfma_laneq_f32(vmul_laneq_f32(_r1, _coeffs2, 2), _r3, _coeffs2, 0), _r5, _coeffs2, 1);
#else
                float32x2_t _tmp12a = vmla_lane_f32(vadd_f32(_r2, _r6), _r4, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp12b = vmla_lane_f32(vadd_f32(_r1, _r5), _r3, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp34a = vmla_lane_f32(vmla_lane_f32(_r6, _r2, vget_high_f32(_coeffs), 1), _r4, vget_high_f32(_coeffs), 0);
                float32x2_t _tmp34b = vmla_lane_f32(vmla_lane_f32(vmul_lane_f32(_r1, vget_low_f32(_coeffs2), 1), _r3, vget_low_f32(_coeffs2), 0), _r5, vget_high_f32(_coeffs2), 0);
                float32x2_t _tmp56a = vmla_lane_f32(_r6, vmla_lane_f32(_r2, _r4, vget_high_f32(_coeffs), 0), vget_high_f32(_coeffs2), 1);
                float32x2_t _tmp56b = vmla_lane_f32(vmla_lane_f32(vmul_lane_f32(_r1, vget_high_f32(_coeffs2), 0), _r3, vget_low_f32(_coeffs2), 0), _r5, vget_low_f32(_coeffs2), 1);
#endif

#if __aarch64__
                float32x2_t _tmp0 = vfma_laneq_f32(vsub_f32(_r0, _r6), vsub_f32(_r4, _r2), _coeffs, 0);
#else
                float32x2_t _tmp0 = vmla_lane_f32(vsub_f32(_r0, _r6), vsub_f32(_r4, _r2), vget_low_f32(_coeffs), 0);
#endif
                float32x2_t _tmp1 = vadd_f32(_tmp12a, _tmp12b);
                float32x2_t _tmp2 = vsub_f32(_tmp12a, _tmp12b);
                float32x2_t _tmp3 = vadd_f32(_tmp34a, _tmp34b);
                float32x2_t _tmp4 = vsub_f32(_tmp34a, _tmp34b);
                float32x2_t _tmp5 = vadd_f32(_tmp56a, _tmp56b);
                float32x2_t _tmp6 = vsub_f32(_tmp56a, _tmp56b);
#if __aarch64__
                float32x2_t _tmp7 = vfma_laneq_f32(vsub_f32(_r7, _r1), vsub_f32(_r3, _r5), _coeffs, 0);
#else
                float32x2_t _tmp7 = vmla_lane_f32(vsub_f32(_r7, _r1), vsub_f32(_r3, _r5), vget_low_f32(_coeffs), 0);
#endif

                vst1_f32(tmp[0][m], _tmp0);
                vst1_f32(tmp[1][m], _tmp1);
                vst1_f32(tmp[2][m], _tmp2);
                vst1_f32(tmp[3][m], _tmp3);
                vst1_f32(tmp[4][m], _tmp4);
                vst1_f32(tmp[5][m], _tmp5);
                vst1_f32(tmp[6][m], _tmp6);
                vst1_f32(tmp[7][m], _tmp7);
#else
                float tmp12a0 = r20 + r60 - r40 * 4.25f;
                float tmp12a1 = r21 + r61 - r41 * 4.25f;
                float tmp12b0 = r10 + r50 - r30 * 4.25f;
                float tmp12b1 = r11 + r51 - r31 * 4.25f;
                float tmp34a0 = r60 + r20 * 0.25f - r40 * 1.25f;
                float tmp34a1 = r61 + r21 * 0.25f - r41 * 1.25f;
                float tmp34b0 = r10 * 0.5f - r30 * 2.5f + r50 * 2.f;
                float tmp34b1 = r11 * 0.5f - r31 * 2.5f + r51 * 2.f;
                float tmp56a0 = r20 * 4.f - r40 * 5.f + r60;
                float tmp56a1 = r21 * 4.f - r41 * 5.f + r61;
                float tmp56b0 = r10 * 2.f - r30 * 2.5f + r50 * 0.5f;
                float tmp56b1 = r11 * 2.f - r31 * 2.5f + r51 * 0.5f;

                tmp[0][m][0] = r00 - r60 + (r40 - r20) * 5.25f;
                tmp[0][m][1] = r01 - r61 + (r41 - r21) * 5.25f;
                tmp[1][m][0] = tmp12a0 + tmp12b0;
                tmp[1][m][1] = tmp12a1 + tmp12b1;
                tmp[2][m][0] = tmp12a0 - tmp12b0;
                tmp[2][m][1] = tmp12a1 - tmp12b1;
                tmp[3][m][0] = tmp34a0 + tmp34b0;
                tmp[3][m][1] = tmp34a1 + tmp34b1;
                tmp[4][m][0] = tmp34a0 - tmp34b0;
                tmp[4][m][1] = tmp34a1 - tmp34b1;
                tmp[5][m][0] = tmp56a0 + tmp56b0;
                tmp[5][m][1] = tmp56a1 + tmp56b1;
                tmp[6][m][0] = tmp56a0 - tmp56b0;
                tmp[6][m][1] = tmp56a1 - tmp56b1;
                tmp[7][m][0] = r70 - r10 + (r30 - r50) * 5.25f;
                tmp[7][m][1] = r71 - r11 + (r31 - r51) * 5.25f;
#endif

                r0 += w;
            }

            float* p0 = (float*)B + kk * max_jj * 64 + jj * 2;
            float* p1 = p0 + max_jj * 2;
            float* p2 = p0 + max_jj * 2 * 2;
            float* p3 = p0 + max_jj * 2 * 3;
            float* p4 = p0 + max_jj * 2 * 4;
            float* p5 = p0 + max_jj * 2 * 5;
            float* p6 = p0 + max_jj * 2 * 6;
            float* p7 = p0 + max_jj * 2 * 7;

            for (int m = 0; m < 8; m++)
            {
#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(tmp[m][0]);
                float32x2_t _r1 = vld1_f32(tmp[m][1]);
                float32x2_t _r2 = vld1_f32(tmp[m][2]);
                float32x2_t _r3 = vld1_f32(tmp[m][3]);
                float32x2_t _r4 = vld1_f32(tmp[m][4]);
                float32x2_t _r5 = vld1_f32(tmp[m][5]);
                float32x2_t _r6 = vld1_f32(tmp[m][6]);
                float32x2_t _r7 = vld1_f32(tmp[m][7]);

#if __aarch64__
                float32x2_t _tmp12a = vfma_laneq_f32(vadd_f32(_r2, _r6), _r4, _coeffs, 1);
                float32x2_t _tmp12b = vfma_laneq_f32(vadd_f32(_r1, _r5), _r3, _coeffs, 1);
                float32x2_t _tmp34a = vfma_laneq_f32(vfma_laneq_f32(_r6, _r2, _coeffs, 3), _r4, _coeffs, 2);
                float32x2_t _tmp34b = vfma_laneq_f32(vfma_laneq_f32(vmul_laneq_f32(_r1, _coeffs2, 1), _r3, _coeffs2, 0), _r5, _coeffs2, 2);
                float32x2_t _tmp56a = vfma_laneq_f32(_r6, vfma_laneq_f32(_r2, _r4, _coeffs, 2), _coeffs2, 3);
                float32x2_t _tmp56b = vfma_laneq_f32(vfma_laneq_f32(vmul_laneq_f32(_r1, _coeffs2, 2), _r3, _coeffs2, 0), _r5, _coeffs2, 1);
#else
                float32x2_t _tmp12a = vmla_lane_f32(vadd_f32(_r2, _r6), _r4, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp12b = vmla_lane_f32(vadd_f32(_r1, _r5), _r3, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp34a = vmla_lane_f32(vmla_lane_f32(_r6, _r2, vget_high_f32(_coeffs), 1), _r4, vget_high_f32(_coeffs), 0);
                float32x2_t _tmp34b = vmla_lane_f32(vmla_lane_f32(vmul_lane_f32(_r1, vget_low_f32(_coeffs2), 1), _r3, vget_low_f32(_coeffs2), 0), _r5, vget_high_f32(_coeffs2), 0);
                float32x2_t _tmp56a = vmla_lane_f32(_r6, vmla_lane_f32(_r2, _r4, vget_high_f32(_coeffs), 0), vget_high_f32(_coeffs2), 1);
                float32x2_t _tmp56b = vmla_lane_f32(vmla_lane_f32(vmul_lane_f32(_r1, vget_high_f32(_coeffs2), 0), _r3, vget_low_f32(_coeffs2), 0), _r5, vget_low_f32(_coeffs2), 1);
#endif

#if __aarch64__
                float32x2_t _tmp0 = vfma_laneq_f32(vsub_f32(_r0, _r6), vsub_f32(_r4, _r2), _coeffs, 0);
#else
                float32x2_t _tmp0 = vmla_lane_f32(vsub_f32(_r0, _r6), vsub_f32(_r4, _r2), vget_low_f32(_coeffs), 0);
#endif
                float32x2_t _tmp1 = vadd_f32(_tmp12a, _tmp12b);
                float32x2_t _tmp2 = vsub_f32(_tmp12a, _tmp12b);
                float32x2_t _tmp3 = vadd_f32(_tmp34a, _tmp34b);
                float32x2_t _tmp4 = vsub_f32(_tmp34a, _tmp34b);
                float32x2_t _tmp5 = vadd_f32(_tmp56a, _tmp56b);
                float32x2_t _tmp6 = vsub_f32(_tmp56a, _tmp56b);
#if __aarch64__
                float32x2_t _tmp7 = vfma_laneq_f32(vsub_f32(_r7, _r1), vsub_f32(_r3, _r5), _coeffs, 0);
#else
                float32x2_t _tmp7 = vmla_lane_f32(vsub_f32(_r7, _r1), vsub_f32(_r3, _r5), vget_low_f32(_coeffs), 0);
#endif

                vst1_f32(p0, _tmp0);
                vst1_f32(p1, _tmp1);
                vst1_f32(p2, _tmp2);
                vst1_f32(p3, _tmp3);
                vst1_f32(p4, _tmp4);
                vst1_f32(p5, _tmp5);
                vst1_f32(p6, _tmp6);
                vst1_f32(p7, _tmp7);
#else
                float r00 = tmp[m][0][0];
                float r01 = tmp[m][0][1];
                float r10 = tmp[m][1][0];
                float r11 = tmp[m][1][1];
                float r20 = tmp[m][2][0];
                float r21 = tmp[m][2][1];
                float r30 = tmp[m][3][0];
                float r31 = tmp[m][3][1];
                float r40 = tmp[m][4][0];
                float r41 = tmp[m][4][1];
                float r50 = tmp[m][5][0];
                float r51 = tmp[m][5][1];
                float r60 = tmp[m][6][0];
                float r61 = tmp[m][6][1];
                float r70 = tmp[m][7][0];
                float r71 = tmp[m][7][1];

                float tmp12a0 = r20 + r60 - r40 * 4.25f;
                float tmp12a1 = r21 + r61 - r41 * 4.25f;
                float tmp12b0 = r10 + r50 - r30 * 4.25f;
                float tmp12b1 = r11 + r51 - r31 * 4.25f;
                float tmp34a0 = r60 + r20 * 0.25f - r40 * 1.25f;
                float tmp34a1 = r61 + r21 * 0.25f - r41 * 1.25f;
                float tmp34b0 = r10 * 0.5f - r30 * 2.5f + r50 * 2.f;
                float tmp34b1 = r11 * 0.5f - r31 * 2.5f + r51 * 2.f;
                float tmp56a0 = r20 * 4.f - r40 * 5.f + r60;
                float tmp56a1 = r21 * 4.f - r41 * 5.f + r61;
                float tmp56b0 = r10 * 2.f - r30 * 2.5f + r50 * 0.5f;
                float tmp56b1 = r11 * 2.f - r31 * 2.5f + r51 * 0.5f;

                p0[0] = r00 - r60 + (r40 - r20) * 5.25f;
                p0[1] = r01 - r61 + (r41 - r21) * 5.25f;
                p1[0] = tmp12a0 + tmp12b0;
                p1[1] = tmp12a1 + tmp12b1;
                p2[0] = tmp12a0 - tmp12b0;
                p2[1] = tmp12a1 - tmp12b1;
                p3[0] = tmp34a0 + tmp34b0;
                p3[1] = tmp34a1 + tmp34b1;
                p4[0] = tmp34a0 - tmp34b0;
                p4[1] = tmp34a1 - tmp34b1;
                p5[0] = tmp56a0 + tmp56b0;
                p5[1] = tmp56a1 + tmp56b1;
                p6[0] = tmp56a0 - tmp56b0;
                p6[1] = tmp56a1 - tmp56b1;
                p7[0] = r70 - r10 + (r30 - r50) * 5.25f;
                p7[1] = r71 - r11 + (r31 - r51) * 5.25f;
#endif

                p0 += max_jj * 8 * 2;
                p1 += max_jj * 8 * 2;
                p2 += max_jj * 8 * 2;
                p3 += max_jj * 8 * 2;
                p4 += max_jj * 8 * 2;
                p5 += max_jj * 8 * 2;
                p6 += max_jj * 8 * 2;
                p7 += max_jj * 8 * 2;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 2;
    for (int kk = remain_max_kk_start; kk < max_kk; kk++)
    {
        float tmp[8][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0123 = bottom_blob.channel(k + kk).row(ti * 6) + (tj * 6);

            for (int m = 0; m < 8; m++)
            {
                float r0 = 0.f;
                float r1 = 0.f;
                float r2 = 0.f;
                float r3 = 0.f;
                float r4 = 0.f;
                float r5 = 0.f;
                float r6 = 0.f;
                float r7 = 0.f;

                if (ti * 6 + m < h)
                {
                    // if (elempack == 1)
                    {
                        r0 = r0123[0];
                        if (tj * 6 + 1 < w) r1 = r0123[1];
                        if (tj * 6 + 2 < w) r2 = r0123[2];
                        if (tj * 6 + 3 < w) r3 = r0123[3];
                        if (tj * 6 + 4 < w) r4 = r0123[4];
                        if (tj * 6 + 5 < w) r5 = r0123[5];
                        if (tj * 6 + 6 < w) r6 = r0123[6];
                        if (tj * 6 + 7 < w) r7 = r0123[7];
                    }
                }

                float tmp12a = r2 + r6 - r4 * 4.25f;
                float tmp12b = r1 + r5 - r3 * 4.25f;
                float tmp34a = r6 + r2 * 0.25f - r4 * 1.25f;
                float tmp34b = r1 * 0.5f - r3 * 2.5f + r5 * 2.f;
                float tmp56a = r2 * 4.f - r4 * 5.f + r6;
                float tmp56b = r1 * 2.f - r3 * 2.5f + r5 * 0.5f;

                tmp[0][m] = r0 - r6 + (r4 - r2) * 5.25f;
                tmp[1][m] = tmp12a + tmp12b;
                tmp[2][m] = tmp12a - tmp12b;
                tmp[3][m] = tmp34a + tmp34b;
                tmp[4][m] = tmp34a - tmp34b;
                tmp[5][m] = tmp56a + tmp56b;
                tmp[6][m] = tmp56a - tmp56b;
                tmp[7][m] = r7 - r1 + (r3 - r5) * 5.25f;

                r0123 += w;
            }

            float* p0 = (float*)B + kk * max_jj * 64 + jj;
            float* p1 = p0 + max_jj;
            float* p2 = p0 + max_jj * 2;
            float* p3 = p0 + max_jj * 3;
            float* p4 = p0 + max_jj * 4;
            float* p5 = p0 + max_jj * 5;
            float* p6 = p0 + max_jj * 6;
            float* p7 = p0 + max_jj * 7;

            for (int m = 0; m < 8; m++)
            {
                float r0 = tmp[m][0];
                float r1 = tmp[m][1];
                float r2 = tmp[m][2];
                float r3 = tmp[m][3];
                float r4 = tmp[m][4];
                float r5 = tmp[m][5];
                float r6 = tmp[m][6];
                float r7 = tmp[m][7];

                float tmp12a = r2 + r6 - r4 * 4.25f;
                float tmp12b = r1 + r5 - r3 * 4.25f;
                float tmp34a = r6 + r2 * 0.25f - r4 * 1.25f;
                float tmp34b = r1 * 0.5f - r3 * 2.5f + r5 * 2.f;
                float tmp56a = r2 * 4.f - r4 * 5.f + r6;
                float tmp56b = r1 * 2.f - r3 * 2.5f + r5 * 0.5f;

                p0[0] = r0 - r6 + (r4 - r2) * 5.25f;
                p1[0] = tmp12a + tmp12b;
                p2[0] = tmp12a - tmp12b;
                p3[0] = tmp34a + tmp34b;
                p4[0] = tmp34a - tmp34b;
                p5[0] = tmp56a + tmp56b;
                p6[0] = tmp56a - tmp56b;
                p7[0] = r7 - r1 + (r3 - r5) * 5.25f;

                p0 += max_jj * 8;
                p1 += max_jj * 8;
                p2 += max_jj * 8;
                p3 += max_jj * 8;
                p4 += max_jj * 8;
                p5 += max_jj * 8;
                p6 += max_jj * 8;
                p7 += max_jj * 8;
            }
        }
    }
}

static inline void conv3x3s1_winograd63_transform_output_tile(const Mat& top_tile, Mat& top_blob, const Mat& bias, int i, int max_ii, int j, int max_jj)
{
    // const float otm[6][8] = {
    //     {1.0f, 1.0f,  1.0f,  1.0f,  1.0f, 32.0f, 32.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f,  2.0f, -2.0f, 16.0f,-16.0f, 0.0f},
    //     {0.0f, 1.0f,  1.0f,  4.0f,  4.0f,  8.0f,  8.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f,  8.0f, -8.0f,  4.0f, -4.0f, 0.0f},
    //     {0.0f, 1.0f,  1.0f, 16.0f, 16.0f,  2.0f,  2.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f, 32.0f,-32.0f,  1.0f, -1.0f, 1.0f}
    // };

#if __ARM_NEON
    const float coeffs[4] = {32.f, 16.f, 8.f, 4.f};
    float32x4_t _coeffs = vld1q_f32(coeffs);
    float32x2_t _v2 = vdup_n_f32(2.f);
#endif

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int out_elempack = top_blob.elempack;
    const size_t N = top_blob.cstep * out_elempack;

    const int w_tiles = (outw + 5) / 6;

    const float* biasptr = bias;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    for (; ii + 7 < max_ii; ii += 8)
    {
        float32x4_t _bias0 = biasptr ? vld1q_f32(biasptr + i + ii) : vdupq_n_f32(0.f);
        float32x4_t _bias1 = biasptr ? vld1q_f32(biasptr + i + ii + 4) : vdupq_n_f32(0.f);

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[6][8][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 64 + jj * 8;
            const float* r1 = r0 + max_jj * 8;
            const float* r2 = r0 + max_jj * 8 * 2;
            const float* r3 = r0 + max_jj * 8 * 3;
            const float* r4 = r0 + max_jj * 8 * 4;
            const float* r5 = r0 + max_jj * 8 * 5;
            const float* r6 = r0 + max_jj * 8 * 6;
            const float* r7 = r0 + max_jj * 8 * 7;

            for (int m = 0; m < 8; m++)
            {
                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r30 = vld1q_f32(r3);
                float32x4_t _r31 = vld1q_f32(r3 + 4);
                float32x4_t _r40 = vld1q_f32(r4);
                float32x4_t _r41 = vld1q_f32(r4 + 4);
                float32x4_t _r50 = vld1q_f32(r5);
                float32x4_t _r51 = vld1q_f32(r5 + 4);
                float32x4_t _r60 = vld1q_f32(r6);
                float32x4_t _r61 = vld1q_f32(r6 + 4);
                float32x4_t _r70 = vld1q_f32(r7);
                float32x4_t _r71 = vld1q_f32(r7 + 4);

                float32x4_t _tmp024a0 = vaddq_f32(_r10, _r20);
                float32x4_t _tmp024a1 = vaddq_f32(_r11, _r21);
                float32x4_t _tmp135a0 = vsubq_f32(_r10, _r20);
                float32x4_t _tmp135a1 = vsubq_f32(_r11, _r21);
                float32x4_t _tmp024b0 = vaddq_f32(_r30, _r40);
                float32x4_t _tmp024b1 = vaddq_f32(_r31, _r41);
                float32x4_t _tmp135b0 = vsubq_f32(_r30, _r40);
                float32x4_t _tmp135b1 = vsubq_f32(_r31, _r41);
                float32x4_t _tmp024c0 = vaddq_f32(_r50, _r60);
                float32x4_t _tmp024c1 = vaddq_f32(_r51, _r61);
                float32x4_t _tmp135c0 = vsubq_f32(_r50, _r60);
                float32x4_t _tmp135c1 = vsubq_f32(_r51, _r61);

                float32x4_t _tmp00 = vaddq_f32(vaddq_f32(_r00, _tmp024a0), vfmaq_laneq_f32(_tmp024b0, _tmp024c0, _coeffs, 0));
                float32x4_t _tmp01 = vaddq_f32(vaddq_f32(_r01, _tmp024a1), vfmaq_laneq_f32(_tmp024b1, _tmp024c1, _coeffs, 0));
                float32x4_t _tmp10 = vfmaq_laneq_f32(vfmaq_lane_f32(_tmp135a0, _tmp135b0, _v2, 0), _tmp135c0, _coeffs, 1);
                float32x4_t _tmp11 = vfmaq_laneq_f32(vfmaq_lane_f32(_tmp135a1, _tmp135b1, _v2, 0), _tmp135c1, _coeffs, 1);
                float32x4_t _tmp20 = vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp024a0, _tmp024b0, _coeffs, 3), _tmp024c0, _coeffs, 2);
                float32x4_t _tmp21 = vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp024a1, _tmp024b1, _coeffs, 3), _tmp024c1, _coeffs, 2);
                float32x4_t _tmp30 = vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp135a0, _tmp135b0, _coeffs, 2), _tmp135c0, _coeffs, 3);
                float32x4_t _tmp31 = vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp135a1, _tmp135b1, _coeffs, 2), _tmp135c1, _coeffs, 3);
                float32x4_t _tmp40 = vfmaq_lane_f32(vfmaq_laneq_f32(_tmp024a0, _tmp024b0, _coeffs, 1), _tmp024c0, _v2, 0);
                float32x4_t _tmp41 = vfmaq_lane_f32(vfmaq_laneq_f32(_tmp024a1, _tmp024b1, _coeffs, 1), _tmp024c1, _v2, 0);
                float32x4_t _tmp50 = vaddq_f32(vaddq_f32(_r70, _tmp135a0), vfmaq_laneq_f32(_tmp135c0, _tmp135b0, _coeffs, 0));
                float32x4_t _tmp51 = vaddq_f32(vaddq_f32(_r71, _tmp135a1), vfmaq_laneq_f32(_tmp135c1, _tmp135b1, _coeffs, 0));

                vst1q_f32(tmp[0][m], _tmp00);
                vst1q_f32(tmp[0][m] + 4, _tmp01);
                vst1q_f32(tmp[1][m], _tmp10);
                vst1q_f32(tmp[1][m] + 4, _tmp11);
                vst1q_f32(tmp[2][m], _tmp20);
                vst1q_f32(tmp[2][m] + 4, _tmp21);
                vst1q_f32(tmp[3][m], _tmp30);
                vst1q_f32(tmp[3][m] + 4, _tmp31);
                vst1q_f32(tmp[4][m], _tmp40);
                vst1q_f32(tmp[4][m] + 4, _tmp41);
                vst1q_f32(tmp[5][m], _tmp50);
                vst1q_f32(tmp[5][m] + 4, _tmp51);

                r0 += max_jj * 8 * 8;
                r1 += max_jj * 8 * 8;
                r2 += max_jj * 8 * 8;
                r3 += max_jj * 8 * 8;
                r4 += max_jj * 8 * 8;
                r5 += max_jj * 8 * 8;
                r6 += max_jj * 8 * 8;
                r7 += max_jj * 8 * 8;
            }

            float* outptr0 = top_blob.channel((i + ii) / out_elempack).row(ti * 6) + (tj * 6) * out_elempack;

            for (int m = 0; m < 6; m++)
            {
                if (ti * 6 + m >= outh)
                    continue;

                float32x4_t _r00 = vld1q_f32(tmp[m][0]);
                float32x4_t _r01 = vld1q_f32(tmp[m][0] + 4);
                float32x4_t _r10 = vld1q_f32(tmp[m][1]);
                float32x4_t _r11 = vld1q_f32(tmp[m][1] + 4);
                float32x4_t _r20 = vld1q_f32(tmp[m][2]);
                float32x4_t _r21 = vld1q_f32(tmp[m][2] + 4);
                float32x4_t _r30 = vld1q_f32(tmp[m][3]);
                float32x4_t _r31 = vld1q_f32(tmp[m][3] + 4);
                float32x4_t _r40 = vld1q_f32(tmp[m][4]);
                float32x4_t _r41 = vld1q_f32(tmp[m][4] + 4);
                float32x4_t _r50 = vld1q_f32(tmp[m][5]);
                float32x4_t _r51 = vld1q_f32(tmp[m][5] + 4);
                float32x4_t _r60 = vld1q_f32(tmp[m][6]);
                float32x4_t _r61 = vld1q_f32(tmp[m][6] + 4);
                float32x4_t _r70 = vld1q_f32(tmp[m][7]);
                float32x4_t _r71 = vld1q_f32(tmp[m][7] + 4);

                float32x4_t _tmp024a0 = vaddq_f32(_r10, _r20);
                float32x4_t _tmp024a1 = vaddq_f32(_r11, _r21);
                float32x4_t _tmp135a0 = vsubq_f32(_r10, _r20);
                float32x4_t _tmp135a1 = vsubq_f32(_r11, _r21);
                float32x4_t _tmp024b0 = vaddq_f32(_r30, _r40);
                float32x4_t _tmp024b1 = vaddq_f32(_r31, _r41);
                float32x4_t _tmp135b0 = vsubq_f32(_r30, _r40);
                float32x4_t _tmp135b1 = vsubq_f32(_r31, _r41);
                float32x4_t _tmp024c0 = vaddq_f32(_r50, _r60);
                float32x4_t _tmp024c1 = vaddq_f32(_r51, _r61);
                float32x4_t _tmp135c0 = vsubq_f32(_r50, _r60);
                float32x4_t _tmp135c1 = vsubq_f32(_r51, _r61);

                float32x4_t _tmp00 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_r00, _tmp024a0), vfmaq_laneq_f32(_tmp024b0, _tmp024c0, _coeffs, 0)));
                float32x4_t _tmp01 = vaddq_f32(_bias1, vaddq_f32(vaddq_f32(_r01, _tmp024a1), vfmaq_laneq_f32(_tmp024b1, _tmp024c1, _coeffs, 0)));
                float32x4_t _tmp10 = vaddq_f32(_bias0, vfmaq_laneq_f32(vfmaq_lane_f32(_tmp135a0, _tmp135b0, _v2, 0), _tmp135c0, _coeffs, 1));
                float32x4_t _tmp11 = vaddq_f32(_bias1, vfmaq_laneq_f32(vfmaq_lane_f32(_tmp135a1, _tmp135b1, _v2, 0), _tmp135c1, _coeffs, 1));
                float32x4_t _tmp20 = vaddq_f32(_bias0, vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp024a0, _tmp024b0, _coeffs, 3), _tmp024c0, _coeffs, 2));
                float32x4_t _tmp21 = vaddq_f32(_bias1, vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp024a1, _tmp024b1, _coeffs, 3), _tmp024c1, _coeffs, 2));
                float32x4_t _tmp30 = vaddq_f32(_bias0, vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp135a0, _tmp135b0, _coeffs, 2), _tmp135c0, _coeffs, 3));
                float32x4_t _tmp31 = vaddq_f32(_bias1, vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp135a1, _tmp135b1, _coeffs, 2), _tmp135c1, _coeffs, 3));
                float32x4_t _tmp40 = vaddq_f32(_bias0, vfmaq_lane_f32(vfmaq_laneq_f32(_tmp024a0, _tmp024b0, _coeffs, 1), _tmp024c0, _v2, 0));
                float32x4_t _tmp41 = vaddq_f32(_bias1, vfmaq_lane_f32(vfmaq_laneq_f32(_tmp024a1, _tmp024b1, _coeffs, 1), _tmp024c1, _v2, 0));
                float32x4_t _tmp50 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_r70, _tmp135a0), vfmaq_laneq_f32(_tmp135c0, _tmp135b0, _coeffs, 0)));
                float32x4_t _tmp51 = vaddq_f32(_bias1, vaddq_f32(vaddq_f32(_r71, _tmp135a1), vfmaq_laneq_f32(_tmp135c1, _tmp135b1, _coeffs, 0)));

                if (out_elempack == 4)
                {
                    float* outptr1 = outptr0 + N;

                    vst1q_f32(outptr0, _tmp00);
                    vst1q_f32(outptr1, _tmp01);
                    if (tj * 6 + 1 < outw)
                    {
                        vst1q_f32(outptr0 + 4, _tmp10);
                        vst1q_f32(outptr1 + 4, _tmp11);
                    }
                    if (tj * 6 + 2 < outw)
                    {
                        vst1q_f32(outptr0 + 8, _tmp20);
                        vst1q_f32(outptr1 + 8, _tmp21);
                    }
                    if (tj * 6 + 3 < outw)
                    {
                        vst1q_f32(outptr0 + 12, _tmp30);
                        vst1q_f32(outptr1 + 12, _tmp31);
                    }
                    if (tj * 6 + 4 < outw)
                    {
                        vst1q_f32(outptr0 + 16, _tmp40);
                        vst1q_f32(outptr1 + 16, _tmp41);
                    }
                    if (tj * 6 + 5 < outw)
                    {
                        vst1q_f32(outptr0 + 20, _tmp50);
                        vst1q_f32(outptr1 + 20, _tmp51);
                    }
                }
                if (out_elempack == 1)
                {
                    float tmp0[8];
                    float tmp1[8];
                    float tmp2[8];
                    float tmp3[8];
                    float tmp4[8];
                    float tmp5[8];
                    vst1q_f32(tmp0, _tmp00);
                    vst1q_f32(tmp0 + 4, _tmp01);
                    vst1q_f32(tmp1, _tmp10);
                    vst1q_f32(tmp1 + 4, _tmp11);
                    vst1q_f32(tmp2, _tmp20);
                    vst1q_f32(tmp2 + 4, _tmp21);
                    vst1q_f32(tmp3, _tmp30);
                    vst1q_f32(tmp3 + 4, _tmp31);
                    vst1q_f32(tmp4, _tmp40);
                    vst1q_f32(tmp4 + 4, _tmp41);
                    vst1q_f32(tmp5, _tmp50);
                    vst1q_f32(tmp5 + 4, _tmp51);

                    float* outptr1 = outptr0 + N;
                    float* outptr2 = outptr0 + N * 2;
                    float* outptr3 = outptr0 + N * 3;
                    float* outptr4 = outptr0 + N * 4;
                    float* outptr5 = outptr0 + N * 5;
                    float* outptr6 = outptr0 + N * 6;
                    float* outptr7 = outptr0 + N * 7;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];
                    outptr4[0] = tmp0[4];
                    outptr5[0] = tmp0[5];
                    outptr6[0] = tmp0[6];
                    outptr7[0] = tmp0[7];
                    if (tj * 6 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                        outptr4[1] = tmp1[4];
                        outptr5[1] = tmp1[5];
                        outptr6[1] = tmp1[6];
                        outptr7[1] = tmp1[7];
                    }
                    if (tj * 6 + 2 < outw)
                    {
                        outptr0[2] = tmp2[0];
                        outptr1[2] = tmp2[1];
                        outptr2[2] = tmp2[2];
                        outptr3[2] = tmp2[3];
                        outptr4[2] = tmp2[4];
                        outptr5[2] = tmp2[5];
                        outptr6[2] = tmp2[6];
                        outptr7[2] = tmp2[7];
                    }
                    if (tj * 6 + 3 < outw)
                    {
                        outptr0[3] = tmp3[0];
                        outptr1[3] = tmp3[1];
                        outptr2[3] = tmp3[2];
                        outptr3[3] = tmp3[3];
                        outptr4[3] = tmp3[4];
                        outptr5[3] = tmp3[5];
                        outptr6[3] = tmp3[6];
                        outptr7[3] = tmp3[7];
                    }
                    if (tj * 6 + 4 < outw)
                    {
                        outptr0[4] = tmp4[0];
                        outptr1[4] = tmp4[1];
                        outptr2[4] = tmp4[2];
                        outptr3[4] = tmp4[3];
                        outptr4[4] = tmp4[4];
                        outptr5[4] = tmp4[5];
                        outptr6[4] = tmp4[6];
                        outptr7[4] = tmp4[7];
                    }
                    if (tj * 6 + 5 < outw)
                    {
                        outptr0[5] = tmp5[0];
                        outptr1[5] = tmp5[1];
                        outptr2[5] = tmp5[2];
                        outptr3[5] = tmp5[3];
                        outptr4[5] = tmp5[4];
                        outptr5[5] = tmp5[5];
                        outptr6[5] = tmp5[6];
                        outptr7[5] = tmp5[7];
                    }
                }

                outptr0 += outw * out_elempack;
            }
        }
    }
#endif // __aarch64__
    for (; ii + 3 < max_ii; ii += 4)
    {
        float32x4_t _bias0 = biasptr ? vld1q_f32(biasptr + i + ii) : vdupq_n_f32(0.f);

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[6][8][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 64 + jj * 4;
            const float* r1 = r0 + max_jj * 4;
            const float* r2 = r0 + max_jj * 4 * 2;
            const float* r3 = r0 + max_jj * 4 * 3;
            const float* r4 = r0 + max_jj * 4 * 4;
            const float* r5 = r0 + max_jj * 4 * 5;
            const float* r6 = r0 + max_jj * 4 * 6;
            const float* r7 = r0 + max_jj * 4 * 7;

            for (int m = 0; m < 8; m++)
            {
                float32x4_t _r0 = vld1q_f32(r0);
                float32x4_t _r1 = vld1q_f32(r1);
                float32x4_t _r2 = vld1q_f32(r2);
                float32x4_t _r3 = vld1q_f32(r3);
                float32x4_t _r4 = vld1q_f32(r4);
                float32x4_t _r5 = vld1q_f32(r5);
                float32x4_t _r6 = vld1q_f32(r6);
                float32x4_t _r7 = vld1q_f32(r7);

                float32x4_t _tmp024a = vaddq_f32(_r1, _r2);
                float32x4_t _tmp135a = vsubq_f32(_r1, _r2);
                float32x4_t _tmp024b = vaddq_f32(_r3, _r4);
                float32x4_t _tmp135b = vsubq_f32(_r3, _r4);
                float32x4_t _tmp024c = vaddq_f32(_r5, _r6);
                float32x4_t _tmp135c = vsubq_f32(_r5, _r6);

#if __aarch64__
                float32x4_t _tmp0 = vaddq_f32(vaddq_f32(_r0, _tmp024a), vfmaq_laneq_f32(_tmp024b, _tmp024c, _coeffs, 0));
                float32x4_t _tmp1 = vfmaq_laneq_f32(vfmaq_lane_f32(_tmp135a, _tmp135b, _v2, 0), _tmp135c, _coeffs, 1);
                float32x4_t _tmp2 = vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp024a, _tmp024b, _coeffs, 3), _tmp024c, _coeffs, 2);
                float32x4_t _tmp3 = vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp135a, _tmp135b, _coeffs, 2), _tmp135c, _coeffs, 3);
                float32x4_t _tmp4 = vfmaq_lane_f32(vfmaq_laneq_f32(_tmp024a, _tmp024b, _coeffs, 1), _tmp024c, _v2, 0);
                float32x4_t _tmp5 = vaddq_f32(vaddq_f32(_r7, _tmp135a), vfmaq_laneq_f32(_tmp135c, _tmp135b, _coeffs, 0));
#else
                float32x4_t _tmp0 = vaddq_f32(vaddq_f32(_r0, _tmp024a), vmlaq_lane_f32(_tmp024b, _tmp024c, vget_low_f32(_coeffs), 0));
                float32x4_t _tmp1 = vmlaq_lane_f32(vmlaq_lane_f32(_tmp135a, _tmp135b, _v2, 0), _tmp135c, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp2 = vmlaq_lane_f32(vmlaq_lane_f32(_tmp024a, _tmp024b, vget_high_f32(_coeffs), 1), _tmp024c, vget_high_f32(_coeffs), 0);
                float32x4_t _tmp3 = vmlaq_lane_f32(vmlaq_lane_f32(_tmp135a, _tmp135b, vget_high_f32(_coeffs), 0), _tmp135c, vget_high_f32(_coeffs), 1);
                float32x4_t _tmp4 = vmlaq_lane_f32(vmlaq_lane_f32(_tmp024a, _tmp024b, vget_low_f32(_coeffs), 1), _tmp024c, _v2, 0);
                float32x4_t _tmp5 = vaddq_f32(vaddq_f32(_r7, _tmp135a), vmlaq_lane_f32(_tmp135c, _tmp135b, vget_low_f32(_coeffs), 0));
#endif

                vst1q_f32(tmp[0][m], _tmp0);
                vst1q_f32(tmp[1][m], _tmp1);
                vst1q_f32(tmp[2][m], _tmp2);
                vst1q_f32(tmp[3][m], _tmp3);
                vst1q_f32(tmp[4][m], _tmp4);
                vst1q_f32(tmp[5][m], _tmp5);

                r0 += max_jj * 8 * 4;
                r1 += max_jj * 8 * 4;
                r2 += max_jj * 8 * 4;
                r3 += max_jj * 8 * 4;
                r4 += max_jj * 8 * 4;
                r5 += max_jj * 8 * 4;
                r6 += max_jj * 8 * 4;
                r7 += max_jj * 8 * 4;
            }

            float* outptr0 = top_blob.channel((i + ii) / out_elempack).row(ti * 6) + (tj * 6) * out_elempack;

            for (int m = 0; m < 6; m++)
            {
                if (ti * 6 + m >= outh)
                    continue;

                float32x4_t _r0 = vld1q_f32(tmp[m][0]);
                float32x4_t _r1 = vld1q_f32(tmp[m][1]);
                float32x4_t _r2 = vld1q_f32(tmp[m][2]);
                float32x4_t _r3 = vld1q_f32(tmp[m][3]);
                float32x4_t _r4 = vld1q_f32(tmp[m][4]);
                float32x4_t _r5 = vld1q_f32(tmp[m][5]);
                float32x4_t _r6 = vld1q_f32(tmp[m][6]);
                float32x4_t _r7 = vld1q_f32(tmp[m][7]);

                float32x4_t _tmp024a = vaddq_f32(_r1, _r2);
                float32x4_t _tmp135a = vsubq_f32(_r1, _r2);
                float32x4_t _tmp024b = vaddq_f32(_r3, _r4);
                float32x4_t _tmp135b = vsubq_f32(_r3, _r4);
                float32x4_t _tmp024c = vaddq_f32(_r5, _r6);
                float32x4_t _tmp135c = vsubq_f32(_r5, _r6);

#if __aarch64__
                float32x4_t _tmp0 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_r0, _tmp024a), vfmaq_laneq_f32(_tmp024b, _tmp024c, _coeffs, 0)));
                float32x4_t _tmp1 = vaddq_f32(_bias0, vfmaq_laneq_f32(vfmaq_lane_f32(_tmp135a, _tmp135b, _v2, 0), _tmp135c, _coeffs, 1));
                float32x4_t _tmp2 = vaddq_f32(_bias0, vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp024a, _tmp024b, _coeffs, 3), _tmp024c, _coeffs, 2));
                float32x4_t _tmp3 = vaddq_f32(_bias0, vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp135a, _tmp135b, _coeffs, 2), _tmp135c, _coeffs, 3));
                float32x4_t _tmp4 = vaddq_f32(_bias0, vfmaq_lane_f32(vfmaq_laneq_f32(_tmp024a, _tmp024b, _coeffs, 1), _tmp024c, _v2, 0));
                float32x4_t _tmp5 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_r7, _tmp135a), vfmaq_laneq_f32(_tmp135c, _tmp135b, _coeffs, 0)));
#else
                float32x4_t _tmp0 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_r0, _tmp024a), vmlaq_lane_f32(_tmp024b, _tmp024c, vget_low_f32(_coeffs), 0)));
                float32x4_t _tmp1 = vaddq_f32(_bias0, vmlaq_lane_f32(vmlaq_lane_f32(_tmp135a, _tmp135b, _v2, 0), _tmp135c, vget_low_f32(_coeffs), 1));
                float32x4_t _tmp2 = vaddq_f32(_bias0, vmlaq_lane_f32(vmlaq_lane_f32(_tmp024a, _tmp024b, vget_high_f32(_coeffs), 1), _tmp024c, vget_high_f32(_coeffs), 0));
                float32x4_t _tmp3 = vaddq_f32(_bias0, vmlaq_lane_f32(vmlaq_lane_f32(_tmp135a, _tmp135b, vget_high_f32(_coeffs), 0), _tmp135c, vget_high_f32(_coeffs), 1));
                float32x4_t _tmp4 = vaddq_f32(_bias0, vmlaq_lane_f32(vmlaq_lane_f32(_tmp024a, _tmp024b, vget_low_f32(_coeffs), 1), _tmp024c, _v2, 0));
                float32x4_t _tmp5 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_r7, _tmp135a), vmlaq_lane_f32(_tmp135c, _tmp135b, vget_low_f32(_coeffs), 0)));
#endif

                if (out_elempack == 4)
                {
                    vst1q_f32(outptr0, _tmp0);
                    if (tj * 6 + 1 < outw) vst1q_f32(outptr0 + 4, _tmp1);
                    if (tj * 6 + 2 < outw) vst1q_f32(outptr0 + 8, _tmp2);
                    if (tj * 6 + 3 < outw) vst1q_f32(outptr0 + 12, _tmp3);
                    if (tj * 6 + 4 < outw) vst1q_f32(outptr0 + 16, _tmp4);
                    if (tj * 6 + 5 < outw) vst1q_f32(outptr0 + 20, _tmp5);
                }
                if (out_elempack == 1)
                {
                    float tmp0[4];
                    float tmp1[4];
                    float tmp2[4];
                    float tmp3[4];
                    float tmp4[4];
                    float tmp5[4];
                    vst1q_f32(tmp0, _tmp0);
                    vst1q_f32(tmp1, _tmp1);
                    vst1q_f32(tmp2, _tmp2);
                    vst1q_f32(tmp3, _tmp3);
                    vst1q_f32(tmp4, _tmp4);
                    vst1q_f32(tmp5, _tmp5);

                    float* outptr1 = outptr0 + N;
                    float* outptr2 = outptr0 + N * 2;
                    float* outptr3 = outptr0 + N * 3;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];
                    if (tj * 6 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                    }
                    if (tj * 6 + 2 < outw)
                    {
                        outptr0[2] = tmp2[0];
                        outptr1[2] = tmp2[1];
                        outptr2[2] = tmp2[2];
                        outptr3[2] = tmp2[3];
                    }
                    if (tj * 6 + 3 < outw)
                    {
                        outptr0[3] = tmp3[0];
                        outptr1[3] = tmp3[1];
                        outptr2[3] = tmp3[2];
                        outptr3[3] = tmp3[3];
                    }
                    if (tj * 6 + 4 < outw)
                    {
                        outptr0[4] = tmp4[0];
                        outptr1[4] = tmp4[1];
                        outptr2[4] = tmp4[2];
                        outptr3[4] = tmp4[3];
                    }
                    if (tj * 6 + 5 < outw)
                    {
                        outptr0[5] = tmp5[0];
                        outptr1[5] = tmp5[1];
                        outptr2[5] = tmp5[2];
                        outptr3[5] = tmp5[3];
                    }
                }

                outptr0 += outw * out_elempack;
            }
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
#if __ARM_NEON
        float32x2_t _bias0 = biasptr ? vld1_f32(biasptr + i + ii) : vdup_n_f32(0.f);
#else
        float bias0 = biasptr ? biasptr[i + ii] : 0.f;
        float bias1 = biasptr ? biasptr[i + ii + 1] : 0.f;
#endif

#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        float tmp[6][8][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 64 + jj * 2;
            const float* r1 = r0 + max_jj * 2;
            const float* r2 = r0 + max_jj * 2 * 2;
            const float* r3 = r0 + max_jj * 2 * 3;
            const float* r4 = r0 + max_jj * 2 * 4;
            const float* r5 = r0 + max_jj * 2 * 5;
            const float* r6 = r0 + max_jj * 2 * 6;
            const float* r7 = r0 + max_jj * 2 * 7;

            for (int m = 0; m < 8; m++)
            {
#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(r0);
                float32x2_t _r1 = vld1_f32(r1);
                float32x2_t _r2 = vld1_f32(r2);
                float32x2_t _r3 = vld1_f32(r3);
                float32x2_t _r4 = vld1_f32(r4);
                float32x2_t _r5 = vld1_f32(r5);
                float32x2_t _r6 = vld1_f32(r6);
                float32x2_t _r7 = vld1_f32(r7);

                float32x2_t _tmp024a = vadd_f32(_r1, _r2);
                float32x2_t _tmp135a = vsub_f32(_r1, _r2);
                float32x2_t _tmp024b = vadd_f32(_r3, _r4);
                float32x2_t _tmp135b = vsub_f32(_r3, _r4);
                float32x2_t _tmp024c = vadd_f32(_r5, _r6);
                float32x2_t _tmp135c = vsub_f32(_r5, _r6);

#if __aarch64__
                float32x2_t _tmp0 = vadd_f32(vadd_f32(_r0, _tmp024a), vfma_laneq_f32(_tmp024b, _tmp024c, _coeffs, 0));
                float32x2_t _tmp1 = vfma_laneq_f32(vfma_f32(_tmp135a, _tmp135b, _v2), _tmp135c, _coeffs, 1);
                float32x2_t _tmp2 = vfma_laneq_f32(vfma_laneq_f32(_tmp024a, _tmp024b, _coeffs, 3), _tmp024c, _coeffs, 2);
                float32x2_t _tmp3 = vfma_laneq_f32(vfma_laneq_f32(_tmp135a, _tmp135b, _coeffs, 2), _tmp135c, _coeffs, 3);
                float32x2_t _tmp4 = vfma_f32(vfma_laneq_f32(_tmp024a, _tmp024b, _coeffs, 1), _tmp024c, _v2);
                float32x2_t _tmp5 = vadd_f32(vadd_f32(_r7, _tmp135a), vfma_laneq_f32(_tmp135c, _tmp135b, _coeffs, 0));
#else
                float32x2_t _tmp0 = vadd_f32(vadd_f32(_r0, _tmp024a), vmla_lane_f32(_tmp024b, _tmp024c, vget_low_f32(_coeffs), 0));
                float32x2_t _tmp1 = vmla_lane_f32(vmla_f32(_tmp135a, _tmp135b, _v2), _tmp135c, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp2 = vmla_lane_f32(vmla_lane_f32(_tmp024a, _tmp024b, vget_high_f32(_coeffs), 1), _tmp024c, vget_high_f32(_coeffs), 0);
                float32x2_t _tmp3 = vmla_lane_f32(vmla_lane_f32(_tmp135a, _tmp135b, vget_high_f32(_coeffs), 0), _tmp135c, vget_high_f32(_coeffs), 1);
                float32x2_t _tmp4 = vmla_f32(vmla_lane_f32(_tmp024a, _tmp024b, vget_low_f32(_coeffs), 1), _tmp024c, _v2);
                float32x2_t _tmp5 = vadd_f32(vadd_f32(_r7, _tmp135a), vmla_lane_f32(_tmp135c, _tmp135b, vget_low_f32(_coeffs), 0));
#endif

                vst1_f32(tmp[0][m], _tmp0);
                vst1_f32(tmp[1][m], _tmp1);
                vst1_f32(tmp[2][m], _tmp2);
                vst1_f32(tmp[3][m], _tmp3);
                vst1_f32(tmp[4][m], _tmp4);
                vst1_f32(tmp[5][m], _tmp5);
#else
                float tmp024a0 = r1[0] + r2[0];
                float tmp024a1 = r1[1] + r2[1];
                float tmp135a0 = r1[0] - r2[0];
                float tmp135a1 = r1[1] - r2[1];
                float tmp024b0 = r3[0] + r4[0];
                float tmp024b1 = r3[1] + r4[1];
                float tmp135b0 = r3[0] - r4[0];
                float tmp135b1 = r3[1] - r4[1];
                float tmp024c0 = r5[0] + r6[0];
                float tmp024c1 = r5[1] + r6[1];
                float tmp135c0 = r5[0] - r6[0];
                float tmp135c1 = r5[1] - r6[1];

                tmp[0][m][0] = r0[0] + tmp024a0 + tmp024b0 + tmp024c0 * 32;
                tmp[0][m][1] = r0[1] + tmp024a1 + tmp024b1 + tmp024c1 * 32;
                tmp[1][m][0] = tmp135a0 + tmp135b0 + tmp135b0 + tmp135c0 * 16;
                tmp[1][m][1] = tmp135a1 + tmp135b1 + tmp135b1 + tmp135c1 * 16;
                tmp[2][m][0] = tmp024a0 + tmp024b0 * 4 + tmp024c0 * 8;
                tmp[2][m][1] = tmp024a1 + tmp024b1 * 4 + tmp024c1 * 8;
                tmp[3][m][0] = tmp135a0 + tmp135b0 * 8 + tmp135c0 * 4;
                tmp[3][m][1] = tmp135a1 + tmp135b1 * 8 + tmp135c1 * 4;
                tmp[4][m][0] = tmp024a0 + tmp024b0 * 16 + tmp024c0 + tmp024c0;
                tmp[4][m][1] = tmp024a1 + tmp024b1 * 16 + tmp024c1 + tmp024c1;
                tmp[5][m][0] = r7[0] + tmp135a0 + tmp135b0 * 32 + tmp135c0;
                tmp[5][m][1] = r7[1] + tmp135a1 + tmp135b1 * 32 + tmp135c1;
#endif

                r0 += max_jj * 8 * 2;
                r1 += max_jj * 8 * 2;
                r2 += max_jj * 8 * 2;
                r3 += max_jj * 8 * 2;
                r4 += max_jj * 8 * 2;
                r5 += max_jj * 8 * 2;
                r6 += max_jj * 8 * 2;
                r7 += max_jj * 8 * 2;
            }

            float* outptr0 = top_blob.channel(i + ii).row(ti * 6) + (tj * 6);

            for (int m = 0; m < 6; m++)
            {
                if (ti * 6 + m >= outh)
                    continue;

#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(tmp[m][0]);
                float32x2_t _r1 = vld1_f32(tmp[m][1]);
                float32x2_t _r2 = vld1_f32(tmp[m][2]);
                float32x2_t _r3 = vld1_f32(tmp[m][3]);
                float32x2_t _r4 = vld1_f32(tmp[m][4]);
                float32x2_t _r5 = vld1_f32(tmp[m][5]);
                float32x2_t _r6 = vld1_f32(tmp[m][6]);
                float32x2_t _r7 = vld1_f32(tmp[m][7]);

                float32x2_t _tmp024a = vadd_f32(_r1, _r2);
                float32x2_t _tmp135a = vsub_f32(_r1, _r2);
                float32x2_t _tmp024b = vadd_f32(_r3, _r4);
                float32x2_t _tmp135b = vsub_f32(_r3, _r4);
                float32x2_t _tmp024c = vadd_f32(_r5, _r6);
                float32x2_t _tmp135c = vsub_f32(_r5, _r6);

#if __aarch64__
                float32x2_t _tmp0 = vadd_f32(_bias0, vadd_f32(vadd_f32(_r0, _tmp024a), vfma_laneq_f32(_tmp024b, _tmp024c, _coeffs, 0)));
                float32x2_t _tmp1 = vadd_f32(_bias0, vfma_laneq_f32(vfma_f32(_tmp135a, _tmp135b, _v2), _tmp135c, _coeffs, 1));
                float32x2_t _tmp2 = vadd_f32(_bias0, vfma_laneq_f32(vfma_laneq_f32(_tmp024a, _tmp024b, _coeffs, 3), _tmp024c, _coeffs, 2));
                float32x2_t _tmp3 = vadd_f32(_bias0, vfma_laneq_f32(vfma_laneq_f32(_tmp135a, _tmp135b, _coeffs, 2), _tmp135c, _coeffs, 3));
                float32x2_t _tmp4 = vadd_f32(_bias0, vfma_f32(vfma_laneq_f32(_tmp024a, _tmp024b, _coeffs, 1), _tmp024c, _v2));
                float32x2_t _tmp5 = vadd_f32(_bias0, vadd_f32(vadd_f32(_r7, _tmp135a), vfma_laneq_f32(_tmp135c, _tmp135b, _coeffs, 0)));
#else
                float32x2_t _tmp0 = vadd_f32(_bias0, vadd_f32(vadd_f32(_r0, _tmp024a), vmla_lane_f32(_tmp024b, _tmp024c, vget_low_f32(_coeffs), 0)));
                float32x2_t _tmp1 = vadd_f32(_bias0, vmla_lane_f32(vmla_f32(_tmp135a, _tmp135b, _v2), _tmp135c, vget_low_f32(_coeffs), 1));
                float32x2_t _tmp2 = vadd_f32(_bias0, vmla_lane_f32(vmla_lane_f32(_tmp024a, _tmp024b, vget_high_f32(_coeffs), 1), _tmp024c, vget_high_f32(_coeffs), 0));
                float32x2_t _tmp3 = vadd_f32(_bias0, vmla_lane_f32(vmla_lane_f32(_tmp135a, _tmp135b, vget_high_f32(_coeffs), 0), _tmp135c, vget_high_f32(_coeffs), 1));
                float32x2_t _tmp4 = vadd_f32(_bias0, vmla_f32(vmla_lane_f32(_tmp024a, _tmp024b, vget_low_f32(_coeffs), 1), _tmp024c, _v2));
                float32x2_t _tmp5 = vadd_f32(_bias0, vadd_f32(vadd_f32(_r7, _tmp135a), vmla_lane_f32(_tmp135c, _tmp135b, vget_low_f32(_coeffs), 0)));
#endif
#else
                float r00 = tmp[m][0][0];
                float r01 = tmp[m][0][1];
                float r10 = tmp[m][1][0];
                float r11 = tmp[m][1][1];
                float r20 = tmp[m][2][0];
                float r21 = tmp[m][2][1];
                float r30 = tmp[m][3][0];
                float r31 = tmp[m][3][1];
                float r40 = tmp[m][4][0];
                float r41 = tmp[m][4][1];
                float r50 = tmp[m][5][0];
                float r51 = tmp[m][5][1];
                float r60 = tmp[m][6][0];
                float r61 = tmp[m][6][1];
                float r70 = tmp[m][7][0];
                float r71 = tmp[m][7][1];

                float tmp024a0 = r10 + r20;
                float tmp024a1 = r11 + r21;
                float tmp135a0 = r10 - r20;
                float tmp135a1 = r11 - r21;
                float tmp024b0 = r30 + r40;
                float tmp024b1 = r31 + r41;
                float tmp135b0 = r30 - r40;
                float tmp135b1 = r31 - r41;
                float tmp024c0 = r50 + r60;
                float tmp024c1 = r51 + r61;
                float tmp135c0 = r50 - r60;
                float tmp135c1 = r51 - r61;

                float tmp00 = bias0 + r00 + tmp024a0 + tmp024b0 + tmp024c0 * 32;
                float tmp01 = bias1 + r01 + tmp024a1 + tmp024b1 + tmp024c1 * 32;
                float tmp10 = bias0 + tmp135a0 + tmp135b0 + tmp135b0 + tmp135c0 * 16;
                float tmp11 = bias1 + tmp135a1 + tmp135b1 + tmp135b1 + tmp135c1 * 16;
                float tmp20 = bias0 + tmp024a0 + tmp024b0 * 4 + tmp024c0 * 8;
                float tmp21 = bias1 + tmp024a1 + tmp024b1 * 4 + tmp024c1 * 8;
                float tmp30 = bias0 + tmp135a0 + tmp135b0 * 8 + tmp135c0 * 4;
                float tmp31 = bias1 + tmp135a1 + tmp135b1 * 8 + tmp135c1 * 4;
                float tmp40 = bias0 + tmp024a0 + tmp024b0 * 16 + tmp024c0 + tmp024c0;
                float tmp41 = bias1 + tmp024a1 + tmp024b1 * 16 + tmp024c1 + tmp024c1;
                float tmp50 = bias0 + r70 + tmp135a0 + tmp135b0 * 32 + tmp135c0;
                float tmp51 = bias1 + r71 + tmp135a1 + tmp135b1 * 32 + tmp135c1;
#endif

                // if (out_elempack == 1)
                {
                    float* outptr1 = outptr0 + N;

#if __ARM_NEON
                    outptr0[0] = vget_lane_f32(_tmp0, 0);
                    outptr1[0] = vget_lane_f32(_tmp0, 1);
                    if (tj * 6 + 1 < outw)
                    {
                        outptr0[1] = vget_lane_f32(_tmp1, 0);
                        outptr1[1] = vget_lane_f32(_tmp1, 1);
                    }
                    if (tj * 6 + 2 < outw)
                    {
                        outptr0[2] = vget_lane_f32(_tmp2, 0);
                        outptr1[2] = vget_lane_f32(_tmp2, 1);
                    }
                    if (tj * 6 + 3 < outw)
                    {
                        outptr0[3] = vget_lane_f32(_tmp3, 0);
                        outptr1[3] = vget_lane_f32(_tmp3, 1);
                    }
                    if (tj * 6 + 4 < outw)
                    {
                        outptr0[4] = vget_lane_f32(_tmp4, 0);
                        outptr1[4] = vget_lane_f32(_tmp4, 1);
                    }
                    if (tj * 6 + 5 < outw)
                    {
                        outptr0[5] = vget_lane_f32(_tmp5, 0);
                        outptr1[5] = vget_lane_f32(_tmp5, 1);
                    }
#else
                    outptr0[0] = tmp00;
                    outptr1[0] = tmp01;
                    if (tj * 6 + 1 < outw)
                    {
                        outptr0[1] = tmp10;
                        outptr1[1] = tmp11;
                    }
                    if (tj * 6 + 2 < outw)
                    {
                        outptr0[2] = tmp20;
                        outptr1[2] = tmp21;
                    }
                    if (tj * 6 + 3 < outw)
                    {
                        outptr0[3] = tmp30;
                        outptr1[3] = tmp31;
                    }
                    if (tj * 6 + 4 < outw)
                    {
                        outptr0[4] = tmp40;
                        outptr1[4] = tmp41;
                    }
                    if (tj * 6 + 5 < outw)
                    {
                        outptr0[5] = tmp50;
                        outptr1[5] = tmp51;
                    }
#endif
                }

                outptr0 += outw;
            }
        }
    }
    for (; ii < max_ii; ii++)
    {
        float bias0 = biasptr ? biasptr[i + ii] : 0.f;

        float tmp[6][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 64 + jj;
            const float* r1 = r0 + max_jj;
            const float* r2 = r0 + max_jj * 2;
            const float* r3 = r0 + max_jj * 3;
            const float* r4 = r0 + max_jj * 4;
            const float* r5 = r0 + max_jj * 5;
            const float* r6 = r0 + max_jj * 6;
            const float* r7 = r0 + max_jj * 7;

            for (int m = 0; m < 8; m++)
            {
                float tmp024a = r1[0] + r2[0];
                float tmp135a = r1[0] - r2[0];
                float tmp024b = r3[0] + r4[0];
                float tmp135b = r3[0] - r4[0];
                float tmp024c = r5[0] + r6[0];
                float tmp135c = r5[0] - r6[0];

                tmp[0][m] = r0[0] + tmp024a + tmp024b + tmp024c * 32;
                tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * 16;
                tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 8;
                tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4;
                tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c + tmp024c;
                tmp[5][m] = r7[0] + tmp135a + tmp135b * 32 + tmp135c;

                r0 += max_jj * 8;
                r1 += max_jj * 8;
                r2 += max_jj * 8;
                r3 += max_jj * 8;
                r4 += max_jj * 8;
                r5 += max_jj * 8;
                r6 += max_jj * 8;
                r7 += max_jj * 8;
            }

            float* outptr0 = top_blob.channel(i + ii).row(ti * 6) + (tj * 6);

            for (int m = 0; m < 6; m++)
            {
                if (ti * 6 + m >= outh)
                    continue;

                float r0 = tmp[m][0];
                float r1 = tmp[m][1];
                float r2 = tmp[m][2];
                float r3 = tmp[m][3];
                float r4 = tmp[m][4];
                float r5 = tmp[m][5];
                float r6 = tmp[m][6];
                float r7 = tmp[m][7];

                float tmp024a = r1 + r2;
                float tmp135a = r1 - r2;
                float tmp024b = r3 + r4;
                float tmp135b = r3 - r4;
                float tmp024c = r5 + r6;
                float tmp135c = r5 - r6;

                float tmp0 = bias0 + r0 + tmp024a + tmp024b + tmp024c * 32;
                float tmp1 = bias0 + tmp135a + tmp135b + tmp135b + tmp135c * 16;
                float tmp2 = bias0 + tmp024a + tmp024b * 4 + tmp024c * 8;
                float tmp3 = bias0 + tmp135a + tmp135b * 8 + tmp135c * 4;
                float tmp4 = bias0 + tmp024a + tmp024b * 16 + tmp024c + tmp024c;
                float tmp5 = bias0 + r7 + tmp135a + tmp135b * 32 + tmp135c;

                // if (out_elempack == 1)
                {
                    outptr0[0] = tmp0;
                    if (tj * 6 + 1 < outw) outptr0[1] = tmp1;
                    if (tj * 6 + 2 < outw) outptr0[2] = tmp2;
                    if (tj * 6 + 3 < outw) outptr0[3] = tmp3;
                    if (tj * 6 + 4 < outw) outptr0[4] = tmp4;
                    if (tj * 6 + 5 < outw) outptr0[5] = tmp5;
                }

                outptr0 += outw;
            }
        }
    }
}

static int conv3x3s1_winograd63(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
    int outw = top_blob.w;
    int outh = top_blob.h;

    // pad to 6n+2, winograd F(6,3)
    int w_tiles = (outw + 5) / 6;
    int h_tiles = (outh + 5) / 6;
    int tiles = w_tiles * h_tiles;

    const int M = top_blob.c * top_blob.elempack;
    const int N = tiles;
    const int K = bottom_blob.c * bottom_blob.elempack;
    const int B = 64;

    // NCNN_LOGE("conv3x3s1_winograd63 %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    conv3x3s1_winograd_get_optimal_tile_mnk(M, N, K, B, TILE_M, TILE_N, TILE_K, nT);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

    Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    if (nT > 1 && nn_NK < nT)
    {
        Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
        if (B_tile.empty())
            return -100;

        for (int ppjk = 0; ppjk < nn_NK; ppjk++)
        {
            const int ppj = ppjk / nn_K;
            const int ppk = ppjk % nn_K;

            const int j = ppj * TILE_N;
            const int k = ppk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            // transform input
            conv3x3s1_winograd63_transform_input_tile(bottom_blob, B_tile, j, max_jj, k, max_kk, nT);

            Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

            conv3x3s1_winograd_transpose_pack_B_tile(B_tile, BT_tile, B, max_jj, max_kk, nT);
        }
    }
    else
    {
        Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
        if (B_tileX.empty())
            return -100;

        #pragma omp parallel for num_threads(nT)
        for (int ppjk = 0; ppjk < nn_NK; ppjk++)
        {
            const int ppj = ppjk / nn_K;
            const int ppk = ppjk % nn_K;

            const int j = ppj * TILE_N;
            const int k = ppk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            Mat B_tile = B_tileX.channel(get_omp_thread_num());

            // transform input
            conv3x3s1_winograd63_transform_input_tile(bottom_blob, B_tile, j, max_jj, k, max_kk, 1);

            Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

            conv3x3s1_winograd_transpose_pack_B_tile(B_tile, BT_tile, B, max_jj, max_kk, 1);
        }
    }

    Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
    if (top_tileX.empty())
        return -100;

    #pragma omp parallel for num_threads(nT)
    for (int ppj = 0; ppj < nn_M; ppj++)
    {
        const int i = ppj * TILE_M;

        Mat top_tile = top_tileX.channel(get_omp_thread_num());

        const int max_ii = std::min((M - i), TILE_M);

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                const Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K);

                const Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

                conv3x3s1_winograd_gemm_transB_packed_tile(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk, opt.use_a53_a55_optimized_kernel);
            }

            // transform output
            conv3x3s1_winograd63_transform_output_tile(top_tile, top_blob, bias, i, max_ii, j, max_jj);
        }
    }

    return 0;
}


================================================
FILE: src/layer/arm/convolution_3x3_winograd_bf16s.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Input transform for winograd F(2,3) with bf16 storage.
//
// For every tile index in [j, j + max_jj) and every input channel in
// [k, k + max_kk) a 4x4 window is read from bottom_blob as unsigned short,
// converted to float (bfloat2float / bfloat16_to_float32), transformed with
// the 4x4 matrix shown below (applied once along each window axis) and the
// 16 resulting floats are stored into B.
//
//   bottom_blob  input, accessed through row<const unsigned short>()
//   B            destination, written through a float pointer
//   j, max_jj    first tile index and number of tiles to process
//   k, max_kk    first channel and number of channels to process
//   nT           num_threads for the OpenMP loops over channel groups
//
// Channels are consumed in groups of 8 (aarch64 NEON only), 4 (NEON only),
// 2 and finally 1. For a group of P channels starting at kk the output for
// tile jj begins at B + kk * max_jj * 16 + jj * P, and consecutive
// coefficients are max_jj * P floats apart.
//
// Window positions with row >= h or column >= w contribute zero.
static inline void conv3x3s1_winograd23_transform_input_tile_bf16s(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
{
    // const float itm[4][4] = {
    //     {1.0f,  0.0f, -1.0f,  0.0f},
    //     {0.0f,  1.0f,  1.00f, 0.0f},
    //     {0.0f, -1.0f,  1.00f, 0.0f},
    //     {0.0f, -1.0f,  0.00f, 1.0f}
    // };

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int elempack = bottom_blob.elempack;
    // distance, in unsigned short elements, between consecutive channels of bottom_blob
    const size_t N = bottom_blob.cstep * elempack;

    // tiles per row: windows are 4 wide and advance by 2 columns
    const int w_tiles = (w - 1) / 2;

    // nn_max_kk: number of channel groups handled by the current loop
    // remain_max_kk_start: first channel not yet covered by a wider loop
    int nn_max_kk = 0;
    int remain_max_kk_start = 0;
#if __ARM_NEON
#if __aarch64__
    // ---- groups of 8 channels ----
    nn_max_kk = (max_kk - remain_max_kk_start) / 8;
    #pragma omp parallel for num_threads(nT)
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 8;

        // tmp[c][m][8]: result of the first pass, transformed column c of window row m
#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[4][4][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            // tile row / tile column of this tile
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const unsigned short* r0 = bottom_blob.channel((k + kk) / elempack).row<const unsigned short>(ti * 2) + (tj * 2) * elempack;

            // first pass: one window row per iteration, combine its 4 columns
            for (int m = 0; m < 4; m++)
            {
                // _rc0 / _rc1: column c for channels 0-3 / 4-7, zero unless loaded below
                float32x4_t _r00 = vdupq_n_f32(0.f);
                float32x4_t _r01 = vdupq_n_f32(0.f);
                float32x4_t _r10 = vdupq_n_f32(0.f);
                float32x4_t _r11 = vdupq_n_f32(0.f);
                float32x4_t _r20 = vdupq_n_f32(0.f);
                float32x4_t _r21 = vdupq_n_f32(0.f);
                float32x4_t _r30 = vdupq_n_f32(0.f);
                float32x4_t _r31 = vdupq_n_f32(0.f);

                if (ti * 2 + m < h)
                {
                    if (elempack == 4)
                    {
                        // second pack-4 channel group sits N elements after the first
                        const unsigned short* r1 = r0 + N;

                        _r00 = bfloat2float(vld1_u16(r0));
                        _r01 = bfloat2float(vld1_u16(r1));
                        if (tj * 2 + 1 < w)
                        {
                            _r10 = bfloat2float(vld1_u16(r0 + 4));
                            _r11 = bfloat2float(vld1_u16(r1 + 4));
                        }
                        if (tj * 2 + 2 < w)
                        {
                            _r20 = bfloat2float(vld1_u16(r0 + 8));
                            _r21 = bfloat2float(vld1_u16(r1 + 8));
                        }
                        if (tj * 2 + 3 < w)
                        {
                            _r30 = bfloat2float(vld1_u16(r0 + 12));
                            _r31 = bfloat2float(vld1_u16(r1 + 12));
                        }
                    }
                    if (elempack == 1)
                    {
                        // eight channel planes, N elements apart
                        const unsigned short* r1 = r0 + N;
                        const unsigned short* r2 = r0 + N * 2;
                        const unsigned short* r3 = r0 + N * 3;
                        const unsigned short* r4 = r0 + N * 4;
                        const unsigned short* r5 = r0 + N * 5;
                        const unsigned short* r6 = r0 + N * 6;
                        const unsigned short* r7 = r0 + N * 7;

                        // NOTE(review): four values are loaded per channel regardless of the
                        // column bounds; the checks below only gate which results are used.
                        // Presumably this relies on readable padding past the row end - confirm.
                        uint16x4_t _t0 = vld1_u16(r0);
                        uint16x4_t _t1 = vld1_u16(r1);
                        uint16x4_t _t2 = vld1_u16(r2);
                        uint16x4_t _t3 = vld1_u16(r3);
                        uint16x4_t _t4 = vld1_u16(r4);
                        uint16x4_t _t5 = vld1_u16(r5);
                        uint16x4_t _t6 = vld1_u16(r6);
                        uint16x4_t _t7 = vld1_u16(r7);

                        // after the transposes _t0.._t3 / _t4.._t7 are used as columns 0..3
                        // of channels 0-3 / 4-7
                        transpose4x4_u16(_t0, _t1, _t2, _t3);
                        transpose4x4_u16(_t4, _t5, _t6, _t7);

                        _r00 = bfloat2float(_t0);
                        _r01 = bfloat2float(_t4);
                        if (tj * 2 + 1 < w)
                        {
                            _r10 = bfloat2float(_t1);
                            _r11 = bfloat2float(_t5);
                        }
                        if (tj * 2 + 2 < w)
                        {
                            _r20 = bfloat2float(_t2);
                            _r21 = bfloat2float(_t6);
                        }
                        if (tj * 2 + 3 < w)
                        {
                            _r30 = bfloat2float(_t3);
                            _r31 = bfloat2float(_t7);
                        }
                    }
                }

                // rows of itm: r0 - r2, r1 + r2, r2 - r1, r3 - r1
                float32x4_t _tmp00 = vsubq_f32(_r00, _r20);
                float32x4_t _tmp01 = vsubq_f32(_r01, _r21);
                float32x4_t _tmp10 = vaddq_f32(_r10, _r20);
                float32x4_t _tmp11 = vaddq_f32(_r11, _r21);
                float32x4_t _tmp20 = vsubq_f32(_r20, _r10);
                float32x4_t _tmp21 = vsubq_f32(_r21, _r11);
                float32x4_t _tmp30 = vsubq_f32(_r30, _r10);
                float32x4_t _tmp31 = vsubq_f32(_r31, _r11);

                // stored transposed (tmp[c][m]) so the second pass can read tmp[m][*]
                vst1q_f32(tmp[0][m], _tmp00);
                vst1q_f32(tmp[0][m] + 4, _tmp01);
                vst1q_f32(tmp[1][m], _tmp10);
                vst1q_f32(tmp[1][m] + 4, _tmp11);
                vst1q_f32(tmp[2][m], _tmp20);
                vst1q_f32(tmp[2][m] + 4, _tmp21);
                vst1q_f32(tmp[3][m], _tmp30);
                vst1q_f32(tmp[3][m] + 4, _tmp31);

                // advance to the next input row
                r0 += w * elempack;
            }

            // destination of the 16 coefficients of this tile, 8 floats each
            float* p0 = (float*)B + kk * max_jj * 16 + jj * 8;
            float* p1 = p0 + max_jj * 8;
            float* p2 = p0 + max_jj * 8 * 2;
            float* p3 = p0 + max_jj * 8 * 3;

            // second pass: same combination along the other window axis
            for (int m = 0; m < 4; m++)
            {
                float32x4_t _r00 = vld1q_f32(tmp[m][0]);
                float32x4_t _r01 = vld1q_f32(tmp[m][0] + 4);
                float32x4_t _r10 = vld1q_f32(tmp[m][1]);
                float32x4_t _r11 = vld1q_f32(tmp[m][1] + 4);
                float32x4_t _r20 = vld1q_f32(tmp[m][2]);
                float32x4_t _r21 = vld1q_f32(tmp[m][2] + 4);
                float32x4_t _r30 = vld1q_f32(tmp[m][3]);
                float32x4_t _r31 = vld1q_f32(tmp[m][3] + 4);

                float32x4_t _tmp00 = vsubq_f32(_r00, _r20);
                float32x4_t _tmp01 = vsubq_f32(_r01, _r21);
                float32x4_t _tmp10 = vaddq_f32(_r10, _r20);
                float32x4_t _tmp11 = vaddq_f32(_r11, _r21);
                float32x4_t _tmp20 = vsubq_f32(_r20, _r10);
                float32x4_t _tmp21 = vsubq_f32(_r21, _r11);
                float32x4_t _tmp30 = vsubq_f32(_r30, _r10);
                float32x4_t _tmp31 = vsubq_f32(_r31, _r11);

                vst1q_f32(p0, _tmp00);
                vst1q_f32(p0 + 4, _tmp01);
                vst1q_f32(p1, _tmp10);
                vst1q_f32(p1 + 4, _tmp11);
                vst1q_f32(p2, _tmp20);
                vst1q_f32(p2 + 4, _tmp21);
                vst1q_f32(p3, _tmp30);
                vst1q_f32(p3 + 4, _tmp31);

                // skip the 4 coefficients just written
                p0 += max_jj * 4 * 8;
                p1 += max_jj * 4 * 8;
                p2 += max_jj * 4 * 8;
                p3 += max_jj * 4 * 8;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 8;
    nn_max_kk = (max_kk - remain_max_kk_start) / 4;
#else // __aarch64__
    nn_max_kk = (max_kk - remain_max_kk_start) / 4;
    #pragma omp parallel for num_threads(nT)
#endif // __aarch64__
    // ---- groups of 4 channels ----
    // the omp pragma above belongs to the non-aarch64 branch only; on aarch64
    // this loop has no pragma attached
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 4;

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[4][4][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const unsigned short* r0 = bottom_blob.channel((k + kk) / elempack).row<const unsigned short>(ti * 2) + (tj * 2) * elempack;

            // first pass over the 4 window rows
            for (int m = 0; m < 4; m++)
            {
                // _rc: column c for the 4 channels, zero unless loaded below
                float32x4_t _r0 = vdupq_n_f32(0.f);
                float32x4_t _r1 = vdupq_n_f32(0.f);
                float32x4_t _r2 = vdupq_n_f32(0.f);
                float32x4_t _r3 = vdupq_n_f32(0.f);

                if (ti * 2 + m < h)
                {
                    if (elempack == 4)
                    {
                        _r0 = bfloat2float(vld1_u16(r0));
                        if (tj * 2 + 1 < w) _r1 = bfloat2float(vld1_u16(r0 + 4));
                        if (tj * 2 + 2 < w) _r2 = bfloat2float(vld1_u16(r0 + 8));
                        if (tj * 2 + 3 < w) _r3 = bfloat2float(vld1_u16(r0 + 12));
                    }
                    if (elempack == 1)
                    {
                        const unsigned short* r1 = r0 + N;
                        const unsigned short* r2 = r0 + N * 2;
                        const unsigned short* r3 = r0 + N * 3;

                        // NOTE(review): unconditional 4-wide loads, see column checks below;
                        // presumably relies on readable padding past the row end - confirm.
                        uint16x4_t _t0 = vld1_u16(r0);
                        uint16x4_t _t1 = vld1_u16(r1);
                        uint16x4_t _t2 = vld1_u16(r2);
                        uint16x4_t _t3 = vld1_u16(r3);

                        // after the transpose _t0.._t3 are used as columns 0..3
                        transpose4x4_u16(_t0, _t1, _t2, _t3);

                        _r0 = bfloat2float(_t0);
                        if (tj * 2 + 1 < w) _r1 = bfloat2float(_t1);
                        if (tj * 2 + 2 < w) _r2 = bfloat2float(_t2);
                        if (tj * 2 + 3 < w) _r3 = bfloat2float(_t3);
                    }
                }

                float32x4_t _tmp0 = vsubq_f32(_r0, _r2);
                float32x4_t _tmp1 = vaddq_f32(_r1, _r2);
                float32x4_t _tmp2 = vsubq_f32(_r2, _r1);
                float32x4_t _tmp3 = vsubq_f32(_r3, _r1);

                vst1q_f32(tmp[0][m], _tmp0);
                vst1q_f32(tmp[1][m], _tmp1);
                vst1q_f32(tmp[2][m], _tmp2);
                vst1q_f32(tmp[3][m], _tmp3);

                r0 += w * elempack;
            }

            // destination of the 16 coefficients of this tile, 4 floats each
            float* p0 = (float*)B + kk * max_jj * 16 + jj * 4;
            float* p1 = p0 + max_jj * 4;
            float* p2 = p0 + max_jj * 4 * 2;
            float* p3 = p0 + max_jj * 4 * 3;

            // second pass
            for (int m = 0; m < 4; m++)
            {
                float32x4_t _r0 = vld1q_f32(tmp[m][0]);
                float32x4_t _r1 = vld1q_f32(tmp[m][1]);
                float32x4_t _r2 = vld1q_f32(tmp[m][2]);
                float32x4_t _r3 = vld1q_f32(tmp[m][3]);

                float32x4_t _tmp0 = vsubq_f32(_r0, _r2);
                float32x4_t _tmp1 = vaddq_f32(_r1, _r2);
                float32x4_t _tmp2 = vsubq_f32(_r2, _r1);
                float32x4_t _tmp3 = vsubq_f32(_r3, _r1);

                vst1q_f32(p0, _tmp0);
                vst1q_f32(p1, _tmp1);
                vst1q_f32(p2, _tmp2);
                vst1q_f32(p3, _tmp3);

                p0 += max_jj * 4 * 4;
                p1 += max_jj * 4 * 4;
                p2 += max_jj * 4 * 4;
                p3 += max_jj * 4 * 4;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 4;
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
#else // __ARM_NEON
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
    #pragma omp parallel for num_threads(nT)
#endif // __ARM_NEON
    // ---- groups of 2 channels ----
    // the omp pragma above belongs to the non-NEON branch only
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 2;

#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        float tmp[4][4][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // channel index is used without dividing by elempack here
            // (the elempack == 1 condition below is commented out, i.e. assumed)
            const unsigned short* r0 = bottom_blob.channel(k + kk).row<const unsigned short>(ti * 2) + (tj * 2);

            // first pass over the 4 window rows
            for (int m = 0; m < 4; m++)
            {
#if __ARM_NEON
                // _rc: column c for the 2 channels
                float32x2_t _r0 = vdup_n_f32(0.f);
                float32x2_t _r1 = vdup_n_f32(0.f);
                float32x2_t _r2 = vdup_n_f32(0.f);
                float32x2_t _r3 = vdup_n_f32(0.f);
#else
                // rcX: column c of channel X
                float r00 = 0.f;
                float r01 = 0.f;
                float r10 = 0.f;
                float r11 = 0.f;
                float r20 = 0.f;
                float r21 = 0.f;
                float r30 = 0.f;
                float r31 = 0.f;
#endif

                if (ti * 2 + m < h)
                {
                    // if (elempack == 1)
                    {
                        // second channel plane
                        const unsigned short* r1 = r0 + N;

#if __ARM_NEON
                        // NOTE(review): unconditional 4-wide loads; presumably relies on
                        // readable padding past the row end - confirm.
                        uint16x4_t _t0 = vld1_u16(r0);
                        uint16x4_t _t1 = vld1_u16(r1);
                        // interleave the two channels: val[0] holds columns 0,1 and
                        // val[1] holds columns 2,3, each as (channel 0, channel 1) pairs
                        uint16x4x2_t _t01 = vzip_u16(_t0, _t1);
                        float32x4_t _t0_fp32 = bfloat2float(_t01.val[0]);
                        float32x4_t _t1_fp32 = bfloat2float(_t01.val[1]);

                        _r0 = vget_low_f32(_t0_fp32);
                        if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
                        if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
                        if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
#else
                        r00 = bfloat16_to_float32(r0[0]);
                        r01 = bfloat16_to_float32(r1[0]);
                        if (tj * 2 + 1 < w)
                        {
                            r10 = bfloat16_to_float32(r0[1]);
                            r11 = bfloat16_to_float32(r1[1]);
                        }
                        if (tj * 2 + 2 < w)
                        {
                            r20 = bfloat16_to_float32(r0[2]);
                            r21 = bfloat16_to_float32(r1[2]);
                        }
                        if (tj * 2 + 3 < w)
                        {
                            r30 = bfloat16_to_float32(r0[3]);
                            r31 = bfloat16_to_float32(r1[3]);
                        }
#endif
                    }
                }

#if __ARM_NEON
                float32x2_t _tmp0 = vsub_f32(_r0, _r2);
                float32x2_t _tmp1 = vadd_f32(_r1, _r2);
                float32x2_t _tmp2 = vsub_f32(_r2, _r1);
                float32x2_t _tmp3 = vsub_f32(_r3, _r1);

                vst1_f32(tmp[0][m], _tmp0);
                vst1_f32(tmp[1][m], _tmp1);
                vst1_f32(tmp[2][m], _tmp2);
                vst1_f32(tmp[3][m], _tmp3);
#else
                tmp[0][m][0] = r00 - r20;
                tmp[0][m][1] = r01 - r21;
                tmp[1][m][0] = r10 + r20;
                tmp[1][m][1] = r11 + r21;
                tmp[2][m][0] = r20 - r10;
                tmp[2][m][1] = r21 - r11;
                tmp[3][m][0] = r30 - r10;
                tmp[3][m][1] = r31 - r11;
#endif

                r0 += w;
            }

            // destination of the 16 coefficients of this tile, 2 floats each
            float* p0 = (float*)B + kk * max_jj * 16 + jj * 2;
            float* p1 = p0 + max_jj * 2;
            float* p2 = p0 + max_jj * 2 * 2;
            float* p3 = p0 + max_jj * 2 * 3;

            // second pass
            for (int m = 0; m < 4; m++)
            {
#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(tmp[m][0]);
                float32x2_t _r1 = vld1_f32(tmp[m][1]);
                float32x2_t _r2 = vld1_f32(tmp[m][2]);
                float32x2_t _r3 = vld1_f32(tmp[m][3]);

                float32x2_t _tmp0 = vsub_f32(_r0, _r2);
                float32x2_t _tmp1 = vadd_f32(_r1, _r2);
                float32x2_t _tmp2 = vsub_f32(_r2, _r1);
                float32x2_t _tmp3 = vsub_f32(_r3, _r1);

                vst1_f32(p0, _tmp0);
                vst1_f32(p1, _tmp1);
                vst1_f32(p2, _tmp2);
                vst1_f32(p3, _tmp3);
#else
                float r00 = tmp[m][0][0];
                float r01 = tmp[m][0][1];
                float r10 = tmp[m][1][0];
                float r11 = tmp[m][1][1];
                float r20 = tmp[m][2][0];
                float r21 = tmp[m][2][1];
                float r30 = tmp[m][3][0];
                float r31 = tmp[m][3][1];

                p0[0] = r00 - r20;
                p0[1] = r01 - r21;
                p1[0] = r10 + r20;
                p1[1] = r11 + r21;
                p2[0] = r20 - r10;
                p2[1] = r21 - r11;
                p3[0] = r30 - r10;
                p3[1] = r31 - r11;
#endif

                p0 += max_jj * 4 * 2;
                p1 += max_jj * 4 * 2;
                p2 += max_jj * 4 * 2;
                p3 += max_jj * 4 * 2;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 2;
    // ---- remaining single channels, scalar code ----
    for (int kk = remain_max_kk_start; kk < max_kk; kk++)
    {
        float tmp[4][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // points at the first of the 4 window columns in the current row
            const unsigned short* r0123 = bottom_blob.channel(k + kk).row<const unsigned short>(ti * 2) + (tj * 2);

            // first pass over the 4 window rows
            for (int m = 0; m < 4; m++)
            {
                float r0 = 0.f;
                float r1 = 0.f;
                float r2 = 0.f;
                float r3 = 0.f;

                if (ti * 2 + m < h)
                {
                    // if (elempack == 1)
                    {
                        r0 = bfloat16_to_float32(r0123[0]);
                        if (tj * 2 + 1 < w) r1 = bfloat16_to_float32(r0123[1]);
                        if (tj * 2 + 2 < w) r2 = bfloat16_to_float32(r0123[2]);
                        if (tj * 2 + 3 < w) r3 = bfloat16_to_float32(r0123[3]);
                    }
                }

                tmp[0][m] = r0 - r2;
                tmp[1][m] = r1 + r2;
                tmp[2][m] = r2 - r1;
                tmp[3][m] = r3 - r1;

                r0123 += w;
            }

            // destination of the 16 coefficients of this tile, 1 float each
            float* p0 = (float*)B + kk * max_jj * 16 + jj;
            float* p1 = p0 + max_jj;
            float* p2 = p0 + max_jj * 2;
            float* p3 = p0 + max_jj * 3;

            // second pass
            for (int m = 0; m < 4; m++)
            {
                float r0 = tmp[m][0];
                float r1 = tmp[m][1];
                float r2 = tmp[m][2];
                float r3 = tmp[m][3];

                p0[0] = r0 - r2;
                p1[0] = r1 + r2;
                p2[0] = r2 - r1;
                p3[0] = r3 - r1;

                p0 += max_jj * 4;
                p1 += max_jj * 4;
                p2 += max_jj * 4;
                p3 += max_jj * 4;
            }
        }
    }
}

// Winograd F(2,3) output transform for one block of GEMM results, stored as bfloat16.
//
// For every output channel in [i, i + max_ii) and every tile in [j, j + max_jj) the
// 16 float coefficients of that tile are read from top_tile, reduced to a 2x2 block by
// applying the otm matrix (see below) along both dimensions, the per-channel bias is
// added, and the result is converted to bfloat16 and written to top_blob at row ti * 2
// and column tj * 2. Rows >= outh and the second column when it is >= outw are skipped,
// so partial tiles on the bottom/right border are handled.
//
// top_tile  float accumulators; for a channel group starting at ii the data begins at
//           ii * max_jj * 16 and consists of 16 coefficient planes, each holding
//           max_jj tiles x (group size) channels
// top_blob  destination, accessed as unsigned short; out_elempack 4 and 1 are handled
//           in the NEON 8/4-channel paths
// bias      per-output-channel float bias indexed by i + ii; zero is used when the
//           pointer obtained from it is null
// i, max_ii first output channel of this block and number of channels in it
// j, max_jj first linear tile index of this block and number of tiles in it
//
// Channels are consumed in groups of 8 (aarch64 only), 4 (NEON only), 2 and then 1.
static inline void conv3x3s1_winograd23_transform_output_tile_bf16s(const Mat& top_tile, Mat& top_blob, const Mat& bias, int i, int max_ii, int j, int max_jj)
{
    // const float otm[2][4] = {
    //     {1.0f,  1.0f,  1.0f,  0.0f},
    //     {0.0f,  1.0f, -1.0f,  1.0f}
    // };

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int out_elempack = top_blob.elempack;
    // distance, in unsigned short elements, between consecutive channel(-pack) planes
    const size_t N = top_blob.cstep * out_elempack;

    // number of 2-wide tiles per output row, rounded up
    const int w_tiles = (outw + 1) / 2;

    const float* biasptr = bias;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    // ---- 8 output channels per iteration ----
    for (; ii + 7 < max_ii; ii += 8)
    {
        float32x4_t _bias0 = biasptr ? vld1q_f32(biasptr + i + ii) : vdupq_n_f32(0.f);
        float32x4_t _bias1 = biasptr ? vld1q_f32(biasptr + i + ii + 4) : vdupq_n_f32(0.f);

        // intermediate after the first pass: [output row 0/1][coefficient group m][channel]
#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[2][4][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            // tile row / column derived from the linear tile index
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // r0..r3 address four consecutive coefficient planes (max_jj * 8 floats each)
            const float* r0 = (const float*)top_tile + ii * max_jj * 16 + jj * 8;
            const float* r1 = r0 + max_jj * 8;
            const float* r2 = r0 + max_jj * 8 * 2;
            const float* r3 = r0 + max_jj * 8 * 3;

            // first pass: 4 coefficients -> 2 values for each of the 4 groups
            for (int m = 0; m < 4; m++)
            {
                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r30 = vld1q_f32(r3);
                float32x4_t _r31 = vld1q_f32(r3 + 4);

                // otm row 0: r0 + r1 + r2, otm row 1: r1 - r2 + r3
                float32x4_t _tmp00 = vaddq_f32(vaddq_f32(_r00, _r10), _r20);
                float32x4_t _tmp01 = vaddq_f32(vaddq_f32(_r01, _r11), _r21);
                float32x4_t _tmp10 = vaddq_f32(vsubq_f32(_r10, _r20), _r30);
                float32x4_t _tmp11 = vaddq_f32(vsubq_f32(_r11, _r21), _r31);

                vst1q_f32(tmp[0][m], _tmp00);
                vst1q_f32(tmp[0][m] + 4, _tmp01);
                vst1q_f32(tmp[1][m], _tmp10);
                vst1q_f32(tmp[1][m] + 4, _tmp11);

                // advance to the next group of four coefficient planes
                r0 += max_jj * 4 * 8;
                r1 += max_jj * 4 * 8;
                r2 += max_jj * 4 * 8;
                r3 += max_jj * 4 * 8;
            }

            unsigned short* outptr0 = top_blob.channel((i + ii) / out_elempack).row<unsigned short>(ti * 2) + (tj * 2) * out_elempack;

            // second pass: one iteration per output row of the 2x2 tile
            for (int m = 0; m < 2; m++)
            {
                // tile row falls below the output; nothing to write
                if (ti * 2 + m >= outh)
                    continue;

                float32x4_t _r00 = vld1q_f32(tmp[m][0]);
                float32x4_t _r01 = vld1q_f32(tmp[m][0] + 4);
                float32x4_t _r10 = vld1q_f32(tmp[m][1]);
                float32x4_t _r11 = vld1q_f32(tmp[m][1] + 4);
                float32x4_t _r20 = vld1q_f32(tmp[m][2]);
                float32x4_t _r21 = vld1q_f32(tmp[m][2] + 4);
                float32x4_t _r30 = vld1q_f32(tmp[m][3]);
                float32x4_t _r31 = vld1q_f32(tmp[m][3] + 4);

                // _tmp0x is output column 0, _tmp1x is output column 1; x selects channels 0-3 / 4-7
                float32x4_t _tmp00 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_r00, _r10), _r20));
                float32x4_t _tmp01 = vaddq_f32(_bias1, vaddq_f32(vaddq_f32(_r01, _r11), _r21));
                float32x4_t _tmp10 = vaddq_f32(_bias0, vaddq_f32(vsubq_f32(_r10, _r20), _r30));
                float32x4_t _tmp11 = vaddq_f32(_bias1, vaddq_f32(vsubq_f32(_r11, _r21), _r31));

                if (out_elempack == 4)
                {
                    // the 8 channels span two consecutive 4-channel packs
                    unsigned short* outptr1 = outptr0 + N;

                    vst1_u16(outptr0, float2bfloat(_tmp00));
                    vst1_u16(outptr1, float2bfloat(_tmp01));
                    // second column only when it lies inside the output
                    if (tj * 2 + 1 < outw)
                    {
                        vst1_u16(outptr0 + 4, float2bfloat(_tmp10));
                        vst1_u16(outptr1 + 4, float2bfloat(_tmp11));
                    }
                }
                if (out_elempack == 1)
                {
                    // spill the converted lanes and scatter one value per channel plane
                    unsigned short tmp0[8];
                    unsigned short tmp1[8];
                    vst1_u16(tmp0, float2bfloat(_tmp00));
                    vst1_u16(tmp0 + 4, float2bfloat(_tmp01));
                    vst1_u16(tmp1, float2bfloat(_tmp10));
                    vst1_u16(tmp1 + 4, float2bfloat(_tmp11));

                    unsigned short* outptr1 = outptr0 + N;
                    unsigned short* outptr2 = outptr0 + N * 2;
                    unsigned short* outptr3 = outptr0 + N * 3;
                    unsigned short* outptr4 = outptr0 + N * 4;
                    unsigned short* outptr5 = outptr0 + N * 5;
                    unsigned short* outptr6 = outptr0 + N * 6;
                    unsigned short* outptr7 = outptr0 + N * 7;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];
                    outptr4[0] = tmp0[4];
                    outptr5[0] = tmp0[5];
                    outptr6[0] = tmp0[6];
                    outptr7[0] = tmp0[7];

                    if (tj * 2 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                        outptr4[1] = tmp1[4];
                        outptr5[1] = tmp1[5];
                        outptr6[1] = tmp1[6];
                        outptr7[1] = tmp1[7];
                    }
                }

                // next output row
                outptr0 += outw * out_elempack;
            }
        }
    }
#endif // __aarch64__
    // ---- 4 output channels per iteration ----
    for (; ii + 3 < max_ii; ii += 4)
    {
        float32x4_t _bias0 = biasptr ? vld1q_f32(biasptr + i + ii) : vdupq_n_f32(0.f);

        // intermediate after the first pass: [output row 0/1][coefficient group m][channel]
#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[2][4][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // four consecutive coefficient planes of max_jj * 4 floats each
            const float* r0 = (const float*)top_tile + ii * max_jj * 16 + jj * 4;
            const float* r1 = r0 + max_jj * 4;
            const float* r2 = r0 + max_jj * 4 * 2;
            const float* r3 = r0 + max_jj * 4 * 3;

            // first pass
            for (int m = 0; m < 4; m++)
            {
                float32x4_t _r0 = vld1q_f32(r0);
                float32x4_t _r1 = vld1q_f32(r1);
                float32x4_t _r2 = vld1q_f32(r2);
                float32x4_t _r3 = vld1q_f32(r3);

                float32x4_t _tmp0 = vaddq_f32(vaddq_f32(_r0, _r1), _r2);
                float32x4_t _tmp1 = vaddq_f32(vsubq_f32(_r1, _r2), _r3);

                vst1q_f32(tmp[0][m], _tmp0);
                vst1q_f32(tmp[1][m], _tmp1);

                r0 += max_jj * 4 * 4;
                r1 += max_jj * 4 * 4;
                r2 += max_jj * 4 * 4;
                r3 += max_jj * 4 * 4;
            }

            unsigned short* outptr0 = top_blob.channel((i + ii) / out_elempack).row<unsigned short>(ti * 2) + (tj * 2) * out_elempack;

            // second pass: one iteration per output row
            for (int m = 0; m < 2; m++)
            {
                if (ti * 2 + m >= outh)
                    continue;

                float32x4_t _r0 = vld1q_f32(tmp[m][0]);
                float32x4_t _r1 = vld1q_f32(tmp[m][1]);
                float32x4_t _r2 = vld1q_f32(tmp[m][2]);
                float32x4_t _r3 = vld1q_f32(tmp[m][3]);

                // _tmp0 is output column 0, _tmp1 is output column 1
                float32x4_t _tmp0 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_r0, _r1), _r2));
                float32x4_t _tmp1 = vaddq_f32(_bias0, vaddq_f32(vsubq_f32(_r1, _r2), _r3));

                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_tmp0));
                    if (tj * 2 + 1 < outw) vst1_u16(outptr0 + 4, float2bfloat(_tmp1));
                }
                if (out_elempack == 1)
                {
                    // scatter one converted value to each of the four channel planes
                    unsigned short tmp0[4];
                    unsigned short tmp1[4];
                    vst1_u16(tmp0, float2bfloat(_tmp0));
                    vst1_u16(tmp1, float2bfloat(_tmp1));

                    unsigned short* outptr1 = outptr0 + N;
                    unsigned short* outptr2 = outptr0 + N * 2;
                    unsigned short* outptr3 = outptr0 + N * 3;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];

                    if (tj * 2 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                    }
                }

                outptr0 += outw * out_elempack;
            }
        }
    }
#endif // __ARM_NEON
    // ---- 2 output channels per iteration ----
    // NOTE(review): this path and the scalar one below index top_blob.channel(i + ii)
    // directly, i.e. they treat out_elempack as 1 (see the commented-out guard) - confirm
    // callers never reach them with a packed top_blob.
    for (; ii + 1 < max_ii; ii += 2)
    {
#if __ARM_NEON
        float32x2_t _bias0 = biasptr ? vld1_f32(biasptr + i + ii) : vdup_n_f32(0.f);
#else
        float bias0 = biasptr ? biasptr[i + ii] : 0.f;
        float bias1 = biasptr ? biasptr[i + ii + 1] : 0.f;
#endif

        // intermediate after the first pass: [output row 0/1][coefficient group m][channel]
#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        float tmp[2][4][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // four consecutive coefficient planes of max_jj * 2 floats each
            const float* r0 = (const float*)top_tile + ii * max_jj * 16 + jj * 2;
            const float* r1 = r0 + max_jj * 2;
            const float* r2 = r0 + max_jj * 2 * 2;
            const float* r3 = r0 + max_jj * 2 * 3;

            // first pass
            for (int m = 0; m < 4; m++)
            {
#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(r0);
                float32x2_t _r1 = vld1_f32(r1);
                float32x2_t _r2 = vld1_f32(r2);
                float32x2_t _r3 = vld1_f32(r3);

                float32x2_t _tmp0 = vadd_f32(vadd_f32(_r0, _r1), _r2);
                float32x2_t _tmp1 = vadd_f32(vsub_f32(_r1, _r2), _r3);

                vst1_f32(tmp[0][m], _tmp0);
                vst1_f32(tmp[1][m], _tmp1);
#else
                tmp[0][m][0] = r0[0] + r1[0] + r2[0];
                tmp[0][m][1] = r0[1] + r1[1] + r2[1];
                tmp[1][m][0] = r1[0] - r2[0] + r3[0];
                tmp[1][m][1] = r1[1] - r2[1] + r3[1];
#endif

                r0 += max_jj * 4 * 2;
                r1 += max_jj * 4 * 2;
                r2 += max_jj * 4 * 2;
                r3 += max_jj * 4 * 2;
            }

            unsigned short* outptr0 = top_blob.channel(i + ii).row<unsigned short>(ti * 2) + (tj * 2);

            // second pass: one iteration per output row
            for (int m = 0; m < 2; m++)
            {
                if (ti * 2 + m >= outh)
                    continue;

#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(tmp[m][0]);
                float32x2_t _r1 = vld1_f32(tmp[m][1]);
                float32x2_t _r2 = vld1_f32(tmp[m][2]);
                float32x2_t _r3 = vld1_f32(tmp[m][3]);

                // lanes are the two channels; _tmp0 is column 0, _tmp1 is column 1
                float32x2_t _tmp0 = vadd_f32(_bias0, vadd_f32(vadd_f32(_r0, _r1), _r2));
                float32x2_t _tmp1 = vadd_f32(_bias0, vadd_f32(vsub_f32(_r1, _r2), _r3));
#else
                float r00 = tmp[m][0][0];
                float r01 = tmp[m][0][1];
                float r10 = tmp[m][1][0];
                float r11 = tmp[m][1][1];
                float r20 = tmp[m][2][0];
                float r21 = tmp[m][2][1];
                float r30 = tmp[m][3][0];
                float r31 = tmp[m][3][1];

                // tmpXY: X = output column, Y = channel within the pair
                float tmp00 = bias0 + r00 + r10 + r20;
                float tmp01 = bias1 + r01 + r11 + r21;
                float tmp10 = bias0 + r10 - r20 + r30;
                float tmp11 = bias1 + r11 - r21 + r31;
#endif

                // if (out_elempack == 1)
                {
                    unsigned short* outptr1 = outptr0 + N;

#if __ARM_NEON
                    // combined lanes: {col0 ch0, col0 ch1, col1 ch0, col1 ch1}
                    uint16x4_t _tmp01 = float2bfloat(vcombine_f32(_tmp0, _tmp1));

                    outptr0[0] = vget_lane_u16(_tmp01, 0);
                    outptr1[0] = vget_lane_u16(_tmp01, 1);
                    if (tj * 2 + 1 < outw)
                    {
                        outptr0[1] = vget_lane_u16(_tmp01, 2);
                        outptr1[1] = vget_lane_u16(_tmp01, 3);
                    }
#else
                    outptr0[0] = float32_to_bfloat16(tmp00);
                    outptr1[0] = float32_to_bfloat16(tmp01);
                    if (tj * 2 + 1 < outw)
                    {
                        outptr0[1] = float32_to_bfloat16(tmp10);
                        outptr1[1] = float32_to_bfloat16(tmp11);
                    }
#endif
                }

                outptr0 += outw;
            }
        }
    }
    // ---- remaining single output channel ----
    for (; ii < max_ii; ii++)
    {
        float bias0 = biasptr ? biasptr[i + ii] : 0.f;

        // intermediate after the first pass: [output row 0/1][coefficient group m]
        float tmp[2][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // four consecutive coefficient planes of max_jj floats each
            const float* r0 = (const float*)top_tile + ii * max_jj * 16 + jj;
            const float* r1 = r0 + max_jj;
            const float* r2 = r0 + max_jj * 2;
            const float* r3 = r0 + max_jj * 3;

            // first pass
            for (int m = 0; m < 4; m++)
            {
                tmp[0][m] = r0[0] + r1[0] + r2[0];
                tmp[1][m] = r1[0] - r2[0] + r3[0];

                r0 += max_jj * 4;
                r1 += max_jj * 4;
                r2 += max_jj * 4;
                r3 += max_jj * 4;
            }

            unsigned short* outptr0 = top_blob.channel(i + ii).row<unsigned short>(ti * 2) + (tj * 2);

            // second pass: one iteration per output row
            for (int m = 0; m < 2; m++)
            {
                if (ti * 2 + m >= outh)
                    continue;

                float r0 = tmp[m][0];
                float r1 = tmp[m][1];
                float r2 = tmp[m][2];
                float r3 = tmp[m][3];

                // tmp0 is output column 0, tmp1 is output column 1
                float tmp0 = bias0 + r0 + r1 + r2;
                float tmp1 = bias0 + r1 - r2 + r3;

                // if (out_elempack == 1)
                {
                    outptr0[0] = float32_to_bfloat16(tmp0);
                    if (tj * 2 + 1 < outw) outptr0[1] = float32_to_bfloat16(tmp1);
                }

                outptr0 += outw;
            }
        }
    }
}

static int conv3x3s1_winograd23_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
    int outw = top_blob.w;
    int outh = top_blob.h;

    // pad to 2n+2, winograd F(2,3)
    int w_tiles = (outw + 1) / 2;
    int h_tiles = (outh + 1) / 2;
    int tiles = w_tiles * h_tiles;

    const int M = top_blob.c * top_blob.elempack;
    const int N = tiles;
    const int K = bottom_blob.c * bottom_blob.elempack;
    const int B = 16;

    // NCNN_LOGE("conv3x3s1_winograd23_bf16s %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    conv3x3s1_winograd_get_optimal_tile_mnk(M, N, K, B, TILE_M, TILE_N, TILE_K, nT);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

    Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    if (nT > 1 && nn_NK < nT)
    {
        Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
        if (B_tile.empty())
            return -100;

        for (int ppjk = 0; ppjk < nn_NK; ppjk++)
        {
            const int ppj = ppjk / nn_K;
            const int ppk = ppjk % nn_K;

            const int j = ppj * TILE_N;
            const int k = ppk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            // transform input
            conv3x3s1_winograd23_transform_input_tile_bf16s(bottom_blob, B_tile, j, max_jj, k, max_kk, nT);

            Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

            conv3x3s1_winograd_transpose_pack_B_tile(B_tile, BT_tile, B, max_jj, max_kk, nT);
        }
    }
    else
    {
        Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
        if (B_tileX.empty())
            return -100;

        #pragma omp parallel for num_threads(nT)
        for (int ppjk = 0; ppjk < nn_NK; ppjk++)
        {
            const int ppj = ppjk / nn_K;
            const int ppk = ppjk % nn_K;

            const int j = ppj * TILE_N;
            const int k = ppk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            Mat B_tile = B_tileX.channel(get_omp_thread_num());

            // transform input
            conv3x3s1_winograd23_transform_input_tile_bf16s(bottom_blob, B_tile, j, max_jj, k, max_kk, 1);

            Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

            conv3x3s1_winograd_transpose_pack_B_tile(B_tile, BT_tile, B, max_jj, max_kk, 1);
        }
    }

    Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
    if (top_tileX.empty())
        return -100;

    #pragma omp parallel for num_threads(nT)
    for (int ppj = 0; ppj < nn_M; ppj++)
    {
        const int i = ppj * TILE_M;

        Mat top_tile = top_tileX.channel(get_omp_thread_num());

        const int max_ii = std::min((M - i), TILE_M);

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                const Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K);

                const Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

                conv3x3s1_winograd_gemm_transB_packed_tile(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk, opt.use_a53_a55_optimized_kernel);
            }

            // transform output
            conv3x3s1_winograd23_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj);
        }
    }

    return 0;
}

static inline void conv3x3s1_winograd43_transform_input_tile_bf16s(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
{
    const float sq2 = 1.41421356237;
    const float sq2_d2 = 1.41421356237 / 2;

    // const float itm[6][6] = {
    //     {1.0f,  0.0f,  -2.5f,  0.0f,  1.0f, 0.0f},
    //     {0.0f, -sq2,   -2.0f,  sq2/2, 1.0f, 0.0f},
    //     {0.0f,  sq2,   -2.0f, -sq2/2, 1.0f, 0.0f},
    //     {0.0f, -sq2/2, -0.5f,  sq2,   1.0f, 0.0f},
    //     {0.0f,  sq2/2, -0.5f, -sq2,   1.0f, 0.0f},
    //     {0.0f,  1.0f,   0.0f,  -2.5f, 0.0f, 1.0f}
    // };

    // 0 =  r00 + r04 - 2.5f * r02
    // 1 = -(sq2 * r01 - sq2_d2 * r03) + (r04 - 2 * r02)
    // 2 =  (sq2 * r01 - sq2_d2 * r03) + (r04 - 2 * r02)
    // 3 =  (sq2 * r03 - sq2_d2 * r01) + (r04 - 0.5f * r02)
    // 4 = -(sq2 * r03 - sq2_d2 * r01) + (r04 - 0.5f * r02)
    // 5 =  r01 + r05 - 2.5f * r03

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int elempack = bottom_blob.elempack;
    const size_t N = bottom_blob.cstep * elempack;

    const int w_tiles = (w + 1) / 4;

    int nn_max_kk = 0;
    int remain_max_kk_start = 0;
#if __ARM_NEON
#if __aarch64__
    nn_max_kk = (max_kk - remain_max_kk_start) / 8;
    #pragma omp parallel for num_threads(nT)
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 8;

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[6][6][8];

        const float coeffs[4] = {sq2, -sq2_d2, -2.f, -0.5f};
        float32x4_t _coeffs = vld1q_f32(coeffs);
        float32x4_t _vm2_5 = vdupq_n_f32(-2.5f);

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const unsigned short* r0 = bottom_blob.channel((k + kk) / elempack).row<const unsigned short>(ti * 4) + (tj * 4) * elempack;

            for (int m = 0; m < 6; m++)
            {
                float32x4_t _r00 = vdupq_n_f32(0.f);
                float32x4_t _r01 = vdupq_n_f32(0.f);
                float32x4_t _r10 = vdupq_n_f32(0.f);
                float32x4_t _r11 = vdupq_n_f32(0.f);
                float32x4_t _r20 = vdupq_n_f32(0.f);
                float32x4_t _r21 = vdupq_n_f32(0.f);
                float32x4_t _r30 = vdupq_n_f32(0.f);
                float32x4_t _r31 = vdupq_n_f32(0.f);
                float32x4_t _r40 = vdupq_n_f32(0.f);
                float32x4_t _r41 = vdupq_n_f32(0.f);
                float32x4_t _r50 = vdupq_n_f32(0.f);
                float32x4_t _r51 = vdupq_n_f32(0.f);

                if (ti * 4 + m < h)
                {
                    if (elempack == 4)
                    {
                        const unsigned short* r1 = r0 + N;

                        _r00 = bfloat2float(vld1_u16(r0));
                        _r01 = bfloat2float(vld1_u16(r1));
                        if (tj * 4 + 1 < w)
                        {
                            _r10 = bfloat2float(vld1_u16(r0 + 4));
                            _r11 = bfloat2float(vld1_u16(r1 + 4));
                        }
                        if (tj * 4 + 2 < w)
                        {
                            _r20 = bfloat2float(vld1_u16(r0 + 8));
                            _r21 = bfloat2float(vld1_u16(r1 + 8));
                        }
                        if (tj * 4 + 3 < w)
                        {
                            _r30 = bfloat2float(vld1_u16(r0 + 12));
                            _r31 = bfloat2float(vld1_u16(r1 + 12));
                        }
                        if (tj * 4 + 4 < w)
                        {
                            _r40 = bfloat2float(vld1_u16(r0 + 16));
                            _r41 = bfloat2float(vld1_u16(r1 + 16));
                        }
                        if (tj * 4 + 5 < w)
                        {
                            _r50 = bfloat2float(vld1_u16(r0 + 20));
                            _r51 = bfloat2float(vld1_u16(r1 + 20));
                        }
                    }
                    if (elempack == 1)
                    {
                        const unsigned short* r1 = r0 + N;
                        const unsigned short* r2 = r0 + N * 2;
                        const unsigned short* r3 = r0 + N * 3;
                        const unsigned short* r4 = r0 + N * 4;
                        const unsigned short* r5 = r0 + N * 5;
                        const unsigned short* r6 = r0 + N * 6;
                        const unsigned short* r7 = r0 + N * 7;

                        uint16x4_t _t0 = vld1_u16(r0);
                        uint16x4_t _t1 = vld1_u16(r1);
                        uint16x4_t _t2 = vld1_u16(r2);
                        uint16x4_t _t3 = vld1_u16(r3);
                        uint16x4_t _t4 = vld1_u16(r4);
                        uint16x4_t _t5 = vld1_u16(r5);
                        uint16x4_t _t6 = vld1_u16(r6);
                        uint16x4_t _t7 = vld1_u16(r7);

                        transpose4x4_u16(_t0, _t1, _t2, _t3);
                        transpose4x4_u16(_t4, _t5, _t6, _t7);

                        _r00 = bfloat2float(_t0);
                        _r01 = bfloat2float(_t4);
                        if (tj * 4 + 1 < w)
                        {
                            _r10 = bfloat2float(_t1);
                            _r11 = bfloat2float(_t5);
                        }
                        if (tj * 4 + 2 < w)
                        {
                            _r20 = bfloat2float(_t2);
                            _r21 = bfloat2float(_t6);
                        }
                        if (tj * 4 + 3 < w)
                        {
                            _r30 = bfloat2float(_t3);
                            _r31 = bfloat2float(_t7);
                        }
                        if (tj * 4 + 4 < w)
                        {
                            unsigned short tmp[8] = {r0[4], r1[4], r2[4], r3[4], r4[4], r5[4], r6[4], r7[4]};
                            _r40 = bfloat2float(vld1_u16(tmp));
                            _r41 = bfloat2float(vld1_u16(tmp + 4));
                        }
                        if (tj * 4 + 5 < w)
                        {
                            unsigned short tmp[8] = {r0[5], r1[5], r2[5], r3[5], r4[5], r5[5], r6[5], r7[5]};
                            _r50 = bfloat2float(vld1_u16(tmp));
                            _r51 = bfloat2float(vld1_u16(tmp + 4));
                        }
                    }
                }

                float32x4_t _tmp12a0 = vfmaq_laneq_f32(vmulq_laneq_f32(_r10, _coeffs, 0), _r30, _coeffs, 1);
                float32x4_t _tmp12a1 = vfmaq_laneq_f32(vmulq_laneq_f32(_r11, _coeffs, 0), _r31, _coeffs, 1);
                float32x4_t _tmp12b0 = vfmaq_laneq_f32(_r40, _r20, _coeffs, 2);
                float32x4_t _tmp12b1 = vfmaq_laneq_f32(_r41, _r21, _coeffs, 2);
                float32x4_t _tmp34a0 = vfmaq_laneq_f32(vmulq_laneq_f32(_r30, _coeffs, 0), _r10, _coeffs, 1);
                float32x4_t _tmp34a1 = vfmaq_laneq_f32(vmulq_laneq_f32(_r31, _coeffs, 0), _r11, _coeffs, 1);
                float32x4_t _tmp34b0 = vfmaq_laneq_f32(_r40, _r20, _coeffs, 3);
                float32x4_t _tmp34b1 = vfmaq_laneq_f32(_r41, _r21, _coeffs, 3);

                float32x4_t _tmp00 = vfmaq_f32(vaddq_f32(_r00, _r40), _r20, _vm2_5);
                float32x4_t _tmp01 = vfmaq_f32(vaddq_f32(_r01, _r41), _r21, _vm2_5);
                float32x4_t _tmp10 = vsubq_f32(_tmp12b0, _tmp12a0);
                float32x4_t _tmp11 = vsubq_f32(_tmp12b1, _tmp12a1);
                float32x4_t _tmp20 = vaddq_f32(_tmp12b0, _tmp12a0);
                float32x4_t _tmp21 = vaddq_f32(_tmp12b1, _tmp12a1);
                float32x4_t _tmp30 = vaddq_f32(_tmp34b0, _tmp34a0);
                float32x4_t _tmp31 = vaddq_f32(_tmp34b1, _tmp34a1);
                float32x4_t _tmp40 = vsubq_f32(_tmp34b0, _tmp34a0);
                float32x4_t _tmp41 = vsubq_f32(_tmp34b1, _tmp34a1);
                float32x4_t _tmp50 = vfmaq_f32(vaddq_f32(_r10, _r50), _r30, _vm2_5);
                float32x4_t _tmp51 = vfmaq_f32(vaddq_f32(_r11, _r51), _r31, _vm2_5);

                vst1q_f32(tmp[0][m], _tmp00);
                vst1q_f32(tmp[0][m] + 4, _tmp01);
                vst1q_f32(tmp[1][m], _tmp10);
                vst1q_f32(tmp[1][m] + 4, _tmp11);
                vst1q_f32(tmp[2][m], _tmp20);
                vst1q_f32(tmp[2][m] + 4, _tmp21);
                vst1q_f32(tmp[3][m], _tmp30);
                vst1q_f32(tmp[3][m] + 4, _tmp31);
                vst1q_f32(tmp[4][m], _tmp40);
                vst1q_f32(tmp[4][m] + 4, _tmp41);
                vst1q_f32(tmp[5][m], _tmp50);
                vst1q_f32(tmp[5][m] + 4, _tmp51);

                r0 += w * elempack;
            }

            float* p0 = (float*)B + kk * max_jj * 36 + jj * 8;
            float* p1 = p0 + max_jj * 8;
            float* p2 = p0 + max_jj * 8 * 2;
            float* p3 = p0 + max_jj * 8 * 3;
            float* p4 = p0 + max_jj * 8 * 4;
            float* p5 = p0 + max_jj * 8 * 5;

            for (int m = 0; m < 6; m++)
            {
                float32x4_t _r00 = vld1q_f32(tmp[m][0]);
                float32x4_t _r01 = vld1q_f32(tmp[m][0] + 4);
                float32x4_t _r10 = vld1q_f32(tmp[m][1]);
                float32x4_t _r11 = vld1q_f32(tmp[m][1] + 4);
                float32x4_t _r20 = vld1q_f32(tmp[m][2]);
                float32x4_t _r21 = vld1q_f32(tmp[m][2] + 4);
                float32x4_t _r30 = vld1q_f32(tmp[m][3]);
                float32x4_t _r31 = vld1q_f32(tmp[m][3] + 4);
                float32x4_t _r40 = vld1q_f32(tmp[m][4]);
                float32x4_t _r41 = vld1q_f32(tmp[m][4] + 4);
                float32x4_t _r50 = vld1q_f32(tmp[m][5]);
                float32x4_t _r51 = vld1q_f32(tmp[m][5] + 4);

                float32x4_t _tmp12a0 = vfmaq_laneq_f32(vmulq_laneq_f32(_r10, _coeffs, 0), _r30, _coeffs, 1);
                float32x4_t _tmp12a1 = vfmaq_laneq_f32(vmulq_laneq_f32(_r11, _coeffs, 0), _r31, _coeffs, 1);
                float32x4_t _tmp12b0 = vfmaq_laneq_f32(_r40, _r20, _coeffs, 2);
                float32x4_t _tmp12b1 = vfmaq_laneq_f32(_r41, _r21, _coeffs, 2);
                float32x4_t _tmp34a0 = vfmaq_laneq_f32(vmulq_laneq_f32(_r30, _coeffs, 0), _r10, _coeffs, 1);
                float32x4_t _tmp34a1 = vfmaq_laneq_f32(vmulq_laneq_f32(_r31, _coeffs, 0), _r11, _coeffs, 1);
                float32x4_t _tmp34b0 = vfmaq_laneq_f32(_r40, _r20, _coeffs, 3);
                float32x4_t _tmp34b1 = vfmaq_laneq_f32(_r41, _r21, _coeffs, 3);

                float32x4_t _tmp00 = vfmaq_f32(vaddq_f32(_r00, _r40), _r20, _vm2_5);
                float32x4_t _tmp01 = vfmaq_f32(vaddq_f32(_r01, _r41), _r21, _vm2_5);
                float32x4_t _tmp10 = vsubq_f32(_tmp12b0, _tmp12a0);
                float32x4_t _tmp11 = vsubq_f32(_tmp12b1, _tmp12a1);
                float32x4_t _tmp20 = vaddq_f32(_tmp12b0, _tmp12a0);
                float32x4_t _tmp21 = vaddq_f32(_tmp12b1, _tmp12a1);
                float32x4_t _tmp30 = vaddq_f32(_tmp34b0, _tmp34a0);
                float32x4_t _tmp31 = vaddq_f32(_tmp34b1, _tmp34a1);
                float32x4_t _tmp40 = vsubq_f32(_tmp34b0, _tmp34a0);
                float32x4_t _tmp41 = vsubq_f32(_tmp34b1, _tmp34a1);
                float32x4_t _tmp50 = vfmaq_f32(vaddq_f32(_r10, _r50), _r30, _vm2_5);
                float32x4_t _tmp51 = vfmaq_f32(vaddq_f32(_r11, _r51), _r31, _vm2_5);

                vst1q_f32(p0, _tmp00);
                vst1q_f32(p0 + 4, _tmp01);
                vst1q_f32(p1, _tmp10);
                vst1q_f32(p1 + 4, _tmp11);
                vst1q_f32(p2, _tmp20);
                vst1q_f32(p2 + 4, _tmp21);
                vst1q_f32(p3, _tmp30);
                vst1q_f32(p3 + 4, _tmp31);
                vst1q_f32(p4, _tmp40);
                vst1q_f32(p4 + 4, _tmp41);
                vst1q_f32(p5, _tmp50);
                vst1q_f32(p5 + 4, _tmp51);

                p0 += max_jj * 6 * 8;
                p1 += max_jj * 6 * 8;
                p2 += max_jj * 6 * 8;
                p3 += max_jj * 6 * 8;
                p4 += max_jj * 6 * 8;
                p5 += max_jj * 6 * 8;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 8;
    nn_max_kk = (max_kk - remain_max_kk_start) / 4;
#else // __aarch64__
    nn_max_kk = (max_kk - remain_max_kk_start) / 4;
    #pragma omp parallel for num_threads(nT)
#endif // __aarch64__
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 4;

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[6][6][4];

        const float coeffs[4] = {sq2, -sq2_d2, -2.f, -0.5f};
        float32x4_t _coeffs = vld1q_f32(coeffs);
        float32x4_t _vm2_5 = vdupq_n_f32(-2.5f);

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const unsigned short* r0 = bottom_blob.channel((k + kk) / elempack).row<const unsigned short>(ti * 4) + (tj * 4) * elempack;

            for (int m = 0; m < 6; m++)
            {
                float32x4_t _r0 = vdupq_n_f32(0.f);
                float32x4_t _r1 = vdupq_n_f32(0.f);
                float32x4_t _r2 = vdupq_n_f32(0.f);
                float32x4_t _r3 = vdupq_n_f32(0.f);
                float32x4_t _r4 = vdupq_n_f32(0.f);
                float32x4_t _r5 = vdupq_n_f32(0.f);

                if (ti * 4 + m < h)
                {
                    if (elempack == 4)
                    {
                        _r0 = bfloat2float(vld1_u16(r0));
                        if (tj * 4 + 1 < w) _r1 = bfloat2float(vld1_u16(r0 + 4));
                        if (tj * 4 + 2 < w) _r2 = bfloat2float(vld1_u16(r0 + 8));
                        if (tj * 4 + 3 < w) _r3 = bfloat2float(vld1_u16(r0 + 12));
                        if (tj * 4 + 4 < w) _r4 = bfloat2float(vld1_u16(r0 + 16));
                        if (tj * 4 + 5 < w) _r5 = bfloat2float(vld1_u16(r0 + 20));
                    }
                    if (elempack == 1)
                    {
                        const unsigned short* r1 = r0 + N;
                        const unsigned short* r2 = r0 + N * 2;
                        const unsigned short* r3 = r0 + N * 3;

                        uint16x4_t _t0 = vld1_u16(r0);
                        uint16x4_t _t1 = vld1_u16(r1);
                        uint16x4_t _t2 = vld1_u16(r2);
                        uint16x4_t _t3 = vld1_u16(r3);

                        transpose4x4_u16(_t0, _t1, _t2, _t3);

                        _r0 = bfloat2float(_t0);
                        if (tj * 4 + 1 < w) _r1 = bfloat2float(_t1);
                        if (tj * 4 + 2 < w) _r2 = bfloat2float(_t2);
                        if (tj * 4 + 3 < w) _r3 = bfloat2float(_t3);
                        if (tj * 4 + 4 < w)
                        {
                            unsigned short tmp[4] = {r0[4], r1[4], r2[4], r3[4]};
                            _r4 = bfloat2float(vld1_u16(tmp));
                        }
                        if (tj * 4 + 5 < w)
                        {
                            unsigned short tmp[4] = {r0[5], r1[5], r2[5], r3[5]};
                            _r5 = bfloat2float(vld1_u16(tmp));
                        }
                    }
                }

#if __aarch64__
                float32x4_t _tmp12a = vfmaq_laneq_f32(vmulq_laneq_f32(_r1, _coeffs, 0), _r3, _coeffs, 1);
                float32x4_t _tmp12b = vfmaq_laneq_f32(_r4, _r2, _coeffs, 2);
                float32x4_t _tmp34a = vfmaq_laneq_f32(vmulq_laneq_f32(_r3, _coeffs, 0), _r1, _coeffs, 1);
                float32x4_t _tmp34b = vfmaq_laneq_f32(_r4, _r2, _coeffs, 3);
#else
                float32x4_t _tmp12a = vmlaq_lane_f32(vmulq_lane_f32(_r1, vget_low_f32(_coeffs), 0), _r3, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp12b = vmlaq_lane_f32(_r4, _r2, vget_high_f32(_coeffs), 0);
                float32x4_t _tmp34a = vmlaq_lane_f32(vmulq_lane_f32(_r3, vget_low_f32(_coeffs), 0), _r1, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp34b = vmlaq_lane_f32(_r4, _r2, vget_high_f32(_coeffs), 1);
#endif

#if __aarch64__
                float32x4_t _tmp0 = vfmaq_f32(vaddq_f32(_r0, _r4), _r2, _vm2_5);
#else
                float32x4_t _tmp0 = vmlaq_f32(vaddq_f32(_r0, _r4), _r2, _vm2_5);
#endif
                float32x4_t _tmp1 = vsubq_f32(_tmp12b, _tmp12a);
                float32x4_t _tmp2 = vaddq_f32(_tmp12b, _tmp12a);
                float32x4_t _tmp3 = vaddq_f32(_tmp34b, _tmp34a);
                float32x4_t _tmp4 = vsubq_f32(_tmp34b, _tmp34a);
#if __aarch64__
                float32x4_t _tmp5 = vfmaq_f32(vaddq_f32(_r1, _r5), _r3, _vm2_5);
#else
                float32x4_t _tmp5 = vmlaq_f32(vaddq_f32(_r1, _r5), _r3, _vm2_5);
#endif

                vst1q_f32(tmp[0][m], _tmp0);
                vst1q_f32(tmp[1][m], _tmp1);
                vst1q_f32(tmp[2][m], _tmp2);
                vst1q_f32(tmp[3][m], _tmp3);
                vst1q_f32(tmp[4][m], _tmp4);
                vst1q_f32(tmp[5][m], _tmp5);

                r0 += w * elempack;
            }

            float* p0 = (float*)B + kk * max_jj * 36 + jj * 4;
            float* p1 = p0 + max_jj * 4;
            float* p2 = p0 + max_jj * 4 * 2;
            float* p3 = p0 + max_jj * 4 * 3;
            float* p4 = p0 + max_jj * 4 * 4;
            float* p5 = p0 + max_jj * 4 * 5;

            for (int m = 0; m < 6; m++)
            {
                float32x4_t _r0 = vld1q_f32(tmp[m][0]);
                float32x4_t _r1 = vld1q_f32(tmp[m][1]);
                float32x4_t _r2 = vld1q_f32(tmp[m][2]);
                float32x4_t _r3 = vld1q_f32(tmp[m][3]);
                float32x4_t _r4 = vld1q_f32(tmp[m][4]);
                float32x4_t _r5 = vld1q_f32(tmp[m][5]);

#if __aarch64__
                float32x4_t _tmp12a = vfmaq_laneq_f32(vmulq_laneq_f32(_r1, _coeffs, 0), _r3, _coeffs, 1);
                float32x4_t _tmp12b = vfmaq_laneq_f32(_r4, _r2, _coeffs, 2);
                float32x4_t _tmp34a = vfmaq_laneq_f32(vmulq_laneq_f32(_r3, _coeffs, 0), _r1, _coeffs, 1);
                float32x4_t _tmp34b = vfmaq_laneq_f32(_r4, _r2, _coeffs, 3);
#else
                float32x4_t _tmp12a = vmlaq_lane_f32(vmulq_lane_f32(_r1, vget_low_f32(_coeffs), 0), _r3, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp12b = vmlaq_lane_f32(_r4, _r2, vget_high_f32(_coeffs), 0);
                float32x4_t _tmp34a = vmlaq_lane_f32(vmulq_lane_f32(_r3, vget_low_f32(_coeffs), 0), _r1, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp34b = vmlaq_lane_f32(_r4, _r2, vget_high_f32(_coeffs), 1);
#endif

#if __aarch64__
                float32x4_t _tmp0 = vfmaq_f32(vaddq_f32(_r0, _r4), _r2, _vm2_5);
#else
                float32x4_t _tmp0 = vmlaq_f32(vaddq_f32(_r0, _r4), _r2, _vm2_5);
#endif
                float32x4_t _tmp1 = vsubq_f32(_tmp12b, _tmp12a);
                float32x4_t _tmp2 = vaddq_f32(_tmp12b, _tmp12a);
                float32x4_t _tmp3 = vaddq_f32(_tmp34b, _tmp34a);
                float32x4_t _tmp4 = vsubq_f32(_tmp34b, _tmp34a);
#if __aarch64__
                float32x4_t _tmp5 = vfmaq_f32(vaddq_f32(_r1, _r5), _r3, _vm2_5);
#else
                float32x4_t _tmp5 = vmlaq_f32(vaddq_f32(_r1, _r5), _r3, _vm2_5);
#endif

                vst1q_f32(p0, _tmp0);
                vst1q_f32(p1, _tmp1);
                vst1q_f32(p2, _tmp2);
                vst1q_f32(p3, _tmp3);
                vst1q_f32(p4, _tmp4);
                vst1q_f32(p5, _tmp5);

                p0 += max_jj * 6 * 4;
                p1 += max_jj * 6 * 4;
                p2 += max_jj * 6 * 4;
                p3 += max_jj * 6 * 4;
                p4 += max_jj * 6 * 4;
                p5 += max_jj * 6 * 4;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 4;
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
#else // __ARM_NEON
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
    #pragma omp parallel for num_threads(nT)
#endif // __ARM_NEON
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 2;

#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        float tmp[6][6][2];

#if __ARM_NEON
        const float coeffs[4] = {sq2, -sq2_d2, -2.f, -0.5f};
        float32x4_t _coeffs = vld1q_f32(coeffs);
        float32x2_t _vm2_5 = vdup_n_f32(-2.5f);
#endif

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const unsigned short* r0 = bottom_blob.channel(k + kk).row<const unsigned short>(ti * 4) + (tj * 4);

            for (int m = 0; m < 6; m++)
            {
#if __ARM_NEON
                float32x2_t _r0 = vdup_n_f32(0.f);
                float32x2_t _r1 = vdup_n_f32(0.f);
                float32x2_t _r2 = vdup_n_f32(0.f);
                float32x2_t _r3 = vdup_n_f32(0.f);
                float32x2_t _r4 = vdup_n_f32(0.f);
                float32x2_t _r5 = vdup_n_f32(0.f);
#else
                float r00 = 0.f;
                float r01 = 0.f;
                float r10 = 0.f;
                float r11 = 0.f;
                float r20 = 0.f;
                float r21 = 0.f;
                float r30 = 0.f;
                float r31 = 0.f;
                float r40 = 0.f;
                float r41 = 0.f;
                float r50 = 0.f;
                float r51 = 0.f;
#endif

                if (ti * 4 + m < h)
                {
                    // elempack == 1 is assumed on this path: rows are read unpacked (the row stride below is plain w, not w * elempack)
                    {
                        const unsigned short* r1 = r0 + N;

#if __ARM_NEON
                        uint16x4_t _t0 = vld1_u16(r0);
                        uint16x4_t _t1 = vld1_u16(r1);
                        uint16x4x2_t _t01 = vzip_u16(_t0, _t1);
                        float32x4_t _t0_fp32 = bfloat2float(_t01.val[0]);
                        float32x4_t _t1_fp32 = bfloat2float(_t01.val[1]);

                        _r0 = vget_low_f32(_t0_fp32);
                        if (tj * 4 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
                        if (tj * 4 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
                        if (tj * 4 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
                        if (tj * 4 + 4 < w)
                        {
                            float tmp[2] = {bfloat16_to_float32(r0[4]), bfloat16_to_float32(r1[4])};
                            _r4 = vld1_f32(tmp);
                        }
                        if (tj * 4 + 5 < w)
                        {
                            float tmp[2] = {bfloat16_to_float32(r0[5]), bfloat16_to_float32(r1[5])};
                            _r5 = vld1_f32(tmp);
                        }
#else
                        r00 = bfloat16_to_float32(r0[0]);
                        r01 = bfloat16_to_float32(r1[0]);
                        if (tj * 4 + 1 < w)
                        {
                            r10 = bfloat16_to_float32(r0[1]);
                            r11 = bfloat16_to_float32(r1[1]);
                        }
                        if (tj * 4 + 2 < w)
                        {
                            r20 = bfloat16_to_float32(r0[2]);
                            r21 = bfloat16_to_float32(r1[2]);
                        }
                        if (tj * 4 + 3 < w)
                        {
                            r30 = bfloat16_to_float32(r0[3]);
                            r31 = bfloat16_to_float32(r1[3]);
                        }
                        if (tj * 4 + 4 < w)
                        {
                            r40 = bfloat16_to_float32(r0[4]);
                            r41 = bfloat16_to_float32(r1[4]);
                        }
                        if (tj * 4 + 5 < w)
                        {
                            r50 = bfloat16_to_float32(r0[5]);
                            r51 = bfloat16_to_float32(r1[5]);
                        }
#endif
                    }
                }

#if __ARM_NEON
#if __aarch64__
                float32x2_t _tmp12a = vfma_laneq_f32(vmul_laneq_f32(_r1, _coeffs, 0), _r3, _coeffs, 1);
                float32x2_t _tmp12b = vfma_laneq_f32(_r4, _r2, _coeffs, 2);
                float32x2_t _tmp34a = vfma_laneq_f32(vmul_laneq_f32(_r3, _coeffs, 0), _r1, _coeffs, 1);
                float32x2_t _tmp34b = vfma_laneq_f32(_r4, _r2, _coeffs, 3);
#else
                float32x2_t _tmp12a = vmla_lane_f32(vmul_lane_f32(_r1, vget_low_f32(_coeffs), 0), _r3, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp12b = vmla_lane_f32(_r4, _r2, vget_high_f32(_coeffs), 0);
                float32x2_t _tmp34a = vmla_lane_f32(vmul_lane_f32(_r3, vget_low_f32(_coeffs), 0), _r1, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp34b = vmla_lane_f32(_r4, _r2, vget_high_f32(_coeffs), 1);
#endif

#if __aarch64__
                float32x2_t _tmp0 = vfma_f32(vadd_f32(_r0, _r4), _r2, _vm2_5);
#else
                float32x2_t _tmp0 = vmla_f32(vadd_f32(_r0, _r4), _r2, _vm2_5);
#endif
                float32x2_t _tmp1 = vsub_f32(_tmp12b, _tmp12a);
                float32x2_t _tmp2 = vadd_f32(_tmp12b, _tmp12a);
                float32x2_t _tmp3 = vadd_f32(_tmp34b, _tmp34a);
                float32x2_t _tmp4 = vsub_f32(_tmp34b, _tmp34a);
#if __aarch64__
                float32x2_t _tmp5 = vfma_f32(vadd_f32(_r1, _r5), _r3, _vm2_5);
#else
                float32x2_t _tmp5 = vmla_f32(vadd_f32(_r1, _r5), _r3, _vm2_5);
#endif

                vst1_f32(tmp[0][m], _tmp0);
                vst1_f32(tmp[1][m], _tmp1);
                vst1_f32(tmp[2][m], _tmp2);
                vst1_f32(tmp[3][m], _tmp3);
                vst1_f32(tmp[4][m], _tmp4);
                vst1_f32(tmp[5][m], _tmp5);
#else
                float tmp12a0 = sq2 * r10 - sq2_d2 * r30;
                float tmp12a1 = sq2 * r11 - sq2_d2 * r31;
                float tmp12b0 = r40 - 2 * r20;
                float tmp12b1 = r41 - 2 * r21;
                float tmp34a0 = sq2 * r30 - sq2_d2 * r10;
                float tmp34a1 = sq2 * r31 - sq2_d2 * r11;
                float tmp34b0 = r40 - 0.5f * r20;
                float tmp34b1 = r41 - 0.5f * r21;

                tmp[0][m][0] = r00 + r40 - 2.5f * r20;
                tmp[0][m][1] = r01 + r41 - 2.5f * r21;
                tmp[1][m][0] = tmp12b0 - tmp12a0;
                tmp[1][m][1] = tmp12b1 - tmp12a1;
                tmp[2][m][0] = tmp12b0 + tmp12a0;
                tmp[2][m][1] = tmp12b1 + tmp12a1;
                tmp[3][m][0] = tmp34b0 + tmp34a0;
                tmp[3][m][1] = tmp34b1 + tmp34a1;
                tmp[4][m][0] = tmp34b0 - tmp34a0;
                tmp[4][m][1] = tmp34b1 - tmp34a1;
                tmp[5][m][0] = r10 + r50 - 2.5f * r30;
                tmp[5][m][1] = r11 + r51 - 2.5f * r31;
#endif

                r0 += w;
            }

            float* p0 = (float*)B + kk * max_jj * 36 + jj * 2;
            float* p1 = p0 + max_jj * 2;
            float* p2 = p0 + max_jj * 2 * 2;
            float* p3 = p0 + max_jj * 2 * 3;
            float* p4 = p0 + max_jj * 2 * 4;
            float* p5 = p0 + max_jj * 2 * 5;

            for (int m = 0; m < 6; m++)
            {
#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(tmp[m][0]);
                float32x2_t _r1 = vld1_f32(tmp[m][1]);
                float32x2_t _r2 = vld1_f32(tmp[m][2]);
                float32x2_t _r3 = vld1_f32(tmp[m][3]);
                float32x2_t _r4 = vld1_f32(tmp[m][4]);
                float32x2_t _r5 = vld1_f32(tmp[m][5]);

#if __aarch64__
                float32x2_t _tmp12a = vfma_laneq_f32(vmul_laneq_f32(_r1, _coeffs, 0), _r3, _coeffs, 1);
                float32x2_t _tmp12b = vfma_laneq_f32(_r4, _r2, _coeffs, 2);
                float32x2_t _tmp34a = vfma_laneq_f32(vmul_laneq_f32(_r3, _coeffs, 0), _r1, _coeffs, 1);
                float32x2_t _tmp34b = vfma_laneq_f32(_r4, _r2, _coeffs, 3);
#else
                float32x2_t _tmp12a = vmla_lane_f32(vmul_lane_f32(_r1, vget_low_f32(_coeffs), 0), _r3, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp12b = vmla_lane_f32(_r4, _r2, vget_high_f32(_coeffs), 0);
                float32x2_t _tmp34a = vmla_lane_f32(vmul_lane_f32(_r3, vget_low_f32(_coeffs), 0), _r1, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp34b = vmla_lane_f32(_r4, _r2, vget_high_f32(_coeffs), 1);
#endif

#if __aarch64__
                float32x2_t _tmp0 = vfma_f32(vadd_f32(_r0, _r4), _r2, _vm2_5);
#else
                float32x2_t _tmp0 = vmla_f32(vadd_f32(_r0, _r4), _r2, _vm2_5);
#endif
                float32x2_t _tmp1 = vsub_f32(_tmp12b, _tmp12a);
                float32x2_t _tmp2 = vadd_f32(_tmp12b, _tmp12a);
                float32x2_t _tmp3 = vadd_f32(_tmp34b, _tmp34a);
                float32x2_t _tmp4 = vsub_f32(_tmp34b, _tmp34a);
#if __aarch64__
                float32x2_t _tmp5 = vfma_f32(vadd_f32(_r1, _r5), _r3, _vm2_5);
#else
                float32x2_t _tmp5 = vmla_f32(vadd_f32(_r1, _r5), _r3, _vm2_5);
#endif

                vst1_f32(p0, _tmp0);
                vst1_f32(p1, _tmp1);
                vst1_f32(p2, _tmp2);
                vst1_f32(p3, _tmp3);
                vst1_f32(p4, _tmp4);
                vst1_f32(p5, _tmp5);
#else
                float r00 = tmp[m][0][0];
                float r01 = tmp[m][0][1];
                float r10 = tmp[m][1][0];
                float r11 = tmp[m][1][1];
                float r20 = tmp[m][2][0];
                float r21 = tmp[m][2][1];
                float r30 = tmp[m][3][0];
                float r31 = tmp[m][3][1];
                float r40 = tmp[m][4][0];
                float r41 = tmp[m][4][1];
                float r50 = tmp[m][5][0];
                float r51 = tmp[m][5][1];

                float tmp12a0 = sq2 * r10 - sq2_d2 * r30;
                float tmp12a1 = sq2 * r11 - sq2_d2 * r31;
                float tmp12b0 = r40 - 2 * r20;
                float tmp12b1 = r41 - 2 * r21;
                float tmp34a0 = sq2 * r30 - sq2_d2 * r10;
                float tmp34a1 = sq2 * r31 - sq2_d2 * r11;
                float tmp34b0 = r40 - 0.5f * r20;
                float tmp34b1 = r41 - 0.5f * r21;

                p0[0] = r00 + r40 - 2.5f * r20;
                p0[1] = r01 + r41 - 2.5f * r21;
                p1[0] = tmp12b0 - tmp12a0;
                p1[1] = tmp12b1 - tmp12a1;
                p2[0] = tmp12b0 + tmp12a0;
                p2[1] = tmp12b1 + tmp12a1;
                p3[0] = tmp34b0 + tmp34a0;
                p3[1] = tmp34b1 + tmp34a1;
                p4[0] = tmp34b0 - tmp34a0;
                p4[1] = tmp34b1 - tmp34a1;
                p5[0] = r10 + r50 - 2.5f * r30;
                p5[1] = r11 + r51 - 2.5f * r31;
#endif

                p0 += max_jj * 6 * 2;
                p1 += max_jj * 6 * 2;
                p2 += max_jj * 6 * 2;
                p3 += max_jj * 6 * 2;
                p4 += max_jj * 6 * 2;
                p5 += max_jj * 6 * 2;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 2;
    for (int kk = remain_max_kk_start; kk < max_kk; kk++)
    {
        float tmp[6][6];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const unsigned short* r0123 = bottom_blob.channel(k + kk).row<const unsigned short>(ti * 4) + (tj * 4);

            for (int m = 0; m < 6; m++)
            {
                float r0 = 0.f;
                float r1 = 0.f;
                float r2 = 0.f;
                float r3 = 0.f;
                float r4 = 0.f;
                float r5 = 0.f;

                if (ti * 4 + m < h)
                {
                    // elempack == 1 is assumed on this path: rows are read unpacked (the row stride below is plain w, not w * elempack)
                    {
                        r0 = bfloat16_to_float32(r0123[0]);
                        if (tj * 4 + 1 < w) r1 = bfloat16_to_float32(r0123[1]);
                        if (tj * 4 + 2 < w) r2 = bfloat16_to_float32(r0123[2]);
                        if (tj * 4 + 3 < w) r3 = bfloat16_to_float32(r0123[3]);
                        if (tj * 4 + 4 < w) r4 = bfloat16_to_float32(r0123[4]);
                        if (tj * 4 + 5 < w) r5 = bfloat16_to_float32(r0123[5]);
                    }
                }

                float tmp12a = sq2 * r1 - sq2_d2 * r3;
                float tmp12b = r4 - 2 * r2;
                float tmp34a = sq2 * r3 - sq2_d2 * r1;
                float tmp34b = r4 - 0.5f * r2;

                tmp[0][m] = r0 + r4 - 2.5f * r2;
                tmp[1][m] = tmp12b - tmp12a;
                tmp[2][m] = tmp12b + tmp12a;
                tmp[3][m] = tmp34b + tmp34a;
                tmp[4][m] = tmp34b - tmp34a;
                tmp[5][m] = r1 + r5 - 2.5f * r3;

                r0123 += w;
            }

            float* p0 = (float*)B + kk * max_jj * 36 + jj;
            float* p1 = p0 + max_jj;
            float* p2 = p0 + max_jj * 2;
            float* p3 = p0 + max_jj * 3;
            float* p4 = p0 + max_jj * 4;
            float* p5 = p0 + max_jj * 5;

            for (int m = 0; m < 6; m++)
            {
                float r0 = tmp[m][0];
                float r1 = tmp[m][1];
                float r2 = tmp[m][2];
                float r3 = tmp[m][3];
                float r4 = tmp[m][4];
                float r5 = tmp[m][5];

                float tmp12a = sq2 * r1 - sq2_d2 * r3;
                float tmp12b = r4 - 2 * r2;
                float tmp34a = sq2 * r3 - sq2_d2 * r1;
                float tmp34b = r4 - 0.5f * r2;

                p0[0] = r0 + r4 - 2.5f * r2;
                p1[0] = tmp12b - tmp12a;
                p2[0] = tmp12b + tmp12a;
                p3[0] = tmp34b + tmp34a;
                p4[0] = tmp34b - tmp34a;
                p5[0] = r1 + r5 - 2.5f * r3;

                p0 += max_jj * 6;
                p1 += max_jj * 6;
                p2 += max_jj * 6;
                p3 += max_jj * 6;
                p4 += max_jj * 6;
                p5 += max_jj * 6;
            }
        }
    }
}

static inline void conv3x3s1_winograd43_transform_output_tile_bf16s(const Mat& top_tile, Mat& top_blob, const Mat& bias, int i, int max_ii, int j, int max_jj)
{
    const float sq2 = 1.41421356237;
    const float sq2_m2 = 1.41421356237 * 2;
    const float sq2_d2 = 1.41421356237 / 2;
    const float sq2_d4 = 1.41421356237 / 4;

    // const float otm[4][6] = {
    //     {1.0f, 1.0f,   1.0f,  1.0f,  1.0f,   0.0f},
    //     {0.0f, sq2/2, -sq2/2, sq2,   -sq2,   0.0f},
    //     {0.0f, 0.5f,   0.5f,  2.0f,  2.0f,   0.0f},
    //     {0.0f, sq2/4, -sq2/4, sq2*2, -sq2*2, 1.0f}
    // };

    // 0 = r00 + (r01 + r02) + (r03 + r04)
    // 1 =       (r01 - r02) * sq2_d2 + (r03 - r04) * sq2
    // 2 =       (r01 + r02) * 0.5f + (r03 + r04) * 2
    // 3 = r05 + (r01 - r02) * sq2_d4 + (r03 - r04) * sq2_m2

#if __ARM_NEON
    const float coeffs[6] = {sq2, sq2_d2, sq2_d4, sq2_m2, 0.5f, 2.f};
    float32x4_t _coeffs = vld1q_f32(coeffs);
    float32x2_t _coeffs2 = vld1_f32(coeffs + 4);
#endif // __ARM_NEON

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int out_elempack = top_blob.elempack;
    const size_t N = top_blob.cstep * out_elempack;

    const int w_tiles = (outw + 3) / 4;

    const float* biasptr = bias;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    for (; ii + 7 < max_ii; ii += 8)
    {
        float32x4_t _bias0 = biasptr ? vld1q_f32(biasptr + i + ii) : vdupq_n_f32(0.f);
        float32x4_t _bias1 = biasptr ? vld1q_f32(biasptr + i + ii + 4) : vdupq_n_f32(0.f);

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[4][6][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 36 + jj * 8;
            const float* r1 = r0 + max_jj * 8;
            const float* r2 = r0 + max_jj * 8 * 2;
            const float* r3 = r0 + max_jj * 8 * 3;
            const float* r4 = r0 + max_jj * 8 * 4;
            const float* r5 = r0 + max_jj * 8 * 5;

            for (int m = 0; m < 6; m++)
            {
                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r30 = vld1q_f32(r3);
                float32x4_t _r31 = vld1q_f32(r3 + 4);
                float32x4_t _r40 = vld1q_f32(r4);
                float32x4_t _r41 = vld1q_f32(r4 + 4);
                float32x4_t _r50 = vld1q_f32(r5);
                float32x4_t _r51 = vld1q_f32(r5 + 4);

                float32x4_t _tmp02a0 = vaddq_f32(_r10, _r20);
                float32x4_t _tmp02a1 = vaddq_f32(_r11, _r21);
                float32x4_t _tmp02b0 = vaddq_f32(_r30, _r40);
                float32x4_t _tmp02b1 = vaddq_f32(_r31, _r41);
                float32x4_t _tmp13a0 = vsubq_f32(_r10, _r20);
                float32x4_t _tmp13a1 = vsubq_f32(_r11, _r21);
                float32x4_t _tmp13b0 = vsubq_f32(_r30, _r40);
                float32x4_t _tmp13b1 = vsubq_f32(_r31, _r41);

                float32x4_t _tmp00 = vaddq_f32(vaddq_f32(_r00, _tmp02a0), _tmp02b0);
                float32x4_t _tmp01 = vaddq_f32(vaddq_f32(_r01, _tmp02a1), _tmp02b1);
                float32x4_t _tmp10 = vfmaq_laneq_f32(vmulq_laneq_f32(_tmp13a0, _coeffs, 1), _tmp13b0, _coeffs, 0);
                float32x4_t _tmp11 = vfmaq_laneq_f32(vmulq_laneq_f32(_tmp13a1, _coeffs, 1), _tmp13b1, _coeffs, 0);
                float32x4_t _tmp20 = vfmaq_lane_f32(vmulq_lane_f32(_tmp02a0, _coeffs2, 0), _tmp02b0, _coeffs2, 1);
                float32x4_t _tmp21 = vfmaq_lane_f32(vmulq_lane_f32(_tmp02a1, _coeffs2, 0), _tmp02b1, _coeffs2, 1);
                float32x4_t _tmp30 = vfmaq_laneq_f32(vfmaq_laneq_f32(_r50, _tmp13a0, _coeffs, 2), _tmp13b0, _coeffs, 3);
                float32x4_t _tmp31 = vfmaq_laneq_f32(vfmaq_laneq_f32(_r51, _tmp13a1, _coeffs, 2), _tmp13b1, _coeffs, 3);

                vst1q_f32(tmp[0][m], _tmp00);
                vst1q_f32(tmp[0][m] + 4, _tmp01);
                vst1q_f32(tmp[1][m], _tmp10);
                vst1q_f32(tmp[1][m] + 4, _tmp11);
                vst1q_f32(tmp[2][m], _tmp20);
                vst1q_f32(tmp[2][m] + 4, _tmp21);
                vst1q_f32(tmp[3][m], _tmp30);
                vst1q_f32(tmp[3][m] + 4, _tmp31);

                r0 += max_jj * 6 * 8;
                r1 += max_jj * 6 * 8;
                r2 += max_jj * 6 * 8;
                r3 += max_jj * 6 * 8;
                r4 += max_jj * 6 * 8;
                r5 += max_jj * 6 * 8;
            }

            unsigned short* outptr0 = top_blob.channel((i + ii) / out_elempack).row<unsigned short>(ti * 4) + (tj * 4) * out_elempack;

            for (int m = 0; m < 4; m++)
            {
                if (ti * 4 + m >= outh)
                    continue;

                float32x4_t _r00 = vld1q_f32(tmp[m][0]);
                float32x4_t _r01 = vld1q_f32(tmp[m][0] + 4);
                float32x4_t _r10 = vld1q_f32(tmp[m][1]);
                float32x4_t _r11 = vld1q_f32(tmp[m][1] + 4);
                float32x4_t _r20 = vld1q_f32(tmp[m][2]);
                float32x4_t _r21 = vld1q_f32(tmp[m][2] + 4);
                float32x4_t _r30 = vld1q_f32(tmp[m][3]);
                float32x4_t _r31 = vld1q_f32(tmp[m][3] + 4);
                float32x4_t _r40 = vld1q_f32(tmp[m][4]);
                float32x4_t _r41 = vld1q_f32(tmp[m][4] + 4);
                float32x4_t _r50 = vld1q_f32(tmp[m][5]);
                float32x4_t _r51 = vld1q_f32(tmp[m][5] + 4);

                float32x4_t _tmp02a0 = vaddq_f32(_r10, _r20);
                float32x4_t _tmp02a1 = vaddq_f32(_r11, _r21);
                float32x4_t _tmp02b0 = vaddq_f32(_r30, _r40);
                float32x4_t _tmp02b1 = vaddq_f32(_r31, _r41);
                float32x4_t _tmp13a0 = vsubq_f32(_r10, _r20);
                float32x4_t _tmp13a1 = vsubq_f32(_r11, _r21);
                float32x4_t _tmp13b0 = vsubq_f32(_r30, _r40);
                float32x4_t _tmp13b1 = vsubq_f32(_r31, _r41);

                float32x4_t _tmp00 = vaddq_f32(vaddq_f32(_r00, _tmp02a0), vaddq_f32(_tmp02b0, _bias0));
                float32x4_t _tmp01 = vaddq_f32(vaddq_f32(_r01, _tmp02a1), vaddq_f32(_tmp02b1, _bias1));
                float32x4_t _tmp10 = vfmaq_laneq_f32(vfmaq_laneq_f32(_bias0, _tmp13a0, _coeffs, 1), _tmp13b0, _coeffs, 0);
                float32x4_t _tmp11 = vfmaq_laneq_f32(vfmaq_laneq_f32(_bias1, _tmp13a1, _coeffs, 1), _tmp13b1, _coeffs, 0);
                float32x4_t _tmp20 = vfmaq_lane_f32(vfmaq_lane_f32(_bias0, _tmp02a0, _coeffs2, 0), _tmp02b0, _coeffs2, 1);
                float32x4_t _tmp21 = vfmaq_lane_f32(vfmaq_lane_f32(_bias1, _tmp02a1, _coeffs2, 0), _tmp02b1, _coeffs2, 1);
                float32x4_t _tmp30 = vfmaq_laneq_f32(vfmaq_laneq_f32(vaddq_f32(_r50, _bias0), _tmp13a0, _coeffs, 2), _tmp13b0, _coeffs, 3);
                float32x4_t _tmp31 = vfmaq_laneq_f32(vfmaq_laneq_f32(vaddq_f32(_r51, _bias1), _tmp13a1, _coeffs, 2), _tmp13b1, _coeffs, 3);

                if (out_elempack == 4)
                {
                    unsigned short* outptr1 = outptr0 + N;

                    vst1_u16(outptr0, float2bfloat(_tmp00));
                    vst1_u16(outptr1, float2bfloat(_tmp01));
                    if (tj * 4 + 1 < outw)
                    {
                        vst1_u16(outptr0 + 4, float2bfloat(_tmp10));
                        vst1_u16(outptr1 + 4, float2bfloat(_tmp11));
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        vst1_u16(outptr0 + 8, float2bfloat(_tmp20));
                        vst1_u16(outptr1 + 8, float2bfloat(_tmp21));
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        vst1_u16(outptr0 + 12, float2bfloat(_tmp30));
                        vst1_u16(outptr1 + 12, float2bfloat(_tmp31));
                    }
                }
                if (out_elempack == 1)
                {
                    unsigned short tmp0[8];
                    unsigned short tmp1[8];
                    unsigned short tmp2[8];
                    unsigned short tmp3[8];
                    vst1_u16(tmp0, float2bfloat(_tmp00));
                    vst1_u16(tmp0 + 4, float2bfloat(_tmp01));
                    vst1_u16(tmp1, float2bfloat(_tmp10));
                    vst1_u16(tmp1 + 4, float2bfloat(_tmp11));
                    vst1_u16(tmp2, float2bfloat(_tmp20));
                    vst1_u16(tmp2 + 4, float2bfloat(_tmp21));
                    vst1_u16(tmp3, float2bfloat(_tmp30));
                    vst1_u16(tmp3 + 4, float2bfloat(_tmp31));

                    unsigned short* outptr1 = outptr0 + N;
                    unsigned short* outptr2 = outptr0 + N * 2;
                    unsigned short* outptr3 = outptr0 + N * 3;
                    unsigned short* outptr4 = outptr0 + N * 4;
                    unsigned short* outptr5 = outptr0 + N * 5;
                    unsigned short* outptr6 = outptr0 + N * 6;
                    unsigned short* outptr7 = outptr0 + N * 7;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];
                    outptr4[0] = tmp0[4];
                    outptr5[0] = tmp0[5];
                    outptr6[0] = tmp0[6];
                    outptr7[0] = tmp0[7];
                    if (tj * 4 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                        outptr4[1] = tmp1[4];
                        outptr5[1] = tmp1[5];
                        outptr6[1] = tmp1[6];
                        outptr7[1] = tmp1[7];
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        outptr0[2] = tmp2[0];
                        outptr1[2] = tmp2[1];
                        outptr2[2] = tmp2[2];
                        outptr3[2] = tmp2[3];
                        outptr4[2] = tmp2[4];
                        outptr5[2] = tmp2[5];
                        outptr6[2] = tmp2[6];
                        outptr7[2] = tmp2[7];
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        outptr0[3] = tmp3[0];
                        outptr1[3] = tmp3[1];
                        outptr2[3] = tmp3[2];
                        outptr3[3] = tmp3[3];
                        outptr4[3] = tmp3[4];
                        outptr5[3] = tmp3[5];
                        outptr6[3] = tmp3[6];
                        outptr7[3] = tmp3[7];
                    }
                }

                outptr0 += outw * out_elempack;
            }
        }
    }
#endif // __aarch64__
    for (; ii + 3 < max_ii; ii += 4)
    {
        float32x4_t _bias0 = biasptr ? vld1q_f32(biasptr + i + ii) : vdupq_n_f32(0.f);

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[4][6][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 36 + jj * 4;
            const float* r1 = r0 + max_jj * 4;
            const float* r2 = r0 + max_jj * 4 * 2;
            const float* r3 = r0 + max_jj * 4 * 3;
            const float* r4 = r0 + max_jj * 4 * 4;
            const float* r5 = r0 + max_jj * 4 * 5;

            for (int m = 0; m < 6; m++)
            {
                float32x4_t _r0 = vld1q_f32(r0);
                float32x4_t _r1 = vld1q_f32(r1);
                float32x4_t _r2 = vld1q_f32(r2);
                float32x4_t _r3 = vld1q_f32(r3);
                float32x4_t _r4 = vld1q_f32(r4);
                float32x4_t _r5 = vld1q_f32(r5);

                float32x4_t _tmp02a = vaddq_f32(_r1, _r2);
                float32x4_t _tmp02b = vaddq_f32(_r3, _r4);
                float32x4_t _tmp13a = vsubq_f32(_r1, _r2);
                float32x4_t _tmp13b = vsubq_f32(_r3, _r4);

                float32x4_t _tmp0 = vaddq_f32(vaddq_f32(_r0, _tmp02a), _tmp02b);
#if __aarch64__
                float32x4_t _tmp1 = vfmaq_laneq_f32(vmulq_laneq_f32(_tmp13a, _coeffs, 1), _tmp13b, _coeffs, 0);
                float32x4_t _tmp2 = vfmaq_lane_f32(vmulq_lane_f32(_tmp02a, _coeffs2, 0), _tmp02b, _coeffs2, 1);
                float32x4_t _tmp3 = vfmaq_laneq_f32(vfmaq_laneq_f32(_r5, _tmp13a, _coeffs, 2), _tmp13b, _coeffs, 3);
#else
                float32x4_t _tmp1 = vmlaq_lane_f32(vmulq_lane_f32(_tmp13a, vget_low_f32(_coeffs), 1), _tmp13b, vget_low_f32(_coeffs), 0);
                float32x4_t _tmp2 = vmlaq_lane_f32(vmulq_lane_f32(_tmp02a, _coeffs2, 0), _tmp02b, _coeffs2, 1);
                float32x4_t _tmp3 = vmlaq_lane_f32(vmlaq_lane_f32(_r5, _tmp13a, vget_high_f32(_coeffs), 0), _tmp13b, vget_high_f32(_coeffs), 1);
#endif

                vst1q_f32(tmp[0][m], _tmp0);
                vst1q_f32(tmp[1][m], _tmp1);
                vst1q_f32(tmp[2][m], _tmp2);
                vst1q_f32(tmp[3][m], _tmp3);

                r0 += max_jj * 6 * 4;
                r1 += max_jj * 6 * 4;
                r2 += max_jj * 6 * 4;
                r3 += max_jj * 6 * 4;
                r4 += max_jj * 6 * 4;
                r5 += max_jj * 6 * 4;
            }

            unsigned short* outptr0 = top_blob.channel((i + ii) / out_elempack).row<unsigned short>(ti * 4) + (tj * 4) * out_elempack;

            for (int m = 0; m < 4; m++)
            {
                if (ti * 4 + m >= outh)
                    continue;

                float32x4_t _r0 = vld1q_f32(tmp[m][0]);
                float32x4_t _r1 = vld1q_f32(tmp[m][1]);
                float32x4_t _r2 = vld1q_f32(tmp[m][2]);
                float32x4_t _r3 = vld1q_f32(tmp[m][3]);
                float32x4_t _r4 = vld1q_f32(tmp[m][4]);
                float32x4_t _r5 = vld1q_f32(tmp[m][5]);

                float32x4_t _tmp02a = vaddq_f32(_r1, _r2);
                float32x4_t _tmp02b = vaddq_f32(_r3, _r4);
                float32x4_t _tmp13a = vsubq_f32(_r1, _r2);
                float32x4_t _tmp13b = vsubq_f32(_r3, _r4);

                float32x4_t _tmp0 = vaddq_f32(vaddq_f32(_r0, _tmp02a), vaddq_f32(_tmp02b, _bias0));
#if __aarch64__
                float32x4_t _tmp1 = vfmaq_laneq_f32(vfmaq_laneq_f32(_bias0, _tmp13a, _coeffs, 1), _tmp13b, _coeffs, 0);
                float32x4_t _tmp2 = vfmaq_lane_f32(vfmaq_lane_f32(_bias0, _tmp02a, _coeffs2, 0), _tmp02b, _coeffs2, 1);
                float32x4_t _tmp3 = vfmaq_laneq_f32(vfmaq_laneq_f32(vaddq_f32(_r5, _bias0), _tmp13a, _coeffs, 2), _tmp13b, _coeffs, 3);
#else
                float32x4_t _tmp1 = vmlaq_lane_f32(vmlaq_lane_f32(_bias0, _tmp13a, vget_low_f32(_coeffs), 1), _tmp13b, vget_low_f32(_coeffs), 0);
                float32x4_t _tmp2 = vmlaq_lane_f32(vmlaq_lane_f32(_bias0, _tmp02a, _coeffs2, 0), _tmp02b, _coeffs2, 1);
                float32x4_t _tmp3 = vmlaq_lane_f32(vmlaq_lane_f32(vaddq_f32(_r5, _bias0), _tmp13a, vget_high_f32(_coeffs), 0), _tmp13b, vget_high_f32(_coeffs), 1);
#endif

                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_tmp0));
                    if (tj * 4 + 1 < outw) vst1_u16(outptr0 + 4, float2bfloat(_tmp1));
                    if (tj * 4 + 2 < outw) vst1_u16(outptr0 + 8, float2bfloat(_tmp2));
                    if (tj * 4 + 3 < outw) vst1_u16(outptr0 + 12, float2bfloat(_tmp3));
                }
                if (out_elempack == 1)
                {
                    unsigned short tmp0[4];
                    unsigned short tmp1[4];
                    unsigned short tmp2[4];
                    unsigned short tmp3[4];
                    vst1_u16(tmp0, float2bfloat(_tmp0));
                    vst1_u16(tmp1, float2bfloat(_tmp1));
                    vst1_u16(tmp2, float2bfloat(_tmp2));
                    vst1_u16(tmp3, float2bfloat(_tmp3));

                    unsigned short* outptr1 = outptr0 + N;
                    unsigned short* outptr2 = outptr0 + N * 2;
                    unsigned short* outptr3 = outptr0 + N * 3;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];
                    if (tj * 4 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        outptr0[2] = tmp2[0];
                        outptr1[2] = tmp2[1];
                        outptr2[2] = tmp2[2];
                        outptr3[2] = tmp2[3];
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        outptr0[3] = tmp3[0];
                        outptr1[3] = tmp3[1];
                        outptr2[3] = tmp3[2];
                        outptr3[3] = tmp3[3];
                    }
                }

                outptr0 += outw * out_elempack;
            }
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
#if __ARM_NEON
        float32x2_t _bias0 = biasptr ? vld1_f32(biasptr + i + ii) : vdup_n_f32(0.f);
#else
        float bias0 = biasptr ? biasptr[i + ii] : 0.f;
        float bias1 = biasptr ? biasptr[i + ii + 1] : 0.f;
#endif

#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        float tmp[4][6][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 36 + jj * 2;
            const float* r1 = r0 + max_jj * 2;
            const float* r2 = r0 + max_jj * 2 * 2;
            const float* r3 = r0 + max_jj * 2 * 3;
            const float* r4 = r0 + max_jj * 2 * 4;
            const float* r5 = r0 + max_jj * 2 * 5;

            for (int m = 0; m < 6; m++)
            {
#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(r0);
                float32x2_t _r1 = vld1_f32(r1);
                float32x2_t _r2 = vld1_f32(r2);
                float32x2_t _r3 = vld1_f32(r3);
                float32x2_t _r4 = vld1_f32(r4);
                float32x2_t _r5 = vld1_f32(r5);

                float32x2_t _tmp02a = vadd_f32(_r1, _r2);
                float32x2_t _tmp02b = vadd_f32(_r3, _r4);
                float32x2_t _tmp13a = vsub_f32(_r1, _r2);
                float32x2_t _tmp13b = vsub_f32(_r3, _r4);

                float32x2_t _tmp0 = vadd_f32(vadd_f32(_r0, _tmp02a), _tmp02b);
#if __aarch64__
                float32x2_t _tmp1 = vfma_laneq_f32(vmul_laneq_f32(_tmp13a, _coeffs, 1), _tmp13b, _coeffs, 0);
                float32x2_t _tmp2 = vfma_lane_f32(vmul_lane_f32(_tmp02a, _coeffs2, 0), _tmp02b, _coeffs2, 1);
                float32x2_t _tmp3 = vfma_laneq_f32(vfma_laneq_f32(_r5, _tmp13a, _coeffs, 2), _tmp13b, _coeffs, 3);
#else
                float32x2_t _tmp1 = vmla_lane_f32(vmul_lane_f32(_tmp13a, vget_low_f32(_coeffs), 1), _tmp13b, vget_low_f32(_coeffs), 0);
                float32x2_t _tmp2 = vmla_lane_f32(vmul_lane_f32(_tmp02a, _coeffs2, 0), _tmp02b, _coeffs2, 1);
                float32x2_t _tmp3 = vmla_lane_f32(vmla_lane_f32(_r5, _tmp13a, vget_high_f32(_coeffs), 0), _tmp13b, vget_high_f32(_coeffs), 1);
#endif

                vst1_f32(tmp[0][m], _tmp0);
                vst1_f32(tmp[1][m], _tmp1);
                vst1_f32(tmp[2][m], _tmp2);
                vst1_f32(tmp[3][m], _tmp3);
#else
                float tmp02a0 = r1[0] + r2[0];
                float tmp02a1 = r1[1] + r2[1];
                float tmp02b0 = r3[0] + r4[0];
                float tmp02b1 = r3[1] + r4[1];
                float tmp13a0 = r1[0] - r2[0];
                float tmp13a1 = r1[1] - r2[1];
                float tmp13b0 = r3[0] - r4[0];
                float tmp13b1 = r3[1] - r4[1];

                tmp[0][m][0] = r0[0] + tmp02a0 + tmp02b0;
                tmp[0][m][1] = r0[1] + tmp02a1 + tmp02b1;
                tmp[1][m][0] = tmp13a0 * sq2_d2 + tmp13b0 * sq2;
                tmp[1][m][1] = tmp13a1 * sq2_d2 + tmp13b1 * sq2;
                tmp[2][m][0] = tmp02a0 * 0.5f + tmp02b0 * 2;
                tmp[2][m][1] = tmp02a1 * 0.5f + tmp02b1 * 2;
                tmp[3][m][0] = r5[0] + tmp13a0 * sq2_d4 + tmp13b0 * sq2_m2;
                tmp[3][m][1] = r5[1] + tmp13a1 * sq2_d4 + tmp13b1 * sq2_m2;
#endif

                r0 += max_jj * 6 * 2;
                r1 += max_jj * 6 * 2;
                r2 += max_jj * 6 * 2;
                r3 += max_jj * 6 * 2;
                r4 += max_jj * 6 * 2;
                r5 += max_jj * 6 * 2;
            }

            unsigned short* outptr0 = top_blob.channel(i + ii).row<unsigned short>(ti * 4) + (tj * 4);

            for (int m = 0; m < 4; m++)
            {
                if (ti * 4 + m >= outh)
                    continue;

#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(tmp[m][0]);
                float32x2_t _r1 = vld1_f32(tmp[m][1]);
                float32x2_t _r2 = vld1_f32(tmp[m][2]);
                float32x2_t _r3 = vld1_f32(tmp[m][3]);
                float32x2_t _r4 = vld1_f32(tmp[m][4]);
                float32x2_t _r5 = vld1_f32(tmp[m][5]);

                float32x2_t _tmp02a = vadd_f32(_r1, _r2);
                float32x2_t _tmp02b = vadd_f32(_r3, _r4);
                float32x2_t _tmp13a = vsub_f32(_r1, _r2);
                float32x2_t _tmp13b = vsub_f32(_r3, _r4);

                float32x2_t _tmp0 = vadd_f32(vadd_f32(_r0, _tmp02a), vadd_f32(_tmp02b, _bias0));
#if __aarch64__
                float32x2_t _tmp1 = vfma_laneq_f32(vfma_laneq_f32(_bias0, _tmp13a, _coeffs, 1), _tmp13b, _coeffs, 0);
                float32x2_t _tmp2 = vfma_lane_f32(vfma_lane_f32(_bias0, _tmp02a, _coeffs2, 0), _tmp02b, _coeffs2, 1);
                float32x2_t _tmp3 = vfma_laneq_f32(vfma_laneq_f32(vadd_f32(_r5, _bias0), _tmp13a, _coeffs, 2), _tmp13b, _coeffs, 3);
#else
                float32x2_t _tmp1 = vmla_lane_f32(vmla_lane_f32(_bias0, _tmp13a, vget_low_f32(_coeffs), 1), _tmp13b, vget_low_f32(_coeffs), 0);
                float32x2_t _tmp2 = vmla_lane_f32(vmla_lane_f32(_bias0, _tmp02a, _coeffs2, 0), _tmp02b, _coeffs2, 1);
                float32x2_t _tmp3 = vmla_lane_f32(vmla_lane_f32(vadd_f32(_r5, _bias0), _tmp13a, vget_high_f32(_coeffs), 0), _tmp13b, vget_high_f32(_coeffs), 1);
#endif
#else
                float r00 = tmp[m][0][0];
                float r01 = tmp[m][0][1];
                float r10 = tmp[m][1][0];
                float r11 = tmp[m][1][1];
                float r20 = tmp[m][2][0];
                float r21 = tmp[m][2][1];
                float r30 = tmp[m][3][0];
                float r31 = tmp[m][3][1];
                float r40 = tmp[m][4][0];
                float r41 = tmp[m][4][1];
                float r50 = tmp[m][5][0];
                float r51 = tmp[m][5][1];

                float tmp02a0 = r10 + r20;
                float tmp02a1 = r11 + r21;
                float tmp02b0 = r30 + r40;
                float tmp02b1 = r31 + r41;
                float tmp13a0 = r10 - r20;
                float tmp13a1 = r11 - r21;
                float tmp13b0 = r30 - r40;
                float tmp13b1 = r31 - r41;

                float tmp00 = bias0 + r00 + tmp02a0 + tmp02b0;
                float tmp01 = bias1 + r01 + tmp02a1 + tmp02b1;
                float tmp10 = bias0 + tmp13a0 * sq2_d2 + tmp13b0 * sq2;
                float tmp11 = bias1 + tmp13a1 * sq2_d2 + tmp13b1 * sq2;
                float tmp20 = bias0 + tmp02a0 * 0.5f + tmp02b0 * 2;
                float tmp21 = bias1 + tmp02a1 * 0.5f + tmp02b1 * 2;
                float tmp30 = bias0 + r50 + tmp13a0 * sq2_d4 + tmp13b0 * sq2_m2;
                float tmp31 = bias1 + r51 + tmp13a1 * sq2_d4 + tmp13b1 * sq2_m2;
#endif

                // if (out_elempack == 1)
                {
                    unsigned short* outptr1 = outptr0 + N;

#if __ARM_NEON
                    uint16x4_t _tmp01 = float2bfloat(vcombine_f32(_tmp0, _tmp1));
                    uint16x4_t _tmp23 = float2bfloat(vcombine_f32(_tmp2, _tmp3));

                    outptr0[0] = vget_lane_u16(_tmp01, 0);
                    outptr1[0] = vget_lane_u16(_tmp01, 1);
                    if (tj * 4 + 1 < outw)
                    {
                        outptr0[1] = vget_lane_u16(_tmp01, 2);
                        outptr1[1] = vget_lane_u16(_tmp01, 3);
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        outptr0[2] = vget_lane_u16(_tmp23, 0);
                        outptr1[2] = vget_lane_u16(_tmp23, 1);
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        outptr0[3] = vget_lane_u16(_tmp23, 2);
                        outptr1[3] = vget_lane_u16(_tmp23, 3);
                    }
#else
                    outptr0[0] = float32_to_bfloat16(tmp00);
                    outptr1[0] = float32_to_bfloat16(tmp01);
                    if (tj * 4 + 1 < outw)
                    {
                        outptr0[1] = float32_to_bfloat16(tmp10);
                        outptr1[1] = float32_to_bfloat16(tmp11);
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        outptr0[2] = float32_to_bfloat16(tmp20);
                        outptr1[2] = float32_to_bfloat16(tmp21);
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        outptr0[3] = float32_to_bfloat16(tmp30);
                        outptr1[3] = float32_to_bfloat16(tmp31);
                    }
#endif
                }

                outptr0 += outw;
            }
        }
    }
    for (; ii < max_ii; ii++)
    {
        float bias0 = biasptr ? biasptr[i + ii] : 0.f;

        float tmp[4][6];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 36 + jj;
            const float* r1 = r0 + max_jj;
            const float* r2 = r0 + max_jj * 2;
            const float* r3 = r0 + max_jj * 3;
            const float* r4 = r0 + max_jj * 4;
            const float* r5 = r0 + max_jj * 5;

            for (int m = 0; m < 6; m++)
            {
                float tmp02a = r1[0] + r2[0];
                float tmp02b = r3[0] + r4[0];
                float tmp13a = r1[0] - r2[0];
                float tmp13b = r3[0] - r4[0];

                tmp[0][m] = r0[0] + tmp02a + tmp02b;
                tmp[1][m] = tmp13a * sq2_d2 + tmp13b * sq2;
                tmp[2][m] = tmp02a * 0.5f + tmp02b * 2;
                tmp[3][m] = r5[0] + tmp13a * sq2_d4 + tmp13b * sq2_m2;

                r0 += max_jj * 6;
                r1 += max_jj * 6;
                r2 += max_jj * 6;
                r3 += max_jj * 6;
                r4 += max_jj * 6;
                r5 += max_jj * 6;
            }

            unsigned short* outptr0 = top_blob.channel(i + ii).row<unsigned short>(ti * 4) + (tj * 4);

            for (int m = 0; m < 4; m++)
            {
                if (ti * 4 + m >= outh)
                    continue;

                float r0 = tmp[m][0];
                float r1 = tmp[m][1];
                float r2 = tmp[m][2];
                float r3 = tmp[m][3];
                float r4 = tmp[m][4];
                float r5 = tmp[m][5];

                float tmp02a = r1 + r2;
                float tmp02b = r3 + r4;
                float tmp13a = r1 - r2;
                float tmp13b = r3 - r4;

                float tmp0 = bias0 + r0 + tmp02a + tmp02b;
                float tmp1 = bias0 + tmp13a * sq2_d2 + tmp13b * sq2;
                float tmp2 = bias0 + tmp02a * 0.5f + tmp02b * 2;
                float tmp3 = bias0 + r5 + tmp13a * sq2_d4 + tmp13b * sq2_m2;

                // if (out_elempack == 1)
                {
                    outptr0[0] = float32_to_bfloat16(tmp0);
                    if (tj * 4 + 1 < outw) outptr0[1] = float32_to_bfloat16(tmp1);
                    if (tj * 4 + 2 < outw) outptr0[2] = float32_to_bfloat16(tmp2);
                    if (tj * 4 + 3 < outw) outptr0[3] = float32_to_bfloat16(tmp3);
                }

                outptr0 += outw;
            }
        }
    }
}

static int conv3x3s1_winograd43_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
    int outw = top_blob.w;
    int outh = top_blob.h;

    // pad to 4n+2, winograd F(4,3)
    int w_tiles = (outw + 3) / 4;
    int h_tiles = (outh + 3) / 4;
    int tiles = w_tiles * h_tiles;

    const int M = top_blob.c * top_blob.elempack;
    const int N = tiles;
    const int K = bottom_blob.c * bottom_blob.elempack;
    const int B = 36;

    // NCNN_LOGE("conv3x3s1_winograd43_bf16s %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    conv3x3s1_winograd_get_optimal_tile_mnk(M, N, K, B, TILE_M, TILE_N, TILE_K, nT);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

    Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    if (nT > 1 && nn_NK < nT)
    {
        Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
        if (B_tile.empty())
            return -100;

        for (int ppjk = 0; ppjk < nn_NK; ppjk++)
        {
            const int ppj = ppjk / nn_K;
            const int ppk = ppjk % nn_K;

            const int j = ppj * TILE_N;
            const int k = ppk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            // transform input
            conv3x3s1_winograd43_transform_input_tile_bf16s(bottom_blob, B_tile, j, max_jj, k, max_kk, nT);

            Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

            conv3x3s1_winograd_transpose_pack_B_tile(B_tile, BT_tile, B, max_jj, max_kk, nT);
        }
    }
    else
    {
        Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
        if (B_tileX.empty())
            return -100;

        #pragma omp parallel for num_threads(nT)
        for (int ppjk = 0; ppjk < nn_NK; ppjk++)
        {
            const int ppj = ppjk / nn_K;
            const int ppk = ppjk % nn_K;

            const int j = ppj * TILE_N;
            const int k = ppk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            Mat B_tile = B_tileX.channel(get_omp_thread_num());

            // transform input
            conv3x3s1_winograd43_transform_input_tile_bf16s(bottom_blob, B_tile, j, max_jj, k, max_kk, 1);

            Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

            conv3x3s1_winograd_transpose_pack_B_tile(B_tile, BT_tile, B, max_jj, max_kk, 1);
        }
    }

    Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
    if (top_tileX.empty())
        return -100;

    #pragma omp parallel for num_threads(nT)
    for (int ppj = 0; ppj < nn_M; ppj++)
    {
        const int i = ppj * TILE_M;

        Mat top_tile = top_tileX.channel(get_omp_thread_num());

        const int max_ii = std::min((M - i), TILE_M);

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                const Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K);

                const Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

                conv3x3s1_winograd_gemm_transB_packed_tile(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk, opt.use_a53_a55_optimized_kernel);
            }

            // transform output
            conv3x3s1_winograd43_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj);
        }
    }

    return 0;
}

static inline void conv3x3s1_winograd63_transform_input_tile_bf16s(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
{
    // const float itm[8][8] = {
    //     {1.0f, 0.0f,-5.25f, 0.00f, 5.25f, 0.00f,-1.0f, 0.0f},
    //     {0.0f, 1.0f, 1.00f,-4.25f,-4.25f, 1.00f, 1.0f, 0.0f},
    //     {0.0f,-1.0f, 1.00f, 4.25f,-4.25f,-1.00f, 1.0f, 0.0f},
    //     {0.0f, 0.5f, 0.25f,-2.50f,-1.25f, 2.00f, 1.0f, 0.0f},
    //     {0.0f,-0.5f, 0.25f, 2.50f,-1.25f,-2.00f, 1.0f, 0.0f},
    //     {0.0f, 2.0f, 4.00f,-2.50f,-5.00f, 0.50f, 1.0f, 0.0f},
    //     {0.0f,-2.0f, 4.00f, 2.50f,-5.00f,-0.50f, 1.0f, 0.0f},
    //     {0.0f,-1.0f, 0.00f, 5.25f, 0.00f,-5.25f, 0.0f, 1.0f}
    // };

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int elempack = bottom_blob.elempack;
    const size_t N = bottom_blob.cstep * elempack;

    const int w_tiles = (w + 3) / 6;

    int nn_max_kk = 0;
    int remain_max_kk_start = 0;
#if __ARM_NEON
#if __aarch64__
    nn_max_kk = (max_kk - remain_max_kk_start) / 8;
    #pragma omp parallel for num_threads(nT)
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 8;

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[8][8][8];

        const float coeffs[8] = {5.25f, -4.25f, -1.25f, 0.25f, -2.5f, 0.5f, 2.f, 4.f};
        float32x4_t _coeffs = vld1q_f32(coeffs);
        float32x4_t _coeffs2 = vld1q_f32(coeffs + 4);

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const unsigned short* r0 = bottom_blob.channel((k + kk) / elempack).row<const unsigned short>(ti * 6) + (tj * 6) * elempack;

            for (int m = 0; m < 8; m++)
            {
                float32x4_t _r00 = vdupq_n_f32(0.f);
                float32x4_t _r01 = vdupq_n_f32(0.f);
                float32x4_t _r10 = vdupq_n_f32(0.f);
                float32x4_t _r11 = vdupq_n_f32(0.f);
                float32x4_t _r20 = vdupq_n_f32(0.f);
                float32x4_t _r21 = vdupq_n_f32(0.f);
                float32x4_t _r30 = vdupq_n_f32(0.f);
                float32x4_t _r31 = vdupq_n_f32(0.f);
                float32x4_t _r40 = vdupq_n_f32(0.f);
                float32x4_t _r41 = vdupq_n_f32(0.f);
                float32x4_t _r50 = vdupq_n_f32(0.f);
                float32x4_t _r51 = vdupq_n_f32(0.f);
                float32x4_t _r60 = vdupq_n_f32(0.f);
                float32x4_t _r61 = vdupq_n_f32(0.f);
                float32x4_t _r70 = vdupq_n_f32(0.f);
                float32x4_t _r71 = vdupq_n_f32(0.f);

                if (ti * 6 + m < h)
                {
                    if (elempack == 4)
                    {
                        const unsigned short* r1 = r0 + N;

                        _r00 = bfloat2float(vld1_u16(r0));
                        _r01 = bfloat2float(vld1_u16(r1));
                        if (tj * 6 + 1 < w)
                        {
                            _r10 = bfloat2float(vld1_u16(r0 + 4));
                            _r11 = bfloat2float(vld1_u16(r1 + 4));
                        }
                        if (tj * 6 + 2 < w)
                        {
                            _r20 = bfloat2float(vld1_u16(r0 + 8));
                            _r21 = bfloat2float(vld1_u16(r1 + 8));
                        }
                        if (tj * 6 + 3 < w)
                        {
                            _r30 = bfloat2float(vld1_u16(r0 + 12));
                            _r31 = bfloat2float(vld1_u16(r1 + 12));
                        }
                        if (tj * 6 + 4 < w)
                        {
                            _r40 = bfloat2float(vld1_u16(r0 + 16));
                            _r41 = bfloat2float(vld1_u16(r1 + 16));
                        }
                        if (tj * 6 + 5 < w)
                        {
                            _r50 = bfloat2float(vld1_u16(r0 + 20));
                            _r51 = bfloat2float(vld1_u16(r1 + 20));
                        }
                        if (tj * 6 + 6 < w)
                        {
                            _r60 = bfloat2float(vld1_u16(r0 + 24));
                            _r61 = bfloat2float(vld1_u16(r1 + 24));
                        }
                        if (tj * 6 + 7 < w)
                        {
                            _r70 = bfloat2float(vld1_u16(r0 + 28));
                            _r71 = bfloat2float(vld1_u16(r1 + 28));
                        }
                    }
                    if (elempack == 1)
                    {
                        const unsigned short* r1 = r0 + N;
                        const unsigned short* r2 = r0 + N * 2;
                        const unsigned short* r3 = r0 + N * 3;
                        const unsigned short* r4 = r0 + N * 4;
                        const unsigned short* r5 = r0 + N * 5;
                        const unsigned short* r6 = r0 + N * 6;
                        const unsigned short* r7 = r0 + N * 7;

                        uint16x4_t _t0 = vld1_u16(r0);
                        uint16x4_t _t1 = vld1_u16(r1);
                        uint16x4_t _t2 = vld1_u16(r2);
                        uint16x4_t _t3 = vld1_u16(r3);
                        uint16x4_t _t4 = vld1_u16(r4);
                        uint16x4_t _t5 = vld1_u16(r5);
                        uint16x4_t _t6 = vld1_u16(r6);
                        uint16x4_t _t7 = vld1_u16(r7);

                        transpose4x4_u16(_t0, _t1, _t2, _t3);
                        transpose4x4_u16(_t4, _t5, _t6, _t7);

                        _r00 = bfloat2float(_t0);
                        _r01 = bfloat2float(_t4);
                        if (tj * 6 + 1 < w)
                        {
                            _r10 = bfloat2float(_t1);
                            _r11 = bfloat2float(_t5);
                        }
                        if (tj * 6 + 2 < w)
                        {
                            _r20 = bfloat2float(_t2);
                            _r21 = bfloat2float(_t6);
                        }
                        if (tj * 6 + 3 < w)
                        {
                            _r30 = bfloat2float(_t3);
                            _r31 = bfloat2float(_t7);
                        }
                        if (tj * 6 + 4 < w)
                        {
                            _t0 = vld1_u16(r0 + 4);
                            _t1 = vld1_u16(r1 + 4);
                            _t2 = vld1_u16(r2 + 4);
                            _t3 = vld1_u16(r3 + 4);
                            _t4 = vld1_u16(r4 + 4);
                            _t5 = vld1_u16(r5 + 4);
                            _t6 = vld1_u16(r6 + 4);
                            _t7 = vld1_u16(r7 + 4);

                            transpose4x4_u16(_t0, _t1, _t2, _t3);
                            transpose4x4_u16(_t4, _t5, _t6, _t7);

                            _r40 = bfloat2float(_t0);
                            _r41 = bfloat2float(_t4);
                            if (tj * 6 + 5 < w)
                            {
                                _r50 = bfloat2float(_t1);
                                _r51 = bfloat2float(_t5);
                            }
                            if (tj * 6 + 6 < w)
                            {
                                _r60 = bfloat2float(_t2);
                                _r61 = bfloat2float(_t6);
                            }
                            if (tj * 6 + 7 < w)
                            {
                                _r70 = bfloat2float(_t3);
                                _r71 = bfloat2float(_t7);
                            }
                        }
                    }
                }

                float32x4_t _tmp12a0 = vfmaq_laneq_f32(vaddq_f32(_r20, _r60), _r40, _coeffs, 1);
                float32x4_t _tmp12a1 = vfmaq_laneq_f32(vaddq_f32(_r21, _r61), _r41, _coeffs, 1);
                float32x4_t _tmp12b0 = vfmaq_laneq_f32(vaddq_f32(_r10, _r50), _r30, _coeffs, 1);
                float32x4_t _tmp12b1 = vfmaq_laneq_f32(vaddq_f32(_r11, _r51), _r31, _coeffs, 1);
                float32x4_t _tmp34a0 = vfmaq_laneq_f32(vfmaq_laneq_f32(_r60, _r20, _coeffs, 3), _r40, _coeffs, 2);
                float32x4_t _tmp34a1 = vfmaq_laneq_f32(vfmaq_laneq_f32(_r61, _r21, _coeffs, 3), _r41, _coeffs, 2);
                float32x4_t _tmp34b0 = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r10, _coeffs2, 1), _r30, _coeffs2, 0), _r50, _coeffs2, 2);
                float32x4_t _tmp34b1 = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r11, _coeffs2, 1), _r31, _coeffs2, 0), _r51, _coeffs2, 2);
                float32x4_t _tmp56a0 = vfmaq_laneq_f32(_r60, vfmaq_laneq_f32(_r20, _r40, _coeffs, 2), _coeffs2, 3);
                float32x4_t _tmp56a1 = vfmaq_laneq_f32(_r61, vfmaq_laneq_f32(_r21, _r41, _coeffs, 2), _coeffs2, 3);
                float32x4_t _tmp56b0 = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r10, _coeffs2, 2), _r30, _coeffs2, 0), _r50, _coeffs2, 1);
                float32x4_t _tmp56b1 = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r11, _coeffs2, 2), _r31, _coeffs2, 0), _r51, _coeffs2, 1);

                float32x4_t _tmp00 = vfmaq_laneq_f32(vsubq_f32(_r00, _r60), vsubq_f32(_r40, _r20), _coeffs, 0);
                float32x4_t _tmp01 = vfmaq_laneq_f32(vsubq_f32(_r01, _r61), vsubq_f32(_r41, _r21), _coeffs, 0);
                float32x4_t _tmp10 = vaddq_f32(_tmp12a0, _tmp12b0);
                float32x4_t _tmp11 = vaddq_f32(_tmp12a1, _tmp12b1);
                float32x4_t _tmp20 = vsubq_f32(_tmp12a0, _tmp12b0);
                float32x4_t _tmp21 = vsubq_f32(_tmp12a1, _tmp12b1);
                float32x4_t _tmp30 = vaddq_f32(_tmp34a0, _tmp34b0);
                float32x4_t _tmp31 = vaddq_f32(_tmp34a1, _tmp34b1);
                float32x4_t _tmp40 = vsubq_f32(_tmp34a0, _tmp34b0);
                float32x4_t _tmp41 = vsubq_f32(_tmp34a1, _tmp34b1);
                float32x4_t _tmp50 = vaddq_f32(_tmp56a0, _tmp56b0);
                float32x4_t _tmp51 = vaddq_f32(_tmp56a1, _tmp56b1);
                float32x4_t _tmp60 = vsubq_f32(_tmp56a0, _tmp56b0);
                float32x4_t _tmp61 = vsubq_f32(_tmp56a1, _tmp56b1);
                float32x4_t _tmp70 = vfmaq_laneq_f32(vsubq_f32(_r70, _r10), vsubq_f32(_r30, _r50), _coeffs, 0);
                float32x4_t _tmp71 = vfmaq_laneq_f32(vsubq_f32(_r71, _r11), vsubq_f32(_r31, _r51), _coeffs, 0);

                vst1q_f32(tmp[0][m], _tmp00);
                vst1q_f32(tmp[0][m] + 4, _tmp01);
                vst1q_f32(tmp[1][m], _tmp10);
                vst1q_f32(tmp[1][m] + 4, _tmp11);
                vst1q_f32(tmp[2][m], _tmp20);
                vst1q_f32(tmp[2][m] + 4, _tmp21);
                vst1q_f32(tmp[3][m], _tmp30);
                vst1q_f32(tmp[3][m] + 4, _tmp31);
                vst1q_f32(tmp[4][m], _tmp40);
                vst1q_f32(tmp[4][m] + 4, _tmp41);
                vst1q_f32(tmp[5][m], _tmp50);
                vst1q_f32(tmp[5][m] + 4, _tmp51);
                vst1q_f32(tmp[6][m], _tmp60);
                vst1q_f32(tmp[6][m] + 4, _tmp61);
                vst1q_f32(tmp[7][m], _tmp70);
                vst1q_f32(tmp[7][m] + 4, _tmp71);

                r0 += w * elempack;
            }

            float* p0 = (float*)B + kk * max_jj * 64 + jj * 8;
            float* p1 = p0 + max_jj * 8;
            float* p2 = p0 + max_jj * 8 * 2;
            float* p3 = p0 + max_jj * 8 * 3;
            float* p4 = p0 + max_jj * 8 * 4;
            float* p5 = p0 + max_jj * 8 * 5;
            float* p6 = p0 + max_jj * 8 * 6;
            float* p7 = p0 + max_jj * 8 * 7;

            for (int m = 0; m < 8; m++)
            {
                float32x4_t _r00 = vld1q_f32(tmp[m][0]);
                float32x4_t _r01 = vld1q_f32(tmp[m][0] + 4);
                float32x4_t _r10 = vld1q_f32(tmp[m][1]);
                float32x4_t _r11 = vld1q_f32(tmp[m][1] + 4);
                float32x4_t _r20 = vld1q_f32(tmp[m][2]);
                float32x4_t _r21 = vld1q_f32(tmp[m][2] + 4);
                float32x4_t _r30 = vld1q_f32(tmp[m][3]);
                float32x4_t _r31 = vld1q_f32(tmp[m][3] + 4);
                float32x4_t _r40 = vld1q_f32(tmp[m][4]);
                float32x4_t _r41 = vld1q_f32(tmp[m][4] + 4);
                float32x4_t _r50 = vld1q_f32(tmp[m][5]);
                float32x4_t _r51 = vld1q_f32(tmp[m][5] + 4);
                float32x4_t _r60 = vld1q_f32(tmp[m][6]);
                float32x4_t _r61 = vld1q_f32(tmp[m][6] + 4);
                float32x4_t _r70 = vld1q_f32(tmp[m][7]);
                float32x4_t _r71 = vld1q_f32(tmp[m][7] + 4);

                float32x4_t _tmp12a0 = vfmaq_laneq_f32(vaddq_f32(_r20, _r60), _r40, _coeffs, 1);
                float32x4_t _tmp12a1 = vfmaq_laneq_f32(vaddq_f32(_r21, _r61), _r41, _coeffs, 1);
                float32x4_t _tmp12b0 = vfmaq_laneq_f32(vaddq_f32(_r10, _r50), _r30, _coeffs, 1);
                float32x4_t _tmp12b1 = vfmaq_laneq_f32(vaddq_f32(_r11, _r51), _r31, _coeffs, 1);
                float32x4_t _tmp34a0 = vfmaq_laneq_f32(vfmaq_laneq_f32(_r60, _r20, _coeffs, 3), _r40, _coeffs, 2);
                float32x4_t _tmp34a1 = vfmaq_laneq_f32(vfmaq_laneq_f32(_r61, _r21, _coeffs, 3), _r41, _coeffs, 2);
                float32x4_t _tmp34b0 = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r10, _coeffs2, 1), _r30, _coeffs2, 0), _r50, _coeffs2, 2);
                float32x4_t _tmp34b1 = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r11, _coeffs2, 1), _r31, _coeffs2, 0), _r51, _coeffs2, 2);
                float32x4_t _tmp56a0 = vfmaq_laneq_f32(_r60, vfmaq_laneq_f32(_r20, _r40, _coeffs, 2), _coeffs2, 3);
                float32x4_t _tmp56a1 = vfmaq_laneq_f32(_r61, vfmaq_laneq_f32(_r21, _r41, _coeffs, 2), _coeffs2, 3);
                float32x4_t _tmp56b0 = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r10, _coeffs2, 2), _r30, _coeffs2, 0), _r50, _coeffs2, 1);
                float32x4_t _tmp56b1 = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r11, _coeffs2, 2), _r31, _coeffs2, 0), _r51, _coeffs2, 1);

                float32x4_t _tmp00 = vfmaq_laneq_f32(vsubq_f32(_r00, _r60), vsubq_f32(_r40, _r20), _coeffs, 0);
                float32x4_t _tmp01 = vfmaq_laneq_f32(vsubq_f32(_r01, _r61), vsubq_f32(_r41, _r21), _coeffs, 0);
                float32x4_t _tmp10 = vaddq_f32(_tmp12a0, _tmp12b0);
                float32x4_t _tmp11 = vaddq_f32(_tmp12a1, _tmp12b1);
                float32x4_t _tmp20 = vsubq_f32(_tmp12a0, _tmp12b0);
                float32x4_t _tmp21 = vsubq_f32(_tmp12a1, _tmp12b1);
                float32x4_t _tmp30 = vaddq_f32(_tmp34a0, _tmp34b0);
                float32x4_t _tmp31 = vaddq_f32(_tmp34a1, _tmp34b1);
                float32x4_t _tmp40 = vsubq_f32(_tmp34a0, _tmp34b0);
                float32x4_t _tmp41 = vsubq_f32(_tmp34a1, _tmp34b1);
                float32x4_t _tmp50 = vaddq_f32(_tmp56a0, _tmp56b0);
                float32x4_t _tmp51 = vaddq_f32(_tmp56a1, _tmp56b1);
                float32x4_t _tmp60 = vsubq_f32(_tmp56a0, _tmp56b0);
                float32x4_t _tmp61 = vsubq_f32(_tmp56a1, _tmp56b1);
                float32x4_t _tmp70 = vfmaq_laneq_f32(vsubq_f32(_r70, _r10), vsubq_f32(_r30, _r50), _coeffs, 0);
                float32x4_t _tmp71 = vfmaq_laneq_f32(vsubq_f32(_r71, _r11), vsubq_f32(_r31, _r51), _coeffs, 0);

                vst1q_f32(p0, _tmp00);
                vst1q_f32(p0 + 4, _tmp01);
                vst1q_f32(p1, _tmp10);
                vst1q_f32(p1 + 4, _tmp11);
                vst1q_f32(p2, _tmp20);
                vst1q_f32(p2 + 4, _tmp21);
                vst1q_f32(p3, _tmp30);
                vst1q_f32(p3 + 4, _tmp31);
                vst1q_f32(p4, _tmp40);
                vst1q_f32(p4 + 4, _tmp41);
                vst1q_f32(p5, _tmp50);
                vst1q_f32(p5 + 4, _tmp51);
                vst1q_f32(p6, _tmp60);
                vst1q_f32(p6 + 4, _tmp61);
                vst1q_f32(p7, _tmp70);
                vst1q_f32(p7 + 4, _tmp71);

                p0 += max_jj * 8 * 8;
                p1 += max_jj * 8 * 8;
                p2 += max_jj * 8 * 8;
                p3 += max_jj * 8 * 8;
                p4 += max_jj * 8 * 8;
                p5 += max_jj * 8 * 8;
                p6 += max_jj * 8 * 8;
                p7 += max_jj * 8 * 8;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 8;
    nn_max_kk = (max_kk - remain_max_kk_start) / 4;
#else // __aarch64__
    nn_max_kk = (max_kk - remain_max_kk_start) / 4;
    #pragma omp parallel for num_threads(nT)
#endif // __aarch64__
    // Transforms the remaining channels four at a time. Each iteration handles
    // channels [k + kk, k + kk + 4) for every tile jj in [0, max_jj): it gathers
    // an 8x8 window of bf16 input (zero-filled outside w/h), widens it to fp32,
    // applies the same 8-point 1-D transform first across the columns of each
    // row and then across the rows, and writes the 64 results into B.
    //
    // With inputs r0..r7 the 1-D transform produces
    //   t0   = r0 - r6 + 5.25 * (r4 - r2)
    //   t1/2 = (r2 + r6 - 4.25 * r4) +/- (r1 + r5 - 4.25 * r3)
    //   t3/4 = (r6 + 0.25 * r2 - 1.25 * r4) +/- (0.5 * r1 - 2.5 * r3 + 2 * r5)
    //   t5/6 = (r6 + 4 * r2 - 5 * r4) +/- (2 * r1 - 2.5 * r3 + 0.5 * r5)
    //   t7   = r7 - r1 + 5.25 * (r3 - r5)
    // NOTE(review): these coefficients and the 8x8 window with a step of 6 look
    // like the Winograd F(6x6, 3x3) input transform - confirm against the
    // enclosing function, whose header is outside this excerpt.
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        // first channel of this group, relative to k
        const int kk = remain_max_kk_start + ppkk * 4;

        // Scratch between the two passes. The first pass stores to tmp[i][m]
        // and the second pass loads tmp[m][i], so the 8x8 grid is transposed
        // in between; the innermost dimension holds the 4 channels.
#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[8][8][4];

        // _coeffs  = {5.25, -4.25, -1.25, 0.25}
        // _coeffs2 = {-2.5,  0.5,   2,    4}
        const float coeffs[8] = {5.25f, -4.25f, -1.25f, 0.25f, -2.5f, 0.5f, 2.f, 4.f};
        float32x4_t _coeffs = vld1q_f32(coeffs);
        float32x4_t _coeffs2 = vld1q_f32(coeffs + 4);

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            // tile row / tile column of the (j + jj)-th tile
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // top-left element of the window: tiles advance by 6 while 8 rows/columns are read
            const unsigned short* r0 = bottom_blob.channel((k + kk) / elempack).row<const unsigned short>(ti * 6) + (tj * 6) * elempack;

            // first pass: one input row per iteration, transformed across its 8 columns
            for (int m = 0; m < 8; m++)
            {
                // columns/rows that fall outside the input keep this zero value
                float32x4_t _r0 = vdupq_n_f32(0.f);
                float32x4_t _r1 = vdupq_n_f32(0.f);
                float32x4_t _r2 = vdupq_n_f32(0.f);
                float32x4_t _r3 = vdupq_n_f32(0.f);
                float32x4_t _r4 = vdupq_n_f32(0.f);
                float32x4_t _r5 = vdupq_n_f32(0.f);
                float32x4_t _r6 = vdupq_n_f32(0.f);
                float32x4_t _r7 = vdupq_n_f32(0.f);

                if (ti * 6 + m < h)
                {
                    if (elempack == 4)
                    {
                        // packed layout: the 4 channels of one pixel are contiguous,
                        // so each column is a single 4 x u16 load
                        _r0 = bfloat2float(vld1_u16(r0));
                        if (tj * 6 + 1 < w) _r1 = bfloat2float(vld1_u16(r0 + 4));
                        if (tj * 6 + 2 < w) _r2 = bfloat2float(vld1_u16(r0 + 8));
                        if (tj * 6 + 3 < w) _r3 = bfloat2float(vld1_u16(r0 + 12));
                        if (tj * 6 + 4 < w) _r4 = bfloat2float(vld1_u16(r0 + 16));
                        if (tj * 6 + 5 < w) _r5 = bfloat2float(vld1_u16(r0 + 20));
                        if (tj * 6 + 6 < w) _r6 = bfloat2float(vld1_u16(r0 + 24));
                        if (tj * 6 + 7 < w) _r7 = bfloat2float(vld1_u16(r0 + 28));
                    }
                    if (elempack == 1)
                    {
                        // unpacked layout: the same row of 4 channels is read from
                        // r0, r0 + N, r0 + 2N, r0 + 3N.
                        // NOTE(review): N is defined outside this excerpt - presumably
                        // the per-channel element stride of bottom_blob; confirm.
                        const unsigned short* r1 = r0 + N;
                        const unsigned short* r2 = r0 + N * 2;
                        const unsigned short* r3 = r0 + N * 3;

                        // NOTE(review): each load fetches 4 columns before the per-column
                        // bound checks below; lanes past w are discarded, but the memory
                        // is still read - presumably covered by allocation padding; confirm.
                        uint16x4_t _t0 = vld1_u16(r0);
                        uint16x4_t _t1 = vld1_u16(r1);
                        uint16x4_t _t2 = vld1_u16(r2);
                        uint16x4_t _t3 = vld1_u16(r3);

                        // after the transpose each vector holds one column across the 4 channels
                        transpose4x4_u16(_t0, _t1, _t2, _t3);

                        _r0 = bfloat2float(_t0);
                        if (tj * 6 + 1 < w) _r1 = bfloat2float(_t1);
                        if (tj * 6 + 2 < w) _r2 = bfloat2float(_t2);
                        if (tj * 6 + 3 < w) _r3 = bfloat2float(_t3);
                        if (tj * 6 + 4 < w)
                        {
                            // columns 4..7 are only loaded when column 4 is inside the input
                            _t0 = vld1_u16(r0 + 4);
                            _t1 = vld1_u16(r1 + 4);
                            _t2 = vld1_u16(r2 + 4);
                            _t3 = vld1_u16(r3 + 4);

                            transpose4x4_u16(_t0, _t1, _t2, _t3);

                            _r4 = bfloat2float(_t0);
                            if (tj * 6 + 5 < w) _r5 = bfloat2float(_t1);
                            if (tj * 6 + 6 < w) _r6 = bfloat2float(_t2);
                            if (tj * 6 + 7 < w) _r7 = bfloat2float(_t3);
                        }
                    }
                }

                // even-index (a) and odd-index (b) partial sums shared by the output pairs 1/2, 3/4, 5/6
#if __aarch64__
                float32x4_t _tmp12a = vfmaq_laneq_f32(vaddq_f32(_r2, _r6), _r4, _coeffs, 1);
                float32x4_t _tmp12b = vfmaq_laneq_f32(vaddq_f32(_r1, _r5), _r3, _coeffs, 1);
                float32x4_t _tmp34a = vfmaq_laneq_f32(vfmaq_laneq_f32(_r6, _r2, _coeffs, 3), _r4, _coeffs, 2);
                float32x4_t _tmp34b = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r1, _coeffs2, 1), _r3, _coeffs2, 0), _r5, _coeffs2, 2);
                float32x4_t _tmp56a = vfmaq_laneq_f32(_r6, vfmaq_laneq_f32(_r2, _r4, _coeffs, 2), _coeffs2, 3);
                float32x4_t _tmp56b = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r1, _coeffs2, 2), _r3, _coeffs2, 0), _r5, _coeffs2, 1);
#else
                // 32-bit ARM: lane selection works on 64-bit halves, so lanes 2/3 become high-half lanes 0/1
                float32x4_t _tmp12a = vmlaq_lane_f32(vaddq_f32(_r2, _r6), _r4, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp12b = vmlaq_lane_f32(vaddq_f32(_r1, _r5), _r3, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp34a = vmlaq_lane_f32(vmlaq_lane_f32(_r6, _r2, vget_high_f32(_coeffs), 1), _r4, vget_high_f32(_coeffs), 0);
                float32x4_t _tmp34b = vmlaq_lane_f32(vmlaq_lane_f32(vmulq_lane_f32(_r1, vget_low_f32(_coeffs2), 1), _r3, vget_low_f32(_coeffs2), 0), _r5, vget_high_f32(_coeffs2), 0);
                float32x4_t _tmp56a = vmlaq_lane_f32(_r6, vmlaq_lane_f32(_r2, _r4, vget_high_f32(_coeffs), 0), vget_high_f32(_coeffs2), 1);
                float32x4_t _tmp56b = vmlaq_lane_f32(vmlaq_lane_f32(vmulq_lane_f32(_r1, vget_high_f32(_coeffs2), 0), _r3, vget_low_f32(_coeffs2), 0), _r5, vget_low_f32(_coeffs2), 1);
#endif

#if __aarch64__
                float32x4_t _tmp0 = vfmaq_laneq_f32(vsubq_f32(_r0, _r6), vsubq_f32(_r4, _r2), _coeffs, 0);
#else
                float32x4_t _tmp0 = vmlaq_lane_f32(vsubq_f32(_r0, _r6), vsubq_f32(_r4, _r2), vget_low_f32(_coeffs), 0);
#endif
                float32x4_t _tmp1 = vaddq_f32(_tmp12a, _tmp12b);
                float32x4_t _tmp2 = vsubq_f32(_tmp12a, _tmp12b);
                float32x4_t _tmp3 = vaddq_f32(_tmp34a, _tmp34b);
                float32x4_t _tmp4 = vsubq_f32(_tmp34a, _tmp34b);
                float32x4_t _tmp5 = vaddq_f32(_tmp56a, _tmp56b);
                float32x4_t _tmp6 = vsubq_f32(_tmp56a, _tmp56b);
#if __aarch64__
                float32x4_t _tmp7 = vfmaq_laneq_f32(vsubq_f32(_r7, _r1), vsubq_f32(_r3, _r5), _coeffs, 0);
#else
                float32x4_t _tmp7 = vmlaq_lane_f32(vsubq_f32(_r7, _r1), vsubq_f32(_r3, _r5), vget_low_f32(_coeffs), 0);
#endif

                // transposed store: output index first, source row second
                vst1q_f32(tmp[0][m], _tmp0);
                vst1q_f32(tmp[1][m], _tmp1);
                vst1q_f32(tmp[2][m], _tmp2);
                vst1q_f32(tmp[3][m], _tmp3);
                vst1q_f32(tmp[4][m], _tmp4);
                vst1q_f32(tmp[5][m], _tmp5);
                vst1q_f32(tmp[6][m], _tmp6);
                vst1q_f32(tmp[7][m], _tmp7);

                // next input row
                r0 += w * elempack;
            }

            // Output slot s (0..63) of this tile lives at
            // B + kk * max_jj * 64 + s * max_jj * 4 + jj * 4 (4 floats, one per channel).
            // p0..p7 cover 8 consecutive slots; they advance by 8 slots per m below.
            float* p0 = (float*)B + kk * max_jj * 64 + jj * 4;
            float* p1 = p0 + max_jj * 4;
            float* p2 = p0 + max_jj * 4 * 2;
            float* p3 = p0 + max_jj * 4 * 3;
            float* p4 = p0 + max_jj * 4 * 4;
            float* p5 = p0 + max_jj * 4 * 5;
            float* p6 = p0 + max_jj * 4 * 6;
            float* p7 = p0 + max_jj * 4 * 7;

            // second pass: same 1-D transform applied across the 8 rows of first-pass output m
            for (int m = 0; m < 8; m++)
            {
                float32x4_t _r0 = vld1q_f32(tmp[m][0]);
                float32x4_t _r1 = vld1q_f32(tmp[m][1]);
                float32x4_t _r2 = vld1q_f32(tmp[m][2]);
                float32x4_t _r3 = vld1q_f32(tmp[m][3]);
                float32x4_t _r4 = vld1q_f32(tmp[m][4]);
                float32x4_t _r5 = vld1q_f32(tmp[m][5]);
                float32x4_t _r6 = vld1q_f32(tmp[m][6]);
                float32x4_t _r7 = vld1q_f32(tmp[m][7]);

#if __aarch64__
                float32x4_t _tmp12a = vfmaq_laneq_f32(vaddq_f32(_r2, _r6), _r4, _coeffs, 1);
                float32x4_t _tmp12b = vfmaq_laneq_f32(vaddq_f32(_r1, _r5), _r3, _coeffs, 1);
                float32x4_t _tmp34a = vfmaq_laneq_f32(vfmaq_laneq_f32(_r6, _r2, _coeffs, 3), _r4, _coeffs, 2);
                float32x4_t _tmp34b = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r1, _coeffs2, 1), _r3, _coeffs2, 0), _r5, _coeffs2, 2);
                float32x4_t _tmp56a = vfmaq_laneq_f32(_r6, vfmaq_laneq_f32(_r2, _r4, _coeffs, 2), _coeffs2, 3);
                float32x4_t _tmp56b = vfmaq_laneq_f32(vfmaq_laneq_f32(vmulq_laneq_f32(_r1, _coeffs2, 2), _r3, _coeffs2, 0), _r5, _coeffs2, 1);
#else
                float32x4_t _tmp12a = vmlaq_lane_f32(vaddq_f32(_r2, _r6), _r4, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp12b = vmlaq_lane_f32(vaddq_f32(_r1, _r5), _r3, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp34a = vmlaq_lane_f32(vmlaq_lane_f32(_r6, _r2, vget_high_f32(_coeffs), 1), _r4, vget_high_f32(_coeffs), 0);
                float32x4_t _tmp34b = vmlaq_lane_f32(vmlaq_lane_f32(vmulq_lane_f32(_r1, vget_low_f32(_coeffs2), 1), _r3, vget_low_f32(_coeffs2), 0), _r5, vget_high_f32(_coeffs2), 0);
                float32x4_t _tmp56a = vmlaq_lane_f32(_r6, vmlaq_lane_f32(_r2, _r4, vget_high_f32(_coeffs), 0), vget_high_f32(_coeffs2), 1);
                float32x4_t _tmp56b = vmlaq_lane_f32(vmlaq_lane_f32(vmulq_lane_f32(_r1, vget_high_f32(_coeffs2), 0), _r3, vget_low_f32(_coeffs2), 0), _r5, vget_low_f32(_coeffs2), 1);
#endif

#if __aarch64__
                float32x4_t _tmp0 = vfmaq_laneq_f32(vsubq_f32(_r0, _r6), vsubq_f32(_r4, _r2), _coeffs, 0);
#else
                float32x4_t _tmp0 = vmlaq_lane_f32(vsubq_f32(_r0, _r6), vsubq_f32(_r4, _r2), vget_low_f32(_coeffs), 0);
#endif
                float32x4_t _tmp1 = vaddq_f32(_tmp12a, _tmp12b);
                float32x4_t _tmp2 = vsubq_f32(_tmp12a, _tmp12b);
                float32x4_t _tmp3 = vaddq_f32(_tmp34a, _tmp34b);
                float32x4_t _tmp4 = vsubq_f32(_tmp34a, _tmp34b);
                float32x4_t _tmp5 = vaddq_f32(_tmp56a, _tmp56b);
                float32x4_t _tmp6 = vsubq_f32(_tmp56a, _tmp56b);
#if __aarch64__
                float32x4_t _tmp7 = vfmaq_laneq_f32(vsubq_f32(_r7, _r1), vsubq_f32(_r3, _r5), _coeffs, 0);
#else
                float32x4_t _tmp7 = vmlaq_lane_f32(vsubq_f32(_r7, _r1), vsubq_f32(_r3, _r5), vget_low_f32(_coeffs), 0);
#endif

                vst1q_f32(p0, _tmp0);
                vst1q_f32(p1, _tmp1);
                vst1q_f32(p2, _tmp2);
                vst1q_f32(p3, _tmp3);
                vst1q_f32(p4, _tmp4);
                vst1q_f32(p5, _tmp5);
                vst1q_f32(p6, _tmp6);
                vst1q_f32(p7, _tmp7);

                // skip the 8 slots just written
                p0 += max_jj * 8 * 4;
                p1 += max_jj * 8 * 4;
                p2 += max_jj * 8 * 4;
                p3 += max_jj * 8 * 4;
                p4 += max_jj * 8 * 4;
                p5 += max_jj * 8 * 4;
                p6 += max_jj * 8 * 4;
                p7 += max_jj * 8 * 4;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 4;
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
#else // __ARM_NEON
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
    #pragma omp parallel for num_threads(nT)
#endif // __ARM_NEON
    // Transforms the remaining channels two at a time, with a NEON path and a
    // plain scalar fallback. Each iteration handles channels k + kk and
    // k + kk + 1 for every tile jj in [0, max_jj): it gathers an 8x8 window of
    // bf16 input (zero-filled outside w/h), widens it to fp32, applies the same
    // 8-point 1-D transform first across the columns of each row and then
    // across the rows, and writes the 64 results into B.
    //
    // With inputs r0..r7 the 1-D transform produces
    //   t0   = r0 - r6 + 5.25 * (r4 - r2)
    //   t1/2 = (r2 + r6 - 4.25 * r4) +/- (r1 + r5 - 4.25 * r3)
    //   t3/4 = (r6 + 0.25 * r2 - 1.25 * r4) +/- (0.5 * r1 - 2.5 * r3 + 2 * r5)
    //   t5/6 = (r6 + 4 * r2 - 5 * r4) +/- (2 * r1 - 2.5 * r3 + 0.5 * r5)
    //   t7   = r7 - r1 + 5.25 * (r3 - r5)
    // NOTE(review): these coefficients and the 8x8 window with a step of 6 look
    // like the Winograd F(6x6, 3x3) input transform - confirm against the
    // enclosing function, whose header is outside this excerpt.
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        // first channel of this pair, relative to k
        const int kk = remain_max_kk_start + ppkk * 2;

        // Scratch between the two passes. The first pass stores to tmp[i][m]
        // and the second pass loads tmp[m][i], so the 8x8 grid is transposed
        // in between; the innermost dimension holds the 2 channels.
#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        float tmp[8][8][2];

#if __ARM_NEON
        // _coeffs  = {5.25, -4.25, -1.25, 0.25}
        // _coeffs2 = {-2.5,  0.5,   2,    4}
        const float coeffs[8] = {5.25f, -4.25f, -1.25f, 0.25f, -2.5f, 0.5f, 2.f, 4.f};
        float32x4_t _coeffs = vld1q_f32(coeffs);
        float32x4_t _coeffs2 = vld1q_f32(coeffs + 4);
#endif

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            // tile row / tile column of the (j + jj)-th tile
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // top-left element of the window; the channel is indexed directly and no
            // elempack factor is applied, i.e. this path treats the input as unpacked
            const unsigned short* r0 = bottom_blob.channel(k + kk).row<const unsigned short>(ti * 6) + (tj * 6);

            // first pass: one input row per iteration, transformed across its 8 columns
            for (int m = 0; m < 8; m++)
            {
                // columns/rows that fall outside the input keep this zero value
#if __ARM_NEON
                float32x2_t _r0 = vdup_n_f32(0.f);
                float32x2_t _r1 = vdup_n_f32(0.f);
                float32x2_t _r2 = vdup_n_f32(0.f);
                float32x2_t _r3 = vdup_n_f32(0.f);
                float32x2_t _r4 = vdup_n_f32(0.f);
                float32x2_t _r5 = vdup_n_f32(0.f);
                float32x2_t _r6 = vdup_n_f32(0.f);
                float32x2_t _r7 = vdup_n_f32(0.f);
#else
                // rXY: column X, channel Y (0 = first channel of the pair, 1 = second)
                float r00 = 0.f;
                float r01 = 0.f;
                float r10 = 0.f;
                float r11 = 0.f;
                float r20 = 0.f;
                float r21 = 0.f;
                float r30 = 0.f;
                float r31 = 0.f;
                float r40 = 0.f;
                float r41 = 0.f;
                float r50 = 0.f;
                float r51 = 0.f;
                float r60 = 0.f;
                float r61 = 0.f;
                float r70 = 0.f;
                float r71 = 0.f;
#endif

                if (ti * 6 + m < h)
                {
                    // if (elempack == 1)
                    {
                        // same row of the second channel.
                        // NOTE(review): N is defined outside this excerpt - presumably
                        // the per-channel element stride of bottom_blob; confirm.
                        const unsigned short* r1 = r0 + N;

#if __ARM_NEON
                        // NOTE(review): each load fetches 4 columns before the per-column
                        // bound checks below; lanes past w are discarded, but the memory
                        // is still read - presumably covered by allocation padding; confirm.
                        uint16x4_t _t0 = vld1_u16(r0);
                        uint16x4_t _t1 = vld1_u16(r1);
                        // interleave the two channels so every 2-lane half is {ch0, ch1} of one column
                        uint16x4x2_t _t01 = vzip_u16(_t0, _t1);
                        float32x4_t _t0_fp32 = bfloat2float(_t01.val[0]);
                        float32x4_t _t1_fp32 = bfloat2float(_t01.val[1]);

                        _r0 = vget_low_f32(_t0_fp32);
                        if (tj * 6 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
                        if (tj * 6 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
                        if (tj * 6 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
                        if (tj * 6 + 4 < w)
                        {
                            // columns 4..7 are only loaded when column 4 is inside the input
                            _t0 = vld1_u16(r0 + 4);
                            _t1 = vld1_u16(r1 + 4);
                            _t01 = vzip_u16(_t0, _t1);
                            _t0_fp32 = bfloat2float(_t01.val[0]);
                            _t1_fp32 = bfloat2float(_t01.val[1]);

                            _r4 = vget_low_f32(_t0_fp32);
                            if (tj * 6 + 5 < w) _r5 = vget_high_f32(_t0_fp32);
                            if (tj * 6 + 6 < w) _r6 = vget_low_f32(_t1_fp32);
                            if (tj * 6 + 7 < w) _r7 = vget_high_f32(_t1_fp32);
                        }
#else
                        // scalar path: column 0 is read unconditionally, every other column is bound-checked
                        r00 = bfloat16_to_float32(r0[0]);
                        r01 = bfloat16_to_float32(r1[0]);
                        if (tj * 6 + 1 < w)
                        {
                            r10 = bfloat16_to_float32(r0[1]);
                            r11 = bfloat16_to_float32(r1[1]);
                        }
                        if (tj * 6 + 2 < w)
                        {
                            r20 = bfloat16_to_float32(r0[2]);
                            r21 = bfloat16_to_float32(r1[2]);
                        }
                        if (tj * 6 + 3 < w)
                        {
                            r30 = bfloat16_to_float32(r0[3]);
                            r31 = bfloat16_to_float32(r1[3]);
                        }
                        if (tj * 6 + 4 < w)
                        {
                            r40 = bfloat16_to_float32(r0[4]);
                            r41 = bfloat16_to_float32(r1[4]);
                        }
                        if (tj * 6 + 5 < w)
                        {
                            r50 = bfloat16_to_float32(r0[5]);
                            r51 = bfloat16_to_float32(r1[5]);
                        }
                        if (tj * 6 + 6 < w)
                        {
                            r60 = bfloat16_to_float32(r0[6]);
                            r61 = bfloat16_to_float32(r1[6]);
                        }
                        if (tj * 6 + 7 < w)
                        {
                            r70 = bfloat16_to_float32(r0[7]);
                            r71 = bfloat16_to_float32(r1[7]);
                        }
#endif
                    }
                }

                // even-index (a) and odd-index (b) partial sums shared by the output pairs 1/2, 3/4, 5/6
#if __ARM_NEON
#if __aarch64__
                float32x2_t _tmp12a = vfma_laneq_f32(vadd_f32(_r2, _r6), _r4, _coeffs, 1);
                float32x2_t _tmp12b = vfma_laneq_f32(vadd_f32(_r1, _r5), _r3, _coeffs, 1);
                float32x2_t _tmp34a = vfma_laneq_f32(vfma_laneq_f32(_r6, _r2, _coeffs, 3), _r4, _coeffs, 2);
                float32x2_t _tmp34b = vfma_laneq_f32(vfma_laneq_f32(vmul_laneq_f32(_r1, _coeffs2, 1), _r3, _coeffs2, 0), _r5, _coeffs2, 2);
                float32x2_t _tmp56a = vfma_laneq_f32(_r6, vfma_laneq_f32(_r2, _r4, _coeffs, 2), _coeffs2, 3);
                float32x2_t _tmp56b = vfma_laneq_f32(vfma_laneq_f32(vmul_laneq_f32(_r1, _coeffs2, 2), _r3, _coeffs2, 0), _r5, _coeffs2, 1);
#else
                // 32-bit ARM: lane selection works on 64-bit halves, so lanes 2/3 become high-half lanes 0/1
                float32x2_t _tmp12a = vmla_lane_f32(vadd_f32(_r2, _r6), _r4, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp12b = vmla_lane_f32(vadd_f32(_r1, _r5), _r3, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp34a = vmla_lane_f32(vmla_lane_f32(_r6, _r2, vget_high_f32(_coeffs), 1), _r4, vget_high_f32(_coeffs), 0);
                float32x2_t _tmp34b = vmla_lane_f32(vmla_lane_f32(vmul_lane_f32(_r1, vget_low_f32(_coeffs2), 1), _r3, vget_low_f32(_coeffs2), 0), _r5, vget_high_f32(_coeffs2), 0);
                float32x2_t _tmp56a = vmla_lane_f32(_r6, vmla_lane_f32(_r2, _r4, vget_high_f32(_coeffs), 0), vget_high_f32(_coeffs2), 1);
                float32x2_t _tmp56b = vmla_lane_f32(vmla_lane_f32(vmul_lane_f32(_r1, vget_high_f32(_coeffs2), 0), _r3, vget_low_f32(_coeffs2), 0), _r5, vget_low_f32(_coeffs2), 1);
#endif

#if __aarch64__
                float32x2_t _tmp0 = vfma_laneq_f32(vsub_f32(_r0, _r6), vsub_f32(_r4, _r2), _coeffs, 0);
#else
                float32x2_t _tmp0 = vmla_lane_f32(vsub_f32(_r0, _r6), vsub_f32(_r4, _r2), vget_low_f32(_coeffs), 0);
#endif
                float32x2_t _tmp1 = vadd_f32(_tmp12a, _tmp12b);
                float32x2_t _tmp2 = vsub_f32(_tmp12a, _tmp12b);
                float32x2_t _tmp3 = vadd_f32(_tmp34a, _tmp34b);
                float32x2_t _tmp4 = vsub_f32(_tmp34a, _tmp34b);
                float32x2_t _tmp5 = vadd_f32(_tmp56a, _tmp56b);
                float32x2_t _tmp6 = vsub_f32(_tmp56a, _tmp56b);
#if __aarch64__
                float32x2_t _tmp7 = vfma_laneq_f32(vsub_f32(_r7, _r1), vsub_f32(_r3, _r5), _coeffs, 0);
#else
                float32x2_t _tmp7 = vmla_lane_f32(vsub_f32(_r7, _r1), vsub_f32(_r3, _r5), vget_low_f32(_coeffs), 0);
#endif

                // transposed store: output index first, source row second
                vst1_f32(tmp[0][m], _tmp0);
                vst1_f32(tmp[1][m], _tmp1);
                vst1_f32(tmp[2][m], _tmp2);
                vst1_f32(tmp[3][m], _tmp3);
                vst1_f32(tmp[4][m], _tmp4);
                vst1_f32(tmp[5][m], _tmp5);
                vst1_f32(tmp[6][m], _tmp6);
                vst1_f32(tmp[7][m], _tmp7);
#else
                // scalar equivalent of the NEON arithmetic above, one line per channel
                float tmp12a0 = r20 + r60 - r40 * 4.25f;
                float tmp12a1 = r21 + r61 - r41 * 4.25f;
                float tmp12b0 = r10 + r50 - r30 * 4.25f;
                float tmp12b1 = r11 + r51 - r31 * 4.25f;
                float tmp34a0 = r60 + r20 * 0.25f - r40 * 1.25f;
                float tmp34a1 = r61 + r21 * 0.25f - r41 * 1.25f;
                float tmp34b0 = r10 * 0.5f - r30 * 2.5f + r50 * 2.f;
                float tmp34b1 = r11 * 0.5f - r31 * 2.5f + r51 * 2.f;
                float tmp56a0 = r20 * 4.f - r40 * 5.f + r60;
                float tmp56a1 = r21 * 4.f - r41 * 5.f + r61;
                float tmp56b0 = r10 * 2.f - r30 * 2.5f + r50 * 0.5f;
                float tmp56b1 = r11 * 2.f - r31 * 2.5f + r51 * 0.5f;

                // transposed store: output index first, source row second
                tmp[0][m][0] = r00 - r60 + (r40 - r20) * 5.25f;
                tmp[0][m][1] = r01 - r61 + (r41 - r21) * 5.25f;
                tmp[1][m][0] = tmp12a0 + tmp12b0;
                tmp[1][m][1] = tmp12a1 + tmp12b1;
                tmp[2][m][0] = tmp12a0 - tmp12b0;
                tmp[2][m][1] = tmp12a1 - tmp12b1;
                tmp[3][m][0] = tmp34a0 + tmp34b0;
                tmp[3][m][1] = tmp34a1 + tmp34b1;
                tmp[4][m][0] = tmp34a0 - tmp34b0;
                tmp[4][m][1] = tmp34a1 - tmp34b1;
                tmp[5][m][0] = tmp56a0 + tmp56b0;
                tmp[5][m][1] = tmp56a1 + tmp56b1;
                tmp[6][m][0] = tmp56a0 - tmp56b0;
                tmp[6][m][1] = tmp56a1 - tmp56b1;
                tmp[7][m][0] = r70 - r10 + (r30 - r50) * 5.25f;
                tmp[7][m][1] = r71 - r11 + (r31 - r51) * 5.25f;
#endif

                // next input row
                r0 += w;
            }

            // Output slot s (0..63) of this tile lives at
            // B + kk * max_jj * 64 + s * max_jj * 2 + jj * 2 (2 floats, one per channel).
            // p0..p7 cover 8 consecutive slots; they advance by 8 slots per m below.
            float* p0 = (float*)B + kk * max_jj * 64 + jj * 2;
            float* p1 = p0 + max_jj * 2;
            float* p2 = p0 + max_jj * 2 * 2;
            float* p3 = p0 + max_jj * 2 * 3;
            float* p4 = p0 + max_jj * 2 * 4;
            float* p5 = p0 + max_jj * 2 * 5;
            float* p6 = p0 + max_jj * 2 * 6;
            float* p7 = p0 + max_jj * 2 * 7;

            // second pass: same 1-D transform applied across the 8 rows of first-pass output m
            for (int m = 0; m < 8; m++)
            {
#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(tmp[m][0]);
                float32x2_t _r1 = vld1_f32(tmp[m][1]);
                float32x2_t _r2 = vld1_f32(tmp[m][2]);
                float32x2_t _r3 = vld1_f32(tmp[m][3]);
                float32x2_t _r4 = vld1_f32(tmp[m][4]);
                float32x2_t _r5 = vld1_f32(tmp[m][5]);
                float32x2_t _r6 = vld1_f32(tmp[m][6]);
                float32x2_t _r7 = vld1_f32(tmp[m][7]);

#if __aarch64__
                float32x2_t _tmp12a = vfma_laneq_f32(vadd_f32(_r2, _r6), _r4, _coeffs, 1);
                float32x2_t _tmp12b = vfma_laneq_f32(vadd_f32(_r1, _r5), _r3, _coeffs, 1);
                float32x2_t _tmp34a = vfma_laneq_f32(vfma_laneq_f32(_r6, _r2, _coeffs, 3), _r4, _coeffs, 2);
                float32x2_t _tmp34b = vfma_laneq_f32(vfma_laneq_f32(vmul_laneq_f32(_r1, _coeffs2, 1), _r3, _coeffs2, 0), _r5, _coeffs2, 2);
                float32x2_t _tmp56a = vfma_laneq_f32(_r6, vfma_laneq_f32(_r2, _r4, _coeffs, 2), _coeffs2, 3);
                float32x2_t _tmp56b = vfma_laneq_f32(vfma_laneq_f32(vmul_laneq_f32(_r1, _coeffs2, 2), _r3, _coeffs2, 0), _r5, _coeffs2, 1);
#else
                float32x2_t _tmp12a = vmla_lane_f32(vadd_f32(_r2, _r6), _r4, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp12b = vmla_lane_f32(vadd_f32(_r1, _r5), _r3, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp34a = vmla_lane_f32(vmla_lane_f32(_r6, _r2, vget_high_f32(_coeffs), 1), _r4, vget_high_f32(_coeffs), 0);
                float32x2_t _tmp34b = vmla_lane_f32(vmla_lane_f32(vmul_lane_f32(_r1, vget_low_f32(_coeffs2), 1), _r3, vget_low_f32(_coeffs2), 0), _r5, vget_high_f32(_coeffs2), 0);
                float32x2_t _tmp56a = vmla_lane_f32(_r6, vmla_lane_f32(_r2, _r4, vget_high_f32(_coeffs), 0), vget_high_f32(_coeffs2), 1);
                float32x2_t _tmp56b = vmla_lane_f32(vmla_lane_f32(vmul_lane_f32(_r1, vget_high_f32(_coeffs2), 0), _r3, vget_low_f32(_coeffs2), 0), _r5, vget_low_f32(_coeffs2), 1);
#endif

#if __aarch64__
                float32x2_t _tmp0 = vfma_laneq_f32(vsub_f32(_r0, _r6), vsub_f32(_r4, _r2), _coeffs, 0);
#else
                float32x2_t _tmp0 = vmla_lane_f32(vsub_f32(_r0, _r6), vsub_f32(_r4, _r2), vget_low_f32(_coeffs), 0);
#endif
                float32x2_t _tmp1 = vadd_f32(_tmp12a, _tmp12b);
                float32x2_t _tmp2 = vsub_f32(_tmp12a, _tmp12b);
                float32x2_t _tmp3 = vadd_f32(_tmp34a, _tmp34b);
                float32x2_t _tmp4 = vsub_f32(_tmp34a, _tmp34b);
                float32x2_t _tmp5 = vadd_f32(_tmp56a, _tmp56b);
                float32x2_t _tmp6 = vsub_f32(_tmp56a, _tmp56b);
#if __aarch64__
                float32x2_t _tmp7 = vfma_laneq_f32(vsub_f32(_r7, _r1), vsub_f32(_r3, _r5), _coeffs, 0);
#else
                float32x2_t _tmp7 = vmla_lane_f32(vsub_f32(_r7, _r1), vsub_f32(_r3, _r5), vget_low_f32(_coeffs), 0);
#endif

                vst1_f32(p0, _tmp0);
                vst1_f32(p1, _tmp1);
                vst1_f32(p2, _tmp2);
                vst1_f32(p3, _tmp3);
                vst1_f32(p4, _tmp4);
                vst1_f32(p5, _tmp5);
                vst1_f32(p6, _tmp6);
                vst1_f32(p7, _tmp7);
#else
                // rXY: first-pass value at index X, channel Y
                float r00 = tmp[m][0][0];
                float r01 = tmp[m][0][1];
                float r10 = tmp[m][1][0];
                float r11 = tmp[m][1][1];
                float r20 = tmp[m][2][0];
                float r21 = tmp[m][2][1];
                float r30 = tmp[m][3][0];
                float r31 = tmp[m][3][1];
                float r40 = tmp[m][4][0];
                float r41 = tmp[m][4][1];
                float r50 = tmp[m][5][0];
                float r51 = tmp[m][5][1];
                float r60 = tmp[m][6][0];
                float r61 = tmp[m][6][1];
                float r70 = tmp[m][7][0];
                float r71 = tmp[m][7][1];

                float tmp12a0 = r20 + r60 - r40 * 4.25f;
                float tmp12a1 = r21 + r61 - r41 * 4.25f;
                float tmp12b0 = r10 + r50 - r30 * 4.25f;
                float tmp12b1 = r11 + r51 - r31 * 4.25f;
                float tmp34a0 = r60 + r20 * 0.25f - r40 * 1.25f;
                float tmp34a1 = r61 + r21 * 0.25f - r41 * 1.25f;
                float tmp34b0 = r10 * 0.5f - r30 * 2.5f + r50 * 2.f;
                float tmp34b1 = r11 * 0.5f - r31 * 2.5f + r51 * 2.f;
                float tmp56a0 = r20 * 4.f - r40 * 5.f + r60;
                float tmp56a1 = r21 * 4.f - r41 * 5.f + r61;
                float tmp56b0 = r10 * 2.f - r30 * 2.5f + r50 * 0.5f;
                float tmp56b1 = r11 * 2.f - r31 * 2.5f + r51 * 0.5f;

                p0[0] = r00 - r60 + (r40 - r20) * 5.25f;
                p0[1] = r01 - r61 + (r41 - r21) * 5.25f;
                p1[0] = tmp12a0 + tmp12b0;
                p1[1] = tmp12a1 + tmp12b1;
                p2[0] = tmp12a0 - tmp12b0;
                p2[1] = tmp12a1 - tmp12b1;
                p3[0] = tmp34a0 + tmp34b0;
                p3[1] = tmp34a1 + tmp34b1;
                p4[0] = tmp34a0 - tmp34b0;
                p4[1] = tmp34a1 - tmp34b1;
                p5[0] = tmp56a0 + tmp56b0;
                p5[1] = tmp56a1 + tmp56b1;
                p6[0] = tmp56a0 - tmp56b0;
                p6[1] = tmp56a1 - tmp56b1;
                p7[0] = r70 - r10 + (r30 - r50) * 5.25f;
                p7[1] = r71 - r11 + (r31 - r51) * 5.25f;
#endif

                // skip the 8 slots just written
                p0 += max_jj * 8 * 2;
                p1 += max_jj * 8 * 2;
                p2 += max_jj * 8 * 2;
                p3 += max_jj * 8 * 2;
                p4 += max_jj * 8 * 2;
                p5 += max_jj * 8 * 2;
                p6 += max_jj * 8 * 2;
                p7 += max_jj * 8 * 2;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 2;
    for (int kk = remain_max_kk_start; kk < max_kk; kk++)
    {
        float tmp[8][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const unsigned short* r0123 = bottom_blob.channel(k + kk).row<const unsigned short>(ti * 6) + (tj * 6);

            for (int m = 0; m < 8; m++)
            {
                float r0 = 0.f;
                float r1 = 0.f;
                float r2 = 0.f;
                float r3 = 0.f;
                float r4 = 0.f;
                float r5 = 0.f;
                float r6 = 0.f;
                float r7 = 0.f;

                if (ti * 6 + m < h)
                {
                    // if (elempack == 1)
                    {
                        r0 = bfloat16_to_float32(r0123[0]);
                        if (tj * 6 + 1 < w) r1 = bfloat16_to_float32(r0123[1]);
                        if (tj * 6 + 2 < w) r2 = bfloat16_to_float32(r0123[2]);
                        if (tj * 6 + 3 < w) r3 = bfloat16_to_float32(r0123[3]);
                        if (tj * 6 + 4 < w) r4 = bfloat16_to_float32(r0123[4]);
                        if (tj * 6 + 5 < w) r5 = bfloat16_to_float32(r0123[5]);
                        if (tj * 6 + 6 < w) r6 = bfloat16_to_float32(r0123[6]);
                        if (tj * 6 + 7 < w) r7 = bfloat16_to_float32(r0123[7]);
                    }
                }

                float tmp12a = r2 + r6 - r4 * 4.25f;
                float tmp12b = r1 + r5 - r3 * 4.25f;
                float tmp34a = r6 + r2 * 0.25f - r4 * 1.25f;
                float tmp34b = r1 * 0.5f - r3 * 2.5f + r5 * 2.f;
                float tmp56a = r2 * 4.f - r4 * 5.f + r6;
                float tmp56b = r1 * 2.f - r3 * 2.5f + r5 * 0.5f;

                tmp[0][m] = r0 - r6 + (r4 - r2) * 5.25f;
                tmp[1][m] = tmp12a + tmp12b;
                tmp[2][m] = tmp12a - tmp12b;
                tmp[3][m] = tmp34a + tmp34b;
                tmp[4][m] = tmp34a - tmp34b;
                tmp[5][m] = tmp56a + tmp56b;
                tmp[6][m] = tmp56a - tmp56b;
                tmp[7][m] = r7 - r1 + (r3 - r5) * 5.25f;

                r0123 += w;
            }

            float* p0 = (float*)B + kk * max_jj * 64 + jj;
            float* p1 = p0 + max_jj;
            float* p2 = p0 + max_jj * 2;
            float* p3 = p0 + max_jj * 3;
            float* p4 = p0 + max_jj * 4;
            float* p5 = p0 + max_jj * 5;
            float* p6 = p0 + max_jj * 6;
            float* p7 = p0 + max_jj * 7;

            for (int m = 0; m < 8; m++)
            {
                float r0 = tmp[m][0];
                float r1 = tmp[m][1];
                float r2 = tmp[m][2];
                float r3 = tmp[m][3];
                float r4 = tmp[m][4];
                float r5 = tmp[m][5];
                float r6 = tmp[m][6];
                float r7 = tmp[m][7];

                float tmp12a = r2 + r6 - r4 * 4.25f;
                float tmp12b = r1 + r5 - r3 * 4.25f;
                float tmp34a = r6 + r2 * 0.25f - r4 * 1.25f;
                float tmp34b = r1 * 0.5f - r3 * 2.5f + r5 * 2.f;
                float tmp56a = r2 * 4.f - r4 * 5.f + r6;
                float tmp56b = r1 * 2.f - r3 * 2.5f + r5 * 0.5f;

                p0[0] = r0 - r6 + (r4 - r2) * 5.25f;
                p1[0] = tmp12a + tmp12b;
                p2[0] = tmp12a - tmp12b;
                p3[0] = tmp34a + tmp34b;
                p4[0] = tmp34a - tmp34b;
                p5[0] = tmp56a + tmp56b;
                p6[0] = tmp56a - tmp56b;
                p7[0] = r7 - r1 + (r3 - r5) * 5.25f;

                p0 += max_jj * 8;
                p1 += max_jj * 8;
                p2 += max_jj * 8;
                p3 += max_jj * 8;
                p4 += max_jj * 8;
                p5 += max_jj * 8;
                p6 += max_jj * 8;
                p7 += max_jj * 8;
            }
        }
    }
}

static inline void conv3x3s1_winograd63_transform_output_tile_bf16s(const Mat& top_tile, Mat& top_blob, const Mat& bias, int i, int max_ii, int j, int max_jj)
{
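    // Output transform matrix (8 transformed values -> 6 outputs), kept here for reference only.
    // The code below applies it in two passes (once per tile dimension) without materializing the table:
    // even rows (0, 2, 4) combine the pairwise sums       r1+r2, r3+r4, r5+r6 (the tmp024a/b/c terms),
    // odd rows  (1, 3, 5) combine the pairwise differences r1-r2, r3-r4, r5-r6 (the tmp135a/b/c terms),
    // row 0 additionally adds r0 and row 5 additionally adds r7.
    // const float otm[6][8] = {
    //     {1.0f, 1.0f,  1.0f,  1.0f,  1.0f, 32.0f, 32.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f,  2.0f, -2.0f, 16.0f,-16.0f, 0.0f},
    //     {0.0f, 1.0f,  1.0f,  4.0f,  4.0f,  8.0f,  8.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f,  8.0f, -8.0f,  4.0f, -4.0f, 0.0f},
    //     {0.0f, 1.0f,  1.0f, 16.0f, 16.0f,  2.0f,  2.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f, 32.0f,-32.0f,  1.0f, -1.0f, 1.0f}
    // };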

#if __ARM_NEON
    const float coeffs[4] = {32.f, 16.f, 8.f, 4.f};
    float32x4_t _coeffs = vld1q_f32(coeffs);
    float32x2_t _v2 = vdup_n_f32(2.f);
#endif

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int out_elempack = top_blob.elempack;
    const size_t N = top_blob.cstep * out_elempack;

    const int w_tiles = (outw + 5) / 6;

    const float* biasptr = bias;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    for (; ii + 7 < max_ii; ii += 8)
    {
        float32x4_t _bias0 = biasptr ? vld1q_f32(biasptr + i + ii) : vdupq_n_f32(0.f);
        float32x4_t _bias1 = biasptr ? vld1q_f32(biasptr + i + ii + 4) : vdupq_n_f32(0.f);

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[6][8][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 64 + jj * 8;
            const float* r1 = r0 + max_jj * 8;
            const float* r2 = r0 + max_jj * 8 * 2;
            const float* r3 = r0 + max_jj * 8 * 3;
            const float* r4 = r0 + max_jj * 8 * 4;
            const float* r5 = r0 + max_jj * 8 * 5;
            const float* r6 = r0 + max_jj * 8 * 6;
            const float* r7 = r0 + max_jj * 8 * 7;

            for (int m = 0; m < 8; m++)
            {
                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r30 = vld1q_f32(r3);
                float32x4_t _r31 = vld1q_f32(r3 + 4);
                float32x4_t _r40 = vld1q_f32(r4);
                float32x4_t _r41 = vld1q_f32(r4 + 4);
                float32x4_t _r50 = vld1q_f32(r5);
                float32x4_t _r51 = vld1q_f32(r5 + 4);
                float32x4_t _r60 = vld1q_f32(r6);
                float32x4_t _r61 = vld1q_f32(r6 + 4);
                float32x4_t _r70 = vld1q_f32(r7);
                float32x4_t _r71 = vld1q_f32(r7 + 4);

                float32x4_t _tmp024a0 = vaddq_f32(_r10, _r20);
                float32x4_t _tmp024a1 = vaddq_f32(_r11, _r21);
                float32x4_t _tmp135a0 = vsubq_f32(_r10, _r20);
                float32x4_t _tmp135a1 = vsubq_f32(_r11, _r21);
                float32x4_t _tmp024b0 = vaddq_f32(_r30, _r40);
                float32x4_t _tmp024b1 = vaddq_f32(_r31, _r41);
                float32x4_t _tmp135b0 = vsubq_f32(_r30, _r40);
                float32x4_t _tmp135b1 = vsubq_f32(_r31, _r41);
                float32x4_t _tmp024c0 = vaddq_f32(_r50, _r60);
                float32x4_t _tmp024c1 = vaddq_f32(_r51, _r61);
                float32x4_t _tmp135c0 = vsubq_f32(_r50, _r60);
                float32x4_t _tmp135c1 = vsubq_f32(_r51, _r61);

                float32x4_t _tmp00 = vaddq_f32(vaddq_f32(_r00, _tmp024a0), vfmaq_laneq_f32(_tmp024b0, _tmp024c0, _coeffs, 0));
                float32x4_t _tmp01 = vaddq_f32(vaddq_f32(_r01, _tmp024a1), vfmaq_laneq_f32(_tmp024b1, _tmp024c1, _coeffs, 0));
                float32x4_t _tmp10 = vfmaq_laneq_f32(vfmaq_lane_f32(_tmp135a0, _tmp135b0, _v2, 0), _tmp135c0, _coeffs, 1);
                float32x4_t _tmp11 = vfmaq_laneq_f32(vfmaq_lane_f32(_tmp135a1, _tmp135b1, _v2, 0), _tmp135c1, _coeffs, 1);
                float32x4_t _tmp20 = vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp024a0, _tmp024b0, _coeffs, 3), _tmp024c0, _coeffs, 2);
                float32x4_t _tmp21 = vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp024a1, _tmp024b1, _coeffs, 3), _tmp024c1, _coeffs, 2);
                float32x4_t _tmp30 = vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp135a0, _tmp135b0, _coeffs, 2), _tmp135c0, _coeffs, 3);
                float32x4_t _tmp31 = vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp135a1, _tmp135b1, _coeffs, 2), _tmp135c1, _coeffs, 3);
                float32x4_t _tmp40 = vfmaq_lane_f32(vfmaq_laneq_f32(_tmp024a0, _tmp024b0, _coeffs, 1), _tmp024c0, _v2, 0);
                float32x4_t _tmp41 = vfmaq_lane_f32(vfmaq_laneq_f32(_tmp024a1, _tmp024b1, _coeffs, 1), _tmp024c1, _v2, 0);
                float32x4_t _tmp50 = vaddq_f32(vaddq_f32(_r70, _tmp135a0), vfmaq_laneq_f32(_tmp135c0, _tmp135b0, _coeffs, 0));
                float32x4_t _tmp51 = vaddq_f32(vaddq_f32(_r71, _tmp135a1), vfmaq_laneq_f32(_tmp135c1, _tmp135b1, _coeffs, 0));

                vst1q_f32(tmp[0][m], _tmp00);
                vst1q_f32(tmp[0][m] + 4, _tmp01);
                vst1q_f32(tmp[1][m], _tmp10);
                vst1q_f32(tmp[1][m] + 4, _tmp11);
                vst1q_f32(tmp[2][m], _tmp20);
                vst1q_f32(tmp[2][m] + 4, _tmp21);
                vst1q_f32(tmp[3][m], _tmp30);
                vst1q_f32(tmp[3][m] + 4, _tmp31);
                vst1q_f32(tmp[4][m], _tmp40);
                vst1q_f32(tmp[4][m] + 4, _tmp41);
                vst1q_f32(tmp[5][m], _tmp50);
                vst1q_f32(tmp[5][m] + 4, _tmp51);

                r0 += max_jj * 8 * 8;
                r1 += max_jj * 8 * 8;
                r2 += max_jj * 8 * 8;
                r3 += max_jj * 8 * 8;
                r4 += max_jj * 8 * 8;
                r5 += max_jj * 8 * 8;
                r6 += max_jj * 8 * 8;
                r7 += max_jj * 8 * 8;
            }

            unsigned short* outptr0 = top_blob.channel((i + ii) / out_elempack).row<unsigned short>(ti * 6) + (tj * 6) * out_elempack;

            for (int m = 0; m < 6; m++)
            {
                if (ti * 6 + m >= outh)
                    continue;

                float32x4_t _r00 = vld1q_f32(tmp[m][0]);
                float32x4_t _r01 = vld1q_f32(tmp[m][0] + 4);
                float32x4_t _r10 = vld1q_f32(tmp[m][1]);
                float32x4_t _r11 = vld1q_f32(tmp[m][1] + 4);
                float32x4_t _r20 = vld1q_f32(tmp[m][2]);
                float32x4_t _r21 = vld1q_f32(tmp[m][2] + 4);
                float32x4_t _r30 = vld1q_f32(tmp[m][3]);
                float32x4_t _r31 = vld1q_f32(tmp[m][3] + 4);
                float32x4_t _r40 = vld1q_f32(tmp[m][4]);
                float32x4_t _r41 = vld1q_f32(tmp[m][4] + 4);
                float32x4_t _r50 = vld1q_f32(tmp[m][5]);
                float32x4_t _r51 = vld1q_f32(tmp[m][5] + 4);
                float32x4_t _r60 = vld1q_f32(tmp[m][6]);
                float32x4_t _r61 = vld1q_f32(tmp[m][6] + 4);
                float32x4_t _r70 = vld1q_f32(tmp[m][7]);
                float32x4_t _r71 = vld1q_f32(tmp[m][7] + 4);

                float32x4_t _tmp024a0 = vaddq_f32(_r10, _r20);
                float32x4_t _tmp024a1 = vaddq_f32(_r11, _r21);
                float32x4_t _tmp135a0 = vsubq_f32(_r10, _r20);
                float32x4_t _tmp135a1 = vsubq_f32(_r11, _r21);
                float32x4_t _tmp024b0 = vaddq_f32(_r30, _r40);
                float32x4_t _tmp024b1 = vaddq_f32(_r31, _r41);
                float32x4_t _tmp135b0 = vsubq_f32(_r30, _r40);
                float32x4_t _tmp135b1 = vsubq_f32(_r31, _r41);
                float32x4_t _tmp024c0 = vaddq_f32(_r50, _r60);
                float32x4_t _tmp024c1 = vaddq_f32(_r51, _r61);
                float32x4_t _tmp135c0 = vsubq_f32(_r50, _r60);
                float32x4_t _tmp135c1 = vsubq_f32(_r51, _r61);

                float32x4_t _tmp00 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_r00, _tmp024a0), vfmaq_laneq_f32(_tmp024b0, _tmp024c0, _coeffs, 0)));
                float32x4_t _tmp01 = vaddq_f32(_bias1, vaddq_f32(vaddq_f32(_r01, _tmp024a1), vfmaq_laneq_f32(_tmp024b1, _tmp024c1, _coeffs, 0)));
                float32x4_t _tmp10 = vaddq_f32(_bias0, vfmaq_laneq_f32(vfmaq_lane_f32(_tmp135a0, _tmp135b0, _v2, 0), _tmp135c0, _coeffs, 1));
                float32x4_t _tmp11 = vaddq_f32(_bias1, vfmaq_laneq_f32(vfmaq_lane_f32(_tmp135a1, _tmp135b1, _v2, 0), _tmp135c1, _coeffs, 1));
                float32x4_t _tmp20 = vaddq_f32(_bias0, vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp024a0, _tmp024b0, _coeffs, 3), _tmp024c0, _coeffs, 2));
                float32x4_t _tmp21 = vaddq_f32(_bias1, vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp024a1, _tmp024b1, _coeffs, 3), _tmp024c1, _coeffs, 2));
                float32x4_t _tmp30 = vaddq_f32(_bias0, vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp135a0, _tmp135b0, _coeffs, 2), _tmp135c0, _coeffs, 3));
                float32x4_t _tmp31 = vaddq_f32(_bias1, vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp135a1, _tmp135b1, _coeffs, 2), _tmp135c1, _coeffs, 3));
                float32x4_t _tmp40 = vaddq_f32(_bias0, vfmaq_lane_f32(vfmaq_laneq_f32(_tmp024a0, _tmp024b0, _coeffs, 1), _tmp024c0, _v2, 0));
                float32x4_t _tmp41 = vaddq_f32(_bias1, vfmaq_lane_f32(vfmaq_laneq_f32(_tmp024a1, _tmp024b1, _coeffs, 1), _tmp024c1, _v2, 0));
                float32x4_t _tmp50 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_r70, _tmp135a0), vfmaq_laneq_f32(_tmp135c0, _tmp135b0, _coeffs, 0)));
                float32x4_t _tmp51 = vaddq_f32(_bias1, vaddq_f32(vaddq_f32(_r71, _tmp135a1), vfmaq_laneq_f32(_tmp135c1, _tmp135b1, _coeffs, 0)));

                if (out_elempack == 4)
                {
                    unsigned short* outptr1 = outptr0 + N;

                    vst1_u16(outptr0, float2bfloat(_tmp00));
                    vst1_u16(outptr1, float2bfloat(_tmp01));
                    if (tj * 6 + 1 < outw)
                    {
                        vst1_u16(outptr0 + 4, float2bfloat(_tmp10));
                        vst1_u16(outptr1 + 4, float2bfloat(_tmp11));
                    }
                    if (tj * 6 + 2 < outw)
                    {
                        vst1_u16(outptr0 + 8, float2bfloat(_tmp20));
                        vst1_u16(outptr1 + 8, float2bfloat(_tmp21));
                    }
                    if (tj * 6 + 3 < outw)
                    {
                        vst1_u16(outptr0 + 12, float2bfloat(_tmp30));
                        vst1_u16(outptr1 + 12, float2bfloat(_tmp31));
                    }
                    if (tj * 6 + 4 < outw)
                    {
                        vst1_u16(outptr0 + 16, float2bfloat(_tmp40));
                        vst1_u16(outptr1 + 16, float2bfloat(_tmp41));
                    }
                    if (tj * 6 + 5 < outw)
                    {
                        vst1_u16(outptr0 + 20, float2bfloat(_tmp50));
                        vst1_u16(outptr1 + 20, float2bfloat(_tmp51));
                    }
                }
                if (out_elempack == 1)
                {
                    unsigned short tmp0[8];
                    unsigned short tmp1[8];
                    unsigned short tmp2[8];
                    unsigned short tmp3[8];
                    unsigned short tmp4[8];
                    unsigned short tmp5[8];
                    vst1_u16(tmp0, float2bfloat(_tmp00));
                    vst1_u16(tmp0 + 4, float2bfloat(_tmp01));
                    vst1_u16(tmp1, float2bfloat(_tmp10));
                    vst1_u16(tmp1 + 4, float2bfloat(_tmp11));
                    vst1_u16(tmp2, float2bfloat(_tmp20));
                    vst1_u16(tmp2 + 4, float2bfloat(_tmp21));
                    vst1_u16(tmp3, float2bfloat(_tmp30));
                    vst1_u16(tmp3 + 4, float2bfloat(_tmp31));
                    vst1_u16(tmp4, float2bfloat(_tmp40));
                    vst1_u16(tmp4 + 4, float2bfloat(_tmp41));
                    vst1_u16(tmp5, float2bfloat(_tmp50));
                    vst1_u16(tmp5 + 4, float2bfloat(_tmp51));

                    unsigned short* outptr1 = outptr0 + N;
                    unsigned short* outptr2 = outptr0 + N * 2;
                    unsigned short* outptr3 = outptr0 + N * 3;
                    unsigned short* outptr4 = outptr0 + N * 4;
                    unsigned short* outptr5 = outptr0 + N * 5;
                    unsigned short* outptr6 = outptr0 + N * 6;
                    unsigned short* outptr7 = outptr0 + N * 7;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];
                    outptr4[0] = tmp0[4];
                    outptr5[0] = tmp0[5];
                    outptr6[0] = tmp0[6];
                    outptr7[0] = tmp0[7];
                    if (tj * 6 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                        outptr4[1] = tmp1[4];
                        outptr5[1] = tmp1[5];
                        outptr6[1] = tmp1[6];
                        outptr7[1] = tmp1[7];
                    }
                    if (tj * 6 + 2 < outw)
                    {
                        outptr0[2] = tmp2[0];
                        outptr1[2] = tmp2[1];
                        outptr2[2] = tmp2[2];
                        outptr3[2] = tmp2[3];
                        outptr4[2] = tmp2[4];
                        outptr5[2] = tmp2[5];
                        outptr6[2] = tmp2[6];
                        outptr7[2] = tmp2[7];
                    }
                    if (tj * 6 + 3 < outw)
                    {
                        outptr0[3] = tmp3[0];
                        outptr1[3] = tmp3[1];
                        outptr2[3] = tmp3[2];
                        outptr3[3] = tmp3[3];
                        outptr4[3] = tmp3[4];
                        outptr5[3] = tmp3[5];
                        outptr6[3] = tmp3[6];
                        outptr7[3] = tmp3[7];
                    }
                    if (tj * 6 + 4 < outw)
                    {
                        outptr0[4] = tmp4[0];
                        outptr1[4] = tmp4[1];
                        outptr2[4] = tmp4[2];
                        outptr3[4] = tmp4[3];
                        outptr4[4] = tmp4[4];
                        outptr5[4] = tmp4[5];
                        outptr6[4] = tmp4[6];
                        outptr7[4] = tmp4[7];
                    }
                    if (tj * 6 + 5 < outw)
                    {
                        outptr0[5] = tmp5[0];
                        outptr1[5] = tmp5[1];
                        outptr2[5] = tmp5[2];
                        outptr3[5] = tmp5[3];
                        outptr4[5] = tmp5[4];
                        outptr5[5] = tmp5[5];
                        outptr6[5] = tmp5[6];
                        outptr7[5] = tmp5[7];
                    }
                }

                outptr0 += outw * out_elempack;
            }
        }
    }
#endif // __aarch64__
    for (; ii + 3 < max_ii; ii += 4)
    {
        float32x4_t _bias0 = biasptr ? vld1q_f32(biasptr + i + ii) : vdupq_n_f32(0.f);

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        float tmp[6][8][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 64 + jj * 4;
            const float* r1 = r0 + max_jj * 4;
            const float* r2 = r0 + max_jj * 4 * 2;
            const float* r3 = r0 + max_jj * 4 * 3;
            const float* r4 = r0 + max_jj * 4 * 4;
            const float* r5 = r0 + max_jj * 4 * 5;
            const float* r6 = r0 + max_jj * 4 * 6;
            const float* r7 = r0 + max_jj * 4 * 7;

            for (int m = 0; m < 8; m++)
            {
                float32x4_t _r0 = vld1q_f32(r0);
                float32x4_t _r1 = vld1q_f32(r1);
                float32x4_t _r2 = vld1q_f32(r2);
                float32x4_t _r3 = vld1q_f32(r3);
                float32x4_t _r4 = vld1q_f32(r4);
                float32x4_t _r5 = vld1q_f32(r5);
                float32x4_t _r6 = vld1q_f32(r6);
                float32x4_t _r7 = vld1q_f32(r7);

                float32x4_t _tmp024a = vaddq_f32(_r1, _r2);
                float32x4_t _tmp135a = vsubq_f32(_r1, _r2);
                float32x4_t _tmp024b = vaddq_f32(_r3, _r4);
                float32x4_t _tmp135b = vsubq_f32(_r3, _r4);
                float32x4_t _tmp024c = vaddq_f32(_r5, _r6);
                float32x4_t _tmp135c = vsubq_f32(_r5, _r6);

#if __aarch64__
                float32x4_t _tmp0 = vaddq_f32(vaddq_f32(_r0, _tmp024a), vfmaq_laneq_f32(_tmp024b, _tmp024c, _coeffs, 0));
                float32x4_t _tmp1 = vfmaq_laneq_f32(vfmaq_lane_f32(_tmp135a, _tmp135b, _v2, 0), _tmp135c, _coeffs, 1);
                float32x4_t _tmp2 = vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp024a, _tmp024b, _coeffs, 3), _tmp024c, _coeffs, 2);
                float32x4_t _tmp3 = vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp135a, _tmp135b, _coeffs, 2), _tmp135c, _coeffs, 3);
                float32x4_t _tmp4 = vfmaq_lane_f32(vfmaq_laneq_f32(_tmp024a, _tmp024b, _coeffs, 1), _tmp024c, _v2, 0);
                float32x4_t _tmp5 = vaddq_f32(vaddq_f32(_r7, _tmp135a), vfmaq_laneq_f32(_tmp135c, _tmp135b, _coeffs, 0));
#else
                float32x4_t _tmp0 = vaddq_f32(vaddq_f32(_r0, _tmp024a), vmlaq_lane_f32(_tmp024b, _tmp024c, vget_low_f32(_coeffs), 0));
                float32x4_t _tmp1 = vmlaq_lane_f32(vmlaq_lane_f32(_tmp135a, _tmp135b, _v2, 0), _tmp135c, vget_low_f32(_coeffs), 1);
                float32x4_t _tmp2 = vmlaq_lane_f32(vmlaq_lane_f32(_tmp024a, _tmp024b, vget_high_f32(_coeffs), 1), _tmp024c, vget_high_f32(_coeffs), 0);
                float32x4_t _tmp3 = vmlaq_lane_f32(vmlaq_lane_f32(_tmp135a, _tmp135b, vget_high_f32(_coeffs), 0), _tmp135c, vget_high_f32(_coeffs), 1);
                float32x4_t _tmp4 = vmlaq_lane_f32(vmlaq_lane_f32(_tmp024a, _tmp024b, vget_low_f32(_coeffs), 1), _tmp024c, _v2, 0);
                float32x4_t _tmp5 = vaddq_f32(vaddq_f32(_r7, _tmp135a), vmlaq_lane_f32(_tmp135c, _tmp135b, vget_low_f32(_coeffs), 0));
#endif

                vst1q_f32(tmp[0][m], _tmp0);
                vst1q_f32(tmp[1][m], _tmp1);
                vst1q_f32(tmp[2][m], _tmp2);
                vst1q_f32(tmp[3][m], _tmp3);
                vst1q_f32(tmp[4][m], _tmp4);
                vst1q_f32(tmp[5][m], _tmp5);

                r0 += max_jj * 8 * 4;
                r1 += max_jj * 8 * 4;
                r2 += max_jj * 8 * 4;
                r3 += max_jj * 8 * 4;
                r4 += max_jj * 8 * 4;
                r5 += max_jj * 8 * 4;
                r6 += max_jj * 8 * 4;
                r7 += max_jj * 8 * 4;
            }

            unsigned short* outptr0 = top_blob.channel((i + ii) / out_elempack).row<unsigned short>(ti * 6) + (tj * 6) * out_elempack;

            for (int m = 0; m < 6; m++)
            {
                if (ti * 6 + m >= outh)
                    continue;

                float32x4_t _r0 = vld1q_f32(tmp[m][0]);
                float32x4_t _r1 = vld1q_f32(tmp[m][1]);
                float32x4_t _r2 = vld1q_f32(tmp[m][2]);
                float32x4_t _r3 = vld1q_f32(tmp[m][3]);
                float32x4_t _r4 = vld1q_f32(tmp[m][4]);
                float32x4_t _r5 = vld1q_f32(tmp[m][5]);
                float32x4_t _r6 = vld1q_f32(tmp[m][6]);
                float32x4_t _r7 = vld1q_f32(tmp[m][7]);

                float32x4_t _tmp024a = vaddq_f32(_r1, _r2);
                float32x4_t _tmp135a = vsubq_f32(_r1, _r2);
                float32x4_t _tmp024b = vaddq_f32(_r3, _r4);
                float32x4_t _tmp135b = vsubq_f32(_r3, _r4);
                float32x4_t _tmp024c = vaddq_f32(_r5, _r6);
                float32x4_t _tmp135c = vsubq_f32(_r5, _r6);

#if __aarch64__
                float32x4_t _tmp0 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_r0, _tmp024a), vfmaq_laneq_f32(_tmp024b, _tmp024c, _coeffs, 0)));
                float32x4_t _tmp1 = vaddq_f32(_bias0, vfmaq_laneq_f32(vfmaq_lane_f32(_tmp135a, _tmp135b, _v2, 0), _tmp135c, _coeffs, 1));
                float32x4_t _tmp2 = vaddq_f32(_bias0, vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp024a, _tmp024b, _coeffs, 3), _tmp024c, _coeffs, 2));
                float32x4_t _tmp3 = vaddq_f32(_bias0, vfmaq_laneq_f32(vfmaq_laneq_f32(_tmp135a, _tmp135b, _coeffs, 2), _tmp135c, _coeffs, 3));
                float32x4_t _tmp4 = vaddq_f32(_bias0, vfmaq_lane_f32(vfmaq_laneq_f32(_tmp024a, _tmp024b, _coeffs, 1), _tmp024c, _v2, 0));
                float32x4_t _tmp5 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_r7, _tmp135a), vfmaq_laneq_f32(_tmp135c, _tmp135b, _coeffs, 0)));
#else
                float32x4_t _tmp0 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_r0, _tmp024a), vmlaq_lane_f32(_tmp024b, _tmp024c, vget_low_f32(_coeffs), 0)));
                float32x4_t _tmp1 = vaddq_f32(_bias0, vmlaq_lane_f32(vmlaq_lane_f32(_tmp135a, _tmp135b, _v2, 0), _tmp135c, vget_low_f32(_coeffs), 1));
                float32x4_t _tmp2 = vaddq_f32(_bias0, vmlaq_lane_f32(vmlaq_lane_f32(_tmp024a, _tmp024b, vget_high_f32(_coeffs), 1), _tmp024c, vget_high_f32(_coeffs), 0));
                float32x4_t _tmp3 = vaddq_f32(_bias0, vmlaq_lane_f32(vmlaq_lane_f32(_tmp135a, _tmp135b, vget_high_f32(_coeffs), 0), _tmp135c, vget_high_f32(_coeffs), 1));
                float32x4_t _tmp4 = vaddq_f32(_bias0, vmlaq_lane_f32(vmlaq_lane_f32(_tmp024a, _tmp024b, vget_low_f32(_coeffs), 1), _tmp024c, _v2, 0));
                float32x4_t _tmp5 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_r7, _tmp135a), vmlaq_lane_f32(_tmp135c, _tmp135b, vget_low_f32(_coeffs), 0)));
#endif

                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_tmp0));
                    if (tj * 6 + 1 < outw) vst1_u16(outptr0 + 4, float2bfloat(_tmp1));
                    if (tj * 6 + 2 < outw) vst1_u16(outptr0 + 8, float2bfloat(_tmp2));
                    if (tj * 6 + 3 < outw) vst1_u16(outptr0 + 12, float2bfloat(_tmp3));
                    if (tj * 6 + 4 < outw) vst1_u16(outptr0 + 16, float2bfloat(_tmp4));
                    if (tj * 6 + 5 < outw) vst1_u16(outptr0 + 20, float2bfloat(_tmp5));
                }
                if (out_elempack == 1)
                {
                    unsigned short tmp0[4];
                    unsigned short tmp1[4];
                    unsigned short tmp2[4];
                    unsigned short tmp3[4];
                    unsigned short tmp4[4];
                    unsigned short tmp5[4];
                    vst1_u16(tmp0, float2bfloat(_tmp0));
                    vst1_u16(tmp1, float2bfloat(_tmp1));
                    vst1_u16(tmp2, float2bfloat(_tmp2));
                    vst1_u16(tmp3, float2bfloat(_tmp3));
                    vst1_u16(tmp4, float2bfloat(_tmp4));
                    vst1_u16(tmp5, float2bfloat(_tmp5));

                    unsigned short* outptr1 = outptr0 + N;
                    unsigned short* outptr2 = outptr0 + N * 2;
                    unsigned short* outptr3 = outptr0 + N * 3;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];
                    if (tj * 6 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                    }
                    if (tj * 6 + 2 < outw)
                    {
                        outptr0[2] = tmp2[0];
                        outptr1[2] = tmp2[1];
                        outptr2[2] = tmp2[2];
                        outptr3[2] = tmp2[3];
                    }
                    if (tj * 6 + 3 < outw)
                    {
                        outptr0[3] = tmp3[0];
                        outptr1[3] = tmp3[1];
                        outptr2[3] = tmp3[2];
                        outptr3[3] = tmp3[3];
                    }
                    if (tj * 6 + 4 < outw)
                    {
                        outptr0[4] = tmp4[0];
                        outptr1[4] = tmp4[1];
                        outptr2[4] = tmp4[2];
                        outptr3[4] = tmp4[3];
                    }
                    if (tj * 6 + 5 < outw)
                    {
                        outptr0[5] = tmp5[0];
                        outptr1[5] = tmp5[1];
                        outptr2[5] = tmp5[2];
                        outptr3[5] = tmp5[3];
                    }
                }

                outptr0 += outw * out_elempack;
            }
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
#if __ARM_NEON
        float32x2_t _bias0 = biasptr ? vld1_f32(biasptr + i + ii) : vdup_n_f32(0.f);
#else
        float bias0 = biasptr ? biasptr[i + ii] : 0.f;
        float bias1 = biasptr ? biasptr[i + ii + 1] : 0.f;
#endif

#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        float tmp[6][8][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 64 + jj * 2;
            const float* r1 = r0 + max_jj * 2;
            const float* r2 = r0 + max_jj * 2 * 2;
            const float* r3 = r0 + max_jj * 2 * 3;
            const float* r4 = r0 + max_jj * 2 * 4;
            const float* r5 = r0 + max_jj * 2 * 5;
            const float* r6 = r0 + max_jj * 2 * 6;
            const float* r7 = r0 + max_jj * 2 * 7;

            for (int m = 0; m < 8; m++)
            {
#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(r0);
                float32x2_t _r1 = vld1_f32(r1);
                float32x2_t _r2 = vld1_f32(r2);
                float32x2_t _r3 = vld1_f32(r3);
                float32x2_t _r4 = vld1_f32(r4);
                float32x2_t _r5 = vld1_f32(r5);
                float32x2_t _r6 = vld1_f32(r6);
                float32x2_t _r7 = vld1_f32(r7);

                float32x2_t _tmp024a = vadd_f32(_r1, _r2);
                float32x2_t _tmp135a = vsub_f32(_r1, _r2);
                float32x2_t _tmp024b = vadd_f32(_r3, _r4);
                float32x2_t _tmp135b = vsub_f32(_r3, _r4);
                float32x2_t _tmp024c = vadd_f32(_r5, _r6);
                float32x2_t _tmp135c = vsub_f32(_r5, _r6);

#if __aarch64__
                float32x2_t _tmp0 = vadd_f32(vadd_f32(_r0, _tmp024a), vfma_laneq_f32(_tmp024b, _tmp024c, _coeffs, 0));
                float32x2_t _tmp1 = vfma_laneq_f32(vfma_f32(_tmp135a, _tmp135b, _v2), _tmp135c, _coeffs, 1);
                float32x2_t _tmp2 = vfma_laneq_f32(vfma_laneq_f32(_tmp024a, _tmp024b, _coeffs, 3), _tmp024c, _coeffs, 2);
                float32x2_t _tmp3 = vfma_laneq_f32(vfma_laneq_f32(_tmp135a, _tmp135b, _coeffs, 2), _tmp135c, _coeffs, 3);
                float32x2_t _tmp4 = vfma_f32(vfma_laneq_f32(_tmp024a, _tmp024b, _coeffs, 1), _tmp024c, _v2);
                float32x2_t _tmp5 = vadd_f32(vadd_f32(_r7, _tmp135a), vfma_laneq_f32(_tmp135c, _tmp135b, _coeffs, 0));
#else
                float32x2_t _tmp0 = vadd_f32(vadd_f32(_r0, _tmp024a), vmla_lane_f32(_tmp024b, _tmp024c, vget_low_f32(_coeffs), 0));
                float32x2_t _tmp1 = vmla_lane_f32(vmla_f32(_tmp135a, _tmp135b, _v2), _tmp135c, vget_low_f32(_coeffs), 1);
                float32x2_t _tmp2 = vmla_lane_f32(vmla_lane_f32(_tmp024a, _tmp024b, vget_high_f32(_coeffs), 1), _tmp024c, vget_high_f32(_coeffs), 0);
                float32x2_t _tmp3 = vmla_lane_f32(vmla_lane_f32(_tmp135a, _tmp135b, vget_high_f32(_coeffs), 0), _tmp135c, vget_high_f32(_coeffs), 1);
                float32x2_t _tmp4 = vmla_f32(vmla_lane_f32(_tmp024a, _tmp024b, vget_low_f32(_coeffs), 1), _tmp024c, _v2);
                float32x2_t _tmp5 = vadd_f32(vadd_f32(_r7, _tmp135a), vmla_lane_f32(_tmp135c, _tmp135b, vget_low_f32(_coeffs), 0));
#endif

                vst1_f32(tmp[0][m], _tmp0);
                vst1_f32(tmp[1][m], _tmp1);
                vst1_f32(tmp[2][m], _tmp2);
                vst1_f32(tmp[3][m], _tmp3);
                vst1_f32(tmp[4][m], _tmp4);
                vst1_f32(tmp[5][m], _tmp5);
#else
                float tmp024a0 = r1[0] + r2[0];
                float tmp024a1 = r1[1] + r2[1];
                float tmp135a0 = r1[0] - r2[0];
                float tmp135a1 = r1[1] - r2[1];
                float tmp024b0 = r3[0] + r4[0];
                float tmp024b1 = r3[1] + r4[1];
                float tmp135b0 = r3[0] - r4[0];
                float tmp135b1 = r3[1] - r4[1];
                float tmp024c0 = r5[0] + r6[0];
                float tmp024c1 = r5[1] + r6[1];
                float tmp135c0 = r5[0] - r6[0];
                float tmp135c1 = r5[1] - r6[1];

                tmp[0][m][0] = r0[0] + tmp024a0 + tmp024b0 + tmp024c0 * 32;
                tmp[0][m][1] = r0[1] + tmp024a1 + tmp024b1 + tmp024c1 * 32;
                tmp[1][m][0] = tmp135a0 + tmp135b0 + tmp135b0 + tmp135c0 * 16;
                tmp[1][m][1] = tmp135a1 + tmp135b1 + tmp135b1 + tmp135c1 * 16;
                tmp[2][m][0] = tmp024a0 + tmp024b0 * 4 + tmp024c0 * 8;
                tmp[2][m][1] = tmp024a1 + tmp024b1 * 4 + tmp024c1 * 8;
                tmp[3][m][0] = tmp135a0 + tmp135b0 * 8 + tmp135c0 * 4;
                tmp[3][m][1] = tmp135a1 + tmp135b1 * 8 + tmp135c1 * 4;
                tmp[4][m][0] = tmp024a0 + tmp024b0 * 16 + tmp024c0 + tmp024c0;
                tmp[4][m][1] = tmp024a1 + tmp024b1 * 16 + tmp024c1 + tmp024c1;
                tmp[5][m][0] = r7[0] + tmp135a0 + tmp135b0 * 32 + tmp135c0;
                tmp[5][m][1] = r7[1] + tmp135a1 + tmp135b1 * 32 + tmp135c1;
#endif

                r0 += max_jj * 8 * 2;
                r1 += max_jj * 8 * 2;
                r2 += max_jj * 8 * 2;
                r3 += max_jj * 8 * 2;
                r4 += max_jj * 8 * 2;
                r5 += max_jj * 8 * 2;
                r6 += max_jj * 8 * 2;
                r7 += max_jj * 8 * 2;
            }

            unsigned short* outptr0 = top_blob.channel(i + ii).row<unsigned short>(ti * 6) + (tj * 6);

            for (int m = 0; m < 6; m++)
            {
                if (ti * 6 + m >= outh)
                    continue;

#if __ARM_NEON
                float32x2_t _r0 = vld1_f32(tmp[m][0]);
                float32x2_t _r1 = vld1_f32(tmp[m][1]);
                float32x2_t _r2 = vld1_f32(tmp[m][2]);
                float32x2_t _r3 = vld1_f32(tmp[m][3]);
                float32x2_t _r4 = vld1_f32(tmp[m][4]);
                float32x2_t _r5 = vld1_f32(tmp[m][5]);
                float32x2_t _r6 = vld1_f32(tmp[m][6]);
                float32x2_t _r7 = vld1_f32(tmp[m][7]);

                float32x2_t _tmp024a = vadd_f32(_r1, _r2);
                float32x2_t _tmp135a = vsub_f32(_r1, _r2);
                float32x2_t _tmp024b = vadd_f32(_r3, _r4);
                float32x2_t _tmp135b = vsub_f32(_r3, _r4);
                float32x2_t _tmp024c = vadd_f32(_r5, _r6);
                float32x2_t _tmp135c = vsub_f32(_r5, _r6);

#if __aarch64__
                float32x2_t _tmp0 = vadd_f32(_bias0, vadd_f32(vadd_f32(_r0, _tmp024a), vfma_laneq_f32(_tmp024b, _tmp024c, _coeffs, 0)));
                float32x2_t _tmp1 = vadd_f32(_bias0, vfma_laneq_f32(vfma_f32(_tmp135a, _tmp135b, _v2), _tmp135c, _coeffs, 1));
                float32x2_t _tmp2 = vadd_f32(_bias0, vfma_laneq_f32(vfma_laneq_f32(_tmp024a, _tmp024b, _coeffs, 3), _tmp024c, _coeffs, 2));
                float32x2_t _tmp3 = vadd_f32(_bias0, vfma_laneq_f32(vfma_laneq_f32(_tmp135a, _tmp135b, _coeffs, 2), _tmp135c, _coeffs, 3));
                float32x2_t _tmp4 = vadd_f32(_bias0, vfma_f32(vfma_laneq_f32(_tmp024a, _tmp024b, _coeffs, 1), _tmp024c, _v2));
                float32x2_t _tmp5 = vadd_f32(_bias0, vadd_f32(vadd_f32(_r7, _tmp135a), vfma_laneq_f32(_tmp135c, _tmp135b, _coeffs, 0)));
#else
                float32x2_t _tmp0 = vadd_f32(_bias0, vadd_f32(vadd_f32(_r0, _tmp024a), vmla_lane_f32(_tmp024b, _tmp024c, vget_low_f32(_coeffs), 0)));
                float32x2_t _tmp1 = vadd_f32(_bias0, vmla_lane_f32(vmla_f32(_tmp135a, _tmp135b, _v2), _tmp135c, vget_low_f32(_coeffs), 1));
                float32x2_t _tmp2 = vadd_f32(_bias0, vmla_lane_f32(vmla_lane_f32(_tmp024a, _tmp024b, vget_high_f32(_coeffs), 1), _tmp024c, vget_high_f32(_coeffs), 0));
                float32x2_t _tmp3 = vadd_f32(_bias0, vmla_lane_f32(vmla_lane_f32(_tmp135a, _tmp135b, vget_high_f32(_coeffs), 0), _tmp135c, vget_high_f32(_coeffs), 1));
                float32x2_t _tmp4 = vadd_f32(_bias0, vmla_f32(vmla_lane_f32(_tmp024a, _tmp024b, vget_low_f32(_coeffs), 1), _tmp024c, _v2));
                float32x2_t _tmp5 = vadd_f32(_bias0, vadd_f32(vadd_f32(_r7, _tmp135a), vmla_lane_f32(_tmp135c, _tmp135b, vget_low_f32(_coeffs), 0)));
#endif
#else
                float r00 = tmp[m][0][0];
                float r01 = tmp[m][0][1];
                float r10 = tmp[m][1][0];
                float r11 = tmp[m][1][1];
                float r20 = tmp[m][2][0];
                float r21 = tmp[m][2][1];
                float r30 = tmp[m][3][0];
                float r31 = tmp[m][3][1];
                float r40 = tmp[m][4][0];
                float r41 = tmp[m][4][1];
                float r50 = tmp[m][5][0];
                float r51 = tmp[m][5][1];
                float r60 = tmp[m][6][0];
                float r61 = tmp[m][6][1];
                float r70 = tmp[m][7][0];
                float r71 = tmp[m][7][1];

                float tmp024a0 = r10 + r20;
                float tmp024a1 = r11 + r21;
                float tmp135a0 = r10 - r20;
                float tmp135a1 = r11 - r21;
                float tmp024b0 = r30 + r40;
                float tmp024b1 = r31 + r41;
                float tmp135b0 = r30 - r40;
                float tmp135b1 = r31 - r41;
                float tmp024c0 = r50 + r60;
                float tmp024c1 = r51 + r61;
                float tmp135c0 = r50 - r60;
                float tmp135c1 = r51 - r61;

                float tmp00 = bias0 + r00 + tmp024a0 + tmp024b0 + tmp024c0 * 32;
                float tmp01 = bias1 + r01 + tmp024a1 + tmp024b1 + tmp024c1 * 32;
                float tmp10 = bias0 + tmp135a0 + tmp135b0 + tmp135b0 + tmp135c0 * 16;
                float tmp11 = bias1 + tmp135a1 + tmp135b1 + tmp135b1 + tmp135c1 * 16;
                float tmp20 = bias0 + tmp024a0 + tmp024b0 * 4 + tmp024c0 * 8;
                float tmp21 = bias1 + tmp024a1 + tmp024b1 * 4 + tmp024c1 * 8;
                float tmp30 = bias0 + tmp135a0 + tmp135b0 * 8 + tmp135c0 * 4;
                float tmp31 = bias1 + tmp135a1 + tmp135b1 * 8 + tmp135c1 * 4;
                float tmp40 = bias0 + tmp024a0 + tmp024b0 * 16 + tmp024c0 + tmp024c0;
                float tmp41 = bias1 + tmp024a1 + tmp024b1 * 16 + tmp024c1 + tmp024c1;
                float tmp50 = bias0 + r70 + tmp135a0 + tmp135b0 * 32 + tmp135c0;
                float tmp51 = bias1 + r71 + tmp135a1 + tmp135b1 * 32 + tmp135c1;
#endif

                // if (out_elempack == 1)
                {
                    unsigned short* outptr1 = outptr0 + N;

#if __ARM_NEON
                    uint16x4_t _tmp01 = float2bfloat(vcombine_f32(_tmp0, _tmp1));
                    uint16x4_t _tmp23 = float2bfloat(vcombine_f32(_tmp2, _tmp3));
                    uint16x4_t _tmp45 = float2bfloat(vcombine_f32(_tmp4, _tmp5));

                    outptr0[0] = vget_lane_u16(_tmp01, 0);
                    outptr1[0] = vget_lane_u16(_tmp01, 1);
                    if (tj * 6 + 1 < outw)
                    {
                        outptr0[1] = vget_lane_u16(_tmp01, 2);
                        outptr1[1] = vget_lane_u16(_tmp01, 3);
                    }
                    if (tj * 6 + 2 < outw)
                    {
                        outptr0[2] = vget_lane_u16(_tmp23, 0);
                        outptr1[2] = vget_lane_u16(_tmp23, 1);
                    }
                    if (tj * 6 + 3 < outw)
                    {
                        outptr0[3] = vget_lane_u16(_tmp23, 2);
                        outptr1[3] = vget_lane_u16(_tmp23, 3);
                    }
                    if (tj * 6 + 4 < outw)
                    {
                        outptr0[4] = vget_lane_u16(_tmp45, 0);
                        outptr1[4] = vget_lane_u16(_tmp45, 1);
                    }
                    if (tj * 6 + 5 < outw)
                    {
                        outptr0[5] = vget_lane_u16(_tmp45, 2);
                        outptr1[5] = vget_lane_u16(_tmp45, 3);
                    }
#else
                    outptr0[0] = float32_to_bfloat16(tmp00);
                    outptr1[0] = float32_to_bfloat16(tmp01);
                    if (tj * 6 + 1 < outw)
                    {
                        outptr0[1] = float32_to_bfloat16(tmp10);
                        outptr1[1] = float32_to_bfloat16(tmp11);
                    }
                    if (tj * 6 + 2 < outw)
                    {
                        outptr0[2] = float32_to_bfloat16(tmp20);
                        outptr1[2] = float32_to_bfloat16(tmp21);
                    }
                    if (tj * 6 + 3 < outw)
                    {
                        outptr0[3] = float32_to_bfloat16(tmp30);
                        outptr1[3] = float32_to_bfloat16(tmp31);
                    }
                    if (tj * 6 + 4 < outw)
                    {
                        outptr0[4] = float32_to_bfloat16(tmp40);
                        outptr1[4] = float32_to_bfloat16(tmp41);
                    }
                    if (tj * 6 + 5 < outw)
                    {
                        outptr0[5] = float32_to_bfloat16(tmp50);
                        outptr1[5] = float32_to_bfloat16(tmp51);
                    }
#endif
                }

                outptr0 += outw;
            }
        }
    }
    for (; ii < max_ii; ii++)
    {
        float bias0 = biasptr ? biasptr[i + ii] : 0.f;

        float tmp[6][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const float* r0 = (const float*)top_tile + ii * max_jj * 64 + jj;
            const float* r1 = r0 + max_jj;
            const float* r2 = r0 + max_jj * 2;
            const float* r3 = r0 + max_jj * 3;
            const float* r4 = r0 + max_jj * 4;
            const float* r5 = r0 + max_jj * 5;
            const float* r6 = r0 + max_jj * 6;
            const float* r7 = r0 + max_jj * 7;

            for (int m = 0; m < 8; m++)
            {
                float tmp024a = r1[0] + r2[0];
                float tmp135a = r1[0] - r2[0];
                float tmp024b = r3[0] + r4[0];
                float tmp135b = r3[0] - r4[0];
                float tmp024c = r5[0] + r6[0];
                float tmp135c = r5[0] - r6[0];

                tmp[0][m] = r0[0] + tmp024a + tmp024b + tmp024c * 32;
                tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * 16;
                tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 8;
                tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4;
                tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c + tmp024c;
                tmp[5][m] = r7[0] + tmp135a + tmp135b * 32 + tmp135c;

                r0 += max_jj * 8;
                r1 += max_jj * 8;
                r2 += max_jj * 8;
                r3 += max_jj * 8;
                r4 += max_jj * 8;
                r5 += max_jj * 8;
                r6 += max_jj * 8;
                r7 += max_jj * 8;
            }

            unsigned short* outptr0 = top_blob.channel(i + ii).row<unsigned short>(ti * 6) + (tj * 6);

            for (int m = 0; m < 6; m++)
            {
                if (ti * 6 + m >= outh)
                    continue;

                float r0 = tmp[m][0];
                float r1 = tmp[m][1];
                float r2 = tmp[m][2];
                float r3 = tmp[m][3];
                float r4 = tmp[m][4];
                float r5 = tmp[m][5];
                float r6 = tmp[m][6];
                float r7 = tmp[m][7];

                float tmp024a = r1 + r2;
                float tmp135a = r1 - r2;
                float tmp024b = r3 + r4;
                float tmp135b = r3 - r4;
                float tmp024c = r5 + r6;
                float tmp135c = r5 - r6;

                float tmp0 = bias0 + r0 + tmp024a + tmp024b + tmp024c * 32;
                float tmp1 = bias0 + tmp135a + tmp135b + tmp135b + tmp135c * 16;
                float tmp2 = bias0 + tmp024a + tmp024b * 4 + tmp024c * 8;
                float tmp3 = bias0 + tmp135a + tmp135b * 8 + tmp135c * 4;
                float tmp4 = bias0 + tmp024a + tmp024b * 16 + tmp024c + tmp024c;
                float tmp5 = bias0 + r7 + tmp135a + tmp135b * 32 + tmp135c;

                // if (out_elempack == 1)
                {
                    outptr0[0] = float32_to_bfloat16(tmp0);
                    if (tj * 6 + 1 < outw) outptr0[1] = float32_to_bfloat16(tmp1);
                    if (tj * 6 + 2 < outw) outptr0[2] = float32_to_bfloat16(tmp2);
                    if (tj * 6 + 3 < outw) outptr0[3] = float32_to_bfloat16(tmp3);
                    if (tj * 6 + 4 < outw) outptr0[4] = float32_to_bfloat16(tmp4);
                    if (tj * 6 + 5 < outw) outptr0[5] = float32_to_bfloat16(tmp5);
                }

                outptr0 += outw;
            }
        }
    }
}

static int conv3x3s1_winograd63_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
    int outw = top_blob.w;
    int outh = top_blob.h;

    // pad to 6n+2, winograd F(6,3)
    int w_tiles = (outw + 5) / 6;
    int h_tiles = (outh + 5) / 6;
    int tiles = w_tiles * h_tiles;

    const int M = top_blob.c * top_blob.elempack;
    const int N = tiles;
    const int K = bottom_blob.c * bottom_blob.elempack;
    const int B = 64;

    // NCNN_LOGE("conv3x3s1_winograd63_bf16s %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    conv3x3s1_winograd_get_optimal_tile_mnk(M, N, K, B, TILE_M, TILE_N, TILE_K, nT);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

    Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    if (nT > 1 && nn_NK < nT)
    {
        Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator);
        if (B_tile.empty())
            return -100;

        for (int ppjk = 0; ppjk < nn_NK; ppjk++)
        {
            const int ppj = ppjk / nn_K;
            const int ppk = ppjk % nn_K;

            const int j = ppj * TILE_N;
            const int k = ppk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            // transform input
            conv3x3s1_winograd63_transform_input_tile_bf16s(bottom_blob, B_tile, j, max_jj, k, max_kk, nT);

            Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

            conv3x3s1_winograd_transpose_pack_B_tile(B_tile, BT_tile, B, max_jj, max_kk, nT);
        }
    }
    else
    {
        Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator);
        if (B_tileX.empty())
            return -100;

        #pragma omp parallel for num_threads(nT)
        for (int ppjk = 0; ppjk < nn_NK; ppjk++)
        {
            const int ppj = ppjk / nn_K;
            const int ppk = ppjk % nn_K;

            const int j = ppj * TILE_N;
            const int k = ppk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            Mat B_tile = B_tileX.channel(get_omp_thread_num());

            // transform input
            conv3x3s1_winograd63_transform_input_tile_bf16s(bottom_blob, B_tile, j, max_jj, k, max_kk, 1);

            Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

            conv3x3s1_winograd_transpose_pack_B_tile(B_tile, BT_tile, B, max_jj, max_kk, 1);
        }
    }

    Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
    if (top_tileX.empty())
        return -100;

    #pragma omp parallel for num_threads(nT)
    for (int ppj = 0; ppj < nn_M; ppj++)
    {
        const int i = ppj * TILE_M;

        Mat top_tile = top_tileX.channel(get_omp_thread_num());

        const int max_ii = std::min((M - i), TILE_M);

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                const Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K);

                const Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

                conv3x3s1_winograd_gemm_transB_packed_tile(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk, opt.use_a53_a55_optimized_kernel);
            }

            // transform output
            conv3x3s1_winograd63_transform_output_tile_bf16s(top_tile, top_blob, bias, i, max_ii, j, max_jj);
        }
    }

    return 0;
}


================================================
FILE: src/layer/arm/convolution_3x3_winograd_fp16s.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Repack A for the batched winograd gemm, treating its storage as 16-bit elements.
//
// A is addressed as (ii * max_kk + kk) * batch + b. Row b of AT receives, for
// each group of consecutive ii rows (groups of 8 first, then 4, then 2, then
// single leftover rows) and for each kk, the values of that group stored
// back to back.
static void conv3x3s1_winograd_pack_A_tile_fp16(const Mat& A, Mat& AT, int batch, int max_ii, int max_kk)
{
    // group widths, tried from widest to narrowest
    static const int group_sizes[4] = {8, 4, 2, 1};

    // distance between two consecutive ii rows of A
    const int row_stride = max_kk * batch;

    const unsigned short* src = (const unsigned short*)A;

    for (int b = 0; b < batch; b++)
    {
        unsigned short* dst = AT.row<unsigned short>(b);

        // ii carries over from one group width to the next
        int ii = 0;
        for (int g = 0; g < 4; g++)
        {
            const int pack = group_sizes[g];

            // consume every full group of this width that still fits
            for (; ii + pack <= max_ii; ii += pack)
            {
                const unsigned short* p0 = src + ii * row_stride + b;

                for (int kk = 0; kk < max_kk; kk++)
                {
                    // gather one value from each row of the group
                    for (int r = 0; r < pack; r++)
                    {
                        dst[r] = p0[r * row_stride];
                    }

                    p0 += batch;
                    dst += pack;
                }
            }
        }
    }
}

static void conv3x3s1_winograd_transpose_pack_B_tile_fp16(const Mat& B, Mat& BT, int batch, int max_jj, int max_kk, int nT)
{
    #pragma omp parallel for num_threads(nT)
    for (int b = 0; b < batch; b++)
    {
        unsigned short* pp = BT.row<unsigned short>(b);

        int jj = 0;
        for (; jj + 11 < max_jj; jj += 12)
        {
            const unsigned short* p0 = B;

            int kk = 0;
            p0 += (b * max_jj + jj) * 8;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // transpose 8x12
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v4.8h, v5.8h, v6.8h, v7.8h}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v8.8h, v9.8h, v10.8h, v11.8h}, [%0] \n"

                    "uzp1   v12.8h, v0.8h, v4.8h        \n"
                    "uzp2   v16.8h, v0.8h, v4.8h        \n"
                    "uzp1   v13.8h, v1.8h, v5.8h        \n"
                    "uzp2   v17.8h, v1.8h, v5.8h        \n"
                    "uzp1   v14.8h, v2.8h, v6.8h        \n"
                    "uzp2   v18.8h, v2.8h, v6.8h        \n"
                    "uzp1   v15.8h, v3.8h, v7.8h        \n"
                    "uzp2   v19.8h, v3.8h, v7.8h        \n"
                    "uzp1   v20.8h, v8.8h, v9.8h        \n"
                    "uzp2   v22.8h, v8.8h, v9.8h        \n"
                    "uzp1   v21.8h, v10.8h, v11.8h      \n"
                    "uzp2   v23.8h, v10.8h, v11.8h      \n"

                    "sub    %0, %0, #128                \n"

                    "ext    v24.16b, v20.16b, v20.16b, #8 \n"
                    "ext    v26.16b, v22.16b, v22.16b, #8 \n"
                    "ext    v25.16b, v21.16b, v21.16b, #8 \n"
                    "ext    v27.16b, v23.16b, v23.16b, #8 \n"

                    "st1    {v12.8h}, [%1], #16         \n"
                    "st1    {v20.4h}, [%1], #8          \n"
                    "st1    {v13.8h}, [%1], #16         \n"
                    "st1    {v24.4h}, [%1], #8          \n"
                    "st1    {v14.8h}, [%1], #16         \n"
                    "st1    {v21.4h}, [%1], #8          \n"
                    "st1    {v15.8h}, [%1], #16         \n"
                    "st1    {v25.4h}, [%1], #8          \n"
                    "st1    {v16.8h}, [%1], #16         \n"
                    "st1    {v22.4h}, [%1], #8          \n"
                    "st1    {v17.8h}, [%1], #16         \n"
                    "st1    {v26.4h}, [%1], #8          \n"
                    "st1    {v18.8h}, [%1], #16         \n"
                    "st1    {v23.4h}, [%1], #8          \n"
                    "st1    {v19.8h}, [%1], #16         \n"
                    "st1    {v27.4h}, [%1], #8          \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
                p0 += max_jj * batch * 8;
#else  // NCNN_GNU_INLINE_ASM
                uint16x8x4_t _r0 = vld4q_u16(p0);
                uint16x8x4_t _r1 = vld4q_u16(p0 + 32);
                uint16x8x4_t _r2 = vld4q_u16(p0 + 64);
                uint16x8x2_t _r04lm = vuzpq_u16(_r0.val[0], _r1.val[0]);
                uint16x8x2_t _r15lm = vuzpq_u16(_r0.val[1], _r1.val[1]);
                uint16x8x2_t _r26lm = vuzpq_u16(_r0.val[2], _r1.val[2]);
                uint16x8x2_t _r37lm = vuzpq_u16(_r0.val[3], _r1.val[3]);
                uint16x8x2_t _r0145h = vuzpq_u16(_r2.val[0], _r2.val[1]);
                uint16x8x2_t _r2367h = vuzpq_u16(_r2.val[2], _r2.val[3]);
                vst1q_u16(pp, _r04lm.val[0]);
                vst1_u16(pp + 8, vget_low_u16(_r0145h.val[0]));
                vst1q_u16(pp + 12, _r15lm.val[0]);
                vst1_u16(pp + 20, vget_high_u16(_r0145h.val[0]));
                vst1q_u16(pp + 24, _r26lm.val[0]);
                vst1_u16(pp + 32, vget_low_u16(_r2367h.val[0]));
                vst1q_u16(pp + 36, _r37lm.val[0]);
                vst1_u16(pp + 44, vget_high_u16(_r2367h.val[0]));
                vst1q_u16(pp + 48, _r04lm.val[1]);
                vst1_u16(pp + 56, vget_low_u16(_r0145h.val[1]));
                vst1q_u16(pp + 60, _r15lm.val[1]);
                vst1_u16(pp + 68, vget_high_u16(_r0145h.val[1]));
                vst1q_u16(pp + 72, _r26lm.val[1]);
                vst1_u16(pp + 80, vget_low_u16(_r2367h.val[1]));
                vst1q_u16(pp + 84, _r37lm.val[1]);
                vst1_u16(pp + 92, vget_high_u16(_r2367h.val[1]));
                p0 += max_jj * batch * 8;
                pp += 96;
#endif // NCNN_GNU_INLINE_ASM
            }
            p0 -= (b * max_jj + jj) * 8;
            p0 += (b * max_jj + jj) * 4;
            for (; kk + 3 < max_kk; kk += 4)
            {
                // transpose 4x12
                uint16x8x4_t _r01 = vld4q_u16(p0);
                uint16x4x4_t _r2 = vld4_u16(p0 + 32);
                vst1q_u16(pp, _r01.val[0]);
                vst1_u16(pp + 8, _r2.val[0]);
                vst1q_u16(pp + 12, _r01.val[1]);
                vst1_u16(pp + 20, _r2.val[1]);
                vst1q_u16(pp + 24, _r01.val[2]);
                vst1_u16(pp + 32, _r2.val[2]);
                vst1q_u16(pp + 36, _r01.val[3]);
                vst1_u16(pp + 44, _r2.val[3]);
                p0 += max_jj * batch * 4;
                pp += 48;
            }
            p0 -= (b * max_jj + jj) * 2;
            for (; kk + 1 < max_kk; kk += 2)
            {
                // transpose 2x12
                uint16x8x2_t _r01 = vld2q_u16(p0);
                uint16x4x2_t _r2 = vld2_u16(p0 + 16);
                vst1q_u16(pp, _r01.val[0]);
                vst1_u16(pp + 8, _r2.val[0]);
                vst1q_u16(pp + 12, _r01.val[1]);
                vst1_u16(pp + 20, _r2.val[1]);
                p0 += max_jj * batch * 2;
                pp += 24;
            }
            p0 -= (b * max_jj + jj);
            for (; kk < max_kk; kk++)
            {
                uint16x8_t _r01 = vld1q_u16(p0);
                uint16x4_t _r2 = vld1_u16(p0 + 8);
                vst1q_u16(pp, _r01);
                vst1_u16(pp + 8, _r2);
                p0 += max_jj * batch;
                pp += 12;
            }
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            const unsigned short* p0 = B;

            int kk = 0;
            p0 += (b * max_jj + jj) * 8;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // transpose 8x8
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v4.8h, v5.8h, v6.8h, v7.8h}, [%0] \n"

                    "uzp1   v8.8h, v0.8h, v4.8h         \n"
                    "uzp2   v12.8h, v0.8h, v4.8h        \n"
                    "uzp1   v9.8h, v1.8h, v5.8h         \n"
                    "uzp2   v13.8h, v1.8h, v5.8h        \n"

                    "sub    %0, %0, #64                 \n"

                    "uzp1   v10.8h, v2.8h, v6.8h        \n"
                    "uzp2   v14.8h, v2.8h, v6.8h        \n"
                    "uzp1   v11.8h, v3.8h, v7.8h        \n"
                    "uzp2   v15.8h, v3.8h, v7.8h        \n"

                    "st1    {v8.8h, v9.8h, v10.8h, v11.8h}, [%1], #64 \n"
                    "st1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                p0 += max_jj * batch * 8;
#else  // NCNN_GNU_INLINE_ASM
                uint16x8x4_t _r0 = vld4q_u16(p0);
                uint16x8x4_t _r1 = vld4q_u16(p0 + 32);
                uint16x8x2_t _r04 = vuzpq_u16(_r0.val[0], _r1.val[0]);
                uint16x8x2_t _r15 = vuzpq_u16(_r0.val[1], _r1.val[1]);
                uint16x8x2_t _r26 = vuzpq_u16(_r0.val[2], _r1.val[2]);
                uint16x8x2_t _r37 = vuzpq_u16(_r0.val[3], _r1.val[3]);
                vst1q_u16(pp, _r04.val[0]);
                vst1q_u16(pp + 8, _r15.val[0]);
                vst1q_u16(pp + 8 * 2, _r26.val[0]);
                vst1q_u16(pp + 8 * 3, _r37.val[0]);
                vst1q_u16(pp + 8 * 4, _r04.val[1]);
                vst1q_u16(pp + 8 * 5, _r15.val[1]);
                vst1q_u16(pp + 8 * 6, _r26.val[1]);
                vst1q_u16(pp + 8 * 7, _r37.val[1]);
                p0 += max_jj * batch * 8;
                pp += 64;
#endif // NCNN_GNU_INLINE_ASM
            }
            p0 -= (b * max_jj + jj) * 8;
            p0 += (b * max_jj + jj) * 4;
            for (; kk + 3 < max_kk; kk += 4)
            {
                // transpose 4x8
                uint16x8x4_t _r0 = vld4q_u16(p0);
                vst1q_u16(pp, _r0.val[0]);
                vst1q_u16(pp + 8, _r0.val[1]);
                vst1q_u16(pp + 16, _r0.val[2]);
                vst1q_u16(pp + 24, _r0.val[3]);
                p0 += max_jj * batch * 4;
                pp += 32;
            }
            p0 -= (b * max_jj + jj) * 4;
            p0 += (b * max_jj + jj) * 2;
            for (; kk + 1 < max_kk; kk += 2)
            {
                // transpose 2x8
                uint16x8x2_t _r0 = vld2q_u16(p0);
                vst1q_u16(pp, _r0.val[0]);
                vst1q_u16(pp + 8, _r0.val[1]);
                p0 += max_jj * batch * 2;
                pp += 16;
            }
            p0 -= (b * max_jj + jj) * 2;
            p0 += (b * max_jj + jj);
            for (; kk < max_kk; kk++)
            {
                uint16x8_t _r0 = vld1q_u16(p0);
                vst1q_u16(pp, _r0);
                p0 += max_jj * batch;
                pp += 8;
            }
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            const unsigned short* p0 = B;

            int kk = 0;
            p0 += (b * max_jj + jj) * 8;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // transpose 8x4
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0] \n"
                    "st4    {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3");
                p0 += max_jj * batch * 8;
#else  // NCNN_GNU_INLINE_ASM
                uint16x8x4_t _r0;
                _r0.val[0] = vld1q_u16(p0);
                _r0.val[1] = vld1q_u16(p0 + 8);
                _r0.val[2] = vld1q_u16(p0 + 16);
                _r0.val[3] = vld1q_u16(p0 + 24);
                vst4q_u16(pp, _r0);
                p0 += max_jj * batch * 8;
                pp += 32;
#endif // NCNN_GNU_INLINE_ASM
            }
            p0 -= (b * max_jj + jj) * 8;
            p0 += (b * max_jj + jj) * 4;
            for (; kk + 3 < max_kk; kk += 4)
            {
                // transpose 4x4
                uint16x4x4_t _r0;
                _r0.val[0] = vld1_u16(p0);
                _r0.val[1] = vld1_u16(p0 + 4);
                _r0.val[2] = vld1_u16(p0 + 8);
                _r0.val[3] = vld1_u16(p0 + 12);
                vst4_u16(pp, _r0);
                p0 += max_jj * batch * 4;
                pp += 16;
            }
            p0 -= (b * max_jj + jj) * 4;
            p0 += (b * max_jj + jj) * 2;
            for (; kk + 1 < max_kk; kk += 2)
            {
                // transpose 2x4
                uint16x4x2_t _r0 = vld2_u16(p0);
                vst1_u16(pp, _r0.val[0]);
                vst1_u16(pp + 4, _r0.val[1]);
                p0 += max_jj * batch * 2;
                pp += 8;
            }
            p0 -= (b * max_jj + jj) * 2;
            p0 += (b * max_jj + jj);
            for (; kk < max_kk; kk++)
            {
                uint16x4_t _r0 = vld1_u16(p0);
                vst1_u16(pp, _r0);
                p0 += max_jj * batch;
                pp += 4;
            }
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            const unsigned short* p0 = B;

            int kk = 0;
            p0 += (b * max_jj + jj) * 8;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // transpose 8x2
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "prfm   pldl1keep, [%0, #256]       \n"
                    "ld1    {v0.8h, v1.8h}, [%0]        \n"
                    "st2    {v0.8h, v1.8h}, [%1], #32   \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1");
                p0 += max_jj * batch * 8;
#else  // NCNN_GNU_INLINE_ASM
                uint16x8x2_t _r0;
                _r0.val[0] = vld1q_u16(p0);
                _r0.val[1] = vld1q_u16(p0 + 8);
                vst2q_u16(pp, _r0);
                p0 += max_jj * batch * 8;
                pp += 16;
#endif // NCNN_GNU_INLINE_ASM
            }
            p0 -= (b * max_jj + jj) * 8;
            p0 += (b * max_jj + jj) * 4;
            for (; kk + 3 < max_kk; kk += 4)
            {
                // transpose 4x2
                uint16x4x2_t _r0;
                _r0.val[0] = vld1_u16(p0);
                _r0.val[1] = vld1_u16(p0 + 4);
                vst2_u16(pp, _r0);
                p0 += max_jj * batch * 4;
                pp += 8;
            }
            p0 -= (b * max_jj + jj) * 4;
            p0 += (b * max_jj + jj) * 2;
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = p0[0];
                pp[1] = p0[2];
                pp[2] = p0[1];
                pp[3] = p0[3];
                p0 += max_jj * batch * 2;
                pp += 4;
            }
            p0 -= (b * max_jj + jj) * 2;
            p0 += (b * max_jj + jj);
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p0[1];
                p0 += max_jj * batch;
                pp += 2;
            }
        }
        for (; jj < max_jj; jj++)
        {
            const unsigned short* p0 = B;

            int kk = 0;
            p0 += (b * max_jj + jj) * 8;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _r0 = vld1q_u16(p0);
                vst1q_u16(pp, _r0);
                p0 += max_jj * batch * 8;
                pp += 8;
            }
            p0 -= (b * max_jj + jj) * 8;
            p0 += (b * max_jj + jj) * 4;
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x4_t _r0 = vld1_u16(p0);
                vst1_u16(pp, _r0);
                p0 += max_jj * batch * 4;
                pp += 4;
            }
            p0 -= (b * max_jj + jj) * 4;
            p0 += (b * max_jj + jj) * 2;
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = p0[0];
                pp[1] = p0[1];
                p0 += max_jj * batch * 2;
                pp += 2;
            }
            p0 -= (b * max_jj + jj) * 2;
            p0 += (b * max_jj + jj);
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                p0 += max_jj * batch;
                pp += 1;
            }
        }
    }
}

static void conv3x3s1_winograd_gemm_transB_packed_tile_fp16sa(const Mat& AT_tile, const Mat& BT_tile, Mat& top_blob, int batch, int max_ii, int max_jj, int k, int max_kk, int use_a53_a55_optimized_kernel)
{
    // NCNN_LOGE("conv3x3s1_winograd_gemm_transB_packed_tile_fp16sa %d %d %d", max_ii, max_jj, max_kk);
    __fp16* outptr = top_blob;

    int ii = 0;
    for (; ii + 7 < max_ii; ii += 8)
    {
        for (int b = 0; b < batch; b++)
        {
            const __fp16* pAT = AT_tile.row<const __fp16>(b) + max_kk * ii;
            const __fp16* pB = BT_tile.row<const __fp16>(b);

            int jj = 0;
            for (; jj + 11 < max_jj; jj += 12)
            {
                const __fp16* pA = pAT;

#if NCNN_GNU_INLINE_ASM
                if (use_a53_a55_optimized_kernel)
                {
                    asm volatile(
                        "cbz    %w7, 0f                     \n"

                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%0], #64 \n"
                        "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                        "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0]      \n"
                        "subs   %0, %0, #128                \n"
                        "b      1f                          \n"

                        "0:                                 \n"
                        "eor    v20.16b, v20.16b, v20.16b   \n"
                        "eor    v21.16b, v21.16b, v21.16b   \n"
                        "eor    v22.16b, v22.16b, v22.16b   \n"
                        "eor    v23.16b, v23.16b, v23.16b   \n"
                        "eor    v24.16b, v24.16b, v24.16b   \n"
                        "eor    v25.16b, v25.16b, v25.16b   \n"
                        "eor    v26.16b, v26.16b, v26.16b   \n"
                        "eor    v27.16b, v27.16b, v27.16b   \n"
                        "eor    v28.16b, v28.16b, v28.16b   \n"
                        "eor    v29.16b, v29.16b, v29.16b   \n"
                        "eor    v30.16b, v30.16b, v30.16b   \n"
                        "eor    v31.16b, v31.16b, v31.16b   \n"

                        "1:                                 \n"
                        "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                        "cmp    w4, #0                      \n"
                        "beq    3f                          \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v4.8h}, [%1], #16          \n"
                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.8h}, [%2], #16          \n"

                        "ldr    d1, [%2], #8                \n"
                        "ldr    x21, [%2], #8               \n"

                        ".align 4                           \n"
                        "2:                                 \n"
                        "ldr    d5, [%1], #8                \n"
                        "fmla   v20.8h, v4.8h, v0.h[0]      \n"
                        "ldr    x25, [%1], #8               \n"
                        "fmla   v21.8h, v4.8h, v0.h[1]      \n"
                        "ldr    d2, [%2], #8                \n"
                        "fmla   v22.8h, v4.8h, v0.h[2]      \n"
                        "ldr    x22, [%2], #8               \n"
                        "fmla   v23.8h, v4.8h, v0.h[3]      \n"
                        "ldr    d6, [%1], #8                \n"
                        "fmla   v24.8h, v4.8h, v0.h[4]      \n"
                        "ldr    x26, [%1], #8               \n"
                        "fmla   v25.8h, v4.8h, v0.h[5]      \n"
                        "ins    v1.d[1], x21                \n"
                        "fmla   v26.8h, v4.8h, v0.h[6]      \n"
                        "ldr    d3, [%2], #8                \n"
                        "fmla   v27.8h, v4.8h, v0.h[7]      \n"
                        "ldr    x23, [%2], #8               \n"
                        "fmla   v28.8h, v4.8h, v1.h[0]      \n"
                        "prfm   pldl1keep, [%2, #256]       \n" // NOTE PRELOAD
                        "fmla   v29.8h, v4.8h, v1.h[1]      \n"
                        "ins    v5.d[1], x25                \n"
                        "fmla   v30.8h, v4.8h, v1.h[2]      \n"
                        "ldr    d8, [%2], #8                \n"
                        "fmla   v31.8h, v4.8h, v1.h[3]      \n"
                        "ldr    x20, [%2], #8               \n"
                        "fmla   v20.8h, v5.8h, v1.h[4]      \n"
                        "ldr    d7, [%1], #8                \n"
                        "fmla   v21.8h, v5.8h, v1.h[5]      \n"
                        "ins    v2.d[1], x22                \n"
                        "fmla   v22.8h, v5.8h, v1.h[6]      \n"
                        "ldr    x27, [%1], #8               \n"
                        "fmla   v23.8h, v5.8h, v1.h[7]      \n"
                        "ldr    d9, [%2], #8                \n"
                        "fmla   v24.8h, v5.8h, v2.h[0]      \n"
                        "ldr    x21, [%2], #8               \n"
                        "fmla   v25.8h, v5.8h, v2.h[1]      \n"
                        "ins    v6.d[1], x26                \n"
                        "fmla   v26.8h, v5.8h, v2.h[2]      \n"
                        "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                        "fmla   v27.8h, v5.8h, v2.h[3]      \n"
                        "ldr    d4, [%1], #8                \n"
                        "fmla   v28.8h, v5.8h, v2.h[4]      \n"
                        "ldr    x24, [%1], #8               \n"
                        "fmla   v29.8h, v5.8h, v2.h[5]      \n"
                        "ins    v3.d[1], x23                \n"
                        "fmla   v30.8h, v5.8h, v2.h[6]      \n"
                        "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                        "fmla   v31.8h, v5.8h, v2.h[7]      \n"
                        "ldr    d0, [%2], #8                \n"
                        "fmla   v20.8h, v6.8h, v3.h[0]      \n"
                        "fmla   v21.8h, v6.8h, v3.h[1]      \n"
                        "fmla   v22.8h, v6.8h, v3.h[2]      \n"
                        "fmla   v23.8h, v6.8h, v3.h[3]      \n"
                        "fmla   v24.8h, v6.8h, v3.h[4]      \n"
                        "fmla   v25.8h, v6.8h, v3.h[5]      \n"
                        "ins    v8.d[1], x20                \n"
                        "fmla   v26.8h, v6.8h, v3.h[6]      \n"
                        "ldr    x20, [%2], #8               \n"
                        "fmla   v27.8h, v6.8h, v3.h[7]      \n"
                        "ldr    d1, [%2], #8                \n"
                        "fmla   v28.8h, v6.8h, v8.h[0]      \n"
                        "fmla   v29.8h, v6.8h, v8.h[1]      \n"
                        "ins    v7.d[1], x27                \n"
                        "fmla   v30.8h, v6.8h, v8.h[2]      \n"
                        "fmla   v31.8h, v6.8h, v8.h[3]      \n"
                        "fmla   v20.8h, v7.8h, v8.h[4]      \n"
                        "fmla   v21.8h, v7.8h, v8.h[5]      \n"
                        "ins    v9.d[1], x21                \n"
                        "fmla   v22.8h, v7.8h, v8.h[6]      \n"
                        "fmla   v23.8h, v7.8h, v8.h[7]      \n"
                        "ldr    x21, [%2], #8               \n"
                        "fmla   v24.8h, v7.8h, v9.h[0]      \n"
                        "fmla   v25.8h, v7.8h, v9.h[1]      \n"
                        "ins    v4.d[1], x24                \n"
                        "fmla   v26.8h, v7.8h, v9.h[2]      \n"
                        "fmla   v27.8h, v7.8h, v9.h[3]      \n"
                        "subs   w4, w4, #1                  \n"
                        "fmla   v28.8h, v7.8h, v9.h[4]      \n"
                        "fmla   v29.8h, v7.8h, v9.h[5]      \n"
                        "fmla   v30.8h, v7.8h, v9.h[6]      \n"
                        "ins    v0.d[1], x20                \n"
                        "fmla   v31.8h, v7.8h, v9.h[7]      \n"
                        "bne    2b                          \n"

                        "sub    %1, %1, #16                 \n"
                        "sub    %2, %2, #32                 \n"

                        "3:                                 \n"
                        "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                        "cmp    w4, #0                      \n"
                        "beq    5f                          \n"

                        "4:                                 \n"
                        "ld1    {v0.4h, v1.4h, v2.4h}, [%2], #24 \n"
                        "ld1    {v4.8h}, [%1], #16          \n"
                        "fmla   v20.8h, v4.8h, v0.h[0]      \n"
                        "fmla   v21.8h, v4.8h, v0.h[1]      \n"
                        "fmla   v22.8h, v4.8h, v0.h[2]      \n"
                        "fmla   v23.8h, v4.8h, v0.h[3]      \n"
                        "fmla   v24.8h, v4.8h, v1.h[0]      \n"
                        "fmla   v25.8h, v4.8h, v1.h[1]      \n"
                        "fmla   v26.8h, v4.8h, v1.h[2]      \n"
                        "fmla   v27.8h, v4.8h, v1.h[3]      \n"
                        "subs   w4, w4, #1                  \n"
                        "fmla   v28.8h, v4.8h, v2.h[0]      \n"
                        "fmla   v29.8h, v4.8h, v2.h[1]      \n"
                        "fmla   v30.8h, v4.8h, v2.h[2]      \n"
                        "fmla   v31.8h, v4.8h, v2.h[3]      \n"
                        "bne    4b                          \n"

                        "5:                                 \n"
                        "st1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%0], #64 \n"
                        "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                        "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                        : "=r"(outptr), // %0
                        "=r"(pA),     // %1
                        "=r"(pB)      // %2
                        : "0"(outptr),
                        "1"(pA),
                        "2"(pB),
                        "r"(max_kk), // %6
                        "r"(k)       // %7
                        : "cc", "memory", "x4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
                else
                {
                    asm volatile(
                        "cbz    %w7, 0f                     \n"

                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%0], #64 \n"
                        "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                        "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0]      \n"
                        "subs   %0, %0, #128                \n"
                        "b      1f                          \n"

                        "0:                                 \n"
                        "eor    v20.16b, v20.16b, v20.16b   \n"
                        "eor    v21.16b, v21.16b, v21.16b   \n"
                        "eor    v22.16b, v22.16b, v22.16b   \n"
                        "eor    v23.16b, v23.16b, v23.16b   \n"
                        "eor    v24.16b, v24.16b, v24.16b   \n"
                        "eor    v25.16b, v25.16b, v25.16b   \n"
                        "eor    v26.16b, v26.16b, v26.16b   \n"
                        "eor    v27.16b, v27.16b, v27.16b   \n"
                        "eor    v28.16b, v28.16b, v28.16b   \n"
                        "eor    v29.16b, v29.16b, v29.16b   \n"
                        "eor    v30.16b, v30.16b, v30.16b   \n"
                        "eor    v31.16b, v31.16b, v31.16b   \n"

                        "1:                                 \n"
                        "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                        "cmp    w4, #0                      \n"
                        "beq    3f                          \n"

                        "2:                                 \n"
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%1], #64 \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n"

                        "fmla   v20.8h, v4.8h, v0.h[0]      \n"
                        "fmla   v21.8h, v4.8h, v0.h[1]      \n"
                        "fmla   v22.8h, v4.8h, v0.h[2]      \n"
                        "fmla   v23.8h, v4.8h, v0.h[3]      \n"
                        "fmla   v24.8h, v4.8h, v0.h[4]      \n"
                        "fmla   v25.8h, v4.8h, v0.h[5]      \n"
                        "fmla   v26.8h, v4.8h, v0.h[6]      \n"
                        "fmla   v27.8h, v4.8h, v0.h[7]      \n"
                        "fmla   v28.8h, v4.8h, v1.h[0]      \n"
                        "fmla   v29.8h, v4.8h, v1.h[1]      \n"
                        "fmla   v30.8h, v4.8h, v1.h[2]      \n"
                        "fmla   v31.8h, v4.8h, v1.h[3]      \n"

                        "fmla   v20.8h, v5.8h, v1.h[4]      \n"
                        "fmla   v21.8h, v5.8h, v1.h[5]      \n"
                        "fmla   v22.8h, v5.8h, v1.h[6]      \n"
                        "fmla   v23.8h, v5.8h, v1.h[7]      \n"
                        "fmla   v24.8h, v5.8h, v2.h[0]      \n"
                        "fmla   v25.8h, v5.8h, v2.h[1]      \n"
                        "fmla   v26.8h, v5.8h, v2.h[2]      \n"
                        "fmla   v27.8h, v5.8h, v2.h[3]      \n"
                        "fmla   v28.8h, v5.8h, v2.h[4]      \n"
                        "fmla   v29.8h, v5.8h, v2.h[5]      \n"
                        "fmla   v30.8h, v5.8h, v2.h[6]      \n"
                        "fmla   v31.8h, v5.8h, v2.h[7]      \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v8.8h, v9.8h}, [%2], #32   \n"

                        "fmla   v20.8h, v6.8h, v3.h[0]      \n"
                        "fmla   v21.8h, v6.8h, v3.h[1]      \n"
                        "fmla   v22.8h, v6.8h, v3.h[2]      \n"
                        "fmla   v23.8h, v6.8h, v3.h[3]      \n"
                        "fmla   v24.8h, v6.8h, v3.h[4]      \n"
                        "fmla   v25.8h, v6.8h, v3.h[5]      \n"
                        "fmla   v26.8h, v6.8h, v3.h[6]      \n"
                        "fmla   v27.8h, v6.8h, v3.h[7]      \n"
                        "fmla   v28.8h, v6.8h, v8.h[0]      \n"
                        "fmla   v29.8h, v6.8h, v8.h[1]      \n"
                        "fmla   v30.8h, v6.8h, v8.h[2]      \n"
                        "fmla   v31.8h, v6.8h, v8.h[3]      \n"

                        "subs   w4, w4, #1                  \n"

                        "fmla   v20.8h, v7.8h, v8.h[4]      \n"
                        "fmla   v21.8h, v7.8h, v8.h[5]      \n"
                        "fmla   v22.8h, v7.8h, v8.h[6]      \n"
                        "fmla   v23.8h, v7.8h, v8.h[7]      \n"
                        "fmla   v24.8h, v7.8h, v9.h[0]      \n"
                        "fmla   v25.8h, v7.8h, v9.h[1]      \n"
                        "fmla   v26.8h, v7.8h, v9.h[2]      \n"
                        "fmla   v27.8h, v7.8h, v9.h[3]      \n"
                        "fmla   v28.8h, v7.8h, v9.h[4]      \n"
                        "fmla   v29.8h, v7.8h, v9.h[5]      \n"
                        "fmla   v30.8h, v7.8h, v9.h[6]      \n"
                        "fmla   v31.8h, v7.8h, v9.h[7]      \n"

                        "bne    2b                          \n"

                        "3:                                 \n"
                        "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                        "cmp    w4, #0                      \n"
                        "beq    5f                          \n"

                        "4:                                 \n"
                        "ld1    {v0.4h, v1.4h, v2.4h}, [%2], #24 \n"
                        "ld1    {v4.8h}, [%1], #16          \n"
                        "fmla   v20.8h, v4.8h, v0.h[0]      \n"
                        "fmla   v21.8h, v4.8h, v0.h[1]      \n"
                        "fmla   v22.8h, v4.8h, v0.h[2]      \n"
                        "fmla   v23.8h, v4.8h, v0.h[3]      \n"
                        "fmla   v24.8h, v4.8h, v1.h[0]      \n"
                        "fmla   v25.8h, v4.8h, v1.h[1]      \n"
                        "fmla   v26.8h, v4.8h, v1.h[2]      \n"
                        "fmla   v27.8h, v4.8h, v1.h[3]      \n"
                        "subs   w4, w4, #1                  \n"
                        "fmla   v28.8h, v4.8h, v2.h[0]      \n"
                        "fmla   v29.8h, v4.8h, v2.h[1]      \n"
                        "fmla   v30.8h, v4.8h, v2.h[2]      \n"
                        "fmla   v31.8h, v4.8h, v2.h[3]      \n"
                        "bne    4b                          \n"

                        "5:                                 \n"
                        "st1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%0], #64 \n"
                        "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                        "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                        : "=r"(outptr), // %0
                        "=r"(pA),     // %1
                        "=r"(pB)      // %2
                        : "0"(outptr),
                        "1"(pA),
                        "2"(pB),
                        "r"(max_kk), // %6
                        "r"(k)       // %7
                        : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
#else  // NCNN_GNU_INLINE_ASM
                float16x8_t _sum0;
                float16x8_t _sum1;
                float16x8_t _sum2;
                float16x8_t _sum3;
                float16x8_t _sum4;
                float16x8_t _sum5;
                float16x8_t _sum6;
                float16x8_t _sum7;
                float16x8_t _sum8;
                float16x8_t _sum9;
                float16x8_t _suma;
                float16x8_t _sumb;

                if (k == 0)
                {
                    _sum0 = vdupq_n_f16(0.f);
                    _sum1 = vdupq_n_f16(0.f);
                    _sum2 = vdupq_n_f16(0.f);
                    _sum3 = vdupq_n_f16(0.f);
                    _sum4 = vdupq_n_f16(0.f);
                    _sum5 = vdupq_n_f16(0.f);
                    _sum6 = vdupq_n_f16(0.f);
                    _sum7 = vdupq_n_f16(0.f);
                    _sum8 = vdupq_n_f16(0.f);
                    _sum9 = vdupq_n_f16(0.f);
                    _suma = vdupq_n_f16(0.f);
                    _sumb = vdupq_n_f16(0.f);
                }
                else
                {
                    _sum0 = vld1q_f16(outptr);
                    _sum1 = vld1q_f16(outptr + 8);
                    _sum2 = vld1q_f16(outptr + 16);
                    _sum3 = vld1q_f16(outptr + 24);
                    _sum4 = vld1q_f16(outptr + 32);
                    _sum5 = vld1q_f16(outptr + 40);
                    _sum6 = vld1q_f16(outptr + 48);
                    _sum7 = vld1q_f16(outptr + 56);
                    _sum8 = vld1q_f16(outptr + 64);
                    _sum9 = vld1q_f16(outptr + 72);
                    _suma = vld1q_f16(outptr + 80);
                    _sumb = vld1q_f16(outptr + 88);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float16x8_t _pA = vld1q_f16(pA);
                    float16x8_t _pB0 = vld1q_f16(pB);
                    float16x4_t _pB2 = vld1_f16(pB + 8);
                    _sum0 = vfmaq_laneq_f16(_sum0, _pA, _pB0, 0);
                    _sum1 = vfmaq_laneq_f16(_sum1, _pA, _pB0, 1);
                    _sum2 = vfmaq_laneq_f16(_sum2, _pA, _pB0, 2);
                    _sum3 = vfmaq_laneq_f16(_sum3, _pA, _pB0, 3);
                    _sum4 = vfmaq_laneq_f16(_sum4, _pA, _pB0, 4);
                    _sum5 = vfmaq_laneq_f16(_sum5, _pA, _pB0, 5);
                    _sum6 = vfmaq_laneq_f16(_sum6, _pA, _pB0, 6);
                    _sum7 = vfmaq_laneq_f16(_sum7, _pA, _pB0, 7);
                    _sum8 = vfmaq_lane_f16(_sum8, _pA, _pB2, 0);
                    _sum9 = vfmaq_lane_f16(_sum9, _pA, _pB2, 1);
                    _suma = vfmaq_lane_f16(_suma, _pA, _pB2, 2);
                    _sumb = vfmaq_lane_f16(_sumb, _pA, _pB2, 3);

                    pA += 8;
                    pB += 12;
                }

                vst1q_f16(outptr, _sum0);
                vst1q_f16(outptr + 8, _sum1);
                vst1q_f16(outptr + 8 * 2, _sum2);
                vst1q_f16(outptr + 8 * 3, _sum3);
                vst1q_f16(outptr + 8 * 4, _sum4);
                vst1q_f16(outptr + 8 * 5, _sum5);
                vst1q_f16(outptr + 8 * 6, _sum6);
                vst1q_f16(outptr + 8 * 7, _sum7);
                vst1q_f16(outptr + 8 * 8, _sum8);
                vst1q_f16(outptr + 8 * 9, _sum9);
                vst1q_f16(outptr + 8 * 10, _suma);
                vst1q_f16(outptr + 8 * 11, _sumb);
                outptr += 8 * 12;
#endif // NCNN_GNU_INLINE_ASM
            }
            // Eight jj positions per pass. Every kk step reads 8 halves from pA and
            // 8 halves from pB and adds pA * pB[lane] into accumulator number `lane`.
            // The 8 accumulators (64 halves) start at zero when k == 0, are reloaded
            // from outptr otherwise, and are written back to outptr at the end.
            for (; jj + 7 < max_jj; jj += 8)
            {
                const __fp16* pA = pAT;

#if NCNN_GNU_INLINE_ASM
                // Two hand-written variants of the same accumulation; the flag selects the
                // one that splits 128-bit loads into 64-bit pieces.
                // NOTE(review): presumably this scheduling targets Cortex-A53/A55 issue
                // rules, as the flag name suggests - confirm against the caller.
                if (use_a53_a55_optimized_kernel)
                {
                    asm volatile(
                        // %w7 is k: when it is zero jump to label 0 and clear v24-v31,
                        // otherwise load the 8 partial sums from %0 (outptr)
                        "cbz    %w7, 0f                     \n"

                        "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                        "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0]      \n"
                        // undo the 64-byte post-increment so the final store targets the same tile
                        "subs   %0, %0, #64                 \n"
                        "b      1f                          \n"

                        "0:                                 \n"
                        "eor    v24.16b, v24.16b, v24.16b   \n"
                        "eor    v25.16b, v25.16b, v25.16b   \n"
                        "eor    v26.16b, v26.16b, v26.16b   \n"
                        "eor    v27.16b, v27.16b, v27.16b   \n"
                        "eor    v28.16b, v28.16b, v28.16b   \n"
                        "eor    v29.16b, v29.16b, v29.16b   \n"
                        "eor    v30.16b, v30.16b, v30.16b   \n"
                        "eor    v31.16b, v31.16b, v31.16b   \n"

                        "1:                                 \n"
                        "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                        "cmp    w4, #0                      \n"
                        "beq    3f                          \n"

                        // loop prologue: first A vector into v4, first B vector into v0,
                        // second A vector as two 64-bit halves (d5 + x25, merged by ins in the loop)
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v4.8h}, [%1], #16          \n"
                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.8h}, [%2], #16          \n"

                        "ldr    d5, [%1], #8                \n"
                        "ldr    x25, [%1], #8               \n"

                        // 4 kk steps per iteration; operands for later steps are fetched as
                        // ldr d / ldr x / ins in between the fmla of the current step, and the
                        // tail of the loop already fetches v4, v5 (low half + x25) and v0 for
                        // the next iteration
                        ".align 4                           \n"
                        "2:                                 \n"
                        "ldr    d1, [%2], #8                \n"
                        "fmla   v24.8h, v4.8h, v0.h[0]      \n"
                        "ldr    x21, [%2], #8               \n"
                        "fmla   v25.8h, v4.8h, v0.h[1]      \n"
                        "ins    v5.d[1], x25                \n"
                        "fmla   v26.8h, v4.8h, v0.h[2]      \n"
                        "ldr    d6, [%1], #8                \n"
                        "fmla   v27.8h, v4.8h, v0.h[3]      \n"
                        "ldr    x26, [%1], #8               \n"
                        "fmla   v28.8h, v4.8h, v0.h[4]      \n"
                        "ldr    d2, [%2], #8                \n"
                        "fmla   v29.8h, v4.8h, v0.h[5]      \n"
                        "ins    v1.d[1], x21                \n"
                        "fmla   v30.8h, v4.8h, v0.h[6]      \n"
                        "ldr    x22, [%2], #8               \n"
                        "fmla   v31.8h, v4.8h, v0.h[7]      \n"
                        "ldr    d7, [%1], #8                \n"
                        "fmla   v24.8h, v5.8h, v1.h[0]      \n"
                        "ldr    x27, [%1], #8               \n"
                        "fmla   v25.8h, v5.8h, v1.h[1]      \n"
                        "ins    v6.d[1], x26                \n"
                        "fmla   v26.8h, v5.8h, v1.h[2]      \n"
                        "ldr    d3, [%2], #8                \n"
                        "fmla   v27.8h, v5.8h, v1.h[3]      \n"
                        "ldr    x23, [%2], #8               \n"
                        "fmla   v28.8h, v5.8h, v1.h[4]      \n"
                        "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                        "fmla   v29.8h, v5.8h, v1.h[5]      \n"
                        "ins    v2.d[1], x22                \n"
                        "fmla   v30.8h, v5.8h, v1.h[6]      \n"
                        "ldr    d4, [%1], #8                \n"
                        "fmla   v31.8h, v5.8h, v1.h[7]      \n"
                        "ldr    x24, [%1], #8               \n"
                        "fmla   v24.8h, v6.8h, v2.h[0]      \n"
                        "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                        "fmla   v25.8h, v6.8h, v2.h[1]      \n"
                        "ins    v7.d[1], x27                \n"
                        "fmla   v26.8h, v6.8h, v2.h[2]      \n"
                        "ldr    d0, [%2], #8                \n"
                        "fmla   v27.8h, v6.8h, v2.h[3]      \n"
                        "ldr    x20, [%2], #8               \n"
                        "fmla   v28.8h, v6.8h, v2.h[4]      \n"
                        "ldr    d5, [%1], #8                \n"
                        "fmla   v29.8h, v6.8h, v2.h[5]      \n"
                        "ins    v3.d[1], x23                \n"
                        "fmla   v30.8h, v6.8h, v2.h[6]      \n"
                        "ldr    x25, [%1], #8               \n"
                        "fmla   v31.8h, v6.8h, v2.h[7]      \n"
                        "fmla   v24.8h, v7.8h, v3.h[0]      \n"
                        "fmla   v25.8h, v7.8h, v3.h[1]      \n"
                        "fmla   v26.8h, v7.8h, v3.h[2]      \n"
                        "ins    v4.d[1], x24                \n"
                        "fmla   v27.8h, v7.8h, v3.h[3]      \n"
                        "fmla   v28.8h, v7.8h, v3.h[4]      \n"
                        "subs   w4, w4, #1                  \n"
                        "fmla   v29.8h, v7.8h, v3.h[5]      \n"
                        "fmla   v30.8h, v7.8h, v3.h[6]      \n"
                        "ins    v0.d[1], x20                \n"
                        "fmla   v31.8h, v7.8h, v3.h[7]      \n"
                        "bne    2b                          \n"

                        // the loop has fetched 32 bytes of A and 16 bytes of B that were not
                        // consumed; step both pointers back so the remainder loop re-reads them
                        // NOTE(review): the last iteration therefore reads past the consumed
                        // range of pA/pB; presumably the tile buffers tolerate that - confirm
                        "sub    %1, %1, #32                 \n"
                        "sub    %2, %2, #16                 \n"

                        "3:                                 \n"
                        "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                        "cmp    w4, #0                      \n"
                        "beq    5f                          \n"

                        // leftover kk steps, one per iteration
                        "4:                                 \n"
                        "ld1    {v0.8h}, [%2], #16          \n"
                        "ld1    {v4.8h}, [%1], #16          \n"
                        "fmla   v24.8h, v4.8h, v0.h[0]      \n"
                        "fmla   v25.8h, v4.8h, v0.h[1]      \n"
                        "fmla   v26.8h, v4.8h, v0.h[2]      \n"
                        "fmla   v27.8h, v4.8h, v0.h[3]      \n"
                        "subs   w4, w4, #1                  \n"
                        "fmla   v28.8h, v4.8h, v0.h[4]      \n"
                        "fmla   v29.8h, v4.8h, v0.h[5]      \n"
                        "fmla   v30.8h, v4.8h, v0.h[6]      \n"
                        "fmla   v31.8h, v4.8h, v0.h[7]      \n"
                        "bne    4b                          \n"

                        // write the 8 accumulators back; %0 (outptr) ends up 128 bytes further on
                        "5:                                 \n"
                        "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                        "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                        : "=r"(outptr), // %0
                        "=r"(pA),     // %1
                        "=r"(pB)      // %2
                        : "0"(outptr),
                        "1"(pA),
                        "2"(pB),
                        "r"(max_kk), // %6
                        "r"(k)       // %7
                        : "cc", "memory", "x4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
                else
                {
                    asm volatile(
                        // %w7 is k: when it is zero jump to label 0 and clear v24-v31,
                        // otherwise load the 8 partial sums from %0 (outptr)
                        "cbz    %w7, 0f                     \n"

                        "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                        "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0]      \n"
                        // undo the 64-byte post-increment so the final store targets the same tile
                        "subs   %0, %0, #64                 \n"
                        "b      1f                          \n"

                        "0:                                 \n"
                        "eor    v24.16b, v24.16b, v24.16b   \n"
                        "eor    v25.16b, v25.16b, v25.16b   \n"
                        "eor    v26.16b, v26.16b, v26.16b   \n"
                        "eor    v27.16b, v27.16b, v27.16b   \n"
                        "eor    v28.16b, v28.16b, v28.16b   \n"
                        "eor    v29.16b, v29.16b, v29.16b   \n"
                        "eor    v30.16b, v30.16b, v30.16b   \n"
                        "eor    v31.16b, v31.16b, v31.16b   \n"

                        "1:                                 \n"
                        "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                        "cmp    w4, #0                      \n"
                        "beq    3f                          \n"

                        // 4 kk steps per iteration: v4-v7 hold four consecutive A vectors,
                        // v0-v3 the four matching B vectors
                        "2:                                 \n"
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%1], #64 \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n"

                        "fmla   v24.8h, v4.8h, v0.h[0]      \n"
                        "fmla   v25.8h, v4.8h, v0.h[1]      \n"
                        "fmla   v26.8h, v4.8h, v0.h[2]      \n"
                        "fmla   v27.8h, v4.8h, v0.h[3]      \n"
                        "fmla   v28.8h, v4.8h, v0.h[4]      \n"
                        "fmla   v29.8h, v4.8h, v0.h[5]      \n"
                        "fmla   v30.8h, v4.8h, v0.h[6]      \n"
                        "fmla   v31.8h, v4.8h, v0.h[7]      \n"

                        "fmla   v24.8h, v5.8h, v1.h[0]      \n"
                        "fmla   v25.8h, v5.8h, v1.h[1]      \n"
                        "fmla   v26.8h, v5.8h, v1.h[2]      \n"
                        "fmla   v27.8h, v5.8h, v1.h[3]      \n"
                        "fmla   v28.8h, v5.8h, v1.h[4]      \n"
                        "fmla   v29.8h, v5.8h, v1.h[5]      \n"
                        "fmla   v30.8h, v5.8h, v1.h[6]      \n"
                        "fmla   v31.8h, v5.8h, v1.h[7]      \n"

                        "fmla   v24.8h, v6.8h, v2.h[0]      \n"
                        "fmla   v25.8h, v6.8h, v2.h[1]      \n"
                        "fmla   v26.8h, v6.8h, v2.h[2]      \n"
                        "fmla   v27.8h, v6.8h, v2.h[3]      \n"
                        "fmla   v28.8h, v6.8h, v2.h[4]      \n"
                        "fmla   v29.8h, v6.8h, v2.h[5]      \n"
                        "fmla   v30.8h, v6.8h, v2.h[6]      \n"
                        "fmla   v31.8h, v6.8h, v2.h[7]      \n"

                        "subs   w4, w4, #1                  \n"

                        "fmla   v24.8h, v7.8h, v3.h[0]      \n"
                        "fmla   v25.8h, v7.8h, v3.h[1]      \n"
                        "fmla   v26.8h, v7.8h, v3.h[2]      \n"
                        "fmla   v27.8h, v7.8h, v3.h[3]      \n"
                        "fmla   v28.8h, v7.8h, v3.h[4]      \n"
                        "fmla   v29.8h, v7.8h, v3.h[5]      \n"
                        "fmla   v30.8h, v7.8h, v3.h[6]      \n"
                        "fmla   v31.8h, v7.8h, v3.h[7]      \n"

                        "bne    2b                          \n"

                        "3:                                 \n"
                        "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                        "cmp    w4, #0                      \n"
                        "beq    5f                          \n"

                        // leftover kk steps, one per iteration
                        "4:                                 \n"
                        "ld1    {v0.8h}, [%2], #16          \n"
                        "ld1    {v4.8h}, [%1], #16          \n"
                        "fmla   v24.8h, v4.8h, v0.h[0]      \n"
                        "fmla   v25.8h, v4.8h, v0.h[1]      \n"
                        "fmla   v26.8h, v4.8h, v0.h[2]      \n"
                        "fmla   v27.8h, v4.8h, v0.h[3]      \n"
                        "subs   w4, w4, #1                  \n"
                        "fmla   v28.8h, v4.8h, v0.h[4]      \n"
                        "fmla   v29.8h, v4.8h, v0.h[5]      \n"
                        "fmla   v30.8h, v4.8h, v0.h[6]      \n"
                        "fmla   v31.8h, v4.8h, v0.h[7]      \n"
                        "bne    4b                          \n"

                        // write the 8 accumulators back; %0 (outptr) ends up 128 bytes further on
                        "5:                                 \n"
                        "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                        "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                        : "=r"(outptr), // %0
                        "=r"(pA),     // %1
                        "=r"(pB)      // %2
                        : "0"(outptr),
                        "1"(pA),
                        "2"(pB),
                        "r"(max_kk), // %6
                        "r"(k)       // %7
                        : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
#else  // NCNN_GNU_INLINE_ASM
                // Intrinsics fallback: the same 8 accumulators, one kk step per iteration.
                float16x8_t _sum0;
                float16x8_t _sum1;
                float16x8_t _sum2;
                float16x8_t _sum3;
                float16x8_t _sum4;
                float16x8_t _sum5;
                float16x8_t _sum6;
                float16x8_t _sum7;

                if (k == 0)
                {
                    // first k chunk: nothing accumulated yet
                    _sum0 = vdupq_n_f16(0.f);
                    _sum1 = vdupq_n_f16(0.f);
                    _sum2 = vdupq_n_f16(0.f);
                    _sum3 = vdupq_n_f16(0.f);
                    _sum4 = vdupq_n_f16(0.f);
                    _sum5 = vdupq_n_f16(0.f);
                    _sum6 = vdupq_n_f16(0.f);
                    _sum7 = vdupq_n_f16(0.f);
                }
                else
                {
                    // continue from the partial sums already stored at outptr
                    _sum0 = vld1q_f16(outptr);
                    _sum1 = vld1q_f16(outptr + 8);
                    _sum2 = vld1q_f16(outptr + 16);
                    _sum3 = vld1q_f16(outptr + 24);
                    _sum4 = vld1q_f16(outptr + 32);
                    _sum5 = vld1q_f16(outptr + 40);
                    _sum6 = vld1q_f16(outptr + 48);
                    _sum7 = vld1q_f16(outptr + 56);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    // _sumN += _pA * _pB[N] for each of the 8 lanes of _pB
                    float16x8_t _pA = vld1q_f16(pA);
                    float16x8_t _pB = vld1q_f16(pB);
                    _sum0 = vfmaq_laneq_f16(_sum0, _pA, _pB, 0);
                    _sum1 = vfmaq_laneq_f16(_sum1, _pA, _pB, 1);
                    _sum2 = vfmaq_laneq_f16(_sum2, _pA, _pB, 2);
                    _sum3 = vfmaq_laneq_f16(_sum3, _pA, _pB, 3);
                    _sum4 = vfmaq_laneq_f16(_sum4, _pA, _pB, 4);
                    _sum5 = vfmaq_laneq_f16(_sum5, _pA, _pB, 5);
                    _sum6 = vfmaq_laneq_f16(_sum6, _pA, _pB, 6);
                    _sum7 = vfmaq_laneq_f16(_sum7, _pA, _pB, 7);

                    pA += 8;
                    pB += 8;
                }

                vst1q_f16(outptr, _sum0);
                vst1q_f16(outptr + 8, _sum1);
                vst1q_f16(outptr + 8 * 2, _sum2);
                vst1q_f16(outptr + 8 * 3, _sum3);
                vst1q_f16(outptr + 8 * 4, _sum4);
                vst1q_f16(outptr + 8 * 5, _sum5);
                vst1q_f16(outptr + 8 * 6, _sum6);
                vst1q_f16(outptr + 8 * 7, _sum7);
                outptr += 8 * 8;
#endif // NCNN_GNU_INLINE_ASM
            }
            // Four jj positions per pass. Every kk step reads 8 halves from pA and
            // 4 halves from pB and adds pA * pB[lane] into accumulator number `lane`.
            // The 4 accumulators (32 halves) start at zero when k == 0, are reloaded
            // from outptr otherwise, and are written back to outptr at the end.
            for (; jj + 3 < max_jj; jj += 4)
            {
                const __fp16* pA = pAT;

#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    // %w7 is k: when it is zero jump to label 0 and clear v28-v31,
                    // otherwise load the 4 partial sums from %0 (no post-increment here)
                    "cbz    %w7, 0f                     \n"

                    "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n"
                    "b      1f                          \n"

                    "0:                                 \n"
                    "eor    v28.16b, v28.16b, v28.16b   \n"
                    "eor    v29.16b, v29.16b, v29.16b   \n"
                    "eor    v30.16b, v30.16b, v30.16b   \n"
                    "eor    v31.16b, v31.16b, v31.16b   \n"

                    "1:                                 \n"
                    "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    3f                          \n"

                    // 4 kk steps per iteration: v4-v7 hold four consecutive A vectors, while
                    // v0/v1 pack the B values of two kk steps each (lanes 0-3 and lanes 4-7)
                    "2:                                 \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%1], #64 \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v0.8h, v1.8h}, [%2], #32   \n"

                    "fmla   v28.8h, v4.8h, v0.h[0]      \n"
                    "fmla   v29.8h, v4.8h, v0.h[1]      \n"
                    "fmla   v30.8h, v4.8h, v0.h[2]      \n"
                    "fmla   v31.8h, v4.8h, v0.h[3]      \n"

                    "fmla   v28.8h, v5.8h, v0.h[4]      \n"
                    "fmla   v29.8h, v5.8h, v0.h[5]      \n"
                    "fmla   v30.8h, v5.8h, v0.h[6]      \n"
                    "fmla   v31.8h, v5.8h, v0.h[7]      \n"

                    "fmla   v28.8h, v6.8h, v1.h[0]      \n"
                    "fmla   v29.8h, v6.8h, v1.h[1]      \n"
                    "fmla   v30.8h, v6.8h, v1.h[2]      \n"
                    "fmla   v31.8h, v6.8h, v1.h[3]      \n"

                    "subs   w4, w4, #1                  \n"

                    "fmla   v28.8h, v7.8h, v1.h[4]      \n"
                    "fmla   v29.8h, v7.8h, v1.h[5]      \n"
                    "fmla   v30.8h, v7.8h, v1.h[6]      \n"
                    "fmla   v31.8h, v7.8h, v1.h[7]      \n"

                    "bne    2b                          \n"

                    "3:                                 \n"
                    "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // leftover kk steps, one per iteration (4 halves of B, 8 halves of A)
                    "4:                                 \n"
                    "ld1    {v0.4h}, [%2], #8           \n"
                    "ld1    {v4.8h}, [%1], #16          \n"
                    "fmla   v28.8h, v4.8h, v0.h[0]      \n"
                    "fmla   v29.8h, v4.8h, v0.h[1]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v30.8h, v4.8h, v0.h[2]      \n"
                    "fmla   v31.8h, v4.8h, v0.h[3]      \n"
                    "bne    4b                          \n"

                    // write the 4 accumulators back; %0 (outptr) ends up 64 bytes further on
                    "5:                                 \n"
                    "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "x4", "v0", "v1", "v4", "v5", "v6", "v7", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
                // Intrinsics fallback: the same 4 accumulators, one kk step per iteration.
                float16x8_t _sum0;
                float16x8_t _sum1;
                float16x8_t _sum2;
                float16x8_t _sum3;

                if (k == 0)
                {
                    // first k chunk: nothing accumulated yet
                    _sum0 = vdupq_n_f16(0.f);
                    _sum1 = vdupq_n_f16(0.f);
                    _sum2 = vdupq_n_f16(0.f);
                    _sum3 = vdupq_n_f16(0.f);
                }
                else
                {
                    // continue from the partial sums already stored at outptr
                    _sum0 = vld1q_f16(outptr);
                    _sum1 = vld1q_f16(outptr + 8);
                    _sum2 = vld1q_f16(outptr + 16);
                    _sum3 = vld1q_f16(outptr + 24);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    // _sumN += _pA * _pB[N] for each of the 4 lanes of _pB
                    float16x8_t _pA = vld1q_f16(pA);
                    float16x4_t _pB = vld1_f16(pB);
                    _sum0 = vfmaq_lane_f16(_sum0, _pA, _pB, 0);
                    _sum1 = vfmaq_lane_f16(_sum1, _pA, _pB, 1);
                    _sum2 = vfmaq_lane_f16(_sum2, _pA, _pB, 2);
                    _sum3 = vfmaq_lane_f16(_sum3, _pA, _pB, 3);

                    pA += 8;
                    pB += 4;
                }

                vst1q_f16(outptr, _sum0);
                vst1q_f16(outptr + 8, _sum1);
                vst1q_f16(outptr + 8 * 2, _sum2);
                vst1q_f16(outptr + 8 * 3, _sum3);
                outptr += 8 * 4;
#endif // NCNN_GNU_INLINE_ASM
            }
            // Two jj positions per pass: each kk step scales the 8 halves at pA by
            // pB[0] and by pB[1] and adds the products to two 8-lane accumulators.
            for (; jj + 1 < max_jj; jj += 2)
            {
                const __fp16* pA = pAT;

                // start from zero; for k != 0 continue from the sums stored at outptr
                float16x8_t _acc0 = vdupq_n_f16(0.f);
                float16x8_t _acc1 = vdupq_n_f16(0.f);
                if (k != 0)
                {
                    _acc0 = vld1q_f16(outptr);
                    _acc1 = vld1q_f16(outptr + 8);
                }

                for (int kk = 0; kk < max_kk; kk++)
                {
                    const float16x8_t _a = vld1q_f16(pA);

                    _acc0 = vfmaq_n_f16(_acc0, _a, pB[0]);
                    _acc1 = vfmaq_n_f16(_acc1, _a, pB[1]);

                    pB += 2;
                    pA += 8;
                }

                // 16 halves written, outptr moves on by the same amount
                vst1q_f16(outptr, _acc0);
                vst1q_f16(outptr + 8, _acc1);
                outptr += 16;
            }
            // One jj position per pass: each kk step scales the 8 halves at pA by
            // pB[0] and adds the product to a single 8-lane accumulator.
            for (; jj < max_jj; jj++)
            {
                const __fp16* pA = pAT;

                // start from zero; for k != 0 continue from the sum stored at outptr
                float16x8_t _acc = vdupq_n_f16(0.f);
                if (k != 0)
                {
                    _acc = vld1q_f16(outptr);
                }

                for (int kk = 0; kk < max_kk; kk++)
                {
                    const float16x8_t _a = vld1q_f16(pA);

                    _acc = vfmaq_n_f16(_acc, _a, pB[0]);

                    pB += 1;
                    pA += 8;
                }

                vst1q_f16(outptr, _acc);
                outptr += 8;
            }
        }
    }
    // Strip of 4 ii positions: every accumulator is a 4-lane half vector fed by
    // 4 halves of A per kk step. jj is consumed in groups of 12, 8, 4, 2 and 1.
    for (; ii + 3 < max_ii; ii += 4)
    {
        for (int b = 0; b < batch; b++)
        {
            const __fp16* pAT = AT_tile.row<const __fp16>(b) + max_kk * ii;
            const __fp16* pB = BT_tile.row<const __fp16>(b);

            // k == 0 starts every accumulator at zero, otherwise the sums stored at
            // outptr are picked up again
            const bool has_partial = k != 0;
            const float16x4_t _zero = vdup_n_f16(0.f);

            int jj = 0;

            // 12 jj positions per pass
            for (; jj + 11 < max_jj; jj += 12)
            {
                const __fp16* pA = pAT;

                float16x4_t _acc0 = _zero;
                float16x4_t _acc1 = _zero;
                float16x4_t _acc2 = _zero;
                float16x4_t _acc3 = _zero;
                float16x4_t _acc4 = _zero;
                float16x4_t _acc5 = _zero;
                float16x4_t _acc6 = _zero;
                float16x4_t _acc7 = _zero;
                float16x4_t _acc8 = _zero;
                float16x4_t _acc9 = _zero;
                float16x4_t _acca = _zero;
                float16x4_t _accb = _zero;

                if (has_partial)
                {
                    _acc0 = vld1_f16(outptr);
                    _acc1 = vld1_f16(outptr + 4);
                    _acc2 = vld1_f16(outptr + 8);
                    _acc3 = vld1_f16(outptr + 12);
                    _acc4 = vld1_f16(outptr + 16);
                    _acc5 = vld1_f16(outptr + 20);
                    _acc6 = vld1_f16(outptr + 24);
                    _acc7 = vld1_f16(outptr + 28);
                    _acc8 = vld1_f16(outptr + 32);
                    _acc9 = vld1_f16(outptr + 36);
                    _acca = vld1_f16(outptr + 40);
                    _accb = vld1_f16(outptr + 44);
                }

                for (int kk = 0; kk < max_kk; kk++)
                {
                    const float16x4_t _a = vld1_f16(pA);
                    // the 12 B values of this kk step: lanes 0-7, then lanes 8-11
                    const float16x8_t _b_lo = vld1q_f16(pB);
                    const float16x4_t _b_hi = vld1_f16(pB + 8);

                    _acc0 = vfma_laneq_f16(_acc0, _a, _b_lo, 0);
                    _acc1 = vfma_laneq_f16(_acc1, _a, _b_lo, 1);
                    _acc2 = vfma_laneq_f16(_acc2, _a, _b_lo, 2);
                    _acc3 = vfma_laneq_f16(_acc3, _a, _b_lo, 3);
                    _acc4 = vfma_laneq_f16(_acc4, _a, _b_lo, 4);
                    _acc5 = vfma_laneq_f16(_acc5, _a, _b_lo, 5);
                    _acc6 = vfma_laneq_f16(_acc6, _a, _b_lo, 6);
                    _acc7 = vfma_laneq_f16(_acc7, _a, _b_lo, 7);
                    _acc8 = vfma_lane_f16(_acc8, _a, _b_hi, 0);
                    _acc9 = vfma_lane_f16(_acc9, _a, _b_hi, 1);
                    _acca = vfma_lane_f16(_acca, _a, _b_hi, 2);
                    _accb = vfma_lane_f16(_accb, _a, _b_hi, 3);

                    pB += 12;
                    pA += 4;
                }

                vst1_f16(outptr, _acc0);
                vst1_f16(outptr + 4, _acc1);
                vst1_f16(outptr + 8, _acc2);
                vst1_f16(outptr + 12, _acc3);
                vst1_f16(outptr + 16, _acc4);
                vst1_f16(outptr + 20, _acc5);
                vst1_f16(outptr + 24, _acc6);
                vst1_f16(outptr + 28, _acc7);
                vst1_f16(outptr + 32, _acc8);
                vst1_f16(outptr + 36, _acc9);
                vst1_f16(outptr + 40, _acca);
                vst1_f16(outptr + 44, _accb);
                outptr += 48;
            }

            // 8 jj positions per pass
            for (; jj + 7 < max_jj; jj += 8)
            {
                const __fp16* pA = pAT;

                float16x4_t _acc0 = _zero;
                float16x4_t _acc1 = _zero;
                float16x4_t _acc2 = _zero;
                float16x4_t _acc3 = _zero;
                float16x4_t _acc4 = _zero;
                float16x4_t _acc5 = _zero;
                float16x4_t _acc6 = _zero;
                float16x4_t _acc7 = _zero;

                if (has_partial)
                {
                    _acc0 = vld1_f16(outptr);
                    _acc1 = vld1_f16(outptr + 4);
                    _acc2 = vld1_f16(outptr + 8);
                    _acc3 = vld1_f16(outptr + 12);
                    _acc4 = vld1_f16(outptr + 16);
                    _acc5 = vld1_f16(outptr + 20);
                    _acc6 = vld1_f16(outptr + 24);
                    _acc7 = vld1_f16(outptr + 28);
                }

                for (int kk = 0; kk < max_kk; kk++)
                {
                    const float16x4_t _a = vld1_f16(pA);
                    const float16x8_t _b = vld1q_f16(pB);

                    _acc0 = vfma_laneq_f16(_acc0, _a, _b, 0);
                    _acc1 = vfma_laneq_f16(_acc1, _a, _b, 1);
                    _acc2 = vfma_laneq_f16(_acc2, _a, _b, 2);
                    _acc3 = vfma_laneq_f16(_acc3, _a, _b, 3);
                    _acc4 = vfma_laneq_f16(_acc4, _a, _b, 4);
                    _acc5 = vfma_laneq_f16(_acc5, _a, _b, 5);
                    _acc6 = vfma_laneq_f16(_acc6, _a, _b, 6);
                    _acc7 = vfma_laneq_f16(_acc7, _a, _b, 7);

                    pB += 8;
                    pA += 4;
                }

                vst1_f16(outptr, _acc0);
                vst1_f16(outptr + 4, _acc1);
                vst1_f16(outptr + 8, _acc2);
                vst1_f16(outptr + 12, _acc3);
                vst1_f16(outptr + 16, _acc4);
                vst1_f16(outptr + 20, _acc5);
                vst1_f16(outptr + 24, _acc6);
                vst1_f16(outptr + 28, _acc7);
                outptr += 32;
            }

            // 4 jj positions per pass
            for (; jj + 3 < max_jj; jj += 4)
            {
                const __fp16* pA = pAT;

                float16x4_t _acc0 = _zero;
                float16x4_t _acc1 = _zero;
                float16x4_t _acc2 = _zero;
                float16x4_t _acc3 = _zero;

                if (has_partial)
                {
                    _acc0 = vld1_f16(outptr);
                    _acc1 = vld1_f16(outptr + 4);
                    _acc2 = vld1_f16(outptr + 8);
                    _acc3 = vld1_f16(outptr + 12);
                }

                for (int kk = 0; kk < max_kk; kk++)
                {
                    const float16x4_t _a = vld1_f16(pA);
                    const float16x4_t _b = vld1_f16(pB);

                    _acc0 = vfma_lane_f16(_acc0, _a, _b, 0);
                    _acc1 = vfma_lane_f16(_acc1, _a, _b, 1);
                    _acc2 = vfma_lane_f16(_acc2, _a, _b, 2);
                    _acc3 = vfma_lane_f16(_acc3, _a, _b, 3);

                    pB += 4;
                    pA += 4;
                }

                vst1_f16(outptr, _acc0);
                vst1_f16(outptr + 4, _acc1);
                vst1_f16(outptr + 8, _acc2);
                vst1_f16(outptr + 12, _acc3);
                outptr += 16;
            }

            // 2 jj positions per pass, B values used as scalars
            for (; jj + 1 < max_jj; jj += 2)
            {
                const __fp16* pA = pAT;

                float16x4_t _acc0 = _zero;
                float16x4_t _acc1 = _zero;

                if (has_partial)
                {
                    _acc0 = vld1_f16(outptr);
                    _acc1 = vld1_f16(outptr + 4);
                }

                for (int kk = 0; kk < max_kk; kk++)
                {
                    const float16x4_t _a = vld1_f16(pA);

                    _acc0 = vfma_n_f16(_acc0, _a, pB[0]);
                    _acc1 = vfma_n_f16(_acc1, _a, pB[1]);

                    pB += 2;
                    pA += 4;
                }

                vst1_f16(outptr, _acc0);
                vst1_f16(outptr + 4, _acc1);
                outptr += 8;
            }

            // last single jj position
            for (; jj < max_jj; jj++)
            {
                const __fp16* pA = pAT;

                float16x4_t _acc = _zero;

                if (has_partial)
                {
                    _acc = vld1_f16(outptr);
                }

                for (int kk = 0; kk < max_kk; kk++)
                {
                    const float16x4_t _a = vld1_f16(pA);

                    _acc = vfma_n_f16(_acc, _a, pB[0]);

                    pB += 1;
                    pA += 4;
                }

                vst1_f16(outptr, _acc);
                outptr += 4;
            }
        }
    }
    for (; ii + 1 < max_ii; ii += 2)
    {
        for (int b = 0; b < batch; b++)
        {
            const __fp16* pAT = AT_tile.row<const __fp16>(b) + max_kk * ii;
            const __fp16* pB = BT_tile.row<const __fp16>(b);

            int jj = 0;
            for (; jj + 11 < max_jj; jj += 12)
            {
                const __fp16* pA = pAT;

                float16x8_t _sum01;
                float16x4_t _sum2;
                float16x8_t _sum34;
                float16x4_t _sum5;

                if (k == 0)
                {
                    _sum01 = vdupq_n_f16(0.f);
                    _sum2 = vdup_n_f16(0.f);
                    _sum34 = vdupq_n_f16(0.f);
                    _sum5 = vdup_n_f16(0.f);
                }
                else
                {
                    float16x8x2_t _tmp0123 = vld2q_f16(outptr);
                    float16x4x2_t _tmp45 = vld2_f16(outptr + 16);
                    _sum01 = _tmp0123.val[0];
                    _sum2 = _tmp45.val[0];
                    _sum34 = _tmp0123.val[1];
                    _sum5 = _tmp45.val[1];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float16x8_t _pB0 = vld1q_f16(pB);
                    float16x4_t _pB2 = vld1_f16(pB + 8);
                    _sum01 = vfmaq_n_f16(_sum01, _pB0, pA[0]);
                    _sum2 = vfma_n_f16(_sum2, _pB2, pA[0]);
                    _sum34 = vfmaq_n_f16(_sum34, _pB0, pA[1]);
                    _sum5 = vfma_n_f16(_sum5, _pB2, pA[1]);
                    pA += 2;
                    pB += 12;
                }

                float16x8x2_t _tmp0123;
                _tmp0123.val[0] = _sum01;
                _tmp0123.val[1] = _sum34;
                float16x4x2_t _tmp45;
                _tmp45.val[0] = _sum2;
                _tmp45.val[1] = _sum5;
                vst2q_f16(outptr, _tmp0123);
                vst2_f16(outptr + 16, _tmp45);
                outptr += 2 * 12;
            }
            for (; jj + 7 < max_jj; jj += 8)
            {
                const __fp16* pA = pAT;

                float16x8_t _sum01;
                float16x8_t _sum23;

                if (k == 0)
                {
                    _sum01 = vdupq_n_f16(0.f);
                    _sum23 = vdupq_n_f16(0.f);
                }
                else
                {
                    float16x8x2_t _tmp0123 = vld2q_f16(outptr);
                    _sum01 = _tmp0123.val[0];
                    _sum23 = _tmp0123.val[1];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float16x8_t _pB = vld1q_f16(pB);
                    _sum01 = vfmaq_n_f16(_sum01, _pB, pA[0]);
                    _sum23 = vfmaq_n_f16(_sum23, _pB, pA[1]);
                    pA += 2;
                    pB += 8;
                }

                float16x8x2_t _tmp0123;
                _tmp0123.val[0] = _sum01;
                _tmp0123.val[1] = _sum23;
                vst2q_f16(outptr, _tmp0123);
                outptr += 2 * 8;
            }
            for (; jj + 3 < max_jj; jj += 4)
            {
                const __fp16* pA = pAT;

                float16x4_t _sum0;
                float16x4_t _sum1;

                if (k == 0)
                {
                    _sum0 = vdup_n_f16(0.f);
                    _sum1 = vdup_n_f16(0.f);
                }
                else
                {
                    float16x4x2_t _tmp01 = vld2_f16(outptr);
                    _sum0 = _tmp01.val[0];
                    _sum1 = _tmp01.val[1];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float16x4_t _pB = vld1_f16(pB);
                    _sum0 = vfma_n_f16(_sum0, _pB, pA[0]);
                    _sum1 = vfma_n_f16(_sum1, _pB, pA[1]);
                    pA += 2;
                    pB += 4;
                }

                float16x4x2_t _tmp01;
                _tmp01.val[0] = _sum0;
                _tmp01.val[1] = _sum1;
                vst2_f16(outptr, _tmp01);
                outptr += 2 * 4;
            }
            for (; jj + 1 < max_jj; jj += 2)
            {
                const __fp16* pA = pAT;

                __fp16 sum00 = 0.f;
                __fp16 sum01 = 0.f;
                __fp16 sum10 = 0.f;
                __fp16 sum11 = 0.f;

                if (k == 0)
                {
                    sum00 = 0.f;
                    sum01 = 0.f;
                    sum10 = 0.f;
                    sum11 = 0.f;
                }
                else
                {
                    sum00 = outptr[0];
                    sum01 = outptr[1];
                    sum10 = outptr[2];
                    sum11 = outptr[3];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    sum00 += pA[0] * pB[0];
                    sum01 += pA[1] * pB[0];
                    sum10 += pA[0] * pB[1];
                    sum11 += pA[1] * pB[1];
                    pA += 2;
                    pB += 2;
                }

                outptr[0] = sum00;
                outptr[1] = sum01;
                outptr[2] = sum10;
                outptr[3] = sum11;
                outptr += 2 * 2;
            }
            for (; jj < max_jj; jj++)
            {
                const __fp16* pA = pAT;

                __fp16 sum0 = 0.f;
                __fp16 sum1 = 0.f;

                if (k == 0)
                {
                    sum0 = 0.f;
                    sum1 = 0.f;
                }
                else
                {
                    sum0 = outptr[0];
                    sum1 = outptr[1];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    sum0 += pA[0] * pB[0];
                    sum1 += pA[1] * pB[0];
                    pA += 2;
                    pB += 1;
                }

                outptr[0] = sum0;
                outptr[1] = sum1;
                outptr += 2;
            }
        }
    }
    for (; ii < max_ii; ii++)
    {
        for (int b = 0; b < batch; b++)
        {
            const __fp16* pAT = AT_tile.row<const __fp16>(b) + max_kk * ii;
            const __fp16* pB = BT_tile.row<const __fp16>(b);

            int jj = 0;
            for (; jj + 11 < max_jj; jj += 12)
            {
                const __fp16* pA = pAT;

                float16x8_t _sum01;
                float16x4_t _sum2;

                if (k == 0)
                {
                    _sum01 = vdupq_n_f16(0.f);
                    _sum2 = vdup_n_f16(0.f);
                }
                else
                {
                    _sum01 = vld1q_f16(outptr);
                    _sum2 = vld1_f16(outptr + 8);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float16x8_t _pB0 = vld1q_f16(pB);
                    float16x4_t _pB2 = vld1_f16(pB + 8);
                    _sum01 = vfmaq_n_f16(_sum01, _pB0, pA[0]);
                    _sum2 = vfma_n_f16(_sum2, _pB2, pA[0]);
                    pA += 1;
                    pB += 12;
                }

                vst1q_f16(outptr, _sum01);
                vst1_f16(outptr + 8, _sum2);
                outptr += 12;
            }
            for (; jj + 7 < max_jj; jj += 8)
            {
                const __fp16* pA = pAT;

                float16x8_t _sum01;

                if (k == 0)
                {
                    _sum01 = vdupq_n_f16(0.f);
                }
                else
                {
                    _sum01 = vld1q_f16(outptr);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float16x8_t _pB = vld1q_f16(pB);
                    _sum01 = vfmaq_n_f16(_sum01, _pB, pA[0]);
                    pA += 1;
                    pB += 8;
                }

                vst1q_f16(outptr, _sum01);
                outptr += 8;
            }
            for (; jj + 3 < max_jj; jj += 4)
            {
                const __fp16* pA = pAT;

                float16x4_t _sum;

                if (k == 0)
                {
                    _sum = vdup_n_f16(0.f);
                }
                else
                {
                    _sum = vld1_f16(outptr);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    float16x4_t _pB = vld1_f16(pB);
                    _sum = vfma_n_f16(_sum, _pB, pA[0]);
                    pA += 1;
                    pB += 4;
                }

                vst1_f16(outptr, _sum);
                outptr += 4;
            }
            for (; jj + 1 < max_jj; jj += 2)
            {
                const __fp16* pA = pAT;

                __fp16 sum0 = 0.f;
                __fp16 sum1 = 0.f;

                if (k == 0)
                {
                    sum0 = 0.f;
                    sum1 = 0.f;
                }
                else
                {
                    sum0 = outptr[0];
                    sum1 = outptr[1];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    sum0 += pA[0] * pB[0];
                    sum1 += pA[0] * pB[1];
                    pA += 1;
                    pB += 2;
                }

                outptr[0] = sum0;
                outptr[1] = sum1;
                outptr += 2;
            }
            for (; jj < max_jj; jj++)
            {
                const __fp16* pA = pAT;

                __fp16 sum = 0.f;

                if (k == 0)
                {
                    sum = 0.f;
                }
                else
                {
                    sum = outptr[0];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    sum += pA[0] * pB[0];
                    pA += 1;
                    pB += 1;
                }

                outptr[0] = sum;
                outptr += 1;
            }
        }
    }
}

static void conv3x3s1_winograd_get_optimal_tile_mnk_fp16(int M, int N, int K, int B, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
    // Derive the gemm tile sizes from the level-2 cache capacity, counted in
    // 2-byte elements.
    //   M, N, K  gemm extents; TILE_N is only written when N > 0
    //   B        batch count, accepted but not used (see below)
    //   nT       thread count, 0 selects get_physical_big_cpu_count()
    const int cache_elems = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short));

    const int threads = (nT == 0) ? get_physical_big_cpu_count() : nT;

    // B should be part of the budget for batched gemm, but doing so turned out
    // slower on arm in practice, so it is ignored
    (void)B;

    // ---- K: try to keep K in a single tile ----
    const int k_budget = (cache_elems - 32) / 12;
    const int k_limit = std::max(8, k_budget / 8 * 8);
    const int k_tiles = (K + k_limit - 1) / k_limit;
    // spread K evenly over k_tiles, rounded up to a multiple of 8
    const int k_even = ((K + k_tiles - 1) / k_tiles + 7) / 8 * 8;
    const int tile_k = std::min(k_limit, k_even);

    // ---- M: start from 8 rows per usable core, then balance ----
    const int m_limit = 8 * std::min(threads, get_physical_cpu_count());
    const int m_tiles = (M + m_limit - 1) / m_limit;
    const int m_even = ((M + m_tiles - 1) / m_tiles + 7) / 8 * 8;
    int tile_m = std::min(m_limit, m_even);
    if (threads > 1)
    {
        // shrink so every thread can get its own M tile
        const int m_share = (std::max(1, tile_m / threads) + 7) / 8 * 8;
        tile_m = std::min(tile_m, m_share);
    }

    TILE_K = tile_k;
    TILE_M = tile_m;

    // ---- N: only resolved when the caller supplies a positive N ----
    if (N <= 0)
        return;

    const int n_free = cache_elems - tile_m * tile_k;
    const int n_divisor = (tile_k >= K) ? tile_k : (tile_m + tile_k);
    const int n_budget = n_free / n_divisor;

    const int n_limit = std::max(4, n_budget / 4 * 4);
    const int n_tiles = (N + n_limit - 1) / n_limit;
    // spread N evenly over n_tiles, rounded up to a multiple of 4
    const int n_even = ((N + n_tiles - 1) / n_tiles + 3) / 4 * 4;
    TILE_N = std::min(n_limit, n_even);
}

static inline void conv3x3s1_winograd23_transform_kernel_tile_fp16sa(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
{
    // const float ktm[4][3] = {
    //     {1.0f, 0.0f, 0.0f},
    //     {1.0f / 2, 1.0f / 2, 1.0f / 2},
    //     {1.0f / 2, -1.0f / 2, 1.0f / 2},
    //     {0.0f, 0.0f, 1.0f}
    // };

    __fp16* ptmp = A;

    int ii = 0;
    for (; ii < max_ii; ii++)
    {
        int kk = 0;
        for (; kk < max_kk; kk++)
        {
            float tmp[4][3];

            const float* k0 = (const float*)kernel + (i + ii) * inch * 9 + (k + kk) * 9;

            for (int m = 0; m < 3; m++)
            {
                float r0 = k0[0];
                float r1 = k0[1];
                float r2 = k0[2];

                tmp[0][m] = r0;
                tmp[1][m] = r0 * 0.5f + r1 * 0.5f + r2 * 0.5f;
                tmp[2][m] = r0 * 0.5f - r1 * 0.5f + r2 * 0.5f;
                tmp[3][m] = r2;

                k0 += 3;
            }

            for (int m = 0; m < 4; m++)
            {
                float r0 = tmp[m][0];
                float r1 = tmp[m][1];
                float r2 = tmp[m][2];

                float z0 = r0;
                float z1 = r0 * 0.5f + r1 * 0.5f + r2 * 0.5f;
                float z2 = r0 * 0.5f - r1 * 0.5f + r2 * 0.5f;
                float z3 = r2;

                ptmp[0] = (__fp16)z0;
                ptmp[1] = (__fp16)z1;
                ptmp[2] = (__fp16)z2;
                ptmp[3] = (__fp16)z3;
                ptmp += 4;
            }
        }
    }
}

static void conv3x3s1_winograd23_transform_kernel_fp16sa(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt)
{
    // Transform all 3x3 kernels and store them tile by tile in AT.
    // gemm view: M = output channels, K = input channels, B = 16 values per kernel.
    const int M = outch;
    const int K = inch;
    const int B = 16;

    // N is passed as 0, so TILE_N is not resolved and not used here
    int TILE_M, TILE_N, TILE_K;
    conv3x3s1_winograd_get_optimal_tile_mnk_fp16(M, 0, K, B, TILE_M, TILE_N, TILE_K, opt.num_threads);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // one 2-byte scratch tile per thread
    Mat A_tileX(B * TILE_M * TILE_K, 1, opt.num_threads, (size_t)2u);

    AT.create(TILE_K * TILE_M, B, nn_K, nn_M, (size_t)2u);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int ppj = 0; ppj < nn_M; ppj++)
    {
        const int i = ppj * TILE_M;
        const int max_ii = std::min((M - i), TILE_M);

        Mat A_tile = A_tileX.channel(get_omp_thread_num());

        for (int ppk = 0; ppk < nn_K; ppk++)
        {
            const int k = ppk * TILE_K;
            const int max_kk = std::min((K - k), TILE_K);

            // transform into the scratch tile, then pack into its slot of AT
            conv3x3s1_winograd23_transform_kernel_tile_fp16sa(kernel, A_tile, inch, i, max_ii, k, max_kk);

            Mat AT_tile = AT.channel(ppj).depth(ppk);

            conv3x3s1_winograd_pack_A_tile_fp16(A_tile, AT_tile, B, max_ii, max_kk);
        }
    }
}

// Input transform for the winograd23 3x3s1 fp16 convolution.
//
// For every tile index in [j, j + max_jj) and every input channel in
// [k, k + max_kk) a 4x4 patch of bottom_blob is read (patches start every 2 rows
// and every 2 columns, samples outside w/h are taken as zero), the combination
// described by `itm` below is applied along both axes, and the 16 results are
// written to B as fp16.
//
//   bottom_blob  source blob, read as __fp16 with elempack 8, 4 or 1
//   B            destination buffer, written as __fp16
//   j, max_jj    first tile index and number of tiles
//   k, max_kk    first input channel and number of channels
//   nT           thread count for the OpenMP loop over 8-channel groups
//
// Channels are handled in groups of 8, then 4, then 2, then 1. Within a group of
// `c` channels starting at kk, coefficient `n` (0..15) of tile jj is stored at
// B[kk * max_jj * 16 + n * max_jj * c + jj * c], c values contiguous.
static inline void conv3x3s1_winograd23_transform_input_tile_fp16sa(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
{
    // const float itm[4][4] = {
    //     {1.0f,  0.0f, -1.0f,  0.0f},
    //     {0.0f,  1.0f,  1.00f, 0.0f},
    //     {0.0f, -1.0f,  1.00f, 0.0f},
    //     {0.0f, -1.0f,  0.00f, 1.0f}
    // };

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int elempack = bottom_blob.elempack;
    // N is added to a row pointer below to reach the same position in the next
    // channel plane (presumably cstep is the per-channel stride in packed elements)
    const int N = bottom_blob.cstep * elempack;

    // divisor used to split the linear tile index (j + jj) into row ti and column tj
    const int w_tiles = (w - 1) / 2;

    // ---- groups of 8 channels (the only OpenMP-parallel loop in this function) ----
    int nn_max_kk = 0;
    int remain_max_kk_start = 0;
    nn_max_kk = (max_kk - remain_max_kk_start) / 8;
    #pragma omp parallel for num_threads(nT)
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 8;

        // per-iteration scratch: tmp[column result][patch row][channel]
#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        __fp16 tmp[4][4][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // top-left sample of the patch in the channel (group) holding k + kk
            const __fp16* r0 = bottom_blob.channel((k + kk) / elempack).row<const __fp16>(ti * 2) + (tj * 2) * elempack;

            // pass 1: one patch row per iteration, combine its 4 columns
            for (int m = 0; m < 4; m++)
            {
                // columns default to zero so out-of-range samples act as padding
                float16x8_t _r0 = vdupq_n_f16(0.f);
                float16x8_t _r1 = vdupq_n_f16(0.f);
                float16x8_t _r2 = vdupq_n_f16(0.f);
                float16x8_t _r3 = vdupq_n_f16(0.f);

                if (ti * 2 + m < h)
                {
                    if (elempack == 8)
                    {
                        // 8 channels are interleaved: one vector per column
                        _r0 = vld1q_f16(r0);
                        if (tj * 2 + 1 < w) _r1 = vld1q_f16(r0 + 8);
                        if (tj * 2 + 2 < w) _r2 = vld1q_f16(r0 + 16);
                        if (tj * 2 + 3 < w) _r3 = vld1q_f16(r0 + 24);
                    }
                    if (elempack == 4)
                    {
                        // two 4-packed channel planes form the low and high halves
                        const __fp16* r1 = r0 + N;

                        _r0 = vcombine_f16(vld1_f16(r0), vld1_f16(r1));
                        if (tj * 2 + 1 < w)
                        {
                            _r1 = vcombine_f16(vld1_f16(r0 + 4), vld1_f16(r1 + 4));
                        }
                        if (tj * 2 + 2 < w)
                        {
                            _r2 = vcombine_f16(vld1_f16(r0 + 8), vld1_f16(r1 + 8));
                        }
                        if (tj * 2 + 3 < w)
                        {
                            _r3 = vcombine_f16(vld1_f16(r0 + 12), vld1_f16(r1 + 12));
                        }
                    }
                    if (elempack == 1)
                    {
                        // eight separate channel planes, N elements apart
                        const __fp16* r1 = r0 + N;
                        const __fp16* r2 = r0 + N * 2;
                        const __fp16* r3 = r0 + N * 3;
                        const __fp16* r4 = r0 + N * 4;
                        const __fp16* r5 = r0 + N * 5;
                        const __fp16* r6 = r0 + N * 6;
                        const __fp16* r7 = r0 + N * 7;

                        // 4 consecutive columns are loaded from each plane
                        // unconditionally; the w checks are applied after the transpose
                        float16x4_t _t0 = vld1_f16(r0);
                        float16x4_t _t1 = vld1_f16(r1);
                        float16x4_t _t2 = vld1_f16(r2);
                        float16x4_t _t3 = vld1_f16(r3);
                        float16x4_t _t4 = vld1_f16(r4);
                        float16x4_t _t5 = vld1_f16(r5);
                        float16x4_t _t6 = vld1_f16(r6);
                        float16x4_t _t7 = vld1_f16(r7);

                        // NOTE(review): transpose4x4_ph is defined elsewhere; presumably
                        // it leaves column c of 4 channels in the c-th vector
                        transpose4x4_ph(_t0, _t1, _t2, _t3);
                        transpose4x4_ph(_t4, _t5, _t6, _t7);

                        _r0 = vcombine_f16(_t0, _t4);
                        if (tj * 2 + 1 < w)
                        {
                            _r1 = vcombine_f16(_t1, _t5);
                        }
                        if (tj * 2 + 2 < w)
                        {
                            _r2 = vcombine_f16(_t2, _t6);
                        }
                        if (tj * 2 + 3 < w)
                        {
                            _r3 = vcombine_f16(_t3, _t7);
                        }
                    }
                }

                // the four rows of itm applied to (r0, r1, r2, r3)
                float16x8_t _tmp0 = vsubq_f16(_r0, _r2);
                float16x8_t _tmp1 = vaddq_f16(_r1, _r2);
                float16x8_t _tmp2 = vsubq_f16(_r2, _r1);
                float16x8_t _tmp3 = vsubq_f16(_r3, _r1);

                vst1q_f16(tmp[0][m], _tmp0);
                vst1q_f16(tmp[1][m], _tmp1);
                vst1q_f16(tmp[2][m], _tmp2);
                vst1q_f16(tmp[3][m], _tmp3);

                // next patch row
                r0 += w * elempack;
            }

            // destination of coefficients 0..3 for this tile; each coefficient
            // occupies a plane of max_jj * 8 values
            __fp16* p0 = (__fp16*)B + kk * max_jj * 16 + jj * 8;
            __fp16* p1 = p0 + max_jj * 8;
            __fp16* p2 = p0 + max_jj * 8 * 2;
            __fp16* p3 = p0 + max_jj * 8 * 3;

            // pass 2: same combination along the other axis of tmp
            for (int m = 0; m < 4; m++)
            {
                float16x8_t _r0 = vld1q_f16(tmp[m][0]);
                float16x8_t _r1 = vld1q_f16(tmp[m][1]);
                float16x8_t _r2 = vld1q_f16(tmp[m][2]);
                float16x8_t _r3 = vld1q_f16(tmp[m][3]);

                float16x8_t _tmp0 = vsubq_f16(_r0, _r2);
                float16x8_t _tmp1 = vaddq_f16(_r1, _r2);
                float16x8_t _tmp2 = vsubq_f16(_r2, _r1);
                float16x8_t _tmp3 = vsubq_f16(_r3, _r1);

                vst1q_f16(p0, _tmp0);
                vst1q_f16(p1, _tmp1);
                vst1q_f16(p2, _tmp2);
                vst1q_f16(p3, _tmp3);

                // skip the 4 coefficient planes just written
                p0 += max_jj * 4 * 8;
                p1 += max_jj * 4 * 8;
                p2 += max_jj * 4 * 8;
                p3 += max_jj * 4 * 8;
            }
        }
    }
    // ---- groups of 4 channels (fewer than 8 channels remain, so at most one group) ----
    remain_max_kk_start += nn_max_kk * 8;
    nn_max_kk = (max_kk - remain_max_kk_start) / 4;
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 4;

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        __fp16 tmp[4][4][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const __fp16* r0 = bottom_blob.channel((k + kk) / elempack).row<const __fp16>(ti * 2) + (tj * 2) * elempack;

            for (int m = 0; m < 4; m++)
            {
                float16x4_t _r0 = vdup_n_f16(0.f);
                float16x4_t _r1 = vdup_n_f16(0.f);
                float16x4_t _r2 = vdup_n_f16(0.f);
                float16x4_t _r3 = vdup_n_f16(0.f);

                if (ti * 2 + m < h)
                {
                    // only elempack 4 and 1 are handled in this path
                    if (elempack == 4)
                    {
                        _r0 = vld1_f16(r0);
                        if (tj * 2 + 1 < w) _r1 = vld1_f16(r0 + 4);
                        if (tj * 2 + 2 < w) _r2 = vld1_f16(r0 + 8);
                        if (tj * 2 + 3 < w) _r3 = vld1_f16(r0 + 12);
                    }
                    if (elempack == 1)
                    {
                        const __fp16* r1 = r0 + N;
                        const __fp16* r2 = r0 + N * 2;
                        const __fp16* r3 = r0 + N * 3;

                        float16x4_t _t0 = vld1_f16(r0);
                        float16x4_t _t1 = vld1_f16(r1);
                        float16x4_t _t2 = vld1_f16(r2);
                        float16x4_t _t3 = vld1_f16(r3);

                        transpose4x4_ph(_t0, _t1, _t2, _t3);

                        _r0 = _t0;
                        if (tj * 2 + 1 < w) _r1 = _t1;
                        if (tj * 2 + 2 < w) _r2 = _t2;
                        if (tj * 2 + 3 < w) _r3 = _t3;
                    }
                }

                float16x4_t _tmp0 = vsub_f16(_r0, _r2);
                float16x4_t _tmp1 = vadd_f16(_r1, _r2);
                float16x4_t _tmp2 = vsub_f16(_r2, _r1);
                float16x4_t _tmp3 = vsub_f16(_r3, _r1);

                vst1_f16(tmp[0][m], _tmp0);
                vst1_f16(tmp[1][m], _tmp1);
                vst1_f16(tmp[2][m], _tmp2);
                vst1_f16(tmp[3][m], _tmp3);

                r0 += w * elempack;
            }

            // coefficient planes hold max_jj * 4 values in this group
            __fp16* p0 = (__fp16*)B + kk * max_jj * 16 + jj * 4;
            __fp16* p1 = p0 + max_jj * 4;
            __fp16* p2 = p0 + max_jj * 4 * 2;
            __fp16* p3 = p0 + max_jj * 4 * 3;

            for (int m = 0; m < 4; m++)
            {
                float16x4_t _r0 = vld1_f16(tmp[m][0]);
                float16x4_t _r1 = vld1_f16(tmp[m][1]);
                float16x4_t _r2 = vld1_f16(tmp[m][2]);
                float16x4_t _r3 = vld1_f16(tmp[m][3]);

                float16x4_t _tmp0 = vsub_f16(_r0, _r2);
                float16x4_t _tmp1 = vadd_f16(_r1, _r2);
                float16x4_t _tmp2 = vsub_f16(_r2, _r1);
                float16x4_t _tmp3 = vsub_f16(_r3, _r1);

                vst1_f16(p0, _tmp0);
                vst1_f16(p1, _tmp1);
                vst1_f16(p2, _tmp2);
                vst1_f16(p3, _tmp3);

                p0 += max_jj * 4 * 4;
                p1 += max_jj * 4 * 4;
                p2 += max_jj * 4 * 4;
                p3 += max_jj * 4 * 4;
            }
        }
    }
    // ---- groups of 2 channels, scalar code; channel indexing assumes elempack == 1 ----
    remain_max_kk_start += nn_max_kk * 4;
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 2;

#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        __fp16 tmp[4][4][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const __fp16* r0 = bottom_blob.channel(k + kk).row<const __fp16>(ti * 2) + (tj * 2);

            for (int m = 0; m < 4; m++)
            {
                // rXY: column X of channel Y (Y = 0 -> k + kk, Y = 1 -> the plane N elements further)
                __fp16 r00 = 0.f;
                __fp16 r01 = 0.f;
                __fp16 r10 = 0.f;
                __fp16 r11 = 0.f;
                __fp16 r20 = 0.f;
                __fp16 r21 = 0.f;
                __fp16 r30 = 0.f;
                __fp16 r31 = 0.f;

                if (ti * 2 + m < h)
                {
                    // if (elempack == 1)
                    {
                        const __fp16* r1 = r0 + N;

                        r00 = r0[0];
                        r01 = r1[0];
                        if (tj * 2 + 1 < w)
                        {
                            r10 = r0[1];
                            r11 = r1[1];
                        }
                        if (tj * 2 + 2 < w)
                        {
                            r20 = r0[2];
                            r21 = r1[2];
                        }
                        if (tj * 2 + 3 < w)
                        {
                            r30 = r0[3];
                            r31 = r1[3];
                        }
                    }
                }

                tmp[0][m][0] = r00 - r20;
                tmp[0][m][1] = r01 - r21;
                tmp[1][m][0] = r10 + r20;
                tmp[1][m][1] = r11 + r21;
                tmp[2][m][0] = r20 - r10;
                tmp[2][m][1] = r21 - r11;
                tmp[3][m][0] = r30 - r10;
                tmp[3][m][1] = r31 - r11;

                r0 += w;
            }

            // coefficient planes hold max_jj * 2 values in this group
            __fp16* p0 = (__fp16*)B + kk * max_jj * 16 + jj * 2;
            __fp16* p1 = p0 + max_jj * 2;
            __fp16* p2 = p0 + max_jj * 2 * 2;
            __fp16* p3 = p0 + max_jj * 2 * 3;

            for (int m = 0; m < 4; m++)
            {
                __fp16 r00 = tmp[m][0][0];
                __fp16 r01 = tmp[m][0][1];
                __fp16 r10 = tmp[m][1][0];
                __fp16 r11 = tmp[m][1][1];
                __fp16 r20 = tmp[m][2][0];
                __fp16 r21 = tmp[m][2][1];
                __fp16 r30 = tmp[m][3][0];
                __fp16 r31 = tmp[m][3][1];

                p0[0] = r00 - r20;
                p0[1] = r01 - r21;
                p1[0] = r10 + r20;
                p1[1] = r11 + r21;
                p2[0] = r20 - r10;
                p2[1] = r21 - r11;
                p3[0] = r30 - r10;
                p3[1] = r31 - r11;

                p0 += max_jj * 4 * 2;
                p1 += max_jj * 4 * 2;
                p2 += max_jj * 4 * 2;
                p3 += max_jj * 4 * 2;
            }
        }
    }
    // ---- remaining single channels, scalar code; channel indexing assumes elempack == 1 ----
    remain_max_kk_start += nn_max_kk * 2;
    for (int kk = remain_max_kk_start; kk < max_kk; kk++)
    {
        __fp16 tmp[4][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const __fp16* r0123 = bottom_blob.channel(k + kk).row<const __fp16>(ti * 2) + (tj * 2);

            for (int m = 0; m < 4; m++)
            {
                __fp16 r0 = 0.f;
                __fp16 r1 = 0.f;
                __fp16 r2 = 0.f;
                __fp16 r3 = 0.f;

                if (ti * 2 + m < h)
                {
                    // if (elempack == 1)
                    {
                        r0 = r0123[0];
                        if (tj * 2 + 1 < w) r1 = r0123[1];
                        if (tj * 2 + 2 < w) r2 = r0123[2];
                        if (tj * 2 + 3 < w) r3 = r0123[3];
                    }
                }

                tmp[0][m] = r0 - r2;
                tmp[1][m] = r1 + r2;
                tmp[2][m] = r2 - r1;
                tmp[3][m] = r3 - r1;

                r0123 += w;
            }

            // coefficient planes hold max_jj values in this group
            __fp16* p0 = (__fp16*)B + kk * max_jj * 16 + jj;
            __fp16* p1 = p0 + max_jj;
            __fp16* p2 = p0 + max_jj * 2;
            __fp16* p3 = p0 + max_jj * 3;

            for (int m = 0; m < 4; m++)
            {
                __fp16 r0 = tmp[m][0];
                __fp16 r1 = tmp[m][1];
                __fp16 r2 = tmp[m][2];
                __fp16 r3 = tmp[m][3];

                p0[0] = r0 - r2;
                p1[0] = r1 + r2;
                p2[0] = r2 - r1;
                p3[0] = r3 - r1;

                p0 += max_jj * 4;
                p1 += max_jj * 4;
                p2 += max_jj * 4;
                p3 += max_jj * 4;
            }
        }
    }
}

static inline void conv3x3s1_winograd23_transform_output_tile_fp16sa(const Mat& top_tile, Mat& top_blob, const Mat& bias, int i, int max_ii, int j, int max_jj)
{
    // const float otm[2][4] = {
    //     {1.0f,  1.0f,  1.0f,  0.0f},
    //     {0.0f,  1.0f, -1.0f,  1.0f}
    // };

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int out_elempack = top_blob.elempack;
    const int N = top_blob.cstep * out_elempack;

    const int w_tiles = (outw + 1) / 2;

    const __fp16* biasptr = bias;

    int ii = 0;
    for (; ii + 7 < max_ii; ii += 8)
    {
        float16x8_t _bias0 = biasptr ? vld1q_f16(biasptr + i + ii) : vdupq_n_f16(0.f);

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        __fp16 tmp[2][4][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const __fp16* r0 = (const __fp16*)top_tile + ii * max_jj * 16 + jj * 8;
            const __fp16* r1 = r0 + max_jj * 8;
            const __fp16* r2 = r0 + max_jj * 8 * 2;
            const __fp16* r3 = r0 + max_jj * 8 * 3;

            for (int m = 0; m < 4; m++)
            {
                float16x8_t _r0 = vld1q_f16(r0);
                float16x8_t _r1 = vld1q_f16(r1);
                float16x8_t _r2 = vld1q_f16(r2);
                float16x8_t _r3 = vld1q_f16(r3);

                float16x8_t _tmp0 = vaddq_f16(vaddq_f16(_r0, _r1), _r2);
                float16x8_t _tmp1 = vaddq_f16(vsubq_f16(_r1, _r2), _r3);

                vst1q_f16(tmp[0][m], _tmp0);
                vst1q_f16(tmp[1][m], _tmp1);

                r0 += max_jj * 4 * 8;
                r1 += max_jj * 4 * 8;
                r2 += max_jj * 4 * 8;
                r3 += max_jj * 4 * 8;
            }

            __fp16* outptr0 = top_blob.channel((i + ii) / out_elempack).row<__fp16>(ti * 2) + (tj * 2) * out_elempack;

            for (int m = 0; m < 2; m++)
            {
                if (ti * 2 + m >= outh)
                    continue;

                float16x8_t _r0 = vld1q_f16(tmp[m][0]);
                float16x8_t _r1 = vld1q_f16(tmp[m][1]);
                float16x8_t _r2 = vld1q_f16(tmp[m][2]);
                float16x8_t _r3 = vld1q_f16(tmp[m][3]);

                float16x8_t _tmp0 = vaddq_f16(_bias0, vaddq_f16(vaddq_f16(_r0, _r1), _r2));
                float16x8_t _tmp1 = vaddq_f16(_bias0, vaddq_f16(vsubq_f16(_r1, _r2), _r3));

                if (out_elempack == 8)
                {
                    vst1q_f16(outptr0, _tmp0);
                    if (tj * 2 + 1 < outw)
                    {
                        vst1q_f16(outptr0 + 8, _tmp1);
                    }
                }
                if (out_elempack == 4)
                {
                    __fp16* outptr1 = outptr0 + N;

                    vst1_f16(outptr0, vget_low_f16(_tmp0));
                    vst1_f16(outptr1, vget_high_f16(_tmp0));
                    if (tj * 2 + 1 < outw)
                    {
                        vst1_f16(outptr0 + 4, vget_low_f16(_tmp1));
                        vst1_f16(outptr1 + 4, vget_high_f16(_tmp1));
                    }
                }
                if (out_elempack == 1)
                {
                    __fp16 tmp0[8];
                    __fp16 tmp1[8];
                    vst1q_f16(tmp0, _tmp0);
                    vst1q_f16(tmp1, _tmp1);

                    __fp16* outptr1 = outptr0 + N;
                    __fp16* outptr2 = outptr0 + N * 2;
                    __fp16* outptr3 = outptr0 + N * 3;
                    __fp16* outptr4 = outptr0 + N * 4;
                    __fp16* outptr5 = outptr0 + N * 5;
                    __fp16* outptr6 = outptr0 + N * 6;
                    __fp16* outptr7 = outptr0 + N * 7;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];
                    outptr4[0] = tmp0[4];
                    outptr5[0] = tmp0[5];
                    outptr6[0] = tmp0[6];
                    outptr7[0] = tmp0[7];

                    if (tj * 2 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                        outptr4[1] = tmp1[4];
                        outptr5[1] = tmp1[5];
                        outptr6[1] = tmp1[6];
                        outptr7[1] = tmp1[7];
                    }
                }

                outptr0 += outw * out_elempack;
            }
        }
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        float16x4_t _bias0 = biasptr ? vld1_f16(biasptr + i + ii) : vdup_n_f16(0.f);

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        __fp16 tmp[2][4][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const __fp16* r0 = (const __fp16*)top_tile + ii * max_jj * 16 + jj * 4;
            const __fp16* r1 = r0 + max_jj * 4;
            const __fp16* r2 = r0 + max_jj * 4 * 2;
            const __fp16* r3 = r0 + max_jj * 4 * 3;

            for (int m = 0; m < 4; m++)
            {
                float16x4_t _r0 = vld1_f16(r0);
                float16x4_t _r1 = vld1_f16(r1);
                float16x4_t _r2 = vld1_f16(r2);
                float16x4_t _r3 = vld1_f16(r3);

                float16x4_t _tmp0 = vadd_f16(vadd_f16(_r0, _r1), _r2);
                float16x4_t _tmp1 = vadd_f16(vsub_f16(_r1, _r2), _r3);

                vst1_f16(tmp[0][m], _tmp0);
                vst1_f16(tmp[1][m], _tmp1);

                r0 += max_jj * 4 * 4;
                r1 += max_jj * 4 * 4;
                r2 += max_jj * 4 * 4;
                r3 += max_jj * 4 * 4;
            }

            __fp16* outptr0 = top_blob.channel((i + ii) / out_elempack).row<__fp16>(ti * 2) + (tj * 2) * out_elempack;

            for (int m = 0; m < 2; m++)
            {
                if (ti * 2 + m >= outh)
                    continue;

                float16x4_t _r0 = vld1_f16(tmp[m][0]);
                float16x4_t _r1 = vld1_f16(tmp[m][1]);
                float16x4_t _r2 = vld1_f16(tmp[m][2]);
                float16x4_t _r3 = vld1_f16(tmp[m][3]);

                float16x4_t _tmp0 = vadd_f16(_bias0, vadd_f16(vadd_f16(_r0, _r1), _r2));
                float16x4_t _tmp1 = vadd_f16(_bias0, vadd_f16(vsub_f16(_r1, _r2), _r3));

                if (out_elempack == 4)
                {
                    vst1_f16(outptr0, _tmp0);
                    if (tj * 2 + 1 < outw) vst1_f16(outptr0 + 4, _tmp1);
                }
                if (out_elempack == 1)
                {
                    __fp16 tmp0[4];
                    __fp16 tmp1[4];
                    vst1_f16(tmp0, _tmp0);
                    vst1_f16(tmp1, _tmp1);

                    __fp16* outptr1 = outptr0 + N;
                    __fp16* outptr2 = outptr0 + N * 2;
                    __fp16* outptr3 = outptr0 + N * 3;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];

                    if (tj * 2 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                    }
                }

                outptr0 += outw * out_elempack;
            }
        }
    }
    for (; ii + 1 < max_ii; ii += 2)
    {
        __fp16 bias0 = biasptr ? biasptr[i + ii] : 0.f;
        __fp16 bias1 = biasptr ? biasptr[i + ii + 1] : 0.f;

#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        __fp16 tmp[2][4][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const __fp16* r0 = (const __fp16*)top_tile + ii * max_jj * 16 + jj * 2;
            const __fp16* r1 = r0 + max_jj * 2;
            const __fp16* r2 = r0 + max_jj * 2 * 2;
            const __fp16* r3 = r0 + max_jj * 2 * 3;

            for (int m = 0; m < 4; m++)
            {
                tmp[0][m][0] = r0[0] + r1[0] + r2[0];
                tmp[0][m][1] = r0[1] + r1[1] + r2[1];
                tmp[1][m][0] = r1[0] - r2[0] + r3[0];
                tmp[1][m][1] = r1[1] - r2[1] + r3[1];

                r0 += max_jj * 4 * 2;
                r1 += max_jj * 4 * 2;
                r2 += max_jj * 4 * 2;
                r3 += max_jj * 4 * 2;
            }

            __fp16* outptr0 = top_blob.channel(i + ii).row<__fp16>(ti * 2) + (tj * 2);

            for (int m = 0; m < 2; m++)
            {
                if (ti * 2 + m >= outh)
                    continue;

                __fp16 r00 = tmp[m][0][0];
                __fp16 r01 = tmp[m][0][1];
                __fp16 r10 = tmp[m][1][0];
                __fp16 r11 = tmp[m][1][1];
                __fp16 r20 = tmp[m][2][0];
                __fp16 r21 = tmp[m][2][1];
                __fp16 r30 = tmp[m][3][0];
                __fp16 r31 = tmp[m][3][1];

                __fp16 tmp00 = bias0 + r00 + r10 + r20;
                __fp16 tmp01 = bias1 + r01 + r11 + r21;
                __fp16 tmp10 = bias0 + r10 - r20 + r30;
                __fp16 tmp11 = bias1 + r11 - r21 + r31;

                // if (out_elempack == 1)
                {
                    __fp16* outptr1 = outptr0 + N;

                    outptr0[0] = tmp00;
                    outptr1[0] = tmp01;
                    if (tj * 2 + 1 < outw)
                    {
                        outptr0[1] = tmp10;
                        outptr1[1] = tmp11;
                    }
                }

                outptr0 += outw;
            }
        }
    }
    for (; ii < max_ii; ii++)
    {
        __fp16 bias0 = biasptr ? biasptr[i + ii] : 0.f;

        __fp16 tmp[2][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const __fp16* r0 = (const __fp16*)top_tile + ii * max_jj * 16 + jj;
            const __fp16* r1 = r0 + max_jj;
            const __fp16* r2 = r0 + max_jj * 2;
            const __fp16* r3 = r0 + max_jj * 3;

            for (int m = 0; m < 4; m++)
            {
                tmp[0][m] = r0[0] + r1[0] + r2[0];
                tmp[1][m] = r1[0] - r2[0] + r3[0];

                r0 += max_jj * 4;
                r1 += max_jj * 4;
                r2 += max_jj * 4;
                r3 += max_jj * 4;
            }

            __fp16* outptr0 = top_blob.channel(i + ii).row<__fp16>(ti * 2) + (tj * 2);

            for (int m = 0; m < 2; m++)
            {
                if (ti * 2 + m >= outh)
                    continue;

                __fp16 r0 = tmp[m][0];
                __fp16 r1 = tmp[m][1];
                __fp16 r2 = tmp[m][2];
                __fp16 r3 = tmp[m][3];

                __fp16 tmp0 = bias0 + r0 + r1 + r2;
                __fp16 tmp1 = bias0 + r1 - r2 + r3;

                // if (out_elempack == 1)
                {
                    outptr0[0] = tmp0;
                    if (tj * 2 + 1 < outw) outptr0[1] = tmp1;
                }

                outptr0 += outw;
            }
        }
    }
}

// 3x3 stride-1 convolution through the winograd F(2,3) path, fp16 storage and arithmetic.
//
// The work is organised as a tiled GEMM with
//   M = output channels (top_blob.c * top_blob.elempack)
//   N = number of 2x2 output tiles
//   K = input channels (bottom_blob.c * bottom_blob.elempack)
//   B = 16, forwarded unchanged to the tile helpers
//
// Steps:
//   1. transform every (N tile, K tile) block of bottom_blob and pack it into BT
//   2. for every M tile, accumulate over K into a per-thread top_tile and
//      run the output transform (which also receives bias) into top_blob
//
// AT is read as AT.channel(m_tile).depth(k_tile); it is presumably the packed kernel
// produced by the matching transform_kernel routine - confirm at the call site.
//
// Returns 0 on success, -100 when one of the workspace Mats is empty after construction.
static int conv3x3s1_winograd23_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
    // pad to 2n+2, winograd F(2,3)
    const int w_tiles = (top_blob.w + 1) / 2;
    const int h_tiles = (top_blob.h + 1) / 2;

    const int M = top_blob.c * top_blob.elempack;
    const int N = w_tiles * h_tiles;
    const int K = bottom_blob.c * bottom_blob.elempack;
    const int B = 16;

    // NCNN_LOGE("conv3x3s1_winograd23_fp16sa %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    conv3x3s1_winograd_get_optimal_tile_mnk_fp16(M, N, K, B, TILE_M, TILE_N, TILE_K, nT);

    // number of tiles along each GEMM dimension (rounded up)
    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;
    const int nn_NK = nn_N * nn_K;

    // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

    // packed transformed input, addressed as BT.channel(n_tile).depth(k_tile)
    Mat BT(TILE_K * TILE_N, B, nn_K, nn_N, 2u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    if (nT <= 1 || nn_NK >= nT)
    {
        // enough (N, K) blocks to feed every thread: parallelise across blocks,
        // each thread owns one scratch channel and the helpers run with 1 thread
        Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator);
        if (B_tileX.empty())
            return -100;

        #pragma omp parallel for num_threads(nT)
        for (int blk = 0; blk < nn_NK; blk++)
        {
            const int pj = blk / nn_K;
            const int pk = blk % nn_K;

            const int j = pj * TILE_N;
            const int k = pk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            Mat B_tile = B_tileX.channel(get_omp_thread_num());

            // transform input
            conv3x3s1_winograd23_transform_input_tile_fp16sa(bottom_blob, B_tile, j, max_jj, k, max_kk, 1);

            Mat BT_tile = BT.channel(pj).depth(pk);

            conv3x3s1_winograd_transpose_pack_B_tile_fp16(B_tile, BT_tile, B, max_jj, max_kk, 1);
        }
    }
    else
    {
        // fewer blocks than threads: walk the blocks serially and hand
        // all nT threads to the helpers instead
        Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator);
        if (B_tile.empty())
            return -100;

        for (int blk = 0; blk < nn_NK; blk++)
        {
            const int pj = blk / nn_K;
            const int pk = blk % nn_K;

            const int j = pj * TILE_N;
            const int k = pk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            // transform input
            conv3x3s1_winograd23_transform_input_tile_fp16sa(bottom_blob, B_tile, j, max_jj, k, max_kk, nT);

            Mat BT_tile = BT.channel(pj).depth(pk);

            conv3x3s1_winograd_transpose_pack_B_tile_fp16(B_tile, BT_tile, B, max_jj, max_kk, nT);
        }
    }

    // per-thread accumulator for one (M tile, N tile) GEMM result
    Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 2u, opt.workspace_allocator);
    if (top_tileX.empty())
        return -100;

    #pragma omp parallel for num_threads(nT)
    for (int pi = 0; pi < nn_M; pi++)
    {
        const int i = pi * TILE_M;
        const int max_ii = std::min((M - i), TILE_M);

        Mat top_tile = top_tileX.channel(get_omp_thread_num());

        for (int pj = 0; pj < nn_N; pj++)
        {
            const int j = pj * TILE_N;
            const int max_jj = std::min((N - j), TILE_N);

            // accumulate over the whole K range; k is forwarded to the gemm helper
            for (int pk = 0; pk < nn_K; pk++)
            {
                const int k = pk * TILE_K;
                const int max_kk = std::min((K - k), TILE_K);

                const Mat AT_tile = AT.channel(pi).depth(pk);

                const Mat BT_tile = BT.channel(pj).depth(pk);

                conv3x3s1_winograd_gemm_transB_packed_tile_fp16sa(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk, opt.use_a53_a55_optimized_kernel);
            }

            // transform output
            conv3x3s1_winograd23_transform_output_tile_fp16sa(top_tile, top_blob, bias, i, max_ii, j, max_jj);
        }
    }

    return 0;
}

static inline void conv3x3s1_winograd43_transform_kernel_tile_fp16sa(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
{
    __fp16* ptmp = A;

    int ii = 0;
    for (; ii < max_ii; ii++)
    {
        int kk = 0;
        for (; kk < max_kk; kk++)
        {
            const float sq2 = 1.41421356237f;
            // const float ktm[6][3] = {
            //     {1.0f, 0.0f, 0.0f},
            //     {-2.0f / 3, -sq2 / 3, -1.0f / 3},
            //     {-2.0f / 3, sq2 / 3, -1.0f / 3},
            //     {1.0f / 6, sq2 / 6, 1.0f / 3},
            //     {1.0f / 6, -sq2 / 6, 1.0f / 3},
            //     {0.0f, 0.0f, 1.0f}
            // };
            const float ktm0 = 2.0f / 3;
            const float ktm1 = sq2 / 3;
            const float ktm2 = 1.0f / 3;
            const float ktm3 = 1.0f / 6;
            const float ktm4 = sq2 / 6;

            float tmp[6][3];

            const float* k0 = (const float*)kernel + (i + ii) * inch * 9 + (k + kk) * 9;

            for (int m = 0; m < 3; m++)
            {
                float r0 = k0[0];
                float r1 = k0[1];
                float r2 = k0[2];

                tmp[0][m] = r0;
                tmp[1][m] = -r0 * ktm0 - r1 * ktm1 - r2 * ktm2;
                tmp[2][m] = -r0 * ktm0 + r1 * ktm1 - r2 * ktm2;
                tmp[3][m] = r0 * ktm3 + r1 * ktm4 + r2 * ktm2;
                tmp[4][m] = r0 * ktm3 - r1 * ktm4 + r2 * ktm2;
                tmp[5][m] = r2;

                k0 += 3;
            }

            for (int m = 0; m < 6; m++)
            {
                float r0 = tmp[m][0];
                float r1 = tmp[m][1];
                float r2 = tmp[m][2];

                float z0 = r0;
                float z1 = -r0 * ktm0 - r1 * ktm1 - r2 * ktm2;
                float z2 = -r0 * ktm0 + r1 * ktm1 - r2 * ktm2;
                float z3 = r0 * ktm3 + r1 * ktm4 + r2 * ktm2;
                float z4 = r0 * ktm3 - r1 * ktm4 + r2 * ktm2;
                float z5 = r2;

                ptmp[0] = (__fp16)z0;
                ptmp[1] = (__fp16)z1;
                ptmp[2] = (__fp16)z2;
                ptmp[3] = (__fp16)z3;
                ptmp[4] = (__fp16)z4;
                ptmp[5] = (__fp16)z5;
                ptmp += 6;
            }
        }
    }
}

// Build the packed winograd F(4,3) kernel AT from a float 3x3 kernel.
//
// kernel   source weights, forwarded to the per-tile transform
// AT       output; (re)created here with 2-byte elements as
//          w = TILE_K * TILE_M, h = B, d = number of K tiles, c = number of M tiles
// inch     input channel count  (GEMM K)
// outch    output channel count (GEMM M)
// opt      only opt.num_threads is read: it sizes the scratch buffer and the parallel loop
//
// Each thread transforms one M tile at a time into its own scratch channel and
// then packs it into the matching AT.channel(m_tile).depth(k_tile) slot.
static void conv3x3s1_winograd43_transform_kernel_fp16sa(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt)
{
    const int M = outch;
    const int K = inch;
    const int B = 36;

    // N is passed as 0 here; TILE_N is not used by this routine
    int TILE_M, TILE_N, TILE_K;
    conv3x3s1_winograd_get_optimal_tile_mnk_fp16(M, 0, K, B, TILE_M, TILE_N, TILE_K, opt.num_threads);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // one scratch tile per thread
    Mat A_tileX(B * TILE_M * TILE_K, 1, opt.num_threads, (size_t)2u);

    AT.create(TILE_K * TILE_M, B, nn_K, nn_M, (size_t)2u);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pi = 0; pi < nn_M; pi++)
    {
        const int i = pi * TILE_M;
        const int max_ii = std::min((M - i), TILE_M);

        Mat A_tile = A_tileX.channel(get_omp_thread_num());

        for (int pk = 0; pk < nn_K; pk++)
        {
            const int k = pk * TILE_K;
            const int max_kk = std::min((K - k), TILE_K);

            conv3x3s1_winograd43_transform_kernel_tile_fp16sa(kernel, A_tile, inch, i, max_ii, k, max_kk);

            Mat AT_tile = AT.channel(pi).depth(pk);

            conv3x3s1_winograd_pack_A_tile_fp16(A_tile, AT_tile, B, max_ii, max_kk);
        }
    }
}

static inline void conv3x3s1_winograd43_transform_input_tile_fp16sa(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
{
    const __fp16 sq2 = 1.41421356237;
    const __fp16 msq2_d2 = -1.41421356237 / 2;

    // const float itm[6][6] = {
    //     {1.0f,  0.0f,  -2.5f,  0.0f,  1.0f, 0.0f},
    //     {0.0f, -sq2,   -2.0f,  sq2/2, 1.0f, 0.0f},
    //     {0.0f,  sq2,   -2.0f, -sq2/2, 1.0f, 0.0f},
    //     {0.0f, -sq2/2, -0.5f,  sq2,   1.0f, 0.0f},
    //     {0.0f,  sq2/2, -0.5f, -sq2,   1.0f, 0.0f},
    //     {0.0f,  1.0f,   0.0f,  -2.5f, 0.0f, 1.0f}
    // };

    // 0 =  r00 + r04 - 2.5f * r02
    // 1 = -(sq2 * r01 - sq2_d2 * r03) + (r04 - 2 * r02)
    // 2 =  (sq2 * r01 - sq2_d2 * r03) + (r04 - 2 * r02)
    // 3 =  (sq2 * r03 - sq2_d2 * r01) + (r04 - 0.5f * r02)
    // 4 = -(sq2 * r03 - sq2_d2 * r01) + (r04 - 0.5f * r02)
    // 5 =  r01 + r05 - 2.5f * r03

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int elempack = bottom_blob.elempack;
    const int N = bottom_blob.cstep * elempack;

    const int w_tiles = (w + 1) / 4;

    int nn_max_kk = 0;
    int remain_max_kk_start = 0;
    nn_max_kk = (max_kk - remain_max_kk_start) / 8;
    #pragma omp parallel for num_threads(nT)
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 8;

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        __fp16 tmp[6][6][8];

        const __fp16 coeffs[8] = {sq2, msq2_d2, -2.f, -0.5f, -2.5f, 0.f, 0.f, 0.f};
        float16x8_t _coeffs = vld1q_f16(coeffs);

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const __fp16* r0 = bottom_blob.channel((k + kk) / elempack).row<const __fp16>(ti * 4) + (tj * 4) * elempack;

            for (int m = 0; m < 6; m++)
            {
                float16x8_t _r0 = vdupq_n_f16(0.f);
                float16x8_t _r1 = vdupq_n_f16(0.f);
                float16x8_t _r2 = vdupq_n_f16(0.f);
                float16x8_t _r3 = vdupq_n_f16(0.f);
                float16x8_t _r4 = vdupq_n_f16(0.f);
                float16x8_t _r5 = vdupq_n_f16(0.f);

                if (ti * 4 + m < h)
                {
                    if (elempack == 8)
                    {
                        _r0 = vld1q_f16(r0);
                        if (tj * 4 + 1 < w) _r1 = vld1q_f16(r0 + 8);
                        if (tj * 4 + 2 < w) _r2 = vld1q_f16(r0 + 16);
                        if (tj * 4 + 3 < w) _r3 = vld1q_f16(r0 + 24);
                        if (tj * 4 + 4 < w) _r4 = vld1q_f16(r0 + 32);
                        if (tj * 4 + 5 < w) _r5 = vld1q_f16(r0 + 40);
                    }
                    if (elempack == 4)
                    {
                        const __fp16* r1 = r0 + N;

                        _r0 = vcombine_f16(vld1_f16(r0), vld1_f16(r1));
                        if (tj * 4 + 1 < w)
                        {
                            _r1 = vcombine_f16(vld1_f16(r0 + 4), vld1_f16(r1 + 4));
                        }
                        if (tj * 4 + 2 < w)
                        {
                            _r2 = vcombine_f16(vld1_f16(r0 + 8), vld1_f16(r1 + 8));
                        }
                        if (tj * 4 + 3 < w)
                        {
                            _r3 = vcombine_f16(vld1_f16(r0 + 12), vld1_f16(r1 + 12));
                        }
                        if (tj * 4 + 4 < w)
                        {
                            _r4 = vcombine_f16(vld1_f16(r0 + 16), vld1_f16(r1 + 16));
                        }
                        if (tj * 4 + 5 < w)
                        {
                            _r5 = vcombine_f16(vld1_f16(r0 + 20), vld1_f16(r1 + 20));
                        }
                    }
                    if (elempack == 1)
                    {
                        const __fp16* r1 = r0 + N;
                        const __fp16* r2 = r0 + N * 2;
                        const __fp16* r3 = r0 + N * 3;
                        const __fp16* r4 = r0 + N * 4;
                        const __fp16* r5 = r0 + N * 5;
                        const __fp16* r6 = r0 + N * 6;
                        const __fp16* r7 = r0 + N * 7;

                        float16x4_t _t0 = vld1_f16(r0);
                        float16x4_t _t1 = vld1_f16(r1);
                        float16x4_t _t2 = vld1_f16(r2);
                        float16x4_t _t3 = vld1_f16(r3);
                        float16x4_t _t4 = vld1_f16(r4);
                        float16x4_t _t5 = vld1_f16(r5);
                        float16x4_t _t6 = vld1_f16(r6);
                        float16x4_t _t7 = vld1_f16(r7);

                        transpose4x4_ph(_t0, _t1, _t2, _t3);
                        transpose4x4_ph(_t4, _t5, _t6, _t7);

                        _r0 = vcombine_f16(_t0, _t4);
                        if (tj * 4 + 1 < w)
                        {
                            _r1 = vcombine_f16(_t1, _t5);
                        }
                        if (tj * 4 + 2 < w)
                        {
                            _r2 = vcombine_f16(_t2, _t6);
                        }
                        if (tj * 4 + 3 < w)
                        {
                            _r3 = vcombine_f16(_t3, _t7);
                        }
                        if (tj * 4 + 4 < w)
                        {
                            __fp16 tmp[8] = {r0[4], r1[4], r2[4], r3[4], r4[4], r5[4], r6[4], r7[4]};
                            _r4 = vld1q_f16(tmp);
                        }
                        if (tj * 4 + 5 < w)
                        {
                            __fp16 tmp[8] = {r0[5], r1[5], r2[5], r3[5], r4[5], r5[5], r6[5], r7[5]};
                            _r5 = vld1q_f16(tmp);
                        }
                    }
                }

                float16x8_t _tmp12a = vfmaq_laneq_f16(vmulq_laneq_f16(_r1, _coeffs, 0), _r3, _coeffs, 1);
                float16x8_t _tmp12b = vfmaq_laneq_f16(_r4, _r2, _coeffs, 2);
                float16x8_t _tmp34a = vfmaq_laneq_f16(vmulq_laneq_f16(_r3, _coeffs, 0), _r1, _coeffs, 1);
                float16x8_t _tmp34b = vfmaq_laneq_f16(_r4, _r2, _coeffs, 3);

                float16x8_t _tmp0 = vfmaq_laneq_f16(vaddq_f16(_r0, _r4), _r2, _coeffs, 4);
                float16x8_t _tmp1 = vsubq_f16(_tmp12b, _tmp12a);
                float16x8_t _tmp2 = vaddq_f16(_tmp12b, _tmp12a);
                float16x8_t _tmp3 = vaddq_f16(_tmp34b, _tmp34a);
                float16x8_t _tmp4 = vsubq_f16(_tmp34b, _tmp34a);
                float16x8_t _tmp5 = vfmaq_laneq_f16(vaddq_f16(_r1, _r5), _r3, _coeffs, 4);

                vst1q_f16(tmp[0][m], _tmp0);
                vst1q_f16(tmp[1][m], _tmp1);
                vst1q_f16(tmp[2][m], _tmp2);
                vst1q_f16(tmp[3][m], _tmp3);
                vst1q_f16(tmp[4][m], _tmp4);
                vst1q_f16(tmp[5][m], _tmp5);

                r0 += w * elempack;
            }

            __fp16* p0 = (__fp16*)B + kk * max_jj * 36 + jj * 8;
            __fp16* p1 = p0 + max_jj * 8;
            __fp16* p2 = p0 + max_jj * 8 * 2;
            __fp16* p3 = p0 + max_jj * 8 * 3;
            __fp16* p4 = p0 + max_jj * 8 * 4;
            __fp16* p5 = p0 + max_jj * 8 * 5;

            for (int m = 0; m < 6; m++)
            {
                float16x8_t _r0 = vld1q_f16(tmp[m][0]);
                float16x8_t _r1 = vld1q_f16(tmp[m][1]);
                float16x8_t _r2 = vld1q_f16(tmp[m][2]);
                float16x8_t _r3 = vld1q_f16(tmp[m][3]);
                float16x8_t _r4 = vld1q_f16(tmp[m][4]);
                float16x8_t _r5 = vld1q_f16(tmp[m][5]);

                float16x8_t _tmp12a = vfmaq_laneq_f16(vmulq_laneq_f16(_r1, _coeffs, 0), _r3, _coeffs, 1);
                float16x8_t _tmp12b = vfmaq_laneq_f16(_r4, _r2, _coeffs, 2);
                float16x8_t _tmp34a = vfmaq_laneq_f16(vmulq_laneq_f16(_r3, _coeffs, 0), _r1, _coeffs, 1);
                float16x8_t _tmp34b = vfmaq_laneq_f16(_r4, _r2, _coeffs, 3);

                float16x8_t _tmp0 = vfmaq_laneq_f16(vaddq_f16(_r0, _r4), _r2, _coeffs, 4);
                float16x8_t _tmp1 = vsubq_f16(_tmp12b, _tmp12a);
                float16x8_t _tmp2 = vaddq_f16(_tmp12b, _tmp12a);
                float16x8_t _tmp3 = vaddq_f16(_tmp34b, _tmp34a);
                float16x8_t _tmp4 = vsubq_f16(_tmp34b, _tmp34a);
                float16x8_t _tmp5 = vfmaq_laneq_f16(vaddq_f16(_r1, _r5), _r3, _coeffs, 4);

                vst1q_f16(p0, _tmp0);
                vst1q_f16(p1, _tmp1);
                vst1q_f16(p2, _tmp2);
                vst1q_f16(p3, _tmp3);
                vst1q_f16(p4, _tmp4);
                vst1q_f16(p5, _tmp5);

                p0 += max_jj * 6 * 8;
                p1 += max_jj * 6 * 8;
                p2 += max_jj * 6 * 8;
                p3 += max_jj * 6 * 8;
                p4 += max_jj * 6 * 8;
                p5 += max_jj * 6 * 8;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 8;
    nn_max_kk = (max_kk - remain_max_kk_start) / 4;
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 4;

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        __fp16 tmp[6][6][4];

        const __fp16 coeffs[8] = {sq2, msq2_d2, -2.f, -0.5f, -2.5f, 0.f, 0.f, 0.f};
        float16x8_t _coeffs = vld1q_f16(coeffs);

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const __fp16* r0 = bottom_blob.channel((k + kk) / elempack).row<const __fp16>(ti * 4) + (tj * 4) * elempack;

            for (int m = 0; m < 6; m++)
            {
                float16x4_t _r0 = vdup_n_f16(0.f);
                float16x4_t _r1 = vdup_n_f16(0.f);
                float16x4_t _r2 = vdup_n_f16(0.f);
                float16x4_t _r3 = vdup_n_f16(0.f);
                float16x4_t _r4 = vdup_n_f16(0.f);
                float16x4_t _r5 = vdup_n_f16(0.f);

                if (ti * 4 + m < h)
                {
                    if (elempack == 4)
                    {
                        _r0 = vld1_f16(r0);
                        if (tj * 4 + 1 < w) _r1 = vld1_f16(r0 + 4);
                        if (tj * 4 + 2 < w) _r2 = vld1_f16(r0 + 8);
                        if (tj * 4 + 3 < w) _r3 = vld1_f16(r0 + 12);
                        if (tj * 4 + 4 < w) _r4 = vld1_f16(r0 + 16);
                        if (tj * 4 + 5 < w) _r5 = vld1_f16(r0 + 20);
                    }
                    if (elempack == 1)
                    {
                        const __fp16* r1 = r0 + N;
                        const __fp16* r2 = r0 + N * 2;
                        const __fp16* r3 = r0 + N * 3;

                        float16x4_t _t0 = vld1_f16(r0);
                        float16x4_t _t1 = vld1_f16(r1);
                        float16x4_t _t2 = vld1_f16(r2);
                        float16x4_t _t3 = vld1_f16(r3);

                        transpose4x4_ph(_t0, _t1, _t2, _t3);

                        _r0 = _t0;
                        if (tj * 4 + 1 < w) _r1 = _t1;
                        if (tj * 4 + 2 < w) _r2 = _t2;
                        if (tj * 4 + 3 < w) _r3 = _t3;
                        if (tj * 4 + 4 < w)
                        {
                            __fp16 tmp[4] = {r0[4], r1[4], r2[4], r3[4]};
                            _r4 = vld1_f16(tmp);
                        }
                        if (tj * 4 + 5 < w)
                        {
                            __fp16 tmp[4] = {r0[5], r1[5], r2[5], r3[5]};
                            _r5 = vld1_f16(tmp);
                        }
                    }
                }

                float16x4_t _tmp12a = vfma_laneq_f16(vmul_laneq_f16(_r1, _coeffs, 0), _r3, _coeffs, 1);
                float16x4_t _tmp12b = vfma_laneq_f16(_r4, _r2, _coeffs, 2);
                float16x4_t _tmp34a = vfma_laneq_f16(vmul_laneq_f16(_r3, _coeffs, 0), _r1, _coeffs, 1);
                float16x4_t _tmp34b = vfma_laneq_f16(_r4, _r2, _coeffs, 3);

                float16x4_t _tmp0 = vfma_laneq_f16(vadd_f16(_r0, _r4), _r2, _coeffs, 4);
                float16x4_t _tmp1 = vsub_f16(_tmp12b, _tmp12a);
                float16x4_t _tmp2 = vadd_f16(_tmp12b, _tmp12a);
                float16x4_t _tmp3 = vadd_f16(_tmp34b, _tmp34a);
                float16x4_t _tmp4 = vsub_f16(_tmp34b, _tmp34a);
                float16x4_t _tmp5 = vfma_laneq_f16(vadd_f16(_r1, _r5), _r3, _coeffs, 4);

                vst1_f16(tmp[0][m], _tmp0);
                vst1_f16(tmp[1][m], _tmp1);
                vst1_f16(tmp[2][m], _tmp2);
                vst1_f16(tmp[3][m], _tmp3);
                vst1_f16(tmp[4][m], _tmp4);
                vst1_f16(tmp[5][m], _tmp5);

                r0 += w * elempack;
            }

            __fp16* p0 = (__fp16*)B + kk * max_jj * 36 + jj * 4;
            __fp16* p1 = p0 + max_jj * 4;
            __fp16* p2 = p0 + max_jj * 4 * 2;
            __fp16* p3 = p0 + max_jj * 4 * 3;
            __fp16* p4 = p0 + max_jj * 4 * 4;
            __fp16* p5 = p0 + max_jj * 4 * 5;

            for (int m = 0; m < 6; m++)
            {
                float16x4_t _r0 = vld1_f16(tmp[m][0]);
                float16x4_t _r1 = vld1_f16(tmp[m][1]);
                float16x4_t _r2 = vld1_f16(tmp[m][2]);
                float16x4_t _r3 = vld1_f16(tmp[m][3]);
                float16x4_t _r4 = vld1_f16(tmp[m][4]);
                float16x4_t _r5 = vld1_f16(tmp[m][5]);

                float16x4_t _tmp12a = vfma_laneq_f16(vmul_laneq_f16(_r1, _coeffs, 0), _r3, _coeffs, 1);
                float16x4_t _tmp12b = vfma_laneq_f16(_r4, _r2, _coeffs, 2);
                float16x4_t _tmp34a = vfma_laneq_f16(vmul_laneq_f16(_r3, _coeffs, 0), _r1, _coeffs, 1);
                float16x4_t _tmp34b = vfma_laneq_f16(_r4, _r2, _coeffs, 3);

                float16x4_t _tmp0 = vfma_laneq_f16(vadd_f16(_r0, _r4), _r2, _coeffs, 4);
                float16x4_t _tmp1 = vsub_f16(_tmp12b, _tmp12a);
                float16x4_t _tmp2 = vadd_f16(_tmp12b, _tmp12a);
                float16x4_t _tmp3 = vadd_f16(_tmp34b, _tmp34a);
                float16x4_t _tmp4 = vsub_f16(_tmp34b, _tmp34a);
                float16x4_t _tmp5 = vfma_laneq_f16(vadd_f16(_r1, _r5), _r3, _coeffs, 4);

                vst1_f16(p0, _tmp0);
                vst1_f16(p1, _tmp1);
                vst1_f16(p2, _tmp2);
                vst1_f16(p3, _tmp3);
                vst1_f16(p4, _tmp4);
                vst1_f16(p5, _tmp5);

                p0 += max_jj * 6 * 4;
                p1 += max_jj * 6 * 4;
                p2 += max_jj * 6 * 4;
                p3 += max_jj * 6 * 4;
                p4 += max_jj * 6 * 4;
                p5 += max_jj * 6 * 4;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 4;
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 2;

#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        __fp16 tmp[6][6][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const __fp16* r0 = bottom_blob.channel(k + kk).row<const __fp16>(ti * 4) + (tj * 4);

            for (int m = 0; m < 6; m++)
            {
                __fp16 r00 = 0.f;
                __fp16 r01 = 0.f;
                __fp16 r10 = 0.f;
                __fp16 r11 = 0.f;
                __fp16 r20 = 0.f;
                __fp16 r21 = 0.f;
                __fp16 r30 = 0.f;
                __fp16 r31 = 0.f;
                __fp16 r40 = 0.f;
                __fp16 r41 = 0.f;
                __fp16 r50 = 0.f;
                __fp16 r51 = 0.f;

                if (ti * 4 + m < h)
                {
                    // if (elempack == 1)
                    {
                        const __fp16* r1 = r0 + N;

                        r00 = r0[0];
                        r01 = r1[0];
                        if (tj * 4 + 1 < w)
                        {
                            r10 = r0[1];
                            r11 = r1[1];
                        }
                        if (tj * 4 + 2 < w)
                        {
                            r20 = r0[2];
                            r21 = r1[2];
                        }
                        if (tj * 4 + 3 < w)
                        {
                            r30 = r0[3];
                            r31 = r1[3];
                        }
                        if (tj * 4 + 4 < w)
                        {
                            r40 = r0[4];
                            r41 = r1[4];
                        }
                        if (tj * 4 + 5 < w)
                        {
                            r50 = r0[5];
                            r51 = r1[5];
                        }
                    }
                }

                __fp16 tmp12a0 = sq2 * r10 + msq2_d2 * r30;
                __fp16 tmp12a1 = sq2 * r11 + msq2_d2 * r31;
                __fp16 tmp12b0 = r40 - (__fp16)2.f * r20;
                __fp16 tmp12b1 = r41 - (__fp16)2.f * r21;
                __fp16 tmp34a0 = sq2 * r30 + msq2_d2 * r10;
                __fp16 tmp34a1 = sq2 * r31 + msq2_d2 * r11;
                __fp16 tmp34b0 = r40 - (__fp16)0.5f * r20;
                __fp16 tmp34b1 = r41 - (__fp16)0.5f * r21;

                tmp[0][m][0] = r00 + r40 - (__fp16)2.5f * r20;
                tmp[0][m][1] = r01 + r41 - (__fp16)2.5f * r21;
                tmp[1][m][0] = tmp12b0 - tmp12a0;
                tmp[1][m][1] = tmp12b1 - tmp12a1;
                tmp[2][m][0] = tmp12b0 + tmp12a0;
                tmp[2][m][1] = tmp12b1 + tmp12a1;
                tmp[3][m][0] = tmp34b0 + tmp34a0;
                tmp[3][m][1] = tmp34b1 + tmp34a1;
                tmp[4][m][0] = tmp34b0 - tmp34a0;
                tmp[4][m][1] = tmp34b1 - tmp34a1;
                tmp[5][m][0] = r10 + r50 - (__fp16)2.5f * r30;
                tmp[5][m][1] = r11 + r51 - (__fp16)2.5f * r31;

                r0 += w;
            }

            __fp16* p0 = (__fp16*)B + kk * max_jj * 36 + jj * 2;
            __fp16* p1 = p0 + max_jj * 2;
            __fp16* p2 = p0 + max_jj * 2 * 2;
            __fp16* p3 = p0 + max_jj * 2 * 3;
            __fp16* p4 = p0 + max_jj * 2 * 4;
            __fp16* p5 = p0 + max_jj * 2 * 5;

            for (int m = 0; m < 6; m++)
            {
                __fp16 r00 = tmp[m][0][0];
                __fp16 r01 = tmp[m][0][1];
                __fp16 r10 = tmp[m][1][0];
                __fp16 r11 = tmp[m][1][1];
                __fp16 r20 = tmp[m][2][0];
                __fp16 r21 = tmp[m][2][1];
                __fp16 r30 = tmp[m][3][0];
                __fp16 r31 = tmp[m][3][1];
                __fp16 r40 = tmp[m][4][0];
                __fp16 r41 = tmp[m][4][1];
                __fp16 r50 = tmp[m][5][0];
                __fp16 r51 = tmp[m][5][1];

                __fp16 tmp12a0 = sq2 * r10 + msq2_d2 * r30;
                __fp16 tmp12a1 = sq2 * r11 + msq2_d2 * r31;
                __fp16 tmp12b0 = r40 - (__fp16)2.f * r20;
                __fp16 tmp12b1 = r41 - (__fp16)2.f * r21;
                __fp16 tmp34a0 = sq2 * r30 + msq2_d2 * r10;
                __fp16 tmp34a1 = sq2 * r31 + msq2_d2 * r11;
                __fp16 tmp34b0 = r40 - (__fp16)0.5f * r20;
                __fp16 tmp34b1 = r41 - (__fp16)0.5f * r21;

                p0[0] = r00 + r40 - (__fp16)2.5f * r20;
                p0[1] = r01 + r41 - (__fp16)2.5f * r21;
                p1[0] = tmp12b0 - tmp12a0;
                p1[1] = tmp12b1 - tmp12a1;
                p2[0] = tmp12b0 + tmp12a0;
                p2[1] = tmp12b1 + tmp12a1;
                p3[0] = tmp34b0 + tmp34a0;
                p3[1] = tmp34b1 + tmp34a1;
                p4[0] = tmp34b0 - tmp34a0;
                p4[1] = tmp34b1 - tmp34a1;
                p5[0] = r10 + r50 - (__fp16)2.5f * r30;
                p5[1] = r11 + r51 - (__fp16)2.5f * r31;

                p0 += max_jj * 6 * 2;
                p1 += max_jj * 6 * 2;
                p2 += max_jj * 6 * 2;
                p3 += max_jj * 6 * 2;
                p4 += max_jj * 6 * 2;
                p5 += max_jj * 6 * 2;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 2;
    for (int kk = remain_max_kk_start; kk < max_kk; kk++)
    {
        __fp16 tmp[6][6];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const __fp16* r0123 = bottom_blob.channel(k + kk).row<const __fp16>(ti * 4) + (tj * 4);

            for (int m = 0; m < 6; m++)
            {
                __fp16 r0 = 0.f;
                __fp16 r1 = 0.f;
                __fp16 r2 = 0.f;
                __fp16 r3 = 0.f;
                __fp16 r4 = 0.f;
                __fp16 r5 = 0.f;

                if (ti * 4 + m < h)
                {
                    // if (elempack == 1)
                    {
                        r0 = r0123[0];
                        if (tj * 4 + 1 < w) r1 = r0123[1];
                        if (tj * 4 + 2 < w) r2 = r0123[2];
                        if (tj * 4 + 3 < w) r3 = r0123[3];
                        if (tj * 4 + 4 < w) r4 = r0123[4];
                        if (tj * 4 + 5 < w) r5 = r0123[5];
                    }
                }

                __fp16 tmp12a = sq2 * r1 + msq2_d2 * r3;
                __fp16 tmp12b = r4 - (__fp16)2.f * r2;
                __fp16 tmp34a = sq2 * r3 + msq2_d2 * r1;
                __fp16 tmp34b = r4 - (__fp16)0.5f * r2;

                tmp[0][m] = r0 + r4 - (__fp16)2.5f * r2;
                tmp[1][m] = tmp12b - tmp12a;
                tmp[2][m] = tmp12b + tmp12a;
                tmp[3][m] = tmp34b + tmp34a;
                tmp[4][m] = tmp34b - tmp34a;
                tmp[5][m] = r1 + r5 - (__fp16)2.5f * r3;

                r0123 += w;
            }

            __fp16* p0 = (__fp16*)B + kk * max_jj * 36 + jj;
            __fp16* p1 = p0 + max_jj;
            __fp16* p2 = p0 + max_jj * 2;
            __fp16* p3 = p0 + max_jj * 3;
            __fp16* p4 = p0 + max_jj * 4;
            __fp16* p5 = p0 + max_jj * 5;

            for (int m = 0; m < 6; m++)
            {
                __fp16 r0 = tmp[m][0];
                __fp16 r1 = tmp[m][1];
                __fp16 r2 = tmp[m][2];
                __fp16 r3 = tmp[m][3];
                __fp16 r4 = tmp[m][4];
                __fp16 r5 = tmp[m][5];

                __fp16 tmp12a = sq2 * r1 + msq2_d2 * r3;
                __fp16 tmp12b = r4 - (__fp16)2.f * r2;
                __fp16 tmp34a = sq2 * r3 + msq2_d2 * r1;
                __fp16 tmp34b = r4 - (__fp16)0.5f * r2;

                p0[0] = r0 + r4 - (__fp16)2.5f * r2;
                p1[0] = tmp12b - tmp12a;
                p2[0] = tmp12b + tmp12a;
                p3[0] = tmp34b + tmp34a;
                p4[0] = tmp34b - tmp34a;
                p5[0] = r1 + r5 - (__fp16)2.5f * r3;

                p0 += max_jj * 6;
                p1 += max_jj * 6;
                p2 += max_jj * 6;
                p3 += max_jj * 6;
                p4 += max_jj * 6;
                p5 += max_jj * 6;
            }
        }
    }
}

// Winograd F(4,3) output transform for one block of output channels x tiles, computed in fp16.
//
// For every tile jj in [0, max_jj) and every output channel ii in [0, max_ii) the 6x6 (= 36)
// transform-domain values stored in top_tile are reduced to a 4x4 block of output pixels by
// applying the 4x6 matrix `otm` (see below) along both axes, the bias is added, and the
// result is written into top_blob. Pixels that fall outside outw/outh are not stored.
//
// top_tile  source buffer; read as __fp16 with the indexing
//           [channel group][36 planes][max_jj tiles][lanes of the group] (lanes = 8, 4, 2 or 1)
// top_blob  destination; w, h, elempack and cstep are read from it
// bias      per-output-channel bias read as __fp16; a null pointer means "no bias" (zero is used)
// i         index of the first output channel covered by top_tile
// max_ii    number of output channels in this block
// j         linear index of the first tile covered by top_tile (row-major over w_tiles)
// max_jj    number of tiles in this block
static inline void conv3x3s1_winograd43_transform_output_tile_fp16sa(const Mat& top_tile, Mat& top_blob, const Mat& bias, int i, int max_ii, int j, int max_jj)
{
    // scalar copies of the sqrt(2) based coefficients, used by the 2-channel and 1-channel paths
    const __fp16 sq2 = 1.41421356237;
    const __fp16 sq2_m2 = 1.41421356237 * 2;
    const __fp16 sq2_d2 = 1.41421356237 / 2;
    const __fp16 sq2_d4 = 1.41421356237 / 4;

    // const float otm[4][6] = {
    //     {1.0f, 1.0f,   1.0f,  1.0f,  1.0f,   0.0f},
    //     {0.0f, sq2/2, -sq2/2, sq2,   -sq2,   0.0f},
    //     {0.0f, 0.5f,   0.5f,  2.0f,  2.0f,   0.0f},
    //     {0.0f, sq2/4, -sq2/4, sq2*2, -sq2*2, 1.0f}
    // };

    // 0 = r00 + (r01 + r02) + (r03 + r04)
    // 1 =       (r01 - r02) * sq2_d2 + (r03 - r04) * sq2
    // 2 =       (r01 + r02) * 0.5f + (r03 + r04) * 2
    // 3 = r05 + (r01 - r02) * sq2_d4 + (r03 - r04) * sq2_m2

    // vector coefficient table for the *_laneq_f16 intrinsics:
    // lane 0 = sq2, 1 = sq2/2, 2 = sq2/4, 3 = sq2*2, 4 = 0.5, 5 = 2, lanes 6-7 are unused padding
    const __fp16 coeffs[8] = {sq2, sq2_d2, sq2_d4, sq2_m2, 0.5f, 2.f, 0.f, 0.f};
    float16x8_t _coeffs = vld1q_f16(coeffs);

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int out_elempack = top_blob.elempack;
    // N is used below as the pointer step (in __fp16 elements) from one top_blob channel to the next
    const int N = top_blob.cstep * out_elempack;

    // number of 4-pixel-wide tiles per output row, rounded up
    const int w_tiles = (outw + 3) / 4;

    const __fp16* biasptr = bias;

    // ---- 8 output channels at a time (one float16x8_t lane per channel) ----
    int ii = 0;
    for (; ii + 7 < max_ii; ii += 8)
    {
        float16x8_t _bias0 = biasptr ? vld1q_f16(biasptr + i + ii) : vdupq_n_f16(0.f);

        // intermediate between the two passes: tmp[first-pass output index][m][channel lane]
#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        __fp16 tmp[4][6][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            // tile row / tile column of the linear tile index
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // r0..r5 point at six consecutive planes of this tile; each plane holds max_jj * 8 values
            const __fp16* r0 = (const __fp16*)top_tile + ii * max_jj * 36 + jj * 8;
            const __fp16* r1 = r0 + max_jj * 8;
            const __fp16* r2 = r0 + max_jj * 8 * 2;
            const __fp16* r3 = r0 + max_jj * 8 * 3;
            const __fp16* r4 = r0 + max_jj * 8 * 4;
            const __fp16* r5 = r0 + max_jj * 8 * 5;

            // first pass: for each of the 6 plane groups reduce 6 values to 4, no bias yet
            for (int m = 0; m < 6; m++)
            {
                float16x8_t _r0 = vld1q_f16(r0);
                float16x8_t _r1 = vld1q_f16(r1);
                float16x8_t _r2 = vld1q_f16(r2);
                float16x8_t _r3 = vld1q_f16(r3);
                float16x8_t _r4 = vld1q_f16(r4);
                float16x8_t _r5 = vld1q_f16(r5);

                // shared sums / differences of the formulas above
                float16x8_t _tmp02a = vaddq_f16(_r1, _r2);
                float16x8_t _tmp02b = vaddq_f16(_r3, _r4);
                float16x8_t _tmp13a = vsubq_f16(_r1, _r2);
                float16x8_t _tmp13b = vsubq_f16(_r3, _r4);

                float16x8_t _tmp0 = vaddq_f16(vaddq_f16(_r0, _tmp02a), _tmp02b);
                float16x8_t _tmp1 = vfmaq_laneq_f16(vmulq_laneq_f16(_tmp13a, _coeffs, 1), _tmp13b, _coeffs, 0);
                float16x8_t _tmp2 = vfmaq_laneq_f16(vmulq_laneq_f16(_tmp02a, _coeffs, 4), _tmp02b, _coeffs, 5);
                float16x8_t _tmp3 = vfmaq_laneq_f16(vfmaq_laneq_f16(_r5, _tmp13a, _coeffs, 2), _tmp13b, _coeffs, 3);

                // stored transposed so the second pass can read tmp[row][0..5] contiguously
                vst1q_f16(tmp[0][m], _tmp0);
                vst1q_f16(tmp[1][m], _tmp1);
                vst1q_f16(tmp[2][m], _tmp2);
                vst1q_f16(tmp[3][m], _tmp3);

                // advance all six pointers to the next group of six planes
                r0 += max_jj * 6 * 8;
                r1 += max_jj * 6 * 8;
                r2 += max_jj * 6 * 8;
                r3 += max_jj * 6 * 8;
                r4 += max_jj * 6 * 8;
                r5 += max_jj * 6 * 8;
            }

            // top-left output pixel of this tile in the (possibly packed) destination channel
            __fp16* outptr0 = top_blob.channel((i + ii) / out_elempack).row<__fp16>(ti * 4) + (tj * 4) * out_elempack;

            // second pass: one output row per m, bias folded into the accumulation
            for (int m = 0; m < 4; m++)
            {
                // rows below the bottom edge are skipped (every later m is out of range as well)
                if (ti * 4 + m >= outh)
                    continue;

                float16x8_t _r0 = vld1q_f16(tmp[m][0]);
                float16x8_t _r1 = vld1q_f16(tmp[m][1]);
                float16x8_t _r2 = vld1q_f16(tmp[m][2]);
                float16x8_t _r3 = vld1q_f16(tmp[m][3]);
                float16x8_t _r4 = vld1q_f16(tmp[m][4]);
                float16x8_t _r5 = vld1q_f16(tmp[m][5]);

                float16x8_t _tmp02a = vaddq_f16(_r1, _r2);
                float16x8_t _tmp02b = vaddq_f16(_r3, _r4);
                float16x8_t _tmp13a = vsubq_f16(_r1, _r2);
                float16x8_t _tmp13b = vsubq_f16(_r3, _r4);

                float16x8_t _tmp0 = vaddq_f16(vaddq_f16(_r0, _tmp02a), vaddq_f16(_tmp02b, _bias0));
                float16x8_t _tmp1 = vfmaq_laneq_f16(vfmaq_laneq_f16(_bias0, _tmp13a, _coeffs, 1), _tmp13b, _coeffs, 0);
                float16x8_t _tmp2 = vfmaq_laneq_f16(vfmaq_laneq_f16(_bias0, _tmp02a, _coeffs, 4), _tmp02b, _coeffs, 5);
                float16x8_t _tmp3 = vfmaq_laneq_f16(vfmaq_laneq_f16(vaddq_f16(_r5, _bias0), _tmp13a, _coeffs, 2), _tmp13b, _coeffs, 3);

                // packed-by-8 destination: each pixel is 8 contiguous channel values
                if (out_elempack == 8)
                {
                    vst1q_f16(outptr0, _tmp0);
                    if (tj * 4 + 1 < outw) vst1q_f16(outptr0 + 8, _tmp1);
                    if (tj * 4 + 2 < outw) vst1q_f16(outptr0 + 16, _tmp2);
                    if (tj * 4 + 3 < outw) vst1q_f16(outptr0 + 24, _tmp3);
                }
                // packed-by-4 destination: low half goes to this channel, high half to the next one
                if (out_elempack == 4)
                {
                    __fp16* outptr1 = outptr0 + N;

                    vst1_f16(outptr0, vget_low_f16(_tmp0));
                    vst1_f16(outptr1, vget_high_f16(_tmp0));
                    if (tj * 4 + 1 < outw)
                    {
                        vst1_f16(outptr0 + 4, vget_low_f16(_tmp1));
                        vst1_f16(outptr1 + 4, vget_high_f16(_tmp1));
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        vst1_f16(outptr0 + 8, vget_low_f16(_tmp2));
                        vst1_f16(outptr1 + 8, vget_high_f16(_tmp2));
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        vst1_f16(outptr0 + 12, vget_low_f16(_tmp3));
                        vst1_f16(outptr1 + 12, vget_high_f16(_tmp3));
                    }
                }
                // unpacked destination: spill the vectors and scatter lane c to channel (i + ii + c)
                if (out_elempack == 1)
                {
                    __fp16 tmp0[8];
                    __fp16 tmp1[8];
                    __fp16 tmp2[8];
                    __fp16 tmp3[8];
                    vst1q_f16(tmp0, _tmp0);
                    vst1q_f16(tmp1, _tmp1);
                    vst1q_f16(tmp2, _tmp2);
                    vst1q_f16(tmp3, _tmp3);

                    __fp16* outptr1 = outptr0 + N;
                    __fp16* outptr2 = outptr0 + N * 2;
                    __fp16* outptr3 = outptr0 + N * 3;
                    __fp16* outptr4 = outptr0 + N * 4;
                    __fp16* outptr5 = outptr0 + N * 5;
                    __fp16* outptr6 = outptr0 + N * 6;
                    __fp16* outptr7 = outptr0 + N * 7;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];
                    outptr4[0] = tmp0[4];
                    outptr5[0] = tmp0[5];
                    outptr6[0] = tmp0[6];
                    outptr7[0] = tmp0[7];
                    if (tj * 4 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                        outptr4[1] = tmp1[4];
                        outptr5[1] = tmp1[5];
                        outptr6[1] = tmp1[6];
                        outptr7[1] = tmp1[7];
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        outptr0[2] = tmp2[0];
                        outptr1[2] = tmp2[1];
                        outptr2[2] = tmp2[2];
                        outptr3[2] = tmp2[3];
                        outptr4[2] = tmp2[4];
                        outptr5[2] = tmp2[5];
                        outptr6[2] = tmp2[6];
                        outptr7[2] = tmp2[7];
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        outptr0[3] = tmp3[0];
                        outptr1[3] = tmp3[1];
                        outptr2[3] = tmp3[2];
                        outptr3[3] = tmp3[3];
                        outptr4[3] = tmp3[4];
                        outptr5[3] = tmp3[5];
                        outptr6[3] = tmp3[6];
                        outptr7[3] = tmp3[7];
                    }
                }

                // next output row
                outptr0 += outw * out_elempack;
            }
        }
    }
    // ---- 4 output channels at a time (float16x4_t); same two passes as above ----
    for (; ii + 3 < max_ii; ii += 4)
    {
        float16x4_t _bias0 = biasptr ? vld1_f16(biasptr + i + ii) : vdup_n_f16(0.f);

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        __fp16 tmp[4][6][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // same plane layout as the 8-lane path, with 4 values per tile per plane
            const __fp16* r0 = (const __fp16*)top_tile + ii * max_jj * 36 + jj * 4;
            const __fp16* r1 = r0 + max_jj * 4;
            const __fp16* r2 = r0 + max_jj * 4 * 2;
            const __fp16* r3 = r0 + max_jj * 4 * 3;
            const __fp16* r4 = r0 + max_jj * 4 * 4;
            const __fp16* r5 = r0 + max_jj * 4 * 5;

            // first pass (no bias)
            for (int m = 0; m < 6; m++)
            {
                float16x4_t _r0 = vld1_f16(r0);
                float16x4_t _r1 = vld1_f16(r1);
                float16x4_t _r2 = vld1_f16(r2);
                float16x4_t _r3 = vld1_f16(r3);
                float16x4_t _r4 = vld1_f16(r4);
                float16x4_t _r5 = vld1_f16(r5);

                float16x4_t _tmp02a = vadd_f16(_r1, _r2);
                float16x4_t _tmp02b = vadd_f16(_r3, _r4);
                float16x4_t _tmp13a = vsub_f16(_r1, _r2);
                float16x4_t _tmp13b = vsub_f16(_r3, _r4);

                float16x4_t _tmp0 = vadd_f16(vadd_f16(_r0, _tmp02a), _tmp02b);
                float16x4_t _tmp1 = vfma_laneq_f16(vmul_laneq_f16(_tmp13a, _coeffs, 1), _tmp13b, _coeffs, 0);
                float16x4_t _tmp2 = vfma_laneq_f16(vmul_laneq_f16(_tmp02a, _coeffs, 4), _tmp02b, _coeffs, 5);
                float16x4_t _tmp3 = vfma_laneq_f16(vfma_laneq_f16(_r5, _tmp13a, _coeffs, 2), _tmp13b, _coeffs, 3);

                vst1_f16(tmp[0][m], _tmp0);
                vst1_f16(tmp[1][m], _tmp1);
                vst1_f16(tmp[2][m], _tmp2);
                vst1_f16(tmp[3][m], _tmp3);

                r0 += max_jj * 6 * 4;
                r1 += max_jj * 6 * 4;
                r2 += max_jj * 6 * 4;
                r3 += max_jj * 6 * 4;
                r4 += max_jj * 6 * 4;
                r5 += max_jj * 6 * 4;
            }

            __fp16* outptr0 = top_blob.channel((i + ii) / out_elempack).row<__fp16>(ti * 4) + (tj * 4) * out_elempack;

            // second pass (bias added), one output row per m
            for (int m = 0; m < 4; m++)
            {
                if (ti * 4 + m >= outh)
                    continue;

                float16x4_t _r0 = vld1_f16(tmp[m][0]);
                float16x4_t _r1 = vld1_f16(tmp[m][1]);
                float16x4_t _r2 = vld1_f16(tmp[m][2]);
                float16x4_t _r3 = vld1_f16(tmp[m][3]);
                float16x4_t _r4 = vld1_f16(tmp[m][4]);
                float16x4_t _r5 = vld1_f16(tmp[m][5]);

                float16x4_t _tmp02a = vadd_f16(_r1, _r2);
                float16x4_t _tmp02b = vadd_f16(_r3, _r4);
                float16x4_t _tmp13a = vsub_f16(_r1, _r2);
                float16x4_t _tmp13b = vsub_f16(_r3, _r4);

                float16x4_t _tmp0 = vadd_f16(vadd_f16(_r0, _tmp02a), vadd_f16(_tmp02b, _bias0));
                float16x4_t _tmp1 = vfma_laneq_f16(vfma_laneq_f16(_bias0, _tmp13a, _coeffs, 1), _tmp13b, _coeffs, 0);
                float16x4_t _tmp2 = vfma_laneq_f16(vfma_laneq_f16(_bias0, _tmp02a, _coeffs, 4), _tmp02b, _coeffs, 5);
                float16x4_t _tmp3 = vfma_laneq_f16(vfma_laneq_f16(vadd_f16(_r5, _bias0), _tmp13a, _coeffs, 2), _tmp13b, _coeffs, 3);

                // packed-by-4 destination: one vector store per pixel
                if (out_elempack == 4)
                {
                    vst1_f16(outptr0, _tmp0);
                    if (tj * 4 + 1 < outw) vst1_f16(outptr0 + 4, _tmp1);
                    if (tj * 4 + 2 < outw) vst1_f16(outptr0 + 8, _tmp2);
                    if (tj * 4 + 3 < outw) vst1_f16(outptr0 + 12, _tmp3);
                }
                // unpacked destination: scatter lane c to channel (i + ii + c)
                if (out_elempack == 1)
                {
                    __fp16 tmp0[4];
                    __fp16 tmp1[4];
                    __fp16 tmp2[4];
                    __fp16 tmp3[4];
                    vst1_f16(tmp0, _tmp0);
                    vst1_f16(tmp1, _tmp1);
                    vst1_f16(tmp2, _tmp2);
                    vst1_f16(tmp3, _tmp3);

                    __fp16* outptr1 = outptr0 + N;
                    __fp16* outptr2 = outptr0 + N * 2;
                    __fp16* outptr3 = outptr0 + N * 3;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];
                    if (tj * 4 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        outptr0[2] = tmp2[0];
                        outptr1[2] = tmp2[1];
                        outptr2[2] = tmp2[2];
                        outptr3[2] = tmp2[3];
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        outptr0[3] = tmp3[0];
                        outptr1[3] = tmp3[1];
                        outptr2[3] = tmp3[2];
                        outptr3[3] = tmp3[3];
                    }
                }

                outptr0 += outw * out_elempack;
            }
        }
    }
    // ---- 2 output channels at a time, scalar __fp16 arithmetic ----
    // NOTE(review): this path indexes top_blob.channel(i + ii) without dividing by out_elempack,
    // i.e. it presumes an unpacked (out_elempack == 1) destination - confirm against callers.
    for (; ii + 1 < max_ii; ii += 2)
    {
        __fp16 bias0 = biasptr ? biasptr[i + ii] : 0.f;
        __fp16 bias1 = biasptr ? biasptr[i + ii + 1] : 0.f;

#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        __fp16 tmp[4][6][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // two interleaved channel values per tile per plane
            const __fp16* r0 = (const __fp16*)top_tile + ii * max_jj * 36 + jj * 2;
            const __fp16* r1 = r0 + max_jj * 2;
            const __fp16* r2 = r0 + max_jj * 2 * 2;
            const __fp16* r3 = r0 + max_jj * 2 * 3;
            const __fp16* r4 = r0 + max_jj * 2 * 4;
            const __fp16* r5 = r0 + max_jj * 2 * 5;

            // first pass (no bias); suffix 0 / 1 selects the channel
            for (int m = 0; m < 6; m++)
            {
                __fp16 tmp02a0 = r1[0] + r2[0];
                __fp16 tmp02a1 = r1[1] + r2[1];
                __fp16 tmp02b0 = r3[0] + r4[0];
                __fp16 tmp02b1 = r3[1] + r4[1];
                __fp16 tmp13a0 = r1[0] - r2[0];
                __fp16 tmp13a1 = r1[1] - r2[1];
                __fp16 tmp13b0 = r3[0] - r4[0];
                __fp16 tmp13b1 = r3[1] - r4[1];

                tmp[0][m][0] = r0[0] + tmp02a0 + tmp02b0;
                tmp[0][m][1] = r0[1] + tmp02a1 + tmp02b1;
                tmp[1][m][0] = tmp13a0 * sq2_d2 + tmp13b0 * sq2;
                tmp[1][m][1] = tmp13a1 * sq2_d2 + tmp13b1 * sq2;
                tmp[2][m][0] = tmp02a0 * (__fp16)0.5f + tmp02b0 * (__fp16)2;
                tmp[2][m][1] = tmp02a1 * (__fp16)0.5f + tmp02b1 * (__fp16)2;
                tmp[3][m][0] = r5[0] + tmp13a0 * sq2_d4 + tmp13b0 * sq2_m2;
                tmp[3][m][1] = r5[1] + tmp13a1 * sq2_d4 + tmp13b1 * sq2_m2;

                r0 += max_jj * 6 * 2;
                r1 += max_jj * 6 * 2;
                r2 += max_jj * 6 * 2;
                r3 += max_jj * 6 * 2;
                r4 += max_jj * 6 * 2;
                r5 += max_jj * 6 * 2;
            }

            __fp16* outptr0 = top_blob.channel(i + ii).row<__fp16>(ti * 4) + (tj * 4);

            // second pass (bias added), one output row per m
            for (int m = 0; m < 4; m++)
            {
                if (ti * 4 + m >= outh)
                    continue;

                __fp16 r00 = tmp[m][0][0];
                __fp16 r01 = tmp[m][0][1];
                __fp16 r10 = tmp[m][1][0];
                __fp16 r11 = tmp[m][1][1];
                __fp16 r20 = tmp[m][2][0];
                __fp16 r21 = tmp[m][2][1];
                __fp16 r30 = tmp[m][3][0];
                __fp16 r31 = tmp[m][3][1];
                __fp16 r40 = tmp[m][4][0];
                __fp16 r41 = tmp[m][4][1];
                __fp16 r50 = tmp[m][5][0];
                __fp16 r51 = tmp[m][5][1];

                __fp16 tmp02a0 = r10 + r20;
                __fp16 tmp02a1 = r11 + r21;
                __fp16 tmp02b0 = r30 + r40;
                __fp16 tmp02b1 = r31 + r41;
                __fp16 tmp13a0 = r10 - r20;
                __fp16 tmp13a1 = r11 - r21;
                __fp16 tmp13b0 = r30 - r40;
                __fp16 tmp13b1 = r31 - r41;

                __fp16 tmp00 = bias0 + r00 + tmp02a0 + tmp02b0;
                __fp16 tmp01 = bias1 + r01 + tmp02a1 + tmp02b1;
                __fp16 tmp10 = bias0 + tmp13a0 * sq2_d2 + tmp13b0 * sq2;
                __fp16 tmp11 = bias1 + tmp13a1 * sq2_d2 + tmp13b1 * sq2;
                __fp16 tmp20 = bias0 + tmp02a0 * (__fp16)0.5f + tmp02b0 * (__fp16)2;
                __fp16 tmp21 = bias1 + tmp02a1 * (__fp16)0.5f + tmp02b1 * (__fp16)2;
                __fp16 tmp30 = bias0 + r50 + tmp13a0 * sq2_d4 + tmp13b0 * sq2_m2;
                __fp16 tmp31 = bias1 + r51 + tmp13a1 * sq2_d4 + tmp13b1 * sq2_m2;

                // if (out_elempack == 1)
                {
                    // second channel lives one channel step (N) after the first
                    __fp16* outptr1 = outptr0 + N;

                    outptr0[0] = tmp00;
                    outptr1[0] = tmp01;
                    if (tj * 4 + 1 < outw)
                    {
                        outptr0[1] = tmp10;
                        outptr1[1] = tmp11;
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        outptr0[2] = tmp20;
                        outptr1[2] = tmp21;
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        outptr0[3] = tmp30;
                        outptr1[3] = tmp31;
                    }
                }

                outptr0 += outw;
            }
        }
    }
    // ---- remaining single output channel, scalar __fp16 arithmetic ----
    // NOTE(review): like the 2-channel path this indexes top_blob.channel(i + ii) directly,
    // so it presumes out_elempack == 1 - confirm against callers.
    for (; ii < max_ii; ii++)
    {
        __fp16 bias0 = biasptr ? biasptr[i + ii] : 0.f;

        __fp16 tmp[4][6];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // one value per tile per plane
            const __fp16* r0 = (const __fp16*)top_tile + ii * max_jj * 36 + jj;
            const __fp16* r1 = r0 + max_jj;
            const __fp16* r2 = r0 + max_jj * 2;
            const __fp16* r3 = r0 + max_jj * 3;
            const __fp16* r4 = r0 + max_jj * 4;
            const __fp16* r5 = r0 + max_jj * 5;

            // first pass (no bias)
            for (int m = 0; m < 6; m++)
            {
                __fp16 tmp02a = r1[0] + r2[0];
                __fp16 tmp02b = r3[0] + r4[0];
                __fp16 tmp13a = r1[0] - r2[0];
                __fp16 tmp13b = r3[0] - r4[0];

                tmp[0][m] = r0[0] + tmp02a + tmp02b;
                tmp[1][m] = tmp13a * sq2_d2 + tmp13b * sq2;
                tmp[2][m] = tmp02a * (__fp16)0.5f + tmp02b * (__fp16)2;
                tmp[3][m] = r5[0] + tmp13a * sq2_d4 + tmp13b * sq2_m2;

                r0 += max_jj * 6;
                r1 += max_jj * 6;
                r2 += max_jj * 6;
                r3 += max_jj * 6;
                r4 += max_jj * 6;
                r5 += max_jj * 6;
            }

            __fp16* outptr0 = top_blob.channel(i + ii).row<__fp16>(ti * 4) + (tj * 4);

            // second pass (bias added), one output row per m
            for (int m = 0; m < 4; m++)
            {
                if (ti * 4 + m >= outh)
                    continue;

                __fp16 r0 = tmp[m][0];
                __fp16 r1 = tmp[m][1];
                __fp16 r2 = tmp[m][2];
                __fp16 r3 = tmp[m][3];
                __fp16 r4 = tmp[m][4];
                __fp16 r5 = tmp[m][5];

                __fp16 tmp02a = r1 + r2;
                __fp16 tmp02b = r3 + r4;
                __fp16 tmp13a = r1 - r2;
                __fp16 tmp13b = r3 - r4;

                __fp16 tmp0 = bias0 + r0 + tmp02a + tmp02b;
                __fp16 tmp1 = bias0 + tmp13a * sq2_d2 + tmp13b * sq2;
                __fp16 tmp2 = bias0 + tmp02a * (__fp16)0.5f + tmp02b * (__fp16)2;
                __fp16 tmp3 = bias0 + r5 + tmp13a * sq2_d4 + tmp13b * sq2_m2;

                // if (out_elempack == 1)
                {
                    // columns past the right edge of the output are dropped
                    outptr0[0] = tmp0;
                    if (tj * 4 + 1 < outw) outptr0[1] = tmp1;
                    if (tj * 4 + 2 < outw) outptr0[2] = tmp2;
                    if (tj * 4 + 3 < outw) outptr0[3] = tmp3;
                }

                outptr0 += outw;
            }
        }
    }
}

static int conv3x3s1_winograd43_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
    int outw = top_blob.w;
    int outh = top_blob.h;

    // pad to 4n+2, winograd F(4,3)
    int w_tiles = (outw + 3) / 4;
    int h_tiles = (outh + 3) / 4;
    int tiles = w_tiles * h_tiles;

    const int M = top_blob.c * top_blob.elempack;
    const int N = tiles;
    const int K = bottom_blob.c * bottom_blob.elempack;
    const int B = 36;

    // NCNN_LOGE("conv3x3s1_winograd43_fp16sa %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    conv3x3s1_winograd_get_optimal_tile_mnk_fp16(M, N, K, B, TILE_M, TILE_N, TILE_K, nT);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

    Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    if (nT > 1 && nn_NK < nT)
    {
        Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator);
        if (B_tile.empty())
            return -100;

        for (int ppjk = 0; ppjk < nn_NK; ppjk++)
        {
            const int ppj = ppjk / nn_K;
            const int ppk = ppjk % nn_K;

            const int j = ppj * TILE_N;
            const int k = ppk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            // transform input
            conv3x3s1_winograd43_transform_input_tile_fp16sa(bottom_blob, B_tile, j, max_jj, k, max_kk, nT);

            Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

            conv3x3s1_winograd_transpose_pack_B_tile_fp16(B_tile, BT_tile, B, max_jj, max_kk, nT);
        }
    }
    else
    {
        Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator);
        if (B_tileX.empty())
            return -100;

        #pragma omp parallel for num_threads(nT)
        for (int ppjk = 0; ppjk < nn_NK; ppjk++)
        {
            const int ppj = ppjk / nn_K;
            const int ppk = ppjk % nn_K;

            const int j = ppj * TILE_N;
            const int k = ppk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            Mat B_tile = B_tileX.channel(get_omp_thread_num());

            // transform input
            conv3x3s1_winograd43_transform_input_tile_fp16sa(bottom_blob, B_tile, j, max_jj, k, max_kk, 1);

            Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

            conv3x3s1_winograd_transpose_pack_B_tile_fp16(B_tile, BT_tile, B, max_jj, max_kk, 1);
        }
    }

    Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 2u, opt.workspace_allocator);
    if (top_tileX.empty())
        return -100;

    #pragma omp parallel for num_threads(nT)
    for (int ppj = 0; ppj < nn_M; ppj++)
    {
        const int i = ppj * TILE_M;

        Mat top_tile = top_tileX.channel(get_omp_thread_num());

        const int max_ii = std::min((M - i), TILE_M);

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                const Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K);

                const Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

                conv3x3s1_winograd_gemm_transB_packed_tile_fp16sa(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk, opt.use_a53_a55_optimized_kernel);
            }

            // transform output
            conv3x3s1_winograd43_transform_output_tile_fp16sa(top_tile, top_blob, bias, i, max_ii, j, max_jj);
        }
    }

    return 0;
}

static inline void conv3x3s1_winograd63_transform_kernel_tile_fp16sa(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
{
    __fp16* ptmp = A;

    int ii = 0;
    for (; ii < max_ii; ii++)
    {
        int kk = 0;
        for (; kk < max_kk; kk++)
        {
            // const float ktm[8][3] = {
            //     {1.0f, 0.0f, 0.0f},
            //     {-2.0f / 9, -2.0f / 9, -2.0f / 9},
            //     {-2.0f / 9, 2.0f / 9, -2.0f / 9},
            //     {1.0f / 90, 1.0f / 45, 2.0f / 45},
            //     {1.0f / 90, -1.0f / 45, 2.0f / 45},
            //     {1.0f / 45, 1.0f / 90, 1.0f / 180},
            //     {1.0f / 45, -1.0f / 90, 1.0f / 180},
            //     {0.0f, 0.0f, 1.0f}
            // };
            const float ktm0 = 2.0f / 9;
            const float ktm1 = 1.0f / 45;
            const float ktm2 = 2.0f / 45;
            const float ktm3 = 1.0f / 90;
            const float ktm4 = 1.0f / 180;

            float tmp[8][3];

            const float* k0 = (const float*)kernel + (i + ii) * inch * 9 + (k + kk) * 9;

            for (int m = 0; m < 3; m++)
            {
                float r0 = k0[0];
                float r1 = k0[1];
                float r2 = k0[2];

                tmp[0][m] = r0;
                tmp[1][m] = -r0 * ktm0 - r1 * ktm0 - r2 * ktm0;
                tmp[2][m] = -r0 * ktm0 + r1 * ktm0 - r2 * ktm0;
                tmp[3][m] = r0 * ktm3 + r1 * ktm1 + r2 * ktm2;
                tmp[4][m] = r0 * ktm3 - r1 * ktm1 + r2 * ktm2;
                tmp[5][m] = r0 * ktm1 + r1 * ktm3 + r2 * ktm4;
                tmp[6][m] = r0 * ktm1 - r1 * ktm3 + r2 * ktm4;
                tmp[7][m] = r2;

                k0 += 3;
            }

            for (int m = 0; m < 8; m++)
            {
                float r0 = tmp[m][0];
                float r1 = tmp[m][1];
                float r2 = tmp[m][2];

                float z0 = r0;
                float z1 = -r0 * ktm0 - r1 * ktm0 - r2 * ktm0;
                float z2 = -r0 * ktm0 + r1 * ktm0 - r2 * ktm0;
                float z3 = r0 * ktm3 + r1 * ktm1 + r2 * ktm2;
                float z4 = r0 * ktm3 - r1 * ktm1 + r2 * ktm2;
                float z5 = r0 * ktm1 + r1 * ktm3 + r2 * ktm4;
                float z6 = r0 * ktm1 - r1 * ktm3 + r2 * ktm4;
                float z7 = r2;

                ptmp[0] = (__fp16)z0;
                ptmp[1] = (__fp16)z1;
                ptmp[2] = (__fp16)z2;
                ptmp[3] = (__fp16)z3;
                ptmp[4] = (__fp16)z4;
                ptmp[5] = (__fp16)z5;
                ptmp[6] = (__fp16)z6;
                ptmp[7] = (__fp16)z7;
                ptmp += 8;
            }
        }
    }
}

// Transform all 3x3 kernels into packed fp16 Winograd coefficients.
//
// kernel  weights, forwarded to the per-tile transform
// AT      output; created here with one channel per TILE_M block of output
//         channels and one depth slice per TILE_K block of input channels
// inch    number of input channels  (K of the GEMM)
// outch   number of output channels (M of the GEMM)
// opt     supplies num_threads for tile sizing and the parallel loop
static void conv3x3s1_winograd63_transform_kernel_fp16sa(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt)
{
    const int M = outch;
    const int K = inch;
    const int B = 64; // 8x8 coefficients per transformed kernel

    int TILE_M, TILE_N, TILE_K;
    conv3x3s1_winograd_get_optimal_tile_mnk_fp16(M, 0, K, B, TILE_M, TILE_N, TILE_K, opt.num_threads);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // one scratch tile per thread, 2-byte elements
    Mat A_tileX(B * TILE_M * TILE_K, 1, opt.num_threads, (size_t)2u);

    AT.create(TILE_K * TILE_M, B, nn_K, nn_M, (size_t)2u);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int tile_m = 0; tile_m < nn_M; tile_m++)
    {
        const int i = tile_m * TILE_M;
        const int max_ii = std::min((M - i), TILE_M);

        Mat A_tile = A_tileX.channel(get_omp_thread_num());

        for (int tile_k = 0; tile_k < nn_K; tile_k++)
        {
            const int k = tile_k * TILE_K;
            const int max_kk = std::min((K - k), TILE_K);

            // transform into the thread-local scratch tile ...
            conv3x3s1_winograd63_transform_kernel_tile_fp16sa(kernel, A_tile, inch, i, max_ii, k, max_kk);

            // ... then pack it into its slot of AT
            Mat AT_tile = AT.channel(tile_m).depth(tile_k);

            conv3x3s1_winograd_pack_A_tile_fp16(A_tile, AT_tile, B, max_ii, max_kk);
        }
    }
}

// Apply the 8x8 input transform (the itm matrix below) along both dimensions
// of each 8x8 input patch and write the 64 resulting coefficients to B.
//
// bottom_blob  fp16 input; patches start at row ti * 6, column tj * 6 and
//              positions at or beyond h / w are treated as zero
// B            destination, written as __fp16 (layout described at each
//              store site below)
// j, max_jj    first tile index and tile count; a tile index t maps to
//              ti = t / w_tiles, tj = t % w_tiles
// k, max_kk    first input channel and channel count
// nT           thread count for the OpenMP loop over groups of 8 channels
//
// Channels are consumed in groups of 8, then 4, then 2, then 1. The 8-group
// path accepts elempack 8, 4 or 1, the 4-group path accepts elempack 4 or 1,
// and the 2- and 1-channel paths index bottom_blob as elempack 1.
static inline void conv3x3s1_winograd63_transform_input_tile_fp16sa(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
{
    // const float itm[8][8] = {
    //     {1.0f, 0.0f,-5.25f, 0.00f, 5.25f, 0.00f,-1.0f, 0.0f},
    //     {0.0f, 1.0f, 1.00f,-4.25f,-4.25f, 1.00f, 1.0f, 0.0f},
    //     {0.0f,-1.0f, 1.00f, 4.25f,-4.25f,-1.00f, 1.0f, 0.0f},
    //     {0.0f, 0.5f, 0.25f,-2.50f,-1.25f, 2.00f, 1.0f, 0.0f},
    //     {0.0f,-0.5f, 0.25f, 2.50f,-1.25f,-2.00f, 1.0f, 0.0f},
    //     {0.0f, 2.0f, 4.00f,-2.50f,-5.00f, 0.50f, 1.0f, 0.0f},
    //     {0.0f,-2.0f, 4.00f, 2.50f,-5.00f,-0.50f, 1.0f, 0.0f},
    //     {0.0f,-1.0f, 0.00f, 5.25f, 0.00f,-5.25f, 0.0f, 1.0f}
    // };

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int elempack = bottom_blob.elempack;
    // offset, in __fp16 elements, added below to step from one channel()
    // of bottom_blob to the next (presumably the channel stride -- confirm
    // against Mat::cstep)
    const int N = bottom_blob.cstep * elempack;

    // number of tiles per row; tiles advance by 6 columns
    const int w_tiles = (w + 3) / 6;

    // ---- groups of 8 channels (parallel over groups) ----
    int nn_max_kk = 0;
    int remain_max_kk_start = 0;
    nn_max_kk = (max_kk - remain_max_kk_start) / 8;
    #pragma omp parallel for num_threads(nT)
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 8;

        // tmp[c][m]: coefficient c of input row m after the first pass,
        // 8 channels per entry
#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        __fp16 tmp[8][8][8];

        // lane:           0      1       2       3      4     5    6    7
        const __fp16 coeffs[8] = {5.25f, -4.25f, -1.25f, 0.25f, -2.5f, 0.5f, 2.f, 4.f};
        float16x8_t _coeffs = vld1q_f16(coeffs);

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const __fp16* r0 = bottom_blob.channel((k + kk) / elempack).row<const __fp16>(ti * 6) + (tj * 6) * elempack;

            // first pass: for each of the 8 patch rows, combine its 8 columns
            for (int m = 0; m < 8; m++)
            {
                // columns/rows outside the image stay zero
                float16x8_t _r0 = vdupq_n_f16(0.f);
                float16x8_t _r1 = vdupq_n_f16(0.f);
                float16x8_t _r2 = vdupq_n_f16(0.f);
                float16x8_t _r3 = vdupq_n_f16(0.f);
                float16x8_t _r4 = vdupq_n_f16(0.f);
                float16x8_t _r5 = vdupq_n_f16(0.f);
                float16x8_t _r6 = vdupq_n_f16(0.f);
                float16x8_t _r7 = vdupq_n_f16(0.f);

                if (ti * 6 + m < h)
                {
                    if (elempack == 8)
                    {
                        // one 8-lane load per pixel
                        _r0 = vld1q_f16(r0);
                        if (tj * 6 + 1 < w) _r1 = vld1q_f16(r0 + 8);
                        if (tj * 6 + 2 < w) _r2 = vld1q_f16(r0 + 16);
                        if (tj * 6 + 3 < w) _r3 = vld1q_f16(r0 + 24);
                        if (tj * 6 + 4 < w) _r4 = vld1q_f16(r0 + 32);
                        if (tj * 6 + 5 < w) _r5 = vld1q_f16(r0 + 40);
                        if (tj * 6 + 6 < w) _r6 = vld1q_f16(r0 + 48);
                        if (tj * 6 + 7 < w) _r7 = vld1q_f16(r0 + 56);
                    }
                    if (elempack == 4)
                    {
                        // two 4-lane loads, N elements apart, joined per pixel
                        const __fp16* r1 = r0 + N;

                        _r0 = vcombine_f16(vld1_f16(r0), vld1_f16(r1));
                        if (tj * 6 + 1 < w)
                        {
                            _r1 = vcombine_f16(vld1_f16(r0 + 4), vld1_f16(r1 + 4));
                        }
                        if (tj * 6 + 2 < w)
                        {
                            _r2 = vcombine_f16(vld1_f16(r0 + 8), vld1_f16(r1 + 8));
                        }
                        if (tj * 6 + 3 < w)
                        {
                            _r3 = vcombine_f16(vld1_f16(r0 + 12), vld1_f16(r1 + 12));
                        }
                        if (tj * 6 + 4 < w)
                        {
                            _r4 = vcombine_f16(vld1_f16(r0 + 16), vld1_f16(r1 + 16));
                        }
                        if (tj * 6 + 5 < w)
                        {
                            _r5 = vcombine_f16(vld1_f16(r0 + 20), vld1_f16(r1 + 20));
                        }
                        if (tj * 6 + 6 < w)
                        {
                            _r6 = vcombine_f16(vld1_f16(r0 + 24), vld1_f16(r1 + 24));
                        }
                        if (tj * 6 + 7 < w)
                        {
                            _r7 = vcombine_f16(vld1_f16(r0 + 28), vld1_f16(r1 + 28));
                        }
                    }
                    if (elempack == 1)
                    {
                        // 8 separate channels: load 4 consecutive pixels from
                        // each and transpose so every vector holds one pixel
                        // across the 8 channels
                        const __fp16* r1 = r0 + N;
                        const __fp16* r2 = r0 + N * 2;
                        const __fp16* r3 = r0 + N * 3;
                        const __fp16* r4 = r0 + N * 4;
                        const __fp16* r5 = r0 + N * 5;
                        const __fp16* r6 = r0 + N * 6;
                        const __fp16* r7 = r0 + N * 7;

                        // NOTE(review): these loads always fetch 4 pixels even
                        // when fewer than 4 columns are inside the image; only
                        // the use of the result is guarded -- confirm the
                        // buffer is padded enough for the over-read
                        float16x4_t _t0 = vld1_f16(r0);
                        float16x4_t _t1 = vld1_f16(r1);
                        float16x4_t _t2 = vld1_f16(r2);
                        float16x4_t _t3 = vld1_f16(r3);
                        float16x4_t _t4 = vld1_f16(r4);
                        float16x4_t _t5 = vld1_f16(r5);
                        float16x4_t _t6 = vld1_f16(r6);
                        float16x4_t _t7 = vld1_f16(r7);

                        transpose4x4_ph(_t0, _t1, _t2, _t3);
                        transpose4x4_ph(_t4, _t5, _t6, _t7);

                        _r0 = vcombine_f16(_t0, _t4);
                        if (tj * 6 + 1 < w)
                        {
                            _r1 = vcombine_f16(_t1, _t5);
                        }
                        if (tj * 6 + 2 < w)
                        {
                            _r2 = vcombine_f16(_t2, _t6);
                        }
                        if (tj * 6 + 3 < w)
                        {
                            _r3 = vcombine_f16(_t3, _t7);
                        }
                        if (tj * 6 + 4 < w)
                        {
                            // second half of the row: pixels 4..7
                            _t0 = vld1_f16(r0 + 4);
                            _t1 = vld1_f16(r1 + 4);
                            _t2 = vld1_f16(r2 + 4);
                            _t3 = vld1_f16(r3 + 4);
                            _t4 = vld1_f16(r4 + 4);
                            _t5 = vld1_f16(r5 + 4);
                            _t6 = vld1_f16(r6 + 4);
                            _t7 = vld1_f16(r7 + 4);

                            transpose4x4_ph(_t0, _t1, _t2, _t3);
                            transpose4x4_ph(_t4, _t5, _t6, _t7);

                            _r4 = vcombine_f16(_t0, _t4);
                            if (tj * 6 + 5 < w)
                            {
                                _r5 = vcombine_f16(_t1, _t5);
                            }
                            if (tj * 6 + 6 < w)
                            {
                                _r6 = vcombine_f16(_t2, _t6);
                            }
                            if (tj * 6 + 7 < w)
                            {
                                _r7 = vcombine_f16(_t3, _t7);
                            }
                        }
                    }
                }

                // even-indexed ("a") and odd-indexed ("b") partial sums shared
                // by the row pairs 1/2, 3/4 and 5/6 of itm
                float16x8_t _tmp12a = vfmaq_laneq_f16(vaddq_f16(_r2, _r6), _r4, _coeffs, 1);
                float16x8_t _tmp12b = vfmaq_laneq_f16(vaddq_f16(_r1, _r5), _r3, _coeffs, 1);
                float16x8_t _tmp34a = vfmaq_laneq_f16(vfmaq_laneq_f16(_r6, _r2, _coeffs, 3), _r4, _coeffs, 2);
                float16x8_t _tmp34b = vfmaq_laneq_f16(vfmaq_laneq_f16(vmulq_laneq_f16(_r1, _coeffs, 5), _r3, _coeffs, 4), _r5, _coeffs, 6);
                // r6 + 4 * (r2 - 1.25 * r4) = 4 * r2 - 5 * r4 + r6
                float16x8_t _tmp56a = vfmaq_laneq_f16(_r6, vfmaq_laneq_f16(_r2, _r4, _coeffs, 2), _coeffs, 7);
                float16x8_t _tmp56b = vfmaq_laneq_f16(vfmaq_laneq_f16(vmulq_laneq_f16(_r1, _coeffs, 6), _r3, _coeffs, 4), _r5, _coeffs, 5);

                float16x8_t _tmp0 = vfmaq_laneq_f16(vsubq_f16(_r0, _r6), vsubq_f16(_r4, _r2), _coeffs, 0);
                float16x8_t _tmp1 = vaddq_f16(_tmp12a, _tmp12b);
                float16x8_t _tmp2 = vsubq_f16(_tmp12a, _tmp12b);
                float16x8_t _tmp3 = vaddq_f16(_tmp34a, _tmp34b);
                float16x8_t _tmp4 = vsubq_f16(_tmp34a, _tmp34b);
                float16x8_t _tmp5 = vaddq_f16(_tmp56a, _tmp56b);
                float16x8_t _tmp6 = vsubq_f16(_tmp56a, _tmp56b);
                float16x8_t _tmp7 = vfmaq_laneq_f16(vsubq_f16(_r7, _r1), vsubq_f16(_r3, _r5), _coeffs, 0);

                // stored transposed so the second pass can read tmp[m][0..7]
                vst1q_f16(tmp[0][m], _tmp0);
                vst1q_f16(tmp[1][m], _tmp1);
                vst1q_f16(tmp[2][m], _tmp2);
                vst1q_f16(tmp[3][m], _tmp3);
                vst1q_f16(tmp[4][m], _tmp4);
                vst1q_f16(tmp[5][m], _tmp5);
                vst1q_f16(tmp[6][m], _tmp6);
                vst1q_f16(tmp[7][m], _tmp7);

                // next input row
                r0 += w * elempack;
            }

            // destination: this channel group starts at kk * max_jj * 64;
            // coefficient c (0..63) of tile jj lives at c * max_jj * 8 + jj * 8
            __fp16* p0 = (__fp16*)B + kk * max_jj * 64 + jj * 8;
            __fp16* p1 = p0 + max_jj * 8;
            __fp16* p2 = p0 + max_jj * 8 * 2;
            __fp16* p3 = p0 + max_jj * 8 * 3;
            __fp16* p4 = p0 + max_jj * 8 * 4;
            __fp16* p5 = p0 + max_jj * 8 * 5;
            __fp16* p6 = p0 + max_jj * 8 * 6;
            __fp16* p7 = p0 + max_jj * 8 * 7;

            // second pass: same transform applied across the 8 patch rows
            for (int m = 0; m < 8; m++)
            {
                float16x8_t _r0 = vld1q_f16(tmp[m][0]);
                float16x8_t _r1 = vld1q_f16(tmp[m][1]);
                float16x8_t _r2 = vld1q_f16(tmp[m][2]);
                float16x8_t _r3 = vld1q_f16(tmp[m][3]);
                float16x8_t _r4 = vld1q_f16(tmp[m][4]);
                float16x8_t _r5 = vld1q_f16(tmp[m][5]);
                float16x8_t _r6 = vld1q_f16(tmp[m][6]);
                float16x8_t _r7 = vld1q_f16(tmp[m][7]);

                float16x8_t _tmp12a = vfmaq_laneq_f16(vaddq_f16(_r2, _r6), _r4, _coeffs, 1);
                float16x8_t _tmp12b = vfmaq_laneq_f16(vaddq_f16(_r1, _r5), _r3, _coeffs, 1);
                float16x8_t _tmp34a = vfmaq_laneq_f16(vfmaq_laneq_f16(_r6, _r2, _coeffs, 3), _r4, _coeffs, 2);
                float16x8_t _tmp34b = vfmaq_laneq_f16(vfmaq_laneq_f16(vmulq_laneq_f16(_r1, _coeffs, 5), _r3, _coeffs, 4), _r5, _coeffs, 6);
                float16x8_t _tmp56a = vfmaq_laneq_f16(_r6, vfmaq_laneq_f16(_r2, _r4, _coeffs, 2), _coeffs, 7);
                float16x8_t _tmp56b = vfmaq_laneq_f16(vfmaq_laneq_f16(vmulq_laneq_f16(_r1, _coeffs, 6), _r3, _coeffs, 4), _r5, _coeffs, 5);

                float16x8_t _tmp0 = vfmaq_laneq_f16(vsubq_f16(_r0, _r6), vsubq_f16(_r4, _r2), _coeffs, 0);
                float16x8_t _tmp1 = vaddq_f16(_tmp12a, _tmp12b);
                float16x8_t _tmp2 = vsubq_f16(_tmp12a, _tmp12b);
                float16x8_t _tmp3 = vaddq_f16(_tmp34a, _tmp34b);
                float16x8_t _tmp4 = vsubq_f16(_tmp34a, _tmp34b);
                float16x8_t _tmp5 = vaddq_f16(_tmp56a, _tmp56b);
                float16x8_t _tmp6 = vsubq_f16(_tmp56a, _tmp56b);
                float16x8_t _tmp7 = vfmaq_laneq_f16(vsubq_f16(_r7, _r1), vsubq_f16(_r3, _r5), _coeffs, 0);

                vst1q_f16(p0, _tmp0);
                vst1q_f16(p1, _tmp1);
                vst1q_f16(p2, _tmp2);
                vst1q_f16(p3, _tmp3);
                vst1q_f16(p4, _tmp4);
                vst1q_f16(p5, _tmp5);
                vst1q_f16(p6, _tmp6);
                vst1q_f16(p7, _tmp7);

                // skip the 8 coefficients just written
                p0 += max_jj * 8 * 8;
                p1 += max_jj * 8 * 8;
                p2 += max_jj * 8 * 8;
                p3 += max_jj * 8 * 8;
                p4 += max_jj * 8 * 8;
                p5 += max_jj * 8 * 8;
                p6 += max_jj * 8 * 8;
                p7 += max_jj * 8 * 8;
            }
        }
    }
    // ---- groups of 4 channels ----
    // fewer than 8 channels remain, so this loop runs at most once
    remain_max_kk_start += nn_max_kk * 8;
    nn_max_kk = (max_kk - remain_max_kk_start) / 4;
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 4;

        // tmp[c][m]: coefficient c of input row m, 4 channels per entry
#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        __fp16 tmp[8][8][4];

        // same coefficient table as the 8-channel path
        const __fp16 coeffs[8] = {5.25f, -4.25f, -1.25f, 0.25f, -2.5f, 0.5f, 2.f, 4.f};
        float16x8_t _coeffs = vld1q_f16(coeffs);

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const __fp16* r0 = bottom_blob.channel((k + kk) / elempack).row<const __fp16>(ti * 6) + (tj * 6) * elempack;

            // first pass over the 8 patch rows
            for (int m = 0; m < 8; m++)
            {
                float16x4_t _r0 = vdup_n_f16(0.f);
                float16x4_t _r1 = vdup_n_f16(0.f);
                float16x4_t _r2 = vdup_n_f16(0.f);
                float16x4_t _r3 = vdup_n_f16(0.f);
                float16x4_t _r4 = vdup_n_f16(0.f);
                float16x4_t _r5 = vdup_n_f16(0.f);
                float16x4_t _r6 = vdup_n_f16(0.f);
                float16x4_t _r7 = vdup_n_f16(0.f);

                if (ti * 6 + m < h)
                {
                    if (elempack == 4)
                    {
                        _r0 = vld1_f16(r0);
                        if (tj * 6 + 1 < w) _r1 = vld1_f16(r0 + 4);
                        if (tj * 6 + 2 < w) _r2 = vld1_f16(r0 + 8);
                        if (tj * 6 + 3 < w) _r3 = vld1_f16(r0 + 12);
                        if (tj * 6 + 4 < w) _r4 = vld1_f16(r0 + 16);
                        if (tj * 6 + 5 < w) _r5 = vld1_f16(r0 + 20);
                        if (tj * 6 + 6 < w) _r6 = vld1_f16(r0 + 24);
                        if (tj * 6 + 7 < w) _r7 = vld1_f16(r0 + 28);
                    }
                    if (elempack == 1)
                    {
                        // 4 separate channels: load 4 pixels from each and
                        // transpose to one vector per pixel
                        const __fp16* r1 = r0 + N;
                        const __fp16* r2 = r0 + N * 2;
                        const __fp16* r3 = r0 + N * 3;

                        float16x4_t _t0 = vld1_f16(r0);
                        float16x4_t _t1 = vld1_f16(r1);
                        float16x4_t _t2 = vld1_f16(r2);
                        float16x4_t _t3 = vld1_f16(r3);

                        transpose4x4_ph(_t0, _t1, _t2, _t3);

                        _r0 = _t0;
                        if (tj * 6 + 1 < w) _r1 = _t1;
                        if (tj * 6 + 2 < w) _r2 = _t2;
                        if (tj * 6 + 3 < w) _r3 = _t3;
                        if (tj * 6 + 4 < w)
                        {
                            _t0 = vld1_f16(r0 + 4);
                            _t1 = vld1_f16(r1 + 4);
                            _t2 = vld1_f16(r2 + 4);
                            _t3 = vld1_f16(r3 + 4);

                            transpose4x4_ph(_t0, _t1, _t2, _t3);

                            _r4 = _t0;
                            if (tj * 6 + 5 < w) _r5 = _t1;
                            if (tj * 6 + 6 < w) _r6 = _t2;
                            if (tj * 6 + 7 < w) _r7 = _t3;
                        }
                    }
                }

                float16x4_t _tmp12a = vfma_laneq_f16(vadd_f16(_r2, _r6), _r4, _coeffs, 1);
                float16x4_t _tmp12b = vfma_laneq_f16(vadd_f16(_r1, _r5), _r3, _coeffs, 1);
                float16x4_t _tmp34a = vfma_laneq_f16(vfma_laneq_f16(_r6, _r2, _coeffs, 3), _r4, _coeffs, 2);
                float16x4_t _tmp34b = vfma_laneq_f16(vfma_laneq_f16(vmul_laneq_f16(_r1, _coeffs, 5), _r3, _coeffs, 4), _r5, _coeffs, 6);
                float16x4_t _tmp56a = vfma_laneq_f16(_r6, vfma_laneq_f16(_r2, _r4, _coeffs, 2), _coeffs, 7);
                float16x4_t _tmp56b = vfma_laneq_f16(vfma_laneq_f16(vmul_laneq_f16(_r1, _coeffs, 6), _r3, _coeffs, 4), _r5, _coeffs, 5);

                float16x4_t _tmp0 = vfma_laneq_f16(vsub_f16(_r0, _r6), vsub_f16(_r4, _r2), _coeffs, 0);
                float16x4_t _tmp1 = vadd_f16(_tmp12a, _tmp12b);
                float16x4_t _tmp2 = vsub_f16(_tmp12a, _tmp12b);
                float16x4_t _tmp3 = vadd_f16(_tmp34a, _tmp34b);
                float16x4_t _tmp4 = vsub_f16(_tmp34a, _tmp34b);
                float16x4_t _tmp5 = vadd_f16(_tmp56a, _tmp56b);
                float16x4_t _tmp6 = vsub_f16(_tmp56a, _tmp56b);
                float16x4_t _tmp7 = vfma_laneq_f16(vsub_f16(_r7, _r1), vsub_f16(_r3, _r5), _coeffs, 0);

                vst1_f16(tmp[0][m], _tmp0);
                vst1_f16(tmp[1][m], _tmp1);
                vst1_f16(tmp[2][m], _tmp2);
                vst1_f16(tmp[3][m], _tmp3);
                vst1_f16(tmp[4][m], _tmp4);
                vst1_f16(tmp[5][m], _tmp5);
                vst1_f16(tmp[6][m], _tmp6);
                vst1_f16(tmp[7][m], _tmp7);

                r0 += w * elempack;
            }

            // coefficient c of tile jj lives at c * max_jj * 4 + jj * 4
            // inside this group's region
            __fp16* p0 = (__fp16*)B + kk * max_jj * 64 + jj * 4;
            __fp16* p1 = p0 + max_jj * 4;
            __fp16* p2 = p0 + max_jj * 4 * 2;
            __fp16* p3 = p0 + max_jj * 4 * 3;
            __fp16* p4 = p0 + max_jj * 4 * 4;
            __fp16* p5 = p0 + max_jj * 4 * 5;
            __fp16* p6 = p0 + max_jj * 4 * 6;
            __fp16* p7 = p0 + max_jj * 4 * 7;

            // second pass across the 8 patch rows
            for (int m = 0; m < 8; m++)
            {
                float16x4_t _r0 = vld1_f16(tmp[m][0]);
                float16x4_t _r1 = vld1_f16(tmp[m][1]);
                float16x4_t _r2 = vld1_f16(tmp[m][2]);
                float16x4_t _r3 = vld1_f16(tmp[m][3]);
                float16x4_t _r4 = vld1_f16(tmp[m][4]);
                float16x4_t _r5 = vld1_f16(tmp[m][5]);
                float16x4_t _r6 = vld1_f16(tmp[m][6]);
                float16x4_t _r7 = vld1_f16(tmp[m][7]);

                float16x4_t _tmp12a = vfma_laneq_f16(vadd_f16(_r2, _r6), _r4, _coeffs, 1);
                float16x4_t _tmp12b = vfma_laneq_f16(vadd_f16(_r1, _r5), _r3, _coeffs, 1);
                float16x4_t _tmp34a = vfma_laneq_f16(vfma_laneq_f16(_r6, _r2, _coeffs, 3), _r4, _coeffs, 2);
                float16x4_t _tmp34b = vfma_laneq_f16(vfma_laneq_f16(vmul_laneq_f16(_r1, _coeffs, 5), _r3, _coeffs, 4), _r5, _coeffs, 6);
                float16x4_t _tmp56a = vfma_laneq_f16(_r6, vfma_laneq_f16(_r2, _r4, _coeffs, 2), _coeffs, 7);
                float16x4_t _tmp56b = vfma_laneq_f16(vfma_laneq_f16(vmul_laneq_f16(_r1, _coeffs, 6), _r3, _coeffs, 4), _r5, _coeffs, 5);

                float16x4_t _tmp0 = vfma_laneq_f16(vsub_f16(_r0, _r6), vsub_f16(_r4, _r2), _coeffs, 0);
                float16x4_t _tmp1 = vadd_f16(_tmp12a, _tmp12b);
                float16x4_t _tmp2 = vsub_f16(_tmp12a, _tmp12b);
                float16x4_t _tmp3 = vadd_f16(_tmp34a, _tmp34b);
                float16x4_t _tmp4 = vsub_f16(_tmp34a, _tmp34b);
                float16x4_t _tmp5 = vadd_f16(_tmp56a, _tmp56b);
                float16x4_t _tmp6 = vsub_f16(_tmp56a, _tmp56b);
                float16x4_t _tmp7 = vfma_laneq_f16(vsub_f16(_r7, _r1), vsub_f16(_r3, _r5), _coeffs, 0);

                vst1_f16(p0, _tmp0);
                vst1_f16(p1, _tmp1);
                vst1_f16(p2, _tmp2);
                vst1_f16(p3, _tmp3);
                vst1_f16(p4, _tmp4);
                vst1_f16(p5, _tmp5);
                vst1_f16(p6, _tmp6);
                vst1_f16(p7, _tmp7);

                p0 += max_jj * 8 * 4;
                p1 += max_jj * 8 * 4;
                p2 += max_jj * 8 * 4;
                p3 += max_jj * 8 * 4;
                p4 += max_jj * 8 * 4;
                p5 += max_jj * 8 * 4;
                p6 += max_jj * 8 * 4;
                p7 += max_jj * 8 * 4;
            }
        }
    }
    // ---- groups of 2 channels (scalar) ----
    // fewer than 4 channels remain, so this loop runs at most once
    remain_max_kk_start += nn_max_kk * 4;
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 2;

        // tmp[c][m][ch]: coefficient c of input row m for channel ch (0 or 1)
#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        __fp16 tmp[8][8][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // indexed without elempack scaling, i.e. as elempack 1
            const __fp16* r0 = bottom_blob.channel(k + kk).row<const __fp16>(ti * 6) + (tj * 6);

            for (int m = 0; m < 8; m++)
            {
                // rXY: column X of the current row, channel Y of the pair
                __fp16 r00 = 0.f;
                __fp16 r01 = 0.f;
                __fp16 r10 = 0.f;
                __fp16 r11 = 0.f;
                __fp16 r20 = 0.f;
                __fp16 r21 = 0.f;
                __fp16 r30 = 0.f;
                __fp16 r31 = 0.f;
                __fp16 r40 = 0.f;
                __fp16 r41 = 0.f;
                __fp16 r50 = 0.f;
                __fp16 r51 = 0.f;
                __fp16 r60 = 0.f;
                __fp16 r61 = 0.f;
                __fp16 r70 = 0.f;
                __fp16 r71 = 0.f;

                if (ti * 6 + m < h)
                {
                    // if (elempack == 1)
                    {
                        const __fp16* r1 = r0 + N;

                        r00 = r0[0];
                        r01 = r1[0];
                        if (tj * 6 + 1 < w)
                        {
                            r10 = r0[1];
                            r11 = r1[1];
                        }
                        if (tj * 6 + 2 < w)
                        {
                            r20 = r0[2];
                            r21 = r1[2];
                        }
                        if (tj * 6 + 3 < w)
                        {
                            r30 = r0[3];
                            r31 = r1[3];
                        }
                        if (tj * 6 + 4 < w)
                        {
                            r40 = r0[4];
                            r41 = r1[4];
                        }
                        if (tj * 6 + 5 < w)
                        {
                            r50 = r0[5];
                            r51 = r1[5];
                        }
                        if (tj * 6 + 6 < w)
                        {
                            r60 = r0[6];
                            r61 = r1[6];
                        }
                        if (tj * 6 + 7 < w)
                        {
                            r70 = r0[7];
                            r71 = r1[7];
                        }
                    }
                }

                // scalar form of the same partial sums used by the NEON paths
                __fp16 tmp12a0 = r20 + r60 - r40 * (__fp16)4.25f;
                __fp16 tmp12a1 = r21 + r61 - r41 * (__fp16)4.25f;
                __fp16 tmp12b0 = r10 + r50 - r30 * (__fp16)4.25f;
                __fp16 tmp12b1 = r11 + r51 - r31 * (__fp16)4.25f;
                __fp16 tmp34a0 = r60 + r20 * (__fp16)0.25f - r40 * (__fp16)1.25f;
                __fp16 tmp34a1 = r61 + r21 * (__fp16)0.25f - r41 * (__fp16)1.25f;
                __fp16 tmp34b0 = r10 * (__fp16)0.5f - r30 * (__fp16)2.5f + r50 * (__fp16)2.f;
                __fp16 tmp34b1 = r11 * (__fp16)0.5f - r31 * (__fp16)2.5f + r51 * (__fp16)2.f;
                __fp16 tmp56a0 = r20 * (__fp16)4.f - r40 * (__fp16)5.f + r60;
                __fp16 tmp56a1 = r21 * (__fp16)4.f - r41 * (__fp16)5.f + r61;
                __fp16 tmp56b0 = r10 * (__fp16)2.f - r30 * (__fp16)2.5f + r50 * (__fp16)0.5f;
                __fp16 tmp56b1 = r11 * (__fp16)2.f - r31 * (__fp16)2.5f + r51 * (__fp16)0.5f;

                tmp[0][m][0] = r00 - r60 + (r40 - r20) * (__fp16)5.25f;
                tmp[0][m][1] = r01 - r61 + (r41 - r21) * (__fp16)5.25f;
                tmp[1][m][0] = tmp12a0 + tmp12b0;
                tmp[1][m][1] = tmp12a1 + tmp12b1;
                tmp[2][m][0] = tmp12a0 - tmp12b0;
                tmp[2][m][1] = tmp12a1 - tmp12b1;
                tmp[3][m][0] = tmp34a0 + tmp34b0;
                tmp[3][m][1] = tmp34a1 + tmp34b1;
                tmp[4][m][0] = tmp34a0 - tmp34b0;
                tmp[4][m][1] = tmp34a1 - tmp34b1;
                tmp[5][m][0] = tmp56a0 + tmp56b0;
                tmp[5][m][1] = tmp56a1 + tmp56b1;
                tmp[6][m][0] = tmp56a0 - tmp56b0;
                tmp[6][m][1] = tmp56a1 - tmp56b1;
                tmp[7][m][0] = r70 - r10 + (r30 - r50) * (__fp16)5.25f;
                tmp[7][m][1] = r71 - r11 + (r31 - r51) * (__fp16)5.25f;

                r0 += w;
            }

            // coefficient c of tile jj lives at c * max_jj * 2 + jj * 2
            // inside this group's region
            __fp16* p0 = (__fp16*)B + kk * max_jj * 64 + jj * 2;
            __fp16* p1 = p0 + max_jj * 2;
            __fp16* p2 = p0 + max_jj * 2 * 2;
            __fp16* p3 = p0 + max_jj * 2 * 3;
            __fp16* p4 = p0 + max_jj * 2 * 4;
            __fp16* p5 = p0 + max_jj * 2 * 5;
            __fp16* p6 = p0 + max_jj * 2 * 6;
            __fp16* p7 = p0 + max_jj * 2 * 7;

            // second pass across the 8 patch rows
            for (int m = 0; m < 8; m++)
            {
                __fp16 r00 = tmp[m][0][0];
                __fp16 r01 = tmp[m][0][1];
                __fp16 r10 = tmp[m][1][0];
                __fp16 r11 = tmp[m][1][1];
                __fp16 r20 = tmp[m][2][0];
                __fp16 r21 = tmp[m][2][1];
                __fp16 r30 = tmp[m][3][0];
                __fp16 r31 = tmp[m][3][1];
                __fp16 r40 = tmp[m][4][0];
                __fp16 r41 = tmp[m][4][1];
                __fp16 r50 = tmp[m][5][0];
                __fp16 r51 = tmp[m][5][1];
                __fp16 r60 = tmp[m][6][0];
                __fp16 r61 = tmp[m][6][1];
                __fp16 r70 = tmp[m][7][0];
                __fp16 r71 = tmp[m][7][1];

                __fp16 tmp12a0 = r20 + r60 - r40 * (__fp16)4.25f;
                __fp16 tmp12a1 = r21 + r61 - r41 * (__fp16)4.25f;
                __fp16 tmp12b0 = r10 + r50 - r30 * (__fp16)4.25f;
                __fp16 tmp12b1 = r11 + r51 - r31 * (__fp16)4.25f;
                __fp16 tmp34a0 = r60 + r20 * (__fp16)0.25f - r40 * (__fp16)1.25f;
                __fp16 tmp34a1 = r61 + r21 * (__fp16)0.25f - r41 * (__fp16)1.25f;
                __fp16 tmp34b0 = r10 * (__fp16)0.5f - r30 * (__fp16)2.5f + r50 * (__fp16)2.f;
                __fp16 tmp34b1 = r11 * (__fp16)0.5f - r31 * (__fp16)2.5f + r51 * (__fp16)2.f;
                __fp16 tmp56a0 = r20 * (__fp16)4.f - r40 * (__fp16)5.f + r60;
                __fp16 tmp56a1 = r21 * (__fp16)4.f - r41 * (__fp16)5.f + r61;
                __fp16 tmp56b0 = r10 * (__fp16)2.f - r30 * (__fp16)2.5f + r50 * (__fp16)0.5f;
                __fp16 tmp56b1 = r11 * (__fp16)2.f - r31 * (__fp16)2.5f + r51 * (__fp16)0.5f;

                p0[0] = r00 - r60 + (r40 - r20) * (__fp16)5.25f;
                p0[1] = r01 - r61 + (r41 - r21) * (__fp16)5.25f;
                p1[0] = tmp12a0 + tmp12b0;
                p1[1] = tmp12a1 + tmp12b1;
                p2[0] = tmp12a0 - tmp12b0;
                p2[1] = tmp12a1 - tmp12b1;
                p3[0] = tmp34a0 + tmp34b0;
                p3[1] = tmp34a1 + tmp34b1;
                p4[0] = tmp34a0 - tmp34b0;
                p4[1] = tmp34a1 - tmp34b1;
                p5[0] = tmp56a0 + tmp56b0;
                p5[1] = tmp56a1 + tmp56b1;
                p6[0] = tmp56a0 - tmp56b0;
                p6[1] = tmp56a1 - tmp56b1;
                p7[0] = r70 - r10 + (r30 - r50) * (__fp16)5.25f;
                p7[1] = r71 - r11 + (r31 - r51) * (__fp16)5.25f;

                p0 += max_jj * 8 * 2;
                p1 += max_jj * 8 * 2;
                p2 += max_jj * 8 * 2;
                p3 += max_jj * 8 * 2;
                p4 += max_jj * 8 * 2;
                p5 += max_jj * 8 * 2;
                p6 += max_jj * 8 * 2;
                p7 += max_jj * 8 * 2;
            }
        }
    }
    // ---- remaining single channels (scalar) ----
    remain_max_kk_start += nn_max_kk * 2;
    for (int kk = remain_max_kk_start; kk < max_kk; kk++)
    {
        // tmp[c][m]: coefficient c of input row m
        __fp16 tmp[8][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // despite the name, this points at the current row of the patch
            // and is advanced by one row (w elements) per iteration of m
            const __fp16* r0123 = bottom_blob.channel(k + kk).row<const __fp16>(ti * 6) + (tj * 6);

            for (int m = 0; m < 8; m++)
            {
                __fp16 r0 = 0.f;
                __fp16 r1 = 0.f;
                __fp16 r2 = 0.f;
                __fp16 r3 = 0.f;
                __fp16 r4 = 0.f;
                __fp16 r5 = 0.f;
                __fp16 r6 = 0.f;
                __fp16 r7 = 0.f;

                if (ti * 6 + m < h)
                {
                    // if (elempack == 1)
                    {
                        r0 = r0123[0];
                        if (tj * 6 + 1 < w) r1 = r0123[1];
                        if (tj * 6 + 2 < w) r2 = r0123[2];
                        if (tj * 6 + 3 < w) r3 = r0123[3];
                        if (tj * 6 + 4 < w) r4 = r0123[4];
                        if (tj * 6 + 5 < w) r5 = r0123[5];
                        if (tj * 6 + 6 < w) r6 = r0123[6];
                        if (tj * 6 + 7 < w) r7 = r0123[7];
                    }
                }

                __fp16 tmp12a = r2 + r6 - r4 * (__fp16)4.25f;
                __fp16 tmp12b = r1 + r5 - r3 * (__fp16)4.25f;
                __fp16 tmp34a = r6 + r2 * (__fp16)0.25f - r4 * (__fp16)1.25f;
                __fp16 tmp34b = r1 * (__fp16)0.5f - r3 * (__fp16)2.5f + r5 * (__fp16)2.f;
                __fp16 tmp56a = r2 * (__fp16)4.f - r4 * (__fp16)5.f + r6;
                __fp16 tmp56b = r1 * (__fp16)2.f - r3 * (__fp16)2.5f + r5 * (__fp16)0.5f;

                tmp[0][m] = r0 - r6 + (r4 - r2) * (__fp16)5.25f;
                tmp[1][m] = tmp12a + tmp12b;
                tmp[2][m] = tmp12a - tmp12b;
                tmp[3][m] = tmp34a + tmp34b;
                tmp[4][m] = tmp34a - tmp34b;
                tmp[5][m] = tmp56a + tmp56b;
                tmp[6][m] = tmp56a - tmp56b;
                tmp[7][m] = r7 - r1 + (r3 - r5) * (__fp16)5.25f;

                r0123 += w;
            }

            // coefficient c of tile jj lives at c * max_jj + jj inside this
            // channel's region
            __fp16* p0 = (__fp16*)B + kk * max_jj * 64 + jj;
            __fp16* p1 = p0 + max_jj;
            __fp16* p2 = p0 + max_jj * 2;
            __fp16* p3 = p0 + max_jj * 3;
            __fp16* p4 = p0 + max_jj * 4;
            __fp16* p5 = p0 + max_jj * 5;
            __fp16* p6 = p0 + max_jj * 6;
            __fp16* p7 = p0 + max_jj * 7;

            // second pass across the 8 patch rows
            for (int m = 0; m < 8; m++)
            {
                __fp16 r0 = tmp[m][0];
                __fp16 r1 = tmp[m][1];
                __fp16 r2 = tmp[m][2];
                __fp16 r3 = tmp[m][3];
                __fp16 r4 = tmp[m][4];
                __fp16 r5 = tmp[m][5];
                __fp16 r6 = tmp[m][6];
                __fp16 r7 = tmp[m][7];

                __fp16 tmp12a = r2 + r6 - r4 * (__fp16)4.25f;
                __fp16 tmp12b = r1 + r5 - r3 * (__fp16)4.25f;
                __fp16 tmp34a = r6 + r2 * (__fp16)0.25f - r4 * (__fp16)1.25f;
                __fp16 tmp34b = r1 * (__fp16)0.5f - r3 * (__fp16)2.5f + r5 * (__fp16)2.f;
                __fp16 tmp56a = r2 * (__fp16)4.f - r4 * (__fp16)5.f + r6;
                __fp16 tmp56b = r1 * (__fp16)2.f - r3 * (__fp16)2.5f + r5 * (__fp16)0.5f;

                p0[0] = r0 - r6 + (r4 - r2) * (__fp16)5.25f;
                p1[0] = tmp12a + tmp12b;
                p2[0] = tmp12a - tmp12b;
                p3[0] = tmp34a + tmp34b;
                p4[0] = tmp34a - tmp34b;
                p5[0] = tmp56a + tmp56b;
                p6[0] = tmp56a - tmp56b;
                p7[0] = r7 - r1 + (r3 - r5) * (__fp16)5.25f;

                p0 += max_jj * 8;
                p1 += max_jj * 8;
                p2 += max_jj * 8;
                p3 += max_jj * 8;
                p4 += max_jj * 8;
                p5 += max_jj * 8;
                p6 += max_jj * 8;
                p7 += max_jj * 8;
            }
        }
    }
}

// Winograd F(6,3) output transform for one tile of the gemm result.
//
// For every channel in [i, i + max_ii) and every tile in [j, j + max_jj) the
// 8x8 block of coefficients stored in top_tile is reduced to a 6x6 block of
// output pixels by applying the otm matrix (see the table below) along both
// axes, the per-channel bias is added, and the result is written to top_blob.
// Pixels that fall outside outw x outh (right and bottom edges) are dropped.
// All arithmetic is carried out in __fp16.
//
// top_tile   coefficients; for a group of P channels (P = 8, 4, 2 or 1) the
//            value for coefficient c, tile jj, lane l is read from
//            ii * max_jj * 64 + c * max_jj * P + jj * P + l
// top_blob   destination; its elempack (8, 4 or 1) selects the store pattern
// bias       per-channel bias; a null data pointer adds zero instead
// i, max_ii  first output channel and channel count of this tile
// j, max_jj  first linear tile index and tile count of this tile
static inline void conv3x3s1_winograd63_transform_output_tile_fp16sa(const Mat& top_tile, Mat& top_blob, const Mat& bias, int i, int max_ii, int j, int max_jj)
{
    // const float otm[6][8] = {
    //     {1.0f, 1.0f,  1.0f,  1.0f,  1.0f, 32.0f, 32.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f,  2.0f, -2.0f, 16.0f,-16.0f, 0.0f},
    //     {0.0f, 1.0f,  1.0f,  4.0f,  4.0f,  8.0f,  8.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f,  8.0f, -8.0f,  4.0f, -4.0f, 0.0f},
    //     {0.0f, 1.0f,  1.0f, 16.0f, 16.0f,  2.0f,  2.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f, 32.0f,-32.0f,  1.0f, -1.0f, 1.0f}
    // };

    // multipliers addressed by lane in the fma calls below:
    // lane 0 = 32, 1 = 16, 2 = 8, 3 = 4, 4 = 2; lanes 5..7 are padding and never selected
    const __fp16 coeffs[8] = {32.f, 16.f, 8.f, 4.f, 2.f, 0.f, 0.f, 0.f};
    float16x8_t _coeffs = vld1q_f16(coeffs);

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int out_elempack = top_blob.elempack;
    // offset in __fp16 elements added to outptr0 to reach the next channel (group) of top_blob
    const int N = top_blob.cstep * out_elempack;

    // number of 6-wide tiles per output row, rounded up
    const int w_tiles = (outw + 5) / 6;

    const __fp16* biasptr = bias;

    // channels are consumed in groups of 8, then 4, then 2, then one at a time
    int ii = 0;
    for (; ii + 7 < max_ii; ii += 8)
    {
        float16x8_t _bias0 = biasptr ? vld1q_f16(biasptr + i + ii) : vdupq_n_f16(0.f);

        // intermediate after the first pass: tmp[a][m] holds the 8 channel lanes
        // of otm row a applied to coefficient group m
#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        __fp16 tmp[6][8][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            // tile row / column recovered from the linear tile index
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // r0..r7 point at 8 consecutive coefficients of this tile, 8 channel lanes each
            const __fp16* r0 = (const __fp16*)top_tile + ii * max_jj * 64 + jj * 8;
            const __fp16* r1 = r0 + max_jj * 8;
            const __fp16* r2 = r0 + max_jj * 8 * 2;
            const __fp16* r3 = r0 + max_jj * 8 * 3;
            const __fp16* r4 = r0 + max_jj * 8 * 4;
            const __fp16* r5 = r0 + max_jj * 8 * 5;
            const __fp16* r6 = r0 + max_jj * 8 * 6;
            const __fp16* r7 = r0 + max_jj * 8 * 7;

            // first pass: reduce each group of 8 coefficients to 6 values
            for (int m = 0; m < 8; m++)
            {
                float16x8_t _r0 = vld1q_f16(r0);
                float16x8_t _r1 = vld1q_f16(r1);
                float16x8_t _r2 = vld1q_f16(r2);
                float16x8_t _r3 = vld1q_f16(r3);
                float16x8_t _r4 = vld1q_f16(r4);
                float16x8_t _r5 = vld1q_f16(r5);
                float16x8_t _r6 = vld1q_f16(r6);
                float16x8_t _r7 = vld1q_f16(r7);

                // pairwise sums feed the even otm rows (0, 2, 4), pairwise differences the odd rows (1, 3, 5)
                float16x8_t _tmp024a = vaddq_f16(_r1, _r2);
                float16x8_t _tmp135a = vsubq_f16(_r1, _r2);
                float16x8_t _tmp024b = vaddq_f16(_r3, _r4);
                float16x8_t _tmp135b = vsubq_f16(_r3, _r4);
                float16x8_t _tmp024c = vaddq_f16(_r5, _r6);
                float16x8_t _tmp135c = vsubq_f16(_r5, _r6);

                // row 0: r0 + a + b + 32c      row 1: a + 2b + 16c
                // row 2: a + 4b + 8c           row 3: a + 8b + 4c
                // row 4: a + 16b + 2c          row 5: r7 + a + 32b + c
                float16x8_t _tmp0 = vaddq_f16(vaddq_f16(_r0, _tmp024a), vfmaq_laneq_f16(_tmp024b, _tmp024c, _coeffs, 0));
                float16x8_t _tmp1 = vfmaq_laneq_f16(vfmaq_laneq_f16(_tmp135a, _tmp135b, _coeffs, 4), _tmp135c, _coeffs, 1);
                float16x8_t _tmp2 = vfmaq_laneq_f16(vfmaq_laneq_f16(_tmp024a, _tmp024b, _coeffs, 3), _tmp024c, _coeffs, 2);
                float16x8_t _tmp3 = vfmaq_laneq_f16(vfmaq_laneq_f16(_tmp135a, _tmp135b, _coeffs, 2), _tmp135c, _coeffs, 3);
                float16x8_t _tmp4 = vfmaq_laneq_f16(vfmaq_laneq_f16(_tmp024a, _tmp024b, _coeffs, 1), _tmp024c, _coeffs, 4);
                float16x8_t _tmp5 = vaddq_f16(vaddq_f16(_r7, _tmp135a), vfmaq_laneq_f16(_tmp135c, _tmp135b, _coeffs, 0));

                vst1q_f16(tmp[0][m], _tmp0);
                vst1q_f16(tmp[1][m], _tmp1);
                vst1q_f16(tmp[2][m], _tmp2);
                vst1q_f16(tmp[3][m], _tmp3);
                vst1q_f16(tmp[4][m], _tmp4);
                vst1q_f16(tmp[5][m], _tmp5);

                // step over the 8 coefficients just consumed
                r0 += max_jj * 8 * 8;
                r1 += max_jj * 8 * 8;
                r2 += max_jj * 8 * 8;
                r3 += max_jj * 8 * 8;
                r4 += max_jj * 8 * 8;
                r5 += max_jj * 8 * 8;
                r6 += max_jj * 8 * 8;
                r7 += max_jj * 8 * 8;
            }

            // top-left pixel of this tile in the destination channel (group)
            __fp16* outptr0 = top_blob.channel((i + ii) / out_elempack).row<__fp16>(ti * 6) + (tj * 6) * out_elempack;

            // second pass: same reduction along the other axis, plus bias, one output row per m
            for (int m = 0; m < 6; m++)
            {
                // rows past the bottom edge are dropped; every later m is skipped as well,
                // so not advancing outptr0 here is harmless
                if (ti * 6 + m >= outh)
                    continue;

                float16x8_t _r0 = vld1q_f16(tmp[m][0]);
                float16x8_t _r1 = vld1q_f16(tmp[m][1]);
                float16x8_t _r2 = vld1q_f16(tmp[m][2]);
                float16x8_t _r3 = vld1q_f16(tmp[m][3]);
                float16x8_t _r4 = vld1q_f16(tmp[m][4]);
                float16x8_t _r5 = vld1q_f16(tmp[m][5]);
                float16x8_t _r6 = vld1q_f16(tmp[m][6]);
                float16x8_t _r7 = vld1q_f16(tmp[m][7]);

                float16x8_t _tmp024a = vaddq_f16(_r1, _r2);
                float16x8_t _tmp135a = vsubq_f16(_r1, _r2);
                float16x8_t _tmp024b = vaddq_f16(_r3, _r4);
                float16x8_t _tmp135b = vsubq_f16(_r3, _r4);
                float16x8_t _tmp024c = vaddq_f16(_r5, _r6);
                float16x8_t _tmp135c = vsubq_f16(_r5, _r6);

                float16x8_t _tmp0 = vaddq_f16(_bias0, vaddq_f16(vaddq_f16(_r0, _tmp024a), vfmaq_laneq_f16(_tmp024b, _tmp024c, _coeffs, 0)));
                float16x8_t _tmp1 = vaddq_f16(_bias0, vfmaq_laneq_f16(vfmaq_laneq_f16(_tmp135a, _tmp135b, _coeffs, 4), _tmp135c, _coeffs, 1));
                float16x8_t _tmp2 = vaddq_f16(_bias0, vfmaq_laneq_f16(vfmaq_laneq_f16(_tmp024a, _tmp024b, _coeffs, 3), _tmp024c, _coeffs, 2));
                float16x8_t _tmp3 = vaddq_f16(_bias0, vfmaq_laneq_f16(vfmaq_laneq_f16(_tmp135a, _tmp135b, _coeffs, 2), _tmp135c, _coeffs, 3));
                float16x8_t _tmp4 = vaddq_f16(_bias0, vfmaq_laneq_f16(vfmaq_laneq_f16(_tmp024a, _tmp024b, _coeffs, 1), _tmp024c, _coeffs, 4));
                float16x8_t _tmp5 = vaddq_f16(_bias0, vaddq_f16(vaddq_f16(_r7, _tmp135a), vfmaq_laneq_f16(_tmp135c, _tmp135b, _coeffs, 0)));

                // column 0 of a tile is stored unconditionally (tj < w_tiles implies tj * 6 < outw);
                // columns 1..5 are guarded against the right edge
                if (out_elempack == 8)
                {
                    // 8 lanes go to one packed pixel
                    vst1q_f16(outptr0, _tmp0);
                    if (tj * 6 + 1 < outw) vst1q_f16(outptr0 + 8, _tmp1);
                    if (tj * 6 + 2 < outw) vst1q_f16(outptr0 + 16, _tmp2);
                    if (tj * 6 + 3 < outw) vst1q_f16(outptr0 + 24, _tmp3);
                    if (tj * 6 + 4 < outw) vst1q_f16(outptr0 + 32, _tmp4);
                    if (tj * 6 + 5 < outw) vst1q_f16(outptr0 + 40, _tmp5);
                }
                if (out_elempack == 4)
                {
                    // low 4 lanes go to this channel group, high 4 lanes to the next one
                    __fp16* outptr1 = outptr0 + N;

                    vst1_f16(outptr0, vget_low_f16(_tmp0));
                    vst1_f16(outptr1, vget_high_f16(_tmp0));
                    if (tj * 6 + 1 < outw)
                    {
                        vst1_f16(outptr0 + 4, vget_low_f16(_tmp1));
                        vst1_f16(outptr1 + 4, vget_high_f16(_tmp1));
                    }
                    if (tj * 6 + 2 < outw)
                    {
                        vst1_f16(outptr0 + 8, vget_low_f16(_tmp2));
                        vst1_f16(outptr1 + 8, vget_high_f16(_tmp2));
                    }
                    if (tj * 6 + 3 < outw)
                    {
                        vst1_f16(outptr0 + 12, vget_low_f16(_tmp3));
                        vst1_f16(outptr1 + 12, vget_high_f16(_tmp3));
                    }
                    if (tj * 6 + 4 < outw)
                    {
                        vst1_f16(outptr0 + 16, vget_low_f16(_tmp4));
                        vst1_f16(outptr1 + 16, vget_high_f16(_tmp4));
                    }
                    if (tj * 6 + 5 < outw)
                    {
                        vst1_f16(outptr0 + 20, vget_low_f16(_tmp5));
                        vst1_f16(outptr1 + 20, vget_high_f16(_tmp5));
                    }
                }
                if (out_elempack == 1)
                {
                    // spill the vectors so each lane can be scattered to its own channel
                    __fp16 tmp0[8];
                    __fp16 tmp1[8];
                    __fp16 tmp2[8];
                    __fp16 tmp3[8];
                    __fp16 tmp4[8];
                    __fp16 tmp5[8];
                    vst1q_f16(tmp0, _tmp0);
                    vst1q_f16(tmp1, _tmp1);
                    vst1q_f16(tmp2, _tmp2);
                    vst1q_f16(tmp3, _tmp3);
                    vst1q_f16(tmp4, _tmp4);
                    vst1q_f16(tmp5, _tmp5);

                    __fp16* outptr1 = outptr0 + N;
                    __fp16* outptr2 = outptr0 + N * 2;
                    __fp16* outptr3 = outptr0 + N * 3;
                    __fp16* outptr4 = outptr0 + N * 4;
                    __fp16* outptr5 = outptr0 + N * 5;
                    __fp16* outptr6 = outptr0 + N * 6;
                    __fp16* outptr7 = outptr0 + N * 7;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];
                    outptr4[0] = tmp0[4];
                    outptr5[0] = tmp0[5];
                    outptr6[0] = tmp0[6];
                    outptr7[0] = tmp0[7];
                    if (tj * 6 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                        outptr4[1] = tmp1[4];
                        outptr5[1] = tmp1[5];
                        outptr6[1] = tmp1[6];
                        outptr7[1] = tmp1[7];
                    }
                    if (tj * 6 + 2 < outw)
                    {
                        outptr0[2] = tmp2[0];
                        outptr1[2] = tmp2[1];
                        outptr2[2] = tmp2[2];
                        outptr3[2] = tmp2[3];
                        outptr4[2] = tmp2[4];
                        outptr5[2] = tmp2[5];
                        outptr6[2] = tmp2[6];
                        outptr7[2] = tmp2[7];
                    }
                    if (tj * 6 + 3 < outw)
                    {
                        outptr0[3] = tmp3[0];
                        outptr1[3] = tmp3[1];
                        outptr2[3] = tmp3[2];
                        outptr3[3] = tmp3[3];
                        outptr4[3] = tmp3[4];
                        outptr5[3] = tmp3[5];
                        outptr6[3] = tmp3[6];
                        outptr7[3] = tmp3[7];
                    }
                    if (tj * 6 + 4 < outw)
                    {
                        outptr0[4] = tmp4[0];
                        outptr1[4] = tmp4[1];
                        outptr2[4] = tmp4[2];
                        outptr3[4] = tmp4[3];
                        outptr4[4] = tmp4[4];
                        outptr5[4] = tmp4[5];
                        outptr6[4] = tmp4[6];
                        outptr7[4] = tmp4[7];
                    }
                    if (tj * 6 + 5 < outw)
                    {
                        outptr0[5] = tmp5[0];
                        outptr1[5] = tmp5[1];
                        outptr2[5] = tmp5[2];
                        outptr3[5] = tmp5[3];
                        outptr4[5] = tmp5[4];
                        outptr5[5] = tmp5[5];
                        outptr6[5] = tmp5[6];
                        outptr7[5] = tmp5[7];
                    }
                }

                // next output row
                outptr0 += outw * out_elempack;
            }
        }
    }
    // groups of 4 channels: same two-pass scheme with 64-bit vectors
    for (; ii + 3 < max_ii; ii += 4)
    {
        float16x4_t _bias0 = biasptr ? vld1_f16(biasptr + i + ii) : vdup_n_f16(0.f);

#ifdef _MSC_VER
        __declspec(align(16))
#else
        __attribute__((aligned(16)))
#endif
        __fp16 tmp[6][8][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // r0..r7 point at 8 consecutive coefficients of this tile, 4 channel lanes each
            const __fp16* r0 = (const __fp16*)top_tile + ii * max_jj * 64 + jj * 4;
            const __fp16* r1 = r0 + max_jj * 4;
            const __fp16* r2 = r0 + max_jj * 4 * 2;
            const __fp16* r3 = r0 + max_jj * 4 * 3;
            const __fp16* r4 = r0 + max_jj * 4 * 4;
            const __fp16* r5 = r0 + max_jj * 4 * 5;
            const __fp16* r6 = r0 + max_jj * 4 * 6;
            const __fp16* r7 = r0 + max_jj * 4 * 7;

            // first pass: reduce each group of 8 coefficients to 6 values
            for (int m = 0; m < 8; m++)
            {
                float16x4_t _r0 = vld1_f16(r0);
                float16x4_t _r1 = vld1_f16(r1);
                float16x4_t _r2 = vld1_f16(r2);
                float16x4_t _r3 = vld1_f16(r3);
                float16x4_t _r4 = vld1_f16(r4);
                float16x4_t _r5 = vld1_f16(r5);
                float16x4_t _r6 = vld1_f16(r6);
                float16x4_t _r7 = vld1_f16(r7);

                float16x4_t _tmp024a = vadd_f16(_r1, _r2);
                float16x4_t _tmp135a = vsub_f16(_r1, _r2);
                float16x4_t _tmp024b = vadd_f16(_r3, _r4);
                float16x4_t _tmp135b = vsub_f16(_r3, _r4);
                float16x4_t _tmp024c = vadd_f16(_r5, _r6);
                float16x4_t _tmp135c = vsub_f16(_r5, _r6);

                float16x4_t _tmp0 = vadd_f16(vadd_f16(_r0, _tmp024a), vfma_laneq_f16(_tmp024b, _tmp024c, _coeffs, 0));
                float16x4_t _tmp1 = vfma_laneq_f16(vfma_laneq_f16(_tmp135a, _tmp135b, _coeffs, 4), _tmp135c, _coeffs, 1);
                float16x4_t _tmp2 = vfma_laneq_f16(vfma_laneq_f16(_tmp024a, _tmp024b, _coeffs, 3), _tmp024c, _coeffs, 2);
                float16x4_t _tmp3 = vfma_laneq_f16(vfma_laneq_f16(_tmp135a, _tmp135b, _coeffs, 2), _tmp135c, _coeffs, 3);
                float16x4_t _tmp4 = vfma_laneq_f16(vfma_laneq_f16(_tmp024a, _tmp024b, _coeffs, 1), _tmp024c, _coeffs, 4);
                float16x4_t _tmp5 = vadd_f16(vadd_f16(_r7, _tmp135a), vfma_laneq_f16(_tmp135c, _tmp135b, _coeffs, 0));

                vst1_f16(tmp[0][m], _tmp0);
                vst1_f16(tmp[1][m], _tmp1);
                vst1_f16(tmp[2][m], _tmp2);
                vst1_f16(tmp[3][m], _tmp3);
                vst1_f16(tmp[4][m], _tmp4);
                vst1_f16(tmp[5][m], _tmp5);

                // step over the 8 coefficients just consumed
                r0 += max_jj * 8 * 4;
                r1 += max_jj * 8 * 4;
                r2 += max_jj * 8 * 4;
                r3 += max_jj * 8 * 4;
                r4 += max_jj * 8 * 4;
                r5 += max_jj * 8 * 4;
                r6 += max_jj * 8 * 4;
                r7 += max_jj * 8 * 4;
            }

            __fp16* outptr0 = top_blob.channel((i + ii) / out_elempack).row<__fp16>(ti * 6) + (tj * 6) * out_elempack;

            // second pass: reduce along the other axis, add bias, store one output row per m
            for (int m = 0; m < 6; m++)
            {
                // rows past the bottom edge are dropped
                if (ti * 6 + m >= outh)
                    continue;

                float16x4_t _r0 = vld1_f16(tmp[m][0]);
                float16x4_t _r1 = vld1_f16(tmp[m][1]);
                float16x4_t _r2 = vld1_f16(tmp[m][2]);
                float16x4_t _r3 = vld1_f16(tmp[m][3]);
                float16x4_t _r4 = vld1_f16(tmp[m][4]);
                float16x4_t _r5 = vld1_f16(tmp[m][5]);
                float16x4_t _r6 = vld1_f16(tmp[m][6]);
                float16x4_t _r7 = vld1_f16(tmp[m][7]);

                float16x4_t _tmp024a = vadd_f16(_r1, _r2);
                float16x4_t _tmp135a = vsub_f16(_r1, _r2);
                float16x4_t _tmp024b = vadd_f16(_r3, _r4);
                float16x4_t _tmp135b = vsub_f16(_r3, _r4);
                float16x4_t _tmp024c = vadd_f16(_r5, _r6);
                float16x4_t _tmp135c = vsub_f16(_r5, _r6);

                float16x4_t _tmp0 = vadd_f16(_bias0, vadd_f16(vadd_f16(_r0, _tmp024a), vfma_laneq_f16(_tmp024b, _tmp024c, _coeffs, 0)));
                float16x4_t _tmp1 = vadd_f16(_bias0, vfma_laneq_f16(vfma_laneq_f16(_tmp135a, _tmp135b, _coeffs, 4), _tmp135c, _coeffs, 1));
                float16x4_t _tmp2 = vadd_f16(_bias0, vfma_laneq_f16(vfma_laneq_f16(_tmp024a, _tmp024b, _coeffs, 3), _tmp024c, _coeffs, 2));
                float16x4_t _tmp3 = vadd_f16(_bias0, vfma_laneq_f16(vfma_laneq_f16(_tmp135a, _tmp135b, _coeffs, 2), _tmp135c, _coeffs, 3));
                float16x4_t _tmp4 = vadd_f16(_bias0, vfma_laneq_f16(vfma_laneq_f16(_tmp024a, _tmp024b, _coeffs, 1), _tmp024c, _coeffs, 4));
                float16x4_t _tmp5 = vadd_f16(_bias0, vadd_f16(vadd_f16(_r7, _tmp135a), vfma_laneq_f16(_tmp135c, _tmp135b, _coeffs, 0)));

                if (out_elempack == 4)
                {
                    // 4 lanes go to one packed pixel; columns 1..5 are guarded against the right edge
                    vst1_f16(outptr0, _tmp0);
                    if (tj * 6 + 1 < outw) vst1_f16(outptr0 + 4, _tmp1);
                    if (tj * 6 + 2 < outw) vst1_f16(outptr0 + 8, _tmp2);
                    if (tj * 6 + 3 < outw) vst1_f16(outptr0 + 12, _tmp3);
                    if (tj * 6 + 4 < outw) vst1_f16(outptr0 + 16, _tmp4);
                    if (tj * 6 + 5 < outw) vst1_f16(outptr0 + 20, _tmp5);
                }
                if (out_elempack == 1)
                {
                    // spill the vectors so each lane can be scattered to its own channel
                    __fp16 tmp0[4];
                    __fp16 tmp1[4];
                    __fp16 tmp2[4];
                    __fp16 tmp3[4];
                    __fp16 tmp4[4];
                    __fp16 tmp5[4];
                    vst1_f16(tmp0, _tmp0);
                    vst1_f16(tmp1, _tmp1);
                    vst1_f16(tmp2, _tmp2);
                    vst1_f16(tmp3, _tmp3);
                    vst1_f16(tmp4, _tmp4);
                    vst1_f16(tmp5, _tmp5);

                    __fp16* outptr1 = outptr0 + N;
                    __fp16* outptr2 = outptr0 + N * 2;
                    __fp16* outptr3 = outptr0 + N * 3;

                    outptr0[0] = tmp0[0];
                    outptr1[0] = tmp0[1];
                    outptr2[0] = tmp0[2];
                    outptr3[0] = tmp0[3];
                    if (tj * 6 + 1 < outw)
                    {
                        outptr0[1] = tmp1[0];
                        outptr1[1] = tmp1[1];
                        outptr2[1] = tmp1[2];
                        outptr3[1] = tmp1[3];
                    }
                    if (tj * 6 + 2 < outw)
                    {
                        outptr0[2] = tmp2[0];
                        outptr1[2] = tmp2[1];
                        outptr2[2] = tmp2[2];
                        outptr3[2] = tmp2[3];
                    }
                    if (tj * 6 + 3 < outw)
                    {
                        outptr0[3] = tmp3[0];
                        outptr1[3] = tmp3[1];
                        outptr2[3] = tmp3[2];
                        outptr3[3] = tmp3[3];
                    }
                    if (tj * 6 + 4 < outw)
                    {
                        outptr0[4] = tmp4[0];
                        outptr1[4] = tmp4[1];
                        outptr2[4] = tmp4[2];
                        outptr3[4] = tmp4[3];
                    }
                    if (tj * 6 + 5 < outw)
                    {
                        outptr0[5] = tmp5[0];
                        outptr1[5] = tmp5[1];
                        outptr2[5] = tmp5[2];
                        outptr3[5] = tmp5[3];
                    }
                }

                // next output row
                outptr0 += outw * out_elempack;
            }
        }
    }
    // pairs of channels: scalar arithmetic, suffix 0/1 selects the channel of the pair
    for (; ii + 1 < max_ii; ii += 2)
    {
        __fp16 bias0 = biasptr ? biasptr[i + ii] : 0.f;
        __fp16 bias1 = biasptr ? biasptr[i + ii + 1] : 0.f;

#ifdef _MSC_VER
        __declspec(align(8))
#else
        __attribute__((aligned(8)))
#endif
        __fp16 tmp[6][8][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // r0..r7 point at 8 consecutive coefficients of this tile, 2 channels each
            const __fp16* r0 = (const __fp16*)top_tile + ii * max_jj * 64 + jj * 2;
            const __fp16* r1 = r0 + max_jj * 2;
            const __fp16* r2 = r0 + max_jj * 2 * 2;
            const __fp16* r3 = r0 + max_jj * 2 * 3;
            const __fp16* r4 = r0 + max_jj * 2 * 4;
            const __fp16* r5 = r0 + max_jj * 2 * 5;
            const __fp16* r6 = r0 + max_jj * 2 * 6;
            const __fp16* r7 = r0 + max_jj * 2 * 7;

            // first pass: reduce each group of 8 coefficients to 6 values
            for (int m = 0; m < 8; m++)
            {
                __fp16 tmp024a0 = r1[0] + r2[0];
                __fp16 tmp024a1 = r1[1] + r2[1];
                __fp16 tmp135a0 = r1[0] - r2[0];
                __fp16 tmp135a1 = r1[1] - r2[1];
                __fp16 tmp024b0 = r3[0] + r4[0];
                __fp16 tmp024b1 = r3[1] + r4[1];
                __fp16 tmp135b0 = r3[0] - r4[0];
                __fp16 tmp135b1 = r3[1] - r4[1];
                __fp16 tmp024c0 = r5[0] + r6[0];
                __fp16 tmp024c1 = r5[1] + r6[1];
                __fp16 tmp135c0 = r5[0] - r6[0];
                __fp16 tmp135c1 = r5[1] - r6[1];

                // multiplications by 2 are written as a repeated addition
                tmp[0][m][0] = r0[0] + tmp024a0 + tmp024b0 + tmp024c0 * (__fp16)32;
                tmp[0][m][1] = r0[1] + tmp024a1 + tmp024b1 + tmp024c1 * (__fp16)32;
                tmp[1][m][0] = tmp135a0 + tmp135b0 + tmp135b0 + tmp135c0 * (__fp16)16;
                tmp[1][m][1] = tmp135a1 + tmp135b1 + tmp135b1 + tmp135c1 * (__fp16)16;
                tmp[2][m][0] = tmp024a0 + tmp024b0 * (__fp16)4 + tmp024c0 * (__fp16)8;
                tmp[2][m][1] = tmp024a1 + tmp024b1 * (__fp16)4 + tmp024c1 * (__fp16)8;
                tmp[3][m][0] = tmp135a0 + tmp135b0 * (__fp16)8 + tmp135c0 * (__fp16)4;
                tmp[3][m][1] = tmp135a1 + tmp135b1 * (__fp16)8 + tmp135c1 * (__fp16)4;
                tmp[4][m][0] = tmp024a0 + tmp024b0 * (__fp16)16 + tmp024c0 + tmp024c0;
                tmp[4][m][1] = tmp024a1 + tmp024b1 * (__fp16)16 + tmp024c1 + tmp024c1;
                tmp[5][m][0] = r7[0] + tmp135a0 + tmp135b0 * (__fp16)32 + tmp135c0;
                tmp[5][m][1] = r7[1] + tmp135a1 + tmp135b1 * (__fp16)32 + tmp135c1;

                // step over the 8 coefficients just consumed
                r0 += max_jj * 8 * 2;
                r1 += max_jj * 8 * 2;
                r2 += max_jj * 8 * 2;
                r3 += max_jj * 8 * 2;
                r4 += max_jj * 8 * 2;
                r5 += max_jj * 8 * 2;
                r6 += max_jj * 8 * 2;
                r7 += max_jj * 8 * 2;
            }

            // this path indexes the channel directly and stores unpacked values
            __fp16* outptr0 = top_blob.channel(i + ii).row<__fp16>(ti * 6) + (tj * 6);

            // second pass: reduce along the other axis, add bias, store one output row per m
            for (int m = 0; m < 6; m++)
            {
                // rows past the bottom edge are dropped
                if (ti * 6 + m >= outh)
                    continue;

                __fp16 r00 = tmp[m][0][0];
                __fp16 r01 = tmp[m][0][1];
                __fp16 r10 = tmp[m][1][0];
                __fp16 r11 = tmp[m][1][1];
                __fp16 r20 = tmp[m][2][0];
                __fp16 r21 = tmp[m][2][1];
                __fp16 r30 = tmp[m][3][0];
                __fp16 r31 = tmp[m][3][1];
                __fp16 r40 = tmp[m][4][0];
                __fp16 r41 = tmp[m][4][1];
                __fp16 r50 = tmp[m][5][0];
                __fp16 r51 = tmp[m][5][1];
                __fp16 r60 = tmp[m][6][0];
                __fp16 r61 = tmp[m][6][1];
                __fp16 r70 = tmp[m][7][0];
                __fp16 r71 = tmp[m][7][1];

                __fp16 tmp024a0 = r10 + r20;
                __fp16 tmp024a1 = r11 + r21;
                __fp16 tmp135a0 = r10 - r20;
                __fp16 tmp135a1 = r11 - r21;
                __fp16 tmp024b0 = r30 + r40;
                __fp16 tmp024b1 = r31 + r41;
                __fp16 tmp135b0 = r30 - r40;
                __fp16 tmp135b1 = r31 - r41;
                __fp16 tmp024c0 = r50 + r60;
                __fp16 tmp024c1 = r51 + r61;
                __fp16 tmp135c0 = r50 - r60;
                __fp16 tmp135c1 = r51 - r61;

                __fp16 tmp00 = bias0 + r00 + tmp024a0 + tmp024b0 + tmp024c0 * (__fp16)32;
                __fp16 tmp01 = bias1 + r01 + tmp024a1 + tmp024b1 + tmp024c1 * (__fp16)32;
                __fp16 tmp10 = bias0 + tmp135a0 + tmp135b0 + tmp135b0 + tmp135c0 * (__fp16)16;
                __fp16 tmp11 = bias1 + tmp135a1 + tmp135b1 + tmp135b1 + tmp135c1 * (__fp16)16;
                __fp16 tmp20 = bias0 + tmp024a0 + tmp024b0 * (__fp16)4 + tmp024c0 * (__fp16)8;
                __fp16 tmp21 = bias1 + tmp024a1 + tmp024b1 * (__fp16)4 + tmp024c1 * (__fp16)8;
                __fp16 tmp30 = bias0 + tmp135a0 + tmp135b0 * (__fp16)8 + tmp135c0 * (__fp16)4;
                __fp16 tmp31 = bias1 + tmp135a1 + tmp135b1 * (__fp16)8 + tmp135c1 * (__fp16)4;
                __fp16 tmp40 = bias0 + tmp024a0 + tmp024b0 * (__fp16)16 + tmp024c0 + tmp024c0;
                __fp16 tmp41 = bias1 + tmp024a1 + tmp024b1 * (__fp16)16 + tmp024c1 + tmp024c1;
                __fp16 tmp50 = bias0 + r70 + tmp135a0 + tmp135b0 * (__fp16)32 + tmp135c0;
                __fp16 tmp51 = bias1 + r71 + tmp135a1 + tmp135b1 * (__fp16)32 + tmp135c1;

                // if (out_elempack == 1)
                {
                    // second channel of the pair lives N elements further on
                    __fp16* outptr1 = outptr0 + N;

                    outptr0[0] = tmp00;
                    outptr1[0] = tmp01;
                    if (tj * 6 + 1 < outw)
                    {
                        outptr0[1] = tmp10;
                        outptr1[1] = tmp11;
                    }
                    if (tj * 6 + 2 < outw)
                    {
                        outptr0[2] = tmp20;
                        outptr1[2] = tmp21;
                    }
                    if (tj * 6 + 3 < outw)
                    {
                        outptr0[3] = tmp30;
                        outptr1[3] = tmp31;
                    }
                    if (tj * 6 + 4 < outw)
                    {
                        outptr0[4] = tmp40;
                        outptr1[4] = tmp41;
                    }
                    if (tj * 6 + 5 < outw)
                    {
                        outptr0[5] = tmp50;
                        outptr1[5] = tmp51;
                    }
                }

                // next output row
                outptr0 += outw;
            }
        }
    }
    // remaining single channels: scalar arithmetic
    for (; ii < max_ii; ii++)
    {
        __fp16 bias0 = biasptr ? biasptr[i + ii] : 0.f;

        __fp16 tmp[6][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // r0..r7 point at 8 consecutive coefficients of this tile
            const __fp16* r0 = (const __fp16*)top_tile + ii * max_jj * 64 + jj;
            const __fp16* r1 = r0 + max_jj;
            const __fp16* r2 = r0 + max_jj * 2;
            const __fp16* r3 = r0 + max_jj * 3;
            const __fp16* r4 = r0 + max_jj * 4;
            const __fp16* r5 = r0 + max_jj * 5;
            const __fp16* r6 = r0 + max_jj * 6;
            const __fp16* r7 = r0 + max_jj * 7;

            // first pass: reduce each group of 8 coefficients to 6 values
            for (int m = 0; m < 8; m++)
            {
                __fp16 tmp024a = r1[0] + r2[0];
                __fp16 tmp135a = r1[0] - r2[0];
                __fp16 tmp024b = r3[0] + r4[0];
                __fp16 tmp135b = r3[0] - r4[0];
                __fp16 tmp024c = r5[0] + r6[0];
                __fp16 tmp135c = r5[0] - r6[0];

                tmp[0][m] = r0[0] + tmp024a + tmp024b + tmp024c * (__fp16)32;
                tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * (__fp16)16;
                tmp[2][m] = tmp024a + tmp024b * (__fp16)4 + tmp024c * (__fp16)8;
                tmp[3][m] = tmp135a + tmp135b * (__fp16)8 + tmp135c * (__fp16)4;
                tmp[4][m] = tmp024a + tmp024b * (__fp16)16 + tmp024c + tmp024c;
                tmp[5][m] = r7[0] + tmp135a + tmp135b * (__fp16)32 + tmp135c;

                // step over the 8 coefficients just consumed
                r0 += max_jj * 8;
                r1 += max_jj * 8;
                r2 += max_jj * 8;
                r3 += max_jj * 8;
                r4 += max_jj * 8;
                r5 += max_jj * 8;
                r6 += max_jj * 8;
                r7 += max_jj * 8;
            }

            __fp16* outptr0 = top_blob.channel(i + ii).row<__fp16>(ti * 6) + (tj * 6);

            // second pass: reduce along the other axis, add bias, store one output row per m
            for (int m = 0; m < 6; m++)
            {
                // rows past the bottom edge are dropped
                if (ti * 6 + m >= outh)
                    continue;

                __fp16 r0 = tmp[m][0];
                __fp16 r1 = tmp[m][1];
                __fp16 r2 = tmp[m][2];
                __fp16 r3 = tmp[m][3];
                __fp16 r4 = tmp[m][4];
                __fp16 r5 = tmp[m][5];
                __fp16 r6 = tmp[m][6];
                __fp16 r7 = tmp[m][7];

                __fp16 tmp024a = r1 + r2;
                __fp16 tmp135a = r1 - r2;
                __fp16 tmp024b = r3 + r4;
                __fp16 tmp135b = r3 - r4;
                __fp16 tmp024c = r5 + r6;
                __fp16 tmp135c = r5 - r6;

                __fp16 tmp0 = bias0 + r0 + tmp024a + tmp024b + tmp024c * (__fp16)32;
                __fp16 tmp1 = bias0 + tmp135a + tmp135b + tmp135b + tmp135c * (__fp16)16;
                __fp16 tmp2 = bias0 + tmp024a + tmp024b * (__fp16)4 + tmp024c * (__fp16)8;
                __fp16 tmp3 = bias0 + tmp135a + tmp135b * (__fp16)8 + tmp135c * (__fp16)4;
                __fp16 tmp4 = bias0 + tmp024a + tmp024b * (__fp16)16 + tmp024c + tmp024c;
                __fp16 tmp5 = bias0 + r7 + tmp135a + tmp135b * (__fp16)32 + tmp135c;

                // if (out_elempack == 1)
                {
                    // columns 1..5 are guarded against the right edge
                    outptr0[0] = tmp0;
                    if (tj * 6 + 1 < outw) outptr0[1] = tmp1;
                    if (tj * 6 + 2 < outw) outptr0[2] = tmp2;
                    if (tj * 6 + 3 < outw) outptr0[3] = tmp3;
                    if (tj * 6 + 4 < outw) outptr0[4] = tmp4;
                    if (tj * 6 + 5 < outw) outptr0[5] = tmp5;
                }

                // next output row
                outptr0 += outw;
            }
        }
    }
}

static int conv3x3s1_winograd63_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int nT, const Option& opt)
{
    int outw = top_blob.w;
    int outh = top_blob.h;

    // pad to 6n+2, winograd F(6,3)
    int w_tiles = (outw + 5) / 6;
    int h_tiles = (outh + 5) / 6;
    int tiles = w_tiles * h_tiles;

    const int M = top_blob.c * top_blob.elempack;
    const int N = tiles;
    const int K = bottom_blob.c * bottom_blob.elempack;
    const int B = 64;

    // NCNN_LOGE("conv3x3s1_winograd63_fp16sa %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    conv3x3s1_winograd_get_optimal_tile_mnk_fp16(M, N, K, B, TILE_M, TILE_N, TILE_K, nT);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

    Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    if (nT > 1 && nn_NK < nT)
    {
        Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator);
        if (B_tile.empty())
            return -100;

        for (int ppjk = 0; ppjk < nn_NK; ppjk++)
        {
            const int ppj = ppjk / nn_K;
            const int ppk = ppjk % nn_K;

            const int j = ppj * TILE_N;
            const int k = ppk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            // transform input
            conv3x3s1_winograd63_transform_input_tile_fp16sa(bottom_blob, B_tile, j, max_jj, k, max_kk, nT);

            Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

            conv3x3s1_winograd_transpose_pack_B_tile_fp16(B_tile, BT_tile, B, max_jj, max_kk, nT);
        }
    }
    else
    {
        Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator);
        if (B_tileX.empty())
            return -100;

        #pragma omp parallel for num_threads(nT)
        for (int ppjk = 0; ppjk < nn_NK; ppjk++)
        {
            const int ppj = ppjk / nn_K;
            const int ppk = ppjk % nn_K;

            const int j = ppj * TILE_N;
            const int k = ppk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            Mat B_tile = B_tileX.channel(get_omp_thread_num());

            // transform input
            conv3x3s1_winograd63_transform_input_tile_fp16sa(bottom_blob, B_tile, j, max_jj, k, max_kk, 1);

            Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

            conv3x3s1_winograd_transpose_pack_B_tile_fp16(B_tile, BT_tile, B, max_jj, max_kk, 1);
        }
    }

    Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 2u, opt.workspace_allocator);
    if (top_tileX.empty())
        return -100;

    #pragma omp parallel for num_threads(nT)
    for (int ppj = 0; ppj < nn_M; ppj++)
    {
        const int i = ppj * TILE_M;

        Mat top_tile = top_tileX.channel(get_omp_thread_num());

        const int max_ii = std::min((M - i), TILE_M);

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                const Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K);

                const Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K);

                conv3x3s1_winograd_gemm_transB_packed_tile_fp16sa(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk, opt.use_a53_a55_optimized_kernel);
            }

            // transform output
            conv3x3s1_winograd63_transform_output_tile_fp16sa(top_tile, top_blob, bias, i, max_ii, j, max_jj);
        }
    }

    return 0;
}


================================================
FILE: src/layer/arm/convolution_3x3_winograd_int8.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Gather shorts from A into AT, one AT row per batch index b.
// A is addressed as A[ii * (max_kk * batch) + kk * batch + b]. For each b the
// ii rows are consumed in groups of 8 and 4 (NEON builds only), then 2, then 1,
// and for every kk step the values of the grouped rows are stored side by side.
static void pack_A_tile_int8(const Mat& A, Mat& AT, int batch, int max_ii, int max_kk)
{
    // distance in shorts between two consecutive ii rows of A
    const int row_step = max_kk * batch;

    for (int b = 0; b < batch; b++)
    {
        short* dst = AT.row<short>(b);

        int ii = 0;
#if __ARM_NEON
        // groups of 8 rows
        while (ii + 8 <= max_ii)
        {
            const short* src = (const short*)A + ii * row_step + b;

            for (int kk = 0; kk < max_kk; kk++)
            {
                for (int r = 0; r < 8; r++)
                {
                    dst[r] = src[row_step * r];
                }
                src += batch;
                dst += 8;
            }

            ii += 8;
        }
        // groups of 4 rows
        while (ii + 4 <= max_ii)
        {
            const short* src = (const short*)A + ii * row_step + b;

            for (int kk = 0; kk < max_kk; kk++)
            {
                for (int r = 0; r < 4; r++)
                {
                    dst[r] = src[row_step * r];
                }
                src += batch;
                dst += 4;
            }

            ii += 4;
        }
#endif // __ARM_NEON
        // groups of 2 rows
        while (ii + 2 <= max_ii)
        {
            const short* src = (const short*)A + ii * row_step + b;

            int kk = 0;
#if !__ARM_NEON && __ARM_FEATURE_SIMD32 && NCNN_GNU_INLINE_ASM
            // two kk steps at once: each row keeps its kk pair adjacent
            while (kk + 2 <= max_kk)
            {
                dst[0] = src[0];
                dst[1] = src[batch];
                dst[2] = src[row_step];
                dst[3] = src[row_step + batch];
                src += 2 * batch;
                dst += 4;
                kk += 2;
            }
#endif
            while (kk < max_kk)
            {
                dst[0] = src[0];
                dst[1] = src[row_step];
                src += batch;
                dst += 2;
                kk++;
            }

            ii += 2;
        }
        // remaining single rows
        while (ii < max_ii)
        {
            const short* src = (const short*)A + ii * row_step + b;

            for (int kk = 0; kk < max_kk; kk++)
            {
                *dst++ = *src;
                src += batch;
            }

            ii++;
        }
    }
}

// Repack the int16 tile B into BT, writing one BT row per batch index b.
// The b loop is parallelised with OpenMP over nT threads.
//
// Source addressing (in shorts), as used below:
//   - while at least 8 kk remain (NEON builds): (b * max_jj + jj) * 8, each jj
//     column holding 8 consecutive kk values, advancing max_jj * batch * 8 per kk group
//   - while at least 2 kk remain: (b * max_jj + jj) * 2, 2 kk values per jj,
//     advancing max_jj * batch * 2
//   - last single kk: (b * max_jj + jj), advancing max_jj * batch
//
// jj columns are consumed in blocks of 12 and 8 (aarch64 only), 6 and 4 (NEON
// only), then 2 and 1. Inside a block the output is written kk-major: for each
// kk the block's jj values are contiguous. The one exception is the 2-column,
// 2-kk loop on non-NEON SIMD32 builds, which keeps each column's kk pair adjacent.
static void transpose_pack_B_tile_int8(const Mat& B, Mat& BT, int batch, int max_jj, int max_kk, int nT)
{
    // NCNN_LOGE("transpose_pack_B_tile_int8 %d %d", max_jj, max_kk);

    #pragma omp parallel for num_threads(nT)
    for (int b = 0; b < batch; b++)
    {
        short* pp = BT.row<short>(b);

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        // ---- blocks of 12 jj columns ----
        for (; jj + 11 < max_jj; jj += 12)
        {
            const short* p0 = B;

            int kk = 0;
            p0 += (b * max_jj + jj) * 8;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // transpose 8x12
#if NCNN_GNU_INLINE_ASM
                // The asm restores %0 (sub #128) so p0 is advanced explicitly afterwards;
                // %1 (pp) is advanced by the post-incrementing stores (3 * 64 bytes = 96 shorts).
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]   \n"
                    "prfm   pldl1keep, [%0, #1024]  \n"
                    "ld4    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
                    "ld4    {v4.8h, v5.8h, v6.8h, v7.8h}, [%0], #64 \n"
                    "ld4    {v16.8h, v17.8h, v18.8h, v19.8h}, [%0] \n"
                    "sub    %0, %0, #128            \n"
                    "uzp1   v20.8h, v0.8h, v4.8h    \n"
                    "uzp2   v26.8h, v0.8h, v4.8h    \n"
                    "uzp1   v23.8h, v2.8h, v6.8h    \n"
                    "uzp2   v29.8h, v2.8h, v6.8h    \n"
                    "uzp1   v21.8h, v16.8h, v1.8h   \n"
                    "uzp2   v27.8h, v16.8h, v1.8h   \n"
                    "uzp1   v22.8h, v5.8h, v17.8h   \n"
                    "uzp2   v28.8h, v5.8h, v17.8h   \n"
                    "uzp1   v24.8h, v18.8h, v3.8h   \n"
                    "uzp2   v30.8h, v18.8h, v3.8h   \n"
                    "uzp1   v25.8h, v7.8h, v19.8h   \n"
                    "uzp2   v31.8h, v7.8h, v19.8h   \n"
                    "st1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%1], #64 \n"
                    "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%1], #64 \n"
                    "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                p0 += max_jj * batch * 8;
#else  // NCNN_GNU_INLINE_ASM
                // intrinsic equivalent: ld4 de-interleave followed by uzp, stored as
                // the six even halves (_tN.val[0]) and then the six odd halves (_tN.val[1])
                int16x8x4_t _r0 = vld4q_s16(p0);
                int16x8x4_t _r1 = vld4q_s16(p0 + 32);
                int16x8x4_t _r2 = vld4q_s16(p0 + 64);
                int16x8x2_t _t0 = vuzpq_s16(_r0.val[0], _r1.val[0]);
                int16x8x2_t _t1 = vuzpq_s16(_r2.val[0], _r0.val[1]);
                int16x8x2_t _t2 = vuzpq_s16(_r1.val[1], _r2.val[1]);
                int16x8x2_t _t3 = vuzpq_s16(_r0.val[2], _r1.val[2]);
                int16x8x2_t _t4 = vuzpq_s16(_r2.val[2], _r0.val[3]);
                int16x8x2_t _t5 = vuzpq_s16(_r1.val[3], _r2.val[3]);
                vst1q_s16(pp, _t0.val[0]);
                vst1q_s16(pp + 8, _t1.val[0]);
                vst1q_s16(pp + 16, _t2.val[0]);
                vst1q_s16(pp + 24, _t3.val[0]);
                vst1q_s16(pp + 32, _t4.val[0]);
                vst1q_s16(pp + 40, _t5.val[0]);
                vst1q_s16(pp + 48, _t0.val[1]);
                vst1q_s16(pp + 56, _t1.val[1]);
                vst1q_s16(pp + 64, _t2.val[1]);
                vst1q_s16(pp + 72, _t3.val[1]);
                vst1q_s16(pp + 80, _t4.val[1]);
                vst1q_s16(pp + 88, _t5.val[1]);
                p0 += max_jj * batch * 8;
                pp += 96;
#endif // NCNN_GNU_INLINE_ASM
            }
            // switch the column offset from the 8-kk grouping to the 2-kk grouping
            p0 -= (b * max_jj + jj) * 8;
            p0 += (b * max_jj + jj) * 2;
            for (; kk + 1 < max_kk; kk += 2)
            {
                // de-interleave the kk pairs: first kk for 12 columns, then second kk
                int16x8x2_t _r01 = vld2q_s16(p0);
                int16x4x2_t _r2 = vld2_s16(p0 + 16);
                vst1q_s16(pp, _r01.val[0]);
                vst1_s16(pp + 8, _r2.val[0]);
                vst1q_s16(pp + 12, _r01.val[1]);
                vst1_s16(pp + 20, _r2.val[1]);
                p0 += max_jj * batch * 2;
                pp += 24;
            }
            // switch the column offset from the 2-kk grouping to single kk
            p0 -= (b * max_jj + jj) * 2;
            p0 += (b * max_jj + jj);
            for (; kk < max_kk; kk++)
            {
                // plain copy of 12 shorts
                int16x8_t _r0 = vld1q_s16(p0);
                int16x4_t _r1 = vld1_s16(p0 + 8);
                vst1q_s16(pp, _r0);
                vst1_s16(pp + 8, _r1);
                p0 += max_jj * batch;
                pp += 12;
            }
        }
        // ---- blocks of 8 jj columns ----
        for (; jj + 7 < max_jj; jj += 8)
        {
            const short* p0 = B;

            int kk = 0;
            p0 += (b * max_jj + jj) * 8;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // transpose 8x8
#if NCNN_GNU_INLINE_ASM
                // %0 is restored by the sub #64; %1 advances by 2 * 64 bytes (64 shorts)
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]   \n"
                    "prfm   pldl1keep, [%0, #1024]  \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%0] \n"
                    "sub    %0, %0, #64             \n"
                    "zip1   v16.8h, v0.8h, v4.8h    \n"
                    "zip2   v20.8h, v0.8h, v4.8h    \n"
                    "zip1   v17.8h, v1.8h, v5.8h    \n"
                    "zip2   v21.8h, v1.8h, v5.8h    \n"
                    "zip1   v18.8h, v2.8h, v6.8h    \n"
                    "zip2   v22.8h, v2.8h, v6.8h    \n"
                    "zip1   v19.8h, v3.8h, v7.8h    \n"
                    "zip2   v23.8h, v3.8h, v7.8h    \n"
                    "st4    {v16.8h, v17.8h, v18.8h, v19.8h}, [%1], #64 \n"
                    "st4    {v20.8h, v21.8h, v22.8h, v23.8h}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
                p0 += max_jj * batch * 8;
#else  // NCNN_GNU_INLINE_ASM
                // intrinsic equivalent: zip column n with column n+4, then a 4-way
                // interleaving store finishes the transpose
                int16x8_t _r0 = vld1q_s16(p0);
                int16x8_t _r1 = vld1q_s16(p0 + 8);
                int16x8_t _r2 = vld1q_s16(p0 + 16);
                int16x8_t _r3 = vld1q_s16(p0 + 24);
                int16x8_t _r4 = vld1q_s16(p0 + 32);
                int16x8_t _r5 = vld1q_s16(p0 + 40);
                int16x8_t _r6 = vld1q_s16(p0 + 48);
                int16x8_t _r7 = vld1q_s16(p0 + 56);
                int16x8x2_t _r04 = vzipq_s16(_r0, _r4);
                int16x8x2_t _r15 = vzipq_s16(_r1, _r5);
                int16x8x2_t _r26 = vzipq_s16(_r2, _r6);
                int16x8x2_t _r37 = vzipq_s16(_r3, _r7);
                int16x8x4_t _r0123;
                _r0123.val[0] = _r04.val[0];
                _r0123.val[1] = _r15.val[0];
                _r0123.val[2] = _r26.val[0];
                _r0123.val[3] = _r37.val[0];
                int16x8x4_t _r4567;
                _r4567.val[0] = _r04.val[1];
                _r4567.val[1] = _r15.val[1];
                _r4567.val[2] = _r26.val[1];
                _r4567.val[3] = _r37.val[1];
                vst4q_s16(pp, _r0123);
                vst4q_s16(pp + 32, _r4567);
                p0 += max_jj * batch * 8;
                pp += 64;
#endif // NCNN_GNU_INLINE_ASM
            }
            // switch the column offset from the 8-kk grouping to the 2-kk grouping
            p0 -= (b * max_jj + jj) * 8;
            p0 += (b * max_jj + jj) * 2;
            for (; kk + 1 < max_kk; kk += 2)
            {
                // de-interleave the kk pairs: first kk for 8 columns, then second kk
                int16x8x2_t _r01 = vld2q_s16(p0);
                vst1q_s16(pp, _r01.val[0]);
                vst1q_s16(pp + 8, _r01.val[1]);
                p0 += max_jj * batch * 2;
                pp += 16;
            }
            // switch the column offset from the 2-kk grouping to single kk
            p0 -= (b * max_jj + jj) * 2;
            p0 += (b * max_jj + jj);
            for (; kk < max_kk; kk++)
            {
                // plain copy of 8 shorts
                int16x8_t _r0 = vld1q_s16(p0);
                vst1q_s16(pp, _r0);
                p0 += max_jj * batch;
                pp += 8;
            }
        }
#endif // __aarch64__
        // ---- blocks of 6 jj columns ----
        for (; jj + 5 < max_jj; jj += 6)
        {
            const short* p0 = B;

            int kk = 0;
            p0 += (b * max_jj + jj) * 8;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // transpose 8x6: zip column n with column n+3, then a 3-way interleaving store
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                // %0 is restored by the sub #48; %1 advances by 2 * 48 bytes (48 shorts)
                asm volatile(
                    "prfm   pldl1keep, [%0, #768]   \n"
                    "ld1    {v0.8h, v1.8h, v2.8h}, [%0], #48 \n"
                    "ld1    {v3.8h, v4.8h, v5.8h}, [%0] \n"
                    "sub    %0, %0, #48             \n"
                    "zip1   v16.8h, v0.8h, v3.8h    \n"
                    "zip2   v20.8h, v0.8h, v3.8h    \n"
                    "zip1   v17.8h, v1.8h, v4.8h    \n"
                    "zip2   v21.8h, v1.8h, v4.8h    \n"
                    "zip1   v18.8h, v2.8h, v5.8h    \n"
                    "zip2   v22.8h, v2.8h, v5.8h    \n"
                    "st3    {v16.8h, v17.8h, v18.8h}, [%1], #48 \n"
                    "st3    {v20.8h, v21.8h, v22.8h}, [%1], #48 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v20", "v21", "v22");
                p0 += max_jj * batch * 8;
#else  // __aarch64__
                // armv7: vldm has no writeback here, so %0 is left unchanged by the asm
                asm volatile(
                    "pld        [%0, #768]          \n"
                    "vldm       %0, {d0-d11}        \n"
                    "vzip.16    q0, q3              \n"
                    "vzip.16    q1, q4              \n"
                    "vzip.16    q2, q5              \n"
                    "vst3.s16   {d0,d2,d4}, [%1]!   \n"
                    "vst3.s16   {d1,d3,d5}, [%1]!   \n"
                    "vst3.s16   {d6,d8,d10}, [%1]!  \n"
                    "vst3.s16   {d7,d9,d11}, [%1]!  \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5");
                p0 += max_jj * batch * 8;
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                int16x8_t _r0 = vld1q_s16(p0);
                int16x8_t _r1 = vld1q_s16(p0 + 8);
                int16x8_t _r2 = vld1q_s16(p0 + 16);
                int16x8_t _r3 = vld1q_s16(p0 + 24);
                int16x8_t _r4 = vld1q_s16(p0 + 32);
                int16x8_t _r5 = vld1q_s16(p0 + 40);
                int16x8x2_t _r03 = vzipq_s16(_r0, _r3);
                int16x8x2_t _r14 = vzipq_s16(_r1, _r4);
                int16x8x2_t _r25 = vzipq_s16(_r2, _r5);
                int16x8x3_t _r012;
                _r012.val[0] = _r03.val[0];
                _r012.val[1] = _r14.val[0];
                _r012.val[2] = _r25.val[0];
                int16x8x3_t _r345;
                _r345.val[0] = _r03.val[1];
                _r345.val[1] = _r14.val[1];
                _r345.val[2] = _r25.val[1];
                vst3q_s16(pp, _r012);
                vst3q_s16(pp + 24, _r345);
                p0 += max_jj * batch * 8;
                pp += 48;
#endif // NCNN_GNU_INLINE_ASM
            }
            // switch the column offset from the 8-kk grouping to the 2-kk grouping
            p0 -= (b * max_jj + jj) * 8;
            p0 += (b * max_jj + jj) * 2;
            for (; kk + 1 < max_kk; kk += 2)
            {
                // NOTE(review): vld2q_s16 loads 16 shorts while only 12 are stored below,
                // so this reads 4 shorts past the block; presumably covered by following
                // data or allocation padding - confirm against the Mat allocator.
                int16x8x2_t _r01 = vld2q_s16(p0);
                // treat adjacent column pairs as 32-bit lanes and regroup them so the
                // 3-way store emits first-kk for 6 columns followed by second-kk
                int32x4x2_t _r01x = vtrnq_s32(vreinterpretq_s32_s16(_r01.val[0]), vreinterpretq_s32_s16(_r01.val[1]));
                int32x2x3_t _r012;
                _r012.val[0] = vget_low_s32(_r01x.val[0]);
                _r012.val[1] = vget_low_s32(_r01x.val[1]);
                _r012.val[2] = vget_high_s32(_r01x.val[0]);
                vst3_s32((int*)pp, _r012);
                p0 += max_jj * batch * 2;
                pp += 12;
            }
            // switch the column offset from the 2-kk grouping to single kk
            p0 -= (b * max_jj + jj) * 2;
            p0 += (b * max_jj + jj);
            for (; kk < max_kk; kk++)
            {
                // copy 6 shorts: 4 with a vector load/store, 2 scalar
                int16x4_t _r0 = vld1_s16(p0);
                vst1_s16(pp, _r0);
                pp[4] = p0[4];
                pp[5] = p0[5];
                p0 += max_jj * batch;
                pp += 6;
            }
        }
        // ---- blocks of 4 jj columns ----
        for (; jj + 3 < max_jj; jj += 4)
        {
            const short* p0 = B;

            int kk = 0;
            p0 += (b * max_jj + jj) * 8;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // transpose 8x4: contiguous load, 4-way interleaving store
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]   \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0] \n"
                    "st4    {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3");
                p0 += max_jj * batch * 8;
#else  // __aarch64__
                asm volatile(
                    "pld        [%0, #512]          \n"
                    "vldm       %0, {d0-d7}         \n"
                    "vst4.s16   {d0,d2,d4,d6}, [%1]! \n"
                    "vst4.s16   {d1,d3,d5,d7}, [%1]! \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "q0", "q1", "q2", "q3");
                p0 += max_jj * batch * 8;
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                int16x8x4_t _r0123;
                _r0123.val[0] = vld1q_s16(p0);
                _r0123.val[1] = vld1q_s16(p0 + 8);
                _r0123.val[2] = vld1q_s16(p0 + 16);
                _r0123.val[3] = vld1q_s16(p0 + 24);
                vst4q_s16(pp, _r0123);
                p0 += max_jj * batch * 8;
                pp += 32;
#endif // NCNN_GNU_INLINE_ASM
            }
            // switch the column offset from the 8-kk grouping to the 2-kk grouping
            p0 -= (b * max_jj + jj) * 8;
            p0 += (b * max_jj + jj) * 2;
            for (; kk + 1 < max_kk; kk += 2)
            {
                // de-interleave the kk pairs: first kk for 4 columns, then second kk
                int16x4x2_t _r01 = vld2_s16(p0);
                vst1_s16(pp, _r01.val[0]);
                vst1_s16(pp + 4, _r01.val[1]);
                p0 += max_jj * batch * 2;
                pp += 8;
            }
            // switch the column offset from the 2-kk grouping to single kk
            p0 -= (b * max_jj + jj) * 2;
            p0 += (b * max_jj + jj);
            for (; kk < max_kk; kk++)
            {
                // plain copy of 4 shorts
                int16x4_t _r0 = vld1_s16(p0);
                vst1_s16(pp, _r0);
                p0 += max_jj * batch;
                pp += 4;
            }
        }
#endif // __ARM_NEON
        // ---- blocks of 2 jj columns ----
        for (; jj + 1 < max_jj; jj += 2)
        {
            const short* p0 = B;

            int kk = 0;
#if __ARM_NEON
            p0 += (b * max_jj + jj) * 8;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // transpose 8x2: contiguous load, 2-way interleaving store
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%0, #256]   \n"
                    "ld1    {v0.8h, v1.8h}, [%0]    \n"
                    "st2    {v0.8h, v1.8h}, [%1], #32 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1");
                p0 += max_jj * batch * 8;
#else  // __aarch64__
                asm volatile(
                    "pld        [%0, #256]          \n"
                    "vld1.s16   {d0-d3}, [%0]       \n"
                    "vst2.s16   {d0-d3}, [%1]!      \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "q0", "q1");
                p0 += max_jj * batch * 8;
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                int16x8x2_t _r01;
                _r01.val[0] = vld1q_s16(p0);
                _r01.val[1] = vld1q_s16(p0 + 8);
                vst2q_s16(pp, _r01);
                p0 += max_jj * batch * 8;
                pp += 16;
#endif // NCNN_GNU_INLINE_ASM
            }
            p0 -= (b * max_jj + jj) * 8;
#endif // __ARM_NEON
            p0 += (b * max_jj + jj) * 2;
            for (; kk + 1 < max_kk; kk += 2)
            {
#if !__ARM_NEON && __ARM_FEATURE_SIMD32 && NCNN_GNU_INLINE_ASM
                // non-NEON SIMD32 build: straight copy, each column keeps its kk pair adjacent
                pp[0] = p0[0];
                pp[1] = p0[1];
                pp[2] = p0[2];
                pp[3] = p0[3];
#else
                // 2x2 transpose: first kk of both columns, then second kk of both columns
                pp[0] = p0[0];
                pp[1] = p0[2];
                pp[2] = p0[1];
                pp[3] = p0[3];
#endif
                p0 += max_jj * batch * 2;
                pp += 4;
            }
            // switch the column offset from the 2-kk grouping to single kk
            p0 -= (b * max_jj + jj) * 2;
            p0 += (b * max_jj + jj);
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p0[1];
                p0 += max_jj * batch;
                pp += 2;
            }
        }
        // ---- remaining single jj columns: straight copies, no reordering needed ----
        for (; jj < max_jj; jj++)
        {
            const short* p0 = B;

            int kk = 0;
#if __ARM_NEON
            p0 += (b * max_jj + jj) * 8;
            for (; kk + 7 < max_kk; kk += 8)
            {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%0, #128]   \n"
                    "ld1    {v0.8h}, [%0]           \n"
                    "st1    {v0.8h}, [%1], #16      \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0");
                p0 += max_jj * batch * 8;
#else  // __aarch64__
                asm volatile(
                    "pld        [%0, #128]          \n"
                    "vld1.s16   {d0-d1}, [%0]       \n"
                    "vst1.s16   {d0-d1}, [%1]!      \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "q0");
                p0 += max_jj * batch * 8;
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                int16x8_t _r0 = vld1q_s16(p0);
                vst1q_s16(pp, _r0);
                p0 += max_jj * batch * 8;
                pp += 8;
#endif // NCNN_GNU_INLINE_ASM
            }
            p0 -= (b * max_jj + jj) * 8;
#endif // __ARM_NEON
            p0 += (b * max_jj + jj) * 2;
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = p0[0];
                pp[1] = p0[1];
                p0 += max_jj * batch * 2;
                pp += 2;
            }
            // switch the column offset from the 2-kk grouping to single kk
            p0 -= (b * max_jj + jj) * 2;
            p0 += (b * max_jj + jj);
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                p0 += max_jj * batch;
                pp += 1;
            }
        }
    }
}

static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, Mat& top_blob, int batch, int max_ii, int max_jj, int k, int max_kk)
{
    // return;
    // NCNN_LOGE("gemm_transB_packed_tile_int8 %d %d %d", max_ii, max_jj, max_kk);

    int* outptr = top_blob;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        for (int b = 0; b < batch; b++)
        {
            const short* pAT = AT_tile.row<const short>(b) + max_kk * ii;
            const short* pB = BT_tile.row<const short>(b);

            int jj = 0;
#if __aarch64__
            for (; jj + 11 < max_jj; jj += 12)
            {
                const short* pA = pAT;

#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "cmp    %w7, #0                     \n"
                    "beq    0f                          \n"

                    "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                    "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    "sub    %0, %0, #320                \n"
                    "b      1f                          \n"

                    "0:                                 \n"
                    "eor    v8.16b, v8.16b, v8.16b      \n"
                    "eor    v9.16b, v9.16b, v9.16b      \n"
                    "eor    v10.16b, v10.16b, v10.16b   \n"
                    "eor    v11.16b, v11.16b, v11.16b   \n"
                    "eor    v12.16b, v12.16b, v12.16b   \n"
                    "eor    v13.16b, v13.16b, v13.16b   \n"
                    "eor    v14.16b, v14.16b, v14.16b   \n"
                    "eor    v15.16b, v15.16b, v15.16b   \n"
                    "eor    v16.16b, v16.16b, v16.16b   \n"
                    "eor    v17.16b, v17.16b, v17.16b   \n"
                    "eor    v18.16b, v18.16b, v18.16b   \n"
                    "eor    v19.16b, v19.16b, v19.16b   \n"
                    "eor    v20.16b, v20.16b, v20.16b   \n"
                    "eor    v21.16b, v21.16b, v21.16b   \n"
                    "eor    v22.16b, v22.16b, v22.16b   \n"
                    "eor    v23.16b, v23.16b, v23.16b   \n"
                    "eor    v24.16b, v24.16b, v24.16b   \n"
                    "eor    v25.16b, v25.16b, v25.16b   \n"
                    "eor    v26.16b, v26.16b, v26.16b   \n"
                    "eor    v27.16b, v27.16b, v27.16b   \n"
                    "eor    v28.16b, v28.16b, v28.16b   \n"
                    "eor    v29.16b, v29.16b, v29.16b   \n"
                    "eor    v30.16b, v30.16b, v30.16b   \n"
                    "eor    v31.16b, v31.16b, v31.16b   \n"

                    "1:                                 \n"
                    "lsr    w4, %w6, #3                 \n" // w4 = max_kk >> 3
                    "cmp    w4, #0                      \n"
                    "beq    3f                          \n"

                    "ld1    {v4.8h, v5.8h}, [%1], #32   \n"
                    "ld1    {v0.8h, v1.8h}, [%2], #32   \n"
                    ".align 4                           \n"
                    "2:                                 \n"
                    "smlal  v8.4s, v4.4h, v0.h[0]       \n"
                    "smlal  v10.4s, v4.4h, v0.h[1]      \n"
                    "ld1    {v2.8h, v3.8h}, [%2], #32   \n"
                    "smlal2 v9.4s, v4.8h, v0.h[0]       \n"
                    "smlal2 v11.4s, v4.8h, v0.h[1]      \n"
                    "ld1    {v6.8h, v7.8h}, [%1], #32   \n"
                    "smlal  v12.4s, v4.4h, v0.h[2]      \n"
                    "smlal  v14.4s, v4.4h, v0.h[3]      \n"
                    "smlal2 v13.4s, v4.8h, v0.h[2]      \n"
                    "smlal2 v15.4s, v4.8h, v0.h[3]      \n"
                    "smlal  v16.4s, v4.4h, v0.h[4]      \n"
                    "smlal  v18.4s, v4.4h, v0.h[5]      \n"
                    "smlal2 v17.4s, v4.8h, v0.h[4]      \n"
                    "smlal2 v19.4s, v4.8h, v0.h[5]      \n"
                    "smlal  v20.4s, v4.4h, v0.h[6]      \n"
                    "smlal  v22.4s, v4.4h, v0.h[7]      \n"
                    "smlal2 v21.4s, v4.8h, v0.h[6]      \n"
                    "smlal2 v23.4s, v4.8h, v0.h[7]      \n"
                    "smlal  v24.4s, v4.4h, v1.h[0]      \n"
                    "smlal  v26.4s, v4.4h, v1.h[1]      \n"
                    "smlal2 v25.4s, v4.8h, v1.h[0]      \n"
                    "smlal2 v27.4s, v4.8h, v1.h[1]      \n"
                    "smlal  v28.4s, v4.4h, v1.h[2]      \n"
                    "smlal  v30.4s, v4.4h, v1.h[3]      \n"
                    "smlal2 v29.4s, v4.8h, v1.h[2]      \n"
                    "smlal2 v31.4s, v4.8h, v1.h[3]      \n"
                    "smlal  v8.4s, v5.4h, v1.h[4]       \n"
                    "smlal  v10.4s, v5.4h, v1.h[5]      \n"
                    "smlal2 v9.4s, v5.8h, v1.h[4]       \n"
                    "smlal2 v11.4s, v5.8h, v1.h[5]      \n"
                    "smlal  v12.4s, v5.4h, v1.h[6]      \n"
                    "smlal  v14.4s, v5.4h, v1.h[7]      \n"
                    "smlal2 v13.4s, v5.8h, v1.h[6]      \n"
                    "smlal2 v15.4s, v5.8h, v1.h[7]      \n"
                    "smlal  v16.4s, v5.4h, v2.h[0]      \n"
                    "smlal  v18.4s, v5.4h, v2.h[1]      \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.8h, v1.8h}, [%2], #32   \n"
                    "smlal2 v17.4s, v5.8h, v2.h[0]      \n"
                    "smlal2 v19.4s, v5.8h, v2.h[1]      \n"
                    "smlal  v20.4s, v5.4h, v2.h[2]      \n"
                    "smlal  v22.4s, v5.4h, v2.h[3]      \n"
                    "smlal2 v21.4s, v5.8h, v2.h[2]      \n"
                    "smlal2 v23.4s, v5.8h, v2.h[3]      \n"
                    "smlal  v24.4s, v5.4h, v2.h[4]      \n"
                    "smlal  v26.4s, v5.4h, v2.h[5]      \n"
                    "smlal2 v25.4s, v5.8h, v2.h[4]      \n"
                    "smlal2 v27.4s, v5.8h, v2.h[5]      \n"
                    "smlal  v28.4s, v5.4h, v2.h[6]      \n"
                    "smlal  v30.4s, v5.4h, v2.h[7]      \n"
                    "smlal2 v29.4s, v5.8h, v2.h[6]      \n"
                    "smlal2 v31.4s, v5.8h, v2.h[7]      \n"
                    "smlal  v8.4s, v6.4h, v3.h[0]       \n"
                    "smlal  v10.4s, v6.4h, v3.h[1]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.8h, v5.8h}, [%1], #32   \n"
                    "smlal2 v9.4s, v6.8h, v3.h[0]       \n"
                    "smlal2 v11.4s, v6.8h, v3.h[1]      \n"
                    "smlal  v12.4s, v6.4h, v3.h[2]      \n"
                    "smlal  v14.4s, v6.4h, v3.h[3]      \n"
                    "smlal2 v13.4s, v6.8h, v3.h[2]      \n"
                    "smlal2 v15.4s, v6.8h, v3.h[3]      \n"
                    "smlal  v16.4s, v6.4h, v3.h[4]      \n"
                    "smlal  v18.4s, v6.4h, v3.h[5]      \n"
                    "smlal2 v17.4s, v6.8h, v3.h[4]      \n"
                    "smlal2 v19.4s, v6.8h, v3.h[5]      \n"
                    "smlal  v20.4s, v6.4h, v3.h[6]      \n"
                    "smlal  v22.4s, v6.4h, v3.h[7]      \n"
                    "smlal2 v21.4s, v6.8h, v3.h[6]      \n"
                    "smlal2 v23.4s, v6.8h, v3.h[7]      \n"
                    "smlal  v24.4s, v6.4h, v0.h[0]      \n"
                    "smlal  v26.4s, v6.4h, v0.h[1]      \n"
                    "ld1    {v2.8h, v3.8h}, [%2], #32   \n"
                    "smlal2 v25.4s, v6.8h, v0.h[0]      \n"
                    "smlal2 v27.4s, v6.8h, v0.h[1]      \n"
                    "smlal  v28.4s, v6.4h, v0.h[2]      \n"
                    "smlal  v30.4s, v6.4h, v0.h[3]      \n"
                    "smlal2 v29.4s, v6.8h, v0.h[2]      \n"
                    "smlal2 v31.4s, v6.8h, v0.h[3]      \n"
                    "smlal  v8.4s, v7.4h, v0.h[4]       \n"
                    "smlal  v10.4s, v7.4h, v0.h[5]      \n"
                    "smlal2 v9.4s, v7.8h, v0.h[4]       \n"
                    "smlal2 v11.4s, v7.8h, v0.h[5]      \n"
                    "smlal  v12.4s, v7.4h, v0.h[6]      \n"
                    "smlal  v14.4s, v7.4h, v0.h[7]      \n"
                    "smlal2 v13.4s, v7.8h, v0.h[6]      \n"
                    "smlal2 v15.4s, v7.8h, v0.h[7]      \n"
                    "smlal  v16.4s, v7.4h, v1.h[0]      \n"
                    "smlal  v18.4s, v7.4h, v1.h[1]      \n"
                    "smlal2 v17.4s, v7.8h, v1.h[0]      \n"
                    "smlal2 v19.4s, v7.8h, v1.h[1]      \n"
                    "smlal  v20.4s, v7.4h, v1.h[2]      \n"
                    "smlal  v22.4s, v7.4h, v1.h[3]      \n"
                    "smlal2 v21.4s, v7.8h, v1.h[2]      \n"
                    "smlal2 v23.4s, v7.8h, v1.h[3]      \n"
                    "smlal  v24.4s, v7.4h, v1.h[4]      \n"
                    "smlal  v26.4s, v7.4h, v1.h[5]      \n"
                    "smlal2 v25.4s, v7.8h, v1.h[4]      \n"
                    "smlal2 v27.4s, v7.8h, v1.h[5]      \n"
                    "smlal  v28.4s, v7.4h, v1.h[6]      \n"
                    "smlal  v30.4s, v7.4h, v1.h[7]      \n"
                    "smlal2 v29.4s, v7.8h, v1.h[6]      \n"
                    "smlal2 v31.4s, v7.8h, v1.h[7]      \n"
                    "smlal  v8.4s, v4.4h, v2.h[0]       \n"
                    "smlal  v10.4s, v4.4h, v2.h[1]      \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.8h, v1.8h}, [%2], #32   \n"
                    "smlal2 v9.4s, v4.8h, v2.h[0]       \n"
                    "smlal2 v11.4s, v4.8h, v2.h[1]      \n"
                    "ld1    {v6.8h, v7.8h}, [%1], #32   \n"
                    "smlal  v12.4s, v4.4h, v2.h[2]      \n"
                    "smlal  v14.4s, v4.4h, v2.h[3]      \n"
                    "smlal2 v13.4s, v4.8h, v2.h[2]      \n"
                    "smlal2 v15.4s, v4.8h, v2.h[3]      \n"
                    "smlal  v16.4s, v4.4h, v2.h[4]      \n"
                    "smlal  v18.4s, v4.4h, v2.h[5]      \n"
                    "smlal2 v17.4s, v4.8h, v2.h[4]      \n"
                    "smlal2 v19.4s, v4.8h, v2.h[5]      \n"
                    "smlal  v20.4s, v4.4h, v2.h[6]      \n"
                    "smlal  v22.4s, v4.4h, v2.h[7]      \n"
                    "smlal2 v21.4s, v4.8h, v2.h[6]      \n"
                    "smlal2 v23.4s, v4.8h, v2.h[7]      \n"
                    "smlal  v24.4s, v4.4h, v3.h[0]      \n"
                    "smlal  v26.4s, v4.4h, v3.h[1]      \n"
                    "smlal2 v25.4s, v4.8h, v3.h[0]      \n"
                    "smlal2 v27.4s, v4.8h, v3.h[1]      \n"
                    "smlal  v28.4s, v4.4h, v3.h[2]      \n"
                    "smlal  v30.4s, v4.4h, v3.h[3]      \n"
                    "smlal2 v29.4s, v4.8h, v3.h[2]      \n"
                    "smlal2 v31.4s, v4.8h, v3.h[3]      \n"
                    "smlal  v8.4s, v5.4h, v3.h[4]       \n"
                    "smlal  v10.4s, v5.4h, v3.h[5]      \n"
                    "smlal2 v9.4s, v5.8h, v3.h[4]       \n"
                    "smlal2 v11.4s, v5.8h, v3.h[5]      \n"
                    "smlal  v12.4s, v5.4h, v3.h[6]      \n"
                    "smlal  v14.4s, v5.4h, v3.h[7]      \n"
                    "smlal2 v13.4s, v5.8h, v3.h[6]      \n"
                    "smlal2 v15.4s, v5.8h, v3.h[7]      \n"
                    "smlal  v16.4s, v5.4h, v0.h[0]      \n"
                    "smlal  v18.4s, v5.4h, v0.h[1]      \n"
                    "ld1    {v2.8h, v3.8h}, [%2], #32   \n"
                    "smlal2 v17.4s, v5.8h, v0.h[0]      \n"
                    "smlal2 v19.4s, v5.8h, v0.h[1]      \n"
                    "smlal  v20.4s, v5.4h, v0.h[2]      \n"
                    "smlal  v22.4s, v5.4h, v0.h[3]      \n"
                    "smlal2 v21.4s, v5.8h, v0.h[2]      \n"
                    "smlal2 v23.4s, v5.8h, v0.h[3]      \n"
                    "smlal  v24.4s, v5.4h, v0.h[4]      \n"
                    "smlal  v26.4s, v5.4h, v0.h[5]      \n"
                    "smlal2 v25.4s, v5.8h, v0.h[4]      \n"
                    "smlal2 v27.4s, v5.8h, v0.h[5]      \n"
                    "smlal  v28.4s, v5.4h, v0.h[6]      \n"
                    "smlal  v30.4s, v5.4h, v0.h[7]      \n"
                    "smlal2 v29.4s, v5.8h, v0.h[6]      \n"
                    "smlal2 v31.4s, v5.8h, v0.h[7]      \n"
                    "smlal  v8.4s, v6.4h, v1.h[0]       \n"
                    "smlal  v10.4s, v6.4h, v1.h[1]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.8h, v5.8h}, [%1], #32   \n"
                    "smlal2 v9.4s, v6.8h, v1.h[0]       \n"
                    "smlal2 v11.4s, v6.8h, v1.h[1]      \n"
                    "smlal  v12.4s, v6.4h, v1.h[2]      \n"
                    "smlal  v14.4s, v6.4h, v1.h[3]      \n"
                    "smlal2 v13.4s, v6.8h, v1.h[2]      \n"
                    "smlal2 v15.4s, v6.8h, v1.h[3]      \n"
                    "smlal  v16.4s, v6.4h, v1.h[4]      \n"
                    "smlal  v18.4s, v6.4h, v1.h[5]      \n"
                    "smlal2 v17.4s, v6.8h, v1.h[4]      \n"
                    "smlal2 v19.4s, v6.8h, v1.h[5]      \n"
                    "smlal  v20.4s, v6.4h, v1.h[6]      \n"
                    "smlal  v22.4s, v6.4h, v1.h[7]      \n"
                    "smlal2 v21.4s, v6.8h, v1.h[6]      \n"
                    "smlal2 v23.4s, v6.8h, v1.h[7]      \n"
                    "smlal  v24.4s, v6.4h, v2.h[0]      \n"
                    "smlal  v26.4s, v6.4h, v2.h[1]      \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.8h, v1.8h}, [%2], #32   \n"
                    "smlal2 v25.4s, v6.8h, v2.h[0]      \n"
                    "smlal2 v27.4s, v6.8h, v2.h[1]      \n"
                    "smlal  v28.4s, v6.4h, v2.h[2]      \n"
                    "smlal  v30.4s, v6.4h, v2.h[3]      \n"
                    "smlal2 v29.4s, v6.8h, v2.h[2]      \n"
                    "smlal2 v31.4s, v6.8h, v2.h[3]      \n"
                    "smlal  v8.4s, v7.4h, v2.h[4]       \n"
                    "smlal  v10.4s, v7.4h, v2.h[5]      \n"
                    "smlal2 v9.4s, v7.8h, v2.h[4]       \n"
                    "smlal2 v11.4s, v7.8h, v2.h[5]      \n"
                    "smlal  v12.4s, v7.4h, v2.h[6]      \n"
                    "smlal  v14.4s, v7.4h, v2.h[7]      \n"
                    "smlal2 v13.4s, v7.8h, v2.h[6]      \n"
                    "smlal2 v15.4s, v7.8h, v2.h[7]      \n"
                    "smlal  v16.4s, v7.4h, v3.h[0]      \n"
                    "smlal  v18.4s, v7.4h, v3.h[1]      \n"
                    "smlal2 v17.4s, v7.8h, v3.h[0]      \n"
                    "smlal2 v19.4s, v7.8h, v3.h[1]      \n"
                    "smlal  v20.4s, v7.4h, v3.h[2]      \n"
                    "smlal  v22.4s, v7.4h, v3.h[3]      \n"
                    "smlal2 v21.4s, v7.8h, v3.h[2]      \n"
                    "smlal2 v23.4s, v7.8h, v3.h[3]      \n"
                    "smlal  v24.4s, v7.4h, v3.h[4]      \n"
                    "smlal  v26.4s, v7.4h, v3.h[5]      \n"
                    "smlal2 v25.4s, v7.8h, v3.h[4]      \n"
                    "smlal2 v27.4s, v7.8h, v3.h[5]      \n"
                    "subs   w4, w4, #1                  \n"
                    "smlal  v28.4s, v7.4h, v3.h[6]      \n"
                    "smlal  v30.4s, v7.4h, v3.h[7]      \n"
                    "smlal2 v29.4s, v7.8h, v3.h[6]      \n"
                    "smlal2 v31.4s, v7.8h, v3.h[7]      \n"
                    "bne    2b                          \n"
                    "sub    %1, %1, #32                 \n"
                    "sub    %2, %2, #32                 \n"

                    "3:                                 \n"
                    "and    w4, %w6, #7                 \n" // w4 = remain = max_kk & 7
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    "4:                                 \n"
                    "ld1    {v4.8h}, [%1], #16          \n"
                    "ld1    {v0.4h, v1.4h, v2.4h}, [%2], #24 \n"
                    "smlal  v8.4s, v4.4h, v0.h[0]       \n"
                    "smlal  v10.4s, v4.4h, v0.h[1]      \n"
                    "smlal2 v9.4s, v4.8h, v0.h[0]       \n"
                    "smlal2 v11.4s, v4.8h, v0.h[1]      \n"
                    "smlal  v12.4s, v4.4h, v0.h[2]      \n"
                    "smlal  v14.4s, v4.4h, v0.h[3]      \n"
                    "smlal2 v13.4s, v4.8h, v0.h[2]      \n"
                    "smlal2 v15.4s, v4.8h, v0.h[3]      \n"
                    "smlal  v16.4s, v4.4h, v1.h[0]      \n"
                    "smlal  v18.4s, v4.4h, v1.h[1]      \n"
                    "smlal2 v17.4s, v4.8h, v1.h[0]      \n"
                    "smlal2 v19.4s, v4.8h, v1.h[1]      \n"
                    "smlal  v20.4s, v4.4h, v1.h[2]      \n"
                    "smlal  v22.4s, v4.4h, v1.h[3]      \n"
                    "smlal2 v21.4s, v4.8h, v1.h[2]      \n"
                    "smlal2 v23.4s, v4.8h, v1.h[3]      \n"
                    "smlal  v24.4s, v4.4h, v2.h[0]      \n"
                    "smlal  v26.4s, v4.4h, v2.h[1]      \n"
                    "smlal2 v25.4s, v4.8h, v2.h[0]      \n"
                    "smlal2 v27.4s, v4.8h, v2.h[1]      \n"
                    "subs   w4, w4, #1                  \n"
                    "smlal  v28.4s, v4.4h, v2.h[2]      \n"
                    "smlal  v30.4s, v4.4h, v2.h[3]      \n"
                    "smlal2 v29.4s, v4.8h, v2.h[2]      \n"
                    "smlal2 v31.4s, v4.8h, v2.h[3]      \n"
                    "bne    4b                          \n"

                    "5:                                 \n"
                    "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                    "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
                // NEON-intrinsics variant of the 8x12 tile (the #else branch of
                // NCNN_GNU_INLINE_ASM). 24 int32x4 accumulators hold 96 int32 results.
                // Accumulators come in pairs per B element: the even one (_sum0, _sum2, ...)
                // takes A lanes 0-3, the odd one (_sum1, _sum3, ...) takes A lanes 4-7.
                // Names run 0-9 and then a-n, so _sum0/_sum1 belong to B element 0 and
                // _summ/_sumn to B element 11.
                int32x4_t _sum0;
                int32x4_t _sum1;
                int32x4_t _sum2;
                int32x4_t _sum3;
                int32x4_t _sum4;
                int32x4_t _sum5;
                int32x4_t _sum6;
                int32x4_t _sum7;
                int32x4_t _sum8;
                int32x4_t _sum9;
                int32x4_t _suma;
                int32x4_t _sumb;
                int32x4_t _sumc;
                int32x4_t _sumd;
                int32x4_t _sume;
                int32x4_t _sumf;
                int32x4_t _sumg;
                int32x4_t _sumh;
                int32x4_t _sumi;
                int32x4_t _sumj;
                int32x4_t _sumk;
                int32x4_t _suml;
                int32x4_t _summ;
                int32x4_t _sumn;

                // k == 0: start the tile from zero.
                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                    _sum1 = vdupq_n_s32(0);
                    _sum2 = vdupq_n_s32(0);
                    _sum3 = vdupq_n_s32(0);
                    _sum4 = vdupq_n_s32(0);
                    _sum5 = vdupq_n_s32(0);
                    _sum6 = vdupq_n_s32(0);
                    _sum7 = vdupq_n_s32(0);
                    _sum8 = vdupq_n_s32(0);
                    _sum9 = vdupq_n_s32(0);
                    _suma = vdupq_n_s32(0);
                    _sumb = vdupq_n_s32(0);
                    _sumc = vdupq_n_s32(0);
                    _sumd = vdupq_n_s32(0);
                    _sume = vdupq_n_s32(0);
                    _sumf = vdupq_n_s32(0);
                    _sumg = vdupq_n_s32(0);
                    _sumh = vdupq_n_s32(0);
                    _sumi = vdupq_n_s32(0);
                    _sumj = vdupq_n_s32(0);
                    _sumk = vdupq_n_s32(0);
                    _suml = vdupq_n_s32(0);
                    _summ = vdupq_n_s32(0);
                    _sumn = vdupq_n_s32(0);
                }
                // k != 0: continue from the 96 int32 values already stored at outptr
                // (presumably the partial sums of an earlier K block -- confirm with caller).
                else
                {
                    _sum0 = vld1q_s32(outptr);
                    _sum1 = vld1q_s32(outptr + 4);
                    _sum2 = vld1q_s32(outptr + 8);
                    _sum3 = vld1q_s32(outptr + 12);
                    _sum4 = vld1q_s32(outptr + 16);
                    _sum5 = vld1q_s32(outptr + 20);
                    _sum6 = vld1q_s32(outptr + 24);
                    _sum7 = vld1q_s32(outptr + 28);
                    _sum8 = vld1q_s32(outptr + 32);
                    _sum9 = vld1q_s32(outptr + 36);
                    _suma = vld1q_s32(outptr + 40);
                    _sumb = vld1q_s32(outptr + 44);
                    _sumc = vld1q_s32(outptr + 48);
                    _sumd = vld1q_s32(outptr + 52);
                    _sume = vld1q_s32(outptr + 56);
                    _sumf = vld1q_s32(outptr + 60);
                    _sumg = vld1q_s32(outptr + 64);
                    _sumh = vld1q_s32(outptr + 68);
                    _sumi = vld1q_s32(outptr + 72);
                    _sumj = vld1q_s32(outptr + 76);
                    _sumk = vld1q_s32(outptr + 80);
                    _suml = vld1q_s32(outptr + 84);
                    _summ = vld1q_s32(outptr + 88);
                    _sumn = vld1q_s32(outptr + 92);
                }

                // One kk step consumes 8 int16 from pA and 12 int16 from pB and adds
                // their widened outer product to the accumulators.
                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x8_t _pA = vld1q_s16(pA);
                    int16x8_t _pB = vld1q_s16(pB);      // B elements 0-7
                    int16x4_t _pB2 = vld1_s16(pB + 8);  // B elements 8-11
                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_pA), _pB, 0);
                    _sum1 = vmlal_laneq_s16(_sum1, vget_high_s16(_pA), _pB, 0);
                    _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_pA), _pB, 1);
                    _sum3 = vmlal_laneq_s16(_sum3, vget_high_s16(_pA), _pB, 1);
                    _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_pA), _pB, 2);
                    _sum5 = vmlal_laneq_s16(_sum5, vget_high_s16(_pA), _pB, 2);
                    _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_pA), _pB, 3);
                    _sum7 = vmlal_laneq_s16(_sum7, vget_high_s16(_pA), _pB, 3);
                    _sum8 = vmlal_laneq_s16(_sum8, vget_low_s16(_pA), _pB, 4);
                    _sum9 = vmlal_laneq_s16(_sum9, vget_high_s16(_pA), _pB, 4);
                    _suma = vmlal_laneq_s16(_suma, vget_low_s16(_pA), _pB, 5);
                    _sumb = vmlal_laneq_s16(_sumb, vget_high_s16(_pA), _pB, 5);
                    _sumc = vmlal_laneq_s16(_sumc, vget_low_s16(_pA), _pB, 6);
                    _sumd = vmlal_laneq_s16(_sumd, vget_high_s16(_pA), _pB, 6);
                    _sume = vmlal_laneq_s16(_sume, vget_low_s16(_pA), _pB, 7);
                    _sumf = vmlal_laneq_s16(_sumf, vget_high_s16(_pA), _pB, 7);
                    // The last four B elements live in a 64-bit vector, hence the
                    // non-q lane variant.
                    _sumg = vmlal_lane_s16(_sumg, vget_low_s16(_pA), _pB2, 0);
                    _sumh = vmlal_lane_s16(_sumh, vget_high_s16(_pA), _pB2, 0);
                    _sumi = vmlal_lane_s16(_sumi, vget_low_s16(_pA), _pB2, 1);
                    _sumj = vmlal_lane_s16(_sumj, vget_high_s16(_pA), _pB2, 1);
                    _sumk = vmlal_lane_s16(_sumk, vget_low_s16(_pA), _pB2, 2);
                    _suml = vmlal_lane_s16(_suml, vget_high_s16(_pA), _pB2, 2);
                    _summ = vmlal_lane_s16(_summ, vget_low_s16(_pA), _pB2, 3);
                    _sumn = vmlal_lane_s16(_sumn, vget_high_s16(_pA), _pB2, 3);
                    pA += 8;
                    pB += 12;
                }

                // Write the tile back in the same order it was loaded and step to the
                // next 96-element tile.
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                vst1q_s32(outptr + 8, _sum2);
                vst1q_s32(outptr + 12, _sum3);
                vst1q_s32(outptr + 16, _sum4);
                vst1q_s32(outptr + 20, _sum5);
                vst1q_s32(outptr + 24, _sum6);
                vst1q_s32(outptr + 28, _sum7);
                vst1q_s32(outptr + 32, _sum8);
                vst1q_s32(outptr + 36, _sum9);
                vst1q_s32(outptr + 40, _suma);
                vst1q_s32(outptr + 44, _sumb);
                vst1q_s32(outptr + 48, _sumc);
                vst1q_s32(outptr + 52, _sumd);
                vst1q_s32(outptr + 56, _sume);
                vst1q_s32(outptr + 60, _sumf);
                vst1q_s32(outptr + 64, _sumg);
                vst1q_s32(outptr + 68, _sumh);
                vst1q_s32(outptr + 72, _sumi);
                vst1q_s32(outptr + 76, _sumj);
                vst1q_s32(outptr + 80, _sumk);
                vst1q_s32(outptr + 84, _suml);
                vst1q_s32(outptr + 88, _summ);
                vst1q_s32(outptr + 92, _sumn);
                outptr += 96;
#endif // NCNN_GNU_INLINE_ASM
            }
            // 8x8 tile: while at least 8 B elements remain in this jj range, accumulate
            // the widened int16 outer product of 8 A values and 8 B values over max_kk
            // steps into 64 int32 results at outptr. pA restarts from pAT for every tile;
            // pB and outptr are declared outside this loop and keep advancing.
            // Both preprocessor branches implement the same computation and the same
            // output layout: for each B element, A lanes 0-3 then A lanes 4-7.
            for (; jj + 7 < max_jj; jj += 8)
            {
                const short* pA = pAT;

#if NCNN_GNU_INLINE_ASM
                // Register use: v16-v31 = accumulators (v16/v17 <-> B element 0, ...,
                // v30/v31 <-> B element 7), v4-v7 = A data, v0-v3 = B data, w4 = counter.
                asm volatile(
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "cmp    %w7, #0                     \n" // k == 0 -> start from zero
                    "beq    0f                          \n"

                    // k != 0: reload the 64 int32 already stored at outptr, then rewind
                    // %0 by the 3 * 64 bytes of post-increment applied below.
                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    "sub    %0, %0, #192                \n"
                    "b      1f                          \n"

                    "0:                                 \n"
                    "eor    v16.16b, v16.16b, v16.16b   \n"
                    "eor    v17.16b, v17.16b, v17.16b   \n"
                    "eor    v18.16b, v18.16b, v18.16b   \n"
                    "eor    v19.16b, v19.16b, v19.16b   \n"
                    "eor    v20.16b, v20.16b, v20.16b   \n"
                    "eor    v21.16b, v21.16b, v21.16b   \n"
                    "eor    v22.16b, v22.16b, v22.16b   \n"
                    "eor    v23.16b, v23.16b, v23.16b   \n"
                    "eor    v24.16b, v24.16b, v24.16b   \n"
                    "eor    v25.16b, v25.16b, v25.16b   \n"
                    "eor    v26.16b, v26.16b, v26.16b   \n"
                    "eor    v27.16b, v27.16b, v27.16b   \n"
                    "eor    v28.16b, v28.16b, v28.16b   \n"
                    "eor    v29.16b, v29.16b, v29.16b   \n"
                    "eor    v30.16b, v30.16b, v30.16b   \n"
                    "eor    v31.16b, v31.16b, v31.16b   \n"

                    "1:                                 \n"
                    "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    3f                          \n"

                    // Main loop, 4 kk steps per iteration, software pipelined:
                    // steps 0-1 (v4/v5 x v0/v1) are loaded ahead of use, here for the
                    // first iteration and inside the loop body for every following one.
                    "ld1    {v4.8h, v5.8h}, [%1], #32   \n"
                    "ld1    {v0.8h, v1.8h}, [%2], #32   \n"
                    ".align 4                           \n"
                    "2:                                 \n"
                    // kk step 0: A = v4, B = v0 (steps 2-3 are fetched into v6/v7, v2/v3)
                    "smlal  v16.4s, v4.4h, v0.h[0]      \n"
                    "smlal  v18.4s, v4.4h, v0.h[1]      \n"
                    "ld1    {v6.8h, v7.8h}, [%1], #32   \n"
                    "smlal2 v17.4s, v4.8h, v0.h[0]      \n"
                    "smlal2 v19.4s, v4.8h, v0.h[1]      \n"
                    "ld1    {v2.8h, v3.8h}, [%2], #32   \n"
                    "smlal  v20.4s, v4.4h, v0.h[2]      \n"
                    "smlal  v22.4s, v4.4h, v0.h[3]      \n"
                    "smlal2 v21.4s, v4.8h, v0.h[2]      \n"
                    "smlal2 v23.4s, v4.8h, v0.h[3]      \n"
                    "smlal  v24.4s, v4.4h, v0.h[4]      \n"
                    "smlal  v26.4s, v4.4h, v0.h[5]      \n"
                    "smlal2 v25.4s, v4.8h, v0.h[4]      \n"
                    "smlal2 v27.4s, v4.8h, v0.h[5]      \n"
                    "smlal  v28.4s, v4.4h, v0.h[6]      \n"
                    "smlal  v30.4s, v4.4h, v0.h[7]      \n"
                    "smlal2 v29.4s, v4.8h, v0.h[6]      \n"
                    "smlal2 v31.4s, v4.8h, v0.h[7]      \n"
                    // kk step 1: A = v5, B = v1
                    "smlal  v16.4s, v5.4h, v1.h[0]      \n"
                    "smlal  v18.4s, v5.4h, v1.h[1]      \n"
                    "smlal2 v17.4s, v5.8h, v1.h[0]      \n"
                    "smlal2 v19.4s, v5.8h, v1.h[1]      \n"
                    "smlal  v20.4s, v5.4h, v1.h[2]      \n"
                    "smlal  v22.4s, v5.4h, v1.h[3]      \n"
                    "smlal2 v21.4s, v5.8h, v1.h[2]      \n"
                    "smlal2 v23.4s, v5.8h, v1.h[3]      \n"
                    "smlal  v24.4s, v5.4h, v1.h[4]      \n"
                    "smlal  v26.4s, v5.4h, v1.h[5]      \n"
                    "smlal2 v25.4s, v5.8h, v1.h[4]      \n"
                    "smlal2 v27.4s, v5.8h, v1.h[5]      \n"
                    "smlal  v28.4s, v5.4h, v1.h[6]      \n"
                    "smlal  v30.4s, v5.4h, v1.h[7]      \n"
                    "smlal2 v29.4s, v5.8h, v1.h[6]      \n"
                    "smlal2 v31.4s, v5.8h, v1.h[7]      \n"
                    // kk step 2: A = v6, B = v2. v4/v5 and v0/v1 are free again, so the
                    // next iteration's steps 0-1 are loaded here.
                    "smlal  v16.4s, v6.4h, v2.h[0]      \n"
                    "smlal  v18.4s, v6.4h, v2.h[1]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.8h, v5.8h}, [%1], #32   \n"
                    "smlal2 v17.4s, v6.8h, v2.h[0]      \n"
                    "smlal2 v19.4s, v6.8h, v2.h[1]      \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.8h, v1.8h}, [%2], #32   \n"
                    "smlal  v20.4s, v6.4h, v2.h[2]      \n"
                    "smlal  v22.4s, v6.4h, v2.h[3]      \n"
                    "smlal2 v21.4s, v6.8h, v2.h[2]      \n"
                    "smlal2 v23.4s, v6.8h, v2.h[3]      \n"
                    "smlal  v24.4s, v6.4h, v2.h[4]      \n"
                    "smlal  v26.4s, v6.4h, v2.h[5]      \n"
                    "smlal2 v25.4s, v6.8h, v2.h[4]      \n"
                    "smlal2 v27.4s, v6.8h, v2.h[5]      \n"
                    "smlal  v28.4s, v6.4h, v2.h[6]      \n"
                    "smlal  v30.4s, v6.4h, v2.h[7]      \n"
                    "smlal2 v29.4s, v6.8h, v2.h[6]      \n"
                    "smlal2 v31.4s, v6.8h, v2.h[7]      \n"
                    // kk step 3: A = v7, B = v3
                    "smlal  v16.4s, v7.4h, v3.h[0]      \n"
                    "smlal  v18.4s, v7.4h, v3.h[1]      \n"
                    "smlal2 v17.4s, v7.8h, v3.h[0]      \n"
                    "smlal2 v19.4s, v7.8h, v3.h[1]      \n"
                    "smlal  v20.4s, v7.4h, v3.h[2]      \n"
                    "smlal  v22.4s, v7.4h, v3.h[3]      \n"
                    "smlal2 v21.4s, v7.8h, v3.h[2]      \n"
                    "smlal2 v23.4s, v7.8h, v3.h[3]      \n"
                    "subs   w4, w4, #1                  \n" // sets the flags tested by bne below
                    "smlal  v24.4s, v7.4h, v3.h[4]      \n"
                    "smlal  v26.4s, v7.4h, v3.h[5]      \n"
                    "smlal2 v25.4s, v7.8h, v3.h[4]      \n"
                    "smlal2 v27.4s, v7.8h, v3.h[5]      \n"
                    "smlal  v28.4s, v7.4h, v3.h[6]      \n"
                    "smlal  v30.4s, v7.4h, v3.h[7]      \n"
                    "smlal2 v29.4s, v7.8h, v3.h[6]      \n"
                    "smlal2 v31.4s, v7.8h, v3.h[7]      \n"
                    "bne    2b                          \n"
                    // The last iteration loaded 32 bytes of A and B it never used;
                    // step both pointers back so the tail loop starts at the right place.
                    // NOTE(review): that final look-ahead load touches 32 bytes beyond the
                    // data this loop consumes -- assumes both buffers are readable there;
                    // confirm against the allocator's padding.
                    "sub    %1, %1, #32                 \n"
                    "sub    %2, %2, #32                 \n"

                    "3:                                 \n"
                    "and    w4, %w6, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // Tail loop: one kk step (8 A values x 8 B values) per iteration.
                    "4:                                 \n"
                    "ld1    {v4.8h}, [%1], #16          \n"
                    "ld1    {v0.8h}, [%2], #16          \n"
                    "smlal  v16.4s, v4.4h, v0.h[0]      \n"
                    "smlal  v18.4s, v4.4h, v0.h[1]      \n"
                    "smlal2 v17.4s, v4.8h, v0.h[0]      \n"
                    "smlal2 v19.4s, v4.8h, v0.h[1]      \n"
                    "smlal  v20.4s, v4.4h, v0.h[2]      \n"
                    "smlal  v22.4s, v4.4h, v0.h[3]      \n"
                    "smlal2 v21.4s, v4.8h, v0.h[2]      \n"
                    "smlal2 v23.4s, v4.8h, v0.h[3]      \n"
                    "subs   w4, w4, #1                  \n"
                    "smlal  v24.4s, v4.4h, v0.h[4]      \n"
                    "smlal  v26.4s, v4.4h, v0.h[5]      \n"
                    "smlal2 v25.4s, v4.8h, v0.h[4]      \n"
                    "smlal2 v27.4s, v4.8h, v0.h[5]      \n"
                    "smlal  v28.4s, v4.4h, v0.h[6]      \n"
                    "smlal  v30.4s, v4.4h, v0.h[7]      \n"
                    "smlal2 v29.4s, v4.8h, v0.h[6]      \n"
                    "smlal2 v31.4s, v4.8h, v0.h[7]      \n"
                    "bne    4b                          \n"

                    // Store the tile; the four post-increments leave %0 256 bytes further
                    // on, i.e. at the next tile.
                    "5:                                 \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr), // %3, tied to %0
                    "1"(pA),       // %4, tied to %1
                    "2"(pB),       // %5, tied to %2
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    // NOTE(review): v8-v15 are listed below although this asm statement
                    // only touches v0-v7 and v16-v31; harmless, merely conservative.
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
                // Intrinsics variant: _sum0/_sum1 belong to B element 0 (A lanes 0-3 /
                // A lanes 4-7), _sum2/_sum3 to B element 1, ... _sume/_sumf to B element 7.
                int32x4_t _sum0;
                int32x4_t _sum1;
                int32x4_t _sum2;
                int32x4_t _sum3;
                int32x4_t _sum4;
                int32x4_t _sum5;
                int32x4_t _sum6;
                int32x4_t _sum7;
                int32x4_t _sum8;
                int32x4_t _sum9;
                int32x4_t _suma;
                int32x4_t _sumb;
                int32x4_t _sumc;
                int32x4_t _sumd;
                int32x4_t _sume;
                int32x4_t _sumf;

                // k == 0: start the tile from zero.
                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                    _sum1 = vdupq_n_s32(0);
                    _sum2 = vdupq_n_s32(0);
                    _sum3 = vdupq_n_s32(0);
                    _sum4 = vdupq_n_s32(0);
                    _sum5 = vdupq_n_s32(0);
                    _sum6 = vdupq_n_s32(0);
                    _sum7 = vdupq_n_s32(0);
                    _sum8 = vdupq_n_s32(0);
                    _sum9 = vdupq_n_s32(0);
                    _suma = vdupq_n_s32(0);
                    _sumb = vdupq_n_s32(0);
                    _sumc = vdupq_n_s32(0);
                    _sumd = vdupq_n_s32(0);
                    _sume = vdupq_n_s32(0);
                    _sumf = vdupq_n_s32(0);
                }
                // k != 0: continue from the 64 int32 values already stored at outptr
                // (presumably the partial sums of an earlier K block -- confirm with caller).
                else
                {
                    _sum0 = vld1q_s32(outptr);
                    _sum1 = vld1q_s32(outptr + 4);
                    _sum2 = vld1q_s32(outptr + 8);
                    _sum3 = vld1q_s32(outptr + 12);
                    _sum4 = vld1q_s32(outptr + 16);
                    _sum5 = vld1q_s32(outptr + 20);
                    _sum6 = vld1q_s32(outptr + 24);
                    _sum7 = vld1q_s32(outptr + 28);
                    _sum8 = vld1q_s32(outptr + 32);
                    _sum9 = vld1q_s32(outptr + 36);
                    _suma = vld1q_s32(outptr + 40);
                    _sumb = vld1q_s32(outptr + 44);
                    _sumc = vld1q_s32(outptr + 48);
                    _sumd = vld1q_s32(outptr + 52);
                    _sume = vld1q_s32(outptr + 56);
                    _sumf = vld1q_s32(outptr + 60);
                }

                // One kk step consumes 8 int16 from pA and 8 int16 from pB.
                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x8_t _pA = vld1q_s16(pA);
                    int16x8_t _pB = vld1q_s16(pB);
                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_pA), _pB, 0);
                    _sum1 = vmlal_laneq_s16(_sum1, vget_high_s16(_pA), _pB, 0);
                    _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_pA), _pB, 1);
                    _sum3 = vmlal_laneq_s16(_sum3, vget_high_s16(_pA), _pB, 1);
                    _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_pA), _pB, 2);
                    _sum5 = vmlal_laneq_s16(_sum5, vget_high_s16(_pA), _pB, 2);
                    _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_pA), _pB, 3);
                    _sum7 = vmlal_laneq_s16(_sum7, vget_high_s16(_pA), _pB, 3);
                    _sum8 = vmlal_laneq_s16(_sum8, vget_low_s16(_pA), _pB, 4);
                    _sum9 = vmlal_laneq_s16(_sum9, vget_high_s16(_pA), _pB, 4);
                    _suma = vmlal_laneq_s16(_suma, vget_low_s16(_pA), _pB, 5);
                    _sumb = vmlal_laneq_s16(_sumb, vget_high_s16(_pA), _pB, 5);
                    _sumc = vmlal_laneq_s16(_sumc, vget_low_s16(_pA), _pB, 6);
                    _sumd = vmlal_laneq_s16(_sumd, vget_high_s16(_pA), _pB, 6);
                    _sume = vmlal_laneq_s16(_sume, vget_low_s16(_pA), _pB, 7);
                    _sumf = vmlal_laneq_s16(_sumf, vget_high_s16(_pA), _pB, 7);
                    pA += 8;
                    pB += 8;
                }

                // Write the tile back in load order and step to the next 64-element tile.
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                vst1q_s32(outptr + 8, _sum2);
                vst1q_s32(outptr + 12, _sum3);
                vst1q_s32(outptr + 16, _sum4);
                vst1q_s32(outptr + 20, _sum5);
                vst1q_s32(outptr + 24, _sum6);
                vst1q_s32(outptr + 28, _sum7);
                vst1q_s32(outptr + 32, _sum8);
                vst1q_s32(outptr + 36, _sum9);
                vst1q_s32(outptr + 40, _suma);
                vst1q_s32(outptr + 44, _sumb);
                vst1q_s32(outptr + 48, _sumc);
                vst1q_s32(outptr + 52, _sumd);
                vst1q_s32(outptr + 56, _sume);
                vst1q_s32(outptr + 60, _sumf);
                outptr += 64;
#endif // NCNN_GNU_INLINE_ASM
            }
#endif // __aarch64__
            // 8x6 tile: every k-step reads 8 int16 values from pA and 6 int16 values from pB and
            // accumulates the 8x6 widening products into 12 int32x4 accumulators (48 int32 values)
            // that are written to outptr; outptr advances by 48 ints per tile.
            // When k == 0 the accumulators start at zero, otherwise they are reloaded from outptr
            // (presumably partial sums from an earlier k-chunk - confirm against the caller).
            // pA restarts at pAT for each tile, while pB is not reset here and keeps advancing.
            for (; jj + 5 < max_jj; jj += 6)
            {
                const short* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                // AArch64 path. Accumulators are v20-v31: for B lane j (0..5) the pair
                // (v20+2j, v21+2j) holds the low/high four A lanes, matching _sum0.._sumb
                // of the intrinsic fallback below. w4 is the scratch loop counter.
                asm volatile(
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "prfm   pldl1keep, [%2, #384]       \n"
                    "cmp    %w7, #0                     \n"
                    "beq    0f                          \n"

                    // k != 0: reload the 12 accumulators, then rewind outptr to the tile start
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    "sub    %0, %0, #128                \n"
                    "b      1f                          \n"

                    // k == 0: clear the 12 accumulators
                    "0:                                 \n"
                    "eor    v20.16b, v20.16b, v20.16b   \n"
                    "eor    v21.16b, v21.16b, v21.16b   \n"
                    "eor    v22.16b, v22.16b, v22.16b   \n"
                    "eor    v23.16b, v23.16b, v23.16b   \n"
                    "eor    v24.16b, v24.16b, v24.16b   \n"
                    "eor    v25.16b, v25.16b, v25.16b   \n"
                    "eor    v26.16b, v26.16b, v26.16b   \n"
                    "eor    v27.16b, v27.16b, v27.16b   \n"
                    "eor    v28.16b, v28.16b, v28.16b   \n"
                    "eor    v29.16b, v29.16b, v29.16b   \n"
                    "eor    v30.16b, v30.16b, v30.16b   \n"
                    "eor    v31.16b, v31.16b, v31.16b   \n"

                    "1:                                 \n"
                    "lsr    w4, %w6, #3                 \n" // w4 = max_kk >> 3
                    "cmp    w4, #0                      \n"
                    "beq    3f                          \n"

                    // preload the first two A vectors (k0,k1) and the first 16 B halfwords
                    "ld1    {v6.8h, v7.8h}, [%1], #32   \n"
                    "ld1    {v0.8h, v1.8h}, [%2], #32   \n"
                    ".align 4                           \n"
                    // main loop: 8 k-steps per iteration (128 bytes of A, 96 bytes of B);
                    // A alternates between v6/v7 and v8/v9, B cycles through v0-v5, and the
                    // vectors for the next iteration are loaded before the branch
                    "2:                                 \n"
                    "smlal  v20.4s, v6.4h, v0.h[0]      \n"
                    "smlal  v22.4s, v6.4h, v0.h[1]      \n"
                    "ld1    {v8.8h, v9.8h}, [%1], #32   \n"
                    "smlal2 v21.4s, v6.8h, v0.h[0]      \n"
                    "smlal2 v23.4s, v6.8h, v0.h[1]      \n"
                    "ld1    {v2.8h, v3.8h}, [%2], #32   \n"
                    "smlal  v24.4s, v6.4h, v0.h[2]      \n"
                    "smlal  v26.4s, v6.4h, v0.h[3]      \n"
                    "smlal2 v25.4s, v6.8h, v0.h[2]      \n"
                    "smlal2 v27.4s, v6.8h, v0.h[3]      \n"
                    "smlal  v28.4s, v6.4h, v0.h[4]      \n"
                    "smlal  v30.4s, v6.4h, v0.h[5]      \n"
                    "smlal2 v29.4s, v6.8h, v0.h[4]      \n"
                    "smlal2 v31.4s, v6.8h, v0.h[5]      \n"
                    "smlal  v20.4s, v7.4h, v0.h[6]      \n"
                    "smlal  v22.4s, v7.4h, v0.h[7]      \n"
                    "smlal2 v21.4s, v7.8h, v0.h[6]      \n"
                    "smlal2 v23.4s, v7.8h, v0.h[7]      \n"
                    "smlal  v24.4s, v7.4h, v1.h[0]      \n"
                    "smlal  v26.4s, v7.4h, v1.h[1]      \n"
                    "smlal2 v25.4s, v7.8h, v1.h[0]      \n"
                    "smlal2 v27.4s, v7.8h, v1.h[1]      \n"
                    "smlal  v28.4s, v7.4h, v1.h[2]      \n"
                    "smlal  v30.4s, v7.4h, v1.h[3]      \n"
                    "smlal2 v29.4s, v7.8h, v1.h[2]      \n"
                    "smlal2 v31.4s, v7.8h, v1.h[3]      \n"
                    "smlal  v20.4s, v8.4h, v1.h[4]      \n"
                    "smlal  v22.4s, v8.4h, v1.h[5]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v6.8h, v7.8h}, [%1], #32   \n"
                    "smlal2 v21.4s, v8.8h, v1.h[4]      \n"
                    "smlal2 v23.4s, v8.8h, v1.h[5]      \n"
                    "smlal  v24.4s, v8.4h, v1.h[6]      \n"
                    "smlal  v26.4s, v8.4h, v1.h[7]      \n"
                    "smlal2 v25.4s, v8.8h, v1.h[6]      \n"
                    "smlal2 v27.4s, v8.8h, v1.h[7]      \n"
                    "smlal  v28.4s, v8.4h, v2.h[0]      \n"
                    "smlal  v30.4s, v8.4h, v2.h[1]      \n"
                    "ld1    {v4.8h, v5.8h}, [%2], #32   \n"
                    "smlal2 v29.4s, v8.8h, v2.h[0]      \n"
                    "smlal2 v31.4s, v8.8h, v2.h[1]      \n"
                    "smlal  v20.4s, v9.4h, v2.h[2]      \n"
                    "smlal  v22.4s, v9.4h, v2.h[3]      \n"
                    "smlal2 v21.4s, v9.8h, v2.h[2]      \n"
                    "smlal2 v23.4s, v9.8h, v2.h[3]      \n"
                    "smlal  v24.4s, v9.4h, v2.h[4]      \n"
                    "smlal  v26.4s, v9.4h, v2.h[5]      \n"
                    "smlal2 v25.4s, v9.8h, v2.h[4]      \n"
                    "smlal2 v27.4s, v9.8h, v2.h[5]      \n"
                    "smlal  v28.4s, v9.4h, v2.h[6]      \n"
                    "smlal  v30.4s, v9.4h, v2.h[7]      \n"
                    "smlal2 v29.4s, v9.8h, v2.h[6]      \n"
                    "smlal2 v31.4s, v9.8h, v2.h[7]      \n"
                    "smlal  v20.4s, v6.4h, v3.h[0]      \n"
                    "smlal  v22.4s, v6.4h, v3.h[1]      \n"
                    "ld1    {v8.8h, v9.8h}, [%1], #32   \n"
                    "smlal2 v21.4s, v6.8h, v3.h[0]      \n"
                    "smlal2 v23.4s, v6.8h, v3.h[1]      \n"
                    "smlal  v24.4s, v6.4h, v3.h[2]      \n"
                    "smlal  v26.4s, v6.4h, v3.h[3]      \n"
                    "smlal2 v25.4s, v6.8h, v3.h[2]      \n"
                    "smlal2 v27.4s, v6.8h, v3.h[3]      \n"
                    "smlal  v28.4s, v6.4h, v3.h[4]      \n"
                    "smlal  v30.4s, v6.4h, v3.h[5]      \n"
                    "smlal2 v29.4s, v6.8h, v3.h[4]      \n"
                    "smlal2 v31.4s, v6.8h, v3.h[5]      \n"
                    "smlal  v20.4s, v7.4h, v3.h[6]      \n"
                    "smlal  v22.4s, v7.4h, v3.h[7]      \n"
                    "smlal2 v21.4s, v7.8h, v3.h[6]      \n"
                    "smlal2 v23.4s, v7.8h, v3.h[7]      \n"
                    "smlal  v24.4s, v7.4h, v4.h[0]      \n"
                    "smlal  v26.4s, v7.4h, v4.h[1]      \n"
                    "prfm   pldl1keep, [%2, #384]       \n"
                    "ld1    {v0.8h, v1.8h}, [%2], #32   \n"
                    "smlal2 v25.4s, v7.8h, v4.h[0]      \n"
                    "smlal2 v27.4s, v7.8h, v4.h[1]      \n"
                    "smlal  v28.4s, v7.4h, v4.h[2]      \n"
                    "smlal  v30.4s, v7.4h, v4.h[3]      \n"
                    "smlal2 v29.4s, v7.8h, v4.h[2]      \n"
                    "smlal2 v31.4s, v7.8h, v4.h[3]      \n"
                    "smlal  v20.4s, v8.4h, v4.h[4]      \n"
                    "smlal  v22.4s, v8.4h, v4.h[5]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v6.8h, v7.8h}, [%1], #32   \n"
                    "smlal2 v21.4s, v8.8h, v4.h[4]      \n"
                    "smlal2 v23.4s, v8.8h, v4.h[5]      \n"
                    "smlal  v24.4s, v8.4h, v4.h[6]      \n"
                    "smlal  v26.4s, v8.4h, v4.h[7]      \n"
                    "smlal2 v25.4s, v8.8h, v4.h[6]      \n"
                    "smlal2 v27.4s, v8.8h, v4.h[7]      \n"
                    "smlal  v28.4s, v8.4h, v5.h[0]      \n"
                    "smlal  v30.4s, v8.4h, v5.h[1]      \n"
                    "smlal2 v29.4s, v8.8h, v5.h[0]      \n"
                    "smlal2 v31.4s, v8.8h, v5.h[1]      \n"
                    "smlal  v20.4s, v9.4h, v5.h[2]      \n"
                    "smlal  v22.4s, v9.4h, v5.h[3]      \n"
                    "smlal2 v21.4s, v9.8h, v5.h[2]      \n"
                    "smlal2 v23.4s, v9.8h, v5.h[3]      \n"
                    "smlal  v24.4s, v9.4h, v5.h[4]      \n"
                    "smlal  v26.4s, v9.4h, v5.h[5]      \n"
                    "smlal2 v25.4s, v9.8h, v5.h[4]      \n"
                    "smlal2 v27.4s, v9.8h, v5.h[5]      \n"
                    "subs   w4, w4, #1                  \n"
                    "smlal  v28.4s, v9.4h, v5.h[6]      \n"
                    "smlal  v30.4s, v9.4h, v5.h[7]      \n"
                    "smlal2 v29.4s, v9.8h, v5.h[6]      \n"
                    "smlal2 v31.4s, v9.8h, v5.h[7]      \n"
                    "bne    2b                          \n"
                    // undo the 32-byte preloads issued for an iteration that will not run
                    "sub    %1, %1, #32                 \n"
                    "sub    %2, %2, #32                 \n"

                    "3:                                 \n"
                    "and    w4, %w6, #7                 \n" // w4 = remain = max_kk & 7
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // remainder loop: one k-step per iteration
                    // NOTE(review): 8 halfwords of B are loaded but only lanes 0..5 are used and
                    // the pointer advances by 12 bytes, so the last step reads 4 bytes past the
                    // data it consumes - assumes the B buffer has readable slack; confirm.
                    "4:                                 \n"
                    "ld1    {v4.8h}, [%1], #16          \n"
                    "ld1    {v0.8h}, [%2]               \n"
                    "add    %2, %2, #12                 \n"
                    "smlal  v20.4s, v4.4h, v0.h[0]      \n"
                    "smlal  v22.4s, v4.4h, v0.h[1]      \n"
                    "smlal2 v21.4s, v4.8h, v0.h[0]      \n"
                    "smlal2 v23.4s, v4.8h, v0.h[1]      \n"
                    "smlal  v24.4s, v4.4h, v0.h[2]      \n"
                    "smlal  v26.4s, v4.4h, v0.h[3]      \n"
                    "smlal2 v25.4s, v4.8h, v0.h[2]      \n"
                    "smlal2 v27.4s, v4.8h, v0.h[3]      \n"
                    "smlal  v28.4s, v4.4h, v0.h[4]      \n"
                    "smlal  v30.4s, v4.4h, v0.h[5]      \n"
                    "smlal2 v29.4s, v4.8h, v0.h[4]      \n"
                    "smlal2 v31.4s, v4.8h, v0.h[5]      \n"
                    "subs   w4, w4, #1                  \n"
                    "bne    4b                          \n"

                    // write back the 12 accumulators; outptr ends 192 bytes past the tile start
                    "5:                                 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    // NOTE(review): v10-v19 are listed as clobbered although this template never touches them
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
                // ARMv7 path. Accumulators are q4-q15: for B lane j (0..5) the pair
                // (q4+2j, q5+2j) holds the low/high four A lanes, matching _sum0.._sumb
                // of the intrinsic fallback below. r4 is the scratch loop counter.
                asm volatile(
                    "pld        [%1, #512]          \n"
                    "pld        [%2, #384]          \n"
                    "cmp        %7, #0              \n"
                    "beq        0f                  \n"

                    // k != 0: reload q4-q15 from outptr, then rewind outptr to the tile start
                    "vldm       %0!, {d8-d15}       \n"
                    "vldm       %0, {d16-d31}       \n"
                    "sub        %0, %0, #64         \n"
                    "b          1f                  \n"

                    // k == 0: clear the 12 accumulators
                    "0:                             \n"
                    "veor       q4, q4              \n"
                    "veor       q5, q5              \n"
                    "veor       q6, q6              \n"
                    "veor       q7, q7              \n"
                    "veor       q8, q8              \n"
                    "veor       q9, q9              \n"
                    "veor       q10, q10            \n"
                    "veor       q11, q11            \n"
                    "veor       q12, q12            \n"
                    "veor       q13, q13            \n"
                    "veor       q14, q14            \n"
                    "veor       q15, q15            \n"

                    "1:                             \n"
                    "lsr        r4, %6, #3          \n" // r4 = max_kk >> 3
                    "cmp        r4, #0              \n"
                    "beq        3f                  \n"

                    // preload the first A vector (k0) and the first 8 B halfwords
                    "vld1.s16   {d4-d5}, [%1]!      \n"
                    "vld1.s16   {d0-d1}, [%2]!      \n"
                    ".align 4                       \n"
                    // main loop: 8 k-steps per iteration (128 bytes of A, 96 bytes of B);
                    // A alternates between d4-d5 and d6-d7, B between d0-d1 and d2-d3, and the
                    // vectors for the next iteration are loaded before the branch
                    "2:                             \n"
                    "vmlal.s16  q4, d4, d0[0]       \n"
                    "vld1.s16   {d6-d7}, [%1]!      \n"
                    "vmlal.s16  q6, d4, d0[1]       \n"
                    "vld1.s16   {d2-d3}, [%2]!      \n"
                    "vmlal.s16  q8, d4, d0[2]       \n"
                    "vmlal.s16  q10, d4, d0[3]      \n"
                    "vmlal.s16  q5, d5, d0[0]       \n"
                    "vmlal.s16  q7, d5, d0[1]       \n"
                    "vmlal.s16  q9, d5, d0[2]       \n"
                    "vmlal.s16  q11, d5, d0[3]      \n"
                    "vmlal.s16  q12, d4, d1[0]      \n"
                    "vmlal.s16  q14, d4, d1[1]      \n"
                    "vmlal.s16  q13, d5, d1[0]      \n"
                    "vmlal.s16  q15, d5, d1[1]      \n"
                    "vmlal.s16  q4, d6, d1[2]       \n"
                    "vld1.s16   {d4-d5}, [%1]!      \n"
                    "vmlal.s16  q6, d6, d1[3]       \n"
                    "vmlal.s16  q5, d7, d1[2]       \n"
                    "vmlal.s16  q7, d7, d1[3]       \n"
                    "vmlal.s16  q8, d6, d2[0]       \n"
                    "pld        [%2, #384]          \n"
                    "vld1.s16   {d0-d1}, [%2]!      \n"
                    "vmlal.s16  q10, d6, d2[1]      \n"
                    "vmlal.s16  q12, d6, d2[2]      \n"
                    "vmlal.s16  q14, d6, d2[3]      \n"
                    "vmlal.s16  q9, d7, d2[0]       \n"
                    "vmlal.s16  q11, d7, d2[1]      \n"
                    "vmlal.s16  q13, d7, d2[2]      \n"
                    "vmlal.s16  q15, d7, d2[3]      \n"
                    "vmlal.s16  q4, d4, d3[0]       \n"
                    "vld1.s16   {d6-d7}, [%1]!      \n"
                    "vmlal.s16  q6, d4, d3[1]       \n"
                    "vmlal.s16  q8, d4, d3[2]       \n"
                    "vmlal.s16  q10, d4, d3[3]      \n"
                    "vmlal.s16  q5, d5, d3[0]       \n"
                    "vmlal.s16  q7, d5, d3[1]       \n"
                    "vmlal.s16  q9, d5, d3[2]       \n"
                    "vmlal.s16  q11, d5, d3[3]      \n"
                    "vmlal.s16  q12, d4, d0[0]      \n"
                    "vld1.s16   {d2-d3}, [%2]!      \n"
                    "vmlal.s16  q14, d4, d0[1]      \n"
                    "vmlal.s16  q13, d5, d0[0]      \n"
                    "vmlal.s16  q15, d5, d0[1]      \n"
                    "vmlal.s16  q4, d6, d0[2]       \n"
                    "pld        [%1, #512]          \n"
                    "vld1.s16   {d4-d5}, [%1]!      \n"
                    "vmlal.s16  q6, d6, d0[3]       \n"
                    "vmlal.s16  q5, d7, d0[2]       \n"
                    "vmlal.s16  q7, d7, d0[3]       \n"
                    "vmlal.s16  q8, d6, d1[0]       \n"
                    "vmlal.s16  q10, d6, d1[1]      \n"
                    "vmlal.s16  q12, d6, d1[2]      \n"
                    "vmlal.s16  q14, d6, d1[3]      \n"
                    "vmlal.s16  q9, d7, d1[0]       \n"
                    "vmlal.s16  q11, d7, d1[1]      \n"
                    "vmlal.s16  q13, d7, d1[2]      \n"
                    "vmlal.s16  q15, d7, d1[3]      \n"
                    "vmlal.s16  q4, d4, d2[0]       \n"
                    "vld1.s16   {d6-d7}, [%1]!      \n"
                    "vmlal.s16  q6, d4, d2[1]       \n"
                    "vld1.s16   {d0-d1}, [%2]!      \n"
                    "vmlal.s16  q8, d4, d2[2]       \n"
                    "vmlal.s16  q10, d4, d2[3]      \n"
                    "vmlal.s16  q5, d5, d2[0]       \n"
                    "vmlal.s16  q7, d5, d2[1]       \n"
                    "vmlal.s16  q9, d5, d2[2]       \n"
                    "vmlal.s16  q11, d5, d2[3]      \n"
                    "vmlal.s16  q12, d4, d3[0]      \n"
                    "vmlal.s16  q14, d4, d3[1]      \n"
                    "vmlal.s16  q13, d5, d3[0]      \n"
                    "vmlal.s16  q15, d5, d3[1]      \n"
                    "vmlal.s16  q4, d6, d3[2]       \n"
                    "vld1.s16   {d4-d5}, [%1]!      \n"
                    "vmlal.s16  q6, d6, d3[3]       \n"
                    "vmlal.s16  q5, d7, d3[2]       \n"
                    "vmlal.s16  q7, d7, d3[3]       \n"
                    "vmlal.s16  q8, d6, d0[0]       \n"
                    "pld        [%2, #384]          \n"
                    "vld1.s16   {d2-d3}, [%2]!      \n"
                    "vmlal.s16  q10, d6, d0[1]      \n"
                    "vmlal.s16  q12, d6, d0[2]      \n"
                    "vmlal.s16  q14, d6, d0[3]      \n"
                    "vmlal.s16  q9, d7, d0[0]       \n"
                    "vmlal.s16  q11, d7, d0[1]      \n"
                    "vmlal.s16  q13, d7, d0[2]      \n"
                    "vmlal.s16  q15, d7, d0[3]      \n"
                    "vmlal.s16  q4, d4, d1[0]       \n"
                    "vld1.s16   {d6-d7}, [%1]!      \n"
                    "vmlal.s16  q6, d4, d1[1]       \n"
                    "vmlal.s16  q8, d4, d1[2]       \n"
                    "vmlal.s16  q10, d4, d1[3]      \n"
                    "vmlal.s16  q5, d5, d1[0]       \n"
                    "vmlal.s16  q7, d5, d1[1]       \n"
                    "vmlal.s16  q9, d5, d1[2]       \n"
                    "vmlal.s16  q11, d5, d1[3]      \n"
                    "vmlal.s16  q12, d4, d2[0]      \n"
                    "vld1.s16   {d0-d1}, [%2]!      \n"
                    "vmlal.s16  q14, d4, d2[1]      \n"
                    "vmlal.s16  q13, d5, d2[0]      \n"
                    "vmlal.s16  q15, d5, d2[1]      \n"
                    "vmlal.s16  q4, d6, d2[2]       \n"
                    "pld        [%1, #512]          \n"
                    "vld1.s16   {d4-d5}, [%1]!      \n"
                    "vmlal.s16  q6, d6, d2[3]       \n"
                    "vmlal.s16  q5, d7, d2[2]       \n"
                    "vmlal.s16  q7, d7, d2[3]       \n"
                    "vmlal.s16  q8, d6, d3[0]       \n"
                    "vmlal.s16  q10, d6, d3[1]      \n"
                    "vmlal.s16  q12, d6, d3[2]      \n"
                    "vmlal.s16  q14, d6, d3[3]      \n"
                    "vmlal.s16  q9, d7, d3[0]       \n"
                    "vmlal.s16  q11, d7, d3[1]      \n"
                    "subs       r4, r4, #1          \n"
                    "vmlal.s16  q13, d7, d3[2]      \n"
                    "vmlal.s16  q15, d7, d3[3]      \n"
                    "bne        2b                  \n"
                    // undo the 16-byte preloads issued for an iteration that will not run
                    "sub        %1, %1, #16         \n"
                    "sub        %2, %2, #16         \n"

                    "3:                             \n"
                    "and        r4, %6, #7          \n" // r4 = remain = max_kk & 7
                    "cmp        r4, #0              \n"
                    "beq        5f                  \n"

                    // remainder loop: one k-step per iteration
                    // NOTE(review): 8 halfwords of B are loaded but only 6 lanes are used and the
                    // pointer advances by 12 bytes, so the last step reads 4 bytes past the data
                    // it consumes - assumes the B buffer has readable slack; confirm.
                    "4:                             \n"
                    "vld1.s16   {d0-d1}, [%1]!      \n"
                    "vld1.s16   {d2-d3}, [%2]       \n"
                    "add        %2, %2, #12         \n"
                    "vmlal.s16  q4, d0, d2[0]       \n"
                    "vmlal.s16  q6, d0, d2[1]       \n"
                    "vmlal.s16  q8, d0, d2[2]       \n"
                    "vmlal.s16  q10, d0, d2[3]      \n"
                    "vmlal.s16  q5, d1, d2[0]       \n"
                    "vmlal.s16  q7, d1, d2[1]       \n"
                    "vmlal.s16  q9, d1, d2[2]       \n"
                    "vmlal.s16  q11, d1, d2[3]      \n"
                    "subs       r4, r4, #1          \n"
                    "vmlal.s16  q12, d0, d3[0]      \n"
                    "vmlal.s16  q14, d0, d3[1]      \n"
                    "vmlal.s16  q13, d1, d3[0]      \n"
                    "vmlal.s16  q15, d1, d3[1]      \n"
                    "bne        4b                  \n"

                    // write back q4-q15; outptr ends 192 bytes past the tile start
                    "5:                             \n"
                    "vstm       %0!, {d8-d15}       \n"
                    "vstm       %0!, {d16-d31}      \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                // Intrinsic fallback: _sum(2j) / _sum(2j+1) accumulate the low / high four
                // A lanes multiplied by B lane j, for j = 0..5.
                int32x4_t _sum0;
                int32x4_t _sum1;
                int32x4_t _sum2;
                int32x4_t _sum3;
                int32x4_t _sum4;
                int32x4_t _sum5;
                int32x4_t _sum6;
                int32x4_t _sum7;
                int32x4_t _sum8;
                int32x4_t _sum9;
                int32x4_t _suma;
                int32x4_t _sumb;

                if (k == 0)
                {
                    // first pass over this tile: start from zero
                    _sum0 = vdupq_n_s32(0);
                    _sum1 = vdupq_n_s32(0);
                    _sum2 = vdupq_n_s32(0);
                    _sum3 = vdupq_n_s32(0);
                    _sum4 = vdupq_n_s32(0);
                    _sum5 = vdupq_n_s32(0);
                    _sum6 = vdupq_n_s32(0);
                    _sum7 = vdupq_n_s32(0);
                    _sum8 = vdupq_n_s32(0);
                    _sum9 = vdupq_n_s32(0);
                    _suma = vdupq_n_s32(0);
                    _sumb = vdupq_n_s32(0);
                }
                else
                {
                    // continue accumulating on top of the values already stored at outptr
                    _sum0 = vld1q_s32(outptr);
                    _sum1 = vld1q_s32(outptr + 4);
                    _sum2 = vld1q_s32(outptr + 8);
                    _sum3 = vld1q_s32(outptr + 12);
                    _sum4 = vld1q_s32(outptr + 16);
                    _sum5 = vld1q_s32(outptr + 20);
                    _sum6 = vld1q_s32(outptr + 24);
                    _sum7 = vld1q_s32(outptr + 28);
                    _sum8 = vld1q_s32(outptr + 32);
                    _sum9 = vld1q_s32(outptr + 36);
                    _suma = vld1q_s32(outptr + 40);
                    _sumb = vld1q_s32(outptr + 44);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x8_t _pA = vld1q_s16(pA);
                    // NOTE(review): loads 8 int16 from pB although only lanes 0..5 are used and
                    // pB advances by 6; assumes the 2 extra elements are readable - confirm.
                    int16x8_t _pB = vld1q_s16(pB);
                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_pA), vget_low_s16(_pB), 0);
                    _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(_pA), vget_low_s16(_pB), 0);
                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_pA), vget_low_s16(_pB), 1);
                    _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_pA), vget_low_s16(_pB), 1);
                    _sum4 = vmlal_lane_s16(_sum4, vget_low_s16(_pA), vget_low_s16(_pB), 2);
                    _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_pA), vget_low_s16(_pB), 2);
                    _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_pA), vget_low_s16(_pB), 3);
                    _sum7 = vmlal_lane_s16(_sum7, vget_high_s16(_pA), vget_low_s16(_pB), 3);
                    _sum8 = vmlal_lane_s16(_sum8, vget_low_s16(_pA), vget_high_s16(_pB), 0);
                    _sum9 = vmlal_lane_s16(_sum9, vget_high_s16(_pA), vget_high_s16(_pB), 0);
                    _suma = vmlal_lane_s16(_suma, vget_low_s16(_pA), vget_high_s16(_pB), 1);
                    _sumb = vmlal_lane_s16(_sumb, vget_high_s16(_pA), vget_high_s16(_pB), 1);
                    pA += 8;
                    pB += 6;
                }

                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                vst1q_s32(outptr + 8, _sum2);
                vst1q_s32(outptr + 12, _sum3);
                vst1q_s32(outptr + 16, _sum4);
                vst1q_s32(outptr + 20, _sum5);
                vst1q_s32(outptr + 24, _sum6);
                vst1q_s32(outptr + 28, _sum7);
                vst1q_s32(outptr + 32, _sum8);
                vst1q_s32(outptr + 36, _sum9);
                vst1q_s32(outptr + 40, _suma);
                vst1q_s32(outptr + 44, _sumb);
                outptr += 48;
#endif // NCNN_GNU_INLINE_ASM
            }
            for (; jj + 3 < max_jj; jj += 4)
            {
                const short* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "cmp    %w7, #0                     \n"
                    "beq    0f                          \n"

                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    "sub    %0, %0, #64                 \n"
                    "b      1f                          \n"

                    "0:                                 \n"
                    "eor    v24.16b, v24.16b, v24.16b   \n"
                    "eor    v25.16b, v25.16b, v25.16b   \n"
                    "eor    v26.16b, v26.16b, v26.16b   \n"
                    "eor    v27.16b, v27.16b, v27.16b   \n"
                    "eor    v28.16b, v28.16b, v28.16b   \n"
                    "eor    v29.16b, v29.16b, v29.16b   \n"
                    "eor    v30.16b, v30.16b, v30.16b   \n"
                    "eor    v31.16b, v31.16b, v31.16b   \n"

                    "1:                                 \n"
                    "lsr    w4, %w6, #3                 \n" // w4 = max_kk >> 3
                    "cmp    w4, #0                      \n"
                    "beq    3f                          \n"

                    "ld1    {v4.8h, v5.8h}, [%1], #32   \n"
                    "ld1    {v0.8h, v1.8h}, [%2], #32   \n"
                    ".align 4                           \n"
                    "2:                                 \n"
                    "smlal  v24.4s, v4.4h, v0.h[0]      \n"
                    "smlal  v26.4s, v4.4h, v0.h[1]      \n"
                    "ld1    {v6.8h, v7.8h}, [%1], #32   \n"
                    "smlal2 v25.4s, v4.8h, v0.h[0]      \n"
                    "smlal2 v27.4s, v4.8h, v0.h[1]      \n"
                    "ld1    {v2.8h, v3.8h}, [%2], #32   \n"
                    "smlal  v28.4s, v4.4h, v0.h[2]      \n"
                    "smlal  v30.4s, v4.4h, v0.h[3]      \n"
                    "smlal2 v29.4s, v4.8h, v0.h[2]      \n"
                    "smlal2 v31.4s, v4.8h, v0.h[3]      \n"
                    "smlal  v24.4s, v5.4h, v0.h[4]      \n"
                    "smlal  v26.4s, v5.4h, v0.h[5]      \n"
                    "smlal2 v25.4s, v5.8h, v0.h[4]      \n"
                    "smlal2 v27.4s, v5.8h, v0.h[5]      \n"
                    "smlal  v28.4s, v5.4h, v0.h[6]      \n"
                    "smlal  v30.4s, v5.4h, v0.h[7]      \n"
                    "smlal2 v29.4s, v5.8h, v0.h[6]      \n"
                    "smlal2 v31.4s, v5.8h, v0.h[7]      \n"
                    "smlal  v24.4s, v6.4h, v1.h[0]      \n"
                    "smlal  v26.4s, v6.4h, v1.h[1]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.8h, v5.8h}, [%1], #32   \n"
                    "smlal2 v25.4s, v6.8h, v1.h[0]      \n"
                    "smlal2 v27.4s, v6.8h, v1.h[1]      \n"
                    "smlal  v28.4s, v6.4h, v1.h[2]      \n"
                    "smlal  v30.4s, v6.4h, v1.h[3]      \n"
                    "smlal2 v29.4s, v6.8h, v1.h[2]      \n"
                    "smlal2 v31.4s, v6.8h, v1.h[3]      \n"
                    "smlal  v24.4s, v7.4h, v1.h[4]      \n"
                    "smlal  v26.4s, v7.4h, v1.h[5]      \n"
                    "smlal2 v25.4s, v7.8h, v1.h[4]      \n"
                    "smlal2 v27.4s, v7.8h, v1.h[5]      \n"
                    "smlal  v28.4s, v7.4h, v1.h[6]      \n"
                    "smlal  v30.4s, v7.4h, v1.h[7]      \n"
                    "smlal2 v29.4s, v7.8h, v1.h[6]      \n"
                    "smlal2 v31.4s, v7.8h, v1.h[7]      \n"
                    "smlal  v24.4s, v4.4h, v2.h[0]      \n"
                    "smlal  v26.4s, v4.4h, v2.h[1]      \n"
                    "ld1    {v6.8h, v7.8h}, [%1], #32   \n"
                    "smlal2 v25.4s, v4.8h, v2.h[0]      \n"
                    "smlal2 v27.4s, v4.8h, v2.h[1]      \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.8h, v1.8h}, [%2], #32   \n"
                    "smlal  v28.4s, v4.4h, v2.h[2]      \n"
                    "smlal  v30.4s, v4.4h, v2.h[3]      \n"
                    "smlal2 v29.4s, v4.8h, v2.h[2]      \n"
                    "smlal2 v31.4s, v4.8h, v2.h[3]      \n"
                    "smlal  v24.4s, v5.4h, v2.h[4]      \n"
                    "smlal  v26.4s, v5.4h, v2.h[5]      \n"
                    "smlal2 v25.4s, v5.8h, v2.h[4]      \n"
                    "smlal2 v27.4s, v5.8h, v2.h[5]      \n"
                    "smlal  v28.4s, v5.4h, v2.h[6]      \n"
                    "smlal  v30.4s, v5.4h, v2.h[7]      \n"
                    "smlal2 v29.4s, v5.8h, v2.h[6]      \n"
                    "smlal2 v31.4s, v5.8h, v2.h[7]      \n"
                    "smlal  v24.4s, v6.4h, v3.h[0]      \n"
                    "smlal  v26.4s, v6.4h, v3.h[1]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.8h, v5.8h}, [%1], #32   \n"
                    "smlal2 v25.4s, v6.8h, v3.h[0]      \n"
                    "smlal2 v27.4s, v6.8h, v3.h[1]      \n"
                    "smlal  v28.4s, v6.4h, v3.h[2]      \n"
                    "smlal  v30.4s, v6.4h, v3.h[3]      \n"
                    "smlal2 v29.4s, v6.8h, v3.h[2]      \n"
                    "smlal2 v31.4s, v6.8h, v3.h[3]      \n"
                    "smlal  v24.4s, v7.4h, v3.h[4]      \n"
                    "smlal  v26.4s, v7.4h, v3.h[5]      \n"
                    "smlal2 v25.4s, v7.8h, v3.h[4]      \n"
                    "smlal2 v27.4s, v7.8h, v3.h[5]      \n"
                    "subs   w4, w4, #1                  \n"
                    "smlal  v28.4s, v7.4h, v3.h[6]      \n"
                    "smlal  v30.4s, v7.4h, v3.h[7]      \n"
                    "smlal2 v29.4s, v7.8h, v3.h[6]      \n"
                    "smlal2 v31.4s, v7.8h, v3.h[7]      \n"
                    "bne    2b                          \n"
                    "sub    %1, %1, #32                 \n"
                    "sub    %2, %2, #32                 \n"

                    "3:                                 \n"
                    "and    w4, %w6, #7                 \n" // w4 = remain = max_kk & 7
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    "4:                                 \n"
                    "ld1    {v4.8h}, [%1], #16          \n"
                    "ld1    {v0.4h}, [%2], #8           \n"
                    "smlal  v24.4s, v4.4h, v0.h[0]      \n"
                    "smlal  v26.4s, v4.4h, v0.h[1]      \n"
                    "smlal2 v25.4s, v4.8h, v0.h[0]      \n"
                    "smlal2 v27.4s, v4.8h, v0.h[1]      \n"
                    "subs   w4, w4, #1                  \n"
                    "smlal  v28.4s, v4.4h, v0.h[2]      \n"
                    "smlal  v30.4s, v4.4h, v0.h[3]      \n"
                    "smlal2 v29.4s, v4.8h, v0.h[2]      \n"
                    "smlal2 v31.4s, v4.8h, v0.h[3]      \n"
                    "bne    4b                          \n"

                    "5:                                 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
                asm volatile(
                    "pld        [%1, #512]          \n"
                    "pld        [%2, #256]          \n"
                    "cmp        %7, #0              \n"
                    "beq        0f                  \n"

                    "vldm       %0, {d16-d31}       \n"
                    "b          1f                  \n"

                    "0:                             \n"
                    "veor       q8, q8              \n"
                    "veor       q9, q9              \n"
                    "veor       q10, q10            \n"
                    "veor       q11, q11            \n"
                    "veor       q12, q12            \n"
                    "veor       q13, q13            \n"
                    "veor       q14, q14            \n"
                    "veor       q15, q15            \n"

                    "1:                             \n"
                    "lsr        r4, %6, #2          \n" // r4 = max_kk >> 2
                    "cmp        r4, #0              \n"
                    "beq        3f                  \n"

                    "vld1.s16   {d4-d5}, [%1]!      \n"
                    "vld1.s16   {d0-d1}, [%2]!      \n"
                    ".align 4                       \n"
                    "2:                             \n"
                    "vmlal.s16  q8, d4, d0[0]       \n"
                    "vld1.s16   {d6-d7}, [%1]!      \n"
                    "vmlal.s16  q10, d4, d0[1]      \n"
                    "vmlal.s16  q12, d4, d0[2]      \n"
                    "vmlal.s16  q14, d4, d0[3]      \n"
                    "vmlal.s16  q9, d5, d0[0]       \n"
                    "vld1.s16   {d8-d9}, [%1]!      \n"
                    "vmlal.s16  q11, d5, d0[1]      \n"
                    "vld1.s16   {d2-d3}, [%2]!      \n"
                    "vmlal.s16  q13, d5, d0[2]      \n"
                    "vmlal.s16  q15, d5, d0[3]      \n"
                    "vmlal.s16  q8, d6, d1[0]       \n"
                    "vmlal.s16  q10, d6, d1[1]      \n"
                    "vmlal.s16  q12, d6, d1[2]      \n"
                    "vmlal.s16  q14, d6, d1[3]      \n"
                    "vmlal.s16  q9, d7, d1[0]       \n"
                    "vld1.s16   {d10-d11}, [%1]!    \n"
                    "vmlal.s16  q11, d7, d1[1]      \n"
                    "vmlal.s16  q13, d7, d1[2]      \n"
                    "vmlal.s16  q15, d7, d1[3]      \n"
                    "vmlal.s16  q8, d8, d2[0]       \n"
                    "vmlal.s16  q10, d8, d2[1]      \n"
                    "vmlal.s16  q12, d8, d2[2]      \n"
                    "vmlal.s16  q14, d8, d2[3]      \n"
                    "vmlal.s16  q9, d9, d2[0]       \n"
                    "pld        [%1, #512]          \n"
                    "vld1.s16   {d4-d5}, [%1]!      \n"
                    "vmlal.s16  q11, d9, d2[1]      \n"
                    "pld        [%2, #256]          \n"
                    "vld1.s16   {d0-d1}, [%2]!      \n"
                    "vmlal.s16  q13, d9, d2[2]      \n"
                    "vmlal.s16  q15, d9, d2[3]      \n"
                    "vmlal.s16  q8, d10, d3[0]      \n"
                    "vmlal.s16  q10, d10, d3[1]     \n"
                    "vmlal.s16  q12, d10, d3[2]     \n"
                    "vmlal.s16  q14, d10, d3[3]     \n"
                    "vmlal.s16  q9, d11, d3[0]      \n"
                    "vmlal.s16  q11, d11, d3[1]     \n"
                    "subs       r4, r4, #1          \n"
                    "vmlal.s16  q13, d11, d3[2]     \n"
                    "vmlal.s16  q15, d11, d3[3]     \n"
                    "bne        2b                  \n"
                    "sub        %1, %1, #16         \n"
                    "sub        %2, %2, #16         \n"

                    "3:                             \n"
                    "and        r4, %6, #3          \n" // w4 = remain = max_kk & 3
                    "cmp        r4, #0              \n"
                    "beq        5f                  \n"

                    "4:                             \n"
                    "vld1.s16   {d0-d1}, [%1]!      \n"
                    "vld1.s16   {d2}, [%2]!         \n"
                    "vmlal.s16  q8, d0, d2[0]       \n"
                    "vmlal.s16  q10, d0, d2[1]      \n"
                    "vmlal.s16  q12, d0, d2[2]      \n"
                    "vmlal.s16  q14, d0, d2[3]      \n"
                    "subs       r4, r4, #1          \n"
                    "vmlal.s16  q9, d1, d2[0]       \n"
                    "vmlal.s16  q11, d1, d2[1]      \n"
                    "vmlal.s16  q13, d1, d2[2]      \n"
                    "vmlal.s16  q15, d1, d2[3]      \n"
                    "bne        4b                  \n"

                    "5:                             \n"
                    "vstm       %0!, {d16-d31}      \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                int32x4_t _sum0;
                int32x4_t _sum1;
                int32x4_t _sum2;
                int32x4_t _sum3;
                int32x4_t _sum4;
                int32x4_t _sum5;
                int32x4_t _sum6;
                int32x4_t _sum7;

                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                    _sum1 = vdupq_n_s32(0);
                    _sum2 = vdupq_n_s32(0);
                    _sum3 = vdupq_n_s32(0);
                    _sum4 = vdupq_n_s32(0);
                    _sum5 = vdupq_n_s32(0);
                    _sum6 = vdupq_n_s32(0);
                    _sum7 = vdupq_n_s32(0);
                }
                else
                {
                    _sum0 = vld1q_s32(outptr);
                    _sum1 = vld1q_s32(outptr + 4);
                    _sum2 = vld1q_s32(outptr + 8);
                    _sum3 = vld1q_s32(outptr + 12);
                    _sum4 = vld1q_s32(outptr + 16);
                    _sum5 = vld1q_s32(outptr + 20);
                    _sum6 = vld1q_s32(outptr + 24);
                    _sum7 = vld1q_s32(outptr + 28);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x8_t _pA = vld1q_s16(pA);
                    int16x4_t _pB = vld1_s16(pB);
                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_pA), _pB, 0);
                    _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(_pA), _pB, 0);
                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_pA), _pB, 1);
                    _sum3 = vmlal_lane_s16(_sum3, vget_high_s16(_pA), _pB, 1);
                    _sum4 = vmlal_lane_s16(_sum4, vget_low_s16(_pA), _pB, 2);
                    _sum5 = vmlal_lane_s16(_sum5, vget_high_s16(_pA), _pB, 2);
                    _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_pA), _pB, 3);
                    _sum7 = vmlal_lane_s16(_sum7, vget_high_s16(_pA), _pB, 3);
                    pA += 8;
                    pB += 4;
                }

                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                vst1q_s32(outptr + 8, _sum2);
                vst1q_s32(outptr + 12, _sum3);
                vst1q_s32(outptr + 16, _sum4);
                vst1q_s32(outptr + 20, _sum5);
                vst1q_s32(outptr + 24, _sum6);
                vst1q_s32(outptr + 28, _sum7);
                outptr += 32;
#endif // NCNN_GNU_INLINE_ASM
            }
            for (; jj + 1 < max_jj; jj += 2)
            {
                // Two B values per k-step: every k-step multiplies the 8 shorts at pA by
                // pB[0] and by pB[1] and accumulates into four int32x4 sums, which are
                // written as 16 ints at outptr. When k == 0 the sums start at zero,
                // otherwise they are first reloaded from outptr.
                const short* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                // Accumulators: v28/v29 = low/high half of A times the first B value of a
                // k-step, v30/v31 = low/high half of A times the second B value.
                // w4 is the loop counter.
                asm volatile(
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "prfm   pldl1keep, [%2, #256]       \n"
                    // k == 0: clear the accumulators, otherwise reload them from outptr
                    "cmp    %w7, #0                     \n"
                    "beq    0f                          \n"

                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0] \n"
                    "b      1f                          \n"

                    "0:                                 \n"
                    "eor    v28.16b, v28.16b, v28.16b   \n"
                    "eor    v29.16b, v29.16b, v29.16b   \n"
                    "eor    v30.16b, v30.16b, v30.16b   \n"
                    "eor    v31.16b, v31.16b, v31.16b   \n"

                    "1:                                 \n"
                    "lsr    w4, %w6, #3                 \n" // w4 = max_kk >> 3
                    "cmp    w4, #0                      \n"
                    "beq    3f                          \n"

                    // Main loop handles 8 k-steps per iteration and is software-pipelined:
                    // the first A (2 k-steps) and B (4 k-steps) loads are issued here, and
                    // the last loads inside the loop already fetch data for the next
                    // iteration.
                    "ld1    {v4.8h, v5.8h}, [%1], #32   \n"
                    "ld1    {v0.8h}, [%2], #16          \n"
                    ".align 4                           \n"
                    "2:                                 \n"
                    "smlal  v28.4s, v4.4h, v0.h[0]      \n"
                    "smlal  v30.4s, v4.4h, v0.h[1]      \n"
                    "ld1    {v6.8h, v7.8h}, [%1], #32   \n"
                    "smlal2 v29.4s, v4.8h, v0.h[0]      \n"
                    "smlal2 v31.4s, v4.8h, v0.h[1]      \n"
                    "ld1    {v1.8h}, [%2], #16          \n"
                    "smlal  v28.4s, v5.4h, v0.h[2]      \n"
                    "smlal  v30.4s, v5.4h, v0.h[3]      \n"
                    "smlal2 v29.4s, v5.8h, v0.h[2]      \n"
                    "smlal2 v31.4s, v5.8h, v0.h[3]      \n"
                    "smlal  v28.4s, v6.4h, v0.h[4]      \n"
                    "smlal  v30.4s, v6.4h, v0.h[5]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.8h, v5.8h}, [%1], #32   \n"
                    "smlal2 v29.4s, v6.8h, v0.h[4]      \n"
                    "smlal2 v31.4s, v6.8h, v0.h[5]      \n"
                    "smlal  v28.4s, v7.4h, v0.h[6]      \n"
                    "smlal  v30.4s, v7.4h, v0.h[7]      \n"
                    "smlal2 v29.4s, v7.8h, v0.h[6]      \n"
                    "smlal2 v31.4s, v7.8h, v0.h[7]      \n"
                    "smlal  v28.4s, v4.4h, v1.h[0]      \n"
                    "smlal  v30.4s, v4.4h, v1.h[1]      \n"
                    "ld1    {v6.8h, v7.8h}, [%1], #32   \n"
                    "smlal2 v29.4s, v4.8h, v1.h[0]      \n"
                    "smlal2 v31.4s, v4.8h, v1.h[1]      \n"
                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v0.8h}, [%2], #16          \n"
                    "smlal  v28.4s, v5.4h, v1.h[2]      \n"
                    "smlal  v30.4s, v5.4h, v1.h[3]      \n"
                    "smlal2 v29.4s, v5.8h, v1.h[2]      \n"
                    "smlal2 v31.4s, v5.8h, v1.h[3]      \n"
                    "smlal  v28.4s, v6.4h, v1.h[4]      \n"
                    "smlal  v30.4s, v6.4h, v1.h[5]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.8h, v5.8h}, [%1], #32   \n"
                    "smlal2 v29.4s, v6.8h, v1.h[4]      \n"
                    "smlal2 v31.4s, v6.8h, v1.h[5]      \n"
                    "subs   w4, w4, #1                  \n"
                    "smlal  v28.4s, v7.4h, v1.h[6]      \n"
                    "smlal  v30.4s, v7.4h, v1.h[7]      \n"
                    "smlal2 v29.4s, v7.8h, v1.h[6]      \n"
                    "smlal2 v31.4s, v7.8h, v1.h[7]      \n"
                    "bne    2b                          \n"
                    // undo the look-ahead loads issued for an iteration that never ran
                    "sub    %1, %1, #32                 \n"
                    "sub    %2, %2, #16                 \n"

                    "3:                                 \n"
                    "and    w4, %w6, #7                 \n" // w4 = remain = max_kk & 7
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // Tail: one k-step per iteration.
                    // NOTE(review): the B load below reads 4 shorts (8 bytes) but only
                    // lanes 0 and 1 are used and the pointer advances by 4 bytes, so the
                    // last k-step reads 4 bytes past the consumed data. Presumably the
                    // B buffer has enough slack for this; confirm against the allocator.
                    "4:                                 \n"
                    "ld1    {v4.8h}, [%1], #16          \n"
                    "ld1    {v0.4h}, [%2]               \n"
                    "add    %2, %2, #4                  \n"
                    "smlal  v28.4s, v4.4h, v0.h[0]      \n"
                    "smlal  v30.4s, v4.4h, v0.h[1]      \n"
                    "subs   w4, w4, #1                  \n"
                    "smlal2 v29.4s, v4.8h, v0.h[0]      \n"
                    "smlal2 v31.4s, v4.8h, v0.h[1]      \n"
                    "bne    4b                          \n"

                    // store the 16 int32 sums and advance outptr by 64 bytes
                    "5:                                 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
                // Accumulators: q12/q13 = low/high half of A times the first B value of a
                // k-step, q14/q15 = low/high half of A times the second B value.
                // r4 is the loop counter.
                asm volatile(
                    "pld        [%1, #512]          \n"
                    "pld        [%2, #128]          \n"
                    // k == 0: clear the accumulators, otherwise reload them from outptr
                    "cmp        %7, #0              \n"
                    "beq        0f                  \n"

                    "vldm       %0, {d24-d31}       \n"
                    "b          1f                  \n"

                    "0:                             \n"
                    "veor       q12, q12            \n"
                    "veor       q13, q13            \n"
                    "veor       q14, q14            \n"
                    "veor       q15, q15            \n"

                    "1:                             \n"
                    "lsr        r4, %6, #2          \n" // r4 = max_kk >> 2
                    "cmp        r4, #0              \n"
                    "beq        3f                  \n"

                    // Main loop handles 4 k-steps per iteration and is software-pipelined:
                    // A and B for the first 2 k-steps are loaded here, and the last loads
                    // inside the loop already fetch data for the next iteration.
                    "vld1.s16   {d2-d5}, [%1]!      \n"
                    "vld1.s16   {d0}, [%2]!         \n"
                    ".align 4                       \n"
                    "2:                             \n"
                    "vmlal.s16  q12, d2, d0[0]      \n"
                    "vld1.s16   {d6-d9}, [%1]!      \n"
                    "vmlal.s16  q14, d2, d0[1]      \n"
                    "vld1.s16   {d1}, [%2]!         \n"
                    "vmlal.s16  q13, d3, d0[0]      \n"
                    "vmlal.s16  q15, d3, d0[1]      \n"
                    "vmlal.s16  q12, d4, d0[2]      \n"
                    "vmlal.s16  q14, d4, d0[3]      \n"
                    "vmlal.s16  q13, d5, d0[2]      \n"
                    "vmlal.s16  q15, d5, d0[3]      \n"
                    "vmlal.s16  q12, d6, d1[0]      \n"
                    "pld        [%1, #512]          \n"
                    "vld1.s16   {d2-d5}, [%1]!      \n"
                    "vmlal.s16  q14, d6, d1[1]      \n"
                    "pld        [%2, #128]          \n"
                    "vld1.s16   {d0}, [%2]!         \n"
                    "vmlal.s16  q13, d7, d1[0]      \n"
                    "vmlal.s16  q15, d7, d1[1]      \n"
                    "vmlal.s16  q12, d8, d1[2]      \n"
                    "vmlal.s16  q14, d8, d1[3]      \n"
                    "subs       r4, r4, #1          \n"
                    "vmlal.s16  q13, d9, d1[2]      \n"
                    "vmlal.s16  q15, d9, d1[3]      \n"
                    "bne        2b                  \n"
                    // undo the look-ahead loads issued for an iteration that never ran
                    "sub        %1, %1, #32         \n"
                    "sub        %2, %2, #8          \n"

                    "3:                             \n"
                    "and        r4, %6, #3          \n" // r4 = remain = max_kk & 3
                    "cmp        r4, #0              \n"
                    "beq        5f                  \n"

                    // Tail: one k-step per iteration.
                    // NOTE(review): the B load below reads 4 shorts (8 bytes) but only
                    // lanes 0 and 1 are used and the pointer advances by 4 bytes, so the
                    // last k-step reads 4 bytes past the consumed data. Presumably the
                    // B buffer has enough slack for this; confirm against the allocator.
                    "4:                             \n"
                    "vld1.s16   {d0-d1}, [%1]!      \n"
                    "vld1.s16   {d2}, [%2]          \n"
                    "add        %2, %2, #4          \n"
                    "vmlal.s16  q12, d0, d2[0]      \n"
                    "vmlal.s16  q14, d0, d2[1]      \n"
                    "subs       r4, r4, #1          \n"
                    "vmlal.s16  q13, d1, d2[0]      \n"
                    "vmlal.s16  q15, d1, d2[1]      \n"
                    "bne        4b                  \n"

                    // store the 16 int32 sums; the writeback advances outptr by 64 bytes
                    "5:                             \n"
                    "vstm       %0!, {d24-d31}      \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                // Intrinsic fallback. _sum0/_sum1 = low/high half of A times pB[0],
                // _sum2/_sum3 = low/high half of A times pB[1].
                int32x4_t _sum0;
                int32x4_t _sum1;
                int32x4_t _sum2;
                int32x4_t _sum3;

                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                    _sum1 = vdupq_n_s32(0);
                    _sum2 = vdupq_n_s32(0);
                    _sum3 = vdupq_n_s32(0);
                }
                else
                {
                    // resume from the partial sums already stored at outptr
                    _sum0 = vld1q_s32(outptr);
                    _sum1 = vld1q_s32(outptr + 4);
                    _sum2 = vld1q_s32(outptr + 8);
                    _sum3 = vld1q_s32(outptr + 12);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x8_t _pA = vld1q_s16(pA);
                    // broadcast each of the two B values across all four lanes
                    int16x4_t _pB0 = vdup_n_s16(pB[0]);
                    int16x4_t _pB1 = vdup_n_s16(pB[1]);
                    _sum0 = vmlal_s16(_sum0, vget_low_s16(_pA), _pB0);
                    _sum1 = vmlal_s16(_sum1, vget_high_s16(_pA), _pB0);
                    _sum2 = vmlal_s16(_sum2, vget_low_s16(_pA), _pB1);
                    _sum3 = vmlal_s16(_sum3, vget_high_s16(_pA), _pB1);
                    pA += 8;
                    pB += 2;
                }

                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                vst1q_s32(outptr + 8, _sum2);
                vst1q_s32(outptr + 12, _sum3);
                outptr += 16;
#endif // NCNN_GNU_INLINE_ASM
            }
            for (; jj < max_jj; jj++)
            {
                // One B value per k-step: every k-step multiplies the 8 shorts at pA by
                // the single short at pB and accumulates into two int32x4 sums, which are
                // written as 8 ints at outptr. When k == 0 the sums start at zero,
                // otherwise they are first reloaded from outptr.
                const short* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                // Accumulators: v30/v31 = low/high half of A times B (the stored result).
                // The main loop additionally uses v28/v29 as a second accumulator pair
                // for every other k-step and folds them into v30/v31 after the loop.
                // w4 is the loop counter.
                asm volatile(
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "prfm   pldl1keep, [%2, #128]       \n"
                    // k == 0: clear the accumulators, otherwise reload them from outptr
                    "cmp    %w7, #0                     \n"
                    "beq    0f                          \n"

                    "ld1    {v30.4s, v31.4s}, [%0]      \n"
                    "b      1f                          \n"

                    "0:                                 \n"
                    "eor    v30.16b, v30.16b, v30.16b   \n"
                    "eor    v31.16b, v31.16b, v31.16b   \n"

                    "1:                                 \n"
                    "lsr    w4, %w6, #3                 \n" // w4 = max_kk >> 3
                    "cmp    w4, #0                      \n"
                    "beq    3f                          \n"

                    // Main loop handles 8 k-steps per iteration and is software-pipelined:
                    // A for 2 k-steps and B for 8 k-steps are loaded here; inside the loop
                    // v1 is copied to v0 and then refilled for the next iteration.
                    "ld1    {v4.8h, v5.8h}, [%1], #32   \n"
                    "ld1    {v1.8h}, [%2], #16          \n"
                    "eor    v28.16b, v28.16b, v28.16b   \n"
                    "eor    v29.16b, v29.16b, v29.16b   \n"
                    ".align 4                           \n"
                    "2:                                 \n"
                    "mov    v0.16b, v1.16b              \n"
                    "smlal  v28.4s, v4.4h, v0.h[0]      \n"
                    "ld1    {v6.8h, v7.8h}, [%1], #32   \n"
                    "smlal2 v29.4s, v4.8h, v0.h[0]      \n"
                    "prfm   pldl1keep, [%2, #128]       \n"
                    "ld1    {v1.8h}, [%2], #16          \n"
                    "smlal  v30.4s, v5.4h, v0.h[1]      \n"
                    "smlal2 v31.4s, v5.8h, v0.h[1]      \n"
                    "smlal  v28.4s, v6.4h, v0.h[2]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.8h, v5.8h}, [%1], #32   \n"
                    "smlal2 v29.4s, v6.8h, v0.h[2]      \n"
                    "smlal  v30.4s, v7.4h, v0.h[3]      \n"
                    "smlal2 v31.4s, v7.8h, v0.h[3]      \n"
                    "smlal  v28.4s, v4.4h, v0.h[4]      \n"
                    "ld1    {v6.8h, v7.8h}, [%1], #32   \n"
                    "smlal2 v29.4s, v4.8h, v0.h[4]      \n"
                    "smlal  v30.4s, v5.4h, v0.h[5]      \n"
                    "smlal2 v31.4s, v5.8h, v0.h[5]      \n"
                    "smlal  v28.4s, v6.4h, v0.h[6]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.8h, v5.8h}, [%1], #32   \n"
                    "smlal2 v29.4s, v6.8h, v0.h[6]      \n"
                    "subs   w4, w4, #1                  \n"
                    "smlal  v30.4s, v7.4h, v0.h[7]      \n"
                    "smlal2 v31.4s, v7.8h, v0.h[7]      \n"
                    "bne    2b                          \n"
                    // undo the look-ahead loads issued for an iteration that never ran
                    "sub    %1, %1, #32                 \n"
                    "sub    %2, %2, #16                 \n"
                    // fold the second accumulator pair into the result pair
                    "add    v30.4s, v30.4s, v28.4s      \n"
                    "add    v31.4s, v31.4s, v29.4s      \n"

                    "3:                                 \n"
                    "and    w4, %w6, #7                 \n" // w4 = remain = max_kk & 7
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // Tail: one k-step per iteration; ld1r replicates a single short of B
                    "4:                                 \n"
                    "ld1    {v4.8h}, [%1], #16          \n"
                    "ld1r   {v0.4h}, [%2], #2           \n"
                    "subs   w4, w4, #1                  \n"
                    "smlal  v30.4s, v4.4h, v0.h[0]      \n"
                    "smlal2 v31.4s, v4.8h, v0.h[0]      \n"
                    "bne    4b                          \n"

                    // store the 8 int32 sums and advance outptr by 32 bytes
                    "5:                                 \n"
                    "st1    {v30.4s, v31.4s}, [%0], #32 \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
                // Accumulators: q14/q15 = low/high half of A times B.
                // r4 is the loop counter.
                asm volatile(
                    "pld        [%1, #512]          \n"
                    "pld        [%2, #64]           \n"
                    // k == 0: clear the accumulators, otherwise reload them from outptr
                    "cmp        %7, #0              \n"
                    "beq        0f                  \n"

                    "vld1.s32   {d28-d31}, [%0]     \n"
                    "b          1f                  \n"

                    "0:                             \n"
                    "veor       q14, q14            \n"
                    "veor       q15, q15            \n"

                    "1:                             \n"
                    "lsr        r4, %6, #2          \n" // r4 = max_kk >> 2
                    "cmp        r4, #0              \n"
                    "beq        3f                  \n"

                    // Main loop handles 4 k-steps per iteration. Only the A stream is
                    // loaded ahead of the loop; B is loaded at the top of each iteration,
                    // so only the A pointer needs rewinding afterwards.
                    "vld1.s16   {d2-d5}, [%1]!      \n"
                    ".align 4                       \n"
                    "2:                             \n"
                    "pld        [%2, #64]           \n"
                    "vld1.s16   {d0}, [%2]!         \n"
                    "vmlal.s16  q14, d2, d0[0]      \n"
                    "vld1.s16   {d6-d9}, [%1]!      \n"
                    "vmlal.s16  q15, d3, d0[0]      \n"
                    "vmlal.s16  q14, d4, d0[1]      \n"
                    "vmlal.s16  q15, d5, d0[1]      \n"
                    "vmlal.s16  q14, d6, d0[2]      \n"
                    "pld        [%1, #512]          \n"
                    "vld1.s16   {d2-d5}, [%1]!      \n"
                    "vmlal.s16  q15, d7, d0[2]      \n"
                    "vmlal.s16  q14, d8, d0[3]      \n"
                    "subs       r4, r4, #1          \n"
                    "vmlal.s16  q15, d9, d0[3]      \n"
                    "bne        2b                  \n"
                    // undo the look-ahead A load issued for an iteration that never ran
                    "sub        %1, %1, #32         \n"

                    "3:                             \n"
                    "and        r4, %6, #3          \n" // r4 = remain = max_kk & 3
                    "cmp        r4, #0              \n"
                    "beq        5f                  \n"

                    // Tail: one k-step per iteration; {d2[]} replicates a single short of B
                    "4:                             \n"
                    "vld1.s16   {d0-d1}, [%1]!      \n"
                    "vld1.s16   {d2[]}, [%2]!       \n"
                    "subs       r4, r4, #1          \n"
                    "vmlal.s16  q14, d0, d2[0]      \n"
                    "vmlal.s16  q15, d1, d2[0]      \n"
                    "bne        4b                  \n"

                    // store the 8 int32 sums; the writeback advances outptr by 32 bytes
                    "5:                             \n"
                    "vst1.s32   {d28-d31}, [%0]!    \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB)      // %2
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "r"(max_kk), // %6
                    "r"(k)       // %7
                    : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                // Intrinsic fallback. _sum0/_sum1 = low/high half of A times the B value.
                int32x4_t _sum0;
                int32x4_t _sum1;

                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                    _sum1 = vdupq_n_s32(0);
                }
                else
                {
                    // resume from the partial sums already stored at outptr
                    _sum0 = vld1q_s32(outptr);
                    _sum1 = vld1q_s32(outptr + 4);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x8_t _pA = vld1q_s16(pA);
                    // load one short of B and replicate it across all four lanes
                    int16x4_t _pB = vld1_dup_s16(pB);
                    _sum0 = vmlal_s16(_sum0, vget_low_s16(_pA), _pB);
                    _sum1 = vmlal_s16(_sum1, vget_high_s16(_pA), _pB);
                    pA += 8;
                    pB += 1;
                }

                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                outptr += 8;
#endif // NCNN_GNU_INLINE_ASM
            }
        }
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        for (int b = 0; b < batch; b++)
        {
            const short* pAT = AT_tile.row<const short>(b) + max_kk * ii;
            const short* pB = BT_tile.row<const short>(b);

            int jj = 0;
#if __aarch64__
            for (; jj + 11 < max_jj; jj += 12)
            {
                // Twelve B values per k-step. Accumulator n collects the four shorts at
                // pA multiplied by B value n, summed over all max_kk k-steps, and the
                // twelve accumulators are written as 48 ints at outptr.
                const short* pA = pAT;

                // k == 0 starts every accumulator at zero; any other k resumes from the
                // partial sums already stored at outptr.
                const bool _from_zero = (k == 0);
                int32x4_t _acc0 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr);
                int32x4_t _acc1 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 4);
                int32x4_t _acc2 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 8);
                int32x4_t _acc3 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 12);
                int32x4_t _acc4 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 16);
                int32x4_t _acc5 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 20);
                int32x4_t _acc6 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 24);
                int32x4_t _acc7 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 28);
                int32x4_t _acc8 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 32);
                int32x4_t _acc9 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 36);
                int32x4_t _acc10 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 40);
                int32x4_t _acc11 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 44);

                for (int kk = 0; kk < max_kk; kk++)
                {
                    // 4 shorts of A, then 8 + 4 shorts of B for this k-step
                    const int16x4_t _a = vld1_s16(pA);
                    const int16x8_t _b_lo8 = vld1q_s16(pB);
                    const int16x4_t _b_hi4 = vld1_s16(pB + 8);
                    pA += 4;
                    pB += 12;

                    // B values 0..7 come from the 8-lane vector
                    _acc0 = vmlal_laneq_s16(_acc0, _a, _b_lo8, 0);
                    _acc1 = vmlal_laneq_s16(_acc1, _a, _b_lo8, 1);
                    _acc2 = vmlal_laneq_s16(_acc2, _a, _b_lo8, 2);
                    _acc3 = vmlal_laneq_s16(_acc3, _a, _b_lo8, 3);
                    _acc4 = vmlal_laneq_s16(_acc4, _a, _b_lo8, 4);
                    _acc5 = vmlal_laneq_s16(_acc5, _a, _b_lo8, 5);
                    _acc6 = vmlal_laneq_s16(_acc6, _a, _b_lo8, 6);
                    _acc7 = vmlal_laneq_s16(_acc7, _a, _b_lo8, 7);

                    // B values 8..11 come from the 4-lane vector
                    _acc8 = vmlal_lane_s16(_acc8, _a, _b_hi4, 0);
                    _acc9 = vmlal_lane_s16(_acc9, _a, _b_hi4, 1);
                    _acc10 = vmlal_lane_s16(_acc10, _a, _b_hi4, 2);
                    _acc11 = vmlal_lane_s16(_acc11, _a, _b_hi4, 3);
                }

                // write back all twelve accumulators and step to the next output slot
                vst1q_s32(outptr, _acc0);
                vst1q_s32(outptr + 4, _acc1);
                vst1q_s32(outptr + 8, _acc2);
                vst1q_s32(outptr + 12, _acc3);
                vst1q_s32(outptr + 16, _acc4);
                vst1q_s32(outptr + 20, _acc5);
                vst1q_s32(outptr + 24, _acc6);
                vst1q_s32(outptr + 28, _acc7);
                vst1q_s32(outptr + 32, _acc8);
                vst1q_s32(outptr + 36, _acc9);
                vst1q_s32(outptr + 40, _acc10);
                vst1q_s32(outptr + 44, _acc11);
                outptr += 48;
            }
            for (; jj + 7 < max_jj; jj += 8)
            {
                // Eight B values per k-step. Accumulator n collects the four shorts at
                // pA multiplied by B value n, summed over all max_kk k-steps, and the
                // eight accumulators are written as 32 ints at outptr.
                const short* pA = pAT;

                // k == 0 starts every accumulator at zero; any other k resumes from the
                // partial sums already stored at outptr.
                const bool _from_zero = (k == 0);
                int32x4_t _acc0 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr);
                int32x4_t _acc1 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 4);
                int32x4_t _acc2 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 8);
                int32x4_t _acc3 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 12);
                int32x4_t _acc4 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 16);
                int32x4_t _acc5 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 20);
                int32x4_t _acc6 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 24);
                int32x4_t _acc7 = _from_zero ? vdupq_n_s32(0) : vld1q_s32(outptr + 28);

                for (int kk = 0; kk < max_kk; kk++)
                {
                    // 4 shorts of A and 8 shorts of B for this k-step
                    const int16x4_t _a = vld1_s16(pA);
                    const int16x8_t _b = vld1q_s16(pB);
                    pA += 4;
                    pB += 8;

                    _acc0 = vmlal_laneq_s16(_acc0, _a, _b, 0);
                    _acc1 = vmlal_laneq_s16(_acc1, _a, _b, 1);
                    _acc2 = vmlal_laneq_s16(_acc2, _a, _b, 2);
                    _acc3 = vmlal_laneq_s16(_acc3, _a, _b, 3);
                    _acc4 = vmlal_laneq_s16(_acc4, _a, _b, 4);
                    _acc5 = vmlal_laneq_s16(_acc5, _a, _b, 5);
                    _acc6 = vmlal_laneq_s16(_acc6, _a, _b, 6);
                    _acc7 = vmlal_laneq_s16(_acc7, _a, _b, 7);
                }

                // write back all eight accumulators and step to the next output slot
                vst1q_s32(outptr, _acc0);
                vst1q_s32(outptr + 4, _acc1);
                vst1q_s32(outptr + 8, _acc2);
                vst1q_s32(outptr + 12, _acc3);
                vst1q_s32(outptr + 16, _acc4);
                vst1q_s32(outptr + 20, _acc5);
                vst1q_s32(outptr + 24, _acc6);
                vst1q_s32(outptr + 28, _acc7);
                outptr += 32;
            }
#endif // __aarch64__
            for (; jj + 5 < max_jj; jj += 6)
            {
                const short* pA = pAT;

                int32x4_t _sum0;
                int32x4_t _sum1;
                int32x4_t _sum2;
                int32x4_t _sum3;
                int32x4_t _sum4;
                int32x4_t _sum5;

                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                    _sum1 = vdupq_n_s32(0);
                    _sum2 = vdupq_n_s32(0);
                    _sum3 = vdupq_n_s32(0);
                    _sum4 = vdupq_n_s32(0);
                    _sum5 = vdupq_n_s32(0);
                }
                else
                {
                    _sum0 = vld1q_s32(outptr);
                    _sum1 = vld1q_s32(outptr + 4);
                    _sum2 = vld1q_s32(outptr + 8);
                    _sum3 = vld1q_s32(outptr + 12);
                    _sum4 = vld1q_s32(outptr + 16);
                    _sum5 = vld1q_s32(outptr + 20);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x4_t _pA = vld1_s16(pA);
                    int16x8_t _pB = vld1q_s16(pB);
                    _sum0 = vmlal_lane_s16(_sum0, _pA, vget_low_s16(_pB), 0);
                    _sum1 = vmlal_lane_s16(_sum1, _pA, vget_low_s16(_pB), 1);
                    _sum2 = vmlal_lane_s16(_sum2, _pA, vget_low_s16(_pB), 2);
                    _sum3 = vmlal_lane_s16(_sum3, _pA, vget_low_s16(_pB), 3);
                    _sum4 = vmlal_lane_s16(_sum4, _pA, vget_high_s16(_pB), 0);
                    _sum5 = vmlal_lane_s16(_sum5, _pA, vget_high_s16(_pB), 1);
                    pA += 4;
                    pB += 6;
                }

                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                vst1q_s32(outptr + 8, _sum2);
                vst1q_s32(outptr + 12, _sum3);
                vst1q_s32(outptr + 16, _sum4);
                vst1q_s32(outptr + 20, _sum5);
                outptr += 24;
            }
            for (; jj + 3 < max_jj; jj += 4)
            {
                const short* pA = pAT;

                int32x4_t _sum0;
                int32x4_t _sum1;
                int32x4_t _sum2;
                int32x4_t _sum3;

                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                    _sum1 = vdupq_n_s32(0);
                    _sum2 = vdupq_n_s32(0);
                    _sum3 = vdupq_n_s32(0);
                }
                else
                {
                    _sum0 = vld1q_s32(outptr);
                    _sum1 = vld1q_s32(outptr + 4);
                    _sum2 = vld1q_s32(outptr + 8);
                    _sum3 = vld1q_s32(outptr + 12);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x4_t _pA = vld1_s16(pA);
                    int16x4_t _pB = vld1_s16(pB);
                    _sum0 = vmlal_lane_s16(_sum0, _pA, _pB, 0);
                    _sum1 = vmlal_lane_s16(_sum1, _pA, _pB, 1);
                    _sum2 = vmlal_lane_s16(_sum2, _pA, _pB, 2);
                    _sum3 = vmlal_lane_s16(_sum3, _pA, _pB, 3);
                    pA += 4;
                    pB += 4;
                }

                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                vst1q_s32(outptr + 8, _sum2);
                vst1q_s32(outptr + 12, _sum3);
                outptr += 16;
            }
            for (; jj + 1 < max_jj; jj += 2)
            {
                const short* pA = pAT;

                int32x4_t _sum0;
                int32x4_t _sum1;

                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                    _sum1 = vdupq_n_s32(0);
                }
                else
                {
                    _sum0 = vld1q_s32(outptr);
                    _sum1 = vld1q_s32(outptr + 4);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x4_t _pA = vld1_s16(pA);
                    int16x4_t _pB0 = vdup_n_s16(pB[0]);
                    int16x4_t _pB1 = vdup_n_s16(pB[1]);
                    _sum0 = vmlal_s16(_sum0, _pA, _pB0);
                    _sum1 = vmlal_s16(_sum1, _pA, _pB1);
                    pA += 4;
                    pB += 2;
                }

                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                outptr += 8;
            }
            for (; jj < max_jj; jj++)
            {
                const short* pA = pAT;

                int32x4_t _sum0;

                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                }
                else
                {
                    _sum0 = vld1q_s32(outptr);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x4_t _pA = vld1_s16(pA);
                    int16x4_t _pB = vld1_dup_s16(pB);
                    _sum0 = vmlal_s16(_sum0, _pA, _pB);
                    pA += 4;
                    pB += 1;
                }

                vst1q_s32(outptr, _sum0);
                outptr += 4;
            }
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        for (int b = 0; b < batch; b++)
        {
            const short* pAT = AT_tile.row<const short>(b) + max_kk * ii;
            const short* pB = BT_tile.row<const short>(b);

            int jj = 0;
#if __ARM_NEON
#if __aarch64__
            for (; jj + 11 < max_jj; jj += 12)
            {
                const short* pA = pAT;

                int32x4_t _sum0;
                int32x4_t _sum1;
                int32x4_t _sum2;
                int32x4_t _sum3;
                int32x4_t _sum4;
                int32x4_t _sum5;

                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                    _sum1 = vdupq_n_s32(0);
                    _sum2 = vdupq_n_s32(0);
                    _sum3 = vdupq_n_s32(0);
                    _sum4 = vdupq_n_s32(0);
                    _sum5 = vdupq_n_s32(0);
                }
                else
                {
                    int32x4x2_t _s01 = vld2q_s32(outptr);
                    int32x4x2_t _s23 = vld2q_s32(outptr + 8);
                    int32x4x2_t _s45 = vld2q_s32(outptr + 16);
                    _sum0 = _s01.val[0];
                    _sum3 = _s01.val[1];
                    _sum1 = _s23.val[0];
                    _sum4 = _s23.val[1];
                    _sum2 = _s45.val[0];
                    _sum5 = _s45.val[1];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x4_t _pA0 = vdup_n_s16(pA[0]);
                    int16x4_t _pA1 = vdup_n_s16(pA[1]);
                    int16x8_t _pB = vld1q_s16(pB);
                    int16x4_t _pB2 = vld1_s16(pB + 8);
                    _sum0 = vmlal_s16(_sum0, _pA0, vget_low_s16(_pB));
                    _sum1 = vmlal_s16(_sum1, _pA0, vget_high_s16(_pB));
                    _sum2 = vmlal_s16(_sum2, _pA0, _pB2);
                    _sum3 = vmlal_s16(_sum3, _pA1, vget_low_s16(_pB));
                    _sum4 = vmlal_s16(_sum4, _pA1, vget_high_s16(_pB));
                    _sum5 = vmlal_s16(_sum5, _pA1, _pB2);
                    pA += 2;
                    pB += 12;
                }

                int32x4x2_t _s01;
                _s01.val[0] = _sum0;
                _s01.val[1] = _sum3;
                int32x4x2_t _s23;
                _s23.val[0] = _sum1;
                _s23.val[1] = _sum4;
                int32x4x2_t _s45;
                _s45.val[0] = _sum2;
                _s45.val[1] = _sum5;
                vst2q_s32(outptr, _s01);
                vst2q_s32(outptr + 8, _s23);
                vst2q_s32(outptr + 16, _s45);
                outptr += 24;
            }
            for (; jj + 7 < max_jj; jj += 8)
            {
                const short* pA = pAT;

                int32x4_t _sum0;
                int32x4_t _sum1;
                int32x4_t _sum2;
                int32x4_t _sum3;

                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                    _sum1 = vdupq_n_s32(0);
                    _sum2 = vdupq_n_s32(0);
                    _sum3 = vdupq_n_s32(0);
                }
                else
                {
                    int32x4x2_t _s01 = vld2q_s32(outptr);
                    int32x4x2_t _s23 = vld2q_s32(outptr + 8);
                    _sum0 = _s01.val[0];
                    _sum2 = _s01.val[1];
                    _sum1 = _s23.val[0];
                    _sum3 = _s23.val[1];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x4_t _pA0 = vdup_n_s16(pA[0]);
                    int16x4_t _pA1 = vdup_n_s16(pA[1]);
                    int16x8_t _pB = vld1q_s16(pB);
                    _sum0 = vmlal_s16(_sum0, _pA0, vget_low_s16(_pB));
                    _sum1 = vmlal_s16(_sum1, _pA0, vget_high_s16(_pB));
                    _sum2 = vmlal_s16(_sum2, _pA1, vget_low_s16(_pB));
                    _sum3 = vmlal_s16(_sum3, _pA1, vget_high_s16(_pB));
                    pA += 2;
                    pB += 8;
                }

                int32x4x2_t _s01;
                _s01.val[0] = _sum0;
                _s01.val[1] = _sum2;
                int32x4x2_t _s23;
                _s23.val[0] = _sum1;
                _s23.val[1] = _sum3;
                vst2q_s32(outptr, _s01);
                vst2q_s32(outptr + 8, _s23);
                outptr += 16;
            }
#endif // __aarch64__
            for (; jj + 5 < max_jj; jj += 6)
            {
                const short* pA = pAT;

                int32x4_t _sum0;
                int32x4_t _sum1;
                int32x4_t _sum2;

                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                    _sum1 = vdupq_n_s32(0);
                    _sum2 = vdupq_n_s32(0);
                }
                else
                {
                    int32x4x2_t _s01 = vld2q_s32(outptr);
                    _sum0 = _s01.val[0];
                    _sum1 = _s01.val[1];
                    _sum2 = vld1q_s32(outptr + 8);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x4_t _pA = vreinterpret_s16_s32(vld1_dup_s32((const int*)pA));
                    int16x8_t _pB = vld1q_s16(pB);
                    int16x4_t _pB2 = vzip_s16(vget_high_s16(_pB), vget_high_s16(_pB)).val[0];
                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_pB), _pA, 0);
                    _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_pB), _pA, 1);
                    _sum2 = vmlal_s16(_sum2, _pA, _pB2);
                    pA += 2;
                    pB += 6;
                }

                int32x4x2_t _s01;
                _s01.val[0] = _sum0;
                _s01.val[1] = _sum1;
                vst2q_s32(outptr, _s01);
                vst1q_s32(outptr + 8, _sum2);
                outptr += 12;
            }
            for (; jj + 3 < max_jj; jj += 4)
            {
                const short* pA = pAT;

                int32x4_t _sum0;
                int32x4_t _sum1;

                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                    _sum1 = vdupq_n_s32(0);
                }
                else
                {
                    int32x4x2_t _s01 = vld2q_s32(outptr);
                    _sum0 = _s01.val[0];
                    _sum1 = _s01.val[1];
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x4_t _pA0 = vdup_n_s16(pA[0]);
                    int16x4_t _pA1 = vdup_n_s16(pA[1]);
                    int16x4_t _pB = vld1_s16(pB);
                    _sum0 = vmlal_s16(_sum0, _pA0, _pB);
                    _sum1 = vmlal_s16(_sum1, _pA1, _pB);
                    pA += 2;
                    pB += 4;
                }

                int32x4x2_t _s01;
                _s01.val[0] = _sum0;
                _s01.val[1] = _sum1;
                vst2q_s32(outptr, _s01);
                outptr += 8;
            }
#endif // __ARM_NEON
            for (; jj + 1 < max_jj; jj += 2)
            {
                const short* pA = pAT;

                int sum00 = 0;
                int sum01 = 0;
                int sum10 = 0;
                int sum11 = 0;

                if (k == 0)
                {
                    sum00 = 0;
                    sum01 = 0;
                    sum10 = 0;
                    sum11 = 0;
                }
                else
                {
                    sum00 = outptr[0];
                    sum01 = outptr[1];
                    sum10 = outptr[2];
                    sum11 = outptr[3];
                }

                int kk = 0;
#if !__ARM_NEON && __ARM_FEATURE_SIMD32 && NCNN_GNU_INLINE_ASM
                for (; kk + 1 < max_kk; kk += 2)
                {
                    // fomit-frame-pointer implied in optimized flag spare one register
                    // let us stay away from error: ‘asm’ operand has impossible constraints   --- nihui
#if __OPTIMIZE__
                    asm volatile(
                        "ldr    r2, [%0], #4    \n" // int16x2_t _pA0 = *((int16x2_t*)pA); pA += 2;
                        "ldr    r3, [%0], #4    \n" // int16x2_t _pA1 = *((int16x2_t*)pA); pA += 2;
                        "ldr    r4, [%1], #4    \n" // int16x2_t _pB0 = *((int16x2_t*)pB); pB += 2;
                        "ldr    r5, [%1], #4    \n" // int16x2_t _pB1 = *((int16x2_t*)pB); pB += 2;
                        "smlad  %2, r2, r4, %2  \n" // sum00 = __smlad(_pA0, _pB0, sum00);
                        "smlad  %3, r3, r4, %3  \n" // sum01 = __smlad(_pA1, _pB0, sum01);
                        "smlad  %4, r2, r5, %4  \n" // sum10 = __smlad(_pA0, _pB1, sum10);
                        "smlad  %5, r3, r5, %5  \n" // sum11 = __smlad(_pA1, _pB1, sum11);
                        : "=r"(pA),
                        "=r"(pB),
                        "=r"(sum00),
                        "=r"(sum01),
                        "=r"(sum10),
                        "=r"(sum11)
                        : "0"(pA),
                        "1"(pB),
                        "2"(sum00),
                        "3"(sum01),
                        "4"(sum10),
                        "5"(sum11)
                        : "memory", "r2", "r3", "r4", "r5");
#else
                    int _pA0 = *((int*)pA);
                    int _pA1 = *((int*)(pA + 2));
                    int _pB0 = *((int*)pB);
                    int _pB1 = *((int*)(pB + 2));
                    asm volatile("smlad %0, %2, %3, %0"
                                 : "=r"(sum00)
                                 : "0"(sum00), "r"(_pA0), "r"(_pB0)
                                 :);
                    asm volatile("smlad %0, %2, %3, %0"
                                 : "=r"(sum01)
                                 : "0"(sum01), "r"(_pA1), "r"(_pB0)
                                 :);
                    asm volatile("smlad %0, %2, %3, %0"
                                 : "=r"(sum10)
                                 : "0"(sum10), "r"(_pA0), "r"(_pB1)
                                 :);
                    asm volatile("smlad %0, %2, %3, %0"
                                 : "=r"(sum11)
                                 : "0"(sum11), "r"(_pA1), "r"(_pB1)
                                 :);
                    pA += 4;
                    pB += 4;
#endif
                }
#endif // !__ARM_NEON && __ARM_FEATURE_SIMD32 && NCNN_GNU_INLINE_ASM
                for (; kk < max_kk; kk++)
                {
                    sum00 += pA[0] * pB[0];
                    sum01 += pA[1] * pB[0];
                    sum10 += pA[0] * pB[1];
                    sum11 += pA[1] * pB[1];
                    pA += 2;
                    pB += 2;
                }

                outptr[0] = sum00;
                outptr[1] = sum01;
                outptr[2] = sum10;
                outptr[3] = sum11;
                outptr += 2 * 2;
            }
            for (; jj < max_jj; jj++)
            {
                const short* pA = pAT;

                int sum0 = 0;
                int sum1 = 0;

                if (k == 0)
                {
                    sum0 = 0;
                    sum1 = 0;
                }
                else
                {
                    sum0 = outptr[0];
                    sum1 = outptr[1];
                }

                int kk = 0;
#if !__ARM_NEON && __ARM_FEATURE_SIMD32 && NCNN_GNU_INLINE_ASM
                for (; kk + 1 < max_kk; kk += 2)
                {
                    asm volatile(
                        "ldr    r2, [%0], #4    \n" // int16x2_t _pA0 = *((int16x2_t*)pA); pA += 2;
                        "ldr    r3, [%0], #4    \n" // int16x2_t _pA1 = *((int16x2_t*)pA); pA += 2;
                        "ldr    r4, [%1], #4    \n" // int16x2_t _pB = *((int16x2_t*)pB); pB += 2;
                        "smlad  %2, r2, r4, %2  \n" // sum0 = __smlad(_pA0, _pB, sum0);
                        "smlad  %3, r3, r4, %3  \n" // sum1 = __smlad(_pA1, _pB, sum1);
                        : "=r"(pA),
                        "=r"(pB),
                        "=r"(sum0),
                        "=r"(sum1)
                        : "0"(pA),
                        "1"(pB),
                        "2"(sum0),
                        "3"(sum1)
                        : "memory", "r2", "r3", "r4");
                }
#endif // !__ARM_NEON && __ARM_FEATURE_SIMD32 && NCNN_GNU_INLINE_ASM
                for (; kk < max_kk; kk++)
                {
                    sum0 += pA[0] * pB[0];
                    sum1 += pA[1] * pB[0];
                    pA += 2;
                    pB += 1;
                }

                outptr[0] = sum0;
                outptr[1] = sum1;
                outptr += 2;
            }
        }
    }
    for (; ii < max_ii; ii++)
    {
        for (int b = 0; b < batch; b++)
        {
            const short* pAT = AT_tile.row<const short>(b) + max_kk * ii;
            const short* pB = BT_tile.row<const short>(b);

            int jj = 0;
#if __ARM_NEON
#if __aarch64__
            for (; jj + 11 < max_jj; jj += 12)
            {
                const short* pA = pAT;

                int32x4_t _sum0;
                int32x4_t _sum1;
                int32x4_t _sum2;

                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                    _sum1 = vdupq_n_s32(0);
                    _sum2 = vdupq_n_s32(0);
                }
                else
                {
                    _sum0 = vld1q_s32(outptr);
                    _sum1 = vld1q_s32(outptr + 4);
                    _sum2 = vld1q_s32(outptr + 8);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x4_t _pA = vld1_dup_s16(pA);
                    int16x8_t _pB = vld1q_s16(pB);
                    int16x4_t _pB2 = vld1_s16(pB + 8);
                    _sum0 = vmlal_s16(_sum0, _pA, vget_low_s16(_pB));
                    _sum1 = vmlal_s16(_sum1, _pA, vget_high_s16(_pB));
                    _sum2 = vmlal_s16(_sum2, _pA, _pB2);
                    pA += 1;
                    pB += 12;
                }

                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                vst1q_s32(outptr + 8, _sum2);
                outptr += 12;
            }
            for (; jj + 7 < max_jj; jj += 8)
            {
                const short* pA = pAT;

                int32x4_t _sum0;
                int32x4_t _sum1;

                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                    _sum1 = vdupq_n_s32(0);
                }
                else
                {
                    _sum0 = vld1q_s32(outptr);
                    _sum1 = vld1q_s32(outptr + 4);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x4_t _pA = vld1_dup_s16(pA);
                    int16x8_t _pB = vld1q_s16(pB);
                    _sum0 = vmlal_s16(_sum0, _pA, vget_low_s16(_pB));
                    _sum1 = vmlal_s16(_sum1, _pA, vget_high_s16(_pB));
                    pA += 1;
                    pB += 8;
                }

                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                outptr += 8;
            }
#endif // __aarch64__
            for (; jj + 5 < max_jj; jj += 6)
            {
                const short* pA = pAT;

                int32x4_t _sum0;
                int32x4_t _sum1;

                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                    _sum1 = vdupq_n_s32(0);
                }
                else
                {
                    _sum0 = vld1q_s32(outptr);
                    _sum1 = vld1q_s32(outptr + 4);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x4_t _pA = vld1_dup_s16(pA);
                    int16x8_t _pB = vld1q_s16(pB);
                    _sum0 = vmlal_s16(_sum0, _pA, vget_low_s16(_pB));
                    _sum1 = vmlal_s16(_sum1, _pA, vget_high_s16(_pB));
                    pA += 1;
                    pB += 6;
                }

                vst1q_s32(outptr, _sum0);
                vst1_s32(outptr + 4, vget_low_s32(_sum1));
                outptr += 6;
            }
            for (; jj + 3 < max_jj; jj += 4)
            {
                const short* pA = pAT;

                int32x4_t _sum0;

                if (k == 0)
                {
                    _sum0 = vdupq_n_s32(0);
                }
                else
                {
                    _sum0 = vld1q_s32(outptr);
                }

                int kk = 0;
                for (; kk < max_kk; kk++)
                {
                    int16x4_t _pA = vld1_dup_s16(pA);
                    int16x4_t _pB = vld1_s16(pB);
                    _sum0 = vmlal_s16(_sum0, _pA, _pB);
                    pA += 1;
                    pB += 4;
                }

                vst1q_s32(outptr, _sum0);
                outptr += 4;
            }
#endif // __ARM_NEON
            for (; jj + 1 < max_jj; jj += 2)
            {
                const short* pA = pAT;

                int sum0 = 0;
                int sum1 = 0;

                if (k == 0)
                {
                    sum0 = 0;
                    sum1 = 0;
                }
                else
                {
                    sum0 = outptr[0];
                    sum1 = outptr[1];
                }

                int kk = 0;
#if !__ARM_NEON && __ARM_FEATURE_SIMD32 && NCNN_GNU_INLINE_ASM
                for (; kk + 1 < max_kk; kk += 2)
                {
                    asm volatile(
                        "ldr    r2, [%0], #4    \n" // int16x2_t _pA = *((int16x2_t*)pA); pA += 2;
                        "ldr    r3, [%1], #4    \n" // int16x2_t _pB0 = *((int16x2_t*)pB); pB += 2;
                        "ldr    r4, [%1], #4    \n" // int16x2_t _pB1 = *((int16x2_t*)pB); pB += 2;
                        "smlad  %2, r2, r3, %2  \n" // sum0 = __smlad(_pA, _pB0, sum0);
                        "smlad  %3, r2, r4, %3  \n" // sum1 = __smlad(_pA, _pB1, sum1);
                        : "=r"(pA),
                        "=r"(pB),
                        "=r"(sum0),
                        "=r"(sum1)
                        : "0"(pA),
                        "1"(pB),
                        "2"(sum0),
                        "3"(sum1)
                        : "memory", "r2", "r3", "r4");
                }
#endif // !__ARM_NEON && __ARM_FEATURE_SIMD32 && NCNN_GNU_INLINE_ASM
                for (; kk < max_kk; kk++)
                {
                    sum0 += pA[0] * pB[0];
                    sum1 += pA[0] * pB[1];
                    pA += 1;
                    pB += 2;
                }

                outptr[0] = sum0;
                outptr[1] = sum1;
                outptr += 2;
            }
            for (; jj < max_jj; jj++)
            {
                const short* pA = pAT;

                int sum = 0;

                if (k == 0)
                {
                    sum = 0;
                }
                else
                {
                    sum = outptr[0];
                }

                int kk = 0;
#if !__ARM_NEON && __ARM_FEATURE_SIMD32 && NCNN_GNU_INLINE_ASM
                for (; kk + 1 < max_kk; kk += 2)
                {
                    asm volatile(
                        "ldr    r2, [%0], #4    \n" // int16x2_t _pA = *((int16x2_t*)pA); pA += 2;
                        "ldr    r3, [%1], #4    \n" // int16x2_t _pB = *((int16x2_t*)pB); pB += 2;
                        "smlad  %2, r2, r3, %2  \n" // sum = __smlad(_pA, _pB, sum);
                        : "=r"(pA),
                        "=r"(pB),
                        "=r"(sum)
                        : "0"(pA),
                        "1"(pB),
                        "2"(sum)
                        : "memory", "r2", "r3");
                }
#endif // !__ARM_NEON && __ARM_FEATURE_SIMD32 && NCNN_GNU_INLINE_ASM
                for (; kk < max_kk; kk++)
                {
                    sum += pA[0] * pB[0];
                    pA += 1;
                    pB += 1;
                }

                outptr[0] = sum;
                outptr += 1;
            }
        }
    }
}

// Pick GEMM tile sizes (TILE_M, TILE_N, TILE_K) for the int8 winograd path
// from the L2 cache capacity and the thread count.
//
//   M, N, K  problem sizes; TILE_N is only computed when N > 0 and is left
//            untouched otherwise
//   TILE_M, TILE_N, TILE_K  outputs
//   nT       number of threads; 0 means "use get_physical_big_cpu_count()"
static void get_optimal_tile_mnk_int8(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
    // Per-architecture tuning constants.
    //   k_reserve / k_divisor : tile_size for K is (cache - k_reserve) / k_divisor
    //                           (presumably accumulator block size and operand
    //                           columns of the micro kernel -- TODO confirm)
    //   k_unit / m_unit / n_unit : granularity each tile size is rounded to,
    //                              which is also its lower bound
#if __aarch64__
    const int k_reserve = 32;
    const int k_divisor = 12;
    const int k_unit = 8;
#elif __ARM_NEON
    const int k_reserve = 32;
    const int k_divisor = 6;
    const int k_unit = 4;
#else
    const int k_reserve = 2;
    const int k_divisor = 3;
    const int k_unit = 2;
#endif

#if __ARM_NEON
    const int m_unit = 8;
#else
    const int m_unit = 2;
#endif

#if __aarch64__ || __ARM_NEON
    const int n_unit = 4;
#else
    const int n_unit = 1;
#endif

    // L2 capacity expressed in 16-bit elements, the storage type of the tiles
    const int l2_cache_size_int8 = (int)(get_cpu_level2_cache_size() / sizeof(short));

    if (nT == 0)
        nT = get_physical_big_cpu_count();

    // NOTE the batch count is deliberately not taken into account here;
    // doing so was observed to be slower on arm in practice.

    // ---- K: try not to split K ----
    {
        const int budget = (l2_cache_size_int8 - k_reserve) / k_divisor;

        TILE_K = std::max(k_unit, budget / k_unit * k_unit);

        // spread K evenly over the resulting number of tiles, rounded up to k_unit
        const int nn_K = (K + TILE_K - 1) / TILE_K;
        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + (k_unit - 1)) / k_unit * k_unit);
    }

    // ---- M: one m_unit block per usable thread, then balance ----
    {
        TILE_M = m_unit;
        TILE_M *= std::min(nT, get_physical_cpu_count());

        const int nn_M = (M + TILE_M - 1) / TILE_M;
        TILE_M = std::min(TILE_M, ((M + nn_M - 1) / nn_M + (m_unit - 1)) / m_unit * m_unit);

        if (nT > 1)
        {
            // shrink so that the M range can be divided among the threads
            TILE_M = std::min(TILE_M, (std::max(1, TILE_M / nT) + (m_unit - 1)) / m_unit * m_unit);
        }

        TILE_M = std::max(m_unit, TILE_M);
    }

    // ---- N: only solved on request ----
    if (N <= 0)
        return;

    // when K is not split only the A tile is subtracted from the budget,
    // otherwise the divisor additionally carries a TILE_M * 2 term
    const int n_divisor = (TILE_K >= K) ? TILE_K : (TILE_M * 2 + TILE_K);
    const int n_budget = (l2_cache_size_int8 - TILE_M * TILE_K) / n_divisor;

    TILE_N = std::max(n_unit, n_budget / n_unit * n_unit);

    const int nn_N = (N + TILE_N - 1) / TILE_N;
    TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + (n_unit - 1)) / n_unit * n_unit);

    TILE_N = std::max(n_unit, TILE_N);
}

// Transform a tile of 3x3 int8 kernels into 4x4 int16 blocks for winograd F(2,3).
//
//   kernel  raw int8 weights, read at offset (outch * inch + inch_index) * 9
//   A       destination; receives 16 shorts per kernel, written consecutively
//           with the output-channel index outermost and the input-channel
//           index innermost
//   inch    total number of input channels (stride between output channels)
//   i, max_ii  first output channel of the tile and number of output channels
//   k, max_kk  first input channel of the tile and number of input channels
static inline void conv3x3s1_winograd23_transform_kernel_tile_int8(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
{
    // The 1-D transform applied in both passes:
    // const signed char ktm[4][3] = {
    //     {2, 0, 0},
    //     {1, 1, 1},
    //     {1, -1, 1},
    //     {0, 0, 2}
    // };

    const signed char* weights = (const signed char*)kernel;
    short* outp = A;

    for (int ii = 0; ii < max_ii; ii++)
    {
        for (int kk = 0; kk < max_kk; kk++)
        {
            const signed char* g = weights + (i + ii) * inch * 9 + (k + kk) * 9;

            // first pass: transform the 3 taps of each kernel row,
            // storing the 4 results of row `row` in column `row`
            short half[4][3];
            for (int row = 0; row < 3; row++)
            {
                const signed char a = g[row * 3];
                const signed char b = g[row * 3 + 1];
                const signed char c = g[row * 3 + 2];

                half[0][row] = a * 2;
                half[1][row] = a + b + c;
                half[2][row] = a - b + c;
                half[3][row] = c * 2;
            }

            // second pass: transform across the 3 kernel rows and emit 4 shorts per line
            for (int m = 0; m < 4; m++)
            {
                const short a = half[m][0];
                const short b = half[m][1];
                const short c = half[m][2];

                outp[0] = a * 2;
                outp[1] = a + b + c;
                outp[2] = a - b + c;
                outp[3] = c * 2;
                outp += 4;
            }
        }
    }
}

// Transform all 3x3 int8 kernels for winograd F(2,3) and store them, tile by
// tile, in the packed layout produced by pack_A_tile_int8.
//
//   kernel  raw int8 weights (outch * inch * 9 values)
//   AT      output; (re)created here with one channel per M tile and one
//           depth slice per K tile
//   inch, outch  channel counts
//   opt     supplies num_threads for tiling, scratch sizing and the OpenMP loop
static void conv3x3s1_winograd23_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt)
{
    const int M = outch;
    const int K = inch;
    const int B = 16; // shorts per transformed kernel

    // N is passed as 0, so TILE_N is not solved and stays unused
    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_int8(M, 0, K, TILE_M, TILE_N, TILE_K, opt.num_threads);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // per-thread scratch holding one untransposed tile
    Mat A_tileX(B * TILE_M * TILE_K, 1, opt.num_threads, 2u, (Allocator*)0);

    AT.create(TILE_K * TILE_M, B, nn_K, nn_M, 2u, (Allocator*)0);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int ppj = 0; ppj < nn_M; ppj++)
    {
        const int i = ppj * TILE_M;
        const int max_ii = std::min((M - i), TILE_M);

        Mat A_tile = A_tileX.channel(get_omp_thread_num());

        for (int ppk = 0; ppk < nn_K; ppk++)
        {
            const int k = ppk * TILE_K;
            const int max_kk = std::min((K - k), TILE_K);

            // transform into the scratch tile, then pack into its slot of AT
            conv3x3s1_winograd23_transform_kernel_tile_int8(kernel, A_tile, inch, i, max_ii, k, max_kk);

            Mat AT_tile = AT.channel(ppj).depth(ppk);

            pack_A_tile_int8(A_tile, AT_tile, B, max_ii, max_kk);
        }
    }
}

// Winograd F(2,3) input transform for a block of tiles and channels (int8 -> int16).
//
// For every tile index in [j, j + max_jj) and every input channel in
// [k, k + max_kk), a 4x4 patch is read from bottom_blob at row ti * 2 and
// column tj * 2, the 4x4 matrix itm (listed below) is applied along both
// patch axes, and the 16 results are written to B as shorts. Patch elements
// whose row is >= h or whose column is >= w are taken as zero.
//
// Layout written to B, per channel group of width P (8, 2 or 1) that starts
// at channel offset kk:
//   base offset  kk * max_jj * 16 shorts
//   16 planes of max_jj * P shorts each
//   tile jj occupies P consecutive shorts at offset jj * P inside each plane
//
// nT is the thread count used for the OpenMP loops over channel groups.
static inline void conv3x3s1_winograd23_transform_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
{
    // const signed char itm[4][4] = {
    //     {1,  0, -1,  0},
    //     {0,  1,  1,  0},
    //     {0, -1,  1,  0},
    //     {0, -1,  0,  1}
    // };

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int elempack = bottom_blob.elempack;
    // distance, in bytes, between the same pixel of two consecutive scalar channels
    const size_t N = bottom_blob.cstep * elempack;

    // number of tiles per row
    // NOTE(review): presumably the caller pads the input to width 2 * w_tiles + 2 - confirm
    const int w_tiles = (w - 1) / 2;

    int nn_max_kk = 0;
    int remain_max_kk_start = 0;
#if __ARM_NEON
    // ---- 8 channels at a time (NEON) ----
    nn_max_kk = max_kk / 8;
    #pragma omp parallel for num_threads(nT)
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 8;

        // first-pass result, indexed [transformed column][patch row][channel lane]
        short tmp[4][4][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            // tile coordinates in the tile grid
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const signed char* r0 = bottom_blob.channel((k + kk) / elempack).row<const signed char>(ti * 2) + (tj * 2) * elempack;

            // first pass: combine the 4 columns of every patch row
            for (int m = 0; m < 4; m++)
            {
                // columns default to zero so out-of-range positions contribute nothing
                int8x8_t _r0 = vdup_n_s8(0);
                int8x8_t _r1 = vdup_n_s8(0);
                int8x8_t _r2 = vdup_n_s8(0);
                int8x8_t _r3 = vdup_n_s8(0);

                if (ti * 2 + m < h)
                {
                    if (elempack == 8)
                    {
                        // 8 channels are interleaved per pixel: one 8-byte load per column
                        _r0 = vld1_s8(r0);
                        if (tj * 2 + 1 < w) _r1 = vld1_s8(r0 + 8);
                        if (tj * 2 + 2 < w) _r2 = vld1_s8(r0 + 16);
                        if (tj * 2 + 3 < w) _r3 = vld1_s8(r0 + 24);
                    }
                    if (elempack == 1)
                    {
                        // planar layout: fetch the row from 8 consecutive channels
                        const signed char* r1 = r0 + N;
                        const signed char* r2 = r0 + N * 2;
                        const signed char* r3 = r0 + N * 3;
                        const signed char* r4 = r0 + N * 4;
                        const signed char* r5 = r0 + N * 5;
                        const signed char* r6 = r0 + N * 6;
                        const signed char* r7 = r0 + N * 7;

                        // each load fetches 8 bytes; only the low 4 (the patch columns) are used below
                        // NOTE(review): the upper 4 bytes lie beyond the patch - presumably always readable, confirm
                        int8x8_t _t0 = vld1_s8(r0);
                        int8x8_t _t1 = vld1_s8(r1);
                        int8x8_t _t2 = vld1_s8(r2);
                        int8x8_t _t3 = vld1_s8(r3);
                        int8x8_t _t4 = vld1_s8(r4);
                        int8x8_t _t5 = vld1_s8(r5);
                        int8x8_t _t6 = vld1_s8(r6);
                        int8x8_t _t7 = vld1_s8(r7);

                        // transpose 8 channels x 4 columns into 4 columns x 8 channels
                        // via byte, 16-bit and 32-bit interleaves
                        int8x8_t _t01 = vzip_s8(_t0, _t1).val[0];
                        int8x8_t _t23 = vzip_s8(_t2, _t3).val[0];
                        int8x8_t _t45 = vzip_s8(_t4, _t5).val[0];
                        int8x8_t _t67 = vzip_s8(_t6, _t7).val[0];
                        int16x4x2_t _t0123 = vzip_s16(vreinterpret_s16_s8(_t01), vreinterpret_s16_s8(_t23));
                        int16x4x2_t _t4567 = vzip_s16(vreinterpret_s16_s8(_t45), vreinterpret_s16_s8(_t67));
                        int16x8_t _ta = vcombine_s16(_t0123.val[0], _t0123.val[1]);
                        int16x8_t _tb = vcombine_s16(_t4567.val[0], _t4567.val[1]);
                        int32x4x2_t _tab = vzipq_s32(vreinterpretq_s32_s16(_ta), vreinterpretq_s32_s16(_tb));

                        _r0 = vreinterpret_s8_s32(vget_low_s32(_tab.val[0]));
                        if (tj * 2 + 1 < w) _r1 = vreinterpret_s8_s32(vget_high_s32(_tab.val[0]));
                        if (tj * 2 + 2 < w) _r2 = vreinterpret_s8_s32(vget_low_s32(_tab.val[1]));
                        if (tj * 2 + 3 < w) _r3 = vreinterpret_s8_s32(vget_high_s32(_tab.val[1]));
                    }
                }

                // the four rows of itm, widening int8 -> int16
                int16x8_t _tmp0 = vsubl_s8(_r0, _r2);
                int16x8_t _tmp1 = vaddl_s8(_r1, _r2);
                int16x8_t _tmp2 = vsubl_s8(_r2, _r1);
                int16x8_t _tmp3 = vsubl_s8(_r3, _r1);

                vst1q_s16(tmp[0][m], _tmp0);
                vst1q_s16(tmp[1][m], _tmp1);
                vst1q_s16(tmp[2][m], _tmp2);
                vst1q_s16(tmp[3][m], _tmp3);

                // next patch row
                r0 += w * elempack;
            }

            // p0..p3 address four consecutive output planes for this tile
            short* p0 = (short*)B + kk * max_jj * 16 + jj * 8;
            short* p1 = p0 + max_jj * 8;
            short* p2 = p0 + max_jj * 8 * 2;
            short* p3 = p0 + max_jj * 8 * 3;

            // second pass: apply the same combination across the 4 patch rows
            for (int m = 0; m < 4; m++)
            {
                int16x8_t _r0 = vld1q_s16(tmp[m][0]);
                int16x8_t _r1 = vld1q_s16(tmp[m][1]);
                int16x8_t _r2 = vld1q_s16(tmp[m][2]);
                int16x8_t _r3 = vld1q_s16(tmp[m][3]);

                int16x8_t _tmp0 = vsubq_s16(_r0, _r2);
                int16x8_t _tmp1 = vaddq_s16(_r1, _r2);
                int16x8_t _tmp2 = vsubq_s16(_r2, _r1);
                int16x8_t _tmp3 = vsubq_s16(_r3, _r1);

                vst1q_s16(p0, _tmp0);
                vst1q_s16(p1, _tmp1);
                vst1q_s16(p2, _tmp2);
                vst1q_s16(p3, _tmp3);

                // advance by four planes
                p0 += max_jj * 4 * 8;
                p1 += max_jj * 4 * 8;
                p2 += max_jj * 4 * 8;
                p3 += max_jj * 4 * 8;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 8;
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
#else // __ARM_NEON
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
    #pragma omp parallel for num_threads(nT)
#endif // __ARM_NEON
    // ---- 2 channels at a time (scalar) ----
    // note: the OpenMP pragma above is attached to this loop only in the non-NEON build
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 2;

        // first-pass result, indexed [transformed column][patch row][channel]
        short tmp[4][4][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // channels are addressed directly here, i.e. the blob is read as elempack 1
            const signed char* r0 = bottom_blob.channel(k + kk).row<const signed char>(ti * 2) + (tj * 2);

            for (int m = 0; m < 4; m++)
            {
                // rXY: column X of the patch row, channel Y; zero when out of range
                signed char r00 = 0;
                signed char r01 = 0;
                signed char r10 = 0;
                signed char r11 = 0;
                signed char r20 = 0;
                signed char r21 = 0;
                signed char r30 = 0;
                signed char r31 = 0;

                if (ti * 2 + m < h)
                {
                    // if (elempack == 1)
                    {
                        // same row in the next channel
                        const signed char* r1 = r0 + N;

                        r00 = r0[0];
                        r01 = r1[0];
                        if (tj * 2 + 1 < w)
                        {
                            r10 = r0[1];
                            r11 = r1[1];
                        }
                        if (tj * 2 + 2 < w)
                        {
                            r20 = r0[2];
                            r21 = r1[2];
                        }
                        if (tj * 2 + 3 < w)
                        {
                            r30 = r0[3];
                            r31 = r1[3];
                        }
                    }
                }

                // first pass: the four rows of itm applied to the patch row
                tmp[0][m][0] = r00 - r20;
                tmp[0][m][1] = r01 - r21;
                tmp[1][m][0] = r10 + r20;
                tmp[1][m][1] = r11 + r21;
                tmp[2][m][0] = r20 - r10;
                tmp[2][m][1] = r21 - r11;
                tmp[3][m][0] = r30 - r10;
                tmp[3][m][1] = r31 - r11;

                r0 += w;
            }

            // p0..p3 address four consecutive output planes for this tile
            short* p0 = (short*)B + kk * max_jj * 16 + jj * 2;
            short* p1 = p0 + max_jj * 2;
            short* p2 = p0 + max_jj * 2 * 2;
            short* p3 = p0 + max_jj * 2 * 3;

            // second pass: same combination across the 4 patch rows
            for (int m = 0; m < 4; m++)
            {
                short r00 = tmp[m][0][0];
                short r01 = tmp[m][0][1];
                short r10 = tmp[m][1][0];
                short r11 = tmp[m][1][1];
                short r20 = tmp[m][2][0];
                short r21 = tmp[m][2][1];
                short r30 = tmp[m][3][0];
                short r31 = tmp[m][3][1];

                p0[0] = r00 - r20;
                p0[1] = r01 - r21;
                p1[0] = r10 + r20;
                p1[1] = r11 + r21;
                p2[0] = r20 - r10;
                p2[1] = r21 - r11;
                p3[0] = r30 - r10;
                p3[1] = r31 - r11;

                // advance by four planes
                p0 += max_jj * 4 * 2;
                p1 += max_jj * 4 * 2;
                p2 += max_jj * 4 * 2;
                p3 += max_jj * 4 * 2;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 2;
    // ---- remaining single channels (scalar, not parallelised) ----
    for (int kk = remain_max_kk_start; kk < max_kk; kk++)
    {
        // first-pass result, indexed [transformed column][patch row]
        short tmp[4][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const signed char* r0123 = bottom_blob.channel(k + kk).row<const signed char>(ti * 2) + (tj * 2);

            for (int m = 0; m < 4; m++)
            {
                // the four columns of this patch row; zero when out of range
                signed char r0 = 0;
                signed char r1 = 0;
                signed char r2 = 0;
                signed char r3 = 0;

                if (ti * 2 + m < h)
                {
                    // if (elempack == 1)
                    {
                        r0 = r0123[0];
                        if (tj * 2 + 1 < w) r1 = r0123[1];
                        if (tj * 2 + 2 < w) r2 = r0123[2];
                        if (tj * 2 + 3 < w) r3 = r0123[3];
                    }
                }

                tmp[0][m] = r0 - r2;
                tmp[1][m] = r1 + r2;
                tmp[2][m] = r2 - r1;
                tmp[3][m] = r3 - r1;

                r0123 += w;
            }

            // p0..p3 address four consecutive output planes for this tile
            short* p0 = (short*)B + kk * max_jj * 16 + jj;
            short* p1 = p0 + max_jj;
            short* p2 = p0 + max_jj * 2;
            short* p3 = p0 + max_jj * 3;

            for (int m = 0; m < 4; m++)
            {
                short r0 = tmp[m][0];
                short r1 = tmp[m][1];
                short r2 = tmp[m][2];
                short r3 = tmp[m][3];

                p0[0] = r0 - r2;
                p1[0] = r1 + r2;
                p2[0] = r2 - r1;
                p3[0] = r3 - r1;

                // advance by four planes
                p0 += max_jj * 4;
                p1 += max_jj * 4;
                p2 += max_jj * 4;
                p3 += max_jj * 4;
            }
        }
    }
}

// Winograd F(2,3) output transform for a block of output channels and tiles (int32).
//
// For every output channel in [i, i + max_ii) and every tile index in
// [j, j + max_jj), the 16 int32 values of the tile are read from top_tile,
// the 2x4 matrix otm (listed below) is applied along both axes, the result
// is shifted right by 2 and the 2x2 patch is written to top_blob at
// row ti * 2, column tj * 2. Rows >= outh and the second column when it is
// >= outw are not written.
//
// Layout read from top_tile, per channel group of width P (8, 4, 2 or 1) that
// starts at channel offset ii:
//   base offset  ii * max_jj * 16 ints
//   16 planes of max_jj * P ints each
//   tile jj occupies P consecutive ints at offset jj * P inside each plane
static inline void conv3x3s1_winograd23_transform_output_tile_int8(const Mat& top_tile, Mat& top_blob, int i, int max_ii, int j, int max_jj)
{
    // const int otm[2][4] = {
    //     {1,  1,  1,  0},
    //     {0,  1, -1,  1}
    // };

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int out_elempack = top_blob.elempack;
    // distance, in ints, between the same pixel of two consecutive scalar channels
    const size_t N = top_blob.cstep * out_elempack;

    // number of 2-wide tiles per output row, rounded up
    const int w_tiles = (outw + 1) / 2;

    int ii = 0;
#if __ARM_NEON
    // ---- 8 output channels at a time (NEON) ----
    for (; ii + 7 < max_ii; ii += 8)
    {
        // first-pass result, indexed [output index][plane group m][channel lane]
        int tmp[2][4][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            // tile coordinates in the tile grid
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // r0..r3 address four consecutive planes for this tile
            const int* r0 = (const int*)top_tile + ii * max_jj * 16 + jj * 8;
            const int* r1 = r0 + max_jj * 8;
            const int* r2 = r0 + max_jj * 8 * 2;
            const int* r3 = r0 + max_jj * 8 * 3;

            // first pass: reduce each group of 4 planes to 2 values with otm
            for (int m = 0; m < 4; m++)
            {
                int32x4_t _r00 = vld1q_s32(r0);
                int32x4_t _r01 = vld1q_s32(r0 + 4);
                int32x4_t _r10 = vld1q_s32(r1);
                int32x4_t _r11 = vld1q_s32(r1 + 4);
                int32x4_t _r20 = vld1q_s32(r2);
                int32x4_t _r21 = vld1q_s32(r2 + 4);
                int32x4_t _r30 = vld1q_s32(r3);
                int32x4_t _r31 = vld1q_s32(r3 + 4);

                int32x4_t _tmp00 = vaddq_s32(vaddq_s32(_r00, _r10), _r20);
                int32x4_t _tmp01 = vaddq_s32(vaddq_s32(_r01, _r11), _r21);
                int32x4_t _tmp10 = vaddq_s32(vsubq_s32(_r10, _r20), _r30);
                int32x4_t _tmp11 = vaddq_s32(vsubq_s32(_r11, _r21), _r31);

                vst1q_s32(tmp[0][m], _tmp00);
                vst1q_s32(tmp[0][m] + 4, _tmp01);
                vst1q_s32(tmp[1][m], _tmp10);
                vst1q_s32(tmp[1][m] + 4, _tmp11);

                // advance by four planes
                r0 += max_jj * 4 * 8;
                r1 += max_jj * 4 * 8;
                r2 += max_jj * 4 * 8;
                r3 += max_jj * 4 * 8;
            }

            int* outptr0 = top_blob.channel((i + ii) / out_elempack).row<int>(ti * 2) + (tj * 2) * out_elempack;

            // second pass: one output row per iteration
            for (int m = 0; m < 2; m++)
            {
                // skip output rows that fall outside the blob
                if (ti * 2 + m >= outh)
                    continue;

                int32x4_t _r00 = vld1q_s32(tmp[m][0]);
                int32x4_t _r01 = vld1q_s32(tmp[m][0] + 4);
                int32x4_t _r10 = vld1q_s32(tmp[m][1]);
                int32x4_t _r11 = vld1q_s32(tmp[m][1] + 4);
                int32x4_t _r20 = vld1q_s32(tmp[m][2]);
                int32x4_t _r21 = vld1q_s32(tmp[m][2] + 4);
                int32x4_t _r30 = vld1q_s32(tmp[m][3]);
                int32x4_t _r31 = vld1q_s32(tmp[m][3] + 4);

                int32x4_t _tmp00 = vaddq_s32(vaddq_s32(_r00, _r10), _r20);
                int32x4_t _tmp01 = vaddq_s32(vaddq_s32(_r01, _r11), _r21);
                int32x4_t _tmp10 = vaddq_s32(vsubq_s32(_r10, _r20), _r30);
                int32x4_t _tmp11 = vaddq_s32(vsubq_s32(_r11, _r21), _r31);

                // NOTE(review): presumably undoes a x4 scale introduced by the kernel transform - confirm
                _tmp00 = vshrq_n_s32(_tmp00, 2);
                _tmp01 = vshrq_n_s32(_tmp01, 2);
                _tmp10 = vshrq_n_s32(_tmp10, 2);
                _tmp11 = vshrq_n_s32(_tmp11, 2);

                if (out_elempack == 8)
                {
                    // 8 channels interleaved per pixel: 8 ints per output column
                    vst1q_s32(outptr0, _tmp00);
                    vst1q_s32(outptr0 + 4, _tmp01);
                    if (tj * 2 + 1 < outw)
                    {
                        vst1q_s32(outptr0 + 8, _tmp10);
                        vst1q_s32(outptr0 + 12, _tmp11);
                    }
                }
                if (out_elempack == 4)
                {
                    // two packed channels of 4 lanes each, N ints apart
                    int* outptr1 = outptr0 + N;

                    vst1q_s32(outptr0, _tmp00);
                    vst1q_s32(outptr1, _tmp01);
                    if (tj * 2 + 1 < outw)
                    {
                        vst1q_s32(outptr0 + 4, _tmp10);
                        vst1q_s32(outptr1 + 4, _tmp11);
                    }
                }
                if (out_elempack == 1)
                {
                    // planar layout: scatter one lane to each of 8 channels
                    int* outptr1 = outptr0 + N;
                    int* outptr2 = outptr0 + N * 2;
                    int* outptr3 = outptr0 + N * 3;
                    int* outptr4 = outptr0 + N * 4;
                    int* outptr5 = outptr0 + N * 5;
                    int* outptr6 = outptr0 + N * 6;
                    int* outptr7 = outptr0 + N * 7;

                    outptr0[0] = vgetq_lane_s32(_tmp00, 0);
                    outptr1[0] = vgetq_lane_s32(_tmp00, 1);
                    outptr2[0] = vgetq_lane_s32(_tmp00, 2);
                    outptr3[0] = vgetq_lane_s32(_tmp00, 3);
                    outptr4[0] = vgetq_lane_s32(_tmp01, 0);
                    outptr5[0] = vgetq_lane_s32(_tmp01, 1);
                    outptr6[0] = vgetq_lane_s32(_tmp01, 2);
                    outptr7[0] = vgetq_lane_s32(_tmp01, 3);

                    if (tj * 2 + 1 < outw)
                    {
                        outptr0[1] = vgetq_lane_s32(_tmp10, 0);
                        outptr1[1] = vgetq_lane_s32(_tmp10, 1);
                        outptr2[1] = vgetq_lane_s32(_tmp10, 2);
                        outptr3[1] = vgetq_lane_s32(_tmp10, 3);
                        outptr4[1] = vgetq_lane_s32(_tmp11, 0);
                        outptr5[1] = vgetq_lane_s32(_tmp11, 1);
                        outptr6[1] = vgetq_lane_s32(_tmp11, 2);
                        outptr7[1] = vgetq_lane_s32(_tmp11, 3);
                    }
                }

                // next output row
                outptr0 += outw * out_elempack;
            }
        }
    }
    // ---- 4 output channels at a time (NEON) ----
    for (; ii + 3 < max_ii; ii += 4)
    {
        // first-pass result, indexed [output index][plane group m][channel lane]
        int tmp[2][4][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // r0..r3 address four consecutive planes for this tile
            const int* r0 = (const int*)top_tile + ii * max_jj * 16 + jj * 4;
            const int* r1 = r0 + max_jj * 4;
            const int* r2 = r0 + max_jj * 4 * 2;
            const int* r3 = r0 + max_jj * 4 * 3;

            // first pass: reduce each group of 4 planes to 2 values with otm
            for (int m = 0; m < 4; m++)
            {
                int32x4_t _r0 = vld1q_s32(r0);
                int32x4_t _r1 = vld1q_s32(r1);
                int32x4_t _r2 = vld1q_s32(r2);
                int32x4_t _r3 = vld1q_s32(r3);

                int32x4_t _tmp0 = vaddq_s32(vaddq_s32(_r0, _r1), _r2);
                int32x4_t _tmp1 = vaddq_s32(vsubq_s32(_r1, _r2), _r3);

                vst1q_s32(tmp[0][m], _tmp0);
                vst1q_s32(tmp[1][m], _tmp1);

                // advance by four planes
                r0 += max_jj * 4 * 4;
                r1 += max_jj * 4 * 4;
                r2 += max_jj * 4 * 4;
                r3 += max_jj * 4 * 4;
            }

            int* outptr0 = top_blob.channel((i + ii) / out_elempack).row<int>(ti * 2) + (tj * 2) * out_elempack;

            // second pass: one output row per iteration
            for (int m = 0; m < 2; m++)
            {
                // skip output rows that fall outside the blob
                if (ti * 2 + m >= outh)
                    continue;

                int32x4_t _r0 = vld1q_s32(tmp[m][0]);
                int32x4_t _r1 = vld1q_s32(tmp[m][1]);
                int32x4_t _r2 = vld1q_s32(tmp[m][2]);
                int32x4_t _r3 = vld1q_s32(tmp[m][3]);

                int32x4_t _tmp0 = vaddq_s32(vaddq_s32(_r0, _r1), _r2);
                int32x4_t _tmp1 = vaddq_s32(vsubq_s32(_r1, _r2), _r3);

                // NOTE(review): presumably undoes a x4 scale introduced by the kernel transform - confirm
                _tmp0 = vshrq_n_s32(_tmp0, 2);
                _tmp1 = vshrq_n_s32(_tmp1, 2);

                if (out_elempack == 4)
                {
                    // 4 channels interleaved per pixel: 4 ints per output column
                    vst1q_s32(outptr0, _tmp0);
                    if (tj * 2 + 1 < outw) vst1q_s32(outptr0 + 4, _tmp1);
                }
                if (out_elempack == 1)
                {
                    // planar layout: scatter one lane to each of 4 channels
                    int* outptr1 = outptr0 + N;
                    int* outptr2 = outptr0 + N * 2;
                    int* outptr3 = outptr0 + N * 3;

                    outptr0[0] = vgetq_lane_s32(_tmp0, 0);
                    outptr1[0] = vgetq_lane_s32(_tmp0, 1);
                    outptr2[0] = vgetq_lane_s32(_tmp0, 2);
                    outptr3[0] = vgetq_lane_s32(_tmp0, 3);

                    if (tj * 2 + 1 < outw)
                    {
                        outptr0[1] = vgetq_lane_s32(_tmp1, 0);
                        outptr1[1] = vgetq_lane_s32(_tmp1, 1);
                        outptr2[1] = vgetq_lane_s32(_tmp1, 2);
                        outptr3[1] = vgetq_lane_s32(_tmp1, 3);
                    }
                }

                // next output row
                outptr0 += outw * out_elempack;
            }
        }
    }
#endif // __ARM_NEON
    // ---- 2 output channels at a time (scalar) ----
    for (; ii + 1 < max_ii; ii += 2)
    {
        // first-pass result, indexed [output index][plane group m][channel]
        int tmp[2][4][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // r0..r3 address four consecutive planes for this tile
            const int* r0 = (const int*)top_tile + ii * max_jj * 16 + jj * 2;
            const int* r1 = r0 + max_jj * 2;
            const int* r2 = r0 + max_jj * 2 * 2;
            const int* r3 = r0 + max_jj * 2 * 3;

            // first pass: reduce each group of 4 planes to 2 values with otm
            for (int m = 0; m < 4; m++)
            {
                tmp[0][m][0] = r0[0] + r1[0] + r2[0];
                tmp[0][m][1] = r0[1] + r1[1] + r2[1];
                tmp[1][m][0] = r1[0] - r2[0] + r3[0];
                tmp[1][m][1] = r1[1] - r2[1] + r3[1];

                // advance by four planes
                r0 += max_jj * 4 * 2;
                r1 += max_jj * 4 * 2;
                r2 += max_jj * 4 * 2;
                r3 += max_jj * 4 * 2;
            }

            // channels are addressed directly here, i.e. the blob is written as elempack 1
            int* outptr0 = top_blob.channel(i + ii).row<int>(ti * 2) + (tj * 2);

            // second pass: one output row per iteration
            for (int m = 0; m < 2; m++)
            {
                // skip output rows that fall outside the blob
                if (ti * 2 + m >= outh)
                    continue;

                // tmpXY: output column X, channel Y
                int tmp00 = tmp[m][0][0] + tmp[m][1][0] + tmp[m][2][0];
                int tmp01 = tmp[m][0][1] + tmp[m][1][1] + tmp[m][2][1];
                int tmp10 = tmp[m][1][0] - tmp[m][2][0] + tmp[m][3][0];
                int tmp11 = tmp[m][1][1] - tmp[m][2][1] + tmp[m][3][1];

                // NOTE(review): presumably undoes a x4 scale introduced by the kernel transform - confirm
                tmp00 = tmp00 >> 2;
                tmp01 = tmp01 >> 2;
                tmp10 = tmp10 >> 2;
                tmp11 = tmp11 >> 2;

                // if (out_elempack == 1)
                {
                    // same pixel in the next channel
                    int* outptr1 = outptr0 + N;

                    outptr0[0] = tmp00;
                    outptr1[0] = tmp01;
                    if (tj * 2 + 1 < outw)
                    {
                        outptr0[1] = tmp10;
                        outptr1[1] = tmp11;
                    }
                }

                // next output row
                outptr0 += outw;
            }
        }
    }
    // ---- remaining single output channels (scalar) ----
    for (; ii < max_ii; ii++)
    {
        // first-pass result, indexed [output index][plane group m]
        int tmp[2][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            // r0..r3 address four consecutive planes for this tile
            const int* r0 = (const int*)top_tile + ii * max_jj * 16 + jj;
            const int* r1 = r0 + max_jj;
            const int* r2 = r0 + max_jj * 2;
            const int* r3 = r0 + max_jj * 3;

            // first pass: reduce each group of 4 planes to 2 values with otm
            for (int m = 0; m < 4; m++)
            {
                tmp[0][m] = r0[0] + r1[0] + r2[0];
                tmp[1][m] = r1[0] - r2[0] + r3[0];

                // advance by four planes
                r0 += max_jj * 4;
                r1 += max_jj * 4;
                r2 += max_jj * 4;
                r3 += max_jj * 4;
            }

            int* outptr0 = top_blob.channel(i + ii).row<int>(ti * 2) + (tj * 2);

            // second pass: one output row per iteration
            for (int m = 0; m < 2; m++)
            {
                // skip output rows that fall outside the blob
                if (ti * 2 + m >= outh)
                    continue;

                int tmp0 = tmp[m][0] + tmp[m][1] + tmp[m][2];
                int tmp1 = tmp[m][1] - tmp[m][2] + tmp[m][3];

                // NOTE(review): presumably undoes a x4 scale introduced by the kernel transform - confirm
                tmp0 = tmp0 >> 2;
                tmp1 = tmp1 >> 2;

                // if (out_elempack == 1)
                {
                    outptr0[0] = tmp0;
                    if (tj * 2 + 1 < outw) outptr0[1] = tmp1;
                }

                // next output row
                outptr0 += outw;
            }
        }
    }
}

// int8 3x3 stride-1 convolution via winograd F(2,3).
//
// Steps: (1) transform + pack the input into BT, tile by tile,
//        (2) release bottom_blob,
//        (3) per M tile: accumulate the packed gemm over all K tiles, then
//            run the output transform into top_blob.
//
//   bottom_blob  int8 input; released by this function once transformed
//   top_blob     int32 output, already allocated by the caller (w/h/c/elempack are read)
//   AT           pre-transformed, packed kernels
//   nT           thread count
//   opt          only opt.workspace_allocator is read
//
// Returns 0 on success, -100 when a workspace allocation fails.
static int conv3x3s1_winograd23_int8(Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
{
    const int outw = top_blob.w;
    const int outh = top_blob.h;

    // pad to 2n+2, winograd F(2,3)
    const int w_tiles = (outw + 1) / 2;
    const int h_tiles = (outh + 1) / 2;

    const int M = top_blob.c * top_blob.elempack;
    const int N = w_tiles * h_tiles;
    const int K = bottom_blob.c * bottom_blob.elempack;
    const int B = 16;

    // NCNN_LOGE("conv3x3s1_winograd23_int8 %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_int8(M, N, K, TILE_M, TILE_N, TILE_K, nT);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;
    const int nn_NK = nn_N * nn_K;

    // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

    // packed, transformed input: one channel per N tile, one depth slice per K tile
    Mat BT(TILE_K * TILE_N, B, nn_K, nn_N, 2u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    if (nT > 1 && nn_NK < nT)
    {
        // fewer (N, K) tiles than threads: walk the tiles serially and let the
        // per-tile routines use all nT threads internally
        Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator);
        if (B_tile.empty())
            return -100;

        for (int pn = 0; pn < nn_N; pn++)
        {
            const int j = pn * TILE_N;
            const int max_jj = std::min((N - j), TILE_N);

            for (int pk = 0; pk < nn_K; pk++)
            {
                const int k = pk * TILE_K;
                const int max_kk = std::min((K - k), TILE_K);

                // transform input
                conv3x3s1_winograd23_transform_input_tile_int8(bottom_blob, B_tile, j, max_jj, k, max_kk, nT);

                Mat BT_tile = BT.channel(pn).depth(pk);

                transpose_pack_B_tile_int8(B_tile, BT_tile, B, max_jj, max_kk, nT);
            }
        }
    }
    else
    {
        // one scratch tile per thread; the per-tile routines run single-threaded
        Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator);
        if (B_tileX.empty())
            return -100;

        // #pragma omp parallel for num_threads(nT)
        for (int pnk = 0; pnk < nn_NK; pnk++)
        {
            // split the flat index back into (N tile, K tile)
            const int pn = pnk / nn_K;
            const int pk = pnk % nn_K;

            const int j = pn * TILE_N;
            const int k = pk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            Mat B_tile = B_tileX.channel(get_omp_thread_num());

            // transform input
            conv3x3s1_winograd23_transform_input_tile_int8(bottom_blob, B_tile, j, max_jj, k, max_kk, 1);

            Mat BT_tile = BT.channel(pn).depth(pk);

            transpose_pack_B_tile_int8(B_tile, BT_tile, B, max_jj, max_kk, 1);
        }
    }

    // the raw input is no longer needed once BT is filled
    bottom_blob.release();

    // per-thread int32 accumulator tile
    Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
    if (top_tileX.empty())
        return -100;

    #pragma omp parallel for num_threads(nT)
    for (int pm = 0; pm < nn_M; pm++)
    {
        const int i = pm * TILE_M;
        const int max_ii = std::min((M - i), TILE_M);

        Mat top_tile = top_tileX.channel(get_omp_thread_num());

        for (int pn = 0; pn < nn_N; pn++)
        {
            const int j = pn * TILE_N;
            const int max_jj = std::min((N - j), TILE_N);

            // accumulate over all K tiles; k is forwarded to the gemm kernel
            for (int pk = 0; pk < nn_K; pk++)
            {
                const int k = pk * TILE_K;
                const int max_kk = std::min((K - k), TILE_K);

                const Mat AT_tile = AT.channel(pm).depth(pk);

                const Mat BT_tile = BT.channel(pn).depth(pk);

                gemm_transB_packed_tile_int8(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk);
            }

            // transform output
            conv3x3s1_winograd23_transform_output_tile_int8(top_tile, top_blob, i, max_ii, j, max_jj);
        }
    }

    return 0;
}

// Winograd F(4,3) kernel transform for one (M tile, K tile) of int8 kernels.
//
// For every output channel in [i, i + max_ii) and input channel in
// [k, k + max_kk), the 3x3 int8 kernel is expanded to 36 int16 values by
// applying the integer matrix ktm (below) along both kernel axes. The
// results are appended to A in (ii, kk) order, 36 shorts per kernel.
//
// kernel is read as a flat int8 array indexed [outch][inch][9].
static inline void conv3x3s1_winograd43_transform_kernel_tile_int8(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk)
{
    // const short ktm[6][3] = {
    //     {6, 0, 0},
    //     {-4, -4, -4},
    //     {-4, 4, -4},
    //     {1, 2, 4},
    //     {1, -2, 4},
    //     {0, 0, 6}
    // };

    short* outptr = A;

    for (int ii = 0; ii < max_ii; ii++)
    {
        for (int kk = 0; kk < max_kk; kk++)
        {
            const signed char* kptr = (const signed char*)kernel + (i + ii) * inch * 9 + (k + kk) * 9;

            // first pass: expand each of the 3 kernel rows to 6 coefficients,
            // stored as rowt[coefficient][kernel row]
            short rowt[6][3];

            for (int row = 0; row < 3; row++, kptr += 3)
            {
                const int a = kptr[0];
                const int b = kptr[1];
                const int c = kptr[2];

                // shared sub-expressions of the ktm rows
                const int ac_x4 = (a + c) * 4;
                const int a_c4 = a + c * 4;

                rowt[0][row] = a * 6;
                rowt[1][row] = -ac_x4 - b * 4;
                rowt[2][row] = -ac_x4 + b * 4;
                rowt[3][row] = a_c4 + b * 2;
                rowt[4][row] = a_c4 - b * 2;
                rowt[5][row] = c * 6;
            }

            // second pass: expand across the 3 kernel rows, 6 outputs per coefficient
            for (int m = 0; m < 6; m++, outptr += 6)
            {
                const int a = rowt[m][0];
                const int b = rowt[m][1];
                const int c = rowt[m][2];

                const int ac_x4 = (a + c) * 4;
                const int a_c4 = a + c * 4;

                outptr[0] = a * 6;
                outptr[1] = -ac_x4 - b * 4;
                outptr[2] = -ac_x4 + b * 4;
                outptr[3] = a_c4 + b * 2;
                outptr[4] = a_c4 - b * 2;
                outptr[5] = c * 6;
            }
        }
    }
}

// Pre-transforms all 3x3 int8 kernels for winograd F(4,3) and stores them,
// packed tile by tile, in AT (int16 elements).
//
//   kernel  raw int8 weights, indexed as [outch][inch][9] by the tile routine
//   AT      output; (re)created here with one channel per M tile and one
//           depth slice per K tile
//   inch    number of input channels  (K of the gemm)
//   outch   number of output channels (M of the gemm)
//   opt     only opt.num_threads is read
static void conv3x3s1_winograd43_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt)
{
    const int M = outch;
    const int K = inch;
    const int B = 36; // 6x6 transformed coefficients per kernel

    // N is passed as 0: only the M/K tile sizes are consumed here
    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_int8(M, 0, K, TILE_M, TILE_N, TILE_K, opt.num_threads);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // one scratch tile per thread for the unpacked transform result
    Mat A_tileX(B * TILE_M * TILE_K, 1, opt.num_threads, 2u, (Allocator*)0);

    AT.create(TILE_K * TILE_M, B, nn_K, nn_M, 2u, (Allocator*)0);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pm = 0; pm < nn_M; pm++)
    {
        const int i = pm * TILE_M;
        const int max_ii = std::min((M - i), TILE_M);

        Mat A_tile = A_tileX.channel(get_omp_thread_num());

        for (int pk = 0; pk < nn_K; pk++)
        {
            const int k = pk * TILE_K;
            const int max_kk = std::min((K - k), TILE_K);

            // transform this (M tile, K tile) of kernels into the scratch buffer ...
            conv3x3s1_winograd43_transform_kernel_tile_int8(kernel, A_tile, inch, i, max_ii, k, max_kk);

            // ... then repack it into its slot of AT
            Mat AT_tile = AT.channel(pm).depth(pk);

            pack_A_tile_int8(A_tile, AT_tile, B, max_ii, max_kk);
        }
    }
}

static inline void conv3x3s1_winograd43_transform_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT)
{
    // const float itm[6][6] = {
    //     {4,  0, -5,  0, 1, 0},
    //     {0, -4, -4,  1, 1, 0},
    //     {0,  4, -4, -1, 1, 0},
    //     {0, -2, -1,  2, 1, 0},
    //     {0,  2, -1, -2, 1, 0},
    //     {0,  4,  0, -5, 0, 1}
    // };

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int elempack = bottom_blob.elempack;
    const size_t N = bottom_blob.cstep * elempack;

    const int w_tiles = (w + 1) / 4;

    int nn_max_kk = 0;
    int remain_max_kk_start = 0;
#if __ARM_NEON
    nn_max_kk = max_kk / 8;
    #pragma omp parallel for num_threads(nT)
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 8;

        short tmp[6][6][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const signed char* r0 = bottom_blob.channel((k + kk) / elempack).row<const signed char>(ti * 4) + (tj * 4) * elempack;

            int8x8_t _v5 = vdup_n_s8(5);

            for (int m = 0; m < 6; m++)
            {
                int8x8_t _r0 = vdup_n_s8(0);
                int8x8_t _r1 = vdup_n_s8(0);
                int8x8_t _r2 = vdup_n_s8(0);
                int8x8_t _r3 = vdup_n_s8(0);
                int8x8_t _r4 = vdup_n_s8(0);
                int8x8_t _r5 = vdup_n_s8(0);

                if (ti * 4 + m < h)
                {
                    if (elempack == 8)
                    {
                        _r0 = vld1_s8(r0);
                        if (tj * 4 + 1 < w) _r1 = vld1_s8(r0 + 8);
                        if (tj * 4 + 2 < w) _r2 = vld1_s8(r0 + 16);
                        if (tj * 4 + 3 < w) _r3 = vld1_s8(r0 + 24);
                        if (tj * 4 + 4 < w) _r4 = vld1_s8(r0 + 32);
                        if (tj * 4 + 5 < w) _r5 = vld1_s8(r0 + 40);
                    }
                    if (elempack == 1)
                    {
                        const signed char* r1 = r0 + N;
                        const signed char* r2 = r0 + N * 2;
                        const signed char* r3 = r0 + N * 3;
                        const signed char* r4 = r0 + N * 4;
                        const signed char* r5 = r0 + N * 5;
                        const signed char* r6 = r0 + N * 6;
                        const signed char* r7 = r0 + N * 7;

                        int8x8_t _t0 = vld1_s8(r0);
                        int8x8_t _t1 = vld1_s8(r1);
                        int8x8_t _t2 = vld1_s8(r2);
                        int8x8_t _t3 = vld1_s8(r3);
                        int8x8_t _t4 = vld1_s8(r4);
                        int8x8_t _t5 = vld1_s8(r5);
                        int8x8_t _t6 = vld1_s8(r6);
                        int8x8_t _t7 = vld1_s8(r7);

                        int8x8_t _t01 = vzip_s8(_t0, _t1).val[0];
                        int8x8_t _t23 = vzip_s8(_t2, _t3).val[0];
                        int8x8_t _t45 = vzip_s8(_t4, _t5).val[0];
                        int8x8_t _t67 = vzip_s8(_t6, _t7).val[0];
                        int16x4x2_t _t0123 = vzip_s16(vreinterpret_s16_s8(_t01), vreinterpret_s16_s8(_t23));
                        int16x4x2_t _t4567 = vzip_s16(vreinterpret_s16_s8(_t45), vreinterpret_s16_s8(_t67));
                        int16x8_t _ta = vcombine_s16(_t0123.val[0], _t0123.val[1]);
                        int16x8_t _tb = vcombine_s16(_t4567.val[0], _t4567.val[1]);
                        int32x4x2_t _tab = vzipq_s32(vreinterpretq_s32_s16(_ta), vreinterpretq_s32_s16(_tb));

                        _r0 = vreinterpret_s8_s32(vget_low_s32(_tab.val[0]));
                        if (tj * 4 + 1 < w) _r1 = vreinterpret_s8_s32(vget_high_s32(_tab.val[0]));
                        if (tj * 4 + 2 < w) _r2 = vreinterpret_s8_s32(vget_low_s32(_tab.val[1]));
                        if (tj * 4 + 3 < w) _r3 = vreinterpret_s8_s32(vget_high_s32(_tab.val[1]));
                        if (tj * 4 + 4 < w)
                        {
                            _t01 = vzip_s8(_t0, _t1).val[1];
                            _t23 = vzip_s8(_t2, _t3).val[1];
                            _t45 = vzip_s8(_t4, _t5).val[1];
                            _t67 = vzip_s8(_t6, _t7).val[1];
                            int16x4_t _tc = vzip_s16(vreinterpret_s16_s8(_t01), vreinterpret_s16_s8(_t23)).val[0];
                            int16x4_t _td = vzip_s16(vreinterpret_s16_s8(_t45), vreinterpret_s16_s8(_t67)).val[0];
                            int32x2x2_t _tcd = vzip_s32(vreinterpret_s32_s16(_tc), vreinterpret_s32_s16(_td));

                            _r4 = vreinterpret_s8_s32(_tcd.val[0]);
                            if (tj * 4 + 5 < w) _r5 = vreinterpret_s8_s32(_tcd.val[1]);
                        }
                    }
                }

                int16x8_t _tmp12a = vsubw_s8(vshll_n_s8(_r1, 2), _r3);
                int16x8_t _tmp12b = vsubw_s8(vshll_n_s8(_r2, 2), _r4);
                int16x8_t _tmp34a = vshlq_n_s16(vsubl_s8(_r3, _r1), 1);
                int16x8_t _tmp34b = vsubl_s8(_r4, _r2);

                int16x8_t _tmp0 = vaddq_s16(vmovl_s8(_r4), vsubq_s16(vshll_n_s8(_r0, 2), vmull_s8(_r2, _v5)));
                int16x8_t _tmp1 = vnegq_s16(vaddq_s16(_tmp12a, _tmp12b));
                int16x8_t _tmp2 = vsubq_s16(_tmp12a, _tmp12b);
                int16x8_t _tmp3 = vaddq_s16(_tmp34b, _tmp34a);
                int16x8_t _tmp4 = vsubq_s16(_tmp34b, _tmp34a);
                int16x8_t _tmp5 = vaddq_s16(vmovl_s8(_r5), vsubq_s16(vshll_n_s8(_r1, 2), vmull_s8(_r3, _v5)));

                vst1q_s16(tmp[0][m], _tmp0);
                vst1q_s16(tmp[1][m], _tmp1);
                vst1q_s16(tmp[2][m], _tmp2);
                vst1q_s16(tmp[3][m], _tmp3);
                vst1q_s16(tmp[4][m], _tmp4);
                vst1q_s16(tmp[5][m], _tmp5);

                r0 += w * elempack;
            }

            int16x8_t _v5q = vdupq_n_s16(5);

            short* p0 = (short*)B + kk * max_jj * 36 + jj * 8;
            short* p1 = p0 + max_jj * 8;
            short* p2 = p0 + max_jj * 8 * 2;
            short* p3 = p0 + max_jj * 8 * 3;
            short* p4 = p0 + max_jj * 8 * 4;
            short* p5 = p0 + max_jj * 8 * 5;

            for (int m = 0; m < 6; m++)
            {
                int16x8_t _r0 = vld1q_s16(tmp[m][0]);
                int16x8_t _r1 = vld1q_s16(tmp[m][1]);
                int16x8_t _r2 = vld1q_s16(tmp[m][2]);
                int16x8_t _r3 = vld1q_s16(tmp[m][3]);
                int16x8_t _r4 = vld1q_s16(tmp[m][4]);
                int16x8_t _r5 = vld1q_s16(tmp[m][5]);

                int16x8_t _tmp12a = vsubq_s16(_r3, vshlq_n_s16(_r1, 2));
                int16x8_t _tmp12b = vsubq_s16(_r4, vshlq_n_s16(_r2, 2));
                int16x8_t _tmp34a = vshlq_n_s16(vsubq_s16(_r3, _r1), 1);
                int16x8_t _tmp34b = vsubq_s16(_r4, _r2);

                int16x8_t _tmp0 = vaddq_s16(_r4, vsubq_s16(vshlq_n_s16(_r0, 2), vmulq_s16(_r2, _v5q)));
                int16x8_t _tmp1 = vaddq_s16(_tmp12b, _tmp12a);
                int16x8_t _tmp2 = vsubq_s16(_tmp12b, _tmp12a);
                int16x8_t _tmp3 = vaddq_s16(_tmp34b, _tmp34a);
                int16x8_t _tmp4 = vsubq_s16(_tmp34b, _tmp34a);
                int16x8_t _tmp5 = vaddq_s16(_r5, vsubq_s16(vshlq_n_s16(_r1, 2), vmulq_s16(_r3, _v5q)));

                vst1q_s16(p0, _tmp0);
                vst1q_s16(p1, _tmp1);
                vst1q_s16(p2, _tmp2);
                vst1q_s16(p3, _tmp3);
                vst1q_s16(p4, _tmp4);
                vst1q_s16(p5, _tmp5);

                p0 += max_jj * 6 * 8;
                p1 += max_jj * 6 * 8;
                p2 += max_jj * 6 * 8;
                p3 += max_jj * 6 * 8;
                p4 += max_jj * 6 * 8;
                p5 += max_jj * 6 * 8;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 8;
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
#else // __ARM_NEON
    nn_max_kk = (max_kk - remain_max_kk_start) / 2;
    #pragma omp parallel for num_threads(nT)
#endif // __ARM_NEON
    for (int ppkk = 0; ppkk < nn_max_kk; ppkk++)
    {
        const int kk = remain_max_kk_start + ppkk * 2;

        short tmp[6][6][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const signed char* r0 = bottom_blob.channel(k + kk).row<const signed char>(ti * 4) + (tj * 4);

            for (int m = 0; m < 6; m++)
            {
                signed char r00 = 0;
                signed char r01 = 0;
                signed char r10 = 0;
                signed char r11 = 0;
                signed char r20 = 0;
                signed char r21 = 0;
                signed char r30 = 0;
                signed char r31 = 0;
                signed char r40 = 0;
                signed char r41 = 0;
                signed char r50 = 0;
                signed char r51 = 0;

                if (ti * 4 + m < h)
                {
                    // if (elempack == 1)
                    {
                        const signed char* r1 = r0 + N;

                        r00 = r0[0];
                        r01 = r1[0];
                        if (tj * 4 + 1 < w)
                        {
                            r10 = r0[1];
                            r11 = r1[1];
                        }
                        if (tj * 4 + 2 < w)
                        {
                            r20 = r0[2];
                            r21 = r1[2];
                        }
                        if (tj * 4 + 3 < w)
                        {
                            r30 = r0[3];
                            r31 = r1[3];
                        }
                        if (tj * 4 + 4 < w)
                        {
                            r40 = r0[4];
                            r41 = r1[4];
                        }
                        if (tj * 4 + 5 < w)
                        {
                            r50 = r0[5];
                            r51 = r1[5];
                        }
                    }
                }

                short tmp120a = r30 - r10 * 4;
                short tmp121a = r31 - r11 * 4;
                short tmp120b = r40 - r20 * 4;
                short tmp121b = r41 - r21 * 4;
                short tmp340a = (r30 - r10) * 2;
                short tmp341a = (r31 - r11) * 2;
                short tmp340b = r40 - r20;
                short tmp341b = r41 - r21;

                tmp[0][m][0] = r40 + r00 * 4 - r20 * 5;
                tmp[0][m][1] = r41 + r01 * 4 - r21 * 5;
                tmp[1][m][0] = tmp120b + tmp120a;
                tmp[1][m][1] = tmp121b + tmp121a;
                tmp[2][m][0] = tmp120b - tmp120a;
                tmp[2][m][1] = tmp121b - tmp121a;
                tmp[3][m][0] = tmp340b + tmp340a;
                tmp[3][m][1] = tmp341b + tmp341a;
                tmp[4][m][0] = tmp340b - tmp340a;
                tmp[4][m][1] = tmp341b - tmp341a;
                tmp[5][m][0] = r50 + r10 * 4 - r30 * 5;
                tmp[5][m][1] = r51 + r11 * 4 - r31 * 5;

                r0 += w;
            }

            short* p0 = (short*)B + kk * max_jj * 36 + jj * 2;
            short* p1 = p0 + max_jj * 2;
            short* p2 = p0 + max_jj * 2 * 2;
            short* p3 = p0 + max_jj * 2 * 3;
            short* p4 = p0 + max_jj * 2 * 4;
            short* p5 = p0 + max_jj * 2 * 5;

            for (int m = 0; m < 6; m++)
            {
                short r00 = tmp[m][0][0];
                short r01 = tmp[m][0][1];
                short r10 = tmp[m][1][0];
                short r11 = tmp[m][1][1];
                short r20 = tmp[m][2][0];
                short r21 = tmp[m][2][1];
                short r30 = tmp[m][3][0];
                short r31 = tmp[m][3][1];
                short r40 = tmp[m][4][0];
                short r41 = tmp[m][4][1];
                short r50 = tmp[m][5][0];
                short r51 = tmp[m][5][1];

                short tmp120a = r30 - r10 * 4;
                short tmp121a = r31 - r11 * 4;
                short tmp120b = r40 - r20 * 4;
                short tmp121b = r41 - r21 * 4;
                short tmp340a = (r30 - r10) * 2;
                short tmp341a = (r31 - r11) * 2;
                short tmp340b = r40 - r20;
                short tmp341b = r41 - r21;

                p0[0] = r40 + r00 * 4 - r20 * 5;
                p0[1] = r41 + r01 * 4 - r21 * 5;
                p1[0] = tmp120b + tmp120a;
                p1[1] = tmp121b + tmp121a;
                p2[0] = tmp120b - tmp120a;
                p2[1] = tmp121b - tmp121a;
                p3[0] = tmp340b + tmp340a;
                p3[1] = tmp341b + tmp341a;
                p4[0] = tmp340b - tmp340a;
                p4[1] = tmp341b - tmp341a;
                p5[0] = r50 + r10 * 4 - r30 * 5;
                p5[1] = r51 + r11 * 4 - r31 * 5;

                p0 += max_jj * 6 * 2;
                p1 += max_jj * 6 * 2;
                p2 += max_jj * 6 * 2;
                p3 += max_jj * 6 * 2;
                p4 += max_jj * 6 * 2;
                p5 += max_jj * 6 * 2;
            }
        }
    }
    remain_max_kk_start += nn_max_kk * 2;
    for (int kk = remain_max_kk_start; kk < max_kk; kk++)
    {
        short tmp[6][6];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const signed char* r0123 = bottom_blob.channel(k + kk).row<const signed char>(ti * 4) + (tj * 4);

            for (int m = 0; m < 6; m++)
            {
                signed char r0 = 0;
                signed char r1 = 0;
                signed char r2 = 0;
                signed char r3 = 0;
                signed char r4 = 0;
                signed char r5 = 0;

                if (ti * 4 + m < h)
                {
                    // if (elempack == 1)
                    {
                        r0 = r0123[0];
                        if (tj * 4 + 1 < w) r1 = r0123[1];
                        if (tj * 4 + 2 < w) r2 = r0123[2];
                        if (tj * 4 + 3 < w) r3 = r0123[3];
                        if (tj * 4 + 4 < w) r4 = r0123[4];
                        if (tj * 4 + 5 < w) r5 = r0123[5];
                    }
                }

                short tmp12a = r3 - r1 * 4;
                short tmp12b = r4 - r2 * 4;
                short tmp34a = (r3 - r1) * 2;
                short tmp34b = r4 - r2;

                tmp[0][m] = r4 + r0 * 4 - r2 * 5;
                tmp[1][m] = tmp12b + tmp12a;
                tmp[2][m] = tmp12b - tmp12a;
                tmp[3][m] = tmp34b + tmp34a;
                tmp[4][m] = tmp34b - tmp34a;
                tmp[5][m] = r5 + r1 * 4 - r3 * 5;

                r0123 += w;
            }

            short* p0 = (short*)B + kk * max_jj * 36 + jj;
            short* p1 = p0 + max_jj;
            short* p2 = p0 + max_jj * 2;
            short* p3 = p0 + max_jj * 3;
            short* p4 = p0 + max_jj * 4;
            short* p5 = p0 + max_jj * 5;

            for (int m = 0; m < 6; m++)
            {
                short r0 = tmp[m][0];
                short r1 = tmp[m][1];
                short r2 = tmp[m][2];
                short r3 = tmp[m][3];
                short r4 = tmp[m][4];
                short r5 = tmp[m][5];

                short tmp12a = r3 - r1 * 4;
                short tmp12b = r4 - r2 * 4;
                short tmp34a = (r3 - r1) * 2;
                short tmp34b = r4 - r2;

                p0[0] = r4 + r0 * 4 - r2 * 5;
                p1[0] = tmp12b + tmp12a;
                p2[0] = tmp12b - tmp12a;
                p3[0] = tmp34b + tmp34a;
                p4[0] = tmp34b - tmp34a;
                p5[0] = r5 + r1 * 4 - r3 * 5;

                p0 += max_jj * 6;
                p1 += max_jj * 6;
                p2 += max_jj * 6;
                p3 += max_jj * 6;
                p4 += max_jj * 6;
                p5 += max_jj * 6;
            }
        }
    }
}

static inline void conv3x3s1_winograd43_transform_output_tile_int8(const Mat& top_tile, Mat& top_blob, int i, int max_ii, int j, int max_jj)
{
    // const int otm[4][6] = {
    //     {1, 1,  1, 1,  1, 0},
    //     {0, 1, -1, 2, -2, 0},
    //     {0, 1,  1, 4,  4, 0},
    //     {0, 1, -1, 8, -8, 1}
    // };

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int out_elempack = top_blob.elempack;
    const size_t N = top_blob.cstep * out_elempack;

    const int w_tiles = (outw + 3) / 4;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        int tmp[4][6][8];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const int* r0 = (const int*)top_tile + ii * max_jj * 36 + jj * 8;
            const int* r1 = r0 + max_jj * 8;
            const int* r2 = r0 + max_jj * 8 * 2;
            const int* r3 = r0 + max_jj * 8 * 3;
            const int* r4 = r0 + max_jj * 8 * 4;
            const int* r5 = r0 + max_jj * 8 * 5;

            for (int m = 0; m < 5; m++)
            {
                int32x4_t _r00 = vld1q_s32(r0);
                int32x4_t _r01 = vld1q_s32(r0 + 4);
                int32x4_t _r10 = vld1q_s32(r1);
                int32x4_t _r11 = vld1q_s32(r1 + 4);
                int32x4_t _r20 = vld1q_s32(r2);
                int32x4_t _r21 = vld1q_s32(r2 + 4);
                int32x4_t _r30 = vld1q_s32(r3);
                int32x4_t _r31 = vld1q_s32(r3 + 4);
                int32x4_t _r40 = vld1q_s32(r4);
                int32x4_t _r41 = vld1q_s32(r4 + 4);
                int32x4_t _r50 = vld1q_s32(r5);
                int32x4_t _r51 = vld1q_s32(r5 + 4);

                int32x4_t _tmp02a0 = vaddq_s32(_r10, _r20);
                int32x4_t _tmp02a1 = vaddq_s32(_r11, _r21);
                int32x4_t _tmp02b0 = vaddq_s32(_r30, _r40);
                int32x4_t _tmp02b1 = vaddq_s32(_r31, _r41);
                int32x4_t _tmp13a0 = vsubq_s32(_r10, _r20);
                int32x4_t _tmp13a1 = vsubq_s32(_r11, _r21);
                int32x4_t _tmp13b0 = vsubq_s32(_r30, _r40);
                int32x4_t _tmp13b1 = vsubq_s32(_r31, _r41);

                int32x4_t _tmp00 = vaddq_s32(vaddq_s32(_tmp02a0, _tmp02b0), _r00);
                int32x4_t _tmp01 = vaddq_s32(vaddq_s32(_tmp02a1, _tmp02b1), _r01);
                int32x4_t _tmp10 = vaddq_s32(_tmp13a0, vshlq_n_s32(_tmp13b0, 1));
                int32x4_t _tmp11 = vaddq_s32(_tmp13a1, vshlq_n_s32(_tmp13b1, 1));
                int32x4_t _tmp20 = vaddq_s32(_tmp02a0, vshlq_n_s32(_tmp02b0, 2));
                int32x4_t _tmp21 = vaddq_s32(_tmp02a1, vshlq_n_s32(_tmp02b1, 2));
                int32x4_t _tmp30 = vaddq_s32(vaddq_s32(_tmp13a0, vshlq_n_s32(_tmp13b0, 3)), vshlq_n_s32(_r50, 2));
                int32x4_t _tmp31 = vaddq_s32(vaddq_s32(_tmp13a1, vshlq_n_s32(_tmp13b1, 3)), vshlq_n_s32(_r51, 2));

                vst1q_s32(tmp[0][m], _tmp00);
                vst1q_s32(tmp[0][m] + 4, _tmp01);
                vst1q_s32(tmp[1][m], _tmp10);
                vst1q_s32(tmp[1][m] + 4, _tmp11);
                vst1q_s32(tmp[2][m], _tmp20);
                vst1q_s32(tmp[2][m] + 4, _tmp21);
                vst1q_s32(tmp[3][m], _tmp30);
                vst1q_s32(tmp[3][m] + 4, _tmp31);

                r0 += max_jj * 6 * 8;
                r1 += max_jj * 6 * 8;
                r2 += max_jj * 6 * 8;
                r3 += max_jj * 6 * 8;
                r4 += max_jj * 6 * 8;
                r5 += max_jj * 6 * 8;
            }
            for (int m = 5; m < 6; m++)
            {
                int32x4_t _r00 = vld1q_s32(r0);
                int32x4_t _r01 = vld1q_s32(r0 + 4);
                int32x4_t _r10 = vld1q_s32(r1);
                int32x4_t _r11 = vld1q_s32(r1 + 4);
                int32x4_t _r20 = vld1q_s32(r2);
                int32x4_t _r21 = vld1q_s32(r2 + 4);
                int32x4_t _r30 = vld1q_s32(r3);
                int32x4_t _r31 = vld1q_s32(r3 + 4);
                int32x4_t _r40 = vld1q_s32(r4);
                int32x4_t _r41 = vld1q_s32(r4 + 4);
                int32x4_t _r50 = vld1q_s32(r5);
                int32x4_t _r51 = vld1q_s32(r5 + 4);

                int32x4_t _tmp02a0 = vaddq_s32(_r10, _r20);
                int32x4_t _tmp02a1 = vaddq_s32(_r11, _r21);
                int32x4_t _tmp02b0 = vaddq_s32(_r30, _r40);
                int32x4_t _tmp02b1 = vaddq_s32(_r31, _r41);
                int32x4_t _tmp13a0 = vsubq_s32(_r10, _r20);
                int32x4_t _tmp13a1 = vsubq_s32(_r11, _r21);
                int32x4_t _tmp13b0 = vsubq_s32(_r30, _r40);
                int32x4_t _tmp13b1 = vsubq_s32(_r31, _r41);

                int32x4_t _tmp00 = vaddq_s32(vaddq_s32(_tmp02a0, _tmp02b0), _r00);
                int32x4_t _tmp01 = vaddq_s32(vaddq_s32(_tmp02a1, _tmp02b1), _r01);
                int32x4_t _tmp10 = vaddq_s32(_tmp13a0, vshlq_n_s32(_tmp13b0, 1));
                int32x4_t _tmp11 = vaddq_s32(_tmp13a1, vshlq_n_s32(_tmp13b1, 1));
                int32x4_t _tmp20 = vaddq_s32(_tmp02a0, vshlq_n_s32(_tmp02b0, 2));
                int32x4_t _tmp21 = vaddq_s32(_tmp02a1, vshlq_n_s32(_tmp02b1, 2));
                int32x4_t _tmp30 = vaddq_s32(vaddq_s32(_tmp13a0, vshlq_n_s32(_tmp13b0, 3)), vshlq_n_s32(_r50, 2));
                int32x4_t _tmp31 = vaddq_s32(vaddq_s32(_tmp13a1, vshlq_n_s32(_tmp13b1, 3)), vshlq_n_s32(_r51, 2));

                _tmp00 = vshlq_n_s32(_tmp00, 2);
                _tmp01 = vshlq_n_s32(_tmp01, 2);
                _tmp10 = vshlq_n_s32(_tmp10, 2);
                _tmp11 = vshlq_n_s32(_tmp11, 2);
                _tmp20 = vshlq_n_s32(_tmp20, 2);
                _tmp21 = vshlq_n_s32(_tmp21, 2);
                _tmp30 = vshlq_n_s32(_tmp30, 2);
                _tmp31 = vshlq_n_s32(_tmp31, 2);

                vst1q_s32(tmp[0][m], _tmp00);
                vst1q_s32(tmp[0][m] + 4, _tmp01);
                vst1q_s32(tmp[1][m], _tmp10);
                vst1q_s32(tmp[1][m] + 4, _tmp11);
                vst1q_s32(tmp[2][m], _tmp20);
                vst1q_s32(tmp[2][m] + 4, _tmp21);
                vst1q_s32(tmp[3][m], _tmp30);
                vst1q_s32(tmp[3][m] + 4, _tmp31);

                r0 += max_jj * 6 * 8;
                r1 += max_jj * 6 * 8;
                r2 += max_jj * 6 * 8;
                r3 += max_jj * 6 * 8;
                r4 += max_jj * 6 * 8;
                r5 += max_jj * 6 * 8;
            }

            int* outptr0 = top_blob.channel((i + ii) / out_elempack).row<int>(ti * 4) + (tj * 4) * out_elempack;

            for (int m = 0; m < 4; m++)
            {
                if (ti * 4 + m >= outh)
                    continue;

                int32x4_t _r00 = vld1q_s32(tmp[m][0]);
                int32x4_t _r01 = vld1q_s32(tmp[m][0] + 4);
                int32x4_t _r10 = vld1q_s32(tmp[m][1]);
                int32x4_t _r11 = vld1q_s32(tmp[m][1] + 4);
                int32x4_t _r20 = vld1q_s32(tmp[m][2]);
                int32x4_t _r21 = vld1q_s32(tmp[m][2] + 4);
                int32x4_t _r30 = vld1q_s32(tmp[m][3]);
                int32x4_t _r31 = vld1q_s32(tmp[m][3] + 4);
                int32x4_t _r40 = vld1q_s32(tmp[m][4]);
                int32x4_t _r41 = vld1q_s32(tmp[m][4] + 4);
                int32x4_t _r50 = vld1q_s32(tmp[m][5]);
                int32x4_t _r51 = vld1q_s32(tmp[m][5] + 4);

                int32x4_t _tmp02a0 = vaddq_s32(_r10, _r20);
                int32x4_t _tmp02a1 = vaddq_s32(_r11, _r21);
                int32x4_t _tmp02b0 = vaddq_s32(_r30, _r40);
                int32x4_t _tmp02b1 = vaddq_s32(_r31, _r41);
                int32x4_t _tmp13a0 = vsubq_s32(_r10, _r20);
                int32x4_t _tmp13a1 = vsubq_s32(_r11, _r21);
                int32x4_t _tmp13b0 = vsubq_s32(_r30, _r40);
                int32x4_t _tmp13b1 = vsubq_s32(_r31, _r41);

                int32x4_t _tmp00 = vaddq_s32(vaddq_s32(_tmp02a0, _tmp02b0), _r00);
                int32x4_t _tmp01 = vaddq_s32(vaddq_s32(_tmp02a1, _tmp02b1), _r01);
                int32x4_t _tmp10 = vaddq_s32(_tmp13a0, vshlq_n_s32(_tmp13b0, 1));
                int32x4_t _tmp11 = vaddq_s32(_tmp13a1, vshlq_n_s32(_tmp13b1, 1));
                int32x4_t _tmp20 = vaddq_s32(_tmp02a0, vshlq_n_s32(_tmp02b0, 2));
                int32x4_t _tmp21 = vaddq_s32(_tmp02a1, vshlq_n_s32(_tmp02b1, 2));
                int32x4_t _tmp30 = vaddq_s32(vaddq_s32(_tmp13a0, vshlq_n_s32(_tmp13b0, 3)), _r50);
                int32x4_t _tmp31 = vaddq_s32(vaddq_s32(_tmp13a1, vshlq_n_s32(_tmp13b1, 3)), _r51);

                // TODO use integer trick for division by 576
                float32x4_t _v576 = vdupq_n_f32(1.0 / 576);
                _tmp00 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(_tmp00), _v576));
                _tmp01 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(_tmp01), _v576));
                _tmp10 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(_tmp10), _v576));
                _tmp11 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(_tmp11), _v576));
                _tmp20 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(_tmp20), _v576));
                _tmp21 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(_tmp21), _v576));
                _tmp30 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(_tmp30), _v576));
                _tmp31 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(_tmp31), _v576));

                if (out_elempack == 8)
                {
                    vst1q_s32(outptr0, _tmp00);
                    vst1q_s32(outptr0 + 4, _tmp01);
                    if (tj * 4 + 1 < outw)
                    {
                        vst1q_s32(outptr0 + 8, _tmp10);
                        vst1q_s32(outptr0 + 12, _tmp11);
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        vst1q_s32(outptr0 + 16, _tmp20);
                        vst1q_s32(outptr0 + 20, _tmp21);
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        vst1q_s32(outptr0 + 24, _tmp30);
                        vst1q_s32(outptr0 + 28, _tmp31);
                    }
                }
                if (out_elempack == 4)
                {
                    int* outptr1 = outptr0 + N;

                    vst1q_s32(outptr0, _tmp00);
                    vst1q_s32(outptr1, _tmp01);
                    if (tj * 4 + 1 < outw)
                    {
                        vst1q_s32(outptr0 + 4, _tmp10);
                        vst1q_s32(outptr1 + 4, _tmp11);
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        vst1q_s32(outptr0 + 8, _tmp20);
                        vst1q_s32(outptr1 + 8, _tmp21);
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        vst1q_s32(outptr0 + 12, _tmp30);
                        vst1q_s32(outptr1 + 12, _tmp31);
                    }
                }
                if (out_elempack == 1)
                {
                    int* outptr1 = outptr0 + N;
                    int* outptr2 = outptr0 + N * 2;
                    int* outptr3 = outptr0 + N * 3;
                    int* outptr4 = outptr0 + N * 4;
                    int* outptr5 = outptr0 + N * 5;
                    int* outptr6 = outptr0 + N * 6;
                    int* outptr7 = outptr0 + N * 7;

                    outptr0[0] = vgetq_lane_s32(_tmp00, 0);
                    outptr1[0] = vgetq_lane_s32(_tmp00, 1);
                    outptr2[0] = vgetq_lane_s32(_tmp00, 2);
                    outptr3[0] = vgetq_lane_s32(_tmp00, 3);
                    outptr4[0] = vgetq_lane_s32(_tmp01, 0);
                    outptr5[0] = vgetq_lane_s32(_tmp01, 1);
                    outptr6[0] = vgetq_lane_s32(_tmp01, 2);
                    outptr7[0] = vgetq_lane_s32(_tmp01, 3);
                    if (tj * 4 + 1 < outw)
                    {
                        outptr0[1] = vgetq_lane_s32(_tmp10, 0);
                        outptr1[1] = vgetq_lane_s32(_tmp10, 1);
                        outptr2[1] = vgetq_lane_s32(_tmp10, 2);
                        outptr3[1] = vgetq_lane_s32(_tmp10, 3);
                        outptr4[1] = vgetq_lane_s32(_tmp11, 0);
                        outptr5[1] = vgetq_lane_s32(_tmp11, 1);
                        outptr6[1] = vgetq_lane_s32(_tmp11, 2);
                        outptr7[1] = vgetq_lane_s32(_tmp11, 3);
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        outptr0[2] = vgetq_lane_s32(_tmp20, 0);
                        outptr1[2] = vgetq_lane_s32(_tmp20, 1);
                        outptr2[2] = vgetq_lane_s32(_tmp20, 2);
                        outptr3[2] = vgetq_lane_s32(_tmp20, 3);
                        outptr4[2] = vgetq_lane_s32(_tmp21, 0);
                        outptr5[2] = vgetq_lane_s32(_tmp21, 1);
                        outptr6[2] = vgetq_lane_s32(_tmp21, 2);
                        outptr7[2] = vgetq_lane_s32(_tmp21, 3);
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        outptr0[3] = vgetq_lane_s32(_tmp30, 0);
                        outptr1[3] = vgetq_lane_s32(_tmp30, 1);
                        outptr2[3] = vgetq_lane_s32(_tmp30, 2);
                        outptr3[3] = vgetq_lane_s32(_tmp30, 3);
                        outptr4[3] = vgetq_lane_s32(_tmp31, 0);
                        outptr5[3] = vgetq_lane_s32(_tmp31, 1);
                        outptr6[3] = vgetq_lane_s32(_tmp31, 2);
                        outptr7[3] = vgetq_lane_s32(_tmp31, 3);
                    }
                }

                outptr0 += outw * out_elempack;
            }
        }
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        int tmp[4][6][4];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const int* r0 = (const int*)top_tile + ii * max_jj * 36 + jj * 4;
            const int* r1 = r0 + max_jj * 4;
            const int* r2 = r0 + max_jj * 4 * 2;
            const int* r3 = r0 + max_jj * 4 * 3;
            const int* r4 = r0 + max_jj * 4 * 4;
            const int* r5 = r0 + max_jj * 4 * 5;

            for (int m = 0; m < 5; m++)
            {
                int32x4_t _r0 = vld1q_s32(r0);
                int32x4_t _r1 = vld1q_s32(r1);
                int32x4_t _r2 = vld1q_s32(r2);
                int32x4_t _r3 = vld1q_s32(r3);
                int32x4_t _r4 = vld1q_s32(r4);
                int32x4_t _r5 = vld1q_s32(r5);

                int32x4_t _tmp02a = vaddq_s32(_r1, _r2);
                int32x4_t _tmp02b = vaddq_s32(_r3, _r4);
                int32x4_t _tmp13a = vsubq_s32(_r1, _r2);
                int32x4_t _tmp13b = vsubq_s32(_r3, _r4);

                int32x4_t _tmp0 = vaddq_s32(vaddq_s32(_tmp02a, _tmp02b), _r0);
                int32x4_t _tmp1 = vaddq_s32(_tmp13a, vshlq_n_s32(_tmp13b, 1));
                int32x4_t _tmp2 = vaddq_s32(_tmp02a, vshlq_n_s32(_tmp02b, 2));
                int32x4_t _tmp3 = vaddq_s32(vaddq_s32(_tmp13a, vshlq_n_s32(_tmp13b, 3)), vshlq_n_s32(_r5, 2));

                vst1q_s32(tmp[0][m], _tmp0);
                vst1q_s32(tmp[1][m], _tmp1);
                vst1q_s32(tmp[2][m], _tmp2);
                vst1q_s32(tmp[3][m], _tmp3);

                r0 += max_jj * 6 * 4;
                r1 += max_jj * 6 * 4;
                r2 += max_jj * 6 * 4;
                r3 += max_jj * 6 * 4;
                r4 += max_jj * 6 * 4;
                r5 += max_jj * 6 * 4;
            }
            for (int m = 5; m < 6; m++)
            {
                int32x4_t _r0 = vld1q_s32(r0);
                int32x4_t _r1 = vld1q_s32(r1);
                int32x4_t _r2 = vld1q_s32(r2);
                int32x4_t _r3 = vld1q_s32(r3);
                int32x4_t _r4 = vld1q_s32(r4);
                int32x4_t _r5 = vld1q_s32(r5);

                int32x4_t _tmp02a = vaddq_s32(_r1, _r2);
                int32x4_t _tmp02b = vaddq_s32(_r3, _r4);
                int32x4_t _tmp13a = vsubq_s32(_r1, _r2);
                int32x4_t _tmp13b = vsubq_s32(_r3, _r4);

                int32x4_t _tmp0 = vaddq_s32(vaddq_s32(_tmp02a, _tmp02b), _r0);
                int32x4_t _tmp1 = vaddq_s32(_tmp13a, vshlq_n_s32(_tmp13b, 1));
                int32x4_t _tmp2 = vaddq_s32(_tmp02a, vshlq_n_s32(_tmp02b, 2));
                int32x4_t _tmp3 = vaddq_s32(vaddq_s32(_tmp13a, vshlq_n_s32(_tmp13b, 3)), vshlq_n_s32(_r5, 2));

                _tmp0 = vshlq_n_s32(_tmp0, 2);
                _tmp1 = vshlq_n_s32(_tmp1, 2);
                _tmp2 = vshlq_n_s32(_tmp2, 2);
                _tmp3 = vshlq_n_s32(_tmp3, 2);

                vst1q_s32(tmp[0][m], _tmp0);
                vst1q_s32(tmp[1][m], _tmp1);
                vst1q_s32(tmp[2][m], _tmp2);
                vst1q_s32(tmp[3][m], _tmp3);

                r0 += max_jj * 6 * 4;
                r1 += max_jj * 6 * 4;
                r2 += max_jj * 6 * 4;
                r3 += max_jj * 6 * 4;
                r4 += max_jj * 6 * 4;
                r5 += max_jj * 6 * 4;
            }

            int* outptr0 = top_blob.channel((i + ii) / out_elempack).row<int>(ti * 4) + (tj * 4) * out_elempack;

            for (int m = 0; m < 4; m++)
            {
                if (ti * 4 + m >= outh)
                    continue;

                int32x4_t _r0 = vld1q_s32(tmp[m][0]);
                int32x4_t _r1 = vld1q_s32(tmp[m][1]);
                int32x4_t _r2 = vld1q_s32(tmp[m][2]);
                int32x4_t _r3 = vld1q_s32(tmp[m][3]);
                int32x4_t _r4 = vld1q_s32(tmp[m][4]);
                int32x4_t _r5 = vld1q_s32(tmp[m][5]);

                int32x4_t _tmp02a = vaddq_s32(_r1, _r2);
                int32x4_t _tmp02b = vaddq_s32(_r3, _r4);
                int32x4_t _tmp13a = vsubq_s32(_r1, _r2);
                int32x4_t _tmp13b = vsubq_s32(_r3, _r4);

                int32x4_t _tmp0 = vaddq_s32(vaddq_s32(_tmp02a, _tmp02b), _r0);
                int32x4_t _tmp1 = vaddq_s32(_tmp13a, vshlq_n_s32(_tmp13b, 1));
                int32x4_t _tmp2 = vaddq_s32(_tmp02a, vshlq_n_s32(_tmp02b, 2));
                int32x4_t _tmp3 = vaddq_s32(vaddq_s32(_tmp13a, vshlq_n_s32(_tmp13b, 3)), _r5);

                // TODO use integer trick for division by 576
                float32x4_t _v576 = vdupq_n_f32(1.0 / 576);
                _tmp0 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(_tmp0), _v576));
                _tmp1 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(_tmp1), _v576));
                _tmp2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(_tmp2), _v576));
                _tmp3 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(_tmp3), _v576));

                if (out_elempack == 4)
                {
                    vst1q_s32(outptr0, _tmp0);
                    if (tj * 4 + 1 < outw) vst1q_s32(outptr0 + 4, _tmp1);
                    if (tj * 4 + 2 < outw) vst1q_s32(outptr0 + 8, _tmp2);
                    if (tj * 4 + 3 < outw) vst1q_s32(outptr0 + 12, _tmp3);
                }
                if (out_elempack == 1)
                {
                    int* outptr1 = outptr0 + N;
                    int* outptr2 = outptr0 + N * 2;
                    int* outptr3 = outptr0 + N * 3;

                    outptr0[0] = vgetq_lane_s32(_tmp0, 0);
                    outptr1[0] = vgetq_lane_s32(_tmp0, 1);
                    outptr2[0] = vgetq_lane_s32(_tmp0, 2);
                    outptr3[0] = vgetq_lane_s32(_tmp0, 3);
                    if (tj * 4 + 1 < outw)
                    {
                        outptr0[1] = vgetq_lane_s32(_tmp1, 0);
                        outptr1[1] = vgetq_lane_s32(_tmp1, 1);
                        outptr2[1] = vgetq_lane_s32(_tmp1, 2);
                        outptr3[1] = vgetq_lane_s32(_tmp1, 3);
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        outptr0[2] = vgetq_lane_s32(_tmp2, 0);
                        outptr1[2] = vgetq_lane_s32(_tmp2, 1);
                        outptr2[2] = vgetq_lane_s32(_tmp2, 2);
                        outptr3[2] = vgetq_lane_s32(_tmp2, 3);
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        outptr0[3] = vgetq_lane_s32(_tmp3, 0);
                        outptr1[3] = vgetq_lane_s32(_tmp3, 1);
                        outptr2[3] = vgetq_lane_s32(_tmp3, 2);
                        outptr3[3] = vgetq_lane_s32(_tmp3, 3);
                    }
                }

                outptr0 += outw * out_elempack;
            }
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        int tmp[4][6][2];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const int* r0 = (const int*)top_tile + ii * max_jj * 36 + jj * 2;
            const int* r1 = r0 + max_jj * 2;
            const int* r2 = r0 + max_jj * 2 * 2;
            const int* r3 = r0 + max_jj * 2 * 3;
            const int* r4 = r0 + max_jj * 2 * 4;
            const int* r5 = r0 + max_jj * 2 * 5;

            for (int m = 0; m < 5; m++)
            {
                int tmp02a0 = r1[0] + r2[0];
                int tmp02a1 = r1[1] + r2[1];
                int tmp02b0 = r3[0] + r4[0];
                int tmp02b1 = r3[1] + r4[1];
                int tmp13a0 = r1[0] - r2[0];
                int tmp13a1 = r1[1] - r2[1];
                int tmp13b0 = r3[0] - r4[0];
                int tmp13b1 = r3[1] - r4[1];

                int tmp00 = tmp02a0 + tmp02b0 + r0[0];
                int tmp01 = tmp02a1 + tmp02b1 + r0[1];
                int tmp10 = tmp13a0 + tmp13b0 * 2;
                int tmp11 = tmp13a1 + tmp13b1 * 2;
                int tmp20 = tmp02a0 + tmp02b0 * 4;
                int tmp21 = tmp02a1 + tmp02b1 * 4;
                int tmp30 = tmp13a0 + tmp13b0 * 8 + r5[0] * 4;
                int tmp31 = tmp13a1 + tmp13b1 * 8 + r5[1] * 4;

                tmp[0][m][0] = tmp00;
                tmp[0][m][1] = tmp01;
                tmp[1][m][0] = tmp10;
                tmp[1][m][1] = tmp11;
                tmp[2][m][0] = tmp20;
                tmp[2][m][1] = tmp21;
                tmp[3][m][0] = tmp30;
                tmp[3][m][1] = tmp31;

                r0 += max_jj * 6 * 2;
                r1 += max_jj * 6 * 2;
                r2 += max_jj * 6 * 2;
                r3 += max_jj * 6 * 2;
                r4 += max_jj * 6 * 2;
                r5 += max_jj * 6 * 2;
            }
            for (int m = 5; m < 6; m++)
            {
                int tmp02a0 = r1[0] + r2[0];
                int tmp02a1 = r1[1] + r2[1];
                int tmp02b0 = r3[0] + r4[0];
                int tmp02b1 = r3[1] + r4[1];
                int tmp13a0 = r1[0] - r2[0];
                int tmp13a1 = r1[1] - r2[1];
                int tmp13b0 = r3[0] - r4[0];
                int tmp13b1 = r3[1] - r4[1];

                int tmp00 = tmp02a0 + tmp02b0 + r0[0];
                int tmp01 = tmp02a1 + tmp02b1 + r0[1];
                int tmp10 = tmp13a0 + tmp13b0 * 2;
                int tmp11 = tmp13a1 + tmp13b1 * 2;
                int tmp20 = tmp02a0 + tmp02b0 * 4;
                int tmp21 = tmp02a1 + tmp02b1 * 4;
                int tmp30 = tmp13a0 + tmp13b0 * 8 + r5[0] * 4;
                int tmp31 = tmp13a1 + tmp13b1 * 8 + r5[1] * 4;

                tmp00 = tmp00 * 4;
                tmp01 = tmp01 * 4;
                tmp10 = tmp10 * 4;
                tmp11 = tmp11 * 4;
                tmp20 = tmp20 * 4;
                tmp21 = tmp21 * 4;
                tmp30 = tmp30 * 4;
                tmp31 = tmp31 * 4;

                tmp[0][m][0] = tmp00;
                tmp[0][m][1] = tmp01;
                tmp[1][m][0] = tmp10;
                tmp[1][m][1] = tmp11;
                tmp[2][m][0] = tmp20;
                tmp[2][m][1] = tmp21;
                tmp[3][m][0] = tmp30;
                tmp[3][m][1] = tmp31;

                r0 += max_jj * 6 * 2;
                r1 += max_jj * 6 * 2;
                r2 += max_jj * 6 * 2;
                r3 += max_jj * 6 * 2;
                r4 += max_jj * 6 * 2;
                r5 += max_jj * 6 * 2;
            }

            int* outptr0 = top_blob.channel(i + ii).row<int>(ti * 4) + (tj * 4);

            for (int m = 0; m < 4; m++)
            {
                if (ti * 4 + m >= outh)
                    continue;

                int tmp02a0 = tmp[m][1][0] + tmp[m][2][0];
                int tmp02a1 = tmp[m][1][1] + tmp[m][2][1];
                int tmp02b0 = tmp[m][3][0] + tmp[m][4][0];
                int tmp02b1 = tmp[m][3][1] + tmp[m][4][1];
                int tmp13a0 = tmp[m][1][0] - tmp[m][2][0];
                int tmp13a1 = tmp[m][1][1] - tmp[m][2][1];
                int tmp13b0 = tmp[m][3][0] - tmp[m][4][0];
                int tmp13b1 = tmp[m][3][1] - tmp[m][4][1];

                int tmp00 = tmp02a0 + tmp02b0 + tmp[m][0][0];
                int tmp01 = tmp02a1 + tmp02b1 + tmp[m][0][1];
                int tmp10 = tmp13a0 + tmp13b0 * 2;
                int tmp11 = tmp13a1 + tmp13b1 * 2;
                int tmp20 = tmp02a0 + tmp02b0 * 4;
                int tmp21 = tmp02a1 + tmp02b1 * 4;
                int tmp30 = tmp13a0 + tmp13b0 * 8 + tmp[m][5][0];
                int tmp31 = tmp13a1 + tmp13b1 * 8 + tmp[m][5][1];

                tmp00 = tmp00 / 576;
                tmp01 = tmp01 / 576;
                tmp10 = tmp10 / 576;
                tmp11 = tmp11 / 576;
                tmp20 = tmp20 / 576;
                tmp21 = tmp21 / 576;
                tmp30 = tmp30 / 576;
                tmp31 = tmp31 / 576;

                // if (out_elempack == 1)
                {
                    int* outptr1 = outptr0 + N;

                    outptr0[0] = tmp00;
                    outptr1[0] = tmp01;
                    if (tj * 4 + 1 < outw)
                    {
                        outptr0[1] = tmp10;
                        outptr1[1] = tmp11;
                    }
                    if (tj * 4 + 2 < outw)
                    {
                        outptr0[2] = tmp20;
                        outptr1[2] = tmp21;
                    }
                    if (tj * 4 + 3 < outw)
                    {
                        outptr0[3] = tmp30;
                        outptr1[3] = tmp31;
                    }
                }

                outptr0 += outw;
            }
        }
    }
    for (; ii < max_ii; ii++)
    {
        int tmp[4][6];

        int jj = 0;
        for (; jj < max_jj; jj++)
        {
            int ti = (j + jj) / w_tiles;
            int tj = (j + jj) % w_tiles;

            const int* r0 = (const int*)top_tile + ii * max_jj * 36 + jj;
            const int* r1 = r0 + max_jj;
            const int* r2 = r0 + max_jj * 2;
            const int* r3 = r0 + max_jj * 3;
            const int* r4 = r0 + max_jj * 4;
            const int* r5 = r0 + max_jj * 5;

            for (int m = 0; m < 5; m++)
            {
                int tmp02a = r1[0] + r2[0];
                int tmp02b = r3[0] + r4[0];
                int tmp13a = r1[0] - r2[0];
                int tmp13b = r3[0] - r4[0];

                int tmp0 = tmp02a + tmp02b + r0[0];
                int tmp1 = tmp13a + tmp13b * 2;
                int tmp2 = tmp02a + tmp02b * 4;
                int tmp3 = tmp13a + tmp13b * 8 + r5[0] * 4;

                tmp[0][m] = tmp0;
                tmp[1][m] = tmp1;
                tmp[2][m] = tmp2;
                tmp[3][m] = tmp3;

                r0 += max_jj * 6;
                r1 += max_jj * 6;
                r2 += max_jj * 6;
                r3 += max_jj * 6;
                r4 += max_jj * 6;
                r5 += max_jj * 6;
            }
            for (int m = 5; m < 6; m++)
            {
                int tmp02a = r1[0] + r2[0];
                int tmp02b = r3[0] + r4[0];
                int tmp13a = r1[0] - r2[0];
                int tmp13b = r3[0] - r4[0];

                int tmp0 = tmp02a + tmp02b + r0[0];
                int tmp1 = tmp13a + tmp13b * 2;
                int tmp2 = tmp02a + tmp02b * 4;
                int tmp3 = tmp13a + tmp13b * 8 + r5[0] * 4;

                tmp0 = tmp0 * 4;
                tmp1 = tmp1 * 4;
                tmp2 = tmp2 * 4;
                tmp3 = tmp3 * 4;

                tmp[0][m] = tmp0;
                tmp[1][m] = tmp1;
                tmp[2][m] = tmp2;
                tmp[3][m] = tmp3;

                r0 += max_jj * 6;
                r1 += max_jj * 6;
                r2 += max_jj * 6;
                r3 += max_jj * 6;
                r4 += max_jj * 6;
                r5 += max_jj * 6;
            }

            int* outptr0 = top_blob.channel(i + ii).row<int>(ti * 4) + (tj * 4);

            for (int m = 0; m < 4; m++)
            {
                if (ti * 4 + m >= outh)
                    continue;

                int tmp02a = tmp[m][1] + tmp[m][2];
                int tmp02b = tmp[m][3] + tmp[m][4];
                int tmp13a = tmp[m][1] - tmp[m][2];
                int tmp13b = tmp[m][3] - tmp[m][4];

                int tmp0 = tmp02a + tmp02b + tmp[m][0];
                int tmp1 = tmp13a + tmp13b * 2;
                int tmp2 = tmp02a + tmp02b * 4;
                int tmp3 = tmp13a + tmp13b * 8 + tmp[m][5];

                tmp0 = tmp0 / 576;
                tmp1 = tmp1 / 576;
                tmp2 = tmp2 / 576;
                tmp3 = tmp3 / 576;

                // if (out_elempack == 1)
                {
                    outptr0[0] = tmp0;
                    if (tj * 4 + 1 < outw) outptr0[1] = tmp1;
                    if (tj * 4 + 2 < outw) outptr0[2] = tmp2;
                    if (tj * 4 + 3 < outw) outptr0[3] = tmp3;
                }

                outptr0 += outw;
            }
        }
    }
}

// 3x3 stride-1 int8 convolution via winograd F(4,3), organised as a tiled GEMM
// over the 36 transform positions of each tile.
//
// Steps, in order:
//   1. each (N tile, K tile) of bottom_blob goes through the input transform and
//      is repacked into BT
//   2. bottom_blob is released
//   3. each M tile accumulates the packed GEMM over every K tile into a
//      per-thread scratch tile, which the output transform then writes to top_blob
//
// bottom_blob  input; NOTE it is released inside this function
// top_blob     output; its w / h / c / elempack define the problem size
// AT           addressed as AT.channel(M tile).depth(K tile)
//              (presumably the pre-transformed, pre-packed kernel - confirm with caller)
// nT           thread count used for the parallel loops and the scratch buffers
// opt          only opt.workspace_allocator is used here
//
// Returns 0 on success, -100 when a workspace Mat comes back empty.
static int conv3x3s1_winograd43_int8(Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt)
{
    // F(4,3) emits 4x4 output tiles; round the tile grid up so it covers the output
    const int tiles_w = (top_blob.w + 3) / 4;
    const int tiles_h = (top_blob.h + 3) / 4;

    // GEMM dimensions, B is the number of transform positions per tile
    const int B = 36;
    const int M = top_blob.c * top_blob.elempack;
    const int N = tiles_w * tiles_h;
    const int K = bottom_blob.c * bottom_blob.elempack;

    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_int8(M, N, K, TILE_M, TILE_N, TILE_K, nT);

    // number of tiles along each GEMM dimension, rounded up
    const int count_M = (M + TILE_M - 1) / TILE_M;
    const int count_N = (N + TILE_N - 1) / TILE_N;
    const int count_K = (K + TILE_K - 1) / TILE_K;
    const int count_NK = count_N * count_K;

    // transformed and packed input: channel = N tile, depth = K tile
    Mat BT(TILE_K * TILE_N, B, count_K, count_N, 2u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    if (nT <= 1 || count_NK >= nT)
    {
        // parallelise across the (N tile, K tile) pairs; every thread owns one
        // scratch channel and runs the transforms with a thread count of 1
        Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator);
        if (B_tileX.empty())
            return -100;

        #pragma omp parallel for num_threads(nT)
        for (int idx = 0; idx < count_NK; idx++)
        {
            const int pn = idx / count_K;
            const int pk = idx % count_K;

            const int j = pn * TILE_N;
            const int k = pk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            Mat B_tile = B_tileX.channel(get_omp_thread_num());
            Mat BT_tile = BT.channel(pn).depth(pk);

            conv3x3s1_winograd43_transform_input_tile_int8(bottom_blob, B_tile, j, max_jj, k, max_kk, 1);

            transpose_pack_B_tile_int8(B_tile, BT_tile, B, max_jj, max_kk, 1);
        }
    }
    else
    {
        // fewer (N tile, K tile) pairs than threads: walk the pairs serially
        // and hand the full thread count to the transforms instead
        Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator);
        if (B_tile.empty())
            return -100;

        for (int idx = 0; idx < count_NK; idx++)
        {
            const int pn = idx / count_K;
            const int pk = idx % count_K;

            const int j = pn * TILE_N;
            const int k = pk * TILE_K;

            const int max_jj = std::min((N - j), TILE_N);
            const int max_kk = std::min((K - k), TILE_K);

            Mat BT_tile = BT.channel(pn).depth(pk);

            conv3x3s1_winograd43_transform_input_tile_int8(bottom_blob, B_tile, j, max_jj, k, max_kk, nT);

            transpose_pack_B_tile_int8(B_tile, BT_tile, B, max_jj, max_kk, nT);
        }
    }

    // everything needed from the input now lives in BT
    bottom_blob.release();

    // per-thread GEMM accumulator, one channel per thread
    Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator);
    if (top_tileX.empty())
        return -100;

    #pragma omp parallel for num_threads(nT)
    for (int pm = 0; pm < count_M; pm++)
    {
        const int i = pm * TILE_M;
        const int max_ii = std::min((M - i), TILE_M);

        Mat top_tile = top_tileX.channel(get_omp_thread_num());

        for (int pn = 0; pn < count_N; pn++)
        {
            const int j = pn * TILE_N;
            const int max_jj = std::min((N - j), TILE_N);

            // the current K offset is passed to the gemm along with the tile
            for (int pk = 0; pk < count_K; pk++)
            {
                const int k = pk * TILE_K;
                const int max_kk = std::min((K - k), TILE_K);

                const Mat AT_tile = AT.channel(pm).depth(pk);
                const Mat BT_tile = BT.channel(pn).depth(pk);

                gemm_transB_packed_tile_int8(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk);
            }

            // fold the finished accumulator tile back into the output blob
            conv3x3s1_winograd43_transform_output_tile_int8(top_tile, top_blob, i, max_ii, j, max_jj);
        }
    }

    return 0;
}


================================================
FILE: src/layer/arm/convolution_4x4.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Direct float convolution with a 4x4 kernel that moves in steps of 4 columns
// and 4 rows.
//
// Every output channel p is first filled with its bias (0 when _bias is null)
// and then accumulates, for each input channel q, the dot product of the 16
// kernel weights at kernel + (p * inch + q) * 16 with the matching 4x4 input
// window.
//
// bottom_blob  input, read as float through channel(q) with row length w
// top_blob     output; w / h / c are read here, so it is presumably allocated
//              by the caller - confirm
// _kernel      weights, 16 consecutive floats per (p, q) pair, p-major
// _bias        one float per output channel, or null
// opt          only opt.num_threads is used (OpenMP loop over output channels)
//
// With __ARM_NEON the row loop is inline assembly (separate aarch64 and 32-bit
// variants) that produces 4 outputs per iteration, followed by a tail that
// produces one output at a time; without __ARM_NEON the scalar tail handles
// every output.
static void conv4x4s4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // after one output row each input row pointer has advanced 4 * outw floats;
    // tailstep (= 4 * w - 4 * outw) moves it on to the row 4 lines further down
    const int tailstep = w - 4 * outw + w * 3;

    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        // start from the bias, the per-input-channel results are added on top
        out.fill(bias0);

        for (int q = 0; q < inch; q++)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);

            // 16 weights for this (output channel, input channel) pair
            const float* kernel0 = kernel + p * inch * 16 + q * 16;

            // the 4 input rows covered by the kernel window
            const float* r0 = img0;
            const float* r1 = img0 + w;
            const float* r2 = img0 + w * 2;
            const float* r3 = img0 + w * 3;

#if __ARM_NEON
            // one vector register per kernel row
            float32x4_t _k0123 = vld1q_f32(kernel0);
            float32x4_t _k4567 = vld1q_f32(kernel0 + 4);
            float32x4_t _k891011 = vld1q_f32(kernel0 + 8);
            float32x4_t _k12131415 = vld1q_f32(kernel0 + 12);
#else
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 4;
            const float* k2 = kernel0 + 8;
            const float* k3 = kernel0 + 12;
#endif // __ARM_NEON

            for (int i = 0; i < outh; i++)
            {
#if __ARM_NEON
                // nn = iterations of the 4-outputs-at-once loop, remain = leftover outputs
                int nn = outw >> 2;
                int remain = outw - (nn << 2);
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                // Per iteration: four passes, each loading 4 floats from every row
                // and multiplying them with the kernel rows, leave four vectors of
                // partial sums (v5, v6, v14, v15). The faddp steps reduce each of
                // them to a single lane, giving one vector of 4 outputs that is
                // added to the 4 values already stored at outptr.
                if (nn > 0)
                {
                    asm volatile(
                        "prfm       pldl1keep, [%1, #128]          \n"
                        "0:                                        \n"

                        "prfm       pldl1keep, [%2, #512]          \n"
                        "prfm       pldl1keep, [%3, #512]          \n"

                        "ld1        {v7.4s}, [%1]                  \n" // v7 = outptr

                        "ld1        {v8.4s}, [%2], #16             \n" // v8  = r0
                        "ld1        {v9.4s}, [%3], #16             \n" // v9  = r1

                        "prfm       pldl1keep, [%4, #512]          \n"
                        "prfm       pldl1keep, [%5, #512]          \n"

                        "fmul       v12.4s, v8.4s, %12.4s          \n"
                        "fmul       v13.4s, v9.4s, %13.4s          \n"

                        "ld1        {v10.4s}, [%4], #16            \n" // v10 = r2
                        "ld1        {v11.4s}, [%5], #16            \n" // v11 = r3

                        "fmla       v12.4s, v10.4s, %14.4s         \n"
                        "fmla       v13.4s, v11.4s, %15.4s         \n"

                        "fadd       v5.4s, v12.4s, v13.4s          \n"

                        "ld1        {v8.4s}, [%2], #16             \n" // v8  = r0
                        "ld1        {v9.4s}, [%3], #16             \n" // v9  = r1

                        "fmul       v12.4s, v8.4s, %12.4s          \n"
                        "fmul       v13.4s, v9.4s, %13.4s          \n"

                        "ld1        {v10.4s}, [%4], #16            \n" // v10 = r2
                        "ld1        {v11.4s}, [%5], #16            \n" // v11 = r3

                        "fmla       v12.4s, v10.4s, %14.4s         \n"
                        "fmla       v13.4s, v11.4s, %15.4s         \n"

                        "fadd       v6.4s, v12.4s, v13.4s          \n"

                        "ld1        {v8.4s}, [%2], #16             \n" // v8  = r0
                        "ld1        {v9.4s}, [%3], #16             \n" // v9  = r1

                        "fmul       v12.4s, v8.4s, %12.4s          \n"
                        "fmul       v13.4s, v9.4s, %13.4s          \n"

                        "ld1        {v10.4s}, [%4], #16            \n" // v10 = r2
                        "ld1        {v11.4s}, [%5], #16            \n" // v11 = r3

                        "fmla       v12.4s, v10.4s, %14.4s         \n"
                        "fmla       v13.4s, v11.4s, %15.4s         \n"

                        "fadd       v14.4s, v12.4s, v13.4s         \n"
                        "faddp      v5.4s, v5.4s, v6.4s            \n" // Move to here to enhance ILP

                        "ld1        {v8.4s}, [%2], #16             \n" // v8  = r0
                        "ld1        {v9.4s}, [%3], #16             \n" // v9  = r1

                        "fmul       v12.4s, v8.4s, %12.4s          \n"
                        "fmul       v13.4s, v9.4s, %13.4s          \n"

                        "ld1        {v10.4s}, [%4], #16            \n" // v10 = r2
                        "ld1        {v11.4s}, [%5], #16            \n" // v11 = r3

                        "fmla       v12.4s, v10.4s, %14.4s         \n"
                        "fmla       v13.4s, v11.4s, %15.4s         \n"

                        "fadd       v15.4s, v12.4s, v13.4s         \n"

                        //                  "faddp      v5.4s ,  v5.4s,  v6.4s         \n"  // Move this line upward.
                        "faddp      v14.4s, v14.4s, v15.4s         \n"
                        "faddp      v5.4s ,  v5.4s, v14.4s         \n"

                        "fadd       v7.4s, v7.4s, v5.4s            \n"

                        "st1        {v7.4s}, [%1], #16             \n"

                        "prfm       pldl1keep, [%1, #128]          \n"

                        "subs       %w0, %w0, #1                   \n"
                        "bne        0b                             \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2),     // %4
                        "=r"(r3)      // %5
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "w"(_k0123),    // %12
                        "w"(_k4567),    // %13
                        "w"(_k891011),  // %14
                        "w"(_k12131415) // %15
                        : "cc", "memory", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                // 32-bit NEON version of the same 4-outputs-per-iteration loop:
                // q5, q6, q14 and q15 hold the partial sums of the four passes and
                // are reduced with vadd / vpadd into q5 before being added to the
                // 4 values already stored at outptr.
                if (nn > 0)
                {
                    asm volatile(

                        "pld        [%1, #128]          \n"

                        "0:                             \n"

                        "pld        [%2, #512]          \n"
                        "pld        [%3, #512]          \n"

                        "vld1.f32   {d14-d15}, [%1]     \n" // q7 = outptr

                        "vld1.f32   {d16-d17}, [%2]!    \n" // q8  = r0
                        "vld1.f32   {d18-d19}, [%3]!    \n" // q9  = r1

                        "pld        [%4, #512]          \n"
                        "pld        [%5, #512]          \n"

                        "vmul.f32   q12, q8, %q12       \n"
                        "vmul.f32   q13, q9, %q13       \n"

                        "vld1.f32   {d20-d21}, [%4]!    \n" // q10 = r2
                        "vld1.f32   {d22-d23}, [%5]!    \n" // q11 = r3

                        "vmla.f32   q12, q10, %q14      \n"
                        "vmla.f32   q13, q11, %q15      \n"

                        "vadd.f32   q5, q12, q13        \n"

                        "vld1.f32   {d16-d17}, [%2]!    \n" // q8  = r0
                        "vld1.f32   {d18-d19}, [%3]!    \n" // q9  = r1

                        "vmul.f32   q12, q8, %q12       \n"
                        "vmul.f32   q13, q9, %q13       \n"

                        "vld1.f32   {d20-d21}, [%4]!    \n" // q10 = r2
                        "vld1.f32   {d22-d23}, [%5]!    \n" // q11 = r3

                        "vmla.f32   q12, q10, %q14      \n"
                        "vmla.f32   q13, q11, %q15      \n"

                        "vadd.f32   q6, q12, q13        \n"

                        "vld1.f32   {d16-d17}, [%2]!    \n" // q8  = r0
                        "vld1.f32   {d18-d19}, [%3]!    \n" // q9  = r1

                        "vmul.f32   q12, q8, %q12       \n"
                        "vmul.f32   q13, q9, %q13       \n"

                        "vld1.f32   {d20-d21}, [%4]!    \n" // q10 = r2
                        "vld1.f32   {d22-d23}, [%5]!    \n" // q11 = r3

                        "vmla.f32   q12, q10, %q14      \n"
                        "vmla.f32   q13, q11, %q15      \n"

                        "vadd.f32   q14, q12, q13       \n"

                        "vld1.f32   {d16-d17}, [%2]!    \n" // q8  = r0
                        "vld1.f32   {d18-d19}, [%3]!    \n" // q9  = r1

                        "vmul.f32   q12, q8, %q12       \n"
                        "vmul.f32   q13, q9, %q13       \n"

                        "vld1.f32   {d20-d21}, [%4]!    \n" // q10 = r2
                        "vld1.f32   {d22-d23}, [%5]!    \n" // q11 = r3

                        "vmla.f32   q12, q10, %q14      \n"
                        "vmla.f32   q13, q11, %q15      \n"

                        "vadd.f32   q15, q12, q13       \n"

                        "vadd.f32   d10, d10, d11       \n"
                        "vadd.f32   d28, d28, d29       \n"
                        "vadd.f32   d11, d12, d13       \n"
                        "vadd.f32   d29, d30, d31       \n"

                        "vpadd.f32  d10, d10, d11       \n"
                        "vpadd.f32  d11, d28, d29       \n"

                        "vadd.f32   q7, q7, q5          \n"

                        "vst1.f32   {d14-d15}, [%1]!    \n"

                        "pld        [%1, #128]          \n"

                        "subs       %0, #1              \n"
                        "bne        0b                  \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2),     // %4
                        "=r"(r3)      // %5
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "w"(_k0123),    // %12
                        "w"(_k4567),    // %13
                        "w"(_k891011),  // %14
                        "w"(_k12131415) // %15
                        : "cc", "memory", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                // tail: one output per iteration
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
#if __aarch64__
                    float sum = 0.f;

                    // one 4x4 window: multiply the rows with the kernel rows, reduce
                    // the 4 lanes to a scalar and hand it back through %4 (sum)
                    asm volatile(
                        "ld1        {v8.4s}, [%0], #16             \n" // v8  = r0
                        "ld1        {v9.4s}, [%1], #16             \n" // v9  = r1

                        "fmul       v12.4s, v8.4s, %9.4s           \n"
                        "fmul       v13.4s, v9.4s, %10.4s          \n"

                        "ld1        {v10.4s}, [%2], #16            \n" // v10 = r2
                        "ld1        {v11.4s}, [%3], #16            \n" // v11 = r3

                        "fmla       v12.4s, v10.4s, %11.4s         \n"
                        "fmla       v13.4s, v11.4s, %12.4s         \n"

                        "fadd       v5.4s, v12.4s, v13.4s          \n"
                        "faddp      v5.4s, v5.4s, v5.4s            \n"
                        "faddp      s5, v5.2s                      \n"
                        "fmov       %w4, s5                        \n"
                        : "=r"(r0), // %0
                        "=r"(r1), // %1
                        "=r"(r2), // %2
                        "=r"(r3), // %3
                        "=r"(sum) // %4
                        : "0"(r0),
                        "1"(r1),
                        "2"(r2),
                        "3"(r3),
                        "w"(_k0123),    // %9
                        "w"(_k4567),    // %10
                        "w"(_k891011),  // %11
                        "w"(_k12131415) // %12
                        : "cc", "memory", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13");

                    *outptr += sum;
#else
                    float sum = 0.f;

                    // 32-bit NEON version of the single-window dot product
                    asm volatile(
                        "vld1.f32   {d16-d17}, [%0]!    \n" // q8  = r0
                        "vld1.f32   {d18-d19}, [%1]!    \n" // q9  = r1

                        "vmul.f32   q12, q8, %q9        \n"
                        "vmul.f32   q13, q9, %q10       \n"

                        "vld1.f32   {d20-d21}, [%2]!    \n" // q10 = r2
                        "vld1.f32   {d22-d23}, [%3]!    \n" // q11 = r3

                        "vmla.f32   q12, q10, %q11      \n"
                        "vmla.f32   q13, q11, %q12      \n"

                        "vadd.f32   q5, q12, q13        \n"
                        "vadd.f32   d10, d10, d11       \n"
                        "vpadd.f32  d10, d10, d10       \n"
                        "vmov.f32   %4, d10[0]          \n"
                        : "=r"(r0), // %0
                        "=r"(r1), // %1
                        "=r"(r2), // %2
                        "=r"(r3), // %3
                        "=r"(sum) // %4
                        : "0"(r0),
                        "1"(r1),
                        "2"(r2),
                        "3"(r3),
                        "w"(_k0123),    // %9
                        "w"(_k4567),    // %10
                        "w"(_k891011),  // %11
                        "w"(_k12131415) // %12
                        : "cc", "memory", "q5", "q6", "q8", "q9", "q10", "q11", "q12", "q13");

                    *outptr += sum;
#endif // __aarch64__
#else
                    // scalar path: 16 multiply-adds over the 4x4 window
                    float sum = 0;

                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r0[3] * k0[3];

                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r1[3] * k1[3];

                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];
                    sum += r2[3] * k2[3];

                    sum += r3[0] * k3[0];
                    sum += r3[1] * k3[1];
                    sum += r3[2] * k3[2];
                    sum += r3[3] * k3[3];

                    *outptr += sum;

                    // the asm variants advance the row pointers themselves
                    r0 += 4;
                    r1 += 4;
                    r2 += 4;
                    r3 += 4;
#endif // __ARM_NEON
                    outptr++;
                }

                // jump to the rows feeding the next output row
                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_5x5.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        out.fill(bias0);

        for (int q = 0; q < inch; q++)
        {
            float* outptr = out;
            float* outptr2 = outptr + outw;

            const float* img0 = bottom_blob.channel(q);

            const float* kernel0 = kernel + p * inch * 25 + q * 25;

            const float* r0 = img0;
            const float* r1 = img0 + w;
            const float* r2 = img0 + w * 2;
            const float* r3 = img0 + w * 3;
            const float* r4 = img0 + w * 4;
            const float* r5 = img0 + w * 5;

            const float* k0 = kernel0;
            const float* k1 = kernel0 + 5;
            const float* k2 = kernel0 + 10;
            const float* k3 = kernel0 + 15;
            const float* k4 = kernel0 + 20;

#if __ARM_NEON
            float32x4_t _k0123 = vld1q_f32(kernel0);
            float32x4_t _k4567 = vld1q_f32(kernel0 + 4);
            float32x4_t _k891011 = vld1q_f32(kernel0 + 8);
            float32x4_t _k12131415 = vld1q_f32(kernel0 + 12);
            float32x4_t _k16171819 = vld1q_f32(kernel0 + 16);
            float32x4_t _k20212223 = vld1q_f32(kernel0 + 20);
            float32x4_t _k24242424 = vdupq_n_f32(kernel0[24]);
#endif // __ARM_NEON

            int i = 0;

            for (; i + 1 < outh; i += 2)
            {
#if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw - (nn << 2);
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    asm volatile(
                        // v11 = rx1 / rx3
                        // v12 = rx2
                        // v13 v14 = intermediate sum register

                        "prfm       pldl1keep, [%1, #128]          \n"
                        "ld1        {v7.4s}, [%1]                  \n" // v7 = out

                        "0:                                        \n"

                        "prfm       pldl1keep, [%2, #128]          \n"
                        "ld1        {v8.4s}, [%2]                  \n" // v8 = out2

                        // r1
                        "prfm       pldl1keep, [%4, #256]          \n"
                        "ld1        {v9.4s, v10.4s}, [%4]          \n" // v9 v10 = r10 r14
                        "add        %4, %4, #16                    \n"

                        "ext        v11.16b, v9.16b, v10.16b, #4   \n" //r11
                        "fmul       v13.4s, v9.4s, %19.s[1]        \n"
                        "fmla       v8.4s,  v9.4s, %18.s[0]        \n"

                        "ext        v12.16b, v9.16b, v10.16b, #8   \n" //r12
                        "fmla       v7.4s,  v11.4s, %19.s[2]       \n"
                        "fmul       v14.4s, v11.4s, %18.s[1]       \n"

                        "ext        v11.16b, v9.16b, v10.16b, #12  \n" //r13
                        "fmla       v13.4s, v12.4s, %19.s[3]       \n"
                        "fmla       v8.4s,  v12.4s, %18.s[2]       \n"

                        "fmla       v7.4s,  v11.4s, %20.s[0]       \n"
                        "fmla       v14.4s, v11.4s, %18.s[3]       \n"

                        "prfm       pldl1keep, [%5, #256]          \n"

                        "fmla       v13.4s, v10.4s, %20.s[1]       \n"
                        "fmla       v8.4s,  v10.4s, %19.s[0]       \n"

                        // r2
                        "ld1        {v9.4s, v10.4s}, [%5]          \n" // v9 v10 = r20 r24
                        "add        %5, %5, #16                    \n"

                        "ext        v11.16b, v9.16b, v10.16b, #4   \n" //r21
                        "fmla       v7.4s,  v9.4s, %20.s[2]        \n"
                        "fmla       v14.4s, v9.4s, %19.s[1]        \n"

                        "ext        v12.16b, v9.16b, v10.16b, #8   \n" //r22
                        "fmla       v13.4s, v11.4s, %20.s[3]       \n"
                        "fmla       v8.4s,  v11.4s, %19.s[2]       \n"

                        "ext        v11.16b, v9.16b, v10.16b, #12  \n" //r23
                        "fmla       v7.4s,  v12.4s, %21.s[0]       \n"
                        "fmla       v14.4s, v12.4s, %19.s[3]       \n"

                        "fmla       v13.4s, v11.4s, %21.s[1]       \n"
                        "fmla       v8.4s,  v11.4s, %20.s[0]       \n"

                        "prfm       pldl1keep, [%6, #256]          \n"

                        "fmla       v7.4s,  v10.4s, %21.s[2]       \n"
                        "fmla       v14.4s, v10.4s, %20.s[1]       \n"

                        // r3
                        "ld1        {v9.4s, v10.4s}, [%6]          \n" // v9 v10 = r30 r34
                        "add        %6, %6, #16                    \n"

                        "ext        v11.16b, v9.16b, v10.16b, #4   \n" //r31
                        "fmla       v13.4s, v9.4s, %21.s[3]        \n"
                        "fmla       v8.4s,  v9.4s, %20.s[2]        \n"

                        "ext        v12.16b, v9.16b, v10.16b, #8   \n" //r32
                        "fmla       v7.4s,  v11.4s, %22.s[0]       \n"
                        "fmla       v14.4s, v11.4s, %20.s[3]       \n"

                        "ext        v11.16b, v9.16b, v10.16b, #12  \n" //r33
                        "fmla       v13.4s, v12.4s, %22.s[1]       \n"
                        "fmla       v8.4s,  v12.4s, %21.s[0]       \n"

                        "fmla       v7.4s,  v11.4s, %22.s[2]       \n"
                        "fmla       v14.4s, v11.4s, %21.s[1]       \n"

                        "prfm       pldl1keep, [%7, #256]          \n"

                        "fmla       v13.4s, v10.4s, %22.s[3]       \n"
                        "fmla       v8.4s,  v10.4s, %21.s[2]       \n"

                        // r4
                        "ld1        {v9.4s, v10.4s}, [%7]          \n" // v9 v10 = r40 r44
                        "add        %7, %7, #16                    \n"

                        "ext        v11.16b, v9.16b, v10.16b, #4   \n" //r41
                        "fmla       v7.4s,  v9.4s, %23.s[0]        \n"
                        "fmla       v14.4s, v9.4s, %21.s[3]        \n"

                        "ext        v12.16b, v9.16b, v10.16b, #8   \n" //r42
                        "fmla       v13.4s, v11.4s, %23.s[1]       \n"
                        "fmla       v8.4s,  v11.4s, %22.s[0]       \n"

                        "ext        v11.16b, v9.16b, v10.16b, #12  \n" //r43
                        "fmla       v7.4s,  v12.4s, %23.s[2]       \n"
                        "fmla       v14.4s, v12.4s, %22.s[1]       \n"

                        "fmla       v13.4s, v11.4s, %23.s[3]       \n"
                        "fmla       v8.4s,  v11.4s, %22.s[2]       \n"

                        "prfm       pldl1keep, [%3, #256]          \n"

                        "fmla       v7.4s,  v10.4s, %24.s[0]       \n"
                        "fmla       v14.4s, v10.4s, %22.s[3]       \n"

                        // r0 and r5
                        "ld1        {v9.4s, v10.4s}, [%3]          \n" // v9 v10 = r00 r04
                        "add        %3, %3, #16                    \n"

                        "ext        v11.16b, v9.16b, v10.16b, #4   \n" //r01
                        "fmla       v13.4s, v11.4s, %18.s[1]       \n"

                        "ext        v12.16b, v9.16b, v10.16b, #8   \n" //r02
                        "fmla       v7.4s, v12.4s, %18.s[2]        \n"

                        "ext        v11.16b, v9.16b, v10.16b, #12  \n" //r03

                        "prfm       pldl1keep, [%8, #256]          \n"

                        "fmla       v13.4s, v11.4s, %18.s[3]       \n"

                        // r5
                        "ld1        {v11.4s, v12.4s}, [%8]         \n" // v11 v12 = r50 r54
                        "add        %8, %8, #16                    \n"

                        "fmla       v8.4s,  v11.4s, %23.s[0]       \n"
                        "fmla       v14.4s, v12.4s, %24.s[0]       \n"

                        "fmla       v7.4s,  v9.4s,  %18.s[0]       \n"
                        "fmla       v13.4s, v10.4s, %19.s[0]       \n"

                        "ext        v9.16b,  v11.16b, v12.16b, #4  \n" //r51
                        "ext        v10.16b, v11.16b, v12.16b, #8  \n" //r52

                        "fmla       v14.4s, v9.4s, %23.s[1]        \n"

                        "ext        v9.16b, v11.16b, v12.16b, #12  \n" //r53
                        "fmla       v8.4s, v10.4s, %23.s[2]        \n"

                        "fmla       v14.4s, v9.4s, %23.s[3]        \n"

                        "fadd       v7.4s, v7.4s, v13.4s           \n"

                        "st1        {v7.4s}, [%1], #16             \n"

                        "fadd       v8.4s, v8.4s, v14.4s           \n"

                        "prfm       pldl1keep, [%1, #128]          \n"
                        "ld1        {v7.4s}, [%1]                  \n" // v7 = out
                        "st1        {v8.4s}, [%2], #16             \n"

                        "subs       %w0, %w0, #1                   \n"
                        "bne        0b                             \n"
                        : "=r"(nn),      // %0
                        "=r"(outptr),  // %1
                        "=r"(outptr2), // %2
                        "=r"(r0),      // %3
                        "=r"(r1),      // %4
                        "=r"(r2),      // %5
                        "=r"(r3),      // %6
                        "=r"(r4),      // %7
                        "=r"(r5)       // %8
                        : "0"(nn),
                        "1"(outptr),
                        "2"(outptr2),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "6"(r3),
                        "7"(r4),
                        "8"(r5),
                        "w"(_k0123),     // %18
                        "w"(_k4567),     // %19
                        "w"(_k891011),   // %20
                        "w"(_k12131415), // %21
                        "w"(_k16171819), // %22
                        "w"(_k20212223), // %23
                        "w"(_k24242424)  // %24
                        : "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                if (nn > 0)
                {
                    asm volatile(
                        //                     "veor       q13, q13            \n"
                        //                     "veor       q14, q14            \n"

                        "pld        [%1, #128]          \n"

                        "vld1.f32   {d14-d15}, [%1]     \n" // q7 = out

                        "0:                             \n"

                        // q11 = rx1 / rx3
                        // q12 = rx2

                        // q13 q14 = intermediate sum register

                        "pld        [%2, #128]          \n"

                        "vld1.f32   {d16-d17}, [%2]     \n" // q8 = out2

                        "pld        [%4, #256]          \n"

                        // r1
                        "vld1.f32   {d18-d21}, [%4]     \n" // q9 q10 = r10 r14
                        "add        %4, #16             \n"

                        "vext.32    q11, q9, q10, #1    \n" // r11
                        "vmul.f32   q13, q9, %e19[1]    \n"
                        "vmla.f32   q8, q9, %e18[0]     \n"

                        "vext.32    q12, q9, q10, #2    \n" // r12
                        "vmla.f32   q7, q11, %f19[0]    \n"
                        "vmul.f32   q14, q11, %e18[1]   \n"

                        "vext.32    q11, q9, q10, #3    \n" // r13
                        "vmla.f32   q13, q12, %f19[1]   \n"
                        "vmla.f32   q8, q12, %f18[0]    \n"

                        "vmla.f32   q7, q11, %e20[0]    \n"
                        "vmla.f32   q14, q11, %f18[1]   \n"

                        "pld        [%5, #256]          \n"

                        "vmla.f32   q13, q10, %e20[1]   \n"
                        "vmla.f32   q8, q10, %e19[0]    \n"

                        // r2
                        "vld1.f32   {d18-d21}, [%5]     \n" // q9 q10 = r20 r24
                        "add        %5, #16             \n"

                        "vext.32    q11, q9, q10, #1    \n" // r21
                        "vmla.f32   q7, q9, %f20[0]     \n"
                        "vmla.f32   q14, q9, %e19[1]    \n"

                        "vext.32    q12, q9, q10, #2    \n" // r22
                        "vmla.f32   q13, q11, %f20[1]   \n"
                        "vmla.f32   q8, q11, %f19[0]    \n"

                        "vext.32    q11, q9, q10, #3    \n" // r23
                        "vmla.f32   q7, q12, %e21[0]    \n"
                        "vmla.f32   q14, q12, %f19[1]   \n"

                        "vmla.f32   q13, q11, %e21[1]   \n"
                        "vmla.f32   q8, q11, %e20[0]    \n"

                        "pld        [%6, #256]          \n"

                        "vmla.f32   q7, q10, %f21[0]    \n"
                        "vmla.f32   q14, q10, %e20[1]   \n"

                        // r3
                        "vld1.f32   {d18-d21}, [%6]     \n" // q9 q10 = r30 r34
                        "add        %6, #16             \n"

                        "vext.32    q11, q9, q10, #1    \n" // r31
                        "vmla.f32   q13, q9, %f21[1]    \n"
                        "vmla.f32   q8, q9, %f20[0]     \n"

                        "vext.32    q12, q9, q10, #2    \n" // r32
                        "vmla.f32   q7, q11, %e22[0]    \n"
                        "vmla.f32   q14, q11, %f20[1]   \n"

                        "vext.32    q11, q9, q10, #3    \n" // r33
                        "vmla.f32   q13, q12, %e22[1]   \n"
                        "vmla.f32   q8, q12, %e21[0]    \n"

                        "vmla.f32   q7, q11, %f22[0]    \n"
                        "vmla.f32   q14, q11, %e21[1]   \n"

                        "pld        [%7, #256]          \n"

                        "vmla.f32   q13, q10, %f22[1]   \n"
                        "vmla.f32   q8, q10, %f21[0]    \n"

                        // r4
                        "vld1.f32   {d18-d21}, [%7]     \n" // q9 q10 = r40 r44
                        "add        %7, #16             \n"

                        "vext.32    q11, q9, q10, #1    \n" // r41
                        "vmla.f32   q7, q9, %e23[0]     \n"
                        "vmla.f32   q14, q9, %f21[1]    \n"

                        "vext.32    q12, q9, q10, #2    \n" // r42
                        "vmla.f32   q13, q11, %e23[1]   \n"
                        "vmla.f32   q8, q11, %e22[0]    \n"

                        "vext.32    q11, q9, q10, #3    \n" // r43
                        "vmla.f32   q7, q12, %f23[0]    \n"
                        "vmla.f32   q14, q12, %e22[1]   \n"

                        "vmla.f32   q13, q11, %f23[1]   \n"
                        "vmla.f32   q8, q11, %f22[0]    \n"

                        "pld        [%3, #256]          \n"

                        "vmla.f32   q7, q10, %e24[0]    \n"
                        "vmla.f32   q14, q10, %f22[1]   \n"

                        // r0 and r5
                        "vld1.f32   {d18-d21}, [%3]     \n" // q9 q10 = r00 r04
                        "add        %3, #16             \n"

                        "vext.32    q11, q9, q10, #1    \n" // r01
                        "vmla.f32   q13, q11, %e18[1]   \n"

                        "vext.32    q12, q9, q10, #2    \n" // r02
                        "vmla.f32   q7, q12, %f18[0]    \n"

                        "vext.32    q11, q9, q10, #3    \n" // r03

                        "pld        [%8, #256]          \n"

                        "vmla.f32   q13, q11, %f18[1]   \n"

                        // r5
                        "vld1.f32   {d22-d25}, [%8]     \n" // q11 q12 = r50 r54
                        "add        %8, #16             \n"

                        "vmla.f32   q8, q11, %e23[0]    \n"
                        "vmla.f32   q14, q12, %e24[0]   \n"

                        "vmla.f32   q7, q9, %e18[0]     \n"
                        "vmla.f32   q13, q10, %e19[0]   \n"

                        "vext.32    q9, q11, q12, #1    \n" // r51
                        "vext.32    q10, q11, q12, #2   \n" // r52

                        "vmla.f32   q14, q9, %e23[1]    \n"

                        "vext.32    q9, q11, q12, #3    \n" // r53
                        "vmla.f32   q8, q10, %f23[0]    \n"

                        "vmla.f32   q14, q9, %f23[1]    \n"

                        "vadd.f32   q7, q7, q13         \n"

                        //                     "veor       q13, q13            \n"

                        "vst1.f32   {d14-d15}, [%1]!    \n"

                        "vadd.f32   q8, q8, q14         \n"

                        "pld        [%1, #128]          \n"

                        "vld1.f32   {d14-d15}, [%1]     \n" // q7 = out

                        //                     "veor       q14, q14            \n"

                        "vst1.f32   {d16-d17}, [%2]!    \n"

                        "subs       %0, #1              \n"
                        "bne        0b                  \n"
                        : "=r"(nn),      // %0
                        "=r"(outptr),  // %1
                        "=r"(outptr2), // %2
                        "=r"(r0),      // %3
                        "=r"(r1),      // %4
                        "=r"(r2),      // %5
                        "=r"(r3),      // %6
                        "=r"(r4),      // %7
                        "=r"(r5)       // %8
                        : "0"(nn),
                        "1"(outptr),
                        "2"(outptr2),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "6"(r3),
                        "7"(r4),
                        "8"(r5),
                        "w"(_k0123),     // %18
                        "w"(_k4567),     // %19
                        "w"(_k891011),   // %20
                        "w"(_k12131415), // %21
                        "w"(_k16171819), // %22
                        "w"(_k20212223), // %23
                        "w"(_k24242424)  // %24
                        : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain > 0; remain--)
                {
                    float sum = 0;
                    float sum2 = 0;
#if __ARM_NEON
                    float32x4_t _r1 = vld1q_f32(r1);
                    float32x4_t _k1 = vld1q_f32(k1);
                    float32x4_t _sum = vmulq_f32(_r1, _k1);
                    float32x4_t _sum2 = vmulq_f32(_r1, _k0123);

                    float32x4_t _r2 = vld1q_f32(r2);
                    float32x4_t _k2 = vld1q_f32(k2);
                    _sum = vmlaq_f32(_sum, _r2, _k2);
                    _sum2 = vmlaq_f32(_sum2, _r2, _k1);

                    float32x4_t _r3 = vld1q_f32(r3);
                    float32x4_t _k3 = vld1q_f32(k3);
                    _sum = vmlaq_f32(_sum, _r3, _k3);
                    _sum2 = vmlaq_f32(_sum2, _r3, _k2);

                    float32x4_t _r4 = vld1q_f32(r4);
                    _sum = vmlaq_f32(_sum, _r4, _k20212223);
                    _sum2 = vmlaq_f32(_sum2, _r4, _k3);

                    float32x4_t _r0 = vld1q_f32(r0);
                    _sum = vmlaq_f32(_sum, _r0, _k0123);
                    float32x4_t _r5 = vld1q_f32(r5);
                    _sum2 = vmlaq_f32(_sum2, _r5, _k20212223);

                    float32x4_t _k_t4 = {};

                    _k_t4 = vsetq_lane_f32(k0[4], _k_t4, 0);
                    _k_t4 = vsetq_lane_f32(k1[4], _k_t4, 1);
                    _k_t4 = vsetq_lane_f32(k2[4], _k_t4, 2);
                    _k_t4 = vsetq_lane_f32(k3[4], _k_t4, 3);

                    float32x4_t _r_t4 = {};

                    _r_t4 = vsetq_lane_f32(r0[4], _r_t4, 0);
                    _r_t4 = vsetq_lane_f32(r1[4], _r_t4, 1);
                    _r_t4 = vsetq_lane_f32(r2[4], _r_t4, 2);
                    _r_t4 = vsetq_lane_f32(r3[4], _r_t4, 3);
                    _sum = vmlaq_f32(_sum, _r_t4, _k_t4);

                    sum = r4[4] * k4[4];

                    _r_t4 = vextq_f32(_r_t4, _r_t4, 1);
                    _r_t4 = vsetq_lane_f32(r4[4], _r_t4, 3);
                    _sum2 = vmlaq_f32(_sum2, _r_t4, _k_t4);

                    sum2 = r5[4] * k4[4];

                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                    float32x2_t _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
                    float32x2_t _ss_ss2 = vpadd_f32(_ss, _ss2);

                    sum += vget_lane_f32(_ss_ss2, 0);
                    sum2 += vget_lane_f32(_ss_ss2, 1);
#else
                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r0[3] * k0[3];
                    sum += r0[4] * k0[4];

                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r1[3] * k1[3];
                    sum += r1[4] * k1[4];

                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];
                    sum += r2[3] * k2[3];
                    sum += r2[4] * k2[4];

                    sum += r3[0] * k3[0];
                    sum += r3[1] * k3[1];
                    sum += r3[2] * k3[2];
                    sum += r3[3] * k3[3];
                    sum += r3[4] * k3[4];

                    sum += r4[0] * k4[0];
                    sum += r4[1] * k4[1];
                    sum += r4[2] * k4[2];
                    sum += r4[3] * k4[3];
                    sum += r4[4] * k4[4];

                    sum2 += r1[0] * k0[0];
                    sum2 += r1[1] * k0[1];
                    sum2 += r1[2] * k0[2];
                    sum2 += r1[3] * k0[3];
                    sum2 += r1[4] * k0[4];

                    sum2 += r2[0] * k1[0];
                    sum2 += r2[1] * k1[1];
                    sum2 += r2[2] * k1[2];
                    sum2 += r2[3] * k1[3];
                    sum2 += r2[4] * k1[4];

                    sum2 += r3[0] * k2[0];
                    sum2 += r3[1] * k2[1];
                    sum2 += r3[2] * k2[2];
                    sum2 += r3[3] * k2[3];
                    sum2 += r3[4] * k2[4];

                    sum2 += r4[0] * k3[0];
                    sum2 += r4[1] * k3[1];
                    sum2 += r4[2] * k3[2];
                    sum2 += r4[3] * k3[3];
                    sum2 += r4[4] * k3[4];

                    sum2 += r5[0] * k4[0];
                    sum2 += r5[1] * k4[1];
                    sum2 += r5[2] * k4[2];
                    sum2 += r5[3] * k4[3];
                    sum2 += r5[4] * k4[4];
#endif // __ARM_NEON
                    *outptr += sum;
                    *outptr2 += sum2;

                    r0++;
                    r1++;
                    r2++;
                    r3++;
                    r4++;
                    r5++;
                    outptr++;
                    outptr2++;
                }

                r0 += 4 + w;
                r1 += 4 + w;
                r2 += 4 + w;
                r3 += 4 + w;
                r4 += 4 + w;
                r5 += 4 + w;

                outptr += outw;
                outptr2 += outw;
            }

            for (; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw - (nn << 2);
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    asm volatile(
                        "prfm       pldl1keep, [%1, #128]          \n"
                        "prfm       pldl1keep, [%2, #256]          \n"

                        "ld1        {v8.4s, v9.4s}, [%2]           \n" // _r00 = vld1q_f32(r0+j);
                        "add        %2, %2, #16                    \n"

                        "0:                                        \n"

                        "ld1        {v7.4s}, [%1]                  \n" // _sum = vld1q_f32(outptr+j);

                        "ext        v10.16b, v8.16b, v9.16b, #4    \n" //_r01
                        "ext        v11.16b, v8.16b, v9.16b, #8    \n" //_r02
                        "ext        v12.16b, v8.16b, v9.16b, #12   \n" //_r03

                        "fmla       v7.4s,   v8.4s, %14.s[0]       \n"
                        "fmul       v13.4s, v10.4s, %14.s[1]       \n"

                        "prfm       pldl1keep, [%3, #256]          \n"

                        "fmul       v14.4s, v11.4s, %14.s[2]       \n"
                        "fmul       v15.4s, v12.4s, %14.s[3]       \n"
                        "fmla       v7.4s,   v9.4s, %15.s[0]       \n"

                        "ld1        {v8.4s, v9.4s}, [%3]           \n"
                        "add        %3, %3, #16                    \n"
                        "ext        v10.16b, v8.16b, v9.16b, #4    \n" //_r11
                        "ext        v11.16b, v8.16b, v9.16b, #8    \n" //_r12
                        "ext        v12.16b, v8.16b, v9.16b, #12   \n" //_r13

                        "fmla       v7.4s,   v8.4s, %15.s[1]       \n"
                        "fmla       v13.4s, v10.4s, %15.s[2]       \n"

                        "prfm       pldl1keep, [%4, #256]          \n"

                        "fmla       v14.4s, v11.4s, %15.s[3]       \n"
                        "fmla       v15.4s, v12.4s, %16.s[0]       \n"
                        "fmla       v7.4s,   v9.4s, %16.s[1]       \n"

                        "ld1        {v8.4s, v9.4s}, [%4]           \n"
                        "add        %4, %4, #16                    \n"
                        "ext        v10.16b, v8.16b, v9.16b, #4    \n" //_r21
                        "ext        v11.16b, v8.16b, v9.16b, #8    \n" //_r22
                        "ext        v12.16b, v8.16b, v9.16b, #12   \n" //_r23

                        "fmla       v7.4s,   v8.4s, %16.s[2]       \n"
                        "fmla       v13.4s, v10.4s, %16.s[3]       \n"

                        "prfm       pldl1keep, [%5, #256]          \n"

                        "fmla       v14.4s, v11.4s, %17.s[0]       \n"
                        "fmla       v15.4s, v12.4s, %17.s[1]       \n"
                        "fmla       v7.4s,   v9.4s, %17.s[2]       \n"

                        "ld1        {v8.4s, v9.4s}, [%5]           \n"
                        "add        %5, %5, #16                    \n"
                        "ext        v10.16b, v8.16b, v9.16b, #4    \n" //_r31
                        "ext        v11.16b, v8.16b, v9.16b, #8    \n" //_r32
                        "ext        v12.16b, v8.16b, v9.16b, #12   \n" //_r33

                        "fmla       v7.4s,   v8.4s, %17.s[3]       \n"
                        "fmla       v13.4s, v10.4s, %18.s[0]       \n"

                        "prfm       pldl1keep, [%6, #256]          \n"

                        "fmla       v14.4s, v11.4s, %18.s[1]       \n"
                        "fmla       v15.4s, v12.4s, %18.s[2]       \n"
                        "fmla       v7.4s,   v9.4s, %18.s[3]       \n"

                        "ld1        {v8.4s, v9.4s}, [%6]           \n"
                        "add        %6, %6, #16                    \n"
                        "ext        v10.16b, v8.16b, v9.16b, #4    \n" //_r41
                        "ext        v11.16b, v8.16b, v9.16b, #8    \n" //_r42
                        "ext        v12.16b, v8.16b, v9.16b, #12   \n" //_r43

                        "fmla       v7.4s,   v8.4s, %19.s[0]       \n"
                        "fmla       v13.4s, v10.4s, %19.s[1]       \n"
                        "fmla       v14.4s, v11.4s, %19.s[2]       \n"
                        "fmla       v15.4s, v12.4s, %19.s[3]       \n"
                        "fmla       v7.4s,   v9.4s, %20.s[0]       \n"

                        "fadd       v14.4s, v14.4s, v15.4s         \n"
                        "fadd       v7.4s,   v7.4s, v13.4s         \n"

                        "prfm       pldl1keep, [%2, #256]          \n"

                        "fadd       v7.4s,   v7.4s, v14.4s         \n"

                        "ld1        {v8.4s, v9.4s}, [%2]           \n"
                        "add        %2, %2, #16                    \n"

                        "st1        {v7.4s}, [%1], #16             \n"

                        "prfm       pldl1keep, [%1, #128]          \n"

                        "subs       %w0, %w0, #1                   \n"
                        "bne        0b                             \n"

                        "sub        %2, %2, #16                    \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2),     // %4
                        "=r"(r3),     // %5
                        "=r"(r4)      // %6
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "w"(_k0123),     // %14
                        "w"(_k4567),     // %15
                        "w"(_k891011),   // %16
                        "w"(_k12131415), // %17
                        "w"(_k16171819), // %18
                        "w"(_k20212223), // %19
                        "w"(_k24242424)  // %20
                        : "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                if (nn > 0)
                {
                    asm volatile(
                        //                     "veor       q15, q15            \n"// _sum3 = 0;

                        "pld        [%1, #128]          \n"

                        "pld        [%2, #256]          \n"

                        "vld1.f32   {d16-d19}, [%2]     \n" // _r00 = vld1q_f32(r0+j);
                        "add        %2, #16             \n"

                        "0:                             \n"

                        "vld1.f32   {d14-d15}, [%1]     \n" // _sum = vld1q_f32(outptr+j);
                        //                     "veor       q13, q13            \n"// _sum2 = 0;
                        //                     "veor       q14, q14            \n"// _sum3 = 0;

                        "vext.32    q10, q8, q9, #1     \n" // _r01
                        "vext.32    q11, q8, q9, #2     \n" // _r02
                        "vext.32    q12, q8, q9, #3     \n" // _r03

                        "vmla.f32   q7, q8, %e14[0]     \n"
                        "vmul.f32   q13, q10, %e14[1]   \n"

                        "pld        [%3, #256]          \n"

                        "vmul.f32   q14, q11, %f14[0]   \n"
                        "vmul.f32   q15, q12, %f14[1]   \n"
                        "vmla.f32   q7, q9, %e15[0]     \n"

                        "vld1.f32   {d16-d19}, [%3]     \n"
                        "add        %3, #16             \n"
                        "vext.32    q10, q8, q9, #1     \n"
                        "vext.32    q11, q8, q9, #2     \n"
                        "vext.32    q12, q8, q9, #3     \n"

                        "vmla.f32   q7, q8, %e15[1]     \n"
                        "vmla.f32   q13, q10, %f15[0]   \n"

                        "pld        [%4, #256]          \n"

                        "vmla.f32   q14, q11, %f15[1]   \n"
                        "vmla.f32   q15, q12, %e16[0]   \n"
                        "vmla.f32   q7, q9, %e16[1]     \n"

                        "vld1.f32   {d16-d19}, [%4]     \n"
                        "add        %4, #16             \n"
                        "vext.32    q10, q8, q9, #1     \n"
                        "vext.32    q11, q8, q9, #2     \n"
                        "vext.32    q12, q8, q9, #3     \n"

                        "vmla.f32   q7, q8, %f16[0]     \n"
                        "vmla.f32   q13, q10, %f16[1]   \n"

                        "pld        [%5, #256]          \n"

                        "vmla.f32   q14, q11, %e17[0]   \n"
                        "vmla.f32   q15, q12, %e17[1]   \n"
                        "vmla.f32   q7, q9, %f17[0]     \n"

                        "vld1.f32   {d16-d19}, [%5]     \n"
                        "add        %5, #16             \n"
                        "vext.32    q10, q8, q9, #1     \n"
                        "vext.32    q11, q8, q9, #2     \n"
                        "vext.32    q12, q8, q9, #3     \n"

                        "vmla.f32   q7, q8, %f17[1]     \n"
                        "vmla.f32   q13, q10, %e18[0]   \n"

                        "pld        [%6, #256]          \n"

                        "vmla.f32   q14, q11, %e18[1]   \n"
                        "vmla.f32   q15, q12, %f18[0]   \n"
                        "vmla.f32   q7, q9, %f18[1]     \n"

                        "vld1.f32   {d16-d19}, [%6]     \n"
                        "add        %6, #16             \n"
                        "vext.32    q10, q8, q9, #1     \n"
                        "vext.32    q11, q8, q9, #2     \n"
                        "vext.32    q12, q8, q9, #3     \n"

                        "vmla.f32   q7, q8, %e19[0]     \n"
                        "vmla.f32   q13, q10, %e19[1]   \n"
                        "vmla.f32   q14, q11, %f19[0]   \n"
                        "vmla.f32   q15, q12, %f19[1]   \n"
                        "vmla.f32   q7, q9, %e20[0]     \n"

                        "vadd.f32   q14, q14, q15       \n"
                        "vadd.f32   q7, q7, q13         \n"
                        //                     "veor       q15, q15            \n"// _sum3 = 0;

                        "pld        [%2, #256]          \n"

                        "vadd.f32   q7, q7, q14         \n"

                        "vld1.f32   {d16-d19}, [%2]     \n" // _r00 = vld1q_f32(r0+j);
                        "add        %2, #16             \n"

                        "vst1.f32   {d14-d15}, [%1]!    \n"

                        "pld        [%1, #128]          \n"

                        "subs       %0, #1              \n"
                        "bne        0b                  \n"

                        "sub        %2, #16             \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2),     // %4
                        "=r"(r3),     // %5
                        "=r"(r4)      // %6
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "w"(_k0123),     // %14
                        "w"(_k4567),     // %15
                        "w"(_k891011),   // %16
                        "w"(_k12131415), // %17
                        "w"(_k16171819), // %18
                        "w"(_k20212223), // %19
                        "w"(_k24242424)  // %20
                        : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain > 0; remain--)
                {
                    float sum = 0;
#if __ARM_NEON
                    float32x4_t _r0 = vld1q_f32(r0);
                    float32x4_t _sum = vmulq_f32(_r0, _k0123);

                    float32x4_t _r1 = vld1q_f32(r1);
                    _sum = vmlaq_f32(_sum, _r1, vld1q_f32(k1));

                    float32x4_t _r2 = vld1q_f32(r2);
                    _sum = vmlaq_f32(_sum, _r2, vld1q_f32(k2));

                    float32x4_t _r3 = vld1q_f32(r3);
                    _sum = vmlaq_f32(_sum, _r3, vld1q_f32(k3));

                    float32x4_t _r4 = vld1q_f32(r4);
                    _sum = vmlaq_f32(_sum, _r4, _k20212223);

                    float32x4_t _k_t4 = {};

                    _k_t4 = vsetq_lane_f32(k0[4], _k_t4, 0);
                    _k_t4 = vsetq_lane_f32(k1[4], _k_t4, 1);
                    _k_t4 = vsetq_lane_f32(k2[4], _k_t4, 2);
                    _k_t4 = vsetq_lane_f32(k3[4], _k_t4, 3);

                    float32x4_t _r_t4 = {};

                    _r_t4 = vsetq_lane_f32(r0[4], _r_t4, 0);
                    _r_t4 = vsetq_lane_f32(r1[4], _r_t4, 1);
                    _r_t4 = vsetq_lane_f32(r2[4], _r_t4, 2);
                    _r_t4 = vsetq_lane_f32(r3[4], _r_t4, 3);
                    _sum = vmlaq_f32(_sum, _r_t4, _k_t4);

                    sum = r4[4] * k4[4];

                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                    _ss = vpadd_f32(_ss, _ss);

                    sum += vget_lane_f32(_ss, 0);
#else
                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r0[3] * k0[3];
                    sum += r0[4] * k0[4];

                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r1[3] * k1[3];
                    sum += r1[4] * k1[4];

                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];
                    sum += r2[3] * k2[3];
                    sum += r2[4] * k2[4];

                    sum += r3[0] * k3[0];
                    sum += r3[1] * k3[1];
                    sum += r3[2] * k3[2];
                    sum += r3[3] * k3[3];
                    sum += r3[4] * k3[4];

                    sum += r4[0] * k4[0];
                    sum += r4[1] * k4[1];
                    sum += r4[2] * k4[2];
                    sum += r4[3] * k4[3];
                    sum += r4[4] * k4[4];
#endif
                    *outptr += sum;

                    r0++;
                    r1++;
                    r2++;
                    r3++;
                    r4++;
                    outptr++;
                }

                r0 += 4;
                r1 += 4;
                r2 += 4;
                r3 += 4;
                r4 += 4;
            }
        }
    }
}

// 5x5 convolution with horizontal and vertical stride 2 on plain float data.
//
// For every output channel p the output plane is first filled with the bias
// (bias[p], or 0 when the bias pointer is null) and then, for every input
// channel q, the 5x5 weighted sum of that input channel is ADDED to it.
//
// bottom_blob  input; read through bottom_blob.channel(q) with row pitch bottom_blob.w
// top_blob     output; w/h/c are read from it, so it must already have its final shape
// _kernel      weights, 25 consecutive floats per (p, q) pair at offset p*inch*25 + q*25,
//              stored row by row (k0..k4 below are the five kernel rows)
// _bias        per-output-channel bias, may convert to a null pointer
// opt          only opt.num_threads is used (OpenMP thread count)
//
// NOTE(review): there are no bounds checks. The code reads 5 input rows per output row
// and the vector loops prefetch/load past the columns they consume (see comments at the
// asm blocks), so the caller presumably supplies a suitably padded/allocated input --
// confirm against the call site.
static void conv5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // After one output row the row pointers have advanced by 2*outw floats.
    // Adding (2*w - 2*outw) moves them exactly two input rows down in total.
    const int tailstep = w - 2 * outw + w;

    const float* kernel = _kernel;
    const float* bias = _bias;

    // One output channel per iteration; each iteration writes only through
    // "out", the channel-p view of top_blob.
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        // Start from the bias; every input channel below accumulates on top of it.
        out.fill(bias0);

        for (int q = 0; q < inch; q++)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);

            // 5x5 weights for the (output p, input q) pair.
            const float* kernel0 = kernel + p * inch * 25 + q * 25;

            // Five consecutive input rows covered by the kernel window.
            const float* r0 = img0;
            const float* r1 = img0 + w;
            const float* r2 = img0 + w * 2;
            const float* r3 = img0 + w * 3;
            const float* r4 = img0 + w * 4;

            // Five kernel rows (5 floats each).
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 5;
            const float* k2 = kernel0 + 10;
            const float* k3 = kernel0 + 15;
            const float* k4 = kernel0 + 20;

#if __ARM_NEON
            // All 25 weights kept in vector registers: six vectors hold weights 0..23
            // in linear order (so they straddle kernel rows), the last one broadcasts
            // weight 24. The asm below selects individual lanes from them.
            float32x4_t _k0123 = vld1q_f32(kernel0);
            float32x4_t _k4567 = vld1q_f32(kernel0 + 4);
            float32x4_t _k891011 = vld1q_f32(kernel0 + 8);
            float32x4_t _k12131415 = vld1q_f32(kernel0 + 12);
            float32x4_t _k16171819 = vld1q_f32(kernel0 + 16);
            float32x4_t _k20212223 = vld1q_f32(kernel0 + 20);
            float32x4_t _k24242424 = vdupq_n_f32(kernel0[24]);
#endif // __ARM_NEON

            for (int i = 0; i < outh; i++)
            {
#if __ARM_NEON
                // nn     = number of 4-output groups handled by the asm loop
                // remain = leftover outputs handled one at a time below
                int nn = outw >> 2;
                int remain = outw - (nn << 2);
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    // AArch64 kernel: 4 outputs per iteration.
                    // Operands: %0 = nn, %1 = outptr, %2..%6 = r0..r4,
                    //           %14..%19 = weights 0..23, %20 = weight 24 (broadcast).
                    // Per input row, ld2 de-interleaves 16 floats into even columns
                    // (v8, v10) and odd columns (v9, v11); ext then builds the shifted
                    // even/odd vectors so that the five taps of each of the 4 outputs
                    // line up lane-wise. Only 8 floats (32 bytes) are consumed per row
                    // and iteration; the second ld2 reads 8 more without advancing.
                    // Four accumulators (v7, v13, v14, v15) are used and summed at the end.
                    asm volatile(
                        "prfm       pldl1keep, [%2, #256]          \n"
                        "ld2        {v8.4s, v9.4s}, [%2], #32      \n" // v8  = 0  2  4  6   v9  = 1  3  5  7

                        "prfm       pldl1keep, [%2, #256]          \n"
                        "ld2        {v10.4s, v11.4s}, [%2]         \n" // v10 = 8 10 12 14   v11 = 9 11 13 15

                        "prfm       pldl1keep, [%1, #128]          \n"
                        "0:                                        \n"

                        "ld1        {v7.4s}, [%1]                  \n" // v7 = outptr

                        "ext        v12.16b, v8.16b, v10.16b, #4   \n" // v12 = 2 4 6 8
                        "ext        v11.16b, v9.16b, v11.16b, #4   \n" // v11 = 3 5 7 9
                        "ext        v10.16b, v8.16b, v10.16b, #8   \n" // v10 = 4 6 8 10

                        // row 0: weights 0..4
                        "fmla       v7.4s,  v8.4s, %14.s[0]        \n"
                        "fmul       v13.4s, v9.4s, %14.s[1]        \n"

                        "prfm       pldl1keep, [%3, #256]          \n"

                        "fmul       v14.4s, v12.4s, %14.s[2]       \n"
                        "fmul       v15.4s, v11.4s, %14.s[3]       \n"
                        "fmla       v7.4s,  v10.4s, %15.s[0]       \n"

                        "ld2        {v8.4s, v9.4s}, [%3], #32      \n"

                        "prfm       pldl1keep, [%3, #256]          \n"

                        "ld2        {v10.4s, v11.4s}, [%3]         \n"
                        "ext        v12.16b, v8.16b, v10.16b, #4   \n"
                        "ext        v11.16b, v9.16b, v11.16b, #4   \n"
                        "ext        v10.16b, v8.16b, v10.16b, #8   \n"

                        // row 1: weights 5..9
                        "fmla       v7.4s,  v8.4s, %15.s[1]        \n"
                        "fmla       v13.4s, v9.4s, %15.s[2]        \n"

                        "prfm       pldl1keep, [%4, #256]          \n"

                        "fmla       v14.4s, v12.4s, %15.s[3]       \n"
                        "fmla       v15.4s, v11.4s, %16.s[0]       \n"
                        "fmla       v7.4s,  v10.4s, %16.s[1]       \n"

                        "ld2        {v8.4s, v9.4s}, [%4], #32      \n"

                        "prfm       pldl1keep, [%4, #256]          \n"

                        "ld2        {v10.4s, v11.4s}, [%4]         \n"
                        "ext        v12.16b, v8.16b, v10.16b, #4   \n"
                        "ext        v11.16b, v9.16b, v11.16b, #4   \n"
                        "ext        v10.16b, v8.16b, v10.16b, #8   \n"

                        // row 2: weights 10..14
                        "fmla       v7.4s,  v8.4s, %16.s[2]        \n"
                        "fmla       v13.4s, v9.4s, %16.s[3]        \n"

                        "prfm       pldl1keep, [%5, #256]          \n"

                        "fmla       v14.4s, v12.4s, %17.s[0]       \n"
                        "fmla       v15.4s, v11.4s, %17.s[1]       \n"
                        "fmla       v7.4s,  v10.4s, %17.s[2]       \n"

                        "ld2        {v8.4s, v9.4s}, [%5], #32      \n"

                        "prfm       pldl1keep, [%5, #256]          \n"

                        "ld2        {v10.4s, v11.4s}, [%5]         \n"
                        "ext        v12.16b, v8.16b, v10.16b, #4   \n"
                        "ext        v11.16b, v9.16b, v11.16b, #4   \n"
                        "ext        v10.16b, v8.16b, v10.16b, #8   \n"

                        // row 3: weights 15..19
                        "fmla       v7.4s,  v8.4s, %17.s[3]        \n"
                        "fmla       v13.4s, v9.4s, %18.s[0]        \n"

                        "prfm       pldl1keep, [%6, #256]          \n"

                        "fmla       v14.4s, v12.4s, %18.s[1]       \n"
                        "fmla       v15.4s, v11.4s, %18.s[2]       \n"
                        "fmla       v7.4s,  v10.4s, %18.s[3]       \n"

                        "ld2        {v8.4s, v9.4s}, [%6], #32      \n"

                        "prfm       pldl1keep, [%6, #256]          \n"

                        "ld2        {v10.4s, v11.4s}, [%6]         \n"
                        "ext        v12.16b, v8.16b, v10.16b, #4   \n"
                        "ext        v11.16b, v9.16b, v11.16b, #4   \n"
                        "ext        v10.16b, v8.16b, v10.16b, #8   \n"

                        // row 4: weights 20..24
                        "fmla       v7.4s,   v8.4s, %19.s[0]       \n"
                        "fmla       v13.4s,  v9.4s, %19.s[1]       \n"
                        "fmla       v14.4s, v12.4s, %19.s[2]       \n"
                        "fmla       v15.4s, v11.4s, %19.s[3]       \n"
                        "fmla       v7.4s,  v10.4s, %20.s[0]       \n"

                        "prfm       pldl1keep, [%2, #256]          \n"

                        // Preload row 0 for the NEXT iteration (also executed on the last
                        // iteration, before the counter is tested), interleaved with the
                        // reduction of the four accumulators.
                        "ld2        {v8.4s, v9.4s}, [%2], #32      \n"

                        "fadd       v14.4s, v14.4s, v15.4s         \n"
                        "fadd       v7.4s,   v7.4s, v13.4s         \n"

                        "prfm       pldl1keep, [%2, #256]          \n"

                        "fadd       v7.4s, v7.4s, v14.4s           \n"

                        "ld2        {v10.4s, v11.4s}, [%2]         \n"
                        "st1        {v7.4s}, [%1], #16             \n"

                        "prfm       pldl1keep, [%1, #128]          \n"

                        "subs       %w0, %w0, #1                   \n"
                        "bne        0b                             \n"

                        // Undo the pointer advance of the last (unused) row-0 preload.
                        "sub        %2, %2, #32                    \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2),     // %4
                        "=r"(r3),     // %5
                        "=r"(r4)      // %6
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "w"(_k0123),     // %14
                        "w"(_k4567),     // %15
                        "w"(_k891011),   // %16
                        "w"(_k12131415), // %17
                        "w"(_k16171819), // %18
                        "w"(_k20212223), // %19
                        "w"(_k24242424)  // %20
                        : "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }

#else
                if (nn > 0)
                {
                    // ARMv7 kernel: same algorithm and operand numbering as the AArch64
                    // version above. %eN / %fN select the low / high 64-bit half of
                    // q-register operand N; vld2 with "!" post-increments by the 32 bytes
                    // it loaded, the second vld2 of each pair does not advance.
                    asm volatile(
                        //                     "veor       q15, q15            \n"// _sump3 = 0;
                        //                     "veor       q13, q13            \n"// _sump2 = 0;
                        //                     "veor       q14, q14            \n"// _sump3 = 0;

                        "pld        [%2, #256]          \n"
                        "vld2.f32   {d16-d19}, [%2]!    \n" // q8  = 0  2  4  6   q9  = 1  3  5  7

                        "pld        [%2, #256]          \n"
                        "vld2.f32   {d20-d23}, [%2]     \n" // q10 = 8 10 12 14   q11 = 9 11 13 15

                        "pld        [%1, #128]          \n"
                        "0:                             \n"

                        "vld1.f32   {d14-d15}, [%1]     \n" // q7 = outptr

                        "vext.32    q12, q8, q10, #1    \n" // q12 = 2 4 6 8
                        "vext.32    q11, q9, q11, #1    \n" // q11 = 3 5 7 9
                        "vext.32    q10, q8, q10, #2    \n" // q10 = 4 6 8 10

                        // row 0: weights 0..4
                        "vmla.f32   q7, q8, %e14[0]     \n"
                        "vmul.f32   q13, q9, %e14[1]    \n"

                        "pld        [%3, #256]          \n"

                        "vmul.f32   q14, q12, %f14[0]   \n"
                        "vmul.f32   q15, q11, %f14[1]   \n"
                        "vmla.f32   q7, q10, %e15[0]    \n"

                        "vld2.f32   {d16-d19}, [%3]!    \n"

                        "pld        [%3, #256]          \n"

                        "vld2.f32   {d20-d23}, [%3]     \n"
                        "vext.32    q12, q8, q10, #1    \n"
                        "vext.32    q11, q9, q11, #1    \n"
                        "vext.32    q10, q8, q10, #2    \n"

                        // row 1: weights 5..9
                        "vmla.f32   q7, q8, %e15[1]     \n"
                        "vmla.f32   q13, q9, %f15[0]    \n"

                        "pld        [%4, #256]          \n"

                        "vmla.f32   q14, q12, %f15[1]   \n"
                        "vmla.f32   q15, q11, %e16[0]   \n"
                        "vmla.f32   q7, q10, %e16[1]    \n"

                        "vld2.f32   {d16-d19}, [%4]!    \n"

                        "pld        [%4, #256]          \n"

                        "vld2.f32   {d20-d23}, [%4]     \n"
                        "vext.32    q12, q8, q10, #1    \n"
                        "vext.32    q11, q9, q11, #1    \n"
                        "vext.32    q10, q8, q10, #2    \n"

                        // row 2: weights 10..14
                        "vmla.f32   q7, q8, %f16[0]     \n"
                        "vmla.f32   q13, q9, %f16[1]    \n"

                        "pld        [%5, #256]          \n"

                        "vmla.f32   q14, q12, %e17[0]   \n"
                        "vmla.f32   q15, q11, %e17[1]   \n"
                        "vmla.f32   q7, q10, %f17[0]    \n"

                        "vld2.f32   {d16-d19}, [%5]!    \n"

                        "pld        [%5, #256]          \n"

                        "vld2.f32   {d20-d23}, [%5]     \n"
                        "vext.32    q12, q8, q10, #1    \n"
                        "vext.32    q11, q9, q11, #1    \n"
                        "vext.32    q10, q8, q10, #2    \n"

                        // row 3: weights 15..19
                        "vmla.f32   q7, q8, %f17[1]     \n"
                        "vmla.f32   q13, q9, %e18[0]    \n"

                        "pld        [%6, #256]          \n"

                        "vmla.f32   q14, q12, %e18[1]   \n"
                        "vmla.f32   q15, q11, %f18[0]   \n"
                        "vmla.f32   q7, q10, %f18[1]    \n"

                        "vld2.f32   {d16-d19}, [%6]!    \n"

                        "pld        [%6, #256]          \n"

                        "vld2.f32   {d20-d23}, [%6]     \n"
                        "vext.32    q12, q8, q10, #1    \n"
                        "vext.32    q11, q9, q11, #1    \n"
                        "vext.32    q10, q8, q10, #2    \n"

                        // row 4: weights 20..24
                        "vmla.f32   q7, q8, %e19[0]     \n"
                        "vmla.f32   q13, q9, %e19[1]    \n"
                        "vmla.f32   q14, q12, %f19[0]   \n"
                        "vmla.f32   q15, q11, %f19[1]   \n"
                        "vmla.f32   q7, q10, %e20[0]    \n"

                        "pld        [%2, #256]          \n"

                        // Preload row 0 for the NEXT iteration (also executed on the last
                        // iteration), interleaved with the accumulator reduction.
                        "vld2.f32   {d16-d19}, [%2]!    \n" // q8  = 0  2  4  6   q9  = 1  3  5  7

                        "vadd.f32   q14, q14, q15       \n"
                        "vadd.f32   q7, q7, q13         \n"
                        //                     "veor       q15, q15            \n"// _sump3 = 0;
                        //                     "veor       q13, q13            \n"// _sump2 = 0;

                        "pld        [%2, #256]          \n"

                        "vadd.f32   q7, q7, q14         \n"

                        "vld2.f32   {d20-d23}, [%2]     \n" // q10 = 8 10 12 14   q11 = 9 11 13 15

                        //                     "veor       q14, q14            \n"// _sump3 = 0;

                        "vst1.f32   {d14-d15}, [%1]!    \n"

                        "pld        [%1, #128]          \n"

                        "subs       %0, #1              \n"
                        "bne        0b                  \n"

                        // Undo the pointer advance of the last (unused) row-0 preload.
                        "sub        %2, #32             \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2),     // %4
                        "=r"(r3),     // %5
                        "=r"(r4)      // %6
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "w"(_k0123),     // %14
                        "w"(_k4567),     // %15
                        "w"(_k891011),   // %16
                        "w"(_k12131415), // %17
                        "w"(_k16171819), // %18
                        "w"(_k20212223), // %19
                        "w"(_k24242424)  // %20
                        : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                // Leftover outputs of this row (all outputs when NEON is unavailable),
                // one 5x5 window at a time.
                for (; remain > 0; remain--)
                {
                    float sum = 0;
#if __ARM_NEON
                    // Columns 0..3 of each of the five rows are multiplied lane-wise and
                    // accumulated in _sum. Row 0 uses the preloaded _k0123; rows 1..3
                    // load k1/k2/k3[0..3] directly; row 4 uses _k20212223 (== k4[0..3]).
                    float32x4_t _r0 = vld1q_f32(r0);
                    float32x4_t _sum = vmulq_f32(_r0, _k0123);

                    float32x4_t _r1 = vld1q_f32(r1);
                    _sum = vmlaq_f32(_sum, _r1, vld1q_f32(k1));

                    float32x4_t _r2 = vld1q_f32(r2);
                    _sum = vmlaq_f32(_sum, _r2, vld1q_f32(k2));

                    float32x4_t _r3 = vld1q_f32(r3);
                    _sum = vmlaq_f32(_sum, _r3, vld1q_f32(k3));

                    float32x4_t _r4 = vld1q_f32(r4);
                    _sum = vmlaq_f32(_sum, _r4, _k20212223);

                    // Column 4 of every row is handled with scalar math.
                    sum += r0[4] * k0[4];
                    sum += r1[4] * k1[4];
                    sum += r2[4] * k2[4];
                    sum += r3[4] * k3[4];
                    sum += r4[4] * k4[4];

                    // Horizontal add of the four lanes of _sum.
                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                    _ss = vpadd_f32(_ss, _ss);

                    sum += vget_lane_f32(_ss, 0);
#else
                    // Plain scalar 5x5 dot product.
                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r0[3] * k0[3];
                    sum += r0[4] * k0[4];

                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r1[3] * k1[3];
                    sum += r1[4] * k1[4];

                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];
                    sum += r2[3] * k2[3];
                    sum += r2[4] * k2[4];

                    sum += r3[0] * k3[0];
                    sum += r3[1] * k3[1];
                    sum += r3[2] * k3[2];
                    sum += r3[3] * k3[3];
                    sum += r3[4] * k3[4];

                    sum += r4[0] * k4[0];
                    sum += r4[1] * k4[1];
                    sum += r4[2] * k4[2];
                    sum += r4[3] * k4[3];
                    sum += r4[4] * k4[4];
#endif
                    // Accumulate onto the bias / previous input channels.
                    *outptr += sum;

                    // Stride 2: the next output starts two input columns to the right.
                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    r3 += 2;
                    r4 += 2;
                    outptr++;
                }

                // Skip the rest of the current row plus one full row (vertical stride 2).
                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
                r4 += tailstep;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_5x5_pack4.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv5x5s1_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
        out0.fill(_bias0);

        for (int q = 0; q < inch; q++)
        {
            float* outptr0 = out0.row(0);

            const Mat img0 = bottom_blob.channel(q);

            const float* r0 = img0.row(0);
            const float* r1 = img0.row(1);
            const float* r2 = img0.row(2);
            const float* r3 = img0.row(3);
            const float* r4 = img0.row(4);

            const float* kptr = (const float*)kernel.channel(p).row(q);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0] \n" // sum0 sum1 sum2 sum3

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n" // r00 r01 r02 r03

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1] \n" // r04 r05 r06 r07

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v5.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r10 r11 r12 r13

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2] \n" // r14 r15 r16 r17

                        "fmla   v20.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v4.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v5.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r20 r21 r22 r23

                        "fmla   v20.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%3] \n" // r24 r25 r26 r27

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v5.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%4], #64 \n" // r30 r31 r32 r33

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%4] \n" // r34 r35 r36 r37

                        "fmla   v20.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v4.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v5.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%5, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%5], #64 \n" // r40 r41 r42 r43

                        "fmla   v20.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%5, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%5] \n" // r44 r45 r46 r47

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v5.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v6.s[1]     \n"

                        //                         "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6] \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v6.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v7.s[1]     \n"
                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v7.s[3]     \n"

                        "sub    %6, %6, #1536               \n" // kptr -= 24 * 16;

                        "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    // ARMv7 NEON path: updates four consecutive 4-float output vectors in one shot.
                    //
                    // Operands: %0 = outptr0, %1..%5 = r0..r4 (five input row pointers), %6 = kptr.
                    // Register roles:
                    //   q12-q15 (d24-d31) : sum0..sum3, read from [%0] first and written back last
                    //   q0-q3   (d0-d7)   : vectors 0..3 of the current input row
                    //   q4-q7   (d8-d15)  : vectors 4..7 of the current input row
                    //   q8-q11  (d16-d23) : the current 16-float (64-byte) block read from kptr
                    // For each of the five rows, five kptr blocks (k = 0..4) are applied as
                    //   sum[i] += q8 * x[0] + q9 * x[1] + q10 * x[2] + q11 * x[3]
                    // where x is row vector (i + k) and i = 0..3 selects the accumulator.
                    // The first four vectors of the next row are loaded just before block 4 of
                    // the current row is applied; that block only reads q4-q7, so q0-q3 are free.
                    //
                    // Pointer state on exit:
                    //   %0      advanced by 64 bytes (vstm with writeback)
                    //   %1..%5  advanced by 64 bytes each (only the first vldm of a row writes back)
                    //   %6      unchanged (24 post-incremented 64-byte loads undone by the final sub)
                    // NOTE(review): this looks like a 5x5, stride-1 convolution over 4-channel packed
                    // data, with kptr holding 25 blocks of 4x4 weights - confirm against the enclosing
                    // function and the weight packing code.
                    asm volatile(
                        // load the four accumulators; no writeback, %0 is reused by the final store
                        "pld        [%0, #512]          \n"
                        "vldm       %0, {d24-d31}       \n" // sum0 sum1 sum2 sum3

                        // ---- input row r0 (%1) ----
                        "pld        [%1, #512]          \n"
                        "vldm       %1!, {d0-d7}        \n" // r00 r01 r02 r03

                        // block 0 of row 0: uses r00..r03 (q0..q3)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        // block 1 of row 0: uses r01..r04 (q1..q4)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // second half of the row, no writeback: %1 stays 64 bytes past its start
                        "pld        [%1, #512]          \n"
                        "vldm       %1, {d8-d15}        \n" // r04 r05 r06 r07

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"
                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d7[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        // block 2 of row 0: uses r02..r05 (q2..q5)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        // block 3 of row 0: uses r03..r06 (q3..q6)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d6[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d7[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        // block 4 of row 0: uses r04..r07 (q4..q7)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // ---- input row r1 (%2) ----
                        // q0-q3 are no longer needed by row 0, so the next row is loaded early
                        "pld        [%2, #512]          \n"
                        "vldm       %2!, {d0-d7}        \n" // r10 r11 r12 r13

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d10[0]     \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d11[0]    \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        // block 0 of row 1: uses r10..r13 (q0..q3)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        // block 1 of row 1: uses r11..r14 (q1..q4)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%2, #512]          \n"
                        "vldm       %2, {d8-d15}        \n" // r14 r15 r16 r17

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"
                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d7[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        // block 2 of row 1: uses r12..r15 (q2..q5)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        // block 3 of row 1: uses r13..r16 (q3..q6)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d6[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d7[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        // block 4 of row 1: uses r14..r17 (q4..q7)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // ---- input row r2 (%3) ----
                        "pld        [%3, #512]          \n"
                        "vldm       %3!, {d0-d7}        \n" // r20 r21 r22 r23

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d10[0]     \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d11[0]    \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        // block 0 of row 2: uses r20..r23 (q0..q3)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        // block 1 of row 2: uses r21..r24 (q1..q4)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%3, #512]          \n"
                        "vldm       %3, {d8-d15}        \n" // r24 r25 r26 r27

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"
                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d7[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        // block 2 of row 2: uses r22..r25 (q2..q5)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        // block 3 of row 2: uses r23..r26 (q3..q6)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d6[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d7[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        // block 4 of row 2: uses r24..r27 (q4..q7)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // ---- input row r3 (%4) ----
                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d0-d7}        \n" // r30 r31 r32 r33

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d10[0]     \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d11[0]    \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        // block 0 of row 3: uses r30..r33 (q0..q3)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        // block 1 of row 3: uses r31..r34 (q1..q4)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4, {d8-d15}        \n" // r34 r35 r36 r37

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"
                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d7[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        // block 2 of row 3: uses r32..r35 (q2..q5)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        // block 3 of row 3: uses r33..r36 (q3..q6)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d6[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d7[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        // block 4 of row 3: uses r34..r37 (q4..q7)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // ---- input row r4 (%5) ----
                        "pld        [%5, #512]          \n"
                        "vldm       %5!, {d0-d7}        \n" // r40 r41 r42 r43

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d10[0]     \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d11[0]    \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        // block 0 of row 4: uses r40..r43 (q0..q3)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        // block 1 of row 4: uses r41..r44 (q1..q4)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%5, #512]          \n"
                        "vldm       %5, {d8-d15}        \n" // r44 r45 r46 r47

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"
                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d7[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        // block 2 of row 4: uses r42..r45 (q2..q5)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        // block 3 of row 4: uses r43..r46 (q3..q6)
                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q12, q8, d6[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d7[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        // block 4 of row 4 (the 25th and last): uses r44..r47 (q4..q7).
                        // Loaded without writeback, so %6 sits exactly 24 * 64 bytes past its start;
                        // the prefetch for this load is left commented out.
                        //                         "pld        [%6, #512]          \n"
                        "vldm       %6, {d16-d23}       \n"

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d10[0]     \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d11[0]    \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        // rewind kptr to its entry value: 24 post-incremented loads * 64 bytes = 1536
                        "sub        %6, %6, #1536       \n" // kptr -= 24 * 16;

                        // write the four accumulators back and step outptr0 forward by 64 bytes
                        "vstm       %0!, {d24-d31}      \n"

                        // Every pointer is both read and written: each output operand is tied to the
                        // same variable through a matching-digit input constraint.
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        // all sixteen q registers are used as scratch, and memory is read and written
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    // AArch64 path: produces two neighbouring 4-lane outputs per loop iteration.
                    //
                    // Operands
                    //   %0      outptr0 - both accumulators are loaded from here, stored back at the
                    //                     end, and the pointer is advanced by 32 bytes by the store.
                    //   %1..%5  r0..r4  - five input rows. Six 4-lane vectors (columns 0..5) are read
                    //                     from each row, but the pointer only advances by 32 bytes
                    //                     (two vectors): the first load post-increments, the second
                    //                     load of four vectors does not.
                    //   %6      kptr    - 25 groups of four 4-lane weight vectors (64 bytes per group).
                    //                     24 loads post-increment by 64 and the last one does not, so
                    //                     the final "sub #1536" (24 * 64) restores the entry value.
                    //
                    // Arithmetic
                    //   Each weight group behaves as a 4x4 block: its four vectors are scaled by
                    //   lanes 0..3 of a single input vector and summed. For the group labelled kRC
                    //   (row R, column C - labels are annotations derived from the order in which
                    //   the groups are consumed), output 0 uses input column C of row R and
                    //   output 1 uses column C + 1.
                    //
                    // Register use
                    //   v20/v21  accumulators seeded from memory; receive the lane-1 and lane-3 products.
                    //   v22/v23  second pair of accumulators started with fmul; receive the lane-0 and
                    //            lane-2 products. Merged into v20/v21 by fadd just before the store,
                    //            giving two independent dependency chains per output (presumably to
                    //            hide fmla latency).
                    //   v0..v5   input columns 0..5 of the current row.
                    //   v16..v19 / v24..v27  weight groups, used alternately so that the next group is
                    //            loaded while the second half of the previous one is still consumed.
                    asm volatile(
                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v20.4s, v21.4s}, [%0]      \n" // sum0 sum1

                        // input row 0 (%1): columns 0-1, pointer advances by two vectors
                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%1], #32   \n" // r00 r01

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k00

                        // fmul (not fmla) initialises the secondary accumulators v22/v23
                        "fmul   v22.4s, v16.4s, v0.s[0]     \n"
                        "fmul   v23.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k01

                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"

                        // row 0 columns 2-5, read without advancing the pointer
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v2.4s, v3.4s, v4.4s, v5.4s}, [%1] \n" // r02 r03 r04 r05

                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k02

                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"

                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k03

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"

                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k04

                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"

                        // input row 1 (%2): v0/v1 are free again, so they are reloaded early;
                        // the k04 products that follow still read row 0 from v4/v5
                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%2], #32   \n" // r10 r11

                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k10

                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"

                        "fmla   v22.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k11

                        "fmla   v22.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v1.s[3]     \n"

                        // row 1 columns 2-5, read without advancing the pointer
                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v2.4s, v3.4s, v4.4s, v5.4s}, [%2] \n" // r12 r13 r14 r15

                        "fmla   v22.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k12

                        "fmla   v22.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"

                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k13

                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"

                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k14

                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"

                        // input row 2 (%3): columns 0-1; the k14 products below still use row 1
                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%3], #32   \n" // r20 r21

                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k20

                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"

                        "fmla   v22.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k21

                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"

                        // row 2 columns 2-5, read without advancing the pointer
                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v2.4s, v3.4s, v4.4s, v5.4s}, [%3] \n" // r22 r23 r24 r25

                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k22

                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"

                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k23

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"

                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k24

                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"

                        // input row 3 (%4): columns 0-1; the k24 products below still use row 2
                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%4], #32   \n" // r30 r31

                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k30

                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"

                        "fmla   v22.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k31

                        "fmla   v22.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v1.s[3]     \n"

                        // row 3 columns 2-5, read without advancing the pointer
                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v2.4s, v3.4s, v4.4s, v5.4s}, [%4] \n" // r32 r33 r34 r35

                        "fmla   v22.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k32

                        "fmla   v22.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"

                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k33

                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"

                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k34

                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"

                        // input row 4 (%5): columns 0-1; the k34 products below still use row 3
                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%5], #32   \n" // r40 r41

                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k40

                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"

                        "fmla   v22.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k41

                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"

                        // row 4 columns 2-5, read without advancing the pointer
                        "prfm   pldl1keep, [%5, #512]       \n"
                        "ld1    {v2.4s, v3.4s, v4.4s, v5.4s}, [%5] \n" // r42 r43 r44 r45

                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k42

                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"

                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k43

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"

                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"

                        // last (25th) weight group: no prefetch and no post-increment, which is
                        // why the rewind below is 24 * 64 bytes rather than 25 * 64
                        //                         "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6] \n" // k44

                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"

                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"

                        // fold the secondary accumulators into the primary ones
                        "fadd   v20.4s, v20.4s, v22.4s      \n"
                        "fadd   v21.4s, v21.4s, v23.4s      \n"

                        "sub    %6, %6, #1536               \n" // kptr -= 24 * 16;

                        // write both outputs back and step outptr0 past them
                        "st1    {v20.4s, v21.4s}, [%0], #32 \n"

                        // every pointer is modified by the asm, so each is declared as an
                        // output and tied to its own input through a matching-digit constraint
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        // clobbers: memory (the st1 through %0) plus every vector register written above
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%0, #256]          \n"
                        "vld1.f32   {d24-d27}, [%0 :128] \n" // sum0 sum1

                        "pld        [%1, #256]          \n"
                        "vld1.f32   {d0-d3}, [%1 :128]! \n" // r00 r01

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmul.f32   q14, q8, d0[0]      \n"
                        "vmul.f32   q15, q8, d2[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%1, #512]          \n"
                        "vldm       %1, {d4-d11}        \n" // r02 r03 r04 r05

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%2, #256]          \n"
                        "vld1.f32   {d0-d3}, [%2 :128]! \n" // r10 r11

                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d2[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%2, #512]          \n"
                        "vldm       %2, {d4-d11}        \n" // r12 r13 r14 r15

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%3, #256]          \n"
                        "vld1.f32   {d0-d3}, [%3 :128]! \n" // r20 r21

                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d2[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%3, #512]          \n"
                        "vldm       %3, {d4-d11}        \n" // r22 r23 r24 r25

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%4, #256]          \n"
                        "vld1.f32   {d0-d3}, [%4 :128]! \n" // r30 r31

                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d2[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4, {d4-d11}        \n" // r32 r33 r34 r35

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%5, #256]          \n"
                        "vld1.f32   {d0-d3}, [%5 :128]! \n" // r40 r41

                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d2[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%5, #512]          \n"
                        "vldm       %5, {d4-d11}        \n" // r42 r43 r44 r45

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"

                        //                         "pld        [%6, #512]          \n"
                        "vldm       %6, {d16-d23}       \n"

                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"

                        "vadd.f32   q12, q12, q14       \n"
                        "vadd.f32   q13, q13, q15       \n"

                        "sub        %6, %6, #1536       \n" // kptr -= 24 * 16;

                        "vst1.f32   {d24-d27}, [%0 :128]! \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v20.4s}, [%0]              \n" // sum0

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v0.4s}, [%1], #16          \n" // r00

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v1.4s, v2.4s, v3.4s, v4.4s}, [%1] \n" // r01 r02 r03 r04

                        "fmul   v21.4s, v16.4s, v0.s[0]     \n"
                        "fmul   v22.4s, v17.4s, v0.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmul   v23.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v0.4s}, [%2], #16          \n" // r10

                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v1.4s, v2.4s, v3.4s, v4.4s}, [%2] \n" // r11 r12 r13 r14

                        "fmla   v21.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v0.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v0.4s}, [%3], #16          \n" // r20

                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v1.4s, v2.4s, v3.4s, v4.4s}, [%3] \n" // r21 r22 r23 r24

                        "fmla   v21.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v0.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v0.4s}, [%4], #16          \n" // r30

                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v1.4s, v2.4s, v3.4s, v4.4s}, [%4] \n" // r31 r32 r33 r34

                        "fmla   v21.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v0.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%5, #128]       \n"
                        "ld1    {v0.4s}, [%5], #16          \n" // r40

                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"

                        "prfm   pldl1keep, [%5, #512]       \n"
                        "ld1    {v1.4s, v2.4s, v3.4s, v4.4s}, [%5] \n" // r41 r42 r43 r44

                        "fmla   v21.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v0.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"

                        //                         "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6] \n"

                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"

                        "fadd   v22.4s, v21.4s, v22.4s      \n"
                        "fadd   v23.4s, v22.4s, v23.4s      \n"
                        "fadd   v20.4s, v20.4s, v23.4s      \n"

                        "sub    %6, %6, #1536               \n" // kptr -= 24 * 16;

                        "st1    {v20.4s}, [%0], #16         \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%0, #128]          \n"
                        "vld1.f32   {d24-d25}, [%0 :128] \n" // sum0

                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d0-d1}, [%1 :128]! \n" // r00

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmul.f32   q13, q8, d0[0]      \n"
                        "vmul.f32   q14, q9, d0[1]      \n"
                        "vmul.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%1, #512]          \n"
                        "vldm       %1, {d2-d9}         \n" // r01 r02 r03 r04

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%2, #128]          \n"
                        "vld1.f32   {d0-d1}, [%2 :128]! \n" // r10

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%2, #512]          \n"
                        "vldm       %2, {d2-d9}         \n" // r11 r12 r13 r14

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%3, #128]          \n"
                        "vld1.f32   {d0-d1}, [%3 :128]! \n" // r20

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%3, #512]          \n"
                        "vldm       %3, {d2-d9}         \n" // r21 r22 r23 r24

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%4, #128]          \n"
                        "vld1.f32   {d0-d1}, [%4 :128]! \n" // r30

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4, {d2-d9}         \n" // r31 r32 r33 r34

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%5, #128]          \n"
                        "vld1.f32   {d0-d1}, [%5 :128]! \n" // r40

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%5, #512]          \n"
                        "vldm       %5, {d2-d9}         \n" // r41 r42 r43 r44

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"

                        //                         "pld        [%6, #512]          \n"
                        "vldm       %6, {d16-d23}       \n"

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"

                        "vadd.f32   q13, q13, q14       \n"
                        "vadd.f32   q12, q12, q15       \n"
                        "vadd.f32   q12, q12, q13       \n"

                        "sub        %6, %6, #1536       \n" // kptr -= 24 * 16;

                        "vst1.f32   {d24-d25}, [%0 :128]! \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }

                r0 += 4 * 4;
                r1 += 4 * 4;
                r2 += 4 * 4;
                r3 += 4 * 4;
                r4 += 4 * 4;
            }
        }
    }
}

static void conv5x5s2_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int tailstep = (w - 2 * outw + w) * 4;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
        out0.fill(_bias0);

        for (int q = 0; q < inch; q++)
        {
            float* outptr0 = out0.row(0);

            const Mat img0 = bottom_blob.channel(q);

            const float* r0 = img0.row(0);
            const float* r1 = img0.row(1);
            const float* r2 = img0.row(2);
            const float* r3 = img0.row(3);
            const float* r4 = img0.row(4);

            const float* kptr = (const float*)kernel.channel(p).row(q);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0] \n" // sum0 sum1 sum2 sum3

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n" // r00 r01 r02 r03

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n" // r04 r05 r06 r07

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%1, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%1] \n" // r08 r09 r010

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v28.s[3]    \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v29.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v29.s[1]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v29.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v29.s[3]    \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r10 r11 r12 r13

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v23.4s, v16.4s, v30.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v28.s[1]    \n"
                        "fmla   v23.4s, v17.4s, v30.s[1]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v23.4s, v18.4s, v30.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v28.s[3]    \n"
                        "fmla   v23.4s, v19.4s, v30.s[3]    \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n" // r14 r15 r16 r17

                        "fmla   v20.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%2, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%2] \n" // r18 r19 r110

                        "fmla   v20.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v28.s[3]    \n"

                        "fmla   v20.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v29.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v7.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v29.s[1]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v29.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v7.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v29.s[3]    \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r20 r21 r22 r23

                        "fmla   v20.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v28.s[0]    \n"
                        "fmla   v23.4s, v24.4s, v30.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v28.s[1]    \n"
                        "fmla   v23.4s, v25.4s, v30.s[1]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v28.s[2]    \n"
                        "fmla   v23.4s, v26.4s, v30.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v28.s[3]    \n"
                        "fmla   v23.4s, v27.4s, v30.s[3]    \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // r24 r25 r26 r27

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%3, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%3] \n" // r28 r29 r210

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v28.s[3]    \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v29.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v29.s[1]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v29.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v29.s[3]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%4], #64 \n" // r30 r31 r32 r33

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v23.4s, v16.4s, v30.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v28.s[1]    \n"
                        "fmla   v23.4s, v17.4s, v30.s[1]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v23.4s, v18.4s, v30.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v28.s[3]    \n"
                        "fmla   v23.4s, v19.4s, v30.s[3]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64 \n" // r34 r35 r36 r37

                        "fmla   v20.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%4, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%4] \n" // r38 r39 r310

                        "fmla   v20.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v28.s[3]    \n"

                        "fmla   v20.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v29.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v7.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v29.s[1]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v29.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v7.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v29.s[3]    \n"

                        "prfm   pldl1keep, [%5, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%5], #64 \n" // r40 r41 r42 r43

                        "fmla   v20.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v28.s[0]    \n"
                        "fmla   v23.4s, v24.4s, v30.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v28.s[1]    \n"
                        "fmla   v23.4s, v25.4s, v30.s[1]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v28.s[2]    \n"
                        "fmla   v23.4s, v26.4s, v30.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v28.s[3]    \n"
                        "fmla   v23.4s, v27.4s, v30.s[3]    \n"

                        "prfm   pldl1keep, [%5, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%5], #64 \n" // r44 r45 r46 r47

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%5, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%5] \n" // r48 r49 r410

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v28.s[3]    \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v29.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v29.s[1]    \n"

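                        // 25th (final) kernel tap: loaded below without post-increment, then kptr is rewound
                        // by the 24 preceding 64-byte advances. The prefetch for this load is left disabled:
                        //                         "prfm   pldl1keep, [%6, #512]       \n"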
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6] \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v29.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v29.s[3]    \n"

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v23.4s, v16.4s, v30.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v28.s[1]    \n"
                        "fmla   v23.4s, v17.4s, v30.s[1]    \n"
                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v23.4s, v18.4s, v30.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v28.s[3]    \n"
                        "fmla   v23.4s, v19.4s, v30.s[3]    \n"

                        "sub    %6, %6, #1536               \n" // kptr -= 24 * 16;

                        "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
#else  // __aarch64__
                    // ARMv7 NEON path: accumulates a 5-row x 5-tap kernel window into four
                    // adjacent 4-float output vectors held in q12-q15.
                    //
                    // Operands: %0 = outptr0, %1..%5 = input row pointers r0..r4, %6 = kptr.
                    // Register roles inside the block:
                    //   q12-q15 : the four output accumulators (loaded from / stored to %0)
                    //   q8-q11  : 64 bytes of kernel data for the current tap (from %6)
                    //   q0-q7   : input vectors of the current row
                    // For every tap, q8..q11 are each scaled by one lane of the input vector
                    // that belongs to a given output and added to that output's accumulator.
                    // The four outputs read input vectors that are two apart (e.g. r00, r02,
                    // r04, r06), which looks like a horizontal stride of 2.
                    // NOTE(review): stride inferred from register usage only — confirm against
                    // the enclosing function.
                    asm volatile(
                        "pld        [%0, #512]          \n"
                        "vldm       %0, {d24-d31}       \n" // sum0 sum1 sum2 sum3

                        // ---- input row 0 (%1) ----
                        "pld        [%1, #512]          \n"
                        "vldm       %1!, {d0-d7}        \n" // r00 r01 r02 r03

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%1, #512]          \n"
                        "vldm       %1!, {d8-d15}       \n" // r04 r05 r06 r07

                        // row 0, tap 0: inputs r00 r02 r04 r06
                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // row 0, tap 1: inputs r01 r03 r05 r07
                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"
                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // r00/r01 are no longer needed, so q0/q1 are reused for r08/r09
                        "pld        [%1, #256]          \n"
                        "vld1.f32   {d0-d3}, [%1 :128]! \n" // r08 r09

                        // row 0, tap 2: inputs r02 r04 r06 r08
                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // row 0, tap 3: inputs r03 r05 r07 r09
                        "vmla.f32   q12, q8, d6[0]      \n"
                        "vmla.f32   q13, q8, d10[0]     \n"
                        "vmla.f32   q14, q8, d14[0]     \n"
                        "vmla.f32   q15, q8, d2[0]      \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q9, d14[1]     \n"
                        "vmla.f32   q15, q9, d2[1]      \n"
                        "vmla.f32   q12, q10, d7[0]     \n"
                        "vmla.f32   q13, q10, d11[0]    \n"
                        "vmla.f32   q14, q10, d15[0]    \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "vmla.f32   q14, q11, d15[1]    \n"
                        "vmla.f32   q15, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // no writeback here: %1 stays 32 bytes short of the 160 bytes read so far
                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d4-d5}, [%1 :128]  \n" // r010

                        // row 0, tap 4: inputs r04 r06 r08 r010
                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d12[0]     \n"
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d13[0]    \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"

                        // ---- input row 1 (%2) ----
                        // Loaded early into q4-q7: the two remaining multiplies of this tap
                        // only read q0 and q2, so overwriting q4-q7 here is safe.
                        "pld        [%2, #512]          \n"
                        "vldm       %2!, {d8-d15}       \n" // r10 r11 r12 r13

                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%2, #512]          \n"
                        "vldm       %2!, {d0-d7}        \n" // r14 r15 r16 r17

                        // row 1, tap 0: inputs r10 r12 r14 r16
                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d12[0]     \n"
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d13[0]    \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"
                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // row 1, tap 1: inputs r11 r13 r15 r17
                        "vmla.f32   q12, q8, d10[0]     \n"
                        "vmla.f32   q13, q8, d14[0]     \n"
                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d10[1]     \n"
                        "vmla.f32   q13, q9, d14[1]     \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d11[0]    \n"
                        "vmla.f32   q13, q10, d15[0]    \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d11[1]    \n"
                        "vmla.f32   q13, q11, d15[1]    \n"
                        "vmla.f32   q14, q11, d3[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // r10/r11 are no longer needed, so q4/q5 are reused for r18/r19
                        "pld        [%2, #256]          \n"
                        "vld1.f32   {d8-d11}, [%2 :128]! \n" // r18 r19

                        // row 1, tap 2: inputs r12 r14 r16 r18
                        "vmla.f32   q12, q8, d12[0]     \n"
                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d12[1]     \n"
                        "vmla.f32   q13, q9, d0[1]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"
                        "vmla.f32   q12, q10, d13[0]    \n"
                        "vmla.f32   q13, q10, d1[0]     \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d13[1]    \n"
                        "vmla.f32   q13, q11, d1[1]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // row 1, tap 3: inputs r13 r15 r17 r19
                        "vmla.f32   q12, q8, d14[0]     \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d14[1]     \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d15[0]    \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d15[1]    \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d7[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // no writeback here: %2 stays 32 bytes short of the 160 bytes read so far
                        "pld        [%2, #128]          \n"
                        "vld1.f32   {d12-d13}, [%2 :128] \n" // r110

                        // row 1, tap 4: inputs r14 r16 r18 r110
                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"

                        // ---- input row 2 (%3) ----
                        // Loaded early into q0-q3: the two remaining multiplies of this tap
                        // only read q4 and q6, so overwriting q0-q3 here is safe.
                        "pld        [%3, #512]          \n"
                        "vldm       %3!, {d0-d7}        \n" // r20 r21 r22 r23

                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%3, #512]          \n"
                        "vldm       %3!, {d8-d15}       \n" // r24 r25 r26 r27

                        // row 2, tap 0: inputs r20 r22 r24 r26
                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // row 2, tap 1: inputs r21 r23 r25 r27
                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"
                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // r20/r21 are no longer needed, so q0/q1 are reused for r28/r29
                        "pld        [%3, #256]          \n"
                        "vld1.f32   {d0-d3}, [%3 :128]! \n" // r28 r29

                        // row 2, tap 2: inputs r22 r24 r26 r28
                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // row 2, tap 3: inputs r23 r25 r27 r29
                        "vmla.f32   q12, q8, d6[0]      \n"
                        "vmla.f32   q13, q8, d10[0]     \n"
                        "vmla.f32   q14, q8, d14[0]     \n"
                        "vmla.f32   q15, q8, d2[0]      \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q9, d14[1]     \n"
                        "vmla.f32   q15, q9, d2[1]      \n"
                        "vmla.f32   q12, q10, d7[0]     \n"
                        "vmla.f32   q13, q10, d11[0]    \n"
                        "vmla.f32   q14, q10, d15[0]    \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "vmla.f32   q14, q11, d15[1]    \n"
                        "vmla.f32   q15, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // no writeback here: %3 stays 32 bytes short of the 160 bytes read so far
                        "pld        [%3, #128]          \n"
                        "vld1.f32   {d4-d5}, [%3 :128]  \n" // r210

                        // row 2, tap 4: inputs r24 r26 r28 r210
                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d12[0]     \n"
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d13[0]    \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"

                        // ---- input row 3 (%4) ----
                        // Loaded early into q4-q7: the two remaining multiplies of this tap
                        // only read q0 and q2, so overwriting q4-q7 here is safe.
                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d8-d15}       \n" // r30 r31 r32 r33

                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d0-d7}        \n" // r34 r35 r36 r37

                        // row 3, tap 0: inputs r30 r32 r34 r36
                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d12[0]     \n"
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d13[0]    \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"
                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // row 3, tap 1: inputs r31 r33 r35 r37
                        "vmla.f32   q12, q8, d10[0]     \n"
                        "vmla.f32   q13, q8, d14[0]     \n"
                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d10[1]     \n"
                        "vmla.f32   q13, q9, d14[1]     \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d11[0]    \n"
                        "vmla.f32   q13, q10, d15[0]    \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d11[1]    \n"
                        "vmla.f32   q13, q11, d15[1]    \n"
                        "vmla.f32   q14, q11, d3[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // r30/r31 are no longer needed, so q4/q5 are reused for r38/r39
                        "pld        [%4, #256]          \n"
                        "vld1.f32   {d8-d11}, [%4 :128]! \n" // r38 r39

                        // row 3, tap 2: inputs r32 r34 r36 r38
                        "vmla.f32   q12, q8, d12[0]     \n"
                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d12[1]     \n"
                        "vmla.f32   q13, q9, d0[1]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"
                        "vmla.f32   q12, q10, d13[0]    \n"
                        "vmla.f32   q13, q10, d1[0]     \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d13[1]    \n"
                        "vmla.f32   q13, q11, d1[1]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // row 3, tap 3: inputs r33 r35 r37 r39
                        "vmla.f32   q12, q8, d14[0]     \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d14[1]     \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d15[0]    \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d15[1]    \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d7[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // no writeback here: %4 stays 32 bytes short of the 160 bytes read so far
                        "pld        [%4, #128]          \n"
                        "vld1.f32   {d12-d13}, [%4 :128] \n" // r310

                        // row 3, tap 4: inputs r34 r36 r38 r310
                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"

                        // ---- input row 4 (%5) ----
                        // Loaded early into q0-q3: the two remaining multiplies of this tap
                        // only read q4 and q6, so overwriting q0-q3 here is safe.
                        "pld        [%5, #512]          \n"
                        "vldm       %5!, {d0-d7}        \n" // r40 r41 r42 r43

                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%5, #512]          \n"
                        "vldm       %5!, {d8-d15}       \n" // r44 r45 r46 r47

                        // row 4, tap 0: inputs r40 r42 r44 r46
                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // row 4, tap 1: inputs r41 r43 r45 r47
                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"
                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // r40/r41 are no longer needed, so q0/q1 are reused for r48/r49
                        "pld        [%5, #256]          \n"
                        "vld1.f32   {d0-d3}, [%5 :128]! \n" // r48 r49

                        // row 4, tap 2: inputs r42 r44 r46 r48
                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        // row 4, tap 3: inputs r43 r45 r47 r49
                        "vmla.f32   q12, q8, d6[0]      \n"
                        "vmla.f32   q13, q8, d10[0]     \n"
                        "vmla.f32   q14, q8, d14[0]     \n"
                        "vmla.f32   q15, q8, d2[0]      \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q9, d14[1]     \n"
                        "vmla.f32   q15, q9, d2[1]      \n"
                        "vmla.f32   q12, q10, d7[0]     \n"
                        "vmla.f32   q13, q10, d11[0]    \n"
                        "vmla.f32   q14, q10, d15[0]    \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "vmla.f32   q14, q11, d15[1]    \n"
                        "vmla.f32   q15, q11, d3[1]     \n"

                        // Final (25th) kernel load: the prefetch is disabled and the load has
                        // no writeback, so %6 has advanced by exactly 24 * 64 bytes at this point.
                        //                         "pld        [%6, #512]          \n"
                        "vldm       %6, {d16-d23}       \n"

                        // no writeback here: %5 stays 32 bytes short of the 160 bytes read so far
                        "pld        [%5, #128]          \n"
                        "vld1.f32   {d4-d5}, [%5 :128]  \n" // r410

                        // row 4, tap 4: inputs r44 r46 r48 r410
                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d12[0]     \n"
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d13[0]    \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"
                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        // Rewind the kernel pointer to where it was on entry:
                        // 24 post-incremented loads of 64 bytes each = 1536 bytes.
                        "sub        %6, %6, #1536       \n" // kptr -= 24 * 16;

                        // Each row pointer was post-incremented by 64 + 64 + 32 = 160 bytes;
                        // stepping back 32 leaves a net advance of 128 bytes (8 input vectors).
                        "sub        %1, %1, #32         \n"
                        "sub        %2, %2, #32         \n"
                        "sub        %3, %3, #32         \n"
                        "sub        %4, %4, #32         \n"
                        "sub        %5, %5, #32         \n"

                        // Write the four accumulators back and advance outptr0 by 64 bytes.
                        "vstm       %0!, {d24-d31}      \n"

                        // Every pointer is an output tied to an identically numbered input
                        // ("0".."6"), so the updated pointer values are visible to the C++ code.
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        // q0-q15 are all written by the block; "memory" covers the loads and the store.
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    // AArch64 path: compute two adjacent 4-lane outputs (sum0, sum1) at outptr0.
                    //
                    // Inputs per row pointer (%1..%5 = r0..r4): seven 4-float vectors rX0..rX6.
                    // Output 0 is accumulated from rX0..rX4 and output 1 from rX2..rX6, i.e. the
                    // two windows are 5 vectors wide and offset by 2 vectors.
                    // NOTE(review): this matches a 5x5, stride-2, 4-packed convolution - confirm
                    // against the enclosing function, which is not visible here.
                    //
                    // Kernel (%6 = kptr): 25 blocks of 16 floats, labelled kRC below (R = row,
                    // C = column). Each block is four vectors; vector i is multiplied by lane i of
                    // the input vector. Blocks alternate between v16-v19 and v24-v27 so the next
                    // block can be loaded while the current one is still being consumed.
                    //
                    // Accumulators: v20/v21 start from the values stored at outptr0, v22/v23 start
                    // from the first fmul; the two pairs are independent chains that are summed by
                    // the final fadd.
                    //
                    // Pointer effects on exit: r0..r4 advance by 64 bytes each (only the first load
                    // of every row writes back), outptr0 advances by 32 bytes, and kptr is rewound
                    // to its entry value (24 post-incremented loads of 64 bytes, then sub #1536).
                    asm volatile(
                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v20.4s, v21.4s}, [%0]      \n" // sum0 sum1

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n" // r00 r01 r02 r03

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k00

                        // k00 * (r00 | r02); fmul initialises the second accumulator pair
                        "fmul   v22.4s, v16.4s, v0.s[0]     \n"
                        "fmul   v23.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k01

                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"

                        // no write-back: r0 stays 4 vectors past its entry value
                        "prfm   pldl1keep, [%1, #384]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s}, [%1] \n" // r04 r05 r06

                        // k01 * (r01 | r03)
                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k02

                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"

                        // k02 * (r02 | r04)
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k03

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"

                        // k03 * (r03 | r05)
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k04

                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"

                        // v0-v3 are free again (row 0 only needs v4/v6 from here on), so the
                        // next row is loaded early
                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r10 r11 r12 r13

                        // k04 * (r04 | r06)
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k10

                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"

                        // k10 * (r10 | r12)
                        "fmla   v22.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k11

                        "fmla   v22.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"

                        "prfm   pldl1keep, [%2, #384]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s}, [%2] \n" // r14 r15 r16

                        // k11 * (r11 | r13)
                        "fmla   v22.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k12

                        "fmla   v22.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"

                        // k12 * (r12 | r14)
                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k13

                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"

                        // k13 * (r13 | r15)
                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k14

                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r20 r21 r22 r23

                        // k14 * (r14 | r16)
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k20

                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v6.s[3]     \n"

                        // k20 * (r20 | r22)
                        "fmla   v22.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k21

                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"

                        "prfm   pldl1keep, [%3, #384]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s}, [%3] \n" // r24 r25 r26

                        // k21 * (r21 | r23)
                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k22

                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"

                        // k22 * (r22 | r24)
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k23

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"

                        // k23 * (r23 | r25)
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k24

                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%4], #64 \n" // r30 r31 r32 r33

                        // k24 * (r24 | r26)
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k30

                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"

                        // k30 * (r30 | r32)
                        "fmla   v22.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k31

                        "fmla   v22.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"

                        "prfm   pldl1keep, [%4, #384]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s}, [%4] \n" // r34 r35 r36

                        // k31 * (r31 | r33)
                        "fmla   v22.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k32

                        "fmla   v22.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"

                        // k32 * (r32 | r34)
                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k33

                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"

                        // k33 * (r33 | r35)
                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k34

                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"

                        "prfm   pldl1keep, [%5, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%5], #64 \n" // r40 r41 r42 r43

                        // k34 * (r34 | r36)
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k40

                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v6.s[3]     \n"

                        // k40 * (r40 | r42)
                        "fmla   v22.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k41

                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"

                        "prfm   pldl1keep, [%5, #384]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s}, [%5] \n" // r44 r45 r46

                        // k41 * (r41 | r43)
                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n" // k42

                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"

                        // k42 * (r42 | r44)
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n" // k43

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"

                        // k43 * (r43 | r45)
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"

                        // last kernel block: loaded without write-back, so kptr has advanced by
                        // exactly 24 * 64 = 1536 bytes at this point
                        //                         "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6] \n" // k44

                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"

                        // k44 * (r44 | r46)
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"

                        // merge the two partial accumulator chains of each output
                        "fadd   v20.4s, v20.4s, v22.4s      \n"
                        "fadd   v21.4s, v21.4s, v23.4s      \n"

                        "sub    %6, %6, #1536               \n" // kptr -= 24 * 16;

                        // store both outputs and step outptr0 past them
                        "st1    {v20.4s, v21.4s}, [%0], #32 \n"

                        // every pointer is an output tied to its own input through the matching
                        // constraints "0".."6", so the updated values flow back to the C++ variables
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%0, #256]          \n"
                        "vld1.f32   {d24-d27}, [%0 :128] \n" // sum0 sum1

                        "pld        [%1, #512]          \n"
                        "vldm       %1!, {d0-d7}        \n" // r00 r01 r02 r03

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmul.f32   q14, q8, d0[0]      \n"
                        "vmul.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%1, #384]          \n"
                        "vldm       %1, {d8-d13}        \n" // r04 r05 r06

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%2, #512]          \n"
                        "vldm       %2!, {d0-d7}        \n" // r10 r11 r12 r13

                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%2, #384]          \n"
                        "vldm       %2, {d8-d13}        \n" // r14 r15 r16

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%3, #512]          \n"
                        "vldm       %3!, {d0-d7}        \n" // r20 r21 r22 r23

                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%3, #384]          \n"
                        "vldm       %3, {d8-d13}        \n" // r24 r25 r26

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%4, #512]          \n"
                        "vldm       %4!, {d0-d7}        \n" // r30 r31 r32 r33

                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%4, #384]          \n"
                        "vldm       %4, {d8-d13}        \n" // r34 r35 r36

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%5, #512]          \n"
                        "vldm       %5!, {d0-d7}        \n" // r40 r41 r42 r43

                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%5, #384]          \n"
                        "vldm       %5, {d8-d13}        \n" // r44 r45 r46

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"

                        //                         "pld        [%6, #512]          \n"
                        "vldm       %6, {d16-d23}      \n"

                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"

                        "vadd.f32   q12, q12, q14       \n"
                        "vadd.f32   q13, q13, q15       \n"

                        "sub        %6, %6, #1536       \n" // kptr -= 24 * 16;

                        "vst1.f32   {d24-d27}, [%0 :128]! \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v20.4s}, [%0]              \n" // sum0

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%1], #32   \n" // r00 r01

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmul   v21.4s, v16.4s, v0.s[0]     \n"
                        "fmul   v22.4s, v17.4s, v0.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmul   v23.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"

                        "prfm   pldl1keep, [%1, #384]       \n"
                        "ld1    {v2.4s, v3.4s, v4.4s}, [%1] \n" // r02 r03 r04

                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%2], #32   \n" // r10 r11

                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v0.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"

                        "prfm   pldl1keep, [%2, #384]       \n"
                        "ld1    {v2.4s, v3.4s, v4.4s}, [%2] \n" // r12 r13 r14

                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%3], #32   \n" // r20 r21

                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v0.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"

                        "prfm   pldl1keep, [%3, #384]       \n"
                        "ld1    {v2.4s, v3.4s, v4.4s}, [%3] \n" // r22 r23 r24

                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%4], #32   \n" // r30 r31

                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v0.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"

                        "prfm   pldl1keep, [%4, #384]       \n"
                        "ld1    {v2.4s, v3.4s, v4.4s}, [%4] \n" // r32 r33 r34

                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%5], #32   \n" // r40 r41

                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v0.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"

                        "prfm   pldl1keep, [%5, #384]       \n"
                        "ld1    {v2.4s, v3.4s, v4.4s}, [%5] \n" // r42 r43 r44

                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%6], #64 \n"

                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"

                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"

                        //                         "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%6] \n"

                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"

                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"

                        "fadd   v22.4s, v21.4s, v22.4s      \n"
                        "fadd   v23.4s, v22.4s, v23.4s      \n"
                        "fadd   v20.4s, v20.4s, v23.4s      \n"

                        "sub    %6, %6, #1536               \n" // kptr -= 24 * 16;

                        "st1    {v20.4s}, [%0], #16         \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%0, #128]          \n"
                        "vld1.f32   {d24-d25}, [%0 :128] \n" // sum0

                        "pld        [%1, #256]          \n"
                        "vld1.f32   {d0-d3}, [%1 :128]! \n" // r00 r01

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmul.f32   q13, q8, d0[0]      \n"
                        "vmul.f32   q14, q9, d0[1]      \n"
                        "vmul.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%1, #384]          \n"
                        "vldm       %1, {d4-d9}         \n" // r02 r03 r04

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%2, #256]          \n"
                        "vld1.f32   {d0-d3}, [%2 :128]! \n" // r10 r11

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%2, #384]          \n"
                        "vldm       %2, {d4-d9}         \n" // r12 r13 r14

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%3, #256]          \n"
                        "vld1.f32   {d0-d3}, [%3 :128]! \n" // r20 r21

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%3, #384]          \n"
                        "vldm       %3, {d4-d9}         \n" // r22 r23 r24

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%4, #256]          \n"
                        "vld1.f32   {d0-d3}, [%4 :128]! \n" // r30 r31

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%4, #384]          \n"
                        "vldm       %4, {d4-d9}         \n" // r32 r33 r34

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%5, #256]          \n"
                        "vld1.f32   {d0-d3}, [%5 :128]! \n" // r40 r41

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "pld        [%5, #384]          \n"
                        "vldm       %5, {d4-d9}         \n" // r42 r43 r44

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"

                        "pld        [%6, #512]          \n"
                        "vldm       %6!, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d7[1]     \n"

                        //                         "pld        [%6, #512]          \n"
                        "vldm       %6, {d16-d23}      \n"

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"

                        "vadd.f32   q14, q13, q14       \n"
                        "vadd.f32   q15, q14, q15       \n"
                        "vadd.f32   q12, q12, q15       \n"

                        "sub        %6, %6, #1536       \n" // kptr -= 24 * 16;

                        "vst1.f32   {d24-d25}, [%0 :128]! \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
                r4 += tailstep;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_5x5_pack4_bf16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv5x5s1_pack4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    Mat top_blob_fp32(outw, outh, opt.num_threads, (size_t)4u * 4, 4, opt.workspace_allocator);

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob_fp32.channel(get_omp_thread_num());

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
        out0.fill(_bias0);

        int q = 0;
        for (; q < inch - 1; q++)
        {
            float* outptr0 = out0.row(0);

            const Mat img0 = bottom_blob.channel(q);

            const unsigned short* r0 = img0.row<const unsigned short>(0);
            const unsigned short* r1 = img0.row<const unsigned short>(1);
            const unsigned short* r2 = img0.row<const unsigned short>(2);
            const unsigned short* r3 = img0.row<const unsigned short>(3);
            const unsigned short* r4 = img0.row<const unsigned short>(4);

            const unsigned short* kptr = kernel.channel(p).row<const unsigned short>(q);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0] \n" // sum0 sum1 sum2 sum3

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n" // r00 r01 r02 r03

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%1] \n" // r04 r05 r06 r07

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v5.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2], #32 \n" // r10 r11 r12 r13

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%2] \n" // r14 r15 r16 r17

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v4.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v5.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%3], #32 \n" // r20 r21 r22 r23

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%3] \n" // r24 r25 r26 r27

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v5.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%4], #32 \n" // r30 r31 r32 r33

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%4] \n" // r34 r35 r36 r37

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v4.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v5.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%5], #32 \n" // r40 r41 r42 r43

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%5] \n" // r44 r45 r46 r47

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v5.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v6.s[1]     \n"

                        //                         "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6] \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v6.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v7.s[1]     \n"
                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v7.s[3]     \n"

                        "sub    %6, %6, #768                \n" // kptr -= 24 * 16;

                        "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%0, #512]          \n"
                        "vldm       %0, {d24-d31}       \n" // sum0 sum1 sum2 sum3

                        "pld        [%1, #256]          \n"
                        "vld1.u16   {d4-d7}, [%1 :64]!  \n" // r00 r01 r02 r03

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%1, #256]          \n"
                        "vld1.u16   {d12-d15}, [%1 :64] \n" // r04 r05 r06 r07

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q12, q10, d2[0]     \n"
                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        "vmla.f32   q15, q11, d8[1]     \n"
                        "vmla.f32   q12, q8, d3[0]      \n"
                        "vmla.f32   q13, q8, d5[0]      \n"
                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d5[1]      \n"
                        "vmla.f32   q14, q9, d7[1]      \n"
                        "vmla.f32   q15, q9, d9[1]      \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d6[0]     \n"
                        "vmla.f32   q13, q10, d8[0]     \n"
                        "vmla.f32   q14, q10, d10[0]    \n"
                        "vmla.f32   q15, q10, d12[0]    \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d8[1]     \n"
                        "vmla.f32   q14, q11, d10[1]    \n"
                        "vmla.f32   q15, q11, d12[1]    \n"
                        "vmla.f32   q12, q8, d7[0]      \n"
                        "vmla.f32   q13, q8, d9[0]      \n"
                        "vmla.f32   q14, q8, d11[0]     \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d9[1]      \n"
                        "vmla.f32   q14, q9, d11[1]     \n"
                        "vmla.f32   q15, q9, d13[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%2, #256]          \n"
                        "vld1.u16   {d4-d7}, [%2 :64]!  \n" // r10 r11 r12 r13

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d10[0]     \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d11[0]    \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d0[0]     \n"
                        "vmla.f32   q13, q10, d2[0]     \n"
                        "vmla.f32   q14, q10, d4[0]     \n"
                        "vmla.f32   q15, q10, d6[0]     \n"
                        "vmla.f32   q12, q11, d0[1]     \n"
                        "vmla.f32   q13, q11, d2[1]     \n"
                        "vmla.f32   q14, q11, d4[1]     \n"
                        "vmla.f32   q15, q11, d6[1]     \n"
                        "vmla.f32   q12, q8, d1[0]      \n"
                        "vmla.f32   q13, q8, d3[0]      \n"
                        "vmla.f32   q14, q8, d5[0]      \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "vmla.f32   q13, q9, d3[1]      \n"
                        "vmla.f32   q14, q9, d5[1]      \n"
                        "vmla.f32   q15, q9, d7[1]      \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%2, #256]          \n"
                        "vld1.u16   {d12-d15}, [%2 :64] \n" // r14 r15 r16 r17

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"
                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d7[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d4[0]     \n"
                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vmla.f32   q14, q10, d8[0]     \n"
                        "vmla.f32   q15, q10, d10[0]    \n"
                        "vmla.f32   q12, q11, d4[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "vmla.f32   q14, q11, d8[1]     \n"
                        "vmla.f32   q15, q11, d10[1]    \n"
                        "vmla.f32   q12, q8, d5[0]      \n"
                        "vmla.f32   q13, q8, d7[0]      \n"
                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vmla.f32   q15, q8, d11[0]     \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "vmla.f32   q14, q9, d9[1]      \n"
                        "vmla.f32   q15, q9, d11[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d6[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d7[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%3, #256]          \n"
                        "vld1.u16   {d4-d7}, [%3 :64]!  \n" // r20 r21 r22 r23

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q12, q10, d8[0]     \n"
                        "vmla.f32   q13, q10, d10[0]    \n"
                        "vmla.f32   q14, q10, d12[0]    \n"
                        "vmla.f32   q15, q10, d14[0]    \n"
                        "vmla.f32   q12, q11, d8[1]     \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "vmla.f32   q14, q11, d12[1]    \n"
                        "vmla.f32   q15, q11, d14[1]    \n"
                        "vmla.f32   q12, q8, d9[0]      \n"
                        "vmla.f32   q13, q8, d11[0]     \n"
                        "vmla.f32   q14, q8, d13[0]     \n"
                        "vmla.f32   q15, q8, d15[0]     \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "vmla.f32   q14, q9, d13[1]     \n"
                        "vmla.f32   q15, q9, d15[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%3, #256]          \n"
                        "vld1.u16   {d12-d15}, [%3 :64] \n" // r24 r25 r26 r27

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q12, q10, d2[0]     \n"
                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        "vmla.f32   q15, q11, d8[1]     \n"
                        "vmla.f32   q12, q8, d3[0]      \n"
                        "vmla.f32   q13, q8, d5[0]      \n"
                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d5[1]      \n"
                        "vmla.f32   q14, q9, d7[1]      \n"
                        "vmla.f32   q15, q9, d9[1]      \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d6[0]     \n"
                        "vmla.f32   q13, q10, d8[0]     \n"
                        "vmla.f32   q14, q10, d10[0]    \n"
                        "vmla.f32   q15, q10, d12[0]    \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d8[1]     \n"
                        "vmla.f32   q14, q11, d10[1]    \n"
                        "vmla.f32   q15, q11, d12[1]    \n"
                        "vmla.f32   q12, q8, d7[0]      \n"
                        "vmla.f32   q13, q8, d9[0]      \n"
                        "vmla.f32   q14, q8, d11[0]     \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d9[1]      \n"
                        "vmla.f32   q14, q9, d11[1]     \n"
                        "vmla.f32   q15, q9, d13[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d4-d7}, [%4 :64]!  \n" // r30 r31 r32 r33

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d10[0]     \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d11[0]    \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d0[0]     \n"
                        "vmla.f32   q13, q10, d2[0]     \n"
                        "vmla.f32   q14, q10, d4[0]     \n"
                        "vmla.f32   q15, q10, d6[0]     \n"
                        "vmla.f32   q12, q11, d0[1]     \n"
                        "vmla.f32   q13, q11, d2[1]     \n"
                        "vmla.f32   q14, q11, d4[1]     \n"
                        "vmla.f32   q15, q11, d6[1]     \n"
                        "vmla.f32   q12, q8, d1[0]      \n"
                        "vmla.f32   q13, q8, d3[0]      \n"
                        "vmla.f32   q14, q8, d5[0]      \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "vmla.f32   q13, q9, d3[1]      \n"
                        "vmla.f32   q14, q9, d5[1]      \n"
                        "vmla.f32   q15, q9, d7[1]      \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d12-d15}, [%4 :64] \n" // r34 r35 r36 r37

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"
                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d7[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d4[0]     \n"
                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vmla.f32   q14, q10, d8[0]     \n"
                        "vmla.f32   q15, q10, d10[0]    \n"
                        "vmla.f32   q12, q11, d4[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "vmla.f32   q14, q11, d8[1]     \n"
                        "vmla.f32   q15, q11, d10[1]    \n"
                        "vmla.f32   q12, q8, d5[0]      \n"
                        "vmla.f32   q13, q8, d7[0]      \n"
                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vmla.f32   q15, q8, d11[0]     \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "vmla.f32   q14, q9, d9[1]      \n"
                        "vmla.f32   q15, q9, d11[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d6[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d7[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d4-d7}, [%5 :64]!  \n" // r40 r41 r42 r43

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q12, q10, d8[0]     \n"
                        "vmla.f32   q13, q10, d10[0]    \n"
                        "vmla.f32   q14, q10, d12[0]    \n"
                        "vmla.f32   q15, q10, d14[0]    \n"
                        "vmla.f32   q12, q11, d8[1]     \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "vmla.f32   q14, q11, d12[1]    \n"
                        "vmla.f32   q15, q11, d14[1]    \n"
                        "vmla.f32   q12, q8, d9[0]      \n"
                        "vmla.f32   q13, q8, d11[0]     \n"
                        "vmla.f32   q14, q8, d13[0]     \n"
                        "vmla.f32   q15, q8, d15[0]     \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "vmla.f32   q14, q9, d13[1]     \n"
                        "vmla.f32   q15, q9, d15[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d12-d15}, [%5 :64] \n" // r44 r45 r46 r47

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q12, q10, d2[0]     \n"
                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        "vmla.f32   q15, q11, d8[1]     \n"
                        "vmla.f32   q12, q8, d3[0]      \n"
                        "vmla.f32   q13, q8, d5[0]      \n"
                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d5[1]      \n"
                        "vmla.f32   q14, q9, d7[1]      \n"
                        "vmla.f32   q15, q9, d9[1]      \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d6[0]     \n"
                        "vmla.f32   q13, q10, d8[0]     \n"
                        "vmla.f32   q14, q10, d10[0]    \n"
                        "vmla.f32   q15, q10, d12[0]    \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d8[1]     \n"
                        "vmla.f32   q14, q11, d10[1]    \n"
                        "vmla.f32   q15, q11, d12[1]    \n"
                        "vmla.f32   q12, q8, d7[0]      \n"
                        "vmla.f32   q13, q8, d9[0]      \n"
                        "vmla.f32   q14, q8, d11[0]     \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d9[1]      \n"
                        "vmla.f32   q14, q9, d11[1]     \n"
                        "vmla.f32   q15, q9, d13[1]     \n"

                        //                         "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :64] \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d10[0]     \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d11[0]    \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        "sub        %6, %6, #768        \n" // kptr -= 24 * 16;

                        "vstm       %0!, {d24-d31}      \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%1], #16   \n" // r00 r01

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v20.4s, v21.4s}, [%0]      \n" // sum0 sum1

                        "fmul   v22.4s, v16.4s, v0.s[0]     \n"
                        "fmul   v23.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h, v5.4h}, [%1] \n" // r02 r03 r04 r05
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v23.4s, v24.4s, v2.s[0]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%2], #16   \n" // r10 r11
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        "fmla   v22.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v25.4s, v1.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v22.4s, v26.4s, v0.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h, v5.4h}, [%2] \n" // r12 r13 r14 r15
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v27.4s, v1.s[3]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v1.s[0]     \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v23.4s, v16.4s, v2.s[0]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v22.4s, v18.4s, v1.s[2]     \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%3], #16   \n" // r20 r21
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        "fmla   v22.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h, v5.4h}, [%3] \n" // r22 r23 r24 r25
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v23.4s, v24.4s, v2.s[0]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%4], #16   \n" // r30 r31
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        "fmla   v22.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v25.4s, v1.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v22.4s, v26.4s, v0.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h, v5.4h}, [%4] \n" // r32 r33 r34 r35
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v27.4s, v1.s[3]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v1.s[0]     \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v23.4s, v16.4s, v2.s[0]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v22.4s, v18.4s, v1.s[2]     \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%5, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%5], #16   \n" // r40 r41
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        "fmla   v22.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h, v5.4h}, [%5] \n" // r42 r43 r44 r45
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v23.4s, v24.4s, v2.s[0]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        //                         "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6] \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"

                        "fadd   v20.4s, v20.4s, v22.4s      \n"
                        "fadd   v21.4s, v21.4s, v23.4s      \n"

                        "sub    %6, %6, #768                \n" // kptr -= 24 * 16;

                        "st1    {v20.4s, v21.4s}, [%0], #32 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    // ARMv7 NEON path: one step that updates two adjacent output pixels.
                    //
                    // Operands (all pointers are read-write; each "=r" output is tied to
                    // the same variable on the input side through a matching constraint):
                    //   %0 outptr0 : 2 x 4 f32 accumulators are loaded from here (sum0 sum1),
                    //                updated and stored back; the store post-increments by 32 bytes.
                    //   %1..%5 r0..r4 : five input row pointers. From each row 2 + 4 groups of
                    //                four u16 values are read (rX0 rX1, then rX2..rX5); only the
                    //                first load post-increments, so each row pointer ends up
                    //                advanced by 16 bytes.
                    //   %6 kptr    : weights, read as 25 blocks of 4 x 4 u16 (32 bytes each).
                    //                24 loads post-increment and the last one does not, so the
                    //                final "sub #768" restores kptr to its entry value.
                    //
                    // Every u16 value is widened with "vshll.u16 ..., #16", i.e. placed in the
                    // upper half of a 32-bit lane, and then used as f32 (bfloat16-style storage).
                    //
                    // Accumulators: q12 = sum0, q13 = sum1 (loaded from %0); q14 / q15 are
                    // second partial sums for sum0 / sum1, started with vmul and folded into
                    // q12 / q13 by the final vadd. Within each tap, the first output pixel
                    // uses the lanes of pixel N and the second output pixel uses pixel N + 1
                    // of the same row.
                    //
                    // Register roles: q0..q5 hold the six widened pixels of the current row,
                    // q8..q11 hold the widened weight columns of the current tap, and
                    // d16-d23 are reused as the landing area for the next raw weight block.
                    // The interleaving of loads, widenings and multiply-accumulates is
                    // order-dependent (q4/q5 alias d8-d11, q1 aliases d2-d3), do not reorder.
                    // NOTE(review): presumably a 5x5 stride-1 kernel given five rows and five
                    // taps per row -- confirm against the enclosing function.
                    asm volatile(
                        "pld        [%1, #128]          \n"
                        "vld1.u16   {d2-d3}, [%1 :64]!  \n" // r00 r01

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "pld        [%1, #256]          \n"
                        "vld1.u16   {d8-d11}, [%1 :64]  \n" // r02 r03 r04 r05

                        "vshll.u16  q8, d20, #16        \n"

                        "pld        [%0, #256]          \n"
                        "vld1.f32   {d24-d27}, [%0 :128] \n" // sum0 sum1

                        // row r0, tap 0: q0 (r00) -> q12/q14, q1 (r01) -> q13/q15
                        // q14 / q15 are initialised here with vmul instead of being zeroed
                        "vmul.f32   q14, q8, d0[0]      \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vmul.f32   q15, q8, d2[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vshll.u16  q2, d8, #16         \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        // row r0, tap 1: q1 (r01) -> q12/q14, q2 (r02) -> q13/q15
                        "vmla.f32   q14, q10, d2[0]     \n"
                        "vmla.f32   q15, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d3[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q3, d9, #16         \n"
                        "vmla.f32   q13, q9, d5[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // row r0, tap 2: q2 (r02) -> q12/q14, q3 (r03) -> q13/q15
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        // q0 / q1 of row r0 are no longer needed: start fetching the next row
                        "pld        [%2, #128]          \n"
                        "vld1.u16   {d2-d3}, [%2 :64]!  \n" // r10 r11

                        // row r0, tap 3: q3 (r03) -> q12/q14, q4 (r04) -> q13/q15
                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d8[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vmla.f32   q13, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // row r0, tap 4: q4 (r04) -> q12/q14, q5 (r05) -> q13/q15
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vshll.u16  q0, d2, #16         \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        // d8-d11 (q4 / q5) were last read just above, so they can be reloaded
                        "pld        [%2, #256]          \n"
                        "vld1.u16   {d8-d11}, [%2 :64]  \n" // r12 r13 r14 r15

                        // row r1, tap 0: q0 (r10) -> q12/q14, q1 (r11) -> q13/q15
                        "vmla.f32   q14, q10, d0[0]     \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vmla.f32   q15, q10, d2[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d0[1]     \n"
                        "vmla.f32   q13, q11, d2[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d1[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "vshll.u16  q2, d8, #16         \n"
                        "vmla.f32   q13, q9, d3[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // row r1, tap 1: q1 (r11) -> q12/q14, q2 (r12) -> q13/q15
                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vshll.u16  q3, d9, #16         \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        // row r1, tap 2: q2 (r12) -> q12/q14, q3 (r13) -> q13/q15
                        "vmla.f32   q14, q10, d4[0]     \n"
                        "vmla.f32   q15, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d4[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d5[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "pld        [%3, #128]          \n"
                        "vld1.u16   {d2-d3}, [%3 :64]!  \n" // r20 r21

                        // row r1, tap 3: q3 (r13) -> q12/q14, q4 (r14) -> q13/q15
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        // row r1, tap 4: q4 (r14) -> q12/q14, q5 (r15) -> q13/q15
                        "vmla.f32   q14, q10, d8[0]     \n"
                        "vmla.f32   q15, q10, d10[0]    \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d8[1]     \n"
                        "vshll.u16  q0, d2, #16         \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d11[0]     \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "pld        [%3, #256]          \n"
                        "vld1.u16   {d8-d11}, [%3 :64]  \n" // r22 r23 r24 r25

                        // row r2, tap 0: q0 (r20) -> q12/q14, q1 (r21) -> q13/q15
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vmla.f32   q15, q8, d2[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vshll.u16  q2, d8, #16         \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        // row r2, tap 1: q1 (r21) -> q12/q14, q2 (r22) -> q13/q15
                        "vmla.f32   q14, q10, d2[0]     \n"
                        "vmla.f32   q15, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d3[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q3, d9, #16         \n"
                        "vmla.f32   q13, q9, d5[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // row r2, tap 2: q2 (r22) -> q12/q14, q3 (r23) -> q13/q15
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "pld        [%4, #128]          \n"
                        "vld1.u16   {d2-d3}, [%4 :64]!  \n" // r30 r31

                        // row r2, tap 3: q3 (r23) -> q12/q14, q4 (r24) -> q13/q15
                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d8[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vmla.f32   q13, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // row r2, tap 4: q4 (r24) -> q12/q14, q5 (r25) -> q13/q15
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vshll.u16  q0, d2, #16         \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d8-d11}, [%4 :64]  \n" // r32 r33 r34 r35

                        // row r3, tap 0: q0 (r30) -> q12/q14, q1 (r31) -> q13/q15
                        "vmla.f32   q14, q10, d0[0]     \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vmla.f32   q15, q10, d2[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d0[1]     \n"
                        "vmla.f32   q13, q11, d2[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d1[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "vshll.u16  q2, d8, #16         \n"
                        "vmla.f32   q13, q9, d3[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // row r3, tap 1: q1 (r31) -> q12/q14, q2 (r32) -> q13/q15
                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vshll.u16  q3, d9, #16         \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        // row r3, tap 2: q2 (r32) -> q12/q14, q3 (r33) -> q13/q15
                        "vmla.f32   q14, q10, d4[0]     \n"
                        "vmla.f32   q15, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d4[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d5[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "pld        [%5, #128]          \n"
                        "vld1.u16   {d2-d3}, [%5 :64]!  \n" // r40 r41

                        // row r3, tap 3: q3 (r33) -> q12/q14, q4 (r34) -> q13/q15
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        // row r3, tap 4: q4 (r34) -> q12/q14, q5 (r35) -> q13/q15
                        "vmla.f32   q14, q10, d8[0]     \n"
                        "vmla.f32   q15, q10, d10[0]    \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d8[1]     \n"
                        "vshll.u16  q0, d2, #16         \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d11[0]     \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d8-d11}, [%5 :64]  \n" // r42 r43 r44 r45

                        // row r4, tap 0: q0 (r40) -> q12/q14, q1 (r41) -> q13/q15
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vmla.f32   q15, q8, d2[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vshll.u16  q2, d8, #16         \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        // row r4, tap 1: q1 (r41) -> q12/q14, q2 (r42) -> q13/q15
                        "vmla.f32   q14, q10, d2[0]     \n"
                        "vmla.f32   q15, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d3[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q3, d9, #16         \n"
                        "vmla.f32   q13, q9, d5[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // row r4, tap 2: q2 (r42) -> q12/q14, q3 (r43) -> q13/q15
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        // row r4, tap 3: q3 (r43) -> q12/q14, q4 (r44) -> q13/q15
                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d8[1]     \n"
                        "pld        [%6, #256]          \n"
                        // 25th and last weight block: no write-back, so only the 24 earlier
                        // post-incremented loads (24 * 32 bytes) have to be undone below
                        "vld1.u16   {d20-d23}, [%6 :128] \n"

                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vmla.f32   q13, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // row r4, tap 4: q4 (r44) -> q12/q14, q5 (r45) -> q13/q15
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"

                        // fold the two partial sums of each output pixel together
                        "vadd.f32   q12, q12, q14       \n"
                        "vadd.f32   q13, q13, q15       \n"

                        "sub        %6, %6, #768        \n" // kptr -= 24 * 16;

                        // write sum0 sum1 back and advance the output pointer by 32 bytes
                        "vst1.f32   {d24-d27}, [%0 :128]! \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        // matching constraints: each input starts in the register of the
                        // output with the same number, making the pointers read-write
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        // NOTE(review): q6 and q7 are declared clobbered although no instruction
                        // in this statement references them (d12-d15 never appear) -- harmless,
                        // but could be dropped from the list after confirmation.
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
#if __aarch64__
                    // Computes one output vector (4 x fp32) per loop iteration.
                    //
                    // Operands: %0 = outptr0, %1..%5 = r0..r4 (input row pointers), %6 = kptr (weights).
                    //
                    // Every 16-bit input/weight element is widened with "shll #16", which places it in
                    // the upper half of a 32-bit lane; the result is then consumed by fmul/fmla as fp32
                    // (i.e. the 16-bit storage holds the high halves of fp32 values, bfloat16-style).
                    //
                    // For each of the five rows, five 4-element input vectors are read: the first with
                    // a post-increment of 8 bytes, the other four from the advanced pointer without
                    // moving it further. Each input lane s[k] scales one weight vector, so every input
                    // vector consumes a group of four weight vectors (25 groups in total).
                    //
                    // Four independent accumulators are used: v20 starts from the value already stored
                    // at outptr0, while v21/v22/v23 are started with fmul. They are summed at the end.
                    // Weight groups alternate between v16-v19 and v24-v27 so that the next group can be
                    // loaded and widened while the current one is still being multiplied.
                    asm volatile(
                        "prfm   pldl1keep, [%1, #64]        \n"
                        "ld1    {v0.4h}, [%1], #8           \n" // r00

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        // widen 16-bit -> 32-bit lanes (value lands in the high 16 bits)
                        "shll   v0.4s, v0.4h, #16           \n"

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v1.4h, v2.4h, v3.4h, v4.4h}, [%1] \n" // r01 r02 r03 r04

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // v20 is seeded with the current contents of the output location
                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v20.4s}, [%0]              \n" // sum0

                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        // row 0: v21/v22/v23 are initialised by fmul, v20 accumulates on top of sum0
                        "fmul   v21.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmul   v22.4s, v17.4s, v0.s[1]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmul   v23.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        // v0 is free again: start fetching the next row while row 0 finishes
                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v0.4h}, [%2], #8           \n" // r10
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v1.4h, v2.4h, v3.4h, v4.4h}, [%2] \n" // r11 r12 r13 r14
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        // row 1: multiply-accumulate with r10..r14
                        "fmla   v21.4s, v24.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v22.4s, v25.4s, v0.s[1]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v23.4s, v26.4s, v0.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v1.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v0.4h}, [%3], #8           \n" // r20
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v1.4h, v2.4h, v3.4h, v4.4h}, [%3] \n" // r21 r22 r23 r24
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        // row 2: multiply-accumulate with r20..r24
                        "fmla   v21.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v22.4s, v17.4s, v0.s[1]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v23.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%4, #64]        \n"
                        "ld1    {v0.4h}, [%4], #8           \n" // r30
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v1.4h, v2.4h, v3.4h, v4.4h}, [%4] \n" // r31 r32 r33 r34
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        // row 3: multiply-accumulate with r30..r34
                        "fmla   v21.4s, v24.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v22.4s, v25.4s, v0.s[1]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v23.4s, v26.4s, v0.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v1.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%5, #64]        \n"
                        "ld1    {v0.4h}, [%5], #8           \n" // r40
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v1.4h, v2.4h, v3.4h, v4.4h}, [%5] \n" // r41 r42 r43 r44
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        // row 4: multiply-accumulate with r40..r44
                        "fmla   v21.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v22.4s, v17.4s, v0.s[1]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v23.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        // Last (25th) weight group: loaded without post-increment and without a
                        // prefetch, so kptr has advanced by exactly 24 * 32 = 768 bytes so far.
                        //                         "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6] \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"

                        // fold the four partial accumulators into v20
                        "fadd   v22.4s, v21.4s, v22.4s      \n"
                        "fadd   v23.4s, v22.4s, v23.4s      \n"
                        "fadd   v20.4s, v20.4s, v23.4s      \n"

                        // rewind kptr to where it pointed on entry (24 post-incremented loads of
                        // 16 halfwords = 32 bytes each)
                        "sub    %6, %6, #768                \n" // kptr -= 24 * 16;

                        // write the 4 fp32 results and advance outptr0 by 16 bytes
                        "st1    {v20.4s}, [%0], #16         \n"

                        // Each pointer is an "=r" output tied to an input through a matching-digit
                        // constraint, so the register is read-write and the advanced pointer value
                        // is written back to the C variable after the asm statement.
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        // "memory" because the asm reads and writes through the pointers; the
                        // vector list names every SIMD register used above.
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%1, #64]           \n"
                        "vld1.u16   {d1}, [%1 :64]!     \n" // r00

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vshll.u16  q0, d1, #16         \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "pld        [%0, #128]          \n"
                        "vld1.f32   {d24-d25}, [%0 :128] \n" // sum0

                        "vmul.f32   q13, q8, d0[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmul.f32   q14, q9, d0[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmul.f32   q15, q10, d1[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "pld        [%1, #256]          \n"
                        "vld1.u16   {d6-d9}, [%1 :64]   \n" // r01 r02 r03 r04
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q1, d6, #16         \n"
                        "vshll.u16  q2, d7, #16         \n"
                        "vshll.u16  q3, d8, #16         \n"
                        "vshll.u16  q4, d9, #16         \n"

                        "vmla.f32   q13, q10, d2[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d2[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "pld        [%2, #64]           \n"
                        "vld1.u16   {d1}, [%2 :64]!     \n" // r10
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q0, d1, #16         \n"

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d0[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d0[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d1[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "pld        [%2, #256]          \n"
                        "vld1.u16   {d6-d9}, [%2 :64]   \n" // r11 r12 r13 r14
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q1, d6, #16         \n"
                        "vshll.u16  q2, d7, #16         \n"
                        "vshll.u16  q3, d8, #16         \n"
                        "vshll.u16  q4, d9, #16         \n"

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d4[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "pld        [%3, #64]           \n"
                        "vld1.u16   {d1}, [%3 :64]!     \n" // r20
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q0, d1, #16         \n"

                        "vmla.f32   q13, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d8[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "pld        [%3, #256]          \n"
                        "vld1.u16   {d6-d9}, [%3 :64]   \n" // r21 r22 r23 r24
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q1, d6, #16         \n"
                        "vshll.u16  q2, d7, #16         \n"
                        "vshll.u16  q3, d8, #16         \n"
                        "vshll.u16  q4, d9, #16         \n"

                        "vmla.f32   q13, q10, d2[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d2[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "pld        [%4, #64]           \n"
                        "vld1.u16   {d1}, [%4 :64]!     \n" // r30
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q0, d1, #16         \n"

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d0[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d0[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d1[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d6-d9}, [%4 :64]   \n" // r31 r32 r33 r34
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q1, d6, #16         \n"
                        "vshll.u16  q2, d7, #16         \n"
                        "vshll.u16  q3, d8, #16         \n"
                        "vshll.u16  q4, d9, #16         \n"

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d4[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "pld        [%5, #64]           \n"
                        "vld1.u16   {d1}, [%5 :64]!     \n" // r40
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q0, d1, #16         \n"

                        "vmla.f32   q13, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d8[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d6-d9}, [%5 :64]   \n" // r41 r42 r43 r44
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q1, d6, #16         \n"
                        "vshll.u16  q2, d7, #16         \n"
                        "vshll.u16  q3, d8, #16         \n"
                        "vshll.u16  q4, d9, #16         \n"

                        "vmla.f32   q13, q10, d2[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d2[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        //                         "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128] \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"

                        "vadd.f32   q13, q13, q14       \n"
                        "vadd.f32   q12, q12, q15       \n"
                        "vadd.f32   q12, q12, q13       \n"

                        "sub        %6, %6, #768        \n" // kptr -= 24 * 16;

                        "vst1.f32   {d24-d25}, [%0 :128]! \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }

                // Move all five input row pointers forward by 4 * 4 (= 16) elements
                // before the next pass of the enclosing loop.
                // NOTE(review): presumably this skips the 4 trailing columns
                // (kernel width 5 minus 1), each holding 4 packed values, so the
                // pointers land on the start of the next input row — confirm
                // against the blob layout.
                r0 += 4 * 4;
                r1 += 4 * 4;
                r2 += 4 * 4;
                r3 += 4 * 4;
                r4 += 4 * 4;
            }
        }
        for (; q < inch; q++)
        {
            // Destination pointer into channel p of top_blob, addressed as
            // 16-bit unsigned values (bf16 storage, going by the variable name).
            unsigned short* outptr0_bf16 = top_blob.channel(p);

            // Read-only float pointer to the first row of out0.
            // NOTE(review): out0 looks like the fp32 accumulator filled by the
            // preceding input-channel loop; the asm below loads "sum0..sum3"
            // from one of its operands — confirm the operand mapping.
            const float* outptr0 = out0.row(0);

            // Input channel q of the source blob.
            const Mat img0 = bottom_blob.channel(q);

            // Pointers to input rows 0..4 of this channel, one per kernel row.
            // Values are stored as unsigned short; the asm widens them with a
            // 16-bit left shift before they are used in float multiply-adds.
            const unsigned short* r0 = img0.row<const unsigned short>(0);
            const unsigned short* r1 = img0.row<const unsigned short>(1);
            const unsigned short* r2 = img0.row<const unsigned short>(2);
            const unsigned short* r3 = img0.row<const unsigned short>(3);
            const unsigned short* r4 = img0.row<const unsigned short>(4);

            // Weights for output channel p, taken from row q of that kernel
            // channel (16-bit storage, widened the same way as the input).
            const unsigned short* kptr = kernel.channel(p).row<const unsigned short>(q);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n" // sum0 sum1 sum2 sum3

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2], #32 \n" // r00 r01 r02 r03

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%2] \n" // r04 r05 r06 r07

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v5.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%3], #32 \n" // r10 r11 r12 r13

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%3] \n" // r14 r15 r16 r17

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v4.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v5.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%4], #32 \n" // r20 r21 r22 r23

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%4] \n" // r24 r25 r26 r27

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v5.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%5], #32 \n" // r30 r31 r32 r33

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%5] \n" // r34 r35 r36 r37

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v4.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v5.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%6], #32 \n" // r40 r41 r42 r43

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v3.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%6] \n" // r44 r45 r46 r47

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v4.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v5.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v5.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v6.s[1]     \n"

                        //                         "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7] \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v6.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v7.s[1]     \n"
                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v7.s[3]     \n"

                        "sub    %7, %7, #768                \n" // kptr -= 24 * 16;

                        "shrn   v20.4h, v20.4s, #16         \n"
                        "shrn   v21.4h, v21.4s, #16         \n"
                        "shrn   v22.4h, v22.4s, #16         \n"
                        "shrn   v23.4h, v23.4s, #16         \n"

                        "st1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%0], #32 \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(kptr)          // %7
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%1, #512]          \n"
                        "vldm       %1!, {d24-d31}      \n" // sum0 sum1 sum2 sum3

                        "pld        [%2, #256]          \n"
                        "vld1.u16   {d4-d7}, [%2 :64]!  \n" // r00 r01 r02 r03

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%2, #256]          \n"
                        "vld1.u16   {d12-d15}, [%2 :64] \n" // r04 r05 r06 r07

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q12, q10, d2[0]     \n"
                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        "vmla.f32   q15, q11, d8[1]     \n"
                        "vmla.f32   q12, q8, d3[0]      \n"
                        "vmla.f32   q13, q8, d5[0]      \n"
                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d5[1]      \n"
                        "vmla.f32   q14, q9, d7[1]      \n"
                        "vmla.f32   q15, q9, d9[1]      \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d6[0]     \n"
                        "vmla.f32   q13, q10, d8[0]     \n"
                        "vmla.f32   q14, q10, d10[0]    \n"
                        "vmla.f32   q15, q10, d12[0]    \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d8[1]     \n"
                        "vmla.f32   q14, q11, d10[1]    \n"
                        "vmla.f32   q15, q11, d12[1]    \n"
                        "vmla.f32   q12, q8, d7[0]      \n"
                        "vmla.f32   q13, q8, d9[0]      \n"
                        "vmla.f32   q14, q8, d11[0]     \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d9[1]      \n"
                        "vmla.f32   q14, q9, d11[1]     \n"
                        "vmla.f32   q15, q9, d13[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%3, #256]          \n"
                        "vld1.u16   {d4-d7}, [%3 :64]!  \n" // r10 r11 r12 r13

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d10[0]     \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d11[0]    \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d0[0]     \n"
                        "vmla.f32   q13, q10, d2[0]     \n"
                        "vmla.f32   q14, q10, d4[0]     \n"
                        "vmla.f32   q15, q10, d6[0]     \n"
                        "vmla.f32   q12, q11, d0[1]     \n"
                        "vmla.f32   q13, q11, d2[1]     \n"
                        "vmla.f32   q14, q11, d4[1]     \n"
                        "vmla.f32   q15, q11, d6[1]     \n"
                        "vmla.f32   q12, q8, d1[0]      \n"
                        "vmla.f32   q13, q8, d3[0]      \n"
                        "vmla.f32   q14, q8, d5[0]      \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "vmla.f32   q13, q9, d3[1]      \n"
                        "vmla.f32   q14, q9, d5[1]      \n"
                        "vmla.f32   q15, q9, d7[1]      \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%3, #256]          \n"
                        "vld1.u16   {d12-d15}, [%3 :64] \n" // r14 r15 r16 r17

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"
                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d7[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d4[0]     \n"
                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vmla.f32   q14, q10, d8[0]     \n"
                        "vmla.f32   q15, q10, d10[0]    \n"
                        "vmla.f32   q12, q11, d4[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "vmla.f32   q14, q11, d8[1]     \n"
                        "vmla.f32   q15, q11, d10[1]    \n"
                        "vmla.f32   q12, q8, d5[0]      \n"
                        "vmla.f32   q13, q8, d7[0]      \n"
                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vmla.f32   q15, q8, d11[0]     \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "vmla.f32   q14, q9, d9[1]      \n"
                        "vmla.f32   q15, q9, d11[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d6[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d7[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d4-d7}, [%4 :64]!  \n" // r20 r21 r22 r23

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q12, q10, d8[0]     \n"
                        "vmla.f32   q13, q10, d10[0]    \n"
                        "vmla.f32   q14, q10, d12[0]    \n"
                        "vmla.f32   q15, q10, d14[0]    \n"
                        "vmla.f32   q12, q11, d8[1]     \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "vmla.f32   q14, q11, d12[1]    \n"
                        "vmla.f32   q15, q11, d14[1]    \n"
                        "vmla.f32   q12, q8, d9[0]      \n"
                        "vmla.f32   q13, q8, d11[0]     \n"
                        "vmla.f32   q14, q8, d13[0]     \n"
                        "vmla.f32   q15, q8, d15[0]     \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "vmla.f32   q14, q9, d13[1]     \n"
                        "vmla.f32   q15, q9, d15[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d12-d15}, [%4 :64] \n" // r24 r25 r26 r27

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q12, q10, d2[0]     \n"
                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        "vmla.f32   q15, q11, d8[1]     \n"
                        "vmla.f32   q12, q8, d3[0]      \n"
                        "vmla.f32   q13, q8, d5[0]      \n"
                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d5[1]      \n"
                        "vmla.f32   q14, q9, d7[1]      \n"
                        "vmla.f32   q15, q9, d9[1]      \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d6[0]     \n"
                        "vmla.f32   q13, q10, d8[0]     \n"
                        "vmla.f32   q14, q10, d10[0]    \n"
                        "vmla.f32   q15, q10, d12[0]    \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d8[1]     \n"
                        "vmla.f32   q14, q11, d10[1]    \n"
                        "vmla.f32   q15, q11, d12[1]    \n"
                        "vmla.f32   q12, q8, d7[0]      \n"
                        "vmla.f32   q13, q8, d9[0]      \n"
                        "vmla.f32   q14, q8, d11[0]     \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d9[1]      \n"
                        "vmla.f32   q14, q9, d11[1]     \n"
                        "vmla.f32   q15, q9, d13[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d4-d7}, [%5 :64]!  \n" // r30 r31 r32 r33

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d10[0]     \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d11[0]    \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d0[0]     \n"
                        "vmla.f32   q13, q10, d2[0]     \n"
                        "vmla.f32   q14, q10, d4[0]     \n"
                        "vmla.f32   q15, q10, d6[0]     \n"
                        "vmla.f32   q12, q11, d0[1]     \n"
                        "vmla.f32   q13, q11, d2[1]     \n"
                        "vmla.f32   q14, q11, d4[1]     \n"
                        "vmla.f32   q15, q11, d6[1]     \n"
                        "vmla.f32   q12, q8, d1[0]      \n"
                        "vmla.f32   q13, q8, d3[0]      \n"
                        "vmla.f32   q14, q8, d5[0]      \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "vmla.f32   q13, q9, d3[1]      \n"
                        "vmla.f32   q14, q9, d5[1]      \n"
                        "vmla.f32   q15, q9, d7[1]      \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d12-d15}, [%5 :64] \n" // r34 r35 r36 r37

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q12, q8, d2[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q9, d8[1]      \n"
                        "vmla.f32   q12, q10, d3[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d7[1]     \n"
                        "vmla.f32   q15, q11, d9[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d4[0]     \n"
                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vmla.f32   q14, q10, d8[0]     \n"
                        "vmla.f32   q15, q10, d10[0]    \n"
                        "vmla.f32   q12, q11, d4[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "vmla.f32   q14, q11, d8[1]     \n"
                        "vmla.f32   q15, q11, d10[1]    \n"
                        "vmla.f32   q12, q8, d5[0]      \n"
                        "vmla.f32   q13, q8, d7[0]      \n"
                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vmla.f32   q15, q8, d11[0]     \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "vmla.f32   q14, q9, d9[1]      \n"
                        "vmla.f32   q15, q9, d11[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d6[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d10[0]     \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d10[1]     \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d7[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d11[0]    \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d11[1]    \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d4-d7}, [%6 :64]!  \n" // r40 r41 r42 r43

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q12, q10, d8[0]     \n"
                        "vmla.f32   q13, q10, d10[0]    \n"
                        "vmla.f32   q14, q10, d12[0]    \n"
                        "vmla.f32   q15, q10, d14[0]    \n"
                        "vmla.f32   q12, q11, d8[1]     \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "vmla.f32   q14, q11, d12[1]    \n"
                        "vmla.f32   q15, q11, d14[1]    \n"
                        "vmla.f32   q12, q8, d9[0]      \n"
                        "vmla.f32   q13, q8, d11[0]     \n"
                        "vmla.f32   q14, q8, d13[0]     \n"
                        "vmla.f32   q15, q8, d15[0]     \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "vmla.f32   q14, q9, d13[1]     \n"
                        "vmla.f32   q15, q9, d15[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d5[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d12-d15}, [%6 :64] \n" // r44 r45 r46 r47

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q12, q10, d2[0]     \n"
                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        "vmla.f32   q15, q11, d8[1]     \n"
                        "vmla.f32   q12, q8, d3[0]      \n"
                        "vmla.f32   q13, q8, d5[0]      \n"
                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d5[1]      \n"
                        "vmla.f32   q14, q9, d7[1]      \n"
                        "vmla.f32   q15, q9, d9[1]      \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :64]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d7[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :64]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d6[0]     \n"
                        "vmla.f32   q13, q10, d8[0]     \n"
                        "vmla.f32   q14, q10, d10[0]    \n"
                        "vmla.f32   q15, q10, d12[0]    \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d8[1]     \n"
                        "vmla.f32   q14, q11, d10[1]    \n"
                        "vmla.f32   q15, q11, d12[1]    \n"
                        "vmla.f32   q12, q8, d7[0]      \n"
                        "vmla.f32   q13, q8, d9[0]      \n"
                        "vmla.f32   q14, q8, d11[0]     \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d9[1]      \n"
                        "vmla.f32   q14, q9, d11[1]     \n"
                        "vmla.f32   q15, q9, d13[1]     \n"

                        //                         "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :64] \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d10[0]     \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d14[0]     \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d14[1]     \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d11[0]    \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d15[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d15[1]    \n"

                        "sub        %7, %7, #768        \n" // kptr -= 24 * 16;

                        "vshrn.u32  d24, q12, #16       \n"
                        "vshrn.u32  d25, q13, #16       \n"
                        "vshrn.u32  d26, q14, #16       \n"
                        "vshrn.u32  d27, q15, #16       \n"

                        "vst1.u16   {d24-d27}, [%0 :64]! \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(kptr)          // %7
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    // AArch64 kernel step producing two adjacent 4-lane outputs.
                    //
                    // Operands: %0 = outptr0_bf16 (16-bit result store), %1 = outptr0 (32-bit
                    // partial sums), %2..%6 = r0..r4 (input row pointers), %7 = kptr (weights).
                    //
                    // Data flow, as written below:
                    //  - v20/v21 are loaded from %1 (sum0/sum1); v22/v23 are a second pair of
                    //    accumulators started with fmul and folded into v20/v21 by the final fadd.
                    //  - Every 16-bit input/weight vector is widened with "shll #16", i.e. the
                    //    16-bit value becomes the upper half of a 32-bit lane (bf16 -> fp32 bits).
                    //  - For each row pointer, six 4-lane pixels are read (v0,v1 then v2..v5) and
                    //    five 32-byte weight groups are applied: output 0 uses pixels 0..4 and
                    //    output 1 uses pixels 1..5 of that row.
                    //  - Results are narrowed with "shrn #16" (keeps the upper 16 bits of each
                    //    lane, no rounding) and stored through %0.
                    //
                    // Pointer movement: %0 += 16 bytes, %1 += 32 bytes, each of %2..%6 += 16 bytes
                    // (only the first load of a row post-increments), and %7 ends where it started
                    // (24 post-incremented 32-byte loads undone by the "sub #768"; the 25th load
                    // does not post-increment).
                    asm volatile(
                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%2], #16   \n" // r00 r01

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v20.4s, v21.4s}, [%1], #32 \n" // sum0 sum1

                        // v22/v23 start as plain products (fmul), so they need no zeroing
                        "fmul   v22.4s, v16.4s, v0.s[0]     \n"
                        "fmul   v23.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h, v5.4h}, [%2] \n" // r02 r03 r04 r05
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v23.4s, v24.4s, v2.s[0]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        // v0/v1 are free again here, so the next row's first two pixels are prefetched
                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%3], #16   \n" // r10 r11
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // from here the multiplies use the pixels loaded through %3
                        "fmla   v22.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v25.4s, v1.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v22.4s, v26.4s, v0.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h, v5.4h}, [%3] \n" // r12 r13 r14 r15
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v27.4s, v1.s[3]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v1.s[0]     \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v23.4s, v16.4s, v2.s[0]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v22.4s, v18.4s, v1.s[2]     \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%4], #16   \n" // r20 r21
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // from here the multiplies use the pixels loaded through %4
                        "fmla   v22.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h, v5.4h}, [%4] \n" // r22 r23 r24 r25
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v23.4s, v24.4s, v2.s[0]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%5, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%5], #16   \n" // r30 r31
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // from here the multiplies use the pixels loaded through %5
                        "fmla   v22.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v25.4s, v1.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v22.4s, v26.4s, v0.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h, v5.4h}, [%5] \n" // r32 r33 r34 r35
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v27.4s, v1.s[3]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v1.s[0]     \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v23.4s, v16.4s, v2.s[0]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v22.4s, v18.4s, v1.s[2]     \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%6, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%6], #16   \n" // r40 r41
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // from here the multiplies use the pixels loaded through %6
                        "fmla   v22.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v17.4s, v1.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h, v5.4h}, [%6] \n" // r42 r43 r44 r45
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v19.4s, v1.s[3]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v23.4s, v24.4s, v2.s[0]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        //                         "prfm   pldl1keep, [%7, #256]       \n"
                        // last weight group: loaded without post-increment, so only the 24
                        // earlier 32-byte steps need to be undone by the "sub" below
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7] \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"

                        // fold the secondary accumulators into the primary ones
                        "fadd   v20.4s, v20.4s, v22.4s      \n"
                        "fadd   v21.4s, v21.4s, v23.4s      \n"

                        "sub    %7, %7, #768                \n" // kptr -= 24 * 16;

                        // keep the upper 16 bits of each 32-bit lane (narrowing shift, no rounding)
                        "shrn   v20.4h, v20.4s, #16         \n"
                        "shrn   v21.4h, v21.4s, #16         \n"

                        "st1    {v20.4h, v21.4h}, [%0], #16 \n"

                        // every pointer is an output tied to a same-numbered input ("0".."7"),
                        // so the updated register values are written back to the C variables
                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(kptr)          // %7
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    // ARMv7 NEON variant of this loop body. One pass produces two outputs of
                    // 4 lanes each:
                    //   - input rows (%2..%6) and weights (%7) are loaded as u16 and widened with
                    //     "vshll.u16 #16", i.e. the 16 bits land in the upper half of an fp32 lane
                    //   - the fp32 starting sums are read from %1 (q12 = sum0, q13 = sum1)
                    //   - q12 + q14 accumulate the first output, q13 + q15 the second; each pair
                    //     is added together right before the store
                    //   - the result is narrowed with "vshrn.u32 #16" and stored as u16 via %0
                    // Register aliasing matters here: qN overlaps d(2N)/d(2N+1), so every vshll
                    // below only overwrites halves that were already consumed.
                    // Pointer side effects per pass: %0 += 16 bytes, %1 += 32 bytes,
                    // %2..%6 += 16 bytes each, %7 is advanced and then rewound (net unchanged).
                    asm volatile(
                        "pld        [%2, #128]          \n"
                        "vld1.u16   {d2-d3}, [%2 :64]!  \n" // r00 r01

                        // each load from %7 fetches 16 u16 weights = 4 vectors of 4 lanes,
                        // one vector per lane of the input value it is multiplied with
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        // no writeback: these four values are only peeked at, the row pointer
                        // stays 16 bytes past where it was at the start of the pass
                        "pld        [%2, #256]          \n"
                        "vld1.u16   {d8-d11}, [%2 :64]  \n" // r02 r03 r04 r05

                        "vshll.u16  q8, d20, #16        \n"

                        "pld        [%1, #256]          \n"
                        "vld1.f32   {d24-d27}, [%1 :128]! \n" // sum0 sum1

                        // q14/q15 start from a plain multiply (second accumulator pair)
                        "vmul.f32   q14, q8, d0[0]      \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vmul.f32   q15, q8, d2[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vshll.u16  q2, d8, #16         \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q14, q10, d2[0]     \n"
                        "vmla.f32   q15, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d3[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q3, d9, #16         \n"
                        "vmla.f32   q13, q9, d5[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        // q0/q1 (r00/r01) are no longer needed, so the next row is prefetched
                        // into d2-d3 while the rest of this row is still being consumed
                        "pld        [%3, #128]          \n"
                        "vld1.u16   {d2-d3}, [%3 :64]!  \n" // r10 r11

                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d8[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vmla.f32   q13, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vshll.u16  q0, d2, #16         \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "pld        [%3, #256]          \n"
                        "vld1.u16   {d8-d11}, [%3 :64]  \n" // r12 r13 r14 r15

                        // second input row (%3): same schedule as the first row
                        "vmla.f32   q14, q10, d0[0]     \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vmla.f32   q15, q10, d2[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d0[1]     \n"
                        "vmla.f32   q13, q11, d2[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d1[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "vshll.u16  q2, d8, #16         \n"
                        "vmla.f32   q13, q9, d3[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vshll.u16  q3, d9, #16         \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q14, q10, d4[0]     \n"
                        "vmla.f32   q15, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d4[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d5[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "pld        [%4, #128]          \n"
                        "vld1.u16   {d2-d3}, [%4 :64]!  \n" // r20 r21

                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q14, q10, d8[0]     \n"
                        "vmla.f32   q15, q10, d10[0]    \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d8[1]     \n"
                        "vshll.u16  q0, d2, #16         \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d11[0]     \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d8-d11}, [%4 :64]  \n" // r22 r23 r24 r25

                        // third input row (%4)
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vmla.f32   q15, q8, d2[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vshll.u16  q2, d8, #16         \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q14, q10, d2[0]     \n"
                        "vmla.f32   q15, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d3[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q3, d9, #16         \n"
                        "vmla.f32   q13, q9, d5[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "pld        [%5, #128]          \n"
                        "vld1.u16   {d2-d3}, [%5 :64]!  \n" // r30 r31

                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d8[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vmla.f32   q13, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vshll.u16  q0, d2, #16         \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d8-d11}, [%5 :64]  \n" // r32 r33 r34 r35

                        // fourth input row (%5)
                        "vmla.f32   q14, q10, d0[0]     \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vmla.f32   q15, q10, d2[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d0[1]     \n"
                        "vmla.f32   q13, q11, d2[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d1[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "vshll.u16  q2, d8, #16         \n"
                        "vmla.f32   q13, q9, d3[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vshll.u16  q3, d9, #16         \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q14, q10, d4[0]     \n"
                        "vmla.f32   q15, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d4[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d5[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "pld        [%6, #128]          \n"
                        "vld1.u16   {d2-d3}, [%6 :64]!  \n" // r40 r41

                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q14, q10, d8[0]     \n"
                        "vmla.f32   q15, q10, d10[0]    \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d8[1]     \n"
                        "vshll.u16  q0, d2, #16         \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d11[0]     \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d8-d11}, [%6 :64]  \n" // r42 r43 r44 r45

                        // fifth and last input row (%6)
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vmla.f32   q15, q8, d2[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vshll.u16  q2, d8, #16         \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q14, q10, d2[0]     \n"
                        "vmla.f32   q15, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d3[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q3, d9, #16         \n"
                        "vmla.f32   q13, q9, d5[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d8[1]     \n"
                        // 25th and final weight load: deliberately no writeback, so %7 has
                        // advanced by exactly 24 * 32 = 768 bytes at this point
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128] \n"

                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vmla.f32   q13, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"

                        // merge the two accumulator pairs: q12 = first output, q13 = second
                        "vadd.f32   q12, q12, q14       \n"
                        "vadd.f32   q13, q13, q15       \n"

                        // rewind the weight pointer to its value at entry; 768 bytes is
                        // 24 loads * 16 u16 elements (the "24 * 16" below counts elements)
                        "sub        %7, %7, #768        \n" // kptr -= 24 * 16;

                        // keep only the upper 16 bits of every fp32 lane (plain truncation,
                        // vshrn does not round)
                        "vshrn.u32  d24, q12, #16       \n"
                        "vshrn.u32  d25, q13, #16       \n"

                        "vst1.u16   {d24-d25}, [%0 :64]! \n"

                        // every pointer is an in/out operand: the "0".."7" inputs tie each
                        // initial value to the same register as the matching output
                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(kptr)          // %7
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(kptr)
                        // NOTE(review): q6 and q7 are listed as clobbered although the asm text
                        // above never references them (only q0-q5 and q8-q15 are used)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
#if __aarch64__
                    // AArch64 body for one iteration of the `j` loop: accumulates 5 input rows x 5 taps
                    // into one 4-lane output group.
                    //
                    // Operands (all tied read/write through the matching "0".."7" input constraints):
                    //   %0 = outptr0_bf16  16-bit store target, advanced by 8 bytes
                    //   %1 = outptr0       fp32 running sum, only read here, advanced by 16 bytes
                    //   %2..%6 = r0..r4    input rows, each advanced by 8 bytes (one 4-lane group)
                    //   %7 = kptr          weights, walked forward 768 bytes and rewound at the end
                    //
                    // Register roles:
                    //   v0      first input group of the current row
                    //   v1..v4  the following four input groups of the current row
                    //   v16-v19 / v24-v27  two alternating weight buffers (one 32-byte tap each)
                    //   v20     accumulator seeded from outptr0; v21..v23 are seeded by fmul
                    //
                    // Every 16-bit value is widened with `shll ..., #16`, which places it in the high
                    // half of a 32-bit lane so it can be used as an fp32 operand (bf16 -> fp32).
                    // Loads of the next weight tap / next input row are interleaved with the
                    // multiply-accumulates of the current one, so instruction order matters.
                    asm volatile(
                        // ---- row 0 (%2): load first input group and first weight tap ----
                        "prfm   pldl1keep, [%2, #64]        \n"
                        "ld1    {v0.4h}, [%2], #8           \n" // r00

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v0.4s, v0.4h, #16           \n"

                        // no post-increment: %2 was already advanced past r00 above
                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v1.4h, v2.4h, v3.4h, v4.4h}, [%2] \n" // r01 r02 r03 r04

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v20.4s}, [%1], #16         \n" // sum0

                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        // row 0, tap 0 (x r00): fmul initialises v21..v23, v20 continues from sum0
                        "fmul   v21.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmul   v22.4s, v17.4s, v0.s[1]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmul   v23.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        // row 0, tap 1 (x r01)
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        // row 0, tap 2 (x r02)
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        // row 0, tap 3 (x r03)
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        // ---- row 1 (%3): v0 is free now, so fetch r10 while row 0 tap 4 runs ----
                        "prfm   pldl1keep, [%3, #64]        \n"
                        "ld1    {v0.4h}, [%3], #8           \n" // r10
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        // row 0, tap 4 (x r04)
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        // v1..v4 are only reloaded after the last use of v4 above
                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v1.4h, v2.4h, v3.4h, v4.4h}, [%3] \n" // r11 r12 r13 r14
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        // row 1, tap 0 (x r10)
                        "fmla   v21.4s, v24.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v22.4s, v25.4s, v0.s[1]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v23.4s, v26.4s, v0.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        // row 1, tap 1 (x r11)
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v1.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        // row 1, tap 2 (x r12)
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        // row 1, tap 3 (x r13)
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        // ---- row 2 (%4): fetch r20 while row 1 tap 4 runs ----
                        "prfm   pldl1keep, [%4, #64]        \n"
                        "ld1    {v0.4h}, [%4], #8           \n" // r20
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        // row 1, tap 4 (x r14)
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v1.4h, v2.4h, v3.4h, v4.4h}, [%4] \n" // r21 r22 r23 r24
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        // row 2, tap 0 (x r20)
                        "fmla   v21.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v22.4s, v17.4s, v0.s[1]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v23.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        // row 2, tap 1 (x r21)
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        // weights for tap 3 are fetched here, right after the last use of v27
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        // row 2, tap 2 (x r22)
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        // row 2, tap 3 (x r23)
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        // ---- row 3 (%5): fetch r30 while row 2 tap 4 runs ----
                        "prfm   pldl1keep, [%5, #64]        \n"
                        "ld1    {v0.4h}, [%5], #8           \n" // r30
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        // row 2, tap 4 (x r24)
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v1.4h, v2.4h, v3.4h, v4.4h}, [%5] \n" // r31 r32 r33 r34
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        // row 3, tap 0 (x r30)
                        "fmla   v21.4s, v24.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v22.4s, v25.4s, v0.s[1]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v23.4s, v26.4s, v0.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        // row 3, tap 1 (x r31)
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v1.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        // row 3, tap 2 (x r32)
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        // row 3, tap 3 (x r33)
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        // ---- row 4 (%6): fetch r40 while row 3 tap 4 runs ----
                        "prfm   pldl1keep, [%6, #64]        \n"
                        "ld1    {v0.4h}, [%6], #8           \n" // r40
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        // row 3, tap 4 (x r34)
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v1.4h, v2.4h, v3.4h, v4.4h}, [%6] \n" // r41 r42 r43 r44
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        // row 4, tap 0 (x r40)
                        "fmla   v21.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "fmla   v22.4s, v17.4s, v0.s[1]     \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "fmla   v23.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        // row 4, tap 1 (x r41)
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        // row 4, tap 2 (x r42)
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        // row 4, tap 3 (x r43)
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        // 25th and last weight tap: loaded without prefetch and without post-increment,
                        // so %7 has advanced by exactly 24 * 32 = 768 bytes at this point
                        //                         "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7] \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        // row 4, tap 4 (x r44)
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"

                        // fold the four partial accumulators into v20
                        "fadd   v22.4s, v21.4s, v22.4s      \n"
                        "fadd   v23.4s, v22.4s, v23.4s      \n"
                        "fadd   v20.4s, v20.4s, v23.4s      \n"

                        // rewind the weight pointer so the next iteration starts at the first tap
                        "sub    %7, %7, #768                \n" // kptr -= 24 * 16;

                        // keep the high 16 bits of each fp32 lane (non-rounding narrow, fp32 -> bf16)
                        "shrn   v20.4h, v20.4s, #16         \n"

                        "st1    {v20.4h}, [%0], #8          \n"

                        // outputs: every pointer is modified by the asm (post-increment / sub)
                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(kptr)          // %7
                        // inputs: tied to the same registers as outputs %0..%7
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(kptr)
                        // clobbers: memory is read/written through the pointers; list covers every vector register used
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%2, #64]           \n"
                        "vld1.u16   {d1}, [%2 :64]!     \n" // r00

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vshll.u16  q0, d1, #16         \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d24-d25}, [%1 :128]! \n" // sum0

                        "vmul.f32   q13, q8, d0[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmul.f32   q14, q9, d0[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmul.f32   q15, q10, d1[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "pld        [%2, #256]          \n"
                        "vld1.u16   {d6-d9}, [%2 :64]   \n" // r01 r02 r03 r04
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q1, d6, #16         \n"
                        "vshll.u16  q2, d7, #16         \n"
                        "vshll.u16  q3, d8, #16         \n"
                        "vshll.u16  q4, d9, #16         \n"

                        "vmla.f32   q13, q10, d2[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d2[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "pld        [%3, #64]           \n"
                        "vld1.u16   {d1}, [%3 :64]!     \n" // r10
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q0, d1, #16         \n"

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d0[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d0[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d1[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "pld        [%3, #256]          \n"
                        "vld1.u16   {d6-d9}, [%3 :64]   \n" // r11 r12 r13 r14
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q1, d6, #16         \n"
                        "vshll.u16  q2, d7, #16         \n"
                        "vshll.u16  q3, d8, #16         \n"
                        "vshll.u16  q4, d9, #16         \n"

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d4[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "pld        [%4, #64]           \n"
                        "vld1.u16   {d1}, [%4 :64]!     \n" // r20
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q0, d1, #16         \n"

                        "vmla.f32   q13, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d8[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d6-d9}, [%4 :64]   \n" // r21 r22 r23 r24
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q1, d6, #16         \n"
                        "vshll.u16  q2, d7, #16         \n"
                        "vshll.u16  q3, d8, #16         \n"
                        "vshll.u16  q4, d9, #16         \n"

                        "vmla.f32   q13, q10, d2[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d2[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "pld        [%5, #64]           \n"
                        "vld1.u16   {d1}, [%5 :64]!     \n" // r30
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q0, d1, #16         \n"

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d0[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d0[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d1[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d6-d9}, [%5 :64]   \n" // r31 r32 r33 r34
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q1, d6, #16         \n"
                        "vshll.u16  q2, d7, #16         \n"
                        "vshll.u16  q3, d8, #16         \n"
                        "vshll.u16  q4, d9, #16         \n"

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d4[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "pld        [%6, #64]           \n"
                        "vld1.u16   {d1}, [%6 :64]!     \n" // r40
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q0, d1, #16         \n"

                        "vmla.f32   q13, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d8[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d6-d9}, [%6 :64]   \n" // r41 r42 r43 r44
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q1, d6, #16         \n"
                        "vshll.u16  q2, d7, #16         \n"
                        "vshll.u16  q3, d8, #16         \n"
                        "vshll.u16  q4, d9, #16         \n"

                        "vmla.f32   q13, q10, d2[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d2[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        //                         "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128] \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"

                        "vadd.f32   q13, q13, q14       \n"
                        "vadd.f32   q12, q12, q15       \n"
                        "vadd.f32   q12, q12, q13       \n"

                        "sub        %7, %7, #768        \n" // kptr -= 24 * 16;

                        "vshrn.u32  d24, q12, #16       \n"

                        "vst1.u16   {d24}, [%0 :64]!    \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(kptr)          // %7
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }

                r0 += 4 * 4;
                r1 += 4 * 4;
                r2 += 4 * 4;
                r3 += 4 * 4;
                r4 += 4 * 4;
            }
        }
    }
}

static void conv5x5s2_pack4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    Mat top_blob_fp32(outw, outh, opt.num_threads, (size_t)4u * 4, 4, opt.workspace_allocator);

    const int tailstep = (w - 2 * outw + w) * 4;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob_fp32.channel(get_omp_thread_num());

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
        out0.fill(_bias0);

        int q = 0;
        for (; q < inch - 1; q++)
        {
            float* outptr0 = out0.row(0);

            const Mat img0 = bottom_blob.channel(q);

            const unsigned short* r0 = img0.row<const unsigned short>(0);
            const unsigned short* r1 = img0.row<const unsigned short>(1);
            const unsigned short* r2 = img0.row<const unsigned short>(2);
            const unsigned short* r3 = img0.row<const unsigned short>(3);
            const unsigned short* r4 = img0.row<const unsigned short>(4);

            const unsigned short* kptr = kernel.channel(p).row<const unsigned short>(q);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n" // r00 r01 r02 r03

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%1], #32 \n" // r04 r05 r06 r07

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0] \n" // sum0 sum1 sum2 sum3

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%1, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%1] \n" // r08 r09 r010

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v28.s[3]    \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v29.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v29.s[1]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v29.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v29.s[3]    \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2], #32 \n" // r10 r11 r12 r13

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v23.4s, v16.4s, v30.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v28.s[1]    \n"
                        "fmla   v23.4s, v17.4s, v30.s[1]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v23.4s, v18.4s, v30.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v28.s[3]    \n"
                        "fmla   v23.4s, v19.4s, v30.s[3]    \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%2], #32 \n" // r14 r15 r16 r17

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%2, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%2] \n" // r18 r19 r110

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v20.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v28.s[3]    \n"

                        "fmla   v20.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v29.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v7.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v29.s[1]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v29.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v7.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v29.s[3]    \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%3], #32 \n" // r20 r21 r22 r23

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v28.s[0]    \n"
                        "fmla   v23.4s, v24.4s, v30.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v28.s[1]    \n"
                        "fmla   v23.4s, v25.4s, v30.s[1]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v28.s[2]    \n"
                        "fmla   v23.4s, v26.4s, v30.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v28.s[3]    \n"
                        "fmla   v23.4s, v27.4s, v30.s[3]    \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%3], #32 \n" // r24 r25 r26 r27

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%3, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%3] \n" // r28 r29 r210

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v28.s[3]    \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v29.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v29.s[1]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v29.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v29.s[3]    \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%4], #32 \n" // r30 r31 r32 r33

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v23.4s, v16.4s, v30.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v28.s[1]    \n"
                        "fmla   v23.4s, v17.4s, v30.s[1]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v23.4s, v18.4s, v30.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v28.s[3]    \n"
                        "fmla   v23.4s, v19.4s, v30.s[3]    \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%4], #32 \n" // r34 r35 r36 r37

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%4, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%4] \n" // r38 r39 r310

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v20.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v28.s[3]    \n"

                        "fmla   v20.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v29.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v7.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v29.s[1]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v29.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v7.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v29.s[3]    \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%5], #32 \n" // r40 r41 r42 r43

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v28.s[0]    \n"
                        "fmla   v23.4s, v24.4s, v30.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v28.s[1]    \n"
                        "fmla   v23.4s, v25.4s, v30.s[1]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v28.s[2]    \n"
                        "fmla   v23.4s, v26.4s, v30.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v28.s[3]    \n"
                        "fmla   v23.4s, v27.4s, v30.s[3]    \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%5], #32 \n" // r44 r45 r46 r47

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%5, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%5] \n" // r48 r49 r410

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v28.s[3]    \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v29.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v29.s[1]    \n"

                        //                         "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6] \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v29.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v29.s[3]    \n"

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v23.4s, v16.4s, v30.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v28.s[1]    \n"
                        "fmla   v23.4s, v17.4s, v30.s[1]    \n"
                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v23.4s, v18.4s, v30.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v28.s[3]    \n"
                        "fmla   v23.4s, v19.4s, v30.s[3]    \n"

                        "sub    %6, %6, #768                \n" // kptr -= 24 * 16;

                        "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%0, #512]          \n"
                        "vldm       %0, {d24-d31}       \n" // sum0 sum1 sum2 sum3

                        "pld        [%1, #256]          \n"
                        "vld1.u16   {d4-d7}, [%1 :64]!  \n" // r00 r01 r02 r03

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%1, #256]          \n"
                        "vld1.u16   {d12-d15}, [%1 :64]! \n" // r04 r05 r06 r07

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d2[0]     \n"
                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vmla.f32   q14, q10, d10[0]    \n"
                        "vmla.f32   q15, q10, d14[0]    \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "vmla.f32   q14, q11, d10[1]    \n"
                        "vmla.f32   q15, q11, d14[1]    \n"
                        "vmla.f32   q12, q8, d3[0]      \n"
                        "vmla.f32   q13, q8, d7[0]      \n"
                        "vmla.f32   q14, q8, d11[0]     \n"
                        "vmla.f32   q15, q8, d15[0]     \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "vmla.f32   q14, q9, d11[1]     \n"
                        "vmla.f32   q15, q9, d15[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%1, #128]          \n"
                        "vld1.u16   {d2-d3}, [%1 :64]!  \n" // r08 r09

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d6[0]     \n"
                        "vmla.f32   q13, q10, d10[0]    \n"
                        "vmla.f32   q14, q10, d14[0]    \n"
                        "vmla.f32   q15, q10, d2[0]     \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "vmla.f32   q14, q11, d14[1]    \n"
                        "vmla.f32   q15, q11, d2[1]     \n"
                        "vmla.f32   q12, q8, d7[0]      \n"
                        "vmla.f32   q13, q8, d11[0]     \n"
                        "vmla.f32   q14, q8, d15[0]     \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "vmla.f32   q14, q9, d15[1]     \n"
                        "vmla.f32   q15, q9, d3[1]      \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%1, #64]           \n"
                        "vld1.u16   {d5}, [%1 :64]      \n" // r010

                        "vshll.u16  q2, d5, #16         \n"

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d12[0]     \n"
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d13[0]    \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"

                        "pld        [%2, #256]          \n"
                        "vld1.u16   {d12-d15}, [%2 :64]! \n" // r10 r11 r12 r13

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%2, #256]          \n"
                        "vld1.u16   {d4-d7}, [%2 :64]!  \n" // r14 r15 r16 r17

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q12, q10, d8[0]     \n"
                        "vmla.f32   q13, q10, d12[0]    \n"
                        "vmla.f32   q14, q10, d0[0]     \n"
                        "vmla.f32   q15, q10, d4[0]     \n"
                        "vmla.f32   q12, q11, d8[1]     \n"
                        "vmla.f32   q13, q11, d12[1]    \n"
                        "vmla.f32   q14, q11, d0[1]     \n"
                        "vmla.f32   q15, q11, d4[1]     \n"
                        "vmla.f32   q12, q8, d9[0]      \n"
                        "vmla.f32   q13, q8, d13[0]     \n"
                        "vmla.f32   q14, q8, d1[0]      \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vmla.f32   q13, q9, d13[1]     \n"
                        "vmla.f32   q14, q9, d1[1]      \n"
                        "vmla.f32   q15, q9, d5[1]      \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d10[0]     \n"
                        "vmla.f32   q13, q8, d14[0]     \n"
                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d10[1]     \n"
                        "vmla.f32   q13, q9, d14[1]     \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d11[0]    \n"
                        "vmla.f32   q13, q10, d15[0]    \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d11[1]    \n"
                        "vmla.f32   q13, q11, d15[1]    \n"
                        "vmla.f32   q14, q11, d3[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%2, #128]          \n"
                        "vld1.u16   {d10-d11}, [%2 :64]! \n" // r18 r19

                        "vshll.u16  q4, d10, #16        \n"
                        "vshll.u16  q5, d11, #16        \n"

                        "vmla.f32   q12, q10, d12[0]    \n"
                        "vmla.f32   q13, q10, d0[0]     \n"
                        "vmla.f32   q14, q10, d4[0]     \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vmla.f32   q12, q11, d12[1]    \n"
                        "vmla.f32   q13, q11, d0[1]     \n"
                        "vmla.f32   q14, q11, d4[1]     \n"
                        "vmla.f32   q15, q11, d8[1]     \n"
                        "vmla.f32   q12, q8, d13[0]     \n"
                        "vmla.f32   q13, q8, d1[0]      \n"
                        "vmla.f32   q14, q8, d5[0]      \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vmla.f32   q12, q9, d13[1]     \n"
                        "vmla.f32   q13, q9, d1[1]      \n"
                        "vmla.f32   q14, q9, d5[1]      \n"
                        "vmla.f32   q15, q9, d9[1]      \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d14[0]     \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d14[1]     \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d15[0]    \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d15[1]    \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d7[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%2, #64]           \n"
                        "vld1.u16   {d13}, [%2 :64]     \n" // r110

                        "vshll.u16  q6, d13, #16        \n"

                        "vmla.f32   q12, q10, d0[0]     \n"
                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vmla.f32   q14, q10, d8[0]     \n"
                        "vmla.f32   q15, q10, d12[0]    \n"
                        "vmla.f32   q12, q11, d0[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "vmla.f32   q14, q11, d8[1]     \n"
                        "vmla.f32   q15, q11, d12[1]    \n"
                        "vmla.f32   q12, q8, d1[0]      \n"
                        "vmla.f32   q13, q8, d5[0]      \n"
                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "vmla.f32   q13, q9, d5[1]      \n"

                        "pld        [%3, #256]          \n"
                        "vld1.u16   {d4-d7}, [%3 :64]!  \n" // r20 r21 r22 r23

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q14, q9, d9[1]      \n"
                        "vmla.f32   q15, q9, d13[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%3, #256]          \n"
                        "vld1.u16   {d12-d15}, [%3 :64]! \n" // r24 r25 r26 r27

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d2[0]     \n"
                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vmla.f32   q14, q10, d10[0]    \n"
                        "vmla.f32   q15, q10, d14[0]    \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "vmla.f32   q14, q11, d10[1]    \n"
                        "vmla.f32   q15, q11, d14[1]    \n"
                        "vmla.f32   q12, q8, d3[0]      \n"
                        "vmla.f32   q13, q8, d7[0]      \n"
                        "vmla.f32   q14, q8, d11[0]     \n"
                        "vmla.f32   q15, q8, d15[0]     \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "vmla.f32   q14, q9, d11[1]     \n"
                        "vmla.f32   q15, q9, d15[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%3, #128]          \n"
                        "vld1.u16   {d2-d3}, [%3 :64]!  \n" // r28 r29

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d6[0]     \n"
                        "vmla.f32   q13, q10, d10[0]    \n"
                        "vmla.f32   q14, q10, d14[0]    \n"
                        "vmla.f32   q15, q10, d2[0]     \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "vmla.f32   q14, q11, d14[1]    \n"
                        "vmla.f32   q15, q11, d2[1]     \n"
                        "vmla.f32   q12, q8, d7[0]      \n"
                        "vmla.f32   q13, q8, d11[0]     \n"
                        "vmla.f32   q14, q8, d15[0]     \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "vmla.f32   q14, q9, d15[1]     \n"
                        "vmla.f32   q15, q9, d3[1]      \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%3, #64]           \n"
                        "vld1.u16   {d5}, [%3 :64]      \n" // r210

                        "vshll.u16  q2, d5, #16         \n"

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d12[0]     \n"
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d13[0]    \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d12-d15}, [%4 :64]! \n" // r30 r31 r32 r33

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d4-d7}, [%4 :64]!  \n" // r34 r35 r36 r37

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q12, q10, d8[0]     \n"
                        "vmla.f32   q13, q10, d12[0]    \n"
                        "vmla.f32   q14, q10, d0[0]     \n"
                        "vmla.f32   q15, q10, d4[0]     \n"
                        "vmla.f32   q12, q11, d8[1]     \n"
                        "vmla.f32   q13, q11, d12[1]    \n"
                        "vmla.f32   q14, q11, d0[1]     \n"
                        "vmla.f32   q15, q11, d4[1]     \n"
                        "vmla.f32   q12, q8, d9[0]      \n"
                        "vmla.f32   q13, q8, d13[0]     \n"
                        "vmla.f32   q14, q8, d1[0]      \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vmla.f32   q13, q9, d13[1]     \n"
                        "vmla.f32   q14, q9, d1[1]      \n"
                        "vmla.f32   q15, q9, d5[1]      \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d10[0]     \n"
                        "vmla.f32   q13, q8, d14[0]     \n"
                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d10[1]     \n"
                        "vmla.f32   q13, q9, d14[1]     \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d11[0]    \n"
                        "vmla.f32   q13, q10, d15[0]    \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d11[1]    \n"
                        "vmla.f32   q13, q11, d15[1]    \n"
                        "vmla.f32   q14, q11, d3[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%4, #128]          \n"
                        "vld1.u16   {d10-d11}, [%4 :64]! \n" // r38 r39

                        "vshll.u16  q4, d10, #16        \n"
                        "vshll.u16  q5, d11, #16        \n"

                        "vmla.f32   q12, q10, d12[0]    \n"
                        "vmla.f32   q13, q10, d0[0]     \n"
                        "vmla.f32   q14, q10, d4[0]     \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vmla.f32   q12, q11, d12[1]    \n"
                        "vmla.f32   q13, q11, d0[1]     \n"
                        "vmla.f32   q14, q11, d4[1]     \n"
                        "vmla.f32   q15, q11, d8[1]     \n"
                        "vmla.f32   q12, q8, d13[0]     \n"
                        "vmla.f32   q13, q8, d1[0]      \n"
                        "vmla.f32   q14, q8, d5[0]      \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vmla.f32   q12, q9, d13[1]     \n"
                        "vmla.f32   q13, q9, d1[1]      \n"
                        "vmla.f32   q14, q9, d5[1]      \n"
                        "vmla.f32   q15, q9, d9[1]      \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d14[0]     \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d14[1]     \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d15[0]    \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d15[1]    \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d7[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%4, #64]           \n"
                        "vld1.u16   {d13}, [%4 :64]     \n" // r310

                        "vshll.u16  q6, d13, #16        \n"

                        "vmla.f32   q12, q10, d0[0]     \n"
                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vmla.f32   q14, q10, d8[0]     \n"
                        "vmla.f32   q15, q10, d12[0]    \n"
                        "vmla.f32   q12, q11, d0[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "vmla.f32   q14, q11, d8[1]     \n"
                        "vmla.f32   q15, q11, d12[1]    \n"
                        "vmla.f32   q12, q8, d1[0]      \n"
                        "vmla.f32   q13, q8, d5[0]      \n"
                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "vmla.f32   q13, q9, d5[1]      \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d4-d7}, [%5 :64]!  \n" // r40 r41 r42 r43

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q14, q9, d9[1]      \n"
                        "vmla.f32   q15, q9, d13[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d12-d15}, [%5 :64]! \n" // r44 r45 r46 r47

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d2[0]     \n"
                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vmla.f32   q14, q10, d10[0]    \n"
                        "vmla.f32   q15, q10, d14[0]    \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "vmla.f32   q14, q11, d10[1]    \n"
                        "vmla.f32   q15, q11, d14[1]    \n"
                        "vmla.f32   q12, q8, d3[0]      \n"
                        "vmla.f32   q13, q8, d7[0]      \n"
                        "vmla.f32   q14, q8, d11[0]     \n"
                        "vmla.f32   q15, q8, d15[0]     \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "vmla.f32   q14, q9, d11[1]     \n"
                        "vmla.f32   q15, q9, d15[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%5, #128]          \n"
                        "vld1.u16   {d2-d3}, [%5 :64]!  \n" // r48 r49

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d6[0]     \n"
                        "vmla.f32   q13, q10, d10[0]    \n"
                        "vmla.f32   q14, q10, d14[0]    \n"
                        "vmla.f32   q15, q10, d2[0]     \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "vmla.f32   q14, q11, d14[1]    \n"
                        "vmla.f32   q15, q11, d2[1]     \n"
                        "vmla.f32   q12, q8, d7[0]      \n"
                        "vmla.f32   q13, q8, d11[0]     \n"
                        "vmla.f32   q14, q8, d15[0]     \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "vmla.f32   q14, q9, d15[1]     \n"
                        "vmla.f32   q15, q9, d3[1]      \n"

                        //                         "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128] \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%5, #64]           \n"
                        "vld1.u16   {d5}, [%5 :64]      \n" // r410

                        "vshll.u16  q2, d5, #16         \n"

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d12[0]     \n"
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d13[0]    \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"
                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "sub        %6, %6, #768        \n" // kptr -= 24 * 16;

                        "sub        %1, %1, #16         \n"
                        "sub        %2, %2, #16         \n"
                        "sub        %3, %3, #16         \n"
                        "sub        %4, %4, #16         \n"
                        "sub        %5, %5, #16         \n"

                        "vstm       %0!, {d24-d31}      \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    // Two-outputs-per-iteration path (aarch64).
                    //
                    // Operands: %0 = outptr0, %1..%5 = r0..r4 (five input row pointers),
                    // %6 = kptr (kernel weights).
                    //
                    // Data flow, as established by the instructions below:
                    //  * Inputs and weights are read as 16-bit lanes and widened with
                    //    "shll #16", i.e. each 16-bit value becomes the upper half of a
                    //    32-bit lane that is then consumed by fp32 fmul/fmla
                    //    (bf16-style storage -- NOTE(review): confirm against the caller).
                    //  * Per row, seven 4-lane input vectors are read (rX0..rX6). The first
                    //    output uses rX0..rX4 and the second uses rX2..rX6, so the two
                    //    outputs are two input positions apart; each row pointer advances
                    //    by 32 bytes (four input vectors) per iteration.
                    //  * 25 kernel taps (5 per row, labelled kRC below) are read from %6,
                    //    each as four 4-lane vectors; vector i of a tap is multiplied by
                    //    lane i of the input vector.
                    //  * v20/v21 hold the sums loaded from %0; v22/v23 are a second pair of
                    //    partial sums started with fmul and folded in with fadd at the end
                    //    (presumably to shorten the fmla dependency chains).
                    //  * %6 is rewound by 768 bytes at the end (24 post-incremented tap
                    //    loads of 32 bytes each; the 25th load does not post-increment),
                    //    so kptr is unchanged after the statement.
                    asm volatile(
                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n" // r00 r01 r02 r03

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n" // k00

                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v16.4s, v16.4h, #16         \n"

                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v20.4s, v21.4s}, [%0]      \n" // sum0 sum1

                        // v22/v23 start as fresh partial sums (fmul), v20/v21 accumulate onto the loaded sums
                        "fmul   v22.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n" // k01
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmul   v23.4s, v16.4s, v2.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "prfm   pldl1keep, [%1, #192]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h}, [%1] \n" // r04 r05 r06
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"

                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n" // k02
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n" // k03
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n" // k04
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        // v0-v3 are free from here on; v4/v6 (r04/r06) are still needed for k04 below
                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2], #32 \n" // r10 r11 r12 r13
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n" // k10
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        "fmla   v22.4s, v24.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n" // k11
                        "fmla   v23.4s, v24.4s, v2.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "prfm   pldl1keep, [%2, #192]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h}, [%2] \n" // r14 r15 r16
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"

                        "fmla   v22.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n" // k12
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n" // k13
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n" // k14
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%3], #32 \n" // r20 r21 r22 r23
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n" // k20
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v6.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v6.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        "fmla   v22.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n" // k21
                        "fmla   v23.4s, v16.4s, v2.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "prfm   pldl1keep, [%3, #192]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h}, [%3] \n" // r24 r25 r26
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"

                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n" // k22
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n" // k23
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n" // k24
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%4], #32 \n" // r30 r31 r32 r33
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n" // k30
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        "fmla   v22.4s, v24.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n" // k31
                        "fmla   v23.4s, v24.4s, v2.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "prfm   pldl1keep, [%4, #192]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h}, [%4] \n" // r34 r35 r36
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"

                        "fmla   v22.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n" // k32
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n" // k33
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n" // k34
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%5], #32 \n" // r40 r41 r42 r43
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n" // k40
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v6.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v6.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        "fmla   v22.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n" // k41
                        "fmla   v23.4s, v16.4s, v2.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "prfm   pldl1keep, [%5, #192]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h}, [%5] \n" // r44 r45 r46
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"

                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n" // k42
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n" // k43
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        //                         "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6] \n" // k44, last tap: no post-increment
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"

                        // fold the second pair of partial sums into the output sums
                        "fadd   v20.4s, v20.4s, v22.4s      \n"
                        "fadd   v21.4s, v21.4s, v23.4s      \n"

                        "sub    %6, %6, #768                \n" // kptr -= 24 * 16;

                        // write both outputs back and advance the output pointer by 32 bytes
                        "st1    {v20.4s, v21.4s}, [%0], #32 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%1, #256]          \n"
                        "vld1.u16   {d4-d7}, [%1 :64]!  \n" // r00 r01 r02 r03

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "pld        [%0, #256]          \n"
                        "vld1.f32   {d24-d27}, [%0 :128] \n" // sum0 sum1

                        "vmul.f32   q14, q8, d0[0]      \n"
                        "vmul.f32   q15, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "pld        [%1, #192]          \n"
                        "vld1.u16   {d10-d12}, [%1 :64] \n" // r04 r05 r06
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vshll.u16  q6, d12, #16        \n"

                        "vmla.f32   q14, q10, d2[0]     \n"
                        "vmla.f32   q15, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d3[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vmla.f32   q15, q10, d10[0]    \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d11[0]     \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "pld        [%2, #256]          \n"
                        "vld1.u16   {d4-d7}, [%2 :64]!  \n" // r10 r11 r12 r13
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q14, q10, d0[0]     \n"
                        "vmla.f32   q15, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d0[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d1[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "vmla.f32   q13, q9, d5[1]      \n"
                        "pld        [%2, #192]          \n"
                        "vld1.u16   {d10-d12}, [%2 :64] \n" // r14 r15 r16
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vshll.u16  q6, d12, #16        \n"

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q14, q10, d4[0]     \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d4[1]     \n"
                        "vmla.f32   q13, q11, d8[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d5[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vmla.f32   q13, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "pld        [%3, #256]          \n"
                        "vld1.u16   {d4-d7}, [%3 :64]!  \n" // r20 r21 r22 r23
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q14, q10, d8[0]     \n"
                        "vmla.f32   q15, q10, d12[0]    \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d8[1]     \n"
                        "vmla.f32   q13, q11, d12[1]    \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vmla.f32   q13, q9, d13[1]     \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "pld        [%3, #192]          \n"
                        "vld1.u16   {d10-d12}, [%3 :64] \n" // r24 r25 r26
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vshll.u16  q6, d12, #16        \n"

                        "vmla.f32   q14, q10, d2[0]     \n"
                        "vmla.f32   q15, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d3[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vmla.f32   q15, q10, d10[0]    \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d11[0]     \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d4-d7}, [%4 :64]!  \n" // r30 r31 r32 r33
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q14, q10, d0[0]     \n"
                        "vmla.f32   q15, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d0[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d1[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "vmla.f32   q13, q9, d5[1]      \n"
                        "pld        [%4, #192]          \n"
                        "vld1.u16   {d10-d12}, [%4 :64] \n" // r34 r35 r36
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vshll.u16  q6, d12, #16        \n"

                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q14, q10, d4[0]     \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d4[1]     \n"
                        "vmla.f32   q13, q11, d8[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d5[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vmla.f32   q13, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d4-d7}, [%5 :64]!  \n" // r40 r41 r42 r43
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q14, q10, d8[0]     \n"
                        "vmla.f32   q15, q10, d12[0]    \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d8[1]     \n"
                        "vmla.f32   q13, q11, d12[1]    \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vmla.f32   q13, q9, d13[1]     \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "pld        [%5, #192]          \n"
                        "vld1.u16   {d10-d12}, [%5 :64] \n" // r44 r45 r46
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vshll.u16  q6, d12, #16        \n"

                        "vmla.f32   q14, q10, d2[0]     \n"
                        "vmla.f32   q15, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vmla.f32   q14, q8, d3[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vmla.f32   q15, q10, d10[0]    \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        //                         "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128] \n"

                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d11[0]     \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"

                        "vadd.f32   q12, q12, q14       \n"
                        "vadd.f32   q13, q13, q15       \n"

                        "sub        %6, %6, #768        \n" // kptr -= 24 * 16;

                        "vst1.f32   {d24-d27}, [%0 :128]! \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
#if __aarch64__
                    // Single-output-pixel tail of the j loop (AArch64 variant).
                    //
                    // Operands: %0 = outptr0, %1..%5 = r0..r4 (five input rows), %6 = kptr.
                    //
                    // What the instruction stream does:
                    //   * v20 is loaded with the 4 floats currently stored at outptr0 (sum0);
                    //     v21/v22/v23 are started with fmul, so four independent accumulators
                    //     are in flight and are only folded together by the fadd chain at the end.
                    //   * Input and weight data are read as 16-bit lanes (.4h) and widened to
                    //     fp32 with "shll #16": the 16 stored bits become the upper half of each
                    //     32-bit lane (bf16-style storage).
                    //   * Per row, five 4-lane pixels are consumed (rX0..rX4). Only the first
                    //     load of a row post-increments, by 16 bytes, so every row pointer
                    //     advances by two pixels per output pixel.
                    //   * For every pixel one 32-byte weight block (4 vectors x 4 lanes) is
                    //     fetched from kptr; weight vector i is scaled by lane i of the pixel.
                    //     v16-v19 and v24-v27 alternate as weight buffers so the next block is
                    //     loaded and widened while the current one is being multiplied.
                    //   * 25 weight blocks are read and 24 of those loads post-increment by
                    //     32 bytes, hence the final "sub %6, %6, #768" which rewinds kptr to the
                    //     value it had on entry.
                    //   * The result is stored to outptr0, which is post-incremented by 16 bytes.
                    asm volatile(
                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v20.4s}, [%0]              \n" // sum0

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%1], #16   \n" // r00 r01

                        "shll   v0.4s, v0.4h, #16           \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // r00 x weight block 0 (v16-v19); fmul initialises v21/v22/v23
                        "fmul   v21.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmul   v22.4s, v17.4s, v0.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmul   v23.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%1, #192]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h}, [%1] \n" // r02 r03 r04
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"

                        // r01 x weight block 1 (v24-v27)
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // r02 x weight block 2 (v16-v19)
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // r03 x weight block 3 (v24-v27); row 1 head (r10 r11) is fetched here
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%2], #16   \n" // r10 r11
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        // r04 x weight block 4 (v16-v19)
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // r10 x weight block 5 (v24-v27)
                        "fmla   v21.4s, v24.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v0.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v0.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%2, #192]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h}, [%2] \n" // r12 r13 r14
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"

                        // r11 x weight block 6 (v16-v19)
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v1.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // r12 x weight block 7 (v24-v27)
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // r13 x weight block 8 (v16-v19); row 2 head (r20 r21) is fetched here
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%3], #16   \n" // r20 r21
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        // r14 x weight block 9 (v24-v27)
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // r20 x weight block 10 (v16-v19)
                        "fmla   v21.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v0.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%3, #192]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h}, [%3] \n" // r22 r23 r24
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"

                        // r21 x weight block 11 (v24-v27)
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // r22 x weight block 12 (v16-v19)
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // r23 x weight block 13 (v24-v27); row 3 head (r30 r31) is fetched here
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%4], #16   \n" // r30 r31
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        // r24 x weight block 14 (v16-v19)
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // r30 x weight block 15 (v24-v27)
                        "fmla   v21.4s, v24.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v0.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v0.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%4, #192]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h}, [%4] \n" // r32 r33 r34
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"

                        // r31 x weight block 16 (v16-v19)
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v1.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // r32 x weight block 17 (v24-v27)
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // r33 x weight block 18 (v16-v19); row 4 head (r40 r41) is fetched here
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%5, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%5], #16   \n" // r40 r41
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        // r34 x weight block 19 (v24-v27)
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // r40 x weight block 20 (v16-v19)
                        "fmla   v21.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v0.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%5, #192]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h}, [%5] \n" // r42 r43 r44
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"

                        // r41 x weight block 21 (v24-v27)
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // r42 x weight block 22 (v16-v19)
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%6], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // r43 x weight block 23 (v24-v27); the last weight block is loaded
                        // WITHOUT post-increment, so kptr has advanced by 24 * 32 = 768 bytes
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        //                         "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6] \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // r44 x weight block 24 (v16-v19)
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"

                        // fold the four partial accumulators: v20 = v20 + v21 + v22 + v23
                        "fadd   v22.4s, v21.4s, v22.4s      \n"
                        "fadd   v23.4s, v22.4s, v23.4s      \n"
                        "fadd   v20.4s, v20.4s, v23.4s      \n"

                        "sub    %6, %6, #768                \n" // kptr -= 24 * 16;

                        // write the output pixel back and step outptr0 to the next one
                        "st1    {v20.4s}, [%0], #16         \n"

                        // Each output operand is tied to the same C variable as an input
                        // (matching constraints "0".."6"), so the pointer updates performed by
                        // the assembly are written back to the C variables.
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%1, #128]          \n"
                        "vld1.u16   {d2-d3}, [%1 :64]!  \n" // r00 r01

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "pld        [%0, #128]          \n"
                        "vld1.f32   {d24-d25}, [%0 :128] \n" // sum0

                        "vmul.f32   q13, q8, d0[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmul.f32   q14, q9, d0[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmul.f32   q15, q10, d1[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "pld        [%1, #192]          \n"
                        "vld1.u16   {d6-d8}, [%1 :64]   \n" // r02 r03 r04
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshll.u16  q4, d8, #16         \n"

                        "vmla.f32   q13, q10, d2[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d2[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "pld        [%2, #128]          \n"
                        "vld1.u16   {d2-d3}, [%2 :64]!  \n" // r10 r11
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d0[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d0[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d1[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "pld        [%2, #192]          \n"
                        "vld1.u16   {d6-d8}, [%2 :64]   \n" // r12 r13 r14
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshll.u16  q4, d8, #16         \n"

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d4[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "pld        [%3, #128]          \n"
                        "vld1.u16   {d2-d3}, [%3 :64]!  \n" // r20 r21
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q13, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d8[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "pld        [%3, #192]          \n"
                        "vld1.u16   {d6-d8}, [%3 :64]   \n" // r22 r23 r24
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshll.u16  q4, d8, #16         \n"

                        "vmla.f32   q13, q10, d2[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d2[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "pld        [%4, #128]          \n"
                        "vld1.u16   {d2-d3}, [%4 :64]!  \n" // r30 r31
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d0[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d0[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d1[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "pld        [%4, #192]          \n"
                        "vld1.u16   {d6-d8}, [%4 :64]   \n" // r32 r33 r34
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshll.u16  q4, d8, #16         \n"

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d4[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "pld        [%5, #128]          \n"
                        "vld1.u16   {d2-d3}, [%5 :64]!  \n" // r40 r41
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q13, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d8[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "pld        [%5, #192]          \n"
                        "vld1.u16   {d6-d8}, [%5 :64]   \n" // r42 r43 r44
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshll.u16  q4, d8, #16         \n"

                        "vmla.f32   q13, q10, d2[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d2[1]     \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d16-d19}, [%6 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        //                         "pld        [%6, #256]          \n"
                        "vld1.u16   {d20-d23}, [%6 :128] \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"

                        "vadd.f32   q14, q13, q14       \n"
                        "vadd.f32   q15, q14, q15       \n"
                        "vadd.f32   q12, q12, q15       \n"

                        "sub        %6, %6, #768        \n" // kptr -= 24 * 16;

                        "vst1.f32   {d24-d25}, [%0 :128]! \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
                r4 += tailstep;
            }
        }
        for (; q < inch; q++)
        {
            unsigned short* outptr0_bf16 = top_blob.channel(p);

            const float* outptr0 = out0.row(0);

            const Mat img0 = bottom_blob.channel(q);

            const unsigned short* r0 = img0.row<const unsigned short>(0);
            const unsigned short* r1 = img0.row<const unsigned short>(1);
            const unsigned short* r2 = img0.row<const unsigned short>(2);
            const unsigned short* r3 = img0.row<const unsigned short>(3);
            const unsigned short* r4 = img0.row<const unsigned short>(4);

            const unsigned short* kptr = kernel.channel(p).row<const unsigned short>(q);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2], #32 \n" // r00 r01 r02 r03

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%2], #32 \n" // r04 r05 r06 r07

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n" // sum0 sum1 sum2 sum3

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%2, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%2] \n" // r08 r09 r010

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v28.s[3]    \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v29.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v29.s[1]    \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v29.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v29.s[3]    \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%3], #32 \n" // r10 r11 r12 r13

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v23.4s, v16.4s, v30.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v28.s[1]    \n"
                        "fmla   v23.4s, v17.4s, v30.s[1]    \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v23.4s, v18.4s, v30.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v28.s[3]    \n"
                        "fmla   v23.4s, v19.4s, v30.s[3]    \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%3], #32 \n" // r14 r15 r16 r17

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%3, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%3] \n" // r18 r19 r110

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v20.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v28.s[3]    \n"

                        "fmla   v20.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v29.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v7.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v29.s[1]    \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v29.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v7.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v29.s[3]    \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%4], #32 \n" // r20 r21 r22 r23

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v28.s[0]    \n"
                        "fmla   v23.4s, v24.4s, v30.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v28.s[1]    \n"
                        "fmla   v23.4s, v25.4s, v30.s[1]    \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v28.s[2]    \n"
                        "fmla   v23.4s, v26.4s, v30.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v28.s[3]    \n"
                        "fmla   v23.4s, v27.4s, v30.s[3]    \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%4], #32 \n" // r24 r25 r26 r27

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%4, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%4] \n" // r28 r29 r210

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v28.s[3]    \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v29.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v29.s[1]    \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v29.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v29.s[3]    \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%5], #32 \n" // r30 r31 r32 r33

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v23.4s, v16.4s, v30.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v28.s[1]    \n"
                        "fmla   v23.4s, v17.4s, v30.s[1]    \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v23.4s, v18.4s, v30.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v28.s[3]    \n"
                        "fmla   v23.4s, v19.4s, v30.s[3]    \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%5], #32 \n" // r34 r35 r36 r37

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%5, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%5] \n" // r38 r39 r310

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v20.4s, v16.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v28.s[3]    \n"

                        "fmla   v20.4s, v16.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v7.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v29.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v7.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v29.s[1]    \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v7.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v29.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v7.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v29.s[3]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%6], #32 \n" // r40 r41 r42 r43

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v20.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v28.s[0]    \n"
                        "fmla   v23.4s, v24.4s, v30.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v28.s[1]    \n"
                        "fmla   v23.4s, v25.4s, v30.s[1]    \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v28.s[2]    \n"
                        "fmla   v23.4s, v26.4s, v30.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v28.s[3]    \n"
                        "fmla   v23.4s, v27.4s, v30.s[3]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%6], #32 \n" // r44 r45 r46 r47

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v6.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v6.s[3]     \n"

                        "prfm   pldl1keep, [%6, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%6] \n" // r48 r49 r410

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v20.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v7.s[1]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v7.s[3]     \n"

                        "fmla   v20.4s, v16.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v23.4s, v17.4s, v28.s[1]    \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v20.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v23.4s, v19.4s, v28.s[3]    \n"

                        "fmla   v20.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v22.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v29.s[0]    \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v22.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v29.s[1]    \n"

                        //                         "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7] \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v20.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v22.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v29.s[2]    \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v22.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v29.s[3]    \n"

                        "fmla   v20.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v21.4s, v16.4s, v6.s[0]     \n"
                        "fmla   v22.4s, v16.4s, v28.s[0]    \n"
                        "fmla   v23.4s, v16.4s, v30.s[0]    \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"
                        "fmla   v22.4s, v17.4s, v28.s[1]    \n"
                        "fmla   v23.4s, v17.4s, v30.s[1]    \n"
                        "fmla   v20.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v21.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v22.4s, v18.4s, v28.s[2]    \n"
                        "fmla   v23.4s, v18.4s, v30.s[2]    \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"
                        "fmla   v22.4s, v19.4s, v28.s[3]    \n"
                        "fmla   v23.4s, v19.4s, v30.s[3]    \n"

                        "sub    %7, %7, #768                \n" // kptr -= 24 * 16;

                        "shrn   v20.4h, v20.4s, #16         \n"
                        "shrn   v21.4h, v21.4s, #16         \n"
                        "shrn   v22.4h, v22.4s, #16         \n"
                        "shrn   v23.4h, v23.4s, #16         \n"

                        "st1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%0], #32 \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(kptr)          // %7
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%1, #512]          \n"
                        "vldm       %1!, {d24-d31}      \n" // sum0 sum1 sum2 sum3

                        "pld        [%2, #256]          \n"
                        "vld1.u16   {d4-d7}, [%2 :64]!  \n" // r00 r01 r02 r03

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%2, #256]          \n"
                        "vld1.u16   {d12-d15}, [%2 :64]! \n" // r04 r05 r06 r07

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d2[0]     \n"
                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vmla.f32   q14, q10, d10[0]    \n"
                        "vmla.f32   q15, q10, d14[0]    \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "vmla.f32   q14, q11, d10[1]    \n"
                        "vmla.f32   q15, q11, d14[1]    \n"
                        "vmla.f32   q12, q8, d3[0]      \n"
                        "vmla.f32   q13, q8, d7[0]      \n"
                        "vmla.f32   q14, q8, d11[0]     \n"
                        "vmla.f32   q15, q8, d15[0]     \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "vmla.f32   q14, q9, d11[1]     \n"
                        "vmla.f32   q15, q9, d15[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%2, #128]          \n"
                        "vld1.u16   {d2-d3}, [%2 :64]!  \n" // r08 r09

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d6[0]     \n"
                        "vmla.f32   q13, q10, d10[0]    \n"
                        "vmla.f32   q14, q10, d14[0]    \n"
                        "vmla.f32   q15, q10, d2[0]     \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "vmla.f32   q14, q11, d14[1]    \n"
                        "vmla.f32   q15, q11, d2[1]     \n"
                        "vmla.f32   q12, q8, d7[0]      \n"
                        "vmla.f32   q13, q8, d11[0]     \n"
                        "vmla.f32   q14, q8, d15[0]     \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "vmla.f32   q14, q9, d15[1]     \n"
                        "vmla.f32   q15, q9, d3[1]      \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%2, #64]           \n"
                        "vld1.u16   {d5}, [%2 :64]      \n" // r010

                        "vshll.u16  q2, d5, #16         \n"

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d12[0]     \n"
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d13[0]    \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"

                        "pld        [%3, #256]          \n"
                        "vld1.u16   {d12-d15}, [%3 :64]! \n" // r10 r11 r12 r13

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%3, #256]          \n"
                        "vld1.u16   {d4-d7}, [%3 :64]!  \n" // r14 r15 r16 r17

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q12, q10, d8[0]     \n"
                        "vmla.f32   q13, q10, d12[0]    \n"
                        "vmla.f32   q14, q10, d0[0]     \n"
                        "vmla.f32   q15, q10, d4[0]     \n"
                        "vmla.f32   q12, q11, d8[1]     \n"
                        "vmla.f32   q13, q11, d12[1]    \n"
                        "vmla.f32   q14, q11, d0[1]     \n"
                        "vmla.f32   q15, q11, d4[1]     \n"
                        "vmla.f32   q12, q8, d9[0]      \n"
                        "vmla.f32   q13, q8, d13[0]     \n"
                        "vmla.f32   q14, q8, d1[0]      \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vmla.f32   q13, q9, d13[1]     \n"
                        "vmla.f32   q14, q9, d1[1]      \n"
                        "vmla.f32   q15, q9, d5[1]      \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d10[0]     \n"
                        "vmla.f32   q13, q8, d14[0]     \n"
                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d10[1]     \n"
                        "vmla.f32   q13, q9, d14[1]     \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d11[0]    \n"
                        "vmla.f32   q13, q10, d15[0]    \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d11[1]    \n"
                        "vmla.f32   q13, q11, d15[1]    \n"
                        "vmla.f32   q14, q11, d3[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%3, #128]          \n"
                        "vld1.u16   {d10-d11}, [%3 :64]! \n" // r18 r19

                        "vshll.u16  q4, d10, #16        \n"
                        "vshll.u16  q5, d11, #16        \n"

                        "vmla.f32   q12, q10, d12[0]    \n"
                        "vmla.f32   q13, q10, d0[0]     \n"
                        "vmla.f32   q14, q10, d4[0]     \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vmla.f32   q12, q11, d12[1]    \n"
                        "vmla.f32   q13, q11, d0[1]     \n"
                        "vmla.f32   q14, q11, d4[1]     \n"
                        "vmla.f32   q15, q11, d8[1]     \n"
                        "vmla.f32   q12, q8, d13[0]     \n"
                        "vmla.f32   q13, q8, d1[0]      \n"
                        "vmla.f32   q14, q8, d5[0]      \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vmla.f32   q12, q9, d13[1]     \n"
                        "vmla.f32   q13, q9, d1[1]      \n"
                        "vmla.f32   q14, q9, d5[1]      \n"
                        "vmla.f32   q15, q9, d9[1]      \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d14[0]     \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d14[1]     \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d15[0]    \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d15[1]    \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d7[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%3, #64]           \n"
                        "vld1.u16   {d13}, [%3 :64]     \n" // r110

                        "vshll.u16  q6, d13, #16        \n"

                        "vmla.f32   q12, q10, d0[0]     \n"
                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vmla.f32   q14, q10, d8[0]     \n"
                        "vmla.f32   q15, q10, d12[0]    \n"
                        "vmla.f32   q12, q11, d0[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "vmla.f32   q14, q11, d8[1]     \n"
                        "vmla.f32   q15, q11, d12[1]    \n"
                        "vmla.f32   q12, q8, d1[0]      \n"
                        "vmla.f32   q13, q8, d5[0]      \n"
                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "vmla.f32   q13, q9, d5[1]      \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d4-d7}, [%4 :64]!  \n" // r20 r21 r22 r23

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q14, q9, d9[1]      \n"
                        "vmla.f32   q15, q9, d13[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d12-d15}, [%4 :64]! \n" // r24 r25 r26 r27

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d2[0]     \n"
                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vmla.f32   q14, q10, d10[0]    \n"
                        "vmla.f32   q15, q10, d14[0]    \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "vmla.f32   q14, q11, d10[1]    \n"
                        "vmla.f32   q15, q11, d14[1]    \n"
                        "vmla.f32   q12, q8, d3[0]      \n"
                        "vmla.f32   q13, q8, d7[0]      \n"
                        "vmla.f32   q14, q8, d11[0]     \n"
                        "vmla.f32   q15, q8, d15[0]     \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "vmla.f32   q14, q9, d11[1]     \n"
                        "vmla.f32   q15, q9, d15[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%4, #128]          \n"
                        "vld1.u16   {d2-d3}, [%4 :64]!  \n" // r28 r29

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d6[0]     \n"
                        "vmla.f32   q13, q10, d10[0]    \n"
                        "vmla.f32   q14, q10, d14[0]    \n"
                        "vmla.f32   q15, q10, d2[0]     \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "vmla.f32   q14, q11, d14[1]    \n"
                        "vmla.f32   q15, q11, d2[1]     \n"
                        "vmla.f32   q12, q8, d7[0]      \n"
                        "vmla.f32   q13, q8, d11[0]     \n"
                        "vmla.f32   q14, q8, d15[0]     \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "vmla.f32   q14, q9, d15[1]     \n"
                        "vmla.f32   q15, q9, d3[1]      \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%4, #64]           \n"
                        "vld1.u16   {d5}, [%4 :64]      \n" // r210

                        "vshll.u16  q2, d5, #16         \n"

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d12[0]     \n"
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d13[0]    \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d12-d15}, [%5 :64]! \n" // r30 r31 r32 r33

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d4-d7}, [%5 :64]!  \n" // r34 r35 r36 r37

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q12, q10, d8[0]     \n"
                        "vmla.f32   q13, q10, d12[0]    \n"
                        "vmla.f32   q14, q10, d0[0]     \n"
                        "vmla.f32   q15, q10, d4[0]     \n"
                        "vmla.f32   q12, q11, d8[1]     \n"
                        "vmla.f32   q13, q11, d12[1]    \n"
                        "vmla.f32   q14, q11, d0[1]     \n"
                        "vmla.f32   q15, q11, d4[1]     \n"
                        "vmla.f32   q12, q8, d9[0]      \n"
                        "vmla.f32   q13, q8, d13[0]     \n"
                        "vmla.f32   q14, q8, d1[0]      \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vmla.f32   q13, q9, d13[1]     \n"
                        "vmla.f32   q14, q9, d1[1]      \n"
                        "vmla.f32   q15, q9, d5[1]      \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d10[0]     \n"
                        "vmla.f32   q13, q8, d14[0]     \n"
                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vmla.f32   q12, q9, d10[1]     \n"
                        "vmla.f32   q13, q9, d14[1]     \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "vmla.f32   q15, q9, d6[1]      \n"
                        "vmla.f32   q12, q10, d11[0]    \n"
                        "vmla.f32   q13, q10, d15[0]    \n"
                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vmla.f32   q12, q11, d11[1]    \n"
                        "vmla.f32   q13, q11, d15[1]    \n"
                        "vmla.f32   q14, q11, d3[1]     \n"
                        "vmla.f32   q15, q11, d7[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%5, #128]          \n"
                        "vld1.u16   {d10-d11}, [%5 :64]! \n" // r38 r39

                        "vshll.u16  q4, d10, #16        \n"
                        "vshll.u16  q5, d11, #16        \n"

                        "vmla.f32   q12, q10, d12[0]    \n"
                        "vmla.f32   q13, q10, d0[0]     \n"
                        "vmla.f32   q14, q10, d4[0]     \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vmla.f32   q12, q11, d12[1]    \n"
                        "vmla.f32   q13, q11, d0[1]     \n"
                        "vmla.f32   q14, q11, d4[1]     \n"
                        "vmla.f32   q15, q11, d8[1]     \n"
                        "vmla.f32   q12, q8, d13[0]     \n"
                        "vmla.f32   q13, q8, d1[0]      \n"
                        "vmla.f32   q14, q8, d5[0]      \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vmla.f32   q12, q9, d13[1]     \n"
                        "vmla.f32   q13, q9, d1[1]      \n"
                        "vmla.f32   q14, q9, d5[1]      \n"
                        "vmla.f32   q15, q9, d9[1]      \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q12, q8, d14[0]     \n"
                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vmla.f32   q12, q9, d14[1]     \n"
                        "vmla.f32   q13, q9, d2[1]      \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "vmla.f32   q15, q9, d10[1]     \n"
                        "vmla.f32   q12, q10, d15[0]    \n"
                        "vmla.f32   q13, q10, d3[0]     \n"
                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vmla.f32   q12, q11, d15[1]    \n"
                        "vmla.f32   q13, q11, d3[1]     \n"
                        "vmla.f32   q14, q11, d7[1]     \n"
                        "vmla.f32   q15, q11, d11[1]    \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "pld        [%5, #64]           \n"
                        "vld1.u16   {d13}, [%5 :64]     \n" // r310

                        "vshll.u16  q6, d13, #16        \n"

                        "vmla.f32   q12, q10, d0[0]     \n"
                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vmla.f32   q14, q10, d8[0]     \n"
                        "vmla.f32   q15, q10, d12[0]    \n"
                        "vmla.f32   q12, q11, d0[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "vmla.f32   q14, q11, d8[1]     \n"
                        "vmla.f32   q15, q11, d12[1]    \n"
                        "vmla.f32   q12, q8, d1[0]      \n"
                        "vmla.f32   q13, q8, d5[0]      \n"
                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "vmla.f32   q13, q9, d5[1]      \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d4-d7}, [%6 :64]!  \n" // r40 r41 r42 r43

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q14, q9, d9[1]      \n"
                        "vmla.f32   q15, q9, d13[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d12-d15}, [%6 :64]! \n" // r44 r45 r46 r47

                        "vshll.u16  q4, d12, #16        \n"
                        "vshll.u16  q5, d13, #16        \n"
                        "vshll.u16  q6, d14, #16        \n"
                        "vshll.u16  q7, d15, #16        \n"

                        "vmla.f32   q12, q8, d0[0]      \n"
                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vmla.f32   q15, q9, d12[1]     \n"
                        "vmla.f32   q12, q10, d1[0]     \n"
                        "vmla.f32   q13, q10, d5[0]     \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "vmla.f32   q14, q11, d9[1]     \n"
                        "vmla.f32   q15, q11, d13[1]    \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d2[0]     \n"
                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vmla.f32   q14, q10, d10[0]    \n"
                        "vmla.f32   q15, q10, d14[0]    \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "vmla.f32   q14, q11, d10[1]    \n"
                        "vmla.f32   q15, q11, d14[1]    \n"
                        "vmla.f32   q12, q8, d3[0]      \n"
                        "vmla.f32   q13, q8, d7[0]      \n"
                        "vmla.f32   q14, q8, d11[0]     \n"
                        "vmla.f32   q15, q8, d15[0]     \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "vmla.f32   q14, q9, d11[1]     \n"
                        "vmla.f32   q15, q9, d15[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%6, #128]          \n"
                        "vld1.u16   {d2-d3}, [%6 :64]!  \n" // r48 r49

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q12, q8, d4[0]      \n"
                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q8, d12[0]     \n"
                        "vmla.f32   q15, q8, d0[0]      \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "vmla.f32   q14, q9, d12[1]     \n"
                        "vmla.f32   q15, q9, d0[1]      \n"
                        "vmla.f32   q12, q10, d5[0]     \n"
                        "vmla.f32   q13, q10, d9[0]     \n"
                        "vmla.f32   q14, q10, d13[0]    \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vmla.f32   q14, q11, d13[1]    \n"
                        "vmla.f32   q15, q11, d1[1]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vshll.u16  q10, d16, #16       \n"
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vshll.u16  q9, d19, #16        \n"

                        "vmla.f32   q12, q10, d6[0]     \n"
                        "vmla.f32   q13, q10, d10[0]    \n"
                        "vmla.f32   q14, q10, d14[0]    \n"
                        "vmla.f32   q15, q10, d2[0]     \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "vmla.f32   q14, q11, d14[1]    \n"
                        "vmla.f32   q15, q11, d2[1]     \n"
                        "vmla.f32   q12, q8, d7[0]      \n"
                        "vmla.f32   q13, q8, d11[0]     \n"
                        "vmla.f32   q14, q8, d15[0]     \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "vmla.f32   q14, q9, d15[1]     \n"
                        "vmla.f32   q15, q9, d3[1]      \n"

                        //                         "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128] \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "pld        [%6, #64]           \n"
                        "vld1.u16   {d5}, [%6 :64]      \n" // r410

                        "vshll.u16  q2, d5, #16         \n"

                        "vmla.f32   q12, q8, d8[0]      \n"
                        "vmla.f32   q13, q8, d12[0]     \n"
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "vmla.f32   q15, q9, d4[1]      \n"
                        "vmla.f32   q12, q10, d9[0]     \n"
                        "vmla.f32   q13, q10, d13[0]    \n"
                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"
                        "vmla.f32   q14, q11, d1[1]     \n"
                        "vmla.f32   q15, q11, d5[1]     \n"

                        "sub        %7, %7, #768        \n" // kptr -= 24 * 16;

                        "sub        %2, %2, #16         \n"
                        "sub        %3, %3, #16         \n"
                        "sub        %4, %4, #16         \n"
                        "sub        %5, %5, #16         \n"
                        "sub        %6, %6, #16         \n"

                        "vshrn.u32  d24, q12, #16       \n"
                        "vshrn.u32  d25, q13, #16       \n"
                        "vshrn.u32  d26, q14, #16       \n"
                        "vshrn.u32  d27, q15, #16       \n"

                        "vst1.u16   {d24-d27}, [%0 :64]! \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(kptr)          // %7
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    // AArch64 path: produce two adjacent output pixels (j, j+1) per iteration.
                    //
                    // Operands: %0 = outptr0_bf16 (16-bit output), %1 = outptr0 (fp32 sums),
                    //           %2..%6 = r0..r4 (five input rows), %7 = kptr (weights).
                    //
                    // Every 16-bit lane loaded from the rows and from the weights is widened
                    // with "shll #16", which places it in the upper half of a 32-bit lane
                    // (presumably bf16 -> fp32, going by the outptr0_bf16 name).
                    //
                    // Per row, 7 four-lane pixels are read: r?0..r?3 with a 32-byte
                    // post-increment and r?4..r?6 without writeback, so each row pointer
                    // advances by 32 bytes per iteration. Output 0 is built from r?0..r?4
                    // and output 1 from r?2..r?6, i.e. the two outputs are two input pixels
                    // apart.
                    //
                    // v20/v21 start from the sums loaded through %1; v22/v23 form a second
                    // accumulator pair (seeded by fmul) that is folded in with fadd at the end.
                    //
                    // 25 weight blocks (4 x 4h each) are consumed from %7, alternating between
                    // v16-v19 and v24-v27 so the next block can be loaded/widened while the
                    // current one is being used. 24 of the loads post-increment %7 by 32 bytes
                    // and the last one does not, hence the final "sub %7, %7, #768".
                    asm volatile(
                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2], #32 \n" // r00 r01 r02 r03

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"

                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v16.4s, v16.4h, #16         \n"

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v20.4s, v21.4s}, [%1], #32 \n" // sum0 sum1

                        // weight block 0 (v16-v19) x r00 (out0) / r02 (out1)
                        "fmul   v22.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v17.4s, v17.4h, #16         \n"
                        "fmul   v23.4s, v16.4s, v2.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "prfm   pldl1keep, [%2, #192]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h}, [%2] \n" // r04 r05 r06
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"

                        // weight block 1 (v24-v27) x r01 / r03
                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // weight block 2 (v16-v19) x r02 / r04
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // weight block 3 (v24-v27) x r03 / r05
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        // v0-v3 are no longer needed for row 0 from here on; reload them with row 1
                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%3], #32 \n" // r10 r11 r12 r13
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        // weight block 4 (v16-v19) x r04 / r06
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // weight block 5 (v24-v27) x r10 / r12
                        "fmla   v22.4s, v24.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v24.4s, v2.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "prfm   pldl1keep, [%3, #192]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h}, [%3] \n" // r14 r15 r16
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"

                        // weight block 6 (v16-v19) x r11 / r13
                        "fmla   v22.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // weight block 7 (v24-v27) x r12 / r14
                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // weight block 8 (v16-v19) x r13 / r15
                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%4], #32 \n" // r20 r21 r22 r23
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        // weight block 9 (v24-v27) x r14 / r16
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v6.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v6.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // weight block 10 (v16-v19) x r20 / r22
                        "fmla   v22.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v16.4s, v2.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "prfm   pldl1keep, [%4, #192]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h}, [%4] \n" // r24 r25 r26
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"

                        // weight block 11 (v24-v27) x r21 / r23
                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // weight block 12 (v16-v19) x r22 / r24
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // weight block 13 (v24-v27) x r23 / r25
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%5], #32 \n" // r30 r31 r32 r33
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        // weight block 14 (v16-v19) x r24 / r26
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // weight block 15 (v24-v27) x r30 / r32
                        "fmla   v22.4s, v24.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v24.4s, v2.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v2.s[3]     \n"
                        "prfm   pldl1keep, [%5, #192]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h}, [%5] \n" // r34 r35 r36
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"

                        // weight block 16 (v16-v19) x r31 / r33
                        "fmla   v22.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v16.4s, v3.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v3.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // weight block 17 (v24-v27) x r32 / r34
                        "fmla   v22.4s, v24.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v24.4s, v4.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v4.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v4.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // weight block 18 (v16-v19) x r33 / r35
                        "fmla   v22.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v16.4s, v5.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v5.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v5.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v5.s[3]     \n"
                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%6], #32 \n" // r40 r41 r42 r43
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        // weight block 19 (v24-v27) x r34 / r36
                        "fmla   v22.4s, v24.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v24.4s, v6.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v6.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v6.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v6.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // weight block 20 (v16-v19) x r40 / r42
                        "fmla   v22.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v16.4s, v2.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v0.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v0.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v2.s[3]     \n"
                        "prfm   pldl1keep, [%6, #192]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h}, [%6] \n" // r44 r45 r46
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"

                        // weight block 21 (v24-v27) x r41 / r43
                        "fmla   v22.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v24.4s, v3.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // weight block 22 (v16-v19) x r42 / r44
                        "fmla   v22.4s, v16.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "fmla   v23.4s, v16.4s, v4.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"

                        "fmla   v22.4s, v18.4s, v2.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v4.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // weight block 23 (v24-v27) x r43 / r45
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        //                         "prfm   pldl1keep, [%7, #256]       \n"
                        // last weight block: loaded without writeback, so %7 has moved 24 * 32 bytes in total
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7] \n"
                        "fmla   v23.4s, v24.4s, v5.s[0]     \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v20.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v5.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v5.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v5.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // weight block 24 (v16-v19) x r44 / r46
                        "fmla   v22.4s, v16.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v16.4s, v6.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v20.4s, v17.4s, v4.s[1]     \n"
                        "fmla   v21.4s, v17.4s, v6.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v22.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v18.4s, v6.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "fmla   v21.4s, v19.4s, v6.s[3]     \n"

                        // fold the secondary accumulators into the primary ones
                        "fadd   v20.4s, v20.4s, v22.4s      \n"
                        "fadd   v21.4s, v21.4s, v23.4s      \n"

                        "sub    %7, %7, #768                \n" // kptr -= 24 * 16;

                        // keep the upper 16 bits of every 32-bit lane (no rounding)
                        "shrn   v20.4h, v20.4s, #16         \n"
                        "shrn   v21.4h, v21.4s, #16         \n"

                        // store both 4-lane results; %0 advances by 16 bytes
                        "st1    {v20.4h, v21.4h}, [%0], #16 \n"

                        // Every pointer is read-write: each "=r" output below is tied to
                        // the same variable as an input through the matching constraints
                        // "0".."7".
                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(kptr)          // %7
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    // ARMv7 NEON path: produces two adjacent output positions (4 lanes each)
                    // from a 5x5 window in one pass.
                    //
                    // Operands (each pointer is both read and updated in place):
                    //   %0 outptr0_bf16 : 16-bit destination, written once and advanced by 16 bytes
                    //   %1 outptr0      : fp32 running sums, read once and advanced by 32 bytes
                    //   %2..%6 r0..r4   : the five 16-bit input rows, each advanced by 32 bytes
                    //   %7 kptr         : 16-bit weights, read as 25 chunks of 32 bytes, then rewound
                    //
                    // Data handling:
                    //   - Every 16-bit value is widened with "vshll.u16 #16", i.e. placed in the
                    //     upper half of a 32-bit lane, before any f32 arithmetic (bf16 -> fp32).
                    //   - One d register holds one input pixel (4 values). After widening,
                    //     q0..q6 hold pixels 0..6 of the current row.
                    //   - One 32-byte weight chunk widens to four q vectors; vector i is scaled
                    //     by lane i of the input pixel (dN[0], dN[1], dN+1[0], dN+1[1]).
                    //   - Output 0 uses pixels 0..4 of each row, output 1 uses pixels 2..6.
                    //   - q12/q13 start from the sums loaded via %1; q14/q15 are a second pair
                    //     of accumulators started with vmul (presumably to shorten the vmla
                    //     dependency chain). Both pairs are merged with vadd at the end.
                    //   - Weight chunks alternate between d20-d23 and d16-d19 so the next chunk
                    //     can be loaded while the current one is still being consumed.
                    asm volatile(
                        "pld        [%2, #256]          \n"
                        "vld1.u16   {d4-d7}, [%2 :64]!  \n" // r00 r01 r02 r03

                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        // q2 must be widened before q3: q3 overwrites d6, the source of q2
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "pld        [%1, #256]          \n"
                        "vld1.f32   {d24-d27}, [%1 :128]! \n" // sum0 sum1

                        // ---- input row r0 ----
                        // tap (0,0): out0 <- r00 (q0), out1 <- r02 (q2)
                        "vmul.f32   q14, q8, d0[0]      \n"
                        "vmul.f32   q15, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        // no writeback here: %2 was already advanced by the first load
                        "pld        [%2, #192]          \n"
                        "vld1.u16   {d10-d12}, [%2 :64] \n" // r04 r05 r06
                        "vshll.u16  q11, d17, #16       \n"
                        // widen in the order q4, q5, q6 so each source d register is read
                        // before the q register that aliases it is written
                        "vshll.u16  q4, d10, #16        \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vshll.u16  q6, d12, #16        \n"

                        // tap (0,1): out0 <- r01 (q1), out1 <- r03 (q3)
                        "vmla.f32   q14, q10, d2[0]     \n"
                        "vmla.f32   q15, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d3[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // tap (0,2): out0 <- r02 (q2), out1 <- r04 (q4)
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        // tap (0,3): out0 <- r03 (q3), out1 <- r05 (q5)
                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vmla.f32   q15, q10, d10[0]    \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d11[0]     \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        // q0..q3 are free from here on (the remaining row-0 tap only needs
                        // q4 and q6), so the next row is loaded into them early
                        "pld        [%3, #256]          \n"
                        "vld1.u16   {d4-d7}, [%3 :64]!  \n" // r10 r11 r12 r13
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        // tap (0,4): out0 <- r04 (q4), out1 <- r06 (q6)
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"
                        "vshll.u16  q11, d17, #16       \n"

                        // ---- input row r1 ----
                        // tap (1,0): out0 <- r10 (q0), out1 <- r12 (q2)
                        "vmla.f32   q14, q10, d0[0]     \n"
                        "vmla.f32   q15, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d0[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d1[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "vmla.f32   q13, q9, d5[1]      \n"
                        "pld        [%3, #192]          \n"
                        "vld1.u16   {d10-d12}, [%3 :64] \n" // r14 r15 r16
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vshll.u16  q6, d12, #16        \n"

                        // tap (1,1): out0 <- r11 (q1), out1 <- r13 (q3)
                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        // tap (1,2): out0 <- r12 (q2), out1 <- r14 (q4)
                        "vmla.f32   q14, q10, d4[0]     \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d4[1]     \n"
                        "vmla.f32   q13, q11, d8[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d5[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vmla.f32   q13, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // tap (1,3): out0 <- r13 (q3), out1 <- r15 (q5)
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "pld        [%4, #256]          \n"
                        "vld1.u16   {d4-d7}, [%4 :64]!  \n" // r20 r21 r22 r23
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        // tap (1,4): out0 <- r14 (q4), out1 <- r16 (q6)
                        "vmla.f32   q14, q10, d8[0]     \n"
                        "vmla.f32   q15, q10, d12[0]    \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d8[1]     \n"
                        "vmla.f32   q13, q11, d12[1]    \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vmla.f32   q13, q9, d13[1]     \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // ---- input row r2 ----
                        // tap (2,0): out0 <- r20 (q0), out1 <- r22 (q2)
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "pld        [%4, #192]          \n"
                        "vld1.u16   {d10-d12}, [%4 :64] \n" // r24 r25 r26
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vshll.u16  q6, d12, #16        \n"

                        // tap (2,1): out0 <- r21 (q1), out1 <- r23 (q3)
                        "vmla.f32   q14, q10, d2[0]     \n"
                        "vmla.f32   q15, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d3[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // tap (2,2): out0 <- r22 (q2), out1 <- r24 (q4)
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        // tap (2,3): out0 <- r23 (q3), out1 <- r25 (q5)
                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vmla.f32   q15, q10, d10[0]    \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d11[0]     \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "pld        [%5, #256]          \n"
                        "vld1.u16   {d4-d7}, [%5 :64]!  \n" // r30 r31 r32 r33
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        // tap (2,4): out0 <- r24 (q4), out1 <- r26 (q6)
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"
                        "vshll.u16  q11, d17, #16       \n"

                        // ---- input row r3 ----
                        // tap (3,0): out0 <- r30 (q0), out1 <- r32 (q2)
                        "vmla.f32   q14, q10, d0[0]     \n"
                        "vmla.f32   q15, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d0[1]     \n"
                        "vmla.f32   q13, q11, d4[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d1[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "vmla.f32   q13, q9, d5[1]      \n"
                        "pld        [%5, #192]          \n"
                        "vld1.u16   {d10-d12}, [%5 :64] \n" // r34 r35 r36
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vshll.u16  q6, d12, #16        \n"

                        // tap (3,1): out0 <- r31 (q1), out1 <- r33 (q3)
                        "vmla.f32   q14, q8, d2[0]      \n"
                        "vmla.f32   q15, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d2[1]      \n"
                        "vmla.f32   q13, q9, d6[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d3[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vmla.f32   q13, q11, d7[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        // tap (3,2): out0 <- r32 (q2), out1 <- r34 (q4)
                        "vmla.f32   q14, q10, d4[0]     \n"
                        "vmla.f32   q15, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d4[1]     \n"
                        "vmla.f32   q13, q11, d8[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d5[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vmla.f32   q13, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // tap (3,3): out0 <- r33 (q3), out1 <- r35 (q5)
                        "vmla.f32   q14, q8, d6[0]      \n"
                        "vmla.f32   q15, q8, d10[0]     \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d6[1]      \n"
                        "vmla.f32   q13, q9, d10[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d7[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d11[0]    \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "vmla.f32   q13, q11, d11[1]    \n"
                        "pld        [%6, #256]          \n"
                        "vld1.u16   {d4-d7}, [%6 :64]!  \n" // r40 r41 r42 r43
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q0, d4, #16         \n"
                        "vshll.u16  q1, d5, #16         \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        // tap (3,4): out0 <- r34 (q4), out1 <- r36 (q6)
                        "vmla.f32   q14, q10, d8[0]     \n"
                        "vmla.f32   q15, q10, d12[0]    \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d8[1]     \n"
                        "vmla.f32   q13, q11, d12[1]    \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vmla.f32   q13, q9, d13[1]     \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // ---- input row r4 ----
                        // tap (4,0): out0 <- r40 (q0), out1 <- r42 (q2)
                        "vmla.f32   q14, q8, d0[0]      \n"
                        "vmla.f32   q15, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d0[1]      \n"
                        "vmla.f32   q13, q9, d4[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d1[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "vmla.f32   q13, q11, d5[1]     \n"
                        "pld        [%6, #192]          \n"
                        "vld1.u16   {d10-d12}, [%6 :64] \n" // r44 r45 r46
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q4, d10, #16        \n"
                        "vshll.u16  q5, d11, #16        \n"
                        "vshll.u16  q6, d12, #16        \n"

                        // tap (4,1): out0 <- r41 (q1), out1 <- r43 (q3)
                        "vmla.f32   q14, q10, d2[0]     \n"
                        "vmla.f32   q15, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d2[1]     \n"
                        "vmla.f32   q13, q11, d6[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vmla.f32   q14, q8, d3[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vmla.f32   q13, q9, d7[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // tap (4,2): out0 <- r42 (q2), out1 <- r44 (q4)
                        "vmla.f32   q14, q8, d4[0]      \n"
                        "vmla.f32   q15, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d4[1]      \n"
                        "vmla.f32   q13, q9, d8[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"

                        "vmla.f32   q14, q10, d5[0]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vmla.f32   q13, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        // tap (4,3): out0 <- r43 (q3), out1 <- r45 (q5)
                        "vmla.f32   q14, q10, d6[0]     \n"
                        "vmla.f32   q15, q10, d10[0]    \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q12, q11, d6[1]     \n"
                        "vmla.f32   q13, q11, d10[1]    \n"
                        // 25th and last weight chunk: loaded without writeback, so %7 has
                        // been advanced by exactly 24 * 32 = 768 bytes at this point
                        //                         "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128] \n"

                        "vmla.f32   q14, q8, d7[0]      \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d11[0]     \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vmla.f32   q13, q9, d11[1]     \n"
                        "vshll.u16  q9, d21, #16        \n"

                        // tap (4,4): out0 <- r44 (q4), out1 <- r46 (q6)
                        "vmla.f32   q14, q8, d8[0]      \n"
                        "vmla.f32   q15, q8, d12[0]     \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q12, q9, d8[1]      \n"
                        "vmla.f32   q13, q9, d12[1]     \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q14, q10, d9[0]     \n"
                        "vmla.f32   q15, q10, d13[0]    \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vmla.f32   q13, q11, d13[1]    \n"

                        // merge the two accumulator pairs: q12 = out0, q13 = out1
                        "vadd.f32   q12, q12, q14       \n"
                        "vadd.f32   q13, q13, q15       \n"

                        // rewind the weight pointer to where it was on entry
                        "sub        %7, %7, #768        \n" // kptr -= 24 * 16;

                        // keep only the upper 16 bits of each fp32 lane (truncating narrow)
                        "vshrn.u32  d24, q12, #16       \n"
                        "vshrn.u32  d25, q13, #16       \n"

                        "vst1.u16   {d24-d25}, [%0 :64]! \n"

                        // outputs; each one is tied to the same-numbered input below
                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(kptr)          // %7
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(kptr)
                        // NOTE(review): q7 is listed as clobbered although no instruction in
                        // this block references d14/d15.
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
#if __aarch64__
                    // AArch64 body of the `for (; j < outw; j++)` loop: one group of four output
                    // lanes is produced per iteration.
                    //
                    // Data flow, as established by the instructions below:
                    //   - v20 is seeded with four fp32 partial sums loaded from %1 (outptr0),
                    //     which is post-incremented by 16 bytes.
                    //   - Each of the five row pointers %2..%6 (r0..r4) supplies five 4-lane
                    //     16-bit vectors (v0..v4). They are widened to fp32 bit patterns with
                    //     "shll #16" (bf16 -> fp32).
                    //   - %7 (kptr) supplies 25 tiles of four 4-lane 16-bit vectors, one tile per
                    //     (row, tap) pair, widened the same way. Within a tile, vector k is scaled
                    //     by lane k of the matching input vector.
                    //   - Products are spread over four accumulators (v21, v22, v23, v20),
                    //     presumably to shorten the fmla dependency chains, and are summed into
                    //     v20 at the end.
                    //   - The result is narrowed with "shrn #16" (keeps the upper 16 bits of each
                    //     lane, i.e. truncation rather than rounding) and four 16-bit values are
                    //     stored to %0 (outptr0_bf16), post-incremented by 8 bytes.
                    //
                    // Pointer effects per iteration: %2..%6 each advance by 16 bytes net (only the
                    // first load of every row post-increments); %7 is advanced 24 * 32 = 768 bytes
                    // by the tile loads and rewound by the final "sub", so it is unchanged on exit.
                    // NOTE(review): a 16-byte row step equals two 4-lane 16-bit vectors, which
                    // looks like a horizontal stride of 2 -- confirm against the enclosing function.
                    //
                    // Kernel tiles alternate between v16-v19 and v24-v27 so that the next tile can
                    // be loaded and widened while the previous one is still being consumed. The
                    // instruction order is hand-interleaved; do not reorder without re-checking
                    // every register's load -> shll -> fmla sequence.
                    asm volatile(
                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v20.4s}, [%1], #16         \n" // sum0

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%2], #16   \n" // r00 r01

                        "shll   v0.4s, v0.4h, #16           \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "shll   v16.4s, v16.4h, #16         \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // row 0, tap 0: v0 lanes x tile v16-v19.
                        // fmul (not fmla) initialises v21/v22/v23; v20 already holds sum0.
                        "fmul   v21.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmul   v22.4s, v17.4s, v0.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmul   v23.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%2, #192]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h}, [%2] \n" // r02 r03 r04
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"

                        // row 0, tap 1: v1 lanes x tile v24-v27
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // row 0, tap 2: v2 lanes x tile v16-v19
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // row 0, tap 3: v3 lanes x tile v24-v27.
                        // v0/v1 are free again here, so the first two vectors of row 1 are
                        // loaded before row 0's last tap is consumed.
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%3], #16   \n" // r10 r11
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        // row 0, tap 4: v4 lanes x tile v16-v19
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // row 1, tap 0: v0 lanes x tile v24-v27
                        "fmla   v21.4s, v24.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v0.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v0.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%3, #192]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h}, [%3] \n" // r12 r13 r14
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"

                        // row 1, tap 1: v1 lanes x tile v16-v19
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v1.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // row 1, tap 2: v2 lanes x tile v24-v27
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // row 1, tap 3: v3 lanes x tile v16-v19 (row 2's first vectors are
                        // loaded into v0/v1 during this group)
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%4], #16   \n" // r20 r21
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        // row 1, tap 4: v4 lanes x tile v24-v27
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // row 2, tap 0: v0 lanes x tile v16-v19
                        "fmla   v21.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v0.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%4, #192]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h}, [%4] \n" // r22 r23 r24
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"

                        // row 2, tap 1: v1 lanes x tile v24-v27
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // row 2, tap 2: v2 lanes x tile v16-v19
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // row 2, tap 3: v3 lanes x tile v24-v27 (row 3's first vectors are
                        // loaded into v0/v1 during this group)
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%5, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%5], #16   \n" // r30 r31
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        // row 2, tap 4: v4 lanes x tile v16-v19
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // row 3, tap 0: v0 lanes x tile v24-v27
                        "fmla   v21.4s, v24.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v0.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v0.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%5, #192]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h}, [%5] \n" // r32 r33 r34
                        "shll   v17.4s, v17.4h, #16         \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"

                        // row 3, tap 1: v1 lanes x tile v16-v19
                        "fmla   v21.4s, v16.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v1.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v1.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v1.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // row 3, tap 2: v2 lanes x tile v24-v27
                        "fmla   v21.4s, v24.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v2.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v2.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // row 3, tap 3: v3 lanes x tile v16-v19 (row 4's first vectors are
                        // loaded into v0/v1 during this group)
                        "fmla   v21.4s, v16.4s, v3.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v3.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v3.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v3.s[3]     \n"
                        "prfm   pldl1keep, [%6, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%6], #16   \n" // r40 r41
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        // row 3, tap 4: v4 lanes x tile v24-v27
                        "fmla   v21.4s, v24.4s, v4.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v4.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v4.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v4.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // row 4, tap 0: v0 lanes x tile v16-v19
                        "fmla   v21.4s, v16.4s, v0.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v0.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v0.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v0.s[3]     \n"
                        "prfm   pldl1keep, [%6, #192]       \n"
                        "ld1    {v2.4h, v3.4h, v4.4h}, [%6] \n" // r42 r43 r44
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"
                        "shll   v4.4s, v4.4h, #16           \n"

                        // row 4, tap 1: v1 lanes x tile v24-v27
                        "fmla   v21.4s, v24.4s, v1.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v1.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v1.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v1.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // row 4, tap 2: v2 lanes x tile v16-v19
                        "fmla   v21.4s, v16.4s, v2.s[0]     \n"
                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7], #32 \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v2.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v2.s[2]     \n"
                        "shll   v24.4s, v24.4h, #16         \n"
                        "fmla   v20.4s, v19.4s, v2.s[3]     \n"
                        "shll   v25.4s, v25.4h, #16         \n"

                        // row 4, tap 3: v3 lanes x tile v24-v27.
                        // The 25th (last) tile is loaded WITHOUT post-increment, so %7 ends up
                        // exactly 24 * 32 = 768 bytes past its starting value.
                        "fmla   v21.4s, v24.4s, v3.s[0]     \n"
                        //                         "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7] \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "shll   v27.4s, v27.4h, #16         \n"
                        "fmla   v23.4s, v26.4s, v3.s[2]     \n"
                        "shll   v16.4s, v16.4h, #16         \n"
                        "fmla   v20.4s, v27.4s, v3.s[3]     \n"
                        "shll   v17.4s, v17.4h, #16         \n"

                        // row 4, tap 4: v4 lanes x tile v16-v19 (no further tile to load)
                        "fmla   v21.4s, v16.4s, v4.s[0]     \n"
                        "shll   v18.4s, v18.4h, #16         \n"
                        "fmla   v22.4s, v17.4s, v4.s[1]     \n"
                        "shll   v19.4s, v19.4h, #16         \n"
                        "fmla   v23.4s, v18.4s, v4.s[2]     \n"
                        "fmla   v20.4s, v19.4s, v4.s[3]     \n"

                        // Fold the four partial accumulators: v20 = v20 + v21 + v22 + v23.
                        "fadd   v22.4s, v21.4s, v22.4s      \n"
                        "fadd   v23.4s, v22.4s, v23.4s      \n"
                        "fadd   v20.4s, v20.4s, v23.4s      \n"

                        // Rewind the kernel pointer so the next iteration starts at the same tile.
                        "sub    %7, %7, #768                \n" // kptr -= 24 * 16;

                        // fp32 -> 16-bit by keeping the high half of every lane (truncating).
                        "shrn   v20.4h, v20.4s, #16         \n"

                        "st1    {v20.4h}, [%0], #8          \n"

                        // Outputs: every pointer is written back because the asm modifies it.
                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(kptr)          // %7
                        // Inputs: digit constraints tie each input to the output operand with
                        // the same index, making all eight operands read-write registers.
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(kptr)
                        // Clobbers: memory (loads/stores through the pointers) plus every
                        // vector register named in the template (v0-v4, v16-v27).
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%2, #128]          \n"
                        "vld1.u16   {d2-d3}, [%2 :64]!  \n" // r00 r01

                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d24-d25}, [%1 :128]! \n" // sum0

                        "vmul.f32   q13, q8, d0[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmul.f32   q14, q9, d0[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmul.f32   q15, q10, d1[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "pld        [%2, #192]          \n"
                        "vld1.u16   {d6-d8}, [%2 :64]   \n" // r02 r03 r04
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshll.u16  q4, d8, #16         \n"

                        "vmla.f32   q13, q10, d2[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d2[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "pld        [%3, #128]          \n"
                        "vld1.u16   {d2-d3}, [%3 :64]!  \n" // r10 r11
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d0[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d0[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d1[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "pld        [%3, #192]          \n"
                        "vld1.u16   {d6-d8}, [%3 :64]   \n" // r12 r13 r14
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshll.u16  q4, d8, #16         \n"

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d4[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "pld        [%4, #128]          \n"
                        "vld1.u16   {d2-d3}, [%4 :64]!  \n" // r20 r21
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q13, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d8[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "pld        [%4, #192]          \n"
                        "vld1.u16   {d6-d8}, [%4 :64]   \n" // r22 r23 r24
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshll.u16  q4, d8, #16         \n"

                        "vmla.f32   q13, q10, d2[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d2[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "pld        [%5, #128]          \n"
                        "vld1.u16   {d2-d3}, [%5 :64]!  \n" // r30 r31
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d9[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d0[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d0[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d1[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d1[1]      \n"
                        "pld        [%5, #192]          \n"
                        "vld1.u16   {d6-d8}, [%5 :64]   \n" // r32 r33 r34
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshll.u16  q4, d8, #16         \n"

                        "vmla.f32   q13, q8, d2[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d2[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d3[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d3[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d4[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d4[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d5[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d5[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d6[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d6[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d7[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d7[1]     \n"
                        "pld        [%6, #128]          \n"
                        "vld1.u16   {d2-d3}, [%6 :64]!  \n" // r40 r41
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q13, q10, d8[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d8[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d9[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d0[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d0[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d1[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d1[1]     \n"
                        "pld        [%6, #192]          \n"
                        "vld1.u16   {d6-d8}, [%6 :64]   \n" // r42 r43 r44
                        "vshll.u16  q11, d17, #16       \n"
                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshll.u16  q4, d8, #16         \n"

                        "vmla.f32   q13, q10, d2[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d2[1]     \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128]! \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d3[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d3[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d4[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d4[1]      \n"
                        "pld        [%7, #256]          \n"
                        "vld1.u16   {d16-d19}, [%7 :128]! \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d5[0]     \n"
                        "vshll.u16  q10, d16, #16       \n"
                        "vmla.f32   q12, q11, d5[1]     \n"
                        "vshll.u16  q11, d17, #16       \n"

                        "vmla.f32   q13, q10, d6[0]     \n"
                        "vshll.u16  q8, d18, #16        \n"
                        "vmla.f32   q14, q11, d6[1]     \n"
                        //                         "pld        [%7, #256]          \n"
                        "vld1.u16   {d20-d23}, [%7 :128] \n"
                        "vshll.u16  q9, d19, #16        \n"
                        "vmla.f32   q15, q8, d7[0]      \n"
                        "vshll.u16  q8, d20, #16        \n"
                        "vmla.f32   q12, q9, d7[1]      \n"
                        "vshll.u16  q9, d21, #16        \n"

                        "vmla.f32   q13, q8, d8[0]      \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vmla.f32   q14, q9, d8[1]      \n"
                        "vshll.u16  q11, d23, #16       \n"
                        "vmla.f32   q15, q10, d9[0]     \n"
                        "vmla.f32   q12, q11, d9[1]     \n"

                        "vadd.f32   q14, q13, q14       \n"
                        "vadd.f32   q15, q14, q15       \n"
                        "vadd.f32   q12, q12, q15       \n"

                        "sub        %7, %7, #768        \n" // kptr -= 24 * 16;

                        "vshrn.u32  d24, q12, #16       \n"

                        "vst1.u16   {d24}, [%0 :64]!    \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(kptr)          // %7
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
                r4 += tailstep;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_5x5_pack8_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv5x5s1_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const __fp16* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        float16x8_t _bias0 = bias ? vld1q_f16(bias + p * 8) : vdupq_n_f16(0.f);
        out0.fill(_bias0);

        int q = 0;
        for (; q < inch; q++)
        {
            __fp16* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            const __fp16* r0 = img0.row<const __fp16>(0);
            const __fp16* r1 = img0.row<const __fp16>(1);
            const __fp16* r2 = img0.row<const __fp16>(2);
            const __fp16* r3 = img0.row<const __fp16>(3);
            const __fp16* r4 = img0.row<const __fp16>(4);

            const __fp16* kptr = kernel.channel(p).row<const __fp16>(q);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 3 < outw; j += 4)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0 sum1 sum2 sum3

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n" // r00 r01 r02 r03

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v28.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v28.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v28.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[5]     \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%1] \n" // r04 r05 r06 r07

                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v28.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v28.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v28.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v28.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v28.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v28.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v28.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v5.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v28.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v28.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v5.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v6.h[0]     \n"
                        "fmla   v28.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v6.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v6.h[2]     \n"
                        "fmla   v28.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v5.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v6.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v6.h[4]     \n"
                        "fmla   v28.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v6.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v6.h[6]     \n"
                        "fmla   v28.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v5.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v6.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v6.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v7.h[0]     \n"
                        "fmla   v28.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v6.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v7.h[1]     \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v8.8h, v9.8h, v10.8h, v11.8h}, [%2], #64 \n" // r10 r11 r12 r13

                        "fmla   v28.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v6.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v7.h[2]     \n"
                        "fmla   v28.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v5.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v6.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v7.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v6.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v7.h[4]     \n"
                        "fmla   v28.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v6.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v7.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v6.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v7.h[6]     \n"
                        "fmla   v28.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v5.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v6.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v7.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v8.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v28.8h, v17.8h, v8.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v9.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v11.h[1]    \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%2] \n" // r14 r15 r16 r17

                        "fmla   v28.8h, v18.8h, v8.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v28.8h, v19.8h, v8.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v9.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v10.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v11.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v8.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v28.8h, v21.8h, v8.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v8.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v28.8h, v23.8h, v8.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v9.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v10.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v11.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v30.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v28.8h, v17.8h, v9.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v30.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v28.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v30.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v28.8h, v19.8h, v9.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v10.h[3]    \n"
                        "fmla   v30.8h, v19.8h, v11.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v12.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v30.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v28.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v30.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v30.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v28.8h, v23.8h, v9.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v10.h[7]    \n"
                        "fmla   v30.8h, v23.8h, v11.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v12.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v29.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v30.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v13.h[0]    \n"
                        "fmla   v28.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v29.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v30.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v13.h[1]    \n"
                        "fmla   v28.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v29.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v30.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v13.h[2]    \n"
                        "fmla   v28.8h, v19.8h, v10.h[3]    \n"
                        "fmla   v29.8h, v19.8h, v11.h[3]    \n"
                        "fmla   v30.8h, v19.8h, v12.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v13.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v29.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v30.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v13.h[4]    \n"
                        "fmla   v28.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v29.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v30.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v13.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v29.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v30.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v13.h[6]    \n"
                        "fmla   v28.8h, v23.8h, v10.h[7]    \n"
                        "fmla   v29.8h, v23.8h, v11.h[7]    \n"
                        "fmla   v30.8h, v23.8h, v12.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v13.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v29.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v30.8h, v16.8h, v13.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v14.h[0]    \n"
                        "fmla   v28.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v29.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v30.8h, v17.8h, v13.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v14.h[1]    \n"
                        "fmla   v28.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v29.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v30.8h, v18.8h, v13.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v14.h[2]    \n"
                        "fmla   v28.8h, v19.8h, v11.h[3]    \n"
                        "fmla   v29.8h, v19.8h, v12.h[3]    \n"
                        "fmla   v30.8h, v19.8h, v13.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v14.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v29.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v30.8h, v20.8h, v13.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v14.h[4]    \n"
                        "fmla   v28.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v29.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v30.8h, v21.8h, v13.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v14.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v29.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v30.8h, v22.8h, v13.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v14.h[6]    \n"
                        "fmla   v28.8h, v23.8h, v11.h[7]    \n"
                        "fmla   v29.8h, v23.8h, v12.h[7]    \n"
                        "fmla   v30.8h, v23.8h, v13.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v14.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v29.8h, v16.8h, v13.h[0]    \n"
                        "fmla   v30.8h, v16.8h, v14.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v15.h[0]    \n"
                        "fmla   v28.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v29.8h, v17.8h, v13.h[1]    \n"
                        "fmla   v30.8h, v17.8h, v14.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v15.h[1]    \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" // r20 r21 r22 r23

                        "fmla   v28.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v29.8h, v18.8h, v13.h[2]    \n"
                        "fmla   v30.8h, v18.8h, v14.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v15.h[2]    \n"
                        "fmla   v28.8h, v19.8h, v12.h[3]    \n"
                        "fmla   v29.8h, v19.8h, v13.h[3]    \n"
                        "fmla   v30.8h, v19.8h, v14.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v15.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v29.8h, v20.8h, v13.h[4]    \n"
                        "fmla   v30.8h, v20.8h, v14.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v15.h[4]    \n"
                        "fmla   v28.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v29.8h, v21.8h, v13.h[5]    \n"
                        "fmla   v30.8h, v21.8h, v14.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v15.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v29.8h, v22.8h, v13.h[6]    \n"
                        "fmla   v30.8h, v22.8h, v14.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v15.h[6]    \n"
                        "fmla   v28.8h, v23.8h, v12.h[7]    \n"
                        "fmla   v29.8h, v23.8h, v13.h[7]    \n"
                        "fmla   v30.8h, v23.8h, v14.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v15.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v28.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[1]     \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%3] \n" // r24 r25 r26 r27

                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v28.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v28.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v28.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v28.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v28.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v28.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v28.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v28.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v28.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v5.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v28.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v28.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v5.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v6.h[0]     \n"
                        "fmla   v28.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v6.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v6.h[2]     \n"
                        "fmla   v28.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v5.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v6.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v6.h[4]     \n"
                        "fmla   v28.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v6.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v6.h[6]     \n"
                        "fmla   v28.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v5.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v6.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v6.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v7.h[0]     \n"
                        "fmla   v28.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v6.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v7.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v8.8h, v9.8h, v10.8h, v11.8h}, [%4], #64 \n" // r30 r31 r32 r33

                        "fmla   v28.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v6.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v7.h[2]     \n"
                        "fmla   v28.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v5.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v6.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v7.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v6.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v7.h[4]     \n"
                        "fmla   v28.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v6.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v7.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v6.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v7.h[6]     \n"
                        "fmla   v28.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v5.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v6.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v7.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v8.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v28.8h, v17.8h, v8.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v9.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v11.h[1]    \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%4] \n" // r34 r35 r36 r37

                        "fmla   v28.8h, v18.8h, v8.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v28.8h, v19.8h, v8.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v9.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v10.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v11.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v8.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v28.8h, v21.8h, v8.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v8.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v28.8h, v23.8h, v8.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v9.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v10.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v11.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v30.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v28.8h, v17.8h, v9.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v30.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v28.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v30.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v28.8h, v19.8h, v9.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v10.h[3]    \n"
                        "fmla   v30.8h, v19.8h, v11.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v12.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v30.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v28.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v30.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v30.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v28.8h, v23.8h, v9.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v10.h[7]    \n"
                        "fmla   v30.8h, v23.8h, v11.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v12.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v29.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v30.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v13.h[0]    \n"
                        "fmla   v28.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v29.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v30.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v13.h[1]    \n"
                        "fmla   v28.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v29.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v30.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v13.h[2]    \n"
                        "fmla   v28.8h, v19.8h, v10.h[3]    \n"
                        "fmla   v29.8h, v19.8h, v11.h[3]    \n"
                        "fmla   v30.8h, v19.8h, v12.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v13.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v29.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v30.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v13.h[4]    \n"
                        "fmla   v28.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v29.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v30.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v13.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v29.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v30.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v13.h[6]    \n"
                        "fmla   v28.8h, v23.8h, v10.h[7]    \n"
                        "fmla   v29.8h, v23.8h, v11.h[7]    \n"
                        "fmla   v30.8h, v23.8h, v12.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v13.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v29.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v30.8h, v16.8h, v13.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v14.h[0]    \n"
                        "fmla   v28.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v29.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v30.8h, v17.8h, v13.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v14.h[1]    \n"
                        "fmla   v28.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v29.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v30.8h, v18.8h, v13.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v14.h[2]    \n"
                        "fmla   v28.8h, v19.8h, v11.h[3]    \n"
                        "fmla   v29.8h, v19.8h, v12.h[3]    \n"
                        "fmla   v30.8h, v19.8h, v13.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v14.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v29.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v30.8h, v20.8h, v13.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v14.h[4]    \n"
                        "fmla   v28.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v29.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v30.8h, v21.8h, v13.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v14.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v29.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v30.8h, v22.8h, v13.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v14.h[6]    \n"
                        "fmla   v28.8h, v23.8h, v11.h[7]    \n"
                        "fmla   v29.8h, v23.8h, v12.h[7]    \n"
                        "fmla   v30.8h, v23.8h, v13.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v14.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v29.8h, v16.8h, v13.h[0]    \n"
                        "fmla   v30.8h, v16.8h, v14.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v15.h[0]    \n"
                        "fmla   v28.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v29.8h, v17.8h, v13.h[1]    \n"
                        "fmla   v30.8h, v17.8h, v14.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v15.h[1]    \n"

                        "prfm   pldl1keep, [%5, #512]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%5], #64 \n" // r40 r41 r42 r43

                        "fmla   v28.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v29.8h, v18.8h, v13.h[2]    \n"
                        "fmla   v30.8h, v18.8h, v14.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v15.h[2]    \n"
                        "fmla   v28.8h, v19.8h, v12.h[3]    \n"
                        "fmla   v29.8h, v19.8h, v13.h[3]    \n"
                        "fmla   v30.8h, v19.8h, v14.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v15.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v29.8h, v20.8h, v13.h[4]    \n"
                        "fmla   v30.8h, v20.8h, v14.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v15.h[4]    \n"
                        "fmla   v28.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v29.8h, v21.8h, v13.h[5]    \n"
                        "fmla   v30.8h, v21.8h, v14.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v15.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v29.8h, v22.8h, v13.h[6]    \n"
                        "fmla   v30.8h, v22.8h, v14.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v15.h[6]    \n"
                        "fmla   v28.8h, v23.8h, v12.h[7]    \n"
                        "fmla   v29.8h, v23.8h, v13.h[7]    \n"
                        "fmla   v30.8h, v23.8h, v14.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v15.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v28.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[1]     \n"

                        "prfm   pldl1keep, [%5, #512]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%5] \n" // r44 r45 r46 r47

                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v28.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v28.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v28.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v28.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v28.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v28.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v28.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v28.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v28.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v5.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v28.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v28.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v5.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v6.h[0]     \n"
                        "fmla   v28.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v6.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v6.h[2]     \n"
                        "fmla   v28.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v5.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v6.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v6.h[4]     \n"
                        "fmla   v28.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v6.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v6.h[6]     \n"
                        "fmla   v28.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v5.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v6.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6] \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v6.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v7.h[0]     \n"
                        "fmla   v28.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v30.8h, v17.8h, v6.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v7.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v6.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v7.h[2]     \n"
                        "fmla   v28.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v5.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v6.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v7.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v6.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v7.h[4]     \n"
                        "fmla   v28.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v6.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v7.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v6.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v7.h[6]     \n"
                        "fmla   v28.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v29.8h, v23.8h, v5.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v6.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v7.h[7]     \n"

                        "sub    %6, %6, #3136               \n" // kptr -= 24.5 * 64;

                        "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31");
                }
                for (; j + 1 < outw; j += 2)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v0.8h, v1.8h}, [%1], #32   \n" // r00 r01

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v30.8h, v31.8h}, [%0]      \n" // sum0 sum1

                        "fmul   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmul   v29.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v1.h[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v1.h[5]     \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v2.8h, v3.8h, v4.8h, v5.8h}, [%1] \n" // r02 r03 r04 r05

                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v5.h[1]     \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v8.8h, v9.8h}, [%2], #32   \n" // r10 r11

                        "fmla   v28.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v5.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v5.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v16.8h, v8.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v8.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v9.h[1]     \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v10.8h, v11.8h, v12.8h, v13.8h}, [%2] \n" // r12 r13 r14 r15

                        "fmla   v30.8h, v18.8h, v8.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v8.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v9.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v20.8h, v8.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v8.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v8.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v8.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v9.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v9.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v30.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v9.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v10.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v9.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v10.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v30.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v10.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v11.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v10.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v11.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v30.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v11.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v12.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v11.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v12.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v13.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v13.h[1]    \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.8h, v1.8h}, [%3], #32   \n" // r20 r21

                        "fmla   v30.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v13.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v12.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v13.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v13.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v13.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v13.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v12.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v13.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v1.h[1]     \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v2.8h, v3.8h, v4.8h, v5.8h}, [%3] \n" // r22 r23 r24 r25

                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v5.h[1]     \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v8.8h, v9.8h}, [%4], #32   \n" // r30 r31

                        "fmla   v28.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v5.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v5.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v16.8h, v8.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v8.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v9.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v10.8h, v11.8h, v12.8h, v13.8h}, [%4] \n" // r32 r33 r34 r35

                        "fmla   v30.8h, v18.8h, v8.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v8.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v9.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v20.8h, v8.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v8.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v8.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v8.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v9.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v9.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v30.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v9.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v10.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v9.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v10.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v30.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v10.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v11.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v10.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v11.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v30.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v11.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v12.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v11.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v12.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v31.8h, v16.8h, v13.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v13.h[1]    \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v0.8h, v1.8h}, [%5], #32   \n" // r40 r41

                        "fmla   v30.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v31.8h, v18.8h, v13.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v12.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v13.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v30.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v31.8h, v20.8h, v13.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v13.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v31.8h, v22.8h, v13.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v12.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v13.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v1.h[1]     \n"

                        "prfm   pldl1keep, [%5, #512]       \n"
                        "ld1    {v2.8h, v3.8h, v4.8h, v5.8h}, [%5] \n" // r42 r43 r44 r45

                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6] \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v5.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v5.h[7]     \n"

                        "fadd   v28.8h, v28.8h, v30.8h      \n"
                        "fadd   v29.8h, v29.8h, v31.8h      \n"

                        "sub    %6, %6, #3136               \n" // kptr -= 24.5 * 64;

                        "st1    {v28.8h, v29.8h}, [%0], #32 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31");
                }
                for (; j < outw; j++)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v0.8h}, [%1], #16          \n" // r00

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v31.8h}, [%0]              \n" // sum0

                        "fmul   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmul   v29.8h, v17.8h, v0.h[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmul   v30.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v0.h[5]     \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v1.8h, v2.8h, v3.8h, v4.8h}, [%1] \n" // r01 r02 r03 r04

                        "fmla   v30.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v4.h[1]     \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v8.8h}, [%2], #16          \n" // r10

                        "fmla   v30.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v8.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v8.h[1]     \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v9.8h, v10.8h, v11.8h, v12.8h}, [%2] \n" // r11 r12 r13 r14

                        "fmla   v30.8h, v18.8h, v8.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v8.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v8.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v8.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v8.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v8.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v9.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v9.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v9.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v29.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v30.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v31.8h, v19.8h, v10.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v29.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v31.8h, v23.8h, v10.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v29.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v30.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v31.8h, v19.8h, v11.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v29.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v31.8h, v23.8h, v11.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v29.8h, v17.8h, v12.h[1]    \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v0.8h}, [%3], #16          \n" // r20

                        "fmla   v30.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v31.8h, v19.8h, v12.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v29.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v31.8h, v23.8h, v12.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v0.h[1]     \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v1.8h, v2.8h, v3.8h, v4.8h}, [%3] \n" // r21 r22 r23 r24

                        "fmla   v30.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v4.h[1]     \n"

                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v8.8h}, [%4], #16          \n" // r30

                        "fmla   v30.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v8.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v8.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v9.8h, v10.8h, v11.8h, v12.8h}, [%4] \n" // r31 r32 r33 r34

                        "fmla   v30.8h, v18.8h, v8.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v8.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v8.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v8.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v8.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v8.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v9.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v9.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v9.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v29.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v30.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v31.8h, v19.8h, v10.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v29.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v31.8h, v23.8h, v10.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v29.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v30.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v31.8h, v19.8h, v11.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v29.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v31.8h, v23.8h, v11.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v29.8h, v17.8h, v12.h[1]    \n"

                        "prfm   pldl1keep, [%5, #128]       \n"
                        "ld1    {v0.8h}, [%5], #16          \n" // r40

                        "fmla   v30.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v31.8h, v19.8h, v12.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v29.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v31.8h, v23.8h, v12.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v0.h[1]     \n"

                        "prfm   pldl1keep, [%5, #512]       \n"
                        "ld1    {v1.8h, v2.8h, v3.8h, v4.8h}, [%5] \n" // r41 r42 r43 r44

                        "fmla   v30.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6] \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "fadd   v28.8h, v28.8h, v29.8h      \n"
                        "fadd   v30.8h, v30.8h, v31.8h      \n"
                        "fadd   v28.8h, v28.8h, v30.8h      \n"

                        "sub    %6, %6, #3136               \n" // kptr -= 24.5 * 64;

                        "st1    {v28.8h}, [%0], #16         \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31");
                }

                r0 += 4 * 8;
                r1 += 4 * 8;
                r2 += 4 * 8;
                r3 += 4 * 8;
                r4 += 4 * 8;
            }
        }
    }
}

static void conv5x5s2_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int tailstep = (w - 2 * outw + w) * 8;

    const __fp16* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        float16x8_t _bias0 = bias ? vld1q_f16(bias + p * 8) : vdupq_n_f16(0.f);
        out0.fill(_bias0);

        int q = 0;
        for (; q < inch; q++)
        {
            __fp16* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            const __fp16* r0 = img0.row<const __fp16>(0);
            const __fp16* r1 = img0.row<const __fp16>(1);
            const __fp16* r2 = img0.row<const __fp16>(2);
            const __fp16* r3 = img0.row<const __fp16>(3);
            const __fp16* r4 = img0.row<const __fp16>(4);

            const __fp16* kptr = kernel.channel(p).row<const __fp16>(q);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 1 < outw; j += 2)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n" // r00 r01 r02 r03

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v30.8h, v31.8h}, [%0]      \n" // sum0 sum1

                        "fmul   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmul   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v2.h[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[1]     \n"

                        "prfm   pldl1keep, [%1, #384]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h}, [%1] \n" // r04 r05 r06

                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v5.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v5.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v6.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v6.h[1]     \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v8.8h, v9.8h, v10.8h, v11.8h}, [%2], #64 \n" // r10 r11 r12 r13

                        "fmla   v28.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v6.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v6.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v6.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v6.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v6.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v6.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v8.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v8.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v28.8h, v18.8h, v8.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v8.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v10.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v8.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v8.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v8.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v8.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v10.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v9.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v11.h[1]    \n"

                        "prfm   pldl1keep, [%2, #384]       \n"
                        "ld1    {v12.8h, v13.8h, v14.8h}, [%2] \n" // r14 r15 r16

                        "fmla   v28.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v9.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v11.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v9.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v11.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v29.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v28.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v29.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v10.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v12.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v29.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v29.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v10.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v12.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v29.8h, v16.8h, v13.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v13.h[1]    \n"
                        "fmla   v28.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v29.8h, v18.8h, v13.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v11.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v13.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v29.8h, v20.8h, v13.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v13.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v29.8h, v22.8h, v13.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v11.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v13.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v29.8h, v16.8h, v14.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v14.h[1]    \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" // r20 r21 r22 r23

                        "fmla   v28.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v29.8h, v18.8h, v14.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v12.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v14.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v29.8h, v20.8h, v14.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v14.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v29.8h, v22.8h, v14.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v12.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v14.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[1]     \n"

                        "prfm   pldl1keep, [%3, #384]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h}, [%3] \n" // r24 r25 r26

                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v5.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v5.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v6.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v6.h[1]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v8.8h, v9.8h, v10.8h, v11.8h}, [%4], #64 \n" // r30 r31 r32 r33

                        "fmla   v28.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v6.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v6.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v6.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v6.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v6.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v6.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v8.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v8.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v28.8h, v18.8h, v8.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v8.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v10.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v8.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v8.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v8.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v8.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v10.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v9.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v11.h[1]    \n"

                        "prfm   pldl1keep, [%4, #384]       \n"
                        "ld1    {v12.8h, v13.8h, v14.8h}, [%4] \n" // r34 r35 r36

                        "fmla   v28.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v9.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v11.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v9.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v11.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v29.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v28.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v29.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v10.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v12.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v29.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v29.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v10.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v12.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v29.8h, v16.8h, v13.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v13.h[1]    \n"
                        "fmla   v28.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v29.8h, v18.8h, v13.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v11.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v13.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v29.8h, v20.8h, v13.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v13.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v29.8h, v22.8h, v13.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v11.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v13.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v29.8h, v16.8h, v14.h[0]    \n"
                        "fmla   v30.8h, v17.8h, v12.h[1]    \n"
                        "fmla   v31.8h, v17.8h, v14.h[1]    \n"

                        "prfm   pldl1keep, [%5, #512]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%5], #64 \n" // r40 r41 r42 r43

                        "fmla   v28.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v29.8h, v18.8h, v14.h[2]    \n"
                        "fmla   v30.8h, v19.8h, v12.h[3]    \n"
                        "fmla   v31.8h, v19.8h, v14.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v29.8h, v20.8h, v14.h[4]    \n"
                        "fmla   v30.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v31.8h, v21.8h, v14.h[5]    \n"
                        "fmla   v28.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v29.8h, v22.8h, v14.h[6]    \n"
                        "fmla   v30.8h, v23.8h, v12.h[7]    \n"
                        "fmla   v31.8h, v23.8h, v14.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[1]     \n"

                        "prfm   pldl1keep, [%5, #384]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h}, [%5] \n" // r44 r45 r46

                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v5.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v5.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v5.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v3.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v5.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v5.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v5.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v5.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v3.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v5.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6] \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v6.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v6.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v6.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v6.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v6.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v6.h[5]     \n"
                        "fmla   v28.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v6.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v4.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v6.h[7]     \n"

                        "fadd   v28.8h, v28.8h, v30.8h      \n"
                        "fadd   v29.8h, v29.8h, v31.8h      \n"

                        "sub    %6, %6, #3136               \n" // kptr -= 24.5 * 64;

                        "st1    {v28.8h, v29.8h}, [%0], #32 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31");
                }
                for (; j < outw; j++)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v0.8h, v1.8h}, [%1], #32   \n" // r00 r01

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v31.8h}, [%0]              \n" // sum0

                        "fmul   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmul   v29.8h, v17.8h, v0.h[1]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmul   v30.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[1]     \n"

                        "prfm   pldl1keep, [%1, #384]       \n"
                        "ld1    {v2.8h, v3.8h, v4.8h}, [%1] \n" // r02 r03 r04

                        "fmla   v30.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v4.h[1]     \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v8.8h, v9.8h}, [%2], #32   \n" // r10 r11

                        "fmla   v30.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v8.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v8.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v8.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v8.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v8.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v8.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v8.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v8.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v9.h[1]     \n"

                        "prfm   pldl1keep, [%2, #384]       \n"
                        "ld1    {v10.8h, v11.8h, v12.8h}, [%2] \n" // r12 r13 r14

                        "fmla   v30.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v9.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v9.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v29.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v30.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v31.8h, v19.8h, v10.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v29.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v31.8h, v23.8h, v10.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v29.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v30.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v31.8h, v19.8h, v11.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v29.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v31.8h, v23.8h, v11.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v29.8h, v17.8h, v12.h[1]    \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.8h, v1.8h}, [%3], #32   \n" // r20 r21

                        "fmla   v30.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v31.8h, v19.8h, v12.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v29.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v31.8h, v23.8h, v12.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[1]     \n"

                        "prfm   pldl1keep, [%3, #384]       \n"
                        "ld1    {v2.8h, v3.8h, v4.8h}, [%3] \n" // r22 r23 r24

                        "fmla   v30.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v4.h[1]     \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v8.8h, v9.8h}, [%4], #32   \n" // r30 r31

                        "fmla   v30.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v8.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v8.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v8.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v8.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v8.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v8.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v8.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v8.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v9.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v9.h[1]     \n"

                        "prfm   pldl1keep, [%4, #384]       \n"
                        "ld1    {v10.8h, v11.8h, v12.8h}, [%4] \n" // r32 r33 r34

                        "fmla   v30.8h, v18.8h, v9.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v9.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v9.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v9.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v9.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v9.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v10.h[0]    \n"
                        "fmla   v29.8h, v17.8h, v10.h[1]    \n"
                        "fmla   v30.8h, v18.8h, v10.h[2]    \n"
                        "fmla   v31.8h, v19.8h, v10.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v10.h[4]    \n"
                        "fmla   v29.8h, v21.8h, v10.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v10.h[6]    \n"
                        "fmla   v31.8h, v23.8h, v10.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v11.h[0]    \n"
                        "fmla   v29.8h, v17.8h, v11.h[1]    \n"
                        "fmla   v30.8h, v18.8h, v11.h[2]    \n"
                        "fmla   v31.8h, v19.8h, v11.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v11.h[4]    \n"
                        "fmla   v29.8h, v21.8h, v11.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v11.h[6]    \n"
                        "fmla   v31.8h, v23.8h, v11.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v12.h[0]    \n"
                        "fmla   v29.8h, v17.8h, v12.h[1]    \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v0.8h, v1.8h}, [%5], #32   \n" // r40 r41

                        "fmla   v30.8h, v18.8h, v12.h[2]    \n"
                        "fmla   v31.8h, v19.8h, v12.h[3]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v12.h[4]    \n"
                        "fmla   v29.8h, v21.8h, v12.h[5]    \n"
                        "fmla   v30.8h, v22.8h, v12.h[6]    \n"
                        "fmla   v31.8h, v23.8h, v12.h[7]    \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[1]     \n"

                        "prfm   pldl1keep, [%5, #384]       \n"
                        "ld1    {v2.8h, v3.8h, v4.8h}, [%5] \n" // r42 r43 r44

                        "fmla   v30.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v16.8h, v3.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v3.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v3.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n"

                        "fmla   v28.8h, v20.8h, v3.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v3.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v3.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[7]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6] \n"

                        "fmla   v28.8h, v16.8h, v4.h[0]     \n"
                        "fmla   v29.8h, v17.8h, v4.h[1]     \n"
                        "fmla   v30.8h, v18.8h, v4.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v4.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v4.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v4.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v4.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v4.h[7]     \n"

                        "fadd   v28.8h, v28.8h, v29.8h      \n"
                        "fadd   v30.8h, v30.8h, v31.8h      \n"
                        "fadd   v28.8h, v28.8h, v30.8h      \n"

                        "sub    %6, %6, #3136               \n" // kptr -= 24.5 * 64;

                        "st1    {v28.8h}, [%0], #16         \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(kptr)     // %6
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31");
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
                r4 += tailstep;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_7x7.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv7x7s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        out.fill(bias0);

        for (int q = 0; q < inch; q++)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);

            const float* kernel0 = kernel + p * inch * 49 + q * 49;

            const float* r0 = img0;
            const float* r1 = img0 + w;
            const float* r2 = img0 + w * 2;
            const float* r3 = img0 + w * 3;
            const float* r4 = img0 + w * 4;
            const float* r5 = img0 + w * 5;
            const float* r6 = img0 + w * 6;

            const float* k0 = kernel0;
            const float* k1 = kernel0 + 7;
            const float* k2 = kernel0 + 14;
            const float* k3 = kernel0 + 21;
            const float* k4 = kernel0 + 28;
            const float* k5 = kernel0 + 35;
            const float* k6 = kernel0 + 42;

            int i = 0;

            for (; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw - (nn << 2);
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                float32x4_t _k0123 = vld1q_f32(k0);
                float32x4_t _k4567 = vld1q_f32(k0 + 4);
                float32x4_t _k78910 = vld1q_f32(k1);
                float32x4_t _k11121314 = vld1q_f32(k1 + 4);
                float32x4_t _k14151617 = vld1q_f32(k2);
                float32x4_t _k18192021 = vld1q_f32(k2 + 4);
                float32x4_t _k21222324 = vld1q_f32(k3);
                float32x4_t _k25262728 = vld1q_f32(k3 + 4);
                float32x4_t _k28293031 = vld1q_f32(k4);
                float32x4_t _k32333435 = vld1q_f32(k4 + 4);
                float32x4_t _k35363738 = vld1q_f32(k5);
                float32x4_t _k39404142 = vld1q_f32(k5 + 4);
                float32x4_t _k42434445 = vld1q_f32(k6);
                float32x4_t _k46474849 = vld1q_f32(k6 + 4);
#ifdef __clang__ // __ARM_NEON && __aarch64__ && __clang__
                if (nn > 0)
                {
                    asm volatile(
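                        // v0       = accumulator: loaded from outptr (%1), holds the final output
                        // v1 v2 v3 = ri0 ri4 ri0n (columns 0-3, 4-7, 8-11 of input row i), i = 1..7 (pointers r0..r6)
                        // v4       = ri1 / ri3 / ri6 (row i shifted by 1, 3 and 6 columns, built with ext)
                        // v5       = ri2 / ri5 (row i shifted by 2 and 5 columns, built with ext)
                        // v9       = second partial sum, added into v0 right before the store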
                        "0:                                        \n"
                        "prfm       pldl1keep, [%1, #128]          \n"
                        "ld1        {v0.4s}, [%1]                  \n"

                        //i = 1
                        "prfm       pldl1keep, [%2, #384]          \n"
                        "ld1        {v1.4s, v2.4s, v3.4s}, [%2]    \n"
                        "add        %2, %2, #16                    \n"
                        "ext        v4.16b, v1.16b, v2.16b, #4     \n"
                        "fmul       v9.4s, v1.4s, %18.s[0]         \n"
                        "ext        v5.16b, v1.16b, v2.16b, #8     \n"
                        "fmla       v0.4s, v4.4s, %18.s[1]         \n"
                        "ext        v4.16b, v1.16b, v2.16b, #12    \n"
                        "fmla       v9.4s, v5.4s, %18.s[2]         \n"
                        "ext        v5.16b, v2.16b, v3.16b, #4     \n"
                        "fmla       v0.4s, v4.4s, %18.s[3]         \n"
                        "ext        v4.16b, v2.16b, v3.16b, #8     \n"
                        "fmla       v9.4s, v2.4s, %19.s[0]         \n"
                        "fmla       v0.4s, v5.4s, %19.s[1]         \n"
                        "fmla       v9.4s, v4.4s, %19.s[2]         \n"

                        //i = 2
                        "prfm       pldl1keep, [%3, #384]          \n"
                        "ld1        {v1.4s, v2.4s, v3.4s}, [%3]    \n" // v1 v2 v3: = r20 r24 r20n
                        "add        %3, %3, #16                    \n"
                        "ext        v4.16b, v1.16b, v2.16b, #4     \n" // v4 = r21
                        "fmla       v9.4s, v1.4s, %20.s[0]         \n" // *+ r10
                        "ext        v5.16b, v1.16b, v2.16b, #8     \n" // v5 = r22
                        "fmla       v0.4s, v4.4s, %20.s[1]         \n" // *+ r11
                        "ext        v4.16b, v1.16b, v2.16b, #12    \n" // v4 = r23
                        "fmla       v9.4s, v5.4s, %20.s[2]         \n" // *+ r12
                        "ext        v5.16b, v2.16b, v3.16b, #4     \n" // v5 = r25
                        "fmla       v0.4s, v4.4s, %20.s[3]         \n" // *+ r13
                        "ext        v4.16b, v2.16b, v3.16b, #8     \n" // v4 = r26
                        "fmla       v9.4s, v2.4s, %21.s[0]         \n" // *+ r14
                        "fmla       v0.4s, v5.4s, %21.s[1]         \n" // *+ r15
                        "fmla       v9.4s, v4.4s, %21.s[2]         \n" // *+ r16

                        //i = 3
                        "prfm       pldl1keep, [%4, #384]          \n"
                        "ld1        {v1.4s, v2.4s, v3.4s}, [%4]    \n"
                        "add        %4, %4, #16                    \n"
                        "ext        v4.16b, v1.16b, v2.16b, #4     \n"
                        "fmla       v9.4s, v1.4s, %22.s[0]         \n"
                        "ext        v5.16b, v1.16b, v2.16b, #8     \n"
                        "fmla       v0.4s, v4.4s, %22.s[1]         \n"
                        "ext        v4.16b, v1.16b, v2.16b, #12    \n"
                        "fmla       v9.4s, v5.4s, %22.s[2]         \n"
                        "ext        v5.16b, v2.16b, v3.16b, #4     \n"
                        "fmla       v0.4s, v4.4s, %22.s[3]         \n"
                        "ext        v4.16b, v2.16b, v3.16b, #8     \n"
                        "fmla       v9.4s, v2.4s, %23.s[0]         \n"
                        "fmla       v0.4s, v5.4s, %23.s[1]         \n"
                        "fmla       v9.4s, v4.4s, %23.s[2]         \n"

                        //i = 4
                        "prfm       pldl1keep, [%5, #384]          \n"
                        "ld1        {v1.4s, v2.4s, v3.4s}, [%5]    \n"
                        "add        %5, %5, #16                    \n"
                        "ext        v4.16b, v1.16b, v2.16b, #4     \n"
                        "fmla       v9.4s, v1.4s, %24.s[0]         \n"
                        "ext        v5.16b, v1.16b, v2.16b, #8     \n"
                        "fmla       v0.4s, v4.4s, %24.s[1]         \n"
                        "ext        v4.16b, v1.16b, v2.16b, #12    \n"
                        "fmla       v9.4s, v5.4s, %24.s[2]         \n"
                        "ext        v5.16b, v2.16b, v3.16b, #4     \n"
                        "fmla       v0.4s, v4.4s, %24.s[3]         \n"
                        "ext        v4.16b, v2.16b, v3.16b, #8     \n"
                        "fmla       v9.4s, v2.4s, %25.s[0]         \n"
                        "fmla       v0.4s, v5.4s, %25.s[1]         \n"
                        "fmla       v9.4s, v4.4s, %25.s[2]         \n"

                        //i = 5
                        "prfm       pldl1keep, [%6, #384]          \n"
                        "ld1        {v1.4s, v2.4s, v3.4s}, [%6]    \n"
                        "add        %6, %6, #16                    \n"
                        "ext        v4.16b, v1.16b, v2.16b, #4     \n"
                        "fmla       v9.4s, v1.4s, %26.s[0]         \n"
                        "ext        v5.16b, v1.16b, v2.16b, #8     \n"
                        "fmla       v0.4s, v4.4s, %26.s[1]         \n"
                        "ext        v4.16b, v1.16b, v2.16b, #12    \n"
                        "fmla       v9.4s, v5.4s, %26.s[2]         \n"
                        "ext        v5.16b, v2.16b, v3.16b, #4     \n"
                        "fmla       v0.4s, v4.4s, %26.s[3]         \n"
                        "ext        v4.16b, v2.16b, v3.16b, #8     \n"
                        "fmla       v9.4s, v2.4s, %27.s[0]         \n"
                        "fmla       v0.4s, v5.4s, %27.s[1]         \n"
                        "fmla       v9.4s, v4.4s, %27.s[2]         \n"

                        //i = 6
                        "prfm       pldl1keep, [%7, #384]          \n"
                        "ld1        {v1.4s, v2.4s, v3.4s}, [%7]    \n"
                        "add        %7, %7, #16                    \n"
                        "ext        v4.16b, v1.16b, v2.16b, #4     \n"
                        "fmla       v9.4s, v1.4s, %28.s[0]         \n"
                        "ext        v5.16b, v1.16b, v2.16b, #8     \n"
                        "fmla       v0.4s, v4.4s, %28.s[1]         \n"
                        "ext        v4.16b, v1.16b, v2.16b, #12    \n"
                        "fmla       v9.4s, v5.4s, %28.s[2]         \n"
                        "ext        v5.16b, v2.16b, v3.16b, #4     \n"
                        "fmla       v0.4s, v4.4s, %28.s[3]         \n"
                        "ext        v4.16b, v2.16b, v3.16b, #8     \n"
                        "fmla       v9.4s, v2.4s, %29.s[0]         \n"
                        "fmla       v0.4s, v5.4s, %29.s[1]         \n"
                        "fmla       v9.4s, v4.4s, %29.s[2]         \n"

                        //i = 7
                        "prfm       pldl1keep, [%8, #384]          \n"
                        "ld1        {v1.4s, v2.4s, v3.4s}, [%8]    \n"
                        "add        %8, %8, #16                    \n"
                        "ext        v4.16b, v1.16b, v2.16b, #4     \n"
                        "fmla       v9.4s, v1.4s, %30.s[0]         \n"
                        "ext        v5.16b, v1.16b, v2.16b, #8     \n"
                        "fmla       v0.4s, v4.4s, %30.s[1]         \n"
                        "ext        v4.16b, v1.16b, v2.16b, #12    \n"
                        "fmla       v9.4s, v5.4s, %30.s[2]         \n"
                        "ext        v5.16b, v2.16b, v3.16b, #4     \n"
                        "fmla       v0.4s, v4.4s, %30.s[3]         \n"
                        "ext        v4.16b, v2.16b, v3.16b, #8     \n"
                        "fmla       v9.4s, v2.4s, %31.s[0]         \n"
                        "fmla       v0.4s, v5.4s, %31.s[1]         \n"
                        "fmla       v9.4s, v4.4s, %31.s[2]         \n"

                        "fadd       v0.4s, v0.4s, v9.4s            \n"
                        "st1        {v0.4s}, [%1], #16             \n"
                        "subs       %w0, %w0, #1                   \n"
                        "bne        0b                             \n"

                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2),     // %4
                        "=r"(r3),     // %5
                        "=r"(r4),     // %6
                        "=r"(r5),     // %7
                        "=r"(r6)      // %8
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(r5),
                        "8"(r6),
                        "w"(_k0123),     // %18
                        "w"(_k4567),     // %19
                        "w"(_k78910),    // %20
                        "w"(_k11121314), // %21
                        "w"(_k14151617), // %22
                        "w"(_k18192021), // %23
                        "w"(_k21222324), // %24
                        "w"(_k25262728), // %25
                        "w"(_k28293031), // %26
                        "w"(_k32333435), // %27
                        "w"(_k35363738), // %28
                        "w"(_k39404142), // %29
                        "w"(_k42434445), // %30
                        "w"(_k46474849)  // %31
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v9");
                }
#else
                /**
                * __ARM_NEON && __aarch64__ are defined, but __clang__ is not.
                * GCC does not accept an inline asm statement with more than 30 operands
                * (the clang asm above uses 32), so this path uses NEON intrinsics instead.
                */
                for (; nn > 0; nn--)
                {
                    float32x4_t _sum = vld1q_f32(outptr);

                    float32x4_t _r00 = vld1q_f32(r0);             // 0 1 2 3
                    float32x4_t _r04 = vld1q_f32(r0 + 4);         // 4 5 6 7
                    float32x4_t _r00n = vld1q_f32(r0 + 8);        // 8 9 10 11
                    float32x4_t _r01 = vextq_f32(_r00, _r04, 1);  // 1 2 3 4
                    float32x4_t _r02 = vextq_f32(_r00, _r04, 2);  // 2 3 4 5
                    float32x4_t _r03 = vextq_f32(_r00, _r04, 3);  // 3 4 5 6
                    float32x4_t _r05 = vextq_f32(_r04, _r00n, 1); // 5 6 7 8
                    float32x4_t _r06 = vextq_f32(_r04, _r00n, 2); // 6 7 8 9

                    _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
                    _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
                    _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r05, _k4567, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r06, _k4567, 2);

                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r14 = vld1q_f32(r1 + 4);
                    float32x4_t _r10n = vld1q_f32(r1 + 8);
                    float32x4_t _r11 = vextq_f32(_r10, _r14, 1);
                    float32x4_t _r12 = vextq_f32(_r10, _r14, 2);
                    float32x4_t _r13 = vextq_f32(_r10, _r14, 3);
                    float32x4_t _r15 = vextq_f32(_r14, _r10n, 1);
                    float32x4_t _r16 = vextq_f32(_r14, _r10n, 2);

                    _sum = vfmaq_laneq_f32(_sum, _r10, _k78910, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r11, _k78910, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r12, _k78910, 2);
                    _sum = vfmaq_laneq_f32(_sum, _r13, _k78910, 3);
                    _sum = vfmaq_laneq_f32(_sum, _r14, _k11121314, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r15, _k11121314, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r16, _k11121314, 2);

                    float32x4_t _r20 = vld1q_f32(r2);
                    float32x4_t _r24 = vld1q_f32(r2 + 4);
                    float32x4_t _r20n = vld1q_f32(r2 + 8);
                    float32x4_t _r21 = vextq_f32(_r20, _r24, 1);
                    float32x4_t _r22 = vextq_f32(_r20, _r24, 2);
                    float32x4_t _r23 = vextq_f32(_r20, _r24, 3);
                    float32x4_t _r25 = vextq_f32(_r24, _r20n, 1);
                    float32x4_t _r26 = vextq_f32(_r24, _r20n, 2);

                    _sum = vfmaq_laneq_f32(_sum, _r20, _k14151617, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r21, _k14151617, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r22, _k14151617, 2);
                    _sum = vfmaq_laneq_f32(_sum, _r23, _k14151617, 3);
                    _sum = vfmaq_laneq_f32(_sum, _r24, _k18192021, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r25, _k18192021, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r26, _k18192021, 2);

                    float32x4_t _r30 = vld1q_f32(r3);
                    float32x4_t _r34 = vld1q_f32(r3 + 4);
                    float32x4_t _r30n = vld1q_f32(r3 + 8);
                    float32x4_t _r31 = vextq_f32(_r30, _r34, 1);
                    float32x4_t _r32 = vextq_f32(_r30, _r34, 2);
                    float32x4_t _r33 = vextq_f32(_r30, _r34, 3);
                    float32x4_t _r35 = vextq_f32(_r34, _r30n, 1);
                    float32x4_t _r36 = vextq_f32(_r34, _r30n, 2);

                    _sum = vfmaq_laneq_f32(_sum, _r30, _k21222324, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r31, _k21222324, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r32, _k21222324, 2);
                    _sum = vfmaq_laneq_f32(_sum, _r33, _k21222324, 3);
                    _sum = vfmaq_laneq_f32(_sum, _r34, _k25262728, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r35, _k25262728, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r36, _k25262728, 2);

                    float32x4_t _r40 = vld1q_f32(r4);
                    float32x4_t _r44 = vld1q_f32(r4 + 4);
                    float32x4_t _r40n = vld1q_f32(r4 + 8);
                    float32x4_t _r41 = vextq_f32(_r40, _r44, 1);
                    float32x4_t _r42 = vextq_f32(_r40, _r44, 2);
                    float32x4_t _r43 = vextq_f32(_r40, _r44, 3);
                    float32x4_t _r45 = vextq_f32(_r44, _r40n, 1);
                    float32x4_t _r46 = vextq_f32(_r44, _r40n, 2);

                    _sum = vfmaq_laneq_f32(_sum, _r40, _k28293031, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r41, _k28293031, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r42, _k28293031, 2);
                    _sum = vfmaq_laneq_f32(_sum, _r43, _k28293031, 3);
                    _sum = vfmaq_laneq_f32(_sum, _r44, _k32333435, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r45, _k32333435, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r46, _k32333435, 2);

                    float32x4_t _r50 = vld1q_f32(r5);
                    float32x4_t _r54 = vld1q_f32(r5 + 4);
                    float32x4_t _r50n = vld1q_f32(r5 + 8);
                    float32x4_t _r51 = vextq_f32(_r50, _r54, 1);
                    float32x4_t _r52 = vextq_f32(_r50, _r54, 2);
                    float32x4_t _r53 = vextq_f32(_r50, _r54, 3);
                    float32x4_t _r55 = vextq_f32(_r54, _r50n, 1);
                    float32x4_t _r56 = vextq_f32(_r54, _r50n, 2);

                    _sum = vfmaq_laneq_f32(_sum, _r50, _k35363738, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r51, _k35363738, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r52, _k35363738, 2);
                    _sum = vfmaq_laneq_f32(_sum, _r53, _k35363738, 3);
                    _sum = vfmaq_laneq_f32(_sum, _r54, _k39404142, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r55, _k39404142, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r56, _k39404142, 2);

                    float32x4_t _r60 = vld1q_f32(r6);
                    float32x4_t _r64 = vld1q_f32(r6 + 4);
                    float32x4_t _r60n = vld1q_f32(r6 + 8);
                    float32x4_t _r61 = vextq_f32(_r60, _r64, 1);
                    float32x4_t _r62 = vextq_f32(_r60, _r64, 2);
                    float32x4_t _r63 = vextq_f32(_r60, _r64, 3);
                    float32x4_t _r65 = vextq_f32(_r64, _r60n, 1);
                    float32x4_t _r66 = vextq_f32(_r64, _r60n, 2);

                    _sum = vfmaq_laneq_f32(_sum, _r60, _k42434445, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r61, _k42434445, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r62, _k42434445, 2);
                    _sum = vfmaq_laneq_f32(_sum, _r63, _k42434445, 3);
                    _sum = vfmaq_laneq_f32(_sum, _r64, _k46474849, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r65, _k46474849, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r66, _k46474849, 2);

                    vst1q_f32(outptr, _sum);

                    r0 += 4;
                    r1 += 4;
                    r2 += 4;
                    r3 += 4;
                    r4 += 4;
                    r5 += 4;
                    r6 += 4;
                    outptr += 4;
                }
#endif // __clang__
#else  //__aarch32__
                if (nn > 0)
                {
                    asm volatile(
                        "0:                             \n"

                        "pld        [%1, #256]          \n"
                        "vld1.f32   {d24-d25}, [%1]     \n" // _sum
                        //                     "veor       q13, q13            \n"// _sum2 = 0;
                        //                     "veor       q14, q14            \n"// _sum3 = 0;
                        //                     "veor       q15, q15            \n"// _sum4 = 0;

                        "pld        [%9, #256]          \n"
                        "vld1.f32   {d8-d11}, [%9]      \n" // q4 q5 = k0123 k4567
                        "add        %9, #28             \n"

                        "pld        [%2, #128]          \n"
                        "vld1.f32   {d0-d1}, [%2]!      \n" // q0 = 0  1  2  3
                        "vmla.f32   q12, q0, d8[0]      \n"

                        "pld        [%2, #256]          \n"
                        "vld1.f32   {d4-d7}, [%2]       \n" // q2 = 4  5  6  7  q3 = 8  9 10 11
                        "vmul.f32   q13, q2, d10[0]     \n"

                        "vext.32    q1, q0, q2, #1      \n" // q1 = 1  2  3  4
                        "vext.32    q10, q2, q3, #1     \n" // q10= 5  6  7  8
                        "vmul.f32   q14, q1, d8[1]      \n"
                        "vmul.f32   q15, q10, d10[1]    \n"

                        "vext.32    q8, q0, q2, #2      \n" // q8 = 2  3  4  5
                        "vext.32    q11, q2, q3, #2     \n" // q11= 6  7  8  9
                        "vmla.f32   q12, q8, d9[0]      \n"
                        "vmla.f32   q13, q11, d11[0]    \n"

                        "vext.32    q9, q0, q2, #3      \n" // q9 = 3  4  5  6
                        "vmla.f32   q14, q9, d9[1]      \n"

                        "pld        [%9, #256]          \n"
                        "vld1.f32   {d12-d15}, [%9]     \n" // q6 q7 = k78910 k11121314
                        "add        %9, #28             \n"

                        "pld        [%3, #128]          \n"
                        "vld1.f32   {d0-d1}, [%3]!      \n"
                        "vmla.f32   q15, q0, d12[0]     \n"

                        "pld        [%3, #256]          \n"
                        "vld1.f32   {d4-d7}, [%3]       \n"
                        "vmla.f32   q12, q2, d14[0]     \n"

                        "vext.32    q1, q0, q2, #1      \n"
                        "vext.32    q10, q2, q3, #1     \n"
                        "vmla.f32   q13, q1, d12[1]     \n"
                        "vmla.f32   q14, q10, d14[1]    \n"

                        "vext.32    q8, q0, q2, #2      \n"
                        "vext.32    q11, q2, q3, #2     \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vmla.f32   q12, q11, d15[0]    \n"

                        "vext.32    q9, q0, q2, #3      \n"
                        "vmla.f32   q13, q9, d13[1]     \n"

                        "pld        [%9, #256]          \n"
                        "vld1.f32   {d8-d11}, [%9]      \n" // q4 q5 = k14151617 k18192021
                        "add        %9, #28             \n"

                        "pld        [%4, #128]          \n"
                        "vld1.f32   {d0-d1}, [%4]!      \n"
                        "vmla.f32   q14, q0, d8[0]      \n"

                        "pld        [%4, #256]          \n"
                        "vld1.f32   {d4-d7}, [%4]       \n"
                        "vmla.f32   q15, q2, d10[0]     \n"

                        "vext.32    q1, q0, q2, #1      \n"
                        "vext.32    q10, q2, q3, #1     \n"
                        "vmla.f32   q12, q1, d8[1]      \n"
                        "vmla.f32   q13, q10, d10[1]    \n"

                        "vext.32    q8, q0, q2, #2      \n"
                        "vext.32    q11, q2, q3, #2     \n"
                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vmla.f32   q15, q11, d11[0]    \n"

                        "vext.32    q9, q0, q2, #3      \n"
                        "vmla.f32   q12, q9, d9[1]      \n"

                        "pld        [%9, #256]          \n"
                        "vld1.f32   {d12-d15}, [%9]     \n" // q6 q7 = k21222324 k25262728
                        "add        %9, #28             \n"

                        "pld        [%5, #128]          \n"
                        "vld1.f32   {d0-d1}, [%5]!      \n"
                        "vmla.f32   q13, q0, d12[0]     \n"

                        "pld        [%5, #256]          \n"
                        "vld1.f32   {d4-d7}, [%5]       \n"
                        "vmla.f32   q14, q2, d14[0]     \n"

                        "vext.32    q1, q0, q2, #1      \n"
                        "vext.32    q10, q2, q3, #1     \n"
                        "vmla.f32   q15, q1, d12[1]     \n"
                        "vmla.f32   q12, q10, d14[1]    \n"

                        "vext.32    q8, q0, q2, #2      \n"
                        "vext.32    q11, q2, q3, #2     \n"
                        "vmla.f32   q13, q8, d13[0]     \n"
                        "vmla.f32   q14, q11, d15[0]    \n"

                        "vext.32    q9, q0, q2, #3      \n"
                        "vmla.f32   q15, q9, d13[1]     \n"

                        "pld        [%9, #256]          \n"
                        "vld1.f32   {d8-d11}, [%9]      \n" // q4 q5 = k28293031 k32333435
                        "add        %9, #28             \n"

                        "pld        [%6, #128]          \n"
                        "vld1.f32   {d0-d1}, [%6]!      \n"
                        "vmla.f32   q12, q0, d8[0]      \n"

                        "pld        [%6, #256]          \n"
                        "vld1.f32   {d4-d7}, [%6]       \n"
                        "vmla.f32   q13, q2, d10[0]     \n"

                        "vext.32    q1, q0, q2, #1      \n"
                        "vext.32    q10, q2, q3, #1     \n"
                        "vmla.f32   q14, q1, d8[1]      \n"
                        "vmla.f32   q15, q10, d10[1]    \n"

                        "vext.32    q8, q0, q2, #2      \n"
                        "vext.32    q11, q2, q3, #2     \n"
                        "vmla.f32   q12, q8, d9[0]      \n"
                        "vmla.f32   q13, q11, d11[0]    \n"

                        "vext.32    q9, q0, q2, #3      \n"
                        "vmla.f32   q14, q9, d9[1]      \n"

                        "pld        [%9, #256]          \n"
                        "vld1.f32   {d12-d15}, [%9]     \n" // q6 q7 = k35363738 k39404142
                        "add        %9, #28             \n"

                        "pld        [%7, #128]          \n"
                        "vld1.f32   {d0-d1}, [%7]!      \n"
                        "vmla.f32   q15, q0, d12[0]     \n"

                        "pld        [%7, #256]          \n"
                        "vld1.f32   {d4-d7}, [%7]       \n"
                        "vmla.f32   q12, q2, d14[0]     \n"

                        "vext.32    q1, q0, q2, #1      \n"
                        "vext.32    q10, q2, q3, #1     \n"
                        "vmla.f32   q13, q1, d12[1]     \n"
                        "vmla.f32   q14, q10, d14[1]    \n"

                        "vext.32    q8, q0, q2, #2      \n"
                        "vext.32    q11, q2, q3, #2     \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vmla.f32   q12, q11, d15[0]    \n"

                        "vext.32    q9, q0, q2, #3      \n"
                        "vmla.f32   q13, q9, d13[1]     \n"

                        "pld        [%9, #256]          \n"
                        "vld1.f32   {d8-d11}, [%9]      \n" // q4 q5 = k42434445 k46474849
                        "sub        %9, #168            \n" // restore k0

                        "pld        [%8, #128]          \n"
                        "vld1.f32   {d0-d1}, [%8]!      \n"
                        "vmla.f32   q14, q0, d8[0]      \n"

                        "pld        [%8, #256]          \n"
                        "vld1.f32   {d4-d7}, [%8]       \n"
                        "vmla.f32   q15, q2, d10[0]     \n"

                        "vext.32    q1, q0, q2, #1      \n"
                        "vext.32    q10, q2, q3, #1     \n"
                        "vmla.f32   q12, q1, d8[1]      \n"
                        "vmla.f32   q13, q10, d10[1]    \n"

                        "vext.32    q8, q0, q2, #2      \n"
                        "vext.32    q11, q2, q3, #2     \n"
                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vmla.f32   q15, q11, d11[0]    \n"

                        "vext.32    q9, q0, q2, #3      \n"
                        "vmla.f32   q12, q9, d9[1]      \n"

                        "vadd.f32   q13, q13, q14       \n"
                        "vadd.f32   q13, q13, q15       \n"
                        "vadd.f32   q12, q12, q13       \n"

                        "vst1.f32   {d24-d25}, [%1]!    \n"

                        "subs       %0, #1              \n"
                        "bne        0b                  \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2),     // %4
                        "=r"(r3),     // %5
                        "=r"(r4),     // %6
                        "=r"(r5),     // %7
                        "=r"(r6),     // %8
                        "=r"(k0)      // %9
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(r5),
                        "8"(r6),
                        "9"(k0)
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON

                for (; remain > 0; remain--)
                {
                    float sum = 0;

                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r0[3] * k0[3];
                    sum += r0[4] * k0[4];
                    sum += r0[5] * k0[5];
                    sum += r0[6] * k0[6];

                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r1[3] * k1[3];
                    sum += r1[4] * k1[4];
                    sum += r1[5] * k1[5];
                    sum += r1[6] * k1[6];

                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];
                    sum += r2[3] * k2[3];
                    sum += r2[4] * k2[4];
                    sum += r2[5] * k2[5];
                    sum += r2[6] * k2[6];

                    sum += r3[0] * k3[0];
                    sum += r3[1] * k3[1];
                    sum += r3[2] * k3[2];
                    sum += r3[3] * k3[3];
                    sum += r3[4] * k3[4];
                    sum += r3[5] * k3[5];
                    sum += r3[6] * k3[6];

                    sum += r4[0] * k4[0];
                    sum += r4[1] * k4[1];
                    sum += r4[2] * k4[2];
                    sum += r4[3] * k4[3];
                    sum += r4[4] * k4[4];
                    sum += r4[5] * k4[5];
                    sum += r4[6] * k4[6];

                    sum += r5[0] * k5[0];
                    sum += r5[1] * k5[1];
                    sum += r5[2] * k5[2];
                    sum += r5[3] * k5[3];
                    sum += r5[4] * k5[4];
                    sum += r5[5] * k5[5];
                    sum += r5[6] * k5[6];

                    sum += r6[0] * k6[0];
                    sum += r6[1] * k6[1];
                    sum += r6[2] * k6[2];
                    sum += r6[3] * k6[3];
                    sum += r6[4] * k6[4];
                    sum += r6[5] * k6[5];
                    sum += r6[6] * k6[6];

                    *outptr += sum;

                    r0++;
                    r1++;
                    r2++;
                    r3++;
                    r4++;
                    r5++;
                    r6++;
                    outptr++;
                }

                r0 += 6;
                r1 += 6;
                r2 += 6;
                r3 += 6;
                r4 += 6;
                r5 += 6;
                r6 += 6;
            }
        }
    }
}

static void conv7x7s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int tailstep = w - 2 * outw + w;

    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        out.fill(bias0);

        for (int q = 0; q < inch; q++)
        {
            float* outptr = out;

            const float* img0 = bottom_blob.channel(q);

            const float* kernel0 = kernel + p * inch * 49 + q * 49;

            const float* r0 = img0;
            const float* r1 = img0 + w;
            const float* r2 = img0 + w * 2;
            const float* r3 = img0 + w * 3;
            const float* r4 = img0 + w * 4;
            const float* r5 = img0 + w * 5;
            const float* r6 = img0 + w * 6;

            const float* k0 = kernel0;
            const float* k1 = kernel0 + 7;
            const float* k2 = kernel0 + 14;
            const float* k3 = kernel0 + 21;
            const float* k4 = kernel0 + 28;
            const float* k5 = kernel0 + 35;
            const float* k6 = kernel0 + 42;

            int i = 0;

            for (; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw - (nn << 2);
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                float32x4_t _k0123 = vld1q_f32(k0);
                float32x4_t _k4567 = vld1q_f32(k0 + 4);
                float32x4_t _k78910 = vld1q_f32(k1);
                float32x4_t _k11121314 = vld1q_f32(k1 + 4);
                float32x4_t _k14151617 = vld1q_f32(k2);
                float32x4_t _k18192021 = vld1q_f32(k2 + 4);
                float32x4_t _k21222324 = vld1q_f32(k3);
                float32x4_t _k25262728 = vld1q_f32(k3 + 4);
                float32x4_t _k28293031 = vld1q_f32(k4);
                float32x4_t _k32333435 = vld1q_f32(k4 + 4);
                float32x4_t _k35363738 = vld1q_f32(k5);
                float32x4_t _k39404142 = vld1q_f32(k5 + 4);
                float32x4_t _k42434445 = vld1q_f32(k6);
                float32x4_t _k46474849 = vld1q_f32(k6 + 4);
#ifdef __clang__ // __ARM_NEON && __aarch64__ && __clang__
                if (nn > 0)
                {
                    asm volatile(
                        // v0:  input / final output
                        // v1 v2: = _ri0/_ri1  first
                        // v3 v4: =                  then _r0_8101214/_r0_9111315
                        // v5 = ri2 / ri4 / ri6
                        // v6 = ri3 / ri5
                        // v9 = intermediate sum register
                        "0:                                        \n"
                        "prfm       pldl1keep, [%1, #128]          \n"
                        "ld1        {v0.4s}, [%1]                  \n"

                        //i = 1
                        "prfm       pldl1keep, [%2, #512]          \n"
                        "ld2        {v1.4s, v2.4s}, [%2]           \n" // v1  v2 = _r00  _r01
                        "add        %2, %2, #32                    \n"
                        "ld2        {v3.4s, v4.4s}, [%2]           \n" // v3  v4 = _r0_8101214 / _r0_9111315
                        "fmul       v9.4s, v1.4s, %18.s[0]         \n" // *+ _r00
                        "ext        v5.16b, v1.16b, v3.16b, #4     \n" // v5 = _r02
                        "fmla       v0.4s, v2.4s, %18.s[1]         \n" // *+ _r01
                        "ext        v6.16b, v2.16b, v4.16b, #4     \n" // v6 = _r03
                        "fmla       v9.4s, v5.4s, %18.s[2]         \n" // *+ _r02
                        "ext        v5.16b, v1.16b, v3.16b, #8     \n" // v5 = _r04
                        "fmla       v0.4s, v6.4s, %18.s[3]         \n" // *+ _r03
                        "ext        v6.16b, v2.16b, v4.16b, #8     \n" // v6 = _r05
                        "fmla       v9.4s, v5.4s, %19.s[0]         \n" // *+ _r04
                        "ext        v5.16b, v1.16b, v3.16b, #12    \n" // v5 = _r06
                        "fmla       v0.4s, v6.4s, %19.s[1]         \n" // *+ _r05
                        "fmla       v9.4s, v5.4s, %19.s[2]         \n" // *+ _r06

                        //i = 2
                        "prfm       pldl1keep, [%3, #512]          \n"
                        "ld2        {v1.4s, v2.4s}, [%3]           \n"
                        "add        %3, %3, #32                    \n"
                        "ld2        {v3.4s, v4.4s}, [%3]           \n"
                        "fmla       v9.4s, v1.4s, %20.s[0]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #4     \n"
                        "fmla       v0.4s, v2.4s, %20.s[1]         \n"
                        "ext        v6.16b, v2.16b, v4.16b, #4     \n"
                        "fmla       v9.4s, v5.4s, %20.s[2]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #8     \n"
                        "fmla       v0.4s, v6.4s, %20.s[3]         \n"
                        "ext        v6.16b, v2.16b, v4.16b, #8     \n"
                        "fmla       v9.4s, v5.4s, %21.s[0]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #12    \n"
                        "fmla       v0.4s, v6.4s, %21.s[1]         \n"
                        "fmla       v9.4s, v5.4s, %21.s[2]         \n"

                        //i = 3
                        "prfm       pldl1keep, [%4, #512]          \n"
                        "ld2        {v1.4s, v2.4s}, [%4]           \n"
                        "add        %4, %4, #32                    \n"
                        "ld2        {v3.4s, v4.4s}, [%4]           \n"
                        "fmla       v9.4s, v1.4s, %22.s[0]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #4     \n"
                        "fmla       v0.4s, v2.4s, %22.s[1]         \n"
                        "ext        v6.16b, v2.16b, v4.16b, #4     \n"
                        "fmla       v9.4s, v5.4s, %22.s[2]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #8     \n"
                        "fmla       v0.4s, v6.4s, %22.s[3]         \n"
                        "ext        v6.16b, v2.16b, v4.16b, #8     \n"
                        "fmla       v9.4s, v5.4s, %23.s[0]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #12    \n"
                        "fmla       v0.4s, v6.4s, %23.s[1]         \n"
                        "fmla       v9.4s, v5.4s, %23.s[2]         \n"

                        //i = 4
                        "prfm       pldl1keep, [%5, #512]          \n"
                        "ld2        {v1.4s, v2.4s}, [%5]           \n"
                        "add        %5, %5, #32                    \n"
                        "ld2        {v3.4s, v4.4s}, [%5]           \n"
                        "fmla       v9.4s, v1.4s, %24.s[0]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #4     \n"
                        "fmla       v0.4s, v2.4s, %24.s[1]         \n"
                        "ext        v6.16b, v2.16b, v4.16b, #4     \n"
                        "fmla       v9.4s, v5.4s, %24.s[2]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #8     \n"
                        "fmla       v0.4s, v6.4s, %24.s[3]         \n"
                        "ext        v6.16b, v2.16b, v4.16b, #8     \n"
                        "fmla       v9.4s, v5.4s, %25.s[0]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #12    \n"
                        "fmla       v0.4s, v6.4s, %25.s[1]         \n"
                        "fmla       v9.4s, v5.4s, %25.s[2]         \n"

                        //i = 5
                        "prfm       pldl1keep, [%6, #512]          \n"
                        "ld2        {v1.4s, v2.4s}, [%6]           \n"
                        "add        %6, %6, #32                    \n"
                        "ld2        {v3.4s, v4.4s}, [%6]           \n"
                        "fmla       v9.4s, v1.4s, %26.s[0]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #4     \n"
                        "fmla       v0.4s, v2.4s, %26.s[1]         \n"
                        "ext        v6.16b, v2.16b, v4.16b, #4     \n"
                        "fmla       v9.4s, v5.4s, %26.s[2]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #8     \n"
                        "fmla       v0.4s, v6.4s, %26.s[3]         \n"
                        "ext        v6.16b, v2.16b, v4.16b, #8     \n"
                        "fmla       v9.4s, v5.4s, %27.s[0]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #12    \n"
                        "fmla       v0.4s, v6.4s, %27.s[1]         \n"
                        "fmla       v9.4s, v5.4s, %27.s[2]         \n"

                        //i = 6
                        "prfm       pldl1keep, [%7, #512]          \n"
                        "ld2        {v1.4s, v2.4s}, [%7]           \n"
                        "add        %7, %7, #32                    \n"
                        "ld2        {v3.4s, v4.4s}, [%7]           \n"
                        "fmla       v9.4s, v1.4s, %28.s[0]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #4     \n"
                        "fmla       v0.4s, v2.4s, %28.s[1]         \n"
                        "ext        v6.16b, v2.16b, v4.16b, #4     \n"
                        "fmla       v9.4s, v5.4s, %28.s[2]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #8     \n"
                        "fmla       v0.4s, v6.4s, %28.s[3]         \n"
                        "ext        v6.16b, v2.16b, v4.16b, #8     \n"
                        "fmla       v9.4s, v5.4s, %29.s[0]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #12    \n"
                        "fmla       v0.4s, v6.4s, %29.s[1]         \n"
                        "fmla       v9.4s, v5.4s, %29.s[2]         \n"

                        //i = 7
                        "prfm       pldl1keep, [%8, #512]          \n"
                        "ld2        {v1.4s, v2.4s}, [%8]           \n"
                        "add        %8, %8, #32                    \n"
                        "ld2        {v3.4s, v4.4s}, [%8]           \n"
                        "fmla       v9.4s, v1.4s, %30.s[0]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #4     \n"
                        "fmla       v0.4s, v2.4s, %30.s[1]         \n"
                        "ext        v6.16b, v2.16b, v4.16b, #4     \n"
                        "fmla       v9.4s, v5.4s, %30.s[2]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #8     \n"
                        "fmla       v0.4s, v6.4s, %30.s[3]         \n"
                        "ext        v6.16b, v2.16b, v4.16b, #8     \n"
                        "fmla       v9.4s, v5.4s, %31.s[0]         \n"
                        "ext        v5.16b, v1.16b, v3.16b, #12    \n"
                        "fmla       v0.4s, v6.4s, %31.s[1]         \n"
                        "fmla       v9.4s, v5.4s, %31.s[2]         \n"

                        "fadd       v0.4s, v0.4s, v9.4s            \n"
                        "st1        {v0.4s}, [%1], #16             \n"
                        "subs       %w0, %w0, #1                   \n"
                        "bne        0b                             \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2),     // %4
                        "=r"(r3),     // %5
                        "=r"(r4),     // %6
                        "=r"(r5),     // %7
                        "=r"(r6)      // %8
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(r5),
                        "8"(r6),
                        "w"(_k0123),     // %18
                        "w"(_k4567),     // %19
                        "w"(_k78910),    // %20
                        "w"(_k11121314), // %21
                        "w"(_k14151617), // %22
                        "w"(_k18192021), // %23
                        "w"(_k21222324), // %24
                        "w"(_k25262728), // %25
                        "w"(_k28293031), // %26
                        "w"(_k32333435), // %27
                        "w"(_k35363738), // %28
                        "w"(_k39404142), // %29
                        "w"(_k42434445), // %30
                        "w"(_k46474849)  // %31
                        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v9");
                }
#else
                /**
                * __ARM_NEON && __aarch64__ defined, but __clang__ not defined
                * When compiled with gcc, gcc does not accept over 30 operands
                */
                for (; nn > 0; nn--)
                {
                    float32x4_t _sum = vld1q_f32(outptr);

                    float32x4x2_t _r00_02461357 = vld2q_f32(r0);
                    float32x4x2_t _r00nx2 = vld2q_f32(r0 + 8);
                    float32x4_t _r0_8101214 = _r00nx2.val[0];           // 8 10 12 14
                    float32x4_t _r0_9111315 = _r00nx2.val[1];           // 9 11 13 15
                    float32x4_t _r00 = _r00_02461357.val[0];            // 0 2 4 6
                    float32x4_t _r01 = _r00_02461357.val[1];            // 1 3 5 7
                    float32x4_t _r02 = vextq_f32(_r00, _r0_8101214, 1); // 2 4 6 8
                    float32x4_t _r03 = vextq_f32(_r01, _r0_9111315, 1); // 3 5 7 9
                    float32x4_t _r04 = vextq_f32(_r00, _r0_8101214, 2); // 4 6 8 10
                    float32x4_t _r05 = vextq_f32(_r01, _r0_9111315, 2); // 5 7 9 11
                    float32x4_t _r06 = vextq_f32(_r00, _r0_8101214, 3); // 6 8 10 12

                    _sum = vfmaq_laneq_f32(_sum, _r00, _k0123, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r01, _k0123, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r02, _k0123, 2);
                    _sum = vfmaq_laneq_f32(_sum, _r03, _k0123, 3);
                    _sum = vfmaq_laneq_f32(_sum, _r04, _k4567, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r05, _k4567, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r06, _k4567, 2);

                    float32x4x2_t _r10_02461357 = vld2q_f32(r1);
                    float32x4x2_t _r10nx2 = vld2q_f32(r1 + 8);
                    float32x4_t _r1_8101214 = _r10nx2.val[0];
                    float32x4_t _r1_9111315 = _r10nx2.val[1];
                    float32x4_t _r10 = _r10_02461357.val[0];
                    float32x4_t _r11 = _r10_02461357.val[1];
                    float32x4_t _r12 = vextq_f32(_r10, _r1_8101214, 1);
                    float32x4_t _r13 = vextq_f32(_r11, _r1_9111315, 1);
                    float32x4_t _r14 = vextq_f32(_r10, _r1_8101214, 2);
                    float32x4_t _r15 = vextq_f32(_r11, _r1_9111315, 2);
                    float32x4_t _r16 = vextq_f32(_r10, _r1_8101214, 3);

                    _sum = vfmaq_laneq_f32(_sum, _r10, _k78910, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r11, _k78910, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r12, _k78910, 2);
                    _sum = vfmaq_laneq_f32(_sum, _r13, _k78910, 3);
                    _sum = vfmaq_laneq_f32(_sum, _r14, _k11121314, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r15, _k11121314, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r16, _k11121314, 2);

                    float32x4x2_t _r20_02461357 = vld2q_f32(r2);
                    float32x4x2_t _r20nx2 = vld2q_f32(r2 + 8);
                    float32x4_t _r2_8101214 = _r20nx2.val[0];
                    float32x4_t _r2_9111315 = _r20nx2.val[1];
                    float32x4_t _r20 = _r20_02461357.val[0];
                    float32x4_t _r21 = _r20_02461357.val[1];
                    float32x4_t _r22 = vextq_f32(_r20, _r2_8101214, 1);
                    float32x4_t _r23 = vextq_f32(_r21, _r2_9111315, 1);
                    float32x4_t _r24 = vextq_f32(_r20, _r2_8101214, 2);
                    float32x4_t _r25 = vextq_f32(_r21, _r2_9111315, 2);
                    float32x4_t _r26 = vextq_f32(_r20, _r2_8101214, 3);

                    _sum = vfmaq_laneq_f32(_sum, _r20, _k14151617, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r21, _k14151617, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r22, _k14151617, 2);
                    _sum = vfmaq_laneq_f32(_sum, _r23, _k14151617, 3);
                    _sum = vfmaq_laneq_f32(_sum, _r24, _k18192021, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r25, _k18192021, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r26, _k18192021, 2);

                    float32x4x2_t _r30_02461357 = vld2q_f32(r3);
                    float32x4x2_t _r30nx2 = vld2q_f32(r3 + 8);
                    float32x4_t _r3_8101214 = _r30nx2.val[0];
                    float32x4_t _r3_9111315 = _r30nx2.val[1];
                    float32x4_t _r30 = _r30_02461357.val[0];
                    float32x4_t _r31 = _r30_02461357.val[1];
                    float32x4_t _r32 = vextq_f32(_r30, _r3_8101214, 1);
                    float32x4_t _r33 = vextq_f32(_r31, _r3_9111315, 1);
                    float32x4_t _r34 = vextq_f32(_r30, _r3_8101214, 2);
                    float32x4_t _r35 = vextq_f32(_r31, _r3_9111315, 2);
                    float32x4_t _r36 = vextq_f32(_r30, _r3_8101214, 3);

                    _sum = vfmaq_laneq_f32(_sum, _r30, _k21222324, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r31, _k21222324, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r32, _k21222324, 2);
                    _sum = vfmaq_laneq_f32(_sum, _r33, _k21222324, 3);
                    _sum = vfmaq_laneq_f32(_sum, _r34, _k25262728, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r35, _k25262728, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r36, _k25262728, 2);

                    float32x4x2_t _r40_02461357 = vld2q_f32(r4);
                    float32x4x2_t _r40nx2 = vld2q_f32(r4 + 8);
                    float32x4_t _r4_8101214 = _r40nx2.val[0];
                    float32x4_t _r4_9111315 = _r40nx2.val[1];
                    float32x4_t _r40 = _r40_02461357.val[0];
                    float32x4_t _r41 = _r40_02461357.val[1];
                    float32x4_t _r42 = vextq_f32(_r40, _r4_8101214, 1);
                    float32x4_t _r43 = vextq_f32(_r41, _r4_9111315, 1);
                    float32x4_t _r44 = vextq_f32(_r40, _r4_8101214, 2);
                    float32x4_t _r45 = vextq_f32(_r41, _r4_9111315, 2);
                    float32x4_t _r46 = vextq_f32(_r40, _r4_8101214, 3);

                    _sum = vfmaq_laneq_f32(_sum, _r40, _k28293031, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r41, _k28293031, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r42, _k28293031, 2);
                    _sum = vfmaq_laneq_f32(_sum, _r43, _k28293031, 3);
                    _sum = vfmaq_laneq_f32(_sum, _r44, _k32333435, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r45, _k32333435, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r46, _k32333435, 2);

                    float32x4x2_t _r50_02461357 = vld2q_f32(r5);
                    float32x4x2_t _r50nx2 = vld2q_f32(r5 + 8);
                    float32x4_t _r5_8101214 = _r50nx2.val[0];
                    float32x4_t _r5_9111315 = _r50nx2.val[1];
                    float32x4_t _r50 = _r50_02461357.val[0];
                    float32x4_t _r51 = _r50_02461357.val[1];
                    float32x4_t _r52 = vextq_f32(_r50, _r5_8101214, 1);
                    float32x4_t _r53 = vextq_f32(_r51, _r5_9111315, 1);
                    float32x4_t _r54 = vextq_f32(_r50, _r5_8101214, 2);
                    float32x4_t _r55 = vextq_f32(_r51, _r5_9111315, 2);
                    float32x4_t _r56 = vextq_f32(_r50, _r5_8101214, 3);

                    _sum = vfmaq_laneq_f32(_sum, _r50, _k35363738, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r51, _k35363738, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r52, _k35363738, 2);
                    _sum = vfmaq_laneq_f32(_sum, _r53, _k35363738, 3);
                    _sum = vfmaq_laneq_f32(_sum, _r54, _k39404142, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r55, _k39404142, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r56, _k39404142, 2);

                    float32x4x2_t _r60_02461357 = vld2q_f32(r6);
                    float32x4x2_t _r60nx2 = vld2q_f32(r6 + 8);
                    float32x4_t _r6_8101214 = _r60nx2.val[0];
                    float32x4_t _r6_9111315 = _r60nx2.val[1];
                    float32x4_t _r60 = _r60_02461357.val[0];
                    float32x4_t _r61 = _r60_02461357.val[1];
                    float32x4_t _r62 = vextq_f32(_r60, _r6_8101214, 1);
                    float32x4_t _r63 = vextq_f32(_r61, _r6_9111315, 1);
                    float32x4_t _r64 = vextq_f32(_r60, _r6_8101214, 2);
                    float32x4_t _r65 = vextq_f32(_r61, _r6_9111315, 2);
                    float32x4_t _r66 = vextq_f32(_r60, _r6_8101214, 3);

                    _sum = vfmaq_laneq_f32(_sum, _r60, _k42434445, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r61, _k42434445, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r62, _k42434445, 2);
                    _sum = vfmaq_laneq_f32(_sum, _r63, _k42434445, 3);
                    _sum = vfmaq_laneq_f32(_sum, _r64, _k46474849, 0);
                    _sum = vfmaq_laneq_f32(_sum, _r65, _k46474849, 1);
                    _sum = vfmaq_laneq_f32(_sum, _r66, _k46474849, 2);

                    vst1q_f32(outptr, _sum);

                    r0 += 8;
                    r1 += 8;
                    r2 += 8;
                    r3 += 8;
                    r4 += 8;
                    r5 += 8;
                    r6 += 8;
                    outptr += 4;
                }
#endif // __clang__
#else
                if (nn > 0)
                {
                    asm volatile(
                        "0:                             \n"

                        "pld        [%1, #256]          \n"
                        "vld1.f32   {d26-d27}, [%1]     \n" // _sum
                        //                     "veor       q14, q14            \n"// _sum2 = 0;
                        //                     "veor       q15, q15            \n"// _sum3 = 0;

                        "pld        [%9, #256]          \n"
                        "vld1.f32   {d8-d11}, [%9]      \n" // q4 q5 = k0123 k4567
                        "add        %9, #28             \n"

                        "pld        [%2, #512]          \n"
                        "vld2.f32   {d0-d3}, [%2]!      \n" // q0 = 0  2  4  6  q1 = 1  3  5  7
                        "vmla.f32   q13, q0, d8[0]      \n"
                        "vmul.f32   q14, q1, d8[1]      \n"

                        "vld2.f32   {d4-d7}, [%2]       \n" // q2 = 8 10 12 14  q3 = 9 11 13 15
                        "vext.32    q8, q0, q2, #1      \n" // q8 = 2  4  6  8
                        "vext.32    q9, q1, q3, #1      \n" // q9 = 3  5  7  9
                        "vmul.f32   q15, q8, d9[0]      \n"
                        "vmla.f32   q13, q9, d9[1]      \n"

                        "vext.32    q10, q0, q2, #2     \n" // q10= 4  6  8 10
                        "vext.32    q11, q1, q3, #2     \n" // q11= 5  7  9 11
                        "vmla.f32   q14, q10, d10[0]    \n"
                        "vmla.f32   q15, q11, d10[1]    \n"

                        "vext.32    q12, q0, q2, #3     \n" // q12= 6  8 10 12
                        "vmla.f32   q13, q12, d11[0]    \n"

                        "pld        [%9, #256]          \n"
                        "vld1.f32   {d12-d15}, [%9]     \n" // q6 q7 = k78910 k11121314
                        "add        %9, #28             \n"

                        "pld        [%3, #512]          \n"
                        "vld2.f32   {d0-d3}, [%3]!      \n"
                        "vmla.f32   q14, q0, d12[0]     \n"
                        "vmla.f32   q15, q1, d12[1]     \n"

                        "vld2.f32   {d4-d7}, [%3]       \n"
                        "vext.32    q8, q0, q2, #1      \n"
                        "vext.32    q9, q1, q3, #1      \n"
                        "vmla.f32   q13, q8, d13[0]     \n"
                        "vmla.f32   q14, q9, d13[1]     \n"

                        "vext.32    q10, q0, q2, #2     \n"
                        "vext.32    q11, q1, q3, #2     \n"
                        "vmla.f32   q15, q10, d14[0]    \n"
                        "vmla.f32   q13, q11, d14[1]    \n"

                        "vext.32    q12, q0, q2, #3     \n"
                        "vmla.f32   q14, q12, d15[0]    \n"

                        "pld        [%9, #256]          \n"
                        "vld1.f32   {d8-d11}, [%9]      \n" // q4 q5 = k14151617 k18192021
                        "add        %9, #28             \n"

                        "pld        [%4, #512]          \n"
                        "vld2.f32   {d0-d3}, [%4]!      \n"
                        "vmla.f32   q15, q0, d8[0]      \n"
                        "vmla.f32   q13, q1, d8[1]      \n"

                        "vld2.f32   {d4-d7}, [%4]       \n"
                        "vext.32    q8, q0, q2, #1      \n"
                        "vext.32    q9, q1, q3, #1      \n"
                        "vmla.f32   q14, q8, d9[0]      \n"
                        "vmla.f32   q15, q9, d9[1]      \n"

                        "vext.32    q10, q0, q2, #2     \n"
                        "vext.32    q11, q1, q3, #2     \n"
                        "vmla.f32   q13, q10, d10[0]    \n"
                        "vmla.f32   q14, q11, d10[1]    \n"

                        "vext.32    q12, q0, q2, #3     \n"
                        "vmla.f32   q15, q12, d11[0]    \n"

                        "pld        [%9, #256]          \n"
                        "vld1.f32   {d12-d15}, [%9]     \n" // q6 q7 = k21222324 k25262728
                        "add        %9, #28             \n"

                        "pld        [%5, #512]          \n"
                        "vld2.f32   {d0-d3}, [%5]!      \n"
                        "vmla.f32   q13, q0, d12[0]     \n"
                        "vmla.f32   q14, q1, d12[1]     \n"

                        "vld2.f32   {d4-d7}, [%5]       \n"
                        "vext.32    q8, q0, q2, #1      \n"
                        "vext.32    q9, q1, q3, #1      \n"
                        "vmla.f32   q15, q8, d13[0]     \n"
                        "vmla.f32   q13, q9, d13[1]     \n"

                        "vext.32    q10, q0, q2, #2     \n"
                        "vext.32    q11, q1, q3, #2     \n"
                        "vmla.f32   q14, q10, d14[0]    \n"
                        "vmla.f32   q15, q11, d14[1]    \n"

                        "vext.32    q12, q0, q2, #3     \n"
                        "vmla.f32   q13, q12, d15[0]    \n"

                        "pld        [%9, #256]          \n"
                        "vld1.f32   {d8-d11}, [%9]      \n" // q4 q5 = k28293031 k32333435
                        "add        %9, #28             \n"

                        "pld        [%6, #512]          \n"
                        "vld2.f32   {d0-d3}, [%6]!      \n"
                        "vmla.f32   q14, q0, d8[0]      \n"
                        "vmla.f32   q15, q1, d8[1]      \n"

                        "vld2.f32   {d4-d7}, [%6]       \n"
                        "vext.32    q8, q0, q2, #1      \n"
                        "vext.32    q9, q1, q3, #1      \n"
                        "vmla.f32   q13, q8, d9[0]      \n"
                        "vmla.f32   q14, q9, d9[1]      \n"

                        "vext.32    q10, q0, q2, #2     \n"
                        "vext.32    q11, q1, q3, #2     \n"
                        "vmla.f32   q15, q10, d10[0]    \n"
                        "vmla.f32   q13, q11, d10[1]    \n"

                        "vext.32    q12, q0, q2, #3     \n"
                        "vmla.f32   q14, q12, d11[0]    \n"

                        "pld        [%9, #256]          \n"
                        "vld1.f32   {d12-d15}, [%9]     \n" // q6 q7 = k35363738 k39404142
                        "add        %9, #28             \n"

                        "pld        [%7, #512]          \n"
                        "vld2.f32   {d0-d3}, [%7]!      \n"
                        "vmla.f32   q15, q0, d12[0]     \n"
                        "vmla.f32   q13, q1, d12[1]     \n"

                        "vld2.f32   {d4-d7}, [%7]       \n"
                        "vext.32    q8, q0, q2, #1      \n"
                        "vext.32    q9, q1, q3, #1      \n"
                        "vmla.f32   q14, q8, d13[0]     \n"
                        "vmla.f32   q15, q9, d13[1]     \n"

                        "vext.32    q10, q0, q2, #2     \n"
                        "vext.32    q11, q1, q3, #2     \n"
                        "vmla.f32   q13, q10, d14[0]    \n"
                        "vmla.f32   q14, q11, d14[1]    \n"

                        "vext.32    q12, q0, q2, #3     \n"
                        "vmla.f32   q15, q12, d15[0]    \n"

                        "pld        [%9, #256]          \n"
                        "vld1.f32   {d8-d11}, [%9]      \n" // q4 q5 = k42434445 k46474849
                        "sub        %9, #168            \n" // restore k0

                        "pld        [%8, #512]          \n"
                        "vld2.f32   {d0-d3}, [%8]!      \n"
                        "vmla.f32   q13, q0, d8[0]      \n"
                        "vmla.f32   q14, q1, d8[1]      \n"

                        "vld2.f32   {d4-d7}, [%8]       \n"
                        "vext.32    q8, q0, q2, #1      \n"
                        "vext.32    q9, q1, q3, #1      \n"
                        "vmla.f32   q15, q8, d9[0]      \n"
                        "vmla.f32   q13, q9, d9[1]      \n"

                        "vext.32    q10, q0, q2, #2     \n"
                        "vext.32    q11, q1, q3, #2     \n"
                        "vmla.f32   q14, q10, d10[0]    \n"
                        "vmla.f32   q15, q11, d10[1]    \n"

                        "vext.32    q12, q0, q2, #3     \n"
                        "vmla.f32   q13, q12, d11[0]    \n"

                        "vadd.f32   q14, q14, q15       \n"
                        "vadd.f32   q13, q13, q14       \n"

                        "vst1.f32   {d26-d27}, [%1]!    \n"

                        "subs       %0, #1              \n"
                        "bne        0b                  \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr), // %1
                        "=r"(r0),     // %2
                        "=r"(r1),     // %3
                        "=r"(r2),     // %4
                        "=r"(r3),     // %5
                        "=r"(r4),     // %6
                        "=r"(r5),     // %7
                        "=r"(r6),     // %8
                        "=r"(k0)      // %9
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(r5),
                        "8"(r6),
                        "9"(k0)
                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON

                for (; remain > 0; remain--)
                {
                    float sum = 0;

                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r0[3] * k0[3];
                    sum += r0[4] * k0[4];
                    sum += r0[5] * k0[5];
                    sum += r0[6] * k0[6];

                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r1[3] * k1[3];
                    sum += r1[4] * k1[4];
                    sum += r1[5] * k1[5];
                    sum += r1[6] * k1[6];

                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];
                    sum += r2[3] * k2[3];
                    sum += r2[4] * k2[4];
                    sum += r2[5] * k2[5];
                    sum += r2[6] * k2[6];

                    sum += r3[0] * k3[0];
                    sum += r3[1] * k3[1];
                    sum += r3[2] * k3[2];
                    sum += r3[3] * k3[3];
                    sum += r3[4] * k3[4];
                    sum += r3[5] * k3[5];
                    sum += r3[6] * k3[6];

                    sum += r4[0] * k4[0];
                    sum += r4[1] * k4[1];
                    sum += r4[2] * k4[2];
                    sum += r4[3] * k4[3];
                    sum += r4[4] * k4[4];
                    sum += r4[5] * k4[5];
                    sum += r4[6] * k4[6];

                    sum += r5[0] * k5[0];
                    sum += r5[1] * k5[1];
                    sum += r5[2] * k5[2];
                    sum += r5[3] * k5[3];
                    sum += r5[4] * k5[4];
                    sum += r5[5] * k5[5];
                    sum += r5[6] * k5[6];

                    sum += r6[0] * k6[0];
                    sum += r6[1] * k6[1];
                    sum += r6[2] * k6[2];
                    sum += r6[3] * k6[3];
                    sum += r6[4] * k6[4];
                    sum += r6[5] * k6[5];
                    sum += r6[6] * k6[6];

                    *outptr += sum;

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    r3 += 2;
                    r4 += 2;
                    r5 += 2;
                    r6 += 2;
                    outptr++;
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
                r4 += tailstep;
                r5 += tailstep;
                r6 += tailstep;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_7x7_pack1to4.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv7x7s2_pack1to4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int tailstep = w - 2 * outw + w;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
        out0.fill(_bias0);

        for (int q = 0; q < inch; q++)
        {
            float* outptr0 = out0.row(0);

            const Mat img0 = bottom_blob.channel(q);

            const float* r0 = img0.row(0);
            const float* r1 = img0.row(1);
            const float* r2 = img0.row(2);
            const float* r3 = img0.row(3);
            const float* r4 = img0.row(4);
            const float* r5 = img0.row(5);
            const float* r6 = img0.row(6);

            const float* kptr = (const float*)kernel.channel(p).row(q);

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;
#if __aarch64__
                // AArch64 fast path: 8 output columns per iteration, each column being
                // one float32x4 (4 packed output channels).
                //   v16-v23      : the 8 accumulators, loaded from outptr0 and stored back
                //   v0-v5/v6-v11 : alternating input-row buffers (16 floats consumed per
                //                  row, plus 8 look-ahead floats read without advancing)
                //   v24-v30      : the 7 kernel taps (4 floats each) of the current row
                // Lane indices step by 2 between neighbouring outputs (stride 2).
                for (; j + 7 < outw; j += 8)
                {
                    // Per iteration: outptr0 advances by 128 bytes, each of r0..r6 by
                    // 64 bytes, and kptr is rewound by the 784 bytes (7*7*4 floats) it
                    // consumed so the next iteration reuses the same kernel.
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n" // r0

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0] \n"

                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[2]     \n"

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v4.4s, v5.4s}, [%1]        \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[0]     \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[1]     \n"

                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v28.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v29.4s, v4.s[3]     \n"

                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v6.4s, v7.4s, v8.4s, v9.4s}, [%2], #64 \n" // r1

                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v30.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v30.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v10.4s, v11.4s}, [%2]      \n"

                        "fmla   v16.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v6.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v8.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v8.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v9.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v9.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v6.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v7.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v8.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v8.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v9.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v9.s[3]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v7.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v8.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v8.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v9.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v9.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v10.s[0]    \n"

                        "fmla   v16.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v7.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v8.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v8.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v9.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v9.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v10.s[1]    \n"

                        "fmla   v16.4s, v28.4s, v7.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v7.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v8.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v8.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v9.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v9.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v10.s[0]    \n"
                        "fmla   v23.4s, v28.4s, v10.s[2]    \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v29.4s, v7.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v7.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v8.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v8.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v9.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v9.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v10.s[1]    \n"
                        "fmla   v23.4s, v29.4s, v10.s[3]    \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r2

                        "fmla   v16.4s, v30.4s, v7.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v8.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v8.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v9.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v9.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v10.s[0]    \n"
                        "fmla   v22.4s, v30.4s, v10.s[2]    \n"
                        "fmla   v23.4s, v30.4s, v11.s[0]    \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v4.4s, v5.4s}, [%3]        \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[0]     \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[1]     \n"

                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v28.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v29.4s, v4.s[3]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v6.4s, v7.4s, v8.4s, v9.4s}, [%4], #64 \n" // r3

                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v30.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v30.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v10.4s, v11.4s}, [%4]      \n"

                        "fmla   v16.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v6.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v8.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v8.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v9.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v9.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v6.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v7.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v8.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v8.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v9.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v9.s[3]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v7.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v8.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v8.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v9.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v9.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v10.s[0]    \n"

                        "fmla   v16.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v7.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v8.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v8.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v9.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v9.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v10.s[1]    \n"

                        "fmla   v16.4s, v28.4s, v7.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v7.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v8.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v8.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v9.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v9.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v10.s[0]    \n"
                        "fmla   v23.4s, v28.4s, v10.s[2]    \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v29.4s, v7.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v7.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v8.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v8.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v9.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v9.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v10.s[1]    \n"
                        "fmla   v23.4s, v29.4s, v10.s[3]    \n"

                        "prfm   pldl1keep, [%5, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%5], #64 \n" // r4

                        "fmla   v16.4s, v30.4s, v7.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v8.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v8.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v9.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v9.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v10.s[0]    \n"
                        "fmla   v22.4s, v30.4s, v10.s[2]    \n"
                        "fmla   v23.4s, v30.4s, v11.s[0]    \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v4.4s, v5.4s}, [%5]        \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[0]     \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[1]     \n"

                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v28.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v29.4s, v4.s[3]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v6.4s, v7.4s, v8.4s, v9.4s}, [%6], #64 \n" // r5

                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v30.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v30.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v10.4s, v11.4s}, [%6]      \n"

                        "fmla   v16.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v6.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v8.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v8.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v9.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v9.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v6.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v7.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v8.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v8.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v9.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v9.s[3]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v7.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v8.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v8.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v9.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v9.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v10.s[0]    \n"

                        "fmla   v16.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v7.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v8.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v8.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v9.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v9.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v10.s[1]    \n"

                        "fmla   v16.4s, v28.4s, v7.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v7.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v8.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v8.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v9.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v9.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v10.s[0]    \n"
                        "fmla   v23.4s, v28.4s, v10.s[2]    \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v29.4s, v7.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v7.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v8.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v8.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v9.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v9.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v10.s[1]    \n"
                        "fmla   v23.4s, v29.4s, v10.s[3]    \n"

                        "prfm   pldl1keep, [%7, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%7], #64 \n" // r6

                        "fmla   v16.4s, v30.4s, v7.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v8.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v8.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v9.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v9.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v10.s[0]    \n"
                        "fmla   v22.4s, v30.4s, v10.s[2]    \n"
                        "fmla   v23.4s, v30.4s, v11.s[0]    \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v4.4s, v5.4s}, [%7]        \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[0]     \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[1]     \n"

                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v28.4s, v4.s[2]     \n"

                        "sub    %0, %0, #64                 \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v29.4s, v4.s[3]     \n"

                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v30.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v30.4s, v5.s[0]     \n"

                        "sub    %8, %8, #784                \n"

                        "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                        "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        // Every vector register written above must be listed here, including
                        // the second accumulator group v20-v23; otherwise the compiler may keep
                        // a live value in one of them across this statement.
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
                }
#endif // __aarch64__
                // Handle four output positions per iteration. Each position is a vector of
                // 4 floats: the accumulators are loaded from outptr0, updated, and stored
                // back, advancing outptr0 by 64 bytes.
                // For each of the seven input row pointers r0..r6, seven 4-float kernel
                // vectors are read from kptr (64 + 48 bytes) and multiplied by scalar input
                // lanes. Neighbouring outputs use input lanes two floats apart, and each row
                // pointer advances by 32 bytes (8 floats) per iteration.
                // kptr is rewound by 784 bytes (7 rows * 112 bytes) at the end of the asm, so
                // the same kernel data is read again on the next iteration.
                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    // AArch64 variant: v16-v19 hold the accumulators, v0-v3 / v4-v7 alternate
                    // as input-row buffers, v24-v30 hold the seven kernel vectors of a row.
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0] \n" // load the four output accumulators

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1] \n" // r0
                        "add    %1, %1, #32                 \n" // advance r0 by 32 bytes

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n" // kernel vectors 0-3 of this row

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n" // kernel vectors 4-6 of this row

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"

                        // The next row is loaded into v4-v7 while v0-v3 are still being consumed.
                        "prfm   pldl1keep, [%2, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2] \n" // r1
                        "add    %2, %2, #32                 \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v5.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%3, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3] \n" // r2
                        "add    %3, %3, #32                 \n"

                        "fmla   v16.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v6.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v6.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v6.s[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v6.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v6.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v6.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v7.s[0]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%4, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%4] \n" // r3
                        "add    %4, %4, #32                 \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v5.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%5, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%5] \n" // r4
                        "add    %5, %5, #32                 \n"

                        "fmla   v16.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v6.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v6.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v6.s[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v6.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v6.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v6.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v7.s[0]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%6, #512]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%6] \n" // r5
                        "add    %6, %6, #32                 \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v5.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%7, #512]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%7] \n" // r6
                        "add    %7, %7, #32                 \n"

                        "fmla   v16.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v6.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v6.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v6.s[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v6.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v6.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v6.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v7.s[0]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"
                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"
                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"

                        "sub    %8, %8, #784                \n" // rewind kptr: 7 rows * (64 + 48) bytes

                        "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n" // store results, advance outptr0

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
#else  // __aarch64__
                    // ARMv7 variant: q12-q15 hold the accumulators, d0-d6 hold 14 input floats
                    // of the current row, q5-q11 hold the seven kernel vectors of a row.
                    asm volatile(
                        "pld        [%0, #512]      \n"
                        "vldm       %0, {d24-d31}   \n" // load the four output accumulators

                        "pld        [%1, #256]      \n"
                        "vld1.f32   {d0-d3}, [%1]!  \n" // r0

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d10-d17}  \n" // kernel vectors 0-3 of this row

                        "vmla.f32   q12, q5, d0[0]  \n"
                        "vmla.f32   q13, q5, d1[0]  \n"
                        "vmla.f32   q14, q5, d2[0]  \n"
                        "vmla.f32   q15, q5, d3[0]  \n"

                        "pld        [%1, #192]      \n"
                        "vld1.f32   {d4-d6}, [%1]   \n" // six further floats of the row, no writeback

                        "vmla.f32   q12, q6, d0[1]  \n"
                        "vmla.f32   q13, q6, d1[1]  \n"
                        "vmla.f32   q14, q6, d2[1]  \n"
                        "vmla.f32   q15, q6, d3[1]  \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d18-d23}  \n" // kernel vectors 4-6 of this row

                        "vmla.f32   q12, q7, d1[0]  \n"
                        "vmla.f32   q13, q7, d2[0]  \n"
                        "vmla.f32   q14, q7, d3[0]  \n"
                        "vmla.f32   q15, q7, d4[0]  \n"
                        "vmla.f32   q12, q8, d1[1]  \n"
                        "vmla.f32   q13, q8, d2[1]  \n"
                        "vmla.f32   q14, q8, d3[1]  \n"
                        "vmla.f32   q15, q8, d4[1]  \n"
                        "vmla.f32   q12, q9, d2[0]  \n"
                        "vmla.f32   q13, q9, d3[0]  \n"
                        "vmla.f32   q14, q9, d4[0]  \n"
                        "vmla.f32   q15, q9, d5[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d10-d17}  \n"

                        "vmla.f32   q12, q10, d2[1] \n"
                        "vmla.f32   q13, q10, d3[1] \n"
                        "vmla.f32   q14, q10, d4[1] \n"
                        "vmla.f32   q15, q10, d5[1] \n"
                        "vmla.f32   q12, q11, d3[0] \n"
                        "vmla.f32   q13, q11, d4[0] \n"

                        // d0-d3 are reloaded with the next row here; the two multiplies that
                        // follow only read d5/d6, which still hold the previous row.
                        "pld        [%2, #256]      \n"
                        "vld1.f32   {d0-d3}, [%2]!  \n" // r1

                        "vmla.f32   q14, q11, d5[0] \n"
                        "vmla.f32   q15, q11, d6[0] \n"

                        "vmla.f32   q12, q5, d0[0]  \n"
                        "vmla.f32   q13, q5, d1[0]  \n"
                        "vmla.f32   q14, q5, d2[0]  \n"
                        "vmla.f32   q15, q5, d3[0]  \n"

                        "pld        [%2, #192]      \n"
                        "vld1.f32   {d4-d6}, [%2]   \n"

                        "vmla.f32   q12, q6, d0[1]  \n"
                        "vmla.f32   q13, q6, d1[1]  \n"
                        "vmla.f32   q14, q6, d2[1]  \n"
                        "vmla.f32   q15, q6, d3[1]  \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d18-d23}  \n"

                        "vmla.f32   q12, q7, d1[0]  \n"
                        "vmla.f32   q13, q7, d2[0]  \n"
                        "vmla.f32   q14, q7, d3[0]  \n"
                        "vmla.f32   q15, q7, d4[0]  \n"
                        "vmla.f32   q12, q8, d1[1]  \n"
                        "vmla.f32   q13, q8, d2[1]  \n"
                        "vmla.f32   q14, q8, d3[1]  \n"
                        "vmla.f32   q15, q8, d4[1]  \n"
                        "vmla.f32   q12, q9, d2[0]  \n"
                        "vmla.f32   q13, q9, d3[0]  \n"
                        "vmla.f32   q14, q9, d4[0]  \n"
                        "vmla.f32   q15, q9, d5[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d10-d17}  \n"

                        "vmla.f32   q12, q10, d2[1] \n"
                        "vmla.f32   q13, q10, d3[1] \n"
                        "vmla.f32   q14, q10, d4[1] \n"
                        "vmla.f32   q15, q10, d5[1] \n"
                        "vmla.f32   q12, q11, d3[0] \n"
                        "vmla.f32   q13, q11, d4[0] \n"

                        "pld        [%3, #256]      \n"
                        "vld1.f32   {d0-d3}, [%3]!  \n" // r2

                        "vmla.f32   q14, q11, d5[0] \n"
                        "vmla.f32   q15, q11, d6[0] \n"

                        "vmla.f32   q12, q5, d0[0]  \n"
                        "vmla.f32   q13, q5, d1[0]  \n"
                        "vmla.f32   q14, q5, d2[0]  \n"
                        "vmla.f32   q15, q5, d3[0]  \n"

                        "pld        [%3, #192]      \n"
                        "vld1.f32   {d4-d6}, [%3]   \n"

                        "vmla.f32   q12, q6, d0[1]  \n"
                        "vmla.f32   q13, q6, d1[1]  \n"
                        "vmla.f32   q14, q6, d2[1]  \n"
                        "vmla.f32   q15, q6, d3[1]  \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d18-d23}  \n"

                        "vmla.f32   q12, q7, d1[0]  \n"
                        "vmla.f32   q13, q7, d2[0]  \n"
                        "vmla.f32   q14, q7, d3[0]  \n"
                        "vmla.f32   q15, q7, d4[0]  \n"
                        "vmla.f32   q12, q8, d1[1]  \n"
                        "vmla.f32   q13, q8, d2[1]  \n"
                        "vmla.f32   q14, q8, d3[1]  \n"
                        "vmla.f32   q15, q8, d4[1]  \n"
                        "vmla.f32   q12, q9, d2[0]  \n"
                        "vmla.f32   q13, q9, d3[0]  \n"
                        "vmla.f32   q14, q9, d4[0]  \n"
                        "vmla.f32   q15, q9, d5[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d10-d17}  \n"

                        "vmla.f32   q12, q10, d2[1] \n"
                        "vmla.f32   q13, q10, d3[1] \n"
                        "vmla.f32   q14, q10, d4[1] \n"
                        "vmla.f32   q15, q10, d5[1] \n"
                        "vmla.f32   q12, q11, d3[0] \n"
                        "vmla.f32   q13, q11, d4[0] \n"

                        "pld        [%4, #256]      \n"
                        "vld1.f32   {d0-d3}, [%4]!  \n" // r3

                        "vmla.f32   q14, q11, d5[0] \n"
                        "vmla.f32   q15, q11, d6[0] \n"

                        "vmla.f32   q12, q5, d0[0]  \n"
                        "vmla.f32   q13, q5, d1[0]  \n"
                        "vmla.f32   q14, q5, d2[0]  \n"
                        "vmla.f32   q15, q5, d3[0]  \n"

                        "pld        [%4, #192]      \n"
                        "vld1.f32   {d4-d6}, [%4]   \n"

                        "vmla.f32   q12, q6, d0[1]  \n"
                        "vmla.f32   q13, q6, d1[1]  \n"
                        "vmla.f32   q14, q6, d2[1]  \n"
                        "vmla.f32   q15, q6, d3[1]  \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d18-d23}  \n"

                        "vmla.f32   q12, q7, d1[0]  \n"
                        "vmla.f32   q13, q7, d2[0]  \n"
                        "vmla.f32   q14, q7, d3[0]  \n"
                        "vmla.f32   q15, q7, d4[0]  \n"
                        "vmla.f32   q12, q8, d1[1]  \n"
                        "vmla.f32   q13, q8, d2[1]  \n"
                        "vmla.f32   q14, q8, d3[1]  \n"
                        "vmla.f32   q15, q8, d4[1]  \n"
                        "vmla.f32   q12, q9, d2[0]  \n"
                        "vmla.f32   q13, q9, d3[0]  \n"
                        "vmla.f32   q14, q9, d4[0]  \n"
                        "vmla.f32   q15, q9, d5[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d10-d17}  \n"

                        "vmla.f32   q12, q10, d2[1] \n"
                        "vmla.f32   q13, q10, d3[1] \n"
                        "vmla.f32   q14, q10, d4[1] \n"
                        "vmla.f32   q15, q10, d5[1] \n"
                        "vmla.f32   q12, q11, d3[0] \n"
                        "vmla.f32   q13, q11, d4[0] \n"

                        "pld        [%5, #256]      \n"
                        "vld1.f32   {d0-d3}, [%5]!  \n" // r4

                        "vmla.f32   q14, q11, d5[0] \n"
                        "vmla.f32   q15, q11, d6[0] \n"

                        "vmla.f32   q12, q5, d0[0]  \n"
                        "vmla.f32   q13, q5, d1[0]  \n"
                        "vmla.f32   q14, q5, d2[0]  \n"
                        "vmla.f32   q15, q5, d3[0]  \n"

                        "pld        [%5, #192]      \n"
                        "vld1.f32   {d4-d6}, [%5]   \n"

                        "vmla.f32   q12, q6, d0[1]  \n"
                        "vmla.f32   q13, q6, d1[1]  \n"
                        "vmla.f32   q14, q6, d2[1]  \n"
                        "vmla.f32   q15, q6, d3[1]  \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d18-d23}  \n"

                        "vmla.f32   q12, q7, d1[0]  \n"
                        "vmla.f32   q13, q7, d2[0]  \n"
                        "vmla.f32   q14, q7, d3[0]  \n"
                        "vmla.f32   q15, q7, d4[0]  \n"
                        "vmla.f32   q12, q8, d1[1]  \n"
                        "vmla.f32   q13, q8, d2[1]  \n"
                        "vmla.f32   q14, q8, d3[1]  \n"
                        "vmla.f32   q15, q8, d4[1]  \n"
                        "vmla.f32   q12, q9, d2[0]  \n"
                        "vmla.f32   q13, q9, d3[0]  \n"
                        "vmla.f32   q14, q9, d4[0]  \n"
                        "vmla.f32   q15, q9, d5[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d10-d17}  \n"

                        "vmla.f32   q12, q10, d2[1] \n"
                        "vmla.f32   q13, q10, d3[1] \n"
                        "vmla.f32   q14, q10, d4[1] \n"
                        "vmla.f32   q15, q10, d5[1] \n"
                        "vmla.f32   q12, q11, d3[0] \n"
                        "vmla.f32   q13, q11, d4[0] \n"

                        "pld        [%6, #256]      \n"
                        "vld1.f32   {d0-d3}, [%6]!  \n" // r5

                        "vmla.f32   q14, q11, d5[0] \n"
                        "vmla.f32   q15, q11, d6[0] \n"

                        "vmla.f32   q12, q5, d0[0]  \n"
                        "vmla.f32   q13, q5, d1[0]  \n"
                        "vmla.f32   q14, q5, d2[0]  \n"
                        "vmla.f32   q15, q5, d3[0]  \n"

                        "pld        [%6, #192]      \n"
                        "vld1.f32   {d4-d6}, [%6]   \n"

                        "vmla.f32   q12, q6, d0[1]  \n"
                        "vmla.f32   q13, q6, d1[1]  \n"
                        "vmla.f32   q14, q6, d2[1]  \n"
                        "vmla.f32   q15, q6, d3[1]  \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d18-d23}  \n"

                        "vmla.f32   q12, q7, d1[0]  \n"
                        "vmla.f32   q13, q7, d2[0]  \n"
                        "vmla.f32   q14, q7, d3[0]  \n"
                        "vmla.f32   q15, q7, d4[0]  \n"
                        "vmla.f32   q12, q8, d1[1]  \n"
                        "vmla.f32   q13, q8, d2[1]  \n"
                        "vmla.f32   q14, q8, d3[1]  \n"
                        "vmla.f32   q15, q8, d4[1]  \n"
                        "vmla.f32   q12, q9, d2[0]  \n"
                        "vmla.f32   q13, q9, d3[0]  \n"
                        "vmla.f32   q14, q9, d4[0]  \n"
                        "vmla.f32   q15, q9, d5[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d10-d17}  \n"

                        "vmla.f32   q12, q10, d2[1] \n"
                        "vmla.f32   q13, q10, d3[1] \n"
                        "vmla.f32   q14, q10, d4[1] \n"
                        "vmla.f32   q15, q10, d5[1] \n"
                        "vmla.f32   q12, q11, d3[0] \n"
                        "vmla.f32   q13, q11, d4[0] \n"

                        "pld        [%7, #256]      \n"
                        "vld1.f32   {d0-d3}, [%7]!  \n" // r6

                        "vmla.f32   q14, q11, d5[0] \n"
                        "vmla.f32   q15, q11, d6[0] \n"

                        "vmla.f32   q12, q5, d0[0]  \n"
                        "vmla.f32   q13, q5, d1[0]  \n"
                        "vmla.f32   q14, q5, d2[0]  \n"
                        "vmla.f32   q15, q5, d3[0]  \n"

                        "pld        [%7, #192]      \n"
                        "vld1.f32   {d4-d6}, [%7]   \n"

                        "vmla.f32   q12, q6, d0[1]  \n"
                        "vmla.f32   q13, q6, d1[1]  \n"
                        "vmla.f32   q14, q6, d2[1]  \n"
                        "vmla.f32   q15, q6, d3[1]  \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d18-d23}  \n"

                        "vmla.f32   q12, q7, d1[0]  \n"
                        "vmla.f32   q13, q7, d2[0]  \n"
                        "vmla.f32   q14, q7, d3[0]  \n"
                        "vmla.f32   q15, q7, d4[0]  \n"
                        "vmla.f32   q12, q8, d1[1]  \n"
                        "vmla.f32   q13, q8, d2[1]  \n"
                        "vmla.f32   q14, q8, d3[1]  \n"
                        "vmla.f32   q15, q8, d4[1]  \n"
                        "vmla.f32   q12, q9, d2[0]  \n"
                        "vmla.f32   q13, q9, d3[0]  \n"
                        "vmla.f32   q14, q9, d4[0]  \n"
                        "vmla.f32   q15, q9, d5[0]  \n"
                        "vmla.f32   q12, q10, d2[1] \n"
                        "vmla.f32   q13, q10, d3[1] \n"
                        "vmla.f32   q14, q10, d4[1] \n"
                        "vmla.f32   q15, q10, d5[1] \n"
                        "vmla.f32   q12, q11, d3[0] \n"
                        "vmla.f32   q13, q11, d4[0] \n"
                        "vmla.f32   q14, q11, d5[0] \n"
                        "vmla.f32   q15, q11, d6[0] \n"

                        "sub        %8, %8, #784    \n" // rewind kptr: 7 rows * (64 + 48) bytes

                        "vstm       %0!, {d24-d31}  \n" // store results, advance outptr0

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                // Two output positions per iteration (j advances by 2).
                //
                // Each asm block below:
                //   * loads two 4-float accumulators from outptr0,
                //   * for each of the seven row pointers r0..r6 reads nine consecutive floats
                //     and seven 4-float vectors from kptr; the first output uses floats 0..6
                //     of the row, the second output uses floats 2..8,
                //   * stores the two updated accumulators (32 bytes) and advances outptr0,
                //   * leaves every row pointer 16 bytes (4 floats) further along,
                //   * reads 7 * (64 + 48) = 784 bytes from kptr and rewinds kptr by 784, so
                //     kptr has the same value on exit as on entry.
                //
                // NOTE(review): the access pattern looks like a 7x7, horizontal-stride-2
                // convolution producing 4 packed output channels -- confirm against the
                // enclosing function, which is not visible from this section.
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    asm volatile(
                        // v16/v17: accumulators for output 0 / output 1, seeded from outptr0.
                        // v18/v19: second pair of partial sums (started with fmul), folded
                        //          into v16/v17 by the fadd pair at the end.
                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v16.4s, v17.4s}, [%0]      \n"

                        "prfm   pldl1keep, [%1, #384]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s}, [%1] \n" // r0
                        "add    %1, %1, #16                 \n"

                        // Kernel vectors arrive in groups of 4 (v24-v27) + 3 (v28-v30) per row.
                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmul   v18.4s, v24.4s, v0.s[0]     \n"
                        "fmul   v19.4s, v24.4s, v0.s[2]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v1.s[0]     \n"

                        // Row data alternates between v0-v2 and v4-v6 so the next row can be
                        // loaded while the current one is still being consumed.
                        "prfm   pldl1keep, [%2, #384]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s}, [%2] \n" // r1
                        "add    %2, %2, #16                 \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%3, #384]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s}, [%3] \n" // r2
                        "add    %3, %3, #16                 \n"

                        "fmla   v18.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v18.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v18.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v1.s[0]     \n"

                        "prfm   pldl1keep, [%4, #384]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s}, [%4] \n" // r3
                        "add    %4, %4, #16                 \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%5, #384]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s}, [%5] \n" // r4
                        "add    %5, %5, #16                 \n"

                        "fmla   v18.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v18.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v18.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v1.s[0]     \n"

                        "prfm   pldl1keep, [%6, #384]       \n"
                        "ld1    {v4.4s, v5.4s, v6.4s}, [%6] \n" // r5
                        "add    %6, %6, #16                 \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%7, #384]       \n"
                        "ld1    {v0.4s, v1.4s, v2.4s}, [%7] \n" // r6
                        "add    %7, %7, #16                 \n"

                        "fmla   v18.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v18.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v18.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v2.s[0]     \n"

                        // Fold the two partial-sum pairs together.
                        "fadd   v16.4s, v16.4s, v18.4s      \n"
                        "fadd   v17.4s, v17.4s, v19.4s      \n"

                        // Rewind kptr past the 7 * 112 bytes consumed above.
                        "sub    %8, %8, #784                \n"

                        "st1    {v16.4s, v17.4s}, [%0], #32 \n"

                        // Every pointer is both read and written: each "=r" output is tied to
                        // the matching numbered input below.
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        : "memory", "v0", "v1", "v2", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
#else  // __aarch64__
                    asm volatile(
                        // q14/q15: accumulators for output 0 / output 1, seeded from outptr0
                        //          (the ":128" qualifier asserts 128-bit alignment of the address).
                        // q12/q13: second pair of partial sums (started with vmul), folded
                        //          into q14/q15 by the vadd pair at the end.
                        "pld        [%0, #256]      \n"
                        "vld1.f32   {d28-d31}, [%0 :128] \n"

                        // Each row is read as 8 floats with post-increment (+32 bytes) plus a
                        // ninth float into d8[0] or d9[0]; the "sub #16" instructions near the
                        // end bring the net advance of each row pointer to 16 bytes.
                        "pld        [%1, #256]      \n"
                        "vld1.f32   {d0-d3}, [%1]!  \n" // r0
                        "vld1.f32   {d8[0]}, [%1]   \n"

                        // Kernel vectors arrive in groups of 4 (q5-q8) + 3 (q9-q11) per row.
                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d10-d17}  \n"

                        "vmul.f32   q12, q5, d0[0]  \n"
                        "vmul.f32   q13, q5, d1[0]  \n"
                        "vmla.f32   q14, q6, d0[1]  \n"
                        "vmla.f32   q15, q6, d1[1]  \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d18-d23}  \n"

                        "vmla.f32   q12, q7, d1[0]  \n"
                        "vmla.f32   q13, q7, d2[0]  \n"

                        "pld        [%2, #256]      \n"
                        "vld1.f32   {d4-d7}, [%2]!  \n" // r1
                        "vld1.f32   {d9[0]}, [%2]   \n"

                        "vmla.f32   q14, q8, d1[1]  \n"
                        "vmla.f32   q15, q8, d2[1]  \n"
                        "vmla.f32   q12, q9, d2[0]  \n"
                        "vmla.f32   q13, q9, d3[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d10-d17}  \n"

                        "vmla.f32   q14, q10, d2[1] \n"
                        "vmla.f32   q15, q10, d3[1] \n"
                        "vmla.f32   q12, q11, d3[0] \n"
                        "vmla.f32   q13, q11, d8[0] \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d18-d23}  \n"

                        "vmla.f32   q14, q5, d4[0]  \n"
                        "vmla.f32   q15, q5, d5[0]  \n"
                        "vmla.f32   q12, q6, d4[1]  \n"
                        "vmla.f32   q13, q6, d5[1]  \n"
                        "vmla.f32   q14, q7, d5[0]  \n"
                        "vmla.f32   q15, q7, d6[0]  \n"

                        "pld        [%3, #256]      \n"
                        "vld1.f32   {d0-d3}, [%3]!  \n" // r2
                        "vld1.f32   {d8[0]}, [%3]   \n"

                        "vmla.f32   q12, q8, d5[1]  \n"
                        "vmla.f32   q13, q8, d6[1]  \n"
                        "vmla.f32   q14, q9, d6[0]  \n"
                        "vmla.f32   q15, q9, d7[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d10-d17}  \n"

                        "vmla.f32   q12, q10, d6[1] \n"
                        "vmla.f32   q13, q10, d7[1] \n"
                        "vmla.f32   q14, q11, d7[0] \n"
                        "vmla.f32   q15, q11, d9[0] \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d18-d23}  \n"

                        "vmla.f32   q12, q5, d0[0]  \n"
                        "vmla.f32   q13, q5, d1[0]  \n"
                        "vmla.f32   q14, q6, d0[1]  \n"
                        "vmla.f32   q15, q6, d1[1]  \n"
                        "vmla.f32   q12, q7, d1[0]  \n"
                        "vmla.f32   q13, q7, d2[0]  \n"

                        "pld        [%4, #256]      \n"
                        "vld1.f32   {d4-d7}, [%4]!  \n" // r3
                        "vld1.f32   {d9[0]}, [%4]   \n"

                        "vmla.f32   q14, q8, d1[1]  \n"
                        "vmla.f32   q15, q8, d2[1]  \n"
                        "vmla.f32   q12, q9, d2[0]  \n"
                        "vmla.f32   q13, q9, d3[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d10-d17}  \n"

                        "vmla.f32   q14, q10, d2[1] \n"
                        "vmla.f32   q15, q10, d3[1] \n"
                        "vmla.f32   q12, q11, d3[0] \n"
                        "vmla.f32   q13, q11, d8[0] \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d18-d23}  \n"

                        "vmla.f32   q14, q5, d4[0]  \n"
                        "vmla.f32   q15, q5, d5[0]  \n"
                        "vmla.f32   q12, q6, d4[1]  \n"
                        "vmla.f32   q13, q6, d5[1]  \n"
                        "vmla.f32   q14, q7, d5[0]  \n"
                        "vmla.f32   q15, q7, d6[0]  \n"

                        "pld        [%5, #256]      \n"
                        "vld1.f32   {d0-d3}, [%5]!  \n" // r4
                        "vld1.f32   {d8[0]}, [%5]   \n"

                        "vmla.f32   q12, q8, d5[1]  \n"
                        "vmla.f32   q13, q8, d6[1]  \n"
                        "vmla.f32   q14, q9, d6[0]  \n"
                        "vmla.f32   q15, q9, d7[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d10-d17}  \n"

                        "vmla.f32   q12, q10, d6[1] \n"
                        "vmla.f32   q13, q10, d7[1] \n"
                        "vmla.f32   q14, q11, d7[0] \n"
                        "vmla.f32   q15, q11, d9[0] \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d18-d23}  \n"

                        "vmla.f32   q12, q5, d0[0]  \n"
                        "vmla.f32   q13, q5, d1[0]  \n"
                        "vmla.f32   q14, q6, d0[1]  \n"
                        "vmla.f32   q15, q6, d1[1]  \n"
                        "vmla.f32   q12, q7, d1[0]  \n"
                        "vmla.f32   q13, q7, d2[0]  \n"

                        "pld        [%6, #256]      \n"
                        "vld1.f32   {d4-d7}, [%6]!  \n" // r5
                        "vld1.f32   {d9[0]}, [%6]   \n"

                        "vmla.f32   q14, q8, d1[1]  \n"
                        "vmla.f32   q15, q8, d2[1]  \n"
                        "vmla.f32   q12, q9, d2[0]  \n"
                        "vmla.f32   q13, q9, d3[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d10-d17}  \n"

                        "vmla.f32   q14, q10, d2[1] \n"
                        "vmla.f32   q15, q10, d3[1] \n"
                        "vmla.f32   q12, q11, d3[0] \n"
                        "vmla.f32   q13, q11, d8[0] \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d18-d23}  \n"

                        "vmla.f32   q14, q5, d4[0]  \n"
                        "vmla.f32   q15, q5, d5[0]  \n"
                        "vmla.f32   q12, q6, d4[1]  \n"
                        "vmla.f32   q13, q6, d5[1]  \n"
                        "vmla.f32   q14, q7, d5[0]  \n"
                        "vmla.f32   q15, q7, d6[0]  \n"

                        "pld        [%7, #256]      \n"
                        "vld1.f32   {d0-d3}, [%7]!  \n" // r6
                        "vld1.f32   {d8[0]}, [%7]   \n"

                        "vmla.f32   q12, q8, d5[1]  \n"
                        "vmla.f32   q13, q8, d6[1]  \n"
                        "vmla.f32   q14, q9, d6[0]  \n"
                        "vmla.f32   q15, q9, d7[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d10-d17}  \n"

                        "vmla.f32   q12, q10, d6[1] \n"
                        "vmla.f32   q13, q10, d7[1] \n"
                        "vmla.f32   q14, q11, d7[0] \n"
                        "vmla.f32   q15, q11, d9[0] \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d18-d23}  \n"

                        "vmla.f32   q12, q5, d0[0]  \n"
                        "vmla.f32   q13, q5, d1[0]  \n"
                        "vmla.f32   q14, q6, d0[1]  \n"
                        "vmla.f32   q15, q6, d1[1]  \n"

                        // Pointer fix-ups are interleaved with the remaining multiply-accumulates:
                        // each row pointer was post-incremented by 32, so -16 leaves it at +16.
                        "sub        %1, %1, #16     \n"
                        "sub        %2, %2, #16     \n"

                        "vmla.f32   q12, q7, d1[0]  \n"
                        "vmla.f32   q13, q7, d2[0]  \n"
                        "vmla.f32   q14, q8, d1[1]  \n"
                        "vmla.f32   q15, q8, d2[1]  \n"

                        // Rewind kptr past the 7 * 112 bytes consumed above.
                        "sub        %8, %8, #784    \n"

                        "vmla.f32   q12, q9, d2[0]  \n"
                        "vmla.f32   q13, q9, d3[0]  \n"
                        "vmla.f32   q14, q10, d2[1] \n"
                        "vmla.f32   q15, q10, d3[1] \n"

                        "sub        %3, %3, #16     \n"
                        "sub        %4, %4, #16     \n"

                        "vmla.f32   q12, q11, d3[0] \n"
                        "vmla.f32   q13, q11, d8[0] \n"

                        "sub        %5, %5, #16     \n"
                        "sub        %6, %6, #16     \n"

                        // Fold the two partial-sum pairs together.
                        "vadd.f32   q14, q14, q12   \n"
                        "vadd.f32   q15, q15, q13   \n"

                        "sub        %7, %7, #16     \n"

                        "vst1.f32   {d28-d31}, [%0 :128]! \n"

                        // Every pointer is both read and written: each "=r" output is tied to
                        // the matching numbered input below.
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v16.4s}, [%0]              \n"

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%1]        \n" // r0

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmul   v17.4s, v24.4s, v0.s[0]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmul   v18.4s, v25.4s, v0.s[1]     \n"
                        "fmul   v19.4s, v26.4s, v0.s[2]     \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v4.4s, v5.4s}, [%2]        \n" // r1

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[0]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v18.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v30.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v18.4s, v26.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%3]        \n" // r2

                        "fmla   v19.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v17.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v18.4s, v30.4s, v5.s[2]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v19.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v26.4s, v0.s[2]     \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v4.4s, v5.4s}, [%4]        \n" // r3

                        "fmla   v18.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[0]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v30.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v18.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v19.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%5]        \n" // r4

                        "fmla   v17.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v18.4s, v28.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v19.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v17.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v18.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v19.4s, v26.4s, v0.s[2]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v4.4s, v5.4s}, [%6]        \n" // r5

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[0]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v18.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v30.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v18.4s, v26.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v0.4s, v1.4s}, [%7]        \n" // r6

                        "fmla   v19.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%8], #64 \n"

                        "fmla   v17.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v18.4s, v30.4s, v5.s[2]     \n"

                        "prfm   pldl1keep, [%8, #384]       \n"
                        "ld1    {v28.4s, v29.4s, v30.4s}, [%8], #48 \n"

                        "fmla   v19.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v26.4s, v0.s[2]     \n"

                        "add    %1, %1, #8                  \n"
                        "add    %2, %2, #8                  \n"

                        "fmla   v18.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v30.4s, v1.s[2]     \n"

                        "add    %3, %3, #8                  \n"
                        "add    %4, %4, #8                  \n"

                        "fadd   v18.4s, v18.4s, v19.4s      \n"

                        "add    %5, %5, #8                  \n"

                        "fadd   v16.4s, v16.4s, v17.4s      \n"

                        "add    %6, %6, #8                  \n"
                        "add    %7, %7, #8                  \n"

                        "fadd   v16.4s, v16.4s, v18.4s      \n"

                        "sub    %8, %8, #784                \n"

                        "st1    {v16.4s}, [%0], #16         \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        : "memory", "v0", "v1", "v4", "v5", "v16", "v17", "v18", "v19", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%0, #128]      \n"
                        "vld1.f32   {d8-d9}, [%0 :128] \n"

                        "pld        [%1, #256]      \n"
                        "vld1.f32   {d0-d3}, [%1]   \n" // r0

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d16-d23}  \n"

                        "vmul.f32   q5, q8, d0[0]   \n"
                        "vmul.f32   q6, q9, d0[1]   \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d24-d29}  \n"

                        "vmul.f32   q7, q10, d1[0]  \n"
                        "vmla.f32   q4, q11, d1[1]  \n"

                        "pld        [%2, #256]      \n"
                        "vld1.f32   {d4-d7}, [%2]   \n" // r1

                        "vmla.f32   q5, q12, d2[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d16-d23}  \n"

                        "vmla.f32   q6, q13, d2[1]  \n"
                        "vmla.f32   q7, q14, d3[0]  \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d24-d29}  \n"

                        "vmla.f32   q4, q8, d4[0]   \n"
                        "vmla.f32   q5, q9, d4[1]   \n"
                        "vmla.f32   q6, q10, d5[0]  \n"

                        "pld        [%3, #256]      \n"
                        "vld1.f32   {d0-d3}, [%3]   \n" // r2

                        "vmla.f32   q7, q11, d5[1]  \n"
                        "vmla.f32   q4, q12, d6[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d16-d23}  \n"

                        "vmla.f32   q5, q13, d6[1]  \n"
                        "vmla.f32   q6, q14, d7[0]  \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d24-d29}  \n"

                        "vmla.f32   q7, q8, d0[0]   \n"
                        "vmla.f32   q4, q9, d0[1]   \n"
                        "vmla.f32   q5, q10, d1[0]  \n"

                        "pld        [%4, #256]      \n"
                        "vld1.f32   {d4-d7}, [%4]   \n" // r3

                        "vmla.f32   q6, q11, d1[1]  \n"
                        "vmla.f32   q7, q12, d2[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d16-d23}  \n"

                        "vmla.f32   q4, q13, d2[1]  \n"
                        "vmla.f32   q5, q14, d3[0]  \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d24-d29}  \n"

                        "vmla.f32   q6, q8, d4[0]   \n"
                        "vmla.f32   q7, q9, d4[1]   \n"
                        "vmla.f32   q4, q10, d5[0]  \n"

                        "pld        [%5, #256]      \n"
                        "vld1.f32   {d0-d3}, [%5]   \n" // r4

                        "vmla.f32   q5, q11, d5[1]  \n"
                        "vmla.f32   q6, q12, d6[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d16-d23}  \n"

                        "vmla.f32   q7, q13, d6[1]  \n"
                        "vmla.f32   q4, q14, d7[0]  \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d24-d29}  \n"

                        "vmla.f32   q5, q8, d0[0]   \n"
                        "vmla.f32   q6, q9, d0[1]   \n"
                        "vmla.f32   q7, q10, d1[0]  \n"

                        "pld        [%6, #256]      \n"
                        "vld1.f32   {d4-d7}, [%6]   \n" // r5

                        "vmla.f32   q4, q11, d1[1]  \n"
                        "vmla.f32   q5, q12, d2[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d16-d23}  \n"

                        "vmla.f32   q6, q13, d2[1]  \n"
                        "vmla.f32   q7, q14, d3[0]  \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d24-d29}  \n"

                        "vmla.f32   q4, q8, d4[0]   \n"
                        "vmla.f32   q5, q9, d4[1]   \n"
                        "vmla.f32   q6, q10, d5[0]  \n"

                        "pld        [%7, #256]      \n"
                        "vld1.f32   {d0-d3}, [%7]   \n" // r6

                        "vmla.f32   q7, q11, d5[1]  \n"
                        "vmla.f32   q4, q12, d6[0]  \n"

                        "pld        [%8, #512]      \n"
                        "vldm       %8!, {d16-d23}  \n"

                        "vmla.f32   q5, q13, d6[1]  \n"
                        "vmla.f32   q6, q14, d7[0]  \n"

                        "pld        [%8, #384]      \n"
                        "vldm       %8!, {d24-d29}  \n"

                        "vmla.f32   q7, q8, d0[0]   \n"
                        "vmla.f32   q4, q9, d0[1]   \n"

                        "add        %1, %1, #8      \n"
                        "add        %2, %2, #8      \n"

                        "vmla.f32   q5, q10, d1[0]  \n"
                        "vmla.f32   q6, q11, d1[1]  \n"

                        "sub        %8, %8, #784    \n"

                        "vmla.f32   q7, q12, d2[0]  \n"
                        "vmla.f32   q4, q13, d2[1]  \n"
                        "vmla.f32   q5, q14, d3[0]  \n"

                        "add        %3, %3, #8      \n"
                        "add        %4, %4, #8      \n"

                        "vadd.f32   q6, q6, q7      \n"

                        "add        %5, %5, #8      \n"

                        "vadd.f32   q4, q4, q5      \n"

                        "add        %6, %6, #8      \n"

                        "vadd.f32   q4, q4, q6      \n"

                        "add        %7, %7, #8      \n"

                        "vst1.f32   {d8-d9}, [%0 :128]! \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14");
#endif // __aarch64__
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
                r4 += tailstep;
                r5 += tailstep;
                r6 += tailstep;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_7x7_pack1to4_bf16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv7x7s2_pack1to4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    Mat top_blob_fp32(outw, outh, opt.num_threads, (size_t)4u * 4, 4, opt.workspace_allocator);

    const int tailstep = w - 2 * outw + w;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob_fp32.channel(get_omp_thread_num());

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
        out0.fill(_bias0);

        int q = 0;
        for (; q < inch - 1; q++)
        {
            float* outptr0 = out0.row(0);

            const Mat img0 = bottom_blob.channel(q);

            const unsigned short* r0 = img0.row<const unsigned short>(0);
            const unsigned short* r1 = img0.row<const unsigned short>(1);
            const unsigned short* r2 = img0.row<const unsigned short>(2);
            const unsigned short* r3 = img0.row<const unsigned short>(3);
            const unsigned short* r4 = img0.row<const unsigned short>(4);
            const unsigned short* r5 = img0.row<const unsigned short>(5);
            const unsigned short* r6 = img0.row<const unsigned short>(6);

            const unsigned short* kptr = kernel.channel(p).row<const unsigned short>(q);

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;
#if __aarch64__
                for (; j + 7 < outw; j += 8)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n" // r0

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0] \n"

                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[2]     \n"

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v4.4h, v5.4h}, [%1]        \n"

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[0]     \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[1]     \n"

                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v28.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v29.4s, v4.s[3]     \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v6.4h, v7.4h, v8.4h, v9.4h}, [%2], #32 \n" // r1

                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"
                        "shll   v8.4s, v8.4h, #16           \n"
                        "shll   v9.4s, v9.4h, #16           \n"

                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v30.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v30.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v10.4h, v11.4h}, [%2]      \n"

                        "shll   v10.4s, v10.4h, #16         \n"
                        "shll   v11.4s, v11.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v6.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v8.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v8.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v9.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v9.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v6.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v7.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v8.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v8.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v9.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v9.s[3]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v7.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v8.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v8.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v9.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v9.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v10.s[0]    \n"

                        "fmla   v16.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v7.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v8.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v8.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v9.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v9.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v10.s[1]    \n"

                        "fmla   v16.4s, v28.4s, v7.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v7.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v8.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v8.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v9.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v9.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v10.s[0]    \n"
                        "fmla   v23.4s, v28.4s, v10.s[2]    \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v7.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v7.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v8.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v8.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v9.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v9.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v10.s[1]    \n"
                        "fmla   v23.4s, v29.4s, v10.s[3]    \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%3], #32 \n" // r2

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v16.4s, v30.4s, v7.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v8.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v8.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v9.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v9.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v10.s[0]    \n"
                        "fmla   v22.4s, v30.4s, v10.s[2]    \n"
                        "fmla   v23.4s, v30.4s, v11.s[0]    \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v4.4h, v5.4h}, [%3]        \n"

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[0]     \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[1]     \n"

                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v28.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v29.4s, v4.s[3]     \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v6.4h, v7.4h, v8.4h, v9.4h}, [%4], #32 \n" // r3

                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"
                        "shll   v8.4s, v8.4h, #16           \n"
                        "shll   v9.4s, v9.4h, #16           \n"

                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v30.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v30.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v10.4h, v11.4h}, [%4]      \n"

                        "shll   v10.4s, v10.4h, #16         \n"
                        "shll   v11.4s, v11.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v6.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v8.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v8.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v9.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v9.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v6.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v7.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v8.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v8.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v9.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v9.s[3]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v7.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v8.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v8.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v9.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v9.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v10.s[0]    \n"

                        "fmla   v16.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v7.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v8.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v8.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v9.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v9.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v10.s[1]    \n"

                        "fmla   v16.4s, v28.4s, v7.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v7.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v8.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v8.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v9.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v9.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v10.s[0]    \n"
                        "fmla   v23.4s, v28.4s, v10.s[2]    \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v7.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v7.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v8.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v8.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v9.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v9.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v10.s[1]    \n"
                        "fmla   v23.4s, v29.4s, v10.s[3]    \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%5], #32 \n" // r4

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v16.4s, v30.4s, v7.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v8.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v8.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v9.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v9.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v10.s[0]    \n"
                        "fmla   v22.4s, v30.4s, v10.s[2]    \n"
                        "fmla   v23.4s, v30.4s, v11.s[0]    \n"

                        "prfm   pldl1keep, [%5, #128]       \n"
                        "ld1    {v4.4h, v5.4h}, [%5]        \n"

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[0]     \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[1]     \n"

                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v28.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v29.4s, v4.s[3]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v6.4h, v7.4h, v8.4h, v9.4h}, [%6], #32 \n" // r5

                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"
                        "shll   v8.4s, v8.4h, #16           \n"
                        "shll   v9.4s, v9.4h, #16           \n"

                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v30.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v30.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%6, #128]       \n"
                        "ld1    {v10.4h, v11.4h}, [%6]      \n"

                        "shll   v10.4s, v10.4h, #16         \n"
                        "shll   v11.4s, v11.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v6.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v8.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v8.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v9.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v9.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v6.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v7.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v8.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v8.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v9.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v9.s[3]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v7.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v8.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v8.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v9.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v9.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v10.s[0]    \n"

                        "fmla   v16.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v7.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v8.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v8.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v9.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v9.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v10.s[1]    \n"

                        "fmla   v16.4s, v28.4s, v7.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v7.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v8.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v8.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v9.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v9.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v10.s[0]    \n"
                        "fmla   v23.4s, v28.4s, v10.s[2]    \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v7.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v7.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v8.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v8.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v9.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v9.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v10.s[1]    \n"
                        "fmla   v23.4s, v29.4s, v10.s[3]    \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%7], #32 \n" // r6

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v16.4s, v30.4s, v7.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v8.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v8.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v9.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v9.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v10.s[0]    \n"
                        "fmla   v22.4s, v30.4s, v10.s[2]    \n"
                        "fmla   v23.4s, v30.4s, v11.s[0]    \n"

                        "prfm   pldl1keep, [%7, #128]       \n"
                        "ld1    {v4.4h, v5.4h}, [%7]        \n"

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[0]     \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[1]     \n"

                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v28.4s, v4.s[2]     \n"

                        "sub    %0, %0, #64                 \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v29.4s, v4.s[3]     \n"

                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v30.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v30.4s, v5.s[0]     \n"

                        "sub    %8, %8, #392                \n"

                        "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                        "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"

                        // Operands: %0 = outptr0, %1..%7 = r0..r6, %8 = kptr. Each output is tied
                        // to the matching input ("0".."8") because the asm both reads and updates
                        // every pointer.
                        // Clobbers: the instruction stream writes accumulators v16-v23 (fmla) and
                        // stores v20-v23 with the second st1, so v20-v23 must be listed as well;
                        // omitting them lets the compiler keep live values in registers that this
                        // asm overwrites.
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
                }
#endif // __aarch64__
                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    // AArch64 path for four accumulator vectors (v16-v19) per loop iteration.
                    // Operands: %0 = outptr0, %1..%7 = r0..r6 (one pointer per kernel row), %8 = kptr.
                    // Every 16-bit load is followed by `shll #16`, which places each halfword in the
                    // upper half of a 32-bit lane (presumably bf16 -> fp32 widening; confirm against
                    // the enclosing function).
                    // For tap k (0..6) accumulator n (v16+n) is multiplied with input lane 2*n + k,
                    // i.e. the window moves two input elements per output column.
                    // Loads of the next row / next taps are interleaved with the fmla of the current
                    // ones, so the statement order below is deliberate.
                    asm volatile(
                        // load the running accumulators; outptr0 is only advanced by the final st1
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0] \n"

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1] \n" // r0

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n" // kptr: row 0, taps 0-3

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n" // kptr: row 0, taps 4-6

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%2] \n" // r1

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n" // kptr: row 1, taps 0-3

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n" // kptr: row 1, taps 4-6

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v5.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%3] \n" // r2

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v6.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v6.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v6.s[2]     \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n" // kptr: row 2, taps 0-3

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v6.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v6.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v6.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v7.s[0]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n" // kptr: row 2, taps 4-6

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%4] \n" // r3

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n" // kptr: row 3, taps 0-3

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n" // kptr: row 3, taps 4-6

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v5.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%5] \n" // r4

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v6.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v6.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v6.s[2]     \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n" // kptr: row 4, taps 0-3

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v6.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v6.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v6.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v7.s[0]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n" // kptr: row 4, taps 4-6

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%6] \n" // r5

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n" // kptr: row 5, taps 0-3

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n" // kptr: row 5, taps 4-6

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v5.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%7] \n" // r6

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v6.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v6.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v6.s[2]     \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n" // kptr: row 6, taps 0-3

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v6.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v6.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v6.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v7.s[0]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n" // kptr: row 6, taps 4-6

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        // last row: nothing left to prefetch, so all seven taps run back to back
                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"
                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"
                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"

                        // step each row pointer by 16 bytes = 8 halfwords (4 outputs x 2 inputs each)
                        "add    %1, %1, #16                 \n"
                        "add    %2, %2, #16                 \n"
                        "add    %3, %3, #16                 \n"
                        "add    %4, %4, #16                 \n"
                        "add    %5, %5, #16                 \n"
                        "add    %6, %6, #16                 \n"
                        "add    %7, %7, #16                 \n"

                        // rewind kptr: 7 rows x (32 + 24) bytes were consumed by the loads above
                        "sub    %8, %8, #392                \n"

                        // write the four accumulators back and advance outptr0 by 64 bytes
                        "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
#else  // __aarch64__
                    // ARMv7 NEON path. Accumulates 7 input rows (r0..r6) x 7 kernel taps per row
                    // into four 4-lane float accumulators q12..q15, which are loaded from and
                    // stored back to outptr0 (64 bytes, outptr0 advanced by the final vstm).
                    //
                    // Data handling visible in this statement:
                    //   * Inputs and kernel are read as 16-bit values and widened with
                    //     "vshll.u16 ..., #16", i.e. each 16-bit value becomes the upper half of
                    //     a 32-bit lane that is then consumed by vmla.f32 (presumably bf16 -> fp32;
                    //     confirm against the enclosing function).
                    //   * Per input row, after widening: q0 = elements 0..3, q1 = 4..7,
                    //     q2 = 8..11 and d6[0] = element 12 (relative to the row pointer on entry).
                    //   * Kernel tap t (t = 0..6, held in q5..q11) is multiplied by elements
                    //     t, t+2, t+4, t+6 for q12, q13, q14, q15 respectively, so neighbouring
                    //     accumulators are two input elements apart.
                    //   * Each row pointer advances by 16 bytes (8 x 16-bit) via the "!" load.
                    //   * The kernel pointer %8 advances 32 + 24 = 56 bytes per row, 392 bytes
                    //     over 7 rows, and is rewound by 392 at the end so it is unchanged on exit.
                    //
                    // Loads are interleaved with the multiply-accumulates: the next row's first
                    // four taps (q5..q8) and the next input row (q0/q1) are fetched while the
                    // current row's last taps (q10/q11) are still being applied. The statement
                    // order therefore matters; do not reorder.
                    asm volatile(
                        "pld        [%0, #512]          \n"
                        "vldm       %0, {d24-d31}       \n" // q12..q15 = current accumulators at outptr0 (no writeback)

                        "pld        [%1, #128]          \n"
                        "vld1.u16   {d2-d3}, [%1]!      \n" // r0: 8 x 16-bit, r0 += 16 bytes

                        "vshll.u16  q0, d2, #16         \n" // q0 = elements 0..3 widened
                        "vshll.u16  q1, d3, #16         \n" // q1 = elements 4..7 widened

                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d14-d17}, [%8]!    \n" // kernel taps 0..3 (4 x 4 x 16-bit), kptr += 32 bytes

                        "vshll.u16  q5, d14, #16        \n" // tap 0
                        "vshll.u16  q6, d15, #16        \n" // tap 1
                        "vshll.u16  q7, d16, #16        \n" // tap 2
                        "vshll.u16  q8, d17, #16        \n" // tap 3

                        // tap 0 x elements 0, 2, 4, 6
                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q5, d2[0]      \n"
                        "vmla.f32   q15, q5, d3[0]      \n"

                        "pld        [%1, #128]          \n"
                        "vld1.u16   {d5-d6}, [%1]       \n" // next 8 x 16-bit (elements 8..15), pointer not advanced

                        "vshll.u16  q2, d5, #16         \n" // q2 = elements 8..11 widened
                        "vshl.u32   d6, d6, #16         \n" // d6[0] = element 12 widened in place (only d6[0] is used below)

                        // tap 1 x elements 1, 3, 5, 7
                        "vmla.f32   q12, q6, d0[1]      \n"
                        "vmla.f32   q13, q6, d1[1]      \n"
                        "vmla.f32   q14, q6, d2[1]      \n"
                        "vmla.f32   q15, q6, d3[1]      \n"

                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d20-d22}, [%8]!    \n" // kernel taps 4..6 (3 x 4 x 16-bit), kptr += 24 bytes

                        "vshll.u16  q9, d20, #16        \n" // tap 4
                        "vshll.u16  q10, d21, #16       \n" // tap 5
                        "vshll.u16  q11, d22, #16       \n" // tap 6

                        // tap 2 x elements 2, 4, 6, 8
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"
                        "vmla.f32   q14, q7, d3[0]      \n"
                        "vmla.f32   q15, q7, d4[0]      \n"
                        // tap 3 x elements 3, 5, 7, 9
                        "vmla.f32   q12, q8, d1[1]      \n"
                        "vmla.f32   q13, q8, d2[1]      \n"
                        "vmla.f32   q14, q8, d3[1]      \n"
                        "vmla.f32   q15, q8, d4[1]      \n"
                        // tap 4 x elements 4, 6, 8, 10
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"
                        "vmla.f32   q14, q9, d4[0]      \n"
                        "vmla.f32   q15, q9, d5[0]      \n"

                        // prefetch the next row's taps 0..3 into q5..q8 (q10/q11 still hold taps 5/6 of this row)
                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d14-d17}, [%8]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        // tap 5 x elements 5, 7, 9, 11
                        "vmla.f32   q12, q10, d2[1]     \n"
                        "vmla.f32   q13, q10, d3[1]     \n"
                        "vmla.f32   q14, q10, d4[1]     \n"
                        "vmla.f32   q15, q10, d5[1]     \n"
                        // tap 6 x elements 6, 8 (the two uses of q0/q1 lanes come first ...)
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d4[0]     \n"

                        "pld        [%2, #128]          \n"
                        "vld1.u16   {d2-d3}, [%2]!      \n" // r1: 8 x 16-bit, r1 += 16 bytes (overwrites q0/q1)

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        // ... tap 6 of the previous row x elements 10, 12 (d5/d6 are untouched by the reload)
                        "vmla.f32   q14, q11, d5[0]     \n"
                        "vmla.f32   q15, q11, d6[0]     \n"

                        // r1, tap 0
                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q5, d2[0]      \n"
                        "vmla.f32   q15, q5, d3[0]      \n"

                        "pld        [%2, #128]          \n"
                        "vld1.u16   {d5-d6}, [%2]       \n" // r1 elements 8..15

                        "vshll.u16  q2, d5, #16         \n"
                        "vshl.u32   d6, d6, #16         \n"

                        // r1, tap 1
                        "vmla.f32   q12, q6, d0[1]      \n"
                        "vmla.f32   q13, q6, d1[1]      \n"
                        "vmla.f32   q14, q6, d2[1]      \n"
                        "vmla.f32   q15, q6, d3[1]      \n"

                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d20-d22}, [%8]!    \n" // r1 taps 4..6

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        // r1, taps 2..4
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"
                        "vmla.f32   q14, q7, d3[0]      \n"
                        "vmla.f32   q15, q7, d4[0]      \n"
                        "vmla.f32   q12, q8, d1[1]      \n"
                        "vmla.f32   q13, q8, d2[1]      \n"
                        "vmla.f32   q14, q8, d3[1]      \n"
                        "vmla.f32   q15, q8, d4[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"
                        "vmla.f32   q14, q9, d4[0]      \n"
                        "vmla.f32   q15, q9, d5[0]      \n"

                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d14-d17}, [%8]!    \n" // r2 taps 0..3

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        // r1, taps 5..6 (tap 6 finishes after the r2 load below)
                        "vmla.f32   q12, q10, d2[1]     \n"
                        "vmla.f32   q13, q10, d3[1]     \n"
                        "vmla.f32   q14, q10, d4[1]     \n"
                        "vmla.f32   q15, q10, d5[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d4[0]     \n"

                        "pld        [%3, #128]          \n"
                        "vld1.u16   {d2-d3}, [%3]!      \n" // r2: 8 x 16-bit, r2 += 16 bytes

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q14, q11, d5[0]     \n"
                        "vmla.f32   q15, q11, d6[0]     \n"

                        // r2, tap 0
                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q5, d2[0]      \n"
                        "vmla.f32   q15, q5, d3[0]      \n"

                        "pld        [%3, #128]          \n"
                        "vld1.u16   {d5-d6}, [%3]       \n" // r2 elements 8..15

                        "vshll.u16  q2, d5, #16         \n"
                        "vshl.u32   d6, d6, #16         \n"

                        // r2, tap 1
                        "vmla.f32   q12, q6, d0[1]      \n"
                        "vmla.f32   q13, q6, d1[1]      \n"
                        "vmla.f32   q14, q6, d2[1]      \n"
                        "vmla.f32   q15, q6, d3[1]      \n"

                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d20-d22}, [%8]!    \n" // r2 taps 4..6

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        // r2, taps 2..4
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"
                        "vmla.f32   q14, q7, d3[0]      \n"
                        "vmla.f32   q15, q7, d4[0]      \n"
                        "vmla.f32   q12, q8, d1[1]      \n"
                        "vmla.f32   q13, q8, d2[1]      \n"
                        "vmla.f32   q14, q8, d3[1]      \n"
                        "vmla.f32   q15, q8, d4[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"
                        "vmla.f32   q14, q9, d4[0]      \n"
                        "vmla.f32   q15, q9, d5[0]      \n"

                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d14-d17}, [%8]!    \n" // r3 taps 0..3

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        // r2, taps 5..6 (tap 6 finishes after the r3 load below)
                        "vmla.f32   q12, q10, d2[1]     \n"
                        "vmla.f32   q13, q10, d3[1]     \n"
                        "vmla.f32   q14, q10, d4[1]     \n"
                        "vmla.f32   q15, q10, d5[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d4[0]     \n"

                        "pld        [%4, #128]          \n"
                        "vld1.u16   {d2-d3}, [%4]!      \n" // r3: 8 x 16-bit, r3 += 16 bytes

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q14, q11, d5[0]     \n"
                        "vmla.f32   q15, q11, d6[0]     \n"

                        // r3, tap 0
                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q5, d2[0]      \n"
                        "vmla.f32   q15, q5, d3[0]      \n"

                        "pld        [%4, #128]          \n"
                        "vld1.u16   {d5-d6}, [%4]       \n" // r3 elements 8..15

                        "vshll.u16  q2, d5, #16         \n"
                        "vshl.u32   d6, d6, #16         \n"

                        // r3, tap 1
                        "vmla.f32   q12, q6, d0[1]      \n"
                        "vmla.f32   q13, q6, d1[1]      \n"
                        "vmla.f32   q14, q6, d2[1]      \n"
                        "vmla.f32   q15, q6, d3[1]      \n"

                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d20-d22}, [%8]!    \n" // r3 taps 4..6

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        // r3, taps 2..4
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"
                        "vmla.f32   q14, q7, d3[0]      \n"
                        "vmla.f32   q15, q7, d4[0]      \n"
                        "vmla.f32   q12, q8, d1[1]      \n"
                        "vmla.f32   q13, q8, d2[1]      \n"
                        "vmla.f32   q14, q8, d3[1]      \n"
                        "vmla.f32   q15, q8, d4[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"
                        "vmla.f32   q14, q9, d4[0]      \n"
                        "vmla.f32   q15, q9, d5[0]      \n"

                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d14-d17}, [%8]!    \n" // r4 taps 0..3

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        // r3, taps 5..6 (tap 6 finishes after the r4 load below)
                        "vmla.f32   q12, q10, d2[1]     \n"
                        "vmla.f32   q13, q10, d3[1]     \n"
                        "vmla.f32   q14, q10, d4[1]     \n"
                        "vmla.f32   q15, q10, d5[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d4[0]     \n"

                        "pld        [%5, #128]          \n"
                        "vld1.u16   {d2-d3}, [%5]!      \n" // r4: 8 x 16-bit, r4 += 16 bytes

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q14, q11, d5[0]     \n"
                        "vmla.f32   q15, q11, d6[0]     \n"

                        // r4, tap 0
                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q5, d2[0]      \n"
                        "vmla.f32   q15, q5, d3[0]      \n"

                        "pld        [%5, #128]          \n"
                        "vld1.u16   {d5-d6}, [%5]       \n" // r4 elements 8..15

                        "vshll.u16  q2, d5, #16         \n"
                        "vshl.u32   d6, d6, #16         \n"

                        // r4, tap 1
                        "vmla.f32   q12, q6, d0[1]      \n"
                        "vmla.f32   q13, q6, d1[1]      \n"
                        "vmla.f32   q14, q6, d2[1]      \n"
                        "vmla.f32   q15, q6, d3[1]      \n"

                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d20-d22}, [%8]!    \n" // r4 taps 4..6

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        // r4, taps 2..4
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"
                        "vmla.f32   q14, q7, d3[0]      \n"
                        "vmla.f32   q15, q7, d4[0]      \n"
                        "vmla.f32   q12, q8, d1[1]      \n"
                        "vmla.f32   q13, q8, d2[1]      \n"
                        "vmla.f32   q14, q8, d3[1]      \n"
                        "vmla.f32   q15, q8, d4[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"
                        "vmla.f32   q14, q9, d4[0]      \n"
                        "vmla.f32   q15, q9, d5[0]      \n"

                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d14-d17}, [%8]!    \n" // r5 taps 0..3

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        // r4, taps 5..6 (tap 6 finishes after the r5 load below)
                        "vmla.f32   q12, q10, d2[1]     \n"
                        "vmla.f32   q13, q10, d3[1]     \n"
                        "vmla.f32   q14, q10, d4[1]     \n"
                        "vmla.f32   q15, q10, d5[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d4[0]     \n"

                        "pld        [%6, #128]          \n"
                        "vld1.u16   {d2-d3}, [%6]!      \n" // r5: 8 x 16-bit, r5 += 16 bytes

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q14, q11, d5[0]     \n"
                        "vmla.f32   q15, q11, d6[0]     \n"

                        // r5, tap 0
                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q5, d2[0]      \n"
                        "vmla.f32   q15, q5, d3[0]      \n"

                        "pld        [%6, #128]          \n"
                        "vld1.u16   {d5-d6}, [%6]       \n" // r5 elements 8..15

                        "vshll.u16  q2, d5, #16         \n"
                        "vshl.u32   d6, d6, #16         \n"

                        // r5, tap 1
                        "vmla.f32   q12, q6, d0[1]      \n"
                        "vmla.f32   q13, q6, d1[1]      \n"
                        "vmla.f32   q14, q6, d2[1]      \n"
                        "vmla.f32   q15, q6, d3[1]      \n"

                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d20-d22}, [%8]!    \n" // r5 taps 4..6

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        // r5, taps 2..4
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"
                        "vmla.f32   q14, q7, d3[0]      \n"
                        "vmla.f32   q15, q7, d4[0]      \n"
                        "vmla.f32   q12, q8, d1[1]      \n"
                        "vmla.f32   q13, q8, d2[1]      \n"
                        "vmla.f32   q14, q8, d3[1]      \n"
                        "vmla.f32   q15, q8, d4[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"
                        "vmla.f32   q14, q9, d4[0]      \n"
                        "vmla.f32   q15, q9, d5[0]      \n"

                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d14-d17}, [%8]!    \n" // r6 taps 0..3

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        // r5, taps 5..6 (tap 6 finishes after the r6 load below)
                        "vmla.f32   q12, q10, d2[1]     \n"
                        "vmla.f32   q13, q10, d3[1]     \n"
                        "vmla.f32   q14, q10, d4[1]     \n"
                        "vmla.f32   q15, q10, d5[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d4[0]     \n"

                        "pld        [%7, #128]          \n"
                        "vld1.u16   {d2-d3}, [%7]!      \n" // r6: 8 x 16-bit, r6 += 16 bytes

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q14, q11, d5[0]     \n"
                        "vmla.f32   q15, q11, d6[0]     \n"

                        // r6, tap 0
                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q5, d2[0]      \n"
                        "vmla.f32   q15, q5, d3[0]      \n"

                        "pld        [%7, #128]          \n"
                        "vld1.u16   {d5-d6}, [%7]       \n" // r6 elements 8..15

                        "vshll.u16  q2, d5, #16         \n"
                        "vshl.u32   d6, d6, #16         \n"

                        // r6, tap 1
                        "vmla.f32   q12, q6, d0[1]      \n"
                        "vmla.f32   q13, q6, d1[1]      \n"
                        "vmla.f32   q14, q6, d2[1]      \n"
                        "vmla.f32   q15, q6, d3[1]      \n"

                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d20-d22}, [%8]!    \n" // r6 taps 4..6 (last kernel load: kptr is now +392 bytes)

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        // r6, taps 2..6 (last row: no further loads to interleave)
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"
                        "vmla.f32   q14, q7, d3[0]      \n"
                        "vmla.f32   q15, q7, d4[0]      \n"
                        "vmla.f32   q12, q8, d1[1]      \n"
                        "vmla.f32   q13, q8, d2[1]      \n"
                        "vmla.f32   q14, q8, d3[1]      \n"
                        "vmla.f32   q15, q8, d4[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"
                        "vmla.f32   q14, q9, d4[0]      \n"
                        "vmla.f32   q15, q9, d5[0]      \n"
                        "vmla.f32   q12, q10, d2[1]     \n"
                        "vmla.f32   q13, q10, d3[1]     \n"
                        "vmla.f32   q14, q10, d4[1]     \n"
                        "vmla.f32   q15, q10, d5[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d4[0]     \n"
                        "vmla.f32   q14, q11, d5[0]     \n"
                        "vmla.f32   q15, q11, d6[0]     \n"

                        "sub        %8, %8, #392        \n" // rewind kptr: 7 rows x (32 + 24) bytes

                        "vstm       %0!, {d24-d31}      \n" // store q12..q15 and advance outptr0 by 64 bytes

                        // Every operand is a read-write pointer expressed as an "=r" output tied
                        // to an input with the same number.
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        // q4 is listed as clobbered although no instruction above writes it.
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%0, #256]       \n"
                        "ld1    {v16.4s, v17.4s}, [%0]      \n"

                        "prfm   pldl1keep, [%1, #192]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h}, [%1] \n" // r0

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmul   v18.4s, v24.4s, v0.s[0]     \n"
                        "fmul   v19.4s, v24.4s, v0.s[2]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v1.s[0]     \n"

                        "prfm   pldl1keep, [%2, #192]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h}, [%2] \n" // r1

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%3, #192]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h}, [%3] \n" // r2

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v18.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v18.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v18.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v1.s[0]     \n"

                        "prfm   pldl1keep, [%4, #192]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h}, [%4] \n" // r3

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%5, #192]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h}, [%5] \n" // r4

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v18.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v18.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v18.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v1.s[0]     \n"

                        "prfm   pldl1keep, [%6, #192]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h}, [%6] \n" // r5

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%7, #192]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h}, [%7] \n" // r6

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v18.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v18.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v18.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v2.s[0]     \n"

                        "add    %1, %1, #8                  \n"
                        "add    %2, %2, #8                  \n"
                        "add    %3, %3, #8                  \n"
                        "add    %4, %4, #8                  \n"
                        "add    %5, %5, #8                  \n"
                        "add    %6, %6, #8                  \n"
                        "add    %7, %7, #8                  \n"

                        "fadd   v16.4s, v16.4s, v18.4s      \n"
                        "fadd   v17.4s, v17.4s, v19.4s      \n"

                        "sub    %8, %8, #392                \n"

                        "st1    {v16.4s, v17.4s}, [%0], #32 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        : "memory", "v0", "v1", "v2", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
#else  // __aarch64__
                    // ARMv7 NEON kernel: accumulates two adjacent output vectors (4 floats
                    // each, 8 floats in total at outptr0) over seven input rows r0..r6 with
                    // seven kernel taps per row.
                    //   - 16-bit inputs and weights are widened with a left shift by 16 and
                    //     then consumed as f32 operands.
                    //     NOTE(review): this looks like bf16 -> fp32 widening; confirm
                    //     against the enclosing function.
                    //   - Output 0 pairs tap k with row element k, output 1 pairs tap k with
                    //     row element k + 2.
                    //   - q14/q15 start from the values already stored at outptr0; q12/q13
                    //     are secondary accumulators folded in just before the store.
                    //   - On exit every row pointer has advanced by 8 bytes (16 from the
                    //     post-incremented load, minus 8), kptr is back where it started
                    //     (7 rows * (32 + 24) = 392 bytes read, then rewound) and outptr0
                    //     has advanced by 32 bytes.
                    //   - All nine pointers are read-write operands: each "=r" output is
                    //     tied to its input through a matching-number constraint.
                    asm volatile(
                        // q14/q15 <- the 8 floats currently stored at outptr0 (running sums)
                        "pld        [%0, #256]          \n"
                        "vld1.f32   {d28-d31}, [%0 :128] \n"

                        // row r0: 8 elements into d2-d3 (pointer += 16), 9th into lane 0 of d8
                        "pld        [%1, #128]          \n"
                        "vld1.u16   {d2-d3}, [%1]!      \n" // r0
                        "vld1.u16   {d8[0]}, [%1]       \n"

                        // widen: q0 = elements 0..3, q1 = elements 4..7, d8[0] = element 8
                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vshl.u32   d8, d8, #16         \n"

                        // kernel row 0, taps 0..3 -> q5..q8
                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d14-d17}, [%8]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        // q12/q13 are initialised by vmul; q12/q14 belong to output 0,
                        // q13/q15 belong to output 1
                        "vmul.f32   q12, q5, d0[0]      \n"
                        "vmul.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q6, d0[1]      \n"
                        "vmla.f32   q15, q6, d1[1]      \n"

                        // kernel row 0, taps 4..6 -> q9..q11
                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d20-d22}, [%8]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"

                        // row r1 is fetched into q2/q3/d9 while row r0 is still being consumed
                        "pld        [%2, #128]          \n"
                        "vld1.u16   {d6-d7}, [%2]!      \n" // r1
                        "vld1.u16   {d9[0]}, [%2]       \n"

                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshl.u32   d9, d9, #16         \n"

                        "vmla.f32   q14, q8, d1[1]      \n"
                        "vmla.f32   q15, q8, d2[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"

                        // kernel row 1, taps 0..3
                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d14-d17}, [%8]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        "vmla.f32   q14, q10, d2[1]     \n"
                        "vmla.f32   q15, q10, d3[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d8[0]     \n"

                        // kernel row 1, taps 4..6
                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d20-d22}, [%8]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        "vmla.f32   q14, q5, d4[0]      \n"
                        "vmla.f32   q15, q5, d5[0]      \n"
                        "vmla.f32   q12, q6, d4[1]      \n"
                        "vmla.f32   q13, q6, d5[1]      \n"
                        "vmla.f32   q14, q7, d5[0]      \n"
                        "vmla.f32   q15, q7, d6[0]      \n"

                        "pld        [%3, #128]          \n"
                        "vld1.u16   {d2-d3}, [%3]!      \n" // r2
                        "vld1.u16   {d8[0]}, [%3]       \n"

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vshl.u32   d8, d8, #16         \n"

                        "vmla.f32   q12, q8, d5[1]      \n"
                        "vmla.f32   q13, q8, d6[1]      \n"
                        "vmla.f32   q14, q9, d6[0]      \n"
                        "vmla.f32   q15, q9, d7[0]      \n"

                        // kernel row 2, taps 0..3
                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d14-d17}, [%8]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        "vmla.f32   q12, q10, d6[1]     \n"
                        "vmla.f32   q13, q10, d7[1]     \n"
                        "vmla.f32   q14, q11, d7[0]     \n"
                        "vmla.f32   q15, q11, d9[0]     \n"

                        // kernel row 2, taps 4..6
                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d20-d22}, [%8]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q6, d0[1]      \n"
                        "vmla.f32   q15, q6, d1[1]      \n"
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"

                        "pld        [%4, #128]          \n"
                        "vld1.u16   {d6-d7}, [%4]!      \n" // r3
                        "vld1.u16   {d9[0]}, [%4]       \n"

                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshl.u32   d9, d9, #16         \n"

                        "vmla.f32   q14, q8, d1[1]      \n"
                        "vmla.f32   q15, q8, d2[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"

                        // kernel row 3, taps 0..3
                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d14-d17}, [%8]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        "vmla.f32   q14, q10, d2[1]     \n"
                        "vmla.f32   q15, q10, d3[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d8[0]     \n"

                        // kernel row 3, taps 4..6
                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d20-d22}, [%8]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        "vmla.f32   q14, q5, d4[0]      \n"
                        "vmla.f32   q15, q5, d5[0]      \n"
                        "vmla.f32   q12, q6, d4[1]      \n"
                        "vmla.f32   q13, q6, d5[1]      \n"
                        "vmla.f32   q14, q7, d5[0]      \n"
                        "vmla.f32   q15, q7, d6[0]      \n"

                        "pld        [%5, #128]          \n"
                        "vld1.u16   {d2-d3}, [%5]!      \n" // r4
                        "vld1.u16   {d8[0]}, [%5]       \n"

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vshl.u32   d8, d8, #16         \n"

                        "vmla.f32   q12, q8, d5[1]      \n"
                        "vmla.f32   q13, q8, d6[1]      \n"
                        "vmla.f32   q14, q9, d6[0]      \n"
                        "vmla.f32   q15, q9, d7[0]      \n"

                        // kernel row 4, taps 0..3
                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d14-d17}, [%8]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        "vmla.f32   q12, q10, d6[1]     \n"
                        "vmla.f32   q13, q10, d7[1]     \n"
                        "vmla.f32   q14, q11, d7[0]     \n"
                        "vmla.f32   q15, q11, d9[0]     \n"

                        // kernel row 4, taps 4..6
                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d20-d22}, [%8]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q6, d0[1]      \n"
                        "vmla.f32   q15, q6, d1[1]      \n"
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"

                        "pld        [%6, #128]          \n"
                        "vld1.u16   {d6-d7}, [%6]!      \n" // r5
                        "vld1.u16   {d9[0]}, [%6]       \n"

                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshl.u32   d9, d9, #16         \n"

                        "vmla.f32   q14, q8, d1[1]      \n"
                        "vmla.f32   q15, q8, d2[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"

                        // kernel row 5, taps 0..3
                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d14-d17}, [%8]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        "vmla.f32   q14, q10, d2[1]     \n"
                        "vmla.f32   q15, q10, d3[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d8[0]     \n"

                        // kernel row 5, taps 4..6
                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d20-d22}, [%8]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        "vmla.f32   q14, q5, d4[0]      \n"
                        "vmla.f32   q15, q5, d5[0]      \n"
                        "vmla.f32   q12, q6, d4[1]      \n"
                        "vmla.f32   q13, q6, d5[1]      \n"
                        "vmla.f32   q14, q7, d5[0]      \n"
                        "vmla.f32   q15, q7, d6[0]      \n"

                        "pld        [%7, #128]          \n"
                        "vld1.u16   {d2-d3}, [%7]!      \n" // r6
                        "vld1.u16   {d8[0]}, [%7]       \n"

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vshl.u32   d8, d8, #16         \n"

                        "vmla.f32   q12, q8, d5[1]      \n"
                        "vmla.f32   q13, q8, d6[1]      \n"
                        "vmla.f32   q14, q9, d6[0]      \n"
                        "vmla.f32   q15, q9, d7[0]      \n"

                        // kernel row 6, taps 0..3
                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d14-d17}, [%8]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        "vmla.f32   q12, q10, d6[1]     \n"
                        "vmla.f32   q13, q10, d7[1]     \n"
                        "vmla.f32   q14, q11, d7[0]     \n"
                        "vmla.f32   q15, q11, d9[0]     \n"

                        // kernel row 6, taps 4..6
                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d20-d22}, [%8]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q6, d0[1]      \n"
                        "vmla.f32   q15, q6, d1[1]      \n"

                        // the row loads post-incremented each pointer by 16 bytes; the
                        // interleaved subtractions below leave a net advance of 8 bytes
                        "sub        %1, %1, #8          \n"
                        "sub        %2, %2, #8          \n"

                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"
                        "vmla.f32   q14, q8, d1[1]      \n"
                        "vmla.f32   q15, q8, d2[1]      \n"

                        // rewind kptr over the 7 * (32 + 24) = 392 bytes of weights consumed
                        "sub        %8, %8, #392        \n"

                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"
                        "vmla.f32   q14, q10, d2[1]     \n"
                        "vmla.f32   q15, q10, d3[1]     \n"

                        "sub        %3, %3, #8          \n"
                        "sub        %4, %4, #8          \n"

                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d8[0]     \n"

                        "sub        %5, %5, #8          \n"
                        "sub        %6, %6, #8          \n"

                        // fold the secondary accumulators into the running sums
                        "vadd.f32   q14, q14, q12       \n"
                        "vadd.f32   q15, q15, q13       \n"

                        "sub        %7, %7, #8          \n"

                        // store both output vectors; outptr0 advances by 32 bytes
                        "vst1.f32   {d28-d31}, [%0 :128]! \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
#if __aarch64__
                    // AArch64 NEON kernel for one output vector (4 floats at outptr0),
                    // accumulated over seven input rows r0..r6 with seven kernel taps per row.
                    //   - 16-bit inputs and weights are widened with shll #16 and then
                    //     consumed as f32 operands.
                    //     NOTE(review): this looks like bf16 -> fp32 widening; confirm
                    //     against the enclosing function.
                    //   - Each row load reads 8 elements but only lanes 0..6 are used
                    //     (v0.s[0..3], v1.s[0..2] or v4.s[0..3], v5.s[0..2]).
                    //   - v16 starts from the value stored at outptr0; v17/v18/v19 are
                    //     initialised by fmul on the first three taps. The 49 products are
                    //     spread round-robin over these four accumulators and reduced with
                    //     fadd before the store.
                    //   - On exit every row pointer has advanced by 4 bytes, kptr is back
                    //     where it started (7 * (32 + 24) = 392 bytes read, then rewound)
                    //     and outptr0 has advanced by 16 bytes.
                    //   - All nine pointers are read-write operands: each "=r" output is
                    //     tied to its input through a matching-number constraint.
                    asm volatile(
                        // v16 <- the 4 floats currently stored at outptr0 (running sum)
                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v16.4s}, [%0]              \n"

                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%1]        \n" // r0

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        // kernel row 0, taps 0..3 -> v24..v27
                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        // v17/v18/v19 are initialised with fmul rather than being zeroed
                        "fmul   v17.4s, v24.4s, v0.s[0]     \n"

                        // kernel row 0, taps 4..6 -> v28..v30
                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmul   v18.4s, v25.4s, v0.s[1]     \n"
                        "fmul   v19.4s, v26.4s, v0.s[2]     \n"

                        // row r1 is fetched into v4/v5 while row r0 is still being consumed
                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v4.4h, v5.4h}, [%2]        \n" // r1

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[0]     \n"

                        // kernel row 1, taps 0..3
                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v18.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v30.4s, v1.s[2]     \n"

                        // kernel row 1, taps 4..6
                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v18.4s, v26.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%3]        \n" // r2

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v19.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"

                        // kernel row 2, taps 0..3
                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v17.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v18.4s, v30.4s, v5.s[2]     \n"

                        // kernel row 2, taps 4..6
                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v19.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v26.4s, v0.s[2]     \n"

                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v4.4h, v5.4h}, [%4]        \n" // r3

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"

                        "fmla   v18.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[0]     \n"

                        // kernel row 3, taps 0..3
                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v30.4s, v1.s[2]     \n"

                        // kernel row 3, taps 4..6
                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v18.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v19.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%5, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%5]        \n" // r4

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v17.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v18.4s, v28.4s, v5.s[0]     \n"

                        // kernel row 4, taps 0..3
                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v19.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"

                        // kernel row 4, taps 4..6
                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v17.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v18.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v19.4s, v26.4s, v0.s[2]     \n"

                        "prfm   pldl1keep, [%6, #128]       \n"
                        "ld1    {v4.4h, v5.4h}, [%6]        \n" // r5

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[0]     \n"

                        // kernel row 5, taps 0..3
                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v18.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v30.4s, v1.s[2]     \n"

                        // kernel row 5, taps 4..6
                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v18.4s, v26.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%7, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%7]        \n" // r6

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v19.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"

                        // kernel row 6, taps 0..3
                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%8], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v17.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v18.4s, v30.4s, v5.s[2]     \n"

                        // kernel row 6, taps 4..6
                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%8], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v19.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v26.4s, v0.s[2]     \n"

                        // the row loads did not post-increment; advance each row pointer
                        // by 4 bytes (two 16-bit elements), interleaved with the last fmla
                        "add    %1, %1, #4                  \n"
                        "add    %2, %2, #4                  \n"

                        "fmla   v18.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v30.4s, v1.s[2]     \n"

                        "add    %3, %3, #4                  \n"
                        "add    %4, %4, #4                  \n"

                        // reduce the four partial sums into v16
                        "fadd   v18.4s, v18.4s, v19.4s      \n"

                        "add    %5, %5, #4                  \n"

                        "fadd   v16.4s, v16.4s, v17.4s      \n"

                        "add    %6, %6, #4                  \n"
                        "add    %7, %7, #4                  \n"

                        "fadd   v16.4s, v16.4s, v18.4s      \n"

                        // rewind kptr over the 7 * (32 + 24) = 392 bytes of weights consumed
                        "sub    %8, %8, #392                \n"

                        // store the output vector; outptr0 advances by 16 bytes
                        "st1    {v16.4s}, [%0], #16         \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        : "memory", "v0", "v1", "v4", "v5", "v16", "v17", "v18", "v19", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
#else  // __aarch64__
                    // ARMv7 NEON path: produces one output position (4 fp32 lanes) per pass.
                    //
                    // Operands:
                    //   %0      outptr0 - 4 floats are loaded, accumulated into and stored back
                    //   %1..%7  r0..r6  - seven input rows, read as 16-bit elements
                    //   %8      kptr    - 16-bit weights, 4 lanes per tap, 7 taps per row
                    //
                    // Every 16-bit value is widened with "vshll.u16 #16", which places it in
                    // the upper half of a 32-bit lane; the result is then used as fp32
                    // (i.e. the values are treated as bf16).
                    //
                    // Register roles:
                    //   q0/q1 and q2/q3  widened input row, alternating between even and odd rows
                    //   q8..q11          widened taps 0..3 of the current kernel row
                    //   q12..q14         widened taps 4..6 of the current kernel row
                    //   q4..q7           four partial sums, reduced into q4 before the store
                    //                    (presumably split to shorten the vmla dependency chain)
                    //
                    // Loads for the next row are interleaved with the multiply-accumulates of
                    // the current one, so the statement order below must not be changed.
                    asm volatile(
                        "pld        [%0, #128]          \n"
                        "vld1.f32   {d8-d9}, [%0 :128]  \n" // q4 = values currently at outptr0 (no post-increment)

                        "pld        [%1, #128]          \n"
                        "vld1.u16   {d2-d3}, [%1]       \n" // r0: 8 x u16 (only the first 7 are used)

                        // q0 = d0:d1 and q1 = d2:d3, so d2 has to be consumed before q1 is written
                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        // kernel row 0, taps 0..3 (32 bytes)
                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d20-d23}, [%8]!    \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        // q5, q6 (and q7 below) are initialised with vmul instead of being zeroed first
                        "vmul.f32   q5, q8, d0[0]       \n" // row 0 tap 0
                        "vmul.f32   q6, q9, d0[1]       \n" // row 0 tap 1

                        // kernel row 0, taps 4..6 (24 bytes)
                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d26-d28}, [%8]!    \n"

                        "vshll.u16  q12, d26, #16       \n"
                        "vshll.u16  q13, d27, #16       \n"
                        "vshll.u16  q14, d28, #16       \n"

                        "vmul.f32   q7, q10, d1[0]      \n" // row 0 tap 2
                        "vmla.f32   q4, q11, d1[1]      \n" // row 0 tap 3

                        "pld        [%2, #128]          \n"
                        "vld1.u16   {d6-d7}, [%2]       \n" // r1

                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q5, q12, d2[0]      \n" // row 0 tap 4

                        // kernel row 1, taps 0..3
                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d20-d23}, [%8]!    \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q6, q13, d2[1]      \n" // row 0 tap 5
                        "vmla.f32   q7, q14, d3[0]      \n" // row 0 tap 6

                        // kernel row 1, taps 4..6
                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d26-d28}, [%8]!    \n"

                        "vshll.u16  q12, d26, #16       \n"
                        "vshll.u16  q13, d27, #16       \n"
                        "vshll.u16  q14, d28, #16       \n"

                        "vmla.f32   q4, q8, d4[0]       \n" // row 1 tap 0
                        "vmla.f32   q5, q9, d4[1]       \n" // row 1 tap 1
                        "vmla.f32   q6, q10, d5[0]      \n" // row 1 tap 2

                        // row 0 data in q0/q1 is no longer needed; reuse the registers for r2
                        "pld        [%3, #128]          \n"
                        "vld1.u16   {d2-d3}, [%3]       \n" // r2

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q7, q11, d5[1]      \n" // row 1 tap 3
                        "vmla.f32   q4, q12, d6[0]      \n" // row 1 tap 4

                        // kernel row 2, taps 0..3
                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d20-d23}, [%8]!    \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q5, q13, d6[1]      \n" // row 1 tap 5
                        "vmla.f32   q6, q14, d7[0]      \n" // row 1 tap 6

                        // kernel row 2, taps 4..6
                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d26-d28}, [%8]!    \n"

                        "vshll.u16  q12, d26, #16       \n"
                        "vshll.u16  q13, d27, #16       \n"
                        "vshll.u16  q14, d28, #16       \n"

                        "vmla.f32   q7, q8, d0[0]       \n" // row 2 tap 0
                        "vmla.f32   q4, q9, d0[1]       \n" // row 2 tap 1
                        "vmla.f32   q5, q10, d1[0]      \n" // row 2 tap 2

                        "pld        [%4, #128]          \n"
                        "vld1.u16   {d6-d7}, [%4]       \n" // r3

                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q6, q11, d1[1]      \n" // row 2 tap 3
                        "vmla.f32   q7, q12, d2[0]      \n" // row 2 tap 4

                        // kernel row 3, taps 0..3
                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d20-d23}, [%8]!    \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q4, q13, d2[1]      \n" // row 2 tap 5
                        "vmla.f32   q5, q14, d3[0]      \n" // row 2 tap 6

                        // kernel row 3, taps 4..6
                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d26-d28}, [%8]!    \n"

                        "vshll.u16  q12, d26, #16       \n"
                        "vshll.u16  q13, d27, #16       \n"
                        "vshll.u16  q14, d28, #16       \n"

                        "vmla.f32   q6, q8, d4[0]       \n" // row 3 tap 0
                        "vmla.f32   q7, q9, d4[1]       \n" // row 3 tap 1
                        "vmla.f32   q4, q10, d5[0]      \n" // row 3 tap 2

                        "pld        [%5, #128]          \n"
                        "vld1.u16   {d2-d3}, [%5]       \n" // r4

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q5, q11, d5[1]      \n" // row 3 tap 3
                        "vmla.f32   q6, q12, d6[0]      \n" // row 3 tap 4

                        // kernel row 4, taps 0..3
                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d20-d23}, [%8]!    \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q7, q13, d6[1]      \n" // row 3 tap 5
                        "vmla.f32   q4, q14, d7[0]      \n" // row 3 tap 6

                        // kernel row 4, taps 4..6
                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d26-d28}, [%8]!    \n"

                        "vshll.u16  q12, d26, #16       \n"
                        "vshll.u16  q13, d27, #16       \n"
                        "vshll.u16  q14, d28, #16       \n"

                        "vmla.f32   q5, q8, d0[0]       \n" // row 4 tap 0
                        "vmla.f32   q6, q9, d0[1]       \n" // row 4 tap 1
                        "vmla.f32   q7, q10, d1[0]      \n" // row 4 tap 2

                        "pld        [%6, #128]          \n"
                        "vld1.u16   {d6-d7}, [%6]       \n" // r5

                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q4, q11, d1[1]      \n" // row 4 tap 3
                        "vmla.f32   q5, q12, d2[0]      \n" // row 4 tap 4

                        // kernel row 5, taps 0..3
                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d20-d23}, [%8]!    \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q6, q13, d2[1]      \n" // row 4 tap 5
                        "vmla.f32   q7, q14, d3[0]      \n" // row 4 tap 6

                        // kernel row 5, taps 4..6
                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d26-d28}, [%8]!    \n"

                        "vshll.u16  q12, d26, #16       \n"
                        "vshll.u16  q13, d27, #16       \n"
                        "vshll.u16  q14, d28, #16       \n"

                        "vmla.f32   q4, q8, d4[0]       \n" // row 5 tap 0
                        "vmla.f32   q5, q9, d4[1]       \n" // row 5 tap 1
                        "vmla.f32   q6, q10, d5[0]      \n" // row 5 tap 2

                        "pld        [%7, #128]          \n"
                        "vld1.u16   {d2-d3}, [%7]       \n" // r6

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q7, q11, d5[1]      \n" // row 5 tap 3
                        "vmla.f32   q4, q12, d6[0]      \n" // row 5 tap 4

                        // kernel row 6, taps 0..3
                        "pld        [%8, #256]          \n"
                        "vld1.u16   {d20-d23}, [%8]!    \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q5, q13, d6[1]      \n" // row 5 tap 5
                        "vmla.f32   q6, q14, d7[0]      \n" // row 5 tap 6

                        // kernel row 6, taps 4..6 (last weight load: 14 loads, 392 bytes in total)
                        "pld        [%8, #192]          \n"
                        "vld1.u16   {d26-d28}, [%8]!    \n"

                        "vshll.u16  q12, d26, #16       \n"
                        "vshll.u16  q13, d27, #16       \n"
                        "vshll.u16  q14, d28, #16       \n"

                        "vmla.f32   q7, q8, d0[0]       \n" // row 6 tap 0
                        "vmla.f32   q4, q9, d0[1]       \n" // row 6 tap 1

                        // pointer updates are interleaved with the remaining arithmetic;
                        // each input row pointer moves by 4 bytes, i.e. two 16-bit elements
                        "add        %1, %1, #4          \n"
                        "add        %2, %2, #4          \n"

                        "vmla.f32   q5, q10, d1[0]      \n" // row 6 tap 2
                        "vmla.f32   q6, q11, d1[1]      \n" // row 6 tap 3

                        // rewind kptr to where it started: 7 rows * (32 + 24) bytes = 392
                        "sub        %8, %8, #392        \n"

                        "vmla.f32   q7, q12, d2[0]      \n" // row 6 tap 4
                        "vmla.f32   q4, q13, d2[1]      \n" // row 6 tap 5
                        "vmla.f32   q5, q14, d3[0]      \n" // row 6 tap 6

                        "add        %3, %3, #4          \n"
                        "add        %4, %4, #4          \n"

                        // reduce the four partial sums into q4
                        "vadd.f32   q6, q6, q7          \n"

                        "add        %5, %5, #4          \n"

                        "vadd.f32   q4, q4, q5          \n"

                        "add        %6, %6, #4          \n"

                        "vadd.f32   q4, q4, q6          \n"

                        "add        %7, %7, #4          \n"

                        // write the 4 floats back and advance outptr0 by 16 bytes
                        "vst1.f32   {d8-d9}, [%0 :128]! \n"

                        // all nine pointers are modified by the asm, so each is an output
                        // tied to its own input through the matching-digit constraints below
                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14");
#endif // __aarch64__
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
                r4 += tailstep;
                r5 += tailstep;
                r6 += tailstep;
            }
        }
        for (; q < inch; q++)
        {
            unsigned short* outptr0_bf16 = top_blob.channel(p);

            float* outptr0 = out0.row(0);

            const Mat img0 = bottom_blob.channel(q);

            const unsigned short* r0 = img0.row<const unsigned short>(0);
            const unsigned short* r1 = img0.row<const unsigned short>(1);
            const unsigned short* r2 = img0.row<const unsigned short>(2);
            const unsigned short* r3 = img0.row<const unsigned short>(3);
            const unsigned short* r4 = img0.row<const unsigned short>(4);
            const unsigned short* r5 = img0.row<const unsigned short>(5);
            const unsigned short* r6 = img0.row<const unsigned short>(6);

            const unsigned short* kptr = kernel.channel(p).row<const unsigned short>(q);

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;
#if __aarch64__
                for (; j + 7 < outw; j += 8)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2], #32 \n" // r0

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"

                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[2]     \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v4.4h, v5.4h}, [%2]        \n"

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[0]     \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[1]     \n"

                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v28.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v29.4s, v4.s[3]     \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v6.4h, v7.4h, v8.4h, v9.4h}, [%3], #32 \n" // r1

                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"
                        "shll   v8.4s, v8.4h, #16           \n"
                        "shll   v9.4s, v9.4h, #16           \n"

                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v30.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v30.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v10.4h, v11.4h}, [%3]      \n"

                        "shll   v10.4s, v10.4h, #16         \n"
                        "shll   v11.4s, v11.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v6.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v8.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v8.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v9.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v9.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v6.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v7.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v8.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v8.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v9.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v9.s[3]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v7.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v8.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v8.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v9.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v9.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v10.s[0]    \n"

                        "fmla   v16.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v7.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v8.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v8.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v9.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v9.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v10.s[1]    \n"

                        "fmla   v16.4s, v28.4s, v7.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v7.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v8.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v8.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v9.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v9.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v10.s[0]    \n"
                        "fmla   v23.4s, v28.4s, v10.s[2]    \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v7.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v7.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v8.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v8.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v9.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v9.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v10.s[1]    \n"
                        "fmla   v23.4s, v29.4s, v10.s[3]    \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%4], #32 \n" // r2

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v16.4s, v30.4s, v7.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v8.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v8.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v9.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v9.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v10.s[0]    \n"
                        "fmla   v22.4s, v30.4s, v10.s[2]    \n"
                        "fmla   v23.4s, v30.4s, v11.s[0]    \n"

                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v4.4h, v5.4h}, [%4]        \n"

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[0]     \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[1]     \n"

                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v28.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v29.4s, v4.s[3]     \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v6.4h, v7.4h, v8.4h, v9.4h}, [%5], #32 \n" // r3

                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"
                        "shll   v8.4s, v8.4h, #16           \n"
                        "shll   v9.4s, v9.4h, #16           \n"

                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v30.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v30.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%5, #128]       \n"
                        "ld1    {v10.4h, v11.4h}, [%5]      \n"

                        "shll   v10.4s, v10.4h, #16         \n"
                        "shll   v11.4s, v11.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v6.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v8.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v8.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v9.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v9.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v6.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v7.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v8.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v8.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v9.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v9.s[3]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v7.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v8.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v8.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v9.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v9.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v10.s[0]    \n"

                        "fmla   v16.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v7.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v8.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v8.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v9.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v9.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v10.s[1]    \n"

                        "fmla   v16.4s, v28.4s, v7.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v7.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v8.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v8.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v9.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v9.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v10.s[0]    \n"
                        "fmla   v23.4s, v28.4s, v10.s[2]    \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v7.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v7.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v8.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v8.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v9.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v9.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v10.s[1]    \n"
                        "fmla   v23.4s, v29.4s, v10.s[3]    \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%6], #32 \n" // r4

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v16.4s, v30.4s, v7.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v8.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v8.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v9.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v9.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v10.s[0]    \n"
                        "fmla   v22.4s, v30.4s, v10.s[2]    \n"
                        "fmla   v23.4s, v30.4s, v11.s[0]    \n"

                        "prfm   pldl1keep, [%6, #128]       \n"
                        "ld1    {v4.4h, v5.4h}, [%6]        \n"

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[0]     \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[1]     \n"

                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v28.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v29.4s, v4.s[3]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v6.4h, v7.4h, v8.4h, v9.4h}, [%7], #32 \n" // r5

                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"
                        "shll   v8.4s, v8.4h, #16           \n"
                        "shll   v9.4s, v9.4h, #16           \n"

                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v30.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v30.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%7, #128]       \n"
                        "ld1    {v10.4h, v11.4h}, [%7]      \n"

                        "shll   v10.4s, v10.4h, #16         \n"
                        "shll   v11.4s, v11.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v6.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v6.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v7.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v7.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v8.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v8.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v9.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v9.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v6.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v6.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v7.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v7.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v8.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v8.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v9.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v9.s[3]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v26.4s, v6.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v7.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v7.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v8.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v8.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v9.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v9.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v10.s[0]    \n"

                        "fmla   v16.4s, v27.4s, v6.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v7.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v7.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v8.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v8.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v9.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v9.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v10.s[1]    \n"

                        "fmla   v16.4s, v28.4s, v7.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v7.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v8.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v8.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v9.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v9.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v10.s[0]    \n"
                        "fmla   v23.4s, v28.4s, v10.s[2]    \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v7.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v7.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v8.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v8.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v9.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v9.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v10.s[1]    \n"
                        "fmla   v23.4s, v29.4s, v10.s[3]    \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%8], #32 \n" // r6

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v16.4s, v30.4s, v7.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v8.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v8.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v9.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v9.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v10.s[0]    \n"
                        "fmla   v22.4s, v30.4s, v10.s[2]    \n"
                        "fmla   v23.4s, v30.4s, v11.s[0]    \n"

                        "prfm   pldl1keep, [%8, #128]       \n"
                        "ld1    {v4.4h, v5.4h}, [%8]        \n"

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v20.4s, v24.4s, v2.s[0]     \n"
                        "fmla   v21.4s, v24.4s, v2.s[2]     \n"
                        "fmla   v22.4s, v24.4s, v3.s[0]     \n"
                        "fmla   v23.4s, v24.4s, v3.s[2]     \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v20.4s, v25.4s, v2.s[1]     \n"
                        "fmla   v21.4s, v25.4s, v2.s[3]     \n"
                        "fmla   v22.4s, v25.4s, v3.s[1]     \n"
                        "fmla   v23.4s, v25.4s, v3.s[3]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"
                        "fmla   v20.4s, v26.4s, v2.s[2]     \n"
                        "fmla   v21.4s, v26.4s, v3.s[0]     \n"
                        "fmla   v22.4s, v26.4s, v3.s[2]     \n"
                        "fmla   v23.4s, v26.4s, v4.s[0]     \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v20.4s, v27.4s, v2.s[3]     \n"
                        "fmla   v21.4s, v27.4s, v3.s[1]     \n"
                        "fmla   v22.4s, v27.4s, v3.s[3]     \n"
                        "fmla   v23.4s, v27.4s, v4.s[1]     \n"

                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"
                        "fmla   v20.4s, v28.4s, v3.s[0]     \n"
                        "fmla   v21.4s, v28.4s, v3.s[2]     \n"
                        "fmla   v22.4s, v28.4s, v4.s[0]     \n"
                        "fmla   v23.4s, v28.4s, v4.s[2]     \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v20.4s, v29.4s, v3.s[1]     \n"
                        "fmla   v21.4s, v29.4s, v3.s[3]     \n"
                        "fmla   v22.4s, v29.4s, v4.s[1]     \n"
                        "fmla   v23.4s, v29.4s, v4.s[3]     \n"

                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"
                        "fmla   v20.4s, v30.4s, v3.s[2]     \n"
                        "fmla   v21.4s, v30.4s, v4.s[0]     \n"
                        "fmla   v22.4s, v30.4s, v4.s[2]     \n"
                        "fmla   v23.4s, v30.4s, v5.s[0]     \n"

                        "sub    %9, %9, #392                \n"

                        "shrn   v16.4h, v16.4s, #16         \n"
                        "shrn   v17.4h, v17.4s, #16         \n"
                        "shrn   v18.4h, v18.4s, #16         \n"
                        "shrn   v19.4h, v19.4s, #16         \n"
                        "shrn   v20.4h, v20.4s, #16         \n"
                        "shrn   v21.4h, v21.4s, #16         \n"
                        "shrn   v22.4h, v22.4s, #16         \n"
                        "shrn   v23.4h, v23.4s, #16         \n"

                        "st1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%0], #32 \n"
                        "st1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%0], #32 \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(r5),           // %7
                        "=r"(r6),           // %8
                        "=r"(kptr)          // %9
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(r5),
                        "8"(r6),
                        "9"(kptr)
                        // Clobbers: every vector register written by the asm body must be listed.
                        // v16-v23 are the eight accumulators (fmla / shrn / st1 above); v20-v23
                        // were previously missing, which allowed the compiler to keep live
                        // values in them across this statement.
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
                }
#endif // __aarch64__
                for (; j + 3 < outw; j += 4)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%1, #512]       \n"
                        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2] \n" // r0

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%3] \n" // r1

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v5.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%4] \n" // r2

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v6.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v6.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v6.s[2]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v6.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v6.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v6.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v7.s[0]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%5] \n" // r3

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v5.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%6] \n" // r4

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v6.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v6.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v6.s[2]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v6.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v6.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v6.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v7.s[0]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%7] \n" // r5

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"
                        "shll   v7.4s, v7.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v5.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v5.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v5.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%8, #256]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%8] \n" // r6

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"
                        "shll   v3.4s, v3.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v5.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v6.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v6.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v6.s[2]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v6.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v6.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v6.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v7.s[0]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v18.4s, v24.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v1.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v25.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v1.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v18.4s, v26.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v2.s[0]     \n"
                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v27.4s, v1.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v2.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v18.4s, v28.4s, v2.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v2.s[2]     \n"
                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v29.4s, v2.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v2.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v2.s[0]     \n"
                        "fmla   v18.4s, v30.4s, v2.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v3.s[0]     \n"

                        "add    %2, %2, #16                 \n"
                        "add    %3, %3, #16                 \n"
                        "add    %4, %4, #16                 \n"
                        "add    %5, %5, #16                 \n"
                        "add    %6, %6, #16                 \n"
                        "add    %7, %7, #16                 \n"
                        "add    %8, %8, #16                 \n"

                        "sub    %9, %9, #392                \n"

                        "shrn   v16.4h, v16.4s, #16         \n"
                        "shrn   v17.4h, v17.4s, #16         \n"
                        "shrn   v18.4h, v18.4s, #16         \n"
                        "shrn   v19.4h, v19.4s, #16         \n"

                        "st1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%0], #32 \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(r5),           // %7
                        "=r"(r6),           // %8
                        "=r"(kptr)          // %9
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(r5),
                        "8"(r6),
                        "9"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
#else  // __aarch64__
                    // ARMv7 NEON path: produces four 4-lane output vectors in one invocation.
                    //
                    // Notation used in the comments below, per input row:
                    //   x[0..12] : consecutive 16-bit input elements starting at the row pointer
                    //   k[0..6]  : the seven 4-lane kernel vectors read from %9 for that row
                    // Each tap t accumulates
                    //   q12 += k[t] * x[t]      q13 += k[t] * x[t + 2]
                    //   q14 += k[t] * x[t + 4]  q15 += k[t] * x[t + 6]
                    // i.e. the four outputs are two input elements apart.
                    //
                    // All 16-bit values are widened with a left shift by 16 so that they occupy
                    // the upper half of a 32-bit lane (presumably bf16 -> fp32, going by the
                    // outptr0_bf16 operand name), and results are narrowed again by keeping the
                    // upper 16 bits of each lane (truncation, no rounding).
                    //
                    // Register roles:
                    //   q12-q15 : accumulators, initialised from %1
                    //   q0-q2   : x[0..11] widened; d6[0] holds x[12]
                    //   q5-q8   : k[0..3], q9-q11 : k[4..6]
                    asm volatile(
                        // load the four initial accumulator vectors (64 bytes) and advance %1
                        "pld        [%1, #512]          \n"
                        "vldm       %1!, {d24-d31}      \n"

                        // ---- row r0 (%2) ----
                        // x[0..7]: load 8 elements and advance the row pointer by 16 bytes
                        "pld        [%2, #128]          \n"
                        "vld1.u16   {d2-d3}, [%2]!      \n" // r0

                        // q0 = x[0..3], q1 = x[4..7]
                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        // k[0..3]: the raw data lands in d14-d17 (= q7/q8); widening in the order
                        // q5, q6, q7, q8 consumes each source d register before it is overwritten
                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d14-d17}, [%9]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        // tap 0: x[0], x[2], x[4], x[6]
                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q5, d2[0]      \n"
                        "vmla.f32   q15, q5, d3[0]      \n"

                        // x[8..15]: peek at the next 8 elements without advancing the pointer
                        "pld        [%2, #128]          \n"
                        "vld1.u16   {d5-d6}, [%2]       \n"

                        // q2 = x[8..11]; d6 is shifted in place as 2 x u32, which leaves x[12]
                        // in d6[0] (only that lane is used afterwards)
                        "vshll.u16  q2, d5, #16         \n"
                        "vshl.u32   d6, d6, #16         \n"

                        // tap 1: x[1], x[3], x[5], x[7]
                        "vmla.f32   q12, q6, d0[1]      \n"
                        "vmla.f32   q13, q6, d1[1]      \n"
                        "vmla.f32   q14, q6, d2[1]      \n"
                        "vmla.f32   q15, q6, d3[1]      \n"

                        // k[4..6]: raw data in d20-d22, widened into q9, q10, q11 in an order that
                        // reads each source d register before it is overwritten
                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d20-d22}, [%9]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        // tap 2: x[2], x[4], x[6], x[8]
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"
                        "vmla.f32   q14, q7, d3[0]      \n"
                        "vmla.f32   q15, q7, d4[0]      \n"
                        // tap 3: x[3], x[5], x[7], x[9]
                        "vmla.f32   q12, q8, d1[1]      \n"
                        "vmla.f32   q13, q8, d2[1]      \n"
                        "vmla.f32   q14, q8, d3[1]      \n"
                        "vmla.f32   q15, q8, d4[1]      \n"
                        // tap 4: x[4], x[6], x[8], x[10]
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"
                        "vmla.f32   q14, q9, d4[0]      \n"
                        "vmla.f32   q15, q9, d5[0]      \n"

                        // q5-q8 are no longer needed for this row: preload k[0..3] of the next row
                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d14-d17}, [%9]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        // tap 5: x[5], x[7], x[9], x[11]
                        "vmla.f32   q12, q10, d2[1]     \n"
                        "vmla.f32   q13, q10, d3[1]     \n"
                        "vmla.f32   q14, q10, d4[1]     \n"
                        "vmla.f32   q15, q10, d5[1]     \n"
                        // tap 6, first half: x[6], x[8] (must run before q0/q1 are reloaded)
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d4[0]     \n"

                        // ---- row r1 (%3) ----
                        "pld        [%3, #128]          \n"
                        "vld1.u16   {d2-d3}, [%3]!      \n" // r1

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        // tap 6 of the previous row, second half: x[10], x[12]; these live in
                        // d5/d6, which the reload of q0/q1 above did not touch
                        "vmla.f32   q14, q11, d5[0]     \n"
                        "vmla.f32   q15, q11, d6[0]     \n"

                        // tap 0
                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q5, d2[0]      \n"
                        "vmla.f32   q15, q5, d3[0]      \n"

                        // x[8..12] of this row
                        "pld        [%3, #128]          \n"
                        "vld1.u16   {d5-d6}, [%3]       \n"

                        "vshll.u16  q2, d5, #16         \n"
                        "vshl.u32   d6, d6, #16         \n"

                        // tap 1
                        "vmla.f32   q12, q6, d0[1]      \n"
                        "vmla.f32   q13, q6, d1[1]      \n"
                        "vmla.f32   q14, q6, d2[1]      \n"
                        "vmla.f32   q15, q6, d3[1]      \n"

                        // k[4..6] of this row
                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d20-d22}, [%9]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        // taps 2, 3, 4
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"
                        "vmla.f32   q14, q7, d3[0]      \n"
                        "vmla.f32   q15, q7, d4[0]      \n"
                        "vmla.f32   q12, q8, d1[1]      \n"
                        "vmla.f32   q13, q8, d2[1]      \n"
                        "vmla.f32   q14, q8, d3[1]      \n"
                        "vmla.f32   q15, q8, d4[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"
                        "vmla.f32   q14, q9, d4[0]      \n"
                        "vmla.f32   q15, q9, d5[0]      \n"

                        // preload k[0..3] of the next row
                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d14-d17}, [%9]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        // tap 5 and first half of tap 6
                        "vmla.f32   q12, q10, d2[1]     \n"
                        "vmla.f32   q13, q10, d3[1]     \n"
                        "vmla.f32   q14, q10, d4[1]     \n"
                        "vmla.f32   q15, q10, d5[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d4[0]     \n"

                        // ---- row r2 (%4) ----
                        "pld        [%4, #128]          \n"
                        "vld1.u16   {d2-d3}, [%4]!      \n" // r2

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        // second half of tap 6 of the previous row
                        "vmla.f32   q14, q11, d5[0]     \n"
                        "vmla.f32   q15, q11, d6[0]     \n"

                        // tap 0
                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q5, d2[0]      \n"
                        "vmla.f32   q15, q5, d3[0]      \n"

                        // x[8..12] of this row
                        "pld        [%4, #128]          \n"
                        "vld1.u16   {d5-d6}, [%4]       \n"

                        "vshll.u16  q2, d5, #16         \n"
                        "vshl.u32   d6, d6, #16         \n"

                        // tap 1
                        "vmla.f32   q12, q6, d0[1]      \n"
                        "vmla.f32   q13, q6, d1[1]      \n"
                        "vmla.f32   q14, q6, d2[1]      \n"
                        "vmla.f32   q15, q6, d3[1]      \n"

                        // k[4..6] of this row
                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d20-d22}, [%9]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        // taps 2, 3, 4
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"
                        "vmla.f32   q14, q7, d3[0]      \n"
                        "vmla.f32   q15, q7, d4[0]      \n"
                        "vmla.f32   q12, q8, d1[1]      \n"
                        "vmla.f32   q13, q8, d2[1]      \n"
                        "vmla.f32   q14, q8, d3[1]      \n"
                        "vmla.f32   q15, q8, d4[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"
                        "vmla.f32   q14, q9, d4[0]      \n"
                        "vmla.f32   q15, q9, d5[0]      \n"

                        // preload k[0..3] of the next row
                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d14-d17}, [%9]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        // tap 5 and first half of tap 6
                        "vmla.f32   q12, q10, d2[1]     \n"
                        "vmla.f32   q13, q10, d3[1]     \n"
                        "vmla.f32   q14, q10, d4[1]     \n"
                        "vmla.f32   q15, q10, d5[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d4[0]     \n"

                        // ---- row r3 (%5) ----
                        "pld        [%5, #128]          \n"
                        "vld1.u16   {d2-d3}, [%5]!      \n" // r3

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        // second half of tap 6 of the previous row
                        "vmla.f32   q14, q11, d5[0]     \n"
                        "vmla.f32   q15, q11, d6[0]     \n"

                        // tap 0
                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q5, d2[0]      \n"
                        "vmla.f32   q15, q5, d3[0]      \n"

                        // x[8..12] of this row
                        "pld        [%5, #128]          \n"
                        "vld1.u16   {d5-d6}, [%5]       \n"

                        "vshll.u16  q2, d5, #16         \n"
                        "vshl.u32   d6, d6, #16         \n"

                        // tap 1
                        "vmla.f32   q12, q6, d0[1]      \n"
                        "vmla.f32   q13, q6, d1[1]      \n"
                        "vmla.f32   q14, q6, d2[1]      \n"
                        "vmla.f32   q15, q6, d3[1]      \n"

                        // k[4..6] of this row
                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d20-d22}, [%9]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        // taps 2, 3, 4
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"
                        "vmla.f32   q14, q7, d3[0]      \n"
                        "vmla.f32   q15, q7, d4[0]      \n"
                        "vmla.f32   q12, q8, d1[1]      \n"
                        "vmla.f32   q13, q8, d2[1]      \n"
                        "vmla.f32   q14, q8, d3[1]      \n"
                        "vmla.f32   q15, q8, d4[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"
                        "vmla.f32   q14, q9, d4[0]      \n"
                        "vmla.f32   q15, q9, d5[0]      \n"

                        // preload k[0..3] of the next row
                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d14-d17}, [%9]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        // tap 5 and first half of tap 6
                        "vmla.f32   q12, q10, d2[1]     \n"
                        "vmla.f32   q13, q10, d3[1]     \n"
                        "vmla.f32   q14, q10, d4[1]     \n"
                        "vmla.f32   q15, q10, d5[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d4[0]     \n"

                        // ---- row r4 (%6) ----
                        "pld        [%6, #128]          \n"
                        "vld1.u16   {d2-d3}, [%6]!      \n" // r4

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        // second half of tap 6 of the previous row
                        "vmla.f32   q14, q11, d5[0]     \n"
                        "vmla.f32   q15, q11, d6[0]     \n"

                        // tap 0
                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q5, d2[0]      \n"
                        "vmla.f32   q15, q5, d3[0]      \n"

                        // x[8..12] of this row
                        "pld        [%6, #128]          \n"
                        "vld1.u16   {d5-d6}, [%6]       \n"

                        "vshll.u16  q2, d5, #16         \n"
                        "vshl.u32   d6, d6, #16         \n"

                        // tap 1
                        "vmla.f32   q12, q6, d0[1]      \n"
                        "vmla.f32   q13, q6, d1[1]      \n"
                        "vmla.f32   q14, q6, d2[1]      \n"
                        "vmla.f32   q15, q6, d3[1]      \n"

                        // k[4..6] of this row
                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d20-d22}, [%9]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        // taps 2, 3, 4
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"
                        "vmla.f32   q14, q7, d3[0]      \n"
                        "vmla.f32   q15, q7, d4[0]      \n"
                        "vmla.f32   q12, q8, d1[1]      \n"
                        "vmla.f32   q13, q8, d2[1]      \n"
                        "vmla.f32   q14, q8, d3[1]      \n"
                        "vmla.f32   q15, q8, d4[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"
                        "vmla.f32   q14, q9, d4[0]      \n"
                        "vmla.f32   q15, q9, d5[0]      \n"

                        // preload k[0..3] of the next row
                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d14-d17}, [%9]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        // tap 5 and first half of tap 6
                        "vmla.f32   q12, q10, d2[1]     \n"
                        "vmla.f32   q13, q10, d3[1]     \n"
                        "vmla.f32   q14, q10, d4[1]     \n"
                        "vmla.f32   q15, q10, d5[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d4[0]     \n"

                        // ---- row r5 (%7) ----
                        "pld        [%7, #128]          \n"
                        "vld1.u16   {d2-d3}, [%7]!      \n" // r5

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        // second half of tap 6 of the previous row
                        "vmla.f32   q14, q11, d5[0]     \n"
                        "vmla.f32   q15, q11, d6[0]     \n"

                        // tap 0
                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q5, d2[0]      \n"
                        "vmla.f32   q15, q5, d3[0]      \n"

                        // x[8..12] of this row
                        "pld        [%7, #128]          \n"
                        "vld1.u16   {d5-d6}, [%7]       \n"

                        "vshll.u16  q2, d5, #16         \n"
                        "vshl.u32   d6, d6, #16         \n"

                        // tap 1
                        "vmla.f32   q12, q6, d0[1]      \n"
                        "vmla.f32   q13, q6, d1[1]      \n"
                        "vmla.f32   q14, q6, d2[1]      \n"
                        "vmla.f32   q15, q6, d3[1]      \n"

                        // k[4..6] of this row
                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d20-d22}, [%9]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        // taps 2, 3, 4
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"
                        "vmla.f32   q14, q7, d3[0]      \n"
                        "vmla.f32   q15, q7, d4[0]      \n"
                        "vmla.f32   q12, q8, d1[1]      \n"
                        "vmla.f32   q13, q8, d2[1]      \n"
                        "vmla.f32   q14, q8, d3[1]      \n"
                        "vmla.f32   q15, q8, d4[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"
                        "vmla.f32   q14, q9, d4[0]      \n"
                        "vmla.f32   q15, q9, d5[0]      \n"

                        // preload k[0..3] of the last row
                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d14-d17}, [%9]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        // tap 5 and first half of tap 6
                        "vmla.f32   q12, q10, d2[1]     \n"
                        "vmla.f32   q13, q10, d3[1]     \n"
                        "vmla.f32   q14, q10, d4[1]     \n"
                        "vmla.f32   q15, q10, d5[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d4[0]     \n"

                        // ---- row r6 (%8), last row ----
                        "pld        [%8, #128]          \n"
                        "vld1.u16   {d2-d3}, [%8]!      \n" // r6

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        // second half of tap 6 of the previous row
                        "vmla.f32   q14, q11, d5[0]     \n"
                        "vmla.f32   q15, q11, d6[0]     \n"

                        // tap 0
                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q5, d2[0]      \n"
                        "vmla.f32   q15, q5, d3[0]      \n"

                        // x[8..12] of this row
                        "pld        [%8, #128]          \n"
                        "vld1.u16   {d5-d6}, [%8]       \n"

                        "vshll.u16  q2, d5, #16         \n"
                        "vshl.u32   d6, d6, #16         \n"

                        // tap 1
                        "vmla.f32   q12, q6, d0[1]      \n"
                        "vmla.f32   q13, q6, d1[1]      \n"
                        "vmla.f32   q14, q6, d2[1]      \n"
                        "vmla.f32   q15, q6, d3[1]      \n"

                        // k[4..6] of this row
                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d20-d22}, [%9]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        // taps 2..6; there is no following row, so nothing is interleaved here
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"
                        "vmla.f32   q14, q7, d3[0]      \n"
                        "vmla.f32   q15, q7, d4[0]      \n"
                        "vmla.f32   q12, q8, d1[1]      \n"
                        "vmla.f32   q13, q8, d2[1]      \n"
                        "vmla.f32   q14, q8, d3[1]      \n"
                        "vmla.f32   q15, q8, d4[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"
                        "vmla.f32   q14, q9, d4[0]      \n"
                        "vmla.f32   q15, q9, d5[0]      \n"
                        "vmla.f32   q12, q10, d2[1]     \n"
                        "vmla.f32   q13, q10, d3[1]     \n"
                        "vmla.f32   q14, q10, d4[1]     \n"
                        "vmla.f32   q15, q10, d5[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d4[0]     \n"
                        "vmla.f32   q14, q11, d5[0]     \n"
                        "vmla.f32   q15, q11, d6[0]     \n"

                        // rewind the kernel pointer: 7 rows x (32 + 24) bytes were consumed
                        "sub        %9, %9, #392        \n"

                        // keep the upper 16 bits of every fp32 lane (truncating narrow)
                        "vshrn.u32  d24, q12, #16       \n"
                        "vshrn.u32  d25, q13, #16       \n"
                        "vshrn.u32  d26, q14, #16       \n"
                        "vshrn.u32  d27, q15, #16       \n"

                        // store the 16 narrowed values (32 bytes) and advance %0
                        "vst1.u16   {d24-d27}, [%0]!    \n"

                        // Every pointer is both read and updated: each "=r" output is tied to
                        // the same-numbered input below through a matching constraint.
                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(r5),           // %7
                        "=r"(r6),           // %8
                        "=r"(kptr)          // %9
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(r5),
                        "8"(r6),
                        "9"(kptr)
                        // NOTE(review): q4 is declared clobbered although no instruction above
                        // references it (and of q3 only d6 is written); harmless but conservative.
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                // Two output columns per iteration, each output being a vector of 4 values.
                //
                // Operands: %0 = outptr0_bf16 (16-bit results are stored here),
                //           %1 = outptr0 (fp32 starting accumulators are read from here),
                //           %2-%8 = r0-r6 (the seven input rows), %9 = kptr (kernel taps).
                //
                // For every row, seven 4-wide taps k0..k6 are fetched from kptr and nine
                // input columns are used: output 0 accumulates k[i] * column[i] and
                // output 1 accumulates k[i] * column[i + 2], for i = 0..6.
                // All 16-bit inputs and taps are widened by a 16-bit left shift and the
                // result is narrowed by a 16-bit right shift (the bf16 <-> fp32 bit
                // layout, going by the outptr0_bf16 name).
                // Each output is split over two accumulators that are summed at the end.
                for (; j + 1 < outw; j += 2)
                {
#if __aarch64__
                    asm volatile(
                        // v16/v17 <- starting accumulators for output 0 / output 1, %1 advances by 32 bytes
                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v16.4s, v17.4s}, [%1], #32 \n"

                        // twelve 16-bit values are loaded from the row, but only columns 0..8
                        // (v0, v1 and v2.s[0]) are referenced by the multiplies below
                        "prfm   pldl1keep, [%2, #192]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h}, [%2] \n" // r0

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        // v24-v27 <- taps k0..k3 of this kernel row (32 bytes)
                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        // v18/v19 (second partial sums of output 0 / output 1) are started
                        // with fmul, so they are never explicitly zeroed
                        "fmul   v18.4s, v24.4s, v0.s[0]     \n"
                        "fmul   v19.4s, v24.4s, v0.s[2]     \n"

                        // v28-v30 <- taps k4..k6 of this kernel row (24 bytes)
                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v1.s[0]     \n"

                        // the next input row goes to v4-v6 while v0-v2 are still in use;
                        // rows alternate between these two register groups
                        "prfm   pldl1keep, [%3, #192]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h}, [%3] \n" // r1

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%4, #192]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h}, [%4] \n" // r2

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v18.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v18.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v18.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v1.s[0]     \n"

                        "prfm   pldl1keep, [%5, #192]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h}, [%5] \n" // r3

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%6, #192]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h}, [%6] \n" // r4

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v18.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v18.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v18.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v1.s[0]     \n"

                        "prfm   pldl1keep, [%7, #192]       \n"
                        "ld1    {v4.4h, v5.4h, v6.4h}, [%7] \n" // r5

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"
                        "shll   v6.4s, v6.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v2.s[0]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v24.4s, v4.s[2]     \n"
                        "fmla   v18.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v19.4s, v25.4s, v4.s[3]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"
                        "fmla   v17.4s, v26.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%8, #192]       \n"
                        "ld1    {v0.4h, v1.4h, v2.4h}, [%8] \n" // r6

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"
                        "shll   v2.4s, v2.4h, #16           \n"

                        "fmla   v18.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v19.4s, v27.4s, v5.s[1]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"
                        "fmla   v17.4s, v28.4s, v5.s[2]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v18.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v19.4s, v29.4s, v5.s[3]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"
                        "fmla   v17.4s, v30.4s, v6.s[0]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        // last row (r6): nothing is left to load, so all seven taps are applied back to back
                        "fmla   v18.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v19.4s, v24.4s, v0.s[2]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v25.4s, v0.s[3]     \n"
                        "fmla   v18.4s, v26.4s, v0.s[2]     \n"
                        "fmla   v19.4s, v26.4s, v1.s[0]     \n"
                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v27.4s, v1.s[1]     \n"
                        "fmla   v18.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[2]     \n"
                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v29.4s, v1.s[3]     \n"
                        "fmla   v18.4s, v30.4s, v1.s[2]     \n"
                        "fmla   v19.4s, v30.4s, v2.s[0]     \n"

                        // the row loads above did not post-increment; step every row
                        // pointer by 8 bytes (four 16-bit columns) for the next output pair
                        "add    %2, %2, #8                  \n"
                        "add    %3, %3, #8                  \n"
                        "add    %4, %4, #8                  \n"
                        "add    %5, %5, #8                  \n"
                        "add    %6, %6, #8                  \n"
                        "add    %7, %7, #8                  \n"
                        "add    %8, %8, #8                  \n"

                        // merge the two partial sums of each output
                        "fadd   v16.4s, v16.4s, v18.4s      \n"
                        "fadd   v17.4s, v17.4s, v19.4s      \n"

                        // rewind kptr: 7 rows x (32 + 24) bytes were consumed above
                        "sub    %9, %9, #392                \n"

                        // keep the upper 16 bits of every 32-bit lane (plain shift, no rounding)
                        "shrn   v16.4h, v16.4s, #16         \n"
                        "shrn   v17.4h, v17.4s, #16         \n"

                        "st1    {v16.4h, v17.4h}, [%0], #16 \n"

                        // every pointer is read and updated by the asm: outputs %0-%9 are
                        // tied to the input operands through the matching constraints below
                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(r5),           // %7
                        "=r"(r6),           // %8
                        "=r"(kptr)          // %9
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(r5),
                        "8"(r6),
                        "9"(kptr)
                        : "memory", "v0", "v1", "v2", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
#else  // __aarch64__
                    asm volatile(
                        // q14/q15 <- starting accumulators for output 0 / output 1, %1 advances by 32 bytes
                        "pld        [%1, #256]          \n"
                        "vld1.f32   {d28-d31}, [%1 :128]! \n"

                        // columns 0..7 go to d2-d3 (post-incrementing the row pointer by
                        // 16 bytes), then the single column 8 is loaded into lane 0 of d8
                        "pld        [%2, #128]          \n"
                        "vld1.u16   {d2-d3}, [%2]!      \n" // r0
                        "vld1.u16   {d8[0]}, [%2]       \n"

                        // q0/q1 <- widened columns 0..3 / 4..7; for d8 the 32-bit shift moves
                        // the loaded 16-bit value into the upper half of 32-bit lane 0
                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vshl.u32   d8, d8, #16         \n"

                        // q5-q8 <- taps k0..k3 of this kernel row (32 bytes)
                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d14-d17}, [%9]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        // q12/q13 (second partial sums of output 0 / output 1) are started
                        // with vmul, so they are never explicitly zeroed
                        "vmul.f32   q12, q5, d0[0]      \n"
                        "vmul.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q6, d0[1]      \n"
                        "vmla.f32   q15, q6, d1[1]      \n"

                        // q9-q11 <- taps k4..k6 of this kernel row (24 bytes)
                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d20-d22}, [%9]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"

                        // the next input row goes to q2/q3 + d9[0] while q0/q1 + d8[0] are
                        // still in use; rows alternate between these two register groups
                        "pld        [%3, #128]          \n"
                        "vld1.u16   {d6-d7}, [%3]!      \n" // r1
                        "vld1.u16   {d9[0]}, [%3]       \n"

                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshl.u32   d9, d9, #16         \n"

                        "vmla.f32   q14, q8, d1[1]      \n"
                        "vmla.f32   q15, q8, d2[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"

                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d14-d17}, [%9]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        "vmla.f32   q14, q10, d2[1]     \n"
                        "vmla.f32   q15, q10, d3[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d8[0]     \n"

                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d20-d22}, [%9]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        "vmla.f32   q14, q5, d4[0]      \n"
                        "vmla.f32   q15, q5, d5[0]      \n"
                        "vmla.f32   q12, q6, d4[1]      \n"
                        "vmla.f32   q13, q6, d5[1]      \n"
                        "vmla.f32   q14, q7, d5[0]      \n"
                        "vmla.f32   q15, q7, d6[0]      \n"

                        "pld        [%4, #128]          \n"
                        "vld1.u16   {d2-d3}, [%4]!      \n" // r2
                        "vld1.u16   {d8[0]}, [%4]       \n"

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vshl.u32   d8, d8, #16         \n"

                        "vmla.f32   q12, q8, d5[1]      \n"
                        "vmla.f32   q13, q8, d6[1]      \n"
                        "vmla.f32   q14, q9, d6[0]      \n"
                        "vmla.f32   q15, q9, d7[0]      \n"

                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d14-d17}, [%9]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        "vmla.f32   q12, q10, d6[1]     \n"
                        "vmla.f32   q13, q10, d7[1]     \n"
                        "vmla.f32   q14, q11, d7[0]     \n"
                        "vmla.f32   q15, q11, d9[0]     \n"

                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d20-d22}, [%9]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q6, d0[1]      \n"
                        "vmla.f32   q15, q6, d1[1]      \n"
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"

                        "pld        [%5, #128]          \n"
                        "vld1.u16   {d6-d7}, [%5]!      \n" // r3
                        "vld1.u16   {d9[0]}, [%5]       \n"

                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshl.u32   d9, d9, #16         \n"

                        "vmla.f32   q14, q8, d1[1]      \n"
                        "vmla.f32   q15, q8, d2[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"

                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d14-d17}, [%9]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        "vmla.f32   q14, q10, d2[1]     \n"
                        "vmla.f32   q15, q10, d3[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d8[0]     \n"

                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d20-d22}, [%9]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        "vmla.f32   q14, q5, d4[0]      \n"
                        "vmla.f32   q15, q5, d5[0]      \n"
                        "vmla.f32   q12, q6, d4[1]      \n"
                        "vmla.f32   q13, q6, d5[1]      \n"
                        "vmla.f32   q14, q7, d5[0]      \n"
                        "vmla.f32   q15, q7, d6[0]      \n"

                        "pld        [%6, #128]          \n"
                        "vld1.u16   {d2-d3}, [%6]!      \n" // r4
                        "vld1.u16   {d8[0]}, [%6]       \n"

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vshl.u32   d8, d8, #16         \n"

                        "vmla.f32   q12, q8, d5[1]      \n"
                        "vmla.f32   q13, q8, d6[1]      \n"
                        "vmla.f32   q14, q9, d6[0]      \n"
                        "vmla.f32   q15, q9, d7[0]      \n"

                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d14-d17}, [%9]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        "vmla.f32   q12, q10, d6[1]     \n"
                        "vmla.f32   q13, q10, d7[1]     \n"
                        "vmla.f32   q14, q11, d7[0]     \n"
                        "vmla.f32   q15, q11, d9[0]     \n"

                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d20-d22}, [%9]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q6, d0[1]      \n"
                        "vmla.f32   q15, q6, d1[1]      \n"
                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"

                        "pld        [%7, #128]          \n"
                        "vld1.u16   {d6-d7}, [%7]!      \n" // r5
                        "vld1.u16   {d9[0]}, [%7]       \n"

                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"
                        "vshl.u32   d9, d9, #16         \n"

                        "vmla.f32   q14, q8, d1[1]      \n"
                        "vmla.f32   q15, q8, d2[1]      \n"
                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"

                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d14-d17}, [%9]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        "vmla.f32   q14, q10, d2[1]     \n"
                        "vmla.f32   q15, q10, d3[1]     \n"
                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d8[0]     \n"

                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d20-d22}, [%9]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        "vmla.f32   q14, q5, d4[0]      \n"
                        "vmla.f32   q15, q5, d5[0]      \n"
                        "vmla.f32   q12, q6, d4[1]      \n"
                        "vmla.f32   q13, q6, d5[1]      \n"
                        "vmla.f32   q14, q7, d5[0]      \n"
                        "vmla.f32   q15, q7, d6[0]      \n"

                        "pld        [%8, #128]          \n"
                        "vld1.u16   {d2-d3}, [%8]!      \n" // r6
                        "vld1.u16   {d8[0]}, [%8]       \n"

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"
                        "vshl.u32   d8, d8, #16         \n"

                        "vmla.f32   q12, q8, d5[1]      \n"
                        "vmla.f32   q13, q8, d6[1]      \n"
                        "vmla.f32   q14, q9, d6[0]      \n"
                        "vmla.f32   q15, q9, d7[0]      \n"

                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d14-d17}, [%9]!    \n"

                        "vshll.u16  q5, d14, #16        \n"
                        "vshll.u16  q6, d15, #16        \n"
                        "vshll.u16  q7, d16, #16        \n"
                        "vshll.u16  q8, d17, #16        \n"

                        "vmla.f32   q12, q10, d6[1]     \n"
                        "vmla.f32   q13, q10, d7[1]     \n"
                        "vmla.f32   q14, q11, d7[0]     \n"
                        "vmla.f32   q15, q11, d9[0]     \n"

                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d20-d22}, [%9]!    \n"

                        "vshll.u16  q9, d20, #16        \n"
                        "vshll.u16  q10, d21, #16       \n"
                        "vshll.u16  q11, d22, #16       \n"

                        // last row (r6): the pointer fix-ups are interleaved with the remaining multiplies
                        "vmla.f32   q12, q5, d0[0]      \n"
                        "vmla.f32   q13, q5, d1[0]      \n"
                        "vmla.f32   q14, q6, d0[1]      \n"
                        "vmla.f32   q15, q6, d1[1]      \n"

                        // each row pointer was post-incremented by 16 bytes by its load;
                        // subtracting 8 leaves a net step of 8 bytes (four 16-bit columns)
                        "sub        %2, %2, #8          \n"
                        "sub        %3, %3, #8          \n"

                        "vmla.f32   q12, q7, d1[0]      \n"
                        "vmla.f32   q13, q7, d2[0]      \n"
                        "vmla.f32   q14, q8, d1[1]      \n"
                        "vmla.f32   q15, q8, d2[1]      \n"

                        // rewind kptr: 7 rows x (32 + 24) bytes were consumed above
                        "sub        %9, %9, #392        \n"

                        "vmla.f32   q12, q9, d2[0]      \n"
                        "vmla.f32   q13, q9, d3[0]      \n"
                        "vmla.f32   q14, q10, d2[1]     \n"
                        "vmla.f32   q15, q10, d3[1]     \n"

                        "sub        %4, %4, #8          \n"
                        "sub        %5, %5, #8          \n"

                        "vmla.f32   q12, q11, d3[0]     \n"
                        "vmla.f32   q13, q11, d8[0]     \n"

                        "sub        %6, %6, #8          \n"
                        "sub        %7, %7, #8          \n"

                        // merge the two partial sums of each output
                        "vadd.f32   q14, q14, q12       \n"
                        "vadd.f32   q15, q15, q13       \n"

                        "sub        %8, %8, #8          \n"

                        // keep the upper 16 bits of every 32-bit lane (plain shift, no rounding)
                        "vshrn.u32  d28, q14, #16       \n"
                        "vshrn.u32  d29, q15, #16       \n"

                        "vst1.u16   {d28-d29}, [%0 :64]! \n"

                        // every pointer is read and updated by the asm: outputs %0-%9 are
                        // tied to the input operands through the matching constraints below
                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(r5),           // %7
                        "=r"(r6),           // %8
                        "=r"(kptr)          // %9
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(r5),
                        "8"(r6),
                        "9"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
                }
                for (; j < outw; j++)
                {
#if __aarch64__
                    asm volatile(
                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v16.4s}, [%1], #16         \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%2]        \n" // r0

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmul   v17.4s, v24.4s, v0.s[0]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmul   v18.4s, v25.4s, v0.s[1]     \n"
                        "fmul   v19.4s, v26.4s, v0.s[2]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v4.4h, v5.4h}, [%3]        \n" // r1

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[0]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v18.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v30.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v18.4s, v26.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%4]        \n" // r2

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v19.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v17.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v18.4s, v30.4s, v5.s[2]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v19.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v26.4s, v0.s[2]     \n"

                        "prfm   pldl1keep, [%5, #128]       \n"
                        "ld1    {v4.4h, v5.4h}, [%5]        \n" // r3

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"

                        "fmla   v18.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[0]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v30.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v18.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v19.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v16.4s, v26.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%6, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%6]        \n" // r4

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v17.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v18.4s, v28.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v19.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v16.4s, v30.4s, v5.s[2]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v17.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v18.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v19.4s, v26.4s, v0.s[2]     \n"

                        "prfm   pldl1keep, [%7, #128]       \n"
                        "ld1    {v4.4h, v5.4h}, [%7]        \n" // r5

                        "shll   v4.4s, v4.4h, #16           \n"
                        "shll   v5.4s, v5.4h, #16           \n"

                        "fmla   v16.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v17.4s, v28.4s, v1.s[0]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v18.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v19.4s, v30.4s, v1.s[2]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v16.4s, v24.4s, v4.s[0]     \n"
                        "fmla   v17.4s, v25.4s, v4.s[1]     \n"
                        "fmla   v18.4s, v26.4s, v4.s[2]     \n"

                        "prfm   pldl1keep, [%8, #128]       \n"
                        "ld1    {v0.4h, v1.4h}, [%8]        \n" // r6

                        "shll   v0.4s, v0.4h, #16           \n"
                        "shll   v1.4s, v1.4h, #16           \n"

                        "fmla   v19.4s, v27.4s, v4.s[3]     \n"
                        "fmla   v16.4s, v28.4s, v5.s[0]     \n"

                        "prfm   pldl1keep, [%9, #256]       \n"
                        "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%9], #32 \n"

                        "shll   v24.4s, v24.4h, #16         \n"
                        "shll   v25.4s, v25.4h, #16         \n"
                        "shll   v26.4s, v26.4h, #16         \n"
                        "shll   v27.4s, v27.4h, #16         \n"

                        "fmla   v17.4s, v29.4s, v5.s[1]     \n"
                        "fmla   v18.4s, v30.4s, v5.s[2]     \n"

                        "prfm   pldl1keep, [%9, #192]       \n"
                        "ld1    {v28.4h, v29.4h, v30.4h}, [%9], #24 \n"

                        "shll   v28.4s, v28.4h, #16         \n"
                        "shll   v29.4s, v29.4h, #16         \n"
                        "shll   v30.4s, v30.4h, #16         \n"

                        "fmla   v19.4s, v24.4s, v0.s[0]     \n"
                        "fmla   v16.4s, v25.4s, v0.s[1]     \n"
                        "fmla   v17.4s, v26.4s, v0.s[2]     \n"

                        "add    %2, %2, #4                  \n"
                        "add    %3, %3, #4                  \n"

                        "fmla   v18.4s, v27.4s, v0.s[3]     \n"
                        "fmla   v19.4s, v28.4s, v1.s[0]     \n"
                        "fmla   v16.4s, v29.4s, v1.s[1]     \n"
                        "fmla   v17.4s, v30.4s, v1.s[2]     \n"

                        "add    %4, %4, #4                  \n"
                        "add    %5, %5, #4                  \n"

                        "fadd   v18.4s, v18.4s, v19.4s      \n"

                        "add    %6, %6, #4                  \n"

                        "fadd   v16.4s, v16.4s, v17.4s      \n"

                        "add    %7, %7, #4                  \n"
                        "add    %8, %8, #4                  \n"

                        "fadd   v16.4s, v16.4s, v18.4s      \n"

                        "sub    %9, %9, #392                \n"

                        "shrn   v16.4h, v16.4s, #16         \n"

                        "st1    {v16.4h}, [%0], #8          \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(r5),           // %7
                        "=r"(r6),           // %8
                        "=r"(kptr)          // %9
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(r5),
                        "8"(r6),
                        "9"(kptr)
                        : "memory", "v0", "v1", "v4", "v5", "v16", "v17", "v18", "v19", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
#else  // __aarch64__
                    asm volatile(
                        "pld        [%1, #128]          \n"
                        "vld1.f32   {d8-d9}, [%1 :128]! \n"

                        "pld        [%2, #128]          \n"
                        "vld1.u16   {d2-d3}, [%2]       \n" // r0

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d20-d23}, [%9]!    \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmul.f32   q5, q8, d0[0]       \n"
                        "vmul.f32   q6, q9, d0[1]       \n"

                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d26-d28}, [%9]!    \n"

                        "vshll.u16  q12, d26, #16       \n"
                        "vshll.u16  q13, d27, #16       \n"
                        "vshll.u16  q14, d28, #16       \n"

                        "vmul.f32   q7, q10, d1[0]      \n"
                        "vmla.f32   q4, q11, d1[1]      \n"

                        "pld        [%3, #128]          \n"
                        "vld1.u16   {d6-d7}, [%3]       \n" // r1

                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q5, q12, d2[0]      \n"

                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d20-d23}, [%9]!    \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q6, q13, d2[1]      \n"
                        "vmla.f32   q7, q14, d3[0]      \n"

                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d26-d28}, [%9]!    \n"

                        "vshll.u16  q12, d26, #16       \n"
                        "vshll.u16  q13, d27, #16       \n"
                        "vshll.u16  q14, d28, #16       \n"

                        "vmla.f32   q4, q8, d4[0]       \n"
                        "vmla.f32   q5, q9, d4[1]       \n"
                        "vmla.f32   q6, q10, d5[0]      \n"

                        "pld        [%4, #128]          \n"
                        "vld1.u16   {d2-d3}, [%4]       \n" // r2

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q7, q11, d5[1]      \n"
                        "vmla.f32   q4, q12, d6[0]      \n"

                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d20-d23}, [%9]!    \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q5, q13, d6[1]      \n"
                        "vmla.f32   q6, q14, d7[0]      \n"

                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d26-d28}, [%9]!    \n"

                        "vshll.u16  q12, d26, #16       \n"
                        "vshll.u16  q13, d27, #16       \n"
                        "vshll.u16  q14, d28, #16       \n"

                        "vmla.f32   q7, q8, d0[0]       \n"
                        "vmla.f32   q4, q9, d0[1]       \n"
                        "vmla.f32   q5, q10, d1[0]      \n"

                        "pld        [%5, #128]          \n"
                        "vld1.u16   {d6-d7}, [%5]       \n" // r3

                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q6, q11, d1[1]      \n"
                        "vmla.f32   q7, q12, d2[0]      \n"

                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d20-d23}, [%9]!    \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q4, q13, d2[1]      \n"
                        "vmla.f32   q5, q14, d3[0]      \n"

                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d26-d28}, [%9]!    \n"

                        "vshll.u16  q12, d26, #16       \n"
                        "vshll.u16  q13, d27, #16       \n"
                        "vshll.u16  q14, d28, #16       \n"

                        "vmla.f32   q6, q8, d4[0]       \n"
                        "vmla.f32   q7, q9, d4[1]       \n"
                        "vmla.f32   q4, q10, d5[0]      \n"

                        "pld        [%6, #128]          \n"
                        "vld1.u16   {d2-d3}, [%6]       \n" // r4

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q5, q11, d5[1]      \n"
                        "vmla.f32   q6, q12, d6[0]      \n"

                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d20-d23}, [%9]!    \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q7, q13, d6[1]      \n"
                        "vmla.f32   q4, q14, d7[0]      \n"

                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d26-d28}, [%9]!    \n"

                        "vshll.u16  q12, d26, #16       \n"
                        "vshll.u16  q13, d27, #16       \n"
                        "vshll.u16  q14, d28, #16       \n"

                        "vmla.f32   q5, q8, d0[0]       \n"
                        "vmla.f32   q6, q9, d0[1]       \n"
                        "vmla.f32   q7, q10, d1[0]      \n"

                        "pld        [%7, #128]          \n"
                        "vld1.u16   {d6-d7}, [%7]       \n" // r5

                        "vshll.u16  q2, d6, #16         \n"
                        "vshll.u16  q3, d7, #16         \n"

                        "vmla.f32   q4, q11, d1[1]      \n"
                        "vmla.f32   q5, q12, d2[0]      \n"

                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d20-d23}, [%9]!    \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q6, q13, d2[1]      \n"
                        "vmla.f32   q7, q14, d3[0]      \n"

                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d26-d28}, [%9]!    \n"

                        "vshll.u16  q12, d26, #16       \n"
                        "vshll.u16  q13, d27, #16       \n"
                        "vshll.u16  q14, d28, #16       \n"

                        "vmla.f32   q4, q8, d4[0]       \n"
                        "vmla.f32   q5, q9, d4[1]       \n"
                        "vmla.f32   q6, q10, d5[0]      \n"

                        "pld        [%8, #128]          \n"
                        "vld1.u16   {d2-d3}, [%8]       \n" // r6

                        "vshll.u16  q0, d2, #16         \n"
                        "vshll.u16  q1, d3, #16         \n"

                        "vmla.f32   q7, q11, d5[1]      \n"
                        "vmla.f32   q4, q12, d6[0]      \n"

                        "pld        [%9, #256]          \n"
                        "vld1.u16   {d20-d23}, [%9]!    \n"

                        "vshll.u16  q8, d20, #16        \n"
                        "vshll.u16  q9, d21, #16        \n"
                        "vshll.u16  q10, d22, #16       \n"
                        "vshll.u16  q11, d23, #16       \n"

                        "vmla.f32   q5, q13, d6[1]      \n"
                        "vmla.f32   q6, q14, d7[0]      \n"

                        "pld        [%9, #192]          \n"
                        "vld1.u16   {d26-d28}, [%9]!    \n"

                        "vshll.u16  q12, d26, #16       \n"
                        "vshll.u16  q13, d27, #16       \n"
                        "vshll.u16  q14, d28, #16       \n"

                        "vmla.f32   q7, q8, d0[0]       \n"
                        "vmla.f32   q4, q9, d0[1]       \n"

                        "add        %2, %2, #4          \n"
                        "add        %3, %3, #4          \n"

                        "vmla.f32   q5, q10, d1[0]      \n"
                        "vmla.f32   q6, q11, d1[1]      \n"

                        "sub        %9, %9, #392        \n"

                        "vmla.f32   q7, q12, d2[0]      \n"
                        "vmla.f32   q4, q13, d2[1]      \n"
                        "vmla.f32   q5, q14, d3[0]      \n"

                        "add        %4, %4, #4          \n"
                        "add        %5, %5, #4          \n"

                        "vadd.f32   q6, q6, q7          \n"

                        "add        %6, %6, #4          \n"

                        "vadd.f32   q4, q4, q5          \n"

                        "add        %7, %7, #4          \n"

                        "vadd.f32   q4, q4, q6          \n"

                        "add        %8, %8, #4          \n"

                        "vshrn.u32  d8, q4, #16         \n"

                        "vst1.u16   {d8}, [%0 :64]!     \n"

                        : "=r"(outptr0_bf16), // %0
                        "=r"(outptr0),      // %1
                        "=r"(r0),           // %2
                        "=r"(r1),           // %3
                        "=r"(r2),           // %4
                        "=r"(r3),           // %5
                        "=r"(r4),           // %6
                        "=r"(r5),           // %7
                        "=r"(r6),           // %8
                        "=r"(kptr)          // %9
                        : "0"(outptr0_bf16),
                        "1"(outptr0),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "5"(r3),
                        "6"(r4),
                        "7"(r5),
                        "8"(r6),
                        "9"(kptr)
                        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14");
#endif // __aarch64__
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
                r4 += tailstep;
                r5 += tailstep;
                r6 += tailstep;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_7x7_pack1to8_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv7x7s2_pack1to8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int tailstep = w - 2 * outw + w;

    const __fp16* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        float16x8_t _bias0 = bias ? vld1q_f16(bias + p * 8) : vdupq_n_f16(0.f);
        out0.fill(_bias0);

        for (int q = 0; q < inch; q++)
        {
            __fp16* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            const __fp16* r0 = img0.row<const __fp16>(0);
            const __fp16* r1 = img0.row<const __fp16>(1);
            const __fp16* r2 = img0.row<const __fp16>(2);
            const __fp16* r3 = img0.row<const __fp16>(3);
            const __fp16* r4 = img0.row<const __fp16>(4);
            const __fp16* r5 = img0.row<const __fp16>(5);
            const __fp16* r6 = img0.row<const __fp16>(6);

            const __fp16* kptr = kernel.channel(p).row<const __fp16>(q);

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 7 < outw; j += 8)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n" // sum0

                        "prfm   pldl1keep, [%1, #384]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h}, [%1] \n" // r0

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0

                        "fmla   v24.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v25.8h, v16.8h, v0.h[2]     \n"
                        "fmla   v26.8h, v16.8h, v0.h[4]     \n"
                        "fmla   v27.8h, v16.8h, v0.h[6]     \n"
                        "fmla   v28.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v1.h[2]     \n"
                        "fmla   v30.8h, v16.8h, v1.h[4]     \n"
                        "fmla   v31.8h, v16.8h, v1.h[6]     \n"

                        "sub    %0, %0, #64                 \n"

                        "fmla   v24.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v25.8h, v17.8h, v0.h[3]     \n"
                        "fmla   v26.8h, v17.8h, v0.h[5]     \n"
                        "fmla   v27.8h, v17.8h, v0.h[7]     \n"
                        "fmla   v28.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[3]     \n"
                        "fmla   v30.8h, v17.8h, v1.h[5]     \n"
                        "fmla   v31.8h, v17.8h, v1.h[7]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v24.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v25.8h, v18.8h, v0.h[4]     \n"
                        "fmla   v26.8h, v18.8h, v0.h[6]     \n"
                        "fmla   v27.8h, v18.8h, v1.h[0]     \n"
                        "fmla   v28.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v1.h[4]     \n"
                        "fmla   v30.8h, v18.8h, v1.h[6]     \n"
                        "fmla   v31.8h, v18.8h, v2.h[0]     \n"

                        "fmla   v24.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v25.8h, v19.8h, v0.h[5]     \n"
                        "fmla   v26.8h, v19.8h, v0.h[7]     \n"
                        "fmla   v27.8h, v19.8h, v1.h[1]     \n"
                        "fmla   v28.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v1.h[5]     \n"
                        "fmla   v30.8h, v19.8h, v1.h[7]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[1]     \n"

                        "prfm   pldl1keep, [%2, #384]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h}, [%2] \n" // r1

                        "fmla   v24.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v25.8h, v20.8h, v0.h[6]     \n"
                        "fmla   v26.8h, v20.8h, v1.h[0]     \n"
                        "fmla   v27.8h, v20.8h, v1.h[2]     \n"
                        "fmla   v28.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v1.h[6]     \n"
                        "fmla   v30.8h, v20.8h, v2.h[0]     \n"
                        "fmla   v31.8h, v20.8h, v2.h[2]     \n"

                        "fmla   v24.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v25.8h, v21.8h, v0.h[7]     \n"
                        "fmla   v26.8h, v21.8h, v1.h[1]     \n"
                        "fmla   v27.8h, v21.8h, v1.h[3]     \n"
                        "fmla   v28.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[7]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v21.8h, v2.h[3]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "fmla   v24.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v25.8h, v22.8h, v1.h[0]     \n"
                        "fmla   v26.8h, v22.8h, v1.h[2]     \n"
                        "fmla   v27.8h, v22.8h, v1.h[4]     \n"
                        "fmla   v28.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v22.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v22.8h, v2.h[4]     \n"

                        "fmla   v24.8h, v23.8h, v4.h[0]     \n"
                        "fmla   v25.8h, v23.8h, v4.h[2]     \n"
                        "fmla   v26.8h, v23.8h, v4.h[4]     \n"
                        "fmla   v27.8h, v23.8h, v4.h[6]     \n"
                        "fmla   v28.8h, v23.8h, v5.h[0]     \n"
                        "fmla   v29.8h, v23.8h, v5.h[2]     \n"
                        "fmla   v30.8h, v23.8h, v5.h[4]     \n"
                        "fmla   v31.8h, v23.8h, v5.h[6]     \n"

                        "fmla   v24.8h, v16.8h, v4.h[1]     \n"
                        "fmla   v25.8h, v16.8h, v4.h[3]     \n"
                        "fmla   v26.8h, v16.8h, v4.h[5]     \n"
                        "fmla   v27.8h, v16.8h, v4.h[7]     \n"
                        "fmla   v28.8h, v16.8h, v5.h[1]     \n"
                        "fmla   v29.8h, v16.8h, v5.h[3]     \n"
                        "fmla   v30.8h, v16.8h, v5.h[5]     \n"
                        "fmla   v31.8h, v16.8h, v5.h[7]     \n"

                        "fmla   v24.8h, v17.8h, v4.h[2]     \n"
                        "fmla   v25.8h, v17.8h, v4.h[4]     \n"
                        "fmla   v26.8h, v17.8h, v4.h[6]     \n"
                        "fmla   v27.8h, v17.8h, v5.h[0]     \n"
                        "fmla   v28.8h, v17.8h, v5.h[2]     \n"
                        "fmla   v29.8h, v17.8h, v5.h[4]     \n"
                        "fmla   v30.8h, v17.8h, v5.h[6]     \n"
                        "fmla   v31.8h, v17.8h, v6.h[0]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v24.8h, v18.8h, v4.h[3]     \n"
                        "fmla   v25.8h, v18.8h, v4.h[5]     \n"
                        "fmla   v26.8h, v18.8h, v4.h[7]     \n"
                        "fmla   v27.8h, v18.8h, v5.h[1]     \n"
                        "fmla   v28.8h, v18.8h, v5.h[3]     \n"
                        "fmla   v29.8h, v18.8h, v5.h[5]     \n"
                        "fmla   v30.8h, v18.8h, v5.h[7]     \n"
                        "fmla   v31.8h, v18.8h, v6.h[1]     \n"

                        "fmla   v24.8h, v19.8h, v4.h[4]     \n"
                        "fmla   v25.8h, v19.8h, v4.h[6]     \n"
                        "fmla   v26.8h, v19.8h, v5.h[0]     \n"
                        "fmla   v27.8h, v19.8h, v5.h[2]     \n"
                        "fmla   v28.8h, v19.8h, v5.h[4]     \n"
                        "fmla   v29.8h, v19.8h, v5.h[6]     \n"
                        "fmla   v30.8h, v19.8h, v6.h[0]     \n"
                        "fmla   v31.8h, v19.8h, v6.h[2]     \n"

                        "prfm   pldl1keep, [%3, #384]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h}, [%3] \n" // r2

                        "fmla   v24.8h, v20.8h, v4.h[5]     \n"
                        "fmla   v25.8h, v20.8h, v4.h[7]     \n"
                        "fmla   v26.8h, v20.8h, v5.h[1]     \n"
                        "fmla   v27.8h, v20.8h, v5.h[3]     \n"
                        "fmla   v28.8h, v20.8h, v5.h[5]     \n"
                        "fmla   v29.8h, v20.8h, v5.h[7]     \n"
                        "fmla   v30.8h, v20.8h, v6.h[1]     \n"
                        "fmla   v31.8h, v20.8h, v6.h[3]     \n"

                        "fmla   v24.8h, v21.8h, v4.h[6]     \n"
                        "fmla   v25.8h, v21.8h, v5.h[0]     \n"
                        "fmla   v26.8h, v21.8h, v5.h[2]     \n"
                        "fmla   v27.8h, v21.8h, v5.h[4]     \n"
                        "fmla   v28.8h, v21.8h, v5.h[6]     \n"
                        "fmla   v29.8h, v21.8h, v6.h[0]     \n"
                        "fmla   v30.8h, v21.8h, v6.h[2]     \n"
                        "fmla   v31.8h, v21.8h, v6.h[4]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "fmla   v24.8h, v22.8h, v0.h[0]     \n"
                        "fmla   v25.8h, v22.8h, v0.h[2]     \n"
                        "fmla   v26.8h, v22.8h, v0.h[4]     \n"
                        "fmla   v27.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v28.8h, v22.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v22.8h, v1.h[2]     \n"
                        "fmla   v30.8h, v22.8h, v1.h[4]     \n"
                        "fmla   v31.8h, v22.8h, v1.h[6]     \n"

                        "fmla   v24.8h, v23.8h, v0.h[1]     \n"
                        "fmla   v25.8h, v23.8h, v0.h[3]     \n"
                        "fmla   v26.8h, v23.8h, v0.h[5]     \n"
                        "fmla   v27.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v28.8h, v23.8h, v1.h[1]     \n"
                        "fmla   v29.8h, v23.8h, v1.h[3]     \n"
                        "fmla   v30.8h, v23.8h, v1.h[5]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[7]     \n"

                        "fmla   v24.8h, v16.8h, v0.h[2]     \n"
                        "fmla   v25.8h, v16.8h, v0.h[4]     \n"
                        "fmla   v26.8h, v16.8h, v0.h[6]     \n"
                        "fmla   v27.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v28.8h, v16.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v16.8h, v1.h[4]     \n"
                        "fmla   v30.8h, v16.8h, v1.h[6]     \n"
                        "fmla   v31.8h, v16.8h, v2.h[0]     \n"

                        "fmla   v24.8h, v17.8h, v0.h[3]     \n"
                        "fmla   v25.8h, v17.8h, v0.h[5]     \n"
                        "fmla   v26.8h, v17.8h, v0.h[7]     \n"
                        "fmla   v27.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v28.8h, v17.8h, v1.h[3]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[5]     \n"
                        "fmla   v30.8h, v17.8h, v1.h[7]     \n"
                        "fmla   v31.8h, v17.8h, v2.h[1]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v24.8h, v18.8h, v0.h[4]     \n"
                        "fmla   v25.8h, v18.8h, v0.h[6]     \n"
                        "fmla   v26.8h, v18.8h, v1.h[0]     \n"
                        "fmla   v27.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v28.8h, v18.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v18.8h, v1.h[6]     \n"
                        "fmla   v30.8h, v18.8h, v2.h[0]     \n"
                        "fmla   v31.8h, v18.8h, v2.h[2]     \n"

                        "prfm   pldl1keep, [%4, #384]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h}, [%4] \n" // r3

                        "fmla   v24.8h, v19.8h, v0.h[5]     \n"
                        "fmla   v25.8h, v19.8h, v0.h[7]     \n"
                        "fmla   v26.8h, v19.8h, v1.h[1]     \n"
                        "fmla   v27.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v28.8h, v19.8h, v1.h[5]     \n"
                        "fmla   v29.8h, v19.8h, v1.h[7]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[3]     \n"

                        "fmla   v24.8h, v20.8h, v0.h[6]     \n"
                        "fmla   v25.8h, v20.8h, v1.h[0]     \n"
                        "fmla   v26.8h, v20.8h, v1.h[2]     \n"
                        "fmla   v27.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v28.8h, v20.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v20.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v20.8h, v2.h[4]     \n"

                        "fmla   v24.8h, v21.8h, v4.h[0]     \n"
                        "fmla   v25.8h, v21.8h, v4.h[2]     \n"
                        "fmla   v26.8h, v21.8h, v4.h[4]     \n"
                        "fmla   v27.8h, v21.8h, v4.h[6]     \n"
                        "fmla   v28.8h, v21.8h, v5.h[0]     \n"
                        "fmla   v29.8h, v21.8h, v5.h[2]     \n"
                        "fmla   v30.8h, v21.8h, v5.h[4]     \n"
                        "fmla   v31.8h, v21.8h, v5.h[6]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "fmla   v24.8h, v22.8h, v4.h[1]     \n"
                        "fmla   v25.8h, v22.8h, v4.h[3]     \n"
                        "fmla   v26.8h, v22.8h, v4.h[5]     \n"
                        "fmla   v27.8h, v22.8h, v4.h[7]     \n"
                        "fmla   v28.8h, v22.8h, v5.h[1]     \n"
                        "fmla   v29.8h, v22.8h, v5.h[3]     \n"
                        "fmla   v30.8h, v22.8h, v5.h[5]     \n"
                        "fmla   v31.8h, v22.8h, v5.h[7]     \n"

                        "fmla   v24.8h, v23.8h, v4.h[2]     \n"
                        "fmla   v25.8h, v23.8h, v4.h[4]     \n"
                        "fmla   v26.8h, v23.8h, v4.h[6]     \n"
                        "fmla   v27.8h, v23.8h, v5.h[0]     \n"
                        "fmla   v28.8h, v23.8h, v5.h[2]     \n"
                        "fmla   v29.8h, v23.8h, v5.h[4]     \n"
                        "fmla   v30.8h, v23.8h, v5.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v6.h[0]     \n"

                        "fmla   v24.8h, v16.8h, v4.h[3]     \n"
                        "fmla   v25.8h, v16.8h, v4.h[5]     \n"
                        "fmla   v26.8h, v16.8h, v4.h[7]     \n"
                        "fmla   v27.8h, v16.8h, v5.h[1]     \n"
                        "fmla   v28.8h, v16.8h, v5.h[3]     \n"
                        "fmla   v29.8h, v16.8h, v5.h[5]     \n"
                        "fmla   v30.8h, v16.8h, v5.h[7]     \n"
                        "fmla   v31.8h, v16.8h, v6.h[1]     \n"

                        "fmla   v24.8h, v17.8h, v4.h[4]     \n"
                        "fmla   v25.8h, v17.8h, v4.h[6]     \n"
                        "fmla   v26.8h, v17.8h, v5.h[0]     \n"
                        "fmla   v27.8h, v17.8h, v5.h[2]     \n"
                        "fmla   v28.8h, v17.8h, v5.h[4]     \n"
                        "fmla   v29.8h, v17.8h, v5.h[6]     \n"
                        "fmla   v30.8h, v17.8h, v6.h[0]     \n"
                        "fmla   v31.8h, v17.8h, v6.h[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v24.8h, v18.8h, v4.h[5]     \n"
                        "fmla   v25.8h, v18.8h, v4.h[7]     \n"
                        "fmla   v26.8h, v18.8h, v5.h[1]     \n"
                        "fmla   v27.8h, v18.8h, v5.h[3]     \n"
                        "fmla   v28.8h, v18.8h, v5.h[5]     \n"
                        "fmla   v29.8h, v18.8h, v5.h[7]     \n"
                        "fmla   v30.8h, v18.8h, v6.h[1]     \n"
                        "fmla   v31.8h, v18.8h, v6.h[3]     \n"

                        "prfm   pldl1keep, [%5, #384]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h}, [%5] \n" // r4

                        "fmla   v24.8h, v19.8h, v4.h[6]     \n"
                        "fmla   v25.8h, v19.8h, v5.h[0]     \n"
                        "fmla   v26.8h, v19.8h, v5.h[2]     \n"
                        "fmla   v27.8h, v19.8h, v5.h[4]     \n"
                        "fmla   v28.8h, v19.8h, v5.h[6]     \n"
                        "fmla   v29.8h, v19.8h, v6.h[0]     \n"
                        "fmla   v30.8h, v19.8h, v6.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v6.h[4]     \n"

                        "fmla   v24.8h, v20.8h, v0.h[0]     \n"
                        "fmla   v25.8h, v20.8h, v0.h[2]     \n"
                        "fmla   v26.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v27.8h, v20.8h, v0.h[6]     \n"
                        "fmla   v28.8h, v20.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v20.8h, v1.h[2]     \n"
                        "fmla   v30.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v1.h[6]     \n"

                        "fmla   v24.8h, v21.8h, v0.h[1]     \n"
                        "fmla   v25.8h, v21.8h, v0.h[3]     \n"
                        "fmla   v26.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v27.8h, v21.8h, v0.h[7]     \n"
                        "fmla   v28.8h, v21.8h, v1.h[1]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[3]     \n"
                        "fmla   v30.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v1.h[7]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "fmla   v24.8h, v22.8h, v0.h[2]     \n"
                        "fmla   v25.8h, v22.8h, v0.h[4]     \n"
                        "fmla   v26.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v27.8h, v22.8h, v1.h[0]     \n"
                        "fmla   v28.8h, v22.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v22.8h, v1.h[4]     \n"
                        "fmla   v30.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v2.h[0]     \n"

                        "fmla   v24.8h, v23.8h, v0.h[3]     \n"
                        "fmla   v25.8h, v23.8h, v0.h[5]     \n"
                        "fmla   v26.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v27.8h, v23.8h, v1.h[1]     \n"
                        "fmla   v28.8h, v23.8h, v1.h[3]     \n"
                        "fmla   v29.8h, v23.8h, v1.h[5]     \n"
                        "fmla   v30.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[1]     \n"

                        "prfm   pldl1keep, [%6, #384]       \n"
                        "ld1    {v4.8h, v5.8h, v6.8h}, [%6] \n" // r5

                        "fmla   v24.8h, v16.8h, v0.h[4]     \n"
                        "fmla   v25.8h, v16.8h, v0.h[6]     \n"
                        "fmla   v26.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v27.8h, v16.8h, v1.h[2]     \n"
                        "fmla   v28.8h, v16.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v16.8h, v1.h[6]     \n"
                        "fmla   v30.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v2.h[2]     \n"

                        "fmla   v24.8h, v17.8h, v0.h[5]     \n"
                        "fmla   v25.8h, v17.8h, v0.h[7]     \n"
                        "fmla   v26.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v27.8h, v17.8h, v1.h[3]     \n"
                        "fmla   v28.8h, v17.8h, v1.h[5]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[7]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v2.h[3]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v24.8h, v18.8h, v0.h[6]     \n"
                        "fmla   v25.8h, v18.8h, v1.h[0]     \n"
                        "fmla   v26.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v27.8h, v18.8h, v1.h[4]     \n"
                        "fmla   v28.8h, v18.8h, v1.h[6]     \n"
                        "fmla   v30.8h, v18.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[0]     \n"
                        "fmla   v31.8h, v18.8h, v2.h[4]     \n"

                        "fmla   v24.8h, v19.8h, v4.h[0]     \n"
                        "fmla   v25.8h, v19.8h, v4.h[2]     \n"
                        "fmla   v26.8h, v19.8h, v4.h[4]     \n"
                        "fmla   v27.8h, v19.8h, v4.h[6]     \n"
                        "fmla   v28.8h, v19.8h, v5.h[0]     \n"
                        "fmla   v29.8h, v19.8h, v5.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v5.h[4]     \n"
                        "fmla   v31.8h, v19.8h, v5.h[6]     \n"

                        "fmla   v24.8h, v20.8h, v4.h[1]     \n"
                        "fmla   v25.8h, v20.8h, v4.h[3]     \n"
                        "fmla   v26.8h, v20.8h, v4.h[5]     \n"
                        "fmla   v27.8h, v20.8h, v4.h[7]     \n"
                        "fmla   v28.8h, v20.8h, v5.h[1]     \n"
                        "fmla   v29.8h, v20.8h, v5.h[3]     \n"
                        "fmla   v30.8h, v20.8h, v5.h[5]     \n"
                        "fmla   v31.8h, v20.8h, v5.h[7]     \n"

                        "fmla   v24.8h, v21.8h, v4.h[2]     \n"
                        "fmla   v25.8h, v21.8h, v4.h[4]     \n"
                        "fmla   v26.8h, v21.8h, v4.h[6]     \n"
                        "fmla   v27.8h, v21.8h, v5.h[0]     \n"
                        "fmla   v28.8h, v21.8h, v5.h[2]     \n"
                        "fmla   v29.8h, v21.8h, v5.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v5.h[6]     \n"
                        "fmla   v31.8h, v21.8h, v6.h[0]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "fmla   v24.8h, v22.8h, v4.h[3]     \n"
                        "fmla   v25.8h, v22.8h, v4.h[5]     \n"
                        "fmla   v26.8h, v22.8h, v4.h[7]     \n"
                        "fmla   v27.8h, v22.8h, v5.h[1]     \n"
                        "fmla   v28.8h, v22.8h, v5.h[3]     \n"
                        "fmla   v29.8h, v22.8h, v5.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v5.h[7]     \n"
                        "fmla   v31.8h, v22.8h, v6.h[1]     \n"

                        "fmla   v24.8h, v23.8h, v4.h[4]     \n"
                        "fmla   v25.8h, v23.8h, v4.h[6]     \n"
                        "fmla   v26.8h, v23.8h, v5.h[0]     \n"
                        "fmla   v27.8h, v23.8h, v5.h[2]     \n"
                        "fmla   v28.8h, v23.8h, v5.h[4]     \n"
                        "fmla   v29.8h, v23.8h, v5.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v6.h[0]     \n"
                        "fmla   v31.8h, v23.8h, v6.h[2]     \n"

                        "prfm   pldl1keep, [%7, #384]       \n"
                        "ld1    {v0.8h, v1.8h, v2.8h}, [%7] \n" // r6

                        "fmla   v24.8h, v16.8h, v4.h[5]     \n"
                        "fmla   v25.8h, v16.8h, v4.h[7]     \n"
                        "fmla   v26.8h, v16.8h, v5.h[1]     \n"
                        "fmla   v27.8h, v16.8h, v5.h[3]     \n"
                        "fmla   v28.8h, v16.8h, v5.h[5]     \n"
                        "fmla   v29.8h, v16.8h, v5.h[7]     \n"
                        "fmla   v30.8h, v16.8h, v6.h[1]     \n"
                        "fmla   v31.8h, v16.8h, v6.h[3]     \n"

                        "fmla   v24.8h, v17.8h, v4.h[6]     \n"
                        "fmla   v25.8h, v17.8h, v5.h[0]     \n"
                        "fmla   v26.8h, v17.8h, v5.h[2]     \n"
                        "fmla   v27.8h, v17.8h, v5.h[4]     \n"
                        "fmla   v28.8h, v17.8h, v5.h[6]     \n"
                        "fmla   v29.8h, v17.8h, v6.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v6.h[2]     \n"
                        "fmla   v31.8h, v17.8h, v6.h[4]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v24.8h, v18.8h, v0.h[0]     \n"
                        "fmla   v25.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v26.8h, v18.8h, v0.h[4]     \n"
                        "fmla   v27.8h, v18.8h, v0.h[6]     \n"
                        "fmla   v28.8h, v18.8h, v1.h[0]     \n"
                        "fmla   v29.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v1.h[4]     \n"
                        "fmla   v31.8h, v18.8h, v1.h[6]     \n"

                        "fmla   v24.8h, v19.8h, v0.h[1]     \n"
                        "fmla   v25.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v26.8h, v19.8h, v0.h[5]     \n"
                        "fmla   v27.8h, v19.8h, v0.h[7]     \n"
                        "fmla   v28.8h, v19.8h, v1.h[1]     \n"
                        "fmla   v29.8h, v19.8h, v1.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v1.h[5]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[7]     \n"

                        "fmla   v24.8h, v20.8h, v0.h[2]     \n"
                        "fmla   v25.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v26.8h, v20.8h, v0.h[6]     \n"
                        "fmla   v27.8h, v20.8h, v1.h[0]     \n"
                        "fmla   v28.8h, v20.8h, v1.h[2]     \n"
                        "fmla   v29.8h, v20.8h, v1.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v1.h[6]     \n"
                        "fmla   v31.8h, v20.8h, v2.h[0]     \n"

                        "add    %1, %1, #32                 \n"

                        "fmla   v24.8h, v21.8h, v0.h[3]     \n"
                        "fmla   v25.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v26.8h, v21.8h, v0.h[7]     \n"
                        "fmla   v27.8h, v21.8h, v1.h[1]     \n"

                        "add    %2, %2, #32                 \n"

                        "fmla   v28.8h, v21.8h, v1.h[3]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v1.h[7]     \n"
                        "fmla   v31.8h, v21.8h, v2.h[1]     \n"

                        "prfm   pldl1keep, [%8, #128]       \n"
                        "ld1    {v16.8h}, [%8]              \n"

                        "fmla   v24.8h, v22.8h, v0.h[4]     \n"
                        "fmla   v25.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v26.8h, v22.8h, v1.h[0]     \n"
                        "fmla   v27.8h, v22.8h, v1.h[2]     \n"

                        "add    %3, %3, #32                 \n"

                        "fmla   v28.8h, v22.8h, v1.h[4]     \n"
                        "fmla   v29.8h, v22.8h, v1.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v2.h[0]     \n"
                        "fmla   v31.8h, v22.8h, v2.h[2]     \n"

                        "add    %4, %4, #32                 \n"

                        "fmla   v24.8h, v23.8h, v0.h[5]     \n"
                        "fmla   v25.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v26.8h, v23.8h, v1.h[1]     \n"
                        "fmla   v27.8h, v23.8h, v1.h[3]     \n"

                        "add    %5, %5, #32                 \n"

                        "fmla   v28.8h, v23.8h, v1.h[5]     \n"
                        "fmla   v29.8h, v23.8h, v1.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[1]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[3]     \n"

                        "add    %6, %6, #32                 \n"

                        "fmla   v24.8h, v16.8h, v0.h[6]     \n"
                        "fmla   v25.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v26.8h, v16.8h, v1.h[2]     \n"
                        "fmla   v27.8h, v16.8h, v1.h[4]     \n"

                        "add    %7, %7, #32                 \n"

                        "fmla   v28.8h, v16.8h, v1.h[6]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v2.h[2]     \n"
                        "fmla   v31.8h, v16.8h, v2.h[4]     \n"

                        "sub    %8, %8, #768                \n" // kptr -= 48 * 8;

                        "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                        "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        : "memory", "v0", "v1", "v2", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
                }
                for (; j + 3 < outw; j += 4)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%0, #512]       \n"
                        "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0] \n" // sum0

                        "prfm   pldl1keep, [%1, #256]       \n"
                        "ld1    {v0.8h, v1.8h}, [%1]        \n" // r0

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "fmla   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v16.8h, v0.h[2]     \n"
                        "fmla   v30.8h, v16.8h, v0.h[4]     \n"
                        "fmla   v31.8h, v16.8h, v0.h[6]     \n"

                        "fmla   v28.8h, v17.8h, v0.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v0.h[3]     \n"
                        "fmla   v30.8h, v17.8h, v0.h[5]     \n"
                        "fmla   v31.8h, v17.8h, v0.h[7]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v28.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v18.8h, v0.h[4]     \n"
                        "fmla   v30.8h, v18.8h, v0.h[6]     \n"
                        "fmla   v31.8h, v18.8h, v1.h[0]     \n"

                        "fmla   v28.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v29.8h, v19.8h, v0.h[5]     \n"
                        "fmla   v30.8h, v19.8h, v0.h[7]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[1]     \n"

                        "prfm   pldl1keep, [%2, #256]       \n"
                        "ld1    {v2.8h, v3.8h}, [%2]        \n" // r1

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v20.8h, v0.h[6]     \n"
                        "fmla   v30.8h, v20.8h, v1.h[0]     \n"
                        "fmla   v31.8h, v20.8h, v1.h[2]     \n"

                        "fmla   v28.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v0.h[7]     \n"
                        "fmla   v30.8h, v21.8h, v1.h[1]     \n"
                        "fmla   v31.8h, v21.8h, v1.h[3]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "fmla   v28.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v22.8h, v1.h[0]     \n"
                        "fmla   v30.8h, v22.8h, v1.h[2]     \n"
                        "fmla   v31.8h, v22.8h, v1.h[4]     \n"

                        "fmla   v28.8h, v23.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v23.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[4]     \n"
                        "fmla   v31.8h, v23.8h, v2.h[6]     \n"

                        "fmla   v28.8h, v16.8h, v2.h[1]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[3]     \n"
                        "fmla   v30.8h, v16.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v16.8h, v2.h[7]     \n"

                        "fmla   v28.8h, v17.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v17.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[0]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v28.8h, v18.8h, v2.h[3]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[5]     \n"
                        "fmla   v30.8h, v18.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v18.8h, v3.h[1]     \n"

                        "fmla   v28.8h, v19.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v19.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v19.8h, v3.h[0]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[2]     \n"

                        "prfm   pldl1keep, [%3, #256]       \n"
                        "ld1    {v0.8h, v1.8h}, [%3]        \n" // r2

                        "fmla   v28.8h, v20.8h, v2.h[5]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[7]     \n"
                        "fmla   v30.8h, v20.8h, v3.h[1]     \n"
                        "fmla   v31.8h, v20.8h, v3.h[3]     \n"

                        "fmla   v28.8h, v21.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v21.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v21.8h, v3.h[2]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[4]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "fmla   v28.8h, v22.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v22.8h, v0.h[2]     \n"
                        "fmla   v30.8h, v22.8h, v0.h[4]     \n"
                        "fmla   v31.8h, v22.8h, v0.h[6]     \n"

                        "fmla   v28.8h, v23.8h, v0.h[1]     \n"
                        "fmla   v29.8h, v23.8h, v0.h[3]     \n"
                        "fmla   v30.8h, v23.8h, v0.h[5]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[7]     \n"

                        "fmla   v28.8h, v16.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v16.8h, v0.h[4]     \n"
                        "fmla   v30.8h, v16.8h, v0.h[6]     \n"
                        "fmla   v31.8h, v16.8h, v1.h[0]     \n"

                        "fmla   v28.8h, v17.8h, v0.h[3]     \n"
                        "fmla   v29.8h, v17.8h, v0.h[5]     \n"
                        "fmla   v30.8h, v17.8h, v0.h[7]     \n"
                        "fmla   v31.8h, v17.8h, v1.h[1]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v28.8h, v18.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v18.8h, v0.h[6]     \n"
                        "fmla   v30.8h, v18.8h, v1.h[0]     \n"
                        "fmla   v31.8h, v18.8h, v1.h[2]     \n"

                        "prfm   pldl1keep, [%4, #256]       \n"
                        "ld1    {v2.8h, v3.8h}, [%4]        \n" // r3

                        "fmla   v28.8h, v19.8h, v0.h[5]     \n"
                        "fmla   v29.8h, v19.8h, v0.h[7]     \n"
                        "fmla   v30.8h, v19.8h, v1.h[1]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[3]     \n"

                        "fmla   v28.8h, v20.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v20.8h, v1.h[0]     \n"
                        "fmla   v30.8h, v20.8h, v1.h[2]     \n"
                        "fmla   v31.8h, v20.8h, v1.h[4]     \n"

                        "fmla   v28.8h, v21.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[4]     \n"
                        "fmla   v31.8h, v21.8h, v2.h[6]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "fmla   v28.8h, v22.8h, v2.h[1]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[3]     \n"
                        "fmla   v30.8h, v22.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v22.8h, v2.h[7]     \n"

                        "fmla   v28.8h, v23.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v23.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v23.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[0]     \n"

                        "fmla   v28.8h, v16.8h, v2.h[3]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[5]     \n"
                        "fmla   v30.8h, v16.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v16.8h, v3.h[1]     \n"

                        "prfm   pldl1keep, [%5, #256]       \n"
                        "ld1    {v0.8h, v1.8h}, [%5]        \n" // r4

                        "fmla   v28.8h, v17.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v17.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v17.8h, v3.h[0]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v28.8h, v18.8h, v2.h[5]     \n"
                        "fmla   v29.8h, v18.8h, v2.h[7]     \n"
                        "fmla   v30.8h, v18.8h, v3.h[1]     \n"
                        "fmla   v31.8h, v18.8h, v3.h[3]     \n"

                        "fmla   v28.8h, v19.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v19.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v19.8h, v3.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v3.h[4]     \n"

                        "fmla   v28.8h, v20.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v20.8h, v0.h[2]     \n"
                        "fmla   v30.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v31.8h, v20.8h, v0.h[6]     \n"

                        "fmla   v28.8h, v21.8h, v0.h[1]     \n"
                        "fmla   v29.8h, v21.8h, v0.h[3]     \n"
                        "fmla   v30.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v31.8h, v21.8h, v0.h[7]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "fmla   v28.8h, v22.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v22.8h, v0.h[4]     \n"
                        "fmla   v30.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v31.8h, v22.8h, v1.h[0]     \n"

                        "fmla   v28.8h, v23.8h, v0.h[3]     \n"
                        "fmla   v29.8h, v23.8h, v0.h[5]     \n"
                        "fmla   v30.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[1]     \n"

                        "prfm   pldl1keep, [%6, #256]       \n"
                        "ld1    {v2.8h, v3.8h}, [%6]        \n" // r5

                        "fmla   v28.8h, v16.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v16.8h, v0.h[6]     \n"
                        "fmla   v30.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v31.8h, v16.8h, v1.h[2]     \n"

                        "fmla   v28.8h, v17.8h, v0.h[5]     \n"
                        "fmla   v29.8h, v17.8h, v0.h[7]     \n"
                        "fmla   v30.8h, v17.8h, v1.h[1]     \n"
                        "fmla   v31.8h, v17.8h, v1.h[3]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v28.8h, v18.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v18.8h, v1.h[0]     \n"
                        "fmla   v30.8h, v18.8h, v1.h[2]     \n"
                        "fmla   v31.8h, v18.8h, v1.h[4]     \n"

                        "fmla   v28.8h, v19.8h, v2.h[0]     \n"
                        "fmla   v29.8h, v19.8h, v2.h[2]     \n"
                        "fmla   v30.8h, v19.8h, v2.h[4]     \n"
                        "fmla   v31.8h, v19.8h, v2.h[6]     \n"

                        "fmla   v28.8h, v20.8h, v2.h[1]     \n"
                        "fmla   v29.8h, v20.8h, v2.h[3]     \n"
                        "fmla   v30.8h, v20.8h, v2.h[5]     \n"
                        "fmla   v31.8h, v20.8h, v2.h[7]     \n"

                        "fmla   v28.8h, v21.8h, v2.h[2]     \n"
                        "fmla   v29.8h, v21.8h, v2.h[4]     \n"
                        "fmla   v30.8h, v21.8h, v2.h[6]     \n"
                        "fmla   v31.8h, v21.8h, v3.h[0]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "fmla   v28.8h, v22.8h, v2.h[3]     \n"
                        "fmla   v29.8h, v22.8h, v2.h[5]     \n"
                        "fmla   v30.8h, v22.8h, v2.h[7]     \n"
                        "fmla   v31.8h, v22.8h, v3.h[1]     \n"

                        "add    %1, %1, #16                 \n"

                        "fmla   v28.8h, v23.8h, v2.h[4]     \n"
                        "fmla   v29.8h, v23.8h, v2.h[6]     \n"
                        "fmla   v30.8h, v23.8h, v3.h[0]     \n"
                        "fmla   v31.8h, v23.8h, v3.h[2]     \n"

                        "prfm   pldl1keep, [%7, #256]       \n"
                        "ld1    {v0.8h, v1.8h}, [%7]        \n" // r6

                        "fmla   v28.8h, v16.8h, v2.h[5]     \n"
                        "fmla   v29.8h, v16.8h, v2.h[7]     \n"
                        "fmla   v30.8h, v16.8h, v3.h[1]     \n"
                        "fmla   v31.8h, v16.8h, v3.h[3]     \n"

                        "add    %2, %2, #16                 \n"

                        "fmla   v28.8h, v17.8h, v2.h[6]     \n"
                        "fmla   v29.8h, v17.8h, v3.h[0]     \n"
                        "fmla   v30.8h, v17.8h, v3.h[2]     \n"
                        "fmla   v31.8h, v17.8h, v3.h[4]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v28.8h, v18.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v30.8h, v18.8h, v0.h[4]     \n"
                        "fmla   v31.8h, v18.8h, v0.h[6]     \n"

                        "add    %3, %3, #16                 \n"

                        "fmla   v28.8h, v19.8h, v0.h[1]     \n"
                        "fmla   v29.8h, v19.8h, v0.h[3]     \n"
                        "fmla   v30.8h, v19.8h, v0.h[5]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[7]     \n"

                        "add    %4, %4, #16                 \n"

                        "fmla   v28.8h, v20.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v30.8h, v20.8h, v0.h[6]     \n"
                        "fmla   v31.8h, v20.8h, v1.h[0]     \n"

                        "add    %5, %5, #16                 \n"

                        "fmla   v28.8h, v21.8h, v0.h[3]     \n"
                        "fmla   v29.8h, v21.8h, v0.h[5]     \n"
                        "fmla   v30.8h, v21.8h, v0.h[7]     \n"
                        "fmla   v31.8h, v21.8h, v1.h[1]     \n"

                        "prfm   pldl1keep, [%8, #128]       \n"
                        "ld1    {v16.8h}, [%8]              \n"

                        "fmla   v28.8h, v22.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v30.8h, v22.8h, v1.h[0]     \n"
                        "fmla   v31.8h, v22.8h, v1.h[2]     \n"

                        "add    %6, %6, #16                 \n"

                        "fmla   v28.8h, v23.8h, v0.h[5]     \n"
                        "fmla   v29.8h, v23.8h, v0.h[7]     \n"
                        "fmla   v30.8h, v23.8h, v1.h[1]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[3]     \n"

                        "add    %7, %7, #16                 \n"

                        "fmla   v28.8h, v16.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v16.8h, v1.h[0]     \n"
                        "fmla   v30.8h, v16.8h, v1.h[2]     \n"
                        "fmla   v31.8h, v16.8h, v1.h[4]     \n"

                        "sub    %8, %8, #768                \n" // kptr -= 48 * 8;

                        "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        : "memory", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31");
                }
                for (; j < outw; j++)
                {
                    asm volatile(
                        "prfm   pldl1keep, [%1, #128]       \n"
                        "ld1    {v0.8h}, [%1]               \n" // r0

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "prfm   pldl1keep, [%0, #128]       \n"
                        "ld1    {v31.8h}, [%0]              \n" // sum0

                        "fmul   v28.8h, v16.8h, v0.h[0]     \n"
                        "fmul   v29.8h, v17.8h, v0.h[1]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmul   v30.8h, v18.8h, v0.h[2]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[3]     \n"

                        "prfm   pldl1keep, [%2, #128]       \n"
                        "ld1    {v1.8h}, [%2]               \n" // r1

                        "fmla   v28.8h, v20.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v21.8h, v0.h[5]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "fmla   v30.8h, v22.8h, v0.h[6]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[0]     \n"

                        "fmla   v28.8h, v16.8h, v1.h[1]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v30.8h, v18.8h, v1.h[3]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[4]     \n"

                        "prfm   pldl1keep, [%3, #128]       \n"
                        "ld1    {v0.8h}, [%3]               \n" // r2

                        "fmla   v28.8h, v20.8h, v1.h[5]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[6]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "fmla   v30.8h, v22.8h, v0.h[0]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[1]     \n"

                        "prfm   pldl1keep, [%4, #128]       \n"
                        "ld1    {v1.8h}, [%4]               \n" // r3

                        "fmla   v28.8h, v16.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v17.8h, v0.h[3]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v30.8h, v18.8h, v0.h[4]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[5]     \n"

                        "add    %1, %1, #4                  \n"

                        "fmla   v28.8h, v20.8h, v0.h[6]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[0]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "fmla   v30.8h, v22.8h, v1.h[1]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[2]     \n"

                        "prfm   pldl1keep, [%5, #128]       \n"
                        "ld1    {v0.8h}, [%5]               \n" // r4

                        "fmla   v28.8h, v16.8h, v1.h[3]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[4]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v30.8h, v18.8h, v1.h[5]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[6]     \n"

                        "add    %2, %2, #4                  \n"

                        "fmla   v28.8h, v20.8h, v0.h[0]     \n"
                        "fmla   v29.8h, v21.8h, v0.h[1]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "fmla   v30.8h, v22.8h, v0.h[2]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[3]     \n"

                        "prfm   pldl1keep, [%6, #128]       \n"
                        "ld1    {v1.8h}, [%6]               \n" // r5

                        "fmla   v28.8h, v16.8h, v0.h[4]     \n"
                        "fmla   v29.8h, v17.8h, v0.h[5]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v30.8h, v18.8h, v0.h[6]     \n"
                        "fmla   v31.8h, v19.8h, v1.h[0]     \n"

                        "add    %3, %3, #4                  \n"

                        "fmla   v28.8h, v20.8h, v1.h[1]     \n"
                        "fmla   v29.8h, v21.8h, v1.h[2]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"

                        "fmla   v30.8h, v22.8h, v1.h[3]     \n"
                        "fmla   v31.8h, v23.8h, v1.h[4]     \n"

                        "prfm   pldl1keep, [%7, #128]       \n"
                        "ld1    {v0.8h}, [%7]               \n" // r6

                        "fmla   v28.8h, v16.8h, v1.h[5]     \n"
                        "fmla   v29.8h, v17.8h, v1.h[6]     \n"

                        "prfm   pldl1keep, [%8, #512]       \n"
                        "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"

                        "fmla   v30.8h, v18.8h, v0.h[0]     \n"
                        "fmla   v31.8h, v19.8h, v0.h[1]     \n"

                        "add    %4, %4, #4                  \n"

                        "fmla   v28.8h, v20.8h, v0.h[2]     \n"
                        "fmla   v29.8h, v21.8h, v0.h[3]     \n"

                        "prfm   pldl1keep, [%8, #128]       \n"
                        "ld1    {v16.8h}, [%8]              \n"

                        "fmla   v30.8h, v22.8h, v0.h[4]     \n"
                        "fmla   v31.8h, v23.8h, v0.h[5]     \n"

                        "add    %5, %5, #4                  \n"

                        "fmla   v28.8h, v16.8h, v0.h[6]     \n"

                        "add    %6, %6, #4                  \n"

                        "fadd   v29.8h, v29.8h, v30.8h      \n"
                        "fadd   v31.8h, v31.8h, v28.8h      \n"

                        "add    %7, %7, #4                  \n"

                        "fadd   v29.8h, v29.8h, v31.8h      \n"

                        "sub    %8, %8, #768                \n" // kptr -= 48 * 8;

                        "st1    {v29.8h}, [%0], #16         \n"

                        : "=r"(outptr0), // %0
                        "=r"(r0),      // %1
                        "=r"(r1),      // %2
                        "=r"(r2),      // %3
                        "=r"(r3),      // %4
                        "=r"(r4),      // %5
                        "=r"(r5),      // %6
                        "=r"(r6),      // %7
                        "=r"(kptr)     // %8
                        : "0"(outptr0),
                        "1"(r0),
                        "2"(r1),
                        "3"(r2),
                        "4"(r3),
                        "5"(r4),
                        "6"(r5),
                        "7"(r6),
                        "8"(kptr)
                        : "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31");
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
                r4 += tailstep;
                r5 += tailstep;
                r6 += tailstep;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "convolution_arm.h"

#include "benchmark.h"
#include "cpu.h"
#include "layer_type.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

namespace ncnn {

#if NCNN_GNU_INLINE_ASM
#include "convolution_1x1.h"
#include "convolution_2x2.h"
#include "convolution_3x3.h"
#include "convolution_4x4.h"
#include "convolution_5x5.h"
#include "convolution_7x7.h"
#endif // NCNN_GNU_INLINE_ASM

#include "convolution_packed.h"
#include "convolution_3x3_winograd.h"
#include "convolution_im2col_gemm.h"

#if NCNN_BF16
#include "convolution_packed_bf16s.h"
#include "convolution_3x3_winograd_bf16s.h"
#include "convolution_im2col_gemm_bf16s_fp16s.h"
#include "convolution_im2col_gemm_bf16s.h"
#endif // NCNN_BF16

#if NCNN_INT8
#include "convolution_packed_int8.h"
#include "convolution_im2col_gemm_int8.h"
#include "convolution_3x3_winograd_int8.h"

// #include "convolution_3x3_int8.h"
#endif // NCNN_INT8

#if __ARM_NEON
#if NCNN_GNU_INLINE_ASM
#include "convolution_3x3_pack1to4.h"
#include "convolution_3x3_pack4.h"
#include "convolution_3x3_pack4to1.h"
#include "convolution_5x5_pack4.h"
#include "convolution_7x7_pack1to4.h"

#if NCNN_BF16
#include "convolution_3x3_pack1to4_bf16s.h"
#include "convolution_3x3_pack4_bf16s.h"
#include "convolution_5x5_pack4_bf16s.h"
#include "convolution_7x7_pack1to4_bf16s.h"
#endif // NCNN_BF16
#endif // NCNN_GNU_INLINE_ASM
#endif // __ARM_NEON

// Default constructor: clears the helper members owned by this layer and
// advertises the storage/layout capabilities enabled at build time.
Convolution_arm::Convolution_arm()
{
    // helper state starts out empty; create_pipeline() assigns activation and nT
    nT = 0;
    activation = 0;
    convolution_dilation1 = 0;

#if NCNN_BF16
    // bf16 storage is advertised whenever the build enables it
    support_bf16_storage = true;
#endif // NCNN_BF16

#if __ARM_NEON
    // packed layouts are only advertised for NEON builds
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage additionally depends on a runtime cpu feature query
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82
}

static void convolution_transform_kernel_packed_neon(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
{
    const int maxk = kernel_w * kernel_h;

    // src = kw-kh-inch-outch
    // dst = pb-pa-kw-kh-inch/pa-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            float* g00 = weight_data_tm.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        for (int j = 0; j < out_elempack; j++)
                        {
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                            g00[0] = k00[k];

                            g00++;
                        }
                    }
                }
            }
        }
    }
}

int Convolution_arm::create_pipeline(const Option& opt)
{
    if (dynamic_weight)
        return 0;

    activation = create_activation_layer(activation_type, activation_params, opt);
    nT = opt.num_threads;

#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return create_pipeline_int8_arm(opt);
    }
#endif

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage)
    {
        return create_pipeline_fp16s(opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        return create_pipeline_bf16s(opt);
    }
#endif

    if ((!support_packing || !opt.use_packing_layout) && !opt.use_bf16_storage && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
    {
        convolution_dilation1 = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);

        // set param
        ncnn::ParamDict pd;
        pd.set(0, num_output); // num_output
        pd.set(1, kernel_w);
        pd.set(11, kernel_h);
        pd.set(2, 1);
        pd.set(12, 1);
        pd.set(3, 1);  // stride_w
        pd.set(13, 1); // stride_h
        pd.set(4, 0);  // pad_w
        pd.set(14, 0); // pad_h
        pd.set(5, bias_term);
        pd.set(6, weight_data_size);

        convolution_dilation1->load_param(pd);

        // set weights
        if (bias_term)
        {
            ncnn::Mat weights[2];
            weights[0] = weight_data;
            weights[1] = bias_data;

            convolution_dilation1->load_model(ModelBinFromMatArray(weights));
        }
        else
        {
            ncnn::Mat weights[1];
            weights[0] = weight_data;

            convolution_dilation1->load_model(ModelBinFromMatArray(weights));
        }

        convolution_dilation1->create_pipeline(opt);

        if (opt.lightmode)
            weight_data.release();

        return 0;
    }

    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    int elempack = 1;
    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        elempack = num_input % 4 == 0 ? 4 : 1;
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif

    bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input >= 8 || num_output >= 8);

    if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
        // dynamic shape
        if (opt.use_winograd63_convolution && (num_input <= 128 && num_output <= 128))
            conv3x3s1_winograd63_transform_kernel(weight_data, weight_winograd63_data, num_input, num_output, opt);
        else if (opt.use_winograd43_convolution && (num_input >= 8 && num_output >= 8))
            conv3x3s1_winograd43_transform_kernel(weight_data, weight_winograd43_data, num_input, num_output, opt);
        else
            conv3x3s1_winograd23_transform_kernel(weight_data, weight_winograd23_data, num_input, num_output, opt);

        if (opt.lightmode)
            weight_data.release();

        return 0;
    }

    int l2_cache_size_fp32 = get_cpu_level2_cache_size() / sizeof(float);
    bool prefer_sgemm = num_input * num_output * kernel_w * kernel_h * dilation_w * dilation_h * stride_w * stride_h * 2 > l2_cache_size_fp32 || (num_input > 16 || num_output > 16);

#if NCNN_GNU_INLINE_ASM
    if (elempack == 4 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 4 || num_output < 32))
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 8 || num_output < 44))
        {
            prefer_sgemm = false;
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            prefer_sgemm = false;
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            prefer_sgemm = false;
        }
    }
#endif // NCNN_GNU_INLINE_ASM

    if ((opt.use_sgemm_convolution && prefer_sgemm) || (kernel_w == 1 && kernel_h == 1))
    {
        convolution_im2col_gemm_transform_kernel(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);

        if (opt.lightmode)
            weight_data.release();

        return 0;
    }

#if NCNN_GNU_INLINE_ASM
    if ((elempack == 4 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            || (elempack == 4 && out_elempack == 4 && kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            || (elempack == 4 && out_elempack == 4 && kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            || (elempack == 1 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            || (elempack == 1 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            || (elempack == 1 && out_elempack == 4 && kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2))
    {
        convolution_transform_kernel_packed_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
    }
    else if (elempack == 1 && out_elempack == 1 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
    {
        conv3x3s2_transform_kernel_neon(weight_data, weight_3x3s2_data, num_input, num_output);
    }
    else if ((elempack == 1 && out_elempack == 1 && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
             || (elempack == 1 && out_elempack == 1 && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
             || (elempack == 1 && out_elempack == 1 && kernel_w == 4 && kernel_h == 4 && dilation_w == 1 && dilation_h == 1 && stride_w == 4 && stride_h == 4)
             || (elempack == 1 && out_elempack == 1 && kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
             || (elempack == 1 && out_elempack == 1 && kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
             || (elempack == 1 && out_elempack == 1 && kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
             || (elempack == 1 && out_elempack == 1 && kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2))
    {
        weight_data_tm = weight_data;
    }
    else
#endif // NCNN_GNU_INLINE_ASM
    {
        convolution_transform_kernel_packed(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

// Tears down the helper layers owned by this convolution: the activation
// layer first, then the dilation-1 convolution. Each one gets its own
// destroy_pipeline() call before being deleted, and the member pointer is
// cleared afterwards. Always returns 0.
int Convolution_arm::destroy_pipeline(const Option& opt)
{
    if (activation != 0)
    {
        activation->destroy_pipeline(opt);
        delete activation;
    }
    // already null when nothing was owned, so the reset is unconditional
    activation = 0;

    if (convolution_dilation1 != 0)
    {
        convolution_dilation1->destroy_pipeline(opt);
        delete convolution_dilation1;
    }
    convolution_dilation1 = 0;

    return 0;
}

// Single-input forward pass.
//
// Picks one implementation, in this order:
//   1. int8 path (forward_int8_arm) when int8 inference is enabled and the
//      layer carries int8 scales
//   2. 1-D input with a 1x1 kernel: re-enter this function on a 1x1xW view
//   3. fp16 paths (forward_fp16sa / forward_fp16s) and bf16 path
//      (forward_bf16s) for 16-bit element storage
//   4. dilated convolution helper (forwardDilation_arm) for unpacked layout
//   5. winograd for 3x3, stride 1, dilation 1
//   6. im2col + gemm
//   7. shape-specific direct kernels, or the generic convolution_packed
//
// The winograd / sgemm / direct-kernel selection below repeats the same
// heuristics that create_pipeline uses when it prepares weight_winograd*_data,
// weight_sgemm_data, weight_3x3s2_data and weight_data_tm, so the two must be
// kept in sync.
//
// Returns 0 on success, -100 when the padded input or the output blob could
// not be created, or the non-zero code returned by a delegated routine.
int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (opt.use_int8_inference && int8_scale_term)
    {
        return forward_int8_arm(bottom_blob, top_blob, opt);
    }
#endif

    // flattened blob, implement as InnerProduct
    if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
    {
        // View the w-element vector as a w-channel blob with 1x1 spatial size.
        Mat bottom_blob_3d;
        if (bottom_blob.elemsize % 16 == 0)
        {
            // Element size is a multiple of 16 bytes: rewrite the shape
            // fields directly with cstep = 1 instead of calling reshape().
            bottom_blob_3d = bottom_blob;
            bottom_blob_3d.dims = 3;
            bottom_blob_3d.w = 1;
            bottom_blob_3d.h = 1;
            bottom_blob_3d.c = bottom_blob.w;
            bottom_blob_3d.cstep = 1;
        }
        else
        {
            bottom_blob_3d = bottom_blob.reshape(1, 1, bottom_blob.w, opt.workspace_allocator);
        }

        // Recurse; the input now has dims == 3, so this branch is not taken again.
        Mat top_blob_3d;
        int ret = forward(bottom_blob_3d, top_blob_3d, opt);
        if (ret != 0)
            return ret;

        // Turn the 1x1xC result back into a C-element 1-D blob.
        if (top_blob_3d.elemsize % 16 == 0)
        {
            top_blob = top_blob_3d;
            top_blob.dims = 1;
            top_blob.w = top_blob_3d.c;
            top_blob.h = 1;
            top_blob.c = 1;
            top_blob.cstep = top_blob_3d.c;
        }
        else
        {
            top_blob = top_blob_3d.reshape(top_blob_3d.c, opt.blob_allocator);
        }

        return 0;
    }

    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    // 16-bit storage with fp16 support: choose fp16 arithmetic or
    // fp16-storage-only depending on opt.use_fp16_arithmetic.
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);
        else
            return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
#endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Convolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    // Span covered by the kernel along each axis once dilation is applied.
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    // From here on w/h refer to the padded input.
    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        // Pack 4 output channels per element when num_output is divisible by 4.
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif
    // Per-scalar size of the input times the output packing factor.
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Square kernel, equal non-unit dilation, stride 1, unpacked layout:
    // hand over to forwardDilation_arm, but only when the output is at least
    // as large as the dilation in both directions.
    if ((!support_packing || !opt.use_packing_layout) && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
    {
        if (outw >= dilation_w && outh >= dilation_h)
        {
            return forwardDilation_arm(bottom_blob_bordered, top_blob, opt);
        }
    }

    // Total number of scalar input channels.
    const int num_input = channels * elempack;

    bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input >= 8 || num_output >= 8);

    if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
        // Both flags below start out false, so f43 is the initial choice;
        // the fallbacks then switch variant depending on which options are
        // enabled and which transformed weights are non-empty.
        bool prefer_winograd63 = false;
        bool prefer_winograd23 = false;
        bool prefer_winograd43 = !prefer_winograd63 && !prefer_winograd23;

        if (prefer_winograd23 && (!opt.use_winograd23_convolution || weight_winograd23_data.empty()))
        {
            // f23 fallback to f43
            prefer_winograd23 = false;
            prefer_winograd43 = true;
        }

        if (prefer_winograd63 && (!opt.use_winograd63_convolution || weight_winograd63_data.empty()))
        {
            // f63 fallback to f43
            prefer_winograd63 = false;
            prefer_winograd43 = true;
        }

        if (prefer_winograd43 && (!opt.use_winograd43_convolution || weight_winograd43_data.empty()))
        {
            // f43 fallback to f63 or f23
            prefer_winograd43 = false;
            if (opt.use_winograd63_convolution && !weight_winograd63_data.empty())
            {
                prefer_winograd63 = true;
            }
            else
            {
                prefer_winograd23 = true;
            }
        }
        // NCNN_LOGE("prefer_winograd %d %d %d", prefer_winograd23, prefer_winograd43, prefer_winograd63);

        // Use the stored nT when it is non-zero, otherwise opt.num_threads.
        int _nT = nT ? nT : opt.num_threads;
        if (nT != 0 && opt.num_threads != nT)
        {
            // force num_threads the same as in create_pipeline
            // so we could use pre-packed A/B from the same tile config
            NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT);
        }

        int ret = 0;
        if (prefer_winograd23)
        {
            ret = conv3x3s1_winograd23(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt);
        }
        else if (prefer_winograd43)
        {
            ret = conv3x3s1_winograd43(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt);
        }
        else if (prefer_winograd63)
        {
            ret = conv3x3s1_winograd63(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt);
        }
        else
        {
            // should never reach here
        }
        if (ret != 0)
            return ret;

        // The winograd routines are not passed the activation, so it is
        // applied in place afterwards (its return value is not checked).
        if (activation)
        {
            activation->forward_inplace(top_blob, opt);
        }
        return 0;
    }

    // Heuristic for im2col+gemm: prefer it when the product of channel
    // counts, kernel size, dilation and stride (times 2) exceeds the L2 cache
    // size measured in floats, or when either channel count is above 16.
    int l2_cache_size_fp32 = get_cpu_level2_cache_size() / sizeof(float);
    bool prefer_sgemm = num_input * num_output * kernel_w * kernel_h * dilation_w * dilation_h * stride_w * stride_h * 2 > l2_cache_size_fp32 || (num_input > 16 || num_output > 16);

#if NCNN_GNU_INLINE_ASM
    // Shapes that have a dedicated direct kernel further down override the
    // heuristic; two of the pack4 cases only do so below a channel threshold.
    if (elempack == 4 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 4 || num_output < 32))
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 8 || num_output < 44))
        {
            prefer_sgemm = false;
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            prefer_sgemm = false;
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            prefer_sgemm = false;
        }
    }
#endif // NCNN_GNU_INLINE_ASM

    // 1x1 kernels always take the gemm path, regardless of the heuristic
    // and of opt.use_sgemm_convolution.
    if ((opt.use_sgemm_convolution && prefer_sgemm) || (kernel_w == 1 && kernel_h == 1))
    {
        int _nT = nT ? nT : opt.num_threads;
        if (nT != 0 && opt.num_threads != nT)
        {
            // force num_threads the same as in create_pipeline
            // so we could use pre-packed A/B from the same tile config
            NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT);
        }

        int ret = convolution_im2col_gemm(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
        if (ret != 0)
            return ret;

        if (activation)
        {
            activation->forward_inplace(top_blob, opt);
        }
        return 0;
    }

    // Direct kernels. The shape-specific kernels are not given the
    // activation, so it is applied in place right after each of them;
    // convolution_packed receives activation_type / activation_params itself.
    // None of the calls below have their result checked; the function
    // returns 0 once the selected kernel has run.
#if NCNN_GNU_INLINE_ASM
#if __ARM_NEON
    // 4 packed input channels -> 4 packed output channels
    if (elempack == 4 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv3x3s2_pack4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv5x5s1_pack4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv5x5s2_pack4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_packed(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    // unpacked input channels -> 4 packed output channels
    if (elempack == 1 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_pack1to4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv3x3s2_pack1to4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv7x7s2_pack1to4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_packed(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    // 4 packed input channels -> unpacked output: generic kernel only
    if (elempack == 4 && out_elempack == 1)
    {
        {
            convolution_packed(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }
#endif // __ARM_NEON

    // unpacked input -> unpacked output
    if (elempack == 1 && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            // this kernel reads weight_3x3s2_data rather than weight_data_tm
            conv3x3s2_packed_neon(bottom_blob_bordered, top_blob, weight_3x3s2_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 4 && kernel_h == 4 && dilation_w == 1 && dilation_h == 1 && stride_w == 4 && stride_h == 4)
        {
            conv4x4s4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv5x5s1_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv5x5s2_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv7x7s1_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv7x7s2_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_packed(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }
#else  // NCNN_GNU_INLINE_ASM
    // Without NCNN_GNU_INLINE_ASM every remaining case uses the generic kernel.
    {
        convolution_packed(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }
#endif // NCNN_GNU_INLINE_ASM

    return 0;
}

int Convolution_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.c * _weight_data.elempack;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

#if NCNN_ARM82
    if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && weight_data_flattened.elembits() == 16)
    {
        Mat weight_data_flattened_fp32;
        cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
        weight_data_flattened = weight_data_flattened_fp32;
    }
#endif // NCNN_ARM82
#if NCNN_BF16
    if (opt.use_bf16_storage && weight_data_flattened.elembits() == 16)
    {
        Mat weight_data_flattened_fp32;
        cast_bfloat16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
        weight_data_flattened = weight_data_flattened_fp32;
    }
#endif // NCNN_BF16

    // weight_data_flattened as pack1
    weight_data_flattened.w *= weight_data_flattened.elempack;
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
    weight_data_flattened.elempack = 1;

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;

#if NCNN_ARM82
        if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && bias_data_flattened.elembits() == 16)
        {
            Mat bias_data_flattened_fp32;
            cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
            bias_data_flattened = bias_data_flattened_fp32;
        }
#endif // NCNN_ARM82
#if NCNN_BF16
        if (opt.use_bf16_storage && bias_data_flattened.elembits() == 16)
        {
            Mat bias_data_flattened_fp32;
            cast_bfloat16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
            bias_data_flattened = bias_data_flattened_fp32;
        }
#endif // NCNN_BF16

        // bias_data_flattened as pack1
        bias_data_flattened.w *= bias_data_flattened.elempack;
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
        bias_data_flattened.elempack = 1;
    }

    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);

    ncnn::ParamDict pd;
    pd.set(0, _num_output);
    pd.set(1, _kernel_w);
    pd.set(11, _kernel_h);
    pd.set(2, dilation_w);
    pd.set(12, dilation_h);
    pd.set(3, stride_w);
    pd.set(13, stride_h);
    pd.set(4, pad_left);
    pd.set(15, pad_right);
    pd.set(14, pad_top);
    pd.set(16, pad_bottom);
    pd.set(18, pad_value);
    pd.set(5, bias_term);
    pd.set(6, weight_data_flattened.w);
    pd.set(8, int8_scale_term);
    pd.set(9, activation_type);
    pd.set(10, activation_params);

    op->load_param(pd);

    ncnn::Mat weights[2];
    weights[0] = weight_data_flattened;
    weights[1] = bias_data_flattened;

    op->load_model(ncnn::ModelBinFromMatArray(weights));

    op->create_pipeline(opt);

    op->forward(bottom_blob, top_blob, opt);

    op->destroy_pipeline(opt);

    delete op;

    return 0;
}

#if NCNN_BF16
static void convolution_transform_kernel_packed_bf16s_neon(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
{
    const int maxk = kernel_w * kernel_h;

    // src = kw-kh-inch-outch
    // dst = pb-pa-kw-kh-inch/pa-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack);

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            unsigned short* g00 = weight_data_tm.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        for (int j = 0; j < out_elempack; j++)
                        {
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                            g00[0] = float32_to_bfloat16(k00[k]);

                            g00++;
                        }
                    }
                }
            }
        }
    }
}

int Convolution_arm::create_pipeline_bf16s(const Option& opt)
{
    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    int elempack = 1;
    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        elempack = num_input % 4 == 0 ? 4 : 1;
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif

    bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input >= 8 || num_output >= 8);

    if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
        // dynamic shape
        if (opt.use_winograd63_convolution && (num_input <= 128 && num_output <= 128))
            conv3x3s1_winograd63_transform_kernel(weight_data, weight_winograd63_data, num_input, num_output, opt);
        else if (opt.use_winograd43_convolution && (num_input >= 8 && num_output >= 8))
            conv3x3s1_winograd43_transform_kernel(weight_data, weight_winograd43_data, num_input, num_output, opt);
        else
            conv3x3s1_winograd23_transform_kernel(weight_data, weight_winograd23_data, num_input, num_output, opt);

        if (opt.lightmode)
            weight_data.release();

        return 0;
    }

    int l2_cache_size_bf16 = get_cpu_level2_cache_size() / sizeof(unsigned short);
    bool prefer_sgemm = num_input * num_output * kernel_w * kernel_h * dilation_w * dilation_h * stride_w * stride_h * 2 > l2_cache_size_bf16 || (num_input > 16 || num_output > 16);

#if NCNN_GNU_INLINE_ASM
    if (elempack == 4 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 4 || num_output < 32))
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 8 || num_output < 44))
        {
            prefer_sgemm = false;
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            prefer_sgemm = false;
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            prefer_sgemm = false;
        }
    }
#endif // NCNN_GNU_INLINE_ASM

    if ((opt.use_sgemm_convolution && prefer_sgemm) || (kernel_w == 1 && kernel_h == 1))
    {
        convolution_im2col_gemm_transform_kernel_bf16s(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);

        if (opt.lightmode)
            weight_data.release();

        return 0;
    }

#if NCNN_GNU_INLINE_ASM
    if ((elempack == 4 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            || (elempack == 4 && out_elempack == 4 && kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            || (elempack == 4 && out_elempack == 4 && kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            || (elempack == 1 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            || (elempack == 1 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            || (elempack == 1 && out_elempack == 4 && kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2))
    {
        convolution_transform_kernel_packed_bf16s_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
    }
    else
#endif // NCNN_GNU_INLINE_ASM
    {
        convolution_transform_kernel_packed_bf16s(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int Convolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Convolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // TODO dilated conv for bf16s
    //     if ((!support_packing || !opt.use_packing_layout) && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
    //     {
    //         return forwardDilation_arm(bottom_blob_bordered, top_blob, opt);
    //     }

    const int num_input = channels * elempack;

    bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input >= 8 || num_output >= 8);

    if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
        bool prefer_winograd63 = false;
        bool prefer_winograd23 = false;
        bool prefer_winograd43 = !prefer_winograd63 && !prefer_winograd23;

        if (prefer_winograd23 && (!opt.use_winograd23_convolution || weight_winograd23_data.empty()))
        {
            // f23 fallback to f43
            prefer_winograd23 = false;
            prefer_winograd43 = true;
        }

        if (prefer_winograd63 && (!opt.use_winograd63_convolution || weight_winograd63_data.empty()))
        {
            // f63 fallback to f43
            prefer_winograd63 = false;
            prefer_winograd43 = true;
        }

        if (prefer_winograd43 && (!opt.use_winograd43_convolution || weight_winograd43_data.empty()))
        {
            // f43 fallback to f63 or f23
            prefer_winograd43 = false;
            if (opt.use_winograd63_convolution && !weight_winograd63_data.empty())
            {
                prefer_winograd63 = true;
            }
            else
            {
                prefer_winograd23 = true;
            }
        }
        // NCNN_LOGE("prefer_winograd %d %d %d", prefer_winograd23, prefer_winograd43, prefer_winograd63);

        int _nT = nT ? nT : opt.num_threads;
        if (nT != 0 && opt.num_threads != nT)
        {
            // force num_threads the same as in create_pipeline
            // so we could use pre-packed A/B from the same tile config
            NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT);
        }

        int ret = 0;
        if (prefer_winograd23)
        {
            ret = conv3x3s1_winograd23_bf16s(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, _nT, opt);
        }
        else if (prefer_winograd43)
        {
            ret = conv3x3s1_winograd43_bf16s(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, _nT, opt);
        }
        else if (prefer_winograd63)
        {
            ret = conv3x3s1_winograd63_bf16s(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, _nT, opt);
        }
        else
        {
            // should never reach here
        }
        if (ret != 0)
            return ret;

        if (activation)
        {
            activation->forward_inplace(top_blob, opt);
        }
        return 0;
    }

    int l2_cache_size_bf16 = get_cpu_level2_cache_size() / sizeof(unsigned short);
    bool prefer_sgemm = num_input * num_output * kernel_w * kernel_h * dilation_w * dilation_h * stride_w * stride_h * 2 > l2_cache_size_bf16 || (num_input > 16 || num_output > 16);

#if NCNN_GNU_INLINE_ASM
    if (elempack == 4 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 4 || num_output < 32))
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 8 || num_output < 44))
        {
            prefer_sgemm = false;
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            prefer_sgemm = false;
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            prefer_sgemm = false;
        }
    }
#endif // NCNN_GNU_INLINE_ASM

    if ((opt.use_sgemm_convolution && prefer_sgemm) || (kernel_w == 1 && kernel_h == 1))
    {
        int _nT = nT ? nT : opt.num_threads;
        if (nT != 0 && opt.num_threads != nT)
        {
            // force num_threads the same as in create_pipeline
            // so we could use pre-packed A/B from the same tile config
            NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT);
        }

        int ret = convolution_im2col_gemm_bf16s(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
        if (ret != 0)
            return ret;

        if (activation)
        {
            activation->forward_inplace(top_blob, opt);
        }
        return 0;
    }

#if NCNN_GNU_INLINE_ASM
#if __ARM_NEON
    if (elempack == 4 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv3x3s2_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv5x5s1_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv5x5s2_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_packed_bf16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_pack1to4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv3x3s2_pack1to4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv7x7s2_pack1to4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_packed_bf16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 4 && out_elempack == 1)
    {
        {
            convolution_packed_bf16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }
#endif // __ARM_NEON

    if (elempack == 1 && out_elempack == 1)
    {
        {
            convolution_packed_bf16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }
#else  // NCNN_GNU_INLINE_ASM
    {
        convolution_packed_bf16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }
#endif // NCNN_GNU_INLINE_ASM

    return 0;
}
#endif // NCNN_BF16

#if NCNN_INT8
int Convolution_arm::create_pipeline_int8_arm(const Option& opt)
{
    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input >= 8 && num_output >= 8) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1;
#if NCNN_ARM82DOT
    if (ncnn::cpu_support_arm_asimddp())
    {
        prefer_winograd = false;
    }
#endif

    if (opt.use_winograd_convolution && prefer_winograd)
    {
        if (opt.use_winograd43_convolution)
            conv3x3s1_winograd43_transform_kernel_int8(weight_data, weight_winograd43_data, num_input, num_output, opt);
        else
            conv3x3s1_winograd23_transform_kernel_int8(weight_data, weight_winograd23_data, num_input, num_output, opt);
    }
    else if (opt.use_sgemm_convolution)
    {
        convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);
    }
    else
    {
        convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
    }

    scale_in_data.create(num_output);
    for (int p = 0; p < num_output; p++)
    {
        // requantize and relu
        float scale_in;
        if (weight_data_int8_scales[p] == 0)
            scale_in = 0;
        else
            scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

        scale_in_data[p] = scale_in;
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int Convolution_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

    Mat bottom_blob_int8 = bottom_blob;
    if (elembits != 8)
    {
        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
        if (bottom_blob_int8.empty())
            return -100;
    }

    //     NCNN_LOGE("Convolution_arm input %d x %d  ksize=%d %d  stride=%d %d", w, h, kernel_w, kernel_h, stride_w, stride_h);

    Mat bottom_blob_bordered;
    make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    int w = bottom_blob_bordered.w;
    int h = bottom_blob_bordered.h;
    int elempack = bottom_blob_bordered.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    bool use_int8_requantize = int8_scale_term > 100;
    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        if (use_int8_requantize)
            out_elempack = num_output % 8 == 0 ? 8 : 1;
        else
            out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __ARM_NEON
    size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage)
    {
        out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack;
    }
#endif
    if (opt.use_bf16_storage)
        out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack;

    //     NCNN_LOGE("forward_int8_arm %d %d %d    %d %d", w, h, bottom_blob_bordered.c, elempack, out_elempack);

    int channels = bottom_blob_bordered.c;
    const int num_input = channels * elempack;

    bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input >= 8 && num_output >= 8) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1;
#if NCNN_ARM82DOT
    if (ncnn::cpu_support_arm_asimddp())
    {
        prefer_winograd = false;
    }
#endif

    int out_elempack_int32 = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        if (use_int8_requantize)
        {
            out_elempack_int32 = num_output % 8 == 0 ? 8 : 1;
        }
        else
        {
#if NCNN_ARM82
            if (ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_storage && opt.use_fp16_arithmetic)
            {
                out_elempack_int32 = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
            }
            else
#endif // NCNN_ARM82
            {
                out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
            }
        }
    }
#endif // __ARM_NEON

    Mat top_blob_int32;
    top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator);
    if (top_blob_int32.empty())
        return -100;

    int _nT = nT ? nT : opt.num_threads;
    if (nT != 0 && opt.num_threads != nT)
    {
        // force num_threads the same as in create_pipeline
        // so we could use pre-packed A/B from the same tile config
        NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT);
    }

    int ret = 0;
    if (opt.use_winograd_convolution && prefer_winograd)
    {
        if (opt.use_winograd43_convolution && !weight_winograd43_data.empty())
            ret = conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt);
        else
            ret = conv3x3s1_winograd23_int8(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, _nT, opt);
    }
    else if (opt.use_sgemm_convolution)
    {
        ret = convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
    }
    else
    {
        convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
    }
    if (ret != 0)
        return ret;

    bottom_blob_bordered.release();

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (use_int8_requantize)
    {
        requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
    }
    else
    {
        dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

        if (activation)
        {
            activation->forward_inplace(top_blob, opt);
        }
    }

    return 0;
}
#endif // NCNN_INT8

// Dilated convolution computed as (dilation * dilation) ordinary convolutions.
//
// For every phase (x, y) with 0 <= x, y < dilation the input is sub-sampled with
// step `dilation` starting at row x / column y, convolution_dilation1 is run on the
// sub-sampled image, and its result is scattered back into top_blob with the same
// step. Only kernel_w, stride_w and dilation_w are consulted, and blob data is
// accessed as float.
//
// bottom_blob : input blob the convolution is applied to
// top_blob    : output blob, created here with num_output channels
// opt         : blob_allocator is used for top_blob, workspace_allocator for the
//               temporary per-phase blobs
//
// Returns 0 on success, -100 when a blob is empty after creation, or the non-zero
// status reported by the inner convolution layer.
int Convolution_arm::forwardDilation_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;

    const int kernel_size = kernel_w;
    const int stride = stride_w;
    const int dilation = dilation_w;
    const int kernel_extent = dilation * (kernel_size - 1) + 1;

    int outw = (w - kernel_extent) / stride + 1;
    int outh = (h - kernel_extent) / stride + 1;

    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Make (dilation * dilation) batches
    Mat inner_bottom_blob;
    Mat inner_top_blob;
    for (int x = 0; x < dilation; x++)
    {
        for (int y = 0; y < dilation; y++)
        {
            // size of the input sub-sampled from offset (x, y) with step dilation
            int inner_w = (w - y + dilation - 1) / dilation;
            int inner_h = (h - x + dilation - 1) / dilation;

            int inner_outw = (inner_w - kernel_size) / stride + 1;
            int inner_outh = (inner_h - kernel_size) / stride + 1;

            inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c, elemsize, opt.workspace_allocator);
            if (inner_bottom_blob.empty())
                return -100;

            inner_top_blob.create(inner_outw, inner_outh, num_output, elemsize, opt.workspace_allocator);
            if (inner_top_blob.empty())
                return -100;

            // gather: every dilation-th pixel starting at (x, y)
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int c = 0; c < bottom_blob.c; c++)
            {
                float* outptr = inner_bottom_blob.channel(c);

                for (int i = 0; i < inner_h; i++)
                {
                    const float* ptr = (const float*)bottom_blob.channel(c) + dilation * i * w + x * w + y;
                    for (int j = 0; j < inner_w; j++)
                    {
                        outptr[j] = ptr[j * dilation];
                    }
                    outptr += inner_w;
                }
            }

            Option opt_g = opt;
            opt_g.blob_allocator = inner_top_blob.allocator;

            // propagate a failure of the inner convolution instead of scattering
            // whatever happens to be in inner_top_blob and reporting success
            int ret = convolution_dilation1->forward(inner_bottom_blob, inner_top_blob, opt_g);
            if (ret != 0)
                return ret;

            // scatter: write the inner result back with step dilation from (x, y)
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int c = 0; c < num_output; c++)
            {
                float* outptr = (float*)top_blob.channel(c) + x * outw + y;
                for (int i = 0; i < inner_outh; i++)
                {
                    const float* ptr = (const float*)inner_top_blob.channel(c) + i * inner_outw;
                    for (int j = 0; j < inner_outw; j++)
                    {
                        outptr[j * dilation] = ptr[j];
                    }
                    outptr += dilation * outw;
                }
            }
        }
    }

    if (activation)
    {
        activation->forward_inplace(top_blob, opt);
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/convolution_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONVOLUTION_ARM_H
#define LAYER_CONVOLUTION_ARM_H

#include "convolution.h"

namespace ncnn {

// ARM specialization of the Convolution layer.
// Besides the inherited parameters it holds the kernels pre-transformed for the
// different algorithms (packed, im2col+gemm, winograd) and helper layers used by
// the forward paths.
class Convolution_arm : public Convolution
{
public:
    Convolution_arm();

    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // single input blob forward
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // multi blob forward
    // NOTE(review): presumably the weights are supplied as an extra input blob here - confirm in the implementation
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 paths: fp16s = fp16 storage, fp16sa = fp16 storage + arithmetic
    int create_pipeline_fp16s(const Option& opt);
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 storage paths
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_INT8
    // int8 quantized paths
    int create_pipeline_int8_arm(const Option& opt);
    int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
    // dilated convolution computed as dilation*dilation calls of convolution_dilation1
    int forwardDilation_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
    // optional activation layer; when non-null the forward paths apply it in place
    // on top_blob after the convolution
    Layer* activation;

    // thread count used by the gemm / winograd forward paths; when non-zero it
    // overrides opt.num_threads so the pre-packed weights match the tile config
    int nT;

    // kernel transformed for the packed / specialized kernels
    Mat weight_data_tm;
    // NOTE(review): presumably the kernel transformed for a 3x3 stride-2 path - not referenced in the visible code, confirm
    Mat weight_3x3s2_data;

    // kernel transformed for the im2col + gemm path
    Mat weight_sgemm_data;
    // kernels transformed for the 3x3 stride-1 winograd paths (f23 / f43 / f63)
    Mat weight_winograd23_data;
    Mat weight_winograd43_data;
    Mat weight_winograd63_data;

    // inner convolution layer run by forwardDilation_arm on each sub-sampled input
    Layer* convolution_dilation1;

    // bias_data cast to fp16, filled by create_pipeline_fp16s
    Mat bias_data_fp16;

#if NCNN_INT8
    // per output channel 1 / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]),
    // or 0 when the weight scale is 0; filled by create_pipeline_int8_arm
    Mat scale_in_data;
#endif
};

} // namespace ncnn

#endif // LAYER_CONVOLUTION_ARM_H


================================================
FILE: src/layer/arm/convolution_arm_asimddp.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cpu.h"
#include "mat.h"

namespace ncnn {

#include "convolution_packed_int8.h"
#include "convolution_im2col_gemm_int8.h"

// packed
// Forwards to convolution_transform_kernel_packed_int8 from convolution_packed_int8.h
// as compiled inside this translation unit; all arguments are passed through unchanged.
// NOTE(review): presumably this file is built with the Arm dot-product extension
// enabled so the included implementation is specialized for it - confirm in the build config.
void convolution_transform_kernel_packed_int8_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
    convolution_transform_kernel_packed_int8(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
}

// Forwards to convolution_packed_int8 from convolution_packed_int8.h as compiled
// inside this translation unit; all arguments are passed through unchanged.
// NOTE(review): presumably this file is built with the Arm dot-product extension
// enabled so the included implementation is specialized for it - confirm in the build config.
void convolution_packed_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}

// gemm
// Forwards to convolution_im2col_gemm_transform_kernel_int8 from
// convolution_im2col_gemm_int8.h as compiled inside this translation unit;
// all arguments are passed through unchanged.
// NOTE(review): presumably this file is built with the Arm dot-product extension
// enabled so the included implementation is specialized for it - confirm in the build config.
void convolution_im2col_gemm_transform_kernel_int8_asimddp(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt)
{
    convolution_im2col_gemm_transform_kernel_int8(kernel, AT, inch, outch, kernel_w, kernel_h, opt);
}

// Forwards to convolution_im2col_gemm_int8 from convolution_im2col_gemm_int8.h as
// compiled inside this translation unit and returns its status unchanged.
// NOTE(review): presumably this file is built with the Arm dot-product extension
// enabled so the included implementation is specialized for it - confirm in the build config.
int convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
    return convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/arm/convolution_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "convolution_arm.h"

#include "cpu.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

namespace ncnn {

#if __ARM_NEON
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "convolution_packed_fp16s.h"

#include "convolution_3x3_winograd_fp16s.h"

#include "convolution_im2col_gemm_bf16s_fp16s.h"
#include "convolution_im2col_gemm_fp16s.h"

#if NCNN_GNU_INLINE_ASM
#include "convolution_3x3_pack4_fp16s.h"
#include "convolution_3x3_pack1to8_fp16s.h"
#include "convolution_3x3_pack1to4_fp16s.h"
#include "convolution_3x3_pack8_fp16s.h"
#include "convolution_5x5_pack8_fp16s.h"
#include "convolution_7x7_pack1to8_fp16s.h"
#endif // NCNN_GNU_INLINE_ASM
#endif
#endif // __ARM_NEON

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
static void convolution_transform_kernel_packed_fp16s_neon(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
{
    const int maxk = kernel_w * kernel_h;

    // src = kw-kh-inch-outch
    // dst = pb-pa-kw-kh-inch/pa-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack);

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            __fp16* g00 = weight_data_tm.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        for (int j = 0; j < out_elempack; j++)
                        {
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                            g00[0] = (__fp16)k00[k];

                            g00++;
                        }
                    }
                }
            }
        }
    }
}

int Convolution_arm::create_pipeline_fp16s(const Option& opt)
{
    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    int elempack = 1;
    int out_elempack = 1;

    if (opt.use_packing_layout)
    {
        elempack = opt.use_fp16_arithmetic && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1;
        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    }

    bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input >= 16 || num_output >= 16);

    if (opt.use_fp16_arithmetic && opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
        // dynamic shape
        if (opt.use_winograd63_convolution && (num_input <= 128 && num_output <= 128))
            conv3x3s1_winograd63_transform_kernel_fp16sa(weight_data, weight_winograd63_data, num_input, num_output, opt);
        else if (opt.use_winograd43_convolution && (num_input >= 16 && num_output >= 16))
            conv3x3s1_winograd43_transform_kernel_fp16sa(weight_data, weight_winograd43_data, num_input, num_output, opt);
        else
            conv3x3s1_winograd23_transform_kernel_fp16sa(weight_data, weight_winograd23_data, num_input, num_output, opt);

        if (opt.lightmode)
            weight_data.release();

        if (opt.use_fp16_arithmetic)
        {
            ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
        }

        return 0;
    }

    int l2_cache_size_fp16 = get_cpu_level2_cache_size() / sizeof(unsigned short);
    bool prefer_sgemm = num_input * num_output * kernel_w * kernel_h * dilation_w * dilation_h * stride_w * stride_h * 2 > l2_cache_size_fp16 || (num_input > 16 || num_output > 16);

#if NCNN_GNU_INLINE_ASM
    if (elempack == 8 && out_elempack == 8)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 64 || num_output < 128))
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 16 || num_output < 88))
        {
            prefer_sgemm = false;
        }
    }

    if (elempack == 1 && out_elempack == 8)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            prefer_sgemm = false;
        }
    }

    if (elempack == 4 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            prefer_sgemm = false;
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            prefer_sgemm = false;
        }
    }
#endif // NCNN_GNU_INLINE_ASM

    if (opt.use_fp16_arithmetic && ((opt.use_sgemm_convolution && prefer_sgemm) || (kernel_w == 1 && kernel_h == 1)))
    {
        convolution_im2col_gemm_transform_kernel_fp16sa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt);

        ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

        if (opt.lightmode)
            weight_data.release();

        return 0;
    }

#if NCNN_GNU_INLINE_ASM
    if ((elempack == 8 && out_elempack == 8 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            || (elempack == 8 && out_elempack == 8 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            || (elempack == 8 && out_elempack == 8 && kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            || (elempack == 8 && out_elempack == 8 && kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            || (elempack == 1 && out_elempack == 8 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            || (elempack == 1 && out_elempack == 8 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            || (elempack == 1 && out_elempack == 8 && kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            || (opt.use_fp16_arithmetic && elempack == 4 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            || (opt.use_fp16_arithmetic && elempack == 1 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            || (opt.use_fp16_arithmetic && elempack == 1 && out_elempack == 4 && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2))
    {
        convolution_transform_kernel_packed_fp16s_neon(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
    }
    else
#endif // NCNN_GNU_INLINE_ASM
    {
        convolution_transform_kernel_packed_fp16s(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
    }

    if (opt.use_fp16_arithmetic)
    {
        ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int Convolution_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // NCNN_LOGE("Convolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = (opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // TODO dilated conv for bf16s
    //     if ((!support_packing || !opt.use_packing_layout) && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
    //     {
    //         return forwardDilation_arm(bottom_blob_bordered, top_blob, opt);
    //     }

    if (elempack == 4 && out_elempack == 4)
    {
        convolution_packed_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    if (elempack == 1 && out_elempack == 4)
    {
        convolution_packed_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    if (elempack == 4 && out_elempack == 1)
    {
        convolution_packed_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    if (elempack == 1 && out_elempack == 1)
    {
        convolution_packed_fp16s(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }

    return 0;
}

int Convolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // NCNN_LOGE("Convolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    }
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // TODO dilated conv for bf16s
    //     if ((!support_packing || !opt.use_packing_layout) && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
    //     {
    //         return forwardDilation_arm(bottom_blob_bordered, top_blob, opt);
    //     }

    const int num_input = channels * elempack;

    bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && (num_input >= 16 || num_output >= 16);

    if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
        bool prefer_winograd63 = false;
        bool prefer_winograd23 = false;
        bool prefer_winograd43 = !prefer_winograd63 && !prefer_winograd23;

        if (prefer_winograd23 && (!opt.use_winograd23_convolution || weight_winograd23_data.empty()))
        {
            // f23 fallback to f43
            prefer_winograd23 = false;
            prefer_winograd43 = true;
        }

        if (prefer_winograd63 && (!opt.use_winograd63_convolution || weight_winograd63_data.empty()))
        {
            // f63 fallback to f43
            prefer_winograd63 = false;
            prefer_winograd43 = true;
        }

        if (prefer_winograd43 && (!opt.use_winograd43_convolution || weight_winograd43_data.empty()))
        {
            // f43 fallback to f63 or f23
            prefer_winograd43 = false;
            if (opt.use_winograd63_convolution && !weight_winograd63_data.empty())
            {
                prefer_winograd63 = true;
            }
            else
            {
                prefer_winograd23 = true;
            }
        }
        // NCNN_LOGE("prefer_winograd %d %d %d", prefer_winograd23, prefer_winograd43, prefer_winograd63);

        int _nT = nT ? nT : opt.num_threads;
        if (nT != 0 && opt.num_threads != nT)
        {
            // force num_threads the same as in create_pipeline
            // so we could use pre-packed A/B from the same tile config
            NCNN_LOGE("opt.num_threads %d changed, convolution winograd will use load-time value %d", opt.num_threads, nT);
        }

        int ret = 0;
        if (prefer_winograd23)
        {
            ret = conv3x3s1_winograd23_fp16sa(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data_fp16, _nT, opt);
        }
        else if (prefer_winograd43)
        {
            ret = conv3x3s1_winograd43_fp16sa(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data_fp16, _nT, opt);
        }
        else if (prefer_winograd63)
        {
            ret = conv3x3s1_winograd63_fp16sa(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data_fp16, _nT, opt);
        }
        else
        {
            // should never reach here
        }
        if (ret != 0)
            return ret;

        if (activation)
        {
            activation->forward_inplace(top_blob, opt);
        }
        return 0;
    }

    int l2_cache_size_fp16 = get_cpu_level2_cache_size() / sizeof(unsigned short);
    bool prefer_sgemm = num_input * num_output * kernel_w * kernel_h * dilation_w * dilation_h * stride_w * stride_h * 2 > l2_cache_size_fp16 || (num_input > 16 || num_output > 16);

#if NCNN_GNU_INLINE_ASM
    if (elempack == 8 && out_elempack == 8)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 64 || num_output < 128))
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (num_input < 16 || num_output < 88))
        {
            prefer_sgemm = false;
        }
    }

    if (elempack == 1 && out_elempack == 8)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            prefer_sgemm = false;
        }
        if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            prefer_sgemm = false;
        }
    }

    if (elempack == 4 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            prefer_sgemm = false;
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            prefer_sgemm = false;
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            prefer_sgemm = false;
        }
    }
#endif // NCNN_GNU_INLINE_ASM

    if ((opt.use_sgemm_convolution && prefer_sgemm) || (kernel_w == 1 && kernel_h == 1))
    {
        int _nT = nT ? nT : opt.num_threads;
        if (nT != 0 && opt.num_threads != nT)
        {
            // force num_threads the same as in create_pipeline
            // so we could use pre-packed A/B from the same tile config
            NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT);
        }

        int ret = convolution_im2col_gemm_fp16sa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt);
        if (ret != 0)
            return ret;

        if (activation)
        {
            activation->forward_inplace(top_blob, opt);
        }
        return 0;
    }

#if NCNN_GNU_INLINE_ASM
    if (elempack == 8 && out_elempack == 8)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_pack8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv3x3s2_pack8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv5x5s1_pack8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv5x5s2_pack8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_packed_fp16sa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 1 && out_elempack == 8)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_pack1to8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv3x3s2_pack1to8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv7x7s2_pack1to8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_packed_fp16sa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 4 && out_elempack == 8)
    {
        {
            convolution_packed_fp16sa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 8 && out_elempack == 1)
    {
        {
            convolution_packed_fp16sa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 8 && out_elempack == 4)
    {
        {
            convolution_packed_fp16sa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 4 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_pack4_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_packed_fp16sa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_pack1to4_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv3x3s2_pack1to4_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_packed_fp16sa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 4 && out_elempack == 1)
    {
        {
            convolution_packed_fp16sa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 1 && out_elempack == 1)
    {
        {
            convolution_packed_fp16sa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }
#else  // NCNN_GNU_INLINE_ASM
    {
        convolution_packed_fp16sa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
    }
#endif // NCNN_GNU_INLINE_ASM

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/convolution_arm_i8mm.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cpu.h"
#include "mat.h"

namespace ncnn {

#include "convolution_packed_int8.h"
#include "convolution_im2col_gemm_int8.h"

// packed
// Externally visible entry point that forwards every argument unchanged to
// convolution_transform_kernel_packed_int8 and returns nothing.
// NOTE(review): the callee presumably comes from convolution_packed_int8.h,
// which is included inside this namespace, so that it is compiled with this
// file's i8mm settings - confirm against the build configuration.
void convolution_transform_kernel_packed_int8_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
    convolution_transform_kernel_packed_int8(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
}

// Externally visible entry point that forwards every argument unchanged to
// convolution_packed_int8 and returns nothing.
// NOTE(review): the callee presumably comes from convolution_packed_int8.h,
// included inside this namespace - confirm.
void convolution_packed_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
}

// gemm
// Externally visible entry point that forwards every argument unchanged to
// convolution_im2col_gemm_transform_kernel_int8 and returns nothing.
// NOTE(review): the callee presumably comes from
// convolution_im2col_gemm_int8.h, included inside this namespace - confirm.
void convolution_im2col_gemm_transform_kernel_int8_i8mm(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt)
{
    convolution_im2col_gemm_transform_kernel_int8(kernel, AT, inch, outch, kernel_w, kernel_h, opt);
}

// Externally visible entry point that forwards every argument unchanged to
// convolution_im2col_gemm_int8 and hands back its int result as-is.
// NOTE(review): the callee presumably comes from
// convolution_im2col_gemm_int8.h, included inside this namespace - confirm.
int convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
    return convolution_im2col_gemm_int8(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/arm/convolution_im2col_gemm.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convolution_im2col_pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
    // Copies the max_ii x max_kk sub-block of A that starts at row i, column k
    // into AT as one contiguous stream. Rows are consumed in groups of
    // 8 (aarch64 only), 4 (NEON only), 2 and finally 1. Within a group the
    // scalar tails write the group's values for one column next to each
    // other; the vector paths go through transpose8x8_ps / vst4q / vst2q.
    // A = (pa, maxk, inch/pa), outch
    const int A_hstep = A.w;
    const float* const src = (const float*)A;

    float* dst = AT;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    // groups of 8 rows
    for (; ii + 7 < max_ii; ii += 8)
    {
        const float* r0 = src + (i + ii) * A_hstep + k;
        const float* r1 = src + (i + ii + 1) * A_hstep + k;
        const float* r2 = src + (i + ii + 2) * A_hstep + k;
        const float* r3 = src + (i + ii + 3) * A_hstep + k;
        const float* r4 = src + (i + ii + 4) * A_hstep + k;
        const float* r5 = src + (i + ii + 5) * A_hstep + k;
        const float* r6 = src + (i + ii + 6) * A_hstep + k;
        const float* r7 = src + (i + ii + 7) * A_hstep + k;

        int remain = max_kk;
        // 8 columns at a time: load an 8x8 block, transpose, store 64 floats
        for (; remain >= 8; remain -= 8)
        {
            float32x4_t _a0 = vld1q_f32(r0);
            float32x4_t _a1 = vld1q_f32(r0 + 4);
            float32x4_t _b0 = vld1q_f32(r1);
            float32x4_t _b1 = vld1q_f32(r1 + 4);
            float32x4_t _c0 = vld1q_f32(r2);
            float32x4_t _c1 = vld1q_f32(r2 + 4);
            float32x4_t _d0 = vld1q_f32(r3);
            float32x4_t _d1 = vld1q_f32(r3 + 4);
            float32x4_t _e0 = vld1q_f32(r4);
            float32x4_t _e1 = vld1q_f32(r4 + 4);
            float32x4_t _f0 = vld1q_f32(r5);
            float32x4_t _f1 = vld1q_f32(r5 + 4);
            float32x4_t _g0 = vld1q_f32(r6);
            float32x4_t _g1 = vld1q_f32(r6 + 4);
            float32x4_t _h0 = vld1q_f32(r7);
            float32x4_t _h1 = vld1q_f32(r7 + 4);
            transpose8x8_ps(_a0, _a1, _b0, _b1, _c0, _c1, _d0, _d1, _e0, _e1, _f0, _f1, _g0, _g1, _h0, _h1);
            vst1q_f32(dst, _a0);
            vst1q_f32(dst + 4, _a1);
            vst1q_f32(dst + 8, _b0);
            vst1q_f32(dst + 12, _b1);
            vst1q_f32(dst + 16, _c0);
            vst1q_f32(dst + 20, _c1);
            vst1q_f32(dst + 24, _d0);
            vst1q_f32(dst + 28, _d1);
            vst1q_f32(dst + 32, _e0);
            vst1q_f32(dst + 36, _e1);
            vst1q_f32(dst + 40, _f0);
            vst1q_f32(dst + 44, _f1);
            vst1q_f32(dst + 48, _g0);
            vst1q_f32(dst + 52, _g1);
            vst1q_f32(dst + 56, _h0);
            vst1q_f32(dst + 60, _h1);
            dst += 64;
            r0 += 8;
            r1 += 8;
            r2 += 8;
            r3 += 8;
            r4 += 8;
            r5 += 8;
            r6 += 8;
            r7 += 8;
        }
        // leftover columns, one at a time
        for (; remain > 0; remain--)
        {
            dst[0] = *r0++;
            dst[1] = *r1++;
            dst[2] = *r2++;
            dst[3] = *r3++;
            dst[4] = *r4++;
            dst[5] = *r5++;
            dst[6] = *r6++;
            dst[7] = *r7++;
            dst += 8;
        }
    }
#endif // __aarch64__
    // groups of 4 rows
    for (; ii + 3 < max_ii; ii += 4)
    {
        const float* r0 = src + (i + ii) * A_hstep + k;
        const float* r1 = src + (i + ii + 1) * A_hstep + k;
        const float* r2 = src + (i + ii + 2) * A_hstep + k;
        const float* r3 = src + (i + ii + 3) * A_hstep + k;

        int remain = max_kk;
        for (; remain >= 4; remain -= 4)
        {
            // vst4q writes the four vectors element-interleaved
            float32x4x4_t _quad;
            _quad.val[0] = vld1q_f32(r0);
            _quad.val[1] = vld1q_f32(r1);
            _quad.val[2] = vld1q_f32(r2);
            _quad.val[3] = vld1q_f32(r3);
            vst4q_f32(dst, _quad);
            dst += 16;
            r0 += 4;
            r1 += 4;
            r2 += 4;
            r3 += 4;
        }
        for (; remain > 0; remain--)
        {
            dst[0] = *r0++;
            dst[1] = *r1++;
            dst[2] = *r2++;
            dst[3] = *r3++;
            dst += 4;
        }
    }
#endif // __ARM_NEON
    // groups of 2 rows
    for (; ii + 1 < max_ii; ii += 2)
    {
        const float* r0 = src + (i + ii) * A_hstep + k;
        const float* r1 = src + (i + ii + 1) * A_hstep + k;

        int remain = max_kk;
#if __ARM_NEON
        for (; remain >= 4; remain -= 4)
        {
            // vst2q writes the two vectors element-interleaved
            float32x4x2_t _pair;
            _pair.val[0] = vld1q_f32(r0);
            _pair.val[1] = vld1q_f32(r1);
            vst2q_f32(dst, _pair);
            dst += 8;
            r0 += 4;
            r1 += 4;
        }
#endif // __ARM_NEON
        for (; remain > 0; remain--)
        {
            dst[0] = *r0++;
            dst[1] = *r1++;
            dst += 2;
        }
    }
    // last single row: plain copy
    for (; ii < max_ii; ii += 1)
    {
        const float* r0 = src + (i + ii) * A_hstep + k;

        int remain = max_kk;
#if __ARM_NEON
        for (; remain >= 4; remain -= 4)
        {
            vst1q_f32(dst, vld1q_f32(r0));
            dst += 4;
            r0 += 4;
        }
#endif // __ARM_NEON
        for (; remain > 0; remain--)
        {
            *dst++ = *r0++;
        }
    }
}

static void convolution_gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, const Mat& CT_tile, Mat& topT_tile, Mat& top_blob, int i, int max_ii, int j, int max_jj, int k, int max_kk, bool k_end, int use_a53_a55_optimized_kernel)
{
    // NCNN_LOGE("convolution_gemm_transB_packed_tile %d %d %d %d %d %d", i, max_ii, j, max_jj, k, max_kk);

    const int out_elempack = top_blob.elempack;
    const size_t out_hstep = top_blob.cstep;

    const float* pAT = AT_tile;
    const float* pBT = BT_tile;
    const float* pC = CT_tile;

    float* outptr = topT_tile;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    for (; ii + 7 < max_ii; ii += 8)
    {
        float* outptr0 = (float*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        const float* pB = pBT;

        if (pC)
        {
            pC = (const float*)CT_tile + i + ii;
        }

        int jj = 0;
        for (; jj + 11 < max_jj; jj += 12)
        {
            const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            if (use_a53_a55_optimized_kernel && cpu_support_arm_asimdhp())
            {
                // a55
                asm volatile(
                    "cbz    %w10, 0f                    \n"

                    "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                    "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    "subs   %0, %0, #320                \n"
                    "b      3f                          \n"

                    "0:                                 \n"
                    // if pC
                    "cbz    %8, 1f                      \n"

                    "add    x4, %8, #16                 \n"
                    "ld1    {v8.4s}, [%8]               \n"
                    "ld1    {v20.4s}, [x4]              \n"
                    "b      2f                          \n"

                    // else
                    "1:                                 \n"
                    "eor    v8.16b, v8.16b, v8.16b      \n"
                    "eor    v20.16b, v20.16b, v20.16b   \n"

                    "2:                                 \n"
                    "mov    v9.16b, v8.16b              \n"
                    "mov    v10.16b, v8.16b             \n"
                    "mov    v11.16b, v8.16b             \n"
                    "mov    v12.16b, v8.16b             \n"
                    "mov    v13.16b, v8.16b             \n"
                    "mov    v14.16b, v8.16b             \n"
                    "mov    v15.16b, v8.16b             \n"
                    "mov    v16.16b, v8.16b             \n"
                    "mov    v17.16b, v8.16b             \n"
                    "mov    v18.16b, v8.16b             \n"
                    "mov    v19.16b, v8.16b             \n"

                    "mov    v21.16b, v20.16b            \n"
                    "mov    v22.16b, v20.16b            \n"
                    "mov    v23.16b, v20.16b            \n"
                    "mov    v24.16b, v20.16b            \n"
                    "mov    v25.16b, v20.16b            \n"
                    "mov    v26.16b, v20.16b            \n"
                    "mov    v27.16b, v20.16b            \n"
                    "mov    v28.16b, v20.16b            \n"
                    "mov    v29.16b, v20.16b            \n"
                    "mov    v30.16b, v20.16b            \n"
                    "mov    v31.16b, v20.16b            \n"

                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.4s}, [%1], #16          \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.4s}, [%2], #16          \n"

                    "ldr    d5, [%1], #8                \n"
                    "ldr    x25, [%1], #8               \n"

                    ".align 4                           \n"
                    "4:                                 \n"
                    "ldr    d1, [%2], #8                \n"
                    "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                    "ldr    x21, [%2], #8               \n"
                    "fmla   v9.4s, v4.4s, v0.s[1]       \n"
                    "ins    v5.d[1], x25                \n"
                    "fmla   v10.4s, v4.4s, v0.s[2]      \n"
                    "ldr    d2, [%2], #8                \n"
                    "fmla   v11.4s, v4.4s, v0.s[3]      \n"
                    "ldr    x22, [%2], #8               \n"
                    "fmla   v20.4s, v5.4s, v0.s[0]      \n"
                    "ldr    d6, [%1], #8                \n"
                    "fmla   v21.4s, v5.4s, v0.s[1]      \n"
                    "ins    v1.d[1], x21                \n"
                    "fmla   v22.4s, v5.4s, v0.s[2]      \n"
                    "ldr    x26, [%1], #8               \n"
                    "fmla   v23.4s, v5.4s, v0.s[3]      \n"
                    "ldr    d3, [%2], #8                \n"
                    "fmla   v12.4s, v4.4s, v1.s[0]      \n"
                    "ldr    x23, [%2], #8               \n"
                    "fmla   v13.4s, v4.4s, v1.s[1]      \n"
                    "ldr    d7, [%1], #8                \n"
                    "fmla   v14.4s, v4.4s, v1.s[2]      \n"
                    "ldr    x27, [%1], #8               \n"
                    "fmla   v15.4s, v4.4s, v1.s[3]      \n"
                    "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                    "fmla   v24.4s, v5.4s, v1.s[0]      \n"
                    "ldr    d0, [%2], #8                \n"
                    "fmla   v25.4s, v5.4s, v1.s[1]      \n"
                    "ins    v2.d[1], x22                \n"
                    "fmla   v26.4s, v5.4s, v1.s[2]      \n"
                    "ldr    x20, [%2], #8               \n"
                    "fmla   v27.4s, v5.4s, v1.s[3]      \n"
                    "fmla   v16.4s, v4.4s, v2.s[0]      \n"
                    "ldr    d1, [%2], #8                \n"
                    "fmla   v17.4s, v4.4s, v2.s[1]      \n"
                    "ins    v6.d[1], x26                \n"
                    "fmla   v18.4s, v4.4s, v2.s[2]      \n"
                    "ldr    x21, [%2], #8               \n"
                    "fmla   v19.4s, v4.4s, v2.s[3]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                    "fmla   v28.4s, v5.4s, v2.s[0]      \n"
                    "ldr    d4, [%1], #8                \n"
                    "fmla   v29.4s, v5.4s, v2.s[1]      \n"
                    "ins    v3.d[1], x23                \n"
                    "fmla   v30.4s, v5.4s, v2.s[2]      \n"
                    "ldr    x24, [%1], #8               \n"
                    "fmla   v31.4s, v5.4s, v2.s[3]      \n"
                    "fmla   v8.4s, v6.4s, v3.s[0]       \n"
                    "ldr    d2, [%2], #8                \n"
                    "fmla   v9.4s, v6.4s, v3.s[1]       \n"
                    "ins    v7.d[1], x27                \n"
                    "fmla   v10.4s, v6.4s, v3.s[2]      \n"
                    "ldr    x22, [%2], #8               \n"
                    "fmla   v11.4s, v6.4s, v3.s[3]      \n"
                    "fmla   v20.4s, v7.4s, v3.s[0]      \n"
                    "ldr    d5, [%1], #8                \n"
                    "fmla   v21.4s, v7.4s, v3.s[1]      \n"
                    "ins    v0.d[1], x20                \n"
                    "fmla   v22.4s, v7.4s, v3.s[2]      \n"
                    "ldr    x25, [%1], #8               \n"
                    "fmla   v23.4s, v7.4s, v3.s[3]      \n"
                    "fmla   v12.4s, v6.4s, v0.s[0]      \n"
                    "ldr    d3, [%2], #8                \n"
                    "fmla   v13.4s, v6.4s, v0.s[1]      \n"
                    "ldr    x23, [%2], #8               \n"
                    "fmla   v14.4s, v6.4s, v0.s[2]      \n"
                    "fmla   v15.4s, v6.4s, v0.s[3]      \n"
                    "fmla   v24.4s, v7.4s, v0.s[0]      \n"
                    "fmla   v25.4s, v7.4s, v0.s[1]      \n"
                    "ins    v1.d[1], x21                \n"
                    "fmla   v26.4s, v7.4s, v0.s[2]      \n"
                    "fmla   v27.4s, v7.4s, v0.s[3]      \n"
                    "prfm   pldl1keep, [%2, #256]       \n" // NOTE PRELOAD
                    "fmla   v16.4s, v6.4s, v1.s[0]      \n"
                    "fmla   v17.4s, v6.4s, v1.s[1]      \n"
                    "ins    v4.d[1], x24                \n"
                    "fmla   v18.4s, v6.4s, v1.s[2]      \n"
                    "ldr    d0, [%2], #8                \n"
                    "fmla   v19.4s, v6.4s, v1.s[3]      \n"
                    "ldr    x20, [%2], #8               \n"
                    "fmla   v28.4s, v7.4s, v1.s[0]      \n"
                    "ldr    d6, [%1], #8                \n"
                    "fmla   v29.4s, v7.4s, v1.s[1]      \n"
                    "ins    v2.d[1], x22                \n"
                    "fmla   v30.4s, v7.4s, v1.s[2]      \n"
                    "fmla   v31.4s, v7.4s, v1.s[3]      \n"
                    "ldr    x26, [%1], #8               \n"
                    "fmla   v8.4s, v4.4s, v2.s[0]       \n"
                    "ldr    d1, [%2], #8                \n"
                    "fmla   v9.4s, v4.4s, v2.s[1]       \n"
                    "ins    v5.d[1], x25                \n"
                    "fmla   v10.4s, v4.4s, v2.s[2]      \n"
                    "ldr    x21, [%2], #8               \n"
                    "fmla   v11.4s, v4.4s, v2.s[3]      \n"
                    "ldr    d7, [%1], #8                \n"
                    "fmla   v20.4s, v5.4s, v2.s[0]      \n"
                    "ldr    x27, [%1], #8               \n"
                    "fmla   v21.4s, v5.4s, v2.s[1]      \n"
                    "ins    v3.d[1], x23                \n"
                    "fmla   v22.4s, v5.4s, v2.s[2]      \n"
                    "fmla   v23.4s, v5.4s, v2.s[3]      \n"
                    "fmla   v12.4s, v4.4s, v3.s[0]      \n"
                    "ldr    d2, [%2], #8                \n"
                    "fmla   v13.4s, v4.4s, v3.s[1]      \n"
                    "ldr    x22, [%2], #8               \n"
                    "fmla   v14.4s, v4.4s, v3.s[2]      \n"
                    "fmla   v15.4s, v4.4s, v3.s[3]      \n"
                    "fmla   v24.4s, v5.4s, v3.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v3.s[1]      \n"
                    "ins    v0.d[1], x20                \n"
                    "fmla   v26.4s, v5.4s, v3.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v3.s[3]      \n"
                    "fmla   v16.4s, v4.4s, v0.s[0]      \n"
                    "ldr    d3, [%2], #8                \n"
                    "fmla   v17.4s, v4.4s, v0.s[1]      \n"
                    "ldr    x23, [%2], #8               \n"
                    "fmla   v18.4s, v4.4s, v0.s[2]      \n"
                    "ins    v6.d[1], x26                \n"
                    "fmla   v19.4s, v4.4s, v0.s[3]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                    "fmla   v28.4s, v5.4s, v0.s[0]      \n"
                    "ldr    d4, [%1], #8                \n"
                    "fmla   v29.4s, v5.4s, v0.s[1]      \n"
                    "ins    v1.d[1], x21                \n"
                    "fmla   v30.4s, v5.4s, v0.s[2]      \n"
                    "ldr    x24, [%1], #8               \n"
                    "fmla   v31.4s, v5.4s, v0.s[3]      \n"
                    "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                    "fmla   v8.4s, v6.4s, v1.s[0]       \n"
                    "ldr    d0, [%2], #8                \n"
                    "fmla   v9.4s, v6.4s, v1.s[1]       \n"
                    "ins    v7.d[1], x27                \n"
                    "fmla   v10.4s, v6.4s, v1.s[2]      \n"
                    "ldr    x20, [%2], #8               \n"
                    "fmla   v11.4s, v6.4s, v1.s[3]      \n"
                    "ldr    d5, [%1], #8                \n"
                    "fmla   v20.4s, v7.4s, v1.s[0]      \n"
                    "ldr    x25, [%1], #8               \n"
                    "fmla   v21.4s, v7.4s, v1.s[1]      \n"
                    "ins    v2.d[1], x22                \n"
                    "fmla   v22.4s, v7.4s, v1.s[2]      \n"
                    "fmla   v23.4s, v7.4s, v1.s[3]      \n"
                    "fmla   v12.4s, v6.4s, v2.s[0]      \n"
                    "fmla   v13.4s, v6.4s, v2.s[1]      \n"
                    "fmla   v14.4s, v6.4s, v2.s[2]      \n"
                    "fmla   v15.4s, v6.4s, v2.s[3]      \n"
                    "fmla   v24.4s, v7.4s, v2.s[0]      \n"
                    "fmla   v25.4s, v7.4s, v2.s[1]      \n"
                    "ins    v3.d[1], x23                \n"
                    "fmla   v26.4s, v7.4s, v2.s[2]      \n"
                    "fmla   v27.4s, v7.4s, v2.s[3]      \n"
                    "fmla   v16.4s, v6.4s, v3.s[0]      \n"
                    "fmla   v17.4s, v6.4s, v3.s[1]      \n"
                    "fmla   v18.4s, v6.4s, v3.s[2]      \n"
                    "ins    v4.d[1], x24                \n"
                    "fmla   v19.4s, v6.4s, v3.s[3]      \n"
                    "fmla   v28.4s, v7.4s, v3.s[0]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v29.4s, v7.4s, v3.s[1]      \n"
                    "fmla   v30.4s, v7.4s, v3.s[2]      \n"
                    "ins    v0.d[1], x20                \n"
                    "fmla   v31.4s, v7.4s, v3.s[3]      \n"
                    "bne    4b                          \n"

                    "sub    %1, %1, #32                 \n"
                    "sub    %2, %2, #16                 \n"

                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    "6:                                 \n"
                    "ld1    {v0.4s, v1.4s, v2.4s}, [%2], #48 \n"
                    "ld1    {v4.4s, v5.4s}, [%1], #32   \n"

                    "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                    "fmla   v9.4s, v4.4s, v0.s[1]       \n"
                    "fmla   v10.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v11.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v12.4s, v4.4s, v1.s[0]      \n"
                    "fmla   v13.4s, v4.4s, v1.s[1]      \n"
                    "fmla   v14.4s, v4.4s, v1.s[2]      \n"
                    "fmla   v15.4s, v4.4s, v1.s[3]      \n"
                    "fmla   v16.4s, v4.4s, v2.s[0]      \n"
                    "fmla   v17.4s, v4.4s, v2.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v2.s[2]      \n"
                    "fmla   v19.4s, v4.4s, v2.s[3]      \n"

                    "subs   w4, w4, #1                  \n"

                    "fmla   v20.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v21.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v22.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v23.4s, v5.4s, v0.s[3]      \n"
                    "fmla   v24.4s, v5.4s, v1.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v1.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v1.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v1.s[3]      \n"
                    "fmla   v28.4s, v5.4s, v2.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v2.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v2.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v2.s[3]      \n"

                    "bne    6b                          \n"

                    "7:                                 \n"
                    "tst    %w11, #255                  \n"
                    "beq    10f                         \n"

                    // if out_elempack == 4
                    "cmp    %w12, #4                    \n"
                    "bne    8f                          \n"

                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 2          \n"
                    "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n"
                    "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%3], #64 \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%3], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [x4], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [x4], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [x4] \n"
                    "b      9f                          \n"

                    // if out_elempack == 1
                    "8:                                 \n"
                    // transpose8x12
                    "zip1   v6.4s, v8.4s, v9.4s         \n"
                    "zip2   v7.4s, v8.4s, v9.4s         \n"
                    "zip1   v8.4s, v10.4s, v11.4s       \n"
                    "zip2   v9.4s, v10.4s, v11.4s       \n"
                    "zip1   v10.4s, v12.4s, v13.4s      \n"
                    "zip2   v11.4s, v12.4s, v13.4s      \n"
                    "zip1   v12.4s, v14.4s, v15.4s      \n"
                    "zip2   v13.4s, v14.4s, v15.4s      \n"
                    "zip1   v14.4s, v16.4s, v17.4s      \n"
                    "zip2   v15.4s, v16.4s, v17.4s      \n"
                    "zip1   v16.4s, v18.4s, v19.4s      \n"
                    "zip2   v17.4s, v18.4s, v19.4s      \n"

                    "zip1   v18.4s, v20.4s, v21.4s      \n"
                    "zip2   v19.4s, v20.4s, v21.4s      \n"
                    "zip1   v20.4s, v22.4s, v23.4s      \n"
                    "zip2   v21.4s, v22.4s, v23.4s      \n"
                    "zip1   v22.4s, v24.4s, v25.4s      \n"
                    "zip2   v23.4s, v24.4s, v25.4s      \n"
                    "zip1   v24.4s, v26.4s, v27.4s      \n"
                    "zip2   v25.4s, v26.4s, v27.4s      \n"
                    "zip1   v26.4s, v28.4s, v29.4s      \n"
                    "zip2   v27.4s, v28.4s, v29.4s      \n"
                    "zip1   v28.4s, v30.4s, v31.4s      \n"
                    "zip2   v29.4s, v30.4s, v31.4s      \n"

                    "zip1   v0.2d, v6.2d, v8.2d         \n"
                    "zip2   v3.2d, v6.2d, v8.2d         \n"
                    "zip1   v1.2d, v10.2d, v12.2d       \n"
                    "zip2   v4.2d, v10.2d, v12.2d       \n"
                    "zip1   v2.2d, v14.2d, v16.2d       \n"
                    "zip2   v5.2d, v14.2d, v16.2d       \n"

                    "zip1   v6.2d, v7.2d, v9.2d         \n"
                    "zip2   v9.2d, v7.2d, v9.2d         \n"
                    "zip1   v7.2d, v11.2d, v13.2d       \n"
                    "zip2   v10.2d, v11.2d, v13.2d      \n"
                    "zip1   v8.2d, v15.2d, v17.2d       \n"
                    "zip2   v11.2d, v15.2d, v17.2d      \n"

                    "zip1   v12.2d, v18.2d, v20.2d      \n"
                    "zip2   v15.2d, v18.2d, v20.2d      \n"
                    "zip1   v13.2d, v22.2d, v24.2d      \n"
                    "zip2   v16.2d, v22.2d, v24.2d      \n"
                    "zip1   v14.2d, v26.2d, v28.2d      \n"
                    "zip2   v17.2d, v26.2d, v28.2d      \n"

                    "zip1   v18.2d, v19.2d, v21.2d      \n"
                    "zip2   v21.2d, v19.2d, v21.2d      \n"
                    "zip1   v19.2d, v23.2d, v25.2d      \n"
                    "zip2   v22.2d, v23.2d, v25.2d      \n"
                    "zip1   v20.2d, v27.2d, v29.2d      \n"
                    "zip2   v23.2d, v27.2d, v29.2d      \n"

                    "add    x4, %3, %w13, sxtw 2        \n"
                    "st1    {v0.4s, v1.4s, v2.4s}, [%3], #48 \n"
                    "st1    {v3.4s, v4.4s, v5.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v6.4s, v7.4s, v8.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v9.4s, v10.4s, v11.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v12.4s, v13.4s, v14.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v15.4s, v16.4s, v17.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v18.4s, v19.4s, v20.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v21.4s, v22.4s, v23.4s}, [x4] \n"

                    "9:                                 \n"
                    "add    %0, %0, #384                \n"
                    "b      11f                         \n"

                    "10:                                \n"
                    "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                    "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    "11:                                \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            else if (use_a53_a55_optimized_kernel && !cpu_support_arm_asimdhp())
            {
                // a53
                asm volatile(
                    "cbz    %w10, 0f                    \n"

                    "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                    "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    "subs   %0, %0, #320                \n"
                    "b      3f                          \n"

                    "0:                                 \n"
                    // if pC
                    "cbz    %8, 1f                      \n"

                    "add    x4, %8, #16                 \n"
                    "ld1    {v8.4s}, [%8]               \n"
                    "ld1    {v20.4s}, [x4]              \n"
                    "b      2f                          \n"

                    // else
                    "1:                                 \n"
                    "eor    v8.16b, v8.16b, v8.16b      \n"
                    "eor    v20.16b, v20.16b, v20.16b   \n"

                    "2:                                 \n"
                    "mov    v9.16b, v8.16b              \n"
                    "mov    v10.16b, v8.16b             \n"
                    "mov    v11.16b, v8.16b             \n"
                    "mov    v12.16b, v8.16b             \n"
                    "mov    v13.16b, v8.16b             \n"
                    "mov    v14.16b, v8.16b             \n"
                    "mov    v15.16b, v8.16b             \n"
                    "mov    v16.16b, v8.16b             \n"
                    "mov    v17.16b, v8.16b             \n"
                    "mov    v18.16b, v8.16b             \n"
                    "mov    v19.16b, v8.16b             \n"

                    "mov    v21.16b, v20.16b            \n"
                    "mov    v22.16b, v20.16b            \n"
                    "mov    v23.16b, v20.16b            \n"
                    "mov    v24.16b, v20.16b            \n"
                    "mov    v25.16b, v20.16b            \n"
                    "mov    v26.16b, v20.16b            \n"
                    "mov    v27.16b, v20.16b            \n"
                    "mov    v28.16b, v20.16b            \n"
                    "mov    v29.16b, v20.16b            \n"
                    "mov    v30.16b, v20.16b            \n"
                    "mov    v31.16b, v20.16b            \n"

                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v4.4s}, [%1], #16          \n"

                    "prfm   pldl1keep, [%2, #384]       \n"
                    "ld1    {v0.4s}, [%2], #16          \n"

                    "ldr    d1, [%2]                    \n"
                    "ldr    x21, [%2, #8]               \n"
                    "ldr    d2, [%2, #16]               \n"
                    "ldr    x22, [%2, #24]              \n"
                    "add    %2, %2, #32                 \n"

                    ".align 4                           \n"
                    "4:                                 \n"

                    "ldr    d5, [%1]                    \n"
                    "ins    v1.d[1], x21                \n"
                    "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                    "ldr    x25, [%1, #8]               \n"
                    "fmla   v9.4s, v4.4s, v0.s[1]       \n"
                    "add    %1, %1, #16                 \n"
                    "fmla   v10.4s, v4.4s, v0.s[2]      \n"

                    "ldr    d6, [%1]                    \n"
                    "ins    v2.d[1], x22                \n"
                    "fmla   v11.4s, v4.4s, v0.s[3]      \n"
                    "ldr    x26, [%1, #8]               \n"
                    "fmla   v12.4s, v4.4s, v1.s[0]      \n"
                    "add    %1, %1, #16                 \n"
                    "fmla   v13.4s, v4.4s, v1.s[1]      \n"

                    "nop                                \n"
                    "prfm   pldl1keep, [%1, #256]       \n" // NOTE PRELOAD
                    "fmla   v14.4s, v4.4s, v1.s[2]      \n"
                    "nop                                \n"
                    "fmla   v15.4s, v4.4s, v1.s[3]      \n"
                    "nop                                \n"
                    "fmla   v16.4s, v4.4s, v2.s[0]      \n"

                    "ldr    d3, [%2]                    \n"
                    "ins    v5.d[1], x25                \n"
                    "fmla   v17.4s, v4.4s, v2.s[1]      \n"
                    "ldr    x23, [%2, #8]               \n"
                    "fmla   v18.4s, v4.4s, v2.s[2]      \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v19.4s, v4.4s, v2.s[3]      \n"

                    "nop                                \n"
                    "prfm   pldl1keep, [%2, #384]       \n" // NOTE PRELOAD
                    "fmla   v20.4s, v5.4s, v0.s[0]      \n"
                    "nop                                \n"
                    "fmla   v21.4s, v5.4s, v0.s[1]      \n"
                    "nop                                \n"
                    "fmla   v22.4s, v5.4s, v0.s[2]      \n"

                    "nop                                \n"
                    "nop                                \n"
                    "fmla   v23.4s, v5.4s, v0.s[3]      \n"
                    "nop                                \n"
                    "fmla   v24.4s, v5.4s, v1.s[0]      \n"
                    "nop                                \n"
                    "fmla   v25.4s, v5.4s, v1.s[1]      \n"

                    "ldr    d0, [%2]                    \n"
                    "ins    v6.d[1], x26                \n"
                    "fmla   v26.4s, v5.4s, v1.s[2]      \n"
                    "ldr    x20, [%2, #8]               \n"
                    "fmla   v27.4s, v5.4s, v1.s[3]      \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v28.4s, v5.4s, v2.s[0]      \n"

                    "ldr    d1, [%2]                    \n"
                    "ins    v3.d[1], x23                \n"
                    "fmla   v29.4s, v5.4s, v2.s[1]      \n"
                    "ldr    x21, [%2, #8]               \n"
                    "fmla   v30.4s, v5.4s, v2.s[2]      \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v31.4s, v5.4s, v2.s[3]      \n"

                    "ldr    d7, [%1]                    \n"
                    "ins    v0.d[1], x20                \n"
                    "fmla   v8.4s, v6.4s, v3.s[0]       \n"
                    "ldr    x27, [%1, #8]               \n"
                    "fmla   v9.4s, v6.4s, v3.s[1]       \n"
                    "add    %1, %1, #16                 \n"
                    "fmla   v10.4s, v6.4s, v3.s[2]      \n"

                    "ldr    d4, [%1]                    \n"
                    "ins    v1.d[1], x21                \n"
                    "fmla   v11.4s, v6.4s, v3.s[3]      \n"
                    "ldr    x24, [%1, #8]               \n"
                    "fmla   v12.4s, v6.4s, v0.s[0]      \n"
                    "add    %1, %1, #16                 \n"
                    "fmla   v13.4s, v6.4s, v0.s[1]      \n"

                    "nop                                \n"
                    "prfm   pldl1keep, [%1, #256]       \n" // NOTE PRELOAD
                    "fmla   v14.4s, v6.4s, v0.s[2]      \n"
                    "nop                                \n"
                    "fmla   v15.4s, v6.4s, v0.s[3]      \n"
                    "nop                                \n"
                    "fmla   v16.4s, v6.4s, v1.s[0]      \n"

                    "ldr    d2, [%2]                    \n"
                    "ins    v7.d[1], x27                \n"
                    "fmla   v17.4s, v6.4s, v1.s[1]      \n"
                    "ldr    x22, [%2, #8]               \n"
                    "fmla   v18.4s, v6.4s, v1.s[2]      \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v19.4s, v6.4s, v1.s[3]      \n"

                    "nop                                \n"
                    "prfm   pldl1keep, [%2, #384]       \n" // NOTE PRELOAD
                    "fmla   v20.4s, v7.4s, v3.s[0]      \n"
                    "nop                                \n"
                    "fmla   v21.4s, v7.4s, v3.s[1]      \n"
                    "nop                                \n"
                    "fmla   v22.4s, v7.4s, v3.s[2]      \n"

                    "nop                                \n"
                    "nop                                \n"
                    "fmla   v23.4s, v7.4s, v3.s[3]      \n"
                    "nop                                \n"
                    "fmla   v24.4s, v7.4s, v0.s[0]      \n"
                    "nop                                \n"
                    "fmla   v25.4s, v7.4s, v0.s[1]      \n"

                    "ldr    d3, [%2]                    \n"
                    "ins    v4.d[1], x24                \n"
                    "fmla   v26.4s, v7.4s, v0.s[2]      \n"
                    "ldr    x23, [%2, #8]               \n"
                    "fmla   v27.4s, v7.4s, v0.s[3]      \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v28.4s, v7.4s, v1.s[0]      \n"

                    "ldr    d0, [%2]                    \n"
                    "ins    v2.d[1], x22                \n"
                    "fmla   v29.4s, v7.4s, v1.s[1]      \n"
                    "ldr    x20, [%2, #8]               \n"
                    "fmla   v30.4s, v7.4s, v1.s[2]      \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v31.4s, v7.4s, v1.s[3]      \n"

                    "ldr    d5, [%1]                    \n"
                    "ins    v3.d[1], x23                \n"
                    "fmla   v8.4s, v4.4s, v2.s[0]       \n"
                    "ldr    x25, [%1, #8]               \n"
                    "fmla   v9.4s, v4.4s, v2.s[1]       \n"
                    "add    %1, %1, #16                 \n"
                    "fmla   v10.4s, v4.4s, v2.s[2]      \n"

                    "ldr    d6, [%1]                    \n"
                    "ins    v0.d[1], x20                \n"
                    "fmla   v11.4s, v4.4s, v2.s[3]      \n"
                    "ldr    x26, [%1, #8]               \n"
                    "fmla   v12.4s, v4.4s, v3.s[0]      \n"
                    "add    %1, %1, #16                 \n"
                    "fmla   v13.4s, v4.4s, v3.s[1]      \n"

                    "nop                                \n"
                    "prfm   pldl1keep, [%1, #256]       \n" // NOTE PRELOAD
                    "fmla   v14.4s, v4.4s, v3.s[2]      \n"
                    "nop                                \n"
                    "fmla   v15.4s, v4.4s, v3.s[3]      \n"
                    "nop                                \n"
                    "fmla   v16.4s, v4.4s, v0.s[0]      \n"

                    "ldr    d1, [%2]                    \n"
                    "ins    v5.d[1], x25                \n"
                    "fmla   v17.4s, v4.4s, v0.s[1]      \n"
                    "ldr    x21, [%2, #8]               \n"
                    "fmla   v18.4s, v4.4s, v0.s[2]      \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v19.4s, v4.4s, v0.s[3]      \n"

                    "nop                                \n"
                    "prfm   pldl1keep, [%2, #384]       \n" // NOTE PRELOAD
                    "fmla   v20.4s, v5.4s, v2.s[0]      \n"
                    "nop                                \n"
                    "fmla   v21.4s, v5.4s, v2.s[1]      \n"
                    "nop                                \n"
                    "fmla   v22.4s, v5.4s, v2.s[2]      \n"

                    "nop                                \n"
                    "nop                                \n"
                    "fmla   v23.4s, v5.4s, v2.s[3]      \n"
                    "nop                                \n"
                    "fmla   v24.4s, v5.4s, v3.s[0]      \n"
                    "nop                                \n"
                    "fmla   v25.4s, v5.4s, v3.s[1]      \n"

                    "ldr    d2, [%2]                    \n"
                    "ins    v6.d[1], x26                \n"
                    "fmla   v26.4s, v5.4s, v3.s[2]      \n"
                    "ldr    x22, [%2, #8]               \n"
                    "fmla   v27.4s, v5.4s, v3.s[3]      \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v28.4s, v5.4s, v0.s[0]      \n"

                    "ldr    d3, [%2]                    \n"
                    "ins    v1.d[1], x21                \n"
                    "fmla   v29.4s, v5.4s, v0.s[1]      \n"
                    "ldr    x23, [%2, #8]               \n"
                    "fmla   v30.4s, v5.4s, v0.s[2]      \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v31.4s, v5.4s, v0.s[3]      \n"

                    "ldr    d7, [%1]                    \n"
                    "ins    v2.d[1], x22                \n"
                    "fmla   v8.4s, v6.4s, v1.s[0]       \n"
                    "ldr    x27, [%1, #8]               \n"
                    "fmla   v9.4s, v6.4s, v1.s[1]       \n"
                    "add    %1, %1, #16                 \n"
                    "fmla   v10.4s, v6.4s, v1.s[2]      \n"

                    "ldr    d4, [%1]                    \n"
                    "ins    v3.d[1], x23                \n"
                    "fmla   v11.4s, v6.4s, v1.s[3]      \n"
                    "ldr    x24, [%1, #8]               \n"
                    "fmla   v12.4s, v6.4s, v2.s[0]      \n"
                    "add    %1, %1, #16                 \n"
                    "fmla   v13.4s, v6.4s, v2.s[1]      \n"

                    "nop                                \n"
                    "prfm   pldl1keep, [%1, #256]       \n" // NOTE PRELOAD
                    "fmla   v14.4s, v6.4s, v2.s[2]      \n"
                    "nop                                \n"
                    "fmla   v15.4s, v6.4s, v2.s[3]      \n"
                    "nop                                \n"
                    "fmla   v16.4s, v6.4s, v3.s[0]      \n"

                    "ldr    d0, [%2]                    \n"
                    "ins    v7.d[1], x27                \n"
                    "fmla   v17.4s, v6.4s, v3.s[1]      \n"
                    "ldr    x20, [%2, #8]               \n"
                    "fmla   v18.4s, v6.4s, v3.s[2]      \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v19.4s, v6.4s, v3.s[3]      \n"

                    "nop                                \n"
                    "prfm   pldl1keep, [%2, #384]       \n" // NOTE PRELOAD
                    "fmla   v20.4s, v7.4s, v1.s[0]      \n"
                    "nop                                \n"
                    "fmla   v21.4s, v7.4s, v1.s[1]      \n"
                    "nop                                \n"
                    "fmla   v22.4s, v7.4s, v1.s[2]      \n"

                    "nop                                \n"
                    "nop                                \n"
                    "fmla   v23.4s, v7.4s, v1.s[3]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v24.4s, v7.4s, v2.s[0]      \n"
                    "nop                                \n"
                    "fmla   v25.4s, v7.4s, v2.s[1]      \n"

                    "ldr    d1, [%2]                    \n"
                    "ins    v4.d[1], x24                \n"
                    "fmla   v26.4s, v7.4s, v2.s[2]      \n"
                    "ldr    x21, [%2, #8]               \n"
                    "fmla   v27.4s, v7.4s, v2.s[3]      \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v28.4s, v7.4s, v3.s[0]      \n"

                    "ldr    d2, [%2]                    \n"
                    "ins    v0.d[1], x20                \n"
                    "fmla   v29.4s, v7.4s, v3.s[1]      \n"
                    "ldr    x22, [%2, #8]               \n"
                    "fmla   v30.4s, v7.4s, v3.s[2]      \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v31.4s, v7.4s, v3.s[3]      \n"

                    "bne    4b                          \n"

                    "sub    %1, %1, #16                 \n"
                    "sub    %2, %2, #48                 \n"

                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    "6:                                 \n"
                    "ld1    {v0.4s, v1.4s, v2.4s}, [%2], #48 \n"
                    "ld1    {v4.4s, v5.4s}, [%1], #32   \n"

                    "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                    "fmla   v9.4s, v4.4s, v0.s[1]       \n"
                    "fmla   v10.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v11.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v12.4s, v4.4s, v1.s[0]      \n"
                    "fmla   v13.4s, v4.4s, v1.s[1]      \n"
                    "fmla   v14.4s, v4.4s, v1.s[2]      \n"
                    "fmla   v15.4s, v4.4s, v1.s[3]      \n"
                    "fmla   v16.4s, v4.4s, v2.s[0]      \n"
                    "fmla   v17.4s, v4.4s, v2.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v2.s[2]      \n"
                    "fmla   v19.4s, v4.4s, v2.s[3]      \n"

                    "subs   w4, w4, #1                  \n"

                    "fmla   v20.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v21.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v22.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v23.4s, v5.4s, v0.s[3]      \n"
                    "fmla   v24.4s, v5.4s, v1.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v1.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v1.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v1.s[3]      \n"
                    "fmla   v28.4s, v5.4s, v2.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v2.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v2.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v2.s[3]      \n"

                    "bne    6b                          \n"

                    "7:                                 \n"
                    "tst    %w11, #255                  \n"
                    "beq    10f                         \n"

                    // if out_elempack == 4
                    "cmp    %w12, #4                    \n"
                    "bne    8f                          \n"

                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 2          \n"
                    "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n"
                    "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%3], #64 \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%3], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [x4], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [x4], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [x4] \n"
                    "b      9f                          \n"

                    // if out_elempack == 1
                    "8:                                 \n"
                    // transpose8x12
                    "zip1   v6.4s, v8.4s, v9.4s         \n"
                    "zip2   v7.4s, v8.4s, v9.4s         \n"
                    "zip1   v8.4s, v10.4s, v11.4s       \n"
                    "zip2   v9.4s, v10.4s, v11.4s       \n"
                    "zip1   v10.4s, v12.4s, v13.4s      \n"
                    "zip2   v11.4s, v12.4s, v13.4s      \n"
                    "zip1   v12.4s, v14.4s, v15.4s      \n"
                    "zip2   v13.4s, v14.4s, v15.4s      \n"
                    "zip1   v14.4s, v16.4s, v17.4s      \n"
                    "zip2   v15.4s, v16.4s, v17.4s      \n"
                    "zip1   v16.4s, v18.4s, v19.4s      \n"
                    "zip2   v17.4s, v18.4s, v19.4s      \n"

                    "zip1   v18.4s, v20.4s, v21.4s      \n"
                    "zip2   v19.4s, v20.4s, v21.4s      \n"
                    "zip1   v20.4s, v22.4s, v23.4s      \n"
                    "zip2   v21.4s, v22.4s, v23.4s      \n"
                    "zip1   v22.4s, v24.4s, v25.4s      \n"
                    "zip2   v23.4s, v24.4s, v25.4s      \n"
                    "zip1   v24.4s, v26.4s, v27.4s      \n"
                    "zip2   v25.4s, v26.4s, v27.4s      \n"
                    "zip1   v26.4s, v28.4s, v29.4s      \n"
                    "zip2   v27.4s, v28.4s, v29.4s      \n"
                    "zip1   v28.4s, v30.4s, v31.4s      \n"
                    "zip2   v29.4s, v30.4s, v31.4s      \n"

                    "zip1   v0.2d, v6.2d, v8.2d         \n"
                    "zip2   v3.2d, v6.2d, v8.2d         \n"
                    "zip1   v1.2d, v10.2d, v12.2d       \n"
                    "zip2   v4.2d, v10.2d, v12.2d       \n"
                    "zip1   v2.2d, v14.2d, v16.2d       \n"
                    "zip2   v5.2d, v14.2d, v16.2d       \n"

                    "zip1   v6.2d, v7.2d, v9.2d         \n"
                    "zip2   v9.2d, v7.2d, v9.2d         \n"
                    "zip1   v7.2d, v11.2d, v13.2d       \n"
                    "zip2   v10.2d, v11.2d, v13.2d      \n"
                    "zip1   v8.2d, v15.2d, v17.2d       \n"
                    "zip2   v11.2d, v15.2d, v17.2d      \n"

                    "zip1   v12.2d, v18.2d, v20.2d      \n"
                    "zip2   v15.2d, v18.2d, v20.2d      \n"
                    "zip1   v13.2d, v22.2d, v24.2d      \n"
                    "zip2   v16.2d, v22.2d, v24.2d      \n"
                    "zip1   v14.2d, v26.2d, v28.2d      \n"
                    "zip2   v17.2d, v26.2d, v28.2d      \n"

                    "zip1   v18.2d, v19.2d, v21.2d      \n"
                    "zip2   v21.2d, v19.2d, v21.2d      \n"
                    "zip1   v19.2d, v23.2d, v25.2d      \n"
                    "zip2   v22.2d, v23.2d, v25.2d      \n"
                    "zip1   v20.2d, v27.2d, v29.2d      \n"
                    "zip2   v23.2d, v27.2d, v29.2d      \n"

                    "add    x4, %3, %w13, sxtw 2        \n"
                    "st1    {v0.4s, v1.4s, v2.4s}, [%3], #48 \n"
                    "st1    {v3.4s, v4.4s, v5.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v6.4s, v7.4s, v8.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v9.4s, v10.4s, v11.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v12.4s, v13.4s, v14.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v15.4s, v16.4s, v17.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v18.4s, v19.4s, v20.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v21.4s, v22.4s, v23.4s}, [x4] \n"

                    "9:                                 \n"
                    "add    %0, %0, #384                \n"
                    "b      11f                         \n"

                    "10:                                \n"
                    "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                    "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    "11:                                \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            else
            {
                // Generic AArch64 inline-asm kernel for one 8 (rows of A) x 12 (columns of B) fp32 tile.
                // Accumulators: v8..v19 hold A lanes 0-3 for B columns 0..11,
                //               v20..v31 hold A lanes 4-7 for B columns 0..11.
                // Scratch: x4 (loop counter / secondary store pointer), v0..v3 (B data), v4..v7 (A data).
                asm volatile(
                    // k != 0: resume from the 96 partial sums previously written at outptr (%0)
                    "cbz    %w10, 0f                    \n"

                    "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                    "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    // undo the five 64-byte post-increments so %0 points at the tile start again
                    "subs   %0, %0, #320                \n"
                    "b      3f                          \n"

                    // k == 0: seed the accumulators
                    "0:                                 \n"
                    // if pC
                    "cbz    %8, 1f                      \n"

                    // v8 = pC[0..3], v20 = pC[4..7] (x4 = pC + 16 bytes)
                    "add    x4, %8, #16                 \n"
                    "ld1    {v8.4s}, [%8]               \n"
                    "ld1    {v20.4s}, [x4]              \n"
                    "b      2f                          \n"

                    // else
                    "1:                                 \n"
                    "eor    v8.16b, v8.16b, v8.16b      \n"
                    "eor    v20.16b, v20.16b, v20.16b   \n"

                    // replicate the seed vector of each half across all 12 columns
                    "2:                                 \n"
                    "mov    v9.16b, v8.16b              \n"
                    "mov    v10.16b, v8.16b             \n"
                    "mov    v11.16b, v8.16b             \n"
                    "mov    v12.16b, v8.16b             \n"
                    "mov    v13.16b, v8.16b             \n"
                    "mov    v14.16b, v8.16b             \n"
                    "mov    v15.16b, v8.16b             \n"
                    "mov    v16.16b, v8.16b             \n"
                    "mov    v17.16b, v8.16b             \n"
                    "mov    v18.16b, v8.16b             \n"
                    "mov    v19.16b, v8.16b             \n"

                    "mov    v21.16b, v20.16b            \n"
                    "mov    v22.16b, v20.16b            \n"
                    "mov    v23.16b, v20.16b            \n"
                    "mov    v24.16b, v20.16b            \n"
                    "mov    v25.16b, v20.16b            \n"
                    "mov    v26.16b, v20.16b            \n"
                    "mov    v27.16b, v20.16b            \n"
                    "mov    v28.16b, v20.16b            \n"
                    "mov    v29.16b, v20.16b            \n"
                    "mov    v30.16b, v20.16b            \n"
                    "mov    v31.16b, v20.16b            \n"

                    // main loop: each iteration handles 4 k-steps
                    // (consumes 2 x 64 bytes from pA = 4 x 8 floats, 3 x 64 bytes from pB = 4 x 12 floats)
                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    "4:                                 \n"
                    // A for k-steps 0,1: v4/v5 = step 0 (lanes 0-3 / 4-7), v6/v7 = step 1
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"

                    // B: v0..v2 = 12 columns of step 0, v3 = columns 0-3 of step 1
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"

                    // k-step 0, columns 0-3
                    "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                    "fmla   v9.4s, v4.4s, v0.s[1]       \n"
                    "fmla   v10.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v11.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v20.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v21.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v22.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v23.4s, v5.4s, v0.s[3]      \n"

                    // k-step 0, columns 4-7
                    "fmla   v12.4s, v4.4s, v1.s[0]      \n"
                    "fmla   v13.4s, v4.4s, v1.s[1]      \n"
                    "fmla   v14.4s, v4.4s, v1.s[2]      \n"
                    "fmla   v15.4s, v4.4s, v1.s[3]      \n"
                    "fmla   v24.4s, v5.4s, v1.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v1.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v1.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v1.s[3]      \n"

                    // k-step 0, columns 8-11
                    "fmla   v16.4s, v4.4s, v2.s[0]      \n"
                    "fmla   v17.4s, v4.4s, v2.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v2.s[2]      \n"
                    "fmla   v19.4s, v4.4s, v2.s[3]      \n"
                    "fmla   v28.4s, v5.4s, v2.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v2.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v2.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v2.s[3]      \n"

                    // k-step 1, columns 0-3 (B still in v3 from the first load)
                    "fmla   v8.4s, v6.4s, v3.s[0]       \n"
                    "fmla   v9.4s, v6.4s, v3.s[1]       \n"
                    "fmla   v10.4s, v6.4s, v3.s[2]      \n"
                    "fmla   v11.4s, v6.4s, v3.s[3]      \n"
                    "fmla   v20.4s, v7.4s, v3.s[0]      \n"
                    "fmla   v21.4s, v7.4s, v3.s[1]      \n"
                    "fmla   v22.4s, v7.4s, v3.s[2]      \n"
                    "fmla   v23.4s, v7.4s, v3.s[3]      \n"

                    // B: v0,v1 = columns 4-11 of step 1, v2,v3 = columns 0-7 of step 2
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"

                    // k-step 1, columns 4-7
                    "fmla   v12.4s, v6.4s, v0.s[0]      \n"
                    "fmla   v13.4s, v6.4s, v0.s[1]      \n"
                    "fmla   v14.4s, v6.4s, v0.s[2]      \n"
                    "fmla   v15.4s, v6.4s, v0.s[3]      \n"
                    "fmla   v24.4s, v7.4s, v0.s[0]      \n"
                    "fmla   v25.4s, v7.4s, v0.s[1]      \n"
                    "fmla   v26.4s, v7.4s, v0.s[2]      \n"
                    "fmla   v27.4s, v7.4s, v0.s[3]      \n"

                    // k-step 1, columns 8-11
                    "fmla   v16.4s, v6.4s, v1.s[0]      \n"
                    "fmla   v17.4s, v6.4s, v1.s[1]      \n"
                    "fmla   v18.4s, v6.4s, v1.s[2]      \n"
                    "fmla   v19.4s, v6.4s, v1.s[3]      \n"
                    "fmla   v28.4s, v7.4s, v1.s[0]      \n"
                    "fmla   v29.4s, v7.4s, v1.s[1]      \n"
                    "fmla   v30.4s, v7.4s, v1.s[2]      \n"
                    "fmla   v31.4s, v7.4s, v1.s[3]      \n"

                    // A for k-steps 2,3: v4/v5 = step 2, v6/v7 = step 3
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"

                    // k-step 2, columns 0-3
                    "fmla   v8.4s, v4.4s, v2.s[0]       \n"
                    "fmla   v9.4s, v4.4s, v2.s[1]       \n"
                    "fmla   v10.4s, v4.4s, v2.s[2]      \n"
                    "fmla   v11.4s, v4.4s, v2.s[3]      \n"
                    "fmla   v20.4s, v5.4s, v2.s[0]      \n"
                    "fmla   v21.4s, v5.4s, v2.s[1]      \n"
                    "fmla   v22.4s, v5.4s, v2.s[2]      \n"
                    "fmla   v23.4s, v5.4s, v2.s[3]      \n"

                    // k-step 2, columns 4-7
                    "fmla   v12.4s, v4.4s, v3.s[0]      \n"
                    "fmla   v13.4s, v4.4s, v3.s[1]      \n"
                    "fmla   v14.4s, v4.4s, v3.s[2]      \n"
                    "fmla   v15.4s, v4.4s, v3.s[3]      \n"
                    "fmla   v24.4s, v5.4s, v3.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v3.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v3.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v3.s[3]      \n"

                    // B: v0 = columns 8-11 of step 2, v1..v3 = 12 columns of step 3
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"

                    // k-step 2, columns 8-11
                    "fmla   v16.4s, v4.4s, v0.s[0]      \n"
                    "fmla   v17.4s, v4.4s, v0.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v19.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v28.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v0.s[3]      \n"

                    // k-step 3, columns 0-3
                    "fmla   v8.4s, v6.4s, v1.s[0]       \n"
                    "fmla   v9.4s, v6.4s, v1.s[1]       \n"
                    "fmla   v10.4s, v6.4s, v1.s[2]      \n"
                    "fmla   v11.4s, v6.4s, v1.s[3]      \n"
                    "fmla   v20.4s, v7.4s, v1.s[0]      \n"
                    "fmla   v21.4s, v7.4s, v1.s[1]      \n"
                    "fmla   v22.4s, v7.4s, v1.s[2]      \n"
                    "fmla   v23.4s, v7.4s, v1.s[3]      \n"

                    // k-step 3, columns 4-7
                    "fmla   v12.4s, v6.4s, v2.s[0]      \n"
                    "fmla   v13.4s, v6.4s, v2.s[1]      \n"
                    "fmla   v14.4s, v6.4s, v2.s[2]      \n"
                    "fmla   v15.4s, v6.4s, v2.s[3]      \n"
                    "fmla   v24.4s, v7.4s, v2.s[0]      \n"
                    "fmla   v25.4s, v7.4s, v2.s[1]      \n"
                    "fmla   v26.4s, v7.4s, v2.s[2]      \n"
                    "fmla   v27.4s, v7.4s, v2.s[3]      \n"

                    // decrement the loop counter early; fmla does not touch the flags tested by bne below
                    "subs   w4, w4, #1                  \n"

                    // k-step 3, columns 8-11
                    "fmla   v16.4s, v6.4s, v3.s[0]      \n"
                    "fmla   v17.4s, v6.4s, v3.s[1]      \n"
                    "fmla   v18.4s, v6.4s, v3.s[2]      \n"
                    "fmla   v19.4s, v6.4s, v3.s[3]      \n"
                    "fmla   v28.4s, v7.4s, v3.s[0]      \n"
                    "fmla   v29.4s, v7.4s, v3.s[1]      \n"
                    "fmla   v30.4s, v7.4s, v3.s[2]      \n"
                    "fmla   v31.4s, v7.4s, v3.s[3]      \n"

                    "bne    4b                          \n"

                    // tail loop: the remaining 0..3 k-steps, one per iteration
                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    "6:                                 \n"
                    // one k-step: 12 floats of B into v0..v2, 8 floats of A into v4/v5
                    "ld1    {v0.4s, v1.4s, v2.4s}, [%2], #48 \n"
                    "ld1    {v4.4s, v5.4s}, [%1], #32   \n"

                    "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                    "fmla   v9.4s, v4.4s, v0.s[1]       \n"
                    "fmla   v10.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v11.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v12.4s, v4.4s, v1.s[0]      \n"
                    "fmla   v13.4s, v4.4s, v1.s[1]      \n"
                    "fmla   v14.4s, v4.4s, v1.s[2]      \n"
                    "fmla   v15.4s, v4.4s, v1.s[3]      \n"
                    "fmla   v16.4s, v4.4s, v2.s[0]      \n"
                    "fmla   v17.4s, v4.4s, v2.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v2.s[2]      \n"
                    "fmla   v19.4s, v4.4s, v2.s[3]      \n"

                    "subs   w4, w4, #1                  \n"

                    "fmla   v20.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v21.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v22.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v23.4s, v5.4s, v0.s[3]      \n"
                    "fmla   v24.4s, v5.4s, v1.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v1.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v1.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v1.s[3]      \n"
                    "fmla   v28.4s, v5.4s, v2.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v2.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v2.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v2.s[3]      \n"

                    "bne    6b                          \n"

                    // store phase: k_end == 0 -> spill partial sums to outptr (label 10),
                    // otherwise write the finished tile through outptr0.
                    // NOTE(review): only the low 8 bits of k_end are tested, presumably because
                    // k_end is a bool passed in a 32-bit register - confirm against its declaration.
                    "7:                                 \n"
                    "tst    %w11, #255                  \n"
                    "beq    10f                         \n"

                    // if out_elempack == 4
                    "cmp    %w12, #4                    \n"
                    "bne    8f                          \n"

                    // x4 = outptr0 + out_hstep * 4 floats (w4 = out_hstep << 2, then scaled by 4 bytes)
                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 2          \n"
                    // lanes 0-3 of all 12 columns go to outptr0 (advanced by 192 bytes in total),
                    // lanes 4-7 of all 12 columns go to x4
                    "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n"
                    "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%3], #64 \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%3], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [x4], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [x4], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [x4] \n"
                    "b      9f                          \n"

                    // if out_elempack == 1
                    // (reached for any out_elempack != 4; no separate check against 1 is made here)
                    "8:                                 \n"
                    // transpose8x12
                    // stage 1: interleave 32-bit lanes of neighbouring columns
                    "zip1   v6.4s, v8.4s, v9.4s         \n"
                    "zip2   v7.4s, v8.4s, v9.4s         \n"
                    "zip1   v8.4s, v10.4s, v11.4s       \n"
                    "zip2   v9.4s, v10.4s, v11.4s       \n"
                    "zip1   v10.4s, v12.4s, v13.4s      \n"
                    "zip2   v11.4s, v12.4s, v13.4s      \n"
                    "zip1   v12.4s, v14.4s, v15.4s      \n"
                    "zip2   v13.4s, v14.4s, v15.4s      \n"
                    "zip1   v14.4s, v16.4s, v17.4s      \n"
                    "zip2   v15.4s, v16.4s, v17.4s      \n"
                    "zip1   v16.4s, v18.4s, v19.4s      \n"
                    "zip2   v17.4s, v18.4s, v19.4s      \n"

                    "zip1   v18.4s, v20.4s, v21.4s      \n"
                    "zip2   v19.4s, v20.4s, v21.4s      \n"
                    "zip1   v20.4s, v22.4s, v23.4s      \n"
                    "zip2   v21.4s, v22.4s, v23.4s      \n"
                    "zip1   v22.4s, v24.4s, v25.4s      \n"
                    "zip2   v23.4s, v24.4s, v25.4s      \n"
                    "zip1   v24.4s, v26.4s, v27.4s      \n"
                    "zip2   v25.4s, v26.4s, v27.4s      \n"
                    "zip1   v26.4s, v28.4s, v29.4s      \n"
                    "zip2   v27.4s, v28.4s, v29.4s      \n"
                    "zip1   v28.4s, v30.4s, v31.4s      \n"
                    "zip2   v29.4s, v30.4s, v31.4s      \n"

                    // stage 2: interleave 64-bit halves so that each output row is three
                    // consecutive registers: rows 0..7 = v0-2, v3-5, v6-8, v9-11, v12-14, v15-17, v18-20, v21-23
                    "zip1   v0.2d, v6.2d, v8.2d         \n"
                    "zip2   v3.2d, v6.2d, v8.2d         \n"
                    "zip1   v1.2d, v10.2d, v12.2d       \n"
                    "zip2   v4.2d, v10.2d, v12.2d       \n"
                    "zip1   v2.2d, v14.2d, v16.2d       \n"
                    "zip2   v5.2d, v14.2d, v16.2d       \n"

                    "zip1   v6.2d, v7.2d, v9.2d         \n"
                    "zip2   v9.2d, v7.2d, v9.2d         \n"
                    "zip1   v7.2d, v11.2d, v13.2d       \n"
                    "zip2   v10.2d, v11.2d, v13.2d      \n"
                    "zip1   v8.2d, v15.2d, v17.2d       \n"
                    "zip2   v11.2d, v15.2d, v17.2d      \n"

                    "zip1   v12.2d, v18.2d, v20.2d      \n"
                    "zip2   v15.2d, v18.2d, v20.2d      \n"
                    "zip1   v13.2d, v22.2d, v24.2d      \n"
                    "zip2   v16.2d, v22.2d, v24.2d      \n"
                    "zip1   v14.2d, v26.2d, v28.2d      \n"
                    "zip2   v17.2d, v26.2d, v28.2d      \n"

                    "zip1   v18.2d, v19.2d, v21.2d      \n"
                    "zip2   v21.2d, v19.2d, v21.2d      \n"
                    "zip1   v19.2d, v23.2d, v25.2d      \n"
                    "zip2   v22.2d, v23.2d, v25.2d      \n"
                    "zip1   v20.2d, v27.2d, v29.2d      \n"
                    "zip2   v23.2d, v27.2d, v29.2d      \n"

                    // write 8 rows of 12 floats; x4 walks the rows with a stride of out_hstep floats,
                    // while outptr0 itself only advances by 48 bytes (12 floats)
                    "add    x4, %3, %w13, sxtw 2        \n"
                    "st1    {v0.4s, v1.4s, v2.4s}, [%3], #48 \n"
                    "st1    {v3.4s, v4.4s, v5.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v6.4s, v7.4s, v8.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v9.4s, v10.4s, v11.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v12.4s, v13.4s, v14.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v15.4s, v16.4s, v17.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v18.4s, v19.4s, v20.4s}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v21.4s, v22.4s, v23.4s}, [x4] \n"

                    // final tile was written through outptr0; still advance outptr past this
                    // tile's 96-float (384-byte) scratch slot
                    "9:                                 \n"
                    "add    %0, %0, #384                \n"
                    "b      11f                         \n"

                    // not the last k block: spill all 24 accumulators to outptr
                    // (six 64-byte post-incremented stores = 384 bytes)
                    "10:                                \n"
                    "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                    "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    "11:                                \n"

                    // outputs %0..%3 are tied to inputs %4..%7 through the matching
                    // constraints "0".."3", so the updated pointers flow back to the C++ variables
                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
#else  // NCNN_GNU_INLINE_ASM
            // NEON-intrinsics fallback for one 8 (rows of A) x 12 (columns of B) fp32 tile,
            // used when GNU inline asm is unavailable.
            // Naming: _sum<c><h> where <c> is the B column (0..9, a, b) and
            // <h> selects the A half: 0 = lanes loaded from pA[0..3], 1 = lanes from pA[4..7].
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum20;
            float32x4_t _sum21;
            float32x4_t _sum30;
            float32x4_t _sum31;
            float32x4_t _sum40;
            float32x4_t _sum41;
            float32x4_t _sum50;
            float32x4_t _sum51;
            float32x4_t _sum60;
            float32x4_t _sum61;
            float32x4_t _sum70;
            float32x4_t _sum71;
            float32x4_t _sum80;
            float32x4_t _sum81;
            float32x4_t _sum90;
            float32x4_t _sum91;
            float32x4_t _suma0;
            float32x4_t _suma1;
            float32x4_t _sumb0;
            float32x4_t _sumb1;

            if (k == 0)
            {
                // first k block: seed the accumulators
                if (pC)
                {
                    // 8 values read from pC are replicated across all 12 columns
                    _sum00 = vld1q_f32(pC);
                    _sum01 = vld1q_f32(pC + 4);
                    _sum10 = _sum00;
                    _sum11 = _sum01;
                    _sum20 = _sum00;
                    _sum21 = _sum01;
                    _sum30 = _sum00;
                    _sum31 = _sum01;
                    _sum40 = _sum00;
                    _sum41 = _sum01;
                    _sum50 = _sum00;
                    _sum51 = _sum01;
                    _sum60 = _sum00;
                    _sum61 = _sum01;
                    _sum70 = _sum00;
                    _sum71 = _sum01;
                    _sum80 = _sum00;
                    _sum81 = _sum01;
                    _sum90 = _sum00;
                    _sum91 = _sum01;
                    _suma0 = _sum00;
                    _suma1 = _sum01;
                    _sumb0 = _sum00;
                    _sumb1 = _sum01;
                }
                else
                {
                    // no pC: start from zero
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                    _sum20 = vdupq_n_f32(0.f);
                    _sum21 = vdupq_n_f32(0.f);
                    _sum30 = vdupq_n_f32(0.f);
                    _sum31 = vdupq_n_f32(0.f);
                    _sum40 = vdupq_n_f32(0.f);
                    _sum41 = vdupq_n_f32(0.f);
                    _sum50 = vdupq_n_f32(0.f);
                    _sum51 = vdupq_n_f32(0.f);
                    _sum60 = vdupq_n_f32(0.f);
                    _sum61 = vdupq_n_f32(0.f);
                    _sum70 = vdupq_n_f32(0.f);
                    _sum71 = vdupq_n_f32(0.f);
                    _sum80 = vdupq_n_f32(0.f);
                    _sum81 = vdupq_n_f32(0.f);
                    _sum90 = vdupq_n_f32(0.f);
                    _sum91 = vdupq_n_f32(0.f);
                    _suma0 = vdupq_n_f32(0.f);
                    _suma1 = vdupq_n_f32(0.f);
                    _sumb0 = vdupq_n_f32(0.f);
                    _sumb1 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                // later k block: reload the partial sums spilled to outptr by the previous pass.
                // Scratch layout here is column-interleaved: [col0 half0][col0 half1][col1 half0]...
                // and must match the spill order used at the bottom of this block.
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
                _sum20 = vld1q_f32(outptr + 4 * 4);
                _sum21 = vld1q_f32(outptr + 4 * 5);
                _sum30 = vld1q_f32(outptr + 4 * 6);
                _sum31 = vld1q_f32(outptr + 4 * 7);
                _sum40 = vld1q_f32(outptr + 4 * 8);
                _sum41 = vld1q_f32(outptr + 4 * 9);
                _sum50 = vld1q_f32(outptr + 4 * 10);
                _sum51 = vld1q_f32(outptr + 4 * 11);
                _sum60 = vld1q_f32(outptr + 4 * 12);
                _sum61 = vld1q_f32(outptr + 4 * 13);
                _sum70 = vld1q_f32(outptr + 4 * 14);
                _sum71 = vld1q_f32(outptr + 4 * 15);
                _sum80 = vld1q_f32(outptr + 4 * 16);
                _sum81 = vld1q_f32(outptr + 4 * 17);
                _sum90 = vld1q_f32(outptr + 4 * 18);
                _sum91 = vld1q_f32(outptr + 4 * 19);
                _suma0 = vld1q_f32(outptr + 4 * 20);
                _suma1 = vld1q_f32(outptr + 4 * 21);
                _sumb0 = vld1q_f32(outptr + 4 * 22);
                _sumb1 = vld1q_f32(outptr + 4 * 23);
            }

            // rank-1 update per k-step: 8 floats of A times 12 floats of B
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA0 = vld1q_f32(pA);
                float32x4_t _pA1 = vld1q_f32(pA + 4);

                float32x4_t _pB0 = vld1q_f32(pB);
                float32x4_t _pB1 = vld1q_f32(pB + 4);
                float32x4_t _pB2 = vld1q_f32(pB + 8);

                // each B lane is broadcast against both A halves
                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
                _sum40 = vfmaq_laneq_f32(_sum40, _pA0, _pB1, 0);
                _sum41 = vfmaq_laneq_f32(_sum41, _pA1, _pB1, 0);
                _sum50 = vfmaq_laneq_f32(_sum50, _pA0, _pB1, 1);
                _sum51 = vfmaq_laneq_f32(_sum51, _pA1, _pB1, 1);
                _sum60 = vfmaq_laneq_f32(_sum60, _pA0, _pB1, 2);
                _sum61 = vfmaq_laneq_f32(_sum61, _pA1, _pB1, 2);
                _sum70 = vfmaq_laneq_f32(_sum70, _pA0, _pB1, 3);
                _sum71 = vfmaq_laneq_f32(_sum71, _pA1, _pB1, 3);
                _sum80 = vfmaq_laneq_f32(_sum80, _pA0, _pB2, 0);
                _sum81 = vfmaq_laneq_f32(_sum81, _pA1, _pB2, 0);
                _sum90 = vfmaq_laneq_f32(_sum90, _pA0, _pB2, 1);
                _sum91 = vfmaq_laneq_f32(_sum91, _pA1, _pB2, 1);
                _suma0 = vfmaq_laneq_f32(_suma0, _pA0, _pB2, 2);
                _suma1 = vfmaq_laneq_f32(_suma1, _pA1, _pB2, 2);
                _sumb0 = vfmaq_laneq_f32(_sumb0, _pA0, _pB2, 3);
                _sumb1 = vfmaq_laneq_f32(_sumb1, _pA1, _pB2, 3);

                pA += 8;
                pB += 12;
            }

            if (k_end)
            {
                // last k block: write the finished tile through outptr0
                if (out_elempack == 4)
                {
                    // half 0 of all 12 columns is stored contiguously at outptr0 ...
                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + 4, _sum10);
                    vst1q_f32(outptr0 + 4 * 2, _sum20);
                    vst1q_f32(outptr0 + 4 * 3, _sum30);
                    vst1q_f32(outptr0 + 4 * 4, _sum40);
                    vst1q_f32(outptr0 + 4 * 5, _sum50);
                    vst1q_f32(outptr0 + 4 * 6, _sum60);
                    vst1q_f32(outptr0 + 4 * 7, _sum70);
                    vst1q_f32(outptr0 + 4 * 8, _sum80);
                    vst1q_f32(outptr0 + 4 * 9, _sum90);
                    vst1q_f32(outptr0 + 4 * 10, _suma0);
                    vst1q_f32(outptr0 + 4 * 11, _sumb0);

                    // ... and half 1 at an offset of out_hstep * 4 floats
                    vst1q_f32(outptr0 + out_hstep * 4, _sum01);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4, _sum11);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 2, _sum21);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 3, _sum31);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 4, _sum41);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 5, _sum51);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 6, _sum61);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 7, _sum71);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 8, _sum81);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 9, _sum91);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 10, _suma1);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 11, _sumb1);

                    outptr0 += 48;
                }
                if (out_elempack == 1)
                {
                    // transpose8x12_ps is defined elsewhere; NOTE(review): presumably it rewrites the
                    // 24 vectors in place so that consecutive triples form one 12-float output row,
                    // which is how they are stored below - confirm against its definition.
                    transpose8x12_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31, _sum40, _sum41, _sum50, _sum51, _sum60, _sum61, _sum70, _sum71, _sum80, _sum81, _sum90, _sum91, _suma0, _suma1, _sumb0, _sumb1);

                    // 8 rows, each 12 floats wide, spaced out_hstep floats apart
                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + 4, _sum01);
                    vst1q_f32(outptr0 + 8, _sum10);
                    vst1q_f32(outptr0 + out_hstep, _sum11);
                    vst1q_f32(outptr0 + out_hstep + 4, _sum20);
                    vst1q_f32(outptr0 + out_hstep + 8, _sum21);
                    vst1q_f32(outptr0 + out_hstep * 2, _sum30);
                    vst1q_f32(outptr0 + out_hstep * 2 + 4, _sum31);
                    vst1q_f32(outptr0 + out_hstep * 2 + 8, _sum40);
                    vst1q_f32(outptr0 + out_hstep * 3, _sum41);
                    vst1q_f32(outptr0 + out_hstep * 3 + 4, _sum50);
                    vst1q_f32(outptr0 + out_hstep * 3 + 8, _sum51);
                    vst1q_f32(outptr0 + out_hstep * 4, _sum60);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4, _sum61);
                    vst1q_f32(outptr0 + out_hstep * 4 + 8, _sum70);
                    vst1q_f32(outptr0 + out_hstep * 5, _sum71);
                    vst1q_f32(outptr0 + out_hstep * 5 + 4, _sum80);
                    vst1q_f32(outptr0 + out_hstep * 5 + 8, _sum81);
                    vst1q_f32(outptr0 + out_hstep * 6, _sum90);
                    vst1q_f32(outptr0 + out_hstep * 6 + 4, _sum91);
                    vst1q_f32(outptr0 + out_hstep * 6 + 8, _suma0);
                    vst1q_f32(outptr0 + out_hstep * 7, _suma1);
                    vst1q_f32(outptr0 + out_hstep * 7 + 4, _sumb0);
                    vst1q_f32(outptr0 + out_hstep * 7 + 8, _sumb1);

                    outptr0 += 12;
                }
            }
            else
            {
                // more k blocks follow: spill the partial sums to outptr in the same
                // column-interleaved order the reload branch above expects
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
                vst1q_f32(outptr + 4 * 8, _sum40);
                vst1q_f32(outptr + 4 * 9, _sum41);
                vst1q_f32(outptr + 4 * 10, _sum50);
                vst1q_f32(outptr + 4 * 11, _sum51);
                vst1q_f32(outptr + 4 * 12, _sum60);
                vst1q_f32(outptr + 4 * 13, _sum61);
                vst1q_f32(outptr + 4 * 14, _sum70);
                vst1q_f32(outptr + 4 * 15, _sum71);
                vst1q_f32(outptr + 4 * 16, _sum80);
                vst1q_f32(outptr + 4 * 17, _sum81);
                vst1q_f32(outptr + 4 * 18, _sum90);
                vst1q_f32(outptr + 4 * 19, _sum91);
                vst1q_f32(outptr + 4 * 20, _suma0);
                vst1q_f32(outptr + 4 * 21, _suma1);
                vst1q_f32(outptr + 4 * 22, _sumb0);
                vst1q_f32(outptr + 4 * 23, _sumb1);
            }

            // advance past this tile's 96-float scratch slot in every case, including k_end
            outptr += 96;
#endif // NCNN_GNU_INLINE_ASM
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            if (use_a53_a55_optimized_kernel && cpu_support_arm_asimdhp())
            {
                // a55
                asm volatile(
                    "cbz    %w10, 0f                    \n"

                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    "subs   %0, %0, #192                \n"
                    "b      3f                          \n"

                    "0:                                 \n"
                    // if pC
                    "cbz    %8, 1f                      \n"

                    "add    x4, %8, #16                 \n"
                    "ld1    {v16.4s}, [%8]              \n"
                    "ld1    {v24.4s}, [x4]              \n"
                    "b      2f                          \n"

                    // else
                    "1:                                 \n"
                    "eor    v16.16b, v16.16b, v16.16b   \n"
                    "eor    v24.16b, v24.16b, v24.16b   \n"

                    "2:                                 \n"
                    "mov    v17.16b, v16.16b            \n"
                    "mov    v18.16b, v16.16b            \n"
                    "mov    v19.16b, v16.16b            \n"
                    "mov    v20.16b, v16.16b            \n"
                    "mov    v21.16b, v16.16b            \n"
                    "mov    v22.16b, v16.16b            \n"
                    "mov    v23.16b, v16.16b            \n"

                    "mov    v25.16b, v24.16b            \n"
                    "mov    v26.16b, v24.16b            \n"
                    "mov    v27.16b, v24.16b            \n"
                    "mov    v28.16b, v24.16b            \n"
                    "mov    v29.16b, v24.16b            \n"
                    "mov    v30.16b, v24.16b            \n"
                    "mov    v31.16b, v24.16b            \n"

                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v8.4s}, [%1], #16          \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.4s}, [%2], #16          \n"

                    "ldr    d1, [%2], #8                \n"
                    "ldr    x21, [%2], #8               \n"

                    ".align 4                           \n"
                    "4:                                 \n"
                    "ldr    d9, [%1], #8                \n"
                    "fmla   v16.4s, v8.4s, v0.s[0]      \n"
                    "ldr    x25, [%1], #8               \n"
                    "fmla   v17.4s, v8.4s, v0.s[1]      \n"
                    "ins    v1.d[1], x21                \n"
                    "fmla   v18.4s, v8.4s, v0.s[2]      \n"
                    "ldr    d10, [%1], #8               \n"
                    "fmla   v19.4s, v8.4s, v0.s[3]      \n"
                    "ldr    x26, [%1], #8               \n"
                    "fmla   v20.4s, v8.4s, v1.s[0]      \n"
                    "ldr    d2, [%2], #8                \n"
                    "fmla   v21.4s, v8.4s, v1.s[1]      \n"
                    "ins    v9.d[1], x25                \n"
                    "fmla   v22.4s, v8.4s, v1.s[2]      \n"
                    "ldr    x22, [%2], #8               \n"
                    "fmla   v23.4s, v8.4s, v1.s[3]      \n"
                    "ldr    d3, [%2], #8                \n"
                    "fmla   v24.4s, v9.4s, v0.s[0]      \n"
                    "ldr    x23, [%2], #8               \n"
                    "fmla   v25.4s, v9.4s, v0.s[1]      \n"
                    "ins    v10.d[1], x26               \n"
                    "fmla   v26.4s, v9.4s, v0.s[2]      \n"
                    "ldr    d11, [%1], #8               \n"
                    "fmla   v27.4s, v9.4s, v0.s[3]      \n"
                    "ldr    x27, [%1], #8               \n"
                    "fmla   v28.4s, v9.4s, v1.s[0]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                    "fmla   v29.4s, v9.4s, v1.s[1]      \n"
                    "ins    v2.d[1], x22                \n"
                    "fmla   v30.4s, v9.4s, v1.s[2]      \n"
                    "ldr    d12, [%1], #8               \n"
                    "fmla   v31.4s, v9.4s, v1.s[3]      \n"
                    "ldr    x24, [%1], #8               \n"
                    "fmla   v16.4s, v10.4s, v2.s[0]     \n"
                    "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                    "fmla   v17.4s, v10.4s, v2.s[1]     \n"
                    "ins    v3.d[1], x23                \n"
                    "fmla   v18.4s, v10.4s, v2.s[2]     \n"
                    "ldr    d4, [%2], #8                \n"
                    "fmla   v19.4s, v10.4s, v2.s[3]     \n"
                    "ldr    x20, [%2], #8               \n"
                    "fmla   v20.4s, v10.4s, v3.s[0]     \n"
                    "ldr    d5, [%2], #8                \n"
                    "fmla   v21.4s, v10.4s, v3.s[1]     \n"
                    "ins    v11.d[1], x27               \n"
                    "fmla   v22.4s, v10.4s, v3.s[2]     \n"
                    "ldr    x21, [%2], #8               \n"
                    "fmla   v23.4s, v10.4s, v3.s[3]     \n"
                    "ldr    d13, [%1], #8               \n"
                    "fmla   v24.4s, v11.4s, v2.s[0]     \n"
                    "ldr    x25, [%1], #8               \n"
                    "fmla   v25.4s, v11.4s, v2.s[1]     \n"
                    "ins    v12.d[1], x24               \n"
                    "fmla   v26.4s, v11.4s, v2.s[2]     \n"
                    "ldr    d14, [%1], #8               \n"
                    "fmla   v27.4s, v11.4s, v2.s[3]     \n"
                    "ldr    x26, [%1], #8               \n"
                    "fmla   v28.4s, v11.4s, v3.s[0]     \n"
                    "ldr    d6, [%2], #8                \n"
                    "fmla   v29.4s, v11.4s, v3.s[1]     \n"
                    "ins    v4.d[1], x20                \n"
                    "fmla   v30.4s, v11.4s, v3.s[2]     \n"
                    "ldr    x22, [%2], #8               \n"
                    "fmla   v31.4s, v11.4s, v3.s[3]     \n"
                    "ldr    d7, [%2], #8                \n"
                    "fmla   v16.4s, v12.4s, v4.s[0]     \n"
                    "ldr    x23, [%2], #8               \n"
                    "fmla   v17.4s, v12.4s, v4.s[1]     \n"
                    "ins    v5.d[1], x21                \n"
                    "fmla   v18.4s, v12.4s, v4.s[2]     \n"
                    "ldr    d15, [%1], #8               \n"
                    "fmla   v19.4s, v12.4s, v4.s[3]     \n"
                    "ldr    x27, [%1], #8               \n"
                    "fmla   v20.4s, v12.4s, v5.s[0]     \n"
                    "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                    "fmla   v21.4s, v12.4s, v5.s[1]     \n"
                    "ins    v13.d[1], x25               \n"
                    "fmla   v22.4s, v12.4s, v5.s[2]     \n"
                    "ldr    d8, [%1], #8                \n"
                    "fmla   v23.4s, v12.4s, v5.s[3]     \n"
                    "ldr    x24, [%1], #8               \n"
                    "fmla   v24.4s, v13.4s, v4.s[0]     \n"
                    "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                    "fmla   v25.4s, v13.4s, v4.s[1]     \n"
                    "ins    v14.d[1], x26               \n"
                    "fmla   v26.4s, v13.4s, v4.s[2]     \n"
                    "ldr    d0, [%2], #8                \n"
                    "fmla   v27.4s, v13.4s, v4.s[3]     \n"
                    "ldr    x20, [%2], #8               \n"
                    "fmla   v28.4s, v13.4s, v5.s[0]     \n"
                    "ldr    d1, [%2], #8                \n"
                    "fmla   v29.4s, v13.4s, v5.s[1]     \n"
                    "ins    v6.d[1], x22                \n"
                    "fmla   v30.4s, v13.4s, v5.s[2]     \n"
                    "ldr    x21, [%2], #8               \n"
                    "fmla   v31.4s, v13.4s, v5.s[3]     \n"
                    "fmla   v16.4s, v14.4s, v6.s[0]     \n"
                    "fmla   v17.4s, v14.4s, v6.s[1]     \n"
                    "ins    v7.d[1], x23                \n"
                    "fmla   v18.4s, v14.4s, v6.s[2]     \n"
                    "fmla   v19.4s, v14.4s, v6.s[3]     \n"
                    "fmla   v20.4s, v14.4s, v7.s[0]     \n"
                    "fmla   v21.4s, v14.4s, v7.s[1]     \n"
                    "ins    v15.d[1], x27               \n"
                    "fmla   v22.4s, v14.4s, v7.s[2]     \n"
                    "fmla   v23.4s, v14.4s, v7.s[3]     \n"
                    "fmla   v24.4s, v15.4s, v6.s[0]     \n"
                    "fmla   v25.4s, v15.4s, v6.s[1]     \n"
                    "ins    v8.d[1], x24                \n"
                    "fmla   v26.4s, v15.4s, v6.s[2]     \n"
                    "fmla   v27.4s, v15.4s, v6.s[3]     \n"
                    "fmla   v28.4s, v15.4s, v7.s[0]     \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v29.4s, v15.4s, v7.s[1]     \n"
                    "fmla   v30.4s, v15.4s, v7.s[2]     \n"
                    "ins    v0.d[1], x20                \n"
                    "fmla   v31.4s, v15.4s, v7.s[3]     \n"
                    "bne    4b                          \n"

                    "sub    %1, %1, #16                 \n"
                    "sub    %2, %2, #32                 \n"

                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    "6:                                 \n"
                    "ld1    {v0.4s, v1.4s}, [%2], #32   \n"
                    "ld1    {v4.4s, v5.4s}, [%1], #32   \n"

                    "fmla   v16.4s, v4.4s, v0.s[0]      \n"
                    "fmla   v17.4s, v4.4s, v0.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v19.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v20.4s, v4.4s, v1.s[0]      \n"
                    "fmla   v21.4s, v4.4s, v1.s[1]      \n"
                    "fmla   v22.4s, v4.4s, v1.s[2]      \n"
                    "fmla   v23.4s, v4.4s, v1.s[3]      \n"

                    "subs   w4, w4, #1                  \n"

                    "fmla   v24.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v0.s[3]      \n"
                    "fmla   v28.4s, v5.4s, v1.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v1.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v1.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v1.s[3]      \n"

                    "bne    6b                          \n"

                    "7:                                 \n"
                    "tst    %w11, #255                  \n"
                    "beq    10f                         \n"

                    // if out_elempack == 4
                    "cmp    %w12, #4                    \n"
                    "bne    8f                          \n"

                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 2          \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%3], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%3], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [x4], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [x4] \n"
                    "b      9f                          \n"

                    // if out_elempack == 1
                    "8:                                 \n"
                    // transpose8x8
                    "zip1   v14.4s, v16.4s, v17.4s      \n"
                    "zip2   v15.4s, v16.4s, v17.4s      \n"
                    "zip1   v16.4s, v18.4s, v19.4s      \n"
                    "zip2   v17.4s, v18.4s, v19.4s      \n"
                    "zip1   v18.4s, v20.4s, v21.4s      \n"
                    "zip2   v19.4s, v20.4s, v21.4s      \n"
                    "zip1   v20.4s, v22.4s, v23.4s      \n"
                    "zip2   v21.4s, v22.4s, v23.4s      \n"

                    "zip1   v22.4s, v24.4s, v25.4s      \n"
                    "zip2   v23.4s, v24.4s, v25.4s      \n"
                    "zip1   v24.4s, v26.4s, v27.4s      \n"
                    "zip2   v25.4s, v26.4s, v27.4s      \n"
                    "zip1   v26.4s, v28.4s, v29.4s      \n"
                    "zip2   v27.4s, v28.4s, v29.4s      \n"
                    "zip1   v28.4s, v30.4s, v31.4s      \n"
                    "zip2   v29.4s, v30.4s, v31.4s      \n"

                    "zip1   v0.2d, v14.2d, v16.2d       \n"
                    "zip2   v2.2d, v14.2d, v16.2d       \n"
                    "zip1   v4.2d, v15.2d, v17.2d       \n"
                    "zip2   v6.2d, v15.2d, v17.2d       \n"
                    "zip1   v1.2d, v18.2d, v20.2d       \n"
                    "zip2   v3.2d, v18.2d, v20.2d       \n"
                    "zip1   v5.2d, v19.2d, v21.2d       \n"
                    "zip2   v7.2d, v19.2d, v21.2d       \n"

                    "zip1   v8.2d, v22.2d, v24.2d       \n"
                    "zip2   v10.2d, v22.2d, v24.2d      \n"
                    "zip1   v12.2d, v23.2d, v25.2d      \n"
                    "zip2   v14.2d, v23.2d, v25.2d      \n"
                    "zip1   v9.2d, v26.2d, v28.2d       \n"
                    "zip2   v11.2d, v26.2d, v28.2d      \n"
                    "zip1   v13.2d, v27.2d, v29.2d      \n"
                    "zip2   v15.2d, v27.2d, v29.2d      \n"

                    "add    x4, %3, %w13, sxtw 2        \n"
                    "st1    {v0.4s, v1.4s}, [%3], #32   \n"
                    "st1    {v2.4s, v3.4s}, [x4]        \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v4.4s, v5.4s}, [x4]        \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v6.4s, v7.4s}, [x4]        \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v8.4s, v9.4s}, [x4]        \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v10.4s, v11.4s}, [x4]      \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v12.4s, v13.4s}, [x4]      \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v14.4s, v15.4s}, [x4]      \n"

                    "9:                                 \n"
                    "add    %0, %0, #256                \n"
                    "b      11f                         \n"

                    "10:                                \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    "11:                                \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            else if (use_a53_a55_optimized_kernel && !cpu_support_arm_asimdhp())
            {
                // a53
                asm volatile(
                    "cbz    %w10, 0f                    \n"

                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    "subs   %0, %0, #192                \n"
                    "b      3f                          \n"

                    "0:                                 \n"
                    // if pC
                    "cbz    %8, 1f                      \n"

                    "add    x4, %8, #16                 \n"
                    "ld1    {v16.4s}, [%8]              \n"
                    "ld1    {v24.4s}, [x4]              \n"
                    "b      2f                          \n"

                    // else
                    "1:                                 \n"
                    "eor    v16.16b, v16.16b, v16.16b   \n"
                    "eor    v24.16b, v24.16b, v24.16b   \n"

                    "2:                                 \n"
                    "mov    v17.16b, v16.16b            \n"
                    "mov    v18.16b, v16.16b            \n"
                    "mov    v19.16b, v16.16b            \n"
                    "mov    v20.16b, v16.16b            \n"
                    "mov    v21.16b, v16.16b            \n"
                    "mov    v22.16b, v16.16b            \n"
                    "mov    v23.16b, v16.16b            \n"

                    "mov    v25.16b, v24.16b            \n"
                    "mov    v26.16b, v24.16b            \n"
                    "mov    v27.16b, v24.16b            \n"
                    "mov    v28.16b, v24.16b            \n"
                    "mov    v29.16b, v24.16b            \n"
                    "mov    v30.16b, v24.16b            \n"
                    "mov    v31.16b, v24.16b            \n"

                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    "prfm   pldl1keep, [%1, #512]       \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ldr    d0, [%2]                    \n"
                    "ldr    x20, [%2, #8]               \n"
                    "ins    v0.d[1], x20                \n"
                    "add    %2, %2, #16                 \n"

                    "ldr    d8, [%1]                    \n"
                    "ldr    x24, [%1, #8]               \n"
                    "ins    v8.d[1], x24                \n"
                    "add    %1, %1, #16                 \n"

                    "ldr    d1, [%2]                    \n"
                    "ldr    x21, [%2, #8]               \n"
                    "add    %2, %2, #16                 \n"

                    "ldr    d9, [%1]                    \n"
                    "ldr    x25, [%1, #8]               \n"
                    "add    %1, %1, #16                 \n"

                    ".align 4                           \n"
                    "4:                                 \n"

                    "ldr    d2, [%2]                    \n"
                    "ins    v1.d[1], x21                \n"
                    "fmla   v16.4s, v8.4s, v0.s[0]      \n"
                    "ldr    x22, [%2, #8]               \n"
                    "fmla   v17.4s, v8.4s, v0.s[1]      \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v18.4s, v8.4s, v0.s[2]      \n"

                    "ldr    d10, [%1]                   \n"
                    "ins    v9.d[1], x25                \n"
                    "fmla   v19.4s, v8.4s, v0.s[3]      \n"
                    "ldr    x26, [%1, #8]               \n"
                    "fmla   v20.4s, v8.4s, v1.s[0]      \n"
                    "add    %1, %1, #16                 \n"
                    "fmla   v21.4s, v8.4s, v1.s[1]      \n"

                    "ldr    d3, [%2]                    \n"
                    "ins    v2.d[1], x22                \n"
                    "fmla   v22.4s, v8.4s, v1.s[2]      \n"
                    "ldr    x23, [%2, #8]               \n"
                    "fmla   v23.4s, v8.4s, v1.s[3]      \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v24.4s, v9.4s, v0.s[0]      \n"

                    "nop                                \n"
                    "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                    "fmla   v25.4s, v9.4s, v0.s[1]      \n"
                    "nop                                \n"
                    "fmla   v26.4s, v9.4s, v0.s[2]      \n"
                    "nop                                \n"
                    "fmla   v27.4s, v9.4s, v0.s[3]      \n"

                    "ldr    d11, [%1]                   \n"
                    "ins    v10.d[1], x26               \n"
                    "fmla   v28.4s, v9.4s, v1.s[0]      \n"
                    "ldr    x27, [%1, #8]               \n"
                    "fmla   v29.4s, v9.4s, v1.s[1]      \n"
                    "add    %1, %1, #16                 \n"
                    "fmla   v30.4s, v9.4s, v1.s[2]      \n"

                    "nop                                \n"
                    "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                    "fmla   v31.4s, v9.4s, v1.s[3]      \n"
                    "nop                                \n"
                    "fmla   v16.4s, v10.4s, v2.s[0]     \n"
                    "nop                                \n"
                    "fmla   v17.4s, v10.4s, v2.s[1]     \n"

                    "ldr    d4, [%2]                    \n"
                    "ins    v3.d[1], x23                \n"
                    "fmla   v18.4s, v10.4s, v2.s[2]     \n"
                    "ldr    x20, [%2, #8]               \n"
                    "fmla   v19.4s, v10.4s, v2.s[3]     \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v20.4s, v10.4s, v3.s[0]     \n"

                    "ldr    d12, [%1]                   \n"
                    "ins    v11.d[1], x27               \n"
                    "fmla   v21.4s, v10.4s, v3.s[1]     \n"
                    "ldr    x24, [%1, #8]               \n"
                    "fmla   v22.4s, v10.4s, v3.s[2]     \n"
                    "add    %1, %1, #16                 \n"
                    "fmla   v23.4s, v10.4s, v3.s[3]     \n"

                    "ldr    d5, [%2]                    \n"
                    "ins    v4.d[1], x20                \n"
                    "fmla   v24.4s, v11.4s, v2.s[0]     \n"
                    "ldr    x21, [%2, #8]               \n"
                    "fmla   v25.4s, v11.4s, v2.s[1]     \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v26.4s, v11.4s, v2.s[2]     \n"

                    "ldr    d13, [%1]                   \n"
                    "ins    v12.d[1], x24               \n"
                    "fmla   v27.4s, v11.4s, v2.s[3]     \n"
                    "ldr    x25, [%1, #8]               \n"
                    "fmla   v28.4s, v11.4s, v3.s[0]     \n"
                    "add    %1, %1, #16                 \n"
                    "fmla   v29.4s, v11.4s, v3.s[1]     \n"

                    "ldr    d6, [%2]                    \n"
                    "ins    v5.d[1], x21                \n"
                    "fmla   v30.4s, v11.4s, v3.s[2]     \n"
                    "ldr    x22, [%2, #8]               \n"
                    "fmla   v31.4s, v11.4s, v3.s[3]     \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v16.4s, v12.4s, v4.s[0]     \n"

                    "ldr    d14, [%1]                   \n"
                    "ins    v13.d[1], x25               \n"
                    "fmla   v17.4s, v12.4s, v4.s[1]     \n"
                    "ldr    x26, [%1, #8]               \n"
                    "fmla   v18.4s, v12.4s, v4.s[2]     \n"
                    "add    %1, %1, #16                 \n"
                    "fmla   v19.4s, v12.4s, v4.s[3]     \n"

                    "ldr    d7, [%2]                    \n"
                    "ins    v6.d[1], x22                \n"
                    "fmla   v20.4s, v12.4s, v5.s[0]     \n"
                    "ldr    x23, [%2, #8]               \n"
                    "fmla   v21.4s, v12.4s, v5.s[1]     \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v22.4s, v12.4s, v5.s[2]     \n"

                    "nop                                \n"
                    "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                    "fmla   v23.4s, v12.4s, v5.s[3]     \n"
                    "nop                                \n"
                    "fmla   v24.4s, v13.4s, v4.s[0]     \n"
                    "nop                                \n"
                    "fmla   v25.4s, v13.4s, v4.s[1]     \n"

                    "ldr    d15, [%1]                   \n"
                    "ins    v14.d[1], x26               \n"
                    "fmla   v26.4s, v13.4s, v4.s[2]     \n"
                    "ldr    x27, [%1, #8]               \n"
                    "fmla   v27.4s, v13.4s, v4.s[3]     \n"
                    "add    %1, %1, #16                 \n"
                    "fmla   v28.4s, v13.4s, v5.s[0]     \n"

                    "nop                                \n"
                    "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                    "fmla   v29.4s, v13.4s, v5.s[1]     \n"
                    "nop                                \n"
                    "fmla   v30.4s, v13.4s, v5.s[2]     \n"
                    "nop                                \n"
                    "fmla   v31.4s, v13.4s, v5.s[3]     \n"

                    "ldr    d0, [%2]                    \n"
                    "ins    v7.d[1], x23                \n"
                    "fmla   v16.4s, v14.4s, v6.s[0]     \n"
                    "ldr    x20, [%2, #8]               \n"
                    "fmla   v17.4s, v14.4s, v6.s[1]     \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v18.4s, v14.4s, v6.s[2]     \n"

                    "ldr    d8, [%1]                    \n"
                    "ins    v15.d[1], x27               \n"
                    "fmla   v19.4s, v14.4s, v6.s[3]     \n"
                    "ldr    x24, [%1, #8]               \n"
                    "fmla   v20.4s, v14.4s, v7.s[0]     \n"
                    "add    %1, %1, #16                 \n"
                    "fmla   v21.4s, v14.4s, v7.s[1]     \n"

                    "ldr    d1, [%2]                    \n"
                    "ins    v0.d[1], x20                \n"
                    "fmla   v22.4s, v14.4s, v7.s[2]     \n"
                    "ldr    x21, [%2, #8]               \n"
                    "fmla   v23.4s, v14.4s, v7.s[3]     \n"
                    "add    %2, %2, #16                 \n"
                    "fmla   v24.4s, v15.4s, v6.s[0]     \n"

                    "ldr    d9, [%1]                    \n"
                    "ins    v8.d[1], x24                \n"
                    "fmla   v25.4s, v15.4s, v6.s[1]     \n"
                    "ldr    x25, [%1, #8]               \n"
                    "fmla   v26.4s, v15.4s, v6.s[2]     \n"
                    "add    %1, %1, #16                 \n"
                    "fmla   v27.4s, v15.4s, v6.s[3]     \n"

                    "nop                                \n"
                    "nop                                \n"
                    "fmla   v28.4s, v15.4s, v7.s[0]     \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v29.4s, v15.4s, v7.s[1]     \n"
                    "nop                                \n"
                    "fmla   v30.4s, v15.4s, v7.s[2]     \n"

                    "nop                                \n"
                    "nop                                \n"
                    "fmla   v31.4s, v15.4s, v7.s[3]     \n"
                    "nop                                \n"
                    "nop                                \n"
                    "nop                                \n"
                    "nop                                \n"

                    "bne    4b                          \n"

                    "sub    %1, %1, #32                 \n"
                    "sub    %2, %2, #32                 \n"

                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    "6:                                 \n"
                    "ld1    {v0.4s, v1.4s}, [%2], #32   \n"
                    "ld1    {v4.4s, v5.4s}, [%1], #32   \n"

                    "fmla   v16.4s, v4.4s, v0.s[0]      \n"
                    "fmla   v17.4s, v4.4s, v0.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v19.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v20.4s, v4.4s, v1.s[0]      \n"
                    "fmla   v21.4s, v4.4s, v1.s[1]      \n"
                    "fmla   v22.4s, v4.4s, v1.s[2]      \n"
                    "fmla   v23.4s, v4.4s, v1.s[3]      \n"

                    "subs   w4, w4, #1                  \n"

                    "fmla   v24.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v0.s[3]      \n"
                    "fmla   v28.4s, v5.4s, v1.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v1.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v1.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v1.s[3]      \n"

                    "bne    6b                          \n"

                    "7:                                 \n"
                    "tst    %w11, #255                  \n"
                    "beq    10f                         \n"

                    // if out_elempack == 4
                    "cmp    %w12, #4                    \n"
                    "bne    8f                          \n"

                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 2          \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%3], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%3], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [x4], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [x4] \n"
                    "b      9f                          \n"

                    // if out_elempack == 1
                    "8:                                 \n"
                    // transpose8x8
                    "zip1   v14.4s, v16.4s, v17.4s      \n"
                    "zip2   v15.4s, v16.4s, v17.4s      \n"
                    "zip1   v16.4s, v18.4s, v19.4s      \n"
                    "zip2   v17.4s, v18.4s, v19.4s      \n"
                    "zip1   v18.4s, v20.4s, v21.4s      \n"
                    "zip2   v19.4s, v20.4s, v21.4s      \n"
                    "zip1   v20.4s, v22.4s, v23.4s      \n"
                    "zip2   v21.4s, v22.4s, v23.4s      \n"

                    "zip1   v22.4s, v24.4s, v25.4s      \n"
                    "zip2   v23.4s, v24.4s, v25.4s      \n"
                    "zip1   v24.4s, v26.4s, v27.4s      \n"
                    "zip2   v25.4s, v26.4s, v27.4s      \n"
                    "zip1   v26.4s, v28.4s, v29.4s      \n"
                    "zip2   v27.4s, v28.4s, v29.4s      \n"
                    "zip1   v28.4s, v30.4s, v31.4s      \n"
                    "zip2   v29.4s, v30.4s, v31.4s      \n"

                    "zip1   v0.2d, v14.2d, v16.2d       \n"
                    "zip2   v2.2d, v14.2d, v16.2d       \n"
                    "zip1   v4.2d, v15.2d, v17.2d       \n"
                    "zip2   v6.2d, v15.2d, v17.2d       \n"
                    "zip1   v1.2d, v18.2d, v20.2d       \n"
                    "zip2   v3.2d, v18.2d, v20.2d       \n"
                    "zip1   v5.2d, v19.2d, v21.2d       \n"
                    "zip2   v7.2d, v19.2d, v21.2d       \n"

                    "zip1   v8.2d, v22.2d, v24.2d       \n"
                    "zip2   v10.2d, v22.2d, v24.2d      \n"
                    "zip1   v12.2d, v23.2d, v25.2d      \n"
                    "zip2   v14.2d, v23.2d, v25.2d      \n"
                    "zip1   v9.2d, v26.2d, v28.2d       \n"
                    "zip2   v11.2d, v26.2d, v28.2d      \n"
                    "zip1   v13.2d, v27.2d, v29.2d      \n"
                    "zip2   v15.2d, v27.2d, v29.2d      \n"

                    "add    x4, %3, %w13, sxtw 2        \n"
                    "st1    {v0.4s, v1.4s}, [%3], #32   \n"
                    "st1    {v2.4s, v3.4s}, [x4]        \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v4.4s, v5.4s}, [x4]        \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v6.4s, v7.4s}, [x4]        \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v8.4s, v9.4s}, [x4]        \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v10.4s, v11.4s}, [x4]      \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v12.4s, v13.4s}, [x4]      \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v14.4s, v15.4s}, [x4]      \n"

                    "9:                                 \n"
                    "add    %0, %0, #256                \n"
                    "b      11f                         \n"

                    "10:                                \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    "11:                                \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            else
            {
                // Generic AArch64 micro-kernel for one 8x8 float tile.
                // Operand map: %0 = outptr, %1 = pA, %2 = pB, %3 = outptr0 (read-write through the
                // matching "0".."3" inputs, which occupy %4..%7), %8 = pC, %9 = max_kk, %10 = k,
                // %11 = k_end, %12 = out_elempack, %13 = out_hstep. x4/w4 is the only scratch GPR.
                // Accumulators: v16-v23 collect (first A vector)  x (B lanes 0..7),
                //               v24-v31 collect (second A vector) x (B lanes 0..7).
                asm volatile(
                    // k == 0 -> fresh accumulators (label 0); otherwise resume from the
                    // 16 partial-sum vectors previously written at outptr
                    "cbz    %w10, 0f                    \n"

                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    // undo the three post-increments of 64 so outptr points at the tile start again
                    "subs   %0, %0, #192                \n"
                    "b      3f                          \n"

                    "0:                                 \n"
                    // if pC
                    "cbz    %8, 1f                      \n"

                    // v16 = pC[0..3], v24 = pC[4..7] (x4 = pC + 16 bytes)
                    "add    x4, %8, #16                 \n"
                    "ld1    {v16.4s}, [%8]              \n"
                    "ld1    {v24.4s}, [x4]              \n"
                    "b      2f                          \n"

                    // else
                    "1:                                 \n"
                    // no pC: start from zero
                    "eor    v16.16b, v16.16b, v16.16b   \n"
                    "eor    v24.16b, v24.16b, v24.16b   \n"

                    "2:                                 \n"
                    // replicate the initial value into the remaining accumulators of each half
                    "mov    v17.16b, v16.16b            \n"
                    "mov    v18.16b, v16.16b            \n"
                    "mov    v19.16b, v16.16b            \n"
                    "mov    v20.16b, v16.16b            \n"
                    "mov    v21.16b, v16.16b            \n"
                    "mov    v22.16b, v16.16b            \n"
                    "mov    v23.16b, v16.16b            \n"

                    "mov    v25.16b, v24.16b            \n"
                    "mov    v26.16b, v24.16b            \n"
                    "mov    v27.16b, v24.16b            \n"
                    "mov    v28.16b, v24.16b            \n"
                    "mov    v29.16b, v24.16b            \n"
                    "mov    v30.16b, v24.16b            \n"
                    "mov    v31.16b, v24.16b            \n"

                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // main loop: 4 k-steps per iteration, consuming 32 floats of pB (v0-v7)
                    // and 32 floats of pA (v8-v15); each k-step uses two A vectors and two B vectors
                    "4:                                 \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
                    // k-step 0: A = v8/v9, B = v0/v1
                    "fmla   v16.4s, v8.4s, v0.s[0]      \n"
                    "fmla   v17.4s, v8.4s, v0.s[1]      \n"
                    "fmla   v18.4s, v8.4s, v0.s[2]      \n"
                    "fmla   v19.4s, v8.4s, v0.s[3]      \n"
                    "fmla   v20.4s, v8.4s, v1.s[0]      \n"
                    "fmla   v21.4s, v8.4s, v1.s[1]      \n"
                    "fmla   v22.4s, v8.4s, v1.s[2]      \n"
                    "fmla   v23.4s, v8.4s, v1.s[3]      \n"
                    "fmla   v24.4s, v9.4s, v0.s[0]      \n"
                    "fmla   v25.4s, v9.4s, v0.s[1]      \n"
                    "fmla   v26.4s, v9.4s, v0.s[2]      \n"
                    "fmla   v27.4s, v9.4s, v0.s[3]      \n"
                    "fmla   v28.4s, v9.4s, v1.s[0]      \n"
                    "fmla   v29.4s, v9.4s, v1.s[1]      \n"
                    "fmla   v30.4s, v9.4s, v1.s[2]      \n"
                    "fmla   v31.4s, v9.4s, v1.s[3]      \n"
                    // k-step 1: A = v10/v11, B = v2/v3
                    "fmla   v16.4s, v10.4s, v2.s[0]     \n"
                    "fmla   v17.4s, v10.4s, v2.s[1]     \n"
                    "fmla   v18.4s, v10.4s, v2.s[2]     \n"
                    "fmla   v19.4s, v10.4s, v2.s[3]     \n"
                    "fmla   v20.4s, v10.4s, v3.s[0]     \n"
                    "fmla   v21.4s, v10.4s, v3.s[1]     \n"
                    "fmla   v22.4s, v10.4s, v3.s[2]     \n"
                    "fmla   v23.4s, v10.4s, v3.s[3]     \n"
                    "fmla   v24.4s, v11.4s, v2.s[0]     \n"
                    "fmla   v25.4s, v11.4s, v2.s[1]     \n"
                    "fmla   v26.4s, v11.4s, v2.s[2]     \n"
                    "fmla   v27.4s, v11.4s, v2.s[3]     \n"
                    "fmla   v28.4s, v11.4s, v3.s[0]     \n"
                    "fmla   v29.4s, v11.4s, v3.s[1]     \n"
                    "fmla   v30.4s, v11.4s, v3.s[2]     \n"
                    "fmla   v31.4s, v11.4s, v3.s[3]     \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
                    // k-step 2: A = v12/v13, B = v4/v5
                    "fmla   v16.4s, v12.4s, v4.s[0]     \n"
                    "fmla   v17.4s, v12.4s, v4.s[1]     \n"
                    "fmla   v18.4s, v12.4s, v4.s[2]     \n"
                    "fmla   v19.4s, v12.4s, v4.s[3]     \n"
                    "fmla   v20.4s, v12.4s, v5.s[0]     \n"
                    "fmla   v21.4s, v12.4s, v5.s[1]     \n"
                    "fmla   v22.4s, v12.4s, v5.s[2]     \n"
                    "fmla   v23.4s, v12.4s, v5.s[3]     \n"
                    "fmla   v24.4s, v13.4s, v4.s[0]     \n"
                    "fmla   v25.4s, v13.4s, v4.s[1]     \n"
                    "fmla   v26.4s, v13.4s, v4.s[2]     \n"
                    "fmla   v27.4s, v13.4s, v4.s[3]     \n"
                    "fmla   v28.4s, v13.4s, v5.s[0]     \n"
                    "fmla   v29.4s, v13.4s, v5.s[1]     \n"
                    "fmla   v30.4s, v13.4s, v5.s[2]     \n"
                    "fmla   v31.4s, v13.4s, v5.s[3]     \n"
                    // k-step 3: A = v14/v15, B = v6/v7
                    "fmla   v16.4s, v14.4s, v6.s[0]     \n"
                    "fmla   v17.4s, v14.4s, v6.s[1]     \n"
                    "fmla   v18.4s, v14.4s, v6.s[2]     \n"
                    "fmla   v19.4s, v14.4s, v6.s[3]     \n"
                    "fmla   v20.4s, v14.4s, v7.s[0]     \n"
                    "fmla   v21.4s, v14.4s, v7.s[1]     \n"
                    "fmla   v22.4s, v14.4s, v7.s[2]     \n"
                    "fmla   v23.4s, v14.4s, v7.s[3]     \n"
                    // loop counter is decremented here; the flags it sets are consumed by the
                    // bne below (the fmla instructions in between leave the flags alone)
                    "subs   w4, w4, #1                  \n"
                    "fmla   v24.4s, v15.4s, v6.s[0]     \n"
                    "fmla   v25.4s, v15.4s, v6.s[1]     \n"
                    "fmla   v26.4s, v15.4s, v6.s[2]     \n"
                    "fmla   v27.4s, v15.4s, v6.s[3]     \n"
                    "fmla   v28.4s, v15.4s, v7.s[0]     \n"
                    "fmla   v29.4s, v15.4s, v7.s[1]     \n"
                    "fmla   v30.4s, v15.4s, v7.s[2]     \n"
                    "fmla   v31.4s, v15.4s, v7.s[3]     \n"
                    "bne    4b                          \n"

                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    // remainder loop: one k-step per iteration (8 floats of pB, 8 floats of pA)
                    "6:                                 \n"
                    "ld1    {v0.4s, v1.4s}, [%2], #32   \n"
                    "ld1    {v4.4s, v5.4s}, [%1], #32   \n"

                    "fmla   v16.4s, v4.4s, v0.s[0]      \n"
                    "fmla   v17.4s, v4.4s, v0.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v19.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v20.4s, v4.4s, v1.s[0]      \n"
                    "fmla   v21.4s, v4.4s, v1.s[1]      \n"
                    "fmla   v22.4s, v4.4s, v1.s[2]      \n"
                    "fmla   v23.4s, v4.4s, v1.s[3]      \n"

                    "subs   w4, w4, #1                  \n"

                    "fmla   v24.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v0.s[3]      \n"
                    "fmla   v28.4s, v5.4s, v1.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v1.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v1.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v1.s[3]      \n"

                    "bne    6b                          \n"

                    "7:                                 \n"
                    // only the low 8 bits of k_end are tested (presumably because it is a
                    // bool-sized value - TODO confirm); zero -> save partial sums at label 10
                    "tst    %w11, #255                  \n"
                    "beq    10f                         \n"

                    // if out_elempack == 4
                    "cmp    %w12, #4                    \n"
                    "bne    8f                          \n"

                    // x4 = outptr0 + out_hstep * 4 floats (w4 = out_hstep * 4, then scaled by 4 bytes)
                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 2          \n"
                    // first-half accumulators go to outptr0 (which advances by 128 bytes),
                    // second-half accumulators go to the location out_hstep * 4 floats further
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%3], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%3], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [x4], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [x4] \n"
                    "b      9f                          \n"

                    // if out_elempack == 1
                    "8:                                 \n"
                    // transpose8x8
                    // stage 1: interleave 32-bit lanes of neighbouring accumulators.
                    // Destinations reuse v16-v29, but each source pair is read before it is overwritten.
                    "zip1   v14.4s, v16.4s, v17.4s      \n"
                    "zip2   v15.4s, v16.4s, v17.4s      \n"
                    "zip1   v16.4s, v18.4s, v19.4s      \n"
                    "zip2   v17.4s, v18.4s, v19.4s      \n"
                    "zip1   v18.4s, v20.4s, v21.4s      \n"
                    "zip2   v19.4s, v20.4s, v21.4s      \n"
                    "zip1   v20.4s, v22.4s, v23.4s      \n"
                    "zip2   v21.4s, v22.4s, v23.4s      \n"

                    "zip1   v22.4s, v24.4s, v25.4s      \n"
                    "zip2   v23.4s, v24.4s, v25.4s      \n"
                    "zip1   v24.4s, v26.4s, v27.4s      \n"
                    "zip2   v25.4s, v26.4s, v27.4s      \n"
                    "zip1   v26.4s, v28.4s, v29.4s      \n"
                    "zip2   v27.4s, v28.4s, v29.4s      \n"
                    "zip1   v28.4s, v30.4s, v31.4s      \n"
                    "zip2   v29.4s, v30.4s, v31.4s      \n"

                    // stage 2: interleave 64-bit halves; afterwards {v0,v1}, {v2,v3}, {v4,v5}, {v6,v7}
                    // are output rows 0..3 (8 floats each)
                    "zip1   v0.2d, v14.2d, v16.2d       \n"
                    "zip2   v2.2d, v14.2d, v16.2d       \n"
                    "zip1   v4.2d, v15.2d, v17.2d       \n"
                    "zip2   v6.2d, v15.2d, v17.2d       \n"
                    "zip1   v1.2d, v18.2d, v20.2d       \n"
                    "zip2   v3.2d, v18.2d, v20.2d       \n"
                    "zip1   v5.2d, v19.2d, v21.2d       \n"
                    "zip2   v7.2d, v19.2d, v21.2d       \n"

                    // ... and {v8,v9}, {v10,v11}, {v12,v13}, {v14,v15} are output rows 4..7
                    "zip1   v8.2d, v22.2d, v24.2d       \n"
                    "zip2   v10.2d, v22.2d, v24.2d      \n"
                    "zip1   v12.2d, v23.2d, v25.2d      \n"
                    "zip2   v14.2d, v23.2d, v25.2d      \n"
                    "zip1   v9.2d, v26.2d, v28.2d       \n"
                    "zip2   v11.2d, v26.2d, v28.2d      \n"
                    "zip1   v13.2d, v27.2d, v29.2d      \n"
                    "zip2   v15.2d, v27.2d, v29.2d      \n"

                    // row 0 is stored at outptr0 (which advances by 32 bytes); x4 then walks the
                    // remaining seven rows, stepping by out_hstep floats (sxtw 2 = x4 bytes) each time
                    "add    x4, %3, %w13, sxtw 2        \n"
                    "st1    {v0.4s, v1.4s}, [%3], #32   \n"
                    "st1    {v2.4s, v3.4s}, [x4]        \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v4.4s, v5.4s}, [x4]        \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v6.4s, v7.4s}, [x4]        \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v8.4s, v9.4s}, [x4]        \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v10.4s, v11.4s}, [x4]      \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v12.4s, v13.4s}, [x4]      \n"
                    "add    x4, x4, %w13, sxtw 2        \n"
                    "st1    {v14.4s, v15.4s}, [x4]      \n"

                    "9:                                 \n"
                    // final result was written through outptr0; still skip this tile's
                    // 256-byte slot in the outptr scratch area
                    "add    %0, %0, #256                \n"
                    "b      11f                         \n"

                    "10:                                \n"
                    // k_end == 0: save the 16 partial-sum vectors at outptr (advances by 256 bytes)
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    "11:                                \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
#else  // NCNN_GNU_INLINE_ASM
            // NEON-intrinsics version of the 8x8 tile kernel (the non-inline-asm build path).
            // Naming: _sumJ0 / _sumJ1 accumulate _pA0 / _pA1 multiplied by the J-th of the
            // 8 B values of a k-step (J = 0..3 are lanes of _pB0, J = 4..7 lanes of _pB1).
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum20;
            float32x4_t _sum21;
            float32x4_t _sum30;
            float32x4_t _sum31;
            float32x4_t _sum40;
            float32x4_t _sum41;
            float32x4_t _sum50;
            float32x4_t _sum51;
            float32x4_t _sum60;
            float32x4_t _sum61;
            float32x4_t _sum70;
            float32x4_t _sum71;

            if (k == 0)
            {
                // first k block: accumulators start from pC (when given) or from zero
                if (pC)
                {
                    // the same 8 values pC[0..7] seed every (_sumJ0, _sumJ1) pair
                    _sum00 = vld1q_f32(pC);
                    _sum01 = vld1q_f32(pC + 4);
                    _sum10 = _sum00;
                    _sum11 = _sum01;
                    _sum20 = _sum00;
                    _sum21 = _sum01;
                    _sum30 = _sum00;
                    _sum31 = _sum01;
                    _sum40 = _sum00;
                    _sum41 = _sum01;
                    _sum50 = _sum00;
                    _sum51 = _sum01;
                    _sum60 = _sum00;
                    _sum61 = _sum01;
                    _sum70 = _sum00;
                    _sum71 = _sum01;
                }
                else
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                    _sum20 = vdupq_n_f32(0.f);
                    _sum21 = vdupq_n_f32(0.f);
                    _sum30 = vdupq_n_f32(0.f);
                    _sum31 = vdupq_n_f32(0.f);
                    _sum40 = vdupq_n_f32(0.f);
                    _sum41 = vdupq_n_f32(0.f);
                    _sum50 = vdupq_n_f32(0.f);
                    _sum51 = vdupq_n_f32(0.f);
                    _sum60 = vdupq_n_f32(0.f);
                    _sum61 = vdupq_n_f32(0.f);
                    _sum70 = vdupq_n_f32(0.f);
                    _sum71 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                // later k block: reload the 64 partial sums in the same order
                // the non-k_end branch below writes them
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
                _sum20 = vld1q_f32(outptr + 4 * 4);
                _sum21 = vld1q_f32(outptr + 4 * 5);
                _sum30 = vld1q_f32(outptr + 4 * 6);
                _sum31 = vld1q_f32(outptr + 4 * 7);
                _sum40 = vld1q_f32(outptr + 4 * 8);
                _sum41 = vld1q_f32(outptr + 4 * 9);
                _sum50 = vld1q_f32(outptr + 4 * 10);
                _sum51 = vld1q_f32(outptr + 4 * 11);
                _sum60 = vld1q_f32(outptr + 4 * 12);
                _sum61 = vld1q_f32(outptr + 4 * 13);
                _sum70 = vld1q_f32(outptr + 4 * 14);
                _sum71 = vld1q_f32(outptr + 4 * 15);
            }

            // one k-step per iteration: 8 floats from pA, 8 floats from pB,
            // 16 lane-broadcast fused multiply-adds
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA0 = vld1q_f32(pA);
                float32x4_t _pA1 = vld1q_f32(pA + 4);

                float32x4_t _pB0 = vld1q_f32(pB);
                float32x4_t _pB1 = vld1q_f32(pB + 4);

                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
                _sum40 = vfmaq_laneq_f32(_sum40, _pA0, _pB1, 0);
                _sum41 = vfmaq_laneq_f32(_sum41, _pA1, _pB1, 0);
                _sum50 = vfmaq_laneq_f32(_sum50, _pA0, _pB1, 1);
                _sum51 = vfmaq_laneq_f32(_sum51, _pA1, _pB1, 1);
                _sum60 = vfmaq_laneq_f32(_sum60, _pA0, _pB1, 2);
                _sum61 = vfmaq_laneq_f32(_sum61, _pA1, _pB1, 2);
                _sum70 = vfmaq_laneq_f32(_sum70, _pA0, _pB1, 3);
                _sum71 = vfmaq_laneq_f32(_sum71, _pA1, _pB1, 3);

                pA += 8;
                pB += 8;
            }

            if (k_end)
            {
                // last k block: write the finished tile through outptr0
                if (out_elempack == 4)
                {
                    // the eight _sumJ0 vectors are stored contiguously at outptr0 ...
                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + 4, _sum10);
                    vst1q_f32(outptr0 + 4 * 2, _sum20);
                    vst1q_f32(outptr0 + 4 * 3, _sum30);
                    vst1q_f32(outptr0 + 4 * 4, _sum40);
                    vst1q_f32(outptr0 + 4 * 5, _sum50);
                    vst1q_f32(outptr0 + 4 * 6, _sum60);
                    vst1q_f32(outptr0 + 4 * 7, _sum70);

                    // ... and the eight _sumJ1 vectors out_hstep * 4 floats further on
                    vst1q_f32(outptr0 + out_hstep * 4, _sum01);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4, _sum11);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 2, _sum21);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 3, _sum31);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 4, _sum41);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 5, _sum51);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 6, _sum61);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 7, _sum71);

                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    // NOTE(review): transpose8x8_ps is defined outside this view; presumably it
                    // transposes the 8x8 block in place so that each (_sumI0, _sumI1) pair
                    // becomes one 8-float output row - confirm against its definition
                    transpose8x8_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31, _sum40, _sum41, _sum50, _sum51, _sum60, _sum61, _sum70, _sum71);

                    // pair I is written as 8 consecutive floats at outptr0 + out_hstep * I
                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + 4, _sum01);
                    vst1q_f32(outptr0 + out_hstep, _sum10);
                    vst1q_f32(outptr0 + out_hstep + 4, _sum11);
                    vst1q_f32(outptr0 + out_hstep * 2, _sum20);
                    vst1q_f32(outptr0 + out_hstep * 2 + 4, _sum21);
                    vst1q_f32(outptr0 + out_hstep * 3, _sum30);
                    vst1q_f32(outptr0 + out_hstep * 3 + 4, _sum31);
                    vst1q_f32(outptr0 + out_hstep * 4, _sum40);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4, _sum41);
                    vst1q_f32(outptr0 + out_hstep * 5, _sum50);
                    vst1q_f32(outptr0 + out_hstep * 5 + 4, _sum51);
                    vst1q_f32(outptr0 + out_hstep * 6, _sum60);
                    vst1q_f32(outptr0 + out_hstep * 6 + 4, _sum61);
                    vst1q_f32(outptr0 + out_hstep * 7, _sum70);
                    vst1q_f32(outptr0 + out_hstep * 7 + 4, _sum71);

                    outptr0 += 8;
                }
            }
            else
            {
                // more k blocks to come: save the 64 partial sums at outptr
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
                vst1q_f32(outptr + 4 * 8, _sum40);
                vst1q_f32(outptr + 4 * 9, _sum41);
                vst1q_f32(outptr + 4 * 10, _sum50);
                vst1q_f32(outptr + 4 * 11, _sum51);
                vst1q_f32(outptr + 4 * 12, _sum60);
                vst1q_f32(outptr + 4 * 13, _sum61);
                vst1q_f32(outptr + 4 * 14, _sum70);
                vst1q_f32(outptr + 4 * 15, _sum71);
            }

            // outptr moves past this tile's 64-float slot in both the k_end and non-k_end cases
            outptr += 64;
#endif // NCNN_GNU_INLINE_ASM
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            // AArch64 micro-kernel for one tile of 8 A values x 4 B values per k-step.
            // Operand map: %0 = outptr, %1 = pA, %2 = pB, %3 = outptr0 (read-write through the
            // matching "0".."3" inputs, which occupy %4..%7), %8 = pC, %9 = max_kk, %10 = k,
            // %11 = k_end, %12 = out_elempack, %13 = out_hstep. x4/w4 is the only scratch GPR.
            // Accumulators: v24-v27 collect (first A vector)  x (B lanes 0..3),
            //               v28-v31 collect (second A vector) x (B lanes 0..3).
            asm volatile(
                // k == 0 -> fresh accumulators (label 0); otherwise resume from the
                // 8 partial-sum vectors previously written at outptr
                "cbz    %w10, 0f                    \n"

                "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                // undo the single post-increment so outptr points at the tile start again
                "subs   %0, %0, #64                 \n"
                "b      3f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                // v24 = pC[0..3], v28 = pC[4..7] (x4 = pC + 16 bytes)
                "add    x4, %8, #16                 \n"
                "ld1    {v24.4s}, [%8]              \n"
                "ld1    {v28.4s}, [x4]              \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                // no pC: start from zero
                "eor    v24.16b, v24.16b, v24.16b   \n"
                "eor    v28.16b, v28.16b, v28.16b   \n"

                "2:                                 \n"
                // replicate the initial value into the remaining accumulators of each half
                "mov    v25.16b, v24.16b            \n"
                "mov    v26.16b, v24.16b            \n"
                "mov    v27.16b, v24.16b            \n"

                "mov    v29.16b, v28.16b            \n"
                "mov    v30.16b, v28.16b            \n"
                "mov    v31.16b, v28.16b            \n"

                "3:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // main loop: 4 k-steps per iteration, consuming 16 floats of pB (v0-v3, one
                // vector per k-step) and 32 floats of pA (v4-v11, two vectors per k-step)
                "4:                                 \n"
                "prfm   pldl1keep, [%2, #512]       \n"
                "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"
                // k-step 0: A = v4/v5, B = v0
                "fmla   v24.4s, v4.4s, v0.s[0]      \n"
                "fmla   v25.4s, v4.4s, v0.s[1]      \n"
                "fmla   v26.4s, v4.4s, v0.s[2]      \n"
                "fmla   v27.4s, v4.4s, v0.s[3]      \n"
                "fmla   v28.4s, v5.4s, v0.s[0]      \n"
                "fmla   v29.4s, v5.4s, v0.s[1]      \n"
                "fmla   v30.4s, v5.4s, v0.s[2]      \n"
                "fmla   v31.4s, v5.4s, v0.s[3]      \n"
                // k-step 1: A = v6/v7, B = v1
                "fmla   v24.4s, v6.4s, v1.s[0]      \n"
                "fmla   v25.4s, v6.4s, v1.s[1]      \n"
                "fmla   v26.4s, v6.4s, v1.s[2]      \n"
                "fmla   v27.4s, v6.4s, v1.s[3]      \n"
                "fmla   v28.4s, v7.4s, v1.s[0]      \n"
                "fmla   v29.4s, v7.4s, v1.s[1]      \n"
                "fmla   v30.4s, v7.4s, v1.s[2]      \n"
                "fmla   v31.4s, v7.4s, v1.s[3]      \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
                // k-step 2: A = v8/v9, B = v2
                "fmla   v24.4s, v8.4s, v2.s[0]      \n"
                "fmla   v25.4s, v8.4s, v2.s[1]      \n"
                "fmla   v26.4s, v8.4s, v2.s[2]      \n"
                "fmla   v27.4s, v8.4s, v2.s[3]      \n"
                "fmla   v28.4s, v9.4s, v2.s[0]      \n"
                "fmla   v29.4s, v9.4s, v2.s[1]      \n"
                "fmla   v30.4s, v9.4s, v2.s[2]      \n"
                "fmla   v31.4s, v9.4s, v2.s[3]      \n"
                // loop counter is decremented here; the flags it sets are consumed by the
                // bne below (the fmla instructions in between leave the flags alone)
                "subs   w4, w4, #1                  \n"
                // k-step 3: A = v10/v11, B = v3
                "fmla   v24.4s, v10.4s, v3.s[0]     \n"
                "fmla   v25.4s, v10.4s, v3.s[1]     \n"
                "fmla   v26.4s, v10.4s, v3.s[2]     \n"
                "fmla   v27.4s, v10.4s, v3.s[3]     \n"
                "fmla   v28.4s, v11.4s, v3.s[0]     \n"
                "fmla   v29.4s, v11.4s, v3.s[1]     \n"
                "fmla   v30.4s, v11.4s, v3.s[2]     \n"
                "fmla   v31.4s, v11.4s, v3.s[3]     \n"
                "bne    4b                          \n"

                "5:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    7f                          \n"

                // remainder loop: one k-step per iteration (4 floats of pB, 8 floats of pA)
                "6:                                 \n"
                "ld1    {v0.4s}, [%2], #16          \n"
                "ld1    {v4.4s, v5.4s}, [%1], #32   \n"
                "fmla   v24.4s, v4.4s, v0.s[0]      \n"
                "fmla   v25.4s, v4.4s, v0.s[1]      \n"
                "fmla   v26.4s, v4.4s, v0.s[2]      \n"
                "fmla   v27.4s, v4.4s, v0.s[3]      \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v28.4s, v5.4s, v0.s[0]      \n"
                "fmla   v29.4s, v5.4s, v0.s[1]      \n"
                "fmla   v30.4s, v5.4s, v0.s[2]      \n"
                "fmla   v31.4s, v5.4s, v0.s[3]      \n"
                "bne    6b                          \n"

                "7:                                 \n"
                // only the low 8 bits of k_end are tested (presumably because it is a
                // bool-sized value - TODO confirm); zero -> save partial sums at label 10
                "tst    %w11, #255                  \n"
                "beq    10f                         \n"

                // if out_elempack == 4
                "cmp    %w12, #4                    \n"
                "bne    8f                          \n"

                // x4 = outptr0 + out_hstep * 4 floats (w4 = out_hstep * 4, then scaled by 4 bytes)
                "lsl    w4, %w13, #2                \n"
                "add    x4, %3, w4, sxtw 2          \n"
                // first-half accumulators go to outptr0 (which advances by 64 bytes),
                // second-half accumulators go to the location out_hstep * 4 floats further
                "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%3], #64 \n"
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [x4] \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"
                // transpose8x4
                // stage 1: interleave 32-bit lanes of neighbouring accumulators.
                // Destinations reuse v24-v29, but each source pair is read before it is overwritten.
                "zip1   v22.4s, v24.4s, v25.4s      \n"
                "zip2   v23.4s, v24.4s, v25.4s      \n"
                "zip1   v24.4s, v26.4s, v27.4s      \n"
                "zip2   v25.4s, v26.4s, v27.4s      \n"
                "zip1   v26.4s, v28.4s, v29.4s      \n"
                "zip2   v27.4s, v28.4s, v29.4s      \n"
                "zip1   v28.4s, v30.4s, v31.4s      \n"
                "zip2   v29.4s, v30.4s, v31.4s      \n"

                // stage 2: interleave 64-bit halves; v0..v7 become output rows 0..7 (4 floats each)
                "zip1   v0.2d, v22.2d, v24.2d       \n"
                "zip2   v1.2d, v22.2d, v24.2d       \n"
                "zip1   v2.2d, v23.2d, v25.2d       \n"
                "zip2   v3.2d, v23.2d, v25.2d       \n"
                "zip1   v4.2d, v26.2d, v28.2d       \n"
                "zip2   v5.2d, v26.2d, v28.2d       \n"
                "zip1   v6.2d, v27.2d, v29.2d       \n"
                "zip2   v7.2d, v27.2d, v29.2d       \n"

                // row 0 is stored at outptr0 (which advances by 16 bytes); x4 then walks the
                // remaining seven rows, stepping by out_hstep floats (sxtw 2 = x4 bytes) each time
                "add    x4, %3, %w13, sxtw 2        \n"
                "st1    {v0.4s}, [%3], #16          \n"
                "st1    {v1.4s}, [x4]               \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v2.4s}, [x4]               \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v3.4s}, [x4]               \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v4.4s}, [x4]               \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v5.4s}, [x4]               \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v6.4s}, [x4]               \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v7.4s}, [x4]               \n"

                "9:                                 \n"
                // final result was written through outptr0; still skip this tile's
                // 128-byte slot in the outptr scratch area
                "add    %0, %0, #128                \n"
                "b      11f                         \n"

                "10:                                \n"
                // k_end == 0: save the 8 partial-sum vectors at outptr (advances by 128 bytes)
                "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                "11:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum20;
            float32x4_t _sum21;
            float32x4_t _sum30;
            float32x4_t _sum31;

            if (k == 0)
            {
                if (pC)
                {
                    _sum00 = vld1q_f32(pC);
                    _sum01 = vld1q_f32(pC + 4);
                    _sum10 = _sum00;
                    _sum11 = _sum01;
                    _sum20 = _sum00;
                    _sum21 = _sum01;
                    _sum30 = _sum00;
                    _sum31 = _sum01;
                }
                else
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                    _sum20 = vdupq_n_f32(0.f);
                    _sum21 = vdupq_n_f32(0.f);
                    _sum30 = vdupq_n_f32(0.f);
                    _sum31 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
                _sum20 = vld1q_f32(outptr + 4 * 4);
                _sum21 = vld1q_f32(outptr + 4 * 5);
                _sum30 = vld1q_f32(outptr + 4 * 6);
                _sum31 = vld1q_f32(outptr + 4 * 7);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA0 = vld1q_f32(pA);
                float32x4_t _pA1 = vld1q_f32(pA + 4);

                float32x4_t _pB0 = vld1q_f32(pB);

                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);

                pA += 8;
                pB += 4;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + 4, _sum10);
                    vst1q_f32(outptr0 + 4 * 2, _sum20);
                    vst1q_f32(outptr0 + 4 * 3, _sum30);

                    vst1q_f32(outptr0 + out_hstep * 4, _sum01);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4, _sum11);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 2, _sum21);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 3, _sum31);

                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    transpose8x4_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31);

                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + out_hstep * 1, _sum01);
                    vst1q_f32(outptr0 + out_hstep * 2, _sum10);
                    vst1q_f32(outptr0 + out_hstep * 3, _sum11);
                    vst1q_f32(outptr0 + out_hstep * 4, _sum20);
                    vst1q_f32(outptr0 + out_hstep * 5, _sum21);
                    vst1q_f32(outptr0 + out_hstep * 6, _sum30);
                    vst1q_f32(outptr0 + out_hstep * 7, _sum31);

                    outptr0 += 4;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
            }

            outptr += 32;
#endif // NCNN_GNU_INLINE_ASM
        }
        // Remainder tile of 8 pA lanes x 2 pB lanes: handles two jj positions per iteration.
        // For every kk step 8 floats are read from pA and 2 floats from pB; each pA vector
        // is multiplied by one pB lane and accumulated into four float32x4 sums.
        //   k == 0      : sums start from pC (the same 8 values for both pB lanes) or from zero
        //   k != 0      : sums are reloaded from the scratch tile at outptr
        //   k_end       : sums are written to outptr0 in the out_elempack 4 or 1 layout
        //   otherwise   : sums are saved back to the scratch tile at outptr
        // outptr always advances by 16 floats; pB is not reset and keeps advancing across iterations.
        for (; jj + 1 < max_jj; jj += 2)
        {
            // pA restarts from the beginning of the packed A tile for every jj pair
            const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            // Register roles in this asm block:
            //   v28 = pA[0..3] * pB lane 0      v29 = pA[0..3] * pB lane 1
            //   v30 = pA[4..7] * pB lane 0      v31 = pA[4..7] * pB lane 1
            // NOTE: the scratch tile at outptr is therefore ordered v28,v29,v30,v31, which is a
            // different ordering than the intrinsics fallback below (_sum00,_sum01,_sum10,_sum11).
            // Each path only reads back what it wrote itself.
            asm volatile(
                // k != 0: resume from the partial sums stored at outptr
                "cbz    %w10, 0f                    \n"

                "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                "b      3f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                // v28 = pC[0..3], v30 = pC[4..7]
                "add    x4, %8, #16                 \n"
                "ld1    {v28.4s}, [%8]              \n"
                "ld1    {v30.4s}, [x4]              \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"

                // duplicate the initial value for the second pB lane
                "2:                                 \n"
                "mov    v29.16b, v28.16b            \n"
                "mov    v31.16b, v30.16b            \n"

                "3:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // main loop, unrolled over 4 kk steps:
                // consumes 8 floats from pB (v0,v1) and 32 floats from pA (v4..v11) per pass
                "4:                                 \n"
                "prfm   pldl1keep, [%2, #256]       \n"
                "ld1    {v0.4s, v1.4s}, [%2], #32   \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"
                // kk+0: v4/v5 with v0.s[0]/v0.s[1]
                "fmla   v28.4s, v4.4s, v0.s[0]      \n"
                "fmla   v29.4s, v4.4s, v0.s[1]      \n"
                "fmla   v30.4s, v5.4s, v0.s[0]      \n"
                "fmla   v31.4s, v5.4s, v0.s[1]      \n"
                // kk+1: v6/v7 with v0.s[2]/v0.s[3]
                "fmla   v28.4s, v6.4s, v0.s[2]      \n"
                "fmla   v29.4s, v6.4s, v0.s[3]      \n"
                "fmla   v30.4s, v7.4s, v0.s[2]      \n"
                "fmla   v31.4s, v7.4s, v0.s[3]      \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
                // kk+2: v8/v9 with v1.s[0]/v1.s[1]
                "fmla   v28.4s, v8.4s, v1.s[0]      \n"
                "fmla   v29.4s, v8.4s, v1.s[1]      \n"
                "fmla   v30.4s, v9.4s, v1.s[0]      \n"
                "fmla   v31.4s, v9.4s, v1.s[1]      \n"
                // loop counter update is interleaved with the fmla stream; fmla does not touch the flags
                "subs   w4, w4, #1                  \n"
                // kk+3: v10/v11 with v1.s[2]/v1.s[3]
                "fmla   v28.4s, v10.4s, v1.s[2]     \n"
                "fmla   v29.4s, v10.4s, v1.s[3]     \n"
                "fmla   v30.4s, v11.4s, v1.s[2]     \n"
                "fmla   v31.4s, v11.4s, v1.s[3]     \n"
                "bne    4b                          \n"

                "5:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    7f                          \n"

                // tail loop, one kk step per pass: 2 floats from pB, 8 floats from pA
                "6:                                 \n"
                "ld1    {v0.2s}, [%2], #8           \n"
                "ld1    {v4.4s, v5.4s}, [%1], #32   \n"
                "fmla   v28.4s, v4.4s, v0.s[0]      \n"
                "fmla   v29.4s, v4.4s, v0.s[1]      \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v30.4s, v5.4s, v0.s[0]      \n"
                "fmla   v31.4s, v5.4s, v0.s[1]      \n"
                "bne    6b                          \n"

                // if k_end is zero, only save partial sums to the scratch tile (label 10)
                // NOTE(review): only the low 8 bits of k_end are tested - presumably k_end is a bool; confirm
                "7:                                 \n"
                "tst    %w11, #255                  \n"
                "beq    10f                         \n"

                // if out_elempack == 4
                "cmp    %w12, #4                    \n"
                "bne    8f                          \n"

                // x4 = outptr0 + out_hstep * 4 floats (w4 = out_hstep * 4, then scaled by 4 bytes)
                "lsl    w4, %w13, #2                \n"
                "add    x4, %3, w4, sxtw 2          \n"
                // pA[0..3] results for both pB lanes go to outptr0, pA[4..7] results to x4;
                // outptr0 advances by 8 floats
                "st1    {v28.4s, v29.4s}, [%3], #32 \n"
                "st1    {v30.4s, v31.4s}, [x4]      \n"
                "b      9f                          \n"

                // if out_elempack == 1
                // (reached for any out_elempack != 4; the intrinsics fallback tests == 1 explicitly)
                "8:                                 \n"
                // transpose8x2
                // after the zips: v0 = {r0c0,r0c1,r1c0,r1c1}, v2 = rows 2-3, v4 = rows 4-5, v6 = rows 6-7
                // (rN = pA lane N, cM = pB lane M)
                "zip1   v0.4s, v28.4s, v29.4s       \n"
                "zip2   v2.4s, v28.4s, v29.4s       \n"
                "zip1   v4.4s, v30.4s, v31.4s       \n"
                "zip2   v6.4s, v30.4s, v31.4s       \n"

                // move the upper pair of each register into the low half of the next odd register,
                // so the low 64 bits of v0..v7 hold rows 0..7 respectively
                "mov    v1.d[0], v0.d[1]            \n"
                "mov    v3.d[0], v2.d[1]            \n"
                "mov    v5.d[0], v4.d[1]            \n"
                "mov    v7.d[0], v6.d[1]            \n"

                // x4 walks the rows in steps of out_hstep floats (sxtw 2 = sign-extend, times 4 bytes);
                // outptr0 itself advances by 2 floats
                "add    x4, %3, %w13, sxtw 2        \n"
                "st1    {v0.2s}, [%3], #8           \n"
                "st1    {v1.2s}, [x4]               \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v2.2s}, [x4]               \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v3.2s}, [x4]               \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v4.2s}, [x4]               \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v5.2s}, [x4]               \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v6.2s}, [x4]               \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v7.2s}, [x4]               \n"

                // final output was written: skip over this tile's 16-float scratch slot
                "9:                                 \n"
                "add    %0, %0, #64                 \n"
                "b      11f                         \n"

                // not the last k block: save partial sums, outptr advances by 16 floats
                "10:                                \n"
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                "11:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                // the four matching-constraint inputs below occupy operands %4..%7,
                // which is why pC is %8
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
            // Intrinsics fallback. Naming: _sumJI where J is the pB lane (0..1) and
            // I selects the pA half (0 = pA[0..3], 1 = pA[4..7]).
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;

            if (k == 0)
            {
                if (pC)
                {
                    // the same 8 pC values seed both pB lanes
                    _sum00 = vld1q_f32(pC);
                    _sum01 = vld1q_f32(pC + 4);
                    _sum10 = _sum00;
                    _sum11 = _sum01;
                }
                else
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                // resume from the partial sums saved at outptr by a previous call
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA0 = vld1q_f32(pA);
                float32x4_t _pA1 = vld1q_f32(pA + 4);

                float32x2_t _pB0 = vld1_f32(pB);

                // multiply each pA half by pB lane 0 and lane 1
                _sum00 = vfmaq_lane_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_lane_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_lane_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_lane_f32(_sum11, _pA1, _pB0, 1);

                pA += 8;
                pB += 2;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    // pA[0..3] results for both pB lanes are stored contiguously at outptr0
                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + 4, _sum10);

                    // pA[4..7] results go out_hstep * 4 floats further
                    vst1q_f32(outptr0 + out_hstep * 4, _sum01);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4, _sum11);
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    // spill to the stack and scatter scalars: sum0 holds pB lane 0, sum1 holds pB lane 1
                    float sum0[8];
                    float sum1[8];
                    vst1q_f32(sum0, _sum00);
                    vst1q_f32(sum0 + 4, _sum01);
                    vst1q_f32(sum1, _sum10);
                    vst1q_f32(sum1 + 4, _sum11);

                    // pA lane r is written at offset out_hstep * r
                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[out_hstep * 4] = sum0[4];
                    outptr0[out_hstep * 5] = sum0[5];
                    outptr0[out_hstep * 6] = sum0[6];
                    outptr0[out_hstep * 7] = sum0[7];

                    // second pB lane lands in the adjacent element
                    outptr0[1] = sum1[0];
                    outptr0[out_hstep + 1] = sum1[1];
                    outptr0[out_hstep * 2 + 1] = sum1[2];
                    outptr0[out_hstep * 3 + 1] = sum1[3];
                    outptr0[out_hstep * 4 + 1] = sum1[4];
                    outptr0[out_hstep * 5 + 1] = sum1[5];
                    outptr0[out_hstep * 6 + 1] = sum1[6];
                    outptr0[out_hstep * 7 + 1] = sum1[7];
                    outptr0 += 2;
                }
            }
            else
            {
                // not the last k block: save partial sums to the scratch tile
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
            }

            // the scratch pointer advances by one 8x2 tile regardless of which branch ran
            outptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // Last remainder tile of 8 pA lanes x 1 pB value: handles one jj position per iteration.
        // For every kk step 8 floats are read from pA and 1 float from pB; the pB value is
        // broadcast and multiplied into two float32x4 sums.
        //   k == 0      : sums start from pC[0..7] or from zero
        //   k != 0      : sums are reloaded from the scratch tile at outptr
        //   k_end       : sums are written to outptr0 in the out_elempack 4 or 1 layout
        //   otherwise   : sums are saved back to the scratch tile at outptr
        // outptr always advances by 8 floats.
        for (; jj < max_jj; jj += 1)
        {
            // pA restarts from the beginning of the packed A tile for every jj
            const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            // Register roles in this asm block:
            //   v30 = sums for pA[0..3], v31 = sums for pA[4..7]
            //   v28/v29 = second accumulator pair used only inside the unrolled main loop
            asm volatile(
                // k != 0: resume from the partial sums stored at outptr
                "cbz    %w10, 0f                    \n"

                "ld1    {v30.4s, v31.4s}, [%0]      \n"
                "b      2f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                "ld1    {v30.4s, v31.4s}, [%8]      \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"
                "eor    v31.16b, v31.16b, v31.16b   \n"

                "2:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    4f                          \n"

                // zero the second accumulator pair; even kk steps go to v28/v29 and odd kk steps
                // to v30/v31 (presumably to shorten the fmla dependency chain)
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v29.16b, v29.16b, v29.16b   \n"
                // main loop, unrolled over 4 kk steps:
                // consumes 4 floats from pB (v0) and 32 floats from pA (v4..v11) per pass
                "3:                                 \n"
                "prfm   pldl1keep, [%2, #128]       \n"
                "ld1    {v0.4s}, [%2], #16          \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"
                "fmla   v28.4s, v4.4s, v0.s[0]      \n"
                "fmla   v29.4s, v5.4s, v0.s[0]      \n"
                "fmla   v30.4s, v6.4s, v0.s[1]      \n"
                "fmla   v31.4s, v7.4s, v0.s[1]      \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
                "fmla   v28.4s, v8.4s, v0.s[2]      \n"
                "fmla   v29.4s, v9.4s, v0.s[2]      \n"
                // loop counter update is interleaved with the fmla stream; fmla does not touch the flags
                "subs   w4, w4, #1                  \n"
                "fmla   v30.4s, v10.4s, v0.s[3]     \n"
                "fmla   v31.4s, v11.4s, v0.s[3]     \n"
                "bne    3b                          \n"
                // fold the second accumulator pair back into v30/v31
                // (the summation order therefore differs from the intrinsics fallback)
                "fadd   v30.4s, v30.4s, v28.4s      \n"
                "fadd   v31.4s, v31.4s, v29.4s      \n"

                "4:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    6f                          \n"

                // tail loop, one kk step per pass: one pB float broadcast to all lanes, 8 floats from pA
                "5:                                 \n"
                "ld1r   {v0.4s}, [%2], #4           \n"
                "ld1    {v4.4s, v5.4s}, [%1], #32   \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v30.4s, v4.4s, v0.4s        \n"
                "fmla   v31.4s, v5.4s, v0.4s        \n"
                "bne    5b                          \n"

                // if k_end is zero, only save partial sums to the scratch tile (label 9)
                // NOTE(review): only the low 8 bits of k_end are tested - presumably k_end is a bool; confirm
                "6:                                 \n"
                "tst    %w11, #255                  \n"
                "beq    9f                          \n"

                // if out_elempack == 4
                "cmp    %w12, #4                    \n"
                "bne    7f                          \n"

                // x4 = outptr0 + out_hstep * 4 floats (w4 = out_hstep * 4, then scaled by 4 bytes)
                "lsl    w4, %w13, #2                \n"
                "add    x4, %3, w4, sxtw 2          \n"
                // outptr0 advances by 4 floats
                "st1    {v30.4s}, [%3], #16         \n"
                "st1    {v31.4s}, [x4]              \n"
                "b      8f                          \n"

                // if out_elempack == 1
                // (reached for any out_elempack != 4; the intrinsics fallback tests == 1 explicitly)
                // each of the 8 lanes is stored individually, x4 stepping by out_hstep floats;
                // outptr0 itself advances by 1 float
                "7:                                 \n"
                "add    x4, %3, %w13, sxtw 2        \n"
                "st1    {v30.s}[0], [%3], #4        \n"
                "st1    {v30.s}[1], [x4]            \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v30.s}[2], [x4]            \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v30.s}[3], [x4]            \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v31.s}[0], [x4]            \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v31.s}[1], [x4]            \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v31.s}[2], [x4]            \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v31.s}[3], [x4]            \n"

                // final output was written: skip over this tile's 8-float scratch slot
                "8:                                 \n"
                "add    %0, %0, #32                 \n"
                "b      10f                         \n"

                // not the last k block: save partial sums, outptr advances by 8 floats
                "9:                                 \n"
                "st1    {v30.4s, v31.4s}, [%0], #32 \n"

                "10:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                // the four matching-constraint inputs below occupy operands %4..%7,
                // which is why pC is %8
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
            // Intrinsics fallback: _sum00 accumulates pA[0..3], _sum01 accumulates pA[4..7]
            float32x4_t _sum00;
            float32x4_t _sum01;

            if (k == 0)
            {
                if (pC)
                {
                    _sum00 = vld1q_f32(pC);
                    _sum01 = vld1q_f32(pC + 4);
                }
                else
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                // resume from the partial sums saved at outptr by a previous call
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA0 = vld1q_f32(pA);
                float32x4_t _pA1 = vld1q_f32(pA + 4);

                // broadcast the single pB value to all four lanes
                float32x4_t _pB = vld1q_dup_f32(pB);

                _sum00 = vfmaq_f32(_sum00, _pA0, _pB);
                _sum01 = vfmaq_f32(_sum01, _pA1, _pB);

                pA += 8;
                pB += 1;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    // pA[0..3] results at outptr0, pA[4..7] results out_hstep * 4 floats further
                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + out_hstep * 4, _sum01);
                    outptr0 += 4;
                }
                if (out_elempack == 1)
                {
                    // spill to the stack and scatter scalars: pA lane r is written at offset out_hstep * r
                    float sum0[8];
                    vst1q_f32(sum0, _sum00);
                    vst1q_f32(sum0 + 4, _sum01);

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep * 1] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[out_hstep * 4] = sum0[4];
                    outptr0[out_hstep * 5] = sum0[5];
                    outptr0[out_hstep * 6] = sum0[6];
                    outptr0[out_hstep * 7] = sum0[7];
                    outptr0++;
                }
            }
            else
            {
                // not the last k block: save partial sums to the scratch tile
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
            }

            // the scratch pointer advances by one 8x1 tile regardless of which branch ran
            outptr += 8;
#endif // NCNN_GNU_INLINE_ASM
        }

        pAT += max_kk * 8;
    }
#endif // __aarch64__
    for (; ii + 3 < max_ii; ii += 4)
    {
        float* outptr0 = (float*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        const float* pB = pBT;

        if (pC)
        {
            pC = (const float*)CT_tile + i + ii;
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 11 < max_jj; jj += 12)
        {
            const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            asm volatile(
                "cbz    %w10, 0f                    \n"

                "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                "subs   %0, %0, #128                \n"
                "b      3f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                "ld1    {v20.4s}, [%8]              \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                "eor    v20.16b, v20.16b, v20.16b   \n"

                "2:                                 \n"
                "mov    v21.16b, v20.16b            \n"
                "mov    v22.16b, v20.16b            \n"
                "mov    v23.16b, v20.16b            \n"
                "mov    v24.16b, v20.16b            \n"
                "mov    v25.16b, v20.16b            \n"
                "mov    v26.16b, v20.16b            \n"
                "mov    v27.16b, v20.16b            \n"
                "mov    v28.16b, v20.16b            \n"
                "mov    v29.16b, v20.16b            \n"
                "mov    v30.16b, v20.16b            \n"
                "mov    v31.16b, v20.16b            \n"

                "3:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                "4:                                 \n"
                "prfm   pldl1keep, [%2, #512]       \n"
                "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
                "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                "fmla   v21.4s, v16.4s, v0.s[1]     \n"
                "fmla   v22.4s, v16.4s, v0.s[2]     \n"
                "fmla   v23.4s, v16.4s, v0.s[3]     \n"
                "fmla   v24.4s, v16.4s, v1.s[0]     \n"
                "fmla   v25.4s, v16.4s, v1.s[1]     \n"
                "fmla   v26.4s, v16.4s, v1.s[2]     \n"
                "fmla   v27.4s, v16.4s, v1.s[3]     \n"
                "fmla   v28.4s, v16.4s, v2.s[0]     \n"
                "fmla   v29.4s, v16.4s, v2.s[1]     \n"
                "fmla   v30.4s, v16.4s, v2.s[2]     \n"
                "fmla   v31.4s, v16.4s, v2.s[3]     \n"
                "prfm   pldl1keep, [%2, #512]       \n"
                "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n"
                "fmla   v20.4s, v17.4s, v3.s[0]     \n"
                "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                "fmla   v22.4s, v17.4s, v3.s[2]     \n"
                "fmla   v23.4s, v17.4s, v3.s[3]     \n"
                "fmla   v24.4s, v17.4s, v4.s[0]     \n"
                "fmla   v25.4s, v17.4s, v4.s[1]     \n"
                "fmla   v26.4s, v17.4s, v4.s[2]     \n"
                "fmla   v27.4s, v17.4s, v4.s[3]     \n"
                "fmla   v28.4s, v17.4s, v5.s[0]     \n"
                "fmla   v29.4s, v17.4s, v5.s[1]     \n"
                "fmla   v30.4s, v17.4s, v5.s[2]     \n"
                "fmla   v31.4s, v17.4s, v5.s[3]     \n"
                "prfm   pldl1keep, [%2, #512]       \n"
                "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
                "fmla   v20.4s, v18.4s, v6.s[0]     \n"
                "fmla   v21.4s, v18.4s, v6.s[1]     \n"
                "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                "fmla   v23.4s, v18.4s, v6.s[3]     \n"
                "fmla   v24.4s, v18.4s, v7.s[0]     \n"
                "fmla   v25.4s, v18.4s, v7.s[1]     \n"
                "fmla   v26.4s, v18.4s, v7.s[2]     \n"
                "fmla   v27.4s, v18.4s, v7.s[3]     \n"
                "fmla   v28.4s, v18.4s, v0.s[0]     \n"
                "fmla   v29.4s, v18.4s, v0.s[1]     \n"
                "fmla   v30.4s, v18.4s, v0.s[2]     \n"
                "fmla   v31.4s, v18.4s, v0.s[3]     \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v20.4s, v19.4s, v1.s[0]     \n"
                "fmla   v21.4s, v19.4s, v1.s[1]     \n"
                "fmla   v22.4s, v19.4s, v1.s[2]     \n"
                "fmla   v23.4s, v19.4s, v1.s[3]     \n"
                "fmla   v24.4s, v19.4s, v2.s[0]     \n"
                "fmla   v25.4s, v19.4s, v2.s[1]     \n"
                "fmla   v26.4s, v19.4s, v2.s[2]     \n"
                "fmla   v27.4s, v19.4s, v2.s[3]     \n"
                "fmla   v28.4s, v19.4s, v3.s[0]     \n"
                "fmla   v29.4s, v19.4s, v3.s[1]     \n"
                "fmla   v30.4s, v19.4s, v3.s[2]     \n"
                "fmla   v31.4s, v19.4s, v3.s[3]     \n"
                "bne    4b                          \n"

                "5:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    7f                          \n"

                "6:                                 \n"
                "ld1    {v0.4s, v1.4s, v2.4s}, [%2], #48 \n"
                "ld1    {v16.4s}, [%1], #16         \n"
                "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                "fmla   v21.4s, v16.4s, v0.s[1]     \n"
                "fmla   v22.4s, v16.4s, v0.s[2]     \n"
                "fmla   v23.4s, v16.4s, v0.s[3]     \n"
                "fmla   v24.4s, v16.4s, v1.s[0]     \n"
                "fmla   v25.4s, v16.4s, v1.s[1]     \n"
                "fmla   v26.4s, v16.4s, v1.s[2]     \n"
                "fmla   v27.4s, v16.4s, v1.s[3]     \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v28.4s, v16.4s, v2.s[0]     \n"
                "fmla   v29.4s, v16.4s, v2.s[1]     \n"
                "fmla   v30.4s, v16.4s, v2.s[2]     \n"
                "fmla   v31.4s, v16.4s, v2.s[3]     \n"
                "bne    6b                          \n"

                "7:                                 \n"
                "tst    %w11, #255                  \n"
                "beq    10f                         \n"

                // if out_elempack == 4
                "cmp    %w12, #4                    \n"
                "bne    8f                          \n"

                "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%3], #64 \n"
                "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%3], #64 \n"
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%3], #64 \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"
                // transpose4x12
                "zip1   v18.4s, v20.4s, v21.4s      \n"
                "zip2   v19.4s, v20.4s, v21.4s      \n"
                "zip1   v20.4s, v22.4s, v23.4s      \n"
                "zip2   v21.4s, v22.4s, v23.4s      \n"
                "zip1   v22.4s, v24.4s, v25.4s      \n"
                "zip2   v23.4s, v24.4s, v25.4s      \n"
                "zip1   v24.4s, v26.4s, v27.4s      \n"
                "zip2   v25.4s, v26.4s, v27.4s      \n"
                "zip1   v26.4s, v28.4s, v29.4s      \n"
                "zip2   v27.4s, v28.4s, v29.4s      \n"
                "zip1   v28.4s, v30.4s, v31.4s      \n"
                "zip2   v29.4s, v30.4s, v31.4s      \n"

                "zip1   v12.2d, v18.2d, v20.2d      \n"
                "zip2   v15.2d, v18.2d, v20.2d      \n"
                "zip1   v13.2d, v22.2d, v24.2d      \n"
                "zip2   v16.2d, v22.2d, v24.2d      \n"
                "zip1   v14.2d, v26.2d, v28.2d      \n"
                "zip2   v17.2d, v26.2d, v28.2d      \n"

                "zip1   v18.2d, v19.2d, v21.2d      \n"
                "zip2   v21.2d, v19.2d, v21.2d      \n"
                "zip1   v19.2d, v23.2d, v25.2d      \n"
                "zip2   v22.2d, v23.2d, v25.2d      \n"
                "zip1   v20.2d, v27.2d, v29.2d      \n"
                "zip2   v23.2d, v27.2d, v29.2d      \n"

                "add    x4, %3, %w13, sxtw 2        \n"
                "st1    {v12.4s, v13.4s, v14.4s}, [%3], #48 \n"
                "st1    {v15.4s, v16.4s, v17.4s}, [x4] \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v18.4s, v19.4s, v20.4s}, [x4] \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v21.4s, v22.4s, v23.4s}, [x4] \n"

                "9:                                 \n"
                "add    %0, %0, #192                \n"
                "b      11f                         \n"

                "10:                                \n"
                "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                "11:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;
            float32x4_t _sum3;
            float32x4_t _sum4;
            float32x4_t _sum5;
            float32x4_t _sum6;
            float32x4_t _sum7;
            float32x4_t _sum8;
            float32x4_t _sum9;
            float32x4_t _suma;
            float32x4_t _sumb;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vld1q_f32(pC);
                    _sum1 = _sum0;
                    _sum2 = _sum0;
                    _sum3 = _sum0;
                    _sum4 = _sum0;
                    _sum5 = _sum0;
                    _sum6 = _sum0;
                    _sum7 = _sum0;
                    _sum8 = _sum0;
                    _sum9 = _sum0;
                    _suma = _sum0;
                    _sumb = _sum0;
                }
                else
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                    _sum2 = vdupq_n_f32(0.f);
                    _sum3 = vdupq_n_f32(0.f);
                    _sum4 = vdupq_n_f32(0.f);
                    _sum5 = vdupq_n_f32(0.f);
                    _sum6 = vdupq_n_f32(0.f);
                    _sum7 = vdupq_n_f32(0.f);
                    _sum8 = vdupq_n_f32(0.f);
                    _sum9 = vdupq_n_f32(0.f);
                    _suma = vdupq_n_f32(0.f);
                    _sumb = vdupq_n_f32(0.f);
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4 * 1);
                _sum2 = vld1q_f32(outptr + 4 * 2);
                _sum3 = vld1q_f32(outptr + 4 * 3);
                _sum4 = vld1q_f32(outptr + 4 * 4);
                _sum5 = vld1q_f32(outptr + 4 * 5);
                _sum6 = vld1q_f32(outptr + 4 * 6);
                _sum7 = vld1q_f32(outptr + 4 * 7);
                _sum8 = vld1q_f32(outptr + 4 * 8);
                _sum9 = vld1q_f32(outptr + 4 * 9);
                _suma = vld1q_f32(outptr + 4 * 10);
                _sumb = vld1q_f32(outptr + 4 * 11);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA = vld1q_f32(pA);
                float32x4_t _pB0 = vld1q_f32(pB);
                float32x4_t _pB1 = vld1q_f32(pB + 4);
                float32x4_t _pB2 = vld1q_f32(pB + 8);

                _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB0, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB0, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB0, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB0, 3);
                _sum4 = vfmaq_laneq_f32(_sum4, _pA, _pB1, 0);
                _sum5 = vfmaq_laneq_f32(_sum5, _pA, _pB1, 1);
                _sum6 = vfmaq_laneq_f32(_sum6, _pA, _pB1, 2);
                _sum7 = vfmaq_laneq_f32(_sum7, _pA, _pB1, 3);
                _sum8 = vfmaq_laneq_f32(_sum8, _pA, _pB2, 0);
                _sum9 = vfmaq_laneq_f32(_sum9, _pA, _pB2, 1);
                _suma = vfmaq_laneq_f32(_suma, _pA, _pB2, 2);
                _sumb = vfmaq_laneq_f32(_sumb, _pA, _pB2, 3);

                pA += 4;
                pB += 12;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);
                    vst1q_f32(outptr0 + 4 * 2, _sum2);
                    vst1q_f32(outptr0 + 4 * 3, _sum3);
                    vst1q_f32(outptr0 + 4 * 4, _sum4);
                    vst1q_f32(outptr0 + 4 * 5, _sum5);
                    vst1q_f32(outptr0 + 4 * 6, _sum6);
                    vst1q_f32(outptr0 + 4 * 7, _sum7);
                    vst1q_f32(outptr0 + 4 * 8, _sum8);
                    vst1q_f32(outptr0 + 4 * 9, _sum9);
                    vst1q_f32(outptr0 + 4 * 10, _suma);
                    vst1q_f32(outptr0 + 4 * 11, _sumb);
                    outptr0 += 48;
                }
                if (out_elempack == 1)
                {
                    transpose4x12_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, _sum8, _sum9, _suma, _sumb);

                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);
                    vst1q_f32(outptr0 + 8, _sum2);
                    vst1q_f32(outptr0 + out_hstep, _sum3);
                    vst1q_f32(outptr0 + out_hstep + 4, _sum4);
                    vst1q_f32(outptr0 + out_hstep + 8, _sum5);
                    vst1q_f32(outptr0 + out_hstep * 2, _sum6);
                    vst1q_f32(outptr0 + out_hstep * 2 + 4, _sum7);
                    vst1q_f32(outptr0 + out_hstep * 2 + 8, _sum8);
                    vst1q_f32(outptr0 + out_hstep * 3, _sum9);
                    vst1q_f32(outptr0 + out_hstep * 3 + 4, _suma);
                    vst1q_f32(outptr0 + out_hstep * 3 + 8, _sumb);
                    outptr0 += 12;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
                vst1q_f32(outptr + 4 * 4, _sum4);
                vst1q_f32(outptr + 4 * 5, _sum5);
                vst1q_f32(outptr + 4 * 6, _sum6);
                vst1q_f32(outptr + 4 * 7, _sum7);
                vst1q_f32(outptr + 4 * 8, _sum8);
                vst1q_f32(outptr + 4 * 9, _sum9);
                vst1q_f32(outptr + 4 * 10, _suma);
                vst1q_f32(outptr + 4 * 11, _sumb);
            }

            outptr += 48;
#endif // NCNN_GNU_INLINE_ASM
        }
#endif // __aarch64__
        // 8-wide tile: runs while at least 8 of the max_jj positions remain.
        // Each pass accumulates eight 4-lane sums: per kk step it reads 4 floats
        // from pA and 8 floats from pB, and sum[j] += pA[0..3] * pB[j].
        //
        //  - k == 0 : the sums start from pC (one 4-float vector copied into all
        //             eight accumulators) when pC is non-null, otherwise from zero.
        //  - k != 0 : the sums are reloaded from the 32 floats at outptr.
        //  - k_end  : the sums are written to outptr0, either as-is
        //             (out_elempack == 4) or transposed into four runs of 8 floats
        //             that are out_hstep floats apart (out_elempack == 1).
        //  - !k_end : the sums are written back to outptr for a later pass.
        //
        // outptr advances by 32 floats on every path. pA restarts from pAT on each
        // pass, while pB keeps advancing across passes.
        //
        // The three implementations below (aarch64 asm, armv7 asm, NEON intrinsics)
        // follow the same steps.
        // NOTE(review): the asm paths take the transposed store for any
        // out_elempack != 4, whereas the intrinsics path stores nothing when
        // out_elempack is neither 4 nor 1 — presumably only 1 and 4 reach this
        // code; confirm against the caller.
        for (; jj + 7 < max_jj; jj += 8)
        {
            const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            // Register use: v24-v31 = the eight accumulators, v16-v19 = pA data,
            // v0-v7 = pB data, w4/x4 = loop counter and scratch address.
            asm volatile(
                // k != 0: reload the partial sums from outptr (%0 is restored afterwards)
                "cbz    %w10, 0f                    \n"

                "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                "subs   %0, %0, #64                 \n"
                "b      3f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                "ld1    {v24.4s}, [%8]              \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                "eor    v24.16b, v24.16b, v24.16b   \n"

                // copy the initial value (pC vector or zero) into the other accumulators
                "2:                                 \n"
                "mov    v25.16b, v24.16b            \n"
                "mov    v26.16b, v24.16b            \n"
                "mov    v27.16b, v24.16b            \n"
                "mov    v28.16b, v24.16b            \n"
                "mov    v29.16b, v24.16b            \n"
                "mov    v30.16b, v24.16b            \n"
                "mov    v31.16b, v24.16b            \n"

                "3:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // main loop, 4 kk steps per iteration:
                // 64 bytes of pA and 2 x 64 bytes of pB are consumed each time
                "4:                                 \n"
                "prfm   pldl1keep, [%2, #512]       \n"
                "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
                // kk + 0: pA in v16, pB in v0/v1
                "fmla   v24.4s, v16.4s, v0.s[0]     \n"
                "fmla   v25.4s, v16.4s, v0.s[1]     \n"
                "fmla   v26.4s, v16.4s, v0.s[2]     \n"
                "fmla   v27.4s, v16.4s, v0.s[3]     \n"
                "fmla   v28.4s, v16.4s, v1.s[0]     \n"
                "fmla   v29.4s, v16.4s, v1.s[1]     \n"
                "fmla   v30.4s, v16.4s, v1.s[2]     \n"
                "fmla   v31.4s, v16.4s, v1.s[3]     \n"
                // kk + 1: pA in v17, pB in v2/v3
                "fmla   v24.4s, v17.4s, v2.s[0]     \n"
                "fmla   v25.4s, v17.4s, v2.s[1]     \n"
                "fmla   v26.4s, v17.4s, v2.s[2]     \n"
                "fmla   v27.4s, v17.4s, v2.s[3]     \n"
                "fmla   v28.4s, v17.4s, v3.s[0]     \n"
                "fmla   v29.4s, v17.4s, v3.s[1]     \n"
                "fmla   v30.4s, v17.4s, v3.s[2]     \n"
                "fmla   v31.4s, v17.4s, v3.s[3]     \n"
                "prfm   pldl1keep, [%2, #512]       \n"
                "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n"
                // kk + 2: pA in v18, pB in v4/v5
                "fmla   v24.4s, v18.4s, v4.s[0]     \n"
                "fmla   v25.4s, v18.4s, v4.s[1]     \n"
                "fmla   v26.4s, v18.4s, v4.s[2]     \n"
                "fmla   v27.4s, v18.4s, v4.s[3]     \n"
                "fmla   v28.4s, v18.4s, v5.s[0]     \n"
                "fmla   v29.4s, v18.4s, v5.s[1]     \n"
                "fmla   v30.4s, v18.4s, v5.s[2]     \n"
                "fmla   v31.4s, v18.4s, v5.s[3]     \n"
                // the counter update sets the flags for the bne below; fmla leaves them alone
                "subs   w4, w4, #1                  \n"
                // kk + 3: pA in v19, pB in v6/v7
                "fmla   v24.4s, v19.4s, v6.s[0]     \n"
                "fmla   v25.4s, v19.4s, v6.s[1]     \n"
                "fmla   v26.4s, v19.4s, v6.s[2]     \n"
                "fmla   v27.4s, v19.4s, v6.s[3]     \n"
                "fmla   v28.4s, v19.4s, v7.s[0]     \n"
                "fmla   v29.4s, v19.4s, v7.s[1]     \n"
                "fmla   v30.4s, v19.4s, v7.s[2]     \n"
                "fmla   v31.4s, v19.4s, v7.s[3]     \n"
                "bne    4b                          \n"

                "5:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    7f                          \n"

                // remainder loop, one kk step per iteration
                "6:                                 \n"
                "ld1    {v0.4s, v1.4s}, [%2], #32   \n"
                "ld1    {v16.4s}, [%1], #16         \n"
                "fmla   v24.4s, v16.4s, v0.s[0]     \n"
                "fmla   v25.4s, v16.4s, v0.s[1]     \n"
                "fmla   v26.4s, v16.4s, v0.s[2]     \n"
                "fmla   v27.4s, v16.4s, v0.s[3]     \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v28.4s, v16.4s, v1.s[0]     \n"
                "fmla   v29.4s, v16.4s, v1.s[1]     \n"
                "fmla   v30.4s, v16.4s, v1.s[2]     \n"
                "fmla   v31.4s, v16.4s, v1.s[3]     \n"
                "bne    6b                          \n"

                // if k_end (only the low 8 bits of the operand are tested)
                "7:                                 \n"
                "tst    %w11, #255                  \n"
                "beq    10f                         \n"

                // if out_elempack == 4
                "cmp    %w12, #4                    \n"
                "bne    8f                          \n"

                "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%3], #64 \n"
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%3], #64 \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"
                // transpose4x8
                // step 1: interleave 32-bit lanes of neighbouring accumulators
                "zip1   v22.4s, v24.4s, v25.4s      \n"
                "zip2   v23.4s, v24.4s, v25.4s      \n"
                "zip1   v24.4s, v26.4s, v27.4s      \n"
                "zip2   v25.4s, v26.4s, v27.4s      \n"
                "zip1   v26.4s, v28.4s, v29.4s      \n"
                "zip2   v27.4s, v28.4s, v29.4s      \n"
                "zip1   v28.4s, v30.4s, v31.4s      \n"
                "zip2   v29.4s, v30.4s, v31.4s      \n"

                // step 2: interleave 64-bit halves; v12/v13 and v14/v15 become
                // the first and second 8-float output runs
                "zip1   v12.2d, v22.2d, v24.2d      \n"
                "zip2   v14.2d, v22.2d, v24.2d      \n"
                "zip1   v13.2d, v26.2d, v28.2d      \n"
                "zip2   v15.2d, v26.2d, v28.2d      \n"

                // v16/v17 and v18/v19 become the third and fourth runs
                "zip1   v16.2d, v23.2d, v25.2d      \n"
                "zip2   v18.2d, v23.2d, v25.2d      \n"
                "zip1   v17.2d, v27.2d, v29.2d      \n"
                "zip2   v19.2d, v27.2d, v29.2d      \n"

                // x4 walks outptr0 in steps of out_hstep floats (sign-extended, * 4 bytes);
                // outptr0 itself only advances by the 8 floats of the first run
                "add    x4, %3, %w13, sxtw 2        \n"
                "st1    {v12.4s, v13.4s}, [%3], #32 \n"
                "st1    {v14.4s, v15.4s}, [x4]      \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v16.4s, v17.4s}, [x4]      \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v18.4s, v19.4s}, [x4]      \n"

                // k_end: nothing was written to outptr, but it still skips 32 floats
                "9:                                 \n"
                "add    %0, %0, #128                \n"
                "b      11f                         \n"

                // not k_end: save the partial sums to outptr (post-increments by 32 floats)
                "10:                                \n"
                "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                "11:                                \n"

                // %4-%7 are the matching-constraint inputs tied to outputs %0-%3
                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
            // Register use: q8-q15 = the eight accumulators, q4-q7 = pA data,
            // q0-q3 = pB data, r4 = loop counter and scratch address.
            asm volatile(
                // k != 0: reload the partial sums from outptr (%0 is restored afterwards)
                "cmp        %10, #0             \n"
                "beq        0f                  \n"

                "vldm       %0!, {d16-d23}      \n"
                "vldm       %0, {d24-d31}       \n"
                "sub        %0, %0, #64         \n"
                "b          3f                  \n"

                "0:                             \n"
                // if pC
                "cmp        %8, #0              \n"
                "beq        1f                  \n"

                "vld1.f32   {d16-d17}, [%8]     \n"
                "b          2f                  \n"

                // else
                "1:                             \n"
                "veor       q8, q8              \n"

                // copy the initial value (pC vector or zero) into the other accumulators
                "2:                             \n"
                "vmov       q9, q8              \n"
                "vmov       q10, q8             \n"
                "vmov       q11, q8             \n"
                "vmov       q12, q8             \n"
                "vmov       q13, q8             \n"
                "vmov       q14, q8             \n"
                "vmov       q15, q8             \n"

                "3:                             \n"
                "lsr        r4, %9, #2          \n" // r4 = max_kk >> 2
                "cmp        r4, #0              \n"
                "beq        5f                  \n"

                // main loop, 4 kk steps per iteration; q0-q3 are loaded twice
                // because only four registers are used for pB here
                "4:                             \n"
                "pld        [%2, #512]          \n"
                "vldm       %2!, {d0-d7}        \n"
                "pld        [%1, #512]          \n"
                "vldm       %1!, {d8-d15}       \n"
                // kk + 0: pA in q4, pB in q0/q1
                "vmla.f32   q8, q4, d0[0]       \n"
                "vmla.f32   q9, q4, d0[1]       \n"
                "vmla.f32   q10, q4, d1[0]      \n"
                "vmla.f32   q11, q4, d1[1]      \n"
                "vmla.f32   q12, q4, d2[0]      \n"
                "vmla.f32   q13, q4, d2[1]      \n"
                "vmla.f32   q14, q4, d3[0]      \n"
                "vmla.f32   q15, q4, d3[1]      \n"
                // kk + 1: pA in q5, pB in q2/q3
                "vmla.f32   q8, q5, d4[0]       \n"
                "vmla.f32   q9, q5, d4[1]       \n"
                "vmla.f32   q10, q5, d5[0]      \n"
                "vmla.f32   q11, q5, d5[1]      \n"
                "vmla.f32   q12, q5, d6[0]      \n"
                "vmla.f32   q13, q5, d6[1]      \n"
                "vmla.f32   q14, q5, d7[0]      \n"
                "vmla.f32   q15, q5, d7[1]      \n"
                "pld        [%2, #512]          \n"
                "vldm       %2!, {d0-d7}        \n"
                // kk + 2: pA in q6, pB in the reloaded q0/q1
                "vmla.f32   q8, q6, d0[0]       \n"
                "vmla.f32   q9, q6, d0[1]       \n"
                "vmla.f32   q10, q6, d1[0]      \n"
                "vmla.f32   q11, q6, d1[1]      \n"
                "vmla.f32   q12, q6, d2[0]      \n"
                "vmla.f32   q13, q6, d2[1]      \n"
                "vmla.f32   q14, q6, d3[0]      \n"
                "vmla.f32   q15, q6, d3[1]      \n"
                // the counter update sets the flags for the bne below
                "subs       r4, r4, #1          \n"
                // kk + 3: pA in q7, pB in the reloaded q2/q3
                "vmla.f32   q8, q7, d4[0]       \n"
                "vmla.f32   q9, q7, d4[1]       \n"
                "vmla.f32   q10, q7, d5[0]      \n"
                "vmla.f32   q11, q7, d5[1]      \n"
                "vmla.f32   q12, q7, d6[0]      \n"
                "vmla.f32   q13, q7, d6[1]      \n"
                "vmla.f32   q14, q7, d7[0]      \n"
                "vmla.f32   q15, q7, d7[1]      \n"
                "bne        4b                  \n"

                "5:                             \n"
                "and        r4, %9, #3          \n" // r4 = remain = max_kk & 3
                "cmp        r4, #0              \n"
                "beq        7f                  \n"

                // remainder loop, one kk step per iteration
                "6:                             \n"
                "vld1.f32   {d0-d3}, [%2 :128]! \n"
                "vld1.f32   {d8-d9}, [%1 :128]! \n"
                "vmla.f32   q8, q4, d0[0]       \n"
                "vmla.f32   q9, q4, d0[1]       \n"
                "vmla.f32   q10, q4, d1[0]      \n"
                "vmla.f32   q11, q4, d1[1]      \n"
                "subs       r4, r4, #1          \n"
                "vmla.f32   q12, q4, d2[0]      \n"
                "vmla.f32   q13, q4, d2[1]      \n"
                "vmla.f32   q14, q4, d3[0]      \n"
                "vmla.f32   q15, q4, d3[1]      \n"
                "bne        6b                  \n"

                // if k_end
                "7:                             \n"
                "cmp        %11, #0             \n"
                "beq        10f                 \n"

                // if out_elempack == 4
                "cmp        %12, #4             \n"
                "bne        8f                  \n"

                "vstm       %3!, {d16-d23}      \n"
                "vstm       %3!, {d24-d31}      \n"
                "b          9f                  \n"

                // if out_elempack == 1
                "8:                             \n"
                // transpose4x8
                // vtrn + d-register swaps transpose each 4x4 half in place
                "vtrn.32    q8, q9              \n"
                "vtrn.32    q10, q11            \n"
                "vtrn.32    q12, q13            \n"
                "vtrn.32    q14, q15            \n"
                "vswp       d17, d20            \n"
                "vswp       d19, d22            \n"
                "vswp       d25, d28            \n"
                "vswp       d27, d30            \n"
                // pair up the two halves of each 8-float run in adjacent registers:
                // q8/q9, q12/q13, q10/q11, q14/q15 are runs 0..3 (see store order below)
                "vswp       q9, q12             \n"
                "vswp       q11, q14            \n"

                // r4 walks outptr0 in steps of out_hstep floats (* 4 bytes);
                // outptr0 itself only advances by the 8 floats of the first run.
                // The :128 qualifiers tell the CPU these addresses are 16-byte aligned.
                "add        r4, %3, %13, lsl #2 \n"
                "vst1.f32   {d16-d19}, [%3 :128]! \n"
                "vst1.f32   {d24-d27}, [r4 :128] \n"
                "add        r4, r4, %13, lsl #2 \n"
                "vst1.f32   {d20-d23}, [r4 :128] \n"
                "add        r4, r4, %13, lsl #2 \n"
                "vst1.f32   {d28-d31}, [r4 :128] \n"

                // k_end: nothing was written to outptr, but it still skips 32 floats
                "9:                             \n"
                "add        %0, %0, #128        \n"
                "b          11f                 \n"

                // not k_end: save the partial sums to outptr (advances by 32 floats)
                "10:                            \n"
                "vstm       %0!, {d16-d23}      \n"
                "vstm       %0!, {d24-d31}      \n"

                "11:                            \n"

                // %4-%7 are the matching-constraint inputs tied to outputs %0-%3
                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            // Intrinsics fallback: _sum0.._sum7 are the eight 4-lane accumulators.
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;
            float32x4_t _sum3;
            float32x4_t _sum4;
            float32x4_t _sum5;
            float32x4_t _sum6;
            float32x4_t _sum7;

            if (k == 0)
            {
                // first pass: start every accumulator from the pC vector, or from zero
                if (pC)
                {
                    _sum0 = vld1q_f32(pC);
                    _sum1 = _sum0;
                    _sum2 = _sum0;
                    _sum3 = _sum0;
                    _sum4 = _sum0;
                    _sum5 = _sum0;
                    _sum6 = _sum0;
                    _sum7 = _sum0;
                }
                else
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                    _sum2 = vdupq_n_f32(0.f);
                    _sum3 = vdupq_n_f32(0.f);
                    _sum4 = vdupq_n_f32(0.f);
                    _sum5 = vdupq_n_f32(0.f);
                    _sum6 = vdupq_n_f32(0.f);
                    _sum7 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                // later pass: continue from the partial sums saved at outptr
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4 * 1);
                _sum2 = vld1q_f32(outptr + 4 * 2);
                _sum3 = vld1q_f32(outptr + 4 * 3);
                _sum4 = vld1q_f32(outptr + 4 * 4);
                _sum5 = vld1q_f32(outptr + 4 * 5);
                _sum6 = vld1q_f32(outptr + 4 * 6);
                _sum7 = vld1q_f32(outptr + 4 * 7);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                // one kk step: 4 floats of pA against 8 floats of pB
                float32x4_t _pA = vld1q_f32(pA);
                float32x4_t _pB0 = vld1q_f32(pB);
                float32x4_t _pB1 = vld1q_f32(pB + 4);

#if __aarch64__
                _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB0, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB0, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB0, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB0, 3);
                _sum4 = vfmaq_laneq_f32(_sum4, _pA, _pB1, 0);
                _sum5 = vfmaq_laneq_f32(_sum5, _pA, _pB1, 1);
                _sum6 = vfmaq_laneq_f32(_sum6, _pA, _pB1, 2);
                _sum7 = vfmaq_laneq_f32(_sum7, _pA, _pB1, 3);
#else
                // same lane order as above, selected through the low/high halves of _pB0/_pB1
                _sum0 = vmlaq_lane_f32(_sum0, _pA, vget_low_f32(_pB0), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pA, vget_low_f32(_pB0), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _pA, vget_high_f32(_pB0), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _pA, vget_high_f32(_pB0), 1);
                _sum4 = vmlaq_lane_f32(_sum4, _pA, vget_low_f32(_pB1), 0);
                _sum5 = vmlaq_lane_f32(_sum5, _pA, vget_low_f32(_pB1), 1);
                _sum6 = vmlaq_lane_f32(_sum6, _pA, vget_high_f32(_pB1), 0);
                _sum7 = vmlaq_lane_f32(_sum7, _pA, vget_high_f32(_pB1), 1);
#endif

                pA += 4;
                pB += 8;
            }

            if (k_end)
            {
                // last pass: write the finished sums to outptr0
                if (out_elempack == 4)
                {
                    // accumulators are stored back to back, 32 floats in total
                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);
                    vst1q_f32(outptr0 + 4 * 2, _sum2);
                    vst1q_f32(outptr0 + 4 * 3, _sum3);
                    vst1q_f32(outptr0 + 4 * 4, _sum4);
                    vst1q_f32(outptr0 + 4 * 5, _sum5);
                    vst1q_f32(outptr0 + 4 * 6, _sum6);
                    vst1q_f32(outptr0 + 4 * 7, _sum7);
                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    // transpose4x8_ps is defined elsewhere; after it, each consecutive
                    // pair of vectors is written as one 8-float run, with runs
                    // out_hstep floats apart
                    transpose4x8_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7);

                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);
                    vst1q_f32(outptr0 + out_hstep, _sum2);
                    vst1q_f32(outptr0 + out_hstep + 4, _sum3);
                    vst1q_f32(outptr0 + out_hstep * 2, _sum4);
                    vst1q_f32(outptr0 + out_hstep * 2 + 4, _sum5);
                    vst1q_f32(outptr0 + out_hstep * 3, _sum6);
                    vst1q_f32(outptr0 + out_hstep * 3 + 4, _sum7);
                    outptr0 += 8;
                }
            }
            else
            {
                // more passes to come: save the partial sums at outptr
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
                vst1q_f32(outptr + 4 * 4, _sum4);
                vst1q_f32(outptr + 4 * 5, _sum5);
                vst1q_f32(outptr + 4 * 6, _sum6);
                vst1q_f32(outptr + 4 * 7, _sum7);
            }

            // advanced on both branches, whether or not outptr was written
            outptr += 32;
#endif // NCNN_GNU_INLINE_ASM
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                "cbz    %w10, 0f                    \n"

                "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                "b      3f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                "ld1    {v28.4s}, [%8]              \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                "eor    v28.16b, v28.16b, v28.16b   \n"

                "2:                                 \n"
                "mov    v29.16b, v28.16b            \n"
                "mov    v30.16b, v28.16b            \n"
                "mov    v31.16b, v28.16b            \n"

                "3:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                "4:                                 \n"
                "prfm   pldl1keep, [%2, #512]       \n"
                "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
                "fmla   v28.4s, v16.4s, v0.s[0]     \n"
                "fmla   v29.4s, v16.4s, v0.s[1]     \n"
                "fmla   v30.4s, v16.4s, v0.s[2]     \n"
                "fmla   v31.4s, v16.4s, v0.s[3]     \n"
                "fmla   v28.4s, v17.4s, v1.s[0]     \n"
                "fmla   v29.4s, v17.4s, v1.s[1]     \n"
                "fmla   v30.4s, v17.4s, v1.s[2]     \n"
                "fmla   v31.4s, v17.4s, v1.s[3]     \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v28.4s, v18.4s, v2.s[0]     \n"
                "fmla   v29.4s, v18.4s, v2.s[1]     \n"
                "fmla   v30.4s, v18.4s, v2.s[2]     \n"
                "fmla   v31.4s, v18.4s, v2.s[3]     \n"
                "fmla   v28.4s, v19.4s, v3.s[0]     \n"
                "fmla   v29.4s, v19.4s, v3.s[1]     \n"
                "fmla   v30.4s, v19.4s, v3.s[2]     \n"
                "fmla   v31.4s, v19.4s, v3.s[3]     \n"
                "bne    4b                          \n"

                "5:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    7f                          \n"

                "6:                                 \n"
                "ld1    {v0.4s}, [%2], #16          \n"
                "ld1    {v16.4s}, [%1], #16         \n"
                "fmla   v28.4s, v16.4s, v0.s[0]     \n"
                "fmla   v29.4s, v16.4s, v0.s[1]     \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v30.4s, v16.4s, v0.s[2]     \n"
                "fmla   v31.4s, v16.4s, v0.s[3]     \n"
                "bne    6b                          \n"

                "7:                                 \n"
                "tst    %w11, #255                  \n"
                "beq    10f                         \n"

                // if out_elempack == 4
                "cmp    %w12, #4                    \n"
                "bne    8f                          \n"

                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%3], #64 \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"
                // transpose4x4
                "zip1   v26.4s, v28.4s, v29.4s      \n"
                "zip2   v27.4s, v28.4s, v29.4s      \n"
                "zip1   v28.4s, v30.4s, v31.4s      \n"
                "zip2   v29.4s, v30.4s, v31.4s      \n"

                "zip1   v12.2d, v26.2d, v28.2d      \n"
                "zip2   v13.2d, v26.2d, v28.2d      \n"
                "zip1   v14.2d, v27.2d, v29.2d      \n"
                "zip2   v15.2d, v27.2d, v29.2d      \n"

                "add    x4, %3, %w13, sxtw 2        \n"
                "st1    {v12.4s}, [%3], #16         \n"
                "st1    {v13.4s}, [x4]              \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v14.4s}, [x4]              \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v15.4s}, [x4]              \n"

                "9:                                 \n"
                "add    %0, %0, #64                 \n"
                "b      11f                         \n"

                "10:                                \n"
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                "11:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
            asm volatile(
                "cmp        %10, #0             \n"
                "beq        0f                  \n"

                "vldm       %0, {d24-d31}       \n"
                "b          3f                  \n"

                "0:                             \n"
                // if pC
                "cmp        %8, #0              \n"
                "beq        1f                  \n"

                "vld1.f32   {d24-d25}, [%8]     \n"
                "b          2f                  \n"

                // else
                "1:                             \n"
                "veor       q12, q12            \n"

                "2:                             \n"
                "vmov       q13, q12            \n"
                "vmov       q14, q12            \n"
                "vmov       q15, q12            \n"

                "3:                             \n"
                "lsr        r4, %9, #2          \n" // r4 = max_kk >> 2
                "cmp        r4, #0              \n"
                "beq        5f                  \n"

                "4:                             \n"
                "pld        [%2, #512]          \n"
                "vldm       %2!, {d0-d7}        \n"
                "pld        [%1, #512]          \n"
                "vldm       %1!, {d8-d15}       \n"
                "vmla.f32   q12, q4, d0[0]      \n"
                "vmla.f32   q13, q4, d0[1]      \n"
                "vmla.f32   q14, q4, d1[0]      \n"
                "vmla.f32   q15, q4, d1[1]      \n"
                "vmla.f32   q12, q5, d2[0]      \n"
                "vmla.f32   q13, q5, d2[1]      \n"
                "vmla.f32   q14, q5, d3[0]      \n"
                "vmla.f32   q15, q5, d3[1]      \n"
                "subs       r4, r4, #1          \n"
                "vmla.f32   q12, q6, d4[0]      \n"
                "vmla.f32   q13, q6, d4[1]      \n"
                "vmla.f32   q14, q6, d5[0]      \n"
                "vmla.f32   q15, q6, d5[1]      \n"
                "vmla.f32   q12, q7, d6[0]      \n"
                "vmla.f32   q13, q7, d6[1]      \n"
                "vmla.f32   q14, q7, d7[0]      \n"
                "vmla.f32   q15, q7, d7[1]      \n"
                "bne        4b                  \n"

                "5:                             \n"
                "and        r4, %9, #3          \n" // r4 = remain = max_kk & 3
                "cmp        r4, #0              \n"
                "beq        7f                  \n"

                "6:                             \n"
                "vld1.f32   {d0-d1}, [%2 :128]! \n"
                "vld1.f32   {d8-d9}, [%1 :128]! \n"
                "vmla.f32   q12, q4, d0[0]      \n"
                "vmla.f32   q13, q4, d0[1]      \n"
                "subs       r4, r4, #1          \n"
                "vmla.f32   q14, q4, d1[0]      \n"
                "vmla.f32   q15, q4, d1[1]      \n"
                "bne        6b                  \n"

                "7:                             \n"
                "cmp        %11, #0             \n"
                "beq        10f                 \n"

                // if out_elempack == 4
                "cmp        %12, #4             \n"
                "bne        8f                  \n"

                "vstm       %3!, {d24-d31}      \n"
                "b          9f                  \n"

                // if out_elempack == 1
                "8:                             \n"
                // transpose4x4
                "vtrn.32    q12, q13            \n"
                "vtrn.32    q14, q15            \n"
                "vswp       d25, d28            \n"
                "vswp       d27, d30            \n"

                "add        r4, %3, %13, lsl #2 \n"
                "vst1.f32   {d24-d25}, [%3 :128]! \n"
                "vst1.f32   {d26-d27}, [r4 :128] \n"
                "add        r4, r4, %13, lsl #2 \n"
                "vst1.f32   {d28-d29}, [r4 :128] \n"
                "add        r4, r4, %13, lsl #2 \n"
                "vst1.f32   {d30-d31}, [r4 :128] \n"

                "9:                             \n"
                "add        %0, %0, #64         \n"
                "b          11f                 \n"

                "10:                            \n"
                "vstm       %0!, {d24-d31}      \n"

                "11:                            \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;
            float32x4_t _sum3;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vld1q_f32(pC);
                    _sum1 = _sum0;
                    _sum2 = _sum0;
                    _sum3 = _sum0;
                }
                else
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                    _sum2 = vdupq_n_f32(0.f);
                    _sum3 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4 * 1);
                _sum2 = vld1q_f32(outptr + 4 * 2);
                _sum3 = vld1q_f32(outptr + 4 * 3);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA = vld1q_f32(pA);
                float32x4_t _pB = vld1q_f32(pB);

#if __aarch64__
                _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB, 3);
#else
                _sum0 = vmlaq_lane_f32(_sum0, _pA, vget_low_f32(_pB), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pA, vget_low_f32(_pB), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _pA, vget_high_f32(_pB), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _pA, vget_high_f32(_pB), 1);
#endif

                pA += 4;
                pB += 4;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);
                    vst1q_f32(outptr0 + 4 * 2, _sum2);
                    vst1q_f32(outptr0 + 4 * 3, _sum3);
                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    transpose4x4_ps(_sum0, _sum1, _sum2, _sum3);

                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + out_hstep * 1, _sum1);
                    vst1q_f32(outptr0 + out_hstep * 2, _sum2);
                    vst1q_f32(outptr0 + out_hstep * 3, _sum3);
                    outptr0 += 4;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
            }

            outptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 4x2 tile: handles two columns per iteration.
        //
        // Per k step the kernel reads 4 floats from pA and 2 floats from pB and
        // updates two 4-lane accumulators (one per column).
        //   - k == 0      : accumulators start from the 4 floats at pC (same vector
        //                   for both columns) or from zero when pC is null.
        //   - k != 0      : accumulators are reloaded from the 8 floats at outptr.
        //   - k_end       : results go to outptr0, either as two packed 4-float
        //                   vectors (out_elempack == 4) or scattered over 4 rows
        //                   spaced out_hstep floats apart (out_elempack == 1).
        //   - otherwise   : the 8 partial sums are written back to outptr.
        // outptr advances by 8 floats on every path. pB is not reset here; it keeps
        // advancing from where the previous iteration left it, while pA restarts
        // from pAT each iteration.
        //
        // NOTE(review): the asm paths split the k loop over two accumulator pairs
        // and add them afterwards, whereas the intrinsic fallback accumulates
        // sequentially, so the summation order is not the same between the two.
        for (; jj + 1 < max_jj; jj += 2)
        {
            const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                // k == 0 -> initialise, otherwise reload partial sums from outptr
                "cbz    %w10, 0f                    \n"

                "ld1    {v30.4s, v31.4s}, [%0]      \n"
                "b      3f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                "ld1    {v30.4s}, [%8]              \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"

                // second column starts from the same vector as the first
                "2:                                 \n"
                "mov    v31.16b, v30.16b            \n"

                "3:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // v28/v29 collect k steps 0 and 2 of each unrolled group,
                // v30/v31 collect k steps 1 and 3; they are merged after the loop
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v29.16b, v29.16b, v29.16b   \n"
                "4:                                 \n"
                "prfm   pldl1keep, [%2, #256]       \n"
                "ld1    {v0.4s, v1.4s}, [%2], #32   \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
                "fmla   v28.4s, v16.4s, v0.s[0]     \n"
                "fmla   v29.4s, v16.4s, v0.s[1]     \n"
                "fmla   v30.4s, v17.4s, v0.s[2]     \n"
                "fmla   v31.4s, v17.4s, v0.s[3]     \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v28.4s, v18.4s, v1.s[0]     \n"
                "fmla   v29.4s, v18.4s, v1.s[1]     \n"
                "fmla   v30.4s, v19.4s, v1.s[2]     \n"
                "fmla   v31.4s, v19.4s, v1.s[3]     \n"
                "bne    4b                          \n"
                "fadd   v30.4s, v30.4s, v28.4s      \n"
                "fadd   v31.4s, v31.4s, v29.4s      \n"

                "5:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    7f                          \n"

                // one k step per iteration for the remaining max_kk & 3 steps
                "6:                                 \n"
                "ld1    {v0.2s}, [%2], #8           \n"
                "ld1    {v16.4s}, [%1], #16         \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v30.4s, v16.4s, v0.s[0]     \n"
                "fmla   v31.4s, v16.4s, v0.s[1]     \n"
                "bne    6b                          \n"

                // only the low 8 bits of k_end are tested
                "7:                                 \n"
                "tst    %w11, #255                  \n"
                "beq    10f                         \n"

                // if out_elempack == 4
                "cmp    %w12, #4                    \n"
                "bne    8f                          \n"

                "st1    {v30.4s, v31.4s}, [%3], #32 \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"
                // transpose4x2
                "zip1   v28.4s, v30.4s, v31.4s      \n"
                "zip2   v29.4s, v30.4s, v31.4s      \n"

                // x4 walks the rows: outptr0 + out_hstep * {1,2,3} floats
                "add    x4, %3, %w13, sxtw 2        \n"
                "st1    {v28.d}[0], [%3], #8        \n"
                "st1    {v28.d}[1], [x4]            \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v29.d}[0], [x4]            \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v29.d}[1], [x4]            \n"

                // final result was written to outptr0; still skip this tile's
                // 8-float slot in outptr
                "9:                                 \n"
                "add    %0, %0, #32                 \n"
                "b      11f                         \n"

                // not the last k block: save partial sums to outptr
                "10:                                \n"
                "st1    {v30.4s, v31.4s}, [%0], #32 \n"

                "11:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v1", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31");
#else  // __aarch64__
            asm volatile(
                // k == 0 -> initialise, otherwise reload partial sums from outptr
                "cmp        %10, #0             \n"
                "beq        0f                  \n"

                "vld1.f32   {d28-d31}, [%0 :128] \n"
                "b          3f                  \n"

                "0:                             \n"
                // if pC
                "cmp        %8, #0              \n"
                "beq        1f                  \n"

                "vld1.f32   {d28-d29}, [%8]     \n"
                "b          2f                  \n"

                // else
                "1:                             \n"
                "veor       q14, q14            \n"

                // second column starts from the same vector as the first
                "2:                             \n"
                "vmov       q15, q14            \n"

                "3:                             \n"
                "lsr        r4, %9, #2          \n" // r4 = max_kk >> 2
                "cmp        r4, #0              \n"
                "beq        5f                  \n"

                // q12/q13 collect k steps 0 and 2 of each unrolled group,
                // q14/q15 collect k steps 1 and 3; they are merged after the loop
                "veor       q12, q12            \n"
                "veor       q13, q13            \n"
                "4:                             \n"
                "pld        [%2, #256]          \n"
                "vld1.f32   {d0-d3}, [%2 :128]! \n"
                "pld        [%1, #512]          \n"
                "vldm       %1!, {d8-d15}       \n"
                "vmla.f32   q12, q4, d0[0]      \n"
                "vmla.f32   q13, q4, d0[1]      \n"
                "vmla.f32   q14, q5, d1[0]      \n"
                "vmla.f32   q15, q5, d1[1]      \n"
                "subs       r4, r4, #1          \n"
                "vmla.f32   q12, q6, d2[0]      \n"
                "vmla.f32   q13, q6, d2[1]      \n"
                "vmla.f32   q14, q7, d3[0]      \n"
                "vmla.f32   q15, q7, d3[1]      \n"
                "bne        4b                  \n"
                "vadd.f32   q14, q14, q12       \n"
                "vadd.f32   q15, q15, q13       \n"

                "5:                             \n"
                "and        r4, %9, #3          \n" // r4 = remain = max_kk & 3
                "cmp        r4, #0              \n"
                "beq        7f                  \n"

                // one k step per iteration for the remaining max_kk & 3 steps
                "6:                             \n"
                "vld1.f32   {d0}, [%2 :64]!     \n"
                "vld1.f32   {d8-d9}, [%1 :128]! \n"
                "subs       r4, r4, #1          \n"
                "vmla.f32   q14, q4, d0[0]      \n"
                "vmla.f32   q15, q4, d0[1]      \n"
                "bne        6b                  \n"

                "7:                             \n"
                "cmp        %11, #0             \n"
                "beq        10f                 \n"

                // if out_elempack == 4
                "cmp        %12, #4             \n"
                "bne        8f                  \n"

                "vst1.f32   {d28-d31}, [%3 :128]! \n"
                "b          9f                  \n"

                // if out_elempack == 1
                "8:                             \n"
                // transpose4x2
                "vtrn.32    q14, q15            \n"

                // after vtrn: d28 = row 0, d30 = row 1, d29 = row 2, d31 = row 3
                // r4 walks the rows: outptr0 + out_hstep * {1,2,3} floats
                "add        r4, %3, %13, lsl #2 \n"
                "vst1.f32   {d28}, [%3 :64]!    \n"
                "vst1.f32   {d30}, [r4 :64]     \n"
                "add        r4, r4, %13, lsl #2 \n"
                "vst1.f32   {d29}, [r4 :64]     \n"
                "add        r4, r4, %13, lsl #2 \n"
                "vst1.f32   {d31}, [r4 :64]     \n"

                // final result was written to outptr0; still skip this tile's
                // 8-float slot in outptr
                "9:                             \n"
                "add        %0, %0, #32         \n"
                "b          11f                 \n"

                // not the last k block: save partial sums to outptr
                "10:                            \n"
                "vst1.f32   {d28-d31}, [%0 :128]! \n"

                "11:                            \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "r4", "q0", "q1", "q4", "q5", "q6", "q7", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            // Intrinsic fallback: _sum0 / _sum1 hold the 4 row results for the
            // first / second column of this tile.
            float32x4_t _sum0;
            float32x4_t _sum1;

            if (k == 0)
            {
                if (pC)
                {
                    // both columns start from the same 4 floats at pC
                    _sum0 = vld1q_f32(pC);
                    _sum1 = _sum0;
                }
                else
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                // resume from the partial sums saved by an earlier k block
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA = vld1q_f32(pA);
                float32x2_t _pB = vld1_f32(pB);

                // lane 0 of _pB feeds column 0, lane 1 feeds column 1
#if __aarch64__
                _sum0 = vfmaq_lane_f32(_sum0, _pA, _pB, 0);
                _sum1 = vfmaq_lane_f32(_sum1, _pA, _pB, 1);
#else
                _sum0 = vmlaq_lane_f32(_sum0, _pA, _pB, 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pA, _pB, 1);
#endif

                pA += 4;
                pB += 2;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    // spill to scalars and scatter: lane r goes to row r
                    // (rows are out_hstep floats apart), column offset 0 / 1
                    float sum0[4];
                    float sum1[4];
                    vst1q_f32(sum0, _sum0);
                    vst1q_f32(sum1, _sum1);

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[1] = sum1[0];
                    outptr0[out_hstep + 1] = sum1[1];
                    outptr0[out_hstep * 2 + 1] = sum1[2];
                    outptr0[out_hstep * 3 + 1] = sum1[3];
                    outptr0 += 2;
                }
            }
            else
            {
                // not the last k block: save partial sums to outptr
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
            }

            outptr += 8;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 4x1 tile: handles one column per iteration.
        //
        // Per k step the kernel reads 4 floats from pA and 1 float from pB
        // (broadcast to all lanes) and updates one 4-lane accumulator.
        //   - k == 0      : accumulator starts from the 4 floats at pC, or from
        //                   zero when pC is null.
        //   - k != 0      : accumulator is reloaded from the 4 floats at outptr.
        //   - k_end       : result goes to outptr0, either as one packed 4-float
        //                   vector (out_elempack == 4) or one float in each of 4
        //                   rows spaced out_hstep floats apart (out_elempack == 1).
        //   - otherwise   : the 4 partial sums are written back to outptr.
        // outptr advances by 4 floats on every path. pA restarts from pAT each
        // iteration; pB keeps advancing.
        //
        // NOTE(review): the asm paths spread the unrolled k loop over four
        // accumulators and add them afterwards, whereas the intrinsic fallback
        // accumulates sequentially, so the summation order is not the same.
        for (; jj < max_jj; jj += 1)
        {
            const float* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                // k == 0 -> initialise, otherwise reload partial sums from outptr
                "cbz    %w10, 0f                    \n"

                "ld1    {v31.4s}, [%0]              \n"
                "b      2f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                "ld1    {v31.4s}, [%8]              \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                "eor    v31.16b, v31.16b, v31.16b   \n"

                "2:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    4f                          \n"

                // v28/v29/v30 are extra accumulators: each of the 4 unrolled
                // k steps goes to its own register, merged into v31 after the loop
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v29.16b, v29.16b, v29.16b   \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"
                "3:                                 \n"
                "prfm   pldl1keep, [%2, #128]       \n"
                "ld1    {v0.4s}, [%2], #16          \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
                "fmla   v28.4s, v16.4s, v0.s[0]     \n"
                "fmla   v29.4s, v17.4s, v0.s[1]     \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v30.4s, v18.4s, v0.s[2]     \n"
                "fmla   v31.4s, v19.4s, v0.s[3]     \n"
                "bne    3b                          \n"
                "fadd   v30.4s, v30.4s, v28.4s      \n"
                "fadd   v31.4s, v31.4s, v29.4s      \n"
                "fadd   v31.4s, v31.4s, v30.4s      \n"

                "4:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    6f                          \n"

                // one k step per iteration; ld1r broadcasts the single B value
                "5:                                 \n"
                "ld1r   {v0.4s}, [%2], #4           \n"
                "ld1    {v16.4s}, [%1], #16         \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v31.4s, v16.4s, v0.4s       \n"
                "bne    5b                          \n"

                // only the low 8 bits of k_end are tested
                "6:                                 \n"
                "tst    %w11, #255                  \n"
                "beq    9f                          \n"

                // if out_elempack == 4
                "cmp    %w12, #4                    \n"
                "bne    7f                          \n"

                "st1    {v31.4s}, [%3], #16         \n"
                "b      8f                          \n"

                // if out_elempack == 1
                // lane r is stored to row r; x4 walks outptr0 + out_hstep * {1,2,3}
                "7:                                 \n"
                "add    x4, %3, %w13, sxtw 2        \n"
                "st1    {v31.s}[0], [%3], #4        \n"
                "st1    {v31.s}[1], [x4]            \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v31.s}[2], [x4]            \n"
                "add    x4, x4, %w13, sxtw 2        \n"
                "st1    {v31.s}[3], [x4]            \n"

                // final result was written to outptr0; still skip this tile's
                // 4-float slot in outptr
                "8:                                 \n"
                "add    %0, %0, #16                 \n"
                "b      10f                         \n"

                // not the last k block: save partial sums to outptr
                "9:                                 \n"
                "st1    {v31.4s}, [%0], #16         \n"

                "10:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31");
#else  // __aarch64__
            asm volatile(
                // k == 0 -> initialise, otherwise reload partial sums from outptr
                "cmp        %10, #0             \n"
                "beq        0f                  \n"

                "vld1.f32   {d30-d31}, [%0 :128] \n"
                "b          2f                  \n"

                "0:                             \n"
                // if pC
                "cmp        %8, #0              \n"
                "beq        1f                  \n"

                "vld1.f32   {d30-d31}, [%8]     \n"
                "b          2f                  \n"

                // else
                "1:                             \n"
                "veor       q15, q15            \n"

                "2:                             \n"
                "lsr        r4, %9, #2          \n" // r4 = max_kk >> 2
                "cmp        r4, #0              \n"
                "beq        4f                  \n"

                // q12/q13/q14 are extra accumulators: each of the 4 unrolled
                // k steps goes to its own register, merged into q15 after the loop
                "veor       q12, q12            \n"
                "veor       q13, q13            \n"
                "veor       q14, q14            \n"
                "3:                             \n"
                "pld        [%2, #128]          \n"
                "vld1.f32   {d0-d1}, [%2 :64]!  \n"
                "pld        [%1, #512]          \n"
                "vldm       %1!, {d8-d15}       \n"
                "vmla.f32   q12, q4, d0[0]      \n"
                "vmla.f32   q13, q5, d0[1]      \n"
                "subs       r4, r4, #1          \n"
                "vmla.f32   q14, q6, d1[0]      \n"
                "vmla.f32   q15, q7, d1[1]      \n"
                "bne        3b                  \n"
                "vadd.f32   q14, q14, q12       \n"
                "vadd.f32   q15, q15, q13       \n"
                "vadd.f32   q15, q15, q14       \n"

                "4:                             \n"
                "and        r4, %9, #3          \n" // r4 = remain = max_kk & 3
                "cmp        r4, #0              \n"
                "beq        6f                  \n"

                // one k step per iteration; a single B float is loaded into d0[0]
                "5:                             \n"
                "vld1.f32   {d0[0]}, [%2]!      \n"
                "vld1.f32   {d8-d9}, [%1 :128]! \n"
                "subs       r4, r4, #1          \n"
                "vmla.f32   q15, q4, d0[0]      \n"
                "bne        5b                  \n"

                "6:                             \n"
                "cmp        %11, #0             \n"
                "beq        9f                  \n"

                // if out_elempack == 4
                "cmp        %12, #4             \n"
                "bne        7f                  \n"

                "vst1.f32   {d30-d31}, [%3 :128]! \n"
                "b          8f                  \n"

                // if out_elempack == 1
                // lane r is stored to row r; r4 walks outptr0 + out_hstep * {1,2,3}
                "7:                             \n"
                "add        r4, %3, %13, lsl #2 \n"
                "vst1.f32   {d30[0]}, [%3]!     \n"
                "vst1.f32   {d30[1]}, [r4]      \n"
                "add        r4, r4, %13, lsl #2 \n"
                "vst1.f32   {d31[0]}, [r4]      \n"
                "add        r4, r4, %13, lsl #2 \n"
                "vst1.f32   {d31[1]}, [r4]      \n"

                // final result was written to outptr0; still skip this tile's
                // 4-float slot in outptr
                "8:                             \n"
                "add        %0, %0, #16         \n"
                "b          10f                 \n"

                // not the last k block: save partial sums to outptr
                "9:                             \n"
                "vst1.f32   {d30-d31}, [%0 :128]! \n"

                "10:                            \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "r4", "q0", "q4", "q5", "q6", "q7", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            // Intrinsic fallback: _sum0 holds the 4 row results for this column.
            float32x4_t _sum0;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vld1q_f32(pC);
                }
                else
                {
                    _sum0 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                // resume from the partial sums saved by an earlier k block
                _sum0 = vld1q_f32(outptr);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA = vld1q_f32(pA);
                // broadcast the single B value for this k step to all 4 lanes
                float32x4_t _pB = vdupq_n_f32(pB[0]);

#if __aarch64__
                _sum0 = vfmaq_f32(_sum0, _pA, _pB);
#else
                _sum0 = vmlaq_f32(_sum0, _pA, _pB);
#endif

                pA += 4;
                pB += 1;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1q_f32(outptr0, _sum0);
                    outptr0 += 4;
                }
                if (out_elempack == 1)
                {
                    // spill to scalars and scatter: lane r goes to row r
                    // (rows are out_hstep floats apart)
                    float sum0[4];
                    vst1q_f32(sum0, _sum0);

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0++;
                }
            }
            else
            {
                // not the last k block: save partial sums to outptr
                vst1q_f32(outptr, _sum0);
            }

            outptr += 4;
#endif // NCNN_GNU_INLINE_ASM
        }

        pAT += max_kk * 4;
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        float* outptr0 = (float*)top_blob + (i + ii) * out_hstep + j;

        const float* pB = pBT;

        if (pC)
        {
            pC = (const float*)CT_tile + i + ii;
        }

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        // 2x12 tile (aarch64 only): two output rows, twelve columns per iteration.
        //
        // Each k step reads 2 floats from pA (one per row) and 12 floats from pB
        // (one per column). Partial sums are kept in outptr with the two rows
        // interleaved element-by-element (vst2q / vld2q); final results are
        // written row-major to outptr0 and outptr0 + out_hstep.
        for (; jj + 11 < max_jj; jj += 12)
        {
            // _r0x : row 0 accumulators, _r1x : row 1 accumulators
            // (a/b/c = columns 0-3 / 4-7 / 8-11)
            float32x4_t _r0a;
            float32x4_t _r0b;
            float32x4_t _r0c;
            float32x4_t _r1a;
            float32x4_t _r1b;
            float32x4_t _r1c;

            if (k != 0)
            {
                // resume: de-interleave the saved partial sums back into rows
                const float32x4x2_t _ta = vld2q_f32(outptr);
                const float32x4x2_t _tb = vld2q_f32(outptr + 8);
                const float32x4x2_t _tc = vld2q_f32(outptr + 16);
                _r0a = _ta.val[0];
                _r1a = _ta.val[1];
                _r0b = _tb.val[0];
                _r1b = _tb.val[1];
                _r0c = _tc.val[0];
                _r1c = _tc.val[1];
            }
            else if (pC)
            {
                // first k block with pC: pC[0] seeds row 0, pC[1] seeds row 1
                const float32x4_t _c0 = vdupq_n_f32(pC[0]);
                const float32x4_t _c1 = vdupq_n_f32(pC[1]);
                _r0a = _c0;
                _r0b = _c0;
                _r0c = _c0;
                _r1a = _c1;
                _r1b = _c1;
                _r1c = _c1;
            }
            else
            {
                // first k block without pC: start from zero
                const float32x4_t _zero = vdupq_n_f32(0.f);
                _r0a = _zero;
                _r0b = _zero;
                _r0c = _zero;
                _r1a = _zero;
                _r1b = _zero;
                _r1c = _zero;
            }

            const float* pA = pAT;
            for (int kk = 0; kk < max_kk; kk++)
            {
                const float32x2_t _a = vld1_f32(pA);

                const float32x4_t _b0 = vld1q_f32(pB);
                const float32x4_t _b1 = vld1q_f32(pB + 4);
                const float32x4_t _b2 = vld1q_f32(pB + 8);

                // lane 0 of _a multiplies into row 0, lane 1 into row 1
                _r0a = vfmaq_lane_f32(_r0a, _b0, _a, 0);
                _r0b = vfmaq_lane_f32(_r0b, _b1, _a, 0);
                _r0c = vfmaq_lane_f32(_r0c, _b2, _a, 0);
                _r1a = vfmaq_lane_f32(_r1a, _b0, _a, 1);
                _r1b = vfmaq_lane_f32(_r1b, _b1, _a, 1);
                _r1c = vfmaq_lane_f32(_r1c, _b2, _a, 1);

                pB += 12;
                pA += 2;
            }

            if (!k_end)
            {
                // more k blocks to come: save partial sums, rows interleaved
                float32x4x2_t _ta;
                float32x4x2_t _tb;
                float32x4x2_t _tc;
                _ta.val[0] = _r0a;
                _ta.val[1] = _r1a;
                _tb.val[0] = _r0b;
                _tb.val[1] = _r1b;
                _tc.val[0] = _r0c;
                _tc.val[1] = _r1c;
                vst2q_f32(outptr, _ta);
                vst2q_f32(outptr + 8, _tb);
                vst2q_f32(outptr + 16, _tc);
            }
            else
            {
                // last k block: write both rows to the output
                // (only the out_elempack == 1 layout is handled here)
                float* const row0 = outptr0;
                float* const row1 = outptr0 + out_hstep;

                vst1q_f32(row0, _r0a);
                vst1q_f32(row0 + 4, _r0b);
                vst1q_f32(row0 + 8, _r0c);
                vst1q_f32(row1, _r1a);
                vst1q_f32(row1 + 4, _r1b);
                vst1q_f32(row1 + 8, _r1c);
                outptr0 += 12;
            }

            outptr += 24;
        }
#endif // __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;

            if (k == 0)
            {
                if (pC)
                {
                    _sum00 = vdupq_n_f32(pC[0]);
                    _sum01 = vdupq_n_f32(pC[0]);
                    _sum10 = vdupq_n_f32(pC[1]);
                    _sum11 = vdupq_n_f32(pC[1]);
                }
                else
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                float32x4x2_t _tmp01 = vld2q_f32(outptr);
                float32x4x2_t _tmp23 = vld2q_f32(outptr + 8);
                _sum00 = _tmp01.val[0];
                _sum01 = _tmp23.val[0];
                _sum10 = _tmp01.val[1];
                _sum11 = _tmp23.val[1];
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB0 = vld1q_f32(pB);
                float32x4_t _pB1 = vld1q_f32(pB + 4);

                float32x2_t _pA = vld1_f32(pA);
#if __aarch64__
                _sum00 = vfmaq_lane_f32(_sum00, _pB0, _pA, 0);
                _sum01 = vfmaq_lane_f32(_sum01, _pB1, _pA, 0);
                _sum10 = vfmaq_lane_f32(_sum10, _pB0, _pA, 1);
                _sum11 = vfmaq_lane_f32(_sum11, _pB1, _pA, 1);
#else
                _sum00 = vmlaq_lane_f32(_sum00, _pB0, _pA, 0);
                _sum01 = vmlaq_lane_f32(_sum01, _pB1, _pA, 0);
                _sum10 = vmlaq_lane_f32(_sum10, _pB0, _pA, 1);
                _sum11 = vmlaq_lane_f32(_sum11, _pB1, _pA, 1);
#endif

                pA += 2;
                pB += 8;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + 4, _sum01);
                    vst1q_f32(outptr0 + out_hstep, _sum10);
                    vst1q_f32(outptr0 + out_hstep + 4, _sum11);
                    outptr0 += 8;
                }
            }
            else
            {
                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum00;
                _tmp01.val[1] = _sum10;
                float32x4x2_t _tmp23;
                _tmp23.val[0] = _sum01;
                _tmp23.val[1] = _sum11;
                vst2q_f32(outptr, _tmp01);
                vst2q_f32(outptr + 8, _tmp23);
            }

            outptr += 16;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vdupq_n_f32(pC[0]);
                    _sum1 = vdupq_n_f32(pC[1]);
                }
                else
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                float32x4x2_t _tmp01 = vld2q_f32(outptr);
                _sum0 = _tmp01.val[0];
                _sum1 = _tmp01.val[1];
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB = vld1q_f32(pB);

                float32x2_t _pA = vld1_f32(pA);
#if __aarch64__
                _sum0 = vfmaq_lane_f32(_sum0, _pB, _pA, 0);
                _sum1 = vfmaq_lane_f32(_sum1, _pB, _pA, 1);
#else
                _sum0 = vmlaq_lane_f32(_sum0, _pB, _pA, 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pB, _pA, 1);
#endif

                pA += 2;
                pB += 4;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + out_hstep, _sum1);
                    outptr0 += 4;
                }
            }
            else
            {
                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum0;
                _tmp01.val[1] = _sum1;
                vst2q_f32(outptr, _tmp01);
            }

            outptr += 8;
        }
#endif // __ARM_NEON
        for (; jj + 1 < max_jj; jj += 2)
        {
            float sum00;
            float sum01;
            float sum10;
            float sum11;

            if (k == 0)
            {
                if (pC)
                {
                    sum00 = pC[0];
                    sum01 = pC[1];
                    sum10 = pC[0];
                    sum11 = pC[1];
                }
                else
                {
                    sum00 = 0.f;
                    sum01 = 0.f;
                    sum10 = 0.f;
                    sum11 = 0.f;
                }
            }
            else
            {
                sum00 = outptr[0];
                sum01 = outptr[1];
                sum10 = outptr[2];
                sum11 = outptr[3];
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum00 += pA[0] * pB[0];
                sum01 += pA[1] * pB[0];
                sum10 += pA[0] * pB[1];
                sum11 += pA[1] * pB[1];

                pA += 2;
                pB += 2;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum00;
                    outptr0[1] = sum10;
                    outptr0[out_hstep] = sum01;
                    outptr0[out_hstep + 1] = sum11;
                    outptr0 += 2;
                }
            }
            else
            {
                outptr[0] = sum00;
                outptr[1] = sum01;
                outptr[2] = sum10;
                outptr[3] = sum11;
            }

            outptr += 4;
        }
        for (; jj < max_jj; jj += 1)
        {
            float sum0;
            float sum1;

            if (k == 0)
            {
                if (pC)
                {
                    sum0 = pC[0];
                    sum1 = pC[1];
                }
                else
                {
                    sum0 = 0.f;
                    sum1 = 0.f;
                }
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum0 += pA[0] * pB[0];
                sum1 += pA[1] * pB[0];
                pA += 2;
                pB += 1;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum0;
                    outptr0[out_hstep] = sum1;
                    outptr0++;
                }
            }
            else
            {
                outptr[0] = sum0;
                outptr[1] = sum1;
            }

            outptr += 2;
        }

        pAT += max_kk * 2;
    }
    for (; ii < max_ii; ii += 1)
    {
        float* outptr0 = (float*)top_blob + (i + ii) * out_hstep + j;

        const float* pB = pBT;

        if (pC)
        {
            pC = (const float*)CT_tile + i + ii;
        }

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vdupq_n_f32(pC[0]);
                    _sum1 = vdupq_n_f32(pC[0]);
                    _sum2 = vdupq_n_f32(pC[0]);
                }
                else
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                    _sum2 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4);
                _sum2 = vld1q_f32(outptr + 8);
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB0 = vld1q_f32(pB);
                float32x4_t _pB1 = vld1q_f32(pB + 4);
                float32x4_t _pB2 = vld1q_f32(pB + 8);

                float32x4_t _pA0 = vdupq_n_f32(pA[0]);

                _sum0 = vfmaq_f32(_sum0, _pA0, _pB0);
                _sum1 = vfmaq_f32(_sum1, _pA0, _pB1);
                _sum2 = vfmaq_f32(_sum2, _pA0, _pB2);

                pA += 1;
                pB += 12;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);
                    vst1q_f32(outptr0 + 8, _sum2);
                    outptr0 += 12;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 8, _sum2);
            }

            outptr += 12;
        }
#endif // __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vdupq_n_f32(pC[0]);
                    _sum1 = vdupq_n_f32(pC[0]);
                }
                else
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4);
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB0 = vld1q_f32(pB);
                float32x4_t _pB1 = vld1q_f32(pB + 4);

                float32x4_t _pA0 = vdupq_n_f32(pA[0]);
#if __aarch64__
                _sum0 = vfmaq_f32(_sum0, _pA0, _pB0);
                _sum1 = vfmaq_f32(_sum1, _pA0, _pB1);
#else
                _sum0 = vmlaq_f32(_sum0, _pA0, _pB0);
                _sum1 = vmlaq_f32(_sum1, _pA0, _pB1);
#endif

                pA += 1;
                pB += 8;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);
                    outptr0 += 8;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
            }

            outptr += 8;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _sum;

            if (k == 0)
            {
                if (pC)
                {
                    _sum = vdupq_n_f32(pC[0]);
                }
                else
                {
                    _sum = vdupq_n_f32(0.f);
                }
            }
            else
            {
                _sum = vld1q_f32(outptr);
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB = vld1q_f32(pB);
                float32x4_t _pA = vdupq_n_f32(pA[0]);

#if __aarch64__
                _sum = vfmaq_f32(_sum, _pA, _pB);
#else
                _sum = vmlaq_f32(_sum, _pA, _pB);
#endif

                pA += 1;
                pB += 4;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1q_f32(outptr0, _sum);
                    outptr0 += 4;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum);
            }

            outptr += 4;
        }
#endif // __ARM_NEON
        for (; jj + 1 < max_jj; jj += 2)
        {
            float sum0;
            float sum1;

            if (k == 0)
            {
                if (pC)
                {
                    sum0 = pC[0];
                    sum1 = pC[0];
                }
                else
                {
                    sum0 = 0.f;
                    sum1 = 0.f;
                }
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum0 += pA[0] * pB[0];
                sum1 += pA[0] * pB[1];

                pA += 1;
                pB += 2;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum0;
                    outptr0[1] = sum1;
                    outptr0 += 2;
                }
            }
            else
            {
                outptr[0] = sum0;
                outptr[1] = sum1;
            }

            outptr += 2;
        }
        for (; jj < max_jj; jj += 1)
        {
            float sum;

            if (k == 0)
            {
                if (pC)
                {
                    sum = pC[0];
                }
                else
                {
                    sum = 0.f;
                }
            }
            else
            {
                sum = outptr[0];
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum += pA[0] * pB[0];
                pA += 1;
                pB += 1;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum;
                    outptr0++;
                }
            }
            else
            {
                outptr[0] = sum;
            }

            outptr += 1;
        }

        pAT += max_kk;
    }
}

// Pick the GEMM tile sizes used by the im2col convolution.
//
// M, N, K   problem dimensions; when N <= 0 TILE_N is left untouched
// TILE_M, TILE_N, TILE_K   receive the chosen tile sizes
// nT        thread count; 0 means "use get_physical_big_cpu_count()"
//
// The sizes are derived from the level-2 cache capacity (counted in fp32
// elements) and rounded to a per-architecture granularity.
static void convolution_im2col_gemm_get_optimal_tile_mnk(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
    // per-architecture constants
    //   align_mk   granularity (and lower bound) of TILE_M and TILE_K
    //   align_n    granularity (and lower bound) of TILE_N
    //   k_reserve / k_divisor   constants of the K budget formula
    //   m_chunk    block size used for the initial split of M
#if __aarch64__
    const int align_mk = 8;
    const int align_n = 4;
    const int k_reserve = 32;
    const int k_divisor = 12;
    const int m_chunk = 32;
#elif __ARM_NEON
    const int align_mk = 4;
    const int align_n = 4;
    const int k_reserve = 16;
    const int k_divisor = 8;
    const int m_chunk = 16;
#else
    const int align_mk = 2;
    const int align_n = 1;
    const int k_reserve = 2;
    const int k_divisor = 3;
    const int m_chunk = 8;
#endif

    // cache capacity in fp32 elements
    const int l2_floats = (int)(get_cpu_level2_cache_size() / sizeof(float));

    if (nT == 0)
        nT = get_physical_big_cpu_count();

    // K: prefer a single block; otherwise spread K evenly over the blocks
    {
        const int budget = (l2_floats - k_reserve) / k_divisor;
        const int k_cap = std::max(align_mk, budget / align_mk * align_mk);

        const int k_blocks = (K + k_cap - 1) / k_cap;
        const int k_even = ((K + k_blocks - 1) / k_blocks + align_mk - 1) / align_mk * align_mk;

        TILE_K = std::min(k_cap, k_even);
    }

    // M: start from an even split into m_chunk-sized blocks ...
    int tile_m;
    {
        const int m_blocks = (M + m_chunk - 1) / m_chunk;
        const int m_even = ((M + m_blocks - 1) / m_blocks + align_mk - 1) / align_mk * align_mk;

        tile_m = std::max(align_mk, m_even);
    }

    // ... scale by the thread count, re-balance, then cap per thread
    {
        tile_m *= std::min(nT, get_physical_cpu_count());

        const int m_blocks = (M + tile_m - 1) / tile_m;
        const int m_even = ((M + m_blocks - 1) / m_blocks + align_mk - 1) / align_mk * align_mk;
        tile_m = std::min(tile_m, m_even);

        if (nT > 1)
        {
            const int per_thread = (std::max(1, tile_m / nT) + align_mk - 1) / align_mk * align_mk;
            tile_m = std::min(tile_m, per_thread);
        }
    }
    TILE_M = tile_m;

    if (N <= 0)
        return;

    // N: whatever is left of the cache budget once the M x K tile is accounted for
    const int remaining = l2_floats - TILE_M * TILE_K;
    const int n_budget = (TILE_K >= K) ? remaining / TILE_K : remaining / (TILE_M + TILE_K);

    int tile_n = std::max(align_n, n_budget / align_n * align_n);

    const int n_blocks = (N + tile_n - 1) / tile_n;
    const int n_even = ((N + n_blocks - 1) / n_blocks + align_n - 1) / align_n * align_n;
    tile_n = std::min(tile_n, n_even);

    TILE_N = std::max(align_n, tile_n);
}

// Pack one tile of the input into B for the 1x1 / stride 1 / dilation 1 case.
//
// bottom_blob  source blob; only elempack 4 (NEON builds) and elempack 1 are handled
// B            destination, written sequentially from its start
// j, max_jj    first position and number of positions of the tile
// k, max_kk    first channel and number of channels of the tile
//
// Positions are consumed in groups of 12 (aarch64 only), 8, 4, 2 and 1.
// For every group the output holds, channel after channel, the values of that
// channel for all positions of the group, so elempack 4 input is transposed
// while elempack 1 input is copied row by row.
// NOTE(review): the elempack 4 paths use k / 4 and max_kk / 4, so they assume
// k and max_kk are multiples of 4 - confirm with the callers.
static void convolution_im2col_input_tile_conv1x1s1d1(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk)
{
    const int elempack = bottom_blob.elempack;

    // running write cursor into B, shared by all loops below
    float* pp = B;

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    // groups of 12 positions
    for (; jj + 11 < max_jj; jj += 12)
    {
        if (elempack == 4)
        {
            const float* p0 = (const float*)bottom_blob.channel(k / 4) + (j + jj) * 4;

            int kk = 0;
            for (; kk < max_kk / 4; kk++)
            {
                // transpose4x12
#if NCNN_GNU_INLINE_ASM
                // three ld4 de-interleave 3 x 4 positions into per-channel vectors;
                // the stores emit channel 0 (v0 v4 v8), then channel 1, 2 and 3.
                // p0 is post-incremented twice by 64 bytes and rewound by 128,
                // pp is advanced by the post-indexed stores (48 floats in total).
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] \n"
                    "st1    {v0.4s}, [%1], #16          \n"
                    "st1    {v4.4s}, [%1], #16          \n"
                    "st1    {v8.4s}, [%1], #16          \n"
                    "sub    %0, %0, #128                \n"
                    "st1    {v1.4s}, [%1], #16          \n"
                    "st1    {v5.4s}, [%1], #16          \n"
                    "st1    {v9.4s}, [%1], #16          \n"
                    "st1    {v2.4s}, [%1], #16          \n"
                    "st1    {v6.4s}, [%1], #16          \n"
                    "st1    {v10.4s}, [%1], #16         \n"
                    "st1    {v3.4s}, [%1], #16          \n"
                    "st1    {v7.4s}, [%1], #16          \n"
                    "st1    {v11.4s}, [%1], #16         \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
#else  // NCNN_GNU_INLINE_ASM
                // intrinsic equivalent: val[c] of each vld4q holds channel c of 4 positions
                float32x4x4_t _r0 = vld4q_f32(p0);
                float32x4x4_t _r1 = vld4q_f32(p0 + 16);
                float32x4x4_t _r2 = vld4q_f32(p0 + 32);
                vst1q_f32(pp, _r0.val[0]);
                vst1q_f32(pp + 4, _r1.val[0]);
                vst1q_f32(pp + 4 * 2, _r2.val[0]);
                vst1q_f32(pp + 4 * 3, _r0.val[1]);
                vst1q_f32(pp + 4 * 4, _r1.val[1]);
                vst1q_f32(pp + 4 * 5, _r2.val[1]);
                vst1q_f32(pp + 4 * 6, _r0.val[2]);
                vst1q_f32(pp + 4 * 7, _r1.val[2]);
                vst1q_f32(pp + 4 * 8, _r2.val[2]);
                vst1q_f32(pp + 4 * 9, _r0.val[3]);
                vst1q_f32(pp + 4 * 10, _r1.val[3]);
                vst1q_f32(pp + 4 * 11, _r2.val[3]);
                pp += 48;
#endif // NCNN_GNU_INLINE_ASM
                // step to the next packed channel
                p0 += bottom_blob.cstep * 4;
            }
        }

        if (elempack == 1)
        {
            const float* p0 = (const float*)bottom_blob.channel(k) + (j + jj);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                // plain copy of 12 consecutive floats per channel
                float32x4_t _r0 = vld1q_f32(p0);
                float32x4_t _r1 = vld1q_f32(p0 + 4);
                float32x4_t _r2 = vld1q_f32(p0 + 8);
                vst1q_f32(pp, _r0);
                vst1q_f32(pp + 4, _r1);
                vst1q_f32(pp + 8, _r2);
                pp += 12;
                p0 += bottom_blob.cstep;
            }
        }
    }
#endif // __aarch64__
    // groups of 8 positions
    for (; jj + 7 < max_jj; jj += 8)
    {
        if (elempack == 4)
        {
            const float* p0 = (const float*)bottom_blob.channel(k / 4) + (j + jj) * 4;

            int kk = 0;
            for (; kk < max_kk / 4; kk++)
            {
                // transpose4x8
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                // two ld4 de-interleave 2 x 4 positions; stores alternate between
                // the two halves so each channel's 8 values end up contiguous.
                // p0 is rewound by the 64 bytes of the single post-increment.
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n"
                    "st1    {v0.4s}, [%1], #16          \n"
                    "st1    {v4.4s}, [%1], #16          \n"
                    "st1    {v1.4s}, [%1], #16          \n"
                    "st1    {v5.4s}, [%1], #16          \n"
                    "sub    %0, %0, #64                 \n"
                    "st1    {v2.4s}, [%1], #16          \n"
                    "st1    {v6.4s}, [%1], #16          \n"
                    "st1    {v3.4s}, [%1], #16          \n"
                    "st1    {v7.4s}, [%1], #16          \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else  // __aarch64__
                // q0-q3 and q8-q11 each hold 4 positions x 4 channels.
                // vzip.32 plus the d-register swaps leave channels in the order
                // 0,2,1,3 inside each block; swapping q1<->q8 and q3<->q10 then
                // makes the two vstm write channel 0..3 with 8 values each,
                // matching the intrinsic fallback below.
                asm volatile(
                    "pld        [%0, #512]          \n"
                    "vldm       %0!, {d0-d7}        \n"
                    "pld        [%0, #512]          \n"
                    "vldm       %0, {d16-d23}       \n"
                    "vzip.32    q0, q1              \n"
                    "vzip.32    q2, q3              \n"
                    "vzip.32    q8, q9              \n"
                    "vzip.32    q10, q11            \n"
                    "vswp       d1, d4              \n"
                    "vswp       d3, d6              \n"
                    "vswp       d17, d20            \n"
                    "vswp       d19, d22            \n"
                    "vswp       q1, q8              \n"
                    "vswp       q3, q10             \n"
                    "sub        %0, %0, #64         \n"
                    "vstm       %1!, {d0-d7}        \n"
                    "vstm       %1!, {d16-d23}      \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                float32x4x4_t _r0 = vld4q_f32(p0);
                float32x4x4_t _r1 = vld4q_f32(p0 + 16);
                vst1q_f32(pp, _r0.val[0]);
                vst1q_f32(pp + 4, _r1.val[0]);
                vst1q_f32(pp + 4 * 2, _r0.val[1]);
                vst1q_f32(pp + 4 * 3, _r1.val[1]);
                vst1q_f32(pp + 4 * 4, _r0.val[2]);
                vst1q_f32(pp + 4 * 5, _r1.val[2]);
                vst1q_f32(pp + 4 * 6, _r0.val[3]);
                vst1q_f32(pp + 4 * 7, _r1.val[3]);
                pp += 32;
#endif // NCNN_GNU_INLINE_ASM
                // step to the next packed channel
                p0 += bottom_blob.cstep * 4;
            }
        }

        if (elempack == 1)
        {
            const float* p0 = (const float*)bottom_blob.channel(k) + (j + jj);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                // plain copy of 8 consecutive floats per channel
                float32x4_t _r0 = vld1q_f32(p0);
                float32x4_t _r1 = vld1q_f32(p0 + 4);
                vst1q_f32(pp, _r0);
                vst1q_f32(pp + 4, _r1);
                pp += 8;
                p0 += bottom_blob.cstep;
            }
        }
    }
    // groups of 4 positions
    for (; jj + 3 < max_jj; jj += 4)
    {
        if (elempack == 4)
        {
            const float* p0 = (const float*)bottom_blob.channel(k / 4) + (j + jj) * 4;

            int kk = 0;
            for (; kk < max_kk / 4; kk++)
            {
                // transpose4x4
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                // linear load + interleaving st4 performs the 4x4 transpose;
                // p0 is not modified here, pp advances by 64 bytes
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
                    "st4    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3");
#else  // __aarch64__
                // vtrn.32 on register pairs followed by d-register swaps
                // transposes the 4x4 block held in q0-q3
                asm volatile(
                    "pld        [%0, #512]          \n"
                    "vldm       %0, {d0-d7}         \n"
                    "vtrn.32    q0, q1              \n"
                    "vtrn.32    q2, q3              \n"
                    "vswp       d1, d4              \n"
                    "vswp       d3, d6              \n"
                    "vstm       %1!, {d0-d7}        \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                float32x4x4_t _r0;
                _r0.val[0] = vld1q_f32(p0);
                _r0.val[1] = vld1q_f32(p0 + 4);
                _r0.val[2] = vld1q_f32(p0 + 4 * 2);
                _r0.val[3] = vld1q_f32(p0 + 4 * 3);
                vst4q_f32(pp, _r0);
                pp += 16;
#endif // NCNN_GNU_INLINE_ASM
                // step to the next packed channel
                p0 += bottom_blob.cstep * 4;
            }
        }

        if (elempack == 1)
        {
            const float* p0 = (const float*)bottom_blob.channel(k) + (j + jj);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                // plain copy of 4 consecutive floats per channel
                vst1q_f32(pp, vld1q_f32(p0));
                pp += 4;
                p0 += bottom_blob.cstep;
            }
        }
    }
#endif // __ARM_NEON
    // groups of 2 positions
    for (; jj + 1 < max_jj; jj += 2)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
            const float* p0 = (const float*)bottom_blob.channel(k / elempack) + (j + jj) * elempack;

            int kk = 0;
            for (; kk < max_kk / elempack; kk++)
            {
                // transpose4x2
                // vst2q interleaves the two positions, giving pairs per channel
                float32x4x2_t _r0;
                _r0.val[0] = vld1q_f32(p0);
                _r0.val[1] = vld1q_f32(p0 + 4);
                vst2q_f32(pp, _r0);
                pp += 8;
                p0 += bottom_blob.cstep * elempack;
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            const float* p0 = (const float*)bottom_blob.channel(k) + (j + jj);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                // copy 2 consecutive floats per channel
#if __ARM_NEON
                vst1_f32(pp, vld1_f32(p0));
#else
                pp[0] = p0[0];
                pp[1] = p0[1];
#endif // __ARM_NEON
                pp += 2;
                p0 += bottom_blob.cstep;
            }
        }
    }
    // remaining single positions
    for (; jj < max_jj; jj++)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
            const float* p0 = (const float*)bottom_blob.channel(k / 4) + (j + jj) * 4;

            int kk = 0;
            for (; kk < max_kk / 4; kk++)
            {
                // one position: its 4 packed channels are already contiguous
                vst1q_f32(pp, vld1q_f32(p0));
                pp += 4;
                p0 += bottom_blob.cstep * 4;
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            const float* p0 = (const float*)bottom_blob.channel(k) + (j + jj);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp += 1;
                p0 += bottom_blob.cstep;
            }
        }
    }
}

// Gather one im2col tile of the input feature map into the packed buffer B.
//
//   bottom_blob  input feature map; its elempack (1, or 4 on NEON builds) selects the copy path
//   B            destination buffer, written sequentially from its start
//   j, max_jj    first flattened output position and number of positions in this tile;
//                a flattened position idx maps to output row idx / outw and column idx % outw
//   k, max_kk    first K index and K extent of this tile; K is walked in units of elempack,
//                and each unit decomposes into (input channel p, kernel row u, kernel column v)
//   kernel_*, dilation_*, stride_*   convolution geometry
//
// Output positions are consumed in groups of 12 (aarch64 only), 8, 4 (NEON only), 2 and 1.
// For every group the full K range of the tile is emitted before moving to the next group.
// No bounds checking is done on the computed input coordinates.
// NOTE(review): this presumably relies on the caller having padded the input already - confirm.
static void convolution_im2col_input_tile(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h)
{
    // 1x1 kernel with unit stride and dilation is handled by the dedicated helper
    if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
        convolution_im2col_input_tile_conv1x1s1d1(bottom_blob, B, j, max_jj, k, max_kk);
        return;
    }

    const int w = bottom_blob.w;
    // const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;

    // width covered by the dilated kernel, and the resulting number of output columns
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int outw = (w - kernel_extent_w) / stride_w + 1;

    // j max_jj     outw*outh    split w and h

    // k max_kk     pa*maxk*(inch/pa)    split inch

    // k/max_kk shall be multiple of maxk

    // number of taps in the kernel window
    const int maxk = kernel_w * kernel_h;

    // running write cursor into B
    float* pp = B;

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    // 12 output positions at a time
    for (; jj + 11 < max_jj; jj += 12)
    {
        // output row / column of each of the 12 positions
        int dy0 = (j + jj) / outw;
        int dy1 = (j + jj + 1) / outw;
        int dy2 = (j + jj + 2) / outw;
        int dy3 = (j + jj + 3) / outw;
        int dy4 = (j + jj + 4) / outw;
        int dy5 = (j + jj + 5) / outw;
        int dy6 = (j + jj + 6) / outw;
        int dy7 = (j + jj + 7) / outw;
        int dy8 = (j + jj + 8) / outw;
        int dy9 = (j + jj + 9) / outw;
        int dya = (j + jj + 10) / outw;
        int dyb = (j + jj + 11) / outw;
        int dx0 = (j + jj) % outw;
        int dx1 = (j + jj + 1) % outw;
        int dx2 = (j + jj + 2) % outw;
        int dx3 = (j + jj + 3) % outw;
        int dx4 = (j + jj + 4) % outw;
        int dx5 = (j + jj + 5) % outw;
        int dx6 = (j + jj + 6) % outw;
        int dx7 = (j + jj + 7) % outw;
        int dx8 = (j + jj + 8) % outw;
        int dx9 = (j + jj + 9) % outw;
        int dxa = (j + jj + 10) % outw;
        int dxb = (j + jj + 11) % outw;

        int kk = 0;
        for (; kk < max_kk / elempack; kk++)
        {
            // split the packed K index into channel p and kernel tap (u, v)
            int p = (k / elempack + kk) / maxk;
            int uv = (k / elempack + kk) % maxk;
            int u = uv / kernel_w;
            int v = uv % kernel_w;

            const Mat img = bottom_blob.channel(p);

            // input column sampled by tap v for each output position
            int x0 = stride_w * dx0 + dilation_w * v;
            int x1 = stride_w * dx1 + dilation_w * v;
            int x2 = stride_w * dx2 + dilation_w * v;
            int x3 = stride_w * dx3 + dilation_w * v;
            int x4 = stride_w * dx4 + dilation_w * v;
            int x5 = stride_w * dx5 + dilation_w * v;
            int x6 = stride_w * dx6 + dilation_w * v;
            int x7 = stride_w * dx7 + dilation_w * v;
            int x8 = stride_w * dx8 + dilation_w * v;
            int x9 = stride_w * dx9 + dilation_w * v;
            int xa = stride_w * dxa + dilation_w * v;
            int xb = stride_w * dxb + dilation_w * v;

            // input row sampled by tap u for each output position
            int y0 = stride_h * dy0 + dilation_h * u;
            int y1 = stride_h * dy1 + dilation_h * u;
            int y2 = stride_h * dy2 + dilation_h * u;
            int y3 = stride_h * dy3 + dilation_h * u;
            int y4 = stride_h * dy4 + dilation_h * u;
            int y5 = stride_h * dy5 + dilation_h * u;
            int y6 = stride_h * dy6 + dilation_h * u;
            int y7 = stride_h * dy7 + dilation_h * u;
            int y8 = stride_h * dy8 + dilation_h * u;
            int y9 = stride_h * dy9 + dilation_h * u;
            int ya = stride_h * dya + dilation_h * u;
            int yb = stride_h * dyb + dilation_h * u;

            // each pixel occupies elempack consecutive floats in its row
            const float* sptr0 = img.row(y0) + x0 * elempack;
            const float* sptr1 = img.row(y1) + x1 * elempack;
            const float* sptr2 = img.row(y2) + x2 * elempack;
            const float* sptr3 = img.row(y3) + x3 * elempack;
            const float* sptr4 = img.row(y4) + x4 * elempack;
            const float* sptr5 = img.row(y5) + x5 * elempack;
            const float* sptr6 = img.row(y6) + x6 * elempack;
            const float* sptr7 = img.row(y7) + x7 * elempack;
            const float* sptr8 = img.row(y8) + x8 * elempack;
            const float* sptr9 = img.row(y9) + x9 * elempack;
            const float* sptra = img.row(ya) + xa * elempack;
            const float* sptrb = img.row(yb) + xb * elempack;

            if (elempack == 4)
            {
                // load the 4 packed lanes of each position, rearrange them with
                // transpose4x12_ps and store the resulting 48 floats contiguously
                // NOTE(review): presumably yields the same lane-major order as the
                // scalar 2-wide path below - confirm against transpose4x12_ps
                float32x4_t _r0 = vld1q_f32(sptr0);
                float32x4_t _r1 = vld1q_f32(sptr1);
                float32x4_t _r2 = vld1q_f32(sptr2);
                float32x4_t _r3 = vld1q_f32(sptr3);
                float32x4_t _r4 = vld1q_f32(sptr4);
                float32x4_t _r5 = vld1q_f32(sptr5);
                float32x4_t _r6 = vld1q_f32(sptr6);
                float32x4_t _r7 = vld1q_f32(sptr7);
                float32x4_t _r8 = vld1q_f32(sptr8);
                float32x4_t _r9 = vld1q_f32(sptr9);
                float32x4_t _ra = vld1q_f32(sptra);
                float32x4_t _rb = vld1q_f32(sptrb);
                transpose4x12_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb);
                vst1q_f32(pp, _r0);
                vst1q_f32(pp + 4, _r1);
                vst1q_f32(pp + 4 * 2, _r2);
                vst1q_f32(pp + 4 * 3, _r3);
                vst1q_f32(pp + 4 * 4, _r4);
                vst1q_f32(pp + 4 * 5, _r5);
                vst1q_f32(pp + 4 * 6, _r6);
                vst1q_f32(pp + 4 * 7, _r7);
                vst1q_f32(pp + 4 * 8, _r8);
                vst1q_f32(pp + 4 * 9, _r9);
                vst1q_f32(pp + 4 * 10, _ra);
                vst1q_f32(pp + 4 * 11, _rb);
                pp += 48;
            }
            if (elempack == 1)
            {
                // one float per position, positions stored contiguously
                pp[0] = sptr0[0];
                pp[1] = sptr1[0];
                pp[2] = sptr2[0];
                pp[3] = sptr3[0];
                pp[4] = sptr4[0];
                pp[5] = sptr5[0];
                pp[6] = sptr6[0];
                pp[7] = sptr7[0];
                pp[8] = sptr8[0];
                pp[9] = sptr9[0];
                pp[10] = sptra[0];
                pp[11] = sptrb[0];
                pp += 12;
            }
        }
    }
#endif // __aarch64__
    // 8 output positions at a time
    for (; jj + 7 < max_jj; jj += 8)
    {
        int dy0 = (j + jj) / outw;
        int dy1 = (j + jj + 1) / outw;
        int dy2 = (j + jj + 2) / outw;
        int dy3 = (j + jj + 3) / outw;
        int dy4 = (j + jj + 4) / outw;
        int dy5 = (j + jj + 5) / outw;
        int dy6 = (j + jj + 6) / outw;
        int dy7 = (j + jj + 7) / outw;
        int dx0 = (j + jj) % outw;
        int dx1 = (j + jj + 1) % outw;
        int dx2 = (j + jj + 2) % outw;
        int dx3 = (j + jj + 3) % outw;
        int dx4 = (j + jj + 4) % outw;
        int dx5 = (j + jj + 5) % outw;
        int dx6 = (j + jj + 6) % outw;
        int dx7 = (j + jj + 7) % outw;

        int kk = 0;
        for (; kk < max_kk / elempack; kk++)
        {
            // split the packed K index into channel p and kernel tap (u, v)
            int p = (k / elempack + kk) / maxk;
            int uv = (k / elempack + kk) % maxk;
            int u = uv / kernel_w;
            int v = uv % kernel_w;

            const Mat img = bottom_blob.channel(p);

            int x0 = stride_w * dx0 + dilation_w * v;
            int x1 = stride_w * dx1 + dilation_w * v;
            int x2 = stride_w * dx2 + dilation_w * v;
            int x3 = stride_w * dx3 + dilation_w * v;
            int x4 = stride_w * dx4 + dilation_w * v;
            int x5 = stride_w * dx5 + dilation_w * v;
            int x6 = stride_w * dx6 + dilation_w * v;
            int x7 = stride_w * dx7 + dilation_w * v;
            int y0 = stride_h * dy0 + dilation_h * u;
            int y1 = stride_h * dy1 + dilation_h * u;
            int y2 = stride_h * dy2 + dilation_h * u;
            int y3 = stride_h * dy3 + dilation_h * u;
            int y4 = stride_h * dy4 + dilation_h * u;
            int y5 = stride_h * dy5 + dilation_h * u;
            int y6 = stride_h * dy6 + dilation_h * u;
            int y7 = stride_h * dy7 + dilation_h * u;

            const float* sptr0 = img.row(y0) + x0 * elempack;
            const float* sptr1 = img.row(y1) + x1 * elempack;
            const float* sptr2 = img.row(y2) + x2 * elempack;
            const float* sptr3 = img.row(y3) + x3 * elempack;
            const float* sptr4 = img.row(y4) + x4 * elempack;
            const float* sptr5 = img.row(y5) + x5 * elempack;
            const float* sptr6 = img.row(y6) + x6 * elempack;
            const float* sptr7 = img.row(y7) + x7 * elempack;

            if (elempack == 4)
            {
                // same scheme as the 12-wide path, using transpose4x8_ps; 32 floats written
                float32x4_t _r0 = vld1q_f32(sptr0);
                float32x4_t _r1 = vld1q_f32(sptr1);
                float32x4_t _r2 = vld1q_f32(sptr2);
                float32x4_t _r3 = vld1q_f32(sptr3);
                float32x4_t _r4 = vld1q_f32(sptr4);
                float32x4_t _r5 = vld1q_f32(sptr5);
                float32x4_t _r6 = vld1q_f32(sptr6);
                float32x4_t _r7 = vld1q_f32(sptr7);
                transpose4x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7);
                vst1q_f32(pp, _r0);
                vst1q_f32(pp + 4, _r1);
                vst1q_f32(pp + 4 * 2, _r2);
                vst1q_f32(pp + 4 * 3, _r3);
                vst1q_f32(pp + 4 * 4, _r4);
                vst1q_f32(pp + 4 * 5, _r5);
                vst1q_f32(pp + 4 * 6, _r6);
                vst1q_f32(pp + 4 * 7, _r7);
                pp += 32;
            }
            if (elempack == 1)
            {
                pp[0] = sptr0[0];
                pp[1] = sptr1[0];
                pp[2] = sptr2[0];
                pp[3] = sptr3[0];
                pp[4] = sptr4[0];
                pp[5] = sptr5[0];
                pp[6] = sptr6[0];
                pp[7] = sptr7[0];
                pp += 8;
            }
        }
    }
    // 4 output positions at a time
    for (; jj + 3 < max_jj; jj += 4)
    {
        int dy0 = (j + jj) / outw;
        int dy1 = (j + jj + 1) / outw;
        int dy2 = (j + jj + 2) / outw;
        int dy3 = (j + jj + 3) / outw;
        int dx0 = (j + jj) % outw;
        int dx1 = (j + jj + 1) % outw;
        int dx2 = (j + jj + 2) % outw;
        int dx3 = (j + jj + 3) % outw;

        int kk = 0;
        for (; kk < max_kk / elempack; kk++)
        {
            // split the packed K index into channel p and kernel tap (u, v)
            int p = (k / elempack + kk) / maxk;
            int uv = (k / elempack + kk) % maxk;
            int u = uv / kernel_w;
            int v = uv % kernel_w;

            const Mat img = bottom_blob.channel(p);

            int x0 = stride_w * dx0 + dilation_w * v;
            int x1 = stride_w * dx1 + dilation_w * v;
            int x2 = stride_w * dx2 + dilation_w * v;
            int x3 = stride_w * dx3 + dilation_w * v;
            int y0 = stride_h * dy0 + dilation_h * u;
            int y1 = stride_h * dy1 + dilation_h * u;
            int y2 = stride_h * dy2 + dilation_h * u;
            int y3 = stride_h * dy3 + dilation_h * u;

            const float* sptr0 = img.row(y0) + x0 * elempack;
            const float* sptr1 = img.row(y1) + x1 * elempack;
            const float* sptr2 = img.row(y2) + x2 * elempack;
            const float* sptr3 = img.row(y3) + x3 * elempack;

            if (elempack == 4)
            {
                // vst4q_f32 is an interleaving store, so the 4x4 block is written
                // transposed: lane 0 of all four positions first, then lane 1, ...
                float32x4x4_t _r0;
                _r0.val[0] = vld1q_f32(sptr0);
                _r0.val[1] = vld1q_f32(sptr1);
                _r0.val[2] = vld1q_f32(sptr2);
                _r0.val[3] = vld1q_f32(sptr3);
                vst4q_f32(pp, _r0);
                pp += 16;
            }
            if (elempack == 1)
            {
                pp[0] = sptr0[0];
                pp[1] = sptr1[0];
                pp[2] = sptr2[0];
                pp[3] = sptr3[0];
                pp += 4;
            }
        }
    }
#endif // __ARM_NEON
    // 2 output positions at a time; this and the single-position loop below are
    // the only loops compiled when NEON is unavailable
    for (; jj + 1 < max_jj; jj += 2)
    {
        int dy0 = (j + jj) / outw;
        int dy1 = (j + jj + 1) / outw;
        int dx0 = (j + jj) % outw;
        int dx1 = (j + jj + 1) % outw;

        int kk = 0;
        for (; kk < max_kk / elempack; kk++)
        {
            // split the packed K index into channel p and kernel tap (u, v)
            int p = (k / elempack + kk) / maxk;
            int uv = (k / elempack + kk) % maxk;
            int u = uv / kernel_w;
            int v = uv % kernel_w;

            const Mat img = bottom_blob.channel(p);

            int x0 = stride_w * dx0 + dilation_w * v;
            int x1 = stride_w * dx1 + dilation_w * v;
            int y0 = stride_h * dy0 + dilation_h * u;
            int y1 = stride_h * dy1 + dilation_h * u;

            const float* sptr0 = img.row(y0) + x0 * elempack;
            const float* sptr1 = img.row(y1) + x1 * elempack;

#if __ARM_NEON
            if (elempack == 4)
            {
                // scalar transpose: for each packed lane, the two positions side by side
                pp[0] = sptr0[0];
                pp[1] = sptr1[0];
                pp[2] = sptr0[1];
                pp[3] = sptr1[1];
                pp[4] = sptr0[2];
                pp[5] = sptr1[2];
                pp[6] = sptr0[3];
                pp[7] = sptr1[3];
                pp += 8;
            }
#endif // __ARM_NEON
            if (elempack == 1)
            {
                pp[0] = sptr0[0];
                pp[1] = sptr1[0];
                pp += 2;
            }
        }
    }
    // remaining single output positions
    for (; jj < max_jj; jj++)
    {
        int dy = (j + jj) / outw;
        int dx = (j + jj) % outw;

        int kk = 0;
        for (; kk < max_kk / elempack; kk++)
        {
            // split the packed K index into channel p and kernel tap (u, v)
            int p = (k / elempack + kk) / maxk;
            int uv = (k / elempack + kk) % maxk;
            int u = uv / kernel_w;
            int v = uv % kernel_w;

            const Mat img = bottom_blob.channel(p);

            int x = stride_w * dx + dilation_w * v;
            int y = stride_h * dy + dilation_h * u;

            const float* sptr = img.row(y) + x * elempack;

#if __ARM_NEON
            if (elempack == 4)
            {
                // copy the 4 packed lanes of the single position as-is
                pp[0] = sptr[0];
                pp[1] = sptr[1];
                pp[2] = sptr[2];
                pp[3] = sptr[3];
                pp += 4;
            }
#endif // __ARM_NEON
            if (elempack == 1)
            {
                pp[0] = sptr[0];
                pp += 1;
            }
        }
    }
}

static void convolution_im2col_gemm_transform_kernel(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt)
{
    // NCNN_LOGE("convolution_im2col_gemm_transform_kernel");
    const int maxk = kernel_w * kernel_h;

    const int M = outch;
    const int K = inch * maxk;

    int TILE_M, TILE_N, TILE_K;
    convolution_im2col_gemm_get_optimal_tile_mnk(M, 0, K, TILE_M, TILE_N, TILE_K, opt.num_threads);

    const int nn_M = (M + TILE_M - 1) / TILE_M;

    int elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        elempack = inch % 4 == 0 ? 4 : 1;
    }
#endif // __ARM_NEON

    // maxk-inch-outch to pa-maxk-inch/pa-outch
    Mat A_data;
    if (maxk == 1)
    {
        A_data = kernel.reshape(maxk * inch, outch);
    }
    else
    {
        Mat weight_data_r2 = kernel.reshape(maxk, inch, outch);

        A_data.create(maxk * inch, outch);

        for (int q = 0; q < outch; q += 1)
        {
            float* g00 = A_data.row(q);

            for (int p = 0; p + (elempack - 1) < inch; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        const float* k00 = weight_data_r2.channel(q).row(p + i);
                        g00[0] = k00[k];
                        g00++;
                    }
                }
            }
        }
    }

    AT.create(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int ppj = 0; ppj < nn_M; ppj++)
    {
        const int i = ppj * TILE_M;

        const int max_ii = std::min((M - i), TILE_M);

        for (int k = 0; k < K; k += TILE_K)
        {
            const int max_kk = std::min((K - k), TILE_K);

            Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1);

            convolution_im2col_pack_A_tile(A_data, AT_tile, i, max_ii, k, max_kk);
        }
    }
}

static int convolution_im2col_gemm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
    const int maxk = kernel_w * kernel_h;

    const int M = top_blob.c * top_blob.elempack;
    const int N = top_blob.w * top_blob.h;
    const int K = bottom_blob.c * bottom_blob.elempack * maxk;

    int TILE_M, TILE_N, TILE_K;
    convolution_im2col_gemm_get_optimal_tile_mnk(M, N, K, TILE_M, TILE_N, TILE_K, nT);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

    Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    #pragma omp parallel for num_threads(nT)
    for (int ppjk = 0; ppjk < nn_NK; ppjk++)
    {
        const int ppj = ppjk / nn_K;
        const int ppk = ppjk % nn_K;

        const int j = ppj * TILE_N;
        const int k = ppk * TILE_K;

        const int max_jj = std::min((N - j), TILE_N);
        const int max_kk = std::min((K - k), TILE_K);

        Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

        // im2col
        convolution_im2col_input_tile(bottom_blob, BT_tile, j, max_jj, k, max_kk, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h);
    }

    Mat topT_tileX;
    if (K > TILE_K)
    {
        topT_tileX.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
        if (topT_tileX.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int ppj = 0; ppj < nn_M; ppj++)
    {
        const int i = ppj * TILE_M;

        Mat topT_tile;
        if (K > TILE_K)
            topT_tile = topT_tileX.channel(get_omp_thread_num());

        const int max_ii = std::min((M - i), TILE_M);

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                const Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1);

                const Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

                bool k_end = k + TILE_K >= K;

                convolution_gemm_transB_packed_tile(AT_tile, BT_tile, bias, topT_tile, top_blob, i, max_ii, j, max_jj, k, max_kk, k_end, opt.use_a53_a55_optimized_kernel);
            }
        }
    }

    return 0;
}


================================================
FILE: src/layer/arm/convolution_im2col_gemm_bf16s.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convolution_gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const Mat& BT_tile, const Mat& CT_tile, Mat& topT_tile, Mat& top_blob, int i, int max_ii, int j, int max_jj, int k, int max_kk, bool k_end, int use_a53_a55_optimized_kernel)
{
    // NCNN_LOGE("convolution_gemm_transB_packed_tile_bf16s %d %d %d %d %d %d", i, max_ii, j, max_jj, k, max_kk);

    const int out_elempack = top_blob.elempack;
    const size_t out_hstep = top_blob.cstep;

    const unsigned short* pAT = AT_tile;
    const unsigned short* pBT = BT_tile;
    const float* pC = CT_tile;

    float* outptr = topT_tile;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    for (; ii + 7 < max_ii; ii += 8)
    {
        unsigned short* outptr0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        const unsigned short* pB = pBT;

        if (pC)
        {
            pC = (const float*)CT_tile + i + ii;
        }

        int jj = 0;
        for (; jj + 11 < max_jj; jj += 12)
        {
            const unsigned short* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            if (use_a53_a55_optimized_kernel && cpu_support_arm_asimdhp())
            {
                // a55
                asm volatile(
                    "cbz    %w10, 0f                    \n"

                    "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                    "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    "subs   %0, %0, #320                \n"
                    "b      3f                          \n"

                    "0:                                 \n"
                    // if pC
                    "cbz    %8, 1f                      \n"

                    "add    x4, %8, #16                 \n"
                    "ld1    {v8.4s}, [%8]               \n"
                    "ld1    {v20.4s}, [x4]              \n"
                    "b      2f                          \n"

                    // else
                    "1:                                 \n"
                    "eor    v8.16b, v8.16b, v8.16b      \n"
                    "eor    v20.16b, v20.16b, v20.16b   \n"

                    "2:                                 \n"
                    "mov    v9.16b, v8.16b              \n"
                    "mov    v10.16b, v8.16b             \n"
                    "mov    v11.16b, v8.16b             \n"
                    "mov    v12.16b, v8.16b             \n"
                    "mov    v13.16b, v8.16b             \n"
                    "mov    v14.16b, v8.16b             \n"
                    "mov    v15.16b, v8.16b             \n"
                    "mov    v16.16b, v8.16b             \n"
                    "mov    v17.16b, v8.16b             \n"
                    "mov    v18.16b, v8.16b             \n"
                    "mov    v19.16b, v8.16b             \n"

                    "mov    v21.16b, v20.16b            \n"
                    "mov    v22.16b, v20.16b            \n"
                    "mov    v23.16b, v20.16b            \n"
                    "mov    v24.16b, v20.16b            \n"
                    "mov    v25.16b, v20.16b            \n"
                    "mov    v26.16b, v20.16b            \n"
                    "mov    v27.16b, v20.16b            \n"
                    "mov    v28.16b, v20.16b            \n"
                    "mov    v29.16b, v20.16b            \n"
                    "mov    v30.16b, v20.16b            \n"
                    "mov    v31.16b, v20.16b            \n"

                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.4h, v5.4h}, [%1], #16   \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.4h, v1.4h, v2.4h}, [%2], #24 \n"

                    "shll   v4.4s, v4.4h, #16           \n"
                    "shll   v0.4s, v0.4h, #16           \n"

                    ".align 4                           \n"
                    "4:                                 \n"
                    "shll   v5.4s, v5.4h, #16           \n"
                    "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                    "ldr    d6, [%1], #8                \n"
                    "fmla   v9.4s, v4.4s, v0.s[1]       \n"
                    "ldr    d3, [%2], #8                \n"
                    "fmla   v10.4s, v4.4s, v0.s[2]      \n"
                    "ldr    d7, [%1], #8                \n"
                    "fmla   v11.4s, v4.4s, v0.s[3]      \n"
                    "shll   v1.4s, v1.4h, #16           \n"
                    "fmla   v20.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v21.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v22.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v23.4s, v5.4s, v0.s[3]      \n"
                    "fmla   v12.4s, v4.4s, v1.s[0]      \n"
                    "ldr    d0, [%2], #8                \n"
                    "fmla   v13.4s, v4.4s, v1.s[1]      \n"
                    "fmla   v14.4s, v4.4s, v1.s[2]      \n"
                    "fmla   v15.4s, v4.4s, v1.s[3]      \n"
                    "shll   v2.4s, v2.4h, #16           \n"
                    "fmla   v24.4s, v5.4s, v1.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v1.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v1.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v1.s[3]      \n"
                    "shll   v6.4s, v6.4h, #16           \n"
                    "fmla   v16.4s, v4.4s, v2.s[0]      \n"
                    "ldr    d1, [%2], #8                \n"
                    "fmla   v17.4s, v4.4s, v2.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v2.s[2]      \n"
                    "fmla   v19.4s, v4.4s, v2.s[3]      \n"
                    "shll   v3.4s, v3.4h, #16           \n"
                    "fmla   v28.4s, v5.4s, v2.s[0]      \n"
                    "ldr    d4, [%1], #8                \n"
                    "fmla   v29.4s, v5.4s, v2.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v2.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v2.s[3]      \n"
                    "shll   v7.4s, v7.4h, #16           \n"
                    "fmla   v8.4s, v6.4s, v3.s[0]       \n"
                    "ldr    d2, [%2], #8                \n"
                    "fmla   v9.4s, v6.4s, v3.s[1]       \n"
                    "ldr    d5, [%1], #8                \n"
                    "fmla   v10.4s, v6.4s, v3.s[2]      \n"
                    "fmla   v11.4s, v6.4s, v3.s[3]      \n"
                    "shll   v0.4s, v0.4h, #16           \n"
                    "fmla   v20.4s, v7.4s, v3.s[0]      \n"
                    "fmla   v21.4s, v7.4s, v3.s[1]      \n"
                    "fmla   v22.4s, v7.4s, v3.s[2]      \n"
                    "fmla   v23.4s, v7.4s, v3.s[3]      \n"
                    "fmla   v12.4s, v6.4s, v0.s[0]      \n"
                    "ldr    d3, [%2], #8                \n"
                    "fmla   v13.4s, v6.4s, v0.s[1]      \n"
                    "fmla   v14.4s, v6.4s, v0.s[2]      \n"
                    "fmla   v15.4s, v6.4s, v0.s[3]      \n"
                    "shll   v1.4s, v1.4h, #16           \n"
                    "fmla   v24.4s, v7.4s, v0.s[0]      \n"
                    "fmla   v25.4s, v7.4s, v0.s[1]      \n"
                    "fmla   v26.4s, v7.4s, v0.s[2]      \n"
                    "fmla   v27.4s, v7.4s, v0.s[3]      \n"
                    "shll   v4.4s, v4.4h, #16           \n"
                    "fmla   v16.4s, v6.4s, v1.s[0]      \n"
                    "ldr    d0, [%2], #8                \n"
                    "fmla   v17.4s, v6.4s, v1.s[1]      \n"
                    "fmla   v18.4s, v6.4s, v1.s[2]      \n"
                    "fmla   v19.4s, v6.4s, v1.s[3]      \n"
                    "shll   v2.4s, v2.4h, #16           \n"
                    "fmla   v28.4s, v7.4s, v1.s[0]      \n"
                    "ldr    d6, [%1], #8                \n"
                    "fmla   v29.4s, v7.4s, v1.s[1]      \n"
                    "fmla   v30.4s, v7.4s, v1.s[2]      \n"
                    "fmla   v31.4s, v7.4s, v1.s[3]      \n"
                    "shll   v5.4s, v5.4h, #16           \n"
                    "fmla   v8.4s, v4.4s, v2.s[0]       \n"
                    "ldr    d1, [%2], #8                \n"
                    "fmla   v9.4s, v4.4s, v2.s[1]       \n"
                    "ldr    d7, [%1], #8                \n"
                    "fmla   v10.4s, v4.4s, v2.s[2]      \n"
                    "fmla   v11.4s, v4.4s, v2.s[3]      \n"
                    "shll   v3.4s, v3.4h, #16           \n"
                    "fmla   v20.4s, v5.4s, v2.s[0]      \n"
                    "fmla   v21.4s, v5.4s, v2.s[1]      \n"
                    "fmla   v22.4s, v5.4s, v2.s[2]      \n"
                    "fmla   v23.4s, v5.4s, v2.s[3]      \n"
                    "fmla   v12.4s, v4.4s, v3.s[0]      \n"
                    "ldr    d2, [%2], #8                \n"
                    "fmla   v13.4s, v4.4s, v3.s[1]      \n"
                    "fmla   v14.4s, v4.4s, v3.s[2]      \n"
                    "fmla   v15.4s, v4.4s, v3.s[3]      \n"
                    "shll   v0.4s, v0.4h, #16           \n"
                    "fmla   v24.4s, v5.4s, v3.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v3.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v3.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v3.s[3]      \n"
                    "shll   v6.4s, v6.4h, #16           \n"
                    "fmla   v16.4s, v4.4s, v0.s[0]      \n"
                    "ldr    d3, [%2], #8                \n"
                    "fmla   v17.4s, v4.4s, v0.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v0.s[2]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                    "fmla   v19.4s, v4.4s, v0.s[3]      \n"
                    "shll   v1.4s, v1.4h, #16           \n"
                    "fmla   v28.4s, v5.4s, v0.s[0]      \n"
                    "ldr    d4, [%1], #8                \n"
                    "fmla   v29.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v0.s[2]      \n"
                    "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                    "fmla   v31.4s, v5.4s, v0.s[3]      \n"
                    "shll   v7.4s, v7.4h, #16           \n"
                    "fmla   v8.4s, v6.4s, v1.s[0]       \n"
                    "ldr    d0, [%2], #8                \n"
                    "fmla   v9.4s, v6.4s, v1.s[1]       \n"
                    "ldr    d5, [%1], #8                 \n"
                    "fmla   v10.4s, v6.4s, v1.s[2]      \n"
                    "fmla   v11.4s, v6.4s, v1.s[3]      \n"
                    "shll   v2.4s, v2.4h, #16           \n"
                    "fmla   v20.4s, v7.4s, v1.s[0]      \n"
                    "fmla   v21.4s, v7.4s, v1.s[1]      \n"
                    "fmla   v22.4s, v7.4s, v1.s[2]      \n"
                    "fmla   v23.4s, v7.4s, v1.s[3]      \n"
                    "fmla   v12.4s, v6.4s, v2.s[0]      \n"
                    "ldr    d1, [%2], #8                 \n"
                    "fmla   v13.4s, v6.4s, v2.s[1]      \n"
                    "fmla   v14.4s, v6.4s, v2.s[2]      \n"
                    "fmla   v15.4s, v6.4s, v2.s[3]      \n"
                    "shll   v3.4s, v3.4h, #16           \n"
                    "fmla   v24.4s, v7.4s, v2.s[0]      \n"
                    "fmla   v25.4s, v7.4s, v2.s[1]      \n"
                    "fmla   v26.4s, v7.4s, v2.s[2]      \n"
                    "fmla   v27.4s, v7.4s, v2.s[3]      \n"
                    "shll   v4.4s, v4.4h, #16           \n"
                    "fmla   v16.4s, v6.4s, v3.s[0]      \n"
                    "ldr    d2, [%2], #8                \n"
                    "fmla   v17.4s, v6.4s, v3.s[1]      \n"
                    "fmla   v18.4s, v6.4s, v3.s[2]      \n"
                    "fmla   v19.4s, v6.4s, v3.s[3]      \n"
                    "shll   v0.4s, v0.4h, #16           \n"
                    "fmla   v28.4s, v7.4s, v3.s[0]      \n"
                    "fmla   v29.4s, v7.4s, v3.s[1]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v30.4s, v7.4s, v3.s[2]      \n"
                    "fmla   v31.4s, v7.4s, v3.s[3]      \n"
                    "bne    4b                          \n"

                    "sub    %1, %1, #16                 \n"
                    "sub    %2, %2, #24                 \n"

                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    "6:                                 \n"
                    "ld1    {v0.4h, v1.4h, v2.4h}, [%2], #24 \n"

                    "shll   v0.4s, v0.4h, #16           \n"
                    "shll   v1.4s, v1.4h, #16           \n"
                    "shll   v2.4s, v2.4h, #16           \n"

                    "ld1    {v4.4h, v5.4h}, [%1], #16   \n"

                    "shll   v4.4s, v4.4h, #16           \n"
                    "shll   v5.4s, v5.4h, #16           \n"

                    "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                    "fmla   v9.4s, v4.4s, v0.s[1]       \n"
                    "fmla   v10.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v11.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v12.4s, v4.4s, v1.s[0]      \n"
                    "fmla   v13.4s, v4.4s, v1.s[1]      \n"
                    "fmla   v14.4s, v4.4s, v1.s[2]      \n"
                    "fmla   v15.4s, v4.4s, v1.s[3]      \n"
                    "fmla   v16.4s, v4.4s, v2.s[0]      \n"
                    "fmla   v17.4s, v4.4s, v2.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v2.s[2]      \n"
                    "fmla   v19.4s, v4.4s, v2.s[3]      \n"

                    "subs   w4, w4, #1                  \n"

                    "fmla   v20.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v21.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v22.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v23.4s, v5.4s, v0.s[3]      \n"
                    "fmla   v24.4s, v5.4s, v1.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v1.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v1.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v1.s[3]      \n"
                    "fmla   v28.4s, v5.4s, v2.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v2.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v2.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v2.s[3]      \n"

                    "bne    6b                          \n"

                    "7:                                 \n"
                    "shrn   v0.4h, v8.4s, #16           \n"
                    "shrn2  v0.8h, v9.4s, #16           \n"
                    "shrn   v1.4h, v10.4s, #16          \n"
                    "shrn2  v1.8h, v11.4s, #16          \n"
                    "shrn   v2.4h, v12.4s, #16          \n"
                    "shrn2  v2.8h, v13.4s, #16          \n"
                    "shrn   v3.4h, v14.4s, #16          \n"
                    "shrn2  v3.8h, v15.4s, #16          \n"
                    "shrn   v4.4h, v16.4s, #16          \n"
                    "shrn2  v4.8h, v17.4s, #16          \n"
                    "shrn   v5.4h, v18.4s, #16          \n"
                    "shrn2  v5.8h, v19.4s, #16          \n"
                    "shrn   v6.4h, v20.4s, #16          \n"
                    "shrn2  v6.8h, v21.4s, #16          \n"
                    "shrn   v7.4h, v22.4s, #16          \n"
                    "shrn2  v7.8h, v23.4s, #16          \n"
                    "shrn   v8.4h, v24.4s, #16          \n"
                    "shrn2  v8.8h, v25.4s, #16          \n"
                    "shrn   v9.4h, v26.4s, #16          \n"
                    "shrn2  v9.8h, v27.4s, #16          \n"
                    "shrn   v10.4h, v28.4s, #16         \n"
                    "shrn2  v10.8h, v29.4s, #16         \n"
                    "shrn   v11.4h, v30.4s, #16         \n"
                    "shrn2  v11.8h, v31.4s, #16         \n"
                    "tst    %w11, #255                  \n"
                    "beq    10f                         \n"

                    // if out_elempack == 4
                    "cmp    %w12, #4                    \n"
                    "bne    8f                          \n"

                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 1          \n"
                    "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
                    "st1    {v4.8h, v5.8h}, [%3], #32 \n"
                    "st1    {v6.8h, v7.8h, v8.8h, v9.8h}, [x4], #64 \n"
                    "st1    {v10.8h, v11.8h}, [x4]      \n"
                    "b      9f                          \n"

                    // if out_elempack == 1
                    "8:                                 \n"
                    // transpose8x12
                    "uzp1   v20.8h, v0.8h, v1.8h        \n"
                    "uzp2   v21.8h, v0.8h, v1.8h        \n"
                    "uzp1   v22.8h, v2.8h, v3.8h        \n"
                    "uzp2   v23.8h, v2.8h, v3.8h        \n"
                    "uzp1   v24.8h, v4.8h, v5.8h        \n"
                    "uzp2   v25.8h, v4.8h, v5.8h        \n"
                    "uzp1   v26.8h, v6.8h, v7.8h        \n"
                    "uzp2   v27.8h, v6.8h, v7.8h        \n"
                    "uzp1   v28.8h, v8.8h, v9.8h        \n"
                    "uzp2   v29.8h, v8.8h, v9.8h        \n"
                    "uzp1   v30.8h, v10.8h, v11.8h      \n"
                    "uzp2   v31.8h, v10.8h, v11.8h      \n"

                    "uzp1   v0.8h, v20.8h, v22.8h       \n"
                    "uzp2   v6.8h, v20.8h, v22.8h       \n"
                    "uzp1   v3.8h, v21.8h, v23.8h       \n"
                    "uzp2   v9.8h, v21.8h, v23.8h       \n"
                    "mov    v1.d[0], v0.d[1]            \n"
                    "mov    v7.d[0], v6.d[1]            \n"
                    "mov    v4.d[0], v3.d[1]            \n"
                    "mov    v10.d[0], v9.d[1]           \n"
                    "uzp1   v2.8h, v24.8h, v24.8h       \n"
                    "uzp2   v8.8h, v24.8h, v24.8h       \n"
                    "uzp1   v5.8h, v25.8h, v25.8h       \n"
                    "uzp2   v11.8h, v25.8h, v25.8h      \n"

                    "uzp1   v12.8h, v26.8h, v28.8h      \n"
                    "uzp2   v18.8h, v26.8h, v28.8h      \n"
                    "uzp1   v15.8h, v27.8h, v29.8h      \n"
                    "uzp2   v21.8h, v27.8h, v29.8h      \n"
                    "mov    v13.d[0], v12.d[1]          \n"
                    "mov    v19.d[0], v18.d[1]          \n"
                    "mov    v16.d[0], v15.d[1]          \n"
                    "mov    v22.d[0], v21.d[1]          \n"
                    "uzp1   v14.8h, v30.8h, v30.8h      \n"
                    "uzp2   v20.8h, v30.8h, v30.8h      \n"
                    "uzp1   v17.8h, v31.8h, v31.8h      \n"
                    "uzp2   v23.8h, v31.8h, v31.8h      \n"

                    "add    x4, %3, %w13, sxtw 1        \n"
                    "st1    {v0.4h, v1.4h, v2.4h}, [%3], #24 \n"
                    "st1    {v3.4h, v4.4h, v5.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v6.4h, v7.4h, v8.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v9.4h, v10.4h, v11.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v12.4h, v13.4h, v14.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v15.4h, v16.4h, v17.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v18.4h, v19.4h, v20.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v21.4h, v22.4h, v23.4h}, [x4] \n"

                    "9:                                 \n"
                    "add    %0, %0, #384                \n"
                    "b      11f                         \n"

                    "10:                                \n"
                    "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                    "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    "11:                                \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            else if (use_a53_a55_optimized_kernel && !cpu_support_arm_asimdhp())
            {
                // a53
                // 8x12 output tile kernel, hand-scheduled for the a53/a55 path.
                //
                // Per k step the kernel reads 8 16-bit values from pA (16 bytes) and
                // 12 16-bit values from pB (24 bytes). Every 16-bit value is widened
                // with "shll #16", i.e. placed in the upper half of a 32-bit lane
                // (the bf16 -> fp32 bit pattern), and multiplied with fp32 fmla.
                //
                // Accumulators (fp32, 4 lanes each):
                //   v8  - v19 : pA lanes 0-3  x  pB elements 0-11
                //   v20 - v31 : pA lanes 4-7  x  pB elements 0-11
                //
                // Operands:
                //   %0  outptr       fp32 scratch holding the 96 partial sums (384 bytes)
                //   %1  pA           16-bit A panel, advanced by 16 bytes per k step
                //   %2  pB           16-bit B panel, advanced by 24 bytes per k step
                //   %3  outptr0      16-bit destination for the final result
                //   %8  pC           optional 8 fp32 start values (may be null)
                //   %9  max_kk       number of k steps to process
                //   %10 k            0 -> start from pC/zero, else reload partial sums from %0
                //   %11 k_end        low 8 bits non-zero -> write final 16-bit output
                //   %12 out_elempack 4 -> packed store, anything else is treated as 1
                //   %13 out_hstep    destination row stride in 16-bit elements
                //
                // NOTE(review): loads inside the main loop go through 64-bit GPRs
                // (ldr x / ins v.d[0]) interleaved with the fmla stream and padded with
                // nop; presumably this is tuned for the in-order A53/A55 issue rules.
                // Do not reorder the loop body without benchmarking.
                asm volatile(
                    // k != 0: resume from the fp32 partial sums saved at outptr
                    "cbz    %w10, 0f                    \n"

                    "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                    "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    // undo the five post-increments so %0 points at the tile start again
                    "subs   %0, %0, #320                \n"
                    "b      3f                          \n"

                    "0:                                 \n"
                    // if pC
                    "cbz    %8, 1f                      \n"

                    // v8 = pC[0..3] (lanes 0-3), v20 = pC[4..7] (lanes 4-7)
                    "add    x4, %8, #16                 \n"
                    "ld1    {v8.4s}, [%8]               \n"
                    "ld1    {v20.4s}, [x4]              \n"
                    "b      2f                          \n"

                    // else
                    "1:                                 \n"
                    "eor    v8.16b, v8.16b, v8.16b      \n"
                    "eor    v20.16b, v20.16b, v20.16b   \n"

                    // replicate the start value into all 12 accumulators of each half
                    "2:                                 \n"
                    "mov    v9.16b, v8.16b              \n"
                    "mov    v10.16b, v8.16b             \n"
                    "mov    v11.16b, v8.16b             \n"
                    "mov    v12.16b, v8.16b             \n"
                    "mov    v13.16b, v8.16b             \n"
                    "mov    v14.16b, v8.16b             \n"
                    "mov    v15.16b, v8.16b             \n"
                    "mov    v16.16b, v8.16b             \n"
                    "mov    v17.16b, v8.16b             \n"
                    "mov    v18.16b, v8.16b             \n"
                    "mov    v19.16b, v8.16b             \n"

                    "mov    v21.16b, v20.16b            \n"
                    "mov    v22.16b, v20.16b            \n"
                    "mov    v23.16b, v20.16b            \n"
                    "mov    v24.16b, v20.16b            \n"
                    "mov    v25.16b, v20.16b            \n"
                    "mov    v26.16b, v20.16b            \n"
                    "mov    v27.16b, v20.16b            \n"
                    "mov    v28.16b, v20.16b            \n"
                    "mov    v29.16b, v20.16b            \n"
                    "mov    v30.16b, v20.16b            \n"
                    "mov    v31.16b, v20.16b            \n"

                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // loop prologue: preload the operands of k step 0
                    //   v4  = A lanes 0-3, x25 = A lanes 4-7 (still in a GPR)
                    //   v0/v1/v2 = B elements 0-3 / 4-7 / 8-11
                    // this reads 16 bytes of pA and 24 bytes of pB ahead of the loop
                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v4.4h}, [%1], #8           \n"

                    "prfm   pldl1keep, [%2, #384]       \n"
                    "ld1    {v0.4h, v1.4h, v2.4h}, [%2], #24 \n"

                    "ldr    x25, [%1]                   \n"
                    "add    %1, %1, #8                  \n"

                    "shll   v4.4s, v4.4h, #16           \n"
                    "shll   v0.4s, v0.4h, #16           \n"

                    // main loop: 4 k steps per iteration (64 bytes of pA, 96 bytes of pB)
                    ".align 4                           \n"
                    "4:                                 \n"

                    // k step 0: A = v4/v5, B = v0/v1/v2; operands of step 1 are fetched
                    // into x26/x27 (A) and x23/x20/x21 (B)
                    "shll   v1.4s, v1.4h, #16           \n"
                    "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                    "ldr    x26, [%1]                   \n"
                    "fmla   v9.4s, v4.4s, v0.s[1]       \n"
                    "add    %1, %1, #8                  \n"
                    "fmla   v10.4s, v4.4s, v0.s[2]      \n"

                    "shll   v2.4s, v2.4h, #16           \n"
                    "fmla   v11.4s, v4.4s, v0.s[3]      \n"
                    "ldr    x23, [%2]                   \n"
                    "fmla   v12.4s, v4.4s, v1.s[0]      \n"
                    "add    %2, %2, #8                  \n"
                    "fmla   v13.4s, v4.4s, v1.s[1]      \n"

                    "nop                                \n"
                    "ins    v5.d[0], x25                \n"
                    "fmla   v14.4s, v4.4s, v1.s[2]      \n"
                    "ldr    x20, [%2]                   \n"
                    "fmla   v15.4s, v4.4s, v1.s[3]      \n"
                    "add    %2, %2, #8                  \n"
                    "fmla   v16.4s, v4.4s, v2.s[0]      \n"

                    "shll   v5.4s, v5.4h, #16           \n"
                    "fmla   v17.4s, v4.4s, v2.s[1]      \n"
                    "ldr    x21, [%2]                   \n"
                    "fmla   v18.4s, v4.4s, v2.s[2]      \n"
                    "add    %2, %2, #8                  \n"
                    "fmla   v19.4s, v4.4s, v2.s[3]      \n"

                    "ins    v6.d[0], x26                \n"
                    "ins    v3.d[0], x23                \n"
                    "fmla   v20.4s, v5.4s, v0.s[0]      \n"
                    "prfm   pldl1keep, [%2, #384]       \n" // NOTE PRELOAD
                    "fmla   v21.4s, v5.4s, v0.s[1]      \n"
                    "nop                                \n"
                    "fmla   v22.4s, v5.4s, v0.s[2]      \n"

                    "shll   v6.4s, v6.4h, #16           \n"
                    "fmla   v23.4s, v5.4s, v0.s[3]      \n"
                    "ldr    x27, [%1]                   \n"
                    "fmla   v24.4s, v5.4s, v1.s[0]      \n"
                    "add    %1, %1, #8                  \n"
                    "fmla   v25.4s, v5.4s, v1.s[1]      \n"

                    "shll   v3.4s, v3.4h, #16           \n"
                    "fmla   v26.4s, v5.4s, v1.s[2]      \n"
                    "prfm   pldl1keep, [%1, #256]       \n" // NOTE PRELOAD
                    "fmla   v27.4s, v5.4s, v1.s[3]      \n"
                    "nop                                \n"
                    "fmla   v28.4s, v5.4s, v2.s[0]      \n"

                    // v0/v1 are dead for step 0 from here on, refill them for step 1
                    "ins    v0.d[0], x20                \n"
                    "ins    v1.d[0], x21                \n"
                    "fmla   v29.4s, v5.4s, v2.s[1]      \n"
                    "nop                                \n"
                    "fmla   v30.4s, v5.4s, v2.s[2]      \n"
                    "nop                                \n"
                    "fmla   v31.4s, v5.4s, v2.s[3]      \n"

                    // k step 1: A = v6/v7, B = v3/v0/v1; operands of step 2 are fetched
                    // into x24/x25 (A) and x22/x23/x20 (B)
                    "shll   v0.4s, v0.4h, #16           \n"
                    "fmla   v8.4s, v6.4s, v3.s[0]       \n"
                    "ldr    x24, [%1]                   \n"
                    "fmla   v9.4s, v6.4s, v3.s[1]       \n"
                    "add    %1, %1, #8                  \n"
                    "fmla   v10.4s, v6.4s, v3.s[2]      \n"

                    "shll   v1.4s, v1.4h, #16           \n"
                    "fmla   v11.4s, v6.4s, v3.s[3]      \n"
                    "ldr    x22, [%2]                   \n"
                    "fmla   v12.4s, v6.4s, v0.s[0]      \n"
                    "add    %2, %2, #8                  \n"
                    "fmla   v13.4s, v6.4s, v0.s[1]      \n"

                    "nop                                \n"
                    "ins    v7.d[0], x27                \n"
                    "fmla   v14.4s, v6.4s, v0.s[2]      \n"
                    "ldr    x23, [%2]                   \n"
                    "fmla   v15.4s, v6.4s, v0.s[3]      \n"
                    "add    %2, %2, #8                  \n"
                    "fmla   v16.4s, v6.4s, v1.s[0]      \n"

                    "shll   v7.4s, v7.4h, #16           \n"
                    "fmla   v17.4s, v6.4s, v1.s[1]      \n"
                    "ldr    x20, [%2]                   \n"
                    "fmla   v18.4s, v6.4s, v1.s[2]      \n"
                    "add    %2, %2, #8                  \n"
                    "fmla   v19.4s, v6.4s, v1.s[3]      \n"

                    "ins    v4.d[0], x24                \n"
                    "ins    v2.d[0], x22                \n"
                    "fmla   v20.4s, v7.4s, v3.s[0]      \n"
                    "nop                                \n"
                    "fmla   v21.4s, v7.4s, v3.s[1]      \n"
                    "nop                                \n"
                    "fmla   v22.4s, v7.4s, v3.s[2]      \n"

                    "shll   v4.4s, v4.4h, #16           \n"
                    "fmla   v23.4s, v7.4s, v3.s[3]      \n"
                    "ldr    x25, [%1]                   \n"
                    "fmla   v24.4s, v7.4s, v0.s[0]      \n"
                    "add    %1, %1, #8                  \n"
                    "fmla   v25.4s, v7.4s, v0.s[1]      \n"

                    "shll   v2.4s, v2.4h, #16           \n"
                    "fmla   v26.4s, v7.4s, v0.s[2]      \n"
                    "nop                                \n"
                    "fmla   v27.4s, v7.4s, v0.s[3]      \n"
                    "nop                                \n"
                    "fmla   v28.4s, v7.4s, v1.s[0]      \n"

                    // v3/v0 are dead for step 1 from here on, refill them for step 2
                    "ins    v3.d[0], x23                \n"
                    "ins    v0.d[0], x20                \n"
                    "fmla   v29.4s, v7.4s, v1.s[1]      \n"
                    "nop                                \n"
                    "fmla   v30.4s, v7.4s, v1.s[2]      \n"
                    "nop                                \n"
                    "fmla   v31.4s, v7.4s, v1.s[3]      \n"

                    // k step 2: A = v4/v5, B = v2/v3/v0; operands of step 3 are fetched
                    // into x26/x27 (A) and x21/x22/x23 (B)
                    "shll   v3.4s, v3.4h, #16           \n"
                    "fmla   v8.4s, v4.4s, v2.s[0]       \n"
                    "ldr    x26, [%1]                   \n"
                    "fmla   v9.4s, v4.4s, v2.s[1]       \n"
                    "add    %1, %1, #8                  \n"
                    "fmla   v10.4s, v4.4s, v2.s[2]      \n"

                    "shll   v0.4s, v0.4h, #16           \n"
                    "fmla   v11.4s, v4.4s, v2.s[3]      \n"
                    "ldr    x21, [%2]                   \n"
                    "fmla   v12.4s, v4.4s, v3.s[0]      \n"
                    "add    %2, %2, #8                  \n"
                    "fmla   v13.4s, v4.4s, v3.s[1]      \n"

                    "nop                                \n"
                    "ins    v5.d[0], x25                \n"
                    "fmla   v14.4s, v4.4s, v3.s[2]      \n"
                    "ldr    x22, [%2]                   \n"
                    "fmla   v15.4s, v4.4s, v3.s[3]      \n"
                    "add    %2, %2, #8                  \n"
                    "fmla   v16.4s, v4.4s, v0.s[0]      \n"

                    "shll   v5.4s, v5.4h, #16           \n"
                    "fmla   v17.4s, v4.4s, v0.s[1]      \n"
                    "ldr    x23, [%2]                   \n"
                    "fmla   v18.4s, v4.4s, v0.s[2]      \n"
                    "add    %2, %2, #8                  \n"
                    "fmla   v19.4s, v4.4s, v0.s[3]      \n"

                    "ins    v6.d[0], x26                \n"
                    "ins    v1.d[0], x21                \n"
                    "fmla   v20.4s, v5.4s, v2.s[0]      \n"
                    "prfm   pldl1keep, [%2, #384]       \n" // NOTE PRELOAD
                    "fmla   v21.4s, v5.4s, v2.s[1]      \n"
                    "nop                                \n"
                    "fmla   v22.4s, v5.4s, v2.s[2]      \n"

                    "shll   v6.4s, v6.4h, #16           \n"
                    "fmla   v23.4s, v5.4s, v2.s[3]      \n"
                    "ldr    x27, [%1]                   \n"
                    "fmla   v24.4s, v5.4s, v3.s[0]      \n"
                    "add    %1, %1, #8                  \n"
                    "fmla   v25.4s, v5.4s, v3.s[1]      \n"

                    "shll   v1.4s, v1.4h, #16           \n"
                    "fmla   v26.4s, v5.4s, v3.s[2]      \n"
                    "prfm   pldl1keep, [%1, #256]       \n" // NOTE PRELOAD
                    "fmla   v27.4s, v5.4s, v3.s[3]      \n"
                    "nop                                \n"
                    "fmla   v28.4s, v5.4s, v0.s[0]      \n"

                    // v2/v3 are dead for step 2 from here on, refill them for step 3
                    "ins    v2.d[0], x22                \n"
                    "ins    v3.d[0], x23                \n"
                    "fmla   v29.4s, v5.4s, v0.s[1]      \n"
                    "nop                                \n"
                    "fmla   v30.4s, v5.4s, v0.s[2]      \n"
                    "nop                                \n"
                    "fmla   v31.4s, v5.4s, v0.s[3]      \n"

                    // k step 3: A = v6/v7, B = v1/v2/v3; operands of the next iteration's
                    // step 0 are fetched into x24/x25 (A) and x20/x21/x22 (B)
                    "shll   v2.4s, v2.4h, #16           \n"
                    "fmla   v8.4s, v6.4s, v1.s[0]       \n"
                    "ldr    x24, [%1]                   \n"
                    "fmla   v9.4s, v6.4s, v1.s[1]       \n"
                    "add    %1, %1, #8                  \n"
                    "fmla   v10.4s, v6.4s, v1.s[2]      \n"

                    "shll   v3.4s, v3.4h, #16           \n"
                    "fmla   v11.4s, v6.4s, v1.s[3]      \n"
                    "ldr    x20, [%2]                   \n"
                    "fmla   v12.4s, v6.4s, v2.s[0]      \n"
                    "add    %2, %2, #8                  \n"
                    "fmla   v13.4s, v6.4s, v2.s[1]      \n"

                    "nop                                \n"
                    "ins    v7.d[0], x27                \n"
                    "fmla   v14.4s, v6.4s, v2.s[2]      \n"
                    "ldr    x21, [%2]                   \n"
                    "fmla   v15.4s, v6.4s, v2.s[3]      \n"
                    "add    %2, %2, #8                  \n"
                    "fmla   v16.4s, v6.4s, v3.s[0]      \n"

                    "shll   v7.4s, v7.4h, #16           \n"
                    "fmla   v17.4s, v6.4s, v3.s[1]      \n"
                    "ldr    x22, [%2]                   \n"
                    "fmla   v18.4s, v6.4s, v3.s[2]      \n"
                    "add    %2, %2, #8                  \n"
                    "fmla   v19.4s, v6.4s, v3.s[3]      \n"

                    "ins    v4.d[0], x24                \n"
                    "ins    v0.d[0], x20                \n"
                    "fmla   v20.4s, v7.4s, v1.s[0]      \n"
                    "nop                                \n"
                    "fmla   v21.4s, v7.4s, v1.s[1]      \n"
                    "nop                                \n"
                    "fmla   v22.4s, v7.4s, v1.s[2]      \n"

                    "shll   v4.4s, v4.4h, #16           \n"
                    "fmla   v23.4s, v7.4s, v1.s[3]      \n"
                    "ldr    x25, [%1]                   \n"
                    "fmla   v24.4s, v7.4s, v2.s[0]      \n"
                    "add    %1, %1, #8                  \n"
                    "fmla   v25.4s, v7.4s, v2.s[1]      \n"

                    "shll   v0.4s, v0.4h, #16           \n"
                    "fmla   v26.4s, v7.4s, v2.s[2]      \n"
                    "nop                                \n"
                    "fmla   v27.4s, v7.4s, v2.s[3]      \n"
                    "nop                                \n"
                    "fmla   v28.4s, v7.4s, v3.s[0]      \n"

                    // v1/v2 are dead for step 3 from here on; together with v4/x25/v0
                    // this restores the register state expected at label 4
                    "ins    v1.d[0], x21                \n"
                    "ins    v2.d[0], x22                \n"
                    "fmla   v29.4s, v7.4s, v3.s[1]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v30.4s, v7.4s, v3.s[2]      \n"
                    "nop                                \n"
                    "fmla   v31.4s, v7.4s, v3.s[3]      \n"

                    "bne    4b                          \n"

                    // the loop always fetched one k step ahead (16 bytes of pA, 24 bytes
                    // of pB) that was never consumed: rewind both pointers
                    "sub    %1, %1, #16                 \n"
                    "sub    %2, %2, #24                 \n"

                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    // remainder loop: one k step per iteration, plain vector loads
                    "6:                                 \n"
                    "ld1    {v0.4h, v1.4h, v2.4h}, [%2], #24 \n"

                    "shll   v0.4s, v0.4h, #16           \n"
                    "shll   v1.4s, v1.4h, #16           \n"
                    "shll   v2.4s, v2.4h, #16           \n"

                    "ld1    {v4.4h, v5.4h}, [%1], #16   \n"

                    "shll   v4.4s, v4.4h, #16           \n"
                    "shll   v5.4s, v5.4h, #16           \n"

                    "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                    "fmla   v9.4s, v4.4s, v0.s[1]       \n"
                    "fmla   v10.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v11.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v12.4s, v4.4s, v1.s[0]      \n"
                    "fmla   v13.4s, v4.4s, v1.s[1]      \n"
                    "fmla   v14.4s, v4.4s, v1.s[2]      \n"
                    "fmla   v15.4s, v4.4s, v1.s[3]      \n"
                    "fmla   v16.4s, v4.4s, v2.s[0]      \n"
                    "fmla   v17.4s, v4.4s, v2.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v2.s[2]      \n"
                    "fmla   v19.4s, v4.4s, v2.s[3]      \n"

                    "subs   w4, w4, #1                  \n"

                    "fmla   v20.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v21.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v22.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v23.4s, v5.4s, v0.s[3]      \n"
                    "fmla   v24.4s, v5.4s, v1.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v1.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v1.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v1.s[3]      \n"
                    "fmla   v28.4s, v5.4s, v2.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v2.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v2.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v2.s[3]      \n"

                    "bne    6b                          \n"

                    "7:                                 \n"
                    // Test k_end BEFORE narrowing: the narrowing below writes v0-v11,
                    // and v8-v11 are still live fp32 accumulators that label 10 must
                    // store unchanged when this is not the last k tile.
                    "tst    %w11, #255                  \n"
                    "beq    10f                         \n"

                    // final k tile: keep the high 16 bits of every fp32 lane (truncating)
                    //   v0 - v5  <- v8  - v19 (A lanes 0-3, one column per 4 halfwords)
                    //   v6 - v11 <- v20 - v31 (A lanes 4-7)
                    "shrn   v0.4h, v8.4s, #16           \n"
                    "shrn2  v0.8h, v9.4s, #16           \n"
                    "shrn   v1.4h, v10.4s, #16          \n"
                    "shrn2  v1.8h, v11.4s, #16          \n"
                    "shrn   v2.4h, v12.4s, #16          \n"
                    "shrn2  v2.8h, v13.4s, #16          \n"
                    "shrn   v3.4h, v14.4s, #16          \n"
                    "shrn2  v3.8h, v15.4s, #16          \n"
                    "shrn   v4.4h, v16.4s, #16          \n"
                    "shrn2  v4.8h, v17.4s, #16          \n"
                    "shrn   v5.4h, v18.4s, #16          \n"
                    "shrn2  v5.8h, v19.4s, #16          \n"
                    "shrn   v6.4h, v20.4s, #16          \n"
                    "shrn2  v6.8h, v21.4s, #16          \n"
                    "shrn   v7.4h, v22.4s, #16          \n"
                    "shrn2  v7.8h, v23.4s, #16          \n"
                    "shrn   v8.4h, v24.4s, #16          \n"
                    "shrn2  v8.8h, v25.4s, #16          \n"
                    "shrn   v9.4h, v26.4s, #16          \n"
                    "shrn2  v9.8h, v27.4s, #16          \n"
                    "shrn   v10.4h, v28.4s, #16         \n"
                    "shrn2  v10.8h, v29.4s, #16         \n"
                    "shrn   v11.4h, v30.4s, #16         \n"
                    "shrn2  v11.8h, v31.4s, #16         \n"

                    // if out_elempack == 4
                    "cmp    %w12, #4                    \n"
                    "bne    8f                          \n"

                    // x4 = outptr0 + out_hstep * 4 elements (2 bytes each): second
                    // group of 4 packed rows; each group is 12 columns * 4 * 2 = 96 bytes
                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 1          \n"
                    "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
                    "st1    {v4.8h, v5.8h}, [%3], #32 \n"
                    "st1    {v6.8h, v7.8h, v8.8h, v9.8h}, [x4], #64 \n"
                    "st1    {v10.8h, v11.8h}, [x4]      \n"
                    "b      9f                          \n"

                    // if out_elempack == 1
                    "8:                                 \n"
                    // transpose8x12
                    "uzp1   v20.8h, v0.8h, v1.8h        \n"
                    "uzp2   v21.8h, v0.8h, v1.8h        \n"
                    "uzp1   v22.8h, v2.8h, v3.8h        \n"
                    "uzp2   v23.8h, v2.8h, v3.8h        \n"
                    "uzp1   v24.8h, v4.8h, v5.8h        \n"
                    "uzp2   v25.8h, v4.8h, v5.8h        \n"
                    "uzp1   v26.8h, v6.8h, v7.8h        \n"
                    "uzp2   v27.8h, v6.8h, v7.8h        \n"
                    "uzp1   v28.8h, v8.8h, v9.8h        \n"
                    "uzp2   v29.8h, v8.8h, v9.8h        \n"
                    "uzp1   v30.8h, v10.8h, v11.8h      \n"
                    "uzp2   v31.8h, v10.8h, v11.8h      \n"

                    // rows 0-3: each row ends up as three 4h registers (12 columns)
                    "uzp1   v0.8h, v20.8h, v22.8h       \n"
                    "uzp2   v6.8h, v20.8h, v22.8h       \n"
                    "uzp1   v3.8h, v21.8h, v23.8h       \n"
                    "uzp2   v9.8h, v21.8h, v23.8h       \n"
                    "mov    v1.d[0], v0.d[1]            \n"
                    "mov    v7.d[0], v6.d[1]            \n"
                    "mov    v4.d[0], v3.d[1]            \n"
                    "mov    v10.d[0], v9.d[1]           \n"
                    "uzp1   v2.8h, v24.8h, v24.8h       \n"
                    "uzp2   v8.8h, v24.8h, v24.8h       \n"
                    "uzp1   v5.8h, v25.8h, v25.8h       \n"
                    "uzp2   v11.8h, v25.8h, v25.8h      \n"

                    // rows 4-7
                    "uzp1   v12.8h, v26.8h, v28.8h      \n"
                    "uzp2   v18.8h, v26.8h, v28.8h      \n"
                    "uzp1   v15.8h, v27.8h, v29.8h      \n"
                    "uzp2   v21.8h, v27.8h, v29.8h      \n"
                    "mov    v13.d[0], v12.d[1]          \n"
                    "mov    v19.d[0], v18.d[1]          \n"
                    "mov    v16.d[0], v15.d[1]          \n"
                    "mov    v22.d[0], v21.d[1]          \n"
                    "uzp1   v14.8h, v30.8h, v30.8h      \n"
                    "uzp2   v20.8h, v30.8h, v30.8h      \n"
                    "uzp1   v17.8h, v31.8h, v31.8h      \n"
                    "uzp2   v23.8h, v31.8h, v31.8h      \n"

                    // store 8 rows of 12 halfwords (24 bytes); x4 walks the rows in
                    // steps of out_hstep elements, %3 only advances past row 0
                    "add    x4, %3, %w13, sxtw 1        \n"
                    "st1    {v0.4h, v1.4h, v2.4h}, [%3], #24 \n"
                    "st1    {v3.4h, v4.4h, v5.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v6.4h, v7.4h, v8.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v9.4h, v10.4h, v11.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v12.4h, v13.4h, v14.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v15.4h, v16.4h, v17.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v18.4h, v19.4h, v20.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v21.4h, v22.4h, v23.4h}, [x4] \n"

                    // final output written: just step outptr past this tile's
                    // 96 * 4 bytes of fp32 scratch
                    "9:                                 \n"
                    "add    %0, %0, #384                \n"
                    "b      11f                         \n"

                    // not the last k tile: spill all 24 fp32 accumulators to outptr
                    "10:                                \n"
                    "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                    "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    "11:                                \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            else
            {
                // Generic AArch64 kernel for one 8x12 output tile.
                //
                // Operands: %0 outptr (fp32 scratch), %1 pA, %2 pB, %3 outptr0 (16-bit output),
                //           %8 pC, %9 max_kk, %10 k, %11 k_end, %12 out_elempack, %13 out_hstep.
                //
                // Register use:
                //   v8-v19  accumulate the first 4 values of each pA step against 12 pB values
                //   v20-v31 accumulate the second 4 values of each pA step against 12 pB values
                //   v0-v7   hold the widened pA / pB inputs, later the narrowed results
                //
                // 16-bit inputs are widened with "shll #16" (value placed in the upper half of
                // each 32-bit lane) and results are narrowed with "shrn #16" (upper 16 bits kept),
                // the counterpart of bfloat2float / float2bfloat used by the intrinsics fallback.
                asm volatile(
                    // k != 0: resume from the fp32 partial sums saved in outptr (24 x 16 bytes)
                    "cbz    %w10, 0f                    \n"

                    "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                    "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    // undo the five post-increments so %0 points at the tile start again
                    "subs   %0, %0, #320                \n"
                    "b      3f                          \n"

                    "0:                                 \n"
                    // if pC
                    "cbz    %8, 1f                      \n"

                    // v8 = pC[0..3], v20 = pC[4..7]
                    "add    x4, %8, #16                 \n"
                    "ld1    {v8.4s}, [%8]               \n"
                    "ld1    {v20.4s}, [x4]              \n"
                    "b      2f                          \n"

                    // else
                    "1:                                 \n"
                    "eor    v8.16b, v8.16b, v8.16b      \n"
                    "eor    v20.16b, v20.16b, v20.16b   \n"

                    // replicate the two initial vectors over all 12 accumulators of each half
                    "2:                                 \n"
                    "mov    v9.16b, v8.16b              \n"
                    "mov    v10.16b, v8.16b             \n"
                    "mov    v11.16b, v8.16b             \n"
                    "mov    v12.16b, v8.16b             \n"
                    "mov    v13.16b, v8.16b             \n"
                    "mov    v14.16b, v8.16b             \n"
                    "mov    v15.16b, v8.16b             \n"
                    "mov    v16.16b, v8.16b             \n"
                    "mov    v17.16b, v8.16b             \n"
                    "mov    v18.16b, v8.16b             \n"
                    "mov    v19.16b, v8.16b             \n"

                    "mov    v21.16b, v20.16b            \n"
                    "mov    v22.16b, v20.16b            \n"
                    "mov    v23.16b, v20.16b            \n"
                    "mov    v24.16b, v20.16b            \n"
                    "mov    v25.16b, v20.16b            \n"
                    "mov    v26.16b, v20.16b            \n"
                    "mov    v27.16b, v20.16b            \n"
                    "mov    v28.16b, v20.16b            \n"
                    "mov    v29.16b, v20.16b            \n"
                    "mov    v30.16b, v20.16b            \n"
                    "mov    v31.16b, v20.16b            \n"

                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // main loop: 4 k steps per iteration
                    // (2 loads of 16 values from pA, 3 loads of 16 values from pB)
                    "4:                                 \n"
                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%1], #32 \n"

                    "shll   v4.4s, v4.4h, #16           \n"
                    "shll   v5.4s, v5.4h, #16           \n"
                    "shll   v6.4s, v6.4h, #16           \n"
                    "shll   v7.4s, v7.4h, #16           \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2], #32 \n"

                    "shll   v0.4s, v0.4h, #16           \n"
                    "shll   v1.4s, v1.4h, #16           \n"
                    "shll   v2.4s, v2.4h, #16           \n"
                    "shll   v3.4s, v3.4h, #16           \n"

                    // k step 0: A = v4/v5, B = v0, v1, v2
                    "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                    "fmla   v9.4s, v4.4s, v0.s[1]       \n"
                    "fmla   v10.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v11.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v20.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v21.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v22.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v23.4s, v5.4s, v0.s[3]      \n"

                    "fmla   v12.4s, v4.4s, v1.s[0]      \n"
                    "fmla   v13.4s, v4.4s, v1.s[1]      \n"
                    "fmla   v14.4s, v4.4s, v1.s[2]      \n"
                    "fmla   v15.4s, v4.4s, v1.s[3]      \n"
                    "fmla   v24.4s, v5.4s, v1.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v1.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v1.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v1.s[3]      \n"

                    "fmla   v16.4s, v4.4s, v2.s[0]      \n"
                    "fmla   v17.4s, v4.4s, v2.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v2.s[2]      \n"
                    "fmla   v19.4s, v4.4s, v2.s[3]      \n"
                    "fmla   v28.4s, v5.4s, v2.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v2.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v2.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v2.s[3]      \n"

                    // k step 1: A = v6/v7, B = v3, then v0, v1 of the next pB load
                    "fmla   v8.4s, v6.4s, v3.s[0]       \n"
                    "fmla   v9.4s, v6.4s, v3.s[1]       \n"
                    "fmla   v10.4s, v6.4s, v3.s[2]      \n"
                    "fmla   v11.4s, v6.4s, v3.s[3]      \n"
                    "fmla   v20.4s, v7.4s, v3.s[0]      \n"
                    "fmla   v21.4s, v7.4s, v3.s[1]      \n"
                    "fmla   v22.4s, v7.4s, v3.s[2]      \n"
                    "fmla   v23.4s, v7.4s, v3.s[3]      \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2], #32 \n"

                    "shll   v0.4s, v0.4h, #16           \n"
                    "shll   v1.4s, v1.4h, #16           \n"
                    "shll   v2.4s, v2.4h, #16           \n"
                    "shll   v3.4s, v3.4h, #16           \n"

                    "fmla   v12.4s, v6.4s, v0.s[0]      \n"
                    "fmla   v13.4s, v6.4s, v0.s[1]      \n"
                    "fmla   v14.4s, v6.4s, v0.s[2]      \n"
                    "fmla   v15.4s, v6.4s, v0.s[3]      \n"
                    "fmla   v24.4s, v7.4s, v0.s[0]      \n"
                    "fmla   v25.4s, v7.4s, v0.s[1]      \n"
                    "fmla   v26.4s, v7.4s, v0.s[2]      \n"
                    "fmla   v27.4s, v7.4s, v0.s[3]      \n"

                    "fmla   v16.4s, v6.4s, v1.s[0]      \n"
                    "fmla   v17.4s, v6.4s, v1.s[1]      \n"
                    "fmla   v18.4s, v6.4s, v1.s[2]      \n"
                    "fmla   v19.4s, v6.4s, v1.s[3]      \n"
                    "fmla   v28.4s, v7.4s, v1.s[0]      \n"
                    "fmla   v29.4s, v7.4s, v1.s[1]      \n"
                    "fmla   v30.4s, v7.4s, v1.s[2]      \n"
                    "fmla   v31.4s, v7.4s, v1.s[3]      \n"

                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%1], #32 \n"

                    "shll   v4.4s, v4.4h, #16           \n"
                    "shll   v5.4s, v5.4h, #16           \n"
                    "shll   v6.4s, v6.4h, #16           \n"
                    "shll   v7.4s, v7.4h, #16           \n"

                    // k step 2: A = v4/v5, B = v2, v3, then v0 of the next pB load
                    "fmla   v8.4s, v4.4s, v2.s[0]       \n"
                    "fmla   v9.4s, v4.4s, v2.s[1]       \n"
                    "fmla   v10.4s, v4.4s, v2.s[2]      \n"
                    "fmla   v11.4s, v4.4s, v2.s[3]      \n"
                    "fmla   v20.4s, v5.4s, v2.s[0]      \n"
                    "fmla   v21.4s, v5.4s, v2.s[1]      \n"
                    "fmla   v22.4s, v5.4s, v2.s[2]      \n"
                    "fmla   v23.4s, v5.4s, v2.s[3]      \n"

                    "fmla   v12.4s, v4.4s, v3.s[0]      \n"
                    "fmla   v13.4s, v4.4s, v3.s[1]      \n"
                    "fmla   v14.4s, v4.4s, v3.s[2]      \n"
                    "fmla   v15.4s, v4.4s, v3.s[3]      \n"
                    "fmla   v24.4s, v5.4s, v3.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v3.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v3.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v3.s[3]      \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2], #32 \n"

                    "shll   v0.4s, v0.4h, #16           \n"
                    "shll   v1.4s, v1.4h, #16           \n"
                    "shll   v2.4s, v2.4h, #16           \n"
                    "shll   v3.4s, v3.4h, #16           \n"

                    "fmla   v16.4s, v4.4s, v0.s[0]      \n"
                    "fmla   v17.4s, v4.4s, v0.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v19.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v28.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v0.s[3]      \n"

                    // k step 3: A = v6/v7, B = v1, v2, v3
                    "fmla   v8.4s, v6.4s, v1.s[0]       \n"
                    "fmla   v9.4s, v6.4s, v1.s[1]       \n"
                    "fmla   v10.4s, v6.4s, v1.s[2]      \n"
                    "fmla   v11.4s, v6.4s, v1.s[3]      \n"
                    "fmla   v20.4s, v7.4s, v1.s[0]      \n"
                    "fmla   v21.4s, v7.4s, v1.s[1]      \n"
                    "fmla   v22.4s, v7.4s, v1.s[2]      \n"
                    "fmla   v23.4s, v7.4s, v1.s[3]      \n"

                    "fmla   v12.4s, v6.4s, v2.s[0]      \n"
                    "fmla   v13.4s, v6.4s, v2.s[1]      \n"
                    "fmla   v14.4s, v6.4s, v2.s[2]      \n"
                    "fmla   v15.4s, v6.4s, v2.s[3]      \n"
                    "fmla   v24.4s, v7.4s, v2.s[0]      \n"
                    "fmla   v25.4s, v7.4s, v2.s[1]      \n"
                    "fmla   v26.4s, v7.4s, v2.s[2]      \n"
                    "fmla   v27.4s, v7.4s, v2.s[3]      \n"

                    // loop counter is decremented early; fmla does not touch the flags
                    "subs   w4, w4, #1                  \n"

                    "fmla   v16.4s, v6.4s, v3.s[0]      \n"
                    "fmla   v17.4s, v6.4s, v3.s[1]      \n"
                    "fmla   v18.4s, v6.4s, v3.s[2]      \n"
                    "fmla   v19.4s, v6.4s, v3.s[3]      \n"
                    "fmla   v28.4s, v7.4s, v3.s[0]      \n"
                    "fmla   v29.4s, v7.4s, v3.s[1]      \n"
                    "fmla   v30.4s, v7.4s, v3.s[2]      \n"
                    "fmla   v31.4s, v7.4s, v3.s[3]      \n"

                    "bne    4b                          \n"

                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    // tail loop: one k step per iteration (12 values from pB, 8 from pA)
                    "6:                                 \n"
                    "ld1    {v0.4h, v1.4h, v2.4h}, [%2], #24 \n"

                    "shll   v0.4s, v0.4h, #16           \n"
                    "shll   v1.4s, v1.4h, #16           \n"
                    "shll   v2.4s, v2.4h, #16           \n"

                    "ld1    {v4.4h, v5.4h}, [%1], #16   \n"

                    "shll   v4.4s, v4.4h, #16           \n"
                    "shll   v5.4s, v5.4h, #16           \n"

                    "fmla   v8.4s, v4.4s, v0.s[0]       \n"
                    "fmla   v9.4s, v4.4s, v0.s[1]       \n"
                    "fmla   v10.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v11.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v12.4s, v4.4s, v1.s[0]      \n"
                    "fmla   v13.4s, v4.4s, v1.s[1]      \n"
                    "fmla   v14.4s, v4.4s, v1.s[2]      \n"
                    "fmla   v15.4s, v4.4s, v1.s[3]      \n"
                    "fmla   v16.4s, v4.4s, v2.s[0]      \n"
                    "fmla   v17.4s, v4.4s, v2.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v2.s[2]      \n"
                    "fmla   v19.4s, v4.4s, v2.s[3]      \n"

                    "subs   w4, w4, #1                  \n"

                    "fmla   v20.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v21.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v22.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v23.4s, v5.4s, v0.s[3]      \n"
                    "fmla   v24.4s, v5.4s, v1.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v1.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v1.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v1.s[3]      \n"
                    "fmla   v28.4s, v5.4s, v2.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v2.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v2.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v2.s[3]      \n"

                    "bne    6b                          \n"

                    "7:                                 \n"
                    // Check k_end BEFORE narrowing: the narrowing below writes v8-v11, and the
                    // fp32 write-back at label 10 must still see the untouched accumulators.
                    "tst    %w11, #255                  \n"
                    "beq    10f                         \n"

                    // k_end: narrow the 24 fp32 accumulators to 16-bit values in v0-v11
                    "shrn   v0.4h, v8.4s, #16           \n"
                    "shrn2  v0.8h, v9.4s, #16           \n"
                    "shrn   v1.4h, v10.4s, #16          \n"
                    "shrn2  v1.8h, v11.4s, #16          \n"
                    "shrn   v2.4h, v12.4s, #16          \n"
                    "shrn2  v2.8h, v13.4s, #16          \n"
                    "shrn   v3.4h, v14.4s, #16          \n"
                    "shrn2  v3.8h, v15.4s, #16          \n"
                    "shrn   v4.4h, v16.4s, #16          \n"
                    "shrn2  v4.8h, v17.4s, #16          \n"
                    "shrn   v5.4h, v18.4s, #16          \n"
                    "shrn2  v5.8h, v19.4s, #16          \n"
                    "shrn   v6.4h, v20.4s, #16          \n"
                    "shrn2  v6.8h, v21.4s, #16          \n"
                    "shrn   v7.4h, v22.4s, #16          \n"
                    "shrn2  v7.8h, v23.4s, #16          \n"
                    "shrn   v8.4h, v24.4s, #16          \n"
                    "shrn2  v8.8h, v25.4s, #16          \n"
                    "shrn   v9.4h, v26.4s, #16          \n"
                    "shrn2  v9.8h, v27.4s, #16          \n"
                    "shrn   v10.4h, v28.4s, #16         \n"
                    "shrn2  v10.8h, v29.4s, #16         \n"
                    "shrn   v11.4h, v30.4s, #16         \n"
                    "shrn2  v11.8h, v31.4s, #16         \n"

                    // if out_elempack == 4
                    "cmp    %w12, #4                    \n"
                    "bne    8f                          \n"

                    // x4 = outptr0 + out_hstep * 4 elements (2 bytes each): second group of 4 rows
                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 1          \n"
                    "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
                    "st1    {v4.8h, v5.8h}, [%3], #32 \n"
                    "st1    {v6.8h, v7.8h, v8.8h, v9.8h}, [x4], #64 \n"
                    "st1    {v10.8h, v11.8h}, [x4]      \n"
                    "b      9f                          \n"

                    // if out_elempack == 1
                    "8:                                 \n"
                    // transpose8x12
                    "uzp1   v20.8h, v0.8h, v1.8h        \n"
                    "uzp2   v21.8h, v0.8h, v1.8h        \n"
                    "uzp1   v22.8h, v2.8h, v3.8h        \n"
                    "uzp2   v23.8h, v2.8h, v3.8h        \n"
                    "uzp1   v24.8h, v4.8h, v5.8h        \n"
                    "uzp2   v25.8h, v4.8h, v5.8h        \n"
                    "uzp1   v26.8h, v6.8h, v7.8h        \n"
                    "uzp2   v27.8h, v6.8h, v7.8h        \n"
                    "uzp1   v28.8h, v8.8h, v9.8h        \n"
                    "uzp2   v29.8h, v8.8h, v9.8h        \n"
                    "uzp1   v30.8h, v10.8h, v11.8h      \n"
                    "uzp2   v31.8h, v10.8h, v11.8h      \n"

                    // rows 0-3: each row ends up as three 4h registers (12 values)
                    "uzp1   v0.8h, v20.8h, v22.8h       \n"
                    "uzp2   v6.8h, v20.8h, v22.8h       \n"
                    "uzp1   v3.8h, v21.8h, v23.8h       \n"
                    "uzp2   v9.8h, v21.8h, v23.8h       \n"
                    "mov    v1.d[0], v0.d[1]            \n"
                    "mov    v7.d[0], v6.d[1]            \n"
                    "mov    v4.d[0], v3.d[1]            \n"
                    "mov    v10.d[0], v9.d[1]           \n"
                    "uzp1   v2.8h, v24.8h, v24.8h       \n"
                    "uzp2   v8.8h, v24.8h, v24.8h       \n"
                    "uzp1   v5.8h, v25.8h, v25.8h       \n"
                    "uzp2   v11.8h, v25.8h, v25.8h      \n"

                    // rows 4-7
                    "uzp1   v12.8h, v26.8h, v28.8h      \n"
                    "uzp2   v18.8h, v26.8h, v28.8h      \n"
                    "uzp1   v15.8h, v27.8h, v29.8h      \n"
                    "uzp2   v21.8h, v27.8h, v29.8h      \n"
                    "mov    v13.d[0], v12.d[1]          \n"
                    "mov    v19.d[0], v18.d[1]          \n"
                    "mov    v16.d[0], v15.d[1]          \n"
                    "mov    v22.d[0], v21.d[1]          \n"
                    "uzp1   v14.8h, v30.8h, v30.8h      \n"
                    "uzp2   v20.8h, v30.8h, v30.8h      \n"
                    "uzp1   v17.8h, v31.8h, v31.8h      \n"
                    "uzp2   v23.8h, v31.8h, v31.8h      \n"

                    // one 24-byte store per row; x4 steps by out_hstep elements (2 bytes each)
                    "add    x4, %3, %w13, sxtw 1        \n"
                    "st1    {v0.4h, v1.4h, v2.4h}, [%3], #24 \n"
                    "st1    {v3.4h, v4.4h, v5.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v6.4h, v7.4h, v8.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v9.4h, v10.4h, v11.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v12.4h, v13.4h, v14.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v15.4h, v16.4h, v17.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v18.4h, v19.4h, v20.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v21.4h, v22.4h, v23.4h}, [x4] \n"

                    // k_end paths still advance the fp32 scratch pointer past this tile
                    "9:                                 \n"
                    "add    %0, %0, #384                \n"
                    "b      11f                         \n"

                    // not k_end: save the fp32 partial sums for the next K block
                    "10:                                \n"
                    "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64   \n"
                    "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0], #64 \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    "11:                                \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
#else  // NCNN_GNU_INLINE_ASM
            // NEON intrinsics fallback for one 8x12 tile.
            // Naming: _sumJ0 / _sumJ1 are the accumulators for B value J (0-9, a, b) of each
            // k step, multiplied by the first / second group of 4 values read from pA.
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum20;
            float32x4_t _sum21;
            float32x4_t _sum30;
            float32x4_t _sum31;
            float32x4_t _sum40;
            float32x4_t _sum41;
            float32x4_t _sum50;
            float32x4_t _sum51;
            float32x4_t _sum60;
            float32x4_t _sum61;
            float32x4_t _sum70;
            float32x4_t _sum71;
            float32x4_t _sum80;
            float32x4_t _sum81;
            float32x4_t _sum90;
            float32x4_t _sum91;
            float32x4_t _suma0;
            float32x4_t _suma1;
            float32x4_t _sumb0;
            float32x4_t _sumb1;

            if (k == 0)
            {
                // First K block: start from pC[0..7] replicated over all 12 columns,
                // or from zero when pC is null.
                if (pC)
                {
                    _sum00 = vld1q_f32(pC);
                    _sum01 = vld1q_f32(pC + 4);
                    _sum10 = _sum00;
                    _sum11 = _sum01;
                    _sum20 = _sum00;
                    _sum21 = _sum01;
                    _sum30 = _sum00;
                    _sum31 = _sum01;
                    _sum40 = _sum00;
                    _sum41 = _sum01;
                    _sum50 = _sum00;
                    _sum51 = _sum01;
                    _sum60 = _sum00;
                    _sum61 = _sum01;
                    _sum70 = _sum00;
                    _sum71 = _sum01;
                    _sum80 = _sum00;
                    _sum81 = _sum01;
                    _sum90 = _sum00;
                    _sum91 = _sum01;
                    _suma0 = _sum00;
                    _suma1 = _sum01;
                    _sumb0 = _sum00;
                    _sumb1 = _sum01;
                }
                else
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                    _sum20 = vdupq_n_f32(0.f);
                    _sum21 = vdupq_n_f32(0.f);
                    _sum30 = vdupq_n_f32(0.f);
                    _sum31 = vdupq_n_f32(0.f);
                    _sum40 = vdupq_n_f32(0.f);
                    _sum41 = vdupq_n_f32(0.f);
                    _sum50 = vdupq_n_f32(0.f);
                    _sum51 = vdupq_n_f32(0.f);
                    _sum60 = vdupq_n_f32(0.f);
                    _sum61 = vdupq_n_f32(0.f);
                    _sum70 = vdupq_n_f32(0.f);
                    _sum71 = vdupq_n_f32(0.f);
                    _sum80 = vdupq_n_f32(0.f);
                    _sum81 = vdupq_n_f32(0.f);
                    _sum90 = vdupq_n_f32(0.f);
                    _sum91 = vdupq_n_f32(0.f);
                    _suma0 = vdupq_n_f32(0.f);
                    _suma1 = vdupq_n_f32(0.f);
                    _sumb0 = vdupq_n_f32(0.f);
                    _sumb1 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                // Later K block: resume from the fp32 partial sums in outptr.
                // The order here must mirror the vst1q_f32 sequence in the !k_end branch below.
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
                _sum20 = vld1q_f32(outptr + 4 * 4);
                _sum21 = vld1q_f32(outptr + 4 * 5);
                _sum30 = vld1q_f32(outptr + 4 * 6);
                _sum31 = vld1q_f32(outptr + 4 * 7);
                _sum40 = vld1q_f32(outptr + 4 * 8);
                _sum41 = vld1q_f32(outptr + 4 * 9);
                _sum50 = vld1q_f32(outptr + 4 * 10);
                _sum51 = vld1q_f32(outptr + 4 * 11);
                _sum60 = vld1q_f32(outptr + 4 * 12);
                _sum61 = vld1q_f32(outptr + 4 * 13);
                _sum70 = vld1q_f32(outptr + 4 * 14);
                _sum71 = vld1q_f32(outptr + 4 * 15);
                _sum80 = vld1q_f32(outptr + 4 * 16);
                _sum81 = vld1q_f32(outptr + 4 * 17);
                _sum90 = vld1q_f32(outptr + 4 * 18);
                _sum91 = vld1q_f32(outptr + 4 * 19);
                _suma0 = vld1q_f32(outptr + 4 * 20);
                _suma1 = vld1q_f32(outptr + 4 * 21);
                _sumb0 = vld1q_f32(outptr + 4 * 22);
                _sumb1 = vld1q_f32(outptr + 4 * 23);
            }

            // One k step per iteration: 8 values from pA times 12 values from pB.
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                // bfloat2float is defined elsewhere; presumably widens 4 bf16 values to fp32.
                float32x4_t _pA0 = bfloat2float(vld1_u16(pA));
                float32x4_t _pA1 = bfloat2float(vld1_u16(pA + 4));

                float32x4_t _pB0 = bfloat2float(vld1_u16(pB));
                float32x4_t _pB1 = bfloat2float(vld1_u16(pB + 4));
                float32x4_t _pB2 = bfloat2float(vld1_u16(pB + 8));

                // Each B lane is broadcast against both halves of A.
                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
                _sum40 = vfmaq_laneq_f32(_sum40, _pA0, _pB1, 0);
                _sum41 = vfmaq_laneq_f32(_sum41, _pA1, _pB1, 0);
                _sum50 = vfmaq_laneq_f32(_sum50, _pA0, _pB1, 1);
                _sum51 = vfmaq_laneq_f32(_sum51, _pA1, _pB1, 1);
                _sum60 = vfmaq_laneq_f32(_sum60, _pA0, _pB1, 2);
                _sum61 = vfmaq_laneq_f32(_sum61, _pA1, _pB1, 2);
                _sum70 = vfmaq_laneq_f32(_sum70, _pA0, _pB1, 3);
                _sum71 = vfmaq_laneq_f32(_sum71, _pA1, _pB1, 3);
                _sum80 = vfmaq_laneq_f32(_sum80, _pA0, _pB2, 0);
                _sum81 = vfmaq_laneq_f32(_sum81, _pA1, _pB2, 0);
                _sum90 = vfmaq_laneq_f32(_sum90, _pA0, _pB2, 1);
                _sum91 = vfmaq_laneq_f32(_sum91, _pA1, _pB2, 1);
                _suma0 = vfmaq_laneq_f32(_suma0, _pA0, _pB2, 2);
                _suma1 = vfmaq_laneq_f32(_suma1, _pA1, _pB2, 2);
                _sumb0 = vfmaq_laneq_f32(_sumb0, _pA0, _pB2, 3);
                _sumb1 = vfmaq_laneq_f32(_sumb1, _pA1, _pB2, 3);

                pA += 8;
                pB += 12;
            }

            if (k_end)
            {
                // Last K block: convert to 16-bit and write the final output through outptr0.
                if (out_elempack == 4)
                {
                    // First group of 4 rows: 12 columns of 4 packed values, stored contiguously.
                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum10));
                    vst1_u16(outptr0 + 4 * 2, float2bfloat(_sum20));
                    vst1_u16(outptr0 + 4 * 3, float2bfloat(_sum30));
                    vst1_u16(outptr0 + 4 * 4, float2bfloat(_sum40));
                    vst1_u16(outptr0 + 4 * 5, float2bfloat(_sum50));
                    vst1_u16(outptr0 + 4 * 6, float2bfloat(_sum60));
                    vst1_u16(outptr0 + 4 * 7, float2bfloat(_sum70));
                    vst1_u16(outptr0 + 4 * 8, float2bfloat(_sum80));
                    vst1_u16(outptr0 + 4 * 9, float2bfloat(_sum90));
                    vst1_u16(outptr0 + 4 * 10, float2bfloat(_suma0));
                    vst1_u16(outptr0 + 4 * 11, float2bfloat(_sumb0));

                    // Second group of 4 rows starts out_hstep * 4 elements further on.
                    vst1_u16(outptr0 + out_hstep * 4, float2bfloat(_sum01));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, float2bfloat(_sum11));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 2, float2bfloat(_sum21));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 3, float2bfloat(_sum31));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 4, float2bfloat(_sum41));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 5, float2bfloat(_sum51));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 6, float2bfloat(_sum61));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 7, float2bfloat(_sum71));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 8, float2bfloat(_sum81));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 9, float2bfloat(_sum91));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 10, float2bfloat(_suma1));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 11, float2bfloat(_sumb1));

                    outptr0 += 48;
                }
                if (out_elempack == 1)
                {
                    // Pack each column's 8 values into one uint16x8_t, then transpose.
                    uint16x8_t _t0 = vcombine_u16(float2bfloat(_sum00), float2bfloat(_sum01));
                    uint16x8_t _t1 = vcombine_u16(float2bfloat(_sum10), float2bfloat(_sum11));
                    uint16x8_t _t2 = vcombine_u16(float2bfloat(_sum20), float2bfloat(_sum21));
                    uint16x8_t _t3 = vcombine_u16(float2bfloat(_sum30), float2bfloat(_sum31));
                    uint16x8_t _t4 = vcombine_u16(float2bfloat(_sum40), float2bfloat(_sum41));
                    uint16x8_t _t5 = vcombine_u16(float2bfloat(_sum50), float2bfloat(_sum51));
                    uint16x8_t _t6 = vcombine_u16(float2bfloat(_sum60), float2bfloat(_sum61));
                    uint16x8_t _t7 = vcombine_u16(float2bfloat(_sum70), float2bfloat(_sum71));
                    uint16x8_t _t8 = vcombine_u16(float2bfloat(_sum80), float2bfloat(_sum81));
                    uint16x8_t _t9 = vcombine_u16(float2bfloat(_sum90), float2bfloat(_sum91));
                    uint16x8_t _ta = vcombine_u16(float2bfloat(_suma0), float2bfloat(_suma1));
                    uint16x8_t _tb = vcombine_u16(float2bfloat(_sumb0), float2bfloat(_sumb1));
                    // NOTE(review): transpose8x12_u16 is defined elsewhere. The stores below
                    // assume that afterwards _t0.._tb hold the 8 output rows of 12 values
                    // back to back (row r occupies 1.5 registers) - confirm against the helper.
                    transpose8x12_u16(_t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _ta, _tb);

                    // Three 4-value stores per row; consecutive rows are out_hstep elements apart.
                    vst1_u16(outptr0, vget_low_u16(_t0));
                    vst1_u16(outptr0 + 4, vget_high_u16(_t0));
                    vst1_u16(outptr0 + 8, vget_low_u16(_t1));
                    vst1_u16(outptr0 + out_hstep, vget_high_u16(_t1));
                    vst1_u16(outptr0 + out_hstep + 4, vget_low_u16(_t2));
                    vst1_u16(outptr0 + out_hstep + 8, vget_high_u16(_t2));
                    vst1_u16(outptr0 + out_hstep * 2, vget_low_u16(_t3));
                    vst1_u16(outptr0 + out_hstep * 2 + 4, vget_high_u16(_t3));
                    vst1_u16(outptr0 + out_hstep * 2 + 8, vget_low_u16(_t4));
                    vst1_u16(outptr0 + out_hstep * 3, vget_high_u16(_t4));
                    vst1_u16(outptr0 + out_hstep * 3 + 4, vget_low_u16(_t5));
                    vst1_u16(outptr0 + out_hstep * 3 + 8, vget_high_u16(_t5));
                    vst1_u16(outptr0 + out_hstep * 4, vget_low_u16(_t6));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, vget_high_u16(_t6));
                    vst1_u16(outptr0 + out_hstep * 4 + 8, vget_low_u16(_t7));
                    vst1_u16(outptr0 + out_hstep * 5, vget_high_u16(_t7));
                    vst1_u16(outptr0 + out_hstep * 5 + 4, vget_low_u16(_t8));
                    vst1_u16(outptr0 + out_hstep * 5 + 8, vget_high_u16(_t8));
                    vst1_u16(outptr0 + out_hstep * 6, vget_low_u16(_t9));
                    vst1_u16(outptr0 + out_hstep * 6 + 4, vget_high_u16(_t9));
                    vst1_u16(outptr0 + out_hstep * 6 + 8, vget_low_u16(_ta));
                    vst1_u16(outptr0 + out_hstep * 7, vget_high_u16(_ta));
                    vst1_u16(outptr0 + out_hstep * 7 + 4, vget_low_u16(_tb));
                    vst1_u16(outptr0 + out_hstep * 7 + 8, vget_high_u16(_tb));

                    outptr0 += 12;
                }
            }
            else
            {
                // More K blocks follow: keep the fp32 partial sums in outptr, in the same
                // order the k != 0 branch above reloads them.
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
                vst1q_f32(outptr + 4 * 8, _sum40);
                vst1q_f32(outptr + 4 * 9, _sum41);
                vst1q_f32(outptr + 4 * 10, _sum50);
                vst1q_f32(outptr + 4 * 11, _sum51);
                vst1q_f32(outptr + 4 * 12, _sum60);
                vst1q_f32(outptr + 4 * 13, _sum61);
                vst1q_f32(outptr + 4 * 14, _sum70);
                vst1q_f32(outptr + 4 * 15, _sum71);
                vst1q_f32(outptr + 4 * 16, _sum80);
                vst1q_f32(outptr + 4 * 17, _sum81);
                vst1q_f32(outptr + 4 * 18, _sum90);
                vst1q_f32(outptr + 4 * 19, _sum91);
                vst1q_f32(outptr + 4 * 20, _suma0);
                vst1q_f32(outptr + 4 * 21, _suma1);
                vst1q_f32(outptr + 4 * 22, _sumb0);
                vst1q_f32(outptr + 4 * 23, _sumb1);
            }

            // The scratch pointer advances by one full tile (8 * 12 floats) in every case.
            outptr += 96;
#endif // NCNN_GNU_INLINE_ASM
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            const unsigned short* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            if (use_a53_a55_optimized_kernel && cpu_support_arm_asimdhp())
            {
                // a55
                // 8x8 output tile micro-kernel, hand-scheduled variant taken when
                // use_a53_a55_optimized_kernel && cpu_support_arm_asimdhp() holds.
                // A and B are read as 16-bit values and widened to fp32 with a
                // 16-bit left shift (bf16-style storage); all accumulation is fp32.
                // Accumulator layout:
                //   v16..v23 : A lanes 0-3 multiplied by B element 0..7
                //   v24..v31 : A lanes 4-7 multiplied by B element 0..7
                // Operands: %0 outptr (fp32 partial sums), %1 pA, %2 pB,
                // %3 outptr0 (16-bit output), %8 pC, %9 max_kk, %10 k, %11 k_end,
                // %12 out_elempack, %13 out_hstep. w4/x4 is the only scratch GPR.
                asm volatile(
                    // k == 0 -> initialise accumulators at label 0,
                    // otherwise reload the 64 fp32 partial sums stored at outptr
                    "cbz    %w10, 0f                    \n"

                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    // rewind outptr to the start of the tile (3 post-increments of 64);
                    // the flags set by subs are not consumed ("cc" is clobbered anyway)
                    "subs   %0, %0, #192                \n"
                    "b      3f                          \n"

                    "0:                                 \n"
                    // if pC
                    // pC != NULL: the 8 floats at pC seed every column of the tile
                    "cbz    %8, 1f                      \n"

                    "add    x4, %8, #16                 \n"
                    "ld1    {v16.4s}, [%8]              \n"
                    "ld1    {v24.4s}, [x4]              \n"
                    "b      2f                          \n"

                    // else
                    // pC == NULL: start from zero
                    "1:                                 \n"
                    "eor    v16.16b, v16.16b, v16.16b   \n"
                    "eor    v24.16b, v24.16b, v24.16b   \n"

                    // replicate the initial value of column 0 into columns 1..7
                    "2:                                 \n"
                    "mov    v17.16b, v16.16b            \n"
                    "mov    v18.16b, v16.16b            \n"
                    "mov    v19.16b, v16.16b            \n"
                    "mov    v20.16b, v16.16b            \n"
                    "mov    v21.16b, v16.16b            \n"
                    "mov    v22.16b, v16.16b            \n"
                    "mov    v23.16b, v16.16b            \n"

                    "mov    v25.16b, v24.16b            \n"
                    "mov    v26.16b, v24.16b            \n"
                    "mov    v27.16b, v24.16b            \n"
                    "mov    v28.16b, v24.16b            \n"
                    "mov    v29.16b, v24.16b            \n"
                    "mov    v30.16b, v24.16b            \n"
                    "mov    v31.16b, v24.16b            \n"

                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // pipeline prologue: fetch A for the first k-step (v8, v9) and
                    // B for the first k-step plus half of the second (v0, v1, v2),
                    // i.e. 16 bytes of A and 24 bytes of B ahead of the loop
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v8.4h, v9.4h}, [%1], #16   \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.4h, v1.4h, v2.4h}, [%2], #24 \n"

                    "shll   v0.4s, v0.4h, #16           \n"
                    "shll   v8.4s, v8.4h, #16           \n"

                    // main loop: 4 k-steps per iteration (64 bytes of A, 64 bytes of B).
                    // Loads are issued as 64-bit ldr and interleaved one-by-one with
                    // fmla, and each 16-bit -> fp32 widening (shll) is placed a few
                    // instructions before its first use; presumably this ordering is
                    // tuned for the in-order A53/A55 pipelines.
                    // The tail of each iteration loads and widens v8/v0 (and loads
                    // v9/v1/v2) for the NEXT iteration.
                    ".align 4                           \n"
                    "4:                                 \n"
                    "shll   v1.4s, v1.4h, #16           \n"
                    "fmla   v16.4s, v8.4s, v0.s[0]      \n"
                    "ldr    d10, [%1], #8               \n"
                    "fmla   v17.4s, v8.4s, v0.s[1]      \n"
                    "ldr    d3, [%2], #8                \n"
                    "fmla   v18.4s, v8.4s, v0.s[2]      \n"
                    "ldr    d11, [%1], #8               \n"
                    "fmla   v19.4s, v8.4s, v0.s[3]      \n"
                    "shll   v9.4s, v9.4h, #16           \n"
                    "fmla   v20.4s, v8.4s, v1.s[0]      \n"
                    "ldr    d4, [%2], #8                \n"
                    "fmla   v21.4s, v8.4s, v1.s[1]      \n"
                    "ldr    d12, [%1], #8               \n"
                    "fmla   v22.4s, v8.4s, v1.s[2]      \n"
                    "ldr    d5, [%2], #8                \n"
                    "fmla   v23.4s, v8.4s, v1.s[3]      \n"
                    "shll   v2.4s, v2.4h, #16           \n"
                    "fmla   v24.4s, v9.4s, v0.s[0]      \n"
                    "ldr    d13, [%1], #8               \n"
                    "fmla   v25.4s, v9.4s, v0.s[1]      \n"
                    "ldr    d6, [%2], #8                \n"
                    "fmla   v26.4s, v9.4s, v0.s[2]      \n"
                    "ldr    d14, [%1], #8               \n"
                    "fmla   v27.4s, v9.4s, v0.s[3]      \n"
                    "shll   v10.4s, v10.4h, #16         \n"
                    "fmla   v28.4s, v9.4s, v1.s[0]      \n"
                    "ldr    d7, [%2], #8                \n"
                    "fmla   v29.4s, v9.4s, v1.s[1]      \n"
                    "ldr    d15, [%1], #8               \n"
                    "fmla   v30.4s, v9.4s, v1.s[2]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                    "fmla   v31.4s, v9.4s, v1.s[3]      \n"
                    "shll   v3.4s, v3.4h, #16           \n"
                    "fmla   v16.4s, v10.4s, v2.s[0]     \n"
                    "ldr    d8, [%1], #8                \n"
                    "fmla   v17.4s, v10.4s, v2.s[1]     \n"
                    "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                    "fmla   v18.4s, v10.4s, v2.s[2]     \n"
                    "ldr    d0, [%2], #8                \n"
                    "fmla   v19.4s, v10.4s, v2.s[3]     \n"
                    "shll   v11.4s, v11.4h, #16         \n"
                    "fmla   v20.4s, v10.4s, v3.s[0]     \n"
                    "ldr    d1, [%2], #8                \n"
                    "fmla   v21.4s, v10.4s, v3.s[1]     \n"
                    "ldr    d9, [%1], #8                \n"
                    "fmla   v22.4s, v10.4s, v3.s[2]     \n"
                    "fmla   v23.4s, v10.4s, v3.s[3]     \n"
                    "shll   v4.4s, v4.4h, #16           \n"
                    "fmla   v24.4s, v11.4s, v2.s[0]     \n"
                    "fmla   v25.4s, v11.4s, v2.s[1]     \n"
                    "fmla   v26.4s, v11.4s, v2.s[2]     \n"
                    "fmla   v27.4s, v11.4s, v2.s[3]     \n"
                    "shll   v12.4s, v12.4h, #16         \n"
                    "fmla   v28.4s, v11.4s, v3.s[0]     \n"
                    "ldr    d2, [%2], #8                \n"
                    "fmla   v29.4s, v11.4s, v3.s[1]     \n"
                    "fmla   v30.4s, v11.4s, v3.s[2]     \n"
                    "fmla   v31.4s, v11.4s, v3.s[3]     \n"
                    "shll   v5.4s, v5.4h, #16           \n"
                    "fmla   v16.4s, v12.4s, v4.s[0]     \n"
                    "fmla   v17.4s, v12.4s, v4.s[1]     \n"
                    "fmla   v18.4s, v12.4s, v4.s[2]     \n"
                    "fmla   v19.4s, v12.4s, v4.s[3]     \n"
                    "shll   v13.4s, v13.4h, #16         \n"
                    "fmla   v20.4s, v12.4s, v5.s[0]     \n"
                    "fmla   v21.4s, v12.4s, v5.s[1]     \n"
                    "fmla   v22.4s, v12.4s, v5.s[2]     \n"
                    "fmla   v23.4s, v12.4s, v5.s[3]     \n"
                    "shll   v6.4s, v6.4h, #16           \n"
                    "fmla   v24.4s, v13.4s, v4.s[0]     \n"
                    "fmla   v25.4s, v13.4s, v4.s[1]     \n"
                    "fmla   v26.4s, v13.4s, v4.s[2]     \n"
                    "fmla   v27.4s, v13.4s, v4.s[3]     \n"
                    "shll   v14.4s, v14.4h, #16         \n"
                    "fmla   v28.4s, v13.4s, v5.s[0]     \n"
                    "fmla   v29.4s, v13.4s, v5.s[1]     \n"
                    "fmla   v30.4s, v13.4s, v5.s[2]     \n"
                    "fmla   v31.4s, v13.4s, v5.s[3]     \n"
                    "shll   v7.4s, v7.4h, #16           \n"
                    "fmla   v16.4s, v14.4s, v6.s[0]     \n"
                    "fmla   v17.4s, v14.4s, v6.s[1]     \n"
                    "fmla   v18.4s, v14.4s, v6.s[2]     \n"
                    "fmla   v19.4s, v14.4s, v6.s[3]     \n"
                    "shll   v15.4s, v15.4h, #16         \n"
                    "fmla   v20.4s, v14.4s, v7.s[0]     \n"
                    "fmla   v21.4s, v14.4s, v7.s[1]     \n"
                    "fmla   v22.4s, v14.4s, v7.s[2]     \n"
                    "fmla   v23.4s, v14.4s, v7.s[3]     \n"
                    "shll   v8.4s, v8.4h, #16           \n"
                    "fmla   v24.4s, v15.4s, v6.s[0]     \n"
                    "fmla   v25.4s, v15.4s, v6.s[1]     \n"
                    "fmla   v26.4s, v15.4s, v6.s[2]     \n"
                    "fmla   v27.4s, v15.4s, v6.s[3]     \n"
                    "shll   v0.4s, v0.4h, #16           \n"
                    "fmla   v28.4s, v15.4s, v7.s[0]     \n"
                    "fmla   v29.4s, v15.4s, v7.s[1]     \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v30.4s, v15.4s, v7.s[2]     \n"
                    "fmla   v31.4s, v15.4s, v7.s[3]     \n"
                    "bne    4b                          \n"

                    // undo the look-ahead: the last iteration fetched 16 bytes of A
                    // and 24 bytes of B that were never consumed.
                    // NOTE(review): those bytes are still read from memory, so the
                    // A/B buffers must tolerate a read of up to 16/24 bytes past the
                    // last consumed element - confirm against the allocator/caller.
                    "sub    %1, %1, #16                 \n"
                    "sub    %2, %2, #24                 \n"

                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    // tail loop: one k-step per iteration (8 values of B, 8 of A)
                    "6:                                 \n"
                    "ld1    {v2.8h}, [%2], #16          \n"
                    "shll   v0.4s, v2.4h, #16           \n"
                    "shll2  v1.4s, v2.8h, #16           \n"
                    "ld1    {v3.8h}, [%1], #16          \n"
                    "shll   v4.4s, v3.4h, #16           \n"
                    "shll2  v5.4s, v3.8h, #16           \n"

                    "fmla   v16.4s, v4.4s, v0.s[0]      \n"
                    "fmla   v17.4s, v4.4s, v0.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v19.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v20.4s, v4.4s, v1.s[0]      \n"
                    "fmla   v21.4s, v4.4s, v1.s[1]      \n"
                    "fmla   v22.4s, v4.4s, v1.s[2]      \n"
                    "fmla   v23.4s, v4.4s, v1.s[3]      \n"

                    "subs   w4, w4, #1                  \n"

                    "fmla   v24.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v0.s[3]      \n"
                    "fmla   v28.4s, v5.4s, v1.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v1.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v1.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v1.s[3]      \n"

                    "bne    6b                          \n"

                    // narrow fp32 -> 16-bit by keeping the upper 16 bits of each lane
                    // (truncating shift, no rounding). This is computed before k_end is
                    // tested, so the result is simply unused on the fp32 path (label 10).
                    //   v0..v3 : A lanes 0-3, two B columns per register
                    //   v4..v7 : A lanes 4-7, two B columns per register
                    "7:                                 \n"
                    "shrn   v0.4h, v16.4s, #16          \n"
                    "shrn2  v0.8h, v17.4s, #16          \n"
                    "shrn   v1.4h, v18.4s, #16          \n"
                    "shrn2  v1.8h, v19.4s, #16          \n"
                    "shrn   v2.4h, v20.4s, #16          \n"
                    "shrn2  v2.8h, v21.4s, #16          \n"
                    "shrn   v3.4h, v22.4s, #16          \n"
                    "shrn2  v3.8h, v23.4s, #16          \n"
                    "shrn   v4.4h, v24.4s, #16          \n"
                    "shrn2  v4.8h, v25.4s, #16          \n"
                    "shrn   v5.4h, v26.4s, #16          \n"
                    "shrn2  v5.8h, v27.4s, #16          \n"
                    "shrn   v6.4h, v28.4s, #16          \n"
                    "shrn2  v6.8h, v29.4s, #16          \n"
                    "shrn   v7.4h, v30.4s, #16          \n"
                    "shrn2  v7.8h, v31.4s, #16          \n"
                    // low byte of k_end == 0 -> keep fp32 partial sums (label 10),
                    // otherwise write the final 16-bit result through outptr0
                    "tst    %w11, #255                  \n"
                    "beq    10f                         \n"

                    // if out_elempack == 4
                    // lanes 0-3 go to outptr0 (64 bytes, post-incremented),
                    // lanes 4-7 go to outptr0 + out_hstep * 4 elements
                    "cmp    %w12, #4                    \n"
                    "bne    8f                          \n"

                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 1          \n"
                    "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
                    "st1    {v4.8h, v5.8h, v6.8h, v7.8h}, [x4] \n"
                    "b      9f                          \n"

                    // if out_elempack == 1
                    // (this path is taken for every out_elempack value other than 4)
                    "8:                                 \n"
                    // transpose8x8
                    // two rounds of uzp1/uzp2 regroup the data so that each of
                    // v0..v7 holds one A lane across the 8 B columns
                    "uzp1   v24.8h, v0.8h, v1.8h        \n"
                    "uzp2   v25.8h, v0.8h, v1.8h        \n"
                    "uzp1   v26.8h, v2.8h, v3.8h        \n"
                    "uzp2   v27.8h, v2.8h, v3.8h        \n"
                    "uzp1   v28.8h, v4.8h, v5.8h        \n"
                    "uzp2   v29.8h, v4.8h, v5.8h        \n"
                    "uzp1   v30.8h, v6.8h, v7.8h        \n"
                    "uzp2   v31.8h, v6.8h, v7.8h        \n"

                    "uzp1   v0.8h, v24.8h, v26.8h       \n"
                    "uzp2   v2.8h, v24.8h, v26.8h       \n"
                    "uzp1   v1.8h, v25.8h, v27.8h       \n"
                    "uzp2   v3.8h, v25.8h, v27.8h       \n"

                    "uzp1   v4.8h, v28.8h, v30.8h       \n"
                    "uzp2   v6.8h, v28.8h, v30.8h       \n"
                    "uzp1   v5.8h, v29.8h, v31.8h       \n"
                    "uzp2   v7.8h, v29.8h, v31.8h       \n"

                    // store 8 rows of 8 values; consecutive rows are out_hstep
                    // 16-bit elements apart, outptr0 itself advances by 8 elements
                    "add    x4, %3, %w13, sxtw 1        \n"
                    "st1    {v0.8h}, [%3], #16          \n"
                    "st1    {v1.8h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v2.8h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v3.8h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v4.8h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v5.8h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v6.8h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v7.8h}, [x4]               \n"

                    // nothing was written to outptr on this path, but it is still
                    // advanced by one tile (64 floats) to stay in step with label 10
                    "9:                                 \n"
                    "add    %0, %0, #256                \n"
                    "b      11f                         \n"

                    // not the final k block: save the fp32 accumulators to outptr
                    "10:                                \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    "11:                                \n"

                    // the four pointers are outputs tied to same-numbered inputs, so the
                    // post-incremented values are written back to the C++ variables
                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            else
            {
                // 8x8 output tile micro-kernel, generic AArch64 variant (used when the
                // hand-scheduled A53/A55 branch above is not selected).
                // A and B are read as 16-bit values and widened to fp32 with a
                // 16-bit left shift (bf16-style storage); all accumulation is fp32.
                // Accumulator layout:
                //   v16..v23 : A lanes 0-3 multiplied by B element 0..7
                //   v24..v31 : A lanes 4-7 multiplied by B element 0..7
                // Operands: %0 outptr (fp32 partial sums), %1 pA, %2 pB,
                // %3 outptr0 (16-bit output), %8 pC, %9 max_kk, %10 k, %11 k_end,
                // %12 out_elempack, %13 out_hstep. w4/x4 is the only scratch GPR.
                asm volatile(
                    // k == 0 -> initialise accumulators at label 0,
                    // otherwise reload the 64 fp32 partial sums stored at outptr
                    "cbz    %w10, 0f                    \n"

                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    // rewind outptr to the start of the tile (3 post-increments of 64);
                    // the flags set by subs are not consumed ("cc" is clobbered anyway)
                    "subs   %0, %0, #192                \n"
                    "b      3f                          \n"

                    "0:                                 \n"
                    // if pC
                    // pC != NULL: the 8 floats at pC seed every column of the tile
                    "cbz    %8, 1f                      \n"

                    "add    x4, %8, #16                 \n"
                    "ld1    {v16.4s}, [%8]              \n"
                    "ld1    {v24.4s}, [x4]              \n"
                    "b      2f                          \n"

                    // else
                    // pC == NULL: start from zero
                    "1:                                 \n"
                    "eor    v16.16b, v16.16b, v16.16b   \n"
                    "eor    v24.16b, v24.16b, v24.16b   \n"

                    // replicate the initial value of column 0 into columns 1..7
                    "2:                                 \n"
                    "mov    v17.16b, v16.16b            \n"
                    "mov    v18.16b, v16.16b            \n"
                    "mov    v19.16b, v16.16b            \n"
                    "mov    v20.16b, v16.16b            \n"
                    "mov    v21.16b, v16.16b            \n"
                    "mov    v22.16b, v16.16b            \n"
                    "mov    v23.16b, v16.16b            \n"

                    "mov    v25.16b, v24.16b            \n"
                    "mov    v26.16b, v24.16b            \n"
                    "mov    v27.16b, v24.16b            \n"
                    "mov    v28.16b, v24.16b            \n"
                    "mov    v29.16b, v24.16b            \n"
                    "mov    v30.16b, v24.16b            \n"
                    "mov    v31.16b, v24.16b            \n"

                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // main loop: 4 k-steps per iteration. 64 bytes of B land in v4..v7
                    // and 64 bytes of A in v12..v15; the first two k-steps are widened
                    // into v0..v3 (B) and v8..v11 (A).
                    "4:                                 \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%2], #64 \n"
                    "shll   v0.4s, v4.4h, #16           \n"
                    "shll2  v1.4s, v4.8h, #16           \n"
                    "shll   v2.4s, v5.4h, #16           \n"
                    "shll2  v3.4s, v5.8h, #16           \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%1], #64 \n"
                    "shll   v8.4s, v12.4h, #16          \n"
                    "shll2  v9.4s, v12.8h, #16          \n"
                    "shll   v10.4s, v13.4h, #16         \n"
                    "shll2  v11.4s, v13.8h, #16         \n"
                    "fmla   v16.4s, v8.4s, v0.s[0]      \n"
                    "fmla   v17.4s, v8.4s, v0.s[1]      \n"
                    "fmla   v18.4s, v8.4s, v0.s[2]      \n"
                    "fmla   v19.4s, v8.4s, v0.s[3]      \n"
                    "fmla   v20.4s, v8.4s, v1.s[0]      \n"
                    "fmla   v21.4s, v8.4s, v1.s[1]      \n"
                    "fmla   v22.4s, v8.4s, v1.s[2]      \n"
                    "fmla   v23.4s, v8.4s, v1.s[3]      \n"
                    "fmla   v24.4s, v9.4s, v0.s[0]      \n"
                    "fmla   v25.4s, v9.4s, v0.s[1]      \n"
                    "fmla   v26.4s, v9.4s, v0.s[2]      \n"
                    "fmla   v27.4s, v9.4s, v0.s[3]      \n"
                    "fmla   v28.4s, v9.4s, v1.s[0]      \n"
                    "fmla   v29.4s, v9.4s, v1.s[1]      \n"
                    "fmla   v30.4s, v9.4s, v1.s[2]      \n"
                    "fmla   v31.4s, v9.4s, v1.s[3]      \n"
                    "fmla   v16.4s, v10.4s, v2.s[0]     \n"
                    "fmla   v17.4s, v10.4s, v2.s[1]     \n"
                    "fmla   v18.4s, v10.4s, v2.s[2]     \n"
                    "fmla   v19.4s, v10.4s, v2.s[3]     \n"
                    "fmla   v20.4s, v10.4s, v3.s[0]     \n"
                    "fmla   v21.4s, v10.4s, v3.s[1]     \n"
                    "fmla   v22.4s, v10.4s, v3.s[2]     \n"
                    "fmla   v23.4s, v10.4s, v3.s[3]     \n"
                    "fmla   v24.4s, v11.4s, v2.s[0]     \n"
                    "fmla   v25.4s, v11.4s, v2.s[1]     \n"
                    "fmla   v26.4s, v11.4s, v2.s[2]     \n"
                    "fmla   v27.4s, v11.4s, v2.s[3]     \n"
                    "fmla   v28.4s, v11.4s, v3.s[0]     \n"
                    "fmla   v29.4s, v11.4s, v3.s[1]     \n"
                    "fmla   v30.4s, v11.4s, v3.s[2]     \n"
                    "fmla   v31.4s, v11.4s, v3.s[3]     \n"
                    // widen the remaining two k-steps in place. Order matters:
                    // v4/v5 are produced from v6 before v6 is overwritten from v7,
                    // and v7 (likewise v15) is widened from its own high half last.
                    "shll   v4.4s, v6.4h, #16           \n"
                    "shll2  v5.4s, v6.8h, #16           \n"
                    "shll   v6.4s, v7.4h, #16           \n"
                    "shll2  v7.4s, v7.8h, #16           \n"
                    "shll   v12.4s, v14.4h, #16         \n"
                    "shll2  v13.4s, v14.8h, #16         \n"
                    "shll   v14.4s, v15.4h, #16         \n"
                    "shll2  v15.4s, v15.8h, #16         \n"
                    "fmla   v16.4s, v12.4s, v4.s[0]     \n"
                    "fmla   v17.4s, v12.4s, v4.s[1]     \n"
                    "fmla   v18.4s, v12.4s, v4.s[2]     \n"
                    "fmla   v19.4s, v12.4s, v4.s[3]     \n"
                    "fmla   v20.4s, v12.4s, v5.s[0]     \n"
                    "fmla   v21.4s, v12.4s, v5.s[1]     \n"
                    "fmla   v22.4s, v12.4s, v5.s[2]     \n"
                    "fmla   v23.4s, v12.4s, v5.s[3]     \n"
                    "fmla   v24.4s, v13.4s, v4.s[0]     \n"
                    "fmla   v25.4s, v13.4s, v4.s[1]     \n"
                    "fmla   v26.4s, v13.4s, v4.s[2]     \n"
                    "fmla   v27.4s, v13.4s, v4.s[3]     \n"
                    "fmla   v28.4s, v13.4s, v5.s[0]     \n"
                    "fmla   v29.4s, v13.4s, v5.s[1]     \n"
                    "fmla   v30.4s, v13.4s, v5.s[2]     \n"
                    "fmla   v31.4s, v13.4s, v5.s[3]     \n"
                    "fmla   v16.4s, v14.4s, v6.s[0]     \n"
                    "fmla   v17.4s, v14.4s, v6.s[1]     \n"
                    "fmla   v18.4s, v14.4s, v6.s[2]     \n"
                    "fmla   v19.4s, v14.4s, v6.s[3]     \n"
                    "fmla   v20.4s, v14.4s, v7.s[0]     \n"
                    "fmla   v21.4s, v14.4s, v7.s[1]     \n"
                    "fmla   v22.4s, v14.4s, v7.s[2]     \n"
                    "fmla   v23.4s, v14.4s, v7.s[3]     \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v24.4s, v15.4s, v6.s[0]     \n"
                    "fmla   v25.4s, v15.4s, v6.s[1]     \n"
                    "fmla   v26.4s, v15.4s, v6.s[2]     \n"
                    "fmla   v27.4s, v15.4s, v6.s[3]     \n"
                    "fmla   v28.4s, v15.4s, v7.s[0]     \n"
                    "fmla   v29.4s, v15.4s, v7.s[1]     \n"
                    "fmla   v30.4s, v15.4s, v7.s[2]     \n"
                    "fmla   v31.4s, v15.4s, v7.s[3]     \n"
                    "bne    4b                          \n"

                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    // tail loop: one k-step per iteration (8 values of B, 8 of A)
                    "6:                                 \n"
                    "ld1    {v2.8h}, [%2], #16          \n"
                    "shll   v0.4s, v2.4h, #16           \n"
                    "shll2  v1.4s, v2.8h, #16           \n"
                    "ld1    {v3.8h}, [%1], #16          \n"
                    "shll   v4.4s, v3.4h, #16           \n"
                    "shll2  v5.4s, v3.8h, #16           \n"

                    "fmla   v16.4s, v4.4s, v0.s[0]      \n"
                    "fmla   v17.4s, v4.4s, v0.s[1]      \n"
                    "fmla   v18.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v19.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v20.4s, v4.4s, v1.s[0]      \n"
                    "fmla   v21.4s, v4.4s, v1.s[1]      \n"
                    "fmla   v22.4s, v4.4s, v1.s[2]      \n"
                    "fmla   v23.4s, v4.4s, v1.s[3]      \n"

                    "subs   w4, w4, #1                  \n"

                    "fmla   v24.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v25.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v26.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v27.4s, v5.4s, v0.s[3]      \n"
                    "fmla   v28.4s, v5.4s, v1.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v1.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v1.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v1.s[3]      \n"

                    "bne    6b                          \n"

                    // narrow fp32 -> 16-bit by keeping the upper 16 bits of each lane
                    // (truncating shift, no rounding). This is computed before k_end is
                    // tested, so the result is simply unused on the fp32 path (label 10).
                    //   v0..v3 : A lanes 0-3, two B columns per register
                    //   v4..v7 : A lanes 4-7, two B columns per register
                    "7:                                 \n"
                    "shrn   v0.4h, v16.4s, #16          \n"
                    "shrn2  v0.8h, v17.4s, #16          \n"
                    "shrn   v1.4h, v18.4s, #16          \n"
                    "shrn2  v1.8h, v19.4s, #16          \n"
                    "shrn   v2.4h, v20.4s, #16          \n"
                    "shrn2  v2.8h, v21.4s, #16          \n"
                    "shrn   v3.4h, v22.4s, #16          \n"
                    "shrn2  v3.8h, v23.4s, #16          \n"
                    "shrn   v4.4h, v24.4s, #16          \n"
                    "shrn2  v4.8h, v25.4s, #16          \n"
                    "shrn   v5.4h, v26.4s, #16          \n"
                    "shrn2  v5.8h, v27.4s, #16          \n"
                    "shrn   v6.4h, v28.4s, #16          \n"
                    "shrn2  v6.8h, v29.4s, #16          \n"
                    "shrn   v7.4h, v30.4s, #16          \n"
                    "shrn2  v7.8h, v31.4s, #16          \n"
                    // low byte of k_end == 0 -> keep fp32 partial sums (label 10),
                    // otherwise write the final 16-bit result through outptr0
                    "tst    %w11, #255                  \n"
                    "beq    10f                         \n"

                    // if out_elempack == 4
                    // lanes 0-3 go to outptr0 (64 bytes, post-incremented),
                    // lanes 4-7 go to outptr0 + out_hstep * 4 elements
                    "cmp    %w12, #4                    \n"
                    "bne    8f                          \n"

                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 1          \n"
                    "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
                    "st1    {v4.8h, v5.8h, v6.8h, v7.8h}, [x4] \n"
                    "b      9f                          \n"

                    // if out_elempack == 1
                    // (this path is taken for every out_elempack value other than 4)
                    "8:                                 \n"
                    // transpose8x8
                    // two rounds of uzp1/uzp2 regroup the data so that each of
                    // v0..v7 holds one A lane across the 8 B columns
                    "uzp1   v24.8h, v0.8h, v1.8h        \n"
                    "uzp2   v25.8h, v0.8h, v1.8h        \n"
                    "uzp1   v26.8h, v2.8h, v3.8h        \n"
                    "uzp2   v27.8h, v2.8h, v3.8h        \n"
                    "uzp1   v28.8h, v4.8h, v5.8h        \n"
                    "uzp2   v29.8h, v4.8h, v5.8h        \n"
                    "uzp1   v30.8h, v6.8h, v7.8h        \n"
                    "uzp2   v31.8h, v6.8h, v7.8h        \n"

                    "uzp1   v0.8h, v24.8h, v26.8h       \n"
                    "uzp2   v2.8h, v24.8h, v26.8h       \n"
                    "uzp1   v1.8h, v25.8h, v27.8h       \n"
                    "uzp2   v3.8h, v25.8h, v27.8h       \n"

                    "uzp1   v4.8h, v28.8h, v30.8h       \n"
                    "uzp2   v6.8h, v28.8h, v30.8h       \n"
                    "uzp1   v5.8h, v29.8h, v31.8h       \n"
                    "uzp2   v7.8h, v29.8h, v31.8h       \n"

                    // store 8 rows of 8 values; consecutive rows are out_hstep
                    // 16-bit elements apart, outptr0 itself advances by 8 elements
                    "add    x4, %3, %w13, sxtw 1        \n"
                    "st1    {v0.8h}, [%3], #16          \n"
                    "st1    {v1.8h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v2.8h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v3.8h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v4.8h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v5.8h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v6.8h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v7.8h}, [x4]               \n"

                    // nothing was written to outptr on this path, but it is still
                    // advanced by one tile (64 floats) to stay in step with label 10
                    "9:                                 \n"
                    "add    %0, %0, #256                \n"
                    "b      11f                         \n"

                    // not the final k block: save the fp32 accumulators to outptr
                    "10:                                \n"
                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    "11:                                \n"

                    // the four pointers are outputs tied to same-numbered inputs, so the
                    // post-incremented values are written back to the C++ variables
                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
#else  // NCNN_GNU_INLINE_ASM
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum20;
            float32x4_t _sum21;
            float32x4_t _sum30;
            float32x4_t _sum31;
            float32x4_t _sum40;
            float32x4_t _sum41;
            float32x4_t _sum50;
            float32x4_t _sum51;
            float32x4_t _sum60;
            float32x4_t _sum61;
            float32x4_t _sum70;
            float32x4_t _sum71;

            if (k == 0)
            {
                if (pC)
                {
                    _sum00 = vld1q_f32(pC);
                    _sum01 = vld1q_f32(pC + 4);
                    _sum10 = _sum00;
                    _sum11 = _sum01;
                    _sum20 = _sum00;
                    _sum21 = _sum01;
                    _sum30 = _sum00;
                    _sum31 = _sum01;
                    _sum40 = _sum00;
                    _sum41 = _sum01;
                    _sum50 = _sum00;
                    _sum51 = _sum01;
                    _sum60 = _sum00;
                    _sum61 = _sum01;
                    _sum70 = _sum00;
                    _sum71 = _sum01;
                }
                else
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                    _sum20 = vdupq_n_f32(0.f);
                    _sum21 = vdupq_n_f32(0.f);
                    _sum30 = vdupq_n_f32(0.f);
                    _sum31 = vdupq_n_f32(0.f);
                    _sum40 = vdupq_n_f32(0.f);
                    _sum41 = vdupq_n_f32(0.f);
                    _sum50 = vdupq_n_f32(0.f);
                    _sum51 = vdupq_n_f32(0.f);
                    _sum60 = vdupq_n_f32(0.f);
                    _sum61 = vdupq_n_f32(0.f);
                    _sum70 = vdupq_n_f32(0.f);
                    _sum71 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
                _sum20 = vld1q_f32(outptr + 4 * 4);
                _sum21 = vld1q_f32(outptr + 4 * 5);
                _sum30 = vld1q_f32(outptr + 4 * 6);
                _sum31 = vld1q_f32(outptr + 4 * 7);
                _sum40 = vld1q_f32(outptr + 4 * 8);
                _sum41 = vld1q_f32(outptr + 4 * 9);
                _sum50 = vld1q_f32(outptr + 4 * 10);
                _sum51 = vld1q_f32(outptr + 4 * 11);
                _sum60 = vld1q_f32(outptr + 4 * 12);
                _sum61 = vld1q_f32(outptr + 4 * 13);
                _sum70 = vld1q_f32(outptr + 4 * 14);
                _sum71 = vld1q_f32(outptr + 4 * 15);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA0 = bfloat2float(vld1_u16(pA));
                float32x4_t _pA1 = bfloat2float(vld1_u16(pA + 4));

                float32x4_t _pB0 = bfloat2float(vld1_u16(pB));
                float32x4_t _pB1 = bfloat2float(vld1_u16(pB + 4));

                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
                _sum40 = vfmaq_laneq_f32(_sum40, _pA0, _pB1, 0);
                _sum41 = vfmaq_laneq_f32(_sum41, _pA1, _pB1, 0);
                _sum50 = vfmaq_laneq_f32(_sum50, _pA0, _pB1, 1);
                _sum51 = vfmaq_laneq_f32(_sum51, _pA1, _pB1, 1);
                _sum60 = vfmaq_laneq_f32(_sum60, _pA0, _pB1, 2);
                _sum61 = vfmaq_laneq_f32(_sum61, _pA1, _pB1, 2);
                _sum70 = vfmaq_laneq_f32(_sum70, _pA0, _pB1, 3);
                _sum71 = vfmaq_laneq_f32(_sum71, _pA1, _pB1, 3);

                pA += 8;
                pB += 8;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum10));
                    vst1_u16(outptr0 + 4 * 2, float2bfloat(_sum20));
                    vst1_u16(outptr0 + 4 * 3, float2bfloat(_sum30));
                    vst1_u16(outptr0 + 4 * 4, float2bfloat(_sum40));
                    vst1_u16(outptr0 + 4 * 5, float2bfloat(_sum50));
                    vst1_u16(outptr0 + 4 * 6, float2bfloat(_sum60));
                    vst1_u16(outptr0 + 4 * 7, float2bfloat(_sum70));

                    vst1_u16(outptr0 + out_hstep * 4, float2bfloat(_sum01));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, float2bfloat(_sum11));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 2, float2bfloat(_sum21));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 3, float2bfloat(_sum31));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 4, float2bfloat(_sum41));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 5, float2bfloat(_sum51));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 6, float2bfloat(_sum61));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 7, float2bfloat(_sum71));

                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    uint16x8_t _t0 = vcombine_u16(float2bfloat(_sum00), float2bfloat(_sum01));
                    uint16x8_t _t1 = vcombine_u16(float2bfloat(_sum10), float2bfloat(_sum11));
                    uint16x8_t _t2 = vcombine_u16(float2bfloat(_sum20), float2bfloat(_sum21));
                    uint16x8_t _t3 = vcombine_u16(float2bfloat(_sum30), float2bfloat(_sum31));
                    uint16x8_t _t4 = vcombine_u16(float2bfloat(_sum40), float2bfloat(_sum41));
                    uint16x8_t _t5 = vcombine_u16(float2bfloat(_sum50), float2bfloat(_sum51));
                    uint16x8_t _t6 = vcombine_u16(float2bfloat(_sum60), float2bfloat(_sum61));
                    uint16x8_t _t7 = vcombine_u16(float2bfloat(_sum70), float2bfloat(_sum71));
                    transpose8x8_u16(_t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7);

                    vst1q_u16(outptr0, _t0);
                    vst1q_u16(outptr0 + out_hstep, _t1);
                    vst1q_u16(outptr0 + out_hstep * 2, _t2);
                    vst1q_u16(outptr0 + out_hstep * 3, _t3);
                    vst1q_u16(outptr0 + out_hstep * 4, _t4);
                    vst1q_u16(outptr0 + out_hstep * 5, _t5);
                    vst1q_u16(outptr0 + out_hstep * 6, _t6);
                    vst1q_u16(outptr0 + out_hstep * 7, _t7);

                    outptr0 += 8;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
                vst1q_f32(outptr + 4 * 8, _sum40);
                vst1q_f32(outptr + 4 * 9, _sum41);
                vst1q_f32(outptr + 4 * 10, _sum50);
                vst1q_f32(outptr + 4 * 11, _sum51);
                vst1q_f32(outptr + 4 * 12, _sum60);
                vst1q_f32(outptr + 4 * 13, _sum61);
                vst1q_f32(outptr + 4 * 14, _sum70);
                vst1q_f32(outptr + 4 * 15, _sum71);
            }

            outptr += 64;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 4-wide jj tile: each k step multiplies 8 values read from pA by 4 values read
        // from pB and accumulates the 8x4 products in fp32.  Runs while at least 4 jj
        // entries remain.  Three equivalent implementations follow: an interleaved asm
        // kernel selected for a55-class cores, a generic asm kernel, and an intrinsics
        // fallback used when NCNN_GNU_INLINE_ASM is off.
        for (; jj + 3 < max_jj; jj += 4)
        {
            // A is re-read from the start of the packed tile for every jj step.
            const unsigned short* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            if (use_a53_a55_optimized_kernel && cpu_support_arm_asimdhp())
            {
                // a55
                // Accumulators: v24..v27 hold the first 4 A values against B lanes 0..3,
                // v28..v31 hold the second 4 A values against B lanes 0..3.
                asm volatile(
                    // k == 0 -> initialise accumulators at label 0
                    "cbz    %w10, 0f                    \n"

                    // k != 0: reload the 32 fp32 partial sums from outptr, then restore outptr
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    "subs   %0, %0, #64                 \n"
                    "b      3f                          \n"

                    "0:                                 \n"
                    // if pC
                    "cbz    %8, 1f                      \n"

                    // load pC[0..3] and pC[4..7] as the initial value of both accumulator halves
                    "add    x4, %8, #16                 \n"
                    "ld1    {v24.4s}, [%8]              \n"
                    "ld1    {v28.4s}, [x4]              \n"
                    "b      2f                          \n"

                    // else
                    "1:                                 \n"
                    "eor    v24.16b, v24.16b, v24.16b   \n"
                    "eor    v28.16b, v28.16b, v28.16b   \n"

                    // replicate the initial value into the accumulators of the other 3 B lanes
                    "2:                                 \n"
                    "mov    v25.16b, v24.16b            \n"
                    "mov    v26.16b, v24.16b            \n"
                    "mov    v27.16b, v24.16b            \n"

                    "mov    v29.16b, v28.16b            \n"
                    "mov    v30.16b, v28.16b            \n"
                    "mov    v31.16b, v28.16b            \n"

                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // prime the pipeline: first 24 bytes of A and 16 bytes of B are loaded
                    // ahead of the loop; shll #16 moves each bf16 into the top half of an fp32 lane
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.4h, v5.4h, v6.4h}, [%1], #24 \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.4h, v1.4h}, [%2], #16   \n"

                    "shll   v0.4s, v0.4h, #16           \n"
                    "shll   v4.4s, v4.4h, #16           \n"

                    // main loop: 4 k steps per pass (64 bytes of A, 32 bytes of B).
                    // Loads/widening are interleaved with the fmla stream; the loads into
                    // v4/v5/v6 and v0/v1 near the end of a pass feed the NEXT pass.
                    ".align 4                           \n"
                    "4:                                 \n"
                    "shll   v5.4s, v5.4h, #16           \n"
                    "fmla   v24.4s, v4.4s, v0.s[0]      \n"
                    "ldr    d7, [%1], #8                \n"
                    "fmla   v25.4s, v4.4s, v0.s[1]      \n"
                    "shll   v1.4s, v1.4h, #16           \n"
                    "fmla   v26.4s, v4.4s, v0.s[2]      \n"
                    "ldr    d2, [%2], #8                \n"
                    "fmla   v27.4s, v4.4s, v0.s[3]      \n"
                    "shll   v6.4s, v6.4h, #16           \n"
                    "fmla   v28.4s, v5.4s, v0.s[0]      \n"
                    "ldr    d8, [%1], #8                \n"
                    "fmla   v29.4s, v5.4s, v0.s[1]      \n"
                    "ldr    d9, [%1], #8                \n"
                    "fmla   v30.4s, v5.4s, v0.s[2]      \n"
                    "ldr    d3, [%2], #8                \n"
                    "fmla   v31.4s, v5.4s, v0.s[3]      \n"
                    "shll   v7.4s, v7.4h, #16           \n"
                    "fmla   v24.4s, v6.4s, v1.s[0]      \n"
                    "ldr    d10, [%1], #8               \n"
                    "fmla   v25.4s, v6.4s, v1.s[1]      \n"
                    "shll   v2.4s, v2.4h, #16           \n"
                    "fmla   v26.4s, v6.4s, v1.s[2]      \n"
                    "ldr    d11, [%1], #8               \n"
                    "fmla   v27.4s, v6.4s, v1.s[3]      \n"
                    "shll   v8.4s, v8.4h, #16           \n"
                    "fmla   v28.4s, v7.4s, v1.s[0]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                    "fmla   v29.4s, v7.4s, v1.s[1]      \n"
                    "ldr    d4, [%1], #8                \n"
                    "fmla   v30.4s, v7.4s, v1.s[2]      \n"
                    "prfm   pldl1keep, [%2, #256]       \n" // NOTE PRELOAD
                    "fmla   v31.4s, v7.4s, v1.s[3]      \n"
                    "shll   v9.4s, v9.4h, #16           \n"
                    "fmla   v24.4s, v8.4s, v2.s[0]      \n"
                    "ldr    d0, [%2], #8                \n"
                    "fmla   v25.4s, v8.4s, v2.s[1]      \n"
                    "shll   v3.4s, v3.4h, #16           \n"
                    "fmla   v26.4s, v8.4s, v2.s[2]      \n"
                    "ldr    d5, [%1], #8                \n"
                    "fmla   v27.4s, v8.4s, v2.s[3]      \n"
                    "shll   v10.4s, v10.4h, #16         \n"
                    "fmla   v28.4s, v9.4s, v2.s[0]      \n"
                    "ldr    d1, [%2], #8                \n"
                    "fmla   v29.4s, v9.4s, v2.s[1]      \n"
                    "ldr    d6, [%1], #8                \n"
                    "fmla   v30.4s, v9.4s, v2.s[2]      \n"
                    "fmla   v31.4s, v9.4s, v2.s[3]      \n"
                    "shll   v11.4s, v11.4h, #16         \n"
                    "fmla   v24.4s, v10.4s, v3.s[0]     \n"
                    "fmla   v25.4s, v10.4s, v3.s[1]     \n"
                    "shll   v4.4s, v4.4h, #16           \n"
                    "fmla   v26.4s, v10.4s, v3.s[2]     \n"
                    "fmla   v27.4s, v10.4s, v3.s[3]     \n"
                    "shll   v0.4s, v0.4h, #16           \n"
                    "fmla   v28.4s, v11.4s, v3.s[0]     \n"
                    "fmla   v29.4s, v11.4s, v3.s[1]     \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v30.4s, v11.4s, v3.s[2]     \n"
                    "fmla   v31.4s, v11.4s, v3.s[3]     \n"
                    "bne    4b                          \n"

                    // the last pass pre-loaded 24 bytes of A and 16 bytes of B that were never
                    // consumed; rewind both pointers so the remainder loop re-reads them.
                    // NOTE(review): that final pre-load may touch bytes past the data this tile
                    // consumes - presumably the packed buffers tolerate it; confirm.
                    "sub    %1, %1, #24                 \n"
                    "sub    %2, %2, #16                 \n"

                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    // remainder loop: one k step per pass (4 bf16 of B, 8 bf16 of A)
                    "6:                                 \n"
                    "ld1    {v0.4h}, [%2], #8           \n"
                    "shll   v0.4s, v0.4h, #16           \n"
                    "ld1    {v3.8h}, [%1], #16          \n"
                    "shll   v4.4s, v3.4h, #16           \n"
                    "shll2  v5.4s, v3.8h, #16           \n"
                    "fmla   v24.4s, v4.4s, v0.s[0]      \n"
                    "fmla   v25.4s, v4.4s, v0.s[1]      \n"
                    "fmla   v26.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v27.4s, v4.4s, v0.s[3]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v28.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v0.s[3]      \n"
                    "bne    6b                          \n"

                    // narrow to bf16 by keeping the upper 16 bits of every fp32 lane
                    // (done unconditionally; v0..v3 are simply unused on the fp32 path)
                    "7:                                 \n"
                    "shrn   v0.4h, v24.4s, #16          \n"
                    "shrn2  v0.8h, v25.4s, #16          \n"
                    "shrn   v1.4h, v26.4s, #16          \n"
                    "shrn2  v1.8h, v27.4s, #16          \n"
                    "shrn   v2.4h, v28.4s, #16          \n"
                    "shrn2  v2.8h, v29.4s, #16          \n"
                    "shrn   v3.4h, v30.4s, #16          \n"
                    "shrn2  v3.8h, v31.4s, #16          \n"
                    // low byte of k_end == 0 -> keep fp32 partial sums (label 10)
                    "tst    %w11, #255                  \n"
                    "beq    10f                         \n"

                    // if out_elempack == 4
                    "cmp    %w12, #4                    \n"
                    "bne    8f                          \n"

                    // x4 = outptr0 + out_hstep * 4 elements (2 bytes each)
                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 1          \n"
                    "st1    {v0.8h, v1.8h}, [%3], #32   \n"
                    "st1    {v2.8h, v3.8h}, [x4]        \n"
                    "b      9f                          \n"

                    // if out_elempack == 1
                    "8:                                 \n"
                    // transpose8x4
                    "uzp1   v28.8h, v0.8h, v1.8h        \n"
                    "uzp2   v29.8h, v0.8h, v1.8h        \n"
                    "uzp1   v30.8h, v2.8h, v3.8h        \n"
                    "uzp2   v31.8h, v2.8h, v3.8h        \n"

                    "uzp1   v0.8h, v28.8h, v29.8h       \n"
                    "uzp2   v2.8h, v28.8h, v29.8h       \n"
                    "uzp1   v4.8h, v30.8h, v31.8h       \n"
                    "uzp2   v6.8h, v30.8h, v31.8h       \n"

                    "mov    v1.d[0], v0.d[1]            \n"
                    "mov    v3.d[0], v2.d[1]            \n"
                    "mov    v5.d[0], v4.d[1]            \n"
                    "mov    v7.d[0], v6.d[1]            \n"

                    // eight 4-element stores, each out_hstep elements after the previous one
                    "add    x4, %3, %w13, sxtw 1        \n"
                    "st1    {v0.4h}, [%3], #8           \n"
                    "st1    {v1.4h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v2.4h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v3.4h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v4.4h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v5.4h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v6.4h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v7.4h}, [x4]               \n"

                    // bf16 output written: still step outptr over this tile's 32 fp32 slots
                    "9:                                 \n"
                    "add    %0, %0, #128                \n"
                    "b      11f                         \n"

                    // not the last k block: spill the fp32 accumulators to outptr
                    "10:                                \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    "11:                                \n"

                    // operands %4..%7 are the matching-constraint inputs tied to %0..%3
                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            else
            {
                // Generic kernel: same prologue/epilogue as the a55 variant above, but the
                // main loop loads a whole 4-k-step batch up front instead of interleaving.
                asm volatile(
                    // k == 0 -> initialise accumulators at label 0
                    "cbz    %w10, 0f                    \n"

                    // k != 0: reload the 32 fp32 partial sums from outptr, then restore outptr
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                    "subs   %0, %0, #64                 \n"
                    "b      3f                          \n"

                    "0:                                 \n"
                    // if pC
                    "cbz    %8, 1f                      \n"

                    "add    x4, %8, #16                 \n"
                    "ld1    {v24.4s}, [%8]              \n"
                    "ld1    {v28.4s}, [x4]              \n"
                    "b      2f                          \n"

                    // else
                    "1:                                 \n"
                    "eor    v24.16b, v24.16b, v24.16b   \n"
                    "eor    v28.16b, v28.16b, v28.16b   \n"

                    // replicate the initial value into the accumulators of the other 3 B lanes
                    "2:                                 \n"
                    "mov    v25.16b, v24.16b            \n"
                    "mov    v26.16b, v24.16b            \n"
                    "mov    v27.16b, v24.16b            \n"

                    "mov    v29.16b, v28.16b            \n"
                    "mov    v30.16b, v28.16b            \n"
                    "mov    v31.16b, v28.16b            \n"

                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // main loop: 4 k steps per pass; loads 32 bytes of B (v0..v3) and
                    // 64 bytes of A (v12..v15), widens bf16 -> fp32 with shll/shll2 #16
                    "4:                                 \n"
                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2], #32 \n"
                    "shll   v0.4s, v0.4h, #16           \n"
                    "shll   v1.4s, v1.4h, #16           \n"
                    "shll   v2.4s, v2.4h, #16           \n"
                    "shll   v3.4s, v3.4h, #16           \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%1], #64 \n"
                    "shll   v4.4s, v12.4h, #16          \n"
                    "shll2  v5.4s, v12.8h, #16          \n"
                    "shll   v6.4s, v13.4h, #16          \n"
                    "shll2  v7.4s, v13.8h, #16          \n"
                    "fmla   v24.4s, v4.4s, v0.s[0]      \n"
                    "fmla   v25.4s, v4.4s, v0.s[1]      \n"
                    "fmla   v26.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v27.4s, v4.4s, v0.s[3]      \n"
                    "fmla   v28.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v0.s[3]      \n"
                    "fmla   v24.4s, v6.4s, v1.s[0]      \n"
                    "fmla   v25.4s, v6.4s, v1.s[1]      \n"
                    "fmla   v26.4s, v6.4s, v1.s[2]      \n"
                    "fmla   v27.4s, v6.4s, v1.s[3]      \n"
                    "fmla   v28.4s, v7.4s, v1.s[0]      \n"
                    "fmla   v29.4s, v7.4s, v1.s[1]      \n"
                    "fmla   v30.4s, v7.4s, v1.s[2]      \n"
                    "fmla   v31.4s, v7.4s, v1.s[3]      \n"
                    "shll   v8.4s, v14.4h, #16          \n"
                    "shll2  v9.4s, v14.8h, #16          \n"
                    "shll   v10.4s, v15.4h, #16         \n"
                    "shll2  v11.4s, v15.8h, #16         \n"
                    "fmla   v24.4s, v8.4s, v2.s[0]      \n"
                    "fmla   v25.4s, v8.4s, v2.s[1]      \n"
                    "fmla   v26.4s, v8.4s, v2.s[2]      \n"
                    "fmla   v27.4s, v8.4s, v2.s[3]      \n"
                    "fmla   v28.4s, v9.4s, v2.s[0]      \n"
                    "fmla   v29.4s, v9.4s, v2.s[1]      \n"
                    "fmla   v30.4s, v9.4s, v2.s[2]      \n"
                    "fmla   v31.4s, v9.4s, v2.s[3]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v24.4s, v10.4s, v3.s[0]     \n"
                    "fmla   v25.4s, v10.4s, v3.s[1]     \n"
                    "fmla   v26.4s, v10.4s, v3.s[2]     \n"
                    "fmla   v27.4s, v10.4s, v3.s[3]     \n"
                    "fmla   v28.4s, v11.4s, v3.s[0]     \n"
                    "fmla   v29.4s, v11.4s, v3.s[1]     \n"
                    "fmla   v30.4s, v11.4s, v3.s[2]     \n"
                    "fmla   v31.4s, v11.4s, v3.s[3]     \n"
                    "bne    4b                          \n"

                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    // remainder loop: one k step per pass (4 bf16 of B, 8 bf16 of A)
                    "6:                                 \n"
                    "ld1    {v0.4h}, [%2], #8           \n"
                    "shll   v0.4s, v0.4h, #16           \n"
                    "ld1    {v3.8h}, [%1], #16          \n"
                    "shll   v4.4s, v3.4h, #16           \n"
                    "shll2  v5.4s, v3.8h, #16           \n"
                    "fmla   v24.4s, v4.4s, v0.s[0]      \n"
                    "fmla   v25.4s, v4.4s, v0.s[1]      \n"
                    "fmla   v26.4s, v4.4s, v0.s[2]      \n"
                    "fmla   v27.4s, v4.4s, v0.s[3]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v28.4s, v5.4s, v0.s[0]      \n"
                    "fmla   v29.4s, v5.4s, v0.s[1]      \n"
                    "fmla   v30.4s, v5.4s, v0.s[2]      \n"
                    "fmla   v31.4s, v5.4s, v0.s[3]      \n"
                    "bne    6b                          \n"

                    // narrow to bf16 by keeping the upper 16 bits of every fp32 lane
                    // (done unconditionally; v0..v3 are simply unused on the fp32 path)
                    "7:                                 \n"
                    "shrn   v0.4h, v24.4s, #16          \n"
                    "shrn2  v0.8h, v25.4s, #16          \n"
                    "shrn   v1.4h, v26.4s, #16          \n"
                    "shrn2  v1.8h, v27.4s, #16          \n"
                    "shrn   v2.4h, v28.4s, #16          \n"
                    "shrn2  v2.8h, v29.4s, #16          \n"
                    "shrn   v3.4h, v30.4s, #16          \n"
                    "shrn2  v3.8h, v31.4s, #16          \n"
                    // low byte of k_end == 0 -> keep fp32 partial sums (label 10)
                    "tst    %w11, #255                  \n"
                    "beq    10f                         \n"

                    // if out_elempack == 4
                    "cmp    %w12, #4                    \n"
                    "bne    8f                          \n"

                    // x4 = outptr0 + out_hstep * 4 elements (2 bytes each)
                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 1          \n"
                    "st1    {v0.8h, v1.8h}, [%3], #32   \n"
                    "st1    {v2.8h, v3.8h}, [x4]        \n"
                    "b      9f                          \n"

                    // if out_elempack == 1
                    "8:                                 \n"
                    // transpose8x4
                    "uzp1   v28.8h, v0.8h, v1.8h        \n"
                    "uzp2   v29.8h, v0.8h, v1.8h        \n"
                    "uzp1   v30.8h, v2.8h, v3.8h        \n"
                    "uzp2   v31.8h, v2.8h, v3.8h        \n"

                    "uzp1   v0.8h, v28.8h, v29.8h       \n"
                    "uzp2   v2.8h, v28.8h, v29.8h       \n"
                    "uzp1   v4.8h, v30.8h, v31.8h       \n"
                    "uzp2   v6.8h, v30.8h, v31.8h       \n"

                    "mov    v1.d[0], v0.d[1]            \n"
                    "mov    v3.d[0], v2.d[1]            \n"
                    "mov    v5.d[0], v4.d[1]            \n"
                    "mov    v7.d[0], v6.d[1]            \n"

                    // eight 4-element stores, each out_hstep elements after the previous one
                    "add    x4, %3, %w13, sxtw 1        \n"
                    "st1    {v0.4h}, [%3], #8           \n"
                    "st1    {v1.4h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v2.4h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v3.4h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v4.4h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v5.4h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v6.4h}, [x4]               \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v7.4h}, [x4]               \n"

                    // bf16 output written: still step outptr over this tile's 32 fp32 slots
                    "9:                                 \n"
                    "add    %0, %0, #128                \n"
                    "b      11f                         \n"

                    // not the last k block: spill the fp32 accumulators to outptr
                    "10:                                \n"
                    "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    "11:                                \n"

                    // operands %4..%7 are the matching-constraint inputs tied to %0..%3
                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
#else  // NCNN_GNU_INLINE_ASM
            // Intrinsics fallback.  _sumX0 / _sumX1 accumulate the first / second group of
            // 4 A values against lane X of the B vector.
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum20;
            float32x4_t _sum21;
            float32x4_t _sum30;
            float32x4_t _sum31;

            if (k == 0)
            {
                // first k block: start from pC (same 8 values for every B lane) or from zero
                if (pC)
                {
                    _sum00 = vld1q_f32(pC);
                    _sum01 = vld1q_f32(pC + 4);
                    _sum10 = _sum00;
                    _sum11 = _sum01;
                    _sum20 = _sum00;
                    _sum21 = _sum01;
                    _sum30 = _sum00;
                    _sum31 = _sum01;
                }
                else
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                    _sum20 = vdupq_n_f32(0.f);
                    _sum21 = vdupq_n_f32(0.f);
                    _sum30 = vdupq_n_f32(0.f);
                    _sum31 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                // later k block: resume from the fp32 partial sums previously stored at outptr
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
                _sum20 = vld1q_f32(outptr + 4 * 4);
                _sum21 = vld1q_f32(outptr + 4 * 5);
                _sum30 = vld1q_f32(outptr + 4 * 6);
                _sum31 = vld1q_f32(outptr + 4 * 7);
            }

            // one k step per iteration: 8 bf16 from pA, 4 bf16 from pB, widened to fp32
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA0 = bfloat2float(vld1_u16(pA));
                float32x4_t _pA1 = bfloat2float(vld1_u16(pA + 4));

                float32x4_t _pB0 = bfloat2float(vld1_u16(pB));

                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);

                pA += 8;
                pB += 4;
            }

            if (k_end)
            {
                // last k block: convert to bf16 and write the final output through outptr0
                if (out_elempack == 4)
                {
                    // first A group goes to outptr0, second group out_hstep * 4 elements later
                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum10));
                    vst1_u16(outptr0 + 4 * 2, float2bfloat(_sum20));
                    vst1_u16(outptr0 + 4 * 3, float2bfloat(_sum30));

                    vst1_u16(outptr0 + out_hstep * 4, float2bfloat(_sum01));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, float2bfloat(_sum11));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 2, float2bfloat(_sum21));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 3, float2bfloat(_sum31));

                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    // pack each B lane's 8 results into one vector, transpose, then emit
                    // eight 4-element stores spaced out_hstep elements apart
                    uint16x8_t _t0 = vcombine_u16(float2bfloat(_sum00), float2bfloat(_sum01));
                    uint16x8_t _t1 = vcombine_u16(float2bfloat(_sum10), float2bfloat(_sum11));
                    uint16x8_t _t2 = vcombine_u16(float2bfloat(_sum20), float2bfloat(_sum21));
                    uint16x8_t _t3 = vcombine_u16(float2bfloat(_sum30), float2bfloat(_sum31));
                    transpose8x4_u16(_t0, _t1, _t2, _t3);

                    vst1_u16(outptr0, vget_low_u16(_t0));
                    vst1_u16(outptr0 + out_hstep * 1, vget_high_u16(_t0));
                    vst1_u16(outptr0 + out_hstep * 2, vget_low_u16(_t1));
                    vst1_u16(outptr0 + out_hstep * 3, vget_high_u16(_t1));
                    vst1_u16(outptr0 + out_hstep * 4, vget_low_u16(_t2));
                    vst1_u16(outptr0 + out_hstep * 5, vget_high_u16(_t2));
                    vst1_u16(outptr0 + out_hstep * 6, vget_low_u16(_t3));
                    vst1_u16(outptr0 + out_hstep * 7, vget_high_u16(_t3));

                    outptr0 += 4;
                }
            }
            else
            {
                // not the last k block: keep the fp32 partial sums in outptr for the next pass
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
            }

            // outptr always advances by one 8x4 fp32 tile, whether or not it was written
            outptr += 32;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 8-row x 2-column micro-kernel: handles two output columns per iteration
        // (pA advances 8 bf16 values and pB advances 2 bf16 values per kk step).
        // Accumulation is done in fp32; on the last K tile (k_end) the result is
        // narrowed to bf16 and written to outptr0, otherwise the fp32 partial sums
        // are written to the outptr scratch buffer and reloaded when k != 0.
        for (; jj + 1 < max_jj; jj += 2)
        {
            const unsigned short* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            // Register roles in this asm block:
            //   v28 = rows 0-3 x column 0, v29 = rows 0-3 x column 1
            //   v30 = rows 4-7 x column 0, v31 = rows 4-7 x column 1
            //   x4/w4 = loop counter and temporary store address
            // NOTE(review): the fp32 scratch order used here (v28,v29,v30,v31) differs
            // from the intrinsic fallback below (_sum00,_sum01,_sum10,_sum11); each path
            // reads back what it wrote itself, so they are only self-consistent.
            asm volatile(
                // k != 0: resume from the fp32 partial sums in the scratch buffer
                "cbz    %w10, 0f                    \n"

                "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                "b      3f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                // 8 floats from pC, one per row; shared by both columns
                "add    x4, %8, #16                 \n"
                "ld1    {v28.4s}, [%8]              \n"
                "ld1    {v30.4s}, [x4]              \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"

                "2:                                 \n"
                // replicate the initial value into the second column
                "mov    v29.16b, v28.16b            \n"
                "mov    v31.16b, v30.16b            \n"

                "3:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // main loop, 4 k-steps per iteration
                // shll #16 widens bf16 to fp32 by placing the 16 bits in the high half
                "4:                                 \n"
                "prfm   pldl1keep, [%2, #128]       \n"
                "ld1    {v0.4h, v1.4h}, [%2], #16   \n"
                "shll   v0.4s, v0.4h, #16           \n"
                "shll   v1.4s, v1.4h, #16           \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%1], #64 \n"
                "shll   v4.4s, v12.4h, #16          \n"
                "shll2  v5.4s, v12.8h, #16          \n"
                "shll   v6.4s, v13.4h, #16          \n"
                "shll2  v7.4s, v13.8h, #16          \n"
                "fmla   v28.4s, v4.4s, v0.s[0]      \n"
                "fmla   v29.4s, v4.4s, v0.s[1]      \n"
                "fmla   v30.4s, v5.4s, v0.s[0]      \n"
                "fmla   v31.4s, v5.4s, v0.s[1]      \n"
                "fmla   v28.4s, v6.4s, v0.s[2]      \n"
                "fmla   v29.4s, v6.4s, v0.s[3]      \n"
                "fmla   v30.4s, v7.4s, v0.s[2]      \n"
                "fmla   v31.4s, v7.4s, v0.s[3]      \n"
                "shll   v8.4s, v14.4h, #16          \n"
                "shll2  v9.4s, v14.8h, #16          \n"
                "shll   v10.4s, v15.4h, #16         \n"
                "shll2  v11.4s, v15.8h, #16         \n"
                "fmla   v28.4s, v8.4s, v1.s[0]      \n"
                "fmla   v29.4s, v8.4s, v1.s[1]      \n"
                "fmla   v30.4s, v9.4s, v1.s[0]      \n"
                "fmla   v31.4s, v9.4s, v1.s[1]      \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v28.4s, v10.4s, v1.s[2]     \n"
                "fmla   v29.4s, v10.4s, v1.s[3]     \n"
                "fmla   v30.4s, v11.4s, v1.s[2]     \n"
                "fmla   v31.4s, v11.4s, v1.s[3]     \n"
                "bne    4b                          \n"

                "5:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    7f                          \n"

                // tail loop, 1 k-step per iteration
                // only lanes 0 and 1 of v0 are meaningful after the 32-bit lane load
                "6:                                 \n"
                "ld1    {v0.s}[0], [%2], #4         \n"
                "shll   v0.4s, v0.4h, #16           \n"
                "ld1    {v3.8h}, [%1], #16          \n"
                "shll   v4.4s, v3.4h, #16           \n"
                "shll2  v5.4s, v3.8h, #16           \n"
                "fmla   v28.4s, v4.4s, v0.s[0]      \n"
                "fmla   v29.4s, v4.4s, v0.s[1]      \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v30.4s, v5.4s, v0.s[0]      \n"
                "fmla   v31.4s, v5.4s, v0.s[1]      \n"
                "bne    6b                          \n"

                "7:                                 \n"
                // narrow to bf16 into v0/v1; the fp32 accumulators v28-v31 stay intact
                // so the non-k_end path at label 10 can still store them
                "shrn   v0.4h, v28.4s, #16          \n"
                "shrn2  v0.8h, v29.4s, #16          \n"
                "shrn   v1.4h, v30.4s, #16          \n"
                "shrn2  v1.8h, v31.4s, #16          \n"
                "tst    %w11, #255                  \n"
                "beq    10f                         \n"

                // if out_elempack == 4
                "cmp    %w12, #4                    \n"
                "bne    8f                          \n"

                // x4 = outptr0 + out_hstep * 4 elements (second group of 4 rows)
                "lsl    w4, %w13, #2                \n"
                "add    x4, %3, w4, sxtw 1          \n"
                "st1    {v0.8h}, [%3], #16          \n"
                "st1    {v1.8h}, [x4]               \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"
                // transpose8x2
                "uzp1   v2.8h, v0.8h, v1.8h         \n"
                "uzp2   v3.8h, v0.8h, v1.8h         \n"
                "uzp1   v0.8h, v2.8h, v3.8h         \n"
                "uzp2   v1.8h, v2.8h, v3.8h         \n"

                // after the transpose each 32-bit lane holds the two columns of one row:
                // v0.s[0]=row0 v0.s[2]=row1 v1.s[0]=row2 v1.s[2]=row3
                // v0.s[1]=row4 v0.s[3]=row5 v1.s[1]=row6 v1.s[3]=row7
                // x4 steps by out_hstep elements (2 bytes each) per row
                "add    x4, %3, %w13, sxtw 1        \n"
                "st1    {v0.s}[0], [%3], #4         \n"
                "st1    {v0.s}[2], [x4]             \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v1.s}[0], [x4]             \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v1.s}[2], [x4]             \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v0.s}[1], [x4]             \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v0.s}[3], [x4]             \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v1.s}[1], [x4]             \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v1.s}[3], [x4]             \n"

                "9:                                 \n"
                // keep outptr advancing by 16 floats even when nothing is stored to it
                "add    %0, %0, #64                 \n"
                "b      11f                         \n"

                "10:                                \n"
                // not the last K tile: save fp32 partial sums
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                "11:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
            // Intrinsic fallback. _sumXY: X = column index (0/1), Y = row group
            // (0 = rows 0-3, 1 = rows 4-7).
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;

            if (k == 0)
            {
                if (pC)
                {
                    // pC supplies one value per row, shared by both columns
                    _sum00 = vld1q_f32(pC);
                    _sum01 = vld1q_f32(pC + 4);
                    _sum10 = _sum00;
                    _sum11 = _sum01;
                }
                else
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                // resume from the fp32 partial sums of the previous K tile
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA0 = bfloat2float(vld1_u16(pA));
                float32x4_t _pA1 = bfloat2float(vld1_u16(pA + 4));

                // NOTE(review): vld1_u16 reads 4 bf16 values although only the first
                // two lanes are used and pB advances by 2 — confirm the packed B buffer
                // is padded so the final iteration does not read past its end.
                float32x2_t _pB0 = vget_low_f32(bfloat2float(vld1_u16(pB)));

                _sum00 = vfmaq_lane_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_lane_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_lane_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_lane_f32(_sum11, _pA1, _pB0, 1);

                pA += 8;
                pB += 2;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    // rows 0-3: two packed columns side by side
                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum10));

                    // rows 4-7 live out_hstep * 4 elements further
                    vst1_u16(outptr0 + out_hstep * 4, float2bfloat(_sum01));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, float2bfloat(_sum11));
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    // spill each column to a temporary and scatter one element per row
                    unsigned short sum0[8];
                    unsigned short sum1[8];
                    vst1_u16(sum0, float2bfloat(_sum00));
                    vst1_u16(sum0 + 4, float2bfloat(_sum01));
                    vst1_u16(sum1, float2bfloat(_sum10));
                    vst1_u16(sum1 + 4, float2bfloat(_sum11));

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[out_hstep * 4] = sum0[4];
                    outptr0[out_hstep * 5] = sum0[5];
                    outptr0[out_hstep * 6] = sum0[6];
                    outptr0[out_hstep * 7] = sum0[7];

                    outptr0[1] = sum1[0];
                    outptr0[out_hstep + 1] = sum1[1];
                    outptr0[out_hstep * 2 + 1] = sum1[2];
                    outptr0[out_hstep * 3 + 1] = sum1[3];
                    outptr0[out_hstep * 4 + 1] = sum1[4];
                    outptr0[out_hstep * 5 + 1] = sum1[5];
                    outptr0[out_hstep * 6 + 1] = sum1[6];
                    outptr0[out_hstep * 7 + 1] = sum1[7];
                    outptr0 += 2;
                }
            }
            else
            {
                // not the last K tile: save fp32 partial sums
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
            }

            outptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 8-row x 1-column micro-kernel: handles the last remaining output column
        // (pA advances 8 bf16 values and pB advances 1 bf16 value per kk step).
        // Accumulation is done in fp32; on the last K tile (k_end) the result is
        // narrowed to bf16 and written to outptr0, otherwise the fp32 partial sums
        // are written to the outptr scratch buffer and reloaded when k != 0.
        for (; jj < max_jj; jj += 1)
        {
            const unsigned short* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            // Register roles in this asm block:
            //   v30 = rows 0-3, v31 = rows 4-7 (fp32 accumulators)
            //   v28/v29 = secondary accumulators used only inside the 4x-unrolled loop
            //   x4/w4 = loop counter and temporary store address
            asm volatile(
                // k != 0: resume from the fp32 partial sums in the scratch buffer
                "cbz    %w10, 0f                    \n"

                "ld1    {v30.4s, v31.4s}, [%0]      \n"
                "b      2f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                "ld1    {v30.4s, v31.4s}, [%8]      \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"
                "eor    v31.16b, v31.16b, v31.16b   \n"

                "2:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    4f                          \n"

                // main loop, 4 k-steps per iteration, split over two accumulator pairs
                // shll #16 widens bf16 to fp32 by placing the 16 bits in the high half
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v29.16b, v29.16b, v29.16b   \n"
                "3:                                 \n"
                "prfm   pldl1keep, [%2, #64]        \n"
                "ld1    {v0.4h}, [%2], #8           \n"
                "shll   v0.4s, v0.4h, #16           \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%1], #64 \n"
                "shll   v4.4s, v12.4h, #16          \n"
                "shll2  v5.4s, v12.8h, #16          \n"
                "shll   v6.4s, v13.4h, #16          \n"
                "shll2  v7.4s, v13.8h, #16          \n"
                "fmla   v28.4s, v4.4s, v0.s[0]      \n"
                "fmla   v29.4s, v5.4s, v0.s[0]      \n"
                "fmla   v30.4s, v6.4s, v0.s[1]      \n"
                "fmla   v31.4s, v7.4s, v0.s[1]      \n"
                "shll   v8.4s, v14.4h, #16          \n"
                "shll2  v9.4s, v14.8h, #16          \n"
                "shll   v10.4s, v15.4h, #16         \n"
                "shll2  v11.4s, v15.8h, #16         \n"
                "fmla   v28.4s, v8.4s, v0.s[2]      \n"
                "fmla   v29.4s, v9.4s, v0.s[2]      \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v30.4s, v10.4s, v0.s[3]     \n"
                "fmla   v31.4s, v11.4s, v0.s[3]     \n"
                "bne    3b                          \n"
                // fold the secondary accumulators back into v30/v31
                "fadd   v30.4s, v30.4s, v28.4s      \n"
                "fadd   v31.4s, v31.4s, v29.4s      \n"

                "4:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    6f                          \n"

                // tail loop, 1 k-step per iteration
                "5:                                 \n"
                "ld1r   {v0.4h}, [%2], #2           \n"
                "shll   v0.4s, v0.4h, #16           \n"
                "ld1    {v3.8h}, [%1], #16          \n"
                "shll   v4.4s, v3.4h, #16           \n"
                "shll2  v5.4s, v3.8h, #16           \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v30.4s, v4.4s, v0.4s        \n"
                "fmla   v31.4s, v5.4s, v0.4s        \n"
                "bne    5b                          \n"

                "6:                                 \n"
                // Decide on k_end BEFORE narrowing: the bf16 conversion below is done
                // in place on v30/v31, so it must not run when the fp32 partial sums
                // still have to be saved at label 9.
                "tst    %w11, #255                  \n"
                "beq    9f                          \n"

                // last K tile: narrow fp32 -> bf16 in place
                "shrn   v30.4h, v30.4s, #16         \n"
                "shrn   v31.4h, v31.4s, #16         \n"

                // if out_elempack == 4
                "cmp    %w12, #4                    \n"
                "bne    7f                          \n"

                // x4 = outptr0 + out_hstep * 4 elements (second group of 4 rows)
                "lsl    w4, %w13, #2                \n"
                "add    x4, %3, w4, sxtw 1          \n"
                "st1    {v30.4h}, [%3], #8          \n"
                "st1    {v31.4h}, [x4]              \n"
                "b      8f                          \n"

                // if out_elempack == 1
                "7:                                 \n"
                // one bf16 element per row; x4 steps by out_hstep elements (2 bytes each)
                "add    x4, %3, %w13, sxtw 1        \n"
                "st1    {v30.h}[0], [%3], #2        \n"
                "st1    {v30.h}[1], [x4]            \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v30.h}[2], [x4]            \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v30.h}[3], [x4]            \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v31.h}[0], [x4]            \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v31.h}[1], [x4]            \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v31.h}[2], [x4]            \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v31.h}[3], [x4]            \n"

                "8:                                 \n"
                // keep outptr advancing by 8 floats even when nothing is stored to it
                "add    %0, %0, #32                 \n"
                "b      10f                         \n"

                "9:                                 \n"
                // not the last K tile: save the untouched fp32 partial sums
                "st1    {v30.4s, v31.4s}, [%0], #32 \n"

                "10:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
            // Intrinsic fallback: _sum00 = rows 0-3, _sum01 = rows 4-7.
            float32x4_t _sum00;
            float32x4_t _sum01;

            if (k == 0)
            {
                if (pC)
                {
                    _sum00 = vld1q_f32(pC);
                    _sum01 = vld1q_f32(pC + 4);
                }
                else
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                // resume from the fp32 partial sums of the previous K tile
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA0 = bfloat2float(vld1_u16(pA));
                float32x4_t _pA1 = bfloat2float(vld1_u16(pA + 4));

                // broadcast the single B value across all four lanes
                float32x4_t _pB = bfloat2float(vld1_dup_u16(pB));

                _sum00 = vfmaq_f32(_sum00, _pA0, _pB);
                _sum01 = vfmaq_f32(_sum01, _pA1, _pB);

                pA += 8;
                pB += 1;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    // rows 0-3 at outptr0, rows 4-7 out_hstep * 4 elements further
                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + out_hstep * 4, float2bfloat(_sum01));
                    outptr0 += 4;
                }
                if (out_elempack == 1)
                {
                    // spill to a temporary and scatter one element per row
                    unsigned short sum0[8];
                    vst1_u16(sum0, float2bfloat(_sum00));
                    vst1_u16(sum0 + 4, float2bfloat(_sum01));

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep * 1] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[out_hstep * 4] = sum0[4];
                    outptr0[out_hstep * 5] = sum0[5];
                    outptr0[out_hstep * 6] = sum0[6];
                    outptr0[out_hstep * 7] = sum0[7];
                    outptr0++;
                }
            }
            else
            {
                // not the last K tile: save fp32 partial sums
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
            }

            outptr += 8;
#endif // NCNN_GNU_INLINE_ASM
        }

        pAT += max_kk * 8;
    }
#endif // __aarch64__
    for (; ii + 3 < max_ii; ii += 4)
    {
        unsigned short* outptr0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        const unsigned short* pB = pBT;

        if (pC)
        {
            pC = (const float*)CT_tile + i + ii;
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 11 < max_jj; jj += 12)
        {
            const unsigned short* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            asm volatile(
                "cbz    %w10, 0f                    \n"

                "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                "subs   %0, %0, #128                \n"
                "b      3f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                "ld1    {v20.4s}, [%8]              \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                "eor    v20.16b, v20.16b, v20.16b   \n"

                "2:                                 \n"
                "mov    v21.16b, v20.16b            \n"
                "mov    v22.16b, v20.16b            \n"
                "mov    v23.16b, v20.16b            \n"
                "mov    v24.16b, v20.16b            \n"
                "mov    v25.16b, v20.16b            \n"
                "mov    v26.16b, v20.16b            \n"
                "mov    v27.16b, v20.16b            \n"
                "mov    v28.16b, v20.16b            \n"
                "mov    v29.16b, v20.16b            \n"
                "mov    v30.16b, v20.16b            \n"
                "mov    v31.16b, v20.16b            \n"

                "3:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                "4:                                 \n"
                "prfm   pldl1keep, [%2, #512]       \n"
                "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%2], #64 \n"
                "shll   v0.4s, v4.4h, #16           \n"
                "shll2  v1.4s, v4.8h, #16           \n"
                "shll   v2.4s, v5.4h, #16           \n"
                "shll2  v3.4s, v5.8h, #16           \n"
                "prfm   pldl1keep, [%1, #256]       \n"
                "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%1], #32 \n"
                "shll   v16.4s, v16.4h, #16         \n"
                "shll   v17.4s, v17.4h, #16         \n"
                "shll   v18.4s, v18.4h, #16         \n"
                "shll   v19.4s, v19.4h, #16         \n"
                "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                "fmla   v21.4s, v16.4s, v0.s[1]     \n"
                "fmla   v22.4s, v16.4s, v0.s[2]     \n"
                "fmla   v23.4s, v16.4s, v0.s[3]     \n"
                "fmla   v24.4s, v16.4s, v1.s[0]     \n"
                "fmla   v25.4s, v16.4s, v1.s[1]     \n"
                "fmla   v26.4s, v16.4s, v1.s[2]     \n"
                "fmla   v27.4s, v16.4s, v1.s[3]     \n"
                "fmla   v28.4s, v16.4s, v2.s[0]     \n"
                "fmla   v29.4s, v16.4s, v2.s[1]     \n"
                "fmla   v30.4s, v16.4s, v2.s[2]     \n"
                "fmla   v31.4s, v16.4s, v2.s[3]     \n"
                "shll   v4.4s, v6.4h, #16           \n"
                "shll2  v5.4s, v6.8h, #16           \n"
                "shll   v6.4s, v7.4h, #16           \n"
                "shll2  v7.4s, v7.8h, #16           \n"
                "fmla   v20.4s, v17.4s, v3.s[0]     \n"
                "fmla   v21.4s, v17.4s, v3.s[1]     \n"
                "fmla   v22.4s, v17.4s, v3.s[2]     \n"
                "fmla   v23.4s, v17.4s, v3.s[3]     \n"
                "fmla   v24.4s, v17.4s, v4.s[0]     \n"
                "fmla   v25.4s, v17.4s, v4.s[1]     \n"
                "fmla   v26.4s, v17.4s, v4.s[2]     \n"
                "fmla   v27.4s, v17.4s, v4.s[3]     \n"
                "fmla   v28.4s, v17.4s, v5.s[0]     \n"
                "fmla   v29.4s, v17.4s, v5.s[1]     \n"
                "fmla   v30.4s, v17.4s, v5.s[2]     \n"
                "fmla   v31.4s, v17.4s, v5.s[3]     \n"
                "prfm   pldl1keep, [%2, #256]       \n"
                "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2], #32 \n"
                "shll   v0.4s, v0.4h, #16           \n"
                "shll   v1.4s, v1.4h, #16           \n"
                "shll   v2.4s, v2.4h, #16           \n"
                "shll   v3.4s, v3.4h, #16           \n"
                "fmla   v20.4s, v18.4s, v6.s[0]     \n"
                "fmla   v21.4s, v18.4s, v6.s[1]     \n"
                "fmla   v22.4s, v18.4s, v6.s[2]     \n"
                "fmla   v23.4s, v18.4s, v6.s[3]     \n"
                "fmla   v24.4s, v18.4s, v7.s[0]     \n"
                "fmla   v25.4s, v18.4s, v7.s[1]     \n"
                "fmla   v26.4s, v18.4s, v7.s[2]     \n"
                "fmla   v27.4s, v18.4s, v7.s[3]     \n"
                "fmla   v28.4s, v18.4s, v0.s[0]     \n"
                "fmla   v29.4s, v18.4s, v0.s[1]     \n"
                "fmla   v30.4s, v18.4s, v0.s[2]     \n"
                "fmla   v31.4s, v18.4s, v0.s[3]     \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v20.4s, v19.4s, v1.s[0]     \n"
                "fmla   v21.4s, v19.4s, v1.s[1]     \n"
                "fmla   v22.4s, v19.4s, v1.s[2]     \n"
                "fmla   v23.4s, v19.4s, v1.s[3]     \n"
                "fmla   v24.4s, v19.4s, v2.s[0]     \n"
                "fmla   v25.4s, v19.4s, v2.s[1]     \n"
                "fmla   v26.4s, v19.4s, v2.s[2]     \n"
                "fmla   v27.4s, v19.4s, v2.s[3]     \n"
                "fmla   v28.4s, v19.4s, v3.s[0]     \n"
                "fmla   v29.4s, v19.4s, v3.s[1]     \n"
                "fmla   v30.4s, v19.4s, v3.s[2]     \n"
                "fmla   v31.4s, v19.4s, v3.s[3]     \n"
                "bne    4b                          \n"

                "5:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    7f                          \n"

                "6:                                 \n"
                "ld1    {v0.4h, v1.4h, v2.4h}, [%2], #24 \n"
                "shll   v0.4s, v0.4h, #16           \n"
                "shll   v1.4s, v1.4h, #16           \n"
                "shll   v2.4s, v2.4h, #16           \n"
                "ld1    {v16.4h}, [%1], #8          \n"
                "shll   v16.4s, v16.4h, #16         \n"
                "fmla   v20.4s, v16.4s, v0.s[0]     \n"
                "fmla   v21.4s, v16.4s, v0.s[1]     \n"
                "fmla   v22.4s, v16.4s, v0.s[2]     \n"
                "fmla   v23.4s, v16.4s, v0.s[3]     \n"
                "fmla   v24.4s, v16.4s, v1.s[0]     \n"
                "fmla   v25.4s, v16.4s, v1.s[1]     \n"
                "fmla   v26.4s, v16.4s, v1.s[2]     \n"
                "fmla   v27.4s, v16.4s, v1.s[3]     \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v28.4s, v16.4s, v2.s[0]     \n"
                "fmla   v29.4s, v16.4s, v2.s[1]     \n"
                "fmla   v30.4s, v16.4s, v2.s[2]     \n"
                "fmla   v31.4s, v16.4s, v2.s[3]     \n"
                "bne    6b                          \n"

                "7:                                 \n"
                "shrn   v0.4h, v20.4s, #16          \n"
                "shrn2  v0.8h, v21.4s, #16          \n"
                "shrn   v1.4h, v22.4s, #16          \n"
                "shrn2  v1.8h, v23.4s, #16          \n"
                "shrn   v2.4h, v24.4s, #16          \n"
                "shrn2  v2.8h, v25.4s, #16          \n"
                "shrn   v3.4h, v26.4s, #16          \n"
                "shrn2  v3.8h, v27.4s, #16          \n"
                "shrn   v4.4h, v28.4s, #16          \n"
                "shrn2  v4.8h, v29.4s, #16          \n"
                "shrn   v5.4h, v30.4s, #16          \n"
                "shrn2  v5.8h, v31.4s, #16          \n"
                "tst    %w11, #255                  \n"
                "beq    10f                         \n"

                // if out_elempack == 4
                "cmp    %w12, #4                    \n"
                "bne    8f                          \n"

                "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
                "st1    {v4.8h, v5.8h}, [%3], #32   \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"
                // transpose4x12
                "uzp1   v20.8h, v0.8h, v1.8h        \n"
                "uzp2   v21.8h, v0.8h, v1.8h        \n"
                "uzp1   v22.8h, v2.8h, v3.8h        \n"
                "uzp2   v23.8h, v2.8h, v3.8h        \n"
                "uzp1   v24.8h, v4.8h, v5.8h        \n"
                "uzp2   v25.8h, v4.8h, v5.8h        \n"

                "uzp1   v0.8h, v20.8h, v21.8h       \n"
                "uzp2   v6.8h, v20.8h, v21.8h       \n"
                "uzp1   v1.8h, v22.8h, v23.8h       \n"
                "uzp2   v7.8h, v22.8h, v23.8h       \n"
                "uzp1   v2.8h, v24.8h, v25.8h       \n"
                "uzp2   v8.8h, v24.8h, v25.8h       \n"

                "mov    v3.d[0], v0.d[1]            \n"
                "mov    v4.d[0], v1.d[1]            \n"
                "mov    v5.d[0], v2.d[1]            \n"
                "mov    v9.d[0], v6.d[1]            \n"
                "mov    v10.d[0], v7.d[1]           \n"
                "mov    v11.d[0], v8.d[1]           \n"

                "add    x4, %3, %w13, sxtw 1        \n"
                "st1    {v0.4h, v1.4h, v2.4h}, [%3], #24 \n"
                "st1    {v3.4h, v4.4h, v5.4h}, [x4] \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v6.4h, v7.4h, v8.4h}, [x4] \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v9.4h, v10.4h, v11.4h}, [x4] \n"

                "9:                                 \n"
                "add    %0, %0, #192                \n"
                "b      11f                         \n"

                "10:                                \n"
                "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                "11:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;
            float32x4_t _sum3;
            float32x4_t _sum4;
            float32x4_t _sum5;
            float32x4_t _sum6;
            float32x4_t _sum7;
            float32x4_t _sum8;
            float32x4_t _sum9;
            float32x4_t _suma;
            float32x4_t _sumb;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vld1q_f32(pC);
                    _sum1 = _sum0;
                    _sum2 = _sum0;
                    _sum3 = _sum0;
                    _sum4 = _sum0;
                    _sum5 = _sum0;
                    _sum6 = _sum0;
                    _sum7 = _sum0;
                    _sum8 = _sum0;
                    _sum9 = _sum0;
                    _suma = _sum0;
                    _sumb = _sum0;
                }
                else
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                    _sum2 = vdupq_n_f32(0.f);
                    _sum3 = vdupq_n_f32(0.f);
                    _sum4 = vdupq_n_f32(0.f);
                    _sum5 = vdupq_n_f32(0.f);
                    _sum6 = vdupq_n_f32(0.f);
                    _sum7 = vdupq_n_f32(0.f);
                    _sum8 = vdupq_n_f32(0.f);
                    _sum9 = vdupq_n_f32(0.f);
                    _suma = vdupq_n_f32(0.f);
                    _sumb = vdupq_n_f32(0.f);
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4 * 1);
                _sum2 = vld1q_f32(outptr + 4 * 2);
                _sum3 = vld1q_f32(outptr + 4 * 3);
                _sum4 = vld1q_f32(outptr + 4 * 4);
                _sum5 = vld1q_f32(outptr + 4 * 5);
                _sum6 = vld1q_f32(outptr + 4 * 6);
                _sum7 = vld1q_f32(outptr + 4 * 7);
                _sum8 = vld1q_f32(outptr + 4 * 8);
                _sum9 = vld1q_f32(outptr + 4 * 9);
                _suma = vld1q_f32(outptr + 4 * 10);
                _sumb = vld1q_f32(outptr + 4 * 11);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA = bfloat2float(vld1_u16(pA));
                float32x4_t _pB0 = bfloat2float(vld1_u16(pB));
                float32x4_t _pB1 = bfloat2float(vld1_u16(pB + 4));
                float32x4_t _pB2 = bfloat2float(vld1_u16(pB + 8));

                _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB0, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB0, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB0, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB0, 3);
                _sum4 = vfmaq_laneq_f32(_sum4, _pA, _pB1, 0);
                _sum5 = vfmaq_laneq_f32(_sum5, _pA, _pB1, 1);
                _sum6 = vfmaq_laneq_f32(_sum6, _pA, _pB1, 2);
                _sum7 = vfmaq_laneq_f32(_sum7, _pA, _pB1, 3);
                _sum8 = vfmaq_laneq_f32(_sum8, _pA, _pB2, 0);
                _sum9 = vfmaq_laneq_f32(_sum9, _pA, _pB2, 1);
                _suma = vfmaq_laneq_f32(_suma, _pA, _pB2, 2);
                _sumb = vfmaq_laneq_f32(_sumb, _pA, _pB2, 3);

                pA += 4;
                pB += 12;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum1));
                    vst1_u16(outptr0 + 4 * 2, float2bfloat(_sum2));
                    vst1_u16(outptr0 + 4 * 3, float2bfloat(_sum3));
                    vst1_u16(outptr0 + 4 * 4, float2bfloat(_sum4));
                    vst1_u16(outptr0 + 4 * 5, float2bfloat(_sum5));
                    vst1_u16(outptr0 + 4 * 6, float2bfloat(_sum6));
                    vst1_u16(outptr0 + 4 * 7, float2bfloat(_sum7));
                    vst1_u16(outptr0 + 4 * 8, float2bfloat(_sum8));
                    vst1_u16(outptr0 + 4 * 9, float2bfloat(_sum9));
                    vst1_u16(outptr0 + 4 * 10, float2bfloat(_suma));
                    vst1_u16(outptr0 + 4 * 11, float2bfloat(_sumb));
                    outptr0 += 48;
                }
                if (out_elempack == 1)
                {
                    uint16x4_t _t0 = float2bfloat(_sum0);
                    uint16x4_t _t1 = float2bfloat(_sum1);
                    uint16x4_t _t2 = float2bfloat(_sum2);
                    uint16x4_t _t3 = float2bfloat(_sum3);
                    uint16x4_t _t4 = float2bfloat(_sum4);
                    uint16x4_t _t5 = float2bfloat(_sum5);
                    uint16x4_t _t6 = float2bfloat(_sum6);
                    uint16x4_t _t7 = float2bfloat(_sum7);
                    uint16x4_t _t8 = float2bfloat(_sum8);
                    uint16x4_t _t9 = float2bfloat(_sum9);
                    uint16x4_t _ta = float2bfloat(_suma);
                    uint16x4_t _tb = float2bfloat(_sumb);
                    transpose4x12_u16(_t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7, _t8, _t9, _ta, _tb);

                    vst1_u16(outptr0, _t0);
                    vst1_u16(outptr0 + 4, _t1);
                    vst1_u16(outptr0 + 8, _t2);
                    vst1_u16(outptr0 + out_hstep, _t3);
                    vst1_u16(outptr0 + out_hstep + 4, _t4);
                    vst1_u16(outptr0 + out_hstep + 8, _t5);
                    vst1_u16(outptr0 + out_hstep * 2, _t6);
                    vst1_u16(outptr0 + out_hstep * 2 + 4, _t7);
                    vst1_u16(outptr0 + out_hstep * 2 + 8, _t8);
                    vst1_u16(outptr0 + out_hstep * 3, _t9);
                    vst1_u16(outptr0 + out_hstep * 3 + 4, _ta);
                    vst1_u16(outptr0 + out_hstep * 3 + 8, _tb);
                    outptr0 += 12;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
                vst1q_f32(outptr + 4 * 4, _sum4);
                vst1q_f32(outptr + 4 * 5, _sum5);
                vst1q_f32(outptr + 4 * 6, _sum6);
                vst1q_f32(outptr + 4 * 7, _sum7);
                vst1q_f32(outptr + 4 * 8, _sum8);
                vst1q_f32(outptr + 4 * 9, _sum9);
                vst1q_f32(outptr + 4 * 10, _suma);
                vst1q_f32(outptr + 4 * 11, _sumb);
            }

            outptr += 48;
#endif // NCNN_GNU_INLINE_ASM
        }
#endif // __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            const unsigned short* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            // AArch64 kernel for one 4x8 tile. v24-v31 hold eight 4-lane fp32
            // accumulators, one per pB lane.
            // Operands: %0 outptr, %1 pA, %2 pB, %3 outptr0, %8 pC, %9 max_kk,
            //           %10 k, %11 k_end, %12 out_elempack, %13 out_hstep.
            asm volatile(
                // k == 0 -> initialise accumulators at label 0
                "cbz    %w10, 0f                    \n"

                // k != 0: reload the 32 fp32 partial sums from outptr, then rewind
                // outptr by the 64 bytes consumed by the first post-incrementing load
                "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                "subs   %0, %0, #64                 \n"
                "b      3f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                // load 4 floats from pC; they are copied to every accumulator at label 2
                "ld1    {v24.4s}, [%8]              \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                // no pC: start from zero
                "eor    v24.16b, v24.16b, v24.16b   \n"

                "2:                                 \n"
                // replicate the initial value into the remaining seven accumulators
                "mov    v25.16b, v24.16b            \n"
                "mov    v26.16b, v24.16b            \n"
                "mov    v27.16b, v24.16b            \n"
                "mov    v28.16b, v24.16b            \n"
                "mov    v29.16b, v24.16b            \n"
                "mov    v30.16b, v24.16b            \n"
                "mov    v31.16b, v24.16b            \n"

                "3:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // main loop: 4 k-steps per iteration.
                // Reads 32 bf16 from pB (8 per k-step) and 16 bf16 from pA (4 per k-step);
                // shll/shll2 #16 widen each 16-bit value into the upper half of a 32-bit lane.
                "4:                                 \n"
                "prfm   pldl1keep, [%2, #512]       \n"
                "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%2], #64 \n"
                "shll   v0.4s, v4.4h, #16           \n"
                "shll2  v1.4s, v4.8h, #16           \n"
                "shll   v2.4s, v5.4h, #16           \n"
                "shll2  v3.4s, v5.8h, #16           \n"
                "prfm   pldl1keep, [%1, #256]       \n"
                "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%1], #32 \n"
                "shll   v16.4s, v16.4h, #16         \n"
                "shll   v17.4s, v17.4h, #16         \n"
                "shll   v18.4s, v18.4h, #16         \n"
                "shll   v19.4s, v19.4h, #16         \n"
                // k-step 0: pA in v16, pB lanes in v0/v1
                "fmla   v24.4s, v16.4s, v0.s[0]     \n"
                "fmla   v25.4s, v16.4s, v0.s[1]     \n"
                "fmla   v26.4s, v16.4s, v0.s[2]     \n"
                "fmla   v27.4s, v16.4s, v0.s[3]     \n"
                "fmla   v28.4s, v16.4s, v1.s[0]     \n"
                "fmla   v29.4s, v16.4s, v1.s[1]     \n"
                "fmla   v30.4s, v16.4s, v1.s[2]     \n"
                "fmla   v31.4s, v16.4s, v1.s[3]     \n"
                // k-step 1: pA in v17, pB lanes in v2/v3
                "fmla   v24.4s, v17.4s, v2.s[0]     \n"
                "fmla   v25.4s, v17.4s, v2.s[1]     \n"
                "fmla   v26.4s, v17.4s, v2.s[2]     \n"
                "fmla   v27.4s, v17.4s, v2.s[3]     \n"
                "fmla   v28.4s, v17.4s, v3.s[0]     \n"
                "fmla   v29.4s, v17.4s, v3.s[1]     \n"
                "fmla   v30.4s, v17.4s, v3.s[2]     \n"
                "fmla   v31.4s, v17.4s, v3.s[3]     \n"
                // widen the second half of the pB load (v6, v7) in place into v4-v7;
                // the raw v4/v5 halves were already consumed above
                "shll   v4.4s, v6.4h, #16           \n"
                "shll2  v5.4s, v6.8h, #16           \n"
                "shll   v6.4s, v7.4h, #16           \n"
                "shll2  v7.4s, v7.8h, #16           \n"
                // k-step 2: pA in v18, pB lanes in v4/v5
                "fmla   v24.4s, v18.4s, v4.s[0]     \n"
                "fmla   v25.4s, v18.4s, v4.s[1]     \n"
                "fmla   v26.4s, v18.4s, v4.s[2]     \n"
                "fmla   v27.4s, v18.4s, v4.s[3]     \n"
                "fmla   v28.4s, v18.4s, v5.s[0]     \n"
                "fmla   v29.4s, v18.4s, v5.s[1]     \n"
                "fmla   v30.4s, v18.4s, v5.s[2]     \n"
                "fmla   v31.4s, v18.4s, v5.s[3]     \n"
                // loop counter update; the fmla instructions below do not touch the flags
                "subs   w4, w4, #1                  \n"
                // k-step 3: pA in v19, pB lanes in v6/v7
                "fmla   v24.4s, v19.4s, v6.s[0]     \n"
                "fmla   v25.4s, v19.4s, v6.s[1]     \n"
                "fmla   v26.4s, v19.4s, v6.s[2]     \n"
                "fmla   v27.4s, v19.4s, v6.s[3]     \n"
                "fmla   v28.4s, v19.4s, v7.s[0]     \n"
                "fmla   v29.4s, v19.4s, v7.s[1]     \n"
                "fmla   v30.4s, v19.4s, v7.s[2]     \n"
                "fmla   v31.4s, v19.4s, v7.s[3]     \n"
                "bne    4b                          \n"

                "5:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    7f                          \n"

                // tail loop: one k-step per iteration (8 bf16 from pB, 4 bf16 from pA)
                "6:                                 \n"
                "ld1    {v0.4h, v1.4h}, [%2], #16   \n"
                "shll   v0.4s, v0.4h, #16           \n"
                "shll   v1.4s, v1.4h, #16           \n"
                "ld1    {v16.4h}, [%1], #8          \n"
                "shll   v16.4s, v16.4h, #16         \n"
                "fmla   v24.4s, v16.4s, v0.s[0]     \n"
                "fmla   v25.4s, v16.4s, v0.s[1]     \n"
                "fmla   v26.4s, v16.4s, v0.s[2]     \n"
                "fmla   v27.4s, v16.4s, v0.s[3]     \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v28.4s, v16.4s, v1.s[0]     \n"
                "fmla   v29.4s, v16.4s, v1.s[1]     \n"
                "fmla   v30.4s, v16.4s, v1.s[2]     \n"
                "fmla   v31.4s, v16.4s, v1.s[3]     \n"
                "bne    6b                          \n"

                "7:                                 \n"
                // narrow fp32 -> 16 bit by keeping the upper halves (truncation).
                // Results go to v0-v3, so v24-v31 stay intact for the fp32 store at label 10.
                "shrn   v0.4h, v24.4s, #16          \n"
                "shrn2  v0.8h, v25.4s, #16          \n"
                "shrn   v1.4h, v26.4s, #16          \n"
                "shrn2  v1.8h, v27.4s, #16          \n"
                "shrn   v2.4h, v28.4s, #16          \n"
                "shrn2  v2.8h, v29.4s, #16          \n"
                "shrn   v3.4h, v30.4s, #16          \n"
                "shrn2  v3.8h, v31.4s, #16          \n"
                // only the low byte of k_end is tested; zero -> keep fp32 partial sums
                "tst    %w11, #255                  \n"
                "beq    10f                         \n"

                // if out_elempack == 4
                "cmp    %w12, #4                    \n"
                "bne    8f                          \n"

                // packed layout: 32 contiguous 16-bit values, outptr0 advances by 64 bytes
                "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"
                // transpose4x8
                "uzp1   v20.8h, v0.8h, v1.8h        \n"
                "uzp2   v21.8h, v0.8h, v1.8h        \n"
                "uzp1   v22.8h, v2.8h, v3.8h        \n"
                "uzp2   v23.8h, v2.8h, v3.8h        \n"

                // second de-interleave: v0..v3 now each hold lane 0..3 of all eight accumulators
                "uzp1   v0.8h, v20.8h, v22.8h       \n"
                "uzp2   v2.8h, v20.8h, v22.8h       \n"
                "uzp1   v1.8h, v21.8h, v23.8h       \n"
                "uzp2   v3.8h, v21.8h, v23.8h       \n"

                // four rows of 8 values; consecutive rows are out_hstep 16-bit elements apart
                // (sxtw 1 scales the sign-extended out_hstep by 2 bytes)
                "add    x4, %3, %w13, sxtw 1        \n"
                "st1    {v0.8h}, [%3], #16          \n"
                "st1    {v1.8h}, [x4]               \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v2.8h}, [x4]               \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v3.8h}, [x4]               \n"

                "9:                                 \n"
                // final pass wrote nothing to outptr, but still advance it by 32 floats
                "add    %0, %0, #128                \n"
                "b      11f                         \n"

                "10:                                \n"
                // not the final pass: save the fp32 partial sums (outptr advances by 128 bytes)
                "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                "11:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
            // ARMv7 NEON kernel for one 4x8 tile. q8-q15 (d16-d31) hold eight 4-lane
            // fp32 accumulators, one per pB lane.
            // Operands: %0 outptr, %1 pA, %2 pB, %3 outptr0, %8 pC, %9 max_kk,
            //           %10 k, %11 k_end, %12 out_elempack, %13 out_hstep.
            asm volatile(
                // k == 0 -> initialise accumulators at label 0
                "cmp        %10, #0             \n"
                "beq        0f                  \n"

                // k != 0: reload the 32 fp32 partial sums from outptr, then rewind
                // outptr by the 64 bytes consumed by the write-back load
                "vldm       %0!, {d16-d23}      \n"
                "vldm       %0, {d24-d31}       \n"
                "sub        %0, %0, #64         \n"
                "b          3f                  \n"

                "0:                             \n"
                // if pC
                "cmp        %8, #0              \n"
                "beq        1f                  \n"

                // load 4 floats from pC; they are copied to every accumulator at label 2
                "vld1.f32   {d16-d17}, [%8]     \n"
                "b          2f                  \n"

                // else
                "1:                             \n"
                "veor       q8, q8              \n"

                "2:                             \n"
                "vmov       q9, q8              \n"
                "vmov       q10, q8             \n"
                "vmov       q11, q8             \n"
                "vmov       q12, q8             \n"
                "vmov       q13, q8             \n"
                "vmov       q14, q8             \n"
                "vmov       q15, q8             \n"

                "3:                             \n"
                "lsr        r4, %9, #2          \n" // r4 = max_kk >> 2
                "cmp        r4, #0              \n"
                "beq        5f                  \n"

                // main loop: 4 k-steps per iteration.
                // pA: 16 bf16 in one load; pB: 32 bf16 in two loads of 16.
                // vshll #16 widens each 16-bit value into the upper half of a 32-bit lane.
                "4:                             \n"
                "pld        [%2, #256]          \n"
                "vld1.u16   {d4-d7}, [%2 :64]!  \n"
                "pld        [%1, #256]          \n"
                "vld1.u16   {d12-d15}, [%1 :64]! \n"
                "vshll.u16  q0, d4, #16         \n"
                "vshll.u16  q1, d5, #16         \n"
                "vshll.u16  q2, d6, #16         \n"
                "vshll.u16  q3, d7, #16         \n"
                "vshll.u16  q4, d12, #16        \n"
                "vshll.u16  q5, d13, #16        \n"
                "vshll.u16  q6, d14, #16        \n"
                "vshll.u16  q7, d15, #16        \n"
                // k-step 0: pA in q4, pB lanes in q0/q1
                "vmla.f32   q8, q4, d0[0]       \n"
                "vmla.f32   q9, q4, d0[1]       \n"
                "vmla.f32   q10, q4, d1[0]      \n"
                "vmla.f32   q11, q4, d1[1]      \n"
                "vmla.f32   q12, q4, d2[0]      \n"
                "vmla.f32   q13, q4, d2[1]      \n"
                "vmla.f32   q14, q4, d3[0]      \n"
                "vmla.f32   q15, q4, d3[1]      \n"
                // k-step 1: pA in q5, pB lanes in q2/q3
                "vmla.f32   q8, q5, d4[0]       \n"
                "vmla.f32   q9, q5, d4[1]       \n"
                "vmla.f32   q10, q5, d5[0]      \n"
                "vmla.f32   q11, q5, d5[1]      \n"
                "vmla.f32   q12, q5, d6[0]      \n"
                "vmla.f32   q13, q5, d6[1]      \n"
                "vmla.f32   q14, q5, d7[0]      \n"
                "vmla.f32   q15, q5, d7[1]      \n"
                // second pB load for k-steps 2 and 3
                "pld        [%2, #256]          \n"
                "vld1.u16   {d4-d7}, [%2 :64]!  \n"
                "vshll.u16  q0, d4, #16         \n"
                "vshll.u16  q1, d5, #16         \n"
                "vshll.u16  q2, d6, #16         \n"
                "vshll.u16  q3, d7, #16         \n"
                // k-step 2: pA in q6, pB lanes in q0/q1
                "vmla.f32   q8, q6, d0[0]       \n"
                "vmla.f32   q9, q6, d0[1]       \n"
                "vmla.f32   q10, q6, d1[0]      \n"
                "vmla.f32   q11, q6, d1[1]      \n"
                "vmla.f32   q12, q6, d2[0]      \n"
                "vmla.f32   q13, q6, d2[1]      \n"
                "vmla.f32   q14, q6, d3[0]      \n"
                "vmla.f32   q15, q6, d3[1]      \n"
                // loop counter update; the vmla instructions below do not touch the flags
                "subs       r4, r4, #1          \n"
                // k-step 3: pA in q7, pB lanes in q2/q3
                "vmla.f32   q8, q7, d4[0]       \n"
                "vmla.f32   q9, q7, d4[1]       \n"
                "vmla.f32   q10, q7, d5[0]      \n"
                "vmla.f32   q11, q7, d5[1]      \n"
                "vmla.f32   q12, q7, d6[0]      \n"
                "vmla.f32   q13, q7, d6[1]      \n"
                "vmla.f32   q14, q7, d7[0]      \n"
                "vmla.f32   q15, q7, d7[1]      \n"
                "bne        4b                  \n"

                "5:                             \n"
                "and        r4, %9, #3          \n" // r4 = remain = max_kk & 3
                "cmp        r4, #0              \n"
                "beq        7f                  \n"

                // tail loop: one k-step per iteration (8 bf16 from pB, 4 bf16 from pA)
                "6:                             \n"
                "vld1.u16   {d2-d3}, [%2 :64]!  \n"
                "vshll.u16  q0, d2, #16         \n"
                "vshll.u16  q1, d3, #16         \n"
                "vld1.u16   {d9}, [%1 :64]!     \n"
                "vshll.u16  q4, d9, #16         \n"
                "vmla.f32   q8, q4, d0[0]       \n"
                "vmla.f32   q9, q4, d0[1]       \n"
                "vmla.f32   q10, q4, d1[0]      \n"
                "vmla.f32   q11, q4, d1[1]      \n"
                "subs       r4, r4, #1          \n"
                "vmla.f32   q12, q4, d2[0]      \n"
                "vmla.f32   q13, q4, d2[1]      \n"
                "vmla.f32   q14, q4, d3[0]      \n"
                "vmla.f32   q15, q4, d3[1]      \n"
                "bne        6b                  \n"

                "7:                             \n"
                // Test k_end BEFORE narrowing: vshrn below writes d16-d23, which alias
                // q8-q11. Narrowing first would overwrite four fp32 accumulators that
                // label 10 must store intact as partial sums for the next k pass.
                "cmp        %11, #0             \n"
                "beq        10f                 \n"

                // final pass only: narrow fp32 -> 16 bit by keeping the upper halves
                // (truncation). Each vshrn reads its source q register before a later
                // vshrn overwrites it, so the in-place sequence is safe.
                "vshrn.u32  d16, q8, #16        \n"
                "vshrn.u32  d17, q9, #16        \n"
                "vshrn.u32  d18, q10, #16       \n"
                "vshrn.u32  d19, q11, #16       \n"
                "vshrn.u32  d20, q12, #16       \n"
                "vshrn.u32  d21, q13, #16       \n"
                "vshrn.u32  d22, q14, #16       \n"
                "vshrn.u32  d23, q15, #16       \n"

                // if out_elempack == 4
                "cmp        %12, #4             \n"
                "bne        8f                  \n"

                // packed layout: 32 contiguous 16-bit values, outptr0 advances by 64 bytes
                "vstm       %3!, {d16-d23}      \n"
                "b          9f                  \n"

                // if out_elempack == 1
                "8:                             \n"
                // transpose4x8
                "vuzp.16    q8, q9              \n"
                "vuzp.16    q10, q11            \n"
                "vuzp.16    q8, q10             \n"
                "vuzp.16    q9, q11             \n"

                // after the de-interleave q8/q9/q10/q11 hold lane 0/1/2/3 of all eight
                // accumulators; consecutive rows are out_hstep 16-bit elements apart
                // NOTE(review): the :64 hints assume 8-byte aligned row addresses —
                // confirm this holds for every out_hstep the caller can pass.
                "add        r4, %3, %13, lsl #1 \n"
                "vst1.u16   {d16-d17}, [%3 :64]! \n"
                "vst1.u16   {d18-d19}, [r4 :64] \n"
                "add        r4, r4, %13, lsl #1 \n"
                "vst1.u16   {d20-d21}, [r4 :64] \n"
                "add        r4, r4, %13, lsl #1 \n"
                "vst1.u16   {d22-d23}, [r4 :64] \n"

                "9:                             \n"
                // final pass wrote nothing to outptr, but still advance it by 32 floats
                "add        %0, %0, #128        \n"
                "b          11f                 \n"

                "10:                            \n"
                // not the final pass: save the untouched fp32 partial sums
                "vstm       %0!, {d16-d23}      \n"
                "vstm       %0!, {d24-d31}      \n"

                "11:                            \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            // Intrinsics fallback for one 4x8 tile: eight 4-lane fp32 accumulators,
            // _sumN being the accumulator for pB lane N.
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;
            float32x4_t _sum3;
            float32x4_t _sum4;
            float32x4_t _sum5;
            float32x4_t _sum6;
            float32x4_t _sum7;

            if (k == 0)
            {
                // first k pass: start from pC (the same 4 floats for every accumulator) or zero
                if (pC)
                {
                    _sum0 = vld1q_f32(pC);
                    _sum1 = _sum0;
                    _sum2 = _sum0;
                    _sum3 = _sum0;
                    _sum4 = _sum0;
                    _sum5 = _sum0;
                    _sum6 = _sum0;
                    _sum7 = _sum0;
                }
                else
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                    _sum2 = vdupq_n_f32(0.f);
                    _sum3 = vdupq_n_f32(0.f);
                    _sum4 = vdupq_n_f32(0.f);
                    _sum5 = vdupq_n_f32(0.f);
                    _sum6 = vdupq_n_f32(0.f);
                    _sum7 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                // later k pass: resume from the fp32 partial sums stored at outptr
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4 * 1);
                _sum2 = vld1q_f32(outptr + 4 * 2);
                _sum3 = vld1q_f32(outptr + 4 * 3);
                _sum4 = vld1q_f32(outptr + 4 * 4);
                _sum5 = vld1q_f32(outptr + 4 * 5);
                _sum6 = vld1q_f32(outptr + 4 * 6);
                _sum7 = vld1q_f32(outptr + 4 * 7);
            }

            // one k-step per iteration: 4 values from pA, 8 values from pB
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                // bfloat2float is defined elsewhere; presumably widens 4 bf16 values to fp32
                float32x4_t _pA = bfloat2float(vld1_u16(pA));
                float32x4_t _pB0 = bfloat2float(vld1_u16(pB));
                float32x4_t _pB1 = bfloat2float(vld1_u16(pB + 4));

                // _sumN += _pA * (lane N of the 8 pB values)
#if __aarch64__
                _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB0, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB0, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB0, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB0, 3);
                _sum4 = vfmaq_laneq_f32(_sum4, _pA, _pB1, 0);
                _sum5 = vfmaq_laneq_f32(_sum5, _pA, _pB1, 1);
                _sum6 = vfmaq_laneq_f32(_sum6, _pA, _pB1, 2);
                _sum7 = vfmaq_laneq_f32(_sum7, _pA, _pB1, 3);
#else
                _sum0 = vmlaq_lane_f32(_sum0, _pA, vget_low_f32(_pB0), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pA, vget_low_f32(_pB0), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _pA, vget_high_f32(_pB0), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _pA, vget_high_f32(_pB0), 1);
                _sum4 = vmlaq_lane_f32(_sum4, _pA, vget_low_f32(_pB1), 0);
                _sum5 = vmlaq_lane_f32(_sum5, _pA, vget_low_f32(_pB1), 1);
                _sum6 = vmlaq_lane_f32(_sum6, _pA, vget_high_f32(_pB1), 0);
                _sum7 = vmlaq_lane_f32(_sum7, _pA, vget_high_f32(_pB1), 1);
#endif

                pA += 4;
                pB += 8;
            }

            if (k_end)
            {
                // final k pass: convert to 16-bit storage and write to outptr0
                if (out_elempack == 4)
                {
                    // packed layout: the eight 4-element groups are stored contiguously
                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum1));
                    vst1_u16(outptr0 + 4 * 2, float2bfloat(_sum2));
                    vst1_u16(outptr0 + 4 * 3, float2bfloat(_sum3));
                    vst1_u16(outptr0 + 4 * 4, float2bfloat(_sum4));
                    vst1_u16(outptr0 + 4 * 5, float2bfloat(_sum5));
                    vst1_u16(outptr0 + 4 * 6, float2bfloat(_sum6));
                    vst1_u16(outptr0 + 4 * 7, float2bfloat(_sum7));
                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    uint16x4_t _t0 = float2bfloat(_sum0);
                    uint16x4_t _t1 = float2bfloat(_sum1);
                    uint16x4_t _t2 = float2bfloat(_sum2);
                    uint16x4_t _t3 = float2bfloat(_sum3);
                    uint16x4_t _t4 = float2bfloat(_sum4);
                    uint16x4_t _t5 = float2bfloat(_sum5);
                    uint16x4_t _t6 = float2bfloat(_sum6);
                    uint16x4_t _t7 = float2bfloat(_sum7);
                    // transpose4x8_u16 is defined elsewhere; judging by the stores below,
                    // each (_t0,_t1), (_t2,_t3), ... pair presumably ends up holding one
                    // output row of 8 values — TODO confirm against the helper
                    transpose4x8_u16(_t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7);

                    // four rows of 8 values, consecutive rows out_hstep elements apart
                    vst1_u16(outptr0, _t0);
                    vst1_u16(outptr0 + 4, _t1);
                    vst1_u16(outptr0 + out_hstep, _t2);
                    vst1_u16(outptr0 + out_hstep + 4, _t3);
                    vst1_u16(outptr0 + out_hstep * 2, _t4);
                    vst1_u16(outptr0 + out_hstep * 2 + 4, _t5);
                    vst1_u16(outptr0 + out_hstep * 3, _t6);
                    vst1_u16(outptr0 + out_hstep * 3 + 4, _t7);
                    outptr0 += 8;
                }
            }
            else
            {
                // not the final k pass: keep the fp32 partial sums at outptr
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
                vst1q_f32(outptr + 4 * 4, _sum4);
                vst1q_f32(outptr + 4 * 5, _sum5);
                vst1q_f32(outptr + 4 * 6, _sum6);
                vst1q_f32(outptr + 4 * 7, _sum7);
            }

            // outptr advances by 32 floats on both paths
            outptr += 32;
#endif // NCNN_GNU_INLINE_ASM
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            const unsigned short* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                "cbz    %w10, 0f                    \n"

                "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                "b      3f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                "ld1    {v28.4s}, [%8]              \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                "eor    v28.16b, v28.16b, v28.16b   \n"

                "2:                                 \n"
                "mov    v29.16b, v28.16b            \n"
                "mov    v30.16b, v28.16b            \n"
                "mov    v31.16b, v28.16b            \n"

                "3:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                "4:                                 \n"
                "prfm   pldl1keep, [%2, #256]       \n"
                "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2], #32 \n"
                "shll   v0.4s, v0.4h, #16           \n"
                "shll   v1.4s, v1.4h, #16           \n"
                "shll   v2.4s, v2.4h, #16           \n"
                "shll   v3.4s, v3.4h, #16           \n"
                "prfm   pldl1keep, [%1, #256]       \n"
                "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%1], #32 \n"
                "shll   v16.4s, v16.4h, #16         \n"
                "shll   v17.4s, v17.4h, #16         \n"
                "shll   v18.4s, v18.4h, #16         \n"
                "shll   v19.4s, v19.4h, #16         \n"
                "fmla   v28.4s, v16.4s, v0.s[0]     \n"
                "fmla   v29.4s, v16.4s, v0.s[1]     \n"
                "fmla   v30.4s, v16.4s, v0.s[2]     \n"
                "fmla   v31.4s, v16.4s, v0.s[3]     \n"
                "fmla   v28.4s, v17.4s, v1.s[0]     \n"
                "fmla   v29.4s, v17.4s, v1.s[1]     \n"
                "fmla   v30.4s, v17.4s, v1.s[2]     \n"
                "fmla   v31.4s, v17.4s, v1.s[3]     \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v28.4s, v18.4s, v2.s[0]     \n"
                "fmla   v29.4s, v18.4s, v2.s[1]     \n"
                "fmla   v30.4s, v18.4s, v2.s[2]     \n"
                "fmla   v31.4s, v18.4s, v2.s[3]     \n"
                "fmla   v28.4s, v19.4s, v3.s[0]     \n"
                "fmla   v29.4s, v19.4s, v3.s[1]     \n"
                "fmla   v30.4s, v19.4s, v3.s[2]     \n"
                "fmla   v31.4s, v19.4s, v3.s[3]     \n"
                "bne    4b                          \n"

                "5:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    7f                          \n"

                "6:                                 \n"
                "ld1    {v0.4h}, [%2], #8           \n"
                "shll   v0.4s, v0.4h, #16           \n"
                "ld1    {v16.4h}, [%1], #8          \n"
                "shll   v16.4s, v16.4h, #16         \n"
                "fmla   v28.4s, v16.4s, v0.s[0]     \n"
                "fmla   v29.4s, v16.4s, v0.s[1]     \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v30.4s, v16.4s, v0.s[2]     \n"
                "fmla   v31.4s, v16.4s, v0.s[3]     \n"
                "bne    6b                          \n"

                "7:                                 \n"
                "shrn   v0.4h, v28.4s, #16          \n"
                "shrn2  v0.8h, v29.4s, #16          \n"
                "shrn   v1.4h, v30.4s, #16          \n"
                "shrn2  v1.8h, v31.4s, #16          \n"
                "tst    %w11, #255                  \n"
                "beq    10f                         \n"

                // if out_elempack == 4
                "cmp    %w12, #4                    \n"
                "bne    8f                          \n"

                "st1    {v0.8h, v1.8h}, [%3], #32   \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"
                // transpose4x4
                "uzp1   v20.8h, v0.8h, v1.8h        \n"
                "uzp2   v21.8h, v0.8h, v1.8h        \n"

                "uzp1   v0.8h, v20.8h, v21.8h       \n"
                "uzp2   v1.8h, v20.8h, v21.8h       \n"

                "add    x4, %3, %w13, sxtw 1        \n"
                "st1    {v0.d}[0], [%3], #8         \n"
                "st1    {v0.d}[1], [x4]             \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v1.d}[0], [x4]             \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v1.d}[1], [x4]             \n"

                "9:                                 \n"
                "add    %0, %0, #64                 \n"
                "b      11f                         \n"

                "10:                                \n"
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                "11:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v28", "v29", "v30", "v31");
#else  // __aarch64__
            asm volatile(
                "cmp        %10, #0             \n"
                "beq        0f                  \n"

                "vldm       %0, {d24-d31}       \n"
                "b          3f                  \n"

                "0:                             \n"
                // if pC
                "cmp        %8, #0              \n"
                "beq        1f                  \n"

                "vld1.f32   {d24-d25}, [%8]     \n"
                "b          2f                  \n"

                // else
                "1:                             \n"
                "veor       q12, q12            \n"

                "2:                             \n"
                "vmov       q13, q12            \n"
                "vmov       q14, q12            \n"
                "vmov       q15, q12            \n"

                "3:                             \n"
                "lsr        r4, %9, #2          \n" // r4 = max_kk >> 2
                "cmp        r4, #0              \n"
                "beq        5f                  \n"

                "4:                             \n"
                "pld        [%2, #256]          \n"
                "vld1.u16   {d4-d7}, [%2 :64]!  \n"
                "pld        [%1, #256]          \n"
                "vld1.u16   {d12-d15}, [%1 :64]! \n"
                "vshll.u16  q0, d4, #16         \n"
                "vshll.u16  q1, d5, #16         \n"
                "vshll.u16  q2, d6, #16         \n"
                "vshll.u16  q3, d7, #16         \n"
                "vshll.u16  q4, d12, #16        \n"
                "vshll.u16  q5, d13, #16        \n"
                "vshll.u16  q6, d14, #16        \n"
                "vshll.u16  q7, d15, #16        \n"
                "vmla.f32   q12, q4, d0[0]      \n"
                "vmla.f32   q13, q4, d0[1]      \n"
                "vmla.f32   q14, q4, d1[0]      \n"
                "vmla.f32   q15, q4, d1[1]      \n"
                "vmla.f32   q12, q5, d2[0]      \n"
                "vmla.f32   q13, q5, d2[1]      \n"
                "vmla.f32   q14, q5, d3[0]      \n"
                "vmla.f32   q15, q5, d3[1]      \n"
                "subs       r4, r4, #1          \n"
                "vmla.f32   q12, q6, d4[0]      \n"
                "vmla.f32   q13, q6, d4[1]      \n"
                "vmla.f32   q14, q6, d5[0]      \n"
                "vmla.f32   q15, q6, d5[1]      \n"
                "vmla.f32   q12, q7, d6[0]      \n"
                "vmla.f32   q13, q7, d6[1]      \n"
                "vmla.f32   q14, q7, d7[0]      \n"
                "vmla.f32   q15, q7, d7[1]      \n"
                "bne        4b                  \n"

                "5:                             \n"
                "and        r4, %9, #3          \n" // r4 = remain = max_kk & 3
                "cmp        r4, #0              \n"
                "beq        7f                  \n"

                "6:                             \n"
                "vld1.u16   {d0}, [%2 :64]!     \n"
                "vshll.u16  q0, d0, #16         \n"
                "vld1.u16   {d8}, [%1 :64]!     \n"
                "vshll.u16  q4, d8, #16         \n"
                "subs       r4, r4, #1          \n"
                "vmla.f32   q12, q4, d0[0]      \n"
                "vmla.f32   q13, q4, d0[1]      \n"
                "vmla.f32   q14, q4, d1[0]      \n"
                "vmla.f32   q15, q4, d1[1]      \n"
                "bne        6b                  \n"

                "7:                             \n"
                "vshrn.u32  d24, q12, #16       \n"
                "vshrn.u32  d25, q13, #16       \n"
                "vshrn.u32  d26, q14, #16       \n"
                "vshrn.u32  d27, q15, #16       \n"
                "cmp        %11, #0             \n"
                "beq        10f                 \n"

                // if out_elempack == 4
                "cmp        %12, #4             \n"
                "bne        8f                  \n"

                "vst1.u16   {d24-d27}, [%3]!    \n"
                "b          9f                  \n"

                // if out_elempack == 1
                "8:                             \n"
                // transpose4x4
                "vuzp.16    q12, q13            \n"
                "vuzp.16    q12, q13            \n"

                "add        r4, %3, %13, lsl #1 \n"
                "vst1.u16   {d24}, [%3]!        \n"
                "vst1.u16   {d25}, [r4]         \n"
                "add        r4, r4, %13, lsl #1 \n"
                "vst1.u16   {d26}, [r4]         \n"
                "add        r4, r4, %13, lsl #1 \n"
                "vst1.u16   {d27}, [r4]         \n"

                "9:                             \n"
                "add        %0, %0, #64         \n"
                "b          11f                 \n"

                "10:                            \n"
                "vstm       %0!, {d24-d31}      \n"

                "11:                            \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;
            float32x4_t _sum3;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vld1q_f32(pC);
                    _sum1 = _sum0;
                    _sum2 = _sum0;
                    _sum3 = _sum0;
                }
                else
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                    _sum2 = vdupq_n_f32(0.f);
                    _sum3 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4 * 1);
                _sum2 = vld1q_f32(outptr + 4 * 2);
                _sum3 = vld1q_f32(outptr + 4 * 3);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA = bfloat2float(vld1_u16(pA));
                float32x4_t _pB = bfloat2float(vld1_u16(pB));

#if __aarch64__
                _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB, 3);
#else
                _sum0 = vmlaq_lane_f32(_sum0, _pA, vget_low_f32(_pB), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pA, vget_low_f32(_pB), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _pA, vget_high_f32(_pB), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _pA, vget_high_f32(_pB), 1);
#endif

                pA += 4;
                pB += 4;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum1));
                    vst1_u16(outptr0 + 4 * 2, float2bfloat(_sum2));
                    vst1_u16(outptr0 + 4 * 3, float2bfloat(_sum3));
                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    uint16x4_t _t0 = float2bfloat(_sum0);
                    uint16x4_t _t1 = float2bfloat(_sum1);
                    uint16x4_t _t2 = float2bfloat(_sum2);
                    uint16x4_t _t3 = float2bfloat(_sum3);
                    transpose4x4_u16(_t0, _t1, _t2, _t3);

                    vst1_u16(outptr0, _t0);
                    vst1_u16(outptr0 + out_hstep * 1, _t1);
                    vst1_u16(outptr0 + out_hstep * 2, _t2);
                    vst1_u16(outptr0 + out_hstep * 3, _t3);
                    outptr0 += 4;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
            }

            outptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // Leftover pairs of jj positions: accumulate a 4-lane x 2 tile over max_kk
        // steps. Each step consumes 4 bf16 values from pA and 2 from pB.
        //   k == 0  : accumulators start from pC (same 4 values for both jj) or zero
        //   k != 0  : accumulators are reloaded as fp32 partial sums from outptr
        //   k_end   : results are narrowed to bf16 and written through outptr0
        //   !k_end  : fp32 partial sums are written back to outptr
        // outptr advances by 8 floats per iteration in every case.
        for (; jj + 1 < max_jj; jj += 2)
        {
            const unsigned short* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                "cbz    %w10, 0f                    \n"

                // k != 0: resume from the fp32 partial sums
                "ld1    {v30.4s, v31.4s}, [%0]      \n"
                "b      3f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                "ld1    {v30.4s}, [%8]              \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"

                "2:                                 \n"
                "mov    v31.16b, v30.16b            \n"

                "3:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // v28/v29 are secondary accumulators, folded into v30/v31 after the unrolled loop
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v29.16b, v29.16b, v29.16b   \n"
                "4:                                 \n"
                "prfm   pldl1keep, [%2, #128]       \n"
                "ld1    {v0.4h, v1.4h}, [%2], #16   \n"
                // bf16 -> fp32 by shifting into the upper 16 bits
                "shll   v0.4s, v0.4h, #16           \n"
                "shll   v1.4s, v1.4h, #16           \n"
                "prfm   pldl1keep, [%1, #256]       \n"
                "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%1], #32 \n"
                "shll   v16.4s, v16.4h, #16         \n"
                "shll   v17.4s, v17.4h, #16         \n"
                "shll   v18.4s, v18.4h, #16         \n"
                "shll   v19.4s, v19.4h, #16         \n"
                "fmla   v28.4s, v16.4s, v0.s[0]     \n"
                "fmla   v29.4s, v16.4s, v0.s[1]     \n"
                "fmla   v30.4s, v17.4s, v0.s[2]     \n"
                "fmla   v31.4s, v17.4s, v0.s[3]     \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v28.4s, v18.4s, v1.s[0]     \n"
                "fmla   v29.4s, v18.4s, v1.s[1]     \n"
                "fmla   v30.4s, v19.4s, v1.s[2]     \n"
                "fmla   v31.4s, v19.4s, v1.s[3]     \n"
                "bne    4b                          \n"
                "fadd   v30.4s, v30.4s, v28.4s      \n"
                "fadd   v31.4s, v31.4s, v29.4s      \n"

                "5:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    7f                          \n"

                "6:                                 \n"
                "ld1    {v0.s}[0], [%2], #4         \n"
                "shll   v0.4s, v0.4h, #16           \n"
                "ld1    {v16.4h}, [%1], #8          \n"
                "shll   v16.4s, v16.4h, #16         \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v30.4s, v16.4s, v0.s[0]     \n"
                "fmla   v31.4s, v16.4s, v0.s[1]     \n"
                "bne    6b                          \n"

                "7:                                 \n"
                // narrow into scratch v0/v1 so v30/v31 stay intact for the fp32 store at 10:
                "shrn   v0.4h, v30.4s, #16          \n"
                "shrn   v1.4h, v31.4s, #16          \n"
                // only the low 8 bits of k_end are tested
                "tst    %w11, #255                  \n"
                "beq    10f                         \n"

                // if out_elempack == 4
                "cmp    %w12, #4                    \n"
                "bne    8f                          \n"

                "st1    {v0.4h, v1.4h}, [%3], #16   \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"
                // transpose4x2
                "zip1   v30.4h, v0.4h, v1.4h        \n"
                "zip2   v31.4h, v0.4h, v1.4h        \n"

                // x4 walks the next three rows, out_hstep elements (2 bytes each) apart
                "add    x4, %3, %w13, sxtw 1        \n"
                "st1    {v30.s}[0], [%3], #4        \n"
                "st1    {v30.s}[1], [x4]            \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v31.s}[0], [x4]            \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v31.s}[1], [x4]            \n"

                "9:                                 \n"
                "add    %0, %0, #32                 \n"
                "b      11f                         \n"

                "10:                                \n"
                "st1    {v30.4s, v31.4s}, [%0], #32 \n"

                "11:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v1", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31");
#else  // __aarch64__
            asm volatile(
                "cmp        %10, #0             \n"
                "beq        0f                  \n"

                // k != 0: resume from the fp32 partial sums
                "vld1.f32   {d28-d31}, [%0 :128] \n"
                "b          3f                  \n"

                "0:                             \n"
                // if pC
                "cmp        %8, #0              \n"
                "beq        1f                  \n"

                "vld1.f32   {d28-d29}, [%8]     \n"
                "b          2f                  \n"

                // else
                "1:                             \n"
                "veor       q14, q14            \n"

                "2:                             \n"
                "vmov       q15, q14            \n"

                "3:                             \n"
                "lsr        r4, %9, #2          \n" // r4 = max_kk >> 2
                "cmp        r4, #0              \n"
                "beq        5f                  \n"

                // q12/q13 are secondary accumulators, folded into q14/q15 after the unrolled loop
                "veor       q12, q12            \n"
                "veor       q13, q13            \n"
                "4:                             \n"
                "pld        [%2, #128]          \n"
                "vld1.u16   {d2-d3}, [%2 :64]!  \n"
                "pld        [%1, #256]          \n"
                "vld1.u16   {d12-d15}, [%1 :64]! \n"
                // bf16 -> fp32 by shifting into the upper 16 bits
                "vshll.u16  q0, d2, #16         \n"
                "vshll.u16  q1, d3, #16         \n"
                "vshll.u16  q4, d12, #16        \n"
                "vshll.u16  q5, d13, #16        \n"
                "vshll.u16  q6, d14, #16        \n"
                "vshll.u16  q7, d15, #16        \n"
                "vmla.f32   q12, q4, d0[0]      \n"
                "vmla.f32   q13, q4, d0[1]      \n"
                "vmla.f32   q14, q5, d1[0]      \n"
                "vmla.f32   q15, q5, d1[1]      \n"
                "subs       r4, r4, #1          \n"
                "vmla.f32   q12, q6, d2[0]      \n"
                "vmla.f32   q13, q6, d2[1]      \n"
                "vmla.f32   q14, q7, d3[0]      \n"
                "vmla.f32   q15, q7, d3[1]      \n"
                "bne        4b                  \n"
                "vadd.f32   q14, q14, q12       \n"
                "vadd.f32   q15, q15, q13       \n"

                "5:                             \n"
                "and        r4, %9, #3          \n" // r4 = remain = max_kk & 3
                "cmp        r4, #0              \n"
                "beq        7f                  \n"

                "6:                             \n"
                "vld1.u32   {d0[0]}, [%2]!      \n"
                "vshll.u16  q0, d0, #16         \n"
                "vld1.u16   {d8}, [%1 :64]!     \n"
                "vshll.u16  q4, d8, #16         \n"
                "subs       r4, r4, #1          \n"
                "vmla.f32   q14, q4, d0[0]      \n"
                "vmla.f32   q15, q4, d0[1]      \n"
                "bne        6b                  \n"

                "7:                             \n"
                "cmp        %11, #0             \n"
                "beq        10f                 \n"

                // Narrow to bf16 only on the k_end path: d28/d29 are the two halves
                // of q14, so narrowing before the branch would overwrite the fp32
                // accumulator that label 10 stores back to outptr.
                "vshrn.u32  d28, q14, #16       \n"
                "vshrn.u32  d29, q15, #16       \n"

                // if out_elempack == 4
                "cmp        %12, #4             \n"
                "bne        8f                  \n"

                "vst1.u16   {d28-d29}, [%3]!    \n"
                "b          9f                  \n"

                // if out_elempack == 1
                "8:                             \n"
                // transpose4x2
                "vzip.16    d28, d29            \n"

                // r4 walks the next three rows, out_hstep elements (2 bytes each) apart
                "add        r4, %3, %13, lsl #1 \n"
                "vst1.u32   {d28[0]}, [%3]!     \n"
                "vst1.u32   {d28[1]}, [r4]      \n"
                "add        r4, r4, %13, lsl #1 \n"
                "vst1.u32   {d29[0]}, [r4]      \n"
                "add        r4, r4, %13, lsl #1 \n"
                "vst1.u32   {d29[1]}, [r4]      \n"

                "9:                             \n"
                "add        %0, %0, #32         \n"
                "b          11f                 \n"

                "10:                            \n"
                "vst1.f32   {d28-d31}, [%0 :128]! \n"

                "11:                            \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "r4", "q0", "q1", "q4", "q5", "q6", "q7", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            // _sum0 / _sum1 hold the 4 lanes for jj and jj + 1 respectively
            float32x4_t _sum0;
            float32x4_t _sum1;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vld1q_f32(pC);
                    _sum1 = _sum0;
                }
                else
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA = bfloat2float(vld1_u16(pA));
                // NOTE(review): vld1_u16 reads four bf16 values although only the
                // low two are used and pB advances by 2 - confirm the packed B
                // buffer tolerates the 4-byte over-read on the last step.
                float32x2_t _pB = vget_low_f32(bfloat2float(vld1_u16(pB)));

#if __aarch64__
                _sum0 = vfmaq_lane_f32(_sum0, _pA, _pB, 0);
                _sum1 = vfmaq_lane_f32(_sum1, _pA, _pB, 1);
#else
                _sum0 = vmlaq_lane_f32(_sum0, _pA, _pB, 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pA, _pB, 1);
#endif

                pA += 4;
                pB += 2;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum1));
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    unsigned short sum0[4];
                    unsigned short sum1[4];
                    vst1_u16(sum0, float2bfloat(_sum0));
                    vst1_u16(sum1, float2bfloat(_sum1));

                    // lane r of each sum goes to row r (out_hstep apart); sum0 -> column 0, sum1 -> column 1
                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[1] = sum1[0];
                    outptr0[out_hstep + 1] = sum1[1];
                    outptr0[out_hstep * 2 + 1] = sum1[2];
                    outptr0[out_hstep * 3 + 1] = sum1[3];
                    outptr0 += 2;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
            }

            outptr += 8;
#endif // NCNN_GNU_INLINE_ASM
        }
        // Last single jj position: accumulate a 4-lane x 1 tile over max_kk steps.
        // Each step consumes 4 bf16 values from pA and 1 from pB.
        //   k == 0  : accumulator starts from pC or zero
        //   k != 0  : accumulator is reloaded as fp32 partial sums from outptr
        //   k_end   : result is narrowed to bf16 and written through outptr0
        //   !k_end  : fp32 partial sums are written back to outptr
        // outptr advances by 4 floats per iteration in every case.
        for (; jj < max_jj; jj += 1)
        {
            const unsigned short* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                "cbz    %w10, 0f                    \n"

                // k != 0: resume from the fp32 partial sums
                "ld1    {v31.4s}, [%0]              \n"
                "b      2f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                "ld1    {v31.4s}, [%8]              \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                "eor    v31.16b, v31.16b, v31.16b   \n"

                "2:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    4f                          \n"

                // v28..v30 are secondary accumulators, folded into v31 after the unrolled loop
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v29.16b, v29.16b, v29.16b   \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"
                "3:                                 \n"
                "prfm   pldl1keep, [%2, #64]        \n"
                "ld1    {v0.4h}, [%2], #8           \n"
                // bf16 -> fp32 by shifting into the upper 16 bits
                "shll   v0.4s, v0.4h, #16           \n"
                "prfm   pldl1keep, [%1, #256]       \n"
                "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%1], #32 \n"
                "shll   v16.4s, v16.4h, #16         \n"
                "shll   v17.4s, v17.4h, #16         \n"
                "shll   v18.4s, v18.4h, #16         \n"
                "shll   v19.4s, v19.4h, #16         \n"
                "fmla   v28.4s, v16.4s, v0.s[0]     \n"
                "fmla   v29.4s, v17.4s, v0.s[1]     \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v30.4s, v18.4s, v0.s[2]     \n"
                "fmla   v31.4s, v19.4s, v0.s[3]     \n"
                "bne    3b                          \n"
                "fadd   v30.4s, v30.4s, v28.4s      \n"
                "fadd   v31.4s, v31.4s, v29.4s      \n"
                "fadd   v31.4s, v31.4s, v30.4s      \n"

                "4:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    6f                          \n"

                "5:                                 \n"
                // broadcast one bf16 value of B to all lanes
                "ld1r   {v0.4h}, [%2], #2           \n"
                "shll   v0.4s, v0.4h, #16           \n"
                "ld1    {v16.4h}, [%1], #8          \n"
                "shll   v16.4s, v16.4h, #16         \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v31.4s, v16.4s, v0.4s       \n"
                "bne    5b                          \n"

                "6:                                 \n"
                // narrow into scratch v0 so v31 stays intact for the fp32 store at 9:
                "shrn   v0.4h, v31.4s, #16          \n"
                // only the low 8 bits of k_end are tested
                "tst    %w11, #255                  \n"
                "beq    9f                          \n"

                // if out_elempack == 4
                "cmp    %w12, #4                    \n"
                "bne    7f                          \n"

                "st1    {v0.4h}, [%3], #8           \n"
                "b      8f                          \n"

                // if out_elempack == 1
                "7:                                 \n"
                // x4 walks the next three rows, out_hstep elements (2 bytes each) apart
                "add    x4, %3, %w13, sxtw 1        \n"
                "st1    {v0.h}[0], [%3], #2         \n"
                "st1    {v0.h}[1], [x4]             \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v0.h}[2], [x4]             \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v0.h}[3], [x4]             \n"

                "8:                                 \n"
                "add    %0, %0, #16                 \n"
                "b      10f                         \n"

                "9:                                 \n"
                "st1    {v31.4s}, [%0], #16         \n"

                "10:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31");
#else  // __aarch64__
            asm volatile(
                "cmp        %10, #0             \n"
                "beq        0f                  \n"

                // k != 0: resume from the fp32 partial sums
                "vld1.f32   {d30-d31}, [%0 :64] \n"
                "b          2f                  \n"

                "0:                             \n"
                // if pC
                "cmp        %8, #0              \n"
                "beq        1f                  \n"

                "vld1.f32   {d30-d31}, [%8]     \n"
                "b          2f                  \n"

                // else
                "1:                             \n"
                "veor       q15, q15            \n"

                "2:                             \n"
                "lsr        r4, %9, #2          \n" // r4 = max_kk >> 2
                "cmp        r4, #0              \n"
                "beq        4f                  \n"

                // q12..q14 are secondary accumulators, folded into q15 after the unrolled loop
                "veor       q12, q12            \n"
                "veor       q13, q13            \n"
                "veor       q14, q14            \n"
                "3:                             \n"
                "pld        [%2, #64]           \n"
                "vld1.u16   {d1}, [%2]!         \n"
                "pld        [%1, #256]          \n"
                "vld1.u16   {d12-d15}, [%1 :64]! \n"
                // bf16 -> fp32 by shifting into the upper 16 bits
                "vshll.u16  q0, d1, #16         \n"
                "vshll.u16  q4, d12, #16        \n"
                "vshll.u16  q5, d13, #16        \n"
                "vshll.u16  q6, d14, #16        \n"
                "vshll.u16  q7, d15, #16        \n"
                "vmla.f32   q12, q4, d0[0]      \n"
                "vmla.f32   q13, q5, d0[1]      \n"
                "vmla.f32   q14, q6, d1[0]      \n"
                "vmla.f32   q15, q7, d1[1]      \n"
                "subs       r4, r4, #1          \n"
                "bne        3b                  \n"
                "vadd.f32   q14, q14, q12       \n"
                "vadd.f32   q15, q15, q13       \n"
                "vadd.f32   q15, q15, q14       \n"

                "4:                             \n"
                "and        r4, %9, #3          \n" // r4 = remain = max_kk & 3
                "cmp        r4, #0              \n"
                "beq        6f                  \n"

                "5:                             \n"
                // broadcast one bf16 value of B to all lanes
                "vld1.u16   {d0[]}, [%2]!       \n"
                "vshll.u16  q0, d0, #16         \n"
                "vld1.u16   {d8}, [%1 :64]!     \n"
                "vshll.u16  q4, d8, #16         \n"
                "subs       r4, r4, #1          \n"
                "vmla.f32   q15, q4, q0         \n"
                "bne        5b                  \n"

                "6:                             \n"
                "cmp        %11, #0             \n"
                "beq        9f                  \n"

                // Narrow to bf16 only on the k_end path: d30 is the low half of
                // q15, so narrowing before the branch would overwrite the fp32
                // accumulator that label 9 stores back to outptr.
                "vshrn.u32  d30, q15, #16       \n"

                // if out_elempack == 4
                "cmp        %12, #4             \n"
                "bne        7f                  \n"

                "vst1.u16   {d30}, [%3]!        \n"
                "b          8f                  \n"

                // if out_elempack == 1
                "7:                             \n"

                // r4 walks the next three rows, out_hstep elements (2 bytes each) apart
                "add        r4, %3, %13, lsl #1 \n"
                "vst1.u16   {d30[0]}, [%3]!     \n"
                "vst1.u16   {d30[1]}, [r4]      \n"
                "add        r4, r4, %13, lsl #1 \n"
                "vst1.u16   {d30[2]}, [r4]      \n"
                "add        r4, r4, %13, lsl #1 \n"
                "vst1.u16   {d30[3]}, [r4]      \n"

                "8:                             \n"
                "add        %0, %0, #16         \n"
                "b          10f                 \n"

                "9:                             \n"
                "vst1.f32   {d30-d31}, [%0 :64]! \n"

                "10:                            \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "r4", "q0", "q4", "q5", "q6", "q7", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            // _sum0 holds the 4 lanes for this jj position
            float32x4_t _sum0;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vld1q_f32(pC);
                }
                else
                {
                    _sum0 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA = bfloat2float(vld1_u16(pA));
                // broadcast the single B value to all lanes
                float32x4_t _pB = bfloat2float(vdup_n_u16(pB[0]));

#if __aarch64__
                _sum0 = vfmaq_f32(_sum0, _pA, _pB);
#else
                _sum0 = vmlaq_f32(_sum0, _pA, _pB);
#endif

                pA += 4;
                pB += 1;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_sum0));
                    outptr0 += 4;
                }
                if (out_elempack == 1)
                {
                    unsigned short sum0[4];
                    vst1_u16(sum0, float2bfloat(_sum0));

                    // lane r goes to row r, rows being out_hstep elements apart
                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0++;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
            }

            outptr += 4;
#endif // NCNN_GNU_INLINE_ASM
        }

        pAT += max_kk * 4;
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        unsigned short* outptr0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j;

        const unsigned short* pB = pBT;

        if (pC)
        {
            pC = (const float*)CT_tile + i + ii;
        }

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum02;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum12;

            if (k == 0)
            {
                if (pC)
                {
                    _sum00 = vdupq_n_f32(pC[0]);
                    _sum01 = vdupq_n_f32(pC[0]);
                    _sum02 = vdupq_n_f32(pC[0]);
                    _sum10 = vdupq_n_f32(pC[1]);
                    _sum11 = vdupq_n_f32(pC[1]);
                    _sum12 = vdupq_n_f32(pC[1]);
                }
                else
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum02 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                    _sum12 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                float32x4x2_t _tmp01 = vld2q_f32(outptr);
                float32x4x2_t _tmp23 = vld2q_f32(outptr + 8);
                float32x4x2_t _tmp45 = vld2q_f32(outptr + 16);
                _sum00 = _tmp01.val[0];
                _sum01 = _tmp23.val[0];
                _sum02 = _tmp45.val[0];
                _sum10 = _tmp01.val[1];
                _sum11 = _tmp23.val[1];
                _sum12 = _tmp45.val[1];
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB0 = bfloat2float(vld1_u16(pB));
                float32x4_t _pB1 = bfloat2float(vld1_u16(pB + 4));
                float32x4_t _pB2 = bfloat2float(vld1_u16(pB + 8));

                float32x2_t _pA = vget_low_f32(bfloat2float(vld1_u16(pA)));

                _sum00 = vfmaq_lane_f32(_sum00, _pB0, _pA, 0);
                _sum01 = vfmaq_lane_f32(_sum01, _pB1, _pA, 0);
                _sum02 = vfmaq_lane_f32(_sum02, _pB2, _pA, 0);
                _sum10 = vfmaq_lane_f32(_sum10, _pB0, _pA, 1);
                _sum11 = vfmaq_lane_f32(_sum11, _pB1, _pA, 1);
                _sum12 = vfmaq_lane_f32(_sum12, _pB2, _pA, 1);

                pA += 2;
                pB += 12;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum01));
                    vst1_u16(outptr0 + 8, float2bfloat(_sum02));
                    vst1_u16(outptr0 + out_hstep, float2bfloat(_sum10));
                    vst1_u16(outptr0 + out_hstep + 4, float2bfloat(_sum11));
                    vst1_u16(outptr0 + out_hstep + 8, float2bfloat(_sum12));
                    outptr0 += 12;
                }
            }
            else
            {
                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum00;
                _tmp01.val[1] = _sum10;
                float32x4x2_t _tmp23;
                _tmp23.val[0] = _sum01;
                _tmp23.val[1] = _sum11;
                float32x4x2_t _tmp45;
                _tmp45.val[0] = _sum02;
                _tmp45.val[1] = _sum12;
                vst2q_f32(outptr, _tmp01);
                vst2q_f32(outptr + 8, _tmp23);
                vst2q_f32(outptr + 16, _tmp45);
            }

            outptr += 24;
        }
#endif // __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;

            if (k == 0)
            {
                if (pC)
                {
                    _sum00 = vdupq_n_f32(pC[0]);
                    _sum01 = vdupq_n_f32(pC[0]);
                    _sum10 = vdupq_n_f32(pC[1]);
                    _sum11 = vdupq_n_f32(pC[1]);
                }
                else
                {
                    _sum00 = vdupq_n_f32(0.f);
                    _sum01 = vdupq_n_f32(0.f);
                    _sum10 = vdupq_n_f32(0.f);
                    _sum11 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                float32x4x2_t _tmp01 = vld2q_f32(outptr);
                float32x4x2_t _tmp23 = vld2q_f32(outptr + 8);
                _sum00 = _tmp01.val[0];
                _sum01 = _tmp23.val[0];
                _sum10 = _tmp01.val[1];
                _sum11 = _tmp23.val[1];
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB0 = bfloat2float(vld1_u16(pB));
                float32x4_t _pB1 = bfloat2float(vld1_u16(pB + 4));

                float32x2_t _pA = vget_low_f32(bfloat2float(vld1_u16(pA)));
#if __aarch64__
                _sum00 = vfmaq_lane_f32(_sum00, _pB0, _pA, 0);
                _sum01 = vfmaq_lane_f32(_sum01, _pB1, _pA, 0);
                _sum10 = vfmaq_lane_f32(_sum10, _pB0, _pA, 1);
                _sum11 = vfmaq_lane_f32(_sum11, _pB1, _pA, 1);
#else
                _sum00 = vmlaq_lane_f32(_sum00, _pB0, _pA, 0);
                _sum01 = vmlaq_lane_f32(_sum01, _pB1, _pA, 0);
                _sum10 = vmlaq_lane_f32(_sum10, _pB0, _pA, 1);
                _sum11 = vmlaq_lane_f32(_sum11, _pB1, _pA, 1);
#endif

                pA += 2;
                pB += 8;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum01));
                    vst1_u16(outptr0 + out_hstep, float2bfloat(_sum10));
                    vst1_u16(outptr0 + out_hstep + 4, float2bfloat(_sum11));
                    outptr0 += 8;
                }
            }
            else
            {
                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum00;
                _tmp01.val[1] = _sum10;
                float32x4x2_t _tmp23;
                _tmp23.val[0] = _sum01;
                _tmp23.val[1] = _sum11;
                vst2q_f32(outptr, _tmp01);
                vst2q_f32(outptr + 8, _tmp23);
            }

            outptr += 16;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vdupq_n_f32(pC[0]);
                    _sum1 = vdupq_n_f32(pC[1]);
                }
                else
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                float32x4x2_t _tmp01 = vld2q_f32(outptr);
                _sum0 = _tmp01.val[0];
                _sum1 = _tmp01.val[1];
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB = bfloat2float(vld1_u16(pB));

                float32x2_t _pA = vget_low_f32(bfloat2float(vld1_u16(pA)));
#if __aarch64__
                _sum0 = vfmaq_lane_f32(_sum0, _pB, _pA, 0);
                _sum1 = vfmaq_lane_f32(_sum1, _pB, _pA, 1);
#else
                _sum0 = vmlaq_lane_f32(_sum0, _pB, _pA, 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pB, _pA, 1);
#endif

                pA += 2;
                pB += 4;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + out_hstep, float2bfloat(_sum1));
                    outptr0 += 4;
                }
            }
            else
            {
                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum0;
                _tmp01.val[1] = _sum1;
                vst2q_f32(outptr, _tmp01);
            }

            outptr += 8;
        }
#endif // __ARM_NEON
        for (; jj + 1 < max_jj; jj += 2)
        {
            float sum00;
            float sum01;
            float sum10;
            float sum11;

            if (k == 0)
            {
                if (pC)
                {
                    sum00 = pC[0];
                    sum01 = pC[1];
                    sum10 = pC[0];
                    sum11 = pC[1];
                }
                else
                {
                    sum00 = 0.f;
                    sum01 = 0.f;
                    sum10 = 0.f;
                    sum11 = 0.f;
                }
            }
            else
            {
                sum00 = outptr[0];
                sum01 = outptr[1];
                sum10 = outptr[2];
                sum11 = outptr[3];
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum00 += bfloat16_to_float32(pA[0]) * bfloat16_to_float32(pB[0]);
                sum01 += bfloat16_to_float32(pA[1]) * bfloat16_to_float32(pB[0]);
                sum10 += bfloat16_to_float32(pA[0]) * bfloat16_to_float32(pB[1]);
                sum11 += bfloat16_to_float32(pA[1]) * bfloat16_to_float32(pB[1]);

                pA += 2;
                pB += 2;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = float32_to_bfloat16(sum00);
                    outptr0[1] = float32_to_bfloat16(sum10);
                    outptr0[out_hstep] = float32_to_bfloat16(sum01);
                    outptr0[out_hstep + 1] = float32_to_bfloat16(sum11);
                    outptr0 += 2;
                }
            }
            else
            {
                outptr[0] = sum00;
                outptr[1] = sum01;
                outptr[2] = sum10;
                outptr[3] = sum11;
            }

            outptr += 4;
        }
        for (; jj < max_jj; jj += 1)
        {
            float sum0;
            float sum1;

            if (k == 0)
            {
                if (pC)
                {
                    sum0 = pC[0];
                    sum1 = pC[1];
                }
                else
                {
                    sum0 = 0.f;
                    sum1 = 0.f;
                }
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum0 += bfloat16_to_float32(pA[0]) * bfloat16_to_float32(pB[0]);
                sum1 += bfloat16_to_float32(pA[1]) * bfloat16_to_float32(pB[0]);
                pA += 2;
                pB += 1;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = float32_to_bfloat16(sum0);
                    outptr0[out_hstep] = float32_to_bfloat16(sum1);
                    outptr0++;
                }
            }
            else
            {
                outptr[0] = sum0;
                outptr[1] = sum1;
            }

            outptr += 2;
        }

        pAT += max_kk * 2;
    }
    for (; ii < max_ii; ii += 1)
    {
        unsigned short* outptr0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j;

        const unsigned short* pB = pBT;

        if (pC)
        {
            pC = (const float*)CT_tile + i + ii;
        }

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vdupq_n_f32(pC[0]);
                    _sum1 = vdupq_n_f32(pC[0]);
                    _sum2 = vdupq_n_f32(pC[0]);
                }
                else
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                    _sum2 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4);
                _sum2 = vld1q_f32(outptr + 8);
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB0 = bfloat2float(vld1_u16(pB));
                float32x4_t _pB1 = bfloat2float(vld1_u16(pB + 4));
                float32x4_t _pB2 = bfloat2float(vld1_u16(pB + 8));

                float32x4_t _pA0 = bfloat2float(vdup_n_u16(pA[0]));

                _sum0 = vfmaq_f32(_sum0, _pA0, _pB0);
                _sum1 = vfmaq_f32(_sum1, _pA0, _pB1);
                _sum2 = vfmaq_f32(_sum2, _pA0, _pB2);

                pA += 1;
                pB += 12;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum1));
                    vst1_u16(outptr0 + 8, float2bfloat(_sum2));
                    outptr0 += 12;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 8, _sum2);
            }

            outptr += 12;
        }
#endif // __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vdupq_n_f32(pC[0]);
                    _sum1 = vdupq_n_f32(pC[0]);
                }
                else
                {
                    _sum0 = vdupq_n_f32(0.f);
                    _sum1 = vdupq_n_f32(0.f);
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4);
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB0 = bfloat2float(vld1_u16(pB));
                float32x4_t _pB1 = bfloat2float(vld1_u16(pB + 4));

                float32x4_t _pA0 = bfloat2float(vdup_n_u16(pA[0]));
#if __aarch64__
                _sum0 = vfmaq_f32(_sum0, _pA0, _pB0);
                _sum1 = vfmaq_f32(_sum1, _pA0, _pB1);
#else
                _sum0 = vmlaq_f32(_sum0, _pA0, _pB0);
                _sum1 = vmlaq_f32(_sum1, _pA0, _pB1);
#endif

                pA += 1;
                pB += 8;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum1));
                    outptr0 += 8;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
            }

            outptr += 8;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _sum;

            if (k == 0)
            {
                if (pC)
                {
                    _sum = vdupq_n_f32(pC[0]);
                }
                else
                {
                    _sum = vdupq_n_f32(0.f);
                }
            }
            else
            {
                _sum = vld1q_f32(outptr);
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB = bfloat2float(vld1_u16(pB));
                float32x4_t _pA = bfloat2float(vdup_n_u16(pA[0]));

#if __aarch64__
                _sum = vfmaq_f32(_sum, _pA, _pB);
#else
                _sum = vmlaq_f32(_sum, _pA, _pB);
#endif

                pA += 1;
                pB += 4;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, float2bfloat(_sum));
                    outptr0 += 4;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum);
            }

            outptr += 4;
        }
#endif // __ARM_NEON
        for (; jj + 1 < max_jj; jj += 2)
        {
            float sum0;
            float sum1;

            if (k == 0)
            {
                if (pC)
                {
                    sum0 = pC[0];
                    sum1 = pC[0];
                }
                else
                {
                    sum0 = 0.f;
                    sum1 = 0.f;
                }
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum0 += bfloat16_to_float32(pA[0]) * bfloat16_to_float32(pB[0]);
                sum1 += bfloat16_to_float32(pA[0]) * bfloat16_to_float32(pB[1]);

                pA += 1;
                pB += 2;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = float32_to_bfloat16(sum0);
                    outptr0[1] = float32_to_bfloat16(sum1);
                    outptr0 += 2;
                }
            }
            else
            {
                outptr[0] = sum0;
                outptr[1] = sum1;
            }

            outptr += 2;
        }
        for (; jj < max_jj; jj += 1)
        {
            float sum;

            if (k == 0)
            {
                if (pC)
                {
                    sum = pC[0];
                }
                else
                {
                    sum = 0.f;
                }
            }
            else
            {
                sum = outptr[0];
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum += bfloat16_to_float32(pA[0]) * bfloat16_to_float32(pB[0]);

                pA += 1;
                pB += 1;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = float32_to_bfloat16(sum);
                    outptr0++;
                }
            }
            else
            {
                outptr[0] = sum;
            }

            outptr += 1;
        }

        pAT += max_kk;
    }
}

// Pick the GEMM tile sizes (TILE_M, TILE_N, TILE_K) for the bf16 im2col-gemm
// convolution from the L2 cache size, measured in 16-bit elements.
//
//   M, N, K  problem size of the GEMM
//   TILE_M   output, always written
//   TILE_N   output, written only when N > 0 (left untouched otherwise)
//   TILE_K   output, always written
//   nT       thread count; 0 means "use get_physical_big_cpu_count()"
static void convolution_im2col_gemm_get_optimal_tile_mnk_bf16s(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
    // Per-architecture blocking parameters.
    //   k_reserve, k_divisor  constants of the initial K budget formula
    //   k_align/m_align/n_align  granularity every tile size is rounded up to
    //   m_block  row block used for the first estimate of the M tile count
#if __aarch64__
    const int k_reserve = 32;
    const int k_divisor = 12;
    const int k_align = 8;
    const int m_align = 8;
    const int m_block = 32;
    const int n_align = 4;
#elif __ARM_NEON
    const int k_reserve = 16;
    const int k_divisor = 8;
    const int k_align = 4;
    const int m_align = 4;
    const int m_block = 16;
    const int n_align = 4;
#else
    const int k_reserve = 2;
    const int k_divisor = 3;
    const int k_align = 2;
    const int m_align = 2;
    const int m_block = 8;
    const int n_align = 1;
#endif

    // resolve optimal tile size from cache size
    const int l2_cache_size_bf16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short));

    if (nT == 0)
        nT = get_physical_big_cpu_count();

    // solve K
    {
        // try not to split K
        const int k_budget = (l2_cache_size_bf16 - k_reserve) / k_divisor;
        const int k_cap = std::max(k_align, k_budget / k_align * k_align);

        // distribute K evenly over the number of tiles the cap implies
        const int k_parts = (K + k_cap - 1) / k_cap;
        const int k_even = ((K + k_parts - 1) / k_parts + (k_align - 1)) / k_align * k_align;

        TILE_K = std::min(k_cap, k_even);
    }

    // solve M
    {
        const int m_parts = (M + m_block - 1) / m_block;
        const int m_even = ((M + m_parts - 1) / m_parts + (m_align - 1)) / m_align * m_align;

        TILE_M = std::max(m_align, m_even);
    }

    // widen M by the usable thread count, then rebalance
    {
        TILE_M *= std::min(nT, get_physical_cpu_count());

        const int m_parts = (M + TILE_M - 1) / TILE_M;
        const int m_even = ((M + m_parts - 1) / m_parts + (m_align - 1)) / m_align * m_align;
        TILE_M = std::min(TILE_M, m_even);

        if (nT > 1)
        {
            // shrink again so that the tile is divided among the threads
            const int m_per_thread = (std::max(1, TILE_M / nT) + (m_align - 1)) / m_align * m_align;
            TILE_M = std::min(TILE_M, m_per_thread);
        }
    }

    // solve N from whatever cache is left after the A tile
    if (N > 0)
    {
        const int remaining = l2_cache_size_bf16 - TILE_M * TILE_K;

        // When K is split, each column additionally costs TILE_M * 2 elements
        // (presumably the fp32 accumulator tile counted in 16-bit units - TODO confirm).
        const int per_column = (TILE_K >= K) ? TILE_K : (TILE_M * 2 + TILE_K);
        const int n_budget = remaining / per_column;

        TILE_N = std::max(n_align, n_budget / n_align * n_align);

        const int n_parts = (N + TILE_N - 1) / TILE_N;
        const int n_even = ((N + n_parts - 1) / n_parts + (n_align - 1)) / n_align * n_align;
        TILE_N = std::min(TILE_N, n_even);
    }
}

static void convolution_im2col_gemm_transform_kernel_bf16s(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt)
{
    // NCNN_LOGE("convolution_im2col_gemm_transform_kernel");
    const int maxk = kernel_w * kernel_h;

    const int M = outch;
    const int K = inch * maxk;

    int TILE_M, TILE_N, TILE_K;
    convolution_im2col_gemm_get_optimal_tile_mnk_bf16s(M, 0, K, TILE_M, TILE_N, TILE_K, opt.num_threads);

    const int nn_M = (M + TILE_M - 1) / TILE_M;

    int elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        elempack = inch % 4 == 0 ? 4 : 1;
    }
#endif // __ARM_NEON

    // maxk-inch-outch to pa-maxk-inch/pa-outch
    Mat A_data;
    if (maxk == 1)
    {
        cast_float32_to_bfloat16(kernel, A_data);
        A_data = A_data.reshape(maxk * inch, outch);
    }
    else
    {
        Mat weight_data_r2 = kernel.reshape(maxk, inch, outch);

        A_data.create(maxk * inch, outch, (size_t)2u);

        for (int q = 0; q < outch; q += 1)
        {
            unsigned short* g00 = A_data.row<unsigned short>(q);

            for (int p = 0; p + (elempack - 1) < inch; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        const float* k00 = weight_data_r2.channel(q).row(p + i);
                        g00[0] = float32_to_bfloat16(k00[k]);
                        g00++;
                    }
                }
            }
        }
    }

    AT.create(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, (size_t)2u);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int ppj = 0; ppj < nn_M; ppj++)
    {
        const int i = ppj * TILE_M;

        const int max_ii = std::min((M - i), TILE_M);

        for (int k = 0; k < K; k += TILE_K)
        {
            const int max_kk = std::min((K - k), TILE_K);

            Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1);

            convolution_im2col_pack_A_tile_bf16_fp16(A_data, AT_tile, i, max_ii, k, max_kk);
        }
    }
}

static int convolution_im2col_gemm_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
    const int maxk = kernel_w * kernel_h;

    const int M = top_blob.c * top_blob.elempack;
    const int N = top_blob.w * top_blob.h;
    const int K = bottom_blob.c * bottom_blob.elempack * maxk;

    int TILE_M, TILE_N, TILE_K;
    convolution_im2col_gemm_get_optimal_tile_mnk_bf16s(M, N, K, TILE_M, TILE_N, TILE_K, nT);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

    Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    #pragma omp parallel for num_threads(nT)
    for (int ppjk = 0; ppjk < nn_NK; ppjk++)
    {
        const int ppj = ppjk / nn_K;
        const int ppk = ppjk % nn_K;

        const int j = ppj * TILE_N;
        const int k = ppk * TILE_K;

        const int max_jj = std::min((N - j), TILE_N);
        const int max_kk = std::min((K - k), TILE_K);

        Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

        // im2col
        convolution_im2col_input_tile_bf16_fp16(bottom_blob, BT_tile, j, max_jj, k, max_kk, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h);
    }

    Mat topT_tileX;
    if (K > TILE_K)
    {
        topT_tileX.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
        if (topT_tileX.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int ppj = 0; ppj < nn_M; ppj++)
    {
        const int i = ppj * TILE_M;

        Mat topT_tile;
        if (K > TILE_K)
            topT_tile = topT_tileX.channel(get_omp_thread_num());

        const int max_ii = std::min((M - i), TILE_M);

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                const Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1);

                const Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

                bool k_end = k + TILE_K >= K;

                convolution_gemm_transB_packed_tile_bf16s(AT_tile, BT_tile, bias, topT_tile, top_blob, i, max_ii, j, max_jj, k, max_kk, k_end, opt.use_a53_a55_optimized_kernel);
            }
        }
    }

    return 0;
}


================================================
FILE: src/layer/arm/convolution_im2col_gemm_bf16s_fp16s.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Copy a max_ii x max_kk tile of 16-bit values from A (starting at row i,
// column k) into AT, interleaving rows in groups of 8 (aarch64 only), 4 (NEON
// only), 2 and 1. Within a group the output is k-major: for every k the values
// of all rows in the group are stored next to each other. Values are moved as
// raw unsigned short; no numeric conversion takes place.
static void convolution_im2col_pack_A_tile_bf16_fp16(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
    // A = (pa, maxk, inch/pa), outch
    const int A_hstep = A.w;

    unsigned short* pp = AT;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    // groups of 8 rows
    for (; ii + 7 < max_ii; ii += 8)
    {
        const unsigned short* r0 = (const unsigned short*)A + (i + ii) * A_hstep + k;
        const unsigned short* r1 = r0 + A_hstep;
        const unsigned short* r2 = r1 + A_hstep;
        const unsigned short* r3 = r2 + A_hstep;
        const unsigned short* r4 = r3 + A_hstep;
        const unsigned short* r5 = r4 + A_hstep;
        const unsigned short* r6 = r5 + A_hstep;
        const unsigned short* r7 = r6 + A_hstep;

        int kk = 0;
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8_t _v0 = vld1q_u16(r0 + kk);
            uint16x8_t _v1 = vld1q_u16(r1 + kk);
            uint16x8_t _v2 = vld1q_u16(r2 + kk);
            uint16x8_t _v3 = vld1q_u16(r3 + kk);
            uint16x8_t _v4 = vld1q_u16(r4 + kk);
            uint16x8_t _v5 = vld1q_u16(r5 + kk);
            uint16x8_t _v6 = vld1q_u16(r6 + kk);
            uint16x8_t _v7 = vld1q_u16(r7 + kk);

            // presumably an in-place 8x8 transpose, so that each vector then
            // holds one k across the 8 rows - matches the scalar tail below
            transpose8x8_u16(_v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7);

            vst1q_u16(pp, _v0);
            pp += 8;
            vst1q_u16(pp, _v1);
            pp += 8;
            vst1q_u16(pp, _v2);
            pp += 8;
            vst1q_u16(pp, _v3);
            pp += 8;
            vst1q_u16(pp, _v4);
            pp += 8;
            vst1q_u16(pp, _v5);
            pp += 8;
            vst1q_u16(pp, _v6);
            pp += 8;
            vst1q_u16(pp, _v7);
            pp += 8;
        }
        for (; kk < max_kk; kk++)
        {
            pp[0] = r0[kk];
            pp[1] = r1[kk];
            pp[2] = r2[kk];
            pp[3] = r3[kk];
            pp[4] = r4[kk];
            pp[5] = r5[kk];
            pp[6] = r6[kk];
            pp[7] = r7[kk];
            pp += 8;
        }
    }
#endif // __aarch64__
    // groups of 4 rows
    for (; ii + 3 < max_ii; ii += 4)
    {
        const unsigned short* r0 = (const unsigned short*)A + (i + ii) * A_hstep + k;
        const unsigned short* r1 = r0 + A_hstep;
        const unsigned short* r2 = r1 + A_hstep;
        const unsigned short* r3 = r2 + A_hstep;

        int kk = 0;
        for (; kk + 7 < max_kk; kk += 8)
        {
            // the 4-way interleaving store does the row/column swap
            uint16x8x4_t _quad;
            _quad.val[0] = vld1q_u16(r0 + kk);
            _quad.val[1] = vld1q_u16(r1 + kk);
            _quad.val[2] = vld1q_u16(r2 + kk);
            _quad.val[3] = vld1q_u16(r3 + kk);
            vst4q_u16(pp, _quad);
            pp += 32;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4x4_t _quad;
            _quad.val[0] = vld1_u16(r0 + kk);
            _quad.val[1] = vld1_u16(r1 + kk);
            _quad.val[2] = vld1_u16(r2 + kk);
            _quad.val[3] = vld1_u16(r3 + kk);
            vst4_u16(pp, _quad);
            pp += 16;
        }
        for (; kk < max_kk; kk++)
        {
            pp[0] = r0[kk];
            pp[1] = r1[kk];
            pp[2] = r2[kk];
            pp[3] = r3[kk];
            pp += 4;
        }
    }
#endif // __ARM_NEON
    // groups of 2 rows
    for (; ii + 1 < max_ii; ii += 2)
    {
        const unsigned short* r0 = (const unsigned short*)A + (i + ii) * A_hstep + k;
        const unsigned short* r1 = r0 + A_hstep;

        int kk = 0;
#if __ARM_NEON
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8x2_t _pair;
            _pair.val[0] = vld1q_u16(r0 + kk);
            _pair.val[1] = vld1q_u16(r1 + kk);
            vst2q_u16(pp, _pair);
            pp += 16;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4x2_t _pair;
            _pair.val[0] = vld1_u16(r0 + kk);
            _pair.val[1] = vld1_u16(r1 + kk);
            vst2_u16(pp, _pair);
            pp += 8;
        }
#endif // __ARM_NEON
        for (; kk < max_kk; kk++)
        {
            pp[0] = r0[kk];
            pp[1] = r1[kk];
            pp += 2;
        }
    }
    // remaining single rows: straight copy
    for (; ii < max_ii; ii += 1)
    {
        const unsigned short* r0 = (const unsigned short*)A + (i + ii) * A_hstep + k;

        int kk = 0;
#if __ARM_NEON
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8_t _v = vld1q_u16(r0 + kk);
            vst1q_u16(pp, _v);
            pp += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4_t _v = vld1_u16(r0 + kk);
            vst1_u16(pp, _v);
            pp += 4;
        }
#endif // __ARM_NEON
        for (; kk < max_kk; kk++)
        {
            *pp++ = r0[kk];
        }
    }
}

// Pack one tile of bottom_blob into B for the 1x1 kernel / stride 1 / dilation 1
// case, where no kernel-window address arithmetic is needed and the work
// reduces to a gather plus (for packed inputs) a small transpose.
//
// Elements are handled as opaque 16-bit values (unsigned short): they are only
// moved, never converted or used in arithmetic.
//
//   bottom_blob  source blob; its elempack (8, 4 or 1) selects the code path
//   B            destination; written sequentially through a pointer taken from B
//   j, max_jj    first element offset inside each channel, and how many
//                consecutive offsets to pack
//   k, max_kk    first channel and channel count, both counted in scalar
//                (unpacked) channels
//
// Offsets are consumed in groups of 12 (aarch64 only), 8 and 4 (NEON only),
// then 2 and 1. For every group the output holds, for each scalar channel in
// turn, the values of all offsets of the group stored contiguously. Packed
// inputs (offset-major, lane-minor) are therefore transposed to lane-major,
// offset-minor.
//
// NOTE(review): k and max_kk are divided by elempack with no remainder
// handling, so they are assumed to be multiples of elempack - confirm at callers.
// NOTE(review): elempack == 8 is only handled when
// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is defined and elempack == 4 only when
// __ARM_NEON is defined; in other builds nothing is written for those packings.
static void convolution_im2col_input_tile_conv1x1s1d1_bf16_fp16(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk)
{
    const int elempack = bottom_blob.elempack;

    // Running output cursor; every branch below advances it by exactly the
    // number of 16-bit values it wrote.
    unsigned short* pp = B;

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    // ---- groups of 12 offsets (aarch64 only) ----
    for (; jj + 11 < max_jj; jj += 12)
    {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        if (elempack == 8)
        {
            const unsigned short* p0 = (const unsigned short*)bottom_blob.channel(k / 8) + (j + jj) * 8;

            int kk = 0;
            for (; kk < max_kk / 8; kk++)
            {
                // transpose8x12
#if NCNN_GNU_INLINE_ASM
                // Each ld4 de-interleaves 4 offsets x 8 lanes with stride 4, so a
                // register holds lanes c and c+4 alternating across 4 offsets.
                // uzp1 (even elements) then yields lanes 0-3 and uzp2 (odd
                // elements) lanes 4-7; the operand pairing stitches the three
                // 4-offset chunks into rows of 12 contiguous values per lane.
                // p0 is advanced 128 bytes by the two post-indexed loads and
                // rewound by the sub, so it is unchanged when the asm finishes.
                // pp is advanced 192 bytes (96 values) by the post-indexed stores.
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]   \n"
                    "ld4    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]   \n"
                    "ld4    {v4.8h, v5.8h, v6.8h, v7.8h}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]   \n"
                    "ld4    {v16.8h, v17.8h, v18.8h, v19.8h}, [%0] \n"
                    "uzp1   v20.8h, v0.8h, v4.8h    \n"
                    "uzp2   v26.8h, v0.8h, v4.8h    \n"
                    "uzp1   v21.8h, v16.8h, v1.8h   \n"
                    "uzp2   v27.8h, v16.8h, v1.8h   \n"
                    "sub    %0, %0, #128            \n"
                    "uzp1   v22.8h, v5.8h, v17.8h   \n"
                    "uzp2   v28.8h, v5.8h, v17.8h   \n"
                    "uzp1   v23.8h, v2.8h, v6.8h    \n"
                    "uzp2   v29.8h, v2.8h, v6.8h    \n"
                    "uzp1   v24.8h, v18.8h, v3.8h   \n"
                    "uzp2   v30.8h, v18.8h, v3.8h   \n"
                    "uzp1   v25.8h, v7.8h, v19.8h   \n"
                    "uzp2   v31.8h, v7.8h, v19.8h   \n"
                    "st1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%1], #64 \n"
                    "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%1], #64 \n"
                    "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
                // Intrinsic fallback: load the 12 offsets (8 lanes each), let the
                // helper transpose them in place, store 12 vectors back to back.
                uint16x8_t _r0 = vld1q_u16(p0);
                uint16x8_t _r1 = vld1q_u16(p0 + 8);
                uint16x8_t _r2 = vld1q_u16(p0 + 8 * 2);
                uint16x8_t _r3 = vld1q_u16(p0 + 8 * 3);
                uint16x8_t _r4 = vld1q_u16(p0 + 8 * 4);
                uint16x8_t _r5 = vld1q_u16(p0 + 8 * 5);
                uint16x8_t _r6 = vld1q_u16(p0 + 8 * 6);
                uint16x8_t _r7 = vld1q_u16(p0 + 8 * 7);
                uint16x8_t _r8 = vld1q_u16(p0 + 8 * 8);
                uint16x8_t _r9 = vld1q_u16(p0 + 8 * 9);
                uint16x8_t _ra = vld1q_u16(p0 + 8 * 10);
                uint16x8_t _rb = vld1q_u16(p0 + 8 * 11);
                transpose8x12_u16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb);
                vst1q_u16(pp, _r0);
                vst1q_u16(pp + 8, _r1);
                vst1q_u16(pp + 8 * 2, _r2);
                vst1q_u16(pp + 8 * 3, _r3);
                vst1q_u16(pp + 8 * 4, _r4);
                vst1q_u16(pp + 8 * 5, _r5);
                vst1q_u16(pp + 8 * 6, _r6);
                vst1q_u16(pp + 8 * 7, _r7);
                vst1q_u16(pp + 8 * 8, _r8);
                vst1q_u16(pp + 8 * 9, _r9);
                vst1q_u16(pp + 8 * 10, _ra);
                vst1q_u16(pp + 8 * 11, _rb);
                pp += 96;
#endif // NCNN_GNU_INLINE_ASM
                // Step to the same offsets in the next packed channel.
                // NOTE(review): assumes cstep is the channel stride counted in
                // packed elements, hence the * 8 - confirm against Mat.
                p0 += bottom_blob.cstep * 8;
            }
        }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

        if (elempack == 4)
        {
            const unsigned short* p0 = (const unsigned short*)bottom_blob.channel(k / 4) + (j + jj) * 4;

            int kk = 0;
            for (; kk < max_kk / 4; kk++)
            {
                // transpose4x12
#if NCNN_GNU_INLINE_ASM
                // With 4 lanes per offset, ld4 de-interleaves completely:
                // v0..v3 hold lanes 0..3 of offsets 0-7 and v4..v7 (64-bit)
                // lanes 0..3 of offsets 8-11. Stores alternate 8 + 4 values to
                // emit 12 contiguous values per lane. p0 is rewound by the sub
                // after the single post-indexed load.
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #256]   \n"
                    "ld4    {v4.4h, v5.4h, v6.4h, v7.4h}, [%0]      \n"
                    "st1    {v0.8h}, [%1], #16          \n"
                    "st1    {v4.4h}, [%1], #8           \n"
                    "st1    {v1.8h}, [%1], #16          \n"
                    "st1    {v5.4h}, [%1], #8           \n"
                    "sub    %0, %0, #64                 \n"
                    "st1    {v2.8h}, [%1], #16          \n"
                    "st1    {v6.4h}, [%1], #8           \n"
                    "st1    {v3.8h}, [%1], #16          \n"
                    "st1    {v7.4h}, [%1], #8           \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else  // NCNN_GNU_INLINE_ASM
                // Three 4x4 de-interleaving loads (offsets 0-3, 4-7, 8-11); the
                // stores then walk lane by lane across the three chunks.
                uint16x4x4_t _r0 = vld4_u16(p0);
                uint16x4x4_t _r1 = vld4_u16(p0 + 16);
                uint16x4x4_t _r2 = vld4_u16(p0 + 32);
                vst1_u16(pp, _r0.val[0]);
                vst1_u16(pp + 4, _r1.val[0]);
                vst1_u16(pp + 4 * 2, _r2.val[0]);
                vst1_u16(pp + 4 * 3, _r0.val[1]);
                vst1_u16(pp + 4 * 4, _r1.val[1]);
                vst1_u16(pp + 4 * 5, _r2.val[1]);
                vst1_u16(pp + 4 * 6, _r0.val[2]);
                vst1_u16(pp + 4 * 7, _r1.val[2]);
                vst1_u16(pp + 4 * 8, _r2.val[2]);
                vst1_u16(pp + 4 * 9, _r0.val[3]);
                vst1_u16(pp + 4 * 10, _r1.val[3]);
                vst1_u16(pp + 4 * 11, _r2.val[3]);
                pp += 48;
#endif // NCNN_GNU_INLINE_ASM
                p0 += bottom_blob.cstep * 4;
            }
        }

        if (elempack == 1)
        {
            const unsigned short* p0 = (const unsigned short*)bottom_blob.channel(k) + (j + jj);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                // Unpacked input needs no transpose: copy 12 consecutive values
                // (8 + 4) and move on by cstep.
                uint16x8_t _r01 = vld1q_u16(p0);
                uint16x4_t _r2 = vld1_u16(p0 + 8);
                vst1q_u16(pp, _r01);
                vst1_u16(pp + 8, _r2);
                pp += 12;
                p0 += bottom_blob.cstep;
            }
        }
    }
#endif // __aarch64__
    // ---- groups of 8 offsets ----
    for (; jj + 7 < max_jj; jj += 8)
    {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        if (elempack == 8)
        {
            const unsigned short* p0 = (const unsigned short*)bottom_blob.channel(k / 8) + (j + jj) * 8;

            int kk = 0;
            for (; kk < max_kk / 8; kk++)
            {
                // transpose8x8
#if NCNN_GNU_INLINE_ASM
                // Same ld4 + uzp1/uzp2 scheme as the 8x12 case, with two
                // 4-offset chunks: uzp1 gives lanes 0-3 (v16-v19), uzp2 gives
                // lanes 4-7 (v20-v23). p0 is rewound by the sub.
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]   \n"
                    "ld4    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
                    "prfm   pldl1keep, [%0, #512]   \n"
                    "ld4    {v4.8h, v5.8h, v6.8h, v7.8h}, [%0] \n"
                    "uzp1   v16.8h, v0.8h, v4.8h    \n"
                    "uzp2   v20.8h, v0.8h, v4.8h    \n"
                    "uzp1   v17.8h, v1.8h, v5.8h    \n"
                    "uzp2   v21.8h, v1.8h, v5.8h    \n"
                    "sub    %0, %0, #64             \n"
                    "uzp1   v18.8h, v2.8h, v6.8h    \n"
                    "uzp2   v22.8h, v2.8h, v6.8h    \n"
                    "uzp1   v19.8h, v3.8h, v7.8h    \n"
                    "uzp2   v23.8h, v3.8h, v7.8h    \n"
                    "st1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%1], #64 \n"
                    "st1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else  // NCNN_GNU_INLINE_ASM
                uint16x8_t _r0 = vld1q_u16(p0);
                uint16x8_t _r1 = vld1q_u16(p0 + 8);
                uint16x8_t _r2 = vld1q_u16(p0 + 8 * 2);
                uint16x8_t _r3 = vld1q_u16(p0 + 8 * 3);
                uint16x8_t _r4 = vld1q_u16(p0 + 8 * 4);
                uint16x8_t _r5 = vld1q_u16(p0 + 8 * 5);
                uint16x8_t _r6 = vld1q_u16(p0 + 8 * 6);
                uint16x8_t _r7 = vld1q_u16(p0 + 8 * 7);
                transpose8x8_u16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7);
                vst1q_u16(pp, _r0);
                vst1q_u16(pp + 8, _r1);
                vst1q_u16(pp + 8 * 2, _r2);
                vst1q_u16(pp + 8 * 3, _r3);
                vst1q_u16(pp + 8 * 4, _r4);
                vst1q_u16(pp + 8 * 5, _r5);
                vst1q_u16(pp + 8 * 6, _r6);
                vst1q_u16(pp + 8 * 7, _r7);
                pp += 64;
#endif // NCNN_GNU_INLINE_ASM
                p0 += bottom_blob.cstep * 8;
            }
        }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

        if (elempack == 4)
        {
            const unsigned short* p0 = (const unsigned short*)bottom_blob.channel(k / 4) + (j + jj) * 4;

            int kk = 0;
            for (; kk < max_kk / 4; kk++)
            {
                // transpose4x8
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                // A single ld4 de-interleaves 8 offsets x 4 lanes into one
                // register per lane; no pointer rewind is needed because the
                // load is not post-indexed.
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0] \n"
                    "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3");
#else  // __aarch64__
                // ARMv7: the first vld4 fills the low halves (d0,d2,d4,d6) with
                // offsets 0-3, the second fills the high halves (d1,d3,d5,d7)
                // with offsets 4-7, so d0-d7 end up as one 8-value row per lane.
                // The sub undoes the write-back of the first load.
                // NOTE(review): q8-q11 are listed as clobbers although only
                // d0-d7 (q0-q3) are touched; harmless but broader than needed.
                asm volatile(
                    "pld        [%0, #256]          \n"
                    "vld4.u16   {d0,d2,d4,d6}, [%0 :64]! \n"
                    "pld        [%0, #256]          \n"
                    "vld4.u16   {d1,d3,d5,d7}, [%0 :64] \n"
                    "sub        %0, %0, #32         \n"
                    "vstm       %1!, {d0-d7}        \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                uint16x8x4_t _r0 = vld4q_u16(p0);
                vst1q_u16(pp, _r0.val[0]);
                vst1q_u16(pp + 8, _r0.val[1]);
                vst1q_u16(pp + 16, _r0.val[2]);
                vst1q_u16(pp + 24, _r0.val[3]);
                pp += 32;
#endif // NCNN_GNU_INLINE_ASM
                p0 += bottom_blob.cstep * 4;
            }
        }

        if (elempack == 1)
        {
            const unsigned short* p0 = (const unsigned short*)bottom_blob.channel(k) + (j + jj);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                // Straight copy of 8 consecutive values per channel.
                uint16x8_t _r0 = vld1q_u16(p0);
                vst1q_u16(pp, _r0);
                pp += 8;
                p0 += bottom_blob.cstep;
            }
        }
    }
    // ---- groups of 4 offsets ----
    for (; jj + 3 < max_jj; jj += 4)
    {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        if (elempack == 8)
        {
            const unsigned short* p0 = (const unsigned short*)bottom_blob.channel(k / 8) + (j + jj) * 8;

            int kk = 0;
            for (; kk < max_kk / 8; kk++)
            {
                // transpose8x4
#if NCNN_GNU_INLINE_ASM
                // Load the 4 offsets as 4 plain vectors, then let the
                // interleaving st4 store emit them lane by lane.
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0] \n"
                    "st4    {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3");
#else  // NCNN_GNU_INLINE_ASM
                uint16x8x4_t _r0;
                _r0.val[0] = vld1q_u16(p0);
                _r0.val[1] = vld1q_u16(p0 + 8);
                _r0.val[2] = vld1q_u16(p0 + 8 * 2);
                _r0.val[3] = vld1q_u16(p0 + 8 * 3);
                vst4q_u16(pp, _r0);
                pp += 32;
#endif // NCNN_GNU_INLINE_ASM
                p0 += bottom_blob.cstep * 8;
            }
        }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

        if (elempack == 4)
        {
            const unsigned short* p0 = (const unsigned short*)bottom_blob.channel(k / 4) + (j + jj) * 4;

            int kk = 0;
            for (; kk < max_kk / 4; kk++)
            {
                // transpose4x4
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                // Plain load of 4 offsets followed by an interleaving store
                // performs the 4x4 transpose.
                asm volatile(
                    "prfm   pldl1keep, [%0, #256]       \n"
                    "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%0] \n"
                    "st4    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "v0", "v1", "v2", "v3");
#else  // __aarch64__
                // ARMv7 equivalent using d0-d3 (= q0, q1).
                asm volatile(
                    "pld        [%0, #256]          \n"
                    "vld1.u16   {d0-d3}, [%0 :64]   \n"
                    "vst4.u16   {d0-d3}, [%1 :64]!  \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp)
                    : "memory", "q0", "q1");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                uint16x4x4_t _r0;
                _r0.val[0] = vld1_u16(p0);
                _r0.val[1] = vld1_u16(p0 + 4);
                _r0.val[2] = vld1_u16(p0 + 4 * 2);
                _r0.val[3] = vld1_u16(p0 + 4 * 3);
                vst4_u16(pp, _r0);
                pp += 16;
#endif // NCNN_GNU_INLINE_ASM
                p0 += bottom_blob.cstep * 4;
            }
        }

        if (elempack == 1)
        {
            const unsigned short* p0 = (const unsigned short*)bottom_blob.channel(k) + (j + jj);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                // Straight copy of 4 consecutive values per channel.
                uint16x4_t _r0 = vld1_u16(p0);
                vst1_u16(pp, _r0);
                pp += 4;
                p0 += bottom_blob.cstep;
            }
        }
    }
#endif // __ARM_NEON
    // ---- groups of 2 offsets (also the widest group in non-NEON builds) ----
    for (; jj + 1 < max_jj; jj += 2)
    {
#if __ARM_NEON
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        if (elempack == 8)
        {
            const unsigned short* p0 = (const unsigned short*)bottom_blob.channel(k / 8) + (j + jj) * 8;

            int kk = 0;
            for (; kk < max_kk / 8; kk++)
            {
                // transpose8x2
                // The interleaving vst2q store pairs up the two offsets lane by lane.
                uint16x8x2_t _r0;
                _r0.val[0] = vld1q_u16(p0);
                _r0.val[1] = vld1q_u16(p0 + 8);
                vst2q_u16(pp, _r0);
                pp += 16;
                p0 += bottom_blob.cstep * 8;
            }
        }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

        if (elempack == 4)
        {
            const unsigned short* p0 = (const unsigned short*)bottom_blob.channel(k / 4) + (j + jj) * 4;

            int kk = 0;
            for (; kk < max_kk / 4; kk++)
            {
                // transpose4x2
                uint16x4x2_t _r0;
                _r0.val[0] = vld1_u16(p0);
                _r0.val[1] = vld1_u16(p0 + 4);
                vst2_u16(pp, _r0);
                pp += 8;
                p0 += bottom_blob.cstep * 4;
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            const unsigned short* p0 = (const unsigned short*)bottom_blob.channel(k) + (j + jj);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                // Scalar copy of the two offsets for this channel.
                pp[0] = p0[0];
                pp[1] = p0[1];
                pp += 2;
                p0 += bottom_blob.cstep;
            }
        }
    }
    // ---- remaining single offsets ----
    for (; jj < max_jj; jj++)
    {
#if __ARM_NEON
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        if (elempack == 8)
        {
            const unsigned short* p0 = (const unsigned short*)bottom_blob.channel(k / 8) + (j + jj) * 8;

            int kk = 0;
            for (; kk < max_kk / 8; kk++)
            {
                // One offset: its 8 lanes are already in output order.
                vst1q_u16(pp, vld1q_u16(p0));
                pp += 8;
                p0 += bottom_blob.cstep * 8;
            }
        }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

        if (elempack == 4)
        {
            const unsigned short* p0 = (const unsigned short*)bottom_blob.channel(k / 4) + (j + jj) * 4;

            int kk = 0;
            for (; kk < max_kk / 4; kk++)
            {
                // One offset: its 4 lanes are already in output order.
                vst1_u16(pp, vld1_u16(p0));
                pp += 4;
                p0 += bottom_blob.cstep * 4;
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            const unsigned short* p0 = (const unsigned short*)bottom_blob.channel(k) + (j + jj);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp += 1;
                p0 += bottom_blob.cstep;
            }
        }
    }
}

static void convolution_im2col_input_tile_bf16_fp16(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h)
{
    if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
    {
        convolution_im2col_input_tile_conv1x1s1d1_bf16_fp16(bottom_blob, B, j, max_jj, k, max_kk);
        return;
    }

    const int w = bottom_blob.w;
    // const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int outw = (w - kernel_extent_w) / stride_w + 1;

    // j max_jj     outw*outh    split w and h

    // k max_kk     pa*maxk*(inch/pa)    split inch

    // k/max_kk shall be multiple of maxk

    const int maxk = kernel_w * kernel_h;

    unsigned short* pp = B;

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    for (; jj + 11 < max_jj; jj += 12)
    {
        int dy0 = (j + jj) / outw;
        int dy1 = (j + jj + 1) / outw;
        int dy2 = (j + jj + 2) / outw;
        int dy3 = (j + jj + 3) / outw;
        int dy4 = (j + jj + 4) / outw;
        int dy5 = (j + jj + 5) / outw;
        int dy6 = (j + jj + 6) / outw;
        int dy7 = (j + jj + 7) / outw;
        int dy8 = (j + jj + 8) / outw;
        int dy9 = (j + jj + 9) / outw;
        int dya = (j + jj + 10) / outw;
        int dyb = (j + jj + 11) / outw;
        int dx0 = (j + jj) % outw;
        int dx1 = (j + jj + 1) % outw;
        int dx2 = (j + jj + 2) % outw;
        int dx3 = (j + jj + 3) % outw;
        int dx4 = (j + jj + 4) % outw;
        int dx5 = (j + jj + 5) % outw;
        int dx6 = (j + jj + 6) % outw;
        int dx7 = (j + jj + 7) % outw;
        int dx8 = (j + jj + 8) % outw;
        int dx9 = (j + jj + 9) % outw;
        int dxa = (j + jj + 10) % outw;
        int dxb = (j + jj + 11) % outw;

        int kk = 0;
        for (; kk < max_kk / elempack; kk++)
        {
            int p = (k / elempack + kk) / maxk;
            int uv = (k / elempack + kk) % maxk;
            int u = uv / kernel_w;
            int v = uv % kernel_w;

            const Mat img = bottom_blob.channel(p);

            int x0 = stride_w * dx0 + dilation_w * v;
            int x1 = stride_w * dx1 + dilation_w * v;
            int x2 = stride_w * dx2 + dilation_w * v;
            int x3 = stride_w * dx3 + dilation_w * v;
            int x4 = stride_w * dx4 + dilation_w * v;
            int x5 = stride_w * dx5 + dilation_w * v;
            int x6 = stride_w * dx6 + dilation_w * v;
            int x7 = stride_w * dx7 + dilation_w * v;
            int x8 = stride_w * dx8 + dilation_w * v;
            int x9 = stride_w * dx9 + dilation_w * v;
            int xa = stride_w * dxa + dilation_w * v;
            int xb = stride_w * dxb + dilation_w * v;

            int y0 = stride_h * dy0 + dilation_h * u;
            int y1 = stride_h * dy1 + dilation_h * u;
            int y2 = stride_h * dy2 + dilation_h * u;
            int y3 = stride_h * dy3 + dilation_h * u;
            int y4 = stride_h * dy4 + dilation_h * u;
            int y5 = stride_h * dy5 + dilation_h * u;
            int y6 = stride_h * dy6 + dilation_h * u;
            int y7 = stride_h * dy7 + dilation_h * u;
            int y8 = stride_h * dy8 + dilation_h * u;
            int y9 = stride_h * dy9 + dilation_h * u;
            int ya = stride_h * dya + dilation_h * u;
            int yb = stride_h * dyb + dilation_h * u;

            const unsigned short* sptr0 = img.row<const unsigned short>(y0) + x0 * elempack;
            const unsigned short* sptr1 = img.row<const unsigned short>(y1) + x1 * elempack;
            const unsigned short* sptr2 = img.row<const unsigned short>(y2) + x2 * elempack;
            const unsigned short* sptr3 = img.row<const unsigned short>(y3) + x3 * elempack;
            const unsigned short* sptr4 = img.row<const unsigned short>(y4) + x4 * elempack;
            const unsigned short* sptr5 = img.row<const unsigned short>(y5) + x5 * elempack;
            const unsigned short* sptr6 = img.row<const unsigned short>(y6) + x6 * elempack;
            const unsigned short* sptr7 = img.row<const unsigned short>(y7) + x7 * elempack;
            const unsigned short* sptr8 = img.row<const unsigned short>(y8) + x8 * elempack;
            const unsigned short* sptr9 = img.row<const unsigned short>(y9) + x9 * elempack;
            const unsigned short* sptra = img.row<const unsigned short>(ya) + xa * elempack;
            const unsigned short* sptrb = img.row<const unsigned short>(yb) + xb * elempack;

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            if (elempack == 8)
            {
                uint16x8_t _r0 = vld1q_u16(sptr0);
                uint16x8_t _r1 = vld1q_u16(sptr1);
                uint16x8_t _r2 = vld1q_u16(sptr2);
                uint16x8_t _r3 = vld1q_u16(sptr3);
                uint16x8_t _r4 = vld1q_u16(sptr4);
                uint16x8_t _r5 = vld1q_u16(sptr5);
                uint16x8_t _r6 = vld1q_u16(sptr6);
                uint16x8_t _r7 = vld1q_u16(sptr7);
                uint16x8_t _r8 = vld1q_u16(sptr8);
                uint16x8_t _r9 = vld1q_u16(sptr9);
                uint16x8_t _ra = vld1q_u16(sptra);
                uint16x8_t _rb = vld1q_u16(sptrb);
                transpose8x12_u16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb);
                vst1q_u16(pp, _r0);
                vst1q_u16(pp + 8, _r1);
                vst1q_u16(pp + 8 * 2, _r2);
                vst1q_u16(pp + 8 * 3, _r3);
                vst1q_u16(pp + 8 * 4, _r4);
                vst1q_u16(pp + 8 * 5, _r5);
                vst1q_u16(pp + 8 * 6, _r6);
                vst1q_u16(pp + 8 * 7, _r7);
                vst1q_u16(pp + 8 * 8, _r8);
                vst1q_u16(pp + 8 * 9, _r9);
                vst1q_u16(pp + 8 * 10, _ra);
                vst1q_u16(pp + 8 * 11, _rb);
                pp += 96;
            }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            if (elempack == 4)
            {
                uint16x4_t _r0 = vld1_u16(sptr0);
                uint16x4_t _r1 = vld1_u16(sptr1);
                uint16x4_t _r2 = vld1_u16(sptr2);
                uint16x4_t _r3 = vld1_u16(sptr3);
                uint16x4_t _r4 = vld1_u16(sptr4);
                uint16x4_t _r5 = vld1_u16(sptr5);
                uint16x4_t _r6 = vld1_u16(sptr6);
                uint16x4_t _r7 = vld1_u16(sptr7);
                uint16x4_t _r8 = vld1_u16(sptr8);
                uint16x4_t _r9 = vld1_u16(sptr9);
                uint16x4_t _ra = vld1_u16(sptra);
                uint16x4_t _rb = vld1_u16(sptrb);
                transpose4x12_u16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb);
                vst1_u16(pp, _r0);
                vst1_u16(pp + 4, _r1);
                vst1_u16(pp + 4 * 2, _r2);
                vst1_u16(pp + 4 * 3, _r3);
                vst1_u16(pp + 4 * 4, _r4);
                vst1_u16(pp + 4 * 5, _r5);
                vst1_u16(pp + 4 * 6, _r6);
                vst1_u16(pp + 4 * 7, _r7);
                vst1_u16(pp + 4 * 8, _r8);
                vst1_u16(pp + 4 * 9, _r9);
                vst1_u16(pp + 4 * 10, _ra);
                vst1_u16(pp + 4 * 11, _rb);
                pp += 48;
            }
            if (elempack == 1)
            {
                pp[0] = sptr0[0];
                pp[1] = sptr1[0];
                pp[2] = sptr2[0];
                pp[3] = sptr3[0];
                pp[4] = sptr4[0];
                pp[5] = sptr5[0];
                pp[6] = sptr6[0];
                pp[7] = sptr7[0];
                pp[8] = sptr8[0];
                pp[9] = sptr9[0];
                pp[10] = sptra[0];
                pp[11] = sptrb[0];
                pp += 12;
            }
        }
    }
#endif // __aarch64__
    for (; jj + 7 < max_jj; jj += 8)
    {
        int dy0 = (j + jj) / outw;
        int dy1 = (j + jj + 1) / outw;
        int dy2 = (j + jj + 2) / outw;
        int dy3 = (j + jj + 3) / outw;
        int dy4 = (j + jj + 4) / outw;
        int dy5 = (j + jj + 5) / outw;
        int dy6 = (j + jj + 6) / outw;
        int dy7 = (j + jj + 7) / outw;
        int dx0 = (j + jj) % outw;
        int dx1 = (j + jj + 1) % outw;
        int dx2 = (j + jj + 2) % outw;
        int dx3 = (j + jj + 3) % outw;
        int dx4 = (j + jj + 4) % outw;
        int dx5 = (j + jj + 5) % outw;
        int dx6 = (j + jj + 6) % outw;
        int dx7 = (j + jj + 7) % outw;

        int kk = 0;
        for (; kk < max_kk / elempack; kk++)
        {
            int p = (k / elempack + kk) / maxk;
            int uv = (k / elempack + kk) % maxk;
            int u = uv / kernel_w;
            int v = uv % kernel_w;

            const Mat img = bottom_blob.channel(p);

            int x0 = stride_w * dx0 + dilation_w * v;
            int x1 = stride_w * dx1 + dilation_w * v;
            int x2 = stride_w * dx2 + dilation_w * v;
            int x3 = stride_w * dx3 + dilation_w * v;
            int x4 = stride_w * dx4 + dilation_w * v;
            int x5 = stride_w * dx5 + dilation_w * v;
            int x6 = stride_w * dx6 + dilation_w * v;
            int x7 = stride_w * dx7 + dilation_w * v;
            int y0 = stride_h * dy0 + dilation_h * u;
            int y1 = stride_h * dy1 + dilation_h * u;
            int y2 = stride_h * dy2 + dilation_h * u;
            int y3 = stride_h * dy3 + dilation_h * u;
            int y4 = stride_h * dy4 + dilation_h * u;
            int y5 = stride_h * dy5 + dilation_h * u;
            int y6 = stride_h * dy6 + dilation_h * u;
            int y7 = stride_h * dy7 + dilation_h * u;

            const unsigned short* sptr0 = img.row<const unsigned short>(y0) + x0 * elempack;
            const unsigned short* sptr1 = img.row<const unsigned short>(y1) + x1 * elempack;
            const unsigned short* sptr2 = img.row<const unsigned short>(y2) + x2 * elempack;
            const unsigned short* sptr3 = img.row<const unsigned short>(y3) + x3 * elempack;
            const unsigned short* sptr4 = img.row<const unsigned short>(y4) + x4 * elempack;
            const unsigned short* sptr5 = img.row<const unsigned short>(y5) + x5 * elempack;
            const unsigned short* sptr6 = img.row<const unsigned short>(y6) + x6 * elempack;
            const unsigned short* sptr7 = img.row<const unsigned short>(y7) + x7 * elempack;

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            if (elempack == 8)
            {
                uint16x8_t _r0 = vld1q_u16(sptr0);
                uint16x8_t _r1 = vld1q_u16(sptr1);
                uint16x8_t _r2 = vld1q_u16(sptr2);
                uint16x8_t _r3 = vld1q_u16(sptr3);
                uint16x8_t _r4 = vld1q_u16(sptr4);
                uint16x8_t _r5 = vld1q_u16(sptr5);
                uint16x8_t _r6 = vld1q_u16(sptr6);
                uint16x8_t _r7 = vld1q_u16(sptr7);
                transpose8x8_u16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7);
                vst1q_u16(pp, _r0);
                vst1q_u16(pp + 8, _r1);
                vst1q_u16(pp + 8 * 2, _r2);
                vst1q_u16(pp + 8 * 3, _r3);
                vst1q_u16(pp + 8 * 4, _r4);
                vst1q_u16(pp + 8 * 5, _r5);
                vst1q_u16(pp + 8 * 6, _r6);
                vst1q_u16(pp + 8 * 7, _r7);
                pp += 64;
            }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            if (elempack == 4)
            {
                uint16x4_t _r0 = vld1_u16(sptr0);
                uint16x4_t _r1 = vld1_u16(sptr1);
                uint16x4_t _r2 = vld1_u16(sptr2);
                uint16x4_t _r3 = vld1_u16(sptr3);
                uint16x4_t _r4 = vld1_u16(sptr4);
                uint16x4_t _r5 = vld1_u16(sptr5);
                uint16x4_t _r6 = vld1_u16(sptr6);
                uint16x4_t _r7 = vld1_u16(sptr7);
                transpose4x8_u16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7);
                vst1_u16(pp, _r0);
                vst1_u16(pp + 4, _r1);
                vst1_u16(pp + 4 * 2, _r2);
                vst1_u16(pp + 4 * 3, _r3);
                vst1_u16(pp + 4 * 4, _r4);
                vst1_u16(pp + 4 * 5, _r5);
                vst1_u16(pp + 4 * 6, _r6);
                vst1_u16(pp + 4 * 7, _r7);
                pp += 32;
            }
            if (elempack == 1)
            {
                pp[0] = sptr0[0];
                pp[1] = sptr1[0];
                pp[2] = sptr2[0];
                pp[3] = sptr3[0];
                pp[4] = sptr4[0];
                pp[5] = sptr5[0];
                pp[6] = sptr6[0];
                pp[7] = sptr7[0];
                pp += 8;
            }
        }
    }
    for (; jj + 3 < max_jj; jj += 4)
    {
        int dy0 = (j + jj) / outw;
        int dy1 = (j + jj + 1) / outw;
        int dy2 = (j + jj + 2) / outw;
        int dy3 = (j + jj + 3) / outw;
        int dx0 = (j + jj) % outw;
        int dx1 = (j + jj + 1) % outw;
        int dx2 = (j + jj + 2) % outw;
        int dx3 = (j + jj + 3) % outw;

        int kk = 0;
        for (; kk < max_kk / elempack; kk++)
        {
            int p = (k / elempack + kk) / maxk;
            int uv = (k / elempack + kk) % maxk;
            int u = uv / kernel_w;
            int v = uv % kernel_w;

            const Mat img = bottom_blob.channel(p);

            int x0 = stride_w * dx0 + dilation_w * v;
            int x1 = stride_w * dx1 + dilation_w * v;
            int x2 = stride_w * dx2 + dilation_w * v;
            int x3 = stride_w * dx3 + dilation_w * v;
            int y0 = stride_h * dy0 + dilation_h * u;
            int y1 = stride_h * dy1 + dilation_h * u;
            int y2 = stride_h * dy2 + dilation_h * u;
            int y3 = stride_h * dy3 + dilation_h * u;

            const unsigned short* sptr0 = img.row<const unsigned short>(y0) + x0 * elempack;
            const unsigned short* sptr1 = img.row<const unsigned short>(y1) + x1 * elempack;
            const unsigned short* sptr2 = img.row<const unsigned short>(y2) + x2 * elempack;
            const unsigned short* sptr3 = img.row<const unsigned short>(y3) + x3 * elempack;

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            if (elempack == 8)
            {
                uint16x8x4_t _r0;
                _r0.val[0] = vld1q_u16(sptr0);
                _r0.val[1] = vld1q_u16(sptr1);
                _r0.val[2] = vld1q_u16(sptr2);
                _r0.val[3] = vld1q_u16(sptr3);
                vst4q_u16(pp, _r0);
                pp += 32;
            }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            if (elempack == 4)
            {
                uint16x4x4_t _r0;
                _r0.val[0] = vld1_u16(sptr0);
                _r0.val[1] = vld1_u16(sptr1);
                _r0.val[2] = vld1_u16(sptr2);
                _r0.val[3] = vld1_u16(sptr3);
                vst4_u16(pp, _r0);
                pp += 16;
            }
            if (elempack == 1)
            {
                pp[0] = sptr0[0];
                pp[1] = sptr1[0];
                pp[2] = sptr2[0];
                pp[3] = sptr3[0];
                pp += 4;
            }
        }
    }
#endif // __ARM_NEON
    for (; jj + 1 < max_jj; jj += 2)
    {
        int dy0 = (j + jj) / outw;
        int dy1 = (j + jj + 1) / outw;
        int dx0 = (j + jj) % outw;
        int dx1 = (j + jj + 1) % outw;

        int kk = 0;
        for (; kk < max_kk / elempack; kk++)
        {
            int p = (k / elempack + kk) / maxk;
            int uv = (k / elempack + kk) % maxk;
            int u = uv / kernel_w;
            int v = uv % kernel_w;

            const Mat img = bottom_blob.channel(p);

            int x0 = stride_w * dx0 + dilation_w * v;
            int x1 = stride_w * dx1 + dilation_w * v;
            int y0 = stride_h * dy0 + dilation_h * u;
            int y1 = stride_h * dy1 + dilation_h * u;

            const unsigned short* sptr0 = img.row<const unsigned short>(y0) + x0 * elempack;
            const unsigned short* sptr1 = img.row<const unsigned short>(y1) + x1 * elempack;

#if __ARM_NEON
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            if (elempack == 8)
            {
                pp[0] = sptr0[0];
                pp[1] = sptr1[0];
                pp[2] = sptr0[1];
                pp[3] = sptr1[1];
                pp[4] = sptr0[2];
                pp[5] = sptr1[2];
                pp[6] = sptr0[3];
                pp[7] = sptr1[3];
                pp[8 + 0] = sptr0[4];
                pp[8 + 1] = sptr1[4];
                pp[8 + 2] = sptr0[5];
                pp[8 + 3] = sptr1[5];
                pp[8 + 4] = sptr0[6];
                pp[8 + 5] = sptr1[6];
                pp[8 + 6] = sptr0[7];
                pp[8 + 7] = sptr1[7];
                pp += 16;
            }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            if (elempack == 4)
            {
                pp[0] = sptr0[0];
                pp[1] = sptr1[0];
                pp[2] = sptr0[1];
                pp[3] = sptr1[1];
                pp[4] = sptr0[2];
                pp[5] = sptr1[2];
                pp[6] = sptr0[3];
                pp[7] = sptr1[3];
                pp += 8;
            }
#endif // __ARM_NEON
            if (elempack == 1)
            {
                pp[0] = sptr0[0];
                pp[1] = sptr1[0];
                pp += 2;
            }
        }
    }
    for (; jj < max_jj; jj++)
    {
        int dy = (j + jj) / outw;
        int dx = (j + jj) % outw;

        int kk = 0;
        for (; kk < max_kk / elempack; kk++)
        {
            int p = (k / elempack + kk) / maxk;
            int uv = (k / elempack + kk) % maxk;
            int u = uv / kernel_w;
            int v = uv % kernel_w;

            const Mat img = bottom_blob.channel(p);

            int x = stride_w * dx + dilation_w * v;
            int y = stride_h * dy + dilation_h * u;

            const unsigned short* sptr = img.row<const unsigned short>(y) + x * elempack;

#if __ARM_NEON
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            if (elempack == 8)
            {
                pp[0] = sptr[0];
                pp[1] = sptr[1];
                pp[2] = sptr[2];
                pp[3] = sptr[3];
                pp[4] = sptr[4];
                pp[5] = sptr[5];
                pp[6] = sptr[6];
                pp[7] = sptr[7];
                pp += 8;
            }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            if (elempack == 4)
            {
                pp[0] = sptr[0];
                pp[1] = sptr[1];
                pp[2] = sptr[2];
                pp[3] = sptr[3];
                pp += 4;
            }
#endif // __ARM_NEON
            if (elempack == 1)
            {
                pp[0] = sptr[0];
                pp += 1;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_im2col_gemm_fp16s.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convolution_gemm_transB_packed_tile_fp16sa(const Mat& AT_tile, const Mat& BT_tile, const Mat& CT_tile, Mat& topT_tile, Mat& top_blob, int i, int max_ii, int j, int max_jj, int k, int max_kk, bool k_end, int use_a53_a55_optimized_kernel)
{
    // NCNN_LOGE("convolution_gemm_transB_packed_tile_fp16sa %d %d %d %d %d %d", i, max_ii, j, max_jj, k, max_kk);

    const int out_elempack = top_blob.elempack;
    const size_t out_hstep = top_blob.cstep;

    const __fp16* pAT = AT_tile;
    const __fp16* pBT = BT_tile;
    const __fp16* pC = CT_tile;

    __fp16* outptr = topT_tile;

    int ii = 0;
    for (; ii + 7 < max_ii; ii += 8)
    {
        __fp16* outptr0 = (__fp16*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        const __fp16* pB = pBT;

        if (pC)
        {
            pC = (const __fp16*)CT_tile + i + ii;
        }

        int jj = 0;
        for (; jj + 11 < max_jj; jj += 12)
        {
            const __fp16* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            if (use_a53_a55_optimized_kernel)
            {
                asm volatile(
                    "cbz    %w10, 0f                    \n"

                    "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%0], #64 \n"
                    "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                    "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0]      \n"
                    "subs   %0, %0, #128                \n"
                    "b      3f                          \n"

                    "0:                                 \n"
                    // if pC
                    "cbz    %8, 1f                      \n"

                    "ld1    {v20.8h}, [%8]              \n"
                    "b      2f                          \n"

                    // else
                    "1:                                 \n"
                    "eor    v20.16b, v20.16b, v20.16b   \n"

                    "2:                                 \n"
                    "mov    v21.16b, v20.16b            \n"
                    "mov    v22.16b, v20.16b            \n"
                    "mov    v23.16b, v20.16b            \n"
                    "mov    v24.16b, v20.16b            \n"
                    "mov    v25.16b, v20.16b            \n"
                    "mov    v26.16b, v20.16b            \n"
                    "mov    v27.16b, v20.16b            \n"
                    "mov    v28.16b, v20.16b            \n"
                    "mov    v29.16b, v20.16b            \n"
                    "mov    v30.16b, v20.16b            \n"
                    "mov    v31.16b, v20.16b            \n"

                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.8h}, [%1], #16          \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.8h}, [%2], #16          \n"

                    "ldr    d1, [%2], #8                \n"
                    "ldr    x21, [%2], #8               \n"

                    ".align 4                           \n"
                    "4:                                 \n"
                    "ldr    d5, [%1], #8                \n"
                    "fmla   v20.8h, v4.8h, v0.h[0]      \n"
                    "ldr    x25, [%1], #8               \n"
                    "fmla   v21.8h, v4.8h, v0.h[1]      \n"
                    "ldr    d2, [%2], #8                \n"
                    "fmla   v22.8h, v4.8h, v0.h[2]      \n"
                    "ldr    x22, [%2], #8               \n"
                    "fmla   v23.8h, v4.8h, v0.h[3]      \n"
                    "ldr    d6, [%1], #8                \n"
                    "fmla   v24.8h, v4.8h, v0.h[4]      \n"
                    "ldr    x26, [%1], #8               \n"
                    "fmla   v25.8h, v4.8h, v0.h[5]      \n"
                    "ins    v1.d[1], x21                \n"
                    "fmla   v26.8h, v4.8h, v0.h[6]      \n"
                    "ldr    d3, [%2], #8                \n"
                    "fmla   v27.8h, v4.8h, v0.h[7]      \n"
                    "ldr    x23, [%2], #8               \n"
                    "fmla   v28.8h, v4.8h, v1.h[0]      \n"
                    "prfm   pldl1keep, [%2, #256]       \n" // NOTE PRELOAD
                    "fmla   v29.8h, v4.8h, v1.h[1]      \n"
                    "ins    v5.d[1], x25                \n"
                    "fmla   v30.8h, v4.8h, v1.h[2]      \n"
                    "ldr    d8, [%2], #8                \n"
                    "fmla   v31.8h, v4.8h, v1.h[3]      \n"
                    "ldr    x20, [%2], #8               \n"
                    "fmla   v20.8h, v5.8h, v1.h[4]      \n"
                    "ldr    d7, [%1], #8                \n"
                    "fmla   v21.8h, v5.8h, v1.h[5]      \n"
                    "ins    v2.d[1], x22                \n"
                    "fmla   v22.8h, v5.8h, v1.h[6]      \n"
                    "ldr    x27, [%1], #8               \n"
                    "fmla   v23.8h, v5.8h, v1.h[7]      \n"
                    "ldr    d9, [%2], #8                \n"
                    "fmla   v24.8h, v5.8h, v2.h[0]      \n"
                    "ldr    x21, [%2], #8               \n"
                    "fmla   v25.8h, v5.8h, v2.h[1]      \n"
                    "ins    v6.d[1], x26                \n"
                    "fmla   v26.8h, v5.8h, v2.h[2]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                    "fmla   v27.8h, v5.8h, v2.h[3]      \n"
                    "ldr    d4, [%1], #8                \n"
                    "fmla   v28.8h, v5.8h, v2.h[4]      \n"
                    "ldr    x24, [%1], #8               \n"
                    "fmla   v29.8h, v5.8h, v2.h[5]      \n"
                    "ins    v3.d[1], x23                \n"
                    "fmla   v30.8h, v5.8h, v2.h[6]      \n"
                    "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                    "fmla   v31.8h, v5.8h, v2.h[7]      \n"
                    "ldr    d0, [%2], #8                \n"
                    "fmla   v20.8h, v6.8h, v3.h[0]      \n"
                    "fmla   v21.8h, v6.8h, v3.h[1]      \n"
                    "fmla   v22.8h, v6.8h, v3.h[2]      \n"
                    "fmla   v23.8h, v6.8h, v3.h[3]      \n"
                    "fmla   v24.8h, v6.8h, v3.h[4]      \n"
                    "fmla   v25.8h, v6.8h, v3.h[5]      \n"
                    "ins    v8.d[1], x20                \n"
                    "fmla   v26.8h, v6.8h, v3.h[6]      \n"
                    "ldr    x20, [%2], #8               \n"
                    "fmla   v27.8h, v6.8h, v3.h[7]      \n"
                    "ldr    d1, [%2], #8                \n"
                    "fmla   v28.8h, v6.8h, v8.h[0]      \n"
                    "fmla   v29.8h, v6.8h, v8.h[1]      \n"
                    "ins    v7.d[1], x27                \n"
                    "fmla   v30.8h, v6.8h, v8.h[2]      \n"
                    "fmla   v31.8h, v6.8h, v8.h[3]      \n"
                    "fmla   v20.8h, v7.8h, v8.h[4]      \n"
                    "fmla   v21.8h, v7.8h, v8.h[5]      \n"
                    "ins    v9.d[1], x21                \n"
                    "fmla   v22.8h, v7.8h, v8.h[6]      \n"
                    "fmla   v23.8h, v7.8h, v8.h[7]      \n"
                    "ldr    x21, [%2], #8               \n"
                    "fmla   v24.8h, v7.8h, v9.h[0]      \n"
                    "fmla   v25.8h, v7.8h, v9.h[1]      \n"
                    "ins    v4.d[1], x24                \n"
                    "fmla   v26.8h, v7.8h, v9.h[2]      \n"
                    "fmla   v27.8h, v7.8h, v9.h[3]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v28.8h, v7.8h, v9.h[4]      \n"
                    "fmla   v29.8h, v7.8h, v9.h[5]      \n"
                    "fmla   v30.8h, v7.8h, v9.h[6]      \n"
                    "ins    v0.d[1], x20                \n"
                    "fmla   v31.8h, v7.8h, v9.h[7]      \n"
                    "bne    4b                          \n"

                    "sub    %1, %1, #16                 \n"
                    "sub    %2, %2, #32                 \n"

                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    "6:                                 \n"
                    "ld1    {v0.4h, v1.4h, v2.4h}, [%2], #24 \n"
                    "ld1    {v4.8h}, [%1], #16          \n"
                    "fmla   v20.8h, v4.8h, v0.h[0]      \n"
                    "fmla   v21.8h, v4.8h, v0.h[1]      \n"
                    "fmla   v22.8h, v4.8h, v0.h[2]      \n"
                    "fmla   v23.8h, v4.8h, v0.h[3]      \n"
                    "fmla   v24.8h, v4.8h, v1.h[0]      \n"
                    "fmla   v25.8h, v4.8h, v1.h[1]      \n"
                    "fmla   v26.8h, v4.8h, v1.h[2]      \n"
                    "fmla   v27.8h, v4.8h, v1.h[3]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v28.8h, v4.8h, v2.h[0]      \n"
                    "fmla   v29.8h, v4.8h, v2.h[1]      \n"
                    "fmla   v30.8h, v4.8h, v2.h[2]      \n"
                    "fmla   v31.8h, v4.8h, v2.h[3]      \n"
                    "bne    6b                          \n"

                    "7:                                 \n"
                    "tst    %w11, #255                  \n"
                    "beq    11f                         \n"

                    // if out_elempack == 8
                    "cmp    %w12, #8                    \n"
                    "bne    8f                          \n"

                    "st1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%3], #64 \n"
                    "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%3], #64 \n"
                    "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%3], #64 \n"
                    "b      10f                         \n"

                    // if out_elempack == 4
                    "8:                                 \n"
                    "cmp    %w12, #4                    \n"
                    "bne    9f                          \n"

                    "zip1   v12.2d, v20.2d, v21.2d      \n"
                    "zip2   v18.2d, v20.2d, v21.2d      \n"
                    "zip1   v13.2d, v22.2d, v23.2d      \n"
                    "zip2   v19.2d, v22.2d, v23.2d      \n"
                    "zip1   v14.2d, v24.2d, v25.2d      \n"
                    "zip2   v20.2d, v24.2d, v25.2d      \n"
                    "zip1   v15.2d, v26.2d, v27.2d      \n"
                    "zip2   v21.2d, v26.2d, v27.2d      \n"
                    "zip1   v16.2d, v28.2d, v29.2d      \n"
                    "zip2   v22.2d, v28.2d, v29.2d      \n"
                    "zip1   v17.2d, v30.2d, v31.2d      \n"
                    "zip2   v23.2d, v30.2d, v31.2d      \n"

                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 1          \n"
                    "st1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%3], #64 \n"
                    "st1    {v16.8h, v17.8h}, [%3], #32 \n"
                    "st1    {v18.8h, v19.8h, v20.8h, v21.8h}, [x4], #64 \n"
                    "st1    {v22.8h, v23.8h}, [x4]      \n"
                    "b      10f                         \n"

                    // if out_elempack == 1
                    "9:                                 \n"
                    // transpose8x12
                    "zip1   v18.8h, v20.8h, v21.8h      \n"
                    "zip2   v19.8h, v20.8h, v21.8h      \n"
                    "zip1   v20.8h, v22.8h, v23.8h      \n"
                    "zip2   v21.8h, v22.8h, v23.8h      \n"
                    "zip1   v22.8h, v24.8h, v25.8h      \n"
                    "zip2   v23.8h, v24.8h, v25.8h      \n"
                    "zip1   v24.8h, v26.8h, v27.8h      \n"
                    "zip2   v25.8h, v26.8h, v27.8h      \n"
                    "zip1   v26.8h, v28.8h, v29.8h      \n"
                    "zip2   v27.8h, v28.8h, v29.8h      \n"
                    "zip1   v28.8h, v30.8h, v31.8h      \n"
                    "zip2   v29.8h, v30.8h, v31.8h      \n"

                    "zip1   v0.4s, v18.4s, v20.4s       \n"
                    "zip2   v3.4s, v18.4s, v20.4s       \n"
                    "zip1   v6.4s, v19.4s, v21.4s       \n"
                    "zip2   v9.4s, v19.4s, v21.4s       \n"
                    "zip1   v1.4s, v22.4s, v24.4s       \n"
                    "zip2   v4.4s, v22.4s, v24.4s       \n"
                    "zip1   v7.4s, v23.4s, v25.4s       \n"
                    "zip2   v10.4s, v23.4s, v25.4s      \n"
                    "zip1   v2.4s, v26.4s, v28.4s       \n"
                    "zip2   v5.4s, v26.4s, v28.4s       \n"
                    "zip1   v8.4s, v27.4s, v29.4s       \n"
                    "zip2   v11.4s, v27.4s, v29.4s      \n"

                    "mov    v12.d[0], v0.d[1]           \n"
                    "mov    v13.d[0], v1.d[1]           \n"
                    "mov    v14.d[0], v2.d[1]           \n"
                    "mov    v15.d[0], v3.d[1]           \n"
                    "mov    v16.d[0], v4.d[1]           \n"
                    "mov    v17.d[0], v5.d[1]           \n"
                    "mov    v18.d[0], v6.d[1]           \n"
                    "mov    v19.d[0], v7.d[1]           \n"
                    "mov    v20.d[0], v8.d[1]           \n"
                    "mov    v21.d[0], v9.d[1]           \n"
                    "mov    v22.d[0], v10.d[1]          \n"
                    "mov    v23.d[0], v11.d[1]          \n"

                    "add    x4, %3, %w13, sxtw 1        \n"
                    "st1    {v0.4h, v1.4h, v2.4h}, [%3], #24 \n"
                    "st1    {v12.4h, v13.4h, v14.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v3.4h, v4.4h, v5.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v15.4h, v16.4h, v17.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v6.4h, v7.4h, v8.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v18.4h, v19.4h, v20.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v9.4h, v10.4h, v11.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v21.4h, v22.4h, v23.4h}, [x4] \n"

                    "10:                                \n"
                    "add    %0, %0, #192                \n"
                    "b      12f                         \n"

                    "11:                                \n"
                    "st1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%0], #64 \n"
                    "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                    "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                    "12:                                \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            else
            {
                asm volatile(
                    "cbz    %w10, 0f                    \n"

                    "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%0], #64 \n"
                    "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                    "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0]      \n"
                    "subs   %0, %0, #128                \n"
                    "b      3f                          \n"

                    "0:                                 \n"
                    // if pC
                    "cbz    %8, 1f                      \n"

                    "ld1    {v20.8h}, [%8]              \n"
                    "b      2f                          \n"

                    // else
                    "1:                                 \n"
                    "eor    v20.16b, v20.16b, v20.16b   \n"

                    "2:                                 \n"
                    "mov    v21.16b, v20.16b            \n"
                    "mov    v22.16b, v20.16b            \n"
                    "mov    v23.16b, v20.16b            \n"
                    "mov    v24.16b, v20.16b            \n"
                    "mov    v25.16b, v20.16b            \n"
                    "mov    v26.16b, v20.16b            \n"
                    "mov    v27.16b, v20.16b            \n"
                    "mov    v28.16b, v20.16b            \n"
                    "mov    v29.16b, v20.16b            \n"
                    "mov    v30.16b, v20.16b            \n"
                    "mov    v31.16b, v20.16b            \n"

                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    "4:                                 \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%1], #64 \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n"

                    "fmla   v20.8h, v4.8h, v0.h[0]      \n"
                    "fmla   v21.8h, v4.8h, v0.h[1]      \n"
                    "fmla   v22.8h, v4.8h, v0.h[2]      \n"
                    "fmla   v23.8h, v4.8h, v0.h[3]      \n"
                    "fmla   v24.8h, v4.8h, v0.h[4]      \n"
                    "fmla   v25.8h, v4.8h, v0.h[5]      \n"
                    "fmla   v26.8h, v4.8h, v0.h[6]      \n"
                    "fmla   v27.8h, v4.8h, v0.h[7]      \n"
                    "fmla   v28.8h, v4.8h, v1.h[0]      \n"
                    "fmla   v29.8h, v4.8h, v1.h[1]      \n"
                    "fmla   v30.8h, v4.8h, v1.h[2]      \n"
                    "fmla   v31.8h, v4.8h, v1.h[3]      \n"

                    "fmla   v20.8h, v5.8h, v1.h[4]      \n"
                    "fmla   v21.8h, v5.8h, v1.h[5]      \n"
                    "fmla   v22.8h, v5.8h, v1.h[6]      \n"
                    "fmla   v23.8h, v5.8h, v1.h[7]      \n"
                    "fmla   v24.8h, v5.8h, v2.h[0]      \n"
                    "fmla   v25.8h, v5.8h, v2.h[1]      \n"
                    "fmla   v26.8h, v5.8h, v2.h[2]      \n"
                    "fmla   v27.8h, v5.8h, v2.h[3]      \n"
                    "fmla   v28.8h, v5.8h, v2.h[4]      \n"
                    "fmla   v29.8h, v5.8h, v2.h[5]      \n"
                    "fmla   v30.8h, v5.8h, v2.h[6]      \n"
                    "fmla   v31.8h, v5.8h, v2.h[7]      \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v8.8h, v9.8h}, [%2], #32   \n"

                    "fmla   v20.8h, v6.8h, v3.h[0]      \n"
                    "fmla   v21.8h, v6.8h, v3.h[1]      \n"
                    "fmla   v22.8h, v6.8h, v3.h[2]      \n"
                    "fmla   v23.8h, v6.8h, v3.h[3]      \n"
                    "fmla   v24.8h, v6.8h, v3.h[4]      \n"
                    "fmla   v25.8h, v6.8h, v3.h[5]      \n"
                    "fmla   v26.8h, v6.8h, v3.h[6]      \n"
                    "fmla   v27.8h, v6.8h, v3.h[7]      \n"
                    "fmla   v28.8h, v6.8h, v8.h[0]      \n"
                    "fmla   v29.8h, v6.8h, v8.h[1]      \n"
                    "fmla   v30.8h, v6.8h, v8.h[2]      \n"
                    "fmla   v31.8h, v6.8h, v8.h[3]      \n"

                    "subs   w4, w4, #1                  \n"

                    "fmla   v20.8h, v7.8h, v8.h[4]      \n"
                    "fmla   v21.8h, v7.8h, v8.h[5]      \n"
                    "fmla   v22.8h, v7.8h, v8.h[6]      \n"
                    "fmla   v23.8h, v7.8h, v8.h[7]      \n"
                    "fmla   v24.8h, v7.8h, v9.h[0]      \n"
                    "fmla   v25.8h, v7.8h, v9.h[1]      \n"
                    "fmla   v26.8h, v7.8h, v9.h[2]      \n"
                    "fmla   v27.8h, v7.8h, v9.h[3]      \n"
                    "fmla   v28.8h, v7.8h, v9.h[4]      \n"
                    "fmla   v29.8h, v7.8h, v9.h[5]      \n"
                    "fmla   v30.8h, v7.8h, v9.h[6]      \n"
                    "fmla   v31.8h, v7.8h, v9.h[7]      \n"

                    "bne    4b                          \n"

                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    "6:                                 \n"
                    "ld1    {v0.4h, v1.4h, v2.4h}, [%2], #24 \n"
                    "ld1    {v4.8h}, [%1], #16          \n"
                    "fmla   v20.8h, v4.8h, v0.h[0]      \n"
                    "fmla   v21.8h, v4.8h, v0.h[1]      \n"
                    "fmla   v22.8h, v4.8h, v0.h[2]      \n"
                    "fmla   v23.8h, v4.8h, v0.h[3]      \n"
                    "fmla   v24.8h, v4.8h, v1.h[0]      \n"
                    "fmla   v25.8h, v4.8h, v1.h[1]      \n"
                    "fmla   v26.8h, v4.8h, v1.h[2]      \n"
                    "fmla   v27.8h, v4.8h, v1.h[3]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v28.8h, v4.8h, v2.h[0]      \n"
                    "fmla   v29.8h, v4.8h, v2.h[1]      \n"
                    "fmla   v30.8h, v4.8h, v2.h[2]      \n"
                    "fmla   v31.8h, v4.8h, v2.h[3]      \n"
                    "bne    6b                          \n"

                    "7:                                 \n"
                    "tst    %w11, #255                  \n"
                    "beq    11f                         \n"

                    // if out_elempack == 8
                    "cmp    %w12, #8                    \n"
                    "bne    8f                          \n"

                    "st1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%3], #64 \n"
                    "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%3], #64 \n"
                    "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%3], #64 \n"
                    "b      10f                         \n"

                    // if out_elempack == 4
                    "8:                                 \n"
                    "cmp    %w12, #4                    \n"
                    "bne    9f                          \n"

                    "zip1   v12.2d, v20.2d, v21.2d      \n"
                    "zip2   v18.2d, v20.2d, v21.2d      \n"
                    "zip1   v13.2d, v22.2d, v23.2d      \n"
                    "zip2   v19.2d, v22.2d, v23.2d      \n"
                    "zip1   v14.2d, v24.2d, v25.2d      \n"
                    "zip2   v20.2d, v24.2d, v25.2d      \n"
                    "zip1   v15.2d, v26.2d, v27.2d      \n"
                    "zip2   v21.2d, v26.2d, v27.2d      \n"
                    "zip1   v16.2d, v28.2d, v29.2d      \n"
                    "zip2   v22.2d, v28.2d, v29.2d      \n"
                    "zip1   v17.2d, v30.2d, v31.2d      \n"
                    "zip2   v23.2d, v30.2d, v31.2d      \n"

                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 1          \n"
                    "st1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%3], #64 \n"
                    "st1    {v16.8h, v17.8h}, [%3], #32 \n"
                    "st1    {v18.8h, v19.8h, v20.8h, v21.8h}, [x4], #64 \n"
                    "st1    {v22.8h, v23.8h}, [x4]      \n"
                    "b      10f                         \n"

                    // if out_elempack == 1
                    "9:                                 \n"
                    // transpose8x12
                    "zip1   v18.8h, v20.8h, v21.8h      \n"
                    "zip2   v19.8h, v20.8h, v21.8h      \n"
                    "zip1   v20.8h, v22.8h, v23.8h      \n"
                    "zip2   v21.8h, v22.8h, v23.8h      \n"
                    "zip1   v22.8h, v24.8h, v25.8h      \n"
                    "zip2   v23.8h, v24.8h, v25.8h      \n"
                    "zip1   v24.8h, v26.8h, v27.8h      \n"
                    "zip2   v25.8h, v26.8h, v27.8h      \n"
                    "zip1   v26.8h, v28.8h, v29.8h      \n"
                    "zip2   v27.8h, v28.8h, v29.8h      \n"
                    "zip1   v28.8h, v30.8h, v31.8h      \n"
                    "zip2   v29.8h, v30.8h, v31.8h      \n"

                    "zip1   v0.4s, v18.4s, v20.4s       \n"
                    "zip2   v3.4s, v18.4s, v20.4s       \n"
                    "zip1   v6.4s, v19.4s, v21.4s       \n"
                    "zip2   v9.4s, v19.4s, v21.4s       \n"
                    "zip1   v1.4s, v22.4s, v24.4s       \n"
                    "zip2   v4.4s, v22.4s, v24.4s       \n"
                    "zip1   v7.4s, v23.4s, v25.4s       \n"
                    "zip2   v10.4s, v23.4s, v25.4s      \n"
                    "zip1   v2.4s, v26.4s, v28.4s       \n"
                    "zip2   v5.4s, v26.4s, v28.4s       \n"
                    "zip1   v8.4s, v27.4s, v29.4s       \n"
                    "zip2   v11.4s, v27.4s, v29.4s      \n"

                    "mov    v12.d[0], v0.d[1]           \n"
                    "mov    v13.d[0], v1.d[1]           \n"
                    "mov    v14.d[0], v2.d[1]           \n"
                    "mov    v15.d[0], v3.d[1]           \n"
                    "mov    v16.d[0], v4.d[1]           \n"
                    "mov    v17.d[0], v5.d[1]           \n"
                    "mov    v18.d[0], v6.d[1]           \n"
                    "mov    v19.d[0], v7.d[1]           \n"
                    "mov    v20.d[0], v8.d[1]           \n"
                    "mov    v21.d[0], v9.d[1]           \n"
                    "mov    v22.d[0], v10.d[1]          \n"
                    "mov    v23.d[0], v11.d[1]          \n"

                    "add    x4, %3, %w13, sxtw 1        \n"
                    "st1    {v0.4h, v1.4h, v2.4h}, [%3], #24 \n"
                    "st1    {v12.4h, v13.4h, v14.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v3.4h, v4.4h, v5.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v15.4h, v16.4h, v17.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v6.4h, v7.4h, v8.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v18.4h, v19.4h, v20.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v9.4h, v10.4h, v11.4h}, [x4] \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v21.4h, v22.4h, v23.4h}, [x4] \n"

                    "10:                                \n"
                    "add    %0, %0, #192                \n"
                    "b      12f                         \n"

                    "11:                                \n"
                    "st1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%0], #64 \n"
                    "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                    "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                    "12:                                \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
#else  // NCNN_GNU_INLINE_ASM
            // Intrinsics fallback for the 8-row x 12-column fp16 tile.
            // Twelve accumulators, one per output column; each holds the 8 row values of that column.
            float16x8_t _sum0;
            float16x8_t _sum1;
            float16x8_t _sum2;
            float16x8_t _sum3;
            float16x8_t _sum4;
            float16x8_t _sum5;
            float16x8_t _sum6;
            float16x8_t _sum7;
            float16x8_t _sum8;
            float16x8_t _sum9;
            float16x8_t _suma;
            float16x8_t _sumb;

            // Accumulator initialisation:
            //   k == 0 : start from the 8 values at pC (same vector for every column) or from zero
            //   k != 0 : resume from the partial sums previously written to outptr
            if (k == 0)
            {
                if (pC)
                {
                    // one 8-lane load, replicated to all twelve columns
                    _sum0 = vld1q_f16(pC);
                    _sum1 = _sum0;
                    _sum2 = _sum0;
                    _sum3 = _sum0;
                    _sum4 = _sum0;
                    _sum5 = _sum0;
                    _sum6 = _sum0;
                    _sum7 = _sum0;
                    _sum8 = _sum0;
                    _sum9 = _sum0;
                    _suma = _sum0;
                    _sumb = _sum0;
                }
                else
                {
                    _sum0 = vdupq_n_f16(0.f);
                    _sum1 = vdupq_n_f16(0.f);
                    _sum2 = vdupq_n_f16(0.f);
                    _sum3 = vdupq_n_f16(0.f);
                    _sum4 = vdupq_n_f16(0.f);
                    _sum5 = vdupq_n_f16(0.f);
                    _sum6 = vdupq_n_f16(0.f);
                    _sum7 = vdupq_n_f16(0.f);
                    _sum8 = vdupq_n_f16(0.f);
                    _sum9 = vdupq_n_f16(0.f);
                    _suma = vdupq_n_f16(0.f);
                    _sumb = vdupq_n_f16(0.f);
                }
            }
            else
            {
                // 12 vectors x 8 lanes = 96 elements, laid out column after column
                _sum0 = vld1q_f16(outptr);
                _sum1 = vld1q_f16(outptr + 8);
                _sum2 = vld1q_f16(outptr + 8 * 2);
                _sum3 = vld1q_f16(outptr + 8 * 3);
                _sum4 = vld1q_f16(outptr + 8 * 4);
                _sum5 = vld1q_f16(outptr + 8 * 5);
                _sum6 = vld1q_f16(outptr + 8 * 6);
                _sum7 = vld1q_f16(outptr + 8 * 7);
                _sum8 = vld1q_f16(outptr + 8 * 8);
                _sum9 = vld1q_f16(outptr + 8 * 9);
                _suma = vld1q_f16(outptr + 8 * 10);
                _sumb = vld1q_f16(outptr + 8 * 11);
            }

            // Reduction over k: each step consumes 8 values of A and 12 values of B
            // and performs a rank-1 update of the tile (column j += A * B[j]).
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x8_t _pA = vld1q_f16(pA);

                // the 12 B values are read as three 4-lane vectors
                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);
                float16x4_t _pB2 = vld1_f16(pB + 8);

                _sum0 = vfmaq_lane_f16(_sum0, _pA, _pB0, 0);
                _sum1 = vfmaq_lane_f16(_sum1, _pA, _pB0, 1);
                _sum2 = vfmaq_lane_f16(_sum2, _pA, _pB0, 2);
                _sum3 = vfmaq_lane_f16(_sum3, _pA, _pB0, 3);
                _sum4 = vfmaq_lane_f16(_sum4, _pA, _pB1, 0);
                _sum5 = vfmaq_lane_f16(_sum5, _pA, _pB1, 1);
                _sum6 = vfmaq_lane_f16(_sum6, _pA, _pB1, 2);
                _sum7 = vfmaq_lane_f16(_sum7, _pA, _pB1, 3);
                _sum8 = vfmaq_lane_f16(_sum8, _pA, _pB2, 0);
                _sum9 = vfmaq_lane_f16(_sum9, _pA, _pB2, 1);
                _suma = vfmaq_lane_f16(_suma, _pA, _pB2, 2);
                _sumb = vfmaq_lane_f16(_sumb, _pA, _pB2, 3);

                pA += 8;
                pB += 12;
            }

            // k_end set: write the finished tile to outptr0 in the layout selected by out_elempack.
            // k_end clear: park the partial sums in outptr for the next k pass.
            // NOTE(review): only out_elempack values 8, 4 and 1 are handled here; any other value
            // stores nothing — presumably the caller guarantees one of these three, confirm.
            if (k_end)
            {
                if (out_elempack == 8)
                {
                    // all 8 rows of a column are contiguous: 12 columns x 8 elements
                    vst1q_f16(outptr0, _sum0);
                    vst1q_f16(outptr0 + 8, _sum1);
                    vst1q_f16(outptr0 + 8 * 2, _sum2);
                    vst1q_f16(outptr0 + 8 * 3, _sum3);
                    vst1q_f16(outptr0 + 8 * 4, _sum4);
                    vst1q_f16(outptr0 + 8 * 5, _sum5);
                    vst1q_f16(outptr0 + 8 * 6, _sum6);
                    vst1q_f16(outptr0 + 8 * 7, _sum7);
                    vst1q_f16(outptr0 + 8 * 8, _sum8);
                    vst1q_f16(outptr0 + 8 * 9, _sum9);
                    vst1q_f16(outptr0 + 8 * 10, _suma);
                    vst1q_f16(outptr0 + 8 * 11, _sumb);
                    outptr0 += 96;
                }
                if (out_elempack == 4)
                {
                    // rows 0-3 (low halves) go to outptr0, 4 elements per column
                    vst1_f16(outptr0, vget_low_f16(_sum0));
                    vst1_f16(outptr0 + 4, vget_low_f16(_sum1));
                    vst1_f16(outptr0 + 4 * 2, vget_low_f16(_sum2));
                    vst1_f16(outptr0 + 4 * 3, vget_low_f16(_sum3));
                    vst1_f16(outptr0 + 4 * 4, vget_low_f16(_sum4));
                    vst1_f16(outptr0 + 4 * 5, vget_low_f16(_sum5));
                    vst1_f16(outptr0 + 4 * 6, vget_low_f16(_sum6));
                    vst1_f16(outptr0 + 4 * 7, vget_low_f16(_sum7));
                    vst1_f16(outptr0 + 4 * 8, vget_low_f16(_sum8));
                    vst1_f16(outptr0 + 4 * 9, vget_low_f16(_sum9));
                    vst1_f16(outptr0 + 4 * 10, vget_low_f16(_suma));
                    vst1_f16(outptr0 + 4 * 11, vget_low_f16(_sumb));

                    // rows 4-7 (high halves) go out_hstep * 4 elements further on
                    vst1_f16(outptr0 + out_hstep * 4, vget_high_f16(_sum0));
                    vst1_f16(outptr0 + out_hstep * 4 + 4, vget_high_f16(_sum1));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 2, vget_high_f16(_sum2));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 3, vget_high_f16(_sum3));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 4, vget_high_f16(_sum4));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 5, vget_high_f16(_sum5));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 6, vget_high_f16(_sum6));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 7, vget_high_f16(_sum7));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 8, vget_high_f16(_sum8));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 9, vget_high_f16(_sum9));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 10, vget_high_f16(_suma));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 11, vget_high_f16(_sumb));

                    outptr0 += 48;
                }
                if (out_elempack == 1)
                {
                    // The stores below treat _sum0.._sumb as 24 consecutive 4-lane halves,
                    // three halves (12 values) per output row, so transpose8x12_ph presumably
                    // rearranges the column-major accumulators into that row-major order
                    // (helper is defined outside this section — confirm).
                    transpose8x12_ph(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, _sum8, _sum9, _suma, _sumb);

                    // row r is written at outptr0 + out_hstep * r, 12 values per row
                    vst1_f16(outptr0, vget_low_f16(_sum0));
                    vst1_f16(outptr0 + 4, vget_high_f16(_sum0));
                    vst1_f16(outptr0 + 8, vget_low_f16(_sum1));
                    vst1_f16(outptr0 + out_hstep, vget_high_f16(_sum1));
                    vst1_f16(outptr0 + out_hstep + 4, vget_low_f16(_sum2));
                    vst1_f16(outptr0 + out_hstep + 8, vget_high_f16(_sum2));
                    vst1_f16(outptr0 + out_hstep * 2, vget_low_f16(_sum3));
                    vst1_f16(outptr0 + out_hstep * 2 + 4, vget_high_f16(_sum3));
                    vst1_f16(outptr0 + out_hstep * 2 + 8, vget_low_f16(_sum4));
                    vst1_f16(outptr0 + out_hstep * 3, vget_high_f16(_sum4));
                    vst1_f16(outptr0 + out_hstep * 3 + 4, vget_low_f16(_sum5));
                    vst1_f16(outptr0 + out_hstep * 3 + 8, vget_high_f16(_sum5));
                    vst1_f16(outptr0 + out_hstep * 4, vget_low_f16(_sum6));
                    vst1_f16(outptr0 + out_hstep * 4 + 4, vget_high_f16(_sum6));
                    vst1_f16(outptr0 + out_hstep * 4 + 8, vget_low_f16(_sum7));
                    vst1_f16(outptr0 + out_hstep * 5, vget_high_f16(_sum7));
                    vst1_f16(outptr0 + out_hstep * 5 + 4, vget_low_f16(_sum8));
                    vst1_f16(outptr0 + out_hstep * 5 + 8, vget_high_f16(_sum8));
                    vst1_f16(outptr0 + out_hstep * 6, vget_low_f16(_sum9));
                    vst1_f16(outptr0 + out_hstep * 6 + 4, vget_high_f16(_sum9));
                    vst1_f16(outptr0 + out_hstep * 6 + 8, vget_low_f16(_suma));
                    vst1_f16(outptr0 + out_hstep * 7, vget_high_f16(_suma));
                    vst1_f16(outptr0 + out_hstep * 7 + 4, vget_low_f16(_sumb));
                    vst1_f16(outptr0 + out_hstep * 7 + 8, vget_high_f16(_sumb));

                    outptr0 += 12;
                }
            }
            else
            {
                // same column-after-column layout that the k != 0 branch above reloads
                vst1q_f16(outptr, _sum0);
                vst1q_f16(outptr + 8, _sum1);
                vst1q_f16(outptr + 8 * 2, _sum2);
                vst1q_f16(outptr + 8 * 3, _sum3);
                vst1q_f16(outptr + 8 * 4, _sum4);
                vst1q_f16(outptr + 8 * 5, _sum5);
                vst1q_f16(outptr + 8 * 6, _sum6);
                vst1q_f16(outptr + 8 * 7, _sum7);
                vst1q_f16(outptr + 8 * 8, _sum8);
                vst1q_f16(outptr + 8 * 9, _sum9);
                vst1q_f16(outptr + 8 * 10, _suma);
                vst1q_f16(outptr + 8 * 11, _sumb);
            }

            // the scratch pointer moves past this tile's 96 slots whether or not they were written
            outptr += 96;
#endif // NCNN_GNU_INLINE_ASM
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            const __fp16* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            if (use_a53_a55_optimized_kernel)
            {
                // 8x8 fp16 tile, software-pipelined variant selected by use_a53_a55_optimized_kernel.
                // Operands: %0 outptr, %1 pA, %2 pB, %3 outptr0 (read-write through the matching
                // inputs %4-%7), %8 pC, %9 max_kk, %10 k, %11 k_end, %12 out_elempack, %13 out_hstep.
                // v24-v31 are the eight column accumulators, each holding 8 row values.
                asm volatile(
                    // k == 0 -> initialise from pC / zero at label 0
                    "cbz    %w10, 0f                    \n"

                    // k != 0: reload the 8 partial-sum vectors from outptr; only the first load
                    // post-increments, so subtracting 64 restores outptr to its entry value
                    "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                    "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0]      \n"
                    "subs   %0, %0, #64                 \n"
                    "b      3f                          \n"

                    "0:                                 \n"
                    // if pC
                    "cbz    %8, 1f                      \n"

                    "ld1    {v24.8h}, [%8]              \n"
                    "b      2f                          \n"

                    // else
                    "1:                                 \n"
                    "eor    v24.16b, v24.16b, v24.16b   \n"

                    // replicate the initial vector (pC values or zero) into every accumulator
                    "2:                                 \n"
                    "mov    v25.16b, v24.16b            \n"
                    "mov    v26.16b, v24.16b            \n"
                    "mov    v27.16b, v24.16b            \n"
                    "mov    v28.16b, v24.16b            \n"
                    "mov    v29.16b, v24.16b            \n"
                    "mov    v30.16b, v24.16b            \n"
                    "mov    v31.16b, v24.16b            \n"

                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // pipeline prologue: v4 = A step 0, v0 = B step 0, and A step 1 fetched as two
                    // 64-bit halves (d5 + x25, joined by the first "ins" inside the loop)
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.8h}, [%1], #16          \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.8h}, [%2], #16          \n"

                    "ldr    d5, [%1], #8                \n"
                    "ldr    x25, [%1], #8               \n"

                    // main loop: four k steps per iteration, 8 fmla each (column j += A * B.h[j]).
                    // Every 128-bit operand for a later step is fetched as "ldr d" + "ldr x" + "ins"
                    // interleaved with the fmla stream — presumably to suit the in-order
                    // Cortex-A53/A55 load pipeline (inferred from the selector name, confirm).
                    ".align 4                           \n"
                    "4:                                 \n"
                    "ldr    d1, [%2], #8                \n"
                    "fmla   v24.8h, v4.8h, v0.h[0]      \n"
                    "ldr    x21, [%2], #8               \n"
                    "fmla   v25.8h, v4.8h, v0.h[1]      \n"
                    "ins    v5.d[1], x25                \n"
                    "fmla   v26.8h, v4.8h, v0.h[2]      \n"
                    "ldr    d6, [%1], #8                \n"
                    "fmla   v27.8h, v4.8h, v0.h[3]      \n"
                    "ldr    x26, [%1], #8               \n"
                    "fmla   v28.8h, v4.8h, v0.h[4]      \n"
                    "ldr    d2, [%2], #8                \n"
                    "fmla   v29.8h, v4.8h, v0.h[5]      \n"
                    "ins    v1.d[1], x21                \n"
                    "fmla   v30.8h, v4.8h, v0.h[6]      \n"
                    "ldr    x22, [%2], #8               \n"
                    "fmla   v31.8h, v4.8h, v0.h[7]      \n"
                    "ldr    d7, [%1], #8                \n"
                    "fmla   v24.8h, v5.8h, v1.h[0]      \n"
                    "ldr    x27, [%1], #8               \n"
                    "fmla   v25.8h, v5.8h, v1.h[1]      \n"
                    "ins    v6.d[1], x26                \n"
                    "fmla   v26.8h, v5.8h, v1.h[2]      \n"
                    "ldr    d3, [%2], #8                \n"
                    "fmla   v27.8h, v5.8h, v1.h[3]      \n"
                    "ldr    x23, [%2], #8               \n"
                    "fmla   v28.8h, v5.8h, v1.h[4]      \n"
                    "prfm   pldl1keep, [%1, #512]       \n" // NOTE PRELOAD
                    "fmla   v29.8h, v5.8h, v1.h[5]      \n"
                    "ins    v2.d[1], x22                \n"
                    "fmla   v30.8h, v5.8h, v1.h[6]      \n"
                    "ldr    d4, [%1], #8                \n"
                    "fmla   v31.8h, v5.8h, v1.h[7]      \n"
                    "ldr    x24, [%1], #8               \n"
                    "fmla   v24.8h, v6.8h, v2.h[0]      \n"
                    "prfm   pldl1keep, [%2, #512]       \n" // NOTE PRELOAD
                    "fmla   v25.8h, v6.8h, v2.h[1]      \n"
                    "ins    v7.d[1], x27                \n"
                    "fmla   v26.8h, v6.8h, v2.h[2]      \n"
                    "ldr    d0, [%2], #8                \n"
                    "fmla   v27.8h, v6.8h, v2.h[3]      \n"
                    "ldr    x20, [%2], #8               \n"
                    "fmla   v28.8h, v6.8h, v2.h[4]      \n"
                    "ldr    d5, [%1], #8                \n"
                    "fmla   v29.8h, v6.8h, v2.h[5]      \n"
                    "ins    v3.d[1], x23                \n"
                    "fmla   v30.8h, v6.8h, v2.h[6]      \n"
                    "ldr    x25, [%1], #8               \n"
                    "fmla   v31.8h, v6.8h, v2.h[7]      \n"
                    "fmla   v24.8h, v7.8h, v3.h[0]      \n"
                    "fmla   v25.8h, v7.8h, v3.h[1]      \n"
                    "fmla   v26.8h, v7.8h, v3.h[2]      \n"
                    "ins    v4.d[1], x24                \n"
                    "fmla   v27.8h, v7.8h, v3.h[3]      \n"
                    "fmla   v28.8h, v7.8h, v3.h[4]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v29.8h, v7.8h, v3.h[5]      \n"
                    "fmla   v30.8h, v7.8h, v3.h[6]      \n"
                    "ins    v0.d[1], x20                \n"
                    "fmla   v31.8h, v7.8h, v3.h[7]      \n"
                    "bne    4b                          \n"

                    // undo the look-ahead: on exit pA has been read 32 bytes and pB 16 bytes
                    // beyond the data actually consumed by the fmla stream.
                    // NOTE(review): the final iteration therefore loads that many bytes past the
                    // consumed range — presumably always readable in the packed buffers, confirm.
                    "sub    %1, %1, #32                 \n"
                    "sub    %2, %2, #16                 \n"

                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    // tail loop: one k step per iteration
                    "6:                                 \n"
                    "ld1    {v0.8h}, [%2], #16          \n"
                    "ld1    {v4.8h}, [%1], #16          \n"
                    "fmla   v24.8h, v4.8h, v0.h[0]      \n"
                    "fmla   v25.8h, v4.8h, v0.h[1]      \n"
                    "fmla   v26.8h, v4.8h, v0.h[2]      \n"
                    "fmla   v27.8h, v4.8h, v0.h[3]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v28.8h, v4.8h, v0.h[4]      \n"
                    "fmla   v29.8h, v4.8h, v0.h[5]      \n"
                    "fmla   v30.8h, v4.8h, v0.h[6]      \n"
                    "fmla   v31.8h, v4.8h, v0.h[7]      \n"
                    "bne    6b                          \n"

                    // k_end clear -> label 11 parks the partial sums in outptr.
                    // Only the low 8 bits of k_end are tested (presumably it is a bool — confirm).
                    "7:                                 \n"
                    "tst    %w11, #255                  \n"
                    "beq    11f                         \n"

                    // if out_elempack == 8
                    "cmp    %w12, #8                    \n"
                    "bne    8f                          \n"

                    // 8 columns x 8 rows stored contiguously; outptr0 advances 128 bytes
                    "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%3], #64 \n"
                    "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%3], #64 \n"
                    "b      10f                         \n"

                    // if out_elempack == 4
                    "8:                                 \n"
                    "cmp    %w12, #4                    \n"
                    "bne    9f                          \n"

                    // v16-v19 = low halves (rows 0-3) of column pairs, v20-v23 = high halves (rows 4-7)
                    "zip1   v16.2d, v24.2d, v25.2d      \n"
                    "zip2   v20.2d, v24.2d, v25.2d      \n"
                    "zip1   v17.2d, v26.2d, v27.2d      \n"
                    "zip2   v21.2d, v26.2d, v27.2d      \n"
                    "zip1   v18.2d, v28.2d, v29.2d      \n"
                    "zip2   v22.2d, v28.2d, v29.2d      \n"
                    "zip1   v19.2d, v30.2d, v31.2d      \n"
                    "zip2   v23.2d, v30.2d, v31.2d      \n"

                    // x4 = outptr0 + out_hstep * 4 elements (sxtw 1 scales by the 2-byte element size)
                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 1          \n"
                    "st1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%3], #64 \n"
                    "st1    {v20.8h, v21.8h, v22.8h, v23.8h}, [x4] \n"
                    "b      10f                         \n"

                    // if out_elempack == 1
                    "9:                                 \n"
                    // transpose8x8
                    // stage 1: interleave 16-bit lanes of adjacent columns
                    "zip1   v22.8h, v24.8h, v25.8h      \n"
                    "zip2   v23.8h, v24.8h, v25.8h      \n"
                    "zip1   v24.8h, v26.8h, v27.8h      \n"
                    "zip2   v25.8h, v26.8h, v27.8h      \n"
                    "zip1   v26.8h, v28.8h, v29.8h      \n"
                    "zip2   v27.8h, v28.8h, v29.8h      \n"
                    "zip1   v28.8h, v30.8h, v31.8h      \n"
                    "zip2   v29.8h, v30.8h, v31.8h      \n"

                    // stage 2: interleave 32-bit pairs
                    "zip1   v16.4s, v22.4s, v24.4s      \n"
                    "zip2   v17.4s, v22.4s, v24.4s      \n"
                    "zip1   v18.4s, v23.4s, v25.4s      \n"
                    "zip2   v19.4s, v23.4s, v25.4s      \n"
                    "zip1   v20.4s, v26.4s, v28.4s      \n"
                    "zip2   v21.4s, v26.4s, v28.4s      \n"
                    "zip1   v22.4s, v27.4s, v29.4s      \n"
                    "zip2   v23.4s, v27.4s, v29.4s      \n"

                    // stage 3: interleave 64-bit halves; v24-v31 now hold rows 0-7
                    "zip1   v24.2d, v16.2d, v20.2d      \n"
                    "zip2   v25.2d, v16.2d, v20.2d      \n"
                    "zip1   v26.2d, v17.2d, v21.2d      \n"
                    "zip2   v27.2d, v17.2d, v21.2d      \n"
                    "zip1   v28.2d, v18.2d, v22.2d      \n"
                    "zip2   v29.2d, v18.2d, v22.2d      \n"
                    "zip1   v30.2d, v19.2d, v23.2d      \n"
                    "zip2   v31.2d, v19.2d, v23.2d      \n"

                    // one row per store, rows out_hstep elements apart; outptr0 advances 8 elements
                    "add    x4, %3, %w13, sxtw 1        \n"
                    "st1    {v24.8h}, [%3], #16         \n"
                    "st1    {v25.8h}, [x4]              \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v26.8h}, [x4]              \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v27.8h}, [x4]              \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v28.8h}, [x4]              \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v29.8h}, [x4]              \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v30.8h}, [x4]              \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v31.8h}, [x4]              \n"

                    // final output written: skip this tile's 128-byte scratch slot without storing
                    "10:                                \n"
                    "add    %0, %0, #128                \n"
                    "b      12f                         \n"

                    // not the last k pass: save partial sums; the post-increments advance outptr 128 bytes
                    "11:                                \n"
                    "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                    "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                    "12:                                \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            else
            {
                // 8x8 fp16 tile, generic variant (taken when use_a53_a55_optimized_kernel is false).
                // Operands: %0 outptr, %1 pA, %2 pB, %3 outptr0 (read-write through the matching
                // inputs %4-%7), %8 pC, %9 max_kk, %10 k, %11 k_end, %12 out_elempack, %13 out_hstep.
                // v24-v31 are the eight column accumulators, each holding 8 row values.
                asm volatile(
                    // k == 0 -> initialise from pC / zero at label 0
                    "cbz    %w10, 0f                    \n"

                    // k != 0: reload the 8 partial-sum vectors from outptr; only the first load
                    // post-increments, so subtracting 64 restores outptr to its entry value
                    "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                    "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0]      \n"
                    "subs   %0, %0, #64                 \n"
                    "b      3f                          \n"

                    "0:                                 \n"
                    // if pC
                    "cbz    %8, 1f                      \n"

                    "ld1    {v24.8h}, [%8]              \n"
                    "b      2f                          \n"

                    // else
                    "1:                                 \n"
                    "eor    v24.16b, v24.16b, v24.16b   \n"

                    // replicate the initial vector (pC values or zero) into every accumulator
                    "2:                                 \n"
                    "mov    v25.16b, v24.16b            \n"
                    "mov    v26.16b, v24.16b            \n"
                    "mov    v27.16b, v24.16b            \n"
                    "mov    v28.16b, v24.16b            \n"
                    "mov    v29.16b, v24.16b            \n"
                    "mov    v30.16b, v24.16b            \n"
                    "mov    v31.16b, v24.16b            \n"

                    "3:                                 \n"
                    "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                    "cmp    w4, #0                      \n"
                    "beq    5f                          \n"

                    // main loop: four k steps per iteration. v0-v3 = four B vectors, v4-v7 = four
                    // A vectors; each step does 8 fmla (column j += A * B.h[j]).
                    "4:                                 \n"
                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n"
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%1], #64 \n"
                    "fmla   v24.8h, v4.8h, v0.h[0]      \n"
                    "fmla   v25.8h, v4.8h, v0.h[1]      \n"
                    "fmla   v26.8h, v4.8h, v0.h[2]      \n"
                    "fmla   v27.8h, v4.8h, v0.h[3]      \n"
                    "fmla   v28.8h, v4.8h, v0.h[4]      \n"
                    "fmla   v29.8h, v4.8h, v0.h[5]      \n"
                    "fmla   v30.8h, v4.8h, v0.h[6]      \n"
                    "fmla   v31.8h, v4.8h, v0.h[7]      \n"
                    "fmla   v24.8h, v5.8h, v1.h[0]      \n"
                    "fmla   v25.8h, v5.8h, v1.h[1]      \n"
                    "fmla   v26.8h, v5.8h, v1.h[2]      \n"
                    "fmla   v27.8h, v5.8h, v1.h[3]      \n"
                    "fmla   v28.8h, v5.8h, v1.h[4]      \n"
                    "fmla   v29.8h, v5.8h, v1.h[5]      \n"
                    "fmla   v30.8h, v5.8h, v1.h[6]      \n"
                    "fmla   v31.8h, v5.8h, v1.h[7]      \n"
                    "fmla   v24.8h, v6.8h, v2.h[0]      \n"
                    "fmla   v25.8h, v6.8h, v2.h[1]      \n"
                    "fmla   v26.8h, v6.8h, v2.h[2]      \n"
                    "fmla   v27.8h, v6.8h, v2.h[3]      \n"
                    "fmla   v28.8h, v6.8h, v2.h[4]      \n"
                    "fmla   v29.8h, v6.8h, v2.h[5]      \n"
                    "fmla   v30.8h, v6.8h, v2.h[6]      \n"
                    "fmla   v31.8h, v6.8h, v2.h[7]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v24.8h, v7.8h, v3.h[0]      \n"
                    "fmla   v25.8h, v7.8h, v3.h[1]      \n"
                    "fmla   v26.8h, v7.8h, v3.h[2]      \n"
                    "fmla   v27.8h, v7.8h, v3.h[3]      \n"
                    "fmla   v28.8h, v7.8h, v3.h[4]      \n"
                    "fmla   v29.8h, v7.8h, v3.h[5]      \n"
                    "fmla   v30.8h, v7.8h, v3.h[6]      \n"
                    "fmla   v31.8h, v7.8h, v3.h[7]      \n"
                    "bne    4b                          \n"

                    "5:                                 \n"
                    "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                    "cmp    w4, #0                      \n"
                    "beq    7f                          \n"

                    // tail loop: one k step per iteration
                    "6:                                 \n"
                    "ld1    {v0.8h}, [%2], #16          \n"
                    "ld1    {v4.8h}, [%1], #16          \n"
                    "fmla   v24.8h, v4.8h, v0.h[0]      \n"
                    "fmla   v25.8h, v4.8h, v0.h[1]      \n"
                    "fmla   v26.8h, v4.8h, v0.h[2]      \n"
                    "fmla   v27.8h, v4.8h, v0.h[3]      \n"
                    "subs   w4, w4, #1                  \n"
                    "fmla   v28.8h, v4.8h, v0.h[4]      \n"
                    "fmla   v29.8h, v4.8h, v0.h[5]      \n"
                    "fmla   v30.8h, v4.8h, v0.h[6]      \n"
                    "fmla   v31.8h, v4.8h, v0.h[7]      \n"
                    "bne    6b                          \n"

                    // k_end clear -> label 11 parks the partial sums in outptr.
                    // Only the low 8 bits of k_end are tested (presumably it is a bool — confirm).
                    "7:                                 \n"
                    "tst    %w11, #255                  \n"
                    "beq    11f                         \n"

                    // if out_elempack == 8
                    "cmp    %w12, #8                    \n"
                    "bne    8f                          \n"

                    // 8 columns x 8 rows stored contiguously; outptr0 advances 128 bytes
                    "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%3], #64 \n"
                    "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%3], #64 \n"
                    "b      10f                         \n"

                    // if out_elempack == 4
                    "8:                                 \n"
                    "cmp    %w12, #4                    \n"
                    "bne    9f                          \n"

                    // v16-v19 = low halves (rows 0-3) of column pairs, v20-v23 = high halves (rows 4-7)
                    "zip1   v16.2d, v24.2d, v25.2d      \n"
                    "zip2   v20.2d, v24.2d, v25.2d      \n"
                    "zip1   v17.2d, v26.2d, v27.2d      \n"
                    "zip2   v21.2d, v26.2d, v27.2d      \n"
                    "zip1   v18.2d, v28.2d, v29.2d      \n"
                    "zip2   v22.2d, v28.2d, v29.2d      \n"
                    "zip1   v19.2d, v30.2d, v31.2d      \n"
                    "zip2   v23.2d, v30.2d, v31.2d      \n"

                    // x4 = outptr0 + out_hstep * 4 elements (sxtw 1 scales by the 2-byte element size)
                    "lsl    w4, %w13, #2                \n"
                    "add    x4, %3, w4, sxtw 1          \n"
                    "st1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%3], #64 \n"
                    "st1    {v20.8h, v21.8h, v22.8h, v23.8h}, [x4] \n"
                    "b      10f                         \n"

                    // if out_elempack == 1
                    "9:                                 \n"
                    // transpose8x8
                    // stage 1: interleave 16-bit lanes of adjacent columns
                    "zip1   v22.8h, v24.8h, v25.8h      \n"
                    "zip2   v23.8h, v24.8h, v25.8h      \n"
                    "zip1   v24.8h, v26.8h, v27.8h      \n"
                    "zip2   v25.8h, v26.8h, v27.8h      \n"
                    "zip1   v26.8h, v28.8h, v29.8h      \n"
                    "zip2   v27.8h, v28.8h, v29.8h      \n"
                    "zip1   v28.8h, v30.8h, v31.8h      \n"
                    "zip2   v29.8h, v30.8h, v31.8h      \n"

                    // stage 2: interleave 32-bit pairs
                    "zip1   v16.4s, v22.4s, v24.4s      \n"
                    "zip2   v17.4s, v22.4s, v24.4s      \n"
                    "zip1   v18.4s, v23.4s, v25.4s      \n"
                    "zip2   v19.4s, v23.4s, v25.4s      \n"
                    "zip1   v20.4s, v26.4s, v28.4s      \n"
                    "zip2   v21.4s, v26.4s, v28.4s      \n"
                    "zip1   v22.4s, v27.4s, v29.4s      \n"
                    "zip2   v23.4s, v27.4s, v29.4s      \n"

                    // stage 3: interleave 64-bit halves; v24-v31 now hold rows 0-7
                    "zip1   v24.2d, v16.2d, v20.2d      \n"
                    "zip2   v25.2d, v16.2d, v20.2d      \n"
                    "zip1   v26.2d, v17.2d, v21.2d      \n"
                    "zip2   v27.2d, v17.2d, v21.2d      \n"
                    "zip1   v28.2d, v18.2d, v22.2d      \n"
                    "zip2   v29.2d, v18.2d, v22.2d      \n"
                    "zip1   v30.2d, v19.2d, v23.2d      \n"
                    "zip2   v31.2d, v19.2d, v23.2d      \n"

                    // one row per store, rows out_hstep elements apart; outptr0 advances 8 elements
                    "add    x4, %3, %w13, sxtw 1        \n"
                    "st1    {v24.8h}, [%3], #16         \n"
                    "st1    {v25.8h}, [x4]              \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v26.8h}, [x4]              \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v27.8h}, [x4]              \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v28.8h}, [x4]              \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v29.8h}, [x4]              \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v30.8h}, [x4]              \n"
                    "add    x4, x4, %w13, sxtw 1        \n"
                    "st1    {v31.8h}, [x4]              \n"

                    // final output written: skip this tile's 128-byte scratch slot without storing
                    "10:                                \n"
                    "add    %0, %0, #128                \n"
                    "b      12f                         \n"

                    // not the last k pass: save partial sums; the post-increments advance outptr 128 bytes
                    "11:                                \n"
                    "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                    "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                    "12:                                \n"

                    : "=r"(outptr), // %0
                    "=r"(pA),     // %1
                    "=r"(pB),     // %2
                    "=r"(outptr0) // %3
                    : "0"(outptr),
                    "1"(pA),
                    "2"(pB),
                    "3"(outptr0),
                    "r"(pC),           // %8
                    "r"(max_kk),       // %9
                    "r"(k),            // %10
                    "r"(k_end),        // %11
                    "r"(out_elempack), // %12
                    "r"(out_hstep)     // %13
                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
#else  // NCNN_GNU_INLINE_ASM
            float16x8_t _sum0;
            float16x8_t _sum1;
            float16x8_t _sum2;
            float16x8_t _sum3;
            float16x8_t _sum4;
            float16x8_t _sum5;
            float16x8_t _sum6;
            float16x8_t _sum7;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vld1q_f16(pC);
                    _sum1 = _sum0;
                    _sum2 = _sum0;
                    _sum3 = _sum0;
                    _sum4 = _sum0;
                    _sum5 = _sum0;
                    _sum6 = _sum0;
                    _sum7 = _sum0;
                }
                else
                {
                    _sum0 = vdupq_n_f16(0.f);
                    _sum1 = vdupq_n_f16(0.f);
                    _sum2 = vdupq_n_f16(0.f);
                    _sum3 = vdupq_n_f16(0.f);
                    _sum4 = vdupq_n_f16(0.f);
                    _sum5 = vdupq_n_f16(0.f);
                    _sum6 = vdupq_n_f16(0.f);
                    _sum7 = vdupq_n_f16(0.f);
                }
            }
            else
            {
                _sum0 = vld1q_f16(outptr);
                _sum1 = vld1q_f16(outptr + 8);
                _sum2 = vld1q_f16(outptr + 8 * 2);
                _sum3 = vld1q_f16(outptr + 8 * 3);
                _sum4 = vld1q_f16(outptr + 8 * 4);
                _sum5 = vld1q_f16(outptr + 8 * 5);
                _sum6 = vld1q_f16(outptr + 8 * 6);
                _sum7 = vld1q_f16(outptr + 8 * 7);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x8_t _pA = vld1q_f16(pA);

                float16x8_t _pB = vld1q_f16(pB);

                _sum0 = vfmaq_laneq_f16(_sum0, _pA, _pB, 0);
                _sum1 = vfmaq_laneq_f16(_sum1, _pA, _pB, 1);
                _sum2 = vfmaq_laneq_f16(_sum2, _pA, _pB, 2);
                _sum3 = vfmaq_laneq_f16(_sum3, _pA, _pB, 3);
                _sum4 = vfmaq_laneq_f16(_sum4, _pA, _pB, 4);
                _sum5 = vfmaq_laneq_f16(_sum5, _pA, _pB, 5);
                _sum6 = vfmaq_laneq_f16(_sum6, _pA, _pB, 6);
                _sum7 = vfmaq_laneq_f16(_sum7, _pA, _pB, 7);

                pA += 8;
                pB += 8;
            }

            if (k_end)
            {
                if (out_elempack == 8)
                {
                    vst1q_f16(outptr0, _sum0);
                    vst1q_f16(outptr0 + 8, _sum1);
                    vst1q_f16(outptr0 + 8 * 2, _sum2);
                    vst1q_f16(outptr0 + 8 * 3, _sum3);
                    vst1q_f16(outptr0 + 8 * 4, _sum4);
                    vst1q_f16(outptr0 + 8 * 5, _sum5);
                    vst1q_f16(outptr0 + 8 * 6, _sum6);
                    vst1q_f16(outptr0 + 8 * 7, _sum7);
                    outptr0 += 64;
                }
                if (out_elempack == 4)
                {
                    vst1_f16(outptr0, vget_low_f16(_sum0));
                    vst1_f16(outptr0 + 4, vget_low_f16(_sum1));
                    vst1_f16(outptr0 + 4 * 2, vget_low_f16(_sum2));
                    vst1_f16(outptr0 + 4 * 3, vget_low_f16(_sum3));
                    vst1_f16(outptr0 + 4 * 4, vget_low_f16(_sum4));
                    vst1_f16(outptr0 + 4 * 5, vget_low_f16(_sum5));
                    vst1_f16(outptr0 + 4 * 6, vget_low_f16(_sum6));
                    vst1_f16(outptr0 + 4 * 7, vget_low_f16(_sum7));

                    vst1_f16(outptr0 + out_hstep * 4, vget_high_f16(_sum0));
                    vst1_f16(outptr0 + out_hstep * 4 + 4, vget_high_f16(_sum1));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 2, vget_high_f16(_sum2));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 3, vget_high_f16(_sum3));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 4, vget_high_f16(_sum4));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 5, vget_high_f16(_sum5));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 6, vget_high_f16(_sum6));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 7, vget_high_f16(_sum7));

                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    transpose8x8_ph(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7);

                    vst1q_f16(outptr0, _sum0);
                    vst1q_f16(outptr0 + out_hstep, _sum1);
                    vst1q_f16(outptr0 + out_hstep * 2, _sum2);
                    vst1q_f16(outptr0 + out_hstep * 3, _sum3);
                    vst1q_f16(outptr0 + out_hstep * 4, _sum4);
                    vst1q_f16(outptr0 + out_hstep * 5, _sum5);
                    vst1q_f16(outptr0 + out_hstep * 6, _sum6);
                    vst1q_f16(outptr0 + out_hstep * 7, _sum7);

                    outptr0 += 8;
                }
            }
            else
            {
                vst1q_f16(outptr, _sum0);
                vst1q_f16(outptr + 8, _sum1);
                vst1q_f16(outptr + 8 * 2, _sum2);
                vst1q_f16(outptr + 8 * 3, _sum3);
                vst1q_f16(outptr + 8 * 4, _sum4);
                vst1q_f16(outptr + 8 * 5, _sum5);
                vst1q_f16(outptr + 8 * 6, _sum6);
                vst1q_f16(outptr + 8 * 7, _sum7);
            }

            outptr += 64;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 8-row x 4-column micro-tile: every iteration multiplies the 8-lane rows of pA with
        // 4 columns of pB and keeps one float16x8_t accumulator per column.
        for (; jj + 3 < max_jj; jj += 4)
        {
            // A is re-read from the start of the current row panel for every column group
            const __fp16* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            // Operand map: %0 outptr, %1 pA, %2 pB, %3 outptr0, %8 pC, %9 max_kk, %10 k,
            // %11 k_end, %12 out_elempack, %13 out_hstep. x4/w4 is the only scratch GPR.
            // v28..v31 are the accumulators for columns 0..3.
            asm volatile(
                // k == 0 -> initialise accumulators, otherwise reload partial sums from outptr
                "cbz    %w10, 0f                    \n"

                "ld1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0]      \n"
                "b      3f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                // the same 8 values from pC seed every column accumulator
                "ld1    {v28.8h}, [%8]              \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                "eor    v28.16b, v28.16b, v28.16b   \n"

                // replicate the column-0 seed (pC values or zero) into columns 1..3
                "2:                                 \n"
                "mov    v29.16b, v28.16b            \n"
                "mov    v30.16b, v28.16b            \n"
                "mov    v31.16b, v28.16b            \n"

                "3:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // main loop, 4 k-steps per pass: 16 halves of B (v0, v1) and 32 halves of A (v4..v7)
                "4:                                 \n"
                "prfm   pldl1keep, [%2, #256]       \n"
                "ld1    {v0.8h, v1.8h}, [%2], #32   \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%1], #64 \n"
                "fmla   v28.8h, v4.8h, v0.h[0]      \n"
                "fmla   v29.8h, v4.8h, v0.h[1]      \n"
                "fmla   v30.8h, v4.8h, v0.h[2]      \n"
                "fmla   v31.8h, v4.8h, v0.h[3]      \n"
                "fmla   v28.8h, v5.8h, v0.h[4]      \n"
                "fmla   v29.8h, v5.8h, v0.h[5]      \n"
                "fmla   v30.8h, v5.8h, v0.h[6]      \n"
                "fmla   v31.8h, v5.8h, v0.h[7]      \n"
                "fmla   v28.8h, v6.8h, v1.h[0]      \n"
                "fmla   v29.8h, v6.8h, v1.h[1]      \n"
                "fmla   v30.8h, v6.8h, v1.h[2]      \n"
                "fmla   v31.8h, v6.8h, v1.h[3]      \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v28.8h, v7.8h, v1.h[4]      \n"
                "fmla   v29.8h, v7.8h, v1.h[5]      \n"
                "fmla   v30.8h, v7.8h, v1.h[6]      \n"
                "fmla   v31.8h, v7.8h, v1.h[7]      \n"
                "bne    4b                          \n"

                "5:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    7f                          \n"

                // remainder loop, one k-step per pass: 4 halves of B, 8 halves of A
                "6:                                 \n"
                "ld1    {v0.4h}, [%2], #8           \n"
                "ld1    {v4.8h}, [%1], #16          \n"
                "fmla   v28.8h, v4.8h, v0.h[0]      \n"
                "fmla   v29.8h, v4.8h, v0.h[1]      \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v30.8h, v4.8h, v0.h[2]      \n"
                "fmla   v31.8h, v4.8h, v0.h[3]      \n"
                "bne    6b                          \n"

                // k_end == 0 -> label 11 (save partial sums to outptr); otherwise write to outptr0
                "7:                                 \n"
                "tst    %w11, #255                  \n"
                "beq    11f                         \n"

                // if out_elempack == 8
                "cmp    %w12, #8                    \n"
                "bne    8f                          \n"

                "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%3], #64 \n"
                "b      10f                         \n"

                // if out_elempack == 4
                "8:                                 \n"
                "cmp    %w12, #4                    \n"
                "bne    9f                          \n"

                // v24/v25 = low halves (rows 0..3) of columns 0..3, v26/v27 = high halves (rows 4..7)
                "zip1   v24.2d, v28.2d, v29.2d      \n"
                "zip2   v26.2d, v28.2d, v29.2d      \n"
                "zip1   v25.2d, v30.2d, v31.2d      \n"
                "zip2   v27.2d, v30.2d, v31.2d      \n"

                // x4 = outptr0 + out_hstep * 4 elements (sxtw 1 scales by sizeof(__fp16))
                "lsl    w4, %w13, #2                \n"
                "add    x4, %3, w4, sxtw 1          \n"
                "st1    {v24.8h, v25.8h}, [%3], #32 \n"
                "st1    {v26.8h, v27.8h}, [x4]      \n"
                "b      10f                         \n"

                // if out_elempack == 1
                "9:                                 \n"
                // transpose8x4
                "zip1   v24.8h, v28.8h, v29.8h      \n"
                "zip2   v25.8h, v28.8h, v29.8h      \n"
                "zip1   v26.8h, v30.8h, v31.8h      \n"
                "zip2   v27.8h, v30.8h, v31.8h      \n"

                // after this step v20 = rows 0|1, v22 = rows 2|3, v24 = rows 4|5, v26 = rows 6|7
                "zip1   v20.4s, v24.4s, v26.4s      \n"
                "zip2   v22.4s, v24.4s, v26.4s      \n"
                "zip1   v24.4s, v25.4s, v27.4s      \n"
                "zip2   v26.4s, v25.4s, v27.4s      \n"

                // move the odd rows into the low 64 bits of their own registers
                "mov    v21.d[0], v20.d[1]          \n"
                "mov    v23.d[0], v22.d[1]          \n"
                "mov    v25.d[0], v24.d[1]          \n"
                "mov    v27.d[0], v26.d[1]          \n"

                // store 4 halves per row; x4 walks rows 1..7 with a stride of out_hstep elements
                "add    x4, %3, %w13, sxtw 1        \n"
                "st1    {v20.4h}, [%3], #8          \n"
                "st1    {v21.4h}, [x4]              \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v22.4h}, [x4]              \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v23.4h}, [x4]              \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v24.4h}, [x4]              \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v25.4h}, [x4]              \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v26.4h}, [x4]              \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v27.4h}, [x4]              \n"

                // final-output path: still step outptr past this tile (32 halves = 64 bytes)
                "10:                                \n"
                "add    %0, %0, #64                 \n"
                "b      12f                         \n"

                // partial-sum path: save accumulators to outptr and advance by 64 bytes
                "11:                                \n"
                "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                "12:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v1", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
            // Intrinsics version of the same 8x4 tile; _sumN is the accumulator for column N.
            float16x8_t _sum0;
            float16x8_t _sum1;
            float16x8_t _sum2;
            float16x8_t _sum3;

            if (k == 0)
            {
                if (pC)
                {
                    // the same 8 values from pC seed every column accumulator
                    _sum0 = vld1q_f16(pC);
                    _sum1 = _sum0;
                    _sum2 = _sum0;
                    _sum3 = _sum0;
                }
                else
                {
                    _sum0 = vdupq_n_f16(0.f);
                    _sum1 = vdupq_n_f16(0.f);
                    _sum2 = vdupq_n_f16(0.f);
                    _sum3 = vdupq_n_f16(0.f);
                }
            }
            else
            {
                // resume from the partial sums saved at outptr
                _sum0 = vld1q_f16(outptr);
                _sum1 = vld1q_f16(outptr + 8);
                _sum2 = vld1q_f16(outptr + 8 * 2);
                _sum3 = vld1q_f16(outptr + 8 * 3);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x8_t _pA = vld1q_f16(pA);

                float16x4_t _pB = vld1_f16(pB);

                // lane n of _pB is the B value for column n at this k-step
                _sum0 = vfmaq_lane_f16(_sum0, _pA, _pB, 0);
                _sum1 = vfmaq_lane_f16(_sum1, _pA, _pB, 1);
                _sum2 = vfmaq_lane_f16(_sum2, _pA, _pB, 2);
                _sum3 = vfmaq_lane_f16(_sum3, _pA, _pB, 3);

                pA += 8;
                pB += 4;
            }

            if (k_end)
            {
                // write to outptr0 in the layout selected by out_elempack
                if (out_elempack == 8)
                {
                    vst1q_f16(outptr0, _sum0);
                    vst1q_f16(outptr0 + 8, _sum1);
                    vst1q_f16(outptr0 + 8 * 2, _sum2);
                    vst1q_f16(outptr0 + 8 * 3, _sum3);
                    outptr0 += 32;
                }
                if (out_elempack == 4)
                {
                    // rows 0..3 go to outptr0, rows 4..7 go out_hstep * 4 elements further
                    vst1_f16(outptr0, vget_low_f16(_sum0));
                    vst1_f16(outptr0 + 4, vget_low_f16(_sum1));
                    vst1_f16(outptr0 + 4 * 2, vget_low_f16(_sum2));
                    vst1_f16(outptr0 + 4 * 3, vget_low_f16(_sum3));

                    vst1_f16(outptr0 + out_hstep * 4, vget_high_f16(_sum0));
                    vst1_f16(outptr0 + out_hstep * 4 + 4, vget_high_f16(_sum1));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 2, vget_high_f16(_sum2));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 3, vget_high_f16(_sum3));

                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    // after the transpose each 64-bit half of _sumN is stored as one output row of 4 columns
                    transpose8x4_ph(_sum0, _sum1, _sum2, _sum3);

                    vst1_f16(outptr0, vget_low_f16(_sum0));
                    vst1_f16(outptr0 + out_hstep * 1, vget_high_f16(_sum0));
                    vst1_f16(outptr0 + out_hstep * 2, vget_low_f16(_sum1));
                    vst1_f16(outptr0 + out_hstep * 3, vget_high_f16(_sum1));
                    vst1_f16(outptr0 + out_hstep * 4, vget_low_f16(_sum2));
                    vst1_f16(outptr0 + out_hstep * 5, vget_high_f16(_sum2));
                    vst1_f16(outptr0 + out_hstep * 6, vget_low_f16(_sum3));
                    vst1_f16(outptr0 + out_hstep * 7, vget_high_f16(_sum3));

                    outptr0 += 4;
                }
            }
            else
            {
                // not the last k-slice: save partial sums to outptr
                vst1q_f16(outptr, _sum0);
                vst1q_f16(outptr + 8, _sum1);
                vst1q_f16(outptr + 8 * 2, _sum2);
                vst1q_f16(outptr + 8 * 3, _sum3);
            }

            // outptr always advances by one 8x4 tile (32 halves)
            outptr += 32;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 8-row x 2-column micro-tile: every iteration multiplies the 8-lane rows of pA with
        // 2 columns of pB and keeps one float16x8_t accumulator per column.
        for (; jj + 1 < max_jj; jj += 2)
        {
            // A is re-read from the start of the current row panel for every column group
            const __fp16* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            // Operand map: %0 outptr, %1 pA, %2 pB, %3 outptr0, %8 pC, %9 max_kk, %10 k,
            // %11 k_end, %12 out_elempack, %13 out_hstep. x4/w4 is the only scratch GPR.
            // v30/v31 are the accumulators for columns 0/1; v28/v29 are temporaries.
            asm volatile(
                // k == 0 -> initialise accumulators, otherwise reload partial sums from outptr
                "cbz    %w10, 0f                    \n"

                "ld1    {v30.8h, v31.8h}, [%0]      \n"
                "b      3f                          \n"

                "0:                                 \n"
                // if pC
                "cbz    %8, 1f                      \n"

                // the same 8 values from pC seed both column accumulators
                "ld1    {v30.8h}, [%8]              \n"
                "b      2f                          \n"

                // else
                "1:                                 \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"

                "2:                                 \n"
                "mov    v31.16b, v30.16b            \n"

                "3:                                 \n"
                "lsr    w4, %w9, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // second accumulator pair, zeroed here and folded into v30/v31 after the loop
                // (presumably to split the fmla dependency chain)
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v29.16b, v29.16b, v29.16b   \n"
                // main loop, 4 k-steps per pass: 8 halves of B (v0) and 32 halves of A (v4..v7)
                "4:                                 \n"
                "prfm   pldl1keep, [%2, #128]       \n"
                "ld1    {v0.8h}, [%2], #16          \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%1], #64 \n"
                "fmla   v28.8h, v4.8h, v0.h[0]      \n"
                "fmla   v29.8h, v4.8h, v0.h[1]      \n"
                "fmla   v30.8h, v5.8h, v0.h[2]      \n"
                "fmla   v31.8h, v5.8h, v0.h[3]      \n"
                "fmla   v28.8h, v6.8h, v0.h[4]      \n"
                "fmla   v29.8h, v6.8h, v0.h[5]      \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v30.8h, v7.8h, v0.h[6]      \n"
                "fmla   v31.8h, v7.8h, v0.h[7]      \n"
                "bne    4b                          \n"
                "fadd   v30.8h, v30.8h, v28.8h      \n"
                "fadd   v31.8h, v31.8h, v29.8h      \n"

                "5:                                 \n"
                "and    w4, %w9, #3                 \n" // w4 = remain = max_kk & 3
                "cmp    w4, #0                      \n"
                "beq    7f                          \n"

                // remainder loop, one k-step per pass: 2 halves of B (one 32-bit lane), 8 halves of A
                "6:                                 \n"
                "ld1    {v0.s}[0], [%2], #4         \n"
                "ld1    {v4.8h}, [%1], #16          \n"
                "subs   w4, w4, #1                  \n"
                "fmla   v30.8h, v4.8h, v0.h[0]      \n"
                "fmla   v31.8h, v4.8h, v0.h[1]      \n"
                "bne    6b                          \n"

                // k_end == 0 -> label 11 (save partial sums to outptr); otherwise write to outptr0
                "7:                                 \n"
                "tst    %w11, #255                  \n"
                "beq    11f                         \n"

                // if out_elempack == 8
                "cmp    %w12, #8                    \n"
                "bne    8f                          \n"

                "st1    {v30.8h, v31.8h}, [%3], #32 \n"
                "b      10f                         \n"

                // if out_elempack == 4
                "8:                                 \n"
                "cmp    %w12, #4                    \n"
                "bne    9f                          \n"

                // v28 = low halves (rows 0..3) of both columns, v29 = high halves (rows 4..7)
                "zip1   v28.2d, v30.2d, v31.2d      \n"
                "zip2   v29.2d, v30.2d, v31.2d      \n"

                // x4 = outptr0 + out_hstep * 4 elements (sxtw 1 scales by sizeof(__fp16))
                "lsl    w4, %w13, #2                \n"
                "add    x4, %3, w4, sxtw 1          \n"
                "st1    {v28.8h}, [%3], #16         \n"
                "st1    {v29.8h}, [x4]              \n"
                "b      10f                         \n"

                // if out_elempack == 1
                "9:                                 \n"
                // transpose8x2
                "zip1   v28.8h, v30.8h, v31.8h      \n"
                "zip2   v29.8h, v30.8h, v31.8h      \n"

                // each 32-bit lane now holds (col0, col1) of one row; x4 walks rows 1..7
                // with a stride of out_hstep elements
                "add    x4, %3, %w13, sxtw 1        \n"
                "st1    {v28.s}[0], [%3], #4        \n"
                "st1    {v28.s}[1], [x4]            \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v28.s}[2], [x4]            \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v28.s}[3], [x4]            \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v29.s}[0], [x4]            \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v29.s}[1], [x4]            \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v29.s}[2], [x4]            \n"
                "add    x4, x4, %w13, sxtw 1        \n"
                "st1    {v29.s}[3], [x4]            \n"

                // final-output path: still step outptr past this tile. An 8x2 tile is
                // 16 halves = 32 bytes; this must match the post-increment at label 11 and
                // `outptr += 16` in the intrinsics path, otherwise every later tile reads and
                // writes the accumulation buffer at a shifted offset.
                "10:                                \n"
                "add    %0, %0, #32                 \n"
                "b      12f                         \n"

                // partial-sum path: save accumulators to outptr and advance by 32 bytes
                "11:                                \n"
                "st1    {v30.8h, v31.8h}, [%0], #32 \n"

                "12:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(pC),           // %8
                "r"(max_kk),       // %9
                "r"(k),            // %10
                "r"(k_end),        // %11
                "r"(out_elempack), // %12
                "r"(out_hstep)     // %13
                : "cc", "memory", "x4", "v0", "v4", "v5", "v6", "v7", "v28", "v29", "v30", "v31");
#else  // NCNN_GNU_INLINE_ASM
            // Intrinsics version of the same 8x2 tile; _sumN is the accumulator for column N.
            float16x8_t _sum0;
            float16x8_t _sum1;

            if (k == 0)
            {
                if (pC)
                {
                    // the same 8 values from pC seed both column accumulators
                    _sum0 = vld1q_f16(pC);
                    _sum1 = _sum0;
                }
                else
                {
                    _sum0 = vdupq_n_f16(0.f);
                    _sum1 = vdupq_n_f16(0.f);
                }
            }
            else
            {
                // resume from the partial sums saved at outptr
                _sum0 = vld1q_f16(outptr);
                _sum1 = vld1q_f16(outptr + 8);
            }

            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x8_t _pA = vld1q_f16(pA);

                // NOTE(review): this loads 4 halves although only lanes 0..1 are consumed and
                // pB advances by 2, so the last k-step reads 2 halves past this tile's B data.
                // Presumably the packed-B buffer is readable there - confirm against the allocator.
                float16x4_t _pB = vld1_f16(pB);

                _sum0 = vfmaq_lane_f16(_sum0, _pA, _pB, 0);
                _sum1 = vfmaq_lane_f16(_sum1, _pA, _pB, 1);

                pA += 8;
                pB += 2;
            }

            if (k_end)
            {
                // write to outptr0 in the layout selected by out_elempack
                if (out_elempack == 8)
                {
                    vst1q_f16(outptr0, _sum0);
                    vst1q_f16(outptr0 + 8, _sum1);
                    outptr0 += 16;
                }
                if (out_elempack == 4)
                {
                    // rows 0..3 go to outptr0, rows 4..7 go out_hstep * 4 elements further
                    vst1_f16(outptr0, vget_low_f16(_sum0));
                    vst1_f16(outptr0 + 4, vget_low_f16(_sum1));

                    vst1_f16(outptr0 + out_hstep * 4, vget_high_f16(_sum0));
                    vst1_f16(outptr0 + out_hstep * 4 + 4, vget_high_f16(_sum1));
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    // spill both accumulators and scatter one element per output row
                    __fp16 sum0[8];
                    __fp16 sum1[8];
                    vst1q_f16(sum0, _sum0);
                    vst1q_f16(sum1, _sum1);

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[out_hstep * 4] = sum0[4];
                    outptr0[out_hstep * 5] = sum0[5];
                    outptr0[out_hstep * 6] = sum0[6];
                    outptr0[out_hstep * 7] = sum0[7];

                    outptr0[1] = sum1[0];
                    outptr0[out_hstep + 1] = sum1[1];
                    outptr0[out_hstep * 2 + 1] = sum1[2];
                    outptr0[out_hstep * 3 + 1] = sum1[3];
                    outptr0[out_hstep * 4 + 1] = sum1[4];
                    outptr0[out_hstep * 5 + 1] = sum1[5];
                    outptr0[out_hstep * 6 + 1] = sum1[6];
                    outptr0[out_hstep * 7 + 1] = sum1[7];
                    outptr0 += 2;
                }
            }
            else
            {
                // not the last k-slice: save partial sums to outptr
                vst1q_f16(outptr, _sum0);
                vst1q_f16(outptr + 8, _sum1);
            }

            // outptr always advances by one 8x2 tile (16 halves)
            outptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // Column tail: one remaining column of pB against the 8-lane rows of pA.
        for (; jj < max_jj; jj += 1)
        {
            const __fp16* a_ptr = pAT;

            // First k-slice starts from pC (or zero); later slices resume from outptr.
            float16x8_t _acc;
            if (k != 0)
            {
                _acc = vld1q_f16(outptr);
            }
            else if (pC)
            {
                _acc = vld1q_f16(pC);
            }
            else
            {
                _acc = vdupq_n_f16(0.f);
            }

            for (int kk = 0; kk < max_kk; kk++)
            {
                // broadcast the single B value to all 8 lanes and accumulate
                _acc = vfmaq_f16(_acc, vld1q_f16(a_ptr), vld1q_dup_f16(pB));

                a_ptr += 8;
                pB += 1;
            }

            if (!k_end)
            {
                // more k-slices to come: save the partial sums to outptr
                vst1q_f16(outptr, _acc);
            }
            else if (out_elempack == 8)
            {
                vst1q_f16(outptr0, _acc);
                outptr0 += 8;
            }
            else if (out_elempack == 4)
            {
                // rows 0..3 go to outptr0, rows 4..7 go out_hstep * 4 elements further
                vst1_f16(outptr0, vget_low_f16(_acc));
                vst1_f16(outptr0 + out_hstep * 4, vget_high_f16(_acc));
                outptr0 += 4;
            }
            else if (out_elempack == 1)
            {
                // spill the vector and scatter one element per output row
                __fp16 lanes[8];
                vst1q_f16(lanes, _acc);

                for (int r = 0; r < 8; r++)
                {
                    outptr0[out_hstep * r] = lanes[r];
                }
                outptr0++;
            }

            // outptr always advances by one 8x1 tile
            outptr += 8;
        }

        pAT += max_kk * 8;
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        __fp16* outptr0 = (__fp16*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        const __fp16* pB = pBT;

        if (pC)
        {
            pC = (const __fp16*)CT_tile + i + ii;
        }

        int jj = 0;
        // 4-row x 12-column micro-tile: one float16x4_t accumulator per column.
        for (; jj + 11 < max_jj; jj += 12)
        {
            const __fp16* pA = pAT;

            float16x4_t _c0, _c1, _c2, _c3;
            float16x4_t _c4, _c5, _c6, _c7;
            float16x4_t _c8, _c9, _c10, _c11;

            if (k != 0)
            {
                // resume from the partial sums saved at outptr
                _c0 = vld1_f16(outptr);
                _c1 = vld1_f16(outptr + 4);
                _c2 = vld1_f16(outptr + 8);
                _c3 = vld1_f16(outptr + 12);
                _c4 = vld1_f16(outptr + 16);
                _c5 = vld1_f16(outptr + 20);
                _c6 = vld1_f16(outptr + 24);
                _c7 = vld1_f16(outptr + 28);
                _c8 = vld1_f16(outptr + 32);
                _c9 = vld1_f16(outptr + 36);
                _c10 = vld1_f16(outptr + 40);
                _c11 = vld1_f16(outptr + 44);
            }
            else
            {
                // first k-slice: every column starts from the 4 values at pC, or from zero
                const float16x4_t _seed = pC ? vld1_f16(pC) : vdup_n_f16(0.f);
                _c0 = _seed;
                _c1 = _seed;
                _c2 = _seed;
                _c3 = _seed;
                _c4 = _seed;
                _c5 = _seed;
                _c6 = _seed;
                _c7 = _seed;
                _c8 = _seed;
                _c9 = _seed;
                _c10 = _seed;
                _c11 = _seed;
            }

            for (int kk = 0; kk < max_kk; kk++)
            {
                const float16x4_t _a = vld1_f16(pA);
                const float16x4_t _b0 = vld1_f16(pB);
                const float16x4_t _b1 = vld1_f16(pB + 4);
                const float16x4_t _b2 = vld1_f16(pB + 8);

                // columns 0..3 use _b0, 4..7 use _b1, 8..11 use _b2
                _c0 = vfma_lane_f16(_c0, _a, _b0, 0);
                _c1 = vfma_lane_f16(_c1, _a, _b0, 1);
                _c2 = vfma_lane_f16(_c2, _a, _b0, 2);
                _c3 = vfma_lane_f16(_c3, _a, _b0, 3);
                _c4 = vfma_lane_f16(_c4, _a, _b1, 0);
                _c5 = vfma_lane_f16(_c5, _a, _b1, 1);
                _c6 = vfma_lane_f16(_c6, _a, _b1, 2);
                _c7 = vfma_lane_f16(_c7, _a, _b1, 3);
                _c8 = vfma_lane_f16(_c8, _a, _b2, 0);
                _c9 = vfma_lane_f16(_c9, _a, _b2, 1);
                _c10 = vfma_lane_f16(_c10, _a, _b2, 2);
                _c11 = vfma_lane_f16(_c11, _a, _b2, 3);

                pA += 4;
                pB += 12;
            }

            if (!k_end)
            {
                // more k-slices to come: save the partial sums to outptr
                vst1_f16(outptr, _c0);
                vst1_f16(outptr + 4, _c1);
                vst1_f16(outptr + 8, _c2);
                vst1_f16(outptr + 12, _c3);
                vst1_f16(outptr + 16, _c4);
                vst1_f16(outptr + 20, _c5);
                vst1_f16(outptr + 24, _c6);
                vst1_f16(outptr + 28, _c7);
                vst1_f16(outptr + 32, _c8);
                vst1_f16(outptr + 36, _c9);
                vst1_f16(outptr + 40, _c10);
                vst1_f16(outptr + 44, _c11);
            }
            else if (out_elempack == 4)
            {
                // packed output: 12 consecutive groups of 4 rows
                vst1_f16(outptr0, _c0);
                vst1_f16(outptr0 + 4, _c1);
                vst1_f16(outptr0 + 8, _c2);
                vst1_f16(outptr0 + 12, _c3);
                vst1_f16(outptr0 + 16, _c4);
                vst1_f16(outptr0 + 20, _c5);
                vst1_f16(outptr0 + 24, _c6);
                vst1_f16(outptr0 + 28, _c7);
                vst1_f16(outptr0 + 32, _c8);
                vst1_f16(outptr0 + 36, _c9);
                vst1_f16(outptr0 + 40, _c10);
                vst1_f16(outptr0 + 44, _c11);
                outptr0 += 48;
            }
            else if (out_elempack == 1)
            {
                // after the transpose, three consecutive vectors are stored as one output row of 12 columns
                transpose4x12_ph(_c0, _c1, _c2, _c3, _c4, _c5, _c6, _c7, _c8, _c9, _c10, _c11);

                __fp16* row0 = outptr0;
                __fp16* row1 = outptr0 + out_hstep;
                __fp16* row2 = outptr0 + out_hstep * 2;
                __fp16* row3 = outptr0 + out_hstep * 3;

                vst1_f16(row0, _c0);
                vst1_f16(row0 + 4, _c1);
                vst1_f16(row0 + 8, _c2);
                vst1_f16(row1, _c3);
                vst1_f16(row1 + 4, _c4);
                vst1_f16(row1 + 8, _c5);
                vst1_f16(row2, _c6);
                vst1_f16(row2 + 4, _c7);
                vst1_f16(row2 + 8, _c8);
                vst1_f16(row3, _c9);
                vst1_f16(row3 + 4, _c10);
                vst1_f16(row3 + 8, _c11);
                outptr0 += 12;
            }

            // outptr always advances by one 4x12 tile
            outptr += 48;
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            float16x4_t _sum0;
            float16x4_t _sum1;
            float16x4_t _sum2;
            float16x4_t _sum3;
            float16x4_t _sum4;
            float16x4_t _sum5;
            float16x4_t _sum6;
            float16x4_t _sum7;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vld1_f16(pC);
                    _sum1 = _sum0;
                    _sum2 = _sum0;
                    _sum3 = _sum0;
                    _sum4 = _sum0;
                    _sum5 = _sum0;
                    _sum6 = _sum0;
                    _sum7 = _sum0;
                }
                else
                {
                    _sum0 = vdup_n_f16(0.f);
                    _sum1 = vdup_n_f16(0.f);
                    _sum2 = vdup_n_f16(0.f);
                    _sum3 = vdup_n_f16(0.f);
                    _sum4 = vdup_n_f16(0.f);
                    _sum5 = vdup_n_f16(0.f);
                    _sum6 = vdup_n_f16(0.f);
                    _sum7 = vdup_n_f16(0.f);
                }
            }
            else
            {
                _sum0 = vld1_f16(outptr);
                _sum1 = vld1_f16(outptr + 4 * 1);
                _sum2 = vld1_f16(outptr + 4 * 2);
                _sum3 = vld1_f16(outptr + 4 * 3);
                _sum4 = vld1_f16(outptr + 4 * 4);
                _sum5 = vld1_f16(outptr + 4 * 5);
                _sum6 = vld1_f16(outptr + 4 * 6);
                _sum7 = vld1_f16(outptr + 4 * 7);
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pA = vld1_f16(pA);
                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);

                _sum0 = vfma_lane_f16(_sum0, _pA, _pB0, 0);
                _sum1 = vfma_lane_f16(_sum1, _pA, _pB0, 1);
                _sum2 = vfma_lane_f16(_sum2, _pA, _pB0, 2);
                _sum3 = vfma_lane_f16(_sum3, _pA, _pB0, 3);
                _sum4 = vfma_lane_f16(_sum4, _pA, _pB1, 0);
                _sum5 = vfma_lane_f16(_sum5, _pA, _pB1, 1);
                _sum6 = vfma_lane_f16(_sum6, _pA, _pB1, 2);
                _sum7 = vfma_lane_f16(_sum7, _pA, _pB1, 3);

                pA += 4;
                pB += 8;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + 4, _sum1);
                    vst1_f16(outptr0 + 4 * 2, _sum2);
                    vst1_f16(outptr0 + 4 * 3, _sum3);
                    vst1_f16(outptr0 + 4 * 4, _sum4);
                    vst1_f16(outptr0 + 4 * 5, _sum5);
                    vst1_f16(outptr0 + 4 * 6, _sum6);
                    vst1_f16(outptr0 + 4 * 7, _sum7);
                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    transpose4x8_ph(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7);

                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + 4, _sum1);
                    vst1_f16(outptr0 + out_hstep, _sum2);
                    vst1_f16(outptr0 + out_hstep + 4, _sum3);
                    vst1_f16(outptr0 + out_hstep * 2, _sum4);
                    vst1_f16(outptr0 + out_hstep * 2 + 4, _sum5);
                    vst1_f16(outptr0 + out_hstep * 3, _sum6);
                    vst1_f16(outptr0 + out_hstep * 3 + 4, _sum7);
                    outptr0 += 8;
                }
            }
            else
            {
                vst1_f16(outptr, _sum0);
                vst1_f16(outptr + 4, _sum1);
                vst1_f16(outptr + 4 * 2, _sum2);
                vst1_f16(outptr + 4 * 3, _sum3);
                vst1_f16(outptr + 4 * 4, _sum4);
                vst1_f16(outptr + 4 * 5, _sum5);
                vst1_f16(outptr + 4 * 6, _sum6);
                vst1_f16(outptr + 4 * 7, _sum7);
            }

            outptr += 32;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float16x4_t _sum0;
            float16x4_t _sum1;
            float16x4_t _sum2;
            float16x4_t _sum3;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vld1_f16(pC);
                    _sum1 = _sum0;
                    _sum2 = _sum0;
                    _sum3 = _sum0;
                }
                else
                {
                    _sum0 = vdup_n_f16(0.f);
                    _sum1 = vdup_n_f16(0.f);
                    _sum2 = vdup_n_f16(0.f);
                    _sum3 = vdup_n_f16(0.f);
                }
            }
            else
            {
                _sum0 = vld1_f16(outptr);
                _sum1 = vld1_f16(outptr + 4);
                _sum2 = vld1_f16(outptr + 4 * 2);
                _sum3 = vld1_f16(outptr + 4 * 3);
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pA = vld1_f16(pA);
                float16x4_t _pB = vld1_f16(pB);

                _sum0 = vfma_lane_f16(_sum0, _pA, _pB, 0);
                _sum1 = vfma_lane_f16(_sum1, _pA, _pB, 1);
                _sum2 = vfma_lane_f16(_sum2, _pA, _pB, 2);
                _sum3 = vfma_lane_f16(_sum3, _pA, _pB, 3);

                pA += 4;
                pB += 4;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + 4, _sum1);
                    vst1_f16(outptr0 + 4 * 2, _sum2);
                    vst1_f16(outptr0 + 4 * 3, _sum3);
                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    transpose4x4_ph(_sum0, _sum1, _sum2, _sum3);

                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + out_hstep, _sum1);
                    vst1_f16(outptr0 + out_hstep * 2, _sum2);
                    vst1_f16(outptr0 + out_hstep * 3, _sum3);
                    outptr0 += 4;
                }
            }
            else
            {
                vst1_f16(outptr, _sum0);
                vst1_f16(outptr + 4, _sum1);
                vst1_f16(outptr + 4 * 2, _sum2);
                vst1_f16(outptr + 4 * 3, _sum3);
            }

            outptr += 16;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            float16x4_t _sum0;
            float16x4_t _sum1;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vld1_f16(pC);
                    _sum1 = _sum0;
                }
                else
                {
                    _sum0 = vdup_n_f16(0.f);
                    _sum1 = vdup_n_f16(0.f);
                }
            }
            else
            {
                _sum0 = vld1_f16(outptr);
                _sum1 = vld1_f16(outptr + 4);
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pA = vld1_f16(pA);

                _sum0 = vfma_n_f16(_sum0, _pA, pB[0]);
                _sum1 = vfma_n_f16(_sum1, _pA, pB[1]);

                pA += 4;
                pB += 2;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + 4, _sum1);
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    __fp16 sum0[4];
                    __fp16 sum1[4];
                    vst1_f16(sum0, _sum0);
                    vst1_f16(sum1, _sum1);

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[1] = sum1[0];
                    outptr0[out_hstep + 1] = sum1[1];
                    outptr0[out_hstep * 2 + 1] = sum1[2];
                    outptr0[out_hstep * 3 + 1] = sum1[3];
                    outptr0 += 2;
                }
            }
            else
            {
                vst1_f16(outptr, _sum0);
                vst1_f16(outptr + 4, _sum1);
            }

            outptr += 8;
        }
        for (; jj < max_jj; jj += 1)
        {
            float16x4_t _sum0;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vld1_f16(pC);
                }
                else
                {
                    _sum0 = vdup_n_f16(0.f);
                }
            }
            else
            {
                _sum0 = vld1_f16(outptr);
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pA = vld1_f16(pA);
                float16x4_t _pB = vdup_n_f16(pB[0]);

                _sum0 = vfma_f16(_sum0, _pA, _pB);

                pA += 4;
                pB += 1;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_f16(outptr0, _sum0);
                    outptr0 += 4;
                }
                if (out_elempack == 1)
                {
                    __fp16 sum0[4];
                    vst1_f16(sum0, _sum0);

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0++;
                }
            }
            else
            {
                vst1_f16(outptr, _sum0);
            }

            outptr += 4;
        }

        pAT += max_kk * 4;
    }
    for (; ii + 1 < max_ii; ii += 2)
    {
        __fp16* outptr0 = (__fp16*)top_blob + (i + ii) * out_hstep + j;

        const __fp16* pB = pBT;

        if (pC)
        {
            pC = (const __fp16*)CT_tile + i + ii;
        }

        int jj = 0;
        for (; jj + 11 < max_jj; jj += 12)
        {
            float16x4_t _sum00;
            float16x4_t _sum01;
            float16x4_t _sum02;
            float16x4_t _sum10;
            float16x4_t _sum11;
            float16x4_t _sum12;

            if (k == 0)
            {
                if (pC)
                {
                    _sum00 = vdup_n_f16(pC[0]);
                    _sum01 = vdup_n_f16(pC[0]);
                    _sum02 = vdup_n_f16(pC[0]);
                    _sum10 = vdup_n_f16(pC[1]);
                    _sum11 = vdup_n_f16(pC[1]);
                    _sum12 = vdup_n_f16(pC[1]);
                }
                else
                {
                    _sum00 = vdup_n_f16(0.f);
                    _sum01 = vdup_n_f16(0.f);
                    _sum02 = vdup_n_f16(0.f);
                    _sum10 = vdup_n_f16(0.f);
                    _sum11 = vdup_n_f16(0.f);
                    _sum12 = vdup_n_f16(0.f);
                }
            }
            else
            {
                float16x4x2_t _tmp01 = vld2_f16(outptr);
                float16x4x2_t _tmp23 = vld2_f16(outptr + 8);
                float16x4x2_t _tmp45 = vld2_f16(outptr + 16);
                _sum00 = _tmp01.val[0];
                _sum01 = _tmp23.val[0];
                _sum02 = _tmp45.val[0];
                _sum10 = _tmp01.val[1];
                _sum11 = _tmp23.val[1];
                _sum12 = _tmp45.val[1];
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);
                float16x4_t _pB2 = vld1_f16(pB + 8);

                float16x4_t _pA0 = vld1_dup_f16(pA);
                float16x4_t _pA1 = vld1_dup_f16(pA + 1);

                _sum00 = vfma_f16(_sum00, _pB0, _pA0);
                _sum01 = vfma_f16(_sum01, _pB1, _pA0);
                _sum02 = vfma_f16(_sum02, _pB2, _pA0);
                _sum10 = vfma_f16(_sum10, _pB0, _pA1);
                _sum11 = vfma_f16(_sum11, _pB1, _pA1);
                _sum12 = vfma_f16(_sum12, _pB2, _pA1);

                pA += 2;
                pB += 12;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_f16(outptr0, _sum00);
                    vst1_f16(outptr0 + 4, _sum01);
                    vst1_f16(outptr0 + 8, _sum02);
                    vst1_f16(outptr0 + out_hstep, _sum10);
                    vst1_f16(outptr0 + out_hstep + 4, _sum11);
                    vst1_f16(outptr0 + out_hstep + 8, _sum12);
                    outptr0 += 12;
                }
            }
            else
            {
                float16x4x2_t _tmp01;
                _tmp01.val[0] = _sum00;
                _tmp01.val[1] = _sum10;
                float16x4x2_t _tmp23;
                _tmp23.val[0] = _sum01;
                _tmp23.val[1] = _sum11;
                float16x4x2_t _tmp45;
                _tmp45.val[0] = _sum02;
                _tmp45.val[1] = _sum12;
                vst2_f16(outptr, _tmp01);
                vst2_f16(outptr + 8, _tmp23);
                vst2_f16(outptr + 16, _tmp45);
            }

            outptr += 24;
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            float16x4_t _sum00;
            float16x4_t _sum01;
            float16x4_t _sum10;
            float16x4_t _sum11;

            if (k == 0)
            {
                if (pC)
                {
                    _sum00 = vdup_n_f16(pC[0]);
                    _sum01 = vdup_n_f16(pC[0]);
                    _sum10 = vdup_n_f16(pC[1]);
                    _sum11 = vdup_n_f16(pC[1]);
                }
                else
                {
                    _sum00 = vdup_n_f16(0.f);
                    _sum01 = vdup_n_f16(0.f);
                    _sum10 = vdup_n_f16(0.f);
                    _sum11 = vdup_n_f16(0.f);
                }
            }
            else
            {
                float16x4x2_t _tmp01 = vld2_f16(outptr);
                float16x4x2_t _tmp23 = vld2_f16(outptr + 8);
                _sum00 = _tmp01.val[0];
                _sum01 = _tmp23.val[0];
                _sum10 = _tmp01.val[1];
                _sum11 = _tmp23.val[1];
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);

                float16x4_t _pA0 = vld1_dup_f16(pA);
                float16x4_t _pA1 = vld1_dup_f16(pA + 1);

                _sum00 = vfma_f16(_sum00, _pB0, _pA0);
                _sum01 = vfma_f16(_sum01, _pB1, _pA0);
                _sum10 = vfma_f16(_sum10, _pB0, _pA1);
                _sum11 = vfma_f16(_sum11, _pB1, _pA1);

                pA += 2;
                pB += 8;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_f16(outptr0, _sum00);
                    vst1_f16(outptr0 + 4, _sum01);
                    vst1_f16(outptr0 + out_hstep, _sum10);
                    vst1_f16(outptr0 + out_hstep + 4, _sum11);
                    outptr0 += 8;
                }
            }
            else
            {
                float16x4x2_t _tmp01;
                _tmp01.val[0] = _sum00;
                _tmp01.val[1] = _sum10;
                float16x4x2_t _tmp23;
                _tmp23.val[0] = _sum01;
                _tmp23.val[1] = _sum11;
                vst2_f16(outptr, _tmp01);
                vst2_f16(outptr + 8, _tmp23);
            }

            outptr += 16;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float16x4_t _sum0;
            float16x4_t _sum1;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vdup_n_f16(pC[0]);
                    _sum1 = vdup_n_f16(pC[1]);
                }
                else
                {
                    _sum0 = vdup_n_f16(0.f);
                    _sum1 = vdup_n_f16(0.f);
                }
            }
            else
            {
                float16x4x2_t _tmp01 = vld2_f16(outptr);
                _sum0 = _tmp01.val[0];
                _sum1 = _tmp01.val[1];
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pB = vld1_f16(pB);

                _sum0 = vfma_n_f16(_sum0, _pB, pA[0]);
                _sum1 = vfma_n_f16(_sum1, _pB, pA[1]);

                pA += 2;
                pB += 4;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_f16(outptr0, (_sum0));
                    vst1_f16(outptr0 + out_hstep, (_sum1));
                    outptr0 += 4;
                }
            }
            else
            {
                float16x4x2_t _tmp01;
                _tmp01.val[0] = _sum0;
                _tmp01.val[1] = _sum1;
                vst2_f16(outptr, _tmp01);
            }

            outptr += 8;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            __fp16 sum00;
            __fp16 sum01;
            __fp16 sum10;
            __fp16 sum11;

            if (k == 0)
            {
                if (pC)
                {
                    sum00 = pC[0];
                    sum01 = pC[1];
                    sum10 = pC[0];
                    sum11 = pC[1];
                }
                else
                {
                    sum00 = 0.f;
                    sum01 = 0.f;
                    sum10 = 0.f;
                    sum11 = 0.f;
                }
            }
            else
            {
                sum00 = outptr[0];
                sum01 = outptr[1];
                sum10 = outptr[2];
                sum11 = outptr[3];
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum00 += pA[0] * pB[0];
                sum01 += pA[1] * pB[0];
                sum10 += pA[0] * pB[1];
                sum11 += pA[1] * pB[1];

                pA += 2;
                pB += 2;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum00;
                    outptr0[1] = sum10;
                    outptr0[out_hstep] = sum01;
                    outptr0[out_hstep + 1] = sum11;
                    outptr0 += 2;
                }
            }
            else
            {
                outptr[0] = sum00;
                outptr[1] = sum01;
                outptr[2] = sum10;
                outptr[3] = sum11;
            }

            outptr += 4;
        }
        for (; jj < max_jj; jj += 1)
        {
            __fp16 sum0;
            __fp16 sum1;

            if (k == 0)
            {
                if (pC)
                {
                    sum0 = pC[0];
                    sum1 = pC[1];
                }
                else
                {
                    sum0 = 0.f;
                    sum1 = 0.f;
                }
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum0 += pA[0] * pB[0];
                sum1 += pA[1] * pB[0];
                pA += 2;
                pB += 1;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum0;
                    outptr0[out_hstep] = sum1;
                    outptr0++;
                }
            }
            else
            {
                outptr[0] = sum0;
                outptr[1] = sum1;
            }

            outptr += 2;
        }

        pAT += max_kk * 2;
    }
    for (; ii < max_ii; ii += 1)
    {
        __fp16* outptr0 = (__fp16*)top_blob + (i + ii) * out_hstep + j;

        const __fp16* pB = pBT;

        if (pC)
        {
            pC = (const __fp16*)CT_tile + i + ii;
        }

        int jj = 0;
        for (; jj + 11 < max_jj; jj += 12)
        {
            float16x4_t _sum0;
            float16x4_t _sum1;
            float16x4_t _sum2;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vdup_n_f16(pC[0]);
                    _sum1 = vdup_n_f16(pC[0]);
                    _sum2 = vdup_n_f16(pC[0]);
                }
                else
                {
                    _sum0 = vdup_n_f16(0.f);
                    _sum1 = vdup_n_f16(0.f);
                    _sum2 = vdup_n_f16(0.f);
                }
            }
            else
            {
                _sum0 = vld1_f16(outptr);
                _sum1 = vld1_f16(outptr + 4);
                _sum2 = vld1_f16(outptr + 8);
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);
                float16x4_t _pB2 = vld1_f16(pB + 8);

                float16x4_t _pA0 = vdup_n_f16(pA[0]);

                _sum0 = vfma_f16(_sum0, _pA0, _pB0);
                _sum1 = vfma_f16(_sum1, _pA0, _pB1);
                _sum2 = vfma_f16(_sum2, _pA0, _pB2);

                pA += 1;
                pB += 12;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + 4, _sum1);
                    vst1_f16(outptr0 + 8, _sum2);
                    outptr0 += 12;
                }
            }
            else
            {
                vst1_f16(outptr, _sum0);
                vst1_f16(outptr + 4, _sum1);
                vst1_f16(outptr + 8, _sum2);
            }

            outptr += 12;
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            float16x4_t _sum0;
            float16x4_t _sum1;

            if (k == 0)
            {
                if (pC)
                {
                    _sum0 = vdup_n_f16(pC[0]);
                    _sum1 = vdup_n_f16(pC[0]);
                }
                else
                {
                    _sum0 = vdup_n_f16(0.f);
                    _sum1 = vdup_n_f16(0.f);
                }
            }
            else
            {
                _sum0 = vld1_f16(outptr);
                _sum1 = vld1_f16(outptr + 4);
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);

                float16x4_t _pA0 = vdup_n_f16(pA[0]);

                _sum0 = vfma_f16(_sum0, _pA0, _pB0);
                _sum1 = vfma_f16(_sum1, _pA0, _pB1);

                pA += 1;
                pB += 8;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + 4, _sum1);
                    outptr0 += 8;
                }
            }
            else
            {
                vst1_f16(outptr, _sum0);
                vst1_f16(outptr + 4, _sum1);
            }

            outptr += 8;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float16x4_t _sum;

            if (k == 0)
            {
                if (pC)
                {
                    _sum = vdup_n_f16(pC[0]);
                }
                else
                {
                    _sum = vdup_n_f16(0.f);
                }
            }
            else
            {
                _sum = vld1_f16(outptr);
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pB = vld1_f16(pB);
                float16x4_t _pA = vdup_n_f16(pA[0]);

                _sum = vfma_f16(_sum, _pA, _pB);

                pA += 1;
                pB += 4;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_f16(outptr0, _sum);
                    outptr0 += 4;
                }
            }
            else
            {
                vst1_f16(outptr, _sum);
            }

            outptr += 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            __fp16 sum0;
            __fp16 sum1;

            if (k == 0)
            {
                if (pC)
                {
                    sum0 = pC[0];
                    sum1 = pC[0];
                }
                else
                {
                    sum0 = 0.f;
                    sum1 = 0.f;
                }
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum0 += pA[0] * pB[0];
                sum1 += pA[0] * pB[1];

                pA += 1;
                pB += 2;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum0;
                    outptr0[1] = sum1;
                    outptr0 += 2;
                }
            }
            else
            {
                outptr[0] = sum0;
                outptr[1] = sum1;
            }

            outptr += 2;
        }
        for (; jj < max_jj; jj += 1)
        {
            __fp16 sum;

            if (k == 0)
            {
                if (pC)
                {
                    sum = pC[0];
                }
                else
                {
                    sum = 0.f;
                }
            }
            else
            {
                sum = outptr[0];
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum += pA[0] * pB[0];

                pA += 1;
                pB += 1;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum;
                    outptr0++;
                }
            }
            else
            {
                outptr[0] = sum;
            }

            outptr += 1;
        }

        pAT += max_kk;
    }
}

// Pick the M/N/K tile sizes used by the fp16sa im2col-gemm convolution.
//
//   M, N, K   problem dimensions of the gemm
//   TILE_M    (out) tile size along M, a multiple of 8
//   TILE_N    (out) tile size along N, a multiple of 4; written only when N > 0
//   TILE_K    (out) tile size along K, a multiple of 8
//   nT        thread count; 0 selects get_physical_big_cpu_count()
//
// All sizes are derived from the level-2 cache size expressed in 2-byte elements.
static void convolution_im2col_gemm_get_optimal_tile_mnk_fp16sa(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
    // cache capacity counted in 16-bit elements
    const int l2_cache_size_fp16 = (int)(get_cpu_level2_cache_size() / sizeof(unsigned short));

    if (nT == 0)
        nT = get_physical_big_cpu_count();

    // ---- K: start from the largest cache-derived tile (prefer not to split K),
    //         then even it out over the resulting number of K tiles
    const int k_budget = (l2_cache_size_fp16 - 32) / 12;
    TILE_K = std::max(8, k_budget / 8 * 8);

    const int k_tiles = (K + TILE_K - 1) / TILE_K;
    TILE_K = std::min(TILE_K, ((K + k_tiles - 1) / k_tiles + 7) / 8 * 8);

    // ---- M: begin with chunks of at most 64 rows, rounded up to a multiple of 8
    const int m_chunks = (M + 63) / 64;
    TILE_M = std::max(8, ((M + m_chunks - 1) / m_chunks + 7) / 8 * 8);

    // scale by the usable core count, then rebalance across the M tiles
    TILE_M *= std::min(nT, get_physical_cpu_count());

    const int m_tiles = (M + TILE_M - 1) / TILE_M;
    TILE_M = std::min(TILE_M, ((M + m_tiles - 1) / m_tiles + 7) / 8 * 8);

    // with several threads, shrink the tile to roughly a per-thread share
    if (nT > 1)
        TILE_M = std::min(TILE_M, (std::max(1, TILE_M / nT) + 7) / 8 * 8);

    // ---- N: only solved when the caller supplies a positive N
    if (N <= 0)
        return;

    // elements left after reserving TILE_M * TILE_K; the divisor is TILE_K when
    // a single tile covers all of K, and TILE_M + TILE_K when K is split
    const int free_elems = l2_cache_size_fp16 - TILE_M * TILE_K;
    const int divisor = (TILE_K >= K) ? TILE_K : (TILE_M + TILE_K);
    const int n_budget = free_elems / divisor;

    TILE_N = std::max(4, n_budget / 4 * 4);

    const int n_tiles = (N + TILE_N - 1) / TILE_N;
    TILE_N = std::min(TILE_N, ((N + n_tiles - 1) / n_tiles + 3) / 4 * 4);
}

static void convolution_im2col_gemm_transform_kernel_fp16sa(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt)
{
    // NCNN_LOGE("convolution_im2col_gemm_transform_kernel_fp16sa %p", kernel.data);
    const int maxk = kernel_w * kernel_h;

    const int M = outch;
    const int K = inch * maxk;

    int TILE_M, TILE_N, TILE_K;
    convolution_im2col_gemm_get_optimal_tile_mnk_fp16sa(M, 0, K, TILE_M, TILE_N, TILE_K, opt.num_threads);

    const int nn_M = (M + TILE_M - 1) / TILE_M;

    int elempack = 1;
    if (opt.use_packing_layout)
    {
        elempack = inch % 8 == 0 ? 8 : inch % 4 == 0 ? 4 : 1;
    }

    // maxk-inch-outch to pa-maxk-inch/pa-outch
    Mat A_data;
    if (maxk == 1)
    {
        cast_float32_to_float16(kernel, A_data);
        A_data = A_data.reshape(maxk * inch, outch);
    }
    else
    {
        Mat weight_data_r2 = kernel.reshape(maxk, inch, outch);

        A_data.create(maxk * inch, outch, (size_t)2u);

        for (int q = 0; q < outch; q += 1)
        {
            __fp16* g00 = A_data.row<__fp16>(q);

            for (int p = 0; p + (elempack - 1) < inch; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        const float* k00 = weight_data_r2.channel(q).row(p + i);
                        g00[0] = (__fp16)k00[k];
                        g00++;
                    }
                }
            }
        }
    }

    AT.create(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, (size_t)2u);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int ppj = 0; ppj < nn_M; ppj++)
    {
        const int i = ppj * TILE_M;

        const int max_ii = std::min((M - i), TILE_M);

        for (int k = 0; k < K; k += TILE_K)
        {
            const int max_kk = std::min((K - k), TILE_K);

            Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1);

            convolution_im2col_pack_A_tile_bf16_fp16(A_data, AT_tile, i, max_ii, k, max_kk);
        }
    }
}

// Run a convolution as a tiled im2col + gemm in fp16.
//
// The gemm dimensions are taken from the blobs:
//   M = top_blob.c * top_blob.elempack
//   N = top_blob.w * top_blob.h
//   K = bottom_blob.c * bottom_blob.elempack * kernel_w * kernel_h
//
//   AT    pre-packed weights, indexed as channel = M tile, row = K tile
//   bias  forwarded unchanged to the tile kernel
//   nT    thread count used for tile selection and both parallel loops
//
// Returns 0 on success, -100 if a workspace buffer could not be allocated.
static int convolution_im2col_gemm_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, const Mat& bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
    const int kernel_area = kernel_w * kernel_h;

    const int M = top_blob.c * top_blob.elempack;
    const int N = top_blob.w * top_blob.h;
    const int K = bottom_blob.c * bottom_blob.elempack * kernel_area;

    int tile_m, tile_n, tile_k;
    convolution_im2col_gemm_get_optimal_tile_mnk_fp16sa(M, N, K, tile_m, tile_n, tile_k, nT);

    const int m_tiles = (M + tile_m - 1) / tile_m;
    const int n_tiles = (N + tile_n - 1) / tile_n;
    const int k_tiles = (K + tile_k - 1) / tile_k;

    // stage 1: im2col the input into BT, one row per (N tile, K tile) pair
    Mat BT(tile_k * tile_n, k_tiles, n_tiles, 2u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nk_tiles = n_tiles * k_tiles;

    #pragma omp parallel for num_threads(nT)
    for (int t = 0; t < nk_tiles; t++)
    {
        // flattened index -> (N tile, K tile)
        const int nt = t / k_tiles;
        const int kt = t % k_tiles;

        const int col0 = nt * tile_n;
        const int dep0 = kt * tile_k;

        const int col_count = std::min(N - col0, tile_n);
        const int dep_count = std::min(K - dep0, tile_k);

        Mat BT_tile = BT.channel(nt).row_range(kt, 1);

        convolution_im2col_input_tile_bf16_fp16(bottom_blob, BT_tile, col0, col_count, dep0, dep_count, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h);
    }

    // stage 2: gemm; a per-thread scratch tile is only allocated when K spans
    // more than one tile
    const bool k_is_split = K > tile_k;

    Mat topT_tileX;
    if (k_is_split)
    {
        topT_tileX.create(tile_n * tile_m, 1, nT, 2u, opt.workspace_allocator);
        if (topT_tileX.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int mt = 0; mt < m_tiles; mt++)
    {
        const int row0 = mt * tile_m;
        const int row_count = std::min(M - row0, tile_m);

        Mat topT_tile;
        if (k_is_split)
            topT_tile = topT_tileX.channel(get_omp_thread_num());

        for (int nt = 0; nt < n_tiles; nt++)
        {
            const int col0 = nt * tile_n;
            const int col_count = std::min(N - col0, tile_n);

            for (int kt = 0; kt < k_tiles; kt++)
            {
                const int dep0 = kt * tile_k;
                const int dep_count = std::min(K - dep0, tile_k);

                const Mat AT_tile = AT.channel(mt).row_range(kt, 1);
                const Mat BT_tile = BT.channel(nt).row_range(kt, 1);

                // true on the last K tile of this (M tile, N tile) pair
                const bool k_end = dep0 + tile_k >= K;

                convolution_gemm_transB_packed_tile_fp16sa(AT_tile, BT_tile, bias, topT_tile, top_blob, row0, row_count, col0, col_count, dep0, dep_count, k_end, opt.use_a53_a55_optimized_kernel);
            }
        }
    }

    return 0;
}


================================================
FILE: src/layer/arm/convolution_im2col_gemm_int8.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Forward declarations of the i8mm variants of the int8 im2col-gemm entry
// points. They are visible only when runtime cpu dispatch and i8mm support are
// enabled on aarch64 and this translation unit is itself built without
// __ARM_FEATURE_MATMUL_INT8.
// NOTE(review): the definitions are presumably in a separately compiled
// i8mm source file - confirm against the build system.
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
void convolution_im2col_gemm_transform_kernel_int8_i8mm(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt);
int convolution_im2col_gemm_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt);
#endif

// Same idea for the dot-product (asimddp) variants: declared only when this
// translation unit is built with neither __ARM_FEATURE_DOTPROD nor
// __ARM_FEATURE_MATMUL_INT8.
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
void convolution_im2col_gemm_transform_kernel_int8_asimddp(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt);
int convolution_im2col_gemm_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt);
#endif

// Pack a max_ii x max_kk sub-block of the int8 matrix A into AT.
//
// The block starts at row i, column k of A; row r of A is addressed at byte
// offset r * A.w. Output is written sequentially from the start of AT.
//
// Rows are consumed in groups of 8, then 4 (both NEON only), then 2, then 1.
// Within a row group, bytes along k are interleaved across the rows in chunks
// whose width depends on the compiled feature level:
//   __ARM_FEATURE_MATMUL_INT8 : 8 consecutive k bytes per row
//   __ARM_FEATURE_DOTPROD     : 4 consecutive k bytes per row
//   plain NEON                : 2 consecutive k bytes per row
// Leftover k falls through to progressively narrower chunks (4, 2, 1) as
// enabled by the surrounding #if blocks. The single-row case is a plain copy.
static void convolution_im2col_pack_A_tile_int8(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
    // A = (pa, maxk, inch/pa), outch
    const int A_hstep = A.w;

    // running output cursor, shared by all row groups below
    signed char* pp = AT;

    int ii = 0;
#if __ARM_NEON
    // ---- 8 rows at a time ----
    for (; ii + 7 < max_ii; ii += 8)
    {
        const signed char* p0 = (const signed char*)A + (i + ii) * A_hstep + k;
        const signed char* p1 = (const signed char*)A + (i + ii + 1) * A_hstep + k;
        const signed char* p2 = (const signed char*)A + (i + ii + 2) * A_hstep + k;
        const signed char* p3 = (const signed char*)A + (i + ii + 3) * A_hstep + k;
        const signed char* p4 = (const signed char*)A + (i + ii + 4) * A_hstep + k;
        const signed char* p5 = (const signed char*)A + (i + ii + 5) * A_hstep + k;
        const signed char* p6 = (const signed char*)A + (i + ii + 6) * A_hstep + k;
        const signed char* p7 = (const signed char*)A + (i + ii + 7) * A_hstep + k;

        int kk = 0;
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
        // 16 k per row: emit the low 8 bytes of rows 0..7, then the high 8 bytes of rows 0..7
        for (; kk + 15 < max_kk; kk += 16)
        {
            int8x16_t _r0 = vld1q_s8(p0);
            int8x16_t _r1 = vld1q_s8(p1);
            int8x16_t _r2 = vld1q_s8(p2);
            int8x16_t _r3 = vld1q_s8(p3);
            int8x16_t _r4 = vld1q_s8(p4);
            int8x16_t _r5 = vld1q_s8(p5);
            int8x16_t _r6 = vld1q_s8(p6);
            int8x16_t _r7 = vld1q_s8(p7);
            int8x16_t _t0 = vcombine_s8(vget_low_s8(_r0), vget_low_s8(_r1));
            int8x16_t _t1 = vcombine_s8(vget_low_s8(_r2), vget_low_s8(_r3));
            int8x16_t _t2 = vcombine_s8(vget_low_s8(_r4), vget_low_s8(_r5));
            int8x16_t _t3 = vcombine_s8(vget_low_s8(_r6), vget_low_s8(_r7));
            int8x16_t _t4 = vcombine_s8(vget_high_s8(_r0), vget_high_s8(_r1));
            int8x16_t _t5 = vcombine_s8(vget_high_s8(_r2), vget_high_s8(_r3));
            int8x16_t _t6 = vcombine_s8(vget_high_s8(_r4), vget_high_s8(_r5));
            int8x16_t _t7 = vcombine_s8(vget_high_s8(_r6), vget_high_s8(_r7));
            vst1q_s8(pp, _t0);
            vst1q_s8(pp + 16, _t1);
            vst1q_s8(pp + 32, _t2);
            vst1q_s8(pp + 48, _t3);
            vst1q_s8(pp + 64, _t4);
            vst1q_s8(pp + 80, _t5);
            vst1q_s8(pp + 96, _t6);
            vst1q_s8(pp + 112, _t7);
            pp += 128;
            p0 += 16;
            p1 += 16;
            p2 += 16;
            p3 += 16;
            p4 += 16;
            p5 += 16;
            p6 += 16;
            p7 += 16;
        }
        // 8 k per row: rows are simply concatenated, 8 bytes each
        for (; kk + 7 < max_kk; kk += 8)
        {
            int8x8_t _r0 = vld1_s8(p0);
            int8x8_t _r1 = vld1_s8(p1);
            int8x8_t _r2 = vld1_s8(p2);
            int8x8_t _r3 = vld1_s8(p3);
            int8x8_t _r4 = vld1_s8(p4);
            int8x8_t _r5 = vld1_s8(p5);
            int8x8_t _r6 = vld1_s8(p6);
            int8x8_t _r7 = vld1_s8(p7);
            vst1_s8(pp, _r0);
            vst1_s8(pp + 8, _r1);
            vst1_s8(pp + 16, _r2);
            vst1_s8(pp + 24, _r3);
            vst1_s8(pp + 32, _r4);
            vst1_s8(pp + 40, _r5);
            vst1_s8(pp + 48, _r6);
            vst1_s8(pp + 56, _r7);
            pp += 64;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
            p4 += 8;
            p5 += 8;
            p6 += 8;
            p7 += 8;
        }
#else  // __ARM_FEATURE_MATMUL_INT8
        // 16 k per row, transposed at 32-bit granularity: each stored pair of
        // vectors holds one 4-byte k chunk from rows 0..3 and rows 4..7
        for (; kk + 15 < max_kk; kk += 16)
        {
            int8x16_t _r0 = vld1q_s8(p0);
            int8x16_t _r1 = vld1q_s8(p1);
            int8x16_t _r2 = vld1q_s8(p2);
            int8x16_t _r3 = vld1q_s8(p3);
            int8x16_t _r4 = vld1q_s8(p4);
            int8x16_t _r5 = vld1q_s8(p5);
            int8x16_t _r6 = vld1q_s8(p6);
            int8x16_t _r7 = vld1q_s8(p7);
            int32x4x2_t _r01 = vzipq_s32(vreinterpretq_s32_s8(_r0), vreinterpretq_s32_s8(_r1));
            int32x4x2_t _r23 = vzipq_s32(vreinterpretq_s32_s8(_r2), vreinterpretq_s32_s8(_r3));
            int32x4x2_t _r45 = vzipq_s32(vreinterpretq_s32_s8(_r4), vreinterpretq_s32_s8(_r5));
            int32x4x2_t _r67 = vzipq_s32(vreinterpretq_s32_s8(_r6), vreinterpretq_s32_s8(_r7));
            _r0 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_r01.val[0]), vget_low_s32(_r23.val[0])));
            _r1 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_r45.val[0]), vget_low_s32(_r67.val[0])));
            _r2 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_r01.val[0]), vget_high_s32(_r23.val[0])));
            _r3 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_r45.val[0]), vget_high_s32(_r67.val[0])));
            _r4 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_r01.val[1]), vget_low_s32(_r23.val[1])));
            _r5 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_r45.val[1]), vget_low_s32(_r67.val[1])));
            _r6 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_r01.val[1]), vget_high_s32(_r23.val[1])));
            _r7 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_r45.val[1]), vget_high_s32(_r67.val[1])));
            vst1q_s8(pp, _r0);
            vst1q_s8(pp + 16, _r1);
            vst1q_s8(pp + 32, _r2);
            vst1q_s8(pp + 48, _r3);
            vst1q_s8(pp + 64, _r4);
            vst1q_s8(pp + 80, _r5);
            vst1q_s8(pp + 96, _r6);
            vst1q_s8(pp + 112, _r7);
            pp += 128;
            p0 += 16;
            p1 += 16;
            p2 += 16;
            p3 += 16;
            p4 += 16;
            p5 += 16;
            p6 += 16;
            p7 += 16;
        }
        // 8 k per row, same 4-byte interleave: chunk 0 of rows 0..7, then chunk 1 of rows 0..7
        for (; kk + 7 < max_kk; kk += 8)
        {
            int8x8_t _r0 = vld1_s8(p0);
            int8x8_t _r1 = vld1_s8(p1);
            int8x8_t _r2 = vld1_s8(p2);
            int8x8_t _r3 = vld1_s8(p3);
            int8x8_t _r4 = vld1_s8(p4);
            int8x8_t _r5 = vld1_s8(p5);
            int8x8_t _r6 = vld1_s8(p6);
            int8x8_t _r7 = vld1_s8(p7);
            int32x2x2_t _r01 = vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r1));
            int32x2x2_t _r23 = vzip_s32(vreinterpret_s32_s8(_r2), vreinterpret_s32_s8(_r3));
            int32x2x2_t _r45 = vzip_s32(vreinterpret_s32_s8(_r4), vreinterpret_s32_s8(_r5));
            int32x2x2_t _r67 = vzip_s32(vreinterpret_s32_s8(_r6), vreinterpret_s32_s8(_r7));
            int8x16_t _t0 = vreinterpretq_s8_s32(vcombine_s32(_r01.val[0], _r23.val[0]));
            int8x16_t _t1 = vreinterpretq_s8_s32(vcombine_s32(_r45.val[0], _r67.val[0]));
            int8x16_t _t2 = vreinterpretq_s8_s32(vcombine_s32(_r01.val[1], _r23.val[1]));
            int8x16_t _t3 = vreinterpretq_s8_s32(vcombine_s32(_r45.val[1], _r67.val[1]));
            vst1q_s8(pp, _t0);
            vst1q_s8(pp + 16, _t1);
            vst1q_s8(pp + 32, _t2);
            vst1q_s8(pp + 48, _t3);
            pp += 64;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
            p4 += 8;
            p5 += 8;
            p6 += 8;
            p7 += 8;
        }
#endif // __ARM_FEATURE_MATMUL_INT8
        // scalar tail for the dotprod builds: 4 k per row, rows concatenated
        for (; kk + 3 < max_kk; kk += 4)
        {
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p0[2];
            pp[3] = p0[3];
            pp[4] = p1[0];
            pp[5] = p1[1];
            pp[6] = p1[2];
            pp[7] = p1[3];
            pp[8] = p2[0];
            pp[9] = p2[1];
            pp[10] = p2[2];
            pp[11] = p2[3];
            pp[12] = p3[0];
            pp[13] = p3[1];
            pp[14] = p3[2];
            pp[15] = p3[3];
            pp[16] = p4[0];
            pp[17] = p4[1];
            pp[18] = p4[2];
            pp[19] = p4[3];
            pp[20] = p5[0];
            pp[21] = p5[1];
            pp[22] = p5[2];
            pp[23] = p5[3];
            pp[24] = p6[0];
            pp[25] = p6[1];
            pp[26] = p6[2];
            pp[27] = p6[3];
            pp[28] = p7[0];
            pp[29] = p7[1];
            pp[30] = p7[2];
            pp[31] = p7[3];
            pp += 32;
            p0 += 4;
            p1 += 4;
            p2 += 4;
            p3 += 4;
            p4 += 4;
            p5 += 4;
            p6 += 4;
            p7 += 4;
        }
#else  // __ARM_FEATURE_DOTPROD
        // plain NEON, 16 k per row, transposed at 16-bit granularity: each
        // stored vector holds one 2-byte k pair from all 8 rows
        for (; kk + 15 < max_kk; kk += 16)
        {
            int8x16_t _r0 = vld1q_s8(p0);
            int8x16_t _r1 = vld1q_s8(p1);
            int8x16_t _r2 = vld1q_s8(p2);
            int8x16_t _r3 = vld1q_s8(p3);
            int8x16_t _r4 = vld1q_s8(p4);
            int8x16_t _r5 = vld1q_s8(p5);
            int8x16_t _r6 = vld1q_s8(p6);
            int8x16_t _r7 = vld1q_s8(p7);
            int16x8x2_t _r01 = vzipq_s16(vreinterpretq_s16_s8(_r0), vreinterpretq_s16_s8(_r1));
            int16x8x2_t _r23 = vzipq_s16(vreinterpretq_s16_s8(_r2), vreinterpretq_s16_s8(_r3));
            int16x8x2_t _r45 = vzipq_s16(vreinterpretq_s16_s8(_r4), vreinterpretq_s16_s8(_r5));
            int16x8x2_t _r67 = vzipq_s16(vreinterpretq_s16_s8(_r6), vreinterpretq_s16_s8(_r7));
            int32x4x2_t _t0 = vzipq_s32(vreinterpretq_s32_s16(_r01.val[0]), vreinterpretq_s32_s16(_r23.val[0]));
            int32x4x2_t _t1 = vzipq_s32(vreinterpretq_s32_s16(_r01.val[1]), vreinterpretq_s32_s16(_r23.val[1]));
            int32x4x2_t _t2 = vzipq_s32(vreinterpretq_s32_s16(_r45.val[0]), vreinterpretq_s32_s16(_r67.val[0]));
            int32x4x2_t _t3 = vzipq_s32(vreinterpretq_s32_s16(_r45.val[1]), vreinterpretq_s32_s16(_r67.val[1]));
            _r0 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t2.val[0])));
            _r1 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t2.val[0])));
            _r2 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t0.val[1]), vget_low_s32(_t2.val[1])));
            _r3 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t0.val[1]), vget_high_s32(_t2.val[1])));
            _r4 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t1.val[0]), vget_low_s32(_t3.val[0])));
            _r5 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t1.val[0]), vget_high_s32(_t3.val[0])));
            _r6 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t3.val[1])));
            _r7 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t3.val[1])));
            vst1q_s8(pp, _r0);
            vst1q_s8(pp + 16, _r1);
            vst1q_s8(pp + 32, _r2);
            vst1q_s8(pp + 48, _r3);
            vst1q_s8(pp + 64, _r4);
            vst1q_s8(pp + 80, _r5);
            vst1q_s8(pp + 96, _r6);
            vst1q_s8(pp + 112, _r7);
            pp += 128;
            p0 += 16;
            p1 += 16;
            p2 += 16;
            p3 += 16;
            p4 += 16;
            p5 += 16;
            p6 += 16;
            p7 += 16;
        }
        // plain NEON, 8 k per row: rows are paired as (0,4) (1,5) (2,6) (3,7)
        // before zipping so the result has the same 2-byte interleave as above
        for (; kk + 7 < max_kk; kk += 8)
        {
            int8x8_t _r0 = vld1_s8(p0);
            int8x8_t _r1 = vld1_s8(p1);
            int8x8_t _r2 = vld1_s8(p2);
            int8x8_t _r3 = vld1_s8(p3);
            int8x8_t _r4 = vld1_s8(p4);
            int8x8_t _r5 = vld1_s8(p5);
            int8x8_t _r6 = vld1_s8(p6);
            int8x8_t _r7 = vld1_s8(p7);
            int16x8_t _r04 = vreinterpretq_s16_s8(vcombine_s8(_r0, _r4));
            int16x8_t _r15 = vreinterpretq_s16_s8(vcombine_s8(_r1, _r5));
            int16x8_t _r26 = vreinterpretq_s16_s8(vcombine_s8(_r2, _r6));
            int16x8_t _r37 = vreinterpretq_s16_s8(vcombine_s8(_r3, _r7));
            int16x8x2_t _t0 = vzipq_s16(_r04, _r15);
            int16x8x2_t _t1 = vzipq_s16(_r26, _r37);
            int32x4x2_t _t2 = vzipq_s32(vreinterpretq_s32_s16(_t0.val[0]), vreinterpretq_s32_s16(_t1.val[0]));
            int32x4x2_t _t3 = vzipq_s32(vreinterpretq_s32_s16(_t0.val[1]), vreinterpretq_s32_s16(_t1.val[1]));
            int8x16_t _t4 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0])));
            int8x16_t _t5 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0])));
            int8x16_t _t6 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t2.val[1]), vget_low_s32(_t3.val[1])));
            int8x16_t _t7 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t2.val[1]), vget_high_s32(_t3.val[1])));
            vst1q_s8(pp, _t4);
            vst1q_s8(pp + 16, _t5);
            vst1q_s8(pp + 32, _t6);
            vst1q_s8(pp + 48, _t7);
            pp += 64;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
            p4 += 8;
            p5 += 8;
            p6 += 8;
            p7 += 8;
        }
#endif // __ARM_FEATURE_DOTPROD
        // scalar tail shared by all NEON builds: 2 k per row
        for (; kk + 1 < max_kk; kk += 2)
        {
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p1[0];
            pp[3] = p1[1];
            pp[4] = p2[0];
            pp[5] = p2[1];
            pp[6] = p3[0];
            pp[7] = p3[1];
            pp[8] = p4[0];
            pp[9] = p4[1];
            pp[10] = p5[0];
            pp[11] = p5[1];
            pp[12] = p6[0];
            pp[13] = p6[1];
            pp[14] = p7[0];
            pp[15] = p7[1];
            pp += 16;
            p0 += 2;
            p1 += 2;
            p2 += 2;
            p3 += 2;
            p4 += 2;
            p5 += 2;
            p6 += 2;
            p7 += 2;
        }
        // final odd k: one byte from each of the 8 rows
        for (; kk < max_kk; kk++)
        {
            pp[0] = p0[0];
            pp[1] = p1[0];
            pp[2] = p2[0];
            pp[3] = p3[0];
            pp[4] = p4[0];
            pp[5] = p5[0];
            pp[6] = p6[0];
            pp[7] = p7[0];
            pp += 8;
            p0++;
            p1++;
            p2++;
            p3++;
            p4++;
            p5++;
            p6++;
            p7++;
        }
    }
    // ---- 4 rows at a time ----
    for (; ii + 3 < max_ii; ii += 4)
    {
        const signed char* p0 = (const signed char*)A + (i + ii) * A_hstep + k;
        const signed char* p1 = (const signed char*)A + (i + ii + 1) * A_hstep + k;
        const signed char* p2 = (const signed char*)A + (i + ii + 2) * A_hstep + k;
        const signed char* p3 = (const signed char*)A + (i + ii + 3) * A_hstep + k;

        int kk = 0;
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
        // the 4-way interleaving store on 64-bit lanes gives 8-byte chunks per row
        for (; kk + 15 < max_kk; kk += 16)
        {
            int64x2x4_t _r0123;
            _r0123.val[0] = vreinterpretq_s64_s8(vld1q_s8(p0));
            _r0123.val[1] = vreinterpretq_s64_s8(vld1q_s8(p1));
            _r0123.val[2] = vreinterpretq_s64_s8(vld1q_s8(p2));
            _r0123.val[3] = vreinterpretq_s64_s8(vld1q_s8(p3));
            vst4q_s64((int64_t*)pp, _r0123);
            pp += 64;
            p0 += 16;
            p1 += 16;
            p2 += 16;
            p3 += 16;
        }
        for (; kk + 7 < max_kk; kk += 8)
        {
            int8x8_t _r0 = vld1_s8(p0);
            int8x8_t _r1 = vld1_s8(p1);
            int8x8_t _r2 = vld1_s8(p2);
            int8x8_t _r3 = vld1_s8(p3);
            vst1_s8(pp, _r0);
            vst1_s8(pp + 8, _r1);
            vst1_s8(pp + 16, _r2);
            vst1_s8(pp + 24, _r3);
            pp += 32;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
        }
#else  // __ARM_FEATURE_MATMUL_INT8
        // the 4-way interleaving store on 32-bit lanes gives 4-byte chunks per row
        for (; kk + 15 < max_kk; kk += 16)
        {
            int32x4x4_t _r0123;
            _r0123.val[0] = vreinterpretq_s32_s8(vld1q_s8(p0));
            _r0123.val[1] = vreinterpretq_s32_s8(vld1q_s8(p1));
            _r0123.val[2] = vreinterpretq_s32_s8(vld1q_s8(p2));
            _r0123.val[3] = vreinterpretq_s32_s8(vld1q_s8(p3));
            vst4q_s32((int*)pp, _r0123);
            pp += 64;
            p0 += 16;
            p1 += 16;
            p2 += 16;
            p3 += 16;
        }
        for (; kk + 7 < max_kk; kk += 8)
        {
            int32x2x4_t _r0123;
            _r0123.val[0] = vreinterpret_s32_s8(vld1_s8(p0));
            _r0123.val[1] = vreinterpret_s32_s8(vld1_s8(p1));
            _r0123.val[2] = vreinterpret_s32_s8(vld1_s8(p2));
            _r0123.val[3] = vreinterpret_s32_s8(vld1_s8(p3));
            vst4_s32((int*)pp, _r0123);
            pp += 32;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
        }
#endif // __ARM_FEATURE_MATMUL_INT8
        // scalar tail for the dotprod builds: 4 k per row
        for (; kk + 3 < max_kk; kk += 4)
        {
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p0[2];
            pp[3] = p0[3];
            pp[4] = p1[0];
            pp[5] = p1[1];
            pp[6] = p1[2];
            pp[7] = p1[3];
            pp[8] = p2[0];
            pp[9] = p2[1];
            pp[10] = p2[2];
            pp[11] = p2[3];
            pp[12] = p3[0];
            pp[13] = p3[1];
            pp[14] = p3[2];
            pp[15] = p3[3];
            pp += 16;
            p0 += 4;
            p1 += 4;
            p2 += 4;
            p3 += 4;
        }
#else  // __ARM_FEATURE_DOTPROD
        // the 4-way interleaving store on 16-bit lanes gives 2-byte chunks per row
        for (; kk + 15 < max_kk; kk += 16)
        {
            int16x8x4_t _r0123;
            _r0123.val[0] = vreinterpretq_s16_s8(vld1q_s8(p0));
            _r0123.val[1] = vreinterpretq_s16_s8(vld1q_s8(p1));
            _r0123.val[2] = vreinterpretq_s16_s8(vld1q_s8(p2));
            _r0123.val[3] = vreinterpretq_s16_s8(vld1q_s8(p3));
            vst4q_s16((short*)pp, _r0123);
            pp += 64;
            p0 += 16;
            p1 += 16;
            p2 += 16;
            p3 += 16;
        }
        for (; kk + 7 < max_kk; kk += 8)
        {
            int16x4x4_t _r0123;
            _r0123.val[0] = vreinterpret_s16_s8(vld1_s8(p0));
            _r0123.val[1] = vreinterpret_s16_s8(vld1_s8(p1));
            _r0123.val[2] = vreinterpret_s16_s8(vld1_s8(p2));
            _r0123.val[3] = vreinterpret_s16_s8(vld1_s8(p3));
            vst4_s16((short*)pp, _r0123);
            pp += 32;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
        }
#endif // __ARM_FEATURE_DOTPROD
        // scalar tail shared by all NEON builds: 2 k per row
        for (; kk + 1 < max_kk; kk += 2)
        {
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p1[0];
            pp[3] = p1[1];
            pp[4] = p2[0];
            pp[5] = p2[1];
            pp[6] = p3[0];
            pp[7] = p3[1];
            pp += 8;
            p0 += 2;
            p1 += 2;
            p2 += 2;
            p3 += 2;
        }
        // final odd k: one byte from each of the 4 rows
        for (; kk < max_kk; kk++)
        {
            pp[0] = p0[0];
            pp[1] = p1[0];
            pp[2] = p2[0];
            pp[3] = p3[0];
            pp += 4;
            p0++;
            p1++;
            p2++;
            p3++;
        }
    }
#endif // __ARM_NEON
    // ---- 2 rows at a time (also compiled without NEON; only the last
    //      single-k loop is active in that case) ----
    for (; ii + 1 < max_ii; ii += 2)
    {
        const signed char* p0 = (const signed char*)A + (i + ii) * A_hstep + k;
        const signed char* p1 = (const signed char*)A + (i + ii + 1) * A_hstep + k;

        int kk = 0;
#if __ARM_NEON
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
        // 2-way interleaving store on 64-bit lanes: 8-byte chunks per row
        for (; kk + 15 < max_kk; kk += 16)
        {
            int64x2x2_t _r01;
            _r01.val[0] = vreinterpretq_s64_s8(vld1q_s8(p0));
            _r01.val[1] = vreinterpretq_s64_s8(vld1q_s8(p1));
            vst2q_s64((int64_t*)pp, _r01);
            pp += 32;
            p0 += 16;
            p1 += 16;
        }
        for (; kk + 7 < max_kk; kk += 8)
        {
            int8x8_t _r0 = vld1_s8(p0);
            int8x8_t _r1 = vld1_s8(p1);
            vst1_s8(pp, _r0);
            vst1_s8(pp + 8, _r1);
            pp += 16;
            p0 += 8;
            p1 += 8;
        }
#else  // __ARM_FEATURE_MATMUL_INT8
        // 2-way interleaving store on 32-bit lanes: 4-byte chunks per row
        for (; kk + 15 < max_kk; kk += 16)
        {
            int32x4x2_t _r01;
            _r01.val[0] = vreinterpretq_s32_s8(vld1q_s8(p0));
            _r01.val[1] = vreinterpretq_s32_s8(vld1q_s8(p1));
            vst2q_s32((int*)pp, _r01);
            pp += 32;
            p0 += 16;
            p1 += 16;
        }
        for (; kk + 7 < max_kk; kk += 8)
        {
            int32x2x2_t _r01;
            _r01.val[0] = vreinterpret_s32_s8(vld1_s8(p0));
            _r01.val[1] = vreinterpret_s32_s8(vld1_s8(p1));
            vst2_s32((int*)pp, _r01);
            pp += 16;
            p0 += 8;
            p1 += 8;
        }
#endif // __ARM_FEATURE_MATMUL_INT8
        // scalar tail for the dotprod builds: 4 k per row
        for (; kk + 3 < max_kk; kk += 4)
        {
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p0[2];
            pp[3] = p0[3];
            pp[4] = p1[0];
            pp[5] = p1[1];
            pp[6] = p1[2];
            pp[7] = p1[3];
            pp += 8;
            p0 += 4;
            p1 += 4;
        }
#else  // __ARM_FEATURE_DOTPROD
        // 2-way interleaving store on 16-bit lanes: 2-byte chunks per row
        for (; kk + 15 < max_kk; kk += 16)
        {
            int16x8x2_t _r01;
            _r01.val[0] = vreinterpretq_s16_s8(vld1q_s8(p0));
            _r01.val[1] = vreinterpretq_s16_s8(vld1q_s8(p1));
            vst2q_s16((short*)pp, _r01);
            pp += 32;
            p0 += 16;
            p1 += 16;
        }
        for (; kk + 7 < max_kk; kk += 8)
        {
            int16x4x2_t _r01;
            _r01.val[0] = vreinterpret_s16_s8(vld1_s8(p0));
            _r01.val[1] = vreinterpret_s16_s8(vld1_s8(p1));
            vst2_s16((short*)pp, _r01);
            pp += 16;
            p0 += 8;
            p1 += 8;
        }
#endif // __ARM_FEATURE_DOTPROD
        // scalar tail shared by all NEON builds: 2 k per row
        for (; kk + 1 < max_kk; kk += 2)
        {
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p1[0];
            pp[3] = p1[1];
            pp += 4;
            p0 += 2;
            p1 += 2;
        }
#endif // __ARM_NEON
        // one k at a time: a byte from row 0, then a byte from row 1
        for (; kk < max_kk; kk++)
        {
            pp[0] = p0[0];
            pp[1] = p1[0];
            pp += 2;
            p0++;
            p1++;
        }
    }
    // ---- remaining single row: straight copy of max_kk bytes ----
    for (; ii < max_ii; ii += 1)
    {
        const signed char* p0 = (const signed char*)A + (i + ii) * A_hstep + k;

        int kk = 0;
#if __ARM_NEON
        for (; kk + 15 < max_kk; kk += 16)
        {
            vst1q_s8(pp, vld1q_s8(p0));
            pp += 16;
            p0 += 16;
        }
        for (; kk + 7 < max_kk; kk += 8)
        {
            vst1_s8(pp, vld1_s8(p0));
            pp += 8;
            p0 += 8;
        }
#endif // __ARM_NEON
        for (; kk < max_kk; kk++)
        {
            pp[0] = p0[0];
            pp += 1;
            p0++;
        }
    }
}

static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, Mat& topT_tile, Mat& top_blob, int i, int max_ii, int j, int max_jj, int k, int max_kk, bool k_end)
{
    // NCNN_LOGE("convolution_gemm_transB_packed_tile_int8 %d %d %d %d %d %d", i, max_ii, j, max_jj, k, max_kk);

    const int out_elempack = top_blob.elempack;
    const size_t out_hstep = top_blob.cstep;

    const signed char* pAT = AT_tile;
    const signed char* pBT = BT_tile;

    int* outptr = topT_tile;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        int* outptr0 = (int*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        const signed char* pB = pBT;

        int jj = 0;
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            const signed char* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            asm volatile(
#if !__ARM_FEATURE_MATMUL_INT8
                "cmp    %w9, #0                     \n"
                "beq    0f                          \n"

                "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                "sub    %0, %0, #192                \n"
                "b      1f                          \n"

                "0:                                 \n"
                "eor    v16.16b, v16.16b, v16.16b   \n"
                "eor    v17.16b, v17.16b, v17.16b   \n"
                "eor    v18.16b, v18.16b, v18.16b   \n"
                "eor    v19.16b, v19.16b, v19.16b   \n"
                "eor    v20.16b, v20.16b, v20.16b   \n"
                "eor    v21.16b, v21.16b, v21.16b   \n"
                "eor    v22.16b, v22.16b, v22.16b   \n"
                "eor    v23.16b, v23.16b, v23.16b   \n"
                "eor    v24.16b, v24.16b, v24.16b   \n"
                "eor    v25.16b, v25.16b, v25.16b   \n"
                "eor    v26.16b, v26.16b, v26.16b   \n"
                "eor    v27.16b, v27.16b, v27.16b   \n"
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v29.16b, v29.16b, v29.16b   \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"
                "eor    v31.16b, v31.16b, v31.16b   \n"

                "1:                                 \n"
#endif // !__ARM_FEATURE_MATMUL_INT8

#if __ARM_FEATURE_DOTPROD
                "lsr    w4, %w8, #3                 \n" // w4 = max_kk >> 3
                "cmp    w4, #0                      \n"
                "beq    101f                        \n"

#if __ARM_FEATURE_MATMUL_INT8
                "eor    v0.16b, v0.16b, v0.16b      \n"
                "eor    v1.16b, v1.16b, v1.16b      \n"
                "eor    v2.16b, v2.16b, v2.16b      \n"
                "eor    v3.16b, v3.16b, v3.16b      \n"
                "eor    v4.16b, v4.16b, v4.16b      \n"
                "eor    v5.16b, v5.16b, v5.16b      \n"
                "eor    v6.16b, v6.16b, v6.16b      \n"
                "eor    v7.16b, v7.16b, v7.16b      \n"
                "eor    v8.16b, v8.16b, v8.16b      \n"
                "eor    v9.16b, v9.16b, v9.16b      \n"
                "eor    v10.16b, v10.16b, v10.16b   \n"
                "eor    v11.16b, v11.16b, v11.16b   \n"
                "eor    v12.16b, v12.16b, v12.16b   \n"
                "eor    v13.16b, v13.16b, v13.16b   \n"
                "eor    v14.16b, v14.16b, v14.16b   \n"
                "eor    v15.16b, v15.16b, v15.16b   \n"

                "2:                                 \n"
                "ld1    {v16.16b, v17.16b, v18.16b, v19.16b}, [%1], #64 \n"
                "ld1    {v20.16b, v21.16b, v22.16b, v23.16b}, [%2], #64 \n"
                "smmla  v0.4s, v16.16b, v20.16b     \n"
                "smmla  v1.4s, v17.16b, v20.16b     \n"
                "smmla  v2.4s, v16.16b, v21.16b     \n"
                "smmla  v3.4s, v17.16b, v21.16b     \n"
                "smmla  v4.4s, v18.16b, v20.16b     \n"
                "smmla  v5.4s, v19.16b, v20.16b     \n"
                "smmla  v6.4s, v18.16b, v21.16b     \n"
                "smmla  v7.4s, v19.16b, v21.16b     \n"
                "subs   w4, w4, #1                  \n"
                "smmla  v8.4s, v16.16b, v22.16b     \n"
                "smmla  v9.4s, v17.16b, v22.16b     \n"
                "smmla  v10.4s, v16.16b, v23.16b    \n"
                "smmla  v11.4s, v17.16b, v23.16b    \n"
                "smmla  v12.4s, v18.16b, v22.16b    \n"
                "smmla  v13.4s, v19.16b, v22.16b    \n"
                "smmla  v14.4s, v18.16b, v23.16b    \n"
                "smmla  v15.4s, v19.16b, v23.16b    \n"
                "bne    2b                          \n"

                "uzp1   v16.4s, v0.4s, v1.4s        \n"
                "uzp2   v17.4s, v0.4s, v1.4s        \n"
                "uzp1   v18.4s, v2.4s, v3.4s        \n"
                "uzp2   v19.4s, v2.4s, v3.4s        \n"
                "uzp1   v20.4s, v4.4s, v5.4s        \n"
                "uzp2   v21.4s, v4.4s, v5.4s        \n"
                "uzp1   v22.4s, v6.4s, v7.4s        \n"
                "uzp2   v23.4s, v6.4s, v7.4s        \n"
                "uzp1   v24.4s, v8.4s, v9.4s        \n"
                "uzp2   v25.4s, v8.4s, v9.4s        \n"
                "uzp1   v26.4s, v10.4s, v11.4s      \n"
                "uzp2   v27.4s, v10.4s, v11.4s      \n"
                "uzp1   v28.4s, v12.4s, v13.4s      \n"
                "uzp2   v29.4s, v12.4s, v13.4s      \n"
                "uzp1   v30.4s, v14.4s, v15.4s      \n"
                "uzp2   v31.4s, v14.4s, v15.4s      \n"

                "cmp    %w9, #0                     \n"
                "beq    1f                          \n"

                "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64   \n"
                "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64   \n"
                "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64 \n"
                "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0]    \n"
                "sub    %0, %0, #192                \n"
                "add    v16.4s, v16.4s, v0.4s       \n"
                "add    v17.4s, v17.4s, v1.4s       \n"
                "add    v18.4s, v18.4s, v2.4s       \n"
                "add    v19.4s, v19.4s, v3.4s       \n"
                "add    v20.4s, v20.4s, v4.4s       \n"
                "add    v21.4s, v21.4s, v5.4s       \n"
                "add    v22.4s, v22.4s, v6.4s       \n"
                "add    v23.4s, v23.4s, v7.4s       \n"
                "add    v24.4s, v24.4s, v8.4s       \n"
                "add    v25.4s, v25.4s, v9.4s       \n"
                "add    v26.4s, v26.4s, v10.4s      \n"
                "add    v27.4s, v27.4s, v11.4s      \n"
                "add    v28.4s, v28.4s, v12.4s      \n"
                "add    v29.4s, v29.4s, v13.4s      \n"
                "add    v30.4s, v30.4s, v14.4s      \n"
                "add    v31.4s, v31.4s, v15.4s      \n"
                "b      1f                          \n"
#else  // __ARM_FEATURE_MATMUL_INT8
                "2:                                 \n"
                "ld1    {v0.16b, v1.16b, v2.16b, v3.16b}, [%1], #64 \n"
                "ld1    {v4.16b, v5.16b, v6.16b, v7.16b}, [%2], #64 \n"
                "sdot   v16.4s, v0.16b, v4.4b[0]    \n"
                "sdot   v17.4s, v0.16b, v4.4b[1]    \n"
                "sdot   v18.4s, v0.16b, v4.4b[2]    \n"
                "sdot   v19.4s, v0.16b, v4.4b[3]    \n"
                "sdot   v20.4s, v1.16b, v4.4b[0]    \n"
                "sdot   v21.4s, v1.16b, v4.4b[1]    \n"
                "sdot   v22.4s, v1.16b, v4.4b[2]    \n"
                "sdot   v23.4s, v1.16b, v4.4b[3]    \n"
                "sdot   v24.4s, v0.16b, v5.4b[0]    \n"
                "sdot   v25.4s, v0.16b, v5.4b[1]    \n"
                "sdot   v26.4s, v0.16b, v5.4b[2]    \n"
                "sdot   v27.4s, v0.16b, v5.4b[3]    \n"
                "sdot   v28.4s, v1.16b, v5.4b[0]    \n"
                "sdot   v29.4s, v1.16b, v5.4b[1]    \n"
                "sdot   v30.4s, v1.16b, v5.4b[2]    \n"
                "sdot   v31.4s, v1.16b, v5.4b[3]    \n"
                "subs   w4, w4, #1                  \n"
                "sdot   v16.4s, v2.16b, v6.4b[0]    \n"
                "sdot   v17.4s, v2.16b, v6.4b[1]    \n"
                "sdot   v18.4s, v2.16b, v6.4b[2]    \n"
                "sdot   v19.4s, v2.16b, v6.4b[3]    \n"
                "sdot   v20.4s, v3.16b, v6.4b[0]    \n"
                "sdot   v21.4s, v3.16b, v6.4b[1]    \n"
                "sdot   v22.4s, v3.16b, v6.4b[2]    \n"
                "sdot   v23.4s, v3.16b, v6.4b[3]    \n"
                "sdot   v24.4s, v2.16b, v7.4b[0]    \n"
                "sdot   v25.4s, v2.16b, v7.4b[1]    \n"
                "sdot   v26.4s, v2.16b, v7.4b[2]    \n"
                "sdot   v27.4s, v2.16b, v7.4b[3]    \n"
                "sdot   v28.4s, v3.16b, v7.4b[0]    \n"
                "sdot   v29.4s, v3.16b, v7.4b[1]    \n"
                "sdot   v30.4s, v3.16b, v7.4b[2]    \n"
                "sdot   v31.4s, v3.16b, v7.4b[3]    \n"
                "bne    2b                          \n"
#endif // __ARM_FEATURE_MATMUL_INT8

                "101:                               \n"
#if __ARM_FEATURE_MATMUL_INT8
                "cmp    %w9, #0                     \n"
                "beq    0f                          \n"

                "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                "sub    %0, %0, #192                \n"
                "b      1f                          \n"

                "0:                                 \n"
                "eor    v16.16b, v16.16b, v16.16b   \n"
                "eor    v17.16b, v17.16b, v17.16b   \n"
                "eor    v18.16b, v18.16b, v18.16b   \n"
                "eor    v19.16b, v19.16b, v19.16b   \n"
                "eor    v20.16b, v20.16b, v20.16b   \n"
                "eor    v21.16b, v21.16b, v21.16b   \n"
                "eor    v22.16b, v22.16b, v22.16b   \n"
                "eor    v23.16b, v23.16b, v23.16b   \n"
                "eor    v24.16b, v24.16b, v24.16b   \n"
                "eor    v25.16b, v25.16b, v25.16b   \n"
                "eor    v26.16b, v26.16b, v26.16b   \n"
                "eor    v27.16b, v27.16b, v27.16b   \n"
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v29.16b, v29.16b, v29.16b   \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"
                "eor    v31.16b, v31.16b, v31.16b   \n"
                "1:                                 \n"
#endif // __ARM_FEATURE_MATMUL_INT8

                "and    w4, %w8, #4                 \n" // w4 = remain = max_kk & 4
                "cmp    w4, #0                      \n"
                "beq    3f                          \n"

                // kk += 4 part
                "ld1    {v0.16b, v1.16b}, [%1], #32 \n"
                "ld1    {v2.16b, v3.16b}, [%2], #32 \n"
                "sdot   v16.4s, v0.16b, v2.4b[0]    \n"
                "sdot   v17.4s, v0.16b, v2.4b[1]    \n"
                "sdot   v18.4s, v0.16b, v2.4b[2]    \n"
                "sdot   v19.4s, v0.16b, v2.4b[3]    \n"
                "sdot   v20.4s, v1.16b, v2.4b[0]    \n"
                "sdot   v21.4s, v1.16b, v2.4b[1]    \n"
                "sdot   v22.4s, v1.16b, v2.4b[2]    \n"
                "sdot   v23.4s, v1.16b, v2.4b[3]    \n"
                "sdot   v24.4s, v0.16b, v3.4b[0]    \n"
                "sdot   v25.4s, v0.16b, v3.4b[1]    \n"
                "sdot   v26.4s, v0.16b, v3.4b[2]    \n"
                "sdot   v27.4s, v0.16b, v3.4b[3]    \n"
                "sdot   v28.4s, v1.16b, v3.4b[0]    \n"
                "sdot   v29.4s, v1.16b, v3.4b[1]    \n"
                "sdot   v30.4s, v1.16b, v3.4b[2]    \n"
                "sdot   v31.4s, v1.16b, v3.4b[3]    \n"
#else  // __ARM_FEATURE_DOTPROD
                "lsr    w4, %w8, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    3f                          \n"

                "2:                                 \n"
                "ld1    {v0.16b, v1.16b}, [%1], #32 \n"
                "ld1    {v4.16b, v5.16b}, [%2], #32 \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull2 v9.8h, v0.16b, v4.16b       \n"
                "rev64  v2.4s, v0.4s                \n"
                "smull  v10.8h, v2.8b, v4.8b        \n"
                "smull2 v11.8h, v2.16b, v4.16b      \n"
                "rev64  v6.8h, v4.8h                \n"
                "smull  v12.8h, v0.8b, v6.8b        \n"
                "smull2 v13.8h, v0.16b, v6.16b      \n"
                "rev64  v3.4s, v1.4s                \n"
                "smull  v14.8h, v2.8b, v6.8b        \n"
                "smull2 v15.8h, v2.16b, v6.16b      \n"
                "rev64  v7.8h, v5.8h                \n"
                "smlal  v8.8h, v1.8b, v5.8b         \n"
                "smlal2 v9.8h, v1.16b, v5.16b       \n"
                "smlal  v10.8h, v3.8b, v5.8b        \n"
                "smlal2 v11.8h, v3.16b, v5.16b      \n"
                "smlal  v12.8h, v1.8b, v7.8b        \n"
                "smlal2 v13.8h, v1.16b, v7.16b      \n"
                "smlal  v14.8h, v3.8b, v7.8b        \n"
                "smlal2 v15.8h, v3.16b, v7.16b      \n"
                "ext    v0.16b, v0.16b, v0.16b, #8  \n"
                "ext    v2.16b, v2.16b, v2.16b, #8  \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v20.4s, v10.8h              \n"
                "sadalp v21.4s, v11.8h              \n"
                "ext    v1.16b, v1.16b, v1.16b, #8  \n"
                "ext    v3.16b, v3.16b, v3.16b, #8  \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull2 v9.8h, v0.16b, v4.16b       \n"
                "smull  v10.8h, v2.8b, v4.8b        \n"
                "smull2 v11.8h, v2.16b, v4.16b      \n"
                "sadalp v24.4s, v12.8h              \n"
                "sadalp v25.4s, v13.8h              \n"
                "sadalp v28.4s, v14.8h              \n"
                "sadalp v29.4s, v15.8h              \n"
                "smull  v12.8h, v0.8b, v6.8b        \n"
                "smull2 v13.8h, v0.16b, v6.16b      \n"
                "smull  v14.8h, v2.8b, v6.8b        \n"
                "smull2 v15.8h, v2.16b, v6.16b      \n"
                "smlal  v8.8h, v1.8b, v5.8b         \n"
                "smlal2 v9.8h, v1.16b, v5.16b       \n"
                "smlal  v10.8h, v3.8b, v5.8b        \n"
                "smlal2 v11.8h, v3.16b, v5.16b      \n"
                "smlal  v12.8h, v1.8b, v7.8b        \n"
                "smlal2 v13.8h, v1.16b, v7.16b      \n"
                "smlal  v14.8h, v3.8b, v7.8b        \n"
                "smlal2 v15.8h, v3.16b, v7.16b      \n"
                "subs   w4, w4, #1                  \n"
                "sadalp v18.4s, v8.8h               \n"
                "sadalp v19.4s, v9.8h               \n"
                "sadalp v22.4s, v10.8h              \n"
                "sadalp v23.4s, v11.8h              \n"
                "sadalp v26.4s, v12.8h              \n"
                "sadalp v27.4s, v13.8h              \n"
                "sadalp v30.4s, v14.8h              \n"
                "sadalp v31.4s, v15.8h              \n"
                "bne    2b                          \n"
#endif // __ARM_FEATURE_DOTPROD

                "3:                                 \n"
                "and    w4, %w8, #2                 \n" // w4 = remain = max_kk & 2
                "cmp    w4, #0                      \n"
                "beq    4f                          \n"

                // kk += 2 part
#if __ARM_FEATURE_DOTPROD
                "ld1    {v0.16b}, [%1], #16         \n"
                "ld1    {v1.16b}, [%2], #16         \n"
                "dup    v4.8h, v1.h[0]              \n"
                "dup    v5.8h, v1.h[1]              \n"
                "dup    v6.8h, v1.h[2]              \n"
                "dup    v7.8h, v1.h[3]              \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull  v9.8h, v0.8b, v5.8b         \n"
                "smull  v10.8h, v0.8b, v6.8b        \n"
                "smull  v11.8h, v0.8b, v7.8b        \n"
                "smull2 v12.8h, v0.16b, v4.16b      \n"
                "smull2 v13.8h, v0.16b, v5.16b      \n"
                "smull2 v14.8h, v0.16b, v6.16b      \n"
                "smull2 v15.8h, v0.16b, v7.16b      \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
                "sadalp v20.4s, v12.8h              \n"
                "sadalp v21.4s, v13.8h              \n"
                "sadalp v22.4s, v14.8h              \n"
                "sadalp v23.4s, v15.8h              \n"
                "dup    v4.8h, v1.h[4]              \n"
                "dup    v5.8h, v1.h[5]              \n"
                "dup    v6.8h, v1.h[6]              \n"
                "dup    v7.8h, v1.h[7]              \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull  v9.8h, v0.8b, v5.8b         \n"
                "smull  v10.8h, v0.8b, v6.8b        \n"
                "smull  v11.8h, v0.8b, v7.8b        \n"
                "smull2 v12.8h, v0.16b, v4.16b      \n"
                "smull2 v13.8h, v0.16b, v5.16b      \n"
                "smull2 v14.8h, v0.16b, v6.16b      \n"
                "smull2 v15.8h, v0.16b, v7.16b      \n"
                "sadalp v24.4s, v8.8h               \n"
                "sadalp v25.4s, v9.8h               \n"
                "sadalp v26.4s, v10.8h              \n"
                "sadalp v27.4s, v11.8h              \n"
                "sadalp v28.4s, v12.8h              \n"
                "sadalp v29.4s, v13.8h              \n"
                "sadalp v30.4s, v14.8h              \n"
                "sadalp v31.4s, v15.8h              \n"
#else  // __ARM_FEATURE_DOTPROD
                "ld1    {v0.16b}, [%1], #16         \n"
                "ld1    {v2.16b}, [%2], #16         \n"
                "rev64  v1.4s, v0.4s                \n"
                "rev64  v3.8h, v2.8h                \n"
                "smull  v8.8h, v0.8b, v2.8b         \n"
                "smull2 v9.8h, v0.16b, v2.16b       \n"
                "smull  v10.8h, v1.8b, v2.8b        \n"
                "smull2 v11.8h, v1.16b, v2.16b      \n"
                "smull  v12.8h, v0.8b, v3.8b        \n"
                "smull2 v13.8h, v0.16b, v3.16b      \n"
                "smull  v14.8h, v1.8b, v3.8b        \n"
                "smull2 v15.8h, v1.16b, v3.16b      \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v20.4s, v10.8h              \n"
                "sadalp v21.4s, v11.8h              \n"
                "sadalp v24.4s, v12.8h              \n"
                "sadalp v25.4s, v13.8h              \n"
                "sadalp v28.4s, v14.8h              \n"
                "sadalp v29.4s, v15.8h              \n"
                "ext    v0.16b, v0.16b, v0.16b, #8  \n"
                "ext    v1.16b, v1.16b, v1.16b, #8  \n"
                "smull  v8.8h, v0.8b, v2.8b         \n"
                "smull2 v9.8h, v0.16b, v2.16b       \n"
                "smull  v10.8h, v1.8b, v2.8b        \n"
                "smull2 v11.8h, v1.16b, v2.16b      \n"
                "smull  v12.8h, v0.8b, v3.8b        \n"
                "smull2 v13.8h, v0.16b, v3.16b      \n"
                "smull  v14.8h, v1.8b, v3.8b        \n"
                "smull2 v15.8h, v1.16b, v3.16b      \n"
                "sadalp v18.4s, v8.8h               \n"
                "sadalp v19.4s, v9.8h               \n"
                "sadalp v22.4s, v10.8h              \n"
                "sadalp v23.4s, v11.8h              \n"
                "sadalp v26.4s, v12.8h              \n"
                "sadalp v27.4s, v13.8h              \n"
                "sadalp v30.4s, v14.8h              \n"
                "sadalp v31.4s, v15.8h              \n"
#endif // __ARM_FEATURE_DOTPROD

                "4:                                 \n"
                "and    w4, %w8, #1                 \n" // w4 = remain = max_kk & 1
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // kk += 1 part
#if __ARM_FEATURE_DOTPROD
                "ld1    {v0.8b}, [%1], #8           \n"
                "ld1    {v1.8b}, [%2], #8           \n"
                "dup    v8.8b, v1.b[0]              \n"
                "dup    v9.8b, v1.b[1]              \n"
                "dup    v10.8b, v1.b[2]             \n"
                "dup    v11.8b, v1.b[3]             \n"
                "dup    v12.8b, v1.b[4]             \n"
                "dup    v13.8b, v1.b[5]             \n"
                "dup    v14.8b, v1.b[6]             \n"
                "dup    v15.8b, v1.b[7]             \n"
                "smull  v8.8h, v0.8b, v8.8b         \n"
                "smull  v9.8h, v0.8b, v9.8b         \n"
                "smull  v10.8h, v0.8b, v10.8b       \n"
                "smull  v11.8h, v0.8b, v11.8b       \n"
                "smull  v12.8h, v0.8b, v12.8b       \n"
                "smull  v13.8h, v0.8b, v13.8b       \n"
                "smull  v14.8h, v0.8b, v14.8b       \n"
                "smull  v15.8h, v0.8b, v15.8b       \n"
                "saddw  v16.4s, v16.4s, v8.4h       \n"
                "saddw  v17.4s, v17.4s, v9.4h       \n"
                "saddw  v18.4s, v18.4s, v10.4h      \n"
                "saddw  v19.4s, v19.4s, v11.4h      \n"
                "saddw2 v20.4s, v20.4s, v8.8h       \n"
                "saddw2 v21.4s, v21.4s, v9.8h       \n"
                "saddw2 v22.4s, v22.4s, v10.8h      \n"
                "saddw2 v23.4s, v23.4s, v11.8h      \n"
                "saddw  v24.4s, v24.4s, v12.4h      \n"
                "saddw  v25.4s, v25.4s, v13.4h      \n"
                "saddw  v26.4s, v26.4s, v14.4h      \n"
                "saddw  v27.4s, v27.4s, v15.4h      \n"
                "saddw2 v28.4s, v28.4s, v12.8h      \n"
                "saddw2 v29.4s, v29.4s, v13.8h      \n"
                "saddw2 v30.4s, v30.4s, v14.8h      \n"
                "saddw2 v31.4s, v31.4s, v15.8h      \n"
#else  // __ARM_FEATURE_DOTPROD
                "ld1    {v0.8b}, [%1], #8           \n"
                "ld1    {v4.8b}, [%2], #8           \n"
                "ext    v1.8b, v0.8b, v0.8b, #4     \n"
                "rev32  v2.4h, v0.4h                \n"
                "rev64  v3.4h, v0.4h                \n"
                "rev32  v5.8b, v4.8b                \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull  v9.8h, v1.8b, v4.8b         \n"
                "smull  v10.8h, v2.8b, v4.8b        \n"
                "smull  v11.8h, v3.8b, v4.8b        \n"
                "smull  v12.8h, v0.8b, v5.8b        \n"
                "smull  v13.8h, v1.8b, v5.8b        \n"
                "smull  v14.8h, v2.8b, v5.8b        \n"
                "smull  v15.8h, v3.8b, v5.8b        \n"
                "saddw  v16.4s, v16.4s, v8.4h       \n"
                "saddw2 v17.4s, v17.4s, v8.8h       \n"
                "saddw  v18.4s, v18.4s, v9.4h       \n"
                "saddw2 v19.4s, v19.4s, v9.8h       \n"
                "saddw  v20.4s, v20.4s, v10.4h      \n"
                "saddw2 v21.4s, v21.4s, v10.8h      \n"
                "saddw  v22.4s, v22.4s, v11.4h      \n"
                "saddw2 v23.4s, v23.4s, v11.8h      \n"
                "saddw  v24.4s, v24.4s, v12.4h      \n"
                "saddw2 v25.4s, v25.4s, v12.8h      \n"
                "saddw  v26.4s, v26.4s, v13.4h      \n"
                "saddw2 v27.4s, v27.4s, v13.8h      \n"
                "saddw  v28.4s, v28.4s, v14.4h      \n"
                "saddw2 v29.4s, v29.4s, v14.8h      \n"
                "saddw  v30.4s, v30.4s, v15.4h      \n"
                "saddw2 v31.4s, v31.4s, v15.8h      \n"
#endif // __ARM_FEATURE_DOTPROD

                "5:                                 \n"
                "cmp    %w10, #0                    \n"
                "beq    10f                         \n"

#if __ARM_FEATURE_DOTPROD
                // from
                //      a0 b0 c0 d0
                //      a1 b1 c1 d1
                //      a2 b2 c2 d2
                //      a3 b3 c3 d3
                //      e0 f0 g0 h0
                //      e1 f1 g1 h1
                //      e2 f2 g2 h2
                //      e3 f3 g3 h3
                //      a4 b4 c4 d4
                //      a5 b5 c5 d5
                //      a6 b6 c6 d6
                //      a7 b7 c7 d7
                //      e4 f4 g4 h4
                //      e5 f5 g5 h5
                //      e6 f6 g6 h6
                //      e7 f7 g7 h7
                // if out_elempack == 4 / 8
                "cmp    %w11, #1                    \n"
                "beq    8f                          \n"

                // if out_elempack == 8
                "cmp    %w11, #8                    \n"
                "bne    7f                          \n"

                "st1    {v16.4s}, [%3], #16         \n"
                "st1    {v20.4s}, [%3], #16         \n"
                "st1    {v17.4s}, [%3], #16         \n"
                "st1    {v21.4s}, [%3], #16         \n"
                "st1    {v18.4s}, [%3], #16         \n"
                "st1    {v22.4s}, [%3], #16         \n"
                "st1    {v19.4s}, [%3], #16         \n"
                "st1    {v23.4s}, [%3], #16         \n"
                "st1    {v24.4s}, [%3], #16         \n"
                "st1    {v28.4s}, [%3], #16         \n"
                "st1    {v25.4s}, [%3], #16         \n"
                "st1    {v29.4s}, [%3], #16         \n"
                "st1    {v26.4s}, [%3], #16         \n"
                "st1    {v30.4s}, [%3], #16         \n"
                "st1    {v27.4s}, [%3], #16         \n"
                "st1    {v31.4s}, [%3], #16         \n"
                "b      9f                          \n"

                // if out_elempack == 4
                "7:                                 \n"
                "add    x4, %3, %12, lsl #4         \n"
                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%3], #64 \n"
                "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%3], #64 \n"
                "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [x4], #64 \n"
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [x4] \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"
                // to
                //      a0 a1 a2 a3
                //      a4 a5 a6 a7
                //      b0 b1 b2 b3
                //      b4 b5 b6 b7
                //      c0 c1 c2 c3
                //      c4 c5 c6 c7
                //      d0 d1 d2 d3
                //      d4 d5 d6 d7
                //      e0 e1 e2 e3
                //      e4 e5 e6 e7
                //      f0 f1 f2 f3
                //      f4 f5 f6 f7
                //      g0 g1 g2 g3
                //      g4 g5 g6 g7
                //      h0 h1 h2 h3
                //      h4 h5 h6 h7
                "zip1   v0.4s, v16.4s, v17.4s       \n"
                "zip2   v1.4s, v16.4s, v17.4s       \n"
                "zip1   v2.4s, v18.4s, v19.4s       \n"
                "zip2   v3.4s, v18.4s, v19.4s       \n"
                "zip1   v4.4s, v24.4s, v25.4s       \n"
                "zip2   v5.4s, v24.4s, v25.4s       \n"
                "zip1   v6.4s, v26.4s, v27.4s       \n"
                "zip2   v7.4s, v26.4s, v27.4s       \n"
                "zip1   v8.4s, v20.4s, v21.4s       \n"
                "zip2   v9.4s, v20.4s, v21.4s       \n"
                "zip1   v10.4s, v22.4s, v23.4s      \n"
                "zip2   v11.4s, v22.4s, v23.4s      \n"
                "zip1   v12.4s, v28.4s, v29.4s      \n"
                "zip2   v13.4s, v28.4s, v29.4s      \n"
                "zip1   v14.4s, v30.4s, v31.4s      \n"
                "zip2   v15.4s, v30.4s, v31.4s      \n"
                "zip1   v16.2d, v0.2d, v2.2d        \n"
                "zip1   v17.2d, v4.2d, v6.2d        \n"
                "zip2   v18.2d, v0.2d, v2.2d        \n"
                "zip2   v19.2d, v4.2d, v6.2d        \n"
                "zip1   v20.2d, v1.2d, v3.2d        \n"
                "zip1   v21.2d, v5.2d, v7.2d        \n"
                "zip2   v22.2d, v1.2d, v3.2d        \n"
                "zip2   v23.2d, v5.2d, v7.2d        \n"
                "zip1   v24.2d, v8.2d, v10.2d       \n"
                "zip1   v25.2d, v12.2d, v14.2d      \n"
                "zip2   v26.2d, v8.2d, v10.2d       \n"
                "zip2   v27.2d, v12.2d, v14.2d      \n"
                "zip1   v28.2d, v9.2d, v11.2d       \n"
                "zip1   v29.2d, v13.2d, v15.2d      \n"
                "zip2   v30.2d, v9.2d, v11.2d       \n"
                "zip2   v31.2d, v13.2d, v15.2d      \n"

                "add    x4, %3, %12, lsl #2         \n"
                "st1    {v16.4s, v17.4s}, [%3], #32 \n"
                "st1    {v18.4s, v19.4s}, [x4]      \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v20.4s, v21.4s}, [x4]      \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v22.4s, v23.4s}, [x4]      \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v24.4s, v25.4s}, [x4]      \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v26.4s, v27.4s}, [x4]      \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v28.4s, v29.4s}, [x4]      \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v30.4s, v31.4s}, [x4]      \n"
#else  // __ARM_FEATURE_DOTPROD

                // from
                //      a0 b1 c2 d3
                //      e4 f5 g6 h7
                //      e0 f1 g2 h3
                //      a4 b5 c6 d7
                //      c0 d1 a2 b3
                //      g4 h5 e6 f7
                //      g0 h1 e2 f3
                //      c4 d5 a6 b7
                //      a3 b2 c1 d0
                //      e7 f6 g5 h4
                //      e3 f2 g1 h0
                //      a7 b6 c5 d4
                //      c3 d2 a1 b0
                //      g7 h6 e5 f4
                //      g3 h2 e1 f0
                //      c7 d6 a5 b4
                // if out_elempack == 4 / 8
                "cmp    %w11, #1                    \n"
                "beq    8f                          \n"

                "rev64  v24.4s, v24.4s              \n"
                "rev64  v25.4s, v25.4s              \n"
                "rev64  v26.4s, v26.4s              \n"
                "rev64  v27.4s, v27.4s              \n"
                "rev64  v28.4s, v28.4s              \n"
                "rev64  v29.4s, v29.4s              \n"
                "rev64  v30.4s, v30.4s              \n"
                "rev64  v31.4s, v31.4s              \n"
                "ext    v24.16b, v24.16b, v24.16b, #8 \n"
                "ext    v25.16b, v25.16b, v25.16b, #8 \n"
                "ext    v26.16b, v26.16b, v26.16b, #8 \n"
                "ext    v27.16b, v27.16b, v27.16b, #8 \n"
                "ext    v28.16b, v28.16b, v28.16b, #8 \n"
                "ext    v29.16b, v29.16b, v29.16b, #8 \n"
                "ext    v30.16b, v30.16b, v30.16b, #8 \n"
                "ext    v31.16b, v31.16b, v31.16b, #8 \n"
                "zip1   v0.4s, v16.4s, v28.4s       \n"
                "zip2   v1.4s, v16.4s, v28.4s       \n"
                "zip1   v2.4s, v20.4s, v24.4s       \n"
                "zip2   v3.4s, v20.4s, v24.4s       \n"
                "zip1   v4.4s, v18.4s, v30.4s       \n"
                "zip2   v5.4s, v18.4s, v30.4s       \n"
                "zip1   v6.4s, v22.4s, v26.4s       \n"
                "zip2   v7.4s, v22.4s, v26.4s       \n"
                "zip1   v8.4s, v19.4s, v31.4s       \n"
                "zip2   v9.4s, v19.4s, v31.4s       \n"
                "zip1   v10.4s, v23.4s, v27.4s      \n"
                "zip2   v11.4s, v23.4s, v27.4s      \n"
                "zip1   v12.4s, v17.4s, v29.4s      \n"
                "zip2   v13.4s, v17.4s, v29.4s      \n"
                "zip1   v14.4s, v21.4s, v25.4s      \n"
                "zip2   v15.4s, v21.4s, v25.4s      \n"

                // if out_elempack == 8
                "cmp    %w11, #8                    \n"
                "bne    7f                          \n"

                // to
                //      a0 b0 c0 d0
                //      e0 f0 g0 h0
                //      a1 b1 c1 d1
                //      e1 f1 g1 h1
                //      a2 b2 c2 d2
                //      e2 f2 g2 h2
                //      a3 b3 c3 d3
                //      e3 f3 g3 h3
                //      a4 b4 c4 d4
                //      e4 f4 g4 h4
                //      a5 b5 c5 d5
                //      e5 f5 g5 h5
                //      a6 b6 c6 d6
                //      e6 f6 g6 h6
                //      a7 b7 c7 d7
                //      e7 f7 g7 h7
                "zip1   v16.2d, v0.2d, v2.2d        \n"
                "zip1   v17.2d, v4.2d, v6.2d        \n"
                "zip2   v18.2d, v0.2d, v2.2d        \n"
                "zip2   v19.2d, v4.2d, v6.2d        \n"
                "zip1   v20.2d, v3.2d, v1.2d        \n"
                "zip1   v21.2d, v7.2d, v5.2d        \n"
                "zip2   v22.2d, v3.2d, v1.2d        \n"
                "zip2   v23.2d, v7.2d, v5.2d        \n"
                "zip1   v24.2d, v8.2d, v10.2d       \n"
                "zip1   v25.2d, v12.2d, v14.2d      \n"
                "zip2   v26.2d, v8.2d, v10.2d       \n"
                "zip2   v27.2d, v12.2d, v14.2d      \n"
                "zip1   v28.2d, v11.2d, v9.2d       \n"
                "zip1   v29.2d, v15.2d, v13.2d      \n"
                "zip2   v30.2d, v11.2d, v9.2d       \n"
                "zip2   v31.2d, v15.2d, v13.2d      \n"
                "rev64  v18.4s, v18.4s              \n"
                "rev64  v19.4s, v19.4s              \n"
                "rev64  v22.4s, v22.4s              \n"
                "rev64  v23.4s, v23.4s              \n"
                "rev64  v26.4s, v26.4s              \n"
                "rev64  v27.4s, v27.4s              \n"
                "rev64  v30.4s, v30.4s              \n"
                "rev64  v31.4s, v31.4s              \n"

                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%3], #64 \n"
                "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%3], #64 \n"
                "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%3], #64 \n"
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%3], #64 \n"
                "b      9f                          \n"

                // if out_elempack == 4
                "7:                                 \n"
                // to
                //      a0 b0 c0 d0
                //      a1 b1 c1 d1
                //      a2 b2 c2 d2
                //      a3 b3 c3 d3
                //      a4 b4 c4 d4
                //      a5 b5 c5 d5
                //      a6 b6 c6 d6
                //      a7 b7 c7 d7
                //      e0 f0 g0 h0
                //      e1 f1 g1 h1
                //      e2 f2 g2 h2
                //      e3 f3 g3 h3
                //      e4 f4 g4 h4
                //      e5 f5 g5 h5
                //      e6 f6 g6 h6
                //      e7 f7 g7 h7
                "zip1   v16.2d, v0.2d, v2.2d        \n"
                "zip1   v24.2d, v4.2d, v6.2d        \n"
                "zip2   v17.2d, v0.2d, v2.2d        \n"
                "zip2   v25.2d, v4.2d, v6.2d        \n"
                "zip1   v18.2d, v3.2d, v1.2d        \n"
                "zip1   v26.2d, v7.2d, v5.2d        \n"
                "zip2   v19.2d, v3.2d, v1.2d        \n"
                "zip2   v27.2d, v7.2d, v5.2d        \n"
                "zip1   v20.2d, v8.2d, v10.2d       \n"
                "zip1   v28.2d, v12.2d, v14.2d      \n"
                "zip2   v21.2d, v8.2d, v10.2d       \n"
                "zip2   v29.2d, v12.2d, v14.2d      \n"
                "zip1   v22.2d, v11.2d, v9.2d       \n"
                "zip1   v30.2d, v15.2d, v13.2d      \n"
                "zip2   v23.2d, v11.2d, v9.2d       \n"
                "zip2   v31.2d, v15.2d, v13.2d      \n"
                "rev64  v17.4s, v17.4s              \n"
                "rev64  v19.4s, v19.4s              \n"
                "rev64  v21.4s, v21.4s              \n"
                "rev64  v23.4s, v23.4s              \n"
                "rev64  v25.4s, v25.4s              \n"
                "rev64  v27.4s, v27.4s              \n"
                "rev64  v29.4s, v29.4s              \n"
                "rev64  v31.4s, v31.4s              \n"

                "add    x4, %3, %12, lsl #4         \n"
                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%3], #64 \n"
                "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%3], #64 \n"
                "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [x4], #64 \n"
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [x4] \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"
                // to
                //      a0 a1 a2 a3
                //      a4 a5 a6 a7
                //      b0 b1 b2 b3
                //      b4 b5 b6 b7
                //      c0 c1 c2 c3
                //      c4 c5 c6 c7
                //      d0 d1 d2 d3
                //      d4 d5 d6 d7
                //      e0 e1 e2 e3
                //      e4 e5 e6 e7
                //      f0 f1 f2 f3
                //      f4 f5 f6 f7
                //      g0 g1 g2 g3
                //      g4 g5 g6 g7
                //      h0 h1 h2 h3
                //      h4 h5 h6 h7
                "ext    v20.16b, v20.16b, v20.16b, #8 \n"
                "ext    v21.16b, v21.16b, v21.16b, #8 \n"
                "ext    v22.16b, v22.16b, v22.16b, #8 \n"
                "ext    v23.16b, v23.16b, v23.16b, #8 \n"
                "ext    v28.16b, v28.16b, v28.16b, #8 \n"
                "ext    v29.16b, v29.16b, v29.16b, #8 \n"
                "ext    v30.16b, v30.16b, v30.16b, #8 \n"
                "ext    v31.16b, v31.16b, v31.16b, #8 \n"
                "zip1   v0.4s, v16.4s, v28.4s       \n"
                "zip2   v1.4s, v16.4s, v28.4s       \n"
                "zip1   v2.4s, v20.4s, v24.4s       \n"
                "zip2   v3.4s, v20.4s, v24.4s       \n"
                "zip1   v4.4s, v19.4s, v31.4s       \n"
                "zip2   v5.4s, v19.4s, v31.4s       \n"
                "zip1   v6.4s, v23.4s, v27.4s       \n"
                "zip2   v7.4s, v23.4s, v27.4s       \n"
                "zip1   v8.4s, v18.4s, v30.4s       \n"
                "zip2   v9.4s, v18.4s, v30.4s       \n"
                "zip1   v10.4s, v22.4s, v26.4s      \n"
                "zip2   v11.4s, v22.4s, v26.4s      \n"
                "zip1   v12.4s, v17.4s, v29.4s      \n"
                "zip2   v13.4s, v17.4s, v29.4s      \n"
                "zip1   v14.4s, v21.4s, v25.4s      \n"
                "zip2   v15.4s, v21.4s, v25.4s      \n"
                "zip1   v16.2d, v0.2d, v2.2d        \n"
                "zip1   v17.2d, v4.2d, v6.2d        \n"
                "zip2   v18.2d, v0.2d, v2.2d        \n"
                "zip2   v19.2d, v4.2d, v6.2d        \n"
                "zip1   v20.2d, v3.2d, v1.2d        \n"
                "zip1   v21.2d, v7.2d, v5.2d        \n"
                "zip2   v22.2d, v3.2d, v1.2d        \n"
                "zip2   v23.2d, v7.2d, v5.2d        \n"
                "zip1   v24.2d, v8.2d, v10.2d       \n"
                "zip1   v25.2d, v12.2d, v14.2d      \n"
                "zip2   v26.2d, v8.2d, v10.2d       \n"
                "zip2   v27.2d, v12.2d, v14.2d      \n"
                "zip1   v28.2d, v11.2d, v9.2d       \n"
                "zip1   v29.2d, v15.2d, v13.2d      \n"
                "zip2   v30.2d, v11.2d, v9.2d       \n"
                "zip2   v31.2d, v15.2d, v13.2d      \n"
                "rev64  v18.4s, v18.4s              \n"
                "rev64  v19.4s, v19.4s              \n"
                "rev64  v22.4s, v22.4s              \n"
                "rev64  v23.4s, v23.4s              \n"
                "rev64  v26.4s, v26.4s              \n"
                "rev64  v27.4s, v27.4s              \n"
                "rev64  v30.4s, v30.4s              \n"
                "rev64  v31.4s, v31.4s              \n"

                "add    x4, %3, %12, lsl #2         \n"
                "st1    {v16.4s, v17.4s}, [%3], #32 \n"
                "st1    {v18.4s, v19.4s}, [x4]      \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v20.4s, v21.4s}, [x4]      \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v22.4s, v23.4s}, [x4]      \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v24.4s, v25.4s}, [x4]      \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v26.4s, v27.4s}, [x4]      \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v28.4s, v29.4s}, [x4]      \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v30.4s, v31.4s}, [x4]      \n"
#endif // __ARM_FEATURE_DOTPROD

                "9:                                 \n"
                "add    %0, %0, #256                \n"
                "b      11f                         \n"

                "10:                                \n"
                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                "11:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(max_kk),       // %8
                "r"(k),            // %9
                "r"(k_end),        // %10
                "r"(out_elempack), // %11
                "r"(out_hstep)     // %12
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else // NCNN_GNU_INLINE_ASM
            int32x4_t _sum0;
            int32x4_t _sum1;
            int32x4_t _sum2;
            int32x4_t _sum3;
            int32x4_t _sum4;
            int32x4_t _sum5;
            int32x4_t _sum6;
            int32x4_t _sum7;
            int32x4_t _sum8;
            int32x4_t _sum9;
            int32x4_t _suma;
            int32x4_t _sumb;
            int32x4_t _sumc;
            int32x4_t _sumd;
            int32x4_t _sume;
            int32x4_t _sumf;

#if __ARM_FEATURE_MATMUL_INT8
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
                _sum2 = vdupq_n_s32(0);
                _sum3 = vdupq_n_s32(0);
                _sum4 = vdupq_n_s32(0);
                _sum5 = vdupq_n_s32(0);
                _sum6 = vdupq_n_s32(0);
                _sum7 = vdupq_n_s32(0);
                _sum8 = vdupq_n_s32(0);
                _sum9 = vdupq_n_s32(0);
                _suma = vdupq_n_s32(0);
                _sumb = vdupq_n_s32(0);
                _sumc = vdupq_n_s32(0);
                _sumd = vdupq_n_s32(0);
                _sume = vdupq_n_s32(0);
                _sumf = vdupq_n_s32(0);
            }
#else  // __ARM_FEATURE_MATMUL_INT8
            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
                _sum2 = vdupq_n_s32(0);
                _sum3 = vdupq_n_s32(0);
                _sum4 = vdupq_n_s32(0);
                _sum5 = vdupq_n_s32(0);
                _sum6 = vdupq_n_s32(0);
                _sum7 = vdupq_n_s32(0);
                _sum8 = vdupq_n_s32(0);
                _sum9 = vdupq_n_s32(0);
                _suma = vdupq_n_s32(0);
                _sumb = vdupq_n_s32(0);
                _sumc = vdupq_n_s32(0);
                _sumd = vdupq_n_s32(0);
                _sume = vdupq_n_s32(0);
                _sumf = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
                _sum2 = vld1q_s32(outptr + 8);
                _sum3 = vld1q_s32(outptr + 12);
                _sum4 = vld1q_s32(outptr + 16);
                _sum5 = vld1q_s32(outptr + 20);
                _sum6 = vld1q_s32(outptr + 24);
                _sum7 = vld1q_s32(outptr + 28);
                _sum8 = vld1q_s32(outptr + 32);
                _sum9 = vld1q_s32(outptr + 36);
                _suma = vld1q_s32(outptr + 40);
                _sumb = vld1q_s32(outptr + 44);
                _sumc = vld1q_s32(outptr + 48);
                _sumd = vld1q_s32(outptr + 52);
                _sume = vld1q_s32(outptr + 56);
                _sumf = vld1q_s32(outptr + 60);
            }
#endif // __ARM_FEATURE_MATMUL_INT8

            int kk = 0;
#if __ARM_FEATURE_MATMUL_INT8
            {
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA0 = vld1q_s8(pA);
                    int8x16_t _pA1 = vld1q_s8(pA + 16);
                    int8x16_t _pA2 = vld1q_s8(pA + 32);
                    int8x16_t _pA3 = vld1q_s8(pA + 48);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);
                    int8x16_t _pB2 = vld1q_s8(pB + 32);
                    int8x16_t _pB3 = vld1q_s8(pB + 48);

                    _sum0 = vmmlaq_s32(_sum0, _pA0, _pB0);
                    _sum1 = vmmlaq_s32(_sum1, _pA1, _pB0);
                    _sum2 = vmmlaq_s32(_sum2, _pA0, _pB1);
                    _sum3 = vmmlaq_s32(_sum3, _pA1, _pB1);
                    _sum4 = vmmlaq_s32(_sum4, _pA2, _pB0);
                    _sum5 = vmmlaq_s32(_sum5, _pA3, _pB0);
                    _sum6 = vmmlaq_s32(_sum6, _pA2, _pB1);
                    _sum7 = vmmlaq_s32(_sum7, _pA3, _pB1);
                    _sum8 = vmmlaq_s32(_sum8, _pA0, _pB2);
                    _sum9 = vmmlaq_s32(_sum9, _pA1, _pB2);
                    _suma = vmmlaq_s32(_suma, _pA0, _pB3);
                    _sumb = vmmlaq_s32(_sumb, _pA1, _pB3);
                    _sumc = vmmlaq_s32(_sumc, _pA2, _pB2);
                    _sumd = vmmlaq_s32(_sumd, _pA3, _pB2);
                    _sume = vmmlaq_s32(_sume, _pA2, _pB3);
                    _sumf = vmmlaq_s32(_sumf, _pA3, _pB3);

                    pA += 64;
                    pB += 64;
                }

                int32x4x2_t _ss0 = vuzpq_s32(_sum0, _sum1);
                int32x4x2_t _ss1 = vuzpq_s32(_sum2, _sum3);
                int32x4x2_t _ss2 = vuzpq_s32(_sum4, _sum5);
                int32x4x2_t _ss3 = vuzpq_s32(_sum6, _sum7);
                int32x4x2_t _ss4 = vuzpq_s32(_sum8, _sum9);
                int32x4x2_t _ss5 = vuzpq_s32(_suma, _sumb);
                int32x4x2_t _ss6 = vuzpq_s32(_sumc, _sumd);
                int32x4x2_t _ss7 = vuzpq_s32(_sume, _sumf);

                if (k == 0)
                {
                    _sum0 = _ss0.val[0];
                    _sum1 = _ss0.val[1];
                    _sum2 = _ss1.val[0];
                    _sum3 = _ss1.val[1];
                    _sum4 = _ss2.val[0];
                    _sum5 = _ss2.val[1];
                    _sum6 = _ss3.val[0];
                    _sum7 = _ss3.val[1];
                    _sum8 = _ss4.val[0];
                    _sum9 = _ss4.val[1];
                    _suma = _ss5.val[0];
                    _sumb = _ss5.val[1];
                    _sumc = _ss6.val[0];
                    _sumd = _ss6.val[1];
                    _sume = _ss7.val[0];
                    _sumf = _ss7.val[1];
                }
                else
                {
                    _sum0 = vld1q_s32(outptr);
                    _sum1 = vld1q_s32(outptr + 4);
                    _sum2 = vld1q_s32(outptr + 8);
                    _sum3 = vld1q_s32(outptr + 12);
                    _sum4 = vld1q_s32(outptr + 16);
                    _sum5 = vld1q_s32(outptr + 20);
                    _sum6 = vld1q_s32(outptr + 24);
                    _sum7 = vld1q_s32(outptr + 28);
                    _sum8 = vld1q_s32(outptr + 32);
                    _sum9 = vld1q_s32(outptr + 36);
                    _suma = vld1q_s32(outptr + 40);
                    _sumb = vld1q_s32(outptr + 44);
                    _sumc = vld1q_s32(outptr + 48);
                    _sumd = vld1q_s32(outptr + 52);
                    _sume = vld1q_s32(outptr + 56);
                    _sumf = vld1q_s32(outptr + 60);

                    _sum0 = vaddq_s32(_sum0, _ss0.val[0]);
                    _sum1 = vaddq_s32(_sum1, _ss0.val[1]);
                    _sum2 = vaddq_s32(_sum2, _ss1.val[0]);
                    _sum3 = vaddq_s32(_sum3, _ss1.val[1]);
                    _sum4 = vaddq_s32(_sum4, _ss2.val[0]);
                    _sum5 = vaddq_s32(_sum5, _ss2.val[1]);
                    _sum6 = vaddq_s32(_sum6, _ss3.val[0]);
                    _sum7 = vaddq_s32(_sum7, _ss3.val[1]);
                    _sum8 = vaddq_s32(_sum8, _ss4.val[0]);
                    _sum9 = vaddq_s32(_sum9, _ss4.val[1]);
                    _suma = vaddq_s32(_suma, _ss5.val[0]);
                    _sumb = vaddq_s32(_sumb, _ss5.val[1]);
                    _sumc = vaddq_s32(_sumc, _ss6.val[0]);
                    _sumd = vaddq_s32(_sumd, _ss6.val[1]);
                    _sume = vaddq_s32(_sume, _ss7.val[0]);
                    _sumf = vaddq_s32(_sumf, _ss7.val[1]);
                }
            }
#elif __ARM_FEATURE_DOTPROD
            for (; kk + 7 < max_kk; kk += 8)
            {
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA1 = vld1q_s8(pA + 16);
                int8x16_t _pA2 = vld1q_s8(pA + 32);
                int8x16_t _pA3 = vld1q_s8(pA + 48);
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB1 = vld1q_s8(pB + 16);
                int8x16_t _pB2 = vld1q_s8(pB + 32);
                int8x16_t _pB3 = vld1q_s8(pB + 48);

                // aaaa bbbb cccc dddd    eeee ffff gggg hhhh

                // 0000 1111 2222 3333    4444 5555 6666 7777
                _sum0 = vdotq_laneq_s32(_sum0, _pA0, _pB0, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA0, _pB0, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA0, _pB0, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA0, _pB0, 3);
                _sum4 = vdotq_laneq_s32(_sum4, _pA1, _pB0, 0);
                _sum5 = vdotq_laneq_s32(_sum5, _pA1, _pB0, 1);
                _sum6 = vdotq_laneq_s32(_sum6, _pA1, _pB0, 2);
                _sum7 = vdotq_laneq_s32(_sum7, _pA1, _pB0, 3);
                _sum8 = vdotq_laneq_s32(_sum8, _pA0, _pB1, 0);
                _sum9 = vdotq_laneq_s32(_sum9, _pA0, _pB1, 1);
                _suma = vdotq_laneq_s32(_suma, _pA0, _pB1, 2);
                _sumb = vdotq_laneq_s32(_sumb, _pA0, _pB1, 3);
                _sumc = vdotq_laneq_s32(_sumc, _pA1, _pB1, 0);
                _sumd = vdotq_laneq_s32(_sumd, _pA1, _pB1, 1);
                _sume = vdotq_laneq_s32(_sume, _pA1, _pB1, 2);
                _sumf = vdotq_laneq_s32(_sumf, _pA1, _pB1, 3);

                _sum0 = vdotq_laneq_s32(_sum0, _pA2, _pB2, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA2, _pB2, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA2, _pB2, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA2, _pB2, 3);
                _sum4 = vdotq_laneq_s32(_sum4, _pA3, _pB2, 0);
                _sum5 = vdotq_laneq_s32(_sum5, _pA3, _pB2, 1);
                _sum6 = vdotq_laneq_s32(_sum6, _pA3, _pB2, 2);
                _sum7 = vdotq_laneq_s32(_sum7, _pA3, _pB2, 3);
                _sum8 = vdotq_laneq_s32(_sum8, _pA2, _pB3, 0);
                _sum9 = vdotq_laneq_s32(_sum9, _pA2, _pB3, 1);
                _suma = vdotq_laneq_s32(_suma, _pA2, _pB3, 2);
                _sumb = vdotq_laneq_s32(_sumb, _pA2, _pB3, 3);
                _sumc = vdotq_laneq_s32(_sumc, _pA3, _pB3, 0);
                _sumd = vdotq_laneq_s32(_sumd, _pA3, _pB3, 1);
                _sume = vdotq_laneq_s32(_sume, _pA3, _pB3, 2);
                _sumf = vdotq_laneq_s32(_sumf, _pA3, _pB3, 3);

                pA += 64;
                pB += 64;
            }
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA1 = vld1q_s8(pA + 16);
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB1 = vld1q_s8(pB + 16);

                // aaaa bbbb cccc dddd    eeee ffff gggg hhhh

                // 0000 1111 2222 3333    4444 5555 6666 7777
                _sum0 = vdotq_laneq_s32(_sum0, _pA0, _pB0, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA0, _pB0, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA0, _pB0, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA0, _pB0, 3);
                _sum4 = vdotq_laneq_s32(_sum4, _pA1, _pB0, 0);
                _sum5 = vdotq_laneq_s32(_sum5, _pA1, _pB0, 1);
                _sum6 = vdotq_laneq_s32(_sum6, _pA1, _pB0, 2);
                _sum7 = vdotq_laneq_s32(_sum7, _pA1, _pB0, 3);
                _sum8 = vdotq_laneq_s32(_sum8, _pA0, _pB1, 0);
                _sum9 = vdotq_laneq_s32(_sum9, _pA0, _pB1, 1);
                _suma = vdotq_laneq_s32(_suma, _pA0, _pB1, 2);
                _sumb = vdotq_laneq_s32(_sumb, _pA0, _pB1, 3);
                _sumc = vdotq_laneq_s32(_sumc, _pA1, _pB1, 0);
                _sumd = vdotq_laneq_s32(_sumd, _pA1, _pB1, 1);
                _sume = vdotq_laneq_s32(_sume, _pA1, _pB1, 2);
                _sumf = vdotq_laneq_s32(_sumf, _pA1, _pB1, 3);

#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA2 = vld1q_s8(pA + 16);
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB2 = vld1q_s8(pB + 16);

                // aabbccdd eeffgghh
                // ccddaabb gghheeff

                int8x16_t _pA1 = vreinterpretq_s8_s32(vrev64q_s32(vreinterpretq_s32_s8(_pA0)));

                // 00112233 44556677
                // 33221100 77665544

                int8x16_t _pB1 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB0)));

                // aabbccdd eeffgghh
                // ccddaabb gghheeff

                int8x16_t _pA3 = vreinterpretq_s8_s32(vrev64q_s32(vreinterpretq_s32_s8(_pA2)));

                // 00112233 44556677
                // 33221100 77665544

                int8x16_t _pB3 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB2)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB0));
                int16x8_t _s1 = vmull_s8(vget_high_s8(_pA0), vget_high_s8(_pB0));
                int16x8_t _s2 = vmull_s8(vget_high_s8(_pA0), vget_low_s8(_pB0));
                int16x8_t _s3 = vmull_s8(vget_low_s8(_pA0), vget_high_s8(_pB0));
                int16x8_t _s4 = vmull_s8(vget_low_s8(_pA1), vget_low_s8(_pB0));
                int16x8_t _s5 = vmull_s8(vget_high_s8(_pA1), vget_high_s8(_pB0));
                int16x8_t _s6 = vmull_s8(vget_high_s8(_pA1), vget_low_s8(_pB0));
                int16x8_t _s7 = vmull_s8(vget_low_s8(_pA1), vget_high_s8(_pB0));
                int16x8_t _s8 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB1));
                int16x8_t _s9 = vmull_s8(vget_high_s8(_pA0), vget_high_s8(_pB1));
                int16x8_t _sa = vmull_s8(vget_high_s8(_pA0), vget_low_s8(_pB1));
                int16x8_t _sb = vmull_s8(vget_low_s8(_pA0), vget_high_s8(_pB1));
                int16x8_t _sc = vmull_s8(vget_low_s8(_pA1), vget_low_s8(_pB1));
                int16x8_t _sd = vmull_s8(vget_high_s8(_pA1), vget_high_s8(_pB1));
                int16x8_t _se = vmull_s8(vget_high_s8(_pA1), vget_low_s8(_pB1));
                int16x8_t _sf = vmull_s8(vget_low_s8(_pA1), vget_high_s8(_pB1));

                _s0 = vmlal_s8(_s0, vget_low_s8(_pA2), vget_low_s8(_pB2));
                _s1 = vmlal_s8(_s1, vget_high_s8(_pA2), vget_high_s8(_pB2));
                _s2 = vmlal_s8(_s2, vget_high_s8(_pA2), vget_low_s8(_pB2));
                _s3 = vmlal_s8(_s3, vget_low_s8(_pA2), vget_high_s8(_pB2));
                _s4 = vmlal_s8(_s4, vget_low_s8(_pA3), vget_low_s8(_pB2));
                _s5 = vmlal_s8(_s5, vget_high_s8(_pA3), vget_high_s8(_pB2));
                _s6 = vmlal_s8(_s6, vget_high_s8(_pA3), vget_low_s8(_pB2));
                _s7 = vmlal_s8(_s7, vget_low_s8(_pA3), vget_high_s8(_pB2));
                _s8 = vmlal_s8(_s8, vget_low_s8(_pA2), vget_low_s8(_pB3));
                _s9 = vmlal_s8(_s9, vget_high_s8(_pA2), vget_high_s8(_pB3));
                _sa = vmlal_s8(_sa, vget_high_s8(_pA2), vget_low_s8(_pB3));
                _sb = vmlal_s8(_sb, vget_low_s8(_pA2), vget_high_s8(_pB3));
                _sc = vmlal_s8(_sc, vget_low_s8(_pA3), vget_low_s8(_pB3));
                _sd = vmlal_s8(_sd, vget_high_s8(_pA3), vget_high_s8(_pB3));
                _se = vmlal_s8(_se, vget_high_s8(_pA3), vget_low_s8(_pB3));
                _sf = vmlal_s8(_sf, vget_low_s8(_pA3), vget_high_s8(_pB3));

                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
                _sum8 = vpadalq_s16(_sum8, _s8);
                _sum9 = vpadalq_s16(_sum9, _s9);
                _suma = vpadalq_s16(_suma, _sa);
                _sumb = vpadalq_s16(_sumb, _sb);
                _sumc = vpadalq_s16(_sumc, _sc);
                _sumd = vpadalq_s16(_sumd, _sd);
                _sume = vpadalq_s16(_sume, _se);
                _sumf = vpadalq_s16(_sumf, _sf);
#endif // __ARM_FEATURE_DOTPROD

                pA += 32;
                pB += 32;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                int8x16_t _pB = vld1q_s8(pB);

                // aabbccdd eeffgghh

                // 00112233 44556677

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB)), 0)));
                int16x8_t _s1 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB)), 1)));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB)), 2)));
                int16x8_t _s3 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB)), 3)));
                int16x8_t _s4 = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB)), 0)));
                int16x8_t _s5 = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB)), 1)));
                int16x8_t _s6 = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB)), 2)));
                int16x8_t _s7 = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB)), 3)));
                int16x8_t _s8 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB)), 0)));
                int16x8_t _s9 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB)), 1)));
                int16x8_t _sa = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB)), 2)));
                int16x8_t _sb = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB)), 3)));
                int16x8_t _sc = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB)), 0)));
                int16x8_t _sd = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB)), 1)));
                int16x8_t _se = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB)), 2)));
                int16x8_t _sf = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB)), 3)));

                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
                _sum8 = vpadalq_s16(_sum8, _s8);
                _sum9 = vpadalq_s16(_sum9, _s9);
                _suma = vpadalq_s16(_suma, _sa);
                _sumb = vpadalq_s16(_sumb, _sb);
                _sumc = vpadalq_s16(_sumc, _sc);
                _sumd = vpadalq_s16(_sumd, _sd);
                _sume = vpadalq_s16(_sume, _se);
                _sumf = vpadalq_s16(_sumf, _sf);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pB0 = vld1q_s8(pB);

                // aabbccdd eeffgghh

                // ccddaabb gghheeff

                int8x16_t _pA1 = vreinterpretq_s8_s32(vrev64q_s32(vreinterpretq_s32_s8(_pA0)));

                // 00112233 44556677

                // 33221100 77665544

                int8x16_t _pB1 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB0)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB0));
                int16x8_t _s1 = vmull_s8(vget_high_s8(_pA0), vget_high_s8(_pB0));
                int16x8_t _s2 = vmull_s8(vget_high_s8(_pA0), vget_low_s8(_pB0));
                int16x8_t _s3 = vmull_s8(vget_low_s8(_pA0), vget_high_s8(_pB0));
                int16x8_t _s4 = vmull_s8(vget_low_s8(_pA1), vget_low_s8(_pB0));
                int16x8_t _s5 = vmull_s8(vget_high_s8(_pA1), vget_high_s8(_pB0));
                int16x8_t _s6 = vmull_s8(vget_high_s8(_pA1), vget_low_s8(_pB0));
                int16x8_t _s7 = vmull_s8(vget_low_s8(_pA1), vget_high_s8(_pB0));
                int16x8_t _s8 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB1));
                int16x8_t _s9 = vmull_s8(vget_high_s8(_pA0), vget_high_s8(_pB1));
                int16x8_t _sa = vmull_s8(vget_high_s8(_pA0), vget_low_s8(_pB1));
                int16x8_t _sb = vmull_s8(vget_low_s8(_pA0), vget_high_s8(_pB1));
                int16x8_t _sc = vmull_s8(vget_low_s8(_pA1), vget_low_s8(_pB1));
                int16x8_t _sd = vmull_s8(vget_high_s8(_pA1), vget_high_s8(_pB1));
                int16x8_t _se = vmull_s8(vget_high_s8(_pA1), vget_low_s8(_pB1));
                int16x8_t _sf = vmull_s8(vget_low_s8(_pA1), vget_high_s8(_pB1));

                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
                _sum8 = vpadalq_s16(_sum8, _s8);
                _sum9 = vpadalq_s16(_sum9, _s9);
                _suma = vpadalq_s16(_suma, _sa);
                _sumb = vpadalq_s16(_sumb, _sb);
                _sumc = vpadalq_s16(_sumc, _sc);
                _sumd = vpadalq_s16(_sumd, _sd);
                _sume = vpadalq_s16(_sume, _se);
                _sumf = vpadalq_s16(_sumf, _sf);
#endif // __ARM_FEATURE_DOTPROD

                pA += 16;
                pB += 16;
            }
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vld1_s8(pA);
                // int8x8_t _pB0 = vld1_s8(pB);
                // (no vector load of pB is needed: pB[0] .. pB[7] are each broadcast with vdup_n_s8 below)

                // abcd efgh
                // 0123 4567

                int16x8_t _s01 = vmull_s8(_pA, vdup_n_s8(pB[0]));
                int16x8_t _s23 = vmull_s8(_pA, vdup_n_s8(pB[1]));
                int16x8_t _s45 = vmull_s8(_pA, vdup_n_s8(pB[2]));
                int16x8_t _s67 = vmull_s8(_pA, vdup_n_s8(pB[3]));
                int16x8_t _s89 = vmull_s8(_pA, vdup_n_s8(pB[4]));
                int16x8_t _sab = vmull_s8(_pA, vdup_n_s8(pB[5]));
                int16x8_t _scd = vmull_s8(_pA, vdup_n_s8(pB[6]));
                int16x8_t _sef = vmull_s8(_pA, vdup_n_s8(pB[7]));

                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01));
                _sum1 = vaddw_s16(_sum1, vget_low_s16(_s23));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s45));
                _sum3 = vaddw_s16(_sum3, vget_low_s16(_s67));
                _sum4 = vaddw_s16(_sum4, vget_high_s16(_s01));
                _sum5 = vaddw_s16(_sum5, vget_high_s16(_s23));
                _sum6 = vaddw_s16(_sum6, vget_high_s16(_s45));
                _sum7 = vaddw_s16(_sum7, vget_high_s16(_s67));
                _sum8 = vaddw_s16(_sum8, vget_low_s16(_s89));
                _sum9 = vaddw_s16(_sum9, vget_low_s16(_sab));
                _suma = vaddw_s16(_suma, vget_low_s16(_scd));
                _sumb = vaddw_s16(_sumb, vget_low_s16(_sef));
                _sumc = vaddw_s16(_sumc, vget_high_s16(_s89));
                _sumd = vaddw_s16(_sumd, vget_high_s16(_sab));
                _sume = vaddw_s16(_sume, vget_high_s16(_scd));
                _sumf = vaddw_s16(_sumf, vget_high_s16(_sef));
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vld1_s8(pA);
                int8x8_t _pB0 = vld1_s8(pB);

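                // abcd efgh    _pA0
                // efgh abcd    _pA1 (the two 4-byte halves swapped)
                // cdab ghef    _pA2 (16-bit pairs swapped within each 4-byte half)
                // ghef cdab    _pA3 (16-bit pairs in reverse order across all 8 bytes)

                // 0123 4567    _pB0
                // 3210 7654    _pB1 (bytes reversed within each 4-byte half)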

                // abcdefgh  ->  ghefcdab  ->  cdabghef

                int8x8_t _pA1 = vext_s8(_pA0, _pA0, 4);
                int8x8_t _pA2 = vreinterpret_s8_s16(vrev32_s16(vreinterpret_s16_s8(_pA0)));
                int8x8_t _pA3 = vreinterpret_s8_s16(vrev64_s16(vreinterpret_s16_s8(_pA0)));

                // 01234567  ->  32107654

                int8x8_t _pB1 = vrev32_s8(_pB0);

                int16x8_t _s01 = vmull_s8(_pA0, _pB0);
                int16x8_t _s23 = vmull_s8(_pA1, _pB0);
                int16x8_t _s45 = vmull_s8(_pA2, _pB0);
                int16x8_t _s67 = vmull_s8(_pA3, _pB0);
                int16x8_t _s89 = vmull_s8(_pA0, _pB1);
                int16x8_t _sab = vmull_s8(_pA1, _pB1);
                int16x8_t _scd = vmull_s8(_pA2, _pB1);
                int16x8_t _sef = vmull_s8(_pA3, _pB1);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s01));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s23));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s23));
                _sum4 = vaddw_s16(_sum4, vget_low_s16(_s45));
                _sum5 = vaddw_s16(_sum5, vget_high_s16(_s45));
                _sum6 = vaddw_s16(_sum6, vget_low_s16(_s67));
                _sum7 = vaddw_s16(_sum7, vget_high_s16(_s67));
                _sum8 = vaddw_s16(_sum8, vget_low_s16(_s89));
                _sum9 = vaddw_s16(_sum9, vget_high_s16(_s89));
                _suma = vaddw_s16(_suma, vget_low_s16(_sab));
                _sumb = vaddw_s16(_sumb, vget_high_s16(_sab));
                _sumc = vaddw_s16(_sumc, vget_low_s16(_scd));
                _sumd = vaddw_s16(_sumd, vget_high_s16(_scd));
                _sume = vaddw_s16(_sume, vget_low_s16(_sef));
                _sumf = vaddw_s16(_sumf, vget_high_s16(_sef));
#endif // __ARM_FEATURE_DOTPROD

                pA += 8;
                pB += 8;
            }

            if (k_end)
            {
#if __ARM_FEATURE_DOTPROD
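                // from (one row per accumulator, _sum0 first and _sumf last;
                //       letter a..h = output row, digit 0..7 = output column)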
                //      a0 b0 c0 d0
                //      a1 b1 c1 d1
                //      a2 b2 c2 d2
                //      a3 b3 c3 d3
                //      e0 f0 g0 h0
                //      e1 f1 g1 h1
                //      e2 f2 g2 h2
                //      e3 f3 g3 h3
                //      a4 b4 c4 d4
                //      a5 b5 c5 d5
                //      a6 b6 c6 d6
                //      a7 b7 c7 d7
                //      e4 f4 g4 h4
                //      e5 f5 g5 h5
                //      e6 f6 g6 h6
                //      e7 f7 g7 h7
                if (out_elempack == 8)
                {
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum4);
                    vst1q_s32(outptr0 + 8, _sum1);
                    vst1q_s32(outptr0 + 12, _sum5);
                    vst1q_s32(outptr0 + 16, _sum2);
                    vst1q_s32(outptr0 + 20, _sum6);
                    vst1q_s32(outptr0 + 24, _sum3);
                    vst1q_s32(outptr0 + 28, _sum7);
                    vst1q_s32(outptr0 + 32, _sum8);
                    vst1q_s32(outptr0 + 36, _sumc);
                    vst1q_s32(outptr0 + 40, _sum9);
                    vst1q_s32(outptr0 + 44, _sumd);
                    vst1q_s32(outptr0 + 48, _suma);
                    vst1q_s32(outptr0 + 52, _sume);
                    vst1q_s32(outptr0 + 56, _sumb);
                    vst1q_s32(outptr0 + 60, _sumf);
                    outptr0 += 64;
                }
                if (out_elempack == 4)
                {
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + 8, _sum2);
                    vst1q_s32(outptr0 + 12, _sum3);
                    vst1q_s32(outptr0 + 16, _sum8);
                    vst1q_s32(outptr0 + 20, _sum9);
                    vst1q_s32(outptr0 + 24, _suma);
                    vst1q_s32(outptr0 + 28, _sumb);
                    vst1q_s32(outptr0 + out_hstep * 4, _sum4);
                    vst1q_s32(outptr0 + out_hstep * 4 + 4, _sum5);
                    vst1q_s32(outptr0 + out_hstep * 4 + 8, _sum6);
                    vst1q_s32(outptr0 + out_hstep * 4 + 12, _sum7);
                    vst1q_s32(outptr0 + out_hstep * 4 + 16, _sumc);
                    vst1q_s32(outptr0 + out_hstep * 4 + 20, _sumd);
                    vst1q_s32(outptr0 + out_hstep * 4 + 24, _sume);
                    vst1q_s32(outptr0 + out_hstep * 4 + 28, _sumf);
                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    // to
                    //      a0 a1 a2 a3
                    //      a4 a5 a6 a7
                    //      b0 b1 b2 b3
                    //      b4 b5 b6 b7
                    //      c0 c1 c2 c3
                    //      c4 c5 c6 c7
                    //      d0 d1 d2 d3
                    //      d4 d5 d6 d7
                    //      e0 e1 e2 e3
                    //      e4 e5 e6 e7
                    //      f0 f1 f2 f3
                    //      f4 f5 f6 f7
                    //      g0 g1 g2 g3
                    //      g4 g5 g6 g7
                    //      h0 h1 h2 h3
                    //      h4 h5 h6 h7
                    {
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sum1);
                        int32x4x2_t _t1 = vzipq_s32(_sum2, _sum3);
                        int32x4x2_t _t2 = vzipq_s32(_sum8, _sum9);
                        int32x4x2_t _t3 = vzipq_s32(_suma, _sumb);
                        int32x4x2_t _t4 = vzipq_s32(_sum4, _sum5);
                        int32x4x2_t _t5 = vzipq_s32(_sum6, _sum7);
                        int32x4x2_t _t6 = vzipq_s32(_sumc, _sumd);
                        int32x4x2_t _t7 = vzipq_s32(_sume, _sumf);
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                        _sum1 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                        _sum2 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                        _sum3 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                        _sum4 = vcombine_s32(vget_low_s32(_t0.val[1]), vget_low_s32(_t1.val[1]));
                        _sum5 = vcombine_s32(vget_low_s32(_t2.val[1]), vget_low_s32(_t3.val[1]));
                        _sum6 = vcombine_s32(vget_high_s32(_t0.val[1]), vget_high_s32(_t1.val[1]));
                        _sum7 = vcombine_s32(vget_high_s32(_t2.val[1]), vget_high_s32(_t3.val[1]));
                        _sum8 = vcombine_s32(vget_low_s32(_t4.val[0]), vget_low_s32(_t5.val[0]));
                        _sum9 = vcombine_s32(vget_low_s32(_t6.val[0]), vget_low_s32(_t7.val[0]));
                        _suma = vcombine_s32(vget_high_s32(_t4.val[0]), vget_high_s32(_t5.val[0]));
                        _sumb = vcombine_s32(vget_high_s32(_t6.val[0]), vget_high_s32(_t7.val[0]));
                        _sumc = vcombine_s32(vget_low_s32(_t4.val[1]), vget_low_s32(_t5.val[1]));
                        _sumd = vcombine_s32(vget_low_s32(_t6.val[1]), vget_low_s32(_t7.val[1]));
                        _sume = vcombine_s32(vget_high_s32(_t4.val[1]), vget_high_s32(_t5.val[1]));
                        _sumf = vcombine_s32(vget_high_s32(_t6.val[1]), vget_high_s32(_t7.val[1]));
                    }

                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + out_hstep, _sum2);
                    vst1q_s32(outptr0 + out_hstep + 4, _sum3);
                    vst1q_s32(outptr0 + out_hstep * 2, _sum4);
                    vst1q_s32(outptr0 + out_hstep * 2 + 4, _sum5);
                    vst1q_s32(outptr0 + out_hstep * 3, _sum6);
                    vst1q_s32(outptr0 + out_hstep * 3 + 4, _sum7);
                    vst1q_s32(outptr0 + out_hstep * 4, _sum8);
                    vst1q_s32(outptr0 + out_hstep * 4 + 4, _sum9);
                    vst1q_s32(outptr0 + out_hstep * 5, _suma);
                    vst1q_s32(outptr0 + out_hstep * 5 + 4, _sumb);
                    vst1q_s32(outptr0 + out_hstep * 6, _sumc);
                    vst1q_s32(outptr0 + out_hstep * 6 + 4, _sumd);
                    vst1q_s32(outptr0 + out_hstep * 7, _sume);
                    vst1q_s32(outptr0 + out_hstep * 7 + 4, _sumf);
                    outptr0 += 8;
                }
#else  // __ARM_FEATURE_DOTPROD

                // from
                //      a0 b1 c2 d3
                //      e4 f5 g6 h7
                //      e0 f1 g2 h3
                //      a4 b5 c6 d7
                //      c0 d1 a2 b3
                //      g4 h5 e6 f7
                //      g0 h1 e2 f3
                //      c4 d5 a6 b7
                //      a3 b2 c1 d0
                //      e7 f6 g5 h4
                //      e3 f2 g1 h0
                //      a7 b6 c5 d4
                //      c3 d2 a1 b0
                //      g7 h6 e5 f4
                //      g3 h2 e1 f0
                //      c7 d6 a5 b4
                // out_elempack == 8 case: rearrange the 16 accumulators from the
                // diagonal lane order listed under "from" into the order listed
                // under "to", then store them as 16 consecutive 4-lane vectors.
                if (out_elempack == 8)
                {
                    // to
                    //      a0 b0 c0 d0
                    //      e0 f0 g0 h0
                    //      a1 b1 c1 d1
                    //      e1 f1 g1 h1
                    //      a2 b2 c2 d2
                    //      e2 f2 g2 h2
                    //      a3 b3 c3 d3
                    //      e3 f3 g3 h3
                    //      a4 b4 c4 d4
                    //      e4 f4 g4 h4
                    //      a5 b5 c5 d5
                    //      e5 f5 g5 h5
                    //      a6 b6 c6 d6
                    //      e6 f6 g6 h6
                    //      a7 b7 c7 d7
                    //      e7 f7 g7 h7
                    {
                        // Fully reverse the four lanes of _sum8.._sumf:
                        // vrev64q swaps the two lanes inside each 64-bit half ...
                        _sum8 = vrev64q_s32(_sum8);
                        _sum9 = vrev64q_s32(_sum9);
                        _suma = vrev64q_s32(_suma);
                        _sumb = vrev64q_s32(_sumb);
                        _sumc = vrev64q_s32(_sumc);
                        _sumd = vrev64q_s32(_sumd);
                        _sume = vrev64q_s32(_sume);
                        _sumf = vrev64q_s32(_sumf);
                        // ... and vextq by 2 lanes then swaps the two halves.
                        _sum8 = vextq_s32(_sum8, _sum8, 2);
                        _sum9 = vextq_s32(_sum9, _sum9, 2);
                        _suma = vextq_s32(_suma, _suma, 2);
                        _sumb = vextq_s32(_sumb, _sumb, 2);
                        _sumc = vextq_s32(_sumc, _sumc, 2);
                        _sumd = vextq_s32(_sumd, _sumd, 2);
                        _sume = vextq_s32(_sume, _sume, 2);
                        _sumf = vextq_s32(_sumf, _sumf, 2);
                        // Interleave each untouched accumulator with the reversed one
                        // that holds the complementary elements, e.g.
                        // _t0.val[0] = a0 b0 b1 a1 and _t1.val[0] = c0 d0 d1 c1.
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sumc);
                        int32x4x2_t _t1 = vzipq_s32(_sum4, _sum8);
                        int32x4x2_t _t2 = vzipq_s32(_sum2, _sume);
                        int32x4x2_t _t3 = vzipq_s32(_sum6, _suma);
                        int32x4x2_t _t4 = vzipq_s32(_sum3, _sumf);
                        int32x4x2_t _t5 = vzipq_s32(_sum7, _sumb);
                        int32x4x2_t _t6 = vzipq_s32(_sum1, _sumd);
                        int32x4x2_t _t7 = vzipq_s32(_sum5, _sum9);
                        // Glue matching 64-bit halves together; _t0.._t3 feed the
                        // columns 0..3 and _t4.._t7 feed the columns 4..7. For the
                        // val[1] halves the operand order is swapped so that the
                        // a/b (or e/f) pair lands in the low half.
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                        _sum1 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                        _sum2 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                        _sum3 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                        _sum4 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                        _sum5 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                        _sum6 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                        _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                        _sum8 = vcombine_s32(vget_low_s32(_t4.val[0]), vget_low_s32(_t5.val[0]));
                        _sum9 = vcombine_s32(vget_low_s32(_t6.val[0]), vget_low_s32(_t7.val[0]));
                        _suma = vcombine_s32(vget_high_s32(_t4.val[0]), vget_high_s32(_t5.val[0]));
                        _sumb = vcombine_s32(vget_high_s32(_t6.val[0]), vget_high_s32(_t7.val[0]));
                        _sumc = vcombine_s32(vget_low_s32(_t5.val[1]), vget_low_s32(_t4.val[1]));
                        _sumd = vcombine_s32(vget_low_s32(_t7.val[1]), vget_low_s32(_t6.val[1]));
                        _sume = vcombine_s32(vget_high_s32(_t5.val[1]), vget_high_s32(_t4.val[1]));
                        _sumf = vcombine_s32(vget_high_s32(_t7.val[1]), vget_high_s32(_t6.val[1]));
                        // Vectors assembled from the high halves come out with each
                        // adjacent lane pair swapped (e.g. b1 a1 d1 c1); swap them back.
                        _sum2 = vrev64q_s32(_sum2);
                        _sum3 = vrev64q_s32(_sum3);
                        _sum6 = vrev64q_s32(_sum6);
                        _sum7 = vrev64q_s32(_sum7);
                        _suma = vrev64q_s32(_suma);
                        _sumb = vrev64q_s32(_sumb);
                        _sume = vrev64q_s32(_sume);
                        _sumf = vrev64q_s32(_sumf);
                    }

                    // All 64 values go to one contiguous run starting at outptr0.
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + 8, _sum2);
                    vst1q_s32(outptr0 + 12, _sum3);
                    vst1q_s32(outptr0 + 16, _sum4);
                    vst1q_s32(outptr0 + 20, _sum5);
                    vst1q_s32(outptr0 + 24, _sum6);
                    vst1q_s32(outptr0 + 28, _sum7);
                    vst1q_s32(outptr0 + 32, _sum8);
                    vst1q_s32(outptr0 + 36, _sum9);
                    vst1q_s32(outptr0 + 40, _suma);
                    vst1q_s32(outptr0 + 44, _sumb);
                    vst1q_s32(outptr0 + 48, _sumc);
                    vst1q_s32(outptr0 + 52, _sumd);
                    vst1q_s32(outptr0 + 56, _sume);
                    vst1q_s32(outptr0 + 60, _sumf);
                    // Step past the 64 int32 values just written.
                    outptr0 += 64;
                }
                // out_elempack == 4 case: rearrange the 16 accumulators from the
                // diagonal lane order listed under "from" into two groups of eight
                // vectors (a..d and e..h) and store each group in its own region.
                if (out_elempack == 4)
                {
                    // to
                    //      a0 b0 c0 d0
                    //      a1 b1 c1 d1
                    //      a2 b2 c2 d2
                    //      a3 b3 c3 d3
                    //      a4 b4 c4 d4
                    //      a5 b5 c5 d5
                    //      a6 b6 c6 d6
                    //      a7 b7 c7 d7
                    //      e0 f0 g0 h0
                    //      e1 f1 g1 h1
                    //      e2 f2 g2 h2
                    //      e3 f3 g3 h3
                    //      e4 f4 g4 h4
                    //      e5 f5 g5 h5
                    //      e6 f6 g6 h6
                    //      e7 f7 g7 h7
                    {
                        // Fully reverse the four lanes of _sum8.._sumf:
                        // vrev64q swaps the two lanes inside each 64-bit half ...
                        _sum8 = vrev64q_s32(_sum8);
                        _sum9 = vrev64q_s32(_sum9);
                        _suma = vrev64q_s32(_suma);
                        _sumb = vrev64q_s32(_sumb);
                        _sumc = vrev64q_s32(_sumc);
                        _sumd = vrev64q_s32(_sumd);
                        _sume = vrev64q_s32(_sume);
                        _sumf = vrev64q_s32(_sumf);
                        // ... and vextq by 2 lanes then swaps the two halves.
                        _sum8 = vextq_s32(_sum8, _sum8, 2);
                        _sum9 = vextq_s32(_sum9, _sum9, 2);
                        _suma = vextq_s32(_suma, _suma, 2);
                        _sumb = vextq_s32(_sumb, _sumb, 2);
                        _sumc = vextq_s32(_sumc, _sumc, 2);
                        _sumd = vextq_s32(_sumd, _sumd, 2);
                        _sume = vextq_s32(_sume, _sume, 2);
                        _sumf = vextq_s32(_sumf, _sumf, 2);
                        // Interleave each untouched accumulator with the reversed one
                        // that holds the complementary elements, e.g.
                        // _t0.val[0] = a0 b0 b1 a1 and _t1.val[0] = c0 d0 d1 c1.
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sumc);
                        int32x4x2_t _t1 = vzipq_s32(_sum4, _sum8);
                        int32x4x2_t _t2 = vzipq_s32(_sum2, _sume);
                        int32x4x2_t _t3 = vzipq_s32(_sum6, _suma);
                        int32x4x2_t _t4 = vzipq_s32(_sum3, _sumf);
                        int32x4x2_t _t5 = vzipq_s32(_sum7, _sumb);
                        int32x4x2_t _t6 = vzipq_s32(_sum1, _sumd);
                        int32x4x2_t _t7 = vzipq_s32(_sum5, _sum9);
                        // Glue matching 64-bit halves together. _sum0.._sum7 collect
                        // the a..d vectors (from _t0/_t1 and _t4/_t5), _sum8.._sumf
                        // collect the e..h vectors (from _t2/_t3 and _t6/_t7).
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                        _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                        _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                        _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                        _sum4 = vcombine_s32(vget_low_s32(_t4.val[0]), vget_low_s32(_t5.val[0]));
                        _sum5 = vcombine_s32(vget_high_s32(_t4.val[0]), vget_high_s32(_t5.val[0]));
                        _sum6 = vcombine_s32(vget_low_s32(_t5.val[1]), vget_low_s32(_t4.val[1]));
                        _sum7 = vcombine_s32(vget_high_s32(_t5.val[1]), vget_high_s32(_t4.val[1]));
                        _sum8 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                        _sum9 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                        _suma = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                        _sumb = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                        _sumc = vcombine_s32(vget_low_s32(_t6.val[0]), vget_low_s32(_t7.val[0]));
                        _sumd = vcombine_s32(vget_high_s32(_t6.val[0]), vget_high_s32(_t7.val[0]));
                        _sume = vcombine_s32(vget_low_s32(_t7.val[1]), vget_low_s32(_t6.val[1]));
                        _sumf = vcombine_s32(vget_high_s32(_t7.val[1]), vget_high_s32(_t6.val[1]));
                        // Vectors assembled from the high halves (the odd-numbered
                        // ones here) come out with each adjacent lane pair swapped
                        // (e.g. b1 a1 d1 c1); swap them back.
                        _sum1 = vrev64q_s32(_sum1);
                        _sum3 = vrev64q_s32(_sum3);
                        _sum5 = vrev64q_s32(_sum5);
                        _sum7 = vrev64q_s32(_sum7);
                        _sum9 = vrev64q_s32(_sum9);
                        _sumb = vrev64q_s32(_sumb);
                        _sumd = vrev64q_s32(_sumd);
                        _sumf = vrev64q_s32(_sumf);
                    }

                    // First group (a..d): 32 contiguous int32 values at outptr0.
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + 8, _sum2);
                    vst1q_s32(outptr0 + 12, _sum3);
                    vst1q_s32(outptr0 + 16, _sum4);
                    vst1q_s32(outptr0 + 20, _sum5);
                    vst1q_s32(outptr0 + 24, _sum6);
                    vst1q_s32(outptr0 + 28, _sum7);
                    // Second group (e..h): same shape, offset by out_hstep * 4 elements.
                    vst1q_s32(outptr0 + out_hstep * 4, _sum8);
                    vst1q_s32(outptr0 + out_hstep * 4 + 4, _sum9);
                    vst1q_s32(outptr0 + out_hstep * 4 + 8, _suma);
                    vst1q_s32(outptr0 + out_hstep * 4 + 12, _sumb);
                    vst1q_s32(outptr0 + out_hstep * 4 + 16, _sumc);
                    vst1q_s32(outptr0 + out_hstep * 4 + 20, _sumd);
                    vst1q_s32(outptr0 + out_hstep * 4 + 24, _sume);
                    vst1q_s32(outptr0 + out_hstep * 4 + 28, _sumf);
                    // Step past the 32 int32 values written in each group.
                    outptr0 += 32;
                }
                // out_elempack == 1 case: rearrange the 16 accumulators from the
                // diagonal lane order listed under "from" so that each of a..h
                // occupies two vectors (indices 0..3 and 4..7), then store every
                // letter at its own out_hstep offset.
                if (out_elempack == 1)
                {
                    // to
                    //      a0 a1 a2 a3
                    //      a4 a5 a6 a7
                    //      b0 b1 b2 b3
                    //      b4 b5 b6 b7
                    //      c0 c1 c2 c3
                    //      c4 c5 c6 c7
                    //      d0 d1 d2 d3
                    //      d4 d5 d6 d7
                    //      e0 e1 e2 e3
                    //      e4 e5 e6 e7
                    //      f0 f1 f2 f3
                    //      f4 f5 f6 f7
                    //      g0 g1 g2 g3
                    //      g4 g5 g6 g7
                    //      h0 h1 h2 h3
                    //      h4 h5 h6 h7
                    {
                        // Swap the 64-bit halves of the accumulators whose a/b (or
                        // e/f) elements sit in the upper half, so that they line up
                        // with their zip partners below (e.g. c0 d1 a2 b3 -> a2 b3 c0 d1).
                        _sum4 = vextq_s32(_sum4, _sum4, 2);
                        _sum5 = vextq_s32(_sum5, _sum5, 2);
                        _sum6 = vextq_s32(_sum6, _sum6, 2);
                        _sum7 = vextq_s32(_sum7, _sum7, 2);
                        _sumc = vextq_s32(_sumc, _sumc, 2);
                        _sumd = vextq_s32(_sumd, _sumd, 2);
                        _sume = vextq_s32(_sume, _sume, 2);
                        _sumf = vextq_s32(_sumf, _sumf, 2);
                        // Interleave partner accumulators, e.g.
                        // _t0.val[0] = a0 a1 b1 b0 and _t1.val[0] = a2 a3 b3 b2.
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sumc);
                        int32x4x2_t _t1 = vzipq_s32(_sum4, _sum8);
                        int32x4x2_t _t2 = vzipq_s32(_sum3, _sumf);
                        int32x4x2_t _t3 = vzipq_s32(_sum7, _sumb);
                        int32x4x2_t _t4 = vzipq_s32(_sum2, _sume);
                        int32x4x2_t _t5 = vzipq_s32(_sum6, _suma);
                        int32x4x2_t _t6 = vzipq_s32(_sum1, _sumd);
                        int32x4x2_t _t7 = vzipq_s32(_sum5, _sum9);
                        // Glue matching 64-bit halves together. _t0.._t3 produce the
                        // a..d vectors and _t4.._t7 the e..h vectors; for the val[1]
                        // halves the operand order is swapped so indices stay ascending.
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                        _sum1 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                        _sum2 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                        _sum3 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                        _sum4 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                        _sum5 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                        _sum6 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                        _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                        _sum8 = vcombine_s32(vget_low_s32(_t4.val[0]), vget_low_s32(_t5.val[0]));
                        _sum9 = vcombine_s32(vget_low_s32(_t6.val[0]), vget_low_s32(_t7.val[0]));
                        _suma = vcombine_s32(vget_high_s32(_t4.val[0]), vget_high_s32(_t5.val[0]));
                        _sumb = vcombine_s32(vget_high_s32(_t6.val[0]), vget_high_s32(_t7.val[0]));
                        _sumc = vcombine_s32(vget_low_s32(_t5.val[1]), vget_low_s32(_t4.val[1]));
                        _sumd = vcombine_s32(vget_low_s32(_t7.val[1]), vget_low_s32(_t6.val[1]));
                        _sume = vcombine_s32(vget_high_s32(_t5.val[1]), vget_high_s32(_t4.val[1]));
                        _sumf = vcombine_s32(vget_high_s32(_t7.val[1]), vget_high_s32(_t6.val[1]));
                        // Vectors assembled from the high halves come out with each
                        // adjacent lane pair swapped (e.g. b1 b0 b3 b2); swap them back.
                        _sum2 = vrev64q_s32(_sum2);
                        _sum3 = vrev64q_s32(_sum3);
                        _sum6 = vrev64q_s32(_sum6);
                        _sum7 = vrev64q_s32(_sum7);
                        _suma = vrev64q_s32(_suma);
                        _sumb = vrev64q_s32(_sumb);
                        _sume = vrev64q_s32(_sume);
                        _sumf = vrev64q_s32(_sumf);
                    }

                    // Each pair of vectors (8 int32 values) is written at
                    // outptr0 + out_hstep * r for r = 0..7.
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + out_hstep, _sum2);
                    vst1q_s32(outptr0 + out_hstep + 4, _sum3);
                    vst1q_s32(outptr0 + out_hstep * 2, _sum4);
                    vst1q_s32(outptr0 + out_hstep * 2 + 4, _sum5);
                    vst1q_s32(outptr0 + out_hstep * 3, _sum6);
                    vst1q_s32(outptr0 + out_hstep * 3 + 4, _sum7);
                    vst1q_s32(outptr0 + out_hstep * 4, _sum8);
                    vst1q_s32(outptr0 + out_hstep * 4 + 4, _sum9);
                    vst1q_s32(outptr0 + out_hstep * 5, _suma);
                    vst1q_s32(outptr0 + out_hstep * 5 + 4, _sumb);
                    vst1q_s32(outptr0 + out_hstep * 6, _sumc);
                    vst1q_s32(outptr0 + out_hstep * 6 + 4, _sumd);
                    vst1q_s32(outptr0 + out_hstep * 7, _sume);
                    vst1q_s32(outptr0 + out_hstep * 7 + 4, _sumf);
                    // Step past the 8 int32 values written at each offset.
                    outptr0 += 8;
                }
#endif // __ARM_FEATURE_DOTPROD
            }
            else
            {
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                vst1q_s32(outptr + 8, _sum2);
                vst1q_s32(outptr + 12, _sum3);
                vst1q_s32(outptr + 16, _sum4);
                vst1q_s32(outptr + 20, _sum5);
                vst1q_s32(outptr + 24, _sum6);
                vst1q_s32(outptr + 28, _sum7);
                vst1q_s32(outptr + 32, _sum8);
                vst1q_s32(outptr + 36, _sum9);
                vst1q_s32(outptr + 40, _suma);
                vst1q_s32(outptr + 44, _sumb);
                vst1q_s32(outptr + 48, _sumc);
                vst1q_s32(outptr + 52, _sumd);
                vst1q_s32(outptr + 56, _sume);
                vst1q_s32(outptr + 60, _sumf);
            }

            outptr += 64;
#endif // NCNN_GNU_INLINE_ASM
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            const signed char* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                "cmp    %w9, #0                     \n"
                "beq    0f                          \n"

                "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0] \n"
                "sub    %0, %0, #64                 \n"
                "b      1f                          \n"

                "0:                                 \n"
                "eor    v16.16b, v16.16b, v16.16b   \n"
                "eor    v17.16b, v17.16b, v17.16b   \n"
                "eor    v18.16b, v18.16b, v18.16b   \n"
                "eor    v19.16b, v19.16b, v19.16b   \n"
                "eor    v20.16b, v20.16b, v20.16b   \n"
                "eor    v21.16b, v21.16b, v21.16b   \n"
                "eor    v22.16b, v22.16b, v22.16b   \n"
                "eor    v23.16b, v23.16b, v23.16b   \n"

                "1:                                 \n"
#if __ARM_FEATURE_DOTPROD
                "lsr    w4, %w8, #3                 \n" // w4 = max_kk >> 3
                "cmp    w4, #0                      \n"
                "beq    101f                        \n"

#if __ARM_FEATURE_MATMUL_INT8
                "eor    v24.16b, v24.16b, v24.16b   \n"
                "eor    v25.16b, v25.16b, v25.16b   \n"
                "eor    v26.16b, v26.16b, v26.16b   \n"
                "eor    v27.16b, v27.16b, v27.16b   \n"
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v29.16b, v29.16b, v29.16b   \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"
                "eor    v31.16b, v31.16b, v31.16b   \n"
#endif // __ARM_FEATURE_MATMUL_INT8

                "2:                                 \n"
                "ld1    {v0.16b, v1.16b, v2.16b, v3.16b}, [%1], #64 \n"
                "ld1    {v4.16b, v5.16b}, [%2], #32 \n"

#if __ARM_FEATURE_MATMUL_INT8
                "smmla  v24.4s, v0.16b, v4.16b      \n"
                "smmla  v25.4s, v1.16b, v4.16b      \n"
                "smmla  v26.4s, v0.16b, v5.16b      \n"
                "smmla  v27.4s, v1.16b, v5.16b      \n"
                "subs   w4, w4, #1                  \n"
                "smmla  v28.4s, v2.16b, v4.16b      \n"
                "smmla  v29.4s, v3.16b, v4.16b      \n"
                "smmla  v30.4s, v2.16b, v5.16b      \n"
                "smmla  v31.4s, v3.16b, v5.16b      \n"
#else  // __ARM_FEATURE_MATMUL_INT8
                "sdot   v16.4s, v0.16b, v4.4b[0]    \n"
                "sdot   v17.4s, v0.16b, v4.4b[1]    \n"
                "sdot   v18.4s, v0.16b, v4.4b[2]    \n"
                "sdot   v19.4s, v0.16b, v4.4b[3]    \n"
                "sdot   v20.4s, v1.16b, v4.4b[0]    \n"
                "sdot   v21.4s, v1.16b, v4.4b[1]    \n"
                "sdot   v22.4s, v1.16b, v4.4b[2]    \n"
                "sdot   v23.4s, v1.16b, v4.4b[3]    \n"
                "subs   w4, w4, #1                  \n"
                "sdot   v16.4s, v2.16b, v5.4b[0]    \n"
                "sdot   v17.4s, v2.16b, v5.4b[1]    \n"
                "sdot   v18.4s, v2.16b, v5.4b[2]    \n"
                "sdot   v19.4s, v2.16b, v5.4b[3]    \n"
                "sdot   v20.4s, v3.16b, v5.4b[0]    \n"
                "sdot   v21.4s, v3.16b, v5.4b[1]    \n"
                "sdot   v22.4s, v3.16b, v5.4b[2]    \n"
                "sdot   v23.4s, v3.16b, v5.4b[3]    \n"
#endif // __ARM_FEATURE_MATMUL_INT8
                "bne    2b                          \n"

#if __ARM_FEATURE_MATMUL_INT8
                "uzp1   v0.4s, v24.4s, v25.4s       \n"
                "uzp2   v1.4s, v24.4s, v25.4s       \n"
                "uzp1   v2.4s, v26.4s, v27.4s       \n"
                "uzp2   v3.4s, v26.4s, v27.4s       \n"
                "uzp1   v4.4s, v28.4s, v29.4s       \n"
                "uzp2   v5.4s, v28.4s, v29.4s       \n"
                "uzp1   v6.4s, v30.4s, v31.4s       \n"
                "uzp2   v7.4s, v30.4s, v31.4s       \n"

                "add    v16.4s, v16.4s, v0.4s       \n"
                "add    v17.4s, v17.4s, v1.4s       \n"
                "add    v18.4s, v18.4s, v2.4s       \n"
                "add    v19.4s, v19.4s, v3.4s       \n"
                "add    v20.4s, v20.4s, v4.4s       \n"
                "add    v21.4s, v21.4s, v5.4s       \n"
                "add    v22.4s, v22.4s, v6.4s       \n"
                "add    v23.4s, v23.4s, v7.4s       \n"
#endif // __ARM_FEATURE_MATMUL_INT8

                "101:                               \n"
                "and    w4, %w8, #4                 \n" // w4 = remain = max_kk & 4
                "cmp    w4, #0                      \n"
                "beq    3f                          \n"

                // kk += 4 part
                "ld1    {v0.16b, v1.16b}, [%1], #32 \n"
                "ld1    {v2.16b}, [%2], #16         \n"
                "sdot   v16.4s, v0.16b, v2.4b[0]    \n"
                "sdot   v17.4s, v0.16b, v2.4b[1]    \n"
                "sdot   v18.4s, v0.16b, v2.4b[2]    \n"
                "sdot   v19.4s, v0.16b, v2.4b[3]    \n"
                "sdot   v20.4s, v1.16b, v2.4b[0]    \n"
                "sdot   v21.4s, v1.16b, v2.4b[1]    \n"
                "sdot   v22.4s, v1.16b, v2.4b[2]    \n"
                "sdot   v23.4s, v1.16b, v2.4b[3]    \n"
#else  // __ARM_FEATURE_DOTPROD
                "lsr    w4, %w8, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    3f                          \n"

                "2:                                 \n"
                "ld1    {v0.16b, v1.16b}, [%1], #32 \n"
                "ld1    {v4.16b}, [%2], #16         \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "rev64  v2.4s, v0.4s                \n"
                "smull  v10.8h, v2.8b, v4.8b        \n"
                "ext    v5.16b, v4.16b, v4.16b, #8  \n"
                "smull2 v9.8h, v0.16b, v5.16b       \n"
                "rev64  v6.8h, v4.8h                \n"
                "smull2 v11.8h, v2.16b, v5.16b      \n"
                "ext    v7.16b, v6.16b, v6.16b, #8  \n"
                "smull  v12.8h, v0.8b, v6.8b        \n"
                "smull  v14.8h, v2.8b, v6.8b        \n"
                "rev64  v3.4s, v1.4s                \n"
                "smull2 v13.8h, v0.16b, v7.16b      \n"
                "smull2 v15.8h, v2.16b, v7.16b      \n"
                "smlal  v8.8h, v1.8b, v5.8b         \n"
                "smlal  v10.8h, v3.8b, v5.8b        \n"
                "smlal2 v9.8h, v1.16b, v4.16b       \n"
                "smlal2 v11.8h, v3.16b, v4.16b      \n"
                "smlal  v12.8h, v1.8b, v7.8b        \n"
                "smlal  v14.8h, v3.8b, v7.8b        \n"
                "smlal2 v13.8h, v1.16b, v6.16b      \n"
                "smlal2 v15.8h, v3.16b, v6.16b      \n"
                "subs   w4, w4, #1                  \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v19.4s, v11.8h              \n"
                "sadalp v20.4s, v12.8h              \n"
                "sadalp v22.4s, v14.8h              \n"
                "sadalp v21.4s, v13.8h              \n"
                "sadalp v23.4s, v15.8h              \n"
                "bne    2b                          \n"
#endif // __ARM_FEATURE_DOTPROD

                "3:                                 \n"
                "and    w4, %w8, #2                 \n" // w4 = remain = max_kk & 2
                "cmp    w4, #0                      \n"
                "beq    4f                          \n"

                // kk += 2 part
#if __ARM_FEATURE_DOTPROD
                "ld1    {v0.16b}, [%1], #16         \n"
                "ld1    {v1.8b}, [%2], #8           \n"
                "dup    v4.8h, v1.h[0]              \n"
                "dup    v5.8h, v1.h[1]              \n"
                "dup    v6.8h, v1.h[2]              \n"
                "dup    v7.8h, v1.h[3]              \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull  v9.8h, v0.8b, v5.8b         \n"
                "smull  v10.8h, v0.8b, v6.8b        \n"
                "smull  v11.8h, v0.8b, v7.8b        \n"
                "smull2 v12.8h, v0.16b, v4.16b      \n"
                "smull2 v13.8h, v0.16b, v5.16b      \n"
                "smull2 v14.8h, v0.16b, v6.16b      \n"
                "smull2 v15.8h, v0.16b, v7.16b      \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
                "sadalp v20.4s, v12.8h              \n"
                "sadalp v21.4s, v13.8h              \n"
                "sadalp v22.4s, v14.8h              \n"
                "sadalp v23.4s, v15.8h              \n"
#else  // __ARM_FEATURE_DOTPROD
                "ld1    {v0.16b}, [%1], #16         \n"
                "ld1r   {v2.2d}, [%2]               \n"
                "add    %2, %2, #8                  \n"
                "rev64  v1.4s, v0.4s                \n"
                "rev64  v3.8h, v2.8h                \n"
                "smull  v8.8h, v0.8b, v2.8b         \n"
                "smull2 v9.8h, v0.16b, v2.16b       \n"
                "smull  v10.8h, v1.8b, v2.8b        \n"
                "smull2 v11.8h, v1.16b, v2.16b      \n"
                "smull  v12.8h, v0.8b, v3.8b        \n"
                "smull2 v13.8h, v0.16b, v3.16b      \n"
                "smull  v14.8h, v1.8b, v3.8b        \n"
                "smull2 v15.8h, v1.16b, v3.16b      \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
                "sadalp v20.4s, v12.8h              \n"
                "sadalp v21.4s, v13.8h              \n"
                "sadalp v22.4s, v14.8h              \n"
                "sadalp v23.4s, v15.8h              \n"
#endif // __ARM_FEATURE_DOTPROD

                "4:                                 \n"
                "and    w4, %w8, #1                 \n" // w4 = remain = max_kk & 1
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // kk += 1 part
#if __ARM_FEATURE_DOTPROD
                "ld1    {v0.8b}, [%1], #8           \n"
                "ld1    {v1.8b}, [%2]               \n"
                "add    %2, %2, #4                  \n"
                "dup    v8.8b, v1.b[0]              \n"
                "dup    v9.8b, v1.b[1]              \n"
                "dup    v10.8b, v1.b[2]             \n"
                "dup    v11.8b, v1.b[3]             \n"
                "smull  v8.8h, v0.8b, v8.8b         \n"
                "smull  v9.8h, v0.8b, v9.8b         \n"
                "smull  v10.8h, v0.8b, v10.8b       \n"
                "smull  v11.8h, v0.8b, v11.8b       \n"
                "saddw  v16.4s, v16.4s, v8.4h       \n"
                "saddw  v17.4s, v17.4s, v9.4h       \n"
                "saddw  v18.4s, v18.4s, v10.4h      \n"
                "saddw  v19.4s, v19.4s, v11.4h      \n"
                "saddw2 v20.4s, v20.4s, v8.8h       \n"
                "saddw2 v21.4s, v21.4s, v9.8h       \n"
                "saddw2 v22.4s, v22.4s, v10.8h      \n"
                "saddw2 v23.4s, v23.4s, v11.8h      \n"
#else  // __ARM_FEATURE_DOTPROD
                "ld1    {v0.8b}, [%1], #8           \n"
                "ld1r   {v4.2s}, [%2]               \n"
                "add    %2, %2, #4                  \n"
                "rev32  v1.4h, v0.4h                \n"
                "rev64  v5.8b, v4.8b                \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull  v9.8h, v1.8b, v4.8b         \n"
                "smull  v10.8h, v0.8b, v5.8b        \n"
                "smull  v11.8h, v1.8b, v5.8b        \n"
                "saddw  v16.4s, v16.4s, v8.4h       \n"
                "saddw2 v17.4s, v17.4s, v8.8h       \n"
                "saddw  v18.4s, v18.4s, v9.4h       \n"
                "saddw2 v19.4s, v19.4s, v9.8h       \n"
                "saddw  v20.4s, v20.4s, v10.4h      \n"
                "saddw2 v21.4s, v21.4s, v10.8h      \n"
                "saddw  v22.4s, v22.4s, v11.4h      \n"
                "saddw2 v23.4s, v23.4s, v11.8h      \n"
#endif // __ARM_FEATURE_DOTPROD

                "5:                                 \n"
                "cmp    %w10, #0                    \n"
                "beq    10f                         \n"

#if __ARM_FEATURE_DOTPROD
                // from
                //      a0 b0 c0 d0
                //      a1 b1 c1 d1
                //      a2 b2 c2 d2
                //      a3 b3 c3 d3
                //      e0 f0 g0 h0
                //      e1 f1 g1 h1
                //      e2 f2 g2 h2
                //      e3 f3 g3 h3
                // if out_elempack == 4 / 8
                "cmp    %w11, #1                    \n"
                "beq    8f                          \n"

                // if out_elempack == 8
                "cmp    %w11, #8                    \n"
                "bne    7f                          \n"

                "st1    {v16.4s}, [%3], #16         \n"
                "st1    {v20.4s}, [%3], #16         \n"
                "st1    {v17.4s}, [%3], #16         \n"
                "st1    {v21.4s}, [%3], #16         \n"
                "st1    {v18.4s}, [%3], #16         \n"
                "st1    {v22.4s}, [%3], #16         \n"
                "st1    {v19.4s}, [%3], #16         \n"
                "st1    {v23.4s}, [%3], #16         \n"
                "b      9f                          \n"

                // if out_elempack == 4
                "7:                                 \n"
                "add    x4, %3, %12, lsl #4         \n"
                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%3], #64 \n"
                "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [x4] \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"
                // to
                //      a0 a1 a2 a3
                //      b0 b1 b2 b3
                //      c0 c1 c2 c3
                //      d0 d1 d2 d3
                //      e0 e1 e2 e3
                //      f0 f1 f2 f3
                //      g0 g1 g2 g3
                //      h0 h1 h2 h3
                "zip1   v0.4s, v16.4s, v17.4s       \n"
                "zip2   v1.4s, v16.4s, v17.4s       \n"
                "zip1   v2.4s, v18.4s, v19.4s       \n"
                "zip2   v3.4s, v18.4s, v19.4s       \n"
                "zip1   v4.4s, v20.4s, v21.4s       \n"
                "zip2   v5.4s, v20.4s, v21.4s       \n"
                "zip1   v6.4s, v22.4s, v23.4s       \n"
                "zip2   v7.4s, v22.4s, v23.4s       \n"
                "zip1   v16.2d, v0.2d, v2.2d        \n"
                "zip2   v17.2d, v0.2d, v2.2d        \n"
                "zip1   v18.2d, v1.2d, v3.2d        \n"
                "zip2   v19.2d, v1.2d, v3.2d        \n"
                "zip1   v20.2d, v4.2d, v6.2d        \n"
                "zip2   v21.2d, v4.2d, v6.2d        \n"
                "zip1   v22.2d, v5.2d, v7.2d        \n"
                "zip2   v23.2d, v5.2d, v7.2d        \n"

                "add    x4, %3, %12, lsl #2         \n"
                "st1    {v16.4s}, [%3], #16         \n"
                "st1    {v17.4s}, [x4]              \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v18.4s}, [x4]              \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v19.4s}, [x4]              \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v20.4s}, [x4]              \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v21.4s}, [x4]              \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v22.4s}, [x4]              \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v23.4s}, [x4]              \n"
#else  // __ARM_FEATURE_DOTPROD

                // from
                //      a0 b1 c2 d3
                //      e0 f1 g2 h3
                //      c0 d1 a2 b3
                //      g0 h1 e2 f3
                //      a3 b2 c1 d0
                //      e3 f2 g1 h0
                //      c3 d2 a1 b0
                //      g3 h2 e1 f0
                // if out_elempack == 4 / 8
                "cmp    %w11, #1                    \n"
                "beq    8f                          \n"

                "rev64  v20.4s, v20.4s              \n"
                "rev64  v21.4s, v21.4s              \n"
                "rev64  v22.4s, v22.4s              \n"
                "rev64  v23.4s, v23.4s              \n"
                "ext    v20.16b, v20.16b, v20.16b, #8 \n"
                "ext    v21.16b, v21.16b, v21.16b, #8 \n"
                "ext    v22.16b, v22.16b, v22.16b, #8 \n"
                "ext    v23.16b, v23.16b, v23.16b, #8 \n"
                "zip1   v0.4s, v16.4s, v22.4s       \n"
                "zip2   v1.4s, v16.4s, v22.4s       \n"
                "zip1   v2.4s, v18.4s, v20.4s       \n"
                "zip2   v3.4s, v18.4s, v20.4s       \n"
                "zip1   v4.4s, v17.4s, v23.4s       \n"
                "zip2   v5.4s, v17.4s, v23.4s       \n"
                "zip1   v6.4s, v19.4s, v21.4s       \n"
                "zip2   v7.4s, v19.4s, v21.4s       \n"

                // if out_elempack == 8
                "cmp    %w11, #8                    \n"
                "bne    7f                          \n"

                // to
                //      a0 b0 c0 d0
                //      e0 f0 g0 h0
                //      a1 b1 c1 d1
                //      e1 f1 g1 h1
                //      a2 b2 c2 d2
                //      e2 f2 g2 h2
                //      a3 b3 c3 d3
                //      e3 f3 g3 h3
                "zip1   v16.2d, v0.2d, v2.2d        \n"
                "zip1   v17.2d, v4.2d, v6.2d        \n"
                "zip2   v18.2d, v0.2d, v2.2d        \n"
                "zip2   v19.2d, v4.2d, v6.2d        \n"
                "zip1   v20.2d, v3.2d, v1.2d        \n"
                "zip1   v21.2d, v7.2d, v5.2d        \n"
                "zip2   v22.2d, v3.2d, v1.2d        \n"
                "zip2   v23.2d, v7.2d, v5.2d        \n"
                "rev64  v18.4s, v18.4s              \n"
                "rev64  v19.4s, v19.4s              \n"
                "rev64  v22.4s, v22.4s              \n"
                "rev64  v23.4s, v23.4s              \n"

                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%3], #64 \n"
                "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%3], #64 \n"
                "b      9f                          \n"

                // if out_elempack == 4
                "7:                                 \n"

                // to
                //      a0 b0 c0 d0
                //      a1 b1 c1 d1
                //      a2 b2 c2 d2
                //      a3 b3 c3 d3
                //      e0 f0 g0 h0
                //      e1 f1 g1 h1
                //      e2 f2 g2 h2
                //      e3 f3 g3 h3
                "zip1   v16.2d, v0.2d, v2.2d        \n"
                "zip1   v24.2d, v4.2d, v6.2d        \n"
                "zip2   v17.2d, v0.2d, v2.2d        \n"
                "zip2   v25.2d, v4.2d, v6.2d        \n"
                "zip1   v18.2d, v3.2d, v1.2d        \n"
                "zip1   v26.2d, v7.2d, v5.2d        \n"
                "zip2   v19.2d, v3.2d, v1.2d        \n"
                "zip2   v27.2d, v7.2d, v5.2d        \n"
                "rev64  v17.4s, v17.4s              \n"
                "rev64  v25.4s, v25.4s              \n"
                "rev64  v19.4s, v19.4s              \n"
                "rev64  v27.4s, v27.4s              \n"

                "add    x4, %3, %12, lsl #4         \n"
                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%3], #64 \n"
                "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [x4] \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"

                // to
                //      a0 a1 a2 a3
                //      b0 b1 b2 b3
                //      c0 c1 c2 c3
                //      d0 d1 d2 d3
                //      e0 e1 e2 e3
                //      f0 f1 f2 f3
                //      g0 g1 g2 g3
                //      h0 h1 h2 h3
                "ext    v18.16b, v18.16b, v18.16b, #8 \n"
                "ext    v19.16b, v19.16b, v19.16b, #8 \n"
                "ext    v22.16b, v22.16b, v22.16b, #8 \n"
                "ext    v23.16b, v23.16b, v23.16b, #8 \n"
                "zip1   v0.4s, v16.4s, v22.4s       \n"
                "zip2   v1.4s, v16.4s, v22.4s       \n"
                "zip1   v2.4s, v18.4s, v20.4s       \n"
                "zip2   v3.4s, v18.4s, v20.4s       \n"
                "zip1   v4.4s, v17.4s, v23.4s       \n"
                "zip2   v5.4s, v17.4s, v23.4s       \n"
                "zip1   v6.4s, v19.4s, v21.4s       \n"
                "zip2   v7.4s, v19.4s, v21.4s       \n"
                "zip1   v16.2d, v0.2d, v2.2d        \n"
                "zip2   v17.2d, v0.2d, v2.2d        \n"
                "zip1   v18.2d, v3.2d, v1.2d        \n"
                "zip2   v19.2d, v3.2d, v1.2d        \n"
                "zip1   v20.2d, v4.2d, v6.2d        \n"
                "zip2   v21.2d, v4.2d, v6.2d        \n"
                "zip1   v22.2d, v7.2d, v5.2d        \n"
                "zip2   v23.2d, v7.2d, v5.2d        \n"
                "rev64  v17.4s, v17.4s              \n"
                "rev64  v19.4s, v19.4s              \n"
                "rev64  v21.4s, v21.4s              \n"
                "rev64  v23.4s, v23.4s              \n"

                "add    x4, %3, %12, lsl #2         \n"
                "st1    {v16.4s}, [%3], #16         \n"
                "st1    {v17.4s}, [x4]              \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v18.4s}, [x4]              \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v19.4s}, [x4]              \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v20.4s}, [x4]              \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v21.4s}, [x4]              \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v22.4s}, [x4]              \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v23.4s}, [x4]              \n"
#endif // __ARM_FEATURE_DOTPROD

                "9:                                 \n"
                "add    %0, %0, #128                \n"
                "b      11f                         \n"

                "10:                                \n"
                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"

                "11:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(max_kk),       // %8
                "r"(k),            // %9
                "r"(k_end),        // %10
                "r"(out_elempack), // %11
                "r"(out_hstep)     // %12
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
            asm volatile(
                "cmp        %9, #0              \n"
                "beq        0f                  \n"

                "vldm       %0!, {d16-d23}      \n"
                "vldm       %0, {d24-d31}       \n"
                "sub        %0, %0, #64         \n"
                "b          1f                  \n"

                "0:                             \n"
                "veor       q8, q8              \n"
                "veor       q9, q9              \n"
                "veor       q10, q10            \n"
                "veor       q11, q11            \n"
                "veor       q12, q12            \n"
                "veor       q13, q13            \n"
                "veor       q14, q14            \n"
                "veor       q15, q15            \n"

                "1:                             \n"
                "lsr        r4, %8, #2          \n" // r4 = max_kk >> 2
                "cmp        r4, #0              \n"
                "beq        3f                  \n"

                ".align 4                       \n"
                "2:                             \n"
                "pld        [%1, #256]          \n"
                "vld1.s8    {d0-d3}, [%1 :64]!  \n"
                "pld        [%2, #128]          \n"
                "vld1.s8    {d4-d5}, [%2]!      \n"
                "vmull.s8   q4, d0, d4          \n"
                "vrev64.32  q3, q0              \n"
                "vmull.s8   q5, d1, d4          \n"
                "vmull.s8   q6, d6, d4          \n"
                "vmull.s8   q7, d7, d4          \n"
                "vrev64.32  q0, q1              \n"
                "vmlal.s8   q4, d2, d5          \n"
                "vmlal.s8   q5, d3, d5          \n"
                "vmlal.s8   q6, d0, d5          \n"
                "vmlal.s8   q7, d1, d5          \n"
                "vrev64.16  q2, q2              \n"
                "vpadal.s16 q8, q4              \n"
                "vrev64.32  q1, q3              \n"
                "vpadal.s16 q9, q5              \n"
                "vmull.s8   q4, d6, d4          \n"
                "vpadal.s16 q10, q6             \n"
                "vmull.s8   q5, d7, d4          \n"
                "vpadal.s16 q11, q7             \n"
                "vmull.s8   q6, d2, d4          \n"
                "vmull.s8   q7, d3, d4          \n"
                "vrev64.32  q3, q0              \n"
                "vmlal.s8   q4, d0, d5          \n"
                "vmlal.s8   q5, d1, d5          \n"
                "vmlal.s8   q6, d6, d5          \n"
                "vmlal.s8   q7, d7, d5          \n"
                "subs       r4, r4, #1          \n"
                "vpadal.s16 q14, q4             \n"
                "vpadal.s16 q15, q5             \n"
                "vpadal.s16 q12, q6             \n"
                "vpadal.s16 q13, q7             \n"
                "bne        2b                  \n"

                "3:                             \n"
                "and        r4, %8, #2          \n" // r4 = remain = max_kk & 2
                "cmp        r4, #0              \n"
                "beq        4f                  \n"

                // kk += 2 part
                "vld1.s8    {d0-d1}, [%1 :64]!  \n"
                "vld1.s8    {d4}, [%2]!         \n"
                "vrev64.32  q1, q0              \n"
                "vrev64.16  d5, d4              \n"
                "vmull.s8   q4, d0, d4          \n"
                "vmull.s8   q5, d1, d4          \n"
                "vmull.s8   q6, d2, d4          \n"
                "vmull.s8   q7, d3, d4          \n"
                "vpadal.s16 q8, q4              \n"
                "vpadal.s16 q9, q5              \n"
                "vpadal.s16 q10, q6             \n"
                "vpadal.s16 q11, q7             \n"
                "vmull.s8   q4, d0, d5          \n"
                "vmull.s8   q5, d1, d5          \n"
                "vmull.s8   q6, d2, d5          \n"
                "vmull.s8   q7, d3, d5          \n"
                "vpadal.s16 q12, q4             \n"
                "vpadal.s16 q13, q5             \n"
                "vpadal.s16 q14, q6             \n"
                "vpadal.s16 q15, q7             \n"

                "4:                             \n"
                "and        r4, %8, #1          \n" // r4 = remain = max_kk & 1
                "cmp        r4, #0              \n"
                "beq        5f                  \n"

                // kk += 1 part
                "vld1.s8    {d0}, [%1 :64]!     \n"
                "vld1.s32   {d2[]}, [%2]!       \n"
                "vrev64.16  d1, d0              \n"
                "vrev64.8   d3, d2              \n"
                "vext.s8    d1, d1, #4          \n"
                "vmull.s8   q4, d0, d2          \n"
                "vmull.s8   q5, d1, d2          \n"
                "vmull.s8   q6, d0, d3          \n"
                "vmull.s8   q7, d1, d3          \n"
                "vaddw.s16  q8, d8              \n"
                "vaddw.s16  q9, d9              \n"
                "vaddw.s16  q10, d10            \n"
                "vaddw.s16  q11, d11            \n"
                "vaddw.s16  q12, d12            \n"
                "vaddw.s16  q13, d13            \n"
                "vaddw.s16  q14, d14            \n"
                "vaddw.s16  q15, d15            \n"

                "5:                             \n"
                "cmp        %10, #0             \n"
                "beq        10f                 \n"

                // from
                //      a0 b1 c2 d3
                //      e0 f1 g2 h3
                //      c0 d1 a2 b3
                //      g0 h1 e2 f3
                //      a3 b2 c1 d0
                //      e3 f2 g1 h0
                //      c3 d2 a1 b0
                //      g3 h2 e1 f0
                // if out_elempack == 4 / 8
                "cmp        %11, #1             \n"
                "beq        8f                  \n"

                "vrev64.32  q12, q12            \n"
                "vrev64.32  q13, q13            \n"
                "vrev64.32  q14, q14            \n"
                "vrev64.32  q15, q15            \n"
                "vext.32    q12, q12, #2        \n"
                "vext.32    q13, q13, #2        \n"
                "vext.32    q14, q14, #2        \n"
                "vext.32    q15, q15, #2        \n"
                "vzip.32    q8, q14             \n"
                "vzip.32    q10, q12            \n"
                "vzip.32    q9, q15             \n"
                "vzip.32    q11, q13            \n"
                "vswp       d17, d20            \n"
                "vswp       d19, d22            \n"
                "vswp       d28, d25            \n"
                "vswp       d30, d27            \n"
                "vrev64.32  q10, q10            \n"
                "vrev64.32  q11, q11            \n"
                "vrev64.32  q14, q14            \n"
                "vrev64.32  q15, q15            \n"

                // if out_elempack == 8
                "cmp        %11, #8             \n"
                "bne        7f                  \n"

                // to
                //      a0 b0 c0 d0
                //      e0 f0 g0 h0
                //      a1 b1 c1 d1
                //      e1 f1 g1 h1
                //      a2 b2 c2 d2
                //      e2 f2 g2 h2
                //      a3 b3 c3 d3
                //      e3 f3 g3 h3
                "vstm       %3!, {d16-d23}      \n"
                "vstm       %3!, {d24-d31}      \n"
                "b          9f                  \n"

                // if out_elempack == 4
                "7:                             \n"
                // to
                //      a0 b0 c0 d0
                //      a1 b1 c1 d1
                //      a2 b2 c2 d2
                //      a3 b3 c3 d3
                //      e0 f0 g0 h0
                //      e1 f1 g1 h1
                //      e2 f2 g2 h2
                //      e3 f3 g3 h3
                "vswp       q9, q10             \n"
                "vswp       q13, q14            \n"
                "vswp       q10, q12            \n"
                "vswp       q11, q13            \n"

                "add        r4, %3, %12, lsl #4 \n"
                "vstm       %3!, {d16-d23}      \n"
                "vstm       r4, {d24-d31}       \n"
                "b          9f                  \n"

                // if out_elempack == 1
                "8:                             \n"
                // to
                //      a0 a1 a2 a3
                //      b0 b1 b2 b3
                //      c0 c1 c2 c3
                //      d0 d1 d2 d3
                //      e0 e1 e2 e3
                //      f0 f1 f2 f3
                //      g0 g1 g2 g3
                //      h0 h1 h2 h3
                "vext.32    q10, q10, #2        \n"
                "vext.32    q11, q11, #2        \n"
                "vext.32    q14, q14, #2        \n"
                "vext.32    q15, q15, #2        \n"
                "vzip.32    q8, q14             \n"
                "vzip.32    q10, q12            \n"
                "vzip.32    q9, q15             \n"
                "vzip.32    q11, q13            \n"
                "vswp       d17, d20            \n"
                "vswp       d19, d22            \n"
                "vswp       d28, d25            \n"
                "vswp       d30, d27            \n"
                "vrev64.32  q10, q10            \n"
                "vrev64.32  q11, q11            \n"
                "vrev64.32  q14, q14            \n"
                "vrev64.32  q15, q15            \n"

                "add        r4, %3, %12, lsl #2 \n"
                "vst1.s32   {d16-d17}, [%3]!    \n"
                "vst1.s32   {d20-d21}, [r4]     \n"
                "add        r4, r4, %12, lsl #2 \n"
                "vst1.s32   {d24-d25}, [r4]     \n"
                "add        r4, r4, %12, lsl #2 \n"
                "vst1.s32   {d28-d29}, [r4]     \n"
                "add        r4, r4, %12, lsl #2 \n"
                "vst1.s32   {d18-d19}, [r4]     \n"
                "add        r4, r4, %12, lsl #2 \n"
                "vst1.s32   {d22-d23}, [r4]     \n"
                "add        r4, r4, %12, lsl #2 \n"
                "vst1.s32   {d26-d27}, [r4]     \n"
                "add        r4, r4, %12, lsl #2 \n"
                "vst1.s32   {d30-d31}, [r4]     \n"

                "9:                             \n"
                "add        %0, %0, #128        \n"
                "b          11f                 \n"

                "10:                            \n"
                "vstm       %0!, {d16-d23}      \n"
                "vstm       %0!, {d24-d31}      \n"

                "11:                            \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(max_kk),       // %8
                "r"(k),            // %9
                "r"(k_end),        // %10
                "r"(out_elempack), // %11
                "r"(out_hstep)     // %12
                : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            int32x4_t _sum0;
            int32x4_t _sum1;
            int32x4_t _sum2;
            int32x4_t _sum3;
            int32x4_t _sum4;
            int32x4_t _sum5;
            int32x4_t _sum6;
            int32x4_t _sum7;

            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
                _sum2 = vdupq_n_s32(0);
                _sum3 = vdupq_n_s32(0);
                _sum4 = vdupq_n_s32(0);
                _sum5 = vdupq_n_s32(0);
                _sum6 = vdupq_n_s32(0);
                _sum7 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
                _sum2 = vld1q_s32(outptr + 8);
                _sum3 = vld1q_s32(outptr + 12);
                _sum4 = vld1q_s32(outptr + 16);
                _sum5 = vld1q_s32(outptr + 20);
                _sum6 = vld1q_s32(outptr + 24);
                _sum7 = vld1q_s32(outptr + 28);
            }

            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_MATMUL_INT8
                int32x4_t _s0 = vdupq_n_s32(0);
                int32x4_t _s1 = vdupq_n_s32(0);
                int32x4_t _s2 = vdupq_n_s32(0);
                int32x4_t _s3 = vdupq_n_s32(0);
                int32x4_t _s4 = vdupq_n_s32(0);
                int32x4_t _s5 = vdupq_n_s32(0);
                int32x4_t _s6 = vdupq_n_s32(0);
                int32x4_t _s7 = vdupq_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA0 = vld1q_s8(pA);
                    int8x16_t _pA1 = vld1q_s8(pA + 16);
                    int8x16_t _pA2 = vld1q_s8(pA + 32);
                    int8x16_t _pA3 = vld1q_s8(pA + 48);

                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);

#if __ARM_FEATURE_MATMUL_INT8
                    // aaaaaaaa bbbbbbbb ..... hhhhhhhh
                    // 00000000 11111111 22222222 33333333

                    _s0 = vmmlaq_s32(_s0, _pA0, _pB0);
                    _s1 = vmmlaq_s32(_s1, _pA1, _pB0);
                    _s2 = vmmlaq_s32(_s2, _pA0, _pB1);
                    _s3 = vmmlaq_s32(_s3, _pA1, _pB1);
                    _s4 = vmmlaq_s32(_s4, _pA2, _pB0);
                    _s5 = vmmlaq_s32(_s5, _pA3, _pB0);
                    _s6 = vmmlaq_s32(_s6, _pA2, _pB1);
                    _s7 = vmmlaq_s32(_s7, _pA3, _pB1);
#else  // __ARM_FEATURE_MATMUL_INT8
                    _sum0 = vdotq_laneq_s32(_sum0, _pA0, _pB0, 0);
                    _sum1 = vdotq_laneq_s32(_sum1, _pA0, _pB0, 1);
                    _sum2 = vdotq_laneq_s32(_sum2, _pA0, _pB0, 2);
                    _sum3 = vdotq_laneq_s32(_sum3, _pA0, _pB0, 3);
                    _sum4 = vdotq_laneq_s32(_sum4, _pA1, _pB0, 0);
                    _sum5 = vdotq_laneq_s32(_sum5, _pA1, _pB0, 1);
                    _sum6 = vdotq_laneq_s32(_sum6, _pA1, _pB0, 2);
                    _sum7 = vdotq_laneq_s32(_sum7, _pA1, _pB0, 3);

                    _sum0 = vdotq_laneq_s32(_sum0, _pA2, _pB1, 0);
                    _sum1 = vdotq_laneq_s32(_sum1, _pA2, _pB1, 1);
                    _sum2 = vdotq_laneq_s32(_sum2, _pA2, _pB1, 2);
                    _sum3 = vdotq_laneq_s32(_sum3, _pA2, _pB1, 3);
                    _sum4 = vdotq_laneq_s32(_sum4, _pA3, _pB1, 0);
                    _sum5 = vdotq_laneq_s32(_sum5, _pA3, _pB1, 1);
                    _sum6 = vdotq_laneq_s32(_sum6, _pA3, _pB1, 2);
                    _sum7 = vdotq_laneq_s32(_sum7, _pA3, _pB1, 3);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 64;
                    pB += 32;
                }
#if __ARM_FEATURE_MATMUL_INT8
                int32x4x2_t _ss0 = vuzpq_s32(_s0, _s1);
                int32x4x2_t _ss1 = vuzpq_s32(_s2, _s3);
                int32x4x2_t _ss2 = vuzpq_s32(_s4, _s5);
                int32x4x2_t _ss3 = vuzpq_s32(_s6, _s7);
                _sum0 = vaddq_s32(_sum0, _ss0.val[0]);
                _sum1 = vaddq_s32(_sum1, _ss0.val[1]);
                _sum2 = vaddq_s32(_sum2, _ss1.val[0]);
                _sum3 = vaddq_s32(_sum3, _ss1.val[1]);
                _sum4 = vaddq_s32(_sum4, _ss2.val[0]);
                _sum5 = vaddq_s32(_sum5, _ss2.val[1]);
                _sum6 = vaddq_s32(_sum6, _ss3.val[0]);
                _sum7 = vaddq_s32(_sum7, _ss3.val[1]);
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA1 = vld1q_s8(pA + 16);
                int8x16_t _pB = vld1q_s8(pB);

                // aaaa bbbb cccc dddd   eeee ffff gggg hhhh

                // 0000 1111 2222 3333

                _sum0 = vdotq_laneq_s32(_sum0, _pA0, _pB, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA0, _pB, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA0, _pB, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA0, _pB, 3);
                _sum4 = vdotq_laneq_s32(_sum4, _pA1, _pB, 0);
                _sum5 = vdotq_laneq_s32(_sum5, _pA1, _pB, 1);
                _sum6 = vdotq_laneq_s32(_sum6, _pA1, _pB, 2);
                _sum7 = vdotq_laneq_s32(_sum7, _pA1, _pB, 3);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA2 = vld1q_s8(pA + 16);
                int8x16_t _pB02 = vld1q_s8(pB);

                // aabbccdd eeffgghh

                // ccddaabb gghheeff

                int8x16_t _pA1 = vreinterpretq_s8_s32(vrev64q_s32(vreinterpretq_s32_s8(_pA0)));
                int8x16_t _pA3 = vreinterpretq_s8_s32(vrev64q_s32(vreinterpretq_s32_s8(_pA2)));

                // 00112233 44556677

                // 33221100 77665544

                int8x16_t _pB13 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB02)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB02));
                int16x8_t _s1 = vmull_s8(vget_high_s8(_pA0), vget_low_s8(_pB02));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_pA1), vget_low_s8(_pB02));
                int16x8_t _s3 = vmull_s8(vget_high_s8(_pA1), vget_low_s8(_pB02));
                int16x8_t _s4 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB13));
                int16x8_t _s5 = vmull_s8(vget_high_s8(_pA0), vget_low_s8(_pB13));
                int16x8_t _s6 = vmull_s8(vget_low_s8(_pA1), vget_low_s8(_pB13));
                int16x8_t _s7 = vmull_s8(vget_high_s8(_pA1), vget_low_s8(_pB13));

                _s0 = vmlal_s8(_s0, vget_low_s8(_pA2), vget_high_s8(_pB02));
                _s1 = vmlal_s8(_s1, vget_high_s8(_pA2), vget_high_s8(_pB02));
                _s2 = vmlal_s8(_s2, vget_low_s8(_pA3), vget_high_s8(_pB02));
                _s3 = vmlal_s8(_s3, vget_high_s8(_pA3), vget_high_s8(_pB02));
                _s4 = vmlal_s8(_s4, vget_low_s8(_pA2), vget_high_s8(_pB13));
                _s5 = vmlal_s8(_s5, vget_high_s8(_pA2), vget_high_s8(_pB13));
                _s6 = vmlal_s8(_s6, vget_low_s8(_pA3), vget_high_s8(_pB13));
                _s7 = vmlal_s8(_s7, vget_high_s8(_pA3), vget_high_s8(_pB13));

                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
#endif // __ARM_FEATURE_DOTPROD

                pA += 32;
                pB += 16;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

                // aabbccdd eeffgghh

                // 00112233
                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 0)));
                int16x8_t _s1 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 1)));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 2)));
                int16x8_t _s3 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 3)));
                int16x8_t _s4 = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 0)));
                int16x8_t _s5 = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 1)));
                int16x8_t _s6 = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 2)));
                int16x8_t _s7 = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 3)));

                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x8_t _pB0 = vld1_s8(pB);

                // aabbccdd eeffgghh

                // ccddaabb gghheeff

                int8x16_t _pA1 = vreinterpretq_s8_s32(vrev64q_s32(vreinterpretq_s32_s8(_pA0)));

                // 00112233

                // 33221100

                int8x8_t _pB1 = vreinterpret_s8_s16(vrev64_s16(vreinterpret_s16_s8(_pB0)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA0), _pB0);
                int16x8_t _s1 = vmull_s8(vget_high_s8(_pA0), _pB0);
                int16x8_t _s2 = vmull_s8(vget_low_s8(_pA1), _pB0);
                int16x8_t _s3 = vmull_s8(vget_high_s8(_pA1), _pB0);
                int16x8_t _s4 = vmull_s8(vget_low_s8(_pA0), _pB1);
                int16x8_t _s5 = vmull_s8(vget_high_s8(_pA0), _pB1);
                int16x8_t _s6 = vmull_s8(vget_low_s8(_pA1), _pB1);
                int16x8_t _s7 = vmull_s8(vget_high_s8(_pA1), _pB1);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
#endif // __ARM_FEATURE_DOTPROD

                pA += 16;
                pB += 8;
            }
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vld1_s8(pA);
                // int8x8_t _pB0 = vreinterpret_s32_s8(vld1_dup_s32(pB));

                // abcdefgh

                // 0123

                int16x8_t _s01 = vmull_s8(_pA0, vdup_n_s8(pB[0]));
                int16x8_t _s23 = vmull_s8(_pA0, vdup_n_s8(pB[1]));
                int16x8_t _s45 = vmull_s8(_pA0, vdup_n_s8(pB[2]));
                int16x8_t _s67 = vmull_s8(_pA0, vdup_n_s8(pB[3]));
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01));
                _sum1 = vaddw_s16(_sum1, vget_low_s16(_s23));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s45));
                _sum3 = vaddw_s16(_sum3, vget_low_s16(_s67));
                _sum4 = vaddw_s16(_sum4, vget_high_s16(_s01));
                _sum5 = vaddw_s16(_sum5, vget_high_s16(_s23));
                _sum6 = vaddw_s16(_sum6, vget_high_s16(_s45));
                _sum7 = vaddw_s16(_sum7, vget_high_s16(_s67));
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vld1_s8(pA);
                int8x8_t _pB0 = vreinterpret_s8_s32(vld1_dup_s32((const int*)pB));
                // int8x8_t _pB0 = vld1_s8(pB);
                // _pB0 = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_pB0), vreinterpret_s32_s8(_pB0)).val[0]);

                // abcdefgh  ->  cdabghef
                int8x8_t _pA1 = vreinterpret_s8_s16(vrev32_s16(vreinterpret_s16_s8(_pA0)));

                // 01230123  ->  32103210
                int8x8_t _pB1 = vrev64_s8(_pB0);

                int16x8_t _s01 = vmull_s8(_pA0, _pB0);
                int16x8_t _s23 = vmull_s8(_pA1, _pB0);
                int16x8_t _s45 = vmull_s8(_pA0, _pB1);
                int16x8_t _s67 = vmull_s8(_pA1, _pB1);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s01));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s23));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s23));
                _sum4 = vaddw_s16(_sum4, vget_low_s16(_s45));
                _sum5 = vaddw_s16(_sum5, vget_high_s16(_s45));
                _sum6 = vaddw_s16(_sum6, vget_low_s16(_s67));
                _sum7 = vaddw_s16(_sum7, vget_high_s16(_s67));
#endif // __ARM_FEATURE_DOTPROD

                pA += 8;
                pB += 4;
            }

            if (k_end)
            {
#if __ARM_FEATURE_DOTPROD
                // from
                //      a0 b0 c0 d0
                //      a1 b1 c1 d1
                //      a2 b2 c2 d2
                //      a3 b3 c3 d3
                //      e0 f0 g0 h0
                //      e1 f1 g1 h1
                //      e2 f2 g2 h2
                //      e3 f3 g3 h3
                if (out_elempack == 8)
                {
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum4);
                    vst1q_s32(outptr0 + 8, _sum1);
                    vst1q_s32(outptr0 + 12, _sum5);
                    vst1q_s32(outptr0 + 16, _sum2);
                    vst1q_s32(outptr0 + 20, _sum6);
                    vst1q_s32(outptr0 + 24, _sum3);
                    vst1q_s32(outptr0 + 28, _sum7);
                    outptr0 += 32;
                }
                if (out_elempack == 4)
                {
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + 8, _sum2);
                    vst1q_s32(outptr0 + 12, _sum3);
                    vst1q_s32(outptr0 + out_hstep * 4, _sum4);
                    vst1q_s32(outptr0 + out_hstep * 4 + 4, _sum5);
                    vst1q_s32(outptr0 + out_hstep * 4 + 8, _sum6);
                    vst1q_s32(outptr0 + out_hstep * 4 + 12, _sum7);
                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    // to
                    //      a0 a1 a2 a3
                    //      b0 b1 b2 b3
                    //      c0 c1 c2 c3
                    //      d0 d1 d2 d3
                    //      e0 e1 e2 e3
                    //      f0 f1 f2 f3
                    //      g0 g1 g2 g3
                    //      h0 h1 h2 h3
                    {
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sum1);
                        int32x4x2_t _t1 = vzipq_s32(_sum2, _sum3);
                        int32x4x2_t _t2 = vzipq_s32(_sum4, _sum5);
                        int32x4x2_t _t3 = vzipq_s32(_sum6, _sum7);
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                        _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                        _sum2 = vcombine_s32(vget_low_s32(_t0.val[1]), vget_low_s32(_t1.val[1]));
                        _sum3 = vcombine_s32(vget_high_s32(_t0.val[1]), vget_high_s32(_t1.val[1]));
                        _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                        _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                        _sum6 = vcombine_s32(vget_low_s32(_t2.val[1]), vget_low_s32(_t3.val[1]));
                        _sum7 = vcombine_s32(vget_high_s32(_t2.val[1]), vget_high_s32(_t3.val[1]));
                    }

                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + out_hstep, _sum1);
                    vst1q_s32(outptr0 + out_hstep * 2, _sum2);
                    vst1q_s32(outptr0 + out_hstep * 3, _sum3);
                    vst1q_s32(outptr0 + out_hstep * 4, _sum4);
                    vst1q_s32(outptr0 + out_hstep * 5, _sum5);
                    vst1q_s32(outptr0 + out_hstep * 6, _sum6);
                    vst1q_s32(outptr0 + out_hstep * 7, _sum7);
                    outptr0 += 4;
                }
#else  // __ARM_FEATURE_DOTPROD

                // from
                //      a0 b1 c2 d3
                //      e0 f1 g2 h3
                //      c0 d1 a2 b3
                //      g0 h1 e2 f3
                //      a3 b2 c1 d0
                //      e3 f2 g1 h0
                //      c3 d2 a1 b0
                //      g3 h2 e1 f0
                if (out_elempack == 8)
                {
                    // to
                    //      a0 b0 c0 d0
                    //      e0 f0 g0 h0
                    //      a1 b1 c1 d1
                    //      e1 f1 g1 h1
                    //      a2 b2 c2 d2
                    //      e2 f2 g2 h2
                    //      a3 b3 c3 d3
                    //      e3 f3 g3 h3
                    {
                        _sum4 = vrev64q_s32(_sum4);
                        _sum5 = vrev64q_s32(_sum5);
                        _sum6 = vrev64q_s32(_sum6);
                        _sum7 = vrev64q_s32(_sum7);
                        _sum4 = vextq_s32(_sum4, _sum4, 2);
                        _sum5 = vextq_s32(_sum5, _sum5, 2);
                        _sum6 = vextq_s32(_sum6, _sum6, 2);
                        _sum7 = vextq_s32(_sum7, _sum7, 2);
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                        int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                        int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                        int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                        _sum1 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                        _sum2 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                        _sum3 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                        _sum4 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                        _sum5 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                        _sum6 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                        _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                        _sum2 = vrev64q_s32(_sum2);
                        _sum3 = vrev64q_s32(_sum3);
                        _sum6 = vrev64q_s32(_sum6);
                        _sum7 = vrev64q_s32(_sum7);
                    }

                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + 8, _sum2);
                    vst1q_s32(outptr0 + 12, _sum3);
                    vst1q_s32(outptr0 + 16, _sum4);
                    vst1q_s32(outptr0 + 20, _sum5);
                    vst1q_s32(outptr0 + 24, _sum6);
                    vst1q_s32(outptr0 + 28, _sum7);
                    outptr0 += 32;
                }
                if (out_elempack == 4)
                {
                    // to
                    //      a0 b0 c0 d0
                    //      a1 b1 c1 d1
                    //      a2 b2 c2 d2
                    //      a3 b3 c3 d3
                    //      e0 f0 g0 h0
                    //      e1 f1 g1 h1
                    //      e2 f2 g2 h2
                    //      e3 f3 g3 h3
                    {
                        _sum4 = vrev64q_s32(_sum4);
                        _sum5 = vrev64q_s32(_sum5);
                        _sum6 = vrev64q_s32(_sum6);
                        _sum7 = vrev64q_s32(_sum7);
                        _sum4 = vextq_s32(_sum4, _sum4, 2);
                        _sum5 = vextq_s32(_sum5, _sum5, 2);
                        _sum6 = vextq_s32(_sum6, _sum6, 2);
                        _sum7 = vextq_s32(_sum7, _sum7, 2);
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                        int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                        int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                        int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                        _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                        _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                        _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                        _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                        _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                        _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                        _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                        _sum1 = vrev64q_s32(_sum1);
                        _sum3 = vrev64q_s32(_sum3);
                        _sum5 = vrev64q_s32(_sum5);
                        _sum7 = vrev64q_s32(_sum7);
                    }

                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + 8, _sum2);
                    vst1q_s32(outptr0 + 12, _sum3);
                    vst1q_s32(outptr0 + out_hstep * 4, _sum4);
                    vst1q_s32(outptr0 + out_hstep * 4 + 4, _sum5);
                    vst1q_s32(outptr0 + out_hstep * 4 + 8, _sum6);
                    vst1q_s32(outptr0 + out_hstep * 4 + 12, _sum7);
                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    // to
                    //      a0 a1 a2 a3
                    //      b0 b1 b2 b3
                    //      c0 c1 c2 c3
                    //      d0 d1 d2 d3
                    //      e0 e1 e2 e3
                    //      f0 f1 f2 f3
                    //      g0 g1 g2 g3
                    //      h0 h1 h2 h3
                    {
                        _sum2 = vextq_s32(_sum2, _sum2, 2);
                        _sum3 = vextq_s32(_sum3, _sum3, 2);
                        _sum6 = vextq_s32(_sum6, _sum6, 2);
                        _sum7 = vextq_s32(_sum7, _sum7, 2);
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                        int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                        int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                        int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                        _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                        _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                        _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                        _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                        _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                        _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                        _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                        _sum1 = vrev64q_s32(_sum1);
                        _sum3 = vrev64q_s32(_sum3);
                        _sum5 = vrev64q_s32(_sum5);
                        _sum7 = vrev64q_s32(_sum7);
                    }

                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + out_hstep, _sum1);
                    vst1q_s32(outptr0 + out_hstep * 2, _sum2);
                    vst1q_s32(outptr0 + out_hstep * 3, _sum3);
                    vst1q_s32(outptr0 + out_hstep * 4, _sum4);
                    vst1q_s32(outptr0 + out_hstep * 5, _sum5);
                    vst1q_s32(outptr0 + out_hstep * 6, _sum6);
                    vst1q_s32(outptr0 + out_hstep * 7, _sum7);
                    outptr0 += 4;
                }
#endif // __ARM_FEATURE_DOTPROD
            }
            else
            {
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                vst1q_s32(outptr + 8, _sum2);
                vst1q_s32(outptr + 12, _sum3);
                vst1q_s32(outptr + 16, _sum4);
                vst1q_s32(outptr + 20, _sum5);
                vst1q_s32(outptr + 24, _sum6);
                vst1q_s32(outptr + 28, _sum7);
            }

            outptr += 32;
#endif // NCNN_GNU_INLINE_ASM
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            const signed char* pA = pAT;

            int32x4_t _sum0;
            int32x4_t _sum1;
            int32x4_t _sum2;
            int32x4_t _sum3;

            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
                _sum2 = vdupq_n_s32(0);
                _sum3 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
                _sum2 = vld1q_s32(outptr + 8);
                _sum3 = vld1q_s32(outptr + 12);
            }

            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_MATMUL_INT8
                int32x4_t _s0 = vdupq_n_s32(0);
                int32x4_t _s1 = vdupq_n_s32(0);
                int32x4_t _s2 = vdupq_n_s32(0);
                int32x4_t _s3 = vdupq_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA0 = vld1q_s8(pA);
                    int8x16_t _pA1 = vld1q_s8(pA + 16);
                    int8x16_t _pA2 = vld1q_s8(pA + 32);
                    int8x16_t _pA3 = vld1q_s8(pA + 48);

                    int8x16_t _pB = vld1q_s8(pB);

#if __ARM_FEATURE_MATMUL_INT8
                    // aaaaaaaa bbbbbbbb ..... hhhhhhhh
                    // 00000000 11111111

                    _s0 = vmmlaq_s32(_s0, _pA0, _pB);
                    _s1 = vmmlaq_s32(_s1, _pA1, _pB);
                    _s2 = vmmlaq_s32(_s2, _pA2, _pB);
                    _s3 = vmmlaq_s32(_s3, _pA3, _pB);
#else  // __ARM_FEATURE_MATMUL_INT8
                    _sum0 = vdotq_laneq_s32(_sum0, _pA0, _pB, 0);
                    _sum1 = vdotq_laneq_s32(_sum1, _pA0, _pB, 1);
                    _sum2 = vdotq_laneq_s32(_sum2, _pA1, _pB, 0);
                    _sum3 = vdotq_laneq_s32(_sum3, _pA1, _pB, 1);

                    _sum0 = vdotq_laneq_s32(_sum0, _pA2, _pB, 2);
                    _sum1 = vdotq_laneq_s32(_sum1, _pA2, _pB, 3);
                    _sum2 = vdotq_laneq_s32(_sum2, _pA3, _pB, 2);
                    _sum3 = vdotq_laneq_s32(_sum3, _pA3, _pB, 3);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 64;
                    pB += 16;
                }
#if __ARM_FEATURE_MATMUL_INT8
                int32x4x2_t _ss0 = vuzpq_s32(_s0, _s1);
                int32x4x2_t _ss1 = vuzpq_s32(_s2, _s3);
                _sum0 = vaddq_s32(_sum0, _ss0.val[0]);
                _sum1 = vaddq_s32(_sum1, _ss0.val[1]);
                _sum2 = vaddq_s32(_sum2, _ss1.val[0]);
                _sum3 = vaddq_s32(_sum3, _ss1.val[1]);
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA1 = vld1q_s8(pA + 16);
                int8x8_t _pB = vld1_s8(pB);

                // aaaa bbbb cccc dddd eeee ffff gggg hhhh

                // 0000 1111

                _sum0 = vdotq_lane_s32(_sum0, _pA0, _pB, 0);
                _sum1 = vdotq_lane_s32(_sum1, _pA0, _pB, 1);
                _sum2 = vdotq_lane_s32(_sum2, _pA1, _pB, 0);
                _sum3 = vdotq_lane_s32(_sum3, _pA1, _pB, 1);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA2 = vld1q_s8(pA + 16);
                int8x8_t _pB = vld1_s8(pB);

                // aabbccdd eeffgghh   aabbccdd eeffgghh

                // 00112233 -> 00110011 22332233

                // 11001100 33223322

                int32x2x2_t _pBB = vzip_s32(vreinterpret_s32_s8(_pB), vreinterpret_s32_s8(_pB));
                int8x16_t _pB02 = vreinterpretq_s8_s32(vcombine_s32(_pBB.val[0], _pBB.val[1]));

                int8x16_t _pB13 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB02)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB02));
                int16x8_t _s1 = vmull_s8(vget_high_s8(_pA0), vget_low_s8(_pB02));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB13));
                int16x8_t _s3 = vmull_s8(vget_high_s8(_pA0), vget_low_s8(_pB13));
                _s0 = vmlal_s8(_s0, vget_low_s8(_pA2), vget_high_s8(_pB02));
                _s1 = vmlal_s8(_s1, vget_high_s8(_pA2), vget_high_s8(_pB02));
                _s2 = vmlal_s8(_s2, vget_low_s8(_pA2), vget_high_s8(_pB13));
                _s3 = vmlal_s8(_s3, vget_high_s8(_pA2), vget_high_s8(_pB13));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
#endif // __ARM_FEATURE_DOTPROD

                pA += 32;
                pB += 8;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                int16x4_t _pB = vreinterpret_s16_s32(vld1_dup_s32((const int*)pB));

                int16x4x2_t _pB01 = vuzp_s16(_pB, _pB);
                int8x8_t _pB0 = vreinterpret_s8_s16(_pB01.val[0]);
                int8x8_t _pB1 = vreinterpret_s8_s16(_pB01.val[1]);

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA), _pB0);
                int16x8_t _s1 = vmull_s8(vget_low_s8(_pA), _pB1);
                int16x8_t _s2 = vmull_s8(vget_high_s8(_pA), _pB0);
                int16x8_t _s3 = vmull_s8(vget_high_s8(_pA), _pB1);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                int8x8_t _pB0 = vreinterpret_s8_s32(vld1_dup_s32((const int*)pB));

                // aabbccdd eeffgghh

                // 00110011
                // 11001100

                int8x8_t _pB1 = vreinterpret_s8_s16(vrev64_s16(vreinterpret_s16_s8(_pB0)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA), _pB0);
                int16x8_t _s1 = vmull_s8(vget_high_s8(_pA), _pB0);
                int16x8_t _s2 = vmull_s8(vget_low_s8(_pA), _pB1);
                int16x8_t _s3 = vmull_s8(vget_high_s8(_pA), _pB1);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
#endif // __ARM_FEATURE_DOTPROD

                pA += 16;
                pB += 4;
            }
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB = vreinterpret_s8_s16(vld1_dup_s16((const short*)pB));

                int8x8x2_t _pB01 = vuzp_s8(_pB, _pB);

                int16x8_t _s0 = vmull_s8(_pA, _pB01.val[0]);
                int16x8_t _s1 = vmull_s8(_pA, _pB01.val[1]);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_low_s16(_s1));
                _sum2 = vaddw_s16(_sum2, vget_high_s16(_s0));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1));
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB0 = vreinterpret_s8_s16(vld1_dup_s16((const short*)pB));

                // abcdefgh

                // 01010101
                // 10101010
                int8x8_t _pB1 = vext_s8(_pB0, _pB0, 1);

                int16x8_t _s0 = vmull_s8(_pA, _pB0);
                int16x8_t _s1 = vmull_s8(_pA, _pB1);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s1));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1));
#endif // __ARM_FEATURE_DOTPROD

                pA += 8;
                pB += 2;
            }

            if (k_end)
            {
#if __ARM_FEATURE_DOTPROD
                // from
                //      a0 b0 c0 d0
                //      a1 b1 c1 d1
                //      e0 f0 g0 h0
                //      e1 f1 g1 h1
                if (out_elempack == 8)
                {
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum2);
                    vst1q_s32(outptr0 + 8, _sum1);
                    vst1q_s32(outptr0 + 12, _sum3);
                    outptr0 += 16;
                }
                if (out_elempack == 4)
                {
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + out_hstep * 4, _sum2);
                    vst1q_s32(outptr0 + out_hstep * 4 + 4, _sum3);
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    // to
                    //      a0 a1 b0 b1
                    //      c0 c1 d0 d1
                    //      e0 e1 f0 f1
                    //      g0 g1 h0 h1
                    {
                        int32x4x2_t _sum02 = vzipq_s32(_sum0, _sum1);
                        int32x4x2_t _sum13 = vzipq_s32(_sum2, _sum3);
                        _sum0 = _sum02.val[0];
                        _sum1 = _sum02.val[1];
                        _sum2 = _sum13.val[0];
                        _sum3 = _sum13.val[1];
                    }

                    vst1_s32(outptr0, vget_low_s32(_sum0));
                    vst1_s32(outptr0 + out_hstep, vget_high_s32(_sum0));
                    vst1_s32(outptr0 + out_hstep * 2, vget_low_s32(_sum1));
                    vst1_s32(outptr0 + out_hstep * 3, vget_high_s32(_sum1));
                    vst1_s32(outptr0 + out_hstep * 4, vget_low_s32(_sum2));
                    vst1_s32(outptr0 + out_hstep * 5, vget_high_s32(_sum2));
                    vst1_s32(outptr0 + out_hstep * 6, vget_low_s32(_sum3));
                    vst1_s32(outptr0 + out_hstep * 7, vget_high_s32(_sum3));
                    outptr0 += 2;
                }
#else  // __ARM_FEATURE_DOTPROD

                // from
                //      a0 b1 c0 d1
                //      e0 f1 g0 h1
                //      a1 b0 c1 d0
                //      e1 f0 g1 h0
                if (out_elempack == 8)
                {
                    // to
                    //      a0 b0 c0 d0
                    //      e0 f0 g0 h0
                    //      a1 b1 c1 d1
                    //      e1 f1 g1 h1
                    {
                        _sum2 = vrev64q_s32(_sum2);
                        _sum3 = vrev64q_s32(_sum3);
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sum2);
                        int32x4x2_t _t1 = vzipq_s32(_sum1, _sum3);
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                        _sum1 = vcombine_s32(vget_low_s32(_t1.val[0]), vget_low_s32(_t1.val[1]));
                        _sum2 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                        _sum3 = vcombine_s32(vget_high_s32(_t1.val[0]), vget_high_s32(_t1.val[1]));
                        _sum2 = vrev64q_s32(_sum2);
                        _sum3 = vrev64q_s32(_sum3);
                    }

                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + 8, _sum2);
                    vst1q_s32(outptr0 + 12, _sum3);
                    outptr0 += 16;
                }
                if (out_elempack == 4)
                {
                    // to
                    //      a0 b0 c0 d0
                    //      a1 b1 c1 d1
                    //      e0 f0 g0 h0
                    //      e1 f1 g1 h1
                    {
                        _sum2 = vrev64q_s32(_sum2);
                        _sum3 = vrev64q_s32(_sum3);
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sum2);
                        int32x4x2_t _t1 = vzipq_s32(_sum1, _sum3);
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                        _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                        _sum2 = vcombine_s32(vget_low_s32(_t1.val[0]), vget_low_s32(_t1.val[1]));
                        _sum3 = vcombine_s32(vget_high_s32(_t1.val[0]), vget_high_s32(_t1.val[1]));
                        _sum1 = vrev64q_s32(_sum1);
                        _sum3 = vrev64q_s32(_sum3);
                    }

                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + out_hstep * 4, _sum2);
                    vst1q_s32(outptr0 + out_hstep * 4 + 4, _sum3);
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    // to
                    //      a0 a1 c0 c1
                    //      b0 b1 d0 d1
                    //      e0 e1 g0 g1
                    //      f0 f1 h0 h1
                    {
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sum2);
                        int32x4x2_t _t1 = vzipq_s32(_sum1, _sum3);
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                        _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                        _sum2 = vcombine_s32(vget_low_s32(_t1.val[0]), vget_low_s32(_t1.val[1]));
                        _sum3 = vcombine_s32(vget_high_s32(_t1.val[0]), vget_high_s32(_t1.val[1]));
                        _sum1 = vrev64q_s32(_sum1);
                        _sum3 = vrev64q_s32(_sum3);
                    }

                    vst1_s32(outptr0, vget_low_s32(_sum0));
                    vst1_s32(outptr0 + out_hstep, vget_low_s32(_sum1));
                    vst1_s32(outptr0 + out_hstep * 2, vget_high_s32(_sum0));
                    vst1_s32(outptr0 + out_hstep * 3, vget_high_s32(_sum1));
                    vst1_s32(outptr0 + out_hstep * 4, vget_low_s32(_sum2));
                    vst1_s32(outptr0 + out_hstep * 5, vget_low_s32(_sum3));
                    vst1_s32(outptr0 + out_hstep * 6, vget_high_s32(_sum2));
                    vst1_s32(outptr0 + out_hstep * 7, vget_high_s32(_sum3));
                    outptr0 += 2;
                }
#endif // __ARM_FEATURE_DOTPROD
            }
            else
            {
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                vst1q_s32(outptr + 8, _sum2);
                vst1q_s32(outptr + 12, _sum3);
            }

            outptr += 16;
        }
        for (; jj < max_jj; jj += 1)
        {
            const signed char* pA = pAT;

            int32x4_t _sum0;
            int32x4_t _sum1;

            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
            }

            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_MATMUL_INT8
                int32x4_t _s0 = vdupq_n_s32(0);
                int32x4_t _s1 = vdupq_n_s32(0);
                int32x4_t _s2 = vdupq_n_s32(0);
                int32x4_t _s3 = vdupq_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA0 = vld1q_s8(pA);
                    int8x16_t _pA1 = vld1q_s8(pA + 16);
                    int8x16_t _pA2 = vld1q_s8(pA + 32);
                    int8x16_t _pA3 = vld1q_s8(pA + 48);

                    int8x8_t _pB = vld1_s8(pB);

#if __ARM_FEATURE_MATMUL_INT8
                    // aaaaaaaa bbbbbbbb ..... hhhhhhhh
                    // 00000000
                    int8x16_t _pBB = vcombine_s8(_pB, _pB);

                    _s0 = vdotq_s32(_s0, _pA0, _pBB);
                    _s1 = vdotq_s32(_s1, _pA1, _pBB);
                    _s2 = vdotq_s32(_s2, _pA2, _pBB);
                    _s3 = vdotq_s32(_s3, _pA3, _pBB);
#else  // __ARM_FEATURE_MATMUL_INT8
                    _sum0 = vdotq_lane_s32(_sum0, _pA0, _pB, 0);
                    _sum1 = vdotq_lane_s32(_sum1, _pA1, _pB, 0);
                    _sum0 = vdotq_lane_s32(_sum0, _pA2, _pB, 1);
                    _sum1 = vdotq_lane_s32(_sum1, _pA3, _pB, 1);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 64;
                    pB += 8;
                }
#if __ARM_FEATURE_MATMUL_INT8
                _sum0 = vaddq_s32(_sum0, vpaddq_s32(_s0, _s1));
                _sum1 = vaddq_s32(_sum1, vpaddq_s32(_s2, _s3));
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA1 = vld1q_s8(pA + 16);

                int8x8_t _pB = vreinterpret_s8_s32(vld1_dup_s32((const int*)pB));

                // aaaa bbbb cccc dddd eeee ffff gggg hhhh

                // 0000 0000

                _sum0 = vdotq_lane_s32(_sum0, _pA0, _pB, 0);
                _sum1 = vdotq_lane_s32(_sum1, _pA1, _pB, 0);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA2 = vld1q_s8(pA + 16);
                int8x8_t _pB0 = vreinterpret_s8_s16(vld1_dup_s16((const short*)pB));
                int8x8_t _pB1 = vreinterpret_s8_s16(vld1_dup_s16((const short*)(pB + 2)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA0), _pB0);
                int16x8_t _s1 = vmull_s8(vget_high_s8(_pA0), _pB0);
                _s0 = vmlal_s8(_s0, vget_low_s8(_pA2), _pB1);
                _s1 = vmlal_s8(_s1, vget_high_s8(_pA2), _pB1);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
#endif // __ARM_FEATURE_DOTPROD

                pA += 32;
                pB += 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                int8x16_t _pA = vld1q_s8(pA);
                int8x8_t _pB = vreinterpret_s8_s16(vld1_dup_s16((const short*)pB));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA), _pB);
                int16x8_t _s1 = vmull_s8(vget_high_s8(_pA), _pB);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);

                pA += 16;
                pB += 2;
            }
            for (; kk < max_kk; kk += 1)
            {
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB = vld1_dup_s8(pB);

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));

                pA += 8;
                pB += 1;
            }

            if (k_end)
            {
                if (out_elempack == 8)
                {
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    outptr0 += 8;
                }
                if (out_elempack == 4)
                {
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + out_hstep * 4, _sum1);
                    outptr0 += 4;
                }
                if (out_elempack == 1)
                {
                    outptr0[0] = vgetq_lane_s32(_sum0, 0);
                    outptr0[out_hstep] = vgetq_lane_s32(_sum0, 1);
                    outptr0[out_hstep * 2] = vgetq_lane_s32(_sum0, 2);
                    outptr0[out_hstep * 3] = vgetq_lane_s32(_sum0, 3);
                    outptr0[out_hstep * 4] = vgetq_lane_s32(_sum1, 0);
                    outptr0[out_hstep * 5] = vgetq_lane_s32(_sum1, 1);
                    outptr0[out_hstep * 6] = vgetq_lane_s32(_sum1, 2);
                    outptr0[out_hstep * 7] = vgetq_lane_s32(_sum1, 3);
                    outptr0++;
                }
            }
            else
            {
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
            }

            outptr += 8;
        }

        pAT += max_kk * 8;
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        int* outptr0 = (int*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        const signed char* pB = pBT;

        int jj = 0;
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            const signed char* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            asm volatile(
                "cmp    %w9, #0                     \n"
                "beq    0f                          \n"

                "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0] \n"
                "sub    %0, %0, #64                 \n"
                "b      1f                          \n"

                "0:                                 \n"
                "eor    v16.16b, v16.16b, v16.16b   \n"
                "eor    v17.16b, v17.16b, v17.16b   \n"
                "eor    v18.16b, v18.16b, v18.16b   \n"
                "eor    v19.16b, v19.16b, v19.16b   \n"
                "eor    v20.16b, v20.16b, v20.16b   \n"
                "eor    v21.16b, v21.16b, v21.16b   \n"
                "eor    v22.16b, v22.16b, v22.16b   \n"
                "eor    v23.16b, v23.16b, v23.16b   \n"

                "1:                                 \n"
#if __ARM_FEATURE_DOTPROD
                "lsr    w4, %w8, #3                 \n" // w4 = max_kk >> 3
                "cmp    w4, #0                      \n"
                "beq    101f                        \n"

#if __ARM_FEATURE_MATMUL_INT8
                "eor    v24.16b, v24.16b, v24.16b   \n"
                "eor    v25.16b, v25.16b, v25.16b   \n"
                "eor    v26.16b, v26.16b, v26.16b   \n"
                "eor    v27.16b, v27.16b, v27.16b   \n"
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v29.16b, v29.16b, v29.16b   \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"
                "eor    v31.16b, v31.16b, v31.16b   \n"
#endif // __ARM_FEATURE_MATMUL_INT8

                "2:                                 \n"
                "ld1    {v0.16b, v1.16b}, [%1], #32 \n"
                "ld1    {v2.16b, v3.16b, v4.16b, v5.16b}, [%2], #64 \n"

#if __ARM_FEATURE_MATMUL_INT8
                "smmla  v24.4s, v0.16b, v2.16b      \n"
                "smmla  v25.4s, v1.16b, v2.16b      \n"
                "smmla  v26.4s, v0.16b, v3.16b      \n"
                "smmla  v27.4s, v1.16b, v3.16b      \n"
                "subs   w4, w4, #1                  \n"
                "smmla  v28.4s, v0.16b, v4.16b      \n"
                "smmla  v29.4s, v1.16b, v4.16b      \n"
                "smmla  v30.4s, v0.16b, v5.16b      \n"
                "smmla  v31.4s, v1.16b, v5.16b      \n"
#else  // __ARM_FEATURE_MATMUL_INT8
                "sdot   v16.4s, v0.16b, v2.4b[0]    \n"
                "sdot   v17.4s, v0.16b, v2.4b[1]    \n"
                "sdot   v18.4s, v0.16b, v2.4b[2]    \n"
                "sdot   v19.4s, v0.16b, v2.4b[3]    \n"
                "sdot   v20.4s, v0.16b, v3.4b[0]    \n"
                "sdot   v21.4s, v0.16b, v3.4b[1]    \n"
                "sdot   v22.4s, v0.16b, v3.4b[2]    \n"
                "sdot   v23.4s, v0.16b, v3.4b[3]    \n"
                "subs   w4, w4, #1                  \n"
                "sdot   v16.4s, v1.16b, v4.4b[0]    \n"
                "sdot   v17.4s, v1.16b, v4.4b[1]    \n"
                "sdot   v18.4s, v1.16b, v4.4b[2]    \n"
                "sdot   v19.4s, v1.16b, v4.4b[3]    \n"
                "sdot   v20.4s, v1.16b, v5.4b[0]    \n"
                "sdot   v21.4s, v1.16b, v5.4b[1]    \n"
                "sdot   v22.4s, v1.16b, v5.4b[2]    \n"
                "sdot   v23.4s, v1.16b, v5.4b[3]    \n"
#endif // __ARM_FEATURE_MATMUL_INT8
                "bne    2b                          \n"

#if __ARM_FEATURE_MATMUL_INT8
                "uzp1   v0.4s, v24.4s, v25.4s       \n"
                "uzp2   v1.4s, v24.4s, v25.4s       \n"
                "uzp1   v2.4s, v26.4s, v27.4s       \n"
                "uzp2   v3.4s, v26.4s, v27.4s       \n"
                "uzp1   v4.4s, v28.4s, v29.4s       \n"
                "uzp2   v5.4s, v28.4s, v29.4s       \n"
                "uzp1   v6.4s, v30.4s, v31.4s       \n"
                "uzp2   v7.4s, v30.4s, v31.4s       \n"

                "add    v16.4s, v16.4s, v0.4s       \n"
                "add    v17.4s, v17.4s, v1.4s       \n"
                "add    v18.4s, v18.4s, v2.4s       \n"
                "add    v19.4s, v19.4s, v3.4s       \n"
                "add    v20.4s, v20.4s, v4.4s       \n"
                "add    v21.4s, v21.4s, v5.4s       \n"
                "add    v22.4s, v22.4s, v6.4s       \n"
                "add    v23.4s, v23.4s, v7.4s       \n"
#endif // __ARM_FEATURE_MATMUL_INT8

                "101:                               \n"
                "and    w4, %w8, #4                 \n" // w4 = remain = max_kk & 4
                "cmp    w4, #0                      \n"
                "beq    3f                          \n"

                // kk += 4 part
                "ld1    {v0.16b}, [%1], #16         \n"
                "ld1    {v2.16b, v3.16b}, [%2], #32 \n"
                "sdot   v16.4s, v0.16b, v2.4b[0]    \n"
                "sdot   v17.4s, v0.16b, v2.4b[1]    \n"
                "sdot   v18.4s, v0.16b, v2.4b[2]    \n"
                "sdot   v19.4s, v0.16b, v2.4b[3]    \n"
                "sdot   v20.4s, v0.16b, v3.4b[0]    \n"
                "sdot   v21.4s, v0.16b, v3.4b[1]    \n"
                "sdot   v22.4s, v0.16b, v3.4b[2]    \n"
                "sdot   v23.4s, v0.16b, v3.4b[3]    \n"
#else  // __ARM_FEATURE_DOTPROD
                "lsr    w4, %w8, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    3f                          \n"

                "2:                                 \n"
                "ld1    {v0.16b}, [%1], #16         \n"
                "ld1    {v4.16b, v5.16b}, [%2], #32 \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull2 v9.8h, v0.16b, v5.16b       \n"
                "rev64  v2.4s, v0.4s                \n"
                "smull  v10.8h, v2.8b, v4.8b        \n"
                "smull2 v11.8h, v2.16b, v5.16b      \n"
                "rev64  v6.8h, v4.8h                \n"
                "smull  v12.8h, v0.8b, v6.8b        \n"
                "smull  v14.8h, v2.8b, v6.8b        \n"
                "rev64  v7.8h, v5.8h                \n"
                "smull2 v13.8h, v0.16b, v7.16b      \n"
                "smull2 v15.8h, v2.16b, v7.16b      \n"
                "ext    v1.16b, v0.16b, v0.16b, #8  \n"
                "ext    v3.16b, v2.16b, v2.16b, #8  \n"
                "smlal  v8.8h, v1.8b, v5.8b         \n"
                "smlal2 v9.8h, v1.16b, v4.16b       \n"
                "smlal  v10.8h, v3.8b, v5.8b        \n"
                "smlal2 v11.8h, v3.16b, v4.16b      \n"
                "smlal  v12.8h, v1.8b, v7.8b        \n"
                "smlal  v14.8h, v3.8b, v7.8b        \n"
                "smlal2 v13.8h, v1.16b, v6.16b      \n"
                "smlal2 v15.8h, v3.16b, v6.16b      \n"
                "subs   w4, w4, #1                  \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
                "sadalp v20.4s, v12.8h              \n"
                "sadalp v22.4s, v14.8h              \n"
                "sadalp v21.4s, v13.8h              \n"
                "sadalp v23.4s, v15.8h              \n"
                "bne    2b                          \n"
#endif // __ARM_FEATURE_DOTPROD

                "3:                                 \n"
                "and    w4, %w8, #2                 \n" // w4 = remain = max_kk & 2
                "cmp    w4, #0                      \n"
                "beq    4f                          \n"

                // kk += 2 part
#if __ARM_FEATURE_DOTPROD
                "ld1    {v0.8b}, [%1], #8           \n"
                "ld1    {v1.16b}, [%2], #16         \n"
                "dup    v4.8h, v1.h[0]              \n"
                "dup    v5.8h, v1.h[1]              \n"
                "dup    v6.8h, v1.h[2]              \n"
                "dup    v7.8h, v1.h[3]              \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull  v9.8h, v0.8b, v5.8b         \n"
                "smull  v10.8h, v0.8b, v6.8b        \n"
                "smull  v11.8h, v0.8b, v7.8b        \n"
                "dup    v4.8h, v1.h[4]              \n"
                "dup    v5.8h, v1.h[5]              \n"
                "dup    v6.8h, v1.h[6]              \n"
                "dup    v7.8h, v1.h[7]              \n"
                "smull  v12.8h, v0.8b, v4.8b        \n"
                "smull  v13.8h, v0.8b, v5.8b        \n"
                "smull  v14.8h, v0.8b, v6.8b        \n"
                "smull  v15.8h, v0.8b, v7.8b        \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
                "sadalp v20.4s, v12.8h              \n"
                "sadalp v21.4s, v13.8h              \n"
                "sadalp v22.4s, v14.8h              \n"
                "sadalp v23.4s, v15.8h              \n"
#else  // __ARM_FEATURE_DOTPROD
                "ld1r   {v0.2d}, [%1]               \n"
                "add    %1, %1, #8                  \n"
                "ld1    {v2.16b}, [%2], #16         \n"
                "rev64  v1.4s, v0.4s                \n"
                "rev64  v3.8h, v2.8h                \n"
                "smull  v8.8h, v0.8b, v2.8b         \n"
                "smull2 v9.8h, v0.16b, v2.16b       \n"
                "smull  v10.8h, v1.8b, v2.8b        \n"
                "smull2 v11.8h, v1.16b, v2.16b      \n"
                "smull  v12.8h, v0.8b, v3.8b        \n"
                "smull2 v13.8h, v0.16b, v3.16b      \n"
                "smull  v14.8h, v1.8b, v3.8b        \n"
                "smull2 v15.8h, v1.16b, v3.16b      \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
                "sadalp v20.4s, v12.8h              \n"
                "sadalp v21.4s, v13.8h              \n"
                "sadalp v22.4s, v14.8h              \n"
                "sadalp v23.4s, v15.8h              \n"
#endif // __ARM_FEATURE_DOTPROD

                "4:                                 \n"
                "and    w4, %w8, #1                 \n" // w4 = remain = max_kk & 1
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // kk += 1 part
#if __ARM_FEATURE_DOTPROD
                "ld1r   {v0.2s}, [%1]               \n"
                "ld1    {v1.8b}, [%2], #8           \n"
                "add    %1, %1, #4                  \n"
                "dup    v8.8h, v1.h[0]              \n"
                "dup    v9.8h, v1.h[1]              \n"
                "dup    v10.8h, v1.h[2]             \n"
                "dup    v11.8h, v1.h[3]             \n"
                "uzp1   v2.8b, v8.8b, v9.8b         \n"
                "uzp2   v3.8b, v8.8b, v9.8b         \n"
                "uzp1   v4.8b, v10.8b, v11.8b       \n"
                "uzp2   v5.8b, v10.8b, v11.8b       \n"
                "smull  v8.8h, v0.8b, v2.8b         \n"
                "smull  v9.8h, v0.8b, v3.8b         \n"
                "smull  v10.8h, v0.8b, v4.8b        \n"
                "smull  v11.8h, v0.8b, v5.8b        \n"
                "saddw  v16.4s, v16.4s, v8.4h       \n"
                "saddw  v17.4s, v17.4s, v9.4h       \n"
                "saddw2 v18.4s, v18.4s, v8.8h       \n"
                "saddw2 v19.4s, v19.4s, v9.8h       \n"
                "saddw  v20.4s, v20.4s, v10.4h      \n"
                "saddw  v21.4s, v21.4s, v11.4h      \n"
                "saddw2 v22.4s, v22.4s, v10.8h      \n"
                "saddw2 v23.4s, v23.4s, v11.8h      \n"
#else  // __ARM_FEATURE_DOTPROD
                "ld1r   {v0.2s}, [%1]               \n"
                "ld1    {v2.8b}, [%2], #8           \n"
                "add    %1, %1, #4                  \n"
                "ext    v1.8b, v0.8b, v0.8b, #2     \n"
                "rev32  v3.8b, v2.8b                \n"
                "smull  v8.8h, v0.8b, v2.8b         \n"
                "smull  v9.8h, v1.8b, v2.8b         \n"
                "smull  v10.8h, v0.8b, v3.8b        \n"
                "smull  v11.8h, v1.8b, v3.8b        \n"
                "saddw  v16.4s, v16.4s, v8.4h       \n"
                "saddw2 v17.4s, v17.4s, v8.8h       \n"
                "saddw  v18.4s, v18.4s, v9.4h       \n"
                "saddw2 v19.4s, v19.4s, v9.8h       \n"
                "saddw  v20.4s, v20.4s, v10.4h      \n"
                "saddw2 v21.4s, v21.4s, v10.8h      \n"
                "saddw  v22.4s, v22.4s, v11.4h      \n"
                "saddw2 v23.4s, v23.4s, v11.8h      \n"
#endif // __ARM_FEATURE_DOTPROD

                "5:                                 \n"
                "cmp    %w10, #0                    \n"
                "beq    10f                         \n"

#if __ARM_FEATURE_DOTPROD
                // from
                //      a0 b0 c0 d0
                //      a1 b1 c1 d1
                //      a2 b2 c2 d2
                //      a3 b3 c3 d3
                //      a4 b4 c4 d4
                //      a5 b5 c5 d5
                //      a6 b6 c6 d6
                //      a7 b7 c7 d7
                // if out_elempack == 4
                "cmp    %w11, #1                    \n"
                "beq    8f                          \n"

                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%3], #64 \n"
                "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%3], #64 \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"
                // to
                //      a0 a1 a2 a3
                //      a4 a5 a6 a7
                //      b0 b1 b2 b3
                //      b4 b5 b6 b7
                //      c0 c1 c2 c3
                //      c4 c5 c6 c7
                //      d0 d1 d2 d3
                //      d4 d5 d6 d7
                "zip1   v0.4s, v16.4s, v17.4s       \n"
                "zip2   v1.4s, v16.4s, v17.4s       \n"
                "zip1   v2.4s, v18.4s, v19.4s       \n"
                "zip2   v3.4s, v18.4s, v19.4s       \n"
                "zip1   v4.4s, v20.4s, v21.4s       \n"
                "zip2   v5.4s, v20.4s, v21.4s       \n"
                "zip1   v6.4s, v22.4s, v23.4s       \n"
                "zip2   v7.4s, v22.4s, v23.4s       \n"
                "zip1   v16.2d, v0.2d, v2.2d        \n"
                "zip1   v17.2d, v4.2d, v6.2d        \n"
                "zip2   v18.2d, v0.2d, v2.2d        \n"
                "zip2   v19.2d, v4.2d, v6.2d        \n"
                "zip1   v20.2d, v1.2d, v3.2d        \n"
                "zip1   v21.2d, v5.2d, v7.2d        \n"
                "zip2   v22.2d, v1.2d, v3.2d        \n"
                "zip2   v23.2d, v5.2d, v7.2d        \n"

                "add    x4, %3, %12, lsl #2         \n"
                "st1    {v16.4s, v17.4s}, [%3], #32 \n"
                "st1    {v18.4s, v19.4s}, [x4]      \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v20.4s, v21.4s}, [x4]      \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v22.4s, v23.4s}, [x4]      \n"
#else  // __ARM_FEATURE_DOTPROD

                // from
                //      a0 b1 c2 d3
                //      a4 b5 c6 d7
                //      c0 d1 a2 b3
                //      c4 d5 a6 b7
                //      a3 b2 c1 d0
                //      a7 b6 c5 d4
                //      c3 d2 a1 b0
                //      c7 d6 a5 b4
                // if out_elempack == 4
                "cmp    %w11, #1                    \n"
                "beq    8f                          \n"

                // to
                //      a0 b0 c0 d0
                //      a1 b1 c1 d1
                //      a2 b2 c2 d2
                //      a3 b3 c3 d3
                //      a4 b4 c4 d4
                //      a5 b5 c5 d5
                //      a6 b6 c6 d6
                //      a7 b7 c7 d7
                "rev64  v20.4s, v20.4s              \n"
                "rev64  v21.4s, v21.4s              \n"
                "rev64  v22.4s, v22.4s              \n"
                "rev64  v23.4s, v23.4s              \n"
                "ext    v20.16b, v20.16b, v20.16b, #8 \n"
                "ext    v21.16b, v21.16b, v21.16b, #8 \n"
                "ext    v22.16b, v22.16b, v22.16b, #8 \n"
                "ext    v23.16b, v23.16b, v23.16b, #8 \n"
                "zip1   v0.4s, v16.4s, v22.4s       \n"
                "zip2   v1.4s, v16.4s, v22.4s       \n"
                "zip1   v2.4s, v18.4s, v20.4s       \n"
                "zip2   v3.4s, v18.4s, v20.4s       \n"
                "zip1   v4.4s, v17.4s, v23.4s       \n"
                "zip2   v5.4s, v17.4s, v23.4s       \n"
                "zip1   v6.4s, v19.4s, v21.4s       \n"
                "zip2   v7.4s, v19.4s, v21.4s       \n"
                "zip1   v16.2d, v0.2d, v2.2d        \n"
                "zip2   v17.2d, v0.2d, v2.2d        \n"
                "zip1   v18.2d, v3.2d, v1.2d        \n"
                "zip2   v19.2d, v3.2d, v1.2d        \n"
                "zip1   v20.2d, v4.2d, v6.2d        \n"
                "zip2   v21.2d, v4.2d, v6.2d        \n"
                "zip1   v22.2d, v7.2d, v5.2d        \n"
                "zip2   v23.2d, v7.2d, v5.2d        \n"
                "rev64  v17.4s, v17.4s              \n"
                "rev64  v19.4s, v19.4s              \n"
                "rev64  v21.4s, v21.4s              \n"
                "rev64  v23.4s, v23.4s              \n"

                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%3], #64 \n"
                "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%3], #64 \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"

                // to
                //      a0 a1 a2 a3
                //      a4 a5 a6 a7
                //      b0 b1 b2 b3
                //      b4 b5 b6 b7
                //      c0 c1 c2 c3
                //      c4 c5 c6 c7
                //      d0 d1 d2 d3
                //      d4 d5 d6 d7
                "ext    v18.16b, v18.16b, v18.16b, #8 \n"
                "ext    v19.16b, v19.16b, v19.16b, #8 \n"
                "ext    v22.16b, v22.16b, v22.16b, #8 \n"
                "ext    v23.16b, v23.16b, v23.16b, #8 \n"
                "zip1   v0.4s, v16.4s, v22.4s       \n"
                "zip2   v1.4s, v16.4s, v22.4s       \n"
                "zip1   v2.4s, v18.4s, v20.4s       \n"
                "zip2   v3.4s, v18.4s, v20.4s       \n"
                "zip1   v4.4s, v17.4s, v23.4s       \n"
                "zip2   v5.4s, v17.4s, v23.4s       \n"
                "zip1   v6.4s, v19.4s, v21.4s       \n"
                "zip2   v7.4s, v19.4s, v21.4s       \n"
                "zip1   v16.2d, v0.2d, v2.2d        \n"
                "zip1   v17.2d, v4.2d, v6.2d        \n"
                "zip2   v18.2d, v0.2d, v2.2d        \n"
                "zip2   v19.2d, v4.2d, v6.2d        \n"
                "zip1   v20.2d, v3.2d, v1.2d        \n"
                "zip1   v21.2d, v7.2d, v5.2d        \n"
                "zip2   v22.2d, v3.2d, v1.2d        \n"
                "zip2   v23.2d, v7.2d, v5.2d        \n"
                "rev64  v18.4s, v18.4s              \n"
                "rev64  v19.4s, v19.4s              \n"
                "rev64  v22.4s, v22.4s              \n"
                "rev64  v23.4s, v23.4s              \n"

                "add    x4, %3, %12, lsl #2         \n"
                "st1    {v16.4s, v17.4s}, [%3], #32 \n"
                "st1    {v18.4s, v19.4s}, [x4]      \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v20.4s, v21.4s}, [x4]      \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v22.4s, v23.4s}, [x4]      \n"
#endif // __ARM_FEATURE_DOTPROD

                "9:                                 \n"
                "add    %0, %0, #128                \n"
                "b      11f                         \n"

                "10:                                \n"
                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"

                "11:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(max_kk),       // %8
                "r"(k),            // %9
                "r"(k_end),        // %10
                "r"(out_elempack), // %11
                "r"(out_hstep)     // %12
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else // NCNN_GNU_INLINE_ASM
            int32x4_t _sum0;
            int32x4_t _sum1;
            int32x4_t _sum2;
            int32x4_t _sum3;
            int32x4_t _sum4;
            int32x4_t _sum5;
            int32x4_t _sum6;
            int32x4_t _sum7;

            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
                _sum2 = vdupq_n_s32(0);
                _sum3 = vdupq_n_s32(0);
                _sum4 = vdupq_n_s32(0);
                _sum5 = vdupq_n_s32(0);
                _sum6 = vdupq_n_s32(0);
                _sum7 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
                _sum2 = vld1q_s32(outptr + 8);
                _sum3 = vld1q_s32(outptr + 12);
                _sum4 = vld1q_s32(outptr + 16);
                _sum5 = vld1q_s32(outptr + 20);
                _sum6 = vld1q_s32(outptr + 24);
                _sum7 = vld1q_s32(outptr + 28);
            }

            int kk = 0;
#if __ARM_FEATURE_MATMUL_INT8
            {
                int32x4_t _sum00 = vdupq_n_s32(0);
                int32x4_t _sum01 = vdupq_n_s32(0);
                int32x4_t _sum10 = vdupq_n_s32(0);
                int32x4_t _sum11 = vdupq_n_s32(0);
                int32x4_t _sum20 = vdupq_n_s32(0);
                int32x4_t _sum21 = vdupq_n_s32(0);
                int32x4_t _sum30 = vdupq_n_s32(0);
                int32x4_t _sum31 = vdupq_n_s32(0);
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA0 = vld1q_s8(pA);
                    int8x16_t _pA1 = vld1q_s8(pA + 16);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);
                    int8x16_t _pB2 = vld1q_s8(pB + 32);
                    int8x16_t _pB3 = vld1q_s8(pB + 48);

                    // aaaaaaaa bbbbbbbb cccccccc dddddddd

                    // 00000000 11111111 22222222 33333333
                    // 44444444 55555555 66666666 77777777

                    _sum00 = vmmlaq_s32(_sum00, _pA0, _pB0);
                    _sum01 = vmmlaq_s32(_sum01, _pA1, _pB0);
                    _sum10 = vmmlaq_s32(_sum10, _pA0, _pB1);
                    _sum11 = vmmlaq_s32(_sum11, _pA1, _pB1);
                    _sum20 = vmmlaq_s32(_sum20, _pA0, _pB2);
                    _sum21 = vmmlaq_s32(_sum21, _pA1, _pB2);
                    _sum30 = vmmlaq_s32(_sum30, _pA0, _pB3);
                    _sum31 = vmmlaq_s32(_sum31, _pA1, _pB3);

                    // a0 a1 b0 b1
                    // c0 c1 d0 d1
                    // a2 a3 b2 b3
                    // c2 c3 d2 d3
                    // a4 a5 b4 b5
                    // c4 c5 d4 d5
                    // a6 a7 b6 b7
                    // c6 c7 d6 d7

                    pA += 32;
                    pB += 64;
                }
                int32x4x2_t _ss0 = vuzpq_s32(_sum00, _sum01);
                int32x4x2_t _ss1 = vuzpq_s32(_sum10, _sum11);
                int32x4x2_t _ss2 = vuzpq_s32(_sum20, _sum21);
                int32x4x2_t _ss3 = vuzpq_s32(_sum30, _sum31);
                _sum0 = vaddq_s32(_sum0, _ss0.val[0]);
                _sum1 = vaddq_s32(_sum1, _ss0.val[1]);
                _sum2 = vaddq_s32(_sum2, _ss1.val[0]);
                _sum3 = vaddq_s32(_sum3, _ss1.val[1]);
                _sum4 = vaddq_s32(_sum4, _ss2.val[0]);
                _sum5 = vaddq_s32(_sum5, _ss2.val[1]);
                _sum6 = vaddq_s32(_sum6, _ss3.val[0]);
                _sum7 = vaddq_s32(_sum7, _ss3.val[1]);
            }
#elif __ARM_FEATURE_DOTPROD
            for (; kk + 7 < max_kk; kk += 8)
            {
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA1 = vld1q_s8(pA + 16);
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB1 = vld1q_s8(pB + 16);
                int8x16_t _pB2 = vld1q_s8(pB + 32);
                int8x16_t _pB3 = vld1q_s8(pB + 48);

                _sum0 = vdotq_laneq_s32(_sum0, _pA0, _pB0, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA0, _pB0, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA0, _pB0, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA0, _pB0, 3);
                _sum4 = vdotq_laneq_s32(_sum4, _pA0, _pB1, 0);
                _sum5 = vdotq_laneq_s32(_sum5, _pA0, _pB1, 1);
                _sum6 = vdotq_laneq_s32(_sum6, _pA0, _pB1, 2);
                _sum7 = vdotq_laneq_s32(_sum7, _pA0, _pB1, 3);

                _sum0 = vdotq_laneq_s32(_sum0, _pA1, _pB2, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA1, _pB2, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA1, _pB2, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA1, _pB2, 3);
                _sum4 = vdotq_laneq_s32(_sum4, _pA1, _pB3, 0);
                _sum5 = vdotq_laneq_s32(_sum5, _pA1, _pB3, 1);
                _sum6 = vdotq_laneq_s32(_sum6, _pA1, _pB3, 2);
                _sum7 = vdotq_laneq_s32(_sum7, _pA1, _pB3, 3);

                pA += 32;
                pB += 64;
            }
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB1 = vld1q_s8(pB + 16);

                _sum0 = vdotq_laneq_s32(_sum0, _pA, _pB0, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA, _pB0, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA, _pB0, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA, _pB0, 3);
                _sum4 = vdotq_laneq_s32(_sum4, _pA, _pB1, 0);
                _sum5 = vdotq_laneq_s32(_sum5, _pA, _pB1, 1);
                _sum6 = vdotq_laneq_s32(_sum6, _pA, _pB1, 2);
                _sum7 = vdotq_laneq_s32(_sum7, _pA, _pB1, 3);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA02 = vld1q_s8(pA);
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB2 = vld1q_s8(pB + 16);

                int8x16_t _pA13 = vreinterpretq_s8_s32(vrev64q_s32(vreinterpretq_s32_s8(_pA02)));

                int8x16_t _pB1 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB0)));
                int8x16_t _pB3 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB2)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA02), vget_low_s8(_pB0));
                int16x8_t _s1 = vmull_s8(vget_low_s8(_pA02), vget_high_s8(_pB0));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_pA13), vget_low_s8(_pB0));
                int16x8_t _s3 = vmull_s8(vget_low_s8(_pA13), vget_high_s8(_pB0));
                int16x8_t _s4 = vmull_s8(vget_low_s8(_pA02), vget_low_s8(_pB1));
                int16x8_t _s5 = vmull_s8(vget_low_s8(_pA02), vget_high_s8(_pB1));
                int16x8_t _s6 = vmull_s8(vget_low_s8(_pA13), vget_low_s8(_pB1));
                int16x8_t _s7 = vmull_s8(vget_low_s8(_pA13), vget_high_s8(_pB1));

                _s0 = vmlal_s8(_s0, vget_high_s8(_pA02), vget_low_s8(_pB2));
                _s1 = vmlal_s8(_s1, vget_high_s8(_pA02), vget_high_s8(_pB2));
                _s2 = vmlal_s8(_s2, vget_high_s8(_pA13), vget_low_s8(_pB2));
                _s3 = vmlal_s8(_s3, vget_high_s8(_pA13), vget_high_s8(_pB2));
                _s4 = vmlal_s8(_s4, vget_high_s8(_pA02), vget_low_s8(_pB3));
                _s5 = vmlal_s8(_s5, vget_high_s8(_pA02), vget_high_s8(_pB3));
                _s6 = vmlal_s8(_s6, vget_high_s8(_pA13), vget_low_s8(_pB3));
                _s7 = vmlal_s8(_s7, vget_high_s8(_pA13), vget_high_s8(_pB3));

                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
#endif // __ARM_FEATURE_DOTPROD

                pA += 16;
                pB += 32;
            }
            // Two k-steps per iteration: reads 8 bytes from pA and 16 bytes from pB,
            // forms int16 products with vmull_s8 and folds adjacent product pairs
            // into the eight int32x4 accumulators _sum0.._sum7 with vpadalq_s16.
            for (; kk + 1 < max_kk; kk += 2)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vld1_s8(pA);
                int8x16_t _pB01 = vld1q_s8(pB);

                // aabbccdd

                // 00112233 44556677

                // Each 16-bit lane of B (one int8 pair) is broadcast across a whole
                // 64-bit vector, so _sN holds the products of every A pair with B pair N.
                int16x8_t _s0 = vmull_s8(_pA0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB01)), 0)));
                int16x8_t _s1 = vmull_s8(_pA0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB01)), 1)));
                int16x8_t _s2 = vmull_s8(_pA0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB01)), 2)));
                int16x8_t _s3 = vmull_s8(_pA0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB01)), 3)));
                int16x8_t _s4 = vmull_s8(_pA0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB01)), 0)));
                int16x8_t _s5 = vmull_s8(_pA0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB01)), 1)));
                int16x8_t _s6 = vmull_s8(_pA0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB01)), 2)));
                int16x8_t _s7 = vmull_s8(_pA0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB01)), 3)));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vld1_s8(pA);
                int8x16_t _pB0 = vld1q_s8(pB);

                // aabbccdd
                // ccddaabb

                // _pA1 is _pA0 with its two 32-bit halves swapped.
                int8x8_t _pA1 = vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(_pA0)));

                // 00112233 44556677
                // 33221100 77665544

                // _pB1 is _pB0 with the 16-bit lanes reversed inside each 64-bit half.
                int8x16_t _pB1 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB0)));

                // No broadcast here: the four A/B permutation combinations yield the
                // rotated accumulator layout that the k_end write-back un-shuffles.
                int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB0));
                int16x8_t _s1 = vmull_s8(_pA0, vget_high_s8(_pB0));
                int16x8_t _s2 = vmull_s8(_pA1, vget_low_s8(_pB0));
                int16x8_t _s3 = vmull_s8(_pA1, vget_high_s8(_pB0));
                int16x8_t _s4 = vmull_s8(_pA0, vget_low_s8(_pB1));
                int16x8_t _s5 = vmull_s8(_pA0, vget_high_s8(_pB1));
                int16x8_t _s6 = vmull_s8(_pA1, vget_low_s8(_pB1));
                int16x8_t _s7 = vmull_s8(_pA1, vget_high_s8(_pB1));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
#endif // __ARM_FEATURE_DOTPROD

                pA += 8;
                pB += 16;
            }
            // Single k-step tail: reads 4 bytes from pA (replicated into both 32-bit
            // halves by vld1_dup_s32) and 8 bytes from pB. Each int16 product is a
            // complete term, so it is widened and added with vaddw_s16 (no pairwise add).
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pAA = vreinterpret_s8_s32(vld1_dup_s32((const int*)pA));
                int8x8_t _pB = vld1_s8(pB);

                // abcdabcd
                // 01234567  ->  01010101 23232323 45454545 67676767
                int8x8_t _pB0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 0));
                int8x8_t _pB2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 1));
                int8x8_t _pB4 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 2));
                int8x8_t _pB6 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 3));

                // De-interleave the broadcast pairs: val[0] = 00002222 / 44446666,
                // val[1] = 11113333 / 55557777.
                int8x8x2_t _pB0123 = vuzp_s8(_pB0, _pB2);
                int8x8x2_t _pB4567 = vuzp_s8(_pB4, _pB6);

                int16x8_t _s02 = vmull_s8(_pAA, _pB0123.val[0]);
                int16x8_t _s13 = vmull_s8(_pAA, _pB0123.val[1]);
                int16x8_t _s46 = vmull_s8(_pAA, _pB4567.val[0]);
                int16x8_t _s57 = vmull_s8(_pAA, _pB4567.val[1]);
                // Low half of _sXY belongs to column X, high half to column Y.
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s02));
                _sum1 = vaddw_s16(_sum1, vget_low_s16(_s13));
                _sum2 = vaddw_s16(_sum2, vget_high_s16(_s02));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s13));
                _sum4 = vaddw_s16(_sum4, vget_low_s16(_s46));
                _sum5 = vaddw_s16(_sum5, vget_low_s16(_s57));
                _sum6 = vaddw_s16(_sum6, vget_high_s16(_s46));
                _sum7 = vaddw_s16(_sum7, vget_high_s16(_s57));
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vreinterpret_s8_s32(vld1_dup_s32((const int*)pA));
                int8x8_t _pB0 = vld1_s8(pB);

                // abcd abcd
                // cdab cdab

                // Rotate by two bytes; since both halves of _pA0 are identical this
                // swaps the byte pairs inside each 32-bit half.
                int8x8_t _pA1 = vext_s8(_pA0, _pA0, 2);

                // 0123 4567
                // 3210 7654

                int8x8_t _pB1 = vrev32_s8(_pB0);

                int16x8_t _s01 = vmull_s8(_pA0, _pB0);
                int16x8_t _s23 = vmull_s8(_pA1, _pB0);
                int16x8_t _s45 = vmull_s8(_pA0, _pB1);
                int16x8_t _s67 = vmull_s8(_pA1, _pB1);
                // Low half feeds the even-numbered accumulator, high half the odd one,
                // reproducing the rotated layout used by the two-k-step loop.
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s01));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s23));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s23));
                _sum4 = vaddw_s16(_sum4, vget_low_s16(_s45));
                _sum5 = vaddw_s16(_sum5, vget_high_s16(_s45));
                _sum6 = vaddw_s16(_sum6, vget_low_s16(_s67));
                _sum7 = vaddw_s16(_sum7, vget_high_s16(_s67));
#endif // __ARM_FEATURE_DOTPROD

                pA += 4;
                pB += 8;
            }

            // Write-back of the eight int32x4 accumulators.
            //  - k_end set: rearrange into the layout selected by out_elempack and
            //    store through outptr0 (only the values 4 and 1 are handled; any
            //    other out_elempack stores nothing here).
            //  - k_end clear: store the accumulators untouched (still in their
            //    internal layout) through outptr.
            //    NOTE(review): presumably these are reloaded on the next k pass, as
            //    the 4x4 path does when k != 0 -- the reload for this tile is not
            //    visible in this part of the file.
            if (k_end)
            {
#if __ARM_FEATURE_DOTPROD
                // from
                //      a0 b0 c0 d0
                //      a1 b1 c1 d1
                //      a2 b2 c2 d2
                //      a3 b3 c3 d3
                //      a4 b4 c4 d4
                //      a5 b5 c5 d5
                //      a6 b6 c6 d6
                //      a7 b7 c7 d7
                if (out_elempack == 4)
                {
                    // Accumulators are already in the target order: 32 contiguous int32.
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + 8, _sum2);
                    vst1q_s32(outptr0 + 12, _sum3);
                    vst1q_s32(outptr0 + 16, _sum4);
                    vst1q_s32(outptr0 + 20, _sum5);
                    vst1q_s32(outptr0 + 24, _sum6);
                    vst1q_s32(outptr0 + 28, _sum7);
                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    // to
                    //      a0 a1 a2 a3
                    //      a4 a5 a6 a7
                    //      b0 b1 b2 b3
                    //      b4 b5 b6 b7
                    //      c0 c1 c2 c3
                    //      c4 c5 c6 c7
                    //      d0 d1 d2 d3
                    //      d4 d5 d6 d7
                    {
                        // 4x4 transposes built from a 32-bit zip followed by
                        // recombining the 64-bit halves.
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sum1);
                        int32x4x2_t _t1 = vzipq_s32(_sum2, _sum3);
                        int32x4x2_t _t2 = vzipq_s32(_sum4, _sum5);
                        int32x4x2_t _t3 = vzipq_s32(_sum6, _sum7);
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                        _sum1 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                        _sum2 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                        _sum3 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                        _sum4 = vcombine_s32(vget_low_s32(_t0.val[1]), vget_low_s32(_t1.val[1]));
                        _sum5 = vcombine_s32(vget_low_s32(_t2.val[1]), vget_low_s32(_t3.val[1]));
                        _sum6 = vcombine_s32(vget_high_s32(_t0.val[1]), vget_high_s32(_t1.val[1]));
                        _sum7 = vcombine_s32(vget_high_s32(_t2.val[1]), vget_high_s32(_t3.val[1]));
                    }

                    // Four output rows, out_hstep int32 elements apart, 8 values each.
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + out_hstep, _sum2);
                    vst1q_s32(outptr0 + out_hstep + 4, _sum3);
                    vst1q_s32(outptr0 + out_hstep * 2, _sum4);
                    vst1q_s32(outptr0 + out_hstep * 2 + 4, _sum5);
                    vst1q_s32(outptr0 + out_hstep * 3, _sum6);
                    vst1q_s32(outptr0 + out_hstep * 3 + 4, _sum7);
                    outptr0 += 8;
                }
#else  // __ARM_FEATURE_DOTPROD

                // from
                //      a0 b1 c2 d3
                //      a4 b5 c6 d7
                //      c0 d1 a2 b3
                //      c4 d5 a6 b7
                //      a3 b2 c1 d0
                //      a7 b6 c5 d4
                //      c3 d2 a1 b0
                //      c7 d6 a5 b4
                if (out_elempack == 4)
                {
                    // to
                    //      a0 b0 c0 d0
                    //      a1 b1 c1 d1
                    //      a2 b2 c2 d2
                    //      a3 b3 c3 d3
                    //      a4 b4 c4 d4
                    //      a5 b5 c5 d5
                    //      a6 b6 c6 d6
                    //      a7 b7 c7 d7
                    {
                        // Fully reverse _sum4.._sum7 (rev64 + rotate by two lanes) so
                        // the zips below pair elements of the same column.
                        _sum4 = vrev64q_s32(_sum4);
                        _sum5 = vrev64q_s32(_sum5);
                        _sum6 = vrev64q_s32(_sum6);
                        _sum7 = vrev64q_s32(_sum7);
                        _sum4 = vextq_s32(_sum4, _sum4, 2);
                        _sum5 = vextq_s32(_sum5, _sum5, 2);
                        _sum6 = vextq_s32(_sum6, _sum6, 2);
                        _sum7 = vextq_s32(_sum7, _sum7, 2);
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                        int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                        int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                        int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                        _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                        _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                        _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                        _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                        _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                        _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                        _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                        // Odd columns come out with neighbouring lanes swapped; fix up.
                        _sum1 = vrev64q_s32(_sum1);
                        _sum3 = vrev64q_s32(_sum3);
                        _sum5 = vrev64q_s32(_sum5);
                        _sum7 = vrev64q_s32(_sum7);
                    }

                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + 8, _sum2);
                    vst1q_s32(outptr0 + 12, _sum3);
                    vst1q_s32(outptr0 + 16, _sum4);
                    vst1q_s32(outptr0 + 20, _sum5);
                    vst1q_s32(outptr0 + 24, _sum6);
                    vst1q_s32(outptr0 + 28, _sum7);
                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    // to
                    //      a0 a1 a2 a3
                    //      a4 a5 a6 a7
                    //      b0 b1 b2 b3
                    //      b4 b5 b6 b7
                    //      c0 c1 c2 c3
                    //      c4 c5 c6 c7
                    //      d0 d1 d2 d3
                    //      d4 d5 d6 d7
                    {
                        // Swap the 64-bit halves of the "c d a b" ordered accumulators
                        // so every zip pairs elements of the same row.
                        _sum2 = vextq_s32(_sum2, _sum2, 2);
                        _sum3 = vextq_s32(_sum3, _sum3, 2);
                        _sum6 = vextq_s32(_sum6, _sum6, 2);
                        _sum7 = vextq_s32(_sum7, _sum7, 2);
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                        int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                        int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                        int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                        _sum1 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                        _sum2 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                        _sum3 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                        _sum4 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                        _sum5 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                        _sum6 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                        _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                        // Rows b and d come out with neighbouring lanes swapped; fix up.
                        _sum2 = vrev64q_s32(_sum2);
                        _sum3 = vrev64q_s32(_sum3);
                        _sum6 = vrev64q_s32(_sum6);
                        _sum7 = vrev64q_s32(_sum7);
                    }

                    // Four output rows, out_hstep int32 elements apart, 8 values each.
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + out_hstep, _sum2);
                    vst1q_s32(outptr0 + out_hstep + 4, _sum3);
                    vst1q_s32(outptr0 + out_hstep * 2, _sum4);
                    vst1q_s32(outptr0 + out_hstep * 2 + 4, _sum5);
                    vst1q_s32(outptr0 + out_hstep * 3, _sum6);
                    vst1q_s32(outptr0 + out_hstep * 3 + 4, _sum7);
                    outptr0 += 8;
                }
#endif // __ARM_FEATURE_DOTPROD
            }
            else
            {
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                vst1q_s32(outptr + 8, _sum2);
                vst1q_s32(outptr + 12, _sum3);
                vst1q_s32(outptr + 16, _sum4);
                vst1q_s32(outptr + 20, _sum5);
                vst1q_s32(outptr + 24, _sum6);
                vst1q_s32(outptr + 28, _sum7);
            }

            outptr += 32;
#endif // NCNN_GNU_INLINE_ASM
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            const signed char* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            // AArch64 kernel for one 4x4 int32 tile; accumulators live in v16-v19.
            //
            // Operands:
            //   %0  outptr       int32 buffer the partial sums are reloaded from (k != 0)
            //                    and stored to (k_end == 0); always advanced by 64 bytes
            //   %1  pA           int8 A data, post-incremented
            //   %2  pB           int8 B data, post-incremented
            //   %3  outptr0      final output, written only when k_end != 0
            //   %8  max_kk       number of k-steps to process
            //   %9  k            0 selects zero-initialised accumulators
            //   %10 k_end        non-zero selects the final write-back
            //   %11 out_elempack 1 selects the row-strided store, otherwise packed-by-4
            //   %12 out_hstep    row stride in int32 elements (scaled by lsl #2)
            // w4/x4 is scratch (loop counter, then row address).
            //
            // Labels: 0 zero-init, 1 start, 2 main k loop, 101 four-k remainder
            // (dotprod only), 3 two-k remainder, 4 one-k remainder, 5 write-back,
            // 8 out_elempack == 1 store, 9 advance outptr, 10 partial-sum store, 11 end.
            asm volatile(
                "cmp    %w9, #0                     \n"
                "beq    0f                          \n"

                "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0] \n"
                "b      1f                          \n"

                "0:                                 \n"
                "eor    v16.16b, v16.16b, v16.16b   \n"
                "eor    v17.16b, v17.16b, v17.16b   \n"
                "eor    v18.16b, v18.16b, v18.16b   \n"
                "eor    v19.16b, v19.16b, v19.16b   \n"

                "1:                                 \n"
#if __ARM_FEATURE_DOTPROD
                "lsr    w4, %w8, #3                 \n" // w4 = max_kk >> 3
                "cmp    w4, #0                      \n"
                "beq    101f                        \n"

#if __ARM_FEATURE_MATMUL_INT8
                // smmla accumulates 2x2 sub-blocks in v24-v27; they are merged
                // into v16-v19 once the loop finishes.
                "eor    v24.16b, v24.16b, v24.16b   \n"
                "eor    v25.16b, v25.16b, v25.16b   \n"
                "eor    v26.16b, v26.16b, v26.16b   \n"
                "eor    v27.16b, v27.16b, v27.16b   \n"
#endif // __ARM_FEATURE_MATMUL_INT8

                // main loop: 8 k-steps, 32 bytes of A and 32 bytes of B per iteration
                "2:                                 \n"
                "ld1    {v0.16b, v1.16b}, [%1], #32 \n"
                "ld1    {v4.16b, v5.16b}, [%2], #32 \n"

#if __ARM_FEATURE_MATMUL_INT8
                "smmla  v24.4s, v0.16b, v4.16b      \n"
                "smmla  v25.4s, v1.16b, v4.16b      \n"
                "subs   w4, w4, #1                  \n"
                "smmla  v26.4s, v0.16b, v5.16b      \n"
                "smmla  v27.4s, v1.16b, v5.16b      \n"
#else  // __ARM_FEATURE_MATMUL_INT8
                "sdot   v16.4s, v0.16b, v4.4b[0]    \n"
                "sdot   v17.4s, v0.16b, v4.4b[1]    \n"
                "sdot   v18.4s, v0.16b, v4.4b[2]    \n"
                "sdot   v19.4s, v0.16b, v4.4b[3]    \n"
                "subs   w4, w4, #1                  \n"
                "sdot   v16.4s, v1.16b, v5.4b[0]    \n"
                "sdot   v17.4s, v1.16b, v5.4b[1]    \n"
                "sdot   v18.4s, v1.16b, v5.4b[2]    \n"
                "sdot   v19.4s, v1.16b, v5.4b[3]    \n"
#endif // __ARM_FEATURE_MATMUL_INT8
                "bne    2b                          \n"

#if __ARM_FEATURE_MATMUL_INT8
                // de-interleave the 2x2 sub-blocks into per-column vectors and
                // fold them into the running sums
                "uzp1   v0.4s, v24.4s, v25.4s       \n"
                "uzp2   v1.4s, v24.4s, v25.4s       \n"
                "uzp1   v2.4s, v26.4s, v27.4s       \n"
                "uzp2   v3.4s, v26.4s, v27.4s       \n"

                "add    v16.4s, v16.4s, v0.4s       \n"
                "add    v17.4s, v17.4s, v1.4s       \n"
                "add    v18.4s, v18.4s, v2.4s       \n"
                "add    v19.4s, v19.4s, v3.4s       \n"
#endif // __ARM_FEATURE_MATMUL_INT8

                "101:                               \n"
                "and    w4, %w8, #4                 \n" // w4 = remain = max_kk & 4
                "cmp    w4, #0                      \n"
                "beq    3f                          \n"

                // kk += 4 part
                "ld1    {v0.16b}, [%1], #16         \n"
                "ld1    {v2.16b}, [%2], #16         \n"
                "sdot   v16.4s, v0.16b, v2.4b[0]    \n"
                "sdot   v17.4s, v0.16b, v2.4b[1]    \n"
                "sdot   v18.4s, v0.16b, v2.4b[2]    \n"
                "sdot   v19.4s, v0.16b, v2.4b[3]    \n"
#else  // __ARM_FEATURE_DOTPROD
                "lsr    w4, %w8, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    3f                          \n"

                // main loop: 4 k-steps, 16 bytes of A and 16 bytes of B per iteration;
                // v1/v5 are lane-permuted copies of A/B that produce the rotated
                // accumulator layout documented at the write-back below
                "2:                                 \n"
                "ld1    {v0.16b}, [%1], #16         \n"
                "ld1    {v4.16b}, [%2], #16         \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "rev64  v1.4s, v0.4s                \n"
                "smull  v9.8h, v1.8b, v4.8b         \n"
                "rev64  v5.8h, v4.8h                \n"
                "smull  v10.8h, v0.8b, v5.8b        \n"
                "smull  v11.8h, v1.8b, v5.8b        \n"
                "smlal2 v8.8h, v0.16b, v4.16b       \n"
                "smlal2 v9.8h, v1.16b, v4.16b       \n"
                "smlal2 v10.8h, v0.16b, v5.16b      \n"
                "smlal2 v11.8h, v1.16b, v5.16b      \n"
                "subs   w4, w4, #1                  \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
                "bne    2b                          \n"
#endif // __ARM_FEATURE_DOTPROD

                "3:                                 \n"
                "and    w4, %w8, #2                 \n" // w4 = remain = max_kk & 2
                "cmp    w4, #0                      \n"
                "beq    4f                          \n"

                // kk += 2 part
#if __ARM_FEATURE_DOTPROD
                "ld1    {v0.8b}, [%1], #8           \n"
                "ld1    {v1.8b}, [%2], #8           \n"
                "dup    v4.4h, v1.h[0]              \n"
                "dup    v5.4h, v1.h[1]              \n"
                "dup    v6.4h, v1.h[2]              \n"
                "dup    v7.4h, v1.h[3]              \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull  v9.8h, v0.8b, v5.8b         \n"
                "smull  v10.8h, v0.8b, v6.8b        \n"
                "smull  v11.8h, v0.8b, v7.8b        \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
#else  // __ARM_FEATURE_DOTPROD
                "ld1    {v0.8b}, [%1], #8           \n"
                "ld1    {v2.8b}, [%2], #8           \n"
                "ext    v1.8b, v0.8b, v0.8b, #4     \n"
                "rev64  v3.4h, v2.4h                \n"
                "smull  v8.8h, v0.8b, v2.8b         \n"
                "smull  v9.8h, v1.8b, v2.8b         \n"
                "smull  v10.8h, v0.8b, v3.8b        \n"
                "smull  v11.8h, v1.8b, v3.8b        \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
#endif // __ARM_FEATURE_DOTPROD

                "4:                                 \n"
                "and    w4, %w8, #1                 \n" // w4 = remain = max_kk & 1
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // kk += 1 part
#if __ARM_FEATURE_DOTPROD
                "ld1r   {v0.2s}, [%1]               \n"
                "ld1r   {v1.2s}, [%2]               \n"
                "add    %1, %1, #4                  \n"
                "add    %2, %2, #4                  \n"
                "zip1   v1.8b, v1.8b, v1.8b         \n"
                "zip1   v2.4h, v1.4h, v1.4h         \n"
                "zip2   v3.4h, v1.4h, v1.4h         \n"
                "smull  v8.8h, v0.8b, v2.8b         \n"
                "smull  v9.8h, v0.8b, v3.8b         \n"
                "saddw  v16.4s, v16.4s, v8.4h       \n"
                "saddw2 v17.4s, v17.4s, v8.8h       \n"
                "saddw  v18.4s, v18.4s, v9.4h       \n"
                "saddw2 v19.4s, v19.4s, v9.8h       \n"
#else  // __ARM_FEATURE_DOTPROD
                // NOTE(review): this loads 8 bytes from pA although the pointer only
                // advances by 4 and only the low 32 bits reach the multiplies (the
                // zip1 below keeps lane 0 of v0 and v1) -- confirm the A buffer has
                // at least 4 readable bytes past its last k-step.
                "ld1    {v0.8b}, [%1]               \n"
                "ld1r   {v4.2s}, [%2]               \n"
                "add    %1, %1, #4                  \n"
                "add    %2, %2, #4                  \n"
                "rev32  v1.4h, v0.4h                \n"
                "zip1   v0.2s, v0.2s, v1.2s         \n"
                "rev32  v5.8b, v4.8b                \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull  v9.8h, v0.8b, v5.8b         \n"
                "saddw  v16.4s, v16.4s, v8.4h       \n"
                "saddw2 v17.4s, v17.4s, v8.8h       \n"
                "saddw  v18.4s, v18.4s, v9.4h       \n"
                "saddw2 v19.4s, v19.4s, v9.8h       \n"
#endif // __ARM_FEATURE_DOTPROD

                // write-back: k_end == 0 jumps to 10 and stores the partial sums as-is
                "5:                                 \n"
                "cmp    %w10, #0                    \n"
                "beq    10f                         \n"

#if __ARM_FEATURE_DOTPROD
                // from
                //      a0 b0 c0 d0
                //      a1 b1 c1 d1
                //      a2 b2 c2 d2
                //      a3 b3 c3 d3
                // if out_elempack == 4
                "cmp    %w11, #1                    \n"
                "beq    8f                          \n"

                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%3], #64 \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"
                // to
                //      a0 a1 a2 a3
                //      b0 b1 b2 b3
                //      c0 c1 c2 c3
                //      d0 d1 d2 d3
                "zip1   v0.4s, v16.4s, v17.4s       \n"
                "zip2   v1.4s, v16.4s, v17.4s       \n"
                "zip1   v2.4s, v18.4s, v19.4s       \n"
                "zip2   v3.4s, v18.4s, v19.4s       \n"
                "zip1   v16.2d, v0.2d, v2.2d        \n"
                "zip2   v17.2d, v0.2d, v2.2d        \n"
                "zip1   v18.2d, v1.2d, v3.2d        \n"
                "zip2   v19.2d, v1.2d, v3.2d        \n"

                // one 4-element row per store; x4 walks rows out_hstep * 4 bytes
                // apart while %3 itself only advances by 16 bytes
                "add    x4, %3, %12, lsl #2         \n"
                "st1    {v16.4s}, [%3], #16         \n"
                "st1    {v17.4s}, [x4]              \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v18.4s}, [x4]              \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v19.4s}, [x4]              \n"
#else  // __ARM_FEATURE_DOTPROD

                // from
                //      a0 b1 c2 d3
                //      c0 d1 a2 b3
                //      a3 b2 c1 d0
                //      c3 d2 a1 b0
                // if out_elempack == 4
                "cmp    %w11, #1                    \n"
                "beq    8f                          \n"

                // to
                //      a0 b0 c0 d0
                //      a1 b1 c1 d1
                //      a2 b2 c2 d2
                //      a3 b3 c3 d3
                "rev64  v18.4s, v18.4s              \n"
                "rev64  v19.4s, v19.4s              \n"
                "ext    v18.16b, v18.16b, v18.16b, #8 \n"
                "ext    v19.16b, v19.16b, v19.16b, #8 \n"
                "zip1   v0.4s, v16.4s, v19.4s       \n"
                "zip2   v1.4s, v16.4s, v19.4s       \n"
                "zip1   v2.4s, v17.4s, v18.4s       \n"
                "zip2   v3.4s, v17.4s, v18.4s       \n"
                "zip1   v16.2d, v0.2d, v2.2d        \n"
                "zip2   v17.2d, v0.2d, v2.2d        \n"
                "zip1   v18.2d, v3.2d, v1.2d        \n"
                "zip2   v19.2d, v3.2d, v1.2d        \n"
                "rev64  v17.4s, v17.4s              \n"
                "rev64  v19.4s, v19.4s              \n"

                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%3], #64 \n"
                "b      9f                          \n"

                // if out_elempack == 1
                "8:                                 \n"

                // to
                //      a0 a1 a2 a3
                //      b0 b1 b2 b3
                //      c0 c1 c2 c3
                //      d0 d1 d2 d3
                "ext    v17.16b, v17.16b, v17.16b, #8 \n"
                "ext    v19.16b, v19.16b, v19.16b, #8 \n"
                "zip1   v0.4s, v16.4s, v19.4s       \n"
                "zip2   v1.4s, v16.4s, v19.4s       \n"
                "zip1   v2.4s, v17.4s, v18.4s       \n"
                "zip2   v3.4s, v17.4s, v18.4s       \n"
                "zip1   v16.2d, v0.2d, v2.2d        \n"
                "zip2   v17.2d, v0.2d, v2.2d        \n"
                "zip1   v18.2d, v3.2d, v1.2d        \n"
                "zip2   v19.2d, v3.2d, v1.2d        \n"
                "rev64  v17.4s, v17.4s              \n"
                "rev64  v19.4s, v19.4s              \n"

                // one 4-element row per store; x4 walks rows out_hstep * 4 bytes
                // apart while %3 itself only advances by 16 bytes
                "add    x4, %3, %12, lsl #2         \n"
                "st1    {v16.4s}, [%3], #16         \n"
                "st1    {v17.4s}, [x4]              \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v18.4s}, [x4]              \n"
                "add    x4, x4, %12, lsl #2         \n"
                "st1    {v19.4s}, [x4]              \n"
#endif // __ARM_FEATURE_DOTPROD

                // final output was written through %3; still step outptr past this
                // tile's 16 int32 slots
                "9:                                 \n"
                "add    %0, %0, #64                 \n"
                "b      11f                         \n"

                "10:                                \n"
                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"

                "11:                                \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(max_kk),       // %8
                "r"(k),            // %9
                "r"(k_end),        // %10
                "r"(out_elempack), // %11
                "r"(out_hstep)     // %12
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
            // ARMv7 NEON kernel for one 4x4 int32 tile; accumulators live in q8-q11
            // (d16-d23).
            //
            // Operands:
            //   %0  outptr       int32 buffer the partial sums are reloaded from (k != 0)
            //                    and stored to (k_end == 0); always advanced by 64 bytes
            //   %1  pA           int8 A data, post-incremented
            //   %2  pB           int8 B data, post-incremented
            //   %3  outptr0      final output, written only when k_end != 0
            //   %8  max_kk       number of k-steps to process
            //   %9  k            0 selects zero-initialised accumulators
            //   %10 k_end        non-zero selects the final write-back
            //   %11 out_elempack 1 selects the row-strided store, otherwise packed-by-4
            //   %12 out_hstep    row stride in int32 elements (scaled by lsl #2)
            // r4 is scratch (loop counter, then row address).
            //
            // Labels: 0 zero-init, 1 start, 2 main k loop, 3 two-k remainder,
            // 4 one-k remainder, 5 write-back, 8 out_elempack == 1 store,
            // 9 advance outptr, 10 partial-sum store, 11 end.
            asm volatile(
                "cmp        %9, #0              \n"
                "beq        0f                  \n"

                "vldm       %0, {d16-d23}       \n"
                "b          1f                  \n"

                "0:                             \n"
                "veor       q8, q8              \n"
                "veor       q9, q9              \n"
                "veor       q10, q10            \n"
                "veor       q11, q11            \n"

                "1:                             \n"
                "lsr        r4, %8, #2          \n" // r4 = max_kk >> 2
                "cmp        r4, #0              \n"
                "beq        3f                  \n"

                // main loop: 4 k-steps, 16 bytes of A and 16 bytes of B per iteration;
                // q1/q3 are lane-permuted copies of A/B that produce the rotated
                // accumulator layout documented at the write-back below
                ".align 4                       \n"
                "2:                             \n"
                "pld        [%1, #256]          \n"
                "vld1.s8    {d0-d1}, [%1 :64]!  \n"
                "pld        [%2, #128]          \n"
                "vld1.s8    {d4-d5}, [%2]!      \n"
                "vrev64.32  q1, q0              \n"
                "vmull.s8   q4, d0, d4          \n"
                "vrev64.16  q3, q2              \n"
                "vmull.s8   q5, d2, d4          \n"
                "vmull.s8   q6, d0, d6          \n"
                "vmull.s8   q7, d2, d6          \n"
                "vmlal.s8   q4, d1, d5          \n"
                "vmlal.s8   q5, d3, d5          \n"
                "vmlal.s8   q6, d1, d7          \n"
                "vmlal.s8   q7, d3, d7          \n"
                "subs       r4, r4, #1          \n"
                "vpadal.s16 q8, q4              \n"
                "vpadal.s16 q9, q5              \n"
                "vpadal.s16 q10, q6             \n"
                "vpadal.s16 q11, q7             \n"
                "bne        2b                  \n"

                "3:                             \n"
                "and        r4, %8, #2          \n" // r4 = remain = max_kk & 2
                "cmp        r4, #0              \n"
                "beq        4f                  \n"

                // kk += 2 part
                "vld1.s8    {d0}, [%1 :64]!     \n"
                "vld1.s8    {d4}, [%2]!         \n"
                "vext.8     d1, d0, d0, #4      \n"
                "vrev64.16  d5, d4              \n"
                "vmull.s8   q4, d0, d4          \n"
                "vmull.s8   q5, d1, d4          \n"
                "vmull.s8   q6, d0, d5          \n"
                "vmull.s8   q7, d1, d5          \n"
                "vpadal.s16 q8, q4              \n"
                "vpadal.s16 q9, q5              \n"
                "vpadal.s16 q10, q6             \n"
                "vpadal.s16 q11, q7             \n"

                "4:                             \n"
                "and        r4, %8, #1          \n" // r4 = remain = max_kk & 1
                "cmp        r4, #0              \n"
                "beq        5f                  \n"

                // kk += 1 part
                // only lane 0 of d0 is loaded; the vzip.32 below keeps lane 0 of d0
                // and of d1, so the stale upper lane never reaches the multiplies
                "vld1.s32   {d0[0]}, [%1]!      \n"
                "vld1.s32   {d2[]}, [%2]!       \n"
                "vrev32.16  d1, d0              \n"
                "vrev32.s8  d3, d2              \n"
                "vzip.32    d0, d1              \n"
                "vmull.s8   q4, d0, d2          \n"
                "vmull.s8   q5, d0, d3          \n"
                "vaddw.s16  q8, d8              \n"
                "vaddw.s16  q9, d9              \n"
                "vaddw.s16  q10, d10            \n"
                "vaddw.s16  q11, d11            \n"

                // write-back: k_end == 0 jumps to 10 and stores the partial sums as-is
                "5:                             \n"
                "cmp        %10, #0             \n"
                "beq        10f                 \n"

                // from
                //      a0 b1 c2 d3
                //      c0 d1 a2 b3
                //      a3 b2 c1 d0
                //      c3 d2 a1 b0
                // if out_elempack == 4
                "cmp        %11, #1             \n"
                "beq        8f                  \n"

                // to
                //      a0 b0 c0 d0
                //      a1 b1 c1 d1
                //      a2 b2 c2 d2
                //      a3 b3 c3 d3
                "vrev64.32  q10, q10            \n"
                "vrev64.32  q11, q11            \n"
                "vext.32    q10, q10, #2        \n"
                "vext.32    q11, q11, #2        \n"
                "vzip.32    q8, q11             \n"
                "vzip.32    q9, q10             \n"
                "vswp       d17, d18            \n"
                "vswp       d21, d22            \n"
                "vrev64.32  q9, q9              \n"
                "vrev64.32  q11, q11            \n"

                "vstm       %3!, {d16-d23}      \n"
                "b          9f                  \n"

                // if out_elempack == 1
                "8:                             \n"
                // to
                //      a0 a1 a2 a3
                //      b0 b1 b2 b3
                //      c0 c1 c2 c3
                //      d0 d1 d2 d3
                "vext.32    q9, q9, #2          \n"
                "vext.32    q11, q11, #2        \n"
                "vzip.32    q8, q11             \n"
                "vzip.32    q9, q10             \n"
                "vswp       d17, d18            \n"
                "vswp       d21, d22            \n"
                "vrev64.32  q9, q9              \n"
                "vrev64.32  q11, q11            \n"

                // one 4-element row per store; r4 walks rows out_hstep * 4 bytes
                // apart while %3 itself only advances by 16 bytes
                "add        r4, %3, %12, lsl #2 \n"
                "vst1.s32   {d16-d17}, [%3]!    \n"
                "vst1.s32   {d18-d19}, [r4]     \n"
                "add        r4, r4, %12, lsl #2 \n"
                "vst1.s32   {d20-d21}, [r4]     \n"
                "add        r4, r4, %12, lsl #2 \n"
                "vst1.s32   {d22-d23}, [r4]     \n"

                // final output was written through %3; still step outptr past this
                // tile's 16 int32 slots
                "9:                             \n"
                "add        %0, %0, #64         \n"
                "b          11f                 \n"

                "10:                            \n"
                "vstm       %0!, {d16-d23}      \n"

                "11:                            \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB),     // %2
                "=r"(outptr0) // %3
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "3"(outptr0),
                "r"(max_kk),       // %8
                "r"(k),            // %9
                "r"(k_end),        // %10
                "r"(out_elempack), // %11
                "r"(out_hstep)     // %12
                : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            int32x4_t _sum0;
            int32x4_t _sum1;
            int32x4_t _sum2;
            int32x4_t _sum3;

            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
                _sum2 = vdupq_n_s32(0);
                _sum3 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
                _sum2 = vld1q_s32(outptr + 8);
                _sum3 = vld1q_s32(outptr + 12);
            }

            int kk = 0;
#if __ARM_FEATURE_MATMUL_INT8
            {
                int32x4_t _sum00 = vdupq_n_s32(0);
                int32x4_t _sum01 = vdupq_n_s32(0);
                int32x4_t _sum10 = vdupq_n_s32(0);
                int32x4_t _sum11 = vdupq_n_s32(0);
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA0 = vld1q_s8(pA);
                    int8x16_t _pA1 = vld1q_s8(pA + 16);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);

                    // aaaaaaaa bbbbbbbb cccccccc dddddddd

                    // 00000000 11111111 22222222 33333333

                    _sum00 = vmmlaq_s32(_sum00, _pA0, _pB0);
                    _sum01 = vmmlaq_s32(_sum01, _pA1, _pB0);
                    _sum10 = vmmlaq_s32(_sum10, _pA0, _pB1);
                    _sum11 = vmmlaq_s32(_sum11, _pA1, _pB1);

                    // a0 a1 b0 b1
                    // c0 c1 d0 d1
                    // a2 a3 b2 b3
                    // c2 c3 d2 d3

                    pA += 32;
                    pB += 32;
                }
                int32x4x2_t _ss0 = vuzpq_s32(_sum00, _sum01);
                int32x4x2_t _ss1 = vuzpq_s32(_sum10, _sum11);
                _sum0 = vaddq_s32(_sum0, _ss0.val[0]);
                _sum1 = vaddq_s32(_sum1, _ss0.val[1]);
                _sum2 = vaddq_s32(_sum2, _ss1.val[0]);
                _sum3 = vaddq_s32(_sum3, _ss1.val[1]);
            }
#elif __ARM_FEATURE_DOTPROD
            for (; kk + 7 < max_kk; kk += 8)
            {
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA1 = vld1q_s8(pA + 16);
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB1 = vld1q_s8(pB + 16);

                _sum0 = vdotq_laneq_s32(_sum0, _pA0, _pB0, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA0, _pB0, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA0, _pB0, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA0, _pB0, 3);

                _sum0 = vdotq_laneq_s32(_sum0, _pA1, _pB1, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA1, _pB1, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA1, _pB1, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA1, _pB1, 3);

                pA += 32;
                pB += 32;
            }
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                int8x16_t _pB = vld1q_s8(pB);

                _sum0 = vdotq_laneq_s32(_sum0, _pA, _pB, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA, _pB, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA, _pB, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA, _pB, 3);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA02 = vld1q_s8(pA);
                int8x16_t _pB02 = vld1q_s8(pB);

                // aabbccdd eeffgghh
                // ccddaabb gghheeff

                int8x16_t _pA13 = vreinterpretq_s8_s32(vrev64q_s32(vreinterpretq_s32_s8(_pA02)));

                // 00112233 44556677
                // 33221100 77665544

                int8x16_t _pB13 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB02)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA02), vget_low_s8(_pB02));
                int16x8_t _s1 = vmull_s8(vget_low_s8(_pA13), vget_low_s8(_pB02));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_pA02), vget_low_s8(_pB13));
                int16x8_t _s3 = vmull_s8(vget_low_s8(_pA13), vget_low_s8(_pB13));

                _s0 = vmlal_s8(_s0, vget_high_s8(_pA02), vget_high_s8(_pB02));
                _s1 = vmlal_s8(_s1, vget_high_s8(_pA13), vget_high_s8(_pB02));
                _s2 = vmlal_s8(_s2, vget_high_s8(_pA02), vget_high_s8(_pB13));
                _s3 = vmlal_s8(_s3, vget_high_s8(_pA13), vget_high_s8(_pB13));

                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
#endif // __ARM_FEATURE_DOTPROD

                pA += 16;
                pB += 16;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

                int16x8_t _s0 = vmull_s8(_pA, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 0)));
                int16x8_t _s1 = vmull_s8(_pA, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 1)));
                int16x8_t _s2 = vmull_s8(_pA, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 2)));
                int16x8_t _s3 = vmull_s8(_pA, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 3)));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vld1_s8(pA);
                int8x8_t _pB0 = vld1_s8(pB);

                // aabbccdd
                // ccddaabb

                int8x8_t _pA1 = vext_s8(_pA0, _pA0, 4);

                // 00112233
                // 33221100

                int8x8_t _pB1 = vreinterpret_s8_s16(vrev64_s16(vreinterpret_s16_s8(_pB0)));

                int16x8_t _s0 = vmull_s8(_pA0, _pB0);
                int16x8_t _s1 = vmull_s8(_pA1, _pB0);
                int16x8_t _s2 = vmull_s8(_pA0, _pB1);
                int16x8_t _s3 = vmull_s8(_pA1, _pB1);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
#endif // __ARM_FEATURE_DOTPROD

                pA += 8;
                pB += 8;
            }
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vreinterpret_s8_s32(vld1_dup_s32((const int*)pA));
                int8x8_t _pB = vreinterpret_s8_s32(vld1_dup_s32((const int*)pB));

                _pB = vzip_s8(_pB, _pB).val[0];
                int16x4x2_t _pB0123 = vzip_s16(vreinterpret_s16_s8(_pB), vreinterpret_s16_s8(_pB));

                int16x8_t _s01 = vmull_s8(_pA, vreinterpret_s8_s16(_pB0123.val[0]));
                int16x8_t _s23 = vmull_s8(_pA, vreinterpret_s8_s16(_pB0123.val[1]));
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s01));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s23));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s23));
#else  // __ARM_FEATURE_DOTPROD

                int8x8_t _pA0 = vld1_s8(pA);
                int8x8_t _pB0 = vreinterpret_s8_s32(vld1_dup_s32((const int*)pB));

                // abcd.... -> cdab.... -> abcdcdab
                int8x8_t _pA1 = vreinterpret_s8_s16(vrev32_s16(vreinterpret_s16_s8(_pA0)));
                int8x8_t _pA01 = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_pA0), vreinterpret_s32_s8(_pA1)).val[0]);

                // 01230123 -> 32103210
                int8x8_t _pB1 = vrev32_s8(_pB0);

                int16x8_t _s01 = vmull_s8(_pA01, _pB0);
                int16x8_t _s23 = vmull_s8(_pA01, _pB1);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s01));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s23));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s23));
#endif // __ARM_FEATURE_DOTPROD

                pA += 4;
                pB += 4;
            }

            if (k_end)
            {
#if __ARM_FEATURE_DOTPROD
                // from
                //      a0 b0 c0 d0
                //      a1 b1 c1 d1
                //      a2 b2 c2 d2
                //      a3 b3 c3 d3
                if (out_elempack == 4)
                {
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + 8, _sum2);
                    vst1q_s32(outptr0 + 12, _sum3);
                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    // to
                    //      a0 a1 a2 a3
                    //      b0 b1 b2 b3
                    //      c0 c1 c2 c3
                    //      d0 d1 d2 d3
                    {
                        int32x4x2_t _r01 = vzipq_s32(_sum0, _sum1);
                        int32x4x2_t _r23 = vzipq_s32(_sum2, _sum3);
                        _sum0 = vcombine_s32(vget_low_s32(_r01.val[0]), vget_low_s32(_r23.val[0]));
                        _sum1 = vcombine_s32(vget_high_s32(_r01.val[0]), vget_high_s32(_r23.val[0]));
                        _sum2 = vcombine_s32(vget_low_s32(_r01.val[1]), vget_low_s32(_r23.val[1]));
                        _sum3 = vcombine_s32(vget_high_s32(_r01.val[1]), vget_high_s32(_r23.val[1]));
                    }

                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + out_hstep, _sum1);
                    vst1q_s32(outptr0 + out_hstep * 2, _sum2);
                    vst1q_s32(outptr0 + out_hstep * 3, _sum3);
                    outptr0 += 4;
                }
#else  // __ARM_FEATURE_DOTPROD

                // from
                //      a0 b1 c2 d3
                //      c0 d1 a2 b3
                //      a3 b2 c1 d0
                //      c3 d2 a1 b0
                if (out_elempack == 4)
                {
                    // to
                    //      a0 b0 c0 d0
                    //      a1 b1 c1 d1
                    //      a2 b2 c2 d2
                    //      a3 b3 c3 d3
                    {
                        _sum2 = vrev64q_s32(_sum2);
                        _sum3 = vrev64q_s32(_sum3);
                        _sum2 = vextq_s32(_sum2, _sum2, 2);
                        _sum3 = vextq_s32(_sum3, _sum3, 2);
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sum3);
                        int32x4x2_t _t1 = vzipq_s32(_sum1, _sum2);
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                        _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                        _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                        _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                        _sum1 = vrev64q_s32(_sum1);
                        _sum3 = vrev64q_s32(_sum3);
                    }

                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + 8, _sum2);
                    vst1q_s32(outptr0 + 12, _sum3);
                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    // to
                    //      a0 a1 a2 a3
                    //      b0 b1 b2 b3
                    //      c0 c1 c2 c3
                    //      d0 d1 d2 d3
                    {
                        _sum1 = vextq_s32(_sum1, _sum1, 2);
                        _sum3 = vextq_s32(_sum3, _sum3, 2);
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sum3);
                        int32x4x2_t _t1 = vzipq_s32(_sum1, _sum2);
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                        _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                        _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                        _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                        _sum1 = vrev64q_s32(_sum1);
                        _sum3 = vrev64q_s32(_sum3);
                    }

                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + out_hstep, _sum1);
                    vst1q_s32(outptr0 + out_hstep * 2, _sum2);
                    vst1q_s32(outptr0 + out_hstep * 3, _sum3);
                    outptr0 += 4;
                }
#endif // __ARM_FEATURE_DOTPROD
            }
            else
            {
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                vst1q_s32(outptr + 8, _sum2);
                vst1q_s32(outptr + 12, _sum3);
            }

            outptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // Column-pair tail of the 4-row tile: each iteration accumulates a
        // 4 (rows of A) x 2 (columns of B) block of int32 sums over max_kk
        // reduction steps. pA restarts at pAT for every column pair while pB
        // keeps advancing through the B data.
        // Notation used in the layout comments below: a/b/c/d are the four
        // A rows, 0/1 are the two B columns.
        // NOTE(review): the byte layouts of pAT/pB assumed here come from the
        // packing code outside this chunk - confirm against the packers.
        for (; jj + 1 < max_jj; jj += 2)
        {
            const signed char* pA = pAT;

            int32x4_t _sum0;
            int32x4_t _sum1;

            // k == 0: start from zero. Otherwise resume from the raw
            // accumulators that an earlier pass stored at outptr (see the
            // non-k_end branch at the bottom of this loop body).
            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
            }

            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_MATMUL_INT8
                // vmmlaq_s32 produces 2x2 sub-blocks, so the i8mm path keeps
                // its own accumulators and de-interleaves them after the loop.
                int32x4_t _sum00 = vdupq_n_s32(0);
                int32x4_t _sum01 = vdupq_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                // 8 reduction steps per iteration: 32 bytes of A, 16 bytes of B.
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA0 = vld1q_s8(pA);
                    int8x16_t _pA1 = vld1q_s8(pA + 16);
                    int8x16_t _pB = vld1q_s8(pB);

#if __ARM_FEATURE_MATMUL_INT8
                    // aaaaaaaa bbbbbbbb cccccccc dddddddd

                    // 00000000 11111111

                    // _sum00 accumulates a0 a1 b0 b1, _sum01 accumulates c0 c1 d0 d1
                    _sum00 = vmmlaq_s32(_sum00, _pA0, _pB);
                    _sum01 = vmmlaq_s32(_sum01, _pA1, _pB);
#else  // __ARM_FEATURE_MATMUL_INT8
                    // 32-bit lanes 0/1 of _pB pair with _pA0, lanes 2/3 with
                    // _pA1; even lanes feed _sum0 (column 0), odd lanes feed
                    // _sum1 (column 1).
                    _sum0 = vdotq_laneq_s32(_sum0, _pA0, _pB, 0);
                    _sum1 = vdotq_laneq_s32(_sum1, _pA0, _pB, 1);
                    _sum0 = vdotq_laneq_s32(_sum0, _pA1, _pB, 2);
                    _sum1 = vdotq_laneq_s32(_sum1, _pA1, _pB, 3);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 32;
                    pB += 16;
                }
#if __ARM_FEATURE_MATMUL_INT8
                // vuzpq splits even/odd lanes:
                //   val[0] = a0 b0 c0 d0, val[1] = a1 b1 c1 d1
                int32x4x2_t _ss = vuzpq_s32(_sum00, _sum01);
                _sum0 = vaddq_s32(_sum0, _ss.val[0]);
                _sum1 = vaddq_s32(_sum1, _ss.val[1]);
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#endif // __ARM_FEATURE_DOTPROD
            // 4 reduction steps per iteration: 16 bytes of A, 8 bytes of B.
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

                _sum0 = vdotq_lane_s32(_sum0, _pA, _pB, 0);
                _sum1 = vdotq_lane_s32(_sum1, _pA, _pB, 1);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

                // aabbccdd eeffgghh

                // 00112233 -> 00110011 22332233
                // 11001100 33223322

                // duplicate each 32-bit half of _pB across a 64-bit lane
                int32x2x2_t _pBB = vzip_s32(vreinterpret_s32_s8(_pB), vreinterpret_s32_s8(_pB));
                int8x16_t _pB02 = vreinterpretq_s8_s32(vcombine_s32(_pBB.val[0], _pBB.val[1]));

                // same data with the 16-bit column pairs reversed inside each 64-bit half
                int8x16_t _pB13 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB02)));

                // widening multiplies into int16, then pairwise add-accumulate
                // into the int32 lanes; the resulting lane order is the
                // "a0 b1 c0 d1 / a1 b0 c1 d0" layout undone in the k_end branch
                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA), vget_low_s8(_pB02));
                int16x8_t _s1 = vmull_s8(vget_low_s8(_pA), vget_low_s8(_pB13));
                _s0 = vmlal_s8(_s0, vget_high_s8(_pA), vget_high_s8(_pB02));
                _s1 = vmlal_s8(_s1, vget_high_s8(_pA), vget_high_s8(_pB13));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
#endif // __ARM_FEATURE_DOTPROD

                pA += 16;
                pB += 8;
            }
            // 2 reduction steps per iteration: 8 bytes of A, 4 bytes of B.
            for (; kk + 1 < max_kk; kk += 2)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vld1_s8(pA);
                // NOTE(review): vld1_s8 loads 8 bytes but only the first 4 are
                // used and pB advances by 4 - confirm the B buffer has
                // readable slack past its last element.
                int8x8_t _pB = vld1_s8(pB);
                // aabbccdd
                // 0011....
                int16x8_t _s0 = vmull_s8(_pA, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 0)));
                int16x8_t _s1 = vmull_s8(_pA, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 1)));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB0 = vreinterpret_s8_s32(vld1_dup_s32((const int*)pB));

                // aabbccdd

                // 00110011
                // 11001100
                int8x8_t _pB1 = vext_s8(_pB0, _pB0, 2);

                int16x8_t _s0 = vmull_s8(_pA, _pB0);
                int16x8_t _s1 = vmull_s8(_pA, _pB1);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
#endif // __ARM_FEATURE_DOTPROD

                pA += 8;
                pB += 4;
            }
            // Final single reduction step: 4 bytes of A, 2 bytes of B.
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vreinterpret_s8_s32(vld1_dup_s32((const int*)pA));
                int8x8_t _pB = vreinterpret_s8_s16(vld1_dup_s16((const short*)pB));

                // abcdabcd

                // 01010101 -> 00001111
                _pB = vuzp_s8(_pB, vext_s8(_pB, _pB, 1)).val[0];

                // low half of the products goes to column 0, high half to column 1
                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vreinterpret_s8_s32(vld1_dup_s32((const int*)pA));
                int8x8_t _pB0 = vreinterpret_s8_s16(vld1_dup_s16((const short*)pB));

                // abcd abcd

                // 0101 0101 -> 0101 1010

                int8x8_t _pB1 = vext_s8(_pB0, _pB0, 1);
                int8x8_t _pB = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_pB0), vreinterpret_s32_s8(_pB1)).val[0]);

                // products land in the same shuffled order as the wider
                // non-dotprod loops above (a0 b1 c0 d1 / a1 b0 c1 d0)
                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));
#endif // __ARM_FEATURE_DOTPROD

                pA += 4;
                pB += 2;
            }

            // k_end set: write results through outptr0 in the layout selected
            // by out_elempack. Otherwise stash the raw accumulators at outptr
            // so a later pass can resume from them.
            if (k_end)
            {
#if __ARM_FEATURE_DOTPROD
                // from
                //      a0 b0 c0 d0
                //      a1 b1 c1 d1
                if (out_elempack == 4)
                {
                    // accumulators are already in the stored order: 4 rows per column
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    // to
                    //      a0 a1 b0 b1
                    //      c0 c1 d0 d1
                    {
                        int32x4x2_t _sum01 = vzipq_s32(_sum0, _sum1);
                        _sum0 = _sum01.val[0];
                        _sum1 = _sum01.val[1];
                    }

                    // one 2-element store per output row, rows are out_hstep ints apart
                    vst1_s32(outptr0, vget_low_s32(_sum0));
                    vst1_s32(outptr0 + out_hstep, vget_high_s32(_sum0));
                    vst1_s32(outptr0 + out_hstep * 2, vget_low_s32(_sum1));
                    vst1_s32(outptr0 + out_hstep * 3, vget_high_s32(_sum1));
                    outptr0 += 2;
                }
#else  // __ARM_FEATURE_DOTPROD

                // from
                //      a0 b1 c0 d1
                //      a1 b0 c1 d0
                if (out_elempack == 4)
                {
                    // to
                    //      a0 b0 c0 d0
                    //      a1 b1 c1 d1
                    {
                        _sum1 = vrev64q_s32(_sum1);
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sum1);
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                        _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                        _sum1 = vrev64q_s32(_sum1);
                    }

                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    // to
                    //      a0 a1 c0 c1
                    //      b0 b1 d0 d1
                    {
                        int32x4x2_t _t0 = vzipq_s32(_sum0, _sum1);
                        _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                        _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                        _sum1 = vrev64q_s32(_sum1);
                    }

                    // rows a,c live in _sum0 and rows b,d in _sum1, hence the
                    // low/low/high/high store order
                    vst1_s32(outptr0, vget_low_s32(_sum0));
                    vst1_s32(outptr0 + out_hstep, vget_low_s32(_sum1));
                    vst1_s32(outptr0 + out_hstep * 2, vget_high_s32(_sum0));
                    vst1_s32(outptr0 + out_hstep * 3, vget_high_s32(_sum1));
                    outptr0 += 2;
                }
#endif // __ARM_FEATURE_DOTPROD
            }
            else
            {
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
            }

            // the scratch pointer advances by this tile's 8 ints on both branches
            outptr += 8;
        }
        // Single-column tail of the 4-row tile: each iteration accumulates a
        // 4 (rows of A) x 1 (column of B) block of int32 sums over max_kk
        // reduction steps. pA restarts at pAT for every column while pB keeps
        // advancing through the B data.
        // NOTE(review): the byte layouts of pAT/pB assumed here come from the
        // packing code outside this chunk - confirm against the packers.
        for (; jj < max_jj; jj += 1)
        {
            const signed char* pA = pAT;

            int32x4_t _sum0;

            // k == 0: start from zero. Otherwise resume from the accumulator
            // that an earlier pass stored at outptr (see the non-k_end branch
            // at the bottom of this loop body).
            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
            }

            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_MATMUL_INT8
                // each accumulator holds two partial dot products per A row
                // (first and second group of 4 bytes); they are folded
                // together with vpaddq_s32 after the loop
                int32x4_t _sum01 = vdupq_n_s32(0);
                int32x4_t _sum23 = vdupq_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                // 8 reduction steps per iteration: 32 bytes of A, 8 bytes of B.
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA0 = vld1q_s8(pA);
                    int8x16_t _pA1 = vld1q_s8(pA + 16);
                    int8x8_t _pB = vld1_s8(pB);

#if __ARM_FEATURE_MATMUL_INT8
                    // aaaaaaaa bbbbbbbb cccccccc dddddddd

                    // 00000000

                    // replicate the 8 B bytes so both 8-byte A rows in each
                    // 128-bit vector see the same column data
                    int8x16_t _pBB = vcombine_s8(_pB, _pB);

                    _sum01 = vdotq_s32(_sum01, _pA0, _pBB);
                    _sum23 = vdotq_s32(_sum23, _pA1, _pBB);
#else  // __ARM_FEATURE_MATMUL_INT8
                    // 32-bit lane 0 of _pB pairs with _pA0, lane 1 with _pA1
                    _sum0 = vdotq_lane_s32(_sum0, _pA0, _pB, 0);
                    _sum0 = vdotq_lane_s32(_sum0, _pA1, _pB, 1);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 32;
                    pB += 8;
                }
#if __ARM_FEATURE_MATMUL_INT8
                // pairwise add merges the two partial sums of each row: a b c d
                _sum0 = vaddq_s32(_sum0, vpaddq_s32(_sum01, _sum23));
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#endif // __ARM_FEATURE_DOTPROD
            // 4 reduction steps per iteration: 16 bytes of A, 4 bytes of B.
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                // NOTE(review): vld1_s8 loads 8 bytes but only 32-bit lane 0
                // is used and pB advances by 4 - confirm the B buffer has
                // readable slack past its last element.
                int8x8_t _pB = vld1_s8(pB);

                _sum0 = vdotq_lane_s32(_sum0, _pA, _pB, 0);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                // broadcast the first and second byte pairs of B separately;
                // they are multiplied with the low and high halves of _pA
                int8x8_t _pB0 = vreinterpret_s8_s16(vld1_dup_s16((const short*)pB));
                int8x8_t _pB1 = vreinterpret_s8_s16(vld1_dup_s16((const short*)(pB + 2)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA), _pB0);
                _s0 = vmlal_s8(_s0, vget_high_s8(_pA), _pB1);
                _sum0 = vpadalq_s16(_sum0, _s0);
#endif // __ARM_FEATURE_DOTPROD

                pA += 16;
                pB += 4;
            }
            // 2 reduction steps per iteration: 8 bytes of A, 2 bytes of B.
            for (; kk + 1 < max_kk; kk += 2)
            {
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB = vreinterpret_s8_s16(vld1_dup_s16((const short*)pB));

                // adjacent int16 products are summed pairwise into the int32 lanes
                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vpadalq_s16(_sum0, _s0);

                pA += 8;
                pB += 2;
            }
            // Final single reduction step: 4 bytes of A, 1 byte of B.
            for (; kk < max_kk; kk += 1)
            {
                int8x8_t _pA = vreinterpret_s8_s32(vld1_dup_s32((const int*)pA));
                int8x8_t _pB = vld1_dup_s8(pB);

                // only the low four products are accumulated; the high four
                // are duplicates produced by the 32-bit broadcast of pA
                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));

                pA += 4;
                pB += 1;
            }

            // k_end set: write results through outptr0 in the layout selected
            // by out_elempack. Otherwise stash the raw accumulator at outptr
            // so a later pass can resume from it.
            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1q_s32(outptr0, _sum0);
                    outptr0 += 4;
                }
                if (out_elempack == 1)
                {
                    // scatter one lane per output row; rows are out_hstep ints apart
                    outptr0[0] = vgetq_lane_s32(_sum0, 0);
                    outptr0[out_hstep] = vgetq_lane_s32(_sum0, 1);
                    outptr0[out_hstep * 2] = vgetq_lane_s32(_sum0, 2);
                    outptr0[out_hstep * 3] = vgetq_lane_s32(_sum0, 3);
                    outptr0++;
                }
            }
            else
            {
                vst1q_s32(outptr, _sum0);
            }

            // the scratch pointer advances by this tile's 4 ints on both branches
            outptr += 4;
        }

        pAT += max_kk * 4;
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        int* outptr0 = (int*)top_blob + (i + ii) * out_hstep + j;

        const signed char* pB = pBT;

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0;
            int32x4_t _sum1;
            int32x4_t _sum2;
            int32x4_t _sum3;

            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
                _sum2 = vdupq_n_s32(0);
                _sum3 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
                _sum2 = vld1q_s32(outptr + 8);
                _sum3 = vld1q_s32(outptr + 12);
            }

            const signed char* pA = pAT;
            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_MATMUL_INT8
                int32x4_t _sum01 = vdupq_n_s32(0);
                int32x4_t _sum23 = vdupq_n_s32(0);
                int32x4_t _sum45 = vdupq_n_s32(0);
                int32x4_t _sum67 = vdupq_n_s32(0);
#else  // __ARM_FEATURE_MATMUL_INT8
                int32x2_t _sum00 = vdup_n_s32(0);
                int32x2_t _sum01 = vdup_n_s32(0);
                int32x2_t _sum10 = vdup_n_s32(0);
                int32x2_t _sum11 = vdup_n_s32(0);
                int32x2_t _sum20 = vdup_n_s32(0);
                int32x2_t _sum21 = vdup_n_s32(0);
                int32x2_t _sum30 = vdup_n_s32(0);
                int32x2_t _sum31 = vdup_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA = vld1q_s8(pA);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);
                    int8x16_t _pB2 = vld1q_s8(pB + 32);
                    int8x16_t _pB3 = vld1q_s8(pB + 48);

#if __ARM_FEATURE_MATMUL_INT8
                    _sum01 = vmmlaq_s32(_sum01, _pA, _pB0);
                    _sum23 = vmmlaq_s32(_sum23, _pA, _pB1);
                    _sum45 = vmmlaq_s32(_sum45, _pA, _pB2);
                    _sum67 = vmmlaq_s32(_sum67, _pA, _pB3);
#else  // __ARM_FEATURE_MATMUL_INT8
                    _sum00 = vdot_laneq_s32(_sum00, vget_low_s8(_pA), _pB0, 0);
                    _sum01 = vdot_laneq_s32(_sum01, vget_low_s8(_pA), _pB0, 1);
                    _sum10 = vdot_laneq_s32(_sum10, vget_low_s8(_pA), _pB0, 2);
                    _sum11 = vdot_laneq_s32(_sum11, vget_low_s8(_pA), _pB0, 3);
                    _sum20 = vdot_laneq_s32(_sum20, vget_low_s8(_pA), _pB1, 0);
                    _sum21 = vdot_laneq_s32(_sum21, vget_low_s8(_pA), _pB1, 1);
                    _sum30 = vdot_laneq_s32(_sum30, vget_low_s8(_pA), _pB1, 2);
                    _sum31 = vdot_laneq_s32(_sum31, vget_low_s8(_pA), _pB1, 3);
                    _sum00 = vdot_laneq_s32(_sum00, vget_high_s8(_pA), _pB2, 0);
                    _sum01 = vdot_laneq_s32(_sum01, vget_high_s8(_pA), _pB2, 1);
                    _sum10 = vdot_laneq_s32(_sum10, vget_high_s8(_pA), _pB2, 2);
                    _sum11 = vdot_laneq_s32(_sum11, vget_high_s8(_pA), _pB2, 3);
                    _sum20 = vdot_laneq_s32(_sum20, vget_high_s8(_pA), _pB3, 0);
                    _sum21 = vdot_laneq_s32(_sum21, vget_high_s8(_pA), _pB3, 1);
                    _sum30 = vdot_laneq_s32(_sum30, vget_high_s8(_pA), _pB3, 2);
                    _sum31 = vdot_laneq_s32(_sum31, vget_high_s8(_pA), _pB3, 3);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 16;
                    pB += 64;
                }
#if __ARM_FEATURE_MATMUL_INT8
                _sum0 = vaddq_s32(_sum0, vcombine_s32(vget_low_s32(_sum01), vget_low_s32(_sum23)));
                _sum1 = vaddq_s32(_sum1, vcombine_s32(vget_low_s32(_sum45), vget_low_s32(_sum67)));
                _sum2 = vaddq_s32(_sum2, vcombine_s32(vget_high_s32(_sum01), vget_high_s32(_sum23)));
                _sum3 = vaddq_s32(_sum3, vcombine_s32(vget_high_s32(_sum45), vget_high_s32(_sum67)));
#else  // __ARM_FEATURE_MATMUL_INT8
                int32x2x2_t _sum0x = vzip_s32(_sum00, _sum01);
                int32x2x2_t _sum1x = vzip_s32(_sum10, _sum11);
                int32x2x2_t _sum2x = vzip_s32(_sum20, _sum21);
                int32x2x2_t _sum3x = vzip_s32(_sum30, _sum31);
                _sum0 = vaddq_s32(_sum0, vcombine_s32(_sum0x.val[0], _sum1x.val[0]));
                _sum1 = vaddq_s32(_sum1, vcombine_s32(_sum2x.val[0], _sum3x.val[0]));
                _sum2 = vaddq_s32(_sum2, vcombine_s32(_sum0x.val[1], _sum1x.val[1]));
                _sum3 = vaddq_s32(_sum3, vcombine_s32(_sum2x.val[1], _sum3x.val[1]));
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#endif // __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_DOTPROD
                int32x2_t _sum00 = vdup_n_s32(0);
                int32x2_t _sum01 = vdup_n_s32(0);
                int32x2_t _sum10 = vdup_n_s32(0);
                int32x2_t _sum11 = vdup_n_s32(0);
                int32x2_t _sum20 = vdup_n_s32(0);
                int32x2_t _sum21 = vdup_n_s32(0);
                int32x2_t _sum30 = vdup_n_s32(0);
                int32x2_t _sum31 = vdup_n_s32(0);
#endif // __ARM_FEATURE_DOTPROD
                for (; kk + 3 < max_kk; kk += 4)
                {
                    int8x8_t _pA = vld1_s8(pA);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);

#if __ARM_FEATURE_DOTPROD
                    _sum00 = vdot_laneq_s32(_sum00, _pA, _pB0, 0);
                    _sum01 = vdot_laneq_s32(_sum01, _pA, _pB0, 1);
                    _sum10 = vdot_laneq_s32(_sum10, _pA, _pB0, 2);
                    _sum11 = vdot_laneq_s32(_sum11, _pA, _pB0, 3);
                    _sum20 = vdot_laneq_s32(_sum20, _pA, _pB1, 0);
                    _sum21 = vdot_laneq_s32(_sum21, _pA, _pB1, 1);
                    _sum30 = vdot_laneq_s32(_sum30, _pA, _pB1, 2);
                    _sum31 = vdot_laneq_s32(_sum31, _pA, _pB1, 3);
#else  // __ARM_FEATURE_DOTPROD
                    int8x8_t _pA0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 0));
                    int8x8_t _pA1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 1));
                    int8x8_t _pA2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 2));
                    int8x8_t _pA3 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 3));

                    int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB0));
                    int16x8_t _s1 = vmull_s8(_pA0, vget_high_s8(_pB0));
                    int16x8_t _s2 = vmull_s8(_pA1, vget_low_s8(_pB0));
                    int16x8_t _s3 = vmull_s8(_pA1, vget_high_s8(_pB0));
                    _s0 = vmlal_s8(_s0, _pA2, vget_low_s8(_pB1));
                    _s1 = vmlal_s8(_s1, _pA2, vget_high_s8(_pB1));
                    _s2 = vmlal_s8(_s2, _pA3, vget_low_s8(_pB1));
                    _s3 = vmlal_s8(_s3, _pA3, vget_high_s8(_pB1));
                    _sum0 = vpadalq_s16(_sum0, _s0);
                    _sum1 = vpadalq_s16(_sum1, _s1);
                    _sum2 = vpadalq_s16(_sum2, _s2);
                    _sum3 = vpadalq_s16(_sum3, _s3);
#endif // __ARM_FEATURE_DOTPROD

                    pA += 8;
                    pB += 32;
                }
#if __ARM_FEATURE_DOTPROD
                int32x2x2_t _sum0x = vzip_s32(_sum00, _sum01);
                int32x2x2_t _sum1x = vzip_s32(_sum10, _sum11);
                int32x2x2_t _sum2x = vzip_s32(_sum20, _sum21);
                int32x2x2_t _sum3x = vzip_s32(_sum30, _sum31);
                _sum0 = vaddq_s32(_sum0, vcombine_s32(_sum0x.val[0], _sum1x.val[0]));
                _sum1 = vaddq_s32(_sum1, vcombine_s32(_sum2x.val[0], _sum3x.val[0]));
                _sum2 = vaddq_s32(_sum2, vcombine_s32(_sum0x.val[1], _sum1x.val[1]));
                _sum3 = vaddq_s32(_sum3, vcombine_s32(_sum2x.val[1], _sum3x.val[1]));
#endif // __ARM_FEATURE_DOTPROD
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                int16x4_t _pA = vreinterpret_s16_s32(vld1_dup_s32((const int*)pA));
                int8x16_t _pB = vld1q_s8(pB);

                int16x4x2_t _pA01 = vuzp_s16(_pA, _pA);
                int8x8_t _pA0 = vreinterpret_s8_s16(_pA01.val[0]);
                int8x8_t _pA1 = vreinterpret_s8_s16(_pA01.val[1]);

                int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB));
                int16x8_t _s1 = vmull_s8(_pA0, vget_high_s8(_pB));
                int16x8_t _s2 = vmull_s8(_pA1, vget_low_s8(_pB));
                int16x8_t _s3 = vmull_s8(_pA1, vget_high_s8(_pB));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);

                pA += 4;
                pB += 16;
            }
            for (; kk < max_kk; kk += 1)
            {
                int8x8_t _pA = vreinterpret_s8_s16(vld1_dup_s16((const short*)pA));
                int8x8_t _pB = vld1_s8(pB);

                int8x8x2_t _pA01 = vuzp_s8(_pA, _pA);

                int16x8_t _s0 = vmull_s8(_pA01.val[0], _pB);
                int16x8_t _s1 = vmull_s8(_pA01.val[1], _pB);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s1));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1));

                pA += 2;
                pB += 8;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    vst1q_s32(outptr0 + out_hstep, _sum2);
                    vst1q_s32(outptr0 + out_hstep + 4, _sum3);
                    outptr0 += 8;
                }
            }
            else
            {
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                vst1q_s32(outptr + 8, _sum2);
                vst1q_s32(outptr + 12, _sum3);
            }

            outptr += 16;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0;
            int32x4_t _sum1;

            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
            }

            const signed char* pA = pAT;
            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_MATMUL_INT8
                int32x4_t _sum01 = vdupq_n_s32(0);
                int32x4_t _sum23 = vdupq_n_s32(0);
#else  // __ARM_FEATURE_MATMUL_INT8
                int32x2_t _sum00 = vdup_n_s32(0);
                int32x2_t _sum01 = vdup_n_s32(0);
                int32x2_t _sum10 = vdup_n_s32(0);
                int32x2_t _sum11 = vdup_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA = vld1q_s8(pA);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);

#if __ARM_FEATURE_MATMUL_INT8
                    _sum01 = vmmlaq_s32(_sum01, _pA, _pB0);
                    _sum23 = vmmlaq_s32(_sum23, _pA, _pB1);
#else  // __ARM_FEATURE_MATMUL_INT8
                    _sum00 = vdot_laneq_s32(_sum00, vget_low_s8(_pA), _pB0, 0);
                    _sum01 = vdot_laneq_s32(_sum01, vget_low_s8(_pA), _pB0, 1);
                    _sum10 = vdot_laneq_s32(_sum10, vget_low_s8(_pA), _pB0, 2);
                    _sum11 = vdot_laneq_s32(_sum11, vget_low_s8(_pA), _pB0, 3);
                    _sum00 = vdot_laneq_s32(_sum00, vget_high_s8(_pA), _pB1, 0);
                    _sum01 = vdot_laneq_s32(_sum01, vget_high_s8(_pA), _pB1, 1);
                    _sum10 = vdot_laneq_s32(_sum10, vget_high_s8(_pA), _pB1, 2);
                    _sum11 = vdot_laneq_s32(_sum11, vget_high_s8(_pA), _pB1, 3);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 16;
                    pB += 32;
                }
#if __ARM_FEATURE_MATMUL_INT8
                _sum0 = vaddq_s32(_sum0, vcombine_s32(vget_low_s32(_sum01), vget_low_s32(_sum23)));
                _sum1 = vaddq_s32(_sum1, vcombine_s32(vget_high_s32(_sum01), vget_high_s32(_sum23)));
#else  // __ARM_FEATURE_MATMUL_INT8
                int32x2x2_t _sum0x = vzip_s32(_sum00, _sum01);
                int32x2x2_t _sum1x = vzip_s32(_sum10, _sum11);
                _sum0 = vaddq_s32(_sum0, vcombine_s32(_sum0x.val[0], _sum1x.val[0]));
                _sum1 = vaddq_s32(_sum1, vcombine_s32(_sum0x.val[1], _sum1x.val[1]));
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#endif // __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_DOTPROD
                int32x2_t _sum00 = vdup_n_s32(0);
                int32x2_t _sum01 = vdup_n_s32(0);
                int32x2_t _sum10 = vdup_n_s32(0);
                int32x2_t _sum11 = vdup_n_s32(0);
#endif // __ARM_FEATURE_DOTPROD
                for (; kk + 3 < max_kk; kk += 4)
                {
                    int8x8_t _pA = vld1_s8(pA);
                    int8x16_t _pB = vld1q_s8(pB);

#if __ARM_FEATURE_DOTPROD
                    _sum00 = vdot_laneq_s32(_sum00, _pA, _pB, 0);
                    _sum01 = vdot_laneq_s32(_sum01, _pA, _pB, 1);
                    _sum10 = vdot_laneq_s32(_sum10, _pA, _pB, 2);
                    _sum11 = vdot_laneq_s32(_sum11, _pA, _pB, 3);
#else  // __ARM_FEATURE_DOTPROD
                    int8x8_t _pA0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 0));
                    int8x8_t _pA1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 1));
                    int8x8_t _pA2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 2));
                    int8x8_t _pA3 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 3));

                    int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB));
                    int16x8_t _s1 = vmull_s8(_pA1, vget_low_s8(_pB));
                    _s0 = vmlal_s8(_s0, _pA2, vget_high_s8(_pB));
                    _s1 = vmlal_s8(_s1, _pA3, vget_high_s8(_pB));
                    _sum0 = vpadalq_s16(_sum0, _s0);
                    _sum1 = vpadalq_s16(_sum1, _s1);
#endif // __ARM_FEATURE_DOTPROD

                    pA += 8;
                    pB += 16;
                }
#if __ARM_FEATURE_DOTPROD
                int32x2x2_t _sum0x = vzip_s32(_sum00, _sum01);
                int32x2x2_t _sum1x = vzip_s32(_sum10, _sum11);
                _sum0 = vaddq_s32(_sum0, vcombine_s32(_sum0x.val[0], _sum1x.val[0]));
                _sum1 = vaddq_s32(_sum1, vcombine_s32(_sum0x.val[1], _sum1x.val[1]));
#endif // __ARM_FEATURE_DOTPROD
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                int16x4_t _pA = vreinterpret_s16_s32(vdup_lane_s32(vreinterpret_s32_s8(vld1_s8(pA)), 0));
                int8x8_t _pB = vld1_s8(pB);

                int16x4x2_t _pA01 = vuzp_s16(_pA, _pA);
                int8x8_t _pA0 = vreinterpret_s8_s16(_pA01.val[0]);
                int8x8_t _pA1 = vreinterpret_s8_s16(_pA01.val[1]);

                int16x8_t _s0 = vmull_s8(_pA0, _pB);
                int16x8_t _s1 = vmull_s8(_pA1, _pB);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);

                pA += 4;
                pB += 8;
            }
            for (; kk < max_kk; kk += 1)
            {
                int8x8_t _pA = vreinterpret_s8_s16(vld1_dup_s16((const short*)pA));
                int8x8_t _pB = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(vld1_s8(pB)), 0));

                _pA = vzip_s8(_pA, _pA).val[0];
                _pA = vreinterpret_s8_s16(vzip_s16(vreinterpret_s16_s8(_pA), vreinterpret_s16_s8(_pA)).val[0]);

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));

                pA += 2;
                pB += 4;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + out_hstep, _sum1);
                    outptr0 += 4;
                }
            }
            else
            {
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
            }

            outptr += 8;
        }
#endif // __ARM_NEON
        for (; jj + 1 < max_jj; jj += 2)
        {
#if __ARM_NEON
            int32x4_t _sum;

            if (k == 0)
            {
                _sum = vdupq_n_s32(0);
            }
            else
            {
                _sum = vld1q_s32(outptr);
            }

            const signed char* pA = pAT;
            int kk = 0;

#if __ARM_FEATURE_DOTPROD
            for (; kk + 7 < max_kk; kk += 8)
            {
                int8x16_t _pA = vld1q_s8(pA);
                int8x16_t _pB = vld1q_s8(pB);

#if __ARM_FEATURE_MATMUL_INT8
                _sum = vmmlaq_s32(_sum, _pA, _pB);
#else  // __ARM_FEATURE_MATMUL_INT8
                int32x4x2_t _pAA = vzipq_s32(vreinterpretq_s32_s8(_pA), vreinterpretq_s32_s8(_pA));
                int8x16_t _pA01 = vreinterpretq_s8_s32(_pAA.val[0]);
                int8x16_t _pA23 = vreinterpretq_s8_s32(_pAA.val[1]);
                int8x16_t _pB01 = vcombine_s8(vget_low_s8(_pB), vget_low_s8(_pB));
                int8x16_t _pB23 = vcombine_s8(vget_high_s8(_pB), vget_high_s8(_pB));

                _sum = vdotq_s32(_sum, _pA01, _pB01);
                _sum = vdotq_s32(_sum, _pA23, _pB23);
#endif // __ARM_FEATURE_MATMUL_INT8

                pA += 16;
                pB += 16;
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; kk + 3 < max_kk; kk += 4)
            {
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

#if __ARM_FEATURE_DOTPROD
                int32x2x2_t _pAA = vzip_s32(vreinterpret_s32_s8(_pA), vreinterpret_s32_s8(_pA));
                int8x16_t _pA01 = vreinterpretq_s8_s32(vcombine_s32(_pAA.val[0], _pAA.val[1]));

                int8x16_t _pB01 = vcombine_s8(_pB, _pB);

                _sum = vdotq_s32(_sum, _pA01, _pB01);
#else  // __ARM_FEATURE_DOTPROD
                int16x4x2_t _pA01 = vzip_s16(vreinterpret_s16_s8(_pA), vreinterpret_s16_s8(_pA));
                int32x2x2_t _pB01 = vzip_s32(vreinterpret_s32_s8(_pB), vreinterpret_s32_s8(_pB));

                int16x8_t _s0 = vmull_s8(vreinterpret_s8_s16(_pA01.val[0]), vreinterpret_s8_s32(_pB01.val[0]));
                _s0 = vmlal_s8(_s0, vreinterpret_s8_s16(_pA01.val[1]), vreinterpret_s8_s32(_pB01.val[1]));
                _sum = vpadalq_s16(_sum, _s0);
#endif // __ARM_FEATURE_DOTPROD

                pA += 8;
                pB += 8;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

                _pA = vreinterpret_s8_s16(vzip_s16(vreinterpret_s16_s8(_pA), vreinterpret_s16_s8(_pA)).val[0]);
                _pB = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_pB), vreinterpret_s32_s8(_pB)).val[0]);

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum = vpadalq_s16(_sum, _s0);

                pA += 4;
                pB += 4;
            }
            for (; kk < max_kk; kk += 1)
            {
                int8x8_t _pA = vreinterpret_s8_s16(vld1_dup_s16((const short*)pA));
                int8x8_t _pB = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vld1_s8(pB)), 0));

                _pA = vzip_s8(_pA, _pA).val[0];

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum = vaddw_s16(_sum, vget_low_s16(_s0));

                pA += 2;
                pB += 2;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_s32(outptr0, vget_low_s32(_sum));
                    vst1_s32(outptr0 + out_hstep, vget_high_s32(_sum));
                    outptr0 += 2;
                }
            }
            else
            {
                vst1q_s32(outptr, _sum);
            }

            outptr += 4;
#else // __ARM_NEON
            int sum00;
            int sum10;
            int sum01;
            int sum11;

            if (k == 0)
            {
                sum00 = 0;
                sum10 = 0;
                sum01 = 0;
                sum11 = 0;
            }
            else
            {
                sum00 = outptr[0];
                sum10 = outptr[1];
                sum01 = outptr[2];
                sum11 = outptr[3];
            }

            const signed char* pA = pAT;
            int kk = 0;
#if __ARM_FEATURE_SIMD32 && NCNN_GNU_INLINE_ASM
            for (; kk + 1 < max_kk; kk += 2)
            {
                // -fomit-frame-pointer is implied by the optimization flags and spares one register,
                // which keeps us away from error: ‘asm’ operand has impossible constraints   --- nihui
#if __OPTIMIZE__
                asm volatile(
                    "ldr    r2, [%0], #4    \n" // int8x4_t _pA = *((int8x4_t*)pA); pA += 4;
                    "ldr    r4, [%1], #4    \n" // int8x4_t _pB = *((int8x4_t*)pB); pB += 4;
                    "ror    r3, r2, #8      \n" // int8x4_t _pA_r8 = __ror(_pA, 8);
                    "ror    r5, r4, #8      \n" // int8x4_t _pB_r8 = __ror(_pB, 8);
                    "sxtb16 r2, r2          \n" // int16x2_t _pA0 = __sxtb16(_pA);
                    "sxtb16 r4, r4          \n" // int16x2_t _pB0 = __sxtb16(_pB);
                    "sxtb16 r3, r3          \n" // int16x2_t _pA1 = __sxtb16(_pA_r8);
                    "sxtb16 r5, r5          \n" // int16x2_t _pB1 = __sxtb16(_pB_r8);
                    "smlad  %2, r2, r4, %2  \n" // sum00 = __smlad(_pA0, _pB0, sum00);
                    "smlad  %3, r3, r4, %3  \n" // sum10 = __smlad(_pA1, _pB0, sum10);
                    "smlad  %4, r2, r5, %4  \n" // sum01 = __smlad(_pA0, _pB1, sum01);
                    "smlad  %5, r3, r5, %5  \n" // sum11 = __smlad(_pA1, _pB1, sum11);
                    : "=r"(pA),
                    "=r"(pB),
                    "=r"(sum00),
                    "=r"(sum10),
                    "=r"(sum01),
                    "=r"(sum11)
                    : "0"(pA),
                    "1"(pB),
                    "2"(sum00),
                    "3"(sum10),
                    "4"(sum01),
                    "5"(sum11)
                    : "memory", "r2", "r3", "r4", "r5");
#else
                int _pA0 = *((int*)pA);
                int _pB0 = *((int*)pB);
                int _pA1;
                int _pB1;
                asm volatile("ror %0, %1, #8"
                             : "=r"(_pA1)
                             : "r"(_pA0)
                             :);
                asm volatile("ror %0, %1, #8"
                             : "=r"(_pB1)
                             : "r"(_pB0)
                             :);
                asm volatile("sxtb16 %0, %0"
                             : "=r"(_pA0)
                             : "0"(_pA0)
                             :);
                asm volatile("sxtb16 %0, %0"
                             : "=r"(_pA1)
                             : "0"(_pA1)
                             :);
                asm volatile("sxtb16 %0, %0"
                             : "=r"(_pB0)
                             : "0"(_pB0)
                             :);
                asm volatile("sxtb16 %0, %0"
                             : "=r"(_pB1)
                             : "0"(_pB1)
                             :);
                asm volatile("smlad %0, %2, %3, %0"
                             : "=r"(sum00)
                             : "0"(sum00), "r"(_pA0), "r"(_pB0)
                             :);
                asm volatile("smlad %0, %2, %3, %0"
                             : "=r"(sum10)
                             : "0"(sum10), "r"(_pA1), "r"(_pB0)
                             :);
                asm volatile("smlad %0, %2, %3, %0"
                             : "=r"(sum01)
                             : "0"(sum01), "r"(_pA0), "r"(_pB1)
                             :);
                asm volatile("smlad %0, %2, %3, %0"
                             : "=r"(sum11)
                             : "0"(sum11), "r"(_pA1), "r"(_pB1)
                             :);
                pA += 4;
                pB += 4;
#endif
            }
#endif // __ARM_FEATURE_SIMD32 && NCNN_GNU_INLINE_ASM
            for (; kk < max_kk; kk += 1)
            {
                sum00 += pA[0] * pB[0];
                sum10 += pA[1] * pB[0];
                sum01 += pA[0] * pB[1];
                sum11 += pA[1] * pB[1];

                pA += 2;
                pB += 2;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum00;
                    outptr0[1] = sum01;
                    outptr0[out_hstep] = sum10;
                    outptr0[out_hstep + 1] = sum11;
                    outptr0 += 2;
                }
            }
            else
            {
                outptr[0] = sum00;
                outptr[1] = sum10;
                outptr[2] = sum01;
                outptr[3] = sum11;
            }

            outptr += 4;
#endif // __ARM_NEON
        }
        for (; jj < max_jj; jj += 1)
        {
#if __ARM_NEON
            int32x2_t _sum;

            if (k == 0)
            {
                _sum = vdup_n_s32(0);
            }
            else
            {
                _sum = vld1_s32(outptr);
            }
#else  // __ARM_NEON
            int sum0;
            int sum1;

            if (k == 0)
            {
                sum0 = 0;
                sum1 = 0;
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }
#endif // __ARM_NEON

            const signed char* pA = pAT;
            int kk = 0;
#if __ARM_NEON
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            {
                int32x4_t _sum0 = vdupq_n_s32(0);
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA = vld1q_s8(pA);
                    int8x8_t _pB = vld1_s8(pB);

                    int8x16_t _pBB = vcombine_s8(_pB, _pB);

                    _sum0 = vdotq_s32(_sum0, _pA, _pBB);

                    pA += 16;
                    pB += 8;
                }
                int32x2_t _ss = vpadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0));
                _sum = vadd_s32(_sum, _ss);
            }
#else  // __ARM_FEATURE_MATMUL_INT8
            for (; kk + 7 < max_kk; kk += 8)
            {
                int8x16_t _pA = vld1q_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

                _sum = vdot_lane_s32(_sum, vget_low_s8(_pA), _pB, 0);
                _sum = vdot_lane_s32(_sum, vget_high_s8(_pA), _pB, 1);

                pA += 16;
                pB += 8;
            }
#endif // __ARM_FEATURE_MATMUL_INT8
            for (; kk + 3 < max_kk; kk += 4)
            {
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB = vreinterpret_s8_s32(vld1_dup_s32((const int*)pB));

                _sum = vdot_s32(_sum, _pA, _pB);

                pA += 8;
                pB += 4;
            }
#else  // __ARM_FEATURE_DOTPROD
            {
                int32x4_t _sum0 = vdupq_n_s32(0);
                for (; kk + 3 < max_kk; kk += 4)
                {
                    int8x8_t _pA = vld1_s8(pA);
                    int8x8_t _pB = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(vld1_s8(pB)), 0));

                    _pB = vreinterpret_s8_s16(vzip_s16(vreinterpret_s16_s8(_pB), vreinterpret_s16_s8(_pB)).val[0]);

                    int16x8_t _s0 = vmull_s8(_pA, _pB);
                    _sum0 = vpadalq_s16(_sum0, _s0);

                    pA += 8;
                    pB += 4;
                }
                int32x2_t _ss = vadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0));
                _sum = vadd_s32(_sum, _ss);
            }
#endif // __ARM_FEATURE_DOTPROD
            int sum0 = vget_lane_s32(_sum, 0);
            int sum1 = vget_lane_s32(_sum, 1);
            for (; kk + 1 < max_kk; kk += 2)
            {
                sum0 += pA[0] * pB[0];
                sum0 += pA[1] * pB[1];
                sum1 += pA[2] * pB[0];
                sum1 += pA[3] * pB[1];
                pA += 4;
                pB += 2;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk += 1)
            {
                sum0 += pA[0] * pB[0];
                sum1 += pA[1] * pB[0];
                pA += 2;
                pB += 1;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum0;
                    outptr0[out_hstep] = sum1;
                    outptr0++;
                }
            }
            else
            {
                outptr[0] = sum0;
                outptr[1] = sum1;
            }

            outptr += 2;
        }

        pAT += max_kk * 2;
    }
    for (; ii < max_ii; ii += 1)
    {
        int* outptr0 = (int*)top_blob + (i + ii) * out_hstep + j;

        const signed char* pB = pBT;

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0;
            int32x4_t _sum1;

            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
            }

            const signed char* pA = pAT;
            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_MATMUL_INT8
                int32x4_t _sum00 = vdupq_n_s32(0);
                int32x4_t _sum01 = vdupq_n_s32(0);
                int32x4_t _sum10 = vdupq_n_s32(0);
                int32x4_t _sum11 = vdupq_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x8_t _pA = vld1_s8(pA);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);
                    int8x16_t _pB2 = vld1q_s8(pB + 32);
                    int8x16_t _pB3 = vld1q_s8(pB + 48);

#if __ARM_FEATURE_MATMUL_INT8
                    int8x16_t _pAA = vcombine_s8(_pA, _pA);
                    _sum00 = vdotq_s32(_sum00, _pAA, _pB0);
                    _sum01 = vdotq_s32(_sum01, _pAA, _pB1);
                    _sum10 = vdotq_s32(_sum10, _pAA, _pB2);
                    _sum11 = vdotq_s32(_sum11, _pAA, _pB3);
#else  // __ARM_FEATURE_MATMUL_INT8
                    _sum0 = vdotq_lane_s32(_sum0, _pB0, _pA, 0);
                    _sum1 = vdotq_lane_s32(_sum1, _pB1, _pA, 0);
                    _sum0 = vdotq_lane_s32(_sum0, _pB2, _pA, 1);
                    _sum1 = vdotq_lane_s32(_sum1, _pB3, _pA, 1);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 8;
                    pB += 64;
                }
#if __ARM_FEATURE_MATMUL_INT8
                _sum0 = vaddq_s32(_sum0, vpaddq_s32(_sum00, _sum01));
                _sum1 = vaddq_s32(_sum1, vpaddq_s32(_sum10, _sum11));
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; kk + 3 < max_kk; kk += 4)
            {
                int8x8_t _pA = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(vld1_s8(pA)), 0));
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB1 = vld1q_s8(pB + 16);

#if __ARM_FEATURE_DOTPROD
                _sum0 = vdotq_lane_s32(_sum0, _pB0, _pA, 0);
                _sum1 = vdotq_lane_s32(_sum1, _pB1, _pA, 0);
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 0));
                int8x8_t _pA1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 1));
                int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB0));
                int16x8_t _s1 = vmull_s8(_pA0, vget_high_s8(_pB0));
                _s0 = vmlal_s8(_s0, _pA1, vget_low_s8(_pB1));
                _s1 = vmlal_s8(_s1, _pA1, vget_high_s8(_pB1));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
#endif // __ARM_FEATURE_DOTPROD

                pA += 4;
                pB += 32;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                int8x8_t _pA = vreinterpret_s8_s16(vld1_dup_s16((const short*)pA));
                int8x16_t _pB = vld1q_s8(pB);

                int16x8_t _s0 = vmull_s8(_pA, vget_low_s8(_pB));
                int16x8_t _s1 = vmull_s8(_pA, vget_high_s8(_pB));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);

                pA += 2;
                pB += 16;
            }
            for (; kk < max_kk; kk += 1)
            {
                int8x8_t _pA = vld1_dup_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));

                pA += 1;
                pB += 8;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1q_s32(outptr0, _sum0);
                    vst1q_s32(outptr0 + 4, _sum1);
                    outptr0 += 8;
                }
            }
            else
            {
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
            }

            outptr += 8;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0;

            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
            }

            const signed char* pA = pAT;
            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_MATMUL_INT8
                int32x4_t _sum00 = vdupq_n_s32(0);
                int32x4_t _sum01 = vdupq_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x8_t _pA = vld1_s8(pA);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);

#if __ARM_FEATURE_MATMUL_INT8
                    int8x16_t _pAA = vcombine_s8(_pA, _pA);
                    _sum00 = vdotq_s32(_sum00, _pAA, _pB0);
                    _sum01 = vdotq_s32(_sum01, _pAA, _pB1);
#else  // __ARM_FEATURE_MATMUL_INT8
                    _sum0 = vdotq_lane_s32(_sum0, _pB0, _pA, 0);
                    _sum0 = vdotq_lane_s32(_sum0, _pB1, _pA, 1);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 8;
                    pB += 32;
                }
#if __ARM_FEATURE_MATMUL_INT8
                _sum0 = vaddq_s32(_sum0, vpaddq_s32(_sum00, _sum01));
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; kk + 3 < max_kk; kk += 4)
            {
                int8x8_t _pA = vld1_s8(pA);
                int8x16_t _pB = vld1q_s8(pB);

#if __ARM_FEATURE_DOTPROD
                _sum0 = vdotq_lane_s32(_sum0, _pB, _pA, 0);
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 0));
                int8x8_t _pA1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 1));
                int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB));
                _s0 = vmlal_s8(_s0, _pA1, vget_high_s8(_pB));
                _sum0 = vpadalq_s16(_sum0, _s0);
#endif // __ARM_FEATURE_DOTPROD

                pA += 4;
                pB += 16;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                int8x8_t _pA = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vld1_s8(pA)), 0));
                int8x8_t _pB = vld1_s8(pB);

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vpadalq_s16(_sum0, _s0);

                pA += 2;
                pB += 8;
            }
            for (; kk < max_kk; kk += 1)
            {
                int8x8_t _pA = vld1_dup_s8(pA);
                int8x8_t _pB = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(vld1_s8(pB)), 0));

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));

                pA += 1;
                pB += 4;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1q_s32(outptr0, _sum0);
                    outptr0 += 4;
                }
            }
            else
            {
                vst1q_s32(outptr, _sum0);
            }

            outptr += 4;
        }
#endif // __ARM_NEON
        for (; jj + 1 < max_jj; jj += 2)
        {
#if __ARM_NEON
            int32x2_t _sum;

            if (k == 0)
            {
                _sum = vdup_n_s32(0);
            }
            else
            {
                _sum = vld1_s32(outptr);
            }
#else  // __ARM_NEON
            int sum0;
            int sum1;

            if (k == 0)
            {
                sum0 = 0;
                sum1 = 0;
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }
#endif // __ARM_NEON

            const signed char* pA = pAT;
            int kk = 0;
#if __ARM_NEON
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            {
                int32x4_t _sum0 = vdupq_n_s32(0);
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x8_t _pA = vld1_s8(pA);
                    int8x16_t _pB = vld1q_s8(pB);

                    int8x16_t _pAA = vcombine_s8(_pA, _pA);

                    _sum0 = vdotq_s32(_sum0, _pAA, _pB);

                    pA += 8;
                    pB += 16;
                }
                int32x2_t _ss = vpadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0));
                _sum = vadd_s32(_sum, _ss);
            }
#else  // __ARM_FEATURE_MATMUL_INT8
            for (; kk + 7 < max_kk; kk += 8)
            {
                int8x8_t _pA = vld1_s8(pA);
                int8x16_t _pB = vld1q_s8(pB);

                _sum = vdot_lane_s32(_sum, vget_low_s8(_pB), _pA, 0);
                _sum = vdot_lane_s32(_sum, vget_high_s8(_pB), _pA, 1);

                pA += 8;
                pB += 16;
            }
#endif // __ARM_FEATURE_MATMUL_INT8
            for (; kk + 3 < max_kk; kk += 4)
            {
                int8x8_t _pA = vreinterpret_s8_s32(vld1_dup_s32((const int*)pA));
                int8x8_t _pB = vld1_s8(pB);

                _sum = vdot_s32(_sum, _pA, _pB);

                pA += 4;
                pB += 8;
            }
#else  // __ARM_FEATURE_DOTPROD
            {
                int32x4_t _sum0 = vdupq_n_s32(0);
                for (; kk + 3 < max_kk; kk += 4)
                {
                    int8x8_t _pA = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(vld1_s8(pA)), 0));
                    int8x8_t _pB = vld1_s8(pB);

                    _pA = vreinterpret_s8_s16(vzip_s16(vreinterpret_s16_s8(_pA), vreinterpret_s16_s8(_pA)).val[0]);

                    int16x8_t _s0 = vmull_s8(_pA, _pB);
                    _sum0 = vpadalq_s16(_sum0, _s0);

                    pA += 4;
                    pB += 8;
                }
                int32x2_t _ss = vadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0));
                _sum = vadd_s32(_sum, _ss);
            }
#endif // __ARM_FEATURE_DOTPROD
            int sum0 = vget_lane_s32(_sum, 0);
            int sum1 = vget_lane_s32(_sum, 1);
            for (; kk + 1 < max_kk; kk += 2)
            {
                sum0 += pA[0] * pB[0];
                sum0 += pA[1] * pB[1];
                sum1 += pA[0] * pB[2];
                sum1 += pA[1] * pB[3];
                pA += 2;
                pB += 4;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk += 1)
            {
                sum0 += pA[0] * pB[0];
                sum1 += pA[0] * pB[1];
                pA += 1;
                pB += 2;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum0;
                    outptr0[1] = sum1;
                    outptr0 += 2;
                }
            }
            else
            {
                outptr[0] = sum0;
                outptr[1] = sum1;
            }

            outptr += 2;
        }
        for (; jj < max_jj; jj += 1)
        {
            int sum;

            if (k == 0)
            {
                sum = 0;
            }
            else
            {
                sum = outptr[0];
            }

            const signed char* pA = pAT;
            int kk = 0;
#if __ARM_NEON
            int32x4_t _sum = vdupq_n_s32(0);
            for (; kk + 15 < max_kk; kk += 16)
            {
                int8x16_t _pA = vld1q_s8(pA);
                int8x16_t _pB = vld1q_s8(pB);

#if __ARM_FEATURE_DOTPROD
                _sum = vdotq_s32(_sum, _pA, _pB);
#else  // __ARM_FEATURE_DOTPROD
                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA), vget_low_s8(_pB));
                _s0 = vmlal_s8(_s0, vget_high_s8(_pA), vget_high_s8(_pB));
                _sum = vpadalq_s16(_sum, _s0);
#endif // __ARM_FEATURE_DOTPROD

                pA += 16;
                pB += 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum = vpadalq_s16(_sum, _s0);

                pA += 8;
                pB += 8;
            }
#if __aarch64__
            sum += vaddvq_s32(_sum);
#else
            int32x2_t _ss = vadd_s32(vget_low_s32(_sum), vget_high_s32(_sum));
            _ss = vpadd_s32(_ss, _ss);
            sum += vget_lane_s32(_ss, 0);
#endif
#endif // __ARM_NEON
            for (; kk < max_kk; kk += 1)
            {
                sum += pA[0] * pB[0];
                pA += 1;
                pB += 1;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum;
                    outptr0++;
                }
            }
            else
            {
                outptr[0] = sum;
            }

            outptr += 1;
        }

        pAT += max_kk;
    }
}

// Choose the GEMM tile sizes (TILE_M, TILE_N, TILE_K) for the int8 im2col
// convolution from the L2 cache size and the thread count.
//
//   M, N, K   problem extents of the GEMM (presumably M = output channels,
//             N = output pixels, K = input channels * kernel size -- confirm
//             against the callers)
//   TILE_M    [out] always written; multiple of 8 with NEON, of 2 otherwise
//   TILE_N    [out] written only when N > 0; multiple of 4 with NEON
//   TILE_K    [out] always written; multiple of 8 with NEON, of 2 otherwise
//   nT        number of threads; 0 selects get_physical_big_cpu_count()
static void convolution_im2col_gemm_get_optimal_tile_mnk_int8(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
    // resolve optimal tile size from cache size
    //
    // The budget is deliberately a signed int. The subtractions below can
    // legitimately go negative when TILE_M * TILE_K alone already exceeds the
    // cache; with an unsigned type they would wrap around to a huge value and,
    // after truncation to int, produce an arbitrary TILE_N. With a signed type
    // the result is negative and the std::max() clamps select the minimum tile.
    const int l2_cache_size_int8 = (int)(get_cpu_level2_cache_size() / sizeof(signed char));

    if (nT == 0)
        nT = get_physical_big_cpu_count();

    // solve K
    {
        // try not to split K
#if __ARM_NEON
        int tile_size = (l2_cache_size_int8 - 16) / 8;
#else
        int tile_size = (l2_cache_size_int8 - 2) / 3;
#endif

        // round down to the packing granularity, but never below one unit
#if __ARM_NEON
        TILE_K = std::max(8, tile_size / 8 * 8);
#else
        TILE_K = std::max(2, tile_size / 2 * 2);
#endif

        // spread K evenly over the resulting number of K tiles
        int nn_K = (K + TILE_K - 1) / TILE_K;
#if __ARM_NEON
        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 7) / 8 * 8);
#else
        TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2);
#endif
    }

    // solve M
    {
        // start from blocks of at most 32 rows (NEON) or 8 rows (scalar)
#if __ARM_NEON
        int nn_M = (M + 31) / 32;
#else
        int nn_M = (M + 7) / 8;
#endif

#if __ARM_NEON
        TILE_M = std::max(8, ((M + nn_M - 1) / nn_M + 7) / 8 * 8);
#else
        TILE_M = std::max(2, ((M + nn_M - 1) / nn_M + 1) / 2 * 2);
#endif
    }

    {
        // scale by the number of usable cores, then rebalance over M
        TILE_M *= std::min(nT, get_physical_cpu_count());

        int nn_M = (M + TILE_M - 1) / TILE_M;
#if __ARM_NEON
        TILE_M = std::min(TILE_M, ((M + nn_M - 1) / nn_M + 7) / 8 * 8);
#else
        TILE_M = std::min(TILE_M, ((M + nn_M - 1) / nn_M + 1) / 2 * 2);
#endif

        if (nT > 1)
        {
            // shrink the tile to a per-thread share, rounded up to the packing unit
#if __ARM_NEON
            TILE_M = std::min(TILE_M, (std::max(1, TILE_M / nT) + 7) / 8 * 8);
#else
            TILE_M = std::min(TILE_M, (std::max(1, TILE_M / nT) + 1) / 2 * 2);
#endif
        }
    }

    // solve N (skipped when N <= 0, TILE_N is left untouched in that case)
    if (N > 0)
    {
        // remaining budget after the TILE_M x TILE_K block; may be negative
        int tile_size;
        if (TILE_K >= K)
        {
            // K is not split
            tile_size = (l2_cache_size_int8 - TILE_M * TILE_K) / TILE_K;
        }
        else
        {
            // K is split: each column additionally costs TILE_M * 4 bytes
            tile_size = (l2_cache_size_int8 - TILE_M * TILE_K) / (TILE_M * 4 + TILE_K);
        }

        // a negative or tiny budget falls back to the minimum tile width
#if __ARM_NEON
        TILE_N = std::max(4, tile_size / 4 * 4);
#else
        TILE_N = std::max(1, tile_size);
#endif

        // spread N evenly over the resulting number of N tiles
        int nn_N = (N + TILE_N - 1) / TILE_N;
#if __ARM_NEON
        TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 3) / 4 * 4);
#else
        TILE_N = std::min(TILE_N, (N + nn_N - 1) / nn_N);
#endif
    }
}

// Pack one tile of the int8 input blob into B for the 1x1 / stride 1 /
// dilation 1 convolution, where no kernel-window gathering is needed.
//
//   bottom_blob  source blob; only elempack 1 and elempack 8 are handled
//                (elempack 8 only when __ARM_NEON is defined), any other
//                elempack writes nothing
//   B            destination buffer, filled sequentially from its start
//   j, max_jj    first column (pixel index inside a channel) and column count
//   k, max_kk    first input channel and channel count; the elempack 8 paths
//                read channel(k / 8) and process max_kk / 8 packed groups
//
// Columns are consumed in groups of 8 (aarch64 only), 4 (NEON only), 2 and 1.
// Inside a group the k values of one column are stored next to each other in
// runs of 8 (__ARM_FEATURE_MATMUL_INT8), 4 (__ARM_FEATURE_DOTPROD) or 2
// (plain NEON); leftover k values are stored one row at a time.
static void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk)
{
    const int elempack = bottom_blob.elempack;

    // write cursor into B, advanced by every branch below
    signed char* pp = B;

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    // 8 columns per iteration
    for (; jj + 7 < max_jj; jj += 8)
    {
        if (elempack == 8)
        {
            // each column holds 8 consecutive channel bytes; cstep is in bytes per packed channel group
            const signed char* p0 = (const signed char*)bottom_blob.channel(k / 8) + (j + jj) * 8;
            const size_t cstep = bottom_blob.cstep * 8;

            int kk = 0;
#if __ARM_FEATURE_MATMUL_INT8
            // source layout already matches: plain 64-byte copy per channel group
            for (; kk < max_kk / 8; kk++)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], %4 \n"
                    "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp),
                    "r"(cstep)
                    : "memory", "v0", "v1", "v2", "v3");
#else  // NCNN_GNU_INLINE_ASM
                int8x16_t _r01 = vld1q_s8(p0);
                int8x16_t _r23 = vld1q_s8(p0 + 16);
                int8x16_t _r45 = vld1q_s8(p0 + 32);
                int8x16_t _r67 = vld1q_s8(p0 + 48);
                vst1q_s8(pp, _r01);
                vst1q_s8(pp + 16, _r23);
                vst1q_s8(pp + 32, _r45);
                vst1q_s8(pp + 48, _r67);
                pp += 64;
                p0 += cstep;
#endif // NCNN_GNU_INLINE_ASM
            }
#elif __ARM_FEATURE_DOTPROD
            // split each column into 4-byte halves: the first halves of all 8 columns
            // are stored first (32 bytes), then the second halves
            for (; kk < max_kk / 8; kk++)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], %4 \n"
                    "uzp1   v4.4s, v0.4s, v1.4s         \n"
                    "uzp2   v6.4s, v0.4s, v1.4s         \n"
                    "uzp1   v5.4s, v2.4s, v3.4s         \n"
                    "uzp2   v7.4s, v2.4s, v3.4s         \n"
                    "st1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp),
                    "r"(cstep)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else  // NCNN_GNU_INLINE_ASM
                int32x4x2_t _r0246 = vld2q_s32((const int*)p0);
                int32x4x2_t _r1357 = vld2q_s32((const int*)(p0 + 32));
                vst1q_s32((int*)pp, _r0246.val[0]);
                vst1q_s32((int*)(pp + 16), _r1357.val[0]);
                vst1q_s32((int*)(pp + 32), _r0246.val[1]);
                vst1q_s32((int*)(pp + 48), _r1357.val[1]);
                pp += 64;
                p0 += cstep;
#endif // NCNN_GNU_INLINE_ASM
            }
#else  // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
            // de-interleave by 2-byte pairs: pair 0 of all 8 columns, then pair 1, pair 2, pair 3
            for (; kk < max_kk / 8; kk++)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]       \n"
                    "ld4    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], %4 \n"
                    "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp),
                    "r"(cstep)
                    : "memory", "v0", "v1", "v2", "v3");
#else  // NCNN_GNU_INLINE_ASM
                int16x8x4_t _r0 = vld4q_s16((const short*)p0);
                vst1q_s16((short*)pp, _r0.val[0]);
                vst1q_s16((short*)(pp + 16), _r0.val[1]);
                vst1q_s16((short*)(pp + 32), _r0.val[2]);
                vst1q_s16((short*)(pp + 48), _r0.val[3]);
                pp += 64;
                p0 += cstep;
#endif // NCNN_GNU_INLINE_ASM
            }
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
        }

        if (elempack == 1)
        {
            // one byte per column per channel; consecutive channels are cstep bytes apart
            const signed char* p0 = (const signed char*)bottom_blob.channel(k) + (j + jj);
            const size_t cstep = bottom_blob.cstep;

            int kk = 0;
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            // 8 channels x 8 columns: emit 8 channel bytes per column (8x8 byte transpose)
            for (; kk + 7 < max_kk; kk += 8)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "prfm   pldl1keep, [%0, #64]        \n"
                    "ld1    {v0.8b}, [%0], %4           \n"
                    "prfm   pldl1keep, [%0, #64]        \n"
                    "ld1    {v1.8b}, [%0], %4           \n"
                    "prfm   pldl1keep, [%0, #64]        \n"
                    "ld1    {v0.d}[1], [%0], %4         \n"
                    "prfm   pldl1keep, [%0, #64]        \n"
                    "ld1    {v1.d}[1], [%0], %4         \n"
                    "prfm   pldl1keep, [%0, #64]        \n"
                    "ld1    {v2.8b}, [%0], %4           \n"
                    "prfm   pldl1keep, [%0, #64]        \n"
                    "ld1    {v3.8b}, [%0], %4           \n"
                    "prfm   pldl1keep, [%0, #64]        \n"
                    "ld1    {v2.d}[1], [%0], %4         \n"
                    "prfm   pldl1keep, [%0, #64]        \n"
                    "ld1    {v3.d}[1], [%0], %4         \n"
                    "zip1   v4.16b, v0.16b, v1.16b      \n"
                    "zip2   v5.16b, v0.16b, v1.16b      \n"
                    "zip1   v6.16b, v2.16b, v3.16b      \n"
                    "zip2   v7.16b, v2.16b, v3.16b      \n"
                    "st4    {v4.8h, v5.8h, v6.8h, v7.8h}, [%1], #64 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp),
                    "r"(cstep)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else  // NCNN_GNU_INLINE_ASM
                int8x8_t _r0 = vld1_s8(p0);
                int8x8_t _r1 = vld1_s8(p0 + cstep);
                int8x8_t _r2 = vld1_s8(p0 + cstep * 2);
                int8x8_t _r3 = vld1_s8(p0 + cstep * 3);
                int8x8_t _r4 = vld1_s8(p0 + cstep * 4);
                int8x8_t _r5 = vld1_s8(p0 + cstep * 5);
                int8x8_t _r6 = vld1_s8(p0 + cstep * 6);
                int8x8_t _r7 = vld1_s8(p0 + cstep * 7);
                // save as transpose8x8
                int8x8x2_t _r01 = vzip_s8(_r0, _r1);
                int8x8x2_t _r23 = vzip_s8(_r2, _r3);
                int8x8x2_t _r45 = vzip_s8(_r4, _r5);
                int8x8x2_t _r67 = vzip_s8(_r6, _r7);
                int16x8x4_t _r0246;
                _r0246.val[0] = vreinterpretq_s16_s8(vcombine_s8(_r01.val[0], _r01.val[1]));
                _r0246.val[1] = vreinterpretq_s16_s8(vcombine_s8(_r23.val[0], _r23.val[1]));
                _r0246.val[2] = vreinterpretq_s16_s8(vcombine_s8(_r45.val[0], _r45.val[1]));
                _r0246.val[3] = vreinterpretq_s16_s8(vcombine_s8(_r67.val[0], _r67.val[1]));
                vst4q_s16((short*)pp, _r0246);
                pp += 64;
                p0 += cstep * 8;
#endif // NCNN_GNU_INLINE_ASM
            }
#endif // __ARM_FEATURE_MATMUL_INT8
            // 4 channels x 8 columns: emit 4 channel bytes per column
            for (; kk + 3 < max_kk; kk += 4)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "prfm   pldl1keep, [%0, #64]        \n"
                    "ld1    {v0.8b}, [%0], %4           \n"
                    "prfm   pldl1keep, [%0, #64]        \n"
                    "ld1    {v1.8b}, [%0], %4           \n"
                    "prfm   pldl1keep, [%0, #64]        \n"
                    "ld1    {v2.8b}, [%0], %4           \n"
                    "prfm   pldl1keep, [%0, #64]        \n"
                    "ld1    {v3.8b}, [%0], %4           \n"
                    "st4    {v0.8b, v1.8b, v2.8b, v3.8b}, [%1], #32 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp),
                    "r"(cstep)
                    : "memory", "v0", "v1", "v2", "v3");
#else  // NCNN_GNU_INLINE_ASM
                int8x8x4_t _r0123;
                _r0123.val[0] = vld1_s8(p0);
                _r0123.val[1] = vld1_s8(p0 + cstep);
                _r0123.val[2] = vld1_s8(p0 + cstep * 2);
                _r0123.val[3] = vld1_s8(p0 + cstep * 3);
                vst4_s8(pp, _r0123);
                pp += 32;
                p0 += cstep * 4;
#endif // NCNN_GNU_INLINE_ASM
            }
#endif // __ARM_FEATURE_DOTPROD
            // 2 channels x 8 columns: emit 2 channel bytes per column
            for (; kk + 1 < max_kk; kk += 2)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "prfm   pldl1keep, [%0, #64]        \n"
                    "ld1    {v0.8b}, [%0], %4           \n"
                    "prfm   pldl1keep, [%0, #64]        \n"
                    "ld1    {v1.8b}, [%0], %4           \n"
                    "st2    {v0.8b, v1.8b}, [%1], #16   \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp),
                    "r"(cstep)
                    : "memory", "v0", "v1");
#else  // NCNN_GNU_INLINE_ASM
                int8x8x2_t _r01;
                _r01.val[0] = vld1_s8(p0);
                _r01.val[1] = vld1_s8(p0 + cstep);
                vst2_s8(pp, _r01);
                pp += 16;
                p0 += cstep * 2;
#endif // NCNN_GNU_INLINE_ASM
            }
            // last channel: copy the 8 column bytes as they are
            for (; kk < max_kk; kk++)
            {
                vst1_s8(pp, vld1_s8(p0));
                pp += 8;
                p0 += cstep;
            }
        }
    }
#endif // __aarch64__
    // 4 columns per iteration
    for (; jj + 3 < max_jj; jj += 4)
    {
        if (elempack == 8)
        {
            const signed char* p0 = (const signed char*)bottom_blob.channel(k / 8) + (j + jj) * 8;
            const size_t cstep = bottom_blob.cstep * 8;

            int kk = 0;
#if __ARM_FEATURE_MATMUL_INT8
            // plain 32-byte copy per channel group
            for (; kk < max_kk / 8; kk++)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "prfm   pldl1keep, [%0, #256]       \n"
                    "ld1    {v0.16b, v1.16b}, [%0], %4  \n"
                    "st1    {v0.16b, v1.16b}, [%1], #32 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp),
                    "r"(cstep)
                    : "memory", "v0", "v1");
#else  // NCNN_GNU_INLINE_ASM
                int8x16_t _r01 = vld1q_s8(p0);
                int8x16_t _r23 = vld1q_s8(p0 + 16);
                vst1q_s8(pp, _r01);
                vst1q_s8(pp + 16, _r23);
                pp += 32;
                p0 += cstep;
#endif // NCNN_GNU_INLINE_ASM
            }
#elif __ARM_FEATURE_DOTPROD
            // interleave the 4 columns in 4-byte units
            for (; kk < max_kk / 8; kk++)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "prfm   pldl1keep, [%0, #256]       \n"
                    "ld1    {v0.8b, v1.8b, v2.8b, v3.8b}, [%0], %4 \n"
                    "st4    {v0.2s, v1.2s, v2.2s, v3.2s}, [%1], #32 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp),
                    "r"(cstep)
                    : "memory", "v0", "v1", "v2", "v3");
#else  // NCNN_GNU_INLINE_ASM
                int32x2x4_t _r0123;
                _r0123.val[0] = vreinterpret_s32_s8(vld1_s8(p0));
                _r0123.val[1] = vreinterpret_s32_s8(vld1_s8(p0 + 8));
                _r0123.val[2] = vreinterpret_s32_s8(vld1_s8(p0 + 16));
                _r0123.val[3] = vreinterpret_s32_s8(vld1_s8(p0 + 24));
                vst4_s32((int*)pp, _r0123);
                pp += 32;
                p0 += cstep;
#endif // NCNN_GNU_INLINE_ASM
            }
#else  // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
            // interleave the 4 columns in 2-byte units
            for (; kk < max_kk / 8; kk++)
            {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%0, #256]       \n"
                    "ld1    {v0.8b, v1.8b, v2.8b, v3.8b}, [%0], %4 \n"
                    "st4    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp),
                    "r"(cstep)
                    : "memory", "v0", "v1", "v2", "v3");
#else
                asm volatile(
                    "pld        [%0, #256]          \n"
                    "vld1.s8    {d0-d3}, [%0], %4   \n"
                    "vst4.s16   {d0-d3}, [%1 :64]!  \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp),
                    "r"(cstep)
                    : "memory", "q0", "q1");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                int16x4x4_t _r0123;
                _r0123.val[0] = vreinterpret_s16_s8(vld1_s8(p0));
                _r0123.val[1] = vreinterpret_s16_s8(vld1_s8(p0 + 8));
                _r0123.val[2] = vreinterpret_s16_s8(vld1_s8(p0 + 16));
                _r0123.val[3] = vreinterpret_s16_s8(vld1_s8(p0 + 24));
                vst4_s16((short*)pp, _r0123);
                pp += 32;
                p0 += cstep;
#endif // NCNN_GNU_INLINE_ASM
            }
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
        }

        if (elempack == 1)
        {
            const signed char* p0 = (const signed char*)bottom_blob.channel(k) + (j + jj);
            const size_t cstep = bottom_blob.cstep;

            int kk = 0;
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            // 8 channels x 4 columns: 8 channel bytes of column 0, then column 1, 2, 3
            for (; kk + 7 < max_kk; kk += 8)
            {
                pp[0] = p0[0];
                pp[1] = p0[cstep + 0];
                pp[2] = p0[cstep * 2 + 0];
                pp[3] = p0[cstep * 3 + 0];
                pp[4] = p0[cstep * 4 + 0];
                pp[5] = p0[cstep * 5 + 0];
                pp[6] = p0[cstep * 6 + 0];
                pp[7] = p0[cstep * 7 + 0];
                pp[8] = p0[1];
                pp[9] = p0[cstep + 1];
                pp[10] = p0[cstep * 2 + 1];
                pp[11] = p0[cstep * 3 + 1];
                pp[12] = p0[cstep * 4 + 1];
                pp[13] = p0[cstep * 5 + 1];
                pp[14] = p0[cstep * 6 + 1];
                pp[15] = p0[cstep * 7 + 1];
                pp[16] = p0[2];
                pp[17] = p0[cstep + 2];
                pp[18] = p0[cstep * 2 + 2];
                pp[19] = p0[cstep * 3 + 2];
                pp[20] = p0[cstep * 4 + 2];
                pp[21] = p0[cstep * 5 + 2];
                pp[22] = p0[cstep * 6 + 2];
                pp[23] = p0[cstep * 7 + 2];
                pp[24] = p0[3];
                pp[25] = p0[cstep + 3];
                pp[26] = p0[cstep * 2 + 3];
                pp[27] = p0[cstep * 3 + 3];
                pp[28] = p0[cstep * 4 + 3];
                pp[29] = p0[cstep * 5 + 3];
                pp[30] = p0[cstep * 6 + 3];
                pp[31] = p0[cstep * 7 + 3];
                pp += 32;
                p0 += cstep * 8;
            }
#endif // __ARM_FEATURE_MATMUL_INT8
            // 4 channels x 4 columns: 4 channel bytes per column
            for (; kk + 3 < max_kk; kk += 4)
            {
                pp[0] = p0[0];
                pp[1] = p0[cstep + 0];
                pp[2] = p0[cstep * 2 + 0];
                pp[3] = p0[cstep * 3 + 0];
                pp[4] = p0[1];
                pp[5] = p0[cstep + 1];
                pp[6] = p0[cstep * 2 + 1];
                pp[7] = p0[cstep * 3 + 1];
                pp[8] = p0[2];
                pp[9] = p0[cstep + 2];
                pp[10] = p0[cstep * 2 + 2];
                pp[11] = p0[cstep * 3 + 2];
                pp[12] = p0[3];
                pp[13] = p0[cstep + 3];
                pp[14] = p0[cstep * 2 + 3];
                pp[15] = p0[cstep * 3 + 3];
                pp += 16;
                p0 += cstep * 4;
            }
#endif // __ARM_FEATURE_DOTPROD
            // 2 channels x 4 columns: 2 channel bytes per column
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = p0[0];
                pp[1] = p0[cstep + 0];
                pp[2] = p0[1];
                pp[3] = p0[cstep + 1];
                pp[4] = p0[2];
                pp[5] = p0[cstep + 2];
                pp[6] = p0[3];
                pp[7] = p0[cstep + 3];
                pp += 8;
                p0 += cstep * 2;
            }
            // last channel: the 4 column bytes in order
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p0[1];
                pp[2] = p0[2];
                pp[3] = p0[3];
                pp += 4;
                p0 += cstep;
            }
        }
    }
#endif // __ARM_NEON
    // 2 columns per iteration
    for (; jj + 1 < max_jj; jj += 2)
    {
#if __ARM_NEON
        if (elempack == 8)
        {
            const signed char* p0 = (const signed char*)bottom_blob.channel(k / 8) + (j + jj) * 8;
            const size_t cstep = bottom_blob.cstep * 8;

            int kk = 0;
#if __ARM_FEATURE_MATMUL_INT8
            // plain 16-byte copy per channel group
            for (; kk < max_kk / 8; kk++)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "prfm   pldl1keep, [%0, #128]       \n"
                    "ld1    {v0.16b}, [%0], %4          \n"
                    "st1    {v0.16b}, [%1], #16         \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp),
                    "r"(cstep)
                    : "memory", "v0");
#else  // NCNN_GNU_INLINE_ASM
                vst1q_s8(pp, vld1q_s8(p0));
                pp += 16;
                p0 += cstep;
#endif // NCNN_GNU_INLINE_ASM
            }
#elif __ARM_FEATURE_DOTPROD
            // interleave the 2 columns in 4-byte units
            for (; kk < max_kk / 8; kk++)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "prfm   pldl1keep, [%0, #128]       \n"
                    "ld1    {v0.8b, v1.8b}, [%0], %4    \n"
                    "st2    {v0.2s, v1.2s}, [%1], #16   \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp),
                    "r"(cstep)
                    : "memory", "v0", "v1");
#else  // NCNN_GNU_INLINE_ASM
                int32x2x2_t _r01;
                _r01.val[0] = vreinterpret_s32_s8(vld1_s8(p0));
                _r01.val[1] = vreinterpret_s32_s8(vld1_s8(p0 + 8));
                vst2_s32((int*)pp, _r01);
                pp += 16;
                p0 += cstep;
#endif // NCNN_GNU_INLINE_ASM
            }
#else  // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
            // interleave the 2 columns in 2-byte units
            for (; kk < max_kk / 8; kk++)
            {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%0, #128]       \n"
                    "ld1    {v0.8b, v1.8b}, [%0], %4    \n"
                    "st2    {v0.4h, v1.4h}, [%1], #16   \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp),
                    "r"(cstep)
                    : "memory", "v0", "v1");
#else
                asm volatile(
                    "pld        [%0, #128]          \n"
                    "vld1.s8    {d0-d1}, [%0], %4   \n"
                    "vst2.s16   {d0-d1}, [%1 :64]!  \n"
                    : "=r"(p0), // %0
                    "=r"(pp)  // %1
                    : "0"(p0),
                    "1"(pp),
                    "r"(cstep)
                    : "memory", "q0");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                int16x4x2_t _r01;
                _r01.val[0] = vreinterpret_s16_s8(vld1_s8(p0));
                _r01.val[1] = vreinterpret_s16_s8(vld1_s8(p0 + 8));
                vst2_s16((short*)pp, _r01);
                pp += 16;
                p0 += cstep;
#endif // NCNN_GNU_INLINE_ASM
            }
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            const signed char* p0 = (const signed char*)bottom_blob.channel(k) + (j + jj);
            const size_t cstep = bottom_blob.cstep;

            int kk = 0;
#if __ARM_NEON
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            // 8 channels x 2 columns: 8 channel bytes of column 0, then of column 1
            for (; kk + 7 < max_kk; kk += 8)
            {
                pp[0] = p0[0];
                pp[1] = p0[cstep];
                pp[2] = p0[cstep * 2];
                pp[3] = p0[cstep * 3];
                pp[4] = p0[cstep * 4];
                pp[5] = p0[cstep * 5];
                pp[6] = p0[cstep * 6];
                pp[7] = p0[cstep * 7];
                pp[8] = p0[1];
                pp[9] = p0[cstep + 1];
                pp[10] = p0[cstep * 2 + 1];
                pp[11] = p0[cstep * 3 + 1];
                pp[12] = p0[cstep * 4 + 1];
                pp[13] = p0[cstep * 5 + 1];
                pp[14] = p0[cstep * 6 + 1];
                pp[15] = p0[cstep * 7 + 1];
                pp += 16;
                p0 += cstep * 8;
            }
#endif // __ARM_FEATURE_MATMUL_INT8
            // 4 channels x 2 columns
            for (; kk + 3 < max_kk; kk += 4)
            {
                pp[0] = p0[0];
                pp[1] = p0[cstep];
                pp[2] = p0[cstep * 2];
                pp[3] = p0[cstep * 3];
                pp[4] = p0[1];
                pp[5] = p0[cstep + 1];
                pp[6] = p0[cstep * 2 + 1];
                pp[7] = p0[cstep * 3 + 1];
                pp += 8;
                p0 += cstep * 4;
            }
#endif // __ARM_FEATURE_DOTPROD
            // 2 channels x 2 columns
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = p0[0];
                pp[1] = p0[cstep];
                pp[2] = p0[1];
                pp[3] = p0[cstep + 1];
                pp += 4;
                p0 += cstep * 2;
            }
#endif // __ARM_NEON
            // one channel at a time: the 2 column bytes in order
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p0[1];
                pp += 2;
                p0 += cstep;
            }
        }
    }
    // remaining single columns
    for (; jj < max_jj; jj++)
    {
#if __ARM_NEON
        if (elempack == 8)
        {
            const signed char* p0 = (const signed char*)bottom_blob.channel(k / 8) + (j + jj) * 8;
            const size_t cstep = bottom_blob.cstep * 8;

            // copy the 8 packed channel bytes of this column for every channel group
            int kk = 0;
            for (; kk < max_kk / 8; kk++)
            {
                vst1_s8(pp, vld1_s8(p0));
                pp += 8;
                p0 += cstep;
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            const signed char* p0 = (const signed char*)bottom_blob.channel(k) + (j + jj);
            const size_t cstep = bottom_blob.cstep;

            // gather this column's byte from each of the max_kk channels
            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp += 1;
                p0 += cstep;
            }
        }
    }
}

static inline void convolution_im2col_input_tile_int8_impl(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h)
{
    const int w = bottom_blob.w;
    // const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int outw = (w - kernel_extent_w) / stride_w + 1;

    // j max_jj     outw*outh    split w and h

    // k max_kk     pa*maxk*(inch/pa)    split inch

    // k/max_kk shall be multiple of maxk

    const int maxk = kernel_w * kernel_h;

    signed char* pp = B;

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    for (; jj + 7 < max_jj; jj += 8)
    {
        int dy0 = (j + jj) / outw * stride_h;
        int dy1 = (j + jj + 1) / outw * stride_h;
        int dy2 = (j + jj + 2) / outw * stride_h;
        int dy3 = (j + jj + 3) / outw * stride_h;
        int dy4 = (j + jj + 4) / outw * stride_h;
        int dy5 = (j + jj + 5) / outw * stride_h;
        int dy6 = (j + jj + 6) / outw * stride_h;
        int dy7 = (j + jj + 7) / outw * stride_h;
        int dx0 = (j + jj) % outw * stride_w;
        int dx1 = (j + jj + 1) % outw * stride_w;
        int dx2 = (j + jj + 2) % outw * stride_w;
        int dx3 = (j + jj + 3) % outw * stride_w;
        int dx4 = (j + jj + 4) % outw * stride_w;
        int dx5 = (j + jj + 5) % outw * stride_w;
        int dx6 = (j + jj + 6) % outw * stride_w;
        int dx7 = (j + jj + 7) % outw * stride_w;

        if (dy0 == dy7)
        {
            int kk = 0;
            if (elempack == 1)
            {
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int p2 = (k + kk + 2) / maxk;
                    int p3 = (k + kk + 3) / maxk;
                    int p4 = (k + kk + 4) / maxk;
                    int p5 = (k + kk + 5) / maxk;
                    int p6 = (k + kk + 6) / maxk;
                    int p7 = (k + kk + 7) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int uv2 = (k + kk + 2) % maxk;
                    int uv3 = (k + kk + 3) % maxk;
                    int uv4 = (k + kk + 4) % maxk;
                    int uv5 = (k + kk + 5) % maxk;
                    int uv6 = (k + kk + 6) % maxk;
                    int uv7 = (k + kk + 7) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int u2 = uv2 / kernel_w;
                    int u3 = uv3 / kernel_w;
                    int u4 = uv4 / kernel_w;
                    int u5 = uv5 / kernel_w;
                    int u6 = uv6 / kernel_w;
                    int u7 = uv7 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;
                    int v2 = uv2 % kernel_w;
                    int v3 = uv3 % kernel_w;
                    int v4 = uv4 % kernel_w;
                    int v5 = uv5 % kernel_w;
                    int v6 = uv6 % kernel_w;
                    int v7 = uv7 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);
                    const Mat img2 = bottom_blob.channel(p2);
                    const Mat img3 = bottom_blob.channel(p3);
                    const Mat img4 = bottom_blob.channel(p4);
                    const Mat img5 = bottom_blob.channel(p5);
                    const Mat img6 = bottom_blob.channel(p6);
                    const Mat img7 = bottom_blob.channel(p7);

                    int x00 = dx0 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;

                    int x10 = dx0 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;

                    int x20 = dx0 + dilation_w * v2;
                    int y20 = dy0 + dilation_h * u2;

                    int x30 = dx0 + dilation_w * v3;
                    int y30 = dy0 + dilation_h * u3;

                    int x40 = dx0 + dilation_w * v4;
                    int y40 = dy0 + dilation_h * u4;

                    int x50 = dx0 + dilation_w * v5;
                    int y50 = dy0 + dilation_h * u5;

                    int x60 = dx0 + dilation_w * v6;
                    int y60 = dy0 + dilation_h * u6;

                    int x70 = dx0 + dilation_w * v7;
                    int y70 = dy0 + dilation_h * u7;

                    const signed char* sptr0 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr1 = img1.row<const signed char>(y10) + x10;
                    const signed char* sptr2 = img2.row<const signed char>(y20) + x20;
                    const signed char* sptr3 = img3.row<const signed char>(y30) + x30;
                    const signed char* sptr4 = img4.row<const signed char>(y40) + x40;
                    const signed char* sptr5 = img5.row<const signed char>(y50) + x50;
                    const signed char* sptr6 = img6.row<const signed char>(y60) + x60;
                    const signed char* sptr7 = img7.row<const signed char>(y70) + x70;

                    if (stride_w == 1)
                    {
                        int8x8_t _r0 = vld1_s8(sptr0);
                        int8x8_t _r1 = vld1_s8(sptr1);
                        int8x8_t _r2 = vld1_s8(sptr2);
                        int8x8_t _r3 = vld1_s8(sptr3);
                        int8x8_t _r4 = vld1_s8(sptr4);
                        int8x8_t _r5 = vld1_s8(sptr5);
                        int8x8_t _r6 = vld1_s8(sptr6);
                        int8x8_t _r7 = vld1_s8(sptr7);
                        // save as transpose8x8
                        int8x8x2_t _r01 = vzip_s8(_r0, _r1);
                        int8x8x2_t _r23 = vzip_s8(_r2, _r3);
                        int8x8x2_t _r45 = vzip_s8(_r4, _r5);
                        int8x8x2_t _r67 = vzip_s8(_r6, _r7);
                        int16x8x4_t _r0246;
                        _r0246.val[0] = vreinterpretq_s16_s8(vcombine_s8(_r01.val[0], _r01.val[1]));
                        _r0246.val[1] = vreinterpretq_s16_s8(vcombine_s8(_r23.val[0], _r23.val[1]));
                        _r0246.val[2] = vreinterpretq_s16_s8(vcombine_s8(_r45.val[0], _r45.val[1]));
                        _r0246.val[3] = vreinterpretq_s16_s8(vcombine_s8(_r67.val[0], _r67.val[1]));
                        vst4q_s16((short*)pp, _r0246);
                        pp += 64;
                    }
                    else if (stride_w == 2)
                    {
                        int8x16_t _r0 = vld1q_s8(sptr0);
                        int8x16_t _r1 = vld1q_s8(sptr1);
                        int8x16_t _r2 = vld1q_s8(sptr2);
                        int8x16_t _r3 = vld1q_s8(sptr3);
                        int8x16_t _r4 = vld1q_s8(sptr4);
                        int8x16_t _r5 = vld1q_s8(sptr5);
                        int8x16_t _r6 = vld1q_s8(sptr6);
                        int8x16_t _r7 = vld1q_s8(sptr7);
                        int8x16_t _r01 = vtrnq_s8(_r0, _r1).val[0];
                        int8x16_t _r23 = vtrnq_s8(_r2, _r3).val[0];
                        int8x16_t _r45 = vtrnq_s8(_r4, _r5).val[0];
                        int8x16_t _r67 = vtrnq_s8(_r6, _r7).val[0];
                        int16x8x4_t _r0123;
                        _r0123.val[0] = vreinterpretq_s16_s8(_r01);
                        _r0123.val[1] = vreinterpretq_s16_s8(_r23);
                        _r0123.val[2] = vreinterpretq_s16_s8(_r45);
                        _r0123.val[3] = vreinterpretq_s16_s8(_r67);
                        vst4q_s16((short*)pp, _r0123);
                        pp += 64;
                    }
                    else
                    {
                        pp[0] = sptr0[0];
                        pp[1] = sptr1[0];
                        pp[2] = sptr2[0];
                        pp[3] = sptr3[0];
                        pp[4] = sptr4[0];
                        pp[5] = sptr5[0];
                        pp[6] = sptr6[0];
                        pp[7] = sptr7[0];
                        pp[8] = sptr0[stride_w];
                        pp[9] = sptr1[stride_w];
                        pp[10] = sptr2[stride_w];
                        pp[11] = sptr3[stride_w];
                        pp[12] = sptr4[stride_w];
                        pp[13] = sptr5[stride_w];
                        pp[14] = sptr6[stride_w];
                        pp[15] = sptr7[stride_w];
                        pp[16] = sptr0[stride_w * 2];
                        pp[17] = sptr1[stride_w * 2];
                        pp[18] = sptr2[stride_w * 2];
                        pp[19] = sptr3[stride_w * 2];
                        pp[20] = sptr4[stride_w * 2];
                        pp[21] = sptr5[stride_w * 2];
                        pp[22] = sptr6[stride_w * 2];
                        pp[23] = sptr7[stride_w * 2];
                        pp[24] = sptr0[stride_w * 3];
                        pp[25] = sptr1[stride_w * 3];
                        pp[26] = sptr2[stride_w * 3];
                        pp[27] = sptr3[stride_w * 3];
                        pp[28] = sptr4[stride_w * 3];
                        pp[29] = sptr5[stride_w * 3];
                        pp[30] = sptr6[stride_w * 3];
                        pp[31] = sptr7[stride_w * 3];
                        pp[32] = sptr0[stride_w * 4];
                        pp[33] = sptr1[stride_w * 4];
                        pp[34] = sptr2[stride_w * 4];
                        pp[35] = sptr3[stride_w * 4];
                        pp[36] = sptr4[stride_w * 4];
                        pp[37] = sptr5[stride_w * 4];
                        pp[38] = sptr6[stride_w * 4];
                        pp[39] = sptr7[stride_w * 4];
                        pp[40] = sptr0[stride_w * 5];
                        pp[41] = sptr1[stride_w * 5];
                        pp[42] = sptr2[stride_w * 5];
                        pp[43] = sptr3[stride_w * 5];
                        pp[44] = sptr4[stride_w * 5];
                        pp[45] = sptr5[stride_w * 5];
                        pp[46] = sptr6[stride_w * 5];
                        pp[47] = sptr7[stride_w * 5];
                        pp[48] = sptr0[stride_w * 6];
                        pp[49] = sptr1[stride_w * 6];
                        pp[50] = sptr2[stride_w * 6];
                        pp[51] = sptr3[stride_w * 6];
                        pp[52] = sptr4[stride_w * 6];
                        pp[53] = sptr5[stride_w * 6];
                        pp[54] = sptr6[stride_w * 6];
                        pp[55] = sptr7[stride_w * 6];
                        pp[56] = sptr0[stride_w * 7];
                        pp[57] = sptr1[stride_w * 7];
                        pp[58] = sptr2[stride_w * 7];
                        pp[59] = sptr3[stride_w * 7];
                        pp[60] = sptr4[stride_w * 7];
                        pp[61] = sptr5[stride_w * 7];
                        pp[62] = sptr6[stride_w * 7];
                        pp[63] = sptr7[stride_w * 7];
                        pp += 64;
                    }
                }
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 3 < max_kk; kk += 4)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int p2 = (k + kk + 2) / maxk;
                    int p3 = (k + kk + 3) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int uv2 = (k + kk + 2) % maxk;
                    int uv3 = (k + kk + 3) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int u2 = uv2 / kernel_w;
                    int u3 = uv3 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;
                    int v2 = uv2 % kernel_w;
                    int v3 = uv3 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);
                    const Mat img2 = bottom_blob.channel(p2);
                    const Mat img3 = bottom_blob.channel(p3);

                    int x00 = dx0 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;

                    int x10 = dx0 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;

                    int x20 = dx0 + dilation_w * v2;
                    int y20 = dy0 + dilation_h * u2;

                    int x30 = dx0 + dilation_w * v3;
                    int y30 = dy0 + dilation_h * u3;

                    const signed char* sptr0 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr1 = img1.row<const signed char>(y10) + x10;
                    const signed char* sptr2 = img2.row<const signed char>(y20) + x20;
                    const signed char* sptr3 = img3.row<const signed char>(y30) + x30;

                    if (stride_w == 1)
                    {
                        int8x8x4_t _r01;
                        _r01.val[0] = vld1_s8(sptr0);
                        _r01.val[1] = vld1_s8(sptr1);
                        _r01.val[2] = vld1_s8(sptr2);
                        _r01.val[3] = vld1_s8(sptr3);
                        vst4_s8(pp, _r01);
                        pp += 32;
                    }
                    else if (stride_w == 2)
                    {
                        int8x16_t _r0 = vld1q_s8(sptr0);
                        int8x16_t _r1 = vld1q_s8(sptr1);
                        int8x16_t _r2 = vld1q_s8(sptr2);
                        int8x16_t _r3 = vld1q_s8(sptr3);
                        int8x16_t _r01 = vtrnq_s8(_r0, _r1).val[0];
                        int8x16_t _r23 = vtrnq_s8(_r2, _r3).val[0];
                        int16x8x2_t _r0123;
                        _r0123.val[0] = vreinterpretq_s16_s8(_r01);
                        _r0123.val[1] = vreinterpretq_s16_s8(_r23);
                        vst2q_s16((short*)pp, _r0123);
                        pp += 32;
                    }
                    else
                    {
                        pp[0] = sptr0[0];
                        pp[1] = sptr1[0];
                        pp[2] = sptr2[0];
                        pp[3] = sptr3[0];
                        pp[4] = sptr0[stride_w];
                        pp[5] = sptr1[stride_w];
                        pp[6] = sptr2[stride_w];
                        pp[7] = sptr3[stride_w];
                        pp[8] = sptr0[stride_w * 2];
                        pp[9] = sptr1[stride_w * 2];
                        pp[10] = sptr2[stride_w * 2];
                        pp[11] = sptr3[stride_w * 2];
                        pp[12] = sptr0[stride_w * 3];
                        pp[13] = sptr1[stride_w * 3];
                        pp[14] = sptr2[stride_w * 3];
                        pp[15] = sptr3[stride_w * 3];
                        pp[16] = sptr0[stride_w * 4];
                        pp[17] = sptr1[stride_w * 4];
                        pp[18] = sptr2[stride_w * 4];
                        pp[19] = sptr3[stride_w * 4];
                        pp[20] = sptr0[stride_w * 5];
                        pp[21] = sptr1[stride_w * 5];
                        pp[22] = sptr2[stride_w * 5];
                        pp[23] = sptr3[stride_w * 5];
                        pp[24] = sptr0[stride_w * 6];
                        pp[25] = sptr1[stride_w * 6];
                        pp[26] = sptr2[stride_w * 6];
                        pp[27] = sptr3[stride_w * 6];
                        pp[28] = sptr0[stride_w * 7];
                        pp[29] = sptr1[stride_w * 7];
                        pp[30] = sptr2[stride_w * 7];
                        pp[31] = sptr3[stride_w * 7];
                        pp += 32;
                    }
                }
#endif // __ARM_FEATURE_DOTPROD
                for (; kk + 1 < max_kk; kk += 2)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);

                    int x00 = dx0 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;

                    int x10 = dx0 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;

                    const signed char* sptr0 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr1 = img1.row<const signed char>(y10) + x10;

                    if (stride_w == 1)
                    {
                        int8x8x2_t _r01;
                        _r01.val[0] = vld1_s8(sptr0);
                        _r01.val[1] = vld1_s8(sptr1);
                        vst2_s8(pp, _r01);
                        pp += 16;
                    }
                    else if (stride_w == 2)
                    {
                        int8x16_t _r0 = vld1q_s8(sptr0);
                        int8x16_t _r1 = vld1q_s8(sptr1);
                        int8x16_t _r01 = vtrnq_s8(_r0, _r1).val[0];
                        vst1q_s8(pp, _r01);
                        pp += 16;
                    }
                    else
                    {
                        pp[0] = sptr0[0];
                        pp[1] = sptr1[0];
                        pp[2] = sptr0[stride_w];
                        pp[3] = sptr1[stride_w];
                        pp[4] = sptr0[stride_w * 2];
                        pp[5] = sptr1[stride_w * 2];
                        pp[6] = sptr0[stride_w * 3];
                        pp[7] = sptr1[stride_w * 3];
                        pp[8] = sptr0[stride_w * 4];
                        pp[9] = sptr1[stride_w * 4];
                        pp[10] = sptr0[stride_w * 5];
                        pp[11] = sptr1[stride_w * 5];
                        pp[12] = sptr0[stride_w * 6];
                        pp[13] = sptr1[stride_w * 6];
                        pp[14] = sptr0[stride_w * 7];
                        pp[15] = sptr1[stride_w * 7];
                        pp += 16;
                    }
                }
            }
            for (; kk < max_kk / elempack; kk++)
            {
                int p = (k / elempack + kk) / maxk;
                int uv = (k / elempack + kk) % maxk;
                int u = uv / kernel_w;
                int v = uv % kernel_w;

                const Mat img = bottom_blob.channel(p);

                int x0 = dx0 + dilation_w * v;
                int y0 = dy0 + dilation_h * u;

                const signed char* sptr = img.row<const signed char>(y0) + x0 * elempack;

                if (elempack == 8)
                {
#if __ARM_FEATURE_MATMUL_INT8
                    int8x8_t _r0 = vld1_s8(sptr);
                    int8x8_t _r1 = vld1_s8(sptr + stride_w * 8);
                    int8x8_t _r2 = vld1_s8(sptr + stride_w * 16);
                    int8x8_t _r3 = vld1_s8(sptr + stride_w * 24);
                    int8x8_t _r4 = vld1_s8(sptr + stride_w * 32);
                    int8x8_t _r5 = vld1_s8(sptr + stride_w * 40);
                    int8x8_t _r6 = vld1_s8(sptr + stride_w * 48);
                    int8x8_t _r7 = vld1_s8(sptr + stride_w * 56);
                    vst1_s8(pp, _r0);
                    vst1_s8(pp + 8, _r1);
                    vst1_s8(pp + 16, _r2);
                    vst1_s8(pp + 24, _r3);
                    vst1_s8(pp + 32, _r4);
                    vst1_s8(pp + 40, _r5);
                    vst1_s8(pp + 48, _r6);
                    vst1_s8(pp + 56, _r7);
                    pp += 64;
#elif __ARM_FEATURE_DOTPROD
                    int32x2_t _r0 = vreinterpret_s32_s8(vld1_s8(sptr));
                    int32x2_t _r1 = vreinterpret_s32_s8(vld1_s8(sptr + stride_w * 8));
                    int32x2_t _r2 = vreinterpret_s32_s8(vld1_s8(sptr + stride_w * 16));
                    int32x2_t _r3 = vreinterpret_s32_s8(vld1_s8(sptr + stride_w * 24));
                    int32x2_t _r4 = vreinterpret_s32_s8(vld1_s8(sptr + stride_w * 32));
                    int32x2_t _r5 = vreinterpret_s32_s8(vld1_s8(sptr + stride_w * 40));
                    int32x2_t _r6 = vreinterpret_s32_s8(vld1_s8(sptr + stride_w * 48));
                    int32x2_t _r7 = vreinterpret_s32_s8(vld1_s8(sptr + stride_w * 56));
                    int32x2x2_t _r01 = vzip_s32(_r0, _r1);
                    int32x2x2_t _r23 = vzip_s32(_r2, _r3);
                    int32x2x2_t _r45 = vzip_s32(_r4, _r5);
                    int32x2x2_t _r67 = vzip_s32(_r6, _r7);
                    vst1_s32((int*)pp, _r01.val[0]);
                    vst1_s32((int*)(pp + 8), _r23.val[0]);
                    vst1_s32((int*)(pp + 16), _r45.val[0]);
                    vst1_s32((int*)(pp + 24), _r67.val[0]);
                    vst1_s32((int*)(pp + 32), _r01.val[1]);
                    vst1_s32((int*)(pp + 40), _r23.val[1]);
                    vst1_s32((int*)(pp + 48), _r45.val[1]);
                    vst1_s32((int*)(pp + 56), _r67.val[1]);
                    pp += 64;
#else  // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
                    int16x4_t _r0 = vreinterpret_s16_s8(vld1_s8(sptr));
                    int16x4_t _r1 = vreinterpret_s16_s8(vld1_s8(sptr + stride_w * 8));
                    int16x4_t _r2 = vreinterpret_s16_s8(vld1_s8(sptr + stride_w * 16));
                    int16x4_t _r3 = vreinterpret_s16_s8(vld1_s8(sptr + stride_w * 24));
                    int16x4_t _r4 = vreinterpret_s16_s8(vld1_s8(sptr + stride_w * 32));
                    int16x4_t _r5 = vreinterpret_s16_s8(vld1_s8(sptr + stride_w * 40));
                    int16x4_t _r6 = vreinterpret_s16_s8(vld1_s8(sptr + stride_w * 48));
                    int16x4_t _r7 = vreinterpret_s16_s8(vld1_s8(sptr + stride_w * 56));
                    int16x4x2_t _r01 = vzip_s16(_r0, _r1);
                    int16x4x2_t _r23 = vzip_s16(_r2, _r3);
                    int16x4x2_t _r45 = vzip_s16(_r4, _r5);
                    int16x4x2_t _r67 = vzip_s16(_r6, _r7);
                    int32x4x4_t _r0123;
                    _r0123.val[0] = vreinterpretq_s32_s16(vcombine_s16(_r01.val[0], _r01.val[1]));
                    _r0123.val[1] = vreinterpretq_s32_s16(vcombine_s16(_r23.val[0], _r23.val[1]));
                    _r0123.val[2] = vreinterpretq_s32_s16(vcombine_s16(_r45.val[0], _r45.val[1]));
                    _r0123.val[3] = vreinterpretq_s32_s16(vcombine_s16(_r67.val[0], _r67.val[1]));
                    vst4q_s32((int*)pp, _r0123);
                    pp += 64;
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
                }
                if (elempack == 1)
                {
                    pp[0] = sptr[0];
                    pp[1] = sptr[stride_w];
                    pp[2] = sptr[stride_w * 2];
                    pp[3] = sptr[stride_w * 3];
                    pp[4] = sptr[stride_w * 4];
                    pp[5] = sptr[stride_w * 5];
                    pp[6] = sptr[stride_w * 6];
                    pp[7] = sptr[stride_w * 7];
                    pp += 8;
                }
            }
        }
        else
        {
            int kk = 0;
            if (elempack == 1)
            {
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int p2 = (k + kk + 2) / maxk;
                    int p3 = (k + kk + 3) / maxk;
                    int p4 = (k + kk + 4) / maxk;
                    int p5 = (k + kk + 5) / maxk;
                    int p6 = (k + kk + 6) / maxk;
                    int p7 = (k + kk + 7) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int uv2 = (k + kk + 2) % maxk;
                    int uv3 = (k + kk + 3) % maxk;
                    int uv4 = (k + kk + 4) % maxk;
                    int uv5 = (k + kk + 5) % maxk;
                    int uv6 = (k + kk + 6) % maxk;
                    int uv7 = (k + kk + 7) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int u2 = uv2 / kernel_w;
                    int u3 = uv3 / kernel_w;
                    int u4 = uv4 / kernel_w;
                    int u5 = uv5 / kernel_w;
                    int u6 = uv6 / kernel_w;
                    int u7 = uv7 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;
                    int v2 = uv2 % kernel_w;
                    int v3 = uv3 % kernel_w;
                    int v4 = uv4 % kernel_w;
                    int v5 = uv5 % kernel_w;
                    int v6 = uv6 % kernel_w;
                    int v7 = uv7 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);
                    const Mat img2 = bottom_blob.channel(p2);
                    const Mat img3 = bottom_blob.channel(p3);
                    const Mat img4 = bottom_blob.channel(p4);
                    const Mat img5 = bottom_blob.channel(p5);
                    const Mat img6 = bottom_blob.channel(p6);
                    const Mat img7 = bottom_blob.channel(p7);

                    int x00 = dx0 + dilation_w * v0;
                    int x01 = dx1 + dilation_w * v0;
                    int x02 = dx2 + dilation_w * v0;
                    int x03 = dx3 + dilation_w * v0;
                    int x04 = dx4 + dilation_w * v0;
                    int x05 = dx5 + dilation_w * v0;
                    int x06 = dx6 + dilation_w * v0;
                    int x07 = dx7 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;
                    int y01 = dy1 + dilation_h * u0;
                    int y02 = dy2 + dilation_h * u0;
                    int y03 = dy3 + dilation_h * u0;
                    int y04 = dy4 + dilation_h * u0;
                    int y05 = dy5 + dilation_h * u0;
                    int y06 = dy6 + dilation_h * u0;
                    int y07 = dy7 + dilation_h * u0;

                    int x10 = dx0 + dilation_w * v1;
                    int x11 = dx1 + dilation_w * v1;
                    int x12 = dx2 + dilation_w * v1;
                    int x13 = dx3 + dilation_w * v1;
                    int x14 = dx4 + dilation_w * v1;
                    int x15 = dx5 + dilation_w * v1;
                    int x16 = dx6 + dilation_w * v1;
                    int x17 = dx7 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;
                    int y11 = dy1 + dilation_h * u1;
                    int y12 = dy2 + dilation_h * u1;
                    int y13 = dy3 + dilation_h * u1;
                    int y14 = dy4 + dilation_h * u1;
                    int y15 = dy5 + dilation_h * u1;
                    int y16 = dy6 + dilation_h * u1;
                    int y17 = dy7 + dilation_h * u1;

                    int x20 = dx0 + dilation_w * v2;
                    int x21 = dx1 + dilation_w * v2;
                    int x22 = dx2 + dilation_w * v2;
                    int x23 = dx3 + dilation_w * v2;
                    int x24 = dx4 + dilation_w * v2;
                    int x25 = dx5 + dilation_w * v2;
                    int x26 = dx6 + dilation_w * v2;
                    int x27 = dx7 + dilation_w * v2;
                    int y20 = dy0 + dilation_h * u2;
                    int y21 = dy1 + dilation_h * u2;
                    int y22 = dy2 + dilation_h * u2;
                    int y23 = dy3 + dilation_h * u2;
                    int y24 = dy4 + dilation_h * u2;
                    int y25 = dy5 + dilation_h * u2;
                    int y26 = dy6 + dilation_h * u2;
                    int y27 = dy7 + dilation_h * u2;

                    int x30 = dx0 + dilation_w * v3;
                    int x31 = dx1 + dilation_w * v3;
                    int x32 = dx2 + dilation_w * v3;
                    int x33 = dx3 + dilation_w * v3;
                    int x34 = dx4 + dilation_w * v3;
                    int x35 = dx5 + dilation_w * v3;
                    int x36 = dx6 + dilation_w * v3;
                    int x37 = dx7 + dilation_w * v3;
                    int y30 = dy0 + dilation_h * u3;
                    int y31 = dy1 + dilation_h * u3;
                    int y32 = dy2 + dilation_h * u3;
                    int y33 = dy3 + dilation_h * u3;
                    int y34 = dy4 + dilation_h * u3;
                    int y35 = dy5 + dilation_h * u3;
                    int y36 = dy6 + dilation_h * u3;
                    int y37 = dy7 + dilation_h * u3;

                    int x40 = dx0 + dilation_w * v4;
                    int x41 = dx1 + dilation_w * v4;
                    int x42 = dx2 + dilation_w * v4;
                    int x43 = dx3 + dilation_w * v4;
                    int x44 = dx4 + dilation_w * v4;
                    int x45 = dx5 + dilation_w * v4;
                    int x46 = dx6 + dilation_w * v4;
                    int x47 = dx7 + dilation_w * v4;
                    int y40 = dy0 + dilation_h * u4;
                    int y41 = dy1 + dilation_h * u4;
                    int y42 = dy2 + dilation_h * u4;
                    int y43 = dy3 + dilation_h * u4;
                    int y44 = dy4 + dilation_h * u4;
                    int y45 = dy5 + dilation_h * u4;
                    int y46 = dy6 + dilation_h * u4;
                    int y47 = dy7 + dilation_h * u4;

                    int x50 = dx0 + dilation_w * v5;
                    int x51 = dx1 + dilation_w * v5;
                    int x52 = dx2 + dilation_w * v5;
                    int x53 = dx3 + dilation_w * v5;
                    int x54 = dx4 + dilation_w * v5;
                    int x55 = dx5 + dilation_w * v5;
                    int x56 = dx6 + dilation_w * v5;
                    int x57 = dx7 + dilation_w * v5;
                    int y50 = dy0 + dilation_h * u5;
                    int y51 = dy1 + dilation_h * u5;
                    int y52 = dy2 + dilation_h * u5;
                    int y53 = dy3 + dilation_h * u5;
                    int y54 = dy4 + dilation_h * u5;
                    int y55 = dy5 + dilation_h * u5;
                    int y56 = dy6 + dilation_h * u5;
                    int y57 = dy7 + dilation_h * u5;

                    int x60 = dx0 + dilation_w * v6;
                    int x61 = dx1 + dilation_w * v6;
                    int x62 = dx2 + dilation_w * v6;
                    int x63 = dx3 + dilation_w * v6;
                    int x64 = dx4 + dilation_w * v6;
                    int x65 = dx5 + dilation_w * v6;
                    int x66 = dx6 + dilation_w * v6;
                    int x67 = dx7 + dilation_w * v6;
                    int y60 = dy0 + dilation_h * u6;
                    int y61 = dy1 + dilation_h * u6;
                    int y62 = dy2 + dilation_h * u6;
                    int y63 = dy3 + dilation_h * u6;
                    int y64 = dy4 + dilation_h * u6;
                    int y65 = dy5 + dilation_h * u6;
                    int y66 = dy6 + dilation_h * u6;
                    int y67 = dy7 + dilation_h * u6;

                    int x70 = dx0 + dilation_w * v7;
                    int x71 = dx1 + dilation_w * v7;
                    int x72 = dx2 + dilation_w * v7;
                    int x73 = dx3 + dilation_w * v7;
                    int x74 = dx4 + dilation_w * v7;
                    int x75 = dx5 + dilation_w * v7;
                    int x76 = dx6 + dilation_w * v7;
                    int x77 = dx7 + dilation_w * v7;
                    int y70 = dy0 + dilation_h * u7;
                    int y71 = dy1 + dilation_h * u7;
                    int y72 = dy2 + dilation_h * u7;
                    int y73 = dy3 + dilation_h * u7;
                    int y74 = dy4 + dilation_h * u7;
                    int y75 = dy5 + dilation_h * u7;
                    int y76 = dy6 + dilation_h * u7;
                    int y77 = dy7 + dilation_h * u7;

                    const signed char* sptr00 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr01 = img0.row<const signed char>(y01) + x01;
                    const signed char* sptr02 = img0.row<const signed char>(y02) + x02;
                    const signed char* sptr03 = img0.row<const signed char>(y03) + x03;
                    const signed char* sptr04 = img0.row<const signed char>(y04) + x04;
                    const signed char* sptr05 = img0.row<const signed char>(y05) + x05;
                    const signed char* sptr06 = img0.row<const signed char>(y06) + x06;
                    const signed char* sptr07 = img0.row<const signed char>(y07) + x07;

                    const signed char* sptr10 = img1.row<const signed char>(y10) + x10;
                    const signed char* sptr11 = img1.row<const signed char>(y11) + x11;
                    const signed char* sptr12 = img1.row<const signed char>(y12) + x12;
                    const signed char* sptr13 = img1.row<const signed char>(y13) + x13;
                    const signed char* sptr14 = img1.row<const signed char>(y14) + x14;
                    const signed char* sptr15 = img1.row<const signed char>(y15) + x15;
                    const signed char* sptr16 = img1.row<const signed char>(y16) + x16;
                    const signed char* sptr17 = img1.row<const signed char>(y17) + x17;

                    const signed char* sptr20 = img2.row<const signed char>(y20) + x20;
                    const signed char* sptr21 = img2.row<const signed char>(y21) + x21;
                    const signed char* sptr22 = img2.row<const signed char>(y22) + x22;
                    const signed char* sptr23 = img2.row<const signed char>(y23) + x23;
                    const signed char* sptr24 = img2.row<const signed char>(y24) + x24;
                    const signed char* sptr25 = img2.row<const signed char>(y25) + x25;
                    const signed char* sptr26 = img2.row<const signed char>(y26) + x26;
                    const signed char* sptr27 = img2.row<const signed char>(y27) + x27;

                    const signed char* sptr30 = img3.row<const signed char>(y30) + x30;
                    const signed char* sptr31 = img3.row<const signed char>(y31) + x31;
                    const signed char* sptr32 = img3.row<const signed char>(y32) + x32;
                    const signed char* sptr33 = img3.row<const signed char>(y33) + x33;
                    const signed char* sptr34 = img3.row<const signed char>(y34) + x34;
                    const signed char* sptr35 = img3.row<const signed char>(y35) + x35;
                    const signed char* sptr36 = img3.row<const signed char>(y36) + x36;
                    const signed char* sptr37 = img3.row<const signed char>(y37) + x37;

                    const signed char* sptr40 = img4.row<const signed char>(y40) + x40;
                    const signed char* sptr41 = img4.row<const signed char>(y41) + x41;
                    const signed char* sptr42 = img4.row<const signed char>(y42) + x42;
                    const signed char* sptr43 = img4.row<const signed char>(y43) + x43;
                    const signed char* sptr44 = img4.row<const signed char>(y44) + x44;
                    const signed char* sptr45 = img4.row<const signed char>(y45) + x45;
                    const signed char* sptr46 = img4.row<const signed char>(y46) + x46;
                    const signed char* sptr47 = img4.row<const signed char>(y47) + x47;

                    const signed char* sptr50 = img5.row<const signed char>(y50) + x50;
                    const signed char* sptr51 = img5.row<const signed char>(y51) + x51;
                    const signed char* sptr52 = img5.row<const signed char>(y52) + x52;
                    const signed char* sptr53 = img5.row<const signed char>(y53) + x53;
                    const signed char* sptr54 = img5.row<const signed char>(y54) + x54;
                    const signed char* sptr55 = img5.row<const signed char>(y55) + x55;
                    const signed char* sptr56 = img5.row<const signed char>(y56) + x56;
                    const signed char* sptr57 = img5.row<const signed char>(y57) + x57;

                    const signed char* sptr60 = img6.row<const signed char>(y60) + x60;
                    const signed char* sptr61 = img6.row<const signed char>(y61) + x61;
                    const signed char* sptr62 = img6.row<const signed char>(y62) + x62;
                    const signed char* sptr63 = img6.row<const signed char>(y63) + x63;
                    const signed char* sptr64 = img6.row<const signed char>(y64) + x64;
                    const signed char* sptr65 = img6.row<const signed char>(y65) + x65;
                    const signed char* sptr66 = img6.row<const signed char>(y66) + x66;
                    const signed char* sptr67 = img6.row<const signed char>(y67) + x67;

                    const signed char* sptr70 = img7.row<const signed char>(y70) + x70;
                    const signed char* sptr71 = img7.row<const signed char>(y71) + x71;
                    const signed char* sptr72 = img7.row<const signed char>(y72) + x72;
                    const signed char* sptr73 = img7.row<const signed char>(y73) + x73;
                    const signed char* sptr74 = img7.row<const signed char>(y74) + x74;
                    const signed char* sptr75 = img7.row<const signed char>(y75) + x75;
                    const signed char* sptr76 = img7.row<const signed char>(y76) + x76;
                    const signed char* sptr77 = img7.row<const signed char>(y77) + x77;

                    pp[0] = sptr00[0];
                    pp[1] = sptr10[0];
                    pp[2] = sptr20[0];
                    pp[3] = sptr30[0];
                    pp[4] = sptr40[0];
                    pp[5] = sptr50[0];
                    pp[6] = sptr60[0];
                    pp[7] = sptr70[0];
                    pp[8] = sptr01[0];
                    pp[9] = sptr11[0];
                    pp[10] = sptr21[0];
                    pp[11] = sptr31[0];
                    pp[12] = sptr41[0];
                    pp[13] = sptr51[0];
                    pp[14] = sptr61[0];
                    pp[15] = sptr71[0];
                    pp[16] = sptr02[0];
                    pp[17] = sptr12[0];
                    pp[18] = sptr22[0];
                    pp[19] = sptr32[0];
                    pp[20] = sptr42[0];
                    pp[21] = sptr52[0];
                    pp[22] = sptr62[0];
                    pp[23] = sptr72[0];
                    pp[24] = sptr03[0];
                    pp[25] = sptr13[0];
                    pp[26] = sptr23[0];
                    pp[27] = sptr33[0];
                    pp[28] = sptr43[0];
                    pp[29] = sptr53[0];
                    pp[30] = sptr63[0];
                    pp[31] = sptr73[0];
                    pp[32] = sptr04[0];
                    pp[33] = sptr14[0];
                    pp[34] = sptr24[0];
                    pp[35] = sptr34[0];
                    pp[36] = sptr44[0];
                    pp[37] = sptr54[0];
                    pp[38] = sptr64[0];
                    pp[39] = sptr74[0];
                    pp[40] = sptr05[0];
                    pp[41] = sptr15[0];
                    pp[42] = sptr25[0];
                    pp[43] = sptr35[0];
                    pp[44] = sptr45[0];
                    pp[45] = sptr55[0];
                    pp[46] = sptr65[0];
                    pp[47] = sptr75[0];
                    pp[48] = sptr06[0];
                    pp[49] = sptr16[0];
                    pp[50] = sptr26[0];
                    pp[51] = sptr36[0];
                    pp[52] = sptr46[0];
                    pp[53] = sptr56[0];
                    pp[54] = sptr66[0];
                    pp[55] = sptr76[0];
                    pp[56] = sptr07[0];
                    pp[57] = sptr17[0];
                    pp[58] = sptr27[0];
                    pp[59] = sptr37[0];
                    pp[60] = sptr47[0];
                    pp[61] = sptr57[0];
                    pp[62] = sptr67[0];
                    pp[63] = sptr77[0];
                    pp += 64;
                }
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 3 < max_kk; kk += 4)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int p2 = (k + kk + 2) / maxk;
                    int p3 = (k + kk + 3) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int uv2 = (k + kk + 2) % maxk;
                    int uv3 = (k + kk + 3) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int u2 = uv2 / kernel_w;
                    int u3 = uv3 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;
                    int v2 = uv2 % kernel_w;
                    int v3 = uv3 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);
                    const Mat img2 = bottom_blob.channel(p2);
                    const Mat img3 = bottom_blob.channel(p3);

                    int x00 = dx0 + dilation_w * v0;
                    int x01 = dx1 + dilation_w * v0;
                    int x02 = dx2 + dilation_w * v0;
                    int x03 = dx3 + dilation_w * v0;
                    int x04 = dx4 + dilation_w * v0;
                    int x05 = dx5 + dilation_w * v0;
                    int x06 = dx6 + dilation_w * v0;
                    int x07 = dx7 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;
                    int y01 = dy1 + dilation_h * u0;
                    int y02 = dy2 + dilation_h * u0;
                    int y03 = dy3 + dilation_h * u0;
                    int y04 = dy4 + dilation_h * u0;
                    int y05 = dy5 + dilation_h * u0;
                    int y06 = dy6 + dilation_h * u0;
                    int y07 = dy7 + dilation_h * u0;

                    int x10 = dx0 + dilation_w * v1;
                    int x11 = dx1 + dilation_w * v1;
                    int x12 = dx2 + dilation_w * v1;
                    int x13 = dx3 + dilation_w * v1;
                    int x14 = dx4 + dilation_w * v1;
                    int x15 = dx5 + dilation_w * v1;
                    int x16 = dx6 + dilation_w * v1;
                    int x17 = dx7 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;
                    int y11 = dy1 + dilation_h * u1;
                    int y12 = dy2 + dilation_h * u1;
                    int y13 = dy3 + dilation_h * u1;
                    int y14 = dy4 + dilation_h * u1;
                    int y15 = dy5 + dilation_h * u1;
                    int y16 = dy6 + dilation_h * u1;
                    int y17 = dy7 + dilation_h * u1;

                    int x20 = dx0 + dilation_w * v2;
                    int x21 = dx1 + dilation_w * v2;
                    int x22 = dx2 + dilation_w * v2;
                    int x23 = dx3 + dilation_w * v2;
                    int x24 = dx4 + dilation_w * v2;
                    int x25 = dx5 + dilation_w * v2;
                    int x26 = dx6 + dilation_w * v2;
                    int x27 = dx7 + dilation_w * v2;
                    int y20 = dy0 + dilation_h * u2;
                    int y21 = dy1 + dilation_h * u2;
                    int y22 = dy2 + dilation_h * u2;
                    int y23 = dy3 + dilation_h * u2;
                    int y24 = dy4 + dilation_h * u2;
                    int y25 = dy5 + dilation_h * u2;
                    int y26 = dy6 + dilation_h * u2;
                    int y27 = dy7 + dilation_h * u2;

                    int x30 = dx0 + dilation_w * v3;
                    int x31 = dx1 + dilation_w * v3;
                    int x32 = dx2 + dilation_w * v3;
                    int x33 = dx3 + dilation_w * v3;
                    int x34 = dx4 + dilation_w * v3;
                    int x35 = dx5 + dilation_w * v3;
                    int x36 = dx6 + dilation_w * v3;
                    int x37 = dx7 + dilation_w * v3;
                    int y30 = dy0 + dilation_h * u3;
                    int y31 = dy1 + dilation_h * u3;
                    int y32 = dy2 + dilation_h * u3;
                    int y33 = dy3 + dilation_h * u3;
                    int y34 = dy4 + dilation_h * u3;
                    int y35 = dy5 + dilation_h * u3;
                    int y36 = dy6 + dilation_h * u3;
                    int y37 = dy7 + dilation_h * u3;

                    const signed char* sptr00 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr01 = img0.row<const signed char>(y01) + x01;
                    const signed char* sptr02 = img0.row<const signed char>(y02) + x02;
                    const signed char* sptr03 = img0.row<const signed char>(y03) + x03;
                    const signed char* sptr04 = img0.row<const signed char>(y04) + x04;
                    const signed char* sptr05 = img0.row<const signed char>(y05) + x05;
                    const signed char* sptr06 = img0.row<const signed char>(y06) + x06;
                    const signed char* sptr07 = img0.row<const signed char>(y07) + x07;

                    const signed char* sptr10 = img1.row<const signed char>(y10) + x10;
                    const signed char* sptr11 = img1.row<const signed char>(y11) + x11;
                    const signed char* sptr12 = img1.row<const signed char>(y12) + x12;
                    const signed char* sptr13 = img1.row<const signed char>(y13) + x13;
                    const signed char* sptr14 = img1.row<const signed char>(y14) + x14;
                    const signed char* sptr15 = img1.row<const signed char>(y15) + x15;
                    const signed char* sptr16 = img1.row<const signed char>(y16) + x16;
                    const signed char* sptr17 = img1.row<const signed char>(y17) + x17;

                    const signed char* sptr20 = img2.row<const signed char>(y20) + x20;
                    const signed char* sptr21 = img2.row<const signed char>(y21) + x21;
                    const signed char* sptr22 = img2.row<const signed char>(y22) + x22;
                    const signed char* sptr23 = img2.row<const signed char>(y23) + x23;
                    const signed char* sptr24 = img2.row<const signed char>(y24) + x24;
                    const signed char* sptr25 = img2.row<const signed char>(y25) + x25;
                    const signed char* sptr26 = img2.row<const signed char>(y26) + x26;
                    const signed char* sptr27 = img2.row<const signed char>(y27) + x27;

                    const signed char* sptr30 = img3.row<const signed char>(y30) + x30;
                    const signed char* sptr31 = img3.row<const signed char>(y31) + x31;
                    const signed char* sptr32 = img3.row<const signed char>(y32) + x32;
                    const signed char* sptr33 = img3.row<const signed char>(y33) + x33;
                    const signed char* sptr34 = img3.row<const signed char>(y34) + x34;
                    const signed char* sptr35 = img3.row<const signed char>(y35) + x35;
                    const signed char* sptr36 = img3.row<const signed char>(y36) + x36;
                    const signed char* sptr37 = img3.row<const signed char>(y37) + x37;

                    pp[0] = sptr00[0];
                    pp[1] = sptr10[0];
                    pp[2] = sptr20[0];
                    pp[3] = sptr30[0];
                    pp[4] = sptr01[0];
                    pp[5] = sptr11[0];
                    pp[6] = sptr21[0];
                    pp[7] = sptr31[0];
                    pp[8] = sptr02[0];
                    pp[9] = sptr12[0];
                    pp[10] = sptr22[0];
                    pp[11] = sptr32[0];
                    pp[12] = sptr03[0];
                    pp[13] = sptr13[0];
                    pp[14] = sptr23[0];
                    pp[15] = sptr33[0];
                    pp[16] = sptr04[0];
                    pp[17] = sptr14[0];
                    pp[18] = sptr24[0];
                    pp[19] = sptr34[0];
                    pp[20] = sptr05[0];
                    pp[21] = sptr15[0];
                    pp[22] = sptr25[0];
                    pp[23] = sptr35[0];
                    pp[24] = sptr06[0];
                    pp[25] = sptr16[0];
                    pp[26] = sptr26[0];
                    pp[27] = sptr36[0];
                    pp[28] = sptr07[0];
                    pp[29] = sptr17[0];
                    pp[30] = sptr27[0];
                    pp[31] = sptr37[0];
                    pp += 32;
                }
#endif // __ARM_FEATURE_DOTPROD
                for (; kk + 1 < max_kk; kk += 2)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);

                    int x00 = dx0 + dilation_w * v0;
                    int x01 = dx1 + dilation_w * v0;
                    int x02 = dx2 + dilation_w * v0;
                    int x03 = dx3 + dilation_w * v0;
                    int x04 = dx4 + dilation_w * v0;
                    int x05 = dx5 + dilation_w * v0;
                    int x06 = dx6 + dilation_w * v0;
                    int x07 = dx7 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;
                    int y01 = dy1 + dilation_h * u0;
                    int y02 = dy2 + dilation_h * u0;
                    int y03 = dy3 + dilation_h * u0;
                    int y04 = dy4 + dilation_h * u0;
                    int y05 = dy5 + dilation_h * u0;
                    int y06 = dy6 + dilation_h * u0;
                    int y07 = dy7 + dilation_h * u0;

                    int x10 = dx0 + dilation_w * v1;
                    int x11 = dx1 + dilation_w * v1;
                    int x12 = dx2 + dilation_w * v1;
                    int x13 = dx3 + dilation_w * v1;
                    int x14 = dx4 + dilation_w * v1;
                    int x15 = dx5 + dilation_w * v1;
                    int x16 = dx6 + dilation_w * v1;
                    int x17 = dx7 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;
                    int y11 = dy1 + dilation_h * u1;
                    int y12 = dy2 + dilation_h * u1;
                    int y13 = dy3 + dilation_h * u1;
                    int y14 = dy4 + dilation_h * u1;
                    int y15 = dy5 + dilation_h * u1;
                    int y16 = dy6 + dilation_h * u1;
                    int y17 = dy7 + dilation_h * u1;

                    const signed char* sptr00 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr01 = img0.row<const signed char>(y01) + x01;
                    const signed char* sptr02 = img0.row<const signed char>(y02) + x02;
                    const signed char* sptr03 = img0.row<const signed char>(y03) + x03;
                    const signed char* sptr04 = img0.row<const signed char>(y04) + x04;
                    const signed char* sptr05 = img0.row<const signed char>(y05) + x05;
                    const signed char* sptr06 = img0.row<const signed char>(y06) + x06;
                    const signed char* sptr07 = img0.row<const signed char>(y07) + x07;

                    const signed char* sptr10 = img1.row<const signed char>(y10) + x10;
                    const signed char* sptr11 = img1.row<const signed char>(y11) + x11;
                    const signed char* sptr12 = img1.row<const signed char>(y12) + x12;
                    const signed char* sptr13 = img1.row<const signed char>(y13) + x13;
                    const signed char* sptr14 = img1.row<const signed char>(y14) + x14;
                    const signed char* sptr15 = img1.row<const signed char>(y15) + x15;
                    const signed char* sptr16 = img1.row<const signed char>(y16) + x16;
                    const signed char* sptr17 = img1.row<const signed char>(y17) + x17;

                    pp[0] = sptr00[0];
                    pp[1] = sptr10[0];
                    pp[2] = sptr01[0];
                    pp[3] = sptr11[0];
                    pp[4] = sptr02[0];
                    pp[5] = sptr12[0];
                    pp[6] = sptr03[0];
                    pp[7] = sptr13[0];
                    pp[8] = sptr04[0];
                    pp[9] = sptr14[0];
                    pp[10] = sptr05[0];
                    pp[11] = sptr15[0];
                    pp[12] = sptr06[0];
                    pp[13] = sptr16[0];
                    pp[14] = sptr07[0];
                    pp[15] = sptr17[0];
                    pp += 16;
                }
            }
            for (; kk < max_kk / elempack; kk++)
            {
                int p = (k / elempack + kk) / maxk;
                int uv = (k / elempack + kk) % maxk;
                int u = uv / kernel_w;
                int v = uv % kernel_w;

                const Mat img = bottom_blob.channel(p);

                int x0 = dx0 + dilation_w * v;
                int x1 = dx1 + dilation_w * v;
                int x2 = dx2 + dilation_w * v;
                int x3 = dx3 + dilation_w * v;
                int x4 = dx4 + dilation_w * v;
                int x5 = dx5 + dilation_w * v;
                int x6 = dx6 + dilation_w * v;
                int x7 = dx7 + dilation_w * v;
                int y0 = dy0 + dilation_h * u;
                int y1 = dy1 + dilation_h * u;
                int y2 = dy2 + dilation_h * u;
                int y3 = dy3 + dilation_h * u;
                int y4 = dy4 + dilation_h * u;
                int y5 = dy5 + dilation_h * u;
                int y6 = dy6 + dilation_h * u;
                int y7 = dy7 + dilation_h * u;

                const signed char* sptr0 = img.row<const signed char>(y0) + x0 * elempack;
                const signed char* sptr1 = img.row<const signed char>(y1) + x1 * elempack;
                const signed char* sptr2 = img.row<const signed char>(y2) + x2 * elempack;
                const signed char* sptr3 = img.row<const signed char>(y3) + x3 * elempack;
                const signed char* sptr4 = img.row<const signed char>(y4) + x4 * elempack;
                const signed char* sptr5 = img.row<const signed char>(y5) + x5 * elempack;
                const signed char* sptr6 = img.row<const signed char>(y6) + x6 * elempack;
                const signed char* sptr7 = img.row<const signed char>(y7) + x7 * elempack;

                if (elempack == 8)
                {
#if __ARM_FEATURE_MATMUL_INT8
                    int8x8_t _r0 = vld1_s8(sptr0);
                    int8x8_t _r1 = vld1_s8(sptr1);
                    int8x8_t _r2 = vld1_s8(sptr2);
                    int8x8_t _r3 = vld1_s8(sptr3);
                    int8x8_t _r4 = vld1_s8(sptr4);
                    int8x8_t _r5 = vld1_s8(sptr5);
                    int8x8_t _r6 = vld1_s8(sptr6);
                    int8x8_t _r7 = vld1_s8(sptr7);
                    vst1_s8(pp, _r0);
                    vst1_s8(pp + 8, _r1);
                    vst1_s8(pp + 16, _r2);
                    vst1_s8(pp + 24, _r3);
                    vst1_s8(pp + 32, _r4);
                    vst1_s8(pp + 40, _r5);
                    vst1_s8(pp + 48, _r6);
                    vst1_s8(pp + 56, _r7);
                    pp += 64;
#elif __ARM_FEATURE_DOTPROD
                    int32x2_t _r0 = vreinterpret_s32_s8(vld1_s8(sptr0));
                    int32x2_t _r1 = vreinterpret_s32_s8(vld1_s8(sptr1));
                    int32x2_t _r2 = vreinterpret_s32_s8(vld1_s8(sptr2));
                    int32x2_t _r3 = vreinterpret_s32_s8(vld1_s8(sptr3));
                    int32x2_t _r4 = vreinterpret_s32_s8(vld1_s8(sptr4));
                    int32x2_t _r5 = vreinterpret_s32_s8(vld1_s8(sptr5));
                    int32x2_t _r6 = vreinterpret_s32_s8(vld1_s8(sptr6));
                    int32x2_t _r7 = vreinterpret_s32_s8(vld1_s8(sptr7));
                    int32x2x2_t _r01 = vzip_s32(_r0, _r1);
                    int32x2x2_t _r23 = vzip_s32(_r2, _r3);
                    int32x2x2_t _r45 = vzip_s32(_r4, _r5);
                    int32x2x2_t _r67 = vzip_s32(_r6, _r7);
                    vst1_s32((int*)pp, _r01.val[0]);
                    vst1_s32((int*)(pp + 8), _r23.val[0]);
                    vst1_s32((int*)(pp + 16), _r45.val[0]);
                    vst1_s32((int*)(pp + 24), _r67.val[0]);
                    vst1_s32((int*)(pp + 32), _r01.val[1]);
                    vst1_s32((int*)(pp + 40), _r23.val[1]);
                    vst1_s32((int*)(pp + 48), _r45.val[1]);
                    vst1_s32((int*)(pp + 56), _r67.val[1]);
                    pp += 64;
#else  // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
                    int16x4_t _r0 = vreinterpret_s16_s8(vld1_s8(sptr0));
                    int16x4_t _r1 = vreinterpret_s16_s8(vld1_s8(sptr1));
                    int16x4_t _r2 = vreinterpret_s16_s8(vld1_s8(sptr2));
                    int16x4_t _r3 = vreinterpret_s16_s8(vld1_s8(sptr3));
                    int16x4_t _r4 = vreinterpret_s16_s8(vld1_s8(sptr4));
                    int16x4_t _r5 = vreinterpret_s16_s8(vld1_s8(sptr5));
                    int16x4_t _r6 = vreinterpret_s16_s8(vld1_s8(sptr6));
                    int16x4_t _r7 = vreinterpret_s16_s8(vld1_s8(sptr7));
                    int16x4x2_t _r01 = vzip_s16(_r0, _r1);
                    int16x4x2_t _r23 = vzip_s16(_r2, _r3);
                    int16x4x2_t _r45 = vzip_s16(_r4, _r5);
                    int16x4x2_t _r67 = vzip_s16(_r6, _r7);
                    int32x4x4_t _r0123;
                    _r0123.val[0] = vreinterpretq_s32_s16(vcombine_s16(_r01.val[0], _r01.val[1]));
                    _r0123.val[1] = vreinterpretq_s32_s16(vcombine_s16(_r23.val[0], _r23.val[1]));
                    _r0123.val[2] = vreinterpretq_s32_s16(vcombine_s16(_r45.val[0], _r45.val[1]));
                    _r0123.val[3] = vreinterpretq_s32_s16(vcombine_s16(_r67.val[0], _r67.val[1]));
                    vst4q_s32((int*)pp, _r0123);
                    pp += 64;
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
                }
                if (elempack == 1)
                {
                    pp[0] = sptr0[0];
                    pp[1] = sptr1[0];
                    pp[2] = sptr2[0];
                    pp[3] = sptr3[0];
                    pp[4] = sptr4[0];
                    pp[5] = sptr5[0];
                    pp[6] = sptr6[0];
                    pp[7] = sptr7[0];
                    pp += 8;
                }
            }
        }
    }
#endif // __aarch64__
    for (; jj + 3 < max_jj; jj += 4)
    {
        int dy0 = (j + jj) / outw * stride_h;
        int dy1 = (j + jj + 1) / outw * stride_h;
        int dy2 = (j + jj + 2) / outw * stride_h;
        int dy3 = (j + jj + 3) / outw * stride_h;
        int dx0 = (j + jj) % outw * stride_w;
        int dx1 = (j + jj + 1) % outw * stride_w;
        int dx2 = (j + jj + 2) % outw * stride_w;
        int dx3 = (j + jj + 3) % outw * stride_w;

        if (dy0 == dy3)
        {
            int kk = 0;
            if (elempack == 1)
            {
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int p2 = (k + kk + 2) / maxk;
                    int p3 = (k + kk + 3) / maxk;
                    int p4 = (k + kk + 4) / maxk;
                    int p5 = (k + kk + 5) / maxk;
                    int p6 = (k + kk + 6) / maxk;
                    int p7 = (k + kk + 7) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int uv2 = (k + kk + 2) % maxk;
                    int uv3 = (k + kk + 3) % maxk;
                    int uv4 = (k + kk + 4) % maxk;
                    int uv5 = (k + kk + 5) % maxk;
                    int uv6 = (k + kk + 6) % maxk;
                    int uv7 = (k + kk + 7) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int u2 = uv2 / kernel_w;
                    int u3 = uv3 / kernel_w;
                    int u4 = uv4 / kernel_w;
                    int u5 = uv5 / kernel_w;
                    int u6 = uv6 / kernel_w;
                    int u7 = uv7 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;
                    int v2 = uv2 % kernel_w;
                    int v3 = uv3 % kernel_w;
                    int v4 = uv4 % kernel_w;
                    int v5 = uv5 % kernel_w;
                    int v6 = uv6 % kernel_w;
                    int v7 = uv7 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);
                    const Mat img2 = bottom_blob.channel(p2);
                    const Mat img3 = bottom_blob.channel(p3);
                    const Mat img4 = bottom_blob.channel(p4);
                    const Mat img5 = bottom_blob.channel(p5);
                    const Mat img6 = bottom_blob.channel(p6);
                    const Mat img7 = bottom_blob.channel(p7);

                    int x00 = dx0 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;

                    int x10 = dx0 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;

                    int x20 = dx0 + dilation_w * v2;
                    int y20 = dy0 + dilation_h * u2;

                    int x30 = dx0 + dilation_w * v3;
                    int y30 = dy0 + dilation_h * u3;

                    int x40 = dx0 + dilation_w * v4;
                    int y40 = dy0 + dilation_h * u4;

                    int x50 = dx0 + dilation_w * v5;
                    int y50 = dy0 + dilation_h * u5;

                    int x60 = dx0 + dilation_w * v6;
                    int y60 = dy0 + dilation_h * u6;

                    int x70 = dx0 + dilation_w * v7;
                    int y70 = dy0 + dilation_h * u7;

                    const signed char* sptr0 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr1 = img1.row<const signed char>(y10) + x10;
                    const signed char* sptr2 = img2.row<const signed char>(y20) + x20;
                    const signed char* sptr3 = img3.row<const signed char>(y30) + x30;
                    const signed char* sptr4 = img4.row<const signed char>(y40) + x40;
                    const signed char* sptr5 = img5.row<const signed char>(y50) + x50;
                    const signed char* sptr6 = img6.row<const signed char>(y60) + x60;
                    const signed char* sptr7 = img7.row<const signed char>(y70) + x70;

                    if (stride_w == 1)
                    {
                        int8x8_t _r0 = vld1_s8(sptr0);
                        int8x8_t _r1 = vld1_s8(sptr1);
                        int8x8_t _r2 = vld1_s8(sptr2);
                        int8x8_t _r3 = vld1_s8(sptr3);
                        int8x8_t _r4 = vld1_s8(sptr4);
                        int8x8_t _r5 = vld1_s8(sptr5);
                        int8x8_t _r6 = vld1_s8(sptr6);
                        int8x8_t _r7 = vld1_s8(sptr7);
                        int16x4x4_t _r0123;
                        _r0123.val[0] = vreinterpret_s16_s8(vzip_s8(_r0, _r1).val[0]);
                        _r0123.val[1] = vreinterpret_s16_s8(vzip_s8(_r2, _r3).val[0]);
                        _r0123.val[2] = vreinterpret_s16_s8(vzip_s8(_r4, _r5).val[0]);
                        _r0123.val[3] = vreinterpret_s16_s8(vzip_s8(_r6, _r7).val[0]);
                        vst4_s16((short*)pp, _r0123);
                        pp += 32;
                    }
                    else if (stride_w == 2)
                    {
                        int8x8_t _r0 = vld1_s8(sptr0);
                        int8x8_t _r1 = vld1_s8(sptr1);
                        int8x8_t _r2 = vld1_s8(sptr2);
                        int8x8_t _r3 = vld1_s8(sptr3);
                        int8x8_t _r4 = vld1_s8(sptr4);
                        int8x8_t _r5 = vld1_s8(sptr5);
                        int8x8_t _r6 = vld1_s8(sptr6);
                        int8x8_t _r7 = vld1_s8(sptr7);
                        int8x8_t _r01 = vtrn_s8(_r0, _r1).val[0];
                        int8x8_t _r23 = vtrn_s8(_r2, _r3).val[0];
                        int8x8_t _r45 = vtrn_s8(_r4, _r5).val[0];
                        int8x8_t _r67 = vtrn_s8(_r6, _r7).val[0];
                        int16x4x4_t _r0123;
                        _r0123.val[0] = vreinterpret_s16_s8(_r01);
                        _r0123.val[1] = vreinterpret_s16_s8(_r23);
                        _r0123.val[2] = vreinterpret_s16_s8(_r45);
                        _r0123.val[3] = vreinterpret_s16_s8(_r67);
                        vst4_s16((short*)pp, _r0123);
                        pp += 32;
                    }
                    else
                    {
                        pp[0] = sptr0[0];
                        pp[1] = sptr1[0];
                        pp[2] = sptr2[0];
                        pp[3] = sptr3[0];
                        pp[4] = sptr4[0];
                        pp[5] = sptr5[0];
                        pp[6] = sptr6[0];
                        pp[7] = sptr7[0];
                        pp[8] = sptr0[stride_w];
                        pp[9] = sptr1[stride_w];
                        pp[10] = sptr2[stride_w];
                        pp[11] = sptr3[stride_w];
                        pp[12] = sptr4[stride_w];
                        pp[13] = sptr5[stride_w];
                        pp[14] = sptr6[stride_w];
                        pp[15] = sptr7[stride_w];
                        pp[16] = sptr0[stride_w * 2];
                        pp[17] = sptr1[stride_w * 2];
                        pp[18] = sptr2[stride_w * 2];
                        pp[19] = sptr3[stride_w * 2];
                        pp[20] = sptr4[stride_w * 2];
                        pp[21] = sptr5[stride_w * 2];
                        pp[22] = sptr6[stride_w * 2];
                        pp[23] = sptr7[stride_w * 2];
                        pp[24] = sptr0[stride_w * 3];
                        pp[25] = sptr1[stride_w * 3];
                        pp[26] = sptr2[stride_w * 3];
                        pp[27] = sptr3[stride_w * 3];
                        pp[28] = sptr4[stride_w * 3];
                        pp[29] = sptr5[stride_w * 3];
                        pp[30] = sptr6[stride_w * 3];
                        pp[31] = sptr7[stride_w * 3];
                        pp += 32;
                    }
                }
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 3 < max_kk; kk += 4)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int p2 = (k + kk + 2) / maxk;
                    int p3 = (k + kk + 3) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int uv2 = (k + kk + 2) % maxk;
                    int uv3 = (k + kk + 3) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int u2 = uv2 / kernel_w;
                    int u3 = uv3 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;
                    int v2 = uv2 % kernel_w;
                    int v3 = uv3 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);
                    const Mat img2 = bottom_blob.channel(p2);
                    const Mat img3 = bottom_blob.channel(p3);

                    int x00 = dx0 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;

                    int x10 = dx0 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;

                    int x20 = dx0 + dilation_w * v2;
                    int y20 = dy0 + dilation_h * u2;

                    int x30 = dx0 + dilation_w * v3;
                    int y30 = dy0 + dilation_h * u3;

                    const signed char* sptr0 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr1 = img1.row<const signed char>(y10) + x10;
                    const signed char* sptr2 = img2.row<const signed char>(y20) + x20;
                    const signed char* sptr3 = img3.row<const signed char>(y30) + x30;

                    if (stride_w == 1)
                    {
                        int8x8_t _r0 = vld1_s8(sptr0);
                        int8x8_t _r1 = vld1_s8(sptr1);
                        int8x8_t _r2 = vld1_s8(sptr2);
                        int8x8_t _r3 = vld1_s8(sptr3);
                        int16x4x2_t _r01;
                        _r01.val[0] = vreinterpret_s16_s8(vzip_s8(_r0, _r1).val[0]);
                        _r01.val[1] = vreinterpret_s16_s8(vzip_s8(_r2, _r3).val[0]);
                        vst2_s16((short*)pp, _r01);
                        pp += 16;
                    }
                    else if (stride_w == 2)
                    {
                        int8x8_t _r0 = vld1_s8(sptr0);
                        int8x8_t _r1 = vld1_s8(sptr1);
                        int8x8_t _r2 = vld1_s8(sptr2);
                        int8x8_t _r3 = vld1_s8(sptr3);
                        int8x8_t _r01 = vtrn_s8(_r0, _r1).val[0];
                        int8x8_t _r23 = vtrn_s8(_r2, _r3).val[0];
                        int16x4x2_t _r0123;
                        _r0123.val[0] = vreinterpret_s16_s8(_r01);
                        _r0123.val[1] = vreinterpret_s16_s8(_r23);
                        vst2_s16((short*)pp, _r0123);
                        pp += 16;
                    }
                    else
                    {
                        pp[0] = sptr0[0];
                        pp[1] = sptr1[0];
                        pp[2] = sptr2[0];
                        pp[3] = sptr3[0];
                        pp[4] = sptr0[stride_w];
                        pp[5] = sptr1[stride_w];
                        pp[6] = sptr2[stride_w];
                        pp[7] = sptr3[stride_w];
                        pp[8] = sptr0[stride_w * 2];
                        pp[9] = sptr1[stride_w * 2];
                        pp[10] = sptr2[stride_w * 2];
                        pp[11] = sptr3[stride_w * 2];
                        pp[12] = sptr0[stride_w * 3];
                        pp[13] = sptr1[stride_w * 3];
                        pp[14] = sptr2[stride_w * 3];
                        pp[15] = sptr3[stride_w * 3];
                        pp += 16;
                    }
                }
#endif // __ARM_FEATURE_DOTPROD
                for (; kk + 1 < max_kk; kk += 2)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);

                    int x00 = dx0 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;

                    int x10 = dx0 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;

                    const signed char* sptr0 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr1 = img1.row<const signed char>(y10) + x10;

                    if (stride_w == 1)
                    {
                        int8x8_t _r0 = vld1_s8(sptr0);
                        int8x8_t _r1 = vld1_s8(sptr1);
                        int8x8_t _r01 = vzip_s8(_r0, _r1).val[0];
                        vst1_s8(pp, _r01);
                        pp += 8;
                    }
                    else if (stride_w == 2)
                    {
                        int8x8_t _r0 = vld1_s8(sptr0);
                        int8x8_t _r1 = vld1_s8(sptr1);
                        int8x8_t _r01 = vtrn_s8(_r0, _r1).val[0];
                        vst1_s8(pp, _r01);
                        pp += 8;
                    }
                    else
                    {
                        pp[0] = sptr0[0];
                        pp[1] = sptr1[0];
                        pp[2] = sptr0[stride_w];
                        pp[3] = sptr1[stride_w];
                        pp[4] = sptr0[stride_w * 2];
                        pp[5] = sptr1[stride_w * 2];
                        pp[6] = sptr0[stride_w * 3];
                        pp[7] = sptr1[stride_w * 3];
                        pp += 8;
                    }
                }
            }
            for (; kk < max_kk / elempack; kk++)
            {
                int p = (k / elempack + kk) / maxk;
                int uv = (k / elempack + kk) % maxk;
                int u = uv / kernel_w;
                int v = uv % kernel_w;

                const Mat img = bottom_blob.channel(p);

                int x0 = dx0 + dilation_w * v;
                int y0 = dy0 + dilation_h * u;

                const signed char* sptr = img.row<const signed char>(y0) + x0 * elempack;

                if (elempack == 8)
                {
#if __ARM_FEATURE_MATMUL_INT8
                    int8x8_t _r0 = vld1_s8(sptr);
                    int8x8_t _r1 = vld1_s8(sptr + stride_w * 8);
                    int8x8_t _r2 = vld1_s8(sptr + stride_w * 16);
                    int8x8_t _r3 = vld1_s8(sptr + stride_w * 24);
                    vst1_s8(pp, _r0);
                    vst1_s8(pp + 8, _r1);
                    vst1_s8(pp + 16, _r2);
                    vst1_s8(pp + 24, _r3);
                    pp += 32;
#elif __ARM_FEATURE_DOTPROD
                    int32x2x4_t _r0123;
                    _r0123.val[0] = vreinterpret_s32_s8(vld1_s8(sptr));
                    _r0123.val[1] = vreinterpret_s32_s8(vld1_s8(sptr + stride_w * 8));
                    _r0123.val[2] = vreinterpret_s32_s8(vld1_s8(sptr + stride_w * 16));
                    _r0123.val[3] = vreinterpret_s32_s8(vld1_s8(sptr + stride_w * 24));
                    vst4_s32((int*)pp, _r0123);
                    pp += 32;
#else  // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
                    int16x4x4_t _r0123;
                    _r0123.val[0] = vreinterpret_s16_s8(vld1_s8(sptr));
                    _r0123.val[1] = vreinterpret_s16_s8(vld1_s8(sptr + stride_w * 8));
                    _r0123.val[2] = vreinterpret_s16_s8(vld1_s8(sptr + stride_w * 16));
                    _r0123.val[3] = vreinterpret_s16_s8(vld1_s8(sptr + stride_w * 24));
                    vst4_s16((short*)pp, _r0123);
                    pp += 32;
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
                }
                if (elempack == 1)
                {
                    pp[0] = sptr[0];
                    pp[1] = sptr[stride_w];
                    pp[2] = sptr[stride_w * 2];
                    pp[3] = sptr[stride_w * 3];
                    pp += 4;
                }
            }
        }
        else
        {
            int kk = 0;
            if (elempack == 1)
            {
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int p2 = (k + kk + 2) / maxk;
                    int p3 = (k + kk + 3) / maxk;
                    int p4 = (k + kk + 4) / maxk;
                    int p5 = (k + kk + 5) / maxk;
                    int p6 = (k + kk + 6) / maxk;
                    int p7 = (k + kk + 7) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int uv2 = (k + kk + 2) % maxk;
                    int uv3 = (k + kk + 3) % maxk;
                    int uv4 = (k + kk + 4) % maxk;
                    int uv5 = (k + kk + 5) % maxk;
                    int uv6 = (k + kk + 6) % maxk;
                    int uv7 = (k + kk + 7) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int u2 = uv2 / kernel_w;
                    int u3 = uv3 / kernel_w;
                    int u4 = uv4 / kernel_w;
                    int u5 = uv5 / kernel_w;
                    int u6 = uv6 / kernel_w;
                    int u7 = uv7 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;
                    int v2 = uv2 % kernel_w;
                    int v3 = uv3 % kernel_w;
                    int v4 = uv4 % kernel_w;
                    int v5 = uv5 % kernel_w;
                    int v6 = uv6 % kernel_w;
                    int v7 = uv7 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);
                    const Mat img2 = bottom_blob.channel(p2);
                    const Mat img3 = bottom_blob.channel(p3);
                    const Mat img4 = bottom_blob.channel(p4);
                    const Mat img5 = bottom_blob.channel(p5);
                    const Mat img6 = bottom_blob.channel(p6);
                    const Mat img7 = bottom_blob.channel(p7);

                    int x00 = dx0 + dilation_w * v0;
                    int x01 = dx1 + dilation_w * v0;
                    int x02 = dx2 + dilation_w * v0;
                    int x03 = dx3 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;
                    int y01 = dy1 + dilation_h * u0;
                    int y02 = dy2 + dilation_h * u0;
                    int y03 = dy3 + dilation_h * u0;

                    int x10 = dx0 + dilation_w * v1;
                    int x11 = dx1 + dilation_w * v1;
                    int x12 = dx2 + dilation_w * v1;
                    int x13 = dx3 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;
                    int y11 = dy1 + dilation_h * u1;
                    int y12 = dy2 + dilation_h * u1;
                    int y13 = dy3 + dilation_h * u1;

                    int x20 = dx0 + dilation_w * v2;
                    int x21 = dx1 + dilation_w * v2;
                    int x22 = dx2 + dilation_w * v2;
                    int x23 = dx3 + dilation_w * v2;
                    int y20 = dy0 + dilation_h * u2;
                    int y21 = dy1 + dilation_h * u2;
                    int y22 = dy2 + dilation_h * u2;
                    int y23 = dy3 + dilation_h * u2;

                    int x30 = dx0 + dilation_w * v3;
                    int x31 = dx1 + dilation_w * v3;
                    int x32 = dx2 + dilation_w * v3;
                    int x33 = dx3 + dilation_w * v3;
                    int y30 = dy0 + dilation_h * u3;
                    int y31 = dy1 + dilation_h * u3;
                    int y32 = dy2 + dilation_h * u3;
                    int y33 = dy3 + dilation_h * u3;

                    int x40 = dx0 + dilation_w * v4;
                    int x41 = dx1 + dilation_w * v4;
                    int x42 = dx2 + dilation_w * v4;
                    int x43 = dx3 + dilation_w * v4;
                    int y40 = dy0 + dilation_h * u4;
                    int y41 = dy1 + dilation_h * u4;
                    int y42 = dy2 + dilation_h * u4;
                    int y43 = dy3 + dilation_h * u4;

                    int x50 = dx0 + dilation_w * v5;
                    int x51 = dx1 + dilation_w * v5;
                    int x52 = dx2 + dilation_w * v5;
                    int x53 = dx3 + dilation_w * v5;
                    int y50 = dy0 + dilation_h * u5;
                    int y51 = dy1 + dilation_h * u5;
                    int y52 = dy2 + dilation_h * u5;
                    int y53 = dy3 + dilation_h * u5;

                    int x60 = dx0 + dilation_w * v6;
                    int x61 = dx1 + dilation_w * v6;
                    int x62 = dx2 + dilation_w * v6;
                    int x63 = dx3 + dilation_w * v6;
                    int y60 = dy0 + dilation_h * u6;
                    int y61 = dy1 + dilation_h * u6;
                    int y62 = dy2 + dilation_h * u6;
                    int y63 = dy3 + dilation_h * u6;

                    int x70 = dx0 + dilation_w * v7;
                    int x71 = dx1 + dilation_w * v7;
                    int x72 = dx2 + dilation_w * v7;
                    int x73 = dx3 + dilation_w * v7;
                    int y70 = dy0 + dilation_h * u7;
                    int y71 = dy1 + dilation_h * u7;
                    int y72 = dy2 + dilation_h * u7;
                    int y73 = dy3 + dilation_h * u7;

                    const signed char* sptr00 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr01 = img0.row<const signed char>(y01) + x01;
                    const signed char* sptr02 = img0.row<const signed char>(y02) + x02;
                    const signed char* sptr03 = img0.row<const signed char>(y03) + x03;

                    const signed char* sptr10 = img1.row<const signed char>(y10) + x10;
                    const signed char* sptr11 = img1.row<const signed char>(y11) + x11;
                    const signed char* sptr12 = img1.row<const signed char>(y12) + x12;
                    const signed char* sptr13 = img1.row<const signed char>(y13) + x13;

                    const signed char* sptr20 = img2.row<const signed char>(y20) + x20;
                    const signed char* sptr21 = img2.row<const signed char>(y21) + x21;
                    const signed char* sptr22 = img2.row<const signed char>(y22) + x22;
                    const signed char* sptr23 = img2.row<const signed char>(y23) + x23;

                    const signed char* sptr30 = img3.row<const signed char>(y30) + x30;
                    const signed char* sptr31 = img3.row<const signed char>(y31) + x31;
                    const signed char* sptr32 = img3.row<const signed char>(y32) + x32;
                    const signed char* sptr33 = img3.row<const signed char>(y33) + x33;

                    const signed char* sptr40 = img4.row<const signed char>(y40) + x40;
                    const signed char* sptr41 = img4.row<const signed char>(y41) + x41;
                    const signed char* sptr42 = img4.row<const signed char>(y42) + x42;
                    const signed char* sptr43 = img4.row<const signed char>(y43) + x43;

                    const signed char* sptr50 = img5.row<const signed char>(y50) + x50;
                    const signed char* sptr51 = img5.row<const signed char>(y51) + x51;
                    const signed char* sptr52 = img5.row<const signed char>(y52) + x52;
                    const signed char* sptr53 = img5.row<const signed char>(y53) + x53;

                    const signed char* sptr60 = img6.row<const signed char>(y60) + x60;
                    const signed char* sptr61 = img6.row<const signed char>(y61) + x61;
                    const signed char* sptr62 = img6.row<const signed char>(y62) + x62;
                    const signed char* sptr63 = img6.row<const signed char>(y63) + x63;

                    const signed char* sptr70 = img7.row<const signed char>(y70) + x70;
                    const signed char* sptr71 = img7.row<const signed char>(y71) + x71;
                    const signed char* sptr72 = img7.row<const signed char>(y72) + x72;
                    const signed char* sptr73 = img7.row<const signed char>(y73) + x73;

                    pp[0] = sptr00[0];
                    pp[1] = sptr10[0];
                    pp[2] = sptr20[0];
                    pp[3] = sptr30[0];
                    pp[4] = sptr40[0];
                    pp[5] = sptr50[0];
                    pp[6] = sptr60[0];
                    pp[7] = sptr70[0];
                    pp[8] = sptr01[0];
                    pp[9] = sptr11[0];
                    pp[10] = sptr21[0];
                    pp[11] = sptr31[0];
                    pp[12] = sptr41[0];
                    pp[13] = sptr51[0];
                    pp[14] = sptr61[0];
                    pp[15] = sptr71[0];
                    pp[16] = sptr02[0];
                    pp[17] = sptr12[0];
                    pp[18] = sptr22[0];
                    pp[19] = sptr32[0];
                    pp[20] = sptr42[0];
                    pp[21] = sptr52[0];
                    pp[22] = sptr62[0];
                    pp[23] = sptr72[0];
                    pp[24] = sptr03[0];
                    pp[25] = sptr13[0];
                    pp[26] = sptr23[0];
                    pp[27] = sptr33[0];
                    pp[28] = sptr43[0];
                    pp[29] = sptr53[0];
                    pp[30] = sptr63[0];
                    pp[31] = sptr73[0];
                    pp += 32;
                }
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 3 < max_kk; kk += 4)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int p2 = (k + kk + 2) / maxk;
                    int p3 = (k + kk + 3) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int uv2 = (k + kk + 2) % maxk;
                    int uv3 = (k + kk + 3) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int u2 = uv2 / kernel_w;
                    int u3 = uv3 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;
                    int v2 = uv2 % kernel_w;
                    int v3 = uv3 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);
                    const Mat img2 = bottom_blob.channel(p2);
                    const Mat img3 = bottom_blob.channel(p3);

                    int x00 = dx0 + dilation_w * v0;
                    int x01 = dx1 + dilation_w * v0;
                    int x02 = dx2 + dilation_w * v0;
                    int x03 = dx3 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;
                    int y01 = dy1 + dilation_h * u0;
                    int y02 = dy2 + dilation_h * u0;
                    int y03 = dy3 + dilation_h * u0;

                    int x10 = dx0 + dilation_w * v1;
                    int x11 = dx1 + dilation_w * v1;
                    int x12 = dx2 + dilation_w * v1;
                    int x13 = dx3 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;
                    int y11 = dy1 + dilation_h * u1;
                    int y12 = dy2 + dilation_h * u1;
                    int y13 = dy3 + dilation_h * u1;

                    int x20 = dx0 + dilation_w * v2;
                    int x21 = dx1 + dilation_w * v2;
                    int x22 = dx2 + dilation_w * v2;
                    int x23 = dx3 + dilation_w * v2;
                    int y20 = dy0 + dilation_h * u2;
                    int y21 = dy1 + dilation_h * u2;
                    int y22 = dy2 + dilation_h * u2;
                    int y23 = dy3 + dilation_h * u2;

                    int x30 = dx0 + dilation_w * v3;
                    int x31 = dx1 + dilation_w * v3;
                    int x32 = dx2 + dilation_w * v3;
                    int x33 = dx3 + dilation_w * v3;
                    int y30 = dy0 + dilation_h * u3;
                    int y31 = dy1 + dilation_h * u3;
                    int y32 = dy2 + dilation_h * u3;
                    int y33 = dy3 + dilation_h * u3;

                    const signed char* sptr00 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr01 = img0.row<const signed char>(y01) + x01;
                    const signed char* sptr02 = img0.row<const signed char>(y02) + x02;
                    const signed char* sptr03 = img0.row<const signed char>(y03) + x03;

                    const signed char* sptr10 = img1.row<const signed char>(y10) + x10;
                    const signed char* sptr11 = img1.row<const signed char>(y11) + x11;
                    const signed char* sptr12 = img1.row<const signed char>(y12) + x12;
                    const signed char* sptr13 = img1.row<const signed char>(y13) + x13;

                    const signed char* sptr20 = img2.row<const signed char>(y20) + x20;
                    const signed char* sptr21 = img2.row<const signed char>(y21) + x21;
                    const signed char* sptr22 = img2.row<const signed char>(y22) + x22;
                    const signed char* sptr23 = img2.row<const signed char>(y23) + x23;

                    const signed char* sptr30 = img3.row<const signed char>(y30) + x30;
                    const signed char* sptr31 = img3.row<const signed char>(y31) + x31;
                    const signed char* sptr32 = img3.row<const signed char>(y32) + x32;
                    const signed char* sptr33 = img3.row<const signed char>(y33) + x33;

                    pp[0] = sptr00[0];
                    pp[1] = sptr10[0];
                    pp[2] = sptr20[0];
                    pp[3] = sptr30[0];
                    pp[4] = sptr01[0];
                    pp[5] = sptr11[0];
                    pp[6] = sptr21[0];
                    pp[7] = sptr31[0];
                    pp[8] = sptr02[0];
                    pp[9] = sptr12[0];
                    pp[10] = sptr22[0];
                    pp[11] = sptr32[0];
                    pp[12] = sptr03[0];
                    pp[13] = sptr13[0];
                    pp[14] = sptr23[0];
                    pp[15] = sptr33[0];
                    pp += 16;
                }
#endif // __ARM_FEATURE_DOTPROD
                for (; kk + 1 < max_kk; kk += 2)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);

                    int x00 = dx0 + dilation_w * v0;
                    int x01 = dx1 + dilation_w * v0;
                    int x02 = dx2 + dilation_w * v0;
                    int x03 = dx3 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;
                    int y01 = dy1 + dilation_h * u0;
                    int y02 = dy2 + dilation_h * u0;
                    int y03 = dy3 + dilation_h * u0;

                    int x10 = dx0 + dilation_w * v1;
                    int x11 = dx1 + dilation_w * v1;
                    int x12 = dx2 + dilation_w * v1;
                    int x13 = dx3 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;
                    int y11 = dy1 + dilation_h * u1;
                    int y12 = dy2 + dilation_h * u1;
                    int y13 = dy3 + dilation_h * u1;

                    const signed char* sptr00 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr01 = img0.row<const signed char>(y01) + x01;
                    const signed char* sptr02 = img0.row<const signed char>(y02) + x02;
                    const signed char* sptr03 = img0.row<const signed char>(y03) + x03;

                    const signed char* sptr10 = img1.row<const signed char>(y10) + x10;
                    const signed char* sptr11 = img1.row<const signed char>(y11) + x11;
                    const signed char* sptr12 = img1.row<const signed char>(y12) + x12;
                    const signed char* sptr13 = img1.row<const signed char>(y13) + x13;

                    pp[0] = sptr00[0];
                    pp[1] = sptr10[0];
                    pp[2] = sptr01[0];
                    pp[3] = sptr11[0];
                    pp[4] = sptr02[0];
                    pp[5] = sptr12[0];
                    pp[6] = sptr03[0];
                    pp[7] = sptr13[0];
                    pp += 8;
                }
            }
            for (; kk < max_kk / elempack; kk++)
            {
                int p = (k / elempack + kk) / maxk;
                int uv = (k / elempack + kk) % maxk;
                int u = uv / kernel_w;
                int v = uv % kernel_w;

                const Mat img = bottom_blob.channel(p);

                int x0 = dx0 + dilation_w * v;
                int x1 = dx1 + dilation_w * v;
                int x2 = dx2 + dilation_w * v;
                int x3 = dx3 + dilation_w * v;
                int y0 = dy0 + dilation_h * u;
                int y1 = dy1 + dilation_h * u;
                int y2 = dy2 + dilation_h * u;
                int y3 = dy3 + dilation_h * u;

                const signed char* sptr0 = img.row<const signed char>(y0) + x0 * elempack;
                const signed char* sptr1 = img.row<const signed char>(y1) + x1 * elempack;
                const signed char* sptr2 = img.row<const signed char>(y2) + x2 * elempack;
                const signed char* sptr3 = img.row<const signed char>(y3) + x3 * elempack;

                if (elempack == 8)
                {
#if __ARM_FEATURE_MATMUL_INT8
                    int8x8_t _r0 = vld1_s8(sptr0);
                    int8x8_t _r1 = vld1_s8(sptr1);
                    int8x8_t _r2 = vld1_s8(sptr2);
                    int8x8_t _r3 = vld1_s8(sptr3);
                    vst1_s8(pp, _r0);
                    vst1_s8(pp + 8, _r1);
                    vst1_s8(pp + 16, _r2);
                    vst1_s8(pp + 24, _r3);
                    pp += 32;
#elif __ARM_FEATURE_DOTPROD
                    int32x2x4_t _r0123;
                    _r0123.val[0] = vreinterpret_s32_s8(vld1_s8(sptr0));
                    _r0123.val[1] = vreinterpret_s32_s8(vld1_s8(sptr1));
                    _r0123.val[2] = vreinterpret_s32_s8(vld1_s8(sptr2));
                    _r0123.val[3] = vreinterpret_s32_s8(vld1_s8(sptr3));
                    vst4_s32((int*)pp, _r0123);
                    pp += 32;
#else  // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
                    int16x4x4_t _r0123;
                    _r0123.val[0] = vreinterpret_s16_s8(vld1_s8(sptr0));
                    _r0123.val[1] = vreinterpret_s16_s8(vld1_s8(sptr1));
                    _r0123.val[2] = vreinterpret_s16_s8(vld1_s8(sptr2));
                    _r0123.val[3] = vreinterpret_s16_s8(vld1_s8(sptr3));
                    vst4_s16((short*)pp, _r0123);
                    pp += 32;
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
                }
                if (elempack == 1)
                {
                    pp[0] = sptr0[0];
                    pp[1] = sptr1[0];
                    pp[2] = sptr2[0];
                    pp[3] = sptr3[0];
                    pp += 4;
                }
            }
        }
    }
#endif // __ARM_NEON
    for (; jj + 1 < max_jj; jj += 2)
    {
        int dy0 = (j + jj) / outw * stride_h;
        int dy1 = (j + jj + 1) / outw * stride_h;
        int dx0 = (j + jj) % outw * stride_w;
        int dx1 = (j + jj + 1) % outw * stride_w;

        if (dy0 == dy1)
        {
            int kk = 0;
#if __ARM_NEON
            if (elempack == 1)
            {
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int p2 = (k + kk + 2) / maxk;
                    int p3 = (k + kk + 3) / maxk;
                    int p4 = (k + kk + 4) / maxk;
                    int p5 = (k + kk + 5) / maxk;
                    int p6 = (k + kk + 6) / maxk;
                    int p7 = (k + kk + 7) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int uv2 = (k + kk + 2) % maxk;
                    int uv3 = (k + kk + 3) % maxk;
                    int uv4 = (k + kk + 4) % maxk;
                    int uv5 = (k + kk + 5) % maxk;
                    int uv6 = (k + kk + 6) % maxk;
                    int uv7 = (k + kk + 7) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int u2 = uv2 / kernel_w;
                    int u3 = uv3 / kernel_w;
                    int u4 = uv4 / kernel_w;
                    int u5 = uv5 / kernel_w;
                    int u6 = uv6 / kernel_w;
                    int u7 = uv7 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;
                    int v2 = uv2 % kernel_w;
                    int v3 = uv3 % kernel_w;
                    int v4 = uv4 % kernel_w;
                    int v5 = uv5 % kernel_w;
                    int v6 = uv6 % kernel_w;
                    int v7 = uv7 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);
                    const Mat img2 = bottom_blob.channel(p2);
                    const Mat img3 = bottom_blob.channel(p3);
                    const Mat img4 = bottom_blob.channel(p4);
                    const Mat img5 = bottom_blob.channel(p5);
                    const Mat img6 = bottom_blob.channel(p6);
                    const Mat img7 = bottom_blob.channel(p7);

                    int x00 = dx0 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;
                    int x10 = dx0 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;

                    int x20 = dx0 + dilation_w * v2;
                    int y20 = dy0 + dilation_h * u2;
                    int x30 = dx0 + dilation_w * v3;
                    int y30 = dy0 + dilation_h * u3;

                    int x40 = dx0 + dilation_w * v4;
                    int y40 = dy0 + dilation_h * u4;
                    int x50 = dx0 + dilation_w * v5;
                    int y50 = dy0 + dilation_h * u5;

                    int x60 = dx0 + dilation_w * v6;
                    int y60 = dy0 + dilation_h * u6;
                    int x70 = dx0 + dilation_w * v7;
                    int y70 = dy0 + dilation_h * u7;

                    const signed char* sptr0 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr1 = img1.row<const signed char>(y10) + x10;
                    const signed char* sptr2 = img2.row<const signed char>(y20) + x20;
                    const signed char* sptr3 = img3.row<const signed char>(y30) + x30;

                    const signed char* sptr4 = img4.row<const signed char>(y40) + x40;
                    const signed char* sptr5 = img5.row<const signed char>(y50) + x50;
                    const signed char* sptr6 = img6.row<const signed char>(y60) + x60;
                    const signed char* sptr7 = img7.row<const signed char>(y70) + x70;

                    pp[0] = sptr0[0];
                    pp[1] = sptr1[0];
                    pp[2] = sptr2[0];
                    pp[3] = sptr3[0];
                    pp[4] = sptr4[0];
                    pp[5] = sptr5[0];
                    pp[6] = sptr6[0];
                    pp[7] = sptr7[0];
                    pp[8] = sptr0[stride_w];
                    pp[9] = sptr1[stride_w];
                    pp[10] = sptr2[stride_w];
                    pp[11] = sptr3[stride_w];
                    pp[12] = sptr4[stride_w];
                    pp[13] = sptr5[stride_w];
                    pp[14] = sptr6[stride_w];
                    pp[15] = sptr7[stride_w];
                    pp += 16;
                }
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 3 < max_kk; kk += 4)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int p2 = (k + kk + 2) / maxk;
                    int p3 = (k + kk + 3) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int uv2 = (k + kk + 2) % maxk;
                    int uv3 = (k + kk + 3) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int u2 = uv2 / kernel_w;
                    int u3 = uv3 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;
                    int v2 = uv2 % kernel_w;
                    int v3 = uv3 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);
                    const Mat img2 = bottom_blob.channel(p2);
                    const Mat img3 = bottom_blob.channel(p3);

                    int x00 = dx0 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;
                    int x10 = dx0 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;
                    int x20 = dx0 + dilation_w * v2;
                    int y20 = dy0 + dilation_h * u2;
                    int x30 = dx0 + dilation_w * v3;
                    int y30 = dy0 + dilation_h * u3;

                    const signed char* sptr0 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr1 = img1.row<const signed char>(y10) + x10;
                    const signed char* sptr2 = img2.row<const signed char>(y20) + x20;
                    const signed char* sptr3 = img3.row<const signed char>(y30) + x30;

                    pp[0] = sptr0[0];
                    pp[1] = sptr1[0];
                    pp[2] = sptr2[0];
                    pp[3] = sptr3[0];
                    pp[4] = sptr0[stride_w];
                    pp[5] = sptr1[stride_w];
                    pp[6] = sptr2[stride_w];
                    pp[7] = sptr3[stride_w];
                    pp += 8;
                }
#endif // __ARM_FEATURE_DOTPROD
                for (; kk + 1 < max_kk; kk += 2)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);

                    int x00 = dx0 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;
                    int x10 = dx0 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;

                    const signed char* sptr0 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr1 = img1.row<const signed char>(y10) + x10;

                    pp[0] = sptr0[0];
                    pp[1] = sptr1[0];
                    pp[2] = sptr0[stride_w];
                    pp[3] = sptr1[stride_w];
                    pp += 4;
                }
            }
#endif // __ARM_NEON
            for (; kk < max_kk / elempack; kk++)
            {
                int p = (k / elempack + kk) / maxk;
                int uv = (k / elempack + kk) % maxk;
                int u = uv / kernel_w;
                int v = uv % kernel_w;

                const Mat img = bottom_blob.channel(p);

                int x0 = dx0 + dilation_w * v;
                int y0 = dy0 + dilation_h * u;

                const signed char* sptr = img.row<const signed char>(y0) + x0 * elempack;

#if __ARM_NEON
                if (elempack == 8)
                {
#if __ARM_FEATURE_MATMUL_INT8
                    int8x8_t _r0 = vld1_s8(sptr);
                    int8x8_t _r1 = vld1_s8(sptr + stride_w * 8);
                    vst1_s8(pp, _r0);
                    vst1_s8(pp + 8, _r1);
                    pp += 16;
#elif __ARM_FEATURE_DOTPROD
                    int32x2x2_t _r01;
                    _r01.val[0] = vreinterpret_s32_s8(vld1_s8(sptr));
                    _r01.val[1] = vreinterpret_s32_s8(vld1_s8(sptr + stride_w * 8));
                    vst2_s32((int*)pp, _r01);
                    pp += 16;
#else  // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
                    int16x4x2_t _r01;
                    _r01.val[0] = vreinterpret_s16_s8(vld1_s8(sptr));
                    _r01.val[1] = vreinterpret_s16_s8(vld1_s8(sptr + stride_w * 8));
                    vst2_s16((short*)pp, _r01);
                    pp += 16;
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
                }
#endif // __ARM_NEON
                if (elempack == 1)
                {
                    pp[0] = sptr[0];
                    pp[1] = sptr[stride_w];
                    pp += 2;
                }
            }
        }
        else
        {
            int kk = 0;
#if __ARM_NEON
            if (elempack == 1)
            {
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int p2 = (k + kk + 2) / maxk;
                    int p3 = (k + kk + 3) / maxk;
                    int p4 = (k + kk + 4) / maxk;
                    int p5 = (k + kk + 5) / maxk;
                    int p6 = (k + kk + 6) / maxk;
                    int p7 = (k + kk + 7) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int uv2 = (k + kk + 2) % maxk;
                    int uv3 = (k + kk + 3) % maxk;
                    int uv4 = (k + kk + 4) % maxk;
                    int uv5 = (k + kk + 5) % maxk;
                    int uv6 = (k + kk + 6) % maxk;
                    int uv7 = (k + kk + 7) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int u2 = uv2 / kernel_w;
                    int u3 = uv3 / kernel_w;
                    int u4 = uv4 / kernel_w;
                    int u5 = uv5 / kernel_w;
                    int u6 = uv6 / kernel_w;
                    int u7 = uv7 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;
                    int v2 = uv2 % kernel_w;
                    int v3 = uv3 % kernel_w;
                    int v4 = uv4 % kernel_w;
                    int v5 = uv5 % kernel_w;
                    int v6 = uv6 % kernel_w;
                    int v7 = uv7 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);
                    const Mat img2 = bottom_blob.channel(p2);
                    const Mat img3 = bottom_blob.channel(p3);
                    const Mat img4 = bottom_blob.channel(p4);
                    const Mat img5 = bottom_blob.channel(p5);
                    const Mat img6 = bottom_blob.channel(p6);
                    const Mat img7 = bottom_blob.channel(p7);

                    int x00 = dx0 + dilation_w * v0;
                    int x01 = dx1 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;
                    int y01 = dy1 + dilation_h * u0;
                    int x10 = dx0 + dilation_w * v1;
                    int x11 = dx1 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;
                    int y11 = dy1 + dilation_h * u1;

                    int x20 = dx0 + dilation_w * v2;
                    int x21 = dx1 + dilation_w * v2;
                    int y20 = dy0 + dilation_h * u2;
                    int y21 = dy1 + dilation_h * u2;
                    int x30 = dx0 + dilation_w * v3;
                    int x31 = dx1 + dilation_w * v3;
                    int y30 = dy0 + dilation_h * u3;
                    int y31 = dy1 + dilation_h * u3;

                    int x40 = dx0 + dilation_w * v4;
                    int x41 = dx1 + dilation_w * v4;
                    int y40 = dy0 + dilation_h * u4;
                    int y41 = dy1 + dilation_h * u4;
                    int x50 = dx0 + dilation_w * v5;
                    int x51 = dx1 + dilation_w * v5;
                    int y50 = dy0 + dilation_h * u5;
                    int y51 = dy1 + dilation_h * u5;

                    int x60 = dx0 + dilation_w * v6;
                    int x61 = dx1 + dilation_w * v6;
                    int y60 = dy0 + dilation_h * u6;
                    int y61 = dy1 + dilation_h * u6;
                    int x70 = dx0 + dilation_w * v7;
                    int x71 = dx1 + dilation_w * v7;
                    int y70 = dy0 + dilation_h * u7;
                    int y71 = dy1 + dilation_h * u7;

                    const signed char* sptr00 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr01 = img0.row<const signed char>(y01) + x01;
                    const signed char* sptr10 = img1.row<const signed char>(y10) + x10;
                    const signed char* sptr11 = img1.row<const signed char>(y11) + x11;
                    const signed char* sptr20 = img2.row<const signed char>(y20) + x20;
                    const signed char* sptr21 = img2.row<const signed char>(y21) + x21;
                    const signed char* sptr30 = img3.row<const signed char>(y30) + x30;
                    const signed char* sptr31 = img3.row<const signed char>(y31) + x31;

                    const signed char* sptr40 = img4.row<const signed char>(y40) + x40;
                    const signed char* sptr41 = img4.row<const signed char>(y41) + x41;
                    const signed char* sptr50 = img5.row<const signed char>(y50) + x50;
                    const signed char* sptr51 = img5.row<const signed char>(y51) + x51;
                    const signed char* sptr60 = img6.row<const signed char>(y60) + x60;
                    const signed char* sptr61 = img6.row<const signed char>(y61) + x61;
                    const signed char* sptr70 = img7.row<const signed char>(y70) + x70;
                    const signed char* sptr71 = img7.row<const signed char>(y71) + x71;

                    pp[0] = sptr00[0];
                    pp[1] = sptr10[0];
                    pp[2] = sptr20[0];
                    pp[3] = sptr30[0];
                    pp[4] = sptr40[0];
                    pp[5] = sptr50[0];
                    pp[6] = sptr60[0];
                    pp[7] = sptr70[0];
                    pp[8] = sptr01[0];
                    pp[9] = sptr11[0];
                    pp[10] = sptr21[0];
                    pp[11] = sptr31[0];
                    pp[12] = sptr41[0];
                    pp[13] = sptr51[0];
                    pp[14] = sptr61[0];
                    pp[15] = sptr71[0];
                    pp += 16;
                }
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 3 < max_kk; kk += 4)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int p2 = (k + kk + 2) / maxk;
                    int p3 = (k + kk + 3) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int uv2 = (k + kk + 2) % maxk;
                    int uv3 = (k + kk + 3) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int u2 = uv2 / kernel_w;
                    int u3 = uv3 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;
                    int v2 = uv2 % kernel_w;
                    int v3 = uv3 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);
                    const Mat img2 = bottom_blob.channel(p2);
                    const Mat img3 = bottom_blob.channel(p3);

                    int x00 = dx0 + dilation_w * v0;
                    int x01 = dx1 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;
                    int y01 = dy1 + dilation_h * u0;
                    int x10 = dx0 + dilation_w * v1;
                    int x11 = dx1 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;
                    int y11 = dy1 + dilation_h * u1;
                    int x20 = dx0 + dilation_w * v2;
                    int x21 = dx1 + dilation_w * v2;
                    int y20 = dy0 + dilation_h * u2;
                    int y21 = dy1 + dilation_h * u2;
                    int x30 = dx0 + dilation_w * v3;
                    int x31 = dx1 + dilation_w * v3;
                    int y30 = dy0 + dilation_h * u3;
                    int y31 = dy1 + dilation_h * u3;

                    const signed char* sptr00 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr01 = img0.row<const signed char>(y01) + x01;
                    const signed char* sptr10 = img1.row<const signed char>(y10) + x10;
                    const signed char* sptr11 = img1.row<const signed char>(y11) + x11;
                    const signed char* sptr20 = img2.row<const signed char>(y20) + x20;
                    const signed char* sptr21 = img2.row<const signed char>(y21) + x21;
                    const signed char* sptr30 = img3.row<const signed char>(y30) + x30;
                    const signed char* sptr31 = img3.row<const signed char>(y31) + x31;

                    pp[0] = sptr00[0];
                    pp[1] = sptr10[0];
                    pp[2] = sptr20[0];
                    pp[3] = sptr30[0];
                    pp[4] = sptr01[0];
                    pp[5] = sptr11[0];
                    pp[6] = sptr21[0];
                    pp[7] = sptr31[0];
                    pp += 8;
                }
#endif // __ARM_FEATURE_DOTPROD
                for (; kk + 1 < max_kk; kk += 2)
                {
                    int p0 = (k + kk) / maxk;
                    int p1 = (k + kk + 1) / maxk;
                    int uv0 = (k + kk) % maxk;
                    int uv1 = (k + kk + 1) % maxk;
                    int u0 = uv0 / kernel_w;
                    int u1 = uv1 / kernel_w;
                    int v0 = uv0 % kernel_w;
                    int v1 = uv1 % kernel_w;

                    const Mat img0 = bottom_blob.channel(p0);
                    const Mat img1 = bottom_blob.channel(p1);

                    int x00 = dx0 + dilation_w * v0;
                    int x01 = dx1 + dilation_w * v0;
                    int y00 = dy0 + dilation_h * u0;
                    int y01 = dy1 + dilation_h * u0;
                    int x10 = dx0 + dilation_w * v1;
                    int x11 = dx1 + dilation_w * v1;
                    int y10 = dy0 + dilation_h * u1;
                    int y11 = dy1 + dilation_h * u1;

                    const signed char* sptr00 = img0.row<const signed char>(y00) + x00;
                    const signed char* sptr01 = img0.row<const signed char>(y01) + x01;
                    const signed char* sptr10 = img1.row<const signed char>(y10) + x10;
                    const signed char* sptr11 = img1.row<const signed char>(y11) + x11;

                    pp[0] = sptr00[0];
                    pp[1] = sptr10[0];
                    pp[2] = sptr01[0];
                    pp[3] = sptr11[0];
                    pp += 4;
                }
            }
#endif // __ARM_NEON
            for (; kk < max_kk / elempack; kk++)
            {
                int p = (k / elempack + kk) / maxk;
                int uv = (k / elempack + kk) % maxk;
                int u = uv / kernel_w;
                int v = uv % kernel_w;

                const Mat img = bottom_blob.channel(p);

                int x0 = dx0 + dilation_w * v;
                int x1 = dx1 + dilation_w * v;
                int y0 = dy0 + dilation_h * u;
                int y1 = dy1 + dilation_h * u;

                const signed char* sptr0 = img.row<const signed char>(y0) + x0 * elempack;
                const signed char* sptr1 = img.row<const signed char>(y1) + x1 * elempack;

#if __ARM_NEON
                if (elempack == 8)
                {
#if __ARM_FEATURE_MATMUL_INT8
                    int8x8_t _r0 = vld1_s8(sptr0);
                    int8x8_t _r1 = vld1_s8(sptr1);
                    vst1_s8(pp, _r0);
                    vst1_s8(pp + 8, _r1);
                    pp += 16;
#elif __ARM_FEATURE_DOTPROD
                    int32x2x2_t _r01;
                    _r01.val[0] = vreinterpret_s32_s8(vld1_s8(sptr0));
                    _r01.val[1] = vreinterpret_s32_s8(vld1_s8(sptr1));
                    vst2_s32((int*)pp, _r01);
                    pp += 16;
#else  // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
                    int16x4x2_t _r01;
                    _r01.val[0] = vreinterpret_s16_s8(vld1_s8(sptr0));
                    _r01.val[1] = vreinterpret_s16_s8(vld1_s8(sptr1));
                    vst2_s16((short*)pp, _r01);
                    pp += 16;
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
                }
#endif // __ARM_NEON
                if (elempack == 1)
                {
                    pp[0] = sptr0[0];
                    pp[1] = sptr1[0];
                    pp += 2;
                }
            }
        }
    }
    for (; jj < max_jj; jj++)
    {
        int dy = (j + jj) / outw * stride_h;
        int dx = (j + jj) % outw * stride_w;

        int kk = 0;
        for (; kk < max_kk / elempack; kk++)
        {
            int p = (k / elempack + kk) / maxk;
            int uv = (k / elempack + kk) % maxk;
            int u = uv / kernel_w;
            int v = uv % kernel_w;

            const Mat img = bottom_blob.channel(p);

            int x = dx + dilation_w * v;
            int y = dy + dilation_h * u;

            const signed char* sptr = img.row<const signed char>(y) + x * elempack;

#if __ARM_NEON
            if (elempack == 8)
            {
                vst1_s8(pp, vld1_s8(sptr));
                pp += 8;
            }
#endif // __ARM_NEON
            if (elempack == 1)
            {
                pp[0] = sptr[0];
                pp += 1;
            }
        }
    }
}

// Compile-time flavour of the int8 im2col input-tile gatherer.
//
// The kernel size, dilation and stride are template parameters here and are
// simply forwarded, as ordinary arguments, to
// convolution_im2col_input_tile_int8_impl(); no other work is done.
// (Presumably this lets the compiler fold the constants into the inlined impl;
// that depends on the impl's inlining attributes, which are declared elsewhere.)
//
// The emitted symbol name depends on the ISA features this translation unit is
// compiled with: *_i8mm with __ARM_FEATURE_MATMUL_INT8, *_asimddp with
// __ARM_FEATURE_DOTPROD, and the unsuffixed name otherwise.
//
// bottom_blob  input feature map that is sampled
// B            destination tile receiving the gathered int8 values
// j, max_jj    first output position and number of positions in this tile
// k, max_kk    first reduction index and reduction length in this tile
template<int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h>
#if __ARM_FEATURE_MATMUL_INT8
void convolution_im2col_input_tile_int8_i8mm(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk)
#elif __ARM_FEATURE_DOTPROD
void convolution_im2col_input_tile_int8_asimddp(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk)
#else  // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk)
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
{
    // hand the compile-time geometry over as plain runtime arguments
    convolution_im2col_input_tile_int8_impl(bottom_blob, B, j, max_jj, k, max_kk, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h);
}

// Explicit instantiations of the templated input-tile gatherer.
// Template argument order: <kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h>.
// The same six geometries are instantiated for whichever symbol name the
// current ISA feature macros select:
//   1x1 stride 2, 3x3 stride 1, 3x3 stride 2, 5x5 stride 1, 5x5 stride 2, 7x7 stride 2
// (all with dilation 1). These are the combinations that the runtime dispatcher
// convolution_im2col_input_tile_int8(..., kernel_w, ..., stride_h) calls by name.
#if __ARM_FEATURE_MATMUL_INT8
// i8mm build
template void convolution_im2col_input_tile_int8_i8mm<1, 1, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
template void convolution_im2col_input_tile_int8_i8mm<3, 3, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
template void convolution_im2col_input_tile_int8_i8mm<3, 3, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
template void convolution_im2col_input_tile_int8_i8mm<5, 5, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
template void convolution_im2col_input_tile_int8_i8mm<5, 5, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
template void convolution_im2col_input_tile_int8_i8mm<7, 7, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
#elif __ARM_FEATURE_DOTPROD
// dot-product build
template void convolution_im2col_input_tile_int8_asimddp<1, 1, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
template void convolution_im2col_input_tile_int8_asimddp<3, 3, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
template void convolution_im2col_input_tile_int8_asimddp<3, 3, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
template void convolution_im2col_input_tile_int8_asimddp<5, 5, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
template void convolution_im2col_input_tile_int8_asimddp<5, 5, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
template void convolution_im2col_input_tile_int8_asimddp<7, 7, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
#else  // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
// baseline build (neither feature macro defined)
template void convolution_im2col_input_tile_int8<1, 1, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
template void convolution_im2col_input_tile_int8<3, 3, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
template void convolution_im2col_input_tile_int8<3, 3, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
template void convolution_im2col_input_tile_int8<5, 5, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
template void convolution_im2col_input_tile_int8<5, 5, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
template void convolution_im2col_input_tile_int8<7, 7, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk);
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD

// Runtime dispatcher for the int8 im2col input-tile gatherer.
//
// Routes a handful of common kernel/dilation/stride geometries to their
// compile-time specialised variants and sends everything else to the generic
// convolution_im2col_input_tile_int8_impl().
//
// bottom_blob                 input feature map that is sampled
// B                           destination tile receiving the gathered values
// j, max_jj                   first output position / number of positions
// k, max_kk                   first reduction index / reduction length
// kernel_*, dilation_*, stride_*  convolution geometry
static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h)
{
    // The specialised template carries an ISA suffix that matches the feature
    // macros of this translation unit; resolve the spelling once.
#if __ARM_FEATURE_MATMUL_INT8
#define NCNN_IM2COL_INPUT_TILE_INT8_FIXED convolution_im2col_input_tile_int8_i8mm
#elif __ARM_FEATURE_DOTPROD
#define NCNN_IM2COL_INPUT_TILE_INT8_FIXED convolution_im2col_input_tile_int8_asimddp
#else  // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
#define NCNN_IM2COL_INPUT_TILE_INT8_FIXED convolution_im2col_input_tile_int8
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD

    const bool unit_stride = stride_w == 1 && stride_h == 1;
    const bool double_stride = stride_w == 2 && stride_h == 2;

    if (kernel_w == 1 && kernel_h == 1)
    {
        // pointwise kernels: dilation is deliberately not inspected here
        if (unit_stride)
        {
            convolution_im2col_input_tile_conv1x1s1d1_int8(bottom_blob, B, j, max_jj, k, max_kk);
            return;
        }
        if (double_stride)
        {
            NCNN_IM2COL_INPUT_TILE_INT8_FIXED<1, 1, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk);
            return;
        }
    }
    else if (kernel_w == kernel_h && dilation_w == 1 && dilation_h == 1)
    {
        // square, undilated kernels with a pre-instantiated specialisation
        switch (kernel_w)
        {
        case 3:
            if (unit_stride)
            {
                NCNN_IM2COL_INPUT_TILE_INT8_FIXED<3, 3, 1, 1, 1, 1>(bottom_blob, B, j, max_jj, k, max_kk);
                return;
            }
            if (double_stride)
            {
                NCNN_IM2COL_INPUT_TILE_INT8_FIXED<3, 3, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk);
                return;
            }
            break;
        case 5:
            if (unit_stride)
            {
                NCNN_IM2COL_INPUT_TILE_INT8_FIXED<5, 5, 1, 1, 1, 1>(bottom_blob, B, j, max_jj, k, max_kk);
                return;
            }
            if (double_stride)
            {
                NCNN_IM2COL_INPUT_TILE_INT8_FIXED<5, 5, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk);
                return;
            }
            break;
        case 7:
            // only the stride-2 variant is specialised for 7x7
            if (double_stride)
            {
                NCNN_IM2COL_INPUT_TILE_INT8_FIXED<7, 7, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk);
                return;
            }
            break;
        default:
            break;
        }
    }

#undef NCNN_IM2COL_INPUT_TILE_INT8_FIXED

    // no specialisation matched: use the fully parameterised implementation
    convolution_im2col_input_tile_int8_impl(bottom_blob, B, j, max_jj, k, max_kk, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h);
}

static void convolution_im2col_gemm_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, int kernel_w, int kernel_h, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        convolution_im2col_gemm_transform_kernel_int8_i8mm(kernel, AT, inch, outch, kernel_w, kernel_h, opt);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        convolution_im2col_gemm_transform_kernel_int8_asimddp(kernel, AT, inch, outch, kernel_w, kernel_h, opt);
        return;
    }
#endif

    // NCNN_LOGE("convolution_im2col_gemm_transform_kernel");
    const int maxk = kernel_w * kernel_h;

    const int M = outch;
    const int K = inch * maxk;

    int TILE_M, TILE_N, TILE_K;
    convolution_im2col_gemm_get_optimal_tile_mnk_int8(M, 0, K, TILE_M, TILE_N, TILE_K, opt.num_threads);

    const int nn_M = (M + TILE_M - 1) / TILE_M;

    int elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        elempack = inch % 8 == 0 ? 8 : 1;
    }
#endif // __ARM_NEON

    // maxk-inch-outch to pa-maxk-inch/pa-outch
    Mat A_data;
    if (maxk == 1)
    {
        A_data = kernel.reshape(maxk * inch, outch);
    }
    else
    {
        Mat weight_data_r2 = kernel.reshape(maxk, inch, outch);

        A_data.create(maxk * inch, outch, (size_t)1u, 1);

        for (int q = 0; q < outch; q += 1)
        {
            signed char* g00 = A_data.row<signed char>(q);

            for (int p = 0; p + (elempack - 1) < inch; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        const signed char* k00 = weight_data_r2.channel(q).row<const signed char>(p + i);
                        g00[0] = k00[k];
                        g00++;
                    }
                }
            }
        }
    }

    AT.create(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, (size_t)1u, 1);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int ppj = 0; ppj < nn_M; ppj++)
    {
        const int i = ppj * TILE_M;

        const int max_ii = std::min((M - i), TILE_M);

        for (int k = 0; k < K; k += TILE_K)
        {
            const int max_kk = std::min((K - k), TILE_K);

            Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1);

            convolution_im2col_pack_A_tile_int8(A_data, AT_tile, i, max_ii, k, max_kk);
        }
    }
}

static int convolution_im2col_gemm_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int nT, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        return convolution_im2col_gemm_int8_i8mm(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        return convolution_im2col_gemm_int8_asimddp(bottom_blob, top_blob, AT, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, nT, opt);
    }
#endif

    const int maxk = kernel_w * kernel_h;

    const int M = top_blob.c * top_blob.elempack;
    const int N = top_blob.w * top_blob.h;
    const int K = bottom_blob.c * bottom_blob.elempack * maxk;

    int TILE_M, TILE_N, TILE_K;
    convolution_im2col_gemm_get_optimal_tile_mnk_int8(M, N, K, TILE_M, TILE_N, TILE_K, nT);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K);

    Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 1u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    #pragma omp parallel for num_threads(nT)
    for (int ppjk = 0; ppjk < nn_NK; ppjk++)
    {
        const int ppj = ppjk / nn_K;
        const int ppk = ppjk % nn_K;

        const int j = ppj * TILE_N;
        const int k = ppk * TILE_K;

        const int max_jj = std::min((N - j), TILE_N);
        const int max_kk = std::min((K - k), TILE_K);

        Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

        // im2col
        convolution_im2col_input_tile_int8(bottom_blob, BT_tile, j, max_jj, k, max_kk, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h);
    }

    Mat topT_tileX;
    if (K > TILE_K)
    {
        topT_tileX.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
        if (topT_tileX.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int ppj = 0; ppj < nn_M; ppj++)
    {
        const int i = ppj * TILE_M;

        Mat topT_tile;
        if (K > TILE_K)
            topT_tile = topT_tileX.channel(get_omp_thread_num());

        const int max_ii = std::min((M - i), TILE_M);

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                const Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1);

                const Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

                bool k_end = k + TILE_K >= K;

                convolution_gemm_transB_packed_tile_int8(AT_tile, BT_tile, topT_tile, top_blob, i, max_ii, j, max_jj, k, max_kk, k_end);
            }
        }
    }

    return 0;
}


================================================
FILE: src/layer/arm/convolution_packed.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convolution_transform_kernel_packed(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
    const int maxk = kernel_w * kernel_h;

    // src = kw-kh-inch-outch
    // dst = pb-pa-kw-kh-inch/pa-outch/pb

    // clang-format off
    // *INDENT-OFF*
#if __ARM_NEON
#if __aarch64__
    if (outch >= 8)
    {
        if (inch >= 8)
            kernel_tm.create(8 * 8 * maxk, inch / 8 + (inch % 8) / 4 + (inch % 4) / 2 + inch % 2, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2);
        else if (inch >= 4)
            kernel_tm.create(8 * 4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2);
        else if (inch >= 2)
            kernel_tm.create(8 * 2 * maxk, inch / 2 + inch % 2, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2);
        else
            kernel_tm.create(8 * maxk, inch, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2);
    }
    else
#endif // __aarch64__
    if (outch >= 4)
    {
#if __aarch64__
        if (inch >= 8)
            kernel_tm.create(4 * 8 * maxk, inch / 8 + (inch % 8) / 4 + (inch % 4) / 2 + inch % 2, outch / 4 + (outch % 4) / 2 + outch % 2);
        else
#endif // __aarch64__
        if (inch >= 4)
            kernel_tm.create(4 * 4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch / 4 + (outch % 4) / 2 + outch % 2);
        else if (inch >= 2)
            kernel_tm.create(4 * 2 * maxk, inch / 2 + inch % 2, outch / 4 + (outch % 4) / 2 + outch % 2);
        else
            kernel_tm.create(4 * maxk, inch, outch / 4 + (outch % 4) / 2 + outch % 2);
    }
    else
#endif // __ARM_NEON
    if (outch >= 2)
    {
#if __ARM_NEON
#if __aarch64__
        if (inch >= 8)
            kernel_tm.create(2 * 8 * maxk, inch / 8 + (inch % 8) / 4 + (inch % 4) / 2 + inch % 2, outch / 2 + outch % 2);
        else
#endif // __aarch64__
        if (inch >= 4)
            kernel_tm.create(2 * 4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch / 2 + outch % 2);
        else
#endif // __ARM_NEON
        if (inch >= 2)
            kernel_tm.create(2 * 2 * maxk, inch / 2 + inch % 2, outch / 2 + outch % 2);
        else
            kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2);
    }
    else
    {
#if __ARM_NEON
#if __aarch64__
        if (inch >= 8)
            kernel_tm.create(8 * maxk, inch / 8 + (inch % 8) / 4 + (inch % 4) / 2 + inch % 2, outch);
        else
#endif // __aarch64__
        if (inch >= 4)
            kernel_tm.create(4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch);
        else
#endif // __ARM_NEON
        if (inch >= 2)
            kernel_tm.create(2 * maxk, inch / 2 + inch % 2, outch);
        else
            kernel_tm.create(maxk, inch, outch);
    }
    // *INDENT-ON*
    // clang-format on

    int q = 0;
#if __ARM_NEON
#if __aarch64__
    for (; q + 7 < outch; q += 8)
    {
        const float* kptr0 = (const float*)kernel + q * inch * maxk;
        const float* kptr1 = (const float*)kernel + (q + 1) * inch * maxk;
        const float* kptr2 = (const float*)kernel + (q + 2) * inch * maxk;
        const float* kptr3 = (const float*)kernel + (q + 3) * inch * maxk;
        const float* kptr4 = (const float*)kernel + (q + 4) * inch * maxk;
        const float* kptr5 = (const float*)kernel + (q + 5) * inch * maxk;
        const float* kptr6 = (const float*)kernel + (q + 6) * inch * maxk;
        const float* kptr7 = (const float*)kernel + (q + 7) * inch * maxk;

        float* g00 = kernel_tm.channel(q / 8);

        int p = 0;
        for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;
                const float* k4 = kptr4 + p * maxk;
                const float* k5 = kptr5 + p * maxk;
                const float* k6 = kptr6 + p * maxk;
                const float* k7 = kptr7 + p * maxk;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = k0[k];
                    g00[1] = k1[k];
                    g00[2] = k2[k];
                    g00[3] = k3[k];
                    g00[4] = k4[k];
                    g00[5] = k5[k];
                    g00[6] = k6[k];
                    g00[7] = k7[k];
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    k4 += maxk;
                    k5 += maxk;
                    k6 += maxk;
                    k7 += maxk;
                    g00 += 8;
                }
            }
        }
        for (; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;
                const float* k4 = kptr4 + p * maxk;
                const float* k5 = kptr5 + p * maxk;
                const float* k6 = kptr6 + p * maxk;
                const float* k7 = kptr7 + p * maxk;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = k0[k];
                    g00[1] = k1[k];
                    g00[2] = k2[k];
                    g00[3] = k3[k];
                    g00[4] = k4[k];
                    g00[5] = k5[k];
                    g00[6] = k6[k];
                    g00[7] = k7[k];
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    k4 += maxk;
                    k5 += maxk;
                    k6 += maxk;
                    k7 += maxk;
                    g00 += 8;
                }
            }
        }
        for (; p + 1 < inch; p += 2)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;
                const float* k4 = kptr4 + p * maxk;
                const float* k5 = kptr5 + p * maxk;
                const float* k6 = kptr6 + p * maxk;
                const float* k7 = kptr7 + p * maxk;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = k0[k];
                    g00[1] = k1[k];
                    g00[2] = k2[k];
                    g00[3] = k3[k];
                    g00[4] = k4[k];
                    g00[5] = k5[k];
                    g00[6] = k6[k];
                    g00[7] = k7[k];
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    k4 += maxk;
                    k5 += maxk;
                    k6 += maxk;
                    k7 += maxk;
                    g00 += 8;
                }
            }
        }
        for (; p < inch; p++)
        {
            const float* k0 = kptr0 + p * maxk;
            const float* k1 = kptr1 + p * maxk;
            const float* k2 = kptr2 + p * maxk;
            const float* k3 = kptr3 + p * maxk;
            const float* k4 = kptr4 + p * maxk;
            const float* k5 = kptr5 + p * maxk;
            const float* k6 = kptr6 + p * maxk;
            const float* k7 = kptr7 + p * maxk;

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = k0[k];
                g00[1] = k1[k];
                g00[2] = k2[k];
                g00[3] = k3[k];
                g00[4] = k4[k];
                g00[5] = k5[k];
                g00[6] = k6[k];
                g00[7] = k7[k];
                g00 += 8;
            }
        }
    }
#endif // __aarch64__
    for (; q + 3 < outch; q += 4)
    {
        const float* kptr0 = (const float*)kernel + q * inch * maxk;
        const float* kptr1 = (const float*)kernel + (q + 1) * inch * maxk;
        const float* kptr2 = (const float*)kernel + (q + 2) * inch * maxk;
        const float* kptr3 = (const float*)kernel + (q + 3) * inch * maxk;

#if __aarch64__
        float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4);
#else
        float* g00 = kernel_tm.channel(q / 4);
#endif

        int p = 0;
#if __aarch64__
        for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = k0[k];
                    g00[1] = k1[k];
                    g00[2] = k2[k];
                    g00[3] = k3[k];
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    g00 += 4;
                }
            }
        }
#endif // __aarch64__
        for (; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = k0[k];
                    g00[1] = k1[k];
                    g00[2] = k2[k];
                    g00[3] = k3[k];
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    g00 += 4;
                }
            }
        }
        for (; p + 1 < inch; p += 2)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = k0[k];
                    g00[1] = k1[k];
                    g00[2] = k2[k];
                    g00[3] = k3[k];
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    g00 += 4;
                }
            }
        }
        for (; p < inch; p++)
        {
            const float* k0 = kptr0 + p * maxk;
            const float* k1 = kptr1 + p * maxk;
            const float* k2 = kptr2 + p * maxk;
            const float* k3 = kptr3 + p * maxk;

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = k0[k];
                g00[1] = k1[k];
                g00[2] = k2[k];
                g00[3] = k3[k];
                g00 += 4;
            }
        }
    }
#endif // __ARM_NEON
    for (; q + 1 < outch; q += 2)
    {
        const float* kptr0 = (const float*)kernel + q * inch * maxk;
        const float* kptr1 = (const float*)kernel + (q + 1) * inch * maxk;

#if __aarch64__
        float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2);
#elif __ARM_NEON
        float* g00 = kernel_tm.channel(q / 4 + (q % 4) / 2);
#else
        float* g00 = kernel_tm.channel(q / 2);
#endif

        int p = 0;
#if __ARM_NEON
#if __aarch64__
        for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk + k;
                const float* k1 = kptr1 + p * maxk + k;

                g00[0] = k0[0];
                g00[1] = k0[maxk];
                g00[2] = k0[maxk * 2];
                g00[3] = k0[maxk * 3];
                g00[4] = k0[maxk * 4];
                g00[5] = k0[maxk * 5];
                g00[6] = k0[maxk * 6];
                g00[7] = k0[maxk * 7];
                g00[8] = k1[0];
                g00[9] = k1[maxk];
                g00[10] = k1[maxk * 2];
                g00[11] = k1[maxk * 3];
                g00[12] = k1[maxk * 4];
                g00[13] = k1[maxk * 5];
                g00[14] = k1[maxk * 6];
                g00[15] = k1[maxk * 7];
                g00 += 16;
            }
        }
#endif // __aarch64__
        for (; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk + k;
                const float* k1 = kptr1 + p * maxk + k;

                g00[0] = k0[0];
                g00[1] = k0[maxk];
                g00[2] = k0[maxk * 2];
                g00[3] = k0[maxk * 3];
                g00[4] = k1[0];
                g00[5] = k1[maxk];
                g00[6] = k1[maxk * 2];
                g00[7] = k1[maxk * 3];
                g00 += 8;
            }
        }
#endif // __ARM_NEON
        for (; p + 1 < inch; p += 2)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = k0[k];
                    g00[1] = k1[k];
                    k0 += maxk;
                    k1 += maxk;
                    g00 += 2;
                }
            }
        }
        for (; p < inch; p++)
        {
            const float* k0 = kptr0 + p * maxk;
            const float* k1 = kptr1 + p * maxk;

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = k0[k];
                g00[1] = k1[k];
                g00 += 2;
            }
        }
    }
    for (; q < outch; q++)
    {
        const float* kptr = (const float*)kernel + q * inch * maxk;

#if __aarch64__
        float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2 + q % 2);
#elif __ARM_NEON
        float* g00 = kernel_tm.channel(q / 4 + (q % 4) / 2 + q % 2);
#else
        float* g00 = kernel_tm.channel(q / 2 + q % 2);
#endif

        int p = 0;
#if __ARM_NEON
#if __aarch64__
        for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr + p * maxk;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = k0[k];
                    k0 += maxk;
                    g00 += 1;
                }
            }
        }
#endif // __aarch64__
        for (; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr + p * maxk;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = k0[k];
                    k0 += maxk;
                    g00 += 1;
                }
            }
        }
#endif // __ARM_NEON
        for (; p + 1 < inch; p += 2)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr + p * maxk;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = k0[k];
                    k0 += maxk;
                    g00 += 1;
                }
            }
        }
        for (; p < inch; p++)
        {
            const float* k0 = kptr + p * maxk;

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = k0[k];
                g00++;
            }
        }
    }
}

static void convolution_packed(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int w = bottom_blob.w;
    const int elempack = bottom_blob.elempack;
    const int inch = bottom_blob.c * elempack;

    const size_t N = bottom_blob.cstep * elempack;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int out_elempack = top_blob.elempack;
    const int outch = top_blob.c * out_elempack;

    const size_t M = top_blob.cstep * out_elempack;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2 * elempack;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    const float* bias_data_ptr = bias_data;

    int nn_outch = 0;
    int remain_outch_start = 0;
#if __ARM_NEON
#if __aarch64__
    nn_outch = (outch - remain_outch_start) / 8;
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        const int p = remain_outch_start + pp * 8;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inch = bottom_blob.c * elempack;
        const int outw = top_blob.w;
        const int outh = top_blob.h;
        const int out_elempack = top_blob.elempack;

        float* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
                float32x4_t _sum2 = vdupq_n_f32(0.f);
                float32x4_t _sum3 = vdupq_n_f32(0.f);
                float32x4_t _sum4 = vdupq_n_f32(0.f);
                float32x4_t _sum5 = vdupq_n_f32(0.f);
                float32x4_t _sum6 = vdupq_n_f32(0.f);
                float32x4_t _sum7 = vdupq_n_f32(0.f);

                if (bias_data_ptr)
                {
                    _sum0 = vld1q_f32(bias_data_ptr + p);
                    _sum1 = vld1q_f32(bias_data_ptr + p + 4);
                }

                const float* kptr = weight_data_tm.channel(p / 8);

                int q = 0;
                for (; q + 7 < inch; q += 8)
                {
                    const float* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        float32x4_t _r1;
                        if (elempack == 4)
                        {
                            _r0 = vld1q_f32(r0 + sok);
                            _r1 = vld1q_f32(r0 + sok + N);
                        }
                        else // if (elempack == 1)
                        {
                            _r0 = float32x4_t();
                            _r1 = float32x4_t();
                            _r0 = vsetq_lane_f32(r0[sok], _r0, 0);
                            _r0 = vsetq_lane_f32(r0[sok + N], _r0, 1);
                            _r0 = vsetq_lane_f32(r0[sok + N * 2], _r0, 2);
                            _r0 = vsetq_lane_f32(r0[sok + N * 3], _r0, 3);
                            _r1 = vsetq_lane_f32(r0[sok + N * 4], _r1, 0);
                            _r1 = vsetq_lane_f32(r0[sok + N * 5], _r1, 1);
                            _r1 = vsetq_lane_f32(r0[sok + N * 6], _r1, 2);
                            _r1 = vsetq_lane_f32(r0[sok + N * 7], _r1, 3);
                        }

                        float32x4_t _w0 = vld1q_f32(kptr);
                        float32x4_t _w1 = vld1q_f32(kptr + 4);
                        float32x4_t _w2 = vld1q_f32(kptr + 4 * 2);
                        float32x4_t _w3 = vld1q_f32(kptr + 4 * 3);
                        float32x4_t _w4 = vld1q_f32(kptr + 4 * 4);
                        float32x4_t _w5 = vld1q_f32(kptr + 4 * 5);
                        float32x4_t _w6 = vld1q_f32(kptr + 4 * 6);
                        float32x4_t _w7 = vld1q_f32(kptr + 4 * 7);
                        float32x4_t _w8 = vld1q_f32(kptr + 4 * 8);
                        float32x4_t _w9 = vld1q_f32(kptr + 4 * 9);
                        float32x4_t _wa = vld1q_f32(kptr + 4 * 10);
                        float32x4_t _wb = vld1q_f32(kptr + 4 * 11);
                        float32x4_t _wc = vld1q_f32(kptr + 4 * 12);
                        float32x4_t _wd = vld1q_f32(kptr + 4 * 13);
                        float32x4_t _we = vld1q_f32(kptr + 4 * 14);
                        float32x4_t _wf = vld1q_f32(kptr + 4 * 15);
                        _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 0);
                        _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 1);
                        _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 1);
                        _sum4 = vfmaq_laneq_f32(_sum4, _w4, _r0, 2);
                        _sum5 = vfmaq_laneq_f32(_sum5, _w5, _r0, 2);
                        _sum6 = vfmaq_laneq_f32(_sum6, _w6, _r0, 3);
                        _sum7 = vfmaq_laneq_f32(_sum7, _w7, _r0, 3);
                        _sum0 = vfmaq_laneq_f32(_sum0, _w8, _r1, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w9, _r1, 0);
                        _sum2 = vfmaq_laneq_f32(_sum2, _wa, _r1, 1);
                        _sum3 = vfmaq_laneq_f32(_sum3, _wb, _r1, 1);
                        _sum4 = vfmaq_laneq_f32(_sum4, _wc, _r1, 2);
                        _sum5 = vfmaq_laneq_f32(_sum5, _wd, _r1, 2);
                        _sum6 = vfmaq_laneq_f32(_sum6, _we, _r1, 3);
                        _sum7 = vfmaq_laneq_f32(_sum7, _wf, _r1, 3);

                        kptr += 64;
                    }
                }
                for (; q + 3 < inch; q += 4)
                {
                    const float* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        if (elempack == 4)
                        {
                            _r0 = vld1q_f32(r0 + sok);
                        }
                        else // if (elempack == 1)
                        {
                            _r0 = float32x4_t();
                            _r0 = vsetq_lane_f32(r0[sok], _r0, 0);
                            _r0 = vsetq_lane_f32(r0[sok + N], _r0, 1);
                            _r0 = vsetq_lane_f32(r0[sok + N * 2], _r0, 2);
                            _r0 = vsetq_lane_f32(r0[sok + N * 3], _r0, 3);
                        }

                        float32x4_t _w0 = vld1q_f32(kptr);
                        float32x4_t _w1 = vld1q_f32(kptr + 4);
                        float32x4_t _w2 = vld1q_f32(kptr + 4 * 2);
                        float32x4_t _w3 = vld1q_f32(kptr + 4 * 3);
                        float32x4_t _w4 = vld1q_f32(kptr + 4 * 4);
                        float32x4_t _w5 = vld1q_f32(kptr + 4 * 5);
                        float32x4_t _w6 = vld1q_f32(kptr + 4 * 6);
                        float32x4_t _w7 = vld1q_f32(kptr + 4 * 7);
                        _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 0);
                        _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 1);
                        _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 1);
                        _sum4 = vfmaq_laneq_f32(_sum4, _w4, _r0, 2);
                        _sum5 = vfmaq_laneq_f32(_sum5, _w5, _r0, 2);
                        _sum6 = vfmaq_laneq_f32(_sum6, _w6, _r0, 3);
                        _sum7 = vfmaq_laneq_f32(_sum7, _w7, _r0, 3);

                        kptr += 32;
                    }
                }
                for (; q + 1 < inch; q += 2)
                {
                    const float* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float val0;
                        float val1;
                        // if (elempack == 1)
                        {
                            val0 = r0[sok];
                            val1 = r0[sok + N];
                        }

                        float32x4_t _w0 = vld1q_f32(kptr);
                        float32x4_t _w1 = vld1q_f32(kptr + 4);
                        float32x4_t _w2 = vld1q_f32(kptr + 8);
                        float32x4_t _w3 = vld1q_f32(kptr + 12);
                        _sum0 = vfmaq_n_f32(_sum0, _w0, val0);
                        _sum1 = vfmaq_n_f32(_sum1, _w1, val0);
                        _sum2 = vfmaq_n_f32(_sum2, _w2, val1);
                        _sum3 = vfmaq_n_f32(_sum3, _w3, val1);

                        kptr += 16;
                    }
                }
                for (; q < inch; q++)
                {
                    const float* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float32x4_t _val;
                        // if (elempack == 1)
                        {
                            _val = vdupq_n_f32(r0[space_ofs[k]]);
                        }

                        float32x4_t _w0 = vld1q_f32(kptr);
                        float32x4_t _w1 = vld1q_f32(kptr + 4);
                        _sum0 = vfmaq_f32(_sum0, _w0, _val);
                        _sum1 = vfmaq_f32(_sum1, _w1, _val);

                        kptr += 8;
                    }
                }

                _sum0 = vaddq_f32(_sum0, _sum2);
                _sum1 = vaddq_f32(_sum1, _sum3);
                _sum4 = vaddq_f32(_sum4, _sum6);
                _sum5 = vaddq_f32(_sum5, _sum7);
                _sum0 = vaddq_f32(_sum0, _sum4);
                _sum1 = vaddq_f32(_sum1, _sum5);

                _sum0 = activation_ps(_sum0, activation_type, activation_params);
                _sum1 = activation_ps(_sum1, activation_type, activation_params);

                if (out_elempack == 4)
                {
                    vst1q_f32(outptr, _sum0);
                    vst1q_f32(outptr + M, _sum1);
                    outptr += 4;
                }
                else // if (out_elempack == 1)
                {
                    outptr[0] = vgetq_lane_f32(_sum0, 0);
                    outptr[M] = vgetq_lane_f32(_sum0, 1);
                    outptr[M * 2] = vgetq_lane_f32(_sum0, 2);
                    outptr[M * 3] = vgetq_lane_f32(_sum0, 3);
                    outptr[M * 4] = vgetq_lane_f32(_sum1, 0);
                    outptr[M * 5] = vgetq_lane_f32(_sum1, 1);
                    outptr[M * 6] = vgetq_lane_f32(_sum1, 2);
                    outptr[M * 7] = vgetq_lane_f32(_sum1, 3);
                    outptr += 1;
                }
            }
        }
    }
    remain_outch_start += nn_outch * 8;
    nn_outch = (outch - remain_outch_start) / 4;
#else // __aarch64__
    nn_outch = (outch - remain_outch_start) / 4;
    #pragma omp parallel for num_threads(opt.num_threads)
#endif // __aarch64__
    for (int pp = 0; pp < nn_outch; pp++)
    {
        const int p = remain_outch_start + pp * 4;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inch = bottom_blob.c * elempack;
        const int outw = top_blob.w;
        const int outh = top_blob.h;
        const int out_elempack = top_blob.elempack;

        float* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
                float32x4_t _sum2 = vdupq_n_f32(0.f);
                float32x4_t _sum3 = vdupq_n_f32(0.f);

                if (bias_data_ptr)
                {
                    _sum0 = vld1q_f32(bias_data_ptr + p);
                }

#if __aarch64__
                const float* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4);
#else
                const float* kptr = weight_data_tm.channel(p / 4);
#endif

                int q = 0;
#if __aarch64__
                for (; q + 7 < inch; q += 8)
                {
                    const float* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        float32x4_t _r1;
                        if (elempack == 4)
                        {
                            _r0 = vld1q_f32(r0 + sok);
                            _r1 = vld1q_f32(r0 + sok + N);
                        }
                        else // if (elempack == 1)
                        {
                            _r0 = float32x4_t();
                            _r1 = float32x4_t();
                            _r0 = vsetq_lane_f32(r0[sok], _r0, 0);
                            _r0 = vsetq_lane_f32(r0[sok + N], _r0, 1);
                            _r0 = vsetq_lane_f32(r0[sok + N * 2], _r0, 2);
                            _r0 = vsetq_lane_f32(r0[sok + N * 3], _r0, 3);
                            _r1 = vsetq_lane_f32(r0[sok + N * 4], _r1, 0);
                            _r1 = vsetq_lane_f32(r0[sok + N * 5], _r1, 1);
                            _r1 = vsetq_lane_f32(r0[sok + N * 6], _r1, 2);
                            _r1 = vsetq_lane_f32(r0[sok + N * 7], _r1, 3);
                        }

                        float32x4_t _w0 = vld1q_f32(kptr);
                        float32x4_t _w1 = vld1q_f32(kptr + 4);
                        float32x4_t _w2 = vld1q_f32(kptr + 8);
                        float32x4_t _w3 = vld1q_f32(kptr + 12);
                        float32x4_t _w4 = vld1q_f32(kptr + 16);
                        float32x4_t _w5 = vld1q_f32(kptr + 20);
                        float32x4_t _w6 = vld1q_f32(kptr + 24);
                        float32x4_t _w7 = vld1q_f32(kptr + 28);
                        _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 1);
                        _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 2);
                        _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 3);
                        _sum0 = vfmaq_laneq_f32(_sum0, _w4, _r1, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w5, _r1, 1);
                        _sum2 = vfmaq_laneq_f32(_sum2, _w6, _r1, 2);
                        _sum3 = vfmaq_laneq_f32(_sum3, _w7, _r1, 3);

                        kptr += 32;
                    }
                }
#endif // __aarch64__
                for (; q + 3 < inch; q += 4)
                {
                    const float* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        if (elempack == 4)
                        {
                            _r0 = vld1q_f32(r0 + sok);
                        }
                        else // if (elempack == 1)
                        {
                            _r0 = float32x4_t();
                            _r0 = vsetq_lane_f32(r0[sok], _r0, 0);
                            _r0 = vsetq_lane_f32(r0[sok + N], _r0, 1);
                            _r0 = vsetq_lane_f32(r0[sok + N * 2], _r0, 2);
                            _r0 = vsetq_lane_f32(r0[sok + N * 3], _r0, 3);
                        }

                        float32x4_t _w0 = vld1q_f32(kptr);
                        float32x4_t _w1 = vld1q_f32(kptr + 4);
                        float32x4_t _w2 = vld1q_f32(kptr + 8);
                        float32x4_t _w3 = vld1q_f32(kptr + 12);
#if __aarch64__
                        _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 1);
                        _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 2);
                        _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 3);
#else
                        _sum0 = vmlaq_lane_f32(_sum0, _w0, vget_low_f32(_r0), 0);
                        _sum1 = vmlaq_lane_f32(_sum1, _w1, vget_low_f32(_r0), 1);
                        _sum2 = vmlaq_lane_f32(_sum2, _w2, vget_high_f32(_r0), 0);
                        _sum3 = vmlaq_lane_f32(_sum3, _w3, vget_high_f32(_r0), 1);
#endif

                        kptr += 16;
                    }
                }
                for (; q + 1 < inch; q += 2)
                {
                    const float* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float val0;
                        float val1;
                        // if (elempack == 1)
                        {
                            val0 = r0[sok];
                            val1 = r0[sok + N];
                        }

                        float32x4_t _w0 = vld1q_f32(kptr);
                        float32x4_t _w1 = vld1q_f32(kptr + 4);
#if __aarch64__
                        _sum0 = vfmaq_n_f32(_sum0, _w0, val0);
                        _sum1 = vfmaq_n_f32(_sum1, _w1, val1);
#else
                        _sum0 = vmlaq_n_f32(_sum0, _w0, val0);
                        _sum1 = vmlaq_n_f32(_sum1, _w1, val1);
#endif

                        kptr += 8;
                    }
                }
                for (; q < inch; q++)
                {
                    const float* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float32x4_t _val;
                        // if (elempack == 1)
                        {
                            _val = vdupq_n_f32(r0[space_ofs[k]]);
                        }

                        float32x4_t _w = vld1q_f32(kptr);
#if __aarch64__
                        _sum0 = vfmaq_f32(_sum0, _val, _w);
#else
                        _sum0 = vmlaq_f32(_sum0, _val, _w);
#endif

                        kptr += 4;
                    }
                }

                _sum0 = vaddq_f32(_sum0, _sum1);
                _sum2 = vaddq_f32(_sum2, _sum3);
                _sum0 = vaddq_f32(_sum0, _sum2);

                _sum0 = activation_ps(_sum0, activation_type, activation_params);

                if (out_elempack == 4)
                {
                    vst1q_f32(outptr, _sum0);
                    outptr += 4;
                }
                else // if (out_elempack == 1)
                {
                    outptr[0] = vgetq_lane_f32(_sum0, 0);
                    outptr[M] = vgetq_lane_f32(_sum0, 1);
                    outptr[M * 2] = vgetq_lane_f32(_sum0, 2);
                    outptr[M * 3] = vgetq_lane_f32(_sum0, 3);
                    outptr += 1;
                }
            }
        }
    }
    remain_outch_start += nn_outch * 4;
    nn_outch = (outch - remain_outch_start) / 2;
#else // __ARM_NEON
    nn_outch = (outch - remain_outch_start) / 2;
    #pragma omp parallel for num_threads(opt.num_threads)
#endif // __ARM_NEON
    for (int pp = 0; pp < nn_outch; pp++)
    {
        const int p = remain_outch_start + pp * 2;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inch = bottom_blob.c * elempack;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        float* outptr0 = top_blob.channel(p);
        float* outptr1 = top_blob.channel(p + 1);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum0 = 0.f;
                float sum1 = 0.f;

                if (bias_data_ptr)
                {
                    sum0 = bias_data_ptr[p];
                    sum1 = bias_data_ptr[p + 1];
                }

#if __aarch64__
                const float* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2);
#elif __ARM_NEON
                const float* kptr = weight_data_tm.channel(p / 4 + (p % 4) / 2);
#else
                const float* kptr = weight_data_tm.channel(p / 2);
#endif

                int q = 0;
#if __ARM_NEON
#if __aarch64__
                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
                float32x4_t _sum2 = vdupq_n_f32(0.f);
                float32x4_t _sum3 = vdupq_n_f32(0.f);
                for (; q + 7 < inch; q += 8)
                {
                    const float* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        float32x4_t _r1;
                        if (elempack == 4)
                        {
                            _r0 = vld1q_f32(r0 + sok);
                            _r1 = vld1q_f32(r0 + sok + N);
                        }
                        else // if (elempack == 1)
                        {
                            _r0 = float32x4_t();
                            _r1 = float32x4_t();
                            _r0 = vsetq_lane_f32(r0[sok], _r0, 0);
                            _r0 = vsetq_lane_f32(r0[sok + N], _r0, 1);
                            _r0 = vsetq_lane_f32(r0[sok + N * 2], _r0, 2);
                            _r0 = vsetq_lane_f32(r0[sok + N * 3], _r0, 3);
                            _r1 = vsetq_lane_f32(r0[sok + N * 4], _r1, 0);
                            _r1 = vsetq_lane_f32(r0[sok + N * 5], _r1, 1);
                            _r1 = vsetq_lane_f32(r0[sok + N * 6], _r1, 2);
                            _r1 = vsetq_lane_f32(r0[sok + N * 7], _r1, 3);
                        }

                        float32x4_t _w0 = vld1q_f32(kptr);
                        float32x4_t _w1 = vld1q_f32(kptr + 4);
                        float32x4_t _w2 = vld1q_f32(kptr + 8);
                        float32x4_t _w3 = vld1q_f32(kptr + 12);
                        _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                        _sum1 = vfmaq_f32(_sum1, _r1, _w1);
                        _sum2 = vfmaq_f32(_sum2, _r0, _w2);
                        _sum3 = vfmaq_f32(_sum3, _r1, _w3);

                        kptr += 16;
                    }
                }
                _sum0 = vaddq_f32(_sum0, _sum1);
                _sum2 = vaddq_f32(_sum2, _sum3);
                sum0 += vaddvq_f32(_sum0);
                sum1 += vaddvq_f32(_sum2);
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);
#else  // __aarch64__
                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
#endif // __aarch64__
                for (; q + 3 < inch; q += 4)
                {
                    const float* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        if (elempack == 4)
                        {
                            _r0 = vld1q_f32(r0 + sok);
                        }
                        else // if (elempack == 1)
                        {
                            _r0 = float32x4_t();
                            _r0 = vsetq_lane_f32(r0[sok], _r0, 0);
                            _r0 = vsetq_lane_f32(r0[sok + N], _r0, 1);
                            _r0 = vsetq_lane_f32(r0[sok + N * 2], _r0, 2);
                            _r0 = vsetq_lane_f32(r0[sok + N * 3], _r0, 3);
                        }

                        float32x4_t _w0 = vld1q_f32(kptr);
                        float32x4_t _w1 = vld1q_f32(kptr + 4);
#if __aarch64__
                        _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                        _sum1 = vfmaq_f32(_sum1, _r0, _w1);
#else
                        _sum0 = vmlaq_f32(_sum0, _r0, _w0);
                        _sum1 = vmlaq_f32(_sum1, _r0, _w1);
#endif

                        kptr += 8;
                    }
                }
#if __aarch64__
                sum0 += vaddvq_f32(_sum0);
                sum1 += vaddvq_f32(_sum1);
#else
                float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
                float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
                float32x2_t _ss = vpadd_f32(_ss0, _ss1);
                sum0 += vget_lane_f32(_ss, 0);
                sum1 += vget_lane_f32(_ss, 1);
#endif
#endif // __ARM_NEON
                for (; q + 1 < inch; q += 2)
                {
                    const float* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float val0;
                        float val1;
                        // if (elempack == 1)
                        {
                            val0 = r0[sok];
                            val1 = r0[sok + N];
                        }

                        sum0 += val0 * kptr[0];
                        sum1 += val0 * kptr[1];
                        sum0 += val1 * kptr[2];
                        sum1 += val1 * kptr[3];

                        kptr += 4;
                    }
                }
                for (; q < inch; q++)
                {
                    const float* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val;
                        // if (elempack == 1)
                        {
                            val = r0[space_ofs[k]];
                        }

                        sum0 += val * kptr[0];
                        sum1 += val * kptr[1];

                        kptr += 2;
                    }
                }

                sum0 = activation_ss(sum0, activation_type, activation_params);
                sum1 = activation_ss(sum1, activation_type, activation_params);

                outptr0[0] = sum0;
                outptr1[0] = sum1;
                outptr0 += 1;
                outptr1 += 1;
            }
        }
    }
    remain_outch_start += nn_outch * 2;
    for (int p = remain_outch_start; p < outch; p++)
    {
        float* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = 0.f;

                if (bias_data_ptr)
                {
                    sum = bias_data_ptr[p];
                }

#if __aarch64__
                const float* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2);
#elif __ARM_NEON
                const float* kptr = weight_data_tm.channel(p / 4 + (p % 4) / 2 + p % 2);
#else
                const float* kptr = weight_data_tm.channel(p / 2 + p % 2);
#endif

                int q = 0;
#if __ARM_NEON
#if __aarch64__
                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
                for (; q + 7 < inch; q += 8)
                {
                    const float* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        float32x4_t _r1;
                        if (elempack == 4)
                        {
                            _r0 = vld1q_f32(r0 + sok);
                            _r1 = vld1q_f32(r0 + sok + N);
                        }
                        else // if (elempack == 1)
                        {
                            _r0 = float32x4_t();
                            _r1 = float32x4_t();
                            _r0 = vsetq_lane_f32(r0[sok], _r0, 0);
                            _r0 = vsetq_lane_f32(r0[sok + N], _r0, 1);
                            _r0 = vsetq_lane_f32(r0[sok + N * 2], _r0, 2);
                            _r0 = vsetq_lane_f32(r0[sok + N * 3], _r0, 3);
                            _r1 = vsetq_lane_f32(r0[sok + N * 4], _r1, 0);
                            _r1 = vsetq_lane_f32(r0[sok + N * 5], _r1, 1);
                            _r1 = vsetq_lane_f32(r0[sok + N * 6], _r1, 2);
                            _r1 = vsetq_lane_f32(r0[sok + N * 7], _r1, 3);
                        }

                        float32x4_t _w0 = vld1q_f32(kptr);
                        float32x4_t _w1 = vld1q_f32(kptr + 4);
                        _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                        _sum1 = vfmaq_f32(_sum1, _r1, _w1);

                        kptr += 8;
                    }
                }
                _sum0 = vaddq_f32(_sum0, _sum1);
                sum += vaddvq_f32(_sum0);
#endif // __aarch64__
                float32x4_t _sum = vdupq_n_f32(0.f);
                for (; q + 3 < inch; q += 4)
                {
                    const float* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        if (elempack == 4)
                        {
                            _r0 = vld1q_f32(r0 + sok);
                        }
                        else // if (elempack == 1)
                        {
                            _r0 = float32x4_t();
                            _r0 = vsetq_lane_f32(r0[sok], _r0, 0);
                            _r0 = vsetq_lane_f32(r0[sok + N], _r0, 1);
                            _r0 = vsetq_lane_f32(r0[sok + N * 2], _r0, 2);
                            _r0 = vsetq_lane_f32(r0[sok + N * 3], _r0, 3);
                        }

                        float32x4_t _w = vld1q_f32(kptr);
#if __aarch64__
                        _sum = vfmaq_f32(_sum, _r0, _w);
#else
                        _sum = vmlaq_f32(_sum, _r0, _w);
#endif

                        kptr += 4;
                    }
                }
#if __aarch64__
                sum += vaddvq_f32(_sum);
#else
                float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                _ss = vpadd_f32(_ss, _ss);
                sum += vget_lane_f32(_ss, 0);
#endif
#endif // __ARM_NEON
                for (; q + 1 < inch; q += 2)
                {
                    const float* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float val0;
                        float val1;
                        // if (elempack == 1)
                        {
                            val0 = r0[sok];
                            val1 = r0[sok + N];
                        }

                        sum += val0 * kptr[0];
                        sum += val1 * kptr[1];

                        kptr += 2;
                    }
                }
                for (; q < inch; q++)
                {
                    const float* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val;
                        // if (elempack == 1)
                        {
                            val = r0[space_ofs[k]];
                        }

                        sum += val * kptr[0];

                        kptr += 1;
                    }
                }

                sum = activation_ss(sum, activation_type, activation_params);

                outptr[0] = sum;
                outptr += 1;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_packed_bf16s.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convolution_transform_kernel_packed_bf16s(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
    const int maxk = kernel_w * kernel_h;

    // src = kw-kh-inch-outch
    // dst = pb-pa-kw-kh-inch/pa-outch/pb

    // clang-format off
    // *INDENT-OFF*
#if __ARM_NEON
#if __aarch64__
    if (outch >= 8)
    {
        if (inch >= 8)
            kernel_tm.create(8 * 8 * maxk, inch / 8 + (inch % 8) / 4 + (inch % 4) / 2 + inch % 2, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2, (size_t)2u);
        else if (inch >= 4)
            kernel_tm.create(8 * 4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2, (size_t)2u);
        else if (inch >= 2)
            kernel_tm.create(8 * 2 * maxk, inch / 2 + inch % 2, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2, (size_t)2u);
        else
            kernel_tm.create(8 * maxk, inch, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2, (size_t)2u);
    }
    else
#endif // __aarch64__
    if (outch >= 4)
    {
#if __aarch64__
        if (inch >= 8)
            kernel_tm.create(4 * 8 * maxk, inch / 8 + (inch % 8) / 4 + (inch % 4) / 2 + inch % 2, outch / 4 + (outch % 4) / 2 + outch % 2, (size_t)2u);
        else
#endif // __aarch64__
        if (inch >= 4)
            kernel_tm.create(4 * 4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch / 4 + (outch % 4) / 2 + outch % 2, (size_t)2u);
        else if (inch >= 2)
            kernel_tm.create(4 * 2 * maxk, inch / 2 + inch % 2, outch / 4 + (outch % 4) / 2 + outch % 2, (size_t)2u);
        else
            kernel_tm.create(4 * maxk, inch, outch / 4 + (outch % 4) / 2 + outch % 2, (size_t)2u);
    }
    else
#endif // __ARM_NEON
    if (outch >= 2)
    {
#if __ARM_NEON
#if __aarch64__
        if (inch >= 8)
            kernel_tm.create(2 * 8 * maxk, inch / 8 + (inch % 8) / 4 + (inch % 4) / 2 + inch % 2, outch / 2 + outch % 2, (size_t)2u);
        else
#endif // __aarch64__
        if (inch >= 4)
            kernel_tm.create(2 * 4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch / 2 + outch % 2, (size_t)2u);
        else
#endif // __ARM_NEON
        if (inch >= 2)
            kernel_tm.create(2 * 2 * maxk, inch / 2 + inch % 2, outch / 2 + outch % 2, (size_t)2u);
        else
            kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2, (size_t)2u);
    }
    else
    {
#if __ARM_NEON
#if __aarch64__
        if (inch >= 8)
            kernel_tm.create(8 * maxk, inch / 8 + (inch % 8) / 4 + (inch % 4) / 2 + inch % 2, outch, (size_t)2u);
        else
#endif // __aarch64__
        if (inch >= 4)
            kernel_tm.create(4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch, (size_t)2u);
        else
#endif // __ARM_NEON
        if (inch >= 2)
            kernel_tm.create(2 * maxk, inch / 2 + inch % 2, outch, (size_t)2u);
        else
            kernel_tm.create(maxk, inch, outch, (size_t)2u);
    }
    // *INDENT-ON*
    // clang-format on

    int q = 0;
#if __ARM_NEON
#if __aarch64__
    for (; q + 7 < outch; q += 8)
    {
        const float* kptr0 = (const float*)kernel + q * inch * maxk;
        const float* kptr1 = (const float*)kernel + (q + 1) * inch * maxk;
        const float* kptr2 = (const float*)kernel + (q + 2) * inch * maxk;
        const float* kptr3 = (const float*)kernel + (q + 3) * inch * maxk;
        const float* kptr4 = (const float*)kernel + (q + 4) * inch * maxk;
        const float* kptr5 = (const float*)kernel + (q + 5) * inch * maxk;
        const float* kptr6 = (const float*)kernel + (q + 6) * inch * maxk;
        const float* kptr7 = (const float*)kernel + (q + 7) * inch * maxk;

        unsigned short* g00 = kernel_tm.channel(q / 8);

        int p = 0;
        for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;
                const float* k4 = kptr4 + p * maxk;
                const float* k5 = kptr5 + p * maxk;
                const float* k6 = kptr6 + p * maxk;
                const float* k7 = kptr7 + p * maxk;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    g00[1] = float32_to_bfloat16(k1[k]);
                    g00[2] = float32_to_bfloat16(k2[k]);
                    g00[3] = float32_to_bfloat16(k3[k]);
                    g00[4] = float32_to_bfloat16(k4[k]);
                    g00[5] = float32_to_bfloat16(k5[k]);
                    g00[6] = float32_to_bfloat16(k6[k]);
                    g00[7] = float32_to_bfloat16(k7[k]);
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    k4 += maxk;
                    k5 += maxk;
                    k6 += maxk;
                    k7 += maxk;
                    g00 += 8;
                }
            }
        }
        for (; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;
                const float* k4 = kptr4 + p * maxk;
                const float* k5 = kptr5 + p * maxk;
                const float* k6 = kptr6 + p * maxk;
                const float* k7 = kptr7 + p * maxk;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    g00[1] = float32_to_bfloat16(k1[k]);
                    g00[2] = float32_to_bfloat16(k2[k]);
                    g00[3] = float32_to_bfloat16(k3[k]);
                    g00[4] = float32_to_bfloat16(k4[k]);
                    g00[5] = float32_to_bfloat16(k5[k]);
                    g00[6] = float32_to_bfloat16(k6[k]);
                    g00[7] = float32_to_bfloat16(k7[k]);
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    k4 += maxk;
                    k5 += maxk;
                    k6 += maxk;
                    k7 += maxk;
                    g00 += 8;
                }
            }
        }
        for (; p + 1 < inch; p += 2)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;
                const float* k4 = kptr4 + p * maxk;
                const float* k5 = kptr5 + p * maxk;
                const float* k6 = kptr6 + p * maxk;
                const float* k7 = kptr7 + p * maxk;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    g00[1] = float32_to_bfloat16(k1[k]);
                    g00[2] = float32_to_bfloat16(k2[k]);
                    g00[3] = float32_to_bfloat16(k3[k]);
                    g00[4] = float32_to_bfloat16(k4[k]);
                    g00[5] = float32_to_bfloat16(k5[k]);
                    g00[6] = float32_to_bfloat16(k6[k]);
                    g00[7] = float32_to_bfloat16(k7[k]);
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    k4 += maxk;
                    k5 += maxk;
                    k6 += maxk;
                    k7 += maxk;
                    g00 += 8;
                }
            }
        }
        for (; p < inch; p++)
        {
            const float* k0 = kptr0 + p * maxk;
            const float* k1 = kptr1 + p * maxk;
            const float* k2 = kptr2 + p * maxk;
            const float* k3 = kptr3 + p * maxk;
            const float* k4 = kptr4 + p * maxk;
            const float* k5 = kptr5 + p * maxk;
            const float* k6 = kptr6 + p * maxk;
            const float* k7 = kptr7 + p * maxk;

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = float32_to_bfloat16(k0[k]);
                g00[1] = float32_to_bfloat16(k1[k]);
                g00[2] = float32_to_bfloat16(k2[k]);
                g00[3] = float32_to_bfloat16(k3[k]);
                g00[4] = float32_to_bfloat16(k4[k]);
                g00[5] = float32_to_bfloat16(k5[k]);
                g00[6] = float32_to_bfloat16(k6[k]);
                g00[7] = float32_to_bfloat16(k7[k]);
                g00 += 8;
            }
        }
    }
#endif // __aarch64__
    for (; q + 3 < outch; q += 4)
    {
        const float* kptr0 = (const float*)kernel + q * inch * maxk;
        const float* kptr1 = (const float*)kernel + (q + 1) * inch * maxk;
        const float* kptr2 = (const float*)kernel + (q + 2) * inch * maxk;
        const float* kptr3 = (const float*)kernel + (q + 3) * inch * maxk;

#if __aarch64__
        unsigned short* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4);
#else
        unsigned short* g00 = kernel_tm.channel(q / 4);
#endif

        int p = 0;
#if __aarch64__
        for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    g00[1] = float32_to_bfloat16(k1[k]);
                    g00[2] = float32_to_bfloat16(k2[k]);
                    g00[3] = float32_to_bfloat16(k3[k]);
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    g00 += 4;
                }
            }
        }
#endif // __aarch64__
        for (; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    g00[1] = float32_to_bfloat16(k1[k]);
                    g00[2] = float32_to_bfloat16(k2[k]);
                    g00[3] = float32_to_bfloat16(k3[k]);
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    g00 += 4;
                }
            }
        }
        for (; p + 1 < inch; p += 2)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    g00[1] = float32_to_bfloat16(k1[k]);
                    g00[2] = float32_to_bfloat16(k2[k]);
                    g00[3] = float32_to_bfloat16(k3[k]);
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    g00 += 4;
                }
            }
        }
        for (; p < inch; p++)
        {
            const float* k0 = kptr0 + p * maxk;
            const float* k1 = kptr1 + p * maxk;
            const float* k2 = kptr2 + p * maxk;
            const float* k3 = kptr3 + p * maxk;

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = float32_to_bfloat16(k0[k]);
                g00[1] = float32_to_bfloat16(k1[k]);
                g00[2] = float32_to_bfloat16(k2[k]);
                g00[3] = float32_to_bfloat16(k3[k]);
                g00 += 4;
            }
        }
    }
#endif // __ARM_NEON
    for (; q + 1 < outch; q += 2)
    {
        const float* kptr0 = (const float*)kernel + q * inch * maxk;
        const float* kptr1 = (const float*)kernel + (q + 1) * inch * maxk;

#if __aarch64__
        unsigned short* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2);
#elif __ARM_NEON
        unsigned short* g00 = kernel_tm.channel(q / 4 + (q % 4) / 2);
#else
        unsigned short* g00 = kernel_tm.channel(q / 2);
#endif

        int p = 0;
#if __ARM_NEON
#if __aarch64__
        for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk + k;
                const float* k1 = kptr1 + p * maxk + k;

                g00[0] = float32_to_bfloat16(k0[0]);
                g00[1] = float32_to_bfloat16(k0[maxk]);
                g00[2] = float32_to_bfloat16(k0[maxk * 2]);
                g00[3] = float32_to_bfloat16(k0[maxk * 3]);
                g00[4] = float32_to_bfloat16(k0[maxk * 4]);
                g00[5] = float32_to_bfloat16(k0[maxk * 5]);
                g00[6] = float32_to_bfloat16(k0[maxk * 6]);
                g00[7] = float32_to_bfloat16(k0[maxk * 7]);
                g00[8] = float32_to_bfloat16(k1[0]);
                g00[9] = float32_to_bfloat16(k1[maxk]);
                g00[10] = float32_to_bfloat16(k1[maxk * 2]);
                g00[11] = float32_to_bfloat16(k1[maxk * 3]);
                g00[12] = float32_to_bfloat16(k1[maxk * 4]);
                g00[13] = float32_to_bfloat16(k1[maxk * 5]);
                g00[14] = float32_to_bfloat16(k1[maxk * 6]);
                g00[15] = float32_to_bfloat16(k1[maxk * 7]);
                g00 += 16;
            }
        }
#endif // __aarch64__
        for (; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk + k;
                const float* k1 = kptr1 + p * maxk + k;

                g00[0] = float32_to_bfloat16(k0[0]);
                g00[1] = float32_to_bfloat16(k0[maxk]);
                g00[2] = float32_to_bfloat16(k0[maxk * 2]);
                g00[3] = float32_to_bfloat16(k0[maxk * 3]);
                g00[4] = float32_to_bfloat16(k1[0]);
                g00[5] = float32_to_bfloat16(k1[maxk]);
                g00[6] = float32_to_bfloat16(k1[maxk * 2]);
                g00[7] = float32_to_bfloat16(k1[maxk * 3]);
                g00 += 8;
            }
        }
#endif // __ARM_NEON
        for (; p + 1 < inch; p += 2)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    g00[1] = float32_to_bfloat16(k1[k]);
                    k0 += maxk;
                    k1 += maxk;
                    g00 += 2;
                }
            }
        }
        for (; p < inch; p++)
        {
            const float* k0 = kptr0 + p * maxk;
            const float* k1 = kptr1 + p * maxk;

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = float32_to_bfloat16(k0[k]);
                g00[1] = float32_to_bfloat16(k1[k]);
                g00 += 2;
            }
        }
    }
    for (; q < outch; q++)
    {
        const float* kptr = (const float*)kernel + q * inch * maxk;

#if __aarch64__
        unsigned short* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2 + q % 2);
#elif __ARM_NEON
        unsigned short* g00 = kernel_tm.channel(q / 4 + (q % 4) / 2 + q % 2);
#else
        unsigned short* g00 = kernel_tm.channel(q / 2 + q % 2);
#endif

        int p = 0;
#if __ARM_NEON
#if __aarch64__
        for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr + p * maxk;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    k0 += maxk;
                    g00 += 1;
                }
            }
        }
#endif // __aarch64__
        for (; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr + p * maxk;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    k0 += maxk;
                    g00 += 1;
                }
            }
        }
#endif // __ARM_NEON
        for (; p + 1 < inch; p += 2)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr + p * maxk;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = float32_to_bfloat16(k0[k]);
                    k0 += maxk;
                    g00 += 1;
                }
            }
        }
        for (; p < inch; p++)
        {
            const float* k0 = kptr + p * maxk;

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = float32_to_bfloat16(k0[k]);
                g00++;
            }
        }
    }
}

static void convolution_packed_bf16s(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int w = bottom_blob.w;
    const int elempack = bottom_blob.elempack;
    const int inch = bottom_blob.c * elempack;

    const size_t N = bottom_blob.cstep * elempack;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int out_elempack = top_blob.elempack;
    const int outch = top_blob.c * out_elempack;

    const size_t M = top_blob.cstep * out_elempack;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2 * elempack;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    const float* bias_data_ptr = bias_data;

    int nn_outch = 0;
    int remain_outch_start = 0;
#if __ARM_NEON
#if __aarch64__
    nn_outch = (outch - remain_outch_start) / 8;
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        const int p = remain_outch_start + pp * 8;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inch = bottom_blob.c * elempack;
        const int outw = top_blob.w;
        const int outh = top_blob.h;
        const int out_elempack = top_blob.elempack;

        unsigned short* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
                float32x4_t _sum2 = vdupq_n_f32(0.f);
                float32x4_t _sum3 = vdupq_n_f32(0.f);
                float32x4_t _sum4 = vdupq_n_f32(0.f);
                float32x4_t _sum5 = vdupq_n_f32(0.f);
                float32x4_t _sum6 = vdupq_n_f32(0.f);
                float32x4_t _sum7 = vdupq_n_f32(0.f);

                if (bias_data_ptr)
                {
                    _sum0 = vld1q_f32(bias_data_ptr + p);
                    _sum1 = vld1q_f32(bias_data_ptr + p + 4);
                }

                const unsigned short* kptr = weight_data_tm.channel(p / 8);

                int q = 0;
                for (; q + 7 < inch; q += 8)
                {
                    const unsigned short* r0 = bottom_blob.channel(q / elempack).row<const unsigned short>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        float32x4_t _r1;
                        if (elempack == 4)
                        {
                            _r0 = bfloat2float(vld1_u16(r0 + sok));
                            _r1 = bfloat2float(vld1_u16(r0 + sok + N));
                        }
                        else // if (elempack == 1)
                        {
                            uint16x8_t _r_u16 = uint16x8_t();
                            _r_u16 = vsetq_lane_u16(r0[sok], _r_u16, 0);
                            _r_u16 = vsetq_lane_u16(r0[sok + N], _r_u16, 1);
                            _r_u16 = vsetq_lane_u16(r0[sok + N * 2], _r_u16, 2);
                            _r_u16 = vsetq_lane_u16(r0[sok + N * 3], _r_u16, 3);
                            _r_u16 = vsetq_lane_u16(r0[sok + N * 4], _r_u16, 4);
                            _r_u16 = vsetq_lane_u16(r0[sok + N * 5], _r_u16, 5);
                            _r_u16 = vsetq_lane_u16(r0[sok + N * 6], _r_u16, 6);
                            _r_u16 = vsetq_lane_u16(r0[sok + N * 7], _r_u16, 7);
                            _r0 = bfloat2float(vget_low_u16(_r_u16));
                            _r1 = bfloat2float(vget_high_u16(_r_u16));
                        }

                        uint16x8_t _w01 = vld1q_u16(kptr);
                        uint16x8_t _w23 = vld1q_u16(kptr + 8);
                        uint16x8_t _w45 = vld1q_u16(kptr + 16);
                        uint16x8_t _w67 = vld1q_u16(kptr + 24);
                        uint16x8_t _w89 = vld1q_u16(kptr + 32);
                        uint16x8_t _wab = vld1q_u16(kptr + 40);
                        uint16x8_t _wcd = vld1q_u16(kptr + 48);
                        uint16x8_t _wef = vld1q_u16(kptr + 56);
                        float32x4_t _w0 = bfloat2float(vget_low_u16(_w01));
                        float32x4_t _w1 = bfloat2float(vget_high_u16(_w01));
                        float32x4_t _w2 = bfloat2float(vget_low_u16(_w23));
                        float32x4_t _w3 = bfloat2float(vget_high_u16(_w23));
                        float32x4_t _w4 = bfloat2float(vget_low_u16(_w45));
                        float32x4_t _w5 = bfloat2float(vget_high_u16(_w45));
                        float32x4_t _w6 = bfloat2float(vget_low_u16(_w67));
                        float32x4_t _w7 = bfloat2float(vget_high_u16(_w67));
                        float32x4_t _w8 = bfloat2float(vget_low_u16(_w89));
                        float32x4_t _w9 = bfloat2float(vget_high_u16(_w89));
                        float32x4_t _wa = bfloat2float(vget_low_u16(_wab));
                        float32x4_t _wb = bfloat2float(vget_high_u16(_wab));
                        float32x4_t _wc = bfloat2float(vget_low_u16(_wcd));
                        float32x4_t _wd = bfloat2float(vget_high_u16(_wcd));
                        float32x4_t _we = bfloat2float(vget_low_u16(_wef));
                        float32x4_t _wf = bfloat2float(vget_high_u16(_wef));
                        _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 0);
                        _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 1);
                        _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 1);
                        _sum4 = vfmaq_laneq_f32(_sum4, _w4, _r0, 2);
                        _sum5 = vfmaq_laneq_f32(_sum5, _w5, _r0, 2);
                        _sum6 = vfmaq_laneq_f32(_sum6, _w6, _r0, 3);
                        _sum7 = vfmaq_laneq_f32(_sum7, _w7, _r0, 3);
                        _sum0 = vfmaq_laneq_f32(_sum0, _w8, _r1, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w9, _r1, 0);
                        _sum2 = vfmaq_laneq_f32(_sum2, _wa, _r1, 1);
                        _sum3 = vfmaq_laneq_f32(_sum3, _wb, _r1, 1);
                        _sum4 = vfmaq_laneq_f32(_sum4, _wc, _r1, 2);
                        _sum5 = vfmaq_laneq_f32(_sum5, _wd, _r1, 2);
                        _sum6 = vfmaq_laneq_f32(_sum6, _we, _r1, 3);
                        _sum7 = vfmaq_laneq_f32(_sum7, _wf, _r1, 3);

                        kptr += 64;
                    }
                }
                for (; q + 3 < inch; q += 4)
                {
                    const unsigned short* r0 = bottom_blob.channel(q / elempack).row<const unsigned short>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        if (elempack == 4)
                        {
                            _r0 = bfloat2float(vld1_u16(r0 + sok));
                        }
                        else // if (elempack == 1)
                        {
                            uint16x4_t _r_u16 = uint16x4_t();
                            _r_u16 = vset_lane_u16(r0[sok], _r_u16, 0);
                            _r_u16 = vset_lane_u16(r0[sok + N], _r_u16, 1);
                            _r_u16 = vset_lane_u16(r0[sok + N * 2], _r_u16, 2);
                            _r_u16 = vset_lane_u16(r0[sok + N * 3], _r_u16, 3);
                            _r0 = bfloat2float(_r_u16);
                        }

                        uint16x8_t _w01 = vld1q_u16(kptr);
                        uint16x8_t _w23 = vld1q_u16(kptr + 8);
                        uint16x8_t _w45 = vld1q_u16(kptr + 16);
                        uint16x8_t _w67 = vld1q_u16(kptr + 24);
                        float32x4_t _w0 = bfloat2float(vget_low_u16(_w01));
                        float32x4_t _w1 = bfloat2float(vget_high_u16(_w01));
                        float32x4_t _w2 = bfloat2float(vget_low_u16(_w23));
                        float32x4_t _w3 = bfloat2float(vget_high_u16(_w23));
                        float32x4_t _w4 = bfloat2float(vget_low_u16(_w45));
                        float32x4_t _w5 = bfloat2float(vget_high_u16(_w45));
                        float32x4_t _w6 = bfloat2float(vget_low_u16(_w67));
                        float32x4_t _w7 = bfloat2float(vget_high_u16(_w67));
                        _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 0);
                        _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 1);
                        _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 1);
                        _sum4 = vfmaq_laneq_f32(_sum4, _w4, _r0, 2);
                        _sum5 = vfmaq_laneq_f32(_sum5, _w5, _r0, 2);
                        _sum6 = vfmaq_laneq_f32(_sum6, _w6, _r0, 3);
                        _sum7 = vfmaq_laneq_f32(_sum7, _w7, _r0, 3);

                        kptr += 32;
                    }
                }
                for (; q + 1 < inch; q += 2)
                {
                    const unsigned short* r0 = bottom_blob.channel(q).row<const unsigned short>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float val0;
                        float val1;
                        // if (elempack == 1)
                        {
                            val0 = bfloat16_to_float32(r0[sok]);
                            val1 = bfloat16_to_float32(r0[sok + N]);
                        }

                        uint16x8_t _w01 = vld1q_u16(kptr);
                        uint16x8_t _w23 = vld1q_u16(kptr + 8);
                        float32x4_t _w0 = bfloat2float(vget_low_u16(_w01));
                        float32x4_t _w1 = bfloat2float(vget_high_u16(_w01));
                        float32x4_t _w2 = bfloat2float(vget_low_u16(_w23));
                        float32x4_t _w3 = bfloat2float(vget_high_u16(_w23));
                        _sum0 = vfmaq_n_f32(_sum0, _w0, val0);
                        _sum1 = vfmaq_n_f32(_sum1, _w1, val0);
                        _sum2 = vfmaq_n_f32(_sum2, _w2, val1);
                        _sum3 = vfmaq_n_f32(_sum3, _w3, val1);

                        kptr += 16;
                    }
                }
                for (; q < inch; q++)
                {
                    const unsigned short* r0 = bottom_blob.channel(q).row<const unsigned short>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float32x4_t _val;
                        // if (elempack == 1)
                        {
                            _val = bfloat2float(vdup_n_u16(r0[space_ofs[k]]));
                        }

                        uint16x8_t _w = vld1q_u16(kptr);
                        float32x4_t _w0 = bfloat2float(vget_low_u16(_w));
                        float32x4_t _w1 = bfloat2float(vget_high_u16(_w));
                        _sum0 = vfmaq_f32(_sum0, _w0, _val);
                        _sum1 = vfmaq_f32(_sum1, _w1, _val);

                        kptr += 8;
                    }
                }

                _sum0 = vaddq_f32(_sum0, _sum2);
                _sum1 = vaddq_f32(_sum1, _sum3);
                _sum4 = vaddq_f32(_sum4, _sum6);
                _sum5 = vaddq_f32(_sum5, _sum7);
                _sum0 = vaddq_f32(_sum0, _sum4);
                _sum1 = vaddq_f32(_sum1, _sum5);

                _sum0 = activation_ps(_sum0, activation_type, activation_params);
                _sum1 = activation_ps(_sum1, activation_type, activation_params);

                if (out_elempack == 4)
                {
                    vst1_u16(outptr, float2bfloat(_sum0));
                    vst1_u16(outptr + M, float2bfloat(_sum1));
                    outptr += 4;
                }
                else // if (out_elempack == 1)
                {
                    uint16x4_t _sum0_u16 = float2bfloat(_sum0);
                    uint16x4_t _sum1_u16 = float2bfloat(_sum1);
                    outptr[0] = vget_lane_u16(_sum0_u16, 0);
                    outptr[M] = vget_lane_u16(_sum0_u16, 1);
                    outptr[M * 2] = vget_lane_u16(_sum0_u16, 2);
                    outptr[M * 3] = vget_lane_u16(_sum0_u16, 3);
                    outptr[M * 4] = vget_lane_u16(_sum1_u16, 0);
                    outptr[M * 5] = vget_lane_u16(_sum1_u16, 1);
                    outptr[M * 6] = vget_lane_u16(_sum1_u16, 2);
                    outptr[M * 7] = vget_lane_u16(_sum1_u16, 3);
                    outptr += 1;
                }
            }
        }
    }
    remain_outch_start += nn_outch * 8;
    nn_outch = (outch - remain_outch_start) / 4;
#else // __aarch64__
    nn_outch = (outch - remain_outch_start) / 4;
    #pragma omp parallel for num_threads(opt.num_threads)
#endif // __aarch64__
    for (int pp = 0; pp < nn_outch; pp++)
    {
        const int p = remain_outch_start + pp * 4;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inch = bottom_blob.c * elempack;
        const int outw = top_blob.w;
        const int outh = top_blob.h;
        const int out_elempack = top_blob.elempack;

        unsigned short* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
                float32x4_t _sum2 = vdupq_n_f32(0.f);
                float32x4_t _sum3 = vdupq_n_f32(0.f);

                if (bias_data_ptr)
                {
                    _sum0 = vld1q_f32(bias_data_ptr + p);
                }

#if __aarch64__
                const unsigned short* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4);
#else
                const unsigned short* kptr = weight_data_tm.channel(p / 4);
#endif

                int q = 0;
#if __aarch64__
                for (; q + 7 < inch; q += 8)
                {
                    const unsigned short* r0 = bottom_blob.channel(q / elempack).row<const unsigned short>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        float32x4_t _r1;
                        if (elempack == 4)
                        {
                            _r0 = bfloat2float(vld1_u16(r0 + sok));
                            _r1 = bfloat2float(vld1_u16(r0 + sok + N));
                        }
                        else // if (elempack == 1)
                        {
                            uint16x8_t _r_u16 = uint16x8_t();
                            _r_u16 = vsetq_lane_u16(r0[sok], _r_u16, 0);
                            _r_u16 = vsetq_lane_u16(r0[sok + N], _r_u16, 1);
                            _r_u16 = vsetq_lane_u16(r0[sok + N * 2], _r_u16, 2);
                            _r_u16 = vsetq_lane_u16(r0[sok + N * 3], _r_u16, 3);
                            _r_u16 = vsetq_lane_u16(r0[sok + N * 4], _r_u16, 4);
                            _r_u16 = vsetq_lane_u16(r0[sok + N * 5], _r_u16, 5);
                            _r_u16 = vsetq_lane_u16(r0[sok + N * 6], _r_u16, 6);
                            _r_u16 = vsetq_lane_u16(r0[sok + N * 7], _r_u16, 7);
                            _r0 = bfloat2float(vget_low_u16(_r_u16));
                            _r1 = bfloat2float(vget_high_u16(_r_u16));
                        }

                        uint16x8_t _w01 = vld1q_u16(kptr);
                        uint16x8_t _w23 = vld1q_u16(kptr + 8);
                        uint16x8_t _w45 = vld1q_u16(kptr + 16);
                        uint16x8_t _w67 = vld1q_u16(kptr + 24);
                        float32x4_t _w0 = bfloat2float(vget_low_u16(_w01));
                        float32x4_t _w1 = bfloat2float(vget_high_u16(_w01));
                        float32x4_t _w2 = bfloat2float(vget_low_u16(_w23));
                        float32x4_t _w3 = bfloat2float(vget_high_u16(_w23));
                        float32x4_t _w4 = bfloat2float(vget_low_u16(_w45));
                        float32x4_t _w5 = bfloat2float(vget_high_u16(_w45));
                        float32x4_t _w6 = bfloat2float(vget_low_u16(_w67));
                        float32x4_t _w7 = bfloat2float(vget_high_u16(_w67));
                        _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 1);
                        _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 2);
                        _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 3);
                        _sum0 = vfmaq_laneq_f32(_sum0, _w4, _r1, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w5, _r1, 1);
                        _sum2 = vfmaq_laneq_f32(_sum2, _w6, _r1, 2);
                        _sum3 = vfmaq_laneq_f32(_sum3, _w7, _r1, 3);

                        kptr += 32;
                    }
                }
#endif // __aarch64__
                for (; q + 3 < inch; q += 4)
                {
                    const unsigned short* r0 = bottom_blob.channel(q / elempack).row<const unsigned short>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        if (elempack == 4)
                        {
                            _r0 = bfloat2float(vld1_u16(r0 + sok));
                        }
                        else // if (elempack == 1)
                        {
                            uint16x4_t _r_u16 = uint16x4_t();
                            _r_u16 = vset_lane_u16(r0[sok], _r_u16, 0);
                            _r_u16 = vset_lane_u16(r0[sok + N], _r_u16, 1);
                            _r_u16 = vset_lane_u16(r0[sok + N * 2], _r_u16, 2);
                            _r_u16 = vset_lane_u16(r0[sok + N * 3], _r_u16, 3);
                            _r0 = bfloat2float(_r_u16);
                        }

                        uint16x8_t _w01 = vld1q_u16(kptr);
                        uint16x8_t _w23 = vld1q_u16(kptr + 8);
                        float32x4_t _w0 = bfloat2float(vget_low_u16(_w01));
                        float32x4_t _w1 = bfloat2float(vget_high_u16(_w01));
                        float32x4_t _w2 = bfloat2float(vget_low_u16(_w23));
                        float32x4_t _w3 = bfloat2float(vget_high_u16(_w23));
#if __aarch64__
                        _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 1);
                        _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 2);
                        _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 3);
#else
                        _sum0 = vmlaq_lane_f32(_sum0, _w0, vget_low_f32(_r0), 0);
                        _sum1 = vmlaq_lane_f32(_sum1, _w1, vget_low_f32(_r0), 1);
                        _sum2 = vmlaq_lane_f32(_sum2, _w2, vget_high_f32(_r0), 0);
                        _sum3 = vmlaq_lane_f32(_sum3, _w3, vget_high_f32(_r0), 1);
#endif

                        kptr += 16;
                    }
                }
                for (; q + 1 < inch; q += 2)
                {
                    const unsigned short* r0 = bottom_blob.channel(q).row<const unsigned short>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float val0;
                        float val1;
                        // if (elempack == 1)
                        {
                            val0 = bfloat16_to_float32(r0[sok]);
                            val1 = bfloat16_to_float32(r0[sok + N]);
                        }

                        uint16x8_t _w = vld1q_u16(kptr);
                        float32x4_t _w0 = bfloat2float(vget_low_u16(_w));
                        float32x4_t _w1 = bfloat2float(vget_high_u16(_w));
#if __aarch64__
                        _sum0 = vfmaq_n_f32(_sum0, _w0, val0);
                        _sum1 = vfmaq_n_f32(_sum1, _w1, val1);
#else
                        _sum0 = vmlaq_n_f32(_sum0, _w0, val0);
                        _sum1 = vmlaq_n_f32(_sum1, _w1, val1);
#endif

                        kptr += 8;
                    }
                }
                for (; q < inch; q++)
                {
                    const unsigned short* r0 = bottom_blob.channel(q).row<const unsigned short>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float32x4_t _val;
                        // if (elempack == 1)
                        {
                            _val = bfloat2float(vdup_n_u16(r0[space_ofs[k]]));
                        }

                        float32x4_t _w = bfloat2float(vld1_u16(kptr));
#if __aarch64__
                        _sum0 = vfmaq_f32(_sum0, _val, _w);
#else
                        _sum0 = vmlaq_f32(_sum0, _val, _w);
#endif

                        kptr += 4;
                    }
                }

                _sum0 = vaddq_f32(_sum0, _sum1);
                _sum2 = vaddq_f32(_sum2, _sum3);
                _sum0 = vaddq_f32(_sum0, _sum2);

                _sum0 = activation_ps(_sum0, activation_type, activation_params);

                if (out_elempack == 4)
                {
                    vst1_u16(outptr, float2bfloat(_sum0));
                    outptr += 4;
                }
                else // if (out_elempack == 1)
                {
                    uint16x4_t _sum0_u16 = float2bfloat(_sum0);
                    outptr[0] = vget_lane_u16(_sum0_u16, 0);
                    outptr[M] = vget_lane_u16(_sum0_u16, 1);
                    outptr[M * 2] = vget_lane_u16(_sum0_u16, 2);
                    outptr[M * 3] = vget_lane_u16(_sum0_u16, 3);
                    outptr += 1;
                }
            }
        }
    }
    remain_outch_start += nn_outch * 4;
    nn_outch = (outch - remain_outch_start) / 2;
#else // __ARM_NEON
    nn_outch = (outch - remain_outch_start) / 2;
    #pragma omp parallel for num_threads(opt.num_threads)
#endif // __ARM_NEON
    for (int pp = 0; pp < nn_outch; pp++)
    {
        const int p = remain_outch_start + pp * 2;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inch = bottom_blob.c * elempack;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        unsigned short* outptr0 = top_blob.channel(p);
        unsigned short* outptr1 = top_blob.channel(p + 1);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum0 = 0.f;
                float sum1 = 0.f;

                if (bias_data_ptr)
                {
                    sum0 = bias_data_ptr[p];
                    sum1 = bias_data_ptr[p + 1];
                }

#if __aarch64__
                const unsigned short* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2);
#elif __ARM_NEON
                const unsigned short* kptr = weight_data_tm.channel(p / 4 + (p % 4) / 2);
#else
                const unsigned short* kptr = weight_data_tm.channel(p / 2);
#endif

                int q = 0;
#if __ARM_NEON
#if __aarch64__
                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
                float32x4_t _sum2 = vdupq_n_f32(0.f);
                float32x4_t _sum3 = vdupq_n_f32(0.f);
                for (; q + 7 < inch; q += 8)
                {
                    const unsigned short* r0 = bottom_blob.channel(q / elempack).row<const unsigned short>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        float32x4_t _r1;
                        if (elempack == 4)
                        {
                            _r0 = bfloat2float(vld1_u16(r0 + sok));
                            _r1 = bfloat2float(vld1_u16(r0 + sok + N));
                        }
                        else // if (elempack == 1)
                        {
                            uint16x8_t _r01_u16 = uint16x8_t();
                            _r01_u16 = vsetq_lane_u16(r0[sok], _r01_u16, 0);
                            _r01_u16 = vsetq_lane_u16(r0[sok + N], _r01_u16, 1);
                            _r01_u16 = vsetq_lane_u16(r0[sok + N * 2], _r01_u16, 2);
                            _r01_u16 = vsetq_lane_u16(r0[sok + N * 3], _r01_u16, 3);
                            _r01_u16 = vsetq_lane_u16(r0[sok + N * 4], _r01_u16, 4);
                            _r01_u16 = vsetq_lane_u16(r0[sok + N * 5], _r01_u16, 5);
                            _r01_u16 = vsetq_lane_u16(r0[sok + N * 6], _r01_u16, 6);
                            _r01_u16 = vsetq_lane_u16(r0[sok + N * 7], _r01_u16, 7);
                            _r0 = bfloat2float(vget_low_u16(_r01_u16));
                            _r1 = bfloat2float(vget_high_u16(_r01_u16));
                        }

                        uint16x8_t _w01 = vld1q_u16(kptr);
                        uint16x8_t _w23 = vld1q_u16(kptr + 8);
                        float32x4_t _w0 = bfloat2float(vget_low_u16(_w01));
                        float32x4_t _w1 = bfloat2float(vget_high_u16(_w01));
                        float32x4_t _w2 = bfloat2float(vget_low_u16(_w23));
                        float32x4_t _w3 = bfloat2float(vget_high_u16(_w23));
                        _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                        _sum1 = vfmaq_f32(_sum1, _r1, _w1);
                        _sum2 = vfmaq_f32(_sum2, _r0, _w2);
                        _sum3 = vfmaq_f32(_sum3, _r1, _w3);

                        kptr += 16;
                    }
                }
                _sum0 = vaddq_f32(_sum0, _sum1);
                _sum2 = vaddq_f32(_sum2, _sum3);
                sum0 += vaddvq_f32(_sum0);
                sum1 += vaddvq_f32(_sum2);
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);
#else  // __aarch64__
                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
#endif // __aarch64__
                for (; q + 3 < inch; q += 4)
                {
                    const unsigned short* r0 = bottom_blob.channel(q / elempack).row<const unsigned short>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        if (elempack == 4)
                        {
                            _r0 = bfloat2float(vld1_u16(r0 + sok));
                        }
                        else // if (elempack == 1)
                        {
                            uint16x4_t _r0_u16 = uint16x4_t();
                            _r0_u16 = vset_lane_u16(r0[sok], _r0_u16, 0);
                            _r0_u16 = vset_lane_u16(r0[sok + N], _r0_u16, 1);
                            _r0_u16 = vset_lane_u16(r0[sok + N * 2], _r0_u16, 2);
                            _r0_u16 = vset_lane_u16(r0[sok + N * 3], _r0_u16, 3);
                            _r0 = bfloat2float(_r0_u16);
                        }

                        uint16x8_t _w = vld1q_u16(kptr);
                        float32x4_t _w0 = bfloat2float(vget_low_u16(_w));
                        float32x4_t _w1 = bfloat2float(vget_high_u16(_w));
#if __aarch64__
                        _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                        _sum1 = vfmaq_f32(_sum1, _r0, _w1);
#else
                        _sum0 = vmlaq_f32(_sum0, _r0, _w0);
                        _sum1 = vmlaq_f32(_sum1, _r0, _w1);
#endif

                        kptr += 8;
                    }
                }
#if __aarch64__
                sum0 += vaddvq_f32(_sum0);
                sum1 += vaddvq_f32(_sum1);
#else
                float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
                float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
                float32x2_t _ss = vpadd_f32(_ss0, _ss1);
                sum0 += vget_lane_f32(_ss, 0);
                sum1 += vget_lane_f32(_ss, 1);
#endif
#endif // __ARM_NEON
                for (; q + 1 < inch; q += 2)
                {
                    const unsigned short* r0 = bottom_blob.channel(q).row<const unsigned short>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float val0;
                        float val1;
                        // if (elempack == 1)
                        {
                            val0 = bfloat16_to_float32(r0[sok]);
                            val1 = bfloat16_to_float32(r0[sok + N]);
                        }

                        sum0 += val0 * bfloat16_to_float32(kptr[0]);
                        sum1 += val0 * bfloat16_to_float32(kptr[1]);
                        sum0 += val1 * bfloat16_to_float32(kptr[2]);
                        sum1 += val1 * bfloat16_to_float32(kptr[3]);

                        kptr += 4;
                    }
                }
                for (; q < inch; q++)
                {
                    const unsigned short* r0 = bottom_blob.channel(q).row<const unsigned short>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val;
                        // if (elempack == 1)
                        {
                            val = bfloat16_to_float32(r0[space_ofs[k]]);
                        }

                        sum0 += val * bfloat16_to_float32(kptr[0]);
                        sum1 += val * bfloat16_to_float32(kptr[1]);

                        kptr += 2;
                    }
                }

                sum0 = activation_ss(sum0, activation_type, activation_params);
                sum1 = activation_ss(sum1, activation_type, activation_params);

                outptr0[0] = float32_to_bfloat16(sum0);
                outptr1[0] = float32_to_bfloat16(sum1);
                outptr0 += 1;
                outptr1 += 1;
            }
        }
    }
    remain_outch_start += nn_outch * 2;
    for (int p = remain_outch_start; p < outch; p++)
    {
        unsigned short* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = 0.f;

                if (bias_data_ptr)
                {
                    sum = bias_data_ptr[p];
                }

#if __aarch64__
                const unsigned short* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2);
#elif __ARM_NEON
                const unsigned short* kptr = weight_data_tm.channel(p / 4 + (p % 4) / 2 + p % 2);
#else
                const unsigned short* kptr = weight_data_tm.channel(p / 2 + p % 2);
#endif

                int q = 0;
#if __ARM_NEON
#if __aarch64__
                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
                for (; q + 7 < inch; q += 8)
                {
                    const unsigned short* r0 = bottom_blob.channel(q / elempack).row<const unsigned short>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        float32x4_t _r1;
                        if (elempack == 4)
                        {
                            _r0 = bfloat2float(vld1_u16(r0 + sok));
                            _r1 = bfloat2float(vld1_u16(r0 + sok + N));
                        }
                        else // if (elempack == 1)
                        {
                            uint16x8_t _r01_u16 = uint16x8_t();
                            _r01_u16 = vsetq_lane_u16(r0[sok], _r01_u16, 0);
                            _r01_u16 = vsetq_lane_u16(r0[sok + N], _r01_u16, 1);
                            _r01_u16 = vsetq_lane_u16(r0[sok + N * 2], _r01_u16, 2);
                            _r01_u16 = vsetq_lane_u16(r0[sok + N * 3], _r01_u16, 3);
                            _r01_u16 = vsetq_lane_u16(r0[sok + N * 4], _r01_u16, 4);
                            _r01_u16 = vsetq_lane_u16(r0[sok + N * 5], _r01_u16, 5);
                            _r01_u16 = vsetq_lane_u16(r0[sok + N * 6], _r01_u16, 6);
                            _r01_u16 = vsetq_lane_u16(r0[sok + N * 7], _r01_u16, 7);
                            _r0 = bfloat2float(vget_low_u16(_r01_u16));
                            _r1 = bfloat2float(vget_high_u16(_r01_u16));
                        }

                        uint16x8_t _w = vld1q_u16(kptr);
                        float32x4_t _w0 = bfloat2float(vget_low_u16(_w));
                        float32x4_t _w1 = bfloat2float(vget_high_u16(_w));
                        _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                        _sum1 = vfmaq_f32(_sum1, _r1, _w1);

                        kptr += 8;
                    }
                }
                _sum0 = vaddq_f32(_sum0, _sum1);
                sum += vaddvq_f32(_sum0);
#endif // __aarch64__
                float32x4_t _sum = vdupq_n_f32(0.f);
                for (; q + 3 < inch; q += 4)
                {
                    const unsigned short* r0 = bottom_blob.channel(q / elempack).row<const unsigned short>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        if (elempack == 4)
                        {
                            _r0 = bfloat2float(vld1_u16(r0 + sok));
                        }
                        else // if (elempack == 1)
                        {
                            uint16x4_t _r0_u16 = uint16x4_t();
                            _r0_u16 = vset_lane_u16(r0[sok], _r0_u16, 0);
                            _r0_u16 = vset_lane_u16(r0[sok + N], _r0_u16, 1);
                            _r0_u16 = vset_lane_u16(r0[sok + N * 2], _r0_u16, 2);
                            _r0_u16 = vset_lane_u16(r0[sok + N * 3], _r0_u16, 3);
                            _r0 = bfloat2float(_r0_u16);
                        }

                        float32x4_t _w = bfloat2float(vld1_u16(kptr));
#if __aarch64__
                        _sum = vfmaq_f32(_sum, _r0, _w);
#else
                        _sum = vmlaq_f32(_sum, _r0, _w);
#endif

                        kptr += 4;
                    }
                }
#if __aarch64__
                sum += vaddvq_f32(_sum);
#else
                float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                _ss = vpadd_f32(_ss, _ss);
                sum += vget_lane_f32(_ss, 0);
#endif
#endif // __ARM_NEON
                for (; q + 1 < inch; q += 2)
                {
                    const unsigned short* r0 = bottom_blob.channel(q).row<const unsigned short>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float val0;
                        float val1;
                        // if (elempack == 1)
                        {
                            val0 = bfloat16_to_float32(r0[sok]);
                            val1 = bfloat16_to_float32(r0[sok + N]);
                        }

                        sum += val0 * bfloat16_to_float32(kptr[0]);
                        sum += val1 * bfloat16_to_float32(kptr[1]);

                        kptr += 2;
                    }
                }
                for (; q < inch; q++)
                {
                    const unsigned short* r0 = bottom_blob.channel(q).row<const unsigned short>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val;
                        // if (elempack == 1)
                        {
                            val = bfloat16_to_float32(r0[space_ofs[k]]);
                        }

                        sum += val * bfloat16_to_float32(kptr[0]);

                        kptr += 1;
                    }
                }

                sum = activation_ss(sum, activation_type, activation_params);

                outptr[0] = float32_to_bfloat16(sum);
                outptr += 1;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_packed_fp16s.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convolution_transform_kernel_packed_fp16s(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
    const int maxk = kernel_w * kernel_h;

    // src = kw-kh-inch-outch
    // dst = pb-pa-kw-kh-inch/pa-outch/pb

    // clang-format off
    // *INDENT-OFF*
    if (outch >= 8)
    {
        if (inch >= 8)
            kernel_tm.create(8 * 8 * maxk, inch / 8 + (inch % 8) / 4 + (inch % 4) / 2 + inch % 2, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2, (size_t)2u);
        else if (inch >= 4)
            kernel_tm.create(8 * 4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2, (size_t)2u);
        else if (inch >= 2)
            kernel_tm.create(8 * 2 * maxk, inch / 2 + inch % 2, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2, (size_t)2u);
        else
            kernel_tm.create(8 * maxk, inch, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2, (size_t)2u);
    }
    else if (outch >= 4)
    {
        if (inch >= 8)
            kernel_tm.create(4 * 8 * maxk, inch / 8 + (inch % 8) / 4 + (inch % 4) / 2 + inch % 2, outch / 4 + (outch % 4) / 2 + outch % 2, (size_t)2u);
        else if (inch >= 4)
            kernel_tm.create(4 * 4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch / 4 + (outch % 4) / 2 + outch % 2, (size_t)2u);
        else if (inch >= 2)
            kernel_tm.create(4 * 2 * maxk, inch / 2 + inch % 2, outch / 4 + (outch % 4) / 2 + outch % 2, (size_t)2u);
        else
            kernel_tm.create(4 * maxk, inch, outch / 4 + (outch % 4) / 2 + outch % 2, (size_t)2u);
    }
    else if (outch >= 2)
    {
        if (inch >= 8)
            kernel_tm.create(2 * 8 * maxk, inch / 8 + (inch % 8) / 4 + (inch % 4) / 2 + inch % 2, outch / 2 + outch % 2, (size_t)2u);
        else if (inch >= 4)
            kernel_tm.create(2 * 4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch / 2 + outch % 2, (size_t)2u);
        else if (inch >= 2)
            kernel_tm.create(2 * 2 * maxk, inch / 2 + inch % 2, outch / 2 + outch % 2, (size_t)2u);
        else
            kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2, (size_t)2u);
    }
    else
    {
        if (inch >= 8)
            kernel_tm.create(8 * maxk, inch / 8 + (inch % 8) / 4 + (inch % 4) / 2 + inch % 2, outch, (size_t)2u);
        else if (inch >= 4)
            kernel_tm.create(4 * maxk, inch / 4 + (inch % 4) / 2 + inch % 2, outch, (size_t)2u);
        else if (inch >= 2)
            kernel_tm.create(2 * maxk, inch / 2 + inch % 2, outch, (size_t)2u);
        else
            kernel_tm.create(maxk, inch, outch, (size_t)2u);
    }
    // *INDENT-ON*
    // clang-format on

    int q = 0;
    for (; q + 7 < outch; q += 8)
    {
        const float* kptr0 = (const float*)kernel + q * inch * maxk;
        const float* kptr1 = (const float*)kernel + (q + 1) * inch * maxk;
        const float* kptr2 = (const float*)kernel + (q + 2) * inch * maxk;
        const float* kptr3 = (const float*)kernel + (q + 3) * inch * maxk;
        const float* kptr4 = (const float*)kernel + (q + 4) * inch * maxk;
        const float* kptr5 = (const float*)kernel + (q + 5) * inch * maxk;
        const float* kptr6 = (const float*)kernel + (q + 6) * inch * maxk;
        const float* kptr7 = (const float*)kernel + (q + 7) * inch * maxk;

        __fp16* g00 = kernel_tm.channel(q / 8);

        int p = 0;
        for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;
                const float* k4 = kptr4 + p * maxk;
                const float* k5 = kptr5 + p * maxk;
                const float* k6 = kptr6 + p * maxk;
                const float* k7 = kptr7 + p * maxk;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    g00[1] = (__fp16)k1[k];
                    g00[2] = (__fp16)k2[k];
                    g00[3] = (__fp16)k3[k];
                    g00[4] = (__fp16)k4[k];
                    g00[5] = (__fp16)k5[k];
                    g00[6] = (__fp16)k6[k];
                    g00[7] = (__fp16)k7[k];
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    k4 += maxk;
                    k5 += maxk;
                    k6 += maxk;
                    k7 += maxk;
                    g00 += 8;
                }
            }
        }
        for (; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;
                const float* k4 = kptr4 + p * maxk;
                const float* k5 = kptr5 + p * maxk;
                const float* k6 = kptr6 + p * maxk;
                const float* k7 = kptr7 + p * maxk;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    g00[1] = (__fp16)k1[k];
                    g00[2] = (__fp16)k2[k];
                    g00[3] = (__fp16)k3[k];
                    g00[4] = (__fp16)k4[k];
                    g00[5] = (__fp16)k5[k];
                    g00[6] = (__fp16)k6[k];
                    g00[7] = (__fp16)k7[k];
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    k4 += maxk;
                    k5 += maxk;
                    k6 += maxk;
                    k7 += maxk;
                    g00 += 8;
                }
            }
        }
        for (; p + 1 < inch; p += 2)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;
                const float* k4 = kptr4 + p * maxk;
                const float* k5 = kptr5 + p * maxk;
                const float* k6 = kptr6 + p * maxk;
                const float* k7 = kptr7 + p * maxk;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    g00[1] = (__fp16)k1[k];
                    g00[2] = (__fp16)k2[k];
                    g00[3] = (__fp16)k3[k];
                    g00[4] = (__fp16)k4[k];
                    g00[5] = (__fp16)k5[k];
                    g00[6] = (__fp16)k6[k];
                    g00[7] = (__fp16)k7[k];
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    k4 += maxk;
                    k5 += maxk;
                    k6 += maxk;
                    k7 += maxk;
                    g00 += 8;
                }
            }
        }
        for (; p < inch; p++)
        {
            const float* k0 = kptr0 + p * maxk;
            const float* k1 = kptr1 + p * maxk;
            const float* k2 = kptr2 + p * maxk;
            const float* k3 = kptr3 + p * maxk;
            const float* k4 = kptr4 + p * maxk;
            const float* k5 = kptr5 + p * maxk;
            const float* k6 = kptr6 + p * maxk;
            const float* k7 = kptr7 + p * maxk;

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = (__fp16)k0[k];
                g00[1] = (__fp16)k1[k];
                g00[2] = (__fp16)k2[k];
                g00[3] = (__fp16)k3[k];
                g00[4] = (__fp16)k4[k];
                g00[5] = (__fp16)k5[k];
                g00[6] = (__fp16)k6[k];
                g00[7] = (__fp16)k7[k];
                g00 += 8;
            }
        }
    }
    for (; q + 3 < outch; q += 4)
    {
        const float* kptr0 = (const float*)kernel + q * inch * maxk;
        const float* kptr1 = (const float*)kernel + (q + 1) * inch * maxk;
        const float* kptr2 = (const float*)kernel + (q + 2) * inch * maxk;
        const float* kptr3 = (const float*)kernel + (q + 3) * inch * maxk;

        __fp16* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4);

        int p = 0;
        for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    g00[1] = (__fp16)k1[k];
                    g00[2] = (__fp16)k2[k];
                    g00[3] = (__fp16)k3[k];
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    g00 += 4;
                }
            }
        }
        for (; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    g00[1] = (__fp16)k1[k];
                    g00[2] = (__fp16)k2[k];
                    g00[3] = (__fp16)k3[k];
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    g00 += 4;
                }
            }
        }
        for (; p + 1 < inch; p += 2)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;
                const float* k2 = kptr2 + p * maxk;
                const float* k3 = kptr3 + p * maxk;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    g00[1] = (__fp16)k1[k];
                    g00[2] = (__fp16)k2[k];
                    g00[3] = (__fp16)k3[k];
                    k0 += maxk;
                    k1 += maxk;
                    k2 += maxk;
                    k3 += maxk;
                    g00 += 4;
                }
            }
        }
        for (; p < inch; p++)
        {
            const float* k0 = kptr0 + p * maxk;
            const float* k1 = kptr1 + p * maxk;
            const float* k2 = kptr2 + p * maxk;
            const float* k3 = kptr3 + p * maxk;

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = (__fp16)k0[k];
                g00[1] = (__fp16)k1[k];
                g00[2] = (__fp16)k2[k];
                g00[3] = (__fp16)k3[k];
                g00 += 4;
            }
        }
    }
    for (; q + 1 < outch; q += 2)
    {
        const float* kptr0 = (const float*)kernel + q * inch * maxk;
        const float* kptr1 = (const float*)kernel + (q + 1) * inch * maxk;

        __fp16* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2);

        int p = 0;
        for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk + k;
                const float* k1 = kptr1 + p * maxk + k;

                g00[0] = (__fp16)k0[0];
                g00[1] = (__fp16)k0[maxk];
                g00[2] = (__fp16)k0[maxk * 2];
                g00[3] = (__fp16)k0[maxk * 3];
                g00[4] = (__fp16)k0[maxk * 4];
                g00[5] = (__fp16)k0[maxk * 5];
                g00[6] = (__fp16)k0[maxk * 6];
                g00[7] = (__fp16)k0[maxk * 7];
                g00[8] = (__fp16)k1[0];
                g00[9] = (__fp16)k1[maxk];
                g00[10] = (__fp16)k1[maxk * 2];
                g00[11] = (__fp16)k1[maxk * 3];
                g00[12] = (__fp16)k1[maxk * 4];
                g00[13] = (__fp16)k1[maxk * 5];
                g00[14] = (__fp16)k1[maxk * 6];
                g00[15] = (__fp16)k1[maxk * 7];
                g00 += 16;
            }
        }
        for (; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk + k;
                const float* k1 = kptr1 + p * maxk + k;

                g00[0] = (__fp16)k0[0];
                g00[1] = (__fp16)k0[maxk];
                g00[2] = (__fp16)k0[maxk * 2];
                g00[3] = (__fp16)k0[maxk * 3];
                g00[4] = (__fp16)k1[0];
                g00[5] = (__fp16)k1[maxk];
                g00[6] = (__fp16)k1[maxk * 2];
                g00[7] = (__fp16)k1[maxk * 3];
                g00 += 8;
            }
        }
        for (; p + 1 < inch; p += 2)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr0 + p * maxk;
                const float* k1 = kptr1 + p * maxk;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    g00[1] = (__fp16)k1[k];
                    k0 += maxk;
                    k1 += maxk;
                    g00 += 2;
                }
            }
        }
        for (; p < inch; p++)
        {
            const float* k0 = kptr0 + p * maxk;
            const float* k1 = kptr1 + p * maxk;

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = (__fp16)k0[k];
                g00[1] = (__fp16)k1[k];
                g00 += 2;
            }
        }
    }
    for (; q < outch; q++)
    {
        const float* kptr = (const float*)kernel + q * inch * maxk;

        __fp16* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2 + q % 2);

        int p = 0;
        for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr + p * maxk;

                for (int i = 0; i < 8; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    k0 += maxk;
                    g00 += 1;
                }
            }
        }
        for (; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr + p * maxk;

                for (int i = 0; i < 4; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    k0 += maxk;
                    g00 += 1;
                }
            }
        }
        for (; p + 1 < inch; p += 2)
        {
            for (int k = 0; k < maxk; k++)
            {
                const float* k0 = kptr + p * maxk;

                for (int i = 0; i < 2; i++)
                {
                    g00[0] = (__fp16)k0[k];
                    k0 += maxk;
                    g00 += 1;
                }
            }
        }
        for (; p < inch; p++)
        {
            const float* k0 = kptr + p * maxk;

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = (__fp16)k0[k];
                g00++;
            }
        }
    }
}

// Direct 2-D convolution on fp16 storage with fp32 accumulation (NEON).
//
// Input samples and pre-packed weights are read as __fp16, widened to float32
// before every multiply-accumulate, and the activated result is narrowed back
// to __fp16 when it is stored into top_blob.
//
// bottom_blob        input; handled as elempack 4, anything else is treated as elempack 1.
//                    Sampled positions are not bounds-checked here.
//                    NOTE(review): presumably the caller has already applied padding - confirm.
// top_blob           output; its w, h, c and elempack are read here and it is never allocated
//                    in this function, only its data is written.
// weight_data_tm     fp16 weights; one channel is read per group of 8, 4, 2 or 1 output channels
//                    (channel index p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2) and each channel is
//                    consumed strictly sequentially through kptr.
// bias_data          optional; read as float, indexed by output channel; skipped when the pointer is null.
// kernel_w/kernel_h, dilation_w/dilation_h, stride_w/stride_h
//                    convolution geometry.
// activation_type, activation_params
//                    forwarded unchanged to activation_ps (vector paths) / activation_ss (scalar paths).
// opt                only opt.num_threads is used, as the OpenMP thread count of the 8-channel loop.
//
// Output channels are processed in groups of 8, then 4, then 2, then 1. Inside each group the
// input channels are reduced in steps of 8, 4, 2 and 1.
static void convolution_packed_fp16s(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int w = bottom_blob.w;
    const int elempack = bottom_blob.elempack;
    // total number of scalar input channels
    const int inch = bottom_blob.c * elempack;

    // offset, in __fp16 elements, added to r0 to step to the next input channel
    // (elempack 1) or to the next pack of four input channels (elempack 4)
    const size_t N = bottom_blob.cstep * elempack;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int out_elempack = top_blob.elempack;
    // total number of scalar output channels
    const int outch = top_blob.c * out_elempack;

    // offset, in __fp16 elements, added to outptr to step to the next output channel
    // (out_elempack 1) or to the next pack of four output channels (out_elempack 4)
    const size_t M = top_blob.cstep * out_elempack;

    // number of kernel taps
    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k] is the offset, in __fp16 elements, from the top-left tap to tap k
    // inside one input channel; gap jumps from the end of one dilated kernel row to
    // the start of the next one.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2 * elempack;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // bias is kept in float; null when there is no bias
    const float* bias_data_ptr = bias_data;

    int nn_outch = 0;
    int remain_outch_start = 0;
    // ---- groups of 8 output channels (the only loop that is parallelised) ----
    nn_outch = (outch - remain_outch_start) / 8;
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        const int p = remain_outch_start + pp * 8;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inch = bottom_blob.c * elempack;
        const int outw = top_blob.w;
        const int outh = top_blob.h;
        const int out_elempack = top_blob.elempack;

        __fp16* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // Even-numbered accumulators collect output channels p..p+3 and
                // odd-numbered ones p+4..p+7; they are folded into _sum0/_sum1
                // after the input-channel loops.
                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
                float32x4_t _sum2 = vdupq_n_f32(0.f);
                float32x4_t _sum3 = vdupq_n_f32(0.f);
                float32x4_t _sum4 = vdupq_n_f32(0.f);
                float32x4_t _sum5 = vdupq_n_f32(0.f);
                float32x4_t _sum6 = vdupq_n_f32(0.f);
                float32x4_t _sum7 = vdupq_n_f32(0.f);

                if (bias_data_ptr)
                {
                    _sum0 = vld1q_f32(bias_data_ptr + p);
                    _sum1 = vld1q_f32(bias_data_ptr + p + 4);
                }

                const __fp16* kptr = weight_data_tm.channel(p / 8);

                int q = 0;
                // 8 input channels per step: 64 weights per tap, stored as
                // 8 output-channel weights for each of the 8 input channels
                for (; q + 7 < inch; q += 8)
                {
                    const __fp16* r0 = bottom_blob.channel(q / elempack).row<const __fp16>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        // _r0 holds input channels q..q+3, _r1 holds q+4..q+7
                        float32x4_t _r0;
                        float32x4_t _r1;
                        if (elempack == 4)
                        {
                            _r0 = vcvt_f32_f16(vld1_f16(r0 + sok));
                            _r1 = vcvt_f32_f16(vld1_f16(r0 + sok + N));
                        }
                        else // if (elempack == 1)
                        {
                            // gather one sample from each of 8 separate channel planes
                            float16x8_t _r_f16 = float16x8_t();
                            _r_f16 = vsetq_lane_f16(r0[sok], _r_f16, 0);
                            _r_f16 = vsetq_lane_f16(r0[sok + N], _r_f16, 1);
                            _r_f16 = vsetq_lane_f16(r0[sok + N * 2], _r_f16, 2);
                            _r_f16 = vsetq_lane_f16(r0[sok + N * 3], _r_f16, 3);
                            _r_f16 = vsetq_lane_f16(r0[sok + N * 4], _r_f16, 4);
                            _r_f16 = vsetq_lane_f16(r0[sok + N * 5], _r_f16, 5);
                            _r_f16 = vsetq_lane_f16(r0[sok + N * 6], _r_f16, 6);
                            _r_f16 = vsetq_lane_f16(r0[sok + N * 7], _r_f16, 7);
                            _r0 = vcvt_f32_f16(vget_low_f16(_r_f16));
                            _r1 = vcvt_f32_f16(vget_high_f16(_r_f16));
                        }

                        float16x8_t _w01 = vld1q_f16(kptr);
                        float16x8_t _w23 = vld1q_f16(kptr + 8);
                        float16x8_t _w45 = vld1q_f16(kptr + 16);
                        float16x8_t _w67 = vld1q_f16(kptr + 24);
                        float16x8_t _w89 = vld1q_f16(kptr + 32);
                        float16x8_t _wab = vld1q_f16(kptr + 40);
                        float16x8_t _wcd = vld1q_f16(kptr + 48);
                        float16x8_t _wef = vld1q_f16(kptr + 56);
                        float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w01));
                        float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w01));
                        float32x4_t _w2 = vcvt_f32_f16(vget_low_f16(_w23));
                        float32x4_t _w3 = vcvt_f32_f16(vget_high_f16(_w23));
                        float32x4_t _w4 = vcvt_f32_f16(vget_low_f16(_w45));
                        float32x4_t _w5 = vcvt_f32_f16(vget_high_f16(_w45));
                        float32x4_t _w6 = vcvt_f32_f16(vget_low_f16(_w67));
                        float32x4_t _w7 = vcvt_f32_f16(vget_high_f16(_w67));
                        float32x4_t _w8 = vcvt_f32_f16(vget_low_f16(_w89));
                        float32x4_t _w9 = vcvt_f32_f16(vget_high_f16(_w89));
                        float32x4_t _wa = vcvt_f32_f16(vget_low_f16(_wab));
                        float32x4_t _wb = vcvt_f32_f16(vget_high_f16(_wab));
                        float32x4_t _wc = vcvt_f32_f16(vget_low_f16(_wcd));
                        float32x4_t _wd = vcvt_f32_f16(vget_high_f16(_wcd));
                        float32x4_t _we = vcvt_f32_f16(vget_low_f16(_wef));
                        float32x4_t _wf = vcvt_f32_f16(vget_high_f16(_wef));
                        // each pair of FMAs applies one input channel (one lane of
                        // _r0/_r1) to the low and high halves of the 8 output channels
                        _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 0);
                        _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 1);
                        _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 1);
                        _sum4 = vfmaq_laneq_f32(_sum4, _w4, _r0, 2);
                        _sum5 = vfmaq_laneq_f32(_sum5, _w5, _r0, 2);
                        _sum6 = vfmaq_laneq_f32(_sum6, _w6, _r0, 3);
                        _sum7 = vfmaq_laneq_f32(_sum7, _w7, _r0, 3);
                        _sum0 = vfmaq_laneq_f32(_sum0, _w8, _r1, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w9, _r1, 0);
                        _sum2 = vfmaq_laneq_f32(_sum2, _wa, _r1, 1);
                        _sum3 = vfmaq_laneq_f32(_sum3, _wb, _r1, 1);
                        _sum4 = vfmaq_laneq_f32(_sum4, _wc, _r1, 2);
                        _sum5 = vfmaq_laneq_f32(_sum5, _wd, _r1, 2);
                        _sum6 = vfmaq_laneq_f32(_sum6, _we, _r1, 3);
                        _sum7 = vfmaq_laneq_f32(_sum7, _wf, _r1, 3);

                        kptr += 64;
                    }
                }
                // 4 input channels per step: 32 weights per tap
                for (; q + 3 < inch; q += 4)
                {
                    const __fp16* r0 = bottom_blob.channel(q / elempack).row<const __fp16>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        if (elempack == 4)
                        {
                            _r0 = vcvt_f32_f16(vld1_f16(r0 + sok));
                        }
                        else // if (elempack == 1)
                        {
                            float16x4_t _r_f16 = float16x4_t();
                            _r_f16 = vset_lane_f16(r0[sok], _r_f16, 0);
                            _r_f16 = vset_lane_f16(r0[sok + N], _r_f16, 1);
                            _r_f16 = vset_lane_f16(r0[sok + N * 2], _r_f16, 2);
                            _r_f16 = vset_lane_f16(r0[sok + N * 3], _r_f16, 3);
                            _r0 = vcvt_f32_f16(_r_f16);
                        }

                        float16x8_t _w01 = vld1q_f16(kptr);
                        float16x8_t _w23 = vld1q_f16(kptr + 8);
                        float16x8_t _w45 = vld1q_f16(kptr + 16);
                        float16x8_t _w67 = vld1q_f16(kptr + 24);
                        float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w01));
                        float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w01));
                        float32x4_t _w2 = vcvt_f32_f16(vget_low_f16(_w23));
                        float32x4_t _w3 = vcvt_f32_f16(vget_high_f16(_w23));
                        float32x4_t _w4 = vcvt_f32_f16(vget_low_f16(_w45));
                        float32x4_t _w5 = vcvt_f32_f16(vget_high_f16(_w45));
                        float32x4_t _w6 = vcvt_f32_f16(vget_low_f16(_w67));
                        float32x4_t _w7 = vcvt_f32_f16(vget_high_f16(_w67));
                        _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 0);
                        _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 1);
                        _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 1);
                        _sum4 = vfmaq_laneq_f32(_sum4, _w4, _r0, 2);
                        _sum5 = vfmaq_laneq_f32(_sum5, _w5, _r0, 2);
                        _sum6 = vfmaq_laneq_f32(_sum6, _w6, _r0, 3);
                        _sum7 = vfmaq_laneq_f32(_sum7, _w7, _r0, 3);

                        kptr += 32;
                    }
                }
                // 2 input channels per step: 16 weights per tap.
                // r0 is addressed without any elempack factor from here on.
                for (; q + 1 < inch; q += 2)
                {
                    const __fp16* r0 = bottom_blob.channel(q).row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float val0;
                        float val1;
                        // if (elempack == 1)
                        {
                            val0 = (float)(r0[sok]);
                            val1 = (float)(r0[sok + N]);
                        }

                        float16x8_t _w01 = vld1q_f16(kptr);
                        float16x8_t _w23 = vld1q_f16(kptr + 8);
                        float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w01));
                        float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w01));
                        float32x4_t _w2 = vcvt_f32_f16(vget_low_f16(_w23));
                        float32x4_t _w3 = vcvt_f32_f16(vget_high_f16(_w23));
                        _sum0 = vfmaq_n_f32(_sum0, _w0, val0);
                        _sum1 = vfmaq_n_f32(_sum1, _w1, val0);
                        _sum2 = vfmaq_n_f32(_sum2, _w2, val1);
                        _sum3 = vfmaq_n_f32(_sum3, _w3, val1);

                        kptr += 16;
                    }
                }
                // last single input channel: 8 weights per tap
                for (; q < inch; q++)
                {
                    const __fp16* r0 = bottom_blob.channel(q).row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float32x4_t _val;
                        // if (elempack == 1)
                        {
                            _val = vcvt_f32_f16(vdup_n_f16(r0[space_ofs[k]]));
                        }

                        float16x8_t _w = vld1q_f16(kptr);
                        float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w));
                        float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w));
                        _sum0 = vfmaq_f32(_sum0, _w0, _val);
                        _sum1 = vfmaq_f32(_sum1, _w1, _val);

                        kptr += 8;
                    }
                }

                // fold the partial sums: _sum0 = channels p..p+3, _sum1 = channels p+4..p+7
                _sum0 = vaddq_f32(_sum0, _sum2);
                _sum1 = vaddq_f32(_sum1, _sum3);
                _sum4 = vaddq_f32(_sum4, _sum6);
                _sum5 = vaddq_f32(_sum5, _sum7);
                _sum0 = vaddq_f32(_sum0, _sum4);
                _sum1 = vaddq_f32(_sum1, _sum5);

                _sum0 = activation_ps(_sum0, activation_type, activation_params);
                _sum1 = activation_ps(_sum1, activation_type, activation_params);

                if (out_elempack == 4)
                {
                    // two packs of four channels, M elements apart
                    vst1_f16(outptr, vcvt_f16_f32(_sum0));
                    vst1_f16(outptr + M, vcvt_f16_f32(_sum1));
                    outptr += 4;
                }
                else // if (out_elempack == 1)
                {
                    // scatter the eight results to eight separate channel planes
                    float16x4_t _sum0_f16 = vcvt_f16_f32(_sum0);
                    float16x4_t _sum1_f16 = vcvt_f16_f32(_sum1);
                    outptr[0] = vget_lane_f16(_sum0_f16, 0);
                    outptr[M] = vget_lane_f16(_sum0_f16, 1);
                    outptr[M * 2] = vget_lane_f16(_sum0_f16, 2);
                    outptr[M * 3] = vget_lane_f16(_sum0_f16, 3);
                    outptr[M * 4] = vget_lane_f16(_sum1_f16, 0);
                    outptr[M * 5] = vget_lane_f16(_sum1_f16, 1);
                    outptr[M * 6] = vget_lane_f16(_sum1_f16, 2);
                    outptr[M * 7] = vget_lane_f16(_sum1_f16, 3);
                    outptr += 1;
                }
            }
        }
    }
    // ---- group of 4 output channels ----
    // Fewer than 8 channels remain here, so nn_outch is 0 or 1; this loop carries
    // no OpenMP pragma.
    remain_outch_start += nn_outch * 8;
    nn_outch = (outch - remain_outch_start) / 4;
    for (int pp = 0; pp < nn_outch; pp++)
    {
        const int p = remain_outch_start + pp * 4;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inch = bottom_blob.c * elempack;
        const int outw = top_blob.w;
        const int outh = top_blob.h;
        const int out_elempack = top_blob.elempack;

        __fp16* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // all four accumulators are partial sums of the same 4 output
                // channels p..p+3; only _sum0 is seeded with the bias
                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
                float32x4_t _sum2 = vdupq_n_f32(0.f);
                float32x4_t _sum3 = vdupq_n_f32(0.f);

                if (bias_data_ptr)
                {
                    _sum0 = vld1q_f32(bias_data_ptr + p);
                }

                const __fp16* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4);

                int q = 0;
                // 8 input channels per step: 32 weights per tap, stored as
                // 4 output-channel weights for each of the 8 input channels
                for (; q + 7 < inch; q += 8)
                {
                    const __fp16* r0 = bottom_blob.channel(q / elempack).row<const __fp16>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        float32x4_t _r1;
                        if (elempack == 4)
                        {
                            _r0 = vcvt_f32_f16(vld1_f16(r0 + sok));
                            _r1 = vcvt_f32_f16(vld1_f16(r0 + sok + N));
                        }
                        else // if (elempack == 1)
                        {
                            float16x8_t _r_f16 = float16x8_t();
                            _r_f16 = vsetq_lane_f16(r0[sok], _r_f16, 0);
                            _r_f16 = vsetq_lane_f16(r0[sok + N], _r_f16, 1);
                            _r_f16 = vsetq_lane_f16(r0[sok + N * 2], _r_f16, 2);
                            _r_f16 = vsetq_lane_f16(r0[sok + N * 3], _r_f16, 3);
                            _r_f16 = vsetq_lane_f16(r0[sok + N * 4], _r_f16, 4);
                            _r_f16 = vsetq_lane_f16(r0[sok + N * 5], _r_f16, 5);
                            _r_f16 = vsetq_lane_f16(r0[sok + N * 6], _r_f16, 6);
                            _r_f16 = vsetq_lane_f16(r0[sok + N * 7], _r_f16, 7);
                            _r0 = vcvt_f32_f16(vget_low_f16(_r_f16));
                            _r1 = vcvt_f32_f16(vget_high_f16(_r_f16));
                        }

                        float16x8_t _w01 = vld1q_f16(kptr);
                        float16x8_t _w23 = vld1q_f16(kptr + 8);
                        float16x8_t _w45 = vld1q_f16(kptr + 16);
                        float16x8_t _w67 = vld1q_f16(kptr + 24);
                        float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w01));
                        float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w01));
                        float32x4_t _w2 = vcvt_f32_f16(vget_low_f16(_w23));
                        float32x4_t _w3 = vcvt_f32_f16(vget_high_f16(_w23));
                        float32x4_t _w4 = vcvt_f32_f16(vget_low_f16(_w45));
                        float32x4_t _w5 = vcvt_f32_f16(vget_high_f16(_w45));
                        float32x4_t _w6 = vcvt_f32_f16(vget_low_f16(_w67));
                        float32x4_t _w7 = vcvt_f32_f16(vget_high_f16(_w67));
                        _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 1);
                        _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 2);
                        _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 3);
                        _sum0 = vfmaq_laneq_f32(_sum0, _w4, _r1, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w5, _r1, 1);
                        _sum2 = vfmaq_laneq_f32(_sum2, _w6, _r1, 2);
                        _sum3 = vfmaq_laneq_f32(_sum3, _w7, _r1, 3);

                        kptr += 32;
                    }
                }
                // 4 input channels per step: 16 weights per tap
                for (; q + 3 < inch; q += 4)
                {
                    const __fp16* r0 = bottom_blob.channel(q / elempack).row<const __fp16>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        if (elempack == 4)
                        {
                            _r0 = vcvt_f32_f16(vld1_f16(r0 + sok));
                        }
                        else // if (elempack == 1)
                        {
                            float16x4_t _r_f16 = float16x4_t();
                            _r_f16 = vset_lane_f16(r0[sok], _r_f16, 0);
                            _r_f16 = vset_lane_f16(r0[sok + N], _r_f16, 1);
                            _r_f16 = vset_lane_f16(r0[sok + N * 2], _r_f16, 2);
                            _r_f16 = vset_lane_f16(r0[sok + N * 3], _r_f16, 3);
                            _r0 = vcvt_f32_f16(_r_f16);
                        }

                        float16x8_t _w01 = vld1q_f16(kptr);
                        float16x8_t _w23 = vld1q_f16(kptr + 8);
                        float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w01));
                        float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w01));
                        float32x4_t _w2 = vcvt_f32_f16(vget_low_f16(_w23));
                        float32x4_t _w3 = vcvt_f32_f16(vget_high_f16(_w23));
                        _sum0 = vfmaq_laneq_f32(_sum0, _w0, _r0, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w1, _r0, 1);
                        _sum2 = vfmaq_laneq_f32(_sum2, _w2, _r0, 2);
                        _sum3 = vfmaq_laneq_f32(_sum3, _w3, _r0, 3);

                        kptr += 16;
                    }
                }
                // 2 input channels per step: 8 weights per tap
                for (; q + 1 < inch; q += 2)
                {
                    const __fp16* r0 = bottom_blob.channel(q).row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float val0;
                        float val1;
                        // if (elempack == 1)
                        {
                            val0 = (float)(r0[sok]);
                            val1 = (float)(r0[sok + N]);
                        }

                        float16x8_t _w = vld1q_f16(kptr);
                        float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w));
                        float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w));
                        _sum0 = vfmaq_n_f32(_sum0, _w0, val0);
                        _sum1 = vfmaq_n_f32(_sum1, _w1, val1);

                        kptr += 8;
                    }
                }
                // last single input channel: 4 weights per tap
                for (; q < inch; q++)
                {
                    const __fp16* r0 = bottom_blob.channel(q).row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float32x4_t _val;
                        // if (elempack == 1)
                        {
                            _val = vcvt_f32_f16(vdup_n_f16(r0[space_ofs[k]]));
                        }

                        float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
                        _sum0 = vfmaq_f32(_sum0, _val, _w);

                        kptr += 4;
                    }
                }

                // fold the four partial sums into _sum0
                _sum0 = vaddq_f32(_sum0, _sum1);
                _sum2 = vaddq_f32(_sum2, _sum3);
                _sum0 = vaddq_f32(_sum0, _sum2);

                _sum0 = activation_ps(_sum0, activation_type, activation_params);

                if (out_elempack == 4)
                {
                    vst1_f16(outptr, vcvt_f16_f32(_sum0));
                    outptr += 4;
                }
                else // if (out_elempack == 1)
                {
                    float16x4_t _sum0_f16 = vcvt_f16_f32(_sum0);
                    outptr[0] = vget_lane_f16(_sum0_f16, 0);
                    outptr[M] = vget_lane_f16(_sum0_f16, 1);
                    outptr[M * 2] = vget_lane_f16(_sum0_f16, 2);
                    outptr[M * 3] = vget_lane_f16(_sum0_f16, 3);
                    outptr += 1;
                }
            }
        }
    }
    // ---- group of 2 output channels ----
    // Fewer than 4 channels remain here, so nn_outch is 0 or 1. The two outputs are
    // written through top_blob.channel(p) and top_blob.channel(p + 1) directly.
    remain_outch_start += nn_outch * 4;
    nn_outch = (outch - remain_outch_start) / 2;
    for (int pp = 0; pp < nn_outch; pp++)
    {
        const int p = remain_outch_start + pp * 2;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inch = bottom_blob.c * elempack;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        __fp16* outptr0 = top_blob.channel(p);
        __fp16* outptr1 = top_blob.channel(p + 1);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // scalar running totals for output channels p and p + 1
                float sum0 = 0.f;
                float sum1 = 0.f;

                if (bias_data_ptr)
                {
                    sum0 = bias_data_ptr[p];
                    sum1 = bias_data_ptr[p + 1];
                }

                const __fp16* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2);

                int q = 0;
                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
                float32x4_t _sum2 = vdupq_n_f32(0.f);
                float32x4_t _sum3 = vdupq_n_f32(0.f);
                // 8 input channels per step: 16 weights per tap, the first 8 belong
                // to output channel p and the next 8 to output channel p + 1.
                // Here the vector lanes run over input channels, so the vectors are
                // reduced horizontally after the loop.
                for (; q + 7 < inch; q += 8)
                {
                    const __fp16* r0 = bottom_blob.channel(q / elempack).row<const __fp16>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        float32x4_t _r1;
                        if (elempack == 4)
                        {
                            _r0 = vcvt_f32_f16(vld1_f16(r0 + sok));
                            _r1 = vcvt_f32_f16(vld1_f16(r0 + sok + N));
                        }
                        else // if (elempack == 1)
                        {
                            float16x8_t _r01_f16 = float16x8_t();
                            _r01_f16 = vsetq_lane_f16(r0[sok], _r01_f16, 0);
                            _r01_f16 = vsetq_lane_f16(r0[sok + N], _r01_f16, 1);
                            _r01_f16 = vsetq_lane_f16(r0[sok + N * 2], _r01_f16, 2);
                            _r01_f16 = vsetq_lane_f16(r0[sok + N * 3], _r01_f16, 3);
                            _r01_f16 = vsetq_lane_f16(r0[sok + N * 4], _r01_f16, 4);
                            _r01_f16 = vsetq_lane_f16(r0[sok + N * 5], _r01_f16, 5);
                            _r01_f16 = vsetq_lane_f16(r0[sok + N * 6], _r01_f16, 6);
                            _r01_f16 = vsetq_lane_f16(r0[sok + N * 7], _r01_f16, 7);
                            _r0 = vcvt_f32_f16(vget_low_f16(_r01_f16));
                            _r1 = vcvt_f32_f16(vget_high_f16(_r01_f16));
                        }

                        float16x8_t _w01 = vld1q_f16(kptr);
                        float16x8_t _w23 = vld1q_f16(kptr + 8);
                        float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w01));
                        float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w01));
                        float32x4_t _w2 = vcvt_f32_f16(vget_low_f16(_w23));
                        float32x4_t _w3 = vcvt_f32_f16(vget_high_f16(_w23));
                        _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                        _sum1 = vfmaq_f32(_sum1, _r1, _w1);
                        _sum2 = vfmaq_f32(_sum2, _r0, _w2);
                        _sum3 = vfmaq_f32(_sum3, _r1, _w3);

                        kptr += 16;
                    }
                }
                // horizontal reduction of the 8-channel partial sums, then reuse
                // _sum0/_sum1 as fresh accumulators for the 4-channel loop
                _sum0 = vaddq_f32(_sum0, _sum1);
                _sum2 = vaddq_f32(_sum2, _sum3);
                sum0 += vaddvq_f32(_sum0);
                sum1 += vaddvq_f32(_sum2);
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);
                // 4 input channels per step: 8 weights per tap (4 for p, then 4 for p + 1)
                for (; q + 3 < inch; q += 4)
                {
                    const __fp16* r0 = bottom_blob.channel(q / elempack).row<const __fp16>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        if (elempack == 4)
                        {
                            _r0 = vcvt_f32_f16(vld1_f16(r0 + sok));
                        }
                        else // if (elempack == 1)
                        {
                            float16x4_t _r0_f16 = float16x4_t();
                            _r0_f16 = vset_lane_f16(r0[sok], _r0_f16, 0);
                            _r0_f16 = vset_lane_f16(r0[sok + N], _r0_f16, 1);
                            _r0_f16 = vset_lane_f16(r0[sok + N * 2], _r0_f16, 2);
                            _r0_f16 = vset_lane_f16(r0[sok + N * 3], _r0_f16, 3);
                            _r0 = vcvt_f32_f16(_r0_f16);
                        }

                        float16x8_t _w = vld1q_f16(kptr);
                        float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w));
                        float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w));
                        _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                        _sum1 = vfmaq_f32(_sum1, _r0, _w1);

                        kptr += 8;
                    }
                }
                sum0 += vaddvq_f32(_sum0);
                sum1 += vaddvq_f32(_sum1);
                // 2 input channels per step, scalar: 4 weights per tap, interleaved
                // as (p, p + 1) for the first input channel then (p, p + 1) for the second
                for (; q + 1 < inch; q += 2)
                {
                    const __fp16* r0 = bottom_blob.channel(q).row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float val0;
                        float val1;
                        // if (elempack == 1)
                        {
                            val0 = (float)(r0[sok]);
                            val1 = (float)(r0[sok + N]);
                        }

                        sum0 += val0 * (float)(kptr[0]);
                        sum1 += val0 * (float)(kptr[1]);
                        sum0 += val1 * (float)(kptr[2]);
                        sum1 += val1 * (float)(kptr[3]);

                        kptr += 4;
                    }
                }
                // last single input channel, scalar: 2 weights per tap
                for (; q < inch; q++)
                {
                    const __fp16* r0 = bottom_blob.channel(q).row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val;
                        // if (elempack == 1)
                        {
                            val = (float)(r0[space_ofs[k]]);
                        }

                        sum0 += val * (float)(kptr[0]);
                        sum1 += val * (float)(kptr[1]);

                        kptr += 2;
                    }
                }

                sum0 = activation_ss(sum0, activation_type, activation_params);
                sum1 = activation_ss(sum1, activation_type, activation_params);

                outptr0[0] = (__fp16)(sum0);
                outptr1[0] = (__fp16)(sum1);
                outptr0 += 1;
                outptr1 += 1;
            }
        }
    }
    // ---- last single output channel ----
    // Fewer than 2 channels remain here, so this loop runs at most once. It uses the
    // function-level elempack/inch/outw/outh (no shadow copies).
    remain_outch_start += nn_outch * 2;
    for (int p = remain_outch_start; p < outch; p++)
    {
        __fp16* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = 0.f;

                if (bias_data_ptr)
                {
                    sum = bias_data_ptr[p];
                }

                const __fp16* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2);

                int q = 0;
                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
                // 8 input channels per step: 8 weights per tap, one per input channel
                for (; q + 7 < inch; q += 8)
                {
                    const __fp16* r0 = bottom_blob.channel(q / elempack).row<const __fp16>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        float32x4_t _r1;
                        if (elempack == 4)
                        {
                            _r0 = vcvt_f32_f16(vld1_f16(r0 + sok));
                            _r1 = vcvt_f32_f16(vld1_f16(r0 + sok + N));
                        }
                        else // if (elempack == 1)
                        {
                            float16x8_t _r01_f16 = float16x8_t();
                            _r01_f16 = vsetq_lane_f16(r0[sok], _r01_f16, 0);
                            _r01_f16 = vsetq_lane_f16(r0[sok + N], _r01_f16, 1);
                            _r01_f16 = vsetq_lane_f16(r0[sok + N * 2], _r01_f16, 2);
                            _r01_f16 = vsetq_lane_f16(r0[sok + N * 3], _r01_f16, 3);
                            _r01_f16 = vsetq_lane_f16(r0[sok + N * 4], _r01_f16, 4);
                            _r01_f16 = vsetq_lane_f16(r0[sok + N * 5], _r01_f16, 5);
                            _r01_f16 = vsetq_lane_f16(r0[sok + N * 6], _r01_f16, 6);
                            _r01_f16 = vsetq_lane_f16(r0[sok + N * 7], _r01_f16, 7);
                            _r0 = vcvt_f32_f16(vget_low_f16(_r01_f16));
                            _r1 = vcvt_f32_f16(vget_high_f16(_r01_f16));
                        }

                        float16x8_t _w = vld1q_f16(kptr);
                        float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w));
                        float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w));
                        _sum0 = vfmaq_f32(_sum0, _r0, _w0);
                        _sum1 = vfmaq_f32(_sum1, _r1, _w1);

                        kptr += 8;
                    }
                }
                // horizontal reduction of the 8-channel partial sums
                _sum0 = vaddq_f32(_sum0, _sum1);
                sum += vaddvq_f32(_sum0);
                float32x4_t _sum = vdupq_n_f32(0.f);
                // 4 input channels per step: 4 weights per tap
                for (; q + 3 < inch; q += 4)
                {
                    const __fp16* r0 = bottom_blob.channel(q / elempack).row<const __fp16>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float32x4_t _r0;
                        if (elempack == 4)
                        {
                            _r0 = vcvt_f32_f16(vld1_f16(r0 + sok));
                        }
                        else // if (elempack == 1)
                        {
                            float16x4_t _r0_f16 = float16x4_t();
                            _r0_f16 = vset_lane_f16(r0[sok], _r0_f16, 0);
                            _r0_f16 = vset_lane_f16(r0[sok + N], _r0_f16, 1);
                            _r0_f16 = vset_lane_f16(r0[sok + N * 2], _r0_f16, 2);
                            _r0_f16 = vset_lane_f16(r0[sok + N * 3], _r0_f16, 3);
                            _r0 = vcvt_f32_f16(_r0_f16);
                        }

                        float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
                        _sum = vfmaq_f32(_sum, _r0, _w);

                        kptr += 4;
                    }
                }
                sum += vaddvq_f32(_sum);
                // 2 input channels per step, scalar: 2 weights per tap
                for (; q + 1 < inch; q += 2)
                {
                    const __fp16* r0 = bottom_blob.channel(q).row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float val0;
                        float val1;
                        // if (elempack == 1)
                        {
                            val0 = (float)(r0[sok]);
                            val1 = (float)(r0[sok + N]);
                        }

                        sum += val0 * (float)(kptr[0]);
                        sum += val1 * (float)(kptr[1]);

                        kptr += 2;
                    }
                }
                // last single input channel, scalar: 1 weight per tap
                for (; q < inch; q++)
                {
                    const __fp16* r0 = bottom_blob.channel(q).row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val;
                        // if (elempack == 1)
                        {
                            val = (float)(r0[space_ofs[k]]);
                        }

                        sum += val * (float)(kptr[0]);

                        kptr += 1;
                    }
                }

                sum = activation_ss(sum, activation_type, activation_params);

                outptr[0] = (__fp16)(sum);
                outptr += 1;
            }
        }
    }
}

// Convolution over __fp16 data with fp16 accumulation, computed per output pixel by looping
// over input channels and kernel taps.
//
// Output channels are handled in groups of 8, then 4, then 2, then 1. Inside each group the
// input channels are consumed in chunks of 8, 4, 2 and 1, and kptr walks linearly through the
// pre-packed weights in exactly that order, so the loop order here must match the layout of
// weight_data_tm.
//
// bottom_blob        input, read as __fp16; elempack 8, 4 or 1 is handled
// top_blob           output, written as __fp16; it is not allocated here, its w/h/c/elempack/cstep are read as given
// weight_data_tm     packed weights; output channel p reads channel p/8 + (p%8)/4 + (p%4)/2 + p%2
// bias_data          per-output-channel bias read as __fp16; ignored when its data pointer is null
// kernel_w/kernel_h, dilation_w/dilation_h, stride_w/stride_h  convolution geometry
// activation_type, activation_params  forwarded to activation_ps_f16 / activation_ss_f16
// opt                only opt.num_threads is used (OpenMP thread count)
//
// NOTE(review): no border handling is visible in this function; presumably the caller passes
// an already padded bottom_blob - confirm.
static void convolution_packed_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int w = bottom_blob.w;
    const int elempack = bottom_blob.elempack;
    // total number of scalar input channels
    const int inch = bottom_blob.c * elempack;

    // distance in __fp16 elements between consecutive packed input channels
    const size_t N = bottom_blob.cstep * elempack;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int out_elempack = top_blob.elempack;
    // total number of scalar output channels
    const int outch = top_blob.c * out_elempack;

    // distance in __fp16 elements between consecutive packed output channels
    const size_t M = top_blob.cstep * out_elempack;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k] is the element offset of kernel tap k relative to the top-left tap,
    // already scaled by elempack so it can be added directly to a row pointer
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        // pixels to skip to reach the first tap of the next (dilated) kernel row
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2 * elempack;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // null when bias_data holds no data; checked before every use below
    const __fp16* bias_data_ptr = bias_data;

    int nn_outch = 0;
    int remain_outch_start = 0;
    // ---- 8 output channels per iteration, parallelized across threads ----
    nn_outch = (outch - remain_outch_start) / 8;
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        const int p = remain_outch_start + pp * 8;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inch = bottom_blob.c * elempack;
        const int outw = top_blob.w;
        const int outh = top_blob.h;
        const int out_elempack = top_blob.elempack;

        __fp16* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // four independent accumulators, each holding 8 output channels;
                // they are summed together after all input channels are consumed
                float16x8_t _sum0 = vdupq_n_f16(0.f);
                float16x8_t _sum1 = vdupq_n_f16(0.f);
                float16x8_t _sum2 = vdupq_n_f16(0.f);
                float16x8_t _sum3 = vdupq_n_f16(0.f);

                if (bias_data_ptr)
                {
                    _sum0 = vld1q_f16(bias_data_ptr + p);
                }

                const __fp16* kptr = weight_data_tm.channel(p / 8);

                int q = 0;
                // 8 input channels at a time
                for (; q + 7 < inch; q += 8)
                {
                    const __fp16* r0 = bottom_blob.channel(q / elempack).row<const __fp16>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        // gather one value from each of the 8 input channels into _r0
                        float16x8_t _r0;
                        if (elempack == 8)
                        {
                            _r0 = vld1q_f16(r0 + sok);
                        }
                        else if (elempack == 4)
                        {
                            _r0 = vcombine_f16(vld1_f16(r0 + sok), vld1_f16(r0 + sok + N));
                        }
                        else // if (elempack == 1)
                        {
                            _r0 = float16x8_t();
                            _r0 = vsetq_lane_f16(r0[sok], _r0, 0);
                            _r0 = vsetq_lane_f16(r0[sok + N], _r0, 1);
                            _r0 = vsetq_lane_f16(r0[sok + N * 2], _r0, 2);
                            _r0 = vsetq_lane_f16(r0[sok + N * 3], _r0, 3);
                            _r0 = vsetq_lane_f16(r0[sok + N * 4], _r0, 4);
                            _r0 = vsetq_lane_f16(r0[sok + N * 5], _r0, 5);
                            _r0 = vsetq_lane_f16(r0[sok + N * 6], _r0, 6);
                            _r0 = vsetq_lane_f16(r0[sok + N * 7], _r0, 7);
                        }

                        // _wN holds the 8 output-channel weights for input channel N of this chunk
                        float16x8_t _w0 = vld1q_f16(kptr);
                        float16x8_t _w1 = vld1q_f16(kptr + 8);
                        float16x8_t _w2 = vld1q_f16(kptr + 8 * 2);
                        float16x8_t _w3 = vld1q_f16(kptr + 8 * 3);
                        float16x8_t _w4 = vld1q_f16(kptr + 8 * 4);
                        float16x8_t _w5 = vld1q_f16(kptr + 8 * 5);
                        float16x8_t _w6 = vld1q_f16(kptr + 8 * 6);
                        float16x8_t _w7 = vld1q_f16(kptr + 8 * 7);
                        // each FMA scales a weight vector by one input lane
                        _sum0 = vfmaq_laneq_f16(_sum0, _w0, _r0, 0);
                        _sum1 = vfmaq_laneq_f16(_sum1, _w1, _r0, 1);
                        _sum2 = vfmaq_laneq_f16(_sum2, _w2, _r0, 2);
                        _sum3 = vfmaq_laneq_f16(_sum3, _w3, _r0, 3);
                        _sum0 = vfmaq_laneq_f16(_sum0, _w4, _r0, 4);
                        _sum1 = vfmaq_laneq_f16(_sum1, _w5, _r0, 5);
                        _sum2 = vfmaq_laneq_f16(_sum2, _w6, _r0, 6);
                        _sum3 = vfmaq_laneq_f16(_sum3, _w7, _r0, 7);

                        // 8 input channels x 8 output channels consumed
                        kptr += 64;
                    }
                }
                // 4 input channels at a time
                for (; q + 3 < inch; q += 4)
                {
                    const __fp16* r0 = bottom_blob.channel(q / elempack).row<const __fp16>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float16x4_t _r0;
                        if (elempack == 4)
                        {
                            _r0 = vld1_f16(r0 + sok);
                        }
                        else // if (elempack == 1)
                        {
                            _r0 = float16x4_t();
                            _r0 = vset_lane_f16(r0[sok], _r0, 0);
                            _r0 = vset_lane_f16(r0[sok + N], _r0, 1);
                            _r0 = vset_lane_f16(r0[sok + N * 2], _r0, 2);
                            _r0 = vset_lane_f16(r0[sok + N * 3], _r0, 3);
                        }

                        float16x8_t _w0 = vld1q_f16(kptr);
                        float16x8_t _w1 = vld1q_f16(kptr + 8);
                        float16x8_t _w2 = vld1q_f16(kptr + 8 * 2);
                        float16x8_t _w3 = vld1q_f16(kptr + 8 * 3);
                        _sum0 = vfmaq_lane_f16(_sum0, _w0, _r0, 0);
                        _sum1 = vfmaq_lane_f16(_sum1, _w1, _r0, 1);
                        _sum2 = vfmaq_lane_f16(_sum2, _w2, _r0, 2);
                        _sum3 = vfmaq_lane_f16(_sum3, _w3, _r0, 3);

                        // 4 input channels x 8 output channels consumed
                        kptr += 32;
                    }
                }
                // 2 input channels at a time; only reachable with elempack == 1,
                // so channel(q) and unscaled j * stride_w are used directly
                for (; q + 1 < inch; q += 2)
                {
                    const __fp16* r0 = bottom_blob.channel(q).row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        __fp16 val0;
                        __fp16 val1;
                        // if (elempack == 1)
                        {
                            val0 = r0[sok];
                            val1 = r0[sok + N];
                        }

                        float16x8_t _w0 = vld1q_f16(kptr);
                        float16x8_t _w1 = vld1q_f16(kptr + 8);
                        _sum0 = vfmaq_n_f16(_sum0, _w0, val0);
                        _sum1 = vfmaq_n_f16(_sum1, _w1, val1);

                        kptr += 16;
                    }
                }
                // last single input channel
                for (; q < inch; q++)
                {
                    const __fp16* r0 = bottom_blob.channel(q).row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float16x8_t _val;
                        // if (elempack == 1)
                        {
                            _val = vdupq_n_f16(r0[space_ofs[k]]);
                        }

                        float16x8_t _w0 = vld1q_f16(kptr);
                        _sum0 = vfmaq_f16(_sum0, _w0, _val);

                        kptr += 8;
                    }
                }

                // merge the four partial accumulators into _sum0
                _sum0 = vaddq_f16(_sum0, _sum1);
                _sum2 = vaddq_f16(_sum2, _sum3);
                _sum0 = vaddq_f16(_sum0, _sum2);

                _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);

                // scatter the 8 results according to the output packing
                if (out_elempack == 8)
                {
                    vst1q_f16(outptr, _sum0);
                    outptr += 8;
                }
                else if (out_elempack == 4)
                {
                    // low half goes to this packed channel, high half to the next one
                    vst1_f16(outptr, vget_low_f16(_sum0));
                    vst1_f16(outptr + M, vget_high_f16(_sum0));
                    outptr += 4;
                }
                else // if (out_elempack == 1)
                {
                    outptr[0] = vgetq_lane_f16(_sum0, 0);
                    outptr[M] = vgetq_lane_f16(_sum0, 1);
                    outptr[M * 2] = vgetq_lane_f16(_sum0, 2);
                    outptr[M * 3] = vgetq_lane_f16(_sum0, 3);
                    outptr[M * 4] = vgetq_lane_f16(_sum0, 4);
                    outptr[M * 5] = vgetq_lane_f16(_sum0, 5);
                    outptr[M * 6] = vgetq_lane_f16(_sum0, 6);
                    outptr[M * 7] = vgetq_lane_f16(_sum0, 7);
                    outptr += 1;
                }
            }
        }
    }
    // ---- 4 output channels per iteration ----
    // fewer than 8 channels remain here, so this loop runs at most once
    remain_outch_start += nn_outch * 8;
    nn_outch = (outch - remain_outch_start) / 4;
    for (int pp = 0; pp < nn_outch; pp++)
    {
        const int p = remain_outch_start + pp * 4;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inch = bottom_blob.c * elempack;
        const int outw = top_blob.w;
        const int outh = top_blob.h;
        const int out_elempack = top_blob.elempack;

        __fp16* outptr = top_blob.channel(p / out_elempack);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // four independent accumulators, each holding 4 output channels
                float16x4_t _sum0 = vdup_n_f16(0.f);
                float16x4_t _sum1 = vdup_n_f16(0.f);
                float16x4_t _sum2 = vdup_n_f16(0.f);
                float16x4_t _sum3 = vdup_n_f16(0.f);

                if (bias_data_ptr)
                {
                    _sum0 = vld1_f16(bias_data_ptr + p);
                }

                // groups of 8 come first in weight_data_tm, followed by the group of 4
                const __fp16* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4);

                int q = 0;
                // 8 input channels at a time, split into two 4-lane halves
                for (; q + 7 < inch; q += 8)
                {
                    const __fp16* r0 = bottom_blob.channel(q / elempack).row<const __fp16>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float16x4_t _r0;
                        float16x4_t _r1;
                        if (elempack == 8)
                        {
                            float16x8_t _r01 = vld1q_f16(r0 + sok);
                            _r0 = vget_low_f16(_r01);
                            _r1 = vget_high_f16(_r01);
                        }
                        else if (elempack == 4)
                        {
                            _r0 = vld1_f16(r0 + sok);
                            _r1 = vld1_f16(r0 + sok + N);
                        }
                        else // if (elempack == 1)
                        {
                            _r0 = float16x4_t();
                            _r1 = float16x4_t();
                            _r0 = vset_lane_f16(r0[sok], _r0, 0);
                            _r0 = vset_lane_f16(r0[sok + N], _r0, 1);
                            _r0 = vset_lane_f16(r0[sok + N * 2], _r0, 2);
                            _r0 = vset_lane_f16(r0[sok + N * 3], _r0, 3);
                            _r1 = vset_lane_f16(r0[sok + N * 4], _r1, 0);
                            _r1 = vset_lane_f16(r0[sok + N * 5], _r1, 1);
                            _r1 = vset_lane_f16(r0[sok + N * 6], _r1, 2);
                            _r1 = vset_lane_f16(r0[sok + N * 7], _r1, 3);
                        }

                        // _wN holds the 4 output-channel weights for input channel N of this chunk
                        float16x4_t _w0 = vld1_f16(kptr);
                        float16x4_t _w1 = vld1_f16(kptr + 4);
                        float16x4_t _w2 = vld1_f16(kptr + 8);
                        float16x4_t _w3 = vld1_f16(kptr + 12);
                        float16x4_t _w4 = vld1_f16(kptr + 16);
                        float16x4_t _w5 = vld1_f16(kptr + 20);
                        float16x4_t _w6 = vld1_f16(kptr + 24);
                        float16x4_t _w7 = vld1_f16(kptr + 28);
                        _sum0 = vfma_lane_f16(_sum0, _w0, _r0, 0);
                        _sum1 = vfma_lane_f16(_sum1, _w1, _r0, 1);
                        _sum2 = vfma_lane_f16(_sum2, _w2, _r0, 2);
                        _sum3 = vfma_lane_f16(_sum3, _w3, _r0, 3);
                        _sum0 = vfma_lane_f16(_sum0, _w4, _r1, 0);
                        _sum1 = vfma_lane_f16(_sum1, _w5, _r1, 1);
                        _sum2 = vfma_lane_f16(_sum2, _w6, _r1, 2);
                        _sum3 = vfma_lane_f16(_sum3, _w7, _r1, 3);

                        // 8 input channels x 4 output channels consumed
                        kptr += 32;
                    }
                }
                // 4 input channels at a time
                for (; q + 3 < inch; q += 4)
                {
                    const __fp16* r0 = bottom_blob.channel(q / elempack).row<const __fp16>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float16x4_t _r0;
                        if (elempack == 4)
                        {
                            _r0 = vld1_f16(r0 + sok);
                        }
                        else // if (elempack == 1)
                        {
                            _r0 = float16x4_t();
                            _r0 = vset_lane_f16(r0[sok], _r0, 0);
                            _r0 = vset_lane_f16(r0[sok + N], _r0, 1);
                            _r0 = vset_lane_f16(r0[sok + N * 2], _r0, 2);
                            _r0 = vset_lane_f16(r0[sok + N * 3], _r0, 3);
                        }

                        float16x4_t _w0 = vld1_f16(kptr);
                        float16x4_t _w1 = vld1_f16(kptr + 4);
                        float16x4_t _w2 = vld1_f16(kptr + 8);
                        float16x4_t _w3 = vld1_f16(kptr + 12);
                        _sum0 = vfma_lane_f16(_sum0, _w0, _r0, 0);
                        _sum1 = vfma_lane_f16(_sum1, _w1, _r0, 1);
                        _sum2 = vfma_lane_f16(_sum2, _w2, _r0, 2);
                        _sum3 = vfma_lane_f16(_sum3, _w3, _r0, 3);

                        kptr += 16;
                    }
                }
                // 2 input channels at a time (elempack == 1 only)
                for (; q + 1 < inch; q += 2)
                {
                    const __fp16* r0 = bottom_blob.channel(q).row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        __fp16 val0;
                        __fp16 val1;
                        // if (elempack == 1)
                        {
                            val0 = r0[sok];
                            val1 = r0[sok + N];
                        }

                        float16x4_t _w0 = vld1_f16(kptr);
                        float16x4_t _w1 = vld1_f16(kptr + 4);
                        _sum0 = vfma_n_f16(_sum0, _w0, val0);
                        _sum1 = vfma_n_f16(_sum1, _w1, val1);

                        kptr += 8;
                    }
                }
                // last single input channel
                for (; q < inch; q++)
                {
                    const __fp16* r0 = bottom_blob.channel(q).row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float16x4_t _val;
                        // if (elempack == 1)
                        {
                            _val = vdup_n_f16(r0[space_ofs[k]]);
                        }

                        float16x4_t _w = vld1_f16(kptr);
                        _sum0 = vfma_f16(_sum0, _val, _w);

                        kptr += 4;
                    }
                }

                // merge the four partial accumulators into _sum0
                _sum0 = vadd_f16(_sum0, _sum1);
                _sum2 = vadd_f16(_sum2, _sum3);
                _sum0 = vadd_f16(_sum0, _sum2);

                _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);

                if (out_elempack == 4)
                {
                    vst1_f16(outptr, _sum0);
                    outptr += 4;
                }
                else // if (out_elempack == 1)
                {
                    outptr[0] = vget_lane_f16(_sum0, 0);
                    outptr[M] = vget_lane_f16(_sum0, 1);
                    outptr[M * 2] = vget_lane_f16(_sum0, 2);
                    outptr[M * 3] = vget_lane_f16(_sum0, 3);
                    outptr += 1;
                }
            }
        }
    }
    // ---- 2 output channels per iteration ----
    // fewer than 4 channels remain here, so this loop runs at most once
    remain_outch_start += nn_outch * 4;
    nn_outch = (outch - remain_outch_start) / 2;
    for (int pp = 0; pp < nn_outch; pp++)
    {
        const int p = remain_outch_start + pp * 2;

        // shadowed variable for less openmp task args
        const int elempack = bottom_blob.elempack;
        const int inch = bottom_blob.c * elempack;
        const int outw = top_blob.w;
        const int outh = top_blob.h;

        // output is addressed as unpacked channels p and p + 1
        __fp16* outptr0 = top_blob.channel(p);
        __fp16* outptr1 = top_blob.channel(p + 1);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // scalar accumulators take the bias and the 2/1 input-channel tails
                __fp16 sum0 = 0.f;
                __fp16 sum1 = 0.f;

                if (bias_data_ptr)
                {
                    sum0 = bias_data_ptr[p];
                    sum1 = bias_data_ptr[p + 1];
                }

                const __fp16* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2);

                int q = 0;
                // vector accumulators: lanes are input channels here, reduced horizontally at the end
                float16x8_t _sum0 = vdupq_n_f16(0.f);
                float16x8_t _sum1 = vdupq_n_f16(0.f);
                // 8 input channels at a time
                for (; q + 7 < inch; q += 8)
                {
                    const __fp16* r0 = bottom_blob.channel(q / elempack).row<const __fp16>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float16x8_t _r0;
                        if (elempack == 8)
                        {
                            _r0 = vld1q_f16(r0 + sok);
                        }
                        else if (elempack == 4)
                        {
                            _r0 = vcombine_f16(vld1_f16(r0 + sok), vld1_f16(r0 + sok + N));
                        }
                        else // if (elempack == 1)
                        {
                            _r0 = float16x8_t();
                            _r0 = vsetq_lane_f16(r0[sok], _r0, 0);
                            _r0 = vsetq_lane_f16(r0[sok + N], _r0, 1);
                            _r0 = vsetq_lane_f16(r0[sok + N * 2], _r0, 2);
                            _r0 = vsetq_lane_f16(r0[sok + N * 3], _r0, 3);
                            _r0 = vsetq_lane_f16(r0[sok + N * 4], _r0, 4);
                            _r0 = vsetq_lane_f16(r0[sok + N * 5], _r0, 5);
                            _r0 = vsetq_lane_f16(r0[sok + N * 6], _r0, 6);
                            _r0 = vsetq_lane_f16(r0[sok + N * 7], _r0, 7);
                        }

                        // _w0: 8 input-channel weights for output p, _w1: same for output p + 1
                        float16x8_t _w0 = vld1q_f16(kptr);
                        float16x8_t _w1 = vld1q_f16(kptr + 8);
                        _sum0 = vfmaq_f16(_sum0, _r0, _w0);
                        _sum1 = vfmaq_f16(_sum1, _r0, _w1);

                        kptr += 16;
                    }
                }
                // 4 input channels at a time; only the low halves of the accumulators are updated
                for (; q + 3 < inch; q += 4)
                {
                    const __fp16* r0 = bottom_blob.channel(q / elempack).row<const __fp16>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float16x4_t _r0;
                        if (elempack == 4)
                        {
                            _r0 = vld1_f16(r0 + sok);
                        }
                        else // if (elempack == 1)
                        {
                            _r0 = float16x4_t();
                            _r0 = vset_lane_f16(r0[sok], _r0, 0);
                            _r0 = vset_lane_f16(r0[sok + N], _r0, 1);
                            _r0 = vset_lane_f16(r0[sok + N * 2], _r0, 2);
                            _r0 = vset_lane_f16(r0[sok + N * 3], _r0, 3);
                        }

                        float16x4_t _w0 = vld1_f16(kptr);
                        float16x4_t _w1 = vld1_f16(kptr + 4);
                        _sum0 = vcombine_f16(vfma_f16(vget_low_f16(_sum0), _r0, _w0), vget_high_f16(_sum0));
                        _sum1 = vcombine_f16(vfma_f16(vget_low_f16(_sum1), _r0, _w1), vget_high_f16(_sum1));

                        kptr += 8;
                    }
                }
                // 2 input channels at a time (elempack == 1 only)
                for (; q + 1 < inch; q += 2)
                {
                    const __fp16* r0 = bottom_blob.channel(q).row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        __fp16 val0;
                        __fp16 val1;
                        // if (elempack == 1)
                        {
                            val0 = r0[sok];
                            val1 = r0[sok + N];
                        }

                        // weights are read as [in0->out0, in0->out1, in1->out0, in1->out1]
                        sum0 += val0 * kptr[0];
                        sum1 += val0 * kptr[1];
                        sum0 += val1 * kptr[2];
                        sum1 += val1 * kptr[3];

                        kptr += 4;
                    }
                }
                // last single input channel
                for (; q < inch; q++)
                {
                    const __fp16* r0 = bottom_blob.channel(q).row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        __fp16 val;
                        // if (elempack == 1)
                        {
                            val = r0[space_ofs[k]];
                        }

                        sum0 += val * kptr[0];
                        sum1 += val * kptr[1];

                        kptr += 2;
                    }
                }

                // horizontal reduction: after the two pairwise adds lane 0 is the total of
                // _sum0 and lane 1 is the total of _sum1
                float16x4_t _ss0 = vadd_f16(vget_low_f16(_sum0), vget_high_f16(_sum0));
                float16x4_t _ss1 = vadd_f16(vget_low_f16(_sum1), vget_high_f16(_sum1));
                float16x4_t _ss = vpadd_f16(_ss0, _ss1);
                _ss = vpadd_f16(_ss, _ss);
                sum0 += vget_lane_f16(_ss, 0);
                sum1 += vget_lane_f16(_ss, 1);

                sum0 = activation_ss_f16(sum0, activation_type, activation_params);
                sum1 = activation_ss_f16(sum1, activation_type, activation_params);

                outptr0[0] = sum0;
                outptr1[0] = sum1;
                outptr0 += 1;
                outptr1 += 1;
            }
        }
    }
    // ---- last single output channel ----
    // fewer than 2 channels remain here, so this loop runs at most once;
    // it uses the function-scope elempack/inch/outw/outh rather than shadowed copies
    remain_outch_start += nn_outch * 2;
    for (int p = remain_outch_start; p < outch; p++)
    {
        __fp16* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                __fp16 sum = 0.f;

                if (bias_data_ptr)
                {
                    sum = bias_data_ptr[p];
                }

                const __fp16* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2);

                int q = 0;
                // lanes are input channels; reduced horizontally after the loops
                float16x8_t _sum = vdupq_n_f16(0.f);
                // 8 input channels at a time
                for (; q + 7 < inch; q += 8)
                {
                    const __fp16* r0 = bottom_blob.channel(q / elempack).row<const __fp16>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float16x8_t _r0;
                        if (elempack == 8)
                        {
                            _r0 = vld1q_f16(r0 + sok);
                        }
                        else if (elempack == 4)
                        {
                            _r0 = vcombine_f16(vld1_f16(r0 + sok), vld1_f16(r0 + sok + N));
                        }
                        else // if (elempack == 1)
                        {
                            _r0 = float16x8_t();
                            _r0 = vsetq_lane_f16(r0[sok], _r0, 0);
                            _r0 = vsetq_lane_f16(r0[sok + N], _r0, 1);
                            _r0 = vsetq_lane_f16(r0[sok + N * 2], _r0, 2);
                            _r0 = vsetq_lane_f16(r0[sok + N * 3], _r0, 3);
                            _r0 = vsetq_lane_f16(r0[sok + N * 4], _r0, 4);
                            _r0 = vsetq_lane_f16(r0[sok + N * 5], _r0, 5);
                            _r0 = vsetq_lane_f16(r0[sok + N * 6], _r0, 6);
                            _r0 = vsetq_lane_f16(r0[sok + N * 7], _r0, 7);
                        }

                        float16x8_t _w0 = vld1q_f16(kptr);
                        _sum = vfmaq_f16(_sum, _r0, _w0);

                        kptr += 8;
                    }
                }
                // 4 input channels at a time; only the low half of _sum is updated
                for (; q + 3 < inch; q += 4)
                {
                    const __fp16* r0 = bottom_blob.channel(q / elempack).row<const __fp16>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        float16x4_t _r0;
                        if (elempack == 4)
                        {
                            _r0 = vld1_f16(r0 + sok);
                        }
                        else // if (elempack == 1)
                        {
                            _r0 = float16x4_t();
                            _r0 = vset_lane_f16(r0[sok], _r0, 0);
                            _r0 = vset_lane_f16(r0[sok + N], _r0, 1);
                            _r0 = vset_lane_f16(r0[sok + N * 2], _r0, 2);
                            _r0 = vset_lane_f16(r0[sok + N * 3], _r0, 3);
                        }

                        float16x4_t _w = vld1_f16(kptr);
                        _sum = vcombine_f16(vfma_f16(vget_low_f16(_sum), _r0, _w), vget_high_f16(_sum));

                        kptr += 4;
                    }
                }
                // 2 input channels at a time (elempack == 1 only)
                for (; q + 1 < inch; q += 2)
                {
                    const __fp16* r0 = bottom_blob.channel(q).row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const int sok = space_ofs[k];
                        __fp16 val0;
                        __fp16 val1;
                        // if (elempack == 1)
                        {
                            val0 = r0[sok];
                            val1 = r0[sok + N];
                        }

                        sum += val0 * kptr[0];
                        sum += val1 * kptr[1];

                        kptr += 2;
                    }
                }
                // last single input channel
                for (; q < inch; q++)
                {
                    const __fp16* r0 = bottom_blob.channel(q).row<const __fp16>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        __fp16 val;
                        // if (elempack == 1)
                        {
                            val = r0[space_ofs[k]];
                        }

                        sum += val * kptr[0];

                        kptr += 1;
                    }
                }

                // horizontal reduction of the 8 lanes into lane 0
                float16x4_t _ss = vadd_f16(vget_low_f16(_sum), vget_high_f16(_sum));
                _ss = vpadd_f16(_ss, _ss);
                _ss = vpadd_f16(_ss, _ss);
                sum += vget_lane_f16(_ss, 0);

                sum = activation_ss_f16(sum, activation_type, activation_params);

                outptr[0] = sum;
                outptr += 1;
            }
        }
    }
}


================================================
FILE: src/layer/arm/convolution_packed_int8.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// i8mm variants of the int8 kernel transform and convolution. They are only declared when
// runtime CPU dispatch is enabled and this translation unit is not itself compiled with
// __ARM_FEATURE_MATMUL_INT8; the definitions are not in this header (presumably in a source
// file built with i8mm enabled - confirm).
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
void convolution_transform_kernel_packed_int8_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h);
void convolution_packed_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt);
#endif

// asimddp (dot product) variants, declared under the same scheme: runtime dispatch enabled and
// this translation unit compiled with neither __ARM_FEATURE_DOTPROD nor __ARM_FEATURE_MATMUL_INT8.
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
void convolution_transform_kernel_packed_int8_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h);
void convolution_packed_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt);
#endif

// Repack int8 convolution weights from the plain layout into the interleaved
// layout stored in kernel_tm.
//
//   kernel     source weights, read as signed char, indexed [outch][inch][maxk]
//   kernel_tm  destination blob, (re)created here
//   inch       number of input channels
//   outch      number of output channels
//   kernel_w   kernel width
//   kernel_h   kernel height
//
// Output channels are consumed in blocks of 8, then 4, then 2, then 1 (8 and 4
// only when __ARM_NEON is set); each block writes to its own channel of
// kernel_tm. Within a block, input channels are consumed 8 at a time
// (__ARM_NEON only) followed by a one-at-a-time tail.
static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    // runtime dispatch to the i8mm build of this routine
    if (ncnn::cpu_support_arm_i8mm())
    {
        convolution_transform_kernel_packed_int8_i8mm(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    // runtime dispatch to the dot-product build of this routine
    if (ncnn::cpu_support_arm_asimddp())
    {
        convolution_transform_kernel_packed_int8_asimddp(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
        return;
    }
#endif

    const int maxk = kernel_w * kernel_h;

    // src = kw-kh-inch-outch
    // dst = pb-pa-kw-kh-inch/pa-outch/pb

    // Select the packing factors (pb for outch, pa for inch) and the number of
    // destination channels, then allocate kernel_tm once.
    int out_pack = 1;
    int in_pack = 1;
    int out_groups = outch;
#if __ARM_NEON
    if (outch >= 8)
    {
        out_pack = 8;
        out_groups = outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2;
    }
    else if (outch >= 4)
    {
        out_pack = 4;
        out_groups = outch / 4 + (outch % 4) / 2 + outch % 2;
    }
    else if (outch >= 2)
    {
        out_pack = 2;
        out_groups = outch / 2 + outch % 2;
    }

    if (inch >= 8)
    {
        in_pack = 8;
    }
#else  // __ARM_NEON
    if (outch >= 2)
    {
        out_pack = 2;
        out_groups = outch / 2 + outch % 2;
    }
#endif // __ARM_NEON

    // one row per group of 8 input channels plus one row per leftover channel
    const int in_rows = (in_pack == 8) ? (inch / 8 + inch % 8) : inch;
    const int elem_bytes = out_pack * in_pack;

    kernel_tm.create(maxk, in_rows, out_groups, (size_t)elem_bytes, elem_bytes);

    const signed char* weights = (const signed char*)kernel;

    int q = 0;
#if __ARM_NEON
    // Number of consecutive input channels written for one output channel
    // before switching to the next output channel (blocks of 8 and 4 outch).
#if __ARM_FEATURE_MATMUL_INT8
    const int ic_run = 8;
#elif __ARM_FEATURE_DOTPROD
    const int ic_run = 4;
#else
    const int ic_run = 2;
#endif

    // Same quantity for blocks of 2 output channels.
#if __ARM_FEATURE_DOTPROD
    const int ic_run2 = 8;
#else
    const int ic_run2 = 4;
#endif

    // blocks of 8 output channels
    for (; q + 7 < outch; q += 8)
    {
        const signed char* src[8];
        for (int o = 0; o < 8; o++)
        {
            src[o] = weights + (q + o) * inch * maxk;
        }

        signed char* dst = kernel_tm.channel(q / 8);

        int p = 0;
        for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int base = 0; base < 8; base += ic_run)
                {
                    for (int o = 0; o < 8; o++)
                    {
                        for (int i = 0; i < ic_run; i++)
                        {
                            *dst++ = src[o][(p + base + i) * maxk + k];
                        }
                    }
                }
            }
        }
        for (; p < inch; p++)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int o = 0; o < 8; o++)
                {
                    *dst++ = src[o][p * maxk + k];
                }
            }
        }
    }

    // blocks of 4 output channels
    for (; q + 3 < outch; q += 4)
    {
        const signed char* src[4];
        for (int o = 0; o < 4; o++)
        {
            src[o] = weights + (q + o) * inch * maxk;
        }

        signed char* dst = kernel_tm.channel(q / 8 + (q % 8) / 4);

        int p = 0;
        for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int base = 0; base < 8; base += ic_run)
                {
                    for (int o = 0; o < 4; o++)
                    {
                        for (int i = 0; i < ic_run; i++)
                        {
                            *dst++ = src[o][(p + base + i) * maxk + k];
                        }
                    }
                }
            }
        }
        for (; p < inch; p++)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int o = 0; o < 4; o++)
                {
                    *dst++ = src[o][p * maxk + k];
                }
            }
        }
    }
#endif // __ARM_NEON

    // blocks of 2 output channels
    for (; q + 1 < outch; q += 2)
    {
        const signed char* src[2];
        src[0] = weights + q * inch * maxk;
        src[1] = weights + (q + 1) * inch * maxk;

#if __ARM_NEON
        signed char* dst = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2);
#else
        signed char* dst = kernel_tm.channel(q / 2);
#endif

        int p = 0;
#if __ARM_NEON
        for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int base = 0; base < 8; base += ic_run2)
                {
                    for (int o = 0; o < 2; o++)
                    {
                        for (int i = 0; i < ic_run2; i++)
                        {
                            *dst++ = src[o][(p + base + i) * maxk + k];
                        }
                    }
                }
            }
        }
#endif // __ARM_NEON
        for (; p < inch; p++)
        {
            for (int k = 0; k < maxk; k++)
            {
                *dst++ = src[0][p * maxk + k];
                *dst++ = src[1][p * maxk + k];
            }
        }
    }

    // leftover single output channels
    for (; q < outch; q++)
    {
        const signed char* src = weights + q * inch * maxk;

#if __ARM_NEON
        signed char* dst = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2 + q % 2);
#else
        signed char* dst = kernel_tm.channel(q / 2 + q % 2);
#endif

        int p = 0;
#if __ARM_NEON
        for (; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                // 8 input channels at the same kernel position, back to back
                for (int i = 0; i < 8; i++)
                {
                    *dst++ = src[(p + i) * maxk + k];
                }
            }
        }
#endif // __ARM_NEON
        for (; p < inch; p++)
        {
            for (int k = 0; k < maxk; k++)
            {
                *dst++ = src[p * maxk + k];
            }
        }
    }
}

static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        convolution_packed_int8_i8mm(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        convolution_packed_int8_asimddp(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        return;
    }
#endif

    const int w = bottom_blob.w;
    const int elempack = bottom_blob.elempack;
    const int inch = bottom_blob.c * elempack;

    const size_t N = bottom_blob.cstep * elempack;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int out_elempack = top_blob.elempack;
    const int outch = top_blob.c * out_elempack;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2 * elempack;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    int nn_outch = 0;
    int remain_outch_start = 0;
#if __ARM_NEON
    nn_outch = (outch - remain_outch_start) / 8;
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        const int p = remain_outch_start + pp * 8;

        // shadowed variable for less openmp task args
        const int outw = top_blob.w;
        const int outh = top_blob.h;
        const size_t N = bottom_blob.cstep * elempack;
        const size_t M = top_blob.cstep * out_elempack;

        int* outptr = top_blob.channel(p / out_elempack);

        int ij = 0;
        for (; ij + 1 < outw * outh; ij += 2)
        {
            const int i0 = ij / outw;
            const int i1 = (ij + 1) / outw;
            const int j0 = ij % outw;
            const int j1 = (ij + 1) % outw;

            int32x4_t _sum0 = vdupq_n_s32(0);
            int32x4_t _sum1 = vdupq_n_s32(0);
            int32x4_t _sum2 = vdupq_n_s32(0);
            int32x4_t _sum3 = vdupq_n_s32(0);

            const signed char* kptr = weight_data_tm.channel(p / 8);

            int q = 0;
            {
                for (; q + 7 < inch; q += 8)
                {
                    const signed char* r0 = bottom_blob.channel(q / elempack).row<const signed char>(i0 * stride_h) + j0 * stride_w * elempack;
                    const signed char* r1 = bottom_blob.channel(q / elempack).row<const signed char>(i1 * stride_h) + j1 * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const signed char* r0s = r0 + space_ofs[k];
                        const signed char* r1s = r1 + space_ofs[k];

                        int8x8_t _r0;
                        int8x8_t _r1;
                        if (elempack == 8)
                        {
                            _r0 = vld1_s8(r0s);
                            _r1 = vld1_s8(r1s);
                        }
                        else // if (elempack == 1)
                        {
                            signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]};
                            signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]};
                            _r0 = vld1_s8(tmp0);
                            _r1 = vld1_s8(tmp1);
                        }

                        int8x16_t _w0 = vld1q_s8(kptr);
                        int8x16_t _w1 = vld1q_s8(kptr + 16);
                        int8x16_t _w2 = vld1q_s8(kptr + 32);
                        int8x16_t _w3 = vld1q_s8(kptr + 48);

#if __ARM_FEATURE_MATMUL_INT8
                        int8x16_t _r01 = vcombine_s8(_r0, _r1);
                        _sum0 = vmmlaq_s32(_sum0, _r01, _w0);
                        _sum1 = vmmlaq_s32(_sum1, _r01, _w1);
                        _sum2 = vmmlaq_s32(_sum2, _r01, _w2);
                        _sum3 = vmmlaq_s32(_sum3, _r01, _w3);
#elif __ARM_FEATURE_DOTPROD
                        _sum0 = vdotq_lane_s32(_sum0, _w0, _r0, 0);
                        _sum1 = vdotq_lane_s32(_sum1, _w1, _r0, 0);
                        _sum2 = vdotq_lane_s32(_sum2, _w0, _r1, 0);
                        _sum3 = vdotq_lane_s32(_sum3, _w1, _r1, 0);
                        _sum0 = vdotq_lane_s32(_sum0, _w2, _r0, 1);
                        _sum1 = vdotq_lane_s32(_sum1, _w3, _r0, 1);
                        _sum2 = vdotq_lane_s32(_sum2, _w2, _r1, 1);
                        _sum3 = vdotq_lane_s32(_sum3, _w3, _r1, 1);
#else  // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
                        int16x4_t _rr0 = vreinterpret_s16_s8(_r0);
                        int16x4_t _rr1 = vreinterpret_s16_s8(_r1);

                        int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0));
                        int8x8_t _r1ll = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 0));
                        int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2));
                        int8x8_t _r1hl = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 2));

                        int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0));
                        int16x8_t _s1l = vmull_s8(_r0ll, vget_high_s8(_w0));
                        int16x8_t _s2l = vmull_s8(_r1ll, vget_low_s8(_w0));
                        int16x8_t _s3l = vmull_s8(_r1ll, vget_high_s8(_w0));
                        _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2));
                        _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2));
                        _s2l = vmlal_s8(_s2l, _r1hl, vget_low_s8(_w2));
                        _s3l = vmlal_s8(_s3l, _r1hl, vget_high_s8(_w2));

                        _sum0 = vpadalq_s16(_sum0, _s0l);
                        _sum1 = vpadalq_s16(_sum1, _s1l);
                        _sum2 = vpadalq_s16(_sum2, _s2l);
                        _sum3 = vpadalq_s16(_sum3, _s3l);

                        int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1));
                        int8x8_t _r1lh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 1));
                        int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3));
                        int8x8_t _r1hh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 3));

                        int16x8_t _s0h = vmull_s8(_r0lh, vget_low_s8(_w1));
                        int16x8_t _s1h = vmull_s8(_r0lh, vget_high_s8(_w1));
                        int16x8_t _s2h = vmull_s8(_r1lh, vget_low_s8(_w1));
                        int16x8_t _s3h = vmull_s8(_r1lh, vget_high_s8(_w1));
                        _s0h = vmlal_s8(_s0h, _r0hh, vget_low_s8(_w3));
                        _s1h = vmlal_s8(_s1h, _r0hh, vget_high_s8(_w3));
                        _s2h = vmlal_s8(_s2h, _r1hh, vget_low_s8(_w3));
                        _s3h = vmlal_s8(_s3h, _r1hh, vget_high_s8(_w3));

                        _sum0 = vpadalq_s16(_sum0, _s0h);
                        _sum1 = vpadalq_s16(_sum1, _s1h);
                        _sum2 = vpadalq_s16(_sum2, _s2h);
                        _sum3 = vpadalq_s16(_sum3, _s3h);
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD

                        kptr += 64;
                    }
                }
#if __ARM_FEATURE_MATMUL_INT8
                {
                    int32x4_t _tmp0 = vcombine_s32(vget_low_s32(_sum0), vget_low_s32(_sum1));
                    int32x4_t _tmp1 = vcombine_s32(vget_low_s32(_sum2), vget_low_s32(_sum3));
                    int32x4_t _tmp2 = vcombine_s32(vget_high_s32(_sum0), vget_high_s32(_sum1));
                    int32x4_t _tmp3 = vcombine_s32(vget_high_s32(_sum2), vget_high_s32(_sum3));
                    _sum0 = _tmp0;
                    _sum1 = _tmp1;
                    _sum2 = _tmp2;
                    _sum3 = _tmp3;
                }
#endif
            }
            for (; q < inch; q++)
            {
                const signed char* r0 = bottom_blob.channel(q).row<const signed char>(i0 * stride_h) + j0 * stride_w;
                const signed char* r1 = bottom_blob.channel(q).row<const signed char>(i1 * stride_h) + j1 * stride_w;

                for (int k = 0; k < maxk; k++)
                {
                    const signed char* r0s = r0 + space_ofs[k];
                    const signed char* r1s = r1 + space_ofs[k];

                    // if (elempack == 1)
                    {
                        int8x8_t _r0 = vdup_n_s8(r0s[0]);
                        int8x8_t _r1 = vdup_n_s8(r1s[0]);
                        int8x8_t _w = vld1_s8(kptr);
                        int16x8_t _s0 = vmull_s8(_r0, _w);
                        int16x8_t _s1 = vmull_s8(_r1, _w);
                        _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                        _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));
                        _sum2 = vaddw_s16(_sum2, vget_low_s16(_s1));
                        _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1));

                        kptr += 8;
                    }
                }
            }

            if (out_elempack == 8)
            {
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                vst1q_s32(outptr + 8, _sum2);
                vst1q_s32(outptr + 12, _sum3);
                outptr += 16;
            }
            if (out_elempack == 4)
            {
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum2);
                vst1q_s32(outptr + M, _sum1);
                vst1q_s32(outptr + M + 4, _sum3);
                outptr += 8;
            }
            if (out_elempack == 1)
            {
                outptr[0] = vgetq_lane_s32(_sum0, 0);
                outptr[1] = vgetq_lane_s32(_sum2, 0);
                outptr[M] = vgetq_lane_s32(_sum0, 1);
                outptr[M + 1] = vgetq_lane_s32(_sum2, 1);
                outptr[M * 2] = vgetq_lane_s32(_sum0, 2);
                outptr[M * 2 + 1] = vgetq_lane_s32(_sum2, 2);
                outptr[M * 3] = vgetq_lane_s32(_sum0, 3);
                outptr[M * 3 + 1] = vgetq_lane_s32(_sum2, 3);
                outptr[M * 4] = vgetq_lane_s32(_sum1, 0);
                outptr[M * 4 + 1] = vgetq_lane_s32(_sum3, 0);
                outptr[M * 5] = vgetq_lane_s32(_sum1, 1);
                outptr[M * 5 + 1] = vgetq_lane_s32(_sum3, 1);
                outptr[M * 6] = vgetq_lane_s32(_sum1, 2);
                outptr[M * 6 + 1] = vgetq_lane_s32(_sum3, 2);
                outptr[M * 7] = vgetq_lane_s32(_sum1, 3);
                outptr[M * 7 + 1] = vgetq_lane_s32(_sum3, 3);
                outptr += 2;
            }
        }
        for (; ij < outw * outh; ij++)
        {
            const int i = ij / outw;
            const int j = ij % outw;

            int32x4_t _sum0 = vdupq_n_s32(0);
            int32x4_t _sum1 = vdupq_n_s32(0);
            int32x4_t _sum2 = vdupq_n_s32(0);
            int32x4_t _sum3 = vdupq_n_s32(0);

            const signed char* kptr = weight_data_tm.channel(p / 8);

            int q = 0;
            {
                for (; q + 7 < inch; q += 8)
                {
                    const signed char* r0 = bottom_blob.channel(q / elempack).row<const signed char>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const signed char* r0s = r0 + space_ofs[k];

                        int8x8_t _r0;
                        if (elempack == 8)
                        {
                            _r0 = vld1_s8(r0s);
                        }
                        else // if (elempack == 1)
                        {
                            signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]};
                            _r0 = vld1_s8(tmp);
                        }

                        int8x16_t _w0 = vld1q_s8(kptr);
                        int8x16_t _w1 = vld1q_s8(kptr + 16);
                        int8x16_t _w2 = vld1q_s8(kptr + 32);
                        int8x16_t _w3 = vld1q_s8(kptr + 48);

#if __ARM_FEATURE_MATMUL_INT8
                        int8x16_t _r00 = vcombine_s8(_r0, _r0);
                        _sum0 = vdotq_s32(_sum0, _r00, _w0);
                        _sum1 = vdotq_s32(_sum1, _r00, _w1);
                        _sum2 = vdotq_s32(_sum2, _r00, _w2);
                        _sum3 = vdotq_s32(_sum3, _r00, _w3);
#elif __ARM_FEATURE_DOTPROD
                        _sum0 = vdotq_lane_s32(_sum0, _w0, _r0, 0);
                        _sum1 = vdotq_lane_s32(_sum1, _w1, _r0, 0);
                        _sum2 = vdotq_lane_s32(_sum2, _w2, _r0, 1);
                        _sum3 = vdotq_lane_s32(_sum3, _w3, _r0, 1);
#else  // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
                        int16x4_t _rr0 = vreinterpret_s16_s8(_r0);
                        int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0));
                        int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1));
                        int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2));
                        int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3));

                        int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0));
                        int16x8_t _s1l = vmull_s8(_r0ll, vget_high_s8(_w0));
                        int16x8_t _s0h = vmull_s8(_r0lh, vget_low_s8(_w1));
                        int16x8_t _s1h = vmull_s8(_r0lh, vget_high_s8(_w1));
                        _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2));
                        _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2));
                        _s0h = vmlal_s8(_s0h, _r0hh, vget_low_s8(_w3));
                        _s1h = vmlal_s8(_s1h, _r0hh, vget_high_s8(_w3));

                        _sum0 = vpadalq_s16(_sum0, _s0l);
                        _sum1 = vpadalq_s16(_sum1, _s1l);
                        _sum2 = vpadalq_s16(_sum2, _s0h);
                        _sum3 = vpadalq_s16(_sum3, _s1h);
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD

                        kptr += 64;
                    }
                }
#if __ARM_FEATURE_MATMUL_INT8
                {
                    _sum0 = vpaddq_s32(_sum0, _sum1);
                    _sum1 = vpaddq_s32(_sum2, _sum3);
                }
#else
                {
                    _sum0 = vaddq_s32(_sum0, _sum2);
                    _sum1 = vaddq_s32(_sum1, _sum3);
                }
#endif
            }
            for (; q < inch; q++)
            {
                const signed char* r0 = bottom_blob.channel(q).row<const signed char>(i * stride_h) + j * stride_w;

                for (int k = 0; k < maxk; k++)
                {
                    const signed char* r0s = r0 + space_ofs[k];

                    // if (elempack == 1)
                    {
                        int8x8_t _val = vdup_n_s8(r0s[0]);
                        int8x8_t _w = vld1_s8(kptr);
                        int16x8_t _s0 = vmull_s8(_val, _w);
                        _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                        _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));

                        kptr += 8;
                    }
                }
            }

            if (out_elempack == 8)
            {
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                outptr += 8;
            }
            if (out_elempack == 4)
            {
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + M, _sum1);
                outptr += 4;
            }
            if (out_elempack == 1)
            {
                outptr[0] = vgetq_lane_s32(_sum0, 0);
                outptr[M] = vgetq_lane_s32(_sum0, 1);
                outptr[M * 2] = vgetq_lane_s32(_sum0, 2);
                outptr[M * 3] = vgetq_lane_s32(_sum0, 3);
                outptr[M * 4] = vgetq_lane_s32(_sum1, 0);
                outptr[M * 5] = vgetq_lane_s32(_sum1, 1);
                outptr[M * 6] = vgetq_lane_s32(_sum1, 2);
                outptr[M * 7] = vgetq_lane_s32(_sum1, 3);
                outptr += 1;
            }
        }
    }
    remain_outch_start += nn_outch * 8;
    nn_outch = (outch - remain_outch_start) / 4;
    for (int pp = 0; pp < nn_outch; pp++)
    {
        const int p = remain_outch_start + pp * 4;

        // shadowed variable for less openmp task args
        const int outw = top_blob.w;
        const int outh = top_blob.h;
        const size_t N = bottom_blob.cstep * elempack;
        const size_t M = top_blob.cstep * out_elempack;

        int* outptr = top_blob.channel(p / out_elempack);

        int ij = 0;
        for (; ij + 1 < outw * outh; ij += 2)
        {
            const int i0 = ij / outw;
            const int i1 = (ij + 1) / outw;
            const int j0 = ij % outw;
            const int j1 = (ij + 1) % outw;

            int32x4_t _sum0 = vdupq_n_s32(0);
            int32x4_t _sum1 = vdupq_n_s32(0);

            const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4);

            int q = 0;
            {
                for (; q + 7 < inch; q += 8)
                {
                    const signed char* r0 = bottom_blob.channel(q / elempack).row<const signed char>(i0 * stride_h) + j0 * stride_w * elempack;
                    const signed char* r1 = bottom_blob.channel(q / elempack).row<const signed char>(i1 * stride_h) + j1 * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const signed char* r0s = r0 + space_ofs[k];
                        const signed char* r1s = r1 + space_ofs[k];

                        int8x8_t _r0;
                        int8x8_t _r1;
                        if (elempack == 8)
                        {
                            _r0 = vld1_s8(r0s);
                            _r1 = vld1_s8(r1s);
                        }
                        else // if (elempack == 1)
                        {
                            signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]};
                            signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]};
                            _r0 = vld1_s8(tmp0);
                            _r1 = vld1_s8(tmp1);
                        }

                        int8x16_t _w0 = vld1q_s8(kptr);
                        int8x16_t _w1 = vld1q_s8(kptr + 16);

#if __ARM_FEATURE_MATMUL_INT8
                        int8x16_t _r01 = vcombine_s8(_r0, _r1);
                        _sum0 = vmmlaq_s32(_sum0, _r01, _w0);
                        _sum1 = vmmlaq_s32(_sum1, _r01, _w1);
#elif __ARM_FEATURE_DOTPROD
                        _sum0 = vdotq_lane_s32(_sum0, _w0, _r0, 0);
                        _sum1 = vdotq_lane_s32(_sum1, _w0, _r1, 0);
                        _sum0 = vdotq_lane_s32(_sum0, _w1, _r0, 1);
                        _sum1 = vdotq_lane_s32(_sum1, _w1, _r1, 1);
#else  // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
                        int16x4_t _rr0 = vreinterpret_s16_s8(_r0);
                        int16x4_t _rr1 = vreinterpret_s16_s8(_r1);

                        int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0));
                        int8x8_t _r1ll = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 0));
                        int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1));
                        int8x8_t _r1lh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 1));

                        int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0));
                        int16x8_t _s1l = vmull_s8(_r1ll, vget_low_s8(_w0));
                        int16x8_t _s0h = vmull_s8(_r0lh, vget_high_s8(_w0));
                        int16x8_t _s1h = vmull_s8(_r1lh, vget_high_s8(_w0));

                        int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2));
                        int8x8_t _r1hl = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 2));
                        int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3));
                        int8x8_t _r1hh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 3));

                        _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w1));
                        _s1l = vmlal_s8(_s1l, _r1hl, vget_low_s8(_w1));
                        _s0h = vmlal_s8(_s0h, _r0hh, vget_high_s8(_w1));
                        _s1h = vmlal_s8(_s1h, _r1hh, vget_high_s8(_w1));

                        _sum0 = vpadalq_s16(_sum0, _s0l);
                        _sum1 = vpadalq_s16(_sum1, _s1l);
                        _sum0 = vpadalq_s16(_sum0, _s0h);
                        _sum1 = vpadalq_s16(_sum1, _s1h);
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD

                        kptr += 32;
                    }
                }
#if __ARM_FEATURE_MATMUL_INT8
                {
                    int32x4_t _tmp0 = vcombine_s32(vget_low_s32(_sum0), vget_low_s32(_sum1));
                    int32x4_t _tmp1 = vcombine_s32(vget_high_s32(_sum0), vget_high_s32(_sum1));
                    _sum0 = _tmp0;
                    _sum1 = _tmp1;
                }
#endif
            }
            for (; q < inch; q++)
            {
                const signed char* r0 = bottom_blob.channel(q).row<const signed char>(i0 * stride_h) + j0 * stride_w;
                const signed char* r1 = bottom_blob.channel(q).row<const signed char>(i1 * stride_h) + j1 * stride_w;

                for (int k = 0; k < maxk; k++)
                {
                    const signed char* r0s = r0 + space_ofs[k];
                    const signed char* r1s = r1 + space_ofs[k];

                    // if (elempack == 1)
                    {
                        int8x8_t _r0 = vdup_n_s8(r0s[0]);
                        int8x8_t _r1 = vdup_n_s8(r1s[0]);
                        int8x8_t _r01 = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r1)).val[0]);
                        int8x8_t _w = vld1_s8(kptr);
                        int8x8_t _ww = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_w), vreinterpret_s32_s8(_w)).val[0]);
                        int16x8_t _s01 = vmull_s8(_r01, _ww);
                        _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01));
                        _sum1 = vaddw_s16(_sum1, vget_high_s16(_s01));

                        kptr += 4;
                    }
                }
            }

            if (out_elempack == 4)
            {
                vst1q_s32(outptr, _sum0);
                vst1q_s32(outptr + 4, _sum1);
                outptr += 8;
            }
            if (out_elempack == 1)
            {
                int32x4x2_t _sum01 = vzipq_s32(_sum0, _sum1);
                vst1_s32(outptr, vget_low_s32(_sum01.val[0]));
                vst1_s32(outptr + M, vget_high_s32(_sum01.val[0]));
                vst1_s32(outptr + M * 2, vget_low_s32(_sum01.val[1]));
                vst1_s32(outptr + M * 3, vget_high_s32(_sum01.val[1]));
                outptr += 2;
            }
        }
        for (; ij < outw * outh; ij++)
        {
            const int i = ij / outw;
            const int j = ij % outw;

            int32x4_t _sum0 = vdupq_n_s32(0);
            int32x4_t _sum1 = vdupq_n_s32(0);

            const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4);

            int q = 0;
            {
                for (; q + 7 < inch; q += 8)
                {
                    const signed char* r0 = bottom_blob.channel(q / elempack).row<const signed char>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const signed char* r0s = r0 + space_ofs[k];

                        int8x8_t _r0;
                        if (elempack == 8)
                        {
                            _r0 = vld1_s8(r0s);
                        }
                        else // if (elempack == 1)
                        {
                            signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]};
                            _r0 = vld1_s8(tmp);
                        }

                        int8x16_t _w0 = vld1q_s8(kptr);
                        int8x16_t _w1 = vld1q_s8(kptr + 16);

#if __ARM_FEATURE_MATMUL_INT8
                        int8x16_t _r00 = vcombine_s8(_r0, _r0);
                        _sum0 = vdotq_s32(_sum0, _r00, _w0);
                        _sum1 = vdotq_s32(_sum1, _r00, _w1);
#elif __ARM_FEATURE_DOTPROD
                        _sum0 = vdotq_lane_s32(_sum0, _w0, _r0, 0);
                        _sum1 = vdotq_lane_s32(_sum1, _w1, _r0, 1);
#else  // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
                        int16x4_t _rr0 = vreinterpret_s16_s8(_r0);
                        int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0));
                        int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1));
                        int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2));
                        int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3));

                        int16x8_t _sl = vmull_s8(_r0ll, vget_low_s8(_w0));
                        int16x8_t _sh = vmull_s8(_r0lh, vget_high_s8(_w0));
                        _sl = vmlal_s8(_sl, _r0hl, vget_low_s8(_w1));
                        _sh = vmlal_s8(_sh, _r0hh, vget_high_s8(_w1));

                        _sum0 = vpadalq_s16(_sum0, _sl);
                        _sum1 = vpadalq_s16(_sum1, _sh);
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD

                        kptr += 32;
                    }
                }
#if __ARM_FEATURE_MATMUL_INT8
                {
                    _sum0 = vpaddq_s32(_sum0, _sum1);
                }
#else
                {
                    _sum0 = vaddq_s32(_sum0, _sum1);
                }
#endif
            }
            for (; q < inch; q++)
            {
                const signed char* r0 = bottom_blob.channel(q).row<const signed char>(i * stride_h) + j * stride_w;

                for (int k = 0; k < maxk; k++)
                {
                    const signed char* r0s = r0 + space_ofs[k];

                    // if (elempack == 1)
                    {
                        int8x8_t _val = vdup_n_s8(r0s[0]);
                        int8x8_t _w = vld1_s8(kptr);
                        int16x8_t _s0 = vmull_s8(_val, _w);
                        _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));

                        kptr += 4;
                    }
                }
            }

            if (out_elempack == 4)
            {
                vst1q_s32(outptr, _sum0);
                outptr += 4;
            }
            if (out_elempack == 1)
            {
                outptr[0] = vgetq_lane_s32(_sum0, 0);
                outptr[M] = vgetq_lane_s32(_sum0, 1);
                outptr[M * 2] = vgetq_lane_s32(_sum0, 2);
                outptr[M * 3] = vgetq_lane_s32(_sum0, 3);
                outptr += 1;
            }
        }
    }
    remain_outch_start += nn_outch * 4;
    nn_outch = (outch - remain_outch_start) / 2;
#else // __ARM_NEON
    nn_outch = (outch - remain_outch_start) / 2;
    #pragma omp parallel for num_threads(opt.num_threads)
#endif // __ARM_NEON
    for (int pp = 0; pp < nn_outch; pp++)
    {
        const int p = remain_outch_start + pp * 2;

        // shadowed variable for less openmp task args
        const int outw = top_blob.w;
        const int outh = top_blob.h;
        const size_t N = bottom_blob.cstep * elempack;

        int* outptr0 = top_blob.channel(p);
        int* outptr1 = top_blob.channel(p + 1);

        int ij = 0;
        for (; ij + 1 < outw * outh; ij += 2)
        {
            const int i0 = ij / outw;
            const int i1 = (ij + 1) / outw;
            const int j0 = ij % outw;
            const int j1 = (ij + 1) % outw;

            int sum00 = 0;
            int sum01 = 0;
            int sum10 = 0;
            int sum11 = 0;

#if __ARM_NEON
            const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2);
#else
            const signed char* kptr = weight_data_tm.channel(p / 2);
#endif

            int q = 0;
#if __ARM_NEON
            {
                int32x4_t _sum01 = vdupq_n_s32(0);
                int32x4_t _sum23 = vdupq_n_s32(0);
                for (; q + 7 < inch; q += 8)
                {
                    const signed char* r0 = bottom_blob.channel(q / elempack).row<const signed char>(i0 * stride_h) + j0 * stride_w * elempack;
                    const signed char* r1 = bottom_blob.channel(q / elempack).row<const signed char>(i1 * stride_h) + j1 * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const signed char* r0s = r0 + space_ofs[k];
                        const signed char* r1s = r1 + space_ofs[k];

                        int8x8_t _r0;
                        int8x8_t _r1;
                        if (elempack == 8)
                        {
                            _r0 = vld1_s8(r0s);
                            _r1 = vld1_s8(r1s);
                        }
                        else // if (elempack == 1)
                        {
                            signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]};
                            signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]};
                            _r0 = vld1_s8(tmp0);
                            _r1 = vld1_s8(tmp1);
                        }

                        int8x16_t _w0 = vld1q_s8(kptr);

#if __ARM_FEATURE_DOTPROD
                        int8x16_t _r00 = vcombine_s8(_r0, _r0);
                        int8x16_t _r11 = vcombine_s8(_r1, _r1);
                        _sum01 = vdotq_s32(_sum01, _r00, _w0);
                        _sum23 = vdotq_s32(_sum23, _r11, _w0);
#else  // __ARM_FEATURE_DOTPROD
                        int32x2x2_t _rr0 = vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r0));
                        int32x2x2_t _rr1 = vzip_s32(vreinterpret_s32_s8(_r1), vreinterpret_s32_s8(_r1));
                        int8x8_t _r0l = vreinterpret_s8_s32(_rr0.val[0]);
                        int8x8_t _r0h = vreinterpret_s8_s32(_rr0.val[1]);
                        int8x8_t _r1l = vreinterpret_s8_s32(_rr1.val[0]);
                        int8x8_t _r1h = vreinterpret_s8_s32(_rr1.val[1]);

                        int16x8_t _s01 = vmull_s8(_r0l, vget_low_s8(_w0));
                        int16x8_t _s23 = vmull_s8(_r1l, vget_low_s8(_w0));
                        _s01 = vmlal_s8(_s01, _r0h, vget_high_s8(_w0));
                        _s23 = vmlal_s8(_s23, _r1h, vget_high_s8(_w0));

                        _sum01 = vpadalq_s16(_sum01, _s01);
                        _sum23 = vpadalq_s16(_sum23, _s23);
#endif // __ARM_FEATURE_DOTPROD

                        kptr += 16;
                    }
                }
                int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01));
                int32x2_t _s1 = vpadd_s32(vget_low_s32(_sum23), vget_high_s32(_sum23));
                sum00 += vget_lane_s32(_s0, 0);
                sum01 += vget_lane_s32(_s1, 0);
                sum10 += vget_lane_s32(_s0, 1);
                sum11 += vget_lane_s32(_s1, 1);
            }
#endif // __ARM_NEON
            for (; q < inch; q++)
            {
                const signed char* r0 = bottom_blob.channel(q).row<const signed char>(i0 * stride_h) + j0 * stride_w;
                const signed char* r1 = bottom_blob.channel(q).row<const signed char>(i1 * stride_h) + j1 * stride_w;

                for (int k = 0; k < maxk; k++)
                {
                    const signed char* r0s = r0 + space_ofs[k];
                    const signed char* r1s = r1 + space_ofs[k];

                    // if (elempack == 1)
                    {
                        sum00 += r0s[0] * kptr[0];
                        sum01 += r1s[0] * kptr[0];
                        sum10 += r0s[0] * kptr[1];
                        sum11 += r1s[0] * kptr[1];

                        kptr += 2;
                    }
                }
            }

            outptr0[0] = sum00;
            outptr0[1] = sum01;
            outptr1[0] = sum10;
            outptr1[1] = sum11;
            outptr0 += 2;
            outptr1 += 2;
        }
        for (; ij < outw * outh; ij++)
        {
            const int i = ij / outw;
            const int j = ij % outw;

            int sum0 = 0;
            int sum1 = 0;

#if __ARM_NEON
            const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2);
#else
            const signed char* kptr = weight_data_tm.channel(p / 2);
#endif

            int q = 0;
#if __ARM_NEON
            {
                int32x4_t _sum01 = vdupq_n_s32(0);
                for (; q + 7 < inch; q += 8)
                {
                    const signed char* r0 = bottom_blob.channel(q / elempack).row<const signed char>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const signed char* r0s = r0 + space_ofs[k];

                        int8x8_t _r0;
                        if (elempack == 8)
                        {
                            _r0 = vld1_s8(r0s);
                        }
                        else // if (elempack == 1)
                        {
                            signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]};
                            _r0 = vld1_s8(tmp);
                        }

                        int8x16_t _w0 = vld1q_s8(kptr);

#if __ARM_FEATURE_DOTPROD
                        int8x16_t _r00 = vcombine_s8(_r0, _r0);
                        _sum01 = vdotq_s32(_sum01, _r00, _w0);
#else  // __ARM_FEATURE_DOTPROD
                        int32x2x2_t _rr0 = vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r0));
                        int8x8_t _r0l = vreinterpret_s8_s32(_rr0.val[0]);
                        int8x8_t _r0h = vreinterpret_s8_s32(_rr0.val[1]);

                        int16x8_t _s01 = vmull_s8(_r0l, vget_low_s8(_w0));
                        _s01 = vmlal_s8(_s01, _r0h, vget_high_s8(_w0));

                        _sum01 = vpadalq_s16(_sum01, _s01);
#endif // __ARM_FEATURE_DOTPROD

                        kptr += 16;
                    }
                }
                int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01));
                sum0 += vget_lane_s32(_s0, 0);
                sum1 += vget_lane_s32(_s0, 1);
            }
#endif // __ARM_NEON
            for (; q < inch; q++)
            {
                const signed char* r0 = bottom_blob.channel(q).row<const signed char>(i * stride_h) + j * stride_w;

                for (int k = 0; k < maxk; k++)
                {
                    const signed char* r0s = r0 + space_ofs[k];

                    // if (elempack == 1)
                    {
                        sum0 += r0s[0] * kptr[0];
                        sum1 += r0s[0] * kptr[1];

                        kptr += 2;
                    }
                }
            }

            outptr0[0] = sum0;
            outptr1[0] = sum1;
            outptr0 += 1;
            outptr1 += 1;
        }
    }
    remain_outch_start += nn_outch * 2;
    for (int p = remain_outch_start; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        int ij = 0;
        for (; ij + 1 < outw * outh; ij += 2)
        {
            const int i0 = ij / outw;
            const int i1 = (ij + 1) / outw;
            const int j0 = ij % outw;
            const int j1 = (ij + 1) % outw;

            int sum0 = 0;
            int sum1 = 0;

#if __ARM_NEON
            const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2);
#else
            const signed char* kptr = weight_data_tm.channel(p / 2 + p % 2);
#endif

            int q = 0;
#if __ARM_NEON
            {
                int32x4_t _sum0 = vdupq_n_s32(0);
                int32x4_t _sum1 = vdupq_n_s32(0);
                int32x4_t _sum2 = vdupq_n_s32(0);
                int32x4_t _sum3 = vdupq_n_s32(0);
                for (; q + 7 < inch; q += 8)
                {
                    const signed char* r0 = bottom_blob.channel(q / elempack).row<const signed char>(i0 * stride_h) + j0 * stride_w * elempack;
                    const signed char* r1 = bottom_blob.channel(q / elempack).row<const signed char>(i1 * stride_h) + j1 * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const signed char* r0s = r0 + space_ofs[k];
                        const signed char* r1s = r1 + space_ofs[k];

                        int8x8_t _r0;
                        int8x8_t _r1;
                        if (elempack == 8)
                        {
                            _r0 = vld1_s8(r0s);
                            _r1 = vld1_s8(r1s);
                        }
                        else // if (elempack == 1)
                        {
                            signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]};
                            signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]};
                            _r0 = vld1_s8(tmp0);
                            _r1 = vld1_s8(tmp1);
                        }

                        int8x8_t _w = vld1_s8(kptr);

                        int16x8_t _s0 = vmull_s8(_r0, _w);
                        int16x8_t _s1 = vmull_s8(_r1, _w);

                        _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                        _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));
                        _sum2 = vaddw_s16(_sum2, vget_low_s16(_s1));
                        _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1));

                        kptr += 8;
                    }
                }
                _sum0 = vaddq_s32(_sum0, _sum1);
                _sum2 = vaddq_s32(_sum2, _sum3);
#if __aarch64__
                sum0 += vaddvq_s32(_sum0);
                sum1 += vaddvq_s32(_sum2);
#else
                int32x2_t _ss0 = vadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0));
                int32x2_t _ss2 = vadd_s32(vget_low_s32(_sum2), vget_high_s32(_sum2));
                _ss0 = vpadd_s32(_ss0, _ss2);
                sum0 += vget_lane_s32(_ss0, 0);
                sum1 += vget_lane_s32(_ss0, 1);
#endif
            }
#endif // __ARM_NEON
            for (; q < inch; q++)
            {
                const signed char* r0 = bottom_blob.channel(q).row<const signed char>(i0 * stride_h) + j0 * stride_w;
                const signed char* r1 = bottom_blob.channel(q).row<const signed char>(i1 * stride_h) + j1 * stride_w;

                for (int k = 0; k < maxk; k++)
                {
                    const signed char* r0s = r0 + space_ofs[k];
                    const signed char* r1s = r1 + space_ofs[k];

                    // if (elempack == 1)
                    {
                        sum0 += r0s[0] * kptr[0];
                        sum1 += r1s[0] * kptr[0];

                        kptr += 1;
                    }
                }
            }

            outptr[0] = sum0;
            outptr[1] = sum1;
            outptr += 2;
        }
        for (; ij < outw * outh; ij++)
        {
            const int i = ij / outw;
            const int j = ij % outw;

            int sum = 0;

#if __ARM_NEON
            const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2);
#else
            const signed char* kptr = weight_data_tm.channel(p / 2 + p % 2);
#endif

            int q = 0;
#if __ARM_NEON
            {
                int32x4_t _sum0 = vdupq_n_s32(0);
                int32x4_t _sum1 = vdupq_n_s32(0);
                for (; q + 7 < inch; q += 8)
                {
                    const signed char* r0 = bottom_blob.channel(q / elempack).row<const signed char>(i * stride_h) + j * stride_w * elempack;

                    for (int k = 0; k < maxk; k++)
                    {
                        const signed char* r0s = r0 + space_ofs[k];

                        int8x8_t _r0;
                        if (elempack == 8)
                        {
                            _r0 = vld1_s8(r0s);
                        }
                        else // if (elempack == 1)
                        {
                            signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]};
                            _r0 = vld1_s8(tmp);
                        }

                        int8x8_t _w = vld1_s8(kptr);

                        int16x8_t _s0 = vmull_s8(_r0, _w);

                        _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                        _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));

                        kptr += 8;
                    }
                }
                int32x4_t _sum = vaddq_s32(_sum0, _sum1);
#if __aarch64__
                sum += vaddvq_s32(_sum);
#else
                int32x2_t _ss = vadd_s32(vget_low_s32(_sum), vget_high_s32(_sum));
                _ss = vpadd_s32(_ss, _ss);
                sum += vget_lane_s32(_ss, 0);
#endif
            }
#endif // __ARM_NEON
            for (; q < inch; q++)
            {
                const signed char* r0 = bottom_blob.channel(q).row<const signed char>(i * stride_h) + j * stride_w;

                for (int k = 0; k < maxk; k++)
                {
                    const signed char* r0s = r0 + space_ofs[k];

                    // if (elempack == 1)
                    {
                        sum += r0s[0] * kptr[0];

                        kptr += 1;
                    }
                }
            }

            outptr[0] = sum;
            outptr += 1;
        }
    }
}


================================================
FILE: src/layer/arm/convolutiondepthwise_3x3.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convdw3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int group = bottom_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        const float bias0 = bias ? bias[g] : 0.f;

        const float* kernel0 = kernel + g * 9;

        float* outptr = out;
        float* outptr2 = outptr + outw;

        const float* img0 = bottom_blob.channel(g);

        const float* r0 = img0;
        const float* r1 = img0 + w;
        const float* r2 = img0 + w * 2;
        const float* r3 = img0 + w * 3;

#if __ARM_NEON
        float32x4_t _k012x = vld1q_f32(kernel0);
        float32x4_t _k345x = vld1q_f32(kernel0 + 3);
        float32x4_t _k678x = vld1q_f32(kernel0 + 6);

        _k012x = vsetq_lane_f32(0.f, _k012x, 3);
        _k345x = vsetq_lane_f32(0.f, _k345x, 3);
        _k678x = vsetq_lane_f32(0.f, _k678x, 3);

        float32x4_t _bias0 = vdupq_n_f32(bias0);
#else
        const float* k0 = kernel0;
        const float* k1 = kernel0 + 3;
        const float* k2 = kernel0 + 6;
#endif // __ARM_NEON

        int i = 0;

        for (; i + 1 < outh; i += 2)
        {
#if __ARM_NEON
#if __aarch64__
            int nn = outw >> 3;
            int remain = outw & 7;
#else
            int nn = outw >> 2;
            int remain = outw & 3;
#endif // __aarch64__
#else
            int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            if (nn > 0)
            {
                asm volatile(
                    "prfm   pldl1keep, [%3, #384]           \n"
                    "ld1    {v8.4s, v9.4s, v10.4s}, [%3]    \n" // r0
                    "add    %3, %3, #32                     \n"

                    "ext    v11.16b, v8.16b, v9.16b, #4     \n"
                    "ext    v13.16b, v9.16b, v10.16b, #4    \n"

                    "ext    v12.16b, v8.16b, v9.16b, #8     \n"
                    "ext    v14.16b, v9.16b, v10.16b, #8    \n"

                    "0:                                     \n"

                    "and    v4.16b, %17.16b, %17.16b        \n" // v4 = _bias0
                    "and    v5.16b, %17.16b, %17.16b        \n" // v5 = _bias0

                    "prfm   pldl1keep, [%6, #384]           \n"
                    "ld1    {v16.4s, v17.4s, v18.4s}, [%6]  \n" // r3
                    "add    %6, %6, #32                     \n"

                    "and    v6.16b, %17.16b, %17.16b        \n" // v6 = _bias0
                    "and    v7.16b, %17.16b, %17.16b        \n" // v7 = _bias0

                    "ext    v15.16b, v16.16b, v17.16b, #4   \n"

                    "fmla   v4.4s, v8.4s, %14.s[0]          \n"
                    "fmla   v5.4s, v9.4s, %14.s[0]          \n"

                    "ext    v20.16b, v17.16b, v18.16b, #4   \n"

                    "fmla   v6.4s, v16.4s, %16.s[0]         \n"
                    "fmla   v7.4s, v17.4s, %16.s[0]         \n"

                    "ext    v19.16b, v16.16b, v17.16b, #8   \n"

                    "fmla   v4.4s, v11.4s, %14.s[1]         \n"
                    "fmla   v5.4s, v13.4s, %14.s[1]         \n"

                    "ext    v21.16b, v17.16b, v18.16b, #8   \n"

                    "fmla   v6.4s, v15.4s, %16.s[1]         \n"
                    "fmla   v7.4s, v20.4s, %16.s[1]         \n"

                    "prfm   pldl1keep, [%4, #384]           \n"
                    "ld1    {v22.4s, v23.4s, v24.4s}, [%4]  \n" // r1

                    "fmla   v4.4s, v12.4s, %14.s[2]         \n"
                    "fmla   v5.4s, v14.4s, %14.s[2]         \n"

                    "add    %4, %4, #32                     \n"

                    "fmla   v6.4s, v19.4s, %16.s[2]         \n"
                    "fmla   v7.4s, v21.4s, %16.s[2]         \n"

                    "ext    v25.16b, v22.16b, v23.16b, #4   \n"

                    "fmla   v4.4s, v22.4s, %15.s[0]         \n"
                    "fmla   v5.4s, v23.4s, %15.s[0]         \n"

                    "ext    v27.16b, v23.16b, v24.16b, #4   \n"

                    "fmla   v6.4s, v22.4s, %14.s[0]         \n"
                    "fmla   v7.4s, v23.4s, %14.s[0]         \n"

                    "ext    v26.16b, v22.16b, v23.16b, #8   \n"

                    "fmla   v4.4s, v25.4s, %15.s[1]         \n"
                    "fmla   v5.4s, v27.4s, %15.s[1]         \n"

                    "ext    v28.16b, v23.16b, v24.16b, #8   \n"

                    "fmla   v6.4s, v25.4s, %14.s[1]         \n"
                    "fmla   v7.4s, v27.4s, %14.s[1]         \n"

                    "prfm   pldl1keep, [%5, #384]           \n"
                    "ld1    {v8.4s, v9.4s, v10.4s}, [%5]    \n" // r2

                    "fmla   v4.4s, v26.4s, %15.s[2]         \n"
                    "fmla   v5.4s, v28.4s, %15.s[2]         \n"

                    "add    %5, %5, #32                     \n"

                    "fmla   v6.4s, v26.4s, %14.s[2]         \n"
                    "fmla   v7.4s, v28.4s, %14.s[2]         \n"

                    "ext    v11.16b, v8.16b, v9.16b, #4     \n"

                    "fmla   v4.4s, v8.4s, %16.s[0]          \n"
                    "fmla   v5.4s, v9.4s, %16.s[0]          \n"

                    "ext    v13.16b, v9.16b, v10.16b, #4    \n"

                    "fmla   v6.4s, v8.4s, %15.s[0]          \n"
                    "fmla   v7.4s, v9.4s, %15.s[0]          \n"

                    "ext    v12.16b, v8.16b, v9.16b, #8     \n"

                    "fmla   v4.4s, v11.4s, %16.s[1]         \n"
                    "fmla   v5.4s, v13.4s, %16.s[1]         \n"

                    "ext    v14.16b, v9.16b, v10.16b, #8    \n"

                    "fmla   v6.4s, v11.4s, %15.s[1]         \n"
                    "fmla   v7.4s, v13.4s, %15.s[1]         \n"

                    "prfm   pldl1keep, [%3, #384]           \n"
                    "ld1    {v8.4s, v9.4s, v10.4s}, [%3]    \n" // r0 next loop

                    "fmla   v4.4s, v12.4s, %16.s[2]         \n"
                    "fmla   v5.4s, v14.4s, %16.s[2]         \n"

                    "add    %3, %3, #32                     \n"
                    "ext    v11.16b, v8.16b, v9.16b, #4     \n"

                    "fmla   v6.4s, v12.4s, %15.s[2]         \n"
                    "fmla   v7.4s, v14.4s, %15.s[2]         \n"

                    "ext    v13.16b, v9.16b, v10.16b, #4    \n"
                    "ext    v12.16b, v8.16b, v9.16b, #8     \n"

                    "st1    {v4.4s, v5.4s}, [%1], #32       \n"

                    "ext    v14.16b, v9.16b, v10.16b, #8    \n"

                    "subs   %w0, %w0, #1                    \n"

                    "st1    {v6.4s, v7.4s}, [%2], #32       \n"

                    "bne    0b                              \n"
                    "sub    %3, %3, #32                     \n"
                    : "=r"(nn),      // %0
                    "=r"(outptr),  // %1
                    "=r"(outptr2), // %2
                    "=r"(r0),      // %3
                    "=r"(r1),      // %4
                    "=r"(r2),      // %5
                    "=r"(r3)       // %6
                    : "0"(nn),
                    "1"(outptr),
                    "2"(outptr2),
                    "3"(r0),
                    "4"(r1),
                    "5"(r2),
                    "6"(r3),
                    "w"(_k012x), // %14
                    "w"(_k345x), // %15
                    "w"(_k678x), // %16
                    "w"(_bias0)  // %17
                    : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28");
            }

            if (remain >= 4)
            {
                remain -= 4;

                asm volatile(
                    "prfm   pldl1keep, [%2, #256]           \n"
                    "ld1    {v8.4s, v9.4s}, [%2]            \n" // r0
                    "add    %2, %2, #16                     \n"

                    "and    v4.16b, %15.16b, %15.16b        \n" // v4 = _bias0
                    "and    v6.16b, %15.16b, %15.16b        \n" // v6 = _bias0

                    "prfm   pldl1keep, [%5, #256]           \n"
                    "ld1    {v16.4s, v17.4s}, [%5]          \n" // r3
                    "add    %5, %5, #16                     \n"

                    "ext    v11.16b, v8.16b, v9.16b, #4     \n"
                    "ext    v15.16b, v16.16b, v17.16b, #4   \n"

                    "fmla   v4.4s, v8.4s, %12.s[0]          \n"
                    "fmla   v6.4s, v16.4s, %14.s[0]         \n"

                    "ext    v12.16b, v8.16b, v9.16b, #8     \n"
                    "ext    v19.16b, v16.16b, v17.16b, #8   \n"

                    "fmla   v4.4s, v11.4s, %12.s[1]         \n"
                    "fmla   v6.4s, v15.4s, %14.s[1]         \n"

                    "prfm   pldl1keep, [%3, #256]           \n"
                    "ld1    {v22.4s, v23.4s}, [%3]          \n" // r1

                    "fmla   v4.4s, v12.4s, %12.s[2]         \n"

                    "add    %3, %3, #16                     \n"

                    "fmla   v6.4s, v19.4s, %14.s[2]         \n"

                    "ext    v25.16b, v22.16b, v23.16b, #4   \n"

                    "fmla   v4.4s, v22.4s, %13.s[0]         \n"
                    "fmla   v6.4s, v22.4s, %12.s[0]         \n"

                    "ext    v26.16b, v22.16b, v23.16b, #8   \n"

                    "fmla   v4.4s, v25.4s, %13.s[1]         \n"
                    "fmla   v6.4s, v25.4s, %12.s[1]         \n"

                    "prfm   pldl1keep, [%4, #256]           \n"
                    "ld1    {v8.4s, v9.4s}, [%4]            \n" // r2

                    "fmla   v4.4s, v26.4s, %13.s[2]         \n"

                    "add    %4, %4, #16                     \n"

                    "fmla   v6.4s, v26.4s, %12.s[2]         \n"

                    "ext    v11.16b, v8.16b, v9.16b, #4     \n"

                    "fmla   v4.4s, v8.4s, %14.s[0]          \n"
                    "fmla   v6.4s, v8.4s, %13.s[0]          \n"

                    "ext    v12.16b, v8.16b, v9.16b, #8     \n"

                    "fmla   v4.4s, v11.4s, %14.s[1]         \n"
                    "fmla   v6.4s, v11.4s, %13.s[1]         \n"

                    "fmla   v4.4s, v12.4s, %14.s[2]         \n"
                    "fmla   v6.4s, v12.4s, %13.s[2]         \n"

                    "st1    {v4.4s}, [%0], #16              \n"
                    "st1    {v6.4s}, [%1], #16              \n"

                    : "=r"(outptr),  // %0
                    "=r"(outptr2), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3)       // %5
                    : "0"(outptr),
                    "1"(outptr2),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "w"(_k012x), // %12
                    "w"(_k345x), // %13
                    "w"(_k678x), // %14
                    "w"(_bias0)  // %15
                    : "cc", "memory", "v4", "v6", "v8", "v9", "v11", "v12", "v15", "v16", "v17", "v18", "v19", "v22", "v23", "v25", "v26");
            }
#else
            if (nn > 0)
            {
                asm volatile(
                    "pld        [%3, #192]          \n"
                    "vld1.f32   {d18-d20}, [%3 :64] \n" // r0
                    "add        %3, #16             \n"

                    "vext.32    q11, q9, q10, #1    \n"
                    "vext.32    q12, q9, q10, #2    \n"

                    "0:                             \n"

                    "vmul.f32   q7, q9, %e14[0]     \n"

                    "vand       q13, %q17, %q17     \n" // q13 = _bias0
                    "vmul.f32   q6, q11, %e14[1]    \n"
                    "vmla.f32   q13, q12, %f14[0]   \n"

                    "pld        [%4, #192]          \n"
                    "vld1.f32   {d18-d20}, [%4]     \n" // r1
                    "add        %4, #16             \n"

                    "vmla.f32   q7, q9, %e15[0]     \n"

                    "vext.32    q11, q9, q10, #1    \n"
                    "vext.32    q12, q9, q10, #2    \n"

                    "vmla.f32   q6, q11, %e15[1]    \n"
                    "vmla.f32   q13, q12, %f15[0]   \n"

                    "vmul.f32   q8, q9, %e14[0]     \n"

                    "vand       q15, %q17, %q17     \n" // q15 = _bias0
                    "vmul.f32   q14, q11, %e14[1]   \n"
                    "vmla.f32   q15, q12, %f14[0]   \n"

                    "pld        [%5, #192]          \n"
                    "vld1.f32   {d18-d20}, [%5 :64] \n" // r2
                    "add        %5, #16             \n"

                    "vmla.f32   q7, q9, %e16[0]     \n"

                    "vext.32    q11, q9, q10, #1    \n"
                    "vext.32    q12, q9, q10, #2    \n"

                    "vmla.f32   q6, q11, %e16[1]    \n"
                    "vmla.f32   q13, q12, %f16[0]   \n"

                    "vmla.f32   q8, q9, %e15[0]     \n"
                    "vmla.f32   q14, q11, %e15[1]   \n"
                    "vmla.f32   q15, q12, %f15[0]   \n"

                    "pld        [%6, #192]          \n"
                    "vld1.f32   {d18-d20}, [%6]     \n" // r3
                    "add        %6, #16             \n"

                    "vmla.f32   q8, q9, %e16[0]     \n"

                    "vext.32    q11, q9, q10, #1    \n"
                    "vext.32    q12, q9, q10, #2    \n"

                    "vmla.f32   q14, q11, %e16[1]   \n"
                    "vmla.f32   q15, q12, %f16[0]   \n"

                    "vadd.f32   q7, q7, q6          \n"

                    "pld        [%3, #192]          \n"
                    "vld1.f32   {d18-d20}, [%3 :64] \n" // r0

                    "vadd.f32   q8, q8, q14         \n"
                    "vadd.f32   q7, q7, q13         \n"
                    "vadd.f32   q8, q8, q15         \n"

                    "vext.32    q11, q9, q10, #1    \n"
                    "vext.32    q12, q9, q10, #2    \n"

                    "add        %3, #16             \n"

                    "vst1.f32   {d14-d15}, [%1]!    \n"
                    "vst1.f32   {d16-d17}, [%2]!    \n"

                    "subs       %0, #1              \n"
                    "bne        0b                  \n"

                    "sub        %3, #16             \n"
                    : "=r"(nn),      // %0
                    "=r"(outptr),  // %1
                    "=r"(outptr2), // %2
                    "=r"(r0),      // %3
                    "=r"(r1),      // %4
                    "=r"(r2),      // %5
                    "=r"(r3)       // %6
                    : "0"(nn),
                    "1"(outptr),
                    "2"(outptr2),
                    "3"(r0),
                    "4"(r1),
                    "5"(r2),
                    "6"(r3),
                    "w"(_k012x), // %14
                    "w"(_k345x), // %15
                    "w"(_k678x), // %16
                    "w"(_bias0)  // %17
                    : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
#if __ARM_NEON
                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r30 = vld1q_f32(r3);

                float32x4_t _sum = vmulq_f32(_r00, _k012x);
                _sum = vmlaq_f32(_sum, _r10, _k345x);
                _sum = vmlaq_f32(_sum, _r20, _k678x);

                float32x4_t _sum2 = vmulq_f32(_r10, _k012x);
                _sum2 = vmlaq_f32(_sum2, _r20, _k345x);
                _sum2 = vmlaq_f32(_sum2, _r30, _k678x);

                _sum = vsetq_lane_f32(bias0, _sum, 3);
                _sum2 = vsetq_lane_f32(bias0, _sum2, 3);
#if __aarch64__
                *outptr = vaddvq_f32(_sum);
                *outptr2 = vaddvq_f32(_sum2);
#else
                float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                float32x2_t _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));

                float32x2_t _sss2 = vpadd_f32(_ss, _ss2);

                *outptr = vget_lane_f32(_sss2, 0);
                *outptr2 = vget_lane_f32(_sss2, 1);
#endif // __aarch64__
#else
                float sum = bias0;
                sum += r0[0] * k0[0];
                sum += r0[1] * k0[1];
                sum += r0[2] * k0[2];
                sum += r1[0] * k1[0];
                sum += r1[1] * k1[1];
                sum += r1[2] * k1[2];
                sum += r2[0] * k2[0];
                sum += r2[1] * k2[1];
                sum += r2[2] * k2[2];

                float sum2 = bias0;
                sum2 += r1[0] * k0[0];
                sum2 += r1[1] * k0[1];
                sum2 += r1[2] * k0[2];
                sum2 += r2[0] * k1[0];
                sum2 += r2[1] * k1[1];
                sum2 += r2[2] * k1[2];
                sum2 += r3[0] * k2[0];
                sum2 += r3[1] * k2[1];
                sum2 += r3[2] * k2[2];

                *outptr = sum;
                *outptr2 = sum2;
#endif
                r0++;
                r1++;
                r2++;
                r3++;
                outptr++;
                outptr2++;
            }

            r0 += 2 + w;
            r1 += 2 + w;
            r2 += 2 + w;
            r3 += 2 + w;

            outptr += outw;
            outptr2 += outw;
        }

        for (; i < outh; i++)
        {
#if __ARM_NEON
#if __aarch64__
            int nn = outw >> 3;
            int remain = outw & 7;
#else
            int nn = outw >> 2;
            int remain = outw & 3;
#endif // __aarch64__
#else
            int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            if (nn > 0)
            {
                asm volatile(
                    "prfm   pldl1keep, [%2, #384]           \n"
                    "ld1    {v8.4s, v9.4s, v10.4s}, [%2]    \n" // r0
                    "add    %2, %2, #32                     \n"

                    "ext    v12.16b, v8.16b, v9.16b, #4     \n"
                    "ext    v14.16b, v9.16b, v10.16b, #4    \n"

                    "0:                                     \n"

                    "fmul   v6.4s, v8.4s, %10.s[0]          \n"

                    "and    v4.16b, %13.16b, %13.16b        \n" // v4 = _bias0

                    "fmul   v7.4s, v9.4s, %10.s[0]          \n"

                    "and    v5.16b, %13.16b, %13.16b        \n" // v5 = _bias0

                    "fmla   v4.4s, v12.4s, %10.s[1]         \n"

                    "ext    v13.16b, v8.16b, v9.16b, #8     \n"

                    "fmla   v5.4s, v14.4s, %10.s[1]         \n"

                    "ext    v15.16b, v9.16b, v10.16b, #8    \n"

                    "fmla   v6.4s, v13.4s, %10.s[2]         \n"

                    "prfm   pldl1keep, [%3, #384]           \n"
                    "ld1    {v16.4s, v17.4s, v18.4s}, [%3]  \n" // r1

                    "fmla   v7.4s, v15.4s, %10.s[2]         \n"

                    "add    %3, %3, #32                     \n"

                    "fmla   v4.4s, v16.4s, %11.s[0]         \n"

                    "ext    v20.16b, v16.16b, v17.16b, #4   \n"

                    "fmla   v5.4s, v17.4s, %11.s[0]         \n"

                    "ext    v22.16b, v17.16b, v18.16b, #4   \n"

                    "fmla   v6.4s, v20.4s, %11.s[1]         \n"

                    "ext    v21.16b, v16.16b, v17.16b, #8   \n"

                    "fmla   v7.4s, v22.4s, %11.s[1]         \n"

                    "ext    v23.16b, v17.16b, v18.16b, #8   \n"

                    "fmla   v4.4s, v21.4s, %11.s[2]         \n"

                    "prfm   pldl1keep, [%4, #384]           \n"
                    "ld1    {v24.4s, v25.4s, v26.4s}, [%4]  \n" // r2

                    "fmla   v5.4s, v23.4s, %11.s[2]         \n"

                    "add    %4, %4, #32                     \n"

                    "fmla   v6.4s, v24.4s, %12.s[0]         \n"

                    "ext    v12.16b, v24.16b, v25.16b, #4   \n"

                    "fmla   v7.4s, v25.4s, %12.s[0]         \n"

                    "ext    v14.16b, v25.16b, v26.16b, #4   \n"

                    "fmla   v4.4s, v12.4s, %12.s[1]         \n"

                    "ext    v13.16b, v24.16b, v25.16b, #8   \n"

                    "fmla   v5.4s, v14.4s, %12.s[1]         \n"

                    "ext    v15.16b, v25.16b, v26.16b, #8   \n"

                    "fmla   v6.4s, v13.4s, %12.s[2]         \n"
                    "fmla   v7.4s, v15.4s, %12.s[2]         \n"

                    "prfm   pldl1keep, [%2, #384]           \n"
                    "ld1    {v8.4s, v9.4s, v10.4s}, [%2]    \n" // r0 next loop

                    "fadd   v4.4s, v4.4s, v6.4s             \n"

                    "add    %2, %2, #32                     \n"

                    "fadd   v5.4s, v5.4s, v7.4s             \n"

                    "ext    v12.16b, v8.16b, v9.16b, #4     \n"
                    "ext    v14.16b, v9.16b, v10.16b, #4    \n"

                    "subs   %w0, %w0, #1                    \n"

                    "st1    {v4.4s, v5.4s}, [%1], #32       \n"

                    "bne    0b                              \n"
                    "sub    %2, %2, #32                     \n"
                    : "=r"(nn),     // %0
                    "=r"(outptr), // %1
                    "=r"(r0),     // %2
                    "=r"(r1),     // %3
                    "=r"(r2)      // %4
                    : "0"(nn),
                    "1"(outptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "w"(_k012x), // %10
                    "w"(_k345x), // %11
                    "w"(_k678x), // %12
                    "w"(_bias0)  // %13
                    : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v20", "v21", "v22", "v23", "v24", "v25", "v26");
            }

            if (remain >= 4)
            {
                remain -= 4;

                asm volatile(
                    "prfm   pldl1keep, [%1, #192]           \n"
                    "ld1    {v8.4s, v9.4s}, [%1]            \n" // r0
                    "add    %1, %1, #16                     \n"

                    "and    v4.16b, %11.16b, %11.16b        \n" // v4 = _bias0

                    "ext    v12.16b, v8.16b, v9.16b, #4     \n"

                    "fmul   v6.4s, v8.4s, %8.s[0]           \n"

                    "ext    v13.16b, v8.16b, v9.16b, #8     \n"

                    "fmla   v4.4s, v12.4s, %8.s[1]          \n"

                    "prfm   pldl1keep, [%2, #192]           \n"
                    "ld1    {v16.4s, v17.4s}, [%2]          \n" // r1
                    "add    %2, %2, #16                     \n"

                    "fmla   v6.4s, v13.4s, %8.s[2]          \n"

                    "ext    v20.16b, v16.16b, v17.16b, #4   \n"

                    "fmla   v4.4s, v16.4s, %9.s[0]          \n"

                    "ext    v21.16b, v16.16b, v17.16b, #8   \n"

                    "fmla   v6.4s, v20.4s, %9.s[1]          \n"

                    "prfm   pldl1keep, [%3, #192]           \n"
                    "ld1    {v24.4s, v25.4s}, [%3]          \n" // r2
                    "add    %3, %3, #16                     \n"

                    "fmla   v4.4s, v21.4s, %9.s[2]          \n"

                    "ext    v12.16b, v24.16b, v25.16b, #4   \n"

                    "fmla   v6.4s, v24.4s, %10.s[0]         \n"

                    "ext    v13.16b, v24.16b, v25.16b, #8   \n"

                    "fmla   v4.4s, v12.4s, %10.s[1]         \n"

                    "fmla   v6.4s, v13.4s, %10.s[2]         \n"

                    "fadd   v4.4s, v4.4s, v6.4s             \n"

                    "st1    {v4.4s}, [%0], #16              \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1),     // %2
                    "=r"(r2)      // %3
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k012x), // %8
                    "w"(_k345x), // %9
                    "w"(_k678x), // %10
                    "w"(_bias0)  // %11
                    : "cc", "memory", "v4", "v6", "v8", "v9", "v12", "v13", "v16", "v17", "v20", "v21", "v24", "v25");
            }
#else
            if (nn > 0)
            {
                asm volatile(
                    "pld        [%2, #192]          \n"
                    "vld1.f32   {d16-d18}, [%2]     \n" // r0
                    "add        %2, #16             \n"

                    "vext.32    q10, q8, q9, #1     \n"
                    "vext.32    q11, q8, q9, #2     \n"

                    "0:                             \n"

                    "vmul.f32   q7, q8, %e10[0]     \n"

                    "vand       q14, %q13, %q13     \n" // q14 = _bias0
                    "vmul.f32   q13, q10, %e10[1]   \n"
                    "vmla.f32   q14, q11, %f10[0]   \n"

                    "pld        [%3, #192]          \n"
                    "vld1.f32   {d16-d18}, [%3]     \n" // r1
                    "add        %3, #16             \n"

                    "vmla.f32   q7, q8, %e11[0]     \n"

                    "vext.32    q10, q8, q9, #1     \n"
                    "vext.32    q11, q8, q9, #2     \n"

                    "vmla.f32   q13, q10, %e11[1]   \n"
                    "vmla.f32   q14, q11, %f11[0]   \n"

                    "pld        [%4, #192]          \n"
                    "vld1.f32   {d16-d18}, [%4]     \n" // r2
                    "add        %4, #16             \n"

                    "vmla.f32   q7, q8, %e12[0]     \n"

                    "vext.32    q10, q8, q9, #1     \n"
                    "vext.32    q11, q8, q9, #2     \n"

                    "vmla.f32   q13, q10, %e12[1]   \n"
                    "vmla.f32   q14, q11, %f12[0]   \n"

                    "pld        [%2, #192]          \n"
                    "vld1.f32   {d16-d18}, [%2]     \n" // r0
                    "add        %2, #16             \n"

                    "vadd.f32   q7, q7, q13         \n"
                    "vadd.f32   q7, q7, q14         \n"

                    "vext.32    q10, q8, q9, #1     \n"
                    "vext.32    q11, q8, q9, #2     \n"

                    "vst1.f32   {d14-d15}, [%1]!    \n"

                    "subs       %0, #1              \n"
                    "bne        0b                  \n"

                    "sub        %2, #16             \n"
                    : "=r"(nn),     // %0
                    "=r"(outptr), // %1
                    "=r"(r0),     // %2
                    "=r"(r1),     // %3
                    "=r"(r2)      // %4
                    : "0"(nn),
                    "1"(outptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "w"(_k012x), // %10
                    "w"(_k345x), // %11
                    "w"(_k678x), // %12
                    "w"(_bias0)  // %13
                    : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
#if __ARM_NEON
                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r20 = vld1q_f32(r2);

                float32x4_t _sum = vmulq_f32(_r00, _k012x);
                _sum = vmlaq_f32(_sum, _r10, _k345x);
                _sum = vmlaq_f32(_sum, _r20, _k678x);

                _sum = vsetq_lane_f32(bias0, _sum, 3);
#if __aarch64__
                *outptr = vaddvq_f32(_sum);
#else
                float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                _ss = vpadd_f32(_ss, _ss);

                *outptr = vget_lane_f32(_ss, 0);
#endif // __aarch64__
#else
                float sum = bias0;
                sum += r0[0] * k0[0];
                sum += r0[1] * k0[1];
                sum += r0[2] * k0[2];
                sum += r1[0] * k1[0];
                sum += r1[1] * k1[1];
                sum += r1[2] * k1[2];
                sum += r2[0] * k2[0];
                sum += r2[1] * k2[1];
                sum += r2[2] * k2[2];

                *outptr = sum;
#endif
                r0++;
                r1++;
                r2++;
                outptr++;
            }

            r0 += 2;
            r1 += 2;
            r2 += 2;
        }
    }
}

// Depthwise 3x3 convolution with a horizontal/vertical step of 2 on fp32 data.
//
// Every channel g of bottom_blob is convolved with its own 9 weights
// (kernel + g * 9) plus an optional per-channel bias, and the result is
// written to channel g of top_blob.
//
// bottom_blob  input; its width (w) and channel count are read here
// top_blob     output; its w/h give the number of outputs produced per channel
//              (NOTE(review): presumably already allocated and the input
//              already padded by the caller - confirm against the call site)
// _kernel      weights, read as a flat float array, 9 values per channel
// _bias        per-channel bias, read as a flat float array; when the pointer
//              is null a bias of 0 is used
// opt          only opt.num_threads is used, for the OpenMP loop over channels
static void convdw3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int group = bottom_blob.c;

    // While producing one output row the row pointers move forward by
    // 2 * outw floats (two inputs per output). Adding tailstep afterwards makes
    // the total advance 2 * w, i.e. two input rows down.
    const int tailstep = w - 2 * outw + w;

    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        const float bias0 = bias ? bias[g] : 0.f;

        const float* kernel0 = kernel + g * 9;

        float* outptr = out;

        const float* img0 = bottom_blob.channel(g);

        // Three consecutive input rows covered by the 3x3 window.
        const float* r0 = img0;
        const float* r1 = img0 + w;
        const float* r2 = img0 + w * 2;

#if __ARM_NEON
        // One vector per kernel row. Each load fetches 4 floats although a row
        // has only 3 weights; the 4th lane is cleared below.
        // NOTE(review): the load at kernel0 + 6 touches kernel0[9], one float
        // past this channel's 9 weights - presumably the weight buffer has
        // slack after the last channel; confirm.
        float32x4_t _k012x = vld1q_f32(kernel0);
        float32x4_t _k345x = vld1q_f32(kernel0 + 3);
        float32x4_t _k678x = vld1q_f32(kernel0 + 6);

        _k012x = vsetq_lane_f32(0.f, _k012x, 3);
        _k345x = vsetq_lane_f32(0.f, _k345x, 3);
        _k678x = vsetq_lane_f32(0.f, _k678x, 3);

        float32x4_t _bias0 = vdupq_n_f32(bias0);
#else
        const float* k0 = kernel0;
        const float* k1 = kernel0 + 3;
        const float* k2 = kernel0 + 6;
#endif // __ARM_NEON

        int i = 0;

        for (; i < outh; i++)
        {
#if __ARM_NEON
            // nn: number of 4-output groups handled by the assembly loop,
            // remain: leftover outputs handled one at a time below.
            int nn = outw >> 2;
            int remain = outw & 3;
#else
            int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            if (nn > 0)
            {
                // Operands: %0 nn, %1 outptr, %2 r0, %3 r1, %4 r2,
                //           %10 _k012x, %11 _k345x, %12 _k678x, %13 _bias0.
                //
                // Per iteration, for each input row:
                //   ld2 de-interleaves 8 floats: v2 = elements 0,2,4,6 and
                //   v3 = elements 1,3,5,7 (post-incrementing the row pointer
                //   by 32 bytes); a second ld2 without writeback peeks at the
                //   following 8 floats so that ext can build
                //   v1 = elements 2,4,6,8.
                //   v2, v3 and v1 are multiplied by kernel lanes 0, 1 and 2 and
                //   accumulated into v0, v10 and v11 (v11 starts at the bias).
                // The three accumulators are summed and 4 outputs are stored.
                //
                // The r0 load for the next iteration is issued before the
                // store; after the last iteration that load is unused, so r0 is
                // moved back by 32 bytes once the loop exits.
                asm volatile(
                    "prfm       pldl1keep, [%2, #256]          \n"
                    "ld2        {v2.4s, v3.4s}, [%2], #32      \n"

                    "and        v11.16b, %13.16b, %13.16b      \n" // v11 = _bias0

                    "0:                                        \n"
                    "fmul       v0.4s,  v2.4s, %10.s[0]        \n"
                    "fmul       v10.4s, v3.4s, %10.s[1]        \n"

                    "prfm       pldl1keep, [%2, #256]          \n"
                    "ld2        {v8.4s, v9.4s}, [%2]           \n"
                    "ext        v1.16b, v2.16b, v8.16b, #4     \n"

                    "fmla       v11.4s, v1.4s, %10.s[2]        \n"

                    "prfm       pldl1keep, [%3, #256]          \n"
                    "ld2        {v2.4s, v3.4s}, [%3], #32      \n"

                    "fmla       v0.4s,  v2.4s, %11.s[0]        \n"
                    "fmla       v10.4s, v3.4s, %11.s[1]        \n"

                    "prfm       pldl1keep, [%3, #256]          \n"
                    "ld2        {v8.4s, v9.4s}, [%3]           \n"
                    "ext        v1.16b, v2.16b, v8.16b, #4     \n"

                    "fmla       v11.4s, v1.4s, %11.s[2]        \n"

                    "prfm       pldl1keep, [%4, #256]          \n"
                    "ld2        {v2.4s, v3.4s}, [%4], #32      \n"

                    "fmla       v0.4s,  v2.4s, %12.s[0]        \n"
                    "fmla       v10.4s, v3.4s, %12.s[1]        \n"

                    "prfm       pldl1keep, [%4, #256]          \n"
                    "ld2        {v8.4s, v9.4s}, [%4]           \n"
                    "ext        v1.16b, v2.16b, v8.16b, #4     \n"

                    "fmla       v11.4s, v1.4s, %12.s[2]        \n"

                    "prfm       pldl1keep, [%2, #256]          \n"
                    "ld2        {v2.4s, v3.4s}, [%2], #32      \n"

                    "fadd       v0.4s, v0.4s, v10.4s           \n"
                    "fadd       v0.4s, v0.4s, v11.4s           \n"

                    "and        v11.16b, %13.16b, %13.16b      \n" // v11 = _bias0

                    "subs       %w0, %w0, #1                   \n"
                    "st1        {v0.4s}, [%1], #16             \n"
                    "bne        0b                             \n"
                    "sub        %2, %2, #32                    \n"
                    : "=r"(nn),     // %0
                    "=r"(outptr), // %1
                    "=r"(r0),     // %2
                    "=r"(r1),     // %3
                    "=r"(r2)      // %4
                    : "0"(nn),
                    "1"(outptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "w"(_k012x), // %10
                    "w"(_k345x), // %11
                    "w"(_k678x), // %12
                    "w"(_bias0)  // %13
                    : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
            }
#else
            if (nn > 0)
            {
                // armv7 counterpart of the aarch64 loop above, same operand
                // numbering. %eN / %fN select the low / high D register of the
                // Q operand, so %eN[0], %eN[1], %fN[0] are kernel lanes 0, 1, 2.
                //
                // vld2.f32 {d4-d7} de-interleaves 8 floats into q2 (even
                // elements) and q3 (odd elements) with pointer writeback;
                // vld2.f32 {d16-d17} peeks at the next 4 floats without
                // writeback, and vext.32 #1 combines q2 with the first lane of
                // q8 to form elements 2,4,6,8.
                // q0, q10 and q11 are the three accumulators (q11 starts at the
                // bias); they are summed and 4 outputs are stored per iteration.
                // r0 is loaded one iteration ahead and rewound by 32 bytes
                // after the loop.
                asm volatile(
                    "pld        [%2, #256]          \n"
                    "vld2.f32   {d4-d7}, [%2]!      \n"

                    "vand       q11, %q13, %q13     \n"

                    "0:                             \n"
                    "vmul.f32   q0, q2, %e10[0]     \n"
                    "vmul.f32   q10, q3, %e10[1]    \n"

                    "pld        [%2, #128]          \n"
                    "vld2.f32   {d16-d17}, [%2]     \n"
                    "vext.32    q1, q2, q8, #1      \n"

                    "vmla.f32   q11, q1, %f10[0]    \n"

                    "pld        [%3, #256]          \n"
                    "vld2.f32   {d4-d7}, [%3]!      \n"

                    "vmla.f32   q0, q2, %e11[0]     \n"
                    "vmla.f32   q10, q3, %e11[1]    \n"

                    "pld        [%3, #128]          \n"
                    "vld2.f32   {d16-d17}, [%3]     \n"
                    "vext.32    q1, q2, q8, #1      \n"

                    "vmla.f32   q11, q1, %f11[0]    \n"

                    "pld        [%4, #256]          \n"
                    "vld2.f32   {d4-d7}, [%4]!      \n"

                    "vmla.f32   q0, q2, %e12[0]     \n"
                    "vmla.f32   q10, q3, %e12[1]    \n"

                    "pld        [%4, #128]          \n"
                    "vld2.f32   {d16-d17}, [%4]     \n"
                    "vext.32    q1, q2, q8, #1      \n"

                    "vmla.f32   q11, q1, %f12[0]    \n"

                    "pld        [%2, #256]          \n"
                    "vld2.f32   {d4-d7}, [%2]!      \n"

                    "vadd.f32   q0, q0, q10         \n"
                    "vadd.f32   q0, q0, q11         \n"

                    "vand       q11, %q13, %q13     \n"

                    "subs       %0, #1              \n"
                    "vst1.f32   {d0-d1}, [%1]!      \n"
                    "bne        0b                  \n"
                    "sub        %2, #32             \n"
                    : "=r"(nn),     // %0
                    "=r"(outptr), // %1
                    "=r"(r0),     // %2
                    "=r"(r1),     // %3
                    "=r"(r2)      // %4
                    : "0"(nn),
                    "1"(outptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "w"(_k012x), // %10
                    "w"(_k345x), // %11
                    "w"(_k678x), // %12
                    "w"(_bias0)  // %13
                    : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            // Leftover outputs of this row (all outputs when NEON is disabled),
            // computed one at a time.
            for (; remain > 0; remain--)
            {
#if __ARM_NEON
                // Each load fetches 4 floats per row; the 4th is multiplied by
                // the zeroed kernel lane and that lane is then replaced below.
                // NOTE(review): this reads one float beyond the 3x3 window -
                // presumably safe because of padding in the input buffer;
                // confirm.
                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r20 = vld1q_f32(r2);

                float32x4_t _sum = vmulq_f32(_r00, _k012x);
                _sum = vmlaq_f32(_sum, _r10, _k345x);
                _sum = vmlaq_f32(_sum, _r20, _k678x);

                // Lane 3 carries no kernel contribution, so it is reused to
                // fold the bias into the horizontal sum.
                _sum = vsetq_lane_f32(bias0, _sum, 3);
#if __aarch64__
                *outptr = vaddvq_f32(_sum);
#else
                // Horizontal add of the 4 lanes: low + high halves, then a
                // pairwise add.
                float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                _ss = vpadd_f32(_ss, _ss);

                *outptr = vget_lane_f32(_ss, 0);
#endif // __aarch64__
#else
                float sum = bias0;
                sum += r0[0] * k0[0];
                sum += r0[1] * k0[1];
                sum += r0[2] * k0[2];
                sum += r1[0] * k1[0];
                sum += r1[1] * k1[1];
                sum += r1[2] * k1[2];
                sum += r2[0] * k2[0];
                sum += r2[1] * k2[1];
                sum += r2[2] * k2[2];

                *outptr = sum;
#endif // __ARM_NEON

                // Step of 2 input columns per output.
                r0 += 2;
                r1 += 2;
                r2 += 2;
                outptr++;
            }

            // Move the three row pointers to the start of the window for the
            // next output row (two input rows further down).
            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
        }
    }
}


================================================
FILE: src/layer/arm/convolutiondepthwise_3x3_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 3x3 depthwise convolution, stride 1, computed on __fp16 data with NEON.
//
// Channel g of bottom_blob is filtered with the nine weights stored at
// _kernel[g * 9 .. g * 9 + 8] and written to channel g of top_blob.
// _bias[g] is added when _bias converts to a non-null pointer, otherwise 0 is used.
// Channels are distributed over opt.num_threads OpenMP threads.
//
// Output rows are produced two at a time (sharing the two middle input rows),
// with a single-row pass for a leftover row. Within a row, columns are handled
// 8 at a time, then 4 at a time, then one by one.
//
// NOTE(review): the kernel-row loads fetch 4 lanes for 3 taps and the column
// loads fetch more columns than are consumed, so the code reads slightly past
// the elements it needs - presumably covered by allocation padding; confirm.
static void convdw3x3s1_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    const int w = bottom_blob.w;

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    const int channels = bottom_blob.c;

    const __fp16* weights = _kernel;
    const __fp16* bias_data = _bias;

    // pointer advances applied after a row pair / a single row has been produced
    const int in_step_pair = outw + 2 + w;
    const int in_step_single = outw + 2;
    const int out_step_pair = outw + outw;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < channels; g++)
    {
        Mat dst = top_blob.channel(g);
        const __fp16* src = bottom_blob.channel(g);

        const __fp16* kptr = weights + g * 9;
        const __fp16 bias0 = bias_data ? bias_data[g] : 0.f;

        // one vector per kernel row; lane 3 is not a tap of that row, so it is zeroed
        float16x4_t _kr0 = vld1_f16(kptr);
        float16x4_t _kr1 = vld1_f16(kptr + 3);
        float16x4_t _kr2 = vld1_f16(kptr + 6);
        _kr0 = vset_lane_f16(0.f, _kr0, 3);
        _kr1 = vset_lane_f16(0.f, _kr1, 3);
        _kr2 = vset_lane_f16(0.f, _kr2, 3);

        const float16x8_t _vbias = vdupq_n_f16(bias0);

        // four consecutive input rows feed two consecutive output rows
        const __fp16* row0 = src;
        const __fp16* row1 = src + w;
        const __fp16* row2 = src + w * 2;
        const __fp16* row3 = src + w * 3;

        __fp16* dst0 = dst;
        __fp16* dst1 = dst0 + outw;

        int y = 0;
        for (; y + 2 <= outh; y += 2)
        {
            int x = 0;

            // 8 output columns for both output rows
            for (; x + 8 <= outw; x += 8)
            {
                const float16x8_t _a0 = vld1q_f16(row0 + x);
                const float16x8_t _b0 = vld1q_f16(row1 + x);
                const float16x8_t _c0 = vld1q_f16(row2 + x);
                const float16x8_t _d0 = vld1q_f16(row3 + x);

                const float16x8_t _an = vld1q_f16(row0 + x + 8);
                const float16x8_t _bn = vld1q_f16(row1 + x + 8);
                const float16x8_t _cn = vld1q_f16(row2 + x + 8);
                const float16x8_t _dn = vld1q_f16(row3 + x + 8);

                // windows shifted by one and two columns
                const float16x8_t _a1 = vextq_f16(_a0, _an, 1);
                const float16x8_t _a2 = vextq_f16(_a0, _an, 2);
                const float16x8_t _b1 = vextq_f16(_b0, _bn, 1);
                const float16x8_t _b2 = vextq_f16(_b0, _bn, 2);
                const float16x8_t _c1 = vextq_f16(_c0, _cn, 1);
                const float16x8_t _c2 = vextq_f16(_c0, _cn, 2);
                const float16x8_t _d1 = vextq_f16(_d0, _dn, 1);
                const float16x8_t _d2 = vextq_f16(_d0, _dn, 2);

                // upper output row uses input rows 0..2
                float16x8_t _top = _vbias;
                _top = vfmaq_lane_f16(_top, _a0, _kr0, 0);
                _top = vfmaq_lane_f16(_top, _a1, _kr0, 1);
                _top = vfmaq_lane_f16(_top, _a2, _kr0, 2);
                _top = vfmaq_lane_f16(_top, _b0, _kr1, 0);
                _top = vfmaq_lane_f16(_top, _b1, _kr1, 1);
                _top = vfmaq_lane_f16(_top, _b2, _kr1, 2);
                _top = vfmaq_lane_f16(_top, _c0, _kr2, 0);
                _top = vfmaq_lane_f16(_top, _c1, _kr2, 1);
                _top = vfmaq_lane_f16(_top, _c2, _kr2, 2);

                // lower output row uses input rows 1..3
                float16x8_t _bot = _vbias;
                _bot = vfmaq_lane_f16(_bot, _b0, _kr0, 0);
                _bot = vfmaq_lane_f16(_bot, _b1, _kr0, 1);
                _bot = vfmaq_lane_f16(_bot, _b2, _kr0, 2);
                _bot = vfmaq_lane_f16(_bot, _c0, _kr1, 0);
                _bot = vfmaq_lane_f16(_bot, _c1, _kr1, 1);
                _bot = vfmaq_lane_f16(_bot, _c2, _kr1, 2);
                _bot = vfmaq_lane_f16(_bot, _d0, _kr2, 0);
                _bot = vfmaq_lane_f16(_bot, _d1, _kr2, 1);
                _bot = vfmaq_lane_f16(_bot, _d2, _kr2, 2);

                vst1q_f16(dst0 + x, _top);
                vst1q_f16(dst1 + x, _bot);
            }

            // 4 output columns for both output rows
            for (; x + 4 <= outw; x += 4)
            {
                const float16x4_t _a0 = vld1_f16(row0 + x);
                const float16x4_t _b0 = vld1_f16(row1 + x);
                const float16x4_t _c0 = vld1_f16(row2 + x);
                const float16x4_t _d0 = vld1_f16(row3 + x);

                const float16x4_t _an = vld1_f16(row0 + x + 4);
                const float16x4_t _bn = vld1_f16(row1 + x + 4);
                const float16x4_t _cn = vld1_f16(row2 + x + 4);
                const float16x4_t _dn = vld1_f16(row3 + x + 4);

                const float16x4_t _a1 = vext_f16(_a0, _an, 1);
                const float16x4_t _a2 = vext_f16(_a0, _an, 2);
                const float16x4_t _b1 = vext_f16(_b0, _bn, 1);
                const float16x4_t _b2 = vext_f16(_b0, _bn, 2);
                const float16x4_t _c1 = vext_f16(_c0, _cn, 1);
                const float16x4_t _c2 = vext_f16(_c0, _cn, 2);
                const float16x4_t _d1 = vext_f16(_d0, _dn, 1);
                const float16x4_t _d2 = vext_f16(_d0, _dn, 2);

                float16x4_t _top = vget_low_f16(_vbias);
                _top = vfma_lane_f16(_top, _a0, _kr0, 0);
                _top = vfma_lane_f16(_top, _a1, _kr0, 1);
                _top = vfma_lane_f16(_top, _a2, _kr0, 2);
                _top = vfma_lane_f16(_top, _b0, _kr1, 0);
                _top = vfma_lane_f16(_top, _b1, _kr1, 1);
                _top = vfma_lane_f16(_top, _b2, _kr1, 2);
                _top = vfma_lane_f16(_top, _c0, _kr2, 0);
                _top = vfma_lane_f16(_top, _c1, _kr2, 1);
                _top = vfma_lane_f16(_top, _c2, _kr2, 2);

                float16x4_t _bot = vget_low_f16(_vbias);
                _bot = vfma_lane_f16(_bot, _b0, _kr0, 0);
                _bot = vfma_lane_f16(_bot, _b1, _kr0, 1);
                _bot = vfma_lane_f16(_bot, _b2, _kr0, 2);
                _bot = vfma_lane_f16(_bot, _c0, _kr1, 0);
                _bot = vfma_lane_f16(_bot, _c1, _kr1, 1);
                _bot = vfma_lane_f16(_bot, _c2, _kr1, 2);
                _bot = vfma_lane_f16(_bot, _d0, _kr2, 0);
                _bot = vfma_lane_f16(_bot, _d1, _kr2, 1);
                _bot = vfma_lane_f16(_bot, _d2, _kr2, 2);

                vst1_f16(dst0 + x, _top);
                vst1_f16(dst1 + x, _bot);
            }

            // leftover columns, one output per row per iteration
            for (; x < outw; x++)
            {
                const float16x4_t _a = vld1_f16(row0 + x);
                const float16x4_t _b = vld1_f16(row1 + x);
                const float16x4_t _c = vld1_f16(row2 + x);
                const float16x4_t _d = vld1_f16(row3 + x);

                // lanes 0..2 hold the per-column partial products
                float16x4_t _top = vmul_f16(_a, _kr0);
                _top = vfma_f16(_top, _b, _kr1);
                _top = vfma_f16(_top, _c, _kr2);

                float16x4_t _bot = vmul_f16(_b, _kr0);
                _bot = vfma_f16(_bot, _c, _kr1);
                _bot = vfma_f16(_bot, _d, _kr2);

                // lane 3 is replaced by the bias so the horizontal add includes it
                _top = vset_lane_f16(bias0, _top, 3);
                _bot = vset_lane_f16(bias0, _bot, 3);

                // the four lanes are widened to fp32, summed, and narrowed back
                dst0[x] = (__fp16)vaddvq_f32(vcvt_f32_f16(_top));
                dst1[x] = (__fp16)vaddvq_f32(vcvt_f32_f16(_bot));
            }

            row0 += in_step_pair;
            row1 += in_step_pair;
            row2 += in_step_pair;
            row3 += in_step_pair;

            dst0 += out_step_pair;
            dst1 += out_step_pair;
        }

        // leftover output row (only input rows 0..2 and dst0 are used here)
        for (; y < outh; y++)
        {
            int x = 0;

            for (; x + 8 <= outw; x += 8)
            {
                const float16x8_t _a0 = vld1q_f16(row0 + x);
                const float16x8_t _b0 = vld1q_f16(row1 + x);
                const float16x8_t _c0 = vld1q_f16(row2 + x);

                const float16x8_t _an = vld1q_f16(row0 + x + 8);
                const float16x8_t _bn = vld1q_f16(row1 + x + 8);
                const float16x8_t _cn = vld1q_f16(row2 + x + 8);

                const float16x8_t _a1 = vextq_f16(_a0, _an, 1);
                const float16x8_t _a2 = vextq_f16(_a0, _an, 2);
                const float16x8_t _b1 = vextq_f16(_b0, _bn, 1);
                const float16x8_t _b2 = vextq_f16(_b0, _bn, 2);
                const float16x8_t _c1 = vextq_f16(_c0, _cn, 1);
                const float16x8_t _c2 = vextq_f16(_c0, _cn, 2);

                float16x8_t _acc = _vbias;
                _acc = vfmaq_lane_f16(_acc, _a0, _kr0, 0);
                _acc = vfmaq_lane_f16(_acc, _a1, _kr0, 1);
                _acc = vfmaq_lane_f16(_acc, _a2, _kr0, 2);
                _acc = vfmaq_lane_f16(_acc, _b0, _kr1, 0);
                _acc = vfmaq_lane_f16(_acc, _b1, _kr1, 1);
                _acc = vfmaq_lane_f16(_acc, _b2, _kr1, 2);
                _acc = vfmaq_lane_f16(_acc, _c0, _kr2, 0);
                _acc = vfmaq_lane_f16(_acc, _c1, _kr2, 1);
                _acc = vfmaq_lane_f16(_acc, _c2, _kr2, 2);

                vst1q_f16(dst0 + x, _acc);
            }

            for (; x + 4 <= outw; x += 4)
            {
                const float16x4_t _a0 = vld1_f16(row0 + x);
                const float16x4_t _b0 = vld1_f16(row1 + x);
                const float16x4_t _c0 = vld1_f16(row2 + x);

                const float16x4_t _an = vld1_f16(row0 + x + 4);
                const float16x4_t _bn = vld1_f16(row1 + x + 4);
                const float16x4_t _cn = vld1_f16(row2 + x + 4);

                const float16x4_t _a1 = vext_f16(_a0, _an, 1);
                const float16x4_t _a2 = vext_f16(_a0, _an, 2);
                const float16x4_t _b1 = vext_f16(_b0, _bn, 1);
                const float16x4_t _b2 = vext_f16(_b0, _bn, 2);
                const float16x4_t _c1 = vext_f16(_c0, _cn, 1);
                const float16x4_t _c2 = vext_f16(_c0, _cn, 2);

                float16x4_t _acc = vget_low_f16(_vbias);
                _acc = vfma_lane_f16(_acc, _a0, _kr0, 0);
                _acc = vfma_lane_f16(_acc, _a1, _kr0, 1);
                _acc = vfma_lane_f16(_acc, _a2, _kr0, 2);
                _acc = vfma_lane_f16(_acc, _b0, _kr1, 0);
                _acc = vfma_lane_f16(_acc, _b1, _kr1, 1);
                _acc = vfma_lane_f16(_acc, _b2, _kr1, 2);
                _acc = vfma_lane_f16(_acc, _c0, _kr2, 0);
                _acc = vfma_lane_f16(_acc, _c1, _kr2, 1);
                _acc = vfma_lane_f16(_acc, _c2, _kr2, 2);

                vst1_f16(dst0 + x, _acc);
            }

            for (; x < outw; x++)
            {
                const float16x4_t _a = vld1_f16(row0 + x);
                const float16x4_t _b = vld1_f16(row1 + x);
                const float16x4_t _c = vld1_f16(row2 + x);

                float16x4_t _acc = vmul_f16(_a, _kr0);
                _acc = vfma_f16(_acc, _b, _kr1);
                _acc = vfma_f16(_acc, _c, _kr2);

                // bias rides in lane 3 of the horizontal sum
                _acc = vset_lane_f16(bias0, _acc, 3);

                dst0[x] = (__fp16)vaddvq_f32(vcvt_f32_f16(_acc));
            }

            row0 += in_step_single;
            row1 += in_step_single;
            row2 += in_step_single;

            dst0 += outw;
        }
    }
}

// 3x3 depthwise convolution, stride 2, computed on __fp16 data with NEON.
//
// Channel g of bottom_blob is filtered with the nine weights stored at
// _kernel[g * 9 .. g * 9 + 8] and written to channel g of top_blob.
// _bias[g] is added when _bias converts to a non-null pointer, otherwise 0 is used.
// Channels are distributed over opt.num_threads OpenMP threads.
//
// Each output column consumes two input columns. Columns are handled 8 at a
// time, then 4 at a time (both via de-interleaving loads), then one by one.
//
// NOTE(review): the kernel-row loads fetch 4 lanes for 3 taps and the column
// loads fetch more columns than are consumed, so the code reads slightly past
// the elements it needs - presumably covered by allocation padding; confirm.
static void convdw3x3s2_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    const int w = bottom_blob.w;

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    const int channels = bottom_blob.c;

    // gap between the last consumed input column of a row and the start of the
    // input row used by the next output row
    const int tailstep = w - 2 * outw + w;

    // total input pointer advance per output row: 2 inputs per output column plus the gap
    const int in_step_row = outw * 2 + tailstep;

    const __fp16* weights = _kernel;
    const __fp16* bias_data = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < channels; g++)
    {
        Mat dst = top_blob.channel(g);
        const __fp16* src = bottom_blob.channel(g);

        const __fp16* kptr = weights + g * 9;
        const __fp16 bias0 = bias_data ? bias_data[g] : 0.f;

        // one vector per kernel row; lane 3 is not a tap of that row, so it is zeroed
        float16x4_t _kr0 = vld1_f16(kptr);
        float16x4_t _kr1 = vld1_f16(kptr + 3);
        float16x4_t _kr2 = vld1_f16(kptr + 6);
        _kr0 = vset_lane_f16(0.f, _kr0, 3);
        _kr1 = vset_lane_f16(0.f, _kr1, 3);
        _kr2 = vset_lane_f16(0.f, _kr2, 3);

        const float16x8_t _vbias = vdupq_n_f16(bias0);

        const __fp16* row0 = src;
        const __fp16* row1 = src + w;
        const __fp16* row2 = src + w * 2;

        __fp16* dstrow = dst;

        for (int y = 0; y < outh; y++)
        {
            int x = 0;

            // 8 output columns: val[0] holds even input columns, val[1] the odd ones
            for (; x + 8 <= outw; x += 8)
            {
                const __fp16* p0 = row0 + x * 2;
                const __fp16* p1 = row1 + x * 2;
                const __fp16* p2 = row2 + x * 2;

                const float16x8x2_t _a = vld2q_f16(p0);
                const float16x8x2_t _b = vld2q_f16(p1);
                const float16x8x2_t _c = vld2q_f16(p2);

                const float16x8x2_t _an = vld2q_f16(p0 + 16);
                const float16x8x2_t _bn = vld2q_f16(p1 + 16);
                const float16x8x2_t _cn = vld2q_f16(p2 + 16);

                // third tap: even columns shifted by one element (input columns 2, 4, ...)
                const float16x8_t _a2 = vextq_f16(_a.val[0], _an.val[0], 1);
                const float16x8_t _b2 = vextq_f16(_b.val[0], _bn.val[0], 1);
                const float16x8_t _c2 = vextq_f16(_c.val[0], _cn.val[0], 1);

                float16x8_t _acc = _vbias;
                _acc = vfmaq_lane_f16(_acc, _a.val[0], _kr0, 0);
                _acc = vfmaq_lane_f16(_acc, _a.val[1], _kr0, 1);
                _acc = vfmaq_lane_f16(_acc, _a2, _kr0, 2);
                _acc = vfmaq_lane_f16(_acc, _b.val[0], _kr1, 0);
                _acc = vfmaq_lane_f16(_acc, _b.val[1], _kr1, 1);
                _acc = vfmaq_lane_f16(_acc, _b2, _kr1, 2);
                _acc = vfmaq_lane_f16(_acc, _c.val[0], _kr2, 0);
                _acc = vfmaq_lane_f16(_acc, _c.val[1], _kr2, 1);
                _acc = vfmaq_lane_f16(_acc, _c2, _kr2, 2);

                vst1q_f16(dstrow + x, _acc);
            }

            // 4 output columns, same scheme with 64-bit vectors
            for (; x + 4 <= outw; x += 4)
            {
                const __fp16* p0 = row0 + x * 2;
                const __fp16* p1 = row1 + x * 2;
                const __fp16* p2 = row2 + x * 2;

                const float16x4x2_t _a = vld2_f16(p0);
                const float16x4x2_t _b = vld2_f16(p1);
                const float16x4x2_t _c = vld2_f16(p2);

                const float16x4x2_t _an = vld2_f16(p0 + 8);
                const float16x4x2_t _bn = vld2_f16(p1 + 8);
                const float16x4x2_t _cn = vld2_f16(p2 + 8);

                const float16x4_t _a2 = vext_f16(_a.val[0], _an.val[0], 1);
                const float16x4_t _b2 = vext_f16(_b.val[0], _bn.val[0], 1);
                const float16x4_t _c2 = vext_f16(_c.val[0], _cn.val[0], 1);

                float16x4_t _acc = vget_low_f16(_vbias);
                _acc = vfma_lane_f16(_acc, _a.val[0], _kr0, 0);
                _acc = vfma_lane_f16(_acc, _a.val[1], _kr0, 1);
                _acc = vfma_lane_f16(_acc, _a2, _kr0, 2);
                _acc = vfma_lane_f16(_acc, _b.val[0], _kr1, 0);
                _acc = vfma_lane_f16(_acc, _b.val[1], _kr1, 1);
                _acc = vfma_lane_f16(_acc, _b2, _kr1, 2);
                _acc = vfma_lane_f16(_acc, _c.val[0], _kr2, 0);
                _acc = vfma_lane_f16(_acc, _c.val[1], _kr2, 1);
                _acc = vfma_lane_f16(_acc, _c2, _kr2, 2);

                vst1_f16(dstrow + x, _acc);
            }

            // leftover columns, one output per iteration
            for (; x < outw; x++)
            {
                const float16x4_t _a = vld1_f16(row0 + x * 2);
                const float16x4_t _b = vld1_f16(row1 + x * 2);
                const float16x4_t _c = vld1_f16(row2 + x * 2);

                // lanes 0..2 hold the per-column partial products
                float16x4_t _acc = vmul_f16(_a, _kr0);
                _acc = vfma_f16(_acc, _b, _kr1);
                _acc = vfma_f16(_acc, _c, _kr2);

                // lane 3 is replaced by the bias so the horizontal add includes it
                _acc = vset_lane_f16(bias0, _acc, 3);

                // the four lanes are widened to fp32, summed, and narrowed back
                dstrow[x] = (__fp16)vaddvq_f32(vcvt_f32_f16(_acc));
            }

            row0 += in_step_row;
            row1 += in_step_row;
            row2 += in_step_row;

            dstrow += outw;
        }
    }
}


================================================
FILE: src/layer/arm/convolutiondepthwise_3x3_int8.h
================================================
// Copyright 2019 BUG1989
// SPDX-License-Identifier: BSD-3-Clause

static void convdw3x3s1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const signed char* kernel = (const signed char*)_kernel + p * 9;

        int* outptr0 = out;
        int* outptr0n = outptr0 + outw;

        const signed char* img0 = bottom_blob.channel(p);

        const signed char* r0 = img0;
        const signed char* r1 = img0 + w;
        const signed char* r2 = img0 + w * 2;
        const signed char* r3 = img0 + w * 3;

        int i = 0;

#if __ARM_NEON
        int8x16_t _k0123456789x = vld1q_s8(kernel);
        int16x8_t _k_s16 = vmovl_s8(vget_low_s8(_k0123456789x));
        int16x8_t _kn_s16 = vmovl_s8(vget_high_s8(_k0123456789x));

        int16x4_t _k0123 = vget_low_s16(_k_s16);
        int16x4_t _k4567 = vget_high_s16(_k_s16);
        int16x4_t _k8xxx = vget_low_s16(_kn_s16);
#endif // __ARM_NEON

        for (; i + 1 < outh; i += 2)
        {
#if __ARM_NEON
            int nn = outw >> 3;
            int remain = outw & 7;
#else
            int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            if (nn > 0)
            {
                asm volatile(
                    "0:                                   \n"
                    "ld1    {v4.8b, v5.8b}, [%3]          \n"
                    "ld1    {v6.8b, v7.8b}, [%4]          \n"
                    "ld1    {v8.8b, v9.8b}, [%5]          \n"
                    "ld1    {v10.8b, v11.8b}, [%6]        \n"
                    "add    %3, %3, #8                    \n"
                    "add    %4, %4, #8                    \n"
                    "add    %5, %5, #8                    \n"
                    "add    %6, %6, #8                    \n"

                    "ext    v12.8b, v4.8b, v5.8b, #1      \n"
                    "ext    v13.8b, v4.8b, v5.8b, #2      \n"
                    "ext    v14.8b, v6.8b, v7.8b, #1      \n"
                    "ext    v15.8b, v6.8b, v7.8b, #2      \n"
                    "ext    v16.8b, v8.8b, v9.8b, #1      \n"
                    "ext    v17.8b, v8.8b, v9.8b, #2      \n"
                    "ext    v18.8b, v10.8b, v11.8b, #1    \n"
                    "ext    v19.8b, v10.8b, v11.8b, #2    \n"

                    "sshll  v4.8h, v4.8b, #0              \n" // r00
                    "sshll  v12.8h, v12.8b, #0            \n" // r01
                    "sshll  v13.8h, v13.8b, #0            \n" // r02
                    "sshll  v6.8h, v6.8b, #0              \n" // r10
                    "sshll  v14.8h, v14.8b, #0            \n" // r11
                    "sshll  v15.8h, v15.8b, #0            \n" // r12
                    "sshll  v8.8h, v8.8b, #0              \n" // r20
                    "sshll  v16.8h, v16.8b, #0            \n" // r21
                    "sshll  v17.8h, v17.8b, #0            \n" // r22
                    "sshll  v10.8h, v10.8b, #0            \n" // r30
                    "sshll  v18.8h, v18.8b, #0            \n" // r31
                    "sshll  v19.8h, v19.8b, #0            \n" // r32

                    // r0
                    "smull  v20.4s, v4.4h, %14.h[0]       \n" // (r00 - r07) * k00
                    "smull2  v21.4s, v4.8h, %14.h[0]      \n"
                    "smull  v22.4s, v12.4h, %14.h[1]      \n" // (r01 - r08) * k01
                    "smull2  v23.4s, v12.8h, %14.h[1]     \n"
                    "smull  v24.4s, v13.4h, %14.h[2]      \n" // (r02 - r09) * k02
                    "smull2  v25.4s, v13.8h, %14.h[2]     \n"

                    // r1
                    "smull  v26.4s, v6.4h, %14.h[0]       \n" // (r10 - r17) * k00
                    "smull2  v27.4s, v6.8h, %14.h[0]      \n"
                    "smull  v28.4s, v14.4h, %14.h[1]      \n" // (r11 - r18) * k01
                    "smull2  v29.4s, v14.8h, %14.h[1]     \n"
                    "smull  v30.4s, v15.4h, %14.h[2]      \n" // (r12 - r19) * k02
                    "smull2  v31.4s, v15.8h, %14.h[2]     \n"

                    "smlal  v20.4s, v6.4h, %14.h[3]       \n" // (r10 - r17) * k03
                    "smlal2  v21.4s, v6.8h, %14.h[3]      \n"
                    "smlal  v22.4s, v14.4h, %15.h[0]      \n" // (r11 - r18) * k04
                    "smlal2  v23.4s, v14.8h, %15.h[0]     \n"
                    "smlal  v24.4s, v15.4h, %15.h[1]      \n" // (r12 - r19) * k05
                    "smlal2  v25.4s, v15.8h, %15.h[1]     \n"

                    // r2
                    "smlal  v26.4s, v8.4h, %14.h[3]       \n" // (r20 - r27) * k03
                    "smlal2  v27.4s, v8.8h, %14.h[3]      \n"
                    "smlal  v28.4s, v16.4h, %15.h[0]      \n" // (r21 - r28) * k04
                    "smlal2  v29.4s, v16.8h, %15.h[0]     \n"
                    "smlal  v30.4s, v17.4h, %15.h[1]      \n" // (r22 - r29) * k05
                    "smlal2  v31.4s, v17.8h, %15.h[1]     \n"

                    "smlal  v20.4s, v8.4h, %15.h[2]       \n" // (r20 - r27) * k06
                    "smlal2  v21.4s, v8.8h, %15.h[2]      \n"
                    "smlal  v22.4s, v16.4h, %15.h[3]      \n" // (r21 - r28) * k07
                    "smlal2  v23.4s, v16.8h, %15.h[3]     \n"
                    "smlal  v24.4s, v17.4h, %16.h[0]      \n" // (r22 - r29) * k08
                    "smlal2  v25.4s, v17.8h, %16.h[0]     \n"

                    // r3
                    "smlal  v26.4s, v10.4h, %15.h[2]      \n" // (r30 - r37) * k06
                    "smlal2  v27.4s, v10.8h, %15.h[2]     \n"
                    "smlal  v28.4s, v18.4h, %15.h[3]      \n" // (r31 - r38) * k07
                    "smlal2  v29.4s, v18.8h, %15.h[3]     \n"
                    "smlal  v30.4s, v19.4h, %16.h[0]      \n" // (r32 - r39) * k08
                    "smlal2  v31.4s, v19.8h, %16.h[0]     \n"

                    // add and save
                    "add    v20.4s, v20.4s, v22.4s        \n"
                    "add    v21.4s, v21.4s, v23.4s        \n"
                    "add    v26.4s, v26.4s, v28.4s        \n"
                    "add    v27.4s, v27.4s, v29.4s        \n"
                    "add    v20.4s, v20.4s, v24.4s        \n"
                    "add    v21.4s, v21.4s, v25.4s        \n"
                    "add    v26.4s, v26.4s, v30.4s        \n"
                    "add    v27.4s, v27.4s, v31.4s        \n"

                    "st1    {v20.4s, v21.4s}, [%1], #32   \n"
                    "st1    {v26.4s, v27.4s}, [%2], #32   \n"

                    "subs   %w0, %w0, #1                  \n"
                    "bne    0b                            \n"

                    : "=r"(nn),       // %0
                    "=r"(outptr0),  // %1
                    "=r"(outptr0n), // %2
                    "=r"(r0),       // %3
                    "=r"(r1),       // %4
                    "=r"(r2),       // %5
                    "=r"(r3)        // %6
                    : "0"(nn),
                    "1"(outptr0),
                    "2"(outptr0n),
                    "3"(r0),
                    "4"(r1),
                    "5"(r2),
                    "6"(r3),
                    "w"(_k0123), // %14
                    "w"(_k4567), // %15
                    "w"(_k8xxx)  // %16
                    : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
#else
            if (nn > 0)
            {
                asm volatile(
                    "0:                              \n"
                    // r0
                    "vld1.s8    {d30-d31}, [%3]      \n" // r0
                    "add    %3, %3, #8               \n"

                    "vext.s8    d10, d30, d31, #1    \n"
                    "vext.s8    d12, d30, d31, #2    \n"

                    "vmovl.s8    q15, d30            \n" // r00
                    "vmovl.s8    q5, d10             \n" // r01
                    "vmovl.s8    q6, d12             \n" // r02
                    // sum0
                    "vmull.s16  q7, d30, %P14[0]     \n" // (r00 - r07) * k00
                    "vmull.s16  q8, d31, %P14[0]     \n"
                    "vmull.s16  q9, d10, %P14[1]     \n" // (r01 - r08) * k01
                    "vmull.s16  q10, d11, %P14[1]    \n"
                    "vmlal.s16  q7, d12, %P14[2]     \n" // (r02 - r09) * k02
                    "vmlal.s16  q8, d13, %P14[2]     \n"

                    // r1
                    "vld1.s8    {d30-d31}, [%4]      \n" // r1
                    "add    %4, %4, #8               \n"

                    "vext.s8    d10, d30, d31, #1    \n"
                    "vext.s8    d12, d30, d31, #2    \n"

                    "vmovl.s8    q15, d30            \n" // r10
                    "vmovl.s8    q5, d10             \n" // r11
                    "vmovl.s8    q6, d12             \n" // r12
                    // sum0
                    "vmlal.s16  q7, d30, %P14[3]     \n" // (r10 - r17) * k03
                    "vmlal.s16  q8, d31, %P14[3]     \n"
                    "vmlal.s16  q9, d10, %P15[0]     \n" // (r11 - r18) * k04
                    "vmlal.s16  q10, d11, %P15[0]    \n"
                    "vmlal.s16  q7, d12, %P15[1]     \n" // (r12 - r19) * k05
                    "vmlal.s16  q8, d13, %P15[1]     \n"
                    // sum1
                    "vmull.s16  q11, d30, %P14[0]    \n" // (r10 - r17) * k00
                    "vmull.s16  q12, d31, %P14[0]    \n"
                    "vmull.s16  q13, d10, %P14[1]    \n" // (r11 - r18) * k01
                    "vmull.s16  q14, d11, %P14[1]    \n"
                    "vmlal.s16  q11, d12, %P14[2]    \n" // (r12 - r19) * k02
                    "vmlal.s16  q12, d13, %P14[2]    \n"

                    // r2
                    "vld1.s8    {d30-d31}, [%5]      \n" // r2
                    "add    %5, %5, #8               \n"

                    "vext.s8    d10, d30, d31, #1    \n"
                    "vext.s8    d12, d30, d31, #2    \n"

                    "vmovl.s8    q15, d30            \n" // r20
                    "vmovl.s8    q5, d10             \n" // r21
                    "vmovl.s8    q6, d12             \n" // r22

                    // sum0
                    "vmlal.s16  q7, d30, %P15[2]     \n" // (r20 - r27) * k06
                    "vmlal.s16  q8, d31, %P15[2]     \n"
                    "vmlal.s16  q9, d10, %P15[3]     \n" // (r21 - r28) * k07
                    "vmlal.s16  q10, d11, %P15[3]    \n"
                    "vmlal.s16  q7, d12, %P16[0]     \n" // (r22 - r29) * k08
                    "vmlal.s16  q8, d13, %P16[0]     \n"
                    // sum1
                    "vmlal.s16  q11, d30, %P14[3]    \n" // (r20 - r27) * k03
                    "vmlal.s16  q12, d31, %P14[3]    \n"
                    "vmlal.s16  q13, d10, %P15[0]    \n" // (r21 - r28) * k04
                    "vmlal.s16  q14, d11, %P15[0]    \n"
                    "vmlal.s16  q11, d12, %P15[1]    \n" // (r22 - r29) * k05
                    "vmlal.s16  q12, d13, %P15[1]    \n"

                    // r3
                    "vld1.s8    {d30-d31}, [%6]      \n" // r3
                    "add    %6, %6, #8               \n"

                    "vext.s8    d10, d30, d31, #1    \n"
                    "vext.s8    d12, d30, d31, #2    \n"

                    "vmovl.s8    q15, d30            \n" // r30
                    "vmovl.s8    q5, d10             \n" // r31
                    "vmovl.s8    q6, d12             \n" // r32

                    // sum1
                    "vmlal.s16  q11, d30, %P15[2]    \n" // (r30 - r37) * k06
                    "vmlal.s16  q12, d31, %P15[2]    \n"
                    "vmlal.s16  q13, d10, %P15[3]    \n" // (r31 - r38) * k07
                    "vmlal.s16  q14, d11, %P15[3]    \n"
                    "vmlal.s16  q11, d12, %P16[0]    \n" // (r32 - r39) * k08
                    "vmlal.s16  q12, d13, %P16[0]    \n"

                    "subs   %0, %0, #1               \n"

                    // add and save
                    "vadd.s32    q7, q7, q9          \n"
                    "vadd.s32    q8, q8, q10         \n"
                    "vadd.s32    q11, q11, q13       \n"
                    "vadd.s32    q12, q12, q14       \n"

                    "vst1.s32    {d14-d17}, [%1]!    \n"
                    "vst1.s32    {d22-d25}, [%2]!    \n"

                    "bne    0b                       \n"

                    : "=r"(nn),       // %0
                    "=r"(outptr0),  // %1
                    "=r"(outptr0n), // %2
                    "=r"(r0),       // %3
                    "=r"(r1),       // %4
                    "=r"(r2),       // %5
                    "=r"(r3)        // %6
                    : "0"(nn),
                    "1"(outptr0),
                    "2"(outptr0n),
                    "3"(r0),
                    "4"(r1),
                    "5"(r2),
                    "6"(r3),
                    "w"(_k0123), // %14
                    "w"(_k4567), // %15
                    "w"(_k8xxx)  // %16
                    : "cc", "memory", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
                // TODO NEON
                int sum0 = 0;
                int sum0n = 0;

                sum0 += (int)r0[0] * kernel[0];
                sum0 += (int)r0[1] * kernel[1];
                sum0 += (int)r0[2] * kernel[2];
                sum0 += (int)r1[0] * kernel[3];
                sum0 += (int)r1[1] * kernel[4];
                sum0 += (int)r1[2] * kernel[5];
                sum0 += (int)r2[0] * kernel[6];
                sum0 += (int)r2[1] * kernel[7];
                sum0 += (int)r2[2] * kernel[8];

                sum0n += (int)r1[0] * kernel[0];
                sum0n += (int)r1[1] * kernel[1];
                sum0n += (int)r1[2] * kernel[2];
                sum0n += (int)r2[0] * kernel[3];
                sum0n += (int)r2[1] * kernel[4];
                sum0n += (int)r2[2] * kernel[5];
                sum0n += (int)r3[0] * kernel[6];
                sum0n += (int)r3[1] * kernel[7];
                sum0n += (int)r3[2] * kernel[8];

                *outptr0 = sum0;
                *outptr0n = sum0n;

                r0++;
                r1++;
                r2++;
                r3++;
                outptr0++;
                outptr0n++;
            }

            r0 += 2 + w;
            r1 += 2 + w;
            r2 += 2 + w;
            r3 += 2 + w;

            outptr0 += outw;
            outptr0n += outw;
        }

        for (; i < outh; i++)
        {
#if __ARM_NEON
            int nn = outw >> 3;
            int remain = outw & 7;
#else
            int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            if (nn > 0)
            {
                asm volatile(
                    "0:                                   \n"
                    "ld1    {v4.8b, v5.8b}, [%2]          \n"
                    "ld1    {v6.8b, v7.8b}, [%3]          \n"
                    "ld1    {v8.8b, v9.8b}, [%4]          \n"
                    "add    %2, %2, #8                    \n"
                    "add    %3, %3, #8                    \n"
                    "add    %4, %4, #8                    \n"

                    "ext    v12.8b, v4.8b, v5.8b, #1      \n"
                    "ext    v13.8b, v4.8b, v5.8b, #2      \n"
                    "ext    v14.8b, v6.8b, v7.8b, #1      \n"
                    "ext    v15.8b, v6.8b, v7.8b, #2      \n"
                    "ext    v16.8b, v8.8b, v9.8b, #1      \n"
                    "ext    v17.8b, v8.8b, v9.8b, #2      \n"

                    "sshll  v4.8h, v4.8b, #0              \n" // r00
                    "sshll  v12.8h, v12.8b, #0            \n" // r01
                    "sshll  v13.8h, v13.8b, #0            \n" // r02
                    "sshll  v6.8h, v6.8b, #0              \n" // r10
                    "sshll  v14.8h, v14.8b, #0            \n" // r11
                    "sshll  v15.8h, v15.8b, #0            \n" // r12
                    "sshll  v8.8h, v8.8b, #0              \n" // r20
                    "sshll  v16.8h, v16.8b, #0            \n" // r21
                    "sshll  v17.8h, v17.8b, #0            \n" // r22

                    // r0
                    "smull  v20.4s, v4.4h, %10.h[0]       \n" // (r00 - r07) * k00
                    "smull2  v21.4s, v4.8h, %10.h[0]      \n"
                    "smull  v22.4s, v12.4h, %10.h[1]      \n" // (r01 - r08) * k01
                    "smull2  v23.4s, v12.8h, %10.h[1]     \n"
                    "smull  v24.4s, v13.4h, %10.h[2]      \n" // (r02 - r09) * k02
                    "smull2  v25.4s, v13.8h, %10.h[2]     \n"

                    // r1
                    "smlal  v20.4s, v6.4h, %10.h[3]       \n" // (r10 - r17) * k03
                    "smlal2  v21.4s, v6.8h, %10.h[3]      \n"
                    "smlal  v22.4s, v14.4h, %11.h[0]      \n" // (r11 - r18) * k04
                    "smlal2  v23.4s, v14.8h, %11.h[0]     \n"
                    "smlal  v24.4s, v15.4h, %11.h[1]      \n" // (r12 - r19) * k05
                    "smlal2  v25.4s, v15.8h, %11.h[1]     \n"

                    // r2
                    "smlal  v20.4s, v8.4h, %11.h[2]       \n" // (r20 - r27) * k06
                    "smlal2  v21.4s, v8.8h, %11.h[2]      \n"
                    "smlal  v22.4s, v16.4h, %11.h[3]      \n" // (r21 - r28) * k07
                    "smlal2  v23.4s, v16.8h, %11.h[3]     \n"
                    "smlal  v24.4s, v17.4h, %12.h[0]      \n" // (r22 - r29) * k08
                    "smlal2  v25.4s, v17.8h, %12.h[0]     \n"

                    // add and save
                    "add    v20.4s, v20.4s, v22.4s        \n"
                    "add    v21.4s, v21.4s, v23.4s        \n"
                    "add    v20.4s, v20.4s, v24.4s        \n"
                    "add    v21.4s, v21.4s, v25.4s        \n"

                    "st1    {v20.4s, v21.4s}, [%1], #32   \n"

                    "subs   %w0, %w0, #1                  \n"
                    "bne    0b                            \n"

                    : "=r"(nn),      // %0
                    "=r"(outptr0), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2)       // %4
                    : "0"(nn),
                    "1"(outptr0),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "w"(_k0123), // %10
                    "w"(_k4567), // %11
                    "w"(_k8xxx)  // %12
                    : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
#else
            if (nn > 0)
            {
                asm volatile(
                    "0:                              \n"
                    // r0
                    "vld1.s8    {d30-d31}, [%2]        \n" // r0
                    "add    %2, %2, #8               \n"

                    "vext.s8    d10, d30, d31, #1      \n"
                    "vext.s8    d12, d30, d31, #2      \n"

                    "vmovl.s8    q15, d30              \n" // r00
                    "vmovl.s8    q5, d10             \n"   // r01
                    "vmovl.s8    q6, d12             \n"   // r02
                    // sum0
                    "vmull.s16  q7, d30, %P10[0]      \n" // (r00 - r07) * k00
                    "vmull.s16  q8, d31, %P10[0]      \n"
                    "vmull.s16  q9, d10, %P10[1]     \n" // (r01 - r08) * k01
                    "vmull.s16  q10, d11, %P10[1]    \n"
                    "vmlal.s16  q7, d12, %P10[2]     \n" // (r02 - r09) * k02
                    "vmlal.s16  q8, d13, %P10[2]     \n"

                    // r1
                    "vld1.s8    {d30-d31}, [%3]        \n" // r1
                    "add    %3, %3, #8               \n"

                    "vext.s8    d10, d30, d31, #1      \n"
                    "vext.s8    d12, d30, d31, #2      \n"

                    "vmovl.s8    q15, d30              \n" // r10
                    "vmovl.s8    q5, d10             \n"   // r11
                    "vmovl.s8    q6, d12             \n"   // r12
                    // sum0
                    "vmlal.s16  q7, d30, %P10[3]      \n" // (r10 - r17) * k03
                    "vmlal.s16  q8, d31, %P10[3]      \n"
                    "vmlal.s16  q9, d10, %P11[0]     \n" // (r11 - r18) * k04
                    "vmlal.s16  q10, d11, %P11[0]    \n"
                    "vmlal.s16  q7, d12, %P11[1]     \n" // (r12 - r19) * k05
                    "vmlal.s16  q8, d13, %P11[1]     \n"

                    // r2
                    "vld1.s8    {d30-d31}, [%4]        \n" // r2
                    "add    %4, %4, #8               \n"

                    "vext.s8    d10, d30, d31, #1      \n"
                    "vext.s8    d12, d30, d31, #2      \n"

                    "vmovl.s8    q15, d30              \n" // r20
                    "vmovl.s8    q5, d10             \n"   // r21
                    "vmovl.s8    q6, d12             \n"   // r22

                    // sum0
                    "vmlal.s16  q7, d30, %P11[2]      \n" // (r20 - r27) * k06
                    "vmlal.s16  q8, d31, %P11[2]      \n"
                    "vmlal.s16  q9, d10, %P11[3]     \n" // (r21 - r28) * k07
                    "vmlal.s16  q10, d11, %P11[3]    \n"
                    "vmlal.s16  q7, d12, %P12[0]     \n" // (r22 - r29) * k08
                    "vmlal.s16  q8, d13, %P12[0]     \n"

                    "subs   %0, %0, #1               \n"

                    // add and save
                    "vadd.s32    q7, q7, q9          \n"
                    "vadd.s32    q8, q8, q10         \n"

                    "vst1.s32    {d14-d17}, [%1]!    \n"

                    "bne    0b                       \n"

                    : "=r"(nn),      // %0
                    "=r"(outptr0), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2)       // %4
                    : "0"(nn),
                    "1"(outptr0),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "w"(_k0123), // %10
                    "w"(_k4567), // %11
                    "w"(_k8xxx)  // %12
                    : "cc", "memory", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
                int sum = 0;

                sum += (int)r0[0] * kernel[0];
                sum += (int)r0[1] * kernel[1];
                sum += (int)r0[2] * kernel[2];
                sum += (int)r1[0] * kernel[3];
                sum += (int)r1[1] * kernel[4];
                sum += (int)r1[2] * kernel[5];
                sum += (int)r2[0] * kernel[6];
                sum += (int)r2[1] * kernel[7];
                sum += (int)r2[2] * kernel[8];

                *outptr0 = sum;

                r0++;
                r1++;
                r2++;
                outptr0++;
            }

            r0 += 2;
            r1 += 2;
            r2 += 2;
        }
    }
}

// Depthwise 3x3 convolution with stride 2 on int8 input, producing raw int32 sums.
//
// For every channel p in [0, top_blob.c) the nine int8 weights found at
// (_kernel + p * 9) are applied to channel p of bottom_blob, and each output
// element is the plain sum of the nine int8*int8 products, stored as int.
// No bias, scaling or requantization is applied here.
//
// bottom_blob  int8 input; consecutive rows are addressed w elements apart.
//              NOTE(review): presumably already padded by the caller so that
//              every 3x3 window is in bounds - confirm against the call site.
// top_blob     output; only its w/h/c are read and its channels are written
//              through an int pointer. It is not allocated here.
// _kernel      int8 weights, 9 per channel, read in row-major tap order k0..k8.
// opt          only opt.num_threads is used (OpenMP parallelism over channels).
static void convdw3x3s2_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // After one output row each input row pointer has advanced by 2 * outw.
    // (w - 2 * outw) brings it to the start of the next input row, and the extra
    // + w skips one more row because the vertical stride is also 2.
    const int tailstep = w - 2 * outw + w;

    // Channels are independent, so each one is processed by a separate iteration.
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        // 3x3 weights of this channel (9 consecutive int8 values).
        const signed char* kernel = (const signed char*)_kernel + p * 9;

        int* outptr = out;

        const signed char* img = bottom_blob.channel(p);

        // Three consecutive input rows covered by the 3x3 window.
        const signed char* r0 = img;
        const signed char* r1 = img + w;
        const signed char* r2 = img + w * 2;

        int i = 0;
#if __ARM_NEON
        // Load the weights once per channel and widen them to int16 so they can be
        // used as by-lane multiplicands: _k0123 = k0..k3, _k4567 = k4..k7 and lane 0
        // of _k8xxx = k8 (the remaining lanes are never referenced below).
        // NOTE(review): vld1q_s8 reads 16 bytes although only 9 belong to this
        // channel; this presumably relies on the kernel buffer being readable past
        // the last channel's weights - confirm with the allocator.
        int8x16_t _k0123456789x = vld1q_s8(kernel);
        int16x8_t _k_s16 = vmovl_s8(vget_low_s8(_k0123456789x));
        int16x8_t _kn_s16 = vmovl_s8(vget_high_s8(_k0123456789x));

        int16x4_t _k0123 = vget_low_s16(_k_s16);
        int16x4_t _k4567 = vget_high_s16(_k_s16);
        int16x4_t _k8xxx = vget_low_s16(_kn_s16);
#endif // __ARM_NEON
        for (; i < outh; i++)
        {
#if __ARM_NEON
            // The assembly loop emits 8 outputs per iteration; the scalar loop
            // further down handles the remaining outw % 8 outputs.
            int nn = outw >> 3;
            int remain = outw & 7;
#else
            int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            if (nn > 0)
            {
                // AArch64 path, 8 outputs per iteration.
                //
                // For each row, the first ld2 de-interleaves 16 bytes into the
                // even-indexed bytes (window column 0) and the odd-indexed bytes
                // (window column 1), and post-increments the row pointer by 16.
                // The second ld2 has no write-back: it only peeks at the following
                // bytes so that ext #1 can build the even-indexed bytes shifted by
                // one element (window column 2).
                // NOTE(review): this reads 32 bytes from each row pointer per
                // iteration although only the first 17 contribute to the result;
                // presumably the input buffer is readable that far - confirm.
                //
                // In the per-instruction comments "rXY" means row X, window column Y
                // for the 8 outputs of this iteration.
                //
                // Accumulators: v20/v21 collect window column 0 (k0, k3, k6),
                // v22/v23 column 1 (k1, k4, k7) and v24/v25 column 2 (k2, k5, k8);
                // they are added together just before the 8 int32 results are stored.
                //
                // Operands %0-%4 are in/out (the tied inputs take %5-%9), so the
                // weight vectors are %10-%12.
                asm volatile(
                    "0:                                   \n"
                    "ld2    {v4.8b, v5.8b}, [%2], #16     \n"
                    "ld2    {v6.8b, v7.8b}, [%2]          \n"
                    "ld2    {v8.8b, v9.8b}, [%3], #16     \n"
                    "ld2    {v10.8b, v11.8b}, [%3]        \n"
                    "ld2    {v12.8b, v13.8b}, [%4], #16   \n"
                    "ld2    {v14.8b, v15.8b}, [%4]        \n"

                    "ext    v6.8b, v4.8b, v6.8b, #1       \n"
                    "ext    v10.8b, v8.8b, v10.8b, #1     \n"
                    "ext    v14.8b, v12.8b, v14.8b, #1    \n"

                    "sshll  v4.8h, v4.8b, #0              \n" // r00
                    "sshll  v5.8h, v5.8b, #0              \n" // r01
                    "sshll  v6.8h, v6.8b, #0              \n" // r02
                    "sshll  v8.8h, v8.8b, #0              \n" // r10
                    "sshll  v9.8h, v9.8b, #0              \n" // r11
                    "sshll  v10.8h, v10.8b, #0            \n" // r12
                    "sshll  v12.8h, v12.8b, #0            \n" // r20
                    "sshll  v13.8h, v13.8b, #0            \n" // r21
                    "sshll  v14.8h, v14.8b, #0            \n" // r22

                    // r0
                    "smull  v20.4s, v4.4h, %10.h[0]       \n" // (r00 - r07) * k00
                    "smull2  v21.4s, v4.8h, %10.h[0]      \n"
                    "smull  v22.4s, v5.4h, %10.h[1]       \n" // (r01 - r08) * k01
                    "smull2  v23.4s, v5.8h, %10.h[1]      \n"
                    "smull  v24.4s, v6.4h, %10.h[2]       \n" // (r02 - r09) * k02
                    "smull2  v25.4s, v6.8h, %10.h[2]      \n"

                    // r1
                    "smlal  v20.4s, v8.4h, %10.h[3]       \n" // (r10 - r17) * k03
                    "smlal2  v21.4s, v8.8h, %10.h[3]      \n"
                    "smlal  v22.4s, v9.4h, %11.h[0]       \n" // (r11 - r18) * k04
                    "smlal2  v23.4s, v9.8h, %11.h[0]      \n"
                    "smlal  v24.4s, v10.4h, %11.h[1]      \n" // (r12 - r19) * k05
                    "smlal2  v25.4s, v10.8h, %11.h[1]     \n"

                    // r2
                    "smlal  v20.4s, v12.4h, %11.h[2]      \n" // (r20 - r27) * k06
                    "smlal2  v21.4s, v12.8h, %11.h[2]     \n"
                    "smlal  v22.4s, v13.4h, %11.h[3]      \n" // (r21 - r28) * k07
                    "smlal2  v23.4s, v13.8h, %11.h[3]     \n"
                    "smlal  v24.4s, v14.4h, %12.h[0]      \n" // (r22 - r29) * k08
                    "smlal2  v25.4s, v14.8h, %12.h[0]     \n"

                    // add and save
                    "add    v20.4s, v20.4s, v22.4s        \n"
                    "add    v21.4s, v21.4s, v23.4s        \n"
                    "add    v20.4s, v20.4s, v24.4s        \n"
                    "add    v21.4s, v21.4s, v25.4s        \n"

                    "st1    {v20.4s, v21.4s}, [%1], #32   \n"

                    "subs   %w0, %w0, #1                  \n"
                    "bne    0b                            \n"

                    : "=r"(nn),     // %0
                    "=r"(outptr), // %1
                    "=r"(r0),     // %2
                    "=r"(r1),     // %3
                    "=r"(r2)      // %4
                    : "0"(nn),
                    "1"(outptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "w"(_k0123), // %10
                    "w"(_k4567), // %11
                    "w"(_k8xxx)  // %12
                    : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
#else
            if (nn > 0)
            {
                // ARMv7 path, 8 outputs per iteration, rows processed one at a time.
                //
                // Per row: the first vld2 (with write-back, +16 bytes) puts the
                // even-indexed bytes in d30 (window column 0) and the odd-indexed
                // bytes in d31 (window column 1); the second vld2 (no write-back)
                // peeks at the following bytes so that vext #1 can form the shifted
                // even-indexed bytes in d12 (window column 2).
                // NOTE(review): as a result 32 bytes are read from each row pointer
                // per iteration although only the first 17 are used - confirm the
                // input buffer is readable that far.
                //
                // The vmovl order is significant: q5 aliases d10/d11 and q15 aliases
                // d30/d31, so d31 has to be widened into q5 before q15 is overwritten
                // by the widened d30, and d10 must already have been consumed by vext.
                //
                // Accumulators: q7/q8 collect window columns 0 and 2, q9/q10 collect
                // column 1; both pairs are added before the 8 int32 results are stored.
                //
                // Operands %0-%4 are in/out (the tied inputs take %5-%9), so the
                // weight vectors are %10-%12.
                asm volatile(
                    "0:                              \n"
                    // r0
                    "vld2.s8    {d30-d31}, [%2]!     \n" // r0
                    "vld2.s8    {d10-d11}, [%2]      \n"
                    "vext.s8    d12, d30, d10, #1    \n"

                    "vmovl.s8    q5, d31             \n" // r01
                    "vmovl.s8    q15, d30            \n" // r00
                    "vmovl.s8    q6, d12             \n" // r02
                    // sum0
                    "vmull.s16  q7, d30, %P10[0]     \n" // (r00 - r07) * k00
                    "vmull.s16  q8, d31, %P10[0]     \n"
                    "vmull.s16  q9, d10, %P10[1]     \n" // (r01 - r08) * k01
                    "vmull.s16  q10, d11, %P10[1]    \n"
                    "vmlal.s16  q7, d12, %P10[2]     \n" // (r02 - r09) * k02
                    "vmlal.s16  q8, d13, %P10[2]     \n"

                    // r1
                    "vld2.s8    {d30-d31}, [%3]!     \n" // r1
                    "vld2.s8    {d10-d11}, [%3]      \n"
                    "vext.s8    d12, d30, d10, #1    \n"

                    "vmovl.s8    q5, d31             \n" // r11
                    "vmovl.s8    q15, d30            \n" // r10
                    "vmovl.s8    q6, d12             \n" // r12
                    // sum0
                    "vmlal.s16  q7, d30, %P10[3]     \n" // (r10 - r17) * k03
                    "vmlal.s16  q8, d31, %P10[3]     \n"
                    "vmlal.s16  q9, d10, %P11[0]     \n" // (r11 - r18) * k04
                    "vmlal.s16  q10, d11, %P11[0]    \n"
                    "vmlal.s16  q7, d12, %P11[1]     \n" // (r12 - r19) * k05
                    "vmlal.s16  q8, d13, %P11[1]     \n"

                    // r2
                    "vld2.s8    {d30-d31}, [%4]!     \n" // r2
                    "vld2.s8    {d10-d11}, [%4]      \n"
                    "vext.s8    d12, d30, d10, #1    \n"

                    "vmovl.s8    q5, d31             \n" // r21
                    "vmovl.s8    q15, d30            \n" // r20
                    "vmovl.s8    q6, d12             \n" // r22

                    // sum0
                    "vmlal.s16  q7, d30, %P11[2]     \n" // (r20 - r27) * k06
                    "vmlal.s16  q8, d31, %P11[2]     \n"
                    "vmlal.s16  q9, d10, %P11[3]     \n" // (r21 - r28) * k07
                    "vmlal.s16  q10, d11, %P11[3]    \n"
                    "vmlal.s16  q7, d12, %P12[0]     \n" // (r22 - r29) * k08
                    "vmlal.s16  q8, d13, %P12[0]     \n"

                    "subs   %0, %0, #1               \n"

                    // add and save
                    "vadd.s32    q7, q7, q9          \n"
                    "vadd.s32    q8, q8, q10         \n"

                    "vst1.s32    {d14-d17}, [%1]!    \n"

                    "bne    0b                       \n"

                    : "=r"(nn),     // %0
                    "=r"(outptr), // %1
                    "=r"(r0),     // %2
                    "=r"(r1),     // %3
                    "=r"(r2)      // %4
                    : "0"(nn),
                    "1"(outptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "w"(_k0123), // %10
                    "w"(_k4567), // %11
                    "w"(_k8xxx)  // %12
                    : "cc", "memory", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            // Scalar path: the leftover outputs of the row (every output when NEON
            // is not available). One output per iteration, input advances by 2.
            for (; remain > 0; remain--)
            {
                int sum = 0;

                sum += (int)r0[0] * kernel[0];
                sum += (int)r0[1] * kernel[1];
                sum += (int)r0[2] * kernel[2];
                sum += (int)r1[0] * kernel[3];
                sum += (int)r1[1] * kernel[4];
                sum += (int)r1[2] * kernel[5];
                sum += (int)r2[0] * kernel[6];
                sum += (int)r2[1] * kernel[7];
                sum += (int)r2[2] * kernel[8];

                *outptr = sum;

                // horizontal stride of 2
                r0 += 2;
                r1 += 2;
                r2 += 2;
                outptr++;
            }

            // Move all three row pointers to the window rows of the next output row.
            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
        }
    }
}

static void convdw3x3s1_int8_requant_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, std::vector<float> scales_requant, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;
        const float scale_requant_in = scales_requant[2 * p];
        const float scale_requant_out = scales_requant[2 * p + 1];

        const signed char* kernel = (const signed char*)_kernel + p * 9;

        signed char* outptr0 = out;
        signed char* outptr0n = outptr0 + outw;

        const signed char* img0 = bottom_blob.channel(p);

        const signed char* r0 = img0;
        const signed char* r1 = img0 + w;
        const signed char* r2 = img0 + w * 2;
        const signed char* r3 = img0 + w * 3;

        int i = 0;

#if __ARM_NEON
        int8x16_t _k0123456789x = vld1q_s8(kernel);
        int16x8_t _k_s16 = vmovl_s8(vget_low_s8(_k0123456789x));
        int16x8_t _kn_s16 = vmovl_s8(vget_high_s8(_k0123456789x));

        int16x4_t _k0123 = vget_low_s16(_k_s16);
        int16x4_t _k4567 = vget_high_s16(_k_s16);
        int16x4_t _k8xxx = vget_low_s16(_kn_s16);
#endif // __ARM_NEON

        for (; i + 1 < outh; i += 2)
        {
#if __ARM_NEON
            int nn = outw >> 3;
            int remain = outw & 7;
#else
            int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            if (nn > 0)
            {
                asm volatile(
                    "0:                                   \n"
                    "ld1    {v4.8b, v5.8b}, [%3]          \n"
                    "ld1    {v6.8b, v7.8b}, [%4]          \n"
                    "ld1    {v8.8b, v9.8b}, [%5]          \n"
                    "ld1    {v10.8b, v11.8b}, [%6]        \n"
                    "add    %3, %3, #8                    \n"
                    "add    %4, %4, #8                    \n"
                    "add    %5, %5, #8                    \n"
                    "add    %6, %6, #8                    \n"

                    "ext    v12.8b, v4.8b, v5.8b, #1      \n"
                    "ext    v13.8b, v4.8b, v5.8b, #2      \n"
                    "ext    v14.8b, v6.8b, v7.8b, #1      \n"
                    "ext    v15.8b, v6.8b, v7.8b, #2      \n"
                    "ext    v16.8b, v8.8b, v9.8b, #1      \n"
                    "ext    v17.8b, v8.8b, v9.8b, #2      \n"
                    "ext    v18.8b, v10.8b, v11.8b, #1    \n"
                    "ext    v19.8b, v10.8b, v11.8b, #2    \n"

                    "sshll  v4.8h, v4.8b, #0              \n" // r00
                    "sshll  v12.8h, v12.8b, #0            \n" // r01
                    "sshll  v13.8h, v13.8b, #0            \n" // r02
                    "sshll  v6.8h, v6.8b, #0              \n" // r10
                    "sshll  v14.8h, v14.8b, #0            \n" // r11
                    "sshll  v15.8h, v15.8b, #0            \n" // r12
                    "sshll  v8.8h, v8.8b, #0              \n" // r20
                    "sshll  v16.8h, v16.8b, #0            \n" // r21
                    "sshll  v17.8h, v17.8b, #0            \n" // r22
                    "sshll  v10.8h, v10.8b, #0            \n" // r30
                    "sshll  v18.8h, v18.8b, #0            \n" // r31
                    "sshll  v19.8h, v19.8b, #0            \n" // r32

                    // r0
                    "smull  v20.4s, v4.4h, %14.h[0]       \n" // (r00 - r07) * k00
                    "smull2  v21.4s, v4.8h, %14.h[0]      \n"
                    "smull  v22.4s, v12.4h, %14.h[1]      \n" // (r01 - r08) * k01
                    "smull2  v23.4s, v12.8h, %14.h[1]     \n"
                    "smull  v24.4s, v13.4h, %14.h[2]      \n" // (r02 - r09) * k02
                    "smull2  v25.4s, v13.8h, %14.h[2]     \n"

                    // r1
                    "smull  v26.4s, v6.4h, %14.h[0]       \n" // (r10 - r17) * k00
                    "smull2  v27.4s, v6.8h, %14.h[0]      \n"
                    "smull  v28.4s, v14.4h, %14.h[1]      \n" // (r11 - r18) * k01
                    "smull2  v29.4s, v14.8h, %14.h[1]     \n"
                    "smull  v30.4s, v15.4h, %14.h[2]      \n" // (r12 - r19) * k02
                    "smull2  v31.4s, v15.8h, %14.h[2]     \n"

                    "smlal  v20.4s, v6.4h, %14.h[3]       \n" // (r10 - r17) * k03
                    "smlal2  v21.4s, v6.8h, %14.h[3]      \n"
                    "smlal  v22.4s, v14.4h, %15.h[0]      \n" // (r11 - r18) * k04
                    "smlal2  v23.4s, v14.8h, %15.h[0]     \n"
                    "smlal  v24.4s, v15.4h, %15.h[1]      \n" // (r12 - r19) * k05
                    "smlal2  v25.4s, v15.8h, %15.h[1]     \n"

                    // r2
                    "smlal  v26.4s, v8.4h, %14.h[3]       \n" // (r20 - r27) * k03
                    "smlal2  v27.4s, v8.8h, %14.h[3]      \n"
                    "smlal  v28.4s, v16.4h, %15.h[0]      \n" // (r21 - r28) * k04
                    "smlal2  v29.4s, v16.8h, %15.h[0]     \n"
                    "smlal  v30.4s, v17.4h, %15.h[1]      \n" // (r22 - r29) * k05
                    "smlal2  v31.4s, v17.8h, %15.h[1]     \n"

                    "smlal  v20.4s, v8.4h, %15.h[2]       \n" // (r20 - r27) * k06
                    "smlal2  v21.4s, v8.8h, %15.h[2]      \n"
                    "smlal  v22.4s, v16.4h, %15.h[3]      \n" // (r21 - r28) * k07
                    "smlal2  v23.4s, v16.8h, %15.h[3]     \n"
                    "smlal  v24.4s, v17.4h, %16.h[0]      \n" // (r22 - r29) * k08
                    "smlal2  v25.4s, v17.8h, %16.h[0]     \n"

                    // r3
                    "smlal  v26.4s, v10.4h, %15.h[2]      \n" // (r30 - r37) * k06
                    "smlal2  v27.4s, v10.8h, %15.h[2]     \n"
                    "smlal  v28.4s, v18.4h, %15.h[3]      \n" // (r31 - r38) * k07
                    "smlal2  v29.4s, v18.8h, %15.h[3]     \n"
                    "smlal  v30.4s, v19.4h, %16.h[0]      \n" // (r32 - r39) * k08
                    "smlal2  v31.4s, v19.8h, %16.h[0]     \n"

                    // add and save
                    "add    v20.4s, v20.4s, v22.4s        \n"
                    "add    v21.4s, v21.4s, v23.4s        \n"
                    "add    v26.4s, v26.4s, v28.4s        \n"
                    "add    v27.4s, v27.4s, v29.4s        \n"
                    "add    v20.4s, v20.4s, v24.4s        \n"
                    "add    v21.4s, v21.4s, v25.4s        \n"
                    "add    v26.4s, v26.4s, v30.4s        \n"
                    "add    v27.4s, v27.4s, v31.4s        \n"

                    "dup    v4.4s, %w17                   \n" // bias
                    "dup    v5.4s, %w18                   \n" // scale_in
                    "dup    v6.4s, %w19                   \n" // scale_out

                    // top_s32 -> top_f32
                    "scvtf  v20.4s, v20.4s                 \n"
                    "scvtf  v21.4s, v21.4s                 \n"
                    "scvtf  v26.4s, v26.4s                 \n"
                    "scvtf  v27.4s, v27.4s                 \n"

                    // top_f32 = top_f32 * scale_in
                    "fmul   v20.4s, v20.4s, v5.4s          \n"
                    "fmul   v21.4s, v21.4s, v5.4s          \n"
                    "fmul   v26.4s, v26.4s, v5.4s          \n"
                    "fmul   v27.4s, v27.4s, v5.4s          \n"
                    // top_f32 = top_f32 + bias
                    "fadd   v20.4s, v20.4s, v4.4s          \n"
                    "fadd   v21.4s, v21.4s, v4.4s          \n"
                    "fadd   v26.4s, v26.4s, v4.4s          \n"
                    "fadd   v27.4s, v27.4s, v4.4s          \n"
                    // top_f32 = top_f32 * scale_out
                    "fmul   v20.4s, v20.4s, v6.4s          \n"
                    "fmul   v21.4s, v21.4s, v6.4s          \n"
                    "fmul   v26.4s, v26.4s, v6.4s          \n"
                    "fmul   v27.4s, v27.4s, v6.4s          \n"
                    // top_f32 -> top_s32
                    "fcvtas v20.4s, v20.4s                 \n"
                    "fcvtas v21.4s, v21.4s                 \n"
                    "fcvtas v26.4s, v26.4s                 \n"
                    "fcvtas v27.4s, v27.4s                 \n"
                    // top_s32 -> top_s16
                    "sqxtn  v7.4h, v20.4s                 \n"
                    "sqxtn  v9.4h, v26.4s                 \n"
                    "sqxtn2 v7.8h, v21.4s                 \n"
                    "sqxtn2 v9.8h, v27.4s                 \n"
                    // top_s16 -> top_s8
                    "sqxtn  v8.8b, v7.8h                  \n"
                    "sqxtn  v10.8b, v9.8h                 \n"
                    // save top_s8
                    "st1    {v8.8b}, [%1], #8             \n"
                    "st1    {v10.8b}, [%2], #8            \n"

                    "subs   %w0, %w0, #1                  \n"
                    "bne    0b                            \n"

                    : "=r"(nn),       // %0
                    "=r"(outptr0),  // %1
                    "=r"(outptr0n), // %2
                    "=r"(r0),       // %3
                    "=r"(r1),       // %4
                    "=r"(r2),       // %5
                    "=r"(r3)        // %6
                    : "0"(nn),
                    "1"(outptr0),
                    "2"(outptr0n),
                    "3"(r0),
                    "4"(r1),
                    "5"(r2),
                    "6"(r3),
                    "w"(_k0123),           // %14
                    "w"(_k4567),           // %15
                    "w"(_k8xxx),           // %16
                    "r"(bias0),            // %17
                    "r"(scale_requant_in), // %18
                    "r"(scale_requant_out) // %19
                    : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
#else
            if (nn > 0)
            {
                asm volatile(
                    "0:                              \n"
                    // r0
                    "vld1.s8    {d30-d31}, [%3]      \n" // r0
                    "add    %3, %3, #8               \n"

                    "vext.s8    d10, d30, d31, #1    \n"
                    "vext.s8    d12, d30, d31, #2    \n"

                    "vmovl.s8    q15, d30            \n" // r00
                    "vmovl.s8    q5, d10             \n" // r01
                    "vmovl.s8    q6, d12             \n" // r02
                    // sum0
                    "vmull.s16  q7, d30, %P14[0]     \n" // (r00 - r07) * k00
                    "vmull.s16  q8, d31, %P14[0]     \n"
                    "vmull.s16  q9, d10, %P14[1]     \n" // (r01 - r08) * k01
                    "vmull.s16  q10, d11, %P14[1]    \n"
                    "vmlal.s16  q7, d12, %P14[2]     \n" // (r02 - r09) * k02
                    "vmlal.s16  q8, d13, %P14[2]     \n"

                    // r1
                    "vld1.s8    {d30-d31}, [%4]      \n" // r1
                    "add    %4, %4, #8               \n"

                    "vext.s8    d10, d30, d31, #1    \n"
                    "vext.s8    d12, d30, d31, #2    \n"

                    "vmovl.s8    q15, d30            \n" // r10
                    "vmovl.s8    q5, d10             \n" // r11
                    "vmovl.s8    q6, d12             \n" // r12
                    // sum0
                    "vmlal.s16  q7, d30, %P14[3]     \n" // (r10 - r17) * k03
                    "vmlal.s16  q8, d31, %P14[3]     \n"
                    "vmlal.s16  q9, d10, %P15[0]     \n" // (r11 - r18) * k04
                    "vmlal.s16  q10, d11, %P15[0]    \n"
                    "vmlal.s16  q7, d12, %P15[1]     \n" // (r12 - r19) * k05
                    "vmlal.s16  q8, d13, %P15[1]     \n"
                    // sum1
                    "vmull.s16  q11, d30, %P14[0]    \n" // (r10 - r17) * k00
                    "vmull.s16  q12, d31, %P14[0]    \n"
                    "vmull.s16  q13, d10, %P14[1]    \n" // (r11 - r18) * k01
                    "vmull.s16  q14, d11, %P14[1]    \n"
                    "vmlal.s16  q11, d12, %P14[2]    \n" // (r12 - r19) * k02
                    "vmlal.s16  q12, d13, %P14[2]    \n"

                    // r2
                    "vld1.s8    {d30-d31}, [%5]      \n" // r2
                    "add    %5, %5, #8               \n"

                    "vext.s8    d10, d30, d31, #1    \n"
                    "vext.s8    d12, d30, d31, #2    \n"

                    "vmovl.s8    q15, d30            \n" // r20
                    "vmovl.s8    q5, d10             \n" // r21
                    "vmovl.s8    q6, d12             \n" // r22

                    // sum0
                    "vmlal.s16  q7, d30, %P15[2]     \n" // (r20 - r27) * k06
                    "vmlal.s16  q8, d31, %P15[2]     \n"
                    "vmlal.s16  q9, d10, %P15[3]     \n" // (r21 - r28) * k07
                    "vmlal.s16  q10, d11, %P15[3]    \n"
                    "vmlal.s16  q7, d12, %P16[0]     \n" // (r22 - r29) * k08
                    "vmlal.s16  q8, d13, %P16[0]     \n"
                    // sum1
                    "vmlal.s16  q11, d30, %P14[3]    \n" // (r20 - r27) * k03
                    "vmlal.s16  q12, d31, %P14[3]    \n"
                    "vmlal.s16  q13, d10, %P15[0]    \n" // (r21 - r28) * k04
                    "vmlal.s16  q14, d11, %P15[0]    \n"
                    "vmlal.s16  q11, d12, %P15[1]    \n" // (r22 - r29) * k05
                    "vmlal.s16  q12, d13, %P15[1]    \n"

                    // r3
                    "vld1.s8    {d30-d31}, [%6]      \n" // r3
                    "add    %6, %6, #8               \n"

                    "vext.s8    d10, d30, d31, #1    \n"
                    "vext.s8    d12, d30, d31, #2    \n"

                    "vmovl.s8    q15, d30            \n" // r30
                    "vmovl.s8    q5, d10             \n" // r31
                    "vmovl.s8    q6, d12             \n" // r32

                    // sum1
                    "vmlal.s16  q11, d30, %P15[2]    \n" // (r30 - r37) * k06
                    "vmlal.s16  q12, d31, %P15[2]    \n"
                    "vmlal.s16  q13, d10, %P15[3]    \n" // (r31 - r38) * k07
                    "vmlal.s16  q14, d11, %P15[3]    \n"
                    "vmlal.s16  q11, d12, %P16[0]    \n" // (r32 - r39) * k08
                    "vmlal.s16  q12, d13, %P16[0]    \n"

                    "subs   %0, %0, #1               \n"

                    // add and save
                    "vadd.s32    q7, q7, q9          \n"
                    "vadd.s32    q8, q8, q10         \n"
                    "vadd.s32    q11, q11, q13       \n"
                    "vadd.s32    q12, q12, q14       \n"

                    "vdup.f32   q13, %17             \n" // bias
                    "vdup.f32   q14, %18             \n" // scale_in
                    "vdup.f32   q15, %19             \n" // scale_out

                    // top_s32 -> top_f32
                    "vcvt.f32.s32 q7, q7            \n"
                    "vcvt.f32.s32 q8, q8            \n"
                    // top_f32 = top_f32 * scale_in
                    "vmul.f32   q0, q7, q14         \n"
                    "vmul.f32   q4, q8, q14         \n"
                    // top_f32 = top_f32 + bias
                    "vadd.f32   q0, q0, q13         \n"
                    "vadd.f32   q4, q4, q13         \n"
                    // top_f32 = top_f32 * scale_out
                    "vmul.f32   q0, q0, q15         \n"
                    "vmul.f32   q4, q4, q15         \n"
                    // top_f32 -> top_s32
                    "vcvtr.s32.f32 s0, s0           \n"
                    "vcvtr.s32.f32 s1, s1           \n"
                    "vcvtr.s32.f32 s2, s2           \n"
                    "vcvtr.s32.f32 s3, s3           \n"
                    "vcvtr.s32.f32 s16, s16           \n"
                    "vcvtr.s32.f32 s17, s17           \n"
                    "vcvtr.s32.f32 s18, s18           \n"
                    "vcvtr.s32.f32 s19, s19           \n"
                    // top_s32 -> top_s16
                    "vqmovn.s32 d14, q0             \n"
                    "vqmovn.s32 d15, q4             \n"
                    // top_s16 -> top_s8
                    "vqmovn.s16   d14, q7           \n"
                    // save top_s8
                    "vst1.8     {d14}, [%1]!        \n"

                    // top_s32 -> top_f32
                    "vcvt.f32.s32 q11, q11          \n"
                    "vcvt.f32.s32 q12, q12          \n"
                    // top_f32 = top_f32 * scale_in
                    "vmul.f32   q0, q11, q14        \n"
                    "vmul.f32   q4, q12, q14        \n"
                    // top_f32 = top_f32 + bias
                    "vadd.f32   q0, q0, q13         \n"
                    "vadd.f32   q4, q4, q13         \n"
                    // top_f32 = top_f32 * scale_out
                    "vmul.f32   q0, q0, q15         \n"
                    "vmul.f32   q4, q4, q15         \n"
                    // top_f32 -> top_s32
                    "vcvtr.s32.f32 s0, s0           \n"
                    "vcvtr.s32.f32 s1, s1           \n"
                    "vcvtr.s32.f32 s2, s2           \n"
                    "vcvtr.s32.f32 s3, s3           \n"
                    "vcvtr.s32.f32 s16, s16           \n"
                    "vcvtr.s32.f32 s17, s17           \n"
                    "vcvtr.s32.f32 s18, s18           \n"
                    "vcvtr.s32.f32 s19, s19           \n"
                    // top_s32 -> top_s16
                    "vqmovn.s32 d14, q0             \n"
                    "vqmovn.s32 d15, q4             \n"
                    // top_s16 -> top_s8
                    "vqmovn.s16   d14, q7           \n"
                    // save top_s8
                    "vst1.8     {d14}, [%2]!        \n"

                    "bne    0b                      \n"

                    : "=r"(nn),       // %0
                    "=r"(outptr0),  // %1
                    "=r"(outptr0n), // %2
                    "=r"(r0),       // %3
                    "=r"(r1),       // %4
                    "=r"(r2),       // %5
                    "=r"(r3)        // %6
                    : "0"(nn),
                    "1"(outptr0),
                    "2"(outptr0n),
                    "3"(r0),
                    "4"(r1),
                    "5"(r2),
                    "6"(r3),
                    "w"(_k0123),           // %14
                    "w"(_k4567),           // %15
                    "w"(_k8xxx),           // %16
                    "r"(bias0),            // %17
                    "r"(scale_requant_in), // %18
                    "r"(scale_requant_out) // %19
                    : "cc", "memory", "q0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
                // TODO NEON
                int sum0 = 0;
                int sum0n = 0;

                sum0 += (int)r0[0] * kernel[0];
                sum0 += (int)r0[1] * kernel[1];
                sum0 += (int)r0[2] * kernel[2];
                sum0 += (int)r1[0] * kernel[3];
                sum0 += (int)r1[1] * kernel[4];
                sum0 += (int)r1[2] * kernel[5];
                sum0 += (int)r2[0] * kernel[6];
                sum0 += (int)r2[1] * kernel[7];
                sum0 += (int)r2[2] * kernel[8];

                sum0n += (int)r1[0] * kernel[0];
                sum0n += (int)r1[1] * kernel[1];
                sum0n += (int)r1[2] * kernel[2];
                sum0n += (int)r2[0] * kernel[3];
                sum0n += (int)r2[1] * kernel[4];
                sum0n += (int)r2[2] * kernel[5];
                sum0n += (int)r3[0] * kernel[6];
                sum0n += (int)r3[1] * kernel[7];
                sum0n += (int)r3[2] * kernel[8];

                *outptr0 = float2int8(((float)sum0 * scale_requant_in + bias0) * scale_requant_out);
                *outptr0n = float2int8(((float)sum0n * scale_requant_in + bias0) * scale_requant_out);

                r0++;
                r1++;
                r2++;
                r3++;
                outptr0++;
                outptr0n++;
            }

            r0 += 2 + w;
            r1 += 2 + w;
            r2 += 2 + w;
            r3 += 2 + w;

            outptr0 += outw;
            outptr0n += outw;
        }

        for (; i < outh; i++)
        {
#if __ARM_NEON
            int nn = outw >> 3;
            int remain = outw & 7;
#else
            int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            if (nn > 0)
            {
                asm volatile(
                    "dup    v26.4s, %w13                  \n"
                    "dup    v27.4s, %w14                  \n"
                    "dup    v28.4s, %w15                  \n"

                    "0:                                   \n"
                    "ld1    {v4.8b, v5.8b}, [%2]          \n"
                    "ld1    {v6.8b, v7.8b}, [%3]          \n"
                    "ld1    {v8.8b, v9.8b}, [%4]          \n"
                    "add    %2, %2, #8                    \n"
                    "add    %3, %3, #8                    \n"
                    "add    %4, %4, #8                    \n"

                    "ext    v12.8b, v4.8b, v5.8b, #1      \n"
                    "ext    v13.8b, v4.8b, v5.8b, #2      \n"
                    "ext    v14.8b, v6.8b, v7.8b, #1      \n"
                    "ext    v15.8b, v6.8b, v7.8b, #2      \n"
                    "ext    v16.8b, v8.8b, v9.8b, #1      \n"
                    "ext    v17.8b, v8.8b, v9.8b, #2      \n"

                    "sshll  v4.8h, v4.8b, #0              \n" // r00
                    "sshll  v12.8h, v12.8b, #0            \n" // r01
                    "sshll  v13.8h, v13.8b, #0            \n" // r02
                    "sshll  v6.8h, v6.8b, #0              \n" // r10
                    "sshll  v14.8h, v14.8b, #0            \n" // r11
                    "sshll  v15.8h, v15.8b, #0            \n" // r12
                    "sshll  v8.8h, v8.8b, #0              \n" // r20
                    "sshll  v16.8h, v16.8b, #0            \n" // r21
                    "sshll  v17.8h, v17.8b, #0            \n" // r22

                    // r0
                    "smull  v20.4s, v4.4h, %10.h[0]       \n" // (r00 - r07) * k00
                    "smull2  v21.4s, v4.8h, %10.h[0]      \n"
                    "smull  v22.4s, v12.4h, %10.h[1]      \n" // (r01 - r08) * k01
                    "smull2  v23.4s, v12.8h, %10.h[1]     \n"
                    "smull  v24.4s, v13.4h, %10.h[2]      \n" // (r02 - r09) * k02
                    "smull2  v25.4s, v13.8h, %10.h[2]     \n"

                    // r1
                    "smlal  v20.4s, v6.4h, %10.h[3]       \n" // (r10 - r17) * k03
                    "smlal2  v21.4s, v6.8h, %10.h[3]      \n"
                    "smlal  v22.4s, v14.4h, %11.h[0]      \n" // (r11 - r18) * k04
                    "smlal2  v23.4s, v14.8h, %11.h[0]     \n"
                    "smlal  v24.4s, v15.4h, %11.h[1]      \n" // (r12 - r19) * k05
                    "smlal2  v25.4s, v15.8h, %11.h[1]     \n"

                    // r2
                    "smlal  v20.4s, v8.4h, %11.h[2]       \n" // (r20 - r27) * k06
                    "smlal2  v21.4s, v8.8h, %11.h[2]      \n"
                    "smlal  v22.4s, v16.4h, %11.h[3]      \n" // (r21 - r28) * k07
                    "smlal2  v23.4s, v16.8h, %11.h[3]     \n"
                    "smlal  v24.4s, v17.4h, %12.h[0]      \n" // (r22 - r29) * k08
                    "smlal2  v25.4s, v17.8h, %12.h[0]     \n"

                    // add and save
                    "add    v20.4s, v20.4s, v22.4s        \n"
                    "add    v21.4s, v21.4s, v23.4s        \n"
                    "add    v20.4s, v20.4s, v24.4s        \n"
                    "add    v21.4s, v21.4s, v25.4s        \n"

                    // top_s32 -> top_f32
                    "scvtf  v20.4s, v20.4s                \n"
                    "scvtf  v21.4s, v21.4s                \n"
                    // top_f32 = top_f32 * scale_in
                    "fmul   v20.4s, v20.4s, v27.4s        \n"
                    "fmul   v21.4s, v21.4s, v27.4s        \n"
                    // top_f32 = top_f32 + bias
                    "fadd   v20.4s, v20.4s, v26.4s        \n"
                    "fadd   v21.4s, v21.4s, v26.4s        \n"
                    // top_f32 = top_f32 * scale_out
                    "fmul   v20.4s, v20.4s, v28.4s        \n"
                    "fmul   v21.4s, v21.4s, v28.4s        \n"
                    // top_f32 -> top_s32
                    "fcvtas v20.4s, v20.4s                \n"
                    "fcvtas v21.4s, v21.4s                \n"
                    // top_s32 -> top_s16
                    "sqxtn  v7.4h, v20.4s                 \n"
                    "sqxtn2 v7.8h, v21.4s                 \n"
                    // top_s16 -> top_s8
                    "sqxtn  v8.8b, v7.8h                  \n"
                    // save top_s8
                    "st1    {v8.8b}, [%1], #8             \n"

                    "subs   %w0, %w0, #1                  \n"
                    "bne    0b                            \n"

                    : "=r"(nn),      // %0
                    "=r"(outptr0), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2)       // %4
                    : "0"(nn),
                    "1"(outptr0),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "w"(_k0123),           // %10
                    "w"(_k4567),           // %11
                    "w"(_k8xxx),           // %12
                    "r"(bias0),            // %13
                    "r"(scale_requant_in), // %14
                    "r"(scale_requant_out) // %15
                    : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
#else
            if (nn > 0)
            {
                asm volatile(
                    "0:                              \n"
                    // r0
                    "vld1.s8    {d30-d31}, [%2]      \n" // r0
                    "add    %2, %2, #8               \n"

                    "vext.s8    d10, d30, d31, #1    \n"
                    "vext.s8    d12, d30, d31, #2    \n"

                    "vmovl.s8    q15, d30            \n" // r00
                    "vmovl.s8    q5, d10             \n" // r01
                    "vmovl.s8    q6, d12             \n" // r02
                    // sum0
                    "vmull.s16  q7, d30, %P10[0]     \n" // (r00 - r07) * k00
                    "vmull.s16  q8, d31, %P10[0]     \n"
                    "vmull.s16  q9, d10, %P10[1]     \n" // (r01 - r08) * k01
                    "vmull.s16  q10, d11, %P10[1]    \n"
                    "vmlal.s16  q7, d12, %P10[2]     \n" // (r02 - r09) * k02
                    "vmlal.s16  q8, d13, %P10[2]     \n"

                    // r1
                    "vld1.s8    {d30-d31}, [%3]      \n" // r1
                    "add    %3, %3, #8               \n"

                    "vext.s8    d10, d30, d31, #1    \n"
                    "vext.s8    d12, d30, d31, #2    \n"

                    "vmovl.s8    q15, d30            \n" // r10
                    "vmovl.s8    q5, d10             \n" // r11
                    "vmovl.s8    q6, d12             \n" // r12
                    // sum0
                    "vmlal.s16  q7, d30, %P10[3]     \n" // (r10 - r17) * k03
                    "vmlal.s16  q8, d31, %P10[3]     \n"
                    "vmlal.s16  q9, d10, %P11[0]     \n" // (r11 - r18) * k04
                    "vmlal.s16  q10, d11, %P11[0]    \n"
                    "vmlal.s16  q7, d12, %P11[1]     \n" // (r12 - r19) * k05
                    "vmlal.s16  q8, d13, %P11[1]     \n"

                    // r2
                    "vld1.s8    {d30-d31}, [%4]      \n" // r2
                    "add    %4, %4, #8               \n"

                    "vext.s8    d10, d30, d31, #1    \n"
                    "vext.s8    d12, d30, d31, #2    \n"

                    "vmovl.s8    q15, d30            \n" // r20
                    "vmovl.s8    q5, d10             \n" // r21
                    "vmovl.s8    q6, d12             \n" // r22

                    // sum0
                    "vmlal.s16  q7, d30, %P11[2]     \n" // (r20 - r27) * k06
                    "vmlal.s16  q8, d31, %P11[2]     \n"
                    "vmlal.s16  q9, d10, %P11[3]     \n" // (r21 - r28) * k07
                    "vmlal.s16  q10, d11, %P11[3]    \n"
                    "vmlal.s16  q7, d12, %P12[0]     \n" // (r22 - r29) * k08
                    "vmlal.s16  q8, d13, %P12[0]     \n"

                    "subs   %0, %0, #1               \n"

                    // add and save
                    "vadd.s32    q7, q7, q9          \n"
                    "vadd.s32    q8, q8, q10         \n"

                    "vdup.f32   q13, %13             \n" // bias
                    "vdup.f32   q14, %14             \n" // scale_in
                    "vdup.f32   q15, %15             \n" // scale_out

                    // top_s32 -> top_f32
                    "vcvt.f32.s32 q7, q7            \n"
                    "vcvt.f32.s32 q8, q8            \n"
                    // top_f32 = top_f32 * scale_in
                    "vmul.f32   q0, q7, q14         \n"
                    "vmul.f32   q4, q8, q14         \n"
                    // top_f32 = top_f32 + bias
                    "vadd.f32   q0, q0, q13         \n"
                    "vadd.f32   q4, q4, q13         \n"
                    // top_f32 = top_f32 * scale_out
                    "vmul.f32   q0, q0, q15         \n"
                    "vmul.f32   q4, q4, q15         \n"
                    // top_f32 -> top_s32
                    "vcvtr.s32.f32 s0, s0           \n"
                    "vcvtr.s32.f32 s1, s1           \n"
                    "vcvtr.s32.f32 s2, s2           \n"
                    "vcvtr.s32.f32 s3, s3           \n"
                    "vcvtr.s32.f32 s16, s16           \n"
                    "vcvtr.s32.f32 s17, s17           \n"
                    "vcvtr.s32.f32 s18, s18           \n"
                    "vcvtr.s32.f32 s19, s19           \n"
                    // top_s32 -> top_s16
                    "vqmovn.s32 d14, q0             \n"
                    "vqmovn.s32 d15, q4             \n"
                    // top_s16 -> top_s8
                    "vqmovn.s16   d14, q7           \n"
                    // save top_s8
                    "vst1.8     {d14}, [%1]!        \n"

                    "bne    0b                      \n"

                    : "=r"(nn),      // %0
                    "=r"(outptr0), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2)       // %4
                    : "0"(nn),
                    "1"(outptr0),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "w"(_k0123),           // %10
                    "w"(_k4567),           // %11
                    "w"(_k8xxx),           // %12
                    "r"(bias0),            // %13
                    "r"(scale_requant_in), // %14
                    "r"(scale_requant_out) // %15
                    : "cc", "memory", "q0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
                int sum = 0;

                sum += (int)r0[0] * kernel[0];
                sum += (int)r0[1] * kernel[1];
                sum += (int)r0[2] * kernel[2];
                sum += (int)r1[0] * kernel[3];
                sum += (int)r1[1] * kernel[4];
                sum += (int)r1[2] * kernel[5];
                sum += (int)r2[0] * kernel[6];
                sum += (int)r2[1] * kernel[7];
                sum += (int)r2[2] * kernel[8];

                *outptr0 = float2int8(((float)sum * scale_requant_in + bias0) * scale_requant_out);

                r0++;
                r1++;
                r2++;
                outptr0++;
            }

            r0 += 2;
            r1 += 2;
            r2 += 2;
        }
    }
}

static void convdw3x3s2_int8_requant_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, std::vector<float> scales_requant, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float* bias = _bias;

    const int tailstep = w - 2 * outw + w;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;
        const float scale_requant_in = scales_requant[2 * p];
        const float scale_requant_out = scales_requant[2 * p + 1];

        const signed char* kernel = (const signed char*)_kernel + p * 9;

        signed char* outptr = out;

        const signed char* img = bottom_blob.channel(p);

        const signed char* r0 = img;
        const signed char* r1 = img + w;
        const signed char* r2 = img + w * 2;

        int i = 0;
#if __ARM_NEON
        int8x16_t _k0123456789x = vld1q_s8(kernel);
        int16x8_t _k_s16 = vmovl_s8(vget_low_s8(_k0123456789x));
        int16x8_t _kn_s16 = vmovl_s8(vget_high_s8(_k0123456789x));

        int16x4_t _k0123 = vget_low_s16(_k_s16);
        int16x4_t _k4567 = vget_high_s16(_k_s16);
        int16x4_t _k8xxx = vget_low_s16(_kn_s16);
#endif // __ARM_NEON
        for (; i < outh; i++)
        {
#if __ARM_NEON
            int nn = outw >> 3;
            int remain = outw & 7;
#else
            int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            if (nn > 0)
            {
                asm volatile(
                    "dup    v26.4s, %w13                  \n"
                    "dup    v27.4s, %w14                  \n"
                    "dup    v28.4s, %w15                  \n"
                    "0:                                   \n"
                    "ld2    {v4.8b, v5.8b}, [%2], #16     \n"
                    "ld2    {v6.8b, v7.8b}, [%2]          \n"
                    "ld2    {v8.8b, v9.8b}, [%3], #16     \n"
                    "ld2    {v10.8b, v11.8b}, [%3]        \n"
                    "ld2    {v12.8b, v13.8b}, [%4], #16   \n"
                    "ld2    {v14.8b, v15.8b}, [%4]        \n"

                    "ext    v6.8b, v4.8b, v6.8b, #1       \n"
                    "ext    v10.8b, v8.8b, v10.8b, #1     \n"
                    "ext    v14.8b, v12.8b, v14.8b, #1    \n"

                    "sshll  v4.8h, v4.8b, #0              \n" // r00
                    "sshll  v5.8h, v5.8b, #0              \n" // r01
                    "sshll  v6.8h, v6.8b, #0              \n" // r02
                    "sshll  v8.8h, v8.8b, #0              \n" // r10
                    "sshll  v9.8h, v9.8b, #0              \n" // r11
                    "sshll  v10.8h, v10.8b, #0            \n" // r12
                    "sshll  v12.8h, v12.8b, #0            \n" // r20
                    "sshll  v13.8h, v13.8b, #0            \n" // r21
                    "sshll  v14.8h, v14.8b, #0            \n" // r22

                    // r0
                    "smull  v20.4s, v4.4h, %10.h[0]       \n" // (r00 - r07) * k00
                    "smull2  v21.4s, v4.8h, %10.h[0]      \n"
                    "smull  v22.4s, v5.4h, %10.h[1]       \n" // (r01 - r08) * k01
                    "smull2  v23.4s, v5.8h, %10.h[1]      \n"
                    "smull  v24.4s, v6.4h, %10.h[2]       \n" // (r02 - r09) * k02
                    "smull2  v25.4s, v6.8h, %10.h[2]      \n"

                    // r1
                    "smlal  v20.4s, v8.4h, %10.h[3]       \n" // (r10 - r17) * k03
                    "smlal2  v21.4s, v8.8h, %10.h[3]      \n"
                    "smlal  v22.4s, v9.4h, %11.h[0]       \n" // (r11 - r18) * k04
                    "smlal2  v23.4s, v9.8h, %11.h[0]      \n"
                    "smlal  v24.4s, v10.4h, %11.h[1]      \n" // (r12 - r19) * k05
                    "smlal2  v25.4s, v10.8h, %11.h[1]     \n"

                    // r2
                    "smlal  v20.4s, v12.4h, %11.h[2]      \n" // (r20 - r27) * k06
                    "smlal2  v21.4s, v12.8h, %11.h[2]     \n"
                    "smlal  v22.4s, v13.4h, %11.h[3]      \n" // (r21 - r28) * k07
                    "smlal2  v23.4s, v13.8h, %11.h[3]     \n"
                    "smlal  v24.4s, v14.4h, %12.h[0]      \n" // (r22 - r29) * k08
                    "smlal2  v25.4s, v14.8h, %12.h[0]     \n"

                    // add and save
                    "add    v20.4s, v20.4s, v22.4s        \n"
                    "add    v21.4s, v21.4s, v23.4s        \n"
                    "add    v20.4s, v20.4s, v24.4s        \n"
                    "add    v21.4s, v21.4s, v25.4s        \n"

                    // top_s32 -> top_f32
                    "scvtf  v20.4s, v20.4s                \n"
                    "scvtf  v21.4s, v21.4s                \n"
                    // top_f32 = top_f32 * scale_in
                    "fmul   v20.4s, v20.4s, v27.4s        \n"
                    "fmul   v21.4s, v21.4s, v27.4s        \n"
                    // top_f32 = top_f32 + bias
                    "fadd   v20.4s, v20.4s, v26.4s        \n"
                    "fadd   v21.4s, v21.4s, v26.4s        \n"
                    // top_f32 = top_f32 * scale_out
                    "fmul   v20.4s, v20.4s, v28.4s        \n"
                    "fmul   v21.4s, v21.4s, v28.4s        \n"
                    // top_f32 -> top_s32
                    "fcvtas v20.4s, v20.4s                \n"
                    "fcvtas v21.4s, v21.4s                \n"
                    // top_s32 -> top_s16
                    "sqxtn  v7.4h, v20.4s                 \n"
                    "sqxtn2 v7.8h, v21.4s                 \n"
                    // top_s16 -> top_s8
                    "sqxtn  v8.8b, v7.8h                  \n"
                    // save top_s8
                    "st1    {v8.8b}, [%1], #8             \n"

                    "subs   %w0, %w0, #1                  \n"
                    "bne    0b                            \n"

                    : "=r"(nn),     // %0
                    "=r"(outptr), // %1
                    "=r"(r0),     // %2
                    "=r"(r1),     // %3
                    "=r"(r2)      // %4
                    : "0"(nn),
                    "1"(outptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "w"(_k0123),           // %10
                    "w"(_k4567),           // %11
                    "w"(_k8xxx),           // %12
                    "r"(bias0),            // %13
                    "r"(scale_requant_in), // %14
                    "r"(scale_requant_out) // %15
                    : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
#else
            if (nn > 0)
            {
                asm volatile(
                    "0:                              \n"
                    // r0
                    "vld2.s8    {d30-d31}, [%2]!     \n" // r0
                    "vld2.s8    {d10-d11}, [%2]      \n"
                    "vext.s8    d12, d30, d10, #1    \n"

                    "vmovl.s8    q5, d31             \n" // r01
                    "vmovl.s8    q15, d30            \n" // r00
                    "vmovl.s8    q6, d12             \n" // r02
                    // sum0
                    "vmull.s16  q7, d30, %P10[0]     \n" // (r00 - r07) * k00
                    "vmull.s16  q8, d31, %P10[0]     \n"
                    "vmull.s16  q9, d10, %P10[1]     \n" // (r01 - r08) * k01
                    "vmull.s16  q10, d11, %P10[1]    \n"
                    "vmlal.s16  q7, d12, %P10[2]     \n" // (r02 - r09) * k02
                    "vmlal.s16  q8, d13, %P10[2]     \n"

                    // r1
                    "vld2.s8    {d30-d31}, [%3]!     \n" // r1
                    "vld2.s8    {d10-d11}, [%3]      \n"
                    "vext.s8    d12, d30, d10, #1    \n"

                    "vmovl.s8    q5, d31             \n" // r11
                    "vmovl.s8    q15, d30            \n" // r10
                    "vmovl.s8    q6, d12             \n" // r12
                    // sum0
                    "vmlal.s16  q7, d30, %P10[3]     \n" // (r10 - r17) * k03
                    "vmlal.s16  q8, d31, %P10[3]     \n"
                    "vmlal.s16  q9, d10, %P11[0]     \n" // (r11 - r18) * k04
                    "vmlal.s16  q10, d11, %P11[0]    \n"
                    "vmlal.s16  q7, d12, %P11[1]     \n" // (r12 - r19) * k05
                    "vmlal.s16  q8, d13, %P11[1]     \n"

                    // r2
                    "vld2.s8    {d30-d31}, [%4]!     \n" // r2
                    "vld2.s8    {d10-d11}, [%4]      \n"
                    "vext.s8    d12, d30, d10, #1    \n"

                    "vmovl.s8    q5, d31             \n" // r21
                    "vmovl.s8    q15, d30            \n" // r20
                    "vmovl.s8    q6, d12             \n" // r22

                    // sum0
                    "vmlal.s16  q7, d30, %P11[2]     \n" // (r20 - r27) * k06
                    "vmlal.s16  q8, d31, %P11[2]     \n"
                    "vmlal.s16  q9, d10, %P11[3]     \n" // (r21 - r28) * k07
                    "vmlal.s16  q10, d11, %P11[3]    \n"
                    "vmlal.s16  q7, d12, %P12[0]     \n" // (r22 - r29) * k08
                    "vmlal.s16  q8, d13, %P12[0]     \n"

                    "subs   %0, %0, #1               \n"

                    // add and save
                    "vadd.s32    q7, q7, q9          \n"
                    "vadd.s32    q8, q8, q10         \n"

                    "vdup.f32   q11, %13             \n" // bias
                    "vdup.f32   q12, %14             \n" // scale_in
                    "vdup.f32   q13, %15             \n" // scale_out

                    // top_s32 -> top_f32
                    "vcvt.f32.s32 q7, q7             \n"
                    "vcvt.f32.s32 q8, q8             \n"
                    // top_f32 = top_f32 * scale_in
                    "vmul.f32   q0, q7, q12          \n"
                    "vmul.f32   q4, q8, q12          \n"
                    // top_f32 = top_f32 + bias
                    "vadd.f32   q0, q0, q11          \n"
                    "vadd.f32   q4, q4, q11          \n"
                    // top_f32 = top_f32 * scale_out
                    "vmul.f32   q0, q0, q13          \n"
                    "vmul.f32   q4, q4, q13          \n"
                    // top_f32 -> top_s32
                    "vcvtr.s32.f32 s0, s0            \n"
                    "vcvtr.s32.f32 s1, s1            \n"
                    "vcvtr.s32.f32 s2, s2            \n"
                    "vcvtr.s32.f32 s3, s3            \n"
                    "vcvtr.s32.f32 s16, s16            \n"
                    "vcvtr.s32.f32 s17, s17            \n"
                    "vcvtr.s32.f32 s18, s18            \n"
                    "vcvtr.s32.f32 s19, s19            \n"
                    // top_s32 -> top_s16
                    "vqmovn.s32 d14, q0              \n"
                    "vqmovn.s32 d15, q4              \n"
                    // top_s16 -> top_s8
                    "vqmovn.s16   d14, q7            \n"
                    // save top_s8
                    "vst1.8     {d14}, [%1]!         \n"

                    "bne    0b                       \n"

                    : "=r"(nn),     // %0
                    "=r"(outptr), // %1
                    "=r"(r0),     // %2
                    "=r"(r1),     // %3
                    "=r"(r2)      // %4
                    : "0"(nn),
                    "1"(outptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "w"(_k0123),           // %10
                    "w"(_k4567),           // %11
                    "w"(_k8xxx),           // %12
                    "r"(bias0),            // %13
                    "r"(scale_requant_in), // %14
                    "r"(scale_requant_out) // %15
                    : "cc", "memory", "q0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
                int sum = 0;

                sum += (int)r0[0] * kernel[0];
                sum += (int)r0[1] * kernel[1];
                sum += (int)r0[2] * kernel[2];
                sum += (int)r1[0] * kernel[3];
                sum += (int)r1[1] * kernel[4];
                sum += (int)r1[2] * kernel[5];
                sum += (int)r2[0] * kernel[6];
                sum += (int)r2[1] * kernel[7];
                sum += (int)r2[2] * kernel[8];

                *outptr = float2int8(((float)sum * scale_requant_in + bias0) * scale_requant_out);

                r0 += 2;
                r1 += 2;
                r2 += 2;
                outptr++;
            }

            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
        }
    }
}


================================================
FILE: src/layer/arm/convolutiondepthwise_3x3_pack4.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convdw3x3s1_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
#if __aarch64__
    const int w = bottom_blob.w;
#endif

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    const int group = bottom_blob.c;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + g * 4) : vdupq_n_f32(0.f);

        const float* k0 = kernel.row(g);

        float* outptr0 = out.row(0);

        const Mat img0 = bottom_blob.channel(g);

        const float* r0 = img0.row(0);
        const float* r1 = img0.row(1);
        const float* r2 = img0.row(2);

        float32x4_t _k00 = vld1q_f32(k0);
        float32x4_t _k01 = vld1q_f32(k0 + 4);
        float32x4_t _k02 = vld1q_f32(k0 + 8);
        float32x4_t _k10 = vld1q_f32(k0 + 12);
        float32x4_t _k11 = vld1q_f32(k0 + 16);
        float32x4_t _k12 = vld1q_f32(k0 + 20);
        float32x4_t _k20 = vld1q_f32(k0 + 24);
        float32x4_t _k21 = vld1q_f32(k0 + 28);
        float32x4_t _k22 = vld1q_f32(k0 + 32);

        int i = 0;

#if __aarch64__
        float* outptr1 = out.row(1);
        const float* r3 = img0.row(3);

        for (; i + 1 < outh; i += 2)
        {
            int j = 0;

            for (; j + 3 < outw; j += 4)
            {
                asm volatile(
                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v10.4s, v11.4s}, [%3], #32 \n" // r10 r11

                    "mov    v16.16b, %21.16b            \n" // sum00
                    "mov    v17.16b, %21.16b            \n" // sum01
                    "mov    v18.16b, %21.16b            \n" // sum02
                    "mov    v19.16b, %21.16b            \n" // sum03

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%3] \n" // r12 r13 r14 r15

                    "mov    v20.16b, %21.16b            \n" // sum10
                    "mov    v21.16b, %21.16b            \n" // sum11
                    "mov    v22.16b, %21.16b            \n" // sum12
                    "mov    v23.16b, %21.16b            \n" // sum13

                    "fmla   v16.4s, %15.4s, v10.4s      \n"
                    "fmla   v17.4s, %15.4s, v11.4s      \n"
                    "fmla   v18.4s, %15.4s, v12.4s      \n"
                    "fmla   v19.4s, %15.4s, v13.4s      \n"
                    "fmla   v20.4s, %12.4s, v10.4s      \n"
                    "fmla   v21.4s, %12.4s, v11.4s      \n"
                    "fmla   v22.4s, %12.4s, v12.4s      \n"
                    "fmla   v23.4s, %12.4s, v13.4s      \n"

                    "add    %3, %3, #32                 \n"

                    "fmla   v16.4s, %16.4s, v11.4s      \n"
                    "fmla   v17.4s, %16.4s, v12.4s      \n"
                    "fmla   v18.4s, %16.4s, v13.4s      \n"
                    "fmla   v19.4s, %16.4s, v14.4s      \n"
                    "fmla   v20.4s, %13.4s, v11.4s      \n"
                    "fmla   v21.4s, %13.4s, v12.4s      \n"
                    "fmla   v22.4s, %13.4s, v13.4s      \n"
                    "fmla   v23.4s, %13.4s, v14.4s      \n"

                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v10.4s, v11.4s}, [%4], #32 \n" // r20 r21

                    "fmla   v16.4s, %17.4s, v12.4s      \n"
                    "fmla   v17.4s, %17.4s, v13.4s      \n"
                    "fmla   v18.4s, %17.4s, v14.4s      \n"
                    "fmla   v19.4s, %17.4s, v15.4s      \n"
                    "fmla   v20.4s, %14.4s, v12.4s      \n"
                    "fmla   v21.4s, %14.4s, v13.4s      \n"
                    "fmla   v22.4s, %14.4s, v14.4s      \n"
                    "fmla   v23.4s, %14.4s, v15.4s      \n"

                    "prfm   pldl1keep, [%4, #512]       \n"
                    "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%4] \n" // r22 r23 r24 r25

                    "fmla   v16.4s, %18.4s, v10.4s      \n"
                    "fmla   v17.4s, %18.4s, v11.4s      \n"
                    "fmla   v18.4s, %18.4s, v12.4s      \n"
                    "fmla   v19.4s, %18.4s, v13.4s      \n"
                    "fmla   v20.4s, %15.4s, v10.4s      \n"
                    "fmla   v21.4s, %15.4s, v11.4s      \n"
                    "fmla   v22.4s, %15.4s, v12.4s      \n"
                    "fmla   v23.4s, %15.4s, v13.4s      \n"

                    "add    %4, %4, #32                 \n"

                    "fmla   v16.4s, %19.4s, v11.4s      \n"
                    "fmla   v17.4s, %19.4s, v12.4s      \n"
                    "fmla   v18.4s, %19.4s, v13.4s      \n"
                    "fmla   v19.4s, %19.4s, v14.4s      \n"
                    "fmla   v20.4s, %16.4s, v11.4s      \n"
                    "fmla   v21.4s, %16.4s, v12.4s      \n"
                    "fmla   v22.4s, %16.4s, v13.4s      \n"
                    "fmla   v23.4s, %16.4s, v14.4s      \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v10.4s, v11.4s}, [%2], #32 \n" // r00 r01

                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v24.4s, v25.4s}, [%5], #32 \n" // r30 r31

                    "fmla   v16.4s, %20.4s, v12.4s      \n"
                    "fmla   v17.4s, %20.4s, v13.4s      \n"
                    "fmla   v18.4s, %20.4s, v14.4s      \n"
                    "fmla   v19.4s, %20.4s, v15.4s      \n"
                    "fmla   v20.4s, %17.4s, v12.4s      \n"
                    "fmla   v21.4s, %17.4s, v13.4s      \n"
                    "fmla   v22.4s, %17.4s, v14.4s      \n"
                    "fmla   v23.4s, %17.4s, v15.4s      \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%2] \n" // r02 r03 r04 r05

                    "prfm   pldl1keep, [%5, #512]       \n"
                    "ld1    {v26.4s, v27.4s, v28.4s, v29.4s}, [%5] \n" // r32 r33 r34 r35

                    "fmla   v16.4s, %12.4s, v10.4s      \n"
                    "fmla   v17.4s, %12.4s, v11.4s      \n"
                    "fmla   v18.4s, %12.4s, v12.4s      \n"
                    "fmla   v19.4s, %12.4s, v13.4s      \n"
                    "fmla   v20.4s, %18.4s, v24.4s      \n"
                    "fmla   v21.4s, %18.4s, v25.4s      \n"
                    "fmla   v22.4s, %18.4s, v26.4s      \n"
                    "fmla   v23.4s, %18.4s, v27.4s      \n"

                    "add    %2, %2, #32                 \n"

                    "fmla   v16.4s, %13.4s, v11.4s      \n"
                    "fmla   v17.4s, %13.4s, v12.4s      \n"
                    "fmla   v18.4s, %13.4s, v13.4s      \n"
                    "fmla   v19.4s, %13.4s, v14.4s      \n"
                    "fmla   v20.4s, %19.4s, v25.4s      \n"
                    "fmla   v21.4s, %19.4s, v26.4s      \n"
                    "fmla   v22.4s, %19.4s, v27.4s      \n"
                    "fmla   v23.4s, %19.4s, v28.4s      \n"

                    "add    %5, %5, #32                 \n"

                    "fmla   v16.4s, %14.4s, v12.4s      \n"
                    "fmla   v17.4s, %14.4s, v13.4s      \n"
                    "fmla   v18.4s, %14.4s, v14.4s      \n"
                    "fmla   v19.4s, %14.4s, v15.4s      \n"
                    "fmla   v20.4s, %20.4s, v26.4s      \n"
                    "fmla   v21.4s, %20.4s, v27.4s      \n"
                    "fmla   v22.4s, %20.4s, v28.4s      \n"
                    "fmla   v23.4s, %20.4s, v29.4s      \n"

                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"

                    : "=r"(outptr0), // %0
                    "=r"(outptr1), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3)       // %5
                    : "0"(outptr0),
                    "1"(outptr1),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "w"(_k00),  // %12
                    "w"(_k01),  // %13
                    "w"(_k02),  // %14
                    "w"(_k10),  // %15
                    "w"(_k11),  // %16
                    "w"(_k12),  // %17
                    "w"(_k20),  // %18
                    "w"(_k21),  // %19
                    "w"(_k22),  // %20
                    "w"(_bias0) // %21
                    : "memory", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29");
            }
            for (; j + 1 < outw; j += 2)
            {
                asm volatile(
                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%3] \n" // r10 r11 r12 r13

                    "mov    v16.16b, %21.16b            \n" // sum00
                    "mov    v17.16b, %21.16b            \n" // sum01
                    "mov    v18.16b, %21.16b            \n" // sum10
                    "mov    v19.16b, %21.16b            \n" // sum11

                    "fmla   v16.4s, %15.4s, v10.4s      \n"
                    "fmla   v17.4s, %15.4s, v11.4s      \n"
                    "fmla   v18.4s, %12.4s, v10.4s      \n"
                    "fmla   v19.4s, %12.4s, v11.4s      \n"

                    "add    %3, %3, #32                 \n"

                    "fmla   v16.4s, %16.4s, v11.4s      \n"
                    "fmla   v17.4s, %16.4s, v12.4s      \n"
                    "fmla   v18.4s, %13.4s, v11.4s      \n"
                    "fmla   v19.4s, %13.4s, v12.4s      \n"

                    "prfm   pldl1keep, [%4, #512]       \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%4] \n" // r20 r21 r22 r23

                    "fmla   v16.4s, %17.4s, v12.4s      \n"
                    "fmla   v17.4s, %17.4s, v13.4s      \n"
                    "fmla   v18.4s, %14.4s, v12.4s      \n"
                    "fmla   v19.4s, %14.4s, v13.4s      \n"

                    "add    %4, %4, #32                 \n"

                    "fmla   v16.4s, %18.4s, v20.4s      \n"
                    "fmla   v17.4s, %18.4s, v21.4s      \n"
                    "fmla   v18.4s, %15.4s, v20.4s      \n"
                    "fmla   v19.4s, %15.4s, v21.4s      \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%2] \n" // r00 r01 r02 r03

                    "fmla   v16.4s, %19.4s, v21.4s      \n"
                    "fmla   v17.4s, %19.4s, v22.4s      \n"
                    "fmla   v18.4s, %16.4s, v21.4s      \n"
                    "fmla   v19.4s, %16.4s, v22.4s      \n"

                    "prfm   pldl1keep, [%5, #512]       \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%5] \n" // r30 r31 r32 r33

                    "fmla   v16.4s, %20.4s, v22.4s      \n"
                    "fmla   v17.4s, %20.4s, v23.4s      \n"
                    "fmla   v18.4s, %17.4s, v22.4s      \n"
                    "fmla   v19.4s, %17.4s, v23.4s      \n"

                    "add    %2, %2, #32                 \n"

                    "fmla   v16.4s, %12.4s, v10.4s      \n"
                    "fmla   v17.4s, %12.4s, v11.4s      \n"
                    "fmla   v18.4s, %18.4s, v24.4s      \n"
                    "fmla   v19.4s, %18.4s, v25.4s      \n"

                    "add    %5, %5, #32                 \n"

                    "fmla   v16.4s, %13.4s, v11.4s      \n"
                    "fmla   v17.4s, %13.4s, v12.4s      \n"
                    "fmla   v18.4s, %19.4s, v25.4s      \n"
                    "fmla   v19.4s, %19.4s, v26.4s      \n"

                    "fmla   v16.4s, %14.4s, v12.4s      \n"
                    "fmla   v17.4s, %14.4s, v13.4s      \n"
                    "fmla   v18.4s, %20.4s, v26.4s      \n"
                    "fmla   v19.4s, %20.4s, v27.4s      \n"

                    "st1    {v16.4s, v17.4s}, [%0], #32 \n"
                    "st1    {v18.4s, v19.4s}, [%1], #32 \n"

                    : "=r"(outptr0), // %0
                    "=r"(outptr1), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3)       // %5
                    : "0"(outptr0),
                    "1"(outptr1),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "w"(_k00),  // %12
                    "w"(_k01),  // %13
                    "w"(_k02),  // %14
                    "w"(_k10),  // %15
                    "w"(_k11),  // %16
                    "w"(_k12),  // %17
                    "w"(_k20),  // %18
                    "w"(_k21),  // %19
                    "w"(_k22),  // %20
                    "w"(_bias0) // %21
                    : "memory", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
            }
            for (; j < outw; j++)
            {
                asm volatile(
                    "prfm   pldl1keep, [%3, #384]       \n"
                    "ld1    {v10.4s, v11.4s, v12.4s}, [%3] \n" // r10 r11 r12

                    "mov    v16.16b, %21.16b            \n" // sum0
                    "mov    v17.16b, %21.16b            \n" // sum1

                    "fmla   v16.4s, %15.4s, v10.4s      \n"
                    "fmla   v17.4s, %12.4s, v10.4s      \n"

                    "add    %3, %3, #16                 \n"

                    "fmla   v16.4s, %16.4s, v11.4s      \n"
                    "fmla   v17.4s, %13.4s, v11.4s      \n"

                    "prfm   pldl1keep, [%4, #384]       \n"
                    "ld1    {v20.4s, v21.4s, v22.4s}, [%4] \n" // r20 r21 r22

                    "fmla   v16.4s, %17.4s, v12.4s      \n"
                    "fmla   v17.4s, %14.4s, v12.4s      \n"

                    "add    %4, %4, #16                 \n"

                    "fmla   v16.4s, %18.4s, v20.4s      \n"
                    "fmla   v17.4s, %15.4s, v20.4s      \n"

                    "prfm   pldl1keep, [%2, #384]       \n"
                    "ld1    {v10.4s, v11.4s, v12.4s}, [%2] \n" // r00 r01 r02

                    "fmla   v16.4s, %19.4s, v21.4s      \n"
                    "fmla   v17.4s, %16.4s, v21.4s      \n"

                    "prfm   pldl1keep, [%5, #384]       \n"
                    "ld1    {v24.4s, v25.4s, v26.4s}, [%5] \n" // r30 r31 r32

                    "fmla   v16.4s, %20.4s, v22.4s      \n"
                    "fmla   v17.4s, %17.4s, v22.4s      \n"

                    "add    %2, %2, #16                 \n"

                    "fmla   v16.4s, %12.4s, v10.4s      \n"
                    "fmla   v17.4s, %18.4s, v24.4s      \n"

                    "add    %5, %5, #16                 \n"

                    "fmla   v16.4s, %13.4s, v11.4s      \n"
                    "fmla   v17.4s, %19.4s, v25.4s      \n"

                    "fmla   v16.4s, %14.4s, v12.4s      \n"
                    "fmla   v17.4s, %20.4s, v26.4s      \n"

                    "st1    {v16.4s}, [%0], #16         \n"
                    "st1    {v17.4s}, [%1], #16         \n"

                    : "=r"(outptr0), // %0
                    "=r"(outptr1), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3)       // %5
                    : "0"(outptr0),
                    "1"(outptr1),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "w"(_k00),  // %12
                    "w"(_k01),  // %13
                    "w"(_k02),  // %14
                    "w"(_k10),  // %15
                    "w"(_k11),  // %16
                    "w"(_k12),  // %17
                    "w"(_k20),  // %18
                    "w"(_k21),  // %19
                    "w"(_k22),  // %20
                    "w"(_bias0) // %21
                    : "memory", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v24", "v25", "v26");
            }

            r0 += 2 * 4 + w * 4;
            r1 += 2 * 4 + w * 4;
            r2 += 2 * 4 + w * 4;
            r3 += 2 * 4 + w * 4;

            outptr0 += outw * 4;
            outptr1 += outw * 4;
        }
#endif // __aarch64__
        for (; i < outh; i++)
        {
            int j = 0;

            for (; j + 3 < outw; j += 4)
            {
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v10.4s, v11.4s}, [%1], #32 \n" // r00 r01

                    "mov    v16.16b, %17.16b            \n" // sum00
                    "mov    v17.16b, %17.16b            \n" // sum01
                    "mov    v18.16b, %17.16b            \n" // sum02
                    "mov    v19.16b, %17.16b            \n" // sum03

                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%1] \n" // r02 r03 r04 r05

                    "fmla   v16.4s, %8.4s, v10.4s       \n"
                    "fmla   v17.4s, %8.4s, v11.4s       \n"
                    "fmla   v18.4s, %8.4s, v12.4s       \n"
                    "fmla   v19.4s, %8.4s, v13.4s       \n"

                    "add    %1, %1, #32                 \n"

                    "fmla   v16.4s, %9.4s, v11.4s       \n"
                    "fmla   v17.4s, %9.4s, v12.4s       \n"
                    "fmla   v18.4s, %9.4s, v13.4s       \n"
                    "fmla   v19.4s, %9.4s, v14.4s       \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v10.4s, v11.4s}, [%2], #32 \n" // r10 r11

                    "fmla   v16.4s, %10.4s, v12.4s      \n"
                    "fmla   v17.4s, %10.4s, v13.4s      \n"
                    "fmla   v18.4s, %10.4s, v14.4s      \n"
                    "fmla   v19.4s, %10.4s, v15.4s      \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%2] \n" // r12 r13 r14 r15

                    "fmla   v16.4s, %11.4s, v10.4s      \n"
                    "fmla   v17.4s, %11.4s, v11.4s      \n"
                    "fmla   v18.4s, %11.4s, v12.4s      \n"
                    "fmla   v19.4s, %11.4s, v13.4s      \n"

                    "add    %2, %2, #32                 \n"

                    "fmla   v16.4s, %12.4s, v11.4s      \n"
                    "fmla   v17.4s, %12.4s, v12.4s      \n"
                    "fmla   v18.4s, %12.4s, v13.4s      \n"
                    "fmla   v19.4s, %12.4s, v14.4s      \n"

                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v10.4s, v11.4s}, [%3], #32 \n" // r20 r21

                    "fmla   v16.4s, %13.4s, v12.4s      \n"
                    "fmla   v17.4s, %13.4s, v13.4s      \n"
                    "fmla   v18.4s, %13.4s, v14.4s      \n"
                    "fmla   v19.4s, %13.4s, v15.4s      \n"

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%3] \n" // r22 r23 r24 r25

                    "fmla   v16.4s, %14.4s, v10.4s      \n"
                    "fmla   v17.4s, %14.4s, v11.4s      \n"
                    "fmla   v18.4s, %14.4s, v12.4s      \n"
                    "fmla   v19.4s, %14.4s, v13.4s      \n"

                    "add    %3, %3, #32                 \n"

                    "fmla   v16.4s, %15.4s, v11.4s      \n"
                    "fmla   v17.4s, %15.4s, v12.4s      \n"
                    "fmla   v18.4s, %15.4s, v13.4s      \n"
                    "fmla   v19.4s, %15.4s, v14.4s      \n"

                    "fmla   v16.4s, %16.4s, v12.4s      \n"
                    "fmla   v17.4s, %16.4s, v13.4s      \n"
                    "fmla   v18.4s, %16.4s, v14.4s      \n"
                    "fmla   v19.4s, %16.4s, v15.4s      \n"

                    "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
#else
                asm volatile(
                    "pld        [%1, #256]      \n"
                    "vld1.f32   {d28-d31}, [%1 :128]! \n" // r00 r01

                    "vmov       q10, %q17       \n" // sum00
                    "vmov       q11, %q17       \n" // sum01

                    "vmla.f32   q10, %q8, q14   \n"
                    "vmla.f32   q11, %q8, q15   \n"
                    "vmla.f32   q10, %q9, q15   \n"

                    "pld        [%1, #256]      \n"
                    "vld1.f32   {d28-d31}, [%1 :128]! \n" // r02 r03

                    "vmov       q12, %q17       \n" // sum02
                    "vmov       q13, %q17       \n" // sum03

                    "vmla.f32   q12, %q8, q14   \n"
                    "vmla.f32   q11, %q9, q14   \n"
                    "vmla.f32   q13, %q8, q15   \n"
                    "vmla.f32   q10, %q10, q14  \n"
                    "vmla.f32   q12, %q9, q15   \n"
                    "vmla.f32   q11, %q10, q15  \n"

                    //                     "pld        [%1, #256]      \n"
                    "vld1.f32   {d28-d31}, [%1 :128] \n" // r04 r05

                    "vmla.f32   q13, %q9, q14   \n"
                    "vmla.f32   q12, %q10, q14  \n"
                    "vmla.f32   q13, %q10, q15  \n"

                    "pld        [%2, #256]      \n"
                    "vld1.f32   {d28-d31}, [%2 :128]! \n" // r10 r11

                    "vmla.f32   q10, %q11, q14  \n"
                    "vmla.f32   q11, %q11, q15  \n"
                    "vmla.f32   q10, %q12, q15  \n"

                    "pld        [%2, #256]      \n"
                    "vld1.f32   {d28-d31}, [%2 :128]! \n" // r12 r13

                    "vmla.f32   q12, %q11, q14  \n"
                    "vmla.f32   q11, %q12, q14  \n"
                    "vmla.f32   q13, %q11, q15  \n"
                    "vmla.f32   q10, %q13, q14  \n"
                    "vmla.f32   q12, %q12, q15  \n"
                    "vmla.f32   q11, %q13, q15  \n"

                    //                     "pld        [%2, #256]      \n"
                    "vld1.f32   {d28-d31}, [%2 :128] \n" // r14 r15

                    "vmla.f32   q13, %q12, q14  \n"
                    "vmla.f32   q12, %q13, q14  \n"
                    "vmla.f32   q13, %q13, q15  \n"

                    "pld        [%3, #256]      \n"
                    "vld1.f32   {d28-d31}, [%3 :128]! \n" // r20 r21

                    "vmla.f32   q10, %q14, q14  \n"
                    "vmla.f32   q11, %q14, q15  \n"
                    "vmla.f32   q10, %q15, q15  \n"

                    "pld        [%3, #256]      \n"
                    "vld1.f32   {d28-d31}, [%3 :128]! \n" // r22 r23

                    "vmla.f32   q12, %q14, q14  \n"
                    "vmla.f32   q11, %q15, q14  \n"
                    "vmla.f32   q13, %q14, q15  \n"
                    "vmla.f32   q10, %q16, q14  \n"
                    "vmla.f32   q12, %q15, q15  \n"
                    "vmla.f32   q11, %q16, q15  \n"

                    //                     "pld        [%3, #256]      \n"
                    "vld1.f32   {d28-d31}, [%3 :128] \n" // r24 r25

                    "vmla.f32   q13, %q15, q14  \n"
                    "vmla.f32   q12, %q16, q14  \n"
                    "vmla.f32   q13, %q16, q15  \n"

                    "vstm       %0!, {d20-d27}  \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "q10", "q11", "q12", "q13", "q14", "q15");
#endif
            }
            for (; j + 1 < outw; j += 2)
            {
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%1] \n" // r00 r01 r02 r03

                    "mov    v16.16b, %17.16b            \n" // sum00
                    "mov    v17.16b, %17.16b            \n" // sum01

                    "eor    v18.16b, v18.16b, v18.16b   \n"
                    "eor    v19.16b, v19.16b, v19.16b   \n"

                    "fmla   v16.4s, %8.4s, v12.4s       \n"
                    "fmla   v17.4s, %8.4s, v13.4s       \n"

                    "add    %1, %1, #32                 \n"

                    "fmla   v18.4s, %9.4s, v13.4s       \n"
                    "fmla   v19.4s, %9.4s, v14.4s       \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%2] \n" // r10 r11 r12 r13

                    "fmla   v16.4s, %10.4s, v14.4s      \n"
                    "fmla   v17.4s, %10.4s, v15.4s      \n"

                    "add    %2, %2, #32                 \n"

                    "fmla   v18.4s, %11.4s, v20.4s      \n"
                    "fmla   v19.4s, %11.4s, v21.4s      \n"

                    "fmla   v16.4s, %12.4s, v21.4s      \n"
                    "fmla   v17.4s, %12.4s, v22.4s      \n"

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%3] \n" // r20 r21 r22 r23

                    "fmla   v18.4s, %13.4s, v22.4s      \n"
                    "fmla   v19.4s, %13.4s, v23.4s      \n"

                    "fmla   v16.4s, %14.4s, v12.4s      \n"
                    "fmla   v17.4s, %14.4s, v13.4s      \n"

                    "fmla   v18.4s, %15.4s, v13.4s      \n"
                    "fmla   v19.4s, %15.4s, v14.4s      \n"

                    "fmla   v16.4s, %16.4s, v14.4s      \n"
                    "fmla   v17.4s, %16.4s, v15.4s      \n"

                    "add    %3, %3, #32                 \n"

                    "fadd   v16.4s, v16.4s, v18.4s      \n"
                    "fadd   v17.4s, v17.4s, v19.4s      \n"

                    "st1    {v16.4s, v17.4s}, [%0], #32 \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else
                asm volatile(
                    "pld        [%1, #256]      \n"
                    "vld1.f32   {d24-d27}, [%1 :128]! \n" // r00 r01

                    "vmov       q10, %q17       \n" // sum00
                    "vmov       q11, %q17       \n" // sum01

                    "vmla.f32   q10, %q8, q12   \n"
                    "vmla.f32   q11, %q8, q13   \n"

                    "pld        [%1, #256]      \n"
                    "vld1.f32   {d28-d31}, [%1 :128] \n" // r02 r03

                    "vmla.f32   q10, %q9, q13   \n"

                    "vmla.f32   q11, %q9, q14   \n"
                    "vmla.f32   q10, %q10, q14  \n"

                    "pld        [%2, #256]      \n"
                    "vld1.f32   {d24-d27}, [%2 :128]! \n" // r10 r11

                    "vmla.f32   q11, %q10, q15  \n"

                    "vmla.f32   q10, %q11, q12  \n"
                    "vmla.f32   q11, %q11, q13  \n"

                    "pld        [%2, #256]      \n"
                    "vld1.f32   {d28-d31}, [%2 :128] \n" // r12 r13

                    "vmla.f32   q10, %q12, q13  \n"

                    "vmla.f32   q11, %q12, q14  \n"
                    "vmla.f32   q10, %q13, q14  \n"

                    "pld        [%3, #256]      \n"
                    "vld1.f32   {d24-d27}, [%3 :128]! \n" // r20 r21

                    "vmla.f32   q11, %q13, q15  \n"

                    "vmla.f32   q10, %q14, q12  \n"
                    "vmla.f32   q11, %q14, q13  \n"

                    "pld        [%3, #256]      \n"
                    "vld1.f32   {d28-d31}, [%3 :128] \n" // r22 r23

                    "vmla.f32   q10, %q15, q13  \n"

                    "vmla.f32   q11, %q15, q14  \n"
                    "vmla.f32   q10, %q16, q14  \n"
                    "vmla.f32   q11, %q16, q15  \n"

                    "vst1.f32   {d20-d23}, [%0 :128]! \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "q10", "q11", "q12", "q13", "q14", "q15");
#endif
            }
            for (; j < outw; j++)
            {
                float32x4_t _sum0 = _bias0;

                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r02 = vld1q_f32(r0 + 8);
                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r12 = vld1q_f32(r1 + 8);
                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r22 = vld1q_f32(r2 + 8);

                _sum0 = vmlaq_f32(_sum0, _k00, _r00);
                _sum0 = vmlaq_f32(_sum0, _k01, _r01);
                _sum0 = vmlaq_f32(_sum0, _k02, _r02);
                _sum0 = vmlaq_f32(_sum0, _k10, _r10);
                _sum0 = vmlaq_f32(_sum0, _k11, _r11);
                _sum0 = vmlaq_f32(_sum0, _k12, _r12);
                _sum0 = vmlaq_f32(_sum0, _k20, _r20);
                _sum0 = vmlaq_f32(_sum0, _k21, _r21);
                _sum0 = vmlaq_f32(_sum0, _k22, _r22);

                vst1q_f32(outptr0, _sum0);

                r0 += 4;
                r1 += 4;
                r2 += 4;
                outptr0 += 4;
            }

            r0 += 2 * 4;
            r1 += 2 * 4;
            r2 += 2 * 4;
        }
    }
}

// Depthwise 3x3 convolution with stride 2 on float data stored 4 floats per pixel.
//
// For every channel g (group == bottom_blob.c, one kernel per channel) the nine
// 4-float taps found in kernel.row(g) are applied to channel g of bottom_blob and
// the outw x outh result is written to channel g of top_blob. Each output pixel is
//     bias + sum over the 3x3 window of (tap * input pixel), computed lane-wise.
//
// bottom_blob  input; every output reads a 3x3 pixel window starting 2 pixels to
//              the right of / 2 rows below the previous one, so the blob has to be
//              large enough for all windows (NOTE(review): presumably the caller
//              has already padded it - confirm at the call site)
// top_blob     output; its w and h decide how many windows are evaluated
// kernel       36 floats per channel, in the order k00 k01 k02 k10 k11 k12 k20 k21 k22
// _bias        4 floats per channel; when the pointer obtained from it is null a
//              zero bias is used instead
// opt          only opt.num_threads is read, for the OpenMP loop over channels
static void convdw3x3s2_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int group = bottom_blob.c;

    // Floats to skip once a full output row is done. The inner loops move the
    // input pointers by 2 pixels per output (2 * outw pixels in total), so the
    // rest of the current row (w - 2 * outw) plus one whole extra row (w) brings
    // them to the start of the row two lines below; * 4 because a pixel is 4 floats.
    // NOTE(review): assumes consecutive input rows are exactly w * 4 floats apart.
    const int tailstep = (w - 2 * outw + w) * 4;

    const float* bias = _bias;

    // Channels are independent, so they are distributed across threads.
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        // Per-channel bias vector, or zeros when no bias was supplied.
        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + g * 4) : vdupq_n_f32(0.f);

        const float* k0 = kernel.row(g);

        // outptr0 is only ever advanced by the amount written, it is never
        // adjusted between output rows.
        float* outptr0 = out;

        const Mat img0 = bottom_blob.channel(g);

        // The three input rows covered by the current output row.
        const float* r0 = img0.row(0);
        const float* r1 = img0.row(1);
        const float* r2 = img0.row(2);

        // The nine kernel taps, kept in registers for the whole channel.
        float32x4_t _k00 = vld1q_f32(k0);
        float32x4_t _k01 = vld1q_f32(k0 + 4);
        float32x4_t _k02 = vld1q_f32(k0 + 8);
        float32x4_t _k10 = vld1q_f32(k0 + 12);
        float32x4_t _k11 = vld1q_f32(k0 + 16);
        float32x4_t _k12 = vld1q_f32(k0 + 20);
        float32x4_t _k20 = vld1q_f32(k0 + 24);
        float32x4_t _k21 = vld1q_f32(k0 + 28);
        float32x4_t _k22 = vld1q_f32(k0 + 32);

        int i = 0;

        for (; i < outh; i++)
        {
            int j = 0;

            // Main loop: 4 output pixels per iteration.
            for (; j + 3 < outw; j += 4)
            {
#if __aarch64__
                // v28..v31 are the four accumulators (initialised from the bias),
                // v10..v27 hold input pixels. Every row pointer is post-incremented
                // by 2 x 64 bytes (8 pixels); the 9th pixel needed by the last
                // window (r08 / r18 / r28) is loaded without advancing the pointer
                // because it is also the first pixel of the next iteration.
                asm volatile(
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%1], #64 \n" // r00 r01 r02 r03

                    "mov    v28.16b, %17.16b            \n" // sum00
                    "mov    v29.16b, %17.16b            \n" // sum01
                    "mov    v30.16b, %17.16b            \n" // sum02
                    "mov    v31.16b, %17.16b            \n" // sum03

                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v14.4s, v15.4s, v16.4s, v17.4s}, [%1], #64 \n" // r04 r05 r06 r07

                    "fmla   v28.4s, %8.4s, v10.4s       \n"
                    "fmla   v29.4s, %8.4s, v12.4s       \n"
                    "fmla   v30.4s, %8.4s, v14.4s       \n"
                    "fmla   v31.4s, %8.4s, v16.4s       \n"

                    "prfm   pldl1keep, [%1, #128]       \n"
                    "ld1    {v18.4s}, [%1]              \n" // r08

                    "fmla   v28.4s, %9.4s, v11.4s       \n"
                    "fmla   v29.4s, %9.4s, v13.4s       \n"
                    "fmla   v30.4s, %9.4s, v15.4s       \n"
                    "fmla   v31.4s, %9.4s, v17.4s       \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n" // r10 r11 r12 r13

                    "fmla   v28.4s, %10.4s, v12.4s      \n"
                    "fmla   v29.4s, %10.4s, v14.4s      \n"
                    "fmla   v30.4s, %10.4s, v16.4s      \n"
                    "fmla   v31.4s, %10.4s, v18.4s      \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n" // r14 r15 r16 r17

                    "fmla   v28.4s, %11.4s, v20.4s      \n"
                    "fmla   v29.4s, %11.4s, v22.4s      \n"
                    "fmla   v30.4s, %11.4s, v24.4s      \n"
                    "fmla   v31.4s, %11.4s, v26.4s      \n"

                    "prfm   pldl1keep, [%2, #128]       \n"
                    "ld1    {v19.4s}, [%2]              \n" // r18

                    "fmla   v28.4s, %12.4s, v21.4s      \n"
                    "fmla   v29.4s, %12.4s, v23.4s      \n"
                    "fmla   v30.4s, %12.4s, v25.4s      \n"
                    "fmla   v31.4s, %12.4s, v27.4s      \n"

                    // v10..v18 are reused for the third input row from here on
                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%3], #64 \n" // r20 r21 r22 r23

                    "fmla   v28.4s, %13.4s, v22.4s      \n"
                    "fmla   v29.4s, %13.4s, v24.4s      \n"
                    "fmla   v30.4s, %13.4s, v26.4s      \n"
                    "fmla   v31.4s, %13.4s, v19.4s      \n"

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v14.4s, v15.4s, v16.4s, v17.4s}, [%3], #64 \n" // r24 r25 r26 r27

                    "fmla   v28.4s, %14.4s, v10.4s      \n"
                    "fmla   v29.4s, %14.4s, v12.4s      \n"
                    "fmla   v30.4s, %14.4s, v14.4s      \n"
                    "fmla   v31.4s, %14.4s, v16.4s      \n"

                    "prfm   pldl1keep, [%3, #128]       \n"
                    "ld1    {v18.4s}, [%3]              \n" // r28

                    "fmla   v28.4s, %15.4s, v11.4s      \n"
                    "fmla   v29.4s, %15.4s, v13.4s      \n"
                    "fmla   v30.4s, %15.4s, v15.4s      \n"
                    "fmla   v31.4s, %15.4s, v17.4s      \n"

                    "fmla   v28.4s, %16.4s, v12.4s      \n"
                    "fmla   v29.4s, %16.4s, v14.4s      \n"
                    "fmla   v30.4s, %16.4s, v16.4s      \n"
                    "fmla   v31.4s, %16.4s, v18.4s      \n"

                    // write the 4 results and advance the output pointer by 64 bytes
                    "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                    // %0..%3 are outputs tied to inputs %4..%7 through the matching
                    // constraints "0".."3", so the updated pointers flow back to C++
                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else
                // 32-bit NEON version of the same 4-output computation.
                // q10..q13 are the accumulators; only q14/q15 are available for
                // input data, so two pixels are loaded at a time and the loads from
                // r0 / r1 / r2 are interleaved with the multiply-accumulates that
                // consume them. Every row pointer advances by 4 x 32 bytes; the
                // 9th pixel (r08 / r18 / r28) is loaded into q14 without write-back.
                asm volatile(
                    "pld        [%1, #256]      \n"
                    "vld1.f32   {d28-d31}, [%1 :128]! \n" // r00 r01

                    "vmov       q10, %q17       \n" // sum00

                    "vmla.f32   q10, %q8, q14   \n"

                    "vmov       q11, %q17       \n" // sum01

                    "vmla.f32   q10, %q9, q15   \n"

                    "pld        [%1, #256]      \n"
                    "vld1.f32   {d28-d31}, [%1 :128]! \n" // r02 r03

                    "vmla.f32   q11, %q8, q14   \n"
                    "vmla.f32   q10, %q10, q14  \n"

                    "vmov       q12, %q17       \n" // sum02

                    "vmla.f32   q11, %q9, q15   \n"

                    "pld        [%1, #256]      \n"
                    "vld1.f32   {d28-d31}, [%1 :128]! \n" // r04 r05

                    "vmla.f32   q12, %q8, q14   \n"
                    "vmla.f32   q11, %q10, q14  \n"

                    "vmla.f32   q12, %q9, q15   \n"

                    "pld        [%2, #256]      \n"
                    "vld1.f32   {d28-d31}, [%2 :128]! \n" // r10 r11

                    "vmla.f32   q10, %q11, q14  \n"

                    "vmov       q13, %q17       \n" // sum03

                    "vmla.f32   q10, %q12, q15  \n"

                    "pld        [%1, #256]      \n"
                    "vld1.f32   {d28-d31}, [%1 :128]! \n" // r06 r07

                    "vmla.f32   q13, %q8, q14   \n"
                    "vmla.f32   q12, %q10, q14  \n"

                    "vmla.f32   q13, %q9, q15   \n"

                    "pld        [%2, #256]      \n"
                    "vld1.f32   {d28-d31}, [%2 :128]! \n" // r12 r13

                    "vmla.f32   q11, %q11, q14  \n"
                    "vmla.f32   q10, %q13, q14  \n"

                    "vmla.f32   q11, %q12, q15  \n"

                    "vld1.f32   {d28-d29}, [%1 :128] \n" // r08

                    "vmla.f32   q13, %q10, q14  \n"

                    "pld        [%2, #256]      \n"
                    "vld1.f32   {d28-d31}, [%2 :128]! \n" // r14 r15

                    "vmla.f32   q12, %q11, q14  \n"
                    "vmla.f32   q11, %q13, q14  \n"

                    "vmla.f32   q12, %q12, q15  \n"

                    "pld        [%3, #256]      \n"
                    "vld1.f32   {d28-d31}, [%3 :128]! \n" // r20 r21

                    "vmla.f32   q10, %q14, q14  \n"
                    "vmla.f32   q10, %q15, q15  \n"

                    "pld        [%2, #256]      \n"
                    "vld1.f32   {d28-d31}, [%2 :128]! \n" // r16 r17

                    "vmla.f32   q13, %q11, q14  \n"
                    "vmla.f32   q12, %q13, q14  \n"

                    "vmla.f32   q13, %q12, q15  \n"

                    "pld        [%3, #256]      \n"
                    "vld1.f32   {d28-d31}, [%3 :128]! \n" // r22 r23

                    "vmla.f32   q11, %q14, q14  \n"
                    "vmla.f32   q10, %q16, q14  \n"

                    "vmla.f32   q11, %q15, q15  \n"

                    "vld1.f32   {d28-d29}, [%2 :128] \n" // r18

                    "vmla.f32   q13, %q13, q14  \n"

                    "pld        [%3, #256]      \n"
                    "vld1.f32   {d28-d31}, [%3 :128]! \n" // r24 r25

                    "vmla.f32   q12, %q14, q14  \n"
                    "vmla.f32   q11, %q16, q14  \n"

                    "vmla.f32   q12, %q15, q15  \n"

                    "pld        [%3, #256]      \n"
                    "vld1.f32   {d28-d31}, [%3 :128]! \n" // r26 r27

                    "vmla.f32   q13, %q14, q14  \n"
                    "vmla.f32   q12, %q16, q14  \n"

                    "vmla.f32   q13, %q15, q15  \n"

                    "vld1.f32   {d28-d29}, [%3 :128] \n" // r28

                    "vmla.f32   q13, %q16, q14  \n"

                    // store q10..q13 (4 results) and write the new address back to %0
                    "vstm       %0!, {d20-d27}  \n"

                    // %0..%3 are outputs tied to inputs %4..%7 through the matching
                    // constraints "0".."3", so the updated pointers flow back to C++
                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "q10", "q11", "q12", "q13", "q14", "q15");
#endif
            }
            // Second loop: 2 output pixels per iteration.
            for (; j + 1 < outw; j += 2)
            {
#if __aarch64__
                // Two accumulator pairs are used: v20/v21 start from the bias,
                // v22/v23 start from zero (eor). The nine taps are split between
                // the pairs and the pairs are added together just before the store.
                // Every row pointer advances by 64 bytes (4 pixels); the 5th pixel
                // (r04 / r14 / r24) is loaded without advancing the pointer.
                asm volatile(
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%1], #64 \n" // r00 r01 r02 r03

                    "mov    v20.16b, %17.16b            \n" // sum00
                    "mov    v21.16b, %17.16b            \n" // sum01

                    "eor    v22.16b, v22.16b, v22.16b   \n"
                    "eor    v23.16b, v23.16b, v23.16b   \n"

                    "fmla   v20.4s, %8.4s, v10.4s       \n"
                    "fmla   v21.4s, %8.4s, v12.4s       \n"

                    "prfm   pldl1keep, [%1, #128]       \n"
                    "ld1    {v14.4s}, [%1]              \n" // r04

                    "fmla   v22.4s, %9.4s, v11.4s       \n"
                    "fmla   v23.4s, %9.4s, v13.4s       \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%2], #64 \n" // r10 r11 r12 r13

                    "fmla   v20.4s, %10.4s, v12.4s      \n"
                    "fmla   v21.4s, %10.4s, v14.4s      \n"

                    "fmla   v22.4s, %11.4s, v16.4s      \n"
                    "fmla   v23.4s, %11.4s, v18.4s      \n"

                    "prfm   pldl1keep, [%2, #128]       \n"
                    "ld1    {v15.4s}, [%2]              \n" // r14

                    "fmla   v20.4s, %12.4s, v17.4s      \n"
                    "fmla   v21.4s, %12.4s, v19.4s      \n"

                    // v10..v14 are reused for the third input row from here on
                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v10.4s, v11.4s, v12.4s, v13.4s}, [%3], #64 \n" // r20 r21 r22 r23

                    "fmla   v22.4s, %13.4s, v18.4s      \n"
                    "fmla   v23.4s, %13.4s, v15.4s      \n"

                    "fmla   v20.4s, %14.4s, v10.4s      \n"
                    "fmla   v21.4s, %14.4s, v12.4s      \n"

                    "prfm   pldl1keep, [%3, #128]       \n"
                    "ld1    {v14.4s}, [%3]              \n" // r24

                    "fmla   v22.4s, %15.4s, v11.4s      \n"
                    "fmla   v23.4s, %15.4s, v13.4s      \n"

                    "fmla   v20.4s, %16.4s, v12.4s      \n"
                    "fmla   v21.4s, %16.4s, v14.4s      \n"

                    // merge the two partial sums of each output
                    "fadd   v20.4s, v20.4s, v22.4s      \n"
                    "fadd   v21.4s, v21.4s, v23.4s      \n"

                    "st1    {v20.4s, v21.4s}, [%0], #32 \n"

                    // %0..%3 are outputs tied to inputs %4..%7 through the matching
                    // constraints "0".."3", so the updated pointers flow back to C++
                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else
                // 32-bit NEON version of the 2-output computation.
                // q10/q11 are the accumulators (both start from the bias) and
                // q12..q15 hold input pixels. Every row pointer advances by
                // 2 x 32 bytes; the 5th pixel (r04 / r14 / r24) is loaded into a
                // single q register without write-back.
                asm volatile(
                    "pld        [%1, #256]      \n"
                    "vld1.f32   {d24-d27}, [%1 :128]! \n" // r00 r01

                    "vmov       q10, %q17       \n" // sum00
                    "vmov       q11, %q17       \n" // sum01

                    "vmla.f32   q10, %q8, q12   \n"

                    "pld        [%1, #256]      \n"
                    "vld1.f32   {d28-d31}, [%1 :128]! \n" // r02 r03

                    "vmla.f32   q10, %q9, q13   \n"

                    "vmla.f32   q11, %q8, q14   \n"
                    "vmla.f32   q10, %q10, q14  \n"

                    "vld1.f32   {d24-d25}, [%1 :128] \n" // r04

                    "vmla.f32   q11, %q9, q15   \n"

                    "pld        [%2, #256]      \n"
                    "vld1.f32   {d28-d31}, [%2 :128]! \n" // r10 r11

                    "vmla.f32   q11, %q10, q12  \n"

                    "vmla.f32   q10, %q11, q14  \n"

                    "pld        [%2, #256]      \n"
                    "vld1.f32   {d24-d27}, [%2 :128]! \n" // r12 r13

                    "vmla.f32   q10, %q12, q15  \n"

                    "vmla.f32   q11, %q11, q12  \n"
                    "vmla.f32   q10, %q13, q12  \n"

                    "vld1.f32   {d28-d29}, [%2 :128] \n" // r14

                    "vmla.f32   q11, %q12, q13  \n"

                    "pld        [%3, #256]      \n"
                    "vld1.f32   {d24-d27}, [%3 :128]! \n" // r20 r21

                    "vmla.f32   q11, %q13, q14  \n"

                    "vmla.f32   q10, %q14, q12  \n"

                    "pld        [%3, #256]      \n"
                    "vld1.f32   {d28-d31}, [%3 :128]! \n" // r22 r23

                    "vmla.f32   q10, %q15, q13  \n"

                    "vmla.f32   q11, %q14, q14  \n"
                    "vmla.f32   q10, %q16, q14  \n"

                    "vld1.f32   {d24-d25}, [%3 :128] \n" // r24

                    "vmla.f32   q11, %q15, q15  \n"

                    "vmla.f32   q11, %q16, q12  \n"

                    // store q10/q11 (2 results) and advance the output pointer
                    "vst1.f32   {d20-d23}, [%0 :128]! \n"

                    // %0..%3 are outputs tied to inputs %4..%7 through the matching
                    // constraints "0".."3", so the updated pointers flow back to C++
                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "q10", "q11", "q12", "q13", "q14", "q15");
#endif
            }
            // Tail loop: at most one remaining output pixel, done with intrinsics.
            for (; j < outw; j++)
            {
                float32x4_t _sum0 = _bias0;

                // 3x3 window: three consecutive pixels from each of the three rows.
                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r02 = vld1q_f32(r0 + 8);
                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r12 = vld1q_f32(r1 + 8);
                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r22 = vld1q_f32(r2 + 8);

                _sum0 = vmlaq_f32(_sum0, _k00, _r00);
                _sum0 = vmlaq_f32(_sum0, _k01, _r01);
                _sum0 = vmlaq_f32(_sum0, _k02, _r02);
                _sum0 = vmlaq_f32(_sum0, _k10, _r10);
                _sum0 = vmlaq_f32(_sum0, _k11, _r11);
                _sum0 = vmlaq_f32(_sum0, _k12, _r12);
                _sum0 = vmlaq_f32(_sum0, _k20, _r20);
                _sum0 = vmlaq_f32(_sum0, _k21, _r21);
                _sum0 = vmlaq_f32(_sum0, _k22, _r22);

                vst1q_f32(outptr0, _sum0);

                // stride 2: the next window starts two pixels (8 floats) further on
                r0 += 2 * 4;
                r1 += 2 * 4;
                r2 += 2 * 4;
                outptr0 += 4;
            }

            // Skip what is left of this input row plus one full row, so that the
            // three row pointers sit two rows lower for the next output row.
            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
        }
    }
}


================================================
FILE: src/layer/arm/convolutiondepthwise_3x3_pack4_bf16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convdw3x3s1_pack4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
#if __aarch64__
    const int w = bottom_blob.w;
#endif

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    const int group = bottom_blob.c;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + g * 4) : vdupq_n_f32(0.f);

        const unsigned short* k0 = kernel.row<const unsigned short>(g);

        unsigned short* outptr0 = out.row<unsigned short>(0);

        const Mat img0 = bottom_blob.channel(g);

        const unsigned short* r0 = img0.row<const unsigned short>(0);
        const unsigned short* r1 = img0.row<const unsigned short>(1);
        const unsigned short* r2 = img0.row<const unsigned short>(2);

        float32x4_t _k00 = bfloat2float(vld1_u16(k0));
        float32x4_t _k01 = bfloat2float(vld1_u16(k0 + 4));
        float32x4_t _k02 = bfloat2float(vld1_u16(k0 + 8));
        float32x4_t _k10 = bfloat2float(vld1_u16(k0 + 12));
        float32x4_t _k11 = bfloat2float(vld1_u16(k0 + 16));
        float32x4_t _k12 = bfloat2float(vld1_u16(k0 + 20));
        float32x4_t _k20 = bfloat2float(vld1_u16(k0 + 24));
        float32x4_t _k21 = bfloat2float(vld1_u16(k0 + 28));
        float32x4_t _k22 = bfloat2float(vld1_u16(k0 + 32));

        int i = 0;

#if __aarch64__
        unsigned short* outptr1 = out.row<unsigned short>(1);
        const unsigned short* r3 = img0.row<const unsigned short>(3);

        for (; i + 1 < outh; i += 2)
        {
            int j = 0;

            for (; j + 3 < outw; j += 4)
            {
                asm volatile(
                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v10.4h, v11.4h, v12.4h, v13.4h}, [%3], #32 \n" // r10 r11 r12 r13

                    "mov    v16.16b, %21.16b            \n" // sum00
                    "mov    v17.16b, %21.16b            \n" // sum01

                    "prfm   pldl1keep, [%3, #128]       \n"
                    "ld1    {v28.4h, v29.4h}, [%3]      \n" // r14 r15

                    "shll   v10.4s, v10.4h, #16         \n"
                    "shll   v11.4s, v11.4h, #16         \n"

                    "mov    v18.16b, %21.16b            \n" // sum02
                    "mov    v19.16b, %21.16b            \n" // sum03

                    "shll   v12.4s, v12.4h, #16         \n"
                    "shll   v13.4s, v13.4h, #16         \n"

                    "mov    v20.16b, %21.16b            \n" // sum10

                    "fmla   v16.4s, %15.4s, v10.4s      \n"
                    "fmla   v17.4s, %15.4s, v11.4s      \n"

                    "mov    v21.16b, %21.16b            \n" // sum11

                    "fmla   v18.4s, %15.4s, v12.4s      \n"
                    "fmla   v19.4s, %15.4s, v13.4s      \n"

                    "mov    v22.16b, %21.16b            \n" // sum12

                    "fmla   v20.4s, %12.4s, v10.4s      \n"
                    "fmla   v21.4s, %12.4s, v11.4s      \n"

                    "mov    v23.16b, %21.16b            \n" // sum13

                    "fmla   v22.4s, %12.4s, v12.4s      \n"
                    "fmla   v23.4s, %12.4s, v13.4s      \n"

                    "shll   v28.4s, v28.4h, #16         \n"

                    "fmla   v16.4s, %16.4s, v11.4s      \n"
                    "fmla   v17.4s, %16.4s, v12.4s      \n"

                    "shll   v29.4s, v29.4h, #16         \n"

                    "fmla   v18.4s, %16.4s, v13.4s      \n"
                    "fmla   v19.4s, %16.4s, v28.4s      \n"

                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%4], #32 \n" // r20 r21 r22 r23

                    "fmla   v20.4s, %13.4s, v11.4s      \n"
                    "fmla   v21.4s, %13.4s, v12.4s      \n"
                    "fmla   v22.4s, %13.4s, v13.4s      \n"
                    "fmla   v23.4s, %13.4s, v28.4s      \n"

                    "prfm   pldl1keep, [%4, #128]       \n"
                    "ld1    {v14.4h, v15.4h}, [%4]      \n" // r24 r25

                    "fmla   v16.4s, %17.4s, v12.4s      \n"
                    "fmla   v17.4s, %17.4s, v13.4s      \n"

                    "shll   v24.4s, v24.4h, #16         \n"

                    "fmla   v18.4s, %17.4s, v28.4s      \n"
                    "fmla   v19.4s, %17.4s, v29.4s      \n"

                    "shll   v25.4s, v25.4h, #16         \n"

                    "fmla   v20.4s, %14.4s, v12.4s      \n"
                    "fmla   v21.4s, %14.4s, v13.4s      \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v10.4h, v11.4h, v12.4h, v13.4h}, [%2], #32 \n" // r00 r01 r02 r03

                    "fmla   v22.4s, %14.4s, v28.4s      \n"
                    "fmla   v23.4s, %14.4s, v29.4s      \n"

                    "shll   v26.4s, v26.4h, #16         \n"

                    "fmla   v16.4s, %18.4s, v24.4s      \n"
                    "fmla   v17.4s, %18.4s, v25.4s      \n"

                    "shll   v27.4s, v27.4h, #16         \n"

                    "fmla   v18.4s, %18.4s, v26.4s      \n"
                    "fmla   v19.4s, %18.4s, v27.4s      \n"

                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v28.4h, v29.4h, v30.4h, v31.4h}, [%5], #32 \n" // r30 r31 r32 r33

                    "fmla   v20.4s, %15.4s, v24.4s      \n"
                    "fmla   v21.4s, %15.4s, v25.4s      \n"

                    "shll   v14.4s, v14.4h, #16         \n"

                    "fmla   v22.4s, %15.4s, v26.4s      \n"
                    "fmla   v23.4s, %15.4s, v27.4s      \n"

                    "shll   v15.4s, v15.4h, #16         \n"

                    "fmla   v16.4s, %19.4s, v25.4s      \n"
                    "fmla   v17.4s, %19.4s, v26.4s      \n"

                    "fmla   v18.4s, %19.4s, v27.4s      \n"
                    "fmla   v19.4s, %19.4s, v14.4s      \n"

                    "fmla   v20.4s, %16.4s, v25.4s      \n"
                    "fmla   v21.4s, %16.4s, v26.4s      \n"

                    "prfm   pldl1keep, [%2, #128]       \n"
                    "ld1    {v24.4h, v25.4h}, [%2]      \n" // r04 r05

                    "fmla   v22.4s, %16.4s, v27.4s      \n"
                    "fmla   v23.4s, %16.4s, v14.4s      \n"

                    "shll   v10.4s, v10.4h, #16         \n"
                    "shll   v11.4s, v11.4h, #16         \n"

                    "fmla   v16.4s, %20.4s, v26.4s      \n"
                    "fmla   v17.4s, %20.4s, v27.4s      \n"

                    "shll   v12.4s, v12.4h, #16         \n"

                    "fmla   v18.4s, %20.4s, v14.4s      \n"
                    "fmla   v19.4s, %20.4s, v15.4s      \n"

                    "shll   v13.4s, v13.4h, #16         \n"

                    "fmla   v20.4s, %17.4s, v26.4s      \n"
                    "fmla   v21.4s, %17.4s, v27.4s      \n"

                    "prfm   pldl1keep, [%5, #128]       \n"
                    "ld1    {v26.4h, v27.4h}, [%5]      \n" // r34 r35

                    "fmla   v22.4s, %17.4s, v14.4s      \n"
                    "fmla   v23.4s, %17.4s, v15.4s      \n"

                    "shll   v28.4s, v28.4h, #16         \n"

                    "fmla   v16.4s, %12.4s, v10.4s      \n"
                    "fmla   v17.4s, %12.4s, v11.4s      \n"

                    "shll   v29.4s, v29.4h, #16         \n"

                    "fmla   v18.4s, %12.4s, v12.4s      \n"
                    "fmla   v19.4s, %12.4s, v13.4s      \n"

                    "shll   v30.4s, v30.4h, #16         \n"

                    "fmla   v20.4s, %18.4s, v28.4s      \n"
                    "fmla   v21.4s, %18.4s, v29.4s      \n"

                    "shll   v31.4s, v31.4h, #16         \n"

                    "fmla   v22.4s, %18.4s, v30.4s      \n"
                    "fmla   v23.4s, %18.4s, v31.4s      \n"

                    "shll   v24.4s, v24.4h, #16         \n"

                    "fmla   v16.4s, %13.4s, v11.4s      \n"
                    "fmla   v17.4s, %13.4s, v12.4s      \n"
                    "fmla   v18.4s, %13.4s, v13.4s      \n"
                    "fmla   v19.4s, %13.4s, v24.4s      \n"

                    "shll   v26.4s, v26.4h, #16         \n"

                    "fmla   v20.4s, %19.4s, v29.4s      \n"
                    "fmla   v21.4s, %19.4s, v30.4s      \n"
                    "fmla   v22.4s, %19.4s, v31.4s      \n"
                    "fmla   v23.4s, %19.4s, v26.4s      \n"

                    "shll   v25.4s, v25.4h, #16         \n"

                    "fmla   v16.4s, %14.4s, v12.4s      \n"
                    "fmla   v17.4s, %14.4s, v13.4s      \n"
                    "fmla   v18.4s, %14.4s, v24.4s      \n"
                    "fmla   v19.4s, %14.4s, v25.4s      \n"

                    "shll   v27.4s, v27.4h, #16         \n"

                    "fmla   v20.4s, %20.4s, v30.4s      \n"
                    "fmla   v21.4s, %20.4s, v31.4s      \n"
                    "fmla   v22.4s, %20.4s, v26.4s      \n"
                    "fmla   v23.4s, %20.4s, v27.4s      \n"

                    "shrn   v16.4h, v16.4s, #16         \n"
                    "shrn   v17.4h, v17.4s, #16         \n"
                    "shrn   v18.4h, v18.4s, #16         \n"
                    "shrn   v19.4h, v19.4s, #16         \n"
                    "shrn   v20.4h, v20.4s, #16         \n"
                    "shrn   v21.4h, v21.4s, #16         \n"
                    "shrn   v22.4h, v22.4s, #16         \n"
                    "shrn   v23.4h, v23.4s, #16         \n"

                    "st1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%0], #32 \n"
                    "st1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%1], #32 \n"

                    : "=r"(outptr0), // %0
                    "=r"(outptr1), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3)       // %5
                    : "0"(outptr0),
                    "1"(outptr1),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "w"(_k00),  // %12
                    "w"(_k01),  // %13
                    "w"(_k02),  // %14
                    "w"(_k10),  // %15
                    "w"(_k11),  // %16
                    "w"(_k12),  // %17
                    "w"(_k20),  // %18
                    "w"(_k21),  // %19
                    "w"(_k22),  // %20
                    "w"(_bias0) // %21
                    : "memory", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            for (; j + 1 < outw; j += 2)
            {
                asm volatile(
                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v10.4h, v11.4h, v12.4h, v13.4h}, [%3] \n" // r10 r11 r12 r13

                    "mov    v16.16b, %21.16b            \n" // sum00
                    "mov    v17.16b, %21.16b            \n" // sum01

                    "shll   v10.4s, v10.4h, #16         \n"
                    "shll   v11.4s, v11.4h, #16         \n"

                    "mov    v18.16b, %21.16b            \n" // sum10
                    "mov    v19.16b, %21.16b            \n" // sum11

                    "fmla   v16.4s, %15.4s, v10.4s      \n"
                    "fmla   v17.4s, %15.4s, v11.4s      \n"

                    "shll   v12.4s, v12.4h, #16         \n"

                    "fmla   v18.4s, %12.4s, v10.4s      \n"
                    "fmla   v19.4s, %12.4s, v11.4s      \n"

                    "shll   v13.4s, v13.4h, #16         \n"

                    "fmla   v16.4s, %16.4s, v11.4s      \n"
                    "fmla   v17.4s, %16.4s, v12.4s      \n"

                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%4] \n" // r20 r21 r22 r23

                    "fmla   v18.4s, %13.4s, v11.4s      \n"
                    "fmla   v19.4s, %13.4s, v12.4s      \n"

                    "shll   v20.4s, v20.4h, #16         \n"

                    "fmla   v16.4s, %17.4s, v12.4s      \n"
                    "fmla   v17.4s, %17.4s, v13.4s      \n"

                    "shll   v21.4s, v21.4h, #16         \n"

                    "fmla   v18.4s, %14.4s, v12.4s      \n"
                    "fmla   v19.4s, %14.4s, v13.4s      \n"

                    "shll   v22.4s, v22.4h, #16         \n"

                    "fmla   v16.4s, %18.4s, v20.4s      \n"
                    "fmla   v17.4s, %18.4s, v21.4s      \n"

                    "shll   v23.4s, v23.4h, #16         \n"

                    "fmla   v18.4s, %15.4s, v20.4s      \n"
                    "fmla   v19.4s, %15.4s, v21.4s      \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v10.4h, v11.4h, v12.4h, v13.4h}, [%2] \n" // r00 r01 r02 r03

                    "fmla   v16.4s, %19.4s, v21.4s      \n"
                    "fmla   v17.4s, %19.4s, v22.4s      \n"

                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%5] \n" // r30 r31 r32 r33

                    "fmla   v18.4s, %16.4s, v21.4s      \n"
                    "fmla   v19.4s, %16.4s, v22.4s      \n"

                    "shll   v10.4s, v10.4h, #16         \n"

                    "fmla   v16.4s, %20.4s, v22.4s      \n"
                    "fmla   v17.4s, %20.4s, v23.4s      \n"

                    "shll   v24.4s, v24.4h, #16         \n"

                    "fmla   v18.4s, %17.4s, v22.4s      \n"
                    "fmla   v19.4s, %17.4s, v23.4s      \n"

                    "shll   v11.4s, v11.4h, #16         \n"
                    "shll   v25.4s, v25.4h, #16         \n"

                    "fmla   v16.4s, %12.4s, v10.4s      \n"
                    "fmla   v17.4s, %12.4s, v11.4s      \n"

                    "shll   v12.4s, v12.4h, #16         \n"

                    "fmla   v18.4s, %18.4s, v24.4s      \n"
                    "fmla   v19.4s, %18.4s, v25.4s      \n"

                    "shll   v26.4s, v26.4h, #16         \n"

                    "fmla   v16.4s, %13.4s, v11.4s      \n"
                    "fmla   v17.4s, %13.4s, v12.4s      \n"

                    "shll   v13.4s, v13.4h, #16         \n"

                    "fmla   v18.4s, %19.4s, v25.4s      \n"
                    "fmla   v19.4s, %19.4s, v26.4s      \n"

                    "shll   v27.4s, v27.4h, #16         \n"

                    "fmla   v16.4s, %14.4s, v12.4s      \n"
                    "fmla   v17.4s, %14.4s, v13.4s      \n"

                    "add    %3, %3, #16                 \n"

                    "fmla   v18.4s, %20.4s, v26.4s      \n"
                    "fmla   v19.4s, %20.4s, v27.4s      \n"

                    "add    %4, %4, #16                 \n"

                    "shrn   v16.4h, v16.4s, #16         \n"
                    "shrn   v17.4h, v17.4s, #16         \n"

                    "add    %2, %2, #16                 \n"

                    "shrn   v18.4h, v18.4s, #16         \n"
                    "shrn   v19.4h, v19.4s, #16         \n"

                    "add    %5, %5, #16                 \n"

                    "st1    {v16.4h, v17.4h}, [%0], #16 \n"
                    "st1    {v18.4h, v19.4h}, [%1], #16 \n"

                    : "=r"(outptr0), // %0
                    "=r"(outptr1), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3)       // %5
                    : "0"(outptr0),
                    "1"(outptr1),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "w"(_k00),  // %12
                    "w"(_k01),  // %13
                    "w"(_k02),  // %14
                    "w"(_k10),  // %15
                    "w"(_k11),  // %16
                    "w"(_k12),  // %17
                    "w"(_k20),  // %18
                    "w"(_k21),  // %19
                    "w"(_k22),  // %20
                    "w"(_bias0) // %21
                    : "memory", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
            }
            for (; j < outw; j++)
            {
                asm volatile(
                    "prfm   pldl1keep, [%3, #192]       \n"
                    "ld1    {v10.4h, v11.4h, v12.4h}, [%3] \n" // r10 r11 r12

                    "mov    v18.16b, %21.16b            \n" // sum0
                    "mov    v19.16b, %21.16b            \n" // sum1

                    "shll   v10.4s, v10.4h, #16         \n"
                    "shll   v11.4s, v11.4h, #16         \n"

                    "fmul   v16.4s, %15.4s, v10.4s      \n"
                    "fmul   v17.4s, %12.4s, v10.4s      \n"

                    "shll   v12.4s, v12.4h, #16         \n"

                    "fmla   v18.4s, %16.4s, v11.4s      \n"
                    "fmla   v19.4s, %13.4s, v11.4s      \n"

                    "prfm   pldl1keep, [%4, #192]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h}, [%4] \n" // r20 r21 r22

                    "fmla   v16.4s, %17.4s, v12.4s      \n"
                    "fmla   v17.4s, %14.4s, v12.4s      \n"

                    "shll   v20.4s, v20.4h, #16         \n"
                    "shll   v21.4s, v21.4h, #16         \n"

                    "fmla   v18.4s, %18.4s, v20.4s      \n"
                    "fmla   v19.4s, %15.4s, v20.4s      \n"

                    "prfm   pldl1keep, [%2, #192]       \n"
                    "ld1    {v10.4h, v11.4h, v12.4h}, [%2] \n" // r00 r01 r02

                    "shll   v22.4s, v22.4h, #16         \n"

                    "prfm   pldl1keep, [%5, #192]       \n"
                    "ld1    {v24.4h, v25.4h, v26.4h}, [%5] \n" // r30 r31 r32

                    "fmla   v16.4s, %19.4s, v21.4s      \n"
                    "fmla   v17.4s, %16.4s, v21.4s      \n"

                    "shll   v10.4s, v10.4h, #16         \n"
                    "shll   v24.4s, v24.4h, #16         \n"

                    "fmla   v18.4s, %20.4s, v22.4s      \n"
                    "fmla   v19.4s, %17.4s, v22.4s      \n"

                    "shll   v11.4s, v11.4h, #16         \n"
                    "shll   v25.4s, v25.4h, #16         \n"

                    "fmla   v16.4s, %12.4s, v10.4s      \n"
                    "fmla   v17.4s, %18.4s, v24.4s      \n"

                    "shll   v12.4s, v12.4h, #16         \n"
                    "shll   v26.4s, v26.4h, #16         \n"

                    "fmla   v18.4s, %13.4s, v11.4s      \n"
                    "fmla   v19.4s, %19.4s, v25.4s      \n"

                    "add    %3, %3, #8                  \n"

                    "fmla   v16.4s, %14.4s, v12.4s      \n"
                    "fmla   v17.4s, %20.4s, v26.4s      \n"

                    "add    %4, %4, #8                  \n"

                    "fadd   v18.4s, v18.4s, v16.4s      \n"
                    "fadd   v19.4s, v19.4s, v17.4s      \n"

                    "add    %2, %2, #8                  \n"

                    "shrn   v18.4h, v18.4s, #16         \n"
                    "shrn   v19.4h, v19.4s, #16         \n"

                    "add    %5, %5, #8                  \n"

                    "st1    {v18.4h}, [%0], #8          \n"
                    "st1    {v19.4h}, [%1], #8          \n"

                    : "=r"(outptr0), // %0
                    "=r"(outptr1), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3)       // %5
                    : "0"(outptr0),
                    "1"(outptr1),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "w"(_k00),  // %12
                    "w"(_k01),  // %13
                    "w"(_k02),  // %14
                    "w"(_k10),  // %15
                    "w"(_k11),  // %16
                    "w"(_k12),  // %17
                    "w"(_k20),  // %18
                    "w"(_k21),  // %19
                    "w"(_k22),  // %20
                    "w"(_bias0) // %21
                    : "memory", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v24", "v25", "v26");
            }

            r0 += 2 * 4 + w * 4;
            r1 += 2 * 4 + w * 4;
            r2 += 2 * 4 + w * 4;
            r3 += 2 * 4 + w * 4;

            outptr0 += outw * 4;
            outptr1 += outw * 4;
        }
#endif // __aarch64__
        for (; i < outh; i++)
        {
            int j = 0;

            for (; j + 3 < outw; j += 4)
            {
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v10.4h, v11.4h, v12.4h, v13.4h}, [%1], #32 \n" // r00 r01 r02 r03

                    "mov    v16.16b, %17.16b            \n" // sum00
                    "mov    v17.16b, %17.16b            \n" // sum01
                    "mov    v18.16b, %17.16b            \n" // sum02
                    "mov    v19.16b, %17.16b            \n" // sum03

                    "shll   v10.4s, v10.4h, #16         \n"
                    "shll   v11.4s, v11.4h, #16         \n"

                    "fmla   v16.4s, %8.4s, v10.4s       \n"
                    "fmla   v17.4s, %8.4s, v11.4s       \n"

                    "shll   v12.4s, v12.4h, #16         \n"
                    "shll   v13.4s, v13.4h, #16         \n"

                    "fmla   v18.4s, %8.4s, v12.4s       \n"
                    "fmla   v19.4s, %8.4s, v13.4s       \n"

                    "prfm   pldl1keep, [%1, #128]       \n"
                    "ld1    {v14.4h, v15.4h}, [%1]      \n" // r04 r05

                    "fmla   v16.4s, %9.4s, v11.4s       \n"
                    "fmla   v17.4s, %9.4s, v12.4s       \n"

                    "shll   v14.4s, v14.4h, #16         \n"

                    "fmla   v18.4s, %9.4s, v13.4s       \n"
                    "fmla   v19.4s, %9.4s, v14.4s       \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%2], #32 \n" // r10 r11 r12 r13

                    "fmla   v16.4s, %10.4s, v12.4s      \n"
                    "fmla   v17.4s, %10.4s, v13.4s      \n"

                    "shll   v15.4s, v15.4h, #16         \n"

                    "fmla   v18.4s, %10.4s, v14.4s      \n"
                    "fmla   v19.4s, %10.4s, v15.4s      \n"

                    "shll   v20.4s, v20.4h, #16         \n"
                    "shll   v21.4s, v21.4h, #16         \n"

                    "fmla   v16.4s, %11.4s, v20.4s      \n"
                    "fmla   v17.4s, %11.4s, v21.4s      \n"

                    "shll   v22.4s, v22.4h, #16         \n"
                    "shll   v23.4s, v23.4h, #16         \n"

                    "fmla   v18.4s, %11.4s, v22.4s      \n"
                    "fmla   v19.4s, %11.4s, v23.4s      \n"

                    "prfm   pldl1keep, [%2, #128]       \n"
                    "ld1    {v14.4h, v15.4h}, [%2]      \n" // r14 r15

                    "fmla   v16.4s, %12.4s, v21.4s      \n"
                    "fmla   v17.4s, %12.4s, v22.4s      \n"

                    "shll   v14.4s, v14.4h, #16         \n"

                    "fmla   v18.4s, %12.4s, v23.4s      \n"
                    "fmla   v19.4s, %12.4s, v14.4s      \n"

                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v10.4h, v11.4h, v12.4h, v13.4h}, [%3], #32 \n" // r20 r21 r22 r23

                    "fmla   v16.4s, %13.4s, v22.4s      \n"
                    "fmla   v17.4s, %13.4s, v23.4s      \n"

                    "shll   v15.4s, v15.4h, #16         \n"

                    "fmla   v18.4s, %13.4s, v14.4s      \n"
                    "fmla   v19.4s, %13.4s, v15.4s      \n"

                    "shll   v10.4s, v10.4h, #16         \n"
                    "shll   v11.4s, v11.4h, #16         \n"

                    "fmla   v16.4s, %14.4s, v10.4s      \n"
                    "fmla   v17.4s, %14.4s, v11.4s      \n"

                    "shll   v12.4s, v12.4h, #16         \n"
                    "shll   v13.4s, v13.4h, #16         \n"

                    "fmla   v18.4s, %14.4s, v12.4s      \n"
                    "fmla   v19.4s, %14.4s, v13.4s      \n"

                    "prfm   pldl1keep, [%3, #128]       \n"
                    "ld1    {v14.4h, v15.4h}, [%3]      \n" // r24 r25

                    "fmla   v16.4s, %15.4s, v11.4s      \n"
                    "fmla   v17.4s, %15.4s, v12.4s      \n"

                    "shll   v14.4s, v14.4h, #16         \n"

                    "fmla   v18.4s, %15.4s, v13.4s      \n"
                    "fmla   v19.4s, %15.4s, v14.4s      \n"

                    "fmla   v16.4s, %16.4s, v12.4s      \n"
                    "fmla   v17.4s, %16.4s, v13.4s      \n"

                    "shll   v15.4s, v15.4h, #16         \n"

                    "fmla   v18.4s, %16.4s, v14.4s      \n"
                    "fmla   v19.4s, %16.4s, v15.4s      \n"

                    "shrn   v16.4h, v16.4s, #16         \n"
                    "shrn   v17.4h, v17.4s, #16         \n"
                    "shrn   v18.4h, v18.4s, #16         \n"
                    "shrn   v19.4h, v19.4s, #16         \n"

                    "st1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%0], #32 \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else
                asm volatile(
                    "pld        [%1, #128]      \n"
                    "vld1.u16   {d30-d31}, [%1 :64]! \n" // r00 r01

                    "vmov       q10, %q17       \n" // sum00
                    "vmov       q11, %q17       \n" // sum01

                    "vshll.u16  q14, d30, #16   \n"
                    "vshll.u16  q15, d31, #16   \n"

                    "vmla.f32   q10, %q8, q14   \n"
                    "vmla.f32   q11, %q8, q15   \n"
                    "vmla.f32   q10, %q9, q15   \n"

                    "pld        [%1, #128]      \n"
                    "vld1.u16   {d30-d31}, [%1 :64]! \n" // r02 r03

                    "vmov       q12, %q17       \n" // sum02
                    "vmov       q13, %q17       \n" // sum03

                    "vshll.u16  q14, d30, #16   \n"
                    "vshll.u16  q15, d31, #16   \n"

                    "vmla.f32   q12, %q8, q14   \n"
                    "vmla.f32   q11, %q9, q14   \n"
                    "vmla.f32   q13, %q8, q15   \n"
                    "vmla.f32   q10, %q10, q14  \n"
                    "vmla.f32   q12, %q9, q15   \n"
                    "vmla.f32   q11, %q10, q15  \n"

                    //                     "pld        [%1, #128]      \n"
                    "vld1.u16   {d30-d31}, [%1 :64] \n" // r04 r05

                    "vshll.u16  q14, d30, #16   \n"
                    "vshll.u16  q15, d31, #16   \n"

                    "vmla.f32   q13, %q9, q14   \n"
                    "vmla.f32   q12, %q10, q14  \n"
                    "vmla.f32   q13, %q10, q15  \n"

                    "pld        [%2, #128]      \n"
                    "vld1.u16   {d30-d31}, [%2 :64]! \n" // r10 r11

                    "vshll.u16  q14, d30, #16   \n"
                    "vshll.u16  q15, d31, #16   \n"

                    "vmla.f32   q10, %q11, q14  \n"
                    "vmla.f32   q11, %q11, q15  \n"
                    "vmla.f32   q10, %q12, q15  \n"

                    "pld        [%2, #128]      \n"
                    "vld1.u16   {d30-d31}, [%2 :64]! \n" // r12 r13

                    "vshll.u16  q14, d30, #16   \n"
                    "vshll.u16  q15, d31, #16   \n"

                    "vmla.f32   q12, %q11, q14  \n"
                    "vmla.f32   q11, %q12, q14  \n"
                    "vmla.f32   q13, %q11, q15  \n"
                    "vmla.f32   q10, %q13, q14  \n"
                    "vmla.f32   q12, %q12, q15  \n"
                    "vmla.f32   q11, %q13, q15  \n"

                    //                     "pld        [%2, #128]      \n"
                    "vld1.u16   {d30-d31}, [%2 :64] \n" // r14 r15

                    "vshll.u16  q14, d30, #16   \n"
                    "vshll.u16  q15, d31, #16   \n"

                    "vmla.f32   q13, %q12, q14  \n"
                    "vmla.f32   q12, %q13, q14  \n"
                    "vmla.f32   q13, %q13, q15  \n"

                    "pld        [%3, #128]      \n"
                    "vld1.u16   {d30-d31}, [%3 :64]! \n" // r20 r21

                    "vshll.u16  q14, d30, #16   \n"
                    "vshll.u16  q15, d31, #16   \n"

                    "vmla.f32   q10, %q14, q14  \n"
                    "vmla.f32   q11, %q14, q15  \n"
                    "vmla.f32   q10, %q15, q15  \n"

                    "pld        [%3, #128]      \n"
                    "vld1.u16   {d30-d31}, [%3 :64]! \n" // r22 r23

                    "vshll.u16  q14, d30, #16   \n"
                    "vshll.u16  q15, d31, #16   \n"

                    "vmla.f32   q12, %q14, q14  \n"
                    "vmla.f32   q11, %q15, q14  \n"
                    "vmla.f32   q13, %q14, q15  \n"
                    "vmla.f32   q10, %q16, q14  \n"
                    "vmla.f32   q12, %q15, q15  \n"
                    "vmla.f32   q11, %q16, q15  \n"

                    //                     "pld        [%3, #128]      \n"
                    "vld1.u16   {d30-d31}, [%3 :64] \n" // r24 r25

                    "vshll.u16  q14, d30, #16   \n"
                    "vshll.u16  q15, d31, #16   \n"

                    "vmla.f32   q13, %q15, q14  \n"
                    "vmla.f32   q12, %q16, q14  \n"
                    "vmla.f32   q13, %q16, q15  \n"

                    "vshrn.u32  d20, q10, #16   \n"
                    "vshrn.u32  d21, q11, #16   \n"
                    "vshrn.u32  d22, q12, #16   \n"
                    "vshrn.u32  d23, q13, #16   \n"

                    "vst1.u16   {d20-d23}, [%0 :64]! \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "q10", "q11", "q12", "q13", "q14", "q15");
#endif
            }
            for (; j + 1 < outw; j += 2)
            {
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v12.4h, v13.4h, v14.4h, v15.4h}, [%1] \n" // r00 r01 r02 r03

                    "mov    v18.16b, %17.16b            \n" // sum00
                    "mov    v19.16b, %17.16b            \n" // sum01

                    "shll   v12.4s, v12.4h, #16         \n"
                    "shll   v13.4s, v13.4h, #16         \n"

                    "fmul   v16.4s, %8.4s, v12.4s       \n"
                    "fmul   v17.4s, %8.4s, v13.4s       \n"

                    "shll   v14.4s, v14.4h, #16         \n"
                    "shll   v15.4s, v15.4h, #16         \n"

                    "fmla   v18.4s, %9.4s, v13.4s       \n"
                    "fmla   v19.4s, %9.4s, v14.4s       \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%2] \n" // r10 r11 r12 r13

                    "fmla   v16.4s, %10.4s, v14.4s      \n"
                    "fmla   v17.4s, %10.4s, v15.4s      \n"

                    "shll   v20.4s, v20.4h, #16         \n"
                    "shll   v21.4s, v21.4h, #16         \n"

                    "fmla   v18.4s, %11.4s, v20.4s      \n"
                    "fmla   v19.4s, %11.4s, v21.4s      \n"

                    "shll   v22.4s, v22.4h, #16         \n"
                    "shll   v23.4s, v23.4h, #16         \n"

                    "fmla   v16.4s, %12.4s, v21.4s      \n"
                    "fmla   v17.4s, %12.4s, v22.4s      \n"

                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v12.4h, v13.4h, v14.4h, v15.4h}, [%3] \n" // r20 r21 r22 r23

                    "fmla   v18.4s, %13.4s, v22.4s      \n"
                    "fmla   v19.4s, %13.4s, v23.4s      \n"

                    "shll   v12.4s, v12.4h, #16         \n"
                    "shll   v13.4s, v13.4h, #16         \n"

                    "fmla   v16.4s, %14.4s, v12.4s      \n"
                    "fmla   v17.4s, %14.4s, v13.4s      \n"

                    "shll   v14.4s, v14.4h, #16         \n"
                    "shll   v15.4s, v15.4h, #16         \n"

                    "fmla   v18.4s, %15.4s, v13.4s      \n"
                    "fmla   v19.4s, %15.4s, v14.4s      \n"

                    "add    %1, %1, #16                 \n"

                    "fmla   v16.4s, %16.4s, v14.4s      \n"
                    "fmla   v17.4s, %16.4s, v15.4s      \n"

                    "add    %2, %2, #16                 \n"

                    "fadd   v18.4s, v18.4s, v16.4s      \n"
                    "fadd   v19.4s, v19.4s, v17.4s      \n"

                    "add    %3, %3, #16                 \n"

                    "shrn   v18.4h, v18.4s, #16         \n"
                    "shrn   v19.4h, v19.4s, #16         \n"

                    "st1    {v18.4h, v19.4h}, [%0], #16 \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else
                asm volatile(
                    "pld        [%1, #256]      \n"
                    "vld1.u16   {d28-d31}, [%1 :64] \n" // r00 r01 r02 r03

                    "vmov       q10, %q17       \n" // sum00
                    "vmov       q11, %q17       \n" // sum01

                    "vshll.u16  q12, d28, #16   \n"
                    "vshll.u16  q13, d29, #16   \n"

                    "vmla.f32   q10, %q8, q12   \n"
                    "vmla.f32   q11, %q8, q13   \n"

                    "vshll.u16  q14, d30, #16   \n"

                    "vmla.f32   q10, %q9, q13   \n"
                    "vmla.f32   q11, %q9, q14   \n"

                    "vshll.u16  q15, d31, #16   \n"

                    "vmla.f32   q10, %q10, q14  \n"
                    "vmla.f32   q11, %q10, q15  \n"

                    "pld        [%2, #256]      \n"
                    "vld1.u16   {d28-d31}, [%2 :64] \n" // r10 r11 r12 r13

                    "vshll.u16  q12, d28, #16   \n"
                    "vshll.u16  q13, d29, #16   \n"

                    "vmla.f32   q10, %q11, q12  \n"
                    "vmla.f32   q11, %q11, q13  \n"

                    "vshll.u16  q14, d30, #16   \n"

                    "vmla.f32   q10, %q12, q13  \n"
                    "vmla.f32   q11, %q12, q14  \n"

                    "vshll.u16  q15, d31, #16   \n"

                    "vmla.f32   q10, %q13, q14  \n"
                    "vmla.f32   q11, %q13, q15  \n"

                    "pld        [%3, #256]      \n"
                    "vld1.u16   {d28-d31}, [%3 :64] \n" // r20 r21 r22 r23

                    "vshll.u16  q12, d28, #16   \n"
                    "vshll.u16  q13, d29, #16   \n"

                    "vmla.f32   q10, %q14, q12  \n"
                    "vmla.f32   q11, %q14, q13  \n"

                    "vshll.u16  q14, d30, #16   \n"

                    "vmla.f32   q10, %q15, q13  \n"
                    "vmla.f32   q11, %q15, q14  \n"

                    "vshll.u16  q15, d31, #16   \n"

                    "vmla.f32   q10, %q16, q14  \n"
                    "vmla.f32   q11, %q16, q15  \n"

                    "add        %1, %1, #16     \n"
                    "add        %2, %2, #16     \n"

                    "vshrn.u32  d20, q10, #16   \n"
                    "vshrn.u32  d21, q11, #16   \n"

                    "add        %3, %3, #16     \n"

                    "vst1.u16   {d20-d21}, [%0 :64]! \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "q10", "q11", "q12", "q13", "q14", "q15");
#endif
            }
            for (; j < outw; j++)
            {
                float32x4_t _sum0 = _bias0;

                float32x4_t _r00 = bfloat2float(vld1_u16(r0));
                float32x4_t _r01 = bfloat2float(vld1_u16(r0 + 4));
                float32x4_t _r02 = bfloat2float(vld1_u16(r0 + 8));
                float32x4_t _r10 = bfloat2float(vld1_u16(r1));
                float32x4_t _r11 = bfloat2float(vld1_u16(r1 + 4));
                float32x4_t _r12 = bfloat2float(vld1_u16(r1 + 8));
                float32x4_t _r20 = bfloat2float(vld1_u16(r2));
                float32x4_t _r21 = bfloat2float(vld1_u16(r2 + 4));
                float32x4_t _r22 = bfloat2float(vld1_u16(r2 + 8));

                _sum0 = vmlaq_f32(_sum0, _k00, _r00);
                _sum0 = vmlaq_f32(_sum0, _k01, _r01);
                _sum0 = vmlaq_f32(_sum0, _k02, _r02);
                _sum0 = vmlaq_f32(_sum0, _k10, _r10);
                _sum0 = vmlaq_f32(_sum0, _k11, _r11);
                _sum0 = vmlaq_f32(_sum0, _k12, _r12);
                _sum0 = vmlaq_f32(_sum0, _k20, _r20);
                _sum0 = vmlaq_f32(_sum0, _k21, _r21);
                _sum0 = vmlaq_f32(_sum0, _k22, _r22);

                vst1_u16(outptr0, float2bfloat(_sum0));

                r0 += 4;
                r1 += 4;
                r2 += 4;
                outptr0 += 4;
            }

            r0 += 2 * 4;
            r1 += 2 * 4;
            r2 += 2 * 4;
        }
    }
}

// 3x3 depthwise convolution, stride 2, over 4-lane packed 16-bit storage.
//
// Every "pixel" is a group of 4 unsigned short values (one per packed channel
// lane). Inputs and weights are widened to fp32 by placing the 16 stored bits
// in the upper half of a 32-bit lane (shll/vshll #16 in the asm paths,
// bfloat2float() in the intrinsic path), accumulated in fp32, and narrowed
// back by keeping the upper 16 bits (shrn/vshrn #16, float2bfloat()).
//
// bottom_blob  input; one channel per group, read three rows at a time
// top_blob     output; must already be sized, its w/h drive the loops
// kernel       per-group weights, 9 taps x 4 lanes of unsigned short per row
// _bias        optional fp32 bias, 4 floats per group; treated as zero if null
// opt          only opt.num_threads is used (OpenMP thread count)
//
// NOTE(review): the function performs no padding or bounds handling itself;
// it presumably relies on the caller to supply an input large enough for
// every 3x3 window that outw/outh imply - confirm against the caller.
static void convdw3x3s2_pack4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int group = bottom_blob.c;

    // Pointer adjustment applied after each output row, in unsigned short
    // elements. One output row advances the input pointers by 2 * outw pixels
    // (stride 2), so (w - 2 * outw) finishes the current input row and the
    // extra + w skips one whole input row (vertical stride 2). The * 4 turns
    // pixels into elements (4 lanes per pixel).
    // Assumes consecutive input rows are w * 4 elements apart - TODO confirm
    // against the Mat row layout.
    const int tailstep = (w - 2 * outw + w) * 4;

    const float* bias = _bias;

    // Groups are independent, so each one is processed by its own iteration
    // of the parallel loop.
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        // Accumulators start from the bias (or zero when no bias is given).
        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + g * 4) : vdupq_n_f32(0.f);

        const unsigned short* k0 = kernel.row<const unsigned short>(g);

        unsigned short* outptr0 = out;

        const Mat img0 = bottom_blob.channel(g);

        // Three consecutive input rows feeding the current output row.
        const unsigned short* r0 = img0.row<const unsigned short>(0);
        const unsigned short* r1 = img0.row<const unsigned short>(1);
        const unsigned short* r2 = img0.row<const unsigned short>(2);

        // The nine kernel taps, widened to fp32 once per group.
        // _kRC holds the tap for kernel row R, kernel column C.
        float32x4_t _k00 = bfloat2float(vld1_u16(k0));
        float32x4_t _k01 = bfloat2float(vld1_u16(k0 + 4));
        float32x4_t _k02 = bfloat2float(vld1_u16(k0 + 8));
        float32x4_t _k10 = bfloat2float(vld1_u16(k0 + 12));
        float32x4_t _k11 = bfloat2float(vld1_u16(k0 + 16));
        float32x4_t _k12 = bfloat2float(vld1_u16(k0 + 20));
        float32x4_t _k20 = bfloat2float(vld1_u16(k0 + 24));
        float32x4_t _k21 = bfloat2float(vld1_u16(k0 + 28));
        float32x4_t _k22 = bfloat2float(vld1_u16(k0 + 32));

        int i = 0;

        for (; i < outh; i++)
        {
            int j = 0;

            // Operand map shared by every asm block below:
            //   %0       outptr0 (read-write)
            //   %1..%3   r0..r2  (read-write)
            //   %4..%7   the matching-constraint inputs "0".."3" for the above
            //   %8..%16  _k00.._k22
            //   %17      _bias0

#if __aarch64__
            // AArch64 only: four outputs per iteration.
            // Each row contributes nine pixels (rX0..rX8). The first eight are
            // fetched with two post-incrementing 32-byte loads; the ninth is
            // loaded WITHOUT post-increment because it is also pixel 0 of the
            // next iteration. Net pointer advance per row: 64 bytes = 8 pixels.
            for (; j + 3 < outw; j += 4)
            {
                asm volatile(
                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v10.4h, v11.4h, v12.4h, v13.4h}, [%1], #32 \n" // r00 r01 r02 r03

                    "mov    v28.16b, %17.16b            \n" // sum00
                    "mov    v29.16b, %17.16b            \n" // sum01
                    "mov    v30.16b, %17.16b            \n" // sum02
                    "mov    v31.16b, %17.16b            \n" // sum03

                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v14.4h, v15.4h, v16.4h, v17.4h}, [%1], #32 \n" // r04 r05 r06 r07

                    "shll   v10.4s, v10.4h, #16         \n"
                    "shll   v11.4s, v11.4h, #16         \n"
                    "shll   v12.4s, v12.4h, #16         \n"
                    "shll   v13.4s, v13.4h, #16         \n"

                    "prfm   pldl1keep, [%1, #64]        \n"
                    "ld1    {v18.4h}, [%1]              \n" // r08

                    "shll   v14.4s, v14.4h, #16         \n"
                    "shll   v15.4s, v15.4h, #16         \n"

                    // kernel row 0, column 0: even pixels r00 r02 r04 r06
                    "fmla   v28.4s, %8.4s, v10.4s       \n"
                    "fmla   v29.4s, %8.4s, v12.4s       \n"

                    "shll   v16.4s, v16.4h, #16         \n"

                    "fmla   v30.4s, %8.4s, v14.4s       \n"
                    "fmla   v31.4s, %8.4s, v16.4s       \n"

                    "shll   v17.4s, v17.4h, #16         \n"

                    // kernel row 0, column 1: odd pixels r01 r03 r05 r07
                    "fmla   v28.4s, %9.4s, v11.4s       \n"
                    "fmla   v29.4s, %9.4s, v13.4s       \n"
                    "fmla   v30.4s, %9.4s, v15.4s       \n"
                    "fmla   v31.4s, %9.4s, v17.4s       \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%2], #32 \n" // r10 r11 r12 r13

                    // kernel row 0, column 2: even pixels r02 r04 r06 r08
                    "fmla   v28.4s, %10.4s, v12.4s      \n"
                    "fmla   v29.4s, %10.4s, v14.4s      \n"

                    "shll   v18.4s, v18.4h, #16         \n"

                    "fmla   v30.4s, %10.4s, v16.4s      \n"
                    "fmla   v31.4s, %10.4s, v18.4s      \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%2], #32 \n" // r14 r15 r16 r17

                    "shll   v20.4s, v20.4h, #16         \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "shll   v23.4s, v23.4h, #16         \n"

                    "prfm   pldl1keep, [%2, #64]        \n"
                    "ld1    {v19.4h}, [%2]              \n" // r18

                    "shll   v24.4s, v24.4h, #16         \n"
                    "shll   v25.4s, v25.4h, #16         \n"

                    // kernel row 1, column 0
                    "fmla   v28.4s, %11.4s, v20.4s      \n"
                    "fmla   v29.4s, %11.4s, v22.4s      \n"

                    "shll   v26.4s, v26.4h, #16         \n"

                    "fmla   v30.4s, %11.4s, v24.4s      \n"
                    "fmla   v31.4s, %11.4s, v26.4s      \n"

                    "shll   v27.4s, v27.4h, #16         \n"

                    // kernel row 1, column 1
                    "fmla   v28.4s, %12.4s, v21.4s      \n"
                    "fmla   v29.4s, %12.4s, v23.4s      \n"
                    "fmla   v30.4s, %12.4s, v25.4s      \n"
                    "fmla   v31.4s, %12.4s, v27.4s      \n"

                    // v10-v17 are reused for row 2 from here on.
                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v10.4h, v11.4h, v12.4h, v13.4h}, [%3], #32 \n" // r20 r21 r22 r23

                    // kernel row 1, column 2
                    "fmla   v28.4s, %13.4s, v22.4s      \n"
                    "fmla   v29.4s, %13.4s, v24.4s      \n"

                    "shll   v19.4s, v19.4h, #16         \n"

                    "fmla   v30.4s, %13.4s, v26.4s      \n"
                    "fmla   v31.4s, %13.4s, v19.4s      \n"

                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v14.4h, v15.4h, v16.4h, v17.4h}, [%3], #32 \n" // r24 r25 r26 r27

                    "shll   v10.4s, v10.4h, #16         \n"
                    "shll   v11.4s, v11.4h, #16         \n"
                    "shll   v12.4s, v12.4h, #16         \n"
                    "shll   v13.4s, v13.4h, #16         \n"

                    "prfm   pldl1keep, [%3, #64]        \n"
                    "ld1    {v18.4h}, [%3]              \n" // r28

                    "shll   v14.4s, v14.4h, #16         \n"
                    "shll   v15.4s, v15.4h, #16         \n"

                    // kernel row 2, column 0
                    "fmla   v28.4s, %14.4s, v10.4s      \n"
                    "fmla   v29.4s, %14.4s, v12.4s      \n"

                    "shll   v16.4s, v16.4h, #16         \n"

                    "fmla   v30.4s, %14.4s, v14.4s      \n"
                    "fmla   v31.4s, %14.4s, v16.4s      \n"

                    "shll   v17.4s, v17.4h, #16         \n"

                    // kernel row 2, column 1
                    "fmla   v28.4s, %15.4s, v11.4s      \n"
                    "fmla   v29.4s, %15.4s, v13.4s      \n"
                    "fmla   v30.4s, %15.4s, v15.4s      \n"
                    "fmla   v31.4s, %15.4s, v17.4s      \n"

                    // kernel row 2, column 2
                    "fmla   v28.4s, %16.4s, v12.4s      \n"
                    "fmla   v29.4s, %16.4s, v14.4s      \n"

                    "shll   v18.4s, v18.4h, #16         \n"

                    "fmla   v30.4s, %16.4s, v16.4s      \n"
                    "fmla   v31.4s, %16.4s, v18.4s      \n"

                    // Narrow back to 16 bits by keeping the upper half of
                    // each fp32 lane (plain shift, no rounding).
                    "shrn   v28.4h, v28.4s, #16         \n"
                    "shrn   v29.4h, v29.4s, #16         \n"
                    "shrn   v30.4h, v30.4s, #16         \n"
                    "shrn   v31.4h, v31.4s, #16         \n"

                    // Store four output pixels (32 bytes) and advance outptr0.
                    "st1    {v28.4h, v29.4h, v30.4h, v31.4h}, [%0], #32 \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
#endif // __aarch64__
            // Two outputs per iteration (both architectures).
            // Each row contributes five pixels (rX0..rX4): four via one
            // post-incrementing 32-byte load, the fifth loaded without
            // post-increment so the next iteration starts on it.
            // Net pointer advance per row: 32 bytes = 4 pixels.
            for (; j + 1 < outw; j += 2)
            {
#if __aarch64__
                // Each output uses two accumulators that are summed at the end:
                // v20/v21 start from the first product (fmul), v22/v23 start
                // from the bias.
                asm volatile(
                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v10.4h, v11.4h, v12.4h, v13.4h}, [%1], #32 \n" // r00 r01 r02 r03

                    "mov    v22.16b, %17.16b            \n" // sum00
                    "mov    v23.16b, %17.16b            \n" // sum01

                    "shll   v10.4s, v10.4h, #16         \n"
                    "shll   v11.4s, v11.4h, #16         \n"

                    "fmul   v20.4s, %8.4s, v10.4s       \n"

                    "shll   v12.4s, v12.4h, #16         \n"
                    "shll   v13.4s, v13.4h, #16         \n"

                    "fmul   v21.4s, %8.4s, v12.4s       \n"

                    "prfm   pldl1keep, [%1, #64]        \n"
                    "ld1    {v14.4h}, [%1]              \n" // r04

                    "fmla   v22.4s, %9.4s, v11.4s       \n"
                    "fmla   v23.4s, %9.4s, v13.4s       \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%2], #32 \n" // r10 r11 r12 r13

                    "shll   v14.4s, v14.4h, #16         \n"

                    "fmla   v20.4s, %10.4s, v12.4s      \n"
                    "fmla   v21.4s, %10.4s, v14.4s      \n"

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"

                    "fmla   v22.4s, %11.4s, v16.4s      \n"

                    "shll   v18.4s, v18.4h, #16         \n"
                    "shll   v19.4s, v19.4h, #16         \n"

                    "fmla   v23.4s, %11.4s, v18.4s      \n"

                    "prfm   pldl1keep, [%2, #64]        \n"
                    "ld1    {v15.4h}, [%2]              \n" // r14

                    "fmla   v20.4s, %12.4s, v17.4s      \n"
                    "fmla   v21.4s, %12.4s, v19.4s      \n"

                    // v10-v13 are reused for row 2 from here on.
                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v10.4h, v11.4h, v12.4h, v13.4h}, [%3], #32 \n" // r20 r21 r22 r23

                    "shll   v15.4s, v15.4h, #16         \n"

                    "fmla   v22.4s, %13.4s, v18.4s      \n"
                    "fmla   v23.4s, %13.4s, v15.4s      \n"

                    "shll   v10.4s, v10.4h, #16         \n"
                    "shll   v11.4s, v11.4h, #16         \n"

                    "fmla   v20.4s, %14.4s, v10.4s      \n"

                    "shll   v12.4s, v12.4h, #16         \n"
                    "shll   v13.4s, v13.4h, #16         \n"

                    "fmla   v21.4s, %14.4s, v12.4s      \n"

                    "prfm   pldl1keep, [%3, #64]        \n"
                    "ld1    {v14.4h}, [%3]              \n" // r24

                    "fmla   v22.4s, %15.4s, v11.4s      \n"
                    "fmla   v23.4s, %15.4s, v13.4s      \n"

                    "shll   v14.4s, v14.4h, #16         \n"

                    "fmla   v20.4s, %16.4s, v12.4s      \n"
                    "fmla   v21.4s, %16.4s, v14.4s      \n"

                    // Combine the two partial sums of each output.
                    "fadd   v22.4s, v20.4s, v22.4s      \n"
                    "fadd   v23.4s, v21.4s, v23.4s      \n"

                    // Keep the upper 16 bits of each fp32 lane.
                    "shrn   v22.4h, v22.4s, #16         \n"
                    "shrn   v23.4h, v23.4s, #16         \n"

                    // Store two output pixels (16 bytes) and advance outptr0.
                    "st1    {v22.4h, v23.4h}, [%0], #16 \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else
                // ARMv7 variant: q10/q11 are the two accumulators, q12-q15
                // hold the widened pixels. The raw 16-bit data is loaded into
                // d28-d31 (the storage of q14/q15) and widened in an order
                // that never overwrites a d register before it has been read.
                // Note that "%q10" is asm operand 10 (_k02), whereas a bare
                // "q10" is the physical register.
                asm volatile(
                    "pld        [%1, #256]      \n"
                    "vld1.u16   {d28-d31}, [%1 :64]! \n" // r00 r01 r02 r03

                    "vmov       q10, %q17       \n" // sum00
                    "vmov       q11, %q17       \n" // sum01

                    "vshll.u16  q12, d28, #16   \n"
                    "vshll.u16  q13, d29, #16   \n"

                    "vmla.f32   q10, %q8, q12   \n"

                    "vshll.u16  q14, d30, #16   \n"
                    "vshll.u16  q15, d31, #16   \n"

                    "vmla.f32   q11, %q8, q14   \n"

                    // The fifth pixel goes into d25 (upper half of q12) and is
                    // widened in place into q12 once q12's old value is dead.
                    "vld1.u16   {d25}, [%1]     \n" // r04

                    "vmla.f32   q10, %q9, q13   \n"
                    "vmla.f32   q11, %q9, q15   \n"

                    "vshll.u16  q12, d25, #16   \n"

                    "vmla.f32   q10, %q10, q14  \n"

                    "pld        [%2, #256]      \n"
                    "vld1.u16   {d28-d31}, [%2 :64]! \n" // r10 r11 r12 r13

                    "vmla.f32   q11, %q10, q12  \n"

                    "vshll.u16  q12, d28, #16   \n"
                    "vshll.u16  q13, d29, #16   \n"

                    "vmla.f32   q10, %q11, q12  \n"

                    "vshll.u16  q14, d30, #16   \n"
                    "vshll.u16  q15, d31, #16   \n"

                    "vmla.f32   q11, %q11, q14  \n"

                    "vld1.u16   {d25}, [%2]     \n" // r14

                    "vmla.f32   q10, %q12, q13  \n"
                    "vmla.f32   q11, %q12, q15  \n"

                    "vshll.u16  q12, d25, #16   \n"

                    "vmla.f32   q10, %q13, q14  \n"

                    "pld        [%3, #256]      \n"
                    "vld1.u16   {d28-d31}, [%3 :64]! \n" // r20 r21 r22 r23

                    "vmla.f32   q11, %q13, q12  \n"

                    "vshll.u16  q12, d28, #16   \n"
                    "vshll.u16  q13, d29, #16   \n"

                    "vmla.f32   q10, %q14, q12  \n"

                    "vshll.u16  q14, d30, #16   \n"
                    "vshll.u16  q15, d31, #16   \n"

                    "vmla.f32   q11, %q14, q14  \n"

                    "vld1.u16   {d25}, [%3]     \n" // r24

                    "vmla.f32   q10, %q15, q13  \n"
                    "vmla.f32   q11, %q15, q15  \n"

                    "vshll.u16  q12, d25, #16   \n"

                    "vmla.f32   q10, %q16, q14  \n"
                    "vmla.f32   q11, %q16, q12  \n"

                    // Keep the upper 16 bits of each fp32 lane; both results
                    // end up packed in d20-d21.
                    "vshrn.u32  d20, q10, #16   \n"
                    "vshrn.u32  d21, q11, #16   \n"

                    // Store two output pixels (16 bytes) and advance outptr0.
                    "vst1.u16   {d20-d21}, [%0 :64]! \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "q10", "q11", "q12", "q13", "q14", "q15");
#endif
            }
            // Remaining output columns, one at a time, using intrinsics.
            for (; j < outw; j++)
            {
                float32x4_t _sum0 = _bias0;

                // The 3x3 window: three adjacent pixels from each of the
                // three rows, widened to fp32.
                float32x4_t _r00 = bfloat2float(vld1_u16(r0));
                float32x4_t _r01 = bfloat2float(vld1_u16(r0 + 4));
                float32x4_t _r02 = bfloat2float(vld1_u16(r0 + 8));
                float32x4_t _r10 = bfloat2float(vld1_u16(r1));
                float32x4_t _r11 = bfloat2float(vld1_u16(r1 + 4));
                float32x4_t _r12 = bfloat2float(vld1_u16(r1 + 8));
                float32x4_t _r20 = bfloat2float(vld1_u16(r2));
                float32x4_t _r21 = bfloat2float(vld1_u16(r2 + 4));
                float32x4_t _r22 = bfloat2float(vld1_u16(r2 + 8));

                _sum0 = vmlaq_f32(_sum0, _k00, _r00);
                _sum0 = vmlaq_f32(_sum0, _k01, _r01);
                _sum0 = vmlaq_f32(_sum0, _k02, _r02);
                _sum0 = vmlaq_f32(_sum0, _k10, _r10);
                _sum0 = vmlaq_f32(_sum0, _k11, _r11);
                _sum0 = vmlaq_f32(_sum0, _k12, _r12);
                _sum0 = vmlaq_f32(_sum0, _k20, _r20);
                _sum0 = vmlaq_f32(_sum0, _k21, _r21);
                _sum0 = vmlaq_f32(_sum0, _k22, _r22);

                vst1_u16(outptr0, float2bfloat(_sum0));

                // Stride 2: step two input pixels per output pixel.
                r0 += 2 * 4;
                r1 += 2 * 4;
                r2 += 2 * 4;
                outptr0 += 4;
            }

            // Move all three row pointers to the start of the next window
            // (two input rows further down; see tailstep above).
            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
        }
    }
}


================================================
FILE: src/layer/arm/convolutiondepthwise_3x3_pack8_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 3x3 depthwise convolution, stride 1, on data packed 8 fp16 lanes per element.
//
// bottom_blob : input; one channel per group, read as rows of 8-lane __fp16 vectors.
// top_blob    : output; written as rows of 8-lane __fp16 vectors (outw x outh per channel).
// kernel      : row g holds the nine 8-lane taps for group g, in the order
//               k00 k01 k02 k10 k11 k12 k20 k21 k22.
// _bias       : per-group 8-lane bias; when the bias pointer is null the
//               accumulators start from zero instead.
// opt         : only opt.num_threads is used here (OpenMP thread count).
//
// All multiply-accumulates are done on .8h lanes, i.e. accumulation is in fp16.
// Output rows are produced two at a time from four input rows (r0..r3); a
// trailing single-row loop handles an odd outh. Within a row, columns are
// produced in groups of 4, then 2, then 1.
//
// NOTE(review): the row-advance arithmetic below (2 * 8 extra elements per row)
// looks like it assumes w == outw + 2, i.e. the input is already padded by the
// caller - confirm against the call site.
static void convdw3x3s1_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    // depthwise: one group per input channel
    const int group = bottom_blob.c;

    const __fp16* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        // accumulator seed: the group's 8-lane bias, or zero when there is no bias
        float16x8_t _bias0 = bias ? vld1q_f16(bias + g * 8) : vdupq_n_f16((__fp16)0.f);

        const __fp16* k0 = kernel.row<const __fp16>(g);

        // two output rows are written per iteration of the main loop
        __fp16* outptr0 = out.row<__fp16>(0);
        __fp16* outptr1 = out.row<__fp16>(1);

        const Mat img0 = bottom_blob.channel(g);

        // four consecutive input rows: r0..r2 feed output row i, r1..r3 feed output row i+1
        const __fp16* r0 = img0.row<const __fp16>(0);
        const __fp16* r1 = img0.row<const __fp16>(1);
        const __fp16* r2 = img0.row<const __fp16>(2);
        const __fp16* r3 = img0.row<const __fp16>(3);

        // the nine kernel taps, each an 8-lane fp16 vector, kept live across all asm blocks
        float16x8_t _k00 = vld1q_f16(k0);
        float16x8_t _k01 = vld1q_f16(k0 + 8);
        float16x8_t _k02 = vld1q_f16(k0 + 16);
        float16x8_t _k10 = vld1q_f16(k0 + 24);
        float16x8_t _k11 = vld1q_f16(k0 + 32);
        float16x8_t _k12 = vld1q_f16(k0 + 40);
        float16x8_t _k20 = vld1q_f16(k0 + 48);
        float16x8_t _k21 = vld1q_f16(k0 + 56);
        float16x8_t _k22 = vld1q_f16(k0 + 64);

        int i = 0;
        // main loop: two output rows per iteration
        for (; i + 1 < outh; i += 2)
        {
            int j = 0;
            // 2 rows x 4 columns per iteration.
            // Operands: %0/%1 = outptr0/outptr1, %2..%5 = r0..r3,
            // %12..%20 = k00..k22, %21 = bias.
            // v24..v27 accumulate the upper output row, v28..v31 the lower one.
            // Each input row is read as 4 vectors with post-increment (64 bytes)
            // plus 2 look-ahead vectors without increment, giving the 6 columns
            // a 3-wide window needs for 4 adjacent outputs.
            for (; j + 3 < outw; j += 4)
            {
                asm volatile(
                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%3], #64 \n" // r10 r11 r12 r13

                    "mov    v24.16b, %21.16b            \n" // sum00
                    "mov    v25.16b, %21.16b            \n" // sum01
                    "mov    v26.16b, %21.16b            \n" // sum02
                    "mov    v27.16b, %21.16b            \n" // sum03

                    "fmla   v24.8h, %15.8h, v12.8h      \n"
                    "fmla   v25.8h, %15.8h, v13.8h      \n"

                    "mov    v28.16b, %21.16b            \n" // sum10
                    "mov    v29.16b, %21.16b            \n" // sum11
                    "mov    v30.16b, %21.16b            \n" // sum12
                    "mov    v31.16b, %21.16b            \n" // sum13

                    "fmla   v26.8h, %15.8h, v14.8h      \n"
                    "fmla   v27.8h, %15.8h, v15.8h      \n"

                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v16.8h, v17.8h}, [%3]      \n" // r14 r15

                    "fmla   v28.8h, %12.8h, v12.8h      \n"
                    "fmla   v29.8h, %12.8h, v13.8h      \n"
                    "fmla   v30.8h, %12.8h, v14.8h      \n"
                    "fmla   v31.8h, %12.8h, v15.8h      \n"

                    "fmla   v24.8h, %16.8h, v13.8h      \n"
                    "fmla   v25.8h, %16.8h, v14.8h      \n"
                    "fmla   v26.8h, %16.8h, v15.8h      \n"
                    "fmla   v27.8h, %16.8h, v16.8h      \n"

                    "fmla   v28.8h, %13.8h, v13.8h      \n"
                    "fmla   v29.8h, %13.8h, v14.8h      \n"
                    "fmla   v30.8h, %13.8h, v15.8h      \n"
                    "fmla   v31.8h, %13.8h, v16.8h      \n"

                    "prfm   pldl1keep, [%4, #512]       \n"
                    "ld1    {v18.8h, v19.8h, v20.8h, v21.8h}, [%4], #64 \n" // r20 r21 r22 r23

                    "fmla   v24.8h, %17.8h, v14.8h      \n"
                    "fmla   v25.8h, %17.8h, v15.8h      \n"
                    "fmla   v26.8h, %17.8h, v16.8h      \n"
                    "fmla   v27.8h, %17.8h, v17.8h      \n"

                    "fmla   v28.8h, %14.8h, v14.8h      \n"
                    "fmla   v29.8h, %14.8h, v15.8h      \n"
                    "fmla   v30.8h, %14.8h, v16.8h      \n"
                    "fmla   v31.8h, %14.8h, v17.8h      \n"

                    "fmla   v24.8h, %18.8h, v18.8h      \n"
                    "fmla   v25.8h, %18.8h, v19.8h      \n"
                    "fmla   v26.8h, %18.8h, v20.8h      \n"
                    "fmla   v27.8h, %18.8h, v21.8h      \n"

                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v22.8h, v23.8h}, [%4]      \n" // r24 r25

                    "fmla   v28.8h, %15.8h, v18.8h      \n"
                    "fmla   v29.8h, %15.8h, v19.8h      \n"
                    "fmla   v30.8h, %15.8h, v20.8h      \n"
                    "fmla   v31.8h, %15.8h, v21.8h      \n"

                    "fmla   v24.8h, %19.8h, v19.8h      \n"
                    "fmla   v25.8h, %19.8h, v20.8h      \n"
                    "fmla   v26.8h, %19.8h, v21.8h      \n"
                    "fmla   v27.8h, %19.8h, v22.8h      \n"

                    "fmla   v28.8h, %16.8h, v19.8h      \n"
                    "fmla   v29.8h, %16.8h, v20.8h      \n"
                    "fmla   v30.8h, %16.8h, v21.8h      \n"
                    "fmla   v31.8h, %16.8h, v22.8h      \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%2], #64 \n" // r00 r01 r02 r03

                    "fmla   v24.8h, %20.8h, v20.8h      \n"
                    "fmla   v25.8h, %20.8h, v21.8h      \n"
                    "fmla   v26.8h, %20.8h, v22.8h      \n"
                    "fmla   v27.8h, %20.8h, v23.8h      \n"

                    "fmla   v28.8h, %17.8h, v20.8h      \n"
                    "fmla   v29.8h, %17.8h, v21.8h      \n"
                    "fmla   v30.8h, %17.8h, v22.8h      \n"
                    "fmla   v31.8h, %17.8h, v23.8h      \n"

                    "prfm   pldl1keep, [%5, #512]       \n"
                    "ld1    {v18.8h, v19.8h, v20.8h, v21.8h}, [%5], #64 \n" // r30 r31 r32 r33

                    "fmla   v24.8h, %12.8h, v12.8h      \n"
                    "fmla   v25.8h, %12.8h, v13.8h      \n"
                    "fmla   v26.8h, %12.8h, v14.8h      \n"
                    "fmla   v27.8h, %12.8h, v15.8h      \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v16.8h, v17.8h}, [%2]      \n" // r04 r05

                    "fmla   v28.8h, %18.8h, v18.8h      \n"
                    "fmla   v29.8h, %18.8h, v19.8h      \n"
                    "fmla   v30.8h, %18.8h, v20.8h      \n"
                    "fmla   v31.8h, %18.8h, v21.8h      \n"

                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v22.8h, v23.8h}, [%5]      \n" // r34 r35

                    "fmla   v24.8h, %13.8h, v13.8h      \n"
                    "fmla   v25.8h, %13.8h, v14.8h      \n"
                    "fmla   v26.8h, %13.8h, v15.8h      \n"
                    "fmla   v27.8h, %13.8h, v16.8h      \n"

                    "fmla   v28.8h, %19.8h, v19.8h      \n"
                    "fmla   v29.8h, %19.8h, v20.8h      \n"
                    "fmla   v30.8h, %19.8h, v21.8h      \n"
                    "fmla   v31.8h, %19.8h, v22.8h      \n"

                    "fmla   v24.8h, %14.8h, v14.8h      \n"
                    "fmla   v25.8h, %14.8h, v15.8h      \n"
                    "fmla   v26.8h, %14.8h, v16.8h      \n"
                    "fmla   v27.8h, %14.8h, v17.8h      \n"

                    "fmla   v28.8h, %20.8h, v20.8h      \n"
                    "fmla   v29.8h, %20.8h, v21.8h      \n"
                    "fmla   v30.8h, %20.8h, v22.8h      \n"
                    "fmla   v31.8h, %20.8h, v23.8h      \n"

                    "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                    "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%1], #64 \n"

                    : "=r"(outptr0), // %0
                    "=r"(outptr1), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3)       // %5
                    : "0"(outptr0),
                    "1"(outptr1),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "w"(_k00),  // %12
                    "w"(_k01),  // %13
                    "w"(_k02),  // %14
                    "w"(_k10),  // %15
                    "w"(_k11),  // %16
                    "w"(_k12),  // %17
                    "w"(_k20),  // %18
                    "w"(_k21),  // %19
                    "w"(_k22),  // %20
                    "w"(_bias0) // %21
                    : "memory", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            // 2 rows x 2 columns per iteration.
            // Same operand numbering as above. Each input row is loaded as 4
            // vectors without post-increment; the pointers are advanced by
            // 32 bytes (2 columns) with explicit adds at the end.
            // v28/v29 accumulate the upper output row, v30/v31 the lower one.
            for (; j + 1 < outw; j += 2)
            {
                asm volatile(
                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%3] \n" // r10 r11 r12 r13

                    "mov    v28.16b, %21.16b            \n" // sum00
                    "mov    v29.16b, %21.16b            \n" // sum01
                    "mov    v30.16b, %21.16b            \n" // sum10
                    "mov    v31.16b, %21.16b            \n" // sum11

                    "fmla   v28.8h, %15.8h, v16.8h      \n"
                    "fmla   v30.8h, %12.8h, v16.8h      \n"
                    "fmla   v29.8h, %15.8h, v17.8h      \n"
                    "fmla   v31.8h, %12.8h, v17.8h      \n"

                    "prfm   pldl1keep, [%4, #512]       \n"
                    "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4] \n" // r20 r21 r22 r23

                    "fmla   v28.8h, %16.8h, v17.8h      \n"
                    "fmla   v30.8h, %13.8h, v17.8h      \n"
                    "fmla   v29.8h, %16.8h, v18.8h      \n"
                    "fmla   v31.8h, %13.8h, v18.8h      \n"

                    "fmla   v28.8h, %17.8h, v18.8h      \n"
                    "fmla   v30.8h, %14.8h, v18.8h      \n"
                    "fmla   v29.8h, %17.8h, v19.8h      \n"
                    "fmla   v31.8h, %14.8h, v19.8h      \n"

                    "fmla   v28.8h, %18.8h, v20.8h      \n"
                    "fmla   v30.8h, %15.8h, v20.8h      \n"
                    "fmla   v29.8h, %18.8h, v21.8h      \n"
                    "fmla   v31.8h, %15.8h, v21.8h      \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%2] \n" // r00 r01 r02 r03

                    "fmla   v28.8h, %19.8h, v21.8h      \n"
                    "fmla   v30.8h, %16.8h, v21.8h      \n"
                    "fmla   v29.8h, %19.8h, v22.8h      \n"
                    "fmla   v31.8h, %16.8h, v22.8h      \n"

                    "prfm   pldl1keep, [%5, #512]       \n"
                    "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%5] \n" // r30 r31 r32 r33

                    "fmla   v28.8h, %20.8h, v22.8h      \n"
                    "fmla   v30.8h, %17.8h, v22.8h      \n"
                    "fmla   v29.8h, %20.8h, v23.8h      \n"
                    "fmla   v31.8h, %17.8h, v23.8h      \n"

                    "fmla   v28.8h, %12.8h, v12.8h      \n"
                    "fmla   v30.8h, %18.8h, v24.8h      \n"
                    "fmla   v29.8h, %12.8h, v13.8h      \n"
                    "fmla   v31.8h, %18.8h, v25.8h      \n"
                    "fmla   v28.8h, %13.8h, v13.8h      \n"
                    "fmla   v30.8h, %19.8h, v25.8h      \n"
                    "fmla   v29.8h, %13.8h, v14.8h      \n"
                    "fmla   v31.8h, %19.8h, v26.8h      \n"
                    "fmla   v28.8h, %14.8h, v14.8h      \n"
                    "fmla   v30.8h, %20.8h, v26.8h      \n"
                    "fmla   v29.8h, %14.8h, v15.8h      \n"
                    "fmla   v31.8h, %20.8h, v27.8h      \n"

                    "add    %2, %2, #32                 \n"
                    "add    %3, %3, #32                 \n"
                    "add    %4, %4, #32                 \n"
                    "add    %5, %5, #32                 \n"

                    "st1    {v28.8h, v29.8h}, [%0], #32 \n"
                    "st1    {v30.8h, v31.8h}, [%1], #32 \n"

                    : "=r"(outptr0), // %0
                    "=r"(outptr1), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3)       // %5
                    : "0"(outptr0),
                    "1"(outptr1),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "w"(_k00),  // %12
                    "w"(_k01),  // %13
                    "w"(_k02),  // %14
                    "w"(_k10),  // %15
                    "w"(_k11),  // %16
                    "w"(_k12),  // %17
                    "w"(_k20),  // %18
                    "w"(_k21),  // %19
                    "w"(_k22),  // %20
                    "w"(_bias0) // %21
                    : "memory", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            // 2 rows x 1 column per iteration.
            // Each output uses two accumulators: v28/v30 start from the bias,
            // v29/v31 start from an fmul; the pairs are summed with fadd before
            // the store. Pointers advance by 16 bytes (1 column).
            for (; j < outw; j++)
            {
                asm volatile(
                    "prfm   pldl1keep, [%3, #384]       \n"
                    "ld1    {v15.8h, v16.8h, v17.8h}, [%3] \n" // r10 r11 r12

                    "mov    v28.16b, %21.16b            \n" // sum00
                    "mov    v30.16b, %21.16b            \n" // sum10

                    "fmul   v29.8h, %15.8h, v15.8h      \n"
                    "fmul   v31.8h, %12.8h, v15.8h      \n"

                    "prfm   pldl1keep, [%4, #384]       \n"
                    "ld1    {v18.8h, v19.8h, v20.8h}, [%4] \n" // r20 r21 r22

                    "fmla   v28.8h, %16.8h, v16.8h      \n"
                    "fmla   v30.8h, %13.8h, v16.8h      \n"

                    "fmla   v29.8h, %17.8h, v17.8h      \n"
                    "fmla   v31.8h, %14.8h, v17.8h      \n"

                    "prfm   pldl1keep, [%2, #384]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h}, [%2] \n" // r00 r01 r02

                    "fmla   v28.8h, %18.8h, v18.8h      \n"
                    "fmla   v30.8h, %15.8h, v18.8h      \n"

                    "fmla   v29.8h, %19.8h, v19.8h      \n"
                    "fmla   v31.8h, %16.8h, v19.8h      \n"

                    "prfm   pldl1keep, [%5, #384]       \n"
                    "ld1    {v21.8h, v22.8h, v23.8h}, [%5] \n" // r30 r31 r32

                    "fmla   v28.8h, %20.8h, v20.8h      \n"
                    "fmla   v30.8h, %17.8h, v20.8h      \n"

                    "fmla   v29.8h, %12.8h, v12.8h      \n"
                    "fmla   v31.8h, %18.8h, v21.8h      \n"
                    "fmla   v28.8h, %13.8h, v13.8h      \n"
                    "fmla   v30.8h, %19.8h, v22.8h      \n"
                    "fmla   v29.8h, %14.8h, v14.8h      \n"
                    "fmla   v31.8h, %20.8h, v23.8h      \n"

                    "add    %2, %2, #16                 \n"
                    "add    %3, %3, #16                 \n"

                    "fadd   v28.8h, v28.8h, v29.8h      \n"
                    "fadd   v30.8h, v30.8h, v31.8h      \n"

                    "add    %4, %4, #16                 \n"
                    "add    %5, %5, #16                 \n"

                    "st1    {v28.8h}, [%0], #16         \n"
                    "st1    {v30.8h}, [%1], #16         \n"

                    : "=r"(outptr0), // %0
                    "=r"(outptr1), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3)       // %5
                    : "0"(outptr0),
                    "1"(outptr1),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "w"(_k00),  // %12
                    "w"(_k01),  // %13
                    "w"(_k02),  // %14
                    "w"(_k10),  // %15
                    "w"(_k11),  // %16
                    "w"(_k12),  // %17
                    "w"(_k20),  // %18
                    "w"(_k21),  // %19
                    "w"(_k22),  // %20
                    "w"(_bias0) // %21
                    : "memory", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31");
            }

            // The column loops advanced each input pointer by outw * 8 elements.
            // Add 2 * 8 for the remaining columns of the row (presumably
            // w == outw + 2 - TODO confirm) plus w * 8 to skip one more full
            // row, since two output rows were produced.
            r0 += 2 * 8 + w * 8;
            r1 += 2 * 8 + w * 8;
            r2 += 2 * 8 + w * 8;
            r3 += 2 * 8 + w * 8;

            // each output pointer skips the row just written through the other one
            outptr0 += outw * 8;
            outptr1 += outw * 8;
        }
        // tail loop: one remaining output row (odd outh), using r0..r2 and outptr0 only
        for (; i < outh; i++)
        {
            int j = 0;
            // 1 row x 4 columns per iteration.
            // Operands: %0 = outptr0, %1..%3 = r0..r2,
            // %8..%16 = k00..k22, %17 = bias. v28..v31 are the four accumulators.
            for (; j + 3 < outw; j += 4)
            {
                asm volatile(
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%1], #64 \n" // r00 r01 r02 r03

                    "mov    v28.16b, %17.16b            \n" // sum00
                    "mov    v29.16b, %17.16b            \n" // sum01
                    "mov    v30.16b, %17.16b            \n" // sum02
                    "mov    v31.16b, %17.16b            \n" // sum03

                    "fmla   v28.8h, %8.8h, v12.8h       \n"
                    "fmla   v29.8h, %8.8h, v13.8h       \n"
                    "fmla   v30.8h, %8.8h, v14.8h       \n"
                    "fmla   v31.8h, %8.8h, v15.8h       \n"

                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v16.8h, v17.8h}, [%1]      \n" // r04 r05

                    "fmla   v28.8h, %9.8h, v13.8h       \n"
                    "fmla   v29.8h, %9.8h, v14.8h       \n"
                    "fmla   v30.8h, %9.8h, v15.8h       \n"
                    "fmla   v31.8h, %9.8h, v16.8h       \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v18.8h, v19.8h, v20.8h, v21.8h}, [%2], #64 \n" // r10 r11 r12 r13

                    "fmla   v28.8h, %10.8h, v14.8h      \n"
                    "fmla   v29.8h, %10.8h, v15.8h      \n"
                    "fmla   v30.8h, %10.8h, v16.8h      \n"
                    "fmla   v31.8h, %10.8h, v17.8h      \n"

                    "fmla   v28.8h, %11.8h, v18.8h      \n"
                    "fmla   v29.8h, %11.8h, v19.8h      \n"
                    "fmla   v30.8h, %11.8h, v20.8h      \n"
                    "fmla   v31.8h, %11.8h, v21.8h      \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v22.8h, v23.8h}, [%2]      \n" // r14 r15

                    "fmla   v28.8h, %12.8h, v19.8h      \n"
                    "fmla   v29.8h, %12.8h, v20.8h      \n"
                    "fmla   v30.8h, %12.8h, v21.8h      \n"
                    "fmla   v31.8h, %12.8h, v22.8h      \n"

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%3], #64 \n" // r20 r21 r22 r23

                    "fmla   v28.8h, %13.8h, v20.8h      \n"
                    "fmla   v29.8h, %13.8h, v21.8h      \n"
                    "fmla   v30.8h, %13.8h, v22.8h      \n"
                    "fmla   v31.8h, %13.8h, v23.8h      \n"

                    "fmla   v28.8h, %14.8h, v12.8h      \n"
                    "fmla   v29.8h, %14.8h, v13.8h      \n"
                    "fmla   v30.8h, %14.8h, v14.8h      \n"
                    "fmla   v31.8h, %14.8h, v15.8h      \n"

                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v16.8h, v17.8h}, [%3]      \n" // r24 r25

                    "fmla   v28.8h, %15.8h, v13.8h      \n"
                    "fmla   v29.8h, %15.8h, v14.8h      \n"
                    "fmla   v30.8h, %15.8h, v15.8h      \n"
                    "fmla   v31.8h, %15.8h, v16.8h      \n"

                    "fmla   v28.8h, %16.8h, v14.8h      \n"
                    "fmla   v29.8h, %16.8h, v15.8h      \n"
                    "fmla   v30.8h, %16.8h, v16.8h      \n"
                    "fmla   v31.8h, %16.8h, v17.8h      \n"

                    "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31");
            }
            // 1 row x 2 columns per iteration.
            // v28/v29 start from the bias and v30/v31 from an fmul; the nine
            // taps are split between the two accumulator pairs and merged with
            // fadd before the store.
            for (; j + 1 < outw; j += 2)
            {
                asm volatile(
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%1] \n" // r00 r01 r02 r03

                    "mov    v28.16b, %17.16b            \n" // sum00
                    "mov    v29.16b, %17.16b            \n" // sum01

                    "fmul   v30.8h, %8.8h, v12.8h       \n"
                    "fmul   v31.8h, %8.8h, v13.8h       \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%2] \n" // r10 r11 r12 r13

                    "fmla   v28.8h, %9.8h, v13.8h       \n"
                    "fmla   v29.8h, %9.8h, v14.8h       \n"
                    "fmla   v30.8h, %10.8h, v14.8h      \n"
                    "fmla   v31.8h, %10.8h, v15.8h      \n"

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%3] \n" // r20 r21 r22 r23

                    "fmla   v28.8h, %11.8h, v16.8h      \n"
                    "fmla   v29.8h, %11.8h, v17.8h      \n"
                    "fmla   v30.8h, %12.8h, v17.8h      \n"
                    "fmla   v31.8h, %12.8h, v18.8h      \n"
                    "fmla   v28.8h, %13.8h, v18.8h      \n"
                    "fmla   v29.8h, %13.8h, v19.8h      \n"

                    "fmla   v30.8h, %14.8h, v20.8h      \n"
                    "fmla   v31.8h, %14.8h, v21.8h      \n"
                    "fmla   v28.8h, %15.8h, v21.8h      \n"
                    "fmla   v29.8h, %15.8h, v22.8h      \n"
                    "fmla   v30.8h, %16.8h, v22.8h      \n"
                    "fmla   v31.8h, %16.8h, v23.8h      \n"

                    "add    %1, %1, #32                 \n"

                    "fadd   v28.8h, v28.8h, v30.8h      \n"
                    "fadd   v29.8h, v29.8h, v31.8h      \n"

                    "add    %2, %2, #32                 \n"
                    "add    %3, %3, #32                 \n"

                    "st1    {v28.8h, v29.8h}, [%0], #32 \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v28", "v29", "v30", "v31");
            }
            // 1 row x 1 column per iteration.
            // Three partial sums (v28 from the bias, v29 and v30 from fmul),
            // each taking one tap per kernel row, are combined with two fadds.
            for (; j < outw; j++)
            {
                asm volatile(
                    "prfm   pldl1keep, [%1, #384]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h}, [%1] \n" // r00 r01 r02

                    "mov    v28.16b, %17.16b            \n" // sum00

                    "fmul   v29.8h, %8.8h, v12.8h       \n"

                    "prfm   pldl1keep, [%2, #384]       \n"
                    "ld1    {v15.8h, v16.8h, v17.8h}, [%2] \n" // r10 r11 r12

                    "fmul   v30.8h, %9.8h, v13.8h       \n"
                    "fmla   v28.8h, %10.8h, v14.8h      \n"

                    "prfm   pldl1keep, [%3, #384]       \n"
                    "ld1    {v18.8h, v19.8h, v20.8h}, [%3] \n" // r20 r21 r22

                    "fmla   v29.8h, %11.8h, v15.8h      \n"
                    "fmla   v30.8h, %12.8h, v16.8h      \n"
                    "fmla   v28.8h, %13.8h, v17.8h      \n"

                    "fmla   v29.8h, %14.8h, v18.8h      \n"
                    "fmla   v30.8h, %15.8h, v19.8h      \n"
                    "fmla   v28.8h, %16.8h, v20.8h      \n"

                    "add    %1, %1, #16                 \n"

                    "fadd   v29.8h, v29.8h, v30.8h      \n"
                    "fadd   v28.8h, v28.8h, v29.8h      \n"

                    "add    %2, %2, #16                 \n"
                    "add    %3, %3, #16                 \n"

                    "st1    {v28.8h}, [%0], #16         \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v28", "v29", "v30");
            }

            // step past the remaining 2 columns of the input row
            // (presumably w == outw + 2 - TODO confirm)
            r0 += 2 * 8;
            r1 += 2 * 8;
            r2 += 2 * 8;
        }
    }
}

// Depthwise 3x3 convolution, stride 2, on fp16 data stored as 8 channels per element
// (pack8), with fp16 accumulation. Implemented with aarch64 inline assembly.
//
// bottom_blob  input; one channel() per group, each pixel is 8 consecutive __fp16 values.
//              NOTE(review): no bounds checks are done here, so the input is presumably
//              already padded by the caller so that every 3x3 window is readable.
// top_blob     output; top_blob.w / top_blob.h give the number of pixels produced per group.
// kernel       row g holds the 9 taps of group g, 8 __fp16 values per tap, in row-major
//              tap order (k00 k01 k02 k10 ... k22).
// _bias        optional; when non-null, 8 __fp16 values per group used to seed the sums,
//              otherwise the sums start at zero.
// opt          only opt.num_threads is read (OpenMP thread count over groups).
static void convdw3x3s2_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int group = bottom_blob.c;

    // Each output row consumes 2 * outw input pixels; skip the rest of that input row
    // plus one whole row (vertical stride 2). Counted in __fp16 elements.
    const int tailstep = (w - 2 * outw + w) * 8;

    const __fp16* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        float16x8_t _bias0 = bias ? vld1q_f16(bias + g * 8) : vdupq_n_f16((__fp16)0.f);

        const __fp16* k0 = kernel.row<const __fp16>(g);

        __fp16* outptr0 = out;

        const Mat img0 = bottom_blob.channel(g);

        // three input rows covered by the current 3x3 window
        const __fp16* r0 = img0.row<const __fp16>(0);
        const __fp16* r1 = img0.row<const __fp16>(1);
        const __fp16* r2 = img0.row<const __fp16>(2);

        float16x8_t _k00 = vld1q_f16(k0);
        float16x8_t _k01 = vld1q_f16(k0 + 8);
        float16x8_t _k02 = vld1q_f16(k0 + 16);
        float16x8_t _k10 = vld1q_f16(k0 + 24);
        float16x8_t _k11 = vld1q_f16(k0 + 32);
        float16x8_t _k12 = vld1q_f16(k0 + 40);
        float16x8_t _k20 = vld1q_f16(k0 + 48);
        float16x8_t _k21 = vld1q_f16(k0 + 56);
        float16x8_t _k22 = vld1q_f16(k0 + 64);

        int i = 0;
        for (; i < outh; i++)
        {
            int j = 0;
            // 4 outputs per iteration: reads 9 input pixels per row, advances each row by 8 pixels
            for (; j + 3 < outw; j += 4)
            {
                asm volatile(
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n" // r00 r01 r02 r03

                    "mov    v28.16b, %17.16b            \n" // sum00
                    "mov    v29.16b, %17.16b            \n" // sum01
                    "mov    v30.16b, %17.16b            \n" // sum02
                    "mov    v31.16b, %17.16b            \n" // sum03

                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%1], #64 \n" // r04 r05 r06 r07

                    "fmla   v28.8h, %8.8h, v0.8h        \n"
                    "fmla   v29.8h, %8.8h, v2.8h        \n"
                    "fmla   v30.8h, %8.8h, v4.8h        \n"
                    "fmla   v31.8h, %8.8h, v6.8h        \n"

                    "prfm   pldl1keep, [%1, #128]       \n"
                    "ld1    {v8.8h}, [%1]               \n" // r08

                    "fmla   v28.8h, %9.8h, v1.8h        \n"
                    "fmla   v29.8h, %9.8h, v3.8h        \n"
                    "fmla   v30.8h, %9.8h, v5.8h        \n"
                    "fmla   v31.8h, %9.8h, v7.8h        \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%2], #64 \n" // r10 r11 r12 r13

                    "fmla   v28.8h, %10.8h, v2.8h       \n"
                    "fmla   v29.8h, %10.8h, v4.8h       \n"
                    "fmla   v30.8h, %10.8h, v6.8h       \n"
                    "fmla   v31.8h, %10.8h, v8.8h       \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%2], #64 \n" // r14 r15 r16 r17

                    "fmla   v28.8h, %11.8h, v16.8h      \n"
                    "fmla   v29.8h, %11.8h, v18.8h      \n"
                    "fmla   v30.8h, %11.8h, v20.8h      \n"
                    "fmla   v31.8h, %11.8h, v22.8h      \n"

                    "prfm   pldl1keep, [%2, #128]       \n"
                    "ld1    {v24.8h}, [%2]              \n" // r18

                    "fmla   v28.8h, %12.8h, v17.8h      \n"
                    "fmla   v29.8h, %12.8h, v19.8h      \n"
                    "fmla   v30.8h, %12.8h, v21.8h      \n"
                    "fmla   v31.8h, %12.8h, v23.8h      \n"

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" // r20 r21 r22 r23

                    "fmla   v28.8h, %13.8h, v18.8h      \n"
                    "fmla   v29.8h, %13.8h, v20.8h      \n"
                    "fmla   v30.8h, %13.8h, v22.8h      \n"
                    "fmla   v31.8h, %13.8h, v24.8h      \n"

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%3], #64 \n" // r24 r25 r26 r27

                    "fmla   v28.8h, %14.8h, v0.8h       \n"
                    "fmla   v29.8h, %14.8h, v2.8h       \n"
                    "fmla   v30.8h, %14.8h, v4.8h       \n"
                    "fmla   v31.8h, %14.8h, v6.8h       \n"

                    "prfm   pldl1keep, [%3, #128]       \n"
                    "ld1    {v8.8h}, [%3]               \n" // r28

                    "fmla   v28.8h, %15.8h, v1.8h       \n"
                    "fmla   v29.8h, %15.8h, v3.8h       \n"
                    "fmla   v30.8h, %15.8h, v5.8h       \n"
                    "fmla   v31.8h, %15.8h, v7.8h       \n"

                    "fmla   v28.8h, %16.8h, v2.8h       \n"
                    "fmla   v29.8h, %16.8h, v4.8h       \n"
                    "fmla   v30.8h, %16.8h, v6.8h       \n"
                    "fmla   v31.8h, %16.8h, v8.8h       \n"

                    "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v28", "v29", "v30", "v31");
            }
            // 2 outputs per iteration: reads 5 input pixels per row, advances each row by 4 pixels.
            // v28/v29 start from the bias, v30/v31 hold a second partial sum merged at the end.
            for (; j + 1 < outw; j += 2)
            {
                asm volatile(
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%1], #64 \n" // r00 r01 r02 r03

                    "mov    v28.16b, %17.16b            \n" // sum00
                    "mov    v29.16b, %17.16b            \n" // sum01

                    "fmul   v30.8h, %8.8h, v12.8h       \n"
                    "fmul   v31.8h, %8.8h, v14.8h       \n"

                    "prfm   pldl1keep, [%1, #128]       \n"
                    "ld1    {v16.8h}, [%1]              \n" // r04

                    "fmla   v28.8h, %9.8h, v13.8h       \n"
                    "fmla   v29.8h, %9.8h, v15.8h       \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v17.8h, v18.8h, v19.8h, v20.8h}, [%2], #64 \n" // r10 r11 r12 r13

                    "fmla   v30.8h, %10.8h, v14.8h      \n"
                    "fmla   v31.8h, %10.8h, v16.8h      \n"

                    // fifth pixel of row 1 must come from the row-1 pointer (%2)
                    "prfm   pldl1keep, [%2, #128]       \n"
                    "ld1    {v21.8h}, [%2]              \n" // r14

                    "fmla   v28.8h, %11.8h, v17.8h      \n"
                    "fmla   v29.8h, %11.8h, v19.8h      \n"

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v22.8h, v23.8h, v24.8h, v25.8h}, [%3], #64 \n" // r20 r21 r22 r23

                    "fmla   v30.8h, %12.8h, v18.8h      \n"
                    "fmla   v31.8h, %12.8h, v20.8h      \n"

                    "fmla   v28.8h, %13.8h, v19.8h      \n"
                    "fmla   v29.8h, %13.8h, v21.8h      \n"

                    // fifth pixel of row 2 must come from the row-2 pointer (%3)
                    "prfm   pldl1keep, [%3, #128]       \n"
                    "ld1    {v26.8h}, [%3]              \n" // r24

                    "fmla   v30.8h, %14.8h, v22.8h      \n"
                    "fmla   v31.8h, %14.8h, v24.8h      \n"

                    "fmla   v28.8h, %15.8h, v23.8h      \n"
                    "fmla   v29.8h, %15.8h, v25.8h      \n"
                    "fmla   v30.8h, %16.8h, v24.8h      \n"
                    "fmla   v31.8h, %16.8h, v26.8h      \n"

                    "fadd   v28.8h, v28.8h, v30.8h      \n"
                    "fadd   v29.8h, v29.8h, v31.8h      \n"

                    "st1    {v28.8h, v29.8h}, [%0], #32 \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v28", "v29", "v30", "v31");
            }
            // 1 output per iteration: reads 3 input pixels per row, advances each row by 2 pixels (32 bytes)
            for (; j < outw; j++)
            {
                asm volatile(
                    "prfm   pldl1keep, [%1, #384]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h}, [%1] \n" // r00 r01 r02

                    "mov    v28.16b, %17.16b            \n" // sum00

                    "fmul   v29.8h, %8.8h, v12.8h       \n"

                    "prfm   pldl1keep, [%2, #384]       \n"
                    "ld1    {v15.8h, v16.8h, v17.8h}, [%2] \n" // r10 r11 r12

                    "fmul   v30.8h, %9.8h, v13.8h       \n"
                    "fmla   v28.8h, %10.8h, v14.8h      \n"

                    "prfm   pldl1keep, [%3, #384]       \n"
                    "ld1    {v18.8h, v19.8h, v20.8h}, [%3] \n" // r20 r21 r22

                    "fmla   v29.8h, %11.8h, v15.8h      \n"
                    "fmla   v30.8h, %12.8h, v16.8h      \n"
                    "fmla   v28.8h, %13.8h, v17.8h      \n"

                    "fmla   v29.8h, %14.8h, v18.8h      \n"
                    "fmla   v30.8h, %15.8h, v19.8h      \n"
                    "fmla   v28.8h, %16.8h, v20.8h      \n"

                    "add    %1, %1, #32                 \n"

                    "fadd   v29.8h, v29.8h, v30.8h      \n"
                    "fadd   v28.8h, v28.8h, v29.8h      \n"

                    "add    %2, %2, #32                 \n"
                    "add    %3, %3, #32                 \n"

                    "st1    {v28.8h}, [%0], #16         \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2)       // %3
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "w"(_k00),  // %8
                    "w"(_k01),  // %9
                    "w"(_k02),  // %10
                    "w"(_k10),  // %11
                    "w"(_k11),  // %12
                    "w"(_k12),  // %13
                    "w"(_k20),  // %14
                    "w"(_k21),  // %15
                    "w"(_k22),  // %16
                    "w"(_bias0) // %17
                    : "memory", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v28", "v29", "v30");
            }

            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
        }
    }
}


================================================
FILE: src/layer/arm/convolutiondepthwise_3x3_pack8_int8.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Depthwise 3x3 convolution, stride 1, on int8 data stored as 8 channels per element
// (pack8). Products are formed in int16 and reduced into int32 outputs; no bias or
// requantization is applied here.
//
// bottom_blob  input; one channel() per group, each pixel is 8 consecutive signed chars.
//              NOTE(review): rows are advanced by 2 extra pixels after outw outputs, so the
//              input width is presumably outw + 2 (already padded by the caller) - confirm.
// top_blob     output; each pixel is written as 8 consecutive ints.
// kernel       row g holds the 9 taps of group g, 8 signed chars per tap, in row-major
//              tap order (k00 k01 k02 k10 ... k22).
// opt          only opt.num_threads is read (OpenMP thread count over groups).
static void convdw3x3s1_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int group = bottom_blob.c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        const signed char* k0 = kernel.row<const signed char>(g);

        // two output rows are produced together in the main loop
        int* outptr0 = out.row<int>(0);
        int* outptr1 = out.row<int>(1);

        const Mat img0 = bottom_blob.channel(g);

        // four input rows feed the two output rows (rows 0-2 for outptr0, rows 1-3 for outptr1)
        const signed char* r0 = img0.row<const signed char>(0);
        const signed char* r1 = img0.row<const signed char>(1);
        const signed char* r2 = img0.row<const signed char>(2);
        const signed char* r3 = img0.row<const signed char>(3);

        int8x8_t _k00 = vld1_s8(k0);
        int8x8_t _k01 = vld1_s8(k0 + 8);
        int8x8_t _k02 = vld1_s8(k0 + 16);
        int8x8_t _k10 = vld1_s8(k0 + 24);
        int8x8_t _k11 = vld1_s8(k0 + 32);
        int8x8_t _k12 = vld1_s8(k0 + 40);
        int8x8_t _k20 = vld1_s8(k0 + 48);
        int8x8_t _k21 = vld1_s8(k0 + 56);
        int8x8_t _k22 = vld1_s8(k0 + 64);

        int i = 0;
        // main loop: 2 output rows per iteration
        for (; i + 1 < outh; i += 2)
        {
            int j = 0;
            // 2x2 output tile per iteration; each 16-byte load holds two adjacent pixels
            for (; j + 1 < outw; j += 2)
            {
                int8x16_t _r0001 = vld1q_s8(r0);
                int8x16_t _r0203 = vld1q_s8(r0 + 16);
                int8x16_t _r1011 = vld1q_s8(r1);
                int8x16_t _r1213 = vld1q_s8(r1 + 16);
                int8x16_t _r2021 = vld1q_s8(r2);
                int8x16_t _r2223 = vld1q_s8(r2 + 16);
                int8x16_t _r3031 = vld1q_s8(r3);
                int8x16_t _r3233 = vld1q_s8(r3 + 16);

                // _s0x / _s1x: top-row outputs at column j / j+1, first four taps (k00 k01 k02 k10)
                int16x8_t _s00 = vmull_s8(vget_low_s8(_r0001), _k00);
                int16x8_t _s01 = vmull_s8(vget_high_s8(_r0001), _k01);
                int16x8_t _s02 = vmull_s8(vget_low_s8(_r0203), _k02);
                int16x8_t _s03 = vmull_s8(vget_low_s8(_r1011), _k10);
                int16x8_t _s10 = vmull_s8(vget_high_s8(_r0001), _k00);
                int16x8_t _s11 = vmull_s8(vget_low_s8(_r0203), _k01);
                int16x8_t _s12 = vmull_s8(vget_high_s8(_r0203), _k02);
                int16x8_t _s13 = vmull_s8(vget_high_s8(_r1011), _k10);

                // _s2x / _s3x: bottom-row outputs at column j / j+1, same taps one input row lower
                int16x8_t _s20 = vmull_s8(vget_low_s8(_r1011), _k00);
                int16x8_t _s21 = vmull_s8(vget_high_s8(_r1011), _k01);
                int16x8_t _s22 = vmull_s8(vget_low_s8(_r1213), _k02);
                int16x8_t _s23 = vmull_s8(vget_low_s8(_r2021), _k10);
                int16x8_t _s30 = vmull_s8(vget_high_s8(_r1011), _k00);
                int16x8_t _s31 = vmull_s8(vget_low_s8(_r1213), _k01);
                int16x8_t _s32 = vmull_s8(vget_high_s8(_r1213), _k02);
                int16x8_t _s33 = vmull_s8(vget_high_s8(_r2021), _k10);

                // next four taps (k11 k12 k20 k21): each int16 accumulator receives a second product
                _s00 = vmlal_s8(_s00, vget_high_s8(_r1011), _k11);
                _s01 = vmlal_s8(_s01, vget_low_s8(_r1213), _k12);
                _s02 = vmlal_s8(_s02, vget_low_s8(_r2021), _k20);
                _s03 = vmlal_s8(_s03, vget_high_s8(_r2021), _k21);
                _s10 = vmlal_s8(_s10, vget_low_s8(_r1213), _k11);
                _s11 = vmlal_s8(_s11, vget_high_s8(_r1213), _k12);
                _s12 = vmlal_s8(_s12, vget_high_s8(_r2021), _k20);
                _s13 = vmlal_s8(_s13, vget_low_s8(_r2223), _k21);

                _s20 = vmlal_s8(_s20, vget_high_s8(_r2021), _k11);
                _s21 = vmlal_s8(_s21, vget_low_s8(_r2223), _k12);
                _s22 = vmlal_s8(_s22, vget_low_s8(_r3031), _k20);
                _s23 = vmlal_s8(_s23, vget_high_s8(_r3031), _k21);
                _s30 = vmlal_s8(_s30, vget_low_s8(_r2223), _k11);
                _s31 = vmlal_s8(_s31, vget_high_s8(_r2223), _k12);
                _s32 = vmlal_s8(_s32, vget_high_s8(_r3031), _k20);
                _s33 = vmlal_s8(_s33, vget_low_s8(_r3233), _k21);

                // ninth tap (k22) kept as its own single product per output
                int16x8_t _s08 = vmull_s8(vget_low_s8(_r2223), _k22);
                int16x8_t _s18 = vmull_s8(vget_high_s8(_r2223), _k22);
                int16x8_t _s28 = vmull_s8(vget_low_s8(_r3233), _k22);
                int16x8_t _s38 = vmull_s8(vget_high_s8(_r3233), _k22);

                // widen to int32 while summing; per output, _sumX0 covers channels 0-3 and _sumX1 channels 4-7
                int32x4_t _sum00 = vaddl_s16(vget_low_s16(_s00), vget_low_s16(_s01));
                int32x4_t _sum01 = vaddl_s16(vget_high_s16(_s00), vget_high_s16(_s01));
                int32x4_t _sum02 = vaddl_s16(vget_low_s16(_s02), vget_low_s16(_s03));
                int32x4_t _sum03 = vaddl_s16(vget_high_s16(_s02), vget_high_s16(_s03));
                int32x4_t _sum10 = vaddl_s16(vget_low_s16(_s10), vget_low_s16(_s11));
                int32x4_t _sum11 = vaddl_s16(vget_high_s16(_s10), vget_high_s16(_s11));
                int32x4_t _sum12 = vaddl_s16(vget_low_s16(_s12), vget_low_s16(_s13));
                int32x4_t _sum13 = vaddl_s16(vget_high_s16(_s12), vget_high_s16(_s13));
                int32x4_t _sum20 = vaddl_s16(vget_low_s16(_s20), vget_low_s16(_s21));
                int32x4_t _sum21 = vaddl_s16(vget_high_s16(_s20), vget_high_s16(_s21));
                int32x4_t _sum22 = vaddl_s16(vget_low_s16(_s22), vget_low_s16(_s23));
                int32x4_t _sum23 = vaddl_s16(vget_high_s16(_s22), vget_high_s16(_s23));
                int32x4_t _sum30 = vaddl_s16(vget_low_s16(_s30), vget_low_s16(_s31));
                int32x4_t _sum31 = vaddl_s16(vget_high_s16(_s30), vget_high_s16(_s31));
                int32x4_t _sum32 = vaddl_s16(vget_low_s16(_s32), vget_low_s16(_s33));
                int32x4_t _sum33 = vaddl_s16(vget_high_s16(_s32), vget_high_s16(_s33));
                _sum00 = vaddw_s16(_sum00, vget_low_s16(_s08));
                _sum01 = vaddw_s16(_sum01, vget_high_s16(_s08));
                _sum10 = vaddw_s16(_sum10, vget_low_s16(_s18));
                _sum11 = vaddw_s16(_sum11, vget_high_s16(_s18));
                _sum20 = vaddw_s16(_sum20, vget_low_s16(_s28));
                _sum21 = vaddw_s16(_sum21, vget_high_s16(_s28));
                _sum30 = vaddw_s16(_sum30, vget_low_s16(_s38));
                _sum31 = vaddw_s16(_sum31, vget_high_s16(_s38));
                _sum00 = vaddq_s32(_sum00, _sum02);
                _sum01 = vaddq_s32(_sum01, _sum03);
                _sum10 = vaddq_s32(_sum10, _sum12);
                _sum11 = vaddq_s32(_sum11, _sum13);
                _sum20 = vaddq_s32(_sum20, _sum22);
                _sum21 = vaddq_s32(_sum21, _sum23);
                _sum30 = vaddq_s32(_sum30, _sum32);
                _sum31 = vaddq_s32(_sum31, _sum33);

                vst1q_s32(outptr0, _sum00);
                vst1q_s32(outptr0 + 4, _sum01);
                vst1q_s32(outptr0 + 8, _sum10);
                vst1q_s32(outptr0 + 12, _sum11);
                vst1q_s32(outptr1, _sum20);
                vst1q_s32(outptr1 + 4, _sum21);
                vst1q_s32(outptr1 + 8, _sum30);
                vst1q_s32(outptr1 + 12, _sum31);

                // stride 1: two outputs consumed two input pixels per row
                r0 += 16;
                r1 += 16;
                r2 += 16;
                r3 += 16;
                outptr0 += 16;
                outptr1 += 16;
            }
            // leftover column: one output in each of the two rows
            for (; j < outw; j++)
            {
                int8x8_t _r00 = vld1_s8(r0);
                int8x8_t _r01 = vld1_s8(r0 + 8);
                int8x8_t _r02 = vld1_s8(r0 + 16);
                int8x8_t _r10 = vld1_s8(r1);
                int8x8_t _r11 = vld1_s8(r1 + 8);
                int8x8_t _r12 = vld1_s8(r1 + 16);
                int8x8_t _r20 = vld1_s8(r2);
                int8x8_t _r21 = vld1_s8(r2 + 8);
                int8x8_t _r22 = vld1_s8(r2 + 16);
                int8x8_t _r30 = vld1_s8(r3);
                int8x8_t _r31 = vld1_s8(r3 + 8);
                int8x8_t _r32 = vld1_s8(r3 + 16);

                // _s0x: top-row output, _s1x: bottom-row output
                int16x8_t _s00 = vmull_s8(_r00, _k00);
                int16x8_t _s01 = vmull_s8(_r01, _k01);
                int16x8_t _s02 = vmull_s8(_r02, _k02);
                int16x8_t _s03 = vmull_s8(_r10, _k10);
                int16x8_t _s10 = vmull_s8(_r10, _k00);
                int16x8_t _s11 = vmull_s8(_r11, _k01);
                int16x8_t _s12 = vmull_s8(_r12, _k02);
                int16x8_t _s13 = vmull_s8(_r20, _k10);
                _s00 = vmlal_s8(_s00, _r11, _k11);
                _s01 = vmlal_s8(_s01, _r12, _k12);
                _s02 = vmlal_s8(_s02, _r20, _k20);
                _s03 = vmlal_s8(_s03, _r21, _k21);
                _s10 = vmlal_s8(_s10, _r21, _k11);
                _s11 = vmlal_s8(_s11, _r22, _k12);
                _s12 = vmlal_s8(_s12, _r30, _k20);
                _s13 = vmlal_s8(_s13, _r31, _k21);
                int16x8_t _s08 = vmull_s8(_r22, _k22);
                int16x8_t _s18 = vmull_s8(_r32, _k22);

                int32x4_t _sum00 = vaddl_s16(vget_low_s16(_s00), vget_low_s16(_s01));
                int32x4_t _sum01 = vaddl_s16(vget_high_s16(_s00), vget_high_s16(_s01));
                int32x4_t _sum02 = vaddl_s16(vget_low_s16(_s02), vget_low_s16(_s03));
                int32x4_t _sum03 = vaddl_s16(vget_high_s16(_s02), vget_high_s16(_s03));
                int32x4_t _sum10 = vaddl_s16(vget_low_s16(_s10), vget_low_s16(_s11));
                int32x4_t _sum11 = vaddl_s16(vget_high_s16(_s10), vget_high_s16(_s11));
                int32x4_t _sum12 = vaddl_s16(vget_low_s16(_s12), vget_low_s16(_s13));
                int32x4_t _sum13 = vaddl_s16(vget_high_s16(_s12), vget_high_s16(_s13));
                _sum00 = vaddw_s16(_sum00, vget_low_s16(_s08));
                _sum01 = vaddw_s16(_sum01, vget_high_s16(_s08));
                _sum10 = vaddw_s16(_sum10, vget_low_s16(_s18));
                _sum11 = vaddw_s16(_sum11, vget_high_s16(_s18));
                _sum00 = vaddq_s32(_sum00, _sum02);
                _sum01 = vaddq_s32(_sum01, _sum03);
                _sum10 = vaddq_s32(_sum10, _sum12);
                _sum11 = vaddq_s32(_sum11, _sum13);

                vst1q_s32(outptr0, _sum00);
                vst1q_s32(outptr0 + 4, _sum01);
                vst1q_s32(outptr1, _sum10);
                vst1q_s32(outptr1 + 4, _sum11);

                r0 += 8;
                r1 += 8;
                r2 += 8;
                r3 += 8;
                outptr0 += 8;
                outptr1 += 8;
            }

            // skip the 2 trailing pixels of this input row, then one more full row (two output rows were done)
            r0 += 2 * 8 + w * 8;
            r1 += 2 * 8 + w * 8;
            r2 += 2 * 8 + w * 8;
            r3 += 2 * 8 + w * 8;

            // each output pointer jumps over the row the other pointer just wrote
            outptr0 += outw * 8;
            outptr1 += outw * 8;
        }
        // remaining single output row (odd outh): only r0..r2 and outptr0 are used
        for (; i < outh; i++)
        {
            int j = 0;
            // two adjacent outputs per iteration
            for (; j + 1 < outw; j += 2)
            {
                int8x16_t _r0001 = vld1q_s8(r0);
                int8x16_t _r0203 = vld1q_s8(r0 + 16);
                int8x16_t _r1011 = vld1q_s8(r1);
                int8x16_t _r1213 = vld1q_s8(r1 + 16);
                int8x16_t _r2021 = vld1q_s8(r2);
                int8x16_t _r2223 = vld1q_s8(r2 + 16);

                int16x8_t _s00 = vmull_s8(vget_low_s8(_r0001), _k00);
                int16x8_t _s01 = vmull_s8(vget_high_s8(_r0001), _k01);
                int16x8_t _s02 = vmull_s8(vget_low_s8(_r0203), _k02);
                int16x8_t _s03 = vmull_s8(vget_low_s8(_r1011), _k10);
                int16x8_t _s10 = vmull_s8(vget_high_s8(_r0001), _k00);
                int16x8_t _s11 = vmull_s8(vget_low_s8(_r0203), _k01);
                int16x8_t _s12 = vmull_s8(vget_high_s8(_r0203), _k02);
                int16x8_t _s13 = vmull_s8(vget_high_s8(_r1011), _k10);
                _s00 = vmlal_s8(_s00, vget_high_s8(_r1011), _k11);
                _s01 = vmlal_s8(_s01, vget_low_s8(_r1213), _k12);
                _s02 = vmlal_s8(_s02, vget_low_s8(_r2021), _k20);
                _s03 = vmlal_s8(_s03, vget_high_s8(_r2021), _k21);
                _s10 = vmlal_s8(_s10, vget_low_s8(_r1213), _k11);
                _s11 = vmlal_s8(_s11, vget_high_s8(_r1213), _k12);
                _s12 = vmlal_s8(_s12, vget_high_s8(_r2021), _k20);
                _s13 = vmlal_s8(_s13, vget_low_s8(_r2223), _k21);
                int16x8_t _s08 = vmull_s8(vget_low_s8(_r2223), _k22);
                int16x8_t _s18 = vmull_s8(vget_high_s8(_r2223), _k22);

                int32x4_t _sum00 = vaddl_s16(vget_low_s16(_s00), vget_low_s16(_s01));
                int32x4_t _sum01 = vaddl_s16(vget_high_s16(_s00), vget_high_s16(_s01));
                int32x4_t _sum02 = vaddl_s16(vget_low_s16(_s02), vget_low_s16(_s03));
                int32x4_t _sum03 = vaddl_s16(vget_high_s16(_s02), vget_high_s16(_s03));
                int32x4_t _sum10 = vaddl_s16(vget_low_s16(_s10), vget_low_s16(_s11));
                int32x4_t _sum11 = vaddl_s16(vget_high_s16(_s10), vget_high_s16(_s11));
                int32x4_t _sum12 = vaddl_s16(vget_low_s16(_s12), vget_low_s16(_s13));
                int32x4_t _sum13 = vaddl_s16(vget_high_s16(_s12), vget_high_s16(_s13));
                _sum00 = vaddw_s16(_sum00, vget_low_s16(_s08));
                _sum01 = vaddw_s16(_sum01, vget_high_s16(_s08));
                _sum10 = vaddw_s16(_sum10, vget_low_s16(_s18));
                _sum11 = vaddw_s16(_sum11, vget_high_s16(_s18));
                _sum00 = vaddq_s32(_sum00, _sum02);
                _sum01 = vaddq_s32(_sum01, _sum03);
                _sum10 = vaddq_s32(_sum10, _sum12);
                _sum11 = vaddq_s32(_sum11, _sum13);

                vst1q_s32(outptr0, _sum00);
                vst1q_s32(outptr0 + 4, _sum01);
                vst1q_s32(outptr0 + 8, _sum10);
                vst1q_s32(outptr0 + 12, _sum11);

                r0 += 16;
                r1 += 16;
                r2 += 16;
                outptr0 += 16;
            }
            // leftover single output
            for (; j < outw; j++)
            {
                int8x8_t _r00 = vld1_s8(r0);
                int8x8_t _r01 = vld1_s8(r0 + 8);
                int8x8_t _r02 = vld1_s8(r0 + 16);
                int8x8_t _r10 = vld1_s8(r1);
                int8x8_t _r11 = vld1_s8(r1 + 8);
                int8x8_t _r12 = vld1_s8(r1 + 16);
                int8x8_t _r20 = vld1_s8(r2);
                int8x8_t _r21 = vld1_s8(r2 + 8);
                int8x8_t _r22 = vld1_s8(r2 + 16);

                int16x8_t _s0 = vmull_s8(_r00, _k00);
                int16x8_t _s1 = vmull_s8(_r01, _k01);
                int16x8_t _s2 = vmull_s8(_r02, _k02);
                int16x8_t _s3 = vmull_s8(_r10, _k10);
                _s0 = vmlal_s8(_s0, _r11, _k11);
                _s1 = vmlal_s8(_s1, _r12, _k12);
                _s2 = vmlal_s8(_s2, _r20, _k20);
                _s3 = vmlal_s8(_s3, _r21, _k21);
                int16x8_t _s4 = vmull_s8(_r22, _k22);

                int32x4_t _sum0 = vaddl_s16(vget_low_s16(_s0), vget_low_s16(_s1));
                int32x4_t _sum1 = vaddl_s16(vget_high_s16(_s0), vget_high_s16(_s1));
                int32x4_t _sum2 = vaddl_s16(vget_low_s16(_s2), vget_low_s16(_s3));
                int32x4_t _sum3 = vaddl_s16(vget_high_s16(_s2), vget_high_s16(_s3));
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s4));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s4));
                _sum0 = vaddq_s32(_sum0, _sum2);
                _sum1 = vaddq_s32(_sum1, _sum3);

                vst1q_s32(outptr0, _sum0);
                vst1q_s32(outptr0 + 4, _sum1);

                r0 += 8;
                r1 += 8;
                r2 += 8;
                outptr0 += 8;
            }

            // skip the 2 trailing pixels of the input row
            r0 += 2 * 8;
            r1 += 2 * 8;
            r2 += 2 * 8;
        }
    }
}

static void convdw3x3s2_pack8_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int group = bottom_blob.c;

    const int tailstep = (w - 2 * outw + w) * 8;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        const signed char* k0 = kernel.row<const signed char>(g);

        int* outptr0 = out;

        const Mat img0 = bottom_blob.channel(g);

        const signed char* r0 = img0.row<const signed char>(0);
        const signed char* r1 = img0.row<const signed char>(1);
        const signed char* r2 = img0.row<const signed char>(2);

        int8x8_t _k00 = vld1_s8(k0);
        int8x8_t _k01 = vld1_s8(k0 + 8);
        int8x8_t _k02 = vld1_s8(k0 + 16);
        int8x8_t _k10 = vld1_s8(k0 + 24);
        int8x8_t _k11 = vld1_s8(k0 + 32);
        int8x8_t _k12 = vld1_s8(k0 + 40);
        int8x8_t _k20 = vld1_s8(k0 + 48);
        int8x8_t _k21 = vld1_s8(k0 + 56);
        int8x8_t _k22 = vld1_s8(k0 + 64);

        int i = 0;
        for (; i < outh; i++)
        {
            int j = 0;
            for (; j + 1 < outw; j += 2)
            {
                int8x8_t _r00 = vld1_s8(r0);
                int8x8_t _r01 = vld1_s8(r0 + 8);
                int8x8_t _r02 = vld1_s8(r0 + 16);
                int8x8_t _r03 = vld1_s8(r0 + 24);
                int8x8_t _r04 = vld1_s8(r0 + 32);
                int8x8_t _r10 = vld1_s8(r1);
                int8x8_t _r11 = vld1_s8(r1 + 8);
                int8x8_t _r12 = vld1_s8(r1 + 16);
                int8x8_t _r13 = vld1_s8(r1 + 24);
                int8x8_t _r14 = vld1_s8(r1 + 32);
                int8x8_t _r20 = vld1_s8(r2);
                int8x8_t _r21 = vld1_s8(r2 + 8);
                int8x8_t _r22 = vld1_s8(r2 + 16);
                int8x8_t _r23 = vld1_s8(r2 + 24);
                int8x8_t _r24 = vld1_s8(r2 + 32);

                int16x8_t _s00 = vmull_s8(_r00, _k00);
                int16x8_t _s01 = vmull_s8(_r01, _k01);
                int16x8_t _s02 = vmull_s8(_r02, _k02);
                int16x8_t _s03 = vmull_s8(_r10, _k10);
                int16x8_t _s10 = vmull_s8(_r02, _k00);
                int16x8_t _s11 = vmull_s8(_r03, _k01);
                int16x8_t _s12 = vmull_s8(_r04, _k02);
                int16x8_t _s13 = vmull_s8(_r12, _k10);
                _s00 = vmlal_s8(_s00, _r11, _k11);
                _s01 = vmlal_s8(_s01, _r12, _k12);
                _s02 = vmlal_s8(_s02, _r20, _k20);
                _s03 = vmlal_s8(_s03, _r21, _k21);
                _s10 = vmlal_s8(_s10, _r13, _k11);
                _s11 = vmlal_s8(_s11, _r14, _k12);
                _s12 = vmlal_s8(_s12, _r22, _k20);
                _s13 = vmlal_s8(_s13, _r23, _k21);
                int16x8_t _s08 = vmull_s8(_r22, _k22);
                int16x8_t _s18 = vmull_s8(_r24, _k22);

                int32x4_t _sum00 = vaddl_s16(vget_low_s16(_s00), vget_low_s16(_s01));
                int32x4_t _sum01 = vaddl_s16(vget_high_s16(_s00), vget_high_s16(_s01));
                int32x4_t _sum02 = vaddl_s16(vget_low_s16(_s02), vget_low_s16(_s03));
                int32x4_t _sum03 = vaddl_s16(vget_high_s16(_s02), vget_high_s16(_s03));
                int32x4_t _sum10 = vaddl_s16(vget_low_s16(_s10), vget_low_s16(_s11));
                int32x4_t _sum11 = vaddl_s16(vget_high_s16(_s10), vget_high_s16(_s11));
                int32x4_t _sum12 = vaddl_s16(vget_low_s16(_s12), vget_low_s16(_s13));
                int32x4_t _sum13 = vaddl_s16(vget_high_s16(_s12), vget_high_s16(_s13));
                _sum00 = vaddw_s16(_sum00, vget_low_s16(_s08));
                _sum01 = vaddw_s16(_sum01, vget_high_s16(_s08));
                _sum10 = vaddw_s16(_sum10, vget_low_s16(_s18));
                _sum11 = vaddw_s16(_sum11, vget_high_s16(_s18));
                _sum00 = vaddq_s32(_sum00, _sum02);
                _sum01 = vaddq_s32(_sum01, _sum03);
                _sum10 = vaddq_s32(_sum10, _sum12);
                _sum11 = vaddq_s32(_sum11, _sum13);

                vst1q_s32(outptr0, _sum00);
                vst1q_s32(outptr0 + 4, _sum01);
                vst1q_s32(outptr0 + 8, _sum10);
                vst1q_s32(outptr0 + 12, _sum11);

                r0 += 32;
                r1 += 32;
                r2 += 32;
                outptr0 += 16;
            }
            for (; j < outw; j++)
            {
                int8x8_t _r00 = vld1_s8(r0);
                int8x8_t _r01 = vld1_s8(r0 + 8);
                int8x8_t _r02 = vld1_s8(r0 + 16);
                int8x8_t _r10 = vld1_s8(r1);
                int8x8_t _r11 = vld1_s8(r1 + 8);
                int8x8_t _r12 = vld1_s8(r1 + 16);
                int8x8_t _r20 = vld1_s8(r2);
                int8x8_t _r21 = vld1_s8(r2 + 8);
                int8x8_t _r22 = vld1_s8(r2 + 16);

                int16x8_t _s0 = vmull_s8(_r00, _k00);
                int16x8_t _s1 = vmull_s8(_r01, _k01);
                int16x8_t _s2 = vmull_s8(_r02, _k02);
                int16x8_t _s3 = vmull_s8(_r10, _k10);
                _s0 = vmlal_s8(_s0, _r11, _k11);
                _s1 = vmlal_s8(_s1, _r12, _k12);
                _s2 = vmlal_s8(_s2, _r20, _k20);
                _s3 = vmlal_s8(_s3, _r21, _k21);
                int16x8_t _s4 = vmull_s8(_r22, _k22);

                int32x4_t _sum0 = vaddl_s16(vget_low_s16(_s0), vget_low_s16(_s1));
                int32x4_t _sum1 = vaddl_s16(vget_high_s16(_s0), vget_high_s16(_s1));
                int32x4_t _sum2 = vaddl_s16(vget_low_s16(_s2), vget_low_s16(_s3));
                int32x4_t _sum3 = vaddl_s16(vget_high_s16(_s2), vget_high_s16(_s3));
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s4));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s4));
                _sum0 = vaddq_s32(_sum0, _sum2);
                _sum1 = vaddq_s32(_sum1, _sum3);

                vst1q_s32(outptr0, _sum0);
                vst1q_s32(outptr0 + 4, _sum1);

                r0 += 16;
                r1 += 16;
                r2 += 16;
                outptr0 += 8;
            }

            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
        }
    }
}


================================================
FILE: src/layer/arm/convolutiondepthwise_5x5.h
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convdw5x5s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int group = bottom_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        const float bias0 = bias ? bias[g] : 0.f;

        const float* kernel0 = kernel + g * 25;

        float* outptr = out;
        float* outptr2 = outptr + outw;

        const float* img0 = bottom_blob.channel(g);

        const float* r0 = img0;
        const float* r1 = img0 + w;
        const float* r2 = img0 + w * 2;
        const float* r3 = img0 + w * 3;
        const float* r4 = img0 + w * 4;
        const float* r5 = img0 + w * 5;

        const float* k0 = kernel0;
        const float* k1 = kernel0 + 5;
        const float* k2 = kernel0 + 10;
        const float* k3 = kernel0 + 15;
        const float* k4 = kernel0 + 20;

#if __ARM_NEON
        float32x4_t _k0123 = vld1q_f32(kernel0);
        float32x4_t _k4567 = vld1q_f32(kernel0 + 4);
        float32x4_t _k891011 = vld1q_f32(kernel0 + 8);
        float32x4_t _k12131415 = vld1q_f32(kernel0 + 12);
        float32x4_t _k16171819 = vld1q_f32(kernel0 + 16);
        float32x4_t _k20212223 = vld1q_f32(kernel0 + 20);
        float32x4_t _k24242424 = vdupq_n_f32(kernel0[24]);

        float32x4_t _bias0 = vdupq_n_f32(bias0);
#endif // __ARM_NEON

        int i = 0;

        for (; i + 1 < outh; i += 2)
        {
#if __ARM_NEON
#if __aarch64__
            int nn = outw >> 3;
            int remain = outw & 7;
#else
            int nn = outw >> 2;
            int remain = outw & 3;
#endif // __aarch64__
#else
            int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            if (nn > 0)
            {
                asm volatile(
                    // r1
                    "prfm   pldl1keep, [%4, #384]           \n"
                    "ld1    {v16.4s, v17.4s, v18.4s}, [%4]  \n" // v16 v17 v18 = r10 r14 r18

                    "mov    v8.16b, %25.16b                 \n" // v8 = _bias0
                    "mov    v9.16b, %25.16b                 \n" // v9 = _bias0

                    "0:                                     \n"

                    "mov    v10.16b, %25.16b                \n" // v10 = _bias0
                    "mov    v11.16b, %25.16b                \n" // v11 = _bias0

                    "fmla   v8.4s, v16.4s, %19.s[1]         \n"
                    "fmla   v10.4s, v16.4s, %18.s[0]        \n"

                    "ext    v19.16b, v16.16b, v17.16b, #4   \n" // r11

                    "fmla   v9.4s, v17.4s, %19.s[1]         \n"
                    "fmla   v11.4s, v17.4s, %18.s[0]        \n"

                    "ext    v20.16b, v17.16b, v18.16b, #4   \n" // r15

                    "fmla   v8.4s, v17.4s, %20.s[1]         \n"
                    "fmla   v10.4s, v17.4s, %19.s[0]        \n"

                    "ext    v21.16b, v16.16b, v17.16b, #8   \n" // r12

                    "fmla   v9.4s, v18.4s, %20.s[1]         \n"
                    "fmla   v11.4s, v18.4s, %19.s[0]        \n"

                    "ext    v22.16b, v17.16b, v18.16b, #8   \n" // r16

                    "fmla   v8.4s, v19.4s, %19.s[2]         \n"
                    "fmla   v10.4s, v19.4s, %18.s[1]        \n"

                    "ext    v19.16b, v16.16b, v17.16b, #12  \n" // r13

                    "fmla   v9.4s, v20.4s, %19.s[2]         \n"
                    "fmla   v11.4s, v20.4s, %18.s[1]        \n"

                    "ext    v20.16b, v17.16b, v18.16b, #12  \n" // r17

                    "fmla   v8.4s, v21.4s, %19.s[3]         \n"
                    "fmla   v10.4s, v21.4s, %18.s[2]        \n"

                    "add    %4, %4, #32                     \n"

                    "fmla   v9.4s, v22.4s, %19.s[3]         \n"
                    "fmla   v11.4s, v22.4s, %18.s[2]        \n"

                    // r2
                    "prfm   pldl1keep, [%5, #384]           \n"
                    "ld1    {v12.4s, v13.4s, v14.4s}, [%5]  \n" // v12 v13 v14 = r20 r24 r28

                    "fmla   v8.4s, v19.4s, %20.s[0]         \n"
                    "fmla   v10.4s, v19.4s, %18.s[3]        \n"
                    "fmla   v9.4s, v20.4s, %20.s[0]         \n"
                    "fmla   v11.4s, v20.4s, %18.s[3]        \n"

                    "add    %5, %5, #32                     \n"

                    "fmla   v8.4s, v12.4s, %20.s[2]         \n"
                    "fmla   v10.4s, v12.4s, %19.s[1]        \n"

                    "ext    v21.16b, v12.16b, v13.16b, #4   \n" // r21

                    "fmla   v9.4s, v13.4s, %20.s[2]         \n"
                    "fmla   v11.4s, v13.4s, %19.s[1]        \n"

                    "ext    v22.16b, v13.16b, v14.16b, #4   \n" // r25

                    "fmla   v8.4s, v13.4s, %21.s[2]         \n"
                    "fmla   v10.4s, v13.4s, %20.s[1]        \n"

                    "ext    v19.16b, v12.16b, v13.16b, #8   \n" // r22

                    "fmla   v9.4s, v14.4s, %21.s[2]         \n"
                    "fmla   v11.4s, v14.4s, %20.s[1]        \n"

                    "ext    v20.16b, v13.16b, v14.16b, #8   \n" // r26

                    "fmla   v8.4s, v21.4s, %20.s[3]         \n"
                    "fmla   v10.4s, v21.4s, %19.s[2]        \n"

                    "ext    v21.16b, v12.16b, v13.16b, #12  \n" // r23

                    "fmla   v9.4s, v22.4s, %20.s[3]         \n"
                    "fmla   v11.4s, v22.4s, %19.s[2]        \n"

                    "ext    v22.16b, v13.16b, v14.16b, #12  \n" // r27

                    "fmla   v8.4s, v19.4s, %21.s[0]         \n"
                    "fmla   v10.4s, v19.4s, %19.s[3]        \n"
                    "fmla   v9.4s, v20.4s, %21.s[0]         \n"
                    "fmla   v11.4s, v20.4s, %19.s[3]        \n"

                    // r3
                    "prfm   pldl1keep, [%6, #384]           \n"
                    "ld1    {v16.4s, v17.4s, v18.4s}, [%6]  \n" // v16 v17 v18 = r30 r34 r38

                    "fmla   v8.4s, v21.4s, %21.s[1]         \n"
                    "fmla   v10.4s, v21.4s, %20.s[0]        \n"
                    "fmla   v9.4s, v22.4s, %21.s[1]         \n"
                    "fmla   v11.4s, v22.4s, %20.s[0]        \n"

                    "add    %6, %6, #32                     \n"

                    "fmla   v8.4s, v16.4s, %21.s[3]         \n"
                    "fmla   v10.4s, v16.4s, %20.s[2]        \n"

                    "ext    v19.16b, v16.16b, v17.16b, #4   \n" // r31

                    "fmla   v9.4s, v17.4s, %21.s[3]         \n"
                    "fmla   v11.4s, v17.4s, %20.s[2]        \n"

                    "ext    v20.16b, v17.16b, v18.16b, #4   \n" // r35

                    "fmla   v8.4s, v17.4s, %22.s[3]         \n"
                    "fmla   v10.4s, v17.4s, %21.s[2]        \n"

                    "ext    v21.16b, v16.16b, v17.16b, #8   \n" // r32

                    "fmla   v9.4s, v18.4s, %22.s[3]         \n"
                    "fmla   v11.4s, v18.4s, %21.s[2]        \n"

                    "ext    v22.16b, v17.16b, v18.16b, #8   \n" // r36

                    "fmla   v8.4s, v19.4s, %22.s[0]         \n"
                    "fmla   v10.4s, v19.4s, %20.s[3]        \n"

                    "ext    v19.16b, v16.16b, v17.16b, #12  \n" // r33

                    "fmla   v9.4s, v20.4s, %22.s[0]         \n"
                    "fmla   v11.4s, v20.4s, %20.s[3]        \n"

                    "ext    v20.16b, v17.16b, v18.16b, #12  \n" // r37

                    "fmla   v8.4s, v21.4s, %22.s[1]         \n"
                    "fmla   v10.4s, v21.4s, %21.s[0]        \n"
                    "fmla   v9.4s, v22.4s, %22.s[1]         \n"
                    "fmla   v11.4s, v22.4s, %21.s[0]        \n"

                    // r4
                    "prfm   pldl1keep, [%7, #384]           \n"
                    "ld1    {v12.4s, v13.4s, v14.4s}, [%7]  \n" // v12 v13 v14 = r40 r44 r48

                    "fmla   v8.4s, v19.4s, %22.s[2]         \n"
                    "fmla   v10.4s, v19.4s, %21.s[1]        \n"

                    "add    %7, %7, #32                     \n"

                    "fmla   v9.4s, v20.4s, %22.s[2]         \n"
                    "fmla   v11.4s, v20.4s, %21.s[1]        \n"

                    "ext    v21.16b, v12.16b, v13.16b, #4   \n" // r41

                    "fmla   v8.4s, v12.4s, %23.s[0]         \n"
                    "fmla   v10.4s, v12.4s, %21.s[3]        \n"

                    "ext    v22.16b, v13.16b, v14.16b, #4   \n" // r45

                    "fmla   v9.4s, v13.4s, %23.s[0]         \n"
                    "fmla   v11.4s, v13.4s, %21.s[3]        \n"

                    "ext    v19.16b, v12.16b, v13.16b, #8   \n" // r42

                    "fmla   v8.4s, v13.4s, %24.s[0]         \n"
                    "fmla   v10.4s, v13.4s, %22.s[3]        \n"

                    "ext    v20.16b, v13.16b, v14.16b, #8   \n" // r46

                    "fmla   v9.4s, v14.4s, %24.s[0]         \n"
                    "fmla   v11.4s, v14.4s, %22.s[3]        \n"

                    // r0 and r5
                    "prfm   pldl1keep, [%3, #384]           \n"
                    "ld1    {v16.4s, v17.4s, v18.4s}, [%3]  \n" // v16 v17 v18 = r00 r04 r08

                    "fmla   v8.4s, v21.4s, %23.s[1]         \n"
                    "fmla   v10.4s, v21.4s, %22.s[0]        \n"

                    "ext    v21.16b, v12.16b, v13.16b, #12  \n" // r43

                    "fmla   v9.4s, v22.4s, %23.s[1]         \n"
                    "fmla   v11.4s, v22.4s, %22.s[0]        \n"

                    "ext    v22.16b, v13.16b, v14.16b, #12  \n" // r47

                    "fmla   v8.4s, v19.4s, %23.s[2]         \n"
                    "fmla   v10.4s, v19.4s, %22.s[1]        \n"

                    "prfm   pldl1keep, [%8, #384]           \n"
                    "ld1    {v12.4s, v13.4s, v14.4s}, [%8]  \n" // v12 v13 v14 = r50 r54 r58

                    "fmla   v9.4s, v20.4s, %23.s[2]         \n"
                    "fmla   v11.4s, v20.4s, %22.s[1]        \n"

                    "ext    v19.16b, v16.16b, v17.16b, #4   \n" // r01

                    "fmla   v8.4s, v21.4s, %23.s[3]         \n"
                    "fmla   v10.4s, v21.4s, %22.s[2]        \n"

                    "ext    v23.16b, v12.16b, v13.16b, #4   \n" // r51

                    "fmla   v9.4s, v22.4s, %23.s[3]         \n"
                    "fmla   v11.4s, v22.4s, %22.s[2]        \n"

                    "ext    v20.16b, v17.16b, v18.16b, #4   \n" // r05

                    "fmla   v8.4s, v16.4s, %18.s[0]         \n"
                    "fmla   v10.4s, v12.4s, %23.s[0]        \n"

                    "ext    v24.16b, v13.16b, v14.16b, #4   \n" // r55

                    "fmla   v9.4s, v17.4s, %18.s[0]         \n"
                    "fmla   v11.4s, v13.4s, %23.s[0]        \n"

                    "ext    v21.16b, v16.16b, v17.16b, #8   \n" // r02

                    "fmla   v8.4s, v17.4s, %19.s[0]         \n"
                    "fmla   v10.4s, v13.4s, %24.s[0]        \n"

                    "ext    v25.16b, v12.16b, v13.16b, #8   \n" // r52

                    "fmla   v9.4s, v18.4s, %19.s[0]         \n"
                    "fmla   v11.4s, v14.4s, %24.s[0]        \n"

                    "ext    v22.16b, v17.16b, v18.16b, #8   \n" // r06

                    "fmla   v8.4s, v19.4s, %18.s[1]         \n"
                    "fmla   v10.4s, v23.4s, %23.s[1]        \n"

                    "ext    v26.16b, v13.16b, v14.16b, #8   \n" // r56

                    "fmla   v9.4s, v20.4s, %18.s[1]         \n"
                    "fmla   v11.4s, v24.4s, %23.s[1]        \n"

                    "ext    v19.16b, v16.16b, v17.16b, #12  \n" // r03

                    "fmla   v8.4s, v21.4s, %18.s[2]         \n"
                    "fmla   v10.4s, v25.4s, %23.s[2]        \n"

                    "ext    v23.16b, v12.16b, v13.16b, #12  \n" // r53

                    "fmla   v9.4s, v22.4s, %18.s[2]         \n"
                    "fmla   v11.4s, v26.4s, %23.s[2]        \n"

                    "ext    v20.16b, v17.16b, v18.16b, #12  \n" // r07

                    "fmla   v8.4s, v19.4s, %18.s[3]         \n"
                    "fmla   v10.4s, v23.4s, %23.s[3]        \n"

                    "ext    v24.16b, v13.16b, v14.16b, #12  \n" // r57

                    "fmla   v9.4s, v20.4s, %18.s[3]         \n"

                    "add    %3, %3, #32                     \n"

                    "fmla   v11.4s, v24.4s, %23.s[3]        \n"

                    "add    %8, %8, #32                     \n"

                    // r1
                    "prfm   pldl1keep, [%4, #384]           \n"
                    "ld1    {v16.4s, v17.4s, v18.4s}, [%4]  \n" // v16 v17 v18 = r10 r14 r18

                    "subs   %w0, %w0, #1                    \n"

                    "st1    {v8.4s, v9.4s}, [%1], #32       \n"

                    "mov    v8.16b, %25.16b                 \n" // v8 = _bias0
                    "mov    v9.16b, %25.16b                 \n" // v9 = _bias0

                    "st1    {v10.4s, v11.4s}, [%2], #32     \n"

                    "bne    0b                              \n"
                    : "=r"(nn),      // %0
                    "=r"(outptr),  // %1
                    "=r"(outptr2), // %2
                    "=r"(r0),      // %3
                    "=r"(r1),      // %4
                    "=r"(r2),      // %5
                    "=r"(r3),      // %6
                    "=r"(r4),      // %7
                    "=r"(r5)       // %8
                    : "0"(nn),
                    "1"(outptr),
                    "2"(outptr2),
                    "3"(r0),
                    "4"(r1),
                    "5"(r2),
                    "6"(r3),
                    "7"(r4),
                    "8"(r5),
                    "w"(_k0123),     // %18
                    "w"(_k4567),     // %19
                    "w"(_k891011),   // %20
                    "w"(_k12131415), // %21
                    "w"(_k16171819), // %22
                    "w"(_k20212223), // %23
                    "w"(_k24242424), // %24
                    "w"(_bias0)      // %25
                    : "cc", "memory", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26");
            }

            if (remain >= 4)
            {
                remain -= 4;
                asm volatile(
                    // r1
                    "prfm   pldl1keep, [%3, #256]           \n"
                    "ld1    {v12.4s, v13.4s}, [%3]          \n" // v12 v13 = r10 r14

                    "mov    v8.16b, %23.16b                 \n" // v8 = _bias0
                    "mov    v9.16b, %23.16b                 \n" // v9 = _bias0

                    "fmul   v10.4s, v12.4s, %17.s[1]        \n"
                    "fmul   v11.4s, v12.4s, %16.s[0]        \n"

                    "ext    v21.16b, v12.16b, v13.16b, #4   \n" // r11

                    "fmla   v8.4s, v13.4s, %18.s[1]         \n"
                    "fmla   v9.4s, v13.4s, %17.s[0]         \n"

                    "ext    v22.16b, v12.16b, v13.16b, #8   \n" // r12

                    "fmla   v10.4s, v21.4s, %17.s[2]        \n"
                    "fmla   v11.4s, v21.4s, %16.s[1]        \n"

                    "ext    v23.16b, v12.16b, v13.16b, #12  \n" // r13

                    "fmla   v8.4s, v22.4s, %17.s[3]         \n"
                    "fmla   v9.4s, v22.4s, %16.s[2]         \n"

                    // r2
                    "prfm   pldl1keep, [%4, #256]           \n"
                    "ld1    {v16.4s, v17.4s}, [%4]          \n" // v16 v17 = r20 r24

                    "fmla   v10.4s, v23.4s, %18.s[0]        \n"
                    "fmla   v11.4s, v23.4s, %16.s[3]        \n"

                    "add    %4, %4, #16                     \n"

                    "fmla   v8.4s, v16.4s, %18.s[2]         \n"
                    "fmla   v9.4s, v16.4s, %17.s[1]         \n"

                    "ext    v18.16b, v16.16b, v17.16b, #4   \n" // r21

                    "fmla   v10.4s, v17.4s, %19.s[2]        \n"
                    "fmla   v11.4s, v17.4s, %18.s[1]        \n"

                    "ext    v19.16b, v16.16b, v17.16b, #8   \n" // r22

                    "fmla   v8.4s, v18.4s, %18.s[3]         \n"
                    "fmla   v9.4s, v18.4s, %17.s[2]         \n"

                    "ext    v20.16b, v16.16b, v17.16b, #12  \n" // r23

                    "fmla   v10.4s, v19.4s, %19.s[0]        \n"
                    "fmla   v11.4s, v19.4s, %17.s[3]        \n"

                    // r3
                    "prfm   pldl1keep, [%5, #256]           \n"
                    "ld1    {v12.4s, v13.4s}, [%5]          \n" // v12 v13 = r30 r34

                    "fmla   v8.4s, v20.4s, %19.s[1]         \n"
                    "fmla   v9.4s, v20.4s, %18.s[0]         \n"

                    "add    %5, %5, #16                     \n"

                    "fmla   v10.4s, v12.4s, %19.s[3]        \n"
                    "fmla   v11.4s, v12.4s, %18.s[2]        \n"

                    "ext    v21.16b, v12.16b, v13.16b, #4   \n" // r31

                    "fmla   v8.4s, v13.4s, %20.s[3]         \n"
                    "fmla   v9.4s, v13.4s, %19.s[2]         \n"

                    "ext    v22.16b, v12.16b, v13.16b, #8   \n" // r32

                    "fmla   v10.4s, v21.4s, %20.s[0]        \n"
                    "fmla   v11.4s, v21.4s, %18.s[3]        \n"

                    "ext    v23.16b, v12.16b, v13.16b, #12  \n" // r33

                    "fmla   v8.4s, v22.4s, %20.s[1]         \n"
                    "fmla   v9.4s, v22.4s, %19.s[0]         \n"

                    // r4
                    "prfm   pldl1keep, [%6, #256]           \n"
                    "ld1    {v16.4s, v17.4s}, [%6]          \n" // v16 v17 = r40 r44

                    "fmla   v10.4s, v23.4s, %20.s[2]        \n"
                    "fmla   v11.4s, v23.4s, %19.s[1]        \n"

                    "add    %6, %6, #16                     \n"

                    "fmla   v8.4s, v16.4s, %21.s[0]         \n"
                    "fmla   v9.4s, v16.4s, %19.s[3]         \n"

                    "ext    v18.16b, v16.16b, v17.16b, #4   \n" // r41

                    "fmla   v10.4s, v17.4s, %22.s[0]        \n"
                    "fmla   v11.4s, v17.4s, %20.s[3]        \n"

                    "ext    v19.16b, v16.16b, v17.16b, #8   \n" // r42

                    "fmla   v8.4s, v18.4s, %21.s[1]         \n"
                    "fmla   v9.4s, v18.4s, %20.s[0]         \n"

                    "ext    v20.16b, v16.16b, v17.16b, #12  \n" // r43

                    "fmla   v10.4s, v19.4s, %21.s[2]        \n"
                    "fmla   v11.4s, v19.4s, %20.s[1]        \n"

                    // r0
                    "prfm   pldl1keep, [%2, #256]           \n"
                    "ld1    {v16.4s, v17.4s}, [%2]          \n" // v16 v17 = r00 r04

                    "fmla   v8.4s, v20.4s, %21.s[3]         \n"
                    "fmla   v9.4s, v20.4s, %20.s[2]         \n"

                    // r5
                    "prfm   pldl1keep, [%7, #256]           \n"
                    "ld1    {v12.4s, v13.4s}, [%7]          \n" // v12 v13 = r50 r54

                    "fmla   v10.4s, v16.4s, %16.s[0]        \n"
                    "fmla   v11.4s, v12.4s, %21.s[0]        \n"

                    "ext    v18.16b, v16.16b, v17.16b, #4   \n" // r01

                    "fmla   v8.4s, v17.4s, %17.s[0]         \n"

                    "ext    v21.16b, v12.16b, v13.16b, #4   \n" // r51

                    "fmla   v9.4s, v13.4s, %22.s[0]         \n"

                    "ext    v19.16b, v16.16b, v17.16b, #8   \n" // r02

                    "fmla   v10.4s, v18.4s, %16.s[1]        \n"

                    "ext    v22.16b, v12.16b, v13.16b, #8   \n" // r52

                    "fmla   v11.4s, v21.4s, %21.s[1]        \n"

                    "ext    v20.16b, v16.16b, v17.16b, #12  \n" // r03

                    "fmla   v8.4s, v19.4s, %16.s[2]         \n"

                    "ext    v23.16b, v12.16b, v13.16b, #12  \n" // r53

                    "fmla   v9.4s, v22.4s, %21.s[2]         \n"

                    "add    %3, %3, #16                     \n"

                    "fmla   v10.4s, v20.4s, %16.s[3]        \n"
                    "fmla   v11.4s, v23.4s, %21.s[3]        \n"

                    "add    %2, %2, #16                     \n"

                    "fadd   v8.4s, v8.4s, v10.4s            \n"
                    "fadd   v9.4s, v9.4s, v11.4s            \n"

                    "add    %7, %7, #16                     \n"

                    "st1    {v8.4s}, [%0], #16              \n"
                    "st1    {v9.4s}, [%1], #16              \n"

                    : "=r"(outptr),  // %0
                    "=r"(outptr2), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3),      // %5
                    "=r"(r4),      // %6
                    "=r"(r5)       // %7
                    : "0"(outptr),
                    "1"(outptr2),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "7"(r5),
                    "w"(_k0123),     // %16
                    "w"(_k4567),     // %17
                    "w"(_k891011),   // %18
                    "w"(_k12131415), // %19
                    "w"(_k16171819), // %20
                    "w"(_k20212223), // %21
                    "w"(_k24242424), // %22
                    "w"(_bias0)      // %23
                    : "cc", "memory", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
            }
#else
            if (nn > 0)
            {
                asm volatile(
                    // r1
                    "pld        [%4, #256]          \n"
                    "vld1.f32   {d28-d31}, [%4]     \n" // q14 q15 = r10 r14

                    "vmov       q8, %q25            \n" // q8 = _bias0

                    "0:                             \n"

                    "vmov       q9, %q25            \n" // q9 = _bias0

                    "vmla.f32   q8, q14, %e19[1]    \n"
                    "vmla.f32   q9, q14, %e18[0]    \n"

                    "vext.32    q12, q14, q15, #1   \n" // r11

                    "vmla.f32   q8, q15, %e20[1]    \n"
                    "vmla.f32   q9, q15, %e19[0]    \n"

                    "vext.32    q13, q14, q15, #2   \n" // r12

                    "vmla.f32   q8, q12, %f19[0]    \n"
                    "vmla.f32   q9, q12, %e18[1]    \n"

                    "vext.32    q12, q14, q15, #3   \n" // r13

                    "vmla.f32   q8, q13, %f19[1]    \n"
                    "vmla.f32   q9, q13, %f18[0]    \n"

                    // r2
                    "pld        [%5, #256]          \n"
                    "vld1.f32   {d20-d23}, [%5]     \n" // q10 q11 = r20 r24

                    "vmla.f32   q8, q12, %e20[0]    \n"
                    "vmla.f32   q9, q12, %f18[1]    \n"

                    "add        %5, #16             \n"

                    "vmla.f32   q8, q10, %f20[0]    \n"
                    "vmla.f32   q9, q10, %e19[1]    \n"

                    "vext.32    q12, q10, q11, #1   \n" // r21

                    "vmla.f32   q8, q11, %f21[0]    \n"
                    "vmla.f32   q9, q11, %e20[1]    \n"

                    "vext.32    q13, q10, q11, #2   \n" // r22

                    "vmla.f32   q8, q12, %f20[1]    \n"
                    "vmla.f32   q9, q12, %f19[0]    \n"

                    "vext.32    q12, q10, q11, #3   \n" // r23

                    "vmla.f32   q8, q13, %e21[0]    \n"
                    "vmla.f32   q9, q13, %f19[1]    \n"

                    // r3
                    "pld        [%6, #256]          \n"
                    "vld1.f32   {d28-d31}, [%6]     \n" // q14 q15 = r30 r34

                    "vmla.f32   q8, q12, %e21[1]    \n"
                    "vmla.f32   q9, q12, %e20[0]    \n"

                    "add        %6, #16             \n"

                    "vmla.f32   q8, q14, %f21[1]    \n"
                    "vmla.f32   q9, q14, %f20[0]    \n"

                    "vext.32    q12, q14, q15, #1   \n" // r31

                    "vmla.f32   q8, q15, %f22[1]    \n"
                    "vmla.f32   q9, q15, %f21[0]    \n"

                    "vext.32    q13, q14, q15, #2   \n" // r32

                    "vmla.f32   q8, q12, %e22[0]    \n"
                    "vmla.f32   q9, q12, %f20[1]    \n"

                    "vext.32    q12, q14, q15, #3   \n" // r33

                    "vmla.f32   q8, q13, %e22[1]    \n"
                    "vmla.f32   q9, q13, %e21[0]    \n"

                    // r4
                    "pld        [%7, #256]          \n"
                    "vld1.f32   {d20-d23}, [%7]     \n" // q10 q11 = r40 r44

                    "vmla.f32   q8, q12, %f22[0]    \n"
                    "vmla.f32   q9, q12, %e21[1]    \n"

                    "add        %7, #16             \n"

                    "vmla.f32   q8, q10, %e23[0]    \n"
                    "vmla.f32   q9, q10, %f21[1]    \n"

                    "vext.32    q12, q10, q11, #1   \n" // r41

                    "vmla.f32   q8, q11, %e24[0]    \n"
                    "vmla.f32   q9, q11, %f22[1]    \n"

                    "vext.32    q13, q10, q11, #2   \n" // r42

                    "vmla.f32   q8, q12, %e23[1]    \n"
                    "vmla.f32   q9, q12, %e22[0]    \n"

                    "vext.32    q12, q10, q11, #3   \n" // r43

                    "vmla.f32   q8, q13, %f23[0]    \n"
                    "vmla.f32   q9, q13, %e22[1]    \n"

                    // r0 and r5
                    "pld        [%3, #256]          \n"
                    "vld1.f32   {d20-d23}, [%3]     \n" // q10 q11 = r00 r04

                    "vmla.f32   q8, q12, %f23[1]    \n"
                    "vmla.f32   q9, q12, %f22[0]    \n"

                    // r5
                    "pld        [%8, #256]          \n"
                    "vld1.f32   {d28-d31}, [%8]     \n" // q14 q15 = r50 r54

                    "vmla.f32   q8, q10, %e18[0]    \n"
                    "vmla.f32   q9, q14, %e23[0]    \n"

                    "vext.32    q12, q10, q11, #1   \n" // r01

                    "vmla.f32   q8, q11, %e19[0]    \n"
                    "vmla.f32   q9, q15, %e24[0]    \n"

                    "vext.32    q13, q14, q15, #1   \n" // r51

                    "vmla.f32   q8, q12, %e18[1]    \n"

                    "vext.32    q12, q10, q11, #2   \n" // r02

                    "vmla.f32   q9, q13, %e23[1]    \n"

                    "vext.32    q13, q14, q15, #2   \n" // r52

                    "vmla.f32   q8, q12, %f18[0]    \n"

                    "vext.32    q12, q10, q11, #3   \n" // r03

                    "vmla.f32   q9, q13, %f23[0]    \n"

                    "vext.32    q13, q14, q15, #3   \n" // r53

                    "vmla.f32   q8, q12, %f18[1]    \n"

                    "add        %3, #16             \n"

                    "vmla.f32   q9, q13, %f23[1]    \n"

                    "add        %4, #16             \n"

                    // r1
                    "pld        [%4, #256]          \n"
                    "vld1.f32   {d28-d31}, [%4]     \n" // q14 q15 = r10 r14

                    "add        %8, #16             \n"

                    "vst1.f32   {d16-d17}, [%1]!    \n"

                    "vmov       q8, %q25            \n" // q8 = _bias0

                    "subs       %0, #1              \n"

                    "vst1.f32   {d18-d19}, [%2]!    \n"

                    "bne        0b                  \n"
                    : "=r"(nn),      // %0
                    "=r"(outptr),  // %1
                    "=r"(outptr2), // %2
                    "=r"(r0),      // %3
                    "=r"(r1),      // %4
                    "=r"(r2),      // %5
                    "=r"(r3),      // %6
                    "=r"(r4),      // %7
                    "=r"(r5)       // %8
                    : "0"(nn),
                    "1"(outptr),
                    "2"(outptr2),
                    "3"(r0),
                    "4"(r1),
                    "5"(r2),
                    "6"(r3),
                    "7"(r4),
                    "8"(r5),
                    "w"(_k0123),     // %18
                    "w"(_k4567),     // %19
                    "w"(_k891011),   // %20
                    "w"(_k12131415), // %21
                    "w"(_k16171819), // %22
                    "w"(_k20212223), // %23
                    "w"(_k24242424), // %24
                    "w"(_bias0)      // %25
                    : "cc", "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
                float sum = bias0;
                float sum2 = bias0;
#if __ARM_NEON
                // TODO neon assembly optimize
                float32x4_t _r1 = vld1q_f32(r1);
                float32x4_t _k1 = vld1q_f32(k1);
                float32x4_t _sum = vmulq_f32(_r1, _k1);
                float32x4_t _sum2 = vmulq_f32(_r1, _k0123);

                float32x4_t _r2 = vld1q_f32(r2);
                float32x4_t _k2 = vld1q_f32(k2);
                _sum = vmlaq_f32(_sum, _r2, _k2);
                _sum2 = vmlaq_f32(_sum2, _r2, _k1);

                float32x4_t _r3 = vld1q_f32(r3);
                float32x4_t _k3 = vld1q_f32(k3);
                _sum = vmlaq_f32(_sum, _r3, _k3);
                _sum2 = vmlaq_f32(_sum2, _r3, _k2);

                float32x4_t _r4 = vld1q_f32(r4);
                _sum = vmlaq_f32(_sum, _r4, _k20212223);
                _sum2 = vmlaq_f32(_sum2, _r4, _k3);

                float32x4_t _r0 = vld1q_f32(r0);
                _sum = vmlaq_f32(_sum, _r0, _k0123);
                float32x4_t _r5 = vld1q_f32(r5);
                _sum2 = vmlaq_f32(_sum2, _r5, _k20212223);

                float32x4_t _k_t4 = {};

                _k_t4 = vsetq_lane_f32(k0[4], _k_t4, 0);
                _k_t4 = vsetq_lane_f32(k1[4], _k_t4, 1);
                _k_t4 = vsetq_lane_f32(k2[4], _k_t4, 2);
                _k_t4 = vsetq_lane_f32(k3[4], _k_t4, 3);

                float32x4_t _r_t4 = {};

                _r_t4 = vsetq_lane_f32(r0[4], _r_t4, 0);
                _r_t4 = vsetq_lane_f32(r1[4], _r_t4, 1);
                _r_t4 = vsetq_lane_f32(r2[4], _r_t4, 2);
                _r_t4 = vsetq_lane_f32(r3[4], _r_t4, 3);
                _sum = vmlaq_f32(_sum, _r_t4, _k_t4);

                sum += r4[4] * k4[4];

                _r_t4 = vextq_f32(_r_t4, _r_t4, 1);
                _r_t4 = vsetq_lane_f32(r4[4], _r_t4, 3);
                _sum2 = vmlaq_f32(_sum2, _r_t4, _k_t4);

                sum2 += r5[4] * k4[4];

                float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                float32x2_t _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
                float32x2_t _ss_ss2 = vpadd_f32(_ss, _ss2);

                sum += vget_lane_f32(_ss_ss2, 0);
                sum2 += vget_lane_f32(_ss_ss2, 1);
#else
                sum += r0[0] * k0[0];
                sum += r0[1] * k0[1];
                sum += r0[2] * k0[2];
                sum += r0[3] * k0[3];
                sum += r0[4] * k0[4];

                sum += r1[0] * k1[0];
                sum += r1[1] * k1[1];
                sum += r1[2] * k1[2];
                sum += r1[3] * k1[3];
                sum += r1[4] * k1[4];

                sum += r2[0] * k2[0];
                sum += r2[1] * k2[1];
                sum += r2[2] * k2[2];
                sum += r2[3] * k2[3];
                sum += r2[4] * k2[4];

                sum += r3[0] * k3[0];
                sum += r3[1] * k3[1];
                sum += r3[2] * k3[2];
                sum += r3[3] * k3[3];
                sum += r3[4] * k3[4];

                sum += r4[0] * k4[0];
                sum += r4[1] * k4[1];
                sum += r4[2] * k4[2];
                sum += r4[3] * k4[3];
                sum += r4[4] * k4[4];

                sum2 += r1[0] * k0[0];
                sum2 += r1[1] * k0[1];
                sum2 += r1[2] * k0[2];
                sum2 += r1[3] * k0[3];
                sum2 += r1[4] * k0[4];

                sum2 += r2[0] * k1[0];
                sum2 += r2[1] * k1[1];
                sum2 += r2[2] * k1[2];
                sum2 += r2[3] * k1[3];
                sum2 += r2[4] * k1[4];

                sum2 += r3[0] * k2[0];
                sum2 += r3[1] * k2[1];
                sum2 += r3[2] * k2[2];
                sum2 += r3[3] * k2[3];
                sum2 += r3[4] * k2[4];

                sum2 += r4[0] * k3[0];
                sum2 += r4[1] * k3[1];
                sum2 += r4[2] * k3[2];
                sum2 += r4[3] * k3[3];
                sum2 += r4[4] * k3[4];

                sum2 += r5[0] * k4[0];
                sum2 += r5[1] * k4[1];
                sum2 += r5[2] * k4[2];
                sum2 += r5[3] * k4[3];
                sum2 += r5[4] * k4[4];
#endif // __ARM_NEON
                *outptr = sum;
                *outptr2 = sum2;

                r0++;
                r1++;
                r2++;
                r3++;
                r4++;
                r5++;
                outptr++;
                outptr2++;
            }

            r0 += 4 + w;
            r1 += 4 + w;
            r2 += 4 + w;
            r3 += 4 + w;
            r4 += 4 + w;
            r5 += 4 + w;

            outptr += outw;
            outptr2 += outw;
        }

        for (; i < outh; i++)
        {
#if __ARM_NEON
#if __aarch64__
            int nn = outw >> 3;
            int remain = outw & 7;
#else
            int nn = outw >> 2;
            int remain = outw & 3;
#endif // __aarch64__
#else
            int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            if (nn > 0)
            {
                asm volatile(
                    // v10 v11
                    // r0
                    "prfm   pldl1keep, [%2, #384]           \n"
                    "ld1    {v16.4s, v17.4s, v18.4s}, [%2]  \n" // v16 v17 v18 = r00 r04 r08

                    "mov    v8.16b, %21.16b                 \n" // v8 = _bias0
                    "mov    v9.16b, %21.16b                 \n" // v9 = _bias0

                    "0:                                     \n"

                    "fmul   v10.4s, v16.4s, %14.s[0]         \n"

                    "ext    v19.16b, v16.16b, v17.16b, #4   \n" // r01

                    "fmul   v11.4s, v17.4s, %14.s[0]         \n"

                    "ext    v20.16b, v17.16b, v18.16b, #4   \n" // r05

                    "fmla   v8.4s, v17.4s, %15.s[0]         \n"

                    "ext    v21.16b, v16.16b, v17.16b, #8   \n" // r02

                    "fmla   v9.4s, v18.4s, %15.s[0]         \n"

                    "ext    v22.16b, v17.16b, v18.16b, #8   \n" // r06

                    "fmla   v10.4s, v19.4s, %14.s[1]         \n"

                    "ext    v19.16b, v16.16b, v17.16b, #12  \n" // r03

                    "fmla   v11.4s, v20.4s, %14.s[1]         \n"

                    "ext    v20.16b, v17.16b, v18.16b, #12  \n" // r07

                    "fmla   v8.4s, v21.4s, %14.s[2]         \n"
                    "fmla   v9.4s, v22.4s, %14.s[2]         \n"

                    // r1
                    "prfm   pldl1keep, [%3, #384]           \n"
                    "ld1    {v12.4s, v13.4s, v14.4s}, [%3]  \n" // v12 v13 v14 = r10 r14 r18

                    "fmla   v10.4s, v19.4s, %14.s[3]         \n"
                    "fmla   v11.4s, v20.4s, %14.s[3]         \n"

                    "fmla   v8.4s, v12.4s, %15.s[1]         \n"

                    "ext    v19.16b, v12.16b, v13.16b, #4   \n" // r11

                    "fmla   v9.4s, v13.4s, %15.s[1]         \n"

                    "ext    v20.16b, v13.16b, v14.16b, #4   \n" // r15

                    "fmla   v10.4s, v13.4s, %16.s[1]         \n"

                    "ext    v21.16b, v12.16b, v13.16b, #8   \n" // r12

                    "fmla   v11.4s, v14.4s, %16.s[1]         \n"

                    "ext    v22.16b, v13.16b, v14.16b, #8   \n" // r16

                    "fmla   v8.4s, v19.4s, %15.s[2]         \n"

                    "ext    v19.16b, v12.16b, v13.16b, #12  \n" // r13

                    "fmla   v9.4s, v20.4s, %15.s[2]         \n"

                    "ext    v20.16b, v13.16b, v14.16b, #12  \n" // r17

                    "fmla   v10.4s, v21.4s, %15.s[3]         \n"
                    "fmla   v11.4s, v22.4s, %15.s[3]         \n"

                    // r2
                    "prfm   pldl1keep, [%4, #384]           \n"
                    "ld1    {v16.4s, v17.4s, v18.4s}, [%4]  \n" // v16 v17 v18 = r20 r24 r28

                    "fmla   v8.4s, v19.4s, %16.s[0]         \n"
                    "fmla   v9.4s, v20.4s, %16.s[0]         \n"

                    "fmla   v10.4s, v16.4s, %16.s[2]         \n"

                    "ext    v19.16b, v16.16b, v17.16b, #4   \n" // r21

                    "fmla   v11.4s, v17.4s, %16.s[2]         \n"

                    "ext    v20.16b, v17.16b, v18.16b, #4   \n" // r25

                    "fmla   v8.4s, v17.4s, %17.s[2]         \n"

                    "ext    v21.16b, v16.16b, v17.16b, #8   \n" // r22

                    "fmla   v9.4s, v18.4s, %17.s[2]         \n"

                    "ext    v22.16b, v17.16b, v18.16b, #8   \n" // r26

                    "fmla   v10.4s, v19.4s, %16.s[3]         \n"

                    "ext    v19.16b, v16.16b, v17.16b, #12  \n" // r23

                    "fmla   v11.4s, v20.4s, %16.s[3]         \n"

                    "ext    v20.16b, v17.16b, v18.16b, #12  \n" // r27

                    "fmla   v8.4s, v21.4s, %17.s[0]         \n"
                    "fmla   v9.4s, v22.4s, %17.s[0]         \n"

                    // r3
                    "prfm   pldl1keep, [%5, #384]           \n"
                    "ld1    {v12.4s, v13.4s, v14.4s}, [%5]  \n" // v12 v13 v14 = r30 r34 r38

                    "fmla   v10.4s, v19.4s, %17.s[1]         \n"
                    "fmla   v11.4s, v20.4s, %17.s[1]         \n"

                    "fmla   v8.4s, v12.4s, %17.s[3]         \n"

                    "ext    v19.16b, v12.16b, v13.16b, #4   \n" // r31

                    "fmla   v9.4s, v13.4s, %17.s[3]         \n"

                    "ext    v20.16b, v13.16b, v14.16b, #4   \n" // r35

                    "fmla   v10.4s, v13.4s, %18.s[3]         \n"

                    "ext    v21.16b, v12.16b, v13.16b, #8   \n" // r32

                    "fmla   v11.4s, v14.4s, %18.s[3]         \n"

                    "ext    v22.16b, v13.16b, v14.16b, #8   \n" // r36

                    "fmla   v8.4s, v19.4s, %18.s[0]         \n"

                    "ext    v19.16b, v12.16b, v13.16b, #12  \n" // r33

                    "fmla   v9.4s, v20.4s, %18.s[0]         \n"

                    "ext    v20.16b, v13.16b, v14.16b, #12  \n" // r37

                    "fmla   v10.4s, v21.4s, %18.s[1]         \n"
                    "fmla   v11.4s, v22.4s, %18.s[1]         \n"

                    // r4
                    "prfm   pldl1keep, [%6, #384]           \n"
                    "ld1    {v16.4s, v17.4s, v18.4s}, [%6]  \n" // v16 v17 v18 = r40 r44 r48

                    "fmla   v8.4s, v19.4s, %18.s[2]         \n"
                    "fmla   v9.4s, v20.4s, %18.s[2]         \n"

                    "fmla   v10.4s, v16.4s, %19.s[0]         \n"

                    "ext    v19.16b, v16.16b, v17.16b, #4   \n" // r41

                    "fmla   v11.4s, v17.4s, %19.s[0]         \n"

                    "ext    v20.16b, v17.16b, v18.16b, #4   \n" // r45

                    "fmla   v8.4s, v17.4s, %20.s[0]         \n"

                    "ext    v21.16b, v16.16b, v17.16b, #8   \n" // r42

                    "fmla   v9.4s, v18.4s, %20.s[0]         \n"

                    "ext    v22.16b, v17.16b, v18.16b, #8   \n" // r46

                    "fmla   v10.4s, v19.4s, %19.s[1]         \n"

                    "ext    v19.16b, v16.16b, v17.16b, #12  \n" // r43

                    "fmla   v11.4s, v20.4s, %19.s[1]         \n"

                    "ext    v20.16b, v17.16b, v18.16b, #12  \n" // r47

                    "fmla   v8.4s, v21.4s, %19.s[2]         \n"

                    "add    %2, %2, #32                     \n"

                    "fmla   v9.4s, v22.4s, %19.s[2]         \n"

                    "add    %3, %3, #32                     \n"

                    "fmla   v10.4s, v19.4s, %19.s[3]         \n"

                    "add    %4, %4, #32                     \n"

                    "fmla   v11.4s, v20.4s, %19.s[3]         \n"

                    // r0
                    "prfm   pldl1keep, [%2, #384]           \n"
                    "ld1    {v16.4s, v17.4s, v18.4s}, [%2]  \n" // v16 v17 v18 = r00 r04 r08

                    "add    %5, %5, #32                     \n"

                    "fadd   v10.4s, v8.4s, v10.4s           \n"

                    "add    %6, %6, #32                     \n"

                    "fadd   v11.4s, v9.4s, v11.4s           \n"

                    "mov    v8.16b, %21.16b                 \n" // v8 = _bias0
                    "mov    v9.16b, %21.16b                 \n" // v9 = _bias0

                    "subs   %w0, %w0, #1                    \n"

                    "st1    {v10.4s, v11.4s}, [%1], #32     \n"

                    "bne    0b                              \n"
                    : "=r"(nn),     // %0
                    "=r"(outptr), // %1
                    "=r"(r0),     // %2
                    "=r"(r1),     // %3
                    "=r"(r2),     // %4
                    "=r"(r3),     // %5
                    "=r"(r4)      // %6
                    : "0"(nn),
                    "1"(outptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "w"(_k0123),     // %14
                    "w"(_k4567),     // %15
                    "w"(_k891011),   // %16
                    "w"(_k12131415), // %17
                    "w"(_k16171819), // %18
                    "w"(_k20212223), // %19
                    "w"(_k24242424), // %20
                    "w"(_bias0)      // %21
                    : "cc", "memory", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20", "v21", "v22");
            }

            if (remain >= 4)
            {
                remain -= 4;
                asm volatile(
                    // r0
                    "prfm   pldl1keep, [%1, #256]           \n"
                    "ld1    {v16.4s, v17.4s}, [%1]          \n" // v16 v17 = r00 r04

                    "mov    v8.16b, %19.16b                 \n" // v8 = _bias0

                    "add    %1, %1, #16                     \n"

                    "fmul   v9.4s, v16.4s, %12.s[0]         \n"

                    "ext    v18.16b, v16.16b, v17.16b, #4   \n" // r01

                    "fmla   v8.4s, v17.4s, %13.s[0]         \n"

                    "ext    v19.16b, v16.16b, v17.16b, #8   \n" // r02

                    "fmla   v9.4s, v18.4s, %12.s[1]         \n"

                    "ext    v20.16b, v16.16b, v17.16b, #12  \n" // r03

                    "fmla   v8.4s, v19.4s, %12.s[2]         \n"

                    // r1
                    "prfm   pldl1keep, [%2, #256]           \n"
                    "ld1    {v10.4s, v11.4s}, [%2]          \n" // v10 v11 = r10 r14

                    "fmla   v9.4s, v20.4s, %12.s[3]         \n"

                    "add    %2, %2, #16                     \n"

                    "fmla   v8.4s, v10.4s, %13.s[1]         \n"

                    "ext    v12.16b, v10.16b, v11.16b, #4   \n" // r11

                    "fmla   v9.4s, v11.4s, %14.s[1]         \n"

                    "ext    v13.16b, v10.16b, v11.16b, #8   \n" // r12

                    "fmla   v8.4s, v12.4s, %13.s[2]         \n"

                    "ext    v14.16b, v10.16b, v11.16b, #12  \n" // r13

                    "fmla   v9.4s, v13.4s, %13.s[3]         \n"

                    // r2
                    "prfm   pldl1keep, [%3, #256]           \n"
                    "ld1    {v16.4s, v17.4s}, [%3]          \n" // v16 v17 = r20 r24

                    "fmla   v8.4s, v14.4s, %14.s[0]         \n"

                    "add    %3, %3, #16                     \n"

                    "fmla   v9.4s, v16.4s, %14.s[2]         \n"

                    "ext    v18.16b, v16.16b, v17.16b, #4   \n" // r21

                    "fmla   v8.4s, v17.4s, %15.s[2]         \n"

                    "ext    v19.16b, v16.16b, v17.16b, #8   \n" // r22

                    "fmla   v9.4s, v18.4s, %14.s[3]         \n"

                    "ext    v20.16b, v16.16b, v17.16b, #12  \n" // r23

                    "fmla   v8.4s, v19.4s, %15.s[0]         \n"

                    // r3
                    "prfm   pldl1keep, [%4, #256]           \n"
                    "ld1    {v10.4s, v11.4s}, [%4]          \n" // v10 v11 = r30 r34

                    "fmla   v9.4s, v20.4s, %15.s[1]         \n"

                    "add    %4, %4, #16                     \n"

                    "fmla   v8.4s, v10.4s, %15.s[3]         \n"

                    "ext    v12.16b, v10.16b, v11.16b, #4   \n" // r31

                    "fmla   v9.4s, v11.4s, %16.s[3]         \n"

                    "ext    v13.16b, v10.16b, v11.16b, #8   \n" // r32

                    "fmla   v8.4s, v12.4s, %16.s[0]         \n"

                    "ext    v14.16b, v10.16b, v11.16b, #12  \n" // r33

                    "fmla   v9.4s, v13.4s, %16.s[1]         \n"

                    // r4
                    "prfm   pldl1keep, [%5, #256]           \n"
                    "ld1    {v16.4s, v17.4s}, [%5]          \n" // v16 v17 = r40 r44

                    "fmla   v8.4s, v14.4s, %16.s[2]         \n"

                    "add    %5, %5, #16                     \n"

                    "fmla   v9.4s, v16.4s, %17.s[0]         \n"

                    "ext    v18.16b, v16.16b, v17.16b, #4   \n" // r41

                    "fmla   v8.4s, v17.4s, %18.s[0]         \n"

                    "ext    v19.16b, v16.16b, v17.16b, #8   \n" // r42

                    "fmla   v9.4s, v18.4s, %17.s[1]         \n"

                    "ext    v20.16b, v16.16b, v17.16b, #12  \n" // r43

                    "fmla   v8.4s, v19.4s, %17.s[2]         \n"

                    "fmla   v9.4s, v20.4s, %17.s[3]         \n"

                    "fadd   v8.4s, v8.4s, v9.4s             \n"

                    "st1    {v8.4s}, [%0], #16              \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1),     // %2
                    "=r"(r2),     // %3
                    "=r"(r3),     // %4
                    "=r"(r4)      // %5
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "4"(r3),
                    "5"(r4),
                    "w"(_k0123),     // %12
                    "w"(_k4567),     // %13
                    "w"(_k891011),   // %14
                    "w"(_k12131415), // %15
                    "w"(_k16171819), // %16
                    "w"(_k20212223), // %17
                    "w"(_k24242424), // %18
                    "w"(_bias0)      // %19
                    : "cc", "memory", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v19", "v20");
            }
#else
            if (nn > 0)
            {
                asm volatile(
                    // r0
                    "pld        [%2, #256]          \n"
                    "vld1.f32   {d20-d23}, [%2]     \n" // q10 q11 = r00 r04

                    "vmov       q8, %q21            \n" // q8 = _bias0

                    "0:                             \n"

                    "vmul.f32   q9, q10, %e14[0]    \n"

                    "vext.32    q12, q10, q11, #1   \n" // r01

                    "vmla.f32   q8, q11, %e15[0]    \n"

                    "vext.32    q13, q10, q11, #2   \n" // r02

                    "vmla.f32   q9, q12, %e14[1]    \n"

                    "vext.32    q12, q10, q11, #3   \n" // r03

                    "vmla.f32   q8, q13, %f14[0]    \n"

                    // r1
                    "pld        [%3, #256]          \n"
                    "vld1.f32   {d28-d31}, [%3]     \n" // q14 q15 = r10 r14

                    "vmla.f32   q9, q12, %f14[1]    \n"

                    "add        %3, #16             \n"

                    "vmla.f32   q8, q14, %e15[1]    \n"

                    "vext.32    q12, q14, q15, #1   \n" // r11

                    "vmla.f32   q9, q15, %e16[1]    \n"

                    "vext.32    q13, q14, q15, #2   \n" // r12

                    "vmla.f32   q8, q12, %f15[0]    \n"

                    "vext.32    q12, q14, q15, #3   \n" // r13

                    "vmla.f32   q9, q13, %f15[1]    \n"

                    // r2
                    "pld        [%4, #256]          \n"
                    "vld1.f32   {d20-d23}, [%4]     \n" // q10 q11 = r20 r24

                    "vmla.f32   q8, q12, %e16[0]    \n"

                    "add        %4, #16             \n"

                    "vmla.f32   q9, q10, %f16[0]    \n"

                    "vext.32    q12, q10, q11, #1   \n" // r21

                    "vmla.f32   q8, q11, %f17[0]    \n"

                    "vext.32    q13, q10, q11, #2   \n" // r22

                    "vmla.f32   q9, q12, %f16[1]    \n"

                    "vext.32    q12, q10, q11, #3   \n" // r23

                    "vmla.f32   q8, q13, %e17[0]    \n"

                    // r3
                    "pld        [%5, #256]          \n"
                    "vld1.f32   {d28-d31}, [%5]     \n" // q14 q15 = r30 r34

                    "vmla.f32   q9, q12, %e17[1]    \n"

                    "add        %5, #16             \n"

                    "vmla.f32   q8, q14, %f17[1]    \n"

                    "vext.32    q12, q14, q15, #1   \n" // r31

                    "vmla.f32   q9, q15, %f18[1]    \n"

                    "vext.32    q13, q14, q15, #2   \n" // r32

                    "vmla.f32   q8, q12, %e18[0]    \n"

                    "vext.32    q12, q14, q15, #3   \n" // r33

                    "vmla.f32   q9, q13, %e18[1]    \n"

                    // r4
                    "pld        [%6, #256]          \n"
                    "vld1.f32   {d20-d23}, [%6]     \n" // q10 q11 = r40 r44

                    "vmla.f32   q8, q12, %f18[0]    \n"

                    "add        %6, #16             \n"

                    "vmla.f32   q9, q10, %e19[0]    \n"

                    "vext.32    q12, q10, q11, #1   \n" // r41

                    "vmla.f32   q8, q11, %e20[0]    \n"

                    "vext.32    q13, q10, q11, #2   \n" // r42

                    "vmla.f32   q9, q12, %e19[1]    \n"

                    "vext.32    q12, q10, q11, #3   \n" // r43

                    "vmla.f32   q8, q13, %f19[0]    \n"

                    "add        %2, #16             \n"

                    "vmla.f32   q9, q12, %f19[1]    \n"

                    // r0
                    "pld        [%2, #256]          \n"
                    "vld1.f32   {d20-d23}, [%2]     \n" // q10 q11 = r00 r04

                    "vadd.f32   q9, q9, q8          \n"

                    "vmov       q8, %q21            \n" // q8 = _bias0

                    "subs       %0, #1              \n"

                    "vst1.f32   {d18-d19}, [%1]!    \n"

                    "bne        0b                  \n"
                    : "=r"(nn),     // %0
                    "=r"(outptr), // %1
                    "=r"(r0),     // %2
                    "=r"(r1),     // %3
                    "=r"(r2),     // %4
                    "=r"(r3),     // %5
                    "=r"(r4)      // %6
                    : "0"(nn),
                    "1"(outptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "w"(_k0123),     // %14
                    "w"(_k4567),     // %15
                    "w"(_k891011),   // %16
                    "w"(_k12131415), // %17
                    "w"(_k16171819), // %18
                    "w"(_k20212223), // %19
                    "w"(_k24242424), // %20
                    "w"(_bias0)      // %21
                    : "cc", "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
#if __ARM_NEON
#if __aarch64__
                // TODO neon assembly optimize
                float sum = bias0;

                float32x4_t _r0 = vld1q_f32(r0);
                float32x4_t _sum = vmulq_f32(_r0, _k0123);

                float32x4_t _r1 = vld1q_f32(r1);
                _sum = vmlaq_f32(_sum, _r1, vld1q_f32(k1));

                float32x4_t _r2 = vld1q_f32(r2);
                _sum = vmlaq_f32(_sum, _r2, vld1q_f32(k2));

                float32x4_t _r3 = vld1q_f32(r3);
                _sum = vmlaq_f32(_sum, _r3, vld1q_f32(k3));

                float32x4_t _r4 = vld1q_f32(r4);
                _sum = vmlaq_f32(_sum, _r4, _k20212223);

                float32x4_t _k_t4 = {};

                _k_t4 = vsetq_lane_f32(k0[4], _k_t4, 0);
                _k_t4 = vsetq_lane_f32(k1[4], _k_t4, 1);
                _k_t4 = vsetq_lane_f32(k2[4], _k_t4, 2);
                _k_t4 = vsetq_lane_f32(k3[4], _k_t4, 3);

                float32x4_t _r_t4 = {};

                _r_t4 = vsetq_lane_f32(r0[4], _r_t4, 0);
                _r_t4 = vsetq_lane_f32(r1[4], _r_t4, 1);
                _r_t4 = vsetq_lane_f32(r2[4], _r_t4, 2);
                _r_t4 = vsetq_lane_f32(r3[4], _r_t4, 3);
                _sum = vmlaq_f32(_sum, _r_t4, _k_t4);

                sum += r4[4] * k4[4];

                float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                _ss = vpadd_f32(_ss, _ss);

                sum += vget_lane_f32(_ss, 0);

                *outptr = sum;

                r0++;
                r1++;
                r2++;
                r3++;
                r4++;
                outptr++;
#else
                // TODO neon assembly optimize
                asm volatile(
                    "veor       q14, q14            \n"
                    "vext.32    q14, %q19, q14, #3  \n" // q14 = bias0 0 0 0

                    "vld1.f32   {d16-d17}, [%1]     \n" // q8 = r00 r01 r02 r03

                    "vld1.f32   {d18-d19}, [%2]     \n" // q9 = r10 r11 r12 r13(X)
                    "add        r4, %1, #16         \n"
                    "vld1.f32   {d19[1]}, [r4]      \n"
                    "vext.32    q9, q9, q9, #3      \n" // q9 = r04 r10 r11 r12

                    "vmla.f32   q14, q8, %q12       \n"

                    "add        r4, %2, #12         \n"
                    "vld1.f32   {d20}, [r4]         \n" // d20 = r13 r14
                    "vld1.f32   {d21}, [%3]         \n" // d21 = r20 r21

                    "vmla.f32   q14, q9, %q13       \n"

                    "add        r4, %3, #8          \n"
                    "vld1.f32   {d22-d23}, [r4]     \n" // q11 = r22 r23 r24 X
                    "vld1.f32   {d23[1]}, [%4]      \n" // q11 = r22 r23 r24 r30

                    "vmla.f32   q14, q10, %q14      \n"

                    "add        r4, %4, #4          \n"
                    "vld1.f32   {d24-d25}, [r4]     \n" // q12 = r31 r32 r33 r34

                    "vmla.f32   q14, q11, %q15      \n"

                    "vld1.f32   {d26-d27}, [%5]     \n" // q13 = r40 r41 r42 r43

                    "vmla.f32   q14, q12, %q16      \n"

                    "veor       d30, d30            \n"
                    "add        r4, %5, #16         \n"
                    "vld1.f32   {d30[0]}, [r4]      \n" // d30 = r44 0

                    "vmla.f32   q14, q13, %q17      \n"

                    "vmla.f32   d28, d30, %e18      \n"

                    "add        %1, #4              \n"

                    // h-sum
                    "vadd.f32   d28, d28, d29       \n"

                    "add        %2, #4              \n"
                    "add        %3, #4              \n"

                    "vpadd.f32  d28, d28, d28       \n"

                    "add        %4, #4              \n"
                    "add        %5, #4              \n"

                    "vst1.f32   {d28[0]}, [%0]!     \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1),     // %2
                    "=r"(r2),     // %3
                    "=r"(r3),     // %4
                    "=r"(r4)      // %5
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "4"(r3),
                    "5"(r4),
                    "w"(_k0123),     // %12
                    "w"(_k4567),     // %13
                    "w"(_k891011),   // %14
                    "w"(_k12131415), // %15
                    "w"(_k16171819), // %16
                    "w"(_k20212223), // %17
                    "w"(_k24242424), // %18
                    "w"(_bias0)      // %19
                    : "cc", "memory", "r4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else
                float sum = bias0;

                sum += r0[0] * k0[0];
                sum += r0[1] * k0[1];
                sum += r0[2] * k0[2];
                sum += r0[3] * k0[3];
                sum += r0[4] * k0[4];

                sum += r1[0] * k1[0];
                sum += r1[1] * k1[1];
                sum += r1[2] * k1[2];
                sum += r1[3] * k1[3];
                sum += r1[4] * k1[4];

                sum += r2[0] * k2[0];
                sum += r2[1] * k2[1];
                sum += r2[2] * k2[2];
                sum += r2[3] * k2[3];
                sum += r2[4] * k2[4];

                sum += r3[0] * k3[0];
                sum += r3[1] * k3[1];
                sum += r3[2] * k3[2];
                sum += r3[3] * k3[3];
                sum += r3[4] * k3[4];

                sum += r4[0] * k4[0];
                sum += r4[1] * k4[1];
                sum += r4[2] * k4[2];
                sum += r4[3] * k4[3];
                sum += r4[4] * k4[4];

                *outptr = sum;

                r0++;
                r1++;
                r2++;
                r3++;
                r4++;
                outptr++;
#endif
            }

            r0 += 4;
            r1 += 4;
            r2 += 4;
            r3 += 4;
            r4 += 4;
        }
    }
}

static void convdw5x5s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    //int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    //int outch = top_blob.c;

    const int tailstep = w - 2 * outw + w;

    const int group = bottom_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        const float bias0 = bias ? bias[g] : 0.f;

        const float* kernel0 = kernel + g * 25;

        float* outptr = out;

        const float* img0 = bottom_blob.channel(g);

        const float* r0 = img0;
        const float* r1 = img0 + w;
        const float* r2 = img0 + w * 2;
        const float* r3 = img0 + w * 3;
        const float* r4 = img0 + w * 4;

        const float* k0 = kernel0;
        const float* k1 = kernel0 + 5;
        const float* k2 = kernel0 + 10;
        const float* k3 = kernel0 + 15;
        const float* k4 = kernel0 + 20;

#if __ARM_NEON
        float32x4_t _k0123 = vld1q_f32(kernel0);
        float32x4_t _k4567 = vld1q_f32(kernel0 + 4);
        float32x4_t _k891011 = vld1q_f32(kernel0 + 8);
        float32x4_t _k12131415 = vld1q_f32(kernel0 + 12);
        float32x4_t _k16171819 = vld1q_f32(kernel0 + 16);
        float32x4_t _k20212223 = vld1q_f32(kernel0 + 20);
        float32x4_t _k24242424 = vdupq_n_f32(kernel0[24]);

        float32x4_t _bias0 = vdupq_n_f32(bias0);
#endif // __ARM_NEON

        int i = 0;

        // NOTE: unrolling outh by 2 results in a slight slowdown (about -4%),
        // so it is not implemented here

        for (; i < outh; i++)
        {
#if __ARM_NEON
#if __aarch64__
            int nn = outw >> 3;
            int remain = outw & 7;
#else
            int nn = outw >> 2;
            int remain = outw & 3;
#endif // __aarch64__
#else
            int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            if (nn > 0)
            {
                asm volatile(
                    // r0
                    "prfm   pldl1keep, [%2, #256]           \n"
                    "ld2    {v16.4s, v17.4s}, [%2], #32     \n" // v16 v17 = r00 r01

                    "mov    v8.16b, %21.16b                 \n" // v8 = _bias0
                    "mov    v9.16b, %21.16b                 \n" // v9 = _bias0

                    "prfm   pldl1keep, [%2, #256]           \n"
                    "ld2    {v18.4s, v19.4s}, [%2], #32     \n" // v18 v19 = r08 r09

                    "0:                                     \n"

                    "fmul   v10.4s, v16.4s, %14.s[0]        \n"

                    "prfm   pldl1keep, [%2, #256]           \n"
                    "ld2    {v20.4s, v21.4s}, [%2]          \n" // v20 v21 = r016 r017

                    "fmul   v11.4s, v18.4s, %14.s[0]        \n"

                    "ext    v22.16b, v16.16b, v18.16b, #4   \n" // v22 = r02

                    "fmla   v8.4s, v17.4s, %14.s[1]         \n"

                    "ext    v25.16b, v18.16b, v20.16b, #4   \n" // v25 = r010

                    "fmla   v9.4s, v19.4s, %14.s[1]         \n"

                    "ext    v23.16b, v17.16b, v19.16b, #4   \n" // v23 = r03

                    "fmla   v10.4s, v22.4s, %14.s[2]        \n"

                    "ext    v26.16b, v19.16b, v21.16b, #4   \n" // v26 = r011

                    "fmla   v11.4s, v25.4s, %14.s[2]        \n"

                    "ext    v24.16b, v16.16b, v18.16b, #8   \n" // v24 = r04

                    "fmla   v8.4s, v23.4s, %14.s[3]         \n"

                    "ext    v27.16b, v18.16b, v20.16b, #8   \n" // v27 = r012

                    "fmla   v9.4s, v26.4s, %14.s[3]         \n"

                    // r1
                    "prfm   pldl1keep, [%3, #256]           \n"
                    "ld2    {v12.4s, v13.4s}, [%3], #32     \n" // v12 v13 = r10 r11

                    "fmla   v10.4s, v24.4s, %15.s[0]        \n"

                    "prfm   pldl1keep, [%3, #256]           \n"
                    "ld2    {v14.4s, v15.4s}, [%3], #32     \n" // v14 v15 = r18 r19

                    "fmla   v11.4s, v27.4s, %15.s[0]        \n"

                    "fmla   v8.4s, v12.4s, %15.s[1]         \n"

                    "prfm   pldl1keep, [%3, #256]           \n"
                    "ld2    {v20.4s, v21.4s}, [%3]          \n" // v20 v21 = r116 r117

                    "fmla   v9.4s, v14.4s, %15.s[1]         \n"

                    "ext    v22.16b, v12.16b, v14.16b, #4   \n" // v22 = r12

                    "fmla   v10.4s, v13.4s, %15.s[2]        \n"

                    "ext    v25.16b, v14.16b, v20.16b, #4   \n" // v25 = r110

                    "fmla   v11.4s, v15.4s, %15.s[2]        \n"

                    "ext    v23.16b, v13.16b, v15.16b, #4   \n" // v23 = r13

                    "fmla   v8.4s, v22.4s, %15.s[3]         \n"

                    "ext    v26.16b, v15.16b, v21.16b, #4   \n" // v26 = r111

                    "fmla   v9.4s, v25.4s, %15.s[3]         \n"

                    "ext    v24.16b, v12.16b, v14.16b, #8   \n" // v24 = r14

                    "fmla   v10.4s, v23.4s, %16.s[0]        \n"

                    "ext    v27.16b, v14.16b, v20.16b, #8   \n" // v27 = r112

                    "fmla   v11.4s, v26.4s, %16.s[0]        \n"

                    // r2
                    "prfm   pldl1keep, [%4, #256]           \n"
                    "ld2    {v16.4s, v17.4s}, [%4], #32     \n" // v16 v17 = r20 r21

                    "fmla   v8.4s, v24.4s, %16.s[1]         \n"

                    "prfm   pldl1keep, [%4, #256]           \n"
                    "ld2    {v18.4s, v19.4s}, [%4], #32     \n" // v18 v19 = r28 r29

                    "fmla   v9.4s, v27.4s, %16.s[1]         \n"

                    "fmla   v10.4s, v16.4s, %16.s[2]        \n"

                    "prfm   pldl1keep, [%4, #256]           \n"
                    "ld2    {v20.4s, v21.4s}, [%4]          \n" // v20 v21 = r216 r217

                    "fmla   v11.4s, v18.4s, %16.s[2]        \n"

                    "ext    v22.16b, v16.16b, v18.16b, #4   \n" // v22 = r22

                    "fmla   v8.4s, v17.4s, %16.s[3]         \n"

                    "ext    v25.16b, v18.16b, v20.16b, #4   \n" // v25 = r210

                    "fmla   v9.4s, v19.4s, %16.s[3]         \n"

                    "ext    v23.16b, v17.16b, v19.16b, #4   \n" // v23 = r23

                    "fmla   v10.4s, v22.4s, %17.s[0]        \n"

                    "ext    v26.16b, v19.16b, v21.16b, #4   \n" // v26 = r211

                    "fmla   v11.4s, v25.4s, %17.s[0]        \n"

                    "ext    v24.16b, v16.16b, v18.16b, #8   \n" // v24 = r24

                    "fmla   v8.4s, v23.4s, %17.s[1]         \n"

                    "ext    v27.16b, v18.16b, v20.16b, #8   \n" // v27 = r212

                    "fmla   v9.4s, v26.4s, %17.s[1]         \n"

                    // r3
                    "prfm   pldl1keep, [%5, #256]           \n"
                    "ld2    {v12.4s, v13.4s}, [%5], #32     \n" // v12 v13 = r30 r31

                    "fmla   v10.4s, v24.4s, %17.s[2]        \n"

                    "prfm   pldl1keep, [%5, #256]           \n"
                    "ld2    {v14.4s, v15.4s}, [%5], #32     \n" // v14 v15 = r38 r39

                    "fmla   v11.4s, v27.4s, %17.s[2]        \n"

                    "fmla   v8.4s, v12.4s, %17.s[3]         \n"

                    "prfm   pldl1keep, [%5, #256]           \n"
                    "ld2    {v20.4s, v21.4s}, [%5]          \n" // v20 v21 = r316 r317

                    "fmla   v9.4s, v14.4s, %17.s[3]         \n"

                    "ext    v22.16b, v12.16b, v14.16b, #4   \n" // v22 = r32

                    "fmla   v10.4s, v13.4s, %18.s[0]        \n"

                    "ext    v25.16b, v14.16b, v20.16b, #4   \n" // v25 = r310

                    "fmla   v11.4s, v15.4s, %18.s[0]        \n"

                    "ext    v23.16b, v13.16b, v15.16b, #4   \n" // v23 = r33

                    "fmla   v8.4s, v22.4s, %18.s[1]         \n"

                    "ext    v26.16b, v15.16b, v21.16b, #4   \n" // v26 = r311

                    "fmla   v9.4s, v25.4s, %18.s[1]         \n"

                    "ext    v24.16b, v12.16b, v14.16b, #8   \n" // v24 = r34

                    "fmla   v10.4s, v23.4s, %18.s[2]        \n"

                    "ext    v27.16b, v14.16b, v20.16b, #8   \n" // v27 = r312

                    "fmla   v11.4s, v26.4s, %18.s[2]        \n"

                    // r4
                    "prfm   pldl1keep, [%6, #256]           \n"
                    "ld2    {v16.4s, v17.4s}, [%6], #32     \n" // v16 v17 = r40 r41

                    "fmla   v8.4s, v24.4s, %18.s[3]         \n"

                    "prfm   pldl1keep, [%6, #256]           \n"
                    "ld2    {v18.4s, v19.4s}, [%6], #32     \n" // v18 v19 = r48 r49

                    "fmla   v9.4s, v27.4s, %18.s[3]         \n"

                    "fmla   v10.4s, v16.4s, %19.s[0]        \n"

                    "prfm   pldl1keep, [%6, #256]           \n"
                    "ld2    {v20.4s, v21.4s}, [%6]          \n" // v20 v21 = r416 r417

                    "fmla   v11.4s, v18.4s, %19.s[0]        \n"

                    "ext    v22.16b, v16.16b, v18.16b, #4   \n" // v22 = r42

                    "fmla   v8.4s, v17.4s, %19.s[1]         \n"

                    "ext    v25.16b, v18.16b, v20.16b, #4   \n" // v25 = r410

                    "fmla   v9.4s, v19.4s, %19.s[1]         \n"

                    "ext    v23.16b, v17.16b, v19.16b, #4   \n" // v23 = r43

                    "fmla   v10.4s, v22.4s, %19.s[2]        \n"

                    "ext    v26.16b, v19.16b, v21.16b, #4   \n" // v26 = r411

                    "fmla   v11.4s, v25.4s, %19.s[2]        \n"

                    "ext    v24.16b, v16.16b, v18.16b, #8   \n" // v24 = r44

                    "fmla   v8.4s, v23.4s, %19.s[3]         \n"

                    "ext    v27.16b, v18.16b, v20.16b, #8   \n" // v27 = r412

                    "fmla   v9.4s, v26.4s, %19.s[3]         \n"
                    "fmla   v10.4s, v24.4s, %20.s[0]        \n"

                    // r0
                    "prfm   pldl1keep, [%2, #256]           \n"
                    "ld2    {v16.4s, v17.4s}, [%2], #32     \n" // v16 v17 = r00 r01

                    "fmla   v11.4s, v27.4s, %20.s[0]        \n"

                    "prfm   pldl1keep, [%2, #256]           \n"
                    "ld2    {v18.4s, v19.4s}, [%2], #32     \n" // v18 v19 = r08 r09

                    "fadd   v10.4s, v8.4s, v10.4s           \n"
                    "fadd   v11.4s, v9.4s, v11.4s           \n"

                    "subs   %w0, %w0, #1                    \n"

                    "mov    v8.16b, %21.16b                 \n" // v8 = _bias0
                    "mov    v9.16b, %21.16b                 \n" // v9 = _bias0

                    "st1    {v10.4s, v11.4s}, [%1], #32     \n"

                    "bne    0b                              \n"
                    "sub    %2, %2, #64                     \n"
                    : "=r"(nn),     // %0
                    "=r"(outptr), // %1
                    "=r"(r0),     // %2
                    "=r"(r1),     // %3
                    "=r"(r2),     // %4
                    "=r"(r3),     // %5
                    "=r"(r4)      // %6
                    : "0"(nn),
                    "1"(outptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "w"(_k0123),     // %14
                    "w"(_k4567),     // %15
                    "w"(_k891011),   // %16
                    "w"(_k12131415), // %17
                    "w"(_k16171819), // %18
                    "w"(_k20212223), // %19
                    "w"(_k24242424), // %20
                    "w"(_bias0)      // %21
                    : "cc", "memory", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
            }
#else
            if (nn > 0)
            {
                asm volatile(
                    // r0
                    "pld        [%2, #256]          \n"
                    "vld2.f32   {d20-d23}, [%2]!    \n" // q10 q11 = r00 r01

                    "vmov       q8, %q21            \n"

                    "pld        [%2, #128]          \n"
                    "vld2.f32   {d24-d25}, [%2]     \n" // q12 = r08 x x

                    "0:                             \n"

                    "vmul.f32   q9, q10, %e14[0]    \n"

                    "vmov       d26, d25            \n" // q13 = r09 x x

                    "vext.32    q14, q10, q12, #1   \n" // q14 = r02

                    "vmla.f32   q8, q11, %e14[1]    \n"

                    "vext.32    q15, q11, q13, #1   \n" // q15 = r03

                    "vmla.f32   q9, q14, %f14[0]    \n"

                    "vext.32    q14, q10, q12, #2   \n" // q14 = r04

                    "vmla.f32   q8, q15, %f14[1]    \n"

                    // r1
                    "pld        [%3, #256]          \n"
                    "vld2.f32   {d20-d23}, [%3]!    \n" // q10 q11 = r10 r11

                    "vmla.f32   q9, q14, %e15[0]    \n"

                    "pld        [%3, #128]          \n"
                    "vld2.f32   {d24-d25}, [%3]     \n" // q12 = r18 x x

                    "vmla.f32   q8, q10, %e15[1]    \n"

                    "vmov       d26, d25            \n" // q13 = r19 x x

                    "vext.32    q14, q10, q12, #1   \n" // q14 = r12

                    "vmla.f32   q9, q11, %f15[0]    \n"

                    "vext.32    q15, q11, q13, #1   \n" // q15 = r13

                    "vmla.f32   q8, q14, %f15[1]    \n"

                    "vext.32    q14, q10, q12, #2   \n" // q14 = r14

                    "vmla.f32   q9, q15, %e16[0]    \n"

                    // r2
                    "pld        [%4, #256]          \n"
                    "vld2.f32   {d20-d23}, [%4]!    \n" // q10 q11 = r20 r21

                    "vmla.f32   q8, q14, %e16[1]    \n"

                    "pld        [%4, #128]          \n"
                    "vld2.f32   {d24-d25}, [%4]     \n" // q12 = r28 x x

                    "vmla.f32   q9, q10, %f16[0]    \n"

                    "vmov       d26, d25            \n" // q13 = r29 x x

                    "vext.32    q14, q10, q12, #1   \n" // q14 = r22

                    "vmla.f32   q8, q11, %f16[1]    \n"

                    "vext.32    q15, q11, q13, #1   \n" // q15 = r23

                    "vmla.f32   q9, q14, %e17[0]    \n"

                    "vext.32    q14, q10, q12, #2   \n" // q14 = r24

                    "vmla.f32   q8, q15, %e17[1]    \n"

                    // r3
                    "pld        [%5, #256]          \n"
                    "vld2.f32   {d20-d23}, [%5]!    \n" // q10 q11 = r30 r31

                    "vmla.f32   q9, q14, %f17[0]    \n"

                    "pld        [%5, #128]          \n"
                    "vld2.f32   {d24-d25}, [%5]     \n" // q12 = r38 x x

                    "vmla.f32   q8, q10, %f17[1]    \n"

                    "vmov       d26, d25            \n" // q13 = r39 x x

                    "vext.32    q14, q10, q12, #1   \n" // q14 = r32

                    "vmla.f32   q9, q11, %e18[0]    \n"

                    "vext.32    q15, q11, q13, #1   \n" // q15 = r33

                    "vmla.f32   q8, q14, %e18[1]    \n"

                    "vext.32    q14, q10, q12, #2   \n" // q14 = r34

                    "vmla.f32   q9, q15, %f18[0]    \n"

                    // r4
                    "pld        [%6, #256]          \n"
                    "vld2.f32   {d20-d23}, [%6]!    \n" // q10 q11 = r40 r41

                    "vmla.f32   q8, q14, %f18[1]    \n"

                    "pld        [%6, #128]          \n"
                    "vld2.f32   {d24-d25}, [%6]     \n" // q12 = r48 x x

                    "vmla.f32   q9, q10, %e19[0]    \n"

                    "vmov       d26, d25            \n" // q13 = r49 x x

                    "vext.32    q14, q10, q12, #1   \n" // q14 = r42

                    "vmla.f32   q8, q11, %e19[1]    \n"

                    "vext.32    q15, q11, q13, #1   \n" // q15 = r43

                    "vmla.f32   q9, q14, %f19[0]    \n"

                    "vext.32    q14, q10, q12, #2   \n" // q14 = r44

                    "vmla.f32   q8, q15, %f19[1]    \n"

                    // r0
                    "pld        [%2, #256]          \n"
                    "vld2.f32   {d20-d23}, [%2]!    \n" // q10 q11 = r00 r01

                    "vmla.f32   q9, q14, %e20[0]    \n"

                    "pld        [%2, #128]          \n"
                    "vld2.f32   {d24-d25}, [%2]     \n" // q12 = r08 x x

                    "vadd.f32   q9, q8, q9          \n"

                    "vmov       q8, %q21            \n"

                    "subs       %0, #1              \n"

                    "vst1.f32   {d18-d19}, [%1]!    \n"

                    "bne        0b                  \n"
                    "sub        %2, #32             \n"

                    : "=r"(nn),     // %0
                    "=r"(outptr), // %1
                    "=r"(r0),     // %2
                    "=r"(r1),     // %3
                    "=r"(r2),     // %4
                    "=r"(r3),     // %5
                    "=r"(r4)      // %6
                    : "0"(nn),
                    "1"(outptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "w"(_k0123),     // %14
                    "w"(_k4567),     // %15
                    "w"(_k891011),   // %16
                    "w"(_k12131415), // %17
                    "w"(_k16171819), // %18
                    "w"(_k20212223), // %19
                    "w"(_k24242424), // %20
                    "w"(_bias0)      // %21
                    : "cc", "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
                float sum = bias0;
#if __ARM_NEON
                // TODO neon assembly optimize
                float32x4_t _r0 = vld1q_f32(r0);
                float32x4_t _sum = vmulq_f32(_r0, _k0123);

                float32x4_t _r1 = vld1q_f32(r1);
                _sum = vmlaq_f32(_sum, _r1, vld1q_f32(k1));

                float32x4_t _r2 = vld1q_f32(r2);
                _sum = vmlaq_f32(_sum, _r2, vld1q_f32(k2));

                float32x4_t _r3 = vld1q_f32(r3);
                _sum = vmlaq_f32(_sum, _r3, vld1q_f32(k3));

                float32x4_t _r4 = vld1q_f32(r4);
                _sum = vmlaq_f32(_sum, _r4, _k20212223);

                sum += r0[4] * k0[4];
                sum += r1[4] * k1[4];
                sum += r2[4] * k2[4];
                sum += r3[4] * k3[4];
                sum += r4[4] * k4[4];

                float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                _ss = vpadd_f32(_ss, _ss);

                sum += vget_lane_f32(_ss, 0);
#else
                sum += r0[0] * k0[0];
                sum += r0[1] * k0[1];
                sum += r0[2] * k0[2];
                sum += r0[3] * k0[3];
                sum += r0[4] * k0[4];

                sum += r1[0] * k1[0];
                sum += r1[1] * k1[1];
                sum += r1[2] * k1[2];
                sum += r1[3] * k1[3];
                sum += r1[4] * k1[4];

                sum += r2[0] * k2[0];
                sum += r2[1] * k2[1];
                sum += r2[2] * k2[2];
                sum += r2[3] * k2[3];
                sum += r2[4] * k2[4];

                sum += r3[0] * k3[0];
                sum += r3[1] * k3[1];
                sum += r3[2] * k3[2];
                sum += r3[3] * k3[3];
                sum += r3[4] * k3[4];

                sum += r4[0] * k4[0];
                sum += r4[1] * k4[1];
                sum += r4[2] * k4[2];
                sum += r4[3] * k4[3];
                sum += r4[4] * k4[4];
#endif
                *outptr = sum;

                r0 += 2;
                r1 += 2;
                r2 += 2;
                r3 += 2;
                r4 += 2;
                outptr++;
            }

            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
            r3 += tailstep;
            r4 += tailstep;
        }
    }
}


================================================
FILE: src/layer/arm/convolutiondepthwise_5x5_pack4.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convdw5x5s1_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
#if __aarch64__
    const int w = bottom_blob.w;
#endif

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    const int group = bottom_blob.c;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + g * 4) : vdupq_n_f32(0.f);

        const float* k0 = kernel.row(g);

        float* outptr0 = out.row(0);

        const Mat img0 = bottom_blob.channel(g);

        const float* r0 = img0.row(0);
        const float* r1 = img0.row(1);
        const float* r2 = img0.row(2);
        const float* r3 = img0.row(3);
        const float* r4 = img0.row(4);

        int i = 0;

#if __aarch64__
        float* outptr1 = out.row(1);
        const float* r5 = img0.row(5);

        for (; i + 1 < outh; i += 2)
        {
            int j = 0;

            for (; j + 3 < outw; j += 4)
            {
                float32x4_t _sum00 = _bias0;
                float32x4_t _sum01 = _bias0;
                float32x4_t _sum02 = _bias0;
                float32x4_t _sum03 = _bias0;
                float32x4_t _sum10 = _bias0;
                float32x4_t _sum11 = _bias0;
                float32x4_t _sum12 = _bias0;
                float32x4_t _sum13 = _bias0;

                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r02 = vld1q_f32(r0 + 8);
                float32x4_t _r03 = vld1q_f32(r0 + 12);
                float32x4_t _r04 = vld1q_f32(r0 + 16);
                float32x4_t _r05 = vld1q_f32(r0 + 20);
                float32x4_t _r06 = vld1q_f32(r0 + 24);
                float32x4_t _r07 = vld1q_f32(r0 + 28);

                float32x4_t _k00 = vld1q_f32(k0);
                float32x4_t _k01 = vld1q_f32(k0 + 4);
                float32x4_t _k02 = vld1q_f32(k0 + 8);
                float32x4_t _k03 = vld1q_f32(k0 + 12);
                float32x4_t _k04 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum00 = vmlaq_f32(_sum00, _k00, _r00);
                _sum00 = vmlaq_f32(_sum00, _k01, _r01);
                _sum00 = vmlaq_f32(_sum00, _k02, _r02);
                _sum00 = vmlaq_f32(_sum00, _k03, _r03);
                _sum00 = vmlaq_f32(_sum00, _k04, _r04);
                _sum01 = vmlaq_f32(_sum01, _k00, _r01);
                _sum01 = vmlaq_f32(_sum01, _k01, _r02);
                _sum01 = vmlaq_f32(_sum01, _k02, _r03);
                _sum01 = vmlaq_f32(_sum01, _k03, _r04);
                _sum01 = vmlaq_f32(_sum01, _k04, _r05);
                _sum02 = vmlaq_f32(_sum02, _k00, _r02);
                _sum02 = vmlaq_f32(_sum02, _k01, _r03);
                _sum02 = vmlaq_f32(_sum02, _k02, _r04);
                _sum02 = vmlaq_f32(_sum02, _k03, _r05);
                _sum02 = vmlaq_f32(_sum02, _k04, _r06);
                _sum03 = vmlaq_f32(_sum03, _k00, _r03);
                _sum03 = vmlaq_f32(_sum03, _k01, _r04);
                _sum03 = vmlaq_f32(_sum03, _k02, _r05);
                _sum03 = vmlaq_f32(_sum03, _k03, _r06);
                _sum03 = vmlaq_f32(_sum03, _k04, _r07);

                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r12 = vld1q_f32(r1 + 8);
                float32x4_t _r13 = vld1q_f32(r1 + 12);
                float32x4_t _r14 = vld1q_f32(r1 + 16);
                float32x4_t _r15 = vld1q_f32(r1 + 20);
                float32x4_t _r16 = vld1q_f32(r1 + 24);
                float32x4_t _r17 = vld1q_f32(r1 + 28);

                float32x4_t _k10 = vld1q_f32(k0);
                float32x4_t _k11 = vld1q_f32(k0 + 4);
                float32x4_t _k12 = vld1q_f32(k0 + 8);
                float32x4_t _k13 = vld1q_f32(k0 + 12);
                float32x4_t _k14 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum10 = vmlaq_f32(_sum10, _k00, _r10);
                _sum10 = vmlaq_f32(_sum10, _k01, _r11);
                _sum10 = vmlaq_f32(_sum10, _k02, _r12);
                _sum10 = vmlaq_f32(_sum10, _k03, _r13);
                _sum10 = vmlaq_f32(_sum10, _k04, _r14);
                _sum11 = vmlaq_f32(_sum11, _k00, _r11);
                _sum11 = vmlaq_f32(_sum11, _k01, _r12);
                _sum11 = vmlaq_f32(_sum11, _k02, _r13);
                _sum11 = vmlaq_f32(_sum11, _k03, _r14);
                _sum11 = vmlaq_f32(_sum11, _k04, _r15);
                _sum12 = vmlaq_f32(_sum12, _k00, _r12);
                _sum12 = vmlaq_f32(_sum12, _k01, _r13);
                _sum12 = vmlaq_f32(_sum12, _k02, _r14);
                _sum12 = vmlaq_f32(_sum12, _k03, _r15);
                _sum12 = vmlaq_f32(_sum12, _k04, _r16);
                _sum13 = vmlaq_f32(_sum13, _k00, _r13);
                _sum13 = vmlaq_f32(_sum13, _k01, _r14);
                _sum13 = vmlaq_f32(_sum13, _k02, _r15);
                _sum13 = vmlaq_f32(_sum13, _k03, _r16);
                _sum13 = vmlaq_f32(_sum13, _k04, _r17);

                _sum00 = vmlaq_f32(_sum00, _k10, _r10);
                _sum00 = vmlaq_f32(_sum00, _k11, _r11);
                _sum00 = vmlaq_f32(_sum00, _k12, _r12);
                _sum00 = vmlaq_f32(_sum00, _k13, _r13);
                _sum00 = vmlaq_f32(_sum00, _k14, _r14);
                _sum01 = vmlaq_f32(_sum01, _k10, _r11);
                _sum01 = vmlaq_f32(_sum01, _k11, _r12);
                _sum01 = vmlaq_f32(_sum01, _k12, _r13);
                _sum01 = vmlaq_f32(_sum01, _k13, _r14);
                _sum01 = vmlaq_f32(_sum01, _k14, _r15);
                _sum02 = vmlaq_f32(_sum02, _k10, _r12);
                _sum02 = vmlaq_f32(_sum02, _k11, _r13);
                _sum02 = vmlaq_f32(_sum02, _k12, _r14);
                _sum02 = vmlaq_f32(_sum02, _k13, _r15);
                _sum02 = vmlaq_f32(_sum02, _k14, _r16);
                _sum03 = vmlaq_f32(_sum03, _k10, _r13);
                _sum03 = vmlaq_f32(_sum03, _k11, _r14);
                _sum03 = vmlaq_f32(_sum03, _k12, _r15);
                _sum03 = vmlaq_f32(_sum03, _k13, _r16);
                _sum03 = vmlaq_f32(_sum03, _k14, _r17);

                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r22 = vld1q_f32(r2 + 8);
                float32x4_t _r23 = vld1q_f32(r2 + 12);
                float32x4_t _r24 = vld1q_f32(r2 + 16);
                float32x4_t _r25 = vld1q_f32(r2 + 20);
                float32x4_t _r26 = vld1q_f32(r2 + 24);
                float32x4_t _r27 = vld1q_f32(r2 + 28);

                float32x4_t _k20 = vld1q_f32(k0);
                float32x4_t _k21 = vld1q_f32(k0 + 4);
                float32x4_t _k22 = vld1q_f32(k0 + 8);
                float32x4_t _k23 = vld1q_f32(k0 + 12);
                float32x4_t _k24 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum10 = vmlaq_f32(_sum10, _k10, _r20);
                _sum10 = vmlaq_f32(_sum10, _k11, _r21);
                _sum10 = vmlaq_f32(_sum10, _k12, _r22);
                _sum10 = vmlaq_f32(_sum10, _k13, _r23);
                _sum10 = vmlaq_f32(_sum10, _k14, _r24);
                _sum11 = vmlaq_f32(_sum11, _k10, _r21);
                _sum11 = vmlaq_f32(_sum11, _k11, _r22);
                _sum11 = vmlaq_f32(_sum11, _k12, _r23);
                _sum11 = vmlaq_f32(_sum11, _k13, _r24);
                _sum11 = vmlaq_f32(_sum11, _k14, _r25);
                _sum12 = vmlaq_f32(_sum12, _k10, _r22);
                _sum12 = vmlaq_f32(_sum12, _k11, _r23);
                _sum12 = vmlaq_f32(_sum12, _k12, _r24);
                _sum12 = vmlaq_f32(_sum12, _k13, _r25);
                _sum12 = vmlaq_f32(_sum12, _k14, _r26);
                _sum13 = vmlaq_f32(_sum13, _k10, _r23);
                _sum13 = vmlaq_f32(_sum13, _k11, _r24);
                _sum13 = vmlaq_f32(_sum13, _k12, _r25);
                _sum13 = vmlaq_f32(_sum13, _k13, _r26);
                _sum13 = vmlaq_f32(_sum13, _k14, _r27);

                _sum00 = vmlaq_f32(_sum00, _k20, _r20);
                _sum00 = vmlaq_f32(_sum00, _k21, _r21);
                _sum00 = vmlaq_f32(_sum00, _k22, _r22);
                _sum00 = vmlaq_f32(_sum00, _k23, _r23);
                _sum00 = vmlaq_f32(_sum00, _k24, _r24);
                _sum01 = vmlaq_f32(_sum01, _k20, _r21);
                _sum01 = vmlaq_f32(_sum01, _k21, _r22);
                _sum01 = vmlaq_f32(_sum01, _k22, _r23);
                _sum01 = vmlaq_f32(_sum01, _k23, _r24);
                _sum01 = vmlaq_f32(_sum01, _k24, _r25);
                _sum02 = vmlaq_f32(_sum02, _k20, _r22);
                _sum02 = vmlaq_f32(_sum02, _k21, _r23);
                _sum02 = vmlaq_f32(_sum02, _k22, _r24);
                _sum02 = vmlaq_f32(_sum02, _k23, _r25);
                _sum02 = vmlaq_f32(_sum02, _k24, _r26);
                _sum03 = vmlaq_f32(_sum03, _k20, _r23);
                _sum03 = vmlaq_f32(_sum03, _k21, _r24);
                _sum03 = vmlaq_f32(_sum03, _k22, _r25);
                _sum03 = vmlaq_f32(_sum03, _k23, _r26);
                _sum03 = vmlaq_f32(_sum03, _k24, _r27);

                float32x4_t _r30 = vld1q_f32(r3);
                float32x4_t _r31 = vld1q_f32(r3 + 4);
                float32x4_t _r32 = vld1q_f32(r3 + 8);
                float32x4_t _r33 = vld1q_f32(r3 + 12);
                float32x4_t _r34 = vld1q_f32(r3 + 16);
                float32x4_t _r35 = vld1q_f32(r3 + 20);
                float32x4_t _r36 = vld1q_f32(r3 + 24);
                float32x4_t _r37 = vld1q_f32(r3 + 28);

                float32x4_t _k30 = vld1q_f32(k0);
                float32x4_t _k31 = vld1q_f32(k0 + 4);
                float32x4_t _k32 = vld1q_f32(k0 + 8);
                float32x4_t _k33 = vld1q_f32(k0 + 12);
                float32x4_t _k34 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum10 = vmlaq_f32(_sum10, _k20, _r30);
                _sum10 = vmlaq_f32(_sum10, _k21, _r31);
                _sum10 = vmlaq_f32(_sum10, _k22, _r32);
                _sum10 = vmlaq_f32(_sum10, _k23, _r33);
                _sum10 = vmlaq_f32(_sum10, _k24, _r34);
                _sum11 = vmlaq_f32(_sum11, _k20, _r31);
                _sum11 = vmlaq_f32(_sum11, _k21, _r32);
                _sum11 = vmlaq_f32(_sum11, _k22, _r33);
                _sum11 = vmlaq_f32(_sum11, _k23, _r34);
                _sum11 = vmlaq_f32(_sum11, _k24, _r35);
                _sum12 = vmlaq_f32(_sum12, _k20, _r32);
                _sum12 = vmlaq_f32(_sum12, _k21, _r33);
                _sum12 = vmlaq_f32(_sum12, _k22, _r34);
                _sum12 = vmlaq_f32(_sum12, _k23, _r35);
                _sum12 = vmlaq_f32(_sum12, _k24, _r36);
                _sum13 = vmlaq_f32(_sum13, _k20, _r33);
                _sum13 = vmlaq_f32(_sum13, _k21, _r34);
                _sum13 = vmlaq_f32(_sum13, _k22, _r35);
                _sum13 = vmlaq_f32(_sum13, _k23, _r36);
                _sum13 = vmlaq_f32(_sum13, _k24, _r37);

                _sum00 = vmlaq_f32(_sum00, _k30, _r30);
                _sum00 = vmlaq_f32(_sum00, _k31, _r31);
                _sum00 = vmlaq_f32(_sum00, _k32, _r32);
                _sum00 = vmlaq_f32(_sum00, _k33, _r33);
                _sum00 = vmlaq_f32(_sum00, _k34, _r34);
                _sum01 = vmlaq_f32(_sum01, _k30, _r31);
                _sum01 = vmlaq_f32(_sum01, _k31, _r32);
                _sum01 = vmlaq_f32(_sum01, _k32, _r33);
                _sum01 = vmlaq_f32(_sum01, _k33, _r34);
                _sum01 = vmlaq_f32(_sum01, _k34, _r35);
                _sum02 = vmlaq_f32(_sum02, _k30, _r32);
                _sum02 = vmlaq_f32(_sum02, _k31, _r33);
                _sum02 = vmlaq_f32(_sum02, _k32, _r34);
                _sum02 = vmlaq_f32(_sum02, _k33, _r35);
                _sum02 = vmlaq_f32(_sum02, _k34, _r36);
                _sum03 = vmlaq_f32(_sum03, _k30, _r33);
                _sum03 = vmlaq_f32(_sum03, _k31, _r34);
                _sum03 = vmlaq_f32(_sum03, _k32, _r35);
                _sum03 = vmlaq_f32(_sum03, _k33, _r36);
                _sum03 = vmlaq_f32(_sum03, _k34, _r37);

                float32x4_t _r40 = vld1q_f32(r4);
                float32x4_t _r41 = vld1q_f32(r4 + 4);
                float32x4_t _r42 = vld1q_f32(r4 + 8);
                float32x4_t _r43 = vld1q_f32(r4 + 12);
                float32x4_t _r44 = vld1q_f32(r4 + 16);
                float32x4_t _r45 = vld1q_f32(r4 + 20);
                float32x4_t _r46 = vld1q_f32(r4 + 24);
                float32x4_t _r47 = vld1q_f32(r4 + 28);

                float32x4_t _k40 = vld1q_f32(k0);
                float32x4_t _k41 = vld1q_f32(k0 + 4);
                float32x4_t _k42 = vld1q_f32(k0 + 8);
                float32x4_t _k43 = vld1q_f32(k0 + 12);
                float32x4_t _k44 = vld1q_f32(k0 + 16);
                k0 -= 80;

                _sum10 = vmlaq_f32(_sum10, _k30, _r40);
                _sum10 = vmlaq_f32(_sum10, _k31, _r41);
                _sum10 = vmlaq_f32(_sum10, _k32, _r42);
                _sum10 = vmlaq_f32(_sum10, _k33, _r43);
                _sum10 = vmlaq_f32(_sum10, _k34, _r44);
                _sum11 = vmlaq_f32(_sum11, _k30, _r41);
                _sum11 = vmlaq_f32(_sum11, _k31, _r42);
                _sum11 = vmlaq_f32(_sum11, _k32, _r43);
                _sum11 = vmlaq_f32(_sum11, _k33, _r44);
                _sum11 = vmlaq_f32(_sum11, _k34, _r45);
                _sum12 = vmlaq_f32(_sum12, _k30, _r42);
                _sum12 = vmlaq_f32(_sum12, _k31, _r43);
                _sum12 = vmlaq_f32(_sum12, _k32, _r44);
                _sum12 = vmlaq_f32(_sum12, _k33, _r45);
                _sum12 = vmlaq_f32(_sum12, _k34, _r46);
                _sum13 = vmlaq_f32(_sum13, _k30, _r43);
                _sum13 = vmlaq_f32(_sum13, _k31, _r44);
                _sum13 = vmlaq_f32(_sum13, _k32, _r45);
                _sum13 = vmlaq_f32(_sum13, _k33, _r46);
                _sum13 = vmlaq_f32(_sum13, _k34, _r47);

                _sum00 = vmlaq_f32(_sum00, _k40, _r40);
                _sum00 = vmlaq_f32(_sum00, _k41, _r41);
                _sum00 = vmlaq_f32(_sum00, _k42, _r42);
                _sum00 = vmlaq_f32(_sum00, _k43, _r43);
                _sum00 = vmlaq_f32(_sum00, _k44, _r44);
                _sum01 = vmlaq_f32(_sum01, _k40, _r41);
                _sum01 = vmlaq_f32(_sum01, _k41, _r42);
                _sum01 = vmlaq_f32(_sum01, _k42, _r43);
                _sum01 = vmlaq_f32(_sum01, _k43, _r44);
                _sum01 = vmlaq_f32(_sum01, _k44, _r45);
                _sum02 = vmlaq_f32(_sum02, _k40, _r42);
                _sum02 = vmlaq_f32(_sum02, _k41, _r43);
                _sum02 = vmlaq_f32(_sum02, _k42, _r44);
                _sum02 = vmlaq_f32(_sum02, _k43, _r45);
                _sum02 = vmlaq_f32(_sum02, _k44, _r46);
                _sum03 = vmlaq_f32(_sum03, _k40, _r43);
                _sum03 = vmlaq_f32(_sum03, _k41, _r44);
                _sum03 = vmlaq_f32(_sum03, _k42, _r45);
                _sum03 = vmlaq_f32(_sum03, _k43, _r46);
                _sum03 = vmlaq_f32(_sum03, _k44, _r47);

                float32x4_t _r50 = vld1q_f32(r5);
                float32x4_t _r51 = vld1q_f32(r5 + 4);
                float32x4_t _r52 = vld1q_f32(r5 + 8);
                float32x4_t _r53 = vld1q_f32(r5 + 12);
                float32x4_t _r54 = vld1q_f32(r5 + 16);
                float32x4_t _r55 = vld1q_f32(r5 + 20);
                float32x4_t _r56 = vld1q_f32(r5 + 24);
                float32x4_t _r57 = vld1q_f32(r5 + 28);

                _sum10 = vmlaq_f32(_sum10, _k40, _r50);
                _sum10 = vmlaq_f32(_sum10, _k41, _r51);
                _sum10 = vmlaq_f32(_sum10, _k42, _r52);
                _sum10 = vmlaq_f32(_sum10, _k43, _r53);
                _sum10 = vmlaq_f32(_sum10, _k44, _r54);
                _sum11 = vmlaq_f32(_sum11, _k40, _r51);
                _sum11 = vmlaq_f32(_sum11, _k41, _r52);
                _sum11 = vmlaq_f32(_sum11, _k42, _r53);
                _sum11 = vmlaq_f32(_sum11, _k43, _r54);
                _sum11 = vmlaq_f32(_sum11, _k44, _r55);
                _sum12 = vmlaq_f32(_sum12, _k40, _r52);
                _sum12 = vmlaq_f32(_sum12, _k41, _r53);
                _sum12 = vmlaq_f32(_sum12, _k42, _r54);
                _sum12 = vmlaq_f32(_sum12, _k43, _r55);
                _sum12 = vmlaq_f32(_sum12, _k44, _r56);
                _sum13 = vmlaq_f32(_sum13, _k40, _r53);
                _sum13 = vmlaq_f32(_sum13, _k41, _r54);
                _sum13 = vmlaq_f32(_sum13, _k42, _r55);
                _sum13 = vmlaq_f32(_sum13, _k43, _r56);
                _sum13 = vmlaq_f32(_sum13, _k44, _r57);

                vst1q_f32(outptr0, _sum00);
                vst1q_f32(outptr0 + 4, _sum01);
                vst1q_f32(outptr0 + 8, _sum02);
                vst1q_f32(outptr0 + 12, _sum03);
                vst1q_f32(outptr1, _sum10);
                vst1q_f32(outptr1 + 4, _sum11);
                vst1q_f32(outptr1 + 8, _sum12);
                vst1q_f32(outptr1 + 12, _sum13);

                r0 += 16;
                r1 += 16;
                r2 += 16;
                r3 += 16;
                r4 += 16;
                r5 += 16;
                outptr0 += 16;
                outptr1 += 16;
            }
            for (; j + 1 < outw; j += 2)
            {
                float32x4_t _sum00 = _bias0;
                float32x4_t _sum01 = _bias0;
                float32x4_t _sum10 = _bias0;
                float32x4_t _sum11 = _bias0;

                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r02 = vld1q_f32(r0 + 8);
                float32x4_t _r03 = vld1q_f32(r0 + 12);
                float32x4_t _r04 = vld1q_f32(r0 + 16);
                float32x4_t _r05 = vld1q_f32(r0 + 20);

                float32x4_t _k00 = vld1q_f32(k0);
                float32x4_t _k01 = vld1q_f32(k0 + 4);
                float32x4_t _k02 = vld1q_f32(k0 + 8);
                float32x4_t _k03 = vld1q_f32(k0 + 12);
                float32x4_t _k04 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum00 = vmlaq_f32(_sum00, _k00, _r00);
                _sum00 = vmlaq_f32(_sum00, _k01, _r01);
                _sum00 = vmlaq_f32(_sum00, _k02, _r02);
                _sum00 = vmlaq_f32(_sum00, _k03, _r03);
                _sum00 = vmlaq_f32(_sum00, _k04, _r04);
                _sum01 = vmlaq_f32(_sum01, _k00, _r01);
                _sum01 = vmlaq_f32(_sum01, _k01, _r02);
                _sum01 = vmlaq_f32(_sum01, _k02, _r03);
                _sum01 = vmlaq_f32(_sum01, _k03, _r04);
                _sum01 = vmlaq_f32(_sum01, _k04, _r05);

                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r12 = vld1q_f32(r1 + 8);
                float32x4_t _r13 = vld1q_f32(r1 + 12);
                float32x4_t _r14 = vld1q_f32(r1 + 16);
                float32x4_t _r15 = vld1q_f32(r1 + 20);

                float32x4_t _k10 = vld1q_f32(k0);
                float32x4_t _k11 = vld1q_f32(k0 + 4);
                float32x4_t _k12 = vld1q_f32(k0 + 8);
                float32x4_t _k13 = vld1q_f32(k0 + 12);
                float32x4_t _k14 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum10 = vmlaq_f32(_sum10, _k00, _r10);
                _sum10 = vmlaq_f32(_sum10, _k01, _r11);
                _sum10 = vmlaq_f32(_sum10, _k02, _r12);
                _sum10 = vmlaq_f32(_sum10, _k03, _r13);
                _sum10 = vmlaq_f32(_sum10, _k04, _r14);
                _sum11 = vmlaq_f32(_sum11, _k00, _r11);
                _sum11 = vmlaq_f32(_sum11, _k01, _r12);
                _sum11 = vmlaq_f32(_sum11, _k02, _r13);
                _sum11 = vmlaq_f32(_sum11, _k03, _r14);
                _sum11 = vmlaq_f32(_sum11, _k04, _r15);

                _sum00 = vmlaq_f32(_sum00, _k10, _r10);
                _sum00 = vmlaq_f32(_sum00, _k11, _r11);
                _sum00 = vmlaq_f32(_sum00, _k12, _r12);
                _sum00 = vmlaq_f32(_sum00, _k13, _r13);
                _sum00 = vmlaq_f32(_sum00, _k14, _r14);
                _sum01 = vmlaq_f32(_sum01, _k10, _r11);
                _sum01 = vmlaq_f32(_sum01, _k11, _r12);
                _sum01 = vmlaq_f32(_sum01, _k12, _r13);
                _sum01 = vmlaq_f32(_sum01, _k13, _r14);
                _sum01 = vmlaq_f32(_sum01, _k14, _r15);

                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r22 = vld1q_f32(r2 + 8);
                float32x4_t _r23 = vld1q_f32(r2 + 12);
                float32x4_t _r24 = vld1q_f32(r2 + 16);
                float32x4_t _r25 = vld1q_f32(r2 + 20);

                float32x4_t _k20 = vld1q_f32(k0);
                float32x4_t _k21 = vld1q_f32(k0 + 4);
                float32x4_t _k22 = vld1q_f32(k0 + 8);
                float32x4_t _k23 = vld1q_f32(k0 + 12);
                float32x4_t _k24 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum10 = vmlaq_f32(_sum10, _k10, _r20);
                _sum10 = vmlaq_f32(_sum10, _k11, _r21);
                _sum10 = vmlaq_f32(_sum10, _k12, _r22);
                _sum10 = vmlaq_f32(_sum10, _k13, _r23);
                _sum10 = vmlaq_f32(_sum10, _k14, _r24);
                _sum11 = vmlaq_f32(_sum11, _k10, _r21);
                _sum11 = vmlaq_f32(_sum11, _k11, _r22);
                _sum11 = vmlaq_f32(_sum11, _k12, _r23);
                _sum11 = vmlaq_f32(_sum11, _k13, _r24);
                _sum11 = vmlaq_f32(_sum11, _k14, _r25);

                _sum00 = vmlaq_f32(_sum00, _k20, _r20);
                _sum00 = vmlaq_f32(_sum00, _k21, _r21);
                _sum00 = vmlaq_f32(_sum00, _k22, _r22);
                _sum00 = vmlaq_f32(_sum00, _k23, _r23);
                _sum00 = vmlaq_f32(_sum00, _k24, _r24);
                _sum01 = vmlaq_f32(_sum01, _k20, _r21);
                _sum01 = vmlaq_f32(_sum01, _k21, _r22);
                _sum01 = vmlaq_f32(_sum01, _k22, _r23);
                _sum01 = vmlaq_f32(_sum01, _k23, _r24);
                _sum01 = vmlaq_f32(_sum01, _k24, _r25);

                float32x4_t _r30 = vld1q_f32(r3);
                float32x4_t _r31 = vld1q_f32(r3 + 4);
                float32x4_t _r32 = vld1q_f32(r3 + 8);
                float32x4_t _r33 = vld1q_f32(r3 + 12);
                float32x4_t _r34 = vld1q_f32(r3 + 16);
                float32x4_t _r35 = vld1q_f32(r3 + 20);

                float32x4_t _k30 = vld1q_f32(k0);
                float32x4_t _k31 = vld1q_f32(k0 + 4);
                float32x4_t _k32 = vld1q_f32(k0 + 8);
                float32x4_t _k33 = vld1q_f32(k0 + 12);
                float32x4_t _k34 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum10 = vmlaq_f32(_sum10, _k20, _r30);
                _sum10 = vmlaq_f32(_sum10, _k21, _r31);
                _sum10 = vmlaq_f32(_sum10, _k22, _r32);
                _sum10 = vmlaq_f32(_sum10, _k23, _r33);
                _sum10 = vmlaq_f32(_sum10, _k24, _r34);
                _sum11 = vmlaq_f32(_sum11, _k20, _r31);
                _sum11 = vmlaq_f32(_sum11, _k21, _r32);
                _sum11 = vmlaq_f32(_sum11, _k22, _r33);
                _sum11 = vmlaq_f32(_sum11, _k23, _r34);
                _sum11 = vmlaq_f32(_sum11, _k24, _r35);

                _sum00 = vmlaq_f32(_sum00, _k30, _r30);
                _sum00 = vmlaq_f32(_sum00, _k31, _r31);
                _sum00 = vmlaq_f32(_sum00, _k32, _r32);
                _sum00 = vmlaq_f32(_sum00, _k33, _r33);
                _sum00 = vmlaq_f32(_sum00, _k34, _r34);
                _sum01 = vmlaq_f32(_sum01, _k30, _r31);
                _sum01 = vmlaq_f32(_sum01, _k31, _r32);
                _sum01 = vmlaq_f32(_sum01, _k32, _r33);
                _sum01 = vmlaq_f32(_sum01, _k33, _r34);
                _sum01 = vmlaq_f32(_sum01, _k34, _r35);

                float32x4_t _r40 = vld1q_f32(r4);
                float32x4_t _r41 = vld1q_f32(r4 + 4);
                float32x4_t _r42 = vld1q_f32(r4 + 8);
                float32x4_t _r43 = vld1q_f32(r4 + 12);
                float32x4_t _r44 = vld1q_f32(r4 + 16);
                float32x4_t _r45 = vld1q_f32(r4 + 20);

                float32x4_t _k40 = vld1q_f32(k0);
                float32x4_t _k41 = vld1q_f32(k0 + 4);
                float32x4_t _k42 = vld1q_f32(k0 + 8);
                float32x4_t _k43 = vld1q_f32(k0 + 12);
                float32x4_t _k44 = vld1q_f32(k0 + 16);
                k0 -= 80;

                _sum10 = vmlaq_f32(_sum10, _k30, _r40);
                _sum10 = vmlaq_f32(_sum10, _k31, _r41);
                _sum10 = vmlaq_f32(_sum10, _k32, _r42);
                _sum10 = vmlaq_f32(_sum10, _k33, _r43);
                _sum10 = vmlaq_f32(_sum10, _k34, _r44);
                _sum11 = vmlaq_f32(_sum11, _k30, _r41);
                _sum11 = vmlaq_f32(_sum11, _k31, _r42);
                _sum11 = vmlaq_f32(_sum11, _k32, _r43);
                _sum11 = vmlaq_f32(_sum11, _k33, _r44);
                _sum11 = vmlaq_f32(_sum11, _k34, _r45);

                _sum00 = vmlaq_f32(_sum00, _k40, _r40);
                _sum00 = vmlaq_f32(_sum00, _k41, _r41);
                _sum00 = vmlaq_f32(_sum00, _k42, _r42);
                _sum00 = vmlaq_f32(_sum00, _k43, _r43);
                _sum00 = vmlaq_f32(_sum00, _k44, _r44);
                _sum01 = vmlaq_f32(_sum01, _k40, _r41);
                _sum01 = vmlaq_f32(_sum01, _k41, _r42);
                _sum01 = vmlaq_f32(_sum01, _k42, _r43);
                _sum01 = vmlaq_f32(_sum01, _k43, _r44);
                _sum01 = vmlaq_f32(_sum01, _k44, _r45);

                float32x4_t _r50 = vld1q_f32(r5);
                float32x4_t _r51 = vld1q_f32(r5 + 4);
                float32x4_t _r52 = vld1q_f32(r5 + 8);
                float32x4_t _r53 = vld1q_f32(r5 + 12);
                float32x4_t _r54 = vld1q_f32(r5 + 16);
                float32x4_t _r55 = vld1q_f32(r5 + 20);

                _sum10 = vmlaq_f32(_sum10, _k40, _r50);
                _sum10 = vmlaq_f32(_sum10, _k41, _r51);
                _sum10 = vmlaq_f32(_sum10, _k42, _r52);
                _sum10 = vmlaq_f32(_sum10, _k43, _r53);
                _sum10 = vmlaq_f32(_sum10, _k44, _r54);
                _sum11 = vmlaq_f32(_sum11, _k40, _r51);
                _sum11 = vmlaq_f32(_sum11, _k41, _r52);
                _sum11 = vmlaq_f32(_sum11, _k42, _r53);
                _sum11 = vmlaq_f32(_sum11, _k43, _r54);
                _sum11 = vmlaq_f32(_sum11, _k44, _r55);

                vst1q_f32(outptr0, _sum00);
                vst1q_f32(outptr0 + 4, _sum01);
                vst1q_f32(outptr1, _sum10);
                vst1q_f32(outptr1 + 4, _sum11);

                r0 += 8;
                r1 += 8;
                r2 += 8;
                r3 += 8;
                r4 += 8;
                r5 += 8;
                outptr0 += 8;
                outptr1 += 8;
            }
            for (; j < outw; j++)
            {
                float32x4_t _sum0 = _bias0;
                float32x4_t _sum1 = _bias0;

                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r02 = vld1q_f32(r0 + 8);
                float32x4_t _r03 = vld1q_f32(r0 + 12);
                float32x4_t _r04 = vld1q_f32(r0 + 16);

                float32x4_t _k00 = vld1q_f32(k0);
                float32x4_t _k01 = vld1q_f32(k0 + 4);
                float32x4_t _k02 = vld1q_f32(k0 + 8);
                float32x4_t _k03 = vld1q_f32(k0 + 12);
                float32x4_t _k04 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k00, _r00);
                _sum0 = vmlaq_f32(_sum0, _k01, _r01);
                _sum0 = vmlaq_f32(_sum0, _k02, _r02);
                _sum0 = vmlaq_f32(_sum0, _k03, _r03);
                _sum0 = vmlaq_f32(_sum0, _k04, _r04);

                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r12 = vld1q_f32(r1 + 8);
                float32x4_t _r13 = vld1q_f32(r1 + 12);
                float32x4_t _r14 = vld1q_f32(r1 + 16);

                float32x4_t _k10 = vld1q_f32(k0);
                float32x4_t _k11 = vld1q_f32(k0 + 4);
                float32x4_t _k12 = vld1q_f32(k0 + 8);
                float32x4_t _k13 = vld1q_f32(k0 + 12);
                float32x4_t _k14 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum1 = vmlaq_f32(_sum1, _k00, _r10);
                _sum1 = vmlaq_f32(_sum1, _k01, _r11);
                _sum1 = vmlaq_f32(_sum1, _k02, _r12);
                _sum1 = vmlaq_f32(_sum1, _k03, _r13);
                _sum1 = vmlaq_f32(_sum1, _k04, _r14);

                _sum0 = vmlaq_f32(_sum0, _k10, _r10);
                _sum0 = vmlaq_f32(_sum0, _k11, _r11);
                _sum0 = vmlaq_f32(_sum0, _k12, _r12);
                _sum0 = vmlaq_f32(_sum0, _k13, _r13);
                _sum0 = vmlaq_f32(_sum0, _k14, _r14);

                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r22 = vld1q_f32(r2 + 8);
                float32x4_t _r23 = vld1q_f32(r2 + 12);
                float32x4_t _r24 = vld1q_f32(r2 + 16);

                float32x4_t _k20 = vld1q_f32(k0);
                float32x4_t _k21 = vld1q_f32(k0 + 4);
                float32x4_t _k22 = vld1q_f32(k0 + 8);
                float32x4_t _k23 = vld1q_f32(k0 + 12);
                float32x4_t _k24 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum1 = vmlaq_f32(_sum1, _k10, _r20);
                _sum1 = vmlaq_f32(_sum1, _k11, _r21);
                _sum1 = vmlaq_f32(_sum1, _k12, _r22);
                _sum1 = vmlaq_f32(_sum1, _k13, _r23);
                _sum1 = vmlaq_f32(_sum1, _k14, _r24);

                _sum0 = vmlaq_f32(_sum0, _k20, _r20);
                _sum0 = vmlaq_f32(_sum0, _k21, _r21);
                _sum0 = vmlaq_f32(_sum0, _k22, _r22);
                _sum0 = vmlaq_f32(_sum0, _k23, _r23);
                _sum0 = vmlaq_f32(_sum0, _k24, _r24);

                float32x4_t _r30 = vld1q_f32(r3);
                float32x4_t _r31 = vld1q_f32(r3 + 4);
                float32x4_t _r32 = vld1q_f32(r3 + 8);
                float32x4_t _r33 = vld1q_f32(r3 + 12);
                float32x4_t _r34 = vld1q_f32(r3 + 16);

                float32x4_t _k30 = vld1q_f32(k0);
                float32x4_t _k31 = vld1q_f32(k0 + 4);
                float32x4_t _k32 = vld1q_f32(k0 + 8);
                float32x4_t _k33 = vld1q_f32(k0 + 12);
                float32x4_t _k34 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum1 = vmlaq_f32(_sum1, _k20, _r30);
                _sum1 = vmlaq_f32(_sum1, _k21, _r31);
                _sum1 = vmlaq_f32(_sum1, _k22, _r32);
                _sum1 = vmlaq_f32(_sum1, _k23, _r33);
                _sum1 = vmlaq_f32(_sum1, _k24, _r34);

                _sum0 = vmlaq_f32(_sum0, _k30, _r30);
                _sum0 = vmlaq_f32(_sum0, _k31, _r31);
                _sum0 = vmlaq_f32(_sum0, _k32, _r32);
                _sum0 = vmlaq_f32(_sum0, _k33, _r33);
                _sum0 = vmlaq_f32(_sum0, _k34, _r34);

                float32x4_t _r40 = vld1q_f32(r4);
                float32x4_t _r41 = vld1q_f32(r4 + 4);
                float32x4_t _r42 = vld1q_f32(r4 + 8);
                float32x4_t _r43 = vld1q_f32(r4 + 12);
                float32x4_t _r44 = vld1q_f32(r4 + 16);

                float32x4_t _k40 = vld1q_f32(k0);
                float32x4_t _k41 = vld1q_f32(k0 + 4);
                float32x4_t _k42 = vld1q_f32(k0 + 8);
                float32x4_t _k43 = vld1q_f32(k0 + 12);
                float32x4_t _k44 = vld1q_f32(k0 + 16);
                k0 -= 80;

                _sum1 = vmlaq_f32(_sum1, _k30, _r40);
                _sum1 = vmlaq_f32(_sum1, _k31, _r41);
                _sum1 = vmlaq_f32(_sum1, _k32, _r42);
                _sum1 = vmlaq_f32(_sum1, _k33, _r43);
                _sum1 = vmlaq_f32(_sum1, _k34, _r44);

                _sum0 = vmlaq_f32(_sum0, _k40, _r40);
                _sum0 = vmlaq_f32(_sum0, _k41, _r41);
                _sum0 = vmlaq_f32(_sum0, _k42, _r42);
                _sum0 = vmlaq_f32(_sum0, _k43, _r43);
                _sum0 = vmlaq_f32(_sum0, _k44, _r44);

                float32x4_t _r50 = vld1q_f32(r5);
                float32x4_t _r51 = vld1q_f32(r5 + 4);
                float32x4_t _r52 = vld1q_f32(r5 + 8);
                float32x4_t _r53 = vld1q_f32(r5 + 12);
                float32x4_t _r54 = vld1q_f32(r5 + 16);

                _sum1 = vmlaq_f32(_sum1, _k40, _r50);
                _sum1 = vmlaq_f32(_sum1, _k41, _r51);
                _sum1 = vmlaq_f32(_sum1, _k42, _r52);
                _sum1 = vmlaq_f32(_sum1, _k43, _r53);
                _sum1 = vmlaq_f32(_sum1, _k44, _r54);

                vst1q_f32(outptr0, _sum0);
                vst1q_f32(outptr1, _sum1);

                r0 += 4;
                r1 += 4;
                r2 += 4;
                r3 += 4;
                r4 += 4;
                r5 += 4;
                outptr0 += 4;
                outptr1 += 4;
            }

            r0 += 4 * 4 + w * 4;
            r1 += 4 * 4 + w * 4;
            r2 += 4 * 4 + w * 4;
            r3 += 4 * 4 + w * 4;
            r4 += 4 * 4 + w * 4;
            r5 += 4 * 4 + w * 4;

            outptr0 += outw * 4;
            outptr1 += outw * 4;
        }
#endif // __aarch64__
        for (; i < outh; i++)
        {
            int j = 0;

            for (; j + 3 < outw; j += 4)
            {
                float32x4_t _sum0 = _bias0;
                float32x4_t _sum1 = _bias0;
                float32x4_t _sum2 = _bias0;
                float32x4_t _sum3 = _bias0;

                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r02 = vld1q_f32(r0 + 8);
                float32x4_t _r03 = vld1q_f32(r0 + 12);
                float32x4_t _r04 = vld1q_f32(r0 + 16);
                float32x4_t _r05 = vld1q_f32(r0 + 20);
                float32x4_t _r06 = vld1q_f32(r0 + 24);
                float32x4_t _r07 = vld1q_f32(r0 + 28);

                float32x4_t _k00 = vld1q_f32(k0);
                float32x4_t _k01 = vld1q_f32(k0 + 4);
                float32x4_t _k02 = vld1q_f32(k0 + 8);
                float32x4_t _k03 = vld1q_f32(k0 + 12);
                float32x4_t _k04 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k00, _r00);
                _sum0 = vmlaq_f32(_sum0, _k01, _r01);
                _sum0 = vmlaq_f32(_sum0, _k02, _r02);
                _sum0 = vmlaq_f32(_sum0, _k03, _r03);
                _sum0 = vmlaq_f32(_sum0, _k04, _r04);
                _sum1 = vmlaq_f32(_sum1, _k00, _r01);
                _sum1 = vmlaq_f32(_sum1, _k01, _r02);
                _sum1 = vmlaq_f32(_sum1, _k02, _r03);
                _sum1 = vmlaq_f32(_sum1, _k03, _r04);
                _sum1 = vmlaq_f32(_sum1, _k04, _r05);
                _sum2 = vmlaq_f32(_sum2, _k00, _r02);
                _sum2 = vmlaq_f32(_sum2, _k01, _r03);
                _sum2 = vmlaq_f32(_sum2, _k02, _r04);
                _sum2 = vmlaq_f32(_sum2, _k03, _r05);
                _sum2 = vmlaq_f32(_sum2, _k04, _r06);
                _sum3 = vmlaq_f32(_sum3, _k00, _r03);
                _sum3 = vmlaq_f32(_sum3, _k01, _r04);
                _sum3 = vmlaq_f32(_sum3, _k02, _r05);
                _sum3 = vmlaq_f32(_sum3, _k03, _r06);
                _sum3 = vmlaq_f32(_sum3, _k04, _r07);

                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r12 = vld1q_f32(r1 + 8);
                float32x4_t _r13 = vld1q_f32(r1 + 12);
                float32x4_t _r14 = vld1q_f32(r1 + 16);
                float32x4_t _r15 = vld1q_f32(r1 + 20);
                float32x4_t _r16 = vld1q_f32(r1 + 24);
                float32x4_t _r17 = vld1q_f32(r1 + 28);

                float32x4_t _k10 = vld1q_f32(k0);
                float32x4_t _k11 = vld1q_f32(k0 + 4);
                float32x4_t _k12 = vld1q_f32(k0 + 8);
                float32x4_t _k13 = vld1q_f32(k0 + 12);
                float32x4_t _k14 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k10, _r10);
                _sum0 = vmlaq_f32(_sum0, _k11, _r11);
                _sum0 = vmlaq_f32(_sum0, _k12, _r12);
                _sum0 = vmlaq_f32(_sum0, _k13, _r13);
                _sum0 = vmlaq_f32(_sum0, _k14, _r14);
                _sum1 = vmlaq_f32(_sum1, _k10, _r11);
                _sum1 = vmlaq_f32(_sum1, _k11, _r12);
                _sum1 = vmlaq_f32(_sum1, _k12, _r13);
                _sum1 = vmlaq_f32(_sum1, _k13, _r14);
                _sum1 = vmlaq_f32(_sum1, _k14, _r15);
                _sum2 = vmlaq_f32(_sum2, _k10, _r12);
                _sum2 = vmlaq_f32(_sum2, _k11, _r13);
                _sum2 = vmlaq_f32(_sum2, _k12, _r14);
                _sum2 = vmlaq_f32(_sum2, _k13, _r15);
                _sum2 = vmlaq_f32(_sum2, _k14, _r16);
                _sum3 = vmlaq_f32(_sum3, _k10, _r13);
                _sum3 = vmlaq_f32(_sum3, _k11, _r14);
                _sum3 = vmlaq_f32(_sum3, _k12, _r15);
                _sum3 = vmlaq_f32(_sum3, _k13, _r16);
                _sum3 = vmlaq_f32(_sum3, _k14, _r17);

                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r22 = vld1q_f32(r2 + 8);
                float32x4_t _r23 = vld1q_f32(r2 + 12);
                float32x4_t _r24 = vld1q_f32(r2 + 16);
                float32x4_t _r25 = vld1q_f32(r2 + 20);
                float32x4_t _r26 = vld1q_f32(r2 + 24);
                float32x4_t _r27 = vld1q_f32(r2 + 28);

                float32x4_t _k20 = vld1q_f32(k0);
                float32x4_t _k21 = vld1q_f32(k0 + 4);
                float32x4_t _k22 = vld1q_f32(k0 + 8);
                float32x4_t _k23 = vld1q_f32(k0 + 12);
                float32x4_t _k24 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k20, _r20);
                _sum0 = vmlaq_f32(_sum0, _k21, _r21);
                _sum0 = vmlaq_f32(_sum0, _k22, _r22);
                _sum0 = vmlaq_f32(_sum0, _k23, _r23);
                _sum0 = vmlaq_f32(_sum0, _k24, _r24);
                _sum1 = vmlaq_f32(_sum1, _k20, _r21);
                _sum1 = vmlaq_f32(_sum1, _k21, _r22);
                _sum1 = vmlaq_f32(_sum1, _k22, _r23);
                _sum1 = vmlaq_f32(_sum1, _k23, _r24);
                _sum1 = vmlaq_f32(_sum1, _k24, _r25);
                _sum2 = vmlaq_f32(_sum2, _k20, _r22);
                _sum2 = vmlaq_f32(_sum2, _k21, _r23);
                _sum2 = vmlaq_f32(_sum2, _k22, _r24);
                _sum2 = vmlaq_f32(_sum2, _k23, _r25);
                _sum2 = vmlaq_f32(_sum2, _k24, _r26);
                _sum3 = vmlaq_f32(_sum3, _k20, _r23);
                _sum3 = vmlaq_f32(_sum3, _k21, _r24);
                _sum3 = vmlaq_f32(_sum3, _k22, _r25);
                _sum3 = vmlaq_f32(_sum3, _k23, _r26);
                _sum3 = vmlaq_f32(_sum3, _k24, _r27);

                float32x4_t _r30 = vld1q_f32(r3);
                float32x4_t _r31 = vld1q_f32(r3 + 4);
                float32x4_t _r32 = vld1q_f32(r3 + 8);
                float32x4_t _r33 = vld1q_f32(r3 + 12);
                float32x4_t _r34 = vld1q_f32(r3 + 16);
                float32x4_t _r35 = vld1q_f32(r3 + 20);
                float32x4_t _r36 = vld1q_f32(r3 + 24);
                float32x4_t _r37 = vld1q_f32(r3 + 28);

                float32x4_t _k30 = vld1q_f32(k0);
                float32x4_t _k31 = vld1q_f32(k0 + 4);
                float32x4_t _k32 = vld1q_f32(k0 + 8);
                float32x4_t _k33 = vld1q_f32(k0 + 12);
                float32x4_t _k34 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k30, _r30);
                _sum0 = vmlaq_f32(_sum0, _k31, _r31);
                _sum0 = vmlaq_f32(_sum0, _k32, _r32);
                _sum0 = vmlaq_f32(_sum0, _k33, _r33);
                _sum0 = vmlaq_f32(_sum0, _k34, _r34);
                _sum1 = vmlaq_f32(_sum1, _k30, _r31);
                _sum1 = vmlaq_f32(_sum1, _k31, _r32);
                _sum1 = vmlaq_f32(_sum1, _k32, _r33);
                _sum1 = vmlaq_f32(_sum1, _k33, _r34);
                _sum1 = vmlaq_f32(_sum1, _k34, _r35);
                _sum2 = vmlaq_f32(_sum2, _k30, _r32);
                _sum2 = vmlaq_f32(_sum2, _k31, _r33);
                _sum2 = vmlaq_f32(_sum2, _k32, _r34);
                _sum2 = vmlaq_f32(_sum2, _k33, _r35);
                _sum2 = vmlaq_f32(_sum2, _k34, _r36);
                _sum3 = vmlaq_f32(_sum3, _k30, _r33);
                _sum3 = vmlaq_f32(_sum3, _k31, _r34);
                _sum3 = vmlaq_f32(_sum3, _k32, _r35);
                _sum3 = vmlaq_f32(_sum3, _k33, _r36);
                _sum3 = vmlaq_f32(_sum3, _k34, _r37);

                float32x4_t _r40 = vld1q_f32(r4);
                float32x4_t _r41 = vld1q_f32(r4 + 4);
                float32x4_t _r42 = vld1q_f32(r4 + 8);
                float32x4_t _r43 = vld1q_f32(r4 + 12);
                float32x4_t _r44 = vld1q_f32(r4 + 16);
                float32x4_t _r45 = vld1q_f32(r4 + 20);
                float32x4_t _r46 = vld1q_f32(r4 + 24);
                float32x4_t _r47 = vld1q_f32(r4 + 28);

                float32x4_t _k40 = vld1q_f32(k0);
                float32x4_t _k41 = vld1q_f32(k0 + 4);
                float32x4_t _k42 = vld1q_f32(k0 + 8);
                float32x4_t _k43 = vld1q_f32(k0 + 12);
                float32x4_t _k44 = vld1q_f32(k0 + 16);
                k0 -= 80;

                _sum0 = vmlaq_f32(_sum0, _k40, _r40);
                _sum0 = vmlaq_f32(_sum0, _k41, _r41);
                _sum0 = vmlaq_f32(_sum0, _k42, _r42);
                _sum0 = vmlaq_f32(_sum0, _k43, _r43);
                _sum0 = vmlaq_f32(_sum0, _k44, _r44);
                _sum1 = vmlaq_f32(_sum1, _k40, _r41);
                _sum1 = vmlaq_f32(_sum1, _k41, _r42);
                _sum1 = vmlaq_f32(_sum1, _k42, _r43);
                _sum1 = vmlaq_f32(_sum1, _k43, _r44);
                _sum1 = vmlaq_f32(_sum1, _k44, _r45);
                _sum2 = vmlaq_f32(_sum2, _k40, _r42);
                _sum2 = vmlaq_f32(_sum2, _k41, _r43);
                _sum2 = vmlaq_f32(_sum2, _k42, _r44);
                _sum2 = vmlaq_f32(_sum2, _k43, _r45);
                _sum2 = vmlaq_f32(_sum2, _k44, _r46);
                _sum3 = vmlaq_f32(_sum3, _k40, _r43);
                _sum3 = vmlaq_f32(_sum3, _k41, _r44);
                _sum3 = vmlaq_f32(_sum3, _k42, _r45);
                _sum3 = vmlaq_f32(_sum3, _k43, _r46);
                _sum3 = vmlaq_f32(_sum3, _k44, _r47);

                vst1q_f32(outptr0, _sum0);
                vst1q_f32(outptr0 + 4, _sum1);
                vst1q_f32(outptr0 + 8, _sum2);
                vst1q_f32(outptr0 + 12, _sum3);

                r0 += 16;
                r1 += 16;
                r2 += 16;
                r3 += 16;
                r4 += 16;
                outptr0 += 16;
            }
            for (; j + 1 < outw; j += 2)
            {
                float32x4_t _sum0 = _bias0;
                float32x4_t _sum1 = _bias0;

                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r02 = vld1q_f32(r0 + 8);
                float32x4_t _r03 = vld1q_f32(r0 + 12);
                float32x4_t _r04 = vld1q_f32(r0 + 16);
                float32x4_t _r05 = vld1q_f32(r0 + 20);

                float32x4_t _k00 = vld1q_f32(k0);
                float32x4_t _k01 = vld1q_f32(k0 + 4);
                float32x4_t _k02 = vld1q_f32(k0 + 8);
                float32x4_t _k03 = vld1q_f32(k0 + 12);
                float32x4_t _k04 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k00, _r00);
                _sum0 = vmlaq_f32(_sum0, _k01, _r01);
                _sum0 = vmlaq_f32(_sum0, _k02, _r02);
                _sum0 = vmlaq_f32(_sum0, _k03, _r03);
                _sum0 = vmlaq_f32(_sum0, _k04, _r04);
                _sum1 = vmlaq_f32(_sum1, _k00, _r01);
                _sum1 = vmlaq_f32(_sum1, _k01, _r02);
                _sum1 = vmlaq_f32(_sum1, _k02, _r03);
                _sum1 = vmlaq_f32(_sum1, _k03, _r04);
                _sum1 = vmlaq_f32(_sum1, _k04, _r05);

                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r12 = vld1q_f32(r1 + 8);
                float32x4_t _r13 = vld1q_f32(r1 + 12);
                float32x4_t _r14 = vld1q_f32(r1 + 16);
                float32x4_t _r15 = vld1q_f32(r1 + 20);

                float32x4_t _k10 = vld1q_f32(k0);
                float32x4_t _k11 = vld1q_f32(k0 + 4);
                float32x4_t _k12 = vld1q_f32(k0 + 8);
                float32x4_t _k13 = vld1q_f32(k0 + 12);
                float32x4_t _k14 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k10, _r10);
                _sum0 = vmlaq_f32(_sum0, _k11, _r11);
                _sum0 = vmlaq_f32(_sum0, _k12, _r12);
                _sum0 = vmlaq_f32(_sum0, _k13, _r13);
                _sum0 = vmlaq_f32(_sum0, _k14, _r14);
                _sum1 = vmlaq_f32(_sum1, _k10, _r11);
                _sum1 = vmlaq_f32(_sum1, _k11, _r12);
                _sum1 = vmlaq_f32(_sum1, _k12, _r13);
                _sum1 = vmlaq_f32(_sum1, _k13, _r14);
                _sum1 = vmlaq_f32(_sum1, _k14, _r15);

                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r22 = vld1q_f32(r2 + 8);
                float32x4_t _r23 = vld1q_f32(r2 + 12);
                float32x4_t _r24 = vld1q_f32(r2 + 16);
                float32x4_t _r25 = vld1q_f32(r2 + 20);

                float32x4_t _k20 = vld1q_f32(k0);
                float32x4_t _k21 = vld1q_f32(k0 + 4);
                float32x4_t _k22 = vld1q_f32(k0 + 8);
                float32x4_t _k23 = vld1q_f32(k0 + 12);
                float32x4_t _k24 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k20, _r20);
                _sum0 = vmlaq_f32(_sum0, _k21, _r21);
                _sum0 = vmlaq_f32(_sum0, _k22, _r22);
                _sum0 = vmlaq_f32(_sum0, _k23, _r23);
                _sum0 = vmlaq_f32(_sum0, _k24, _r24);
                _sum1 = vmlaq_f32(_sum1, _k20, _r21);
                _sum1 = vmlaq_f32(_sum1, _k21, _r22);
                _sum1 = vmlaq_f32(_sum1, _k22, _r23);
                _sum1 = vmlaq_f32(_sum1, _k23, _r24);
                _sum1 = vmlaq_f32(_sum1, _k24, _r25);

                float32x4_t _r30 = vld1q_f32(r3);
                float32x4_t _r31 = vld1q_f32(r3 + 4);
                float32x4_t _r32 = vld1q_f32(r3 + 8);
                float32x4_t _r33 = vld1q_f32(r3 + 12);
                float32x4_t _r34 = vld1q_f32(r3 + 16);
                float32x4_t _r35 = vld1q_f32(r3 + 20);

                float32x4_t _k30 = vld1q_f32(k0);
                float32x4_t _k31 = vld1q_f32(k0 + 4);
                float32x4_t _k32 = vld1q_f32(k0 + 8);
                float32x4_t _k33 = vld1q_f32(k0 + 12);
                float32x4_t _k34 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k30, _r30);
                _sum0 = vmlaq_f32(_sum0, _k31, _r31);
                _sum0 = vmlaq_f32(_sum0, _k32, _r32);
                _sum0 = vmlaq_f32(_sum0, _k33, _r33);
                _sum0 = vmlaq_f32(_sum0, _k34, _r34);
                _sum1 = vmlaq_f32(_sum1, _k30, _r31);
                _sum1 = vmlaq_f32(_sum1, _k31, _r32);
                _sum1 = vmlaq_f32(_sum1, _k32, _r33);
                _sum1 = vmlaq_f32(_sum1, _k33, _r34);
                _sum1 = vmlaq_f32(_sum1, _k34, _r35);

                float32x4_t _r40 = vld1q_f32(r4);
                float32x4_t _r41 = vld1q_f32(r4 + 4);
                float32x4_t _r42 = vld1q_f32(r4 + 8);
                float32x4_t _r43 = vld1q_f32(r4 + 12);
                float32x4_t _r44 = vld1q_f32(r4 + 16);
                float32x4_t _r45 = vld1q_f32(r4 + 20);

                float32x4_t _k40 = vld1q_f32(k0);
                float32x4_t _k41 = vld1q_f32(k0 + 4);
                float32x4_t _k42 = vld1q_f32(k0 + 8);
                float32x4_t _k43 = vld1q_f32(k0 + 12);
                float32x4_t _k44 = vld1q_f32(k0 + 16);
                k0 -= 80;

                _sum0 = vmlaq_f32(_sum0, _k40, _r40);
                _sum0 = vmlaq_f32(_sum0, _k41, _r41);
                _sum0 = vmlaq_f32(_sum0, _k42, _r42);
                _sum0 = vmlaq_f32(_sum0, _k43, _r43);
                _sum0 = vmlaq_f32(_sum0, _k44, _r44);
                _sum1 = vmlaq_f32(_sum1, _k40, _r41);
                _sum1 = vmlaq_f32(_sum1, _k41, _r42);
                _sum1 = vmlaq_f32(_sum1, _k42, _r43);
                _sum1 = vmlaq_f32(_sum1, _k43, _r44);
                _sum1 = vmlaq_f32(_sum1, _k44, _r45);

                vst1q_f32(outptr0, _sum0);
                vst1q_f32(outptr0 + 4, _sum1);

                r0 += 8;
                r1 += 8;
                r2 += 8;
                r3 += 8;
                r4 += 8;
                outptr0 += 8;
            }
            for (; j < outw; j++)
            {
                float32x4_t _sum0 = _bias0;

                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r02 = vld1q_f32(r0 + 8);
                float32x4_t _r03 = vld1q_f32(r0 + 12);
                float32x4_t _r04 = vld1q_f32(r0 + 16);

                float32x4_t _k00 = vld1q_f32(k0);
                float32x4_t _k01 = vld1q_f32(k0 + 4);
                float32x4_t _k02 = vld1q_f32(k0 + 8);
                float32x4_t _k03 = vld1q_f32(k0 + 12);
                float32x4_t _k04 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k00, _r00);
                _sum0 = vmlaq_f32(_sum0, _k01, _r01);
                _sum0 = vmlaq_f32(_sum0, _k02, _r02);
                _sum0 = vmlaq_f32(_sum0, _k03, _r03);
                _sum0 = vmlaq_f32(_sum0, _k04, _r04);

                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r12 = vld1q_f32(r1 + 8);
                float32x4_t _r13 = vld1q_f32(r1 + 12);
                float32x4_t _r14 = vld1q_f32(r1 + 16);

                float32x4_t _k10 = vld1q_f32(k0);
                float32x4_t _k11 = vld1q_f32(k0 + 4);
                float32x4_t _k12 = vld1q_f32(k0 + 8);
                float32x4_t _k13 = vld1q_f32(k0 + 12);
                float32x4_t _k14 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k10, _r10);
                _sum0 = vmlaq_f32(_sum0, _k11, _r11);
                _sum0 = vmlaq_f32(_sum0, _k12, _r12);
                _sum0 = vmlaq_f32(_sum0, _k13, _r13);
                _sum0 = vmlaq_f32(_sum0, _k14, _r14);

                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r22 = vld1q_f32(r2 + 8);
                float32x4_t _r23 = vld1q_f32(r2 + 12);
                float32x4_t _r24 = vld1q_f32(r2 + 16);

                float32x4_t _k20 = vld1q_f32(k0);
                float32x4_t _k21 = vld1q_f32(k0 + 4);
                float32x4_t _k22 = vld1q_f32(k0 + 8);
                float32x4_t _k23 = vld1q_f32(k0 + 12);
                float32x4_t _k24 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k20, _r20);
                _sum0 = vmlaq_f32(_sum0, _k21, _r21);
                _sum0 = vmlaq_f32(_sum0, _k22, _r22);
                _sum0 = vmlaq_f32(_sum0, _k23, _r23);
                _sum0 = vmlaq_f32(_sum0, _k24, _r24);

                float32x4_t _r30 = vld1q_f32(r3);
                float32x4_t _r31 = vld1q_f32(r3 + 4);
                float32x4_t _r32 = vld1q_f32(r3 + 8);
                float32x4_t _r33 = vld1q_f32(r3 + 12);
                float32x4_t _r34 = vld1q_f32(r3 + 16);

                float32x4_t _k30 = vld1q_f32(k0);
                float32x4_t _k31 = vld1q_f32(k0 + 4);
                float32x4_t _k32 = vld1q_f32(k0 + 8);
                float32x4_t _k33 = vld1q_f32(k0 + 12);
                float32x4_t _k34 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k30, _r30);
                _sum0 = vmlaq_f32(_sum0, _k31, _r31);
                _sum0 = vmlaq_f32(_sum0, _k32, _r32);
                _sum0 = vmlaq_f32(_sum0, _k33, _r33);
                _sum0 = vmlaq_f32(_sum0, _k34, _r34);

                float32x4_t _r40 = vld1q_f32(r4);
                float32x4_t _r41 = vld1q_f32(r4 + 4);
                float32x4_t _r42 = vld1q_f32(r4 + 8);
                float32x4_t _r43 = vld1q_f32(r4 + 12);
                float32x4_t _r44 = vld1q_f32(r4 + 16);

                float32x4_t _k40 = vld1q_f32(k0);
                float32x4_t _k41 = vld1q_f32(k0 + 4);
                float32x4_t _k42 = vld1q_f32(k0 + 8);
                float32x4_t _k43 = vld1q_f32(k0 + 12);
                float32x4_t _k44 = vld1q_f32(k0 + 16);
                k0 -= 80;

                _sum0 = vmlaq_f32(_sum0, _k40, _r40);
                _sum0 = vmlaq_f32(_sum0, _k41, _r41);
                _sum0 = vmlaq_f32(_sum0, _k42, _r42);
                _sum0 = vmlaq_f32(_sum0, _k43, _r43);
                _sum0 = vmlaq_f32(_sum0, _k44, _r44);

                vst1q_f32(outptr0, _sum0);

                r0 += 4;
                r1 += 4;
                r2 += 4;
                r3 += 4;
                r4 += 4;
                outptr0 += 4;
            }

            r0 += 4 * 4;
            r1 += 4 * 4;
            r2 += 4 * 4;
            r3 += 4 * 4;
            r4 += 4 * 4;
        }
    }
}

static void convdw5x5s2_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int group = bottom_blob.c;

    const int tailstep = (w - 2 * outw + w) * 4;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + g * 4) : vdupq_n_f32(0.f);

        const float* k0 = kernel.row(g);

        float* outptr0 = out;

        const Mat img0 = bottom_blob.channel(g);

        const float* r0 = img0.row(0);
        const float* r1 = img0.row(1);
        const float* r2 = img0.row(2);
        const float* r3 = img0.row(3);
        const float* r4 = img0.row(4);

        int i = 0;

        for (; i < outh; i++)
        {
            int j = 0;

            for (; j + 3 < outw; j += 4)
            {
                float32x4_t _sum0 = _bias0;
                float32x4_t _sum1 = _bias0;
                float32x4_t _sum2 = _bias0;
                float32x4_t _sum3 = _bias0;

                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r02 = vld1q_f32(r0 + 8);
                float32x4_t _r03 = vld1q_f32(r0 + 12);
                float32x4_t _r04 = vld1q_f32(r0 + 16);
                float32x4_t _r05 = vld1q_f32(r0 + 20);
                float32x4_t _r06 = vld1q_f32(r0 + 24);
                float32x4_t _r07 = vld1q_f32(r0 + 28);
                float32x4_t _r08 = vld1q_f32(r0 + 32);
                float32x4_t _r09 = vld1q_f32(r0 + 36);
                float32x4_t _r010 = vld1q_f32(r0 + 40);

                float32x4_t _k00 = vld1q_f32(k0);
                float32x4_t _k01 = vld1q_f32(k0 + 4);
                float32x4_t _k02 = vld1q_f32(k0 + 8);
                float32x4_t _k03 = vld1q_f32(k0 + 12);
                float32x4_t _k04 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k00, _r00);
                _sum0 = vmlaq_f32(_sum0, _k01, _r01);
                _sum0 = vmlaq_f32(_sum0, _k02, _r02);
                _sum0 = vmlaq_f32(_sum0, _k03, _r03);
                _sum0 = vmlaq_f32(_sum0, _k04, _r04);
                _sum1 = vmlaq_f32(_sum1, _k00, _r02);
                _sum1 = vmlaq_f32(_sum1, _k01, _r03);
                _sum1 = vmlaq_f32(_sum1, _k02, _r04);
                _sum1 = vmlaq_f32(_sum1, _k03, _r05);
                _sum1 = vmlaq_f32(_sum1, _k04, _r06);
                _sum2 = vmlaq_f32(_sum2, _k00, _r04);
                _sum2 = vmlaq_f32(_sum2, _k01, _r05);
                _sum2 = vmlaq_f32(_sum2, _k02, _r06);
                _sum2 = vmlaq_f32(_sum2, _k03, _r07);
                _sum2 = vmlaq_f32(_sum2, _k04, _r08);
                _sum3 = vmlaq_f32(_sum3, _k00, _r06);
                _sum3 = vmlaq_f32(_sum3, _k01, _r07);
                _sum3 = vmlaq_f32(_sum3, _k02, _r08);
                _sum3 = vmlaq_f32(_sum3, _k03, _r09);
                _sum3 = vmlaq_f32(_sum3, _k04, _r010);

                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r12 = vld1q_f32(r1 + 8);
                float32x4_t _r13 = vld1q_f32(r1 + 12);
                float32x4_t _r14 = vld1q_f32(r1 + 16);
                float32x4_t _r15 = vld1q_f32(r1 + 20);
                float32x4_t _r16 = vld1q_f32(r1 + 24);
                float32x4_t _r17 = vld1q_f32(r1 + 28);
                float32x4_t _r18 = vld1q_f32(r1 + 32);
                float32x4_t _r19 = vld1q_f32(r1 + 36);
                float32x4_t _r110 = vld1q_f32(r1 + 40);

                float32x4_t _k10 = vld1q_f32(k0);
                float32x4_t _k11 = vld1q_f32(k0 + 4);
                float32x4_t _k12 = vld1q_f32(k0 + 8);
                float32x4_t _k13 = vld1q_f32(k0 + 12);
                float32x4_t _k14 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k10, _r10);
                _sum0 = vmlaq_f32(_sum0, _k11, _r11);
                _sum0 = vmlaq_f32(_sum0, _k12, _r12);
                _sum0 = vmlaq_f32(_sum0, _k13, _r13);
                _sum0 = vmlaq_f32(_sum0, _k14, _r14);
                _sum1 = vmlaq_f32(_sum1, _k10, _r12);
                _sum1 = vmlaq_f32(_sum1, _k11, _r13);
                _sum1 = vmlaq_f32(_sum1, _k12, _r14);
                _sum1 = vmlaq_f32(_sum1, _k13, _r15);
                _sum1 = vmlaq_f32(_sum1, _k14, _r16);
                _sum2 = vmlaq_f32(_sum2, _k10, _r14);
                _sum2 = vmlaq_f32(_sum2, _k11, _r15);
                _sum2 = vmlaq_f32(_sum2, _k12, _r16);
                _sum2 = vmlaq_f32(_sum2, _k13, _r17);
                _sum2 = vmlaq_f32(_sum2, _k14, _r18);
                _sum3 = vmlaq_f32(_sum3, _k10, _r16);
                _sum3 = vmlaq_f32(_sum3, _k11, _r17);
                _sum3 = vmlaq_f32(_sum3, _k12, _r18);
                _sum3 = vmlaq_f32(_sum3, _k13, _r19);
                _sum3 = vmlaq_f32(_sum3, _k14, _r110);

                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r22 = vld1q_f32(r2 + 8);
                float32x4_t _r23 = vld1q_f32(r2 + 12);
                float32x4_t _r24 = vld1q_f32(r2 + 16);
                float32x4_t _r25 = vld1q_f32(r2 + 20);
                float32x4_t _r26 = vld1q_f32(r2 + 24);
                float32x4_t _r27 = vld1q_f32(r2 + 28);
                float32x4_t _r28 = vld1q_f32(r2 + 32);
                float32x4_t _r29 = vld1q_f32(r2 + 36);
                float32x4_t _r210 = vld1q_f32(r2 + 40);

                float32x4_t _k20 = vld1q_f32(k0);
                float32x4_t _k21 = vld1q_f32(k0 + 4);
                float32x4_t _k22 = vld1q_f32(k0 + 8);
                float32x4_t _k23 = vld1q_f32(k0 + 12);
                float32x4_t _k24 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k20, _r20);
                _sum0 = vmlaq_f32(_sum0, _k21, _r21);
                _sum0 = vmlaq_f32(_sum0, _k22, _r22);
                _sum0 = vmlaq_f32(_sum0, _k23, _r23);
                _sum0 = vmlaq_f32(_sum0, _k24, _r24);
                _sum1 = vmlaq_f32(_sum1, _k20, _r22);
                _sum1 = vmlaq_f32(_sum1, _k21, _r23);
                _sum1 = vmlaq_f32(_sum1, _k22, _r24);
                _sum1 = vmlaq_f32(_sum1, _k23, _r25);
                _sum1 = vmlaq_f32(_sum1, _k24, _r26);
                _sum2 = vmlaq_f32(_sum2, _k20, _r24);
                _sum2 = vmlaq_f32(_sum2, _k21, _r25);
                _sum2 = vmlaq_f32(_sum2, _k22, _r26);
                _sum2 = vmlaq_f32(_sum2, _k23, _r27);
                _sum2 = vmlaq_f32(_sum2, _k24, _r28);
                _sum3 = vmlaq_f32(_sum3, _k20, _r26);
                _sum3 = vmlaq_f32(_sum3, _k21, _r27);
                _sum3 = vmlaq_f32(_sum3, _k22, _r28);
                _sum3 = vmlaq_f32(_sum3, _k23, _r29);
                _sum3 = vmlaq_f32(_sum3, _k24, _r210);

                float32x4_t _r30 = vld1q_f32(r3);
                float32x4_t _r31 = vld1q_f32(r3 + 4);
                float32x4_t _r32 = vld1q_f32(r3 + 8);
                float32x4_t _r33 = vld1q_f32(r3 + 12);
                float32x4_t _r34 = vld1q_f32(r3 + 16);
                float32x4_t _r35 = vld1q_f32(r3 + 20);
                float32x4_t _r36 = vld1q_f32(r3 + 24);
                float32x4_t _r37 = vld1q_f32(r3 + 28);
                float32x4_t _r38 = vld1q_f32(r3 + 32);
                float32x4_t _r39 = vld1q_f32(r3 + 36);
                float32x4_t _r310 = vld1q_f32(r3 + 40);

                float32x4_t _k30 = vld1q_f32(k0);
                float32x4_t _k31 = vld1q_f32(k0 + 4);
                float32x4_t _k32 = vld1q_f32(k0 + 8);
                float32x4_t _k33 = vld1q_f32(k0 + 12);
                float32x4_t _k34 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k30, _r30);
                _sum0 = vmlaq_f32(_sum0, _k31, _r31);
                _sum0 = vmlaq_f32(_sum0, _k32, _r32);
                _sum0 = vmlaq_f32(_sum0, _k33, _r33);
                _sum0 = vmlaq_f32(_sum0, _k34, _r34);
                _sum1 = vmlaq_f32(_sum1, _k30, _r32);
                _sum1 = vmlaq_f32(_sum1, _k31, _r33);
                _sum1 = vmlaq_f32(_sum1, _k32, _r34);
                _sum1 = vmlaq_f32(_sum1, _k33, _r35);
                _sum1 = vmlaq_f32(_sum1, _k34, _r36);
                _sum2 = vmlaq_f32(_sum2, _k30, _r34);
                _sum2 = vmlaq_f32(_sum2, _k31, _r35);
                _sum2 = vmlaq_f32(_sum2, _k32, _r36);
                _sum2 = vmlaq_f32(_sum2, _k33, _r37);
                _sum2 = vmlaq_f32(_sum2, _k34, _r38);
                _sum3 = vmlaq_f32(_sum3, _k30, _r36);
                _sum3 = vmlaq_f32(_sum3, _k31, _r37);
                _sum3 = vmlaq_f32(_sum3, _k32, _r38);
                _sum3 = vmlaq_f32(_sum3, _k33, _r39);
                _sum3 = vmlaq_f32(_sum3, _k34, _r310);

                float32x4_t _r40 = vld1q_f32(r4);
                float32x4_t _r41 = vld1q_f32(r4 + 4);
                float32x4_t _r42 = vld1q_f32(r4 + 8);
                float32x4_t _r43 = vld1q_f32(r4 + 12);
                float32x4_t _r44 = vld1q_f32(r4 + 16);
                float32x4_t _r45 = vld1q_f32(r4 + 20);
                float32x4_t _r46 = vld1q_f32(r4 + 24);
                float32x4_t _r47 = vld1q_f32(r4 + 28);
                float32x4_t _r48 = vld1q_f32(r4 + 32);
                float32x4_t _r49 = vld1q_f32(r4 + 36);
                float32x4_t _r410 = vld1q_f32(r4 + 40);

                float32x4_t _k40 = vld1q_f32(k0);
                float32x4_t _k41 = vld1q_f32(k0 + 4);
                float32x4_t _k42 = vld1q_f32(k0 + 8);
                float32x4_t _k43 = vld1q_f32(k0 + 12);
                float32x4_t _k44 = vld1q_f32(k0 + 16);
                k0 -= 80;

                _sum0 = vmlaq_f32(_sum0, _k40, _r40);
                _sum0 = vmlaq_f32(_sum0, _k41, _r41);
                _sum0 = vmlaq_f32(_sum0, _k42, _r42);
                _sum0 = vmlaq_f32(_sum0, _k43, _r43);
                _sum0 = vmlaq_f32(_sum0, _k44, _r44);
                _sum1 = vmlaq_f32(_sum1, _k40, _r42);
                _sum1 = vmlaq_f32(_sum1, _k41, _r43);
                _sum1 = vmlaq_f32(_sum1, _k42, _r44);
                _sum1 = vmlaq_f32(_sum1, _k43, _r45);
                _sum1 = vmlaq_f32(_sum1, _k44, _r46);
                _sum2 = vmlaq_f32(_sum2, _k40, _r44);
                _sum2 = vmlaq_f32(_sum2, _k41, _r45);
                _sum2 = vmlaq_f32(_sum2, _k42, _r46);
                _sum2 = vmlaq_f32(_sum2, _k43, _r47);
                _sum2 = vmlaq_f32(_sum2, _k44, _r48);
                _sum3 = vmlaq_f32(_sum3, _k40, _r46);
                _sum3 = vmlaq_f32(_sum3, _k41, _r47);
                _sum3 = vmlaq_f32(_sum3, _k42, _r48);
                _sum3 = vmlaq_f32(_sum3, _k43, _r49);
                _sum3 = vmlaq_f32(_sum3, _k44, _r410);

                vst1q_f32(outptr0, _sum0);
                vst1q_f32(outptr0 + 4, _sum1);
                vst1q_f32(outptr0 + 8, _sum2);
                vst1q_f32(outptr0 + 12, _sum3);

                r0 += 8 * 4;
                r1 += 8 * 4;
                r2 += 8 * 4;
                r3 += 8 * 4;
                r4 += 8 * 4;
                outptr0 += 16;
            }
            for (; j + 1 < outw; j += 2)
            {
                float32x4_t _sum0 = _bias0;
                float32x4_t _sum1 = _bias0;

                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r02 = vld1q_f32(r0 + 8);
                float32x4_t _r03 = vld1q_f32(r0 + 12);
                float32x4_t _r04 = vld1q_f32(r0 + 16);
                float32x4_t _r05 = vld1q_f32(r0 + 20);
                float32x4_t _r06 = vld1q_f32(r0 + 24);

                float32x4_t _k00 = vld1q_f32(k0);
                float32x4_t _k01 = vld1q_f32(k0 + 4);
                float32x4_t _k02 = vld1q_f32(k0 + 8);
                float32x4_t _k03 = vld1q_f32(k0 + 12);
                float32x4_t _k04 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k00, _r00);
                _sum0 = vmlaq_f32(_sum0, _k01, _r01);
                _sum0 = vmlaq_f32(_sum0, _k02, _r02);
                _sum0 = vmlaq_f32(_sum0, _k03, _r03);
                _sum0 = vmlaq_f32(_sum0, _k04, _r04);
                _sum1 = vmlaq_f32(_sum1, _k00, _r02);
                _sum1 = vmlaq_f32(_sum1, _k01, _r03);
                _sum1 = vmlaq_f32(_sum1, _k02, _r04);
                _sum1 = vmlaq_f32(_sum1, _k03, _r05);
                _sum1 = vmlaq_f32(_sum1, _k04, _r06);

                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r12 = vld1q_f32(r1 + 8);
                float32x4_t _r13 = vld1q_f32(r1 + 12);
                float32x4_t _r14 = vld1q_f32(r1 + 16);
                float32x4_t _r15 = vld1q_f32(r1 + 20);
                float32x4_t _r16 = vld1q_f32(r1 + 24);

                float32x4_t _k10 = vld1q_f32(k0);
                float32x4_t _k11 = vld1q_f32(k0 + 4);
                float32x4_t _k12 = vld1q_f32(k0 + 8);
                float32x4_t _k13 = vld1q_f32(k0 + 12);
                float32x4_t _k14 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k10, _r10);
                _sum0 = vmlaq_f32(_sum0, _k11, _r11);
                _sum0 = vmlaq_f32(_sum0, _k12, _r12);
                _sum0 = vmlaq_f32(_sum0, _k13, _r13);
                _sum0 = vmlaq_f32(_sum0, _k14, _r14);
                _sum1 = vmlaq_f32(_sum1, _k10, _r12);
                _sum1 = vmlaq_f32(_sum1, _k11, _r13);
                _sum1 = vmlaq_f32(_sum1, _k12, _r14);
                _sum1 = vmlaq_f32(_sum1, _k13, _r15);
                _sum1 = vmlaq_f32(_sum1, _k14, _r16);

                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r22 = vld1q_f32(r2 + 8);
                float32x4_t _r23 = vld1q_f32(r2 + 12);
                float32x4_t _r24 = vld1q_f32(r2 + 16);
                float32x4_t _r25 = vld1q_f32(r2 + 20);
                float32x4_t _r26 = vld1q_f32(r2 + 24);

                float32x4_t _k20 = vld1q_f32(k0);
                float32x4_t _k21 = vld1q_f32(k0 + 4);
                float32x4_t _k22 = vld1q_f32(k0 + 8);
                float32x4_t _k23 = vld1q_f32(k0 + 12);
                float32x4_t _k24 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k20, _r20);
                _sum0 = vmlaq_f32(_sum0, _k21, _r21);
                _sum0 = vmlaq_f32(_sum0, _k22, _r22);
                _sum0 = vmlaq_f32(_sum0, _k23, _r23);
                _sum0 = vmlaq_f32(_sum0, _k24, _r24);
                _sum1 = vmlaq_f32(_sum1, _k20, _r22);
                _sum1 = vmlaq_f32(_sum1, _k21, _r23);
                _sum1 = vmlaq_f32(_sum1, _k22, _r24);
                _sum1 = vmlaq_f32(_sum1, _k23, _r25);
                _sum1 = vmlaq_f32(_sum1, _k24, _r26);

                float32x4_t _r30 = vld1q_f32(r3);
                float32x4_t _r31 = vld1q_f32(r3 + 4);
                float32x4_t _r32 = vld1q_f32(r3 + 8);
                float32x4_t _r33 = vld1q_f32(r3 + 12);
                float32x4_t _r34 = vld1q_f32(r3 + 16);
                float32x4_t _r35 = vld1q_f32(r3 + 20);
                float32x4_t _r36 = vld1q_f32(r3 + 24);

                float32x4_t _k30 = vld1q_f32(k0);
                float32x4_t _k31 = vld1q_f32(k0 + 4);
                float32x4_t _k32 = vld1q_f32(k0 + 8);
                float32x4_t _k33 = vld1q_f32(k0 + 12);
                float32x4_t _k34 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k30, _r30);
                _sum0 = vmlaq_f32(_sum0, _k31, _r31);
                _sum0 = vmlaq_f32(_sum0, _k32, _r32);
                _sum0 = vmlaq_f32(_sum0, _k33, _r33);
                _sum0 = vmlaq_f32(_sum0, _k34, _r34);
                _sum1 = vmlaq_f32(_sum1, _k30, _r32);
                _sum1 = vmlaq_f32(_sum1, _k31, _r33);
                _sum1 = vmlaq_f32(_sum1, _k32, _r34);
                _sum1 = vmlaq_f32(_sum1, _k33, _r35);
                _sum1 = vmlaq_f32(_sum1, _k34, _r36);

                float32x4_t _r40 = vld1q_f32(r4);
                float32x4_t _r41 = vld1q_f32(r4 + 4);
                float32x4_t _r42 = vld1q_f32(r4 + 8);
                float32x4_t _r43 = vld1q_f32(r4 + 12);
                float32x4_t _r44 = vld1q_f32(r4 + 16);
                float32x4_t _r45 = vld1q_f32(r4 + 20);
                float32x4_t _r46 = vld1q_f32(r4 + 24);

                float32x4_t _k40 = vld1q_f32(k0);
                float32x4_t _k41 = vld1q_f32(k0 + 4);
                float32x4_t _k42 = vld1q_f32(k0 + 8);
                float32x4_t _k43 = vld1q_f32(k0 + 12);
                float32x4_t _k44 = vld1q_f32(k0 + 16);
                k0 -= 80;

                _sum0 = vmlaq_f32(_sum0, _k40, _r40);
                _sum0 = vmlaq_f32(_sum0, _k41, _r41);
                _sum0 = vmlaq_f32(_sum0, _k42, _r42);
                _sum0 = vmlaq_f32(_sum0, _k43, _r43);
                _sum0 = vmlaq_f32(_sum0, _k44, _r44);
                _sum1 = vmlaq_f32(_sum1, _k40, _r42);
                _sum1 = vmlaq_f32(_sum1, _k41, _r43);
                _sum1 = vmlaq_f32(_sum1, _k42, _r44);
                _sum1 = vmlaq_f32(_sum1, _k43, _r45);
                _sum1 = vmlaq_f32(_sum1, _k44, _r46);

                vst1q_f32(outptr0, _sum0);
                vst1q_f32(outptr0 + 4, _sum1);

                r0 += 4 * 4;
                r1 += 4 * 4;
                r2 += 4 * 4;
                r3 += 4 * 4;
                r4 += 4 * 4;
                outptr0 += 8;
            }
            for (; j < outw; j++)
            {
                float32x4_t _sum0 = _bias0;

                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r02 = vld1q_f32(r0 + 8);
                float32x4_t _r03 = vld1q_f32(r0 + 12);
                float32x4_t _r04 = vld1q_f32(r0 + 16);

                float32x4_t _k00 = vld1q_f32(k0);
                float32x4_t _k01 = vld1q_f32(k0 + 4);
                float32x4_t _k02 = vld1q_f32(k0 + 8);
                float32x4_t _k03 = vld1q_f32(k0 + 12);
                float32x4_t _k04 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k00, _r00);
                _sum0 = vmlaq_f32(_sum0, _k01, _r01);
                _sum0 = vmlaq_f32(_sum0, _k02, _r02);
                _sum0 = vmlaq_f32(_sum0, _k03, _r03);
                _sum0 = vmlaq_f32(_sum0, _k04, _r04);

                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r12 = vld1q_f32(r1 + 8);
                float32x4_t _r13 = vld1q_f32(r1 + 12);
                float32x4_t _r14 = vld1q_f32(r1 + 16);

                float32x4_t _k10 = vld1q_f32(k0);
                float32x4_t _k11 = vld1q_f32(k0 + 4);
                float32x4_t _k12 = vld1q_f32(k0 + 8);
                float32x4_t _k13 = vld1q_f32(k0 + 12);
                float32x4_t _k14 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k10, _r10);
                _sum0 = vmlaq_f32(_sum0, _k11, _r11);
                _sum0 = vmlaq_f32(_sum0, _k12, _r12);
                _sum0 = vmlaq_f32(_sum0, _k13, _r13);
                _sum0 = vmlaq_f32(_sum0, _k14, _r14);

                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r22 = vld1q_f32(r2 + 8);
                float32x4_t _r23 = vld1q_f32(r2 + 12);
                float32x4_t _r24 = vld1q_f32(r2 + 16);

                float32x4_t _k20 = vld1q_f32(k0);
                float32x4_t _k21 = vld1q_f32(k0 + 4);
                float32x4_t _k22 = vld1q_f32(k0 + 8);
                float32x4_t _k23 = vld1q_f32(k0 + 12);
                float32x4_t _k24 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k20, _r20);
                _sum0 = vmlaq_f32(_sum0, _k21, _r21);
                _sum0 = vmlaq_f32(_sum0, _k22, _r22);
                _sum0 = vmlaq_f32(_sum0, _k23, _r23);
                _sum0 = vmlaq_f32(_sum0, _k24, _r24);

                float32x4_t _r30 = vld1q_f32(r3);
                float32x4_t _r31 = vld1q_f32(r3 + 4);
                float32x4_t _r32 = vld1q_f32(r3 + 8);
                float32x4_t _r33 = vld1q_f32(r3 + 12);
                float32x4_t _r34 = vld1q_f32(r3 + 16);

                float32x4_t _k30 = vld1q_f32(k0);
                float32x4_t _k31 = vld1q_f32(k0 + 4);
                float32x4_t _k32 = vld1q_f32(k0 + 8);
                float32x4_t _k33 = vld1q_f32(k0 + 12);
                float32x4_t _k34 = vld1q_f32(k0 + 16);
                k0 += 20;

                _sum0 = vmlaq_f32(_sum0, _k30, _r30);
                _sum0 = vmlaq_f32(_sum0, _k31, _r31);
                _sum0 = vmlaq_f32(_sum0, _k32, _r32);
                _sum0 = vmlaq_f32(_sum0, _k33, _r33);
                _sum0 = vmlaq_f32(_sum0, _k34, _r34);

                float32x4_t _r40 = vld1q_f32(r4);
                float32x4_t _r41 = vld1q_f32(r4 + 4);
                float32x4_t _r42 = vld1q_f32(r4 + 8);
                float32x4_t _r43 = vld1q_f32(r4 + 12);
                float32x4_t _r44 = vld1q_f32(r4 + 16);

                float32x4_t _k40 = vld1q_f32(k0);
                float32x4_t _k41 = vld1q_f32(k0 + 4);
                float32x4_t _k42 = vld1q_f32(k0 + 8);
                float32x4_t _k43 = vld1q_f32(k0 + 12);
                float32x4_t _k44 = vld1q_f32(k0 + 16);
                k0 -= 80;

                _sum0 = vmlaq_f32(_sum0, _k40, _r40);
                _sum0 = vmlaq_f32(_sum0, _k41, _r41);
                _sum0 = vmlaq_f32(_sum0, _k42, _r42);
                _sum0 = vmlaq_f32(_sum0, _k43, _r43);
                _sum0 = vmlaq_f32(_sum0, _k44, _r44);

                vst1q_f32(outptr0, _sum0);

                r0 += 2 * 4;
                r1 += 2 * 4;
                r2 += 2 * 4;
                r3 += 2 * 4;
                r4 += 2 * 4;
                outptr0 += 4;
            }

            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
            r3 += tailstep;
            r4 += tailstep;
        }
    }
}


================================================
FILE: src/layer/arm/convolutiondepthwise_5x5_pack4_bf16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convdw5x5s1_pack4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
#if __aarch64__
    const int w = bottom_blob.w;
#endif

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    const int group = bottom_blob.c;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        const unsigned short* kptr = kernel.row<const unsigned short>(g);

        unsigned short* outptr0 = out.row<unsigned short>(0);

        const Mat img0 = bottom_blob.channel(g);

        const unsigned short* r0 = img0.row<const unsigned short>(0);
        const unsigned short* r1 = img0.row<const unsigned short>(1);
        const unsigned short* r2 = img0.row<const unsigned short>(2);
        const unsigned short* r3 = img0.row<const unsigned short>(3);
        const unsigned short* r4 = img0.row<const unsigned short>(4);

#if __aarch64__
        unsigned short* outptr1 = out.row<unsigned short>(1);
        const unsigned short* r5 = img0.row<const unsigned short>(5);

        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + g * 4) : vdupq_n_f32(0.f);

        // 4 * 25
        uint16x8_t _k00_01 = vld1q_u16(kptr);
        uint16x8_t _k02_03 = vld1q_u16(kptr + 8);
        uint16x8_t _k04_10 = vld1q_u16(kptr + 16);
        uint16x8_t _k11_12 = vld1q_u16(kptr + 24);
        uint16x8_t _k13_14 = vld1q_u16(kptr + 32);
        uint16x8_t _k20_21 = vld1q_u16(kptr + 40);
        uint16x8_t _k22_23 = vld1q_u16(kptr + 48);
        uint16x8_t _k24_30 = vld1q_u16(kptr + 56);
        uint16x8_t _k31_32 = vld1q_u16(kptr + 64);
        uint16x8_t _k33_34 = vld1q_u16(kptr + 72);
        uint16x8_t _k40_41 = vld1q_u16(kptr + 80);
        uint16x8_t _k42_43 = vld1q_u16(kptr + 88);
        uint16x4_t _k44 = vld1_u16(kptr + 96);
#else  // __aarch64__
        float bias0_data[4];
        if (bias)
        {
            bias0_data[0] = bias[g * 4 + 0];
            bias0_data[1] = bias[g * 4 + 1];
            bias0_data[2] = bias[g * 4 + 2];
            bias0_data[3] = bias[g * 4 + 3];
        }
        else
        {
            bias0_data[0] = 0.f;
            bias0_data[1] = 0.f;
            bias0_data[2] = 0.f;
            bias0_data[3] = 0.f;
        }
        const float* bias0_data_ptr = bias0_data;
#endif // __aarch64__

        int i = 0;
#if __aarch64__
        for (; i + 1 < outh; i += 2)
        {
            int j = 0;

            for (; j + 3 < outw; j += 4)
            {
                asm volatile(
                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%3], #32 \n" // r10 r11 r12 r13

                    "shll2  v14.4s, %18.8h, #16         \n"

                    "mov    v24.16b, %29.16b            \n" // sum00
                    "mov    v25.16b, %29.16b            \n" // sum01
                    "mov    v26.16b, %29.16b            \n" // sum02
                    "mov    v27.16b, %29.16b            \n" // sum03

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"

                    "mov    v28.16b, %29.16b            \n" // sum10
                    "mov    v29.16b, %29.16b            \n" // sum11
                    "mov    v30.16b, %29.16b            \n" // sum12
                    "mov    v31.16b, %29.16b            \n" // sum13

                    "shll   v15.4s, %16.4h, #16         \n"

                    "fmla   v24.4s, v14.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v25.4s, v14.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%3]      \n" // r14 r15 r16 r17
                    "fmla   v27.4s, v14.4s, v19.4s      \n"

                    "shll   v14.4s, %19.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v16.4s      \n"
                    "fmla   v29.4s, v15.4s, v17.4s      \n"
                    "fmla   v30.4s, v15.4s, v18.4s      \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"

                    "shll2  v15.4s, %16.8h, #16         \n"

                    "fmla   v24.4s, v14.4s, v17.4s      \n"
                    "fmla   v25.4s, v14.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v19.4s      \n"
                    "fmla   v27.4s, v14.4s, v20.4s      \n"

                    "shll2  v14.4s, %19.8h, #16         \n"

                    "fmla   v28.4s, v15.4s, v17.4s      \n"
                    "fmla   v29.4s, v15.4s, v18.4s      \n"
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"

                    "shll   v15.4s, %17.4h, #16         \n"

                    "fmla   v24.4s, v14.4s, v18.4s      \n"
                    "fmla   v25.4s, v14.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v20.4s      \n"
                    "fmla   v27.4s, v14.4s, v21.4s      \n"

                    "shll   v14.4s, %20.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v18.4s      \n"
                    "fmla   v29.4s, v15.4s, v19.4s      \n"
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "fmla   v31.4s, v15.4s, v21.4s      \n"

                    "shll2  v15.4s, %17.8h, #16         \n"

                    "fmla   v24.4s, v14.4s, v19.4s      \n"
                    "fmla   v25.4s, v14.4s, v20.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v21.4s      \n"
                    "fmla   v27.4s, v14.4s, v22.4s      \n"

                    "shll2  v14.4s, %20.8h, #16         \n"

                    "fmla   v28.4s, v15.4s, v19.4s      \n"
                    "fmla   v29.4s, v15.4s, v20.4s      \n"
                    "fmla   v30.4s, v15.4s, v21.4s      \n"
                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%4], #32 \n" // r20 r21 r22 r23
                    "fmla   v31.4s, v15.4s, v22.4s      \n"

                    "shll   v15.4s, %18.4h, #16         \n"

                    "fmla   v24.4s, v14.4s, v20.4s      \n"
                    "fmla   v25.4s, v14.4s, v21.4s      \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v22.4s      \n"
                    "fmla   v27.4s, v14.4s, v23.4s      \n"

                    "shll   v14.4s, %21.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v20.4s      \n"
                    "fmla   v29.4s, v15.4s, v21.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v22.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v23.4s      \n"

                    "shll2  v15.4s, %18.8h, #16         \n"

                    "fmla   v24.4s, v14.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v25.4s, v14.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%4]      \n" // r24 r25 r26 r27
                    "fmla   v27.4s, v14.4s, v19.4s      \n"

                    "shll2  v14.4s, %21.8h, #16         \n"

                    "fmla   v28.4s, v15.4s, v16.4s      \n"
                    "fmla   v29.4s, v15.4s, v17.4s      \n"
                    "fmla   v30.4s, v15.4s, v18.4s      \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"

                    "shll   v15.4s, %19.4h, #16         \n"

                    "fmla   v24.4s, v14.4s, v17.4s      \n"
                    "fmla   v25.4s, v14.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v19.4s      \n"
                    "fmla   v27.4s, v14.4s, v20.4s      \n"

                    "shll   v14.4s, %22.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v17.4s      \n"
                    "fmla   v29.4s, v15.4s, v18.4s      \n"
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"

                    "shll2  v15.4s, %19.8h, #16         \n"

                    "fmla   v24.4s, v14.4s, v18.4s      \n"
                    "fmla   v25.4s, v14.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v20.4s      \n"
                    "fmla   v27.4s, v14.4s, v21.4s      \n"

                    "shll2  v14.4s, %22.8h, #16         \n"

                    "fmla   v28.4s, v15.4s, v18.4s      \n"
                    "fmla   v29.4s, v15.4s, v19.4s      \n"
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "fmla   v31.4s, v15.4s, v21.4s      \n"

                    "shll   v15.4s, %20.4h, #16         \n"

                    "fmla   v24.4s, v14.4s, v19.4s      \n"
                    "fmla   v25.4s, v14.4s, v20.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v21.4s      \n"
                    "fmla   v27.4s, v14.4s, v22.4s      \n"

                    "shll   v14.4s, %23.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v19.4s      \n"
                    "fmla   v29.4s, v15.4s, v20.4s      \n"
                    "fmla   v30.4s, v15.4s, v21.4s      \n"
                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%5], #32 \n" // r30 r31 r32 r33
                    "fmla   v31.4s, v15.4s, v22.4s      \n"

                    "shll2  v15.4s, %20.8h, #16         \n"

                    "fmla   v24.4s, v14.4s, v20.4s      \n"
                    "fmla   v25.4s, v14.4s, v21.4s      \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v22.4s      \n"
                    "fmla   v27.4s, v14.4s, v23.4s      \n"

                    "shll2  v14.4s, %23.8h, #16         \n"

                    "fmla   v28.4s, v15.4s, v20.4s      \n"
                    "fmla   v29.4s, v15.4s, v21.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v22.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v23.4s      \n"

                    "shll   v15.4s, %21.4h, #16         \n"

                    "fmla   v24.4s, v14.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v25.4s, v14.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%5]      \n" // r34 r35 r36 r37
                    "fmla   v27.4s, v14.4s, v19.4s      \n"

                    "shll   v14.4s, %24.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v16.4s      \n"
                    "fmla   v29.4s, v15.4s, v17.4s      \n"
                    "fmla   v30.4s, v15.4s, v18.4s      \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"

                    "shll2  v15.4s, %21.8h, #16         \n"

                    "fmla   v24.4s, v14.4s, v17.4s      \n"
                    "fmla   v25.4s, v14.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v19.4s      \n"
                    "fmla   v27.4s, v14.4s, v20.4s      \n"

                    "shll2  v14.4s, %24.8h, #16         \n"

                    "fmla   v28.4s, v15.4s, v17.4s      \n"
                    "fmla   v29.4s, v15.4s, v18.4s      \n"
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"

                    "shll   v15.4s, %22.4h, #16         \n"

                    "fmla   v24.4s, v14.4s, v18.4s      \n"
                    "fmla   v25.4s, v14.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v20.4s      \n"
                    "fmla   v27.4s, v14.4s, v21.4s      \n"

                    "shll   v14.4s, %25.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v18.4s      \n"
                    "fmla   v29.4s, v15.4s, v19.4s      \n"
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "fmla   v31.4s, v15.4s, v21.4s      \n"
                    "shll2  v15.4s, %22.8h, #16         \n"

                    "fmla   v24.4s, v14.4s, v19.4s      \n"
                    "fmla   v25.4s, v14.4s, v20.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v21.4s      \n"
                    "fmla   v27.4s, v14.4s, v22.4s      \n"

                    "shll2  v14.4s, %25.8h, #16         \n"

                    "fmla   v28.4s, v15.4s, v19.4s      \n"
                    "fmla   v29.4s, v15.4s, v20.4s      \n"
                    "fmla   v30.4s, v15.4s, v21.4s      \n"
                    "prfm   pldl1keep, [%6, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%6], #32 \n" // r40 r41 r42 r43
                    "fmla   v31.4s, v15.4s, v22.4s      \n"

                    "shll   v15.4s, %23.4h, #16         \n"

                    "fmla   v24.4s, v14.4s, v20.4s      \n"
                    "fmla   v25.4s, v14.4s, v21.4s      \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v22.4s      \n"
                    "fmla   v27.4s, v14.4s, v23.4s      \n"
                    "shll   v14.4s, %26.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v20.4s      \n"
                    "fmla   v29.4s, v15.4s, v21.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v22.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v23.4s      \n"

                    "shll2  v15.4s, %23.8h, #16         \n"

                    "fmla   v24.4s, v14.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v25.4s, v14.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%6, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%6]      \n" // r44 r45 r46 r47
                    "fmla   v27.4s, v14.4s, v19.4s      \n"

                    "shll2  v14.4s, %26.8h, #16         \n"

                    "fmla   v28.4s, v15.4s, v16.4s      \n"
                    "fmla   v29.4s, v15.4s, v17.4s      \n"
                    "fmla   v30.4s, v15.4s, v18.4s      \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"
                    "shll   v15.4s, %24.4h, #16         \n"

                    "fmla   v24.4s, v14.4s, v17.4s      \n"
                    "fmla   v25.4s, v14.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v19.4s      \n"
                    "fmla   v27.4s, v14.4s, v20.4s      \n"
                    "shll   v14.4s, %27.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v17.4s      \n"
                    "fmla   v29.4s, v15.4s, v18.4s      \n"
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"
                    "shll2  v15.4s, %24.8h, #16         \n"

                    "fmla   v24.4s, v14.4s, v18.4s      \n"
                    "fmla   v25.4s, v14.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v20.4s      \n"
                    "fmla   v27.4s, v14.4s, v21.4s      \n"
                    "shll2  v14.4s, %27.8h, #16         \n"

                    "fmla   v28.4s, v15.4s, v18.4s      \n"
                    "fmla   v29.4s, v15.4s, v19.4s      \n"
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "fmla   v31.4s, v15.4s, v21.4s      \n"
                    "shll   v15.4s, %25.4h, #16         \n"

                    "fmla   v24.4s, v14.4s, v19.4s      \n"
                    "fmla   v25.4s, v14.4s, v20.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v21.4s      \n"
                    "fmla   v27.4s, v14.4s, v22.4s      \n"
                    "shll   v14.4s, %28.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v19.4s      \n"
                    "fmla   v29.4s, v15.4s, v20.4s      \n"
                    "fmla   v30.4s, v15.4s, v21.4s      \n"
                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%2], #32 \n" // r00 r01 r02 r03
                    "fmla   v31.4s, v15.4s, v22.4s      \n"

                    "shll2  v15.4s, %25.8h, #16         \n"

                    "fmla   v24.4s, v14.4s, v20.4s      \n"
                    "fmla   v25.4s, v14.4s, v21.4s      \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v22.4s      \n"
                    "fmla   v27.4s, v14.4s, v23.4s      \n"

                    "shll   v14.4s, %16.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v21.4s      \n"
                    "fmla   v30.4s, v15.4s, v22.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v23.4s      \n"

                    "shll2  v15.4s, %16.8h, #16         \n"

                    "fmla   v24.4s, v14.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v25.4s, v14.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%2]      \n" // r04 r05 r06 r07
                    "fmla   v27.4s, v14.4s, v19.4s      \n"

                    "shll   v14.4s, %17.4h, #16         \n"

                    "fmla   v24.4s, v15.4s, v17.4s      \n"
                    "fmla   v25.4s, v15.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v26.4s, v15.4s, v19.4s      \n"
                    "fmla   v27.4s, v15.4s, v20.4s      \n"

                    "shll2  v15.4s, %17.8h, #16         \n"

                    "fmla   v24.4s, v14.4s, v18.4s      \n"
                    "fmla   v25.4s, v14.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v20.4s      \n"
                    "fmla   v27.4s, v14.4s, v21.4s      \n"

                    "shll   v14.4s, %18.4h, #16         \n"

                    "fmla   v24.4s, v15.4s, v19.4s      \n"
                    "fmla   v25.4s, v15.4s, v20.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v26.4s, v15.4s, v21.4s      \n"
                    "prfm   pldl1keep, [%7, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%7], #32 \n" // r50 r51 r52 r53
                    "fmla   v27.4s, v15.4s, v22.4s      \n"

                    "shll   v15.4s, %26.4h, #16         \n"

                    "fmla   v24.4s, v14.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v25.4s, v14.4s, v21.4s      \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "fmla   v26.4s, v14.4s, v22.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v27.4s, v14.4s, v23.4s      \n"

                    "shll2  v14.4s, %26.8h, #16         \n"

                    "fmla   v28.4s, v15.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%7, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%7]      \n" // r54 r55 r56 r57
                    "fmla   v31.4s, v15.4s, v19.4s      \n"

                    "shll   v15.4s, %27.4h, #16         \n"

                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "fmla   v29.4s, v14.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v19.4s      \n"
                    "fmla   v31.4s, v14.4s, v20.4s      \n"

                    "shll2  v14.4s, %27.8h, #16         \n"

                    "fmla   v28.4s, v15.4s, v18.4s      \n"
                    "fmla   v29.4s, v15.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "fmla   v31.4s, v15.4s, v21.4s      \n"

                    "shll   v15.4s, %28.4h, #16         \n"

                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "fmla   v29.4s, v14.4s, v20.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v21.4s      \n"
                    "fmla   v31.4s, v14.4s, v22.4s      \n"

                    "fmla   v28.4s, v15.4s, v20.4s      \n"
                    "fmla   v29.4s, v15.4s, v21.4s      \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v22.4s      \n"
                    "fmla   v31.4s, v15.4s, v23.4s      \n"

                    "shrn   v24.4h, v24.4s, #16         \n"
                    "shrn   v25.4h, v25.4s, #16         \n"
                    "shrn   v26.4h, v26.4s, #16         \n"
                    "shrn   v27.4h, v27.4s, #16         \n"
                    "shrn   v28.4h, v28.4s, #16         \n"
                    "shrn   v29.4h, v29.4s, #16         \n"
                    "shrn   v30.4h, v30.4s, #16         \n"
                    "shrn   v31.4h, v31.4s, #16         \n"

                    "st1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%0], #32 \n"
                    "st1    {v28.4h, v29.4h, v30.4h, v31.4h}, [%1], #32 \n"

                    : "=r"(outptr0), // %0
                    "=r"(outptr1), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3),      // %5
                    "=r"(r4),      // %6
                    "=r"(r5)       // %7
                    : "0"(outptr0),
                    "1"(outptr1),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "7"(r5),
                    "w"(_k00_01), // %16
                    "w"(_k02_03), // %17
                    "w"(_k04_10), // %18
                    "w"(_k11_12), // %19
                    "w"(_k13_14), // %20
                    "w"(_k20_21), // %21
                    "w"(_k22_23), // %22
                    "w"(_k24_30), // %23
                    "w"(_k31_32), // %24
                    "w"(_k33_34), // %25
                    "w"(_k40_41), // %26
                    "w"(_k42_43), // %27
                    "w"(_k44),    // %28
                    "w"(_bias0)   // %29
                    : "memory", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            for (; j + 1 < outw; j += 2)
            {
                asm volatile(
                    "prfm   pldl1keep, [%3, #128]       \n"
                    "ld1    {v16.4h, v17.4h}, [%3], #16 \n" // r10 r11

                    "shll2  v14.4s, %18.8h, #16         \n"

                    "mov    v28.16b, %29.16b            \n" // sum00
                    "mov    v29.16b, %29.16b            \n" // sum01

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"

                    "mov    v30.16b, %29.16b            \n" // sum10
                    "mov    v31.16b, %29.16b            \n" // sum11

                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v18.4h, v19.4h, v20.4h, v21.4h}, [%3] \n" // r12 r13 r14 r15

                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v15.4s, %16.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v17.4s      \n"
                    "shll   v14.4s, %19.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v17.4s      \n"
                    "shll2  v15.4s, %16.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "fmla   v29.4s, v14.4s, v18.4s      \n"
                    "shll2  v14.4s, %19.8h, #16         \n"
                    "fmla   v30.4s, v15.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v18.4s      \n"
                    "shll   v15.4s, %17.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%4, #128]       \n"
                    "ld1    {v16.4h, v17.4h}, [%4], #16 \n" // r20 r21

                    "fmla   v29.4s, v14.4s, v19.4s      \n"
                    "shll   v14.4s, %20.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"
                    "shll2  v15.4s, %17.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "fmla   v29.4s, v14.4s, v20.4s      \n"
                    "shll2  v14.4s, %20.8h, #16         \n"
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"
                    "shll   v15.4s, %18.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v21.4s      \n"
                    "shll   v14.4s, %21.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v21.4s      \n"

                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v18.4h, v19.4h, v20.4h, v21.4h}, [%4] \n" // r22 r23 r24 r25

                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll2  v15.4s, %18.8h, #16         \n"
                    "fmla   v29.4s, v14.4s, v17.4s      \n"
                    "shll2  v14.4s, %21.8h, #16         \n"
                    "fmla   v30.4s, v15.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v17.4s      \n"
                    "shll   v15.4s, %19.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "fmla   v29.4s, v14.4s, v18.4s      \n"
                    "shll   v14.4s, %22.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v18.4s      \n"
                    "shll2  v15.4s, %19.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%5, #128]       \n"
                    "ld1    {v16.4h, v17.4h}, [%5], #16 \n" // r30 r31

                    "fmla   v29.4s, v14.4s, v19.4s      \n"
                    "shll2  v14.4s, %22.8h, #16         \n"
                    "fmla   v30.4s, v15.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"
                    "shll   v15.4s, %20.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "fmla   v29.4s, v14.4s, v20.4s      \n"
                    "shll   v14.4s, %23.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"
                    "shll2  v15.4s, %20.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v21.4s      \n"
                    "shll2  v14.4s, %23.8h, #16         \n"
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v21.4s      \n"

                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v18.4h, v19.4h, v20.4h, v21.4h}, [%5] \n" // r32 r33 r34 r35

                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v15.4s, %21.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v17.4s      \n"
                    "shll   v14.4s, %24.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v17.4s      \n"
                    "shll2  v15.4s, %21.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "fmla   v29.4s, v14.4s, v18.4s      \n"
                    "shll2  v14.4s, %24.8h, #16         \n"
                    "fmla   v30.4s, v15.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v18.4s      \n"
                    "shll   v15.4s, %22.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%6, #128]       \n"
                    "ld1    {v16.4h, v17.4h}, [%6], #16 \n" // r40 r41

                    "fmla   v29.4s, v14.4s, v19.4s      \n"
                    "shll   v14.4s, %25.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"
                    "shll2  v15.4s, %22.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "fmla   v29.4s, v14.4s, v20.4s      \n"
                    "shll2  v14.4s, %25.8h, #16         \n"
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"
                    "shll   v15.4s, %23.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v21.4s      \n"
                    "shll   v14.4s, %26.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v21.4s      \n"

                    "prfm   pldl1keep, [%6, #256]       \n"
                    "ld1    {v18.4h, v19.4h, v20.4h, v21.4h}, [%6] \n" // r42 r43 r44 r45

                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll2  v15.4s, %23.8h, #16         \n"
                    "fmla   v29.4s, v14.4s, v17.4s      \n"
                    "shll2  v14.4s, %26.8h, #16         \n"
                    "fmla   v30.4s, v15.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v17.4s      \n"
                    "shll   v15.4s, %24.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "fmla   v29.4s, v14.4s, v18.4s      \n"
                    "shll   v14.4s, %27.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v18.4s      \n"
                    "shll2  v15.4s, %24.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%2, #128]       \n"
                    "ld1    {v16.4h, v17.4h}, [%2], #16 \n" // r00 r01

                    "fmla   v29.4s, v14.4s, v19.4s      \n"
                    "shll2  v14.4s, %27.8h, #16         \n"
                    "fmla   v30.4s, v15.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"
                    "shll   v15.4s, %25.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "prfm   pldl1keep, [%7, #128]       \n"
                    "ld1    {v22.4h, v23.4h}, [%7], #16 \n" // r50 r51
                    "shll   v16.4s, v16.4h, #16         \n"

                    "fmla   v29.4s, v14.4s, v20.4s      \n"
                    "shll   v14.4s, %28.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"
                    "shll2  v15.4s, %25.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v21.4s      \n"
                    "shll   v14.4s, %16.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v21.4s      \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v18.4h, v19.4h, v20.4h, v21.4h}, [%2] \n" // r02 r03 r04 r05
                    "shll   v23.4s, v23.4h, #16         \n"

                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v15.4s, %26.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v17.4s      \n"
                    "prfm   pldl1keep, [%7, #256]       \n"
                    "ld1    {v24.4h, v25.4h, v26.4h, v27.4h}, [%7] \n" // r52 r53 r54 r55

                    "shll2  v14.4s, %16.8h, #16         \n"
                    "fmla   v30.4s, v15.4s, v22.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v23.4s      \n"
                    "shll2  v15.4s, %26.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "shll   v24.4s, v24.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v18.4s      \n"
                    "shll   v14.4s, %17.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v23.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v24.4s      \n"
                    "shll   v15.4s, %27.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "shll   v25.4s, v25.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v19.4s      \n"
                    "shll2  v14.4s, %17.8h, #16         \n"
                    "fmla   v30.4s, v15.4s, v24.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v25.4s      \n"
                    "shll2  v15.4s, %27.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "shll   v26.4s, v26.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v20.4s      \n"
                    "shll   v14.4s, %18.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v25.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v26.4s      \n"
                    "shll   v15.4s, %28.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "fmla   v29.4s, v14.4s, v21.4s      \n"
                    "shll   v27.4s, v27.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v26.4s      \n"
                    "fmla   v31.4s, v15.4s, v27.4s      \n"

                    "shrn   v28.4h, v28.4s, #16         \n"
                    "shrn   v29.4h, v29.4s, #16         \n"
                    "shrn   v30.4h, v30.4s, #16         \n"
                    "shrn   v31.4h, v31.4s, #16         \n"

                    "st1    {v28.4h, v29.4h}, [%0], #16 \n"
                    "st1    {v30.4h, v31.4h}, [%1], #16 \n"

                    : "=r"(outptr0), // %0
                    "=r"(outptr1), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3),      // %5
                    "=r"(r4),      // %6
                    "=r"(r5)       // %7
                    : "0"(outptr0),
                    "1"(outptr1),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "7"(r5),
                    "w"(_k00_01), // %16
                    "w"(_k02_03), // %17
                    "w"(_k04_10), // %18
                    "w"(_k11_12), // %19
                    "w"(_k13_14), // %20
                    "w"(_k20_21), // %21
                    "w"(_k22_23), // %22
                    "w"(_k24_30), // %23
                    "w"(_k31_32), // %24
                    "w"(_k33_34), // %25
                    "w"(_k40_41), // %26
                    "w"(_k42_43), // %27
                    "w"(_k44),    // %28
                    "w"(_bias0)   // %29
                    : "memory", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            // Tail loop of the two-output-row path: one output column per iteration.
            // Each iteration stores one 4-lane 16-bit vector to outptr0 and one to outptr1.
            //
            // Register roles inside the asm:
            //   v14 / v15  - current kernel tap widened to fp32 (v14 feeds the outptr0 sums,
            //                v15 feeds the outptr1 sums)
            //   v16..v25   - input vectors widened to fp32
            //   v28 / v30  - two partial sums for the outptr0 result (added together at the end)
            //   v29 / v31  - two partial sums for the outptr1 result (added together at the end)
            //
            // Every kernel operand (%16..%28) carries eight 16-bit lanes, i.e. two 4-lane taps:
            // `shll` widens the low tap, `shll2` widens the high tap. Shifting left by 16 places
            // the 16-bit pattern in the upper half of a 32-bit lane, which the following fmul/fmla
            // instructions consume as fp32 (bf16 -> fp32 widening).
            for (; j < outw; j++)
            {
                asm volatile(
                    // Row r1: the first load post-increments the pointer by one vector (8 bytes);
                    // the following four vectors are read without writeback.
                    "prfm   pldl1keep, [%3, #64]        \n"
                    "ld1    {v16.4h}, [%3], #8          \n" // r10

                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v17.4h, v18.4h, v19.4h, v20.4h}, [%3] \n" // r11 r12 r13 r14

                    // v30/v31 start from the bias; v28/v29 are started by the fmul pair below.
                    "mov    v30.16b, %29.16b            \n" // sum00
                    "mov    v31.16b, %29.16b            \n" // sum10

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "shll   v20.4s, v20.4h, #16         \n"

                    // r1 * k10..k14 -> outptr0 sums, r1 * k00..k04 -> outptr1 sums.
                    "shll2  v14.4s, %18.8h, #16         \n"
                    "shll   v15.4s, %16.4h, #16         \n"
                    "fmul   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v14.4s, %19.4h, #16         \n"
                    "fmul   v29.4s, v15.4s, v16.4s      \n"
                    "shll2  v15.4s, %16.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v17.4s      \n"
                    "shll2  v14.4s, %19.8h, #16         \n"
                    "fmla   v31.4s, v15.4s, v17.4s      \n"
                    "shll   v15.4s, %17.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "shll   v14.4s, %20.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v18.4s      \n"
                    "shll2  v15.4s, %17.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v19.4s      \n"
                    "shll2  v14.4s, %20.8h, #16         \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"
                    "shll   v15.4s, %18.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "shll   v14.4s, %21.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v20.4s      \n"

                    // Row r2: r2 * k20..k24 -> outptr0 sums, r2 * k10..k14 -> outptr1 sums.
                    "prfm   pldl1keep, [%4, #64]        \n"
                    "ld1    {v16.4h}, [%4], #8          \n" // r20

                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v17.4h, v18.4h, v19.4h, v20.4h}, [%4] \n" // r21 r22 r23 r24

                    "shll2  v15.4s, %18.8h, #16         \n"

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "shll   v20.4s, v20.4h, #16         \n"

                    "fmla   v30.4s, v14.4s, v16.4s      \n"
                    "shll2  v14.4s, %21.8h, #16         \n"
                    "fmla   v31.4s, v15.4s, v16.4s      \n"
                    "shll   v15.4s, %19.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "shll   v14.4s, %22.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v17.4s      \n"
                    "shll2  v15.4s, %19.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v18.4s      \n"
                    "shll2  v14.4s, %22.8h, #16         \n"
                    "fmla   v31.4s, v15.4s, v18.4s      \n"
                    "shll   v15.4s, %20.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "shll   v14.4s, %23.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v19.4s      \n"
                    "shll2  v15.4s, %20.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v20.4s      \n"
                    "shll2  v14.4s, %23.8h, #16         \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"

                    // Row r3: r3 * k30..k34 -> outptr0 sums, r3 * k20..k24 -> outptr1 sums.
                    "prfm   pldl1keep, [%5, #64]        \n"
                    "ld1    {v16.4h}, [%5], #8          \n" // r30

                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v17.4h, v18.4h, v19.4h, v20.4h}, [%5] \n" // r31 r32 r33 r34

                    "shll   v15.4s, %21.4h, #16         \n"

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "shll   v20.4s, v20.4h, #16         \n"

                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v14.4s, %24.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v16.4s      \n"
                    "shll2  v15.4s, %21.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v17.4s      \n"
                    "shll2  v14.4s, %24.8h, #16         \n"
                    "fmla   v31.4s, v15.4s, v17.4s      \n"
                    "shll   v15.4s, %22.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "shll   v14.4s, %25.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v18.4s      \n"
                    "shll2  v15.4s, %22.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v19.4s      \n"
                    "shll2  v14.4s, %25.8h, #16         \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"
                    "shll   v15.4s, %23.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "shll   v14.4s, %26.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v20.4s      \n"

                    // Row r4: r4 * k40..k44 -> outptr0 sums, r4 * k30..k34 -> outptr1 sums.
                    "prfm   pldl1keep, [%6, #64]        \n"
                    "ld1    {v16.4h}, [%6], #8          \n" // r40

                    "prfm   pldl1keep, [%6, #256]       \n"
                    "ld1    {v17.4h, v18.4h, v19.4h, v20.4h}, [%6] \n" // r41 r42 r43 r44

                    "shll2  v15.4s, %23.8h, #16         \n"

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "shll   v20.4s, v20.4h, #16         \n"

                    "fmla   v30.4s, v14.4s, v16.4s      \n"
                    "shll2  v14.4s, %26.8h, #16         \n"
                    "fmla   v31.4s, v15.4s, v16.4s      \n"
                    "shll   v15.4s, %24.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "shll   v14.4s, %27.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v17.4s      \n"
                    "shll2  v15.4s, %24.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v18.4s      \n"
                    "shll2  v14.4s, %27.8h, #16         \n"
                    "fmla   v31.4s, v15.4s, v18.4s      \n"
                    "shll   v15.4s, %25.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "shll   v14.4s, %28.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v19.4s      \n"
                    "shll2  v15.4s, %25.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v20.4s      \n"
                    "shll   v14.4s, %16.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"

                    // Rows r0 and r5 each feed only one output:
                    // r0 * k00..k04 -> outptr0 sums, r5 * k40..k44 -> outptr1 sums.
                    "prfm   pldl1keep, [%2, #64]        \n"
                    "ld1    {v16.4h}, [%2], #8          \n" // r00

                    "prfm   pldl1keep, [%7, #64]        \n"
                    "ld1    {v21.4h}, [%7], #8          \n" // r50

                    "prfm   pldl1keep, [%7, #256]       \n"
                    "ld1    {v22.4h, v23.4h, v24.4h, v25.4h}, [%7] \n" // r51 r52 r53 r54

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v17.4h, v18.4h, v19.4h, v20.4h}, [%2] \n" // r01 r02 r03 r04

                    "shll   v15.4s, %26.4h, #16         \n"

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v21.4s, v21.4h, #16         \n"

                    "shll   v17.4s, v17.4h, #16         \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "shll   v20.4s, v20.4h, #16         \n"

                    "shll   v22.4s, v22.4h, #16         \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "shll   v24.4s, v24.4h, #16         \n"
                    "shll   v25.4s, v25.4h, #16         \n"

                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll2  v14.4s, %16.8h, #16         \n"
                    "fmla   v29.4s, v15.4s, v21.4s      \n"
                    "shll2  v15.4s, %26.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v17.4s      \n"
                    "shll   v14.4s, %17.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v22.4s      \n"
                    "shll   v15.4s, %27.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "shll2  v14.4s, %17.8h, #16         \n"
                    "fmla   v29.4s, v15.4s, v23.4s      \n"
                    "shll2  v15.4s, %27.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v19.4s      \n"
                    "shll   v14.4s, %18.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v24.4s      \n"
                    "shll   v15.4s, %28.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "fmla   v29.4s, v15.4s, v25.4s      \n"

                    // Merge the two partial sums of each output row.
                    "fadd   v30.4s, v30.4s, v28.4s      \n"
                    "fadd   v31.4s, v31.4s, v29.4s      \n"

                    // Narrow fp32 -> 16-bit by keeping the upper 16 bits of each lane (truncation).
                    "shrn   v30.4h, v30.4s, #16         \n"
                    "shrn   v31.4h, v31.4s, #16         \n"

                    // Store one vector per output row; both output pointers post-increment by 8 bytes.
                    "st1    {v30.4h}, [%0], #8          \n"
                    "st1    {v31.4h}, [%1], #8          \n"

                    // Outputs %0..%7 are the pointers updated by the post-increment addressing above.
                    : "=r"(outptr0), // %0
                    "=r"(outptr1), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3),      // %5
                    "=r"(r4),      // %6
                    "=r"(r5)       // %7
                    // Inputs %8..%15 use matching constraints to seed each pointer register with its
                    // current value, which is why the vector operands below start at %16.
                    : "0"(outptr0),
                    "1"(outptr1),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "7"(r5),
                    "w"(_k00_01), // %16
                    "w"(_k02_03), // %17
                    "w"(_k04_10), // %18
                    "w"(_k11_12), // %19
                    "w"(_k13_14), // %20
                    "w"(_k20_21), // %21
                    "w"(_k22_23), // %22
                    "w"(_k24_30), // %23
                    "w"(_k31_32), // %24
                    "w"(_k33_34), // %25
                    "w"(_k40_41), // %26
                    "w"(_k42_43), // %27
                    "w"(_k44),    // %28
                    "w"(_bias0)   // %29
                    // The clobber list is shared with the wider kernels; this variant itself only
                    // writes v14..v25 and v28..v31.
                    : "memory", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }

            // Move every input row pointer forward by 4 * 4 + w * 4 elements.
            // NOTE(review): presumably 4 * 4 skips the four trailing pack-4 columns left over in
            // the current input row (w - outw for a 5-wide kernel) and w * 4 skips one further
            // full row, because this path emits two output rows per pass - confirm against the
            // definitions of w and outw, which are outside this view.
            r0 += 4 * 4 + w * 4;
            r1 += 4 * 4 + w * 4;
            r2 += 4 * 4 + w * 4;
            r3 += 4 * 4 + w * 4;
            r4 += 4 * 4 + w * 4;
            r5 += 4 * 4 + w * 4;

            // Each output pointer advances by a further outw * 4 elements.
            // NOTE(review): presumably this steps over the row the other output pointer has just
            // written - confirm against how outptr0 / outptr1 are initialised.
            outptr0 += outw * 4;
            outptr1 += outw * 4;
        }
#endif // __aarch64__
        for (; i < outh; i++)
        {
            int j = 0;

            for (; j + 3 < outw; j += 4)
            {
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%1], #32 \n" // r00 r01 r02 r03

                    "shll   v14.4s, %12.4h, #16         \n"

                    "mov    v28.16b, %25.16b            \n" // sum00
                    "mov    v29.16b, %25.16b            \n" // sum01

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"

                    "mov    v30.16b, %25.16b            \n" // sum02
                    "mov    v31.16b, %25.16b            \n" // sum03

                    "shll2  v15.4s, %12.8h, #16         \n"

                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%1]      \n" // r04 r05 r06 r07
                    "fmla   v31.4s, v14.4s, v19.4s      \n"

                    "shll   v14.4s, %13.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v17.4s      \n"
                    "fmla   v29.4s, v15.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"

                    "shll2  v15.4s, %13.8h, #16         \n"

                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "fmla   v29.4s, v14.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v20.4s      \n"
                    "fmla   v31.4s, v14.4s, v21.4s      \n"

                    "shll   v14.4s, %14.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v19.4s      \n"
                    "fmla   v29.4s, v15.4s, v20.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v21.4s      \n"
                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%2], #32 \n" // r10 r11 r12 r13
                    "fmla   v31.4s, v15.4s, v22.4s      \n"

                    "shll2  v15.4s, %14.8h, #16         \n"

                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v21.4s      \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v22.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v31.4s, v14.4s, v23.4s      \n"

                    "shll   v14.4s, %15.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%2]      \n" // r14 r15 r16 r17
                    "fmla   v31.4s, v15.4s, v19.4s      \n"

                    "shll2  v15.4s, %15.8h, #16         \n"

                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "fmla   v29.4s, v14.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v19.4s      \n"
                    "fmla   v31.4s, v14.4s, v20.4s      \n"

                    "shll   v14.4s, %16.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v18.4s      \n"
                    "fmla   v29.4s, v15.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "fmla   v31.4s, v15.4s, v21.4s      \n"

                    "shll2  v15.4s, %16.8h, #16         \n"

                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "fmla   v29.4s, v14.4s, v20.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v21.4s      \n"
                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%3], #32 \n" // r20 r21 r22 r23
                    "fmla   v31.4s, v14.4s, v22.4s      \n"

                    "shll   v14.4s, %17.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v21.4s      \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v22.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v23.4s      \n"

                    "shll2  v15.4s, %17.8h, #16         \n"

                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%3]      \n" // r24 r25 r26 r27
                    "fmla   v31.4s, v14.4s, v19.4s      \n"

                    "shll   v14.4s, %18.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v17.4s      \n"
                    "fmla   v29.4s, v15.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"

                    "shll2  v15.4s, %18.8h, #16         \n"

                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "fmla   v29.4s, v14.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v20.4s      \n"
                    "fmla   v31.4s, v14.4s, v21.4s      \n"

                    "shll   v14.4s, %19.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v19.4s      \n"
                    "fmla   v29.4s, v15.4s, v20.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v21.4s      \n"
                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%4], #32 \n" // r30 r31 r32 r33
                    "fmla   v31.4s, v15.4s, v22.4s      \n"

                    "shll2  v15.4s, %19.8h, #16         \n"

                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v21.4s      \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v22.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v31.4s, v14.4s, v23.4s      \n"

                    "shll   v14.4s, %20.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%4]      \n" // r34 r35 r36 r37
                    "fmla   v31.4s, v15.4s, v19.4s      \n"

                    "shll2  v15.4s, %20.8h, #16         \n"

                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "fmla   v29.4s, v14.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v19.4s      \n"
                    "fmla   v31.4s, v14.4s, v20.4s      \n"

                    "shll   v14.4s, %21.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v18.4s      \n"
                    "fmla   v29.4s, v15.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "fmla   v31.4s, v15.4s, v21.4s      \n"

                    "shll2  v15.4s, %21.8h, #16         \n"

                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "fmla   v29.4s, v14.4s, v20.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v21.4s      \n"
                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%5], #32 \n" // r40 r41 r42 r43
                    "fmla   v31.4s, v14.4s, v22.4s      \n"

                    "shll   v14.4s, %22.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v21.4s      \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v22.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v23.4s      \n"

                    "shll2  v15.4s, %22.8h, #16         \n"

                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%5]      \n" // r44 r45 r46 r47
                    "fmla   v31.4s, v14.4s, v19.4s      \n"

                    "shll   v14.4s, %23.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v17.4s      \n"
                    "fmla   v29.4s, v15.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"

                    "shll2  v15.4s, %23.8h, #16         \n"

                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "fmla   v29.4s, v14.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v20.4s      \n"
                    "fmla   v31.4s, v14.4s, v21.4s      \n"

                    "shll   v14.4s, %24.4h, #16         \n"

                    "fmla   v28.4s, v15.4s, v19.4s      \n"
                    "fmla   v29.4s, v15.4s, v20.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v21.4s      \n"
                    "fmla   v31.4s, v15.4s, v22.4s      \n"

                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "fmla   v29.4s, v14.4s, v21.4s      \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v22.4s      \n"
                    "fmla   v31.4s, v14.4s, v23.4s      \n"

                    "shrn   v28.4h, v28.4s, #16         \n"
                    "shrn   v29.4h, v29.4s, #16         \n"
                    "shrn   v30.4h, v30.4s, #16         \n"
                    "shrn   v31.4h, v31.4s, #16         \n"

                    "st1    {v28.4h, v29.4h, v30.4h, v31.4h}, [%0], #32 \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2),      // %3
                    "=r"(r3),      // %4
                    "=r"(r4)       // %5
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "4"(r3),
                    "5"(r4),
                    "w"(_k00_01), // %12
                    "w"(_k02_03), // %13
                    "w"(_k04_10), // %14
                    "w"(_k11_12), // %15
                    "w"(_k13_14), // %16
                    "w"(_k20_21), // %17
                    "w"(_k22_23), // %18
                    "w"(_k24_30), // %19
                    "w"(_k31_32), // %20
                    "w"(_k33_34), // %21
                    "w"(_k40_41), // %22
                    "w"(_k42_43), // %23
                    "w"(_k44),    // %24
                    "w"(_bias0)   // %25
                    : "memory", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
                // ARMv7 NEON path: one execution produces four adjacent outputs (each a 4-lane
                // vector) by accumulating the 25 taps k00..k44 over the five input rows %2..%6.
                //  - q12..q15 are the four accumulators; all of them start from the 4 floats
                //    loaded from %1 (bias0_data_ptr), which is read without writeback.
                //  - Inputs and weights are stored as 16-bit values. vshll.u16 #16 moves each
                //    value into the upper half of a 32-bit lane so it can be used directly as
                //    float32 (bfloat16-style widening); vshrn.u32 #16 keeps only the upper
                //    16 bits of each result (plain truncation, no rounding) before the store.
                //  - Per row, the first load (with "!") fetches 4 pixels and advances the row
                //    pointer by 32 bytes; the second load fetches the next 4 pixels without
                //    writeback, so 8 input pixels are visible for the 4 outputs.
                //  - %7 (kptr) is advanced by six 32-byte post-incremented loads and rewound by
                //    192 bytes at the end, so its value is unchanged after the statement.
                //  - The ":64" qualifiers assert 8-byte alignment of the accessed address.
                //  - Outputs use "=r" tied to matching-number inputs, i.e. every pointer is a
                //    read-write operand living in a single register.
                asm volatile(
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"

                    "pld        [%2, #256]          \n"
                    "vld1.u16   {d4-d7}, [%2 :64]!  \n" // r00 r01 r02 r03

                    "vshll.u16  q8, d20, #16        \n" // k00

                    "pld        [%1, #128]          \n"
                    "vld1.f32   {d24-d25}, [%1]     \n" // q12 = 4 floats at %1 (initial accumulator value)

                    "vshll.u16  q0, d4, #16         \n"
                    "vshll.u16  q1, d5, #16         \n"

                    "vmov       q13, q12            \n" // sum0 sum1
                    "vmov       q14, q12            \n"

                    "vshll.u16  q9, d21, #16        \n" // k01

                    "vmov       q15, q12            \n" // sum2 sum3

                    "vmla.f32   q12, q8, q0         \n"
                    "vshll.u16  q2, d6, #16         \n"
                    "vmla.f32   q13, q8, q1         \n"
                    "vshll.u16  q3, d7, #16         \n"
                    "vmla.f32   q14, q8, q2         \n"
                    "pld        [%2, #256]          \n"
                    "vld1.u16   {d12-d15}, [%2 :64] \n" // r04 r05 r06 r07

                    "vmla.f32   q15, q8, q3         \n"

                    "vshll.u16  q10, d22, #16       \n" // k02
                    "vmla.f32   q12, q9, q1         \n"
                    "vmla.f32   q13, q9, q2         \n"
                    "vshll.u16  q4, d12, #16        \n"
                    "vmla.f32   q14, q9, q3         \n"
                    "vmla.f32   q15, q9, q4         \n"

                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"

                    "vshll.u16  q11, d23, #16       \n" // k03
                    "vmla.f32   q12, q10, q2        \n"
                    "vmla.f32   q13, q10, q3        \n"
                    "vshll.u16  q5, d13, #16        \n"
                    "vmla.f32   q14, q10, q4        \n"
                    "vmla.f32   q15, q10, q5        \n"

                    "vshll.u16  q10, d16, #16       \n" // k04
                    "vmla.f32   q12, q11, q3        \n"
                    "vmla.f32   q13, q11, q4        \n"
                    "vshll.u16  q6, d14, #16        \n"
                    "vmla.f32   q14, q11, q5        \n"
                    "pld        [%3, #256]          \n"
                    "vld1.u16   {d4-d7}, [%3 :64]!  \n" // r10 r11 r12 r13

                    "vmla.f32   q15, q11, q6        \n"

                    "vshll.u16  q11, d17, #16       \n" // k10

                    "vmla.f32   q12, q10, q4        \n"
                    "vshll.u16  q0, d4, #16         \n"
                    "vmla.f32   q13, q10, q5        \n"
                    "vshll.u16  q7, d15, #16        \n"
                    "vmla.f32   q14, q10, q6        \n"
                    "vshll.u16  q1, d5, #16         \n"
                    "vmla.f32   q15, q10, q7        \n"

                    "vshll.u16  q8, d18, #16        \n" // k11
                    "vmla.f32   q12, q11, q0        \n"
                    "vshll.u16  q2, d6, #16         \n"
                    "vmla.f32   q13, q11, q1        \n"
                    "vshll.u16  q3, d7, #16         \n"
                    "vmla.f32   q14, q11, q2        \n"
                    "pld        [%3, #256]          \n"
                    "vld1.u16   {d12-d15}, [%3 :64] \n" // r14 r15 r16 r17

                    "vmla.f32   q15, q11, q3        \n"

                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"

                    "vshll.u16  q9, d19, #16        \n" // k12
                    "vmla.f32   q12, q8, q1         \n"
                    "vmla.f32   q13, q8, q2         \n"
                    "vshll.u16  q4, d12, #16        \n"
                    "vmla.f32   q14, q8, q3         \n"
                    "vmla.f32   q15, q8, q4         \n"

                    "vshll.u16  q8, d20, #16        \n" // k13
                    "vmla.f32   q12, q9, q2         \n"
                    "vmla.f32   q13, q9, q3         \n"
                    "vshll.u16  q5, d13, #16        \n"
                    "vmla.f32   q14, q9, q4         \n"
                    "vmla.f32   q15, q9, q5         \n"

                    "vshll.u16  q9, d21, #16        \n" // k14

                    "vmla.f32   q12, q8, q3         \n"
                    "vmla.f32   q13, q8, q4         \n"
                    "vshll.u16  q6, d14, #16        \n"
                    "vmla.f32   q14, q8, q5         \n"
                    "pld        [%4, #256]          \n"
                    "vld1.u16   {d4-d7}, [%4 :64]!  \n" // r20 r21 r22 r23

                    "vmla.f32   q15, q8, q6         \n"

                    "vshll.u16  q10, d22, #16       \n" // k20
                    "vmla.f32   q12, q9, q4         \n"
                    "vshll.u16  q0, d4, #16         \n"
                    "vmla.f32   q13, q9, q5         \n"
                    "vshll.u16  q7, d15, #16        \n"
                    "vmla.f32   q14, q9, q6         \n"
                    "vshll.u16  q1, d5, #16         \n"
                    "vmla.f32   q15, q9, q7         \n"

                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"

                    "vshll.u16  q11, d23, #16       \n" // k21
                    "vmla.f32   q12, q10, q0        \n"
                    "vshll.u16  q2, d6, #16         \n"
                    "vmla.f32   q13, q10, q1        \n"
                    "vshll.u16  q3, d7, #16         \n"
                    "vmla.f32   q14, q10, q2        \n"
                    "pld        [%4, #256]          \n"
                    "vld1.u16   {d12-d15}, [%4 :64] \n" // r24 r25 r26 r27

                    "vmla.f32   q15, q10, q3        \n"

                    "vshll.u16  q10, d16, #16       \n" // k22
                    "vmla.f32   q12, q11, q1        \n"
                    "vmla.f32   q13, q11, q2        \n"
                    "vshll.u16  q4, d12, #16        \n"
                    "vmla.f32   q14, q11, q3        \n"
                    "vmla.f32   q15, q11, q4        \n"

                    "vshll.u16  q11, d17, #16       \n" // k23

                    "vmla.f32   q12, q10, q2        \n"
                    "vmla.f32   q13, q10, q3        \n"
                    "vshll.u16  q5, d13, #16        \n"
                    "vmla.f32   q14, q10, q4        \n"
                    "vmla.f32   q15, q10, q5        \n"

                    "vshll.u16  q8, d18, #16        \n" // k24
                    "vmla.f32   q12, q11, q3        \n"
                    "vmla.f32   q13, q11, q4        \n"
                    "vshll.u16  q6, d14, #16        \n"
                    "vmla.f32   q14, q11, q5        \n"
                    "pld        [%5, #256]          \n"
                    "vld1.u16   {d4-d7}, [%5 :64]!  \n" // r30 r31 r32 r33

                    "vmla.f32   q15, q11, q6        \n"

                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"

                    "vshll.u16  q9, d19, #16        \n" // k30
                    "vmla.f32   q12, q8, q4         \n"
                    "vshll.u16  q0, d4, #16         \n"
                    "vmla.f32   q13, q8, q5         \n"
                    "vshll.u16  q7, d15, #16        \n"
                    "vmla.f32   q14, q8, q6         \n"
                    "vshll.u16  q1, d5, #16         \n"
                    "vmla.f32   q15, q8, q7         \n"

                    "vshll.u16  q8, d20, #16        \n" // k31
                    "vmla.f32   q12, q9, q0         \n"
                    "vshll.u16  q2, d6, #16         \n"
                    "vmla.f32   q13, q9, q1         \n"
                    "vshll.u16  q3, d7, #16         \n"
                    "vmla.f32   q14, q9, q2         \n"
                    "pld        [%5, #256]          \n"
                    "vld1.u16   {d12-d15}, [%5 :64] \n" // r34 r35 r36 r37

                    "vmla.f32   q15, q9, q3         \n"

                    "vshll.u16  q9, d21, #16        \n" // k32

                    "vmla.f32   q12, q8, q1         \n"
                    "vmla.f32   q13, q8, q2         \n"
                    "vshll.u16  q4, d12, #16        \n"
                    "vmla.f32   q14, q8, q3         \n"
                    "vmla.f32   q15, q8, q4         \n"

                    "vshll.u16  q10, d22, #16       \n" // k33
                    "vmla.f32   q12, q9, q2         \n"
                    "vmla.f32   q13, q9, q3         \n"
                    "vshll.u16  q5, d13, #16        \n"
                    "vmla.f32   q14, q9, q4         \n"
                    "vmla.f32   q15, q9, q5         \n"

                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"

                    "vmla.f32   q12, q10, q3        \n"
                    "vshll.u16  q11, d23, #16       \n" // k34
                    "vmla.f32   q13, q10, q4        \n"
                    "vshll.u16  q6, d14, #16        \n"
                    "vmla.f32   q14, q10, q5        \n"
                    "pld        [%6, #256]          \n"
                    "vld1.u16   {d4-d7}, [%6 :64]!  \n" // r40 r41 r42 r43

                    "vmla.f32   q15, q10, q6        \n"

                    "vshll.u16  q10, d16, #16       \n" // k40
                    "vmla.f32   q12, q11, q4        \n"
                    "vshll.u16  q0, d4, #16         \n"
                    "vmla.f32   q13, q11, q5        \n"
                    "vshll.u16  q7, d15, #16        \n"
                    "vmla.f32   q14, q11, q6        \n"
                    "vshll.u16  q1, d5, #16         \n"
                    "vmla.f32   q15, q11, q7        \n"

                    "vshll.u16  q11, d17, #16       \n" // k41

                    "vmla.f32   q12, q10, q0        \n"
                    "vshll.u16  q2, d6, #16         \n"
                    "vmla.f32   q13, q10, q1        \n"
                    "vshll.u16  q3, d7, #16         \n"
                    "vmla.f32   q14, q10, q2        \n"
                    "pld        [%6, #256]          \n"
                    "vld1.u16   {d12-d15}, [%6 :64] \n" // r44 r45 r46 r47

                    "vmla.f32   q15, q10, q3        \n"

                    "vshll.u16  q8, d18, #16        \n" // k42
                    "vmla.f32   q12, q11, q1        \n"
                    "vmla.f32   q13, q11, q2        \n"
                    "vshll.u16  q4, d12, #16        \n"
                    "vmla.f32   q14, q11, q3        \n"
                    "vmla.f32   q15, q11, q4        \n"

                    // last weight vector (k44): a single 8-byte load, no writeback
                    "pld        [%7, #64]           \n"
                    "vld1.u16   {d20}, [%7 :64]     \n"

                    "vmla.f32   q12, q8, q2         \n"
                    "vshll.u16  q9, d19, #16        \n" // k43
                    "vmla.f32   q13, q8, q3         \n"
                    "vshll.u16  q5, d13, #16        \n"
                    "vmla.f32   q14, q8, q4         \n"
                    "vmla.f32   q15, q8, q5         \n"

                    "vshll.u16  q8, d20, #16        \n" // k44

                    "vmla.f32   q12, q9, q3         \n"
                    "vmla.f32   q13, q9, q4         \n"
                    "vshll.u16  q6, d14, #16        \n"
                    "vmla.f32   q14, q9, q5         \n"
                    "vmla.f32   q15, q9, q6         \n"

                    "vmla.f32   q12, q8, q4         \n"
                    "vmla.f32   q13, q8, q5         \n"
                    "vshll.u16  q7, d15, #16        \n"
                    "vmla.f32   q14, q8, q6         \n"
                    "vmla.f32   q15, q8, q7         \n"

                    "sub        %7, %7, #192        \n" // kptr -= 24 * 4; (undoes the six 32-byte post-increments)

                    // narrow the four float32 sums to their upper 16 bits, packed into d24..d27
                    "vshrn.u32  d24, q12, #16       \n"
                    "vshrn.u32  d25, q13, #16       \n"
                    "vshrn.u32  d26, q14, #16       \n"
                    "vshrn.u32  d27, q15, #16       \n"

                    "vst1.u16   {d24-d27}, [%0 :64]! \n" // store 4 outputs (32 bytes), advance outptr0

                    : "=r"(outptr0),        // %0
                    "=r"(bias0_data_ptr), // %1
                    "=r"(r0),             // %2
                    "=r"(r1),             // %3
                    "=r"(r2),             // %4
                    "=r"(r3),             // %5
                    "=r"(r4),             // %6
                    "=r"(kptr)            // %7
                    : "0"(outptr0),
                    "1"(bias0_data_ptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "7"(kptr)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
            }
            for (; j + 1 < outw; j += 2)
            {
#if __aarch64__
                // AArch64 path: one execution produces two adjacent outputs (each a 4-lane
                // vector) from the 25 taps k00..k44 over the five input rows %1..%5.
                //  - %12..%24 hold the weights as packed 16-bit lanes: shll widens the low four
                //    lanes and shll2 the high four, so one register supplies two consecutive
                //    taps. Only the low half of %24 is read (k44).
                //  - shll/shll2 #16 place each 16-bit value in the upper half of a 32-bit lane so
                //    it can be used as float32 (bfloat16-style widening); shrn #16 keeps only the
                //    upper 16 bits of each sum (plain truncation, no rounding).
                //  - Four partial sums are kept to shorten dependency chains: v30/v31 start from
                //    the bias in %25, v28/v29 are started by fmul. Taps alternate between the
                //    two pairs and the pairs are merged with fadd before narrowing.
                //  - Per row, the first load advances the row pointer by 16 bytes (2 pixels);
                //    the second load reads the following 4 pixels without writeback.
                //  - Outputs use "=r" tied to matching-number inputs, i.e. every pointer is a
                //    read-write operand living in a single register.
                //  - v22..v27 appear in the clobber list but are not referenced by this statement.
                asm volatile(
                    "prfm   pldl1keep, [%1, #128]       \n"
                    "ld1    {v16.4h, v17.4h}, [%1], #16 \n" // r00 r01

                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v18.4h, v19.4h, v20.4h, v21.4h}, [%1] \n" // r02 r03 r04 r05

                    "shll   v14.4s, %12.4h, #16         \n" // k00

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"

                    "mov    v30.16b, %25.16b            \n" // sum01
                    "mov    v31.16b, %25.16b            \n" // sum02

                    "shll2  v15.4s, %12.8h, #16         \n" // k01
                    "fmul   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmul   v29.4s, v14.4s, v17.4s      \n"
                    "shll   v14.4s, %13.4h, #16         \n" // k02
                    "fmla   v30.4s, v15.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%2, #128]       \n"
                    "ld1    {v16.4h, v17.4h}, [%2], #16 \n" // r10 r11
                    "shll2  v15.4s, %13.8h, #16         \n" // k03
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v19.4s      \n"
                    "shll   v14.4s, %14.4h, #16         \n" // k04
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"
                    "shll2  v15.4s, %14.8h, #16         \n" // k10
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v21.4s      \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v18.4h, v19.4h, v20.4h, v21.4h}, [%2] \n" // r12 r13 r14 r15

                    "shll   v14.4s, %15.4h, #16         \n" // k11
                    "shll   v17.4s, v17.4h, #16         \n"

                    "fmla   v30.4s, v15.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v17.4s      \n"
                    "shll2  v15.4s, %15.8h, #16         \n" // k12
                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%3, #128]       \n"
                    "ld1    {v16.4h, v17.4h}, [%3], #16 \n" // r20 r21
                    "shll   v14.4s, %16.4h, #16         \n" // k13
                    "fmla   v30.4s, v15.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"
                    "shll2  v15.4s, %16.8h, #16         \n" // k14
                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v20.4s      \n"
                    "shll   v14.4s, %17.4h, #16         \n" // k20
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v21.4s      \n"

                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v18.4h, v19.4h, v20.4h, v21.4h}, [%3] \n" // r22 r23 r24 r25

                    "shll2  v15.4s, %17.8h, #16         \n" // k21
                    "shll   v17.4s, v17.4h, #16         \n"

                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v17.4s      \n"
                    "shll   v14.4s, %18.4h, #16         \n" // k22
                    "fmla   v30.4s, v15.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%4, #128]       \n"
                    "ld1    {v16.4h, v17.4h}, [%4], #16 \n" // r30 r31
                    "shll2  v15.4s, %18.8h, #16         \n" // k23
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v19.4s      \n"
                    "shll   v14.4s, %19.4h, #16         \n" // k24
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"
                    "shll2  v15.4s, %19.8h, #16         \n" // k30
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v21.4s      \n"

                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v18.4h, v19.4h, v20.4h, v21.4h}, [%4] \n" // r32 r33 r34 r35

                    "shll   v14.4s, %20.4h, #16         \n" // k31
                    "shll   v17.4s, v17.4h, #16         \n"

                    "fmla   v30.4s, v15.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v17.4s      \n"
                    "shll2  v15.4s, %20.8h, #16         \n" // k32
                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v18.4s      \n"
                    "prfm   pldl1keep, [%5, #128]       \n"
                    "ld1    {v16.4h, v17.4h}, [%5], #16 \n" // r40 r41
                    "shll   v14.4s, %21.4h, #16         \n" // k33
                    "fmla   v30.4s, v15.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"
                    "shll2  v15.4s, %21.8h, #16         \n" // k34
                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v20.4s      \n"
                    "shll   v14.4s, %22.4h, #16         \n" // k40
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v21.4s      \n"

                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v18.4h, v19.4h, v20.4h, v21.4h}, [%5] \n" // r42 r43 r44 r45

                    "shll2  v15.4s, %22.8h, #16         \n" // k41
                    "shll   v17.4s, v17.4h, #16         \n"

                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v17.4s      \n"
                    "shll   v14.4s, %23.4h, #16         \n" // k42
                    "fmla   v30.4s, v15.4s, v17.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v18.4s      \n"
                    "shll2  v15.4s, %23.8h, #16         \n" // k43
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v19.4s      \n"
                    "shll   v14.4s, %24.4h, #16         \n" // k44
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "fmla   v29.4s, v14.4s, v21.4s      \n"

                    // merge the two partial-sum pairs: v30 = output 0, v31 = output 1
                    "fadd   v30.4s, v30.4s, v28.4s      \n"
                    "fadd   v31.4s, v31.4s, v29.4s      \n"

                    // keep the upper 16 bits of every float32 lane
                    "shrn   v30.4h, v30.4s, #16         \n"
                    "shrn   v31.4h, v31.4s, #16         \n"

                    "st1    {v30.4h, v31.4h}, [%0], #16 \n" // store 2 outputs (16 bytes), advance outptr0

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2),      // %3
                    "=r"(r3),      // %4
                    "=r"(r4)       // %5
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "4"(r3),
                    "5"(r4),
                    "w"(_k00_01), // %12
                    "w"(_k02_03), // %13
                    "w"(_k04_10), // %14
                    "w"(_k11_12), // %15
                    "w"(_k13_14), // %16
                    "w"(_k20_21), // %17
                    "w"(_k22_23), // %18
                    "w"(_k24_30), // %19
                    "w"(_k31_32), // %20
                    "w"(_k33_34), // %21
                    "w"(_k40_41), // %22
                    "w"(_k42_43), // %23
                    "w"(_k44),    // %24
                    "w"(_bias0)   // %25
                    : "memory", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
                // ARMv7 NEON path: one execution produces two adjacent outputs (each a 4-lane
                // vector) from the 25 taps k00..k44 over the five input rows %2..%6.
                //  - Weights are streamed from %7 (kptr): six 32-byte post-incremented loads plus
                //    one final 8-byte load; the closing "sub" rewinds the 192 bytes, so kptr is
                //    unchanged after the statement.
                //  - vshll.u16 #16 moves each 16-bit value into the upper half of a 32-bit lane so
                //    it can be used as float32 (bfloat16-style widening); vshrn.u32 #16 keeps only
                //    the upper 16 bits of each sum (plain truncation, no rounding).
                //  - Four partial sums are kept: q12/q13 start from the 4 floats at %1
                //    (bias0_data_ptr, read without writeback), q14/q15 are started by vmul.
                //    Taps alternate between the two pairs, which are merged with vadd at the end.
                //  - Per row, the first load advances the row pointer by 16 bytes (2 pixels);
                //    the second load reads the following 4 pixels without writeback.
                //  - The ":64" qualifiers assert 8-byte alignment of the accessed address.
                //  - Outputs use "=r" tied to matching-number inputs, i.e. every pointer is a
                //    read-write operand living in a single register.
                //  - q6 and q7 appear in the clobber list but are not referenced by this statement.
                asm volatile(
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"

                    "pld        [%2, #128]          \n"
                    "vld1.u16   {d2-d3}, [%2 :64]!  \n" // r00 r01

                    "vshll.u16  q8, d20, #16        \n" // k00

                    "pld        [%1, #128]          \n"
                    "vld1.f32   {d24-d25}, [%1]     \n" // q12 = 4 floats at %1 (initial accumulator value)

                    "pld        [%2, #256]          \n"
                    "vld1.u16   {d8-d11}, [%2 :64]  \n" // r02 r03 r04 r05

                    "vshll.u16  q0, d2, #16         \n"

                    "vmov       q13, q12            \n" // sum0 sum1

                    "vshll.u16  q1, d3, #16         \n"
                    "vshll.u16  q9, d21, #16        \n" // k01
                    "vmul.f32   q14, q8, q0         \n"
                    "vshll.u16  q2, d8, #16         \n"
                    "vmul.f32   q15, q8, q1         \n"
                    "vshll.u16  q10, d22, #16       \n" // k02
                    "vmla.f32   q12, q9, q1         \n"
                    "pld        [%3, #128]          \n"
                    "vld1.u16   {d2-d3}, [%3 :64]!  \n" // r10 r11

                    "vshll.u16  q3, d9, #16         \n"
                    "vmla.f32   q13, q9, q2         \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"
                    "vshll.u16  q11, d23, #16       \n" // k03
                    "vmla.f32   q14, q10, q2        \n"
                    "vshll.u16  q4, d10, #16        \n"
                    "vmla.f32   q15, q10, q3        \n"
                    "vshll.u16  q10, d16, #16       \n" // k04
                    "vmla.f32   q12, q11, q3        \n"
                    "vshll.u16  q5, d11, #16        \n"
                    "vmla.f32   q13, q11, q4        \n"
                    "vshll.u16  q11, d17, #16       \n" // k10
                    "vmla.f32   q14, q10, q4        \n"
                    "vshll.u16  q0, d2, #16         \n"
                    "vmla.f32   q15, q10, q5        \n"

                    "pld        [%3, #256]          \n"
                    "vld1.u16   {d8-d11}, [%3 :64]  \n" // r12 r13 r14 r15

                    "vshll.u16  q1, d3, #16         \n"
                    "vshll.u16  q8, d18, #16        \n" // k11
                    "vmla.f32   q12, q11, q0        \n"
                    "vshll.u16  q2, d8, #16         \n"
                    "vmla.f32   q13, q11, q1        \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"
                    "vshll.u16  q9, d19, #16        \n" // k12
                    "vmla.f32   q14, q8, q1         \n"
                    "pld        [%4, #128]          \n"
                    "vld1.u16   {d2-d3}, [%4 :64]!  \n" // r20 r21

                    "vshll.u16  q3, d9, #16         \n"
                    "vmla.f32   q15, q8, q2         \n"
                    "vshll.u16  q8, d20, #16        \n" // k13
                    "vmla.f32   q12, q9, q2         \n"
                    "vshll.u16  q4, d10, #16        \n"
                    "vmla.f32   q13, q9, q3         \n"
                    "vshll.u16  q9, d21, #16        \n" // k14
                    "vmla.f32   q14, q8, q3         \n"
                    "vshll.u16  q5, d11, #16        \n"
                    "vmla.f32   q15, q8, q4         \n"
                    "vshll.u16  q10, d22, #16       \n" // k20
                    "vmla.f32   q12, q9, q4         \n"
                    "vshll.u16  q0, d2, #16         \n"
                    "vmla.f32   q13, q9, q5         \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"

                    "pld        [%4, #256]          \n"
                    "vld1.u16   {d8-d11}, [%4 :64]  \n" // r22 r23 r24 r25

                    "vshll.u16  q1, d3, #16         \n"
                    "vshll.u16  q11, d23, #16       \n" // k21
                    "vmla.f32   q14, q10, q0        \n"
                    "vshll.u16  q2, d8, #16         \n"
                    "vmla.f32   q15, q10, q1        \n"
                    "vshll.u16  q10, d16, #16       \n" // k22
                    "vmla.f32   q12, q11, q1        \n"
                    "pld        [%5, #128]          \n"
                    "vld1.u16   {d2-d3}, [%5 :64]!  \n" // r30 r31

                    "vshll.u16  q3, d9, #16         \n"
                    "vmla.f32   q13, q11, q2        \n"
                    "vshll.u16  q11, d17, #16       \n" // k23
                    "vmla.f32   q14, q10, q2        \n"
                    "vshll.u16  q4, d10, #16        \n"
                    "vmla.f32   q15, q10, q3        \n"
                    "vshll.u16  q8, d18, #16        \n" // k24
                    "vmla.f32   q12, q11, q3        \n"
                    "vshll.u16  q5, d11, #16        \n"
                    "vmla.f32   q13, q11, q4        \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"
                    "vshll.u16  q9, d19, #16        \n" // k30
                    "vmla.f32   q14, q8, q4         \n"
                    "vshll.u16  q0, d2, #16         \n"
                    "vmla.f32   q15, q8, q5         \n"

                    "pld        [%5, #256]          \n"
                    "vld1.u16   {d8-d11}, [%5 :64]  \n" // r32 r33 r34 r35

                    "vshll.u16  q1, d3, #16         \n"
                    "vshll.u16  q8, d20, #16        \n" // k31
                    "vmla.f32   q12, q9, q0         \n"
                    "vshll.u16  q2, d8, #16         \n"
                    "vmla.f32   q13, q9, q1         \n"
                    "vshll.u16  q9, d21, #16        \n" // k32
                    "vmla.f32   q14, q8, q1         \n"
                    "pld        [%6, #128]          \n"
                    "vld1.u16   {d2-d3}, [%6 :64]!  \n" // r40 r41

                    "vshll.u16  q3, d9, #16         \n"
                    "vmla.f32   q15, q8, q2         \n"
                    "vshll.u16  q10, d22, #16       \n" // k33
                    "vmla.f32   q12, q9, q2         \n"
                    "vshll.u16  q4, d10, #16        \n"
                    "vmla.f32   q13, q9, q3         \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"
                    "vshll.u16  q11, d23, #16       \n" // k34
                    "vmla.f32   q14, q10, q3        \n"
                    "vshll.u16  q5, d11, #16        \n"
                    "vmla.f32   q15, q10, q4        \n"
                    "vshll.u16  q10, d16, #16       \n" // k40
                    "vmla.f32   q12, q11, q4        \n"
                    "vshll.u16  q0, d2, #16         \n"
                    "vmla.f32   q13, q11, q5        \n"

                    "pld        [%6, #256]          \n"
                    "vld1.u16   {d8-d11}, [%6 :64]  \n" // r42 r43 r44 r45

                    "vshll.u16  q1, d3, #16         \n"
                    "vshll.u16  q11, d17, #16       \n" // k41
                    "vmla.f32   q14, q10, q0        \n"
                    "vshll.u16  q2, d8, #16         \n"
                    "vmla.f32   q15, q10, q1        \n"
                    "vshll.u16  q8, d18, #16        \n" // k42
                    "vmla.f32   q12, q11, q1        \n"
                    "vshll.u16  q3, d9, #16         \n"
                    "vmla.f32   q13, q11, q2        \n"
                    "pld        [%7, #64]           \n"
                    "vld1.u16   {d20}, [%7 :64]     \n" // last weight vector (k44): 8 bytes, no writeback
                    "vshll.u16  q9, d19, #16        \n" // k43
                    "vmla.f32   q14, q8, q2         \n"
                    "vshll.u16  q4, d10, #16        \n"
                    "vmla.f32   q15, q8, q3         \n"
                    "vshll.u16  q8, d20, #16        \n" // k44
                    "vmla.f32   q12, q9, q3         \n"
                    "vmla.f32   q13, q9, q4         \n"
                    "vshll.u16  q5, d11, #16        \n"
                    "vmla.f32   q14, q8, q4         \n"
                    "vmla.f32   q15, q8, q5         \n"

                    // merge the two partial-sum pairs: q12 = output 0, q13 = output 1
                    "vadd.f32   q12, q12, q14       \n"
                    "vadd.f32   q13, q13, q15       \n"

                    "sub        %7, %7, #192        \n" // kptr -= 24 * 4; (undoes the six 32-byte post-increments)

                    // keep the upper 16 bits of every float32 lane, packed into d24/d25
                    "vshrn.u32  d24, q12, #16       \n"
                    "vshrn.u32  d25, q13, #16       \n"

                    "vst1.u16   {d24-d25}, [%0 :64]! \n" // store 2 outputs (16 bytes), advance outptr0

                    : "=r"(outptr0),        // %0
                    "=r"(bias0_data_ptr), // %1
                    "=r"(r0),             // %2
                    "=r"(r1),             // %3
                    "=r"(r2),             // %4
                    "=r"(r3),             // %5
                    "=r"(r4),             // %6
                    "=r"(kptr)            // %7
                    : "0"(outptr0),
                    "1"(bias0_data_ptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "7"(kptr)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
            }
            for (; j < outw; j++)
            {
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%1, #64]        \n"
                    "ld1    {v16.4h}, [%1], #8          \n" // r00

                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v17.4h, v18.4h, v19.4h, v20.4h}, [%1] \n" // r01 r02 r03 r04

                    "shll   v14.4s, %12.4h, #16         \n"

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "shll   v20.4s, v20.4h, #16         \n"

                    "mov    v31.16b, %25.16b            \n" // sum01

                    "shll2  v15.4s, %12.8h, #16         \n"
                    "fmul   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v14.4s, %13.4h, #16         \n"
                    "fmul   v29.4s, v15.4s, v17.4s      \n"
                    "shll2  v15.4s, %13.8h, #16         \n"
                    "fmul   v30.4s, v14.4s, v18.4s      \n"
                    "shll   v14.4s, %14.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"
                    "shll2  v15.4s, %14.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"

                    "prfm   pldl1keep, [%2, #64]        \n"
                    "ld1    {v16.4h}, [%2], #8          \n" // r10

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v17.4h, v18.4h, v19.4h, v20.4h}, [%2] \n" // r11 r12 r13 r14

                    "shll   v14.4s, %15.4h, #16         \n"

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "shll   v20.4s, v20.4h, #16         \n"

                    "fmla   v29.4s, v15.4s, v16.4s      \n"
                    "shll2  v15.4s, %15.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v17.4s      \n"
                    "shll   v14.4s, %16.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v18.4s      \n"
                    "shll2  v15.4s, %16.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "shll   v14.4s, %17.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v20.4s      \n"

                    "prfm   pldl1keep, [%3, #64]        \n"
                    "ld1    {v16.4h}, [%3], #8          \n" // r20

                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v17.4h, v18.4h, v19.4h, v20.4h}, [%3] \n" // r21 r22 r23 r24

                    "shll2  v15.4s, %17.8h, #16         \n"

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "shll   v20.4s, v20.4h, #16         \n"

                    "fmla   v30.4s, v14.4s, v16.4s      \n"
                    "shll   v14.4s, %18.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v17.4s      \n"
                    "shll2  v15.4s, %18.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "shll   v14.4s, %19.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v19.4s      \n"
                    "shll2  v15.4s, %19.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v20.4s      \n"

                    "prfm   pldl1keep, [%4, #64]        \n"
                    "ld1    {v16.4h}, [%4], #8          \n" // r30

                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v17.4h, v18.4h, v19.4h, v20.4h}, [%4] \n" // r31 r32 r33 r34

                    "shll   v14.4s, %20.4h, #16         \n"

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "shll   v20.4s, v20.4h, #16         \n"

                    "fmla   v31.4s, v15.4s, v16.4s      \n"
                    "shll2  v15.4s, %20.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "shll   v14.4s, %21.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v18.4s      \n"
                    "shll2  v15.4s, %21.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v19.4s      \n"
                    "shll   v14.4s, %22.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"

                    "prfm   pldl1keep, [%5, #64]        \n"
                    "ld1    {v16.4h}, [%5], #8          \n" // r40

                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v17.4h, v18.4h, v19.4h, v20.4h}, [%5] \n" // r41 r42 r43 r44

                    "shll2  v15.4s, %22.8h, #16         \n"

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "shll   v20.4s, v20.4h, #16         \n"

                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v14.4s, %23.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v17.4s      \n"
                    "shll2  v15.4s, %23.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v18.4s      \n"
                    "shll   v14.4s, %24.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"

                    "fadd   v29.4s, v29.4s, v30.4s      \n"
                    "fadd   v31.4s, v31.4s, v28.4s      \n"
                    "fadd   v31.4s, v31.4s, v29.4s      \n"

                    "shrn   v31.4h, v31.4s, #16         \n"

                    "st1    {v31.4h}, [%0], #8          \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2),      // %3
                    "=r"(r3),      // %4
                    "=r"(r4)       // %5
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "4"(r3),
                    "5"(r4),
                    "w"(_k00_01), // %12
                    "w"(_k02_03), // %13
                    "w"(_k04_10), // %14
                    "w"(_k11_12), // %15
                    "w"(_k13_14), // %16
                    "w"(_k20_21), // %17
                    "w"(_k22_23), // %18
                    "w"(_k24_30), // %19
                    "w"(_k31_32), // %20
                    "w"(_k33_34), // %21
                    "w"(_k40_41), // %22
                    "w"(_k42_43), // %23
                    "w"(_k44),    // %24
                    "w"(_bias0)   // %25
                    : "memory", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
                asm volatile(
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"

                    "pld        [%1, #128]          \n"
                    "vld1.f32   {d24-d25}, [%1]     \n" // sum0

                    "pld        [%2, #64]           \n"
                    "vld1.u16   {d1}, [%2 :64]!     \n" // r00

                    "vshll.u16  q8, d20, #16        \n" // k00

                    "pld        [%2, #256]          \n"
                    "vld1.u16   {d6-d9}, [%2 :64]   \n" // r01 r02 r03 r04

                    "vshll.u16  q0, d1, #16         \n"

                    "vshll.u16  q9, d21, #16        \n" // k01
                    "vshll.u16  q1, d6, #16         \n"
                    "vmul.f32   q13, q8, q0         \n"
                    "pld        [%3, #64]           \n"
                    "vld1.u16   {d1}, [%3 :64]!     \n" // r10

                    "vshll.u16  q2, d7, #16         \n"
                    "vshll.u16  q10, d22, #16       \n" // k02
                    "vmul.f32   q14, q9, q1         \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"
                    "vshll.u16  q3, d8, #16         \n"
                    "vshll.u16  q11, d23, #16       \n" // k03
                    "vmul.f32   q15, q10, q2        \n"
                    "vshll.u16  q4, d9, #16         \n"
                    "vshll.u16  q10, d16, #16       \n" // k04
                    "vmla.f32   q12, q11, q3        \n"
                    "vshll.u16  q0, d1, #16         \n"
                    "vshll.u16  q11, d17, #16       \n" // k10
                    "vmla.f32   q13, q10, q4        \n"

                    "pld        [%3, #256]          \n"
                    "vld1.u16   {d6-d9}, [%3 :64]   \n" // r11 r12 r13 r14

                    "vshll.u16  q8, d18, #16        \n" // k11
                    "vshll.u16  q1, d6, #16         \n"
                    "vmla.f32   q14, q11, q0        \n"
                    "pld        [%4, #64]           \n"
                    "vld1.u16   {d1}, [%4 :64]!     \n" // r20

                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"
                    "vshll.u16  q2, d7, #16         \n"
                    "vshll.u16  q9, d19, #16        \n" // k12
                    "vmla.f32   q15, q8, q1         \n"
                    "vshll.u16  q3, d8, #16         \n"
                    "vshll.u16  q8, d20, #16        \n" // k13
                    "vmla.f32   q12, q9, q2         \n"
                    "vshll.u16  q4, d9, #16         \n"
                    "vshll.u16  q9, d21, #16        \n" // k14
                    "vmla.f32   q13, q8, q3         \n"
                    "vshll.u16  q0, d1, #16         \n"
                    "vshll.u16  q10, d22, #16       \n" // k20
                    "vmla.f32   q14, q9, q4         \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"

                    "pld        [%4, #256]          \n"
                    "vld1.u16   {d6-d9}, [%4 :64]   \n" // r21 r22 r23 r24

                    "vshll.u16  q11, d23, #16       \n" // k21
                    "vshll.u16  q1, d6, #16         \n"
                    "vmla.f32   q15, q10, q0        \n"
                    "pld        [%5, #64]           \n"
                    "vld1.u16   {d1}, [%5 :64]!     \n" // r30

                    "vshll.u16  q2, d7, #16         \n"
                    "vshll.u16  q10, d16, #16       \n" // k22
                    "vmla.f32   q12, q11, q1        \n"
                    "vshll.u16  q3, d8, #16         \n"
                    "vshll.u16  q11, d17, #16       \n" // k23
                    "vmla.f32   q13, q10, q2        \n"
                    "vshll.u16  q4, d9, #16         \n"
                    "vshll.u16  q8, d18, #16        \n" // k24
                    "vmla.f32   q14, q11, q3        \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"
                    "vshll.u16  q0, d1, #16         \n"
                    "vshll.u16  q9, d19, #16        \n" // k30
                    "vmla.f32   q15, q8, q4         \n"

                    "pld        [%5, #256]          \n"
                    "vld1.u16   {d6-d9}, [%5 :64]   \n" // r31 r32 r33 r34

                    "vshll.u16  q8, d20, #16        \n" // k31
                    "vshll.u16  q1, d6, #16         \n"
                    "vmla.f32   q12, q9, q0         \n"
                    "pld        [%6, #64]           \n"
                    "vld1.u16   {d1}, [%6 :64]!     \n" // r40

                    "vshll.u16  q2, d7, #16         \n"
                    "vshll.u16  q9, d21, #16        \n" // k32
                    "vmla.f32   q13, q8, q1         \n"
                    "vshll.u16  q3, d8, #16         \n"
                    "vshll.u16  q10, d22, #16       \n" // k33
                    "vmla.f32   q14, q9, q2         \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"
                    "vshll.u16  q4, d9, #16         \n"
                    "vshll.u16  q11, d23, #16       \n" // k34
                    "vmla.f32   q15, q10, q3        \n"
                    "vshll.u16  q0, d1, #16         \n"
                    "vshll.u16  q10, d16, #16       \n" // k40
                    "vmla.f32   q12, q11, q4        \n"

                    "pld        [%6, #256]          \n"
                    "vld1.u16   {d6-d9}, [%6 :64]   \n" // r41 r42 r43 r44

                    "vshll.u16  q11, d17, #16       \n" // k41
                    "vshll.u16  q1, d6, #16         \n"
                    "vmla.f32   q13, q10, q0        \n"
                    "vshll.u16  q2, d7, #16         \n"
                    "vshll.u16  q8, d18, #16        \n" // k42
                    "vmla.f32   q14, q11, q1        \n"
                    "pld        [%7, #64]           \n"
                    "vld1.u16   {d20}, [%7 :64]     \n"
                    "vshll.u16  q3, d8, #16         \n"
                    "vshll.u16  q9, d19, #16        \n" // k43
                    "vmla.f32   q15, q8, q2         \n"
                    "vshll.u16  q4, d9, #16         \n"
                    "vshll.u16  q8, d20, #16        \n" // k44
                    "vmla.f32   q12, q9, q3         \n"
                    "vmla.f32   q13, q8, q4         \n"

                    "vadd.f32   q14, q14, q15       \n"
                    "vadd.f32   q12, q12, q13       \n"
                    "vadd.f32   q12, q12, q14       \n"

                    "sub        %7, %7, #192        \n" // kptr -= 24 * 4;

                    "vshrn.u32  d24, q12, #16       \n"

                    "vst1.u16   {d24}, [%0 :64]!    \n"

                    : "=r"(outptr0),        // %0
                    "=r"(bias0_data_ptr), // %1
                    "=r"(r0),             // %2
                    "=r"(r1),             // %3
                    "=r"(r2),             // %4
                    "=r"(r3),             // %5
                    "=r"(r4),             // %6
                    "=r"(kptr)            // %7
                    : "0"(outptr0),
                    "1"(bias0_data_ptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "7"(kptr)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
            }

            r0 += 4 * 4;
            r1 += 4 * 4;
            r2 += 4 * 4;
            r3 += 4 * 4;
            r4 += 4 * 4;
        }
    }
}

static void convdw5x5s2_pack4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int group = bottom_blob.c;

    const int tailstep = (w - 2 * outw + w) * 4;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

#if __aarch64__
        float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + g * 4) : vdupq_n_f32(0.f);
#endif // __aarch64__

        const unsigned short* kptr = kernel.row<const unsigned short>(g);

        unsigned short* outptr0 = out;

        const Mat img0 = bottom_blob.channel(g);

        const unsigned short* r0 = img0.row<const unsigned short>(0);
        const unsigned short* r1 = img0.row<const unsigned short>(1);
        const unsigned short* r2 = img0.row<const unsigned short>(2);
        const unsigned short* r3 = img0.row<const unsigned short>(3);
        const unsigned short* r4 = img0.row<const unsigned short>(4);

#if __aarch64__
        // 4 * 25
        uint16x8_t _k00_01 = vld1q_u16(kptr);
        uint16x8_t _k02_03 = vld1q_u16(kptr + 8);
        uint16x8_t _k04_10 = vld1q_u16(kptr + 16);
        uint16x8_t _k11_12 = vld1q_u16(kptr + 24);
        uint16x8_t _k13_14 = vld1q_u16(kptr + 32);
        uint16x8_t _k20_21 = vld1q_u16(kptr + 40);
        uint16x8_t _k22_23 = vld1q_u16(kptr + 48);
        uint16x8_t _k24_30 = vld1q_u16(kptr + 56);
        uint16x8_t _k31_32 = vld1q_u16(kptr + 64);
        uint16x8_t _k33_34 = vld1q_u16(kptr + 72);
        uint16x8_t _k40_41 = vld1q_u16(kptr + 80);
        uint16x8_t _k42_43 = vld1q_u16(kptr + 88);
        uint16x4_t _k44 = vld1_u16(kptr + 96);
#else  // __aarch64__
        float bias0_data[4];
        if (bias)
        {
            bias0_data[0] = bias[g * 4 + 0];
            bias0_data[1] = bias[g * 4 + 1];
            bias0_data[2] = bias[g * 4 + 2];
            bias0_data[3] = bias[g * 4 + 3];
        }
        else
        {
            bias0_data[0] = 0.f;
            bias0_data[1] = 0.f;
            bias0_data[2] = 0.f;
            bias0_data[3] = 0.f;
        }
        const float* bias0_data_ptr = bias0_data;
#endif // __aarch64__

        int i = 0;

        for (; i < outh; i++)
        {
            int j = 0;

            for (; j + 3 < outw; j += 4)
            {
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%1], #32 \n" // r00 r01 r02 r03

                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%1], #32 \n" // r04 r05 r06 r07

                    "shll   v14.4s, %12.4h, #16         \n"

                    "mov    v28.16b, %25.16b            \n" // sum00
                    "mov    v29.16b, %25.16b            \n" // sum01
                    "mov    v30.16b, %25.16b            \n" // sum02
                    "mov    v31.16b, %25.16b            \n" // sum03

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "shll   v18.4s, v18.4h, #16         \n"

                    "prfm   pldl1keep, [%1, #192]       \n"
                    "ld1    {v24.4h, v25.4h, v26.4h}, [%1] \n" // r08 r09 r010

                    "shll2  v15.4s, %12.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v18.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v20.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v31.4s, v14.4s, v22.4s      \n"
                    "shll   v14.4s, %13.4h, #16         \n"
                    "fmla   v28.4s, v15.4s, v17.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v19.4s      \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v21.4s      \n"
                    "fmla   v31.4s, v15.4s, v23.4s      \n"
                    "shll2  v15.4s, %13.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "fmla   v29.4s, v14.4s, v20.4s      \n"
                    "shll   v24.4s, v24.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v22.4s      \n"
                    "fmla   v31.4s, v14.4s, v24.4s      \n"
                    "shll   v14.4s, %14.4h, #16         \n"
                    "fmla   v28.4s, v15.4s, v19.4s      \n"
                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%2], #32 \n" // r10 r11 r12 r13

                    "fmla   v29.4s, v15.4s, v21.4s      \n"
                    "shll   v25.4s, v25.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v23.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v25.4s      \n"
                    "shll2  v15.4s, %14.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v22.4s      \n"
                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%2], #32 \n" // r14 r15 r16 r17

                    "shll   v26.4s, v26.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v24.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v31.4s, v14.4s, v26.4s      \n"

                    "prfm   pldl1keep, [%2, #192]       \n"
                    "ld1    {v24.4h, v25.4h, v26.4h}, [%2] \n" // r18 r19 r110

                    "shll   v14.4s, %15.4h, #16         \n"
                    "fmla   v28.4s, v15.4s, v16.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v18.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v22.4s      \n"
                    "shll2  v15.4s, %15.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v19.4s      \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v21.4s      \n"
                    "fmla   v31.4s, v14.4s, v23.4s      \n"
                    "shll   v14.4s, %16.4h, #16         \n"
                    "fmla   v28.4s, v15.4s, v18.4s      \n"
                    "fmla   v29.4s, v15.4s, v20.4s      \n"
                    "shll   v24.4s, v24.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v22.4s      \n"
                    "fmla   v31.4s, v15.4s, v24.4s      \n"
                    "shll2  v15.4s, %16.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%3], #32 \n" // r20 r21 r22 r23

                    "fmla   v29.4s, v14.4s, v21.4s      \n"
                    "shll   v25.4s, v25.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v23.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v31.4s, v14.4s, v25.4s      \n"
                    "shll   v14.4s, %17.4h, #16         \n"
                    "fmla   v28.4s, v15.4s, v20.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v22.4s      \n"
                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%3], #32 \n" // r24 r25 r26 r27

                    "shll   v26.4s, v26.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v24.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v26.4s      \n"

                    "prfm   pldl1keep, [%3, #192]       \n"
                    "ld1    {v24.4h, v25.4h, v26.4h}, [%3] \n" // r28 r29 r210

                    "shll2  v15.4s, %17.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v18.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v20.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v31.4s, v14.4s, v22.4s      \n"
                    "shll   v14.4s, %18.4h, #16         \n"
                    "fmla   v28.4s, v15.4s, v17.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v19.4s      \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v21.4s      \n"
                    "fmla   v31.4s, v15.4s, v23.4s      \n"
                    "shll2  v15.4s, %18.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "fmla   v29.4s, v14.4s, v20.4s      \n"
                    "shll   v24.4s, v24.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v22.4s      \n"
                    "fmla   v31.4s, v14.4s, v24.4s      \n"
                    "shll   v14.4s, %19.4h, #16         \n"
                    "fmla   v28.4s, v15.4s, v19.4s      \n"
                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%4], #32 \n" // r30 r31 r32 r33

                    "fmla   v29.4s, v15.4s, v21.4s      \n"
                    "shll   v25.4s, v25.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v23.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v25.4s      \n"
                    "shll2  v15.4s, %19.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v22.4s      \n"
                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%4], #32 \n" // r34 r35 r36 r37

                    "shll   v26.4s, v26.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v24.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v31.4s, v14.4s, v26.4s      \n"

                    "prfm   pldl1keep, [%4, #192]       \n"
                    "ld1    {v24.4h, v25.4h, v26.4h}, [%4] \n" // r38 r39 r310

                    "shll   v14.4s, %20.4h, #16         \n"
                    "fmla   v28.4s, v15.4s, v16.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v18.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v22.4s      \n"
                    "shll2  v15.4s, %20.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v19.4s      \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v21.4s      \n"
                    "fmla   v31.4s, v14.4s, v23.4s      \n"
                    "shll   v14.4s, %21.4h, #16         \n"
                    "fmla   v28.4s, v15.4s, v18.4s      \n"
                    "fmla   v29.4s, v15.4s, v20.4s      \n"
                    "shll   v24.4s, v24.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v22.4s      \n"
                    "fmla   v31.4s, v15.4s, v24.4s      \n"
                    "shll2  v15.4s, %21.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%5], #32 \n" // r40 r41 r42 r43

                    "fmla   v29.4s, v14.4s, v21.4s      \n"
                    "shll   v25.4s, v25.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v23.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v31.4s, v14.4s, v25.4s      \n"
                    "shll   v14.4s, %22.4h, #16         \n"
                    "fmla   v28.4s, v15.4s, v20.4s      \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v22.4s      \n"
                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%5], #32 \n" // r44 r45 r46 r47

                    "shll   v26.4s, v26.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v24.4s      \n"
                    "shll   v18.4s, v18.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v26.4s      \n"

                    "prfm   pldl1keep, [%5, #192]       \n"
                    "ld1    {v24.4h, v25.4h, v26.4h}, [%5] \n" // r48 r49 r410

                    "shll2  v15.4s, %22.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v18.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v20.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v31.4s, v14.4s, v22.4s      \n"
                    "shll   v14.4s, %23.4h, #16         \n"
                    "fmla   v28.4s, v15.4s, v17.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v19.4s      \n"
                    "shll   v23.4s, v23.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v21.4s      \n"
                    "fmla   v31.4s, v15.4s, v23.4s      \n"
                    "shll2  v15.4s, %23.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "fmla   v29.4s, v14.4s, v20.4s      \n"
                    "shll   v24.4s, v24.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v22.4s      \n"
                    "fmla   v31.4s, v14.4s, v24.4s      \n"
                    "shll   v14.4s, %24.4h, #16         \n"
                    "fmla   v28.4s, v15.4s, v19.4s      \n"
                    "fmla   v29.4s, v15.4s, v21.4s      \n"
                    "shll   v25.4s, v25.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v23.4s      \n"
                    "fmla   v31.4s, v15.4s, v25.4s      \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "fmla   v29.4s, v14.4s, v22.4s      \n"
                    "shll   v26.4s, v26.4h, #16         \n"
                    "fmla   v30.4s, v14.4s, v24.4s      \n"
                    "fmla   v31.4s, v14.4s, v26.4s      \n"

                    "shrn   v28.4h, v28.4s, #16         \n"
                    "shrn   v29.4h, v29.4s, #16         \n"
                    "shrn   v30.4h, v30.4s, #16         \n"
                    "shrn   v31.4h, v31.4s, #16         \n"

                    "st1    {v28.4h, v29.4h, v30.4h, v31.4h}, [%0], #32 \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2),      // %3
                    "=r"(r3),      // %4
                    "=r"(r4)       // %5
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "4"(r3),
                    "5"(r4),
                    "w"(_k00_01), // %12
                    "w"(_k02_03), // %13
                    "w"(_k04_10), // %14
                    "w"(_k11_12), // %15
                    "w"(_k13_14), // %16
                    "w"(_k20_21), // %17
                    "w"(_k22_23), // %18
                    "w"(_k24_30), // %19
                    "w"(_k31_32), // %20
                    "w"(_k33_34), // %21
                    "w"(_k40_41), // %22
                    "w"(_k42_43), // %23
                    "w"(_k44),    // %24
                    "w"(_bias0)   // %25
                    : "memory", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
                asm volatile(
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"

                    "pld        [%1, #128]          \n"
                    "vld1.f32   {d24-d25}, [%1]     \n"
                    "vmov       q13, q12            \n" // sum0 sum1

                    "vshll.u16  q8, d20, #16        \n" // k00

                    "pld        [%2, #256]          \n"
                    "vld1.u16   {d4-d7}, [%2 :64]!  \n" // r00 r01 r02 r03

                    "vmov       q14, q12            \n"
                    "vmov       q15, q12            \n" // sum2 sum3

                    "vshll.u16  q9, d21, #16        \n" // k01

                    "pld        [%2, #256]          \n"
                    "vld1.u16   {d12-d15}, [%2 :64]! \n" // r04 r05 r06 r07

                    "vshll.u16  q0, d4, #16         \n"
                    "vshll.u16  q1, d5, #16         \n"
                    "vshll.u16  q2, d6, #16         \n"
                    "vshll.u16  q3, d7, #16         \n"

                    "vshll.u16  q4, d12, #16        \n"
                    "vshll.u16  q5, d13, #16        \n"

                    "vmla.f32   q12, q8, q0         \n"
                    "vmla.f32   q13, q8, q2         \n"
                    "vshll.u16  q6, d14, #16        \n"
                    "vmla.f32   q14, q8, q4         \n"
                    "vmla.f32   q15, q8, q6         \n"

                    "vshll.u16  q10, d22, #16       \n" // k02

                    "vmla.f32   q12, q9, q1         \n"
                    "vmla.f32   q13, q9, q3         \n"
                    "vshll.u16  q7, d15, #16        \n"
                    "vmla.f32   q14, q9, q5         \n"
                    "vmla.f32   q15, q9, q7         \n"

                    "pld        [%2, #128]          \n"
                    "vld1.u16   {d2-d3}, [%2 :64]!  \n" // r08 r09

                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"

                    "vmla.f32   q12, q10, q2        \n"
                    "vshll.u16  q11, d23, #16       \n" // k03
                    "vmla.f32   q13, q10, q4        \n"
                    "vshll.u16  q0, d2, #16         \n"
                    "vmla.f32   q14, q10, q6        \n"
                    "vmla.f32   q15, q10, q0        \n"

                    "vshll.u16  q10, d16, #16       \n" // k04

                    "vmla.f32   q12, q11, q3        \n"
                    "vmla.f32   q13, q11, q5        \n"
                    "vshll.u16  q1, d3, #16         \n"
                    "vmla.f32   q14, q11, q7        \n"
                    "vmla.f32   q15, q11, q1        \n"

                    "pld        [%2, #64]           \n"
                    "vld1.u16   {d5}, [%2 :64]      \n" // r010

                    "vmla.f32   q12, q10, q4        \n"
                    "vshll.u16  q11, d17, #16       \n" // k10
                    "vmla.f32   q13, q10, q6        \n"
                    "vshll.u16  q2, d5, #16         \n"
                    "vmla.f32   q14, q10, q0        \n"
                    "pld        [%3, #256]          \n"
                    "vld1.u16   {d12-d15}, [%3 :64]! \n" // r10 r11 r12 r13

                    "vmla.f32   q15, q10, q2        \n"

                    "vshll.u16  q8, d18, #16        \n" // k11

                    "pld        [%3, #256]          \n"
                    "vld1.u16   {d4-d7}, [%3 :64]!  \n" // r14 r15 r16 r17

                    "vshll.u16  q4, d12, #16        \n"
                    "vshll.u16  q5, d13, #16        \n"
                    "vshll.u16  q6, d14, #16        \n"
                    "vshll.u16  q7, d15, #16        \n"

                    "vshll.u16  q0, d4, #16         \n"
                    "vshll.u16  q1, d5, #16         \n"

                    "vmla.f32   q12, q11, q4        \n"
                    "vmla.f32   q13, q11, q6        \n"
                    "vshll.u16  q2, d6, #16         \n"
                    "vmla.f32   q14, q11, q0        \n"
                    "vmla.f32   q15, q11, q2        \n"

                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"

                    "vmla.f32   q12, q8, q5         \n"
                    "vshll.u16  q9, d19, #16        \n" // k12
                    "vmla.f32   q13, q8, q7         \n"
                    "vshll.u16  q3, d7, #16         \n"
                    "vmla.f32   q14, q8, q1         \n"
                    "vmla.f32   q15, q8, q3         \n"

                    "pld        [%3, #128]          \n"
                    "vld1.u16   {d10-d11}, [%3 :64]! \n" // r18 r19

                    "vmla.f32   q12, q9, q6         \n"
                    "vshll.u16  q8, d20, #16        \n" // k13
                    "vmla.f32   q13, q9, q0         \n"
                    "vshll.u16  q4, d10, #16        \n"
                    "vmla.f32   q14, q9, q2         \n"
                    "vmla.f32   q15, q9, q4         \n"

                    "vshll.u16  q9, d21, #16        \n" // k14

                    "vmla.f32   q12, q8, q7         \n"
                    "vmla.f32   q13, q8, q1         \n"
                    "vshll.u16  q5, d11, #16        \n"
                    "vmla.f32   q14, q8, q3         \n"
                    "vmla.f32   q15, q8, q5         \n"

                    "pld        [%3, #64]           \n"
                    "vld1.u16   {d13}, [%3 :64]     \n" // r110

                    "vmla.f32   q12, q9, q0         \n"
                    "vshll.u16  q10, d22, #16       \n" // k20
                    "vmla.f32   q13, q9, q2         \n"
                    "vshll.u16  q6, d13, #16        \n"
                    "vmla.f32   q14, q9, q4         \n"
                    "pld        [%4, #256]          \n"
                    "vld1.u16   {d4-d7}, [%4 :64]!  \n" // r20 r21 r22 r23

                    "vmla.f32   q15, q9, q6         \n"

                    "vshll.u16  q11, d23, #16       \n" // k21

                    "pld        [%4, #256]          \n"
                    "vld1.u16   {d12-d15}, [%4 :64]! \n" // r24 r25 r26 r27

                    "vshll.u16  q0, d4, #16         \n"
                    "vshll.u16  q1, d5, #16         \n"
                    "vshll.u16  q2, d6, #16         \n"
                    "vshll.u16  q3, d7, #16         \n"

                    "vshll.u16  q4, d12, #16        \n"
                    "vshll.u16  q5, d13, #16        \n"

                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"

                    "vmla.f32   q12, q10, q0        \n"
                    "vmla.f32   q13, q10, q2        \n"
                    "vshll.u16  q6, d14, #16        \n"
                    "vmla.f32   q14, q10, q4        \n"
                    "vmla.f32   q15, q10, q6        \n"

                    "vshll.u16  q10, d16, #16       \n" // k22

                    "vmla.f32   q12, q11, q1        \n"
                    "vmla.f32   q13, q11, q3        \n"
                    "vshll.u16  q7, d15, #16        \n"
                    "vmla.f32   q14, q11, q5        \n"
                    "vmla.f32   q15, q11, q7        \n"

                    "pld        [%4, #128]          \n"
                    "vld1.u16   {d2-d3}, [%4 :64]!  \n" // r28 r29

                    "vmla.f32   q12, q10, q2        \n"
                    "vshll.u16  q11, d17, #16       \n" // k23
                    "vmla.f32   q13, q10, q4        \n"
                    "vshll.u16  q0, d2, #16         \n"
                    "vmla.f32   q14, q10, q6        \n"
                    "vmla.f32   q15, q10, q0        \n"

                    "vshll.u16  q8, d18, #16        \n" // k24

                    "vmla.f32   q12, q11, q3        \n"
                    "vmla.f32   q13, q11, q5        \n"
                    "vshll.u16  q1, d3, #16         \n"
                    "vmla.f32   q14, q11, q7        \n"
                    "vmla.f32   q15, q11, q1        \n"

                    "pld        [%4, #64]           \n"
                    "vld1.u16   {d5}, [%4 :64]      \n" // r210

                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"

                    "vmla.f32   q12, q8, q4         \n"
                    "vshll.u16  q9, d19, #16        \n" // k30
                    "vmla.f32   q13, q8, q6         \n"
                    "vshll.u16  q2, d5, #16         \n"
                    "vmla.f32   q14, q8, q0         \n"
                    "pld        [%5, #256]          \n"
                    "vld1.u16   {d12-d15}, [%5 :64]! \n" // r30 r31 r32 r33

                    "vmla.f32   q15, q8, q2         \n"

                    "vshll.u16  q8, d20, #16        \n" // k31

                    "pld        [%5, #256]          \n"
                    "vld1.u16   {d4-d7}, [%5 :64]!  \n" // r34 r35 r36 r37

                    "vshll.u16  q4, d12, #16        \n"
                    "vshll.u16  q5, d13, #16        \n"
                    "vshll.u16  q6, d14, #16        \n"
                    "vshll.u16  q7, d15, #16        \n"

                    "vshll.u16  q0, d4, #16         \n"
                    "vshll.u16  q1, d5, #16         \n"

                    "vmla.f32   q12, q9, q4         \n"
                    "vmla.f32   q13, q9, q6         \n"
                    "vshll.u16  q2, d6, #16         \n"
                    "vmla.f32   q14, q9, q0         \n"
                    "vmla.f32   q15, q9, q2         \n"

                    "vshll.u16  q9, d21, #16        \n" // k32

                    "vmla.f32   q12, q8, q5         \n"
                    "vmla.f32   q13, q8, q7         \n"
                    "vshll.u16  q3, d7, #16         \n"
                    "vmla.f32   q14, q8, q1         \n"
                    "vmla.f32   q15, q8, q3         \n"

                    "pld        [%5, #128]          \n"
                    "vld1.u16   {d10-d11}, [%5 :64]! \n" // r38 r39

                    "vmla.f32   q12, q9, q6         \n"
                    "vshll.u16  q10, d22, #16       \n" // k33
                    "vmla.f32   q13, q9, q0         \n"
                    "vshll.u16  q4, d10, #16        \n"
                    "vmla.f32   q14, q9, q2         \n"
                    "vmla.f32   q15, q9, q4         \n"

                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"

                    "vmla.f32   q12, q10, q7        \n"
                    "vshll.u16  q11, d23, #16       \n" // k34
                    "vmla.f32   q13, q10, q1        \n"
                    "vshll.u16  q5, d11, #16        \n"
                    "vmla.f32   q14, q10, q3        \n"
                    "vmla.f32   q15, q10, q5        \n"

                    "pld        [%5, #64]           \n"
                    "vld1.u16   {d13}, [%5 :64]     \n" // r310

                    "vmla.f32   q12, q11, q0        \n"
                    "vshll.u16  q10, d16, #16       \n" // k40
                    "vmla.f32   q13, q11, q2        \n"
                    "vshll.u16  q6, d13, #16        \n"
                    "vmla.f32   q14, q11, q4        \n"
                    "pld        [%6, #256]          \n"
                    "vld1.u16   {d4-d7}, [%6 :64]!  \n" // r40 r41 r42 r43

                    "vmla.f32   q15, q11, q6        \n"

                    "vshll.u16  q11, d17, #16       \n" // k41

                    "pld        [%6, #256]          \n"
                    "vld1.u16   {d12-d15}, [%6 :64]! \n" // r44 r45 r46 r47

                    "vshll.u16  q0, d4, #16         \n"
                    "vshll.u16  q1, d5, #16         \n"
                    "vshll.u16  q2, d6, #16         \n"
                    "vshll.u16  q3, d7, #16         \n"

                    "vshll.u16  q4, d12, #16        \n"
                    "vshll.u16  q5, d13, #16        \n"

                    "vmla.f32   q12, q10, q0        \n"
                    "vmla.f32   q13, q10, q2        \n"
                    "vshll.u16  q6, d14, #16        \n"
                    "vmla.f32   q14, q10, q4        \n"
                    "vmla.f32   q15, q10, q6        \n"

                    "vshll.u16  q8, d18, #16        \n" // k42

                    "vmla.f32   q12, q11, q1        \n"
                    "vmla.f32   q13, q11, q3        \n"
                    "vshll.u16  q7, d15, #16        \n"
                    "vmla.f32   q14, q11, q5        \n"
                    "pld        [%7, #64]           \n"
                    "vld1.u16   {d20}, [%7 :64]     \n"

                    "vmla.f32   q15, q11, q7        \n"

                    "pld        [%6, #128]          \n"
                    "vld1.u16   {d2-d3}, [%6 :64]!  \n" // r48 r49

                    "vmla.f32   q12, q8, q2         \n"
                    "vshll.u16  q9, d19, #16        \n" // k43
                    "vmla.f32   q13, q8, q4         \n"
                    "vshll.u16  q0, d2, #16         \n"
                    "vmla.f32   q14, q8, q6         \n"
                    "vmla.f32   q15, q8, q0         \n"

                    "vshll.u16  q8, d20, #16        \n" // k44

                    "vmla.f32   q12, q9, q3         \n"
                    "vmla.f32   q13, q9, q5         \n"
                    "vshll.u16  q1, d3, #16         \n"
                    "vmla.f32   q14, q9, q7         \n"
                    "vmla.f32   q15, q9, q1         \n"

                    "pld        [%6, #64]           \n"
                    "vld1.u16   {d5}, [%6 :64]      \n" // r410

                    "vmla.f32   q12, q8, q4         \n"
                    "vmla.f32   q13, q8, q6         \n"
                    "vshll.u16  q2, d5, #16         \n"
                    "vmla.f32   q14, q8, q0         \n"
                    "vmla.f32   q15, q8, q2         \n"

                    "sub        %7, %7, #192        \n" // kptr -= 24 * 4;

                    "sub        %2, %2, #16         \n"
                    "sub        %3, %3, #16         \n"
                    "sub        %4, %4, #16         \n"
                    "sub        %5, %5, #16         \n"
                    "sub        %6, %6, #16         \n"

                    "vshrn.u32  d24, q12, #16       \n"
                    "vshrn.u32  d25, q13, #16       \n"
                    "vshrn.u32  d26, q14, #16       \n"
                    "vshrn.u32  d27, q15, #16       \n"

                    "vst1.u16   {d24-d27}, [%0 :64]! \n"

                    : "=r"(outptr0),        // %0
                    "=r"(bias0_data_ptr), // %1
                    "=r"(r0),             // %2
                    "=r"(r1),             // %3
                    "=r"(r2),             // %4
                    "=r"(r3),             // %5
                    "=r"(r4),             // %6
                    "=r"(kptr)            // %7
                    : "0"(outptr0),
                    "1"(bias0_data_ptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "7"(kptr)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
            }
            for (; j + 1 < outw; j += 2)
            {
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%1], #32 \n" // r00 r01 r02 r03

                    "prfm   pldl1keep, [%1, #192]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h}, [%1] \n" // r04 r05 r06

                    "shll   v14.4s, %12.4h, #16         \n"
                    "shll2  v15.4s, %12.8h, #16         \n"

                    "shll   v16.4s, v16.4h, #16         \n"

                    "mov    v30.16b, %25.16b            \n" // sum00
                    "mov    v31.16b, %25.16b            \n" // sum01

                    "shll   v17.4s, v17.4h, #16         \n"
                    "shll   v18.4s, v18.4h, #16         \n"

                    "fmul   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmul   v29.4s, v14.4s, v18.4s      \n"
                    "shll   v14.4s, %13.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v17.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"
                    "shll2  v15.4s, %13.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v20.4s      \n"
                    "shll   v14.4s, %14.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%2], #32 \n" // r10 r11 r12 r13

                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v21.4s      \n"
                    "shll2  v15.4s, %14.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v22.4s      \n"

                    "prfm   pldl1keep, [%2, #192]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h}, [%2] \n" // r14 r15 r16

                    "shll   v14.4s, %15.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "shll   v18.4s, v18.4h, #16         \n"

                    "fmla   v30.4s, v15.4s, v16.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v18.4s      \n"
                    "shll2  v15.4s, %15.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v19.4s      \n"
                    "shll   v14.4s, %16.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v18.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"
                    "shll2  v15.4s, %16.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%3], #32 \n" // r20 r21 r22 r23

                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v21.4s      \n"
                    "shll   v14.4s, %17.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v22.4s      \n"

                    "prfm   pldl1keep, [%3, #192]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h}, [%3] \n" // r24 r25 r26

                    "shll2  v15.4s, %17.8h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "shll   v18.4s, v18.4h, #16         \n"

                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v18.4s      \n"
                    "shll   v14.4s, %18.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v17.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"
                    "shll2  v15.4s, %18.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v20.4s      \n"
                    "shll   v14.4s, %19.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%4], #32 \n" // r30 r31 r32 r33

                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v21.4s      \n"
                    "shll2  v15.4s, %19.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v22.4s      \n"

                    "prfm   pldl1keep, [%4, #192]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h}, [%4] \n" // r34 r35 r36

                    "shll   v14.4s, %20.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "shll   v18.4s, v18.4h, #16         \n"

                    "fmla   v30.4s, v15.4s, v16.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v18.4s      \n"
                    "shll2  v15.4s, %20.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v19.4s      \n"
                    "shll   v14.4s, %21.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v18.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"
                    "shll2  v15.4s, %21.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%5], #32 \n" // r40 r41 r42 r43

                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v21.4s      \n"
                    "shll   v14.4s, %22.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v20.4s      \n"
                    "shll   v16.4s, v16.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v22.4s      \n"

                    "prfm   pldl1keep, [%5, #192]       \n"
                    "ld1    {v20.4h, v21.4h, v22.4h}, [%5] \n" // r44 r45 r46

                    "shll2  v15.4s, %22.8h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"
                    "shll   v18.4s, v18.4h, #16         \n"

                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v18.4s      \n"
                    "shll   v14.4s, %23.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v17.4s      \n"
                    "shll   v20.4s, v20.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"
                    "shll2  v15.4s, %23.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "shll   v21.4s, v21.4h, #16         \n"
                    "fmla   v29.4s, v14.4s, v20.4s      \n"
                    "shll   v14.4s, %24.4h, #16         \n"
                    "fmla   v30.4s, v15.4s, v19.4s      \n"
                    "fmla   v31.4s, v15.4s, v21.4s      \n"
                    "shll   v22.4s, v22.4h, #16         \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"
                    "fmla   v29.4s, v14.4s, v22.4s      \n"

                    "fadd   v30.4s, v30.4s, v28.4s      \n"
                    "fadd   v31.4s, v31.4s, v29.4s      \n"

                    "shrn   v30.4h, v30.4s, #16         \n"
                    "shrn   v31.4h, v31.4s, #16         \n"

                    "st1    {v30.4h, v31.4h}, [%0], #16 \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2),      // %3
                    "=r"(r3),      // %4
                    "=r"(r4)       // %5
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "4"(r3),
                    "5"(r4),
                    "w"(_k00_01), // %12
                    "w"(_k02_03), // %13
                    "w"(_k04_10), // %14
                    "w"(_k11_12), // %15
                    "w"(_k13_14), // %16
                    "w"(_k20_21), // %17
                    "w"(_k22_23), // %18
                    "w"(_k24_30), // %19
                    "w"(_k31_32), // %20
                    "w"(_k33_34), // %21
                    "w"(_k40_41), // %22
                    "w"(_k42_43), // %23
                    "w"(_k44),    // %24
                    "w"(_bias0)   // %25
                    : "memory", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
                asm volatile(
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"

                    "pld        [%1, #128]          \n"
                    "vld1.f32   {d24-d25}, [%1]     \n"

                    "pld        [%2, #256]          \n"
                    "vld1.u16   {d4-d7}, [%2 :64]!  \n" // r00 r01 r02 r03

                    "vshll.u16  q8, d20, #16        \n" // k00

                    "pld        [%2, #256]          \n"
                    "vld1.u16   {d10-d12}, [%2 :64] \n" // r04 r05 r06

                    "vmov       q13, q12            \n" // sum0 sum1

                    "vshll.u16  q0, d4, #16         \n"
                    "vshll.u16  q1, d5, #16         \n"
                    "vshll.u16  q2, d6, #16         \n"
                    "vshll.u16  q3, d7, #16         \n"

                    "vshll.u16  q9, d21, #16        \n" // k01
                    "vmul.f32   q14, q8, q0         \n"
                    "vshll.u16  q4, d10, #16        \n"
                    "vmul.f32   q15, q8, q2         \n"
                    "vshll.u16  q10, d22, #16       \n" // k02
                    "vmla.f32   q12, q9, q1         \n"
                    "vshll.u16  q5, d11, #16        \n"
                    "vmla.f32   q13, q9, q3         \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"
                    "vmla.f32   q14, q10, q2        \n"
                    "vshll.u16  q11, d23, #16       \n" // k03
                    "vmla.f32   q15, q10, q4        \n"
                    "vshll.u16  q10, d16, #16       \n" // k04
                    "vmla.f32   q12, q11, q3        \n"
                    "vshll.u16  q6, d12, #16        \n"
                    "vmla.f32   q13, q11, q5        \n"
                    "vshll.u16  q11, d17, #16       \n" // k10
                    "vmla.f32   q14, q10, q4        \n"
                    "vmla.f32   q15, q10, q6        \n"

                    "pld        [%3, #256]          \n"
                    "vld1.u16   {d4-d7}, [%3 :64]!  \n" // r10 r11 r12 r13

                    "vshll.u16  q8, d18, #16        \n" // k11

                    "pld        [%3, #256]          \n"
                    "vld1.u16   {d10-d12}, [%3 :64] \n" // r14 r15 r16

                    "vshll.u16  q0, d4, #16         \n"
                    "vshll.u16  q1, d5, #16         \n"
                    "vshll.u16  q2, d6, #16         \n"
                    "vshll.u16  q3, d7, #16         \n"

                    "vmla.f32   q12, q11, q0        \n"
                    "vshll.u16  q4, d10, #16        \n"
                    "vmla.f32   q13, q11, q2        \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"
                    "vmla.f32   q14, q8, q1         \n"
                    "vshll.u16  q9, d19, #16        \n" // k12
                    "vmla.f32   q15, q8, q3         \n"
                    "vshll.u16  q8, d20, #16        \n" // k13
                    "vmla.f32   q12, q9, q2         \n"
                    "vshll.u16  q5, d11, #16        \n"
                    "vmla.f32   q13, q9, q4         \n"
                    "vshll.u16  q9, d21, #16        \n" // k14
                    "vmla.f32   q14, q8, q3         \n"
                    "vshll.u16  q6, d12, #16        \n"
                    "vmla.f32   q15, q8, q5         \n"
                    "vshll.u16  q10, d22, #16       \n" // k20
                    "vmla.f32   q12, q9, q4         \n"
                    "vmla.f32   q13, q9, q6         \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"

                    "pld        [%4, #256]          \n"
                    "vld1.u16   {d4-d7}, [%4 :64]!  \n" // r20 r21 r22 r23

                    "vshll.u16  q11, d23, #16       \n" // k21

                    "pld        [%4, #256]          \n"
                    "vld1.u16   {d10-d12}, [%4 :64] \n" // r24 r25 r26

                    "vshll.u16  q0, d4, #16         \n"
                    "vshll.u16  q1, d5, #16         \n"
                    "vshll.u16  q2, d6, #16         \n"
                    "vshll.u16  q3, d7, #16         \n"

                    "vmla.f32   q14, q10, q0        \n"
                    "vmla.f32   q15, q10, q2        \n"
                    "vshll.u16  q10, d16, #16       \n" // k22
                    "vmla.f32   q12, q11, q1        \n"
                    "vshll.u16  q4, d10, #16        \n"
                    "vmla.f32   q13, q11, q3        \n"
                    "vshll.u16  q11, d17, #16       \n" // k23
                    "vmla.f32   q14, q10, q2        \n"
                    "vshll.u16  q5, d11, #16        \n"
                    "vmla.f32   q15, q10, q4        \n"
                    "vshll.u16  q8, d18, #16        \n" // k24
                    "vmla.f32   q12, q11, q3        \n"
                    "vshll.u16  q6, d12, #16        \n"
                    "vmla.f32   q13, q11, q5        \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"
                    "vmla.f32   q14, q8, q4         \n"
                    "vshll.u16  q9, d19, #16        \n" // k30
                    "vmla.f32   q15, q8, q6         \n"

                    "pld        [%5, #256]          \n"
                    "vld1.u16   {d4-d7}, [%5 :64]!  \n" // r30 r31 r32 r33

                    "vshll.u16  q8, d20, #16        \n" // k31

                    "pld        [%5, #256]          \n"
                    "vld1.u16   {d10-d12}, [%5 :64] \n" // r34 r35 r36

                    "vshll.u16  q0, d4, #16         \n"
                    "vshll.u16  q1, d5, #16         \n"
                    "vshll.u16  q2, d6, #16         \n"
                    "vshll.u16  q3, d7, #16         \n"

                    "vmla.f32   q12, q9, q0         \n"
                    "vshll.u16  q4, d10, #16        \n"
                    "vmla.f32   q13, q9, q2         \n"
                    "vshll.u16  q9, d21, #16        \n" // k32
                    "vmla.f32   q14, q8, q1         \n"
                    "vshll.u16  q5, d11, #16        \n"
                    "vmla.f32   q15, q8, q3         \n"
                    "vshll.u16  q10, d22, #16       \n" // k33
                    "vmla.f32   q12, q9, q2         \n"
                    "vshll.u16  q6, d12, #16        \n"
                    "vmla.f32   q13, q9, q4         \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"
                    "vmla.f32   q14, q10, q3        \n"
                    "vshll.u16  q11, d23, #16       \n" // k34
                    "vmla.f32   q15, q10, q5        \n"
                    "vshll.u16  q10, d16, #16       \n" // k40
                    "vmla.f32   q12, q11, q4        \n"
                    "vmla.f32   q13, q11, q6        \n"

                    "pld        [%6, #256]          \n"
                    "vld1.u16   {d4-d7}, [%6 :64]!  \n" // r40 r41 r42 r43

                    "vshll.u16  q11, d17, #16       \n" // k41

                    "pld        [%6, #256]          \n"
                    "vld1.u16   {d10-d12}, [%6 :64] \n" // r44 r45 r46

                    "vshll.u16  q0, d4, #16         \n"
                    "vshll.u16  q1, d5, #16         \n"
                    "vshll.u16  q2, d6, #16         \n"
                    "vshll.u16  q3, d7, #16         \n"

                    "vmla.f32   q14, q10, q0        \n"
                    "vshll.u16  q4, d10, #16        \n"
                    "vmla.f32   q15, q10, q2        \n"
                    "vshll.u16  q8, d18, #16        \n" // k42
                    "vmla.f32   q12, q11, q1        \n"
                    "vshll.u16  q5, d11, #16        \n"
                    "vmla.f32   q13, q11, q3        \n"
                    "pld        [%7, #64]           \n"
                    "vld1.u16   {d20}, [%7 :64]     \n"
                    "vmla.f32   q14, q8, q2         \n"
                    "vshll.u16  q9, d19, #16        \n" // k43
                    "vmla.f32   q15, q8, q4         \n"
                    "vshll.u16  q8, d20, #16        \n" // k44
                    "vmla.f32   q12, q9, q3         \n"
                    "vshll.u16  q6, d12, #16        \n"
                    "vmla.f32   q13, q9, q5         \n"
                    "vmla.f32   q14, q8, q4         \n"
                    "vmla.f32   q15, q8, q6         \n"

                    "vadd.f32   q12, q12, q14       \n"
                    "vadd.f32   q13, q13, q15       \n"

                    "sub        %7, %7, #192        \n" // kptr -= 24 * 4;

                    "vshrn.u32  d24, q12, #16       \n"
                    "vshrn.u32  d25, q13, #16       \n"

                    "vst1.u16   {d24-d25}, [%0 :64]! \n"

                    : "=r"(outptr0),        // %0
                    "=r"(bias0_data_ptr), // %1
                    "=r"(r0),             // %2
                    "=r"(r1),             // %3
                    "=r"(r2),             // %4
                    "=r"(r3),             // %5
                    "=r"(r4),             // %6
                    "=r"(kptr)            // %7
                    : "0"(outptr0),
                    "1"(bias0_data_ptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "7"(kptr)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
            }
            for (; j < outw; j++)
            {
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%1, #128]       \n"
                    "ld1    {v16.4h, v17.4h}, [%1], #16 \n" // r00 r01

                    "prfm   pldl1keep, [%1, #192]       \n"
                    "ld1    {v18.4h, v19.4h, v20.4h}, [%1] \n" // r02 r03 r04

                    "shll   v14.4s, %12.4h, #16         \n"

                    "mov    v31.16b, %25.16b            \n" // sum00

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"

                    "shll   v18.4s, v18.4h, #16         \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "shll   v20.4s, v20.4h, #16         \n"

                    "shll2  v15.4s, %12.8h, #16         \n"
                    "fmul   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v14.4s, %13.4h, #16         \n"
                    "fmul   v29.4s, v15.4s, v17.4s      \n"
                    "shll2  v15.4s, %13.8h, #16         \n"
                    "fmul   v30.4s, v14.4s, v18.4s      \n"
                    "shll   v14.4s, %14.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"
                    "shll2  v15.4s, %14.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"

                    "prfm   pldl1keep, [%2, #128]       \n"
                    "ld1    {v16.4h, v17.4h}, [%2], #16 \n" // r10 r11

                    "prfm   pldl1keep, [%2, #192]       \n"
                    "ld1    {v18.4h, v19.4h, v20.4h}, [%2] \n" // r12 r13 r14

                    "shll   v14.4s, %15.4h, #16         \n"

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"

                    "shll   v18.4s, v18.4h, #16         \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "shll   v20.4s, v20.4h, #16         \n"

                    "fmla   v29.4s, v15.4s, v16.4s      \n"
                    "shll2  v15.4s, %15.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v17.4s      \n"
                    "shll   v14.4s, %16.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v18.4s      \n"
                    "shll2  v15.4s, %16.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v19.4s      \n"
                    "shll   v14.4s, %17.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v20.4s      \n"

                    "prfm   pldl1keep, [%3, #128]       \n"
                    "ld1    {v16.4h, v17.4h}, [%3], #16 \n" // r20 r21

                    "prfm   pldl1keep, [%3, #192]       \n"
                    "ld1    {v18.4h, v19.4h, v20.4h}, [%3] \n" // r22 r23 r24

                    "shll2  v15.4s, %17.8h, #16         \n"

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"

                    "shll   v18.4s, v18.4h, #16         \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "shll   v20.4s, v20.4h, #16         \n"

                    "fmla   v30.4s, v14.4s, v16.4s      \n"
                    "shll   v14.4s, %18.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v17.4s      \n"
                    "shll2  v15.4s, %18.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v18.4s      \n"
                    "shll   v14.4s, %19.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v19.4s      \n"
                    "shll2  v15.4s, %19.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v20.4s      \n"

                    "prfm   pldl1keep, [%4, #128]       \n"
                    "ld1    {v16.4h, v17.4h}, [%4], #16 \n" // r30 r31

                    "prfm   pldl1keep, [%4, #192]       \n"
                    "ld1    {v18.4h, v19.4h, v20.4h}, [%4] \n" // r32 r33 r34

                    "shll   v14.4s, %20.4h, #16         \n"

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"

                    "shll   v18.4s, v18.4h, #16         \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "shll   v20.4s, v20.4h, #16         \n"

                    "fmla   v31.4s, v15.4s, v16.4s      \n"
                    "shll2  v15.4s, %20.8h, #16         \n"
                    "fmla   v28.4s, v14.4s, v17.4s      \n"
                    "shll   v14.4s, %21.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v18.4s      \n"
                    "shll2  v15.4s, %21.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v19.4s      \n"
                    "shll   v14.4s, %22.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v20.4s      \n"

                    "prfm   pldl1keep, [%5, #128]       \n"
                    "ld1    {v16.4h, v17.4h}, [%5], #16 \n" // r40 r41

                    "prfm   pldl1keep, [%5, #192]       \n"
                    "ld1    {v18.4h, v19.4h, v20.4h}, [%5] \n" // r42 r43 r44

                    "shll2  v15.4s, %22.8h, #16         \n"

                    "shll   v16.4s, v16.4h, #16         \n"
                    "shll   v17.4s, v17.4h, #16         \n"

                    "shll   v18.4s, v18.4h, #16         \n"
                    "shll   v19.4s, v19.4h, #16         \n"
                    "shll   v20.4s, v20.4h, #16         \n"

                    "fmla   v28.4s, v14.4s, v16.4s      \n"
                    "shll   v14.4s, %23.4h, #16         \n"
                    "fmla   v29.4s, v15.4s, v17.4s      \n"
                    "shll2  v15.4s, %23.8h, #16         \n"
                    "fmla   v30.4s, v14.4s, v18.4s      \n"
                    "shll   v14.4s, %24.4h, #16         \n"
                    "fmla   v31.4s, v15.4s, v19.4s      \n"
                    "fmla   v28.4s, v14.4s, v20.4s      \n"

                    "fadd   v29.4s, v29.4s, v30.4s      \n"
                    "fadd   v31.4s, v31.4s, v28.4s      \n"
                    "fadd   v31.4s, v31.4s, v29.4s      \n"

                    "shrn   v31.4h, v31.4s, #16         \n"

                    "st1    {v31.4h}, [%0], #8          \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2),      // %3
                    "=r"(r3),      // %4
                    "=r"(r4)       // %5
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "4"(r3),
                    "5"(r4),
                    "w"(_k00_01), // %12
                    "w"(_k02_03), // %13
                    "w"(_k04_10), // %14
                    "w"(_k11_12), // %15
                    "w"(_k13_14), // %16
                    "w"(_k20_21), // %17
                    "w"(_k22_23), // %18
                    "w"(_k24_30), // %19
                    "w"(_k31_32), // %20
                    "w"(_k33_34), // %21
                    "w"(_k40_41), // %22
                    "w"(_k42_43), // %23
                    "w"(_k44),    // %24
                    "w"(_bias0)   // %25
                    : "memory", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
                asm volatile(
                    "pld        [%2, #128]          \n"
                    "vld1.u16   {d2-d3}, [%2 :64]!  \n" // r00 r01

                    "pld        [%2, #192]          \n"
                    "vld1.u16   {d6-d8}, [%2 :64]   \n" // r02 r03 r04

                    "vshll.u16  q0, d2, #16         \n"
                    "vshll.u16  q1, d3, #16         \n"

                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"

                    "vshll.u16  q8, d20, #16        \n" // k00

                    "pld        [%1, #128]          \n"
                    "vld1.f32   {d24-d25}, [%1]     \n" // sum0

                    "vshll.u16  q9, d21, #16        \n" // k01
                    "vmul.f32   q13, q8, q0         \n"
                    "vshll.u16  q10, d22, #16       \n" // k02
                    "vmul.f32   q14, q9, q1         \n"

                    "pld        [%3, #128]          \n"
                    "vld1.u16   {d14-d15}, [%3 :64]! \n" // r10 r11

                    "vshll.u16  q2, d6, #16         \n"
                    "vshll.u16  q3, d7, #16         \n"
                    "vshll.u16  q4, d8, #16         \n"

                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"
                    "vshll.u16  q11, d23, #16       \n" // k03
                    "vmul.f32   q15, q10, q2        \n"
                    "vshll.u16  q10, d16, #16       \n" // k04
                    "vmla.f32   q12, q11, q3        \n"
                    "vshll.u16  q11, d17, #16       \n" // k10
                    "vmla.f32   q13, q10, q4        \n"

                    "pld        [%3, #192]          \n"
                    "vld1.u16   {d8-d10}, [%3 :64]  \n" // r12 r13 r14

                    "vshll.u16  q6, d14, #16        \n"
                    "vshll.u16  q7, d15, #16        \n"

                    "vshll.u16  q8, d18, #16        \n" // k11
                    "vmla.f32   q14, q11, q6        \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"
                    "vshll.u16  q9, d19, #16        \n" // k12
                    "vmla.f32   q15, q8, q7         \n"

                    "pld        [%4, #128]          \n"
                    "vld1.u16   {d2-d3}, [%4 :64]!  \n" // r20 r21

                    "vshll.u16  q3, d8, #16         \n"
                    "vshll.u16  q4, d9, #16         \n"
                    "vshll.u16  q5, d10, #16        \n"

                    "vshll.u16  q8, d20, #16        \n" // k13
                    "vmla.f32   q12, q9, q3         \n"
                    "vshll.u16  q9, d21, #16        \n" // k14
                    "vmla.f32   q13, q8, q4         \n"
                    "vshll.u16  q10, d22, #16       \n" // k20
                    "vmla.f32   q14, q9, q5         \n"

                    "pld        [%4, #192]          \n"
                    "vld1.u16   {d6-d8}, [%4 :64]   \n" // r22 r23 r24

                    "vshll.u16  q0, d2, #16         \n"
                    "vshll.u16  q1, d3, #16         \n"

                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"
                    "vshll.u16  q11, d23, #16       \n" // k21
                    "vmla.f32   q15, q10, q0        \n"
                    "vshll.u16  q10, d16, #16       \n" // k22
                    "vmla.f32   q12, q11, q1        \n"

                    "pld        [%5, #128]          \n"
                    "vld1.u16   {d14-d15}, [%5 :64]! \n" // r30 r31

                    "vshll.u16  q2, d6, #16         \n"
                    "vshll.u16  q3, d7, #16         \n"
                    "vshll.u16  q4, d8, #16         \n"

                    "vshll.u16  q11, d17, #16       \n" // k23
                    "vmla.f32   q13, q10, q2        \n"
                    "vshll.u16  q8, d18, #16        \n" // k24
                    "vmla.f32   q14, q11, q3        \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d20-d23}, [%7 :64]! \n"
                    "vshll.u16  q9, d19, #16        \n" // k30
                    "vmla.f32   q15, q8, q4         \n"

                    "pld        [%5, #192]          \n"
                    "vld1.u16   {d8-d10}, [%5 :64]  \n" // r32 r33 r34

                    "vshll.u16  q6, d14, #16        \n"
                    "vshll.u16  q7, d15, #16        \n"

                    "vshll.u16  q8, d20, #16        \n" // k31
                    "vmla.f32   q12, q9, q6         \n"

                    "vshll.u16  q9, d21, #16        \n" // k32
                    "vmla.f32   q13, q8, q7         \n"

                    "pld        [%6, #128]          \n"
                    "vld1.u16   {d2-d3}, [%6 :64]!  \n" // r40 r41

                    "vshll.u16  q3, d8, #16         \n"
                    "vshll.u16  q4, d9, #16         \n"
                    "vshll.u16  q5, d10, #16        \n"

                    "vshll.u16  q10, d22, #16       \n" // k33
                    "vmla.f32   q14, q9, q3         \n"
                    "pld        [%7, #256]          \n"
                    "vld1.u16   {d16-d19}, [%7 :64]! \n"
                    "vshll.u16  q11, d23, #16       \n" // k34
                    "vmla.f32   q15, q10, q4        \n"
                    "vshll.u16  q10, d16, #16       \n" // k40
                    "vmla.f32   q12, q11, q5        \n"

                    "pld        [%6, #192]          \n"
                    "vld1.u16   {d6-d8}, [%6 :64]   \n" // r42 r43 r44

                    "vshll.u16  q0, d2, #16         \n"
                    "vshll.u16  q1, d3, #16         \n"

                    "vshll.u16  q11, d17, #16       \n" // k41
                    "vmla.f32   q13, q10, q0        \n"
                    "vshll.u16  q8, d18, #16        \n" // k42
                    "vmla.f32   q14, q11, q1        \n"

                    "vshll.u16  q2, d6, #16         \n"
                    "vshll.u16  q3, d7, #16         \n"
                    "vshll.u16  q4, d8, #16         \n"

                    "pld        [%7, #64]           \n"
                    "vld1.u16   {d20}, [%7 :64]     \n"
                    "vshll.u16  q9, d19, #16        \n" // k43
                    "vmla.f32   q15, q8, q2         \n"
                    "vshll.u16  q8, d20, #16        \n" // k44
                    "vmla.f32   q12, q9, q3         \n"

                    "vmla.f32   q13, q8, q4         \n"

                    "vadd.f32   q14, q14, q15       \n"
                    "vadd.f32   q12, q12, q13       \n"

                    "sub        %7, %7, #192        \n" // kptr -= 24 * 4;

                    "vadd.f32   q12, q12, q14       \n"

                    "vshrn.u32  d24, q12, #16       \n"

                    "vst1.u16   {d24}, [%0 :64]!    \n"

                    : "=r"(outptr0),        // %0
                    "=r"(bias0_data_ptr), // %1
                    "=r"(r0),             // %2
                    "=r"(r1),             // %3
                    "=r"(r2),             // %4
                    "=r"(r3),             // %5
                    "=r"(r4),             // %6
                    "=r"(kptr)            // %7
                    : "0"(outptr0),
                    "1"(bias0_data_ptr),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "7"(kptr)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
            }

            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
            r3 += tailstep;
            r4 += tailstep;
        }
    }
}


================================================
FILE: src/layer/arm/convolutiondepthwise_5x5_pack8_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convdw5x5s1_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int group = bottom_blob.c;

    const __fp16* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        __fp16 bias0_data[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};

        const __fp16* k0 = kernel.row<const __fp16>(g);

        __fp16* outptr0 = out.row<__fp16>(0);
        __fp16* outptr1 = out.row<__fp16>(1);

        const Mat img0 = bottom_blob.channel(g);

        const __fp16* r0 = img0.row<const __fp16>(0);
        const __fp16* r1 = img0.row<const __fp16>(1);
        const __fp16* r2 = img0.row<const __fp16>(2);
        const __fp16* r3 = img0.row<const __fp16>(3);
        const __fp16* r4 = img0.row<const __fp16>(4);
        const __fp16* r5 = img0.row<const __fp16>(5);

        int i = 0;
        for (; i + 1 < outh; i += 2)
        {
            int j = 0;
            for (; j + 3 < outw; j += 4)
            {
                const __fp16* bias0_data_ptr = bias ? bias + g * 8 : bias0_data;

                asm volatile(
                    "prfm   pldl1keep, [%18, #512]      \n"
                    "ld1    {v31.8h}, [%18]             \n" // sum13

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%2], #64 \n" // r0_0123

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%8], #64 \n" // w0_0123

                    "mov    v24.16b, v31.16b            \n" // sum00
                    "mov    v25.16b, v31.16b            \n" // sum01
                    "mov    v26.16b, v31.16b            \n" // sum02
                    "mov    v27.16b, v31.16b            \n" // sum03

                    "fmla   v24.8h, v16.8h, v0.8h       \n"
                    "fmla   v25.8h, v17.8h, v0.8h       \n"
                    "fmla   v26.8h, v18.8h, v0.8h       \n"
                    "fmla   v27.8h, v19.8h, v0.8h       \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%2] \n" // r0_4567

                    "fmla   v24.8h, v17.8h, v1.8h       \n"
                    "fmla   v25.8h, v18.8h, v1.8h       \n"
                    "fmla   v26.8h, v19.8h, v1.8h       \n"
                    "fmla   v27.8h, v20.8h, v1.8h       \n"

                    "mov    v28.16b, v31.16b            \n" // sum10

                    "fmla   v24.8h, v18.8h, v2.8h       \n"
                    "fmla   v25.8h, v19.8h, v2.8h       \n"
                    "fmla   v26.8h, v20.8h, v2.8h       \n"
                    "fmla   v27.8h, v21.8h, v2.8h       \n"

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%8], #64 \n" // w04 w1_012

                    "fmla   v24.8h, v19.8h, v3.8h       \n"
                    "fmla   v25.8h, v20.8h, v3.8h       \n"
                    "fmla   v26.8h, v21.8h, v3.8h       \n"
                    "fmla   v27.8h, v22.8h, v3.8h       \n"

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v8.8h, v9.8h, v10.8h, v11.8h}, [%3], #64 \n" // r1_0123

                    "fmla   v24.8h, v20.8h, v4.8h       \n"
                    "fmla   v25.8h, v21.8h, v4.8h       \n"
                    "fmla   v26.8h, v22.8h, v4.8h       \n"
                    "fmla   v27.8h, v23.8h, v4.8h       \n"

                    "mov    v29.16b, v31.16b            \n" // sum11
                    "mov    v30.16b, v31.16b            \n" // sum12

                    "fmla   v28.8h, v8.8h, v0.8h        \n"
                    "fmla   v29.8h, v9.8h, v0.8h        \n"
                    "fmla   v30.8h, v10.8h, v0.8h       \n"
                    "fmla   v31.8h, v11.8h, v0.8h       \n"

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%3] \n" // r1_4567

                    "fmla   v28.8h, v9.8h, v1.8h        \n"
                    "fmla   v29.8h, v10.8h, v1.8h       \n"
                    "fmla   v30.8h, v11.8h, v1.8h       \n"
                    "fmla   v31.8h, v12.8h, v1.8h       \n"

                    "fmla   v28.8h, v10.8h, v2.8h       \n"
                    "fmla   v29.8h, v11.8h, v2.8h       \n"
                    "fmla   v30.8h, v12.8h, v2.8h       \n"
                    "fmla   v31.8h, v13.8h, v2.8h       \n"

                    "fmla   v28.8h, v11.8h, v3.8h       \n"
                    "fmla   v29.8h, v12.8h, v3.8h       \n"
                    "fmla   v30.8h, v13.8h, v3.8h       \n"
                    "fmla   v31.8h, v14.8h, v3.8h       \n"

                    "fmla   v28.8h, v12.8h, v4.8h       \n"
                    "fmla   v29.8h, v13.8h, v4.8h       \n"
                    "fmla   v30.8h, v14.8h, v4.8h       \n"
                    "fmla   v31.8h, v15.8h, v4.8h       \n"

                    "fmla   v24.8h, v8.8h, v5.8h        \n"
                    "fmla   v25.8h, v9.8h, v5.8h        \n"
                    "fmla   v26.8h, v10.8h, v5.8h       \n"
                    "fmla   v27.8h, v11.8h, v5.8h       \n"

                    "fmla   v24.8h, v9.8h, v6.8h        \n"
                    "fmla   v25.8h, v10.8h, v6.8h       \n"
                    "fmla   v26.8h, v11.8h, v6.8h       \n"
                    "fmla   v27.8h, v12.8h, v6.8h       \n"

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%8], #64 \n" // w1_34 w2_01

                    "fmla   v24.8h, v10.8h, v7.8h       \n"
                    "fmla   v25.8h, v11.8h, v7.8h       \n"
                    "fmla   v26.8h, v12.8h, v7.8h       \n"
                    "fmla   v27.8h, v13.8h, v7.8h       \n"

                    "fmla   v24.8h, v11.8h, v0.8h       \n"
                    "fmla   v25.8h, v12.8h, v0.8h       \n"
                    "fmla   v26.8h, v13.8h, v0.8h       \n"
                    "fmla   v27.8h, v14.8h, v0.8h       \n"

                    "prfm   pldl1keep, [%4, #512]       \n"
                    "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%4], #64 \n" // r2_0123

                    "fmla   v24.8h, v12.8h, v1.8h       \n"
                    "fmla   v25.8h, v13.8h, v1.8h       \n"
                    "fmla   v26.8h, v14.8h, v1.8h       \n"
                    "fmla   v27.8h, v15.8h, v1.8h       \n"

                    "fmla   v28.8h, v16.8h, v5.8h       \n"
                    "fmla   v29.8h, v17.8h, v5.8h       \n"
                    "fmla   v30.8h, v18.8h, v5.8h       \n"
                    "fmla   v31.8h, v19.8h, v5.8h       \n"

                    "prfm   pldl1keep, [%4, #512]       \n"
                    "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4] \n" // r2_4567

                    "fmla   v28.8h, v17.8h, v6.8h       \n"
                    "fmla   v29.8h, v18.8h, v6.8h       \n"
                    "fmla   v30.8h, v19.8h, v6.8h       \n"
                    "fmla   v31.8h, v20.8h, v6.8h       \n"

                    "fmla   v28.8h, v18.8h, v7.8h       \n"
                    "fmla   v29.8h, v19.8h, v7.8h       \n"
                    "fmla   v30.8h, v20.8h, v7.8h       \n"
                    "fmla   v31.8h, v21.8h, v7.8h       \n"

                    "fmla   v28.8h, v19.8h, v0.8h       \n"
                    "fmla   v29.8h, v20.8h, v0.8h       \n"
                    "fmla   v30.8h, v21.8h, v0.8h       \n"
                    "fmla   v31.8h, v22.8h, v0.8h       \n"

                    "fmla   v28.8h, v20.8h, v1.8h       \n"
                    "fmla   v29.8h, v21.8h, v1.8h       \n"
                    "fmla   v30.8h, v22.8h, v1.8h       \n"
                    "fmla   v31.8h, v23.8h, v1.8h       \n"

                    "fmla   v24.8h, v16.8h, v2.8h       \n"
                    "fmla   v25.8h, v17.8h, v2.8h       \n"
                    "fmla   v26.8h, v18.8h, v2.8h       \n"
                    "fmla   v27.8h, v19.8h, v2.8h       \n"

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%8], #64 \n" // w2_234 w30

                    "fmla   v24.8h, v17.8h, v3.8h       \n"
                    "fmla   v25.8h, v18.8h, v3.8h       \n"
                    "fmla   v26.8h, v19.8h, v3.8h       \n"
                    "fmla   v27.8h, v20.8h, v3.8h       \n"

                    "fmla   v24.8h, v18.8h, v4.8h       \n"
                    "fmla   v25.8h, v19.8h, v4.8h       \n"
                    "fmla   v26.8h, v20.8h, v4.8h       \n"
                    "fmla   v27.8h, v21.8h, v4.8h       \n"

                    "fmla   v24.8h, v19.8h, v5.8h       \n"
                    "fmla   v25.8h, v20.8h, v5.8h       \n"
                    "fmla   v26.8h, v21.8h, v5.8h       \n"
                    "fmla   v27.8h, v22.8h, v5.8h       \n"

                    "prfm   pldl1keep, [%5, #512]       \n"
                    "ld1    {v8.8h, v9.8h, v10.8h, v11.8h}, [%5], #64 \n" // r3_0123

                    "fmla   v24.8h, v20.8h, v6.8h       \n"
                    "fmla   v25.8h, v21.8h, v6.8h       \n"
                    "fmla   v26.8h, v22.8h, v6.8h       \n"
                    "fmla   v27.8h, v23.8h, v6.8h       \n"

                    "fmla   v28.8h, v8.8h, v2.8h        \n"
                    "fmla   v29.8h, v9.8h, v2.8h        \n"
                    "fmla   v30.8h, v10.8h, v2.8h       \n"
                    "fmla   v31.8h, v11.8h, v2.8h       \n"

                    "prfm   pldl1keep, [%5, #512]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%5] \n" // r3_4567

                    "fmla   v28.8h, v9.8h, v3.8h        \n"
                    "fmla   v29.8h, v10.8h, v3.8h       \n"
                    "fmla   v30.8h, v11.8h, v3.8h       \n"
                    "fmla   v31.8h, v12.8h, v3.8h       \n"

                    "fmla   v28.8h, v10.8h, v4.8h       \n"
                    "fmla   v29.8h, v11.8h, v4.8h       \n"
                    "fmla   v30.8h, v12.8h, v4.8h       \n"
                    "fmla   v31.8h, v13.8h, v4.8h       \n"

                    "fmla   v28.8h, v11.8h, v5.8h       \n"
                    "fmla   v29.8h, v12.8h, v5.8h       \n"
                    "fmla   v30.8h, v13.8h, v5.8h       \n"
                    "fmla   v31.8h, v14.8h, v5.8h       \n"

                    "fmla   v28.8h, v12.8h, v6.8h       \n"
                    "fmla   v29.8h, v13.8h, v6.8h       \n"
                    "fmla   v30.8h, v14.8h, v6.8h       \n"
                    "fmla   v31.8h, v15.8h, v6.8h       \n"

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%8], #64 \n" // w3_1234

                    "fmla   v24.8h, v8.8h, v7.8h        \n"
                    "fmla   v25.8h, v9.8h, v7.8h        \n"
                    "fmla   v26.8h, v10.8h, v7.8h       \n"
                    "fmla   v27.8h, v11.8h, v7.8h       \n"

                    "fmla   v24.8h, v9.8h, v0.8h        \n"
                    "fmla   v25.8h, v10.8h, v0.8h       \n"
                    "fmla   v26.8h, v11.8h, v0.8h       \n"
                    "fmla   v27.8h, v12.8h, v0.8h       \n"

                    "fmla   v24.8h, v10.8h, v1.8h       \n"
                    "fmla   v25.8h, v11.8h, v1.8h       \n"
                    "fmla   v26.8h, v12.8h, v1.8h       \n"
                    "fmla   v27.8h, v13.8h, v1.8h       \n"

                    "fmla   v24.8h, v11.8h, v2.8h       \n"
                    "fmla   v25.8h, v12.8h, v2.8h       \n"
                    "fmla   v26.8h, v13.8h, v2.8h       \n"
                    "fmla   v27.8h, v14.8h, v2.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%6], #64 \n" // r4_0123

                    "fmla   v24.8h, v12.8h, v3.8h       \n"
                    "fmla   v25.8h, v13.8h, v3.8h       \n"
                    "fmla   v26.8h, v14.8h, v3.8h       \n"
                    "fmla   v27.8h, v15.8h, v3.8h       \n"

                    "fmla   v28.8h, v16.8h, v7.8h       \n"
                    "fmla   v29.8h, v17.8h, v7.8h       \n"
                    "fmla   v30.8h, v18.8h, v7.8h       \n"
                    "fmla   v31.8h, v19.8h, v7.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%6] \n" // r4_4567

                    "fmla   v28.8h, v17.8h, v0.8h       \n"
                    "fmla   v29.8h, v18.8h, v0.8h       \n"
                    "fmla   v30.8h, v19.8h, v0.8h       \n"
                    "fmla   v31.8h, v20.8h, v0.8h       \n"

                    "fmla   v28.8h, v18.8h, v1.8h       \n"
                    "fmla   v29.8h, v19.8h, v1.8h       \n"
                    "fmla   v30.8h, v20.8h, v1.8h       \n"
                    "fmla   v31.8h, v21.8h, v1.8h       \n"

                    "fmla   v28.8h, v19.8h, v2.8h       \n"
                    "fmla   v29.8h, v20.8h, v2.8h       \n"
                    "fmla   v30.8h, v21.8h, v2.8h       \n"
                    "fmla   v31.8h, v22.8h, v2.8h       \n"

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%8], #64 \n" // w4_0123

                    "fmla   v28.8h, v20.8h, v3.8h       \n"
                    "fmla   v29.8h, v21.8h, v3.8h       \n"
                    "fmla   v30.8h, v22.8h, v3.8h       \n"
                    "fmla   v31.8h, v23.8h, v3.8h       \n"

                    "fmla   v24.8h, v16.8h, v4.8h       \n"
                    "fmla   v25.8h, v17.8h, v4.8h       \n"
                    "fmla   v26.8h, v18.8h, v4.8h       \n"
                    "fmla   v27.8h, v19.8h, v4.8h       \n"

                    "fmla   v24.8h, v17.8h, v5.8h       \n"
                    "fmla   v25.8h, v18.8h, v5.8h       \n"
                    "fmla   v26.8h, v19.8h, v5.8h       \n"
                    "fmla   v27.8h, v20.8h, v5.8h       \n"

                    "fmla   v24.8h, v18.8h, v6.8h       \n"
                    "fmla   v25.8h, v19.8h, v6.8h       \n"
                    "fmla   v26.8h, v20.8h, v6.8h       \n"
                    "fmla   v27.8h, v21.8h, v6.8h       \n"

                    "prfm   pldl1keep, [%8, #128]       \n"
                    "ld1    {v0.8h}, [%8]               \n" // w44

                    "fmla   v24.8h, v19.8h, v7.8h       \n"
                    "fmla   v25.8h, v20.8h, v7.8h       \n"
                    "fmla   v26.8h, v21.8h, v7.8h       \n"
                    "fmla   v27.8h, v22.8h, v7.8h       \n"

                    "prfm   pldl1keep, [%7, #512]       \n"
                    "ld1    {v8.8h, v9.8h, v10.8h, v11.8h}, [%7], #64 \n" // r5_0123

                    "fmla   v24.8h, v20.8h, v0.8h       \n"
                    "fmla   v25.8h, v21.8h, v0.8h       \n"
                    "fmla   v26.8h, v22.8h, v0.8h       \n"
                    "fmla   v27.8h, v23.8h, v0.8h       \n"

                    "fmla   v28.8h, v8.8h, v4.8h        \n"
                    "fmla   v29.8h, v9.8h, v4.8h        \n"
                    "fmla   v30.8h, v10.8h, v4.8h       \n"
                    "fmla   v31.8h, v11.8h, v4.8h       \n"

                    "prfm   pldl1keep, [%7, #512]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%7] \n" // r5_4567

                    "fmla   v28.8h, v9.8h, v5.8h        \n"
                    "fmla   v29.8h, v10.8h, v5.8h       \n"
                    "fmla   v30.8h, v11.8h, v5.8h       \n"
                    "fmla   v31.8h, v12.8h, v5.8h       \n"

                    "fmla   v28.8h, v10.8h, v6.8h       \n"
                    "fmla   v29.8h, v11.8h, v6.8h       \n"
                    "fmla   v30.8h, v12.8h, v6.8h       \n"
                    "fmla   v31.8h, v13.8h, v6.8h       \n"

                    "fmla   v28.8h, v11.8h, v7.8h       \n"
                    "fmla   v29.8h, v12.8h, v7.8h       \n"
                    "fmla   v30.8h, v13.8h, v7.8h       \n"
                    "fmla   v31.8h, v14.8h, v7.8h       \n"

                    "fmla   v28.8h, v12.8h, v0.8h       \n"
                    "fmla   v29.8h, v13.8h, v0.8h       \n"
                    "fmla   v30.8h, v14.8h, v0.8h       \n"
                    "fmla   v31.8h, v15.8h, v0.8h       \n"

                    "sub    %8, %8, #384                \n" // k0 -= 24 * 8

                    "st1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%0], #64 \n"
                    "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%1], #64 \n"

                    : "=r"(outptr0), // %0
                    "=r"(outptr1), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3),      // %5
                    "=r"(r4),      // %6
                    "=r"(r5),      // %7
                    "=r"(k0)       // %8
                    : "0"(outptr0),
                    "1"(outptr1),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "7"(r5),
                    "8"(k0),
                    "r"(bias0_data_ptr) // %18
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }

            // Bias vector for group g (all-zero when no bias is supplied), kept in a
            // NEON register for the two narrower column loops below, which receive it
            // as a "w" operand. The preceding 4-column asm is instead handed
            // bias0_data_ptr in a general register and lists v0-v31 as clobbered --
            // presumably because it has no vector register to spare for the bias.
            float16x8_t _bias0 = bias ? vld1q_f16(bias + g * 8) : vdupq_n_f16((__fp16)0.f);

            // Two output columns for each of the two output rows per iteration.
            //   v28/v29 accumulate the sums stored through %0 (outptr0), fed by r0..r4.
            //   v30/v31 accumulate the sums stored through %1 (outptr1), fed by r1..r5.
            // Every input row is read as six consecutive 8-lane fp16 vectors: two with
            // a 32-byte post-increment and four without, so each row pointer ends up
            // advanced by two vectors. The 25 weight vectors are streamed from %8 (k0)
            // four at a time (six 64-byte post-incremented loads plus one final
            // single-vector load) and k0 is rewound by 384 bytes before the stores.
            // Rows r1..r4 contribute to both output rows, which is why the fmla groups
            // for v28/v29 and v30/v31 are interleaved around each weight reload.
            for (; j + 1 < outw; j += 2)
            {
                asm volatile(
                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v16.8h, v17.8h}, [%2], #32 \n" // r0_01

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%8], #64 \n" // w0_0123

                    "mov    v28.16b, %18.16b            \n" // sum00
                    "mov    v29.16b, %18.16b            \n" // sum01

                    "fmla   v28.8h, v16.8h, v0.8h       \n"
                    "fmla   v29.8h, v17.8h, v0.8h       \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v18.8h, v19.8h, v20.8h, v21.8h}, [%2] \n" // r0_2345

                    "mov    v30.16b, %18.16b            \n" // sum10
                    "mov    v31.16b, %18.16b            \n" // sum11

                    "fmla   v28.8h, v17.8h, v1.8h       \n"
                    "fmla   v29.8h, v18.8h, v1.8h       \n"

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%8], #64 \n" // w04 w1_012

                    "fmla   v28.8h, v18.8h, v2.8h       \n"
                    "fmla   v29.8h, v19.8h, v2.8h       \n"
                    "fmla   v28.8h, v19.8h, v3.8h       \n"
                    "fmla   v29.8h, v20.8h, v3.8h       \n"

                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v22.8h, v23.8h}, [%3], #32 \n" // r1_01

                    "fmla   v28.8h, v20.8h, v4.8h       \n"
                    "fmla   v29.8h, v21.8h, v4.8h       \n"

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%3] \n" // r1_2345

                    // r1 is the first row of the second output row: weight row 0 again
                    "fmla   v30.8h, v22.8h, v0.8h       \n"
                    "fmla   v31.8h, v23.8h, v0.8h       \n"
                    "fmla   v30.8h, v23.8h, v1.8h       \n"
                    "fmla   v31.8h, v24.8h, v1.8h       \n"
                    "fmla   v30.8h, v24.8h, v2.8h       \n"
                    "fmla   v31.8h, v25.8h, v2.8h       \n"
                    "fmla   v30.8h, v25.8h, v3.8h       \n"
                    "fmla   v31.8h, v26.8h, v3.8h       \n"
                    "fmla   v30.8h, v26.8h, v4.8h       \n"
                    "fmla   v31.8h, v27.8h, v4.8h       \n"

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%8], #64 \n" // w1_34 w2_01

                    "fmla   v28.8h, v22.8h, v5.8h       \n"
                    "fmla   v29.8h, v23.8h, v5.8h       \n"
                    "fmla   v28.8h, v23.8h, v6.8h       \n"
                    "fmla   v29.8h, v24.8h, v6.8h       \n"
                    "fmla   v28.8h, v24.8h, v7.8h       \n"
                    "fmla   v29.8h, v25.8h, v7.8h       \n"
                    "fmla   v28.8h, v25.8h, v0.8h       \n"
                    "fmla   v29.8h, v26.8h, v0.8h       \n"

                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v16.8h, v17.8h}, [%4], #32 \n" // r2_01

                    "fmla   v28.8h, v26.8h, v1.8h       \n"
                    "fmla   v29.8h, v27.8h, v1.8h       \n"

                    "prfm   pldl1keep, [%4, #512]       \n"
                    "ld1    {v18.8h, v19.8h, v20.8h, v21.8h}, [%4] \n" // r2_2345

                    "fmla   v30.8h, v16.8h, v5.8h       \n"
                    "fmla   v31.8h, v17.8h, v5.8h       \n"
                    "fmla   v30.8h, v17.8h, v6.8h       \n"
                    "fmla   v31.8h, v18.8h, v6.8h       \n"
                    "fmla   v30.8h, v18.8h, v7.8h       \n"
                    "fmla   v31.8h, v19.8h, v7.8h       \n"
                    "fmla   v30.8h, v19.8h, v0.8h       \n"
                    "fmla   v31.8h, v20.8h, v0.8h       \n"
                    "fmla   v30.8h, v20.8h, v1.8h       \n"
                    "fmla   v31.8h, v21.8h, v1.8h       \n"

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%8], #64 \n" // w2_234 w30

                    "fmla   v28.8h, v16.8h, v2.8h       \n"
                    "fmla   v29.8h, v17.8h, v2.8h       \n"
                    "fmla   v28.8h, v17.8h, v3.8h       \n"
                    "fmla   v29.8h, v18.8h, v3.8h       \n"
                    "fmla   v28.8h, v18.8h, v4.8h       \n"
                    "fmla   v29.8h, v19.8h, v4.8h       \n"
                    "fmla   v28.8h, v19.8h, v5.8h       \n"
                    "fmla   v29.8h, v20.8h, v5.8h       \n"

                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v22.8h, v23.8h}, [%5], #32 \n" // r3_01

                    "fmla   v28.8h, v20.8h, v6.8h       \n"
                    "fmla   v29.8h, v21.8h, v6.8h       \n"

                    "prfm   pldl1keep, [%5, #512]       \n"
                    "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%5] \n" // r3_2345

                    "fmla   v30.8h, v22.8h, v2.8h       \n"
                    "fmla   v31.8h, v23.8h, v2.8h       \n"
                    "fmla   v30.8h, v23.8h, v3.8h       \n"
                    "fmla   v31.8h, v24.8h, v3.8h       \n"
                    "fmla   v30.8h, v24.8h, v4.8h       \n"
                    "fmla   v31.8h, v25.8h, v4.8h       \n"
                    "fmla   v30.8h, v25.8h, v5.8h       \n"
                    "fmla   v31.8h, v26.8h, v5.8h       \n"

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%8], #64 \n" // w3_1234

                    "fmla   v30.8h, v26.8h, v6.8h       \n"
                    "fmla   v31.8h, v27.8h, v6.8h       \n"

                    "fmla   v28.8h, v22.8h, v7.8h       \n"
                    "fmla   v29.8h, v23.8h, v7.8h       \n"
                    "fmla   v28.8h, v23.8h, v0.8h       \n"
                    "fmla   v29.8h, v24.8h, v0.8h       \n"
                    "fmla   v28.8h, v24.8h, v1.8h       \n"
                    "fmla   v29.8h, v25.8h, v1.8h       \n"
                    "fmla   v28.8h, v25.8h, v2.8h       \n"
                    "fmla   v29.8h, v26.8h, v2.8h       \n"

                    "prfm   pldl1keep, [%6, #256]       \n"
                    "ld1    {v16.8h, v17.8h}, [%6], #32 \n" // r4_01

                    "fmla   v28.8h, v26.8h, v3.8h       \n"
                    "fmla   v29.8h, v27.8h, v3.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v18.8h, v19.8h, v20.8h, v21.8h}, [%6] \n" // r4_2345

                    "fmla   v30.8h, v16.8h, v7.8h       \n"
                    "fmla   v31.8h, v17.8h, v7.8h       \n"
                    "fmla   v30.8h, v17.8h, v0.8h       \n"
                    "fmla   v31.8h, v18.8h, v0.8h       \n"
                    "fmla   v30.8h, v18.8h, v1.8h       \n"
                    "fmla   v31.8h, v19.8h, v1.8h       \n"

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%8], #64 \n" // w4_0123

                    "fmla   v30.8h, v19.8h, v2.8h       \n"
                    "fmla   v31.8h, v20.8h, v2.8h       \n"
                    "fmla   v30.8h, v20.8h, v3.8h       \n"
                    "fmla   v31.8h, v21.8h, v3.8h       \n"

                    "fmla   v28.8h, v16.8h, v4.8h       \n"
                    "fmla   v29.8h, v17.8h, v4.8h       \n"
                    "fmla   v28.8h, v17.8h, v5.8h       \n"
                    "fmla   v29.8h, v18.8h, v5.8h       \n"

                    // last weight vector: loaded without post-increment, so %8 sits
                    // exactly 384 bytes past its starting value at this point
                    "prfm   pldl1keep, [%8, #128]       \n"
                    "ld1    {v0.8h}, [%8]               \n" // w44

                    "fmla   v28.8h, v18.8h, v6.8h       \n"
                    "fmla   v29.8h, v19.8h, v6.8h       \n"
                    "fmla   v28.8h, v19.8h, v7.8h       \n"
                    "fmla   v29.8h, v20.8h, v7.8h       \n"

                    "prfm   pldl1keep, [%7, #256]       \n"
                    "ld1    {v22.8h, v23.8h}, [%7], #32 \n" // r5_01

                    "fmla   v28.8h, v20.8h, v0.8h       \n"
                    "fmla   v29.8h, v21.8h, v0.8h       \n"

                    "prfm   pldl1keep, [%7, #512]       \n"
                    "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%7] \n" // r5_2345

                    // r5 only contributes to the second output row (weight row 4)
                    "fmla   v30.8h, v22.8h, v4.8h       \n"
                    "fmla   v31.8h, v23.8h, v4.8h       \n"
                    "fmla   v30.8h, v23.8h, v5.8h       \n"
                    "fmla   v31.8h, v24.8h, v5.8h       \n"
                    "fmla   v30.8h, v24.8h, v6.8h       \n"
                    "fmla   v31.8h, v25.8h, v6.8h       \n"
                    "fmla   v30.8h, v25.8h, v7.8h       \n"
                    "fmla   v31.8h, v26.8h, v7.8h       \n"
                    "fmla   v30.8h, v26.8h, v0.8h       \n"
                    "fmla   v31.8h, v27.8h, v0.8h       \n"

                    "sub    %8, %8, #384                \n" // k0 -= 24 * 8

                    "st1    {v28.8h, v29.8h}, [%0], #32 \n"
                    "st1    {v30.8h, v31.8h}, [%1], #32 \n"

                    // Outputs %0-%8 are tied to inputs %9-%17 through the matching
                    // constraints "0".."8", so each pointer is effectively read-write.
                    : "=r"(outptr0), // %0
                    "=r"(outptr1), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3),      // %5
                    "=r"(r4),      // %6
                    "=r"(r5),      // %7
                    "=r"(k0)       // %8
                    : "0"(outptr0),
                    "1"(outptr1),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "7"(r5),
                    "8"(k0),
                    "w"(_bias0) // %18
                    // v8-v15 are not touched by this kernel and are left out of the clobbers
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            // One remaining output column for each of the two output rows per iteration.
            //   v30 accumulates the sum stored through %0 (outptr0), fed by r0..r4.
            //   v31 accumulates the sum stored through %1 (outptr1), fed by r1..r5.
            // Every input row is read as five consecutive 8-lane fp16 vectors: one
            // with a 16-byte post-increment and four without, so each row pointer ends
            // up advanced by one vector. Weights are streamed from %8 (k0) exactly as
            // in the wider kernels and k0 is rewound by 384 bytes before the stores.
            for (; j < outw; j++)
            {
                asm volatile(
                    "prfm   pldl1keep, [%2, #128]       \n"
                    "ld1    {v16.8h}, [%2], #16         \n" // r0_0

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v17.8h, v18.8h, v19.8h, v20.8h}, [%2] \n" // r0_1234

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%8], #64 \n" // w0_0123

                    "mov    v30.16b, %18.16b            \n" // sum00
                    "mov    v31.16b, %18.16b            \n" // sum10

                    "fmla   v30.8h, v16.8h, v0.8h       \n"
                    "fmla   v30.8h, v17.8h, v1.8h       \n"

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%8], #64 \n" // w04 w1_012

                    "fmla   v30.8h, v18.8h, v2.8h       \n"

                    "prfm   pldl1keep, [%3, #128]       \n"
                    "ld1    {v21.8h}, [%3], #16         \n" // r1_0

                    "fmla   v30.8h, v19.8h, v3.8h       \n"
                    "fmla   v30.8h, v20.8h, v4.8h       \n"

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v22.8h, v23.8h, v24.8h, v25.8h}, [%3] \n" // r1_1234

                    // r1 is the first row of the second output row: weight row 0 again
                    "fmla   v31.8h, v21.8h, v0.8h       \n"
                    "fmla   v31.8h, v22.8h, v1.8h       \n"
                    "fmla   v31.8h, v23.8h, v2.8h       \n"
                    "fmla   v31.8h, v24.8h, v3.8h       \n"
                    "fmla   v31.8h, v25.8h, v4.8h       \n"

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%8], #64 \n" // w1_34 w2_01

                    "fmla   v30.8h, v21.8h, v5.8h       \n"
                    "fmla   v30.8h, v22.8h, v6.8h       \n"
                    "fmla   v30.8h, v23.8h, v7.8h       \n"

                    "prfm   pldl1keep, [%4, #128]       \n"
                    "ld1    {v16.8h}, [%4], #16         \n" // r2_0

                    "fmla   v30.8h, v24.8h, v0.8h       \n"
                    "fmla   v30.8h, v25.8h, v1.8h       \n"

                    "prfm   pldl1keep, [%4, #512]       \n"
                    "ld1    {v17.8h, v18.8h, v19.8h, v20.8h}, [%4] \n" // r2_1234

                    "fmla   v31.8h, v16.8h, v5.8h       \n"
                    "fmla   v31.8h, v17.8h, v6.8h       \n"
                    "fmla   v31.8h, v18.8h, v7.8h       \n"
                    "fmla   v31.8h, v19.8h, v0.8h       \n"
                    "fmla   v31.8h, v20.8h, v1.8h       \n"

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%8], #64 \n" // w2_234 w30

                    "fmla   v30.8h, v16.8h, v2.8h       \n"
                    "fmla   v30.8h, v17.8h, v3.8h       \n"
                    "fmla   v30.8h, v18.8h, v4.8h       \n"

                    "prfm   pldl1keep, [%5, #128]       \n"
                    "ld1    {v21.8h}, [%5], #16         \n" // r3_0

                    "fmla   v30.8h, v19.8h, v5.8h       \n"
                    "fmla   v30.8h, v20.8h, v6.8h       \n"

                    "prfm   pldl1keep, [%5, #512]       \n"
                    "ld1    {v22.8h, v23.8h, v24.8h, v25.8h}, [%5] \n" // r3_1234

                    "fmla   v31.8h, v21.8h, v2.8h       \n"
                    "fmla   v31.8h, v22.8h, v3.8h       \n"
                    "fmla   v31.8h, v23.8h, v4.8h       \n"

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%8], #64 \n" // w3_1234

                    "fmla   v31.8h, v24.8h, v5.8h       \n"
                    "fmla   v31.8h, v25.8h, v6.8h       \n"

                    "fmla   v30.8h, v21.8h, v7.8h       \n"
                    "fmla   v30.8h, v22.8h, v0.8h       \n"
                    "fmla   v30.8h, v23.8h, v1.8h       \n"

                    "prfm   pldl1keep, [%6, #128]       \n"
                    "ld1    {v16.8h}, [%6], #16         \n" // r4_0

                    "fmla   v30.8h, v24.8h, v2.8h       \n"
                    "fmla   v30.8h, v25.8h, v3.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v17.8h, v18.8h, v19.8h, v20.8h}, [%6] \n" // r4_1234

                    "fmla   v31.8h, v16.8h, v7.8h       \n"
                    "fmla   v31.8h, v17.8h, v0.8h       \n"
                    "fmla   v31.8h, v18.8h, v1.8h       \n"

                    "prfm   pldl1keep, [%8, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%8], #64 \n" // w4_0123

                    "fmla   v31.8h, v19.8h, v2.8h       \n"
                    "fmla   v31.8h, v20.8h, v3.8h       \n"

                    "fmla   v30.8h, v16.8h, v4.8h       \n"
                    "fmla   v30.8h, v17.8h, v5.8h       \n"

                    // last weight vector: loaded without post-increment, so %8 sits
                    // exactly 384 bytes past its starting value at this point
                    "prfm   pldl1keep, [%8, #128]       \n"
                    "ld1    {v0.8h}, [%8]               \n" // w44

                    "fmla   v30.8h, v18.8h, v6.8h       \n"

                    "prfm   pldl1keep, [%7, #128]       \n"
                    "ld1    {v21.8h}, [%7], #16         \n" // r5_0

                    "fmla   v30.8h, v19.8h, v7.8h       \n"
                    "fmla   v30.8h, v20.8h, v0.8h       \n"

                    "prfm   pldl1keep, [%7, #512]       \n"
                    "ld1    {v22.8h, v23.8h, v24.8h, v25.8h}, [%7] \n" // r5_1234

                    // r5 only contributes to the second output row (weight row 4)
                    "fmla   v31.8h, v21.8h, v4.8h       \n"
                    "fmla   v31.8h, v22.8h, v5.8h       \n"
                    "fmla   v31.8h, v23.8h, v6.8h       \n"
                    "fmla   v31.8h, v24.8h, v7.8h       \n"
                    "fmla   v31.8h, v25.8h, v0.8h       \n"

                    "sub    %8, %8, #384                \n" // k0 -= 24 * 8

                    "st1    {v30.8h}, [%0], #16         \n"
                    "st1    {v31.8h}, [%1], #16         \n"

                    // Outputs %0-%8 are tied to inputs %9-%17 through the matching
                    // constraints "0".."8", so each pointer is effectively read-write.
                    : "=r"(outptr0), // %0
                    "=r"(outptr1), // %1
                    "=r"(r0),      // %2
                    "=r"(r1),      // %3
                    "=r"(r2),      // %4
                    "=r"(r3),      // %5
                    "=r"(r4),      // %6
                    "=r"(r5),      // %7
                    "=r"(k0)       // %8
                    : "0"(outptr0),
                    "1"(outptr1),
                    "2"(r0),
                    "3"(r1),
                    "4"(r2),
                    "5"(r3),
                    "6"(r4),
                    "7"(r5),
                    "8"(k0),
                    "w"(_bias0) // %18
                    // only v0-v7, v16-v25 and the two accumulators are written here
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v30", "v31");
            }

            // Move all six input row pointers on to the next pair of output rows.
            // The column loops above advance each row pointer by one 8-lane vector
            // per output column, so 4 * 8 + w * 8 is the remaining distance to the
            // same column two rows further down.
            // NOTE(review): this assumes w == outw + 4 and that the pointers count
            // fp16 elements (8 per vector) -- confirm against the declarations,
            // which are outside this excerpt.
            r0 += 4 * 8 + w * 8;
            r1 += 4 * 8 + w * 8;
            r2 += 4 * 8 + w * 8;
            r3 += 4 * 8 + w * 8;
            r4 += 4 * 8 + w * 8;
            r5 += 4 * 8 + w * 8;

            // Each output pointer has already been post-incremented by the stores
            // for one full row of outw vectors; adding another outw * 8 steps over
            // the row written through the other pointer (presumably outptr1 starts
            // one row after outptr0 -- verify at the point of initialisation).
            outptr0 += outw * 8;
            outptr1 += outw * 8;
        }

        // Bias vector for group g (all-zero when no bias is supplied) for the
        // single-row loop below. It is declared again here because the identically
        // initialised _bias0 used by the two-row path is local to that loop's body.
        float16x8_t _bias0 = bias ? vld1q_f16(bias + g * 8) : vdupq_n_f16((__fp16)0.f);

        for (; i < outh; i++)
        {
            int j = 0;
            // Four output columns of a single output row per iteration.
            //   v28..v31 accumulate the four sums and are stored through %0 (outptr0).
            // Operands: %0 = outptr0, %1..%5 = r0..r4, %6 = k0, %14 = _bias0.
            // Every input row is read as eight consecutive 8-lane fp16 vectors: four
            // with a 64-byte post-increment and four without, so each row pointer ends
            // up advanced by four vectors. Row data alternates between the register
            // sets v12-v19 and v20-v27 so the next row can be loaded while the
            // previous one is still being consumed. The 25 weight vectors are streamed
            // from %6 (six 64-byte post-incremented loads plus one final single-vector
            // load) and k0 is rewound by 384 bytes before the store.
            for (; j + 3 < outw; j += 4)
            {
                asm volatile(
                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%1], #64 \n" // r0_0123

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%6], #64 \n" // w0_0123

                    "mov    v28.16b, %14.16b            \n" // sum00
                    "mov    v29.16b, %14.16b            \n" // sum01
                    "mov    v30.16b, %14.16b            \n" // sum02
                    "mov    v31.16b, %14.16b            \n" // sum03

                    "fmla   v28.8h, v12.8h, v0.8h       \n"
                    "fmla   v29.8h, v13.8h, v0.8h       \n"
                    "fmla   v30.8h, v14.8h, v0.8h       \n"
                    "fmla   v31.8h, v15.8h, v0.8h       \n"

                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%1] \n" // r0_4567

                    "fmla   v28.8h, v13.8h, v1.8h       \n"
                    "fmla   v29.8h, v14.8h, v1.8h       \n"
                    "fmla   v30.8h, v15.8h, v1.8h       \n"
                    "fmla   v31.8h, v16.8h, v1.8h       \n"

                    "fmla   v28.8h, v14.8h, v2.8h       \n"
                    "fmla   v29.8h, v15.8h, v2.8h       \n"
                    "fmla   v30.8h, v16.8h, v2.8h       \n"
                    "fmla   v31.8h, v17.8h, v2.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%6], #64 \n" // w04 w1_012

                    "fmla   v28.8h, v15.8h, v3.8h       \n"
                    "fmla   v29.8h, v16.8h, v3.8h       \n"
                    "fmla   v30.8h, v17.8h, v3.8h       \n"
                    "fmla   v31.8h, v18.8h, v3.8h       \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%2], #64 \n" // r1_0123

                    "fmla   v28.8h, v16.8h, v4.8h       \n"
                    "fmla   v29.8h, v17.8h, v4.8h       \n"
                    "fmla   v30.8h, v18.8h, v4.8h       \n"
                    "fmla   v31.8h, v19.8h, v4.8h       \n"

                    "fmla   v28.8h, v20.8h, v5.8h       \n"
                    "fmla   v29.8h, v21.8h, v5.8h       \n"
                    "fmla   v30.8h, v22.8h, v5.8h       \n"
                    "fmla   v31.8h, v23.8h, v5.8h       \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%2] \n" // r1_4567

                    "fmla   v28.8h, v21.8h, v6.8h       \n"
                    "fmla   v29.8h, v22.8h, v6.8h       \n"
                    "fmla   v30.8h, v23.8h, v6.8h       \n"
                    "fmla   v31.8h, v24.8h, v6.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%6], #64 \n" // w1_34 w2_01

                    "fmla   v28.8h, v22.8h, v7.8h       \n"
                    "fmla   v29.8h, v23.8h, v7.8h       \n"
                    "fmla   v30.8h, v24.8h, v7.8h       \n"
                    "fmla   v31.8h, v25.8h, v7.8h       \n"

                    "fmla   v28.8h, v23.8h, v0.8h       \n"
                    "fmla   v29.8h, v24.8h, v0.8h       \n"
                    "fmla   v30.8h, v25.8h, v0.8h       \n"
                    "fmla   v31.8h, v26.8h, v0.8h       \n"

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%3], #64 \n" // r2_0123

                    "fmla   v28.8h, v24.8h, v1.8h       \n"
                    "fmla   v29.8h, v25.8h, v1.8h       \n"
                    "fmla   v30.8h, v26.8h, v1.8h       \n"
                    "fmla   v31.8h, v27.8h, v1.8h       \n"

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%3] \n" // r2_4567

                    "fmla   v28.8h, v12.8h, v2.8h       \n"
                    "fmla   v29.8h, v13.8h, v2.8h       \n"
                    "fmla   v30.8h, v14.8h, v2.8h       \n"
                    "fmla   v31.8h, v15.8h, v2.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%6], #64 \n" // w2_234 w30

                    "fmla   v28.8h, v13.8h, v3.8h       \n"
                    "fmla   v29.8h, v14.8h, v3.8h       \n"
                    "fmla   v30.8h, v15.8h, v3.8h       \n"
                    "fmla   v31.8h, v16.8h, v3.8h       \n"

                    "fmla   v28.8h, v14.8h, v4.8h       \n"
                    "fmla   v29.8h, v15.8h, v4.8h       \n"
                    "fmla   v30.8h, v16.8h, v4.8h       \n"
                    "fmla   v31.8h, v17.8h, v4.8h       \n"

                    "fmla   v28.8h, v15.8h, v5.8h       \n"
                    "fmla   v29.8h, v16.8h, v5.8h       \n"
                    "fmla   v30.8h, v17.8h, v5.8h       \n"
                    "fmla   v31.8h, v18.8h, v5.8h       \n"

                    "prfm   pldl1keep, [%4, #512]       \n"
                    "ld1    {v20.8h, v21.8h, v22.8h, v23.8h}, [%4], #64 \n" // r3_0123

                    "fmla   v28.8h, v16.8h, v6.8h       \n"
                    "fmla   v29.8h, v17.8h, v6.8h       \n"
                    "fmla   v30.8h, v18.8h, v6.8h       \n"
                    "fmla   v31.8h, v19.8h, v6.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%6], #64 \n" // w3_1234

                    "fmla   v28.8h, v20.8h, v7.8h       \n"
                    "fmla   v29.8h, v21.8h, v7.8h       \n"
                    "fmla   v30.8h, v22.8h, v7.8h       \n"
                    "fmla   v31.8h, v23.8h, v7.8h       \n"

                    "prfm   pldl1keep, [%4, #512]       \n"
                    "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%4] \n" // r3_4567

                    "fmla   v28.8h, v21.8h, v0.8h       \n"
                    "fmla   v29.8h, v22.8h, v0.8h       \n"
                    "fmla   v30.8h, v23.8h, v0.8h       \n"
                    "fmla   v31.8h, v24.8h, v0.8h       \n"

                    "fmla   v28.8h, v22.8h, v1.8h       \n"
                    "fmla   v29.8h, v23.8h, v1.8h       \n"
                    "fmla   v30.8h, v24.8h, v1.8h       \n"
                    "fmla   v31.8h, v25.8h, v1.8h       \n"

                    "prfm   pldl1keep, [%5, #512]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%5], #64 \n" // r4_0123

                    "fmla   v28.8h, v23.8h, v2.8h       \n"
                    "fmla   v29.8h, v24.8h, v2.8h       \n"
                    "fmla   v30.8h, v25.8h, v2.8h       \n"
                    "fmla   v31.8h, v26.8h, v2.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%6], #64 \n" // w4_0123

                    "fmla   v28.8h, v24.8h, v3.8h       \n"
                    "fmla   v29.8h, v25.8h, v3.8h       \n"
                    "fmla   v30.8h, v26.8h, v3.8h       \n"
                    "fmla   v31.8h, v27.8h, v3.8h       \n"

                    "fmla   v28.8h, v12.8h, v4.8h       \n"
                    "fmla   v29.8h, v13.8h, v4.8h       \n"
                    "fmla   v30.8h, v14.8h, v4.8h       \n"
                    "fmla   v31.8h, v15.8h, v4.8h       \n"

                    "prfm   pldl1keep, [%5, #512]       \n"
                    "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%5] \n" // r4_4567

                    "fmla   v28.8h, v13.8h, v5.8h       \n"
                    "fmla   v29.8h, v14.8h, v5.8h       \n"
                    "fmla   v30.8h, v15.8h, v5.8h       \n"
                    "fmla   v31.8h, v16.8h, v5.8h       \n"

                    "fmla   v28.8h, v14.8h, v6.8h       \n"
                    "fmla   v29.8h, v15.8h, v6.8h       \n"
                    "fmla   v30.8h, v16.8h, v6.8h       \n"
                    "fmla   v31.8h, v17.8h, v6.8h       \n"

                    // last weight vector: loaded without post-increment, so %6 sits
                    // exactly 384 bytes past its starting value at this point
                    "prfm   pldl1keep, [%6, #128]       \n"
                    "ld1    {v0.8h}, [%6]               \n" // w44

                    "fmla   v28.8h, v15.8h, v7.8h       \n"
                    "fmla   v29.8h, v16.8h, v7.8h       \n"
                    "fmla   v30.8h, v17.8h, v7.8h       \n"
                    "fmla   v31.8h, v18.8h, v7.8h       \n"

                    "fmla   v28.8h, v16.8h, v0.8h       \n"
                    "fmla   v29.8h, v17.8h, v0.8h       \n"
                    "fmla   v30.8h, v18.8h, v0.8h       \n"
                    "fmla   v31.8h, v19.8h, v0.8h       \n"

                    "sub    %6, %6, #384                \n" // k0 -= 24 * 8

                    "st1    {v28.8h, v29.8h, v30.8h, v31.8h}, [%0], #64 \n"

                    // Outputs %0-%6 are tied to inputs %7-%13 through the matching
                    // constraints "0".."6", so each pointer is effectively read-write.
                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2),      // %3
                    "=r"(r3),      // %4
                    "=r"(r4),      // %5
                    "=r"(k0)       // %6
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "4"(r3),
                    "5"(r4),
                    "6"(k0),
                    "w"(_bias0) // %14
                    // v8-v11 stay free, which leaves room for the "w" bias operand
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
            }
            for (; j + 1 < outw; j += 2)
            {
                asm volatile(
                    "prfm   pldl1keep, [%1, #256]       \n"
                    "ld1    {v16.8h, v17.8h}, [%1], #32 \n" // r0_01

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%6], #64 \n" // w0_0123

                    "mov    v30.16b, %14.16b            \n" // sum00
                    "mov    v31.16b, %14.16b            \n" // sum01

                    "fmla   v30.8h, v16.8h, v0.8h       \n"
                    "fmla   v31.8h, v17.8h, v0.8h       \n"

                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v18.8h, v19.8h, v20.8h, v21.8h}, [%1] \n" // r0_2345

                    "fmla   v30.8h, v17.8h, v1.8h       \n"
                    "fmla   v31.8h, v18.8h, v1.8h       \n"
                    "fmla   v30.8h, v18.8h, v2.8h       \n"
                    "fmla   v31.8h, v19.8h, v2.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%6], #64 \n" // w04 w1_012

                    "fmla   v30.8h, v19.8h, v3.8h       \n"
                    "fmla   v31.8h, v20.8h, v3.8h       \n"

                    "prfm   pldl1keep, [%2, #256]       \n"
                    "ld1    {v22.8h, v23.8h}, [%2], #32 \n" // r1_01

                    "fmla   v30.8h, v20.8h, v4.8h       \n"
                    "fmla   v31.8h, v21.8h, v4.8h       \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%2] \n" // r1_2345

                    "fmla   v30.8h, v22.8h, v5.8h       \n"
                    "fmla   v31.8h, v23.8h, v5.8h       \n"
                    "fmla   v30.8h, v23.8h, v6.8h       \n"
                    "fmla   v31.8h, v24.8h, v6.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%6], #64 \n" // w1_34 w2_01

                    "fmla   v30.8h, v24.8h, v7.8h       \n"
                    "fmla   v31.8h, v25.8h, v7.8h       \n"
                    "fmla   v30.8h, v25.8h, v0.8h       \n"
                    "fmla   v31.8h, v26.8h, v0.8h       \n"

                    "prfm   pldl1keep, [%3, #256]       \n"
                    "ld1    {v16.8h, v17.8h}, [%3], #32 \n" // r2_01

                    "fmla   v30.8h, v26.8h, v1.8h       \n"
                    "fmla   v31.8h, v27.8h, v1.8h       \n"

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v18.8h, v19.8h, v20.8h, v21.8h}, [%3] \n" // r2_2345

                    "fmla   v30.8h, v16.8h, v2.8h       \n"
                    "fmla   v31.8h, v17.8h, v2.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%6], #64 \n" // w2_234 w30

                    "fmla   v30.8h, v17.8h, v3.8h       \n"
                    "fmla   v31.8h, v18.8h, v3.8h       \n"
                    "fmla   v30.8h, v18.8h, v4.8h       \n"
                    "fmla   v31.8h, v19.8h, v4.8h       \n"

                    "prfm   pldl1keep, [%4, #256]       \n"
                    "ld1    {v22.8h, v23.8h}, [%4], #32 \n" // r3_01

                    "fmla   v30.8h, v19.8h, v5.8h       \n"
                    "fmla   v31.8h, v20.8h, v5.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%6], #64 \n" // w3_1234

                    "fmla   v30.8h, v20.8h, v6.8h       \n"
                    "fmla   v31.8h, v21.8h, v6.8h       \n"

                    "prfm   pldl1keep, [%4, #512]       \n"
                    "ld1    {v24.8h, v25.8h, v26.8h, v27.8h}, [%4] \n" // r3_2345

                    "fmla   v30.8h, v22.8h, v7.8h       \n"
                    "fmla   v31.8h, v23.8h, v7.8h       \n"
                    "fmla   v30.8h, v23.8h, v0.8h       \n"
                    "fmla   v31.8h, v24.8h, v0.8h       \n"
                    "fmla   v30.8h, v24.8h, v1.8h       \n"
                    "fmla   v31.8h, v25.8h, v1.8h       \n"

                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v16.8h, v17.8h}, [%5], #32 \n" // r4_01

                    "fmla   v30.8h, v25.8h, v2.8h       \n"
                    "fmla   v31.8h, v26.8h, v2.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%6], #64 \n" // w4_0123

                    "fmla   v30.8h, v26.8h, v3.8h       \n"
                    "fmla   v31.8h, v27.8h, v3.8h       \n"

                    "prfm   pldl1keep, [%5, #512]       \n"
                    "ld1    {v18.8h, v19.8h, v20.8h, v21.8h}, [%5] \n" // r4_2345

                    "fmla   v30.8h, v16.8h, v4.8h       \n"
                    "fmla   v31.8h, v17.8h, v4.8h       \n"
                    "fmla   v30.8h, v17.8h, v5.8h       \n"
                    "fmla   v31.8h, v18.8h, v5.8h       \n"

                    "prfm   pldl1keep, [%6, #128]       \n"
                    "ld1    {v0.8h}, [%6]               \n" // w44

                    "fmla   v30.8h, v18.8h, v6.8h       \n"
                    "fmla   v31.8h, v19.8h, v6.8h       \n"
                    "fmla   v30.8h, v19.8h, v7.8h       \n"
                    "fmla   v31.8h, v20.8h, v7.8h       \n"
                    "fmla   v30.8h, v20.8h, v0.8h       \n"
                    "fmla   v31.8h, v21.8h, v0.8h       \n"

                    "sub    %6, %6, #384                \n" // k0 -= 24 * 8

                    "st1    {v30.8h, v31.8h}, [%0], #32 \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2),      // %3
                    "=r"(r3),      // %4
                    "=r"(r4),      // %5
                    "=r"(k0)       // %6
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "4"(r3),
                    "5"(r4),
                    "6"(k0),
                    "w"(_bias0) // %14
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v30", "v31");
            }
            for (; j < outw; j++)
            {
                asm volatile(
                    "prfm   pldl1keep, [%1, #128]       \n"
                    "ld1    {v16.8h}, [%1], #16         \n" // r0_0

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%6], #64 \n" // w0_0123

                    "mov    v30.16b, %14.16b            \n" // sum00

                    "prfm   pldl1keep, [%1, #512]       \n"
                    "ld1    {v17.8h, v18.8h, v19.8h, v20.8h}, [%1] \n" // r0_1234

                    "fmla   v30.8h, v16.8h, v0.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%6], #64 \n" // w04 w1_012

                    "fmla   v30.8h, v17.8h, v1.8h       \n"

                    "fmla   v30.8h, v18.8h, v2.8h       \n"

                    "prfm   pldl1keep, [%2, #128]       \n"
                    "ld1    {v21.8h}, [%2], #16         \n" // r1_0

                    "fmla   v30.8h, v19.8h, v3.8h       \n"

                    "prfm   pldl1keep, [%2, #512]       \n"
                    "ld1    {v22.8h, v23.8h, v24.8h, v25.8h}, [%2] \n" // r1_1234

                    "fmla   v30.8h, v20.8h, v4.8h       \n"

                    "fmla   v30.8h, v21.8h, v5.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%6], #64 \n" // w1_34 w2_01

                    "fmla   v30.8h, v22.8h, v6.8h       \n"

                    "fmla   v30.8h, v23.8h, v7.8h       \n"

                    "prfm   pldl1keep, [%3, #128]       \n"
                    "ld1    {v16.8h}, [%3], #16         \n" // r2_0

                    "fmla   v30.8h, v24.8h, v0.8h       \n"

                    "prfm   pldl1keep, [%3, #512]       \n"
                    "ld1    {v17.8h, v18.8h, v19.8h, v20.8h}, [%3] \n" // r2_1234

                    "fmla   v30.8h, v25.8h, v1.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%6], #64 \n" // w2_234 w30

                    "fmla   v30.8h, v16.8h, v2.8h       \n"
                    "fmla   v30.8h, v17.8h, v3.8h       \n"

                    "prfm   pldl1keep, [%4, #128]       \n"
                    "ld1    {v21.8h}, [%4], #16         \n" // r3_0

                    "fmla   v30.8h, v18.8h, v4.8h       \n"

                    "prfm   pldl1keep, [%4, #512]       \n"
                    "ld1    {v22.8h, v23.8h, v24.8h, v25.8h}, [%4] \n" // r3_1234

                    "fmla   v30.8h, v19.8h, v5.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%6], #64 \n" // w3_1234

                    "fmla   v30.8h, v20.8h, v6.8h       \n"

                    "fmla   v30.8h, v21.8h, v7.8h       \n"
                    "fmla   v30.8h, v22.8h, v0.8h       \n"

                    "prfm   pldl1keep, [%5, #128]       \n"
                    "ld1    {v16.8h}, [%5], #16         \n" // r4_0

                    "fmla   v30.8h, v23.8h, v1.8h       \n"

                    "prfm   pldl1keep, [%6, #512]       \n"
                    "ld1    {v4.8h, v5.8h, v6.8h, v7.8h}, [%6], #64 \n" // w4_0123

                    "fmla   v30.8h, v24.8h, v2.8h       \n"

                    "prfm   pldl1keep, [%5, #512]       \n"
                    "ld1    {v17.8h, v18.8h, v19.8h, v20.8h}, [%5] \n" // r4_1234

                    "fmla   v30.8h, v25.8h, v3.8h       \n"

                    "fmla   v30.8h, v16.8h, v4.8h       \n"
                    "fmla   v30.8h, v17.8h, v5.8h       \n"

                    "prfm   pldl1keep, [%6, #128]       \n"
                    "ld1    {v0.8h}, [%6]               \n" // w44

                    "fmla   v30.8h, v18.8h, v6.8h       \n"
                    "fmla   v30.8h, v19.8h, v7.8h       \n"
                    "fmla   v30.8h, v20.8h, v0.8h       \n"

                    "sub    %6, %6, #384                \n" // k0 -= 24 * 8

                    "st1    {v30.8h}, [%0], #16         \n"

                    : "=r"(outptr0), // %0
                    "=r"(r0),      // %1
                    "=r"(r1),      // %2
                    "=r"(r2),      // %3
                    "=r"(r3),      // %4
                    "=r"(r4),      // %5
                    "=r"(k0)       // %6
                    : "0"(outptr0),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "4"(r3),
                    "5"(r4),
                    "6"(k0),
                    "w"(_bias0) // %14
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v30");
            }

            r0 += 4 * 8;
            r1 += 4 * 8;
            r2 += 4 * 8;
            r3 += 4 * 8;
            r4 += 4 * 8;
        }
    }
}

static void convdw5x5s2_pack8_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int group = bottom_blob.c;

    const int tailstep = (w - 2 * outw + w) * 8;

    const __fp16* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        float16x8_t _bias0 = bias ? vld1q_f16(bias + g * 8) : vdupq_n_f16((__fp16)0.f);

        const __fp16* k0 = kernel.row<const __fp16>(g);

        __fp16* outptr0 = out.row<__fp16>(0);

        const Mat img0 = bottom_blob.channel(g);

        const __fp16* r0 = img0.row<const __fp16>(0);
        const __fp16* r1 = img0.row<const __fp16>(1);
        const __fp16* r2 = img0.row<const __fp16>(2);
        const __fp16* r3 = img0.row<const __fp16>(3);
        const __fp16* r4 = img0.row<const __fp16>(4);

        int i = 0;
        for (; i < outh; i++)
        {
            int j = 0;
            for (; j < outw; j++)
            {
                float16x8_t _sum0 = _bias0;

                float16x8_t _r00 = vld1q_f16(r0);
                float16x8_t _r01 = vld1q_f16(r0 + 8);
                float16x8_t _r02 = vld1q_f16(r0 + 16);
                float16x8_t _r03 = vld1q_f16(r0 + 24);
                float16x8_t _r04 = vld1q_f16(r0 + 32);

                float16x8_t _k00 = vld1q_f16(k0);
                float16x8_t _k01 = vld1q_f16(k0 + 8);
                float16x8_t _k02 = vld1q_f16(k0 + 16);
                float16x8_t _k03 = vld1q_f16(k0 + 24);
                float16x8_t _k04 = vld1q_f16(k0 + 32);
                k0 += 40;

                _sum0 = vfmaq_f16(_sum0, _k00, _r00);
                _sum0 = vfmaq_f16(_sum0, _k01, _r01);
                _sum0 = vfmaq_f16(_sum0, _k02, _r02);
                _sum0 = vfmaq_f16(_sum0, _k03, _r03);
                _sum0 = vfmaq_f16(_sum0, _k04, _r04);

                float16x8_t _r10 = vld1q_f16(r1);
                float16x8_t _r11 = vld1q_f16(r1 + 8);
                float16x8_t _r12 = vld1q_f16(r1 + 16);
                float16x8_t _r13 = vld1q_f16(r1 + 24);
                float16x8_t _r14 = vld1q_f16(r1 + 32);

                float16x8_t _k10 = vld1q_f16(k0);
                float16x8_t _k11 = vld1q_f16(k0 + 8);
                float16x8_t _k12 = vld1q_f16(k0 + 16);
                float16x8_t _k13 = vld1q_f16(k0 + 24);
                float16x8_t _k14 = vld1q_f16(k0 + 32);
                k0 += 40;

                _sum0 = vfmaq_f16(_sum0, _k10, _r10);
                _sum0 = vfmaq_f16(_sum0, _k11, _r11);
                _sum0 = vfmaq_f16(_sum0, _k12, _r12);
                _sum0 = vfmaq_f16(_sum0, _k13, _r13);
                _sum0 = vfmaq_f16(_sum0, _k14, _r14);

                float16x8_t _r20 = vld1q_f16(r2);
                float16x8_t _r21 = vld1q_f16(r2 + 8);
                float16x8_t _r22 = vld1q_f16(r2 + 16);
                float16x8_t _r23 = vld1q_f16(r2 + 24);
                float16x8_t _r24 = vld1q_f16(r2 + 32);

                float16x8_t _k20 = vld1q_f16(k0);
                float16x8_t _k21 = vld1q_f16(k0 + 8);
                float16x8_t _k22 = vld1q_f16(k0 + 16);
                float16x8_t _k23 = vld1q_f16(k0 + 24);
                float16x8_t _k24 = vld1q_f16(k0 + 32);
                k0 += 40;

                _sum0 = vfmaq_f16(_sum0, _k20, _r20);
                _sum0 = vfmaq_f16(_sum0, _k21, _r21);
                _sum0 = vfmaq_f16(_sum0, _k22, _r22);
                _sum0 = vfmaq_f16(_sum0, _k23, _r23);
                _sum0 = vfmaq_f16(_sum0, _k24, _r24);

                float16x8_t _r30 = vld1q_f16(r3);
                float16x8_t _r31 = vld1q_f16(r3 + 8);
                float16x8_t _r32 = vld1q_f16(r3 + 16);
                float16x8_t _r33 = vld1q_f16(r3 + 24);
                float16x8_t _r34 = vld1q_f16(r3 + 32);

                float16x8_t _k30 = vld1q_f16(k0);
                float16x8_t _k31 = vld1q_f16(k0 + 8);
                float16x8_t _k32 = vld1q_f16(k0 + 16);
                float16x8_t _k33 = vld1q_f16(k0 + 24);
                float16x8_t _k34 = vld1q_f16(k0 + 32);
                k0 += 40;

                _sum0 = vfmaq_f16(_sum0, _k30, _r30);
                _sum0 = vfmaq_f16(_sum0, _k31, _r31);
                _sum0 = vfmaq_f16(_sum0, _k32, _r32);
                _sum0 = vfmaq_f16(_sum0, _k33, _r33);
                _sum0 = vfmaq_f16(_sum0, _k34, _r34);

                float16x8_t _r40 = vld1q_f16(r4);
                float16x8_t _r41 = vld1q_f16(r4 + 8);
                float16x8_t _r42 = vld1q_f16(r4 + 16);
                float16x8_t _r43 = vld1q_f16(r4 + 24);
                float16x8_t _r44 = vld1q_f16(r4 + 32);

                float16x8_t _k40 = vld1q_f16(k0);
                float16x8_t _k41 = vld1q_f16(k0 + 8);
                float16x8_t _k42 = vld1q_f16(k0 + 16);
                float16x8_t _k43 = vld1q_f16(k0 + 24);
                float16x8_t _k44 = vld1q_f16(k0 + 32);
                k0 -= 160;

                _sum0 = vfmaq_f16(_sum0, _k40, _r40);
                _sum0 = vfmaq_f16(_sum0, _k41, _r41);
                _sum0 = vfmaq_f16(_sum0, _k42, _r42);
                _sum0 = vfmaq_f16(_sum0, _k43, _r43);
                _sum0 = vfmaq_f16(_sum0, _k44, _r44);

                vst1q_f16(outptr0, _sum0);

                outptr0 += 8;

                r0 += 16;
                r1 += 16;
                r2 += 16;
                r3 += 16;
                r4 += 16;
            }

            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
            r3 += tailstep;
            r4 += tailstep;
        }
    }
}


================================================
FILE: src/layer/arm/convolutiondepthwise_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "convolutiondepthwise_arm.h"

#include "cpu.h"
#include "layer_type.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

namespace ncnn {

#if NCNN_GNU_INLINE_ASM
#include "convolutiondepthwise_3x3.h"
#include "convolutiondepthwise_5x5.h"

#if NCNN_INT8
#include "convolutiondepthwise_3x3_int8.h"
#endif // NCNN_INT8

#if __ARM_NEON
#include "convolutiondepthwise_3x3_pack4.h"
#include "convolutiondepthwise_5x5_pack4.h"

#if NCNN_BF16
#include "convolutiondepthwise_3x3_pack4_bf16s.h"
#include "convolutiondepthwise_5x5_pack4_bf16s.h"
#endif // NCNN_BF16

#if NCNN_INT8
#include "convolutiondepthwise_3x3_pack8_int8.h"
#endif // NCNN_INT8
#endif // __ARM_NEON
#endif // NCNN_GNU_INLINE_ASM

// Set the capability flags of the ARM depth-wise convolution layer according
// to the compile-time configuration and, for fp16, the running cpu.
ConvolutionDepthWise_arm::ConvolutionDepthWise_arm()
{
    // no activation layer yet, create_pipeline() assigns one
    activation = 0;

#if NCNN_BF16
    support_bf16_storage = true;
#endif // NCNN_BF16

#if __ARM_NEON
    support_packing = true;
#if NCNN_ARM82
    // fp16 storage depends on the asimdhp capability reported at runtime
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // NCNN_ARM82
#endif // __ARM_NEON
}

// Prepare the weights (and, where needed, the per-group Convolution ops) for
// the code path that forward() will take with the given options.
//
// Returns 0 from the paths handled here; the int8 and fp16 paths return
// whatever their dedicated create_pipeline_* routine returns.
int ConvolutionDepthWise_arm::create_pipeline(const Option& opt)
{
    // with dynamic weights nothing is prepared here
    if (dynamic_weight)
        return 0;

    activation = create_activation_layer(activation_type, activation_params, opt);

#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
        return create_pipeline_int8_arm(opt);
#endif // NCNN_INT8

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage)
        return create_pipeline_fp16s(opt);
#endif // NCNN_ARM82

    const int maxk = kernel_w * kernel_h;
    const int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    const bool is_depthwise = channels == group && group == num_output;
    if (!is_depthwise)
    {
        // group convolution
        create_group_ops(opt);

        if (opt.lightmode)
            weight_data.release();

        return 0;
    }

    // depth-wise from here on
    int elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout && channels % 4 == 0)
    {
        elempack = 4;
    }
#endif // __ARM_NEON

#if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        if (elempack == 1)
        {
            ncnn::cast_float32_to_bfloat16(weight_data, weight_data_tm, opt);
        }

#if __ARM_NEON
        if (elempack == 4)
        {
            // repack to pack4 first, then narrow to bf16
            Mat weight_data_r2 = weight_data.reshape(maxk, group);
            Mat weight_data_r2_packed;
            convert_packing(weight_data_r2, weight_data_r2_packed, 4, opt);

            ncnn::cast_float32_to_bfloat16(weight_data_r2_packed, weight_data_tm, opt);
        }
#endif // __ARM_NEON

        if (opt.lightmode)
            weight_data.release();

        return 0;
    }
#endif // NCNN_BF16

#if __ARM_NEON
    // pack4
    if (elempack == 4)
    {
        Mat weight_data_r2 = weight_data.reshape(maxk, group);
        convert_packing(weight_data_r2, weight_data_tm, 4, opt);
    }
#endif // __ARM_NEON

    if (elempack == 1)
    {
#if NCNN_GNU_INLINE_ASM
        // 3x3 and 5x5 kernels without dilation and with stride 1 or 2 use the
        // weights as they are; everything else goes through per-group ops
        const bool square_3_or_5 = (kernel_w == 3 && kernel_h == 3) || (kernel_w == 5 && kernel_h == 5);
        const bool undilated = dilation_w == 1 && dilation_h == 1;
        const bool stride_1_or_2 = (stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2);

        if (square_3_or_5 && undilated && stride_1_or_2)
        {
            weight_data_tm = weight_data;
        }
        else
#endif // NCNN_GNU_INLINE_ASM
        {
            // group convolution
            create_group_ops(opt);
        }
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int ConvolutionDepthWise_arm::create_group_ops(const Option& opt)
{
    // create Convolution op for each group
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    for (int i = 0; i < (int)group_ops.size(); i++)
        delete group_ops[i];

    group_ops.clear();

    const int channels_g = channels / group;
    const int num_output_g = num_output / group;

    group_ops.resize(group);

    for (int g = 0; g < group; g++)
    {
        Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone();
        Mat bias_data_g;
        if (bias_term)
            bias_data_g = bias_data.range(num_output_g * g, num_output_g);

        ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);

        // set param
        ncnn::ParamDict pd;
        pd.set(0, num_output_g); // num_output
        pd.set(1, kernel_w);
        pd.set(11, kernel_h);
        pd.set(2, dilation_w);
        pd.set(12, dilation_h);
        pd.set(3, stride_w);
        pd.set(13, stride_h);
        pd.set(4, 0);  // pad_w
        pd.set(14, 0); // pad_h
        pd.set(5, bias_term);
        pd.set(6, maxk * channels_g * num_output_g); // weight_data_size
        pd.set(8, int8_scale_term);
        pd.set(9, activation_type);
        pd.set(10, activation_params);

        op->load_param(pd);

        // set weights
        if (bias_term)
        {
            ncnn::Mat weights[5];
            weights[0] = weight_data_g;
            weights[1] = bias_data_g;

#if NCNN_INT8
            if (int8_scale_term)
            {
                Mat weight_data_int8_scales_g(num_output_g);
                weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
                weights[2] = weight_data_int8_scales_g;
                weights[3] = bottom_blob_int8_scales.range(g, 1);
            }
            if (int8_scale_term > 100)
            {
                weights[4] = top_blob_int8_scales.range(g, 1);
            }
#endif

            op->load_model(ModelBinFromMatArray(weights));
        }
        else
        {
            ncnn::Mat weights[4];
            weights[0] = weight_data_g;

#if NCNN_INT8
            if (int8_scale_term)
            {
                Mat weight_data_int8_scales_g(num_output_g);
                weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
                weights[1] = weight_data_int8_scales_g;
                weights[2] = bottom_blob_int8_scales.range(g, 1);
            }
            if (int8_scale_term > 100)
            {
                weights[3] = top_blob_int8_scales.range(g, 1);
            }
#endif

            op->load_model(ModelBinFromMatArray(weights));
        }

        op->create_pipeline(opt);

        group_ops[g] = op;
    }

    return 0;
}

int ConvolutionDepthWise_arm::destroy_pipeline(const Option& opt)
{
    if (activation)
    {
        activation->destroy_pipeline(opt);
        delete activation;
        activation = 0;
    }

    for (int i = 0; i < (int)group_ops.size(); i++)
    {
        group_ops[i]->destroy_pipeline(opt);
        delete group_ops[i];
    }
    group_ops.clear();

    return 0;
}

// Run the depth-wise / group convolution on a single input blob.
//
// Dispatch order:
//   1. int8 path when int8 inference is enabled and int8 scales are present
//   2. fp16 path (NCNN_ARM82) for 16-bit input when fp16 storage is enabled
//   3. bf16 path (NCNN_BF16) for 16-bit input when bf16 storage is enabled
//   4. fp32: specialised 3x3 / 5x5 kernels, a generic pack4 loop, or the
//      per-group Convolution ops held in group_ops
//
// Returns 0 on success, -100 when an intermediate or output Mat comes back
// empty, or the non-zero code returned by a per-group op.
int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (opt.use_int8_inference && int8_scale_term)
    {
        return forward_int8_arm(bottom_blob, top_blob, opt);
    }
#endif

    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        // fp16 storage either way; use_fp16_arithmetic picks the variant
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);
        else
            return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
#endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // footprint of the dilated kernel on the input
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    // make_padding fills bottom_blob_bordered; an empty result is a failure
    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    // from here on w/h describe the bordered input
    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __ARM_NEON
    // per-scalar size of the input times the output packing
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
#if NCNN_GNU_INLINE_ASM
            // specialised pack4 kernels; the activation layer, if any, is run
            // afterwards in place on the output
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw3x3s1_pack4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw3x3s2_pack4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw5x5s1_pack4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw5x5s2_pack4_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            else
#endif // NCNN_GNU_INLINE_ASM
            {
                // generic pack4 depth-wise loop for any kernel / stride / dilation
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                // space_ofs[k] is the offset, in packed pixels, of tap k from
                // the first tap of the window; gap moves from the end of one
                // tap row to the start of the next one
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    float* outptr = top_blob.channel(g);
                    // maxk taps of 4 floats each for this packed channel
                    const float* kptr = (const float*)weight_data_tm + maxk * g * 4;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            // start from the bias of this packed channel, or zero
                            float32x4_t _sum = vdupq_n_f32(0.f);

                            if (bias_term)
                            {
                                _sum = vld1q_f32(((const float*)bias_data) + g * 4);
                            }

                            // first tap of the window for output (i, j)
                            const float* sptr = m.row(i * stride_h) + j * stride_w * 4;

                            for (int k = 0; k < maxk; k++)
                            {
                                float32x4_t _val = vld1q_f32(sptr + space_ofs[k] * 4);
                                float32x4_t _w = vld1q_f32(kptr + k * 4);
                                _sum = vmlaq_f32(_sum, _val, _w);
                            }

                            // activation is applied per output here instead of
                            // through the separate activation layer
                            _sum = activation_ps(_sum, activation_type, activation_params);

                            vst1q_f32(outptr + j * 4, _sum);
                        }

                        outptr += outw * 4;
                    }
                }

                return 0;
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
#if NCNN_GNU_INLINE_ASM
            // specialised pack1 kernels; any other configuration falls through
            // to the group convolution code below
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw5x5s1_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw5x5s2_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
#endif // NCNN_GNU_INLINE_ASM
        }
    }

    // group convolution
    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;

    // packing usable inside a single group, on the input and output side
    int g_elempack = 1;
    int out_g_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        g_elempack = channels_g % 4 == 0 ? 4 : 1;
        out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
    }
#endif

    // unpacking
    // a pack4 input whose groups cannot be pack4 is converted to pack1,
    // allocated from the workspace allocator
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (elempack == 4 && g_elempack == 1)
    {
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p);
        if (bottom_blob_bordered_unpacked.empty())
            return -100;
    }

    // likewise the groups write into a pack1 scratch blob when the output is
    // pack4 but a single group cannot be
    Mat top_blob_unpacked = top_blob;
    if (out_g_elempack == 1 && out_elempack == 4)
    {
        top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
        if (top_blob_unpacked.empty())
            return -100;
    }

    for (int g = 0; g < group; g++)
    {
        // channel slices of group g, counted in packed channels
        const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
        Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

        const ncnn::Layer* op = group_ops[g];

        // the op is handed the allocator of the blob it writes into
        Option opt_g = opt;
        opt_g.blob_allocator = top_blob_unpacked.allocator;

        // forward
        int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
        if (ret != 0)
            return ret;
    }

    // packing
    if (out_g_elempack == 1 && out_elempack == 4)
    {
        convert_packing(top_blob_unpacked, top_blob, 4, opt);
        if (top_blob.empty())
            return -100;
    }
    else
    {
        top_blob = top_blob_unpacked;
    }

    return 0;
}

int ConvolutionDepthWise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.c * _weight_data.elempack;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

#if NCNN_ARM82
    if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && weight_data_flattened.elembits() == 16)
    {
        Mat weight_data_flattened_fp32;
        cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
        weight_data_flattened = weight_data_flattened_fp32;
    }
#endif // NCNN_ARM82
#if NCNN_BF16
    if (opt.use_bf16_storage && weight_data_flattened.elembits() == 16)
    {
        Mat weight_data_flattened_fp32;
        cast_bfloat16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
        weight_data_flattened = weight_data_flattened_fp32;
    }
#endif // NCNN_BF16

    // weight_data_flattened as pack1
    weight_data_flattened.w *= weight_data_flattened.elempack;
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
    weight_data_flattened.elempack = 1;

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;

#if NCNN_ARM82
        if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && bias_data_flattened.elembits() == 16)
        {
            Mat bias_data_flattened_fp32;
            cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
            bias_data_flattened = bias_data_flattened_fp32;
        }
#endif // NCNN_ARM82
#if NCNN_BF16
        if (opt.use_bf16_storage && bias_data_flattened.elembits() == 16)
        {
            Mat bias_data_flattened_fp32;
            cast_bfloat16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
            bias_data_flattened = bias_data_flattened_fp32;
        }
#endif // NCNN_BF16

        // bias_data_flattened as pack1
        bias_data_flattened.w *= bias_data_flattened.elempack;
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
        bias_data_flattened.elempack = 1;
    }

    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise);

    ncnn::ParamDict pd;
    pd.set(0, _num_output);
    pd.set(1, _kernel_w);
    pd.set(11, _kernel_h);
    pd.set(2, dilation_w);
    pd.set(12, dilation_h);
    pd.set(3, stride_w);
    pd.set(13, stride_h);
    pd.set(4, pad_left);
    pd.set(15, pad_right);
    pd.set(14, pad_top);
    pd.set(16, pad_bottom);
    pd.set(18, pad_value);
    pd.set(5, bias_term);
    pd.set(6, weight_data_flattened.w);
    pd.set(7, group);
    pd.set(8, int8_scale_term);
    pd.set(9, activation_type);
    pd.set(10, activation_params);

    op->load_param(pd);

    ncnn::Mat weights[2];
    weights[0] = weight_data_flattened;
    weights[1] = bias_data_flattened;

    op->load_model(ncnn::ModelBinFromMatArray(weights));

    op->create_pipeline(opt);

    op->forward(bottom_blob, top_blob, opt);

    op->destroy_pipeline(opt);

    delete op;

    return 0;
}

#if NCNN_BF16
// Forward pass for blobs stored as 16-bit bfloat values.
//
// Input and output elements are accessed as unsigned short and converted with
// bfloat2float / float2bfloat (vector) or bfloat16_to_float32 / float32_to_bfloat16
// (scalar); accumulation is done in fp32. bias_data is read as float.
//
// bottom_blob  input feature map; it is padded via make_padding before use
// top_blob     output; created here with opt.blob_allocator
// opt          packing layout, thread count and allocators
//
// Returns 0 on success, -100 when a Mat allocation/conversion yields an empty
// Mat, or the non-zero status of a per-group op in the group-convolution path.
int ConvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // spatial extent covered by the kernel once dilation is applied
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    // from here on w/h describe the padded input
    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __ARM_NEON
    // per-scalar size of the input times the output packing
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // depth-wise
    // (one group per input channel and one output channel per group)
    if (channels * elempack == group && group == num_output)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
#if NCNN_GNU_INLINE_ASM
            // Dedicated pack4 kernels for 3x3 / 5x5 with stride 1 / 2 and no dilation.
            // Activation is applied afterwards through the separate activation layer.
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw3x3s1_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw3x3s2_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw5x5s1_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw5x5s2_pack4_bf16s_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else
#endif // NCNN_GNU_INLINE_ASM
            {
                // Generic pack4 path; it is the only path when NCNN_GNU_INLINE_ASM is off.
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                // space_ofs[k] is the offset, in pixels of the padded input, from the
                // window origin to kernel tap k. gap moves from the end of one kernel
                // row to the start of the next (dilated) row.
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    unsigned short* outptr = top_blob.channel(g);
                    // maxk taps of 4 lanes each per packed channel
                    const unsigned short* kptr = (const unsigned short*)weight_data_tm + maxk * g * 4;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float32x4_t _sum = vdupq_n_f32(0.f);

                            if (bias_term)
                            {
                                _sum = vld1q_f32(((const float*)bias_data) + g * 4);
                            }

                            // top-left of the receptive field for output pixel (i, j)
                            const unsigned short* sptr = m.row<const unsigned short>(i * stride_h) + j * stride_w * 4;

                            for (int k = 0; k < maxk; k++)
                            {
                                // widen 4 bf16 lanes to fp32 and multiply-accumulate
                                float32x4_t _val = bfloat2float(vld1_u16(sptr + space_ofs[k] * 4));
                                float32x4_t _w = bfloat2float(vld1_u16(kptr + k * 4));
                                _sum = vmlaq_f32(_sum, _val, _w);
                            }

                            // activation is fused here, unlike the dedicated kernels above
                            _sum = activation_ps(_sum, activation_type, activation_params);

                            vst1_u16(outptr + j * 4, float2bfloat(_sum));
                        }

                        outptr += outw * 4;
                    }
                }
            }

            return 0;
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
#if NCNN_GNU_INLINE_ASM
            //             if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            //             {
            //                 convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
            //
            //                 if (activation)
            //                 {
            //                     activation->forward_inplace(top_blob, opt);
            //                 }
            //
            //                 return 0;
            //             }
            //             else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            //             {
            //                 convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
            //
            //                 if (activation)
            //                 {
            //                     activation->forward_inplace(top_blob, opt);
            //                 }
            //
            //                 return 0;
            //             }
            //             else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            //             {
            //                 convdw5x5s1_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
            //
            //                 if (activation)
            //                 {
            //                     activation->forward_inplace(top_blob, opt);
            //                 }
            //
            //                 return 0;
            //             }
            //             else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            //             {
            //                 convdw5x5s2_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);
            //
            //                 if (activation)
            //                 {
            //                     activation->forward_inplace(top_blob, opt);
            //                 }
            //
            //                 return 0;
            //             }
            //             else
#endif // NCNN_GNU_INLINE_ASM
            {
                // Scalar pack1 path. The specialised kernels above are commented out,
                // so this block always runs for elempack == 1.
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                // same layout as in the pack4 path: offset of tap k from the window origin
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < group; g++)
                {
                    unsigned short* outptr = top_blob.channel(g);
                    const unsigned short* kptr = (const unsigned short*)weight_data_tm + maxk * g;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float sum = 0.f;

                            if (bias_term)
                                sum = bias_data[g];

                            const unsigned short* sptr = m.row<const unsigned short>(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                float val = bfloat16_to_float32(sptr[space_ofs[k]]);
                                // note: this local w shadows the outer width variable
                                float w = bfloat16_to_float32(kptr[k]);
                                sum += val * w;
                            }

                            sum = activation_ss(sum, activation_type, activation_params);

                            outptr[j] = float32_to_bfloat16(sum);
                        }

                        outptr += outw;
                    }
                }
            }
        }

        // Reached after the pack1 path; an elempack that is neither 4 nor 1 also
        // ends up here without any output having been computed.
        return 0;
    }

    // group convolution
    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;

    // packing usable inside a single group, for the input and output side
    int g_elempack = 1;
    int out_g_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        g_elempack = channels_g % 4 == 0 ? 4 : 1;
        out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
    }
#endif

    // unpacking
    // the input is pack4 but a group's channel count is not a multiple of 4:
    // convert to pack1 so channel_range can slice per group
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (elempack == 4 && g_elempack == 1)
    {
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p);
        if (bottom_blob_bordered_unpacked.empty())
            return -100;
    }

    // likewise on the output side: groups write into a pack1 workspace blob
    // that is repacked to pack4 after the loop
    Mat top_blob_unpacked = top_blob;
    if (out_g_elempack == 1 && out_elempack == 4)
    {
        top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
        if (top_blob_unpacked.empty())
            return -100;
    }

    for (int g = 0; g < group; g++)
    {
        // channel slices of the input/output belonging to group g
        const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
        Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

        const ncnn::Layer* op = group_ops[g];

        // hand the per-group op the allocator that owns the destination blob
        Option opt_g = opt;
        opt_g.blob_allocator = top_blob_unpacked.allocator;

        // forward
        int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
        if (ret != 0)
            return ret;
    }

    // packing
    if (out_g_elempack == 1 && out_elempack == 4)
    {
        convert_packing(top_blob_unpacked, top_blob, 4, opt);
        if (top_blob.empty())
            return -100;
    }
    else
    {
        top_blob = top_blob_unpacked;
    }

    return 0;
}
#endif // NCNN_BF16

#if NCNN_INT8
int ConvolutionDepthWise_arm::create_pipeline_int8_arm(const Option& opt)
{
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // depth-wise
    if (channels == group && group == num_output)
    {
        int elempack = 1;
#if __ARM_NEON
        if (opt.use_packing_layout)
        {
            elempack = channels % 8 == 0 ? 8 : 1;
        }
#endif // __ARM_NEON

        if (elempack == 8)
        {
            Mat weight_data_r2 = weight_data.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_tm, 8, opt);
        }

        if (elempack == 1)
        {
            weight_data_tm = weight_data;
        }

        if (opt.lightmode)
            weight_data.release();

        return 0;
    }

    // group convolution
    create_group_ops(opt);

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int ConvolutionDepthWise_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int elempack = bottom_blob.elempack;

    int elembits = bottom_blob.elembits();

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_int8 = bottom_blob;
    if (elembits != 8)
    {
        const int channels_g = channels * elempack / group;

        Mat scales(channels * elempack);
        {
            float* ps = scales;
            for (int g = 0; g < group; g++)
            {
                float scale = bottom_blob_int8_scales[g];
                for (int q = 0; q < channels_g; q++)
                {
                    *ps++ = scale;
                }
            }
        }

        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q);
        if (bottom_blob_int8.empty())
            return -100;
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;
    channels = bottom_blob_bordered.c;
    elempack = bottom_blob_bordered.elempack;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
        int out_elempack = 1;
#if __ARM_NEON
        if (opt.use_packing_layout)
        {
            out_elempack = num_output % 8 == 0 ? 8 : 1;
        }
#endif // __ARM_NEON
        bool use_int8_requantize = int8_scale_term > 100;
        size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
#if NCNN_ARM82
        if (support_fp16_storage && opt.use_fp16_storage)
        {
            out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack;
        }
#endif
        if (opt.use_bf16_storage)
            out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack;

        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // TODO use fp16 / bf16
        out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __ARM_NEON
        if (elempack == 8)
        {
#if NCNN_GNU_INLINE_ASM
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (activation_type == 0 || activation_type == 1))
            {
                Mat top_blob_int32;
                top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)4u * out_elempack, out_elempack, opt.workspace_allocator);
                if (top_blob_int32.empty())
                    return -100;

                convdw3x3s1_pack8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt);

                Mat scale_in_data(group);
                for (int g = 0; g < group; g++)
                {
                    // dequantize
                    float scale_in;
                    if (weight_data_int8_scales[g] == 0)
                        scale_in = 0;
                    else
                        scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                    scale_in_data[g] = scale_in;
                }

                if (use_int8_requantize)
                {
                    requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
                }
                else
                {
                    dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

                    if (activation)
                    {
                        activation->forward_inplace(top_blob, opt);
                    }
                }
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (activation_type == 0 || activation_type == 1))
            {
                Mat top_blob_int32;
                top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)4u * out_elempack, out_elempack, opt.workspace_allocator);
                if (top_blob_int32.empty())
                    return -100;

                convdw3x3s2_pack8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt);

                Mat scale_in_data(group);
                for (int g = 0; g < group; g++)
                {
                    // dequantize
                    float scale_in;
                    if (weight_data_int8_scales[g] == 0)
                        scale_in = 0;
                    else
                        scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                    scale_in_data[g] = scale_in;
                }

                if (use_int8_requantize)
                {
                    requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
                }
                else
                {
                    dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

                    if (activation)
                    {
                        activation->forward_inplace(top_blob, opt);
                    }
                }
            }
            else
#endif // NCNN_GNU_INLINE_ASM
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    signed char* outptr_s8 = top_blob.channel(g);
                    float* outptr_f32 = top_blob.channel(g);
                    const signed char* kptr = (const signed char*)weight_data_tm + maxk * g * 8;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            int32x4_t _sum0 = vdupq_n_s32(0);
                            int32x4_t _sum1 = vdupq_n_s32(0);

                            const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w * 8;

                            for (int k = 0; k < maxk; k++)
                            {
                                int8x8_t _val = vld1_s8(sptr + space_ofs[k] * 8);
                                int8x8_t _w = vld1_s8(kptr + k * 8);
                                int16x8_t _s0 = vmull_s8(_val, _w);
                                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));
                            }

                            float32x4_t _scale_in0;
                            float32x4_t _scale_in1;
                            {
                                float32x4_t _bottom_blob_int8_scales0 = vld1q_f32((const float*)bottom_blob_int8_scales + g * 8);
                                float32x4_t _bottom_blob_int8_scales1 = vld1q_f32((const float*)bottom_blob_int8_scales + g * 8 + 4);
                                float32x4_t _weight_data_int8_scales0 = vld1q_f32((const float*)weight_data_int8_scales + g * 8);
                                float32x4_t _weight_data_int8_scales1 = vld1q_f32((const float*)weight_data_int8_scales + g * 8 + 4);
                                _scale_in0 = div_ps(vdupq_n_f32(1.f), vmulq_f32(_bottom_blob_int8_scales0, _weight_data_int8_scales0));
                                _scale_in1 = div_ps(vdupq_n_f32(1.f), vmulq_f32(_bottom_blob_int8_scales1, _weight_data_int8_scales1));

                                uint32x4_t _m0 = vmvnq_u32(vceqq_f32(_weight_data_int8_scales0, vdupq_n_f32(0.f)));
                                uint32x4_t _m1 = vmvnq_u32(vceqq_f32(_weight_data_int8_scales1, vdupq_n_f32(0.f)));
                                _scale_in0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(_scale_in0), _m0));
                                _scale_in1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(_scale_in1), _m1));
                            }

                            float32x4_t _sumfp32_0 = vmulq_f32(vcvtq_f32_s32(_sum0), _scale_in0);
                            float32x4_t _sumfp32_1 = vmulq_f32(vcvtq_f32_s32(_sum1), _scale_in1);

                            if (bias_term)
                            {
                                float32x4_t _bias0 = vld1q_f32((const float*)bias_data + g * 8);
                                float32x4_t _bias1 = vld1q_f32((const float*)bias_data + g * 8 + 4);
                                _sumfp32_0 = vaddq_f32(_sumfp32_0, _bias0);
                                _sumfp32_1 = vaddq_f32(_sumfp32_1, _bias1);
                            }

                            _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params);
                            _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params);

                            if (use_int8_requantize)
                            {
                                // requantize
                                float32x4_t _scale_out0 = vld1q_f32((const float*)top_blob_int8_scales + g * 8);
                                float32x4_t _scale_out1 = vld1q_f32((const float*)top_blob_int8_scales + g * 8 + 4);
                                int8x8_t _sum8 = float2int8(vmulq_f32(_sumfp32_0, _scale_out0), vmulq_f32(_sumfp32_1, _scale_out1));
                                vst1_s8(outptr_s8, _sum8);
                                outptr_s8 += 8;
                            }
                            else
                            {
                                // dequantize
                                vst1q_f32(outptr_f32, _sumfp32_0);
                                vst1q_f32(outptr_f32 + 4, _sumfp32_1);
                                outptr_f32 += 8;
                            }
                        }
                    }
                }
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
#if NCNN_GNU_INLINE_ASM
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (activation_type == 0 || activation_type == 1))
            {
                if (use_int8_requantize)
                {
                    std::vector<float> requantize_scales;
                    for (int g = 0; g < group; g++)
                    {
                        float scale_in;
                        if (weight_data_int8_scales[g] == 0)
                            scale_in = 0;
                        else
                            scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        float scale_out = top_blob_int8_scales[g];

                        requantize_scales.push_back(scale_in);
                        requantize_scales.push_back(scale_out);
                    }

                    convdw3x3s1_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, requantize_scales, opt);
                }
                else
                {
                    Mat top_blob_int32;
                    top_blob_int32.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
                    if (top_blob_int32.empty())
                        return -100;

                    convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt);
                    //                 convdw3x3s1_int8_dequant_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, bias_data, dequantize_scales, opt);

                    Mat scale_data(group);
                    for (int g = 0; g < group; g++)
                    {
                        // dequantize
                        float scale_in;
                        if (weight_data_int8_scales[g] == 0)
                            scale_in = 0;
                        else
                            scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        scale_data[g] = scale_in;
                    }

                    dequantize_from_int32(top_blob_int32, top_blob, scale_data, bias_data, opt);
                }

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (activation_type == 0 || activation_type == 1))
            {
                if (use_int8_requantize)
                {
                    std::vector<float> requantize_scales;
                    for (int g = 0; g < group; g++)
                    {
                        float scale_in;
                        if (weight_data_int8_scales[g] == 0)
                            scale_in = 0;
                        else
                            scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        float scale_out = top_blob_int8_scales[g];

                        requantize_scales.push_back(scale_in);
                        requantize_scales.push_back(scale_out);
                    }

                    convdw3x3s2_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, requantize_scales, opt);
                }
                else
                {
                    Mat top_blob_int32;
                    top_blob_int32.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
                    if (top_blob_int32.empty())
                        return -100;

                    convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt);
                    //                 convdw3x3s2_int8_dequant_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, bias_data, dequantize_scales, opt);

                    Mat scale_data(group);
                    for (int g = 0; g < group; g++)
                    {
                        // dequantize
                        float scale_in;
                        if (weight_data_int8_scales[g] == 0)
                            scale_in = 0;
                        else
                            scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        scale_data[g] = scale_in;
                    }

                    dequantize_from_int32(top_blob_int32, top_blob, scale_data, bias_data, opt);
                }

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else
#endif // NCNN_GNU_INLINE_ASM
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < group; g++)
                {
                    signed char* outptr_s8 = top_blob.channel(g);
                    float* outptr_f32 = top_blob.channel(g);
                    const signed char* kptr = (const signed char*)weight_data_tm + maxk * g;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            int sum = 0;

                            const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                signed char val = sptr[space_ofs[k]];
                                signed char w = kptr[k];
                                sum += val * w;
                            }

                            float scale_in;
                            if (weight_data_int8_scales[g] == 0)
                                scale_in = 0;
                            else
                                scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                            float sumfp32 = sum * scale_in;

                            if (bias_term)
                                sumfp32 += bias_data[g];

                            sumfp32 = activation_ss(sumfp32, activation_type, activation_params);

                            if (use_int8_requantize)
                            {
                                // requantize
                                float scale_out = top_blob_int8_scales[g];
                                signed char sums8 = float2int8(sumfp32 * scale_out);
                                outptr_s8[0] = sums8;
                                outptr_s8 += 1;
                            }
                            else
                            {
                                // dequantize
                                outptr_f32[0] = sumfp32;
                                outptr_f32 += 1;
                            }
                        }
                    }
                }
            }
        }

        return 0;
    }

    bool use_int8_requantize = int8_scale_term > 100;
    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        if (use_int8_requantize)
            out_elempack = num_output % 8 == 0 ? 8 : 1;
        else
            out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __ARM_NEON
    size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage)
    {
        out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack;
    }
#endif
    if (opt.use_bf16_storage)
        out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // group convolution
    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;

    int g_elempack = 1;
    int out_g_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        g_elempack = channels_g % 8 == 0 ? 8 : 1;
        if (use_int8_requantize)
            out_g_elempack = num_output_g % 8 == 0 ? 8 : 1;
        else
            out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
    }
#endif // __ARM_NEON

    // unpacking
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (elempack > g_elempack)
    {
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
        if (bottom_blob_bordered_unpacked.empty())
            return -100;
    }

    Mat top_blob_unpacked = top_blob;
    if (out_g_elempack < out_elempack)
    {
        top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
        if (top_blob_unpacked.empty())
            return -100;
    }

    for (int g = 0; g < group; g++)
    {
        const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
        Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

        const ncnn::Layer* op = group_ops[g];

        Option opt_g = opt;
        opt_g.blob_allocator = top_blob_unpacked.allocator;

        // forward
        int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
        if (ret != 0)
            return ret;
    }

    // packing
    if (out_g_elempack < out_elempack)
    {
        convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
        if (top_blob.empty())
            return -100;
    }
    else
    {
        top_blob = top_blob_unpacked;
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn


================================================
FILE: src/layer/arm/convolutiondepthwise_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONVOLUTIONDEPTHWISE_ARM_H
#define LAYER_CONVOLUTIONDEPTHWISE_ARM_H

#include "convolutiondepthwise.h"

namespace ncnn {

// ARM variant of the ConvolutionDepthWise layer.
//
// On top of the base-class interface it declares extra pipeline/forward
// entry points for the fp16, bf16 and int8 code paths; each group is only
// declared when the matching NCNN_ARM82 / NCNN_BF16 / NCNN_INT8 macro is set.
class ConvolutionDepthWise_arm : public ConvolutionDepthWise
{
public:
    ConvolutionDepthWise_arm();

    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // single-input forward
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // multi-input forward
    // NOTE(review): presumably the extra inputs carry runtime weights/bias - confirm in the .cpp
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // builds the per-group layers stored in group_ops (used by the non depth-wise path)
    int create_group_ops(const Option& opt);
#if NCNN_ARM82
    // fp16 weight preparation and the two fp16 forward variants
    // (fp16s: fp16 storage, fp16sa: fp16 storage + arithmetic)
    int create_pipeline_fp16s(const Option& opt);
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_INT8
    int create_pipeline_int8_arm(const Option& opt);
    int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif

public:
    // optional activation layer; the forward paths apply it in place to the
    // output after a specialized kernel when it is non-null
    Layer* activation;
    // one layer per group, indexed as group_ops[g] by the group-convolution path
    // NOTE(review): ownership/cleanup is not visible here - presumably destroy_pipeline
    std::vector<ncnn::Layer*> group_ops;

    // weights as prepared by the create_pipeline_* functions
    // (the fp16 path stores the repacked, fp16-converted weights here)
    Mat weight_data_tm;

    // fp16
    // fp16 copy of bias_data, filled by create_pipeline_fp16s
    Mat bias_data_fp16;
};

} // namespace ncnn

#endif // LAYER_CONVOLUTIONDEPTHWISE_ARM_H


================================================
FILE: src/layer/arm/convolutiondepthwise_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "convolutiondepthwise_arm.h"

#include "cpu.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

namespace ncnn {

#if NCNN_GNU_INLINE_ASM
#if __ARM_NEON
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "convolutiondepthwise_3x3_fp16s.h"
#include "convolutiondepthwise_3x3_pack8_fp16s.h"
#include "convolutiondepthwise_5x5_pack8_fp16s.h"
#endif
#endif // __ARM_NEON
#endif // NCNN_GNU_INLINE_ASM

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Prepare the fp16 weights (and bias) consumed by the fp16 forward paths.
//
// Depth-wise case (channels == group == num_output):
//   weight_data is repacked to the chosen elempack (8, 4 or 1), converted to
//   fp16 and stored in weight_data_tm; bias_data is converted to fp16 and
//   stored in bias_data_fp16.
// Otherwise (group convolution):
//   the work is delegated to create_group_ops().
//
// In both cases weight_data is released afterwards when opt.lightmode is set.
//
// Returns 0 on success, or the non-zero code reported by create_group_ops().
int ConvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt)
{
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // depth-wise
    if (channels == group && group == num_output)
    {
        int elempack = 1;

        if (opt.use_packing_layout)
        {
            // pack8 is only chosen when fp16 arithmetic is enabled
            elempack = opt.use_fp16_arithmetic && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
        }

        if (elempack == 1)
        {
            // no repacking needed, convert the raw weights directly
            ncnn::cast_float32_to_float16(weight_data, weight_data_tm, opt);
        }
        else
        {
            // elempack is 8 or 4 here: view the weights as maxk x group,
            // interleave elempack groups together, then convert to fp16
            Mat weight_data_r2 = weight_data.reshape(maxk, group);
            Mat weight_data_r2_packed;
            convert_packing(weight_data_r2, weight_data_r2_packed, elempack, opt);

            ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt);
        }

        ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

        if (opt.lightmode)
            weight_data.release();

        return 0;
    }

    // group convolution
    // propagate a failure instead of reporting success with unusable group ops
    int ret = create_group_ops(opt);
    if (ret != 0)
        return ret;

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

// Forward pass for the fp16-storage path: input pixels and weights are read
// as __fp16, accumulated in fp32, and the result is written back as __fp16.
//
// bottom_blob  input blob; its elempack (4 or 1) selects the depth-wise branch
// top_blob     output blob, created here with out_elempack 4 or 1
// opt          runtime options (packing layout, allocators, thread count)
//
// Returns 0 on success, -100 when a blob is empty after creation/conversion,
// or the non-zero code returned by a per-group op.
int ConvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // spatial extent covered by the dilated kernel
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    // from here on w/h describe the padded input
    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
    // per-scalar size of the input times the output packing
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
        if (elempack == 4)
        {
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                // space_ofs[k] is the pixel offset of kernel tap k relative to the
                // top-left tap inside the padded input; gap moves to the next kernel row
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                // each iteration handles one packed channel (4 lanes)
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * 4;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float32x4_t _sum = vdupq_n_f32(0.f);

                            if (bias_term)
                            {
                                // bias is read as fp32 from bias_data in this path
                                _sum = vld1q_f32(((const float*)bias_data) + g * 4);
                            }

                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 4;

                            for (int k = 0; k < maxk; k++)
                            {
                                // widen fp16 inputs to fp32 before the multiply-accumulate
                                float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr + space_ofs[k] * 4));
                                float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr + k * 4));
                                _sum = vfmaq_f32(_sum, _val, _w);
                            }

                            _sum = activation_ps(_sum, activation_type, activation_params);

                            // narrow back to fp16 on store
                            vst1_f16(outptr + j * 4, vcvt_f16_f32(_sum));
                        }

                        outptr += outw * 4;
                    }
                }
            }
        }

        if (elempack == 1)
        {
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                // same offset table as in the pack4 branch above
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                // scalar fallback: one channel per iteration
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < group; g++)
                {
                    __fp16* outptr = top_blob.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float sum = 0.f;

                            if (bias_term)
                                sum = bias_data[g];

                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                float val = (float)sptr[space_ofs[k]];
                                float w = (float)kptr[k];
                                sum += val * w;
                            }

                            sum = activation_ss(sum, activation_type, activation_params);

                            outptr[j] = (__fp16)sum;
                        }

                        outptr += outw;
                    }
                }
            }
        }

        return 0;
    }

    // group convolution
    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;

    // packing usable inside a single group, for input and output respectively
    int g_elempack = (opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
    int out_g_elempack = (opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;

    // unpacking
    // the input is repacked only when its packing is wider than a group allows
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (elempack > g_elempack)
    {
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
        if (bottom_blob_bordered_unpacked.empty())
            return -100;
    }

    // per-group results go to a temporary blob when the group packing is
    // narrower than the final output packing; otherwise straight into top_blob
    Mat top_blob_unpacked = top_blob;
    if (out_g_elempack < out_elempack)
    {
        top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
        if (top_blob_unpacked.empty())
            return -100;
    }

    for (int g = 0; g < group; g++)
    {
        // channel slices belonging to group g
        const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
        Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

        const ncnn::Layer* op = group_ops[g];

        Option opt_g = opt;
        opt_g.blob_allocator = top_blob_unpacked.allocator;

        // forward
        int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
        if (ret != 0)
            return ret;
    }

    // packing
    if (out_g_elempack < out_elempack)
    {
        convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
        if (top_blob.empty())
            return -100;
    }
    else
    {
        top_blob = top_blob_unpacked;
    }

    return 0;
}

// Forward pass for the fp16 storage + arithmetic path.
//
// The packed depth-wise branches (elempack 8 and 4) accumulate directly in
// fp16 vectors and read the bias from bias_data_fp16. The generic elempack 1
// fallback keeps a float accumulator and reads the bias from bias_data.
// When NCNN_GNU_INLINE_ASM is enabled, specialized kernels handle 3x3 s1/s2
// (pack8 and pack1) and 5x5 s1/s2 (pack8) with dilation 1; the optional
// `activation` layer is applied in place after them.
//
// bottom_blob  input blob; its elempack (8, 4 or 1) selects the branch
// top_blob     output blob, created here with out_elempack 8, 4 or 1
// opt          runtime options (packing layout, fp16 arithmetic, allocators, threads)
//
// Returns 0 on success, -100 when a blob is empty after creation/conversion,
// or the non-zero code returned by a per-group op.
int ConvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // spatial extent covered by the dilated kernel
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    // from here on w/h describe the padded input
    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        // pack8 is only chosen when fp16 arithmetic is enabled
        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    }
    // per-scalar size of the input times the output packing
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
        if (elempack == 8)
        {
#if NCNN_GNU_INLINE_ASM
            // specialized pack8 kernels; activation is applied afterwards as a separate in-place pass
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw3x3s1_pack8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw3x3s2_pack8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw5x5s1_pack8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw5x5s2_pack8_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else
#endif // NCNN_GNU_INLINE_ASM
            {
                // generic pack8 kernel for any kernel size / stride / dilation
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                // space_ofs[k] is the pixel offset of kernel tap k relative to the
                // top-left tap inside the padded input; gap moves to the next kernel row
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                // each iteration handles one packed channel (8 lanes)
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * 8;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                            if (bias_term)
                            {
                                _sum = vld1q_f16(((const __fp16*)bias_data_fp16) + g * 8);
                            }

                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 8;

                            for (int k = 0; k < maxk; k++)
                            {
                                // multiply-accumulate directly in fp16
                                float16x8_t _val = vld1q_f16(sptr + space_ofs[k] * 8);
                                float16x8_t _w = vld1q_f16(kptr + k * 8);
                                _sum = vfmaq_f16(_sum, _val, _w);
                            }

                            _sum = activation_ps_f16(_sum, activation_type, activation_params);

                            vst1q_f16(outptr + j * 8, _sum);
                        }

                        outptr += outw * 8;
                    }
                }
            }
        }

        if (elempack == 4)
        {
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                // same offset table as in the pack8 branch above
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                // each iteration handles one packed channel (4 lanes)
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * 4;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                            if (bias_term)
                            {
                                _sum = vld1_f16(((const __fp16*)bias_data_fp16) + g * 4);
                            }

                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 4;

                            for (int k = 0; k < maxk; k++)
                            {
                                float16x4_t _val = vld1_f16(sptr + space_ofs[k] * 4);
                                float16x4_t _w = vld1_f16(kptr + k * 4);
                                _sum = vfma_f16(_sum, _val, _w);
                            }

                            _sum = activation_ps_f16(_sum, activation_type, activation_params);

                            vst1_f16(outptr + j * 4, _sum);
                        }

                        outptr += outw * 4;
                    }
                }
            }
        }

        if (elempack == 1)
        {
#if NCNN_GNU_INLINE_ASM
            // specialized unpacked 3x3 kernels; activation is applied afterwards in place
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw3x3s1_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw3x3s2_fp16sa_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data_fp16, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else
#endif // NCNN_GNU_INLINE_ASM
            {
                // generic scalar fallback
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                // same offset table as in the packed branches above
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < group; g++)
                {
                    __fp16* outptr = top_blob.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            // accumulator is kept in float; bias comes from the fp32 bias_data
                            float sum = 0.f;

                            if (bias_term)
                                sum = bias_data[g];

                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                __fp16 val = sptr[space_ofs[k]];
                                __fp16 w = kptr[k];
                                sum += val * w;
                            }

                            sum = activation_ss_f16(sum, activation_type, activation_params);

                            outptr[j] = (__fp16)sum;
                        }

                        outptr += outw;
                    }
                }
            }
        }

        return 0;
    }

    // group convolution
    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;

    // packing usable inside a single group, for input and output respectively
    int g_elempack = 1;
    int out_g_elempack = 1;
    if (opt.use_packing_layout)
    {
        g_elempack = opt.use_fp16_arithmetic && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
        out_g_elempack = opt.use_fp16_arithmetic && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
    }

    // unpacking
    // the input is repacked only when its packing is wider than a group allows
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (elempack > g_elempack)
    {
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
        if (bottom_blob_bordered_unpacked.empty())
            return -100;
    }

    // per-group results go to a temporary blob when the group packing is
    // narrower than the final output packing; otherwise straight into top_blob
    Mat top_blob_unpacked = top_blob;
    if (out_g_elempack < out_elempack)
    {
        top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
        if (top_blob_unpacked.empty())
            return -100;
    }

    for (int g = 0; g < group; g++)
    {
        // channel slices belonging to group g
        const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
        Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

        const ncnn::Layer* op = group_ops[g];

        Option opt_g = opt;
        opt_g.blob_allocator = top_blob_unpacked.allocator;

        // forward
        int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
        if (ret != 0)
            return ret;
    }

    // packing
    if (out_g_elempack < out_elempack)
    {
        convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
        if (top_blob.empty())
            return -100;
    }
    else
    {
        top_blob = top_blob_unpacked;
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/crop_arm.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "crop_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "cpu.h"

namespace ncnn {

// Sets the capability flags of the ARM crop layer according to the build
// configuration.
Crop_arm::Crop_arm()
{
#if __ARM_NEON
    // packed layouts are only enabled on NEON builds
    support_packing = true;
#if NCNN_ARM82
    // taken from cpu_support_arm_asimdhp()
    // NOTE(review): presumably a runtime CPU feature query - confirm in cpu.h
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif
#endif // __ARM_NEON

#if NCNN_BF16
    support_bf16_storage = true;
#endif
}

#if __ARM_NEON
static void crop_pack8_neon(const Mat& src, Mat& dst, int top, int left)
{
    int w = dst.w;
    int h = dst.h;
    int right = src.w - dst.w - left;

    const float* ptr = src.row(top) + left * 8;
    float* outptr = dst;

    for (int y = 0; y < h; y++)
    {
        for (int x = 0; x < w; x++)
        {
            float32x4_t _p0 = vld1q_f32(ptr);
            float32x4_t _p1 = vld1q_f32(ptr + 4);
            vst1q_f32(outptr, _p0);
            vst1q_f32(outptr + 4, _p1);
            ptr += 8;
            outptr += 8;
        }

        ptr += (left + right) * 8;
    }
}

static void crop_pack8_bf16_fp16s_neon(const Mat& src, Mat& dst, int top, int left)
{
    int w = dst.w;
    int h = dst.h;
    int right = src.w - dst.w - left;

    const unsigned short* ptr = src.row<unsigned short>(top) + left * 8;
    unsigned short* outptr = dst;

    for (int y = 0; y < h; y++)
    {
        for (int x = 0; x < w; x++)
        {
            uint16x8_t _p = vld1q_u16(ptr);
            vst1q_u16(outptr, _p);
            ptr += 8;
            outptr += 8;
        }

        ptr += (left + right) * 8;
    }
}

static void crop_pack4_neon(const Mat& src, Mat& dst, int top, int left)
{
    int w = dst.w;
    int h = dst.h;
    int right = src.w - dst.w - left;

    const float* ptr = src.row(top) + left * 4;
    float* outptr = dst;

    for (int y = 0; y < h; y++)
    {
        for (int x = 0; x < w; x++)
        {
            float32x4_t _p = vld1q_f32(ptr);
            vst1q_f32(outptr, _p);
            ptr += 4;
            outptr += 4;
        }

        ptr += (left + right) * 4;
    }
}

static void crop_pack4_bf16_fp16s_neon(const Mat& src, Mat& dst, int top, int left)
{
    int w = dst.w;
    int h = dst.h;
    int right = src.w - dst.w - left;

    const unsigned short* ptr = src.row<unsigned short>(top) + left * 4;
    unsigned short* outptr = dst;

    for (int y = 0; y < h; y++)
    {
        for (int x = 0; x < w; x++)
        {
            uint16x4_t _p = vld1_u16(ptr);
            vst1_u16(outptr, _p);
            ptr += 4;
            outptr += 4;
        }

        ptr += (left + right) * 4;
    }
}
#endif // __ARM_NEON

int Crop_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

#if __ARM_NEON
    int _woffset, _hoffset, _doffset, _coffset;
    int _outw, _outh, _outd, _outc;
    if (!starts_expr.empty() && !ends_expr.empty())
    {
        std::vector<Mat> bottom_blob_shapes(1);
        bottom_blob_shapes[0] = bottom_blob.shape();
        eval_crop_expr(bottom_blob_shapes, _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
    }
    else
    {
        resolve_crop_roi(bottom_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
    }

    if (elempack == 8)
    {
        if (dims == 1)
        {
            int out_elempack = _outw % 8 == 0 ? 8 : _outw % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw / out_elempack == w && out_elempack == 8)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_woffset % 8 == 0 && out_elempack == 8)
            {
                top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                if (elemsize == 16u)
                    crop_pack8_bf16_fp16s_neon(bottom_blob, top_blob, 0, _woffset / elempack);
                else
                    crop_pack8_neon(bottom_blob, top_blob, 0, _woffset / elempack);

                return 0;
            }
        }

        if (dims == 2)
        {
            int out_elempack = _outh % 8 == 0 ? 8 : _outh % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh / out_elempack == h && out_elempack == 8)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_hoffset % 8 == 0 && out_elempack == 8)
            {
                top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                if (elemsize == 16u)
                    crop_pack8_bf16_fp16s_neon(bottom_blob, top_blob, _hoffset / elempack, _woffset);
                else
                    crop_pack8_neon(bottom_blob, top_blob, _hoffset / elempack, _woffset);

                return 0;
            }
        }

        if (dims == 3)
        {
            int out_elempack = _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 8)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_coffset % 8 == 0 && out_elempack == 8)
            {
                const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);

                if (_outw == w && _outh == h)
                {
                    top_blob = bottom_blob_sliced.clone(opt.blob_allocator);
                    if (top_blob.empty())
                        return -100;
                }

                top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < top_blob.c; q++)
                {
                    const Mat m = bottom_blob_sliced.channel(q);
                    Mat borderm = top_blob.channel(q);

                    if (elemsize == 16u)
                        crop_pack8_bf16_fp16s_neon(m, borderm, _hoffset, _woffset);
                    else
                        crop_pack8_neon(m, borderm, _hoffset, _woffset);
                }

                return 0;
            }
        }

        if (dims == 4)
        {
            int out_elempack = _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 8)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_coffset % 8 == 0 && out_elempack == 8)
            {
                const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);

                if (_outw == w && _outh == h && _outd == d)
                {
                    top_blob = bottom_blob_sliced.clone(opt.blob_allocator);
                    if (top_blob.empty())
                        return -100;
                }

                top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < top_blob.c; q++)
                {
                    for (int z = 0; z < _outd; z++)
                    {
                        const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset);
                        Mat borderm = top_blob.channel(q).depth(z);

                        if (elemsize == 16u)
                            crop_pack8_bf16_fp16s_neon(m, borderm, _hoffset, _woffset);
                        else
                            crop_pack8_neon(m, borderm, _hoffset, _woffset);
                    }
                }

                return 0;
            }
        }
    }

    if (elempack == 4)
    {
        if (dims == 1)
        {
            int out_elempack = _outw % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw / out_elempack == w && out_elempack == 4)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_woffset % 4 == 0 && out_elempack == 4)
            {
                top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                if (elemsize == 8u)
                    crop_pack4_bf16_fp16s_neon(bottom_blob, top_blob, 0, _woffset / elempack);
                else
                    crop_pack4_neon(bottom_blob, top_blob, 0, _woffset / elempack);

                return 0;
            }
        }

        if (dims == 2)
        {
            int out_elempack = _outh % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh / out_elempack == h && out_elempack == 4)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_hoffset % 4 == 0 && out_elempack == 4)
            {
                top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                if (elemsize == 8u)
                    crop_pack4_bf16_fp16s_neon(bottom_blob, top_blob, _hoffset / elempack, _woffset);
                else
                    crop_pack4_neon(bottom_blob, top_blob, _hoffset / elempack, _woffset);

                return 0;
            }
        }

        if (dims == 3)
        {
            int out_elempack = _outc % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 4)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_coffset % 4 == 0 && out_elempack == 4)
            {
                const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);

                if (_outw == w && _outh == h)
                {
                    top_blob = bottom_blob_sliced.clone(opt.blob_allocator);
                    if (top_blob.empty())
                        return -100;
                }

                top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < top_blob.c; q++)
                {
                    const Mat m = bottom_blob_sliced.channel(q);
                    Mat borderm = top_blob.channel(q);

                    if (elemsize == 8u)
                        crop_pack4_bf16_fp16s_neon(m, borderm, _hoffset, _woffset);
                    else
                        crop_pack4_neon(m, borderm, _hoffset, _woffset);
                }

                return 0;
            }
        }

        if (dims == 4)
        {
            int out_elempack = _outc % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 4)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_coffset % 4 == 0 && out_elempack == 4)
            {
                const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);

                if (_outw == w && _outh == h && _outd == d)
                {
                    top_blob = bottom_blob_sliced.clone(opt.blob_allocator);
                    if (top_blob.empty())
                        return -100;
                }

                top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < top_blob.c; q++)
                {
                    for (int z = 0; z < _outd; z++)
                    {
                        const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset);
                        Mat borderm = top_blob.channel(q).depth(z);

                        if (elemsize == 8u)
                            crop_pack4_bf16_fp16s_neon(m, borderm, _hoffset, _woffset);
                        else
                            crop_pack4_neon(m, borderm, _hoffset, _woffset);
                    }
                }

                return 0;
            }
        }
    }
#endif // __ARM_NEON

    Mat bottom_blob_unpacked = bottom_blob;
    if (elempack != 1)
    {
        Option opt_pack1 = opt;
        opt_pack1.blob_allocator = opt.workspace_allocator;

        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
        if (bottom_blob_unpacked.empty())
            return -100;
    }

    return Crop::forward(bottom_blob_unpacked, top_blob, opt);
}

int Crop_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& reference_blob = bottom_blobs[1];

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int ref_elempack = reference_blob.elempack;

    Mat& top_blob = top_blobs[0];

#if __ARM_NEON
    int _woffset, _hoffset, _doffset, _coffset;
    int _outw, _outh, _outd, _outc;
    if (!starts_expr.empty() && !ends_expr.empty())
    {
        std::vector<Mat> bottom_blob_shapes(bottom_blobs.size());
        for (size_t i = 0; i < bottom_blobs.size(); i++)
        {
            bottom_blob_shapes[i] = bottom_blobs[i].shape();
        }
        eval_crop_expr(bottom_blob_shapes, _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
    }
    else if (woffset == -233)
    {
        resolve_crop_roi(bottom_blob.shape(), (const int*)reference_blob, _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
    }
    else
    {
        resolve_crop_roi(bottom_blob.shape(), reference_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
    }

    if (elempack == 8)
    {
        if (dims == 1)
        {
            int out_elempack = _outw % 8 == 0 ? 8 : _outw % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw / out_elempack == w && out_elempack == 8)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_woffset % 8 == 0 && out_elempack == 8)
            {
                top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                if (elemsize == 16u)
                    crop_pack8_bf16_fp16s_neon(bottom_blob, top_blob, 0, _woffset / elempack);
                else
                    crop_pack8_neon(bottom_blob, top_blob, 0, _woffset / elempack);

                return 0;
            }
        }

        if (dims == 2)
        {
            int out_elempack = _outh % 8 == 0 ? 8 : _outh % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh / out_elempack == h && out_elempack == 8)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_hoffset % 8 == 0 && out_elempack == 8)
            {
                top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                if (elemsize == 16u)
                    crop_pack8_bf16_fp16s_neon(bottom_blob, top_blob, _hoffset / elempack, _woffset);
                else
                    crop_pack8_neon(bottom_blob, top_blob, _hoffset / elempack, _woffset);

                return 0;
            }
        }

        if (dims == 3)
        {
            int out_elempack = _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 8)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_coffset % 8 == 0 && out_elempack == 8)
            {
                const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);

                if (_outw == w && _outh == h)
                {
                    top_blob = bottom_blob_sliced.clone(opt.blob_allocator);
                    if (top_blob.empty())
                        return -100;
                }

                top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < top_blob.c; q++)
                {
                    const Mat m = bottom_blob_sliced.channel(q);
                    Mat borderm = top_blob.channel(q);

                    if (elemsize == 16u)
                        crop_pack8_bf16_fp16s_neon(m, borderm, _hoffset, _woffset);
                    else
                        crop_pack8_neon(m, borderm, _hoffset, _woffset);
                }

                return 0;
            }
        }

        if (dims == 4)
        {
            int out_elempack = _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 8)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_coffset % 8 == 0 && out_elempack == 8)
            {
                const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);

                if (_outw == w && _outh == h && _outd == d)
                {
                    top_blob = bottom_blob_sliced.clone(opt.blob_allocator);
                    if (top_blob.empty())
                        return -100;
                }

                top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < top_blob.c; q++)
                {
                    for (int z = 0; z < _outd; z++)
                    {
                        const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset);
                        Mat borderm = top_blob.channel(q).depth(z);

                        if (elemsize == 16u)
                            crop_pack8_bf16_fp16s_neon(m, borderm, _hoffset, _woffset);
                        else
                            crop_pack8_neon(m, borderm, _hoffset, _woffset);
                    }
                }

                return 0;
            }
        }
    }

    if (elempack == 4)
    {
        if (dims == 1)
        {
            int out_elempack = _outw % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw / out_elempack == w && out_elempack == 4)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_woffset % 4 == 0 && out_elempack == 4)
            {
                top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                if (elemsize == 8u)
                    crop_pack4_bf16_fp16s_neon(bottom_blob, top_blob, 0, _woffset / elempack);
                else
                    crop_pack4_neon(bottom_blob, top_blob, 0, _woffset / elempack);

                return 0;
            }
        }

        if (dims == 2)
        {
            int out_elempack = _outh % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh / out_elempack == h && out_elempack == 4)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_hoffset % 4 == 0 && out_elempack == 4)
            {
                top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                if (elemsize == 8u)
                    crop_pack4_bf16_fp16s_neon(bottom_blob, top_blob, _hoffset / elempack, _woffset);
                else
                    crop_pack4_neon(bottom_blob, top_blob, _hoffset / elempack, _woffset);

                return 0;
            }
        }

        if (dims == 3)
        {
            int out_elempack = _outc % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 4)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_coffset % 4 == 0 && out_elempack == 4)
            {
                const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);

                if (_outw == w && _outh == h)
                {
                    top_blob = bottom_blob_sliced.clone(opt.blob_allocator);
                    if (top_blob.empty())
                        return -100;
                }

                top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < top_blob.c; q++)
                {
                    const Mat m = bottom_blob_sliced.channel(q);
                    Mat borderm = top_blob.channel(q);

                    if (elemsize == 8u)
                        crop_pack4_bf16_fp16s_neon(m, borderm, _hoffset, _woffset);
                    else
                        crop_pack4_neon(m, borderm, _hoffset, _woffset);
                }

                return 0;
            }
        }

        if (dims == 4)
        {
            int out_elempack = _outc % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 4)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_coffset % 4 == 0 && out_elempack == 4)
            {
                const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);

                if (_outw == w && _outh == h && _outd == d)
                {
                    top_blob = bottom_blob_sliced.clone(opt.blob_allocator);
                    if (top_blob.empty())
                        return -100;
                }

                top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < top_blob.c; q++)
                {
                    for (int z = 0; z < _outd; z++)
                    {
                        const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset);
                        Mat borderm = top_blob.channel(q).depth(z);

                        if (elemsize == 8u)
                            crop_pack4_bf16_fp16s_neon(m, borderm, _hoffset, _woffset);
                        else
                            crop_pack4_neon(m, borderm, _hoffset, _woffset);
                    }
                }

                return 0;
            }
        }
    }
#endif // __ARM_NEON

    std::vector<Mat> bottom_blobs_unpacked(bottom_blobs.size());
    for (size_t i = 0; i < bottom_blobs.size(); i++)
    {
        Mat bottom_blob_unpacked = bottom_blobs[i];
        if (elempack != 1)
        {
            Option opt_pack1 = opt;
            opt_pack1.blob_allocator = opt.workspace_allocator;

            convert_packing(bottom_blobs[i], bottom_blob_unpacked, 1, opt_pack1);
            if (bottom_blob_unpacked.empty())
                return -100;
        }

        bottom_blobs_unpacked[i] = bottom_blob_unpacked;
    }

    return Crop::forward(bottom_blobs_unpacked, top_blobs, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/arm/crop_arm.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CROP_ARM_H
#define LAYER_CROP_ARM_H

#include "crop.h"

namespace ncnn {

// ARM specialization of the Crop layer.
// Overrides both forward() entry points of Crop; the definitions are in crop_arm.cpp.
class Crop_arm : public Crop
{
public:
    // Sets the packing / fp16 / bf16 storage capability flags according to the build
    // configuration (__ARM_NEON, NCNN_ARM82, NCNN_BF16).
    Crop_arm();

    // Crop a single blob. Packed (elempack 8 or 4) input is cropped in its packed layout
    // when the roi is aligned to the elempack; otherwise the input is unpacked and
    // Crop::forward is used. Returns 0 on success, -100 on allocation failure.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Crop bottom_blobs[0] into top_blobs[0], resolving the roi from the crop expressions
    // or from bottom_blobs[1]. Same packed fast path / unpacked fallback as above.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_CROP_ARM_H


================================================
FILE: src/layer/arm/deconvolution_3x3.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#if __ARM_NEON
// Accumulate four input pixels times one 3-tap kernel row into one output row.
// Tap t is added to the 4-wide window starting at outptr + t. The three windows overlap,
// so every window is stored before the next one is loaded.
static inline void deconv3x3s1_row_update_neon(float* outptr, float32x4_t _v, float32x4_t _k)
{
    const float32x2_t _k01 = vget_low_f32(_k);
    const float32x2_t _k2x = vget_high_f32(_k); // only lane 0 is a kernel tap

    float32x4_t _acc = vld1q_f32(outptr);
    _acc = vmlaq_lane_f32(_acc, _v, _k01, 0);
    vst1q_f32(outptr, _acc);

    _acc = vld1q_f32(outptr + 1);
    _acc = vmlaq_lane_f32(_acc, _v, _k01, 1);
    vst1q_f32(outptr + 1, _acc);

    _acc = vld1q_f32(outptr + 2);
    _acc = vmlaq_lane_f32(_acc, _v, _k2x, 0);
    vst1q_f32(outptr + 2, _acc);
}
#endif // __ARM_NEON

// 3x3 stride-1 deconvolution.
// Every output channel is first filled with its bias (0 when _bias is empty); then each
// input pixel (i, j) of each input channel adds val * k[r][t] to out(i + r, j + t).
// top_blob is written in place and is assumed to be allocated large enough for that
// scatter - nothing is checked here.
// _kernel is indexed as [outch][inch][9].
static void deconv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outch = top_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;
        out.fill(bias0);

        for (int q = 0; q < inch; q++)
        {
            // input pixels are consumed sequentially across all rows of this channel
            const float* r0 = bottom_blob.channel(q);

            // the three kernel rows for this (p, q) pair
            const float* k0 = kernel + p * inch * 9 + q * 9;
            const float* k1 = k0 + 3;
            const float* k2 = k0 + 6;

#if __ARM_NEON
            // NOTE(review): each load fetches 4 floats from a 3-float row, so the load at k2
            // reads one float past this 3x3 kernel; the extra lane is never used
            const float32x4_t _k0 = vld1q_f32(k0);
            const float32x4_t _k1 = vld1q_f32(k1);
            const float32x4_t _k2 = vld1q_f32(k2);
#endif // __ARM_NEON

            for (int i = 0; i < h; i++)
            {
                // input row i touches output rows i, i + 1 and i + 2
                float* outptr0 = out.row(i);
                float* outptr1 = outptr0 + outw;
                float* outptr2 = outptr1 + outw;

                int j = 0;

#if __ARM_NEON
                for (; j + 3 < w; j += 4)
                {
                    const float32x4_t _v = vld1q_f32(r0);

                    deconv3x3s1_row_update_neon(outptr0, _v, _k0);
                    deconv3x3s1_row_update_neon(outptr1, _v, _k1);
                    deconv3x3s1_row_update_neon(outptr2, _v, _k2);

                    r0 += 4;
                    outptr0 += 4;
                    outptr1 += 4;
                    outptr2 += 4;
                }
#endif // __ARM_NEON

                // scalar path for the remaining pixels of the row
                for (; j < w; j++)
                {
                    const float val = r0[0];

                    for (int t = 0; t < 3; t++)
                        outptr0[t] += val * k0[t];

                    for (int t = 0; t < 3; t++)
                        outptr1[t] += val * k1[t];

                    for (int t = 0; t < 3; t++)
                        outptr2[t] += val * k2[t];

                    r0++;
                    outptr0++;
                    outptr1++;
                    outptr2++;
                }
            }
        }
    }
}

#if __ARM_NEON
// Accumulate four input pixels times one 3-tap kernel row into one stride-2 output row.
// With stride 2 the four pixels land on every other output column, so the row is
// accessed de-interleaved: taps 0 and 1 hit columns 0,2,4,6 / 1,3,5,7 of the window at
// outptr, tap 2 hits columns 2,4,6,8, i.e. the even lanes of the window at outptr + 2.
static inline void deconv3x3s2_row_update_neon(float* outptr, float32x4_t _v, float32x4_t _k)
{
    const float32x2_t _k01 = vget_low_f32(_k);
    const float32x2_t _k2x = vget_high_f32(_k); // only lane 0 is a kernel tap

    const float32x4_t _tap0 = vmulq_lane_f32(_v, _k01, 0); // 0,2,4,6
    const float32x4_t _tap1 = vmulq_lane_f32(_v, _k01, 1); // 1,3,5,7
    const float32x4_t _tap2 = vmulq_lane_f32(_v, _k2x, 0); // 2,4,6,8

    float32x4x2_t _o = vld2q_f32(outptr);
    _o.val[0] = vaddq_f32(_o.val[0], _tap0);
    _o.val[1] = vaddq_f32(_o.val[1], _tap1);
    vst2q_f32(outptr, _o);

    // overlaps the window above, so it is reloaded only after the store
    _o = vld2q_f32(outptr + 2);
    _o.val[0] = vaddq_f32(_o.val[0], _tap2);
    vst2q_f32(outptr + 2, _o);
}
#endif // __ARM_NEON

// 3x3 stride-2 deconvolution.
// Every output channel is first filled with its bias (0 when _bias is empty); then each
// input pixel (i, j) of each input channel adds val * k[r][t] to out(2 * i + r, 2 * j + t).
// top_blob is written in place and is assumed to be allocated large enough for that
// scatter - nothing is checked here.
// _kernel is indexed as [outch][inch][9].
static void deconv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outch = top_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;
        out.fill(bias0);

        for (int q = 0; q < inch; q++)
        {
            // input pixels are consumed sequentially across all rows of this channel
            const float* r0 = bottom_blob.channel(q);

            // the three kernel rows for this (p, q) pair
            const float* k0 = kernel + p * inch * 9 + q * 9;
            const float* k1 = k0 + 3;
            const float* k2 = k0 + 6;

#if __ARM_NEON
            // NOTE(review): each load fetches 4 floats from a 3-float row, so the load at k2
            // reads one float past this 3x3 kernel; the extra lane is never used
            const float32x4_t _k0 = vld1q_f32(k0);
            const float32x4_t _k1 = vld1q_f32(k1);
            const float32x4_t _k2 = vld1q_f32(k2);
#endif // __ARM_NEON

            for (int i = 0; i < h; i++)
            {
                // input row i touches output rows 2i, 2i + 1 and 2i + 2
                float* outptr0 = out.row(i * 2);
                float* outptr1 = outptr0 + outw;
                float* outptr2 = outptr1 + outw;

                int j = 0;
#if __ARM_NEON
                for (; j + 3 < w; j += 4)
                {
                    const float32x4_t _v = vld1q_f32(r0);

                    deconv3x3s2_row_update_neon(outptr0, _v, _k0);
                    deconv3x3s2_row_update_neon(outptr1, _v, _k1);
                    deconv3x3s2_row_update_neon(outptr2, _v, _k2);

                    // four input pixels advance the output by eight columns
                    r0 += 4;
                    outptr0 += 8;
                    outptr1 += 8;
                    outptr2 += 8;
                }
#endif // __ARM_NEON

                // scalar path for the remaining pixels of the row
                for (; j < w; j++)
                {
                    const float val = r0[0];

                    for (int t = 0; t < 3; t++)
                        outptr0[t] += val * k0[t];

                    for (int t = 0; t < 3; t++)
                        outptr1[t] += val * k1[t];

                    for (int t = 0; t < 3; t++)
                        outptr2[t] += val * k2[t];

                    r0++;
                    outptr0 += 2;
                    outptr1 += 2;
                    outptr2 += 2;
                }
            }
        }
    }
}


================================================
FILE: src/layer/arm/deconvolution_4x4.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 4x4 transposed convolution (deconvolution), stride 1, fp32.
//
// For every output channel p the output plane is first filled with the bias
// (or 0 when the bias pointer is null) and then, for every input channel q,
// each input pixel (i, j) is multiplied by the 4x4 kernel and accumulated into
// the 4x4 output window whose top-left corner is (i, j).
//
// bottom_blob  input, read as w x h x inch floats
// top_blob     output, written in place; it is not allocated here
//              NOTE(review): the code touches rows i..i+3 and columns j..j+3,
//              so top_blob presumably must be at least (w+3) x (h+3) - confirm
//              against the caller
// _kernel      weights, indexed as outch x inch x 16 floats (4 rows of 4)
// _bias        optional per-output-channel bias
// opt          only opt.num_threads is used (OpenMP thread count)
static void deconv4x4s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outch = top_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    // output channels are independent, so they are distributed over threads
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        // start from the bias; every input channel below accumulates on top
        out.fill(bias0);

        for (int q = 0; q < inch; q++)
        {
            const float* img0 = bottom_blob.channel(q);

            // 16 weights for the (p, q) output/input channel pair
            const float* kernel0 = kernel + p * inch * 16 + q * 16;

            // r0 walks the whole input channel; it is never reset per row
            // NOTE(review): this assumes input rows are stored back to back - confirm
            const float* r0 = img0;

            // one pointer per kernel row
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 4;
            const float* k2 = kernel0 + 8;
            const float* k3 = kernel0 + 12;

#if __ARM_NEON
            // each vector holds the 4 weights of one kernel row
            float32x4_t _k0 = vld1q_f32(k0);
            float32x4_t _k1 = vld1q_f32(k1);
            float32x4_t _k2 = vld1q_f32(k2);
            float32x4_t _k3 = vld1q_f32(k3);
#endif // __ARM_NEON

            for (int i = 0; i < h; i++)
            {
                // input row i contributes to output rows i .. i+3
                float* outptr = out.row(i);

                // following rows are reached by adding outw
                // NOTE(review): assumes output rows are contiguous with a stride of outw floats - confirm
                float* outptr0 = outptr;
                float* outptr1 = outptr0 + outw;
                float* outptr2 = outptr1 + outw;
                float* outptr3 = outptr2 + outw;

                int j = 0;

#if __ARM_NEON
                // 4 input pixels per iteration
                for (; j + 3 < w; j += 4)
                {
                    float32x4_t _v = vld1q_f32(r0);

                    // The four load/accumulate/store steps of a row work on
                    // windows shifted by one float, so they overlap: every
                    // store must happen before the next load. Do not reorder.

                    // output row 0, kernel row k0, kernel columns 0..3
                    float32x4_t _out00 = vld1q_f32(outptr0 + 0);
                    _out00 = vmlaq_lane_f32(_out00, _v, vget_low_f32(_k0), 0);
                    vst1q_f32(outptr0 + 0, _out00);

                    float32x4_t _out01 = vld1q_f32(outptr0 + 1);
                    _out01 = vmlaq_lane_f32(_out01, _v, vget_low_f32(_k0), 1);
                    vst1q_f32(outptr0 + 1, _out01);

                    float32x4_t _out02 = vld1q_f32(outptr0 + 2);
                    _out02 = vmlaq_lane_f32(_out02, _v, vget_high_f32(_k0), 0);
                    vst1q_f32(outptr0 + 2, _out02);

                    float32x4_t _out03 = vld1q_f32(outptr0 + 3);
                    _out03 = vmlaq_lane_f32(_out03, _v, vget_high_f32(_k0), 1);
                    vst1q_f32(outptr0 + 3, _out03);

                    // output row 1, kernel row k1, kernel columns 0..3
                    float32x4_t _out10 = vld1q_f32(outptr1 + 0);
                    _out10 = vmlaq_lane_f32(_out10, _v, vget_low_f32(_k1), 0);
                    vst1q_f32(outptr1 + 0, _out10);

                    float32x4_t _out11 = vld1q_f32(outptr1 + 1);
                    _out11 = vmlaq_lane_f32(_out11, _v, vget_low_f32(_k1), 1);
                    vst1q_f32(outptr1 + 1, _out11);

                    float32x4_t _out12 = vld1q_f32(outptr1 + 2);
                    _out12 = vmlaq_lane_f32(_out12, _v, vget_high_f32(_k1), 0);
                    vst1q_f32(outptr1 + 2, _out12);

                    float32x4_t _out13 = vld1q_f32(outptr1 + 3);
                    _out13 = vmlaq_lane_f32(_out13, _v, vget_high_f32(_k1), 1);
                    vst1q_f32(outptr1 + 3, _out13);

                    // output row 2, kernel row k2, kernel columns 0..3
                    float32x4_t _out20 = vld1q_f32(outptr2 + 0);
                    _out20 = vmlaq_lane_f32(_out20, _v, vget_low_f32(_k2), 0);
                    vst1q_f32(outptr2 + 0, _out20);

                    float32x4_t _out21 = vld1q_f32(outptr2 + 1);
                    _out21 = vmlaq_lane_f32(_out21, _v, vget_low_f32(_k2), 1);
                    vst1q_f32(outptr2 + 1, _out21);

                    float32x4_t _out22 = vld1q_f32(outptr2 + 2);
                    _out22 = vmlaq_lane_f32(_out22, _v, vget_high_f32(_k2), 0);
                    vst1q_f32(outptr2 + 2, _out22);

                    float32x4_t _out23 = vld1q_f32(outptr2 + 3);
                    _out23 = vmlaq_lane_f32(_out23, _v, vget_high_f32(_k2), 1);
                    vst1q_f32(outptr2 + 3, _out23);

                    // output row 3, kernel row k3, kernel columns 0..3
                    float32x4_t _out30 = vld1q_f32(outptr3 + 0);
                    _out30 = vmlaq_lane_f32(_out30, _v, vget_low_f32(_k3), 0);
                    vst1q_f32(outptr3 + 0, _out30);

                    float32x4_t _out31 = vld1q_f32(outptr3 + 1);
                    _out31 = vmlaq_lane_f32(_out31, _v, vget_low_f32(_k3), 1);
                    vst1q_f32(outptr3 + 1, _out31);

                    float32x4_t _out32 = vld1q_f32(outptr3 + 2);
                    _out32 = vmlaq_lane_f32(_out32, _v, vget_high_f32(_k3), 0);
                    vst1q_f32(outptr3 + 2, _out32);

                    float32x4_t _out33 = vld1q_f32(outptr3 + 3);
                    _out33 = vmlaq_lane_f32(_out33, _v, vget_high_f32(_k3), 1);
                    vst1q_f32(outptr3 + 3, _out33);

                    // stride 1: 4 input pixels advance the output by 4 columns
                    r0 += 4;
                    outptr0 += 4;
                    outptr1 += 4;
                    outptr2 += 4;
                    outptr3 += 4;
                }

#endif // __ARM_NEON

                // scalar path: remaining pixels, or the whole row without NEON
                for (; j < w; j++)
                {
                    float val = r0[0];

                    outptr0[0] += val * k0[0];
                    outptr0[1] += val * k0[1];
                    outptr0[2] += val * k0[2];
                    outptr0[3] += val * k0[3];

                    outptr1[0] += val * k1[0];
                    outptr1[1] += val * k1[1];
                    outptr1[2] += val * k1[2];
                    outptr1[3] += val * k1[3];

                    outptr2[0] += val * k2[0];
                    outptr2[1] += val * k2[1];
                    outptr2[2] += val * k2[2];
                    outptr2[3] += val * k2[3];

                    outptr3[0] += val * k3[0];
                    outptr3[1] += val * k3[1];
                    outptr3[2] += val * k3[2];
                    outptr3[3] += val * k3[3];

                    r0++;
                    outptr0++;
                    outptr1++;
                    outptr2++;
                    outptr3++;
                }
            }
        }
    }
}

// 4x4 transposed convolution (deconvolution), stride 2, fp32.
//
// For every output channel p the output plane is first filled with the bias
// (or 0 when the bias pointer is null) and then, for every input channel q,
// each input pixel (i, j) is multiplied by the 4x4 kernel and accumulated into
// the 4x4 output window whose top-left corner is (2*i, 2*j).
//
// bottom_blob  input, read as w x h x inch floats
// top_blob     output, written in place; it is not allocated here
//              NOTE(review): the code touches rows 2i..2i+3 and columns
//              2j..2j+3, so top_blob presumably must be at least
//              (2w+2) x (2h+2) - confirm against the caller
// _kernel      weights, indexed as outch x inch x 16 floats (4 rows of 4)
// _bias        optional per-output-channel bias
// opt          only opt.num_threads is used (OpenMP thread count)
static void deconv4x4s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outch = top_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    // output channels are independent, so they are distributed over threads
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        // start from the bias; every input channel below accumulates on top
        out.fill(bias0);

        for (int q = 0; q < inch; q++)
        {
            const float* img0 = bottom_blob.channel(q);

            // 16 weights for the (p, q) output/input channel pair
            const float* kernel0 = kernel + p * inch * 16 + q * 16;

            // r0 walks the whole input channel; it is never reset per row
            // NOTE(review): this assumes input rows are stored back to back - confirm
            const float* r0 = img0;

            // one pointer per kernel row
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 4;
            const float* k2 = kernel0 + 8;
            const float* k3 = kernel0 + 12;

#if __ARM_NEON
            // each vector holds the 4 weights of one kernel row
            float32x4_t _k0 = vld1q_f32(k0);
            float32x4_t _k1 = vld1q_f32(k1);
            float32x4_t _k2 = vld1q_f32(k2);
            float32x4_t _k3 = vld1q_f32(k3);
#endif // __ARM_NEON

            for (int i = 0; i < h; i++)
            {
                // stride 2: input row i contributes to output rows 2i .. 2i+3
                float* outptr = out.row(i * 2);

                // following rows are reached by adding outw
                // NOTE(review): assumes output rows are contiguous with a stride of outw floats - confirm
                float* outptr0 = outptr;
                float* outptr1 = outptr0 + outw;
                float* outptr2 = outptr1 + outw;
                float* outptr3 = outptr2 + outw;

                int j = 0;
#if __ARM_NEON
                // 4 input pixels per iteration, covering output columns 0..9
                for (; j + 3 < w; j += 4)
                {
                    float32x4_t _v = vld1q_f32(r0);

                    // vld2q_f32 de-interleaves 8 floats into even columns
                    // (val[0]) and odd columns (val[1]). The window at +0 and
                    // the window at +2 overlap, so the first store must happen
                    // before the second load. Do not reorder.

                    // row 0
                    float32x4x2_t _out0 = vld2q_f32(outptr0);
                    // 0,2,4,6
                    _out0.val[0] = vmlaq_lane_f32(_out0.val[0], _v, vget_low_f32(_k0), 0);
                    // 1,3,5,7
                    _out0.val[1] = vmlaq_lane_f32(_out0.val[1], _v, vget_low_f32(_k0), 1);
                    vst2q_f32(outptr0, _out0);

                    _out0 = vld2q_f32(outptr0 + 2);
                    // 2,4,6,8
                    _out0.val[0] = vmlaq_lane_f32(_out0.val[0], _v, vget_high_f32(_k0), 0);
                    // 3,5,7,9
                    _out0.val[1] = vmlaq_lane_f32(_out0.val[1], _v, vget_high_f32(_k0), 1);
                    vst2q_f32(outptr0 + 2, _out0);

                    // row 1
                    float32x4x2_t _out1 = vld2q_f32(outptr1);
                    // 0,2,4,6
                    _out1.val[0] = vmlaq_lane_f32(_out1.val[0], _v, vget_low_f32(_k1), 0);
                    // 1,3,5,7
                    _out1.val[1] = vmlaq_lane_f32(_out1.val[1], _v, vget_low_f32(_k1), 1);
                    vst2q_f32(outptr1, _out1);

                    _out1 = vld2q_f32(outptr1 + 2);
                    // 2,4,6,8
                    _out1.val[0] = vmlaq_lane_f32(_out1.val[0], _v, vget_high_f32(_k1), 0);
                    // 3,5,7,9
                    _out1.val[1] = vmlaq_lane_f32(_out1.val[1], _v, vget_high_f32(_k1), 1);
                    vst2q_f32(outptr1 + 2, _out1);

                    // row 2 (same column pattern as rows 0 and 1)
                    float32x4x2_t _out2 = vld2q_f32(outptr2);
                    _out2.val[0] = vmlaq_lane_f32(_out2.val[0], _v, vget_low_f32(_k2), 0);
                    _out2.val[1] = vmlaq_lane_f32(_out2.val[1], _v, vget_low_f32(_k2), 1);
                    vst2q_f32(outptr2, _out2);

                    _out2 = vld2q_f32(outptr2 + 2);
                    _out2.val[0] = vmlaq_lane_f32(_out2.val[0], _v, vget_high_f32(_k2), 0);
                    _out2.val[1] = vmlaq_lane_f32(_out2.val[1], _v, vget_high_f32(_k2), 1);
                    vst2q_f32(outptr2 + 2, _out2);

                    // row 3 (same column pattern as rows 0 and 1)
                    float32x4x2_t _out3 = vld2q_f32(outptr3);
                    _out3.val[0] = vmlaq_lane_f32(_out3.val[0], _v, vget_low_f32(_k3), 0);
                    _out3.val[1] = vmlaq_lane_f32(_out3.val[1], _v, vget_low_f32(_k3), 1);
                    vst2q_f32(outptr3, _out3);

                    _out3 = vld2q_f32(outptr3 + 2);
                    _out3.val[0] = vmlaq_lane_f32(_out3.val[0], _v, vget_high_f32(_k3), 0);
                    _out3.val[1] = vmlaq_lane_f32(_out3.val[1], _v, vget_high_f32(_k3), 1);
                    vst2q_f32(outptr3 + 2, _out3);

                    // stride 2: 4 input pixels advance the output by 8 columns
                    r0 += 4;
                    outptr0 += 8;
                    outptr1 += 8;
                    outptr2 += 8;
                    outptr3 += 8;
                }

#endif // __ARM_NEON

                // scalar path: remaining pixels, or the whole row without NEON
                for (; j < w; j++)
                {
                    float val = r0[0];

                    outptr0[0] += val * k0[0];
                    outptr0[1] += val * k0[1];
                    outptr0[2] += val * k0[2];
                    outptr0[3] += val * k0[3];

                    outptr1[0] += val * k1[0];
                    outptr1[1] += val * k1[1];
                    outptr1[2] += val * k1[2];
                    outptr1[3] += val * k1[3];

                    outptr2[0] += val * k2[0];
                    outptr2[1] += val * k2[1];
                    outptr2[2] += val * k2[2];
                    outptr2[3] += val * k2[3];

                    outptr3[0] += val * k3[0];
                    outptr3[1] += val * k3[1];
                    outptr3[2] += val * k3[2];
                    outptr3[3] += val * k3[3];

                    // one input pixel moves the output window by 2 columns
                    r0++;
                    outptr0 += 2;
                    outptr1 += 2;
                    outptr2 += 2;
                    outptr3 += 2;
                }
            }
        }
    }
}


================================================
FILE: src/layer/arm/deconvolution_4x4_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 4x4 transposed convolution (deconvolution), stride 2, with fp16 storage and
// fp16 arithmetic.
//
// Same scatter-accumulate scheme as the fp32 stride-2 kernel: the output plane
// of channel p is filled with the bias (or 0 when the bias pointer is null),
// then each input pixel (i, j) of every input channel is multiplied by the 4x4
// kernel and accumulated into the output window starting at (2*i, 2*j).
// All loads, stores and multiply-accumulates operate on __fp16 values.
//
// There is no scalar-only build of this function: the fp16 NEON intrinsics
// are used unconditionally.
//
// bottom_blob  input, read as w x h x inch __fp16 values
// top_blob     output, written in place; it is not allocated here
//              NOTE(review): presumably must be at least (2w+2) x (2h+2) -
//              confirm against the caller
// _kernel      weights, indexed as outch x inch x 16 __fp16 values
// _bias        optional per-output-channel bias, read as __fp16
// opt          only opt.num_threads is used (OpenMP thread count)
static void deconv4x4s2_fp16sa_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outch = top_blob.c;

    const __fp16* kernel = _kernel;
    const __fp16* bias = _bias;

    // output channels are independent, so they are distributed over threads
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const __fp16 bias0 = bias ? bias[p] : 0.f;

        // start from the bias; every input channel below accumulates on top
        out.fill(bias0);

        for (int q = 0; q < inch; q++)
        {
            const __fp16* img0 = bottom_blob.channel(q);

            // 16 weights for the (p, q) output/input channel pair
            const __fp16* kernel0 = kernel + p * inch * 16 + q * 16;

            // r0 walks the whole input channel; it is never reset per row
            // NOTE(review): this assumes input rows are stored back to back - confirm
            const __fp16* r0 = img0;

            // one pointer per kernel row
            const __fp16* k0 = kernel0;
            const __fp16* k1 = kernel0 + 4;
            const __fp16* k2 = kernel0 + 8;
            const __fp16* k3 = kernel0 + 12;

            // each vector holds the 4 weights of one kernel row
            float16x4_t _k0 = vld1_f16(k0);
            float16x4_t _k1 = vld1_f16(k1);
            float16x4_t _k2 = vld1_f16(k2);
            float16x4_t _k3 = vld1_f16(k3);

            for (int i = 0; i < h; i++)
            {
                // stride 2: input row i contributes to output rows 2i .. 2i+3
                __fp16* outptr = out.row<__fp16>(i * 2);

                // following rows are reached by adding outw
                // NOTE(review): assumes output rows are contiguous with a stride of outw elements - confirm
                __fp16* outptr0 = outptr;
                __fp16* outptr1 = outptr0 + outw;
                __fp16* outptr2 = outptr1 + outw;
                __fp16* outptr3 = outptr2 + outw;

                int j = 0;
                // 4 input pixels per iteration, covering output columns 0..9
                for (; j + 3 < w; j += 4)
                {
                    float16x4_t _v = vld1_f16(r0);

                    // vld2_f16 de-interleaves 8 values into even columns
                    // (val[0]) and odd columns (val[1]). The window at +0 and
                    // the window at +2 overlap, so the first store must happen
                    // before the second load. Do not reorder.

                    // row 0
                    float16x4x2_t _out0 = vld2_f16(outptr0);
                    // 0,2,4,6
                    _out0.val[0] = vfma_lane_f16(_out0.val[0], _v, _k0, 0);
                    // 1,3,5,7
                    _out0.val[1] = vfma_lane_f16(_out0.val[1], _v, _k0, 1);
                    vst2_f16(outptr0, _out0);

                    _out0 = vld2_f16(outptr0 + 2);
                    // 2,4,6,8
                    _out0.val[0] = vfma_lane_f16(_out0.val[0], _v, _k0, 2);
                    // 3,5,7,9
                    _out0.val[1] = vfma_lane_f16(_out0.val[1], _v, _k0, 3);
                    vst2_f16(outptr0 + 2, _out0);

                    // row 1
                    float16x4x2_t _out1 = vld2_f16(outptr1);
                    // 0,2,4,6
                    _out1.val[0] = vfma_lane_f16(_out1.val[0], _v, _k1, 0);
                    // 1,3,5,7
                    _out1.val[1] = vfma_lane_f16(_out1.val[1], _v, _k1, 1);
                    vst2_f16(outptr1, _out1);

                    _out1 = vld2_f16(outptr1 + 2);
                    // 2,4,6,8
                    _out1.val[0] = vfma_lane_f16(_out1.val[0], _v, _k1, 2);
                    // 3,5,7,9
                    _out1.val[1] = vfma_lane_f16(_out1.val[1], _v, _k1, 3);
                    vst2_f16(outptr1 + 2, _out1);

                    // row 2 (same column pattern as rows 0 and 1)
                    float16x4x2_t _out2 = vld2_f16(outptr2);
                    _out2.val[0] = vfma_lane_f16(_out2.val[0], _v, _k2, 0);
                    _out2.val[1] = vfma_lane_f16(_out2.val[1], _v, _k2, 1);
                    vst2_f16(outptr2, _out2);

                    _out2 = vld2_f16(outptr2 + 2);
                    _out2.val[0] = vfma_lane_f16(_out2.val[0], _v, _k2, 2);
                    _out2.val[1] = vfma_lane_f16(_out2.val[1], _v, _k2, 3);
                    vst2_f16(outptr2 + 2, _out2);

                    // row 3 (same column pattern as rows 0 and 1)
                    float16x4x2_t _out3 = vld2_f16(outptr3);
                    _out3.val[0] = vfma_lane_f16(_out3.val[0], _v, _k3, 0);
                    _out3.val[1] = vfma_lane_f16(_out3.val[1], _v, _k3, 1);
                    vst2_f16(outptr3, _out3);

                    _out3 = vld2_f16(outptr3 + 2);
                    _out3.val[0] = vfma_lane_f16(_out3.val[0], _v, _k3, 2);
                    _out3.val[1] = vfma_lane_f16(_out3.val[1], _v, _k3, 3);
                    vst2_f16(outptr3 + 2, _out3);

                    // stride 2: 4 input pixels advance the output by 8 columns
                    r0 += 4;
                    outptr0 += 8;
                    outptr1 += 8;
                    outptr2 += 8;
                    outptr3 += 8;
                }
                // scalar tail for the last w % 4 pixels of the row
                for (; j < w; j++)
                {
                    __fp16 val = r0[0];

                    outptr0[0] += val * k0[0];
                    outptr0[1] += val * k0[1];
                    outptr0[2] += val * k0[2];
                    outptr0[3] += val * k0[3];

                    outptr1[0] += val * k1[0];
                    outptr1[1] += val * k1[1];
                    outptr1[2] += val * k1[2];
                    outptr1[3] += val * k1[3];

                    outptr2[0] += val * k2[0];
                    outptr2[1] += val * k2[1];
                    outptr2[2] += val * k2[2];
                    outptr2[3] += val * k2[3];

                    outptr3[0] += val * k3[0];
                    outptr3[1] += val * k3[1];
                    outptr3[2] += val * k3[2];
                    outptr3[3] += val * k3[3];

                    // one input pixel moves the output window by 2 columns
                    r0++;
                    outptr0 += 2;
                    outptr1 += 2;
                    outptr2 += 2;
                    outptr3 += 2;
                }
            }
        }
    }
}


================================================
FILE: src/layer/arm/deconvolution_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "deconvolution_arm.h"

#include "layer_type.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

#include "cpu.h"

namespace ncnn {

#include "deconvolution_3x3.h"
#include "deconvolution_4x4.h"

// Declare which storage layouts this ARM implementation can handle and start
// with no helper layers attached.
Deconvolution_arm::Deconvolution_arm()
{
    // helper layers stay null until create_pipeline() assigns them
    gemm = 0;
    activation = 0;

#if NCNN_BF16
    support_bf16_storage = true;
#endif

#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage is offered only when the running cpu reports asimdhp
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif
}

int Deconvolution_arm::create_pipeline(const Option& opt)
{
    if (dynamic_weight)
        return 0;

    activation = create_activation_layer(activation_type, activation_params, opt);

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage)
    {
        return create_pipeline_fp16s(opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        return create_pipeline_bf16s(opt);
    }
#endif

    const int maxk = kernel_w * kernel_h;
    int num_input = weight_data_size / maxk / num_output;

    int elempack = 1;
    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        elempack = num_input % 4 == 0 ? 4 : 1;
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif

    if (opt.use_sgemm_convolution)
    {
        const int maxk = kernel_w * kernel_h;

        gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);

        ncnn::ParamDict pd;
        pd.set(2, 1);                 // transA
        pd.set(3, 0);                 // transB
        pd.set(4, 1);                 // constantA
        pd.set(5, 0);                 // constantB
        pd.set(6, 1);                 // constantC
        pd.set(7, maxk * num_output); // M = maxk*num_output
        pd.set(8, 0);                 // N = size
        pd.set(9, num_input);         // K = inch
        pd.set(10, -1);               // constant_broadcast_type_C = null
        pd.set(11, 0);                // output_N1M
        pd.set(12, out_elempack);

        gemm->load_param(pd);

        // maxk-inch-outch to pa-maxk-outch/pa-inch
        Mat tmp;
        {
            Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

            tmp.create(maxk * num_output, num_input);

            for (int p = 0; p < num_input; p += 1)
            {
                float* g00 = tmp.row(p);

                for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
                {
                    for (int k = 0; k < maxk; k++)
                    {
                        for (int i = 0; i < out_elempack; i++)
                        {
                            const float* k00 = weight_data_r2.channel(q + i).row(p);
                            g00[0] = k00[k];
                            g00++;
                        }
                    }
                }
            }
        }

        ncnn::Mat weights[1];
        weights[0] = tmp;

        gemm->load_model(ModelBinFromMatArray(weights));

        Option opt1 = opt;
        opt1.use_fp16_storage = false;
        gemm->create_pipeline(opt1);
    }
    else
    {
        Mat weight_data_transposed(weight_data.w);
        {
            float* pt = weight_data_transposed;
            const float* p = weight_data;

            for (int i = 0; i < num_input * num_output; i++)
            {
                for (int k = 0; k < maxk; k++)
                {
                    pt[maxk - 1 - k] = p[k];
                }

                p += maxk;
                pt += maxk;
            }
        }

        // src = kw-kh-inch-outch
        // dst = pb-pa-kw-kh-inch/pa-outch/pb
        Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            float* g00 = weight_data_tm.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        for (int j = 0; j < out_elempack; j++)
                        {
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                            g00[0] = k00[k];

                            g00++;
                        }
                    }
                }
            }
        }

        // pack1
        if (elempack == 1 && out_elempack == 1)
        {
            if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
            {
                weight_data_tm = weight_data;
            }
            else if (kernel_w == 3 && kernel_h == 3 && stride_w == 2 && stride_h == 2 && dilation_w == 1 && dilation_h == 1)
            {
                weight_data_tm = weight_data;
            }
            else if (kernel_w == 4 && kernel_h == 4 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
            {
                weight_data_tm = weight_data;
            }
            else if (kernel_w == 4 && kernel_h == 4 && stride_w == 2 && stride_h == 2 && dilation_w == 1 && dilation_h == 1)
            {
                weight_data_tm = weight_data;
            }
            else
            {
                weight_data_tm = weight_data_transposed;
            }
        }
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

// Tear down and free the helper layers held in activation and gemm,
// leaving both pointers null. Always returns 0.
int Deconvolution_arm::destroy_pipeline(const Option& opt)
{
    // activation layer first, if one is attached
    if (activation != 0)
    {
        activation->destroy_pipeline(opt);
        delete activation;
    }
    activation = 0;

    // then the gemm helper, if one is attached
    if (gemm != 0)
    {
        gemm->destroy_pipeline(opt);
        delete gemm;
    }
    gemm = 0;

    return 0;
}

int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);
        else
            return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
#endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Deconvolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif
    size_t out_elemsize = elemsize / elempack * out_elempack;

    int out_channels = num_output / out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, out_channels, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, out_channels, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    if (opt.use_sgemm_convolution)
    {
        // sgemm
        Mat bottom_blob_2 = bottom_blob;
        {
            bottom_blob_2.w = bottom_blob.w * bottom_blob.h;
            bottom_blob_2.h = 1;
        }
        Mat top_col2im;
        Option opt_b = opt;
        opt_b.blob_allocator = top_blob_bordered.allocator;
        int ret = gemm->forward(bottom_blob_2, top_col2im, opt_b);
        if (ret != 0)
            return ret;

        {
            // col2im
            const int gap = (outw * stride_h - w * stride_w) * out_elempack;

#if __ARM_NEON
            if (out_elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int p = 0; p < out_channels; p++)
                {
                    const float* sptr = top_col2im.row(p * maxk);
                    Mat outm = top_blob_bordered.channel(p);

                    if (bias_data.empty())
                    {
                        outm.fill(vdupq_n_f32(0.f));
                    }
                    else
                    {
                        outm.fill(vld1q_f32((const float*)bias_data + p * 4));
                    }

                    for (int u = 0; u < kernel_h; u++)
                    {
                        for (int v = 0; v < kernel_w; v++)
                        {
                            float* ptr = outm.row(dilation_h * u) + dilation_w * v * 4;

                            for (int i = 0; i < h; i++)
                            {
                                for (int j = 0; j < w; j++)
                                {
                                    float32x4_t _val = vld1q_f32(ptr);
                                    float32x4_t _s = vld1q_f32(sptr);
                                    _val = vaddq_f32(_val, _s);
                                    vst1q_f32(ptr, _val);

                                    ptr += stride_w * 4;
                                    sptr += 4;
                                }

                                ptr += gap;
                            }
                        }
                    }
                }
            }
#endif // __ARM_NEON

            if (out_elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int p = 0; p < out_channels; p++)
                {
                    const float* sptr = top_col2im.row(p * maxk);
                    Mat outm = top_blob_bordered.channel(p);

                    const float bias = bias_data.empty() ? 0.f : bias_data[p];
                    outm.fill(bias);

                    for (int u = 0; u < kernel_h; u++)
                    {
                        for (int v = 0; v < kernel_w; v++)
                        {
                            float* ptr = outm.row(dilation_h * u) + dilation_w * v;

                            for (int i = 0; i < h; i++)
                            {
                                for (int j = 0; j < w; j++)
                                {
                                    ptr[0] += sptr[0];

                                    ptr += stride_w;
                                    sptr += 1;
                                }

                                ptr += gap;
                            }
                        }
                    }
                }
            }
        }

        if (activation)
        {
            activation->forward_inplace(top_blob_bordered, opt);
        }
    }
    else
    {
#if __ARM_NEON
        if (elempack == 4 && out_elempack == 4)
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < out_channels; p++)
            {
                float* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32(((const float*)bias_data) + p * 4);
                        }

                        const float* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const float* sptr = m.row(sy) + sx * 4;

                                    float32x4_t _val = vld1q_f32(sptr);

                                    int k = y * kernel_w + x;

                                    float32x4_t _w0 = vld1q_f32(kptr + k * 16);
                                    float32x4_t _w1 = vld1q_f32(kptr + k * 16 + 4);
                                    float32x4_t _w2 = vld1q_f32(kptr + k * 16 + 8);
                                    float32x4_t _w3 = vld1q_f32(kptr + k * 16 + 12);

#if __aarch64__
                                    _sum = vmlaq_laneq_f32(_sum, _w0, _val, 0);
                                    _sum = vmlaq_laneq_f32(_sum, _w1, _val, 1);
                                    _sum = vmlaq_laneq_f32(_sum, _w2, _val, 2);
                                    _sum = vmlaq_laneq_f32(_sum, _w3, _val, 3);
#else
                                    _sum = vmlaq_lane_f32(_sum, _w0, vget_low_f32(_val), 0);
                                    _sum = vmlaq_lane_f32(_sum, _w1, vget_low_f32(_val), 1);
                                    _sum = vmlaq_lane_f32(_sum, _w2, vget_high_f32(_val), 0);
                                    _sum = vmlaq_lane_f32(_sum, _w3, vget_high_f32(_val), 1);
#endif
                                }
                            }

                            kptr += maxk * 16;
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1q_f32(outptr + j * 4, _sum);
                    }

                    outptr += outw * 4;
                }
            }
        }

        if (elempack == 1 && out_elempack == 4)
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < out_channels; p++)
            {
                float* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32(((const float*)bias_data) + p * 4);
                        }

                        const float* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const float* sptr = m.row(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    float32x4_t _val = vdupq_n_f32(sptr[sx]);

                                    int k = y * kernel_w + x;

                                    float32x4_t _w = vld1q_f32(kptr + k * 4);

                                    _sum = vmlaq_f32(_sum, _val, _w);
                                }
                            }

                            kptr += maxk * 4;
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1q_f32(outptr + j * 4, _sum);
                    }

                    outptr += outw * 4;
                }
            }
        }

        if (elempack == 4 && out_elempack == 1)
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output; p++)
            {
                float* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[p];
                        }

                        const float* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const float* sptr = m.row(sy) + sx * 4;

                                    float32x4_t _val = vld1q_f32(sptr);

                                    int k = y * kernel_w + x;

                                    float32x4_t _w = vld1q_f32(kptr + k * 4);

                                    float32x4_t _s4 = vmulq_f32(_val, _w);
#if __aarch64__
                                    sum += vaddvq_f32(_s4); // dot
#else
                                    float32x2_t _ss = vadd_f32(vget_low_f32(_s4), vget_high_f32(_s4));
                                    _ss = vpadd_f32(_ss, _ss);
                                    sum += vget_lane_f32(_ss, 0);
#endif
                                }
                            }

                            kptr += maxk * 4;
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = sum;
                    }

                    outptr += outw;
                }
            }
        }
#endif // __ARM_NEON

        if (elempack == 1 && out_elempack == 1)
        {
            if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
            {
                deconv3x3s1_neon(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob_bordered, opt);
                }
            }
            else if (kernel_w == 3 && kernel_h == 3 && stride_w == 2 && stride_h == 2 && dilation_w == 1 && dilation_h == 1)
            {
                deconv3x3s2_neon(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob_bordered, opt);
                }
            }
            else if (kernel_w == 4 && kernel_h == 4 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1)
            {
                deconv4x4s1_neon(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob_bordered, opt);
                }
            }
            else if (kernel_w == 4 && kernel_h == 4 && stride_w == 2 && stride_h == 2 && dilation_w == 1 && dilation_h == 1)
            {
                deconv4x4s2_neon(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob_bordered, opt);
                }
            }
            else
            {
                // num_output
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int p = 0; p < num_output; p++)
                {
                    float* outptr = top_blob_bordered.channel(p);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float sum = 0.f;

                            if (bias_term)
                            {
                                sum = bias_data[p];
                            }

                            const float* kptr = (const float*)weight_data_tm + maxk * channels * p;

                            // channels
                            for (int q = 0; q < channels; q++)
                            {
                                const Mat m = bottom_blob.channel(q);

                                for (int y = 0; y < kernel_h; y++)
                                {
                                    int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                    if (sys < 0 || sys % stride_h != 0)
                                        continue;

                                    int sy = sys / stride_h;
                                    if (sy >= h)
                                        continue;

                                    const float* sptr = m.row(sy);

                                    for (int x = 0; x < kernel_w; x++)
                                    {
                                        int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                        if (sxs < 0 || sxs % stride_w != 0)
                                            continue;

                                        int sx = sxs / stride_w;
                                        if (sx >= w)
                                            continue;

                                        float val = sptr[sx];

                                        int k = y * kernel_w + x;

                                        float w = kptr[k];

                                        sum += val * w;
                                    }
                                }

                                kptr += maxk;
                            }

                            sum = activation_ss(sum, activation_type, activation_params);

                            outptr[j] = sum;
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

int Deconvolution_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _num_input = bottom_blob.c * bottom_blob.elempack;
    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.d * 1;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

#if NCNN_ARM82
    if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && weight_data_flattened.elembits() == 16)
    {
        Mat weight_data_flattened_fp32;
        cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
        weight_data_flattened = weight_data_flattened_fp32;
    }
#endif // NCNN_ARM82
#if NCNN_BF16
    if (opt.use_bf16_storage && weight_data_flattened.elembits() == 16)
    {
        Mat weight_data_flattened_fp32;
        cast_bfloat16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
        weight_data_flattened = weight_data_flattened_fp32;
    }
#endif // NCNN_BF16

    // weight_data_flattened as pack1
    weight_data_flattened.w *= weight_data_flattened.elempack;
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
    weight_data_flattened.elempack = 1;

    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
    Mat weight_data_transposed;
    {
        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / 1, 4u, opt.workspace_allocator);
        if (weight_data_transposed.empty())
            return -100;

        const int outch_g = _num_output / 1;
        const int inch_g = _num_input / 1;
        const int maxk = _kernel_h * _kernel_w;

        for (int g = 0; g < 1; g++)
        {
            // reorder weight from inch-outch to outch-inch
            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
            for (int i = 0; i < outch_g; i++)
            {
                for (int j = 0; j < inch_g; j++)
                {
                    for (int k = 0; k < maxk; k++)
                    {
                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
                    }
                }
            }
        }
    }

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;

#if NCNN_ARM82
        if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && bias_data_flattened.elembits() == 16)
        {
            Mat bias_data_flattened_fp32;
            cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
            bias_data_flattened = bias_data_flattened_fp32;
        }
#endif // NCNN_ARM82
#if NCNN_BF16
        if (opt.use_bf16_storage && bias_data_flattened.elembits() == 16)
        {
            Mat bias_data_flattened_fp32;
            cast_bfloat16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
            bias_data_flattened = bias_data_flattened_fp32;
        }
#endif // NCNN_BF16

        // bias_data_flattened as pack1
        bias_data_flattened.w *= bias_data_flattened.elempack;
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
        bias_data_flattened.elempack = 1;
    }

    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);

    ncnn::ParamDict pd;
    pd.set(0, _num_output);
    pd.set(1, _kernel_w);
    pd.set(11, _kernel_h);
    pd.set(2, dilation_w);
    pd.set(12, dilation_h);
    pd.set(3, stride_w);
    pd.set(13, stride_h);
    pd.set(4, pad_left);
    pd.set(15, pad_right);
    pd.set(14, pad_top);
    pd.set(16, pad_bottom);
    pd.set(18, output_pad_right);
    pd.set(19, output_pad_bottom);
    pd.set(20, output_w);
    pd.set(21, output_h);
    pd.set(5, bias_term);
    pd.set(6, weight_data_transposed.w);
    pd.set(9, activation_type);
    pd.set(10, activation_params);

    op->load_param(pd);

    ncnn::Mat weights[2];
    weights[0] = weight_data_transposed;
    weights[1] = bias_data_flattened;

    op->load_model(ncnn::ModelBinFromMatArray(weights));

    op->create_pipeline(opt);

    op->forward(bottom_blob, top_blob, opt);

    op->destroy_pipeline(opt);

    delete op;

    return 0;
}

#if NCNN_BF16
int Deconvolution_arm::create_pipeline_bf16s(const Option& opt)
{
    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    int elempack = 1;
    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        elempack = num_input % 4 == 0 ? 4 : 1;
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif

    Mat weight_data_transposed(weight_data.w);
    {
        float* pt = weight_data_transposed;
        const float* p = weight_data;

        for (int i = 0; i < num_input * num_output; i++)
        {
            for (int k = 0; k < maxk; k++)
            {
                pt[maxk - 1 - k] = p[k];
            }

            p += maxk;
            pt += maxk;
        }
    }

    // src = kw-kh-inch-outch
    // dst = pb-pa-kw-kh-inch/pa-outch/pb
    {
        Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack);

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            unsigned short* g00 = weight_data_tm.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        for (int j = 0; j < out_elempack; j++)
                        {
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                            g00[0] = float32_to_bfloat16(k00[k]);

                            g00++;
                        }
                    }
                }
            }
        }
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

// Deconvolution forward pass for bf16-stored blobs.
//
// Input and weights are read as 16-bit values and widened to fp32, the sum is
// accumulated in fp32 (bias is read as fp32), the activation is applied, and
// the result is narrowed back to 16-bit on store.
//
// Written as a gather: for every output pixel (i, j) each kernel tap (y, x)
// is mapped back to the single input pixel that could contribute to it; taps
// that fall before the input, between stride steps, or past the input edge
// are skipped.
//
// One of four kernels runs depending on (elempack, out_elempack); the packed
// ones are only compiled with __ARM_NEON.
//
// Returns 0 on success, -100 if the intermediate or final blob is empty.
int Deconvolution_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // deconvolve with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Deconvolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    // footprint of the dilated kernel
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    // output size before cut_padding is applied
    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif
    // keep the per-scalar byte width of the input, scaled to the output packing
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        // padding or an explicit output size is configured: compute into a workspace buffer
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        // NOTE(review): presumably lets the result be allocated straight from the blob
        // allocator so cut_padding has nothing to crop - confirm against cut_padding
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

#if __ARM_NEON
    // pack4 input -> pack4 output
    if (elempack == 4 && out_elempack == 4)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output / out_elempack; p++)
            {
                unsigned short* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32(((const float*)bias_data) + p * 4);
                        }

                        const unsigned short* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                // stride-scaled source row for this tap; must be non-negative and land on a stride step
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    // same mapping for the column
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const unsigned short* sptr = m.row<const unsigned short>(sy) + sx * 4;

                                    float32x4_t _val = bfloat2float(vld1_u16(sptr));

                                    int k = y * kernel_w + x;

                                    // 16 weights per tap: _wN holds the 4 output lanes fed by input lane N
                                    float32x4_t _w0 = bfloat2float(vld1_u16(kptr + k * 16));
                                    float32x4_t _w1 = bfloat2float(vld1_u16(kptr + k * 16 + 4));
                                    float32x4_t _w2 = bfloat2float(vld1_u16(kptr + k * 16 + 8));
                                    float32x4_t _w3 = bfloat2float(vld1_u16(kptr + k * 16 + 12));

#if __aarch64__
                                    _sum = vmlaq_laneq_f32(_sum, _w0, _val, 0);
                                    _sum = vmlaq_laneq_f32(_sum, _w1, _val, 1);
                                    _sum = vmlaq_laneq_f32(_sum, _w2, _val, 2);
                                    _sum = vmlaq_laneq_f32(_sum, _w3, _val, 3);
#else
                                    _sum = vmlaq_lane_f32(_sum, _w0, vget_low_f32(_val), 0);
                                    _sum = vmlaq_lane_f32(_sum, _w1, vget_low_f32(_val), 1);
                                    _sum = vmlaq_lane_f32(_sum, _w2, vget_high_f32(_val), 0);
                                    _sum = vmlaq_lane_f32(_sum, _w3, vget_high_f32(_val), 1);
#endif
                                }
                            }

                            // advance to the weights of the next input channel
                            kptr += maxk * 16;
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1_u16(outptr + j * 4, float2bfloat(_sum));
                    }

                    outptr += outw * 4;
                }
            }
        }
    }

    // pack1 input -> pack4 output
    if (elempack == 1 && out_elempack == 4)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output / out_elempack; p++)
            {
                unsigned short* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32(((const float*)bias_data) + p * 4);
                        }

                        const unsigned short* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const unsigned short* sptr = m.row<const unsigned short>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    // broadcast the single input value across the 4 output lanes
                                    float32x4_t _val = vdupq_n_f32(bfloat16_to_float32(sptr[sx]));

                                    int k = y * kernel_w + x;

                                    float32x4_t _w = bfloat2float(vld1_u16(kptr + k * 4));

                                    _sum = vmlaq_f32(_sum, _val, _w);
                                }
                            }

                            kptr += maxk * 4;
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1_u16(outptr + j * 4, float2bfloat(_sum));
                    }

                    outptr += outw * 4;
                }
            }
        }
    }

    // pack4 input -> pack1 output
    if (elempack == 4 && out_elempack == 1)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output / out_elempack; p++)
            {
                unsigned short* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[p];
                        }

                        const unsigned short* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const unsigned short* sptr = m.row<const unsigned short>(sy) + sx * 4;

                                    float32x4_t _val = bfloat2float(vld1_u16(sptr));

                                    int k = y * kernel_w + x;

                                    float32x4_t _w = bfloat2float(vld1_u16(kptr + k * 4));

                                    // multiply lane-wise, then reduce the 4 products into the scalar sum
                                    float32x4_t _s4 = vmulq_f32(_val, _w);
#if __aarch64__
                                    sum += vaddvq_f32(_s4); // dot
#else
                                    float32x2_t _ss = vadd_f32(vget_low_f32(_s4), vget_high_f32(_s4));
                                    _ss = vpadd_f32(_ss, _ss);
                                    sum += vget_lane_f32(_ss, 0);
#endif
                                }
                            }

                            kptr += maxk * 4;
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = float32_to_bfloat16(sum);
                    }

                    outptr += outw;
                }
            }
        }
    }
#endif // __ARM_NEON

    // pack1 input -> pack1 output, plain scalar loop
    if (elempack == 1 && out_elempack == 1)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output; p++)
            {
                unsigned short* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[p];
                        }

                        const unsigned short* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const unsigned short* sptr = m.row<const unsigned short>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    float val = bfloat16_to_float32(sptr[sx]);

                                    int k = y * kernel_w + x;

                                    float w = bfloat16_to_float32(kptr[k]);

                                    sum += val * w;
                                }
                            }

                            kptr += maxk;
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = float32_to_bfloat16(sum);
                    }

                    outptr += outw;
                }
            }
        }
    }

    // produce the final top_blob from the uncropped result
    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/deconvolution_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DECONVOLUTION_ARM_H
#define LAYER_DECONVOLUTION_ARM_H

#include "deconvolution.h"

namespace ncnn {

// ARM implementation of the Deconvolution layer.
// Adds fp16 (NCNN_ARM82) and bf16 (NCNN_BF16) code paths on top of the
// base-class interface.
class Deconvolution_arm : public Deconvolution
{
public:
    Deconvolution_arm();

    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // single-input forward, weights come from the loaded model
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // multi-input forward: weights (and bias when bias_term is set) are taken from bottom_blobs
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 variants, implemented in deconvolution_arm_asimdhp.cpp
    int create_pipeline_fp16s(const Option& opt);
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 storage variants
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif

public:
    // optional activation layer; forward() applies it in place after the specialised pack1 kernels
    Layer* activation;
    // Gemm helper layer; create_pipeline_fp16s creates it when fp16 arithmetic and sgemm convolution are enabled
    Layer* gemm;

    // weights repacked by the create_pipeline* functions into the layout the forward kernels read
    Mat weight_data_tm;

    // fp16
    // NOTE(review): presumably the bias converted to fp16 for the fp16 arithmetic path - confirm in create_pipeline_fp16s
    Mat bias_data_fp16;
};

} // namespace ncnn

#endif // LAYER_DECONVOLUTION_ARM_H


================================================
FILE: src/layer/arm/deconvolution_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "deconvolution_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "deconvolution_4x4_fp16s.h"
#endif

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Prepares the fp16 deconvolution pipeline: repacks the fp32 weights for the
// kernel that will run and converts the bias to fp16.
//
// Two weight layouts are produced, depending on the options:
//   - use_fp16_arithmetic && use_sgemm_convolution: the weights are handed to an
//     internal Gemm layer as a matrix with num_input rows of maxk * num_output
//     values each;
//   - otherwise: every kernel is reversed and the weights are interleaved into
//     weight_data_tm as fp16, grouped by elempack (input channels) and
//     out_elempack (output channels).
//
// Returns 0 on success, -100 when a weight buffer could not be allocated and
// -1 when the Gemm layer could not be created.
int Deconvolution_arm::create_pipeline_fp16s(const Option& opt)
{
    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    int elempack = 1;
    int out_elempack = 1;

    if (opt.use_packing_layout)
    {
        // pack8 is only selected when fp16 arithmetic is enabled
        elempack = opt.use_fp16_arithmetic && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1;
        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    }

    if (opt.use_fp16_arithmetic && opt.use_sgemm_convolution)
    {
        // maxk-inch-outch to pa-maxk-outch/pa-inch
        //
        // The matrix is built before the Gemm layer is created so that an
        // allocation failure can return without any layer having been created.
        Mat tmp;
        {
            Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
            if (weight_data_r2.empty())
                return -100;

            tmp.create(maxk * num_output, num_input);
            if (tmp.empty())
                return -100;

            for (int p = 0; p < num_input; p += 1)
            {
                float* g00 = tmp.row(p);

                for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
                {
                    for (int k = 0; k < maxk; k++)
                    {
                        for (int i = 0; i < out_elempack; i++)
                        {
                            const float* k00 = weight_data_r2.channel(q + i).row(p);
                            g00[0] = k00[k];
                            g00++;
                        }
                    }
                }
            }
        }

        gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
        if (!gemm)
            return -1;

        ncnn::ParamDict pd;
        pd.set(2, 1);                 // transA
        pd.set(3, 0);                 // transB
        pd.set(4, 1);                 // constantA
        pd.set(5, 0);                 // constantB
        pd.set(6, 1);                 // constantC
        pd.set(7, maxk * num_output); // M = maxk*num_output
        pd.set(8, 0);                 // N = size
        pd.set(9, num_input);         // K = inch
        pd.set(10, -1);               // constant_broadcast_type_C = null
        pd.set(11, 0);                // output_N1M
        pd.set(12, out_elempack);

        gemm->load_param(pd);

        ncnn::Mat weights[1];
        weights[0] = tmp;

        gemm->load_model(ModelBinFromMatArray(weights));

        gemm->create_pipeline(opt);
    }
    else
    {
        // reverse the maxk taps of every (input, output) kernel
        Mat weight_data_transposed(weight_data.w);
        if (weight_data_transposed.empty())
            return -100;

        {
            float* pt = weight_data_transposed;
            const float* p = weight_data;

            for (int i = 0; i < num_input * num_output; i++)
            {
                for (int k = 0; k < maxk; k++)
                {
                    pt[maxk - 1 - k] = p[k];
                }

                p += maxk;
                pt += maxk;
            }
        }

        // src = kw-kh-inch-outch
        // dst = pb-pa-kw-kh-inch/pa-outch/pb
        Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);
        if (weight_data_r2.empty())
            return -100;

        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)2u * elempack * out_elempack, elempack * out_elempack);
        if (weight_data_tm.empty())
            return -100;

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            __fp16* g00 = weight_data_tm.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        for (int j = 0; j < out_elempack; j++)
                        {
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                            g00[0] = (__fp16)k00[k];

                            g00++;
                        }
                    }
                }
            }
        }
    }

    if (elempack == 1 && out_elempack == 1 && opt.use_fp16_arithmetic)
    {
        if (kernel_w == 4 && kernel_h == 4 && stride_w == 2 && stride_h == 2 && dilation_w == 1 && dilation_h == 1)
        {
            // Stores a plain fp16 cast of the original weights in
            // weight_data_tm, replacing whatever the branch above put there.
            // NOTE(review): presumably consumed by the 4x4s2 kernel from
            // deconvolution_4x4_fp16s.h - confirm against forward_fp16sa.
            ncnn::cast_float32_to_float16(weight_data, weight_data_tm, opt);
        }
    }

    ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);

    // the fp32 weights are no longer needed once they have been repacked
    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int Deconvolution_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // deconvolv with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Deconvolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
    int out_elempack = opt.use_packing_layout && num_output % 4 == 0 ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    if (elempack == 4 && out_elempack == 4)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output / out_elempack; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32(((const float*)bias_data) + p * 4);
                        }

                        const __fp16* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr));

                                    int k = y * kernel_w + x;

                                    float32x4_t _w0 = vcvt_f32_f16(vld1_f16(kptr + k * 16));
                                    float32x4_t _w1 = vcvt_f32_f16(vld1_f16(kptr + k * 16 + 4));
                                    float32x4_t _w2 = vcvt_f32_f16(vld1_f16(kptr + k * 16 + 8));
                                    float32x4_t _w3 = vcvt_f32_f16(vld1_f16(kptr + k * 16 + 12));

                                    _sum = vfmaq_laneq_f32(_sum, _w0, _val, 0);
                                    _sum = vfmaq_laneq_f32(_sum, _w1, _val, 1);
                                    _sum = vfmaq_laneq_f32(_sum, _w2, _val, 2);
                                    _sum = vfmaq_laneq_f32(_sum, _w3, _val, 3);
                                }
                            }

                            kptr += maxk * 16;
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1_f16(outptr + j * 4, vcvt_f16_f32(_sum));
                    }

                    outptr += outw * 4;
                }
            }
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output / out_elempack; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32(((const float*)bias_data) + p * 4);
                        }

                        const __fp16* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const __fp16* sptr = m.row<const __fp16>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    float32x4_t _val = vdupq_n_f32((float)sptr[sx]);

                                    int k = y * kernel_w + x;

                                    float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr + k * 4));

                                    _sum = vfmaq_f32(_sum, _val, _w);
                                }
                            }

                            kptr += maxk * 4;
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1_f16(outptr + j * 4, vcvt_f16_f32(_sum));
                    }

                    outptr += outw * 4;
                }
            }
        }
    }

    if (elempack == 4 && out_elempack == 1)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output / out_elempack; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[p];
                        }

                        const __fp16* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr));

                                    int k = y * kernel_w + x;

                                    float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr + k * 4));

                                    float32x4_t _s4 = vmulq_f32(_val, _w);

                                    sum += vaddvq_f32(_s4); // dot
                                }
                            }

                            kptr += maxk * 4;
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = (__fp16)sum;
                    }

                    outptr += outw;
                }
            }
        }
    }

    if (elempack == 1 && out_elempack == 1)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[p];
                        }

                        const __fp16* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const __fp16* sptr = m.row<const __fp16>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    float val = (float)sptr[sx];

                                    int k = y * kernel_w + x;

                                    float w = (float)kptr[k];

                                    sum += val * w;
                                }
                            }

                            kptr += maxk;
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = (__fp16)sum;
                    }

                    outptr += outw;
                }
            }
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

int Deconvolution_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // deconvolv with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Deconvolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    }
    size_t out_elemsize = elemsize / elempack * out_elempack;

    int out_channels = num_output / out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, out_channels, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, out_channels, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    if (opt.use_sgemm_convolution)
    {
        // sgemm
        Mat bottom_blob_2 = bottom_blob;
        {
            bottom_blob_2.w = bottom_blob.w * bottom_blob.h;
            bottom_blob_2.h = 1;
        }
        Mat top_col2im;
        Option opt_b = opt;
        opt_b.blob_allocator = top_blob_bordered.allocator;
        int ret = gemm->forward(bottom_blob_2, top_col2im, opt_b);
        if (ret != 0)
            return ret;

        {
            // col2im
            const int gap = (outw * stride_h - w * stride_w) * out_elempack;

            if (out_elempack == 8)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int p = 0; p < out_channels; p++)
                {
                    const __fp16* sptr = top_col2im.row<const __fp16>(p * maxk);
                    Mat outm = top_blob_bordered.channel(p);

                    if (bias_data.empty())
                    {
                        outm.fill(vdupq_n_f16(0.f));
                    }
                    else
                    {
                        outm.fill(vld1q_f16((const __fp16*)bias_data_fp16 + p * 8));
                    }

                    for (int u = 0; u < kernel_h; u++)
                    {
                        for (int v = 0; v < kernel_w; v++)
                        {
                            __fp16* ptr = outm.row<__fp16>(dilation_h * u) + dilation_w * v * 8;

                            for (int i = 0; i < h; i++)
                            {
                                for (int j = 0; j < w; j++)
                                {
                                    float16x8_t _val = vld1q_f16(ptr);
                                    float16x8_t _s = vld1q_f16(sptr);
                                    _val = vaddq_f16(_val, _s);
                                    vst1q_f16(ptr, _val);

                                    ptr += stride_w * 8;
                                    sptr += 8;
                                }

                                ptr += gap;
                            }
                        }
                    }
                }
            }

            if (out_elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int p = 0; p < out_channels; p++)
                {
                    const __fp16* sptr = top_col2im.row<const __fp16>(p * maxk);
                    Mat outm = top_blob_bordered.channel(p);

                    if (bias_data.empty())
                    {
                        outm.fill(vdup_n_f16(0.f));
                    }
                    else
                    {
                        outm.fill(vld1_f16((const __fp16*)bias_data_fp16 + p * 4));
                    }

                    for (int u = 0; u < kernel_h; u++)
                    {
                        for (int v = 0; v < kernel_w; v++)
                        {
                            __fp16* ptr = outm.row<__fp16>(dilation_h * u) + dilation_w * v * 4;

                            for (int i = 0; i < h; i++)
                            {
                                for (int j = 0; j < w; j++)
                                {
                                    float16x4_t _val = vld1_f16(ptr);
                                    float16x4_t _s = vld1_f16(sptr);
                                    _val = vadd_f16(_val, _s);
                                    vst1_f16(ptr, _val);

                                    ptr += stride_w * 4;
                                    sptr += 4;
                                }

                                ptr += gap;
                            }
                        }
                    }
                }
            }

            if (out_elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int p = 0; p < out_channels; p++)
                {
                    const __fp16* sptr = top_col2im.row<const __fp16>(p * maxk);
                    Mat outm = top_blob_bordered.channel(p);

                    const __fp16 bias = bias_data_fp16.empty() ? 0.f : ((const __fp16*)bias_data_fp16)[p];
                    outm.fill(bias);

                    for (int u = 0; u < kernel_h; u++)
                    {
                        for (int v = 0; v < kernel_w; v++)
                        {
                            __fp16* ptr = outm.row<__fp16>(dilation_h * u) + dilation_w * v;

                            for (int i = 0; i < h; i++)
                            {
                                for (int j = 0; j < w; j++)
                                {
                                    ptr[0] += sptr[0];

                                    ptr += stride_w;
                                    sptr += 1;
                                }

                                ptr += gap;
                            }
                        }
                    }
                }
            }
        }

        if (activation)
        {
            activation->forward_inplace(top_blob_bordered, opt);
        }
    }
    else
    {
        if (elempack == 8 && out_elempack == 8)
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < out_channels; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f16((const __fp16*)bias_data_fp16 + p * 8);
                        }

                        const __fp16* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 8;

                                    float16x8_t _val = vld1q_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x8_t _w0 = vld1q_f16(kptr + k * 64);
                                    float16x8_t _w1 = vld1q_f16(kptr + k * 64 + 8);
                                    float16x8_t _w2 = vld1q_f16(kptr + k * 64 + 16);
                                    float16x8_t _w3 = vld1q_f16(kptr + k * 64 + 24);
                                    float16x8_t _w4 = vld1q_f16(kptr + k * 64 + 32);
                                    float16x8_t _w5 = vld1q_f16(kptr + k * 64 + 40);
                                    float16x8_t _w6 = vld1q_f16(kptr + k * 64 + 48);
                                    float16x8_t _w7 = vld1q_f16(kptr + k * 64 + 56);

                                    _sum = vfmaq_laneq_f16(_sum, _w0, _val, 0);
                                    _sum = vfmaq_laneq_f16(_sum, _w1, _val, 1);
                                    _sum = vfmaq_laneq_f16(_sum, _w2, _val, 2);
                                    _sum = vfmaq_laneq_f16(_sum, _w3, _val, 3);
                                    _sum = vfmaq_laneq_f16(_sum, _w4, _val, 4);
                                    _sum = vfmaq_laneq_f16(_sum, _w5, _val, 5);
                                    _sum = vfmaq_laneq_f16(_sum, _w6, _val, 6);
                                    _sum = vfmaq_laneq_f16(_sum, _w7, _val, 7);
                                }
                            }

                            kptr += maxk * 64;
                        }

                        _sum = activation_ps_f16(_sum, activation_type, activation_params);

                        vst1q_f16(outptr + j * 8, _sum);
                    }

                    outptr += outw * 8;
                }
            }
        }

        if (elempack == 1 && out_elempack == 8)
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < out_channels; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f16((const __fp16*)bias_data_fp16 + p * 8);
                        }

                        const __fp16* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const __fp16* sptr = m.row<const __fp16>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    float16x8_t _val = vdupq_n_f16(sptr[sx]);

                                    int k = y * kernel_w + x;

                                    float16x8_t _w = vld1q_f16(kptr + k * 8);

                                    _sum = vfmaq_f16(_sum, _val, _w);
                                }
                            }

                            kptr += maxk * 8;
                        }

                        _sum = activation_ps_f16(_sum, activation_type, activation_params);

                        vst1q_f16(outptr + j * 8, _sum);
                    }

                    outptr += outw * 8;
                }
            }
        }

        if (elempack == 4 && out_elempack == 8)
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < out_channels; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f16((const __fp16*)bias_data_fp16 + p * 8);
                        }

                        const __fp16* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    float16x4_t _val = vld1_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x8_t _w0 = vld1q_f16(kptr + k * 32);
                                    float16x8_t _w1 = vld1q_f16(kptr + k * 32 + 8);
                                    float16x8_t _w2 = vld1q_f16(kptr + k * 32 + 16);
                                    float16x8_t _w3 = vld1q_f16(kptr + k * 32 + 24);

                                    _sum = vfmaq_lane_f16(_sum, _w0, _val, 0);
                                    _sum = vfmaq_lane_f16(_sum, _w1, _val, 1);
                                    _sum = vfmaq_lane_f16(_sum, _w2, _val, 2);
                                    _sum = vfmaq_lane_f16(_sum, _w3, _val, 3);
                                }
                            }

                            kptr += maxk * 32;
                        }

                        _sum = activation_ps_f16(_sum, activation_type, activation_params);

                        vst1q_f16(outptr + j * 8, _sum);
                    }

                    outptr += outw * 8;
                }
            }
        }

        if (elempack == 8 && out_elempack == 1)
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < out_channels; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[p];
                        }

                        const __fp16* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 8;

                                    float16x8_t _val = vld1q_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x8_t _w = vld1q_f16(kptr + k * 8);

                                    float16x8_t _s8 = vmulq_f16(_val, _w);

                                    float16x4_t _s4 = vadd_f16(vget_low_f16(_s8), vget_high_f16(_s8));
                                    sum += vaddvq_f32(vcvt_f32_f16(_s4)); // dot
                                }
                            }

                            kptr += maxk * 8;
                        }

                        sum = activation_ss_f16(sum, activation_type, activation_params);

                        outptr[j] = (__fp16)sum;
                    }

                    outptr += outw;
                }
            }
        }

        if (elempack == 8 && out_elempack == 4)
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < out_channels; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                        if (bias_term)
                        {
                            _sum = vld1_f16((const __fp16*)bias_data_fp16 + p * 4);
                        }

                        const __fp16* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 8;

                                    float16x8_t _val = vld1q_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x4_t _w0 = vld1_f16(kptr + k * 32);
                                    float16x4_t _w1 = vld1_f16(kptr + k * 32 + 4);
                                    float16x4_t _w2 = vld1_f16(kptr + k * 32 + 8);
                                    float16x4_t _w3 = vld1_f16(kptr + k * 32 + 12);
                                    float16x4_t _w4 = vld1_f16(kptr + k * 32 + 16);
                                    float16x4_t _w5 = vld1_f16(kptr + k * 32 + 20);
                                    float16x4_t _w6 = vld1_f16(kptr + k * 32 + 24);
                                    float16x4_t _w7 = vld1_f16(kptr + k * 32 + 28);

                                    _sum = vfma_laneq_f16(_sum, _w0, _val, 0);
                                    _sum = vfma_laneq_f16(_sum, _w1, _val, 1);
                                    _sum = vfma_laneq_f16(_sum, _w2, _val, 2);
                                    _sum = vfma_laneq_f16(_sum, _w3, _val, 3);
                                    _sum = vfma_laneq_f16(_sum, _w4, _val, 4);
                                    _sum = vfma_laneq_f16(_sum, _w5, _val, 5);
                                    _sum = vfma_laneq_f16(_sum, _w6, _val, 6);
                                    _sum = vfma_laneq_f16(_sum, _w7, _val, 7);
                                }
                            }

                            kptr += maxk * 32;
                        }

                        _sum = activation_ps_f16(_sum, activation_type, activation_params);

                        vst1_f16(outptr + j * 4, _sum);
                    }

                    outptr += outw * 4;
                }
            }
        }

        if (elempack == 4 && out_elempack == 4)
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < out_channels; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                        if (bias_term)
                        {
                            _sum = vld1_f16((const __fp16*)bias_data_fp16 + p * 4);
                        }

                        const __fp16* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    float16x4_t _val = vld1_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x4_t _w0 = vld1_f16(kptr + k * 16);
                                    float16x4_t _w1 = vld1_f16(kptr + k * 16 + 4);
                                    float16x4_t _w2 = vld1_f16(kptr + k * 16 + 8);
                                    float16x4_t _w3 = vld1_f16(kptr + k * 16 + 12);

                                    _sum = vfma_lane_f16(_sum, _w0, _val, 0);
                                    _sum = vfma_lane_f16(_sum, _w1, _val, 1);
                                    _sum = vfma_lane_f16(_sum, _w2, _val, 2);
                                    _sum = vfma_lane_f16(_sum, _w3, _val, 3);
                                }
                            }

                            kptr += maxk * 16;
                        }

                        _sum = activation_ps_f16(_sum, activation_type, activation_params);

                        vst1_f16(outptr + j * 4, _sum);
                    }

                    outptr += outw * 4;
                }
            }
        }

        if (elempack == 1 && out_elempack == 4)
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < out_channels; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                        if (bias_term)
                        {
                            _sum = vld1_f16((const __fp16*)bias_data_fp16 + p * 4);
                        }

                        const __fp16* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const __fp16* sptr = m.row<const __fp16>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    float16x4_t _val = vdup_n_f16(sptr[sx]);

                                    int k = y * kernel_w + x;

                                    float16x4_t _w = vld1_f16(kptr + k * 4);

                                    _sum = vfma_f16(_sum, _val, _w);
                                }
                            }

                            kptr += maxk * 4;
                        }

                        _sum = activation_ps_f16(_sum, activation_type, activation_params);

                        vst1_f16(outptr + j * 4, _sum);
                    }

                    outptr += outw * 4;
                }
            }
        }

        if (elempack == 4 && out_elempack == 1)
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < out_channels; p++)
            {
                __fp16* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[p];
                        }

                        const __fp16* kptr = weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    float16x4_t _val = vld1_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x4_t _w = vld1_f16(kptr + k * 4);

                                    float16x4_t _s4 = vmul_f16(_val, _w);

                                    sum += vaddvq_f32(vcvt_f32_f16(_s4)); // dot
                                }
                            }

                            kptr += maxk * 4;
                        }

                        sum = activation_ss_f16(sum, activation_type, activation_params);

                        outptr[j] = (__fp16)sum;
                    }

                    outptr += outw;
                }
            }
        }

        if (elempack == 1 && out_elempack == 1)
        {
            if (kernel_w == 4 && kernel_h == 4 && stride_w == 2 && stride_h == 2 && dilation_w == 1 && dilation_h == 1)
            {
                deconv4x4s2_fp16sa_neon(bottom_blob, top_blob_bordered, weight_data_tm, bias_data_fp16, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob_bordered, opt);
                }
            }
            else
            {
                // num_output
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int p = 0; p < num_output; p++)
                {
                    __fp16* outptr = top_blob_bordered.channel(p);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float sum = 0.f;

                            if (bias_term)
                            {
                                sum = bias_data[p];
                            }

                            const __fp16* kptr = weight_data_tm.channel(p);

                            // channels
                            for (int q = 0; q < channels; q++)
                            {
                                const Mat m = bottom_blob.channel(q);

                                for (int y = 0; y < kernel_h; y++)
                                {
                                    int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                    if (sys < 0 || sys % stride_h != 0)
                                        continue;

                                    int sy = sys / stride_h;
                                    if (sy >= h)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy);

                                    for (int x = 0; x < kernel_w; x++)
                                    {
                                        int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                        if (sxs < 0 || sxs % stride_w != 0)
                                            continue;

                                        int sx = sxs / stride_w;
                                        if (sx >= w)
                                            continue;

                                        __fp16 val = sptr[sx];

                                        int k = y * kernel_w + x;

                                        __fp16 w = kptr[k];

                                        sum += val * w;
                                    }
                                }

                                kptr += maxk;
                            }

                            sum = activation_ss_f16(sum, activation_type, activation_params);

                            outptr[j] = (__fp16)sum;
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/deconvolutiondepthwise_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "deconvolutiondepthwise_arm.h"

#include "layer_type.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

#include "cpu.h"

namespace ncnn {

// Declares which storage types / layouts this ARM implementation accepts.
// Every flag is gated by the corresponding build-time feature macro.
DeconvolutionDepthWise_arm::DeconvolutionDepthWise_arm()
{
#if NCNN_BF16
    // bf16 storage is advertised whenever bf16 support is compiled in
    support_bf16_storage = true;
#endif // NCNN_BF16

#if __ARM_NEON
    // packed (elempack > 1) blobs are only accepted in NEON builds
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage depends on the result of the runtime CPU feature query
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82
}

int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
{
    if (dynamic_weight)
        return 0;

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage)
    {
        return create_pipeline_fp16s(opt);
    }
#endif

    // create Deconvolution op for each group
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // depth-wise
    if (channels == group && group == num_output)
    {
        int elempack = 1;
#if __ARM_NEON
        if (opt.use_packing_layout)
        {
            elempack = channels % 4 == 0 ? 4 : 1;
        }
#endif

        Mat weight_data_transposed(weight_data.w);
        {
            float* pt = weight_data_transposed;
            const float* p = weight_data;

            for (int i = 0; i < (channels / group) * (num_output / group) * group; i++)
            {
                for (int k = 0; k < maxk; k++)
                {
                    pt[maxk - 1 - k] = p[k];
                }

                p += maxk;
                pt += maxk;
            }
        }

#if NCNN_BF16
        if (opt.use_bf16_storage)
        {
#if __ARM_NEON
            if (elempack == 4)
            {
                Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
                Mat weight_data_r2_packed;
                convert_packing(weight_data_r2, weight_data_r2_packed, 4, opt);

                ncnn::cast_float32_to_bfloat16(weight_data_r2_packed, weight_data_tm, opt);
            }
#endif // __ARM_NEON

            if (elempack == 1)
            {
                ncnn::cast_float32_to_bfloat16(weight_data_transposed, weight_data_tm, opt);
            }

            if (opt.lightmode)
                weight_data.release();

            return 0;
        }
#endif // NCNN_BF16

#if __ARM_NEON
        // pack4
        if (elempack == 4)
        {
            Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_tm, 4, opt);
        }
#endif // __ARM_NEON

        // pack1
        if (elempack == 1)
        {
            weight_data_tm = weight_data_transposed;
        }
    }
    else
    {
        // group deconvolution
        for (int i = 0; i < (int)group_ops.size(); i++)
            delete group_ops[i];

        group_ops.clear();

        const int channels_g = channels / group;
        const int num_output_g = num_output / group;

        group_ops.resize(group);

        for (int g = 0; g < group; g++)
        {
            Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone();
            Mat bias_data_g;
            if (bias_term)
                bias_data_g = bias_data.range(num_output_g * g, num_output_g);

            ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);

            // set param
            ncnn::ParamDict pd;
            pd.set(0, num_output_g); // num_output
            pd.set(1, kernel_w);
            pd.set(11, kernel_h);
            pd.set(2, dilation_w);
            pd.set(12, dilation_h);
            pd.set(3, stride_w);
            pd.set(13, stride_h);
            pd.set(4, 0);  // pad_w
            pd.set(14, 0); // pad_h
            pd.set(18, output_pad_right);
            pd.set(19, output_pad_bottom);
            pd.set(5, bias_term);
            pd.set(6, maxk * channels_g * num_output_g); // weight_data_size
            pd.set(9, activation_type);
            pd.set(10, activation_params);

            op->load_param(pd);

            // set weights
            if (bias_term)
            {
                ncnn::Mat weights[2];
                weights[0] = weight_data_g;
                weights[1] = bias_data_g;

                op->load_model(ModelBinFromMatArray(weights));
            }
            else
            {
                ncnn::Mat weights[1];
                weights[0] = weight_data_g;

                op->load_model(ModelBinFromMatArray(weights));
            }

            op->create_pipeline(opt);

            group_ops[g] = op;
        }
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int DeconvolutionDepthWise_arm::destroy_pipeline(const Option& opt)
{
    for (int i = 0; i < (int)group_ops.size(); i++)
    {
        group_ops[i]->destroy_pipeline(opt);
        delete group_ops[i];
    }
    group_ops.clear();

    return 0;
}

// Single-input forward pass of the depth-wise / grouped deconvolution.
//
// 16-bit inputs are handed to the fp16 or bf16 variants when the matching
// option is enabled; everything below is the fp32 path.
//
// The result is first computed into top_blob_bordered (the un-cropped output)
// and then passed through cut_padding() to produce top_blob.
//
// Returns 0 on success, -100 when an allocation fails, or the non-zero code
// returned by a per-group sub-layer.
int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);
        else
            return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
#endif

    // deconvolve with a kernel_w x kernel_h kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // spatial extent covered by the dilated kernel
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    // size of the un-cropped output
    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        // padding or an explicit output size is configured: the un-cropped
        // result is a temporary, so take it from the workspace allocator
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        // nothing configured to crop: allocate from the blob allocator,
        // starting from a copy of the top_blob handle
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
            // each packed channel holds 4 groups; weights are read with the
            // same 4-wide interleave (maxk * 4 floats per packed channel)
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                float* outptr = top_blob_bordered.channel(g);
                const float* kptr = (const float*)weight_data_tm + maxk * g * 4;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32((const float*)bias_data + g * 4);
                        }

                        // gather form: for output pixel (i, j) visit every
                        // kernel tap and find the input pixel feeding it
                        for (int y = 0; y < kernel_h; y++)
                        {
                            // skip taps that land before the input or between
                            // stride positions
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                const float* sptr = m.row(sy) + sx * 4;

                                float32x4_t _val = vld1q_f32(sptr);

                                int k = y * kernel_w + x;

                                float32x4_t _w = vld1q_f32(kptr + k * 4);

                                _sum = vmlaq_f32(_sum, _val, _w);
                            }
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1q_f32(outptr + j * 4, _sum);
                    }

                    outptr += outw * 4;
                }
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            // scalar version of the same gather loop, one group per channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                float* outptr = top_blob_bordered.channel(g);
                const float* kptr = (const float*)weight_data_tm + maxk * g;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[g];
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            const float* sptr = m.row(sy);

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                float val = sptr[sx];

                                int k = y * kernel_w + x;

                                float w = kptr[k];

                                sum += val * w;
                            }
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = sum;
                    }

                    outptr += outw;
                }
            }
        }
    }
    else
    {
        // group deconvolution: each group is processed by its own sub-layer
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        // packing usable for a single group's input / output channel ranges
        int g_elempack = 1;
        int out_g_elempack = 1;
#if __ARM_NEON
        if (opt.use_packing_layout)
        {
            g_elempack = channels_g % 4 == 0 ? 4 : 1;
            out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
        }
#endif

        // unpacking: a pack4 input cannot be sliced per group when the
        // per-group channel count is not a multiple of 4
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack == 4 && g_elempack == 1)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
            if (bottom_blob_unpacked.empty())
                return -100;
        }

        // same for the output: groups write into a pack1 scratch blob when
        // their output channel count is not a multiple of 4
        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            // channel ranges belonging to group g (in packed-channel units)
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            // NOTE(review): the sub-op is given the allocator of the
            // pre-allocated output, presumably so that its own create() call
            // keeps the existing channel-range buffer; confirm against Mat::create
            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
            if (ret != 0)
                return ret;
        }

        // packing: convert the pack1 scratch result back to pack4 if needed
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
            if (top_blob_bordered.empty())
                return -100;
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    // produce the final top_blob from the un-cropped result
    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

int DeconvolutionDepthWise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _num_input = bottom_blob.c * bottom_blob.elempack;
    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.d * group;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

#if NCNN_ARM82
    if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && weight_data_flattened.elembits() == 16)
    {
        Mat weight_data_flattened_fp32;
        cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
        weight_data_flattened = weight_data_flattened_fp32;
    }
#endif // NCNN_ARM82
#if NCNN_BF16
    if (opt.use_bf16_storage && weight_data_flattened.elembits() == 16)
    {
        Mat weight_data_flattened_fp32;
        cast_bfloat16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
        weight_data_flattened = weight_data_flattened_fp32;
    }
#endif // NCNN_BF16

    // weight_data_flattened as pack1
    weight_data_flattened.w *= weight_data_flattened.elempack;
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
    weight_data_flattened.elempack = 1;

    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
    Mat weight_data_transposed;
    {
        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / group, 4u, opt.workspace_allocator);
        if (weight_data_transposed.empty())
            return -100;

        const int outch_g = _num_output / group;
        const int inch_g = _num_input / group;
        const int maxk = _kernel_h * _kernel_w;

        for (int g = 0; g < group; g++)
        {
            // reorder weight from inch-outch to outch-inch
            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
            for (int i = 0; i < outch_g; i++)
            {
                for (int j = 0; j < inch_g; j++)
                {
                    for (int k = 0; k < maxk; k++)
                    {
                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
                    }
                }
            }
        }
    }

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;

#if NCNN_ARM82
        if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && bias_data_flattened.elembits() == 16)
        {
            Mat bias_data_flattened_fp32;
            cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
            bias_data_flattened = bias_data_flattened_fp32;
        }
#endif // NCNN_ARM82
#if NCNN_BF16
        if (opt.use_bf16_storage && bias_data_flattened.elembits() == 16)
        {
            Mat bias_data_flattened_fp32;
            cast_bfloat16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
            bias_data_flattened = bias_data_flattened_fp32;
        }
#endif // NCNN_BF16

        // bias_data_flattened as pack1
        bias_data_flattened.w *= bias_data_flattened.elempack;
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
        bias_data_flattened.elempack = 1;
    }

    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise);

    ncnn::ParamDict pd;
    pd.set(0, _num_output);
    pd.set(1, _kernel_w);
    pd.set(11, _kernel_h);
    pd.set(2, dilation_w);
    pd.set(12, dilation_h);
    pd.set(3, stride_w);
    pd.set(13, stride_h);
    pd.set(4, pad_left);
    pd.set(15, pad_right);
    pd.set(14, pad_top);
    pd.set(16, pad_bottom);
    pd.set(18, output_pad_right);
    pd.set(19, output_pad_bottom);
    pd.set(20, output_w);
    pd.set(21, output_h);
    pd.set(5, bias_term);
    pd.set(6, weight_data_transposed.w);
    pd.set(7, group);
    pd.set(9, activation_type);
    pd.set(10, activation_params);

    op->load_param(pd);

    ncnn::Mat weights[2];
    weights[0] = weight_data_transposed;
    weights[1] = bias_data_flattened;

    op->load_model(ncnn::ModelBinFromMatArray(weights));

    op->create_pipeline(opt);

    op->forward(bottom_blob, top_blob, opt);

    op->destroy_pipeline(opt);

    delete op;

    return 0;
}

#if NCNN_BF16
int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                unsigned short* outptr = top_blob_bordered.channel(g);
                const unsigned short* kptr = (const unsigned short*)weight_data_tm + maxk * g * 4;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float32x4_t _sum = vdupq_n_f32(0.f);

                        if (bias_term)
                        {
                            _sum = vld1q_f32((const float*)bias_data + g * 4);
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                const unsigned short* sptr = m.row<const unsigned short>(sy) + sx * 4;

                                float32x4_t _val = bfloat2float(vld1_u16(sptr));

                                int k = y * kernel_w + x;

                                float32x4_t _w = bfloat2float(vld1_u16(kptr + k * 4));

                                _sum = vmlaq_f32(_sum, _val, _w);
                            }
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        vst1_u16(outptr + j * 4, float2bfloat(_sum));
                    }

                    outptr += outw * 4;
                }
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                unsigned short* outptr = top_blob_bordered.channel(g);
                const unsigned short* kptr = (const unsigned short*)weight_data_tm + maxk * g;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[g];
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            const unsigned short* sptr = m.row<const unsigned short>(sy);

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                float val = bfloat16_to_float32(sptr[sx]);

                                int k = y * kernel_w + x;

                                float w = bfloat16_to_float32(kptr[k]);

                                sum += val * w;
                            }
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = float32_to_bfloat16(sum);
                    }

                    outptr += outw;
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = 1;
        int out_g_elempack = 1;
#if __ARM_NEON
        if (opt.use_packing_layout)
        {
            g_elempack = channels_g % 4 == 0 ? 4 : 1;
            out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
        }
#endif

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack == 4 && g_elempack == 1)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
            if (bottom_blob_unpacked.empty())
                return -100;
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
            if (ret != 0)
                return ret;
        }

        // packing
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
            if (top_blob_bordered.empty())
                return -100;
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/deconvolutiondepthwise_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DECONVOLUTIONDEPTHWISE_ARM_H
#define LAYER_DECONVOLUTIONDEPTHWISE_ARM_H

#include "deconvolutiondepthwise.h"

namespace ncnn {

// ARM-specific implementation of DeconvolutionDepthWise.
// The fp16 entry points are only declared when NCNN_ARM82 is enabled and the
// bf16 entry point only when NCNN_BF16 is enabled.
class DeconvolutionDepthWise_arm : public DeconvolutionDepthWise
{
public:
    DeconvolutionDepthWise_arm();

    // Prepare / release the per-layer state (group_ops, weight_data_tm, ...).
    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // Forward using the weights loaded into the layer.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Forward with weights taken from bottom_blobs[1] and, when bias_term is
    // set, bias taken from bottom_blobs[2].
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 weight preparation and the two fp16 forward variants
    // (presumably "s" = fp16 storage, "sa" = fp16 storage + arithmetic — confirm against callers).
    int create_pipeline_fp16s(const Option& opt);
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // forward variant that reads and writes bfloat16 data
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif

public:
    // One sub-layer per group; the group (non depth-wise) forward paths call
    // group_ops[g]->forward() for every group g.
    std::vector<ncnn::Layer*> group_ops;

    // Weights in the layout consumed by the depth-wise forward kernels
    // (filled by the create_pipeline* functions).
    Mat weight_data_tm;

    // fp16
    // fp16 conversion of bias_data, produced by create_pipeline_fp16s
    Mat bias_data_fp16;
};

} // namespace ncnn

#endif // LAYER_DECONVOLUTIONDEPTHWISE_ARM_H


================================================
FILE: src/layer/arm/deconvolutiondepthwise_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "deconvolutiondepthwise_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int DeconvolutionDepthWise_arm::create_pipeline_fp16s(const Option& opt)
{
    // create Deconvolution op for each group
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // depth-wise
    if (channels == group && group == num_output)
    {
        Mat weight_data_transposed(weight_data.w);
        {
            float* pt = weight_data_transposed;
            const float* p = weight_data;

            for (int i = 0; i < (channels / group) * (num_output / group) * group; i++)
            {
                for (int k = 0; k < maxk; k++)
                {
                    pt[maxk - 1 - k] = p[k];
                }

                p += maxk;
                pt += maxk;
            }
        }

        int elempack = 1;
        if (opt.use_packing_layout)
        {
            elempack = opt.use_fp16_arithmetic && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
        }

        if (elempack == 8)
        {
            Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
            Mat weight_data_r2_packed;
            convert_packing(weight_data_r2, weight_data_r2_packed, 8, opt);

            ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt);
        }

        if (elempack == 4)
        {
            Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
            Mat weight_data_r2_packed;
            convert_packing(weight_data_r2, weight_data_r2_packed, 4, opt);

            ncnn::cast_float32_to_float16(weight_data_r2_packed, weight_data_tm, opt);
        }

        if (elempack == 1)
        {
            ncnn::cast_float32_to_float16(weight_data_transposed, weight_data_tm, opt);
        }

        ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
    }
    else
    {
        // group deconvolution
        for (int i = 0; i < (int)group_ops.size(); i++)
            delete group_ops[i];

        group_ops.clear();

        const int channels_g = channels / group;
        const int num_output_g = num_output / group;

        group_ops.resize(group);

        for (int g = 0; g < group; g++)
        {
            Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone();
            Mat bias_data_g;
            if (bias_term)
                bias_data_g = bias_data.range(num_output_g * g, num_output_g);

            ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);

            // set param
            ncnn::ParamDict pd;
            pd.set(0, num_output_g); // num_output
            pd.set(1, kernel_w);
            pd.set(11, kernel_h);
            pd.set(2, dilation_w);
            pd.set(12, dilation_h);
            pd.set(3, stride_w);
            pd.set(13, stride_h);
            pd.set(4, 0);  // pad_w
            pd.set(14, 0); // pad_h
            pd.set(18, output_pad_right);
            pd.set(19, output_pad_bottom);
            pd.set(5, bias_term);
            pd.set(6, maxk * channels_g * num_output_g); // weight_data_size
            pd.set(9, activation_type);
            pd.set(10, activation_params);

            op->load_param(pd);

            // set weights
            if (bias_term)
            {
                ncnn::Mat weights[2];
                weights[0] = weight_data_g;
                weights[1] = bias_data_g;

                op->load_model(ModelBinFromMatArray(weights));
            }
            else
            {
                ncnn::Mat weights[1];
                weights[0] = weight_data_g;

                op->load_model(ModelBinFromMatArray(weights));
            }

            op->create_pipeline(opt);

            group_ops[g] = op;
        }
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int DeconvolutionDepthWise_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
    int out_elempack = (opt.use_packing_layout && num_output % 4 == 0) ? 4 : 1;
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
        if (elempack == 4)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * 4;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float32x4_t _sum = vdupq_n_f32(0.f);

                            if (bias_term)
                            {
                                _sum = vld1q_f32((const float*)bias_data + g * 4);
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr));

                                    int k = y * kernel_w + x;

                                    float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr + k * 4));

                                    _sum = vfmaq_f32(_sum, _val, _w);
                                }
                            }

                            _sum = activation_ps(_sum, activation_type, activation_params);

                            vst1_f16(outptr + j * 4, vcvt_f16_f32(_sum));
                        }

                        outptr += outw * 4;
                    }
                }
            }
        }

        if (elempack == 1)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float sum = 0.f;

                            if (bias_term)
                            {
                                sum = bias_data[g];
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const __fp16* sptr = m.row<const __fp16>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    float val = (float)sptr[sx];

                                    int k = y * kernel_w + x;

                                    float w = (float)kptr[k];

                                    sum += val * w;
                                }
                            }

                            sum = activation_ss(sum, activation_type, activation_params);

                            outptr[j] = (__fp16)sum;
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }
    else
    {
        // group deconvolution
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = (opt.use_packing_layout && channels_g % 4 == 0) ? 4 : 1;
        int out_g_elempack = (opt.use_packing_layout && num_output_g % 4 == 0) ? 4 : 1;

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack == 4 && g_elempack == 1)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
            if (bottom_blob_unpacked.empty())
                return -100;
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
            if (ret != 0)
                return ret;
        }

        // packing
        if (out_g_elempack == 1 && out_elempack == 4)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
            if (top_blob_bordered.empty())
                return -100;
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

// Deconvolution (depth-wise or grouped) forward pass operating on __fp16 data
// with fp16 vector arithmetic.
//
// bottom_blob  input blob; its elements are read as __fp16 with elempack 8, 4 or 1
// top_blob     output blob, produced by cut_padding() from the un-cropped result
// opt          supplies allocators, thread count and packing/fp16 switches
//
// Returns 0 on success, -100 when an allocation yields an empty Mat, or the
// non-zero code returned by a per-group sub-layer.
int DeconvolutionDepthWise_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // effective kernel footprint once dilation is taken into account
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    // size of the full (not yet cropped) deconvolution output
    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
    // output packing: 8 only with fp16 arithmetic enabled, otherwise 4, otherwise scalar
    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    }
    size_t out_elemsize = elemsize / elempack * out_elempack;

    // When padding or an explicit output size is configured the intermediate
    // result is allocated from the workspace allocator and handed to
    // cut_padding() at the end; otherwise it shares top_blob and is allocated
    // from the blob allocator.
    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    // number of kernel taps per channel
    const int maxk = kernel_w * kernel_h;

    // depth-wise: one input channel per group and one output channel per group
    if (channels * elempack == group && group == num_output)
    {
        // 8 channels per packed element, processed as float16x8_t
        if (elempack == 8)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    // weights for packed channel g: maxk taps of 8 lanes each
                    const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * 8;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                            if (bias_term)
                            {
                                _sum = vld1q_f16((const __fp16*)bias_data_fp16 + g * 8);
                            }

                            // Gather form: for output pixel (i, j) visit every kernel
                            // tap and keep only those that map onto an existing input
                            // pixel (non-negative, stride-aligned, inside the input).
                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 8;

                                    float16x8_t _val = vld1q_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x8_t _w = vld1q_f16(kptr + k * 8);

                                    _sum = vfmaq_f16(_sum, _val, _w);
                                }
                            }

                            _sum = activation_ps_f16(_sum, activation_type, activation_params);

                            vst1q_f16(outptr + j * 8, _sum);
                        }

                        // advance to the next output row
                        outptr += outw * 8;
                    }
                }
            }
        }

        // 4 channels per packed element, processed as float16x4_t
        if (elempack == 4)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    // weights for packed channel g: maxk taps of 4 lanes each
                    const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g * 4;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                            if (bias_term)
                            {
                                _sum = vld1_f16((const __fp16*)bias_data_fp16 + g * 4);
                            }

                            // same tap selection as the elempack == 8 kernel
                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    const __fp16* sptr = m.row<const __fp16>(sy) + sx * 4;

                                    float16x4_t _val = vld1_f16(sptr);

                                    int k = y * kernel_w + x;

                                    float16x4_t _w = vld1_f16(kptr + k * 4);

                                    _sum = vfma_f16(_sum, _val, _w);
                                }
                            }

                            _sum = activation_ps_f16(_sum, activation_type, activation_params);

                            vst1_f16(outptr + j * 4, _sum);
                        }

                        // advance to the next output row
                        outptr += outw * 4;
                    }
                }
            }
        }

        // scalar layout: one channel per element
        if (elempack == 1)
        {
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    __fp16* outptr = top_blob_bordered.channel(g);
                    const __fp16* kptr = (const __fp16*)weight_data_tm + maxk * g;
                    const Mat m = bottom_blob.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            // the scalar kernel accumulates in a float variable and
                            // takes its bias from bias_data, not bias_data_fp16
                            float sum = 0.f;

                            if (bias_term)
                            {
                                sum = bias_data[g];
                            }

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const __fp16* sptr = m.row<const __fp16>(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    __fp16 val = sptr[sx];

                                    int k = y * kernel_w + x;

                                    // note: this local shadows the input width `w` for
                                    // the remainder of the innermost loop body
                                    __fp16 w = kptr[k];

                                    sum += val * w;
                                }
                            }

                            sum = activation_ss_f16(sum, activation_type, activation_params);

                            outptr[j] = (__fp16)sum;
                        }

                        // advance to the next output row
                        outptr += outw;
                    }
                }
            }
        }
    }
    else
    {
        // group deconvolution: each group is delegated to its own sub-layer
        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        // packing that a single group's input / output channel count can sustain
        int g_elempack = 1;
        int out_g_elempack = 1;
        if (opt.use_packing_layout)
        {
            g_elempack = opt.use_fp16_arithmetic && channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
            out_g_elempack = opt.use_fp16_arithmetic && num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
        }

        // unpacking: repack the input into the workspace when it is packed
        // wider than one group allows, so channel_range() can slice per group
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack > g_elempack)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, g_elempack, opt_p);
            if (bottom_blob_unpacked.empty())
                return -100;
        }

        // likewise use a narrower-packed scratch output when a group's output
        // cannot be written with out_elempack
        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack < out_elempack)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        // groups are processed one after another; this loop itself is not parallelized
        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            // hand the sub-layer the allocator that owns the destination Mat
            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward
            int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
            if (ret != 0)
                return ret;
        }

        // packing: convert the scratch output back to out_elempack if it was narrowed
        if (out_g_elempack < out_elempack)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, out_elempack, opt);
            if (top_blob_bordered.empty())
                return -100;
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    // cut_padding() is defined outside this file section; presumably it crops
    // the pad_* borders / applies output_w, output_h — confirm against its definition
    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/dequantize_arm.cpp
================================================
// Copyright 2018 Tencent
// Copyright 2019 BUG1989
// SPDX-License-Identifier: BSD-3-Clause

#include "dequantize_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

// Declare which storage formats and layouts this ARM implementation accepts.
Dequantize_arm::Dequantize_arm()
{
#if NCNN_BF16
    // bf16 output is handled by forward_bf16s()
    support_bf16_storage = true;
#endif // NCNN_BF16

#if __ARM_NEON
    // packed layouts are only enabled for NEON builds
    support_packing = true;
#if NCNN_ARM82
    // fp16 storage is enabled according to cpu_support_arm_asimdhp()
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // NCNN_ARM82
#endif // __ARM_NEON
}

// Convert a run of int32 values to float32: out = in * scale, plus bias when
// bias_data is non-empty.
//
// intptr, ptr   source and destination, elemcount * elempack values each
// scale_data    a single scale, or (for elempack 4 / 8) one scale per lane
// bias_data     empty, a single bias, or (for elempack 4 / 8) one bias per lane
// elemcount     number of packed elements to process
// elempack      values per packed element
static void dequantize(const int* intptr, float* ptr, const Mat& scale_data, const Mat& bias_data, int elemcount, int elempack)
{
    const int total = elemcount * elempack;
    const int num_scales = scale_data.w;
    const int num_biases = bias_data.w;

    // NCNN_LOGE("dequantize %d %d   %d %d", scale_data_size, bias_data_size, elemcount, elempack);

    const float scale = scale_data[0];
#if __ARM_NEON
    // _scale_lo covers lanes 0-3 and _scale_hi lanes 4-7 of every 8-value group
    float32x4_t _scale_lo = vdupq_n_f32(scale);
    float32x4_t _scale_hi = _scale_lo;
    if (num_scales > 1 && (elempack == 4 || elempack == 8))
    {
        const float* scales = (const float*)scale_data;
        _scale_lo = vld1q_f32(scales);
        _scale_hi = elempack == 8 ? vld1q_f32(scales + 4) : _scale_lo;
    }
#endif // __ARM_NEON

    int n = 0;

    if (num_biases == 0)
    {
        // scale-only path
#if __ARM_NEON
        for (; n + 7 < total; n += 8)
        {
            float32x4_t _lo = vcvtq_f32_s32(vld1q_s32(intptr + n));
            float32x4_t _hi = vcvtq_f32_s32(vld1q_s32(intptr + n + 4));
            _lo = vmulq_f32(_lo, _scale_lo);
            _hi = vmulq_f32(_hi, _scale_hi);
            vst1q_f32(ptr + n, _lo);
            vst1q_f32(ptr + n + 4, _hi);
        }
        for (; n + 3 < total; n += 4)
        {
            float32x4_t _p = vcvtq_f32_s32(vld1q_s32(intptr + n));
            _p = vmulq_f32(_p, _scale_lo);
            vst1q_f32(ptr + n, _p);
        }
#endif // __ARM_NEON
        for (; n < total; n++)
        {
            ptr[n] = intptr[n] * scale;
        }
        return;
    }

    // scale-and-bias path
    const float bias = bias_data[0];
#if __ARM_NEON
    float32x4_t _bias_lo = vdupq_n_f32(bias);
    float32x4_t _bias_hi = _bias_lo;
    if (num_biases > 1 && (elempack == 4 || elempack == 8))
    {
        const float* biases = (const float*)bias_data;
        _bias_lo = vld1q_f32(biases);
        _bias_hi = elempack == 8 ? vld1q_f32(biases + 4) : _bias_lo;
    }

    for (; n + 7 < total; n += 8)
    {
        float32x4_t _lo = vcvtq_f32_s32(vld1q_s32(intptr + n));
        float32x4_t _hi = vcvtq_f32_s32(vld1q_s32(intptr + n + 4));
#if __aarch64__
        _lo = vfmaq_f32(_bias_lo, _lo, _scale_lo);
        _hi = vfmaq_f32(_bias_hi, _hi, _scale_hi);
#else
        _lo = vmlaq_f32(_bias_lo, _lo, _scale_lo);
        _hi = vmlaq_f32(_bias_hi, _hi, _scale_hi);
#endif
        vst1q_f32(ptr + n, _lo);
        vst1q_f32(ptr + n + 4, _hi);
    }
    for (; n + 3 < total; n += 4)
    {
        float32x4_t _p = vcvtq_f32_s32(vld1q_s32(intptr + n));
#if __aarch64__
        _p = vfmaq_f32(_bias_lo, _p, _scale_lo);
#else
        _p = vmlaq_f32(_bias_lo, _p, _scale_lo);
#endif
        vst1q_f32(ptr + n, _p);
    }
#endif // __ARM_NEON
    for (; n < total; n++)
    {
        ptr[n] = intptr[n] * scale + bias;
    }
}

int Dequantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // assert bottom_blob.elembits() == 32

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage)
    {
        return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage)
        return forward_bf16s(bottom_blob, top_blob, opt);
#endif

    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;

    top_blob.create_like(bottom_blob, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (dims == 1)
    {
        const int wp = std::max(1, w / opt.num_threads);
        const int nn_w = (w + wp - 1) / wp;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_w; ii++)
        {
            const int i = ii * wp;

            const int* intptr = (const int*)bottom_blob + i * elempack;
            float* ptr = (float*)top_blob + i * elempack;

            // assert scale_data_size == 1
            // assert bias_data_size == 0 || bias_data_size == 1

            const int size = std::min(w - i, wp) * elempack;

            dequantize(intptr, ptr, scale_data, bias_data, size, 1);
        }
    }

    if (dims == 2)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            const int* intptr = bottom_blob.row<const int>(i);
            float* ptr = top_blob.row(i);

            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
            const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;

            dequantize(intptr, ptr, scale_data_i, bias_data_i, w, elempack);
        }
    }

    if (dims == 3)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const int* intptr = bottom_blob.channel(q);
            float* ptr = top_blob.channel(q);

            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
            const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;

            dequantize(intptr, ptr, scale_data_q, bias_data_q, w * h, elempack);
        }
    }

    return 0;
}

#if NCNN_BF16
// Convert a run of int32 values to bfloat16 (stored as unsigned short):
// out = bf16(in * scale), plus bias when bias_data is non-empty.
//
// intptr, ptr   source and destination, elemcount * elempack values each
// scale_data    a single scale, or (for elempack 4 / 8) one scale per lane
// bias_data     empty, a single bias, or (for elempack 4 / 8) one bias per lane
// elemcount     number of packed elements to process
// elempack      values per packed element
static void dequantize_bf16s(const int* intptr, unsigned short* ptr, const Mat& scale_data, const Mat& bias_data, int elemcount, int elempack)
{
    const int total = elemcount * elempack;
    const int num_scales = scale_data.w;
    const int num_biases = bias_data.w;

    // NCNN_LOGE("dequantize_bf16s %d %d   %d %d", scale_data_size, bias_data_size, elemcount, elempack);

    const float scale = scale_data[0];
#if __ARM_NEON
    // _scale_lo covers lanes 0-3 and _scale_hi lanes 4-7 of every 8-value group
    float32x4_t _scale_lo = vdupq_n_f32(scale);
    float32x4_t _scale_hi = _scale_lo;
    if (num_scales > 1 && (elempack == 4 || elempack == 8))
    {
        const float* scales = (const float*)scale_data;
        _scale_lo = vld1q_f32(scales);
        _scale_hi = elempack == 8 ? vld1q_f32(scales + 4) : _scale_lo;
    }
#endif // __ARM_NEON

    int n = 0;

    if (num_biases == 0)
    {
        // scale-only path
#if __ARM_NEON
        for (; n + 7 < total; n += 8)
        {
            float32x4_t _lo = vcvtq_f32_s32(vld1q_s32(intptr + n));
            float32x4_t _hi = vcvtq_f32_s32(vld1q_s32(intptr + n + 4));
            _lo = vmulq_f32(_lo, _scale_lo);
            _hi = vmulq_f32(_hi, _scale_hi);
            vst1q_u16(ptr + n, vcombine_u16(float2bfloat(_lo), float2bfloat(_hi)));
        }
        for (; n + 3 < total; n += 4)
        {
            float32x4_t _p = vcvtq_f32_s32(vld1q_s32(intptr + n));
            _p = vmulq_f32(_p, _scale_lo);
            vst1_u16(ptr + n, float2bfloat(_p));
        }
#endif // __ARM_NEON
        for (; n < total; n++)
        {
            ptr[n] = float32_to_bfloat16(intptr[n] * scale);
        }
        return;
    }

    // scale-and-bias path
    const float bias = bias_data[0];
#if __ARM_NEON
    float32x4_t _bias_lo = vdupq_n_f32(bias);
    float32x4_t _bias_hi = _bias_lo;
    if (num_biases > 1 && (elempack == 4 || elempack == 8))
    {
        const float* biases = (const float*)bias_data;
        _bias_lo = vld1q_f32(biases);
        _bias_hi = elempack == 8 ? vld1q_f32(biases + 4) : _bias_lo;
    }

    for (; n + 7 < total; n += 8)
    {
        float32x4_t _lo = vcvtq_f32_s32(vld1q_s32(intptr + n));
        float32x4_t _hi = vcvtq_f32_s32(vld1q_s32(intptr + n + 4));
#if __aarch64__
        _lo = vfmaq_f32(_bias_lo, _lo, _scale_lo);
        _hi = vfmaq_f32(_bias_hi, _hi, _scale_hi);
#else
        _lo = vmlaq_f32(_bias_lo, _lo, _scale_lo);
        _hi = vmlaq_f32(_bias_hi, _hi, _scale_hi);
#endif
        vst1q_u16(ptr + n, vcombine_u16(float2bfloat(_lo), float2bfloat(_hi)));
    }
    for (; n + 3 < total; n += 4)
    {
        float32x4_t _p = vcvtq_f32_s32(vld1q_s32(intptr + n));
#if __aarch64__
        _p = vfmaq_f32(_bias_lo, _p, _scale_lo);
#else
        _p = vmlaq_f32(_bias_lo, _p, _scale_lo);
#endif
        vst1_u16(ptr + n, float2bfloat(_p));
    }
#endif // __ARM_NEON
    for (; n < total; n++)
    {
        ptr[n] = float32_to_bfloat16(intptr[n] * scale + bias);
    }
}

int Dequantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;
    const size_t out_elemsize = elempack * 2u;

    if (dims == 1)
    {
        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int wp = std::max(1, w / opt.num_threads);
        const int nn_w = (w + wp - 1) / wp;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_w; ii++)
        {
            const int i = ii * wp;

            const int* intptr = (const int*)bottom_blob + i * elempack;
            unsigned short* ptr = (unsigned short*)top_blob + i * elempack;

            // assert scale_data_size == 1
            // assert bias_data_size == 0 || bias_data_size == 1

            const int size = std::min(w - i, wp) * elempack;

            dequantize_bf16s(intptr, ptr, scale_data, bias_data, size, 1);
        }
    }

    if (dims == 2)
    {
        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            const int* intptr = bottom_blob.row<const int>(i);
            unsigned short* ptr = top_blob.row<unsigned short>(i);

            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
            const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;

            dequantize_bf16s(intptr, ptr, scale_data_i, bias_data_i, w, elempack);
        }
    }

    if (dims == 3)
    {
        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const int* intptr = bottom_blob.channel(q);
            unsigned short* ptr = top_blob.channel(q);

            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
            const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;

            dequantize_bf16s(intptr, ptr, scale_data_q, bias_data_q, w * h, elempack);
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/dequantize_arm.h
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DEQUANTIZE_ARM_H
#define LAYER_DEQUANTIZE_ARM_H

#include "dequantize.h"

namespace ncnn {

// ARM-specific Dequantize layer. Overrides forward() and adds reduced-precision
// output variants that are compiled in only when the matching feature macro is set.
class Dequantize_arm : public Dequantize
{
public:
    Dequantize_arm();

    // Dequantizes bottom_blob (int32) into top_blob; returns 0 on success.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // Variant that writes __fp16 output.
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // Variant that writes bfloat16 output.
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_DEQUANTIZE_ARM_H


================================================
FILE: src/layer/arm/dequantize_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "dequantize_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
static void dequantize_fp16s(const int* intptr, __fp16* ptr, const Mat& scale_data, const Mat& bias_data, int elemcount, int elempack)
{
    const int scale_data_size = scale_data.w;
    const int bias_data_size = bias_data.w;
    const int size = elemcount * elempack;

    // NCNN_LOGE("dequantize_fp16s %d %d   %d %d", scale_data_size, bias_data_size, elemcount, elempack);

    float scale = scale_data[0];
    float32x4_t _scale0 = vdupq_n_f32(scale);
    float32x4_t _scale1 = _scale0;
    if (scale_data_size > 1)
    {
        if (elempack == 8)
        {
            _scale0 = vld1q_f32((const float*)scale_data);
            _scale1 = vld1q_f32((const float*)scale_data + 4);
        }
        if (elempack == 4)
        {
            _scale0 = vld1q_f32((const float*)scale_data);
            _scale1 = _scale0;
        }
    }

    if (bias_data_size == 0)
    {
        int i = 0;
        for (; i + 7 < size; i += 8)
        {
            float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
            float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
            _v0 = vmulq_f32(_v0, _scale0);
            _v1 = vmulq_f32(_v1, _scale1);
            vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_v0), vcvt_f16_f32(_v1)));
            intptr += 8;
            ptr += 8;
        }
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
            _v = vmulq_f32(_v, _scale0);
            vst1_f16(ptr, vcvt_f16_f32(_v));
            intptr += 4;
            ptr += 4;
        }
        for (; i < size; i++)
        {
            *ptr = (__fp16)(*intptr * scale);
            intptr++;
            ptr++;
        }
    }
    else
    {
        float bias = bias_data[0];
        float32x4_t _bias0 = vdupq_n_f32(bias);
        float32x4_t _bias1 = _bias0;
        if (bias_data_size > 1)
        {
            if (elempack == 8)
            {
                _bias0 = vld1q_f32((const float*)bias_data);
                _bias1 = vld1q_f32((const float*)bias_data + 4);
            }
            if (elempack == 4)
            {
                _bias0 = vld1q_f32((const float*)bias_data);
                _bias1 = _bias0;
            }
        }

        int i = 0;
        for (; i + 7 < size; i += 8)
        {
            float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
            float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
            _v0 = vfmaq_f32(_bias0, _v0, _scale0);
            _v1 = vfmaq_f32(_bias1, _v1, _scale1);
            vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_v0), vcvt_f16_f32(_v1)));
            intptr += 8;
            ptr += 8;
        }
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
            _v = vfmaq_f32(_bias0, _v, _scale0);
            vst1_f16(ptr, vcvt_f16_f32(_v));
            intptr += 4;
            ptr += 4;
        }
        for (; i < size; i++)
        {
            *ptr = (__fp16)(*intptr * scale + bias);
            intptr++;
            ptr++;
        }
    }
}

int Dequantize_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;
    const size_t out_elemsize = elempack * 2u;

    if (dims == 1)
    {
        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int wp = std::max(1, w / opt.num_threads);
        const int nn_w = (w + wp - 1) / wp;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_w; ii++)
        {
            const int i = ii * wp;

            const int* intptr = (const int*)bottom_blob + i * elempack;
            __fp16* ptr = (__fp16*)top_blob + i * elempack;

            // assert scale_data_size == 1
            // assert bias_data_size == 0 || bias_data_size == 1

            const int size = std::min(w - i, wp) * elempack;

            dequantize_fp16s(intptr, ptr, scale_data, bias_data, size, 1);
        }
    }

    if (dims == 2)
    {
        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            const int* intptr = bottom_blob.row<const int>(i);
            __fp16* ptr = top_blob.row<__fp16>(i);

            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
            const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;

            dequantize_fp16s(intptr, ptr, scale_data_i, bias_data_i, w, elempack);
        }
    }

    if (dims == 3)
    {
        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const int* intptr = bottom_blob.channel(q);
            __fp16* ptr = top_blob.channel(q);

            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
            const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;

            dequantize_fp16s(intptr, ptr, scale_data_q, bias_data_q, w * h, elempack);
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/dropout_arm.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "dropout_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

// Enable packed-layout input for this layer when built with NEON.
Dropout_arm::Dropout_arm()
{
#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON
}

// Multiply every float in bottom_top_blob by the layer's `scale`, in place.
//
// bottom_top_blob  blob whose channels are read and overwritten as float
// opt              supplies the OpenMP thread count
//
// Always returns 0.
int Dropout_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    // a scale of exactly 1 leaves the data unchanged, so skip the pass entirely
    if (scale == 1.f)
    {
        return 0;
    }

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    // number of floats in one channel, including the packing factor
    int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        int i = 0;
#if __ARM_NEON
        float32x4_t _scale = vdupq_n_f32(scale);
        // 16 floats per iteration; the asm variants advance ptr themselves via
        // the post-incrementing store, the intrinsics variant does it explicitly
        for (; i + 15 < size; i += 16)
        {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                "prfm   pldl1keep, [%0, #512]   \n"
                "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
                "fmul   v0.4s, v0.4s, %2.4s     \n"
                "fmul   v1.4s, v1.4s, %2.4s     \n"
                "fmul   v2.4s, v2.4s, %2.4s     \n"
                "fmul   v3.4s, v3.4s, %2.4s     \n"
                "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_scale) // %2
                : "memory", "v0", "v1", "v2", "v3");
#else  // __aarch64__
            asm volatile(
                "pld        [%0, #512]      \n"
                "vldm       %0, {d0-d7}     \n"
                "vmul.f32   q0, q0, %q2     \n"
                "vmul.f32   q1, q1, %q2     \n"
                "vmul.f32   q2, q2, %q2     \n"
                "vmul.f32   q3, q3, %q2     \n"
                "vstm       %0!, {d0-d7}    \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_scale) // %2
                : "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            // intrinsics fallback used when GNU inline asm is not enabled
            float32x4_t _p0 = vld1q_f32(ptr);
            float32x4_t _p1 = vld1q_f32(ptr + 4);
            float32x4_t _p2 = vld1q_f32(ptr + 8);
            float32x4_t _p3 = vld1q_f32(ptr + 12);
            _p0 = vmulq_f32(_p0, _scale);
            _p1 = vmulq_f32(_p1, _scale);
            _p2 = vmulq_f32(_p2, _scale);
            _p3 = vmulq_f32(_p3, _scale);
            vst1q_f32(ptr, _p0);
            vst1q_f32(ptr + 4, _p1);
            vst1q_f32(ptr + 8, _p2);
            vst1q_f32(ptr + 12, _p3);
            ptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 8-float remainder
        for (; i + 7 < size; i += 8)
        {
            float32x4_t _p0 = vld1q_f32(ptr);
            float32x4_t _p1 = vld1q_f32(ptr + 4);
            _p0 = vmulq_f32(_p0, _scale);
            _p1 = vmulq_f32(_p1, _scale);
            vst1q_f32(ptr, _p0);
            vst1q_f32(ptr + 4, _p1);
            ptr += 8;
        }
        // 4-float remainder
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = vld1q_f32(ptr);
            _p = vmulq_f32(_p, _scale);
            vst1q_f32(ptr, _p);
            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar tail (and the whole channel when NEON is unavailable)
        for (; i < size; i++)
        {
            *ptr = *ptr * scale;

            ptr++;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/dropout_arm.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DROPOUT_ARM_H
#define LAYER_DROPOUT_ARM_H

#include "dropout.h"

namespace ncnn {

// ARM variant of the Dropout layer. It derives from Dropout and overrides the
// in-place forward pass; the definitions live in dropout_arm.cpp.
class Dropout_arm : public Dropout
{
public:
    // Defined out of line in dropout_arm.cpp.
    Dropout_arm();

    // In-place forward pass: bottom_top_blob is both input and output.
    // Returns an int status code (the ARM implementation returns 0).
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_DROPOUT_ARM_H


================================================
FILE: src/layer/arm/eltwise_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "eltwise_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"

#include "cpu.h"

namespace ncnn {

// Set up the storage/layout capability flags of the ARM eltwise layer.
Eltwise_arm::Eltwise_arm()
{
#if NCNN_BF16
    // bf16 storage is enabled whenever the build has bf16 support compiled in.
    support_bf16_storage = true;
#endif

#if __ARM_NEON
#if NCNN_ARM82
    // fp16 storage depends on a runtime CPU capability query.
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif
    // Packed layouts are only advertised in NEON builds.
    support_packing = true;
#endif // __ARM_NEON
}

int Eltwise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    int elembits = bottom_blobs[0].elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blobs, top_blobs, opt);
        else
            return forward_fp16s(bottom_blobs, top_blobs, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blobs, top_blobs, opt);
#endif

    const Mat& bottom_blob = bottom_blobs[0];
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int elempack = bottom_blob.elempack;
    int size = w * h * d * elempack;

    Mat& top_blob = top_blobs[0];
    top_blob.create_like(bottom_blob, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (op_type == Operation_PROD)
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            const float* ptr1 = bottom_blob1.channel(q);
            float* outptr = top_blob.channel(q);

            int i = 0;
#if __ARM_NEON
            for (; i + 7 < size; i += 8)
            {
                float32x4_t _p0 = vld1q_f32(ptr);
                float32x4_t _p1 = vld1q_f32(ptr + 4);
                float32x4_t _q0 = vld1q_f32(ptr1);
                float32x4_t _q1 = vld1q_f32(ptr1 + 4);
                _p0 = vmulq_f32(_p0, _q0);
                _p1 = vmulq_f32(_p1, _q1);
                vst1q_f32(outptr, _p0);
                vst1q_f32(outptr + 4, _p1);

                ptr += 8;
                ptr1 += 8;
                outptr += 8;
            }
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _p = vld1q_f32(ptr);
                float32x4_t _q = vld1q_f32(ptr1);
                _p = vmulq_f32(_p, _q);
                vst1q_f32(outptr, _p);

                ptr += 4;
                ptr1 += 4;
                outptr += 4;
            }
#endif // __ARM_NEON
            for (; i < size; i++)
            {
                *outptr = *ptr * *ptr1;

                ptr++;
                ptr1++;
                outptr++;
            }
        }

        for (size_t b = 2; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

                int i = 0;
#if __ARM_NEON
                for (; i + 7 < size; i += 8)
                {
                    float32x4_t _p0 = vld1q_f32(outptr);
                    float32x4_t _p1 = vld1q_f32(outptr + 4);
                    float32x4_t _q0 = vld1q_f32(ptr);
                    float32x4_t _q1 = vld1q_f32(ptr + 4);
                    _p0 = vmulq_f32(_p0, _q0);
                    _p1 = vmulq_f32(_p1, _q1);
                    vst1q_f32(outptr, _p0);
                    vst1q_f32(outptr + 4, _p1);

                    ptr += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _p = vld1q_f32(outptr);
                    float32x4_t _q = vld1q_f32(ptr);
                    _p = vmulq_f32(_p, _q);
                    vst1q_f32(outptr, _p);

                    ptr += 4;
                    outptr += 4;
                }
#endif // __ARM_NEON
                for (; i < size; i++)
                {
                    *outptr *= *ptr;

                    ptr++;
                    outptr++;
                }
            }
        }
    }
    if (op_type == Operation_SUM)
    {
        if (coeffs.w == 0)
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                const float* ptr1 = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

                int i = 0;
#if __ARM_NEON
                for (; i + 7 < size; i += 8)
                {
                    float32x4_t _p0 = vld1q_f32(ptr);
                    float32x4_t _p1 = vld1q_f32(ptr + 4);
                    float32x4_t _q0 = vld1q_f32(ptr1);
                    float32x4_t _q1 = vld1q_f32(ptr1 + 4);
                    _p0 = vaddq_f32(_p0, _q0);
                    _p1 = vaddq_f32(_p1, _q1);
                    vst1q_f32(outptr, _p0);
                    vst1q_f32(outptr + 4, _p1);

                    ptr += 8;
                    ptr1 += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _p = vld1q_f32(ptr);
                    float32x4_t _q = vld1q_f32(ptr1);
                    _p = vaddq_f32(_p, _q);
                    vst1q_f32(outptr, _p);

                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
#endif // __ARM_NEON
                for (; i < size; i++)
                {
                    *outptr = *ptr + *ptr1;

                    ptr++;
                    ptr1++;
                    outptr++;
                }
            }

            for (size_t b = 2; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = bottom_blob1.channel(q);
                    float* outptr = top_blob.channel(q);

                    int i = 0;
#if __ARM_NEON
                    for (; i + 7 < size; i += 8)
                    {
                        float32x4_t _p0 = vld1q_f32(outptr);
                        float32x4_t _p1 = vld1q_f32(outptr + 4);
                        float32x4_t _q0 = vld1q_f32(ptr);
                        float32x4_t _q1 = vld1q_f32(ptr + 4);
                        _p0 = vaddq_f32(_p0, _q0);
                        _p1 = vaddq_f32(_p1, _q1);
                        vst1q_f32(outptr, _p0);
                        vst1q_f32(outptr + 4, _p1);

                        ptr += 8;
                        outptr += 8;
                    }
                    for (; i + 3 < size; i += 4)
                    {
                        float32x4_t _p = vld1q_f32(outptr);
                        float32x4_t _q = vld1q_f32(ptr);
                        _p = vaddq_f32(_p, _q);
                        vst1q_f32(outptr, _p);

                        ptr += 4;
                        outptr += 4;
                    }
#endif // __ARM_NEON
                    for (; i < size; i++)
                    {
                        *outptr += *ptr;

                        ptr++;
                        outptr++;
                    }
                }
            }
        }
        else
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                const float* ptr1 = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

                const float coeff0 = coeffs[0];
                const float coeff1 = coeffs[1];

                int i = 0;
#if __ARM_NEON
                float32x4_t _coeff0 = vdupq_n_f32(coeff0);
                float32x4_t _coeff1 = vdupq_n_f32(coeff1);
                for (; i + 7 < size; i += 8)
                {
                    float32x4_t _p0 = vld1q_f32(ptr);
                    float32x4_t _p1 = vld1q_f32(ptr + 4);
                    float32x4_t _q0 = vld1q_f32(ptr1);
                    float32x4_t _q1 = vld1q_f32(ptr1 + 4);
                    _p0 = vmulq_f32(_p0, _coeff0);
                    _p1 = vmulq_f32(_p1, _coeff0);
                    _p0 = vmlaq_f32(_p0, _q0, _coeff1);
                    _p1 = vmlaq_f32(_p1, _q1, _coeff1);
                    vst1q_f32(outptr, _p0);
                    vst1q_f32(outptr + 4, _p1);

                    ptr += 8;
                    ptr1 += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _p = vld1q_f32(ptr);
                    float32x4_t _q = vld1q_f32(ptr1);
                    _p = vmulq_f32(_p, _coeff0);
                    _p = vmlaq_f32(_p, _q, _coeff1);
                    vst1q_f32(outptr, _p);

                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
#endif // __ARM_NEON
                for (; i < size; i++)
                {
                    *outptr = *ptr * coeff0 + *ptr1 * coeff1;

                    ptr++;
                    ptr1++;
                    outptr++;
                }
            }

            for (size_t b = 2; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = bottom_blob1.channel(q);
                    float* outptr = top_blob.channel(q);

                    const float coeff = coeffs[b];

                    int i = 0;
#if __ARM_NEON
                    float32x4_t _coeff = vdupq_n_f32(coeff);
                    for (; i + 7 < size; i += 8)
                    {
                        float32x4_t _p0 = vld1q_f32(outptr);
                        float32x4_t _p1 = vld1q_f32(outptr + 4);
                        float32x4_t _q0 = vld1q_f32(ptr);
                        float32x4_t _q1 = vld1q_f32(ptr + 4);
                        _p0 = vmlaq_f32(_p0, _q0, _coeff);
                        _p1 = vmlaq_f32(_p1, _q1, _coeff);
                        vst1q_f32(outptr, _p0);
                        vst1q_f32(outptr + 4, _p1);

                        ptr += 8;
                        outptr += 8;
                    }
                    for (; i + 3 < size; i += 4)
                    {
                        float32x4_t _p = vld1q_f32(outptr);
                        float32x4_t _q = vld1q_f32(ptr);
                        _p = vmlaq_f32(_p, _q, _coeff);
                        vst1q_f32(outptr, _p);

                        ptr += 4;
                        outptr += 4;
                    }
#endif // __ARM_NEON
                    for (; i < size; i++)
                    {
                        *outptr += *ptr * coeff;

                        ptr++;
                        outptr++;
                    }
                }
            }
        }
    }
    if (op_type == Operation_MAX)
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            const float* ptr1 = bottom_blob1.channel(q);
            float* outptr = top_blob.channel(q);

            int i = 0;
#if __ARM_NEON
            for (; i + 7 < size; i += 8)
            {
                float32x4_t _p0 = vld1q_f32(ptr);
                float32x4_t _p1 = vld1q_f32(ptr + 4);
                float32x4_t _q0 = vld1q_f32(ptr1);
                float32x4_t _q1 = vld1q_f32(ptr1 + 4);
                _p0 = vmaxq_f32(_p0, _q0);
                _p1 = vmaxq_f32(_p1, _q1);
                vst1q_f32(outptr, _p0);
                vst1q_f32(outptr + 4, _p1);

                ptr += 8;
                ptr1 += 8;
                outptr += 8;
            }
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _p = vld1q_f32(ptr);
                float32x4_t _q = vld1q_f32(ptr1);
                _p = vmaxq_f32(_p, _q);
                vst1q_f32(outptr, _p);

                ptr += 4;
                ptr1 += 4;
                outptr += 4;
            }
#endif // __ARM_NEON
            for (; i < size; i++)
            {
                *outptr = std::max(*ptr, *ptr1);

                ptr++;
                ptr1++;
                outptr++;
            }
        }

        for (size_t b = 2; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

                int i = 0;
#if __ARM_NEON
                for (; i + 7 < size; i += 8)
                {
                    float32x4_t _p0 = vld1q_f32(outptr);
                    float32x4_t _p1 = vld1q_f32(outptr + 4);
                    float32x4_t _q0 = vld1q_f32(ptr);
                    float32x4_t _q1 = vld1q_f32(ptr + 4);
                    _p0 = vmaxq_f32(_p0, _q0);
                    _p1 = vmaxq_f32(_p1, _q1);
                    vst1q_f32(outptr, _p0);
                    vst1q_f32(outptr + 4, _p1);

                    ptr += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _p = vld1q_f32(outptr);
                    float32x4_t _q = vld1q_f32(ptr);
                    _p = vmaxq_f32(_p, _q);
                    vst1q_f32(outptr, _p);

                    ptr += 4;
                    outptr += 4;
                }
#endif // __ARM_NEON
                for (; i < size; i++)
                {
                    *outptr = std::max(*ptr, *outptr);

                    ptr++;
                    outptr++;
                }
            }
        }
    }

    return 0;
}

#if NCNN_BF16
int Eltwise_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int elempack = bottom_blob.elempack;
    int size = w * h * d * elempack;

    Mat& top_blob = top_blobs[0];
    top_blob.create_like(bottom_blob, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (bottom_blobs.size() == 2)
    {
        // fast path without fp32 accumulator
        if (op_type == Operation_PROD)
        {
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* ptr = bottom_blob.channel(q);
                const unsigned short* ptr1 = bottom_blob1.channel(q);
                unsigned short* outptr = top_blob.channel(q);

                int i = 0;
#if __ARM_NEON
                for (; i + 7 < size; i += 8)
                {
                    uint16x8_t _p01 = vld1q_u16(ptr);
                    uint16x8_t _q01 = vld1q_u16(ptr1);
                    float32x4_t _p0 = bfloat2float(vget_low_u16(_p01));
                    float32x4_t _p1 = bfloat2float(vget_high_u16(_p01));
                    float32x4_t _q0 = bfloat2float(vget_low_u16(_q01));
                    float32x4_t _q1 = bfloat2float(vget_high_u16(_q01));
                    _p0 = vmulq_f32(_p0, _q0);
                    _p1 = vmulq_f32(_p1, _q1);
                    vst1q_u16(outptr, vcombine_u16(float2bfloat(_p0), float2bfloat(_p1)));

                    ptr += 8;
                    ptr1 += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _p = bfloat2float(vld1_u16(ptr));
                    float32x4_t _q = bfloat2float(vld1_u16(ptr1));
                    _p = vmulq_f32(_p, _q);
                    vst1_u16(outptr, float2bfloat(_p));

                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
#endif // __ARM_NEON
                for (; i < size; i++)
                {
                    *outptr = float32_to_bfloat16(bfloat16_to_float32(*ptr) * bfloat16_to_float32(*ptr1));

                    ptr++;
                    ptr1++;
                    outptr++;
                }
            }
        }
        if (op_type == Operation_SUM)
        {
            if (coeffs.w == 0)
            {
                const Mat& bottom_blob1 = bottom_blobs[1];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const unsigned short* ptr = bottom_blob.channel(q);
                    const unsigned short* ptr1 = bottom_blob1.channel(q);
                    unsigned short* outptr = top_blob.channel(q);

                    int i = 0;
#if __ARM_NEON
                    for (; i + 7 < size; i += 8)
                    {
                        uint16x8_t _p01 = vld1q_u16(ptr);
                        uint16x8_t _q01 = vld1q_u16(ptr1);
                        float32x4_t _p0 = bfloat2float(vget_low_u16(_p01));
                        float32x4_t _p1 = bfloat2float(vget_high_u16(_p01));
                        float32x4_t _q0 = bfloat2float(vget_low_u16(_q01));
                        float32x4_t _q1 = bfloat2float(vget_high_u16(_q01));
                        _p0 = vaddq_f32(_p0, _q0);
                        _p1 = vaddq_f32(_p1, _q1);
                        vst1q_u16(outptr, vcombine_u16(float2bfloat(_p0), float2bfloat(_p1)));

                        ptr += 8;
                        ptr1 += 8;
                        outptr += 8;
                    }
                    for (; i + 3 < size; i += 4)
                    {
                        float32x4_t _p = bfloat2float(vld1_u16(ptr));
                        float32x4_t _q = bfloat2float(vld1_u16(ptr1));
                        _p = vaddq_f32(_p, _q);
                        vst1_u16(outptr, float2bfloat(_p));

                        ptr += 4;
                        ptr1 += 4;
                        outptr += 4;
                    }
#endif // __ARM_NEON
                    for (; i < size; i++)
                    {
                        *outptr = float32_to_bfloat16(bfloat16_to_float32(*ptr) + bfloat16_to_float32(*ptr1));

                        ptr++;
                        ptr1++;
                        outptr++;
                    }
                }
            }
            else
            {
                const Mat& bottom_blob1 = bottom_blobs[1];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const unsigned short* ptr = bottom_blob.channel(q);
                    const unsigned short* ptr1 = bottom_blob1.channel(q);
                    unsigned short* outptr = top_blob.channel(q);

                    const float coeff0 = coeffs[0];
                    const float coeff1 = coeffs[1];

                    int i = 0;
#if __ARM_NEON
                    float32x4_t _coeff0 = vdupq_n_f32(coeff0);
                    float32x4_t _coeff1 = vdupq_n_f32(coeff1);
                    for (; i + 7 < size; i += 8)
                    {
                        uint16x8_t _p01 = vld1q_u16(ptr);
                        uint16x8_t _q01 = vld1q_u16(ptr1);
                        float32x4_t _p0 = bfloat2float(vget_low_u16(_p01));
                        float32x4_t _p1 = bfloat2float(vget_high_u16(_p01));
                        float32x4_t _q0 = bfloat2float(vget_low_u16(_q01));
                        float32x4_t _q1 = bfloat2float(vget_high_u16(_q01));
                        _p0 = vmulq_f32(_p0, _coeff0);
                        _p1 = vmulq_f32(_p1, _coeff0);
                        _p0 = vmlaq_f32(_p0, _q0, _coeff1);
                        _p1 = vmlaq_f32(_p1, _q1, _coeff1);
                        vst1q_u16(outptr, vcombine_u16(float2bfloat(_p0), float2bfloat(_p1)));

                        ptr += 8;
                        ptr1 += 8;
                        outptr += 8;
                    }
                    for (; i + 3 < size; i += 4)
                    {
                        float32x4_t _p = bfloat2float(vld1_u16(ptr));
                        float32x4_t _q = bfloat2float(vld1_u16(ptr1));
                        _p = vmulq_f32(_p, _coeff0);
                        _p = vmlaq_f32(_p, _q, _coeff1);
                        vst1_u16(outptr, float2bfloat(_p));

                        ptr += 4;
                        ptr1 += 4;
                        outptr += 4;
                    }
#endif // __ARM_NEON
                    for (; i < size; i++)
                    {
                        *outptr = float32_to_bfloat16(bfloat16_to_float32(*ptr) * coeff0 + bfloat16_to_float32(*ptr1) * coeff1);

                        ptr++;
                        ptr1++;
                        outptr++;
                    }
                }
            }
        }
        if (op_type == Operation_MAX)
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* ptr = bottom_blob.channel(q);
                const unsigned short* ptr1 = bottom_blob1.channel(q);
                unsigned short* outptr = top_blob.channel(q);

                int i = 0;
#if __ARM_NEON
                for (; i + 7 < size; i += 8)
                {
                    uint16x8_t _p01 = vld1q_u16(ptr);
                    uint16x8_t _q01 = vld1q_u16(ptr1);
                    float32x4_t _p0 = bfloat2float(vget_low_u16(_p01));
                    float32x4_t _p1 = bfloat2float(vget_high_u16(_p01));
                    float32x4_t _q0 = bfloat2float(vget_low_u16(_q01));
                    float32x4_t _q1 = bfloat2float(vget_high_u16(_q01));
                    _p0 = vmaxq_f32(_p0, _q0);
                    _p1 = vmaxq_f32(_p1, _q1);
                    vst1q_u16(outptr, vcombine_u16(float2bfloat(_p0), float2bfloat(_p1)));

                    ptr += 8;
                    ptr1 += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _p = bfloat2float(vld1_u16(ptr));
                    float32x4_t _q = bfloat2float(vld1_u16(ptr1));
                    _p = vmaxq_f32(_p, _q);
                    vst1_u16(outptr, float2bfloat(_p));

                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
#endif // __ARM_NEON
                for (; i < size; i++)
                {
                    *outptr = float32_to_bfloat16(std::max(bfloat16_to_float32(*ptr), bfloat16_to_float32(*ptr1)));

                    ptr++;
                    ptr1++;
                    outptr++;
                }
            }
        }

        return 0;
    }

    Mat top_blob_fp32(w, h, d, channels, (size_t)4u * elempack, elempack, opt.workspace_allocator);
    if (top_blob_fp32.empty())
        return -100;

    if (op_type == Operation_PROD)
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const unsigned short* ptr = bottom_blob.channel(q);
            const unsigned short* ptr1 = bottom_blob1.channel(q);
            float* outptr = top_blob_fp32.channel(q);

            int i = 0;
#if __ARM_NEON
            for (; i + 7 < size; i += 8)
            {
                uint16x8_t _p01 = vld1q_u16(ptr);
                uint16x8_t _q01 = vld1q_u16(ptr1);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p01));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p01));
                float32x4_t _q0 = bfloat2float(vget_low_u16(_q01));
                float32x4_t _q1 = bfloat2float(vget_high_u16(_q01));
                _p0 = vmulq_f32(_p0, _q0);
                _p1 = vmulq_f32(_p1, _q1);
                vst1q_f32(outptr, _p0);
                vst1q_f32(outptr + 4, _p1);

                ptr += 8;
                ptr1 += 8;
                outptr += 8;
            }
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _p = bfloat2float(vld1_u16(ptr));
                float32x4_t _q = bfloat2float(vld1_u16(ptr1));
                _p = vmulq_f32(_p, _q);
                vst1q_f32(outptr, _p);

                ptr += 4;
                ptr1 += 4;
                outptr += 4;
            }
#endif // __ARM_NEON
            for (; i < size; i++)
            {
                *outptr = bfloat16_to_float32(*ptr) * bfloat16_to_float32(*ptr1);

                ptr++;
                ptr1++;
                outptr++;
            }
        }

        size_t b = 2;
        for (; b < bottom_blobs.size() - 1; b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* ptr = bottom_blob1.channel(q);
                float* outptr = top_blob_fp32.channel(q);

                int i = 0;
#if __ARM_NEON
                for (; i + 7 < size; i += 8)
                {
                    float32x4_t _p0 = vld1q_f32(outptr);
                    float32x4_t _p1 = vld1q_f32(outptr + 4);
                    uint16x8_t _q01 = vld1q_u16(ptr);
                    float32x4_t _q0 = bfloat2float(vget_low_u16(_q01));
                    float32x4_t _q1 = bfloat2float(vget_high_u16(_q01));
                    _p0 = vmulq_f32(_p0, _q0);
                    _p1 = vmulq_f32(_p1, _q1);
                    vst1q_f32(outptr, _p0);
                    vst1q_f32(outptr + 4, _p1);

                    ptr += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _p = vld1q_f32(outptr);
                    float32x4_t _q = bfloat2float(vld1_u16(ptr));
                    _p = vmulq_f32(_p, _q);
                    vst1q_f32(outptr, _p);

                    ptr += 4;
                    outptr += 4;
                }
#endif // __ARM_NEON
                for (; i < size; i++)
                {
                    *outptr *= bfloat16_to_float32(*ptr);

                    ptr++;
                    outptr++;
                }
            }
        }
        for (; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* ptr = bottom_blob1.channel(q);
                const float* ptr0 = top_blob_fp32.channel(q);
                unsigned short* outptr = top_blob.channel(q);

                int i = 0;
#if __ARM_NEON
                for (; i + 7 < size; i += 8)
                {
                    float32x4_t _p0 = vld1q_f32(ptr0);
                    float32x4_t _p1 = vld1q_f32(ptr0 + 4);
                    uint16x8_t _q01 = vld1q_u16(ptr);
                    float32x4_t _q0 = bfloat2float(vget_low_u16(_q01));
                    float32x4_t _q1 = bfloat2float(vget_high_u16(_q01));
                    _p0 = vmulq_f32(_p0, _q0);
                    _p1 = vmulq_f32(_p1, _q1);
                    vst1q_u16(outptr, vcombine_u16(float2bfloat(_p0), float2bfloat(_p1)));

                    ptr += 8;
                    ptr0 += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _p = vld1q_f32(ptr0);
                    float32x4_t _q = bfloat2float(vld1_u16(ptr));
                    _p = vmulq_f32(_p, _q);
                    vst1_u16(outptr, float2bfloat(_p));

                    ptr += 4;
                    ptr0 += 4;
                    outptr += 4;
                }
#endif // __ARM_NEON
                for (; i < size; i++)
                {
                    *outptr = float32_to_bfloat16(*ptr0 * bfloat16_to_float32(*ptr));

                    ptr++;
                    ptr0++;
                    outptr++;
                }
            }
        }
    }
    if (op_type == Operation_SUM)
    {
        if (coeffs.w == 0)
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* ptr = bottom_blob.channel(q);
                const unsigned short* ptr1 = bottom_blob1.channel(q);
                float* outptr = top_blob_fp32.channel(q);

                int i = 0;
#if __ARM_NEON
                for (; i + 7 < size; i += 8)
                {
                    uint16x8_t _p01 = vld1q_u16(ptr);
                    uint16x8_t _q01 = vld1q_u16(ptr1);
                    float32x4_t _p0 = bfloat2float(vget_low_u16(_p01));
                    float32x4_t _p1 = bfloat2float(vget_high_u16(_p01));
                    float32x4_t _q0 = bfloat2float(vget_low_u16(_q01));
                    float32x4_t _q1 = bfloat2float(vget_high_u16(_q01));
                    _p0 = vaddq_f32(_p0, _q0);
                    _p1 = vaddq_f32(_p1, _q1);
                    vst1q_f32(outptr, _p0);
                    vst1q_f32(outptr + 4, _p1);

                    ptr += 8;
                    ptr1 += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _p = bfloat2float(vld1_u16(ptr));
                    float32x4_t _q = bfloat2float(vld1_u16(ptr1));
                    _p = vaddq_f32(_p, _q);
                    vst1q_f32(outptr, _p);

                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
#endif // __ARM_NEON
                for (; i < size; i++)
                {
                    *outptr = bfloat16_to_float32(*ptr) + bfloat16_to_float32(*ptr1);

                    ptr++;
                    ptr1++;
                    outptr++;
                }
            }

            size_t b = 2;
            for (; b < bottom_blobs.size() - 1; b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const unsigned short* ptr = bottom_blob1.channel(q);
                    float* outptr = top_blob_fp32.channel(q);

                    int i = 0;
#if __ARM_NEON
                    for (; i + 7 < size; i += 8)
                    {
                        float32x4_t _p0 = vld1q_f32(outptr);
                        float32x4_t _p1 = vld1q_f32(outptr + 4);
                        uint16x8_t _q01 = vld1q_u16(ptr);
                        float32x4_t _q0 = bfloat2float(vget_low_u16(_q01));
                        float32x4_t _q1 = bfloat2float(vget_high_u16(_q01));
                        _p0 = vaddq_f32(_p0, _q0);
                        _p1 = vaddq_f32(_p1, _q1);
                        vst1q_f32(outptr, _p0);
                        vst1q_f32(outptr + 4, _p1);

                        ptr += 8;
                        outptr += 8;
                    }
                    for (; i + 3 < size; i += 4)
                    {
                        float32x4_t _p = vld1q_f32(outptr);
                        float32x4_t _q = bfloat2float(vld1_u16(ptr));
                        _p = vaddq_f32(_p, _q);
                        vst1q_f32(outptr, _p);

                        ptr += 4;
                        outptr += 4;
                    }
#endif // __ARM_NEON
                    for (; i < size; i++)
                    {
                        *outptr += bfloat16_to_float32(*ptr);

                        ptr++;
                        outptr++;
                    }
                }
            }
            for (; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const unsigned short* ptr = bottom_blob1.channel(q);
                    const float* ptr0 = top_blob_fp32.channel(q);
                    unsigned short* outptr = top_blob.channel(q);

                    int i = 0;
#if __ARM_NEON
                    for (; i + 7 < size; i += 8)
                    {
                        float32x4_t _p0 = vld1q_f32(ptr0);
                        float32x4_t _p1 = vld1q_f32(ptr0 + 4);
                        uint16x8_t _q01 = vld1q_u16(ptr);
                        float32x4_t _q0 = bfloat2float(vget_low_u16(_q01));
                        float32x4_t _q1 = bfloat2float(vget_high_u16(_q01));
                        _p0 = vaddq_f32(_p0, _q0);
                        _p1 = vaddq_f32(_p1, _q1);
                        vst1q_u16(outptr, vcombine_u16(float2bfloat(_p0), float2bfloat(_p1)));

                        ptr += 8;
                        ptr0 += 8;
                        outptr += 8;
                    }
                    for (; i + 3 < size; i += 4)
                    {
                        float32x4_t _p = vld1q_f32(ptr0);
                        float32x4_t _q = bfloat2float(vld1_u16(ptr));
                        _p = vaddq_f32(_p, _q);
                        vst1_u16(outptr, float2bfloat(_p));

                        ptr += 4;
                        ptr0 += 4;
                        outptr += 4;
                    }
#endif // __ARM_NEON
                    for (; i < size; i++)
                    {
                        *outptr = float32_to_bfloat16(*ptr0 + bfloat16_to_float32(*ptr));

                        ptr++;
                        ptr0++;
                        outptr++;
                    }
                }
            }
        }
        else
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* ptr = bottom_blob.channel(q);
                const unsigned short* ptr1 = bottom_blob1.channel(q);
                float* outptr = top_blob_fp32.channel(q);

                const float coeff0 = coeffs[0];
                const float coeff1 = coeffs[1];

                int i = 0;
#if __ARM_NEON
                float32x4_t _coeff0 = vdupq_n_f32(coeff0);
                float32x4_t _coeff1 = vdupq_n_f32(coeff1);
                for (; i + 7 < size; i += 8)
                {
                    uint16x8_t _p01 = vld1q_u16(ptr);
                    uint16x8_t _q01 = vld1q_u16(ptr1);
                    float32x4_t _p0 = bfloat2float(vget_low_u16(_p01));
                    float32x4_t _p1 = bfloat2float(vget_high_u16(_p01));
                    float32x4_t _q0 = bfloat2float(vget_low_u16(_q01));
                    float32x4_t _q1 = bfloat2float(vget_high_u16(_q01));
                    _p0 = vmulq_f32(_p0, _coeff0);
                    _p1 = vmulq_f32(_p1, _coeff0);
                    _p0 = vmlaq_f32(_p0, _q0, _coeff1);
                    _p1 = vmlaq_f32(_p1, _q1, _coeff1);
                    vst1q_f32(outptr, _p0);
                    vst1q_f32(outptr + 4, _p1);

                    ptr += 8;
                    ptr1 += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _p = bfloat2float(vld1_u16(ptr));
                    float32x4_t _q = bfloat2float(vld1_u16(ptr1));
                    _p = vmulq_f32(_p, _coeff0);
                    _p = vmlaq_f32(_p, _q, _coeff1);
                    vst1q_f32(outptr, _p);

                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
#endif // __ARM_NEON
                for (; i < size; i++)
                {
                    *outptr = bfloat16_to_float32(*ptr) * coeff0 + bfloat16_to_float32(*ptr1) * coeff1;

                    ptr++;
                    ptr1++;
                    outptr++;
                }
            }

            size_t b = 2;
            for (; b < bottom_blobs.size() - 1; b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const unsigned short* ptr = bottom_blob1.channel(q);
                    float* outptr = top_blob_fp32.channel(q);

                    const float coeff = coeffs[b];

                    int i = 0;
#if __ARM_NEON
                    float32x4_t _coeff = vdupq_n_f32(coeff);
                    for (; i + 7 < size; i += 8)
                    {
                        float32x4_t _p0 = vld1q_f32(outptr);
                        float32x4_t _p1 = vld1q_f32(outptr + 4);
                        uint16x8_t _q01 = vld1q_u16(ptr);
                        float32x4_t _q0 = bfloat2float(vget_low_u16(_q01));
                        float32x4_t _q1 = bfloat2float(vget_high_u16(_q01));
                        _p0 = vmlaq_f32(_p0, _q0, _coeff);
                        _p1 = vmlaq_f32(_p1, _q1, _coeff);
                        vst1q_f32(outptr, _p0);
                        vst1q_f32(outptr + 4, _p1);

                        ptr += 8;
                        outptr += 8;
                    }
                    for (; i + 3 < size; i += 4)
                    {
                        float32x4_t _p = vld1q_f32(outptr);
                        float32x4_t _q = bfloat2float(vld1_u16(ptr));
                        _p = vmlaq_f32(_p, _q, _coeff);
                        vst1q_f32(outptr, _p);

                        ptr += 4;
                        outptr += 4;
                    }
#endif // __ARM_NEON
                    for (; i < size; i++)
                    {
                        *outptr += bfloat16_to_float32(*ptr) * coeff;

                        ptr++;
                        outptr++;
                    }
                }
            }
            for (; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const unsigned short* ptr = bottom_blob1.channel(q);
                    const float* ptr0 = top_blob_fp32.channel(q);
                    unsigned short* outptr = top_blob.channel(q);

                    const float coeff = coeffs[b];

                    int i = 0;
#if __ARM_NEON
                    float32x4_t _coeff = vdupq_n_f32(coeff);
                    for (; i + 7 < size; i += 8)
                    {
                        float32x4_t _p0 = vld1q_f32(ptr0);
                        float32x4_t _p1 = vld1q_f32(ptr0 + 4);
                        uint16x8_t _q01 = vld1q_u16(ptr);
                        float32x4_t _q0 = bfloat2float(vget_low_u16(_q01));
                        float32x4_t _q1 = bfloat2float(vget_high_u16(_q01));
                        _p0 = vmlaq_f32(_p0, _q0, _coeff);
                        _p1 = vmlaq_f32(_p1, _q1, _coeff);
                        vst1q_u16(outptr, vcombine_u16(float2bfloat(_p0), float2bfloat(_p1)));

                        ptr += 8;
                        ptr0 += 8;
                        outptr += 8;
                    }
                    for (; i + 3 < size; i += 4)
                    {
                        float32x4_t _p = vld1q_f32(ptr0);
                        float32x4_t _q = bfloat2float(vld1_u16(ptr));
                        _p = vmlaq_f32(_p, _q, _coeff);
                        vst1_u16(outptr, float2bfloat(_p));

                        ptr += 4;
                        ptr0 += 4;
                        outptr += 4;
                    }
#endif // __ARM_NEON
                    for (; i < size; i++)
                    {
                        *outptr = float32_to_bfloat16(*ptr0 + bfloat16_to_float32(*ptr) * coeff);

                        ptr++;
                        ptr0++;
                        outptr++;
                    }
                }
            }
        }
    }
    if (op_type == Operation_MAX)
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const unsigned short* ptr = bottom_blob.channel(q);
            const unsigned short* ptr1 = bottom_blob1.channel(q);
            float* outptr = top_blob_fp32.channel(q);

            int i = 0;
#if __ARM_NEON
            for (; i + 7 < size; i += 8)
            {
                uint16x8_t _p01 = vld1q_u16(ptr);
                uint16x8_t _q01 = vld1q_u16(ptr1);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p01));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p01));
                float32x4_t _q0 = bfloat2float(vget_low_u16(_q01));
                float32x4_t _q1 = bfloat2float(vget_high_u16(_q01));
                _p0 = vmaxq_f32(_p0, _q0);
                _p1 = vmaxq_f32(_p1, _q1);
                vst1q_f32(outptr, _p0);
                vst1q_f32(outptr + 4, _p1);

                ptr += 8;
                ptr1 += 8;
                outptr += 8;
            }
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _p = bfloat2float(vld1_u16(ptr));
                float32x4_t _q = bfloat2float(vld1_u16(ptr1));
                _p = vmaxq_f32(_p, _q);
                vst1q_f32(outptr, _p);

                ptr += 4;
                ptr1 += 4;
                outptr += 4;
            }
#endif // __ARM_NEON
            for (; i < size; i++)
            {
                *outptr = std::max(bfloat16_to_float32(*ptr), bfloat16_to_float32(*ptr1));

                ptr++;
                ptr1++;
                outptr++;
            }
        }

        size_t b = 2;
        for (; b < bottom_blobs.size() - 1; b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* ptr = bottom_blob1.channel(q);
                float* outptr = top_blob_fp32.channel(q);

                int i = 0;
#if __ARM_NEON
                for (; i + 7 < size; i += 8)
                {
                    float32x4_t _p0 = vld1q_f32(outptr);
                    float32x4_t _p1 = vld1q_f32(outptr + 4);
                    uint16x8_t _q01 = vld1q_u16(ptr);
                    float32x4_t _q0 = bfloat2float(vget_low_u16(_q01));
                    float32x4_t _q1 = bfloat2float(vget_high_u16(_q01));
                    _p0 = vmaxq_f32(_p0, _q0);
                    _p1 = vmaxq_f32(_p1, _q1);
                    vst1q_f32(outptr, _p0);
                    vst1q_f32(outptr + 4, _p1);

                    ptr += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _p = vld1q_f32(outptr);
                    float32x4_t _q = bfloat2float(vld1_u16(ptr));
                    _p = vmaxq_f32(_p, _q);
                    vst1q_f32(outptr, _p);

                    ptr += 4;
                    outptr += 4;
                }
#endif // __ARM_NEON
                for (; i < size; i++)
                {
                    *outptr = std::max(bfloat16_to_float32(*ptr), *outptr);

                    ptr++;
                    outptr++;
                }
            }
        }
        for (; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* ptr = bottom_blob1.channel(q);
                const float* ptr0 = top_blob_fp32.channel(q);
                unsigned short* outptr = top_blob.channel(q);

                int i = 0;
#if __ARM_NEON
                for (; i + 7 < size; i += 8)
                {
                    float32x4_t _p0 = vld1q_f32(ptr0);
                    float32x4_t _p1 = vld1q_f32(ptr0 + 4);
                    uint16x8_t _q01 = vld1q_u16(ptr);
                    float32x4_t _q0 = bfloat2float(vget_low_u16(_q01));
                    float32x4_t _q1 = bfloat2float(vget_high_u16(_q01));
                    _p0 = vmaxq_f32(_p0, _q0);
                    _p1 = vmaxq_f32(_p1, _q1);
                    vst1q_u16(outptr, vcombine_u16(float2bfloat(_p0), float2bfloat(_p1)));

                    ptr += 8;
                    ptr0 += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _p = vld1q_f32(ptr0);
                    float32x4_t _q = bfloat2float(vld1_u16(ptr));
                    _p = vmaxq_f32(_p, _q);
                    vst1_u16(outptr, float2bfloat(_p));

                    ptr += 4;
                    ptr0 += 4;
                    outptr += 4;
                }
#endif // __ARM_NEON
                for (; i < size; i++)
                {
                    *outptr = float32_to_bfloat16(std::max(bfloat16_to_float32(*ptr), *ptr0));

                    ptr++;
                    ptr0++;
                    outptr++;
                }
            }
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/eltwise_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_ELTWISE_ARM_H
#define LAYER_ELTWISE_ARM_H

#include "eltwise.h"

namespace ncnn {

// ARM-specific variant of the Eltwise layer.
//
// Derives from Eltwise and overrides forward(); the protected members are
// reduced-precision code paths that only exist when the corresponding build
// option (NCNN_ARM82 / NCNN_BF16) is enabled.
class Eltwise_arm : public Eltwise
{
public:
    Eltwise_arm();

    // Multi-input / multi-output layer entry point (overrides the base class).
    // NOTE(review): presumably dispatches to the protected variants below
    // depending on opt and the input element type - confirm in eltwise_arm.cpp.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 path, defined in eltwise_arm_asimdhp.cpp under
    // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC. Reads the inputs as __fp16 and
    // writes top_blobs[0]; with exactly two inputs it computes directly in
    // fp16, otherwise it goes through an fp32 workspace blob.
    // Returns -100 when the output or the fp32 workspace cannot be allocated.
    int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
    // NOTE(review): definition not visible from this header; presumably the
    // fp16 storage + fp16 arithmetic counterpart of forward_fp16s - confirm.
    int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
#if NCNN_BF16
    // bfloat16 path, only declared when NCNN_BF16 is enabled.
    // NOTE(review): the implementation appears to accumulate in fp32 and
    // convert back to bf16 while processing the last input - confirm in
    // eltwise_arm.cpp.
    int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_ELTWISE_ARM_H


================================================
FILE: src/layer/arm/eltwise_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "eltwise_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int Eltwise_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int elempack = bottom_blob.elempack;
    int size = w * h * d * elempack;

    Mat& top_blob = top_blobs[0];
    top_blob.create_like(bottom_blob, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (bottom_blobs.size() == 2)
    {
        // fast path without fp32 accumulator
        if (op_type == Operation_PROD)
        {
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const __fp16* ptr = bottom_blob.channel(q);
                const __fp16* ptr1 = bottom_blob1.channel(q);
                __fp16* outptr = top_blob.channel(q);

                int i = 0;
                for (; i + 15 < size; i += 16)
                {
                    float16x8_t _p0 = vld1q_f16(ptr);
                    float16x8_t _p1 = vld1q_f16(ptr + 8);
                    float16x8_t _q0 = vld1q_f16(ptr1);
                    float16x8_t _q1 = vld1q_f16(ptr1 + 8);
                    _p0 = vmulq_f16(_p0, _q0);
                    _p1 = vmulq_f16(_p1, _q1);
                    vst1q_f16(outptr, _p0);
                    vst1q_f16(outptr + 8, _p1);

                    ptr += 16;
                    ptr1 += 16;
                    outptr += 16;
                }
                for (; i + 7 < size; i += 8)
                {
                    float16x8_t _p = vld1q_f16(ptr);
                    float16x8_t _q = vld1q_f16(ptr1);
                    _p = vmulq_f16(_p, _q);
                    vst1q_f16(outptr, _p);

                    ptr += 8;
                    ptr1 += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float16x4_t _p = vld1_f16(ptr);
                    float16x4_t _q = vld1_f16(ptr1);
                    _p = vmul_f16(_p, _q);
                    vst1_f16(outptr, _p);

                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
                for (; i < size; i++)
                {
                    *outptr = *ptr * *ptr1;

                    ptr++;
                    ptr1++;
                    outptr++;
                }
            }
        }
        if (op_type == Operation_SUM)
        {
            if (coeffs.w == 0)
            {
                const Mat& bottom_blob1 = bottom_blobs[1];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);
                    const __fp16* ptr1 = bottom_blob1.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    int i = 0;
                    for (; i + 15 < size; i += 16)
                    {
                        float16x8_t _p0 = vld1q_f16(ptr);
                        float16x8_t _p1 = vld1q_f16(ptr + 8);
                        float16x8_t _q0 = vld1q_f16(ptr1);
                        float16x8_t _q1 = vld1q_f16(ptr1 + 8);
                        _p0 = vaddq_f16(_p0, _q0);
                        _p1 = vaddq_f16(_p1, _q1);
                        vst1q_f16(outptr, _p0);
                        vst1q_f16(outptr + 8, _p1);

                        ptr += 16;
                        ptr1 += 16;
                        outptr += 16;
                    }
                    for (; i + 7 < size; i += 8)
                    {
                        float16x8_t _p = vld1q_f16(ptr);
                        float16x8_t _q = vld1q_f16(ptr1);
                        _p = vaddq_f16(_p, _q);
                        vst1q_f16(outptr, _p);

                        ptr += 8;
                        ptr1 += 8;
                        outptr += 8;
                    }
                    for (; i + 3 < size; i += 4)
                    {
                        float16x4_t _p = vld1_f16(ptr);
                        float16x4_t _q = vld1_f16(ptr1);
                        _p = vadd_f16(_p, _q);
                        vst1_f16(outptr, _p);

                        ptr += 4;
                        ptr1 += 4;
                        outptr += 4;
                    }
                    for (; i < size; i++)
                    {
                        *outptr = *ptr + *ptr1;

                        ptr++;
                        ptr1++;
                        outptr++;
                    }
                }
            }
            else
            {
                const Mat& bottom_blob1 = bottom_blobs[1];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);
                    const __fp16* ptr1 = bottom_blob1.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    const float coeff0 = coeffs[0];
                    const float coeff1 = coeffs[1];
                    float16x8_t _coeff0 = vdupq_n_f16((__fp16)coeff0);
                    float16x8_t _coeff1 = vdupq_n_f16((__fp16)coeff1);

                    int i = 0;
                    for (; i + 15 < size; i += 16)
                    {
                        float16x8_t _p0 = vld1q_f16(ptr);
                        float16x8_t _p1 = vld1q_f16(ptr + 8);
                        float16x8_t _q0 = vld1q_f16(ptr1);
                        float16x8_t _q1 = vld1q_f16(ptr1 + 8);
                        _p0 = vmulq_f16(_p0, _coeff0);
                        _p1 = vmulq_f16(_p1, _coeff0);
                        _p0 = vfmaq_f16(_p0, _q0, _coeff1);
                        _p1 = vfmaq_f16(_p1, _q1, _coeff1);
                        vst1q_f16(outptr, _p0);
                        vst1q_f16(outptr + 8, _p1);

                        ptr += 16;
                        ptr1 += 16;
                        outptr += 16;
                    }
                    for (; i + 7 < size; i += 8)
                    {
                        float16x8_t _p = vld1q_f16(ptr);
                        float16x8_t _q = vld1q_f16(ptr1);
                        _p = vmulq_f16(_p, _coeff0);
                        _p = vfmaq_f16(_p, _q, _coeff1);
                        vst1q_f16(outptr, _p);

                        ptr += 8;
                        ptr1 += 8;
                        outptr += 8;
                    }
                    for (; i + 3 < size; i += 4)
                    {
                        float16x4_t _p = vld1_f16(ptr);
                        float16x4_t _q = vld1_f16(ptr1);
                        _p = vmul_f16(_p, vget_low_f16(_coeff0));
                        _p = vfma_f16(_p, _q, vget_low_f16(_coeff1));
                        vst1_f16(outptr, _p);

                        ptr += 4;
                        ptr1 += 4;
                        outptr += 4;
                    }
                    for (; i < size; i++)
                    {
                        *outptr = (__fp16)((float)(*ptr) * coeff0 + (float)(*ptr1) * coeff1);

                        ptr++;
                        ptr1++;
                        outptr++;
                    }
                }
            }
        }
        if (op_type == Operation_MAX)
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const __fp16* ptr = bottom_blob.channel(q);
                const __fp16* ptr1 = bottom_blob1.channel(q);
                __fp16* outptr = top_blob.channel(q);

                int i = 0;
                for (; i + 15 < size; i += 16)
                {
                    float16x8_t _p0 = vld1q_f16(ptr);
                    float16x8_t _p1 = vld1q_f16(ptr + 8);
                    float16x8_t _q0 = vld1q_f16(ptr1);
                    float16x8_t _q1 = vld1q_f16(ptr1 + 8);
                    _p0 = vmaxq_f16(_p0, _q0);
                    _p1 = vmaxq_f16(_p1, _q1);
                    vst1q_f16(outptr, _p0);
                    vst1q_f16(outptr + 8, _p1);

                    ptr += 16;
                    ptr1 += 16;
                    outptr += 16;
                }
                for (; i + 7 < size; i += 8)
                {
                    float16x8_t _p = vld1q_f16(ptr);
                    float16x8_t _q = vld1q_f16(ptr1);
                    _p = vmaxq_f16(_p, _q);
                    vst1q_f16(outptr, _p);

                    ptr += 8;
                    ptr1 += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float16x4_t _p = vld1_f16(ptr);
                    float16x4_t _q = vld1_f16(ptr1);
                    _p = vmax_f16(_p, _q);
                    vst1_f16(outptr, _p);

                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
                for (; i < size; i++)
                {
                    *outptr = std::max(*ptr, *ptr1);

                    ptr++;
                    ptr1++;
                    outptr++;
                }
            }
        }

        return 0;
    }

    Mat top_blob_fp32(w, h, d, channels, (size_t)4u * elempack, elempack, opt.workspace_allocator);
    if (top_blob_fp32.empty())
        return -100;

    if (op_type == Operation_PROD)
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const __fp16* ptr = bottom_blob.channel(q);
            const __fp16* ptr1 = bottom_blob1.channel(q);
            float* outptr = top_blob_fp32.channel(q);

            int i = 0;
            for (; i + 15 < size; i += 16)
            {
                float16x8_t _p01 = vld1q_f16(ptr);
                float16x8_t _p23 = vld1q_f16(ptr + 8);
                float16x8_t _q01 = vld1q_f16(ptr1);
                float16x8_t _q23 = vld1q_f16(ptr1 + 8);
                float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p01));
                float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p01));
                float32x4_t _p2 = vcvt_f32_f16(vget_low_f16(_p23));
                float32x4_t _p3 = vcvt_f32_f16(vget_high_f16(_p23));
                float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                float32x4_t _q2 = vcvt_f32_f16(vget_low_f16(_q23));
                float32x4_t _q3 = vcvt_f32_f16(vget_high_f16(_q23));
                _p0 = vmulq_f32(_p0, _q0);
                _p1 = vmulq_f32(_p1, _q1);
                _p2 = vmulq_f32(_p2, _q2);
                _p3 = vmulq_f32(_p3, _q3);
                vst1q_f32(outptr, _p0);
                vst1q_f32(outptr + 4, _p1);
                vst1q_f32(outptr + 8, _p2);
                vst1q_f32(outptr + 12, _p3);

                ptr += 16;
                ptr1 += 16;
                outptr += 16;
            }
            for (; i + 7 < size; i += 8)
            {
                float16x8_t _p01 = vld1q_f16(ptr);
                float16x8_t _q01 = vld1q_f16(ptr1);
                float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p01));
                float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p01));
                float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                _p0 = vmulq_f32(_p0, _q0);
                _p1 = vmulq_f32(_p1, _q1);
                vst1q_f32(outptr, _p0);
                vst1q_f32(outptr + 4, _p1);

                ptr += 8;
                ptr1 += 8;
                outptr += 8;
            }
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
                float32x4_t _q = vcvt_f32_f16(vld1_f16(ptr1));
                _p = vmulq_f32(_p, _q);
                vst1q_f32(outptr, _p);

                ptr += 4;
                ptr1 += 4;
                outptr += 4;
            }
            for (; i < size; i++)
            {
                *outptr = (float)(*ptr) * (float)(*ptr1);

                ptr++;
                ptr1++;
                outptr++;
            }
        }

        size_t b = 2;
        for (; b < bottom_blobs.size() - 1; b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const __fp16* ptr = bottom_blob1.channel(q);
                float* outptr = top_blob_fp32.channel(q);

                int i = 0;
                for (; i + 15 < size; i += 16)
                {
                    float32x4_t _p0 = vld1q_f32(outptr);
                    float32x4_t _p1 = vld1q_f32(outptr + 4);
                    float32x4_t _p2 = vld1q_f32(outptr + 8);
                    float32x4_t _p3 = vld1q_f32(outptr + 12);
                    float16x8_t _q01 = vld1q_f16(ptr);
                    float16x8_t _q23 = vld1q_f16(ptr + 8);
                    float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                    float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                    float32x4_t _q2 = vcvt_f32_f16(vget_low_f16(_q23));
                    float32x4_t _q3 = vcvt_f32_f16(vget_high_f16(_q23));
                    _p0 = vmulq_f32(_p0, _q0);
                    _p1 = vmulq_f32(_p1, _q1);
                    _p2 = vmulq_f32(_p2, _q2);
                    _p3 = vmulq_f32(_p3, _q3);
                    vst1q_f32(outptr, _p0);
                    vst1q_f32(outptr + 4, _p1);
                    vst1q_f32(outptr + 8, _p2);
                    vst1q_f32(outptr + 12, _p3);

                    ptr += 16;
                    outptr += 16;
                }
                for (; i + 7 < size; i += 8)
                {
                    float32x4_t _p0 = vld1q_f32(outptr);
                    float32x4_t _p1 = vld1q_f32(outptr + 4);
                    float16x8_t _q01 = vld1q_f16(ptr);
                    float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                    float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                    _p0 = vmulq_f32(_p0, _q0);
                    _p1 = vmulq_f32(_p1, _q1);
                    vst1q_f32(outptr, _p0);
                    vst1q_f32(outptr + 4, _p1);

                    ptr += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _p = vld1q_f32(outptr);
                    float32x4_t _q = vcvt_f32_f16(vld1_f16(ptr));
                    _p = vmulq_f32(_p, _q);
                    vst1q_f32(outptr, _p);

                    ptr += 4;
                    outptr += 4;
                }
                for (; i < size; i++)
                {
                    *outptr *= (float)(*ptr);

                    ptr++;
                    outptr++;
                }
            }
        }
        for (; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const __fp16* ptr = bottom_blob1.channel(q);
                const float* ptr0 = top_blob_fp32.channel(q);
                __fp16* outptr = top_blob.channel(q);

                int i = 0;
                for (; i + 15 < size; i += 16)
                {
                    float32x4_t _p0 = vld1q_f32(ptr0);
                    float32x4_t _p1 = vld1q_f32(ptr0 + 4);
                    float32x4_t _p2 = vld1q_f32(ptr0 + 8);
                    float32x4_t _p3 = vld1q_f32(ptr0 + 12);
                    float16x8_t _q01 = vld1q_f16(ptr);
                    float16x8_t _q23 = vld1q_f16(ptr + 8);
                    float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                    float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                    float32x4_t _q2 = vcvt_f32_f16(vget_low_f16(_q23));
                    float32x4_t _q3 = vcvt_f32_f16(vget_high_f16(_q23));
                    _p0 = vmulq_f32(_p0, _q0);
                    _p1 = vmulq_f32(_p1, _q1);
                    _p2 = vmulq_f32(_p2, _q2);
                    _p3 = vmulq_f32(_p3, _q3);
                    vst1q_f16(outptr, vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)));
                    vst1q_f16(outptr + 8, vcombine_f16(vcvt_f16_f32(_p2), vcvt_f16_f32(_p3)));

                    ptr += 16;
                    ptr0 += 16;
                    outptr += 16;
                }
                for (; i + 7 < size; i += 8)
                {
                    float32x4_t _p0 = vld1q_f32(ptr0);
                    float32x4_t _p1 = vld1q_f32(ptr0 + 4);
                    float16x8_t _q01 = vld1q_f16(ptr);
                    float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                    float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                    _p0 = vmulq_f32(_p0, _q0);
                    _p1 = vmulq_f32(_p1, _q1);
                    vst1q_f16(outptr, vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)));

                    ptr += 8;
                    ptr0 += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _p = vld1q_f32(ptr0);
                    float32x4_t _q = vcvt_f32_f16(vld1_f16(ptr));
                    _p = vmulq_f32(_p, _q);
                    vst1_f16(outptr, vcvt_f16_f32(_p));

                    ptr += 4;
                    ptr0 += 4;
                    outptr += 4;
                }
                for (; i < size; i++)
                {
                    *outptr = (__fp16)(*ptr0 * (float)(*ptr));

                    ptr++;
                    ptr0++;
                    outptr++;
                }
            }
        }
    }
    if (op_type == Operation_SUM)
    {
        if (coeffs.w == 0)
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const __fp16* ptr = bottom_blob.channel(q);
                const __fp16* ptr1 = bottom_blob1.channel(q);
                float* outptr = top_blob_fp32.channel(q);

                int i = 0;
                for (; i + 15 < size; i += 16)
                {
                    float16x8_t _p01 = vld1q_f16(ptr);
                    float16x8_t _p23 = vld1q_f16(ptr + 8);
                    float16x8_t _q01 = vld1q_f16(ptr1);
                    float16x8_t _q23 = vld1q_f16(ptr1 + 8);
                    float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p01));
                    float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p01));
                    float32x4_t _p2 = vcvt_f32_f16(vget_low_f16(_p23));
                    float32x4_t _p3 = vcvt_f32_f16(vget_high_f16(_p23));
                    float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                    float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                    float32x4_t _q2 = vcvt_f32_f16(vget_low_f16(_q23));
                    float32x4_t _q3 = vcvt_f32_f16(vget_high_f16(_q23));
                    _p0 = vaddq_f32(_p0, _q0);
                    _p1 = vaddq_f32(_p1, _q1);
                    _p2 = vaddq_f32(_p2, _q2);
                    _p3 = vaddq_f32(_p3, _q3);
                    vst1q_f32(outptr, _p0);
                    vst1q_f32(outptr + 4, _p1);
                    vst1q_f32(outptr + 8, _p2);
                    vst1q_f32(outptr + 12, _p3);

                    ptr += 16;
                    ptr1 += 16;
                    outptr += 16;
                }
                for (; i + 7 < size; i += 8)
                {
                    float16x8_t _p01 = vld1q_f16(ptr);
                    float16x8_t _q01 = vld1q_f16(ptr1);
                    float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p01));
                    float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p01));
                    float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                    float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                    _p0 = vaddq_f32(_p0, _q0);
                    _p1 = vaddq_f32(_p1, _q1);
                    vst1q_f32(outptr, _p0);
                    vst1q_f32(outptr + 4, _p1);

                    ptr += 8;
                    ptr1 += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
                    float32x4_t _q = vcvt_f32_f16(vld1_f16(ptr1));
                    _p = vaddq_f32(_p, _q);
                    vst1q_f32(outptr, _p);

                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
                for (; i < size; i++)
                {
                    *outptr = (float)(*ptr) + (float)(*ptr1);

                    ptr++;
                    ptr1++;
                    outptr++;
                }
            }

            size_t b = 2;
            for (; b < bottom_blobs.size() - 1; b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob1.channel(q);
                    float* outptr = top_blob_fp32.channel(q);

                    int i = 0;
                    for (; i + 15 < size; i += 16)
                    {
                        float32x4_t _p0 = vld1q_f32(outptr);
                        float32x4_t _p1 = vld1q_f32(outptr + 4);
                        float32x4_t _p2 = vld1q_f32(outptr + 8);
                        float32x4_t _p3 = vld1q_f32(outptr + 12);
                        float16x8_t _q01 = vld1q_f16(ptr);
                        float16x8_t _q23 = vld1q_f16(ptr + 8);
                        float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                        float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                        float32x4_t _q2 = vcvt_f32_f16(vget_low_f16(_q23));
                        float32x4_t _q3 = vcvt_f32_f16(vget_high_f16(_q23));
                        _p0 = vaddq_f32(_p0, _q0);
                        _p1 = vaddq_f32(_p1, _q1);
                        _p2 = vaddq_f32(_p2, _q2);
                        _p3 = vaddq_f32(_p3, _q3);
                        vst1q_f32(outptr, _p0);
                        vst1q_f32(outptr + 4, _p1);
                        vst1q_f32(outptr + 8, _p2);
                        vst1q_f32(outptr + 12, _p3);

                        ptr += 16;
                        outptr += 16;
                    }
                    for (; i + 7 < size; i += 8)
                    {
                        float32x4_t _p0 = vld1q_f32(outptr);
                        float32x4_t _p1 = vld1q_f32(outptr + 4);
                        float16x8_t _q01 = vld1q_f16(ptr);
                        float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                        float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                        _p0 = vaddq_f32(_p0, _q0);
                        _p1 = vaddq_f32(_p1, _q1);
                        vst1q_f32(outptr, _p0);
                        vst1q_f32(outptr + 4, _p1);

                        ptr += 8;
                        outptr += 8;
                    }
                    for (; i + 3 < size; i += 4)
                    {
                        float32x4_t _p = vld1q_f32(outptr);
                        float32x4_t _q = vcvt_f32_f16(vld1_f16(ptr));
                        _p = vaddq_f32(_p, _q);
                        vst1q_f32(outptr, _p);

                        ptr += 4;
                        outptr += 4;
                    }
                    for (; i < size; i++)
                    {
                        *outptr += (float)(*ptr);

                        ptr++;
                        outptr++;
                    }
                }
            }
            for (; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob1.channel(q);
                    const float* ptr0 = top_blob_fp32.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    int i = 0;
                    for (; i + 15 < size; i += 16)
                    {
                        float32x4_t _p0 = vld1q_f32(ptr0);
                        float32x4_t _p1 = vld1q_f32(ptr0 + 4);
                        float32x4_t _p2 = vld1q_f32(ptr0 + 8);
                        float32x4_t _p3 = vld1q_f32(ptr0 + 12);
                        float16x8_t _q01 = vld1q_f16(ptr);
                        float16x8_t _q23 = vld1q_f16(ptr + 8);
                        float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                        float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                        float32x4_t _q2 = vcvt_f32_f16(vget_low_f16(_q23));
                        float32x4_t _q3 = vcvt_f32_f16(vget_high_f16(_q23));
                        _p0 = vaddq_f32(_p0, _q0);
                        _p1 = vaddq_f32(_p1, _q1);
                        _p2 = vaddq_f32(_p2, _q2);
                        _p3 = vaddq_f32(_p3, _q3);
                        vst1q_f16(outptr, vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)));
                        vst1q_f16(outptr + 8, vcombine_f16(vcvt_f16_f32(_p2), vcvt_f16_f32(_p3)));

                        ptr += 16;
                        ptr0 += 16;
                        outptr += 16;
                    }
                    for (; i + 7 < size; i += 8)
                    {
                        float32x4_t _p0 = vld1q_f32(ptr0);
                        float32x4_t _p1 = vld1q_f32(ptr0 + 4);
                        float16x8_t _q01 = vld1q_f16(ptr);
                        float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                        float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                        _p0 = vaddq_f32(_p0, _q0);
                        _p1 = vaddq_f32(_p1, _q1);
                        vst1q_f16(outptr, vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)));

                        ptr += 8;
                        ptr0 += 8;
                        outptr += 8;
                    }
                    for (; i + 3 < size; i += 4)
                    {
                        float32x4_t _p = vld1q_f32(ptr0);
                        float32x4_t _q = vcvt_f32_f16(vld1_f16(ptr));
                        _p = vaddq_f32(_p, _q);
                        vst1_f16(outptr, vcvt_f16_f32(_p));

                        ptr += 4;
                        ptr0 += 4;
                        outptr += 4;
                    }
                    for (; i < size; i++)
                    {
                        *outptr = (__fp16)(*ptr0 + (float)(*ptr));

                        ptr++;
                        ptr0++;
                        outptr++;
                    }
                }
            }
        }
        else
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const __fp16* ptr = bottom_blob.channel(q);
                const __fp16* ptr1 = bottom_blob1.channel(q);
                float* outptr = top_blob_fp32.channel(q);

                const float coeff0 = coeffs[0];
                const float coeff1 = coeffs[1];
                float32x4_t _coeff0 = vdupq_n_f32(coeff0);
                float32x4_t _coeff1 = vdupq_n_f32(coeff1);

                int i = 0;
                for (; i + 15 < size; i += 16)
                {
                    float16x8_t _p01 = vld1q_f16(ptr);
                    float16x8_t _p23 = vld1q_f16(ptr + 8);
                    float16x8_t _q01 = vld1q_f16(ptr1);
                    float16x8_t _q23 = vld1q_f16(ptr1 + 8);
                    float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p01));
                    float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p01));
                    float32x4_t _p2 = vcvt_f32_f16(vget_low_f16(_p23));
                    float32x4_t _p3 = vcvt_f32_f16(vget_high_f16(_p23));
                    float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                    float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                    float32x4_t _q2 = vcvt_f32_f16(vget_low_f16(_q23));
                    float32x4_t _q3 = vcvt_f32_f16(vget_high_f16(_q23));
                    _p0 = vmulq_f32(_p0, _coeff0);
                    _p1 = vmulq_f32(_p1, _coeff0);
                    _p2 = vmulq_f32(_p2, _coeff0);
                    _p3 = vmulq_f32(_p3, _coeff0);
                    _p0 = vfmaq_f32(_p0, _q0, _coeff1);
                    _p1 = vfmaq_f32(_p1, _q1, _coeff1);
                    _p2 = vfmaq_f32(_p2, _q2, _coeff1);
                    _p3 = vfmaq_f32(_p3, _q3, _coeff1);
                    vst1q_f32(outptr, _p0);
                    vst1q_f32(outptr + 4, _p1);
                    vst1q_f32(outptr + 8, _p2);
                    vst1q_f32(outptr + 12, _p3);

                    ptr += 16;
                    ptr1 += 16;
                    outptr += 16;
                }
                for (; i + 7 < size; i += 8)
                {
                    float16x8_t _p01 = vld1q_f16(ptr);
                    float16x8_t _q01 = vld1q_f16(ptr1);
                    float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p01));
                    float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p01));
                    float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                    float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                    _p0 = vmulq_f32(_p0, _coeff0);
                    _p1 = vmulq_f32(_p1, _coeff0);
                    _p0 = vfmaq_f32(_p0, _q0, _coeff1);
                    _p1 = vfmaq_f32(_p1, _q1, _coeff1);
                    vst1q_f32(outptr, _p0);
                    vst1q_f32(outptr + 4, _p1);

                    ptr += 8;
                    ptr1 += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
                    float32x4_t _q = vcvt_f32_f16(vld1_f16(ptr1));
                    _p = vmulq_f32(_p, _coeff0);
                    _p = vfmaq_f32(_p, _q, _coeff1);
                    vst1q_f32(outptr, _p);

                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
                for (; i < size; i++)
                {
                    *outptr = (float)(*ptr) * coeff0 + (float)(*ptr1) * coeff1;

                    ptr++;
                    ptr1++;
                    outptr++;
                }
            }

            size_t b = 2;
            for (; b < bottom_blobs.size() - 1; b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob1.channel(q);
                    float* outptr = top_blob_fp32.channel(q);

                    const float coeff = coeffs[b];
                    float32x4_t _coeff = vdupq_n_f32(coeff);

                    int i = 0;
                    for (; i + 15 < size; i += 16)
                    {
                        float32x4_t _p0 = vld1q_f32(outptr);
                        float32x4_t _p1 = vld1q_f32(outptr + 4);
                        float32x4_t _p2 = vld1q_f32(outptr + 8);
                        float32x4_t _p3 = vld1q_f32(outptr + 12);
                        float16x8_t _q01 = vld1q_f16(ptr);
                        float16x8_t _q23 = vld1q_f16(ptr + 8);
                        float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                        float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                        float32x4_t _q2 = vcvt_f32_f16(vget_low_f16(_q23));
                        float32x4_t _q3 = vcvt_f32_f16(vget_high_f16(_q23));
                        _p0 = vfmaq_f32(_p0, _q0, _coeff);
                        _p1 = vfmaq_f32(_p1, _q1, _coeff);
                        _p2 = vfmaq_f32(_p2, _q2, _coeff);
                        _p3 = vfmaq_f32(_p3, _q3, _coeff);
                        vst1q_f32(outptr, _p0);
                        vst1q_f32(outptr + 4, _p1);
                        vst1q_f32(outptr + 8, _p2);
                        vst1q_f32(outptr + 12, _p3);

                        ptr += 16;
                        outptr += 16;
                    }
                    for (; i + 7 < size; i += 8)
                    {
                        float32x4_t _p0 = vld1q_f32(outptr);
                        float32x4_t _p1 = vld1q_f32(outptr + 4);
                        float16x8_t _q01 = vld1q_f16(ptr);
                        float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                        float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                        _p0 = vfmaq_f32(_p0, _q0, _coeff);
                        _p1 = vfmaq_f32(_p1, _q1, _coeff);
                        vst1q_f32(outptr, _p0);
                        vst1q_f32(outptr + 4, _p1);

                        ptr += 8;
                        outptr += 8;
                    }
                    for (; i + 3 < size; i += 4)
                    {
                        float32x4_t _p = vld1q_f32(outptr);
                        float32x4_t _q = vcvt_f32_f16(vld1_f16(ptr));
                        _p = vfmaq_f32(_p, _q, _coeff);
                        vst1q_f32(outptr, _p);

                        ptr += 4;
                        outptr += 4;
                    }
                    for (; i < size; i++)
                    {
                        *outptr += (float)(*ptr) * coeff;

                        ptr++;
                        outptr++;
                    }
                }
            }
            for (; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob1.channel(q);
                    const float* ptr0 = top_blob_fp32.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    const float coeff = coeffs[b];
                    float32x4_t _coeff = vdupq_n_f32(coeff);

                    int i = 0;
                    for (; i + 15 < size; i += 16)
                    {
                        float32x4_t _p0 = vld1q_f32(ptr0);
                        float32x4_t _p1 = vld1q_f32(ptr0 + 4);
                        float32x4_t _p2 = vld1q_f32(ptr0 + 8);
                        float32x4_t _p3 = vld1q_f32(ptr0 + 12);
                        float16x8_t _q01 = vld1q_f16(ptr);
                        float16x8_t _q23 = vld1q_f16(ptr + 8);
                        float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                        float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                        float32x4_t _q2 = vcvt_f32_f16(vget_low_f16(_q23));
                        float32x4_t _q3 = vcvt_f32_f16(vget_high_f16(_q23));
                        _p0 = vfmaq_f32(_p0, _q0, _coeff);
                        _p1 = vfmaq_f32(_p1, _q1, _coeff);
                        _p2 = vfmaq_f32(_p2, _q2, _coeff);
                        _p3 = vfmaq_f32(_p3, _q3, _coeff);
                        vst1q_f16(outptr, vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)));
                        vst1q_f16(outptr + 8, vcombine_f16(vcvt_f16_f32(_p2), vcvt_f16_f32(_p3)));

                        ptr += 16;
                        ptr0 += 16;
                        outptr += 16;
                    }
                    for (; i + 7 < size; i += 8)
                    {
                        float32x4_t _p0 = vld1q_f32(ptr0);
                        float32x4_t _p1 = vld1q_f32(ptr0 + 4);
                        float16x8_t _q01 = vld1q_f16(ptr);
                        float32x4_t _q0 = vcvt_f32_f16(vget_low_f16(_q01));
                        float32x4_t _q1 = vcvt_f32_f16(vget_high_f16(_q01));
                        _p0 = vfmaq_f32(_p0, _q0, _coeff);
                        _p1 = vfmaq_f32(_p1, _q1, _coeff);
                        vst1q_f16(outptr, vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1)));

                        ptr += 8;
                        ptr0 += 8;
                        outptr += 8;
                    }
                    for (; i + 3 < size; i += 4)
                    {
                        float32x4_t _p = vld1q_f32(ptr0);
                        float32x4_t _q = vcvt_f32_f16(vld1_f16(ptr));
                        _p = vfmaq_f32(_p, _q, _coeff);
                        vst1_f16(outptr, vcvt_f16_f32(_p));

                        ptr += 4;
                        ptr0 += 4;
                        outptr += 4;
                    }
                    for (; i < size; i++)
                    {
                        *outptr = (__fp16)(*ptr0 + (float)(*ptr) * coeff);

                        ptr++;
                        ptr0++;
                        outptr++;
                    }
                }
            }
        }
    }
    if (op_type == Operation_MAX)
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const __fp16* ptr = bottom_blob.channel(q);
            const __fp16* ptr1 = bottom_blob1.channel(q);
            __fp16* outptr = top_blob.channel(q);

            int i = 0;
            for (; i + 15 < size; i += 16)
            {
                float16x8_t _p0 = vld1q_f16(ptr);
                float16x8_t _p1 = vld1q_f16(ptr + 8);
                float16x8_t _q0 = vld1q_f16(ptr1);
                float16x8_t _q1 = vld1q_f16(ptr1 + 8);
                _p0 = vmaxq_f16(_p0, _q0);
                _p1 = vmaxq_f16(_p1, _q1);
                vst1q_f16(outptr, _p0);
                vst1q_f16(outptr + 8, _p1);

                ptr += 16;
                ptr1 += 16;
                outptr += 16;
            }
            for (; i + 7 < size; i += 8)
            {
                float16x8_t _p = vld1q_f16(ptr);
                float16x8_t _q = vld1q_f16(ptr1);
                _p = vmaxq_f16(_p, _q);
                vst1q_f16(outptr, _p);

                ptr += 8;
                ptr1 += 8;
                outptr += 8;
            }
            for (; i + 3 < size; i += 4)
            {
                float16x4_t _p = vld1_f16(ptr);
                float16x4_t _q = vld1_f16(ptr1);
                _p = vmax_f16(_p, _q);
                vst1_f16(outptr, _p);

                ptr += 4;
                ptr1 += 4;
                outptr += 4;
            }
            for (; i < size; i++)
            {
                *outptr = std::max(*ptr, *ptr1);

                ptr++;
                ptr1++;
                outptr++;
            }
        }

        size_t b = 2;
        for (; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const __fp16* ptr = bottom_blob1.channel(q);
                __fp16* outptr = top_blob.channel(q);

                int i = 0;
                for (; i + 15 < size; i += 16)
                {
                    float16x8_t _p0 = vld1q_f16(outptr);
                    float16x8_t _p1 = vld1q_f16(outptr + 8);
                    float16x8_t _q0 = vld1q_f16(ptr);
                    float16x8_t _q1 = vld1q_f16(ptr + 8);
                    _p0 = vmaxq_f16(_p0, _q0);
                    _p1 = vmaxq_f16(_p1, _q1);
                    vst1q_f16(outptr, _p0);
                    vst1q_f16(outptr + 8, _p1);

                    ptr += 16;
                    outptr += 16;
                }
                for (; i + 7 < size; i += 8)
                {
                    float16x8_t _p = vld1q_f16(outptr);
                    float16x8_t _q = vld1q_f16(ptr);
                    _p = vmaxq_f16(_p, _q);
                    vst1q_f16(outptr, _p);

                    ptr += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float16x4_t _p = vld1_f16(outptr);
                    float16x4_t _q = vld1_f16(ptr);
                    _p = vmax_f16(_p, _q);
                    vst1_f16(outptr, _p);

                    ptr += 4;
                    outptr += 4;
                }
                for (; i < size; i++)
                {
                    *outptr = std::max(*ptr, *outptr);

                    ptr++;
                    outptr++;
                }
            }
        }
    }

    return 0;
}

int Eltwise_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    if (bottom_blobs.size() == 2)
    {
        // fast path without fp32 accumulator
        return forward_fp16s(bottom_blobs, top_blobs, opt);
    }

    if (op_type == Operation_MAX)
    {
        return forward_fp16s(bottom_blobs, top_blobs, opt);
    }

    const Mat& bottom_blob = bottom_blobs[0];
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int elempack = bottom_blob.elempack;
    int size = w * h * d * elempack;

    Mat& top_blob = top_blobs[0];
    top_blob.create_like(bottom_blob, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (op_type == Operation_PROD)
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const __fp16* ptr = bottom_blob.channel(q);
            const __fp16* ptr1 = bottom_blob1.channel(q);
            __fp16* outptr = top_blob.channel(q);

            int i = 0;
            for (; i + 15 < size; i += 16)
            {
                float16x8_t _p0 = vld1q_f16(ptr);
                float16x8_t _p1 = vld1q_f16(ptr + 8);
                float16x8_t _q0 = vld1q_f16(ptr1);
                float16x8_t _q1 = vld1q_f16(ptr1 + 8);
                _p0 = vmulq_f16(_p0, _q0);
                _p1 = vmulq_f16(_p1, _q1);
                vst1q_f16(outptr, _p0);
                vst1q_f16(outptr + 8, _p1);

                ptr += 16;
                ptr1 += 16;
                outptr += 16;
            }
            for (; i + 7 < size; i += 8)
            {
                float16x8_t _p = vld1q_f16(ptr);
                float16x8_t _q = vld1q_f16(ptr1);
                _p = vmulq_f16(_p, _q);
                vst1q_f16(outptr, _p);

                ptr += 8;
                ptr1 += 8;
                outptr += 8;
            }
            for (; i + 3 < size; i += 4)
            {
                float16x4_t _p = vld1_f16(ptr);
                float16x4_t _q = vld1_f16(ptr1);
                _p = vmul_f16(_p, _q);
                vst1_f16(outptr, _p);

                ptr += 4;
                ptr1 += 4;
                outptr += 4;
            }
            for (; i < size; i++)
            {
                *outptr = *ptr * *ptr1;

                ptr++;
                ptr1++;
                outptr++;
            }
        }

        size_t b = 2;
        for (; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const __fp16* ptr = bottom_blob1.channel(q);
                __fp16* outptr = top_blob.channel(q);

                int i = 0;
                for (; i + 15 < size; i += 16)
                {
                    float16x8_t _p0 = vld1q_f16(outptr);
                    float16x8_t _p1 = vld1q_f16(outptr + 8);
                    float16x8_t _q0 = vld1q_f16(ptr);
                    float16x8_t _q1 = vld1q_f16(ptr + 8);
                    _p0 = vmulq_f16(_p0, _q0);
                    _p1 = vmulq_f16(_p1, _q1);
                    vst1q_f16(outptr, _p0);
                    vst1q_f16(outptr + 8, _p1);

                    ptr += 16;
                    outptr += 16;
                }
                for (; i + 7 < size; i += 8)
                {
                    float16x8_t _p = vld1q_f16(outptr);
                    float16x8_t _q = vld1q_f16(ptr);
                    _p = vmulq_f16(_p, _q);
                    vst1q_f16(outptr, _p);

                    ptr += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float16x4_t _p = vld1_f16(outptr);
                    float16x4_t _q = vld1_f16(ptr);
                    _p = vmul_f16(_p, _q);
                    vst1_f16(outptr, _p);

                    ptr += 4;
                    outptr += 4;
                }
                for (; i < size; i++)
                {
                    *outptr *= *ptr;

                    ptr++;
                    outptr++;
                }
            }
        }
    }
    if (op_type == Operation_SUM)
    {
        if (coeffs.w == 0)
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const __fp16* ptr = bottom_blob.channel(q);
                const __fp16* ptr1 = bottom_blob1.channel(q);
                __fp16* outptr = top_blob.channel(q);

                int i = 0;
                for (; i + 15 < size; i += 16)
                {
                    float16x8_t _p0 = vld1q_f16(ptr);
                    float16x8_t _p1 = vld1q_f16(ptr + 8);
                    float16x8_t _q0 = vld1q_f16(ptr1);
                    float16x8_t _q1 = vld1q_f16(ptr1 + 8);
                    _p0 = vaddq_f16(_p0, _q0);
                    _p1 = vaddq_f16(_p1, _q1);
                    vst1q_f16(outptr, _p0);
                    vst1q_f16(outptr + 8, _p1);

                    ptr += 16;
                    ptr1 += 16;
                    outptr += 16;
                }
                for (; i + 7 < size; i += 8)
                {
                    float16x8_t _p = vld1q_f16(ptr);
                    float16x8_t _q = vld1q_f16(ptr1);
                    _p = vaddq_f16(_p, _q);
                    vst1q_f16(outptr, _p);

                    ptr += 8;
                    ptr1 += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float16x4_t _p = vld1_f16(ptr);
                    float16x4_t _q = vld1_f16(ptr1);
                    _p = vadd_f16(_p, _q);
                    vst1_f16(outptr, _p);

                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
                for (; i < size; i++)
                {
                    *outptr = *ptr + *ptr1;

                    ptr++;
                    ptr1++;
                    outptr++;
                }
            }

            size_t b = 2;
            for (; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob1.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    int i = 0;
                    for (; i + 15 < size; i += 16)
                    {
                        float16x8_t _p0 = vld1q_f16(outptr);
                        float16x8_t _p1 = vld1q_f16(outptr + 8);
                        float16x8_t _q0 = vld1q_f16(ptr);
                        float16x8_t _q1 = vld1q_f16(ptr + 8);
                        _p0 = vaddq_f16(_p0, _q0);
                        _p1 = vaddq_f16(_p1, _q1);
                        vst1q_f16(outptr, _p0);
                        vst1q_f16(outptr + 8, _p1);

                        ptr += 16;
                        outptr += 16;
                    }
                    for (; i + 7 < size; i += 8)
                    {
                        float16x8_t _p = vld1q_f16(outptr);
                        float16x8_t _q = vld1q_f16(ptr);
                        _p = vaddq_f16(_p, _q);
                        vst1q_f16(outptr, _p);

                        ptr += 8;
                        outptr += 8;
                    }
                    for (; i + 3 < size; i += 4)
                    {
                        float16x4_t _p = vld1_f16(outptr);
                        float16x4_t _q = vld1_f16(ptr);
                        _p = vadd_f16(_p, _q);
                        vst1_f16(outptr, _p);

                        ptr += 4;
                        outptr += 4;
                    }
                    for (; i < size; i++)
                    {
                        *outptr += *ptr;

                        ptr++;
                        outptr++;
                    }
                }
            }
        }
        else
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const __fp16* ptr = bottom_blob.channel(q);
                const __fp16* ptr1 = bottom_blob1.channel(q);
                __fp16* outptr = top_blob.channel(q);

                const __fp16 coeff0 = (__fp16)coeffs[0];
                const __fp16 coeff1 = (__fp16)coeffs[1];
                float16x8_t _coeff0 = vdupq_n_f16(coeff0);
                float16x8_t _coeff1 = vdupq_n_f16(coeff1);

                int i = 0;
                for (; i + 15 < size; i += 16)
                {
                    float16x8_t _p0 = vld1q_f16(ptr);
                    float16x8_t _p1 = vld1q_f16(ptr + 8);
                    float16x8_t _q0 = vld1q_f16(ptr1);
                    float16x8_t _q1 = vld1q_f16(ptr1 + 8);
                    _p0 = vmulq_f16(_p0, _coeff0);
                    _p1 = vmulq_f16(_p1, _coeff0);
                    _p0 = vfmaq_f16(_p0, _q0, _coeff1);
                    _p1 = vfmaq_f16(_p1, _q1, _coeff1);
                    vst1q_f16(outptr, _p0);
                    vst1q_f16(outptr + 8, _p1);

                    ptr += 16;
                    ptr1 += 16;
                    outptr += 16;
                }
                for (; i + 7 < size; i += 8)
                {
                    float16x8_t _p = vld1q_f16(ptr);
                    float16x8_t _q = vld1q_f16(ptr1);
                    _p = vmulq_f16(_p, _coeff0);
                    _p = vfmaq_f16(_p, _q, _coeff1);
                    vst1q_f16(outptr, _p);

                    ptr += 8;
                    ptr1 += 8;
                    outptr += 8;
                }
                for (; i + 3 < size; i += 4)
                {
                    float16x4_t _p = vld1_f16(ptr);
                    float16x4_t _q = vld1_f16(ptr1);
                    _p = vmul_f16(_p, vget_low_f16(_coeff0));
                    _p = vfma_f16(_p, _q, vget_low_f16(_coeff1));
                    vst1_f16(outptr, _p);

                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
                for (; i < size; i++)
                {
                    *outptr = *ptr * coeff0 + *ptr1 * coeff1;

                    ptr++;
                    ptr1++;
                    outptr++;
                }
            }

            size_t b = 2;
            for (; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob1.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    const __fp16 coeff = (__fp16)coeffs[b];
                    float16x8_t _coeff = vdupq_n_f16(coeff);

                    int i = 0;
                    for (; i + 15 < size; i += 16)
                    {
                        float16x8_t _p0 = vld1q_f16(outptr);
                        float16x8_t _p1 = vld1q_f16(outptr + 8);
                        float16x8_t _q0 = vld1q_f16(ptr);
                        float16x8_t _q1 = vld1q_f16(ptr + 8);
                        _p0 = vfmaq_f16(_p0, _q0, _coeff);
                        _p1 = vfmaq_f16(_p1, _q1, _coeff);
                        vst1q_f16(outptr, _p0);
                        vst1q_f16(outptr + 8, _p1);

                        ptr += 16;
                        outptr += 16;
                    }
                    for (; i + 7 < size; i += 8)
                    {
                        float16x8_t _p = vld1q_f16(outptr);
                        float16x8_t _q = vld1q_f16(ptr);
                        _p = vfmaq_f16(_p, _q, _coeff);
                        vst1q_f16(outptr, _p);

                        ptr += 8;
                        outptr += 8;
                    }
                    for (; i + 3 < size; i += 4)
                    {
                        float16x4_t _p = vld1_f16(outptr);
                        float16x4_t _q = vld1_f16(ptr);
                        _p = vfma_f16(_p, _q, vget_low_f16(_coeff));
                        vst1_f16(outptr, _p);

                        ptr += 4;
                        outptr += 4;
                    }
                    for (; i < size; i++)
                    {
                        *outptr += *ptr * coeff;

                        ptr++;
                        outptr++;
                    }
                }
            }
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/flatten_arm.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "flatten_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "cpu.h"

namespace ncnn {

// Sets the storage/layout capability flags of this layer according to the build
// configuration and, for fp16 storage, the result of cpu_support_arm_asimdhp().
Flatten_arm::Flatten_arm()
{
#if NCNN_BF16
    // bf16 storage is enabled whenever bf16 is compiled in
    support_bf16_storage = true;
#endif // NCNN_BF16

#if __ARM_NEON
    // packed layouts require NEON
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage follows the runtime cpu_support_arm_asimdhp() check
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82
}

// Deinterleave `count` groups of four floats starting at `src` into four back-to-back
// runs of `count` floats starting at `dst`: component k of every group is written to
// dst + k * count.
static void flatten_arm_unpack4_fp32(const float* src, float* dst, int count)
{
    float* lane0 = dst;
    float* lane1 = dst + count;
    float* lane2 = dst + count * 2;
    float* lane3 = dst + count * 3;

    int n = 0;
#if __ARM_NEON
    // vld4q_f32 does the 4-way deinterleave for four groups at a time
    for (; n + 3 < count; n += 4)
    {
        float32x4x4_t _lanes = vld4q_f32(src);
        vst1q_f32(lane0, _lanes.val[0]);
        vst1q_f32(lane1, _lanes.val[1]);
        vst1q_f32(lane2, _lanes.val[2]);
        vst1q_f32(lane3, _lanes.val[3]);

        src += 16;
        lane0 += 4;
        lane1 += 4;
        lane2 += 4;
        lane3 += 4;
    }
#endif // __ARM_NEON
    for (; n < count; n++)
    {
        *lane0++ = src[0];
        *lane1++ = src[1];
        *lane2++ = src[2];
        *lane3++ = src[3];

        src += 4;
    }
}

// Flatten bottom_blob into a 1-D top_blob.
//
// 8-bit input goes to forward_int8 and 16-bit input (fp16 or bf16 storage enabled in
// opt) goes to forward_bf16s_fp16s. The rest of this function moves float data.
// The output is pack4 when packing is enabled and the total element count is a
// multiple of 4; otherwise the base Flatten::forward does the work.
//
// Returns 0 on success, -100 if the output blob could not be allocated.
int Flatten_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int elembits = bottom_blob.elembits();

    if (elembits == 8)
        return forward_int8(bottom_blob, top_blob, opt);

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

    const int dims = bottom_blob.dims;

    // a 1-D blob is passed through unchanged
    if (dims == 1)
    {
        top_blob = bottom_blob;
        return 0;
    }

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;
    const size_t elemsize = bottom_blob.elemsize;

    // elements per channel, and scalar count of the whole blob
    const int size = w * h * d;
    const int total = size * channels * elempack;

    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout && total % 4 == 0)
    {
        out_elempack = 4;
    }
#endif
    const size_t out_elemsize = elemsize / elempack * out_elempack;

    // unpacked output is left to the base implementation
    if (out_elempack == 1)
        return Flatten::forward(bottom_blob, top_blob, opt);

    // from here on out_elempack == 4

    if (dims == 2 && elempack == 1)
    {
        // no copy loop here: the output is the input Mat with its shape fields rewritten
        top_blob = bottom_blob;
        top_blob.dims = 1;
        top_blob.w = total / out_elempack;
        top_blob.h = 1;
        top_blob.cstep = bottom_blob.cstep / out_elempack;
        top_blob.elemsize = out_elemsize;
        top_blob.elempack = out_elempack;
        return 0;
    }

    top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (dims == 2 && elempack == 4)
    {
        // each pack4 row expands to four scalar rows of w floats
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            const float* src = bottom_blob.row(y);
            float* dst = (float*)top_blob + w * y * 4;

            flatten_arm_unpack4_fp32(src, dst, w);
        }
    }

    if (dims == 3 || dims == 4)
    {
        if (elempack == 4)
        {
            // each pack4 channel expands to four scalar channels of size floats
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* src = bottom_blob.channel(q);
                float* dst = (float*)top_blob + size * q * 4;

                flatten_arm_unpack4_fp32(src, dst, size);
            }
        }
        else if (elempack == 1)
        {
            // plain per-channel copy into the flat output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* src = bottom_blob.channel(q);
                float* dst = (float*)top_blob + size * q;

                int n = 0;
#if __ARM_NEON
                for (; n + 3 < size; n += 4)
                {
                    vst1q_f32(dst, vld1q_f32(src));

                    src += 4;
                    dst += 4;
                }
#endif
                for (; n < size; n++)
                {
                    *dst++ = *src++;
                }
            }
        }
    }

    return 0;
}

#if NCNN_ARM82
// Deinterleave `count` groups of eight 16-bit values starting at `src` into eight
// back-to-back runs of `count` values starting at `dst`: component k of every group
// is written to dst + k * count.
static void flatten_arm_unpack8_u16(const unsigned short* src, unsigned short* dst, int count)
{
    unsigned short* lane0 = dst;
    unsigned short* lane1 = dst + count;
    unsigned short* lane2 = dst + count * 2;
    unsigned short* lane3 = dst + count * 3;
    unsigned short* lane4 = dst + count * 4;
    unsigned short* lane5 = dst + count * 5;
    unsigned short* lane6 = dst + count * 6;
    unsigned short* lane7 = dst + count * 7;

    int n = 0;
    for (; n + 3 < count; n += 4)
    {
        // vld4q_u16 splits the 32 loaded values by (index mod 4); within each of the
        // four vectors the even slots then hold components 0..3 and the odd slots
        // components 4..7, which vuzp1q / vuzp2q pull apart
        uint16x8x4_t _in = vld4q_u16(src);
        uint16x8_t _c01 = vuzp1q_u16(_in.val[0], _in.val[1]);
        uint16x8_t _c23 = vuzp1q_u16(_in.val[2], _in.val[3]);
        uint16x8_t _c45 = vuzp2q_u16(_in.val[0], _in.val[1]);
        uint16x8_t _c67 = vuzp2q_u16(_in.val[2], _in.val[3]);
        vst1_u16(lane0, vget_low_u16(_c01));
        vst1_u16(lane1, vget_high_u16(_c01));
        vst1_u16(lane2, vget_low_u16(_c23));
        vst1_u16(lane3, vget_high_u16(_c23));
        vst1_u16(lane4, vget_low_u16(_c45));
        vst1_u16(lane5, vget_high_u16(_c45));
        vst1_u16(lane6, vget_low_u16(_c67));
        vst1_u16(lane7, vget_high_u16(_c67));

        src += 32;
        lane0 += 4;
        lane1 += 4;
        lane2 += 4;
        lane3 += 4;
        lane4 += 4;
        lane5 += 4;
        lane6 += 4;
        lane7 += 4;
    }
    for (; n < count; n++)
    {
        *lane0++ = src[0];
        *lane1++ = src[1];
        *lane2++ = src[2];
        *lane3++ = src[3];
        *lane4++ = src[4];
        *lane5++ = src[5];
        *lane6++ = src[6];
        *lane7++ = src[7];

        src += 8;
    }
}
#endif // NCNN_ARM82

// Deinterleave `count` groups of four 16-bit values starting at `src` into four
// back-to-back runs of `count` values starting at `dst`: component k of every group
// is written to dst + k * count.
static void flatten_arm_unpack4_u16(const unsigned short* src, unsigned short* dst, int count)
{
    unsigned short* lane0 = dst;
    unsigned short* lane1 = dst + count;
    unsigned short* lane2 = dst + count * 2;
    unsigned short* lane3 = dst + count * 3;

    int n = 0;
#if __ARM_NEON
    // vld4_u16 does the 4-way deinterleave for four groups at a time
    for (; n + 3 < count; n += 4)
    {
        uint16x4x4_t _in = vld4_u16(src);
        vst1_u16(lane0, _in.val[0]);
        vst1_u16(lane1, _in.val[1]);
        vst1_u16(lane2, _in.val[2]);
        vst1_u16(lane3, _in.val[3]);

        src += 16;
        lane0 += 4;
        lane1 += 4;
        lane2 += 4;
        lane3 += 4;
    }
#endif // __ARM_NEON
    for (; n < count; n++)
    {
        *lane0++ = src[0];
        *lane1++ = src[1];
        *lane2++ = src[2];
        *lane3++ = src[3];

        src += 4;
    }
}

// Flatten a blob holding 16-bit elements into a 1-D top_blob. The data is only moved
// as unsigned short, never interpreted, so one routine covers both callers' formats.
//
// With packing enabled the output is pack8 (NCNN_ARM82 builds, fp16 storage supported,
// fp16 arithmetic requested, total a multiple of 8), else pack4 when total is a
// multiple of 4; otherwise the base Flatten::forward does the work.
//
// Returns 0 on success, -100 if the output blob could not be allocated.
int Flatten_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int dims = bottom_blob.dims;

    // a 1-D blob is passed through unchanged
    if (dims == 1)
    {
        top_blob = bottom_blob;
        return 0;
    }

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;
    const size_t elemsize = bottom_blob.elemsize;

    // elements per channel, and scalar count of the whole blob
    const int size = w * h * d;
    const int total = size * channels * elempack;

    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        if (total % 4 == 0)
            out_elempack = 4;
#if NCNN_ARM82
        // a multiple of 8 is also a multiple of 4, so this only ever upgrades 4 to 8
        if (support_fp16_storage && opt.use_fp16_arithmetic && total % 8 == 0)
            out_elempack = 8;
#endif
    }
#endif
    const size_t out_elemsize = elemsize / elempack * out_elempack;

    // unpacked output is left to the base implementation
    if (out_elempack == 1)
        return Flatten::forward(bottom_blob, top_blob, opt);

    // from here on out_elempack == 4 || out_elempack == 8

    if (dims == 2 && elempack == 1)
    {
        // no copy loop here: the output is the input Mat with its shape fields rewritten
        top_blob = bottom_blob;
        top_blob.dims = 1;
        top_blob.w = total / out_elempack;
        top_blob.h = 1;
        top_blob.cstep = bottom_blob.cstep / out_elempack;
        top_blob.elemsize = out_elemsize;
        top_blob.elempack = out_elempack;
        return 0;
    }

    top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (dims == 2)
    {
#if NCNN_ARM82
        if (elempack == 8)
        {
            // each pack8 row expands to eight scalar rows of w values
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const unsigned short* src = bottom_blob.row<const unsigned short>(y);
                unsigned short* dst = (unsigned short*)top_blob + w * y * 8;

                flatten_arm_unpack8_u16(src, dst, w);
            }
        }
#endif // NCNN_ARM82

        if (elempack == 4)
        {
            // each pack4 row expands to four scalar rows of w values
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const unsigned short* src = bottom_blob.row<const unsigned short>(y);
                unsigned short* dst = (unsigned short*)top_blob + w * y * 4;

                flatten_arm_unpack4_u16(src, dst, w);
            }
        }
    }

    if (dims == 3 || dims == 4)
    {
#if NCNN_ARM82
        if (elempack == 8)
        {
            // each pack8 channel expands to eight scalar channels of size values
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* src = bottom_blob.channel(q);
                unsigned short* dst = (unsigned short*)top_blob + size * q * 8;

                flatten_arm_unpack8_u16(src, dst, size);
            }
        }
#endif // NCNN_ARM82

        if (elempack == 4)
        {
            // each pack4 channel expands to four scalar channels of size values
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* src = bottom_blob.channel(q);
                unsigned short* dst = (unsigned short*)top_blob + size * q * 4;

                flatten_arm_unpack4_u16(src, dst, size);
            }
        }

        if (elempack == 1)
        {
            // plain per-channel copy into the flat output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* src = bottom_blob.channel(q);
                unsigned short* dst = (unsigned short*)top_blob + size * q;

                int n = 0;
#if __ARM_NEON
                for (; n + 3 < size; n += 4)
                {
                    vst1_u16(dst, vld1_u16(src));

                    src += 4;
                    dst += 4;
                }
#endif
                for (; n < size; n++)
                {
                    *dst++ = *src++;
                }
            }
        }
    }

    return 0;
}

// Flatten an int8 blob into a 1-D blob.
//
// When packing is enabled and the total element count is a multiple of 8 the
// result is written with elempack 8; otherwise the work is delegated to the
// generic Flatten::forward. Returns 0 on success and -100 when the output
// blob could not be allocated.
int Flatten_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int dims = bottom_blob.dims;

    // a 1-D input is already flat
    if (dims == 1)
    {
        top_blob = bottom_blob;
        return 0;
    }

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;
    const size_t elemsize = bottom_blob.elemsize;

    const int size = w * h * d;
    const int total = size * channels * elempack;

    // pack by 8 only if requested and the element count divides evenly
    const int out_elempack = (opt.use_packing_layout && total % 8 == 0) ? 8 : 1;
    const size_t out_elemsize = elemsize / elempack * out_elempack;

    if (out_elempack == 1)
        return Flatten::forward(bottom_blob, top_blob, opt);

    // everything below produces out_elempack == 8
    const bool is_2d = (dims == 2);
    const bool is_3d_or_4d = (dims == 3 || dims == 4);

    if (is_2d && elempack == 1)
    {
        // unpacked 2-D input: take over the input blob and only rewrite its shape fields
        top_blob = bottom_blob;
        top_blob.dims = 1;
        top_blob.w = total / out_elempack;
        top_blob.h = 1;
        top_blob.cstep = bottom_blob.cstep / out_elempack;
        top_blob.elemsize = out_elemsize;
        top_blob.elempack = out_elempack;
        return 0;
    }

    top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (is_2d && elempack == 8)
    {
        // each packed input row y expands into 8 consecutive output runs of length w
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            const signed char* src = bottom_blob.row<const signed char>(y);
            signed char* dst = (signed char*)top_blob + w * y * 8;

            for (int x = 0; x < w; x++)
            {
                for (int lane = 0; lane < 8; lane++)
                {
                    dst[w * lane + x] = src[lane];
                }

                src += 8;
            }
        }
    }

    if (is_3d_or_4d && elempack == 8)
    {
        // each packed input channel q expands into 8 consecutive output runs of length size
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const signed char* src = bottom_blob.channel(q);
            signed char* dst = (signed char*)top_blob + size * q * 8;

            for (int i = 0; i < size; i++)
            {
                for (int lane = 0; lane < 8; lane++)
                {
                    dst[size * lane + i] = src[lane];
                }

                src += 8;
            }
        }
    }

    if (is_3d_or_4d && elempack == 1)
    {
        // unpacked channels are copied back to back into the output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const signed char* src = bottom_blob.channel(q);
            signed char* dst = (signed char*)top_blob + size * q;

            for (int i = 0; i < size; i++)
            {
                dst[i] = src[i];
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/flatten_arm.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_FLATTEN_ARM_H
#define LAYER_FLATTEN_ARM_H

#include "flatten.h"

namespace ncnn {

// ARM specialization of the Flatten layer.
// Overrides forward() and declares dedicated helpers for 16-bit and int8 blobs.
class Flatten_arm : public Flatten
{
public:
    Flatten_arm();

    // Flattens bottom_blob into top_blob.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
    // Flatten path working on 16-bit elements (shared by bf16 and fp16 storage).
    int forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    // Flatten path working on 8-bit elements; emits elempack 8 or falls back to Flatten::forward.
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_FLATTEN_ARM_H


================================================
FILE: src/layer/arm/gelu_arm.cpp
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "gelu_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

// Advertise the capabilities of the ARM GELU implementation.
// create_pipeline() clears all of them again when fast_gelu is not set.
GELU_arm::GELU_arm()
{
#if __ARM_NEON
    support_packing = true;
#if NCNN_ARM82
    // fp16 storage is only offered when the running cpu reports asimdhp support
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif
#endif // __ARM_NEON

#if NCNN_BF16
    support_bf16_storage = true;
#endif
}

// The optimized kernels in this class only cover the fast_gelu formulation.
// Without fast_gelu, forward_inplace() delegates to GELU::forward_inplace(),
// so packing and 16-bit storage support are switched off. Always returns 0.
int GELU_arm::create_pipeline(const Option& /*opt*/)
{
    if (fast_gelu)
        return 0;

    support_packing = false;
    support_fp16_storage = false;
    support_bf16_storage = false;

    return 0;
}

// Apply the tanh-based GELU approximation in place:
//   y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3)))
//
// Non-fast mode is delegated to GELU::forward_inplace. 16-bit blobs are routed
// to the fp16 / bf16 variants when the matching option is enabled; everything
// else is processed here as fp32. Returns the callee's result or 0.
int GELU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    if (!fast_gelu)
        return GELU::forward_inplace(bottom_top_blob, opt);

    const int elembits = bottom_top_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        return opt.use_fp16_arithmetic ? forward_inplace_fp16sa(bottom_top_blob, opt)
               : forward_inplace_fp16s(bottom_top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

    const int channels = bottom_top_blob.c;
    // number of scalar floats per channel, packing lanes included
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        int i = 0;

#if __ARM_NEON
        const float32x4_t _cubic_coeff = vdupq_n_f32(0.044715f * 0.79788452f);
        const float32x4_t _linear_coeff = vdupq_n_f32(0.79788452f);
        const float32x4_t _one = vdupq_n_f32(1.f);
        const float32x4_t _half = vdupq_n_f32(0.5f);

        for (; i + 3 < size; i += 4)
        {
            const float32x4_t _x = vld1q_f32(ptr);

            // x^3, evaluated as x * (x * x)
            const float32x4_t _x3 = vmulq_f32(_x, vmulq_f32(_x, _x));
            // sqrt(2/Pi) * 0.044715 * x^3 + sqrt(2/Pi) * x
            float32x4_t _t = vmlaq_f32(vmulq_f32(_cubic_coeff, _x3), _linear_coeff, _x);
            _t = tanh_ps(_t);
            // 0.5 * ((1 + tanh(...)) * x)
            const float32x4_t _y = vmulq_f32(_half, vmulq_f32(vaddq_f32(_one, _t), _x));

            vst1q_f32(ptr, _y);
            ptr += 4;
        }
#endif
        // scalar remainder
        for (; i < size; i++)
        {
            const float x = *ptr;
            *ptr = 0.5f * x * (1.0f + tanhf(0.79788452f * (x + 0.044715f * x * x * x)));

            ptr++;
        }
    }

    return 0;
}

#if NCNN_BF16
// In-place tanh-approximated GELU for blobs stored as bfloat16.
// Each value is widened to fp32, evaluated with the fp32 formula
//   y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3)))
// and narrowed back to bfloat16. Always returns 0.
int GELU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    // number of 16-bit elements per channel, packing lanes included
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        unsigned short* ptr = bottom_top_blob.channel(q);

        int i = 0;

#if __ARM_NEON
        const float32x4_t _cubic_coeff = vdupq_n_f32(0.044715f * 0.79788452f);
        const float32x4_t _linear_coeff = vdupq_n_f32(0.79788452f);
        const float32x4_t _one = vdupq_n_f32(1.f);
        const float32x4_t _half = vdupq_n_f32(0.5f);

        for (; i + 3 < size; i += 4)
        {
            const float32x4_t _x = bfloat2float(vld1_u16(ptr));

            // x^3, evaluated as x * (x * x)
            const float32x4_t _x3 = vmulq_f32(_x, vmulq_f32(_x, _x));
            float32x4_t _t = vmlaq_f32(vmulq_f32(_cubic_coeff, _x3), _linear_coeff, _x);
            _t = tanh_ps(_t);
            const float32x4_t _y = vmulq_f32(_half, vmulq_f32(vaddq_f32(_one, _t), _x));

            vst1_u16(ptr, float2bfloat(_y));
            ptr += 4;
        }
#endif // __ARM_NEON

        // scalar remainder
        for (; i < size; i++)
        {
            const float x = bfloat16_to_float32(*ptr);
            const float y = 0.5f * x * (1.0f + tanhf(0.79788452f * (x + 0.044715f * x * x * x)));
            *ptr = float32_to_bfloat16(y);
            ptr++;
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/gelu_arm.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_GELU_ARM_H
#define LAYER_GELU_ARM_H

#include "gelu.h"

namespace ncnn {

// ARM specialization of the GELU layer.
// Provides in-place kernels for fp32 and, depending on build flags, fp16 and bf16 storage.
class GELU_arm : public GELU
{
public:
    GELU_arm();

    // Clears the packing / 16-bit storage capabilities when fast_gelu is not set.
    virtual int create_pipeline(const Option& opt);
    // Applies GELU in place, dispatching on element width and options.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 storage, arithmetic carried out in fp32
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    // fp16 storage, arithmetic carried out in fp16
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 storage, arithmetic carried out in fp32
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_GELU_ARM_H


================================================
FILE: src/layer/arm/gelu_arm_asimdhp.cpp
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "gelu_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#include "neon_mathfun.h"
#if NCNN_ARM82
#include "neon_mathfun_fp16s.h"
#endif
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// In-place tanh-approximated GELU for fp16 storage with fp32 arithmetic.
// Values are widened to fp32, evaluated as
//   y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3)))
// and converted back to fp16. Always returns 0.
int GELU_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    // number of fp16 elements per channel, packing lanes included
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = (__fp16*)bottom_top_blob.channel(q);

        const float32x4_t _cubic_coeff = vdupq_n_f32(0.044715f * 0.79788452f);
        const float32x4_t _linear_coeff = vdupq_n_f32(0.79788452f);
        const float32x4_t _one = vdupq_n_f32(1.f);
        const float32x4_t _half = vdupq_n_f32(0.5f);

        int i = 0;

        for (; i + 3 < size; i += 4)
        {
            const float32x4_t _x = vcvt_f32_f16(vld1_f16(ptr));

            // x^3, evaluated as x * (x * x)
            const float32x4_t _x3 = vmulq_f32(_x, vmulq_f32(_x, _x));
            float32x4_t _t = vmlaq_f32(vmulq_f32(_cubic_coeff, _x3), _linear_coeff, _x);
            _t = tanh_ps(_t);
            const float32x4_t _y = vmulq_f32(_half, vmulq_f32(vaddq_f32(_one, _t), _x));

            vst1_f16(ptr, vcvt_f16_f32(_y));
            ptr += 4;
        }

        // scalar remainder
        for (; i < size; i++)
        {
            const float x = (float)*ptr;
            const float y = 0.5f * x * (1.0f + tanhf(0.79788452f * (x + 0.044715f * x * x * x)));
            *ptr = (__fp16)y;
            ptr++;
        }
    }

    return 0;
}

// In-place tanh-approximated GELU for fp16 storage with fp16 arithmetic.
// Processes 8 half-precision values per vector iteration and finishes with a
// scalar remainder loop. Always returns 0.
int GELU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    // number of fp16 elements per channel, packing lanes included
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = (__fp16*)bottom_top_blob.channel(q);

        const float16x8_t _cubic_coeff = vdupq_n_f16(0.044715f * 0.79788452f);
        const float16x8_t _linear_coeff = vdupq_n_f16(0.79788452f);
        const float16x8_t _one = vdupq_n_f16(1.f);
        const float16x8_t _half = vdupq_n_f16(0.5f);

        int i = 0;

        for (; i + 7 < size; i += 8)
        {
            const float16x8_t _x = vld1q_f16(ptr);

            // x^3, evaluated as x * (x * x)
            const float16x8_t _x3 = vmulq_f16(_x, vmulq_f16(_x, _x));
            float16x8_t _t = vfmaq_f16(vmulq_f16(_cubic_coeff, _x3), _linear_coeff, _x);
            _t = tanh_ps_f16(_t);
            const float16x8_t _y = vmulq_f16(_half, vmulq_f16(vaddq_f16(_one, _t), _x));

            vst1q_f16(ptr, _y);
            ptr += 8;
        }

        // scalar remainder
        for (; i < size; i++)
        {
            const __fp16 x = *ptr;
            *ptr = (__fp16)0.5f * x * (__fp16)(1.0f + tanhf((__fp16)0.79788452f * (x + (__fp16)0.044715f * x * x * x)));
            ptr++;
        }
    }

    return 0;
}
#endif

} // namespace ncnn


================================================
FILE: src/layer/arm/gemm_arm.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "gemm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#endif // __ARM_NEON

#include "arm_usability.h"

#include "cpu.h"

namespace ncnn {

#if NCNN_BF16
#include "gemm_bf16s_fp16s.h"
#include "gemm_bf16s.h"
#endif

#if NCNN_INT8
#include "gemm_int8.h"
#if NCNN_BF16
#include "gemm_int8_bf16s.h"
#endif
#endif

// Advertise the capabilities of the ARM Gemm implementation and reset nT.
Gemm_arm::Gemm_arm()
{
#if __ARM_NEON
    support_packing = true;
#if NCNN_VFPV4
    // fp16 storage is only offered when the running cpu reports vfpv4 support
    support_fp16_storage = cpu_support_arm_vfpv4();
#endif
#endif // __ARM_NEON

#if NCNN_BF16
    support_bf16_storage = true;
#endif

    // NOTE(review): presumably a thread count that is filled in later - confirm against create_pipeline
    nT = 0;
}

// Pack a tile of A into the contiguous buffer AT.
//
// The tile covers max_ii rows starting at row i and max_kk values along k
// starting at k. Rows are consumed in groups of 8 (aarch64 only), 4, 2 and 1.
// Within a group the output is k-major: for every k the values of all rows of
// the group are stored next to each other, then the next k follows.
//
//   A       source matrix, elempack 1 or 4; the row stride A_hstep is A.cstep
//           for a 3-D A and A.w otherwise
//   AT      destination buffer, written sequentially from its start
//   i, k    origin of the tile
//   max_ii  number of rows in the tile
//   max_kk  number of k values in the tile
//
// The 2-row and 1-row groups only handle elempack 1.
//
// Declared static like the other tile-packing helpers in this file so the
// symbol keeps internal linkage.
static void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
    const int elempack = A.elempack;
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;

    float* pp = AT;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    // groups of 8 rows
    for (; ii + 7 < max_ii; ii += 8)
    {
        if (elempack == 4)
        {
            // two packed source rows, each contributing 4 floats per k
            const float* p0 = (const float*)A + (i + ii) * A_hstep + k * 4;
            const float* p1 = (const float*)A + (i + ii + 4) * A_hstep + k * 4;

            for (int kk = 0; kk < max_kk; kk++)
            {
                vst1q_f32(pp, vld1q_f32(p0));
                vst1q_f32(pp + 4, vld1q_f32(p1));
                pp += 8;
                p0 += 4;
                p1 += 4;
            }
        }
        if (elempack == 1)
        {
            const float* p0 = (const float*)A + (i + ii) * A_hstep + k;
            const float* p1 = (const float*)A + (i + ii + 1) * A_hstep + k;
            const float* p2 = (const float*)A + (i + ii + 2) * A_hstep + k;
            const float* p3 = (const float*)A + (i + ii + 3) * A_hstep + k;
            const float* p4 = (const float*)A + (i + ii + 4) * A_hstep + k;
            const float* p5 = (const float*)A + (i + ii + 5) * A_hstep + k;
            const float* p6 = (const float*)A + (i + ii + 6) * A_hstep + k;
            const float* p7 = (const float*)A + (i + ii + 7) * A_hstep + k;

            int kk = 0;
            // 8x8 block: load 8 k values from each of the 8 rows, transpose, store 64 floats
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _r0l = vld1q_f32(p0);
                float32x4_t _r0h = vld1q_f32(p0 + 4);
                float32x4_t _r1l = vld1q_f32(p1);
                float32x4_t _r1h = vld1q_f32(p1 + 4);
                float32x4_t _r2l = vld1q_f32(p2);
                float32x4_t _r2h = vld1q_f32(p2 + 4);
                float32x4_t _r3l = vld1q_f32(p3);
                float32x4_t _r3h = vld1q_f32(p3 + 4);
                float32x4_t _r4l = vld1q_f32(p4);
                float32x4_t _r4h = vld1q_f32(p4 + 4);
                float32x4_t _r5l = vld1q_f32(p5);
                float32x4_t _r5h = vld1q_f32(p5 + 4);
                float32x4_t _r6l = vld1q_f32(p6);
                float32x4_t _r6h = vld1q_f32(p6 + 4);
                float32x4_t _r7l = vld1q_f32(p7);
                float32x4_t _r7h = vld1q_f32(p7 + 4);
                transpose8x8_ps(_r0l, _r0h, _r1l, _r1h, _r2l, _r2h, _r3l, _r3h, _r4l, _r4h, _r5l, _r5h, _r6l, _r6h, _r7l, _r7h);
                vst1q_f32(pp, _r0l);
                vst1q_f32(pp + 4, _r0h);
                vst1q_f32(pp + 8, _r1l);
                vst1q_f32(pp + 12, _r1h);
                vst1q_f32(pp + 8 * 2, _r2l);
                vst1q_f32(pp + 8 * 2 + 4, _r2h);
                vst1q_f32(pp + 8 * 3, _r3l);
                vst1q_f32(pp + 8 * 3 + 4, _r3h);
                vst1q_f32(pp + 8 * 4, _r4l);
                vst1q_f32(pp + 8 * 4 + 4, _r4h);
                vst1q_f32(pp + 8 * 5, _r5l);
                vst1q_f32(pp + 8 * 5 + 4, _r5h);
                vst1q_f32(pp + 8 * 6, _r6l);
                vst1q_f32(pp + 8 * 6 + 4, _r6h);
                vst1q_f32(pp + 8 * 7, _r7l);
                vst1q_f32(pp + 8 * 7 + 4, _r7h);
                pp += 64;
                p0 += 8;
                p1 += 8;
                p2 += 8;
                p3 += 8;
                p4 += 8;
                p5 += 8;
                p6 += 8;
                p7 += 8;
            }
            // remaining k values: one value from each of the 8 rows per k
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p1[0];
                pp[2] = p2[0];
                pp[3] = p3[0];
                pp[4] = p4[0];
                pp[5] = p5[0];
                pp[6] = p6[0];
                pp[7] = p7[0];
                pp += 8;
                p0++;
                p1++;
                p2++;
                p3++;
                p4++;
                p5++;
                p6++;
                p7++;
            }
        }
    }
#endif // __aarch64__
    // groups of 4 rows
    for (; ii + 3 < max_ii; ii += 4)
    {
        if (elempack == 4)
        {
            // already laid out as 4 floats per k, plain copy
            const float* p0 = (const float*)A + (i + ii) * A_hstep + k * 4;

            for (int kk = 0; kk < max_kk; kk++)
            {
                vst1q_f32(pp, vld1q_f32(p0));
                pp += 4;
                p0 += 4;
            }
        }
        if (elempack == 1)
        {
            const float* p0 = (const float*)A + (i + ii) * A_hstep + k;
            const float* p1 = (const float*)A + (i + ii + 1) * A_hstep + k;
            const float* p2 = (const float*)A + (i + ii + 2) * A_hstep + k;
            const float* p3 = (const float*)A + (i + ii + 3) * A_hstep + k;

            int kk = 0;
            for (; kk + 3 < max_kk; kk += 4)
            {
                // vst4q interleaves the four row vectors element by element
                float32x4x4_t _r0123;
                _r0123.val[0] = vld1q_f32(p0);
                _r0123.val[1] = vld1q_f32(p1);
                _r0123.val[2] = vld1q_f32(p2);
                _r0123.val[3] = vld1q_f32(p3);
                vst4q_f32(pp, _r0123);
                pp += 16;
                p0 += 4;
                p1 += 4;
                p2 += 4;
                p3 += 4;
            }
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p1[0];
                pp[2] = p2[0];
                pp[3] = p3[0];
                pp += 4;
                p0++;
                p1++;
                p2++;
                p3++;
            }
        }
    }
#endif // __ARM_NEON
    // groups of 2 rows
    for (; ii + 1 < max_ii; ii += 2)
    {
        // if (elempack == 1)
        {
            const float* p0 = (const float*)A + (i + ii) * A_hstep + k;
            const float* p1 = (const float*)A + (i + ii + 1) * A_hstep + k;

            int kk = 0;
#if __ARM_NEON
            for (; kk + 3 < max_kk; kk += 4)
            {
                // vst2q interleaves the two row vectors element by element
                float32x4x2_t _r01;
                _r01.val[0] = vld1q_f32(p0);
                _r01.val[1] = vld1q_f32(p1);
                vst2q_f32(pp, _r01);
                pp += 8;
                p0 += 4;
                p1 += 4;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p1[0];
                pp += 2;
                p0++;
                p1++;
            }
        }
    }
    // single leftover rows: straight copy along k
    for (; ii < max_ii; ii += 1)
    {
        // if (elempack == 1)
        {
            const float* p0 = (const float*)A + (i + ii) * A_hstep + k;

            int kk = 0;
#if __ARM_NEON
            for (; kk + 3 < max_kk; kk += 4)
            {
                vst1q_f32(pp, vld1q_f32(p0));
                pp += 4;
                p0 += 4;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp += 1;
                p0++;
            }
        }
    }
}

// Pack a tile of a transposed A into the contiguous buffer AT.
//
// Compared to pack_A_tile the roles of the two source axes are swapped: k
// selects the source row (stride A_hstep) and i + ii selects the position
// inside that row. The tile covers max_ii positions starting at i and max_kk
// k values starting at k. Positions are consumed in groups of 8 (aarch64
// only), 4, 2 and 1; within a group the output is k-major, i.e. all ii values
// of the group for one k are stored next to each other.
//
// A_hstep is A.cstep for a 3-D A and A.w otherwise. AT is written
// sequentially from its start.
//
// The elempack == 4 branches advance k in steps of 4 only and have no scalar
// remainder loop.
static void transpose_pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
    const int elempack = A.elempack;
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;

    float* pp = AT;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    // groups of 8 positions
    for (; ii + 7 < max_ii; ii += 8)
    {
        if (elempack == 4)
        {
            const float* p0 = (const float*)A + k * A_hstep + (i + ii) * 4;

            int kk = 0;
            for (; kk + 3 < max_kk; kk += 4)
            {
                // vld4q de-interleaves with stride 4: val[n] collects the n-th float
                // of four consecutive 4-float groups
                float32x4x4_t _r0123 = vld4q_f32(p0);
                float32x4x4_t _r4567 = vld4q_f32(p0 + 16);
                vst1q_f32(pp, _r0123.val[0]);
                vst1q_f32(pp + 4, _r4567.val[0]);
                vst1q_f32(pp + 4 * 2, _r0123.val[1]);
                vst1q_f32(pp + 4 * 3, _r4567.val[1]);
                vst1q_f32(pp + 4 * 4, _r0123.val[2]);
                vst1q_f32(pp + 4 * 5, _r4567.val[2]);
                vst1q_f32(pp + 4 * 6, _r0123.val[3]);
                vst1q_f32(pp + 4 * 7, _r4567.val[3]);
                pp += 32;
                p0 += A_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            const float* p0 = (const float*)A + k * A_hstep + (i + ii);

            int kk = 0;
            // the 8 values for one k are already contiguous in the source row
            for (; kk < max_kk; kk++)
            {
                vst1q_f32(pp, vld1q_f32(p0));
                vst1q_f32(pp + 4, vld1q_f32(p0 + 4));
                pp += 8;
                p0 += A_hstep;
            }
        }
    }
#endif // __aarch64__
    // groups of 4 positions
    for (; ii + 3 < max_ii; ii += 4)
    {
        if (elempack == 4)
        {
            const float* p0 = (const float*)A + k * A_hstep + (i + ii) * 4;

            int kk = 0;
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4x4_t _r0123 = vld4q_f32(p0);
                vst1q_f32(pp, _r0123.val[0]);
                vst1q_f32(pp + 4, _r0123.val[1]);
                vst1q_f32(pp + 4 * 2, _r0123.val[2]);
                vst1q_f32(pp + 4 * 3, _r0123.val[3]);
                pp += 16;
                p0 += A_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            const float* p0 = (const float*)A + k * A_hstep + (i + ii);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                vst1q_f32(pp, vld1q_f32(p0));
                pp += 4;
                p0 += A_hstep;
            }
        }
    }
#endif // __ARM_NEON
    // groups of 2 positions
    for (; ii + 1 < max_ii; ii += 2)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
            const float* p0 = (const float*)A + k * A_hstep + (i + ii) * 4;

            int kk = 0;
            for (; kk + 3 < max_kk; kk += 4)
            {
                // vst2q interleaves the two 4-float groups element by element
                float32x4x2_t _r01;
                _r01.val[0] = vld1q_f32(p0);
                _r01.val[1] = vld1q_f32(p0 + 4);
                vst2q_f32(pp, _r01);
                pp += 8;
                p0 += A_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            const float* p0 = (const float*)A + k * A_hstep + (i + ii);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p0[1];
                pp += 2;
                p0 += A_hstep;
            }
        }
    }
    // single leftover positions
    for (; ii < max_ii; ii += 1)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
            const float* p0 = (const float*)A + k * A_hstep + (i + ii) * 4;

            int kk = 0;
            for (; kk + 3 < max_kk; kk += 4)
            {
                vst1q_f32(pp, vld1q_f32(p0));
                pp += 4;
                p0 += A_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            const float* p0 = (const float*)A + k * A_hstep + (i + ii);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp += 1;
                p0 += A_hstep;
            }
        }
    }
}

// Pack a tile of B into the contiguous buffer BT.
//
// The tile covers max_jj rows starting at row j and max_kk values along k
// starting at k. Rows are consumed in groups of 12 (aarch64 only), 8, 4, 2
// and 1. Within a group the output is k-major: for every k the values of all
// rows of the group are stored next to each other, then the next k follows.
//
//   B       source matrix, elempack 1 or 4; the row stride B_hstep is B.cstep
//           for a 3-D B and B.w otherwise
//   BT      destination buffer, written sequentially from its start
//   j, k    origin of the tile
//   max_jj  number of rows in the tile
//   max_kk  number of k values in the tile
//
// The 2-row and 1-row groups only handle elempack 1.
static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk)
{
    const int elempack = B.elempack;
    const size_t B_hstep = B.dims == 3 ? B.cstep : (size_t)B.w;

    float* pp = BT;

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    // groups of 12 rows
    for (; jj + 11 < max_jj; jj += 12)
    {
        if (elempack == 4)
        {
            // three packed source rows, each contributing 4 floats per k
            const float* p0 = (const float*)B + (j + jj) * B_hstep + k * 4;
            const float* p1 = (const float*)B + (j + jj + 4) * B_hstep + k * 4;
            const float* p2 = (const float*)B + (j + jj + 8) * B_hstep + k * 4;

            for (int kk = 0; kk < max_kk; kk++)
            {
                vst1q_f32(pp, vld1q_f32(p0));
                vst1q_f32(pp + 4, vld1q_f32(p1));
                vst1q_f32(pp + 8, vld1q_f32(p2));
                pp += 12;
                p0 += 4;
                p1 += 4;
                p2 += 4;
            }
        }
        if (elempack == 1)
        {
            const float* p0 = (const float*)B + (j + jj) * B_hstep + k;
            const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k;
            const float* p2 = (const float*)B + (j + jj + 2) * B_hstep + k;
            const float* p3 = (const float*)B + (j + jj + 3) * B_hstep + k;
            const float* p4 = (const float*)B + (j + jj + 4) * B_hstep + k;
            const float* p5 = (const float*)B + (j + jj + 5) * B_hstep + k;
            const float* p6 = (const float*)B + (j + jj + 6) * B_hstep + k;
            const float* p7 = (const float*)B + (j + jj + 7) * B_hstep + k;
            const float* p8 = (const float*)B + (j + jj + 8) * B_hstep + k;
            const float* p9 = (const float*)B + (j + jj + 9) * B_hstep + k;
            const float* pa = (const float*)B + (j + jj + 10) * B_hstep + k;
            const float* pb = (const float*)B + (j + jj + 11) * B_hstep + k;

            int kk = 0;
            // 12x4 block: three 4x4 transposes, then the results are stored
            // so that each k gets its 12 row values back to back
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _r0 = vld1q_f32(p0);
                float32x4_t _r1 = vld1q_f32(p1);
                float32x4_t _r2 = vld1q_f32(p2);
                float32x4_t _r3 = vld1q_f32(p3);
                float32x4_t _r4 = vld1q_f32(p4);
                float32x4_t _r5 = vld1q_f32(p5);
                float32x4_t _r6 = vld1q_f32(p6);
                float32x4_t _r7 = vld1q_f32(p7);
                float32x4_t _r8 = vld1q_f32(p8);
                float32x4_t _r9 = vld1q_f32(p9);
                float32x4_t _ra = vld1q_f32(pa);
                float32x4_t _rb = vld1q_f32(pb);

                transpose4x4_ps(_r0, _r1, _r2, _r3);
                transpose4x4_ps(_r4, _r5, _r6, _r7);
                transpose4x4_ps(_r8, _r9, _ra, _rb);

                vst1q_f32(pp, _r0);
                vst1q_f32(pp + 4, _r4);
                vst1q_f32(pp + 4 * 2, _r8);
                vst1q_f32(pp + 4 * 3, _r1);
                vst1q_f32(pp + 4 * 4, _r5);
                vst1q_f32(pp + 4 * 5, _r9);
                vst1q_f32(pp + 4 * 6, _r2);
                vst1q_f32(pp + 4 * 7, _r6);
                vst1q_f32(pp + 4 * 8, _ra);
                vst1q_f32(pp + 4 * 9, _r3);
                vst1q_f32(pp + 4 * 10, _r7);
                vst1q_f32(pp + 4 * 11, _rb);
                pp += 48;
                p0 += 4;
                p1 += 4;
                p2 += 4;
                p3 += 4;
                p4 += 4;
                p5 += 4;
                p6 += 4;
                p7 += 4;
                p8 += 4;
                p9 += 4;
                pa += 4;
                pb += 4;
            }
            // remaining k values: one value from each of the 12 rows per k
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p1[0];
                pp[2] = p2[0];
                pp[3] = p3[0];
                pp[4] = p4[0];
                pp[5] = p5[0];
                pp[6] = p6[0];
                pp[7] = p7[0];
                pp[8] = p8[0];
                pp[9] = p9[0];
                pp[10] = pa[0];
                pp[11] = pb[0];
                pp += 12;
                p0++;
                p1++;
                p2++;
                p3++;
                p4++;
                p5++;
                p6++;
                p7++;
                p8++;
                p9++;
                pa++;
                pb++;
            }
        }
    }
#endif // __aarch64__
    // groups of 8 rows
    for (; jj + 7 < max_jj; jj += 8)
    {
        if (elempack == 4)
        {
            // two packed source rows, each contributing 4 floats per k
            const float* p0 = (const float*)B + (j + jj) * B_hstep + k * 4;
            const float* p1 = (const float*)B + (j + jj + 4) * B_hstep + k * 4;

            for (int kk = 0; kk < max_kk; kk++)
            {
                vst1q_f32(pp, vld1q_f32(p0));
                vst1q_f32(pp + 4, vld1q_f32(p1));
                pp += 8;
                p0 += 4;
                p1 += 4;
            }
        }
        if (elempack == 1)
        {
            const float* p0 = (const float*)B + (j + jj) * B_hstep + k;
            const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k;
            const float* p2 = (const float*)B + (j + jj + 2) * B_hstep + k;
            const float* p3 = (const float*)B + (j + jj + 3) * B_hstep + k;
            const float* p4 = (const float*)B + (j + jj + 4) * B_hstep + k;
            const float* p5 = (const float*)B + (j + jj + 5) * B_hstep + k;
            const float* p6 = (const float*)B + (j + jj + 6) * B_hstep + k;
            const float* p7 = (const float*)B + (j + jj + 7) * B_hstep + k;

            int kk = 0;
            // 8x4 block: two 4x4 transposes, stored so each k gets its 8 row values back to back
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _r0 = vld1q_f32(p0);
                float32x4_t _r1 = vld1q_f32(p1);
                float32x4_t _r2 = vld1q_f32(p2);
                float32x4_t _r3 = vld1q_f32(p3);
                float32x4_t _r4 = vld1q_f32(p4);
                float32x4_t _r5 = vld1q_f32(p5);
                float32x4_t _r6 = vld1q_f32(p6);
                float32x4_t _r7 = vld1q_f32(p7);

                transpose4x4_ps(_r0, _r1, _r2, _r3);
                transpose4x4_ps(_r4, _r5, _r6, _r7);

                vst1q_f32(pp, _r0);
                vst1q_f32(pp + 4, _r4);
                vst1q_f32(pp + 4 * 2, _r1);
                vst1q_f32(pp + 4 * 3, _r5);
                vst1q_f32(pp + 4 * 4, _r2);
                vst1q_f32(pp + 4 * 5, _r6);
                vst1q_f32(pp + 4 * 6, _r3);
                vst1q_f32(pp + 4 * 7, _r7);
                pp += 32;
                p0 += 4;
                p1 += 4;
                p2 += 4;
                p3 += 4;
                p4 += 4;
                p5 += 4;
                p6 += 4;
                p7 += 4;
            }
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p1[0];
                pp[2] = p2[0];
                pp[3] = p3[0];
                pp[4] = p4[0];
                pp[5] = p5[0];
                pp[6] = p6[0];
                pp[7] = p7[0];
                pp += 8;
                p0++;
                p1++;
                p2++;
                p3++;
                p4++;
                p5++;
                p6++;
                p7++;
            }
        }
    }
    // groups of 4 rows
    for (; jj + 3 < max_jj; jj += 4)
    {
        if (elempack == 4)
        {
            // already laid out as 4 floats per k, plain copy
            const float* p0 = (const float*)B + (j + jj) * B_hstep + k * 4;

            for (int kk = 0; kk < max_kk; kk++)
            {
                vst1q_f32(pp, vld1q_f32(p0));
                pp += 4;
                p0 += 4;
            }
        }
        if (elempack == 1)
        {
            const float* p0 = (const float*)B + (j + jj) * B_hstep + k;
            const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k;
            const float* p2 = (const float*)B + (j + jj + 2) * B_hstep + k;
            const float* p3 = (const float*)B + (j + jj + 3) * B_hstep + k;

            int kk = 0;
            for (; kk + 3 < max_kk; kk += 4)
            {
                // vst4q interleaves the four row vectors element by element
                float32x4x4_t _r0123;
                _r0123.val[0] = vld1q_f32(p0);
                _r0123.val[1] = vld1q_f32(p1);
                _r0123.val[2] = vld1q_f32(p2);
                _r0123.val[3] = vld1q_f32(p3);
                vst4q_f32(pp, _r0123);
                pp += 16;
                p0 += 4;
                p1 += 4;
                p2 += 4;
                p3 += 4;
            }
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p1[0];
                pp[2] = p2[0];
                pp[3] = p3[0];
                pp += 4;
                p0++;
                p1++;
                p2++;
                p3++;
            }
        }
    }
#endif // __ARM_NEON
    // groups of 2 rows
    for (; jj + 1 < max_jj; jj += 2)
    {
        // if (elempack == 1)
        {
            const float* p0 = (const float*)B + (j + jj) * B_hstep + k;
            const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k;

            int kk = 0;
#if __ARM_NEON
            for (; kk + 3 < max_kk; kk += 4)
            {
                // vst2q interleaves the two row vectors element by element
                float32x4x2_t _r01;
                _r01.val[0] = vld1q_f32(p0);
                _r01.val[1] = vld1q_f32(p1);
                vst2q_f32(pp, _r01);
                pp += 8;
                p0 += 4;
                p1 += 4;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p1[0];
                pp += 2;
                p0++;
                p1++;
            }
        }
    }
    // single leftover rows: straight copy along k
    for (; jj < max_jj; jj += 1)
    {
        // if (elempack == 1)
        {
            const float* p0 = (const float*)B + (j + jj) * B_hstep + k;

            int kk = 0;
#if __ARM_NEON
            for (; kk + 3 < max_kk; kk += 4)
            {
                vst1q_f32(pp, vld1q_f32(p0));
                pp += 4;
                p0 += 4;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp += 1;
                p0++;
            }
        }
    }
}

// Pack a tile of B into BT, transposing it on the way.
//
// Reading starts at float offset k * hstep + (j + jj) * elempack inside B,
// where hstep is B.cstep for 3-dim mats and B.w otherwise. BT is written
// strictly sequentially from its first element.
//
// Columns (jj) are consumed in groups of 12 (aarch64 only), 8 and 4 (NEON
// only), then 2 and 1. For every group of width W, each k step emits W
// consecutive floats, one per column of the group.
//
//   elempack == 1: each k step copies W adjacent floats and then advances the
//                  source by hstep.
//   elempack == 4: every column is a pack of 4 floats; the packs are
//                  de-interleaved so the output layout matches elempack == 1.
//                  Only max_kk / 4 whole quads are processed, and the source
//                  advances by hstep * 4 per quad.
//
// Without NEON only the elempack == 1 paths of the 2-wide and 1-wide groups
// are compiled.
static void transpose_pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk)
{
    const int elempack = B.elempack;
    const size_t hstep = B.dims == 3 ? B.cstep : (size_t)B.w;

    const float* const src_base = (const float*)B;
    float* dst = BT;

#if __ARM_NEON
    // number of complete groups of four k steps, used by the elempack == 4 paths
    const int kk_quads = max_kk / 4;
#endif // __ARM_NEON

    int col = 0;
#if __ARM_NEON
#if __aarch64__
    // ---- 12 columns at a time ----
    for (; col + 11 < max_jj; col += 12)
    {
        if (elempack == 4)
        {
            const float* src = src_base + k * hstep + (j + col) * 4;

            for (int q = 0; q < kk_quads; q++)
            {
                // val[n] gathers element n of four adjacent 4-float packs
                const float32x4x4_t _c0123 = vld4q_f32(src);
                const float32x4x4_t _c4567 = vld4q_f32(src + 16);
                const float32x4x4_t _c89ab = vld4q_f32(src + 32);

                vst1q_f32(dst, _c0123.val[0]);
                vst1q_f32(dst + 4, _c4567.val[0]);
                vst1q_f32(dst + 8, _c89ab.val[0]);
                dst += 12;

                vst1q_f32(dst, _c0123.val[1]);
                vst1q_f32(dst + 4, _c4567.val[1]);
                vst1q_f32(dst + 8, _c89ab.val[1]);
                dst += 12;

                vst1q_f32(dst, _c0123.val[2]);
                vst1q_f32(dst + 4, _c4567.val[2]);
                vst1q_f32(dst + 8, _c89ab.val[2]);
                dst += 12;

                vst1q_f32(dst, _c0123.val[3]);
                vst1q_f32(dst + 4, _c4567.val[3]);
                vst1q_f32(dst + 8, _c89ab.val[3]);
                dst += 12;

                src += hstep * 4;
            }
        }
        if (elempack == 1)
        {
            const float* src = src_base + k * hstep + (j + col);

            for (int step = 0; step < max_kk; step++)
            {
                vst1q_f32(dst, vld1q_f32(src));
                vst1q_f32(dst + 4, vld1q_f32(src + 4));
                vst1q_f32(dst + 8, vld1q_f32(src + 8));
                dst += 12;
                src += hstep;
            }
        }
    }
#endif // __aarch64__
    // ---- 8 columns at a time ----
    for (; col + 7 < max_jj; col += 8)
    {
        if (elempack == 4)
        {
            const float* src = src_base + k * hstep + (j + col) * 4;

            for (int q = 0; q < kk_quads; q++)
            {
                const float32x4x4_t _c0123 = vld4q_f32(src);
                const float32x4x4_t _c4567 = vld4q_f32(src + 16);

                vst1q_f32(dst, _c0123.val[0]);
                vst1q_f32(dst + 4, _c4567.val[0]);
                dst += 8;

                vst1q_f32(dst, _c0123.val[1]);
                vst1q_f32(dst + 4, _c4567.val[1]);
                dst += 8;

                vst1q_f32(dst, _c0123.val[2]);
                vst1q_f32(dst + 4, _c4567.val[2]);
                dst += 8;

                vst1q_f32(dst, _c0123.val[3]);
                vst1q_f32(dst + 4, _c4567.val[3]);
                dst += 8;

                src += hstep * 4;
            }
        }
        if (elempack == 1)
        {
            const float* src = src_base + k * hstep + (j + col);

            for (int step = 0; step < max_kk; step++)
            {
                vst1q_f32(dst, vld1q_f32(src));
                vst1q_f32(dst + 4, vld1q_f32(src + 4));
                dst += 8;
                src += hstep;
            }
        }
    }
    // ---- 4 columns at a time ----
    for (; col + 3 < max_jj; col += 4)
    {
        if (elempack == 4)
        {
            const float* src = src_base + k * hstep + (j + col) * 4;

            for (int q = 0; q < kk_quads; q++)
            {
                const float32x4x4_t _c0123 = vld4q_f32(src);

                vst1q_f32(dst, _c0123.val[0]);
                vst1q_f32(dst + 4, _c0123.val[1]);
                vst1q_f32(dst + 8, _c0123.val[2]);
                vst1q_f32(dst + 12, _c0123.val[3]);
                dst += 16;

                src += hstep * 4;
            }
        }
        if (elempack == 1)
        {
            const float* src = src_base + k * hstep + (j + col);

            for (int step = 0; step < max_kk; step++)
            {
                vst1q_f32(dst, vld1q_f32(src));
                dst += 4;
                src += hstep;
            }
        }
    }
#endif // __ARM_NEON
    // ---- 2 columns at a time ----
    for (; col + 1 < max_jj; col += 2)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
            const float* src = src_base + k * hstep + (j + col) * 4;

            for (int q = 0; q < kk_quads; q++)
            {
                // the interleaving store zips the two packs element by element
                float32x4x2_t _pair;
                _pair.val[0] = vld1q_f32(src);
                _pair.val[1] = vld1q_f32(src + 4);
                vst2q_f32(dst, _pair);
                dst += 8;
                src += hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            const float* src = src_base + k * hstep + (j + col);

            for (int step = 0; step < max_kk; step++)
            {
                *dst++ = src[0];
                *dst++ = src[1];
                src += hstep;
            }
        }
    }
    // ---- remaining single columns ----
    for (; col < max_jj; col++)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
            const float* src = src_base + k * hstep + (j + col) * 4;

            for (int q = 0; q < kk_quads; q++)
            {
                vst1q_f32(dst, vld1q_f32(src));
                dst += 4;
                src += hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            const float* src = src_base + k * hstep + (j + col);

            for (int step = 0; step < max_kk; step++)
            {
                *dst++ = *src;
                src += hstep;
            }
        }
    }
}

// Scatter the sequentially stored tile topT into top_blob, transposing it.
//
// topT is read strictly sequentially from its first element. Writing starts
// at float offset j * hstep + (i + ii) * out_elempack inside top_blob, where
// hstep is top_blob.cstep for 3-dim mats and top_blob.w otherwise.
//
// The ii range is consumed in groups of 8 (aarch64 only), 4 (NEON only), then
// 2 and 1. For a group of width W, every jj step consumes W consecutive
// floats from topT.
//
//   out_elempack == 1: each jj step stores W adjacent floats and then advances
//                      the destination by hstep.
//   out_elempack == 4: four successive jj steps are interleaved into one
//                      4-float pack per ii. Only max_jj / 4 whole quads are
//                      processed, and the destination advances by hstep * 4
//                      per quad.
//
// Without NEON only the out_elempack == 1 paths of the 2-wide and 1-wide
// groups are compiled.
static void transpose_unpack_output_tile(const Mat& topT, Mat& top_blob, int i, int max_ii, int j, int max_jj)
{
    const int out_elempack = top_blob.elempack;
    const size_t hstep = top_blob.dims == 3 ? top_blob.cstep : (size_t)top_blob.w;

    const float* src = topT;
    float* const out_base = (float*)top_blob;

#if __ARM_NEON
    // number of complete groups of four jj steps, used by the out_elempack == 4 paths
    const int jj_quads = max_jj / 4;
#endif // __ARM_NEON

    int row = 0;
#if __ARM_NEON
#if __aarch64__
    // ---- 8 ii at a time ----
    for (; row + 7 < max_ii; row += 8)
    {
        if (out_elempack == 4)
        {
            float* dst = out_base + j * hstep + (i + row) * 4;

            for (int q = 0; q < jj_quads; q++)
            {
                // _lo collects ii 0..3 and _hi collects ii 4..7 for four jj steps
                float32x4x4_t _lo;
                float32x4x4_t _hi;
                _lo.val[0] = vld1q_f32(src);
                _hi.val[0] = vld1q_f32(src + 4);
                src += 8;
                _lo.val[1] = vld1q_f32(src);
                _hi.val[1] = vld1q_f32(src + 4);
                src += 8;
                _lo.val[2] = vld1q_f32(src);
                _hi.val[2] = vld1q_f32(src + 4);
                src += 8;
                _lo.val[3] = vld1q_f32(src);
                _hi.val[3] = vld1q_f32(src + 4);
                src += 8;

                // the interleaving store turns each ii into one 4-float pack
                vst4q_f32(dst, _lo);
                vst4q_f32(dst + 16, _hi);
                dst += hstep * 4;
            }
        }
        if (out_elempack == 1)
        {
            float* dst = out_base + j * hstep + (i + row);

            for (int step = 0; step < max_jj; step++)
            {
                const float32x4_t _a = vld1q_f32(src);
                const float32x4_t _b = vld1q_f32(src + 4);
                vst1q_f32(dst, _a);
                vst1q_f32(dst + 4, _b);
                src += 8;
                dst += hstep;
            }
        }
    }
#endif // __aarch64__
    // ---- 4 ii at a time ----
    for (; row + 3 < max_ii; row += 4)
    {
        if (out_elempack == 4)
        {
            float* dst = out_base + j * hstep + (i + row) * 4;

            for (int q = 0; q < jj_quads; q++)
            {
                float32x4x4_t _quad;
                _quad.val[0] = vld1q_f32(src);
                _quad.val[1] = vld1q_f32(src + 4);
                _quad.val[2] = vld1q_f32(src + 8);
                _quad.val[3] = vld1q_f32(src + 12);
                vst4q_f32(dst, _quad);
                src += 16;
                dst += hstep * 4;
            }
        }
        if (out_elempack == 1)
        {
            float* dst = out_base + j * hstep + (i + row);

            for (int step = 0; step < max_jj; step++)
            {
                const float32x4_t _a = vld1q_f32(src);
                vst1q_f32(dst, _a);
                src += 4;
                dst += hstep;
            }
        }
    }
#endif // __ARM_NEON
    // ---- 2 ii at a time ----
    for (; row + 1 < max_ii; row += 2)
    {
#if __ARM_NEON
        if (out_elempack == 4)
        {
            float* dst = out_base + j * hstep + (i + row) * 4;

            for (int q = 0; q < jj_quads; q++)
            {
                // even source slots fill the first pack, odd slots the second
                for (int n = 0; n < 4; n++)
                {
                    dst[n] = src[2 * n];
                }
                for (int n = 0; n < 4; n++)
                {
                    dst[4 + n] = src[2 * n + 1];
                }
                src += 8;
                dst += hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (out_elempack == 1)
        {
            float* dst = out_base + j * hstep + (i + row);

            for (int step = 0; step < max_jj; step++)
            {
                dst[0] = *src++;
                dst[1] = *src++;
                dst += hstep;
            }
        }
    }
    // ---- remaining single ii ----
    for (; row < max_ii; row++)
    {
#if __ARM_NEON
        if (out_elempack == 4)
        {
            float* dst = out_base + j * hstep + (i + row) * 4;

            for (int q = 0; q < jj_quads; q++)
            {
                const float32x4_t _a = vld1q_f32(src);
                vst1q_f32(dst, _a);
                src += 4;
                dst += hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (out_elempack == 1)
        {
            float* dst = out_base + j * hstep + (i + row);

            for (int step = 0; step < max_jj; step++)
            {
                *dst = *src++;
                dst += hstep;
            }
        }
    }
}

static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, const Mat& CT_tile, Mat& topT_tile, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, int k, int max_kk, bool k_end)
{
    const int out_elempack = top_blob.elempack;
    const size_t out_hstep = top_blob.dims == 3 ? top_blob.cstep : (size_t)top_blob.w;

    const float* pAT = AT_tile;
    const float* pBT = BT_tile;
    const float* pC = CT_tile;

    float* outptr = topT_tile;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    for (; ii + 7 < max_ii; ii += 8)
    {
        float* outptr0 = (float*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        const float* pB = pBT;

        if (pC)
        {
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)CT_tile + i + ii;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)CT_tile + j;
            }
        }

        int jj = 0;
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum20;
            float32x4_t _sum21;
            float32x4_t _sum30;
            float32x4_t _sum31;
            float32x4_t _sum40;
            float32x4_t _sum41;
            float32x4_t _sum50;
            float32x4_t _sum51;
            float32x4_t _sum60;
            float32x4_t _sum61;
            float32x4_t _sum70;
            float32x4_t _sum71;
            float32x4_t _sum80;
            float32x4_t _sum81;
            float32x4_t _sum90;
            float32x4_t _sum91;
            float32x4_t _suma0;
            float32x4_t _suma1;
            float32x4_t _sumb0;
            float32x4_t _sumb1;

            if (k == 0)
            {
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);
                _sum20 = vdupq_n_f32(0.f);
                _sum21 = vdupq_n_f32(0.f);
                _sum30 = vdupq_n_f32(0.f);
                _sum31 = vdupq_n_f32(0.f);
                _sum40 = vdupq_n_f32(0.f);
                _sum41 = vdupq_n_f32(0.f);
                _sum50 = vdupq_n_f32(0.f);
                _sum51 = vdupq_n_f32(0.f);
                _sum60 = vdupq_n_f32(0.f);
                _sum61 = vdupq_n_f32(0.f);
                _sum70 = vdupq_n_f32(0.f);
                _sum71 = vdupq_n_f32(0.f);
                _sum80 = vdupq_n_f32(0.f);
                _sum81 = vdupq_n_f32(0.f);
                _sum90 = vdupq_n_f32(0.f);
                _sum91 = vdupq_n_f32(0.f);
                _suma0 = vdupq_n_f32(0.f);
                _suma1 = vdupq_n_f32(0.f);
                _sumb0 = vdupq_n_f32(0.f);
                _sumb1 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[0]);
                        _sum11 = vdupq_n_f32(pC[0]);
                        _sum20 = vdupq_n_f32(pC[0]);
                        _sum21 = vdupq_n_f32(pC[0]);
                        _sum30 = vdupq_n_f32(pC[0]);
                        _sum31 = vdupq_n_f32(pC[0]);
                        _sum40 = vdupq_n_f32(pC[0]);
                        _sum41 = vdupq_n_f32(pC[0]);
                        _sum50 = vdupq_n_f32(pC[0]);
                        _sum51 = vdupq_n_f32(pC[0]);
                        _sum60 = vdupq_n_f32(pC[0]);
                        _sum61 = vdupq_n_f32(pC[0]);
                        _sum70 = vdupq_n_f32(pC[0]);
                        _sum71 = vdupq_n_f32(pC[0]);
                        _sum80 = vdupq_n_f32(pC[0]);
                        _sum81 = vdupq_n_f32(pC[0]);
                        _sum90 = vdupq_n_f32(pC[0]);
                        _sum91 = vdupq_n_f32(pC[0]);
                        _suma0 = vdupq_n_f32(pC[0]);
                        _suma1 = vdupq_n_f32(pC[0]);
                        _sumb0 = vdupq_n_f32(pC[0]);
                        _sumb1 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        _sum20 = _sum00;
                        _sum21 = _sum01;
                        _sum30 = _sum00;
                        _sum31 = _sum01;
                        _sum40 = _sum00;
                        _sum41 = _sum01;
                        _sum50 = _sum00;
                        _sum51 = _sum01;
                        _sum60 = _sum00;
                        _sum61 = _sum01;
                        _sum70 = _sum00;
                        _sum71 = _sum01;
                        _sum80 = _sum00;
                        _sum81 = _sum01;
                        _sum90 = _sum00;
                        _sum91 = _sum01;
                        _suma0 = _sum00;
                        _suma1 = _sum01;
                        _sumb0 = _sum00;
                        _sumb1 = _sum01;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4 * 1);
                        _sum10 = vld1q_f32(pC + 4 * 2);
                        _sum11 = vld1q_f32(pC + 4 * 3);
                        _sum20 = vld1q_f32(pC + 4 * 4);
                        _sum21 = vld1q_f32(pC + 4 * 5);
                        _sum30 = vld1q_f32(pC + 4 * 6);
                        _sum31 = vld1q_f32(pC + 4 * 7);
                        _sum40 = vld1q_f32(pC + 4 * 8);
                        _sum41 = vld1q_f32(pC + 4 * 9);
                        _sum50 = vld1q_f32(pC + 4 * 10);
                        _sum51 = vld1q_f32(pC + 4 * 11);
                        _sum60 = vld1q_f32(pC + 4 * 12);
                        _sum61 = vld1q_f32(pC + 4 * 13);
                        _sum70 = vld1q_f32(pC + 4 * 14);
                        _sum71 = vld1q_f32(pC + 4 * 15);
                        _sum80 = vld1q_f32(pC + 4 * 16);
                        _sum81 = vld1q_f32(pC + 4 * 17);
                        _sum90 = vld1q_f32(pC + 4 * 18);
                        _sum91 = vld1q_f32(pC + 4 * 19);
                        _suma0 = vld1q_f32(pC + 4 * 20);
                        _suma1 = vld1q_f32(pC + 4 * 21);
                        _sumb0 = vld1q_f32(pC + 4 * 22);
                        _sumb1 = vld1q_f32(pC + 4 * 23);
                        pC += 96;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum20 = vdupq_n_f32(pC[2]);
                        _sum30 = vdupq_n_f32(pC[3]);
                        _sum40 = vdupq_n_f32(pC[4]);
                        _sum50 = vdupq_n_f32(pC[5]);
                        _sum60 = vdupq_n_f32(pC[6]);
                        _sum70 = vdupq_n_f32(pC[7]);
                        _sum80 = vdupq_n_f32(pC[8]);
                        _sum90 = vdupq_n_f32(pC[9]);
                        _suma0 = vdupq_n_f32(pC[10]);
                        _sumb0 = vdupq_n_f32(pC[11]);
                        _sum01 = _sum00;
                        _sum11 = _sum10;
                        _sum21 = _sum20;
                        _sum31 = _sum30;
                        _sum41 = _sum40;
                        _sum51 = _sum50;
                        _sum61 = _sum60;
                        _sum71 = _sum70;
                        _sum81 = _sum80;
                        _sum91 = _sum90;
                        _suma1 = _suma0;
                        _sumb1 = _sumb0;
                        pC += 12;
                    }
                }
            }
            else
            {
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
                _sum20 = vld1q_f32(outptr + 4 * 4);
                _sum21 = vld1q_f32(outptr + 4 * 5);
                _sum30 = vld1q_f32(outptr + 4 * 6);
                _sum31 = vld1q_f32(outptr + 4 * 7);
                _sum40 = vld1q_f32(outptr + 4 * 8);
                _sum41 = vld1q_f32(outptr + 4 * 9);
                _sum50 = vld1q_f32(outptr + 4 * 10);
                _sum51 = vld1q_f32(outptr + 4 * 11);
                _sum60 = vld1q_f32(outptr + 4 * 12);
                _sum61 = vld1q_f32(outptr + 4 * 13);
                _sum70 = vld1q_f32(outptr + 4 * 14);
                _sum71 = vld1q_f32(outptr + 4 * 15);
                _sum80 = vld1q_f32(outptr + 4 * 16);
                _sum81 = vld1q_f32(outptr + 4 * 17);
                _sum90 = vld1q_f32(outptr + 4 * 18);
                _sum91 = vld1q_f32(outptr + 4 * 19);
                _suma0 = vld1q_f32(outptr + 4 * 20);
                _suma1 = vld1q_f32(outptr + 4 * 21);
                _sumb0 = vld1q_f32(outptr + 4 * 22);
                _sumb1 = vld1q_f32(outptr + 4 * 23);
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _pA0 = vld1q_f32(pA);
                float32x4_t _pA1 = vld1q_f32(pA + 4);

                float32x4_t _pB0 = vld1q_f32(pB);
                float32x4_t _pB1 = vld1q_f32(pB + 4);
                float32x4_t _pB2 = vld1q_f32(pB + 8);

                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
                _sum40 = vfmaq_laneq_f32(_sum40, _pA0, _pB1, 0);
                _sum41 = vfmaq_laneq_f32(_sum41, _pA1, _pB1, 0);
                _sum50 = vfmaq_laneq_f32(_sum50, _pA0, _pB1, 1);
                _sum51 = vfmaq_laneq_f32(_sum51, _pA1, _pB1, 1);
                _sum60 = vfmaq_laneq_f32(_sum60, _pA0, _pB1, 2);
                _sum61 = vfmaq_laneq_f32(_sum61, _pA1, _pB1, 2);
                _sum70 = vfmaq_laneq_f32(_sum70, _pA0, _pB1, 3);
                _sum71 = vfmaq_laneq_f32(_sum71, _pA1, _pB1, 3);
                _sum80 = vfmaq_laneq_f32(_sum80, _pA0, _pB2, 0);
                _sum81 = vfmaq_laneq_f32(_sum81, _pA1, _pB2, 0);
                _sum90 = vfmaq_laneq_f32(_sum90, _pA0, _pB2, 1);
                _sum91 = vfmaq_laneq_f32(_sum91, _pA1, _pB2, 1);
                _suma0 = vfmaq_laneq_f32(_suma0, _pA0, _pB2, 2);
                _suma1 = vfmaq_laneq_f32(_suma1, _pA1, _pB2, 2);
                _sumb0 = vfmaq_laneq_f32(_sumb0, _pA0, _pB2, 3);
                _sumb1 = vfmaq_laneq_f32(_sumb1, _pA1, _pB2, 3);

                pA += 8;
                pB += 12;

                _pA0 = vld1q_f32(pA);
                _pA1 = vld1q_f32(pA + 4);

                _pB0 = vld1q_f32(pB);
                _pB1 = vld1q_f32(pB + 4);
                _pB2 = vld1q_f32(pB + 8);

                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
                _sum40 = vfmaq_laneq_f32(_sum40, _pA0, _pB1, 0);
                _sum41 = vfmaq_laneq_f32(_sum41, _pA1, _pB1, 0);
                _sum50 = vfmaq_laneq_f32(_sum50, _pA0, _pB1, 1);
                _sum51 = vfmaq_laneq_f32(_sum51, _pA1, _pB1, 1);
                _sum60 = vfmaq_laneq_f32(_sum60, _pA0, _pB1, 2);
                _sum61 = vfmaq_laneq_f32(_sum61, _pA1, _pB1, 2);
                _sum70 = vfmaq_laneq_f32(_sum70, _pA0, _pB1, 3);
                _sum71 = vfmaq_laneq_f32(_sum71, _pA1, _pB1, 3);
                _sum80 = vfmaq_laneq_f32(_sum80, _pA0, _pB2, 0);
                _sum81 = vfmaq_laneq_f32(_sum81, _pA1, _pB2, 0);
                _sum90 = vfmaq_laneq_f32(_sum90, _pA0, _pB2, 1);
                _sum91 = vfmaq_laneq_f32(_sum91, _pA1, _pB2, 1);
                _suma0 = vfmaq_laneq_f32(_suma0, _pA0, _pB2, 2);
                _suma1 = vfmaq_laneq_f32(_suma1, _pA1, _pB2, 2);
                _sumb0 = vfmaq_laneq_f32(_sumb0, _pA0, _pB2, 3);
                _sumb1 = vfmaq_laneq_f32(_sumb1, _pA1, _pB2, 3);

                pA += 8;
                pB += 12;

                _pA0 = vld1q_f32(pA);
                _pA1 = vld1q_f32(pA + 4);

                _pB0 = vld1q_f32(pB);
                _pB1 = vld1q_f32(pB + 4);
                _pB2 = vld1q_f32(pB + 8);

                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
                _sum40 = vfmaq_laneq_f32(_sum40, _pA0, _pB1, 0);
                _sum41 = vfmaq_laneq_f32(_sum41, _pA1, _pB1, 0);
                _sum50 = vfmaq_laneq_f32(_sum50, _pA0, _pB1, 1);
                _sum51 = vfmaq_laneq_f32(_sum51, _pA1, _pB1, 1);
                _sum60 = vfmaq_laneq_f32(_sum60, _pA0, _pB1, 2);
                _sum61 = vfmaq_laneq_f32(_sum61, _pA1, _pB1, 2);
                _sum70 = vfmaq_laneq_f32(_sum70, _pA0, _pB1, 3);
                _sum71 = vfmaq_laneq_f32(_sum71, _pA1, _pB1, 3);
                _sum80 = vfmaq_laneq_f32(_sum80, _pA0, _pB2, 0);
                _sum81 = vfmaq_laneq_f32(_sum81, _pA1, _pB2, 0);
                _sum90 = vfmaq_laneq_f32(_sum90, _pA0, _pB2, 1);
                _sum91 = vfmaq_laneq_f32(_sum91, _pA1, _pB2, 1);
                _suma0 = vfmaq_laneq_f32(_suma0, _pA0, _pB2, 2);
                _suma1 = vfmaq_laneq_f32(_suma1, _pA1, _pB2, 2);
                _sumb0 = vfmaq_laneq_f32(_sumb0, _pA0, _pB2, 3);
                _sumb1 = vfmaq_laneq_f32(_sumb1, _pA1, _pB2, 3);

                pA += 8;
                pB += 12;

                _pA0 = vld1q_f32(pA);
                _pA1 = vld1q_f32(pA + 4);

                _pB0 = vld1q_f32(pB);
                _pB1 = vld1q_f32(pB + 4);
                _pB2 = vld1q_f32(pB + 8);

                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
                _sum40 = vfmaq_laneq_f32(_sum40, _pA0, _pB1, 0);
                _sum41 = vfmaq_laneq_f32(_sum41, _pA1, _pB1, 0);
                _sum50 = vfmaq_laneq_f32(_sum50, _pA0, _pB1, 1);
                _sum51 = vfmaq_laneq_f32(_sum51, _pA1, _pB1, 1);
                _sum60 = vfmaq_laneq_f32(_sum60, _pA0, _pB1, 2);
                _sum61 = vfmaq_laneq_f32(_sum61, _pA1, _pB1, 2);
                _sum70 = vfmaq_laneq_f32(_sum70, _pA0, _pB1, 3);
                _sum71 = vfmaq_laneq_f32(_sum71, _pA1, _pB1, 3);
                _sum80 = vfmaq_laneq_f32(_sum80, _pA0, _pB2, 0);
                _sum81 = vfmaq_laneq_f32(_sum81, _pA1, _pB2, 0);
                _sum90 = vfmaq_laneq_f32(_sum90, _pA0, _pB2, 1);
                _sum91 = vfmaq_laneq_f32(_sum91, _pA1, _pB2, 1);
                _suma0 = vfmaq_laneq_f32(_suma0, _pA0, _pB2, 2);
                _suma1 = vfmaq_laneq_f32(_suma1, _pA1, _pB2, 2);
                _sumb0 = vfmaq_laneq_f32(_sumb0, _pA0, _pB2, 3);
                _sumb1 = vfmaq_laneq_f32(_sumb1, _pA1, _pB2, 3);

                pA += 8;
                pB += 12;
            }
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA0 = vld1q_f32(pA);
                float32x4_t _pA1 = vld1q_f32(pA + 4);

                float32x4_t _pB0 = vld1q_f32(pB);
                float32x4_t _pB1 = vld1q_f32(pB + 4);
                float32x4_t _pB2 = vld1q_f32(pB + 8);

                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
                _sum40 = vfmaq_laneq_f32(_sum40, _pA0, _pB1, 0);
                _sum41 = vfmaq_laneq_f32(_sum41, _pA1, _pB1, 0);
                _sum50 = vfmaq_laneq_f32(_sum50, _pA0, _pB1, 1);
                _sum51 = vfmaq_laneq_f32(_sum51, _pA1, _pB1, 1);
                _sum60 = vfmaq_laneq_f32(_sum60, _pA0, _pB1, 2);
                _sum61 = vfmaq_laneq_f32(_sum61, _pA1, _pB1, 2);
                _sum70 = vfmaq_laneq_f32(_sum70, _pA0, _pB1, 3);
                _sum71 = vfmaq_laneq_f32(_sum71, _pA1, _pB1, 3);
                _sum80 = vfmaq_laneq_f32(_sum80, _pA0, _pB2, 0);
                _sum81 = vfmaq_laneq_f32(_sum81, _pA1, _pB2, 0);
                _sum90 = vfmaq_laneq_f32(_sum90, _pA0, _pB2, 1);
                _sum91 = vfmaq_laneq_f32(_sum91, _pA1, _pB2, 1);
                _suma0 = vfmaq_laneq_f32(_suma0, _pA0, _pB2, 2);
                _suma1 = vfmaq_laneq_f32(_suma1, _pA1, _pB2, 2);
                _sumb0 = vfmaq_laneq_f32(_sumb0, _pA0, _pB2, 3);
                _sumb1 = vfmaq_laneq_f32(_sumb1, _pA1, _pB2, 3);

                pA += 8;
                pB += 12;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + 4, _sum10);
                    vst1q_f32(outptr0 + 4 * 2, _sum20);
                    vst1q_f32(outptr0 + 4 * 3, _sum30);
                    vst1q_f32(outptr0 + 4 * 4, _sum40);
                    vst1q_f32(outptr0 + 4 * 5, _sum50);
                    vst1q_f32(outptr0 + 4 * 6, _sum60);
                    vst1q_f32(outptr0 + 4 * 7, _sum70);
                    vst1q_f32(outptr0 + 4 * 8, _sum80);
                    vst1q_f32(outptr0 + 4 * 9, _sum90);
                    vst1q_f32(outptr0 + 4 * 10, _suma0);
                    vst1q_f32(outptr0 + 4 * 11, _sumb0);

                    vst1q_f32(outptr0 + out_hstep * 4, _sum01);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4, _sum11);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 2, _sum21);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 3, _sum31);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 4, _sum41);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 5, _sum51);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 6, _sum61);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 7, _sum71);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 8, _sum81);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 9, _sum91);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 10, _suma1);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 11, _sumb1);

                    outptr0 += 48;
                }
                if (out_elempack == 1)
                {
                    transpose8x12_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31, _sum40, _sum41, _sum50, _sum51, _sum60, _sum61, _sum70, _sum71, _sum80, _sum81, _sum90, _sum91, _suma0, _suma1, _sumb0, _sumb1);

                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + 4, _sum01);
                    vst1q_f32(outptr0 + 8, _sum10);
                    vst1q_f32(outptr0 + out_hstep, _sum11);
                    vst1q_f32(outptr0 + out_hstep + 4, _sum20);
                    vst1q_f32(outptr0 + out_hstep + 8, _sum21);
                    vst1q_f32(outptr0 + out_hstep * 2, _sum30);
                    vst1q_f32(outptr0 + out_hstep * 2 + 4, _sum31);
                    vst1q_f32(outptr0 + out_hstep * 2 + 8, _sum40);
                    vst1q_f32(outptr0 + out_hstep * 3, _sum41);
                    vst1q_f32(outptr0 + out_hstep * 3 + 4, _sum50);
                    vst1q_f32(outptr0 + out_hstep * 3 + 8, _sum51);
                    vst1q_f32(outptr0 + out_hstep * 4, _sum60);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4, _sum61);
                    vst1q_f32(outptr0 + out_hstep * 4 + 8, _sum70);
                    vst1q_f32(outptr0 + out_hstep * 5, _sum71);
                    vst1q_f32(outptr0 + out_hstep * 5 + 4, _sum80);
                    vst1q_f32(outptr0 + out_hstep * 5 + 8, _sum81);
                    vst1q_f32(outptr0 + out_hstep * 6, _sum90);
                    vst1q_f32(outptr0 + out_hstep * 6 + 4, _sum91);
                    vst1q_f32(outptr0 + out_hstep * 6 + 8, _suma0);
                    vst1q_f32(outptr0 + out_hstep * 7, _suma1);
                    vst1q_f32(outptr0 + out_hstep * 7 + 4, _sumb0);
                    vst1q_f32(outptr0 + out_hstep * 7 + 8, _sumb1);

                    outptr0 += 12;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
                vst1q_f32(outptr + 4 * 8, _sum40);
                vst1q_f32(outptr + 4 * 9, _sum41);
                vst1q_f32(outptr + 4 * 10, _sum50);
                vst1q_f32(outptr + 4 * 11, _sum51);
                vst1q_f32(outptr + 4 * 12, _sum60);
                vst1q_f32(outptr + 4 * 13, _sum61);
                vst1q_f32(outptr + 4 * 14, _sum70);
                vst1q_f32(outptr + 4 * 15, _sum71);
                vst1q_f32(outptr + 4 * 16, _sum80);
                vst1q_f32(outptr + 4 * 17, _sum81);
                vst1q_f32(outptr + 4 * 18, _sum90);
                vst1q_f32(outptr + 4 * 19, _sum91);
                vst1q_f32(outptr + 4 * 20, _suma0);
                vst1q_f32(outptr + 4 * 21, _suma1);
                vst1q_f32(outptr + 4 * 22, _sumb0);
                vst1q_f32(outptr + 4 * 23, _sumb1);
            }

            outptr += 96;
        }
        // 8-column step of the jj loop for the current 8-lane A panel.
        // Sixteen float32x4 accumulators are used: _sumJ0 / _sumJ1 (J = 0..7)
        // hold A lanes 0-3 / 4-7 of the product for column jj + J.
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum20;
            float32x4_t _sum21;
            float32x4_t _sum30;
            float32x4_t _sum31;
            float32x4_t _sum40;
            float32x4_t _sum41;
            float32x4_t _sum50;
            float32x4_t _sum51;
            float32x4_t _sum60;
            float32x4_t _sum61;
            float32x4_t _sum70;
            float32x4_t _sum71;

            if (k == 0)
            {
                // k == 0: start from zero, then overwrite with C when pC is non-null
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);
                _sum20 = vdupq_n_f32(0.f);
                _sum21 = vdupq_n_f32(0.f);
                _sum30 = vdupq_n_f32(0.f);
                _sum31 = vdupq_n_f32(0.f);
                _sum40 = vdupq_n_f32(0.f);
                _sum41 = vdupq_n_f32(0.f);
                _sum50 = vdupq_n_f32(0.f);
                _sum51 = vdupq_n_f32(0.f);
                _sum60 = vdupq_n_f32(0.f);
                _sum61 = vdupq_n_f32(0.f);
                _sum70 = vdupq_n_f32(0.f);
                _sum71 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // single scalar pC[0] splat into every accumulator
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[0]);
                        _sum11 = vdupq_n_f32(pC[0]);
                        _sum20 = vdupq_n_f32(pC[0]);
                        _sum21 = vdupq_n_f32(pC[0]);
                        _sum30 = vdupq_n_f32(pC[0]);
                        _sum31 = vdupq_n_f32(pC[0]);
                        _sum40 = vdupq_n_f32(pC[0]);
                        _sum41 = vdupq_n_f32(pC[0]);
                        _sum50 = vdupq_n_f32(pC[0]);
                        _sum51 = vdupq_n_f32(pC[0]);
                        _sum60 = vdupq_n_f32(pC[0]);
                        _sum61 = vdupq_n_f32(pC[0]);
                        _sum70 = vdupq_n_f32(pC[0]);
                        _sum71 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // eight values at pC (one per A lane) reused for all
                        // eight columns; pC is not advanced in this branch
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        _sum20 = _sum00;
                        _sum21 = _sum01;
                        _sum30 = _sum00;
                        _sum31 = _sum01;
                        _sum40 = _sum00;
                        _sum41 = _sum01;
                        _sum50 = _sum00;
                        _sum51 = _sum01;
                        _sum60 = _sum00;
                        _sum61 = _sum01;
                        _sum70 = _sum00;
                        _sum71 = _sum01;
                    }
                    if (broadcast_type_C == 3)
                    {
                        // one distinct vector per accumulator: 64 consecutive
                        // floats, in the same order as the outptr scratch layout
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4 * 1);
                        _sum10 = vld1q_f32(pC + 4 * 2);
                        _sum11 = vld1q_f32(pC + 4 * 3);
                        _sum20 = vld1q_f32(pC + 4 * 4);
                        _sum21 = vld1q_f32(pC + 4 * 5);
                        _sum30 = vld1q_f32(pC + 4 * 6);
                        _sum31 = vld1q_f32(pC + 4 * 7);
                        _sum40 = vld1q_f32(pC + 4 * 8);
                        _sum41 = vld1q_f32(pC + 4 * 9);
                        _sum50 = vld1q_f32(pC + 4 * 10);
                        _sum51 = vld1q_f32(pC + 4 * 11);
                        _sum60 = vld1q_f32(pC + 4 * 12);
                        _sum61 = vld1q_f32(pC + 4 * 13);
                        _sum70 = vld1q_f32(pC + 4 * 14);
                        _sum71 = vld1q_f32(pC + 4 * 15);
                        pC += 64;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar per column (pC[0..7]), splat across all
                        // eight A lanes of that column
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum20 = vdupq_n_f32(pC[2]);
                        _sum30 = vdupq_n_f32(pC[3]);
                        _sum40 = vdupq_n_f32(pC[4]);
                        _sum50 = vdupq_n_f32(pC[5]);
                        _sum60 = vdupq_n_f32(pC[6]);
                        _sum70 = vdupq_n_f32(pC[7]);
                        _sum01 = _sum00;
                        _sum11 = _sum10;
                        _sum21 = _sum20;
                        _sum31 = _sum30;
                        _sum41 = _sum40;
                        _sum51 = _sum50;
                        _sum61 = _sum60;
                        _sum71 = _sum70;
                        pC += 8;
                    }
                }
            }
            else
            {
                // k != 0: reload the 64 partial sums from outptr, using the same
                // layout as the non-final store at the bottom of this loop body
                // (presumably written there on an earlier k pass)
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
                _sum20 = vld1q_f32(outptr + 4 * 4);
                _sum21 = vld1q_f32(outptr + 4 * 5);
                _sum30 = vld1q_f32(outptr + 4 * 6);
                _sum31 = vld1q_f32(outptr + 4 * 7);
                _sum40 = vld1q_f32(outptr + 4 * 8);
                _sum41 = vld1q_f32(outptr + 4 * 9);
                _sum50 = vld1q_f32(outptr + 4 * 10);
                _sum51 = vld1q_f32(outptr + 4 * 11);
                _sum60 = vld1q_f32(outptr + 4 * 12);
                _sum61 = vld1q_f32(outptr + 4 * 13);
                _sum70 = vld1q_f32(outptr + 4 * 14);
                _sum71 = vld1q_f32(outptr + 4 * 15);
            }

            // A is re-read from the start of the panel for every column group,
            // while pB keeps advancing across column groups
            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                // each kk consumes 8 floats of A and 8 floats of B
                float32x4_t _pA0 = vld1q_f32(pA);
                float32x4_t _pA1 = vld1q_f32(pA + 4);

                float32x4_t _pB0 = vld1q_f32(pB);
                float32x4_t _pB1 = vld1q_f32(pB + 4);

                // every B lane is multiplied into both halves of the A vector
                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
                _sum40 = vfmaq_laneq_f32(_sum40, _pA0, _pB1, 0);
                _sum41 = vfmaq_laneq_f32(_sum41, _pA1, _pB1, 0);
                _sum50 = vfmaq_laneq_f32(_sum50, _pA0, _pB1, 1);
                _sum51 = vfmaq_laneq_f32(_sum51, _pA1, _pB1, 1);
                _sum60 = vfmaq_laneq_f32(_sum60, _pA0, _pB1, 2);
                _sum61 = vfmaq_laneq_f32(_sum61, _pA1, _pB1, 2);
                _sum70 = vfmaq_laneq_f32(_sum70, _pA0, _pB1, 3);
                _sum71 = vfmaq_laneq_f32(_sum71, _pA1, _pB1, 3);

                pA += 8;
                pB += 8;
            }

            if (k_end)
            {
                // k_end set: write the finished values to the destination at outptr0
                if (out_elempack == 4)
                {
                    // lanes 0-3 of each column go to the first group, 4 floats
                    // per column; lanes 4-7 go to a second group that starts
                    // out_hstep * 4 floats later
                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + 4, _sum10);
                    vst1q_f32(outptr0 + 4 * 2, _sum20);
                    vst1q_f32(outptr0 + 4 * 3, _sum30);
                    vst1q_f32(outptr0 + 4 * 4, _sum40);
                    vst1q_f32(outptr0 + 4 * 5, _sum50);
                    vst1q_f32(outptr0 + 4 * 6, _sum60);
                    vst1q_f32(outptr0 + 4 * 7, _sum70);

                    vst1q_f32(outptr0 + out_hstep * 4, _sum01);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4, _sum11);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 2, _sum21);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 3, _sum31);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 4, _sum41);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 5, _sum51);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 6, _sum61);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 7, _sum71);

                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    // transpose8x8_ps is defined elsewhere; after it each
                    // _sumR0/_sumR1 pair is stored as 8 consecutive floats at
                    // outptr0 + out_hstep * R (presumably output row R)
                    transpose8x8_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31, _sum40, _sum41, _sum50, _sum51, _sum60, _sum61, _sum70, _sum71);

                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + 4, _sum01);
                    vst1q_f32(outptr0 + out_hstep, _sum10);
                    vst1q_f32(outptr0 + out_hstep + 4, _sum11);
                    vst1q_f32(outptr0 + out_hstep * 2, _sum20);
                    vst1q_f32(outptr0 + out_hstep * 2 + 4, _sum21);
                    vst1q_f32(outptr0 + out_hstep * 3, _sum30);
                    vst1q_f32(outptr0 + out_hstep * 3 + 4, _sum31);
                    vst1q_f32(outptr0 + out_hstep * 4, _sum40);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4, _sum41);
                    vst1q_f32(outptr0 + out_hstep * 5, _sum50);
                    vst1q_f32(outptr0 + out_hstep * 5 + 4, _sum51);
                    vst1q_f32(outptr0 + out_hstep * 6, _sum60);
                    vst1q_f32(outptr0 + out_hstep * 6 + 4, _sum61);
                    vst1q_f32(outptr0 + out_hstep * 7, _sum70);
                    vst1q_f32(outptr0 + out_hstep * 7 + 4, _sum71);

                    outptr0 += 8;
                }
            }
            else
            {
                // k_end not set: park the 64 partial sums in outptr
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
                vst1q_f32(outptr + 4 * 8, _sum40);
                vst1q_f32(outptr + 4 * 9, _sum41);
                vst1q_f32(outptr + 4 * 10, _sum50);
                vst1q_f32(outptr + 4 * 11, _sum51);
                vst1q_f32(outptr + 4 * 12, _sum60);
                vst1q_f32(outptr + 4 * 13, _sum61);
                vst1q_f32(outptr + 4 * 14, _sum70);
                vst1q_f32(outptr + 4 * 15, _sum71);
            }

            // advanced on every pass, whether or not outptr was written above
            outptr += 64;
        }
        // 4-column step of the jj loop for the current 8-lane A panel.
        // Eight float32x4 accumulators are used: _sumJ0 / _sumJ1 (J = 0..3)
        // hold A lanes 0-3 / 4-7 of the product for column jj + J.
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum20;
            float32x4_t _sum21;
            float32x4_t _sum30;
            float32x4_t _sum31;

            if (k == 0)
            {
                // k == 0: start from zero, then overwrite with C when pC is non-null
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);
                _sum20 = vdupq_n_f32(0.f);
                _sum21 = vdupq_n_f32(0.f);
                _sum30 = vdupq_n_f32(0.f);
                _sum31 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // single scalar pC[0] splat into every accumulator
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[0]);
                        _sum11 = vdupq_n_f32(pC[0]);
                        _sum20 = vdupq_n_f32(pC[0]);
                        _sum21 = vdupq_n_f32(pC[0]);
                        _sum30 = vdupq_n_f32(pC[0]);
                        _sum31 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // eight values at pC (one per A lane) reused for all
                        // four columns; pC is not advanced in this branch
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        _sum20 = _sum00;
                        _sum21 = _sum01;
                        _sum30 = _sum00;
                        _sum31 = _sum01;
                    }
                    if (broadcast_type_C == 3)
                    {
                        // one distinct vector per accumulator: 32 consecutive floats
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4 * 1);
                        _sum10 = vld1q_f32(pC + 4 * 2);
                        _sum11 = vld1q_f32(pC + 4 * 3);
                        _sum20 = vld1q_f32(pC + 4 * 4);
                        _sum21 = vld1q_f32(pC + 4 * 5);
                        _sum30 = vld1q_f32(pC + 4 * 6);
                        _sum31 = vld1q_f32(pC + 4 * 7);
                        pC += 32;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar per column (pC[0..3]), splat across all
                        // eight A lanes of that column
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum20 = vdupq_n_f32(pC[2]);
                        _sum30 = vdupq_n_f32(pC[3]);
                        _sum01 = _sum00;
                        _sum11 = _sum10;
                        _sum21 = _sum20;
                        _sum31 = _sum30;
                        pC += 4;
                    }
                }
            }
            else
            {
                // k != 0: reload the 32 partial sums from outptr, using the same
                // layout as the non-final store at the bottom of this loop body
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
                _sum20 = vld1q_f32(outptr + 4 * 4);
                _sum21 = vld1q_f32(outptr + 4 * 5);
                _sum30 = vld1q_f32(outptr + 4 * 6);
                _sum31 = vld1q_f32(outptr + 4 * 7);
            }

            // A is re-read from the start of the panel; pB keeps advancing
            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                // each kk consumes 8 floats of A and 4 floats of B
                float32x4_t _pA0 = vld1q_f32(pA);
                float32x4_t _pA1 = vld1q_f32(pA + 4);

                float32x4_t _pB0 = vld1q_f32(pB);

                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);

                pA += 8;
                pB += 4;
            }

            if (k_end)
            {
                // k_end set: write the finished values to the destination at outptr0
                if (out_elempack == 4)
                {
                    // lanes 0-3 of each column go to the first group; lanes 4-7
                    // go to a second group out_hstep * 4 floats later
                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + 4, _sum10);
                    vst1q_f32(outptr0 + 4 * 2, _sum20);
                    vst1q_f32(outptr0 + 4 * 3, _sum30);

                    vst1q_f32(outptr0 + out_hstep * 4, _sum01);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4, _sum11);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 2, _sum21);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4 * 3, _sum31);

                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    // transpose8x4_ps is defined elsewhere; after it the eight
                    // vectors, taken in argument order, are stored 4 floats
                    // each at outptr0 + out_hstep * 0 .. 7
                    transpose8x4_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31);

                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + out_hstep * 1, _sum01);
                    vst1q_f32(outptr0 + out_hstep * 2, _sum10);
                    vst1q_f32(outptr0 + out_hstep * 3, _sum11);
                    vst1q_f32(outptr0 + out_hstep * 4, _sum20);
                    vst1q_f32(outptr0 + out_hstep * 5, _sum21);
                    vst1q_f32(outptr0 + out_hstep * 6, _sum30);
                    vst1q_f32(outptr0 + out_hstep * 7, _sum31);

                    outptr0 += 4;
                }
            }
            else
            {
                // k_end not set: park the 32 partial sums in outptr
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
            }

            // advanced on every pass, whether or not outptr was written above
            outptr += 32;
        }
        // 2-column step of the jj loop for the current 8-lane A panel.
        // _sum00/_sum01 hold A lanes 0-3 / 4-7 for column jj,
        // _sum10/_sum11 hold the same lanes for column jj + 1.
        for (; jj + 1 < max_jj; jj += 2)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;

            if (k == 0)
            {
                // k == 0: start from zero, then overwrite with C when pC is non-null
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // single scalar pC[0] splat into every accumulator
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[0]);
                        _sum11 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // eight values at pC (one per A lane) reused for both
                        // columns; pC is not advanced in this branch
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                    }
                    if (broadcast_type_C == 3)
                    {
                        // one distinct vector per accumulator: 16 consecutive floats
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4 * 1);
                        _sum10 = vld1q_f32(pC + 4 * 2);
                        _sum11 = vld1q_f32(pC + 4 * 3);
                        pC += 16;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar per column (pC[0], pC[1]), splat across
                        // all eight A lanes of that column
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum01 = _sum00;
                        _sum11 = _sum10;
                        pC += 2;
                    }
                }
            }
            else
            {
                // k != 0: reload the 16 partial sums from outptr, using the same
                // layout as the non-final store at the bottom of this loop body
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
            }

            // A is re-read from the start of the panel; pB keeps advancing
            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                // each kk consumes 8 floats of A and 2 floats of B
                float32x4_t _pA0 = vld1q_f32(pA);
                float32x4_t _pA1 = vld1q_f32(pA + 4);

                // only two B values are needed, so a 64-bit vector is loaded
                // and the non-q lane variant of the fused multiply-add is used
                float32x2_t _pB0 = vld1_f32(pB);

                _sum00 = vfmaq_lane_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_lane_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_lane_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_lane_f32(_sum11, _pA1, _pB0, 1);

                pA += 8;
                pB += 2;
            }

            if (k_end)
            {
                // k_end set: write the finished values to the destination at outptr0
                if (out_elempack == 4)
                {
                    // lanes 0-3 of both columns first, then lanes 4-7 of both
                    // columns out_hstep * 4 floats later
                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + 4, _sum10);

                    vst1q_f32(outptr0 + out_hstep * 4, _sum01);
                    vst1q_f32(outptr0 + out_hstep * 4 + 4, _sum11);
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    // no transpose helper is used for this width: the vectors
                    // are spilled to stack arrays and scattered with scalar
                    // stores, lane r going to offset out_hstep * r
                    float sum0[8];
                    float sum1[8];
                    vst1q_f32(sum0, _sum00);
                    vst1q_f32(sum0 + 4, _sum01);
                    vst1q_f32(sum1, _sum10);
                    vst1q_f32(sum1 + 4, _sum11);

                    // column jj
                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[out_hstep * 4] = sum0[4];
                    outptr0[out_hstep * 5] = sum0[5];
                    outptr0[out_hstep * 6] = sum0[6];
                    outptr0[out_hstep * 7] = sum0[7];

                    // column jj + 1
                    outptr0[1] = sum1[0];
                    outptr0[out_hstep + 1] = sum1[1];
                    outptr0[out_hstep * 2 + 1] = sum1[2];
                    outptr0[out_hstep * 3 + 1] = sum1[3];
                    outptr0[out_hstep * 4 + 1] = sum1[4];
                    outptr0[out_hstep * 5 + 1] = sum1[5];
                    outptr0[out_hstep * 6 + 1] = sum1[6];
                    outptr0[out_hstep * 7 + 1] = sum1[7];
                    outptr0 += 2;
                }
            }
            else
            {
                // k_end not set: park the 16 partial sums in outptr
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
            }

            // advanced on every pass, whether or not outptr was written above
            outptr += 16;
        }
        // Single-column remainder of the jj loop for the current 8-lane A panel.
        // _sum00 / _sum01 hold A lanes 0-3 / 4-7 of the product for column jj.
        for (; jj < max_jj; jj += 1)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;

            if (k == 0)
            {
                // k == 0: start from zero, then overwrite with C when pC is non-null
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // single scalar pC[0] splat into both accumulators
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // eight values at pC, one per A lane; pC is not advanced
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                    }
                    if (broadcast_type_C == 3)
                    {
                        // same two loads as above, but pC moves on by 8 floats
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        pC += 8;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar for this column, splat across all eight A lanes
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        pC += 1;
                    }
                }
            }
            else
            {
                // k != 0: reload the 8 partial sums from outptr
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
            }

            // A is re-read from the start of the panel; pB keeps advancing
            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                // each kk consumes 8 floats of A and 1 float of B
                float32x4_t _pA0 = vld1q_f32(pA);
                float32x4_t _pA1 = vld1q_f32(pA + 4);

                // the single B value is replicated to all four lanes on load
                float32x4_t _pB = vld1q_dup_f32(pB);

                _sum00 = vfmaq_f32(_sum00, _pA0, _pB);
                _sum01 = vfmaq_f32(_sum01, _pA1, _pB);

                pA += 8;
                pB += 1;
            }

            if (k_end)
            {
                // k_end set: write the finished values to the destination at outptr0
                if (out_elempack == 4)
                {
                    // lanes 0-3 at outptr0, lanes 4-7 out_hstep * 4 floats later
                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + out_hstep * 4, _sum01);
                    outptr0 += 4;
                }
                if (out_elempack == 1)
                {
                    // spill to the stack and scatter: lane r goes to offset
                    // out_hstep * r
                    float sum0[8];
                    vst1q_f32(sum0, _sum00);
                    vst1q_f32(sum0 + 4, _sum01);

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep * 1] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[out_hstep * 4] = sum0[4];
                    outptr0[out_hstep * 5] = sum0[5];
                    outptr0[out_hstep * 6] = sum0[6];
                    outptr0[out_hstep * 7] = sum0[7];
                    outptr0++;
                }
            }
            else
            {
                // k_end not set: park the 8 partial sums in outptr
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
            }

            // advanced on every pass, whether or not outptr was written above
            outptr += 8;
        }

        pAT += max_kk * 8;
    }
#endif // __aarch64__
    for (; ii + 3 < max_ii; ii += 4)
    {
        float* outptr0 = (float*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        const float* pB = pBT;

        if (pC)
        {
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)CT_tile + i + ii;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)CT_tile + j;
            }
        }

        int jj = 0;
#if __aarch64__
        // 12-column step of the jj loop for the current 4-lane A panel.
        // Twelve float32x4 accumulators are used: _sum0 .. _sum9, _suma, _sumb
        // each hold the four A lanes of the product for one of the columns
        // jj .. jj + 11.
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;
            float32x4_t _sum3;
            float32x4_t _sum4;
            float32x4_t _sum5;
            float32x4_t _sum6;
            float32x4_t _sum7;
            float32x4_t _sum8;
            float32x4_t _sum9;
            float32x4_t _suma;
            float32x4_t _sumb;

            if (k == 0)
            {
                // k == 0: start from zero, then overwrite with C when pC is non-null
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);
                _sum2 = vdupq_n_f32(0.f);
                _sum3 = vdupq_n_f32(0.f);
                _sum4 = vdupq_n_f32(0.f);
                _sum5 = vdupq_n_f32(0.f);
                _sum6 = vdupq_n_f32(0.f);
                _sum7 = vdupq_n_f32(0.f);
                _sum8 = vdupq_n_f32(0.f);
                _sum9 = vdupq_n_f32(0.f);
                _suma = vdupq_n_f32(0.f);
                _sumb = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // single scalar pC[0] splat into every accumulator
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[0]);
                        _sum2 = vdupq_n_f32(pC[0]);
                        _sum3 = vdupq_n_f32(pC[0]);
                        _sum4 = vdupq_n_f32(pC[0]);
                        _sum5 = vdupq_n_f32(pC[0]);
                        _sum6 = vdupq_n_f32(pC[0]);
                        _sum7 = vdupq_n_f32(pC[0]);
                        _sum8 = vdupq_n_f32(pC[0]);
                        _sum9 = vdupq_n_f32(pC[0]);
                        _suma = vdupq_n_f32(pC[0]);
                        _sumb = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // four values at pC (one per A lane) reused for all
                        // twelve columns; pC is not advanced in this branch
                        _sum0 = vld1q_f32(pC);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                        _sum4 = _sum0;
                        _sum5 = _sum0;
                        _sum6 = _sum0;
                        _sum7 = _sum0;
                        _sum8 = _sum0;
                        _sum9 = _sum0;
                        _suma = _sum0;
                        _sumb = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        // one distinct vector per accumulator: 48 consecutive floats
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        _sum2 = vld1q_f32(pC + 8);
                        _sum3 = vld1q_f32(pC + 12);
                        _sum4 = vld1q_f32(pC + 16);
                        _sum5 = vld1q_f32(pC + 20);
                        _sum6 = vld1q_f32(pC + 24);
                        _sum7 = vld1q_f32(pC + 28);
                        _sum8 = vld1q_f32(pC + 32);
                        _sum9 = vld1q_f32(pC + 36);
                        _suma = vld1q_f32(pC + 40);
                        _sumb = vld1q_f32(pC + 44);
                        pC += 48;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar per column (pC[0..11]), splat across the
                        // four A lanes of that column
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[1]);
                        _sum2 = vdupq_n_f32(pC[2]);
                        _sum3 = vdupq_n_f32(pC[3]);
                        _sum4 = vdupq_n_f32(pC[4]);
                        _sum5 = vdupq_n_f32(pC[5]);
                        _sum6 = vdupq_n_f32(pC[6]);
                        _sum7 = vdupq_n_f32(pC[7]);
                        _sum8 = vdupq_n_f32(pC[8]);
                        _sum9 = vdupq_n_f32(pC[9]);
                        _suma = vdupq_n_f32(pC[10]);
                        _sumb = vdupq_n_f32(pC[11]);
                        pC += 12;
                    }
                }
            }
            else
            {
                // k != 0: reload the 48 partial sums from outptr, using the same
                // layout as the non-final store at the bottom of this loop body
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4 * 1);
                _sum2 = vld1q_f32(outptr + 4 * 2);
                _sum3 = vld1q_f32(outptr + 4 * 3);
                _sum4 = vld1q_f32(outptr + 4 * 4);
                _sum5 = vld1q_f32(outptr + 4 * 5);
                _sum6 = vld1q_f32(outptr + 4 * 6);
                _sum7 = vld1q_f32(outptr + 4 * 7);
                _sum8 = vld1q_f32(outptr + 4 * 8);
                _sum9 = vld1q_f32(outptr + 4 * 9);
                _suma = vld1q_f32(outptr + 4 * 10);
                _sumb = vld1q_f32(outptr + 4 * 11);
            }

            // A is re-read from the start of the panel; pB keeps advancing
            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                // each kk consumes 4 floats of A and 12 floats of B
                float32x4_t _pA = vld1q_f32(pA);
                float32x4_t _pB0 = vld1q_f32(pB);
                float32x4_t _pB1 = vld1q_f32(pB + 4);
                float32x4_t _pB2 = vld1q_f32(pB + 8);

                // the A vector is scaled by each of the twelve B lanes in turn
                _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB0, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB0, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB0, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB0, 3);
                _sum4 = vfmaq_laneq_f32(_sum4, _pA, _pB1, 0);
                _sum5 = vfmaq_laneq_f32(_sum5, _pA, _pB1, 1);
                _sum6 = vfmaq_laneq_f32(_sum6, _pA, _pB1, 2);
                _sum7 = vfmaq_laneq_f32(_sum7, _pA, _pB1, 3);
                _sum8 = vfmaq_laneq_f32(_sum8, _pA, _pB2, 0);
                _sum9 = vfmaq_laneq_f32(_sum9, _pA, _pB2, 1);
                _suma = vfmaq_laneq_f32(_suma, _pA, _pB2, 2);
                _sumb = vfmaq_laneq_f32(_sumb, _pA, _pB2, 3);

                pA += 4;
                pB += 12;
            }

            if (k_end)
            {
                // k_end set: write the finished values to the destination at outptr0
                if (out_elempack == 4)
                {
                    // accumulators already match the packed layout: twelve
                    // vectors stored back to back
                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);
                    vst1q_f32(outptr0 + 4 * 2, _sum2);
                    vst1q_f32(outptr0 + 4 * 3, _sum3);
                    vst1q_f32(outptr0 + 4 * 4, _sum4);
                    vst1q_f32(outptr0 + 4 * 5, _sum5);
                    vst1q_f32(outptr0 + 4 * 6, _sum6);
                    vst1q_f32(outptr0 + 4 * 7, _sum7);
                    vst1q_f32(outptr0 + 4 * 8, _sum8);
                    vst1q_f32(outptr0 + 4 * 9, _sum9);
                    vst1q_f32(outptr0 + 4 * 10, _suma);
                    vst1q_f32(outptr0 + 4 * 11, _sumb);
                    outptr0 += 48;
                }
                if (out_elempack == 1)
                {
                    // transpose4x12_ps is defined elsewhere; after it each
                    // group of three consecutive vectors is stored as 12
                    // contiguous floats at outptr0 + out_hstep * 0 .. 3
                    transpose4x12_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, _sum8, _sum9, _suma, _sumb);

                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);
                    vst1q_f32(outptr0 + 8, _sum2);
                    vst1q_f32(outptr0 + out_hstep, _sum3);
                    vst1q_f32(outptr0 + out_hstep + 4, _sum4);
                    vst1q_f32(outptr0 + out_hstep + 8, _sum5);
                    vst1q_f32(outptr0 + out_hstep * 2, _sum6);
                    vst1q_f32(outptr0 + out_hstep * 2 + 4, _sum7);
                    vst1q_f32(outptr0 + out_hstep * 2 + 8, _sum8);
                    vst1q_f32(outptr0 + out_hstep * 3, _sum9);
                    vst1q_f32(outptr0 + out_hstep * 3 + 4, _suma);
                    vst1q_f32(outptr0 + out_hstep * 3 + 8, _sumb);
                    outptr0 += 12;
                }
            }
            else
            {
                // k_end not set: park the 48 partial sums in outptr
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
                vst1q_f32(outptr + 4 * 4, _sum4);
                vst1q_f32(outptr + 4 * 5, _sum5);
                vst1q_f32(outptr + 4 * 6, _sum6);
                vst1q_f32(outptr + 4 * 7, _sum7);
                vst1q_f32(outptr + 4 * 8, _sum8);
                vst1q_f32(outptr + 4 * 9, _sum9);
                vst1q_f32(outptr + 4 * 10, _suma);
                vst1q_f32(outptr + 4 * 11, _sumb);
            }

            // advanced on every pass, whether or not outptr was written above
            outptr += 48;
        }
#endif // __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;
            float32x4_t _sum3;
            float32x4_t _sum4;
            float32x4_t _sum5;
            float32x4_t _sum6;
            float32x4_t _sum7;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);
                _sum2 = vdupq_n_f32(0.f);
                _sum3 = vdupq_n_f32(0.f);
                _sum4 = vdupq_n_f32(0.f);
                _sum5 = vdupq_n_f32(0.f);
                _sum6 = vdupq_n_f32(0.f);
                _sum7 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[0]);
                        _sum2 = vdupq_n_f32(pC[0]);
                        _sum3 = vdupq_n_f32(pC[0]);
                        _sum4 = vdupq_n_f32(pC[0]);
                        _sum5 = vdupq_n_f32(pC[0]);
                        _sum6 = vdupq_n_f32(pC[0]);
                        _sum7 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                        _sum4 = _sum0;
                        _sum5 = _sum0;
                        _sum6 = _sum0;
                        _sum7 = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        _sum2 = vld1q_f32(pC + 8);
                        _sum3 = vld1q_f32(pC + 12);
                        _sum4 = vld1q_f32(pC + 16);
                        _sum5 = vld1q_f32(pC + 20);
                        _sum6 = vld1q_f32(pC + 24);
                        _sum7 = vld1q_f32(pC + 28);
                        pC += 32;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[1]);
                        _sum2 = vdupq_n_f32(pC[2]);
                        _sum3 = vdupq_n_f32(pC[3]);
                        _sum4 = vdupq_n_f32(pC[4]);
                        _sum5 = vdupq_n_f32(pC[5]);
                        _sum6 = vdupq_n_f32(pC[6]);
                        _sum7 = vdupq_n_f32(pC[7]);
                        pC += 8;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4 * 1);
                _sum2 = vld1q_f32(outptr + 4 * 2);
                _sum3 = vld1q_f32(outptr + 4 * 3);
                _sum4 = vld1q_f32(outptr + 4 * 4);
                _sum5 = vld1q_f32(outptr + 4 * 5);
                _sum6 = vld1q_f32(outptr + 4 * 6);
                _sum7 = vld1q_f32(outptr + 4 * 7);
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%0, #128]   \n"
                    "ld1    {v2.4s}, [%0], #16      \n"
                    "prfm   pldl1keep, [%1, #256]   \n"
                    "ld1    {v0.4s, v1.4s}, [%1], #32 \n"
                    "fmla   %2.4s, v2.4s, v0.s[0]   \n"
                    "fmla   %3.4s, v2.4s, v0.s[1]   \n"
                    "fmla   %4.4s, v2.4s, v0.s[2]   \n"
                    "fmla   %5.4s, v2.4s, v0.s[3]   \n"
                    "fmla   %6.4s, v2.4s, v1.s[0]   \n"
                    "fmla   %7.4s, v2.4s, v1.s[1]   \n"
                    "fmla   %8.4s, v2.4s, v1.s[2]   \n"
                    "fmla   %9.4s, v2.4s, v1.s[3]   \n"
                    : "=r"(pA),
                    "=r"(pB),
                    "=w"(_sum0),
                    "=w"(_sum1),
                    "=w"(_sum2),
                    "=w"(_sum3),
                    "=w"(_sum4),
                    "=w"(_sum5),
                    "=w"(_sum6),
                    "=w"(_sum7)
                    : "0"(pA),
                    "1"(pB),
                    "2"(_sum0),
                    "3"(_sum1),
                    "4"(_sum2),
                    "5"(_sum3),
                    "6"(_sum4),
                    "7"(_sum5),
                    "8"(_sum6),
                    "9"(_sum7)
                    : "memory", "v0", "v1", "v2", "v3");
#else
                asm volatile(
                    "pld        [%0, #128]          \n"
                    "vld1.f32   {d4-d5}, [%0]!      \n"
                    "pld        [%1, #256]          \n"
                    "vld1.f32   {d0-d3}, [%1]!      \n"
                    "vmla.f32   %q2, q2, d0[0]      \n"
                    "vmla.f32   %q3, q2, d0[1]      \n"
                    "vmla.f32   %q4, q2, d1[0]      \n"
                    "vmla.f32   %q5, q2, d1[1]      \n"
                    "vmla.f32   %q6, q2, d2[0]      \n"
                    "vmla.f32   %q7, q2, d2[1]      \n"
                    "vmla.f32   %q8, q2, d3[0]      \n"
                    "vmla.f32   %q9, q2, d3[1]      \n"
                    : "=r"(pA),
                    "=r"(pB),
                    "=w"(_sum0),
                    "=w"(_sum1),
                    "=w"(_sum2),
                    "=w"(_sum3),
                    "=w"(_sum4),
                    "=w"(_sum5),
                    "=w"(_sum6),
                    "=w"(_sum7)
                    : "0"(pA),
                    "1"(pB),
                    "2"(_sum0),
                    "3"(_sum1),
                    "4"(_sum2),
                    "5"(_sum3),
                    "6"(_sum4),
                    "7"(_sum5),
                    "8"(_sum6),
                    "9"(_sum7)
                    : "memory", "q0", "q1", "q2");
#endif
#else // NCNN_GNU_INLINE_ASM
                float32x4_t _pA = vld1q_f32(pA);
                float32x4_t _pB0 = vld1q_f32(pB);
                float32x4_t _pB1 = vld1q_f32(pB + 4);

#if __aarch64__
                _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB0, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB0, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB0, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB0, 3);
                _sum4 = vfmaq_laneq_f32(_sum4, _pA, _pB1, 0);
                _sum5 = vfmaq_laneq_f32(_sum5, _pA, _pB1, 1);
                _sum6 = vfmaq_laneq_f32(_sum6, _pA, _pB1, 2);
                _sum7 = vfmaq_laneq_f32(_sum7, _pA, _pB1, 3);
#else
                _sum0 = vmlaq_lane_f32(_sum0, _pA, vget_low_f32(_pB0), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pA, vget_low_f32(_pB0), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _pA, vget_high_f32(_pB0), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _pA, vget_high_f32(_pB0), 1);
                _sum4 = vmlaq_lane_f32(_sum4, _pA, vget_low_f32(_pB1), 0);
                _sum5 = vmlaq_lane_f32(_sum5, _pA, vget_low_f32(_pB1), 1);
                _sum6 = vmlaq_lane_f32(_sum6, _pA, vget_high_f32(_pB1), 0);
                _sum7 = vmlaq_lane_f32(_sum7, _pA, vget_high_f32(_pB1), 1);
#endif

                pA += 4;
                pB += 8;
#endif // NCNN_GNU_INLINE_ASM
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);
                    vst1q_f32(outptr0 + 4 * 2, _sum2);
                    vst1q_f32(outptr0 + 4 * 3, _sum3);
                    vst1q_f32(outptr0 + 4 * 4, _sum4);
                    vst1q_f32(outptr0 + 4 * 5, _sum5);
                    vst1q_f32(outptr0 + 4 * 6, _sum6);
                    vst1q_f32(outptr0 + 4 * 7, _sum7);
                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    transpose4x8_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7);

                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);
                    vst1q_f32(outptr0 + out_hstep, _sum2);
                    vst1q_f32(outptr0 + out_hstep + 4, _sum3);
                    vst1q_f32(outptr0 + out_hstep * 2, _sum4);
                    vst1q_f32(outptr0 + out_hstep * 2 + 4, _sum5);
                    vst1q_f32(outptr0 + out_hstep * 3, _sum6);
                    vst1q_f32(outptr0 + out_hstep * 3 + 4, _sum7);
                    outptr0 += 8;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
                vst1q_f32(outptr + 4 * 4, _sum4);
                vst1q_f32(outptr + 4 * 5, _sum5);
                vst1q_f32(outptr + 4 * 6, _sum6);
                vst1q_f32(outptr + 4 * 7, _sum7);
            }

            outptr += 32;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;
            float32x4_t _sum3;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);
                _sum2 = vdupq_n_f32(0.f);
                _sum3 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[0]);
                        _sum2 = vdupq_n_f32(pC[0]);
                        _sum3 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        _sum2 = vld1q_f32(pC + 8);
                        _sum3 = vld1q_f32(pC + 12);
                        pC += 16;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[1]);
                        _sum2 = vdupq_n_f32(pC[2]);
                        _sum3 = vdupq_n_f32(pC[3]);
                        pC += 4;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4 * 1);
                _sum2 = vld1q_f32(outptr + 4 * 2);
                _sum3 = vld1q_f32(outptr + 4 * 3);
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA = vld1q_f32(pA);
                float32x4_t _pB = vld1q_f32(pB);

#if __aarch64__
                _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB, 3);
#else
                _sum0 = vmlaq_lane_f32(_sum0, _pA, vget_low_f32(_pB), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pA, vget_low_f32(_pB), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _pA, vget_high_f32(_pB), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _pA, vget_high_f32(_pB), 1);
#endif

                pA += 4;
                pB += 4;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);
                    vst1q_f32(outptr0 + 4 * 2, _sum2);
                    vst1q_f32(outptr0 + 4 * 3, _sum3);
                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    transpose4x4_ps(_sum0, _sum1, _sum2, _sum3);

                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + out_hstep * 1, _sum1);
                    vst1q_f32(outptr0 + out_hstep * 2, _sum2);
                    vst1q_f32(outptr0 + out_hstep * 3, _sum3);
                    outptr0 += 4;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
            }

            outptr += 16;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        pC += 8;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[1]);
                        pC += 2;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4);
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA = vld1q_f32(pA);
                float32x2_t _pB = vld1_f32(pB);

#if __aarch64__
                _sum0 = vfmaq_lane_f32(_sum0, _pA, _pB, 0);
                _sum1 = vfmaq_lane_f32(_sum1, _pA, _pB, 1);
#else
                _sum0 = vmlaq_lane_f32(_sum0, _pA, _pB, 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pA, _pB, 1);
#endif

                pA += 4;
                pB += 2;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    float sum0[4];
                    float sum1[4];
                    vst1q_f32(sum0, _sum0);
                    vst1q_f32(sum1, _sum1);

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[1] = sum1[0];
                    outptr0[out_hstep + 1] = sum1[1];
                    outptr0[out_hstep * 2 + 1] = sum1[2];
                    outptr0[out_hstep * 3 + 1] = sum1[3];
                    outptr0 += 2;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
            }

            outptr += 8;
        }
        for (; jj < max_jj; jj += 1)
        {
            float32x4_t _sum0;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vld1q_f32(pC);
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum0 = vld1q_f32(pC);
                        pC += 4;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        pC += 1;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA = vld1q_f32(pA);
                float32x4_t _pB = vdupq_n_f32(pB[0]);

#if __aarch64__
                _sum0 = vfmaq_f32(_sum0, _pA, _pB);
#else
                _sum0 = vmlaq_f32(_sum0, _pA, _pB);
#endif

                pA += 4;
                pB += 1;
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1q_f32(outptr0, _sum0);
                    outptr0 += 4;
                }
                if (out_elempack == 1)
                {
                    float sum0[4];
                    vst1q_f32(sum0, _sum0);

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0++;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
            }

            outptr += 4;
        }

        pAT += max_kk * 4;
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        float* outptr0 = (float*)top_blob + (i + ii) * out_hstep + j;

        const float* pB = pBT;

        if (pC)
        {
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)CT_tile + i + ii;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)CT_tile + j;
            }
        }

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum02;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum12;

            if (k == 0)
            {
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum02 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);
                _sum12 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = vdupq_n_f32(pC[0]);
                        _sum02 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[0]);
                        _sum11 = vdupq_n_f32(pC[0]);
                        _sum12 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = vdupq_n_f32(pC[0]);
                        _sum02 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum11 = vdupq_n_f32(pC[1]);
                        _sum12 = vdupq_n_f32(pC[1]);
                    }
                    if (broadcast_type_C == 3)
                    {
                        float32x4x2_t _tmp01 = vld2q_f32(pC);
                        float32x4x2_t _tmp23 = vld2q_f32(pC + 8);
                        float32x4x2_t _tmp45 = vld2q_f32(pC + 16);
                        _sum00 = _tmp01.val[0];
                        _sum01 = _tmp23.val[0];
                        _sum02 = _tmp45.val[0];
                        _sum10 = _tmp01.val[1];
                        _sum11 = _tmp23.val[1];
                        _sum12 = _tmp45.val[1];
                        pC += 24;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum02 = vld1q_f32(pC + 8);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        _sum12 = _sum02;
                        pC += 12;
                    }
                }
            }
            else
            {
                float32x4x2_t _tmp01 = vld2q_f32(outptr);
                float32x4x2_t _tmp23 = vld2q_f32(outptr + 8);
                float32x4x2_t _tmp45 = vld2q_f32(outptr + 16);
                _sum00 = _tmp01.val[0];
                _sum01 = _tmp23.val[0];
                _sum02 = _tmp45.val[0];
                _sum10 = _tmp01.val[1];
                _sum11 = _tmp23.val[1];
                _sum12 = _tmp45.val[1];
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB0 = vld1q_f32(pB);
                float32x4_t _pB1 = vld1q_f32(pB + 4);
                float32x4_t _pB2 = vld1q_f32(pB + 8);

                float32x2_t _pA = vld1_f32(pA);

                _sum00 = vfmaq_lane_f32(_sum00, _pB0, _pA, 0);
                _sum01 = vfmaq_lane_f32(_sum01, _pB1, _pA, 0);
                _sum02 = vfmaq_lane_f32(_sum02, _pB2, _pA, 0);
                _sum10 = vfmaq_lane_f32(_sum10, _pB0, _pA, 1);
                _sum11 = vfmaq_lane_f32(_sum11, _pB1, _pA, 1);
                _sum12 = vfmaq_lane_f32(_sum12, _pB2, _pA, 1);

                pA += 2;
                pB += 12;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + 4, _sum01);
                    vst1q_f32(outptr0 + 8, _sum02);
                    vst1q_f32(outptr0 + out_hstep, _sum10);
                    vst1q_f32(outptr0 + out_hstep + 4, _sum11);
                    vst1q_f32(outptr0 + out_hstep + 8, _sum12);
                    outptr0 += 12;
                }
            }
            else
            {
                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum00;
                _tmp01.val[1] = _sum10;
                float32x4x2_t _tmp23;
                _tmp23.val[0] = _sum01;
                _tmp23.val[1] = _sum11;
                float32x4x2_t _tmp45;
                _tmp45.val[0] = _sum02;
                _tmp45.val[1] = _sum12;
                vst2q_f32(outptr, _tmp01);
                vst2q_f32(outptr + 8, _tmp23);
                vst2q_f32(outptr + 16, _tmp45);
            }

            outptr += 24;
        }
#endif // __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;

            if (k == 0)
            {
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[0]);
                        _sum11 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum11 = vdupq_n_f32(pC[1]);
                    }
                    if (broadcast_type_C == 3)
                    {
                        float32x4x2_t _tmp01 = vld2q_f32(pC);
                        float32x4x2_t _tmp23 = vld2q_f32(pC + 8);
                        _sum00 = _tmp01.val[0];
                        _sum01 = _tmp23.val[0];
                        _sum10 = _tmp01.val[1];
                        _sum11 = _tmp23.val[1];
                        pC += 16;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        pC += 8;
                    }
                }
            }
            else
            {
                float32x4x2_t _tmp01 = vld2q_f32(outptr);
                float32x4x2_t _tmp23 = vld2q_f32(outptr + 8);
                _sum00 = _tmp01.val[0];
                _sum01 = _tmp23.val[0];
                _sum10 = _tmp01.val[1];
                _sum11 = _tmp23.val[1];
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB0 = vld1q_f32(pB);
                float32x4_t _pB1 = vld1q_f32(pB + 4);

                float32x2_t _pA = vld1_f32(pA);
#if __aarch64__
                _sum00 = vfmaq_lane_f32(_sum00, _pB0, _pA, 0);
                _sum01 = vfmaq_lane_f32(_sum01, _pB1, _pA, 0);
                _sum10 = vfmaq_lane_f32(_sum10, _pB0, _pA, 1);
                _sum11 = vfmaq_lane_f32(_sum11, _pB1, _pA, 1);
#else
                _sum00 = vmlaq_lane_f32(_sum00, _pB0, _pA, 0);
                _sum01 = vmlaq_lane_f32(_sum01, _pB1, _pA, 0);
                _sum10 = vmlaq_lane_f32(_sum10, _pB0, _pA, 1);
                _sum11 = vmlaq_lane_f32(_sum11, _pB1, _pA, 1);
#endif

                pA += 2;
                pB += 8;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1q_f32(outptr0, _sum00);
                    vst1q_f32(outptr0 + 4, _sum01);
                    vst1q_f32(outptr0 + out_hstep, _sum10);
                    vst1q_f32(outptr0 + out_hstep + 4, _sum11);
                    outptr0 += 8;
                }
            }
            else
            {
                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum00;
                _tmp01.val[1] = _sum10;
                float32x4x2_t _tmp23;
                _tmp23.val[0] = _sum01;
                _tmp23.val[1] = _sum11;
                vst2q_f32(outptr, _tmp01);
                vst2q_f32(outptr + 8, _tmp23);
            }

            outptr += 16;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[1]);
                    }
                    if (broadcast_type_C == 3)
                    {
                        float32x4x2_t _tmp01 = vld2q_f32(pC);
                        _sum0 = _tmp01.val[0];
                        _sum1 = _tmp01.val[1];
                        pC += 8;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = _sum0;
                        pC += 4;
                    }
                }
            }
            else
            {
                float32x4x2_t _tmp01 = vld2q_f32(outptr);
                _sum0 = _tmp01.val[0];
                _sum1 = _tmp01.val[1];
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB = vld1q_f32(pB);

                float32x2_t _pA = vld1_f32(pA);
#if __aarch64__
                _sum0 = vfmaq_lane_f32(_sum0, _pB, _pA, 0);
                _sum1 = vfmaq_lane_f32(_sum1, _pB, _pA, 1);
#else
                _sum0 = vmlaq_lane_f32(_sum0, _pB, _pA, 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pB, _pA, 1);
#endif

                pA += 2;
                pB += 4;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + out_hstep, _sum1);
                    outptr0 += 4;
                }
            }
            else
            {
                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum0;
                _tmp01.val[1] = _sum1;
                vst2q_f32(outptr, _tmp01);
            }

            outptr += 8;
        }
#endif // __ARM_NEON
        for (; jj + 1 < max_jj; jj += 2)
        {
            float sum00;
            float sum01;
            float sum10;
            float sum11;

            if (k == 0)
            {
                sum00 = 0.f;
                sum01 = 0.f;
                sum10 = 0.f;
                sum11 = 0.f;

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        sum00 = pC[0];
                        sum01 = pC[0];
                        sum10 = pC[0];
                        sum11 = pC[0];
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        sum00 = pC[0];
                        sum01 = pC[1];
                        sum10 = pC[0];
                        sum11 = pC[1];
                    }
                    if (broadcast_type_C == 3)
                    {
                        sum00 = pC[0];
                        sum01 = pC[1];
                        sum10 = pC[2];
                        sum11 = pC[3];
                        pC += 4;
                    }
                    if (broadcast_type_C == 4)
                    {
                        sum00 = pC[0];
                        sum01 = pC[0];
                        sum10 = pC[1];
                        sum11 = pC[1];
                        pC += 2;
                    }
                }
            }
            else
            {
                sum00 = outptr[0];
                sum01 = outptr[1];
                sum10 = outptr[2];
                sum11 = outptr[3];
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum00 += pA[0] * pB[0];
                sum01 += pA[1] * pB[0];
                sum10 += pA[0] * pB[1];
                sum11 += pA[1] * pB[1];

                pA += 2;
                pB += 2;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum00;
                    outptr0[1] = sum10;
                    outptr0[out_hstep] = sum01;
                    outptr0[out_hstep + 1] = sum11;
                    outptr0 += 2;
                }
            }
            else
            {
                outptr[0] = sum00;
                outptr[1] = sum01;
                outptr[2] = sum10;
                outptr[3] = sum11;
            }

            outptr += 4;
        }
        for (; jj < max_jj; jj += 1)
        {
            float sum0;
            float sum1;

            if (k == 0)
            {
                sum0 = 0.f;
                sum1 = 0.f;

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        sum0 = pC[0];
                        sum1 = pC[0];
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        sum0 = pC[0];
                        sum1 = pC[1];
                    }
                    if (broadcast_type_C == 3)
                    {
                        sum0 = pC[0];
                        sum1 = pC[1];
                        pC += 2;
                    }
                    if (broadcast_type_C == 4)
                    {
                        sum0 = pC[0];
                        sum1 = pC[0];
                        pC += 1;
                    }
                }
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum0 += pA[0] * pB[0];
                sum1 += pA[1] * pB[0];
                pA += 2;
                pB += 1;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum0;
                    outptr0[out_hstep] = sum1;
                    outptr0++;
                }
            }
            else
            {
                outptr[0] = sum0;
                outptr[1] = sum1;
            }

            outptr += 2;
        }

        pAT += max_kk * 2;
    }
    for (; ii < max_ii; ii += 1)
    {
        float* outptr0 = (float*)top_blob + (i + ii) * out_hstep + j;

        const float* pB = pBT;

        if (pC)
        {
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)CT_tile + i + ii;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)CT_tile + j;
            }
        }

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);
                _sum2 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[0]);
                        _sum2 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        _sum2 = vld1q_f32(pC + 8);
                        pC += 12;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4);
                _sum2 = vld1q_f32(outptr + 8);
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB0 = vld1q_f32(pB);
                float32x4_t _pB1 = vld1q_f32(pB + 4);
                float32x4_t _pB2 = vld1q_f32(pB + 8);

                float32x4_t _pA0 = vdupq_n_f32(pA[0]);

                _sum0 = vfmaq_f32(_sum0, _pA0, _pB0);
                _sum1 = vfmaq_f32(_sum1, _pA0, _pB1);
                _sum2 = vfmaq_f32(_sum2, _pA0, _pB2);

                pA += 1;
                pB += 12;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);
                    vst1q_f32(outptr0 + 8, _sum2);
                    outptr0 += 12;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 8, _sum2);
            }

            outptr += 12;
        }
#endif // __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        pC += 8;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4);
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB0 = vld1q_f32(pB);
                float32x4_t _pB1 = vld1q_f32(pB + 4);

                float32x4_t _pA0 = vdupq_n_f32(pA[0]);
#if __aarch64__
                _sum0 = vfmaq_f32(_sum0, _pA0, _pB0);
                _sum1 = vfmaq_f32(_sum1, _pA0, _pB1);
#else
                _sum0 = vmlaq_f32(_sum0, _pA0, _pB0);
                _sum1 = vmlaq_f32(_sum1, _pA0, _pB1);
#endif

                pA += 1;
                pB += 8;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1q_f32(outptr0, _sum0);
                    vst1q_f32(outptr0 + 4, _sum1);
                    outptr0 += 8;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
            }

            outptr += 8;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _sum;

            if (k == 0)
            {
                _sum = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        _sum = vld1q_f32(pC);
                        pC += 4;
                    }
                }
            }
            else
            {
                _sum = vld1q_f32(outptr);
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB = vld1q_f32(pB);
                float32x4_t _pA = vdupq_n_f32(pA[0]);

#if __aarch64__
                _sum = vfmaq_f32(_sum, _pA, _pB);
#else
                _sum = vmlaq_f32(_sum, _pA, _pB);
#endif

                pA += 1;
                pB += 4;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1q_f32(outptr0, _sum);
                    outptr0 += 4;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum);
            }

            outptr += 4;
        }
#endif // __ARM_NEON
        for (; jj + 1 < max_jj; jj += 2)
        {
            float sum0;
            float sum1;

            if (k == 0)
            {
                sum0 = 0.f;
                sum1 = 0.f;

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        sum0 = pC[0];
                        sum1 = pC[0];
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        sum0 = pC[0];
                        sum1 = pC[1];
                        pC += 2;
                    }
                }
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum0 += pA[0] * pB[0];
                sum1 += pA[0] * pB[1];

                pA += 1;
                pB += 2;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum0;
                    outptr0[1] = sum1;
                    outptr0 += 2;
                }
            }
            else
            {
                outptr[0] = sum0;
                outptr[1] = sum1;
            }

            outptr += 2;
        }
        for (; jj < max_jj; jj += 1)
        {
            float sum;

            if (k == 0)
            {
                sum = 0.f;

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        sum = pC[0];
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        sum = pC[0];
                        pC += 1;
                    }
                }
            }
            else
            {
                sum = outptr[0];
            }

            const float* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum += pA[0] * pB[0];
                pA += 1;
                pB += 1;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum;
                    outptr0++;
                }
            }
            else
            {
                outptr[0] = sum;
            }

            outptr += 1;
        }

        pAT += max_kk;
    }
}

// Pick the M/N/K tile sizes for the blocked gemm drivers.
//
// A first guess is derived from the L2 cache size. Each dimension whose extent
// is positive is then shrunk so that the tiles divide it evenly, and every
// result is rounded up to the per-target granularity:
//   aarch64      M 8, N 4, K 8
//   32-bit NEON  M 4, N 4, K 4
//   otherwise    M 2, N 1, K 2
//
// M, N, K            problem extents; a value <= 0 skips the balancing step for that axis
// constant_TILE_*    when > 0, replaces the computed tile (after rounding up to the granularity)
// TILE_M/N/K         outputs
// nT                 thread count; 0 selects get_physical_big_cpu_count()
static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
    // rounding granularity of each tile dimension on this target
#if __aarch64__
    const int unit_M = 8;
    const int unit_N = 4;
    const int unit_K = 8;
#elif __ARM_NEON
    const int unit_M = 4;
    const int unit_N = 4;
    const int unit_K = 4;
#else
    const int unit_M = 2;
    const int unit_N = 1;
    const int unit_K = 2;
#endif

    // resolve optimal tile size from cache size
    const size_t l2_cache_size = get_cpu_level2_cache_size();

    if (nT == 0)
        nT = get_physical_big_cpu_count();

    // initial square-ish guess, rounded down to the granularity but never below one unit
    int tile_size = (int)sqrtf((float)l2_cache_size / 3 / sizeof(float));

    TILE_M = std::max(unit_M, tile_size / unit_M * unit_M);
    TILE_N = std::max(unit_N, tile_size / unit_N * unit_N);
    TILE_K = std::max(unit_K, tile_size / unit_K * unit_K);

    if (K > 0)
    {
        // spread K evenly over the number of K tiles the initial guess needs
        const int nn_K = (K + TILE_K - 1) / TILE_K;
        const int even_K = (K + nn_K - 1) / nn_K;
        TILE_K = std::min(TILE_K, (even_K + unit_K - 1) / unit_K * unit_K);

        if (nn_K == 1)
        {
            // the whole K range fits in one tile, so M and N are re-derived from the final TILE_K
            tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K);

            TILE_M = std::max(unit_M, tile_size / unit_M * unit_M);
            TILE_N = std::max(unit_N, tile_size / unit_N * unit_N);
        }
    }

    TILE_M *= std::min(nT, get_physical_cpu_count());

    if (M > 0)
    {
        const int nn_M = (M + TILE_M - 1) / TILE_M;
        const int even_M = (M + nn_M - 1) / nn_M;
        TILE_M = std::min(TILE_M, (even_M + unit_M - 1) / unit_M * unit_M);
    }

    if (N > 0)
    {
        const int nn_N = (N + TILE_N - 1) / TILE_N;
        const int even_N = (N + nn_N - 1) / nn_N;
        TILE_N = std::min(TILE_N, (even_N + unit_N - 1) / unit_N * unit_N);
    }

    if (nT > 1)
    {
        // cap TILE_M at its per-thread share, rounded up to the granularity
        const int share_M = std::max(1, TILE_M / nT);
        TILE_M = std::min(TILE_M, (share_M + unit_M - 1) / unit_M * unit_M);
    }

    // always take constant TILE_M/N/K value when provided
    if (constant_TILE_M > 0)
        TILE_M = (constant_TILE_M + unit_M - 1) / unit_M * unit_M;

    if (constant_TILE_N > 0)
        TILE_N = (constant_TILE_N + unit_N - 1) / unit_N * unit_N;

    if (constant_TILE_K > 0)
        TILE_K = (constant_TILE_K + unit_K - 1) / unit_K * unit_K;
}

// Blocked fp32 gemm driver for the case where neither A nor B is pre-packed.
//
// B is packed once up front into BT, one tile per (N tile, K tile) pair.
// A is packed per thread into ATX while the first N tile of an M tile is
// processed (j == 0) and the packed data is reused for the remaining N tiles.
// When broadcast_type_C == 3 the matching tile of C is packed into the
// per-thread scratch tile before the K loop and handed to the kernel as C.
//
// Returns 0 on success, -100 when a workspace Mat is empty after creation.
static int gemm_arm(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int broadcast_type_C, int transA, int transB, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    // extent of the non-w axis, counted in scalar elements
    const int A_outer = (A.dims == 3 ? A.c : A.h) * A.elempack;
    const int B_outer = (B.dims == 3 ? B.c : B.h) * B.elempack;

    const int M = transA ? A.w : A_outer;
    const int K = transA ? A_outer : A.w;
    const int N = transB ? B_outer : B.w;

    // NCNN_LOGE("M/N/K = %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // per-thread packed A: one channel per thread, one row per K tile
    Mat ATX(TILE_K * TILE_M, nn_K, nT, 4u, opt.workspace_allocator);
    if (ATX.empty())
        return -100;

    // packed B: one channel per N tile, one row per K tile
    Mat BT(TILE_K * TILE_N, nn_K, nn_N, 4u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    // pack B
    #pragma omp parallel for num_threads(nT)
    for (int tile_id = 0; tile_id < nn_NK; tile_id++)
    {
        const int ppj = tile_id / nn_K;
        const int ppk = tile_id % nn_K;

        const int j = ppj * TILE_N;
        const int k = ppk * TILE_K;

        const int max_jj = std::min(N - j, TILE_N);
        const int max_kk = std::min(K - k, TILE_K);

        Mat BT_tile = BT.channel(ppj).row_range(ppk, 1);

        if (transB)
            pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
        else
            transpose_pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
    }

    // scratch accumulator, one channel per thread, only when the kernel cannot write straight to top_blob
    Mat topT;
    if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
    {
        topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
        if (topT.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int ppi = 0; ppi < nn_M; ppi++)
    {
        // shadowed variable for less openmp task args
        const int A_outer = (A.dims == 3 ? A.c : A.h) * A.elempack;
        const int M = transA ? A.w : A_outer;
        const int K = transA ? A_outer : A.w;

        const int thread_id = get_omp_thread_num();

        const int i = ppi * TILE_M;
        const int max_ii = std::min(M - i, TILE_M);

        Mat topT_tile;
        if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
            topT_tile = topT.channel(thread_id);

        for (int j = 0; j < N; j += TILE_N)
        {
            const int ppj = j / TILE_N;
            const int max_jj = std::min(N - j, TILE_N);
            const bool first_N_tile = (j == 0);

            if (broadcast_type_C == 3)
                pack_A_tile(C, topT_tile, i, max_ii, j, max_jj);

            const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int ppk = k / TILE_K;
                const int max_kk = std::min(K - k, TILE_K);

                // NCNN_LOGE("max_ii/jj/kk = %d %d %d", max_ii, max_jj, max_kk);

                Mat AT_tile = ATX.channel(thread_id).row_range(ppk, 1);

                Mat BT_tile = BT.channel(ppj).row_range(ppk, 1);

                // A only needs packing on the first pass over N
                if (first_N_tile)
                {
                    if (transA)
                        transpose_pack_A_tile(A, AT_tile, i, max_ii, k, max_kk);
                    else
                        pack_A_tile(A, AT_tile, i, max_ii, k, max_kk);
                }

                // true on the last K tile, unless the result still has to be transposed afterwards
                const bool k_end = !output_transpose && k + TILE_K >= K;

                gemm_transB_packed_tile(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end);
            }

            if (output_transpose)
                transpose_unpack_output_tile(topT_tile, top_blob, i, max_ii, j, max_jj);
        }
    }

    return 0;
}

// Blocked fp32 gemm driver for a pre-packed A (AT) and a B packed at run time.
//
// AT is indexed as channel = M tile, row = K tile. B is packed once up front
// into BT, one tile per (N tile, K tile) pair. M and K are supplied by the
// caller; N is read from B.
//
// Returns 0 on success, -100 when a workspace Mat is empty after creation.
static int gemm_AT_arm(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob, int broadcast_type_C, int M, int K, int transB, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    // extent of the non-w axis of B, counted in scalar elements
    const int B_outer = (B.dims == 3 ? B.c : B.h) * B.elempack;
    const int N = transB ? B_outer : B.w;

    // NCNN_LOGE("M/N/K = %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // packed B: one channel per N tile, one row per K tile
    Mat BT(TILE_K * TILE_N, nn_K, nn_N, 4u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    // pack B
    #pragma omp parallel for num_threads(nT)
    for (int tile_id = 0; tile_id < nn_NK; tile_id++)
    {
        const int ppj = tile_id / nn_K;
        const int ppk = tile_id % nn_K;

        const int j = ppj * TILE_N;
        const int k = ppk * TILE_K;

        const int max_jj = std::min(N - j, TILE_N);
        const int max_kk = std::min(K - k, TILE_K);

        Mat BT_tile = BT.channel(ppj).row_range(ppk, 1);

        if (transB)
            pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
        else
            transpose_pack_B_tile(B, BT_tile, j, max_jj, k, max_kk);
    }

    // scratch accumulator, one channel per thread, only when the kernel cannot write straight to top_blob
    Mat topT;
    if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
    {
        topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
        if (topT.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int ppi = 0; ppi < nn_M; ppi++)
    {
        const int i = ppi * TILE_M;
        const int max_ii = std::min(M - i, TILE_M);

        Mat topT_tile;
        if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
            topT_tile = topT.channel(get_omp_thread_num());

        for (int j = 0; j < N; j += TILE_N)
        {
            const int ppj = j / TILE_N;
            const int max_jj = std::min(N - j, TILE_N);

            if (broadcast_type_C == 3)
                pack_A_tile(C, topT_tile, i, max_ii, j, max_jj);

            const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int ppk = k / TILE_K;
                const int max_kk = std::min(K - k, TILE_K);

                // NCNN_LOGE("max_ii/jj/kk = %d %d %d", max_ii, max_jj, max_kk);

                Mat AT_tile = AT.channel(ppi).row_range(ppk, 1);

                Mat BT_tile = BT.channel(ppj).row_range(ppk, 1);

                // true on the last K tile, unless the result still has to be transposed afterwards
                const bool k_end = !output_transpose && k + TILE_K >= K;

                gemm_transB_packed_tile(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end);
            }

            if (output_transpose)
                transpose_unpack_output_tile(topT_tile, top_blob, i, max_ii, j, max_jj);
        }
    }

    return 0;
}

static int gemm_BT_arm(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob, int broadcast_type_C, int N, int K, int transA, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;

    // NCNN_LOGE("M/N/K = %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K);

    int nn_M = (M + TILE_M - 1) / TILE_M;
    // int nn_N = (N + TILE_N - 1) / TILE_N;

    Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 4u, opt.workspace_allocator);
    if (ATX.empty())
        return -100;

    Mat topT;
    if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
    {
        topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
        if (topT.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int ppi = 0; ppi < nn_M; ppi++)
    {
        const int i = ppi * TILE_M;

        // shadowed variable for less openmp task args
        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;

        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile;
        if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
            topT_tile = topT.channel(get_omp_thread_num());

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            if (broadcast_type_C == 3)
            {
                pack_A_tile(C, topT_tile, i, max_ii, j, max_jj);
            }

            const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                // NCNN_LOGE("max_ii/jj/kk = %d %d %d", max_ii, max_jj, max_kk);

                Mat AT_tile = ATX.channel(get_omp_thread_num()).row_range(k / TILE_K, 1);

                Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

                if (j == 0)
                {
                    if (transA)
                    {
                        transpose_pack_A_tile(A, AT_tile, i, max_ii, k, max_kk);
                    }
                    else
                    {
                        pack_A_tile(A, AT_tile, i, max_ii, k, max_kk);
                    }
                }

                bool k_end = !output_transpose && k + TILE_K >= K;

                gemm_transB_packed_tile(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end);
            }

            if (output_transpose)
            {
                transpose_unpack_output_tile(topT_tile, top_blob, i, max_ii, j, max_jj);
            }
        }
    }

    return 0;
}

// Blocked fp32 gemm driver for the case where both A and B are pre-packed.
//
// AT is indexed as channel = M tile, row = K tile; BT as channel = N tile,
// row = K tile. No packing of A or B happens here; only the optional scratch
// accumulator is allocated. M, N and K are supplied by the caller.
//
// Returns 0 on success, -100 when the scratch Mat is empty after creation.
static int gemm_AT_BT_arm(const Mat& AT, const Mat& BT, const Mat& C, Mat& top_blob, int broadcast_type_C, int M, int N, int K, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    // NCNN_LOGE("M/N/K = %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K);

    const int nn_M = (M + TILE_M - 1) / TILE_M;

    // scratch accumulator, one channel per thread, only when the kernel cannot write straight to top_blob
    Mat topT;
    if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
    {
        topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
        if (topT.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int ppi = 0; ppi < nn_M; ppi++)
    {
        const int i = ppi * TILE_M;
        const int max_ii = std::min(M - i, TILE_M);

        Mat topT_tile;
        if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
            topT_tile = topT.channel(get_omp_thread_num());

        for (int j = 0; j < N; j += TILE_N)
        {
            const int ppj = j / TILE_N;
            const int max_jj = std::min(N - j, TILE_N);

            if (broadcast_type_C == 3)
                pack_A_tile(C, topT_tile, i, max_ii, j, max_jj);

            const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int ppk = k / TILE_K;
                const int max_kk = std::min(K - k, TILE_K);

                // NCNN_LOGE("max_ii/jj/kk = %d %d %d", max_ii, max_jj, max_kk);

                Mat AT_tile = AT.channel(ppi).row_range(ppk, 1);

                Mat BT_tile = BT.channel(ppj).row_range(ppk, 1);

                // true on the last K tile, unless the result still has to be transposed afterwards
                const bool k_end = !output_transpose && k + TILE_K >= K;

                gemm_transB_packed_tile(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end);
            }

            if (output_transpose)
                transpose_unpack_output_tile(topT_tile, top_blob, i, max_ii, j, max_jj);
        }
    }

    return 0;
}

int Gemm_arm::create_pipeline(const Option& opt)
{
#if NCNN_INT8
    if (int8_scale_term)
    {
        return create_pipeline_int8(opt);
    }
#endif

#if NCNN_ARM82
    if (cpu_support_arm_asimdhp() && opt.use_fp16_storage)
    {
        if (opt.use_fp16_arithmetic)
            return create_pipeline_fp16sa(opt);
        else
            return create_pipeline_fp16s(opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        return create_pipeline_bf16s(opt);
    }
#endif

#if NCNN_VFPV4
    if (support_fp16_storage && opt.use_fp16_storage)
    {
        return create_pipeline_fp16s(opt);
    }
#endif

    if (constantA)
    {
        const int M = constantM;
        const int K = constantK;

        int TILE_M, TILE_N, TILE_K;
        get_optimal_tile_mnk(M, 0, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, opt.num_threads);

        const int nn_M = (M + TILE_M - 1) / TILE_M;

        AT_data.create(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, 4u, (Allocator*)0);
        if (AT_data.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ppj = 0; ppj < nn_M; ppj++)
        {
            const int i = ppj * TILE_M;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_ii = std::min((M - i), TILE_M);
                const int max_kk = std::min((K - k), TILE_K);

                Mat AT_tile = AT_data.channel(i / TILE_M).row_range(k / TILE_K, 1);

                if (transA)
                {
                    transpose_pack_A_tile(A_data, AT_tile, i, max_ii, k, max_kk);
                }
                else
                {
                    pack_A_tile(A_data, AT_tile, i, max_ii, k, max_kk);
                }
            }
        }

        if (opt.lightmode)
            A_data.release();
    }

    if (constantB)
    {
        const int N = constantN;
        const int K = constantK;

        int TILE_M, TILE_N, TILE_K;
        get_optimal_tile_mnk(0, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, opt.num_threads);

        const int nn_N = (N + TILE_N - 1) / TILE_N;

        BT_data.create(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, (Allocator*)0);
        if (BT_data.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ppj = 0; ppj < nn_N; ppj++)
        {
            const int j = ppj * TILE_N;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_jj = std::min((N - j), TILE_N);
                const int max_kk = std::min((K - k), TILE_K);

                Mat BT_tile = BT_data.channel(j / TILE_N).row_range(k / TILE_K, 1);

                if (transB)
                {
                    pack_B_tile(B_data, BT_tile, j, max_jj, k, max_kk);
                }
                else
                {
                    transpose_pack_B_tile(B_data, BT_tile, j, max_jj, k, max_kk);
                }
            }
        }

        if (opt.lightmode)
            B_data.release();
    }

    if (constantC && constant_broadcast_type_C != -1)
    {
        CT_data = C_data;

#if __ARM_NEON
        if (constant_broadcast_type_C == 3 && opt.use_packing_layout)
        {
            int C_elempack = constantM % 4 == 0 ? 4 : 1;
            convert_packing(C_data, CT_data, C_elempack, opt);
            if (CT_data.empty())
                return -100;
        }
#endif // __ARM_NEON

        // pre-multiply C with beta
        if (beta != 1.f)
        {
            Mat C2;
            C2.create_like(CT_data);
            if (C2.empty())
                return -100;

            const int size = CT_data.total() * CT_data.elempack;
            for (int i = 0; i < size; i++)
            {
                C2[i] = CT_data[i] * beta;
            }

            CT_data = C2;
        }

        if (opt.lightmode)
            C_data.release();
    }

    if (constantA || constantB || constantC)
    {
        nT = opt.num_threads;
    }

    return 0;
}

int Gemm_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
#if NCNN_INT8
    if (int8_scale_term)
    {
        return forward_int8(bottom_blobs, top_blobs, opt);
    }
#endif

    const Mat& bottom_blob = constantA ? AT_data : bottom_blobs[0];
    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    if (cpu_support_arm_asimdhp() && opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blobs, top_blobs, opt);
        else
            return forward_fp16s(bottom_blobs, top_blobs, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blobs, top_blobs, opt);
#endif

#if NCNN_VFPV4
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        return forward_fp16s(bottom_blobs, top_blobs, opt);
    }
#endif

    int M;
    int N;
    if (constantA && constantB)
    {
        M = constantM;
        N = constantN;
    }
    else if (constantA)
    {
        const Mat& B = bottom_blobs[0];
        M = constantM;
        N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;
    }
    else if (constantB)
    {
        const Mat& A = bottom_blobs[0];
        M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        N = constantN;
    }
    else
    {
        const Mat& A = bottom_blobs[0];
        const Mat& B = bottom_blobs[1];
        M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;
    }

    Mat C;
    int broadcast_type_C = 0;
    if (constantC)
    {
        C = CT_data;
        broadcast_type_C = constant_broadcast_type_C;
    }
    else
    {
        if (constantA && constantB)
        {
            C = bottom_blobs.size() == 1 ? bottom_blobs[0] : Mat();
        }
        else if (constantA)
        {
            C = bottom_blobs.size() == 2 ? bottom_blobs[1] : Mat();
        }
        else if (constantB)
        {
            C = bottom_blobs.size() == 2 ? bottom_blobs[1] : Mat();
        }
        else
        {
            C = bottom_blobs.size() == 3 ? bottom_blobs[2] : Mat();
        }

        if (!C.empty())
        {
            if (C.dims == 1 && C.w == 1)
            {
                // scalar
                broadcast_type_C = 0;
            }
            if (C.dims == 1 && C.w * C.elempack == M)
            {
                // M
                // auto broadcast from h to w is the ncnn-style convention
                broadcast_type_C = 1;
            }
            if (C.dims == 1 && C.w * C.elempack == N)
            {
                // N
                broadcast_type_C = 4;
            }
            if (C.dims == 2 && C.w == 1 && C.h * C.elempack == M)
            {
                // Mx1
                broadcast_type_C = 2;
            }
            if (C.dims == 2 && C.w == N && C.h * C.elempack == M)
            {
                // MxN
                broadcast_type_C = 3;
            }
            if (C.dims == 2 && C.w == N && C.h * C.elempack == 1)
            {
                // 1xN
                broadcast_type_C = 4;
            }

            // pre-multiply C with beta
            if (beta != 1.f)
            {
                Mat CT_data;
                CT_data.create_like(C, opt.workspace_allocator);
                if (CT_data.empty())
                    return -100;

                const int size = C.total() * C.elempack;
                for (int i = 0; i < size; i++)
                {
                    CT_data[i] = C[i] * beta;
                }

                C = CT_data;
            }
        }
    }

    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        int outh = output_transpose ? N : M;
        out_elempack = outh % 4 == 0 ? 4 : 1;
    }
#endif // __ARM_NEON
    if (output_elempack)
        out_elempack = output_elempack;
    size_t out_elemsize = 4u * out_elempack;

    Mat& top_blob = top_blobs[0];
    if (output_transpose)
    {
        if (output_N1M)
            top_blob.create(M, 1, N / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        else
            top_blob.create(M, N / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    else
    {
        if (output_N1M)
            top_blob.create(N, 1, M / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        else
            top_blob.create(N, M / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    int _nT = nT ? nT : opt.num_threads;
    if (nT != 0 && opt.num_threads != nT)
    {
        // force num_threads the same as in create_pipeline
        // so we could use pre-packed A/B from the same tile config
        NCNN_LOGE("opt.num_threads %d changed, gemm will use load-time value %d", opt.num_threads, nT);
    }

    int ret = 0;
    if (constantA && constantB)
    {
        ret = gemm_AT_BT_arm(AT_data, BT_data, C, top_blob, broadcast_type_C, constantM, constantN, constantK, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    else if (constantA)
    {
        const Mat& B = bottom_blobs[0];
        ret = gemm_AT_arm(AT_data, B, C, top_blob, broadcast_type_C, constantM, constantK, transB, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    else if (constantB)
    {
        const Mat& A = bottom_blobs[0];
        ret = gemm_BT_arm(A, BT_data, C, top_blob, broadcast_type_C, constantN, constantK, transA, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    else
    {
        const Mat& A = bottom_blobs[0];
        const Mat& B = bottom_blobs[1];
        ret = gemm_arm(A, B, C, top_blob, broadcast_type_C, transA, transB, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    if (ret != 0)
        return ret;

    // multiply top_blob with alpha
    if (alpha != 1.f)
    {
        const int size = top_blob.total() * out_elempack;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < size; i++)
        {
            top_blob[i] *= alpha;
        }
    }

    return 0;
}

#if NCNN_BF16
// Tiled gemm driver for 16-bit (bf16) storage where neither A nor B has been
// packed ahead of time.
//
// M, K and N are derived from the shapes of A and B according to transA and
// transB. B is packed completely up front into BT; A is packed lazily, one
// M tile at a time, into a per-thread slice of ATX.
//
// C / broadcast_type_C describe the optional bias; output_transpose selects
// the transposed write-back; alpha is forwarded to the tile kernel;
// constant_TILE_* are forwarded to the tile-size chooser; nT is the thread
// count used for both the scratch buffers and the OpenMP loops.
//
// Returns 0 on success or -100 when a workspace allocation fails.
static int gemm_arm_bf16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int broadcast_type_C, int transA, int transB, int output_transpose, float alpha, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
    const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
    const int N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;

    // NCNN_LOGE("M/N/K = %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_bf16s_fp16s(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K);

    // number of tiles along each dimension, rounded up
    int nn_M = (M + TILE_M - 1) / TILE_M;
    int nn_N = (N + TILE_N - 1) / TILE_N;
    int nn_K = (K + TILE_K - 1) / TILE_K;

    // ATX: packed A scratch with 2-byte elements, one channel per thread and one row per K tile
    Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator);
    if (ATX.empty())
        return -100;
    // BT: packed B with 2-byte elements, one channel per N tile and one row per K tile
    Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    // pack B
    // the (N tile, K tile) pairs are flattened into one index so the loop parallelizes over both
    #pragma omp parallel for num_threads(nT)
    for (int ppjk = 0; ppjk < nn_NK; ppjk++)
    {
        const int ppj = ppjk / nn_K;
        const int ppk = ppjk % nn_K;

        const int j = ppj * TILE_N;
        const int k = ppk * TILE_K;

        // edge tiles may be smaller than the nominal tile size
        const int max_jj = std::min((N - j), TILE_N);
        const int max_kk = std::min((K - k), TILE_K);

        Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

        if (transB)
        {
            pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
        }
        else
        {
            transpose_pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
        }
    }

    // per-thread scratch with 4-byte elements; only allocated when more than one
    // K tile exists, when C is a full MxN matrix (type 3), or when the output is transposed
    Mat topT;
    if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
    {
        topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
        if (topT.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int ppi = 0; ppi < nn_M; ppi++)
    {
        const int i = ppi * TILE_M;

        // shadowed variable for less openmp task args
        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;

        const int max_ii = std::min((M - i), TILE_M);

        // same condition as the allocation above; topT_tile stays empty otherwise
        Mat topT_tile;
        if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
            topT_tile = topT.channel(get_omp_thread_num());

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            // a full MxN C is packed tile by tile into the scratch buffer
            if (broadcast_type_C == 3)
            {
                pack_A_tile(C, topT_tile, i, max_ii, j, max_jj);
            }

            const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                // NCNN_LOGE("max_ii/jj/kk = %d %d %d", max_ii, max_jj, max_kk);

                Mat AT_tile = ATX.channel(get_omp_thread_num()).row_range(k / TILE_K, 1);

                Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

                // A is packed only while processing the first N tile; the packed
                // rows in this thread's ATX slice are reused for every later j
                if (j == 0)
                {
                    if (transA)
                    {
                        transpose_pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);
                    }
                    else
                    {
                        pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);
                    }
                }

                // k_end is set on the last K tile unless the output is transposed
                // NOTE(review): presumably tells the kernel to store into top_blob directly - confirm
                bool k_end = !output_transpose && k + TILE_K >= K;
                // alpha is only handed to the kernel on the last K tile; earlier tiles get 1.f
                float _alpha = k + TILE_K >= K ? alpha : 1.f;

                gemm_transB_packed_tile_bf16s(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, _alpha, i, max_ii, j, max_jj, k, max_kk, k_end);
            }

            // transposed output is written from the scratch tile after all K tiles are done
            if (output_transpose)
            {
                transpose_unpack_output_tile_fp32_to_bf16(topT_tile, top_blob, i, max_ii, j, max_jj);
            }
        }
    }

    return 0;
}

// Tiled gemm driver for 16-bit (bf16) storage where A has already been packed
// into AT and only B is supplied at runtime.
//
// M and K are given by the caller, N is read from the shape of B according to
// transB. B is packed completely into a workspace buffer first, then every
// M tile is multiplied against all N and K tiles.
//
// Returns 0 on success or -100 when a workspace allocation fails.
static int gemm_AT_arm_bf16s(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob, int broadcast_type_C, int M, int K, int transB, int output_transpose, float alpha, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    const int N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;

    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_bf16s_fp16s(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    // tile counts, rounded up
    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // packed B: one channel per N tile, one row per K tile, 2-byte elements
    Mat BT(TILE_K * TILE_N, nn_K, nn_N, 2u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    // pack B, parallel over the flattened (N tile, K tile) index
    const int nn_NK = nn_N * nn_K;

    #pragma omp parallel for num_threads(nT)
    for (int ppjk = 0; ppjk < nn_NK; ppjk++)
    {
        const int j = (ppjk / nn_K) * TILE_N;
        const int k = (ppjk % nn_K) * TILE_K;

        const int max_jj = std::min((N - j), TILE_N);
        const int max_kk = std::min((K - k), TILE_K);

        Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

        if (!transB)
        {
            transpose_pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
        }
        else
        {
            pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
        }
    }

    // a per-thread 4-byte scratch tile is needed when there are several K tiles,
    // when C is a full MxN matrix, or when the output is transposed
    const bool use_topT = K > TILE_K || broadcast_type_C == 3 || output_transpose;

    Mat topT;
    if (use_topT)
    {
        topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
        if (topT.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int ppi = 0; ppi < nn_M; ppi++)
    {
        const int i = ppi * TILE_M;
        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile;
        if (use_topT)
            topT_tile = topT.channel(get_omp_thread_num());

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            // a full MxN C is repacked tile by tile into the scratch buffer
            if (broadcast_type_C == 3)
            {
                pack_A_tile(C, topT_tile, i, max_ii, j, max_jj);
            }

            const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);
                const bool last_k_tile = k + TILE_K >= K;

                Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1);
                Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

                // alpha is only forwarded on the last K tile
                const bool k_end = !output_transpose && last_k_tile;
                const float _alpha = last_k_tile ? alpha : 1.f;

                gemm_transB_packed_tile_bf16s(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, _alpha, i, max_ii, j, max_jj, k, max_kk, k_end);
            }

            if (output_transpose)
            {
                transpose_unpack_output_tile_fp32_to_bf16(topT_tile, top_blob, i, max_ii, j, max_jj);
            }
        }
    }

    return 0;
}

// Tiled gemm driver for 16-bit (bf16) storage where B has already been packed
// into BT and only A is supplied at runtime.
//
// N and K are given by the caller; M is read from the shape of A according
// to transA. A is packed lazily, one M tile at a time, into a per-thread
// slice of ATX.
//
// Returns 0 on success or -100 when a workspace allocation fails.
static int gemm_BT_arm_bf16s(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob, int broadcast_type_C, int N, int K, int transA, int output_transpose, float alpha, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;

    // NCNN_LOGE("M/N/K = %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_bf16s_fp16s(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K);

    // number of M tiles, rounded up
    int nn_M = (M + TILE_M - 1) / TILE_M;
    // int nn_N = (N + TILE_N - 1) / TILE_N;

    // ATX: packed A scratch with 2-byte elements, one channel per thread and one row per K tile
    // (sized from the K parameter)
    Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator);
    if (ATX.empty())
        return -100;

    // per-thread scratch with 4-byte elements; only allocated when more than one
    // K tile exists, when C is a full MxN matrix (type 3), or when the output is transposed
    Mat topT;
    if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
    {
        topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
        if (topT.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int ppi = 0; ppi < nn_M; ppi++)
    {
        const int i = ppi * TILE_M;

        // shadowed variable for less openmp task args
        // NOTE(review): this K is recomputed from A and hides the K parameter
        // that sized ATX above - the two are presumably expected to match; confirm
        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;

        const int max_ii = std::min((M - i), TILE_M);

        // same condition as the allocation above; topT_tile stays empty otherwise
        Mat topT_tile;
        if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
            topT_tile = topT.channel(get_omp_thread_num());

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            // a full MxN C is packed tile by tile into the scratch buffer
            if (broadcast_type_C == 3)
            {
                pack_A_tile(C, topT_tile, i, max_ii, j, max_jj);
            }

            const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                // NCNN_LOGE("max_ii/jj/kk = %d %d %d", max_ii, max_jj, max_kk);

                Mat AT_tile = ATX.channel(get_omp_thread_num()).row_range(k / TILE_K, 1);

                Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

                // A is packed only while processing the first N tile; the packed
                // rows in this thread's ATX slice are reused for every later j
                if (j == 0)
                {
                    if (transA)
                    {
                        transpose_pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);
                    }
                    else
                    {
                        pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);
                    }
                }

                // k_end is set on the last K tile unless the output is transposed
                // NOTE(review): presumably tells the kernel to store into top_blob directly - confirm
                bool k_end = !output_transpose && k + TILE_K >= K;
                // alpha is only handed to the kernel on the last K tile; earlier tiles get 1.f
                float _alpha = k + TILE_K >= K ? alpha : 1.f;

                gemm_transB_packed_tile_bf16s(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, _alpha, i, max_ii, j, max_jj, k, max_kk, k_end);
            }

            // transposed output is written from the scratch tile after all K tiles are done
            if (output_transpose)
            {
                transpose_unpack_output_tile_fp32_to_bf16(topT_tile, top_blob, i, max_ii, j, max_jj);
            }
        }
    }

    return 0;
}

// Tiled gemm driver for 16-bit (bf16) storage where both A and B have already
// been packed (AT and BT), so no packing work is done here.
//
// M, N and K are supplied by the caller. Each M tile is processed by one
// OpenMP iteration that walks all N tiles and, inside those, all K tiles.
//
// Returns 0 on success or -100 when the scratch allocation fails.
static int gemm_AT_BT_arm_bf16s(const Mat& AT, const Mat& BT, const Mat& C, Mat& top_blob, int broadcast_type_C, int M, int N, int K, int output_transpose, float alpha, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_bf16s_fp16s(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    // number of M tiles, rounded up
    const int nn_M = (M + TILE_M - 1) / TILE_M;

    // a per-thread 4-byte scratch tile is needed when there are several K tiles,
    // when C is a full MxN matrix, or when the output is transposed
    const bool use_topT = K > TILE_K || broadcast_type_C == 3 || output_transpose;

    Mat topT;
    if (use_topT)
    {
        topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
        if (topT.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int ppi = 0; ppi < nn_M; ppi++)
    {
        const int i = ppi * TILE_M;
        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile;
        if (use_topT)
            topT_tile = topT.channel(get_omp_thread_num());

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            // a full MxN C is repacked tile by tile into the scratch buffer
            if (broadcast_type_C == 3)
            {
                pack_A_tile(C, topT_tile, i, max_ii, j, max_jj);
            }

            const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);
                const bool last_k_tile = k + TILE_K >= K;

                Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1);
                Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

                // alpha is only forwarded on the last K tile
                const bool k_end = !output_transpose && last_k_tile;
                const float _alpha = last_k_tile ? alpha : 1.f;

                gemm_transB_packed_tile_bf16s(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, _alpha, i, max_ii, j, max_jj, k, max_kk, k_end);
            }

            if (output_transpose)
            {
                transpose_unpack_output_tile_fp32_to_bf16(topT_tile, top_blob, i, max_ii, j, max_jj);
            }
        }
    }

    return 0;
}

int Gemm_arm::create_pipeline_bf16s(const Option& opt)
{
    if (constantA)
    {
        const int M = constantM;
        const int K = constantK;

        int TILE_M, TILE_N, TILE_K;
        get_optimal_tile_mnk_bf16s_fp16s(M, 0, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, opt.num_threads);

        const int nn_M = (M + TILE_M - 1) / TILE_M;

        AT_data.create(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, 2u, (Allocator*)0);
        if (AT_data.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ppj = 0; ppj < nn_M; ppj++)
        {
            const int i = ppj * TILE_M;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_ii = std::min((M - i), TILE_M);
                const int max_kk = std::min((K - k), TILE_K);

                Mat AT_tile = AT_data.channel(i / TILE_M).row_range(k / TILE_K, 1);

                if (transA)
                {
                    transpose_pack_A_tile_fp32_to_bf16(A_data, AT_tile, i, max_ii, k, max_kk);
                }
                else
                {
                    pack_A_tile_fp32_to_bf16(A_data, AT_tile, i, max_ii, k, max_kk);
                }
            }
        }

        if (opt.lightmode)
            A_data.release();
    }

    if (constantB)
    {
        const int N = constantN;
        const int K = constantK;

        int TILE_M, TILE_N, TILE_K;
        get_optimal_tile_mnk_bf16s_fp16s(0, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, opt.num_threads);

        const int nn_N = (N + TILE_N - 1) / TILE_N;

        BT_data.create(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, (Allocator*)0);
        if (BT_data.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ppj = 0; ppj < nn_N; ppj++)
        {
            const int j = ppj * TILE_N;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_jj = std::min((N - j), TILE_N);
                const int max_kk = std::min((K - k), TILE_K);

                Mat BT_tile = BT_data.channel(j / TILE_N).row_range(k / TILE_K, 1);

                if (transB)
                {
                    pack_B_tile_fp32_to_bf16(B_data, BT_tile, j, max_jj, k, max_kk);
                }
                else
                {
                    transpose_pack_B_tile_fp32_to_bf16(B_data, BT_tile, j, max_jj, k, max_kk);
                }
            }
        }

        if (opt.lightmode)
            B_data.release();
    }

    if (constantC && constant_broadcast_type_C != -1)
    {
        CT_data = C_data;

#if __ARM_NEON
        if (constant_broadcast_type_C == 3 && opt.use_packing_layout)
        {
            int C_elempack = constantM % 4 == 0 ? 4 : 1;
            convert_packing(C_data, CT_data, C_elempack, opt);
            if (CT_data.empty())
                return -100;
        }
#endif // __ARM_NEON

        // pre-multiply C with beta
        if (beta != 1.f)
        {
            Mat C2;
            C2.create_like(CT_data);
            if (C2.empty())
                return -100;

            const int size = CT_data.total() * CT_data.elempack;
            for (int i = 0; i < size; i++)
            {
                C2[i] = CT_data[i] * beta;
            }

            CT_data = C2;
        }

        if (opt.lightmode)
            C_data.release();
    }

    if (constantA || constantB || constantC)
    {
        nT = opt.num_threads;
    }

    return 0;
}

int Gemm_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    int M;
    int N;
    if (constantA && constantB)
    {
        M = constantM;
        N = constantN;
    }
    else if (constantA)
    {
        const Mat& B = bottom_blobs[0];
        M = constantM;
        N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;
    }
    else if (constantB)
    {
        const Mat& A = bottom_blobs[0];
        M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        N = constantN;
    }
    else
    {
        const Mat& A = bottom_blobs[0];
        const Mat& B = bottom_blobs[1];
        M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;
    }

    Mat C;
    int broadcast_type_C = 0;
    if (constantC)
    {
        C = CT_data;
        broadcast_type_C = constant_broadcast_type_C;
    }
    else
    {
        if (constantA && constantB)
        {
            C = bottom_blobs.size() == 1 ? bottom_blobs[0] : Mat();
        }
        else if (constantA)
        {
            C = bottom_blobs.size() == 2 ? bottom_blobs[1] : Mat();
        }
        else if (constantB)
        {
            C = bottom_blobs.size() == 2 ? bottom_blobs[1] : Mat();
        }
        else
        {
            C = bottom_blobs.size() == 3 ? bottom_blobs[2] : Mat();
        }

        if (!C.empty())
        {
            if (C.dims == 1 && C.w == 1)
            {
                // scalar
                broadcast_type_C = 0;
            }
            if (C.dims == 1 && C.w * C.elempack == M)
            {
                // M
                // auto broadcast from h to w is the ncnn-style convention
                broadcast_type_C = 1;
            }
            if (C.dims == 1 && C.w * C.elempack == N)
            {
                // N
                broadcast_type_C = 4;
            }
            if (C.dims == 2 && C.w == 1 && C.h * C.elempack == M)
            {
                // Mx1
                broadcast_type_C = 2;
            }
            if (C.dims == 2 && C.w == N && C.h * C.elempack == M)
            {
                // MxN
                broadcast_type_C = 3;
            }
            if (C.dims == 2 && C.w == N && C.h * C.elempack == 1)
            {
                // 1xN
                broadcast_type_C = 4;
            }

            // cast to fp32
            {
                Option opt_cast = opt;
                opt_cast.blob_allocator = opt.workspace_allocator;

                Mat C_fp32;
                cast_bfloat16_to_float32(C, C_fp32, opt_cast);
                if (C_fp32.empty())
                    return -100;

                C = C_fp32;
            }

            // pre-multiply C with beta
            if (beta != 1.f)
            {
                Mat CT_data;
                CT_data.create_like(C, opt.workspace_allocator);
                if (CT_data.empty())
                    return -100;

                const int size = C.total() * C.elempack;
                for (int i = 0; i < size; i++)
                {
                    CT_data[i] = C[i] * beta;
                }

                C = CT_data;
            }
        }
    }

    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        int outh = output_transpose ? N : M;
        out_elempack = outh % 4 == 0 ? 4 : 1;
    }
    if (output_elempack)
        out_elempack = output_elempack;
    size_t out_elemsize = 2u * out_elempack;

    Mat& top_blob = top_blobs[0];
    if (output_transpose)
    {
        if (output_N1M)
            top_blob.create(M, 1, N / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        else
            top_blob.create(M, N / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    else
    {
        if (output_N1M)
            top_blob.create(N, 1, M / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        else
            top_blob.create(N, M / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    int _nT = nT ? nT : opt.num_threads;
    if (nT != 0 && opt.num_threads != nT)
    {
        // force num_threads the same as in create_pipeline
        // so we could use pre-packed A/B from the same tile config
        NCNN_LOGE("opt.num_threads %d changed, gemm will use load-time value %d", opt.num_threads, nT);
    }

    int ret = 0;
    if (constantA && constantB)
    {
        ret = gemm_AT_BT_arm_bf16s(AT_data, BT_data, C, top_blob, broadcast_type_C, constantM, constantN, constantK, output_transpose, alpha, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    else if (constantA)
    {
        const Mat& B = bottom_blobs[0];
        ret = gemm_AT_arm_bf16s(AT_data, B, C, top_blob, broadcast_type_C, constantM, constantK, transB, output_transpose, alpha, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    else if (constantB)
    {
        const Mat& A = bottom_blobs[0];
        ret = gemm_BT_arm_bf16s(A, BT_data, C, top_blob, broadcast_type_C, constantN, constantK, transA, output_transpose, alpha, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    else
    {
        const Mat& A = bottom_blobs[0];
        const Mat& B = bottom_blobs[1];
        ret = gemm_arm_bf16s(A, B, C, top_blob, broadcast_type_C, transA, transB, output_transpose, alpha, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }

    return ret;
}
#endif // NCNN_BF16

#if NCNN_INT8

#if NCNN_VFPV4
// fp16 <-> int8 tile helpers used by the int8 gemm dispatch functions below.
// Only declared here; the definitions are not in this part of the file.
// NOTE(review): presumably built in a separate VFPv4-enabled translation unit - confirm
extern void compute_A_tile_fp16_int8_scales_vfpv4(const Mat& A, Mat& scales, float B_scale, Mat& out_descales, int i, int max_ii);
extern void pack_A_tile_fp16_to_int8_vfpv4(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales);
extern void transpose_compute_A_tile_fp16_int8_scales_vfpv4(const Mat& A, Mat& scales, float B_scale, Mat& out_descales, int i, int max_ii);
extern void transpose_pack_A_tile_fp16_to_int8_vfpv4(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales);
extern void compute_B_fp16_int8_scale_vfpv4(const Mat& B, float& scale);
extern void pack_B_tile_fp16_to_int8_vfpv4(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale);
extern void transpose_pack_B_tile_fp16_to_int8_vfpv4(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale);
extern void unpack_output_tile_int32_to_fp16_vfpv4(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta);
extern void transpose_unpack_output_tile_int32_to_fp16_vfpv4(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta);
#endif

// Dispatch the int8 scale computation for an A tile to the helper matching
// the storage type of A: 16-bit elements with input_elemtype 2 use the fp16
// helper, 16-bit elements with input_elemtype 3 use the bf16 helper, and
// everything else falls through to the fp32 helper. All other arguments are
// forwarded unchanged.
static void compute_A_tile_int8_scales(const Mat& A, Mat& scales, float B_scale, Mat& out_descales, int i, int max_ii, int input_elemtype)
{
#if NCNN_VFPV4 || NCNN_BF16
    if (A.elembits() == 16)
    {
#if NCNN_VFPV4
        if (input_elemtype == 2)
        {
            compute_A_tile_fp16_int8_scales_vfpv4(A, scales, B_scale, out_descales, i, max_ii);
            return;
        }
#endif

#if NCNN_BF16
        if (input_elemtype == 3)
        {
            compute_A_tile_bf16_int8_scales(A, scales, B_scale, out_descales, i, max_ii);
            return;
        }
#endif
    }
#endif

    // fp32 fallback
    compute_A_tile_fp32_int8_scales(A, scales, B_scale, out_descales, i, max_ii);
}

// Dispatch the int8 scale computation for a transposed A tile to the helper
// matching the storage type of A: 16-bit elements with input_elemtype 2 use
// the fp16 helper, 16-bit elements with input_elemtype 3 use the bf16 helper,
// and everything else falls through to the fp32 helper. All other arguments
// are forwarded unchanged.
static void transpose_compute_A_tile_int8_scales(const Mat& A, Mat& scales, float B_scale, Mat& out_descales, int i, int max_ii, int input_elemtype)
{
#if NCNN_VFPV4 || NCNN_BF16
    if (A.elembits() == 16)
    {
#if NCNN_VFPV4
        if (input_elemtype == 2)
        {
            transpose_compute_A_tile_fp16_int8_scales_vfpv4(A, scales, B_scale, out_descales, i, max_ii);
            return;
        }
#endif

#if NCNN_BF16
        if (input_elemtype == 3)
        {
            transpose_compute_A_tile_bf16_int8_scales(A, scales, B_scale, out_descales, i, max_ii);
            return;
        }
#endif
    }
#endif

    // fp32 fallback
    transpose_compute_A_tile_fp32_int8_scales(A, scales, B_scale, out_descales, i, max_ii);
}

static void pack_A_tile_quantize(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales, int input_elemtype)
{
#if NCNN_VFPV4
    if (A.elembits() == 16 && input_elemtype == 2)
    {
        pack_A_tile_fp16_to_int8_vfpv4(A, AT, i, max_ii, k, max_kk, scales);
        return;
    }
#endif

#if NCNN_BF16
    if (A.elembits() == 16 && input_elemtype == 3)
    {
        pack_A_tile_bf16_to_int8(A, AT, i, max_ii, k, max_kk, scales);
        return;
    }
#endif

    pack_A_tile_fp32_to_int8(A, AT, i, max_ii, k, max_kk, scales);
}

static void transpose_pack_A_tile_quantize(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales, int input_elemtype)
{
#if NCNN_VFPV4
    if (A.elembits() == 16 && input_elemtype == 2)
    {
        transpose_pack_A_tile_fp16_to_int8_vfpv4(A, AT, i, max_ii, k, max_kk, scales);
        return;
    }
#endif

#if NCNN_BF16
    if (A.elembits() == 16 && input_elemtype == 3)
    {
        transpose_pack_A_tile_bf16_to_int8(A, AT, i, max_ii, k, max_kk, scales);
        return;
    }
#endif

    transpose_pack_A_tile_fp32_to_int8(A, AT, i, max_ii, k, max_kk, scales);
}

static void compute_B_int8_scale(const Mat& B, float& scale, int input_elemtype)
{
#if NCNN_VFPV4
    if (B.elembits() == 16 && input_elemtype == 2)
    {
        compute_B_fp16_int8_scale_vfpv4(B, scale);
        return;
    }
#endif

#if NCNN_BF16
    if (B.elembits() == 16 && input_elemtype == 3)
    {
        compute_B_bf16_int8_scale(B, scale);
        return;
    }
#endif

    compute_B_fp32_int8_scale(B, scale);
}

static void pack_B_tile_quantize(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale, int input_elemtype)
{
#if NCNN_VFPV4
    if (B.elembits() == 16 && input_elemtype == 2)
    {
        pack_B_tile_fp16_to_int8_vfpv4(B, BT, j, max_jj, k, max_kk, scale);
        return;
    }
#endif

#if NCNN_BF16
    if (B.elembits() == 16 && input_elemtype == 3)
    {
        pack_B_tile_bf16_to_int8(B, BT, j, max_jj, k, max_kk, scale);
        return;
    }
#endif

    pack_B_tile_fp32_to_int8(B, BT, j, max_jj, k, max_kk, scale);
}

static void transpose_pack_B_tile_quantize(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale, int input_elemtype)
{
#if NCNN_VFPV4
    if (B.elembits() == 16 && input_elemtype == 2)
    {
        transpose_pack_B_tile_fp16_to_int8_vfpv4(B, BT, j, max_jj, k, max_kk, scale);
        return;
    }
#endif

#if NCNN_BF16
    if (B.elembits() == 16 && input_elemtype == 3)
    {
        transpose_pack_B_tile_bf16_to_int8(B, BT, j, max_jj, k, max_kk, scale);
        return;
    }
#endif

    transpose_pack_B_tile_fp32_to_int8(B, BT, j, max_jj, k, max_kk, scale);
}

static void unpack_output_tile_dequantize(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta, int output_elemtype)
{
#if NCNN_VFPV4
    if (top_blob.elembits() == 16 && output_elemtype == 2)
    {
        unpack_output_tile_int32_to_fp16_vfpv4(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
        return;
    }
#endif

#if NCNN_BF16
    if (top_blob.elembits() == 16 && output_elemtype == 3)
    {
        unpack_output_tile_int32_to_bf16(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
        return;
    }
#endif

    unpack_output_tile_int32_to_fp32(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
}

static void transpose_unpack_output_tile_dequantize(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta, int output_elemtype)
{
#if NCNN_VFPV4
    if (top_blob.elembits() == 16 && output_elemtype == 2)
    {
        transpose_unpack_output_tile_int32_to_fp16_vfpv4(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
        return;
    }
#endif

#if NCNN_BF16
    if (top_blob.elembits() == 16 && output_elemtype == 3)
    {
        transpose_unpack_output_tile_int32_to_bf16(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
        return;
    }
#endif

    transpose_unpack_output_tile_int32_to_fp32(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
}

// Bundle of loop-invariant settings for the int8 gemm drivers.  The drivers
// fill it with positional aggregate initialization and read the fields back
// inside their OpenMP loops, so the member order below must not change.
struct gemm_arm_int8_omp_args
{
    // tile extents along M, N and K
    int TILE_M, TILE_N, TILE_K;

    // how C is broadcast onto the output
    int broadcast_type_C;

    // layout flags
    int transA, output_transpose;

    // scalar factors handed to the output unpack step
    float alpha, beta;

    // storage types, 1=fp32 2=fp16 3=bf16
    int input_elemtype, output_elemtype;
};

// Int8 gemm driver for the case where both A and B are runtime inputs.
//
// B is quantized with a single scale and packed up front; A is quantized and
// packed tile by tile inside the M loop, with its scales kept in a Mat that
// has one float entry per M index.  Each int32 accumulator tile is then
// handed to the unpack helper together with C, alpha, beta and the output
// descales.
//
//   A, B              input matrices, storage type described by input_elemtype
//   C                 optional addend, interpreted according to broadcast_type_C
//   top_blob          pre-allocated output, storage type described by output_elemtype
//   transA / transB   layout flags of A and B
//   output_transpose  selects the transposed unpack helper
//   constant_TILE_*   tile hints forwarded to get_optimal_tile_mnk_int8
//   nT                thread count for the OpenMP loops and per-thread buffers
//
// Returns 0 on success and -100 when a workspace allocation fails.
static int gemm_arm_int8(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int broadcast_type_C, int transA, int transB, int output_transpose, float alpha, float beta, int input_elemtype, int output_elemtype, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    // NCNN_LOGE("gemm_arm_int8");

    // problem size derived from the blob shapes; the row count is multiplied
    // by elempack because rows may be stored packed
    const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
    const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
    const int N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;

    // NCNN_LOGE("M/N/K = %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_int8(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K);

    // number of tiles along each dimension (rounded up)
    int nn_M = (M + TILE_M - 1) / TILE_M;
    int nn_N = (N + TILE_N - 1) / TILE_N;
    int nn_K = (K + TILE_K - 1) / TILE_K;

    // packed int8 A: one channel per thread, one row per K tile
    Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 1u, opt.workspace_allocator);
    if (ATX.empty())
        return -100;
    // packed int8 B: one channel per N tile, one row per K tile
    Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 1u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    // quantization scales of A, one float per M index, filled inside the M loop
    Mat A_int8_scales(M, 4u, opt.workspace_allocator);
    if (A_int8_scales.empty())
        return -100;

    // dynamic quantize B
    float B_int8_scale;
    compute_B_int8_scale(B, B_int8_scale, input_elemtype);

    // const float output_descale = 1.f / (A_int8_scale * B_int8_scale);
    // dequantization factors, one float per M index, filled together with A_int8_scales
    Mat output_descales(M, 4u, opt.workspace_allocator);
    if (output_descales.empty())
        return -100;

    // NCNN_LOGE("arm ds %f %f", 1/A_int8_scale, 1/B_int8_scale);

    // pack B
    // the flattened index walks every (N tile, K tile) pair so all tiles can
    // be packed in parallel
    #pragma omp parallel for num_threads(nT)
    for (int ppjk = 0; ppjk < nn_NK; ppjk++)
    {
        const int ppj = ppjk / nn_K;
        const int ppk = ppjk % nn_K;

        const int j = ppj * TILE_N;
        const int k = ppk * TILE_K;

        // edge tiles may be smaller than the nominal tile size
        const int max_jj = std::min((N - j), TILE_N);
        const int max_kk = std::min((K - k), TILE_K);

        Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

        // note the inversion: the plain packer is used when transB is set
        if (transB)
            pack_B_tile_quantize(B, BT_tile, j, max_jj, k, max_kk, B_int8_scale, input_elemtype);
        else
            transpose_pack_B_tile_quantize(B, BT_tile, j, max_jj, k, max_kk, B_int8_scale, input_elemtype);
    }

    // int32 accumulator tile, one channel per thread
    Mat topT(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
    if (topT.empty())
        return -100;

    const struct gemm_arm_int8_omp_args args = {TILE_M, TILE_N, TILE_K, broadcast_type_C, transA, output_transpose, alpha, beta, input_elemtype, output_elemtype};

    // one iteration per M tile; each thread works in its own ATX / topT channel
    #pragma omp parallel for num_threads(nT)
    for (int ppi = 0; ppi < nn_M; ppi++)
    {
        // shadowed variable for less openmp task args
        const int TILE_M = args.TILE_M;
        const int TILE_N = args.TILE_N;
        const int TILE_K = args.TILE_K;
        const int broadcast_type_C = args.broadcast_type_C;
        const int transA = args.transA;
        const int output_transpose = args.output_transpose;
        const float alpha = args.alpha;
        const float beta = args.beta;
        const int input_elemtype = args.input_elemtype;
        const int output_elemtype = args.output_elemtype;

        // M and K are recomputed from A rather than captured, same formulas as above
        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;

        const int i = ppi * TILE_M;

        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile = topT.channel(get_omp_thread_num());

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                // NCNN_LOGE("max_ii/jj/kk = %d %d %d", max_ii, max_jj, max_kk);

                Mat AT_tile = ATX.channel(get_omp_thread_num()).row_range(k / TILE_K, 1);

                Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

                // A is quantized and packed only during the first pass over N;
                // ATX keeps a row per K tile, so later j iterations reuse it
                if (j == 0)
                {
                    // scales for this M tile are computed once, before its first pack
                    if (k == 0)
                    {
                        if (transA)
                            transpose_compute_A_tile_int8_scales(A, A_int8_scales, B_int8_scale, output_descales, i, max_ii, input_elemtype);
                        else
                            compute_A_tile_int8_scales(A, A_int8_scales, B_int8_scale, output_descales, i, max_ii, input_elemtype);

                        // NCNN_LOGE("A_int8_scales %f  B_int8_scale %f", A_int8_scales[0], B_int8_scale);
                    }

                    if (transA)
                        transpose_pack_A_tile_quantize(A, AT_tile, i, max_ii, k, max_kk, A_int8_scales, input_elemtype);
                    else
                        pack_A_tile_quantize(A, AT_tile, i, max_ii, k, max_kk, A_int8_scales, input_elemtype);
                }

                // NOTE(review): presumably accumulates over k into topT_tile, using k to
                // tell the first K tile apart - confirm against the kernel
                gemm_transB_packed_tile_int8(AT_tile, BT_tile, topT_tile, i, max_ii, j, max_jj, k, max_kk);
            }

            // all K tiles done for this (i, j) tile: write it to top_blob
            if (output_transpose)
                transpose_unpack_output_tile_dequantize(topT_tile, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, output_descales, alpha, beta, output_elemtype);
            else
                unpack_output_tile_dequantize(topT_tile, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, output_descales, alpha, beta, output_elemtype);
        }
    }

    return 0;
}

// Int8 gemm driver for a constant, pre-packed A and a runtime B.
//
// AT holds the already packed int8 A tiles (one channel per M tile, one row
// per K tile) and A_int8_scales its scales, indexed by M.  B is quantized with
// a single scale and packed here.  M and K are passed in because they cannot
// be read from the packed AT.
//
// Returns 0 on success and -100 when a workspace allocation fails.
static int gemm_AT_arm_int8(const Mat& AT, const Mat& A_int8_scales, const Mat& B, const Mat& C, Mat& top_blob, int broadcast_type_C, int M, int K, int transB, int output_transpose, float alpha, float beta, int input_elemtype, int output_elemtype, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    // NCNN_LOGE("gemm_AT_arm_int8");

    // N from the shape of B; the row count is multiplied by elempack because
    // rows may be stored packed
    const int N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;

    // NCNN_LOGE("M/N/K = %d %d %d", M, N, K);

    // NOTE(review): AT was packed with a tile config chosen at pipeline creation;
    // this call is expected to reproduce the same TILE_M / TILE_K - confirm
    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_int8(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K);

    // number of tiles along each dimension (rounded up)
    int nn_M = (M + TILE_M - 1) / TILE_M;
    int nn_N = (N + TILE_N - 1) / TILE_N;
    int nn_K = (K + TILE_K - 1) / TILE_K;

    // packed int8 B: one channel per N tile, one row per K tile
    Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 1u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    // dynamic quantize B
    float B_int8_scale;
    compute_B_int8_scale(B, B_int8_scale, input_elemtype);

    // NCNN_LOGE("%.4f %.4f", A_int8_scale, B_int8_scale);

    // const float output_descale = 1.f / (A_int8_scale * B_int8_scale);
    // dequantization factors, one float per M index
    Mat output_descales(M, 4u, opt.workspace_allocator);
    if (output_descales.empty())
        return -100;

    // NOTE(review): a zero scale product would yield inf here - presumably the
    // scale helpers never produce zero, confirm
    for (int i = 0; i < M; i++)
    {
        output_descales[i] = 1.f / (A_int8_scales[i] * B_int8_scale);
    }

    // pack B
    // the flattened index walks every (N tile, K tile) pair so all tiles can
    // be packed in parallel
    #pragma omp parallel for num_threads(nT)
    for (int ppjk = 0; ppjk < nn_NK; ppjk++)
    {
        const int ppj = ppjk / nn_K;
        const int ppk = ppjk % nn_K;

        const int j = ppj * TILE_N;
        const int k = ppk * TILE_K;

        // edge tiles may be smaller than the nominal tile size
        const int max_jj = std::min((N - j), TILE_N);
        const int max_kk = std::min((K - k), TILE_K);

        Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

        // note the inversion: the plain packer is used when transB is set
        if (transB)
            pack_B_tile_quantize(B, BT_tile, j, max_jj, k, max_kk, B_int8_scale, input_elemtype);
        else
            transpose_pack_B_tile_quantize(B, BT_tile, j, max_jj, k, max_kk, B_int8_scale, input_elemtype);
    }

    // int32 accumulator tile, one channel per thread
    Mat topT(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
    if (topT.empty())
        return -100;

    // the transA slot is fixed to 0; it is not read back in the loop below
    const struct gemm_arm_int8_omp_args args = {TILE_M, TILE_N, TILE_K, broadcast_type_C, 0, output_transpose, alpha, beta, input_elemtype, output_elemtype};

    // one iteration per M tile; each thread accumulates in its own topT channel
    #pragma omp parallel for num_threads(nT)
    for (int ppi = 0; ppi < nn_M; ppi++)
    {
        // shadowed variable for less openmp task args
        const int TILE_M = args.TILE_M;
        const int TILE_N = args.TILE_N;
        const int TILE_K = args.TILE_K;
        const int broadcast_type_C = args.broadcast_type_C;
        const int output_transpose = args.output_transpose;
        const float alpha = args.alpha;
        const float beta = args.beta;
        const int output_elemtype = args.output_elemtype;

        const int i = ppi * TILE_M;

        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile = topT.channel(get_omp_thread_num());

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                // NCNN_LOGE("max_ii/jj/kk = %d %d %d", max_ii, max_jj, max_kk);

                // A tile comes straight from the pre-packed constant data
                Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1);

                Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

                // NOTE(review): presumably accumulates over k into topT_tile - confirm
                gemm_transB_packed_tile_int8(AT_tile, BT_tile, topT_tile, i, max_ii, j, max_jj, k, max_kk);
            }

            // all K tiles done for this (i, j) tile: write it to top_blob
            if (output_transpose)
                transpose_unpack_output_tile_dequantize(topT_tile, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, output_descales, alpha, beta, output_elemtype);
            else
                unpack_output_tile_dequantize(topT_tile, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, output_descales, alpha, beta, output_elemtype);
        }
    }

    return 0;
}

// Int8 gemm driver for a runtime A and a constant, pre-packed B.
//
// BT holds the already packed int8 B tiles (one channel per N tile, one row
// per K tile) and B_int8_scale its single scale.  A is quantized and packed
// tile by tile inside the M loop.  N and K are passed in because they cannot
// be read from the packed BT.
//
// Returns 0 on success and -100 when a workspace allocation fails.
static int gemm_BT_arm_int8(const Mat& A, const Mat& BT, float B_int8_scale, const Mat& C, Mat& top_blob, int broadcast_type_C, int N, int K, int transA, int output_transpose, float alpha, float beta, int input_elemtype, int output_elemtype, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    // NCNN_LOGE("gemm_BT_arm_int8");

    // M from the shape of A; the row count is multiplied by elempack because
    // rows may be stored packed
    const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;

    // NCNN_LOGE("M/N/K = %d %d %d", M, N, K);

    // NOTE(review): BT was packed with a tile config chosen at pipeline creation;
    // this call is expected to reproduce the same TILE_N / TILE_K - confirm
    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_int8(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K);

    int nn_M = (M + TILE_M - 1) / TILE_M;
    // int nn_N = (N + TILE_N - 1) / TILE_N;

    // quantization scales of A, one float per M index, filled inside the M loop
    Mat A_int8_scales(M, 4u, opt.workspace_allocator);
    if (A_int8_scales.empty())
        return -100;

    // const float output_descale = 1.f / (A_int8_scale * B_int8_scale);
    // dequantization factors, one float per M index, filled together with A_int8_scales
    Mat output_descales(M, 4u, opt.workspace_allocator);
    if (output_descales.empty())
        return -100;

    // NCNN_LOGE("scale %.4f  %.4f", A_int8_scale, B_int8_scale);

    // packed int8 A: one channel per thread, one row per K tile
    Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 1u, opt.workspace_allocator);
    if (ATX.empty())
        return -100;

    // int32 accumulator tile, one channel per thread
    Mat topT(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
    if (topT.empty())
        return -100;

    const struct gemm_arm_int8_omp_args args = {TILE_M, TILE_N, TILE_K, broadcast_type_C, transA, output_transpose, alpha, beta, input_elemtype, output_elemtype};

    // one iteration per M tile; each thread works in its own ATX / topT channel
    #pragma omp parallel for num_threads(nT)
    for (int ppi = 0; ppi < nn_M; ppi++)
    {
        // shadowed variable for less openmp task args
        const int TILE_M = args.TILE_M;
        const int TILE_N = args.TILE_N;
        const int TILE_K = args.TILE_K;
        const int broadcast_type_C = args.broadcast_type_C;
        const int transA = args.transA;
        const int output_transpose = args.output_transpose;
        const float alpha = args.alpha;
        const float beta = args.beta;
        const int input_elemtype = args.input_elemtype;
        const int output_elemtype = args.output_elemtype;

        const int i = ppi * TILE_M;

        // shadowed variable for less openmp task args
        // NOTE(review): this K is derived from A and hides the K parameter that sized
        // ATX above; the two are presumably equal - confirm against the caller
        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;

        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile = topT.channel(get_omp_thread_num());

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                // NCNN_LOGE("max_ii/jj/kk = %d %d %d", max_ii, max_jj, max_kk);

                Mat AT_tile = ATX.channel(get_omp_thread_num()).row_range(k / TILE_K, 1);

                // B tile comes straight from the pre-packed constant data
                Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

                // A is quantized and packed only during the first pass over N;
                // ATX keeps a row per K tile, so later j iterations reuse it
                if (j == 0)
                {
                    // scales for this M tile are computed once, before its first pack
                    if (k == 0)
                    {
                        if (transA)
                            transpose_compute_A_tile_int8_scales(A, A_int8_scales, B_int8_scale, output_descales, i, max_ii, input_elemtype);
                        else
                            compute_A_tile_int8_scales(A, A_int8_scales, B_int8_scale, output_descales, i, max_ii, input_elemtype);

                        // NCNN_LOGE("A_int8_scales %f  B_int8_scale %f", A_int8_scales[0], B_int8_scale);
                    }

                    if (transA)
                        transpose_pack_A_tile_quantize(A, AT_tile, i, max_ii, k, max_kk, A_int8_scales, input_elemtype);
                    else
                        pack_A_tile_quantize(A, AT_tile, i, max_ii, k, max_kk, A_int8_scales, input_elemtype);
                }

                // NOTE(review): presumably accumulates over k into topT_tile - confirm
                gemm_transB_packed_tile_int8(AT_tile, BT_tile, topT_tile, i, max_ii, j, max_jj, k, max_kk);
            }

            // all K tiles done for this (i, j) tile: write it to top_blob
            if (output_transpose)
                transpose_unpack_output_tile_dequantize(topT_tile, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, output_descales, alpha, beta, output_elemtype);
            else
                unpack_output_tile_dequantize(topT_tile, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, output_descales, alpha, beta, output_elemtype);
        }
    }

    return 0;
}

// Int8 gemm driver for constant, pre-packed A and B.
//
// AT / BT hold the already packed int8 tiles (AT: one channel per M tile,
// BT: one channel per N tile, both with one row per K tile), A_int8_scales
// the scales of A indexed by M and B_int8_scale the single scale of B.
// Nothing is quantized here; only the multiplication and the output unpack
// are performed.  M, N and K are passed in explicitly.
//
// Returns 0 on success and -100 when a workspace allocation fails.
static int gemm_AT_BT_arm_int8(const Mat& AT, const Mat& A_int8_scales, const Mat& BT, float B_int8_scale, const Mat& C, Mat& top_blob, int broadcast_type_C, int M, int N, int K, int output_transpose, float alpha, float beta, int input_elemtype, int output_elemtype, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    // NCNN_LOGE("gemm_AT_BT_arm_int8");

    // NCNN_LOGE("M/N/K = %d %d %d", M, N, K);

    // NOTE(review): AT and BT were packed with tile configs chosen at pipeline
    // creation; this call is expected to reproduce the same tiling - confirm
    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_int8(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K);

    int nn_M = (M + TILE_M - 1) / TILE_M;
    // int nn_N = (N + TILE_N - 1) / TILE_N;

    // const float output_descale = 1.f / (A_int8_scale * B_int8_scale);
    // dequantization factors, one float per M index
    Mat output_descales(M, 4u, opt.workspace_allocator);
    if (output_descales.empty())
        return -100;

    // NOTE(review): a zero scale product would yield inf here - presumably the
    // stored scales are never zero, confirm
    for (int i = 0; i < M; i++)
    {
        output_descales[i] = 1.f / (A_int8_scales[i] * B_int8_scale);
    }

    // int32 accumulator tile, one channel per thread
    Mat topT(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
    if (topT.empty())
        return -100;

    // the transA slot is fixed to 0; it is not read back in the loop below
    const struct gemm_arm_int8_omp_args args = {TILE_M, TILE_N, TILE_K, broadcast_type_C, 0, output_transpose, alpha, beta, input_elemtype, output_elemtype};

    // one iteration per M tile; each thread accumulates in its own topT channel
    #pragma omp parallel for num_threads(nT)
    for (int ppi = 0; ppi < nn_M; ppi++)
    {
        // shadowed variable for less openmp task args
        const int TILE_M = args.TILE_M;
        const int TILE_N = args.TILE_N;
        const int TILE_K = args.TILE_K;
        const int broadcast_type_C = args.broadcast_type_C;
        const int output_transpose = args.output_transpose;
        const float alpha = args.alpha;
        const float beta = args.beta;
        const int output_elemtype = args.output_elemtype;

        const int i = ppi * TILE_M;

        // edge tiles may be smaller than the nominal tile size
        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile = topT.channel(get_omp_thread_num());

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                // NCNN_LOGE("max_ii/jj/kk = %d %d %d", max_ii, max_jj, max_kk);

                // both operand tiles come straight from the pre-packed constant data
                Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1);

                Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

                // NOTE(review): presumably accumulates over k into topT_tile - confirm
                gemm_transB_packed_tile_int8(AT_tile, BT_tile, topT_tile, i, max_ii, j, max_jj, k, max_kk);
            }

            // all K tiles done for this (i, j) tile: write it to top_blob
            if (output_transpose)
                transpose_unpack_output_tile_dequantize(topT_tile, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, output_descales, alpha, beta, output_elemtype);
            else
                unpack_output_tile_dequantize(topT_tile, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, output_descales, alpha, beta, output_elemtype);
        }
    }

    return 0;
}

// Prepares the int8 gemm path.
//
//  1. picks input_elemtype (1=fp32 2=fp16 3=bf16) from the cpu features and opt
//  2. packs a constant A into AT_data and a constant B into BT_data, tile by tile
//  3. stores a constant C in CT_data, cast to the selected storage type
//  4. remembers opt.num_threads in nT when any operand is constant
//
// Returns 0 on success and -100 when a packed buffer cannot be allocated.
int Gemm_arm::create_pipeline_int8(const Option& opt)
{
    // storage type selection
    //
    //   armv8.2                  + use-fp16              = fp16
    //   armv8.2                  + no-fp16 + use-bf16    = bf16
    //   armv8.2                  + no-fp16 + no-bf16     = fp32
    //   armv8.0/armv7-vfpv4      + use-bf16              = bf16
    //   armv8.0/armv7-vfpv4      + no-bf16 + use-fp16    = fp16
    //   armv8.0/armv7-vfpv4      + no-fp16 + no-bf16     = fp32
    //   armv7                    + use-bf16              = bf16
    //   armv7                    + no-bf16               = fp32
    {
        bool want_fp16 = false;
        bool want_bf16 = false;

#if NCNN_ARM82
        if (ncnn::cpu_support_arm_asimdhp())
        {
            // fp16 takes precedence over bf16
            want_fp16 = opt.use_fp16_storage;
            want_bf16 = !opt.use_fp16_storage && opt.use_bf16_storage;
        }
        else
#endif
#if NCNN_VFPV4
            if (ncnn::cpu_support_arm_vfpv4())
            {
                // bf16 takes precedence over fp16
                want_bf16 = opt.use_bf16_storage;
                want_fp16 = !opt.use_bf16_storage && opt.use_fp16_storage;
            }
            else
#endif
            {
                want_bf16 = opt.use_bf16_storage;
            }

        input_elemtype = want_bf16 ? 3 : (want_fp16 ? 2 : 1);

        // NCNN_LOGE("input_elemtype = %d", input_elemtype);
    }

    if (constantA)
    {
        const int M = constantM;
        const int K = constantK;

        int TILE_M, TILE_N, TILE_K;
        get_optimal_tile_mnk_int8(M, 0, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, opt.num_threads);

        const int nn_M = (M + TILE_M - 1) / TILE_M;
        const int nn_K = (K + TILE_K - 1) / TILE_K;

        // one channel per M tile, one row per K tile
        AT_data.create(TILE_K * TILE_M, nn_K, nn_M, 1u, (Allocator*)0);
        if (AT_data.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ppi = 0; ppi < nn_M; ppi++)
        {
            const int i = ppi * TILE_M;
            const int max_ii = std::min((M - i), TILE_M);

            for (int ppk = 0; ppk < nn_K; ppk++)
            {
                const int k = ppk * TILE_K;
                const int max_kk = std::min((K - k), TILE_K);

                Mat AT_tile = AT_data.channel(ppi).row_range(ppk, 1);

                if (transA)
                    transpose_pack_A_tile_int8(A_data, AT_tile, i, max_ii, k, max_kk);
                else
                    pack_A_tile_int8(A_data, AT_tile, i, max_ii, k, max_kk);
            }
        }

        // the unpacked weights are no longer needed in light mode
        if (opt.lightmode)
            A_data.release();
    }

    if (constantB)
    {
        const int N = constantN;
        const int K = constantK;

        int TILE_M, TILE_N, TILE_K;
        get_optimal_tile_mnk_int8(0, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, opt.num_threads);

        const int nn_N = (N + TILE_N - 1) / TILE_N;
        const int nn_K = (K + TILE_K - 1) / TILE_K;

        // one channel per N tile, one row per K tile
        BT_data.create(TILE_K * TILE_N, nn_K, nn_N, 1u, (Allocator*)0);
        if (BT_data.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ppj = 0; ppj < nn_N; ppj++)
        {
            const int j = ppj * TILE_N;
            const int max_jj = std::min((N - j), TILE_N);

            for (int ppk = 0; ppk < nn_K; ppk++)
            {
                const int k = ppk * TILE_K;
                const int max_kk = std::min((K - k), TILE_K);

                Mat BT_tile = BT_data.channel(ppj).row_range(ppk, 1);

                // the plain packer is the one used for a transposed B
                if (transB)
                    pack_B_tile_int8(B_data, BT_tile, j, max_jj, k, max_kk);
                else
                    transpose_pack_B_tile_int8(B_data, BT_tile, j, max_jj, k, max_kk);
            }
        }

        // the unpacked weights are no longer needed in light mode
        if (opt.lightmode)
            B_data.release();
    }

    if (constantC && constant_broadcast_type_C != -1)
    {
        CT_data = C_data;

        // keep C in the same storage type as the gemm inputs
#if NCNN_VFPV4
        if (input_elemtype == 2)
        {
            Mat C_fp16;
            ncnn::cast_float32_to_float16(CT_data, C_fp16);
            CT_data = C_fp16;
        }
#endif
#if NCNN_BF16
        if (input_elemtype == 3)
        {
            Mat C_bf16;
            ncnn::cast_float32_to_bfloat16(CT_data, C_bf16);
            CT_data = C_bf16;
        }
#endif

        if (opt.lightmode)
            C_data.release();
    }

    // remember the thread count the constant operands were prepared with
    if (constantA || constantB || constantC)
    {
        nT = opt.num_threads;
    }

    return 0;
}

int Gemm_arm::forward_int8(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    int M;
    int N;
    if (constantA && constantB)
    {
        M = constantM;
        N = constantN;
    }
    else if (constantA)
    {
        const Mat& B = bottom_blobs[0];
        M = constantM;
        N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;
    }
    else if (constantB)
    {
        const Mat& A = bottom_blobs[0];
        M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        N = constantN;
    }
    else
    {
        const Mat& A = bottom_blobs[0];
        const Mat& B = bottom_blobs[1];
        M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;
    }

    Mat C;
    int broadcast_type_C = 0;
    if (constantC)
    {
        C = CT_data;
        broadcast_type_C = constant_broadcast_type_C;
    }
    else
    {
        if (constantA && constantB)
        {
            C = bottom_blobs.size() == 1 ? bottom_blobs[0] : Mat();
        }
        else if (constantA)
        {
            C = bottom_blobs.size() == 2 ? bottom_blobs[1] : Mat();
        }
        else if (constantB)
        {
            C = bottom_blobs.size() == 2 ? bottom_blobs[1] : Mat();
        }
        else
        {
            C = bottom_blobs.size() == 3 ? bottom_blobs[2] : Mat();
        }

        if (!C.empty())
        {
            if (C.dims == 1 && C.w == 1)
            {
                // scalar
                broadcast_type_C = 0;
            }
            if (C.dims == 1 && C.w * C.elempack == M)
            {
                // M
                // auto broadcast from h to w is the ncnn-style convention
                broadcast_type_C = 1;
            }
            if (C.dims == 1 && C.w * C.elempack == N)
            {
                // N
                broadcast_type_C = 4;
            }
            if (C.dims == 2 && C.w == 1 && C.h * C.elempack == M)
            {
                // Mx1
                broadcast_type_C = 2;
            }
            if (C.dims == 2 && C.w == N && C.h * C.elempack == M)
            {
                // MxN
                broadcast_type_C = 3;
            }
            if (C.dims == 2 && C.w == N && C.h * C.elempack == 1)
            {
                // 1xN
                broadcast_type_C = 4;
            }
        }
    }

    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        int outh = output_transpose ? N : M;
        out_elempack = outh % 4 == 0 ? 4 : 1;
#if NCNN_ARM82
        if (cpu_support_arm_asimdhp() && opt.use_fp16_storage && opt.use_fp16_arithmetic)
        {
            // TODO use output_elemtype
            out_elempack = outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1;
        }
#endif
    }
#endif // __ARM_NEON

    // FIXME use output_elempack
    // int output_elempack = out_elempack > 4 ? 4 : out_elempack;

    if (output_elempack)
        out_elempack = output_elempack;
    size_t out_elemsize = 4u * out_elempack;

    // FIXME use output_elemtype instead of input_elemtype
    int output_elemtype = input_elemtype;

    // TODO use output_elemtype
    if (opt.use_bf16_storage)
    {
        out_elemsize = 2u * out_elempack;
    }
#if NCNN_VFPV4
    else if (support_fp16_storage && opt.use_fp16_storage)
    {
        out_elemsize = 2u * out_elempack;
    }
#endif

    Mat& top_blob = top_blobs[0];
    if (output_transpose)
    {
        if (output_N1M)
            top_blob.create(M, 1, N / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        else
            top_blob.create(M, N / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    else
    {
        if (output_N1M)
            top_blob.create(N, 1, M / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        else
            top_blob.create(N, M / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    int _nT = nT ? nT : opt.num_threads;
    if (nT != 0 && opt.num_threads != nT)
    {
        // force num_threads the same as in create_pipeline
        // so we could use pre-packed A/B from the same tile config
        NCNN_LOGE("opt.num_threads %d changed, gemm will use load-time value %d", opt.num_threads, nT);
    }

    int ret = 0;
    if (constantA && constantB)
    {
        ret = gemm_AT_BT_arm_int8(AT_data, A_data_int8_scales, BT_data, B_data_int8_scale, C, top_blob, broadcast_type_C, constantM, constantN, constantK, output_transpose, alpha, beta, input_elemtype, output_elemtype, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    else if (constantA)
    {
        const Mat& B = bottom_blobs[0];
        ret = gemm_AT_arm_int8(AT_data, A_data_int8_scales, B, C, top_blob, broadcast_type_C, constantM, constantK, transB, output_transpose, alpha, beta, input_elemtype, output_elemtype, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    else if (constantB)
    {
        const Mat& A = bottom_blobs[0];
        ret = gemm_BT_arm_int8(A, BT_data, B_data_int8_scale, C, top_blob, broadcast_type_C, constantN, constantK, transA, output_transpose, alpha, beta, input_elemtype, output_elemtype, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    else
    {
        const Mat& A = bottom_blobs[0];
        const Mat& B = bottom_blobs[1];
        ret = gemm_arm_int8(A, B, C, top_blob, broadcast_type_C, transA, transB, output_transpose, alpha, beta, input_elemtype, output_elemtype, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }

    return ret;
}
#endif

} // namespace ncnn


================================================
FILE: src/layer/arm/gemm_arm.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_GEMM_ARM_H
#define LAYER_GEMM_ARM_H

#include "gemm.h"

namespace ncnn {

// ARM-specific specialization of the Gemm layer.
// Declares the generic create_pipeline/forward entry points plus one
// create_pipeline_*/forward_* pair per data-type path; each pair only exists
// when the corresponding build-time macro is enabled.
class Gemm_arm : public Gemm
{
public:
    Gemm_arm();

    // Load-time setup entry point.
    // Returns an int status code.
    virtual int create_pipeline(const Option& opt);

    // Inference entry point: reads bottom_blobs, writes top_blobs.
    // Returns an int status code.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_VFPV4
    // "fp16s" path -- presumably fp16 storage; confirm against the implementation file.
    int create_pipeline_fp16s(const Option& opt);
    int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#if NCNN_ARM82
    // "fp16sa" path -- presumably fp16 storage + arithmetic; only declared when
    // both NCNN_VFPV4 and NCNN_ARM82 are enabled (nested #if).
    int create_pipeline_fp16sa(const Option& opt);
    int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
#endif
#if NCNN_BF16
    // bf16 path, only declared when NCNN_BF16 is enabled.
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
#if NCNN_INT8
    // int8 path, only declared when NCNN_INT8 is enabled.
    int create_pipeline_int8(const Option& opt);
    int forward_int8(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif

public:
    // Thread count remembered from load time. In forward_int8 a non-zero nT
    // takes precedence over opt.num_threads (a mismatch is only logged), so the
    // pre-packed A/B data keeps matching the tile configuration it was built for;
    // 0 means "use opt.num_threads".
    int nT;
    // Pre-packed constant A operand handed to the gemm kernels (see forward_int8).
    Mat AT_data;
    // Pre-packed constant B operand handed to the gemm kernels (see forward_int8).
    Mat BT_data;
    // NOTE(review): presumably the prepared constant C operand, by analogy with
    // AT_data/BT_data -- its use is not visible from this header; confirm.
    Mat CT_data;

    int input_elemtype; // 0=auto 1=fp32 2=fp16 3=bf16
};

} // namespace ncnn

#endif // LAYER_GEMM_ARM_H


================================================
FILE: src/layer/arm/gemm_arm_asimddp.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cpu.h"
#include "mat.h"
#include "arm_usability.h"

namespace ncnn {

#include "gemm_int8.h"
#include "gemm_int8_fp16s.h"

#if NCNN_BF16
#include "gemm_int8_bf16s.h"
#endif

// Externally visible "_asimddp" entry point for pack_A_tile_int8.
// Forwards every argument unchanged; the argument semantics are defined by the
// callee (presumably provided by gemm_int8.h, which this file includes inside
// namespace ncnn -- confirm).
void pack_A_tile_int8_asimddp(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
    pack_A_tile_int8(A, AT, i, max_ii, k, max_kk);
}

// Externally visible "_asimddp" entry point for transpose_pack_A_tile_int8.
// Pure pass-through: all six arguments are handed to the callee in the same order.
void transpose_pack_A_tile_int8_asimddp(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
    transpose_pack_A_tile_int8(A, AT, i, max_ii, k, max_kk);
}

// Externally visible "_asimddp" entry point for pack_B_tile_int8.
// Pure pass-through: B, BT and the j/max_jj/k/max_kk values are forwarded as-is.
void pack_B_tile_int8_asimddp(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk)
{
    pack_B_tile_int8(B, BT, j, max_jj, k, max_kk);
}

// Externally visible "_asimddp" entry point for transpose_pack_B_tile_int8.
// Pure pass-through: no argument is inspected or modified here.
void transpose_pack_B_tile_int8_asimddp(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk)
{
    transpose_pack_B_tile_int8(B, BT, j, max_jj, k, max_kk);
}

// Externally visible "_asimddp" entry point for pack_A_tile_fp32_to_int8.
// Forwards A, AT, the i/max_ii/k/max_kk values and the `scales` Mat unchanged;
// how `scales` is applied is decided entirely by the callee.
void pack_A_tile_fp32_to_int8_asimddp(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
    pack_A_tile_fp32_to_int8(A, AT, i, max_ii, k, max_kk, scales);
}

// Externally visible "_asimddp" entry point for transpose_pack_A_tile_fp32_to_int8.
// Pure pass-through, including the `scales` Mat.
void transpose_pack_A_tile_fp32_to_int8_asimddp(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
    transpose_pack_A_tile_fp32_to_int8(A, AT, i, max_ii, k, max_kk, scales);
}

// Externally visible "_asimddp" entry point for pack_B_tile_fp32_to_int8.
// Unlike the A-side wrappers, the B side takes a single float `scale`
// rather than a Mat of scales; it is forwarded unchanged.
void pack_B_tile_fp32_to_int8_asimddp(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
    pack_B_tile_fp32_to_int8(B, BT, j, max_jj, k, max_kk, scale);
}

// Externally visible "_asimddp" entry point for transpose_pack_B_tile_fp32_to_int8.
// Pure pass-through, including the single float `scale`.
void transpose_pack_B_tile_fp32_to_int8_asimddp(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
    transpose_pack_B_tile_fp32_to_int8(B, BT, j, max_jj, k, max_kk, scale);
}

// Externally visible "_asimddp" entry point for unpack_output_tile_int32_to_fp32.
// Forwards topT, C, top_blob, broadcast_type_C, the i/max_ii/j/max_jj values,
// descales, alpha and beta unchanged; top_blob is the only non-const Mat and is
// therefore the one the callee can write to.
void unpack_output_tile_int32_to_fp32_asimddp(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta)
{
    unpack_output_tile_int32_to_fp32(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
}

// Externally visible "_asimddp" entry point for transpose_unpack_output_tile_int32_to_fp32.
// Pure pass-through of all eleven arguments in their original order.
void transpose_unpack_output_tile_int32_to_fp32_asimddp(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta)
{
    transpose_unpack_output_tile_int32_to_fp32(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
}

// Externally visible "_asimddp" entry point for gemm_transB_packed_tile_int8.
// Forwards the AT/BT input tiles, the writable topT_tile and the
// i/max_ii/j/max_jj/k/max_kk values unchanged.
// NOTE(review): presumably this translation unit is built with dot-product
// instructions enabled so the included kernel uses them -- confirm in the build files.
void gemm_transB_packed_tile_int8_asimddp(const Mat& AT_tile, const Mat& BT_tile, Mat& topT_tile, int i, int max_ii, int j, int max_jj, int k, int max_kk)
{
    gemm_transB_packed_tile_int8(AT_tile, BT_tile, topT_tile, i, max_ii, j, max_jj, k, max_kk);
}

// Externally visible "_asimddp" entry point for pack_A_tile_fp16_to_int8
// (presumably provided by gemm_int8_fp16s.h, included above -- confirm).
// Pure pass-through, including the `scales` Mat.
void pack_A_tile_fp16_to_int8_asimddp(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
    pack_A_tile_fp16_to_int8(A, AT, i, max_ii, k, max_kk, scales);
}

// Externally visible "_asimddp" entry point for transpose_pack_A_tile_fp16_to_int8.
// Pure pass-through, including the `scales` Mat.
void transpose_pack_A_tile_fp16_to_int8_asimddp(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
    transpose_pack_A_tile_fp16_to_int8(A, AT, i, max_ii, k, max_kk, scales);
}

// Externally visible "_asimddp" entry point for pack_B_tile_fp16_to_int8.
// Pure pass-through, including the single float `scale`.
void pack_B_tile_fp16_to_int8_asimddp(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
    pack_B_tile_fp16_to_int8(B, BT, j, max_jj, k, max_kk, scale);
}

// Externally visible "_asimddp" entry point for transpose_pack_B_tile_fp16_to_int8.
// Pure pass-through, including the single float `scale`.
void transpose_pack_B_tile_fp16_to_int8_asimddp(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
    transpose_pack_B_tile_fp16_to_int8(B, BT, j, max_jj, k, max_kk, scale);
}

// Externally visible "_asimddp" entry point for unpack_output_tile_int32_to_fp16.
// Pure pass-through of all eleven arguments; top_blob is the only non-const Mat.
void unpack_output_tile_int32_to_fp16_asimddp(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta)
{
    unpack_output_tile_int32_to_fp16(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
}

// Externally visible "_asimddp" entry point for transpose_unpack_output_tile_int32_to_fp16.
// Pure pass-through of all eleven arguments; top_blob is the only non-const Mat.
void transpose_unpack_output_tile_int32_to_fp16_asimddp(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta)
{
    transpose_unpack_output_tile_int32_to_fp16(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
}

#if NCNN_BF16
// Externally visible "_asimddp" entry point for pack_A_tile_bf16_to_int8.
// Only compiled when NCNN_BF16 is enabled, matching the guarded include of
// gemm_int8_bf16s.h earlier in this file. Pure pass-through, including `scales`.
void pack_A_tile_bf16_to_int8_asimddp(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
    pack_A_tile_bf16_to_int8(A, AT, i, max_ii, k, max_kk, scales);
}

// Externally visible "_asimddp" entry point for transpose_pack_A_tile_bf16_to_int8
// (NCNN_BF16 builds only). Pure pass-through, including the `scales` Mat.
void transpose_pack_A_tile_bf16_to_int8_asimddp(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
    transpose_pack_A_tile_bf16_to_int8(A, AT, i, max_ii, k, max_kk, scales);
}

// Externally visible "_asimddp" entry point for pack_B_tile_bf16_to_int8
// (NCNN_BF16 builds only). Pure pass-through, including the single float `scale`.
void pack_B_tile_bf16_to_int8_asimddp(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
    pack_B_tile_bf16_to_int8(B, BT, j, max_jj, k, max_kk, scale);
}

// Externally visible "_asimddp" entry point for transpose_pack_B_tile_bf16_to_int8
// (NCNN_BF16 builds only). Pure pass-through, including the single float `scale`.
void transpose_pack_B_tile_bf16_to_int8_asimddp(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
    transpose_pack_B_tile_bf16_to_int8(B, BT, j, max_jj, k, max_kk, scale);
}

// Externally visible "_asimddp" entry point for unpack_output_tile_int32_to_bf16
// (NCNN_BF16 builds only). Pure pass-through of all eleven arguments;
// top_blob is the only non-const Mat.
void unpack_output_tile_int32_to_bf16_asimddp(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta)
{
    unpack_output_tile_int32_to_bf16(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
}

// Externally visible "_asimddp" entry point for transpose_unpack_output_tile_int32_to_bf16
// (NCNN_BF16 builds only). Pure pass-through of all eleven arguments;
// top_blob is the only non-const Mat.
void transpose_unpack_output_tile_int32_to_bf16_asimddp(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta)
{
    transpose_unpack_output_tile_int32_to_bf16(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/gemm_arm_asimdfhm.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "gemm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"

#include "cpu.h"

namespace ncnn {

#include "gemm_fp16s.h"

// Externally visible "_asimdfhm" entry point for gemm_transB_packed_tile_fp16s
// (presumably provided by gemm_fp16s.h, which this file includes inside
// namespace ncnn -- confirm).
// Forwards the AT/BT/CT input tiles, the writable topT_tile and top_blob,
// broadcast_type_C, alpha, the i/max_ii/j/max_jj/k/max_kk values and the k_end
// flag unchanged; nothing is interpreted here.
// NOTE(review): presumably this translation unit is built with the FHM
// (fp16 multiply-accumulate-long) extension enabled -- confirm in the build files.
void gemm_transB_packed_tile_fp16s_asimdfhm(const Mat& AT_tile, const Mat& BT_tile, const Mat& CT_tile, Mat& topT_tile, Mat& top_blob, int broadcast_type_C, float alpha, int i, int max_ii, int j, int max_jj, int k, int max_kk, bool k_end)
{
    gemm_transB_packed_tile_fp16s(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, alpha, i, max_ii, j, max_jj, k, max_kk, k_end);
}

} // namespace ncnn


================================================
FILE: src/layer/arm/gemm_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "gemm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"

#include "cpu.h"

namespace ncnn {

#include "gemm_bf16s_fp16s.h"
#include "gemm_fp16s.h"

#if NCNN_INT8
#include "gemm_int8_fp16s.h"
#endif

static void gemm_transB_packed_tile_fp16sa(const Mat& AT_tile, const Mat& BT_tile, const Mat& CT_tile, Mat& topT_tile, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, int k, int max_kk, bool k_end)
{
    const int out_elempack = top_blob.elempack;
    const size_t out_hstep = top_blob.dims == 3 ? top_blob.cstep : (size_t)top_blob.w;

    const __fp16* pAT = AT_tile;
    const __fp16* pBT = BT_tile;
    const __fp16* pC = CT_tile;

    __fp16* outptr = topT_tile;

    int ii = 0;
    for (; ii + 7 < max_ii; ii += 8)
    {
        __fp16* outptr0 = (__fp16*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        const __fp16* pB = pBT;

        if (pC)
        {
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const __fp16*)CT_tile + i + ii;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const __fp16*)CT_tile + j;
            }
        }

        int jj = 0;
        for (; jj + 11 < max_jj; jj += 12)
        {
            float16x8_t _sum0;
            float16x8_t _sum1;
            float16x8_t _sum2;
            float16x8_t _sum3;
            float16x8_t _sum4;
            float16x8_t _sum5;
            float16x8_t _sum6;
            float16x8_t _sum7;
            float16x8_t _sum8;
            float16x8_t _sum9;
            float16x8_t _suma;
            float16x8_t _sumb;

            if (k == 0)
            {
                _sum0 = vdupq_n_f16(0.f);
                _sum1 = vdupq_n_f16(0.f);
                _sum2 = vdupq_n_f16(0.f);
                _sum3 = vdupq_n_f16(0.f);
                _sum4 = vdupq_n_f16(0.f);
                _sum5 = vdupq_n_f16(0.f);
                _sum6 = vdupq_n_f16(0.f);
                _sum7 = vdupq_n_f16(0.f);
                _sum8 = vdupq_n_f16(0.f);
                _sum9 = vdupq_n_f16(0.f);
                _suma = vdupq_n_f16(0.f);
                _sumb = vdupq_n_f16(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdupq_n_f16(pC[0]);
                        _sum1 = vdupq_n_f16(pC[0]);
                        _sum2 = vdupq_n_f16(pC[0]);
                        _sum3 = vdupq_n_f16(pC[0]);
                        _sum4 = vdupq_n_f16(pC[0]);
                        _sum5 = vdupq_n_f16(pC[0]);
                        _sum6 = vdupq_n_f16(pC[0]);
                        _sum7 = vdupq_n_f16(pC[0]);
                        _sum8 = vdupq_n_f16(pC[0]);
                        _sum9 = vdupq_n_f16(pC[0]);
                        _suma = vdupq_n_f16(pC[0]);
                        _sumb = vdupq_n_f16(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vld1q_f16(pC);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                        _sum4 = _sum0;
                        _sum5 = _sum0;
                        _sum6 = _sum0;
                        _sum7 = _sum0;
                        _sum8 = _sum0;
                        _sum9 = _sum0;
                        _suma = _sum0;
                        _sumb = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum0 = vld1q_f16(pC);
                        _sum1 = vld1q_f16(pC + 8);
                        _sum2 = vld1q_f16(pC + 8 * 2);
                        _sum3 = vld1q_f16(pC + 8 * 3);
                        _sum4 = vld1q_f16(pC + 8 * 4);
                        _sum5 = vld1q_f16(pC + 8 * 5);
                        _sum6 = vld1q_f16(pC + 8 * 6);
                        _sum7 = vld1q_f16(pC + 8 * 7);
                        _sum8 = vld1q_f16(pC + 8 * 8);
                        _sum9 = vld1q_f16(pC + 8 * 9);
                        _suma = vld1q_f16(pC + 8 * 10);
                        _sumb = vld1q_f16(pC + 8 * 11);
                        pC += 96;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vdupq_n_f16(pC[0]);
                        _sum1 = vdupq_n_f16(pC[1]);
                        _sum2 = vdupq_n_f16(pC[2]);
                        _sum3 = vdupq_n_f16(pC[3]);
                        _sum4 = vdupq_n_f16(pC[4]);
                        _sum5 = vdupq_n_f16(pC[5]);
                        _sum6 = vdupq_n_f16(pC[6]);
                        _sum7 = vdupq_n_f16(pC[7]);
                        _sum8 = vdupq_n_f16(pC[8]);
                        _sum9 = vdupq_n_f16(pC[9]);
                        _suma = vdupq_n_f16(pC[10]);
                        _sumb = vdupq_n_f16(pC[11]);
                        pC += 12;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f16(outptr);
                _sum1 = vld1q_f16(outptr + 8 * 1);
                _sum2 = vld1q_f16(outptr + 8 * 2);
                _sum3 = vld1q_f16(outptr + 8 * 3);
                _sum4 = vld1q_f16(outptr + 8 * 4);
                _sum5 = vld1q_f16(outptr + 8 * 5);
                _sum6 = vld1q_f16(outptr + 8 * 6);
                _sum7 = vld1q_f16(outptr + 8 * 7);
                _sum8 = vld1q_f16(outptr + 8 * 8);
                _sum9 = vld1q_f16(outptr + 8 * 9);
                _suma = vld1q_f16(outptr + 8 * 10);
                _sumb = vld1q_f16(outptr + 8 * 11);
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "ld1    {v3.8h}, [%0], #16      \n"
                    "ld1    {v0.4h, v1.4h, v2.4h}, [%1], #24 \n"
                    "fmla   %2.8h, v3.8h, v0.h[0]   \n"
                    "fmla   %3.8h, v3.8h, v0.h[1]   \n"
                    "fmla   %4.8h, v3.8h, v0.h[2]   \n"
                    "fmla   %5.8h, v3.8h, v0.h[3]   \n"
                    "fmla   %6.8h, v3.8h, v1.h[0]   \n"
                    "fmla   %7.8h, v3.8h, v1.h[1]   \n"
                    "fmla   %8.8h, v3.8h, v1.h[2]   \n"
                    "fmla   %9.8h, v3.8h, v1.h[3]   \n"
                    "fmla   %10.8h, v3.8h, v2.h[0]  \n"
                    "fmla   %11.8h, v3.8h, v2.h[1]  \n"
                    "fmla   %12.8h, v3.8h, v2.h[2]  \n"
                    "fmla   %13.8h, v3.8h, v2.h[3]  \n"
                    : "=r"(pA),
                    "=r"(pB),
                    "=w"(_sum0),
                    "=w"(_sum1),
                    "=w"(_sum2),
                    "=w"(_sum3),
                    "=w"(_sum4),
                    "=w"(_sum5),
                    "=w"(_sum6),
                    "=w"(_sum7),
                    "=w"(_sum8),
                    "=w"(_sum9),
                    "=w"(_suma),
                    "=w"(_sumb)
                    : "0"(pA),
                    "1"(pB),
                    "2"(_sum0),
                    "3"(_sum1),
                    "4"(_sum2),
                    "5"(_sum3),
                    "6"(_sum4),
                    "7"(_sum5),
                    "8"(_sum6),
                    "9"(_sum7),
                    "10"(_sum8),
                    "11"(_sum9),
                    "12"(_suma),
                    "13"(_sumb)
                    : "memory", "v0", "v1", "v2", "v3");
#else
                float16x8_t _pA = vld1q_f16(pA);

                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);
                float16x4_t _pB2 = vld1_f16(pB + 8);

                _sum0 = vfmaq_lane_f16(_sum0, _pA, _pB0, 0);
                _sum1 = vfmaq_lane_f16(_sum1, _pA, _pB0, 1);
                _sum2 = vfmaq_lane_f16(_sum2, _pA, _pB0, 2);
                _sum3 = vfmaq_lane_f16(_sum3, _pA, _pB0, 3);
                _sum4 = vfmaq_lane_f16(_sum4, _pA, _pB1, 0);
                _sum5 = vfmaq_lane_f16(_sum5, _pA, _pB1, 1);
                _sum6 = vfmaq_lane_f16(_sum6, _pA, _pB1, 2);
                _sum7 = vfmaq_lane_f16(_sum7, _pA, _pB1, 3);
                _sum8 = vfmaq_lane_f16(_sum8, _pA, _pB2, 0);
                _sum9 = vfmaq_lane_f16(_sum9, _pA, _pB2, 1);
                _suma = vfmaq_lane_f16(_suma, _pA, _pB2, 2);
                _sumb = vfmaq_lane_f16(_sumb, _pA, _pB2, 3);

                pA += 8;
                pB += 12;
#endif
            }

            if (k_end)
            {
                if (out_elempack == 8)
                {
                    vst1q_f16(outptr0, _sum0);
                    vst1q_f16(outptr0 + 8 * 1, _sum1);
                    vst1q_f16(outptr0 + 8 * 2, _sum2);
                    vst1q_f16(outptr0 + 8 * 3, _sum3);
                    vst1q_f16(outptr0 + 8 * 4, _sum4);
                    vst1q_f16(outptr0 + 8 * 5, _sum5);
                    vst1q_f16(outptr0 + 8 * 6, _sum6);
                    vst1q_f16(outptr0 + 8 * 7, _sum7);
                    vst1q_f16(outptr0 + 8 * 8, _sum8);
                    vst1q_f16(outptr0 + 8 * 9, _sum9);
                    vst1q_f16(outptr0 + 8 * 10, _suma);
                    vst1q_f16(outptr0 + 8 * 11, _sumb);
                    outptr0 += 96;
                }
                if (out_elempack == 4)
                {
                    vst1_f16(outptr0, vget_low_f16(_sum0));
                    vst1_f16(outptr0 + 4, vget_low_f16(_sum1));
                    vst1_f16(outptr0 + 4 * 2, vget_low_f16(_sum2));
                    vst1_f16(outptr0 + 4 * 3, vget_low_f16(_sum3));
                    vst1_f16(outptr0 + 4 * 4, vget_low_f16(_sum4));
                    vst1_f16(outptr0 + 4 * 5, vget_low_f16(_sum5));
                    vst1_f16(outptr0 + 4 * 6, vget_low_f16(_sum6));
                    vst1_f16(outptr0 + 4 * 7, vget_low_f16(_sum7));
                    vst1_f16(outptr0 + 4 * 8, vget_low_f16(_sum8));
                    vst1_f16(outptr0 + 4 * 9, vget_low_f16(_sum9));
                    vst1_f16(outptr0 + 4 * 10, vget_low_f16(_suma));
                    vst1_f16(outptr0 + 4 * 11, vget_low_f16(_sumb));

                    vst1_f16(outptr0 + out_hstep * 4, vget_high_f16(_sum0));
                    vst1_f16(outptr0 + out_hstep * 4 + 4, vget_high_f16(_sum1));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 2, vget_high_f16(_sum2));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 3, vget_high_f16(_sum3));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 4, vget_high_f16(_sum4));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 5, vget_high_f16(_sum5));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 6, vget_high_f16(_sum6));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 7, vget_high_f16(_sum7));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 8, vget_high_f16(_sum8));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 9, vget_high_f16(_sum9));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 10, vget_high_f16(_suma));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 11, vget_high_f16(_sumb));

                    outptr0 += 48;
                }
                if (out_elempack == 1)
                {
                    transpose8x8_ph(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7);

                    vst1q_f16(outptr0, _sum0);
                    vst1q_f16(outptr0 + out_hstep * 1, _sum1);
                    vst1q_f16(outptr0 + out_hstep * 2, _sum2);
                    vst1q_f16(outptr0 + out_hstep * 3, _sum3);
                    vst1q_f16(outptr0 + out_hstep * 4, _sum4);
                    vst1q_f16(outptr0 + out_hstep * 5, _sum5);
                    vst1q_f16(outptr0 + out_hstep * 6, _sum6);
                    vst1q_f16(outptr0 + out_hstep * 7, _sum7);

                    transpose8x4_ph(_sum8, _sum9, _suma, _sumb);

                    vst1_f16(outptr0 + 8, vget_low_f16(_sum8));
                    vst1_f16(outptr0 + out_hstep * 1 + 8, vget_high_f16(_sum8));
                    vst1_f16(outptr0 + out_hstep * 2 + 8, vget_low_f16(_sum9));
                    vst1_f16(outptr0 + out_hstep * 3 + 8, vget_high_f16(_sum9));
                    vst1_f16(outptr0 + out_hstep * 4 + 8, vget_low_f16(_suma));
                    vst1_f16(outptr0 + out_hstep * 5 + 8, vget_high_f16(_suma));
                    vst1_f16(outptr0 + out_hstep * 6 + 8, vget_low_f16(_sumb));
                    vst1_f16(outptr0 + out_hstep * 7 + 8, vget_high_f16(_sumb));

                    outptr0 += 12;
                }
            }
            else
            {
                vst1q_f16(outptr, _sum0);
                vst1q_f16(outptr + 8 * 1, _sum1);
                vst1q_f16(outptr + 8 * 2, _sum2);
                vst1q_f16(outptr + 8 * 3, _sum3);
                vst1q_f16(outptr + 8 * 4, _sum4);
                vst1q_f16(outptr + 8 * 5, _sum5);
                vst1q_f16(outptr + 8 * 6, _sum6);
                vst1q_f16(outptr + 8 * 7, _sum7);
                vst1q_f16(outptr + 8 * 8, _sum8);
                vst1q_f16(outptr + 8 * 9, _sum9);
                vst1q_f16(outptr + 8 * 10, _suma);
                vst1q_f16(outptr + 8 * 11, _sumb);
            }

            outptr += 96;
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            float16x8_t _sum0;
            float16x8_t _sum1;
            float16x8_t _sum2;
            float16x8_t _sum3;
            float16x8_t _sum4;
            float16x8_t _sum5;
            float16x8_t _sum6;
            float16x8_t _sum7;

            if (k == 0)
            {
                _sum0 = vdupq_n_f16(0.f);
                _sum1 = vdupq_n_f16(0.f);
                _sum2 = vdupq_n_f16(0.f);
                _sum3 = vdupq_n_f16(0.f);
                _sum4 = vdupq_n_f16(0.f);
                _sum5 = vdupq_n_f16(0.f);
                _sum6 = vdupq_n_f16(0.f);
                _sum7 = vdupq_n_f16(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdupq_n_f16(pC[0]);
                        _sum1 = vdupq_n_f16(pC[0]);
                        _sum2 = vdupq_n_f16(pC[0]);
                        _sum3 = vdupq_n_f16(pC[0]);
                        _sum4 = vdupq_n_f16(pC[0]);
                        _sum5 = vdupq_n_f16(pC[0]);
                        _sum6 = vdupq_n_f16(pC[0]);
                        _sum7 = vdupq_n_f16(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vld1q_f16(pC);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                        _sum4 = _sum0;
                        _sum5 = _sum0;
                        _sum6 = _sum0;
                        _sum7 = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum0 = vld1q_f16(pC);
                        _sum1 = vld1q_f16(pC + 8);
                        _sum2 = vld1q_f16(pC + 8 * 2);
                        _sum3 = vld1q_f16(pC + 8 * 3);
                        _sum4 = vld1q_f16(pC + 8 * 4);
                        _sum5 = vld1q_f16(pC + 8 * 5);
                        _sum6 = vld1q_f16(pC + 8 * 6);
                        _sum7 = vld1q_f16(pC + 8 * 7);
                        pC += 64;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vdupq_n_f16(pC[0]);
                        _sum1 = vdupq_n_f16(pC[1]);
                        _sum2 = vdupq_n_f16(pC[2]);
                        _sum3 = vdupq_n_f16(pC[3]);
                        _sum4 = vdupq_n_f16(pC[4]);
                        _sum5 = vdupq_n_f16(pC[5]);
                        _sum6 = vdupq_n_f16(pC[6]);
                        _sum7 = vdupq_n_f16(pC[7]);
                        pC += 8;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f16(outptr);
                _sum1 = vld1q_f16(outptr + 8 * 1);
                _sum2 = vld1q_f16(outptr + 8 * 2);
                _sum3 = vld1q_f16(outptr + 8 * 3);
                _sum4 = vld1q_f16(outptr + 8 * 4);
                _sum5 = vld1q_f16(outptr + 8 * 5);
                _sum6 = vld1q_f16(outptr + 8 * 6);
                _sum7 = vld1q_f16(outptr + 8 * 7);
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x8_t _pA = vld1q_f16(pA);

                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);

                _sum0 = vfmaq_lane_f16(_sum0, _pA, _pB0, 0);
                _sum1 = vfmaq_lane_f16(_sum1, _pA, _pB0, 1);
                _sum2 = vfmaq_lane_f16(_sum2, _pA, _pB0, 2);
                _sum3 = vfmaq_lane_f16(_sum3, _pA, _pB0, 3);
                _sum4 = vfmaq_lane_f16(_sum4, _pA, _pB1, 0);
                _sum5 = vfmaq_lane_f16(_sum5, _pA, _pB1, 1);
                _sum6 = vfmaq_lane_f16(_sum6, _pA, _pB1, 2);
                _sum7 = vfmaq_lane_f16(_sum7, _pA, _pB1, 3);

                pA += 8;
                pB += 8;
            }

            if (k_end)
            {
                if (out_elempack == 8)
                {
                    vst1q_f16(outptr0, _sum0);
                    vst1q_f16(outptr0 + 8 * 1, _sum1);
                    vst1q_f16(outptr0 + 8 * 2, _sum2);
                    vst1q_f16(outptr0 + 8 * 3, _sum3);
                    vst1q_f16(outptr0 + 8 * 4, _sum4);
                    vst1q_f16(outptr0 + 8 * 5, _sum5);
                    vst1q_f16(outptr0 + 8 * 6, _sum6);
                    vst1q_f16(outptr0 + 8 * 7, _sum7);
                    outptr0 += 64;
                }
                if (out_elempack == 4)
                {
                    vst1_f16(outptr0, vget_low_f16(_sum0));
                    vst1_f16(outptr0 + 4, vget_low_f16(_sum1));
                    vst1_f16(outptr0 + 4 * 2, vget_low_f16(_sum2));
                    vst1_f16(outptr0 + 4 * 3, vget_low_f16(_sum3));
                    vst1_f16(outptr0 + 4 * 4, vget_low_f16(_sum4));
                    vst1_f16(outptr0 + 4 * 5, vget_low_f16(_sum5));
                    vst1_f16(outptr0 + 4 * 6, vget_low_f16(_sum6));
                    vst1_f16(outptr0 + 4 * 7, vget_low_f16(_sum7));

                    vst1_f16(outptr0 + out_hstep * 4, vget_high_f16(_sum0));
                    vst1_f16(outptr0 + out_hstep * 4 + 4, vget_high_f16(_sum1));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 2, vget_high_f16(_sum2));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 3, vget_high_f16(_sum3));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 4, vget_high_f16(_sum4));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 5, vget_high_f16(_sum5));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 6, vget_high_f16(_sum6));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 7, vget_high_f16(_sum7));

                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    transpose8x8_ph(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7);

                    vst1q_f16(outptr0, _sum0);
                    vst1q_f16(outptr0 + out_hstep * 1, _sum1);
                    vst1q_f16(outptr0 + out_hstep * 2, _sum2);
                    vst1q_f16(outptr0 + out_hstep * 3, _sum3);
                    vst1q_f16(outptr0 + out_hstep * 4, _sum4);
                    vst1q_f16(outptr0 + out_hstep * 5, _sum5);
                    vst1q_f16(outptr0 + out_hstep * 6, _sum6);
                    vst1q_f16(outptr0 + out_hstep * 7, _sum7);

                    outptr0 += 8;
                }
            }
            else
            {
                vst1q_f16(outptr, _sum0);
                vst1q_f16(outptr + 8 * 1, _sum1);
                vst1q_f16(outptr + 8 * 2, _sum2);
                vst1q_f16(outptr + 8 * 3, _sum3);
                vst1q_f16(outptr + 8 * 4, _sum4);
                vst1q_f16(outptr + 8 * 5, _sum5);
                vst1q_f16(outptr + 8 * 6, _sum6);
                vst1q_f16(outptr + 8 * 7, _sum7);
            }

            outptr += 64;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float16x8_t _sum0;
            float16x8_t _sum1;
            float16x8_t _sum2;
            float16x8_t _sum3;

            if (k == 0)
            {
                _sum0 = vdupq_n_f16(0.f);
                _sum1 = vdupq_n_f16(0.f);
                _sum2 = vdupq_n_f16(0.f);
                _sum3 = vdupq_n_f16(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdupq_n_f16(pC[0]);
                        _sum1 = vdupq_n_f16(pC[0]);
                        _sum2 = vdupq_n_f16(pC[0]);
                        _sum3 = vdupq_n_f16(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vld1q_f16(pC);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum0 = vld1q_f16(pC);
                        _sum1 = vld1q_f16(pC + 8);
                        _sum2 = vld1q_f16(pC + 8 * 2);
                        _sum3 = vld1q_f16(pC + 8 * 3);
                        pC += 32;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vdupq_n_f16(pC[0]);
                        _sum1 = vdupq_n_f16(pC[1]);
                        _sum2 = vdupq_n_f16(pC[2]);
                        _sum3 = vdupq_n_f16(pC[3]);
                        pC += 4;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f16(outptr);
                _sum1 = vld1q_f16(outptr + 8 * 1);
                _sum2 = vld1q_f16(outptr + 8 * 2);
                _sum3 = vld1q_f16(outptr + 8 * 3);
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x8_t _pA = vld1q_f16(pA);

                float16x4_t _pB0 = vld1_f16(pB);

                _sum0 = vfmaq_lane_f16(_sum0, _pA, _pB0, 0);
                _sum1 = vfmaq_lane_f16(_sum1, _pA, _pB0, 1);
                _sum2 = vfmaq_lane_f16(_sum2, _pA, _pB0, 2);
                _sum3 = vfmaq_lane_f16(_sum3, _pA, _pB0, 3);

                pA += 8;
                pB += 4;
            }

            if (k_end)
            {
                if (out_elempack == 8)
                {
                    vst1q_f16(outptr0, _sum0);
                    vst1q_f16(outptr0 + 8 * 1, _sum1);
                    vst1q_f16(outptr0 + 8 * 2, _sum2);
                    vst1q_f16(outptr0 + 8 * 3, _sum3);
                    outptr0 += 32;
                }
                if (out_elempack == 4)
                {
                    vst1_f16(outptr0, vget_low_f16(_sum0));
                    vst1_f16(outptr0 + 4, vget_low_f16(_sum1));
                    vst1_f16(outptr0 + 4 * 2, vget_low_f16(_sum2));
                    vst1_f16(outptr0 + 4 * 3, vget_low_f16(_sum3));

                    vst1_f16(outptr0 + out_hstep * 4, vget_high_f16(_sum0));
                    vst1_f16(outptr0 + out_hstep * 4 + 4, vget_high_f16(_sum1));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 2, vget_high_f16(_sum2));
                    vst1_f16(outptr0 + out_hstep * 4 + 4 * 3, vget_high_f16(_sum3));

                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    transpose8x4_ph(_sum0, _sum1, _sum2, _sum3);

                    vst1_f16(outptr0, vget_low_f16(_sum0));
                    vst1_f16(outptr0 + out_hstep * 1, vget_high_f16(_sum0));
                    vst1_f16(outptr0 + out_hstep * 2, vget_low_f16(_sum1));
                    vst1_f16(outptr0 + out_hstep * 3, vget_high_f16(_sum1));
                    vst1_f16(outptr0 + out_hstep * 4, vget_low_f16(_sum2));
                    vst1_f16(outptr0 + out_hstep * 5, vget_high_f16(_sum2));
                    vst1_f16(outptr0 + out_hstep * 6, vget_low_f16(_sum3));
                    vst1_f16(outptr0 + out_hstep * 7, vget_high_f16(_sum3));

                    outptr0 += 4;
                }
            }
            else
            {
                vst1q_f16(outptr, _sum0);
                vst1q_f16(outptr + 8 * 1, _sum1);
                vst1q_f16(outptr + 8 * 2, _sum2);
                vst1q_f16(outptr + 8 * 3, _sum3);
            }

            outptr += 32;
        }
        // Remainder loop: handles two jj positions per iteration, each with its own
        // 8-lane fp16 accumulator (_sum0 for jj, _sum1 for jj + 1).
        for (; jj + 1 < max_jj; jj += 2)
        {
            float16x8_t _sum0;
            float16x8_t _sum1;

            if (k == 0)
            {
                // First k step: start from zero, then optionally seed from pC.
                _sum0 = vdupq_n_f16(0.f);
                _sum1 = vdupq_n_f16(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // one scalar replicated into every lane of both accumulators
                        _sum0 = vdupq_n_f16(pC[0]);
                        _sum1 = vdupq_n_f16(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // the same 8 values seed both accumulators; pC is not advanced
                        _sum0 = vld1q_f16(pC);
                        _sum1 = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        // 8 distinct values per accumulator, consumed from pC
                        _sum0 = vld1q_f16(pC);
                        _sum1 = vld1q_f16(pC + 8);
                        pC += 16;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar per accumulator, replicated across its lanes
                        _sum0 = vdupq_n_f16(pC[0]);
                        _sum1 = vdupq_n_f16(pC[1]);
                        pC += 2;
                    }
                }
            }
            else
            {
                // Later k steps: resume from the partial sums saved at outptr.
                _sum0 = vld1q_f16(outptr);
                _sum1 = vld1q_f16(outptr + 8);
            }

            // pA restarts from pAT on every jj iteration; pB is not reset here and keeps advancing.
            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x8_t _pA = vld1q_f16(pA);

                // each of the two B scalars is replicated across all 8 lanes
                float16x8_t _pB0 = vdupq_n_f16(pB[0]);
                float16x8_t _pB1 = vdupq_n_f16(pB[1]);

                _sum0 = vfmaq_f16(_sum0, _pA, _pB0);
                _sum1 = vfmaq_f16(_sum1, _pA, _pB1);

                pA += 8;
                pB += 2;
            }

            if (k_end)
            {
                // Last k step: write the result to outptr0 in the layout chosen by out_elempack.
                if (out_elempack == 8)
                {
                    // full 8-lane vectors stored back to back
                    vst1q_f16(outptr0, _sum0);
                    vst1q_f16(outptr0 + 8, _sum1);
                    outptr0 += 16;
                }
                if (out_elempack == 4)
                {
                    // low halves (lanes 0-3) at outptr0, high halves (lanes 4-7) out_hstep * 4 further on
                    vst1_f16(outptr0, vget_low_f16(_sum0));
                    vst1_f16(outptr0 + 4, vget_low_f16(_sum1));

                    vst1_f16(outptr0 + out_hstep * 4, vget_high_f16(_sum0));
                    vst1_f16(outptr0 + out_hstep * 4 + 4, vget_high_f16(_sum1));
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    // Spill to stack arrays and scatter: lane n goes to offset out_hstep * n,
                    // with _sum1 placed one element after _sum0.
                    __fp16 sum0[8];
                    __fp16 sum1[8];
                    vst1q_f16(sum0, _sum0);
                    vst1q_f16(sum1, _sum1);

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[out_hstep * 4] = sum0[4];
                    outptr0[out_hstep * 5] = sum0[5];
                    outptr0[out_hstep * 6] = sum0[6];
                    outptr0[out_hstep * 7] = sum0[7];

                    outptr0[1] = sum1[0];
                    outptr0[out_hstep + 1] = sum1[1];
                    outptr0[out_hstep * 2 + 1] = sum1[2];
                    outptr0[out_hstep * 3 + 1] = sum1[3];
                    outptr0[out_hstep * 4 + 1] = sum1[4];
                    outptr0[out_hstep * 5 + 1] = sum1[5];
                    outptr0[out_hstep * 6 + 1] = sum1[6];
                    outptr0[out_hstep * 7 + 1] = sum1[7];
                    outptr0 += 2;
                }
            }
            else
            {
                // Not the last k step: save partial sums to outptr for the next pass.
                vst1q_f16(outptr, _sum0);
                vst1q_f16(outptr + 8, _sum1);
            }

            // outptr advances by 2 x 8 elements whether or not it was written.
            outptr += 16;
        }
        // Final remainder loop: one jj position per iteration, accumulated in a
        // single 8-lane fp16 vector.
        for (; jj < max_jj; jj += 1)
        {
            float16x8_t _sum0;

            if (k == 0)
            {
                // First k step: start from zero, then optionally seed from pC.
                _sum0 = vdupq_n_f16(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // one scalar replicated into every lane
                        _sum0 = vdupq_n_f16(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // 8 values read from pC; pC is not advanced
                        _sum0 = vld1q_f16(pC);
                    }
                    if (broadcast_type_C == 3)
                    {
                        // 8 values consumed from pC
                        _sum0 = vld1q_f16(pC);
                        pC += 8;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar consumed from pC and replicated across the lanes
                        _sum0 = vdupq_n_f16(pC[0]);
                        pC += 1;
                    }
                }
            }
            else
            {
                // Later k steps: resume from the partial sum saved at outptr.
                _sum0 = vld1q_f16(outptr);
            }

            // pA restarts from pAT on every jj iteration; pB is not reset here and keeps advancing.
            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x8_t _pA = vld1q_f16(pA);

                float16x8_t _pB = vdupq_n_f16(pB[0]);

                _sum0 = vfmaq_f16(_sum0, _pA, _pB);

                pA += 8;
                pB += 1;
            }

            if (k_end)
            {
                // Last k step: write the result to outptr0 in the layout chosen by out_elempack.
                if (out_elempack == 8)
                {
                    vst1q_f16(outptr0, _sum0);
                    outptr0 += 8;
                }
                if (out_elempack == 4)
                {
                    // lanes 0-3 at outptr0, lanes 4-7 out_hstep * 4 further on
                    vst1_f16(outptr0, vget_low_f16(_sum0));
                    vst1_f16(outptr0 + out_hstep * 4, vget_high_f16(_sum0));
                    outptr0 += 4;
                }
                if (out_elempack == 1)
                {
                    // Spill to a stack array and scatter lane n to offset out_hstep * n.
                    __fp16 sum0[8];
                    vst1q_f16(sum0, _sum0);

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep * 1] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[out_hstep * 4] = sum0[4];
                    outptr0[out_hstep * 5] = sum0[5];
                    outptr0[out_hstep * 6] = sum0[6];
                    outptr0[out_hstep * 7] = sum0[7];
                    outptr0++;
                }
            }
            else
            {
                // Not the last k step: save the partial sum to outptr for the next pass.
                vst1q_f16(outptr, _sum0);
            }

            // outptr advances by 8 elements whether or not it was written.
            outptr += 8;
        }

        pAT += max_kk * 8;
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        __fp16* outptr0 = (__fp16*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        const __fp16* pB = pBT;

        if (pC)
        {
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const __fp16*)CT_tile + i + ii;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const __fp16*)CT_tile + j;
            }
        }

        int jj = 0;
        // Widest step: twelve jj positions per iteration, one 4-lane fp16
        // accumulator per position (_sum0 .. _sumb).
        for (; jj + 11 < max_jj; jj += 12)
        {
            float16x4_t _sum0;
            float16x4_t _sum1;
            float16x4_t _sum2;
            float16x4_t _sum3;
            float16x4_t _sum4;
            float16x4_t _sum5;
            float16x4_t _sum6;
            float16x4_t _sum7;
            float16x4_t _sum8;
            float16x4_t _sum9;
            float16x4_t _suma;
            float16x4_t _sumb;

            if (k == 0)
            {
                // First k step: start from zero, then optionally seed from pC.
                _sum0 = vdup_n_f16(0.f);
                _sum1 = vdup_n_f16(0.f);
                _sum2 = vdup_n_f16(0.f);
                _sum3 = vdup_n_f16(0.f);
                _sum4 = vdup_n_f16(0.f);
                _sum5 = vdup_n_f16(0.f);
                _sum6 = vdup_n_f16(0.f);
                _sum7 = vdup_n_f16(0.f);
                _sum8 = vdup_n_f16(0.f);
                _sum9 = vdup_n_f16(0.f);
                _suma = vdup_n_f16(0.f);
                _sumb = vdup_n_f16(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // one scalar replicated into every lane of every accumulator
                        _sum0 = vdup_n_f16(pC[0]);
                        _sum1 = vdup_n_f16(pC[0]);
                        _sum2 = vdup_n_f16(pC[0]);
                        _sum3 = vdup_n_f16(pC[0]);
                        _sum4 = vdup_n_f16(pC[0]);
                        _sum5 = vdup_n_f16(pC[0]);
                        _sum6 = vdup_n_f16(pC[0]);
                        _sum7 = vdup_n_f16(pC[0]);
                        _sum8 = vdup_n_f16(pC[0]);
                        _sum9 = vdup_n_f16(pC[0]);
                        _suma = vdup_n_f16(pC[0]);
                        _sumb = vdup_n_f16(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // the same 4 values seed all twelve accumulators; pC is not advanced
                        _sum0 = vld1_f16(pC);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                        _sum4 = _sum0;
                        _sum5 = _sum0;
                        _sum6 = _sum0;
                        _sum7 = _sum0;
                        _sum8 = _sum0;
                        _sum9 = _sum0;
                        _suma = _sum0;
                        _sumb = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        // 4 distinct values per accumulator, 48 consumed from pC in total
                        _sum0 = vld1_f16(pC);
                        _sum1 = vld1_f16(pC + 4);
                        _sum2 = vld1_f16(pC + 8);
                        _sum3 = vld1_f16(pC + 12);
                        _sum4 = vld1_f16(pC + 16);
                        _sum5 = vld1_f16(pC + 20);
                        _sum6 = vld1_f16(pC + 24);
                        _sum7 = vld1_f16(pC + 28);
                        _sum8 = vld1_f16(pC + 32);
                        _sum9 = vld1_f16(pC + 36);
                        _suma = vld1_f16(pC + 40);
                        _sumb = vld1_f16(pC + 44);
                        pC += 48;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar per accumulator, replicated across its lanes
                        _sum0 = vdup_n_f16(pC[0]);
                        _sum1 = vdup_n_f16(pC[1]);
                        _sum2 = vdup_n_f16(pC[2]);
                        _sum3 = vdup_n_f16(pC[3]);
                        _sum4 = vdup_n_f16(pC[4]);
                        _sum5 = vdup_n_f16(pC[5]);
                        _sum6 = vdup_n_f16(pC[6]);
                        _sum7 = vdup_n_f16(pC[7]);
                        _sum8 = vdup_n_f16(pC[8]);
                        _sum9 = vdup_n_f16(pC[9]);
                        _suma = vdup_n_f16(pC[10]);
                        _sumb = vdup_n_f16(pC[11]);
                        pC += 12;
                    }
                }
            }
            else
            {
                // Later k steps: resume from the partial sums saved at outptr.
                _sum0 = vld1_f16(outptr);
                _sum1 = vld1_f16(outptr + 4 * 1);
                _sum2 = vld1_f16(outptr + 4 * 2);
                _sum3 = vld1_f16(outptr + 4 * 3);
                _sum4 = vld1_f16(outptr + 4 * 4);
                _sum5 = vld1_f16(outptr + 4 * 5);
                _sum6 = vld1_f16(outptr + 4 * 6);
                _sum7 = vld1_f16(outptr + 4 * 7);
                _sum8 = vld1_f16(outptr + 4 * 8);
                _sum9 = vld1_f16(outptr + 4 * 9);
                _suma = vld1_f16(outptr + 4 * 10);
                _sumb = vld1_f16(outptr + 4 * 11);
            }

            // pA restarts from pAT on every jj iteration; pB keeps advancing across iterations.
            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pA = vld1_f16(pA);
                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);
                float16x4_t _pB2 = vld1_f16(pB + 8);

                // Each accumulator gets _pA scaled by one B lane:
                // _pB0 lanes feed _sum0.._sum3, _pB1 feeds _sum4.._sum7, _pB2 feeds _sum8.._sumb.
                _sum0 = vfma_lane_f16(_sum0, _pA, _pB0, 0);
                _sum1 = vfma_lane_f16(_sum1, _pA, _pB0, 1);
                _sum2 = vfma_lane_f16(_sum2, _pA, _pB0, 2);
                _sum3 = vfma_lane_f16(_sum3, _pA, _pB0, 3);
                _sum4 = vfma_lane_f16(_sum4, _pA, _pB1, 0);
                _sum5 = vfma_lane_f16(_sum5, _pA, _pB1, 1);
                _sum6 = vfma_lane_f16(_sum6, _pA, _pB1, 2);
                _sum7 = vfma_lane_f16(_sum7, _pA, _pB1, 3);
                _sum8 = vfma_lane_f16(_sum8, _pA, _pB2, 0);
                _sum9 = vfma_lane_f16(_sum9, _pA, _pB2, 1);
                _suma = vfma_lane_f16(_suma, _pA, _pB2, 2);
                _sumb = vfma_lane_f16(_sumb, _pA, _pB2, 3);

                pA += 4;
                pB += 12;
            }

            if (k_end)
            {
                // Last k step: write the result to outptr0 in the layout chosen by out_elempack.
                if (out_elempack == 4)
                {
                    // accumulators stored back to back, 4 elements each
                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + 4, _sum1);
                    vst1_f16(outptr0 + 4 * 2, _sum2);
                    vst1_f16(outptr0 + 4 * 3, _sum3);
                    vst1_f16(outptr0 + 4 * 4, _sum4);
                    vst1_f16(outptr0 + 4 * 5, _sum5);
                    vst1_f16(outptr0 + 4 * 6, _sum6);
                    vst1_f16(outptr0 + 4 * 7, _sum7);
                    vst1_f16(outptr0 + 4 * 8, _sum8);
                    vst1_f16(outptr0 + 4 * 9, _sum9);
                    vst1_f16(outptr0 + 4 * 10, _suma);
                    vst1_f16(outptr0 + 4 * 11, _sumb);
                    outptr0 += 48;
                }
                if (out_elempack == 1)
                {
                    // transpose4x12_ph is defined outside this view; presumably it regroups the
                    // twelve vectors in place so that each consecutive triple holds one output
                    // row of 12 values - confirm against its definition.
                    transpose4x12_ph(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, _sum8, _sum9, _suma, _sumb);

                    // four runs of 12 contiguous elements, spaced out_hstep apart
                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + 4, _sum1);
                    vst1_f16(outptr0 + 8, _sum2);
                    vst1_f16(outptr0 + out_hstep, _sum3);
                    vst1_f16(outptr0 + out_hstep + 4, _sum4);
                    vst1_f16(outptr0 + out_hstep + 8, _sum5);
                    vst1_f16(outptr0 + out_hstep * 2, _sum6);
                    vst1_f16(outptr0 + out_hstep * 2 + 4, _sum7);
                    vst1_f16(outptr0 + out_hstep * 2 + 8, _sum8);
                    vst1_f16(outptr0 + out_hstep * 3, _sum9);
                    vst1_f16(outptr0 + out_hstep * 3 + 4, _suma);
                    vst1_f16(outptr0 + out_hstep * 3 + 8, _sumb);
                    outptr0 += 12;
                }
            }
            else
            {
                // Not the last k step: save partial sums to outptr for the next pass.
                vst1_f16(outptr, _sum0);
                vst1_f16(outptr + 4, _sum1);
                vst1_f16(outptr + 4 * 2, _sum2);
                vst1_f16(outptr + 4 * 3, _sum3);
                vst1_f16(outptr + 4 * 4, _sum4);
                vst1_f16(outptr + 4 * 5, _sum5);
                vst1_f16(outptr + 4 * 6, _sum6);
                vst1_f16(outptr + 4 * 7, _sum7);
                vst1_f16(outptr + 4 * 8, _sum8);
                vst1_f16(outptr + 4 * 9, _sum9);
                vst1_f16(outptr + 4 * 10, _suma);
                vst1_f16(outptr + 4 * 11, _sumb);
            }

            // outptr advances by 12 x 4 elements whether or not it was written.
            outptr += 48;
        }
        // Eight jj positions per iteration, one 4-lane fp16 accumulator per
        // position (_sum0 .. _sum7).
        for (; jj + 7 < max_jj; jj += 8)
        {
            float16x4_t _sum0;
            float16x4_t _sum1;
            float16x4_t _sum2;
            float16x4_t _sum3;
            float16x4_t _sum4;
            float16x4_t _sum5;
            float16x4_t _sum6;
            float16x4_t _sum7;

            if (k == 0)
            {
                // First k step: start from zero, then optionally seed from pC.
                _sum0 = vdup_n_f16(0.f);
                _sum1 = vdup_n_f16(0.f);
                _sum2 = vdup_n_f16(0.f);
                _sum3 = vdup_n_f16(0.f);
                _sum4 = vdup_n_f16(0.f);
                _sum5 = vdup_n_f16(0.f);
                _sum6 = vdup_n_f16(0.f);
                _sum7 = vdup_n_f16(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // one scalar replicated into every lane of every accumulator
                        _sum0 = vdup_n_f16(pC[0]);
                        _sum1 = vdup_n_f16(pC[0]);
                        _sum2 = vdup_n_f16(pC[0]);
                        _sum3 = vdup_n_f16(pC[0]);
                        _sum4 = vdup_n_f16(pC[0]);
                        _sum5 = vdup_n_f16(pC[0]);
                        _sum6 = vdup_n_f16(pC[0]);
                        _sum7 = vdup_n_f16(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // the same 4 values seed all eight accumulators; pC is not advanced
                        _sum0 = vld1_f16(pC);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                        _sum4 = _sum0;
                        _sum5 = _sum0;
                        _sum6 = _sum0;
                        _sum7 = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        // 4 distinct values per accumulator, 32 consumed from pC in total
                        _sum0 = vld1_f16(pC);
                        _sum1 = vld1_f16(pC + 4);
                        _sum2 = vld1_f16(pC + 8);
                        _sum3 = vld1_f16(pC + 12);
                        _sum4 = vld1_f16(pC + 16);
                        _sum5 = vld1_f16(pC + 20);
                        _sum6 = vld1_f16(pC + 24);
                        _sum7 = vld1_f16(pC + 28);
                        pC += 32;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar per accumulator, replicated across its lanes
                        _sum0 = vdup_n_f16(pC[0]);
                        _sum1 = vdup_n_f16(pC[1]);
                        _sum2 = vdup_n_f16(pC[2]);
                        _sum3 = vdup_n_f16(pC[3]);
                        _sum4 = vdup_n_f16(pC[4]);
                        _sum5 = vdup_n_f16(pC[5]);
                        _sum6 = vdup_n_f16(pC[6]);
                        _sum7 = vdup_n_f16(pC[7]);
                        pC += 8;
                    }
                }
            }
            else
            {
                // Later k steps: resume from the partial sums saved at outptr.
                _sum0 = vld1_f16(outptr);
                _sum1 = vld1_f16(outptr + 4 * 1);
                _sum2 = vld1_f16(outptr + 4 * 2);
                _sum3 = vld1_f16(outptr + 4 * 3);
                _sum4 = vld1_f16(outptr + 4 * 4);
                _sum5 = vld1_f16(outptr + 4 * 5);
                _sum6 = vld1_f16(outptr + 4 * 6);
                _sum7 = vld1_f16(outptr + 4 * 7);
            }

            // pA restarts from pAT on every jj iteration; pB keeps advancing across iterations.
            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pA = vld1_f16(pA);
                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);

                // _pA scaled by one B lane per accumulator:
                // _pB0 lanes feed _sum0.._sum3, _pB1 lanes feed _sum4.._sum7.
                _sum0 = vfma_lane_f16(_sum0, _pA, _pB0, 0);
                _sum1 = vfma_lane_f16(_sum1, _pA, _pB0, 1);
                _sum2 = vfma_lane_f16(_sum2, _pA, _pB0, 2);
                _sum3 = vfma_lane_f16(_sum3, _pA, _pB0, 3);
                _sum4 = vfma_lane_f16(_sum4, _pA, _pB1, 0);
                _sum5 = vfma_lane_f16(_sum5, _pA, _pB1, 1);
                _sum6 = vfma_lane_f16(_sum6, _pA, _pB1, 2);
                _sum7 = vfma_lane_f16(_sum7, _pA, _pB1, 3);

                pA += 4;
                pB += 8;
            }

            if (k_end)
            {
                // Last k step: write the result to outptr0 in the layout chosen by out_elempack.
                if (out_elempack == 4)
                {
                    // accumulators stored back to back, 4 elements each
                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + 4, _sum1);
                    vst1_f16(outptr0 + 4 * 2, _sum2);
                    vst1_f16(outptr0 + 4 * 3, _sum3);
                    vst1_f16(outptr0 + 4 * 4, _sum4);
                    vst1_f16(outptr0 + 4 * 5, _sum5);
                    vst1_f16(outptr0 + 4 * 6, _sum6);
                    vst1_f16(outptr0 + 4 * 7, _sum7);
                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    // transpose4x8_ph is defined outside this view; presumably it regroups the
                    // eight vectors in place so that each consecutive pair holds one output
                    // row of 8 values - confirm against its definition.
                    transpose4x8_ph(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7);

                    // four runs of 8 contiguous elements, spaced out_hstep apart
                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + 4, _sum1);
                    vst1_f16(outptr0 + out_hstep, _sum2);
                    vst1_f16(outptr0 + out_hstep + 4, _sum3);
                    vst1_f16(outptr0 + out_hstep * 2, _sum4);
                    vst1_f16(outptr0 + out_hstep * 2 + 4, _sum5);
                    vst1_f16(outptr0 + out_hstep * 3, _sum6);
                    vst1_f16(outptr0 + out_hstep * 3 + 4, _sum7);
                    outptr0 += 8;
                }
            }
            else
            {
                // Not the last k step: save partial sums to outptr for the next pass.
                vst1_f16(outptr, _sum0);
                vst1_f16(outptr + 4, _sum1);
                vst1_f16(outptr + 4 * 2, _sum2);
                vst1_f16(outptr + 4 * 3, _sum3);
                vst1_f16(outptr + 4 * 4, _sum4);
                vst1_f16(outptr + 4 * 5, _sum5);
                vst1_f16(outptr + 4 * 6, _sum6);
                vst1_f16(outptr + 4 * 7, _sum7);
            }

            // outptr advances by 8 x 4 elements whether or not it was written.
            outptr += 32;
        }
        // Four jj positions per iteration, one 4-lane fp16 accumulator per
        // position (_sum0 .. _sum3).
        for (; jj + 3 < max_jj; jj += 4)
        {
            float16x4_t _sum0;
            float16x4_t _sum1;
            float16x4_t _sum2;
            float16x4_t _sum3;

            if (k == 0)
            {
                // First k step: start from zero, then optionally seed from pC.
                _sum0 = vdup_n_f16(0.f);
                _sum1 = vdup_n_f16(0.f);
                _sum2 = vdup_n_f16(0.f);
                _sum3 = vdup_n_f16(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // one scalar replicated into every lane of every accumulator
                        _sum0 = vdup_n_f16(pC[0]);
                        _sum1 = vdup_n_f16(pC[0]);
                        _sum2 = vdup_n_f16(pC[0]);
                        _sum3 = vdup_n_f16(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // the same 4 values seed all four accumulators; pC is not advanced
                        _sum0 = vld1_f16(pC);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        // 4 distinct values per accumulator, 16 consumed from pC in total
                        _sum0 = vld1_f16(pC);
                        _sum1 = vld1_f16(pC + 4);
                        _sum2 = vld1_f16(pC + 8);
                        _sum3 = vld1_f16(pC + 12);
                        pC += 16;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar per accumulator, replicated across its lanes
                        _sum0 = vdup_n_f16(pC[0]);
                        _sum1 = vdup_n_f16(pC[1]);
                        _sum2 = vdup_n_f16(pC[2]);
                        _sum3 = vdup_n_f16(pC[3]);
                        pC += 4;
                    }
                }
            }
            else
            {
                // Later k steps: resume from the partial sums saved at outptr.
                _sum0 = vld1_f16(outptr);
                _sum1 = vld1_f16(outptr + 4 * 1);
                _sum2 = vld1_f16(outptr + 4 * 2);
                _sum3 = vld1_f16(outptr + 4 * 3);
            }

            // pA restarts from pAT on every jj iteration; pB keeps advancing across iterations.
            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pA = vld1_f16(pA);
                float16x4_t _pB = vld1_f16(pB);

                // accumulator n receives _pA scaled by lane n of _pB
                _sum0 = vfma_lane_f16(_sum0, _pA, _pB, 0);
                _sum1 = vfma_lane_f16(_sum1, _pA, _pB, 1);
                _sum2 = vfma_lane_f16(_sum2, _pA, _pB, 2);
                _sum3 = vfma_lane_f16(_sum3, _pA, _pB, 3);

                pA += 4;
                pB += 4;
            }

            if (k_end)
            {
                // Last k step: write the result to outptr0 in the layout chosen by out_elempack.
                if (out_elempack == 4)
                {
                    // accumulators stored back to back, 4 elements each
                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + 4, _sum1);
                    vst1_f16(outptr0 + 4 * 2, _sum2);
                    vst1_f16(outptr0 + 4 * 3, _sum3);
                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    // transpose4x4_ph is defined outside this view; presumably an in-place
                    // 4x4 transpose so each vector becomes one output row - confirm.
                    transpose4x4_ph(_sum0, _sum1, _sum2, _sum3);

                    // one 4-element run per vector, spaced out_hstep apart
                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + out_hstep * 1, _sum1);
                    vst1_f16(outptr0 + out_hstep * 2, _sum2);
                    vst1_f16(outptr0 + out_hstep * 3, _sum3);
                    outptr0 += 4;
                }
            }
            else
            {
                // Not the last k step: save partial sums to outptr for the next pass.
                vst1_f16(outptr, _sum0);
                vst1_f16(outptr + 4, _sum1);
                vst1_f16(outptr + 4 * 2, _sum2);
                vst1_f16(outptr + 4 * 3, _sum3);
            }

            // outptr advances by 4 x 4 elements whether or not it was written.
            outptr += 16;
        }
        // Remainder loop: two jj positions per iteration, each with its own
        // 4-lane fp16 accumulator (_sum0 for jj, _sum1 for jj + 1).
        for (; jj + 1 < max_jj; jj += 2)
        {
            float16x4_t _sum0;
            float16x4_t _sum1;

            if (k == 0)
            {
                // First k step: start from zero, then optionally seed from pC.
                _sum0 = vdup_n_f16(0.f);
                _sum1 = vdup_n_f16(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // one scalar replicated into every lane of both accumulators
                        _sum0 = vdup_n_f16(pC[0]);
                        _sum1 = vdup_n_f16(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // the same 4 values seed both accumulators; pC is not advanced
                        _sum0 = vld1_f16(pC);
                        _sum1 = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        // 4 distinct values per accumulator, consumed from pC
                        _sum0 = vld1_f16(pC);
                        _sum1 = vld1_f16(pC + 4);
                        pC += 8;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar per accumulator, replicated across its lanes
                        _sum0 = vdup_n_f16(pC[0]);
                        _sum1 = vdup_n_f16(pC[1]);
                        pC += 2;
                    }
                }
            }
            else
            {
                // Later k steps: resume from the partial sums saved at outptr.
                _sum0 = vld1_f16(outptr);
                _sum1 = vld1_f16(outptr + 4);
            }

            // pA restarts from pAT on every jj iteration; pB keeps advancing across iterations.
            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pA = vld1_f16(pA);
                // each of the two B scalars is replicated across all 4 lanes
                float16x4_t _pB0 = vdup_n_f16(pB[0]);
                float16x4_t _pB1 = vdup_n_f16(pB[1]);

                _sum0 = vfma_f16(_sum0, _pA, _pB0);
                _sum1 = vfma_f16(_sum1, _pA, _pB1);

                pA += 4;
                pB += 2;
            }

            if (k_end)
            {
                // Last k step: write the result to outptr0 in the layout chosen by out_elempack.
                if (out_elempack == 4)
                {
                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + 4, _sum1);
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    // Spill to stack arrays and scatter: lane n goes to offset out_hstep * n,
                    // with _sum1 placed one element after _sum0.
                    __fp16 sum0[4];
                    __fp16 sum1[4];
                    vst1_f16(sum0, _sum0);
                    vst1_f16(sum1, _sum1);

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[1] = sum1[0];
                    outptr0[out_hstep + 1] = sum1[1];
                    outptr0[out_hstep * 2 + 1] = sum1[2];
                    outptr0[out_hstep * 3 + 1] = sum1[3];
                    outptr0 += 2;
                }
            }
            else
            {
                // Not the last k step: save partial sums to outptr for the next pass.
                vst1_f16(outptr, _sum0);
                vst1_f16(outptr + 4, _sum1);
            }

            // outptr advances by 2 x 4 elements whether or not it was written.
            outptr += 8;
        }
        // Final remainder loop: one jj position per iteration, accumulated in a
        // single 4-lane fp16 vector.
        for (; jj < max_jj; jj += 1)
        {
            float16x4_t _sum0;

            if (k == 0)
            {
                // First k step: start from zero, then optionally seed from pC.
                _sum0 = vdup_n_f16(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // one scalar replicated into every lane
                        _sum0 = vdup_n_f16(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // 4 values read from pC; pC is not advanced
                        _sum0 = vld1_f16(pC);
                    }
                    if (broadcast_type_C == 3)
                    {
                        // 4 values consumed from pC
                        _sum0 = vld1_f16(pC);
                        pC += 4;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar consumed from pC and replicated across the lanes
                        _sum0 = vdup_n_f16(pC[0]);
                        pC += 1;
                    }
                }
            }
            else
            {
                // Later k steps: resume from the partial sum saved at outptr.
                _sum0 = vld1_f16(outptr);
            }

            // pA restarts from pAT on every jj iteration; pB keeps advancing across iterations.
            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pA = vld1_f16(pA);
                float16x4_t _pB = vdup_n_f16(pB[0]);

                _sum0 = vfma_f16(_sum0, _pA, _pB);

                pA += 4;
                pB += 1;
            }

            if (k_end)
            {
                // Last k step: write the result to outptr0 in the layout chosen by out_elempack.
                if (out_elempack == 4)
                {
                    vst1_f16(outptr0, _sum0);
                    outptr0 += 4;
                }
                if (out_elempack == 1)
                {
                    // Spill to a stack array and scatter lane n to offset out_hstep * n.
                    __fp16 sum0[4];
                    vst1_f16(sum0, _sum0);

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0++;
                }
            }
            else
            {
                // Not the last k step: save the partial sum to outptr for the next pass.
                vst1_f16(outptr, _sum0);
            }

            // outptr advances by 4 elements whether or not it was written.
            outptr += 4;
        }

        pAT += max_kk * 4;
    }
    for (; ii + 1 < max_ii; ii += 2)
    {
        __fp16* outptr0 = (__fp16*)top_blob + (i + ii) * out_hstep + j;

        const __fp16* pB = pBT;

        if (pC)
        {
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const __fp16*)CT_tile + i + ii;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const __fp16*)CT_tile + j;
            }
        }

        int jj = 0;
        for (; jj + 11 < max_jj; jj += 12)
        {
            float16x4_t _sum00;
            float16x4_t _sum01;
            float16x4_t _sum02;
            float16x4_t _sum10;
            float16x4_t _sum11;
            float16x4_t _sum12;

            if (k == 0)
            {
                _sum00 = vdup_n_f16(0.f);
                _sum01 = vdup_n_f16(0.f);
                _sum02 = vdup_n_f16(0.f);
                _sum10 = vdup_n_f16(0.f);
                _sum11 = vdup_n_f16(0.f);
                _sum12 = vdup_n_f16(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum00 = vdup_n_f16(pC[0]);
                        _sum01 = _sum00;
                        _sum02 = _sum00;
                        _sum10 = _sum00;
                        _sum11 = _sum00;
                        _sum12 = _sum00;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum00 = vdup_n_f16(pC[0]);
                        _sum01 = _sum00;
                        _sum02 = _sum00;
                        _sum10 = vdup_n_f16(pC[1]);
                        _sum11 = _sum10;
                        _sum12 = _sum10;
                    }
                    if (broadcast_type_C == 3)
                    {
                        float16x4x2_t _tmp01 = vld2_f16(pC);
                        float16x4x2_t _tmp23 = vld2_f16(pC + 8);
                        float16x4x2_t _tmp45 = vld2_f16(pC + 16);
                        _sum00 = _tmp01.val[0];
                        _sum01 = _tmp23.val[0];
                        _sum02 = _tmp45.val[0];
                        _sum10 = _tmp01.val[1];
                        _sum11 = _tmp23.val[1];
                        _sum12 = _tmp45.val[1];
                        pC += 24;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum00 = vld1_f16(pC);
                        _sum01 = vld1_f16(pC + 4);
                        _sum02 = vld1_f16(pC + 8);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        _sum12 = _sum02;
                        pC += 12;
                    }
                }
            }
            else
            {
                float16x4x2_t _tmp01 = vld2_f16(outptr);
                float16x4x2_t _tmp23 = vld2_f16(outptr + 8);
                float16x4x2_t _tmp45 = vld2_f16(outptr + 16);
                _sum00 = _tmp01.val[0];
                _sum01 = _tmp23.val[0];
                _sum02 = _tmp45.val[0];
                _sum10 = _tmp01.val[1];
                _sum11 = _tmp23.val[1];
                _sum12 = _tmp45.val[1];
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);
                float16x4_t _pB2 = vld1_f16(pB + 8);

                float16x4_t _pA0 = vdup_n_f16(pA[0]);
                float16x4_t _pA1 = vdup_n_f16(pA[1]);

                _sum00 = vfma_f16(_sum00, _pB0, _pA0);
                _sum01 = vfma_f16(_sum01, _pB1, _pA0);
                _sum02 = vfma_f16(_sum02, _pB2, _pA0);
                _sum10 = vfma_f16(_sum10, _pB0, _pA1);
                _sum11 = vfma_f16(_sum11, _pB1, _pA1);
                _sum12 = vfma_f16(_sum12, _pB2, _pA1);

                pA += 2;
                pB += 12;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_f16(outptr0, _sum00);
                    vst1_f16(outptr0 + 4, _sum01);
                    vst1_f16(outptr0 + 8, _sum02);
                    vst1_f16(outptr0 + out_hstep, _sum10);
                    vst1_f16(outptr0 + out_hstep + 4, _sum11);
                    vst1_f16(outptr0 + out_hstep + 8, _sum12);
                    outptr0 += 12;
                }
            }
            else
            {
                float16x4x2_t _tmp01;
                _tmp01.val[0] = _sum00;
                _tmp01.val[1] = _sum10;
                float16x4x2_t _tmp23;
                _tmp23.val[0] = _sum01;
                _tmp23.val[1] = _sum11;
                float16x4x2_t _tmp45;
                _tmp45.val[0] = _sum02;
                _tmp45.val[1] = _sum12;
                vst2_f16(outptr, _tmp01);
                vst2_f16(outptr + 8, _tmp23);
                vst2_f16(outptr + 16, _tmp45);
            }

            outptr += 24;
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            float16x4_t _sum00;
            float16x4_t _sum01;
            float16x4_t _sum10;
            float16x4_t _sum11;

            if (k == 0)
            {
                _sum00 = vdup_n_f16(0.f);
                _sum01 = vdup_n_f16(0.f);
                _sum10 = vdup_n_f16(0.f);
                _sum11 = vdup_n_f16(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum00 = vdup_n_f16(pC[0]);
                        _sum01 = vdup_n_f16(pC[0]);
                        _sum10 = vdup_n_f16(pC[0]);
                        _sum11 = vdup_n_f16(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum00 = vdup_n_f16(pC[0]);
                        _sum01 = vdup_n_f16(pC[0]);
                        _sum10 = vdup_n_f16(pC[1]);
                        _sum11 = vdup_n_f16(pC[1]);
                    }
                    if (broadcast_type_C == 3)
                    {
                        float16x4x2_t _tmp01 = vld2_f16(pC);
                        float16x4x2_t _tmp23 = vld2_f16(pC + 8);
                        _sum00 = _tmp01.val[0];
                        _sum01 = _tmp23.val[0];
                        _sum10 = _tmp01.val[1];
                        _sum11 = _tmp23.val[1];
                        pC += 16;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum00 = vld1_f16(pC);
                        _sum01 = vld1_f16(pC + 4);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        pC += 8;
                    }
                }
            }
            else
            {
                float16x4x2_t _tmp01 = vld2_f16(outptr);
                float16x4x2_t _tmp23 = vld2_f16(outptr + 8);
                _sum00 = _tmp01.val[0];
                _sum01 = _tmp23.val[0];
                _sum10 = _tmp01.val[1];
                _sum11 = _tmp23.val[1];
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);

                float16x4_t _pA0 = vdup_n_f16(pA[0]);
                float16x4_t _pA1 = vdup_n_f16(pA[1]);

                _sum00 = vfma_f16(_sum00, _pB0, _pA0);
                _sum01 = vfma_f16(_sum01, _pB1, _pA0);
                _sum10 = vfma_f16(_sum10, _pB0, _pA1);
                _sum11 = vfma_f16(_sum11, _pB1, _pA1);

                pA += 2;
                pB += 8;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_f16(outptr0, _sum00);
                    vst1_f16(outptr0 + 4, _sum01);
                    vst1_f16(outptr0 + out_hstep, _sum10);
                    vst1_f16(outptr0 + out_hstep + 4, _sum11);
                    outptr0 += 8;
                }
            }
            else
            {
                float16x4x2_t _tmp01;
                _tmp01.val[0] = _sum00;
                _tmp01.val[1] = _sum10;
                float16x4x2_t _tmp23;
                _tmp23.val[0] = _sum01;
                _tmp23.val[1] = _sum11;
                vst2_f16(outptr, _tmp01);
                vst2_f16(outptr + 8, _tmp23);
            }

            outptr += 16;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float16x4_t _sum0;
            float16x4_t _sum1;

            if (k == 0)
            {
                _sum0 = vdup_n_f16(0.f);
                _sum1 = vdup_n_f16(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdup_n_f16(pC[0]);
                        _sum1 = vdup_n_f16(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vdup_n_f16(pC[0]);
                        _sum1 = vdup_n_f16(pC[1]);
                    }
                    if (broadcast_type_C == 3)
                    {
                        float16x4x2_t _tmp01 = vld2_f16(pC);
                        _sum0 = _tmp01.val[0];
                        _sum1 = _tmp01.val[1];
                        pC += 8;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vld1_f16(pC);
                        _sum1 = _sum0;
                        pC += 4;
                    }
                }
            }
            else
            {
                float16x4x2_t _tmp01 = vld2_f16(outptr);
                _sum0 = _tmp01.val[0];
                _sum1 = _tmp01.val[1];
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pB = vld1_f16(pB);

                float16x4_t _pA0 = vdup_n_f16(pA[0]);
                float16x4_t _pA1 = vdup_n_f16(pA[1]);

                _sum0 = vfma_f16(_sum0, _pB, _pA0);
                _sum1 = vfma_f16(_sum1, _pB, _pA1);

                pA += 2;
                pB += 4;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + out_hstep, _sum1);
                    outptr0 += 4;
                }
            }
            else
            {
                float16x4x2_t _tmp01;
                _tmp01.val[0] = _sum0;
                _tmp01.val[1] = _sum1;
                vst2_f16(outptr, _tmp01);
            }

            outptr += 8;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            __fp16 sum00;
            __fp16 sum01;
            __fp16 sum10;
            __fp16 sum11;

            if (k == 0)
            {
                sum00 = 0.f;
                sum01 = 0.f;
                sum10 = 0.f;
                sum11 = 0.f;

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        sum00 = pC[0];
                        sum01 = pC[0];
                        sum10 = pC[0];
                        sum11 = pC[0];
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        sum00 = pC[0];
                        sum01 = pC[1];
                        sum10 = pC[0];
                        sum11 = pC[1];
                    }
                    if (broadcast_type_C == 3)
                    {
                        sum00 = pC[0];
                        sum01 = pC[1];
                        sum10 = pC[2];
                        sum11 = pC[3];
                        pC += 4;
                    }
                    if (broadcast_type_C == 4)
                    {
                        sum00 = pC[0];
                        sum01 = pC[0];
                        sum10 = pC[1];
                        sum11 = pC[1];
                        pC += 2;
                    }
                }
            }
            else
            {
                sum00 = outptr[0];
                sum01 = outptr[1];
                sum10 = outptr[2];
                sum11 = outptr[3];
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum00 += pA[0] * pB[0];
                sum01 += pA[1] * pB[0];
                sum10 += pA[0] * pB[1];
                sum11 += pA[1] * pB[1];

                pA += 2;
                pB += 2;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum00;
                    outptr0[1] = sum10;
                    outptr0[out_hstep] = sum01;
                    outptr0[out_hstep + 1] = sum11;
                    outptr0 += 2;
                }
            }
            else
            {
                outptr[0] = sum00;
                outptr[1] = sum01;
                outptr[2] = sum10;
                outptr[3] = sum11;
            }

            outptr += 4;
        }
        for (; jj < max_jj; jj += 1)
        {
            __fp16 sum0;
            __fp16 sum1;

            if (k == 0)
            {
                sum0 = 0.f;
                sum1 = 0.f;

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        sum0 = pC[0];
                        sum1 = pC[0];
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        sum0 = pC[0];
                        sum1 = pC[1];
                    }
                    if (broadcast_type_C == 3)
                    {
                        sum0 = pC[0];
                        sum1 = pC[1];
                        pC += 2;
                    }
                    if (broadcast_type_C == 4)
                    {
                        sum0 = pC[0];
                        sum1 = pC[0];
                        pC += 1;
                    }
                }
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum0 += pA[0] * pB[0];
                sum1 += pA[1] * pB[0];
                pA += 2;
                pB += 1;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum0;
                    outptr0[out_hstep] = sum1;
                    outptr0++;
                }
            }
            else
            {
                outptr[0] = sum0;
                outptr[1] = sum1;
            }

            outptr += 2;
        }

        pAT += max_kk * 2;
    }
    for (; ii < max_ii; ii += 1)
    {
        __fp16* outptr0 = (__fp16*)top_blob + (i + ii) * out_hstep + j;

        const __fp16* pB = pBT;

        if (pC)
        {
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const __fp16*)CT_tile + i + ii;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const __fp16*)CT_tile + j;
            }
        }

        int jj = 0;
        for (; jj + 11 < max_jj; jj += 12)
        {
            float16x4_t _sum0;
            float16x4_t _sum1;
            float16x4_t _sum2;

            if (k == 0)
            {
                _sum0 = vdup_n_f16(0.f);
                _sum1 = vdup_n_f16(0.f);
                _sum2 = vdup_n_f16(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vdup_n_f16(pC[0]);
                        _sum1 = vdup_n_f16(pC[0]);
                        _sum2 = vdup_n_f16(pC[0]);
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        _sum0 = vld1_f16(pC);
                        _sum1 = vld1_f16(pC + 4);
                        _sum2 = vld1_f16(pC + 8);
                        pC += 12;
                    }
                }
            }
            else
            {
                _sum0 = vld1_f16(outptr);
                _sum1 = vld1_f16(outptr + 4);
                _sum2 = vld1_f16(outptr + 8);
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);
                float16x4_t _pB2 = vld1_f16(pB + 8);

                float16x4_t _pA0 = vdup_n_f16(pA[0]);

                _sum0 = vfma_f16(_sum0, _pA0, _pB0);
                _sum1 = vfma_f16(_sum1, _pA0, _pB1);
                _sum2 = vfma_f16(_sum2, _pA0, _pB2);

                pA += 1;
                pB += 12;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + 4, _sum1);
                    vst1_f16(outptr0 + 8, _sum2);
                    outptr0 += 12;
                }
            }
            else
            {
                vst1_f16(outptr, _sum0);
                vst1_f16(outptr + 4, _sum1);
                vst1_f16(outptr + 8, _sum2);
            }

            outptr += 12;
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            float16x4_t _sum0;
            float16x4_t _sum1;

            if (k == 0)
            {
                _sum0 = vdup_n_f16(0.f);
                _sum1 = vdup_n_f16(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vdup_n_f16(pC[0]);
                        _sum1 = vdup_n_f16(pC[0]);
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        _sum0 = vld1_f16(pC);
                        _sum1 = vld1_f16(pC + 4);
                        pC += 8;
                    }
                }
            }
            else
            {
                _sum0 = vld1_f16(outptr);
                _sum1 = vld1_f16(outptr + 4);
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);

                float16x4_t _pA0 = vdup_n_f16(pA[0]);

                _sum0 = vfma_f16(_sum0, _pA0, _pB0);
                _sum1 = vfma_f16(_sum1, _pA0, _pB1);

                pA += 1;
                pB += 8;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_f16(outptr0, _sum0);
                    vst1_f16(outptr0 + 4, _sum1);
                    outptr0 += 8;
                }
            }
            else
            {
                vst1_f16(outptr, _sum0);
                vst1_f16(outptr + 4, _sum1);
            }

            outptr += 8;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float16x4_t _sum;

            if (k == 0)
            {
                _sum = vdup_n_f16(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum = vdup_n_f16(pC[0]);
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        _sum = vld1_f16(pC);
                        pC += 4;
                    }
                }
            }
            else
            {
                _sum = vld1_f16(outptr);
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float16x4_t _pB = vld1_f16(pB);
                float16x4_t _pA = vdup_n_f16(pA[0]);

                _sum = vfma_f16(_sum, _pA, _pB);

                pA += 1;
                pB += 4;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_f16(outptr0, _sum);
                    outptr0 += 4;
                }
            }
            else
            {
                vst1_f16(outptr, _sum);
            }

            outptr += 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            __fp16 sum0;
            __fp16 sum1;

            if (k == 0)
            {
                sum0 = 0.f;
                sum1 = 0.f;

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        sum0 = pC[0];
                        sum1 = pC[0];
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        sum0 = pC[0];
                        sum1 = pC[1];
                        pC += 2;
                    }
                }
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum0 += pA[0] * pB[0];
                sum1 += pA[0] * pB[1];

                pA += 1;
                pB += 2;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum0;
                    outptr0[1] = sum1;
                    outptr0 += 2;
                }
            }
            else
            {
                outptr[0] = sum0;
                outptr[1] = sum1;
            }

            outptr += 2;
        }
        for (; jj < max_jj; jj += 1)
        {
            __fp16 sum;

            if (k == 0)
            {
                sum = 0.f;

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        sum = pC[0];
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        sum = pC[0];
                        pC += 1;
                    }
                }
            }
            else
            {
                sum = outptr[0];
            }

            const __fp16* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                sum += pA[0] * pB[0];
                pA += 1;
                pB += 1;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = sum;
                    outptr0++;
                }
            }
            else
            {
                outptr[0] = sum;
            }

            outptr += 1;
        }

        pAT += max_kk;
    }
}

// Pick TILE_M / TILE_N / TILE_K for the fp16 gemm drivers.
//
// The initial tile edge comes from the L2 cache size reported by
// get_cpu_level2_cache_size(). Each tile is then shrunk so that a positive
// M / N / K is split into near-equal pieces. TILE_M and TILE_K are kept
// multiples of 8 and TILE_N a multiple of 4.
//
// M, N, K            problem sizes; a non-positive value skips the per-dimension clamp
// constant_TILE_*    when > 0, overrides the computed tile (rounded up to 8 / 4 / 8)
// TILE_M/N/K         outputs
// nT                 thread count; 0 means "use get_physical_big_cpu_count()"
static void get_optimal_tile_mnk_fp16sa(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
    const size_t l2_cache_size = get_cpu_level2_cache_size();

    if (nT == 0)
    {
        nT = get_physical_big_cpu_count();
    }

    // starting edge length: sqrt of (cache bytes / 3 / element size)
    const int base_edge = (int)sqrtf((float)l2_cache_size / 3 / sizeof(__fp16));

    // round down to the required multiple, but never below one unit
    TILE_M = std::max(8, base_edge - base_edge % 8);
    TILE_N = std::max(4, base_edge - base_edge % 4);
    TILE_K = std::max(8, base_edge - base_edge % 8);

    if (K > 0)
    {
        // spread K evenly over the number of K tiles, rounded up to 8
        const int k_tiles = (K + TILE_K - 1) / TILE_K;
        const int k_per_tile = (K + k_tiles - 1) / k_tiles;
        TILE_K = std::min(TILE_K, (k_per_tile + 7) / 8 * 8);

        if (k_tiles == 1)
        {
            // the whole K fits in one tile: recompute the M/N edge from
            // (cache bytes / 2 / element size / TILE_K)
            const int wide_edge = (int)((float)l2_cache_size / 2 / sizeof(__fp16) / TILE_K);

            TILE_M = std::max(8, wide_edge - wide_edge % 8);
            TILE_N = std::max(4, wide_edge - wide_edge % 4);
        }
    }

    // scale M by the number of threads that can actually run in parallel
    TILE_M *= std::min(nT, get_physical_cpu_count());

    if (M > 0)
    {
        const int m_tiles = (M + TILE_M - 1) / TILE_M;
        const int m_per_tile = (M + m_tiles - 1) / m_tiles;
        TILE_M = std::min(TILE_M, (m_per_tile + 7) / 8 * 8);
    }

    if (N > 0)
    {
        const int n_tiles = (N + TILE_N - 1) / TILE_N;
        const int n_per_tile = (N + n_tiles - 1) / n_tiles;
        TILE_N = std::min(TILE_N, (n_per_tile + 3) / 4 * 4);
    }

    if (nT > 1)
    {
        // hand each thread roughly TILE_M / nT rows, rounded up to 8
        const int m_per_thread = std::max(1, TILE_M / nT);
        TILE_M = std::min(TILE_M, (m_per_thread + 7) / 8 * 8);
    }

    // explicit tile sizes always win over the computed ones
    if (constant_TILE_M > 0)
    {
        TILE_M = (constant_TILE_M + 7) / 8 * 8;
    }
    if (constant_TILE_N > 0)
    {
        TILE_N = (constant_TILE_N + 3) / 4 * 4;
    }
    if (constant_TILE_K > 0)
    {
        TILE_K = (constant_TILE_K + 7) / 8 * 8;
    }
}

static int gemm_arm_fp16sa(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int broadcast_type_C, int transA, int transB, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
    const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
    const int N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;

    // NCNN_LOGE("M/N/K = %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_fp16sa(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K);

    int nn_M = (M + TILE_M - 1) / TILE_M;
    int nn_N = (N + TILE_N - 1) / TILE_N;
    int nn_K = (K + TILE_K - 1) / TILE_K;

    Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator);
    if (ATX.empty())
        return -100;
    Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    // pack B
    #pragma omp parallel for num_threads(nT)
    for (int ppjk = 0; ppjk < nn_NK; ppjk++)
    {
        const int ppj = ppjk / nn_K;
        const int ppk = ppjk % nn_K;

        const int j = ppj * TILE_N;
        const int k = ppk * TILE_K;

        const int max_jj = std::min((N - j), TILE_N);
        const int max_kk = std::min((K - k), TILE_K);

        Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

        if (transB)
        {
            pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
        }
        else
        {
            transpose_pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
        }
    }

    Mat topT;
    if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
    {
        topT.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator);
        if (topT.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int ppi = 0; ppi < nn_M; ppi++)
    {
        const int i = ppi * TILE_M;

        // shadowed variable for less openmp task args
        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;

        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile;
        if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
            topT_tile = topT.channel(get_omp_thread_num());

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            if (broadcast_type_C == 3)
            {
                pack_A_tile_bf16_fp16(C, topT_tile, i, max_ii, j, max_jj);
            }

            const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                // NCNN_LOGE("max_ii/jj/kk = %d %d %d", max_ii, max_jj, max_kk);

                Mat AT_tile = ATX.channel(get_omp_thread_num()).row_range(k / TILE_K, 1);

                Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

                if (j == 0)
                {
                    if (transA)
                    {
                        transpose_pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);
                    }
                    else
                    {
                        pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);
                    }
                }

                bool k_end = !output_transpose && k + TILE_K >= K;

                gemm_transB_packed_tile_fp16sa(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end);
            }

            if (output_transpose)
            {
                transpose_unpack_output_tile_bf16_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);
            }
        }
    }

    return 0;
}

// Tiled fp16 gemm driver for an A operand that has already been packed.
//
// AT is read as AT.channel(M tile index).row_range(K tile index, 1), so it is
// expected to have been packed with the same TILE_M / TILE_K this function
// computes. M and K are supplied by the caller; N is derived from B and
// transB. B is packed up front into BT, then every (i, j, k) tile is handed to
// gemm_transB_packed_tile_fp16sa(). When output_transpose is set, the finished
// tile is written to top_blob by transpose_unpack_output_tile_bf16_fp16().
//
// Returns 0 on success, -100 when a workspace Mat comes back empty.
static int gemm_AT_arm_fp16sa(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob, int broadcast_type_C, int M, int K, int transB, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    const int N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;

    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_fp16sa(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    const int nn_M = (M + TILE_M - 1) / TILE_M;
    const int nn_N = (N + TILE_N - 1) / TILE_N;
    const int nn_K = (K + TILE_K - 1) / TILE_K;

    // packed B workspace: one channel per N tile, one row per K tile, 2-byte elements
    Mat BT(TILE_K * TILE_N, nn_K, nn_N, 2u, opt.workspace_allocator);
    if (BT.empty())
    {
        return -100;
    }

    // pack every (N tile, K tile) pair of B in parallel
    const int total_B_tiles = nn_N * nn_K;

    #pragma omp parallel for num_threads(nT)
    for (int t = 0; t < total_B_tiles; t++)
    {
        const int tile_j = t / nn_K;
        const int tile_k = t % nn_K;

        const int j = tile_j * TILE_N;
        const int k = tile_k * TILE_K;

        const int max_jj = std::min(N - j, TILE_N);
        const int max_kk = std::min(K - k, TILE_K);

        Mat BT_tile = BT.channel(tile_j).row_range(tile_k, 1);

        if (!transB)
        {
            transpose_pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
        }
        else
        {
            pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
        }
    }

    // per-thread scratch tile, only needed when K spans several tiles,
    // when C is packed per tile (broadcast type 3) or when the output is transposed
    Mat topT;
    if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
    {
        topT.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator);
        if (topT.empty())
        {
            return -100;
        }
    }

    #pragma omp parallel for num_threads(nT)
    for (int ppi = 0; ppi < nn_M; ppi++)
    {
        const int i = ppi * TILE_M;
        const int max_ii = std::min(M - i, TILE_M);

        const bool use_topT = K > TILE_K || broadcast_type_C == 3 || output_transpose;
        const bool pack_C = broadcast_type_C == 3;

        Mat topT_tile;
        if (use_topT)
        {
            topT_tile = topT.channel(get_omp_thread_num());
        }

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min(N - j, TILE_N);
            const int tile_j = j / TILE_N;

            // broadcast type 3: C is packed into the scratch tile first
            if (pack_C)
            {
                pack_A_tile_bf16_fp16(C, topT_tile, i, max_ii, j, max_jj);
            }

            const Mat& CT_tile = pack_C ? topT_tile : C;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min(K - k, TILE_K);
                const int tile_k = k / TILE_K;

                // A was packed ahead of time; just select the matching tile
                Mat AT_tile = AT.channel(ppi).row_range(tile_k, 1);
                Mat BT_tile = BT.channel(tile_j).row_range(tile_k, 1);

                // last K tile and no transposed output pending
                const bool k_end = !(output_transpose || k + TILE_K < K);

                gemm_transB_packed_tile_fp16sa(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end);
            }

            if (output_transpose)
            {
                transpose_unpack_output_tile_bf16_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);
            }
        }
    }

    return 0;
}

// GEMM driver for a runtime A and an already packed B (BT), using 2-byte elements throughout.
//
// A is packed tile by tile into a per-thread scratch buffer (ATX) while the first
// N tile (j == 0) is processed; the packed rows are then reused for the other N tiles.
// BT is indexed as one channel per N tile and one row per K tile.
//
// Returns 0 on success and -100 when a workspace allocation fails.
static int gemm_BT_arm_fp16sa(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob, int broadcast_type_C, int N, int K, int transA, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;

    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_fp16sa(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    const int m_tiles = (M + TILE_M - 1) / TILE_M;
    const int k_tiles = (K + TILE_K - 1) / TILE_K;

    // one channel per thread, one row per K tile
    Mat ATX(TILE_K * TILE_M, k_tiles, nT, 2u, opt.workspace_allocator);
    if (ATX.empty())
        return -100;

    // per-thread scratch tile, only allocated when the kernel cannot write straight to top_blob
    Mat topT;
    if (output_transpose || broadcast_type_C == 3 || K > TILE_K)
    {
        topT.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator);
        if (topT.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int mt = 0; mt < m_tiles; mt++)
    {
        // shadowed variable for less openmp task args
        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;

        const int i0 = mt * TILE_M;
        const int rows = std::min(M - i0, TILE_M);

        const int tid = get_omp_thread_num();

        Mat topT_tile;
        if (output_transpose || broadcast_type_C == 3 || K > TILE_K)
            topT_tile = topT.channel(tid);

        for (int j0 = 0; j0 < N; j0 += TILE_N)
        {
            const int cols = std::min(N - j0, TILE_N);
            const bool full_C = broadcast_type_C == 3;

            // a full MxN C is first packed into the scratch tile
            if (full_C)
                pack_A_tile_bf16_fp16(C, topT_tile, i0, rows, j0, cols);

            const Mat& CT_tile = full_C ? topT_tile : C;

            for (int k0 = 0; k0 < K; k0 += TILE_K)
            {
                const int depth = std::min(K - k0, TILE_K);

                Mat AT_tile = ATX.channel(tid).row_range(k0 / TILE_K, 1);
                Mat BT_tile = BT.channel(j0 / TILE_N).row_range(k0 / TILE_K, 1);

                // pack A only once per M tile, during the first N tile
                if (j0 == 0)
                {
                    if (!transA)
                        pack_A_tile_bf16_fp16(A, AT_tile, i0, rows, k0, depth);
                    else
                        transpose_pack_A_tile_bf16_fp16(A, AT_tile, i0, rows, k0, depth);
                }

                const bool last_k = k0 + TILE_K >= K;
                bool k_end = last_k && !output_transpose;

                gemm_transB_packed_tile_fp16sa(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i0, rows, j0, cols, k0, depth, k_end);
            }

            // transposed output is written from the scratch tile after all K tiles are done
            if (output_transpose)
                transpose_unpack_output_tile_bf16_fp16(topT_tile, top_blob, i0, rows, j0, cols);
        }
    }

    return 0;
}

// GEMM driver for the case where both A and B are already packed (AT, BT), 2-byte elements.
//
// AT is indexed as one channel per M tile, BT as one channel per N tile; both use
// one row per K tile. M, N and K are supplied by the caller.
//
// Returns 0 on success and -100 when the scratch allocation fails.
static int gemm_AT_BT_arm_fp16sa(const Mat& AT, const Mat& BT, const Mat& C, Mat& top_blob, int broadcast_type_C, int M, int N, int K, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_fp16sa(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    const int m_tiles = (M + TILE_M - 1) / TILE_M;

    // a per-thread scratch tile is used when K spans several tiles, C is a full MxN
    // matrix, or the output has to be transposed
    const bool need_topT = output_transpose || broadcast_type_C == 3 || K > TILE_K;

    Mat topT;
    if (need_topT)
    {
        topT.create(TILE_N * TILE_M, 1, nT, 2u, opt.workspace_allocator);
        if (topT.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int mt = 0; mt < m_tiles; mt++)
    {
        const int i0 = mt * TILE_M;
        const int rows = std::min(M - i0, TILE_M);

        Mat topT_tile;
        if (need_topT)
            topT_tile = topT.channel(get_omp_thread_num());

        for (int j0 = 0; j0 < N; j0 += TILE_N)
        {
            const int cols = std::min(N - j0, TILE_N);
            const bool full_C = broadcast_type_C == 3;

            // a full MxN C is first packed into the scratch tile
            if (full_C)
                pack_A_tile_bf16_fp16(C, topT_tile, i0, rows, j0, cols);

            const Mat& CT_tile = full_C ? topT_tile : C;

            for (int k0 = 0; k0 < K; k0 += TILE_K)
            {
                const int depth = std::min(K - k0, TILE_K);

                Mat AT_tile = AT.channel(i0 / TILE_M).row_range(k0 / TILE_K, 1);
                Mat BT_tile = BT.channel(j0 / TILE_N).row_range(k0 / TILE_K, 1);

                const bool last_k = k0 + TILE_K >= K;
                bool k_end = last_k && !output_transpose;

                gemm_transB_packed_tile_fp16sa(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i0, rows, j0, cols, k0, depth, k_end);
            }

            // transposed output is written from the scratch tile after all K tiles are done
            if (output_transpose)
                transpose_unpack_output_tile_bf16_fp16(topT_tile, top_blob, i0, rows, j0, cols);
        }
    }

    return 0;
}

int Gemm_arm::create_pipeline_fp16sa(const Option& opt)
{
    if (constantA)
    {
        const int M = constantM;
        const int K = constantK;

        int TILE_M, TILE_N, TILE_K;
        get_optimal_tile_mnk_fp16sa(M, 0, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, opt.num_threads);

        const int nn_M = (M + TILE_M - 1) / TILE_M;

        AT_data.create(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, 2u, (Allocator*)0);
        if (AT_data.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ppj = 0; ppj < nn_M; ppj++)
        {
            const int i = ppj * TILE_M;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_ii = std::min((M - i), TILE_M);
                const int max_kk = std::min((K - k), TILE_K);

                Mat AT_tile = AT_data.channel(i / TILE_M).row_range(k / TILE_K, 1);

                if (transA)
                {
                    transpose_pack_A_tile_fp32_to_fp16(A_data, AT_tile, i, max_ii, k, max_kk);
                }
                else
                {
                    pack_A_tile_fp32_to_fp16(A_data, AT_tile, i, max_ii, k, max_kk);
                }
            }
        }

        if (opt.lightmode)
            A_data.release();
    }

    if (constantB)
    {
        const int N = constantN;
        const int K = constantK;

        int TILE_M, TILE_N, TILE_K;
        get_optimal_tile_mnk_fp16sa(0, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, opt.num_threads);

        const int nn_N = (N + TILE_N - 1) / TILE_N;

        BT_data.create(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, (Allocator*)0);
        if (BT_data.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ppj = 0; ppj < nn_N; ppj++)
        {
            const int j = ppj * TILE_N;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_jj = std::min((N - j), TILE_N);
                const int max_kk = std::min((K - k), TILE_K);

                Mat BT_tile = BT_data.channel(j / TILE_N).row_range(k / TILE_K, 1);

                if (transB)
                {
                    pack_B_tile_fp32_to_fp16(B_data, BT_tile, j, max_jj, k, max_kk);
                }
                else
                {
                    transpose_pack_B_tile_fp32_to_fp16(B_data, BT_tile, j, max_jj, k, max_kk);
                }
            }
        }

        if (opt.lightmode)
            B_data.release();
    }

    if (constantC && constant_broadcast_type_C != -1)
    {
        cast_float32_to_float16(C_data, CT_data, opt);
        if (CT_data.empty())
            return -100;

        if (constant_broadcast_type_C == 3 && opt.use_packing_layout)
        {
            int C_elempack = constantM % 8 == 0 ? 8 : constantM % 4 == 0 ? 4 : 1;
            Mat tmp;
            convert_packing(CT_data, tmp, C_elempack, opt);
            CT_data = tmp;
            if (CT_data.empty())
                return -100;
        }

        // pre-multiply C with beta
        if (beta != 1.f)
        {
            const int size = CT_data.total() * CT_data.elempack;
            __fp16* ptr = CT_data;
            for (int i = 0; i < size; i++)
            {
                ptr[i] *= beta;
            }
        }

        if (opt.lightmode)
            C_data.release();
    }

    if (constantA || constantB || constantC)
    {
        nT = opt.num_threads;
    }

    return 0;
}

int Gemm_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    int M;
    int N;
    if (constantA && constantB)
    {
        M = constantM;
        N = constantN;
    }
    else if (constantA)
    {
        const Mat& B = bottom_blobs[0];
        M = constantM;
        N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;
    }
    else if (constantB)
    {
        const Mat& A = bottom_blobs[0];
        M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        N = constantN;
    }
    else
    {
        const Mat& A = bottom_blobs[0];
        const Mat& B = bottom_blobs[1];
        M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;
    }

    Mat C;
    int broadcast_type_C = 0;
    if (constantC)
    {
        C = CT_data;
        broadcast_type_C = constant_broadcast_type_C;
    }
    else
    {
        if (constantA && constantB)
        {
            C = bottom_blobs.size() == 1 ? bottom_blobs[0] : Mat();
        }
        else if (constantA)
        {
            C = bottom_blobs.size() == 2 ? bottom_blobs[1] : Mat();
        }
        else if (constantB)
        {
            C = bottom_blobs.size() == 2 ? bottom_blobs[1] : Mat();
        }
        else
        {
            C = bottom_blobs.size() == 3 ? bottom_blobs[2] : Mat();
        }

        if (!C.empty())
        {
            if (C.dims == 1 && C.w == 1)
            {
                // scalar
                broadcast_type_C = 0;
            }
            if (C.dims == 1 && C.w * C.elempack == M)
            {
                // M
                // auto broadcast from h to w is the ncnn-style convention
                broadcast_type_C = 1;
            }
            if (C.dims == 1 && C.w * C.elempack == N)
            {
                // N
                broadcast_type_C = 4;
            }
            if (C.dims == 2 && C.w == 1 && C.h * C.elempack == M)
            {
                // Mx1
                broadcast_type_C = 2;
            }
            if (C.dims == 2 && C.w == N && C.h * C.elempack == M)
            {
                // MxN
                broadcast_type_C = 3;
            }
            if (C.dims == 2 && C.w == N && C.h * C.elempack == 1)
            {
                // 1xN
                broadcast_type_C = 4;
            }

            // pre-multiply C with beta
            if (beta != 1.f)
            {
                Mat CT_data;
                CT_data.create_like(C, opt.workspace_allocator);
                if (CT_data.empty())
                    return -100;

                const int size = C.total() * C.elempack;
                const __fp16* ptr = C;
                __fp16* outptr = CT_data;
                for (int i = 0; i < size; i++)
                {
                    outptr[i] = ptr[i] * (__fp16)beta;
                }

                C = CT_data;
            }
        }
    }

    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        int outh = output_transpose ? N : M;
        out_elempack = outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1;
    }
    if (output_elempack)
        out_elempack = output_elempack;
    size_t out_elemsize = 2u * out_elempack;

    Mat& top_blob = top_blobs[0];
    if (output_transpose)
    {
        if (output_N1M)
            top_blob.create(M, 1, N / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        else
            top_blob.create(M, N / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    else
    {
        if (output_N1M)
            top_blob.create(N, 1, M / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        else
            top_blob.create(N, M / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    int _nT = nT ? nT : opt.num_threads;
    if (nT != 0 && opt.num_threads != nT)
    {
        // force num_threads the same as in create_pipeline
        // so we could use pre-packed A/B from the same tile config
        NCNN_LOGE("opt.num_threads %d changed, gemm will use load-time value %d", opt.num_threads, nT);
    }

    int ret = 0;
    if (constantA && constantB)
    {
        ret = gemm_AT_BT_arm_fp16sa(AT_data, BT_data, C, top_blob, broadcast_type_C, constantM, constantN, constantK, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    else if (constantA)
    {
        const Mat& B = bottom_blobs[0];
        ret = gemm_AT_arm_fp16sa(AT_data, B, C, top_blob, broadcast_type_C, constantM, constantK, transB, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    else if (constantB)
    {
        const Mat& A = bottom_blobs[0];
        ret = gemm_BT_arm_fp16sa(A, BT_data, C, top_blob, broadcast_type_C, constantN, constantK, transA, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    else
    {
        const Mat& A = bottom_blobs[0];
        const Mat& B = bottom_blobs[1];
        ret = gemm_arm_fp16sa(A, B, C, top_blob, broadcast_type_C, transA, transB, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    if (ret != 0)
        return ret;

    // multiply top_blob with alpha
    if (alpha != 1.f)
    {
        const int size = top_blob.total() * out_elempack;
        __fp16* ptr = top_blob;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < size; i++)
        {
            ptr[i] *= alpha;
        }
    }

    return 0;
}

#if NCNN_INT8
// Non-static entry point that forwards all arguments unchanged to
// compute_A_tile_fp16_int8_scales.
// NOTE(review): presumably exists so other translation units can reach the
// version compiled in this file - confirm against the callers.
void compute_A_tile_fp16_int8_scales_asimdhp(const Mat& A, Mat& scales, float B_scale, Mat& out_descales, int i, int max_ii)
{
    compute_A_tile_fp16_int8_scales(A, scales, B_scale, out_descales, i, max_ii);
}

// Non-static entry point that forwards all arguments unchanged to
// transpose_compute_A_tile_fp16_int8_scales.
void transpose_compute_A_tile_fp16_int8_scales_asimdhp(const Mat& A, Mat& scales, float B_scale, Mat& out_descales, int i, int max_ii)
{
    transpose_compute_A_tile_fp16_int8_scales(A, scales, B_scale, out_descales, i, max_ii);
}

// Non-static entry point that forwards to compute_B_fp16_int8_scale;
// the result is written through the `scale` reference.
void compute_B_fp16_int8_scale_asimdhp(const Mat& B, float& scale)
{
    compute_B_fp16_int8_scale(B, scale);
}
#endif // NCNN_INT8

} // namespace ncnn


================================================
FILE: src/layer/arm/gemm_arm_i8mm.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cpu.h"
#include "mat.h"
#include "arm_usability.h"

namespace ncnn {

#include "gemm_int8.h"
#include "gemm_int8_fp16s.h"

#if NCNN_BF16
#include "gemm_int8_bf16s.h"
#endif

// i8mm-suffixed entry point: forwards all arguments unchanged to pack_A_tile_int8.
// NOTE(review): presumably this file is built with i8mm enabled so the shared
// implementation gets compiled for that ISA - confirm in the build configuration.
void pack_A_tile_int8_i8mm(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
    pack_A_tile_int8(A, AT, i, max_ii, k, max_kk);
}

// i8mm-suffixed entry point: forwards all arguments unchanged to transpose_pack_A_tile_int8.
void transpose_pack_A_tile_int8_i8mm(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
    transpose_pack_A_tile_int8(A, AT, i, max_ii, k, max_kk);
}

// i8mm-suffixed entry point: forwards all arguments unchanged to pack_B_tile_int8.
void pack_B_tile_int8_i8mm(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk)
{
    pack_B_tile_int8(B, BT, j, max_jj, k, max_kk);
}

// i8mm-suffixed entry point: forwards all arguments unchanged to transpose_pack_B_tile_int8.
void transpose_pack_B_tile_int8_i8mm(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk)
{
    transpose_pack_B_tile_int8(B, BT, j, max_jj, k, max_kk);
}

// i8mm-suffixed entry point: forwards all arguments, including the scales Mat,
// unchanged to pack_A_tile_fp32_to_int8.
void pack_A_tile_fp32_to_int8_i8mm(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
    pack_A_tile_fp32_to_int8(A, AT, i, max_ii, k, max_kk, scales);
}

// i8mm-suffixed entry point: forwards all arguments, including the scales Mat,
// unchanged to transpose_pack_A_tile_fp32_to_int8.
void transpose_pack_A_tile_fp32_to_int8_i8mm(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
    transpose_pack_A_tile_fp32_to_int8(A, AT, i, max_ii, k, max_kk, scales);
}

// i8mm-suffixed entry point: forwards all arguments, including the single float
// scale, unchanged to pack_B_tile_fp32_to_int8.
void pack_B_tile_fp32_to_int8_i8mm(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
    pack_B_tile_fp32_to_int8(B, BT, j, max_jj, k, max_kk, scale);
}

// i8mm-suffixed entry point: forwards all arguments, including the single float
// scale, unchanged to transpose_pack_B_tile_fp32_to_int8.
void transpose_pack_B_tile_fp32_to_int8_i8mm(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
    transpose_pack_B_tile_fp32_to_int8(B, BT, j, max_jj, k, max_kk, scale);
}

// i8mm-suffixed entry point for the packed int8 tile kernel: forwards all
// arguments unchanged to gemm_transB_packed_tile_int8, which receives topT_tile
// as its only non-const Mat.
void gemm_transB_packed_tile_int8_i8mm(const Mat& AT_tile, const Mat& BT_tile, Mat& topT_tile, int i, int max_ii, int j, int max_jj, int k, int max_kk)
{
    gemm_transB_packed_tile_int8(AT_tile, BT_tile, topT_tile, i, max_ii, j, max_jj, k, max_kk);
}

// i8mm-suffixed entry point: forwards all arguments, including the scales Mat,
// unchanged to pack_A_tile_fp16_to_int8.
void pack_A_tile_fp16_to_int8_i8mm(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
    pack_A_tile_fp16_to_int8(A, AT, i, max_ii, k, max_kk, scales);
}

// i8mm-suffixed entry point: forwards all arguments, including the scales Mat,
// unchanged to transpose_pack_A_tile_fp16_to_int8.
void transpose_pack_A_tile_fp16_to_int8_i8mm(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
    transpose_pack_A_tile_fp16_to_int8(A, AT, i, max_ii, k, max_kk, scales);
}

// i8mm-suffixed entry point: forwards all arguments, including the single float
// scale, unchanged to pack_B_tile_fp16_to_int8.
void pack_B_tile_fp16_to_int8_i8mm(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
    pack_B_tile_fp16_to_int8(B, BT, j, max_jj, k, max_kk, scale);
}

// i8mm-suffixed entry point: forwards all arguments, including the single float
// scale, unchanged to transpose_pack_B_tile_fp16_to_int8.
void transpose_pack_B_tile_fp16_to_int8_i8mm(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
    transpose_pack_B_tile_fp16_to_int8(B, BT, j, max_jj, k, max_kk, scale);
}

#if NCNN_BF16
// i8mm-suffixed entry point (only built when NCNN_BF16 is set): forwards all
// arguments, including the scales Mat, unchanged to pack_A_tile_bf16_to_int8.
void pack_A_tile_bf16_to_int8_i8mm(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
    pack_A_tile_bf16_to_int8(A, AT, i, max_ii, k, max_kk, scales);
}

// i8mm-suffixed entry point (only built when NCNN_BF16 is set): forwards all
// arguments, including the scales Mat, unchanged to transpose_pack_A_tile_bf16_to_int8.
void transpose_pack_A_tile_bf16_to_int8_i8mm(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
    transpose_pack_A_tile_bf16_to_int8(A, AT, i, max_ii, k, max_kk, scales);
}

// i8mm-suffixed entry point (only built when NCNN_BF16 is set): forwards all
// arguments, including the single float scale, unchanged to pack_B_tile_bf16_to_int8.
void pack_B_tile_bf16_to_int8_i8mm(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
    pack_B_tile_bf16_to_int8(B, BT, j, max_jj, k, max_kk, scale);
}

// i8mm-suffixed entry point (only built when NCNN_BF16 is set): forwards all
// arguments, including the single float scale, unchanged to transpose_pack_B_tile_bf16_to_int8.
void transpose_pack_B_tile_bf16_to_int8_i8mm(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
    transpose_pack_B_tile_bf16_to_int8(B, BT, j, max_jj, k, max_kk, scale);
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/gemm_arm_vfpv4.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "gemm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"

#include "cpu.h"

namespace ncnn {

#include "gemm_bf16s_fp16s.h"
#include "gemm_fp16s.h"

#if NCNN_INT8
#include "gemm_int8_fp16s.h"
#endif

extern void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk);

static int gemm_arm_fp16s(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int broadcast_type_C, int transA, int transB, int output_transpose, float alpha, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
    const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;
    const int N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;

    // NCNN_LOGE("M/N/K = %d %d %d", M, N, K);

    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_bf16s_fp16s(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K);

    int nn_M = (M + TILE_M - 1) / TILE_M;
    int nn_N = (N + TILE_N - 1) / TILE_N;
    int nn_K = (K + TILE_K - 1) / TILE_K;

    Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 2u, opt.workspace_allocator);
    if (ATX.empty())
        return -100;
    Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nn_NK = nn_N * nn_K;

    // pack B
    #pragma omp parallel for num_threads(nT)
    for (int ppjk = 0; ppjk < nn_NK; ppjk++)
    {
        const int ppj = ppjk / nn_K;
        const int ppk = ppjk % nn_K;

        const int j = ppj * TILE_N;
        const int k = ppk * TILE_K;

        const int max_jj = std::min((N - j), TILE_N);
        const int max_kk = std::min((K - k), TILE_K);

        Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

        if (transB)
        {
            pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
        }
        else
        {
            transpose_pack_B_tile_bf16_fp16(B, BT_tile, j, max_jj, k, max_kk);
        }
    }

    Mat topT;
    if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
    {
        topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
        if (topT.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int ppi = 0; ppi < nn_M; ppi++)
    {
        const int i = ppi * TILE_M;

        // shadowed variable for less openmp task args
        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;

        const int max_ii = std::min((M - i), TILE_M);

        Mat topT_tile;
        if (K > TILE_K || broadcast_type_C == 3 || output_transpose)
            topT_tile = topT.channel(get_omp_thread_num());

        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min((N - j), TILE_N);

            if (broadcast_type_C == 3)
            {
                pack_A_tile(C, topT_tile, i, max_ii, j, max_jj);
            }

            const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min((K - k), TILE_K);

                // NCNN_LOGE("max_ii/jj/kk = %d %d %d", max_ii, max_jj, max_kk);

                Mat AT_tile = ATX.channel(get_omp_thread_num()).row_range(k / TILE_K, 1);

                Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1);

                if (j == 0)
                {
                    if (transA)
                    {
                        transpose_pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);
                    }
                    else
                    {
                        pack_A_tile_bf16_fp16(A, AT_tile, i, max_ii, k, max_kk);
                    }
                }

                bool k_end = !output_transpose && k + TILE_K >= K;
                float _alpha = k + TILE_K >= K ? alpha : 1.f;

                gemm_transB_packed_tile_fp16s(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, _alpha, i, max_ii, j, max_jj, k, max_kk, k_end);
            }

            if (output_transpose)
            {
                transpose_unpack_output_tile_fp32_to_fp16(topT_tile, top_blob, i, max_ii, j, max_jj);
            }
        }
    }

    return 0;
}

// GEMM driver for an already packed A (AT) and a runtime B stored as 2-byte elements,
// with a 4-byte scratch tile.
//
// B is packed up front into BT (one channel per N tile, one row per K tile) by a
// parallel pass over all (N tile, K tile) pairs. AT is indexed as one channel per
// M tile and one row per K tile. alpha is handed to the kernel only for the last
// K tile; earlier tiles get 1.f.
//
// Returns 0 on success and -100 when a workspace allocation fails.
static int gemm_AT_arm_fp16s(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob, int broadcast_type_C, int M, int K, int transB, int output_transpose, float alpha, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    const int N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;

    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_bf16s_fp16s(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    const int m_tiles = (M + TILE_M - 1) / TILE_M;
    const int n_tiles = (N + TILE_N - 1) / TILE_N;
    const int k_tiles = (K + TILE_K - 1) / TILE_K;

    // fully packed B
    Mat BT(TILE_K * TILE_N, k_tiles, n_tiles, 2u, opt.workspace_allocator);
    if (BT.empty())
        return -100;

    const int nk_tiles = n_tiles * k_tiles;

    // pack B
    #pragma omp parallel for num_threads(nT)
    for (int t = 0; t < nk_tiles; t++)
    {
        // split the flat index into (N tile, K tile)
        const int nt = t / k_tiles;
        const int kt = t % k_tiles;

        const int j0 = nt * TILE_N;
        const int k0 = kt * TILE_K;

        const int cols = std::min(N - j0, TILE_N);
        const int depth = std::min(K - k0, TILE_K);

        Mat BT_tile = BT.channel(j0 / TILE_N).row_range(k0 / TILE_K, 1);

        if (!transB)
            transpose_pack_B_tile_bf16_fp16(B, BT_tile, j0, cols, k0, depth);
        else
            pack_B_tile_bf16_fp16(B, BT_tile, j0, cols, k0, depth);
    }

    // a per-thread 4-byte scratch tile is used when K spans several tiles, C is a
    // full MxN matrix, or the output has to be transposed
    const bool need_topT = output_transpose || broadcast_type_C == 3 || K > TILE_K;

    Mat topT;
    if (need_topT)
    {
        topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
        if (topT.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int mt = 0; mt < m_tiles; mt++)
    {
        const int i0 = mt * TILE_M;
        const int rows = std::min(M - i0, TILE_M);

        Mat topT_tile;
        if (need_topT)
            topT_tile = topT.channel(get_omp_thread_num());

        for (int j0 = 0; j0 < N; j0 += TILE_N)
        {
            const int cols = std::min(N - j0, TILE_N);
            const bool full_C = broadcast_type_C == 3;

            // a full MxN C is first packed into the scratch tile
            if (full_C)
                pack_A_tile(C, topT_tile, i0, rows, j0, cols);

            const Mat& CT_tile = full_C ? topT_tile : C;

            for (int k0 = 0; k0 < K; k0 += TILE_K)
            {
                const int depth = std::min(K - k0, TILE_K);

                Mat AT_tile = AT.channel(i0 / TILE_M).row_range(k0 / TILE_K, 1);
                Mat BT_tile = BT.channel(j0 / TILE_N).row_range(k0 / TILE_K, 1);

                const bool last_k = k0 + TILE_K >= K;
                bool k_end = last_k && !output_transpose;
                float _alpha = last_k ? alpha : 1.f;

                gemm_transB_packed_tile_fp16s(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, _alpha, i0, rows, j0, cols, k0, depth, k_end);
            }

            // transposed output is written from the scratch tile after all K tiles are done
            if (output_transpose)
                transpose_unpack_output_tile_fp32_to_fp16(topT_tile, top_blob, i0, rows, j0, cols);
        }
    }

    return 0;
}

// GEMM driver for a runtime A stored as 2-byte elements and an already packed B (BT),
// with a 4-byte scratch tile.
//
// A is packed per thread into ATX while the first N tile (j == 0) is processed and
// reused for the other N tiles. BT is indexed as one channel per N tile and one row
// per K tile. alpha is handed to the kernel only for the last K tile; earlier tiles
// get 1.f.
//
// Returns 0 on success and -100 when a workspace allocation fails.
static int gemm_BT_arm_fp16s(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob, int broadcast_type_C, int N, int K, int transA, int output_transpose, float alpha, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;

    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_bf16s_fp16s(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    const int m_tiles = (M + TILE_M - 1) / TILE_M;
    const int k_tiles = (K + TILE_K - 1) / TILE_K;

    // per-thread packed A: one channel per thread, one row per K tile
    Mat ATX(TILE_K * TILE_M, k_tiles, nT, 2u, opt.workspace_allocator);
    if (ATX.empty())
        return -100;

    // per-thread scratch tile with 4-byte elements
    Mat topT;
    if (output_transpose || broadcast_type_C == 3 || K > TILE_K)
    {
        topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
        if (topT.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int mt = 0; mt < m_tiles; mt++)
    {
        // shadowed variable for less openmp task args
        const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w;

        const int i0 = mt * TILE_M;
        const int rows = std::min(M - i0, TILE_M);

        const int tid = get_omp_thread_num();

        Mat topT_tile;
        if (output_transpose || broadcast_type_C == 3 || K > TILE_K)
            topT_tile = topT.channel(tid);

        for (int j0 = 0; j0 < N; j0 += TILE_N)
        {
            const int cols = std::min(N - j0, TILE_N);
            const bool full_C = broadcast_type_C == 3;

            // a full MxN C is first packed into the scratch tile
            if (full_C)
                pack_A_tile(C, topT_tile, i0, rows, j0, cols);

            const Mat& CT_tile = full_C ? topT_tile : C;

            for (int k0 = 0; k0 < K; k0 += TILE_K)
            {
                const int depth = std::min(K - k0, TILE_K);

                Mat AT_tile = ATX.channel(tid).row_range(k0 / TILE_K, 1);
                Mat BT_tile = BT.channel(j0 / TILE_N).row_range(k0 / TILE_K, 1);

                // pack A only once per M tile, during the first N tile
                if (j0 == 0)
                {
                    if (!transA)
                        pack_A_tile_bf16_fp16(A, AT_tile, i0, rows, k0, depth);
                    else
                        transpose_pack_A_tile_bf16_fp16(A, AT_tile, i0, rows, k0, depth);
                }

                const bool last_k = k0 + TILE_K >= K;
                bool k_end = last_k && !output_transpose;
                float _alpha = last_k ? alpha : 1.f;

                gemm_transB_packed_tile_fp16s(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, _alpha, i0, rows, j0, cols, k0, depth, k_end);
            }

            // transposed output is written from the scratch tile after all K tiles are done
            if (output_transpose)
                transpose_unpack_output_tile_fp32_to_fp16(topT_tile, top_blob, i0, rows, j0, cols);
        }
    }

    return 0;
}

// GEMM driver for the case where both A and B are already packed (AT, BT),
// with a 4-byte scratch tile.
//
// AT is indexed as one channel per M tile, BT as one channel per N tile; both use
// one row per K tile. alpha is handed to the kernel only for the last K tile;
// earlier tiles get 1.f.
//
// Returns 0 on success and -100 when the scratch allocation fails.
static int gemm_AT_BT_arm_fp16s(const Mat& AT, const Mat& BT, const Mat& C, Mat& top_blob, int broadcast_type_C, int M, int N, int K, int output_transpose, float alpha, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, const Option& opt)
{
    int TILE_M, TILE_N, TILE_K;
    get_optimal_tile_mnk_bf16s_fp16s(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT);

    const int m_tiles = (M + TILE_M - 1) / TILE_M;

    // a per-thread 4-byte scratch tile is used when K spans several tiles, C is a
    // full MxN matrix, or the output has to be transposed
    const bool need_topT = output_transpose || broadcast_type_C == 3 || K > TILE_K;

    Mat topT;
    if (need_topT)
    {
        topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator);
        if (topT.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(nT)
    for (int mt = 0; mt < m_tiles; mt++)
    {
        const int i0 = mt * TILE_M;
        const int rows = std::min(M - i0, TILE_M);

        Mat topT_tile;
        if (need_topT)
            topT_tile = topT.channel(get_omp_thread_num());

        for (int j0 = 0; j0 < N; j0 += TILE_N)
        {
            const int cols = std::min(N - j0, TILE_N);
            const bool full_C = broadcast_type_C == 3;

            // a full MxN C is first packed into the scratch tile
            if (full_C)
                pack_A_tile(C, topT_tile, i0, rows, j0, cols);

            const Mat& CT_tile = full_C ? topT_tile : C;

            for (int k0 = 0; k0 < K; k0 += TILE_K)
            {
                const int depth = std::min(K - k0, TILE_K);

                Mat AT_tile = AT.channel(i0 / TILE_M).row_range(k0 / TILE_K, 1);
                Mat BT_tile = BT.channel(j0 / TILE_N).row_range(k0 / TILE_K, 1);

                const bool last_k = k0 + TILE_K >= K;
                bool k_end = last_k && !output_transpose;
                float _alpha = last_k ? alpha : 1.f;

                gemm_transB_packed_tile_fp16s(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, _alpha, i0, rows, j0, cols, k0, depth, k_end);
            }

            // transposed output is written from the scratch tile after all K tiles are done
            if (output_transpose)
                transpose_unpack_output_tile_fp32_to_fp16(topT_tile, top_blob, i0, rows, j0, cols);
        }
    }

    return 0;
}

int Gemm_arm::create_pipeline_fp16s(const Option& opt)
{
    if (constantA)
    {
        const int M = constantM;
        const int K = constantK;

        int TILE_M, TILE_N, TILE_K;
        get_optimal_tile_mnk_bf16s_fp16s(M, 0, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, opt.num_threads);

        const int nn_M = (M + TILE_M - 1) / TILE_M;

        AT_data.create(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, 2u, (Allocator*)0);
        if (AT_data.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ppj = 0; ppj < nn_M; ppj++)
        {
            const int i = ppj * TILE_M;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_ii = std::min((M - i), TILE_M);
                const int max_kk = std::min((K - k), TILE_K);

                Mat AT_tile = AT_data.channel(i / TILE_M).row_range(k / TILE_K, 1);

                if (transA)
                {
                    transpose_pack_A_tile_fp32_to_fp16(A_data, AT_tile, i, max_ii, k, max_kk);
                }
                else
                {
                    pack_A_tile_fp32_to_fp16(A_data, AT_tile, i, max_ii, k, max_kk);
                }
            }
        }

        if (opt.lightmode)
            A_data.release();
    }

    if (constantB)
    {
        const int N = constantN;
        const int K = constantK;

        int TILE_M, TILE_N, TILE_K;
        get_optimal_tile_mnk_bf16s_fp16s(0, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, opt.num_threads);

        const int nn_N = (N + TILE_N - 1) / TILE_N;

        BT_data.create(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, (Allocator*)0);
        if (BT_data.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ppj = 0; ppj < nn_N; ppj++)
        {
            const int j = ppj * TILE_N;

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_jj = std::min((N - j), TILE_N);
                const int max_kk = std::min((K - k), TILE_K);

                Mat BT_tile = BT_data.channel(j / TILE_N).row_range(k / TILE_K, 1);

                if (transB)
                {
                    pack_B_tile_fp32_to_fp16(B_data, BT_tile, j, max_jj, k, max_kk);
                }
                else
                {
                    transpose_pack_B_tile_fp32_to_fp16(B_data, BT_tile, j, max_jj, k, max_kk);
                }
            }
        }

        if (opt.lightmode)
            B_data.release();
    }

    if (constantC && constant_broadcast_type_C != -1)
    {
        CT_data = C_data;

#if __ARM_NEON
        if (constant_broadcast_type_C == 3 && opt.use_packing_layout)
        {
            int C_elempack = constantM % 4 == 0 ? 4 : 1;
            convert_packing(C_data, CT_data, C_elempack, opt);
            if (CT_data.empty())
                return -100;
        }
#endif // __ARM_NEON

        // pre-multiply C with beta
        if (beta != 1.f)
        {
            Mat C2;
            C2.create_like(CT_data);
            if (C2.empty())
                return -100;

            const int size = CT_data.total() * CT_data.elempack;
            for (int i = 0; i < size; i++)
            {
                C2[i] = CT_data[i] * beta;
            }

            CT_data = C2;
        }

        if (opt.lightmode)
            C_data.release();
    }

    if (constantA || constantB || constantC)
    {
        nT = opt.num_threads;
    }

    return 0;
}

int Gemm_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    int M;
    int N;
    if (constantA && constantB)
    {
        M = constantM;
        N = constantN;
    }
    else if (constantA)
    {
        const Mat& B = bottom_blobs[0];
        M = constantM;
        N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;
    }
    else if (constantB)
    {
        const Mat& A = bottom_blobs[0];
        M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        N = constantN;
    }
    else
    {
        const Mat& A = bottom_blobs[0];
        const Mat& B = bottom_blobs[1];
        M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack;
        N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w;
    }

    Mat C;
    int broadcast_type_C = 0;
    if (constantC)
    {
        C = CT_data;
        broadcast_type_C = constant_broadcast_type_C;
    }
    else
    {
        if (constantA && constantB)
        {
            C = bottom_blobs.size() == 1 ? bottom_blobs[0] : Mat();
        }
        else if (constantA)
        {
            C = bottom_blobs.size() == 2 ? bottom_blobs[1] : Mat();
        }
        else if (constantB)
        {
            C = bottom_blobs.size() == 2 ? bottom_blobs[1] : Mat();
        }
        else
        {
            C = bottom_blobs.size() == 3 ? bottom_blobs[2] : Mat();
        }

        if (!C.empty())
        {
            if (C.dims == 1 && C.w == 1)
            {
                // scalar
                broadcast_type_C = 0;
            }
            if (C.dims == 1 && C.w * C.elempack == M)
            {
                // M
                // auto broadcast from h to w is the ncnn-style convention
                broadcast_type_C = 1;
            }
            if (C.dims == 1 && C.w * C.elempack == N)
            {
                // N
                broadcast_type_C = 4;
            }
            if (C.dims == 2 && C.w == 1 && C.h * C.elempack == M)
            {
                // Mx1
                broadcast_type_C = 2;
            }
            if (C.dims == 2 && C.w == N && C.h * C.elempack == M)
            {
                // MxN
                broadcast_type_C = 3;
            }
            if (C.dims == 2 && C.w == N && C.h * C.elempack == 1)
            {
                // 1xN
                broadcast_type_C = 4;
            }

            // cast to fp32
            {
                Mat CT_data;
                cast_float16_to_float32(C, CT_data);
                C = CT_data;
                if (C.empty())
                    return -100;
            }

            // pre-multiply C with beta
            if (beta != 1.f)
            {
                Mat CT_data;
                CT_data.create_like(C, opt.workspace_allocator);
                if (CT_data.empty())
                    return -100;

                const int size = C.total() * C.elempack;
                for (int i = 0; i < size; i++)
                {
                    CT_data[i] = C[i] * beta;
                }

                C = CT_data;
            }
        }
    }

    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        int outh = output_transpose ? N : M;
        out_elempack = outh % 4 == 0 ? 4 : 1;
    }
    if (output_elempack)
        out_elempack = output_elempack;
    size_t out_elemsize = 2u * out_elempack;

    Mat& top_blob = top_blobs[0];
    if (output_transpose)
    {
        if (output_N1M)
            top_blob.create(M, 1, N / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        else
            top_blob.create(M, N / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    else
    {
        if (output_N1M)
            top_blob.create(N, 1, M / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        else
            top_blob.create(N, M / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    int _nT = nT ? nT : opt.num_threads;
    if (nT != 0 && opt.num_threads != nT)
    {
        // force num_threads the same as in create_pipeline
        // so we could use pre-packed A/B from the same tile config
        NCNN_LOGE("opt.num_threads %d changed, gemm will use load-time value %d", opt.num_threads, nT);
    }

    int ret = 0;
    if (constantA && constantB)
    {
        ret = gemm_AT_BT_arm_fp16s(AT_data, BT_data, C, top_blob, broadcast_type_C, constantM, constantN, constantK, output_transpose, alpha, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    else if (constantA)
    {
        const Mat& B = bottom_blobs[0];
        ret = gemm_AT_arm_fp16s(AT_data, B, C, top_blob, broadcast_type_C, constantM, constantK, transB, output_transpose, alpha, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    else if (constantB)
    {
        const Mat& A = bottom_blobs[0];
        ret = gemm_BT_arm_fp16s(A, BT_data, C, top_blob, broadcast_type_C, constantN, constantK, transA, output_transpose, alpha, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }
    else
    {
        const Mat& A = bottom_blobs[0];
        const Mat& B = bottom_blobs[1];
        ret = gemm_arm_fp16s(A, B, C, top_blob, broadcast_type_C, transA, transB, output_transpose, alpha, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, opt);
    }

    return ret;
}

#if NCNN_INT8
// Externally visible `_vfpv4` entry point: forwards every argument unchanged to
// compute_A_tile_fp16_int8_scales().
// NOTE(review): presumably a dispatch target for callers built without these arch flags — confirm.
void compute_A_tile_fp16_int8_scales_vfpv4(const Mat& A, Mat& scales, float B_scale, Mat& out_descales, int i, int max_ii)
{
    compute_A_tile_fp16_int8_scales(A, scales, B_scale, out_descales, i, max_ii);
}

// Externally visible `_vfpv4` entry point: forwards every argument unchanged to
// transpose_compute_A_tile_fp16_int8_scales().
// NOTE(review): presumably a dispatch target for callers built without these arch flags — confirm.
void transpose_compute_A_tile_fp16_int8_scales_vfpv4(const Mat& A, Mat& scales, float B_scale, Mat& out_descales, int i, int max_ii)
{
    transpose_compute_A_tile_fp16_int8_scales(A, scales, B_scale, out_descales, i, max_ii);
}

// Externally visible `_vfpv4` entry point: forwards every argument unchanged to
// pack_A_tile_fp16_to_int8().
// NOTE(review): presumably a dispatch target for callers built without these arch flags — confirm.
void pack_A_tile_fp16_to_int8_vfpv4(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
    pack_A_tile_fp16_to_int8(A, AT, i, max_ii, k, max_kk, scales);
}

// Externally visible `_vfpv4` entry point: forwards every argument unchanged to
// transpose_pack_A_tile_fp16_to_int8().
// NOTE(review): presumably a dispatch target for callers built without these arch flags — confirm.
void transpose_pack_A_tile_fp16_to_int8_vfpv4(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
    transpose_pack_A_tile_fp16_to_int8(A, AT, i, max_ii, k, max_kk, scales);
}

// Externally visible `_vfpv4` entry point: forwards B and the output reference `scale`
// unchanged to compute_B_fp16_int8_scale().
// NOTE(review): presumably a dispatch target for callers built without these arch flags — confirm.
void compute_B_fp16_int8_scale_vfpv4(const Mat& B, float& scale)
{
    compute_B_fp16_int8_scale(B, scale);
}

// Externally visible `_vfpv4` entry point: forwards every argument unchanged to
// pack_B_tile_fp16_to_int8().
// NOTE(review): presumably a dispatch target for callers built without these arch flags — confirm.
void pack_B_tile_fp16_to_int8_vfpv4(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
    pack_B_tile_fp16_to_int8(B, BT, j, max_jj, k, max_kk, scale);
}

// Externally visible `_vfpv4` entry point: forwards every argument unchanged to
// transpose_pack_B_tile_fp16_to_int8().
// NOTE(review): presumably a dispatch target for callers built without these arch flags — confirm.
void transpose_pack_B_tile_fp16_to_int8_vfpv4(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
    transpose_pack_B_tile_fp16_to_int8(B, BT, j, max_jj, k, max_kk, scale);
}

// Externally visible `_vfpv4` entry point: forwards every argument unchanged to
// unpack_output_tile_int32_to_fp16().
// NOTE(review): presumably a dispatch target for callers built without these arch flags — confirm.
void unpack_output_tile_int32_to_fp16_vfpv4(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta)
{
    unpack_output_tile_int32_to_fp16(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
}

// Externally visible `_vfpv4` entry point: forwards every argument unchanged to
// transpose_unpack_output_tile_int32_to_fp16().
// NOTE(review): presumably a dispatch target for callers built without these arch flags — confirm.
void transpose_unpack_output_tile_int32_to_fp16_vfpv4(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta)
{
    transpose_unpack_output_tile_int32_to_fp16(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
}
#endif // NCNN_INT8

} // namespace ncnn


================================================
FILE: src/layer/arm/gemm_bf16s.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Pack a tile of the fp32 matrix A into bfloat16 and write it sequentially into AT.
//
// The tile covers rows [i, i + max_ii) and columns [k, k + max_kk) of A, where a row
// is A_hstep floats apart (cstep for 3-D mats, w otherwise).
//
// Rows are consumed in groups of 8 (aarch64 only), 4 (NEON only), 2 and finally 1.
// Within a group the output is k-major: for each k the values of all rows in the
// group are stored next to each other, as the scalar tail loops show. The vector
// paths produce the same order via interleaving stores (vst4/vst2) or, for the
// 8-row group, via transpose8x8_u16 (defined elsewhere).
static void pack_A_tile_fp32_to_bf16(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;

    // running output cursor, shared by all row groups
    unsigned short* pp = AT;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    // 8 rows at a time
    for (; ii + 7 < max_ii; ii += 8)
    {
        const float* p0 = (const float*)A + (i + ii) * A_hstep + k;
        const float* p1 = (const float*)A + (i + ii + 1) * A_hstep + k;
        const float* p2 = (const float*)A + (i + ii + 2) * A_hstep + k;
        const float* p3 = (const float*)A + (i + ii + 3) * A_hstep + k;
        const float* p4 = (const float*)A + (i + ii + 4) * A_hstep + k;
        const float* p5 = (const float*)A + (i + ii + 5) * A_hstep + k;
        const float* p6 = (const float*)A + (i + ii + 6) * A_hstep + k;
        const float* p7 = (const float*)A + (i + ii + 7) * A_hstep + k;

        int kk = 0;
        // 8x8 block: convert 8 values per row, transpose, store 64 values
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8_t _r0 = vcombine_u16(float2bfloat(vld1q_f32(p0)), float2bfloat(vld1q_f32(p0 + 4)));
            uint16x8_t _r1 = vcombine_u16(float2bfloat(vld1q_f32(p1)), float2bfloat(vld1q_f32(p1 + 4)));
            uint16x8_t _r2 = vcombine_u16(float2bfloat(vld1q_f32(p2)), float2bfloat(vld1q_f32(p2 + 4)));
            uint16x8_t _r3 = vcombine_u16(float2bfloat(vld1q_f32(p3)), float2bfloat(vld1q_f32(p3 + 4)));
            uint16x8_t _r4 = vcombine_u16(float2bfloat(vld1q_f32(p4)), float2bfloat(vld1q_f32(p4 + 4)));
            uint16x8_t _r5 = vcombine_u16(float2bfloat(vld1q_f32(p5)), float2bfloat(vld1q_f32(p5 + 4)));
            uint16x8_t _r6 = vcombine_u16(float2bfloat(vld1q_f32(p6)), float2bfloat(vld1q_f32(p6 + 4)));
            uint16x8_t _r7 = vcombine_u16(float2bfloat(vld1q_f32(p7)), float2bfloat(vld1q_f32(p7 + 4)));
            transpose8x8_u16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7);
            vst1q_u16(pp, _r0);
            vst1q_u16(pp + 8, _r1);
            vst1q_u16(pp + 8 * 2, _r2);
            vst1q_u16(pp + 8 * 3, _r3);
            vst1q_u16(pp + 8 * 4, _r4);
            vst1q_u16(pp + 8 * 5, _r5);
            vst1q_u16(pp + 8 * 6, _r6);
            vst1q_u16(pp + 8 * 7, _r7);
            pp += 64;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
            p4 += 8;
            p5 += 8;
            p6 += 8;
            p7 += 8;
        }
        // remaining k: one column of 8 rows per iteration
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_bfloat16(p0[0]);
            pp[1] = float32_to_bfloat16(p1[0]);
            pp[2] = float32_to_bfloat16(p2[0]);
            pp[3] = float32_to_bfloat16(p3[0]);
            pp[4] = float32_to_bfloat16(p4[0]);
            pp[5] = float32_to_bfloat16(p5[0]);
            pp[6] = float32_to_bfloat16(p6[0]);
            pp[7] = float32_to_bfloat16(p7[0]);
            pp += 8;
            p0++;
            p1++;
            p2++;
            p3++;
            p4++;
            p5++;
            p6++;
            p7++;
        }
    }
#endif // __aarch64__
    // 4 rows at a time
    for (; ii + 3 < max_ii; ii += 4)
    {
        const float* p0 = (const float*)A + (i + ii) * A_hstep + k;
        const float* p1 = (const float*)A + (i + ii + 1) * A_hstep + k;
        const float* p2 = (const float*)A + (i + ii + 2) * A_hstep + k;
        const float* p3 = (const float*)A + (i + ii + 3) * A_hstep + k;

        int kk = 0;
        // vst4q_u16 interleaves the four rows element by element
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8x4_t _r0123;
            _r0123.val[0] = vcombine_u16(float2bfloat(vld1q_f32(p0)), float2bfloat(vld1q_f32(p0 + 4)));
            _r0123.val[1] = vcombine_u16(float2bfloat(vld1q_f32(p1)), float2bfloat(vld1q_f32(p1 + 4)));
            _r0123.val[2] = vcombine_u16(float2bfloat(vld1q_f32(p2)), float2bfloat(vld1q_f32(p2 + 4)));
            _r0123.val[3] = vcombine_u16(float2bfloat(vld1q_f32(p3)), float2bfloat(vld1q_f32(p3 + 4)));
            vst4q_u16(pp, _r0123);
            pp += 32;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4x4_t _r0123;
            _r0123.val[0] = float2bfloat(vld1q_f32(p0));
            _r0123.val[1] = float2bfloat(vld1q_f32(p1));
            _r0123.val[2] = float2bfloat(vld1q_f32(p2));
            _r0123.val[3] = float2bfloat(vld1q_f32(p3));
            vst4_u16(pp, _r0123);
            pp += 16;
            p0 += 4;
            p1 += 4;
            p2 += 4;
            p3 += 4;
        }
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_bfloat16(p0[0]);
            pp[1] = float32_to_bfloat16(p1[0]);
            pp[2] = float32_to_bfloat16(p2[0]);
            pp[3] = float32_to_bfloat16(p3[0]);
            pp += 4;
            p0++;
            p1++;
            p2++;
            p3++;
        }
    }
#endif // __ARM_NEON
    // 2 rows at a time
    for (; ii + 1 < max_ii; ii += 2)
    {
        const float* p0 = (const float*)A + (i + ii) * A_hstep + k;
        const float* p1 = (const float*)A + (i + ii + 1) * A_hstep + k;

        int kk = 0;
#if __ARM_NEON
        // vst2q_u16 / vst2_u16 interleave the two rows element by element
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8x2_t _r01;
            _r01.val[0] = vcombine_u16(float2bfloat(vld1q_f32(p0)), float2bfloat(vld1q_f32(p0 + 4)));
            _r01.val[1] = vcombine_u16(float2bfloat(vld1q_f32(p1)), float2bfloat(vld1q_f32(p1 + 4)));
            vst2q_u16(pp, _r01);
            pp += 16;
            p0 += 8;
            p1 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4x2_t _r01;
            _r01.val[0] = float2bfloat(vld1q_f32(p0));
            _r01.val[1] = float2bfloat(vld1q_f32(p1));
            vst2_u16(pp, _r01);
            pp += 8;
            p0 += 4;
            p1 += 4;
        }
#endif // __ARM_NEON
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_bfloat16(p0[0]);
            pp[1] = float32_to_bfloat16(p1[0]);
            pp += 2;
            p0++;
            p1++;
        }
    }
    // leftover single rows: plain contiguous conversion
    for (; ii < max_ii; ii += 1)
    {
        const float* p0 = (const float*)A + (i + ii) * A_hstep + k;

        int kk = 0;
#if __ARM_NEON
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8_t _r0 = vcombine_u16(float2bfloat(vld1q_f32(p0)), float2bfloat(vld1q_f32(p0 + 4)));
            vst1q_u16(pp, _r0);
            pp += 8;
            p0 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4_t _r0 = float2bfloat(vld1q_f32(p0));
            vst1_u16(pp, _r0);
            pp += 4;
            p0 += 4;
        }
#endif // __ARM_NEON
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_bfloat16(p0[0]);
            pp += 1;
            p0++;
        }
    }
}

// Pack a tile of a transposed fp32 A into bfloat16, writing sequentially into AT.
//
// A is read with k as the row index and i as the column index: the tile spans rows
// [k, k + max_kk) and columns [i, i + max_ii), rows being A_hstep floats apart.
// Columns are taken in groups of 8 (aarch64 only), 4 (NEON only), 2 and 1; for each
// group, every k row contributes the group's contiguous values in turn.
static void transpose_pack_A_tile_fp32_to_bf16(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;

    unsigned short* dst = AT;

    int col = 0;
#if __ARM_NEON
#if __aarch64__
    // groups of 8 columns
    while (col + 8 <= max_ii)
    {
        const float* src = (const float*)A + k * A_hstep + (i + col);

        for (int r = 0; r < max_kk; r++)
        {
            const uint16x4_t _lo = float2bfloat(vld1q_f32(src));
            const uint16x4_t _hi = float2bfloat(vld1q_f32(src + 4));
            vst1q_u16(dst, vcombine_u16(_lo, _hi));
            dst += 8;
            src += A_hstep;
        }

        col += 8;
    }
#endif // __aarch64__
    // groups of 4 columns
    while (col + 4 <= max_ii)
    {
        const float* src = (const float*)A + k * A_hstep + (i + col);

        for (int r = 0; r < max_kk; r++)
        {
            vst1_u16(dst, float2bfloat(vld1q_f32(src)));
            dst += 4;
            src += A_hstep;
        }

        col += 4;
    }
#endif // __ARM_NEON
    // groups of 2 columns
    while (col + 2 <= max_ii)
    {
        const float* src = (const float*)A + k * A_hstep + (i + col);

        for (int r = 0; r < max_kk; r++)
        {
            dst[0] = float32_to_bfloat16(src[0]);
            dst[1] = float32_to_bfloat16(src[1]);
            dst += 2;
            src += A_hstep;
        }

        col += 2;
    }
    // remaining single columns
    while (col < max_ii)
    {
        const float* src = (const float*)A + k * A_hstep + (i + col);

        for (int r = 0; r < max_kk; r++)
        {
            *dst++ = float32_to_bfloat16(src[0]);
            src += A_hstep;
        }

        col += 1;
    }
}

// Pack a tile of the fp32 matrix B into bfloat16 and write it sequentially into BT.
//
// B is read with j as the row index and k as the column index: the tile covers rows
// [j, j + max_jj) and columns [k, k + max_kk), rows being B_hstep floats apart
// (cstep for 3-D mats, w otherwise).
//
// Rows are consumed in groups of 12 (aarch64 only), 8 and 4 (NEON only), 2 and 1.
// Within a group the output is k-major: for each k the values of all rows in the
// group are adjacent, as the scalar tail loops show. The vector paths reach the same
// order through transpose4x4_u16 / transpose8x8_u16 (defined elsewhere) plus the
// store order below, or through interleaving stores (vst4/vst2).
static void pack_B_tile_fp32_to_bf16(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk)
{
    const size_t B_hstep = B.dims == 3 ? B.cstep : (size_t)B.w;

    // running output cursor, shared by all row groups
    unsigned short* pp = BT;

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    // 12 rows at a time
    for (; jj + 11 < max_jj; jj += 12)
    {
        const float* p0 = (const float*)B + (j + jj) * B_hstep + k;
        const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k;
        const float* p2 = (const float*)B + (j + jj + 2) * B_hstep + k;
        const float* p3 = (const float*)B + (j + jj + 3) * B_hstep + k;
        const float* p4 = (const float*)B + (j + jj + 4) * B_hstep + k;
        const float* p5 = (const float*)B + (j + jj + 5) * B_hstep + k;
        const float* p6 = (const float*)B + (j + jj + 6) * B_hstep + k;
        const float* p7 = (const float*)B + (j + jj + 7) * B_hstep + k;
        const float* p8 = (const float*)B + (j + jj + 8) * B_hstep + k;
        const float* p9 = (const float*)B + (j + jj + 9) * B_hstep + k;
        const float* pa = (const float*)B + (j + jj + 10) * B_hstep + k;
        const float* pb = (const float*)B + (j + jj + 11) * B_hstep + k;

        int kk = 0;
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4_t _r0 = float2bfloat(vld1q_f32(p0));
            uint16x4_t _r1 = float2bfloat(vld1q_f32(p1));
            uint16x4_t _r2 = float2bfloat(vld1q_f32(p2));
            uint16x4_t _r3 = float2bfloat(vld1q_f32(p3));
            uint16x4_t _r4 = float2bfloat(vld1q_f32(p4));
            uint16x4_t _r5 = float2bfloat(vld1q_f32(p5));
            uint16x4_t _r6 = float2bfloat(vld1q_f32(p6));
            uint16x4_t _r7 = float2bfloat(vld1q_f32(p7));
            uint16x4_t _r8 = float2bfloat(vld1q_f32(p8));
            uint16x4_t _r9 = float2bfloat(vld1q_f32(p9));
            uint16x4_t _ra = float2bfloat(vld1q_f32(pa));
            uint16x4_t _rb = float2bfloat(vld1q_f32(pb));

            // three independent 4x4 transposes: rows 0-3, 4-7, 8-11
            transpose4x4_u16(_r0, _r1, _r2, _r3);
            transpose4x4_u16(_r4, _r5, _r6, _r7);
            transpose4x4_u16(_r8, _r9, _ra, _rb);

            // stores alternate between the three transposed blocks so that each run
            // of 12 outputs takes one vector from each block
            vst1_u16(pp, _r0);
            vst1_u16(pp + 4, _r4);
            vst1_u16(pp + 4 * 2, _r8);
            vst1_u16(pp + 4 * 3, _r1);
            vst1_u16(pp + 4 * 4, _r5);
            vst1_u16(pp + 4 * 5, _r9);
            vst1_u16(pp + 4 * 6, _r2);
            vst1_u16(pp + 4 * 7, _r6);
            vst1_u16(pp + 4 * 8, _ra);
            vst1_u16(pp + 4 * 9, _r3);
            vst1_u16(pp + 4 * 10, _r7);
            vst1_u16(pp + 4 * 11, _rb);
            pp += 48;
            p0 += 4;
            p1 += 4;
            p2 += 4;
            p3 += 4;
            p4 += 4;
            p5 += 4;
            p6 += 4;
            p7 += 4;
            p8 += 4;
            p9 += 4;
            pa += 4;
            pb += 4;
        }
        // remaining k: one column of 12 rows per iteration
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_bfloat16(p0[0]);
            pp[1] = float32_to_bfloat16(p1[0]);
            pp[2] = float32_to_bfloat16(p2[0]);
            pp[3] = float32_to_bfloat16(p3[0]);
            pp[4] = float32_to_bfloat16(p4[0]);
            pp[5] = float32_to_bfloat16(p5[0]);
            pp[6] = float32_to_bfloat16(p6[0]);
            pp[7] = float32_to_bfloat16(p7[0]);
            pp[8] = float32_to_bfloat16(p8[0]);
            pp[9] = float32_to_bfloat16(p9[0]);
            pp[10] = float32_to_bfloat16(pa[0]);
            pp[11] = float32_to_bfloat16(pb[0]);
            pp += 12;
            p0++;
            p1++;
            p2++;
            p3++;
            p4++;
            p5++;
            p6++;
            p7++;
            p8++;
            p9++;
            pa++;
            pb++;
        }
    }
#endif // __aarch64__
    // 8 rows at a time
    for (; jj + 7 < max_jj; jj += 8)
    {
        const float* p0 = (const float*)B + (j + jj) * B_hstep + k;
        const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k;
        const float* p2 = (const float*)B + (j + jj + 2) * B_hstep + k;
        const float* p3 = (const float*)B + (j + jj + 3) * B_hstep + k;
        const float* p4 = (const float*)B + (j + jj + 4) * B_hstep + k;
        const float* p5 = (const float*)B + (j + jj + 5) * B_hstep + k;
        const float* p6 = (const float*)B + (j + jj + 6) * B_hstep + k;
        const float* p7 = (const float*)B + (j + jj + 7) * B_hstep + k;

        int kk = 0;
        // 8x8 block: convert 8 values per row, transpose, store 64 values
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8_t _r0 = vcombine_u16(float2bfloat(vld1q_f32(p0)), float2bfloat(vld1q_f32(p0 + 4)));
            uint16x8_t _r1 = vcombine_u16(float2bfloat(vld1q_f32(p1)), float2bfloat(vld1q_f32(p1 + 4)));
            uint16x8_t _r2 = vcombine_u16(float2bfloat(vld1q_f32(p2)), float2bfloat(vld1q_f32(p2 + 4)));
            uint16x8_t _r3 = vcombine_u16(float2bfloat(vld1q_f32(p3)), float2bfloat(vld1q_f32(p3 + 4)));
            uint16x8_t _r4 = vcombine_u16(float2bfloat(vld1q_f32(p4)), float2bfloat(vld1q_f32(p4 + 4)));
            uint16x8_t _r5 = vcombine_u16(float2bfloat(vld1q_f32(p5)), float2bfloat(vld1q_f32(p5 + 4)));
            uint16x8_t _r6 = vcombine_u16(float2bfloat(vld1q_f32(p6)), float2bfloat(vld1q_f32(p6 + 4)));
            uint16x8_t _r7 = vcombine_u16(float2bfloat(vld1q_f32(p7)), float2bfloat(vld1q_f32(p7 + 4)));
            transpose8x8_u16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7);
            vst1q_u16(pp, _r0);
            vst1q_u16(pp + 8, _r1);
            vst1q_u16(pp + 8 * 2, _r2);
            vst1q_u16(pp + 8 * 3, _r3);
            vst1q_u16(pp + 8 * 4, _r4);
            vst1q_u16(pp + 8 * 5, _r5);
            vst1q_u16(pp + 8 * 6, _r6);
            vst1q_u16(pp + 8 * 7, _r7);
            pp += 64;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
            p4 += 8;
            p5 += 8;
            p6 += 8;
            p7 += 8;
        }
        // 8x4 block: two 4x4 transposes, stores alternate between the two halves
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4_t _r0 = float2bfloat(vld1q_f32(p0));
            uint16x4_t _r1 = float2bfloat(vld1q_f32(p1));
            uint16x4_t _r2 = float2bfloat(vld1q_f32(p2));
            uint16x4_t _r3 = float2bfloat(vld1q_f32(p3));
            uint16x4_t _r4 = float2bfloat(vld1q_f32(p4));
            uint16x4_t _r5 = float2bfloat(vld1q_f32(p5));
            uint16x4_t _r6 = float2bfloat(vld1q_f32(p6));
            uint16x4_t _r7 = float2bfloat(vld1q_f32(p7));

            transpose4x4_u16(_r0, _r1, _r2, _r3);
            transpose4x4_u16(_r4, _r5, _r6, _r7);

            vst1_u16(pp, _r0);
            vst1_u16(pp + 4, _r4);
            vst1_u16(pp + 4 * 2, _r1);
            vst1_u16(pp + 4 * 3, _r5);
            vst1_u16(pp + 4 * 4, _r2);
            vst1_u16(pp + 4 * 5, _r6);
            vst1_u16(pp + 4 * 6, _r3);
            vst1_u16(pp + 4 * 7, _r7);
            pp += 32;
            p0 += 4;
            p1 += 4;
            p2 += 4;
            p3 += 4;
            p4 += 4;
            p5 += 4;
            p6 += 4;
            p7 += 4;
        }
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_bfloat16(p0[0]);
            pp[1] = float32_to_bfloat16(p1[0]);
            pp[2] = float32_to_bfloat16(p2[0]);
            pp[3] = float32_to_bfloat16(p3[0]);
            pp[4] = float32_to_bfloat16(p4[0]);
            pp[5] = float32_to_bfloat16(p5[0]);
            pp[6] = float32_to_bfloat16(p6[0]);
            pp[7] = float32_to_bfloat16(p7[0]);
            pp += 8;
            p0++;
            p1++;
            p2++;
            p3++;
            p4++;
            p5++;
            p6++;
            p7++;
        }
    }
    // 4 rows at a time
    for (; jj + 3 < max_jj; jj += 4)
    {
        const float* p0 = (const float*)B + (j + jj) * B_hstep + k;
        const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k;
        const float* p2 = (const float*)B + (j + jj + 2) * B_hstep + k;
        const float* p3 = (const float*)B + (j + jj + 3) * B_hstep + k;

        int kk = 0;
        // vst4q_u16 / vst4_u16 interleave the four rows element by element
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8x4_t _r0123;
            _r0123.val[0] = vcombine_u16(float2bfloat(vld1q_f32(p0)), float2bfloat(vld1q_f32(p0 + 4)));
            _r0123.val[1] = vcombine_u16(float2bfloat(vld1q_f32(p1)), float2bfloat(vld1q_f32(p1 + 4)));
            _r0123.val[2] = vcombine_u16(float2bfloat(vld1q_f32(p2)), float2bfloat(vld1q_f32(p2 + 4)));
            _r0123.val[3] = vcombine_u16(float2bfloat(vld1q_f32(p3)), float2bfloat(vld1q_f32(p3 + 4)));
            vst4q_u16(pp, _r0123);
            pp += 32;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4x4_t _r0123;
            _r0123.val[0] = float2bfloat(vld1q_f32(p0));
            _r0123.val[1] = float2bfloat(vld1q_f32(p1));
            _r0123.val[2] = float2bfloat(vld1q_f32(p2));
            _r0123.val[3] = float2bfloat(vld1q_f32(p3));
            vst4_u16(pp, _r0123);
            pp += 16;
            p0 += 4;
            p1 += 4;
            p2 += 4;
            p3 += 4;
        }
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_bfloat16(p0[0]);
            pp[1] = float32_to_bfloat16(p1[0]);
            pp[2] = float32_to_bfloat16(p2[0]);
            pp[3] = float32_to_bfloat16(p3[0]);
            pp += 4;
            p0++;
            p1++;
            p2++;
            p3++;
        }
    }
#endif // __ARM_NEON
    // 2 rows at a time
    for (; jj + 1 < max_jj; jj += 2)
    {
        const float* p0 = (const float*)B + (j + jj) * B_hstep + k;
        const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k;

        int kk = 0;
#if __ARM_NEON
        // vst2q_u16 / vst2_u16 interleave the two rows element by element
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8x2_t _r01;
            _r01.val[0] = vcombine_u16(float2bfloat(vld1q_f32(p0)), float2bfloat(vld1q_f32(p0 + 4)));
            _r01.val[1] = vcombine_u16(float2bfloat(vld1q_f32(p1)), float2bfloat(vld1q_f32(p1 + 4)));
            vst2q_u16(pp, _r01);
            pp += 16;
            p0 += 8;
            p1 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4x2_t _r01;
            _r01.val[0] = float2bfloat(vld1q_f32(p0));
            _r01.val[1] = float2bfloat(vld1q_f32(p1));
            vst2_u16(pp, _r01);
            pp += 8;
            p0 += 4;
            p1 += 4;
        }
#endif // __ARM_NEON
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_bfloat16(p0[0]);
            pp[1] = float32_to_bfloat16(p1[0]);
            pp += 2;
            p0++;
            p1++;
        }
    }
    // leftover single rows: plain contiguous conversion
    for (; jj < max_jj; jj += 1)
    {
        const float* p0 = (const float*)B + (j + jj) * B_hstep + k;

        int kk = 0;
#if __ARM_NEON
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8_t _r0 = vcombine_u16(float2bfloat(vld1q_f32(p0)), float2bfloat(vld1q_f32(p0 + 4)));
            vst1q_u16(pp, _r0);
            pp += 8;
            p0 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4_t _r0 = float2bfloat(vld1q_f32(p0));
            vst1_u16(pp, _r0);
            pp += 4;
            p0 += 4;
        }
#endif // __ARM_NEON
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_bfloat16(p0[0]);
            pp += 1;
            p0++;
        }
    }
}

// Pack a tile of a transposed fp32 B into bfloat16, writing sequentially into BT.
//
// B is read with k as the row index and j as the column index: the tile spans rows
// [k, k + max_kk) and columns [j, j + max_jj), rows being B_hstep floats apart.
// Columns are taken in groups of 12 (aarch64 only), 8 and 4 (NEON only), 2 and 1;
// for each group, every k row contributes the group's contiguous values in turn.
static void transpose_pack_B_tile_fp32_to_bf16(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk)
{
    const size_t B_hstep = B.dims == 3 ? B.cstep : (size_t)B.w;

    unsigned short* dst = BT;

    int col = 0;
#if __ARM_NEON
#if __aarch64__
    // groups of 12 columns, converted as three 4-lane vectors
    while (col + 12 <= max_jj)
    {
        const float* src = (const float*)B + k * B_hstep + (j + col);

        for (int r = 0; r < max_kk; r++)
        {
            for (int q = 0; q < 12; q += 4)
            {
                vst1_u16(dst + q, float2bfloat(vld1q_f32(src + q)));
            }
            dst += 12;
            src += B_hstep;
        }

        col += 12;
    }
#endif // __aarch64__
    // groups of 8 columns
    while (col + 8 <= max_jj)
    {
        const float* src = (const float*)B + k * B_hstep + (j + col);

        for (int r = 0; r < max_kk; r++)
        {
            const uint16x4_t _lo = float2bfloat(vld1q_f32(src));
            const uint16x4_t _hi = float2bfloat(vld1q_f32(src + 4));
            vst1q_u16(dst, vcombine_u16(_lo, _hi));
            dst += 8;
            src += B_hstep;
        }

        col += 8;
    }
    // groups of 4 columns
    while (col + 4 <= max_jj)
    {
        const float* src = (const float*)B + k * B_hstep + (j + col);

        for (int r = 0; r < max_kk; r++)
        {
            vst1_u16(dst, float2bfloat(vld1q_f32(src)));
            dst += 4;
            src += B_hstep;
        }

        col += 4;
    }
#endif // __ARM_NEON
    // groups of 2 columns
    while (col + 2 <= max_jj)
    {
        const float* src = (const float*)B + k * B_hstep + (j + col);

        for (int r = 0; r < max_kk; r++)
        {
            dst[0] = float32_to_bfloat16(src[0]);
            dst[1] = float32_to_bfloat16(src[1]);
            dst += 2;
            src += B_hstep;
        }

        col += 2;
    }
    // remaining single columns
    while (col < max_jj)
    {
        const float* src = (const float*)B + k * B_hstep + (j + col);

        for (int r = 0; r < max_kk; r++)
        {
            *dst++ = float32_to_bfloat16(src[0]);
            src += B_hstep;
        }

        col += 1;
    }
}

// Convert the fp32 accumulator tile topT to bfloat16 and scatter it into
// top_blob in transposed order.
//
// topT is read strictly sequentially. It is consumed in blocks of 8 (aarch64
// only), 4 (NEON only), 2 and 1 values along ii; inside a block, each jj step
// supplies that many consecutive floats. The destination for tile index
// (ii, jj) is row j + jj, column i + ii of top_blob.
//
// Two output layouts are handled:
//   out_elempack == 1: one bf16 per element, rows out_hstep apart.
//   out_elempack == 4: four consecutive jj values are packed together for
//                      each column (NEON builds only).
// For any other out_elempack nothing is written and topT is not advanced.
//
// NOTE(review): the elempack-4 loops only process complete groups of four jj
// values (jj + 3 < max_jj); presumably max_jj is a multiple of 4 in that
// layout - confirm against the caller.
static void transpose_unpack_output_tile_fp32_to_bf16(const Mat& topT, Mat& top_blob, int i, int max_ii, int j, int max_jj)
{
    const int out_elempack = top_blob.elempack;
    const size_t out_hstep = top_blob.dims == 3 ? top_blob.cstep : (size_t)top_blob.w;

    // element offset of output row j
    const size_t j_offset = j * out_hstep;

    const float* src = topT;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    while (ii + 7 < max_ii)
    {
        if (out_elempack == 4)
        {
            unsigned short* dst = (unsigned short*)top_blob + j_offset + (i + ii) * 4;

            for (int jj = 0; jj + 3 < max_jj; jj += 4)
            {
                // one register per jj, each holding the 8 ii values of that jj
                uint16x8x4_t _cols;
                for (int q = 0; q < 4; q++)
                {
                    const uint16x4_t _lo = float2bfloat(vld1q_f32(src + q * 8));
                    const uint16x4_t _hi = float2bfloat(vld1q_f32(src + q * 8 + 4));
                    _cols.val[q] = vcombine_u16(_lo, _hi);
                }
                // interleaving store places the 4 jj values of every ii next to each other
                vst4q_u16(dst, _cols);
                src += 32;
                dst += out_hstep * 4;
            }
        }
        if (out_elempack == 1)
        {
            unsigned short* dst = (unsigned short*)top_blob + j_offset + (i + ii);

            for (int jj = 0; jj < max_jj; jj += 1)
            {
                const uint16x4_t _lo = float2bfloat(vld1q_f32(src));
                const uint16x4_t _hi = float2bfloat(vld1q_f32(src + 4));
                vst1q_u16(dst, vcombine_u16(_lo, _hi));
                src += 8;
                dst += out_hstep;
            }
        }

        ii += 8;
    }
#endif // __aarch64__
    while (ii + 3 < max_ii)
    {
        if (out_elempack == 4)
        {
            unsigned short* dst = (unsigned short*)top_blob + j_offset + (i + ii) * 4;

            for (int jj = 0; jj + 3 < max_jj; jj += 4)
            {
                // one register per jj, each holding the 4 ii values of that jj
                uint16x4x4_t _cols;
                for (int q = 0; q < 4; q++)
                {
                    _cols.val[q] = float2bfloat(vld1q_f32(src + q * 4));
                }
                // interleaving store performs the 4x4 transpose
                vst4_u16(dst, _cols);
                src += 16;
                dst += out_hstep * 4;
            }
        }
        if (out_elempack == 1)
        {
            unsigned short* dst = (unsigned short*)top_blob + j_offset + (i + ii);

            for (int jj = 0; jj < max_jj; jj += 1)
            {
                vst1_u16(dst, float2bfloat(vld1q_f32(src)));
                src += 4;
                dst += out_hstep;
            }
        }

        ii += 4;
    }
#endif // __ARM_NEON
    while (ii + 1 < max_ii)
    {
#if __ARM_NEON
        if (out_elempack == 4)
        {
            unsigned short* dst = (unsigned short*)top_blob + j_offset + (i + ii) * 4;

            for (int jj = 0; jj + 3 < max_jj; jj += 4)
            {
                // source holds (ii0, ii1) pairs for 4 jj values;
                // destination wants the 4 jj values of ii0, then those of ii1
                for (int r = 0; r < 2; r++)
                {
                    for (int c = 0; c < 4; c++)
                    {
                        dst[r * 4 + c] = float32_to_bfloat16(src[c * 2 + r]);
                    }
                }
                src += 8;
                dst += out_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (out_elempack == 1)
        {
            unsigned short* dst = (unsigned short*)top_blob + j_offset + (i + ii);

            for (int jj = 0; jj < max_jj; jj += 1)
            {
                dst[0] = float32_to_bfloat16(src[0]);
                dst[1] = float32_to_bfloat16(src[1]);
                src += 2;
                dst += out_hstep;
            }
        }

        ii += 2;
    }
    while (ii < max_ii)
    {
#if __ARM_NEON
        if (out_elempack == 4)
        {
            unsigned short* dst = (unsigned short*)top_blob + j_offset + (i + ii) * 4;

            for (int jj = 0; jj + 3 < max_jj; jj += 4)
            {
                // a single ii: the 4 jj values are already in packed order
                vst1_u16(dst, float2bfloat(vld1q_f32(src)));
                src += 4;
                dst += out_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (out_elempack == 1)
        {
            unsigned short* dst = (unsigned short*)top_blob + j_offset + (i + ii);

            for (int jj = 0; jj < max_jj; jj += 1)
            {
                dst[0] = float32_to_bfloat16(src[0]);
                src += 1;
                dst += out_hstep;
            }
        }

        ii += 1;
    }
}

static void gemm_transB_packed_tile_bf16s(const Mat& AT_tile, const Mat& BT_tile, const Mat& CT_tile, Mat& topT_tile, Mat& top_blob, int broadcast_type_C, float alpha, int i, int max_ii, int j, int max_jj, int k, int max_kk, bool k_end)
{
    const int out_elempack = top_blob.elempack;
    const size_t out_hstep = top_blob.dims == 3 ? top_blob.cstep : (size_t)top_blob.w;

    const unsigned short* pAT = AT_tile;
    const unsigned short* pBT = BT_tile;
    const float* pC = CT_tile;

    float* outptr = topT_tile;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    for (; ii + 7 < max_ii; ii += 8)
    {
        unsigned short* outptr0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        const unsigned short* pB = pBT;

        if (pC)
        {
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)CT_tile + i + ii;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)CT_tile + j;
            }
        }

        int jj = 0;
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum20;
            float32x4_t _sum21;
            float32x4_t _sum30;
            float32x4_t _sum31;
            float32x4_t _sum40;
            float32x4_t _sum41;
            float32x4_t _sum50;
            float32x4_t _sum51;
            float32x4_t _sum60;
            float32x4_t _sum61;
            float32x4_t _sum70;
            float32x4_t _sum71;
            float32x4_t _sum80;
            float32x4_t _sum81;
            float32x4_t _sum90;
            float32x4_t _sum91;
            float32x4_t _suma0;
            float32x4_t _suma1;
            float32x4_t _sumb0;
            float32x4_t _sumb1;

            if (k == 0)
            {
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);
                _sum20 = vdupq_n_f32(0.f);
                _sum21 = vdupq_n_f32(0.f);
                _sum30 = vdupq_n_f32(0.f);
                _sum31 = vdupq_n_f32(0.f);
                _sum40 = vdupq_n_f32(0.f);
                _sum41 = vdupq_n_f32(0.f);
                _sum50 = vdupq_n_f32(0.f);
                _sum51 = vdupq_n_f32(0.f);
                _sum60 = vdupq_n_f32(0.f);
                _sum61 = vdupq_n_f32(0.f);
                _sum70 = vdupq_n_f32(0.f);
                _sum71 = vdupq_n_f32(0.f);
                _sum80 = vdupq_n_f32(0.f);
                _sum81 = vdupq_n_f32(0.f);
                _sum90 = vdupq_n_f32(0.f);
                _sum91 = vdupq_n_f32(0.f);
                _suma0 = vdupq_n_f32(0.f);
                _suma1 = vdupq_n_f32(0.f);
                _sumb0 = vdupq_n_f32(0.f);
                _sumb1 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        _sum10 = _sum00;
                        _sum11 = _sum00;
                        _sum20 = _sum00;
                        _sum21 = _sum00;
                        _sum30 = _sum00;
                        _sum31 = _sum00;
                        _sum40 = _sum00;
                        _sum41 = _sum00;
                        _sum50 = _sum00;
                        _sum51 = _sum00;
                        _sum60 = _sum00;
                        _sum61 = _sum00;
                        _sum70 = _sum00;
                        _sum71 = _sum00;
                        _sum80 = _sum00;
                        _sum81 = _sum00;
                        _sum90 = _sum00;
                        _sum91 = _sum00;
                        _suma0 = _sum00;
                        _suma1 = _sum00;
                        _sumb0 = _sum00;
                        _sumb1 = _sum00;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        _sum20 = _sum00;
                        _sum21 = _sum01;
                        _sum30 = _sum00;
                        _sum31 = _sum01;
                        _sum40 = _sum00;
                        _sum41 = _sum01;
                        _sum50 = _sum00;
                        _sum51 = _sum01;
                        _sum60 = _sum00;
                        _sum61 = _sum01;
                        _sum70 = _sum00;
                        _sum71 = _sum01;
                        _sum80 = _sum00;
                        _sum81 = _sum01;
                        _sum90 = _sum00;
                        _sum91 = _sum01;
                        _suma0 = _sum00;
                        _suma1 = _sum01;
                        _sumb0 = _sum00;
                        _sumb1 = _sum01;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4 * 1);
                        _sum10 = vld1q_f32(pC + 4 * 2);
                        _sum11 = vld1q_f32(pC + 4 * 3);
                        _sum20 = vld1q_f32(pC + 4 * 4);
                        _sum21 = vld1q_f32(pC + 4 * 5);
                        _sum30 = vld1q_f32(pC + 4 * 6);
                        _sum31 = vld1q_f32(pC + 4 * 7);
                        _sum40 = vld1q_f32(pC + 4 * 8);
                        _sum41 = vld1q_f32(pC + 4 * 9);
                        _sum50 = vld1q_f32(pC + 4 * 10);
                        _sum51 = vld1q_f32(pC + 4 * 11);
                        _sum60 = vld1q_f32(pC + 4 * 12);
                        _sum61 = vld1q_f32(pC + 4 * 13);
                        _sum70 = vld1q_f32(pC + 4 * 14);
                        _sum71 = vld1q_f32(pC + 4 * 15);
                        _sum80 = vld1q_f32(pC + 4 * 16);
                        _sum81 = vld1q_f32(pC + 4 * 17);
                        _sum90 = vld1q_f32(pC + 4 * 18);
                        _sum91 = vld1q_f32(pC + 4 * 19);
                        _suma0 = vld1q_f32(pC + 4 * 20);
                        _suma1 = vld1q_f32(pC + 4 * 21);
                        _sumb0 = vld1q_f32(pC + 4 * 22);
                        _sumb1 = vld1q_f32(pC + 4 * 23);
                        pC += 96;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum20 = vdupq_n_f32(pC[2]);
                        _sum30 = vdupq_n_f32(pC[3]);
                        _sum40 = vdupq_n_f32(pC[4]);
                        _sum50 = vdupq_n_f32(pC[5]);
                        _sum60 = vdupq_n_f32(pC[6]);
                        _sum70 = vdupq_n_f32(pC[7]);
                        _sum80 = vdupq_n_f32(pC[8]);
                        _sum90 = vdupq_n_f32(pC[9]);
                        _suma0 = vdupq_n_f32(pC[10]);
                        _sumb0 = vdupq_n_f32(pC[11]);
                        _sum01 = _sum00;
                        _sum11 = _sum10;
                        _sum21 = _sum20;
                        _sum31 = _sum30;
                        _sum41 = _sum40;
                        _sum51 = _sum50;
                        _sum61 = _sum60;
                        _sum71 = _sum70;
                        _sum81 = _sum80;
                        _sum91 = _sum90;
                        _suma1 = _suma0;
                        _sumb1 = _sumb0;
                        pC += 12;
                    }
                }
            }
            else
            {
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
                _sum20 = vld1q_f32(outptr + 4 * 4);
                _sum21 = vld1q_f32(outptr + 4 * 5);
                _sum30 = vld1q_f32(outptr + 4 * 6);
                _sum31 = vld1q_f32(outptr + 4 * 7);
                _sum40 = vld1q_f32(outptr + 4 * 8);
                _sum41 = vld1q_f32(outptr + 4 * 9);
                _sum50 = vld1q_f32(outptr + 4 * 10);
                _sum51 = vld1q_f32(outptr + 4 * 11);
                _sum60 = vld1q_f32(outptr + 4 * 12);
                _sum61 = vld1q_f32(outptr + 4 * 13);
                _sum70 = vld1q_f32(outptr + 4 * 14);
                _sum71 = vld1q_f32(outptr + 4 * 15);
                _sum80 = vld1q_f32(outptr + 4 * 16);
                _sum81 = vld1q_f32(outptr + 4 * 17);
                _sum90 = vld1q_f32(outptr + 4 * 18);
                _sum91 = vld1q_f32(outptr + 4 * 19);
                _suma0 = vld1q_f32(outptr + 4 * 20);
                _suma1 = vld1q_f32(outptr + 4 * 21);
                _sumb0 = vld1q_f32(outptr + 4 * 22);
                _sumb1 = vld1q_f32(outptr + 4 * 23);
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                uint16x8_t _pA = vld1q_u16(pA);
                float32x4_t _pA0 = bfloat2float(vget_low_u16(_pA));
                float32x4_t _pA1 = bfloat2float(vget_high_u16(_pA));

                float32x4_t _pB0 = bfloat2float(vld1_u16(pB));
                float32x4_t _pB1 = bfloat2float(vld1_u16(pB + 4));
                float32x4_t _pB2 = bfloat2float(vld1_u16(pB + 8));

                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
                _sum40 = vfmaq_laneq_f32(_sum40, _pA0, _pB1, 0);
                _sum41 = vfmaq_laneq_f32(_sum41, _pA1, _pB1, 0);
                _sum50 = vfmaq_laneq_f32(_sum50, _pA0, _pB1, 1);
                _sum51 = vfmaq_laneq_f32(_sum51, _pA1, _pB1, 1);
                _sum60 = vfmaq_laneq_f32(_sum60, _pA0, _pB1, 2);
                _sum61 = vfmaq_laneq_f32(_sum61, _pA1, _pB1, 2);
                _sum70 = vfmaq_laneq_f32(_sum70, _pA0, _pB1, 3);
                _sum71 = vfmaq_laneq_f32(_sum71, _pA1, _pB1, 3);
                _sum80 = vfmaq_laneq_f32(_sum80, _pA0, _pB2, 0);
                _sum81 = vfmaq_laneq_f32(_sum81, _pA1, _pB2, 0);
                _sum90 = vfmaq_laneq_f32(_sum90, _pA0, _pB2, 1);
                _sum91 = vfmaq_laneq_f32(_sum91, _pA1, _pB2, 1);
                _suma0 = vfmaq_laneq_f32(_suma0, _pA0, _pB2, 2);
                _suma1 = vfmaq_laneq_f32(_suma1, _pA1, _pB2, 2);
                _sumb0 = vfmaq_laneq_f32(_sumb0, _pA0, _pB2, 3);
                _sumb1 = vfmaq_laneq_f32(_sumb1, _pA1, _pB2, 3);

                pA += 8;
                pB += 12;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum00 = vmulq_f32(_sum00, _alpha);
                _sum01 = vmulq_f32(_sum01, _alpha);
                _sum10 = vmulq_f32(_sum10, _alpha);
                _sum11 = vmulq_f32(_sum11, _alpha);
                _sum20 = vmulq_f32(_sum20, _alpha);
                _sum21 = vmulq_f32(_sum21, _alpha);
                _sum30 = vmulq_f32(_sum30, _alpha);
                _sum31 = vmulq_f32(_sum31, _alpha);
                _sum40 = vmulq_f32(_sum40, _alpha);
                _sum41 = vmulq_f32(_sum41, _alpha);
                _sum50 = vmulq_f32(_sum50, _alpha);
                _sum51 = vmulq_f32(_sum51, _alpha);
                _sum60 = vmulq_f32(_sum60, _alpha);
                _sum61 = vmulq_f32(_sum61, _alpha);
                _sum70 = vmulq_f32(_sum70, _alpha);
                _sum71 = vmulq_f32(_sum71, _alpha);
                _sum80 = vmulq_f32(_sum80, _alpha);
                _sum81 = vmulq_f32(_sum81, _alpha);
                _sum90 = vmulq_f32(_sum90, _alpha);
                _sum91 = vmulq_f32(_sum91, _alpha);
                _suma0 = vmulq_f32(_suma0, _alpha);
                _suma1 = vmulq_f32(_suma1, _alpha);
                _sumb0 = vmulq_f32(_sumb0, _alpha);
                _sumb1 = vmulq_f32(_sumb1, _alpha);
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum10));
                    vst1_u16(outptr0 + 4 * 2, float2bfloat(_sum20));
                    vst1_u16(outptr0 + 4 * 3, float2bfloat(_sum30));
                    vst1_u16(outptr0 + 4 * 4, float2bfloat(_sum40));
                    vst1_u16(outptr0 + 4 * 5, float2bfloat(_sum50));
                    vst1_u16(outptr0 + 4 * 6, float2bfloat(_sum60));
                    vst1_u16(outptr0 + 4 * 7, float2bfloat(_sum70));
                    vst1_u16(outptr0 + 4 * 8, float2bfloat(_sum80));
                    vst1_u16(outptr0 + 4 * 9, float2bfloat(_sum90));
                    vst1_u16(outptr0 + 4 * 10, float2bfloat(_suma0));
                    vst1_u16(outptr0 + 4 * 11, float2bfloat(_sumb0));

                    vst1_u16(outptr0 + out_hstep * 4, float2bfloat(_sum01));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, float2bfloat(_sum11));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 2, float2bfloat(_sum21));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 3, float2bfloat(_sum31));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 4, float2bfloat(_sum41));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 5, float2bfloat(_sum51));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 6, float2bfloat(_sum61));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 7, float2bfloat(_sum71));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 8, float2bfloat(_sum81));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 9, float2bfloat(_sum91));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 10, float2bfloat(_suma1));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 11, float2bfloat(_sumb1));

                    outptr0 += 48;
                }
                if (out_elempack == 1)
                {
                    transpose8x12_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31, _sum40, _sum41, _sum50, _sum51, _sum60, _sum61, _sum70, _sum71, _sum80, _sum81, _sum90, _sum91, _suma0, _suma1, _sumb0, _sumb1);

                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum01));
                    vst1_u16(outptr0 + 8, float2bfloat(_sum10));
                    vst1_u16(outptr0 + out_hstep, float2bfloat(_sum11));
                    vst1_u16(outptr0 + out_hstep + 4, float2bfloat(_sum20));
                    vst1_u16(outptr0 + out_hstep + 8, float2bfloat(_sum21));
                    vst1_u16(outptr0 + out_hstep * 2, float2bfloat(_sum30));
                    vst1_u16(outptr0 + out_hstep * 2 + 4, float2bfloat(_sum31));
                    vst1_u16(outptr0 + out_hstep * 2 + 8, float2bfloat(_sum40));
                    vst1_u16(outptr0 + out_hstep * 3, float2bfloat(_sum41));
                    vst1_u16(outptr0 + out_hstep * 3 + 4, float2bfloat(_sum50));
                    vst1_u16(outptr0 + out_hstep * 3 + 8, float2bfloat(_sum51));
                    vst1_u16(outptr0 + out_hstep * 4, float2bfloat(_sum60));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, float2bfloat(_sum61));
                    vst1_u16(outptr0 + out_hstep * 4 + 8, float2bfloat(_sum70));
                    vst1_u16(outptr0 + out_hstep * 5, float2bfloat(_sum71));
                    vst1_u16(outptr0 + out_hstep * 5 + 4, float2bfloat(_sum80));
                    vst1_u16(outptr0 + out_hstep * 5 + 8, float2bfloat(_sum81));
                    vst1_u16(outptr0 + out_hstep * 6, float2bfloat(_sum90));
                    vst1_u16(outptr0 + out_hstep * 6 + 4, float2bfloat(_sum91));
                    vst1_u16(outptr0 + out_hstep * 6 + 8, float2bfloat(_suma0));
                    vst1_u16(outptr0 + out_hstep * 7, float2bfloat(_suma1));
                    vst1_u16(outptr0 + out_hstep * 7 + 4, float2bfloat(_sumb0));
                    vst1_u16(outptr0 + out_hstep * 7 + 8, float2bfloat(_sumb1));

                    outptr0 += 12;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
                vst1q_f32(outptr + 4 * 8, _sum40);
                vst1q_f32(outptr + 4 * 9, _sum41);
                vst1q_f32(outptr + 4 * 10, _sum50);
                vst1q_f32(outptr + 4 * 11, _sum51);
                vst1q_f32(outptr + 4 * 12, _sum60);
                vst1q_f32(outptr + 4 * 13, _sum61);
                vst1q_f32(outptr + 4 * 14, _sum70);
                vst1q_f32(outptr + 4 * 15, _sum71);
                vst1q_f32(outptr + 4 * 16, _sum80);
                vst1q_f32(outptr + 4 * 17, _sum81);
                vst1q_f32(outptr + 4 * 18, _sum90);
                vst1q_f32(outptr + 4 * 19, _sum91);
                vst1q_f32(outptr + 4 * 20, _suma0);
                vst1q_f32(outptr + 4 * 21, _suma1);
                vst1q_f32(outptr + 4 * 22, _sumb0);
                vst1q_f32(outptr + 4 * 23, _sumb1);
            }

            outptr += 96;
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum20;
            float32x4_t _sum21;
            float32x4_t _sum30;
            float32x4_t _sum31;
            float32x4_t _sum40;
            float32x4_t _sum41;
            float32x4_t _sum50;
            float32x4_t _sum51;
            float32x4_t _sum60;
            float32x4_t _sum61;
            float32x4_t _sum70;
            float32x4_t _sum71;

            if (k == 0)
            {
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);
                _sum20 = vdupq_n_f32(0.f);
                _sum21 = vdupq_n_f32(0.f);
                _sum30 = vdupq_n_f32(0.f);
                _sum31 = vdupq_n_f32(0.f);
                _sum40 = vdupq_n_f32(0.f);
                _sum41 = vdupq_n_f32(0.f);
                _sum50 = vdupq_n_f32(0.f);
                _sum51 = vdupq_n_f32(0.f);
                _sum60 = vdupq_n_f32(0.f);
                _sum61 = vdupq_n_f32(0.f);
                _sum70 = vdupq_n_f32(0.f);
                _sum71 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        _sum10 = _sum00;
                        _sum11 = _sum00;
                        _sum20 = _sum00;
                        _sum21 = _sum00;
                        _sum30 = _sum00;
                        _sum31 = _sum00;
                        _sum40 = _sum00;
                        _sum41 = _sum00;
                        _sum50 = _sum00;
                        _sum51 = _sum00;
                        _sum60 = _sum00;
                        _sum61 = _sum00;
                        _sum70 = _sum00;
                        _sum71 = _sum00;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        _sum20 = _sum00;
                        _sum21 = _sum01;
                        _sum30 = _sum00;
                        _sum31 = _sum01;
                        _sum40 = _sum00;
                        _sum41 = _sum01;
                        _sum50 = _sum00;
                        _sum51 = _sum01;
                        _sum60 = _sum00;
                        _sum61 = _sum01;
                        _sum70 = _sum00;
                        _sum71 = _sum01;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4 * 1);
                        _sum10 = vld1q_f32(pC + 4 * 2);
                        _sum11 = vld1q_f32(pC + 4 * 3);
                        _sum20 = vld1q_f32(pC + 4 * 4);
                        _sum21 = vld1q_f32(pC + 4 * 5);
                        _sum30 = vld1q_f32(pC + 4 * 6);
                        _sum31 = vld1q_f32(pC + 4 * 7);
                        _sum40 = vld1q_f32(pC + 4 * 8);
                        _sum41 = vld1q_f32(pC + 4 * 9);
                        _sum50 = vld1q_f32(pC + 4 * 10);
                        _sum51 = vld1q_f32(pC + 4 * 11);
                        _sum60 = vld1q_f32(pC + 4 * 12);
                        _sum61 = vld1q_f32(pC + 4 * 13);
                        _sum70 = vld1q_f32(pC + 4 * 14);
                        _sum71 = vld1q_f32(pC + 4 * 15);
                        pC += 64;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum20 = vdupq_n_f32(pC[2]);
                        _sum30 = vdupq_n_f32(pC[3]);
                        _sum40 = vdupq_n_f32(pC[4]);
                        _sum50 = vdupq_n_f32(pC[5]);
                        _sum60 = vdupq_n_f32(pC[6]);
                        _sum70 = vdupq_n_f32(pC[7]);
                        _sum01 = _sum00;
                        _sum11 = _sum10;
                        _sum21 = _sum20;
                        _sum31 = _sum30;
                        _sum41 = _sum40;
                        _sum51 = _sum50;
                        _sum61 = _sum60;
                        _sum71 = _sum70;
                        pC += 8;
                    }
                }
            }
            else
            {
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
                _sum20 = vld1q_f32(outptr + 4 * 4);
                _sum21 = vld1q_f32(outptr + 4 * 5);
                _sum30 = vld1q_f32(outptr + 4 * 6);
                _sum31 = vld1q_f32(outptr + 4 * 7);
                _sum40 = vld1q_f32(outptr + 4 * 8);
                _sum41 = vld1q_f32(outptr + 4 * 9);
                _sum50 = vld1q_f32(outptr + 4 * 10);
                _sum51 = vld1q_f32(outptr + 4 * 11);
                _sum60 = vld1q_f32(outptr + 4 * 12);
                _sum61 = vld1q_f32(outptr + 4 * 13);
                _sum70 = vld1q_f32(outptr + 4 * 14);
                _sum71 = vld1q_f32(outptr + 4 * 15);
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                uint16x8_t _pA = vld1q_u16(pA);
                float32x4_t _pA0 = bfloat2float(vget_low_u16(_pA));
                float32x4_t _pA1 = bfloat2float(vget_high_u16(_pA));

                float32x4_t _pB0 = bfloat2float(vld1_u16(pB));
                float32x4_t _pB1 = bfloat2float(vld1_u16(pB + 4));

                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
                _sum40 = vfmaq_laneq_f32(_sum40, _pA0, _pB1, 0);
                _sum41 = vfmaq_laneq_f32(_sum41, _pA1, _pB1, 0);
                _sum50 = vfmaq_laneq_f32(_sum50, _pA0, _pB1, 1);
                _sum51 = vfmaq_laneq_f32(_sum51, _pA1, _pB1, 1);
                _sum60 = vfmaq_laneq_f32(_sum60, _pA0, _pB1, 2);
                _sum61 = vfmaq_laneq_f32(_sum61, _pA1, _pB1, 2);
                _sum70 = vfmaq_laneq_f32(_sum70, _pA0, _pB1, 3);
                _sum71 = vfmaq_laneq_f32(_sum71, _pA1, _pB1, 3);

                pA += 8;
                pB += 8;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum00 = vmulq_f32(_sum00, _alpha);
                _sum01 = vmulq_f32(_sum01, _alpha);
                _sum10 = vmulq_f32(_sum10, _alpha);
                _sum11 = vmulq_f32(_sum11, _alpha);
                _sum20 = vmulq_f32(_sum20, _alpha);
                _sum21 = vmulq_f32(_sum21, _alpha);
                _sum30 = vmulq_f32(_sum30, _alpha);
                _sum31 = vmulq_f32(_sum31, _alpha);
                _sum40 = vmulq_f32(_sum40, _alpha);
                _sum41 = vmulq_f32(_sum41, _alpha);
                _sum50 = vmulq_f32(_sum50, _alpha);
                _sum51 = vmulq_f32(_sum51, _alpha);
                _sum60 = vmulq_f32(_sum60, _alpha);
                _sum61 = vmulq_f32(_sum61, _alpha);
                _sum70 = vmulq_f32(_sum70, _alpha);
                _sum71 = vmulq_f32(_sum71, _alpha);
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum10));
                    vst1_u16(outptr0 + 4 * 2, float2bfloat(_sum20));
                    vst1_u16(outptr0 + 4 * 3, float2bfloat(_sum30));
                    vst1_u16(outptr0 + 4 * 4, float2bfloat(_sum40));
                    vst1_u16(outptr0 + 4 * 5, float2bfloat(_sum50));
                    vst1_u16(outptr0 + 4 * 6, float2bfloat(_sum60));
                    vst1_u16(outptr0 + 4 * 7, float2bfloat(_sum70));

                    vst1_u16(outptr0 + out_hstep * 4, float2bfloat(_sum01));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, float2bfloat(_sum11));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 2, float2bfloat(_sum21));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 3, float2bfloat(_sum31));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 4, float2bfloat(_sum41));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 5, float2bfloat(_sum51));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 6, float2bfloat(_sum61));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 7, float2bfloat(_sum71));

                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    transpose8x8_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31, _sum40, _sum41, _sum50, _sum51, _sum60, _sum61, _sum70, _sum71);

                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum01));
                    vst1_u16(outptr0 + out_hstep, float2bfloat(_sum10));
                    vst1_u16(outptr0 + out_hstep + 4, float2bfloat(_sum11));
                    vst1_u16(outptr0 + out_hstep * 2, float2bfloat(_sum20));
                    vst1_u16(outptr0 + out_hstep * 2 + 4, float2bfloat(_sum21));
                    vst1_u16(outptr0 + out_hstep * 3, float2bfloat(_sum30));
                    vst1_u16(outptr0 + out_hstep * 3 + 4, float2bfloat(_sum31));
                    vst1_u16(outptr0 + out_hstep * 4, float2bfloat(_sum40));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, float2bfloat(_sum41));
                    vst1_u16(outptr0 + out_hstep * 5, float2bfloat(_sum50));
                    vst1_u16(outptr0 + out_hstep * 5 + 4, float2bfloat(_sum51));
                    vst1_u16(outptr0 + out_hstep * 6, float2bfloat(_sum60));
                    vst1_u16(outptr0 + out_hstep * 6 + 4, float2bfloat(_sum61));
                    vst1_u16(outptr0 + out_hstep * 7, float2bfloat(_sum70));
                    vst1_u16(outptr0 + out_hstep * 7 + 4, float2bfloat(_sum71));

                    outptr0 += 8;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
                vst1q_f32(outptr + 4 * 8, _sum40);
                vst1q_f32(outptr + 4 * 9, _sum41);
                vst1q_f32(outptr + 4 * 10, _sum50);
                vst1q_f32(outptr + 4 * 11, _sum51);
                vst1q_f32(outptr + 4 * 12, _sum60);
                vst1q_f32(outptr + 4 * 13, _sum61);
                vst1q_f32(outptr + 4 * 14, _sum70);
                vst1q_f32(outptr + 4 * 15, _sum71);
            }

            outptr += 64;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum20;
            float32x4_t _sum21;
            float32x4_t _sum30;
            float32x4_t _sum31;

            if (k == 0)
            {
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);
                _sum20 = vdupq_n_f32(0.f);
                _sum21 = vdupq_n_f32(0.f);
                _sum30 = vdupq_n_f32(0.f);
                _sum31 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        _sum10 = _sum00;
                        _sum11 = _sum00;
                        _sum20 = _sum00;
                        _sum21 = _sum00;
                        _sum30 = _sum00;
                        _sum31 = _sum00;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        _sum20 = _sum00;
                        _sum21 = _sum01;
                        _sum30 = _sum00;
                        _sum31 = _sum01;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4 * 1);
                        _sum10 = vld1q_f32(pC + 4 * 2);
                        _sum11 = vld1q_f32(pC + 4 * 3);
                        _sum20 = vld1q_f32(pC + 4 * 4);
                        _sum21 = vld1q_f32(pC + 4 * 5);
                        _sum30 = vld1q_f32(pC + 4 * 6);
                        _sum31 = vld1q_f32(pC + 4 * 7);
                        pC += 32;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum20 = vdupq_n_f32(pC[2]);
                        _sum30 = vdupq_n_f32(pC[3]);
                        _sum01 = _sum00;
                        _sum11 = _sum10;
                        _sum21 = _sum20;
                        _sum31 = _sum30;
                        pC += 4;
                    }
                }
            }
            else
            {
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
                _sum20 = vld1q_f32(outptr + 4 * 4);
                _sum21 = vld1q_f32(outptr + 4 * 5);
                _sum30 = vld1q_f32(outptr + 4 * 6);
                _sum31 = vld1q_f32(outptr + 4 * 7);
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                uint16x8_t _pA = vld1q_u16(pA);
                float32x4_t _pA0 = bfloat2float(vget_low_u16(_pA));
                float32x4_t _pA1 = bfloat2float(vget_high_u16(_pA));

                float32x4_t _pB0 = bfloat2float(vld1_u16(pB));

                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);

                pA += 8;
                pB += 4;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum00 = vmulq_f32(_sum00, _alpha);
                _sum01 = vmulq_f32(_sum01, _alpha);
                _sum10 = vmulq_f32(_sum10, _alpha);
                _sum11 = vmulq_f32(_sum11, _alpha);
                _sum20 = vmulq_f32(_sum20, _alpha);
                _sum21 = vmulq_f32(_sum21, _alpha);
                _sum30 = vmulq_f32(_sum30, _alpha);
                _sum31 = vmulq_f32(_sum31, _alpha);
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum10));
                    vst1_u16(outptr0 + 4 * 2, float2bfloat(_sum20));
                    vst1_u16(outptr0 + 4 * 3, float2bfloat(_sum30));

                    vst1_u16(outptr0 + out_hstep * 4, float2bfloat(_sum01));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, float2bfloat(_sum11));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 2, float2bfloat(_sum21));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 3, float2bfloat(_sum31));

                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    transpose8x4_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31);

                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + out_hstep * 1, float2bfloat(_sum01));
                    vst1_u16(outptr0 + out_hstep * 2, float2bfloat(_sum10));
                    vst1_u16(outptr0 + out_hstep * 3, float2bfloat(_sum11));
                    vst1_u16(outptr0 + out_hstep * 4, float2bfloat(_sum20));
                    vst1_u16(outptr0 + out_hstep * 5, float2bfloat(_sum21));
                    vst1_u16(outptr0 + out_hstep * 6, float2bfloat(_sum30));
                    vst1_u16(outptr0 + out_hstep * 7, float2bfloat(_sum31));

                    outptr0 += 4;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
            }

            outptr += 32;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;

            if (k == 0)
            {
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        _sum10 = _sum00;
                        _sum11 = _sum00;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4 * 1);
                        _sum10 = vld1q_f32(pC + 4 * 2);
                        _sum11 = vld1q_f32(pC + 4 * 3);
                        pC += 16;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum01 = _sum00;
                        _sum11 = _sum10;
                        pC += 2;
                    }
                }
            }
            else
            {
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                uint16x8_t _pA = vld1q_u16(pA);
                float32x4_t _pA0 = bfloat2float(vget_low_u16(_pA));
                float32x4_t _pA1 = bfloat2float(vget_high_u16(_pA));

                float32x4_t _pB0 = bfloat2float(vdup_n_u16(pB[0]));
                float32x4_t _pB1 = bfloat2float(vdup_n_u16(pB[1]));

                _sum00 = vfmaq_f32(_sum00, _pA0, _pB0);
                _sum01 = vfmaq_f32(_sum01, _pA1, _pB0);
                _sum10 = vfmaq_f32(_sum10, _pA0, _pB1);
                _sum11 = vfmaq_f32(_sum11, _pA1, _pB1);

                pA += 8;
                pB += 2;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum00 = vmulq_f32(_sum00, _alpha);
                _sum01 = vmulq_f32(_sum01, _alpha);
                _sum10 = vmulq_f32(_sum10, _alpha);
                _sum11 = vmulq_f32(_sum11, _alpha);
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum10));

                    vst1_u16(outptr0 + out_hstep * 4, float2bfloat(_sum01));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, float2bfloat(_sum11));
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    unsigned short sum0[8];
                    unsigned short sum1[8];
                    vst1_u16(sum0, float2bfloat(_sum00));
                    vst1_u16(sum0 + 4, float2bfloat(_sum01));
                    vst1_u16(sum1, float2bfloat(_sum10));
                    vst1_u16(sum1 + 4, float2bfloat(_sum11));

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[out_hstep * 4] = sum0[4];
                    outptr0[out_hstep * 5] = sum0[5];
                    outptr0[out_hstep * 6] = sum0[6];
                    outptr0[out_hstep * 7] = sum0[7];

                    outptr0[1] = sum1[0];
                    outptr0[out_hstep + 1] = sum1[1];
                    outptr0[out_hstep * 2 + 1] = sum1[2];
                    outptr0[out_hstep * 3 + 1] = sum1[3];
                    outptr0[out_hstep * 4 + 1] = sum1[4];
                    outptr0[out_hstep * 5 + 1] = sum1[5];
                    outptr0[out_hstep * 6 + 1] = sum1[6];
                    outptr0[out_hstep * 7 + 1] = sum1[7];
                    outptr0 += 2;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
            }

            outptr += 16;
        }
        for (; jj < max_jj; jj += 1)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;

            if (k == 0)
            {
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        pC += 8;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        pC += 1;
                    }
                }
            }
            else
            {
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4);
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                uint16x8_t _pA = vld1q_u16(pA);
                float32x4_t _pA0 = bfloat2float(vget_low_u16(_pA));
                float32x4_t _pA1 = bfloat2float(vget_high_u16(_pA));

                float32x4_t _pB = bfloat2float(vld1_dup_u16(pB));

                _sum00 = vfmaq_f32(_sum00, _pA0, _pB);
                _sum01 = vfmaq_f32(_sum01, _pA1, _pB);

                pA += 8;
                pB += 1;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum00 = vmulq_f32(_sum00, _alpha);
                _sum01 = vmulq_f32(_sum01, _alpha);
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + out_hstep * 4, float2bfloat(_sum01));
                    outptr0 += 4;
                }
                if (out_elempack == 1)
                {
                    unsigned short sum0[8];
                    vst1_u16(sum0, float2bfloat(_sum00));
                    vst1_u16(sum0 + 4, float2bfloat(_sum01));

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep * 1] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[out_hstep * 4] = sum0[4];
                    outptr0[out_hstep * 5] = sum0[5];
                    outptr0[out_hstep * 6] = sum0[6];
                    outptr0[out_hstep * 7] = sum0[7];
                    outptr0++;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
            }

            outptr += 8;
        }

        pAT += max_kk * 8;
    }
#endif // __aarch64__
    for (; ii + 3 < max_ii; ii += 4)
    {
        unsigned short* outptr0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        const unsigned short* pB = pBT;

        if (pC)
        {
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)CT_tile + i + ii;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)CT_tile + j;
            }
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;
            float32x4_t _sum3;
            float32x4_t _sum4;
            float32x4_t _sum5;
            float32x4_t _sum6;
            float32x4_t _sum7;
            float32x4_t _sum8;
            float32x4_t _sum9;
            float32x4_t _suma;
            float32x4_t _sumb;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);
                _sum2 = vdupq_n_f32(0.f);
                _sum3 = vdupq_n_f32(0.f);
                _sum4 = vdupq_n_f32(0.f);
                _sum5 = vdupq_n_f32(0.f);
                _sum6 = vdupq_n_f32(0.f);
                _sum7 = vdupq_n_f32(0.f);
                _sum8 = vdupq_n_f32(0.f);
                _sum9 = vdupq_n_f32(0.f);
                _suma = vdupq_n_f32(0.f);
                _sumb = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                        _sum4 = _sum0;
                        _sum5 = _sum0;
                        _sum6 = _sum0;
                        _sum7 = _sum0;
                        _sum8 = _sum0;
                        _sum9 = _sum0;
                        _suma = _sum0;
                        _sumb = _sum0;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                        _sum4 = _sum0;
                        _sum5 = _sum0;
                        _sum6 = _sum0;
                        _sum7 = _sum0;
                        _sum8 = _sum0;
                        _sum9 = _sum0;
                        _suma = _sum0;
                        _sumb = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        _sum2 = vld1q_f32(pC + 8);
                        _sum3 = vld1q_f32(pC + 12);
                        _sum4 = vld1q_f32(pC + 16);
                        _sum5 = vld1q_f32(pC + 20);
                        _sum6 = vld1q_f32(pC + 24);
                        _sum7 = vld1q_f32(pC + 28);
                        _sum8 = vld1q_f32(pC + 32);
                        _sum9 = vld1q_f32(pC + 36);
                        _suma = vld1q_f32(pC + 40);
                        _sumb = vld1q_f32(pC + 44);
                        pC += 48;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[1]);
                        _sum2 = vdupq_n_f32(pC[2]);
                        _sum3 = vdupq_n_f32(pC[3]);
                        _sum4 = vdupq_n_f32(pC[4]);
                        _sum5 = vdupq_n_f32(pC[5]);
                        _sum6 = vdupq_n_f32(pC[6]);
                        _sum7 = vdupq_n_f32(pC[7]);
                        _sum8 = vdupq_n_f32(pC[8]);
                        _sum9 = vdupq_n_f32(pC[9]);
                        _suma = vdupq_n_f32(pC[10]);
                        _sumb = vdupq_n_f32(pC[11]);
                        pC += 12;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4 * 1);
                _sum2 = vld1q_f32(outptr + 4 * 2);
                _sum3 = vld1q_f32(outptr + 4 * 3);
                _sum4 = vld1q_f32(outptr + 4 * 4);
                _sum5 = vld1q_f32(outptr + 4 * 5);
                _sum6 = vld1q_f32(outptr + 4 * 6);
                _sum7 = vld1q_f32(outptr + 4 * 7);
                _sum8 = vld1q_f32(outptr + 4 * 8);
                _sum9 = vld1q_f32(outptr + 4 * 9);
                _suma = vld1q_f32(outptr + 4 * 10);
                _sumb = vld1q_f32(outptr + 4 * 11);
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA = bfloat2float(vld1_u16(pA));
                float32x4_t _pB0 = bfloat2float(vld1_u16(pB));
                float32x4_t _pB1 = bfloat2float(vld1_u16(pB + 4));
                float32x4_t _pB2 = bfloat2float(vld1_u16(pB + 8));

                _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB0, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB0, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB0, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB0, 3);
                _sum4 = vfmaq_laneq_f32(_sum4, _pA, _pB1, 0);
                _sum5 = vfmaq_laneq_f32(_sum5, _pA, _pB1, 1);
                _sum6 = vfmaq_laneq_f32(_sum6, _pA, _pB1, 2);
                _sum7 = vfmaq_laneq_f32(_sum7, _pA, _pB1, 3);
                _sum8 = vfmaq_laneq_f32(_sum8, _pA, _pB2, 0);
                _sum9 = vfmaq_laneq_f32(_sum9, _pA, _pB2, 1);
                _suma = vfmaq_laneq_f32(_suma, _pA, _pB2, 2);
                _sumb = vfmaq_laneq_f32(_sumb, _pA, _pB2, 3);

                pA += 4;
                pB += 12;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum0 = vmulq_f32(_sum0, _alpha);
                _sum1 = vmulq_f32(_sum1, _alpha);
                _sum2 = vmulq_f32(_sum2, _alpha);
                _sum3 = vmulq_f32(_sum3, _alpha);
                _sum4 = vmulq_f32(_sum4, _alpha);
                _sum5 = vmulq_f32(_sum5, _alpha);
                _sum6 = vmulq_f32(_sum6, _alpha);
                _sum7 = vmulq_f32(_sum7, _alpha);
                _sum8 = vmulq_f32(_sum8, _alpha);
                _sum9 = vmulq_f32(_sum9, _alpha);
                _suma = vmulq_f32(_suma, _alpha);
                _sumb = vmulq_f32(_sumb, _alpha);
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum1));
                    vst1_u16(outptr0 + 4 * 2, float2bfloat(_sum2));
                    vst1_u16(outptr0 + 4 * 3, float2bfloat(_sum3));
                    vst1_u16(outptr0 + 4 * 4, float2bfloat(_sum4));
                    vst1_u16(outptr0 + 4 * 5, float2bfloat(_sum5));
                    vst1_u16(outptr0 + 4 * 6, float2bfloat(_sum6));
                    vst1_u16(outptr0 + 4 * 7, float2bfloat(_sum7));
                    vst1_u16(outptr0 + 4 * 8, float2bfloat(_sum8));
                    vst1_u16(outptr0 + 4 * 9, float2bfloat(_sum9));
                    vst1_u16(outptr0 + 4 * 10, float2bfloat(_suma));
                    vst1_u16(outptr0 + 4 * 11, float2bfloat(_sumb));
                    outptr0 += 48;
                }
                if (out_elempack == 1)
                {
                    transpose4x12_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, _sum8, _sum9, _suma, _sumb);

                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum1));
                    vst1_u16(outptr0 + 8, float2bfloat(_sum2));
                    vst1_u16(outptr0 + out_hstep, float2bfloat(_sum3));
                    vst1_u16(outptr0 + out_hstep + 4, float2bfloat(_sum4));
                    vst1_u16(outptr0 + out_hstep + 8, float2bfloat(_sum5));
                    vst1_u16(outptr0 + out_hstep * 2, float2bfloat(_sum6));
                    vst1_u16(outptr0 + out_hstep * 2 + 4, float2bfloat(_sum7));
                    vst1_u16(outptr0 + out_hstep * 2 + 8, float2bfloat(_sum8));
                    vst1_u16(outptr0 + out_hstep * 3, float2bfloat(_sum9));
                    vst1_u16(outptr0 + out_hstep * 3 + 4, float2bfloat(_suma));
                    vst1_u16(outptr0 + out_hstep * 3 + 8, float2bfloat(_sumb));
                    outptr0 += 12;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
                vst1q_f32(outptr + 4 * 4, _sum4);
                vst1q_f32(outptr + 4 * 5, _sum5);
                vst1q_f32(outptr + 4 * 6, _sum6);
                vst1q_f32(outptr + 4 * 7, _sum7);
                vst1q_f32(outptr + 4 * 8, _sum8);
                vst1q_f32(outptr + 4 * 9, _sum9);
                vst1q_f32(outptr + 4 * 10, _suma);
                vst1q_f32(outptr + 4 * 11, _sumb);
            }

            outptr += 48;
        }
#endif // __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;
            float32x4_t _sum3;
            float32x4_t _sum4;
            float32x4_t _sum5;
            float32x4_t _sum6;
            float32x4_t _sum7;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);
                _sum2 = vdupq_n_f32(0.f);
                _sum3 = vdupq_n_f32(0.f);
                _sum4 = vdupq_n_f32(0.f);
                _sum5 = vdupq_n_f32(0.f);
                _sum6 = vdupq_n_f32(0.f);
                _sum7 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                        _sum4 = _sum0;
                        _sum5 = _sum0;
                        _sum6 = _sum0;
                        _sum7 = _sum0;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                        _sum4 = _sum0;
                        _sum5 = _sum0;
                        _sum6 = _sum0;
                        _sum7 = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        _sum2 = vld1q_f32(pC + 8);
                        _sum3 = vld1q_f32(pC + 12);
                        _sum4 = vld1q_f32(pC + 16);
                        _sum5 = vld1q_f32(pC + 20);
                        _sum6 = vld1q_f32(pC + 24);
                        _sum7 = vld1q_f32(pC + 28);
                        pC += 32;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[1]);
                        _sum2 = vdupq_n_f32(pC[2]);
                        _sum3 = vdupq_n_f32(pC[3]);
                        _sum4 = vdupq_n_f32(pC[4]);
                        _sum5 = vdupq_n_f32(pC[5]);
                        _sum6 = vdupq_n_f32(pC[6]);
                        _sum7 = vdupq_n_f32(pC[7]);
                        pC += 8;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4 * 1);
                _sum2 = vld1q_f32(outptr + 4 * 2);
                _sum3 = vld1q_f32(outptr + 4 * 3);
                _sum4 = vld1q_f32(outptr + 4 * 4);
                _sum5 = vld1q_f32(outptr + 4 * 5);
                _sum6 = vld1q_f32(outptr + 4 * 6);
                _sum7 = vld1q_f32(outptr + 4 * 7);
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pA = bfloat2float(vld1_u16(pA));
                float32x4_t _pB0 = bfloat2float(vld1_u16(pB));
                float32x4_t _pB1 = bfloat2float(vld1_u16(pB + 4));

#if __aarch64__
                _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB0, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB0, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB0, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB0, 3);
                _sum4 = vfmaq_laneq_f32(_sum4, _pA, _pB1, 0);
                _sum5 = vfmaq_laneq_f32(_sum5, _pA, _pB1, 1);
                _sum6 = vfmaq_laneq_f32(_sum6, _pA, _pB1, 2);
                _sum7 = vfmaq_laneq_f32(_sum7, _pA, _pB1, 3);
#else
                _sum0 = vmlaq_lane_f32(_sum0, _pA, vget_low_f32(_pB0), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pA, vget_low_f32(_pB0), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _pA, vget_high_f32(_pB0), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _pA, vget_high_f32(_pB0), 1);
                _sum4 = vmlaq_lane_f32(_sum4, _pA, vget_low_f32(_pB1), 0);
                _sum5 = vmlaq_lane_f32(_sum5, _pA, vget_low_f32(_pB1), 1);
                _sum6 = vmlaq_lane_f32(_sum6, _pA, vget_high_f32(_pB1), 0);
                _sum7 = vmlaq_lane_f32(_sum7, _pA, vget_high_f32(_pB1), 1);
#endif

                pA += 4;
                pB += 8;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum0 = vmulq_f32(_sum0, _alpha);
                _sum1 = vmulq_f32(_sum1, _alpha);
                _sum2 = vmulq_f32(_sum2, _alpha);
                _sum3 = vmulq_f32(_sum3, _alpha);
                _sum4 = vmulq_f32(_sum4, _alpha);
                _sum5 = vmulq_f32(_sum5, _alpha);
                _sum6 = vmulq_f32(_sum6, _alpha);
                _sum7 = vmulq_f32(_sum7, _alpha);
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum1));
                    vst1_u16(outptr0 + 4 * 2, float2bfloat(_sum2));
                    vst1_u16(outptr0 + 4 * 3, float2bfloat(_sum3));
                    vst1_u16(outptr0 + 4 * 4, float2bfloat(_sum4));
                    vst1_u16(outptr0 + 4 * 5, float2bfloat(_sum5));
                    vst1_u16(outptr0 + 4 * 6, float2bfloat(_sum6));
                    vst1_u16(outptr0 + 4 * 7, float2bfloat(_sum7));
                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    transpose4x8_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7);

                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum1));
                    vst1_u16(outptr0 + out_hstep, float2bfloat(_sum2));
                    vst1_u16(outptr0 + out_hstep + 4, float2bfloat(_sum3));
                    vst1_u16(outptr0 + out_hstep * 2, float2bfloat(_sum4));
                    vst1_u16(outptr0 + out_hstep * 2 + 4, float2bfloat(_sum5));
                    vst1_u16(outptr0 + out_hstep * 3, float2bfloat(_sum6));
                    vst1_u16(outptr0 + out_hstep * 3 + 4, float2bfloat(_sum7));
                    outptr0 += 8;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
                vst1q_f32(outptr + 4 * 4, _sum4);
                vst1q_f32(outptr + 4 * 5, _sum5);
                vst1q_f32(outptr + 4 * 6, _sum6);
                vst1q_f32(outptr + 4 * 7, _sum7);
            }

            outptr += 32;
        }
        // Processes 4 jj positions per iteration for the current group of 4 rows.
        // _sumN accumulates the 4 floats loaded from pA multiplied by lane N of the
        // 4 floats loaded from pB: N selects the jj position, the lane selects the row
        // (assuming transpose4x4_ps below is a plain 4x4 transpose - defined elsewhere).
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;
            float32x4_t _sum3;

            if (k == 0)
            {
                // k == 0: start from zero, then optionally seed from pC.
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);
                _sum2 = vdupq_n_f32(0.f);
                _sum3 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // one scalar replicated into every lane of every accumulator
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // one 4-float vector shared by all accumulators; pC is not advanced
                        _sum0 = vld1q_f32(pC);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        // 16 distinct floats, read in accumulator order
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        _sum2 = vld1q_f32(pC + 8);
                        _sum3 = vld1q_f32(pC + 12);
                        pC += 16;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar per accumulator, replicated across its lanes
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[1]);
                        _sum2 = vdupq_n_f32(pC[2]);
                        _sum3 = vdupq_n_f32(pC[3]);
                        pC += 4;
                    }
                }
            }
            else
            {
                // k != 0: resume from the float partial sums stored at outptr
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4 * 1);
                _sum2 = vld1q_f32(outptr + 4 * 2);
                _sum3 = vld1q_f32(outptr + 4 * 3);
            }

            // pA restarts at pAT for every jj block; pB is not reset here and keeps advancing.
            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                // bfloat2float is defined outside this chunk; presumably widens 4 bf16 values to f32.
                float32x4_t _pA = bfloat2float(vld1_u16(pA));
                float32x4_t _pB = bfloat2float(vld1_u16(pB));

#if __aarch64__
                _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB, 3);
#else
                // 32-bit path: lane-indexed multiply-accumulate takes a 64-bit half vector
                _sum0 = vmlaq_lane_f32(_sum0, _pA, vget_low_f32(_pB), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pA, vget_low_f32(_pB), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _pA, vget_high_f32(_pB), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _pA, vget_high_f32(_pB), 1);
#endif

                pA += 4;
                pB += 4;
            }

            // NOTE(review): this scaling runs whether or not k_end is set, so the scaled
            // values are also what gets stored to outptr as partial sums. Confirm the caller
            // never splits K across several passes when alpha != 1.
            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum0 = vmulq_f32(_sum0, _alpha);
                _sum1 = vmulq_f32(_sum1, _alpha);
                _sum2 = vmulq_f32(_sum2, _alpha);
                _sum3 = vmulq_f32(_sum3, _alpha);
            }

            if (k_end)
            {
                // Final pass: narrow to bf16 (float2bfloat, defined elsewhere) and write to outptr0.
                if (out_elempack == 4)
                {
                    // accumulators are written back-to-back, 4 values each
                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum1));
                    vst1_u16(outptr0 + 4 * 2, float2bfloat(_sum2));
                    vst1_u16(outptr0 + 4 * 3, float2bfloat(_sum3));
                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    // after the transpose each vector is written to its own row, out_hstep apart
                    transpose4x4_ps(_sum0, _sum1, _sum2, _sum3);

                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + out_hstep, float2bfloat(_sum1));
                    vst1_u16(outptr0 + out_hstep * 2, float2bfloat(_sum2));
                    vst1_u16(outptr0 + out_hstep * 3, float2bfloat(_sum3));
                    outptr0 += 4;
                }
            }
            else
            {
                // Not the final pass: keep float partial sums in outptr for the next call.
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
            }

            // outptr advances in both branches
            outptr += 16;
        }
        // Processes 2 jj positions per iteration for the current group of 4 rows.
        // _sum0 / _sum1 each hold 4 lanes (one per value loaded from pA) for the
        // first / second jj position respectively.
        for (; jj + 1 < max_jj; jj += 2)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;

            if (k == 0)
            {
                // k == 0: start from zero, then optionally seed from pC.
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // single scalar for everything
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = _sum0;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // one 4-float vector shared by both accumulators; pC is not advanced
                        _sum0 = vld1q_f32(pC);
                        _sum1 = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        // 8 distinct floats, read in accumulator order
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        pC += 8;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar per accumulator, replicated across its lanes
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[1]);
                        pC += 2;
                    }
                }
            }
            else
            {
                // k != 0: resume from the float partial sums stored at outptr
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4);
            }

            // pA restarts at pAT for every jj block; pB is not reset here and keeps advancing.
            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                // The two pB scalars are replicated as bf16 first, then converted
                // (bfloat2float is defined outside this chunk).
                float32x4_t _pA = bfloat2float(vld1_u16(pA));
                float32x4_t _pB0 = bfloat2float(vdup_n_u16(pB[0]));
                float32x4_t _pB1 = bfloat2float(vdup_n_u16(pB[1]));

#if __aarch64__
                _sum0 = vfmaq_f32(_sum0, _pA, _pB0);
                _sum1 = vfmaq_f32(_sum1, _pA, _pB1);
#else
                _sum0 = vmlaq_f32(_sum0, _pA, _pB0);
                _sum1 = vmlaq_f32(_sum1, _pA, _pB1);
#endif

                pA += 4;
                pB += 2;
            }

            // NOTE(review): this scaling runs whether or not k_end is set, so the scaled
            // values are also what gets stored to outptr as partial sums. Confirm the caller
            // never splits K across several passes when alpha != 1.
            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum0 = vmulq_f32(_sum0, _alpha);
                _sum1 = vmulq_f32(_sum1, _alpha);
            }

            if (k_end)
            {
                // Final pass: narrow to bf16 and write to outptr0.
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum1));
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    // Spill to small stack arrays so the lanes can be scattered:
                    // lane r goes to row r (out_hstep apart), _sum0 to column 0, _sum1 to column 1.
                    unsigned short sum0[4];
                    unsigned short sum1[4];
                    vst1_u16(sum0, float2bfloat(_sum0));
                    vst1_u16(sum1, float2bfloat(_sum1));

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[1] = sum1[0];
                    outptr0[out_hstep + 1] = sum1[1];
                    outptr0[out_hstep * 2 + 1] = sum1[2];
                    outptr0[out_hstep * 3 + 1] = sum1[3];
                    outptr0 += 2;
                }
            }
            else
            {
                // Not the final pass: keep float partial sums in outptr for the next call.
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
            }

            // outptr advances in both branches
            outptr += 8;
        }
        // Remainder loop: one jj position per iteration for the current group of 4 rows.
        // _sum0 holds 4 lanes, one per value loaded from pA.
        for (; jj < max_jj; jj += 1)
        {
            float32x4_t _sum0;

            if (k == 0)
            {
                // k == 0: start from zero, then optionally seed from pC.
                _sum0 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // single scalar replicated to all lanes
                        _sum0 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // 4-float vector; pC is not advanced
                        _sum0 = vld1q_f32(pC);
                    }
                    if (broadcast_type_C == 3)
                    {
                        // 4 distinct floats, then advance
                        _sum0 = vld1q_f32(pC);
                        pC += 4;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar for this jj position, replicated across lanes
                        _sum0 = vdupq_n_f32(pC[0]);
                        pC += 1;
                    }
                }
            }
            else
            {
                // k != 0: resume from the float partial sums stored at outptr
                _sum0 = vld1q_f32(outptr);
            }

            // pA restarts at pAT for every jj block; pB is not reset here and keeps advancing.
            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                // bfloat2float is defined outside this chunk; the pB scalar is replicated before conversion.
                float32x4_t _pA = bfloat2float(vld1_u16(pA));
                float32x4_t _pB = bfloat2float(vdup_n_u16(pB[0]));

#if __aarch64__
                _sum0 = vfmaq_f32(_sum0, _pA, _pB);
#else
                _sum0 = vmlaq_f32(_sum0, _pA, _pB);
#endif

                pA += 4;
                pB += 1;
            }

            // NOTE(review): this scaling runs whether or not k_end is set, so the scaled
            // values are also what gets stored to outptr as partial sums. Confirm the caller
            // never splits K across several passes when alpha != 1.
            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum0 = vmulq_f32(_sum0, _alpha);
            }

            if (k_end)
            {
                // Final pass: narrow to bf16 and write to outptr0.
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, float2bfloat(_sum0));
                    outptr0 += 4;
                }
                if (out_elempack == 1)
                {
                    // scatter the 4 lanes to 4 rows, out_hstep elements apart
                    unsigned short sum0[4];
                    vst1_u16(sum0, float2bfloat(_sum0));

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0++;
                }
            }
            else
            {
                // Not the final pass: keep float partial sums in outptr for the next call.
                vst1q_f32(outptr, _sum0);
            }

            // outptr advances in both branches
            outptr += 4;
        }

        pAT += max_kk * 4;
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        unsigned short* outptr0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j;

        const unsigned short* pB = pBT;

        if (pC)
        {
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)CT_tile + i + ii;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)CT_tile + j;
            }
        }

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        // aarch64 only: 12 columns per iteration for a pair of rows.
        // _sum0x accumulate pA[0] * pB (first row), _sum1x accumulate pA[1] * pB (second row);
        // the trailing digit picks which group of 4 columns the vector covers.
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum02;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum12;

            if (k == 0)
            {
                // k == 0: start from zero, then optionally seed from pC.
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum02 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);
                _sum12 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // single scalar for everything
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        _sum02 = _sum00;
                        _sum10 = _sum00;
                        _sum11 = _sum00;
                        _sum12 = _sum00;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // one scalar per row: pC[0] for the first row, pC[1] for the second
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        _sum02 = _sum00;
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum11 = _sum10;
                        _sum12 = _sum10;
                    }
                    if (broadcast_type_C == 3)
                    {
                        // pC is read with de-interleaving loads: even elements feed the
                        // first row, odd elements feed the second row
                        float32x4x2_t _tmp01 = vld2q_f32(pC);
                        float32x4x2_t _tmp23 = vld2q_f32(pC + 8);
                        float32x4x2_t _tmp45 = vld2q_f32(pC + 16);
                        _sum00 = _tmp01.val[0];
                        _sum01 = _tmp23.val[0];
                        _sum02 = _tmp45.val[0];
                        _sum10 = _tmp01.val[1];
                        _sum11 = _tmp23.val[1];
                        _sum12 = _tmp45.val[1];
                        pC += 24;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // 12 per-column values shared by both rows
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum02 = vld1q_f32(pC + 8);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        _sum12 = _sum02;
                        pC += 12;
                    }
                }
            }
            else
            {
                // k != 0: partial sums in outptr are stored with the two rows interleaved
                // element by element (see the vst2q_f32 stores below), so de-interleave on load
                float32x4x2_t _tmp01 = vld2q_f32(outptr);
                float32x4x2_t _tmp23 = vld2q_f32(outptr + 8);
                float32x4x2_t _tmp45 = vld2q_f32(outptr + 16);
                _sum00 = _tmp01.val[0];
                _sum01 = _tmp23.val[0];
                _sum02 = _tmp45.val[0];
                _sum10 = _tmp01.val[1];
                _sum11 = _tmp23.val[1];
                _sum12 = _tmp45.val[1];
            }

            // pA restarts at pAT for every jj block; pB keeps advancing from the previous block.
            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                // 12 bf16 values from pB: 8 via one q-load plus 4 more
                // (bfloat2float is defined outside this chunk)
                uint16x8_t _pB = vld1q_u16(pB);
                float32x4_t _pB0 = bfloat2float(vget_low_u16(_pB));
                float32x4_t _pB1 = bfloat2float(vget_high_u16(_pB));
                float32x4_t _pB2 = bfloat2float(vld1_u16(pB + 8));

                // the two pA scalars are replicated across all lanes
                float32x4_t _pA0 = bfloat2float(vdup_n_u16(pA[0]));
                float32x4_t _pA1 = bfloat2float(vdup_n_u16(pA[1]));

                _sum00 = vfmaq_f32(_sum00, _pB0, _pA0);
                _sum01 = vfmaq_f32(_sum01, _pB1, _pA0);
                _sum02 = vfmaq_f32(_sum02, _pB2, _pA0);
                _sum10 = vfmaq_f32(_sum10, _pB0, _pA1);
                _sum11 = vfmaq_f32(_sum11, _pB1, _pA1);
                _sum12 = vfmaq_f32(_sum12, _pB2, _pA1);

                pA += 2;
                pB += 12;
            }

            // NOTE(review): this scaling runs whether or not k_end is set, so the scaled
            // values are also what gets stored to outptr as partial sums. Confirm the caller
            // never splits K across several passes when alpha != 1.
            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum00 = vmulq_f32(_sum00, _alpha);
                _sum01 = vmulq_f32(_sum01, _alpha);
                _sum02 = vmulq_f32(_sum02, _alpha);
                _sum10 = vmulq_f32(_sum10, _alpha);
                _sum11 = vmulq_f32(_sum11, _alpha);
                _sum12 = vmulq_f32(_sum12, _alpha);
            }

            if (k_end)
            {
                // Final pass: narrow to bf16; first row at outptr0, second row out_hstep later.
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum01));
                    vst1_u16(outptr0 + 8, float2bfloat(_sum02));
                    vst1_u16(outptr0 + out_hstep, float2bfloat(_sum10));
                    vst1_u16(outptr0 + out_hstep + 4, float2bfloat(_sum11));
                    vst1_u16(outptr0 + out_hstep + 8, float2bfloat(_sum12));
                    outptr0 += 12;
                }
            }
            else
            {
                // Not the final pass: store float partial sums with the two rows interleaved.
                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum00;
                _tmp01.val[1] = _sum10;
                float32x4x2_t _tmp23;
                _tmp23.val[0] = _sum01;
                _tmp23.val[1] = _sum11;
                float32x4x2_t _tmp45;
                _tmp45.val[0] = _sum02;
                _tmp45.val[1] = _sum12;
                vst2q_f32(outptr, _tmp01);
                vst2q_f32(outptr + 8, _tmp23);
                vst2q_f32(outptr + 16, _tmp45);
            }

            // outptr advances in both branches
            outptr += 24;
        }
#endif // __aarch64__
        // 8 columns per iteration for a pair of rows.
        // _sum0x accumulate pA[0] * pB (first row), _sum1x accumulate pA[1] * pB (second row);
        // the trailing digit picks the first or second group of 4 columns.
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;

            if (k == 0)
            {
                // k == 0: start from zero, then optionally seed from pC.
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // single scalar for everything
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        _sum10 = _sum00;
                        _sum11 = _sum00;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // one scalar per row: pC[0] for the first row, pC[1] for the second
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum11 = _sum10;
                    }
                    if (broadcast_type_C == 3)
                    {
                        // de-interleaving loads: even elements -> first row, odd -> second row
                        float32x4x2_t _tmp01 = vld2q_f32(pC);
                        float32x4x2_t _tmp23 = vld2q_f32(pC + 8);
                        _sum00 = _tmp01.val[0];
                        _sum01 = _tmp23.val[0];
                        _sum10 = _tmp01.val[1];
                        _sum11 = _tmp23.val[1];
                        pC += 16;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // 8 per-column values shared by both rows
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        pC += 8;
                    }
                }
            }
            else
            {
                // k != 0: partial sums in outptr are row-interleaved (see vst2q_f32 below)
                float32x4x2_t _tmp01 = vld2q_f32(outptr);
                float32x4x2_t _tmp23 = vld2q_f32(outptr + 8);
                _sum00 = _tmp01.val[0];
                _sum01 = _tmp23.val[0];
                _sum10 = _tmp01.val[1];
                _sum11 = _tmp23.val[1];
            }

            // pA restarts at pAT for every jj block; pB keeps advancing from the previous block.
            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                // 8 bf16 values from pB, split into two groups of 4
                // (bfloat2float is defined outside this chunk)
                uint16x8_t _pB = vld1q_u16(pB);
                float32x4_t _pB0 = bfloat2float(vget_low_u16(_pB));
                float32x4_t _pB1 = bfloat2float(vget_high_u16(_pB));

                // the two pA scalars are replicated across all lanes
                float32x4_t _pA0 = bfloat2float(vdup_n_u16(pA[0]));
                float32x4_t _pA1 = bfloat2float(vdup_n_u16(pA[1]));
#if __aarch64__
                _sum00 = vfmaq_f32(_sum00, _pB0, _pA0);
                _sum01 = vfmaq_f32(_sum01, _pB1, _pA0);
                _sum10 = vfmaq_f32(_sum10, _pB0, _pA1);
                _sum11 = vfmaq_f32(_sum11, _pB1, _pA1);
#else
                _sum00 = vmlaq_f32(_sum00, _pB0, _pA0);
                _sum01 = vmlaq_f32(_sum01, _pB1, _pA0);
                _sum10 = vmlaq_f32(_sum10, _pB0, _pA1);
                _sum11 = vmlaq_f32(_sum11, _pB1, _pA1);
#endif

                pA += 2;
                pB += 8;
            }

            // NOTE(review): this scaling runs whether or not k_end is set, so the scaled
            // values are also what gets stored to outptr as partial sums. Confirm the caller
            // never splits K across several passes when alpha != 1.
            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum00 = vmulq_f32(_sum00, _alpha);
                _sum01 = vmulq_f32(_sum01, _alpha);
                _sum10 = vmulq_f32(_sum10, _alpha);
                _sum11 = vmulq_f32(_sum11, _alpha);
            }

            if (k_end)
            {
                // Final pass: narrow to bf16; first row at outptr0, second row out_hstep later.
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, float2bfloat(_sum00));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum01));
                    vst1_u16(outptr0 + out_hstep, float2bfloat(_sum10));
                    vst1_u16(outptr0 + out_hstep + 4, float2bfloat(_sum11));
                    outptr0 += 8;
                }
            }
            else
            {
                // Not the final pass: store float partial sums with the two rows interleaved.
                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum00;
                _tmp01.val[1] = _sum10;
                float32x4x2_t _tmp23;
                _tmp23.val[0] = _sum01;
                _tmp23.val[1] = _sum11;
                vst2q_f32(outptr, _tmp01);
                vst2q_f32(outptr + 8, _tmp23);
            }

            // outptr advances in both branches
            outptr += 16;
        }
        // 4 columns per iteration for a pair of rows.
        // _sum0 accumulates pA[0] * pB (first row), _sum1 accumulates pA[1] * pB (second row).
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;

            if (k == 0)
            {
                // k == 0: start from zero, then optionally seed from pC.
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // single scalar for everything
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = _sum0;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // one scalar per row
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[1]);
                    }
                    if (broadcast_type_C == 3)
                    {
                        // de-interleaving load: even elements -> first row, odd -> second row
                        float32x4x2_t _tmp01 = vld2q_f32(pC);
                        _sum0 = _tmp01.val[0];
                        _sum1 = _tmp01.val[1];
                        pC += 8;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // 4 per-column values shared by both rows
                        _sum0 = vld1q_f32(pC);
                        _sum1 = _sum0;
                        pC += 4;
                    }
                }
            }
            else
            {
                // k != 0: partial sums in outptr are row-interleaved (see vst2q_f32 below)
                float32x4x2_t _tmp01 = vld2q_f32(outptr);
                _sum0 = _tmp01.val[0];
                _sum1 = _tmp01.val[1];
            }

            // pA restarts at pAT for every jj block; pB keeps advancing from the previous block.
            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                // bfloat2float is defined outside this chunk
                float32x4_t _pB = bfloat2float(vld1_u16(pB));

                // the two pA scalars are replicated across all lanes
                float32x4_t _pA0 = bfloat2float(vdup_n_u16(pA[0]));
                float32x4_t _pA1 = bfloat2float(vdup_n_u16(pA[1]));
#if __aarch64__
                _sum0 = vfmaq_f32(_sum0, _pB, _pA0);
                _sum1 = vfmaq_f32(_sum1, _pB, _pA1);
#else
                _sum0 = vmlaq_f32(_sum0, _pB, _pA0);
                _sum1 = vmlaq_f32(_sum1, _pB, _pA1);
#endif

                pA += 2;
                pB += 4;
            }

            // NOTE(review): this scaling runs whether or not k_end is set, so the scaled
            // values are also what gets stored to outptr as partial sums. Confirm the caller
            // never splits K across several passes when alpha != 1.
            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum0 = vmulq_f32(_sum0, _alpha);
                _sum1 = vmulq_f32(_sum1, _alpha);
            }

            if (k_end)
            {
                // Final pass: narrow to bf16; first row at outptr0, second row out_hstep later.
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + out_hstep, float2bfloat(_sum1));
                    outptr0 += 4;
                }
            }
            else
            {
                // Not the final pass: store float partial sums with the two rows interleaved.
                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum0;
                _tmp01.val[1] = _sum1;
                vst2q_f32(outptr, _tmp01);
            }

            // outptr advances in both branches
            outptr += 8;
        }
#endif // __ARM_NEON
        // 2 columns per iteration for a pair of rows, using scalar accumulators.
        // Naming is sum<column><row>: the first digit follows the pB index and the second
        // digit follows the pA index (e.g. sum01 += pA1 * pB0). On the final store, sum00/sum10
        // land in the first output row and sum01/sum11 in the row out_hstep later.
        for (; jj + 1 < max_jj; jj += 2)
        {
            float sum00;
            float sum01;
            float sum10;
            float sum11;

            if (k == 0)
            {
                // k == 0: start from zero, then optionally seed from pC.
                sum00 = 0.f;
                sum01 = 0.f;
                sum10 = 0.f;
                sum11 = 0.f;

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // single scalar for everything
                        sum00 = pC[0];
                        sum01 = pC[0];
                        sum10 = pC[0];
                        sum11 = pC[0];
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // one value per row: pC[0] for row 0, pC[1] for row 1
                        sum00 = pC[0];
                        sum01 = pC[1];
                        sum10 = pC[0];
                        sum11 = pC[1];
                    }
                    if (broadcast_type_C == 3)
                    {
                        // 4 distinct values, in the same order used for the outptr partial sums
                        sum00 = pC[0];
                        sum01 = pC[1];
                        sum10 = pC[2];
                        sum11 = pC[3];
                        pC += 4;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one value per column: pC[0] for column 0, pC[1] for column 1
                        sum00 = pC[0];
                        sum01 = pC[0];
                        sum10 = pC[1];
                        sum11 = pC[1];
                        pC += 2;
                    }
                }
            }
            else
            {
                // k != 0: resume from the float partial sums stored at outptr
                sum00 = outptr[0];
                sum01 = outptr[1];
                sum10 = outptr[2];
                sum11 = outptr[3];
            }

            // pA restarts at pAT for every jj block; pB keeps advancing from the previous block.
            const unsigned short* pA = pAT;
            int kk = 0;
#if __ARM_NEON
            // clang 15.0.1 on aarch64 auto vectorization produces wrong result on this loop
            // we have to teach it a bit  :$   --- nihui
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            for (; kk + 1 < max_kk; kk += 2)
            {
                // Two k steps at once: each load covers (2 values) x (2 consecutive k).
                float32x4_t _pA0123 = bfloat2float(vld1_u16(pA));
                float32x4_t _pB0123 = bfloat2float(vld1_u16(pB));

                // vtrn of a vector with itself gives val[0] = {b0,b0,b2,b2} and
                // val[1] = {b1,b1,b3,b3}, pairing each pB value with both pA values of its k step.
                float32x4x2_t _pB0213 = vtrnq_f32(_pB0123, _pB0123);

#if __aarch64__
                _sum0 = vfmaq_f32(_sum0, _pA0123, _pB0213.val[0]);
                _sum1 = vfmaq_f32(_sum1, _pA0123, _pB0213.val[1]);
#else
                _sum0 = vmlaq_f32(_sum0, _pA0123, _pB0213.val[0]);
                _sum1 = vmlaq_f32(_sum1, _pA0123, _pB0213.val[1]);
#endif

                pA += 4;
                pB += 4;
            }
            // Fold the two k steps together: lanes 0/2 belong to pA[0], lanes 1/3 to pA[1].
            sum00 += vgetq_lane_f32(_sum0, 0) + vgetq_lane_f32(_sum0, 2);
            sum01 += vgetq_lane_f32(_sum0, 1) + vgetq_lane_f32(_sum0, 3);
            sum10 += vgetq_lane_f32(_sum1, 0) + vgetq_lane_f32(_sum1, 2);
            sum11 += vgetq_lane_f32(_sum1, 1) + vgetq_lane_f32(_sum1, 3);
#endif // __ARM_NEON
            // Scalar path: whole loop without NEON, or the odd leftover k step with NEON.
            for (; kk < max_kk; kk += 1)
            {
                float pA0 = bfloat16_to_float32(pA[0]);
                float pA1 = bfloat16_to_float32(pA[1]);
                float pB0 = bfloat16_to_float32(pB[0]);
                float pB1 = bfloat16_to_float32(pB[1]);

                sum00 += pA0 * pB0;
                sum01 += pA1 * pB0;
                sum10 += pA0 * pB1;
                sum11 += pA1 * pB1;

                pA += 2;
                pB += 2;
            }

            // NOTE(review): this scaling runs whether or not k_end is set, so the scaled
            // values are also what gets stored to outptr as partial sums. Confirm the caller
            // never splits K across several passes when alpha != 1.
            if (alpha != 1.f)
            {
                sum00 *= alpha;
                sum01 *= alpha;
                sum10 *= alpha;
                sum11 *= alpha;
            }

            if (k_end)
            {
                // Final pass: narrow to bf16. Note the order: sum10 (column 1, row 0) is
                // written next to sum00, while sum01 (column 0, row 1) goes to the next row.
                // if (out_elempack == 1)
                {
                    outptr0[0] = float32_to_bfloat16(sum00);
                    outptr0[1] = float32_to_bfloat16(sum10);
                    outptr0[out_hstep] = float32_to_bfloat16(sum01);
                    outptr0[out_hstep + 1] = float32_to_bfloat16(sum11);
                    outptr0 += 2;
                }
            }
            else
            {
                // Not the final pass: keep float partial sums in outptr (same order as the reload above).
                outptr[0] = sum00;
                outptr[1] = sum01;
                outptr[2] = sum10;
                outptr[3] = sum11;
            }

            // outptr advances in both branches
            outptr += 4;
        }
        // Remainder loop: one column per iteration for a pair of rows.
        // sum0 accumulates pA[0] * pB[0] (first row), sum1 accumulates pA[1] * pB[0] (second row).
        for (; jj < max_jj; jj += 1)
        {
            float sum0;
            float sum1;

            if (k == 0)
            {
                // k == 0: start from zero, then optionally seed from pC.
                sum0 = 0.f;
                sum1 = 0.f;

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // single scalar for both rows
                        sum0 = pC[0];
                        sum1 = pC[0];
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // one value per row; pC is not advanced
                        sum0 = pC[0];
                        sum1 = pC[1];
                    }
                    if (broadcast_type_C == 3)
                    {
                        // 2 distinct values, then advance
                        sum0 = pC[0];
                        sum1 = pC[1];
                        pC += 2;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one per-column value shared by both rows
                        sum0 = pC[0];
                        sum1 = pC[0];
                        pC += 1;
                    }
                }
            }
            else
            {
                // k != 0: resume from the float partial sums stored at outptr
                sum0 = outptr[0];
                sum1 = outptr[1];
            }

            // pA restarts at pAT for every jj block; pB keeps advancing from the previous block.
            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float pA0 = bfloat16_to_float32(pA[0]);
                float pA1 = bfloat16_to_float32(pA[1]);
                float pB0 = bfloat16_to_float32(pB[0]);

                sum0 += pA0 * pB0;
                sum1 += pA1 * pB0;
                pA += 2;
                pB += 1;
            }

            // NOTE(review): this scaling runs whether or not k_end is set, so the scaled
            // values are also what gets stored to outptr as partial sums. Confirm the caller
            // never splits K across several passes when alpha != 1.
            if (alpha != 1.f)
            {
                sum0 *= alpha;
                sum1 *= alpha;
            }

            if (k_end)
            {
                // Final pass: narrow to bf16; first row at outptr0, second row out_hstep later.
                // if (out_elempack == 1)
                {
                    outptr0[0] = float32_to_bfloat16(sum0);
                    outptr0[out_hstep] = float32_to_bfloat16(sum1);
                    outptr0++;
                }
            }
            else
            {
                // Not the final pass: keep float partial sums in outptr for the next call.
                outptr[0] = sum0;
                outptr[1] = sum1;
            }

            // outptr advances in both branches
            outptr += 2;
        }

        pAT += max_kk * 2;
    }
    for (; ii < max_ii; ii += 1)
    {
        unsigned short* outptr0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j;

        const unsigned short* pB = pBT;

        if (pC)
        {
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)CT_tile + i + ii;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)CT_tile + j;
            }
        }

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);
                _sum2 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[0]);
                        _sum2 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        _sum2 = vld1q_f32(pC + 8);
                        pC += 12;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4);
                _sum2 = vld1q_f32(outptr + 8);
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                uint16x8_t _pB = vld1q_u16(pB);
                float32x4_t _pB0 = bfloat2float(vget_low_u16(_pB));
                float32x4_t _pB1 = bfloat2float(vget_high_u16(_pB));
                float32x4_t _pB2 = bfloat2float(vld1_u16(pB + 8));

                float32x4_t _pA0 = bfloat2float(vdup_n_u16(pA[0]));

                _sum0 = vfmaq_f32(_sum0, _pA0, _pB0);
                _sum1 = vfmaq_f32(_sum1, _pA0, _pB1);
                _sum2 = vfmaq_f32(_sum2, _pA0, _pB2);

                pA += 1;
                pB += 12;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum0 = vmulq_f32(_sum0, _alpha);
                _sum1 = vmulq_f32(_sum1, _alpha);
                _sum2 = vmulq_f32(_sum2, _alpha);
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum1));
                    vst1_u16(outptr0 + 8, float2bfloat(_sum2));
                    outptr0 += 12;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 8, _sum2);
            }

            outptr += 12;
        }
#endif // __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        pC += 8;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4);
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                uint16x8_t _pB = vld1q_u16(pB);
                float32x4_t _pB0 = bfloat2float(vget_low_u16(_pB));
                float32x4_t _pB1 = bfloat2float(vget_high_u16(_pB));

                float32x4_t _pA0 = bfloat2float(vdup_n_u16(pA[0]));
#if __aarch64__
                _sum0 = vfmaq_f32(_sum0, _pA0, _pB0);
                _sum1 = vfmaq_f32(_sum1, _pA0, _pB1);
#else
                _sum0 = vmlaq_f32(_sum0, _pA0, _pB0);
                _sum1 = vmlaq_f32(_sum1, _pA0, _pB1);
#endif

                pA += 1;
                pB += 8;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum0 = vmulq_f32(_sum0, _alpha);
                _sum1 = vmulq_f32(_sum1, _alpha);
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, float2bfloat(_sum0));
                    vst1_u16(outptr0 + 4, float2bfloat(_sum1));
                    outptr0 += 8;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
            }

            outptr += 8;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _sum;

            if (k == 0)
            {
                _sum = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        _sum = vld1q_f32(pC);
                        pC += 4;
                    }
                }
            }
            else
            {
                _sum = vld1q_f32(outptr);
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float32x4_t _pB = bfloat2float(vld1_u16(pB));
                float32x4_t _pA = bfloat2float(vdup_n_u16(pA[0]));
#if __aarch64__
                _sum = vfmaq_f32(_sum, _pA, _pB);
#else
                _sum = vmlaq_f32(_sum, _pA, _pB);
#endif

                pA += 1;
                pB += 4;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum = vmulq_f32(_sum, _alpha);
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, float2bfloat(_sum));
                    outptr0 += 4;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum);
            }

            outptr += 4;
        }
#endif // __ARM_NEON
        for (; jj + 1 < max_jj; jj += 2)
        {
            float sum0;
            float sum1;

            if (k == 0)
            {
                sum0 = 0.f;
                sum1 = 0.f;

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        sum0 = pC[0];
                        sum1 = pC[0];
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        sum0 = pC[0];
                        sum1 = pC[1];
                        pC += 2;
                    }
                }
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float pA0 = bfloat16_to_float32(pA[0]);
                float pB0 = bfloat16_to_float32(pB[0]);
                float pB1 = bfloat16_to_float32(pB[1]);

                sum0 += pA0 * pB0;
                sum1 += pA0 * pB1;

                pA += 1;
                pB += 2;
            }

            if (alpha != 1.f)
            {
                sum0 *= alpha;
                sum1 *= alpha;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = float32_to_bfloat16(sum0);
                    outptr0[1] = float32_to_bfloat16(sum1);
                    outptr0 += 2;
                }
            }
            else
            {
                outptr[0] = sum0;
                outptr[1] = sum1;
            }

            outptr += 2;
        }
        for (; jj < max_jj; jj += 1)
        {
            float sum;

            if (k == 0)
            {
                sum = 0.f;

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        sum = pC[0];
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        sum = pC[0];
                        pC += 1;
                    }
                }
            }
            else
            {
                sum = outptr[0];
            }

            const unsigned short* pA = pAT;
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
                float pA0 = bfloat16_to_float32(pA[0]);
                float pB0 = bfloat16_to_float32(pB[0]);

                sum += pA0 * pB0;
                pA += 1;
                pB += 1;
            }

            if (alpha != 1.f)
            {
                sum *= alpha;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = float32_to_bfloat16(sum);
                    outptr0++;
                }
            }
            else
            {
                outptr[0] = sum;
            }

            outptr += 1;
        }

        pAT += max_kk;
    }
}


================================================
FILE: src/layer/arm/gemm_bf16s_fp16s.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Pack a tile of A into AT as raw 16-bit words (no numeric conversion is done here).
//
// The tile starts at row index i and depth index k and spans max_ii rows by max_kk
// depth steps. Rows are consumed in groups of 8 (aarch64 only), 4 (NEON only), 2 and
// then 1. Within a group the output is depth-major: for every depth step the values
// of all rows in the group are written next to each other, and groups follow each
// other contiguously in AT.
//
// A       source matrix; A.elempack selects which addressing scheme is used
// AT      destination buffer, filled sequentially from its beginning
// i       first row index of the tile
// max_ii  number of rows to pack
// k       first depth index of the tile
// max_kk  number of depth steps to pack
//
// NOTE(review): elempack 8 is only handled in the 8-row groups and elempack 4 only in
// the 8- and 4-row groups; the 2- and 1-row groups address A as elempack 1. Presumably
// callers keep max_ii a multiple of elempack - confirm against the call sites.
static void pack_A_tile_bf16_fp16(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
    const int elempack = A.elempack;
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;

    const unsigned short* const src = (const unsigned short*)A;
    unsigned short* dst = AT;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    // ---- groups of 8 rows ----
    for (; ii + 7 < max_ii; ii += 8)
    {
        const unsigned short* const row0 = src + (i + ii) * A_hstep;

        if (elempack == 8)
        {
            // source already stores 8 values per depth step: plain copy
            const unsigned short* s = row0 + k * 8;

            for (int n = max_kk; n > 0; n--)
            {
                uint16x8_t _v = vld1q_u16(s);
                vst1q_u16(dst, _v);
                s += 8;
                dst += 8;
            }
        }
        else if (elempack == 4)
        {
            // two 4-wide sources, 4 * A_hstep apart, are concatenated per depth step
            const unsigned short* lo = row0 + k * 4;
            const unsigned short* hi = lo + A_hstep * 4;

            for (int n = max_kk; n > 0; n--)
            {
                uint16x4_t _lo = vld1_u16(lo);
                uint16x4_t _hi = vld1_u16(hi);
                vst1q_u16(dst, vcombine_u16(_lo, _hi));
                lo += 4;
                hi += 4;
                dst += 8;
            }
        }
        else if (elempack == 1)
        {
            // eight independent row pointers, A_hstep apart
            const unsigned short* s0 = row0 + k;
            const unsigned short* s1 = s0 + A_hstep;
            const unsigned short* s2 = s1 + A_hstep;
            const unsigned short* s3 = s2 + A_hstep;
            const unsigned short* s4 = s3 + A_hstep;
            const unsigned short* s5 = s4 + A_hstep;
            const unsigned short* s6 = s5 + A_hstep;
            const unsigned short* s7 = s6 + A_hstep;

            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _v0 = vld1q_u16(s0);
                uint16x8_t _v1 = vld1q_u16(s1);
                uint16x8_t _v2 = vld1q_u16(s2);
                uint16x8_t _v3 = vld1q_u16(s3);
                uint16x8_t _v4 = vld1q_u16(s4);
                uint16x8_t _v5 = vld1q_u16(s5);
                uint16x8_t _v6 = vld1q_u16(s6);
                uint16x8_t _v7 = vld1q_u16(s7);

                // transpose8x8_u16 is defined elsewhere; presumably it turns the 8 row
                // vectors into 8 depth vectors, matching the scalar tail below
                transpose8x8_u16(_v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7);

                vst1q_u16(dst, _v0);
                vst1q_u16(dst + 8, _v1);
                vst1q_u16(dst + 16, _v2);
                vst1q_u16(dst + 24, _v3);
                vst1q_u16(dst + 32, _v4);
                vst1q_u16(dst + 40, _v5);
                vst1q_u16(dst + 48, _v6);
                vst1q_u16(dst + 56, _v7);

                s0 += 8;
                s1 += 8;
                s2 += 8;
                s3 += 8;
                s4 += 8;
                s5 += 8;
                s6 += 8;
                s7 += 8;
                dst += 64;
            }
            for (; kk < max_kk; kk++)
            {
                dst[0] = *s0++;
                dst[1] = *s1++;
                dst[2] = *s2++;
                dst[3] = *s3++;
                dst[4] = *s4++;
                dst[5] = *s5++;
                dst[6] = *s6++;
                dst[7] = *s7++;
                dst += 8;
            }
        }
    }
#endif // __aarch64__
    // ---- groups of 4 rows ----
    for (; ii + 3 < max_ii; ii += 4)
    {
        const unsigned short* const row0 = src + (i + ii) * A_hstep;

        if (elempack == 4)
        {
            // plain copy, two depth steps (8 values) at a time, then a single leftover step
            const unsigned short* s = row0 + k * 4;

            int kk = 0;
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _v = vld1q_u16(s);
                vst1q_u16(dst, _v);
                s += 8;
                dst += 8;
            }
            for (; kk < max_kk; kk++)
            {
                uint16x4_t _v = vld1_u16(s);
                vst1_u16(dst, _v);
                s += 4;
                dst += 4;
            }
        }
        else if (elempack == 1)
        {
            const unsigned short* s0 = row0 + k;
            const unsigned short* s1 = s0 + A_hstep;
            const unsigned short* s2 = s1 + A_hstep;
            const unsigned short* s3 = s2 + A_hstep;

            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // the 4-way interleaving store does the row -> depth-major shuffle
                uint16x8x4_t _rows;
                _rows.val[0] = vld1q_u16(s0);
                _rows.val[1] = vld1q_u16(s1);
                _rows.val[2] = vld1q_u16(s2);
                _rows.val[3] = vld1q_u16(s3);
                vst4q_u16(dst, _rows);
                s0 += 8;
                s1 += 8;
                s2 += 8;
                s3 += 8;
                dst += 32;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x4x4_t _rows;
                _rows.val[0] = vld1_u16(s0);
                _rows.val[1] = vld1_u16(s1);
                _rows.val[2] = vld1_u16(s2);
                _rows.val[3] = vld1_u16(s3);
                vst4_u16(dst, _rows);
                s0 += 4;
                s1 += 4;
                s2 += 4;
                s3 += 4;
                dst += 16;
            }
            for (; kk < max_kk; kk++)
            {
                dst[0] = *s0++;
                dst[1] = *s1++;
                dst[2] = *s2++;
                dst[3] = *s3++;
                dst += 4;
            }
        }
    }
#endif // __ARM_NEON
    // ---- groups of 2 rows (addressed as elempack 1) ----
    for (; ii + 1 < max_ii; ii += 2)
    {
        const unsigned short* s0 = src + (i + ii) * A_hstep + k;
        const unsigned short* s1 = s0 + A_hstep;

        int kk = 0;
#if __ARM_NEON
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8x2_t _pair;
            _pair.val[0] = vld1q_u16(s0);
            _pair.val[1] = vld1q_u16(s1);
            vst2q_u16(dst, _pair);
            s0 += 8;
            s1 += 8;
            dst += 16;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4x2_t _pair;
            _pair.val[0] = vld1_u16(s0);
            _pair.val[1] = vld1_u16(s1);
            vst2_u16(dst, _pair);
            s0 += 4;
            s1 += 4;
            dst += 8;
        }
#endif // __ARM_NEON
        for (; kk < max_kk; kk++)
        {
            dst[0] = *s0++;
            dst[1] = *s1++;
            dst += 2;
        }
    }
    // ---- single leftover rows (addressed as elempack 1) ----
    for (; ii < max_ii; ii += 1)
    {
        const unsigned short* s = src + (i + ii) * A_hstep + k;

        int kk = 0;
#if __ARM_NEON
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8_t _v = vld1q_u16(s);
            vst1q_u16(dst, _v);
            s += 8;
            dst += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4_t _v = vld1_u16(s);
            vst1_u16(dst, _v);
            s += 4;
            dst += 4;
        }
#endif // __ARM_NEON
        for (; kk < max_kk; kk++)
        {
            *dst++ = *s++;
        }
    }
}

// Pack a tile of A into AT as raw 16-bit words, reading A with the depth index as the
// outer (A_hstep-strided) axis and the row index i as the inner axis.
//
// Rows are consumed in groups of 8 (aarch64 only), 4 (NEON only), 2 and then 1. The
// output is depth-major within each group: for every depth step the values of all rows
// in the group are written next to each other.
//
// A       source matrix; A.elempack values are consecutive depth steps for one row index
// AT      destination buffer, filled sequentially from its beginning
// i       first row index of the tile
// max_ii  number of rows to pack
// k       first depth index of the tile (multiplied by A_hstep to locate the source)
// max_kk  number of depth steps to pack
//
// NOTE(review): for elempack 8 / 4 only whole blocks of 8 / 4 depth steps are packed;
// a remainder of max_kk is skipped. Presumably callers keep max_kk a multiple of
// elempack - confirm against the call sites. The 8-row group has no elempack 8 / 4
// handling outside aarch64 / NEON builds because those loops are compiled out.
static void transpose_pack_A_tile_bf16_fp16(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
    const int elempack = A.elempack;
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;

    // source position for depth k; each group below adds its own (i + ii) offset
    const unsigned short* const src_k = (const unsigned short*)A + k * A_hstep;
    unsigned short* dst = AT;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    // ---- groups of 8 rows ----
    for (; ii + 7 < max_ii; ii += 8)
    {
        if (elempack == 8)
        {
            const unsigned short* s = src_k + (i + ii) * 8;
            const size_t stride = A_hstep * 8;

            for (int kk = 0; kk + 7 < max_kk; kk += 8)
            {
                // 64 values = 8 rows x 8 depth steps, depth fastest in memory.
                // vld4q splits by (index % 4); vuzpq then separates depth d from depth d + 4,
                // leaving one vector of 8 rows per depth step.
                uint16x8x4_t _lo = vld4q_u16(s);
                uint16x8x4_t _hi = vld4q_u16(s + 32);
                uint16x8x2_t _d04 = vuzpq_u16(_lo.val[0], _hi.val[0]);
                uint16x8x2_t _d15 = vuzpq_u16(_lo.val[1], _hi.val[1]);
                uint16x8x2_t _d26 = vuzpq_u16(_lo.val[2], _hi.val[2]);
                uint16x8x2_t _d37 = vuzpq_u16(_lo.val[3], _hi.val[3]);

                vst1q_u16(dst, _d04.val[0]);
                vst1q_u16(dst + 8, _d15.val[0]);
                vst1q_u16(dst + 16, _d26.val[0]);
                vst1q_u16(dst + 24, _d37.val[0]);
                vst1q_u16(dst + 32, _d04.val[1]);
                vst1q_u16(dst + 40, _d15.val[1]);
                vst1q_u16(dst + 48, _d26.val[1]);
                vst1q_u16(dst + 56, _d37.val[1]);

                s += stride;
                dst += 64;
            }
        }
        if (elempack == 4)
        {
            const unsigned short* s = src_k + (i + ii) * 4;
            const size_t stride = A_hstep * 4;

            for (int kk = 0; kk + 3 < max_kk; kk += 4)
            {
                // de-interleaving load yields one vector of 8 rows per depth step
                uint16x8x4_t _d0123 = vld4q_u16(s);

                vst1q_u16(dst, _d0123.val[0]);
                vst1q_u16(dst + 8, _d0123.val[1]);
                vst1q_u16(dst + 16, _d0123.val[2]);
                vst1q_u16(dst + 24, _d0123.val[3]);

                s += stride;
                dst += 32;
            }
        }
        if (elempack == 1)
        {
            // 8 row values are already contiguous for each depth step: plain copy
            const unsigned short* s = src_k + (i + ii);

            for (int kk = 0; kk < max_kk; kk++)
            {
                uint16x8_t _v = vld1q_u16(s);
                vst1q_u16(dst, _v);
                s += A_hstep;
                dst += 8;
            }
        }
    }
#endif // __aarch64__
    // ---- groups of 4 rows ----
    for (; ii + 3 < max_ii; ii += 4)
    {
        if (elempack == 8)
        {
            const unsigned short* s = src_k + (i + ii) * 8;
            const size_t stride = A_hstep * 8;

            for (int kk = 0; kk + 7 < max_kk; kk += 8)
            {
                // one vector of 8 depth steps per row; the interleaving store transposes
                uint16x8x4_t _rows;
                _rows.val[0] = vld1q_u16(s);
                _rows.val[1] = vld1q_u16(s + 8);
                _rows.val[2] = vld1q_u16(s + 16);
                _rows.val[3] = vld1q_u16(s + 24);
                vst4q_u16(dst, _rows);

                s += stride;
                dst += 32;
            }
        }
        if (elempack == 4)
        {
            const unsigned short* s = src_k + (i + ii) * 4;
            const size_t stride = A_hstep * 4;

            for (int kk = 0; kk + 3 < max_kk; kk += 4)
            {
                uint16x4x4_t _d0123 = vld4_u16(s);
                uint16x8_t _d01 = vcombine_u16(_d0123.val[0], _d0123.val[1]);
                uint16x8_t _d23 = vcombine_u16(_d0123.val[2], _d0123.val[3]);

                vst1q_u16(dst, _d01);
                vst1q_u16(dst + 8, _d23);

                s += stride;
                dst += 16;
            }
        }
        if (elempack == 1)
        {
            const unsigned short* s = src_k + (i + ii);

            for (int kk = 0; kk < max_kk; kk++)
            {
                uint16x4_t _v = vld1_u16(s);
                vst1_u16(dst, _v);
                s += A_hstep;
                dst += 4;
            }
        }
    }
#endif // __ARM_NEON
    // ---- groups of 2 rows ----
    for (; ii + 1 < max_ii; ii += 2)
    {
#if __ARM_NEON
        if (elempack == 8)
        {
            const unsigned short* s = src_k + (i + ii) * 8;
            const size_t stride = A_hstep * 8;

            for (int kk = 0; kk + 7 < max_kk; kk += 8)
            {
                uint16x8x2_t _rows;
                _rows.val[0] = vld1q_u16(s);
                _rows.val[1] = vld1q_u16(s + 8);
                vst2q_u16(dst, _rows);

                s += stride;
                dst += 16;
            }
        }
        if (elempack == 4)
        {
            const unsigned short* s = src_k + (i + ii) * 4;
            const size_t stride = A_hstep * 4;

            for (int kk = 0; kk + 3 < max_kk; kk += 4)
            {
                uint16x4x2_t _rows;
                _rows.val[0] = vld1_u16(s);
                _rows.val[1] = vld1_u16(s + 4);
                vst2_u16(dst, _rows);

                s += stride;
                dst += 8;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            const unsigned short* s = src_k + (i + ii);

            for (int kk = 0; kk < max_kk; kk++)
            {
                const unsigned short a = s[0];
                const unsigned short b = s[1];
                dst[0] = a;
                dst[1] = b;
                s += A_hstep;
                dst += 2;
            }
        }
    }
    // ---- single leftover rows ----
    for (; ii < max_ii; ii += 1)
    {
#if __ARM_NEON
        if (elempack == 8)
        {
            const unsigned short* s = src_k + (i + ii) * 8;
            const size_t stride = A_hstep * 8;

            for (int kk = 0; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _v = vld1q_u16(s);
                vst1q_u16(dst, _v);
                s += stride;
                dst += 8;
            }
        }
        if (elempack == 4)
        {
            const unsigned short* s = src_k + (i + ii) * 4;
            const size_t stride = A_hstep * 4;

            for (int kk = 0; kk + 3 < max_kk; kk += 4)
            {
                uint16x4_t _v = vld1_u16(s);
                vst1_u16(dst, _v);
                s += stride;
                dst += 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            const unsigned short* s = src_k + (i + ii);

            for (int kk = 0; kk < max_kk; kk++)
            {
                *dst++ = *s;
                s += A_hstep;
            }
        }
    }
}

static void pack_B_tile_bf16_fp16(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk)
{
    const int elempack = B.elempack;
    const size_t B_hstep = B.dims == 3 ? B.cstep : (size_t)B.w;

    unsigned short* pp = BT;

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    for (; jj + 11 < max_jj; jj += 12)
    {
        if (elempack == 8)
        {
            const unsigned short* p0 = (const unsigned short*)B + (j + jj) / 8 * 8 * B_hstep + k * 8;
            const unsigned short* p1 = (const unsigned short*)B + (j + jj + 8) / 8 * 8 * B_hstep + k * 8;

            if ((j + jj) % 8 == 0)
            {
                for (int kk = 0; kk < max_kk; kk++)
                {
                    vst1q_u16(pp, vld1q_u16(p0));
                    vst1_u16(pp + 8, vld1_u16(p1));
                    pp += 12;
                    p0 += 8;
                    p1 += 8;
                }
            }
            if ((j + jj) % 8 == 4)
            {
                for (int kk = 0; kk < max_kk; kk++)
                {
                    vst1_u16(pp, vld1_u16(p0 + 4));
                    vst1q_u16(pp + 4, vld1q_u16(p1));
                    pp += 12;
                    p0 += 8;
                    p1 += 8;
                }
            }
        }
        if (elempack == 4)
        {
            const unsigned short* p0 = (const unsigned short*)B + (j + jj) * B_hstep + k * 4;
            const unsigned short* p1 = (const unsigned short*)B + (j + jj + 4) * B_hstep + k * 4;
            const unsigned short* p2 = (const unsigned short*)B + (j + jj + 8) * B_hstep + k * 4;

            for (int kk = 0; kk < max_kk; kk++)
            {
                vst1_u16(pp, vld1_u16(p0));
                vst1_u16(pp + 4, vld1_u16(p1));
                vst1_u16(pp + 8, vld1_u16(p2));
                pp += 12;
                p0 += 4;
                p1 += 4;
                p2 += 4;
            }
        }
        if (elempack == 1)
        {
            const unsigned short* p0 = (const unsigned short*)B + (j + jj) * B_hstep + k;
            const unsigned short* p1 = (const unsigned short*)B + (j + jj + 1) * B_hstep + k;
            const unsigned short* p2 = (const unsigned short*)B + (j + jj + 2) * B_hstep + k;
            const unsigned short* p3 = (const unsigned short*)B + (j + jj + 3) * B_hstep + k;
            const unsigned short* p4 = (const unsigned short*)B + (j + jj + 4) * B_hstep + k;
            const unsigned short* p5 = (const unsigned short*)B + (j + jj + 5) * B_hstep + k;
            const unsigned short* p6 = (const unsigned short*)B + (j + jj + 6) * B_hstep + k;
            const unsigned short* p7 = (const unsigned short*)B + (j + jj + 7) * B_hstep + k;
            const unsigned short* p8 = (const unsigned short*)B + (j + jj + 8) * B_hstep + k;
            const unsigned short* p9 = (const unsigned short*)B + (j + jj + 9) * B_hstep + k;
            const unsigned short* pa = (const unsigned short*)B + (j + jj + 10) * B_hstep + k;
            const unsigned short* pb = (const unsigned short*)B + (j + jj + 11) * B_hstep + k;

            int kk = 0;
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x4_t _r0 = vld1_u16(p0);
                uint16x4_t _r1 = vld1_u16(p1);
                uint16x4_t _r2 = vld1_u16(p2);
                uint16x4_t _r3 = vld1_u16(p3);
                uint16x4_t _r4 = vld1_u16(p4);
                uint16x4_t _r5 = vld1_u16(p5);
                uint16x4_t _r6 = vld1_u16(p6);
                uint16x4_t _r7 = vld1_u16(p7);
                uint16x4_t _r8 = vld1_u16(p8);
                uint16x4_t _r9 = vld1_u16(p9);
                uint16x4_t _ra = vld1_u16(pa);
                uint16x4_t _rb = vld1_u16(pb);

                transpose4x4_u16(_r0, _r1, _r2, _r3);
                transpose4x4_u16(_r4, _r5, _r6, _r7);
                transpose4x4_u16(_r8, _r9, _ra, _rb);

                vst1_u16(pp, _r0);
                vst1_u16(pp + 4, _r4);
                vst1_u16(pp + 4 * 2, _r8);
                vst1_u16(pp + 4 * 3, _r1);
                vst1_u16(pp + 4 * 4, _r5);
                vst1_u16(pp + 4 * 5, _r9);
                vst1_u16(pp + 4 * 6, _r2);
                vst1_u16(pp + 4 * 7, _r6);
                vst1_u16(pp + 4 * 8, _ra);
                vst1_u16(pp + 4 * 9, _r3);
                vst1_u16(pp + 4 * 10, _r7);
                vst1_u16(pp + 4 * 11, _rb);
                pp += 48;
                p0 += 4;
                p1 += 4;
                p2 += 4;
                p3 += 4;
                p4 += 4;
                p5 += 4;
                p6 += 4;
                p7 += 4;
                p8 += 4;
                p9 += 4;
                pa += 4;
                pb += 4;
            }
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p1[0];
                pp[2] = p2[0];
                pp[3] = p3[0];
                pp[4] = p4[0];
                pp[5] = p5[0];
                pp[6] = p6[0];
                pp[7] = p7[0];
                pp[8] = p8[0];
                pp[9] = p9[0];
                pp[10] = pa[0];
                pp[11] = pb[0];
                pp += 12;
                p0++;
                p1++;
                p2++;
                p3++;
                p4++;
                p5++;
                p6++;
                p7++;
                p8++;
                p9++;
                pa++;
                pb++;
            }
        }
    }
#endif // __aarch64__
    for (; jj + 7 < max_jj; jj += 8)
    {
        if (elempack == 8)
        {
            const unsigned short* p0 = (const unsigned short*)B + (j + jj) / 8 * 8 * B_hstep + k * 8;
            const unsigned short* p1 = (const unsigned short*)B + (j + jj + 8) / 8 * 8 * B_hstep + k * 8;

            if ((j + jj) % 8 == 0)
            {
                for (int kk = 0; kk < max_kk; kk++)
                {
                    vst1q_u16(pp, vld1q_u16(p0));
                    pp += 8;
                    p0 += 8;
                }
            }
            if ((j + jj) % 8 == 4)
            {
                for (int kk = 0; kk < max_kk; kk++)
                {
                    vst1q_u16(pp, vcombine_u16(vld1_u16(p0 + 4), vld1_u16(p1)));
                    pp += 8;
                    p0 += 8;
                    p1 += 8;
                }
            }
        }
        if (elempack == 4)
        {
            const unsigned short* p0 = (const unsigned short*)B + (j + jj) * B_hstep + k * 4;
            const unsigned short* p1 = (const unsigned short*)B + (j + jj + 4) * B_hstep + k * 4;

            for (int kk = 0; kk < max_kk; kk++)
            {
                uint16x8_t _r0 = vcombine_u16(vld1_u16(p0), vld1_u16(p1));
                vst1q_u16(pp, _r0);
                pp += 8;
                p0 += 4;
                p1 += 4;
            }
        }
        if (elempack == 1)
        {
            const unsigned short* p0 = (const unsigned short*)B + (j + jj) * B_hstep + k;
            const unsigned short* p1 = (const unsigned short*)B + (j + jj + 1) * B_hstep + k;
            const unsigned short* p2 = (const unsigned short*)B + (j + jj + 2) * B_hstep + k;
            const unsigned short* p3 = (const unsigned short*)B + (j + jj + 3) * B_hstep + k;
            const unsigned short* p4 = (const unsigned short*)B + (j + jj + 4) * B_hstep + k;
            const unsigned short* p5 = (const unsigned short*)B + (j + jj + 5) * B_hstep + k;
            const unsigned short* p6 = (const unsigned short*)B + (j + jj + 6) * B_hstep + k;
            const unsigned short* p7 = (const unsigned short*)B + (j + jj + 7) * B_hstep + k;

            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _r0 = vld1q_u16(p0);
                uint16x8_t _r1 = vld1q_u16(p1);
                uint16x8_t _r2 = vld1q_u16(p2);
                uint16x8_t _r3 = vld1q_u16(p3);
                uint16x8_t _r4 = vld1q_u16(p4);
                uint16x8_t _r5 = vld1q_u16(p5);
                uint16x8_t _r6 = vld1q_u16(p6);
                uint16x8_t _r7 = vld1q_u16(p7);
                transpose8x8_u16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7);
                vst1q_u16(pp, _r0);
                vst1q_u16(pp + 8, _r1);
                vst1q_u16(pp + 8 * 2, _r2);
                vst1q_u16(pp + 8 * 3, _r3);
                vst1q_u16(pp + 8 * 4, _r4);
                vst1q_u16(pp + 8 * 5, _r5);
                vst1q_u16(pp + 8 * 6, _r6);
                vst1q_u16(pp + 8 * 7, _r7);
                pp += 64;
                p0 += 8;
                p1 += 8;
                p2 += 8;
                p3 += 8;
                p4 += 8;
                p5 += 8;
                p6 += 8;
                p7 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x4_t _r0 = vld1_u16(p0);
                uint16x4_t _r1 = vld1_u16(p1);
                uint16x4_t _r2 = vld1_u16(p2);
                uint16x4_t _r3 = vld1_u16(p3);
                uint16x4_t _r4 = vld1_u16(p4);
                uint16x4_t _r5 = vld1_u16(p5);
                uint16x4_t _r6 = vld1_u16(p6);
                uint16x4_t _r7 = vld1_u16(p7);

                transpose4x4_u16(_r0, _r1, _r2, _r3);
                transpose4x4_u16(_r4, _r5, _r6, _r7);

                vst1_u16(pp, _r0);
                vst1_u16(pp + 4, _r4);
                vst1_u16(pp + 4 * 2, _r1);
                vst1_u16(pp + 4 * 3, _r5);
                vst1_u16(pp + 4 * 4, _r2);
                vst1_u16(pp + 4 * 5, _r6);
                vst1_u16(pp + 4 * 6, _r3);
                vst1_u16(pp + 4 * 7, _r7);
                pp += 32;
                p0 += 4;
                p1 += 4;
                p2 += 4;
                p3 += 4;
                p4 += 4;
                p5 += 4;
                p6 += 4;
                p7 += 4;
            }
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p1[0];
                pp[2] = p2[0];
                pp[3] = p3[0];
                pp[4] = p4[0];
                pp[5] = p5[0];
                pp[6] = p6[0];
                pp[7] = p7[0];
                pp += 8;
                p0++;
                p1++;
                p2++;
                p3++;
                p4++;
                p5++;
                p6++;
                p7++;
            }
        }
    }
    for (; jj + 3 < max_jj; jj += 4)
    {
        if (elempack == 8)
        {
            const unsigned short* p0 = (const unsigned short*)B + (j + jj) / 8 * 8 * B_hstep + k * 8;

            if ((j + jj) % 8 == 0)
            {
                for (int kk = 0; kk < max_kk; kk++)
                {
                    vst1_u16(pp, vld1_u16(p0));
                    pp += 4;
                    p0 += 8;
                }
            }
            if ((j + jj) % 8 == 4)
            {
                for (int kk = 0; kk < max_kk; kk++)
                {
                    vst1_u16(pp, vld1_u16(p0 + 4));
                    pp += 4;
                    p0 += 8;
                }
            }
        }
        if (elempack == 4)
        {
            const unsigned short* p0 = (const unsigned short*)B + (j + jj) * B_hstep + k * 4;

            int kk = 0;
            for (; kk + 1 < max_kk; kk += 2)
            {
                vst1q_u16(pp, vld1q_u16(p0));
                pp += 8;
                p0 += 8;
            }
            for (; kk < max_kk; kk++)
            {
                vst1_u16(pp, vld1_u16(p0));
                pp += 4;
                p0 += 4;
            }
        }
        if (elempack == 1)
        {
            const unsigned short* p0 = (const unsigned short*)B + (j + jj) * B_hstep + k;
            const unsigned short* p1 = (const unsigned short*)B + (j + jj + 1) * B_hstep + k;
            const unsigned short* p2 = (const unsigned short*)B + (j + jj + 2) * B_hstep + k;
            const unsigned short* p3 = (const unsigned short*)B + (j + jj + 3) * B_hstep + k;

            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8x4_t _r0123;
                _r0123.val[0] = vld1q_u16(p0);
                _r0123.val[1] = vld1q_u16(p1);
                _r0123.val[2] = vld1q_u16(p2);
                _r0123.val[3] = vld1q_u16(p3);
                vst4q_u16(pp, _r0123);
                pp += 32;
                p0 += 8;
                p1 += 8;
                p2 += 8;
                p3 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x4x4_t _r0123;
                _r0123.val[0] = vld1_u16(p0);
                _r0123.val[1] = vld1_u16(p1);
                _r0123.val[2] = vld1_u16(p2);
                _r0123.val[3] = vld1_u16(p3);
                vst4_u16(pp, _r0123);
                pp += 16;
                p0 += 4;
                p1 += 4;
                p2 += 4;
                p3 += 4;
            }
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p1[0];
                pp[2] = p2[0];
                pp[3] = p3[0];
                pp += 4;
                p0++;
                p1++;
                p2++;
                p3++;
            }
        }
    }
#endif // __ARM_NEON
    for (; jj + 1 < max_jj; jj += 2)
    {
        // if (elempack == 1)
        {
            const unsigned short* p0 = (const unsigned short*)B + (j + jj) * B_hstep + k;
            const unsigned short* p1 = (const unsigned short*)B + (j + jj + 1) * B_hstep + k;

            int kk = 0;
#if __ARM_NEON
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8x2_t _r01;
                _r01.val[0] = vld1q_u16(p0);
                _r01.val[1] = vld1q_u16(p1);
                vst2q_u16(pp, _r01);
                pp += 16;
                p0 += 8;
                p1 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x4x2_t _r01;
                _r01.val[0] = vld1_u16(p0);
                _r01.val[1] = vld1_u16(p1);
                vst2_u16(pp, _r01);
                pp += 8;
                p0 += 4;
                p1 += 4;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p1[0];
                pp += 2;
                p0++;
                p1++;
            }
        }
    }
    for (; jj < max_jj; jj += 1)
    {
        // if (elempack == 1)
        {
            const unsigned short* p0 = (const unsigned short*)B + (j + jj) * B_hstep + k;

            int kk = 0;
#if __ARM_NEON
            for (; kk + 7 < max_kk; kk += 8)
            {
                vst1q_u16(pp, vld1q_u16(p0));
                pp += 8;
                p0 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                vst1_u16(pp, vld1_u16(p0));
                pp += 4;
                p0 += 4;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp += 1;
                p0++;
            }
        }
    }
}

// Copy a tile of B into the packed buffer BT, reading B with k as the row index and
// j as the position inside a row (B_hstep is the row stride, taken from cstep for
// 3-dim mats and from w otherwise).
//
// Elements are moved as raw 16-bit words (unsigned short); no value conversion is done.
//
// Parameters
//   B       source matrix; B.elempack selects the 8 / 4 / 1 code path
//   BT      destination buffer, written sequentially through pp
//   j       first column of the tile, max_jj = number of columns in the tile
//   k       first k index of the tile, max_kk = number of k steps in the tile
//
// Output layout
//   Columns are consumed in blocks of 12 (aarch64 only), 8 and 4 (NEON only), then 2
//   and 1. Inside a block the data is k-major: for each k step the block's column
//   values are written contiguously, so a block of width W emits W * max_kk elements.
//
// Notes
//   - For elempack 8 / 4 the source pointer advances by B_hstep * elempack while kk
//     advances by elempack, and the loops have no scalar tail: a max_kk that is not a
//     multiple of elempack leaves the remainder unpacked.
//     NOTE(review): presumably the caller guarantees divisibility - confirm.
//   - Without __ARM_NEON only the elempack == 1 branches are compiled.
static void transpose_pack_B_tile_bf16_fp16(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk)
{
    const int elempack = B.elempack;
    const size_t B_hstep = B.dims == 3 ? B.cstep : (size_t)B.w;

    unsigned short* pp = BT;

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    // ---- blocks of 12 columns (aarch64 only) ----
    for (; jj + 11 < max_jj; jj += 12)
    {
        if (elempack == 8)
        {
            const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * 8;

            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // each vld4q reads 4 columns x 8 lanes and de-interleaves with stride 4,
                // so val[n] holds lanes n and n+4 of those four columns
                uint16x8x4_t _r0123 = vld4q_u16(p0);
                uint16x8x4_t _r4567 = vld4q_u16(p0 + 32);
                uint16x8x4_t _r89ab = vld4q_u16(p0 + 64);
                // unzip separates lane n (val[0]) from lane n+4 (val[1]) across columns 0..7
                uint16x8x2_t _r04 = vuzpq_u16(_r0123.val[0], _r4567.val[0]);
                uint16x8x2_t _r15 = vuzpq_u16(_r0123.val[1], _r4567.val[1]);
                uint16x8x2_t _r26 = vuzpq_u16(_r0123.val[2], _r4567.val[2]);
                uint16x8x2_t _r37 = vuzpq_u16(_r0123.val[3], _r4567.val[3]);
                // same split for the remaining columns 8..11
                uint16x4x2_t _r04_1 = vuzp_u16(vget_low_u16(_r89ab.val[0]), vget_high_u16(_r89ab.val[0]));
                uint16x4x2_t _r15_1 = vuzp_u16(vget_low_u16(_r89ab.val[1]), vget_high_u16(_r89ab.val[1]));
                uint16x4x2_t _r26_1 = vuzp_u16(vget_low_u16(_r89ab.val[2]), vget_high_u16(_r89ab.val[2]));
                uint16x4x2_t _r37_1 = vuzp_u16(vget_low_u16(_r89ab.val[3]), vget_high_u16(_r89ab.val[3]));
                // emit 8 groups of 12 (8 + 4) values, lane order 0,1,2,3,4,5,6,7
                vst1q_u16(pp, _r04.val[0]);
                vst1_u16(pp + 8, _r04_1.val[0]);
                vst1q_u16(pp + 12, _r15.val[0]);
                vst1_u16(pp + 20, _r15_1.val[0]);
                vst1q_u16(pp + 24, _r26.val[0]);
                vst1_u16(pp + 32, _r26_1.val[0]);
                vst1q_u16(pp + 36, _r37.val[0]);
                vst1_u16(pp + 44, _r37_1.val[0]);
                vst1q_u16(pp + 48, _r04.val[1]);
                vst1_u16(pp + 56, _r04_1.val[1]);
                vst1q_u16(pp + 60, _r15.val[1]);
                vst1_u16(pp + 68, _r15_1.val[1]);
                vst1q_u16(pp + 72, _r26.val[1]);
                vst1_u16(pp + 80, _r26_1.val[1]);
                vst1q_u16(pp + 84, _r37.val[1]);
                vst1_u16(pp + 92, _r37_1.val[1]);
                pp += 96;
                p0 += B_hstep * 8;
            }
        }
        if (elempack == 4)
        {
            const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * 4;

            int kk = 0;
            for (; kk + 3 < max_kk; kk += 4)
            {
                // stride-4 de-interleave: val[n] gathers lane n of 8 columns (q load)
                // and of the last 4 columns (d load)
                uint16x8x4_t _r0123 = vld4q_u16(p0);
                uint16x4x4_t _r89ab = vld4_u16(p0 + 32);
                vst1q_u16(pp, _r0123.val[0]);
                vst1_u16(pp + 8, _r89ab.val[0]);
                vst1q_u16(pp + 12, _r0123.val[1]);
                vst1_u16(pp + 20, _r89ab.val[1]);
                vst1q_u16(pp + 24, _r0123.val[2]);
                vst1_u16(pp + 32, _r89ab.val[2]);
                vst1q_u16(pp + 36, _r0123.val[3]);
                vst1_u16(pp + 44, _r89ab.val[3]);
                pp += 48;
                p0 += B_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                // the 12 columns are already contiguous in a row: straight copy (8 + 4)
                vst1q_u16(pp, vld1q_u16(p0));
                vst1_u16(pp + 8, vld1_u16(p0 + 8));
                pp += 12;
                p0 += B_hstep;
            }
        }
    }
#endif // __aarch64__
    // ---- blocks of 8 columns ----
    for (; jj + 7 < max_jj; jj += 8)
    {
        if (elempack == 8)
        {
            const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * 8;

            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // same vld4q + vuzpq scheme as the 12-column path, without the extra 4 columns
                uint16x8x4_t _r0123 = vld4q_u16(p0);
                uint16x8x4_t _r4567 = vld4q_u16(p0 + 32);
                uint16x8x2_t _r04 = vuzpq_u16(_r0123.val[0], _r4567.val[0]);
                uint16x8x2_t _r15 = vuzpq_u16(_r0123.val[1], _r4567.val[1]);
                uint16x8x2_t _r26 = vuzpq_u16(_r0123.val[2], _r4567.val[2]);
                uint16x8x2_t _r37 = vuzpq_u16(_r0123.val[3], _r4567.val[3]);
                vst1q_u16(pp, _r04.val[0]);
                vst1q_u16(pp + 8, _r15.val[0]);
                vst1q_u16(pp + 16, _r26.val[0]);
                vst1q_u16(pp + 24, _r37.val[0]);
                vst1q_u16(pp + 32, _r04.val[1]);
                vst1q_u16(pp + 40, _r15.val[1]);
                vst1q_u16(pp + 48, _r26.val[1]);
                vst1q_u16(pp + 56, _r37.val[1]);
                pp += 64;
                p0 += B_hstep * 8;
            }
        }
        if (elempack == 4)
        {
            const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * 4;

            int kk = 0;
            for (; kk + 3 < max_kk; kk += 4)
            {
                // val[n] = lane n of the 8 columns, i.e. one output group per val[]
                uint16x8x4_t _r0123 = vld4q_u16(p0);
                vst1q_u16(pp, _r0123.val[0]);
                vst1q_u16(pp + 8, _r0123.val[1]);
                vst1q_u16(pp + 16, _r0123.val[2]);
                vst1q_u16(pp + 24, _r0123.val[3]);
                pp += 32;
                p0 += B_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                vst1q_u16(pp, vld1q_u16(p0));
                pp += 8;
                p0 += B_hstep;
            }
        }
    }
    // ---- blocks of 4 columns ----
    for (; jj + 3 < max_jj; jj += 4)
    {
        if (elempack == 8)
        {
            const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * 8;

            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // one register per column; the interleaving store emits
                // col0[n], col1[n], col2[n], col3[n] for n = 0..7
                uint16x8x4_t _r0123;
                _r0123.val[0] = vld1q_u16(p0);
                _r0123.val[1] = vld1q_u16(p0 + 8);
                _r0123.val[2] = vld1q_u16(p0 + 16);
                _r0123.val[3] = vld1q_u16(p0 + 24);
                vst4q_u16(pp, _r0123);
                pp += 32;
                p0 += B_hstep * 8;
            }
        }
        if (elempack == 4)
        {
            const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * 4;

            int kk = 0;
            for (; kk + 3 < max_kk; kk += 4)
            {
                // de-interleaving load performs the 4x4 transpose; val[n] = lane n of 4 columns
                uint16x4x4_t _r0123 = vld4_u16(p0);
                vst1q_u16(pp, vcombine_u16(_r0123.val[0], _r0123.val[1]));
                vst1q_u16(pp + 8, vcombine_u16(_r0123.val[2], _r0123.val[3]));
                pp += 16;
                p0 += B_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                vst1_u16(pp, vld1_u16(p0));
                pp += 4;
                p0 += B_hstep;
            }
        }
    }
#endif // __ARM_NEON
    // ---- blocks of 2 columns ----
    for (; jj + 1 < max_jj; jj += 2)
    {
#if __ARM_NEON
        if (elempack == 8)
        {
            const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * 8;

            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // interleave the two columns lane by lane
                uint16x8x2_t _r01;
                _r01.val[0] = vld1q_u16(p0);
                _r01.val[1] = vld1q_u16(p0 + 8);
                vst2q_u16(pp, _r01);
                pp += 16;
                p0 += B_hstep * 8;
            }
        }
        if (elempack == 4)
        {
            const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * 4;

            int kk = 0;
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x4x2_t _r01;
                _r01.val[0] = vld1_u16(p0);
                _r01.val[1] = vld1_u16(p0 + 4);
                vst2_u16(pp, _r01);
                pp += 8;
                p0 += B_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp[1] = p0[1];
                pp += 2;
                p0 += B_hstep;
            }
        }
    }
    // ---- remaining single column ----
    for (; jj < max_jj; jj += 1)
    {
#if __ARM_NEON
        if (elempack == 8)
        {
            const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * 8;

            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                // the 8 lanes of a single column are already in output order
                vst1q_u16(pp, vld1q_u16(p0));
                pp += 8;
                p0 += B_hstep * 8;
            }
        }
        if (elempack == 4)
        {
            const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * 4;

            int kk = 0;
            for (; kk + 3 < max_kk; kk += 4)
            {
                vst1_u16(pp, vld1_u16(p0));
                pp += 4;
                p0 += B_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj);

            int kk = 0;
            for (; kk < max_kk; kk++)
            {
                pp[0] = p0[0];
                pp += 1;
                p0 += B_hstep;
            }
        }
    }
}

// Scatter a packed result tile (topT) into top_blob in transposed orientation:
// the tile's jj index selects the destination row (row stride out_hstep) and the
// tile's ii index selects the position inside the row.
//
// Elements are moved as raw 16-bit words (unsigned short); no value conversion is done.
//
// Parameters
//   topT      packed source tile, read sequentially through pp
//   top_blob  destination; top_blob.elempack selects the 8 / 4 / 1 code path
//   i         first ii position of the tile, max_ii = tile extent along ii
//   j         first jj position of the tile, max_jj = tile extent along jj
//
// Source layout
//   ii is consumed in blocks of 8 (aarch64 only), 4 (NEON only), 2, then 1. Inside a
//   block, topT holds for every jj the block's ii values contiguously.
//
// Notes
//   - out_elempack == 8 is only compiled for __aarch64__. A destination packed row
//     holds 8 jj values per ii; when j % 8 == 4 the first four jj values are written
//     to lanes 4..7 of the current packed row before continuing with full rows.
//     NOTE(review): that prologue runs without checking max_jj - assumes max_jj >= 4.
//   - The out_elempack 8 / 4 loops have no tail for max_jj % 4 != 0.
//     NOTE(review): presumably the caller guarantees j and max_jj are multiples of 4
//     on these paths - confirm.
//   - Without __ARM_NEON only the out_elempack == 1 branches are compiled.
static void transpose_unpack_output_tile_bf16_fp16(const Mat& topT, Mat& top_blob, int i, int max_ii, int j, int max_jj)
{
    const int out_elempack = top_blob.elempack;
    const size_t out_hstep = top_blob.dims == 3 ? top_blob.cstep : (size_t)top_blob.w;

    const unsigned short* pp = topT;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    // ---- blocks of 8 ii (aarch64 only) ----
    for (; ii + 7 < max_ii; ii += 8)
    {
        if (out_elempack == 8)
        {
            unsigned short* p0 = (unsigned short*)top_blob + (j / 8 * 8) * out_hstep + (i + ii) * 8;

            int jj = 0;
            if (j % 8 == 4)
            {
                // tile starts in the middle of a packed row: fill lanes 4..7 of 8 ii slots
                uint16x8_t _r0 = vld1q_u16(pp);
                uint16x8_t _r1 = vld1q_u16(pp + 8);
                uint16x8_t _r2 = vld1q_u16(pp + 8 * 2);
                uint16x8_t _r3 = vld1q_u16(pp + 8 * 3);
                // transpose8x4_u16 is defined elsewhere; its results are consumed as
                // eight 4-lane halves, one per ii slot (low/high of _r0.._r3 in order)
                transpose8x4_u16(_r0, _r1, _r2, _r3);
                vst1_u16(p0 + 4, vget_low_u16(_r0));
                vst1_u16(p0 + 8 + 4, vget_high_u16(_r0));
                vst1_u16(p0 + 8 * 2 + 4, vget_low_u16(_r1));
                vst1_u16(p0 + 8 * 3 + 4, vget_high_u16(_r1));
                vst1_u16(p0 + 8 * 4 + 4, vget_low_u16(_r2));
                vst1_u16(p0 + 8 * 5 + 4, vget_high_u16(_r2));
                vst1_u16(p0 + 8 * 6 + 4, vget_low_u16(_r3));
                vst1_u16(p0 + 8 * 7 + 4, vget_high_u16(_r3));
                pp += 32;
                p0 += out_hstep * 8;
                jj += 4;
            }
            for (; jj + 7 < max_jj; jj += 8)
            {
                // full 8x8 block: transpose in registers, then store one register per ii slot
                uint16x8_t _r0 = vld1q_u16(pp);
                uint16x8_t _r1 = vld1q_u16(pp + 8);
                uint16x8_t _r2 = vld1q_u16(pp + 8 * 2);
                uint16x8_t _r3 = vld1q_u16(pp + 8 * 3);
                uint16x8_t _r4 = vld1q_u16(pp + 8 * 4);
                uint16x8_t _r5 = vld1q_u16(pp + 8 * 5);
                uint16x8_t _r6 = vld1q_u16(pp + 8 * 6);
                uint16x8_t _r7 = vld1q_u16(pp + 8 * 7);
                transpose8x8_u16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7);
                vst1q_u16(p0, _r0);
                vst1q_u16(p0 + 8, _r1);
                vst1q_u16(p0 + 8 * 2, _r2);
                vst1q_u16(p0 + 8 * 3, _r3);
                vst1q_u16(p0 + 8 * 4, _r4);
                vst1q_u16(p0 + 8 * 5, _r5);
                vst1q_u16(p0 + 8 * 6, _r6);
                vst1q_u16(p0 + 8 * 7, _r7);
                pp += 64;
                p0 += out_hstep * 8;
            }
            for (; jj + 3 < max_jj; jj += 4)
            {
                // trailing half row: fill lanes 0..3 of 8 ii slots
                uint16x8_t _r0 = vld1q_u16(pp);
                uint16x8_t _r1 = vld1q_u16(pp + 8);
                uint16x8_t _r2 = vld1q_u16(pp + 8 * 2);
                uint16x8_t _r3 = vld1q_u16(pp + 8 * 3);
                transpose8x4_u16(_r0, _r1, _r2, _r3);
                vst1_u16(p0, vget_low_u16(_r0));
                vst1_u16(p0 + 8, vget_high_u16(_r0));
                vst1_u16(p0 + 8 * 2, vget_low_u16(_r1));
                vst1_u16(p0 + 8 * 3, vget_high_u16(_r1));
                vst1_u16(p0 + 8 * 4, vget_low_u16(_r2));
                vst1_u16(p0 + 8 * 5, vget_high_u16(_r2));
                vst1_u16(p0 + 8 * 6, vget_low_u16(_r3));
                vst1_u16(p0 + 8 * 7, vget_high_u16(_r3));
                pp += 32;
                p0 += out_hstep * 8;
            }
        }
        if (out_elempack == 4)
        {
            unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * 4;

            for (int jj = 0; jj + 3 < max_jj; jj += 4)
            {
                // one register per jj; the interleaving store writes the 4 jj values
                // of each ii next to each other
                uint16x8x4_t _r0123;
                _r0123.val[0] = vld1q_u16(pp);
                _r0123.val[1] = vld1q_u16(pp + 8);
                _r0123.val[2] = vld1q_u16(pp + 8 * 2);
                _r0123.val[3] = vld1q_u16(pp + 8 * 3);
                vst4q_u16(p0, _r0123);
                pp += 32;
                p0 += out_hstep * 4;
            }
        }
        if (out_elempack == 1)
        {
            unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii);

            for (int jj = 0; jj < max_jj; jj += 1)
            {
                // 8 ii values are contiguous on both sides: straight copy, one row per jj
                uint16x8_t _r0 = vld1q_u16(pp);
                vst1q_u16(p0, _r0);
                pp += 8;
                p0 += out_hstep;
            }
        }
    }
#endif // __aarch64__
    // ---- blocks of 4 ii ----
    for (; ii + 3 < max_ii; ii += 4)
    {
#if __aarch64__
        if (out_elempack == 8)
        {
            unsigned short* p0 = (unsigned short*)top_blob + (j / 8 * 8) * out_hstep + (i + ii) * 8;

            int jj = 0;
            if (j % 8 == 4)
            {
                // de-interleaving load transposes 4 jj x 4 ii; val[n] = the 4 jj values of ii n,
                // stored into lanes 4..7 of that ii slot
                uint16x4x4_t _r0123 = vld4_u16(pp);
                vst1_u16(p0 + 4, _r0123.val[0]);
                vst1_u16(p0 + 8 + 4, _r0123.val[1]);
                vst1_u16(p0 + 16 + 4, _r0123.val[2]);
                vst1_u16(p0 + 24 + 4, _r0123.val[3]);
                pp += 16;
                p0 += out_hstep * 8;
                jj += 4;
            }
            for (; jj + 7 < max_jj; jj += 8)
            {
                // val[n] = the 8 jj values of ii n
                uint16x8x4_t _r0123 = vld4q_u16(pp);
                vst1q_u16(p0, _r0123.val[0]);
                vst1q_u16(p0 + 8, _r0123.val[1]);
                vst1q_u16(p0 + 16, _r0123.val[2]);
                vst1q_u16(p0 + 24, _r0123.val[3]);
                pp += 32;
                p0 += out_hstep * 8;
            }
            for (; jj + 3 < max_jj; jj += 4)
            {
                // trailing half row: lanes 0..3 of each ii slot
                uint16x4x4_t _r0123 = vld4_u16(pp);
                vst1_u16(p0, _r0123.val[0]);
                vst1_u16(p0 + 8, _r0123.val[1]);
                vst1_u16(p0 + 16, _r0123.val[2]);
                vst1_u16(p0 + 24, _r0123.val[3]);
                pp += 16;
                p0 += out_hstep * 8;
            }
        }
#endif // __aarch64__
        if (out_elempack == 4)
        {
            unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * 4;

            for (int jj = 0; jj + 3 < max_jj; jj += 4)
            {
                // 4x4 transpose done by the interleaving store
                uint16x4x4_t _r0123;
                _r0123.val[0] = vld1_u16(pp);
                _r0123.val[1] = vld1_u16(pp + 4);
                _r0123.val[2] = vld1_u16(pp + 8);
                _r0123.val[3] = vld1_u16(pp + 12);
                vst4_u16(p0, _r0123);
                pp += 16;
                p0 += out_hstep * 4;
            }
        }
        if (out_elempack == 1)
        {
            unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii);

            for (int jj = 0; jj < max_jj; jj += 1)
            {
                uint16x4_t _r0 = vld1_u16(pp);
                vst1_u16(p0, _r0);
                pp += 4;
                p0 += out_hstep;
            }
        }
    }
#endif // __ARM_NEON
    // ---- blocks of 2 ii ----
    for (; ii + 1 < max_ii; ii += 2)
    {
#if __ARM_NEON
#if __aarch64__
        if (out_elempack == 8)
        {
            unsigned short* p0 = (unsigned short*)top_blob + (j / 8 * 8) * out_hstep + (i + ii) * 8;

            int jj = 0;
            if (j % 8 == 4)
            {
                // source is (ii0, ii1) pairs per jj: even indices belong to the first ii
                // slot, odd indices to the second; written to lanes 4..7 of each slot
                p0[0 + 4] = pp[0];
                p0[1 + 4] = pp[2];
                p0[2 + 4] = pp[4];
                p0[3 + 4] = pp[6];
                p0[8 + 4] = pp[1];
                p0[9 + 4] = pp[3];
                p0[10 + 4] = pp[5];
                p0[11 + 4] = pp[7];
                pp += 8;
                p0 += out_hstep * 8;
                jj += 4;
            }
            for (; jj + 7 < max_jj; jj += 8)
            {
                // full packed row for both ii slots
                p0[0] = pp[0];
                p0[1] = pp[2];
                p0[2] = pp[4];
                p0[3] = pp[6];
                p0[4] = pp[8];
                p0[5] = pp[10];
                p0[6] = pp[12];
                p0[7] = pp[14];
                p0[8] = pp[1];
                p0[9] = pp[3];
                p0[10] = pp[5];
                p0[11] = pp[7];
                p0[12] = pp[9];
                p0[13] = pp[11];
                p0[14] = pp[13];
                p0[15] = pp[15];
                pp += 16;
                p0 += out_hstep * 8;
            }
            for (; jj + 3 < max_jj; jj += 4)
            {
                // trailing half row: lanes 0..3 of both ii slots
                p0[0] = pp[0];
                p0[1] = pp[2];
                p0[2] = pp[4];
                p0[3] = pp[6];
                p0[8] = pp[1];
                p0[9] = pp[3];
                p0[10] = pp[5];
                p0[11] = pp[7];
                pp += 8;
                p0 += out_hstep * 8;
            }
        }
#endif // __aarch64__
        if (out_elempack == 4)
        {
            unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * 4;

            for (int jj = 0; jj + 3 < max_jj; jj += 4)
            {
                // even source indices -> first ii slot, odd -> second ii slot
                p0[0] = pp[0];
                p0[1] = pp[2];
                p0[2] = pp[4];
                p0[3] = pp[6];
                p0[4] = pp[1];
                p0[5] = pp[3];
                p0[6] = pp[5];
                p0[7] = pp[7];
                pp += 8;
                p0 += out_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (out_elempack == 1)
        {
            unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii);

            for (int jj = 0; jj < max_jj; jj += 1)
            {
                p0[0] = pp[0];
                p0[1] = pp[1];
                pp += 2;
                p0 += out_hstep;
            }
        }
    }
    // ---- remaining single ii ----
    for (; ii < max_ii; ii += 1)
    {
#if __ARM_NEON
#if __aarch64__
        if (out_elempack == 8)
        {
            unsigned short* p0 = (unsigned short*)top_blob + (j / 8 * 8) * out_hstep + (i + ii) * 8;

            int jj = 0;
            if (j % 8 == 4)
            {
                // first four jj values complete lanes 4..7 of the current packed row
                uint16x4_t _r0 = vld1_u16(pp);
                vst1_u16(p0 + 4, _r0);
                pp += 4;
                p0 += out_hstep * 8;
                jj += 4;
            }
            for (; jj + 7 < max_jj; jj += 8)
            {
                uint16x8_t _r0 = vld1q_u16(pp);
                vst1q_u16(p0, _r0);
                pp += 8;
                p0 += out_hstep * 8;
            }
            for (; jj + 3 < max_jj; jj += 4)
            {
                uint16x4_t _r0 = vld1_u16(pp);
                vst1_u16(p0, _r0);
                pp += 4;
                p0 += out_hstep * 8;
            }
        }
#endif // __aarch64__
        if (out_elempack == 4)
        {
            unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * 4;

            for (int jj = 0; jj + 3 < max_jj; jj += 4)
            {
                uint16x4_t _r0 = vld1_u16(pp);
                vst1_u16(p0, _r0);
                pp += 4;
                p0 += out_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (out_elempack == 1)
        {
            unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii);

            for (int jj = 0; jj < max_jj; jj += 1)
            {
                p0[0] = pp[0];
                pp += 1;
                p0 += out_hstep;
            }
        }
    }
}

// Derive GEMM tile extents (TILE_M, TILE_N, TILE_K) from the L2 cache size, the
// problem size and the thread count.
//
//   M, N, K            problem extents; a value <= 0 skips the balancing step for that axis
//   constant_TILE_*    when > 0, overrides the computed tile (rounded up to 8 / 4 / 8)
//   TILE_M/N/K         outputs; M and K tiles are multiples of 8, N tiles multiples of 4
//   nT                 thread count; 0 selects get_physical_big_cpu_count()
static void get_optimal_tile_mnk_bf16s_fp16s(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
    const size_t cache_bytes = get_cpu_level2_cache_size();

    const int threads = (nT == 0) ? get_physical_big_cpu_count() : nT;

    // edge of a square tile, budgeting two 16-bit values plus one float per cell
    const size_t bytes_per_cell = 2 * sizeof(unsigned short) + sizeof(float);
    int edge = (int)sqrtf((float)cache_bytes / bytes_per_cell);

    int tile_m = std::max(8, edge / 8 * 8);
    int tile_n = std::max(4, edge / 4 * 4);
    int tile_k = std::max(8, edge / 8 * 8);

    if (K > 0)
    {
        // spread K evenly over the number of K tiles, rounded up to a multiple of 8
        const int k_tiles = (K + tile_k - 1) / tile_k;
        const int k_share = (K + k_tiles - 1) / k_tiles;
        tile_k = std::min(tile_k, (k_share + 7) / 8 * 8);

        if (k_tiles == 1)
        {
            // all of K fits in one tile: re-derive the M/N edge from the actual K tile
            edge = (int)((float)cache_bytes / 2 / sizeof(unsigned short) / tile_k);

            tile_m = std::max(8, edge / 8 * 8);
            tile_n = std::max(4, edge / 4 * 4);
        }
    }

    // scale M by the usable thread count before balancing against M
    tile_m *= std::min(threads, get_physical_cpu_count());

    if (M > 0)
    {
        const int m_tiles = (M + tile_m - 1) / tile_m;
        const int m_share = (M + m_tiles - 1) / m_tiles;
        tile_m = std::min(tile_m, (m_share + 7) / 8 * 8);
    }

    if (N > 0)
    {
        const int n_tiles = (N + tile_n - 1) / tile_n;
        const int n_share = (N + n_tiles - 1) / n_tiles;
        tile_n = std::min(tile_n, (n_share + 3) / 4 * 4);
    }

    if (threads > 1)
    {
        // per-thread share of the M tile, at least 1, rounded up to a multiple of 8
        const int per_thread = std::max(1, tile_m / threads);
        tile_m = std::min(tile_m, (per_thread + 7) / 8 * 8);
    }

    // always take constant TILE_M/N/K value when provided
    TILE_M = (constant_TILE_M > 0) ? (constant_TILE_M + 7) / 8 * 8 : tile_m;
    TILE_N = (constant_TILE_N > 0) ? (constant_TILE_N + 3) / 4 * 4 : tile_n;
    TILE_K = (constant_TILE_K > 0) ? (constant_TILE_K + 7) / 8 * 8 : tile_k;
}


================================================
FILE: src/layer/arm/gemm_fp16s.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
// Forward declaration of the asimdfhm variant of the tile kernel; its definition is not in this header.
// It is only declared when runtime CPU dispatch is enabled and this translation unit is itself
// built without __ARM_FEATURE_FP16_FML. gemm_transB_packed_tile_fp16s() forwards to it when
// ncnn::cpu_support_arm_asimdfhm() reports support. The parameter list is identical to
// gemm_transB_packed_tile_fp16s().
void gemm_transB_packed_tile_fp16s_asimdfhm(const Mat& AT_tile, const Mat& BT_tile, const Mat& CT_tile, Mat& topT_tile, Mat& top_blob, int broadcast_type_C, float alpha, int i, int max_ii, int j, int max_jj, int k, int max_kk, bool k_end);
#endif

// Convert a max_ii x max_kk tile of the fp32 matrix A to fp16 and pack it into AT.
//
// The tile starts at row i, column k of A. Rows are consumed in panels of
// 8 (aarch64 only), then 4, 2 and finally 1. Inside a panel the output is
// k-major: for every k the values of all rows of the panel are written
// next to each other (see the scalar tail loops, which spell the layout out).
// AT is written sequentially as 16-bit values; it must be large enough for
// max_ii * max_kk elements.
static void pack_A_tile_fp32_to_fp16(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
    // distance between two consecutive rows of A, in floats
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;

    // running write pointer into the packed buffer
    unsigned short* pp = AT;

    int ii = 0;
#if __aarch64__
    // panels of 8 rows
    for (; ii + 7 < max_ii; ii += 8)
    {
        const float* p0 = (const float*)A + (i + ii) * A_hstep + k;
        const float* p1 = (const float*)A + (i + ii + 1) * A_hstep + k;
        const float* p2 = (const float*)A + (i + ii + 2) * A_hstep + k;
        const float* p3 = (const float*)A + (i + ii + 3) * A_hstep + k;
        const float* p4 = (const float*)A + (i + ii + 4) * A_hstep + k;
        const float* p5 = (const float*)A + (i + ii + 5) * A_hstep + k;
        const float* p6 = (const float*)A + (i + ii + 6) * A_hstep + k;
        const float* p7 = (const float*)A + (i + ii + 7) * A_hstep + k;

        int kk = 0;
        // 8 rows x 8 columns at a time
        for (; kk + 7 < max_kk; kk += 8)
        {
            // _rN = 8 consecutive k values of row N, converted to fp16 bit patterns
            uint16x8_t _r0 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p0)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0 + 4)));
            uint16x8_t _r1 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p1)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p1 + 4)));
            uint16x8_t _r2 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p2)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p2 + 4)));
            uint16x8_t _r3 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p3)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p3 + 4)));
            uint16x8_t _r4 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p4)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p4 + 4)));
            uint16x8_t _r5 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p5)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p5 + 4)));
            uint16x8_t _r6 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p6)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p6 + 4)));
            uint16x8_t _r7 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p7)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p7 + 4)));
            // NOTE(review): transpose8x8_u16 is defined elsewhere; presumably an in-place 8x8
            // transpose so that afterwards _rN holds column kk+N of all 8 rows, which is
            // the same layout the scalar tail below produces - confirm against the helper
            transpose8x8_u16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7);
            vst1q_u16(pp, _r0);
            vst1q_u16(pp + 8, _r1);
            vst1q_u16(pp + 8 * 2, _r2);
            vst1q_u16(pp + 8 * 3, _r3);
            vst1q_u16(pp + 8 * 4, _r4);
            vst1q_u16(pp + 8 * 5, _r5);
            vst1q_u16(pp + 8 * 6, _r6);
            vst1q_u16(pp + 8 * 7, _r7);
            pp += 64;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
            p4 += 8;
            p5 += 8;
            p6 += 8;
            p7 += 8;
        }
        // remaining columns, one k at a time: 8 row values per k
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_float16(p0[0]);
            pp[1] = float32_to_float16(p1[0]);
            pp[2] = float32_to_float16(p2[0]);
            pp[3] = float32_to_float16(p3[0]);
            pp[4] = float32_to_float16(p4[0]);
            pp[5] = float32_to_float16(p5[0]);
            pp[6] = float32_to_float16(p6[0]);
            pp[7] = float32_to_float16(p7[0]);
            pp += 8;
            p0++;
            p1++;
            p2++;
            p3++;
            p4++;
            p5++;
            p6++;
            p7++;
        }
    }
#endif // __aarch64__
    // panels of 4 rows
    for (; ii + 3 < max_ii; ii += 4)
    {
        const float* p0 = (const float*)A + (i + ii) * A_hstep + k;
        const float* p1 = (const float*)A + (i + ii + 1) * A_hstep + k;
        const float* p2 = (const float*)A + (i + ii + 2) * A_hstep + k;
        const float* p3 = (const float*)A + (i + ii + 3) * A_hstep + k;

        int kk = 0;
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8x4_t _r0123;
            _r0123.val[0] = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p0)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0 + 4)));
            _r0123.val[1] = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p1)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p1 + 4)));
            _r0123.val[2] = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p2)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p2 + 4)));
            _r0123.val[3] = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p3)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p3 + 4)));
            // the 4-way interleaving store does the transpose: row0[k], row1[k], row2[k], row3[k], row0[k+1], ...
            vst4q_u16(pp, _r0123);
            pp += 32;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4x4_t _r0123;
            _r0123.val[0] = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0));
            _r0123.val[1] = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p1));
            _r0123.val[2] = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p2));
            _r0123.val[3] = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p3));
            // same interleaving as above, 4 columns wide
            vst4_u16(pp, _r0123);
            pp += 16;
            p0 += 4;
            p1 += 4;
            p2 += 4;
            p3 += 4;
        }
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_float16(p0[0]);
            pp[1] = float32_to_float16(p1[0]);
            pp[2] = float32_to_float16(p2[0]);
            pp[3] = float32_to_float16(p3[0]);
            pp += 4;
            p0++;
            p1++;
            p2++;
            p3++;
        }
    }
    // panels of 2 rows
    for (; ii + 1 < max_ii; ii += 2)
    {
        const float* p0 = (const float*)A + (i + ii) * A_hstep + k;
        const float* p1 = (const float*)A + (i + ii + 1) * A_hstep + k;

        int kk = 0;
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8x2_t _r01;
            _r01.val[0] = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p0)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0 + 4)));
            _r01.val[1] = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p1)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p1 + 4)));
            // 2-way interleaving store: row0[k], row1[k], row0[k+1], row1[k+1], ...
            vst2q_u16(pp, _r01);
            pp += 16;
            p0 += 8;
            p1 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4x2_t _r01;
            _r01.val[0] = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0));
            _r01.val[1] = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p1));
            vst2_u16(pp, _r01);
            pp += 8;
            p0 += 4;
            p1 += 4;
        }
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_float16(p0[0]);
            pp[1] = float32_to_float16(p1[0]);
            pp += 2;
            p0++;
            p1++;
        }
    }
    // leftover single rows: a straight convert-and-copy along k
    for (; ii < max_ii; ii += 1)
    {
        const float* p0 = (const float*)A + (i + ii) * A_hstep + k;

        int kk = 0;
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8_t _r0 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p0)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0 + 4)));
            vst1q_u16(pp, _r0);
            pp += 8;
            p0 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4_t _r0 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0));
            vst1_u16(pp, _r0);
            pp += 4;
            p0 += 4;
        }
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_float16(p0[0]);
            pp += 1;
            p0++;
        }
    }
}

// Pack a tile of A to fp16 when A is read transposed: the tile covers
// rows k .. k+max_kk-1 and columns i .. i+max_ii-1 of A. Columns are taken
// in panels of 8 (aarch64 only), 4, 2 and 1; for every row of the tile the
// panel's consecutive columns are converted and appended to AT.
static void transpose_pack_A_tile_fp32_to_fp16(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
    // row stride of A, in floats
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;

    // element offset of the first row of the tile
    const size_t first_row = k * A_hstep;

    unsigned short* outptr = AT;

    int ii = 0;
#if __aarch64__
    // 8 columns per panel
    while (ii + 7 < max_ii)
    {
        const float* src = (const float*)A + first_row + (i + ii);

        for (int kk = 0; kk < max_kk; kk++)
        {
            const uint16x4_t _lo = (uint16x4_t)vcvt_f16_f32(vld1q_f32(src));
            const uint16x4_t _hi = (uint16x4_t)vcvt_f16_f32(vld1q_f32(src + 4));
            vst1q_u16(outptr, vcombine_u16(_lo, _hi));
            outptr += 8;
            src += A_hstep;
        }

        ii += 8;
    }
#endif // __aarch64__
    // 4 columns per panel
    while (ii + 3 < max_ii)
    {
        const float* src = (const float*)A + first_row + (i + ii);

        for (int kk = 0; kk < max_kk; kk++)
        {
            vst1_u16(outptr, (uint16x4_t)vcvt_f16_f32(vld1q_f32(src)));
            outptr += 4;
            src += A_hstep;
        }

        ii += 4;
    }
    // 2 columns per panel, scalar conversion
    while (ii + 1 < max_ii)
    {
        const float* src = (const float*)A + first_row + (i + ii);

        for (int kk = 0; kk < max_kk; kk++)
        {
            outptr[0] = float32_to_float16(src[0]);
            outptr[1] = float32_to_float16(src[1]);
            outptr += 2;
            src += A_hstep;
        }

        ii += 2;
    }
    // leftover single columns
    while (ii < max_ii)
    {
        const float* src = (const float*)A + first_row + (i + ii);

        for (int kk = 0; kk < max_kk; kk++)
        {
            *outptr++ = float32_to_float16(src[0]);
            src += A_hstep;
        }

        ii += 1;
    }
}

// Convert a max_jj x max_kk tile of the fp32 matrix B to fp16 and pack it into BT.
//
// The tile starts at row j, column k of B. Rows are consumed in panels of
// 12 (aarch64 only), then 8, 4, 2 and finally 1. Inside a panel the output
// is k-major: for every k the values of all rows of the panel are written
// next to each other (the scalar tail loops spell the layout out).
// BT is written sequentially as 16-bit values; it must be large enough for
// max_jj * max_kk elements.
static void pack_B_tile_fp32_to_fp16(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk)
{
    // distance between two consecutive rows of B, in floats
    const size_t B_hstep = B.dims == 3 ? B.cstep : (size_t)B.w;

    // running write pointer into the packed buffer
    unsigned short* pp = BT;

    int jj = 0;
#if __aarch64__
    // panels of 12 rows
    for (; jj + 11 < max_jj; jj += 12)
    {
        const float* p0 = (const float*)B + (j + jj) * B_hstep + k;
        const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k;
        const float* p2 = (const float*)B + (j + jj + 2) * B_hstep + k;
        const float* p3 = (const float*)B + (j + jj + 3) * B_hstep + k;
        const float* p4 = (const float*)B + (j + jj + 4) * B_hstep + k;
        const float* p5 = (const float*)B + (j + jj + 5) * B_hstep + k;
        const float* p6 = (const float*)B + (j + jj + 6) * B_hstep + k;
        const float* p7 = (const float*)B + (j + jj + 7) * B_hstep + k;
        const float* p8 = (const float*)B + (j + jj + 8) * B_hstep + k;
        const float* p9 = (const float*)B + (j + jj + 9) * B_hstep + k;
        const float* pa = (const float*)B + (j + jj + 10) * B_hstep + k;
        const float* pb = (const float*)B + (j + jj + 11) * B_hstep + k;

        int kk = 0;
        // 12 rows x 4 columns at a time
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4_t _r0 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0));
            uint16x4_t _r1 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p1));
            uint16x4_t _r2 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p2));
            uint16x4_t _r3 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p3));
            uint16x4_t _r4 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p4));
            uint16x4_t _r5 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p5));
            uint16x4_t _r6 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p6));
            uint16x4_t _r7 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p7));
            uint16x4_t _r8 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p8));
            uint16x4_t _r9 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p9));
            uint16x4_t _ra = (uint16x4_t)vcvt_f16_f32(vld1q_f32(pa));
            uint16x4_t _rb = (uint16x4_t)vcvt_f16_f32(vld1q_f32(pb));

            // NOTE(review): transpose4x4_u16 is defined elsewhere; presumably an in-place 4x4
            // transpose, so each group of four registers then holds one k per register - confirm
            transpose4x4_u16(_r0, _r1, _r2, _r3);
            transpose4x4_u16(_r4, _r5, _r6, _r7);
            transpose4x4_u16(_r8, _r9, _ra, _rb);

            // stitch the three 4-row groups back together so each run of 12 outputs
            // belongs to one k (rows 0-3, 4-7, 8-11)
            vst1_u16(pp, _r0);
            vst1_u16(pp + 4, _r4);
            vst1_u16(pp + 4 * 2, _r8);
            vst1_u16(pp + 4 * 3, _r1);
            vst1_u16(pp + 4 * 4, _r5);
            vst1_u16(pp + 4 * 5, _r9);
            vst1_u16(pp + 4 * 6, _r2);
            vst1_u16(pp + 4 * 7, _r6);
            vst1_u16(pp + 4 * 8, _ra);
            vst1_u16(pp + 4 * 9, _r3);
            vst1_u16(pp + 4 * 10, _r7);
            vst1_u16(pp + 4 * 11, _rb);
            pp += 48;
            p0 += 4;
            p1 += 4;
            p2 += 4;
            p3 += 4;
            p4 += 4;
            p5 += 4;
            p6 += 4;
            p7 += 4;
            p8 += 4;
            p9 += 4;
            pa += 4;
            pb += 4;
        }
        // remaining columns, one k at a time: 12 row values per k
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_float16(p0[0]);
            pp[1] = float32_to_float16(p1[0]);
            pp[2] = float32_to_float16(p2[0]);
            pp[3] = float32_to_float16(p3[0]);
            pp[4] = float32_to_float16(p4[0]);
            pp[5] = float32_to_float16(p5[0]);
            pp[6] = float32_to_float16(p6[0]);
            pp[7] = float32_to_float16(p7[0]);
            pp[8] = float32_to_float16(p8[0]);
            pp[9] = float32_to_float16(p9[0]);
            pp[10] = float32_to_float16(pa[0]);
            pp[11] = float32_to_float16(pb[0]);
            pp += 12;
            p0++;
            p1++;
            p2++;
            p3++;
            p4++;
            p5++;
            p6++;
            p7++;
            p8++;
            p9++;
            pa++;
            pb++;
        }
    }
#endif // __aarch64__
    // panels of 8 rows
    for (; jj + 7 < max_jj; jj += 8)
    {
        const float* p0 = (const float*)B + (j + jj) * B_hstep + k;
        const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k;
        const float* p2 = (const float*)B + (j + jj + 2) * B_hstep + k;
        const float* p3 = (const float*)B + (j + jj + 3) * B_hstep + k;
        const float* p4 = (const float*)B + (j + jj + 4) * B_hstep + k;
        const float* p5 = (const float*)B + (j + jj + 5) * B_hstep + k;
        const float* p6 = (const float*)B + (j + jj + 6) * B_hstep + k;
        const float* p7 = (const float*)B + (j + jj + 7) * B_hstep + k;

        int kk = 0;
        // 8 rows x 8 columns at a time
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8_t _r0 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p0)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0 + 4)));
            uint16x8_t _r1 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p1)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p1 + 4)));
            uint16x8_t _r2 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p2)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p2 + 4)));
            uint16x8_t _r3 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p3)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p3 + 4)));
            uint16x8_t _r4 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p4)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p4 + 4)));
            uint16x8_t _r5 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p5)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p5 + 4)));
            uint16x8_t _r6 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p6)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p6 + 4)));
            uint16x8_t _r7 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p7)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p7 + 4)));
            // NOTE(review): transpose8x8_u16 is defined elsewhere; presumably an in-place 8x8
            // transpose so that _rN then holds column kk+N of all 8 rows - confirm
            transpose8x8_u16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7);
            vst1q_u16(pp, _r0);
            vst1q_u16(pp + 8, _r1);
            vst1q_u16(pp + 8 * 2, _r2);
            vst1q_u16(pp + 8 * 3, _r3);
            vst1q_u16(pp + 8 * 4, _r4);
            vst1q_u16(pp + 8 * 5, _r5);
            vst1q_u16(pp + 8 * 6, _r6);
            vst1q_u16(pp + 8 * 7, _r7);
            pp += 64;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
            p4 += 8;
            p5 += 8;
            p6 += 8;
            p7 += 8;
        }
        // 8 rows x 4 columns
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4_t _r0 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0));
            uint16x4_t _r1 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p1));
            uint16x4_t _r2 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p2));
            uint16x4_t _r3 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p3));
            uint16x4_t _r4 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p4));
            uint16x4_t _r5 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p5));
            uint16x4_t _r6 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p6));
            uint16x4_t _r7 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p7));

            transpose4x4_u16(_r0, _r1, _r2, _r3);
            transpose4x4_u16(_r4, _r5, _r6, _r7);

            // alternate the two 4-row groups so each run of 8 outputs belongs to one k
            vst1_u16(pp, _r0);
            vst1_u16(pp + 4, _r4);
            vst1_u16(pp + 4 * 2, _r1);
            vst1_u16(pp + 4 * 3, _r5);
            vst1_u16(pp + 4 * 4, _r2);
            vst1_u16(pp + 4 * 5, _r6);
            vst1_u16(pp + 4 * 6, _r3);
            vst1_u16(pp + 4 * 7, _r7);
            pp += 32;
            p0 += 4;
            p1 += 4;
            p2 += 4;
            p3 += 4;
            p4 += 4;
            p5 += 4;
            p6 += 4;
            p7 += 4;
        }
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_float16(p0[0]);
            pp[1] = float32_to_float16(p1[0]);
            pp[2] = float32_to_float16(p2[0]);
            pp[3] = float32_to_float16(p3[0]);
            pp[4] = float32_to_float16(p4[0]);
            pp[5] = float32_to_float16(p5[0]);
            pp[6] = float32_to_float16(p6[0]);
            pp[7] = float32_to_float16(p7[0]);
            pp += 8;
            p0++;
            p1++;
            p2++;
            p3++;
            p4++;
            p5++;
            p6++;
            p7++;
        }
    }
    // panels of 4 rows
    for (; jj + 3 < max_jj; jj += 4)
    {
        const float* p0 = (const float*)B + (j + jj) * B_hstep + k;
        const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k;
        const float* p2 = (const float*)B + (j + jj + 2) * B_hstep + k;
        const float* p3 = (const float*)B + (j + jj + 3) * B_hstep + k;

        int kk = 0;
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8x4_t _r0123;
            _r0123.val[0] = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p0)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0 + 4)));
            _r0123.val[1] = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p1)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p1 + 4)));
            _r0123.val[2] = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p2)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p2 + 4)));
            _r0123.val[3] = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p3)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p3 + 4)));
            // the 4-way interleaving store does the transpose: row0[k], row1[k], row2[k], row3[k], row0[k+1], ...
            vst4q_u16(pp, _r0123);
            pp += 32;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4x4_t _r0123;
            _r0123.val[0] = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0));
            _r0123.val[1] = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p1));
            _r0123.val[2] = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p2));
            _r0123.val[3] = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p3));
            vst4_u16(pp, _r0123);
            pp += 16;
            p0 += 4;
            p1 += 4;
            p2 += 4;
            p3 += 4;
        }
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_float16(p0[0]);
            pp[1] = float32_to_float16(p1[0]);
            pp[2] = float32_to_float16(p2[0]);
            pp[3] = float32_to_float16(p3[0]);
            pp += 4;
            p0++;
            p1++;
            p2++;
            p3++;
        }
    }
    // panels of 2 rows
    for (; jj + 1 < max_jj; jj += 2)
    {
        const float* p0 = (const float*)B + (j + jj) * B_hstep + k;
        const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k;

        int kk = 0;
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8x2_t _r01;
            _r01.val[0] = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p0)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0 + 4)));
            _r01.val[1] = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p1)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p1 + 4)));
            // 2-way interleaving store: row0[k], row1[k], row0[k+1], row1[k+1], ...
            vst2q_u16(pp, _r01);
            pp += 16;
            p0 += 8;
            p1 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4x2_t _r01;
            _r01.val[0] = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0));
            _r01.val[1] = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p1));
            vst2_u16(pp, _r01);
            pp += 8;
            p0 += 4;
            p1 += 4;
        }
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_float16(p0[0]);
            pp[1] = float32_to_float16(p1[0]);
            pp += 2;
            p0++;
            p1++;
        }
    }
    // leftover single rows: a straight convert-and-copy along k
    for (; jj < max_jj; jj += 1)
    {
        const float* p0 = (const float*)B + (j + jj) * B_hstep + k;

        int kk = 0;
        for (; kk + 7 < max_kk; kk += 8)
        {
            uint16x8_t _r0 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p0)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0 + 4)));
            vst1q_u16(pp, _r0);
            pp += 8;
            p0 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
            uint16x4_t _r0 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0));
            vst1_u16(pp, _r0);
            pp += 4;
            p0 += 4;
        }
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_float16(p0[0]);
            pp += 1;
            p0++;
        }
    }
}

static void transpose_pack_B_tile_fp32_to_fp16(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk)
{
    const size_t B_hstep = B.dims == 3 ? B.cstep : (size_t)B.w;

    unsigned short* pp = BT;

    int jj = 0;
#if __aarch64__
    for (; jj + 11 < max_jj; jj += 12)
    {
        const float* p0 = (const float*)B + k * B_hstep + (j + jj);

        int kk = 0;
        for (; kk < max_kk; kk++)
        {
            vst1_u16(pp, (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0)));
            vst1_u16(pp + 4, (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0 + 4)));
            vst1_u16(pp + 8, (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0 + 8)));
            pp += 12;
            p0 += B_hstep;
        }
    }
#endif // __aarch64__
    for (; jj + 7 < max_jj; jj += 8)
    {
        const float* p0 = (const float*)B + k * B_hstep + (j + jj);

        int kk = 0;
        for (; kk < max_kk; kk++)
        {
            uint16x8_t _r0 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(p0)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0 + 4)));
            vst1q_u16(pp, _r0);
            pp += 8;
            p0 += B_hstep;
        }
    }
    for (; jj + 3 < max_jj; jj += 4)
    {
        const float* p0 = (const float*)B + k * B_hstep + (j + jj);

        int kk = 0;
        for (; kk < max_kk; kk++)
        {
            uint16x4_t _r0 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(p0));
            vst1_u16(pp, _r0);
            pp += 4;
            p0 += B_hstep;
        }
    }
    for (; jj + 1 < max_jj; jj += 2)
    {
        const float* p0 = (const float*)B + k * B_hstep + (j + jj);

        int kk = 0;
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_float16(p0[0]);
            pp[1] = float32_to_float16(p0[1]);
            pp += 2;
            p0 += B_hstep;
        }
    }
    for (; jj < max_jj; jj += 1)
    {
        const float* p0 = (const float*)B + k * B_hstep + (j + jj);

        int kk = 0;
        for (; kk < max_kk; kk++)
        {
            pp[0] = float32_to_float16(p0[0]);
            pp += 1;
            p0 += B_hstep;
        }
    }
}

// Convert the fp32 tile in topT to fp16 and scatter it, transposed, into top_blob.
//
// topT is read sequentially. It is consumed in panels of 8 (aarch64 only), 4, 2
// and 1 along ii; within a panel each jj contributes one value per panel row,
// stored next to each other. The destination is addressed with j selecting the
// row (offset j * out_hstep) and i + ii selecting the column, i.e. swapped
// relative to the tile indices.
//
// Only out_elempack == 4 and out_elempack == 1 are handled; for any other value
// nothing is written. In the elempack 4 branches jj advances in steps of 4 and a
// trailing max_jj % 4 remainder is not processed.
// NOTE(review): presumably the caller guarantees max_jj is a multiple of 4 when
// out_elempack is 4 - confirm.
static void transpose_unpack_output_tile_fp32_to_fp16(const Mat& topT, Mat& top_blob, int i, int max_ii, int j, int max_jj)
{
    const int out_elempack = top_blob.elempack;
    // stride used to step between destination rows: cstep for a 3-dim blob, otherwise w
    const size_t out_hstep = top_blob.dims == 3 ? top_blob.cstep : (size_t)top_blob.w;

    // running read pointer into the fp32 tile
    const float* pp = topT;

    int ii = 0;
#if __ARM_NEON
#if __aarch64__
    // panels of 8 along ii
    for (; ii + 7 < max_ii; ii += 8)
    {
        if (out_elempack == 4)
        {
            unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * 4;

            for (int jj = 0; jj + 3 < max_jj; jj += 4)
            {
                // val[n] = the 8 panel values belonging to jj + n
                uint16x8x4_t _r0;
                _r0.val[0] = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(pp)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(pp + 4)));
                _r0.val[1] = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(pp + 8)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(pp + 12)));
                _r0.val[2] = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(pp + 16)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(pp + 20)));
                _r0.val[3] = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(pp + 24)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(pp + 28)));
                // interleaving store: each group of 4 outputs holds jj..jj+3 for one ii
                vst4q_u16(p0, _r0);
                pp += 32;
                p0 += out_hstep * 4;
            }
        }
        if (out_elempack == 1)
        {
            unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii);

            // one destination row per jj, 8 consecutive columns
            for (int jj = 0; jj < max_jj; jj += 1)
            {
                uint16x8_t _r0 = vcombine_u16((uint16x4_t)vcvt_f16_f32(vld1q_f32(pp)), (uint16x4_t)vcvt_f16_f32(vld1q_f32(pp + 4)));
                vst1q_u16(p0, _r0);
                pp += 8;
                p0 += out_hstep;
            }
        }
    }
#endif // __aarch64__
    // panels of 4 along ii
    for (; ii + 3 < max_ii; ii += 4)
    {
        if (out_elempack == 4)
        {
            unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * 4;

            for (int jj = 0; jj + 3 < max_jj; jj += 4)
            {
                // val[n] = the 4 panel values belonging to jj + n
                uint16x4x4_t _r0123;
                _r0123.val[0] = (uint16x4_t)vcvt_f16_f32(vld1q_f32(pp));
                _r0123.val[1] = (uint16x4_t)vcvt_f16_f32(vld1q_f32(pp + 4));
                _r0123.val[2] = (uint16x4_t)vcvt_f16_f32(vld1q_f32(pp + 8));
                _r0123.val[3] = (uint16x4_t)vcvt_f16_f32(vld1q_f32(pp + 12));
                vst4_u16(p0, _r0123);
                pp += 16;
                p0 += out_hstep * 4;
            }
        }
        if (out_elempack == 1)
        {
            unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii);

            for (int jj = 0; jj < max_jj; jj += 1)
            {
                uint16x4_t _r0 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(pp));
                vst1_u16(p0, _r0);
                pp += 4;
                p0 += out_hstep;
            }
        }
    }
#endif // __ARM_NEON
    // panels of 2 along ii
    for (; ii + 1 < max_ii; ii += 2)
    {
#if __ARM_NEON
        if (out_elempack == 4)
        {
            unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * 4;

            for (int jj = 0; jj + 3 < max_jj; jj += 4)
            {
                // input is (ii, ii+1) pairs for jj..jj+3; even indices belong to ii, odd to ii+1.
                // output is 4 values of jj..jj+3 for ii followed by 4 values for ii+1
                p0[0] = float32_to_float16(pp[0]);
                p0[1] = float32_to_float16(pp[2]);
                p0[2] = float32_to_float16(pp[4]);
                p0[3] = float32_to_float16(pp[6]);
                p0[4] = float32_to_float16(pp[1]);
                p0[5] = float32_to_float16(pp[3]);
                p0[6] = float32_to_float16(pp[5]);
                p0[7] = float32_to_float16(pp[7]);
                pp += 8;
                p0 += out_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (out_elempack == 1)
        {
            unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii);

            for (int jj = 0; jj < max_jj; jj += 1)
            {
                p0[0] = float32_to_float16(pp[0]);
                p0[1] = float32_to_float16(pp[1]);
                pp += 2;
                p0 += out_hstep;
            }
        }
    }
    // leftover single ii
    for (; ii < max_ii; ii += 1)
    {
#if __ARM_NEON
        if (out_elempack == 4)
        {
            unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * 4;

            for (int jj = 0; jj + 3 < max_jj; jj += 4)
            {
                // 4 consecutive inputs are jj..jj+3 of this ii and are stored contiguously
                uint16x4_t _r0 = (uint16x4_t)vcvt_f16_f32(vld1q_f32(pp));
                vst1_u16(p0, _r0);
                pp += 4;
                p0 += out_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (out_elempack == 1)
        {
            unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii);

            for (int jj = 0; jj < max_jj; jj += 1)
            {
                p0[0] = float32_to_float16(pp[0]);
                pp += 1;
                p0 += out_hstep;
            }
        }
    }
}

static void gemm_transB_packed_tile_fp16s(const Mat& AT_tile, const Mat& BT_tile, const Mat& CT_tile, Mat& topT_tile, Mat& top_blob, int broadcast_type_C, float alpha, int i, int max_ii, int j, int max_jj, int k, int max_kk, bool k_end)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
    if (ncnn::cpu_support_arm_asimdfhm())
    {
        gemm_transB_packed_tile_fp16s_asimdfhm(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, alpha, i, max_ii, j, max_jj, k, max_kk, k_end);
        return;
    }
#endif

    const int out_elempack = top_blob.elempack;
    const size_t out_hstep = top_blob.dims == 3 ? top_blob.cstep : (size_t)top_blob.w;

#if __ARM_FEATURE_FP16_FML
    const __fp16* pAT = AT_tile;
    const __fp16* pBT = BT_tile;
#else
    const unsigned short* pAT = AT_tile;
    const unsigned short* pBT = BT_tile;
#endif
    const float* pC = CT_tile;

    float* outptr = topT_tile;

    int ii = 0;
#if __aarch64__
    for (; ii + 7 < max_ii; ii += 8)
    {
        unsigned short* outptr0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack;

#if __ARM_FEATURE_FP16_FML
        const __fp16* pB = pBT;
#else
        const unsigned short* pB = pBT;
#endif

        if (pC)
        {
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)CT_tile + i + ii;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)CT_tile + j;
            }
        }

        int jj = 0;
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum20;
            float32x4_t _sum21;
            float32x4_t _sum30;
            float32x4_t _sum31;
            float32x4_t _sum40;
            float32x4_t _sum41;
            float32x4_t _sum50;
            float32x4_t _sum51;
            float32x4_t _sum60;
            float32x4_t _sum61;
            float32x4_t _sum70;
            float32x4_t _sum71;
            float32x4_t _sum80;
            float32x4_t _sum81;
            float32x4_t _sum90;
            float32x4_t _sum91;
            float32x4_t _suma0;
            float32x4_t _suma1;
            float32x4_t _sumb0;
            float32x4_t _sumb1;

            if (k == 0)
            {
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);
                _sum20 = vdupq_n_f32(0.f);
                _sum21 = vdupq_n_f32(0.f);
                _sum30 = vdupq_n_f32(0.f);
                _sum31 = vdupq_n_f32(0.f);
                _sum40 = vdupq_n_f32(0.f);
                _sum41 = vdupq_n_f32(0.f);
                _sum50 = vdupq_n_f32(0.f);
                _sum51 = vdupq_n_f32(0.f);
                _sum60 = vdupq_n_f32(0.f);
                _sum61 = vdupq_n_f32(0.f);
                _sum70 = vdupq_n_f32(0.f);
                _sum71 = vdupq_n_f32(0.f);
                _sum80 = vdupq_n_f32(0.f);
                _sum81 = vdupq_n_f32(0.f);
                _sum90 = vdupq_n_f32(0.f);
                _sum91 = vdupq_n_f32(0.f);
                _suma0 = vdupq_n_f32(0.f);
                _suma1 = vdupq_n_f32(0.f);
                _sumb0 = vdupq_n_f32(0.f);
                _sumb1 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        _sum10 = _sum00;
                        _sum11 = _sum00;
                        _sum20 = _sum00;
                        _sum21 = _sum00;
                        _sum30 = _sum00;
                        _sum31 = _sum00;
                        _sum40 = _sum00;
                        _sum41 = _sum00;
                        _sum50 = _sum00;
                        _sum51 = _sum00;
                        _sum60 = _sum00;
                        _sum61 = _sum00;
                        _sum70 = _sum00;
                        _sum71 = _sum00;
                        _sum80 = _sum00;
                        _sum81 = _sum00;
                        _sum90 = _sum00;
                        _sum91 = _sum00;
                        _suma0 = _sum00;
                        _suma1 = _sum00;
                        _sumb0 = _sum00;
                        _sumb1 = _sum00;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        _sum20 = _sum00;
                        _sum21 = _sum01;
                        _sum30 = _sum00;
                        _sum31 = _sum01;
                        _sum40 = _sum00;
                        _sum41 = _sum01;
                        _sum50 = _sum00;
                        _sum51 = _sum01;
                        _sum60 = _sum00;
                        _sum61 = _sum01;
                        _sum70 = _sum00;
                        _sum71 = _sum01;
                        _sum80 = _sum00;
                        _sum81 = _sum01;
                        _sum90 = _sum00;
                        _sum91 = _sum01;
                        _suma0 = _sum00;
                        _suma1 = _sum01;
                        _sumb0 = _sum00;
                        _sumb1 = _sum01;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4 * 1);
                        _sum10 = vld1q_f32(pC + 4 * 2);
                        _sum11 = vld1q_f32(pC + 4 * 3);
                        _sum20 = vld1q_f32(pC + 4 * 4);
                        _sum21 = vld1q_f32(pC + 4 * 5);
                        _sum30 = vld1q_f32(pC + 4 * 6);
                        _sum31 = vld1q_f32(pC + 4 * 7);
                        _sum40 = vld1q_f32(pC + 4 * 8);
                        _sum41 = vld1q_f32(pC + 4 * 9);
                        _sum50 = vld1q_f32(pC + 4 * 10);
                        _sum51 = vld1q_f32(pC + 4 * 11);
                        _sum60 = vld1q_f32(pC + 4 * 12);
                        _sum61 = vld1q_f32(pC + 4 * 13);
                        _sum70 = vld1q_f32(pC + 4 * 14);
                        _sum71 = vld1q_f32(pC + 4 * 15);
                        _sum80 = vld1q_f32(pC + 4 * 16);
                        _sum81 = vld1q_f32(pC + 4 * 17);
                        _sum90 = vld1q_f32(pC + 4 * 18);
                        _sum91 = vld1q_f32(pC + 4 * 19);
                        _suma0 = vld1q_f32(pC + 4 * 20);
                        _suma1 = vld1q_f32(pC + 4 * 21);
                        _sumb0 = vld1q_f32(pC + 4 * 22);
                        _sumb1 = vld1q_f32(pC + 4 * 23);
                        pC += 96;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum20 = vdupq_n_f32(pC[2]);
                        _sum30 = vdupq_n_f32(pC[3]);
                        _sum40 = vdupq_n_f32(pC[4]);
                        _sum50 = vdupq_n_f32(pC[5]);
                        _sum60 = vdupq_n_f32(pC[6]);
                        _sum70 = vdupq_n_f32(pC[7]);
                        _sum80 = vdupq_n_f32(pC[8]);
                        _sum90 = vdupq_n_f32(pC[9]);
                        _suma0 = vdupq_n_f32(pC[10]);
                        _sumb0 = vdupq_n_f32(pC[11]);
                        _sum01 = _sum00;
                        _sum11 = _sum10;
                        _sum21 = _sum20;
                        _sum31 = _sum30;
                        _sum41 = _sum40;
                        _sum51 = _sum50;
                        _sum61 = _sum60;
                        _sum71 = _sum70;
                        _sum81 = _sum80;
                        _sum91 = _sum90;
                        _suma1 = _suma0;
                        _sumb1 = _sumb0;
                        pC += 12;
                    }
                }
            }
            else
            {
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
                _sum20 = vld1q_f32(outptr + 4 * 4);
                _sum21 = vld1q_f32(outptr + 4 * 5);
                _sum30 = vld1q_f32(outptr + 4 * 6);
                _sum31 = vld1q_f32(outptr + 4 * 7);
                _sum40 = vld1q_f32(outptr + 4 * 8);
                _sum41 = vld1q_f32(outptr + 4 * 9);
                _sum50 = vld1q_f32(outptr + 4 * 10);
                _sum51 = vld1q_f32(outptr + 4 * 11);
                _sum60 = vld1q_f32(outptr + 4 * 12);
                _sum61 = vld1q_f32(outptr + 4 * 13);
                _sum70 = vld1q_f32(outptr + 4 * 14);
                _sum71 = vld1q_f32(outptr + 4 * 15);
                _sum80 = vld1q_f32(outptr + 4 * 16);
                _sum81 = vld1q_f32(outptr + 4 * 17);
                _sum90 = vld1q_f32(outptr + 4 * 18);
                _sum91 = vld1q_f32(outptr + 4 * 19);
                _suma0 = vld1q_f32(outptr + 4 * 20);
                _suma1 = vld1q_f32(outptr + 4 * 21);
                _sumb0 = vld1q_f32(outptr + 4 * 22);
                _sumb1 = vld1q_f32(outptr + 4 * 23);
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                float16x8_t _pA = vld1q_f16(pA);

                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);
                float16x4_t _pB2 = vld1_f16(pB + 8);

                _sum00 = vfmlalq_lane_low_f16(_sum00, _pA, _pB0, 0);
                _sum01 = vfmlalq_lane_high_f16(_sum01, _pA, _pB0, 0);
                _sum10 = vfmlalq_lane_low_f16(_sum10, _pA, _pB0, 1);
                _sum11 = vfmlalq_lane_high_f16(_sum11, _pA, _pB0, 1);
                _sum20 = vfmlalq_lane_low_f16(_sum20, _pA, _pB0, 2);
                _sum21 = vfmlalq_lane_high_f16(_sum21, _pA, _pB0, 2);
                _sum30 = vfmlalq_lane_low_f16(_sum30, _pA, _pB0, 3);
                _sum31 = vfmlalq_lane_high_f16(_sum31, _pA, _pB0, 3);
                _sum40 = vfmlalq_lane_low_f16(_sum40, _pA, _pB1, 0);
                _sum41 = vfmlalq_lane_high_f16(_sum41, _pA, _pB1, 0);
                _sum50 = vfmlalq_lane_low_f16(_sum50, _pA, _pB1, 1);
                _sum51 = vfmlalq_lane_high_f16(_sum51, _pA, _pB1, 1);
                _sum60 = vfmlalq_lane_low_f16(_sum60, _pA, _pB1, 2);
                _sum61 = vfmlalq_lane_high_f16(_sum61, _pA, _pB1, 2);
                _sum70 = vfmlalq_lane_low_f16(_sum70, _pA, _pB1, 3);
                _sum71 = vfmlalq_lane_high_f16(_sum71, _pA, _pB1, 3);
                _sum80 = vfmlalq_lane_low_f16(_sum80, _pA, _pB2, 0);
                _sum81 = vfmlalq_lane_high_f16(_sum81, _pA, _pB2, 0);
                _sum90 = vfmlalq_lane_low_f16(_sum90, _pA, _pB2, 1);
                _sum91 = vfmlalq_lane_high_f16(_sum91, _pA, _pB2, 1);
                _suma0 = vfmlalq_lane_low_f16(_suma0, _pA, _pB2, 2);
                _suma1 = vfmlalq_lane_high_f16(_suma1, _pA, _pB2, 2);
                _sumb0 = vfmlalq_lane_low_f16(_sumb0, _pA, _pB2, 3);
                _sumb1 = vfmlalq_lane_high_f16(_sumb1, _pA, _pB2, 3);
#else
                uint16x8_t _pA = vld1q_u16(pA);
                float32x4_t _pA0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_pA));
                float32x4_t _pA1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_pA));

                float32x4_t _pB0 = vcvt_f32_f16((float16x4_t)vld1_u16(pB));
                float32x4_t _pB1 = vcvt_f32_f16((float16x4_t)vld1_u16(pB + 4));
                float32x4_t _pB2 = vcvt_f32_f16((float16x4_t)vld1_u16(pB + 8));

                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
                _sum40 = vfmaq_laneq_f32(_sum40, _pA0, _pB1, 0);
                _sum41 = vfmaq_laneq_f32(_sum41, _pA1, _pB1, 0);
                _sum50 = vfmaq_laneq_f32(_sum50, _pA0, _pB1, 1);
                _sum51 = vfmaq_laneq_f32(_sum51, _pA1, _pB1, 1);
                _sum60 = vfmaq_laneq_f32(_sum60, _pA0, _pB1, 2);
                _sum61 = vfmaq_laneq_f32(_sum61, _pA1, _pB1, 2);
                _sum70 = vfmaq_laneq_f32(_sum70, _pA0, _pB1, 3);
                _sum71 = vfmaq_laneq_f32(_sum71, _pA1, _pB1, 3);
                _sum80 = vfmaq_laneq_f32(_sum80, _pA0, _pB2, 0);
                _sum81 = vfmaq_laneq_f32(_sum81, _pA1, _pB2, 0);
                _sum90 = vfmaq_laneq_f32(_sum90, _pA0, _pB2, 1);
                _sum91 = vfmaq_laneq_f32(_sum91, _pA1, _pB2, 1);
                _suma0 = vfmaq_laneq_f32(_suma0, _pA0, _pB2, 2);
                _suma1 = vfmaq_laneq_f32(_suma1, _pA1, _pB2, 2);
                _sumb0 = vfmaq_laneq_f32(_sumb0, _pA0, _pB2, 3);
                _sumb1 = vfmaq_laneq_f32(_sumb1, _pA1, _pB2, 3);
#endif

                pA += 8;
                pB += 12;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum00 = vmulq_f32(_sum00, _alpha);
                _sum01 = vmulq_f32(_sum01, _alpha);
                _sum10 = vmulq_f32(_sum10, _alpha);
                _sum11 = vmulq_f32(_sum11, _alpha);
                _sum20 = vmulq_f32(_sum20, _alpha);
                _sum21 = vmulq_f32(_sum21, _alpha);
                _sum30 = vmulq_f32(_sum30, _alpha);
                _sum31 = vmulq_f32(_sum31, _alpha);
                _sum40 = vmulq_f32(_sum40, _alpha);
                _sum41 = vmulq_f32(_sum41, _alpha);
                _sum50 = vmulq_f32(_sum50, _alpha);
                _sum51 = vmulq_f32(_sum51, _alpha);
                _sum60 = vmulq_f32(_sum60, _alpha);
                _sum61 = vmulq_f32(_sum61, _alpha);
                _sum70 = vmulq_f32(_sum70, _alpha);
                _sum71 = vmulq_f32(_sum71, _alpha);
                _sum80 = vmulq_f32(_sum80, _alpha);
                _sum81 = vmulq_f32(_sum81, _alpha);
                _sum90 = vmulq_f32(_sum90, _alpha);
                _sum91 = vmulq_f32(_sum91, _alpha);
                _suma0 = vmulq_f32(_suma0, _alpha);
                _suma1 = vmulq_f32(_suma1, _alpha);
                _sumb0 = vmulq_f32(_sumb0, _alpha);
                _sumb1 = vmulq_f32(_sumb1, _alpha);
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum00));
                    vst1_u16(outptr0 + 4, (uint16x4_t)vcvt_f16_f32(_sum10));
                    vst1_u16(outptr0 + 4 * 2, (uint16x4_t)vcvt_f16_f32(_sum20));
                    vst1_u16(outptr0 + 4 * 3, (uint16x4_t)vcvt_f16_f32(_sum30));
                    vst1_u16(outptr0 + 4 * 4, (uint16x4_t)vcvt_f16_f32(_sum40));
                    vst1_u16(outptr0 + 4 * 5, (uint16x4_t)vcvt_f16_f32(_sum50));
                    vst1_u16(outptr0 + 4 * 6, (uint16x4_t)vcvt_f16_f32(_sum60));
                    vst1_u16(outptr0 + 4 * 7, (uint16x4_t)vcvt_f16_f32(_sum70));
                    vst1_u16(outptr0 + 4 * 8, (uint16x4_t)vcvt_f16_f32(_sum80));
                    vst1_u16(outptr0 + 4 * 9, (uint16x4_t)vcvt_f16_f32(_sum90));
                    vst1_u16(outptr0 + 4 * 10, (uint16x4_t)vcvt_f16_f32(_suma0));
                    vst1_u16(outptr0 + 4 * 11, (uint16x4_t)vcvt_f16_f32(_sumb0));

                    vst1_u16(outptr0 + out_hstep * 4, (uint16x4_t)vcvt_f16_f32(_sum01));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, (uint16x4_t)vcvt_f16_f32(_sum11));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 2, (uint16x4_t)vcvt_f16_f32(_sum21));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 3, (uint16x4_t)vcvt_f16_f32(_sum31));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 4, (uint16x4_t)vcvt_f16_f32(_sum41));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 5, (uint16x4_t)vcvt_f16_f32(_sum51));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 6, (uint16x4_t)vcvt_f16_f32(_sum61));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 7, (uint16x4_t)vcvt_f16_f32(_sum71));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 8, (uint16x4_t)vcvt_f16_f32(_sum81));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 9, (uint16x4_t)vcvt_f16_f32(_sum91));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 10, (uint16x4_t)vcvt_f16_f32(_suma1));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 11, (uint16x4_t)vcvt_f16_f32(_sumb1));

                    outptr0 += 48;
                }
                if (out_elempack == 1)
                {
                    transpose8x12_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31, _sum40, _sum41, _sum50, _sum51, _sum60, _sum61, _sum70, _sum71, _sum80, _sum81, _sum90, _sum91, _suma0, _suma1, _sumb0, _sumb1);

                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum00));
                    vst1_u16(outptr0 + 4, (uint16x4_t)vcvt_f16_f32(_sum01));
                    vst1_u16(outptr0 + 8, (uint16x4_t)vcvt_f16_f32(_sum10));
                    vst1_u16(outptr0 + out_hstep, (uint16x4_t)vcvt_f16_f32(_sum11));
                    vst1_u16(outptr0 + out_hstep + 4, (uint16x4_t)vcvt_f16_f32(_sum20));
                    vst1_u16(outptr0 + out_hstep + 8, (uint16x4_t)vcvt_f16_f32(_sum21));
                    vst1_u16(outptr0 + out_hstep * 2, (uint16x4_t)vcvt_f16_f32(_sum30));
                    vst1_u16(outptr0 + out_hstep * 2 + 4, (uint16x4_t)vcvt_f16_f32(_sum31));
                    vst1_u16(outptr0 + out_hstep * 2 + 8, (uint16x4_t)vcvt_f16_f32(_sum40));
                    vst1_u16(outptr0 + out_hstep * 3, (uint16x4_t)vcvt_f16_f32(_sum41));
                    vst1_u16(outptr0 + out_hstep * 3 + 4, (uint16x4_t)vcvt_f16_f32(_sum50));
                    vst1_u16(outptr0 + out_hstep * 3 + 8, (uint16x4_t)vcvt_f16_f32(_sum51));
                    vst1_u16(outptr0 + out_hstep * 4, (uint16x4_t)vcvt_f16_f32(_sum60));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, (uint16x4_t)vcvt_f16_f32(_sum61));
                    vst1_u16(outptr0 + out_hstep * 4 + 8, (uint16x4_t)vcvt_f16_f32(_sum70));
                    vst1_u16(outptr0 + out_hstep * 5, (uint16x4_t)vcvt_f16_f32(_sum71));
                    vst1_u16(outptr0 + out_hstep * 5 + 4, (uint16x4_t)vcvt_f16_f32(_sum80));
                    vst1_u16(outptr0 + out_hstep * 5 + 8, (uint16x4_t)vcvt_f16_f32(_sum81));
                    vst1_u16(outptr0 + out_hstep * 6, (uint16x4_t)vcvt_f16_f32(_sum90));
                    vst1_u16(outptr0 + out_hstep * 6 + 4, (uint16x4_t)vcvt_f16_f32(_sum91));
                    vst1_u16(outptr0 + out_hstep * 6 + 8, (uint16x4_t)vcvt_f16_f32(_suma0));
                    vst1_u16(outptr0 + out_hstep * 7, (uint16x4_t)vcvt_f16_f32(_suma1));
                    vst1_u16(outptr0 + out_hstep * 7 + 4, (uint16x4_t)vcvt_f16_f32(_sumb0));
                    vst1_u16(outptr0 + out_hstep * 7 + 8, (uint16x4_t)vcvt_f16_f32(_sumb1));

                    outptr0 += 12;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
                vst1q_f32(outptr + 4 * 8, _sum40);
                vst1q_f32(outptr + 4 * 9, _sum41);
                vst1q_f32(outptr + 4 * 10, _sum50);
                vst1q_f32(outptr + 4 * 11, _sum51);
                vst1q_f32(outptr + 4 * 12, _sum60);
                vst1q_f32(outptr + 4 * 13, _sum61);
                vst1q_f32(outptr + 4 * 14, _sum70);
                vst1q_f32(outptr + 4 * 15, _sum71);
                vst1q_f32(outptr + 4 * 16, _sum80);
                vst1q_f32(outptr + 4 * 17, _sum81);
                vst1q_f32(outptr + 4 * 18, _sum90);
                vst1q_f32(outptr + 4 * 19, _sum91);
                vst1q_f32(outptr + 4 * 20, _suma0);
                vst1q_f32(outptr + 4 * 21, _suma1);
                vst1q_f32(outptr + 4 * 22, _sumb0);
                vst1q_f32(outptr + 4 * 23, _sumb1);
            }

            outptr += 96;
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum20;
            float32x4_t _sum21;
            float32x4_t _sum30;
            float32x4_t _sum31;
            float32x4_t _sum40;
            float32x4_t _sum41;
            float32x4_t _sum50;
            float32x4_t _sum51;
            float32x4_t _sum60;
            float32x4_t _sum61;
            float32x4_t _sum70;
            float32x4_t _sum71;

            if (k == 0)
            {
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);
                _sum20 = vdupq_n_f32(0.f);
                _sum21 = vdupq_n_f32(0.f);
                _sum30 = vdupq_n_f32(0.f);
                _sum31 = vdupq_n_f32(0.f);
                _sum40 = vdupq_n_f32(0.f);
                _sum41 = vdupq_n_f32(0.f);
                _sum50 = vdupq_n_f32(0.f);
                _sum51 = vdupq_n_f32(0.f);
                _sum60 = vdupq_n_f32(0.f);
                _sum61 = vdupq_n_f32(0.f);
                _sum70 = vdupq_n_f32(0.f);
                _sum71 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        _sum10 = _sum00;
                        _sum11 = _sum00;
                        _sum20 = _sum00;
                        _sum21 = _sum00;
                        _sum30 = _sum00;
                        _sum31 = _sum00;
                        _sum40 = _sum00;
                        _sum41 = _sum00;
                        _sum50 = _sum00;
                        _sum51 = _sum00;
                        _sum60 = _sum00;
                        _sum61 = _sum00;
                        _sum70 = _sum00;
                        _sum71 = _sum00;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        _sum20 = _sum00;
                        _sum21 = _sum01;
                        _sum30 = _sum00;
                        _sum31 = _sum01;
                        _sum40 = _sum00;
                        _sum41 = _sum01;
                        _sum50 = _sum00;
                        _sum51 = _sum01;
                        _sum60 = _sum00;
                        _sum61 = _sum01;
                        _sum70 = _sum00;
                        _sum71 = _sum01;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4 * 1);
                        _sum10 = vld1q_f32(pC + 4 * 2);
                        _sum11 = vld1q_f32(pC + 4 * 3);
                        _sum20 = vld1q_f32(pC + 4 * 4);
                        _sum21 = vld1q_f32(pC + 4 * 5);
                        _sum30 = vld1q_f32(pC + 4 * 6);
                        _sum31 = vld1q_f32(pC + 4 * 7);
                        _sum40 = vld1q_f32(pC + 4 * 8);
                        _sum41 = vld1q_f32(pC + 4 * 9);
                        _sum50 = vld1q_f32(pC + 4 * 10);
                        _sum51 = vld1q_f32(pC + 4 * 11);
                        _sum60 = vld1q_f32(pC + 4 * 12);
                        _sum61 = vld1q_f32(pC + 4 * 13);
                        _sum70 = vld1q_f32(pC + 4 * 14);
                        _sum71 = vld1q_f32(pC + 4 * 15);
                        pC += 64;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum20 = vdupq_n_f32(pC[2]);
                        _sum30 = vdupq_n_f32(pC[3]);
                        _sum40 = vdupq_n_f32(pC[4]);
                        _sum50 = vdupq_n_f32(pC[5]);
                        _sum60 = vdupq_n_f32(pC[6]);
                        _sum70 = vdupq_n_f32(pC[7]);
                        _sum01 = _sum00;
                        _sum11 = _sum10;
                        _sum21 = _sum20;
                        _sum31 = _sum30;
                        _sum41 = _sum40;
                        _sum51 = _sum50;
                        _sum61 = _sum60;
                        _sum71 = _sum70;
                        pC += 8;
                    }
                }
            }
            else
            {
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
                _sum20 = vld1q_f32(outptr + 4 * 4);
                _sum21 = vld1q_f32(outptr + 4 * 5);
                _sum30 = vld1q_f32(outptr + 4 * 6);
                _sum31 = vld1q_f32(outptr + 4 * 7);
                _sum40 = vld1q_f32(outptr + 4 * 8);
                _sum41 = vld1q_f32(outptr + 4 * 9);
                _sum50 = vld1q_f32(outptr + 4 * 10);
                _sum51 = vld1q_f32(outptr + 4 * 11);
                _sum60 = vld1q_f32(outptr + 4 * 12);
                _sum61 = vld1q_f32(outptr + 4 * 13);
                _sum70 = vld1q_f32(outptr + 4 * 14);
                _sum71 = vld1q_f32(outptr + 4 * 15);
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                float16x8_t _pA = vld1q_f16(pA);
                float16x8_t _pB = vld1q_f16(pB);

                _sum00 = vfmlalq_laneq_low_f16(_sum00, _pA, _pB, 0);
                _sum01 = vfmlalq_laneq_high_f16(_sum01, _pA, _pB, 0);
                _sum10 = vfmlalq_laneq_low_f16(_sum10, _pA, _pB, 1);
                _sum11 = vfmlalq_laneq_high_f16(_sum11, _pA, _pB, 1);
                _sum20 = vfmlalq_laneq_low_f16(_sum20, _pA, _pB, 2);
                _sum21 = vfmlalq_laneq_high_f16(_sum21, _pA, _pB, 2);
                _sum30 = vfmlalq_laneq_low_f16(_sum30, _pA, _pB, 3);
                _sum31 = vfmlalq_laneq_high_f16(_sum31, _pA, _pB, 3);
                _sum40 = vfmlalq_laneq_low_f16(_sum40, _pA, _pB, 4);
                _sum41 = vfmlalq_laneq_high_f16(_sum41, _pA, _pB, 4);
                _sum50 = vfmlalq_laneq_low_f16(_sum50, _pA, _pB, 5);
                _sum51 = vfmlalq_laneq_high_f16(_sum51, _pA, _pB, 5);
                _sum60 = vfmlalq_laneq_low_f16(_sum60, _pA, _pB, 6);
                _sum61 = vfmlalq_laneq_high_f16(_sum61, _pA, _pB, 6);
                _sum70 = vfmlalq_laneq_low_f16(_sum70, _pA, _pB, 7);
                _sum71 = vfmlalq_laneq_high_f16(_sum71, _pA, _pB, 7);
#else
                uint16x8_t _pA = vld1q_u16(pA);
                float32x4_t _pA0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_pA));
                float32x4_t _pA1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_pA));

                float32x4_t _pB0 = vcvt_f32_f16((float16x4_t)vld1_u16(pB));
                float32x4_t _pB1 = vcvt_f32_f16((float16x4_t)vld1_u16(pB + 4));

                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
                _sum40 = vfmaq_laneq_f32(_sum40, _pA0, _pB1, 0);
                _sum41 = vfmaq_laneq_f32(_sum41, _pA1, _pB1, 0);
                _sum50 = vfmaq_laneq_f32(_sum50, _pA0, _pB1, 1);
                _sum51 = vfmaq_laneq_f32(_sum51, _pA1, _pB1, 1);
                _sum60 = vfmaq_laneq_f32(_sum60, _pA0, _pB1, 2);
                _sum61 = vfmaq_laneq_f32(_sum61, _pA1, _pB1, 2);
                _sum70 = vfmaq_laneq_f32(_sum70, _pA0, _pB1, 3);
                _sum71 = vfmaq_laneq_f32(_sum71, _pA1, _pB1, 3);
#endif

                pA += 8;
                pB += 8;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum00 = vmulq_f32(_sum00, _alpha);
                _sum01 = vmulq_f32(_sum01, _alpha);
                _sum10 = vmulq_f32(_sum10, _alpha);
                _sum11 = vmulq_f32(_sum11, _alpha);
                _sum20 = vmulq_f32(_sum20, _alpha);
                _sum21 = vmulq_f32(_sum21, _alpha);
                _sum30 = vmulq_f32(_sum30, _alpha);
                _sum31 = vmulq_f32(_sum31, _alpha);
                _sum40 = vmulq_f32(_sum40, _alpha);
                _sum41 = vmulq_f32(_sum41, _alpha);
                _sum50 = vmulq_f32(_sum50, _alpha);
                _sum51 = vmulq_f32(_sum51, _alpha);
                _sum60 = vmulq_f32(_sum60, _alpha);
                _sum61 = vmulq_f32(_sum61, _alpha);
                _sum70 = vmulq_f32(_sum70, _alpha);
                _sum71 = vmulq_f32(_sum71, _alpha);
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum00));
                    vst1_u16(outptr0 + 4, (uint16x4_t)vcvt_f16_f32(_sum10));
                    vst1_u16(outptr0 + 4 * 2, (uint16x4_t)vcvt_f16_f32(_sum20));
                    vst1_u16(outptr0 + 4 * 3, (uint16x4_t)vcvt_f16_f32(_sum30));
                    vst1_u16(outptr0 + 4 * 4, (uint16x4_t)vcvt_f16_f32(_sum40));
                    vst1_u16(outptr0 + 4 * 5, (uint16x4_t)vcvt_f16_f32(_sum50));
                    vst1_u16(outptr0 + 4 * 6, (uint16x4_t)vcvt_f16_f32(_sum60));
                    vst1_u16(outptr0 + 4 * 7, (uint16x4_t)vcvt_f16_f32(_sum70));

                    vst1_u16(outptr0 + out_hstep * 4, (uint16x4_t)vcvt_f16_f32(_sum01));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, (uint16x4_t)vcvt_f16_f32(_sum11));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 2, (uint16x4_t)vcvt_f16_f32(_sum21));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 3, (uint16x4_t)vcvt_f16_f32(_sum31));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 4, (uint16x4_t)vcvt_f16_f32(_sum41));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 5, (uint16x4_t)vcvt_f16_f32(_sum51));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 6, (uint16x4_t)vcvt_f16_f32(_sum61));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 7, (uint16x4_t)vcvt_f16_f32(_sum71));

                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    transpose8x8_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31, _sum40, _sum41, _sum50, _sum51, _sum60, _sum61, _sum70, _sum71);

                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum00));
                    vst1_u16(outptr0 + 4, (uint16x4_t)vcvt_f16_f32(_sum01));
                    vst1_u16(outptr0 + out_hstep, (uint16x4_t)vcvt_f16_f32(_sum10));
                    vst1_u16(outptr0 + out_hstep + 4, (uint16x4_t)vcvt_f16_f32(_sum11));
                    vst1_u16(outptr0 + out_hstep * 2, (uint16x4_t)vcvt_f16_f32(_sum20));
                    vst1_u16(outptr0 + out_hstep * 2 + 4, (uint16x4_t)vcvt_f16_f32(_sum21));
                    vst1_u16(outptr0 + out_hstep * 3, (uint16x4_t)vcvt_f16_f32(_sum30));
                    vst1_u16(outptr0 + out_hstep * 3 + 4, (uint16x4_t)vcvt_f16_f32(_sum31));
                    vst1_u16(outptr0 + out_hstep * 4, (uint16x4_t)vcvt_f16_f32(_sum40));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, (uint16x4_t)vcvt_f16_f32(_sum41));
                    vst1_u16(outptr0 + out_hstep * 5, (uint16x4_t)vcvt_f16_f32(_sum50));
                    vst1_u16(outptr0 + out_hstep * 5 + 4, (uint16x4_t)vcvt_f16_f32(_sum51));
                    vst1_u16(outptr0 + out_hstep * 6, (uint16x4_t)vcvt_f16_f32(_sum60));
                    vst1_u16(outptr0 + out_hstep * 6 + 4, (uint16x4_t)vcvt_f16_f32(_sum61));
                    vst1_u16(outptr0 + out_hstep * 7, (uint16x4_t)vcvt_f16_f32(_sum70));
                    vst1_u16(outptr0 + out_hstep * 7 + 4, (uint16x4_t)vcvt_f16_f32(_sum71));

                    outptr0 += 8;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
                vst1q_f32(outptr + 4 * 8, _sum40);
                vst1q_f32(outptr + 4 * 9, _sum41);
                vst1q_f32(outptr + 4 * 10, _sum50);
                vst1q_f32(outptr + 4 * 11, _sum51);
                vst1q_f32(outptr + 4 * 12, _sum60);
                vst1q_f32(outptr + 4 * 13, _sum61);
                vst1q_f32(outptr + 4 * 14, _sum70);
                vst1q_f32(outptr + 4 * 15, _sum71);
            }

            outptr += 64;
        }
        // 8x4 tile: every k step consumes 8 fp16 values from pA and 4 from pB.
        // _sumJ0 / _sumJ1 accumulate the low / high four pA lanes times pB lane J.
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum20;
            float32x4_t _sum21;
            float32x4_t _sum30;
            float32x4_t _sum31;

            if (k == 0)
            {
                // first k pass: start from zero, optionally seeded from pC below
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);
                _sum20 = vdupq_n_f32(0.f);
                _sum21 = vdupq_n_f32(0.f);
                _sum30 = vdupq_n_f32(0.f);
                _sum31 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // a single scalar seeds every accumulator lane
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        _sum10 = _sum00;
                        _sum11 = _sum00;
                        _sum20 = _sum00;
                        _sum21 = _sum00;
                        _sum30 = _sum00;
                        _sum31 = _sum00;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // 8 per-lane values (low half / high half) reused for all 4 columns;
                        // pC is not advanced in this branch
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        _sum20 = _sum00;
                        _sum21 = _sum01;
                        _sum30 = _sum00;
                        _sum31 = _sum01;
                    }
                    if (broadcast_type_C == 3)
                    {
                        // a distinct value for every accumulator lane: 32 floats read in
                        // accumulator order, then pC moves on to the next tile
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4 * 1);
                        _sum10 = vld1q_f32(pC + 4 * 2);
                        _sum11 = vld1q_f32(pC + 4 * 3);
                        _sum20 = vld1q_f32(pC + 4 * 4);
                        _sum21 = vld1q_f32(pC + 4 * 5);
                        _sum30 = vld1q_f32(pC + 4 * 6);
                        _sum31 = vld1q_f32(pC + 4 * 7);
                        pC += 32;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar per column J, shared by both halves of that column
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum20 = vdupq_n_f32(pC[2]);
                        _sum30 = vdupq_n_f32(pC[3]);
                        _sum01 = _sum00;
                        _sum11 = _sum10;
                        _sum21 = _sum20;
                        _sum31 = _sum30;
                        pC += 4;
                    }
                }
            }
            else
            {
                // later k pass: reload the fp32 partial sums from outptr, in the same
                // order the non-final branch at the bottom of this loop stores them
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
                _sum20 = vld1q_f32(outptr + 4 * 4);
                _sum21 = vld1q_f32(outptr + 4 * 5);
                _sum30 = vld1q_f32(outptr + 4 * 6);
                _sum31 = vld1q_f32(outptr + 4 * 7);
            }

            // pA restarts from pAT for every column tile, while pB keeps advancing
#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                // widening fp16 x fp16 -> fp32 multiply-accumulate; low/high select
                // which half of _pA is multiplied by the chosen _pB lane
                float16x8_t _pA = vld1q_f16(pA);
                float16x4_t _pB = vld1_f16(pB);

                _sum00 = vfmlalq_lane_low_f16(_sum00, _pA, _pB, 0);
                _sum01 = vfmlalq_lane_high_f16(_sum01, _pA, _pB, 0);
                _sum10 = vfmlalq_lane_low_f16(_sum10, _pA, _pB, 1);
                _sum11 = vfmlalq_lane_high_f16(_sum11, _pA, _pB, 1);
                _sum20 = vfmlalq_lane_low_f16(_sum20, _pA, _pB, 2);
                _sum21 = vfmlalq_lane_high_f16(_sum21, _pA, _pB, 2);
                _sum30 = vfmlalq_lane_low_f16(_sum30, _pA, _pB, 3);
                _sum31 = vfmlalq_lane_high_f16(_sum31, _pA, _pB, 3);
#else
                // fallback: widen the fp16 bit patterns to fp32 first, then fused
                // multiply-add by lane
                uint16x8_t _pA = vld1q_u16(pA);
                float32x4_t _pA0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_pA));
                float32x4_t _pA1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_pA));

                float32x4_t _pB0 = vcvt_f32_f16((float16x4_t)vld1_u16(pB));

                _sum00 = vfmaq_laneq_f32(_sum00, _pA0, _pB0, 0);
                _sum01 = vfmaq_laneq_f32(_sum01, _pA1, _pB0, 0);
                _sum10 = vfmaq_laneq_f32(_sum10, _pA0, _pB0, 1);
                _sum11 = vfmaq_laneq_f32(_sum11, _pA1, _pB0, 1);
                _sum20 = vfmaq_laneq_f32(_sum20, _pA0, _pB0, 2);
                _sum21 = vfmaq_laneq_f32(_sum21, _pA1, _pB0, 2);
                _sum30 = vfmaq_laneq_f32(_sum30, _pA0, _pB0, 3);
                _sum31 = vfmaq_laneq_f32(_sum31, _pA1, _pB0, 3);
#endif

                pA += 8;
                pB += 4;
            }

            // NOTE(review): this scale sits outside the k_end branch, so it runs on every
            // k pass, including passes that only store partial sums - confirm that is intended.
            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum00 = vmulq_f32(_sum00, _alpha);
                _sum01 = vmulq_f32(_sum01, _alpha);
                _sum10 = vmulq_f32(_sum10, _alpha);
                _sum11 = vmulq_f32(_sum11, _alpha);
                _sum20 = vmulq_f32(_sum20, _alpha);
                _sum21 = vmulq_f32(_sum21, _alpha);
                _sum30 = vmulq_f32(_sum30, _alpha);
                _sum31 = vmulq_f32(_sum31, _alpha);
            }

            if (k_end)
            {
                // final k pass: narrow to fp16 and write through outptr0
                if (out_elempack == 4)
                {
                    // the low-half accumulators go to four consecutive 4-element slots
                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum00));
                    vst1_u16(outptr0 + 4, (uint16x4_t)vcvt_f16_f32(_sum10));
                    vst1_u16(outptr0 + 4 * 2, (uint16x4_t)vcvt_f16_f32(_sum20));
                    vst1_u16(outptr0 + 4 * 3, (uint16x4_t)vcvt_f16_f32(_sum30));

                    // the high-half accumulators land out_hstep * 4 elements further on
                    vst1_u16(outptr0 + out_hstep * 4, (uint16x4_t)vcvt_f16_f32(_sum01));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, (uint16x4_t)vcvt_f16_f32(_sum11));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 2, (uint16x4_t)vcvt_f16_f32(_sum21));
                    vst1_u16(outptr0 + out_hstep * 4 + 4 * 3, (uint16x4_t)vcvt_f16_f32(_sum31));

                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    // after transpose8x4_ps the eight vectors are written as eight output
                    // lines spaced out_hstep apart, 4 fp16 values each
                    transpose8x4_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31);

                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum00));
                    vst1_u16(outptr0 + out_hstep * 1, (uint16x4_t)vcvt_f16_f32(_sum01));
                    vst1_u16(outptr0 + out_hstep * 2, (uint16x4_t)vcvt_f16_f32(_sum10));
                    vst1_u16(outptr0 + out_hstep * 3, (uint16x4_t)vcvt_f16_f32(_sum11));
                    vst1_u16(outptr0 + out_hstep * 4, (uint16x4_t)vcvt_f16_f32(_sum20));
                    vst1_u16(outptr0 + out_hstep * 5, (uint16x4_t)vcvt_f16_f32(_sum21));
                    vst1_u16(outptr0 + out_hstep * 6, (uint16x4_t)vcvt_f16_f32(_sum30));
                    vst1_u16(outptr0 + out_hstep * 7, (uint16x4_t)vcvt_f16_f32(_sum31));

                    outptr0 += 4;
                }
            }
            else
            {
                // not the final pass: keep the fp32 partial sums in outptr
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
                vst1q_f32(outptr + 4 * 4, _sum20);
                vst1q_f32(outptr + 4 * 5, _sum21);
                vst1q_f32(outptr + 4 * 6, _sum30);
                vst1q_f32(outptr + 4 * 7, _sum31);
            }

            // outptr steps over one 8x4 fp32 tile whether or not it was written
            outptr += 32;
        }
        // 8x2 tile: every k step consumes 8 fp16 values from pA and 2 from pB.
        // _sumJ0 / _sumJ1 accumulate the low / high four pA lanes times pB[J].
        for (; jj + 1 < max_jj; jj += 2)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;

            if (k == 0)
            {
                // first k pass: start from zero, optionally seeded from pC below
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // a single scalar seeds every accumulator lane
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        _sum10 = _sum00;
                        _sum11 = _sum00;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // 8 per-lane values reused for both columns; pC is not advanced
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                    }
                    if (broadcast_type_C == 3)
                    {
                        // a distinct value for every accumulator lane: 16 floats read in
                        // accumulator order
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4 * 1);
                        _sum10 = vld1q_f32(pC + 4 * 2);
                        _sum11 = vld1q_f32(pC + 4 * 3);
                        pC += 16;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar per column, shared by both halves of that column
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum01 = _sum00;
                        _sum11 = _sum10;
                        pC += 2;
                    }
                }
            }
            else
            {
                // later k pass: reload the fp32 partial sums from outptr, in the same
                // order the non-final branch at the bottom of this loop stores them
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4 * 1);
                _sum10 = vld1q_f32(outptr + 4 * 2);
                _sum11 = vld1q_f32(outptr + 4 * 3);
            }

            // pA restarts from pAT for every column tile, while pB keeps advancing
#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                // The non-lane vfmlalq_low/high forms multiply matching halves of both
                // operands, so two combined vectors are built: _pB01 = {b0 x4, b1 x4} and
                // _pB10 = {b1 x4, b0 x4}. Each call picks the one whose low/high half
                // carries the pB value wanted for that accumulator.
                float16x8_t _pA = vld1q_f16(pA);
                float16x4_t _pB0 = vdup_n_f16(pB[0]);
                float16x4_t _pB1 = vdup_n_f16(pB[1]);
                float16x8_t _pB01 = vcombine_f16(_pB0, _pB1);
                float16x8_t _pB10 = vcombine_f16(_pB1, _pB0);

                _sum00 = vfmlalq_low_f16(_sum00, _pA, _pB01);
                _sum01 = vfmlalq_high_f16(_sum01, _pA, _pB10);
                _sum10 = vfmlalq_low_f16(_sum10, _pA, _pB10);
                _sum11 = vfmlalq_high_f16(_sum11, _pA, _pB01);
#else
                // fallback: widen the fp16 bit patterns to fp32 first, then plain
                // vector fused multiply-add against the broadcast pB values
                uint16x8_t _pA = vld1q_u16(pA);
                float32x4_t _pA0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_pA));
                float32x4_t _pA1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_pA));

                float32x4_t _pB0 = vcvt_f32_f16((float16x4_t)vdup_n_u16(pB[0]));
                float32x4_t _pB1 = vcvt_f32_f16((float16x4_t)vdup_n_u16(pB[1]));

                _sum00 = vfmaq_f32(_sum00, _pA0, _pB0);
                _sum01 = vfmaq_f32(_sum01, _pA1, _pB0);
                _sum10 = vfmaq_f32(_sum10, _pA0, _pB1);
                _sum11 = vfmaq_f32(_sum11, _pA1, _pB1);
#endif

                pA += 8;
                pB += 2;
            }

            // NOTE(review): this scale sits outside the k_end branch, so it runs on every
            // k pass, including passes that only store partial sums - confirm that is intended.
            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum00 = vmulq_f32(_sum00, _alpha);
                _sum01 = vmulq_f32(_sum01, _alpha);
                _sum10 = vmulq_f32(_sum10, _alpha);
                _sum11 = vmulq_f32(_sum11, _alpha);
            }

            if (k_end)
            {
                // final k pass: narrow to fp16 and write through outptr0
                if (out_elempack == 4)
                {
                    // low-half accumulators first, one 4-element slot per column
                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum00));
                    vst1_u16(outptr0 + 4, (uint16x4_t)vcvt_f16_f32(_sum10));

                    // high-half accumulators land out_hstep * 4 elements further on
                    vst1_u16(outptr0 + out_hstep * 4, (uint16x4_t)vcvt_f16_f32(_sum01));
                    vst1_u16(outptr0 + out_hstep * 4 + 4, (uint16x4_t)vcvt_f16_f32(_sum11));
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    // spill both columns to the stack, then scatter element r of each
                    // column to the output line at offset out_hstep * r
                    unsigned short sum0[8];
                    unsigned short sum1[8];
                    vst1_u16(sum0, (uint16x4_t)vcvt_f16_f32(_sum00));
                    vst1_u16(sum0 + 4, (uint16x4_t)vcvt_f16_f32(_sum01));
                    vst1_u16(sum1, (uint16x4_t)vcvt_f16_f32(_sum10));
                    vst1_u16(sum1 + 4, (uint16x4_t)vcvt_f16_f32(_sum11));

                    // first column of this tile
                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[out_hstep * 4] = sum0[4];
                    outptr0[out_hstep * 5] = sum0[5];
                    outptr0[out_hstep * 6] = sum0[6];
                    outptr0[out_hstep * 7] = sum0[7];

                    // second column of this tile
                    outptr0[1] = sum1[0];
                    outptr0[out_hstep + 1] = sum1[1];
                    outptr0[out_hstep * 2 + 1] = sum1[2];
                    outptr0[out_hstep * 3 + 1] = sum1[3];
                    outptr0[out_hstep * 4 + 1] = sum1[4];
                    outptr0[out_hstep * 5 + 1] = sum1[5];
                    outptr0[out_hstep * 6 + 1] = sum1[6];
                    outptr0[out_hstep * 7 + 1] = sum1[7];
                    outptr0 += 2;
                }
            }
            else
            {
                // not the final pass: keep the fp32 partial sums in outptr
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
                vst1q_f32(outptr + 4 * 2, _sum10);
                vst1q_f32(outptr + 4 * 3, _sum11);
            }

            // outptr steps over one 8x2 fp32 tile whether or not it was written
            outptr += 16;
        }
        // 8x1 tile for the leftover column: every k step consumes 8 fp16 values from
        // pA and 1 from pB. _sum00 / _sum01 accumulate the low / high four pA lanes.
        for (; jj < max_jj; jj += 1)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;

            if (k == 0)
            {
                // first k pass: start from zero, optionally seeded from pC below
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        // a single scalar seeds every accumulator lane
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        // 8 per-lane values; pC is not advanced in this branch
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                    }
                    if (broadcast_type_C == 3)
                    {
                        // same 8-float load as above, but pC moves on to the next tile
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        pC += 8;
                    }
                    if (broadcast_type_C == 4)
                    {
                        // one scalar for this column, shared by both halves
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        pC += 1;
                    }
                }
            }
            else
            {
                // later k pass: reload the fp32 partial sums from outptr
                _sum00 = vld1q_f32(outptr);
                _sum01 = vld1q_f32(outptr + 4);
            }

            // pA restarts from pAT for every column tile, while pB keeps advancing
#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                // pB[0] is broadcast to all 8 lanes so both the low and the high
                // widening multiply-accumulate see the same multiplier
                float16x8_t _pA = vld1q_f16(pA);
                float16x8_t _pB = vdupq_n_f16(pB[0]);

                _sum00 = vfmlalq_low_f16(_sum00, _pA, _pB);
                _sum01 = vfmlalq_high_f16(_sum01, _pA, _pB);
#else
                // fallback: widen the fp16 bit patterns to fp32 first, then plain
                // vector fused multiply-add against the broadcast pB value
                uint16x8_t _pA = vld1q_u16(pA);
                float32x4_t _pA0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_pA));
                float32x4_t _pA1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_pA));

                float32x4_t _pB = vcvt_f32_f16((float16x4_t)vld1_dup_u16(pB));

                _sum00 = vfmaq_f32(_sum00, _pA0, _pB);
                _sum01 = vfmaq_f32(_sum01, _pA1, _pB);
#endif

                pA += 8;
                pB += 1;
            }

            // NOTE(review): this scale sits outside the k_end branch, so it runs on every
            // k pass, including passes that only store partial sums - confirm that is intended.
            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum00 = vmulq_f32(_sum00, _alpha);
                _sum01 = vmulq_f32(_sum01, _alpha);
            }

            if (k_end)
            {
                // final k pass: narrow to fp16 and write through outptr0
                if (out_elempack == 4)
                {
                    // low half at outptr0, high half out_hstep * 4 elements further on
                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum00));
                    vst1_u16(outptr0 + out_hstep * 4, (uint16x4_t)vcvt_f16_f32(_sum01));
                    outptr0 += 4;
                }
                if (out_elempack == 1)
                {
                    // spill the column to the stack, then scatter element r to the
                    // output line at offset out_hstep * r
                    unsigned short sum0[8];
                    vst1_u16(sum0, (uint16x4_t)vcvt_f16_f32(_sum00));
                    vst1_u16(sum0 + 4, (uint16x4_t)vcvt_f16_f32(_sum01));

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep * 1] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[out_hstep * 4] = sum0[4];
                    outptr0[out_hstep * 5] = sum0[5];
                    outptr0[out_hstep * 6] = sum0[6];
                    outptr0[out_hstep * 7] = sum0[7];
                    outptr0++;
                }
            }
            else
            {
                // not the final pass: keep the fp32 partial sums in outptr
                vst1q_f32(outptr, _sum00);
                vst1q_f32(outptr + 4, _sum01);
            }

            // outptr steps over one 8x1 fp32 tile whether or not it was written
            outptr += 8;
        }

        pAT += max_kk * 8;
    }
#endif // __aarch64__
    for (; ii + 3 < max_ii; ii += 4)
    {
        unsigned short* outptr0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack;

#if __ARM_FEATURE_FP16_FML
        const __fp16* pB = pBT;
#else
        const unsigned short* pB = pBT;
#endif

        if (pC)
        {
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)CT_tile + i + ii;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)CT_tile + j;
            }
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;
            float32x4_t _sum3;
            float32x4_t _sum4;
            float32x4_t _sum5;
            float32x4_t _sum6;
            float32x4_t _sum7;
            float32x4_t _sum8;
            float32x4_t _sum9;
            float32x4_t _suma;
            float32x4_t _sumb;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);
                _sum2 = vdupq_n_f32(0.f);
                _sum3 = vdupq_n_f32(0.f);
                _sum4 = vdupq_n_f32(0.f);
                _sum5 = vdupq_n_f32(0.f);
                _sum6 = vdupq_n_f32(0.f);
                _sum7 = vdupq_n_f32(0.f);
                _sum8 = vdupq_n_f32(0.f);
                _sum9 = vdupq_n_f32(0.f);
                _suma = vdupq_n_f32(0.f);
                _sumb = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                        _sum4 = _sum0;
                        _sum5 = _sum0;
                        _sum6 = _sum0;
                        _sum7 = _sum0;
                        _sum8 = _sum0;
                        _sum9 = _sum0;
                        _suma = _sum0;
                        _sumb = _sum0;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                        _sum4 = _sum0;
                        _sum5 = _sum0;
                        _sum6 = _sum0;
                        _sum7 = _sum0;
                        _sum8 = _sum0;
                        _sum9 = _sum0;
                        _suma = _sum0;
                        _sumb = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        _sum2 = vld1q_f32(pC + 8);
                        _sum3 = vld1q_f32(pC + 12);
                        _sum4 = vld1q_f32(pC + 16);
                        _sum5 = vld1q_f32(pC + 20);
                        _sum6 = vld1q_f32(pC + 24);
                        _sum7 = vld1q_f32(pC + 28);
                        _sum8 = vld1q_f32(pC + 32);
                        _sum9 = vld1q_f32(pC + 36);
                        _suma = vld1q_f32(pC + 40);
                        _sumb = vld1q_f32(pC + 44);
                        pC += 48;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[1]);
                        _sum2 = vdupq_n_f32(pC[2]);
                        _sum3 = vdupq_n_f32(pC[3]);
                        _sum4 = vdupq_n_f32(pC[4]);
                        _sum5 = vdupq_n_f32(pC[5]);
                        _sum6 = vdupq_n_f32(pC[6]);
                        _sum7 = vdupq_n_f32(pC[7]);
                        _sum8 = vdupq_n_f32(pC[8]);
                        _sum9 = vdupq_n_f32(pC[9]);
                        _suma = vdupq_n_f32(pC[10]);
                        _sumb = vdupq_n_f32(pC[11]);
                        pC += 12;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4 * 1);
                _sum2 = vld1q_f32(outptr + 4 * 2);
                _sum3 = vld1q_f32(outptr + 4 * 3);
                _sum4 = vld1q_f32(outptr + 4 * 4);
                _sum5 = vld1q_f32(outptr + 4 * 5);
                _sum6 = vld1q_f32(outptr + 4 * 6);
                _sum7 = vld1q_f32(outptr + 4 * 7);
                _sum8 = vld1q_f32(outptr + 4 * 8);
                _sum9 = vld1q_f32(outptr + 4 * 9);
                _suma = vld1q_f32(outptr + 4 * 10);
                _sumb = vld1q_f32(outptr + 4 * 11);
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                float16x4_t _pA = vld1_f16(pA);
                float16x8_t _pAA = vcombine_f16(_pA, _pA);

                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);
                float16x4_t _pB2 = vld1_f16(pB + 8);

                _sum0 = vfmlalq_lane_low_f16(_sum0, _pAA, _pB0, 0);
                _sum1 = vfmlalq_lane_low_f16(_sum1, _pAA, _pB0, 1);
                _sum2 = vfmlalq_lane_low_f16(_sum2, _pAA, _pB0, 2);
                _sum3 = vfmlalq_lane_low_f16(_sum3, _pAA, _pB0, 3);
                _sum4 = vfmlalq_lane_low_f16(_sum4, _pAA, _pB1, 0);
                _sum5 = vfmlalq_lane_low_f16(_sum5, _pAA, _pB1, 1);
                _sum6 = vfmlalq_lane_low_f16(_sum6, _pAA, _pB1, 2);
                _sum7 = vfmlalq_lane_low_f16(_sum7, _pAA, _pB1, 3);
                _sum8 = vfmlalq_lane_low_f16(_sum8, _pAA, _pB2, 0);
                _sum9 = vfmlalq_lane_low_f16(_sum9, _pAA, _pB2, 1);
                _suma = vfmlalq_lane_low_f16(_suma, _pAA, _pB2, 2);
                _sumb = vfmlalq_lane_low_f16(_sumb, _pAA, _pB2, 3);

                pA += 4;
                pB += 12;
#else
#if __aarch64__
                float32x4_t _pA = vcvt_f32_f16((float16x4_t)vld1_u16(pA));
                float32x4_t _pB0 = vcvt_f32_f16((float16x4_t)vld1_u16(pB));
                float32x4_t _pB1 = vcvt_f32_f16((float16x4_t)vld1_u16(pB + 4));
                float32x4_t _pB2 = vcvt_f32_f16((float16x4_t)vld1_u16(pB + 8));

                _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB0, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB0, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB0, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB0, 3);
                _sum4 = vfmaq_laneq_f32(_sum4, _pA, _pB1, 0);
                _sum5 = vfmaq_laneq_f32(_sum5, _pA, _pB1, 1);
                _sum6 = vfmaq_laneq_f32(_sum6, _pA, _pB1, 2);
                _sum7 = vfmaq_laneq_f32(_sum7, _pA, _pB1, 3);
                _sum8 = vfmaq_laneq_f32(_sum8, _pA, _pB2, 0);
                _sum9 = vfmaq_laneq_f32(_sum9, _pA, _pB2, 1);
                _suma = vfmaq_laneq_f32(_suma, _pA, _pB2, 2);
                _sumb = vfmaq_laneq_f32(_sumb, _pA, _pB2, 3);

                pA += 4;
                pB += 12;
#else // __aarch64__
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "pld        [%0, #64]       \n"
                    "pld        [%1, #192]      \n"
                    "vld1.u16   {d6}, [%0 :64]! \n"
                    "vld1.u16   {d2-d4}, [%1 :64]! \n"
                    "vcvt.f32.f16 q3, d6        \n"
                    "vcvt.f32.f16 q0, d2        \n"
                    "vcvt.f32.f16 q1, d3        \n"
                    "vcvt.f32.f16 q2, d4        \n"
                    "vmla.f32   %q2, q3, d0[0]  \n"
                    "vmla.f32   %q3, q3, d0[1]  \n"
                    "vmla.f32   %q4, q3, d1[0]  \n"
                    "vmla.f32   %q5, q3, d1[1]  \n"
                    "vmla.f32   %q6, q3, d2[0]  \n"
                    "vmla.f32   %q7, q3, d2[1]  \n"
                    "vmla.f32   %q8, q3, d3[0]  \n"
                    "vmla.f32   %q9, q3, d3[1]  \n"
                    "vmla.f32   %q10, q3, d4[0] \n"
                    "vmla.f32   %q11, q3, d4[1] \n"
                    "vmla.f32   %q12, q3, d5[0] \n"
                    "vmla.f32   %q13, q3, d5[1] \n"
                    : "=r"(pA),
                    "=r"(pB),
                    "=w"(_sum0),
                    "=w"(_sum1),
                    "=w"(_sum2),
                    "=w"(_sum3),
                    "=w"(_sum4),
                    "=w"(_sum5),
                    "=w"(_sum6),
                    "=w"(_sum7),
                    "=w"(_sum8),
                    "=w"(_sum9),
                    "=w"(_suma),
                    "=w"(_sumb)
                    : "0"(pA),
                    "1"(pB),
                    "2"(_sum0),
                    "3"(_sum1),
                    "4"(_sum2),
                    "5"(_sum3),
                    "6"(_sum4),
                    "7"(_sum5),
                    "8"(_sum6),
                    "9"(_sum7),
                    "10"(_sum8),
                    "11"(_sum9),
                    "12"(_suma),
                    "13"(_sumb)
                    : "memory", "q0", "q1", "q2", "q3");
#else
                float32x4_t _pA = vcvt_f32_f16((float16x4_t)vld1_u16(pA));
                float32x4_t _pB0 = vcvt_f32_f16((float16x4_t)vld1_u16(pB));
                float32x4_t _pB1 = vcvt_f32_f16((float16x4_t)vld1_u16(pB + 4));
                float32x4_t _pB2 = vcvt_f32_f16((float16x4_t)vld1_u16(pB + 8));

                _sum0 = vmlaq_lane_f32(_sum0, _pA, vget_low_f32(_pB0), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pA, vget_low_f32(_pB0), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _pA, vget_high_f32(_pB0), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _pA, vget_high_f32(_pB0), 1);
                _sum4 = vmlaq_lane_f32(_sum4, _pA, vget_low_f32(_pB1), 0);
                _sum5 = vmlaq_lane_f32(_sum5, _pA, vget_low_f32(_pB1), 1);
                _sum6 = vmlaq_lane_f32(_sum6, _pA, vget_high_f32(_pB1), 0);
                _sum7 = vmlaq_lane_f32(_sum7, _pA, vget_high_f32(_pB1), 1);
                _sum8 = vmlaq_lane_f32(_sum8, _pA, vget_low_f32(_pB2), 0);
                _sum9 = vmlaq_lane_f32(_sum9, _pA, vget_low_f32(_pB2), 1);
                _suma = vmlaq_lane_f32(_suma, _pA, vget_high_f32(_pB2), 0);
                _sumb = vmlaq_lane_f32(_sumb, _pA, vget_high_f32(_pB2), 1);

                pA += 4;
                pB += 12;
#endif
#endif // __aarch64__
#endif
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum0 = vmulq_f32(_sum0, _alpha);
                _sum1 = vmulq_f32(_sum1, _alpha);
                _sum2 = vmulq_f32(_sum2, _alpha);
                _sum3 = vmulq_f32(_sum3, _alpha);
                _sum4 = vmulq_f32(_sum4, _alpha);
                _sum5 = vmulq_f32(_sum5, _alpha);
                _sum6 = vmulq_f32(_sum6, _alpha);
                _sum7 = vmulq_f32(_sum7, _alpha);
                _sum8 = vmulq_f32(_sum8, _alpha);
                _sum9 = vmulq_f32(_sum9, _alpha);
                _suma = vmulq_f32(_suma, _alpha);
                _sumb = vmulq_f32(_sumb, _alpha);
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum0));
                    vst1_u16(outptr0 + 4, (uint16x4_t)vcvt_f16_f32(_sum1));
                    vst1_u16(outptr0 + 4 * 2, (uint16x4_t)vcvt_f16_f32(_sum2));
                    vst1_u16(outptr0 + 4 * 3, (uint16x4_t)vcvt_f16_f32(_sum3));
                    vst1_u16(outptr0 + 4 * 4, (uint16x4_t)vcvt_f16_f32(_sum4));
                    vst1_u16(outptr0 + 4 * 5, (uint16x4_t)vcvt_f16_f32(_sum5));
                    vst1_u16(outptr0 + 4 * 6, (uint16x4_t)vcvt_f16_f32(_sum6));
                    vst1_u16(outptr0 + 4 * 7, (uint16x4_t)vcvt_f16_f32(_sum7));
                    vst1_u16(outptr0 + 4 * 8, (uint16x4_t)vcvt_f16_f32(_sum8));
                    vst1_u16(outptr0 + 4 * 9, (uint16x4_t)vcvt_f16_f32(_sum9));
                    vst1_u16(outptr0 + 4 * 10, (uint16x4_t)vcvt_f16_f32(_suma));
                    vst1_u16(outptr0 + 4 * 11, (uint16x4_t)vcvt_f16_f32(_sumb));
                    outptr0 += 48;
                }
                if (out_elempack == 1)
                {
                    transpose4x12_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, _sum8, _sum9, _suma, _sumb);

                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum0));
                    vst1_u16(outptr0 + 4, (uint16x4_t)vcvt_f16_f32(_sum1));
                    vst1_u16(outptr0 + 8, (uint16x4_t)vcvt_f16_f32(_sum2));
                    vst1_u16(outptr0 + out_hstep, (uint16x4_t)vcvt_f16_f32(_sum3));
                    vst1_u16(outptr0 + out_hstep + 4, (uint16x4_t)vcvt_f16_f32(_sum4));
                    vst1_u16(outptr0 + out_hstep + 8, (uint16x4_t)vcvt_f16_f32(_sum5));
                    vst1_u16(outptr0 + out_hstep * 2, (uint16x4_t)vcvt_f16_f32(_sum6));
                    vst1_u16(outptr0 + out_hstep * 2 + 4, (uint16x4_t)vcvt_f16_f32(_sum7));
                    vst1_u16(outptr0 + out_hstep * 2 + 8, (uint16x4_t)vcvt_f16_f32(_sum8));
                    vst1_u16(outptr0 + out_hstep * 3, (uint16x4_t)vcvt_f16_f32(_sum9));
                    vst1_u16(outptr0 + out_hstep * 3 + 4, (uint16x4_t)vcvt_f16_f32(_suma));
                    vst1_u16(outptr0 + out_hstep * 3 + 8, (uint16x4_t)vcvt_f16_f32(_sumb));
                    outptr0 += 12;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
                vst1q_f32(outptr + 4 * 4, _sum4);
                vst1q_f32(outptr + 4 * 5, _sum5);
                vst1q_f32(outptr + 4 * 6, _sum6);
                vst1q_f32(outptr + 4 * 7, _sum7);
                vst1q_f32(outptr + 4 * 8, _sum8);
                vst1q_f32(outptr + 4 * 9, _sum9);
                vst1q_f32(outptr + 4 * 10, _suma);
                vst1q_f32(outptr + 4 * 11, _sumb);
            }

            outptr += 48;
        }
#endif // __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;
            float32x4_t _sum3;
            float32x4_t _sum4;
            float32x4_t _sum5;
            float32x4_t _sum6;
            float32x4_t _sum7;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);
                _sum2 = vdupq_n_f32(0.f);
                _sum3 = vdupq_n_f32(0.f);
                _sum4 = vdupq_n_f32(0.f);
                _sum5 = vdupq_n_f32(0.f);
                _sum6 = vdupq_n_f32(0.f);
                _sum7 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                        _sum4 = _sum0;
                        _sum5 = _sum0;
                        _sum6 = _sum0;
                        _sum7 = _sum0;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                        _sum4 = _sum0;
                        _sum5 = _sum0;
                        _sum6 = _sum0;
                        _sum7 = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        _sum2 = vld1q_f32(pC + 8);
                        _sum3 = vld1q_f32(pC + 12);
                        _sum4 = vld1q_f32(pC + 16);
                        _sum5 = vld1q_f32(pC + 20);
                        _sum6 = vld1q_f32(pC + 24);
                        _sum7 = vld1q_f32(pC + 28);
                        pC += 32;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[1]);
                        _sum2 = vdupq_n_f32(pC[2]);
                        _sum3 = vdupq_n_f32(pC[3]);
                        _sum4 = vdupq_n_f32(pC[4]);
                        _sum5 = vdupq_n_f32(pC[5]);
                        _sum6 = vdupq_n_f32(pC[6]);
                        _sum7 = vdupq_n_f32(pC[7]);
                        pC += 8;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4 * 1);
                _sum2 = vld1q_f32(outptr + 4 * 2);
                _sum3 = vld1q_f32(outptr + 4 * 3);
                _sum4 = vld1q_f32(outptr + 4 * 4);
                _sum5 = vld1q_f32(outptr + 4 * 5);
                _sum6 = vld1q_f32(outptr + 4 * 6);
                _sum7 = vld1q_f32(outptr + 4 * 7);
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                float16x4_t _pA = vld1_f16(pA);
                float16x8_t _pAA = vcombine_f16(_pA, _pA);

                float16x4_t _pB0 = vld1_f16(pB);
                float16x4_t _pB1 = vld1_f16(pB + 4);

                _sum0 = vfmlalq_lane_low_f16(_sum0, _pAA, _pB0, 0);
                _sum1 = vfmlalq_lane_low_f16(_sum1, _pAA, _pB0, 1);
                _sum2 = vfmlalq_lane_low_f16(_sum2, _pAA, _pB0, 2);
                _sum3 = vfmlalq_lane_low_f16(_sum3, _pAA, _pB0, 3);
                _sum4 = vfmlalq_lane_low_f16(_sum4, _pAA, _pB1, 0);
                _sum5 = vfmlalq_lane_low_f16(_sum5, _pAA, _pB1, 1);
                _sum6 = vfmlalq_lane_low_f16(_sum6, _pAA, _pB1, 2);
                _sum7 = vfmlalq_lane_low_f16(_sum7, _pAA, _pB1, 3);
#else
                float32x4_t _pA = vcvt_f32_f16((float16x4_t)vld1_u16(pA));
                float32x4_t _pB0 = vcvt_f32_f16((float16x4_t)vld1_u16(pB));
                float32x4_t _pB1 = vcvt_f32_f16((float16x4_t)vld1_u16(pB + 4));

#if __aarch64__
                _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB0, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB0, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB0, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB0, 3);
                _sum4 = vfmaq_laneq_f32(_sum4, _pA, _pB1, 0);
                _sum5 = vfmaq_laneq_f32(_sum5, _pA, _pB1, 1);
                _sum6 = vfmaq_laneq_f32(_sum6, _pA, _pB1, 2);
                _sum7 = vfmaq_laneq_f32(_sum7, _pA, _pB1, 3);
#else
                _sum0 = vmlaq_lane_f32(_sum0, _pA, vget_low_f32(_pB0), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pA, vget_low_f32(_pB0), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _pA, vget_high_f32(_pB0), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _pA, vget_high_f32(_pB0), 1);
                _sum4 = vmlaq_lane_f32(_sum4, _pA, vget_low_f32(_pB1), 0);
                _sum5 = vmlaq_lane_f32(_sum5, _pA, vget_low_f32(_pB1), 1);
                _sum6 = vmlaq_lane_f32(_sum6, _pA, vget_high_f32(_pB1), 0);
                _sum7 = vmlaq_lane_f32(_sum7, _pA, vget_high_f32(_pB1), 1);
#endif
#endif

                pA += 4;
                pB += 8;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum0 = vmulq_f32(_sum0, _alpha);
                _sum1 = vmulq_f32(_sum1, _alpha);
                _sum2 = vmulq_f32(_sum2, _alpha);
                _sum3 = vmulq_f32(_sum3, _alpha);
                _sum4 = vmulq_f32(_sum4, _alpha);
                _sum5 = vmulq_f32(_sum5, _alpha);
                _sum6 = vmulq_f32(_sum6, _alpha);
                _sum7 = vmulq_f32(_sum7, _alpha);
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum0));
                    vst1_u16(outptr0 + 4, (uint16x4_t)vcvt_f16_f32(_sum1));
                    vst1_u16(outptr0 + 4 * 2, (uint16x4_t)vcvt_f16_f32(_sum2));
                    vst1_u16(outptr0 + 4 * 3, (uint16x4_t)vcvt_f16_f32(_sum3));
                    vst1_u16(outptr0 + 4 * 4, (uint16x4_t)vcvt_f16_f32(_sum4));
                    vst1_u16(outptr0 + 4 * 5, (uint16x4_t)vcvt_f16_f32(_sum5));
                    vst1_u16(outptr0 + 4 * 6, (uint16x4_t)vcvt_f16_f32(_sum6));
                    vst1_u16(outptr0 + 4 * 7, (uint16x4_t)vcvt_f16_f32(_sum7));
                    outptr0 += 32;
                }
                if (out_elempack == 1)
                {
                    transpose4x8_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7);

                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum0));
                    vst1_u16(outptr0 + 4, (uint16x4_t)vcvt_f16_f32(_sum1));
                    vst1_u16(outptr0 + out_hstep, (uint16x4_t)vcvt_f16_f32(_sum2));
                    vst1_u16(outptr0 + out_hstep + 4, (uint16x4_t)vcvt_f16_f32(_sum3));
                    vst1_u16(outptr0 + out_hstep * 2, (uint16x4_t)vcvt_f16_f32(_sum4));
                    vst1_u16(outptr0 + out_hstep * 2 + 4, (uint16x4_t)vcvt_f16_f32(_sum5));
                    vst1_u16(outptr0 + out_hstep * 3, (uint16x4_t)vcvt_f16_f32(_sum6));
                    vst1_u16(outptr0 + out_hstep * 3 + 4, (uint16x4_t)vcvt_f16_f32(_sum7));
                    outptr0 += 8;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
                vst1q_f32(outptr + 4 * 4, _sum4);
                vst1q_f32(outptr + 4 * 5, _sum5);
                vst1q_f32(outptr + 4 * 6, _sum6);
                vst1q_f32(outptr + 4 * 7, _sum7);
            }

            outptr += 32;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;
            float32x4_t _sum3;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);
                _sum2 = vdupq_n_f32(0.f);
                _sum3 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = _sum0;
                        _sum2 = _sum0;
                        _sum3 = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        _sum2 = vld1q_f32(pC + 8);
                        _sum3 = vld1q_f32(pC + 12);
                        pC += 16;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[1]);
                        _sum2 = vdupq_n_f32(pC[2]);
                        _sum3 = vdupq_n_f32(pC[3]);
                        pC += 4;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4 * 1);
                _sum2 = vld1q_f32(outptr + 4 * 2);
                _sum3 = vld1q_f32(outptr + 4 * 3);
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                float16x4_t _pA = vld1_f16(pA);
                float16x8_t _pAA = vcombine_f16(_pA, _pA);

                float16x4_t _pB0 = vld1_f16(pB);

                _sum0 = vfmlalq_lane_low_f16(_sum0, _pAA, _pB0, 0);
                _sum1 = vfmlalq_lane_low_f16(_sum1, _pAA, _pB0, 1);
                _sum2 = vfmlalq_lane_low_f16(_sum2, _pAA, _pB0, 2);
                _sum3 = vfmlalq_lane_low_f16(_sum3, _pAA, _pB0, 3);
#else
                float32x4_t _pA = vcvt_f32_f16((float16x4_t)vld1_u16(pA));
                float32x4_t _pB = vcvt_f32_f16((float16x4_t)vld1_u16(pB));

#if __aarch64__
                _sum0 = vfmaq_laneq_f32(_sum0, _pA, _pB, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _pA, _pB, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _pA, _pB, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _pA, _pB, 3);
#else
                _sum0 = vmlaq_lane_f32(_sum0, _pA, vget_low_f32(_pB), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _pA, vget_low_f32(_pB), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _pA, vget_high_f32(_pB), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _pA, vget_high_f32(_pB), 1);
#endif
#endif

                pA += 4;
                pB += 4;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum0 = vmulq_f32(_sum0, _alpha);
                _sum1 = vmulq_f32(_sum1, _alpha);
                _sum2 = vmulq_f32(_sum2, _alpha);
                _sum3 = vmulq_f32(_sum3, _alpha);
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum0));
                    vst1_u16(outptr0 + 4, (uint16x4_t)vcvt_f16_f32(_sum1));
                    vst1_u16(outptr0 + 4 * 2, (uint16x4_t)vcvt_f16_f32(_sum2));
                    vst1_u16(outptr0 + 4 * 3, (uint16x4_t)vcvt_f16_f32(_sum3));
                    outptr0 += 16;
                }
                if (out_elempack == 1)
                {
                    transpose4x4_ps(_sum0, _sum1, _sum2, _sum3);

                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum0));
                    vst1_u16(outptr0 + out_hstep, (uint16x4_t)vcvt_f16_f32(_sum1));
                    vst1_u16(outptr0 + out_hstep * 2, (uint16x4_t)vcvt_f16_f32(_sum2));
                    vst1_u16(outptr0 + out_hstep * 3, (uint16x4_t)vcvt_f16_f32(_sum3));
                    outptr0 += 4;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 4 * 2, _sum2);
                vst1q_f32(outptr + 4 * 3, _sum3);
            }

            outptr += 16;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = _sum0;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = _sum0;
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        pC += 8;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[1]);
                        pC += 2;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4);
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                float16x4_t _pA = vld1_f16(pA);
                float16x8_t _pAA = vcombine_f16(_pA, _pA);

                float16x4_t _pB0 = vdup_n_f16(pB[0]);
                float16x4_t _pB1 = vdup_n_f16(pB[1]);
                float16x8_t _pB01 = vcombine_f16(_pB0, _pB1);

                _sum0 = vfmlalq_low_f16(_sum0, _pAA, _pB01);
                _sum1 = vfmlalq_high_f16(_sum1, _pAA, _pB01);
#else
                float32x4_t _pA = vcvt_f32_f16((float16x4_t)vld1_u16(pA));
                float32x4_t _pB0 = vcvt_f32_f16((float16x4_t)vdup_n_u16(pB[0]));
                float32x4_t _pB1 = vcvt_f32_f16((float16x4_t)vdup_n_u16(pB[1]));

#if __aarch64__
                _sum0 = vfmaq_f32(_sum0, _pA, _pB0);
                _sum1 = vfmaq_f32(_sum1, _pA, _pB1);
#else
                _sum0 = vmlaq_f32(_sum0, _pA, _pB0);
                _sum1 = vmlaq_f32(_sum1, _pA, _pB1);
#endif
#endif

                pA += 4;
                pB += 2;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum0 = vmulq_f32(_sum0, _alpha);
                _sum1 = vmulq_f32(_sum1, _alpha);
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum0));
                    vst1_u16(outptr0 + 4, (uint16x4_t)vcvt_f16_f32(_sum1));
                    outptr0 += 8;
                }
                if (out_elempack == 1)
                {
                    unsigned short sum0[4];
                    unsigned short sum1[4];
                    vst1_u16(sum0, (uint16x4_t)vcvt_f16_f32(_sum0));
                    vst1_u16(sum1, (uint16x4_t)vcvt_f16_f32(_sum1));

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0[1] = sum1[0];
                    outptr0[out_hstep + 1] = sum1[1];
                    outptr0[out_hstep * 2 + 1] = sum1[2];
                    outptr0[out_hstep * 3 + 1] = sum1[3];
                    outptr0 += 2;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
            }

            outptr += 8;
        }
        for (; jj < max_jj; jj += 1)
        {
            float32x4_t _sum0;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vld1q_f32(pC);
                    }
                    if (broadcast_type_C == 3)
                    {
                        _sum0 = vld1q_f32(pC);
                        pC += 4;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        pC += 1;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                float16x4_t _pA = vld1_f16(pA);
                float16x8_t _pAA = vcombine_f16(_pA, _pA);

                float16x8_t _pB = vdupq_n_f16(pB[0]);

                _sum0 = vfmlalq_low_f16(_sum0, _pAA, _pB);
#else
                float32x4_t _pA = vcvt_f32_f16((float16x4_t)vld1_u16(pA));
                float32x4_t _pB = vcvt_f32_f16((float16x4_t)vdup_n_u16(pB[0]));

#if __aarch64__
                _sum0 = vfmaq_f32(_sum0, _pA, _pB);
#else
                _sum0 = vmlaq_f32(_sum0, _pA, _pB);
#endif
#endif

                pA += 4;
                pB += 1;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum0 = vmulq_f32(_sum0, _alpha);
            }

            if (k_end)
            {
                if (out_elempack == 4)
                {
                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum0));
                    outptr0 += 4;
                }
                if (out_elempack == 1)
                {
                    unsigned short sum0[4];
                    vst1_u16(sum0, (uint16x4_t)vcvt_f16_f32(_sum0));

                    outptr0[0] = sum0[0];
                    outptr0[out_hstep] = sum0[1];
                    outptr0[out_hstep * 2] = sum0[2];
                    outptr0[out_hstep * 3] = sum0[3];
                    outptr0++;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
            }

            outptr += 4;
        }

        pAT += max_kk * 4;
    }
    for (; ii + 1 < max_ii; ii += 2)
    {
        unsigned short* outptr0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j;

#if __ARM_FEATURE_FP16_FML
        const __fp16* pB = pBT;
#else
        const unsigned short* pB = pBT;
#endif

        if (pC)
        {
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)CT_tile + i + ii;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)CT_tile + j;
            }
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum02;
            float32x4_t _sum10;
            float32x4_t _sum11;
            float32x4_t _sum12;

            if (k == 0)
            {
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum02 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);
                _sum12 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        _sum02 = _sum00;
                        _sum10 = _sum00;
                        _sum11 = _sum00;
                        _sum12 = _sum00;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        _sum02 = _sum00;
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum11 = _sum10;
                        _sum12 = _sum10;
                    }
                    if (broadcast_type_C == 3)
                    {
                        float32x4x2_t _tmp01 = vld2q_f32(pC);
                        float32x4x2_t _tmp23 = vld2q_f32(pC + 8);
                        float32x4x2_t _tmp45 = vld2q_f32(pC + 16);
                        _sum00 = _tmp01.val[0];
                        _sum01 = _tmp23.val[0];
                        _sum02 = _tmp45.val[0];
                        _sum10 = _tmp01.val[1];
                        _sum11 = _tmp23.val[1];
                        _sum12 = _tmp45.val[1];
                        pC += 24;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum02 = vld1q_f32(pC + 8);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        _sum12 = _sum02;
                        pC += 12;
                    }
                }
            }
            else
            {
                float32x4x2_t _tmp01 = vld2q_f32(outptr);
                float32x4x2_t _tmp23 = vld2q_f32(outptr + 8);
                float32x4x2_t _tmp45 = vld2q_f32(outptr + 16);
                _sum00 = _tmp01.val[0];
                _sum01 = _tmp23.val[0];
                _sum02 = _tmp45.val[0];
                _sum10 = _tmp01.val[1];
                _sum11 = _tmp23.val[1];
                _sum12 = _tmp45.val[1];
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                float16x8_t _pB01 = vld1q_f16(pB);
                float16x4_t _pB2 = vld1_f16(pB + 8);
                float16x8_t _pB22 = vcombine_f16(_pB2, _pB2);

                float16x4_t _pA0 = vdup_n_f16(pA[0]);
                float16x4_t _pA1 = vdup_n_f16(pA[1]);
                float16x8_t _pA01 = vcombine_f16(_pA0, _pA1);
                float16x8_t _pA10 = vcombine_f16(_pA1, _pA0);

                _sum00 = vfmlalq_low_f16(_sum00, _pB01, _pA01);
                _sum01 = vfmlalq_high_f16(_sum01, _pB01, _pA10);
                _sum02 = vfmlalq_low_f16(_sum02, _pB22, _pA01);
                _sum10 = vfmlalq_low_f16(_sum10, _pB01, _pA10);
                _sum11 = vfmlalq_high_f16(_sum11, _pB01, _pA01);
                _sum12 = vfmlalq_low_f16(_sum12, _pB22, _pA10);
#else
                uint16x8_t _pB = vld1q_u16(pB);
                float32x4_t _pB0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_pB));
                float32x4_t _pB1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_pB));
                float32x4_t _pB2 = vcvt_f32_f16((float16x4_t)vld1_u16(pB + 8));

                float32x4_t _pA0 = vcvt_f32_f16((float16x4_t)vdup_n_u16(pA[0]));
                float32x4_t _pA1 = vcvt_f32_f16((float16x4_t)vdup_n_u16(pA[1]));
#if __aarch64__
                _sum00 = vfmaq_f32(_sum00, _pB0, _pA0);
                _sum01 = vfmaq_f32(_sum01, _pB1, _pA0);
                _sum02 = vfmaq_f32(_sum02, _pB2, _pA0);
                _sum10 = vfmaq_f32(_sum10, _pB0, _pA1);
                _sum11 = vfmaq_f32(_sum11, _pB1, _pA1);
                _sum12 = vfmaq_f32(_sum12, _pB2, _pA1);
#else
                _sum00 = vmlaq_f32(_sum00, _pB0, _pA0);
                _sum01 = vmlaq_f32(_sum01, _pB1, _pA0);
                _sum02 = vmlaq_f32(_sum02, _pB2, _pA0);
                _sum10 = vmlaq_f32(_sum10, _pB0, _pA1);
                _sum11 = vmlaq_f32(_sum11, _pB1, _pA1);
                _sum12 = vmlaq_f32(_sum12, _pB2, _pA1);
#endif
#endif

                pA += 2;
                pB += 12;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum00 = vmulq_f32(_sum00, _alpha);
                _sum01 = vmulq_f32(_sum01, _alpha);
                _sum02 = vmulq_f32(_sum02, _alpha);
                _sum10 = vmulq_f32(_sum10, _alpha);
                _sum11 = vmulq_f32(_sum11, _alpha);
                _sum12 = vmulq_f32(_sum12, _alpha);
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum00));
                    vst1_u16(outptr0 + 4, (uint16x4_t)vcvt_f16_f32(_sum01));
                    vst1_u16(outptr0 + 8, (uint16x4_t)vcvt_f16_f32(_sum02));
                    vst1_u16(outptr0 + out_hstep, (uint16x4_t)vcvt_f16_f32(_sum10));
                    vst1_u16(outptr0 + out_hstep + 4, (uint16x4_t)vcvt_f16_f32(_sum11));
                    vst1_u16(outptr0 + out_hstep + 8, (uint16x4_t)vcvt_f16_f32(_sum12));
                    outptr0 += 12;
                }
            }
            else
            {
                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum00;
                _tmp01.val[1] = _sum10;
                float32x4x2_t _tmp23;
                _tmp23.val[0] = _sum01;
                _tmp23.val[1] = _sum11;
                float32x4x2_t _tmp45;
                _tmp45.val[0] = _sum02;
                _tmp45.val[1] = _sum12;
                vst2q_f32(outptr, _tmp01);
                vst2q_f32(outptr + 8, _tmp23);
                vst2q_f32(outptr + 16, _tmp45);
            }

            outptr += 24;
        }
#endif // __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum00;
            float32x4_t _sum01;
            float32x4_t _sum10;
            float32x4_t _sum11;

            if (k == 0)
            {
                _sum00 = vdupq_n_f32(0.f);
                _sum01 = vdupq_n_f32(0.f);
                _sum10 = vdupq_n_f32(0.f);
                _sum11 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        _sum10 = _sum00;
                        _sum11 = _sum00;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum00 = vdupq_n_f32(pC[0]);
                        _sum01 = _sum00;
                        _sum10 = vdupq_n_f32(pC[1]);
                        _sum11 = _sum10;
                    }
                    if (broadcast_type_C == 3)
                    {
                        float32x4x2_t _tmp01 = vld2q_f32(pC);
                        float32x4x2_t _tmp23 = vld2q_f32(pC + 8);
                        _sum00 = _tmp01.val[0];
                        _sum01 = _tmp23.val[0];
                        _sum10 = _tmp01.val[1];
                        _sum11 = _tmp23.val[1];
                        pC += 16;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum00 = vld1q_f32(pC);
                        _sum01 = vld1q_f32(pC + 4);
                        _sum10 = _sum00;
                        _sum11 = _sum01;
                        pC += 8;
                    }
                }
            }
            else
            {
                float32x4x2_t _tmp01 = vld2q_f32(outptr);
                float32x4x2_t _tmp23 = vld2q_f32(outptr + 8);
                _sum00 = _tmp01.val[0];
                _sum01 = _tmp23.val[0];
                _sum10 = _tmp01.val[1];
                _sum11 = _tmp23.val[1];
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                float16x8_t _pB01 = vld1q_f16(pB);

                float16x4_t _pA0 = vdup_n_f16(pA[0]);
                float16x4_t _pA1 = vdup_n_f16(pA[1]);
                float16x8_t _pA01 = vcombine_f16(_pA0, _pA1);
                float16x8_t _pA10 = vcombine_f16(_pA1, _pA0);

                _sum00 = vfmlalq_low_f16(_sum00, _pB01, _pA01);
                _sum01 = vfmlalq_high_f16(_sum01, _pB01, _pA10);
                _sum10 = vfmlalq_low_f16(_sum10, _pB01, _pA10);
                _sum11 = vfmlalq_high_f16(_sum11, _pB01, _pA01);
#else
                uint16x8_t _pB = vld1q_u16(pB);
                float32x4_t _pB0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_pB));
                float32x4_t _pB1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_pB));

                float32x4_t _pA0 = vcvt_f32_f16((float16x4_t)vdup_n_u16(pA[0]));
                float32x4_t _pA1 = vcvt_f32_f16((float16x4_t)vdup_n_u16(pA[1]));
#if __aarch64__
                _sum00 = vfmaq_f32(_sum00, _pB0, _pA0);
                _sum01 = vfmaq_f32(_sum01, _pB1, _pA0);
                _sum10 = vfmaq_f32(_sum10, _pB0, _pA1);
                _sum11 = vfmaq_f32(_sum11, _pB1, _pA1);
#else
                _sum00 = vmlaq_f32(_sum00, _pB0, _pA0);
                _sum01 = vmlaq_f32(_sum01, _pB1, _pA0);
                _sum10 = vmlaq_f32(_sum10, _pB0, _pA1);
                _sum11 = vmlaq_f32(_sum11, _pB1, _pA1);
#endif
#endif

                pA += 2;
                pB += 8;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum00 = vmulq_f32(_sum00, _alpha);
                _sum01 = vmulq_f32(_sum01, _alpha);
                _sum10 = vmulq_f32(_sum10, _alpha);
                _sum11 = vmulq_f32(_sum11, _alpha);
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum00));
                    vst1_u16(outptr0 + 4, (uint16x4_t)vcvt_f16_f32(_sum01));
                    vst1_u16(outptr0 + out_hstep, (uint16x4_t)vcvt_f16_f32(_sum10));
                    vst1_u16(outptr0 + out_hstep + 4, (uint16x4_t)vcvt_f16_f32(_sum11));
                    outptr0 += 8;
                }
            }
            else
            {
                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum00;
                _tmp01.val[1] = _sum10;
                float32x4x2_t _tmp23;
                _tmp23.val[0] = _sum01;
                _tmp23.val[1] = _sum11;
                vst2q_f32(outptr, _tmp01);
                vst2q_f32(outptr + 8, _tmp23);
            }

            outptr += 16;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = _sum0;
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[1]);
                    }
                    if (broadcast_type_C == 3)
                    {
                        float32x4x2_t _tmp01 = vld2q_f32(pC);
                        _sum0 = _tmp01.val[0];
                        _sum1 = _tmp01.val[1];
                        pC += 8;
                    }
                    if (broadcast_type_C == 4)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = _sum0;
                        pC += 4;
                    }
                }
            }
            else
            {
                float32x4_t _tmp0 = vld1q_f32(outptr);
                float32x4_t _tmp1 = vld1q_f32(outptr + 4);
                float32x4x2_t _tmp01 = vuzpq_f32(_tmp0, _tmp1);
                _sum0 = _tmp01.val[0];
                _sum1 = _tmp01.val[1];
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                float16x4_t _pB = vld1_f16(pB);
                float16x8_t _pBB = vcombine_f16(_pB, _pB);

                float16x4_t _pA0 = vdup_n_f16(pA[0]);
                float16x4_t _pA1 = vdup_n_f16(pA[1]);
                float16x8_t _pA01 = vcombine_f16(_pA0, _pA1);

                _sum0 = vfmlalq_low_f16(_sum0, _pBB, _pA01);
                _sum1 = vfmlalq_high_f16(_sum1, _pBB, _pA01);
#else
                float32x4_t _pB = vcvt_f32_f16((float16x4_t)vld1_u16(pB));

                float32x4_t _pA0 = vcvt_f32_f16((float16x4_t)vdup_n_u16(pA[0]));
                float32x4_t _pA1 = vcvt_f32_f16((float16x4_t)vdup_n_u16(pA[1]));
#if __aarch64__
                _sum0 = vfmaq_f32(_sum0, _pB, _pA0);
                _sum1 = vfmaq_f32(_sum1, _pB, _pA1);
#else
                _sum0 = vmlaq_f32(_sum0, _pB, _pA0);
                _sum1 = vmlaq_f32(_sum1, _pB, _pA1);
#endif
#endif

                pA += 2;
                pB += 4;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum0 = vmulq_f32(_sum0, _alpha);
                _sum1 = vmulq_f32(_sum1, _alpha);
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum0));
                    vst1_u16(outptr0 + out_hstep, (uint16x4_t)vcvt_f16_f32(_sum1));
                    outptr0 += 4;
                }
            }
            else
            {
                float32x4x2_t _tmp01;
                _tmp01.val[0] = _sum0;
                _tmp01.val[1] = _sum1;
                vst2q_f32(outptr, _tmp01);
            }

            outptr += 8;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            float sum00;
            float sum01;
            float sum10;
            float sum11;

            if (k == 0)
            {
                sum00 = 0.f;
                sum01 = 0.f;
                sum10 = 0.f;
                sum11 = 0.f;

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        sum00 = pC[0];
                        sum01 = pC[0];
                        sum10 = pC[0];
                        sum11 = pC[0];
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        sum00 = pC[0];
                        sum01 = pC[1];
                        sum10 = pC[0];
                        sum11 = pC[1];
                    }
                    if (broadcast_type_C == 3)
                    {
                        sum00 = pC[0];
                        sum01 = pC[1];
                        sum10 = pC[2];
                        sum11 = pC[3];
                        pC += 4;
                    }
                    if (broadcast_type_C == 4)
                    {
                        sum00 = pC[0];
                        sum01 = pC[0];
                        sum10 = pC[1];
                        sum11 = pC[1];
                        pC += 2;
                    }
                }
            }
            else
            {
                sum00 = outptr[0];
                sum01 = outptr[1];
                sum10 = outptr[2];
                sum11 = outptr[3];
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                __fp16 pA0 = pA[0];
                __fp16 pA1 = pA[1];
                __fp16 pB0 = pB[0];
                __fp16 pB1 = pB[1];
#else
                float pA0 = float16_to_float32(pA[0]);
                float pA1 = float16_to_float32(pA[1]);
                float pB0 = float16_to_float32(pB[0]);
                float pB1 = float16_to_float32(pB[1]);
#endif

                sum00 += pA0 * pB0;
                sum01 += pA1 * pB0;
                sum10 += pA0 * pB1;
                sum11 += pA1 * pB1;

                pA += 2;
                pB += 2;
            }

            if (alpha != 1.f)
            {
                sum00 *= alpha;
                sum01 *= alpha;
                sum10 *= alpha;
                sum11 *= alpha;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = float32_to_float16(sum00);
                    outptr0[1] = float32_to_float16(sum10);
                    outptr0[out_hstep] = float32_to_float16(sum01);
                    outptr0[out_hstep + 1] = float32_to_float16(sum11);
                    outptr0 += 2;
                }
            }
            else
            {
                outptr[0] = sum00;
                outptr[1] = sum01;
                outptr[2] = sum10;
                outptr[3] = sum11;
            }

            outptr += 4;
        }
        for (; jj < max_jj; jj += 1)
        {
            float sum0;
            float sum1;

            if (k == 0)
            {
                sum0 = 0.f;
                sum1 = 0.f;

                if (pC)
                {
                    if (broadcast_type_C == 0)
                    {
                        sum0 = pC[0];
                        sum1 = pC[0];
                    }
                    if (broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        sum0 = pC[0];
                        sum1 = pC[1];
                    }
                    if (broadcast_type_C == 3)
                    {
                        sum0 = pC[0];
                        sum1 = pC[1];
                        pC += 2;
                    }
                    if (broadcast_type_C == 4)
                    {
                        sum0 = pC[0];
                        sum1 = pC[0];
                        pC += 1;
                    }
                }
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                __fp16 pA0 = pA[0];
                __fp16 pA1 = pA[1];
                __fp16 pB0 = pB[0];
#else
                float pA0 = float16_to_float32(pA[0]);
                float pA1 = float16_to_float32(pA[1]);
                float pB0 = float16_to_float32(pB[0]);
#endif

                sum0 += pA0 * pB0;
                sum1 += pA1 * pB0;
                pA += 2;
                pB += 1;
            }

            if (alpha != 1.f)
            {
                sum0 *= alpha;
                sum1 *= alpha;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = float32_to_float16(sum0);
                    outptr0[out_hstep] = float32_to_float16(sum1);
                    outptr0++;
                }
            }
            else
            {
                outptr[0] = sum0;
                outptr[1] = sum1;
            }

            outptr += 2;
        }

        pAT += max_kk * 2;
    }
    for (; ii < max_ii; ii += 1)
    {
        unsigned short* outptr0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j;

#if __ARM_FEATURE_FP16_FML
        const __fp16* pB = pBT;
#else
        const unsigned short* pB = pBT;
#endif

        if (pC)
        {
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)CT_tile + i + ii;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)CT_tile + j;
            }
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 11 < max_jj; jj += 12)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;
            float32x4_t _sum2;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);
                _sum2 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[0]);
                        _sum2 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        _sum2 = vld1q_f32(pC + 8);
                        pC += 12;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4);
                _sum2 = vld1q_f32(outptr + 8);
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                float16x8_t _pB01 = vld1q_f16(pB);
                float16x4_t _pB2 = vld1_f16(pB + 8);
                float16x8_t _pB22 = vcombine_f16(_pB2, _pB2);

                float16x8_t _pA = vdupq_n_f16(pA[0]);

                _sum0 = vfmlalq_low_f16(_sum0, _pA, _pB01);
                _sum1 = vfmlalq_high_f16(_sum1, _pA, _pB01);
                _sum2 = vfmlalq_low_f16(_sum2, _pA, _pB22);
#else
                uint16x8_t _pB = vld1q_u16(pB);
                float32x4_t _pB0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_pB));
                float32x4_t _pB1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_pB));
                float32x4_t _pB2 = vcvt_f32_f16((float16x4_t)vld1_u16(pB + 8));

                float32x4_t _pA0 = vcvt_f32_f16((float16x4_t)vdup_n_u16(pA[0]));
#if __aarch64__
                _sum0 = vfmaq_f32(_sum0, _pA0, _pB0);
                _sum1 = vfmaq_f32(_sum1, _pA0, _pB1);
                _sum2 = vfmaq_f32(_sum2, _pA0, _pB2);
#else
                _sum0 = vmlaq_f32(_sum0, _pA0, _pB0);
                _sum1 = vmlaq_f32(_sum1, _pA0, _pB1);
                _sum2 = vmlaq_f32(_sum2, _pA0, _pB2);
#endif
#endif

                pA += 1;
                pB += 12;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum0 = vmulq_f32(_sum0, _alpha);
                _sum1 = vmulq_f32(_sum1, _alpha);
                _sum2 = vmulq_f32(_sum2, _alpha);
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum0));
                    vst1_u16(outptr0 + 4, (uint16x4_t)vcvt_f16_f32(_sum1));
                    vst1_u16(outptr0 + 8, (uint16x4_t)vcvt_f16_f32(_sum2));
                    outptr0 += 12;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 8, _sum2);
            }

            outptr += 12;
        }
#endif // __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            float32x4_t _sum0;
            float32x4_t _sum1;

            if (k == 0)
            {
                _sum0 = vdupq_n_f32(0.f);
                _sum1 = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum0 = vdupq_n_f32(pC[0]);
                        _sum1 = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        _sum0 = vld1q_f32(pC);
                        _sum1 = vld1q_f32(pC + 4);
                        pC += 8;
                    }
                }
            }
            else
            {
                _sum0 = vld1q_f32(outptr);
                _sum1 = vld1q_f32(outptr + 4);
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                float16x8_t _pB01 = vld1q_f16(pB);
                float16x8_t _pA = vdupq_n_f16(pA[0]);

                _sum0 = vfmlalq_low_f16(_sum0, _pA, _pB01);
                _sum1 = vfmlalq_high_f16(_sum1, _pA, _pB01);
#else
                uint16x8_t _pB = vld1q_u16(pB);
                float32x4_t _pB0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_pB));
                float32x4_t _pB1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_pB));

                float32x4_t _pA0 = vcvt_f32_f16((float16x4_t)vdup_n_u16(pA[0]));
#if __aarch64__
                _sum0 = vfmaq_f32(_sum0, _pA0, _pB0);
                _sum1 = vfmaq_f32(_sum1, _pA0, _pB1);
#else
                _sum0 = vmlaq_f32(_sum0, _pA0, _pB0);
                _sum1 = vmlaq_f32(_sum1, _pA0, _pB1);
#endif
#endif

                pA += 1;
                pB += 8;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum0 = vmulq_f32(_sum0, _alpha);
                _sum1 = vmulq_f32(_sum1, _alpha);
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum0));
                    vst1_u16(outptr0 + 4, (uint16x4_t)vcvt_f16_f32(_sum1));
                    outptr0 += 8;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
            }

            outptr += 8;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _sum;

            if (k == 0)
            {
                _sum = vdupq_n_f32(0.f);

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        _sum = vdupq_n_f32(pC[0]);
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        _sum = vld1q_f32(pC);
                        pC += 4;
                    }
                }
            }
            else
            {
                _sum = vld1q_f32(outptr);
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                float16x4_t _pB = vld1_f16(pB);
                float16x8_t _pBB = vcombine_f16(_pB, _pB);
                float16x8_t _pA = vdupq_n_f16(pA[0]);

                _sum = vfmlalq_low_f16(_sum, _pA, _pBB);
#else
                float32x4_t _pB = vcvt_f32_f16((float16x4_t)vld1_u16(pB));
                float32x4_t _pA = vcvt_f32_f16((float16x4_t)vdup_n_u16(pA[0]));
#if __aarch64__
                _sum = vfmaq_f32(_sum, _pA, _pB);
#else
                _sum = vmlaq_f32(_sum, _pA, _pB);
#endif
#endif

                pA += 1;
                pB += 4;
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _sum = vmulq_f32(_sum, _alpha);
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    vst1_u16(outptr0, (uint16x4_t)vcvt_f16_f32(_sum));
                    outptr0 += 4;
                }
            }
            else
            {
                vst1q_f32(outptr, _sum);
            }

            outptr += 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            float sum0;
            float sum1;

            if (k == 0)
            {
                sum0 = 0.f;
                sum1 = 0.f;

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        sum0 = pC[0];
                        sum1 = pC[0];
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        sum0 = pC[0];
                        sum1 = pC[1];
                        pC += 2;
                    }
                }
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                __fp16 pA0 = pA[0];
                __fp16 pB0 = pB[0];
                __fp16 pB1 = pB[1];
#else
                float pA0 = float16_to_float32(pA[0]);
                float pB0 = float16_to_float32(pB[0]);
                float pB1 = float16_to_float32(pB[1]);
#endif

                sum0 += pA0 * pB0;
                sum1 += pA0 * pB1;

                pA += 1;
                pB += 2;
            }

            if (alpha != 1.f)
            {
                sum0 *= alpha;
                sum1 *= alpha;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = float32_to_float16(sum0);
                    outptr0[1] = float32_to_float16(sum1);
                    outptr0 += 2;
                }
            }
            else
            {
                outptr[0] = sum0;
                outptr[1] = sum1;
            }

            outptr += 2;
        }
        for (; jj < max_jj; jj += 1)
        {
            float sum;

            if (k == 0)
            {
                sum = 0.f;

                if (pC)
                {
                    if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                    {
                        sum = pC[0];
                    }
                    if (broadcast_type_C == 3 || broadcast_type_C == 4)
                    {
                        sum = pC[0];
                        pC += 1;
                    }
                }
            }
            else
            {
                sum = outptr[0];
            }

#if __ARM_FEATURE_FP16_FML
            const __fp16* pA = pAT;
#else
            const unsigned short* pA = pAT;
#endif
            int kk = 0;
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_FP16_FML
                __fp16 pA0 = pA[0];
                __fp16 pB0 = pB[0];
#else
                float pA0 = float16_to_float32(pA[0]);
                float pB0 = float16_to_float32(pB[0]);
#endif

                sum += pA0 * pB0;
                pA += 1;
                pB += 1;
            }

            if (alpha != 1.f)
            {
                sum *= alpha;
            }

            if (k_end)
            {
                // if (out_elempack == 1)
                {
                    outptr0[0] = float32_to_float16(sum);
                    outptr0++;
                }
            }
            else
            {
                outptr[0] = sum;
            }

            outptr += 1;
        }

        pAT += max_kk;
    }
}


================================================
FILE: src/layer/arm/gemm_int8.h
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Prototypes for the "_i8mm" variants of the int8 GEMM tile helpers.
//
// They are declared only when all of the following hold:
//   - NCNN_RUNTIME_CPU and NCNN_ARM84I8MM are enabled,
//   - the target is aarch64,
//   - this translation unit is NOT itself built with __ARM_FEATURE_MATMUL_INT8.
//
// The generic entry points in this header forward to these variants at run
// time; e.g. pack_A_tile_int8() calls pack_A_tile_int8_i8mm() and returns
// when ncnn::cpu_support_arm_i8mm() is true.
//
// NOTE(review): the definitions are not part of this header section; they are
// presumably provided by a separately compiled source built with the int8
// matrix-multiply extension enabled - confirm against the build files.
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
void pack_A_tile_int8_i8mm(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk);
void transpose_pack_A_tile_int8_i8mm(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk);
void pack_B_tile_int8_i8mm(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk);
void transpose_pack_B_tile_int8_i8mm(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk);
void pack_A_tile_fp32_to_int8_i8mm(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales);
void transpose_pack_A_tile_fp32_to_int8_i8mm(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales);
void pack_B_tile_fp32_to_int8_i8mm(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale);
void transpose_pack_B_tile_fp32_to_int8_i8mm(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale);
void gemm_transB_packed_tile_int8_i8mm(const Mat& AT_tile, const Mat& BT_tile, Mat& topT_tile, int i, int max_ii, int j, int max_jj, int k, int max_kk);
#endif

// Prototypes for the "_asimddp" variants of the int8 GEMM tile helpers.
//
// They are declared only when all of the following hold:
//   - NCNN_RUNTIME_CPU and NCNN_ARM82DOT are enabled,
//   - the target is aarch64,
//   - this translation unit is built with neither __ARM_FEATURE_DOTPROD nor
//     __ARM_FEATURE_MATMUL_INT8.
//
// The generic entry points in this header forward to these variants at run
// time; e.g. pack_A_tile_int8() calls pack_A_tile_int8_asimddp() and returns
// when ncnn::cpu_support_arm_asimddp() is true (checked after the i8mm path).
//
// Unlike the _i8mm group, this group also declares the two
// (transpose_)unpack_output_tile_int32_to_fp32 helpers.
//
// NOTE(review): the definitions are not part of this header section; they are
// presumably provided by a separately compiled source built with the dot
// product extension enabled - confirm against the build files.
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
void pack_A_tile_int8_asimddp(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk);
void transpose_pack_A_tile_int8_asimddp(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk);
void pack_B_tile_int8_asimddp(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk);
void transpose_pack_B_tile_int8_asimddp(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk);
void pack_A_tile_fp32_to_int8_asimddp(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales);
void transpose_pack_A_tile_fp32_to_int8_asimddp(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales);
void pack_B_tile_fp32_to_int8_asimddp(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale);
void transpose_pack_B_tile_fp32_to_int8_asimddp(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale);
void unpack_output_tile_int32_to_fp32_asimddp(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta);
void transpose_unpack_output_tile_int32_to_fp32_asimddp(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta);
void gemm_transB_packed_tile_int8_asimddp(const Mat& AT_tile, const Mat& BT_tile, Mat& topT_tile, int i, int max_ii, int j, int max_jj, int k, int max_kk);
#endif

static void pack_A_tile_int8(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        pack_A_tile_int8_i8mm(A, AT, i, max_ii, k, max_kk);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        pack_A_tile_int8_asimddp(A, AT, i, max_ii, k, max_kk);
        return;
    }
#endif

    // NCNN_LOGE("pack_A_tile_int8");
    // assert A.elempack == 1
    // assert A.dims == 2

    signed char* pp = AT;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        const signed char* p0 = A.row<const signed char>(i + ii) + k;
        const signed char* p1 = A.row<const signed char>(i + ii + 1) + k;
        const signed char* p2 = A.row<const signed char>(i + ii + 2) + k;
        const signed char* p3 = A.row<const signed char>(i + ii + 3) + k;
        const signed char* p4 = A.row<const signed char>(i + ii + 4) + k;
        const signed char* p5 = A.row<const signed char>(i + ii + 5) + k;
        const signed char* p6 = A.row<const signed char>(i + ii + 6) + k;
        const signed char* p7 = A.row<const signed char>(i + ii + 7) + k;

        int kk = 0;
        for (; kk + 15 < max_kk; kk += 16)
        {
            int8x16_t _p0 = vld1q_s8(p0);
            int8x16_t _p1 = vld1q_s8(p1);
            int8x16_t _p2 = vld1q_s8(p2);
            int8x16_t _p3 = vld1q_s8(p3);
            int8x16_t _p4 = vld1q_s8(p4);
            int8x16_t _p5 = vld1q_s8(p5);
            int8x16_t _p6 = vld1q_s8(p6);
            int8x16_t _p7 = vld1q_s8(p7);
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            int8x16_t _r0 = vcombine_s8(vget_low_s8(_p0), vget_low_s8(_p1));
            int8x16_t _r1 = vcombine_s8(vget_low_s8(_p2), vget_low_s8(_p3));
            int8x16_t _r2 = vcombine_s8(vget_low_s8(_p4), vget_low_s8(_p5));
            int8x16_t _r3 = vcombine_s8(vget_low_s8(_p6), vget_low_s8(_p7));
            int8x16_t _r4 = vcombine_s8(vget_high_s8(_p0), vget_high_s8(_p1));
            int8x16_t _r5 = vcombine_s8(vget_high_s8(_p2), vget_high_s8(_p3));
            int8x16_t _r6 = vcombine_s8(vget_high_s8(_p4), vget_high_s8(_p5));
            int8x16_t _r7 = vcombine_s8(vget_high_s8(_p6), vget_high_s8(_p7));
#else  // __ARM_FEATURE_MATMUL_INT8
            int32x4x2_t _p01 = vzipq_s32(vreinterpretq_s32_s8(_p0), vreinterpretq_s32_s8(_p1));
            int32x4x2_t _p23 = vzipq_s32(vreinterpretq_s32_s8(_p2), vreinterpretq_s32_s8(_p3));
            int32x4x2_t _p45 = vzipq_s32(vreinterpretq_s32_s8(_p4), vreinterpretq_s32_s8(_p5));
            int32x4x2_t _p67 = vzipq_s32(vreinterpretq_s32_s8(_p6), vreinterpretq_s32_s8(_p7));
            int8x16_t _r0 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_p01.val[0]), vget_low_s32(_p23.val[0])));
            int8x16_t _r1 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_p45.val[0]), vget_low_s32(_p67.val[0])));
            int8x16_t _r2 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_p01.val[0]), vget_high_s32(_p23.val[0])));
            int8x16_t _r3 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_p45.val[0]), vget_high_s32(_p67.val[0])));
            int8x16_t _r4 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_p01.val[1]), vget_low_s32(_p23.val[1])));
            int8x16_t _r5 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_p45.val[1]), vget_low_s32(_p67.val[1])));
            int8x16_t _r6 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_p01.val[1]), vget_high_s32(_p23.val[1])));
            int8x16_t _r7 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_p45.val[1]), vget_high_s32(_p67.val[1])));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
            int16x8x2_t _p01 = vzipq_s16(vreinterpretq_s16_s8(_p0), vreinterpretq_s16_s8(_p1));
            int16x8x2_t _p23 = vzipq_s16(vreinterpretq_s16_s8(_p2), vreinterpretq_s16_s8(_p3));
            int16x8x2_t _p45 = vzipq_s16(vreinterpretq_s16_s8(_p4), vreinterpretq_s16_s8(_p5));
            int16x8x2_t _p67 = vzipq_s16(vreinterpretq_s16_s8(_p6), vreinterpretq_s16_s8(_p7));
            int32x4x2_t _t0 = vzipq_s32(vreinterpretq_s32_s16(_p01.val[0]), vreinterpretq_s32_s16(_p23.val[0]));
            int32x4x2_t _t1 = vzipq_s32(vreinterpretq_s32_s16(_p01.val[1]), vreinterpretq_s32_s16(_p23.val[1]));
            int32x4x2_t _t2 = vzipq_s32(vreinterpretq_s32_s16(_p45.val[0]), vreinterpretq_s32_s16(_p67.val[0]));
            int32x4x2_t _t3 = vzipq_s32(vreinterpretq_s32_s16(_p45.val[1]), vreinterpretq_s32_s16(_p67.val[1]));
            int8x16_t _r0 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t2.val[0])));
            int8x16_t _r1 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t2.val[0])));
            int8x16_t _r2 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t0.val[1]), vget_low_s32(_t2.val[1])));
            int8x16_t _r3 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t0.val[1]), vget_high_s32(_t2.val[1])));
            int8x16_t _r4 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t1.val[0]), vget_low_s32(_t3.val[0])));
            int8x16_t _r5 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t1.val[0]), vget_high_s32(_t3.val[0])));
            int8x16_t _r6 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t3.val[1])));
            int8x16_t _r7 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t3.val[1])));
#endif // __ARM_FEATURE_DOTPROD
            vst1q_s8(pp, _r0);
            vst1q_s8(pp + 16, _r1);
            vst1q_s8(pp + 32, _r2);
            vst1q_s8(pp + 48, _r3);
            vst1q_s8(pp + 64, _r4);
            vst1q_s8(pp + 80, _r5);
            vst1q_s8(pp + 96, _r6);
            vst1q_s8(pp + 112, _r7);
            pp += 128;
            p0 += 16;
            p1 += 16;
            p2 += 16;
            p3 += 16;
            p4 += 16;
            p5 += 16;
            p6 += 16;
            p7 += 16;
        }
        for (; kk + 7 < max_kk; kk += 8)
        {
            int8x8_t _p0 = vld1_s8(p0);
            int8x8_t _p1 = vld1_s8(p1);
            int8x8_t _p2 = vld1_s8(p2);
            int8x8_t _p3 = vld1_s8(p3);
            int8x8_t _p4 = vld1_s8(p4);
            int8x8_t _p5 = vld1_s8(p5);
            int8x8_t _p6 = vld1_s8(p6);
            int8x8_t _p7 = vld1_s8(p7);
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            int8x16_t _r0 = vcombine_s8(_p0, _p1);
            int8x16_t _r1 = vcombine_s8(_p2, _p3);
            int8x16_t _r2 = vcombine_s8(_p4, _p5);
            int8x16_t _r3 = vcombine_s8(_p6, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
            int32x2x2_t _p01 = vzip_s32(vreinterpret_s32_s8(_p0), vreinterpret_s32_s8(_p1));
            int32x2x2_t _p23 = vzip_s32(vreinterpret_s32_s8(_p2), vreinterpret_s32_s8(_p3));
            int32x2x2_t _p45 = vzip_s32(vreinterpret_s32_s8(_p4), vreinterpret_s32_s8(_p5));
            int32x2x2_t _p67 = vzip_s32(vreinterpret_s32_s8(_p6), vreinterpret_s32_s8(_p7));
            int8x16_t _r0 = vreinterpretq_s8_s32(vcombine_s32(_p01.val[0], _p23.val[0]));
            int8x16_t _r1 = vreinterpretq_s8_s32(vcombine_s32(_p45.val[0], _p67.val[0]));
            int8x16_t _r2 = vreinterpretq_s8_s32(vcombine_s32(_p01.val[1], _p23.val[1]));
            int8x16_t _r3 = vreinterpretq_s8_s32(vcombine_s32(_p45.val[1], _p67.val[1]));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
            int16x8_t _p04 = vreinterpretq_s16_s8(vcombine_s8(_p0, _p4));
            int16x8_t _p15 = vreinterpretq_s16_s8(vcombine_s8(_p1, _p5));
            int16x8_t _p26 = vreinterpretq_s16_s8(vcombine_s8(_p2, _p6));
            int16x8_t _p37 = vreinterpretq_s16_s8(vcombine_s8(_p3, _p7));
            int16x8x2_t _t0 = vzipq_s16(_p04, _p15);
            int16x8x2_t _t1 = vzipq_s16(_p26, _p37);
            int32x4x2_t _t2 = vzipq_s32(vreinterpretq_s32_s16(_t0.val[0]), vreinterpretq_s32_s16(_t1.val[0]));
            int32x4x2_t _t3 = vzipq_s32(vreinterpretq_s32_s16(_t0.val[1]), vreinterpretq_s32_s16(_t1.val[1]));
            int8x16_t _r0 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0])));
            int8x16_t _r1 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0])));
            int8x16_t _r2 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t2.val[1]), vget_low_s32(_t3.val[1])));
            int8x16_t _r3 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t2.val[1]), vget_high_s32(_t3.val[1])));
#endif // __ARM_FEATURE_DOTPROD
            vst1q_s8(pp, _r0);
            vst1q_s8(pp + 16, _r1);
            vst1q_s8(pp + 32, _r2);
            vst1q_s8(pp + 48, _r3);
            pp += 64;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
            p4 += 8;
            p5 += 8;
            p6 += 8;
            p7 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
#if __ARM_FEATURE_DOTPROD
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p0[2];
            pp[3] = p0[3];
            pp[4] = p1[0];
            pp[5] = p1[1];
            pp[6] = p1[2];
            pp[7] = p1[3];
            pp[8] = p2[0];
            pp[9] = p2[1];
            pp[10] = p2[2];
            pp[11] = p2[3];
            pp[12] = p3[0];
            pp[13] = p3[1];
            pp[14] = p3[2];
            pp[15] = p3[3];
            pp[16] = p4[0];
            pp[17] = p4[1];
            pp[18] = p4[2];
            pp[19] = p4[3];
            pp[20] = p5[0];
            pp[21] = p5[1];
            pp[22] = p5[2];
            pp[23] = p5[3];
            pp[24] = p6[0];
            pp[25] = p6[1];
            pp[26] = p6[2];
            pp[27] = p6[3];
            pp[28] = p7[0];
            pp[29] = p7[1];
            pp[30] = p7[2];
            pp[31] = p7[3];
#else  // __ARM_FEATURE_DOTPROD
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p1[0];
            pp[3] = p1[1];
            pp[4] = p2[0];
            pp[5] = p2[1];
            pp[6] = p3[0];
            pp[7] = p3[1];
            pp[8] = p4[0];
            pp[9] = p4[1];
            pp[10] = p5[0];
            pp[11] = p5[1];
            pp[12] = p6[0];
            pp[13] = p6[1];
            pp[14] = p7[0];
            pp[15] = p7[1];
            pp[16] = p0[2];
            pp[17] = p0[3];
            pp[18] = p1[2];
            pp[19] = p1[3];
            pp[20] = p2[2];
            pp[21] = p2[3];
            pp[22] = p3[2];
            pp[23] = p3[3];
            pp[24] = p4[2];
            pp[25] = p4[3];
            pp[26] = p5[2];
            pp[27] = p5[3];
            pp[28] = p6[2];
            pp[29] = p6[3];
            pp[30] = p7[2];
            pp[31] = p7[3];
#endif // __ARM_FEATURE_DOTPROD
            pp += 32;
            p0 += 4;
            p1 += 4;
            p2 += 4;
            p3 += 4;
            p4 += 4;
            p5 += 4;
            p6 += 4;
            p7 += 4;
        }
        for (; kk + 1 < max_kk; kk += 2)
        {
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p1[0];
            pp[3] = p1[1];
            pp[4] = p2[0];
            pp[5] = p2[1];
            pp[6] = p3[0];
            pp[7] = p3[1];
            pp[8] = p4[0];
            pp[9] = p4[1];
            pp[10] = p5[0];
            pp[11] = p5[1];
            pp[12] = p6[0];
            pp[13] = p6[1];
            pp[14] = p7[0];
            pp[15] = p7[1];
            pp += 16;
            p0 += 2;
            p1 += 2;
            p2 += 2;
            p3 += 2;
            p4 += 2;
            p5 += 2;
            p6 += 2;
            p7 += 2;
        }
        for (; kk < max_kk; kk++)
        {
            pp[0] = p0[0];
            pp[1] = p1[0];
            pp[2] = p2[0];
            pp[3] = p3[0];
            pp[4] = p4[0];
            pp[5] = p5[0];
            pp[6] = p6[0];
            pp[7] = p7[0];
            pp += 8;
            p0++;
            p1++;
            p2++;
            p3++;
            p4++;
            p5++;
            p6++;
            p7++;
        }
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        const signed char* p0 = A.row<const signed char>(i + ii) + k;
        const signed char* p1 = A.row<const signed char>(i + ii + 1) + k;
        const signed char* p2 = A.row<const signed char>(i + ii + 2) + k;
        const signed char* p3 = A.row<const signed char>(i + ii + 3) + k;

        int kk = 0;
        for (; kk + 15 < max_kk; kk += 16)
        {
            int8x16_t _p0 = vld1q_s8(p0);
            int8x16_t _p1 = vld1q_s8(p1);
            int8x16_t _p2 = vld1q_s8(p2);
            int8x16_t _p3 = vld1q_s8(p3);
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            int64x2x4_t _r0123;
            _r0123.val[0] = vreinterpretq_s64_s8(_p0);
            _r0123.val[1] = vreinterpretq_s64_s8(_p1);
            _r0123.val[2] = vreinterpretq_s64_s8(_p2);
            _r0123.val[3] = vreinterpretq_s64_s8(_p3);
            vst4q_s64((int64_t*)pp, _r0123);
#else  // __ARM_FEATURE_MATMUL_INT8
            int32x4x4_t _r0123;
            _r0123.val[0] = vreinterpretq_s32_s8(_p0);
            _r0123.val[1] = vreinterpretq_s32_s8(_p1);
            _r0123.val[2] = vreinterpretq_s32_s8(_p2);
            _r0123.val[3] = vreinterpretq_s32_s8(_p3);
            vst4q_s32((int*)pp, _r0123);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
            int16x8x4_t _r0123;
            _r0123.val[0] = vreinterpretq_s16_s8(_p0);
            _r0123.val[1] = vreinterpretq_s16_s8(_p1);
            _r0123.val[2] = vreinterpretq_s16_s8(_p2);
            _r0123.val[3] = vreinterpretq_s16_s8(_p3);
            vst4q_s16((short*)pp, _r0123);
#endif // __ARM_FEATURE_DOTPROD
            pp += 64;
            p0 += 16;
            p1 += 16;
            p2 += 16;
            p3 += 16;
        }
        for (; kk + 7 < max_kk; kk += 8)
        {
            int8x8_t _p0 = vld1_s8(p0);
            int8x8_t _p1 = vld1_s8(p1);
            int8x8_t _p2 = vld1_s8(p2);
            int8x8_t _p3 = vld1_s8(p3);
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            vst1q_s8(pp, vcombine_s8(_p0, _p1));
            vst1q_s8(pp + 16, vcombine_s8(_p2, _p3));
#else  // __ARM_FEATURE_MATMUL_INT8
            int32x2x4_t _r0123;
            _r0123.val[0] = vreinterpret_s32_s8(_p0);
            _r0123.val[1] = vreinterpret_s32_s8(_p1);
            _r0123.val[2] = vreinterpret_s32_s8(_p2);
            _r0123.val[3] = vreinterpret_s32_s8(_p3);
            vst4_s32((int*)pp, _r0123);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
            int16x4x4_t _r0123;
            _r0123.val[0] = vreinterpret_s16_s8(_p0);
            _r0123.val[1] = vreinterpret_s16_s8(_p1);
            _r0123.val[2] = vreinterpret_s16_s8(_p2);
            _r0123.val[3] = vreinterpret_s16_s8(_p3);
            vst4_s16((short*)pp, _r0123);
#endif // __ARM_FEATURE_DOTPROD
            pp += 32;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
#if __ARM_FEATURE_DOTPROD
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p0[2];
            pp[3] = p0[3];
            pp[4] = p1[0];
            pp[5] = p1[1];
            pp[6] = p1[2];
            pp[7] = p1[3];
            pp[8] = p2[0];
            pp[9] = p2[1];
            pp[10] = p2[2];
            pp[11] = p2[3];
            pp[12] = p3[0];
            pp[13] = p3[1];
            pp[14] = p3[2];
            pp[15] = p3[3];
#else  // __ARM_FEATURE_DOTPROD
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p1[0];
            pp[3] = p1[1];
            pp[4] = p2[0];
            pp[5] = p2[1];
            pp[6] = p3[0];
            pp[7] = p3[1];
            pp[8] = p0[2];
            pp[9] = p0[3];
            pp[10] = p1[2];
            pp[11] = p1[3];
            pp[12] = p2[2];
            pp[13] = p2[3];
            pp[14] = p3[2];
            pp[15] = p3[3];
#endif // __ARM_FEATURE_DOTPROD
            pp += 16;
            p0 += 4;
            p1 += 4;
            p2 += 4;
            p3 += 4;
        }
        for (; kk + 1 < max_kk; kk += 2)
        {
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p1[0];
            pp[3] = p1[1];
            pp[4] = p2[0];
            pp[5] = p2[1];
            pp[6] = p3[0];
            pp[7] = p3[1];
            pp += 8;
            p0 += 2;
            p1 += 2;
            p2 += 2;
            p3 += 2;
        }
        for (; kk < max_kk; kk++)
        {
            pp[0] = p0[0];
            pp[1] = p1[0];
            pp[2] = p2[0];
            pp[3] = p3[0];
            pp += 4;
            p0++;
            p1++;
            p2++;
            p3++;
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        const signed char* p0 = A.row<const signed char>(i + ii) + k;
        const signed char* p1 = A.row<const signed char>(i + ii + 1) + k;

        int kk = 0;
#if __ARM_NEON
        for (; kk + 15 < max_kk; kk += 16)
        {
            int8x16_t _p0 = vld1q_s8(p0);
            int8x16_t _p1 = vld1q_s8(p1);
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            int64x2x2_t _r01;
            _r01.val[0] = vreinterpretq_s64_s8(_p0);
            _r01.val[1] = vreinterpretq_s64_s8(_p1);
            vst2q_s64((int64_t*)pp, _r01);
#else  // __ARM_FEATURE_MATMUL_INT8
            int32x4x2_t _r01;
            _r01.val[0] = vreinterpretq_s32_s8(_p0);
            _r01.val[1] = vreinterpretq_s32_s8(_p1);
            vst2q_s32((int*)pp, _r01);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
            int16x8x2_t _r01;
            _r01.val[0] = vreinterpretq_s16_s8(_p0);
            _r01.val[1] = vreinterpretq_s16_s8(_p1);
            vst2q_s16((short*)pp, _r01);
#endif // __ARM_FEATURE_DOTPROD
            pp += 32;
            p0 += 16;
            p1 += 16;
        }
        for (; kk + 7 < max_kk; kk += 8)
        {
            int8x8_t _p0 = vld1_s8(p0);
            int8x8_t _p1 = vld1_s8(p1);
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            vst1q_s8(pp, vcombine_s8(_p0, _p1));
#else  // __ARM_FEATURE_MATMUL_INT8
            int32x2x2_t _r01;
            _r01.val[0] = vreinterpret_s32_s8(_p0);
            _r01.val[1] = vreinterpret_s32_s8(_p1);
            vst2_s32((int*)pp, _r01);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
            int16x4x2_t _r01;
            _r01.val[0] = vreinterpret_s16_s8(_p0);
            _r01.val[1] = vreinterpret_s16_s8(_p1);
            vst2_s16((short*)pp, _r01);
#endif // __ARM_FEATURE_DOTPROD
            pp += 16;
            p0 += 8;
            p1 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
#if __ARM_FEATURE_DOTPROD
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p0[2];
            pp[3] = p0[3];
            pp[4] = p1[0];
            pp[5] = p1[1];
            pp[6] = p1[2];
            pp[7] = p1[3];
#else  // __ARM_FEATURE_DOTPROD
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p1[0];
            pp[3] = p1[1];
            pp[4] = p0[2];
            pp[5] = p0[3];
            pp[6] = p1[2];
            pp[7] = p1[3];
#endif // __ARM_FEATURE_DOTPROD
            pp += 8;
            p0 += 4;
            p1 += 4;
        }
        for (; kk + 1 < max_kk; kk += 2)
        {
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p1[0];
            pp[3] = p1[1];
            pp += 4;
            p0 += 2;
            p1 += 2;
        }
#endif // __ARM_NEON
        for (; kk < max_kk; kk++)
        {
            pp[0] = p0[0];
            pp[1] = p1[0];
            pp += 2;
            p0++;
            p1++;
        }
    }
    for (; ii < max_ii; ii += 1)
    {
        const signed char* p0 = A.row<const signed char>(i + ii) + k;

        int kk = 0;
#if __ARM_NEON
        for (; kk + 15 < max_kk; kk += 16)
        {
            vst1q_s8(pp, vld1q_s8(p0));
            pp += 16;
            p0 += 16;
        }
        for (; kk + 7 < max_kk; kk += 8)
        {
            vst1_s8(pp, vld1_s8(p0));
            pp += 8;
            p0 += 8;
        }
#endif // __ARM_NEON
        for (; kk < max_kk; kk++)
        {
            pp[0] = p0[0];
            pp += 1;
            p0++;
        }
    }
}

// Pack a tile of A into AT while transposing it.
//
// The tile covers source elements (k + kk, i + ii) for kk in [0, max_kk) and
// ii in [0, max_ii), i.e. max_kk rows starting at row k and max_ii columns
// starting at column i. Output is written sequentially from the start of AT.
//
// Output layout: columns are handled in groups of 8 and 4 (NEON builds only),
// then 2, then 1. Inside a column group the rows are consumed in k-blocks of
//   8  when __ARM_FEATURE_DOTPROD and __ARM_FEATURE_MATMUL_INT8 are both defined,
//   4  when __ARM_FEATURE_DOTPROD is defined,
//   2  on NEON builds,
//   1  always (tail),
// and each k-block emits, column by column, that column's values for the
// rows of the k-block as one contiguous run.
// NOTE(review): presumably these run lengths match the operand layout the
// i8mm / dot-product / plain NEON gemm kernels expect - confirm against the
// consumer before changing any store order here.
//
// A      source matrix, read-only
// AT     destination buffer for the packed tile
// i      first source column of the tile
// max_ii number of source columns in the tile
// k      first source row of the tile
// max_kk number of source rows in the tile
static void transpose_pack_A_tile_int8(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk)
{
    // Built without i8mm but with runtime dispatch enabled: defer to the i8mm
    // build of this routine when the running cpu reports support for it.
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        transpose_pack_A_tile_int8_i8mm(A, AT, i, max_ii, k, max_kk);
        return;
    }
#endif

    // Same idea for the dot-product build, only tried when neither dotprod nor
    // i8mm is enabled at compile time.
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        transpose_pack_A_tile_int8_asimddp(A, AT, i, max_ii, k, max_kk);
        return;
    }
#endif

    // NCNN_LOGE("transpose_pack_A_tile_int8");
    // assert A.elempack == 1
    // assert A.dims == 2

    // Distance, in elements, used below to step p0 from one source row to the
    // next (assumes the 2-D, elempack 1 layout stated in the asserts above).
    const int A_hstep = A.w;

    // Write cursor into the packed output; only ever advances.
    signed char* pp = AT;

    int ii = 0;
#if __ARM_NEON
    // ---- 8 columns at a time ----
    for (; ii + 7 < max_ii; ii += 8)
    {
        // p0 -> element (k, i + ii); each vld1_s8 below fetches the 8 columns of one row.
        const signed char* p0 = A.row<const signed char>(k) + (i + ii);

        int kk = 0;
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
        // 8 rows x 8 columns -> 64 bytes, 8 consecutive k values per column.
        for (; kk + 7 < max_kk; kk += 8)
        {
            int8x8_t _r0 = vld1_s8(p0);
            int8x8_t _r1 = vld1_s8(p0 + A_hstep);
            int8x8_t _r2 = vld1_s8(p0 + A_hstep * 2);
            int8x8_t _r3 = vld1_s8(p0 + A_hstep * 3);
            int8x8_t _r4 = vld1_s8(p0 + A_hstep * 4);
            int8x8_t _r5 = vld1_s8(p0 + A_hstep * 5);
            int8x8_t _r6 = vld1_s8(p0 + A_hstep * 6);
            int8x8_t _r7 = vld1_s8(p0 + A_hstep * 7);
            // transpose8x8
            // Zipping row n with row n+4 pairs up their values per column:
            // val[0] holds columns 0-3, val[1] holds columns 4-7.
            int8x8x2_t _r04 = vzip_s8(_r0, _r4);
            int8x8x2_t _r15 = vzip_s8(_r1, _r5);
            int8x8x2_t _r26 = vzip_s8(_r2, _r6);
            int8x8x2_t _r37 = vzip_s8(_r3, _r7);
            int8x8x4_t _r0123;
            _r0123.val[0] = _r04.val[0];
            _r0123.val[1] = _r15.val[0];
            _r0123.val[2] = _r26.val[0];
            _r0123.val[3] = _r37.val[0];
            int8x8x4_t _r4567;
            _r4567.val[0] = _r04.val[1];
            _r4567.val[1] = _r15.val[1];
            _r4567.val[2] = _r26.val[1];
            _r4567.val[3] = _r37.val[1];
            // The 4-way interleaving store finishes the transpose: each output
            // group of 8 bytes is rows 0..7 of a single column.
            vst4_s8(pp, _r0123);      // columns 0-3
            vst4_s8(pp + 32, _r4567); // columns 4-7
            pp += 64;
            p0 += A_hstep * 8;
        }
#endif // __ARM_FEATURE_MATMUL_INT8
        // 4 rows x 8 columns -> 32 bytes; vst4 interleaves the four rows so
        // every column contributes 4 consecutive k values.
        for (; kk + 3 < max_kk; kk += 4)
        {
            int8x8x4_t _r0123;
            _r0123.val[0] = vld1_s8(p0);
            _r0123.val[1] = vld1_s8(p0 + A_hstep);
            _r0123.val[2] = vld1_s8(p0 + A_hstep * 2);
            _r0123.val[3] = vld1_s8(p0 + A_hstep * 3);
            vst4_s8(pp, _r0123);
            pp += 32;
            p0 += A_hstep * 4;
        }
#endif // __ARM_FEATURE_DOTPROD
        // 2 rows x 8 columns -> 16 bytes; vst2 interleaves the two rows so
        // every column contributes a (k, k+1) pair.
        for (; kk + 1 < max_kk; kk += 2)
        {
            int8x8x2_t _r01;
            _r01.val[0] = vld1_s8(p0);
            _r01.val[1] = vld1_s8(p0 + A_hstep);
            vst2_s8(pp, _r01);
            pp += 16;
            p0 += A_hstep * 2;
        }
        // Leftover single row: copied as-is, one value per column.
        for (; kk < max_kk; kk++)
        {
            vst1_s8(pp, vld1_s8(p0));
            pp += 8;
            p0 += A_hstep;
        }
    }
    // ---- 4 columns at a time (scalar gathers, same per-column k-run layout) ----
    for (; ii + 3 < max_ii; ii += 4)
    {
        const signed char* p0 = A.row<const signed char>(k) + (i + ii);

        int kk = 0;
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
        // 8 rows x 4 columns -> 32 bytes: pp[8c .. 8c+7] = rows 0..7 of column c.
        for (; kk + 7 < max_kk; kk += 8)
        {
            pp[0] = p0[0];
            pp[1] = p0[A_hstep];
            pp[2] = p0[A_hstep * 2];
            pp[3] = p0[A_hstep * 3];
            pp[4] = p0[A_hstep * 4];
            pp[5] = p0[A_hstep * 5];
            pp[6] = p0[A_hstep * 6];
            pp[7] = p0[A_hstep * 7];
            pp[8] = p0[1];
            pp[9] = p0[A_hstep + 1];
            pp[10] = p0[A_hstep * 2 + 1];
            pp[11] = p0[A_hstep * 3 + 1];
            pp[12] = p0[A_hstep * 4 + 1];
            pp[13] = p0[A_hstep * 5 + 1];
            pp[14] = p0[A_hstep * 6 + 1];
            pp[15] = p0[A_hstep * 7 + 1];
            pp[16] = p0[2];
            pp[17] = p0[A_hstep + 2];
            pp[18] = p0[A_hstep * 2 + 2];
            pp[19] = p0[A_hstep * 3 + 2];
            pp[20] = p0[A_hstep * 4 + 2];
            pp[21] = p0[A_hstep * 5 + 2];
            pp[22] = p0[A_hstep * 6 + 2];
            pp[23] = p0[A_hstep * 7 + 2];
            pp[24] = p0[3];
            pp[25] = p0[A_hstep + 3];
            pp[26] = p0[A_hstep * 2 + 3];
            pp[27] = p0[A_hstep * 3 + 3];
            pp[28] = p0[A_hstep * 4 + 3];
            pp[29] = p0[A_hstep * 5 + 3];
            pp[30] = p0[A_hstep * 6 + 3];
            pp[31] = p0[A_hstep * 7 + 3];
            pp += 32;
            p0 += A_hstep * 8;
        }
#endif // __ARM_FEATURE_MATMUL_INT8
        // 4 rows x 4 columns -> 16 bytes: pp[4c .. 4c+3] = rows 0..3 of column c.
        for (; kk + 3 < max_kk; kk += 4)
        {
            pp[0] = p0[0];
            pp[1] = p0[A_hstep];
            pp[2] = p0[A_hstep * 2];
            pp[3] = p0[A_hstep * 3];
            pp[4] = p0[1];
            pp[5] = p0[A_hstep + 1];
            pp[6] = p0[A_hstep * 2 + 1];
            pp[7] = p0[A_hstep * 3 + 1];
            pp[8] = p0[2];
            pp[9] = p0[A_hstep + 2];
            pp[10] = p0[A_hstep * 2 + 2];
            pp[11] = p0[A_hstep * 3 + 2];
            pp[12] = p0[3];
            pp[13] = p0[A_hstep + 3];
            pp[14] = p0[A_hstep * 2 + 3];
            pp[15] = p0[A_hstep * 3 + 3];
            pp += 16;
            p0 += A_hstep * 4;
        }
#endif // __ARM_FEATURE_DOTPROD
        // 2 rows x 4 columns -> 8 bytes: one (k, k+1) pair per column.
        for (; kk + 1 < max_kk; kk += 2)
        {
            pp[0] = p0[0];
            pp[1] = p0[A_hstep];
            pp[2] = p0[1];
            pp[3] = p0[A_hstep + 1];
            pp[4] = p0[2];
            pp[5] = p0[A_hstep + 2];
            pp[6] = p0[3];
            pp[7] = p0[A_hstep + 3];
            pp += 8;
            p0 += A_hstep * 2;
        }
        // Leftover single row: the 4 column values in order.
        for (; kk < max_kk; kk++)
        {
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p0[2];
            pp[3] = p0[3];
            pp += 4;
            p0 += A_hstep;
        }
    }
#endif // __ARM_NEON
    // ---- 2 columns at a time (also compiled for non-NEON builds) ----
    for (; ii + 1 < max_ii; ii += 2)
    {
        const signed char* p0 = A.row<const signed char>(k) + (i + ii);

        int kk = 0;
        // The multi-row k-blocks are only emitted on NEON builds; without NEON
        // every row goes through the single-row tail loop below.
#if __ARM_NEON
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
        // 8 rows x 2 columns -> 16 bytes: rows 0..7 of column 0, then of column 1.
        for (; kk + 7 < max_kk; kk += 8)
        {
            pp[0] = p0[0];
            pp[1] = p0[A_hstep];
            pp[2] = p0[A_hstep * 2];
            pp[3] = p0[A_hstep * 3];
            pp[4] = p0[A_hstep * 4];
            pp[5] = p0[A_hstep * 5];
            pp[6] = p0[A_hstep * 6];
            pp[7] = p0[A_hstep * 7];
            pp[8] = p0[1];
            pp[9] = p0[A_hstep + 1];
            pp[10] = p0[A_hstep * 2 + 1];
            pp[11] = p0[A_hstep * 3 + 1];
            pp[12] = p0[A_hstep * 4 + 1];
            pp[13] = p0[A_hstep * 5 + 1];
            pp[14] = p0[A_hstep * 6 + 1];
            pp[15] = p0[A_hstep * 7 + 1];
            pp += 16;
            p0 += A_hstep * 8;
        }
#endif // __ARM_FEATURE_MATMUL_INT8
        // 4 rows x 2 columns -> 8 bytes: rows 0..3 of column 0, then of column 1.
        for (; kk + 3 < max_kk; kk += 4)
        {
            pp[0] = p0[0];
            pp[1] = p0[A_hstep];
            pp[2] = p0[A_hstep * 2];
            pp[3] = p0[A_hstep * 3];
            pp[4] = p0[1];
            pp[5] = p0[A_hstep + 1];
            pp[6] = p0[A_hstep * 2 + 1];
            pp[7] = p0[A_hstep * 3 + 1];
            pp += 8;
            p0 += A_hstep * 4;
        }
#endif // __ARM_FEATURE_DOTPROD
        // 2 rows x 2 columns -> 4 bytes: one (k, k+1) pair per column.
        for (; kk + 1 < max_kk; kk += 2)
        {
            pp[0] = p0[0];
            pp[1] = p0[A_hstep];
            pp[2] = p0[1];
            pp[3] = p0[A_hstep + 1];
            pp += 4;
            p0 += A_hstep * 2;
        }
#endif // __ARM_NEON
        // Leftover rows: the 2 column values of each row in order.
        for (; kk < max_kk; kk++)
        {
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp += 2;
            p0 += A_hstep;
        }
    }
    // ---- last single column: a plain strided gather down the rows ----
    for (; ii < max_ii; ii += 1)
    {
        const signed char* p0 = A.row<const signed char>(k) + (i + ii);

        int kk = 0;
        // Disabled 2-row unrolled variant, kept for reference; it would produce
        // the same bytes as the one-row loop that follows.
        // for (; kk + 1 < max_kk; kk += 2)
        // {
        //     pp[0] = p0[0];
        //     pp[1] = p0[A_hstep];
        //     pp += 2;
        //     p0 += A_hstep * 2;
        // }
        for (; kk < max_kk; kk++)
        {
            pp[0] = p0[0];
            pp += 1;
            p0 += A_hstep;
        }
    }
}

static void pack_B_tile_int8(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        pack_B_tile_int8_i8mm(B, BT, j, max_jj, k, max_kk);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        pack_B_tile_int8_asimddp(B, BT, j, max_jj, k, max_kk);
        return;
    }
#endif

    // NCNN_LOGE("pack_B_tile_int8");
    // assert B.elempack == 1
    // assert B.dims == 2

    signed char* pp = BT;

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    for (; jj + 7 < max_jj; jj += 8)
    {
        const signed char* p0 = B.row<const signed char>(j + jj) + k;
        const signed char* p1 = B.row<const signed char>(j + jj + 1) + k;
        const signed char* p2 = B.row<const signed char>(j + jj + 2) + k;
        const signed char* p3 = B.row<const signed char>(j + jj + 3) + k;
        const signed char* p4 = B.row<const signed char>(j + jj + 4) + k;
        const signed char* p5 = B.row<const signed char>(j + jj + 5) + k;
        const signed char* p6 = B.row<const signed char>(j + jj + 6) + k;
        const signed char* p7 = B.row<const signed char>(j + jj + 7) + k;

        int kk = 0;
        for (; kk + 15 < max_kk; kk += 16)
        {
            int8x16_t _p0 = vld1q_s8(p0);
            int8x16_t _p1 = vld1q_s8(p1);
            int8x16_t _p2 = vld1q_s8(p2);
            int8x16_t _p3 = vld1q_s8(p3);
            int8x16_t _p4 = vld1q_s8(p4);
            int8x16_t _p5 = vld1q_s8(p5);
            int8x16_t _p6 = vld1q_s8(p6);
            int8x16_t _p7 = vld1q_s8(p7);
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            int8x16_t _r0 = vcombine_s8(vget_low_s8(_p0), vget_low_s8(_p1));
            int8x16_t _r1 = vcombine_s8(vget_low_s8(_p2), vget_low_s8(_p3));
            int8x16_t _r2 = vcombine_s8(vget_low_s8(_p4), vget_low_s8(_p5));
            int8x16_t _r3 = vcombine_s8(vget_low_s8(_p6), vget_low_s8(_p7));
            int8x16_t _r4 = vcombine_s8(vget_high_s8(_p0), vget_high_s8(_p1));
            int8x16_t _r5 = vcombine_s8(vget_high_s8(_p2), vget_high_s8(_p3));
            int8x16_t _r6 = vcombine_s8(vget_high_s8(_p4), vget_high_s8(_p5));
            int8x16_t _r7 = vcombine_s8(vget_high_s8(_p6), vget_high_s8(_p7));
#else  // __ARM_FEATURE_MATMUL_INT8
            int32x4x2_t _p01 = vzipq_s32(vreinterpretq_s32_s8(_p0), vreinterpretq_s32_s8(_p1));
            int32x4x2_t _p23 = vzipq_s32(vreinterpretq_s32_s8(_p2), vreinterpretq_s32_s8(_p3));
            int32x4x2_t _p45 = vzipq_s32(vreinterpretq_s32_s8(_p4), vreinterpretq_s32_s8(_p5));
            int32x4x2_t _p67 = vzipq_s32(vreinterpretq_s32_s8(_p6), vreinterpretq_s32_s8(_p7));
            int8x16_t _r0 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_p01.val[0]), vget_low_s32(_p23.val[0])));
            int8x16_t _r1 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_p45.val[0]), vget_low_s32(_p67.val[0])));
            int8x16_t _r2 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_p01.val[0]), vget_high_s32(_p23.val[0])));
            int8x16_t _r3 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_p45.val[0]), vget_high_s32(_p67.val[0])));
            int8x16_t _r4 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_p01.val[1]), vget_low_s32(_p23.val[1])));
            int8x16_t _r5 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_p45.val[1]), vget_low_s32(_p67.val[1])));
            int8x16_t _r6 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_p01.val[1]), vget_high_s32(_p23.val[1])));
            int8x16_t _r7 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_p45.val[1]), vget_high_s32(_p67.val[1])));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
            int16x8x2_t _p01 = vzipq_s16(vreinterpretq_s16_s8(_p0), vreinterpretq_s16_s8(_p1));
            int16x8x2_t _p23 = vzipq_s16(vreinterpretq_s16_s8(_p2), vreinterpretq_s16_s8(_p3));
            int16x8x2_t _p45 = vzipq_s16(vreinterpretq_s16_s8(_p4), vreinterpretq_s16_s8(_p5));
            int16x8x2_t _p67 = vzipq_s16(vreinterpretq_s16_s8(_p6), vreinterpretq_s16_s8(_p7));
            int32x4x2_t _t0 = vzipq_s32(vreinterpretq_s32_s16(_p01.val[0]), vreinterpretq_s32_s16(_p23.val[0]));
            int32x4x2_t _t1 = vzipq_s32(vreinterpretq_s32_s16(_p01.val[1]), vreinterpretq_s32_s16(_p23.val[1]));
            int32x4x2_t _t2 = vzipq_s32(vreinterpretq_s32_s16(_p45.val[0]), vreinterpretq_s32_s16(_p67.val[0]));
            int32x4x2_t _t3 = vzipq_s32(vreinterpretq_s32_s16(_p45.val[1]), vreinterpretq_s32_s16(_p67.val[1]));
            int8x16_t _r0 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t2.val[0])));
            int8x16_t _r1 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t2.val[0])));
            int8x16_t _r2 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t0.val[1]), vget_low_s32(_t2.val[1])));
            int8x16_t _r3 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t0.val[1]), vget_high_s32(_t2.val[1])));
            int8x16_t _r4 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t1.val[0]), vget_low_s32(_t3.val[0])));
            int8x16_t _r5 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t1.val[0]), vget_high_s32(_t3.val[0])));
            int8x16_t _r6 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t3.val[1])));
            int8x16_t _r7 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t3.val[1])));
#endif // __ARM_FEATURE_DOTPROD
            vst1q_s8(pp, _r0);
            vst1q_s8(pp + 16, _r1);
            vst1q_s8(pp + 32, _r2);
            vst1q_s8(pp + 48, _r3);
            vst1q_s8(pp + 64, _r4);
            vst1q_s8(pp + 80, _r5);
            vst1q_s8(pp + 96, _r6);
            vst1q_s8(pp + 112, _r7);
            pp += 128;
            p0 += 16;
            p1 += 16;
            p2 += 16;
            p3 += 16;
            p4 += 16;
            p5 += 16;
            p6 += 16;
            p7 += 16;
        }
        for (; kk + 7 < max_kk; kk += 8)
        {
            int8x8_t _p0 = vld1_s8(p0);
            int8x8_t _p1 = vld1_s8(p1);
            int8x8_t _p2 = vld1_s8(p2);
            int8x8_t _p3 = vld1_s8(p3);
            int8x8_t _p4 = vld1_s8(p4);
            int8x8_t _p5 = vld1_s8(p5);
            int8x8_t _p6 = vld1_s8(p6);
            int8x8_t _p7 = vld1_s8(p7);
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            int8x16_t _r0 = vcombine_s8(_p0, _p1);
            int8x16_t _r1 = vcombine_s8(_p2, _p3);
            int8x16_t _r2 = vcombine_s8(_p4, _p5);
            int8x16_t _r3 = vcombine_s8(_p6, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
            int32x2x2_t _p01 = vzip_s32(vreinterpret_s32_s8(_p0), vreinterpret_s32_s8(_p1));
            int32x2x2_t _p23 = vzip_s32(vreinterpret_s32_s8(_p2), vreinterpret_s32_s8(_p3));
            int32x2x2_t _p45 = vzip_s32(vreinterpret_s32_s8(_p4), vreinterpret_s32_s8(_p5));
            int32x2x2_t _p67 = vzip_s32(vreinterpret_s32_s8(_p6), vreinterpret_s32_s8(_p7));
            int8x16_t _r0 = vreinterpretq_s8_s32(vcombine_s32(_p01.val[0], _p23.val[0]));
            int8x16_t _r1 = vreinterpretq_s8_s32(vcombine_s32(_p45.val[0], _p67.val[0]));
            int8x16_t _r2 = vreinterpretq_s8_s32(vcombine_s32(_p01.val[1], _p23.val[1]));
            int8x16_t _r3 = vreinterpretq_s8_s32(vcombine_s32(_p45.val[1], _p67.val[1]));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
            int16x8_t _p04 = vreinterpretq_s16_s8(vcombine_s8(_p0, _p4));
            int16x8_t _p15 = vreinterpretq_s16_s8(vcombine_s8(_p1, _p5));
            int16x8_t _p26 = vreinterpretq_s16_s8(vcombine_s8(_p2, _p6));
            int16x8_t _p37 = vreinterpretq_s16_s8(vcombine_s8(_p3, _p7));
            int16x8x2_t _t0 = vzipq_s16(_p04, _p15);
            int16x8x2_t _t1 = vzipq_s16(_p26, _p37);
            int32x4x2_t _t2 = vzipq_s32(vreinterpretq_s32_s16(_t0.val[0]), vreinterpretq_s32_s16(_t1.val[0]));
            int32x4x2_t _t3 = vzipq_s32(vreinterpretq_s32_s16(_t0.val[1]), vreinterpretq_s32_s16(_t1.val[1]));
            int8x16_t _r0 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0])));
            int8x16_t _r1 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0])));
            int8x16_t _r2 = vreinterpretq_s8_s32(vcombine_s32(vget_low_s32(_t2.val[1]), vget_low_s32(_t3.val[1])));
            int8x16_t _r3 = vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(_t2.val[1]), vget_high_s32(_t3.val[1])));
#endif // __ARM_FEATURE_DOTPROD
            vst1q_s8(pp, _r0);
            vst1q_s8(pp + 16, _r1);
            vst1q_s8(pp + 32, _r2);
            vst1q_s8(pp + 48, _r3);
            pp += 64;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
            p4 += 8;
            p5 += 8;
            p6 += 8;
            p7 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
#if __ARM_FEATURE_DOTPROD
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p0[2];
            pp[3] = p0[3];
            pp[4] = p1[0];
            pp[5] = p1[1];
            pp[6] = p1[2];
            pp[7] = p1[3];
            pp[8] = p2[0];
            pp[9] = p2[1];
            pp[10] = p2[2];
            pp[11] = p2[3];
            pp[12] = p3[0];
            pp[13] = p3[1];
            pp[14] = p3[2];
            pp[15] = p3[3];
            pp[16] = p4[0];
            pp[17] = p4[1];
            pp[18] = p4[2];
            pp[19] = p4[3];
            pp[20] = p5[0];
            pp[21] = p5[1];
            pp[22] = p5[2];
            pp[23] = p5[3];
            pp[24] = p6[0];
            pp[25] = p6[1];
            pp[26] = p6[2];
            pp[27] = p6[3];
            pp[28] = p7[0];
            pp[29] = p7[1];
            pp[30] = p7[2];
            pp[31] = p7[3];
#else  // __ARM_FEATURE_DOTPROD
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p1[0];
            pp[3] = p1[1];
            pp[4] = p2[0];
            pp[5] = p2[1];
            pp[6] = p3[0];
            pp[7] = p3[1];
            pp[8] = p4[0];
            pp[9] = p4[1];
            pp[10] = p5[0];
            pp[11] = p5[1];
            pp[12] = p6[0];
            pp[13] = p6[1];
            pp[14] = p7[0];
            pp[15] = p7[1];
            pp[16] = p0[2];
            pp[17] = p0[3];
            pp[18] = p1[2];
            pp[19] = p1[3];
            pp[20] = p2[2];
            pp[21] = p2[3];
            pp[22] = p3[2];
            pp[23] = p3[3];
            pp[24] = p4[2];
            pp[25] = p4[3];
            pp[26] = p5[2];
            pp[27] = p5[3];
            pp[28] = p6[2];
            pp[29] = p6[3];
            pp[30] = p7[2];
            pp[31] = p7[3];
#endif // __ARM_FEATURE_DOTPROD
            pp += 32;
            p0 += 4;
            p1 += 4;
            p2 += 4;
            p3 += 4;
            p4 += 4;
            p5 += 4;
            p6 += 4;
            p7 += 4;
        }
        for (; kk + 1 < max_kk; kk += 2)
        {
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p1[0];
            pp[3] = p1[1];
            pp[4] = p2[0];
            pp[5] = p2[1];
            pp[6] = p3[0];
            pp[7] = p3[1];
            pp[8] = p4[0];
            pp[9] = p4[1];
            pp[10] = p5[0];
            pp[11] = p5[1];
            pp[12] = p6[0];
            pp[13] = p6[1];
            pp[14] = p7[0];
            pp[15] = p7[1];
            pp += 16;
            p0 += 2;
            p1 += 2;
            p2 += 2;
            p3 += 2;
            p4 += 2;
            p5 += 2;
            p6 += 2;
            p7 += 2;
        }
        for (; kk < max_kk; kk++)
        {
            pp[0] = p0[0];
            pp[1] = p1[0];
            pp[2] = p2[0];
            pp[3] = p3[0];
            pp[4] = p4[0];
            pp[5] = p5[0];
            pp[6] = p6[0];
            pp[7] = p7[0];
            pp += 8;
            p0++;
            p1++;
            p2++;
            p3++;
            p4++;
            p5++;
            p6++;
            p7++;
        }
    }
#endif // __aarch64__
    for (; jj + 3 < max_jj; jj += 4)
    {
        const signed char* p0 = B.row<const signed char>(j + jj) + k;
        const signed char* p1 = B.row<const signed char>(j + jj + 1) + k;
        const signed char* p2 = B.row<const signed char>(j + jj + 2) + k;
        const signed char* p3 = B.row<const signed char>(j + jj + 3) + k;

        int kk = 0;
        for (; kk + 15 < max_kk; kk += 16)
        {
            int8x16_t _p0 = vld1q_s8(p0);
            int8x16_t _p1 = vld1q_s8(p1);
            int8x16_t _p2 = vld1q_s8(p2);
            int8x16_t _p3 = vld1q_s8(p3);
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            int64x2x4_t _r0123;
            _r0123.val[0] = vreinterpretq_s64_s8(_p0);
            _r0123.val[1] = vreinterpretq_s64_s8(_p1);
            _r0123.val[2] = vreinterpretq_s64_s8(_p2);
            _r0123.val[3] = vreinterpretq_s64_s8(_p3);
            vst4q_s64((int64_t*)pp, _r0123);
#else  // __ARM_FEATURE_MATMUL_INT8
            int32x4x4_t _r0123;
            _r0123.val[0] = vreinterpretq_s32_s8(_p0);
            _r0123.val[1] = vreinterpretq_s32_s8(_p1);
            _r0123.val[2] = vreinterpretq_s32_s8(_p2);
            _r0123.val[3] = vreinterpretq_s32_s8(_p3);
            vst4q_s32((int*)pp, _r0123);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
            int16x8x4_t _r0123;
            _r0123.val[0] = vreinterpretq_s16_s8(_p0);
            _r0123.val[1] = vreinterpretq_s16_s8(_p1);
            _r0123.val[2] = vreinterpretq_s16_s8(_p2);
            _r0123.val[3] = vreinterpretq_s16_s8(_p3);
            vst4q_s16((short*)pp, _r0123);
#endif // __ARM_FEATURE_DOTPROD
            pp += 64;
            p0 += 16;
            p1 += 16;
            p2 += 16;
            p3 += 16;
        }
        for (; kk + 7 < max_kk; kk += 8)
        {
            int8x8_t _p0 = vld1_s8(p0);
            int8x8_t _p1 = vld1_s8(p1);
            int8x8_t _p2 = vld1_s8(p2);
            int8x8_t _p3 = vld1_s8(p3);
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            vst1q_s8(pp, vcombine_s8(_p0, _p1));
            vst1q_s8(pp + 16, vcombine_s8(_p2, _p3));
#else  // __ARM_FEATURE_MATMUL_INT8
            int32x2x4_t _r0123;
            _r0123.val[0] = vreinterpret_s32_s8(_p0);
            _r0123.val[1] = vreinterpret_s32_s8(_p1);
            _r0123.val[2] = vreinterpret_s32_s8(_p2);
            _r0123.val[3] = vreinterpret_s32_s8(_p3);
            vst4_s32((int*)pp, _r0123);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
            int16x4x4_t _r0123;
            _r0123.val[0] = vreinterpret_s16_s8(_p0);
            _r0123.val[1] = vreinterpret_s16_s8(_p1);
            _r0123.val[2] = vreinterpret_s16_s8(_p2);
            _r0123.val[3] = vreinterpret_s16_s8(_p3);
            vst4_s16((short*)pp, _r0123);
#endif // __ARM_FEATURE_DOTPROD
            pp += 32;
            p0 += 8;
            p1 += 8;
            p2 += 8;
            p3 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
#if __ARM_FEATURE_DOTPROD
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p0[2];
            pp[3] = p0[3];
            pp[4] = p1[0];
            pp[5] = p1[1];
            pp[6] = p1[2];
            pp[7] = p1[3];
            pp[8] = p2[0];
            pp[9] = p2[1];
            pp[10] = p2[2];
            pp[11] = p2[3];
            pp[12] = p3[0];
            pp[13] = p3[1];
            pp[14] = p3[2];
            pp[15] = p3[3];
#else  // __ARM_FEATURE_DOTPROD
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p1[0];
            pp[3] = p1[1];
            pp[4] = p2[0];
            pp[5] = p2[1];
            pp[6] = p3[0];
            pp[7] = p3[1];
            pp[8] = p0[2];
            pp[9] = p0[3];
            pp[10] = p1[2];
            pp[11] = p1[3];
            pp[12] = p2[2];
            pp[13] = p2[3];
            pp[14] = p3[2];
            pp[15] = p3[3];
#endif // __ARM_FEATURE_DOTPROD
            pp += 16;
            p0 += 4;
            p1 += 4;
            p2 += 4;
            p3 += 4;
        }
        for (; kk + 1 < max_kk; kk += 2)
        {
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p1[0];
            pp[3] = p1[1];
            pp[4] = p2[0];
            pp[5] = p2[1];
            pp[6] = p3[0];
            pp[7] = p3[1];
            pp += 8;
            p0 += 2;
            p1 += 2;
            p2 += 2;
            p3 += 2;
        }
        for (; kk < max_kk; kk++)
        {
            pp[0] = p0[0];
            pp[1] = p1[0];
            pp[2] = p2[0];
            pp[3] = p3[0];
            pp += 4;
            p0++;
            p1++;
            p2++;
            p3++;
        }
    }
#endif // __ARM_NEON
    for (; jj + 1 < max_jj; jj += 2)
    {
        const signed char* p0 = B.row<const signed char>(j + jj) + k;
        const signed char* p1 = B.row<const signed char>(j + jj + 1) + k;

        int kk = 0;
#if __ARM_NEON
        for (; kk + 15 < max_kk; kk += 16)
        {
            int8x16_t _p0 = vld1q_s8(p0);
            int8x16_t _p1 = vld1q_s8(p1);
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            int64x2x2_t _r01;
            _r01.val[0] = vreinterpretq_s64_s8(_p0);
            _r01.val[1] = vreinterpretq_s64_s8(_p1);
            vst2q_s64((int64_t*)pp, _r01);
#else  // __ARM_FEATURE_MATMUL_INT8
            int32x4x2_t _r01;
            _r01.val[0] = vreinterpretq_s32_s8(_p0);
            _r01.val[1] = vreinterpretq_s32_s8(_p1);
            vst2q_s32((int*)pp, _r01);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
            int16x8x2_t _r01;
            _r01.val[0] = vreinterpretq_s16_s8(_p0);
            _r01.val[1] = vreinterpretq_s16_s8(_p1);
            vst2q_s16((short*)pp, _r01);
#endif // __ARM_FEATURE_DOTPROD
            pp += 32;
            p0 += 16;
            p1 += 16;
        }
        for (; kk + 7 < max_kk; kk += 8)
        {
            int8x8_t _p0 = vld1_s8(p0);
            int8x8_t _p1 = vld1_s8(p1);
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            vst1q_s8(pp, vcombine_s8(_p0, _p1));
#else  // __ARM_FEATURE_MATMUL_INT8
            int32x2x2_t _r01;
            _r01.val[0] = vreinterpret_s32_s8(_p0);
            _r01.val[1] = vreinterpret_s32_s8(_p1);
            vst2_s32((int*)pp, _r01);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
            int16x4x2_t _r01;
            _r01.val[0] = vreinterpret_s16_s8(_p0);
            _r01.val[1] = vreinterpret_s16_s8(_p1);
            vst2_s16((short*)pp, _r01);
#endif // __ARM_FEATURE_DOTPROD
            pp += 16;
            p0 += 8;
            p1 += 8;
        }
        for (; kk + 3 < max_kk; kk += 4)
        {
#if __ARM_FEATURE_DOTPROD
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p0[2];
            pp[3] = p0[3];
            pp[4] = p1[0];
            pp[5] = p1[1];
            pp[6] = p1[2];
            pp[7] = p1[3];
#else  // __ARM_FEATURE_DOTPROD
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p1[0];
            pp[3] = p1[1];
            pp[4] = p0[2];
            pp[5] = p0[3];
            pp[6] = p1[2];
            pp[7] = p1[3];
#endif // __ARM_FEATURE_DOTPROD
            pp += 8;
            p0 += 4;
            p1 += 4;
        }
        for (; kk + 1 < max_kk; kk += 2)
        {
            pp[0] = p0[0];
            pp[1] = p0[1];
            pp[2] = p1[0];
            pp[3] = p1[1];
            pp += 4;
            p0 += 2;
            p1 += 2;
        }
#endif // __ARM_NEON
        for (; kk < max_kk; kk++)
        {
            pp[0] = p0[0];
            pp[1] = p1[0];
            pp += 2;
            p0++;
            p1++;
        }
    }
    for (; jj < max_jj; jj += 1)
    {
        const signed char* p0 = B.row<const signed char>(j + jj) + k;

        int kk = 0;
#if __ARM_NEON
        for (; kk + 15 < max_kk; kk += 16)
        {
            vst1q_s8(pp, vld1q_s8(p0));
            pp += 16;
            p0 += 16;
        }
        for (; kk + 7 < max_kk; kk += 8)
        {
            vst1_s8(pp, vld1_s8(p0));
            pp += 8;
            p0 += 8;
        }
#endif // __ARM_NEON
        for (; kk < max_kk; kk++)
        {
            pp[0] = p0[0];
            pp += 1;
            p0++;
        }
    }
}

// Gather a max_kk x max_jj tile of the int8 matrix B (rows k.., columns j..) into BT.
// B is read as B.row(k)[j + jj] and advanced by B.w elements per k step.
// Columns are consumed in groups of 8 (aarch64 NEON builds), 4 (NEON builds), 2 and 1.
// Inside a column group k advances in steps of 8 (i8mm builds), 4 (dotprod builds),
// 2 (NEON builds) and 1; each step writes, column after column, that many consecutive
// k values. The single-column tail always advances one k at a time.
static void transpose_pack_B_tile_int8(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        transpose_pack_B_tile_int8_i8mm(B, BT, j, max_jj, k, max_kk);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        transpose_pack_B_tile_int8_asimddp(B, BT, j, max_jj, k, max_kk);
        return;
    }
#endif

    // expected by this routine but not checked: B.elempack == 1 and B.dims == 2

    // element distance between two consecutive k rows of B
    const int stride = B.w;

    signed char* dst = BT;

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    // ---- 8 columns at a time ----
    for (; jj + 7 < max_jj; jj += 8)
    {
        const signed char* src = B.row<const signed char>(k) + (j + jj);

        int kk = 0;
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
        for (; kk + 7 < max_kk; kk += 8)
        {
            // pair row r with row r+4 so that the two 4-way interleaving stores
            // below emit 8 consecutive k values for each of the 8 columns
            const int8x8x2_t _z04 = vzip_s8(vld1_s8(src), vld1_s8(src + stride * 4));
            const int8x8x2_t _z15 = vzip_s8(vld1_s8(src + stride), vld1_s8(src + stride * 5));
            const int8x8x2_t _z26 = vzip_s8(vld1_s8(src + stride * 2), vld1_s8(src + stride * 6));
            const int8x8x2_t _z37 = vzip_s8(vld1_s8(src + stride * 3), vld1_s8(src + stride * 7));

            int8x8x4_t _cols0123;
            _cols0123.val[0] = _z04.val[0];
            _cols0123.val[1] = _z15.val[0];
            _cols0123.val[2] = _z26.val[0];
            _cols0123.val[3] = _z37.val[0];

            int8x8x4_t _cols4567;
            _cols4567.val[0] = _z04.val[1];
            _cols4567.val[1] = _z15.val[1];
            _cols4567.val[2] = _z26.val[1];
            _cols4567.val[3] = _z37.val[1];

            vst4_s8(dst, _cols0123);
            vst4_s8(dst + 32, _cols4567);

            dst += 64;
            src += stride * 8;
        }
#endif // __ARM_FEATURE_MATMUL_INT8
        for (; kk + 3 < max_kk; kk += 4)
        {
            // four rows interleaved per column by the structured store
            int8x8x4_t _quad;
            _quad.val[0] = vld1_s8(src);
            _quad.val[1] = vld1_s8(src + stride);
            _quad.val[2] = vld1_s8(src + stride * 2);
            _quad.val[3] = vld1_s8(src + stride * 3);
            vst4_s8(dst, _quad);

            dst += 32;
            src += stride * 4;
        }
#endif // __ARM_FEATURE_DOTPROD
        for (; kk + 1 < max_kk; kk += 2)
        {
            // two rows interleaved per column
            int8x8x2_t _pair;
            _pair.val[0] = vld1_s8(src);
            _pair.val[1] = vld1_s8(src + stride);
            vst2_s8(dst, _pair);

            dst += 16;
            src += stride * 2;
        }
        for (; kk < max_kk; kk++)
        {
            // a lone row is copied straight through
            vst1_s8(dst, vld1_s8(src));

            dst += 8;
            src += stride;
        }
    }
#endif // __aarch64__
    // ---- 4 columns at a time ----
    for (; jj + 3 < max_jj; jj += 4)
    {
        const signed char* src = B.row<const signed char>(k) + (j + jj);

        int kk = 0;
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
        for (; kk + 7 < max_kk; kk += 8)
        {
            // dst[c * 8 + r] takes row r, column c of the current 8x4 patch
            for (int c = 0; c < 4; c++)
            {
                for (int r = 0; r < 8; r++)
                {
                    dst[c * 8 + r] = src[stride * r + c];
                }
            }

            dst += 32;
            src += stride * 8;
        }
#endif // __ARM_FEATURE_MATMUL_INT8
        for (; kk + 3 < max_kk; kk += 4)
        {
            // dst[c * 4 + r] takes row r, column c of the current 4x4 patch
            for (int c = 0; c < 4; c++)
            {
                for (int r = 0; r < 4; r++)
                {
                    dst[c * 4 + r] = src[stride * r + c];
                }
            }

            dst += 16;
            src += stride * 4;
        }
#endif // __ARM_FEATURE_DOTPROD
        for (; kk + 1 < max_kk; kk += 2)
        {
            // dst[c * 2 + r] takes row r, column c of the current 2x4 patch
            for (int c = 0; c < 4; c++)
            {
                for (int r = 0; r < 2; r++)
                {
                    dst[c * 2 + r] = src[stride * r + c];
                }
            }

            dst += 8;
            src += stride * 2;
        }
        for (; kk < max_kk; kk++)
        {
            for (int c = 0; c < 4; c++)
            {
                dst[c] = src[c];
            }

            dst += 4;
            src += stride;
        }
    }
#endif // __ARM_NEON
    // ---- 2 columns at a time ----
    for (; jj + 1 < max_jj; jj += 2)
    {
        const signed char* src = B.row<const signed char>(k) + (j + jj);

        int kk = 0;
#if __ARM_NEON
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
        for (; kk + 7 < max_kk; kk += 8)
        {
            // dst[c * 8 + r] takes row r, column c of the current 8x2 patch
            for (int c = 0; c < 2; c++)
            {
                for (int r = 0; r < 8; r++)
                {
                    dst[c * 8 + r] = src[stride * r + c];
                }
            }

            dst += 16;
            src += stride * 8;
        }
#endif // __ARM_FEATURE_MATMUL_INT8
        for (; kk + 3 < max_kk; kk += 4)
        {
            // dst[c * 4 + r] takes row r, column c of the current 4x2 patch
            for (int c = 0; c < 2; c++)
            {
                for (int r = 0; r < 4; r++)
                {
                    dst[c * 4 + r] = src[stride * r + c];
                }
            }

            dst += 8;
            src += stride * 4;
        }
#endif // __ARM_FEATURE_DOTPROD
        for (; kk + 1 < max_kk; kk += 2)
        {
            // dst[c * 2 + r] takes row r, column c of the current 2x2 patch
            for (int c = 0; c < 2; c++)
            {
                for (int r = 0; r < 2; r++)
                {
                    dst[c * 2 + r] = src[stride * r + c];
                }
            }

            dst += 4;
            src += stride * 2;
        }
#endif // __ARM_NEON
        for (; kk < max_kk; kk++)
        {
            dst[0] = src[0];
            dst[1] = src[1];

            dst += 2;
            src += stride;
        }
    }
    // ---- remaining single columns: one k value per step ----
    for (; jj < max_jj; jj += 1)
    {
        const signed char* src = B.row<const signed char>(k) + (j + jj);

        for (int kk = 0; kk < max_kk; kk++)
        {
            dst[0] = src[0];

            dst += 1;
            src += stride;
        }
    }
}

// Derive per-row int8 quantization parameters for rows [i, i + max_ii) of the fp32 matrix A.
// For every row the largest absolute value over all A.w columns is found, then
//   scales[row]       = 127 / absmax
//   out_descales[row] = absmax / (127 * B_scale)
// Only elempack 4 (NEON builds) and elempack 1 are handled here.
// There is no guard for absmax == 0; the divisions are performed as written.
static void compute_A_tile_fp32_int8_scales(const Mat& A, Mat& scales, float B_scale, Mat& out_descales, int i, int max_ii)
{
    const int elempack = A.elempack;
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;
    const int K = A.w;

    const float v127_B_scale = 127.f * B_scale;

    // both outputs are indexed by ii relative to row i
    float* const scale_out = (float*)scales + i;
    float* const descale_out = (float*)out_descales + i;

#if __ARM_NEON
    if (elempack == 4)
    {
#if __aarch64__
        const float32x4_t _v127 = vdupq_n_f32(127.f);
        const float32x4_t _v127_B_scale = vdupq_n_f32(v127_B_scale);
#endif

        for (int ii = 0; ii + 3 < max_ii; ii += 4)
        {
            const float* ptr = (const float*)A + (i + ii) * A_hstep;

            // each vector lane tracks one of the four packed rows;
            // four accumulators cover four columns per iteration
            float32x4_t _m0 = vdupq_n_f32(0.f);
            float32x4_t _m1 = vdupq_n_f32(0.f);
            float32x4_t _m2 = vdupq_n_f32(0.f);
            float32x4_t _m3 = vdupq_n_f32(0.f);

            int kk = 0;
            for (; kk + 3 < K; kk += 4)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(vld1q_f32(ptr)));
                _m1 = vmaxq_f32(_m1, vabsq_f32(vld1q_f32(ptr + 4)));
                _m2 = vmaxq_f32(_m2, vabsq_f32(vld1q_f32(ptr + 8)));
                _m3 = vmaxq_f32(_m3, vabsq_f32(vld1q_f32(ptr + 12)));
                ptr += 16;
            }
            _m0 = vmaxq_f32(_m0, _m2);
            _m1 = vmaxq_f32(_m1, _m3);
            for (; kk + 1 < K; kk += 2)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(vld1q_f32(ptr)));
                _m1 = vmaxq_f32(_m1, vabsq_f32(vld1q_f32(ptr + 4)));
                ptr += 8;
            }
            _m0 = vmaxq_f32(_m0, _m1);
            for (; kk < K; kk++)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(vld1q_f32(ptr)));
                ptr += 4;
            }

#if __aarch64__
            vst1q_f32(scale_out + ii, vdivq_f32(_v127, _m0));
            vst1q_f32(descale_out + ii, vdivq_f32(_m0, _v127_B_scale));
#else
            // vdivq_f32 is only used on aarch64; other builds divide lane by lane
            float lanes[4];
            vst1q_f32(lanes, _m0);

            for (int l = 0; l < 4; l++)
            {
                scale_out[ii + l] = 127.f / lanes[l];
            }
            for (int l = 0; l < 4; l++)
            {
                descale_out[ii + l] = lanes[l] / v127_B_scale;
            }
#endif
        }
    }
#endif // __ARM_NEON

    if (elempack != 1)
        return;

    for (int ii = 0; ii < max_ii; ii++)
    {
        const float* ptr = (const float*)A + (i + ii) * A_hstep;

        float absmax = 0.f;
        int kk = 0;
#if __ARM_NEON
        // vector part: 16, then 8, then 4 columns per iteration
        float32x4_t _m0 = vdupq_n_f32(0.f);
        float32x4_t _m1 = vdupq_n_f32(0.f);
        float32x4_t _m2 = vdupq_n_f32(0.f);
        float32x4_t _m3 = vdupq_n_f32(0.f);
        for (; kk + 15 < K; kk += 16)
        {
            _m0 = vmaxq_f32(_m0, vabsq_f32(vld1q_f32(ptr)));
            _m1 = vmaxq_f32(_m1, vabsq_f32(vld1q_f32(ptr + 4)));
            _m2 = vmaxq_f32(_m2, vabsq_f32(vld1q_f32(ptr + 8)));
            _m3 = vmaxq_f32(_m3, vabsq_f32(vld1q_f32(ptr + 12)));
            ptr += 16;
        }
        _m0 = vmaxq_f32(_m0, _m2);
        _m1 = vmaxq_f32(_m1, _m3);
        for (; kk + 7 < K; kk += 8)
        {
            _m0 = vmaxq_f32(_m0, vabsq_f32(vld1q_f32(ptr)));
            _m1 = vmaxq_f32(_m1, vabsq_f32(vld1q_f32(ptr + 4)));
            ptr += 8;
        }
        _m0 = vmaxq_f32(_m0, _m1);
        for (; kk + 3 < K; kk += 4)
        {
            _m0 = vmaxq_f32(_m0, vabsq_f32(vld1q_f32(ptr)));
            ptr += 4;
        }

        // fold the four lanes into the scalar running maximum
        const float32x2_t _half = vmax_f32(vget_low_f32(_m0), vget_high_f32(_m0));
        absmax = std::max(absmax, std::max(vget_lane_f32(_half, 0), vget_lane_f32(_half, 1)));
#endif // __ARM_NEON
        // scalar tail (the whole row when NEON is unavailable)
        for (; kk < K; kk++)
        {
            absmax = std::max(absmax, (float)fabsf(ptr[0]));
            ptr++;
        }

        scale_out[ii] = 127.f / absmax;
        descale_out[ii] = absmax / v127_B_scale;
    }
}

static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        pack_A_tile_fp32_to_int8_i8mm(A, AT, i, max_ii, k, max_kk, scales);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        pack_A_tile_fp32_to_int8_asimddp(A, AT, i, max_ii, k, max_kk, scales);
        return;
    }
#endif

    const int elempack = A.elempack;
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;

    // NCNN_LOGE("pack_A_tile_fp32_to_int8 %d %d", max_ii, elempack);

    signed char* pp = AT;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        const float* p0 = (const float*)A + (i + ii) * A_hstep + k * elempack;

        float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii);
        float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4);

        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
#if __ARM_FEATURE_DOTPROD
                float32x4x4_t _p = vld4q_f32(p0);
                float32x4x4_t _q = vld4q_f32(p0 + 16);
                float32x4x4_t _r = vld4q_f32(p0 + A_hstep * 4);
                float32x4x4_t _s = vld4q_f32(p0 + A_hstep * 4 + 16);

                float32x4_t _p0 = vmulq_laneq_f32(_p.val[0], _scale0, 0);
                float32x4_t _p1 = vmulq_laneq_f32(_p.val[1], _scale0, 1);
                float32x4_t _p2 = vmulq_laneq_f32(_p.val[2], _scale0, 2);
                float32x4_t _p3 = vmulq_laneq_f32(_p.val[3], _scale0, 3);
                float32x4_t _p4 = vmulq_laneq_f32(_q.val[0], _scale0, 0);
                float32x4_t _p5 = vmulq_laneq_f32(_q.val[1], _scale0, 1);
                float32x4_t _p6 = vmulq_laneq_f32(_q.val[2], _scale0, 2);
                float32x4_t _p7 = vmulq_laneq_f32(_q.val[3], _scale0, 3);
                float32x4_t _p8 = vmulq_laneq_f32(_r.val[0], _scale1, 0);
                float32x4_t _p9 = vmulq_laneq_f32(_r.val[1], _scale1, 1);
                float32x4_t _pa = vmulq_laneq_f32(_r.val[2], _scale1, 2);
                float32x4_t _pb = vmulq_laneq_f32(_r.val[3], _scale1, 3);
                float32x4_t _pc = vmulq_laneq_f32(_s.val[0], _scale1, 0);
                float32x4_t _pd = vmulq_laneq_f32(_s.val[1], _scale1, 1);
                float32x4_t _pe = vmulq_laneq_f32(_s.val[2], _scale1, 2);
                float32x4_t _pf = vmulq_laneq_f32(_s.val[3], _scale1, 3);

#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
                int8x8_t _r4 = float2int8(_p8, _pc);
                int8x8_t _r5 = float2int8(_p9, _pd);
                int8x8_t _r6 = float2int8(_pa, _pe);
                int8x8_t _r7 = float2int8(_pb, _pf);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p8, _p9);
                int8x8_t _r3 = float2int8(_pa, _pb);
                int8x8_t _r4 = float2int8(_p4, _p5);
                int8x8_t _r5 = float2int8(_p6, _p7);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);
#endif // __ARM_FEATURE_MATMUL_INT8

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);
                float32x4_t _p4 = vld1q_f32(p0 + 16);
                float32x4_t _p5 = vld1q_f32(p0 + 20);
                float32x4_t _p6 = vld1q_f32(p0 + 24);
                float32x4_t _p7 = vld1q_f32(p0 + 28);
                float32x4_t _p8 = vld1q_f32(p0 + A_hstep * 4);
                float32x4_t _p9 = vld1q_f32(p0 + A_hstep * 4 + 4);
                float32x4_t _pa = vld1q_f32(p0 + A_hstep * 4 + 8);
                float32x4_t _pb = vld1q_f32(p0 + A_hstep * 4 + 12);
                float32x4_t _pc = vld1q_f32(p0 + A_hstep * 4 + 16);
                float32x4_t _pd = vld1q_f32(p0 + A_hstep * 4 + 20);
                float32x4_t _pe = vld1q_f32(p0 + A_hstep * 4 + 24);
                float32x4_t _pf = vld1q_f32(p0 + A_hstep * 4 + 28);

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale0);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale0);
                _p4 = vmulq_f32(_p4, _scale0);
                _p5 = vmulq_f32(_p5, _scale0);
                _p6 = vmulq_f32(_p6, _scale0);
                _p7 = vmulq_f32(_p7, _scale0);
                _p8 = vmulq_f32(_p8, _scale1);
                _p9 = vmulq_f32(_p9, _scale1);
                _pa = vmulq_f32(_pa, _scale1);
                _pb = vmulq_f32(_pb, _scale1);
                _pc = vmulq_f32(_pc, _scale1);
                _pd = vmulq_f32(_pd, _scale1);
                _pe = vmulq_f32(_pe, _scale1);
                _pf = vmulq_f32(_pf, _scale1);

                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p8), float2int8(_p2, _pa));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p9), float2int8(_p3, _pb));
                int8x16x2_t _r23;
                _r23.val[0] = vcombine_s8(float2int8(_p4, _pc), float2int8(_p6, _pe));
                _r23.val[1] = vcombine_s8(float2int8(_p5, _pd), float2int8(_p7, _pf));

                vst2q_s8(pp, _r01);
                vst2q_s8(pp + 32, _r23);
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += 32;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                float32x4x4_t _p = vld4q_f32(p0);
                float32x4x4_t _q = vld4q_f32(p0 + A_hstep * 4);

                float32x4_t _p0 = vmulq_laneq_f32(_p.val[0], _scale0, 0);
                float32x4_t _p1 = vmulq_laneq_f32(_p.val[1], _scale0, 1);
                float32x4_t _p2 = vmulq_laneq_f32(_p.val[2], _scale0, 2);
                float32x4_t _p3 = vmulq_laneq_f32(_p.val[3], _scale0, 3);
                float32x4_t _p4 = vmulq_laneq_f32(_q.val[0], _scale1, 0);
                float32x4_t _p5 = vmulq_laneq_f32(_q.val[1], _scale1, 1);
                float32x4_t _p6 = vmulq_laneq_f32(_q.val[2], _scale1, 2);
                float32x4_t _p7 = vmulq_laneq_f32(_q.val[3], _scale1, 3);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);
                float32x4_t _p4 = vld1q_f32(p0 + A_hstep * 4);
                float32x4_t _p5 = vld1q_f32(p0 + A_hstep * 4 + 4);
                float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 4 + 8);
                float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 4 + 12);

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale0);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale0);
                _p4 = vmulq_f32(_p4, _scale1);
                _p5 = vmulq_f32(_p5, _scale1);
                _p6 = vmulq_f32(_p6, _scale1);
                _p7 = vmulq_f32(_p7, _scale1);

                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p4), float2int8(_p2, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p5), float2int8(_p3, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += 16;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p0n = vld1q_f32(p0 + 4);
                float32x4_t _p1 = vld1q_f32(p0 + A_hstep * 4);
                float32x4_t _p1n = vld1q_f32(p0 + A_hstep * 4 + 4);

                _p0 = vmulq_f32(_p0, _scale0);
                _p0n = vmulq_f32(_p0n, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p1n = vmulq_f32(_p1n, _scale1);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p1);
                _r01.val[1] = float2int8(_p0n, _p1n);

                vst2_s8(pp, _r01);

                pp += 16;
                p0 += 8;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + A_hstep * 4);

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + A_hstep);
                float32x4_t _p3 = vld1q_f32(p0 + A_hstep + 4);
                float32x4_t _p4 = vld1q_f32(p0 + A_hstep * 2);
                float32x4_t _p5 = vld1q_f32(p0 + A_hstep * 2 + 4);
                float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 3);
                float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 3 + 4);
                float32x4_t _p8 = vld1q_f32(p0 + A_hstep * 4);
                float32x4_t _p9 = vld1q_f32(p0 + A_hstep * 4 + 4);
                float32x4_t _pa = vld1q_f32(p0 + A_hstep * 5);
                float32x4_t _pb = vld1q_f32(p0 + A_hstep * 5 + 4);
                float32x4_t _pc = vld1q_f32(p0 + A_hstep * 6);
                float32x4_t _pd = vld1q_f32(p0 + A_hstep * 6 + 4);
                float32x4_t _pe = vld1q_f32(p0 + A_hstep * 7);
                float32x4_t _pf = vld1q_f32(p0 + A_hstep * 7 + 4);

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale0, 0);
                _p2 = vmulq_laneq_f32(_p2, _scale0, 1);
                _p3 = vmulq_laneq_f32(_p3, _scale0, 1);
                _p4 = vmulq_laneq_f32(_p4, _scale0, 2);
                _p5 = vmulq_laneq_f32(_p5, _scale0, 2);
                _p6 = vmulq_laneq_f32(_p6, _scale0, 3);
                _p7 = vmulq_laneq_f32(_p7, _scale0, 3);
                _p8 = vmulq_laneq_f32(_p8, _scale1, 0);
                _p9 = vmulq_laneq_f32(_p9, _scale1, 0);
                _pa = vmulq_laneq_f32(_pa, _scale1, 1);
                _pb = vmulq_laneq_f32(_pb, _scale1, 1);
                _pc = vmulq_laneq_f32(_pc, _scale1, 2);
                _pd = vmulq_laneq_f32(_pd, _scale1, 2);
                _pe = vmulq_laneq_f32(_pe, _scale1, 3);
                _pf = vmulq_laneq_f32(_pf, _scale1, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 0);
                _p2 = vmulq_lane_f32(_p2, vget_low_f32(_scale0), 1);
                _p3 = vmulq_lane_f32(_p3, vget_low_f32(_scale0), 1);
                _p4 = vmulq_lane_f32(_p4, vget_high_f32(_scale0), 0);
                _p5 = vmulq_lane_f32(_p5, vget_high_f32(_scale0), 0);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale0), 1);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale0), 1);
                _p8 = vmulq_lane_f32(_p8, vget_low_f32(_scale1), 0);
                _p9 = vmulq_lane_f32(_p9, vget_low_f32(_scale1), 0);
                _pa = vmulq_lane_f32(_pa, vget_low_f32(_scale1), 1);
                _pb = vmulq_lane_f32(_pb, vget_low_f32(_scale1), 1);
                _pc = vmulq_lane_f32(_pc, vget_high_f32(_scale1), 0);
                _pd = vmulq_lane_f32(_pd, vget_high_f32(_scale1), 0);
                _pe = vmulq_lane_f32(_pe, vget_high_f32(_scale1), 1);
                _pf = vmulq_lane_f32(_pf, vget_high_f32(_scale1), 1);
#endif

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p8, _pa);
                int8x8_t _r3 = float2int8(_pc, _pe);
                int8x8_t _r4 = float2int8(_p1, _p3);
                int8x8_t _r5 = float2int8(_p5, _p7);
                int8x8_t _r6 = float2int8(_p9, _pb);
                int8x8_t _r7 = float2int8(_pd, _pf);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p4, _p6));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p8, _pa));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_pc, _pe));
                int16x4_t _t4 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4_t _t5 = vreinterpret_s16_s8(float2int8(_p5, _p7));
                int16x4_t _t6 = vreinterpret_s16_s8(float2int8(_p9, _pb));
                int16x4_t _t7 = vreinterpret_s16_s8(float2int8(_pd, _pf));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int16x4x2_t _t45 = vuzp_s16(_t4, _t5);
                int16x4x2_t _t67 = vuzp_s16(_t6, _t7);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
                int8x8_t _r4 = vreinterpret_s8_s16(_t45.val[0]);
                int8x8_t _r5 = vreinterpret_s8_s16(_t67.val[0]);
                int8x8_t _r6 = vreinterpret_s8_s16(_t45.val[1]);
                int8x8_t _r7 = vreinterpret_s8_s16(_t67.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));

                pp += 64;
                p0 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + A_hstep);
                float32x4_t _p2 = vld1q_f32(p0 + A_hstep * 2);
                float32x4_t _p3 = vld1q_f32(p0 + A_hstep * 3);
                float32x4_t _p4 = vld1q_f32(p0 + A_hstep * 4);
                float32x4_t _p5 = vld1q_f32(p0 + A_hstep * 5);
                float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 6);
                float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 7);

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale0, 3);
                _p4 = vmulq_laneq_f32(_p4, _scale1, 0);
                _p5 = vmulq_laneq_f32(_p5, _scale1, 1);
                _p6 = vmulq_laneq_f32(_p6, _scale1, 2);
                _p7 = vmulq_laneq_f32(_p7, _scale1, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
                _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
                _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
#endif

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p4, _p5));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p6, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                float32x2_t _p0 = vld1_f32(p0);
                float32x2_t _p1 = vld1_f32(p0 + A_hstep);
                float32x2_t _p2 = vld1_f32(p0 + A_hstep * 2);
                float32x2_t _p3 = vld1_f32(p0 + A_hstep * 3);
                float32x2_t _p4 = vld1_f32(p0 + A_hstep * 4);
                float32x2_t _p5 = vld1_f32(p0 + A_hstep * 5);
                float32x2_t _p6 = vld1_f32(p0 + A_hstep * 6);
                float32x2_t _p7 = vld1_f32(p0 + A_hstep * 7);

                float32x4_t _p01 = vcombine_f32(_p0, _p1);
                float32x4_t _p23 = vcombine_f32(_p2, _p3);
                float32x4_t _p45 = vcombine_f32(_p4, _p5);
                float32x4_t _p67 = vcombine_f32(_p6, _p7);

                float32x4x2_t _scale01 = vzipq_f32(_scale0, _scale0);
                float32x4x2_t _scale23 = vzipq_f32(_scale1, _scale1);

                _p01 = vmulq_f32(_p01, _scale01.val[0]);
                _p23 = vmulq_f32(_p23, _scale01.val[1]);
                _p45 = vmulq_f32(_p45, _scale23.val[0]);
                _p67 = vmulq_f32(_p67, _scale23.val[1]);

                int8x8_t _r0 = float2int8(_p01, _p23);
                int8x8_t _r1 = float2int8(_p45, _p67);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 2;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = float32x4_t();
                float32x4_t _p1 = float32x4_t();
                _p0 = vsetq_lane_f32(p0[0], _p0, 0);
                _p0 = vsetq_lane_f32(p0[A_hstep], _p0, 1);
                _p0 = vsetq_lane_f32(p0[A_hstep * 2], _p0, 2);
                _p0 = vsetq_lane_f32(p0[A_hstep * 3], _p0, 3);
                _p1 = vsetq_lane_f32(p0[A_hstep * 4], _p1, 0);
                _p1 = vsetq_lane_f32(p0[A_hstep * 5], _p1, 1);
                _p1 = vsetq_lane_f32(p0[A_hstep * 6], _p1, 2);
                _p1 = vsetq_lane_f32(p0[A_hstep * 7], _p1, 3);

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0++;
            }
        }
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        const float* p0 = (const float*)A + (i + ii) * A_hstep + k * elempack;

        float32x4_t _scale = vld1q_f32((const float*)scales + i + ii);

        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
#if __ARM_FEATURE_DOTPROD
                float32x4x4_t _p = vld4q_f32(p0);
                float32x4x4_t _q = vld4q_f32(p0 + 16);

                float32x4_t _p0 = vmulq_laneq_f32(_p.val[0], _scale, 0);
                float32x4_t _p1 = vmulq_laneq_f32(_p.val[1], _scale, 1);
                float32x4_t _p2 = vmulq_laneq_f32(_p.val[2], _scale, 2);
                float32x4_t _p3 = vmulq_laneq_f32(_p.val[3], _scale, 3);
                float32x4_t _p4 = vmulq_laneq_f32(_q.val[0], _scale, 0);
                float32x4_t _p5 = vmulq_laneq_f32(_q.val[1], _scale, 1);
                float32x4_t _p6 = vmulq_laneq_f32(_q.val[2], _scale, 2);
                float32x4_t _p7 = vmulq_laneq_f32(_q.val[3], _scale, 3);

#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);
                float32x4_t _p4 = vld1q_f32(p0 + 16);
                float32x4_t _p5 = vld1q_f32(p0 + 20);
                float32x4_t _p6 = vld1q_f32(p0 + 24);
                float32x4_t _p7 = vld1q_f32(p0 + 28);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p2), float2int8(_p4, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p3), float2int8(_p5, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += 32;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                float32x4x4_t _p = vld4q_f32(p0);

                float32x4_t _p0 = vmulq_laneq_f32(_p.val[0], _scale, 0);
                float32x4_t _p1 = vmulq_laneq_f32(_p.val[1], _scale, 1);
                float32x4_t _p2 = vmulq_laneq_f32(_p.val[2], _scale, 2);
                float32x4_t _p3 = vmulq_laneq_f32(_p.val[3], _scale, 3);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p2);
                _r01.val[1] = float2int8(_p1, _p3);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += 16;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                float32x4x2_t _p01 = vzipq_f32(_p0, _p1);

                int8x8_t _r01 = float2int8(_p01.val[0], _p01.val[1]);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += 8;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                _p0 = vmulq_f32(_p0, _scale);
                int8x8_t _r0 = float2int8(_p0, _p0);

                pp[0] = vget_lane_s8(_r0, 0);
                pp[1] = vget_lane_s8(_r0, 1);
                pp[2] = vget_lane_s8(_r0, 2);
                pp[3] = vget_lane_s8(_r0, 3);

                pp += 4;
                p0 += 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + A_hstep);
                float32x4_t _p3 = vld1q_f32(p0 + A_hstep + 4);
                float32x4_t _p4 = vld1q_f32(p0 + A_hstep * 2);
                float32x4_t _p5 = vld1q_f32(p0 + A_hstep * 2 + 4);
                float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 3);
                float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 3 + 4);

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale, 0);
                _p2 = vmulq_laneq_f32(_p2, _scale, 1);
                _p3 = vmulq_laneq_f32(_p3, _scale, 1);
                _p4 = vmulq_laneq_f32(_p4, _scale, 2);
                _p5 = vmulq_laneq_f32(_p5, _scale, 2);
                _p6 = vmulq_laneq_f32(_p6, _scale, 3);
                _p7 = vmulq_laneq_f32(_p7, _scale, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 0);
                _p2 = vmulq_lane_f32(_p2, vget_low_f32(_scale), 1);
                _p3 = vmulq_lane_f32(_p3, vget_low_f32(_scale), 1);
                _p4 = vmulq_lane_f32(_p4, vget_high_f32(_scale), 0);
                _p5 = vmulq_lane_f32(_p5, vget_high_f32(_scale), 0);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale), 1);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale), 1);
#endif

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p1, _p3);
                int8x8_t _r3 = float2int8(_p5, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p4, _p6));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p5, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + A_hstep);
                float32x4_t _p2 = vld1q_f32(p0 + A_hstep * 2);
                float32x4_t _p3 = vld1q_f32(p0 + A_hstep * 3);

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
#endif

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                float32x2_t _p0 = vld1_f32(p0);
                float32x2_t _p1 = vld1_f32(p0 + A_hstep);
                float32x2_t _p2 = vld1_f32(p0 + A_hstep * 2);
                float32x2_t _p3 = vld1_f32(p0 + A_hstep * 3);

                float32x4_t _p01 = vcombine_f32(_p0, _p1);
                float32x4_t _p23 = vcombine_f32(_p2, _p3);

                float32x4x2_t _scale01 = vzipq_f32(_scale, _scale);

                _p01 = vmulq_f32(_p01, _scale01.val[0]);
                _p23 = vmulq_f32(_p23, _scale01.val[1]);

                int8x8_t _r0 = float2int8(_p01, _p23);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 2;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = float32x4_t();
                _p0 = vsetq_lane_f32(p0[0], _p0, 0);
                _p0 = vsetq_lane_f32(p0[A_hstep], _p0, 1);
                _p0 = vsetq_lane_f32(p0[A_hstep * 2], _p0, 2);
                _p0 = vsetq_lane_f32(p0[A_hstep * 3], _p0, 3);

                _p0 = vmulq_f32(_p0, _scale);
                int8x8_t _r0 = float2int8(_p0, _p0);

                pp[0] = vget_lane_s8(_r0, 0);
                pp[1] = vget_lane_s8(_r0, 1);
                pp[2] = vget_lane_s8(_r0, 2);
                pp[3] = vget_lane_s8(_r0, 3);

                pp += 4;
                p0++;
            }
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        const float* p0 = (const float*)A + (i + ii) * A_hstep + k;

        const float scale0 = scales[i + ii];
        const float scale1 = scales[i + ii + 1];

        // if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            float32x4_t _scale0 = vdupq_n_f32(scale0);
            float32x4_t _scale1 = vdupq_n_f32(scale1);
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + A_hstep);
                float32x4_t _p3 = vld1q_f32(p0 + A_hstep + 4);

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale0);
                _p2 = vmulq_f32(_p2, _scale1);
                _p3 = vmulq_f32(_p3, _scale1);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p1, _p3);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p2));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p2));
                float32x4_t _t2 = vcombine_f32(vget_low_f32(_p1), vget_low_f32(_p3));
                float32x4_t _t3 = vcombine_f32(vget_high_f32(_p1), vget_high_f32(_p3));
                int8x8_t _r0 = float2int8(_t0, _t1);
                int8x8_t _r1 = float2int8(_t2, _t3);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r0);
                vst1_s8(pp + 8, _r1);

                pp += 16;
                p0 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + A_hstep);

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p1));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p1));
                int8x8_t _r0 = float2int8(_t0, _t1);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = float2int8(p0[0] * scale0);
                pp[1] = float2int8(p0[1] * scale0);
                pp[2] = float2int8(p0[A_hstep] * scale1);
                pp[3] = float2int8(p0[A_hstep + 1] * scale1);
                pp += 4;
                p0 += 2;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(p0[0] * scale0);
                pp[1] = float2int8(p0[A_hstep] * scale1);
                pp += 2;
                p0++;
            }
        }
    }
    for (; ii < max_ii; ii += 1)
    {
        const float* p0 = (const float*)A + (i + ii) * A_hstep + k;

        const float scale = scales[i + ii];

        // if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            float32x4_t _scale = vdupq_n_f32(scale);
            for (; kk + 15 < max_kk; kk += 16)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 8;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(p0[0] * scale);
                pp += 1;
                p0++;
            }
        }
    }
}

// Derive per-index dynamic quantization parameters from a tile of A.
//
// For every ii in [0, max_ii) the routine scans K rows of A (row stride
// A_hstep floats, times 4 when elempack == 4), starting at element offset
// (i + ii) * elempack, and keeps the largest absolute value found. It then
// writes
//     scales[i + ii]       = 127 / absmax
//     out_descales[i + ii] = absmax / (127 * B_scale)
//
// Only elempack == 4 (NEON builds) and elempack == 1 are handled; any other
// value leaves both outputs untouched. The divisions are performed as-is,
// there is no guard for absmax == 0 or B_scale == 0.
static void transpose_compute_A_tile_fp32_int8_scales(const Mat& A, Mat& scales, float B_scale, Mat& out_descales, int i, int max_ii)
{
    const bool has_channels = A.dims == 3;
    const int elempack = A.elempack;
    const size_t A_hstep = has_channels ? A.cstep : (size_t)A.w;
    const int K = has_channels ? A.c : A.h;

    const float v127_B_scale = 127.f * B_scale;

#if __ARM_NEON && __aarch64__
    // broadcast constants used by the vector divisions (aarch64 only)
    const float32x4_t _c127 = vdupq_n_f32(127.f);
    const float32x4_t _c127_B_scale = vdupq_n_f32(v127_B_scale);
#endif

    // results for index (i + ii) are written to scale_out[ii] / descale_out[ii]
    float* const scale_out = (float*)scales + i;
    float* const descale_out = (float*)out_descales + i;

#if __ARM_NEON
    if (elempack == 4)
    {
        // distance in floats between two consecutive packed rows
        const size_t step = A_hstep * 4;

        int ii = 0;
        for (; ii + 3 < max_ii; ii += 4)
        {
            const float* ptr = (const float*)A + (i + ii) * 4;

            // one accumulator per output index, each index owns 4 consecutive floats per row
            float32x4_t _m0 = vdupq_n_f32(0.f);
            float32x4_t _m1 = vdupq_n_f32(0.f);
            float32x4_t _m2 = vdupq_n_f32(0.f);
            float32x4_t _m3 = vdupq_n_f32(0.f);
            for (int left = K; left > 0; left--)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(vld1q_f32(ptr)));
                _m1 = vmaxq_f32(_m1, vabsq_f32(vld1q_f32(ptr + 4)));
                _m2 = vmaxq_f32(_m2, vabsq_f32(vld1q_f32(ptr + 8)));
                _m3 = vmaxq_f32(_m3, vabsq_f32(vld1q_f32(ptr + 12)));
                ptr += step;
            }

            // horizontal max of each accumulator, gathered into one vector
            const float32x2_t _h0 = vmax_f32(vget_low_f32(_m0), vget_high_f32(_m0));
            const float32x2_t _h1 = vmax_f32(vget_low_f32(_m1), vget_high_f32(_m1));
            const float32x2_t _h2 = vmax_f32(vget_low_f32(_m2), vget_high_f32(_m2));
            const float32x2_t _h3 = vmax_f32(vget_low_f32(_m3), vget_high_f32(_m3));
            const float32x4_t _absmax = vcombine_f32(vpmax_f32(_h0, _h1), vpmax_f32(_h2, _h3));

#if __aarch64__
            const float32x4_t _scale = vdivq_f32(_c127, _absmax);
            const float32x4_t _out_descale = vdivq_f32(_absmax, _c127_B_scale);

            vst1q_f32(scale_out + ii, _scale);
            vst1q_f32(descale_out + ii, _out_descale);
#else
            // no vector divide here, spill the lanes and divide one by one
            float lanes[4];
            vst1q_f32(lanes, _absmax);

            for (int j = 0; j < 4; j++)
            {
                scale_out[ii + j] = 127.f / lanes[j];
            }
            for (int j = 0; j < 4; j++)
            {
                descale_out[ii + j] = lanes[j] / v127_B_scale;
            }
#endif
        }
        for (; ii < max_ii; ii++)
        {
            const float* ptr = (const float*)A + (i + ii) * 4;

            // single index: unroll over rows instead, four rows per step
            float32x4_t _m0 = vdupq_n_f32(0.f);
            float32x4_t _m1 = vdupq_n_f32(0.f);
            float32x4_t _m2 = vdupq_n_f32(0.f);
            float32x4_t _m3 = vdupq_n_f32(0.f);
            int left = K;
            for (; left >= 4; left -= 4)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(vld1q_f32(ptr)));
                _m1 = vmaxq_f32(_m1, vabsq_f32(vld1q_f32(ptr + step)));
                _m2 = vmaxq_f32(_m2, vabsq_f32(vld1q_f32(ptr + step * 2)));
                _m3 = vmaxq_f32(_m3, vabsq_f32(vld1q_f32(ptr + step * 3)));
                ptr += step * 4;
            }
            _m0 = vmaxq_f32(_m0, _m2);
            _m1 = vmaxq_f32(_m1, _m3);
            for (; left >= 2; left -= 2)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(vld1q_f32(ptr)));
                _m1 = vmaxq_f32(_m1, vabsq_f32(vld1q_f32(ptr + step)));
                ptr += step * 2;
            }
            _m0 = vmaxq_f32(_m0, _m1);
            for (; left > 0; left--)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(vld1q_f32(ptr)));
                ptr += step;
            }

            // collapse the four lanes into one scalar
            const float32x2_t _h = vmax_f32(vget_low_f32(_m0), vget_high_f32(_m0));
            const float absmax = std::max(vget_lane_f32(_h, 0), vget_lane_f32(_h, 1));

            scale_out[ii] = 127.f / absmax;
            descale_out[ii] = absmax / v127_B_scale;
        }
    }
#endif // __ARM_NEON
    if (elempack == 1)
    {
        int ii = 0;
#if __ARM_NEON
        for (; ii + 3 < max_ii; ii += 4)
        {
            const float* ptr = (const float*)A + (i + ii);

            // each lane tracks one output index; rows are unrolled by four
            float32x4_t _m0 = vdupq_n_f32(0.f);
            float32x4_t _m1 = vdupq_n_f32(0.f);
            float32x4_t _m2 = vdupq_n_f32(0.f);
            float32x4_t _m3 = vdupq_n_f32(0.f);
            int left = K;
            for (; left >= 4; left -= 4)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(vld1q_f32(ptr)));
                _m1 = vmaxq_f32(_m1, vabsq_f32(vld1q_f32(ptr + A_hstep)));
                _m2 = vmaxq_f32(_m2, vabsq_f32(vld1q_f32(ptr + A_hstep * 2)));
                _m3 = vmaxq_f32(_m3, vabsq_f32(vld1q_f32(ptr + A_hstep * 3)));
                ptr += A_hstep * 4;
            }
            _m0 = vmaxq_f32(_m0, _m2);
            _m1 = vmaxq_f32(_m1, _m3);
            for (; left >= 2; left -= 2)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(vld1q_f32(ptr)));
                _m1 = vmaxq_f32(_m1, vabsq_f32(vld1q_f32(ptr + A_hstep)));
                ptr += A_hstep * 2;
            }
            _m0 = vmaxq_f32(_m0, _m1);
            for (; left > 0; left--)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(vld1q_f32(ptr)));
                ptr += A_hstep;
            }

#if __aarch64__
            const float32x4_t _scale = vdivq_f32(_c127, _m0);
            const float32x4_t _out_descale = vdivq_f32(_m0, _c127_B_scale);

            vst1q_f32(scale_out + ii, _scale);
            vst1q_f32(descale_out + ii, _out_descale);
#else
            // no vector divide here, spill the lanes and divide one by one
            float lanes[4];
            vst1q_f32(lanes, _m0);

            for (int j = 0; j < 4; j++)
            {
                scale_out[ii + j] = 127.f / lanes[j];
            }
            for (int j = 0; j < 4; j++)
            {
                descale_out[ii + j] = lanes[j] / v127_B_scale;
            }
#endif
        }
        for (; ii + 1 < max_ii; ii += 2)
        {
            const float* ptr = (const float*)A + (i + ii);

            // same scheme as above with 2-lane vectors for a pair of indices
            float32x2_t _m0 = vdup_n_f32(0.f);
            float32x2_t _m1 = vdup_n_f32(0.f);
            float32x2_t _m2 = vdup_n_f32(0.f);
            float32x2_t _m3 = vdup_n_f32(0.f);
            int left = K;
            for (; left >= 4; left -= 4)
            {
                _m0 = vmax_f32(_m0, vabs_f32(vld1_f32(ptr)));
                _m1 = vmax_f32(_m1, vabs_f32(vld1_f32(ptr + A_hstep)));
                _m2 = vmax_f32(_m2, vabs_f32(vld1_f32(ptr + A_hstep * 2)));
                _m3 = vmax_f32(_m3, vabs_f32(vld1_f32(ptr + A_hstep * 3)));
                ptr += A_hstep * 4;
            }
            _m0 = vmax_f32(_m0, _m2);
            _m1 = vmax_f32(_m1, _m3);
            for (; left >= 2; left -= 2)
            {
                _m0 = vmax_f32(_m0, vabs_f32(vld1_f32(ptr)));
                _m1 = vmax_f32(_m1, vabs_f32(vld1_f32(ptr + A_hstep)));
                ptr += A_hstep * 2;
            }
            _m0 = vmax_f32(_m0, _m1);
            for (; left > 0; left--)
            {
                _m0 = vmax_f32(_m0, vabs_f32(vld1_f32(ptr)));
                ptr += A_hstep;
            }

#if __aarch64__
            const float32x2_t _scale = vdiv_f32(vget_low_f32(_c127), _m0);
            const float32x2_t _out_descale = vdiv_f32(_m0, vget_low_f32(_c127_B_scale));

            vst1_f32(scale_out + ii, _scale);
            vst1_f32(descale_out + ii, _out_descale);
#else
            // no vector divide here, spill the lanes and divide one by one
            float lanes[2];
            vst1_f32(lanes, _m0);

            for (int j = 0; j < 2; j++)
            {
                scale_out[ii + j] = 127.f / lanes[j];
            }
            for (int j = 0; j < 2; j++)
            {
                descale_out[ii + j] = lanes[j] / v127_B_scale;
            }
#endif
        }
#endif // __ARM_NEON
        for (; ii < max_ii; ii++)
        {
            const float* ptr = (const float*)A + (i + ii);

            // plain scalar scan, one element per row
            float absmax = 0.f;
            for (int left = K; left > 0; left--)
            {
                const float magnitude = (float)fabsf(ptr[0]);
                absmax = std::max(absmax, magnitude);
                ptr += A_hstep;
            }

            scale_out[ii] = 127.f / absmax;
            descale_out[ii] = absmax / v127_B_scale;
        }
    }
}

static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        transpose_pack_A_tile_fp32_to_int8_i8mm(A, AT, i, max_ii, k, max_kk, scales);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        transpose_pack_A_tile_fp32_to_int8_asimddp(A, AT, i, max_ii, k, max_kk, scales);
        return;
    }
#endif

    const int elempack = A.elempack;
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;

    // NCNN_LOGE("transpose_pack_A_tile_fp32_to_int8 %d %d", max_ii, elempack);

    signed char* pp = AT;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack;

        float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii);
        float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4);

        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);
                float32x4_t _p4 = vld1q_f32(p0 + 16);
                float32x4_t _p5 = vld1q_f32(p0 + 20);
                float32x4_t _p6 = vld1q_f32(p0 + 24);
                float32x4_t _p7 = vld1q_f32(p0 + 28);
                float32x4_t _p8 = vld1q_f32(p0 + A_hstep * 4);
                float32x4_t _p9 = vld1q_f32(p0 + A_hstep * 4 + 4);
                float32x4_t _pa = vld1q_f32(p0 + A_hstep * 4 + 8);
                float32x4_t _pb = vld1q_f32(p0 + A_hstep * 4 + 12);
                float32x4_t _pc = vld1q_f32(p0 + A_hstep * 4 + 16);
                float32x4_t _pd = vld1q_f32(p0 + A_hstep * 4 + 20);
                float32x4_t _pe = vld1q_f32(p0 + A_hstep * 4 + 24);
                float32x4_t _pf = vld1q_f32(p0 + A_hstep * 4 + 28);

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale0, 3);
                _p4 = vmulq_laneq_f32(_p4, _scale1, 0);
                _p5 = vmulq_laneq_f32(_p5, _scale1, 1);
                _p6 = vmulq_laneq_f32(_p6, _scale1, 2);
                _p7 = vmulq_laneq_f32(_p7, _scale1, 3);
                _p8 = vmulq_laneq_f32(_p8, _scale0, 0);
                _p9 = vmulq_laneq_f32(_p9, _scale0, 1);
                _pa = vmulq_laneq_f32(_pa, _scale0, 2);
                _pb = vmulq_laneq_f32(_pb, _scale0, 3);
                _pc = vmulq_laneq_f32(_pc, _scale1, 0);
                _pd = vmulq_laneq_f32(_pd, _scale1, 1);
                _pe = vmulq_laneq_f32(_pe, _scale1, 2);
                _pf = vmulq_laneq_f32(_pf, _scale1, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
                _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
                _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
                _p8 = vmulq_lane_f32(_p8, vget_low_f32(_scale0), 0);
                _p9 = vmulq_lane_f32(_p9, vget_low_f32(_scale0), 1);
                _pa = vmulq_lane_f32(_pa, vget_high_f32(_scale0), 0);
                _pb = vmulq_lane_f32(_pb, vget_high_f32(_scale0), 1);
                _pc = vmulq_lane_f32(_pc, vget_low_f32(_scale1), 0);
                _pd = vmulq_lane_f32(_pd, vget_low_f32(_scale1), 1);
                _pe = vmulq_lane_f32(_pe, vget_high_f32(_scale1), 0);
                _pf = vmulq_lane_f32(_pf, vget_high_f32(_scale1), 1);
#endif

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p8);
                int8x8_t _r1 = float2int8(_p1, _p9);
                int8x8_t _r2 = float2int8(_p2, _pa);
                int8x8_t _r3 = float2int8(_p3, _pb);
                int8x8_t _r4 = float2int8(_p4, _pc);
                int8x8_t _r5 = float2int8(_p5, _pd);
                int8x8_t _r6 = float2int8(_p6, _pe);
                int8x8_t _r7 = float2int8(_p7, _pf);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

                int16x8_t _r01 = vreinterpretq_s16_s8(vcombine_s8(_r0, _r1));
                int16x8_t _r23 = vreinterpretq_s16_s8(vcombine_s8(_r2, _r3));
                int16x8_t _r45 = vreinterpretq_s16_s8(vcombine_s8(_r4, _r5));
                int16x8_t _r67 = vreinterpretq_s16_s8(vcombine_s8(_r6, _r7));
                int16x8x2_t _rr0 = vuzpq_s16(_r01, _r23);
                int16x8x2_t _rr1 = vuzpq_s16(_r45, _r67);

                vst1q_s8(pp, vreinterpretq_s8_s16(_rr0.val[0]));
                vst1q_s8(pp + 16, vreinterpretq_s8_s16(_rr0.val[1]));
                vst1q_s8(pp + 32, vreinterpretq_s8_s16(_rr1.val[0]));
                vst1q_s8(pp + 48, vreinterpretq_s8_s16(_rr1.val[1]));
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);
                float32x4_t _p4 = vld1q_f32(p0 + 16);
                float32x4_t _p5 = vld1q_f32(p0 + 20);
                float32x4_t _p6 = vld1q_f32(p0 + 24);
                float32x4_t _p7 = vld1q_f32(p0 + 28);

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale0, 3);
                _p4 = vmulq_laneq_f32(_p4, _scale1, 0);
                _p5 = vmulq_laneq_f32(_p5, _scale1, 1);
                _p6 = vmulq_laneq_f32(_p6, _scale1, 2);
                _p7 = vmulq_laneq_f32(_p7, _scale1, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
                _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
                _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
#endif

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);

#if __ARM_FEATURE_DOTPROD
                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                int16x8_t _r01 = vreinterpretq_s16_s8(vcombine_s8(_r0, _r1));
                int16x8_t _r23 = vreinterpretq_s16_s8(vcombine_s8(_r2, _r3));
                int16x8x2_t _rr = vuzpq_s16(_r01, _r23);

                vst1q_s8(pp, vreinterpretq_s8_s16(_rr.val[0]));
                vst1q_s8(pp + 16, vreinterpretq_s8_s16(_rr.val[1]));
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += A_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + A_hstep);
                float32x4_t _p3 = vld1q_f32(p0 + A_hstep + 4);
                float32x4_t _p4 = vld1q_f32(p0 + A_hstep * 2);
                float32x4_t _p5 = vld1q_f32(p0 + A_hstep * 2 + 4);
                float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 3);
                float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 3 + 4);
                float32x4_t _p8 = vld1q_f32(p0 + A_hstep * 4);
                float32x4_t _p9 = vld1q_f32(p0 + A_hstep * 4 + 4);
                float32x4_t _pa = vld1q_f32(p0 + A_hstep * 5);
                float32x4_t _pb = vld1q_f32(p0 + A_hstep * 5 + 4);
                float32x4_t _pc = vld1q_f32(p0 + A_hstep * 6);
                float32x4_t _pd = vld1q_f32(p0 + A_hstep * 6 + 4);
                float32x4_t _pe = vld1q_f32(p0 + A_hstep * 7);
                float32x4_t _pf = vld1q_f32(p0 + A_hstep * 7 + 4);

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale1);
                _p4 = vmulq_f32(_p4, _scale0);
                _p5 = vmulq_f32(_p5, _scale1);
                _p6 = vmulq_f32(_p6, _scale0);
                _p7 = vmulq_f32(_p7, _scale1);
                _p8 = vmulq_f32(_p8, _scale0);
                _p9 = vmulq_f32(_p9, _scale1);
                _pa = vmulq_f32(_pa, _scale0);
                _pb = vmulq_f32(_pb, _scale1);
                _pc = vmulq_f32(_pc, _scale0);
                _pd = vmulq_f32(_pd, _scale1);
                _pe = vmulq_f32(_pe, _scale0);
                _pf = vmulq_f32(_pf, _scale1);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _r04 = vzip_s8(_r0, _r4);
                int8x8x2_t _r15 = vzip_s8(_r1, _r5);
                int8x8x2_t _r26 = vzip_s8(_r2, _r6);
                int8x8x2_t _r37 = vzip_s8(_r3, _r7);
                int8x16x4_t _r0123;
                _r0123.val[0] = vcombine_s8(_r04.val[0], _r04.val[1]);
                _r0123.val[1] = vcombine_s8(_r15.val[0], _r15.val[1]);
                _r0123.val[2] = vcombine_s8(_r26.val[0], _r26.val[1]);
                _r0123.val[3] = vcombine_s8(_r37.val[0], _r37.val[1]);

                vst4q_s8(pp, _r0123);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8x4_t _r0123;
                _r0123.val[0] = _r0;
                _r0123.val[1] = _r1;
                _r0123.val[2] = _r2;
                _r0123.val[3] = _r3;
                int8x8x4_t _r4567;
                _r4567.val[0] = _r4;
                _r4567.val[1] = _r5;
                _r4567.val[2] = _r6;
                _r4567.val[3] = _r7;

                vst4_s8(pp, _r0123);
                vst4_s8(pp + 32, _r4567);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(_r0, _r2);
                _r01.val[1] = vcombine_s8(_r1, _r3);
                int8x16x2_t _r23;
                _r23.val[0] = vcombine_s8(_r4, _r6);
                _r23.val[1] = vcombine_s8(_r5, _r7);

                vst2q_s8(pp, _r01);
                vst2q_s8(pp + 32, _r23);
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + A_hstep);
                float32x4_t _p3 = vld1q_f32(p0 + A_hstep + 4);
                float32x4_t _p4 = vld1q_f32(p0 + A_hstep * 2);
                float32x4_t _p5 = vld1q_f32(p0 + A_hstep * 2 + 4);
                float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 3);
                float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 3 + 4);

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale1);
                _p4 = vmulq_f32(_p4, _scale0);
                _p5 = vmulq_f32(_p5, _scale1);
                _p6 = vmulq_f32(_p6, _scale0);
                _p7 = vmulq_f32(_p7, _scale1);

#if __ARM_FEATURE_DOTPROD
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p0, _p1);
                _r0123.val[1] = float2int8(_p2, _p3);
                _r0123.val[2] = float2int8(_p4, _p5);
                _r0123.val[3] = float2int8(_p6, _p7);

                vst4_s8(pp, _r0123);
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p1), float2int8(_p4, _p5));
                _r01.val[1] = vcombine_s8(float2int8(_p2, _p3), float2int8(_p6, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += A_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + A_hstep);
                float32x4_t _p3 = vld1q_f32(p0 + A_hstep + 4);

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale1);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p1);
                _r01.val[1] = float2int8(_p2, _p3);

                vst2_s8(pp, _r01);

                pp += 16;
                p0 += A_hstep * 2;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep;
            }
        }
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack;

        float32x4_t _scale = vld1q_f32((const float*)scales + i + ii);

        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);
                float32x4_t _p4 = vld1q_f32(p0 + A_hstep * 4);
                float32x4_t _p5 = vld1q_f32(p0 + A_hstep * 4 + 4);
                float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 4 + 8);
                float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 4 + 12);

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale, 3);
                _p4 = vmulq_laneq_f32(_p4, _scale, 0);
                _p5 = vmulq_laneq_f32(_p5, _scale, 1);
                _p6 = vmulq_laneq_f32(_p6, _scale, 2);
                _p7 = vmulq_laneq_f32(_p7, _scale, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
                _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale), 0);
                _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale), 1);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale), 0);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale), 1);
#endif

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p4, _p5));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p6, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
#endif

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += A_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + A_hstep);
                float32x4_t _p2 = vld1q_f32(p0 + A_hstep * 2);
                float32x4_t _p3 = vld1q_f32(p0 + A_hstep * 3);
                float32x4_t _p4 = vld1q_f32(p0 + A_hstep * 4);
                float32x4_t _p5 = vld1q_f32(p0 + A_hstep * 5);
                float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 6);
                float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 7);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                float32x4x2_t _p04 = vzipq_f32(_p0, _p4);
                float32x4x2_t _p15 = vzipq_f32(_p1, _p5);
                float32x4x2_t _p26 = vzipq_f32(_p2, _p6);
                float32x4x2_t _p37 = vzipq_f32(_p3, _p7);
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p04.val[0], _p04.val[1]);
                _r0123.val[1] = float2int8(_p15.val[0], _p15.val[1]);
                _r0123.val[2] = float2int8(_p26.val[0], _p26.val[1]);
                _r0123.val[3] = float2int8(_p37.val[0], _p37.val[1]);

                vst4_s8(pp, _r0123);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p0, _p4);
                _r0123.val[1] = float2int8(_p1, _p5);
                _r0123.val[2] = float2int8(_p2, _p6);
                _r0123.val[3] = float2int8(_p3, _p7);

                vst4_s8(pp, _r0123);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p2), float2int8(_p4, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p3), float2int8(_p5, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + A_hstep);
                float32x4_t _p2 = vld1q_f32(p0 + A_hstep * 2);
                float32x4_t _p3 = vld1q_f32(p0 + A_hstep * 3);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
                transpose4x4_ps(_p0, _p1, _p2, _p3);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));
#else  // __ARM_FEATURE_DOTPROD
                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p2);
                _r01.val[1] = float2int8(_p1, _p3);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += A_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + A_hstep);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                float32x4x2_t _p01 = vzipq_f32(_p0, _p1);

                int8x8_t _r01 = float2int8(_p01.val[0], _p01.val[1]);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep * 2;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                _p0 = vmulq_f32(_p0, _scale);
                int8x8_t _r0 = float2int8(_p0, _p0);

                pp[0] = vget_lane_s8(_r0, 0);
                pp[1] = vget_lane_s8(_r0, 1);
                pp[2] = vget_lane_s8(_r0, 2);
                pp[3] = vget_lane_s8(_r0, 3);
                pp += 4;
                p0 += A_hstep;
            }
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack;

        const float scale0 = scales[i + ii];
        const float scale1 = scales[i + ii + 1];

#if __ARM_NEON
        float32x4_t _scale0 = vdupq_n_f32(scale0);
        float32x4_t _scale1 = vdupq_n_f32(scale1);
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + A_hstep * 4);
                float32x4_t _p3 = vld1q_f32(p0 + A_hstep * 4 + 4);

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale1);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p1, _p3);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4x2_t _t01 = vzip_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r01 = float2int8(_p0, _p1);
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p1));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p1));
                int8x8_t _r01 = float2int8(_t0, _t1);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            const float* p0 = (const float*)A + k * A_hstep + (i + ii);

            int kk = 0;
#if __ARM_NEON
            float32x4_t _scale = vzipq_f32(_scale0, _scale1).val[0];
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x2_t _p0 = vld1_f32(p0);
                float32x2_t _p1 = vld1_f32(p0 + A_hstep);
                float32x2_t _p2 = vld1_f32(p0 + A_hstep * 2);
                float32x2_t _p3 = vld1_f32(p0 + A_hstep * 3);
                float32x2_t _p4 = vld1_f32(p0 + A_hstep * 4);
                float32x2_t _p5 = vld1_f32(p0 + A_hstep * 5);
                float32x2_t _p6 = vld1_f32(p0 + A_hstep * 6);
                float32x2_t _p7 = vld1_f32(p0 + A_hstep * 7);

#if __ARM_FEATURE_DOTPROD
                float32x4_t _p01 = vcombine_f32(_p0, _p1);
                float32x4_t _p23 = vcombine_f32(_p2, _p3);
                float32x4_t _p45 = vcombine_f32(_p4, _p5);
                float32x4_t _p67 = vcombine_f32(_p6, _p7);

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);
                _p45 = vmulq_f32(_p45, _scale);
                _p67 = vmulq_f32(_p67, _scale);

                int8x8_t _r0 = float2int8(_p01, _p23);
                int8x8_t _r1 = float2int8(_p45, _p67);

#if __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _r01 = vuzp_s8(_r0, _r1);

                vst1q_s8(pp, vcombine_s8(_r01.val[0], _r01.val[1]));
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _r01 = vtrn_s8(_r0, _r1);
                int8x8x2_t _rr01 = vuzp_s8(_r01.val[0], _r01.val[1]);

                vst1q_s8(pp, vcombine_s8(_rr01.val[0], _rr01.val[1]));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _p02 = vcombine_f32(_p0, _p2);
                float32x4_t _p46 = vcombine_f32(_p4, _p6);
                float32x4_t _p13 = vcombine_f32(_p1, _p3);
                float32x4_t _p57 = vcombine_f32(_p5, _p7);

                _p02 = vmulq_f32(_p02, _scale);
                _p46 = vmulq_f32(_p46, _scale);
                _p13 = vmulq_f32(_p13, _scale);
                _p57 = vmulq_f32(_p57, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p02, _p46);
                _r01.val[1] = float2int8(_p13, _p57);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x2_t _p0 = vld1_f32(p0);
                float32x2_t _p1 = vld1_f32(p0 + A_hstep);
                float32x2_t _p2 = vld1_f32(p0 + A_hstep * 2);
                float32x2_t _p3 = vld1_f32(p0 + A_hstep * 3);

#if __ARM_FEATURE_DOTPROD
                float32x4_t _p01 = vcombine_f32(_p0, _p1);
                float32x4_t _p23 = vcombine_f32(_p2, _p3);

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);

                float32x4x2_t _pp = vuzpq_f32(_p01, _p23);
                int8x8_t _r01 = float2int8(_pp.val[0], _pp.val[1]);
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _p02 = vcombine_f32(_p0, _p2);
                float32x4_t _p13 = vcombine_f32(_p1, _p3);

                _p02 = vmulq_f32(_p02, _scale);
                _p13 = vmulq_f32(_p13, _scale);

                float32x4x2_t _pp = vzipq_f32(_p02, _p13);
                int8x8_t _r01 = float2int8(_pp.val[0], _pp.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = float2int8(p0[0] * scale0);
                pp[1] = float2int8(p0[A_hstep + 0] * scale0);
                pp[2] = float2int8(p0[1] * scale1);
                pp[3] = float2int8(p0[A_hstep + 1] * scale1);
                pp += 4;
                p0 += A_hstep * 2;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(p0[0] * scale0);
                pp[1] = float2int8(p0[1] * scale1);
                pp += 2;
                p0 += A_hstep;
            }
        }
    }
    for (; ii < max_ii; ii += 1)
    {
        const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack;

        const float scale = scales[i + ii];

#if __ARM_NEON
        float32x4_t _scale = vdupq_n_f32(scale);
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 15 < max_kk; kk += 16)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + A_hstep * 4);
                float32x4_t _p2 = vld1q_f32(p0 + A_hstep * 8);
                float32x4_t _p3 = vld1q_f32(p0 + A_hstep * 12);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));

                pp += 16;
                p0 += A_hstep * 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + A_hstep * 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                pp[0] = float2int8(p0[0] * scale);
                pp[1] = float2int8(p0[1] * scale);
                pp[2] = float2int8(p0[2] * scale);
                pp[3] = float2int8(p0[3] * scale);
                pp += 4;
                p0 += A_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            for (; kk + 15 < max_kk; kk += 16)
            {
                float32x4_t _p0 = float32x4_t();
                float32x4_t _p1 = float32x4_t();
                float32x4_t _p2 = float32x4_t();
                float32x4_t _p3 = float32x4_t();
                _p0 = vsetq_lane_f32(p0[0], _p0, 0);
                _p0 = vsetq_lane_f32(p0[A_hstep], _p0, 1);
                _p0 = vsetq_lane_f32(p0[A_hstep * 2], _p0, 2);
                _p0 = vsetq_lane_f32(p0[A_hstep * 3], _p0, 3);
                _p1 = vsetq_lane_f32(p0[A_hstep * 4], _p1, 0);
                _p1 = vsetq_lane_f32(p0[A_hstep * 5], _p1, 1);
                _p1 = vsetq_lane_f32(p0[A_hstep * 6], _p1, 2);
                _p1 = vsetq_lane_f32(p0[A_hstep * 7], _p1, 3);
                _p2 = vsetq_lane_f32(p0[A_hstep * 8], _p2, 0);
                _p2 = vsetq_lane_f32(p0[A_hstep * 9], _p2, 1);
                _p2 = vsetq_lane_f32(p0[A_hstep * 10], _p2, 2);
                _p2 = vsetq_lane_f32(p0[A_hstep * 11], _p2, 3);
                _p3 = vsetq_lane_f32(p0[A_hstep * 12], _p3, 0);
                _p3 = vsetq_lane_f32(p0[A_hstep * 13], _p3, 1);
                _p3 = vsetq_lane_f32(p0[A_hstep * 14], _p3, 2);
                _p3 = vsetq_lane_f32(p0[A_hstep * 15], _p3, 3);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));

                pp += 16;
                p0 += A_hstep * 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = float32x4_t();
                float32x4_t _p1 = float32x4_t();
                _p0 = vsetq_lane_f32(p0[0], _p0, 0);
                _p0 = vsetq_lane_f32(p0[A_hstep], _p0, 1);
                _p0 = vsetq_lane_f32(p0[A_hstep * 2], _p0, 2);
                _p0 = vsetq_lane_f32(p0[A_hstep * 3], _p0, 3);
                _p1 = vsetq_lane_f32(p0[A_hstep * 4], _p1, 0);
                _p1 = vsetq_lane_f32(p0[A_hstep * 5], _p1, 1);
                _p1 = vsetq_lane_f32(p0[A_hstep * 6], _p1, 2);
                _p1 = vsetq_lane_f32(p0[A_hstep * 7], _p1, 3);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep * 8;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(p0[0] * scale);
                pp += 1;
                p0 += A_hstep;
            }
        }
    }
}

// Derive a single fp32 -> int8 quantization scale for the whole of B.
//
// Walks every row of B (B.c rows strided by B.cstep when B.dims == 3,
// otherwise B.h rows strided by B.w), tracks the largest absolute value
// seen across B.w * B.elempack floats per row, and writes
//     scale = 127 / absmax      (or 1 when absmax is exactly zero)
// into the output parameter.
static void compute_B_fp32_int8_scale(const Mat& B, float& scale)
{
    // Row geometry does not change while scanning, so resolve it once.
    const bool is_3d = B.dims == 3;
    const int row_count = is_3d ? B.c : B.h;
    const size_t row_stride = is_3d ? B.cstep : (size_t)B.w;
    const int row_len = B.w * B.elempack;

    float max_abs = 0.f;
#if __ARM_NEON
    // Four running maxima, folded into max_abs after the scan.
    float32x4_t _max_abs = vdupq_n_f32(0.f);
#endif

    for (int row = 0; row < row_count; row++)
    {
        const float* src = (const float*)B + row * row_stride * B.elempack;

        int remain = row_len;
#if __ARM_NEON
        // Vector body: four floats per step.
        while (remain >= 4)
        {
            float32x4_t _v = vld1q_f32(src);
            _max_abs = vmaxq_f32(_max_abs, vabsq_f32(_v));
            src += 4;
            remain -= 4;
        }
#endif
        // Scalar tail (or the whole row when NEON is unavailable).
        while (remain > 0)
        {
            const float v = (float)fabsf(src[0]);
            if (max_abs < v)
                max_abs = v;
            src++;
            remain--;
        }
    }

#if __ARM_NEON
    // Horizontal reduction of the vector accumulator: 4 lanes -> 2 -> 1.
    float32x2_t _half = vmax_f32(vget_low_f32(_max_abs), vget_high_f32(_max_abs));
    const float lane_max = std::max(vget_lane_f32(_half, 0), vget_lane_f32(_half, 1));
    max_abs = std::max(max_abs, lane_max);
#endif

    // An all-zero input would divide by zero; fall back to a unit scale.
    if (max_abs == 0.f)
    {
        scale = 1.f;
    }
    else
    {
        scale = 127.f / max_abs;
    }
}

static void pack_B_tile_fp32_to_int8(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        pack_B_tile_fp32_to_int8_i8mm(B, BT, j, max_jj, k, max_kk, scale);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        pack_B_tile_fp32_to_int8_asimddp(B, BT, j, max_jj, k, max_kk, scale);
        return;
    }
#endif

    const int elempack = B.elempack;
    const size_t B_hstep = B.dims == 3 ? B.cstep : (size_t)B.w;

    // NCNN_LOGE("pack_B_tile_fp32_to_int8 %d %d %d", max_jj, max_kk, elempack);

    signed char* pp = BT;

#if __ARM_NEON
    float32x4_t _scale = vdupq_n_f32(scale);
#endif

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    for (; jj + 7 < max_jj; jj += 8)
    {
        const float* p0 = (const float*)B + (j + jj) * B_hstep + k * elempack;

        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
#if __ARM_FEATURE_DOTPROD
                float32x4x4_t _p = vld4q_f32(p0);
                float32x4x4_t _q = vld4q_f32(p0 + 16);
                float32x4x4_t _r = vld4q_f32(p0 + B_hstep * 4);
                float32x4x4_t _s = vld4q_f32(p0 + B_hstep * 4 + 16);

                float32x4_t _p0 = vmulq_f32(_p.val[0], _scale);
                float32x4_t _p1 = vmulq_f32(_p.val[1], _scale);
                float32x4_t _p2 = vmulq_f32(_p.val[2], _scale);
                float32x4_t _p3 = vmulq_f32(_p.val[3], _scale);
                float32x4_t _p4 = vmulq_f32(_q.val[0], _scale);
                float32x4_t _p5 = vmulq_f32(_q.val[1], _scale);
                float32x4_t _p6 = vmulq_f32(_q.val[2], _scale);
                float32x4_t _p7 = vmulq_f32(_q.val[3], _scale);
                float32x4_t _p8 = vmulq_f32(_r.val[0], _scale);
                float32x4_t _p9 = vmulq_f32(_r.val[1], _scale);
                float32x4_t _pa = vmulq_f32(_r.val[2], _scale);
                float32x4_t _pb = vmulq_f32(_r.val[3], _scale);
                float32x4_t _pc = vmulq_f32(_s.val[0], _scale);
                float32x4_t _pd = vmulq_f32(_s.val[1], _scale);
                float32x4_t _pe = vmulq_f32(_s.val[2], _scale);
                float32x4_t _pf = vmulq_f32(_s.val[3], _scale);

#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
                int8x8_t _r4 = float2int8(_p8, _pc);
                int8x8_t _r5 = float2int8(_p9, _pd);
                int8x8_t _r6 = float2int8(_pa, _pe);
                int8x8_t _r7 = float2int8(_pb, _pf);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p8, _p9);
                int8x8_t _r3 = float2int8(_pa, _pb);
                int8x8_t _r4 = float2int8(_p4, _p5);
                int8x8_t _r5 = float2int8(_p6, _p7);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);
#endif // __ARM_FEATURE_MATMUL_INT8

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);
                float32x4_t _p4 = vld1q_f32(p0 + 16);
                float32x4_t _p5 = vld1q_f32(p0 + 20);
                float32x4_t _p6 = vld1q_f32(p0 + 24);
                float32x4_t _p7 = vld1q_f32(p0 + 28);
                float32x4_t _p8 = vld1q_f32(p0 + B_hstep * 4);
                float32x4_t _p9 = vld1q_f32(p0 + B_hstep * 4 + 4);
                float32x4_t _pa = vld1q_f32(p0 + B_hstep * 4 + 8);
                float32x4_t _pb = vld1q_f32(p0 + B_hstep * 4 + 12);
                float32x4_t _pc = vld1q_f32(p0 + B_hstep * 4 + 16);
                float32x4_t _pd = vld1q_f32(p0 + B_hstep * 4 + 20);
                float32x4_t _pe = vld1q_f32(p0 + B_hstep * 4 + 24);
                float32x4_t _pf = vld1q_f32(p0 + B_hstep * 4 + 28);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);
                _p8 = vmulq_f32(_p8, _scale);
                _p9 = vmulq_f32(_p9, _scale);
                _pa = vmulq_f32(_pa, _scale);
                _pb = vmulq_f32(_pb, _scale);
                _pc = vmulq_f32(_pc, _scale);
                _pd = vmulq_f32(_pd, _scale);
                _pe = vmulq_f32(_pe, _scale);
                _pf = vmulq_f32(_pf, _scale);

                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p8), float2int8(_p2, _pa));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p9), float2int8(_p3, _pb));
                int8x16x2_t _r23;
                _r23.val[0] = vcombine_s8(float2int8(_p4, _pc), float2int8(_p6, _pe));
                _r23.val[1] = vcombine_s8(float2int8(_p5, _pd), float2int8(_p7, _pf));

                vst2q_s8(pp, _r01);
                vst2q_s8(pp + 32, _r23);
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += 32;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                float32x4x4_t _p = vld4q_f32(p0);
                float32x4x4_t _q = vld4q_f32(p0 + B_hstep * 4);

                float32x4_t _p0 = vmulq_f32(_p.val[0], _scale);
                float32x4_t _p1 = vmulq_f32(_p.val[1], _scale);
                float32x4_t _p2 = vmulq_f32(_p.val[2], _scale);
                float32x4_t _p3 = vmulq_f32(_p.val[3], _scale);
                float32x4_t _p4 = vmulq_f32(_q.val[0], _scale);
                float32x4_t _p5 = vmulq_f32(_q.val[1], _scale);
                float32x4_t _p6 = vmulq_f32(_q.val[2], _scale);
                float32x4_t _p7 = vmulq_f32(_q.val[3], _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);
                float32x4_t _p4 = vld1q_f32(p0 + B_hstep * 4);
                float32x4_t _p5 = vld1q_f32(p0 + B_hstep * 4 + 4);
                float32x4_t _p6 = vld1q_f32(p0 + B_hstep * 4 + 8);
                float32x4_t _p7 = vld1q_f32(p0 + B_hstep * 4 + 12);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p4), float2int8(_p2, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p5), float2int8(_p3, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += 16;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + B_hstep * 4);
                float32x4_t _p3 = vld1q_f32(p0 + B_hstep * 4 + 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p2);
                _r01.val[1] = float2int8(_p1, _p3);

                vst2_s8(pp, _r01);

                pp += 16;
                p0 += 8;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + B_hstep * 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + B_hstep);
                float32x4_t _p3 = vld1q_f32(p0 + B_hstep + 4);
                float32x4_t _p4 = vld1q_f32(p0 + B_hstep * 2);
                float32x4_t _p5 = vld1q_f32(p0 + B_hstep * 2 + 4);
                float32x4_t _p6 = vld1q_f32(p0 + B_hstep * 3);
                float32x4_t _p7 = vld1q_f32(p0 + B_hstep * 3 + 4);
                float32x4_t _p8 = vld1q_f32(p0 + B_hstep * 4);
                float32x4_t _p9 = vld1q_f32(p0 + B_hstep * 4 + 4);
                float32x4_t _pa = vld1q_f32(p0 + B_hstep * 5);
                float32x4_t _pb = vld1q_f32(p0 + B_hstep * 5 + 4);
                float32x4_t _pc = vld1q_f32(p0 + B_hstep * 6);
                float32x4_t _pd = vld1q_f32(p0 + B_hstep * 6 + 4);
                float32x4_t _pe = vld1q_f32(p0 + B_hstep * 7);
                float32x4_t _pf = vld1q_f32(p0 + B_hstep * 7 + 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);
                _p8 = vmulq_f32(_p8, _scale);
                _p9 = vmulq_f32(_p9, _scale);
                _pa = vmulq_f32(_pa, _scale);
                _pb = vmulq_f32(_pb, _scale);
                _pc = vmulq_f32(_pc, _scale);
                _pd = vmulq_f32(_pd, _scale);
                _pe = vmulq_f32(_pe, _scale);
                _pf = vmulq_f32(_pf, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p8, _pa);
                int8x8_t _r3 = float2int8(_pc, _pe);
                int8x8_t _r4 = float2int8(_p1, _p3);
                int8x8_t _r5 = float2int8(_p5, _p7);
                int8x8_t _r6 = float2int8(_p9, _pb);
                int8x8_t _r7 = float2int8(_pd, _pf);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p4, _p6));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p8, _pa));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_pc, _pe));
                int16x4_t _t4 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4_t _t5 = vreinterpret_s16_s8(float2int8(_p5, _p7));
                int16x4_t _t6 = vreinterpret_s16_s8(float2int8(_p9, _pb));
                int16x4_t _t7 = vreinterpret_s16_s8(float2int8(_pd, _pf));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int16x4x2_t _t45 = vuzp_s16(_t4, _t5);
                int16x4x2_t _t67 = vuzp_s16(_t6, _t7);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
                int8x8_t _r4 = vreinterpret_s8_s16(_t45.val[0]);
                int8x8_t _r5 = vreinterpret_s8_s16(_t67.val[0]);
                int8x8_t _r6 = vreinterpret_s8_s16(_t45.val[1]);
                int8x8_t _r7 = vreinterpret_s8_s16(_t67.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));

                pp += 64;
                p0 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + B_hstep);
                float32x4_t _p2 = vld1q_f32(p0 + B_hstep * 2);
                float32x4_t _p3 = vld1q_f32(p0 + B_hstep * 3);
                float32x4_t _p4 = vld1q_f32(p0 + B_hstep * 4);
                float32x4_t _p5 = vld1q_f32(p0 + B_hstep * 5);
                float32x4_t _p6 = vld1q_f32(p0 + B_hstep * 6);
                float32x4_t _p7 = vld1q_f32(p0 + B_hstep * 7);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p4, _p5));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p6, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                float32x2_t _p0 = vld1_f32(p0);
                float32x2_t _p1 = vld1_f32(p0 + B_hstep);
                float32x2_t _p2 = vld1_f32(p0 + B_hstep * 2);
                float32x2_t _p3 = vld1_f32(p0 + B_hstep * 3);
                float32x2_t _p4 = vld1_f32(p0 + B_hstep * 4);
                float32x2_t _p5 = vld1_f32(p0 + B_hstep * 5);
                float32x2_t _p6 = vld1_f32(p0 + B_hstep * 6);
                float32x2_t _p7 = vld1_f32(p0 + B_hstep * 7);

                float32x4_t _p01 = vcombine_f32(_p0, _p1);
                float32x4_t _p23 = vcombine_f32(_p2, _p3);
                float32x4_t _p45 = vcombine_f32(_p4, _p5);
                float32x4_t _p67 = vcombine_f32(_p6, _p7);

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);
                _p45 = vmulq_f32(_p45, _scale);
                _p67 = vmulq_f32(_p67, _scale);

                int8x8_t _r0 = float2int8(_p01, _p23);
                int8x8_t _r1 = float2int8(_p45, _p67);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 2;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = float32x4_t();
                float32x4_t _p1 = float32x4_t();
                _p0 = vsetq_lane_f32(p0[0], _p0, 0);
                _p0 = vsetq_lane_f32(p0[B_hstep], _p0, 1);
                _p0 = vsetq_lane_f32(p0[B_hstep * 2], _p0, 2);
                _p0 = vsetq_lane_f32(p0[B_hstep * 3], _p0, 3);
                _p1 = vsetq_lane_f32(p0[B_hstep * 4], _p1, 0);
                _p1 = vsetq_lane_f32(p0[B_hstep * 5], _p1, 1);
                _p1 = vsetq_lane_f32(p0[B_hstep * 6], _p1, 2);
                _p1 = vsetq_lane_f32(p0[B_hstep * 7], _p1, 3);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);

                vst1_s8(pp, _r0);

                pp += 8;
                p0++;
            }
        }
    }
#endif // __aarch64__
    for (; jj + 3 < max_jj; jj += 4)
    {
        const float* p0 = (const float*)B + (j + jj) * B_hstep + k * elempack;

        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
#if __ARM_FEATURE_DOTPROD
                float32x4x4_t _p = vld4q_f32(p0);
                float32x4x4_t _q = vld4q_f32(p0 + 16);

                float32x4_t _p0 = vmulq_f32(_p.val[0], _scale);
                float32x4_t _p1 = vmulq_f32(_p.val[1], _scale);
                float32x4_t _p2 = vmulq_f32(_p.val[2], _scale);
                float32x4_t _p3 = vmulq_f32(_p.val[3], _scale);
                float32x4_t _p4 = vmulq_f32(_q.val[0], _scale);
                float32x4_t _p5 = vmulq_f32(_q.val[1], _scale);
                float32x4_t _p6 = vmulq_f32(_q.val[2], _scale);
                float32x4_t _p7 = vmulq_f32(_q.val[3], _scale);

#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);
                float32x4_t _p4 = vld1q_f32(p0 + 16);
                float32x4_t _p5 = vld1q_f32(p0 + 20);
                float32x4_t _p6 = vld1q_f32(p0 + 24);
                float32x4_t _p7 = vld1q_f32(p0 + 28);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p2), float2int8(_p4, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p3), float2int8(_p5, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += 32;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                float32x4x4_t _p = vld4q_f32(p0);

                float32x4_t _p0 = vmulq_f32(_p.val[0], _scale);
                float32x4_t _p1 = vmulq_f32(_p.val[1], _scale);
                float32x4_t _p2 = vmulq_f32(_p.val[2], _scale);
                float32x4_t _p3 = vmulq_f32(_p.val[3], _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p2);
                _r01.val[1] = float2int8(_p1, _p3);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += 16;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                float32x4x2_t _p01 = vzipq_f32(_p0, _p1);

                int8x8_t _r01 = float2int8(_p01.val[0], _p01.val[1]);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += 8;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                _p0 = vmulq_f32(_p0, _scale);
                int8x8_t _r0 = float2int8(_p0, _p0);

                pp[0] = vget_lane_s8(_r0, 0);
                pp[1] = vget_lane_s8(_r0, 1);
                pp[2] = vget_lane_s8(_r0, 2);
                pp[3] = vget_lane_s8(_r0, 3);

                pp += 4;
                p0 += 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + B_hstep);
                float32x4_t _p3 = vld1q_f32(p0 + B_hstep + 4);
                float32x4_t _p4 = vld1q_f32(p0 + B_hstep * 2);
                float32x4_t _p5 = vld1q_f32(p0 + B_hstep * 2 + 4);
                float32x4_t _p6 = vld1q_f32(p0 + B_hstep * 3);
                float32x4_t _p7 = vld1q_f32(p0 + B_hstep * 3 + 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p1, _p3);
                int8x8_t _r3 = float2int8(_p5, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p4, _p6));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p5, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + B_hstep);
                float32x4_t _p2 = vld1q_f32(p0 + B_hstep * 2);
                float32x4_t _p3 = vld1q_f32(p0 + B_hstep * 3);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                float32x2_t _p0 = vld1_f32(p0);
                float32x2_t _p1 = vld1_f32(p0 + B_hstep);
                float32x2_t _p2 = vld1_f32(p0 + B_hstep * 2);
                float32x2_t _p3 = vld1_f32(p0 + B_hstep * 3);

                float32x4_t _p01 = vcombine_f32(_p0, _p1);
                float32x4_t _p23 = vcombine_f32(_p2, _p3);

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);

                int8x8_t _r0 = float2int8(_p01, _p23);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 2;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = float32x4_t();
                _p0 = vsetq_lane_f32(p0[0], _p0, 0);
                _p0 = vsetq_lane_f32(p0[B_hstep], _p0, 1);
                _p0 = vsetq_lane_f32(p0[B_hstep * 2], _p0, 2);
                _p0 = vsetq_lane_f32(p0[B_hstep * 3], _p0, 3);

                _p0 = vmulq_f32(_p0, _scale);
                int8x8_t _r0 = float2int8(_p0, _p0);

                pp[0] = vget_lane_s8(_r0, 0);
                pp[1] = vget_lane_s8(_r0, 1);
                pp[2] = vget_lane_s8(_r0, 2);
                pp[3] = vget_lane_s8(_r0, 3);

                pp += 4;
                p0++;
            }
        }
    }
#endif // __ARM_NEON
    for (; jj + 1 < max_jj; jj += 2)
    {
        const float* p0 = (const float*)B + (j + jj) * B_hstep + k;

        // if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + B_hstep);
                float32x4_t _p3 = vld1q_f32(p0 + B_hstep + 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p1, _p3);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p2));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p2));
                float32x4_t _t2 = vcombine_f32(vget_low_f32(_p1), vget_low_f32(_p3));
                float32x4_t _t3 = vcombine_f32(vget_high_f32(_p1), vget_high_f32(_p3));
                int8x8_t _r0 = float2int8(_t0, _t1);
                int8x8_t _r1 = float2int8(_t2, _t3);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r0);
                vst1_s8(pp + 8, _r1);

                pp += 16;
                p0 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + B_hstep);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p1));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p1));
                int8x8_t _r0 = float2int8(_t0, _t1);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = float2int8(p0[0] * scale);
                pp[1] = float2int8(p0[1] * scale);
                pp[2] = float2int8(p0[B_hstep] * scale);
                pp[3] = float2int8(p0[B_hstep + 1] * scale);
                pp += 4;
                p0 += 2;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(p0[0] * scale);
                pp[1] = float2int8(p0[B_hstep] * scale);
                pp += 2;
                p0++;
            }
        }
    }
    for (; jj < max_jj; jj += 1)
    {
        const float* p0 = (const float*)B + (j + jj) * B_hstep + k;

        // if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            for (; kk + 15 < max_kk; kk += 16)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 8;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(p0[0] * scale);
                pp += 1;
                p0++;
            }
        }
    }
}

static void transpose_pack_B_tile_fp32_to_int8(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        transpose_pack_B_tile_fp32_to_int8_i8mm(B, BT, j, max_jj, k, max_kk, scale);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        transpose_pack_B_tile_fp32_to_int8_asimddp(B, BT, j, max_jj, k, max_kk, scale);
        return;
    }
#endif

    const int elempack = B.elempack;
    const size_t B_hstep = B.dims == 3 ? B.cstep : (size_t)B.w;

    // NCNN_LOGE("transpose_pack_B_tile_fp32_to_int8 %d %d", max_jj, elempack);

    signed char* pp = BT;

#if __ARM_NEON
    float32x4_t _scale = vdupq_n_f32(scale);
#endif

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    for (; jj + 7 < max_jj; jj += 8)
    {
        const float* p0 = (const float*)B + k * B_hstep + (j + jj) * elempack;

        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);
                float32x4_t _p4 = vld1q_f32(p0 + 16);
                float32x4_t _p5 = vld1q_f32(p0 + 20);
                float32x4_t _p6 = vld1q_f32(p0 + 24);
                float32x4_t _p7 = vld1q_f32(p0 + 28);
                float32x4_t _p8 = vld1q_f32(p0 + B_hstep * 4);
                float32x4_t _p9 = vld1q_f32(p0 + B_hstep * 4 + 4);
                float32x4_t _pa = vld1q_f32(p0 + B_hstep * 4 + 8);
                float32x4_t _pb = vld1q_f32(p0 + B_hstep * 4 + 12);
                float32x4_t _pc = vld1q_f32(p0 + B_hstep * 4 + 16);
                float32x4_t _pd = vld1q_f32(p0 + B_hstep * 4 + 20);
                float32x4_t _pe = vld1q_f32(p0 + B_hstep * 4 + 24);
                float32x4_t _pf = vld1q_f32(p0 + B_hstep * 4 + 28);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);
                _p8 = vmulq_f32(_p8, _scale);
                _p9 = vmulq_f32(_p9, _scale);
                _pa = vmulq_f32(_pa, _scale);
                _pb = vmulq_f32(_pb, _scale);
                _pc = vmulq_f32(_pc, _scale);
                _pd = vmulq_f32(_pd, _scale);
                _pe = vmulq_f32(_pe, _scale);
                _pf = vmulq_f32(_pf, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p8);
                int8x8_t _r1 = float2int8(_p1, _p9);
                int8x8_t _r2 = float2int8(_p2, _pa);
                int8x8_t _r3 = float2int8(_p3, _pb);
                int8x8_t _r4 = float2int8(_p4, _pc);
                int8x8_t _r5 = float2int8(_p5, _pd);
                int8x8_t _r6 = float2int8(_p6, _pe);
                int8x8_t _r7 = float2int8(_p7, _pf);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

                int16x8_t _r01 = vreinterpretq_s16_s8(vcombine_s8(_r0, _r1));
                int16x8_t _r23 = vreinterpretq_s16_s8(vcombine_s8(_r2, _r3));
                int16x8_t _r45 = vreinterpretq_s16_s8(vcombine_s8(_r4, _r5));
                int16x8_t _r67 = vreinterpretq_s16_s8(vcombine_s8(_r6, _r7));
                int16x8x2_t _rr0 = vuzpq_s16(_r01, _r23);
                int16x8x2_t _rr1 = vuzpq_s16(_r45, _r67);

                vst1q_s8(pp, vreinterpretq_s8_s16(_rr0.val[0]));
                vst1q_s8(pp + 16, vreinterpretq_s8_s16(_rr0.val[1]));
                vst1q_s8(pp + 32, vreinterpretq_s8_s16(_rr1.val[0]));
                vst1q_s8(pp + 48, vreinterpretq_s8_s16(_rr1.val[1]));
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);
                float32x4_t _p4 = vld1q_f32(p0 + 16);
                float32x4_t _p5 = vld1q_f32(p0 + 20);
                float32x4_t _p6 = vld1q_f32(p0 + 24);
                float32x4_t _p7 = vld1q_f32(p0 + 28);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);

#if __ARM_FEATURE_DOTPROD
                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                int16x8_t _r01 = vreinterpretq_s16_s8(vcombine_s8(_r0, _r1));
                int16x8_t _r23 = vreinterpretq_s16_s8(vcombine_s8(_r2, _r3));
                int16x8x2_t _rr = vuzpq_s16(_r01, _r23);

                vst1q_s8(pp, vreinterpretq_s8_s16(_rr.val[0]));
                vst1q_s8(pp + 16, vreinterpretq_s8_s16(_rr.val[1]));
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += B_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + B_hstep);
                float32x4_t _p3 = vld1q_f32(p0 + B_hstep + 4);
                float32x4_t _p4 = vld1q_f32(p0 + B_hstep * 2);
                float32x4_t _p5 = vld1q_f32(p0 + B_hstep * 2 + 4);
                float32x4_t _p6 = vld1q_f32(p0 + B_hstep * 3);
                float32x4_t _p7 = vld1q_f32(p0 + B_hstep * 3 + 4);
                float32x4_t _p8 = vld1q_f32(p0 + B_hstep * 4);
                float32x4_t _p9 = vld1q_f32(p0 + B_hstep * 4 + 4);
                float32x4_t _pa = vld1q_f32(p0 + B_hstep * 5);
                float32x4_t _pb = vld1q_f32(p0 + B_hstep * 5 + 4);
                float32x4_t _pc = vld1q_f32(p0 + B_hstep * 6);
                float32x4_t _pd = vld1q_f32(p0 + B_hstep * 6 + 4);
                float32x4_t _pe = vld1q_f32(p0 + B_hstep * 7);
                float32x4_t _pf = vld1q_f32(p0 + B_hstep * 7 + 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);
                _p8 = vmulq_f32(_p8, _scale);
                _p9 = vmulq_f32(_p9, _scale);
                _pa = vmulq_f32(_pa, _scale);
                _pb = vmulq_f32(_pb, _scale);
                _pc = vmulq_f32(_pc, _scale);
                _pd = vmulq_f32(_pd, _scale);
                _pe = vmulq_f32(_pe, _scale);
                _pf = vmulq_f32(_pf, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _r04 = vzip_s8(_r0, _r4);
                int8x8x2_t _r15 = vzip_s8(_r1, _r5);
                int8x8x2_t _r26 = vzip_s8(_r2, _r6);
                int8x8x2_t _r37 = vzip_s8(_r3, _r7);
                int8x16x4_t _r0123;
                _r0123.val[0] = vcombine_s8(_r04.val[0], _r04.val[1]);
                _r0123.val[1] = vcombine_s8(_r15.val[0], _r15.val[1]);
                _r0123.val[2] = vcombine_s8(_r26.val[0], _r26.val[1]);
                _r0123.val[3] = vcombine_s8(_r37.val[0], _r37.val[1]);

                vst4q_s8(pp, _r0123);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8x4_t _r0123;
                _r0123.val[0] = _r0;
                _r0123.val[1] = _r1;
                _r0123.val[2] = _r2;
                _r0123.val[3] = _r3;
                int8x8x4_t _r4567;
                _r4567.val[0] = _r4;
                _r4567.val[1] = _r5;
                _r4567.val[2] = _r6;
                _r4567.val[3] = _r7;

                vst4_s8(pp, _r0123);
                vst4_s8(pp + 32, _r4567);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(_r0, _r2);
                _r01.val[1] = vcombine_s8(_r1, _r3);
                int8x16x2_t _r23;
                _r23.val[0] = vcombine_s8(_r4, _r6);
                _r23.val[1] = vcombine_s8(_r5, _r7);

                vst2q_s8(pp, _r01);
                vst2q_s8(pp + 32, _r23);
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + B_hstep);
                float32x4_t _p3 = vld1q_f32(p0 + B_hstep + 4);
                float32x4_t _p4 = vld1q_f32(p0 + B_hstep * 2);
                float32x4_t _p5 = vld1q_f32(p0 + B_hstep * 2 + 4);
                float32x4_t _p6 = vld1q_f32(p0 + B_hstep * 3);
                float32x4_t _p7 = vld1q_f32(p0 + B_hstep * 3 + 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p0, _p1);
                _r0123.val[1] = float2int8(_p2, _p3);
                _r0123.val[2] = float2int8(_p4, _p5);
                _r0123.val[3] = float2int8(_p6, _p7);

                vst4_s8(pp, _r0123);
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p1), float2int8(_p4, _p5));
                _r01.val[1] = vcombine_s8(float2int8(_p2, _p3), float2int8(_p6, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += B_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + B_hstep);
                float32x4_t _p3 = vld1q_f32(p0 + B_hstep + 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p1);
                _r01.val[1] = float2int8(_p2, _p3);

                vst2_s8(pp, _r01);

                pp += 16;
                p0 += B_hstep * 2;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += B_hstep;
            }
        }
    }
#endif // __aarch64__
    for (; jj + 3 < max_jj; jj += 4)
    {
        const float* p0 = (const float*)B + k * B_hstep + (j + jj) * elempack;

        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);
                float32x4_t _p4 = vld1q_f32(p0 + B_hstep * 4);
                float32x4_t _p5 = vld1q_f32(p0 + B_hstep * 4 + 4);
                float32x4_t _p6 = vld1q_f32(p0 + B_hstep * 4 + 8);
                float32x4_t _p7 = vld1q_f32(p0 + B_hstep * 4 + 12);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p4, _p5));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p6, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + 8);
                float32x4_t _p3 = vld1q_f32(p0 + 12);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += B_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + B_hstep);
                float32x4_t _p2 = vld1q_f32(p0 + B_hstep * 2);
                float32x4_t _p3 = vld1q_f32(p0 + B_hstep * 3);
                float32x4_t _p4 = vld1q_f32(p0 + B_hstep * 4);
                float32x4_t _p5 = vld1q_f32(p0 + B_hstep * 5);
                float32x4_t _p6 = vld1q_f32(p0 + B_hstep * 6);
                float32x4_t _p7 = vld1q_f32(p0 + B_hstep * 7);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                float32x4x2_t _p04 = vzipq_f32(_p0, _p4);
                float32x4x2_t _p15 = vzipq_f32(_p1, _p5);
                float32x4x2_t _p26 = vzipq_f32(_p2, _p6);
                float32x4x2_t _p37 = vzipq_f32(_p3, _p7);
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p04.val[0], _p04.val[1]);
                _r0123.val[1] = float2int8(_p15.val[0], _p15.val[1]);
                _r0123.val[2] = float2int8(_p26.val[0], _p26.val[1]);
                _r0123.val[3] = float2int8(_p37.val[0], _p37.val[1]);

                vst4_s8(pp, _r0123);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p0, _p4);
                _r0123.val[1] = float2int8(_p1, _p5);
                _r0123.val[2] = float2int8(_p2, _p6);
                _r0123.val[3] = float2int8(_p3, _p7);

                vst4_s8(pp, _r0123);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p2), float2int8(_p4, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p3), float2int8(_p5, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + B_hstep);
                float32x4_t _p2 = vld1q_f32(p0 + B_hstep * 2);
                float32x4_t _p3 = vld1q_f32(p0 + B_hstep * 3);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
                transpose4x4_ps(_p0, _p1, _p2, _p3);
                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));
#else  // __ARM_FEATURE_DOTPROD
                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p2);
                _r01.val[1] = float2int8(_p1, _p3);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += B_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + B_hstep);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                float32x4x2_t _p01 = vzipq_f32(_p0, _p1);
                int8x8_t _r01 = float2int8(_p01.val[0], _p01.val[1]);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += B_hstep * 2;
            }
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(p0[0] * scale);
                pp[1] = float2int8(p0[1] * scale);
                pp[2] = float2int8(p0[2] * scale);
                pp[3] = float2int8(p0[3] * scale);
                pp += 4;
                p0 += B_hstep;
            }
        }
    }
#endif // __ARM_NEON
    for (; jj + 1 < max_jj; jj += 2)
    {
        const float* p0 = (const float*)B + k * B_hstep + (j + jj) * elempack;

#if __ARM_NEON
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);
                float32x4_t _p2 = vld1q_f32(p0 + B_hstep * 4);
                float32x4_t _p3 = vld1q_f32(p0 + B_hstep * 4 + 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p1, _p3);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4x2_t _t01 = vzip_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r01 = float2int8(_p0, _p1);
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p1));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p1));
                int8x8_t _r01 = float2int8(_t0, _t1);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += B_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x2_t _p0 = vld1_f32(p0);
                float32x2_t _p1 = vld1_f32(p0 + B_hstep);
                float32x2_t _p2 = vld1_f32(p0 + B_hstep * 2);
                float32x2_t _p3 = vld1_f32(p0 + B_hstep * 3);
                float32x2_t _p4 = vld1_f32(p0 + B_hstep * 4);
                float32x2_t _p5 = vld1_f32(p0 + B_hstep * 5);
                float32x2_t _p6 = vld1_f32(p0 + B_hstep * 6);
                float32x2_t _p7 = vld1_f32(p0 + B_hstep * 7);

#if __ARM_FEATURE_DOTPROD
                float32x4_t _p01 = vcombine_f32(_p0, _p1);
                float32x4_t _p23 = vcombine_f32(_p2, _p3);
                float32x4_t _p45 = vcombine_f32(_p4, _p5);
                float32x4_t _p67 = vcombine_f32(_p6, _p7);

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);
                _p45 = vmulq_f32(_p45, _scale);
                _p67 = vmulq_f32(_p67, _scale);

                int8x8_t _r0 = float2int8(_p01, _p23);
                int8x8_t _r1 = float2int8(_p45, _p67);

#if __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _r01 = vuzp_s8(_r0, _r1);

                vst1q_s8(pp, vcombine_s8(_r01.val[0], _r01.val[1]));
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _r01 = vtrn_s8(_r0, _r1);
                int8x8x2_t _rr01 = vuzp_s8(_r01.val[0], _r01.val[1]);

                vst1q_s8(pp, vcombine_s8(_rr01.val[0], _rr01.val[1]));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _p02 = vcombine_f32(_p0, _p2);
                float32x4_t _p46 = vcombine_f32(_p4, _p6);
                float32x4_t _p13 = vcombine_f32(_p1, _p3);
                float32x4_t _p57 = vcombine_f32(_p5, _p7);

                _p02 = vmulq_f32(_p02, _scale);
                _p46 = vmulq_f32(_p46, _scale);
                _p13 = vmulq_f32(_p13, _scale);
                _p57 = vmulq_f32(_p57, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p02, _p46);
                _r01.val[1] = float2int8(_p13, _p57);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x2_t _p0 = vld1_f32(p0);
                float32x2_t _p1 = vld1_f32(p0 + B_hstep);
                float32x2_t _p2 = vld1_f32(p0 + B_hstep * 2);
                float32x2_t _p3 = vld1_f32(p0 + B_hstep * 3);

#if __ARM_FEATURE_DOTPROD
                float32x4_t _p01 = vcombine_f32(_p0, _p1);
                float32x4_t _p23 = vcombine_f32(_p2, _p3);

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);

                float32x4x2_t _pp = vuzpq_f32(_p01, _p23);
                int8x8_t _r01 = float2int8(_pp.val[0], _pp.val[1]);
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _p02 = vcombine_f32(_p0, _p2);
                float32x4_t _p13 = vcombine_f32(_p1, _p3);

                _p02 = vmulq_f32(_p02, _scale);
                _p13 = vmulq_f32(_p13, _scale);

                float32x4x2_t _pp = vzipq_f32(_p02, _p13);
                int8x8_t _r01 = float2int8(_pp.val[0], _pp.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += B_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = float2int8(p0[0] * scale);
                pp[1] = float2int8(p0[B_hstep + 0] * scale);
                pp[2] = float2int8(p0[1] * scale);
                pp[3] = float2int8(p0[B_hstep + 1] * scale);
                pp += 4;
                p0 += B_hstep * 2;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(p0[0] * scale);
                pp[1] = float2int8(p0[1] * scale);
                pp += 2;
                p0 += B_hstep;
            }
        }
    }
    for (; jj < max_jj; jj += 1)
    {
        const float* p0 = (const float*)B + k * B_hstep + (j + jj) * elempack;

#if __ARM_NEON
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 15 < max_kk; kk += 16)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + B_hstep * 4);
                float32x4_t _p2 = vld1q_f32(p0 + B_hstep * 8);
                float32x4_t _p3 = vld1q_f32(p0 + B_hstep * 12);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));

                pp += 16;
                p0 += B_hstep * 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vld1q_f32(p0);
                float32x4_t _p1 = vld1q_f32(p0 + B_hstep * 4);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                pp[0] = float2int8(p0[0] * scale);
                pp[1] = float2int8(p0[1] * scale);
                pp[2] = float2int8(p0[2] * scale);
                pp[3] = float2int8(p0[3] * scale);
                pp += 4;
                p0 += B_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            for (; kk + 15 < max_kk; kk += 16)
            {
                float32x4_t _p0 = float32x4_t();
                float32x4_t _p1 = float32x4_t();
                float32x4_t _p2 = float32x4_t();
                float32x4_t _p3 = float32x4_t();
                _p0 = vsetq_lane_f32(p0[0], _p0, 0);
                _p0 = vsetq_lane_f32(p0[B_hstep], _p0, 1);
                _p0 = vsetq_lane_f32(p0[B_hstep * 2], _p0, 2);
                _p0 = vsetq_lane_f32(p0[B_hstep * 3], _p0, 3);
                _p1 = vsetq_lane_f32(p0[B_hstep * 4], _p1, 0);
                _p1 = vsetq_lane_f32(p0[B_hstep * 5], _p1, 1);
                _p1 = vsetq_lane_f32(p0[B_hstep * 6], _p1, 2);
                _p1 = vsetq_lane_f32(p0[B_hstep * 7], _p1, 3);
                _p2 = vsetq_lane_f32(p0[B_hstep * 8], _p2, 0);
                _p2 = vsetq_lane_f32(p0[B_hstep * 9], _p2, 1);
                _p2 = vsetq_lane_f32(p0[B_hstep * 10], _p2, 2);
                _p2 = vsetq_lane_f32(p0[B_hstep * 11], _p2, 3);
                _p3 = vsetq_lane_f32(p0[B_hstep * 12], _p3, 0);
                _p3 = vsetq_lane_f32(p0[B_hstep * 13], _p3, 1);
                _p3 = vsetq_lane_f32(p0[B_hstep * 14], _p3, 2);
                _p3 = vsetq_lane_f32(p0[B_hstep * 15], _p3, 3);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));

                pp += 16;
                p0 += B_hstep * 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = float32x4_t();
                float32x4_t _p1 = float32x4_t();
                _p0 = vsetq_lane_f32(p0[0], _p0, 0);
                _p0 = vsetq_lane_f32(p0[B_hstep], _p0, 1);
                _p0 = vsetq_lane_f32(p0[B_hstep * 2], _p0, 2);
                _p0 = vsetq_lane_f32(p0[B_hstep * 3], _p0, 3);
                _p1 = vsetq_lane_f32(p0[B_hstep * 4], _p1, 0);
                _p1 = vsetq_lane_f32(p0[B_hstep * 5], _p1, 1);
                _p1 = vsetq_lane_f32(p0[B_hstep * 6], _p1, 2);
                _p1 = vsetq_lane_f32(p0[B_hstep * 7], _p1, 3);

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += B_hstep * 8;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(p0[0] * scale);
                pp += 1;
                p0 += B_hstep;
            }
        }
    }
}

static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        unpack_output_tile_int32_to_fp32_asimddp(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
        return;
    }
#endif

    const int out_elempack = top_blob.elempack;
    const size_t out_hstep = top_blob.dims == 3 ? top_blob.cstep : (size_t)top_blob.w;

    const size_t c_hstep = C.dims == 3 ? C.cstep : (size_t)C.w;
    const int c_elempack = C.elempack;
    const float* pC = C;

    // NCNN_LOGE("unpack_output_tile_int32_to_fp32  %d %d %d %d  %d  %d  %d", i, max_ii, j, max_jj, out_elempack, broadcast_type_C, c_elempack);

    const int* pp = topT;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        float* p0 = (float*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii);
        float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4);

        float32x4_t _c0;
        float32x4_t _c1;
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                _c0 = vdupq_n_f32(pC[0] * beta);
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)C + i + ii;
                _c0 = vld1q_f32(pC);
                _c1 = vld1q_f32(pC + 4);
                _c0 = vmulq_n_f32(_c0, beta);
                _c1 = vmulq_n_f32(_c1, beta);
            }
            if (broadcast_type_C == 3)
            {
                pC = (const float*)C + (i + ii) * c_hstep + j * c_elempack;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)C + j;
            }
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);
            int32x4_t _sum8 = vld1q_s32(pp + 32);
            int32x4_t _sum9 = vld1q_s32(pp + 36);
            int32x4_t _suma = vld1q_s32(pp + 40);
            int32x4_t _sumb = vld1q_s32(pp + 44);
            int32x4_t _sumc = vld1q_s32(pp + 48);
            int32x4_t _sumd = vld1q_s32(pp + 52);
            int32x4_t _sume = vld1q_s32(pp + 56);
            int32x4_t _sumf = vld1q_s32(pp + 60);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            //      e4 f4 g4 h4
            //      e5 f5 g5 h5
            //      e6 f6 g6 h6
            //      e7 f7 g7 h7
#else
            // from
            //      a0 b1 c2 d3
            //      e4 f5 g6 h7
            //      e0 f1 g2 h3
            //      a4 b5 c6 d7
            //      c0 d1 a2 b3
            //      g4 h5 e6 f7
            //      g0 h1 e2 f3
            //      c4 d5 a6 b7
            //      a3 b2 c1 d0
            //      e7 f6 g5 h4
            //      e3 f2 g1 h0
            //      a7 b6 c5 d4
            //      c3 d2 a1 b0
            //      g7 h6 e5 f4
            //      g3 h2 e1 f0
            //      c7 d6 a5 b4

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            //      e4 f4 g4 h4
            //      e5 f5 g5 h5
            //      e6 f6 g6 h6
            //      e7 f7 g7 h7
            {
                _sum8 = vrev64q_s32(_sum8);
                _sum9 = vrev64q_s32(_sum9);
                _suma = vrev64q_s32(_suma);
                _sumb = vrev64q_s32(_sumb);
                _sumc = vrev64q_s32(_sumc);
                _sumd = vrev64q_s32(_sumd);
                _sume = vrev64q_s32(_sume);
                _sumf = vrev64q_s32(_sumf);
                _sum8 = vextq_s32(_sum8, _sum8, 2);
                _sum9 = vextq_s32(_sum9, _sum9, 2);
                _suma = vextq_s32(_suma, _suma, 2);
                _sumb = vextq_s32(_sumb, _sumb, 2);
                _sumc = vextq_s32(_sumc, _sumc, 2);
                _sumd = vextq_s32(_sumd, _sumd, 2);
                _sume = vextq_s32(_sume, _sume, 2);
                _sumf = vextq_s32(_sumf, _sumf, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sumc);
                int32x4x2_t _t1 = vzipq_s32(_sum4, _sum8);
                int32x4x2_t _t2 = vzipq_s32(_sum2, _sume);
                int32x4x2_t _t3 = vzipq_s32(_sum6, _suma);
                int32x4x2_t _t4 = vzipq_s32(_sum3, _sumf);
                int32x4x2_t _t5 = vzipq_s32(_sum7, _sumb);
                int32x4x2_t _t6 = vzipq_s32(_sum1, _sumd);
                int32x4x2_t _t7 = vzipq_s32(_sum5, _sum9);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum8 = vcombine_s32(vget_low_s32(_t4.val[0]), vget_low_s32(_t5.val[0]));
                _sum9 = vcombine_s32(vget_high_s32(_t4.val[0]), vget_high_s32(_t5.val[0]));
                _suma = vcombine_s32(vget_low_s32(_t5.val[1]), vget_low_s32(_t4.val[1]));
                _sumb = vcombine_s32(vget_high_s32(_t5.val[1]), vget_high_s32(_t4.val[1]));
                _sumc = vcombine_s32(vget_low_s32(_t6.val[0]), vget_low_s32(_t7.val[0]));
                _sumd = vcombine_s32(vget_high_s32(_t6.val[0]), vget_high_s32(_t7.val[0]));
                _sume = vcombine_s32(vget_low_s32(_t7.val[1]), vget_low_s32(_t6.val[1]));
                _sumf = vcombine_s32(vget_high_s32(_t7.val[1]), vget_high_s32(_t6.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
                _sum9 = vrev64q_s32(_sum9);
                _sumb = vrev64q_s32(_sumb);
                _sumd = vrev64q_s32(_sumd);
                _sumf = vrev64q_s32(_sumf);
            }
#endif

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale0);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale0);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum8), _descale0);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum9), _descale0);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_suma), _descale0);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sumb), _descale0);
            float32x4_t _f8 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale1);
            float32x4_t _f9 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale1);
            float32x4_t _fa = vmulq_f32(vcvtq_f32_s32(_sum6), _descale1);
            float32x4_t _fb = vmulq_f32(vcvtq_f32_s32(_sum7), _descale1);
            float32x4_t _fc = vmulq_f32(vcvtq_f32_s32(_sumc), _descale1);
            float32x4_t _fd = vmulq_f32(vcvtq_f32_s32(_sumd), _descale1);
            float32x4_t _fe = vmulq_f32(vcvtq_f32_s32(_sume), _descale1);
            float32x4_t _ff = vmulq_f32(vcvtq_f32_s32(_sumf), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                    _f8 = vaddq_f32(_f8, _c0);
                    _f9 = vaddq_f32(_f9, _c0);
                    _fa = vaddq_f32(_fa, _c0);
                    _fb = vaddq_f32(_fb, _c0);
                    _fc = vaddq_f32(_fc, _c0);
                    _fd = vaddq_f32(_fd, _c0);
                    _fe = vaddq_f32(_fe, _c0);
                    _ff = vaddq_f32(_ff, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                    _f8 = vaddq_f32(_f8, _c1);
                    _f9 = vaddq_f32(_f9, _c1);
                    _fa = vaddq_f32(_fa, _c1);
                    _fb = vaddq_f32(_fb, _c1);
                    _fc = vaddq_f32(_fc, _c1);
                    _fd = vaddq_f32(_fd, _c1);
                    _fe = vaddq_f32(_fe, _c1);
                    _ff = vaddq_f32(_ff, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    if (c_elempack == 4)
                    {
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + 4);
                        float32x4_t _c2 = vld1q_f32(pC + 4 * 2);
                        float32x4_t _c3 = vld1q_f32(pC + 4 * 3);
                        float32x4_t _c4 = vld1q_f32(pC + 4 * 4);
                        float32x4_t _c5 = vld1q_f32(pC + 4 * 5);
                        float32x4_t _c6 = vld1q_f32(pC + 4 * 6);
                        float32x4_t _c7 = vld1q_f32(pC + 4 * 7);
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                            _f4 = vaddq_f32(_f4, _c4);
                            _f5 = vaddq_f32(_f5, _c5);
                            _f6 = vaddq_f32(_f6, _c6);
                            _f7 = vaddq_f32(_f7, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                            _f4 = vmlaq_f32(_f4, _c4, _beta);
                            _f5 = vmlaq_f32(_f5, _c5, _beta);
                            _f6 = vmlaq_f32(_f6, _c6, _beta);
                            _f7 = vmlaq_f32(_f7, _c7, _beta);
                        }
                        _c0 = vld1q_f32(pC + c_hstep * 4);
                        _c1 = vld1q_f32(pC + c_hstep * 4 + 4);
                        _c2 = vld1q_f32(pC + c_hstep * 4 + 4 * 2);
                        _c3 = vld1q_f32(pC + c_hstep * 4 + 4 * 3);
                        _c4 = vld1q_f32(pC + c_hstep * 4 + 4 * 4);
                        _c5 = vld1q_f32(pC + c_hstep * 4 + 4 * 5);
                        _c6 = vld1q_f32(pC + c_hstep * 4 + 4 * 6);
                        _c7 = vld1q_f32(pC + c_hstep * 4 + 4 * 7);
                        if (beta == 1.f)
                        {
                            _f8 = vaddq_f32(_f8, _c0);
                            _f9 = vaddq_f32(_f9, _c1);
                            _fa = vaddq_f32(_fa, _c2);
                            _fb = vaddq_f32(_fb, _c3);
                            _fc = vaddq_f32(_fc, _c4);
                            _fd = vaddq_f32(_fd, _c5);
                            _fe = vaddq_f32(_fe, _c6);
                            _ff = vaddq_f32(_ff, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f8 = vmlaq_f32(_f8, _c0, _beta);
                            _f9 = vmlaq_f32(_f9, _c1, _beta);
                            _fa = vmlaq_f32(_fa, _c2, _beta);
                            _fb = vmlaq_f32(_fb, _c3, _beta);
                            _fc = vmlaq_f32(_fc, _c4, _beta);
                            _fd = vmlaq_f32(_fd, _c5, _beta);
                            _fe = vmlaq_f32(_fe, _c6, _beta);
                            _ff = vmlaq_f32(_ff, _c7, _beta);
                        }
                        pC += 32;
                    }
                    if (c_elempack == 1)
                    {
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + 4);
                        float32x4_t _c2 = vld1q_f32(pC + c_hstep);
                        float32x4_t _c3 = vld1q_f32(pC + c_hstep + 4);
                        float32x4_t _c4 = vld1q_f32(pC + c_hstep * 2);
                        float32x4_t _c5 = vld1q_f32(pC + c_hstep * 2 + 4);
                        float32x4_t _c6 = vld1q_f32(pC + c_hstep * 3);
                        float32x4_t _c7 = vld1q_f32(pC + c_hstep * 3 + 4);
                        transpose8x4_ps(_c0, _c1, _c2, _c3, _c4, _c5, _c6, _c7);
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                            _f4 = vaddq_f32(_f4, _c4);
                            _f5 = vaddq_f32(_f5, _c5);
                            _f6 = vaddq_f32(_f6, _c6);
                            _f7 = vaddq_f32(_f7, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                            _f4 = vmlaq_f32(_f4, _c4, _beta);
                            _f5 = vmlaq_f32(_f5, _c5, _beta);
                            _f6 = vmlaq_f32(_f6, _c6, _beta);
                            _f7 = vmlaq_f32(_f7, _c7, _beta);
                        }
                        _c0 = vld1q_f32(pC + c_hstep * 4);
                        _c1 = vld1q_f32(pC + c_hstep * 4 + 4);
                        _c2 = vld1q_f32(pC + c_hstep * 5);
                        _c3 = vld1q_f32(pC + c_hstep * 5 + 4);
                        _c4 = vld1q_f32(pC + c_hstep * 6);
                        _c5 = vld1q_f32(pC + c_hstep * 6 + 4);
                        _c6 = vld1q_f32(pC + c_hstep * 7);
                        _c7 = vld1q_f32(pC + c_hstep * 7 + 4);
                        transpose8x4_ps(_c0, _c1, _c2, _c3, _c4, _c5, _c6, _c7);
                        if (beta == 1.f)
                        {
                            _f8 = vaddq_f32(_f8, _c0);
                            _f9 = vaddq_f32(_f9, _c1);
                            _fa = vaddq_f32(_fa, _c2);
                            _fb = vaddq_f32(_fb, _c3);
                            _fc = vaddq_f32(_fc, _c4);
                            _fd = vaddq_f32(_fd, _c5);
                            _fe = vaddq_f32(_fe, _c6);
                            _ff = vaddq_f32(_ff, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f8 = vmlaq_f32(_f8, _c0, _beta);
                            _f9 = vmlaq_f32(_f9, _c1, _beta);
                            _fa = vmlaq_f32(_fa, _c2, _beta);
                            _fb = vmlaq_f32(_fb, _c3, _beta);
                            _fc = vmlaq_f32(_fc, _c4, _beta);
                            _fd = vmlaq_f32(_fd, _c5, _beta);
                            _fe = vmlaq_f32(_fe, _c6, _beta);
                            _ff = vmlaq_f32(_ff, _c7, _beta);
                        }
                        pC += 8;
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x4_t _cc0 = vld1q_f32(pC);
                    float32x4_t _cc1 = vld1q_f32(pC + 4);
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _cc0 = vmulq_f32(_cc0, _beta);
                        _cc1 = vmulq_f32(_cc1, _beta);
                    }
                    _c0 = vdupq_laneq_f32(_cc0, 0);
                    _c1 = vdupq_laneq_f32(_cc0, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_cc0, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_cc0, 3);
                    float32x4_t _c4 = vdupq_laneq_f32(_cc1, 0);
                    float32x4_t _c5 = vdupq_laneq_f32(_cc1, 1);
                    float32x4_t _c6 = vdupq_laneq_f32(_cc1, 2);
                    float32x4_t _c7 = vdupq_laneq_f32(_cc1, 3);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c4);
                    _f5 = vaddq_f32(_f5, _c5);
                    _f6 = vaddq_f32(_f6, _c6);
                    _f7 = vaddq_f32(_f7, _c7);
                    _f8 = vaddq_f32(_f8, _c0);
                    _f9 = vaddq_f32(_f9, _c1);
                    _fa = vaddq_f32(_fa, _c2);
                    _fb = vaddq_f32(_fb, _c3);
                    _fc = vaddq_f32(_fc, _c4);
                    _fd = vaddq_f32(_fd, _c5);
                    _fe = vaddq_f32(_fe, _c6);
                    _ff = vaddq_f32(_ff, _c7);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
                _f8 = vmulq_f32(_f8, _alpha);
                _f9 = vmulq_f32(_f9, _alpha);
                _fa = vmulq_f32(_fa, _alpha);
                _fb = vmulq_f32(_fb, _alpha);
                _fc = vmulq_f32(_fc, _alpha);
                _fd = vmulq_f32(_fd, _alpha);
                _fe = vmulq_f32(_fe, _alpha);
                _ff = vmulq_f32(_ff, _alpha);
            }

            if (out_elempack == 4)
            {
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + 4, _f1);
                vst1q_f32(p0 + 8, _f2);
                vst1q_f32(p0 + 12, _f3);
                vst1q_f32(p0 + 16, _f4);
                vst1q_f32(p0 + 20, _f5);
                vst1q_f32(p0 + 24, _f6);
                vst1q_f32(p0 + 28, _f7);
                vst1q_f32(p0 + out_hstep * 4, _f8);
                vst1q_f32(p0 + out_hstep * 4 + 4, _f9);
                vst1q_f32(p0 + out_hstep * 4 + 8, _fa);
                vst1q_f32(p0 + out_hstep * 4 + 12, _fb);
                vst1q_f32(p0 + out_hstep * 4 + 16, _fc);
                vst1q_f32(p0 + out_hstep * 4 + 20, _fd);
                vst1q_f32(p0 + out_hstep * 4 + 24, _fe);
                vst1q_f32(p0 + out_hstep * 4 + 28, _ff);
                p0 += 32;
            }
            if (out_elempack == 1)
            {
                transpose4x4_ps(_f0, _f1, _f2, _f3);
                transpose4x4_ps(_f4, _f5, _f6, _f7);
                transpose4x4_ps(_f8, _f9, _fa, _fb);
                transpose4x4_ps(_fc, _fd, _fe, _ff);
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + 4, _f4);
                vst1q_f32(p0 + out_hstep, _f1);
                vst1q_f32(p0 + out_hstep + 4, _f5);
                vst1q_f32(p0 + out_hstep * 2, _f2);
                vst1q_f32(p0 + out_hstep * 2 + 4, _f6);
                vst1q_f32(p0 + out_hstep * 3, _f3);
                vst1q_f32(p0 + out_hstep * 3 + 4, _f7);
                vst1q_f32(p0 + out_hstep * 4, _f8);
                vst1q_f32(p0 + out_hstep * 4 + 4, _fc);
                vst1q_f32(p0 + out_hstep * 5, _f9);
                vst1q_f32(p0 + out_hstep * 5 + 4, _fd);
                vst1q_f32(p0 + out_hstep * 6, _fa);
                vst1q_f32(p0 + out_hstep * 6 + 4, _fe);
                vst1q_f32(p0 + out_hstep * 7, _fb);
                vst1q_f32(p0 + out_hstep * 7 + 4, _ff);
                p0 += 8;
            }

            pp += 64;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
#else
            // from
            //      a0 b1 c2 d3
            //      e0 f1 g2 h3
            //      c0 d1 a2 b3
            //      g0 h1 e2 f3
            //      a3 b2 c1 d0
            //      e3 f2 g1 h0
            //      c3 d2 a1 b0
            //      g3 h2 e1 f0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
            {
                _sum4 = vrev64q_s32(_sum4);
                _sum5 = vrev64q_s32(_sum5);
                _sum6 = vrev64q_s32(_sum6);
                _sum7 = vrev64q_s32(_sum7);
                _sum4 = vextq_s32(_sum4, _sum4, 2);
                _sum5 = vextq_s32(_sum5, _sum5, 2);
                _sum6 = vextq_s32(_sum6, _sum6, 2);
                _sum7 = vextq_s32(_sum7, _sum7, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale0);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale0);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale1);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale1);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_sum6), _descale1);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sum7), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c1);
                    _f5 = vaddq_f32(_f5, _c1);
                    _f6 = vaddq_f32(_f6, _c1);
                    _f7 = vaddq_f32(_f7, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    float32x4_t _c2;
                    float32x4_t _c3;
                    if (c_elempack == 4)
                    {
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + 4);
                        _c2 = vld1q_f32(pC + 8);
                        _c3 = vld1q_f32(pC + 12);
                    }
                    if (c_elempack == 1)
                    {
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + c_hstep);
                        _c2 = vld1q_f32(pC + c_hstep * 2);
                        _c3 = vld1q_f32(pC + c_hstep * 3);
                        transpose4x4_ps(_c0, _c1, _c2, _c3);
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                    if (c_elempack == 4)
                    {
                        _c0 = vld1q_f32(pC + c_hstep * 4);
                        _c1 = vld1q_f32(pC + c_hstep * 4 + 4);
                        _c2 = vld1q_f32(pC + c_hstep * 4 + 8);
                        _c3 = vld1q_f32(pC + c_hstep * 4 + 12);
                        pC += 16;
                    }
                    if (c_elempack == 1)
                    {
                        _c0 = vld1q_f32(pC + c_hstep * 4);
                        _c1 = vld1q_f32(pC + c_hstep * 5);
                        _c2 = vld1q_f32(pC + c_hstep * 6);
                        _c3 = vld1q_f32(pC + c_hstep * 7);
                        transpose4x4_ps(_c0, _c1, _c2, _c3);
                        pC += 4;
                    }
                    if (beta == 1.f)
                    {
                        _f4 = vaddq_f32(_f4, _c0);
                        _f5 = vaddq_f32(_f5, _c1);
                        _f6 = vaddq_f32(_f6, _c2);
                        _f7 = vaddq_f32(_f7, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f4 = vmlaq_f32(_f4, _c0, _beta);
                        _f5 = vmlaq_f32(_f5, _c1, _beta);
                        _f6 = vmlaq_f32(_f6, _c2, _beta);
                        _f7 = vmlaq_f32(_f7, _c3, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x4_t _c = vld1q_f32(pC);
                    _c = vmulq_n_f32(_c, beta);
#if __aarch64__
                    _c0 = vdupq_laneq_f32(_c, 0);
                    _c1 = vdupq_laneq_f32(_c, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_c, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_c, 3);
#else
                    _c0 = vdupq_lane_f32(vget_low_f32(_c), 0);
                    _c1 = vdupq_lane_f32(vget_low_f32(_c), 1);
                    float32x4_t _c2 = vdupq_lane_f32(vget_high_f32(_c), 0);
                    float32x4_t _c3 = vdupq_lane_f32(vget_high_f32(_c), 1);
#endif
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c1);
                    _f6 = vaddq_f32(_f6, _c2);
                    _f7 = vaddq_f32(_f7, _c3);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
            }

            if (out_elempack == 4)
            {
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + 4, _f1);
                vst1q_f32(p0 + 8, _f2);
                vst1q_f32(p0 + 12, _f3);
                vst1q_f32(p0 + out_hstep * 4, _f4);
                vst1q_f32(p0 + out_hstep * 4 + 4, _f5);
                vst1q_f32(p0 + out_hstep * 4 + 8, _f6);
                vst1q_f32(p0 + out_hstep * 4 + 12, _f7);
                p0 += 16;
            }
            if (out_elempack == 1)
            {
                transpose4x4_ps(_f0, _f1, _f2, _f3);
                transpose4x4_ps(_f4, _f5, _f6, _f7);
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + out_hstep, _f1);
                vst1q_f32(p0 + out_hstep * 2, _f2);
                vst1q_f32(p0 + out_hstep * 3, _f3);
                vst1q_f32(p0 + out_hstep * 4, _f4);
                vst1q_f32(p0 + out_hstep * 5, _f5);
                vst1q_f32(p0 + out_hstep * 6, _f6);
                vst1q_f32(p0 + out_hstep * 7, _f7);
                p0 += 4;
            }

            pp += 32;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
#else
            // from
            //      a0 b1 c0 d1
            //      e0 f1 g0 h1
            //      a1 b0 c1 d0
            //      e1 f0 g1 h0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            {
                _sum2 = vrev64q_s32(_sum2);
                _sum3 = vrev64q_s32(_sum3);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum2);
                int32x4x2_t _t1 = vzipq_s32(_sum1, _sum3);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[0]), vget_low_s32(_t1.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[0]), vget_high_s32(_t1.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale1);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c1);
                    _f3 = vaddq_f32(_f3, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    float32x4_t _c2;
                    float32x4_t _c3;
                    if (c_elempack == 4)
                    {
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + 4);
                        _c2 = vld1q_f32(pC + c_hstep * 4);
                        _c3 = vld1q_f32(pC + c_hstep * 4 + 4);
                        pC += 8;
                    }
                    if (c_elempack == 1)
                    {
                        float32x2_t _cc0 = vld1_f32(pC);
                        float32x2_t _cc1 = vld1_f32(pC + c_hstep);
                        float32x2_t _cc2 = vld1_f32(pC + c_hstep * 2);
                        float32x2_t _cc3 = vld1_f32(pC + c_hstep * 3);
                        float32x4_t _c01 = vcombine_f32(_cc0, _cc1);
                        float32x4_t _c23 = vcombine_f32(_cc2, _cc3);
                        float32x4x2_t _ccc0 = vuzpq_f32(_c01, _c23);
                        _c0 = _ccc0.val[0];
                        _c1 = _ccc0.val[1];
                        float32x2_t _cc4 = vld1_f32(pC + c_hstep * 4);
                        float32x2_t _cc5 = vld1_f32(pC + c_hstep * 5);
                        float32x2_t _cc6 = vld1_f32(pC + c_hstep * 6);
                        float32x2_t _cc7 = vld1_f32(pC + c_hstep * 7);
                        float32x4_t _c45 = vcombine_f32(_cc4, _cc5);
                        float32x4_t _c67 = vcombine_f32(_cc6, _cc7);
                        float32x4x2_t _ccc1 = vuzpq_f32(_c45, _c67);
                        _c2 = _ccc1.val[0];
                        _c3 = _ccc1.val[1];
                        pC += 2;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x2_t _c = vld1_f32(pC);
                    _c = vmul_n_f32(_c, beta);
                    _c0 = vdupq_lane_f32(_c, 0);
                    _c1 = vdupq_lane_f32(_c, 1);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c1);
                    pC += 2;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            if (out_elempack == 4)
            {
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + 4, _f1);
                vst1q_f32(p0 + out_hstep * 4, _f2);
                vst1q_f32(p0 + out_hstep * 4 + 4, _f3);
                p0 += 8;
            }
            if (out_elempack == 1)
            {
                float32x4x2_t _f01 = vzipq_f32(_f0, _f1);
                float32x4x2_t _f23 = vzipq_f32(_f2, _f3);
                vst1_f32(p0, vget_low_f32(_f01.val[0]));
                vst1_f32(p0 + out_hstep, vget_high_f32(_f01.val[0]));
                vst1_f32(p0 + out_hstep * 2, vget_low_f32(_f01.val[1]));
                vst1_f32(p0 + out_hstep * 3, vget_high_f32(_f01.val[1]));
                vst1_f32(p0 + out_hstep * 4, vget_low_f32(_f23.val[0]));
                vst1_f32(p0 + out_hstep * 5, vget_high_f32(_f23.val[0]));
                vst1_f32(p0 + out_hstep * 6, vget_low_f32(_f23.val[1]));
                vst1_f32(p0 + out_hstep * 7, vget_high_f32(_f23.val[1]));
                p0 += 2;
            }

            pp += 16;
        }
        for (; jj < max_jj; jj++)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    if (c_elempack == 4)
                    {
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + c_hstep * 4);
                        pC += 4;
                    }
                    if (c_elempack == 1)
                    {
                        _c0 = vsetq_lane_f32(pC[0], _c0, 0);
                        _c0 = vsetq_lane_f32(pC[c_hstep], _c0, 1);
                        _c0 = vsetq_lane_f32(pC[c_hstep * 2], _c0, 2);
                        _c0 = vsetq_lane_f32(pC[c_hstep * 3], _c0, 3);
                        _c1 = vsetq_lane_f32(pC[c_hstep * 4], _c1, 0);
                        _c1 = vsetq_lane_f32(pC[c_hstep * 5], _c1, 1);
                        _c1 = vsetq_lane_f32(pC[c_hstep * 6], _c1, 2);
                        _c1 = vsetq_lane_f32(pC[c_hstep * 7], _c1, 3);
                        pC += 1;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(pC[0] * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    pC += 1;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            if (out_elempack == 4)
            {
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + out_hstep * 4, _f1);
                p0 += 4;
            }
            if (out_elempack == 1)
            {
                p0[0] = vgetq_lane_f32(_f0, 0);
                p0[out_hstep] = vgetq_lane_f32(_f0, 1);
                p0[out_hstep * 2] = vgetq_lane_f32(_f0, 2);
                p0[out_hstep * 3] = vgetq_lane_f32(_f0, 3);
                p0[out_hstep * 4] = vgetq_lane_f32(_f1, 0);
                p0[out_hstep * 5] = vgetq_lane_f32(_f1, 1);
                p0[out_hstep * 6] = vgetq_lane_f32(_f1, 2);
                p0[out_hstep * 7] = vgetq_lane_f32(_f1, 3);
                p0++;
            }

            pp += 8;
        }
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        float* p0 = (float*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        float32x4_t _descale = vld1q_f32((const float*)descales + i + ii);

        float32x4_t _c0;
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                _c0 = vdupq_n_f32(pC[0] * beta);
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)C + i + ii;
                _c0 = vld1q_f32(pC);
                _c0 = vmulq_n_f32(_c0, beta);
            }
            if (broadcast_type_C == 3)
            {
                pC = (const float*)C + (i + ii) * c_hstep + j * c_elempack;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)C + j;
            }
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
#else
            // from
            //      a0 b1 c2 d3
            //      a4 b5 c6 d7
            //      c0 d1 a2 b3
            //      c4 d5 a6 b7
            //      a3 b2 c1 d0
            //      a7 b6 c5 d4
            //      c3 d2 a1 b0
            //      c7 d6 a5 b4

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            {
                _sum4 = vrev64q_s32(_sum4);
                _sum5 = vrev64q_s32(_sum5);
                _sum6 = vrev64q_s32(_sum6);
                _sum7 = vrev64q_s32(_sum7);
                _sum4 = vextq_s32(_sum4, _sum4, 2);
                _sum5 = vextq_s32(_sum5, _sum5, 2);
                _sum6 = vextq_s32(_sum6, _sum6, 2);
                _sum7 = vextq_s32(_sum7, _sum7, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_sum6), _descale);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sum7), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    float32x4_t _c1;
                    float32x4_t _c2;
                    float32x4_t _c3;
                    float32x4_t _c4;
                    float32x4_t _c5;
                    float32x4_t _c6;
                    float32x4_t _c7;
                    if (c_elempack == 4)
                    {
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + 4);
                        _c2 = vld1q_f32(pC + 8);
                        _c3 = vld1q_f32(pC + 12);
                        _c4 = vld1q_f32(pC + 16);
                        _c5 = vld1q_f32(pC + 20);
                        _c6 = vld1q_f32(pC + 24);
                        _c7 = vld1q_f32(pC + 28);
                        pC += 32;
                    }
                    if (c_elempack == 1)
                    {
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + 4);
                        _c2 = vld1q_f32(pC + c_hstep);
                        _c3 = vld1q_f32(pC + c_hstep + 4);
                        _c4 = vld1q_f32(pC + c_hstep * 2);
                        _c5 = vld1q_f32(pC + c_hstep * 2 + 4);
                        _c6 = vld1q_f32(pC + c_hstep * 3);
                        _c7 = vld1q_f32(pC + c_hstep * 3 + 4);
                        transpose8x4_ps(_c0, _c1, _c2, _c3, _c4, _c5, _c6, _c7);
                        pC += 8;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                        _f4 = vaddq_f32(_f4, _c4);
                        _f5 = vaddq_f32(_f5, _c5);
                        _f6 = vaddq_f32(_f6, _c6);
                        _f7 = vaddq_f32(_f7, _c7);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                        _f4 = vmlaq_f32(_f4, _c4, _beta);
                        _f5 = vmlaq_f32(_f5, _c5, _beta);
                        _f6 = vmlaq_f32(_f6, _c6, _beta);
                        _f7 = vmlaq_f32(_f7, _c7, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x4_t _cc0 = vld1q_f32(pC);
                    float32x4_t _cc1 = vld1q_f32(pC + 4);
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _cc0 = vmulq_f32(_cc0, _beta);
                        _cc1 = vmulq_f32(_cc1, _beta);
                    }
                    _c0 = vdupq_laneq_f32(_cc0, 0);
                    float32x4_t _c1 = vdupq_laneq_f32(_cc0, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_cc0, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_cc0, 3);
                    float32x4_t _c4 = vdupq_laneq_f32(_cc1, 0);
                    float32x4_t _c5 = vdupq_laneq_f32(_cc1, 1);
                    float32x4_t _c6 = vdupq_laneq_f32(_cc1, 2);
                    float32x4_t _c7 = vdupq_laneq_f32(_cc1, 3);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c4);
                    _f5 = vaddq_f32(_f5, _c5);
                    _f6 = vaddq_f32(_f6, _c6);
                    _f7 = vaddq_f32(_f7, _c7);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
            }

            if (out_elempack == 4)
            {
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + 4, _f1);
                vst1q_f32(p0 + 8, _f2);
                vst1q_f32(p0 + 12, _f3);
                vst1q_f32(p0 + 16, _f4);
                vst1q_f32(p0 + 20, _f5);
                vst1q_f32(p0 + 24, _f6);
                vst1q_f32(p0 + 28, _f7);
                p0 += 32;
            }
            if (out_elempack == 1)
            {
                transpose4x4_ps(_f0, _f1, _f2, _f3);
                transpose4x4_ps(_f4, _f5, _f6, _f7);
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + 4, _f4);
                vst1q_f32(p0 + out_hstep, _f1);
                vst1q_f32(p0 + out_hstep + 4, _f5);
                vst1q_f32(p0 + out_hstep * 2, _f2);
                vst1q_f32(p0 + out_hstep * 2 + 4, _f6);
                vst1q_f32(p0 + out_hstep * 3, _f3);
                vst1q_f32(p0 + out_hstep * 3 + 4, _f7);
                p0 += 8;
            }

            pp += 32;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
#else
            // from
            //      a0 b1 c2 d3
            //      c0 d1 a2 b3
            //      a3 b2 c1 d0
            //      c3 d2 a1 b0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            {
                _sum2 = vrev64q_s32(_sum2);
                _sum3 = vrev64q_s32(_sum3);
                _sum2 = vextq_s32(_sum2, _sum2, 2);
                _sum3 = vextq_s32(_sum3, _sum3, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum3);
                int32x4x2_t _t1 = vzipq_s32(_sum1, _sum2);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    float32x4_t _c1;
                    float32x4_t _c2;
                    float32x4_t _c3;
                    if (c_elempack == 4)
                    {
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + 4);
                        _c2 = vld1q_f32(pC + 8);
                        _c3 = vld1q_f32(pC + 12);
                        pC += 16;
                    }
                    if (c_elempack == 1)
                    {
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + c_hstep * 1);
                        _c2 = vld1q_f32(pC + c_hstep * 2);
                        _c3 = vld1q_f32(pC + c_hstep * 3);
                        transpose4x4_ps(_c0, _c1, _c2, _c3);
                        pC += 4;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x4_t _c = vld1q_f32(pC);
                    _c = vmulq_n_f32(_c, beta);
#if __aarch64__
                    _c0 = vdupq_laneq_f32(_c, 0);
                    float32x4_t _c1 = vdupq_laneq_f32(_c, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_c, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_c, 3);
#else
                    _c0 = vdupq_lane_f32(vget_low_f32(_c), 0);
                    float32x4_t _c1 = vdupq_lane_f32(vget_low_f32(_c), 1);
                    float32x4_t _c2 = vdupq_lane_f32(vget_high_f32(_c), 0);
                    float32x4_t _c3 = vdupq_lane_f32(vget_high_f32(_c), 1);
#endif
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            if (out_elempack == 4)
            {
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + 4, _f1);
                vst1q_f32(p0 + 8, _f2);
                vst1q_f32(p0 + 12, _f3);
                p0 += 16;
            }
            if (out_elempack == 1)
            {
                transpose4x4_ps(_f0, _f1, _f2, _f3);
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + out_hstep, _f1);
                vst1q_f32(p0 + out_hstep * 2, _f2);
                vst1q_f32(p0 + out_hstep * 3, _f3);
                p0 += 4;
            }

            pp += 16;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
#else
            // from
            //      a0 b1 c0 d1
            //      a1 b0 c1 d0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            {
                _sum1 = vrev64q_s32(_sum1);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum1);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                _sum1 = vrev64q_s32(_sum1);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    float32x4_t _c1;
                    if (c_elempack == 4)
                    {
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + 4);
                        pC += 8;
                    }
                    if (c_elempack == 1)
                    {
                        float32x2_t _cc0 = vld1_f32(pC);
                        float32x2_t _cc1 = vld1_f32(pC + c_hstep);
                        float32x2_t _cc2 = vld1_f32(pC + c_hstep * 2);
                        float32x2_t _cc3 = vld1_f32(pC + c_hstep * 3);
                        float32x4_t _c01 = vcombine_f32(_cc0, _cc1);
                        float32x4_t _c23 = vcombine_f32(_cc2, _cc3);
                        float32x4x2_t _cc = vuzpq_f32(_c01, _c23);
                        _c0 = _cc.val[0];
                        _c1 = _cc.val[1];
                        pC += 2;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x2_t _c = vld1_f32(pC);
                    _c = vmul_n_f32(_c, beta);
                    _c0 = vdupq_lane_f32(_c, 0);
                    float32x4_t _c1 = vdupq_lane_f32(_c, 1);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    pC += 2;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            if (out_elempack == 4)
            {
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + 4, _f1);
                p0 += 8;
            }
            if (out_elempack == 1)
            {
                float32x4x2_t _f01 = vzipq_f32(_f0, _f1);
                vst1_f32(p0, vget_low_f32(_f01.val[0]));
                vst1_f32(p0 + out_hstep, vget_high_f32(_f01.val[0]));
                vst1_f32(p0 + out_hstep * 2, vget_low_f32(_f01.val[1]));
                vst1_f32(p0 + out_hstep * 3, vget_high_f32(_f01.val[1]));
                p0 += 2;
            }

            pp += 8;
        }
        for (; jj < max_jj; jj++)
        {
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp)), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    if (c_elempack == 4)
                    {
                        _c0 = vld1q_f32(pC);
                        pC += 4;
                    }
                    if (c_elempack == 1)
                    {
                        _c0 = vsetq_lane_f32(pC[0], _c0, 0);
                        _c0 = vsetq_lane_f32(pC[c_hstep], _c0, 1);
                        _c0 = vsetq_lane_f32(pC[c_hstep * 2], _c0, 2);
                        _c0 = vsetq_lane_f32(pC[c_hstep * 3], _c0, 3);
                        pC += 1;
                    }
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(pC[0] * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    pC += 1;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            if (out_elempack == 4)
            {
                vst1q_f32(p0, _f0);
                p0 += 4;
            }
            if (out_elempack == 1)
            {
                p0[0] = vgetq_lane_f32(_f0, 0);
                p0[out_hstep] = vgetq_lane_f32(_f0, 1);
                p0[out_hstep * 2] = vgetq_lane_f32(_f0, 2);
                p0[out_hstep * 3] = vgetq_lane_f32(_f0, 3);
                p0++;
            }

            pp += 4;
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        // out_elempack == 1
        float* p0 = (float*)top_blob + (i + ii) * out_hstep + j;

        const float descale0 = descales[i + ii];
        const float descale1 = descales[i + ii + 1];
#if __ARM_NEON
        float32x2_t _descale = vld1_f32((const float*)descales + i + ii);
#endif

        float c0;
        float c1;
#if __ARM_NEON
        float32x4_t _c0;
        float32x4_t _c1;
#endif
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                c0 = pC[0] * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)C + i + ii;
                c0 = pC[0] * beta;
                c1 = pC[1] * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
                _c1 = vdupq_n_f32(c1);
#endif
            }
            if (broadcast_type_C == 3)
            {
                // c_elempack == 1
                pC = (const float*)C + (i + ii) * c_hstep + j;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)C + j;
            }
        }

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

            float32x4_t _f0 = vmulq_lane_f32(vcvtq_f32_s32(_sum0), _descale, 0);
            float32x4_t _f1 = vmulq_lane_f32(vcvtq_f32_s32(_sum1), _descale, 0);
            float32x4_t _f2 = vmulq_lane_f32(vcvtq_f32_s32(_sum2), _descale, 1);
            float32x4_t _f3 = vmulq_lane_f32(vcvtq_f32_s32(_sum3), _descale, 1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c1);
                    _f3 = vaddq_f32(_f3, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    _c0 = vld1q_f32(pC);
                    _c1 = vld1q_f32(pC + 4);
                    float32x4_t _c2 = vld1q_f32(pC + c_hstep);
                    float32x4_t _c3 = vld1q_f32(pC + c_hstep + 4);
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                    pC += 8;
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vld1q_f32(pC);
                    _c1 = vld1q_f32(pC + 4);
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _c0 = vmulq_f32(_c0, _beta);
                        _c1 = vmulq_f32(_c1, _beta);
                    }
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c1);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            vst1q_f32(p0, _f0);
            vst1q_f32(p0 + 4, _f1);
            vst1q_f32(p0 + out_hstep, _f2);
            vst1q_f32(p0 + out_hstep + 4, _f3);

            pp += 16;
            p0 += 8;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

            float32x4_t _f0 = vmulq_lane_f32(vcvtq_f32_s32(_sum0), _descale, 0);
            float32x4_t _f1 = vmulq_lane_f32(vcvtq_f32_s32(_sum1), _descale, 1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    _c0 = vld1q_f32(pC);
                    _c1 = vld1q_f32(pC + c_hstep);
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                    pC += 4;
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vld1q_f32(pC);
                    _c0 = vmulq_n_f32(_c0, beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            vst1q_f32(p0, _f0);
            vst1q_f32(p0 + out_hstep, _f1);

            pp += 8;
            p0 += 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            int32x4_t _sum0 = vld1q_s32(pp);

            float32x2x2_t _descale01 = vzip_f32(_descale, _descale);
            float32x4_t _descale0011 = vcombine_f32(_descale01.val[0], _descale01.val[1]);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0011);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    float32x4_t _c0011 = vcombine_f32(vget_low_f32(_c0), vget_high_f32(_c1));
                    _f0 = vaddq_f32(_f0, _c0011);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    _c0 = vcombine_f32(vld1_f32(pC), vld1_f32(pC + c_hstep));
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 2;
                }
                if (broadcast_type_C == 4)
                {
                    float32x2_t _c = vld1_f32(pC);
                    _c0 = vcombine_f32(_c, _c);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 2;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            vst1_f32(p0, vget_low_f32(_f0));
            vst1_f32(p0 + out_hstep, vget_high_f32(_f0));

            pp += 4;
            p0 += 2;
        }
#endif // __ARM_NEON
        for (; jj < max_jj; jj++)
        {
            float f0 = pp[0] * descale0;
            float f1 = pp[1] * descale1;

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    f0 += c0;
                    f1 += c0;
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    f0 += c0;
                    f1 += c1;
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    f0 += pC[0] * beta;
                    f1 += pC[c_hstep] * beta;
                    pC += 1;
                }
                if (broadcast_type_C == 4)
                {
                    f0 += pC[0] * beta;
                    f1 += pC[0] * beta;
                    pC += 1;
                }
            }

            f0 *= alpha;
            f1 *= alpha;

            p0[0] = f0;
            p0[out_hstep] = f1;

            pp += 2;
            p0++;
        }
    }
    for (; ii < max_ii; ii += 1)
    {
        // out_elempack == 1
        float* p0 = (float*)top_blob + (i + ii) * out_hstep + j;

        const float descale = descales[i + ii];
#if __ARM_NEON
        float32x4_t _descale = vdupq_n_f32(descale);
#endif

        float c0;
#if __ARM_NEON
        float32x4_t _c0;
#endif
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                c0 = pC[0] * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)C + i + ii;
                c0 = pC[0] * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 3)
            {
                // c_elempack == 1
                pC = (const float*)C + (i + ii) * c_hstep + j;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)C + j;
            }
        }

        int jj = 0;
#if __ARM_NEON
        for (; jj + 15 < max_jj; jj += 16)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // out_elempack == 1
                    _c0 = vld1q_f32(pC);
                    float32x4_t _c1 = vld1q_f32(pC + 4);
                    float32x4_t _c2 = vld1q_f32(pC + 8);
                    float32x4_t _c3 = vld1q_f32(pC + 12);
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                    pC += 16;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            vst1q_f32(p0, _f0);
            vst1q_f32(p0 + 4, _f1);
            vst1q_f32(p0 + 8, _f2);
            vst1q_f32(p0 + 12, _f3);

            pp += 16;
            p0 += 16;
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // out_elempack == 1
                    _c0 = vld1q_f32(pC);
                    float32x4_t _c1 = vld1q_f32(pC + 4);
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            vst1q_f32(p0, _f0);
            vst1q_f32(p0 + 4, _f1);

            pp += 8;
            p0 += 8;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp)), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // out_elempack == 1
                    _c0 = vld1q_f32(pC);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 4;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            vst1q_f32(p0, _f0);

            pp += 4;
            p0 += 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            float32x2_t _f0 = vmul_f32(vcvt_f32_s32(vld1_s32(pp)), vget_low_f32(_descale));

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vadd_f32(_f0, vget_low_f32(_c0));
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // out_elempack == 1
                    float32x2_t _c = vld1_f32(pC);
                    _f0 = vmla_n_f32(_f0, _c, beta);
                    pC += 2;
                }
            }

            _f0 = vmul_n_f32(_f0, alpha);

            vst1_f32(p0, _f0);

            pp += 2;
            p0 += 2;
        }
#endif // __ARM_NEON
        for (; jj < max_jj; jj++)
        {
            float f0 = pp[0] * descale;

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    f0 += c0;
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // out_elempack == 1
                    f0 += pC[0] * beta;
                    pC += 1;
                }
            }

            f0 *= alpha;

            p0[0] = f0;

            pp += 1;
            p0++;
        }
    }
}

static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        transpose_unpack_output_tile_int32_to_fp32_asimddp(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
        return;
    }
#endif

    const int out_elempack = top_blob.elempack;
    const size_t out_hstep = top_blob.dims == 3 ? top_blob.cstep : (size_t)top_blob.w;

    const size_t c_hstep = C.dims == 3 ? C.cstep : (size_t)C.w;
    const int c_elempack = C.elempack;
    const float* pC = C;

    // NCNN_LOGE("transpose_unpack_output_tile_int32_to_fp32  %d %d %d %d  %d  %d  %d", i, max_ii, j, max_jj, out_elempack, broadcast_type_C, c_elempack);

    const int* pp = topT;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * out_elempack;

        float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii);
        float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4);

        float32x4_t _c0;
        float32x4_t _c1;
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                _c0 = vdupq_n_f32(pC[0] * beta);
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)C + i + ii;
                _c0 = vld1q_f32(pC);
                _c1 = vld1q_f32(pC + 4);
                _c0 = vmulq_n_f32(_c0, beta);
                _c1 = vmulq_n_f32(_c1, beta);
            }
            if (broadcast_type_C == 3)
            {
                pC = (const float*)C + (i + ii) * c_hstep + j * c_elempack;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)C + j;
            }
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);
            int32x4_t _sum8 = vld1q_s32(pp + 32);
            int32x4_t _sum9 = vld1q_s32(pp + 36);
            int32x4_t _suma = vld1q_s32(pp + 40);
            int32x4_t _sumb = vld1q_s32(pp + 44);
            int32x4_t _sumc = vld1q_s32(pp + 48);
            int32x4_t _sumd = vld1q_s32(pp + 52);
            int32x4_t _sume = vld1q_s32(pp + 56);
            int32x4_t _sumf = vld1q_s32(pp + 60);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            //      e4 f4 g4 h4
            //      e5 f5 g5 h5
            //      e6 f6 g6 h6
            //      e7 f7 g7 h7
#else
            // from
            //      a0 b1 c2 d3
            //      e4 f5 g6 h7
            //      e0 f1 g2 h3
            //      a4 b5 c6 d7
            //      c0 d1 a2 b3
            //      g4 h5 e6 f7
            //      g0 h1 e2 f3
            //      c4 d5 a6 b7
            //      a3 b2 c1 d0
            //      e7 f6 g5 h4
            //      e3 f2 g1 h0
            //      a7 b6 c5 d4
            //      c3 d2 a1 b0
            //      g7 h6 e5 f4
            //      g3 h2 e1 f0
            //      c7 d6 a5 b4

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            //      e4 f4 g4 h4
            //      e5 f5 g5 h5
            //      e6 f6 g6 h6
            //      e7 f7 g7 h7
            {
                _sum8 = vrev64q_s32(_sum8);
                _sum9 = vrev64q_s32(_sum9);
                _suma = vrev64q_s32(_suma);
                _sumb = vrev64q_s32(_sumb);
                _sumc = vrev64q_s32(_sumc);
                _sumd = vrev64q_s32(_sumd);
                _sume = vrev64q_s32(_sume);
                _sumf = vrev64q_s32(_sumf);
                _sum8 = vextq_s32(_sum8, _sum8, 2);
                _sum9 = vextq_s32(_sum9, _sum9, 2);
                _suma = vextq_s32(_suma, _suma, 2);
                _sumb = vextq_s32(_sumb, _sumb, 2);
                _sumc = vextq_s32(_sumc, _sumc, 2);
                _sumd = vextq_s32(_sumd, _sumd, 2);
                _sume = vextq_s32(_sume, _sume, 2);
                _sumf = vextq_s32(_sumf, _sumf, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sumc);
                int32x4x2_t _t1 = vzipq_s32(_sum4, _sum8);
                int32x4x2_t _t2 = vzipq_s32(_sum2, _sume);
                int32x4x2_t _t3 = vzipq_s32(_sum6, _suma);
                int32x4x2_t _t4 = vzipq_s32(_sum3, _sumf);
                int32x4x2_t _t5 = vzipq_s32(_sum7, _sumb);
                int32x4x2_t _t6 = vzipq_s32(_sum1, _sumd);
                int32x4x2_t _t7 = vzipq_s32(_sum5, _sum9);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum8 = vcombine_s32(vget_low_s32(_t4.val[0]), vget_low_s32(_t5.val[0]));
                _sum9 = vcombine_s32(vget_high_s32(_t4.val[0]), vget_high_s32(_t5.val[0]));
                _suma = vcombine_s32(vget_low_s32(_t5.val[1]), vget_low_s32(_t4.val[1]));
                _sumb = vcombine_s32(vget_high_s32(_t5.val[1]), vget_high_s32(_t4.val[1]));
                _sumc = vcombine_s32(vget_low_s32(_t6.val[0]), vget_low_s32(_t7.val[0]));
                _sumd = vcombine_s32(vget_high_s32(_t6.val[0]), vget_high_s32(_t7.val[0]));
                _sume = vcombine_s32(vget_low_s32(_t7.val[1]), vget_low_s32(_t6.val[1]));
                _sumf = vcombine_s32(vget_high_s32(_t7.val[1]), vget_high_s32(_t6.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
                _sum9 = vrev64q_s32(_sum9);
                _sumb = vrev64q_s32(_sumb);
                _sumd = vrev64q_s32(_sumd);
                _sumf = vrev64q_s32(_sumf);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale0);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale0);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum8), _descale0);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum9), _descale0);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_suma), _descale0);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sumb), _descale0);
            float32x4_t _f8 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale1);
            float32x4_t _f9 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale1);
            float32x4_t _fa = vmulq_f32(vcvtq_f32_s32(_sum6), _descale1);
            float32x4_t _fb = vmulq_f32(vcvtq_f32_s32(_sum7), _descale1);
            float32x4_t _fc = vmulq_f32(vcvtq_f32_s32(_sumc), _descale1);
            float32x4_t _fd = vmulq_f32(vcvtq_f32_s32(_sumd), _descale1);
            float32x4_t _fe = vmulq_f32(vcvtq_f32_s32(_sume), _descale1);
            float32x4_t _ff = vmulq_f32(vcvtq_f32_s32(_sumf), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                    _f8 = vaddq_f32(_f8, _c0);
                    _f9 = vaddq_f32(_f9, _c0);
                    _fa = vaddq_f32(_fa, _c0);
                    _fb = vaddq_f32(_fb, _c0);
                    _fc = vaddq_f32(_fc, _c0);
                    _fd = vaddq_f32(_fd, _c0);
                    _fe = vaddq_f32(_fe, _c0);
                    _ff = vaddq_f32(_ff, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                    _f8 = vaddq_f32(_f8, _c1);
                    _f9 = vaddq_f32(_f9, _c1);
                    _fa = vaddq_f32(_fa, _c1);
                    _fb = vaddq_f32(_fb, _c1);
                    _fc = vaddq_f32(_fc, _c1);
                    _fd = vaddq_f32(_fd, _c1);
                    _fe = vaddq_f32(_fe, _c1);
                    _ff = vaddq_f32(_ff, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    if (c_elempack == 4)
                    {
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + 4);
                        float32x4_t _c2 = vld1q_f32(pC + 8);
                        float32x4_t _c3 = vld1q_f32(pC + 12);
                        float32x4_t _c4 = vld1q_f32(pC + 16);
                        float32x4_t _c5 = vld1q_f32(pC + 20);
                        float32x4_t _c6 = vld1q_f32(pC + 24);
                        float32x4_t _c7 = vld1q_f32(pC + 28);
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                            _f4 = vaddq_f32(_f4, _c4);
                            _f5 = vaddq_f32(_f5, _c5);
                            _f6 = vaddq_f32(_f6, _c6);
                            _f7 = vaddq_f32(_f7, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                            _f4 = vmlaq_f32(_f4, _c4, _beta);
                            _f5 = vmlaq_f32(_f5, _c5, _beta);
                            _f6 = vmlaq_f32(_f6, _c6, _beta);
                            _f7 = vmlaq_f32(_f7, _c7, _beta);
                        }
                        _c0 = vld1q_f32(pC + c_hstep * 4);
                        _c1 = vld1q_f32(pC + c_hstep * 4 + 4);
                        _c2 = vld1q_f32(pC + c_hstep * 4 + 8);
                        _c3 = vld1q_f32(pC + c_hstep * 4 + 12);
                        _c4 = vld1q_f32(pC + c_hstep * 4 + 16);
                        _c5 = vld1q_f32(pC + c_hstep * 4 + 20);
                        _c6 = vld1q_f32(pC + c_hstep * 4 + 24);
                        _c7 = vld1q_f32(pC + c_hstep * 4 + 28);
                        if (beta == 1.f)
                        {
                            _f8 = vaddq_f32(_f8, _c0);
                            _f9 = vaddq_f32(_f9, _c1);
                            _fa = vaddq_f32(_fa, _c2);
                            _fb = vaddq_f32(_fb, _c3);
                            _fc = vaddq_f32(_fc, _c4);
                            _fd = vaddq_f32(_fd, _c5);
                            _fe = vaddq_f32(_fe, _c6);
                            _ff = vaddq_f32(_ff, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f8 = vmlaq_f32(_f8, _c0, _beta);
                            _f9 = vmlaq_f32(_f9, _c1, _beta);
                            _fa = vmlaq_f32(_fa, _c2, _beta);
                            _fb = vmlaq_f32(_fb, _c3, _beta);
                            _fc = vmlaq_f32(_fc, _c4, _beta);
                            _fd = vmlaq_f32(_fd, _c5, _beta);
                            _fe = vmlaq_f32(_fe, _c6, _beta);
                            _ff = vmlaq_f32(_ff, _c7, _beta);
                        }
                        pC += 32;
                    }
                    if (c_elempack == 1)
                    {
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + 4);
                        float32x4_t _c2 = vld1q_f32(pC + c_hstep);
                        float32x4_t _c3 = vld1q_f32(pC + c_hstep + 4);
                        float32x4_t _c4 = vld1q_f32(pC + c_hstep * 2);
                        float32x4_t _c5 = vld1q_f32(pC + c_hstep * 2 + 4);
                        float32x4_t _c6 = vld1q_f32(pC + c_hstep * 3);
                        float32x4_t _c7 = vld1q_f32(pC + c_hstep * 3 + 4);
                        transpose8x4_ps(_c0, _c1, _c2, _c3, _c4, _c5, _c6, _c7);
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                            _f4 = vaddq_f32(_f4, _c4);
                            _f5 = vaddq_f32(_f5, _c5);
                            _f6 = vaddq_f32(_f6, _c6);
                            _f7 = vaddq_f32(_f7, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                            _f4 = vmlaq_f32(_f4, _c4, _beta);
                            _f5 = vmlaq_f32(_f5, _c5, _beta);
                            _f6 = vmlaq_f32(_f6, _c6, _beta);
                            _f7 = vmlaq_f32(_f7, _c7, _beta);
                        }
                        _c0 = vld1q_f32(pC + c_hstep * 4);
                        _c1 = vld1q_f32(pC + c_hstep * 4 + 4);
                        _c2 = vld1q_f32(pC + c_hstep * 5);
                        _c3 = vld1q_f32(pC + c_hstep * 5 + 4);
                        _c4 = vld1q_f32(pC + c_hstep * 6);
                        _c5 = vld1q_f32(pC + c_hstep * 6 + 4);
                        _c6 = vld1q_f32(pC + c_hstep * 7);
                        _c7 = vld1q_f32(pC + c_hstep * 7 + 4);
                        transpose8x4_ps(_c0, _c1, _c2, _c3, _c4, _c5, _c6, _c7);
                        if (beta == 1.f)
                        {
                            _f8 = vaddq_f32(_f8, _c0);
                            _f9 = vaddq_f32(_f9, _c1);
                            _fa = vaddq_f32(_fa, _c2);
                            _fb = vaddq_f32(_fb, _c3);
                            _fc = vaddq_f32(_fc, _c4);
                            _fd = vaddq_f32(_fd, _c5);
                            _fe = vaddq_f32(_fe, _c6);
                            _ff = vaddq_f32(_ff, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f8 = vmlaq_f32(_f8, _c0, _beta);
                            _f9 = vmlaq_f32(_f9, _c1, _beta);
                            _fa = vmlaq_f32(_fa, _c2, _beta);
                            _fb = vmlaq_f32(_fb, _c3, _beta);
                            _fc = vmlaq_f32(_fc, _c4, _beta);
                            _fd = vmlaq_f32(_fd, _c5, _beta);
                            _fe = vmlaq_f32(_fe, _c6, _beta);
                            _ff = vmlaq_f32(_ff, _c7, _beta);
                        }
                        pC += 8;
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x4_t _cc0 = vld1q_f32(pC);
                    float32x4_t _cc1 = vld1q_f32(pC + 4);
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _cc0 = vmulq_f32(_cc0, _beta);
                        _cc1 = vmulq_f32(_cc1, _beta);
                    }
                    _c0 = vdupq_laneq_f32(_cc0, 0);
                    _c1 = vdupq_laneq_f32(_cc0, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_cc0, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_cc0, 3);
                    float32x4_t _c4 = vdupq_laneq_f32(_cc1, 0);
                    float32x4_t _c5 = vdupq_laneq_f32(_cc1, 1);
                    float32x4_t _c6 = vdupq_laneq_f32(_cc1, 2);
                    float32x4_t _c7 = vdupq_laneq_f32(_cc1, 3);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c4);
                    _f5 = vaddq_f32(_f5, _c5);
                    _f6 = vaddq_f32(_f6, _c6);
                    _f7 = vaddq_f32(_f7, _c7);
                    _f8 = vaddq_f32(_f8, _c0);
                    _f9 = vaddq_f32(_f9, _c1);
                    _fa = vaddq_f32(_fa, _c2);
                    _fb = vaddq_f32(_fb, _c3);
                    _fc = vaddq_f32(_fc, _c4);
                    _fd = vaddq_f32(_fd, _c5);
                    _fe = vaddq_f32(_fe, _c6);
                    _ff = vaddq_f32(_ff, _c7);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
                _f8 = vmulq_f32(_f8, _alpha);
                _f9 = vmulq_f32(_f9, _alpha);
                _fa = vmulq_f32(_fa, _alpha);
                _fb = vmulq_f32(_fb, _alpha);
                _fc = vmulq_f32(_fc, _alpha);
                _fd = vmulq_f32(_fd, _alpha);
                _fe = vmulq_f32(_fe, _alpha);
                _ff = vmulq_f32(_ff, _alpha);
            }

            if (out_elempack == 4)
            {
                float32x4x4_t _ffa;
                float32x4x4_t _ffb;
                float32x4x4_t _ffc;
                float32x4x4_t _ffd;
                _ffa.val[0] = _f0;
                _ffa.val[1] = _f1;
                _ffa.val[2] = _f2;
                _ffa.val[3] = _f3;
                _ffb.val[0] = _f4;
                _ffb.val[1] = _f5;
                _ffb.val[2] = _f6;
                _ffb.val[3] = _f7;
                _ffc.val[0] = _f8;
                _ffc.val[1] = _f9;
                _ffc.val[2] = _fa;
                _ffc.val[3] = _fb;
                _ffd.val[0] = _fc;
                _ffd.val[1] = _fd;
                _ffd.val[2] = _fe;
                _ffd.val[3] = _ff;
                vst4q_f32(p0, _ffa);
                vst4q_f32(p0 + 16, _ffc);
                vst4q_f32(p0 + out_hstep * 4, _ffb);
                vst4q_f32(p0 + out_hstep * 4 + 16, _ffd);
            }
            if (out_elempack == 1)
            {
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + 4, _f8);
                vst1q_f32(p0 + out_hstep, _f1);
                vst1q_f32(p0 + out_hstep + 4, _f9);
                vst1q_f32(p0 + out_hstep * 2, _f2);
                vst1q_f32(p0 + out_hstep * 2 + 4, _fa);
                vst1q_f32(p0 + out_hstep * 3, _f3);
                vst1q_f32(p0 + out_hstep * 3 + 4, _fb);
                vst1q_f32(p0 + out_hstep * 4, _f4);
                vst1q_f32(p0 + out_hstep * 4 + 4, _fc);
                vst1q_f32(p0 + out_hstep * 5, _f5);
                vst1q_f32(p0 + out_hstep * 5 + 4, _fd);
                vst1q_f32(p0 + out_hstep * 6, _f6);
                vst1q_f32(p0 + out_hstep * 6 + 4, _fe);
                vst1q_f32(p0 + out_hstep * 7, _f7);
                vst1q_f32(p0 + out_hstep * 7 + 4, _ff);
            }

            pp += 64;
            p0 += out_hstep * 8;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3

#else
            // from
            //      a0 b1 c2 d3
            //      e0 f1 g2 h3
            //      c0 d1 a2 b3
            //      g0 h1 e2 f3
            //      a3 b2 c1 d0
            //      e3 f2 g1 h0
            //      c3 d2 a1 b0
            //      g3 h2 e1 f0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
            {
                _sum4 = vrev64q_s32(_sum4);
                _sum5 = vrev64q_s32(_sum5);
                _sum6 = vrev64q_s32(_sum6);
                _sum7 = vrev64q_s32(_sum7);
                _sum4 = vextq_s32(_sum4, _sum4, 2);
                _sum5 = vextq_s32(_sum5, _sum5, 2);
                _sum6 = vextq_s32(_sum6, _sum6, 2);
                _sum7 = vextq_s32(_sum7, _sum7, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale0);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale0);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale1);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale1);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_sum6), _descale1);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sum7), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c1);
                    _f5 = vaddq_f32(_f5, _c1);
                    _f6 = vaddq_f32(_f6, _c1);
                    _f7 = vaddq_f32(_f7, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    if (c_elempack == 4)
                    {
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + 4);
                        float32x4_t _c2 = vld1q_f32(pC + 8);
                        float32x4_t _c3 = vld1q_f32(pC + 12);
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                        }
                        _c0 = vld1q_f32(pC + c_hstep * 4);
                        _c1 = vld1q_f32(pC + c_hstep * 4 + 4);
                        _c2 = vld1q_f32(pC + c_hstep * 4 + 8);
                        _c3 = vld1q_f32(pC + c_hstep * 4 + 12);
                        if (beta == 1.f)
                        {
                            _f4 = vaddq_f32(_f4, _c0);
                            _f5 = vaddq_f32(_f5, _c1);
                            _f6 = vaddq_f32(_f6, _c2);
                            _f7 = vaddq_f32(_f7, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f4 = vmlaq_f32(_f4, _c0, _beta);
                            _f5 = vmlaq_f32(_f5, _c1, _beta);
                            _f6 = vmlaq_f32(_f6, _c2, _beta);
                            _f7 = vmlaq_f32(_f7, _c3, _beta);
                        }
                        pC += 16;
                    }
                    if (c_elempack == 1)
                    {
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + c_hstep);
                        float32x4_t _c2 = vld1q_f32(pC + c_hstep * 2);
                        float32x4_t _c3 = vld1q_f32(pC + c_hstep * 3);
                        transpose4x4_ps(_c0, _c1, _c2, _c3);
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                        }
                        _c0 = vld1q_f32(pC + c_hstep * 4);
                        _c1 = vld1q_f32(pC + c_hstep * 5);
                        _c2 = vld1q_f32(pC + c_hstep * 6);
                        _c3 = vld1q_f32(pC + c_hstep * 7);
                        transpose4x4_ps(_c0, _c1, _c2, _c3);
                        if (beta == 1.f)
                        {
                            _f4 = vaddq_f32(_f4, _c0);
                            _f5 = vaddq_f32(_f5, _c1);
                            _f6 = vaddq_f32(_f6, _c2);
                            _f7 = vaddq_f32(_f7, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f4 = vmlaq_f32(_f4, _c0, _beta);
                            _f5 = vmlaq_f32(_f5, _c1, _beta);
                            _f6 = vmlaq_f32(_f6, _c2, _beta);
                            _f7 = vmlaq_f32(_f7, _c3, _beta);
                        }
                        pC += 4;
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x4_t _cc = vld1q_f32(pC);
                    _cc = vmulq_n_f32(_cc, beta);
#if __aarch64__
                    _c0 = vdupq_laneq_f32(_cc, 0);
                    _c1 = vdupq_laneq_f32(_cc, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_cc, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_cc, 3);
#else
                    _c0 = vdupq_lane_f32(vget_low_f32(_cc), 0);
                    _c1 = vdupq_lane_f32(vget_low_f32(_cc), 1);
                    float32x4_t _c2 = vdupq_lane_f32(vget_high_f32(_cc), 0);
                    float32x4_t _c3 = vdupq_lane_f32(vget_high_f32(_cc), 1);
#endif
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c1);
                    _f6 = vaddq_f32(_f6, _c2);
                    _f7 = vaddq_f32(_f7, _c3);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
            }

            if (out_elempack == 4)
            {
                float32x4x4_t _fa;
                float32x4x4_t _fb;
                _fa.val[0] = _f0;
                _fa.val[1] = _f1;
                _fa.val[2] = _f2;
                _fa.val[3] = _f3;
                _fb.val[0] = _f4;
                _fb.val[1] = _f5;
                _fb.val[2] = _f6;
                _fb.val[3] = _f7;
                vst4q_f32(p0, _fa);
                vst4q_f32(p0 + 16, _fb);
            }
            if (out_elempack == 1)
            {
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + 4, _f4);
                vst1q_f32(p0 + out_hstep, _f1);
                vst1q_f32(p0 + out_hstep + 4, _f5);
                vst1q_f32(p0 + out_hstep * 2, _f2);
                vst1q_f32(p0 + out_hstep * 2 + 4, _f6);
                vst1q_f32(p0 + out_hstep * 3, _f3);
                vst1q_f32(p0 + out_hstep * 3 + 4, _f7);
            }

            pp += 32;
            p0 += out_hstep * 4;
        }
        // Two jj positions per iteration for the current group of eight lanes
        // (named a..h in the layout comments below; the digit is the jj offset).
        // Reads 16 int32 accumulators from pp, converts them to float, applies
        // the per-lane descale, adds the optional C term, applies alpha and
        // writes two rows of eight floats that are out_hstep apart.
        for (; jj + 1 < max_jj; jj += 2)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
#else
            // from
            //      a0 b1 c0 d1
            //      e0 f1 g0 h1
            //      a1 b0 c1 d0
            //      e1 f0 g1 h0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            {
                // swap adjacent 32-bit lanes:
                //      _sum2 = b0 a1 d0 c1
                //      _sum3 = f0 e1 h0 g1
                _sum2 = vrev64q_s32(_sum2);
                _sum3 = vrev64q_s32(_sum3);
                // interleave:
                //      _t0.val[0] = a0 b0 b1 a1    _t0.val[1] = c0 d0 d1 c1
                //      _t1.val[0] = e0 f0 f1 e1    _t1.val[1] = g0 h0 h1 g1
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum2);
                int32x4x2_t _t1 = vzipq_s32(_sum1, _sum3);
                // regroup 64-bit halves:
                //      _sum0 = a0 b0 c0 d0
                //      _sum1 = b1 a1 d1 c1
                //      _sum2 = e0 f0 g0 h0
                //      _sum3 = f1 e1 h1 g1
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[0]), vget_low_s32(_t1.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[0]), vget_high_s32(_t1.val[1]));
                // the offset-1 registers still have their lane pairs swapped; fix them
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
            }
#endif // __ARM_FEATURE_DOTPROD

            // dequantize: lanes a..d use _descale0, lanes e..h use _descale1
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale1);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale1);

            if (pC)
            {
                // For types 0/1/2 no beta is applied here; _c0/_c1 are prepared
                // before this loop (outside this view) - presumably already
                // scaled by beta, confirm against the loop preamble.
                if (broadcast_type_C == 0)
                {
                    // the same _c0 is added to every register
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    // _c0 goes with lanes a..d, _c1 with lanes e..h
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c1);
                    _f3 = vaddq_f32(_f3, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    // C is read element-wise from memory and scaled by beta here
                    if (c_elempack == 1)
                    {
                        // two consecutive floats from each of eight rows that
                        // are c_hstep apart
                        float32x2_t _cc0 = vld1_f32(pC);
                        float32x2_t _cc1 = vld1_f32(pC + c_hstep);
                        float32x2_t _cc2 = vld1_f32(pC + c_hstep * 2);
                        float32x2_t _cc3 = vld1_f32(pC + c_hstep * 3);
                        float32x2_t _cc4 = vld1_f32(pC + c_hstep * 4);
                        float32x2_t _cc5 = vld1_f32(pC + c_hstep * 5);
                        float32x2_t _cc6 = vld1_f32(pC + c_hstep * 6);
                        float32x2_t _cc7 = vld1_f32(pC + c_hstep * 7);
                        float32x4_t _cc01 = vcombine_f32(_cc0, _cc1);
                        float32x4_t _cc23 = vcombine_f32(_cc2, _cc3);
                        float32x4_t _cc45 = vcombine_f32(_cc4, _cc5);
                        float32x4_t _cc67 = vcombine_f32(_cc6, _cc7);
                        // de-interleave: val[0] collects the first float of each
                        // of four rows, val[1] the second float of each row
                        float32x4x2_t _ccc0 = vuzpq_f32(_cc01, _cc23);
                        float32x4x2_t _ccc1 = vuzpq_f32(_cc45, _cc67);
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _ccc0.val[0]);
                            _f1 = vaddq_f32(_f1, _ccc0.val[1]);
                            _f2 = vaddq_f32(_f2, _ccc1.val[0]);
                            _f3 = vaddq_f32(_f3, _ccc1.val[1]);
                        }
                        else
                        {
                            // _f += C * beta
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _ccc0.val[0], _beta);
                            _f1 = vmlaq_f32(_f1, _ccc0.val[1], _beta);
                            _f2 = vmlaq_f32(_f2, _ccc1.val[0], _beta);
                            _f3 = vmlaq_f32(_f3, _ccc1.val[1], _beta);
                        }
                        pC += 2;
                    }
                    else // if (c_elempack == 4)
                    {
                        // two packs of four floats at pC, and two more packs
                        // c_hstep * 4 floats further on
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + 4);
                        float32x4_t _c2 = vld1q_f32(pC + c_hstep * 4);
                        float32x4_t _c3 = vld1q_f32(pC + c_hstep * 4 + 4);
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                        }
                        else
                        {
                            // _f += C * beta
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                        }
                        pC += 8;
                    }
                }
                if (broadcast_type_C == 4)
                {
                    // two consecutive C values scaled by beta; value 0 is
                    // broadcast to the offset-0 registers and value 1 to the
                    // offset-1 registers
                    float32x2_t _cc = vld1_f32(pC);
                    _cc = vmul_n_f32(_cc, beta);
                    _c0 = vdupq_lane_f32(_cc, 0);
                    _c1 = vdupq_lane_f32(_cc, 1);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c1);
                    pC += 2;
                }
            }

            // alpha scales the sum of the dequantized product and the C term
            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            // Row p0 receives a0..h0 and row p0 + out_hstep receives a1..h1.
            // There is no out_elempack branch in this loop.
            // NOTE(review): presumably out_elempack is 1 whenever fewer than
            // four jj positions remain - confirm with the caller.
            vst1q_f32(p0, _f0);
            vst1q_f32(p0 + 4, _f2);
            vst1q_f32(p0 + out_hstep, _f1);
            vst1q_f32(p0 + out_hstep + 4, _f3);

            pp += 16;
            p0 += out_hstep * 2;
        }
        // Remainder loop: one jj position per iteration for the current group
        // of eight lanes. Reads 8 int32 accumulators from pp, converts them to
        // float with the per-lane descale, adds the optional C term, applies
        // alpha and writes eight contiguous floats at p0.
        for (; jj < max_jj; jj += 1)
        {
            // first four lanes use _descale0, last four use _descale1
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp)), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp + 4)), _descale1);

            if (pC)
            {
                // For types 0/1/2 no beta is applied here; _c0/_c1 are prepared
                // before this loop (outside this view) - presumably already
                // scaled by beta, confirm against the loop preamble.
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    if (c_elempack == 1)
                    {
                        // gather one float from each of eight rows that are
                        // c_hstep apart; every lane of _c0/_c1 is overwritten
                        _c0 = vsetq_lane_f32(pC[0], _c0, 0);
                        _c0 = vsetq_lane_f32(pC[c_hstep], _c0, 1);
                        _c0 = vsetq_lane_f32(pC[c_hstep * 2], _c0, 2);
                        _c0 = vsetq_lane_f32(pC[c_hstep * 3], _c0, 3);
                        _c1 = vsetq_lane_f32(pC[c_hstep * 4], _c1, 0);
                        _c1 = vsetq_lane_f32(pC[c_hstep * 5], _c1, 1);
                        _c1 = vsetq_lane_f32(pC[c_hstep * 6], _c1, 2);
                        _c1 = vsetq_lane_f32(pC[c_hstep * 7], _c1, 3);
                        pC += 1;
                    }
                    else // if (c_elempack == 4)
                    {
                        // one pack of four floats at pC and one more
                        // c_hstep * 4 floats further on
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + c_hstep * 4);
                        pC += 4;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        // _f += C * beta
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    // a single C value scaled by beta is added to all eight lanes
                    _c0 = vdupq_n_f32(pC[0] * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    pC += 1;
                }
            }

            // alpha scales the sum of the dequantized product and the C term
            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            // eight contiguous floats, then move on by one out_hstep row
            vst1q_f32(p0, _f0);
            vst1q_f32(p0 + 4, _f1);
            pp += 8;
            p0 += out_hstep;
        }
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * out_elempack;

        float32x4_t _descale = vld1q_f32((const float*)descales + i + ii);

        float32x4_t _c0;
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                _c0 = vdupq_n_f32(pC[0] * beta);
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)C + i + ii;
                _c0 = vld1q_f32(pC);
                _c0 = vmulq_n_f32(_c0, beta);
            }
            if (broadcast_type_C == 3)
            {
                pC = (const float*)C + (i + ii) * c_hstep + j * c_elempack;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)C + j;
            }
        }

        int jj = 0;
#if __aarch64__
        // Eight jj positions per iteration for the current group of four lanes
        // (named a..d in the layout comments below; the digit is the jj offset).
        // Reads 32 int32 accumulators from pp, converts them to float with
        // _descale, adds the optional C term, applies alpha and stores either
        // as two interleaved packs (out_elempack == 4) or as eight rows of
        // four floats (out_elempack == 1).
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
#else
            // from
            //      a0 b1 c2 d3
            //      a4 b5 c6 d7
            //      c0 d1 a2 b3
            //      c4 d5 a6 b7
            //      a3 b2 c1 d0
            //      a7 b6 c5 d4
            //      c3 d2 a1 b0
            //      c7 d6 a5 b4

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            {
                // vrev64q followed by vextq(x, x, 2) reverses all four lanes:
                //      _sum4 = d0 c1 b2 a3
                //      _sum5 = d4 c5 b6 a7
                //      _sum6 = b0 a1 d2 c3
                //      _sum7 = b4 a5 d6 c7
                _sum4 = vrev64q_s32(_sum4);
                _sum5 = vrev64q_s32(_sum5);
                _sum6 = vrev64q_s32(_sum6);
                _sum7 = vrev64q_s32(_sum7);
                _sum4 = vextq_s32(_sum4, _sum4, 2);
                _sum5 = vextq_s32(_sum5, _sum5, 2);
                _sum6 = vextq_s32(_sum6, _sum6, 2);
                _sum7 = vextq_s32(_sum7, _sum7, 2);
                // interleave (offsets 0..3 in _t0/_t1, offsets 4..7 in _t2/_t3):
                //      _t0.val[0] = a0 b0 b1 a1    _t0.val[1] = c2 d2 d3 c3
                //      _t1.val[0] = c0 d0 d1 c1    _t1.val[1] = a2 b2 b3 a3
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                // regroup 64-bit halves; odd offsets come out with their lane
                // pairs swapped (for example _sum1 = b1 a1 d1 c1)
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                // undo the pair swap on the odd offsets
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
            }
#endif // __ARM_FEATURE_DOTPROD

            // dequantize: every register holds lanes a..d, so one _descale serves all
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_sum6), _descale);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sum7), _descale);

            if (pC)
            {
                // For types 0/1/2, _c0 was already multiplied by beta where it
                // was set up ahead of the jj loops, so it is only added here.
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    // C is read element-wise from memory and scaled by beta here
                    float32x4_t _c1;
                    float32x4_t _c2;
                    float32x4_t _c3;
                    float32x4_t _c4;
                    float32x4_t _c5;
                    float32x4_t _c6;
                    float32x4_t _c7;
                    if (c_elempack == 4)
                    {
                        // eight consecutive packs of four floats
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + 4);
                        _c2 = vld1q_f32(pC + 8);
                        _c3 = vld1q_f32(pC + 12);
                        _c4 = vld1q_f32(pC + 16);
                        _c5 = vld1q_f32(pC + 20);
                        _c6 = vld1q_f32(pC + 24);
                        _c7 = vld1q_f32(pC + 28);
                        pC += 32;
                    }
                    if (c_elempack == 1)
                    {
                        // eight consecutive floats from each of four rows that
                        // are c_hstep apart
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + 4);
                        _c2 = vld1q_f32(pC + c_hstep);
                        _c3 = vld1q_f32(pC + c_hstep + 4);
                        _c4 = vld1q_f32(pC + c_hstep * 2);
                        _c5 = vld1q_f32(pC + c_hstep * 2 + 4);
                        _c6 = vld1q_f32(pC + c_hstep * 3);
                        _c7 = vld1q_f32(pC + c_hstep * 3 + 4);
                        // NOTE(review): transpose8x4_ps is defined elsewhere;
                        // presumably it leaves _cK holding jj offset K across
                        // the four rows - confirm against the helper.
                        transpose8x4_ps(_c0, _c1, _c2, _c3, _c4, _c5, _c6, _c7);
                        pC += 8;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                        _f4 = vaddq_f32(_f4, _c4);
                        _f5 = vaddq_f32(_f5, _c5);
                        _f6 = vaddq_f32(_f6, _c6);
                        _f7 = vaddq_f32(_f7, _c7);
                    }
                    else
                    {
                        // _f += C * beta
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                        _f4 = vmlaq_f32(_f4, _c4, _beta);
                        _f5 = vmlaq_f32(_f5, _c5, _beta);
                        _f6 = vmlaq_f32(_f6, _c6, _beta);
                        _f7 = vmlaq_f32(_f7, _c7, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    // eight consecutive C values, one per jj offset; each is
                    // scaled by beta and broadcast across the four lanes
                    float32x4_t _cc0 = vld1q_f32(pC);
                    float32x4_t _cc1 = vld1q_f32(pC + 4);
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _cc0 = vmulq_f32(_cc0, _beta);
                        _cc1 = vmulq_f32(_cc1, _beta);
                    }
                    _c0 = vdupq_laneq_f32(_cc0, 0);
                    float32x4_t _c1 = vdupq_laneq_f32(_cc0, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_cc0, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_cc0, 3);
                    float32x4_t _c4 = vdupq_laneq_f32(_cc1, 0);
                    float32x4_t _c5 = vdupq_laneq_f32(_cc1, 1);
                    float32x4_t _c6 = vdupq_laneq_f32(_cc1, 2);
                    float32x4_t _c7 = vdupq_laneq_f32(_cc1, 3);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c4);
                    _f5 = vaddq_f32(_f5, _c5);
                    _f6 = vaddq_f32(_f6, _c6);
                    _f7 = vaddq_f32(_f7, _c7);
                    pC += 8;
                }
            }

            // alpha scales the sum of the dequantized product and the C term
            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
            }

            if (out_elempack == 4)
            {
                // vst4q interleaves the four registers, so memory receives
                // a0 a1 a2 a3, b0 b1 b2 b3, ... ; the pack for offsets 4..7
                // starts out_hstep * 4 floats later
                float32x4x4_t _fa;
                float32x4x4_t _fb;
                _fa.val[0] = _f0;
                _fa.val[1] = _f1;
                _fa.val[2] = _f2;
                _fa.val[3] = _f3;
                _fb.val[0] = _f4;
                _fb.val[1] = _f5;
                _fb.val[2] = _f6;
                _fb.val[3] = _f7;
                vst4q_f32(p0, _fa);
                vst4q_f32(p0 + out_hstep * 4, _fb);
            }
            if (out_elempack == 1)
            {
                // one row of four floats per jj offset, rows out_hstep apart
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + out_hstep, _f1);
                vst1q_f32(p0 + out_hstep * 2, _f2);
                vst1q_f32(p0 + out_hstep * 3, _f3);
                vst1q_f32(p0 + out_hstep * 4, _f4);
                vst1q_f32(p0 + out_hstep * 5, _f5);
                vst1q_f32(p0 + out_hstep * 6, _f6);
                vst1q_f32(p0 + out_hstep * 7, _f7);
            }

            pp += 32;
            p0 += out_hstep * 8;
        }
#endif // __aarch64__
        // Four jj positions per iteration for the current group of four lanes
        // (named a..d in the layout comments below; the digit is the jj offset).
        // Reads 16 int32 accumulators from pp, converts them to float with
        // _descale, adds the optional C term, applies alpha and stores either
        // as one interleaved pack (out_elempack == 4) or as four rows of four
        // floats (out_elempack == 1).
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
#else
            // from
            //      a0 b1 c2 d3
            //      c0 d1 a2 b3
            //      a3 b2 c1 d0
            //      c3 d2 a1 b0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            {
                // vrev64q followed by vextq(x, x, 2) reverses all four lanes:
                //      _sum2 = d0 c1 b2 a3
                //      _sum3 = b0 a1 d2 c3
                _sum2 = vrev64q_s32(_sum2);
                _sum3 = vrev64q_s32(_sum3);
                _sum2 = vextq_s32(_sum2, _sum2, 2);
                _sum3 = vextq_s32(_sum3, _sum3, 2);
                // interleave:
                //      _t0.val[0] = a0 b0 b1 a1    _t0.val[1] = c2 d2 d3 c3
                //      _t1.val[0] = c0 d0 d1 c1    _t1.val[1] = a2 b2 b3 a3
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum3);
                int32x4x2_t _t1 = vzipq_s32(_sum1, _sum2);
                // regroup 64-bit halves:
                //      _sum0 = a0 b0 c0 d0
                //      _sum1 = b1 a1 d1 c1
                //      _sum2 = a2 b2 c2 d2
                //      _sum3 = b3 a3 d3 c3
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                // undo the pair swap on the odd offsets
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
            }
#endif // __ARM_FEATURE_DOTPROD

            // dequantize: every register holds lanes a..d, so one _descale serves all
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);

            if (pC)
            {
                // For types 0/1/2, _c0 was already multiplied by beta where it
                // was set up ahead of the jj loops, so it is only added here.
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    // C is read element-wise from memory and scaled by beta here
                    float32x4_t _c1;
                    float32x4_t _c2;
                    float32x4_t _c3;
                    if (c_elempack == 4)
                    {
                        // four consecutive packs of four floats
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + 4);
                        _c2 = vld1q_f32(pC + 8);
                        _c3 = vld1q_f32(pC + 12);
                        pC += 16;
                    }
                    if (c_elempack == 1)
                    {
                        // four consecutive floats from each of four rows that
                        // are c_hstep apart
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + c_hstep);
                        _c2 = vld1q_f32(pC + c_hstep * 2);
                        _c3 = vld1q_f32(pC + c_hstep * 3);
                        // NOTE(review): transpose4x4_ps is defined elsewhere;
                        // presumably it leaves _cK holding jj offset K across
                        // the four rows - confirm against the helper.
                        transpose4x4_ps(_c0, _c1, _c2, _c3);
                        pC += 4;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        // _f += C * beta
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    // four consecutive C values, one per jj offset; each is
                    // scaled by beta and broadcast across the four lanes
                    float32x4_t _cc = vld1q_f32(pC);
                    _cc = vmulq_n_f32(_cc, beta);
#if __aarch64__
                    _c0 = vdupq_laneq_f32(_cc, 0);
                    float32x4_t _c1 = vdupq_laneq_f32(_cc, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_cc, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_cc, 3);
#else
                    _c0 = vdupq_lane_f32(vget_low_f32(_cc), 0);
                    float32x4_t _c1 = vdupq_lane_f32(vget_low_f32(_cc), 1);
                    float32x4_t _c2 = vdupq_lane_f32(vget_high_f32(_cc), 0);
                    float32x4_t _c3 = vdupq_lane_f32(vget_high_f32(_cc), 1);
#endif
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    pC += 4;
                }
            }

            // alpha scales the sum of the dequantized product and the C term
            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            if (out_elempack == 4)
            {
                // vst4q interleaves the four registers, so memory receives
                // a0 a1 a2 a3, b0 b1 b2 b3, c0 c1 c2 c3, d0 d1 d2 d3
                float32x4x4_t _f;
                _f.val[0] = _f0;
                _f.val[1] = _f1;
                _f.val[2] = _f2;
                _f.val[3] = _f3;
                vst4q_f32(p0, _f);
            }
            if (out_elempack == 1)
            {
                // one row of four floats per jj offset, rows out_hstep apart
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + out_hstep, _f1);
                vst1q_f32(p0 + out_hstep * 2, _f2);
                vst1q_f32(p0 + out_hstep * 3, _f3);
            }

            pp += 16;
            p0 += out_hstep * 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
#else
            // from
            //      a0 b1 c0 d1
            //      a1 b0 c1 d0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            {
                _sum1 = vrev64q_s32(_sum1);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum1);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                _sum1 = vrev64q_s32(_sum1);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    float32x4_t _c1;
                    if (c_elempack == 1)
                    {
                        float32x2_t _cc0 = vld1_f32(pC);
                        float32x2_t _cc1 = vld1_f32(pC + c_hstep);
                        float32x2_t _cc2 = vld1_f32(pC + c_hstep * 2);
                        float32x2_t _cc3 = vld1_f32(pC + c_hstep * 3);
                        float32x4_t _cc01 = vcombine_f32(_cc0, _cc1);
                        float32x4_t _cc23 = vcombine_f32(_cc2, _cc3);
                        float32x4x2_t _cc = vuzpq_f32(_cc01, _cc23);
                        _c0 = _cc.val[0];
                        _c1 = _cc.val[1];
                        pC += 2;
                    }
                    else // if (c_elempack == 4)
                    {
                        _c0 = vld1q_f32(pC);
                        _c1 = vld1q_f32(pC + 4);
                        pC += 8;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x2_t _c = vld1_f32(pC);
                    _c = vmul_n_f32(_c, beta);
                    _c0 = vdupq_lane_f32(_c, 0);
                    float32x4_t _c1 = vdupq_lane_f32(_c, 1);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    pC += 2;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            vst1q_f32(p0, _f0);
            vst1q_f32(p0 + out_hstep, _f1);

            pp += 8;
            p0 += out_hstep * 2;
        }
        for (; jj < max_jj; jj += 1)
        {
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp)), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    if (c_elempack == 1)
                    {
                        _c0 = vsetq_lane_f32(pC[0], _c0, 0);
                        _c0 = vsetq_lane_f32(pC[c_hstep], _c0, 1);
                        _c0 = vsetq_lane_f32(pC[c_hstep * 2], _c0, 2);
                        _c0 = vsetq_lane_f32(pC[c_hstep * 3], _c0, 3);
                        pC += 1;
                    }
                    else // if (c_elempack == 4)
                    {
                        _c0 = vld1q_f32(pC);
                        pC += 4;
                    }
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(pC[0] * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    pC += 1;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            vst1q_f32(p0, _f0);
            pp += 4;
            p0 += out_hstep;
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * out_elempack;

        const float descale0 = descales[i + ii];
        const float descale1 = descales[i + ii + 1];
#if __ARM_NEON
        float32x2_t _descale01 = vld1_f32((const float*)descales + i + ii);
#endif

        float c0;
        float c1;
#if __ARM_NEON
        float32x4_t _c0;
        float32x4_t _c1;
#endif
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                c0 = pC[0] * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)C + i + ii;
                c0 = pC[0] * beta;
                c1 = pC[1] * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
                _c1 = vdupq_n_f32(c1);
#endif
            }
            if (broadcast_type_C == 3)
            {
                // c_elempack == 1
                pC = (const float*)C + (i + ii) * c_hstep + j;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)C + j;
            }
        }

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

            float32x4_t _f0 = vmulq_lane_f32(vcvtq_f32_s32(_sum0), _descale01, 0);
            float32x4_t _f1 = vmulq_lane_f32(vcvtq_f32_s32(_sum1), _descale01, 0);
            float32x4_t _f2 = vmulq_lane_f32(vcvtq_f32_s32(_sum2), _descale01, 1);
            float32x4_t _f3 = vmulq_lane_f32(vcvtq_f32_s32(_sum3), _descale01, 1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c1);
                    _f3 = vaddq_f32(_f3, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    _c0 = vld1q_f32(pC);
                    _c1 = vld1q_f32(pC + 4);
                    float32x4_t _c2 = vld1q_f32(pC + c_hstep);
                    float32x4_t _c3 = vld1q_f32(pC + c_hstep + 4);
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                    pC += 8;
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vld1q_f32(pC);
                    _c1 = vld1q_f32(pC + 4);
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _c0 = vmulq_f32(_c0, _beta);
                        _c1 = vmulq_f32(_c1, _beta);
                    }
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c1);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            if (out_elempack == 4)
            {
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + 4, _f2);
                vst1q_f32(p0 + out_hstep * 4, _f1);
                vst1q_f32(p0 + out_hstep * 4 + 4, _f3);
            }
            if (out_elempack == 1)
            {
                float32x4x2_t _f02 = vzipq_f32(_f0, _f2);
                float32x4x2_t _f13 = vzipq_f32(_f1, _f3);
                vst1_f32(p0, vget_low_f32(_f02.val[0]));
                vst1_f32(p0 + out_hstep, vget_high_f32(_f02.val[0]));
                vst1_f32(p0 + out_hstep * 2, vget_low_f32(_f02.val[1]));
                vst1_f32(p0 + out_hstep * 3, vget_high_f32(_f02.val[1]));
                vst1_f32(p0 + out_hstep * 4, vget_low_f32(_f13.val[0]));
                vst1_f32(p0 + out_hstep * 5, vget_high_f32(_f13.val[0]));
                vst1_f32(p0 + out_hstep * 6, vget_low_f32(_f13.val[1]));
                vst1_f32(p0 + out_hstep * 7, vget_high_f32(_f13.val[1]));
            }

            pp += 16;
            p0 += out_hstep * 8;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            // a0 a1 a2 a3
            // b0 b1 b2 b3

            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

            float32x4_t _f0 = vmulq_lane_f32(vcvtq_f32_s32(_sum0), _descale01, 0);
            float32x4_t _f1 = vmulq_lane_f32(vcvtq_f32_s32(_sum1), _descale01, 1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    _c0 = vld1q_f32(pC);
                    _c1 = vld1q_f32(pC + c_hstep);
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                    pC += 4;
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vld1q_f32(pC);
                    _c0 = vmulq_n_f32(_c0, beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            if (out_elempack == 4)
            {
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + 4, _f1);
            }
            if (out_elempack == 1)
            {
                float32x4x2_t _f01 = vzipq_f32(_f0, _f1);
                vst1_f32(p0, vget_low_f32(_f01.val[0]));
                vst1_f32(p0 + out_hstep, vget_high_f32(_f01.val[0]));
                vst1_f32(p0 + out_hstep * 2, vget_low_f32(_f01.val[1]));
                vst1_f32(p0 + out_hstep * 3, vget_high_f32(_f01.val[1]));
            }

            pp += 8;
            p0 += out_hstep * 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            // a0 a1 b0 b1
            int32x2x2_t _sum0 = vld2_s32(pp);

            float32x4_t _descale = vcombine_f32(_descale01, _descale01);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vcombine_s32(_sum0.val[0], _sum0.val[1])), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    float32x4_t _cc = vzipq_f32(_c0, _c1).val[0];
                    _f0 = vaddq_f32(_f0, _cc);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    float32x2_t _cc0 = vld1_f32(pC);
                    float32x2_t _cc1 = vld1_f32(pC + c_hstep);
                    float32x2x2_t _c01 = vzip_f32(_cc0, _cc1);
                    _c0 = vcombine_f32(_c01.val[0], _c01.val[1]);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 2;
                }
                if (broadcast_type_C == 4)
                {
                    float32x2_t _cc = vld1_f32(pC);
                    float32x2x2_t _c01 = vzip_f32(_cc, _cc);
                    _c0 = vcombine_f32(_c01.val[0], _c01.val[1]);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 2;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            vst1_f32(p0, vget_low_f32(_f0));
            vst1_f32(p0 + out_hstep, vget_high_f32(_f0));

            pp += 4;
            p0 += out_hstep * 2;
        }
#endif // __ARM_NEON
        for (; jj < max_jj; jj += 1)
        {
            float f0 = pp[0] * descale0;
            float f1 = pp[1] * descale1;

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    f0 += c0;
                    f1 += c0;
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    f0 += c0;
                    f1 += c1;
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    f0 += pC[0] * beta;
                    f1 += pC[c_hstep] * beta;
                    pC += 1;
                }
                if (broadcast_type_C == 4)
                {
                    f0 += pC[0] * beta;
                    f1 += pC[0] * beta;
                    pC += 1;
                }
            }

            f0 *= alpha;
            f1 *= alpha;

            p0[0] = f0;
            p0[1] = f1;

            pp += 2;
            p0 += out_hstep;
        }
    }
    for (; ii < max_ii; ii += 1)
    {
        float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * out_elempack;

        const float descale = descales[i + ii];
#if __ARM_NEON
        float32x4_t _descale = vdupq_n_f32(descale);
#endif

        float c0;
#if __ARM_NEON
        float32x4_t _c0;
#endif
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                c0 = pC[0] * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const float*)C + i + ii;
                c0 = pC[0] * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 3)
            {
                // c_elempack == 1
                pC = (const float*)C + (i + ii) * c_hstep + j;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const float*)C + j;
            }
        }

        int jj = 0;
#if __ARM_NEON
        for (; jj + 15 < max_jj; jj += 16)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    _c0 = vld1q_f32(pC);
                    float32x4_t _c1 = vld1q_f32(pC + 4);
                    float32x4_t _c2 = vld1q_f32(pC + 8);
                    float32x4_t _c3 = vld1q_f32(pC + 12);
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                    pC += 16;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            if (out_hstep == 1)
            {
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + 4, _f1);
                vst1q_f32(p0 + 8, _f2);
                vst1q_f32(p0 + 12, _f3);
            }
            else
            {
                if (out_elempack == 4)
                {
                    vst1q_f32(p0, _f0);
                    vst1q_f32(p0 + out_hstep * 4, _f1);
                    vst1q_f32(p0 + out_hstep * 8, _f2);
                    vst1q_f32(p0 + out_hstep * 12, _f3);
                }
                if (out_elempack == 1)
                {
                    p0[0] = vgetq_lane_f32(_f0, 0);
                    p0[out_hstep] = vgetq_lane_f32(_f0, 1);
                    p0[out_hstep * 2] = vgetq_lane_f32(_f0, 2);
                    p0[out_hstep * 3] = vgetq_lane_f32(_f0, 3);
                    p0[out_hstep * 4] = vgetq_lane_f32(_f1, 0);
                    p0[out_hstep * 5] = vgetq_lane_f32(_f1, 1);
                    p0[out_hstep * 6] = vgetq_lane_f32(_f1, 2);
                    p0[out_hstep * 7] = vgetq_lane_f32(_f1, 3);
                    p0[out_hstep * 8] = vgetq_lane_f32(_f2, 0);
                    p0[out_hstep * 9] = vgetq_lane_f32(_f2, 1);
                    p0[out_hstep * 10] = vgetq_lane_f32(_f2, 2);
                    p0[out_hstep * 11] = vgetq_lane_f32(_f2, 3);
                    p0[out_hstep * 12] = vgetq_lane_f32(_f3, 0);
                    p0[out_hstep * 13] = vgetq_lane_f32(_f3, 1);
                    p0[out_hstep * 14] = vgetq_lane_f32(_f3, 2);
                    p0[out_hstep * 15] = vgetq_lane_f32(_f3, 3);
                }
            }

            pp += 16;
            p0 += out_hstep * 16;
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    _c0 = vld1q_f32(pC);
                    float32x4_t _c1 = vld1q_f32(pC + 4);
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            if (out_hstep == 1)
            {
                vst1q_f32(p0, _f0);
                vst1q_f32(p0 + 4, _f1);
            }
            else
            {
                if (out_elempack == 4)
                {
                    vst1q_f32(p0, _f0);
                    vst1q_f32(p0 + out_hstep * 4, _f1);
                }
                if (out_elempack == 1)
                {
                    p0[0] = vgetq_lane_f32(_f0, 0);
                    p0[out_hstep] = vgetq_lane_f32(_f0, 1);
                    p0[out_hstep * 2] = vgetq_lane_f32(_f0, 2);
                    p0[out_hstep * 3] = vgetq_lane_f32(_f0, 3);
                    p0[out_hstep * 4] = vgetq_lane_f32(_f1, 0);
                    p0[out_hstep * 5] = vgetq_lane_f32(_f1, 1);
                    p0[out_hstep * 6] = vgetq_lane_f32(_f1, 2);
                    p0[out_hstep * 7] = vgetq_lane_f32(_f1, 3);
                }
            }

            pp += 8;
            p0 += out_hstep * 8;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp)), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    _c0 = vld1q_f32(pC);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 4;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            if (out_hstep == 1)
            {
                vst1q_f32(p0, _f0);
            }
            else
            {
                if (out_elempack == 4)
                {
                    vst1q_f32(p0, _f0);
                }
                if (out_elempack == 1)
                {
                    p0[0] = vgetq_lane_f32(_f0, 0);
                    p0[out_hstep] = vgetq_lane_f32(_f0, 1);
                    p0[out_hstep * 2] = vgetq_lane_f32(_f0, 2);
                    p0[out_hstep * 3] = vgetq_lane_f32(_f0, 3);
                }
            }

            pp += 4;
            p0 += out_hstep * 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            float32x2_t _f0 = vmul_f32(vcvt_f32_s32(vld1_s32(pp)), vget_low_f32(_descale));

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vadd_f32(_f0, vget_low_f32(_c0));
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    float32x2_t _c = vld1_f32(pC);
                    _f0 = vmla_n_f32(_f0, _c, beta);
                    pC += 2;
                }
            }

            _f0 = vmul_n_f32(_f0, alpha);

            if (out_hstep == 1)
            {
                vst1_f32(p0, _f0);
            }
            else
            {
                p0[0] = vget_lane_f32(_f0, 0);
                p0[out_hstep] = vget_lane_f32(_f0, 1);
            }

            pp += 2;
            p0 += out_hstep * 2;
        }
#endif // __ARM_NEON
        for (; jj < max_jj; jj += 1)
        {
            float f0 = pp[0] * descale;

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    f0 += c0;
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    f0 += pC[0] * beta;
                    pC += 1;
                }
            }

            f0 *= alpha;

            p0[0] = f0;

            pp += 1;
            p0 += out_hstep;
        }
    }
}

static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, Mat& topT_tile, int i, int max_ii, int j, int max_jj, int k, int max_kk)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        gemm_transB_packed_tile_int8_i8mm(AT_tile, BT_tile, topT_tile, i, max_ii, j, max_jj, k, max_kk);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        gemm_transB_packed_tile_int8_asimddp(AT_tile, BT_tile, topT_tile, i, max_ii, j, max_jj, k, max_kk);
        return;
    }
#endif

    // NCNN_LOGE("gemm_transB_packed_tile_int8 %d %d %d %d %d %d", i, max_ii, j, max_jj, k, max_kk);

    const signed char* pAT = AT_tile;
    const signed char* pBT = BT_tile;

    int* outptr = topT_tile;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        const signed char* pB = pBT;

        int jj = 0;
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            const signed char* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            asm volatile(
#if !__ARM_FEATURE_MATMUL_INT8
                "cmp    %w7, #0                     \n"
                "beq    0f                          \n"

                "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                "sub    %0, %0, #192                \n"
                "b      1f                          \n"

                "0:                                 \n"
                "eor    v16.16b, v16.16b, v16.16b   \n"
                "eor    v17.16b, v17.16b, v17.16b   \n"
                "eor    v18.16b, v18.16b, v18.16b   \n"
                "eor    v19.16b, v19.16b, v19.16b   \n"
                "eor    v20.16b, v20.16b, v20.16b   \n"
                "eor    v21.16b, v21.16b, v21.16b   \n"
                "eor    v22.16b, v22.16b, v22.16b   \n"
                "eor    v23.16b, v23.16b, v23.16b   \n"
                "eor    v24.16b, v24.16b, v24.16b   \n"
                "eor    v25.16b, v25.16b, v25.16b   \n"
                "eor    v26.16b, v26.16b, v26.16b   \n"
                "eor    v27.16b, v27.16b, v27.16b   \n"
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v29.16b, v29.16b, v29.16b   \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"
                "eor    v31.16b, v31.16b, v31.16b   \n"

                "1:                                 \n"
#endif // !__ARM_FEATURE_MATMUL_INT8

#if __ARM_FEATURE_DOTPROD
                // Dot-product capable build: the main loop runs max_kk >> 3 times and reads
                // 64 bytes from %1 and 64 bytes from %2 per iteration. A zero trip count
                // jumps straight to label 101.
                "lsr    w4, %w6, #3                 \n" // w4 = max_kk >> 3
                "cmp    w4, #0                      \n"
                "beq    101f                        \n"

#if __ARM_FEATURE_MATMUL_INT8
                // smmla variant: v0-v15 are temporary accumulators, cleared before the loop
                "eor    v0.16b, v0.16b, v0.16b      \n"
                "eor    v1.16b, v1.16b, v1.16b      \n"
                "eor    v2.16b, v2.16b, v2.16b      \n"
                "eor    v3.16b, v3.16b, v3.16b      \n"
                "eor    v4.16b, v4.16b, v4.16b      \n"
                "eor    v5.16b, v5.16b, v5.16b      \n"
                "eor    v6.16b, v6.16b, v6.16b      \n"
                "eor    v7.16b, v7.16b, v7.16b      \n"
                "eor    v8.16b, v8.16b, v8.16b      \n"
                "eor    v9.16b, v9.16b, v9.16b      \n"
                "eor    v10.16b, v10.16b, v10.16b   \n"
                "eor    v11.16b, v11.16b, v11.16b   \n"
                "eor    v12.16b, v12.16b, v12.16b   \n"
                "eor    v13.16b, v13.16b, v13.16b   \n"
                "eor    v14.16b, v14.16b, v14.16b   \n"
                "eor    v15.16b, v15.16b, v15.16b   \n"

                "2:                                 \n"
                // v16-v19 <- 64 bytes from pA, v20-v23 <- 64 bytes from pB;
                // every (v16..v19) x (v20..v23) pairing feeds one smmla accumulator
                "ld1    {v16.16b, v17.16b, v18.16b, v19.16b}, [%1], #64 \n"
                "ld1    {v20.16b, v21.16b, v22.16b, v23.16b}, [%2], #64 \n"
                "smmla  v0.4s, v16.16b, v20.16b     \n"
                "smmla  v1.4s, v17.16b, v20.16b     \n"
                "smmla  v2.4s, v16.16b, v21.16b     \n"
                "smmla  v3.4s, v17.16b, v21.16b     \n"
                "smmla  v4.4s, v18.16b, v20.16b     \n"
                "smmla  v5.4s, v19.16b, v20.16b     \n"
                "smmla  v6.4s, v18.16b, v21.16b     \n"
                "smmla  v7.4s, v19.16b, v21.16b     \n"
                // loop counter update is interleaved with the multiplies; the flags it
                // sets are consumed by the bne at the bottom of the loop
                "subs   w4, w4, #1                  \n"
                "smmla  v8.4s, v16.16b, v22.16b     \n"
                "smmla  v9.4s, v17.16b, v22.16b     \n"
                "smmla  v10.4s, v16.16b, v23.16b    \n"
                "smmla  v11.4s, v17.16b, v23.16b    \n"
                "smmla  v12.4s, v18.16b, v22.16b    \n"
                "smmla  v13.4s, v19.16b, v22.16b    \n"
                "smmla  v14.4s, v18.16b, v23.16b    \n"
                "smmla  v15.4s, v19.16b, v23.16b    \n"
                "bne    2b                          \n"

                // uzp1 / uzp2 gather the even / odd 32-bit lanes of each register pair,
                // moving the smmla results from v0-v15 into the v16-v31 accumulators
                "uzp1   v16.4s, v0.4s, v1.4s        \n"
                "uzp2   v17.4s, v0.4s, v1.4s        \n"
                "uzp1   v18.4s, v2.4s, v3.4s        \n"
                "uzp2   v19.4s, v2.4s, v3.4s        \n"
                "uzp1   v20.4s, v4.4s, v5.4s        \n"
                "uzp2   v21.4s, v4.4s, v5.4s        \n"
                "uzp1   v22.4s, v6.4s, v7.4s        \n"
                "uzp2   v23.4s, v6.4s, v7.4s        \n"
                "uzp1   v24.4s, v8.4s, v9.4s        \n"
                "uzp2   v25.4s, v8.4s, v9.4s        \n"
                "uzp1   v26.4s, v10.4s, v11.4s      \n"
                "uzp2   v27.4s, v10.4s, v11.4s      \n"
                "uzp1   v28.4s, v12.4s, v13.4s      \n"
                "uzp2   v29.4s, v12.4s, v13.4s      \n"
                "uzp1   v30.4s, v14.4s, v15.4s      \n"
                "uzp2   v31.4s, v14.4s, v15.4s      \n"

                // k == 0: v16-v31 are already the running totals, continue at label 1.
                // k != 0: add the 256 bytes of partial sums read from outptr first.
                "cmp    %w7, #0                     \n"
                "beq    1f                          \n"

                "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64   \n"
                "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64   \n"
                "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0], #64 \n"
                "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%0]    \n"
                // undo the 3 x 64 byte post-increments so %0 points at the tile start again
                "sub    %0, %0, #192                \n"
                "add    v16.4s, v16.4s, v0.4s       \n"
                "add    v17.4s, v17.4s, v1.4s       \n"
                "add    v18.4s, v18.4s, v2.4s       \n"
                "add    v19.4s, v19.4s, v3.4s       \n"
                "add    v20.4s, v20.4s, v4.4s       \n"
                "add    v21.4s, v21.4s, v5.4s       \n"
                "add    v22.4s, v22.4s, v6.4s       \n"
                "add    v23.4s, v23.4s, v7.4s       \n"
                "add    v24.4s, v24.4s, v8.4s       \n"
                "add    v25.4s, v25.4s, v9.4s       \n"
                "add    v26.4s, v26.4s, v10.4s      \n"
                "add    v27.4s, v27.4s, v11.4s      \n"
                "add    v28.4s, v28.4s, v12.4s      \n"
                "add    v29.4s, v29.4s, v13.4s      \n"
                "add    v30.4s, v30.4s, v14.4s      \n"
                "add    v31.4s, v31.4s, v15.4s      \n"
                "b      1f                          \n"
#else  // __ARM_FEATURE_MATMUL_INT8
                // sdot variant: accumulates directly into v16-v31.
                // v0-v3 <- 64 bytes from pA, v4-v7 <- 64 bytes from pB.
                // First half uses v0/v1 with the 4-byte lanes of v4/v5,
                // second half uses v2/v3 with the lanes of v6/v7.
                "2:                                 \n"
                "ld1    {v0.16b, v1.16b, v2.16b, v3.16b}, [%1], #64 \n"
                "ld1    {v4.16b, v5.16b, v6.16b, v7.16b}, [%2], #64 \n"
                "sdot   v16.4s, v0.16b, v4.4b[0]    \n"
                "sdot   v17.4s, v0.16b, v4.4b[1]    \n"
                "sdot   v18.4s, v0.16b, v4.4b[2]    \n"
                "sdot   v19.4s, v0.16b, v4.4b[3]    \n"
                "sdot   v20.4s, v1.16b, v4.4b[0]    \n"
                "sdot   v21.4s, v1.16b, v4.4b[1]    \n"
                "sdot   v22.4s, v1.16b, v4.4b[2]    \n"
                "sdot   v23.4s, v1.16b, v4.4b[3]    \n"
                "sdot   v24.4s, v0.16b, v5.4b[0]    \n"
                "sdot   v25.4s, v0.16b, v5.4b[1]    \n"
                "sdot   v26.4s, v0.16b, v5.4b[2]    \n"
                "sdot   v27.4s, v0.16b, v5.4b[3]    \n"
                "sdot   v28.4s, v1.16b, v5.4b[0]    \n"
                "sdot   v29.4s, v1.16b, v5.4b[1]    \n"
                "sdot   v30.4s, v1.16b, v5.4b[2]    \n"
                "sdot   v31.4s, v1.16b, v5.4b[3]    \n"
                "subs   w4, w4, #1                  \n"
                "sdot   v16.4s, v2.16b, v6.4b[0]    \n"
                "sdot   v17.4s, v2.16b, v6.4b[1]    \n"
                "sdot   v18.4s, v2.16b, v6.4b[2]    \n"
                "sdot   v19.4s, v2.16b, v6.4b[3]    \n"
                "sdot   v20.4s, v3.16b, v6.4b[0]    \n"
                "sdot   v21.4s, v3.16b, v6.4b[1]    \n"
                "sdot   v22.4s, v3.16b, v6.4b[2]    \n"
                "sdot   v23.4s, v3.16b, v6.4b[3]    \n"
                "sdot   v24.4s, v2.16b, v7.4b[0]    \n"
                "sdot   v25.4s, v2.16b, v7.4b[1]    \n"
                "sdot   v26.4s, v2.16b, v7.4b[2]    \n"
                "sdot   v27.4s, v2.16b, v7.4b[3]    \n"
                "sdot   v28.4s, v3.16b, v7.4b[0]    \n"
                "sdot   v29.4s, v3.16b, v7.4b[1]    \n"
                "sdot   v30.4s, v3.16b, v7.4b[2]    \n"
                "sdot   v31.4s, v3.16b, v7.4b[3]    \n"
                "bne    2b                          \n"
#endif // __ARM_FEATURE_MATMUL_INT8

                "101:                               \n"
#if __ARM_FEATURE_MATMUL_INT8
                // Only reached through the "beq 101f" above (max_kk >> 3 == 0): the smmla
                // path after the loop always branches to label 1. v16-v31 have not been
                // initialised yet, so load them from outptr (k != 0) or clear them (k == 0).
                "cmp    %w7, #0                     \n"
                "beq    0f                          \n"

                "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                "ld1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0]      \n"
                "sub    %0, %0, #192                \n"
                "b      1f                          \n"

                "0:                                 \n"
                "eor    v16.16b, v16.16b, v16.16b   \n"
                "eor    v17.16b, v17.16b, v17.16b   \n"
                "eor    v18.16b, v18.16b, v18.16b   \n"
                "eor    v19.16b, v19.16b, v19.16b   \n"
                "eor    v20.16b, v20.16b, v20.16b   \n"
                "eor    v21.16b, v21.16b, v21.16b   \n"
                "eor    v22.16b, v22.16b, v22.16b   \n"
                "eor    v23.16b, v23.16b, v23.16b   \n"
                "eor    v24.16b, v24.16b, v24.16b   \n"
                "eor    v25.16b, v25.16b, v25.16b   \n"
                "eor    v26.16b, v26.16b, v26.16b   \n"
                "eor    v27.16b, v27.16b, v27.16b   \n"
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v29.16b, v29.16b, v29.16b   \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"
                "eor    v31.16b, v31.16b, v31.16b   \n"
                "1:                                 \n"
#endif // __ARM_FEATURE_MATMUL_INT8

                "and    w4, %w6, #4                 \n" // w4 = remain = max_kk & 4
                "cmp    w4, #0                      \n"
                "beq    3f                          \n"

                // kk += 4 part
                // executed at most once: 32 bytes from each of pA and pB, same sdot
                // register pairing as the first half of the 8-deep sdot loop
                "ld1    {v0.16b, v1.16b}, [%1], #32 \n"
                "ld1    {v2.16b, v3.16b}, [%2], #32 \n"
                "sdot   v16.4s, v0.16b, v2.4b[0]    \n"
                "sdot   v17.4s, v0.16b, v2.4b[1]    \n"
                "sdot   v18.4s, v0.16b, v2.4b[2]    \n"
                "sdot   v19.4s, v0.16b, v2.4b[3]    \n"
                "sdot   v20.4s, v1.16b, v2.4b[0]    \n"
                "sdot   v21.4s, v1.16b, v2.4b[1]    \n"
                "sdot   v22.4s, v1.16b, v2.4b[2]    \n"
                "sdot   v23.4s, v1.16b, v2.4b[3]    \n"
                "sdot   v24.4s, v0.16b, v3.4b[0]    \n"
                "sdot   v25.4s, v0.16b, v3.4b[1]    \n"
                "sdot   v26.4s, v0.16b, v3.4b[2]    \n"
                "sdot   v27.4s, v0.16b, v3.4b[3]    \n"
                "sdot   v28.4s, v1.16b, v3.4b[0]    \n"
                "sdot   v29.4s, v1.16b, v3.4b[1]    \n"
                "sdot   v30.4s, v1.16b, v3.4b[2]    \n"
                "sdot   v31.4s, v1.16b, v3.4b[3]    \n"
#else  // __ARM_FEATURE_DOTPROD
                // Build without dot-product instructions: the loop runs max_kk >> 2 times
                // and reads 32 bytes from each of pA and pB per iteration.
                "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    3f                          \n"

                // Scheme: smull/smull2 form int16 products, smlal/smlal2 add a second
                // product into the same int16 lanes, sadalp pairwise-adds the int16 lanes
                // into the int32 accumulators. rev64 and ext create permuted copies of
                // the loaded vectors so that each accumulator sees a different pairing.
                // NOTE(review): v16-v31 therefore hold a permuted layout in this path;
                // presumably the consumer of outptr undoes it - confirm against the caller.
                "2:                                 \n"
                "ld1    {v0.16b, v1.16b}, [%1], #32 \n"
                "ld1    {v4.16b, v5.16b}, [%2], #32 \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull2 v9.8h, v0.16b, v4.16b       \n"
                "rev64  v2.4s, v0.4s                \n"
                "smull  v10.8h, v2.8b, v4.8b        \n"
                "smull2 v11.8h, v2.16b, v4.16b      \n"
                "rev64  v6.8h, v4.8h                \n"
                "smull  v12.8h, v0.8b, v6.8b        \n"
                "smull2 v13.8h, v0.16b, v6.16b      \n"
                "rev64  v3.4s, v1.4s                \n"
                "smull  v14.8h, v2.8b, v6.8b        \n"
                "smull2 v15.8h, v2.16b, v6.16b      \n"
                "rev64  v7.8h, v5.8h                \n"
                "smlal  v8.8h, v1.8b, v5.8b         \n"
                "smlal2 v9.8h, v1.16b, v5.16b       \n"
                "smlal  v10.8h, v3.8b, v5.8b        \n"
                "smlal2 v11.8h, v3.16b, v5.16b      \n"
                "smlal  v12.8h, v1.8b, v7.8b        \n"
                "smlal2 v13.8h, v1.16b, v7.16b      \n"
                "smlal  v14.8h, v3.8b, v7.8b        \n"
                "smlal2 v15.8h, v3.16b, v7.16b      \n"
                // swap the 64-bit halves of v0/v2 (and v1/v3 below) for the second pass
                "ext    v0.16b, v0.16b, v0.16b, #8  \n"
                "ext    v2.16b, v2.16b, v2.16b, #8  \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v20.4s, v10.8h              \n"
                "sadalp v21.4s, v11.8h              \n"
                "ext    v1.16b, v1.16b, v1.16b, #8  \n"
                "ext    v3.16b, v3.16b, v3.16b, #8  \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull2 v9.8h, v0.16b, v4.16b       \n"
                "smull  v10.8h, v2.8b, v4.8b        \n"
                "smull2 v11.8h, v2.16b, v4.16b      \n"
                "sadalp v24.4s, v12.8h              \n"
                "sadalp v25.4s, v13.8h              \n"
                "sadalp v28.4s, v14.8h              \n"
                "sadalp v29.4s, v15.8h              \n"
                "smull  v12.8h, v0.8b, v6.8b        \n"
                "smull2 v13.8h, v0.16b, v6.16b      \n"
                "smull  v14.8h, v2.8b, v6.8b        \n"
                "smull2 v15.8h, v2.16b, v6.16b      \n"
                "smlal  v8.8h, v1.8b, v5.8b         \n"
                "smlal2 v9.8h, v1.16b, v5.16b       \n"
                "smlal  v10.8h, v3.8b, v5.8b        \n"
                "smlal2 v11.8h, v3.16b, v5.16b      \n"
                "smlal  v12.8h, v1.8b, v7.8b        \n"
                "smlal2 v13.8h, v1.16b, v7.16b      \n"
                "smlal  v14.8h, v3.8b, v7.8b        \n"
                "smlal2 v15.8h, v3.16b, v7.16b      \n"
                "subs   w4, w4, #1                  \n"
                "sadalp v18.4s, v8.8h               \n"
                "sadalp v19.4s, v9.8h               \n"
                "sadalp v22.4s, v10.8h              \n"
                "sadalp v23.4s, v11.8h              \n"
                "sadalp v26.4s, v12.8h              \n"
                "sadalp v27.4s, v13.8h              \n"
                "sadalp v30.4s, v14.8h              \n"
                "sadalp v31.4s, v15.8h              \n"
                "bne    2b                          \n"
#endif // __ARM_FEATURE_DOTPROD

                "3:                                 \n"
                "and    w4, %w6, #2                 \n" // w4 = remain = max_kk & 2
                "cmp    w4, #0                      \n"
                "beq    4f                          \n"

                // kk += 2 part
#if __ARM_FEATURE_DOTPROD
                // Depth-2 remainder, dot-product layout: 16 bytes from each of pA and pB.
                // Each 16-bit lane of v1 is broadcast, multiplied against the low (smull)
                // and high (smull2) half of v0, and the int16 product pairs are folded
                // into the int32 accumulators with sadalp.
                "ld1    {v0.16b}, [%1], #16         \n"
                "ld1    {v1.16b}, [%2], #16         \n"
                // lanes h[0..3] of v1 -> accumulators v16-v23
                "dup    v4.8h, v1.h[0]              \n"
                "dup    v5.8h, v1.h[1]              \n"
                "dup    v6.8h, v1.h[2]              \n"
                "dup    v7.8h, v1.h[3]              \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull  v9.8h, v0.8b, v5.8b         \n"
                "smull  v10.8h, v0.8b, v6.8b        \n"
                "smull  v11.8h, v0.8b, v7.8b        \n"
                "smull2 v12.8h, v0.16b, v4.16b      \n"
                "smull2 v13.8h, v0.16b, v5.16b      \n"
                "smull2 v14.8h, v0.16b, v6.16b      \n"
                "smull2 v15.8h, v0.16b, v7.16b      \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
                "sadalp v20.4s, v12.8h              \n"
                "sadalp v21.4s, v13.8h              \n"
                "sadalp v22.4s, v14.8h              \n"
                "sadalp v23.4s, v15.8h              \n"
                // lanes h[4..7] of v1 -> accumulators v24-v31
                "dup    v4.8h, v1.h[4]              \n"
                "dup    v5.8h, v1.h[5]              \n"
                "dup    v6.8h, v1.h[6]              \n"
                "dup    v7.8h, v1.h[7]              \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull  v9.8h, v0.8b, v5.8b         \n"
                "smull  v10.8h, v0.8b, v6.8b        \n"
                "smull  v11.8h, v0.8b, v7.8b        \n"
                "smull2 v12.8h, v0.16b, v4.16b      \n"
                "smull2 v13.8h, v0.16b, v5.16b      \n"
                "smull2 v14.8h, v0.16b, v6.16b      \n"
                "smull2 v15.8h, v0.16b, v7.16b      \n"
                "sadalp v24.4s, v8.8h               \n"
                "sadalp v25.4s, v9.8h               \n"
                "sadalp v26.4s, v10.8h              \n"
                "sadalp v27.4s, v11.8h              \n"
                "sadalp v28.4s, v12.8h              \n"
                "sadalp v29.4s, v13.8h              \n"
                "sadalp v30.4s, v14.8h              \n"
                "sadalp v31.4s, v15.8h              \n"
#else  // __ARM_FEATURE_DOTPROD
                // Depth-2 remainder without dot-product instructions: 16 bytes from each of
                // pA and pB. v1 / v3 are permuted copies of v0 / v2 (rev64 on 32-bit and
                // 16-bit lanes respectively); the accumulator targets match those of the
                // depth-4 loop (first pass v16,17,20,21,24,25,28,29, second pass the rest).
                "ld1    {v0.16b}, [%1], #16         \n"
                "ld1    {v2.16b}, [%2], #16         \n"
                "rev64  v1.4s, v0.4s                \n"
                "rev64  v3.8h, v2.8h                \n"
                "smull  v8.8h, v0.8b, v2.8b         \n"
                "smull2 v9.8h, v0.16b, v2.16b       \n"
                "smull  v10.8h, v1.8b, v2.8b        \n"
                "smull2 v11.8h, v1.16b, v2.16b      \n"
                "smull  v12.8h, v0.8b, v3.8b        \n"
                "smull2 v13.8h, v0.16b, v3.16b      \n"
                "smull  v14.8h, v1.8b, v3.8b        \n"
                "smull2 v15.8h, v1.16b, v3.16b      \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v20.4s, v10.8h              \n"
                "sadalp v21.4s, v11.8h              \n"
                "sadalp v24.4s, v12.8h              \n"
                "sadalp v25.4s, v13.8h              \n"
                "sadalp v28.4s, v14.8h              \n"
                "sadalp v29.4s, v15.8h              \n"
                // swap the 64-bit halves of v0 and v1, then repeat for the other accumulators
                "ext    v0.16b, v0.16b, v0.16b, #8  \n"
                "ext    v1.16b, v1.16b, v1.16b, #8  \n"
                "smull  v8.8h, v0.8b, v2.8b         \n"
                "smull2 v9.8h, v0.16b, v2.16b       \n"
                "smull  v10.8h, v1.8b, v2.8b        \n"
                "smull2 v11.8h, v1.16b, v2.16b      \n"
                "smull  v12.8h, v0.8b, v3.8b        \n"
                "smull2 v13.8h, v0.16b, v3.16b      \n"
                "smull  v14.8h, v1.8b, v3.8b        \n"
                "smull2 v15.8h, v1.16b, v3.16b      \n"
                "sadalp v18.4s, v8.8h               \n"
                "sadalp v19.4s, v9.8h               \n"
                "sadalp v22.4s, v10.8h              \n"
                "sadalp v23.4s, v11.8h              \n"
                "sadalp v26.4s, v12.8h              \n"
                "sadalp v27.4s, v13.8h              \n"
                "sadalp v30.4s, v14.8h              \n"
                "sadalp v31.4s, v15.8h              \n"
#endif // __ARM_FEATURE_DOTPROD

                "4:                                 \n"
                "and    w4, %w6, #1                 \n" // w4 = remain = max_kk & 1
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // kk += 1 part
#if __ARM_FEATURE_DOTPROD
                // Depth-1 remainder, dot-product layout: 8 bytes from each of pA and pB.
                // Every byte of v1 is broadcast and multiplied with v0; the eight int16
                // products are widened into two accumulators (saddw = low four lanes,
                // saddw2 = high four lanes).
                "ld1    {v0.8b}, [%1], #8           \n"
                "ld1    {v1.8b}, [%2], #8           \n"
                "dup    v8.8b, v1.b[0]              \n"
                "dup    v9.8b, v1.b[1]              \n"
                "dup    v10.8b, v1.b[2]             \n"
                "dup    v11.8b, v1.b[3]             \n"
                "dup    v12.8b, v1.b[4]             \n"
                "dup    v13.8b, v1.b[5]             \n"
                "dup    v14.8b, v1.b[6]             \n"
                "dup    v15.8b, v1.b[7]             \n"
                // products overwrite the broadcast registers in place
                "smull  v8.8h, v0.8b, v8.8b         \n"
                "smull  v9.8h, v0.8b, v9.8b         \n"
                "smull  v10.8h, v0.8b, v10.8b       \n"
                "smull  v11.8h, v0.8b, v11.8b       \n"
                "smull  v12.8h, v0.8b, v12.8b       \n"
                "smull  v13.8h, v0.8b, v13.8b       \n"
                "smull  v14.8h, v0.8b, v14.8b       \n"
                "smull  v15.8h, v0.8b, v15.8b       \n"
                "saddw  v16.4s, v16.4s, v8.4h       \n"
                "saddw  v17.4s, v17.4s, v9.4h       \n"
                "saddw  v18.4s, v18.4s, v10.4h      \n"
                "saddw  v19.4s, v19.4s, v11.4h      \n"
                "saddw2 v20.4s, v20.4s, v8.8h       \n"
                "saddw2 v21.4s, v21.4s, v9.8h       \n"
                "saddw2 v22.4s, v22.4s, v10.8h      \n"
                "saddw2 v23.4s, v23.4s, v11.8h      \n"
                "saddw  v24.4s, v24.4s, v12.4h      \n"
                "saddw  v25.4s, v25.4s, v13.4h      \n"
                "saddw  v26.4s, v26.4s, v14.4h      \n"
                "saddw  v27.4s, v27.4s, v15.4h      \n"
                "saddw2 v28.4s, v28.4s, v12.8h      \n"
                "saddw2 v29.4s, v29.4s, v13.8h      \n"
                "saddw2 v30.4s, v30.4s, v14.8h      \n"
                "saddw2 v31.4s, v31.4s, v15.8h      \n"
#else  // __ARM_FEATURE_DOTPROD
                // Depth-1 remainder without dot-product instructions: 8 bytes from each of
                // pA and pB. v1-v3 are permutations of v0 (ext by 4 bytes, rev32 / rev64 on
                // 16-bit lanes) and v5 is v4 with the bytes of each 32-bit word reversed,
                // so the eight products line up with the permuted accumulator layout
                // used by the depth-4 and depth-2 code of this variant.
                "ld1    {v0.8b}, [%1], #8           \n"
                "ld1    {v4.8b}, [%2], #8           \n"
                "ext    v1.8b, v0.8b, v0.8b, #4     \n"
                "rev32  v2.4h, v0.4h                \n"
                "rev64  v3.4h, v0.4h                \n"
                "rev32  v5.8b, v4.8b                \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull  v9.8h, v1.8b, v4.8b         \n"
                "smull  v10.8h, v2.8b, v4.8b        \n"
                "smull  v11.8h, v3.8b, v4.8b        \n"
                "smull  v12.8h, v0.8b, v5.8b        \n"
                "smull  v13.8h, v1.8b, v5.8b        \n"
                "smull  v14.8h, v2.8b, v5.8b        \n"
                "smull  v15.8h, v3.8b, v5.8b        \n"
                // each int16x8 product feeds two consecutive accumulators (low / high half)
                "saddw  v16.4s, v16.4s, v8.4h       \n"
                "saddw2 v17.4s, v17.4s, v8.8h       \n"
                "saddw  v18.4s, v18.4s, v9.4h       \n"
                "saddw2 v19.4s, v19.4s, v9.8h       \n"
                "saddw  v20.4s, v20.4s, v10.4h      \n"
                "saddw2 v21.4s, v21.4s, v10.8h      \n"
                "saddw  v22.4s, v22.4s, v11.4h      \n"
                "saddw2 v23.4s, v23.4s, v11.8h      \n"
                "saddw  v24.4s, v24.4s, v12.4h      \n"
                "saddw2 v25.4s, v25.4s, v12.8h      \n"
                "saddw  v26.4s, v26.4s, v13.4h      \n"
                "saddw2 v27.4s, v27.4s, v13.8h      \n"
                "saddw  v28.4s, v28.4s, v14.4h      \n"
                "saddw2 v29.4s, v29.4s, v14.8h      \n"
                "saddw  v30.4s, v30.4s, v15.4h      \n"
                "saddw2 v31.4s, v31.4s, v15.8h      \n"
#endif // __ARM_FEATURE_DOTPROD

                "5:                                 \n"
                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
                "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%0], #64 \n"
                "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%0], #64 \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB)      // %2
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "r"(max_kk), // %6
                "r"(k)       // %7
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else // NCNN_GNU_INLINE_ASM
            // Intrinsics fallback used when GNU inline asm is disabled.
            // Sixteen int32x4 accumulators (64 int32 values in total).
            int32x4_t _sum0;
            int32x4_t _sum1;
            int32x4_t _sum2;
            int32x4_t _sum3;
            int32x4_t _sum4;
            int32x4_t _sum5;
            int32x4_t _sum6;
            int32x4_t _sum7;
            int32x4_t _sum8;
            int32x4_t _sum9;
            int32x4_t _suma;
            int32x4_t _sumb;
            int32x4_t _sumc;
            int32x4_t _sumd;
            int32x4_t _sume;
            int32x4_t _sumf;

#if __ARM_FEATURE_MATMUL_INT8
            {
                // Always start from zero here: in this configuration the values stored at
                // outptr are merged in after the 8-deep vmmlaq_s32 loop further down.
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
                _sum2 = vdupq_n_s32(0);
                _sum3 = vdupq_n_s32(0);
                _sum4 = vdupq_n_s32(0);
                _sum5 = vdupq_n_s32(0);
                _sum6 = vdupq_n_s32(0);
                _sum7 = vdupq_n_s32(0);
                _sum8 = vdupq_n_s32(0);
                _sum9 = vdupq_n_s32(0);
                _suma = vdupq_n_s32(0);
                _sumb = vdupq_n_s32(0);
                _sumc = vdupq_n_s32(0);
                _sumd = vdupq_n_s32(0);
                _sume = vdupq_n_s32(0);
                _sumf = vdupq_n_s32(0);
            }
#else  // __ARM_FEATURE_MATMUL_INT8
            if (k == 0)
            {
                // k == 0: nothing accumulated yet, start from zero
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
                _sum2 = vdupq_n_s32(0);
                _sum3 = vdupq_n_s32(0);
                _sum4 = vdupq_n_s32(0);
                _sum5 = vdupq_n_s32(0);
                _sum6 = vdupq_n_s32(0);
                _sum7 = vdupq_n_s32(0);
                _sum8 = vdupq_n_s32(0);
                _sum9 = vdupq_n_s32(0);
                _suma = vdupq_n_s32(0);
                _sumb = vdupq_n_s32(0);
                _sumc = vdupq_n_s32(0);
                _sumd = vdupq_n_s32(0);
                _sume = vdupq_n_s32(0);
                _sumf = vdupq_n_s32(0);
            }
            else
            {
                // k != 0: resume from the 64 int32 partial sums found at outptr
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
                _sum2 = vld1q_s32(outptr + 8);
                _sum3 = vld1q_s32(outptr + 12);
                _sum4 = vld1q_s32(outptr + 16);
                _sum5 = vld1q_s32(outptr + 20);
                _sum6 = vld1q_s32(outptr + 24);
                _sum7 = vld1q_s32(outptr + 28);
                _sum8 = vld1q_s32(outptr + 32);
                _sum9 = vld1q_s32(outptr + 36);
                _suma = vld1q_s32(outptr + 40);
                _sumb = vld1q_s32(outptr + 44);
                _sumc = vld1q_s32(outptr + 48);
                _sumd = vld1q_s32(outptr + 52);
                _sume = vld1q_s32(outptr + 56);
                _sumf = vld1q_s32(outptr + 60);
            }
#endif // __ARM_FEATURE_MATMUL_INT8

            // kk counts the depth elements consumed so far; it is shared by the 8-deep loop
            // below and the smaller-step loops that follow.
            int kk = 0;
#if __ARM_FEATURE_MATMUL_INT8
            {
                // 8-deep steps with vmmlaq_s32: 64 bytes of pA and 64 bytes of pB per step.
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA0 = vld1q_s8(pA);
                    int8x16_t _pA1 = vld1q_s8(pA + 16);
                    int8x16_t _pA2 = vld1q_s8(pA + 32);
                    int8x16_t _pA3 = vld1q_s8(pA + 48);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);
                    int8x16_t _pB2 = vld1q_s8(pB + 32);
                    int8x16_t _pB3 = vld1q_s8(pB + 48);

                    // every (_pA0.._pA3) x (_pB0.._pB3) pairing feeds one accumulator
                    _sum0 = vmmlaq_s32(_sum0, _pA0, _pB0);
                    _sum1 = vmmlaq_s32(_sum1, _pA1, _pB0);
                    _sum2 = vmmlaq_s32(_sum2, _pA0, _pB1);
                    _sum3 = vmmlaq_s32(_sum3, _pA1, _pB1);
                    _sum4 = vmmlaq_s32(_sum4, _pA2, _pB0);
                    _sum5 = vmmlaq_s32(_sum5, _pA3, _pB0);
                    _sum6 = vmmlaq_s32(_sum6, _pA2, _pB1);
                    _sum7 = vmmlaq_s32(_sum7, _pA3, _pB1);
                    _sum8 = vmmlaq_s32(_sum8, _pA0, _pB2);
                    _sum9 = vmmlaq_s32(_sum9, _pA1, _pB2);
                    _suma = vmmlaq_s32(_suma, _pA0, _pB3);
                    _sumb = vmmlaq_s32(_sumb, _pA1, _pB3);
                    _sumc = vmmlaq_s32(_sumc, _pA2, _pB2);
                    _sumd = vmmlaq_s32(_sumd, _pA3, _pB2);
                    _sume = vmmlaq_s32(_sume, _pA2, _pB3);
                    _sumf = vmmlaq_s32(_sumf, _pA3, _pB3);

                    pA += 64;
                    pB += 64;
                }

                // vuzpq_s32 de-interleaves each accumulator pair: val[0] collects the
                // even lanes, val[1] the odd lanes. This also runs when the loop above
                // executed zero times, in which case every _ssN is all zeros.
                int32x4x2_t _ss0 = vuzpq_s32(_sum0, _sum1);
                int32x4x2_t _ss1 = vuzpq_s32(_sum2, _sum3);
                int32x4x2_t _ss2 = vuzpq_s32(_sum4, _sum5);
                int32x4x2_t _ss3 = vuzpq_s32(_sum6, _sum7);
                int32x4x2_t _ss4 = vuzpq_s32(_sum8, _sum9);
                int32x4x2_t _ss5 = vuzpq_s32(_suma, _sumb);
                int32x4x2_t _ss6 = vuzpq_s32(_sumc, _sumd);
                int32x4x2_t _ss7 = vuzpq_s32(_sume, _sumf);

                if (k == 0)
                {
                    // k == 0: the regrouped values are the running totals
                    _sum0 = _ss0.val[0];
                    _sum1 = _ss0.val[1];
                    _sum2 = _ss1.val[0];
                    _sum3 = _ss1.val[1];
                    _sum4 = _ss2.val[0];
                    _sum5 = _ss2.val[1];
                    _sum6 = _ss3.val[0];
                    _sum7 = _ss3.val[1];
                    _sum8 = _ss4.val[0];
                    _sum9 = _ss4.val[1];
                    _suma = _ss5.val[0];
                    _sumb = _ss5.val[1];
                    _sumc = _ss6.val[0];
                    _sumd = _ss6.val[1];
                    _sume = _ss7.val[0];
                    _sumf = _ss7.val[1];
                }
                else
                {
                    // k != 0: load the 64 int32 partial sums at outptr and add the
                    // regrouped values on top
                    _sum0 = vld1q_s32(outptr);
                    _sum1 = vld1q_s32(outptr + 4);
                    _sum2 = vld1q_s32(outptr + 8);
                    _sum3 = vld1q_s32(outptr + 12);
                    _sum4 = vld1q_s32(outptr + 16);
                    _sum5 = vld1q_s32(outptr + 20);
                    _sum6 = vld1q_s32(outptr + 24);
                    _sum7 = vld1q_s32(outptr + 28);
                    _sum8 = vld1q_s32(outptr + 32);
                    _sum9 = vld1q_s32(outptr + 36);
                    _suma = vld1q_s32(outptr + 40);
                    _sumb = vld1q_s32(outptr + 44);
                    _sumc = vld1q_s32(outptr + 48);
                    _sumd = vld1q_s32(outptr + 52);
                    _sume = vld1q_s32(outptr + 56);
                    _sumf = vld1q_s32(outptr + 60);

                    _sum0 = vaddq_s32(_sum0, _ss0.val[0]);
                    _sum1 = vaddq_s32(_sum1, _ss0.val[1]);
                    _sum2 = vaddq_s32(_sum2, _ss1.val[0]);
                    _sum3 = vaddq_s32(_sum3, _ss1.val[1]);
                    _sum4 = vaddq_s32(_sum4, _ss2.val[0]);
                    _sum5 = vaddq_s32(_sum5, _ss2.val[1]);
                    _sum6 = vaddq_s32(_sum6, _ss3.val[0]);
                    _sum7 = vaddq_s32(_sum7, _ss3.val[1]);
                    _sum8 = vaddq_s32(_sum8, _ss4.val[0]);
                    _sum9 = vaddq_s32(_sum9, _ss4.val[1]);
                    _suma = vaddq_s32(_suma, _ss5.val[0]);
                    _sumb = vaddq_s32(_sumb, _ss5.val[1]);
                    _sumc = vaddq_s32(_sumc, _ss6.val[0]);
                    _sumd = vaddq_s32(_sumd, _ss6.val[1]);
                    _sume = vaddq_s32(_sume, _ss7.val[0]);
                    _sumf = vaddq_s32(_sumf, _ss7.val[1]);
                }
            }
#elif __ARM_FEATURE_DOTPROD
            // 8-deep steps with vdotq_laneq_s32, accumulating straight into _sum0.._sumf:
            // 64 bytes of pA and 64 bytes of pB per step.
            for (; kk + 7 < max_kk; kk += 8)
            {
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA1 = vld1q_s8(pA + 16);
                int8x16_t _pA2 = vld1q_s8(pA + 32);
                int8x16_t _pA3 = vld1q_s8(pA + 48);
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB1 = vld1q_s8(pB + 16);
                int8x16_t _pB2 = vld1q_s8(pB + 32);
                int8x16_t _pB3 = vld1q_s8(pB + 48);

                // aaaa bbbb cccc dddd    eeee ffff gggg hhhh

                // 0000 1111 2222 3333    4444 5555 6666 7777
                // first half of the step: _pA0/_pA1 against the lanes of _pB0/_pB1
                _sum0 = vdotq_laneq_s32(_sum0, _pA0, _pB0, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA0, _pB0, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA0, _pB0, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA0, _pB0, 3);
                _sum4 = vdotq_laneq_s32(_sum4, _pA1, _pB0, 0);
                _sum5 = vdotq_laneq_s32(_sum5, _pA1, _pB0, 1);
                _sum6 = vdotq_laneq_s32(_sum6, _pA1, _pB0, 2);
                _sum7 = vdotq_laneq_s32(_sum7, _pA1, _pB0, 3);
                _sum8 = vdotq_laneq_s32(_sum8, _pA0, _pB1, 0);
                _sum9 = vdotq_laneq_s32(_sum9, _pA0, _pB1, 1);
                _suma = vdotq_laneq_s32(_suma, _pA0, _pB1, 2);
                _sumb = vdotq_laneq_s32(_sumb, _pA0, _pB1, 3);
                _sumc = vdotq_laneq_s32(_sumc, _pA1, _pB1, 0);
                _sumd = vdotq_laneq_s32(_sumd, _pA1, _pB1, 1);
                _sume = vdotq_laneq_s32(_sume, _pA1, _pB1, 2);
                _sumf = vdotq_laneq_s32(_sumf, _pA1, _pB1, 3);

                // second half of the step: _pA2/_pA3 against the lanes of _pB2/_pB3
                _sum0 = vdotq_laneq_s32(_sum0, _pA2, _pB2, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA2, _pB2, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA2, _pB2, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA2, _pB2, 3);
                _sum4 = vdotq_laneq_s32(_sum4, _pA3, _pB2, 0);
                _sum5 = vdotq_laneq_s32(_sum5, _pA3, _pB2, 1);
                _sum6 = vdotq_laneq_s32(_sum6, _pA3, _pB2, 2);
                _sum7 = vdotq_laneq_s32(_sum7, _pA3, _pB2, 3);
                _sum8 = vdotq_laneq_s32(_sum8, _pA2, _pB3, 0);
                _sum9 = vdotq_laneq_s32(_sum9, _pA2, _pB3, 1);
                _suma = vdotq_laneq_s32(_suma, _pA2, _pB3, 2);
                _sumb = vdotq_laneq_s32(_sumb, _pA2, _pB3, 3);
                _sumc = vdotq_laneq_s32(_sumc, _pA3, _pB3, 0);
                _sumd = vdotq_laneq_s32(_sumd, _pA3, _pB3, 1);
                _sume = vdotq_laneq_s32(_sume, _pA3, _pB3, 2);
                _sumf = vdotq_laneq_s32(_sumf, _pA3, _pB3, 3);

                pA += 64;
                pB += 64;
            }
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
            // K loop, 4 k-steps per iteration: reads 32 bytes from pA and 32 bytes from pB
            // and folds the products into the sixteen int32x4 accumulators _sum0.._sumf.
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA1 = vld1q_s8(pA + 16);
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB1 = vld1q_s8(pB + 16);

                // aaaa bbbb cccc dddd    eeee ffff gggg hhhh

                // 0000 1111 2222 3333    4444 5555 6666 7777

                // Each vdotq_laneq_s32 adds, per 32-bit lane, the dot product of four int8
                // from the A register with the selected 4-byte group (lane 0..3) of the B register.
                // _sum0.._sum3 : _pA0 x _pB0 lanes 0..3
                // _sum4.._sum7 : _pA1 x _pB0 lanes 0..3
                // _sum8.._sumb : _pA0 x _pB1 lanes 0..3
                // _sumc.._sumf : _pA1 x _pB1 lanes 0..3
                _sum0 = vdotq_laneq_s32(_sum0, _pA0, _pB0, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA0, _pB0, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA0, _pB0, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA0, _pB0, 3);
                _sum4 = vdotq_laneq_s32(_sum4, _pA1, _pB0, 0);
                _sum5 = vdotq_laneq_s32(_sum5, _pA1, _pB0, 1);
                _sum6 = vdotq_laneq_s32(_sum6, _pA1, _pB0, 2);
                _sum7 = vdotq_laneq_s32(_sum7, _pA1, _pB0, 3);
                _sum8 = vdotq_laneq_s32(_sum8, _pA0, _pB1, 0);
                _sum9 = vdotq_laneq_s32(_sum9, _pA0, _pB1, 1);
                _suma = vdotq_laneq_s32(_suma, _pA0, _pB1, 2);
                _sumb = vdotq_laneq_s32(_sumb, _pA0, _pB1, 3);
                _sumc = vdotq_laneq_s32(_sumc, _pA1, _pB1, 0);
                _sumd = vdotq_laneq_s32(_sumd, _pA1, _pB1, 1);
                _sume = vdotq_laneq_s32(_sume, _pA1, _pB1, 2);
                _sumf = vdotq_laneq_s32(_sumf, _pA1, _pB1, 3);

#else  // __ARM_FEATURE_DOTPROD
                // Without dot-product instructions the 4 k-steps are handled as two halves of
                // 2 k-steps each: (_pA0, _pB0) is the first 16 bytes, (_pA2, _pB2) the second.
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA2 = vld1q_s8(pA + 16);
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB2 = vld1q_s8(pB + 16);

                // aabbccdd eeffgghh
                // ccddaabb gghheeff

                // _pA1: _pA0 with the two 32-bit words of every 64-bit half swapped
                int8x16_t _pA1 = vreinterpretq_s8_s32(vrev64q_s32(vreinterpretq_s32_s8(_pA0)));

                // 00112233 44556677
                // 33221100 77665544

                // _pB1: _pB0 with the four 16-bit pairs of every 64-bit half reversed
                int8x16_t _pB1 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB0)));

                // aabbccdd eeffgghh
                // ccddaabb gghheeff

                // same permutation as _pA1, applied to the second half
                int8x16_t _pA3 = vreinterpretq_s8_s32(vrev64q_s32(vreinterpretq_s32_s8(_pA2)));

                // 00112233 44556677
                // 33221100 77665544

                // same permutation as _pB1, applied to the second half
                int8x16_t _pB3 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB2)));

                // 16-bit products of the first half; every combination of
                // {original, permuted} A half-register with {original, permuted} B half-register
                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB0));
                int16x8_t _s1 = vmull_s8(vget_high_s8(_pA0), vget_high_s8(_pB0));
                int16x8_t _s2 = vmull_s8(vget_high_s8(_pA0), vget_low_s8(_pB0));
                int16x8_t _s3 = vmull_s8(vget_low_s8(_pA0), vget_high_s8(_pB0));
                int16x8_t _s4 = vmull_s8(vget_low_s8(_pA1), vget_low_s8(_pB0));
                int16x8_t _s5 = vmull_s8(vget_high_s8(_pA1), vget_high_s8(_pB0));
                int16x8_t _s6 = vmull_s8(vget_high_s8(_pA1), vget_low_s8(_pB0));
                int16x8_t _s7 = vmull_s8(vget_low_s8(_pA1), vget_high_s8(_pB0));
                int16x8_t _s8 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB1));
                int16x8_t _s9 = vmull_s8(vget_high_s8(_pA0), vget_high_s8(_pB1));
                int16x8_t _sa = vmull_s8(vget_high_s8(_pA0), vget_low_s8(_pB1));
                int16x8_t _sb = vmull_s8(vget_low_s8(_pA0), vget_high_s8(_pB1));
                int16x8_t _sc = vmull_s8(vget_low_s8(_pA1), vget_low_s8(_pB1));
                int16x8_t _sd = vmull_s8(vget_high_s8(_pA1), vget_high_s8(_pB1));
                int16x8_t _se = vmull_s8(vget_high_s8(_pA1), vget_low_s8(_pB1));
                int16x8_t _sf = vmull_s8(vget_low_s8(_pA1), vget_high_s8(_pB1));

                // Add the second half's products into the same 16-bit lanes.
                // NOTE(review): the sum of two int8 products stays in int16 only if the inputs
                // never both hit -128 -- presumably quantized data is limited to [-127, 127]; confirm.
                _s0 = vmlal_s8(_s0, vget_low_s8(_pA2), vget_low_s8(_pB2));
                _s1 = vmlal_s8(_s1, vget_high_s8(_pA2), vget_high_s8(_pB2));
                _s2 = vmlal_s8(_s2, vget_high_s8(_pA2), vget_low_s8(_pB2));
                _s3 = vmlal_s8(_s3, vget_low_s8(_pA2), vget_high_s8(_pB2));
                _s4 = vmlal_s8(_s4, vget_low_s8(_pA3), vget_low_s8(_pB2));
                _s5 = vmlal_s8(_s5, vget_high_s8(_pA3), vget_high_s8(_pB2));
                _s6 = vmlal_s8(_s6, vget_high_s8(_pA3), vget_low_s8(_pB2));
                _s7 = vmlal_s8(_s7, vget_low_s8(_pA3), vget_high_s8(_pB2));
                _s8 = vmlal_s8(_s8, vget_low_s8(_pA2), vget_low_s8(_pB3));
                _s9 = vmlal_s8(_s9, vget_high_s8(_pA2), vget_high_s8(_pB3));
                _sa = vmlal_s8(_sa, vget_high_s8(_pA2), vget_low_s8(_pB3));
                _sb = vmlal_s8(_sb, vget_low_s8(_pA2), vget_high_s8(_pB3));
                _sc = vmlal_s8(_sc, vget_low_s8(_pA3), vget_low_s8(_pB3));
                _sd = vmlal_s8(_sd, vget_high_s8(_pA3), vget_high_s8(_pB3));
                _se = vmlal_s8(_se, vget_high_s8(_pA3), vget_low_s8(_pB3));
                _sf = vmlal_s8(_sf, vget_low_s8(_pA3), vget_high_s8(_pB3));

                // Pairwise-add adjacent int16 lanes and accumulate the widened result into int32.
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
                _sum8 = vpadalq_s16(_sum8, _s8);
                _sum9 = vpadalq_s16(_sum9, _s9);
                _suma = vpadalq_s16(_suma, _sa);
                _sumb = vpadalq_s16(_sumb, _sb);
                _sumc = vpadalq_s16(_sumc, _sc);
                _sumd = vpadalq_s16(_sumd, _sd);
                _sume = vpadalq_s16(_sume, _se);
                _sumf = vpadalq_s16(_sumf, _sf);
#endif // __ARM_FEATURE_DOTPROD

                // 4 k-steps x 8 values consumed from each operand
                pA += 32;
                pB += 32;
            }
            // K remainder, 2 k-steps per iteration: reads 16 bytes from pA and 16 bytes from pB.
            // Both branches use widening multiplies followed by a pairwise add, so each int32
            // lane receives the sum of two adjacent int8 products.
            for (; kk + 1 < max_kk; kk += 2)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                int8x16_t _pB = vld1q_s8(pB);

                // aabbccdd eeffgghh

                // 00112233 44556677

                // vdup_lane_s16 broadcasts one 2-byte pair of _pB across the whole 64-bit vector,
                // so every product vector pairs one half of _pA with a single pair of _pB.
                // _s0.._s3 : low  half of _pA x pairs 0..3 of the low  half of _pB
                // _s4.._s7 : high half of _pA x pairs 0..3 of the low  half of _pB
                // _s8.._sb : low  half of _pA x pairs 0..3 of the high half of _pB
                // _sc.._sf : high half of _pA x pairs 0..3 of the high half of _pB
                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB)), 0)));
                int16x8_t _s1 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB)), 1)));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB)), 2)));
                int16x8_t _s3 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB)), 3)));
                int16x8_t _s4 = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB)), 0)));
                int16x8_t _s5 = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB)), 1)));
                int16x8_t _s6 = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB)), 2)));
                int16x8_t _s7 = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB)), 3)));
                int16x8_t _s8 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB)), 0)));
                int16x8_t _s9 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB)), 1)));
                int16x8_t _sa = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB)), 2)));
                int16x8_t _sb = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB)), 3)));
                int16x8_t _sc = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB)), 0)));
                int16x8_t _sd = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB)), 1)));
                int16x8_t _se = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB)), 2)));
                int16x8_t _sf = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB)), 3)));

                // pairwise add of adjacent int16 products, widened and accumulated into int32
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
                _sum8 = vpadalq_s16(_sum8, _s8);
                _sum9 = vpadalq_s16(_sum9, _s9);
                _suma = vpadalq_s16(_suma, _sa);
                _sumb = vpadalq_s16(_sumb, _sb);
                _sumc = vpadalq_s16(_sumc, _sc);
                _sumd = vpadalq_s16(_sumd, _sd);
                _sume = vpadalq_s16(_sume, _se);
                _sumf = vpadalq_s16(_sumf, _sf);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pB0 = vld1q_s8(pB);

                // aabbccdd eeffgghh

                // ccddaabb gghheeff

                // _pA1: _pA0 with the two 32-bit words of every 64-bit half swapped
                int8x16_t _pA1 = vreinterpretq_s8_s32(vrev64q_s32(vreinterpretq_s32_s8(_pA0)));

                // 00112233 44556677

                // 33221100 77665544

                // _pB1: _pB0 with the four 16-bit pairs of every 64-bit half reversed
                int8x16_t _pB1 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB0)));

                // every combination of {original, permuted} A half with {original, permuted} B half;
                // the operand pairing per accumulator is the same as in the kk += 4 loop
                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB0));
                int16x8_t _s1 = vmull_s8(vget_high_s8(_pA0), vget_high_s8(_pB0));
                int16x8_t _s2 = vmull_s8(vget_high_s8(_pA0), vget_low_s8(_pB0));
                int16x8_t _s3 = vmull_s8(vget_low_s8(_pA0), vget_high_s8(_pB0));
                int16x8_t _s4 = vmull_s8(vget_low_s8(_pA1), vget_low_s8(_pB0));
                int16x8_t _s5 = vmull_s8(vget_high_s8(_pA1), vget_high_s8(_pB0));
                int16x8_t _s6 = vmull_s8(vget_high_s8(_pA1), vget_low_s8(_pB0));
                int16x8_t _s7 = vmull_s8(vget_low_s8(_pA1), vget_high_s8(_pB0));
                int16x8_t _s8 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB1));
                int16x8_t _s9 = vmull_s8(vget_high_s8(_pA0), vget_high_s8(_pB1));
                int16x8_t _sa = vmull_s8(vget_high_s8(_pA0), vget_low_s8(_pB1));
                int16x8_t _sb = vmull_s8(vget_low_s8(_pA0), vget_high_s8(_pB1));
                int16x8_t _sc = vmull_s8(vget_low_s8(_pA1), vget_low_s8(_pB1));
                int16x8_t _sd = vmull_s8(vget_high_s8(_pA1), vget_high_s8(_pB1));
                int16x8_t _se = vmull_s8(vget_high_s8(_pA1), vget_low_s8(_pB1));
                int16x8_t _sf = vmull_s8(vget_low_s8(_pA1), vget_high_s8(_pB1));

                // pairwise add of adjacent int16 products, widened and accumulated into int32
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
                _sum8 = vpadalq_s16(_sum8, _s8);
                _sum9 = vpadalq_s16(_sum9, _s9);
                _suma = vpadalq_s16(_suma, _sa);
                _sumb = vpadalq_s16(_sumb, _sb);
                _sumc = vpadalq_s16(_sumc, _sc);
                _sumd = vpadalq_s16(_sumd, _sd);
                _sume = vpadalq_s16(_sume, _se);
                _sumf = vpadalq_s16(_sumf, _sf);
#endif // __ARM_FEATURE_DOTPROD

                // 2 k-steps x 8 values consumed from each operand
                pA += 16;
                pB += 16;
            }
            // K remainder, one k-step per iteration: reads 8 bytes from pA and 8 bytes from pB.
            // Products are formed as int16 and widened straight into the int32 accumulators
            // (no pairwise add, because there is only one product per output element).
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vld1_s8(pA);
                // int8x8_t _pB0 = vld1_s8(pB);

                // abcd efgh
                // 0123 4567

                // each product vector = all 8 values of _pA times one broadcast byte pB[i]
                int16x8_t _s01 = vmull_s8(_pA, vdup_n_s8(pB[0]));
                int16x8_t _s23 = vmull_s8(_pA, vdup_n_s8(pB[1]));
                int16x8_t _s45 = vmull_s8(_pA, vdup_n_s8(pB[2]));
                int16x8_t _s67 = vmull_s8(_pA, vdup_n_s8(pB[3]));
                int16x8_t _s89 = vmull_s8(_pA, vdup_n_s8(pB[4]));
                int16x8_t _sab = vmull_s8(_pA, vdup_n_s8(pB[5]));
                int16x8_t _scd = vmull_s8(_pA, vdup_n_s8(pB[6]));
                int16x8_t _sef = vmull_s8(_pA, vdup_n_s8(pB[7]));

                // low halves (first four values of _pA) feed _sum0.._sum3 and _sum8.._sumb,
                // high halves (last four values of _pA) feed _sum4.._sum7 and _sumc.._sumf
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01));
                _sum1 = vaddw_s16(_sum1, vget_low_s16(_s23));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s45));
                _sum3 = vaddw_s16(_sum3, vget_low_s16(_s67));
                _sum4 = vaddw_s16(_sum4, vget_high_s16(_s01));
                _sum5 = vaddw_s16(_sum5, vget_high_s16(_s23));
                _sum6 = vaddw_s16(_sum6, vget_high_s16(_s45));
                _sum7 = vaddw_s16(_sum7, vget_high_s16(_s67));
                _sum8 = vaddw_s16(_sum8, vget_low_s16(_s89));
                _sum9 = vaddw_s16(_sum9, vget_low_s16(_sab));
                _suma = vaddw_s16(_suma, vget_low_s16(_scd));
                _sumb = vaddw_s16(_sumb, vget_low_s16(_sef));
                _sumc = vaddw_s16(_sumc, vget_high_s16(_s89));
                _sumd = vaddw_s16(_sumd, vget_high_s16(_sab));
                _sume = vaddw_s16(_sume, vget_high_s16(_scd));
                _sumf = vaddw_s16(_sumf, vget_high_s16(_sef));
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vld1_s8(pA);
                int8x8_t _pB0 = vld1_s8(pB);

                // abcd efgh
                // efgh abcd
                // cdab ghef
                // ghef cdab

                // 0123 4567
                // 3210 7654

                // abcdefgh  ->  ghefcdab  ->  cdabghef

                // three permutations of the A values: halves swapped (_pA1), byte pairs swapped
                // inside each 32-bit word (_pA2), byte pairs reversed across the vector (_pA3)
                int8x8_t _pA1 = vext_s8(_pA0, _pA0, 4);
                int8x8_t _pA2 = vreinterpret_s8_s16(vrev32_s16(vreinterpret_s16_s8(_pA0)));
                int8x8_t _pA3 = vreinterpret_s8_s16(vrev64_s16(vreinterpret_s16_s8(_pA0)));

                // 01234567  ->  32107654

                // B values with the bytes of each 32-bit word reversed
                int8x8_t _pB1 = vrev32_s8(_pB0);

                // element-wise products of every A permutation with _pB0 and with _pB1
                int16x8_t _s01 = vmull_s8(_pA0, _pB0);
                int16x8_t _s23 = vmull_s8(_pA1, _pB0);
                int16x8_t _s45 = vmull_s8(_pA2, _pB0);
                int16x8_t _s67 = vmull_s8(_pA3, _pB0);
                int16x8_t _s89 = vmull_s8(_pA0, _pB1);
                int16x8_t _sab = vmull_s8(_pA1, _pB1);
                int16x8_t _scd = vmull_s8(_pA2, _pB1);
                int16x8_t _sef = vmull_s8(_pA3, _pB1);
                // here each product vector is split low/high across two consecutive accumulators
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s01));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s23));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s23));
                _sum4 = vaddw_s16(_sum4, vget_low_s16(_s45));
                _sum5 = vaddw_s16(_sum5, vget_high_s16(_s45));
                _sum6 = vaddw_s16(_sum6, vget_low_s16(_s67));
                _sum7 = vaddw_s16(_sum7, vget_high_s16(_s67));
                _sum8 = vaddw_s16(_sum8, vget_low_s16(_s89));
                _sum9 = vaddw_s16(_sum9, vget_high_s16(_s89));
                _suma = vaddw_s16(_suma, vget_low_s16(_sab));
                _sumb = vaddw_s16(_sumb, vget_high_s16(_sab));
                _sumc = vaddw_s16(_sumc, vget_low_s16(_scd));
                _sumd = vaddw_s16(_sumd, vget_high_s16(_scd));
                _sume = vaddw_s16(_sume, vget_low_s16(_sef));
                _sumf = vaddw_s16(_sumf, vget_high_s16(_sef));
#endif // __ARM_FEATURE_DOTPROD

                // 1 k-step x 8 values consumed from each operand
                pA += 8;
                pB += 8;
            }

            // Write the sixteen int32x4 accumulators (64 int32 values) back to outptr
            // in accumulator order, 4 values per store.
            vst1q_s32(outptr, _sum0);
            vst1q_s32(outptr + 4, _sum1);
            vst1q_s32(outptr + 8, _sum2);
            vst1q_s32(outptr + 12, _sum3);
            vst1q_s32(outptr + 16, _sum4);
            vst1q_s32(outptr + 20, _sum5);
            vst1q_s32(outptr + 24, _sum6);
            vst1q_s32(outptr + 28, _sum7);
            vst1q_s32(outptr + 32, _sum8);
            vst1q_s32(outptr + 36, _sum9);
            vst1q_s32(outptr + 40, _suma);
            vst1q_s32(outptr + 44, _sumb);
            vst1q_s32(outptr + 48, _sumc);
            vst1q_s32(outptr + 52, _sumd);
            vst1q_s32(outptr + 56, _sume);
            vst1q_s32(outptr + 60, _sumf);

            // advance past the 64 values just written
            outptr += 64;
#endif // NCNN_GNU_INLINE_ASM
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            const signed char* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            // AArch64 inline-assembly kernel for one tile with 8 values from pA and 4 values
            // from pB per k-step. Operands: %0 = outptr, %1 = pA, %2 = pB (all read and
            // post-updated), %6 = max_kk, %7 = k. The eight int32x4 accumulators live in
            // v16..v23; w4 is the scratch loop counter.
            asm volatile(
                // k == 0: start from zero; otherwise reload the partial sums stored at outptr
                "cmp    %w7, #0                     \n"
                "beq    0f                          \n"

                // load 2 x 64 bytes of accumulators, then undo the post-increment of the first load
                "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0] \n"
                "sub    %0, %0, #64                 \n"
                "b      1f                          \n"

                "0:                                 \n"
                "eor    v16.16b, v16.16b, v16.16b   \n"
                "eor    v17.16b, v17.16b, v17.16b   \n"
                "eor    v18.16b, v18.16b, v18.16b   \n"
                "eor    v19.16b, v19.16b, v19.16b   \n"
                "eor    v20.16b, v20.16b, v20.16b   \n"
                "eor    v21.16b, v21.16b, v21.16b   \n"
                "eor    v22.16b, v22.16b, v22.16b   \n"
                "eor    v23.16b, v23.16b, v23.16b   \n"

                "1:                                 \n"
#if __ARM_FEATURE_DOTPROD
                // main loop: 8 k-steps per iteration (64 bytes of pA, 32 bytes of pB)
                "lsr    w4, %w6, #3                 \n" // w4 = max_kk >> 3
                "cmp    w4, #0                      \n"
                "beq    101f                        \n"

#if __ARM_FEATURE_MATMUL_INT8
                // smmla results are gathered in separate zeroed registers v24..v31 and
                // merged into v16..v23 after the loop
                "eor    v24.16b, v24.16b, v24.16b   \n"
                "eor    v25.16b, v25.16b, v25.16b   \n"
                "eor    v26.16b, v26.16b, v26.16b   \n"
                "eor    v27.16b, v27.16b, v27.16b   \n"
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v29.16b, v29.16b, v29.16b   \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"
                "eor    v31.16b, v31.16b, v31.16b   \n"
#endif // __ARM_FEATURE_MATMUL_INT8

                "2:                                 \n"
                "ld1    {v0.16b, v1.16b, v2.16b, v3.16b}, [%1], #64 \n"
                "ld1    {v4.16b, v5.16b}, [%2], #32 \n"

#if __ARM_FEATURE_MATMUL_INT8
                // each smmla accumulates a 2x2 block of int32 from two 2x8 int8 operands;
                // the loop-counter decrement is interleaved and its flags are used by bne below
                "smmla  v24.4s, v0.16b, v4.16b      \n"
                "smmla  v25.4s, v1.16b, v4.16b      \n"
                "smmla  v26.4s, v0.16b, v5.16b      \n"
                "smmla  v27.4s, v1.16b, v5.16b      \n"
                "subs   w4, w4, #1                  \n"
                "smmla  v28.4s, v2.16b, v4.16b      \n"
                "smmla  v29.4s, v3.16b, v4.16b      \n"
                "smmla  v30.4s, v2.16b, v5.16b      \n"
                "smmla  v31.4s, v3.16b, v5.16b      \n"
#else  // __ARM_FEATURE_MATMUL_INT8
                // first 4 k-steps: v0/v1 against the four 4-byte groups of v4
                "sdot   v16.4s, v0.16b, v4.4b[0]    \n"
                "sdot   v17.4s, v0.16b, v4.4b[1]    \n"
                "sdot   v18.4s, v0.16b, v4.4b[2]    \n"
                "sdot   v19.4s, v0.16b, v4.4b[3]    \n"
                "sdot   v20.4s, v1.16b, v4.4b[0]    \n"
                "sdot   v21.4s, v1.16b, v4.4b[1]    \n"
                "sdot   v22.4s, v1.16b, v4.4b[2]    \n"
                "sdot   v23.4s, v1.16b, v4.4b[3]    \n"
                "subs   w4, w4, #1                  \n"
                // second 4 k-steps: v2/v3 against the four 4-byte groups of v5
                "sdot   v16.4s, v2.16b, v5.4b[0]    \n"
                "sdot   v17.4s, v2.16b, v5.4b[1]    \n"
                "sdot   v18.4s, v2.16b, v5.4b[2]    \n"
                "sdot   v19.4s, v2.16b, v5.4b[3]    \n"
                "sdot   v20.4s, v3.16b, v5.4b[0]    \n"
                "sdot   v21.4s, v3.16b, v5.4b[1]    \n"
                "sdot   v22.4s, v3.16b, v5.4b[2]    \n"
                "sdot   v23.4s, v3.16b, v5.4b[3]    \n"
#endif // __ARM_FEATURE_MATMUL_INT8
                "bne    2b                          \n"

#if __ARM_FEATURE_MATMUL_INT8
                // regroup the even (uzp1) and odd (uzp2) 32-bit elements of the smmla
                // accumulators, then add them into v16..v23
                "uzp1   v0.4s, v24.4s, v25.4s       \n"
                "uzp2   v1.4s, v24.4s, v25.4s       \n"
                "uzp1   v2.4s, v26.4s, v27.4s       \n"
                "uzp2   v3.4s, v26.4s, v27.4s       \n"
                "uzp1   v4.4s, v28.4s, v29.4s       \n"
                "uzp2   v5.4s, v28.4s, v29.4s       \n"
                "uzp1   v6.4s, v30.4s, v31.4s       \n"
                "uzp2   v7.4s, v30.4s, v31.4s       \n"

                "add    v16.4s, v16.4s, v0.4s       \n"
                "add    v17.4s, v17.4s, v1.4s       \n"
                "add    v18.4s, v18.4s, v2.4s       \n"
                "add    v19.4s, v19.4s, v3.4s       \n"
                "add    v20.4s, v20.4s, v4.4s       \n"
                "add    v21.4s, v21.4s, v5.4s       \n"
                "add    v22.4s, v22.4s, v6.4s       \n"
                "add    v23.4s, v23.4s, v7.4s       \n"
#endif // __ARM_FEATURE_MATMUL_INT8

                "101:                               \n"
                "and    w4, %w6, #4                 \n" // w4 = remain = max_kk & 4
                "cmp    w4, #0                      \n"
                "beq    3f                          \n"

                // kk += 4 part
                // executed at most once here, since multiples of 8 were consumed above
                "ld1    {v0.16b, v1.16b}, [%1], #32 \n"
                "ld1    {v2.16b}, [%2], #16         \n"
                "sdot   v16.4s, v0.16b, v2.4b[0]    \n"
                "sdot   v17.4s, v0.16b, v2.4b[1]    \n"
                "sdot   v18.4s, v0.16b, v2.4b[2]    \n"
                "sdot   v19.4s, v0.16b, v2.4b[3]    \n"
                "sdot   v20.4s, v1.16b, v2.4b[0]    \n"
                "sdot   v21.4s, v1.16b, v2.4b[1]    \n"
                "sdot   v22.4s, v1.16b, v2.4b[2]    \n"
                "sdot   v23.4s, v1.16b, v2.4b[3]    \n"
#else  // __ARM_FEATURE_DOTPROD
                // main loop without dot-product: 4 k-steps per iteration
                // (32 bytes of pA, 16 bytes of pB)
                "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    3f                          \n"

                "2:                                 \n"
                "ld1    {v0.16b, v1.16b}, [%1], #32 \n"
                "ld1    {v4.16b}, [%2], #16         \n"
                // v2/v3: v0/v1 with 32-bit words swapped inside each 64-bit half;
                // v5: v4 with its 64-bit halves swapped; v6: v4 with 16-bit pairs reversed
                // inside each half; v7: v6 with its halves swapped.
                // smull/smull2 form 16-bit products for the first 2 k-steps (v0, v2),
                // smlal/smlal2 add the products of the next 2 k-steps (v1, v3).
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "rev64  v2.4s, v0.4s                \n"
                "smull  v10.8h, v2.8b, v4.8b        \n"
                "ext    v5.16b, v4.16b, v4.16b, #8  \n"
                "smull2 v9.8h, v0.16b, v5.16b       \n"
                "rev64  v6.8h, v4.8h                \n"
                "smull2 v11.8h, v2.16b, v5.16b      \n"
                "ext    v7.16b, v6.16b, v6.16b, #8  \n"
                "smull  v12.8h, v0.8b, v6.8b        \n"
                "smull  v14.8h, v2.8b, v6.8b        \n"
                "rev64  v3.4s, v1.4s                \n"
                "smull2 v13.8h, v0.16b, v7.16b      \n"
                "smull2 v15.8h, v2.16b, v7.16b      \n"
                "smlal  v8.8h, v1.8b, v5.8b         \n"
                "smlal  v10.8h, v3.8b, v5.8b        \n"
                "smlal2 v9.8h, v1.16b, v4.16b       \n"
                "smlal2 v11.8h, v3.16b, v4.16b      \n"
                "smlal  v12.8h, v1.8b, v7.8b        \n"
                "smlal  v14.8h, v3.8b, v7.8b        \n"
                "smlal2 v13.8h, v1.16b, v6.16b      \n"
                "smlal2 v15.8h, v3.16b, v6.16b      \n"
                "subs   w4, w4, #1                  \n"
                // pairwise add the int16 products and accumulate into the int32 sums
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v19.4s, v11.8h              \n"
                "sadalp v20.4s, v12.8h              \n"
                "sadalp v22.4s, v14.8h              \n"
                "sadalp v21.4s, v13.8h              \n"
                "sadalp v23.4s, v15.8h              \n"
                "bne    2b                          \n"
#endif // __ARM_FEATURE_DOTPROD

                "3:                                 \n"
                "and    w4, %w6, #2                 \n" // w4 = remain = max_kk & 2
                "cmp    w4, #0                      \n"
                "beq    4f                          \n"

                // kk += 2 part
#if __ARM_FEATURE_DOTPROD
                // 16 bytes of pA, 8 bytes of pB; each 2-byte pair of pB is broadcast (dup .8h)
                // and multiplied with the low (smull) and high (smull2) half of v0
                "ld1    {v0.16b}, [%1], #16         \n"
                "ld1    {v1.8b}, [%2], #8           \n"
                "dup    v4.8h, v1.h[0]              \n"
                "dup    v5.8h, v1.h[1]              \n"
                "dup    v6.8h, v1.h[2]              \n"
                "dup    v7.8h, v1.h[3]              \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull  v9.8h, v0.8b, v5.8b         \n"
                "smull  v10.8h, v0.8b, v6.8b        \n"
                "smull  v11.8h, v0.8b, v7.8b        \n"
                "smull2 v12.8h, v0.16b, v4.16b      \n"
                "smull2 v13.8h, v0.16b, v5.16b      \n"
                "smull2 v14.8h, v0.16b, v6.16b      \n"
                "smull2 v15.8h, v0.16b, v7.16b      \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
                "sadalp v20.4s, v12.8h              \n"
                "sadalp v21.4s, v13.8h              \n"
                "sadalp v22.4s, v14.8h              \n"
                "sadalp v23.4s, v15.8h              \n"
#else  // __ARM_FEATURE_DOTPROD
                // 16 bytes of pA; 8 bytes of pB replicated into both halves of v2 (ld1r .2d),
                // so pB is advanced by an explicit add
                "ld1    {v0.16b}, [%1], #16         \n"
                "ld1r   {v2.2d}, [%2]               \n"
                "add    %2, %2, #8                  \n"
                "rev64  v1.4s, v0.4s                \n"
                "rev64  v3.8h, v2.8h                \n"
                "smull  v8.8h, v0.8b, v2.8b         \n"
                "smull2 v9.8h, v0.16b, v2.16b       \n"
                "smull  v10.8h, v1.8b, v2.8b        \n"
                "smull2 v11.8h, v1.16b, v2.16b      \n"
                "smull  v12.8h, v0.8b, v3.8b        \n"
                "smull2 v13.8h, v0.16b, v3.16b      \n"
                "smull  v14.8h, v1.8b, v3.8b        \n"
                "smull2 v15.8h, v1.16b, v3.16b      \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
                "sadalp v20.4s, v12.8h              \n"
                "sadalp v21.4s, v13.8h              \n"
                "sadalp v22.4s, v14.8h              \n"
                "sadalp v23.4s, v15.8h              \n"
#endif // __ARM_FEATURE_DOTPROD

                "4:                                 \n"
                "and    w4, %w6, #1                 \n" // w4 = remain = max_kk & 1
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // kk += 1 part
#if __ARM_FEATURE_DOTPROD
                // 8 bytes of pA; bytes 0..3 of pB are each broadcast and multiplied with v0.
                // NOTE(review): the pB load fetches 8 bytes although only b[0..3] are used and
                // pB advances by 4 -- presumably the buffer has enough trailing padding; confirm.
                "ld1    {v0.8b}, [%1], #8           \n"
                "ld1    {v1.8b}, [%2]               \n"
                "add    %2, %2, #4                  \n"
                "dup    v8.8b, v1.b[0]              \n"
                "dup    v9.8b, v1.b[1]              \n"
                "dup    v10.8b, v1.b[2]             \n"
                "dup    v11.8b, v1.b[3]             \n"
                "smull  v8.8h, v0.8b, v8.8b         \n"
                "smull  v9.8h, v0.8b, v9.8b         \n"
                "smull  v10.8h, v0.8b, v10.8b       \n"
                "smull  v11.8h, v0.8b, v11.8b       \n"
                // low product halves widen into v16..v19, high halves into v20..v23
                "saddw  v16.4s, v16.4s, v8.4h       \n"
                "saddw  v17.4s, v17.4s, v9.4h       \n"
                "saddw  v18.4s, v18.4s, v10.4h      \n"
                "saddw  v19.4s, v19.4s, v11.4h      \n"
                "saddw2 v20.4s, v20.4s, v8.8h       \n"
                "saddw2 v21.4s, v21.4s, v9.8h       \n"
                "saddw2 v22.4s, v22.4s, v10.8h      \n"
                "saddw2 v23.4s, v23.4s, v11.8h      \n"
#else  // __ARM_FEATURE_DOTPROD
                // 8 bytes of pA; 4 bytes of pB replicated into both 32-bit lanes (ld1r .2s)
                "ld1    {v0.8b}, [%1], #8           \n"
                "ld1r   {v4.2s}, [%2]               \n"
                "add    %2, %2, #4                  \n"
                "rev32  v1.4h, v0.4h                \n"
                "rev64  v5.8b, v4.8b                \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull  v9.8h, v1.8b, v4.8b         \n"
                "smull  v10.8h, v0.8b, v5.8b        \n"
                "smull  v11.8h, v1.8b, v5.8b        \n"
                // each product vector is split low/high across two consecutive accumulators
                "saddw  v16.4s, v16.4s, v8.4h       \n"
                "saddw2 v17.4s, v17.4s, v8.8h       \n"
                "saddw  v18.4s, v18.4s, v9.4h       \n"
                "saddw2 v19.4s, v19.4s, v9.8h       \n"
                "saddw  v20.4s, v20.4s, v10.4h      \n"
                "saddw2 v21.4s, v21.4s, v10.8h      \n"
                "saddw  v22.4s, v22.4s, v11.4h      \n"
                "saddw2 v23.4s, v23.4s, v11.8h      \n"
#endif // __ARM_FEATURE_DOTPROD

                // store the eight accumulators (32 int32 values); outptr advances by 128 bytes
                "5:                                 \n"
                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"

                // outputs are tied to the matching inputs so the updated pointers are
                // written back to outptr, pA and pB
                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB)      // %2
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "r"(max_kk), // %6
                "r"(k)       // %7
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
            asm volatile(
                "cmp        %7, #0              \n"
                "beq        0f                  \n"

                "vldm       %0!, {d16-d23}      \n"
                "vldm       %0, {d24-d31}       \n"
                "sub        %0, %0, #64         \n"
                "b          1f                  \n"

                "0:                             \n"
                "veor       q8, q8              \n"
                "veor       q9, q9              \n"
                "veor       q10, q10            \n"
                "veor       q11, q11            \n"
                "veor       q12, q12            \n"
                "veor       q13, q13            \n"
                "veor       q14, q14            \n"
                "veor       q15, q15            \n"

                "1:                             \n"
                "lsr        r4, %6, #2          \n" // r4 = max_kk >> 2
                "cmp        r4, #0              \n"
                "beq        3f                  \n"

                ".align 4                       \n"
                "2:                             \n"
                "pld        [%1, #256]          \n"
                "vld1.s8    {d0-d3}, [%1 :64]!  \n"
                "pld        [%2, #128]          \n"
                "vld1.s8    {d4-d5}, [%2]!      \n"
                "vmull.s8   q4, d0, d4          \n"
                "vrev64.32  q3, q0              \n"
                "vmull.s8   q5, d1, d4          \n"
                "vmull.s8   q6, d6, d4          \n"
                "vmull.s8   q7, d7, d4          \n"
                "vrev64.32  q0, q1              \n"
                "vmlal.s8   q4, d2, d5          \n"
                "vmlal.s8   q5, d3, d5          \n"
                "vmlal.s8   q6, d0, d5          \n"
                "vmlal.s8   q7, d1, d5          \n"
                "vrev64.16  q2, q2              \n"
                "vpadal.s16 q8, q4              \n"
                "vrev64.32  q1, q3              \n"
                "vpadal.s16 q9, q5              \n"
                "vmull.s8   q4, d6, d4          \n"
                "vpadal.s16 q10, q6             \n"
                "vmull.s8   q5, d7, d4          \n"
                "vpadal.s16 q11, q7             \n"
                "vmull.s8   q6, d2, d4          \n"
                "vmull.s8   q7, d3, d4          \n"
                "vrev64.32  q3, q0              \n"
                "vmlal.s8   q4, d0, d5          \n"
                "vmlal.s8   q5, d1, d5          \n"
                "vmlal.s8   q6, d6, d5          \n"
                "vmlal.s8   q7, d7, d5          \n"
                "subs       r4, r4, #1          \n"
                "vpadal.s16 q14, q4             \n"
                "vpadal.s16 q15, q5             \n"
                "vpadal.s16 q12, q6             \n"
                "vpadal.s16 q13, q7             \n"
                "bne        2b                  \n"

                "3:                             \n"
                "and        r4, %6, #2          \n" // r4 = remain = max_kk & 2
                "cmp        r4, #0              \n"
                "beq        4f                  \n"

                // kk += 2 part
                "vld1.s8    {d0-d1}, [%1 :64]!  \n"
                "vld1.s8    {d4}, [%2]!         \n"
                "vrev64.32  q1, q0              \n"
                "vrev64.16  d5, d4              \n"
                "vmull.s8   q4, d0, d4          \n"
                "vmull.s8   q5, d1, d4          \n"
                "vmull.s8   q6, d2, d4          \n"
                "vmull.s8   q7, d3, d4          \n"
                "vpadal.s16 q8, q4              \n"
                "vpadal.s16 q9, q5              \n"
                "vpadal.s16 q10, q6             \n"
                "vpadal.s16 q11, q7             \n"
                "vmull.s8   q4, d0, d5          \n"
                "vmull.s8   q5, d1, d5          \n"
                "vmull.s8   q6, d2, d5          \n"
                "vmull.s8   q7, d3, d5          \n"
                "vpadal.s16 q12, q4             \n"
                "vpadal.s16 q13, q5             \n"
                "vpadal.s16 q14, q6             \n"
                "vpadal.s16 q15, q7             \n"

                "4:                             \n"
                "and        r4, %6, #1          \n" // r4 = remain = max_kk & 1
                "cmp        r4, #0              \n"
                "beq        5f                  \n"

                // kk += 1 part
                "vld1.s8    {d0}, [%1 :64]!     \n"
                "vld1.s32   {d2[]}, [%2]!       \n"
                "vrev64.16  d1, d0              \n"
                "vrev64.8   d3, d2              \n"
                "vext.s8    d1, d1, #4          \n"
                "vmull.s8   q4, d0, d2          \n"
                "vmull.s8   q5, d1, d2          \n"
                "vmull.s8   q6, d0, d3          \n"
                "vmull.s8   q7, d1, d3          \n"
                "vaddw.s16  q8, d8              \n"
                "vaddw.s16  q9, d9              \n"
                "vaddw.s16  q10, d10            \n"
                "vaddw.s16  q11, d11            \n"
                "vaddw.s16  q12, d12            \n"
                "vaddw.s16  q13, d13            \n"
                "vaddw.s16  q14, d14            \n"
                "vaddw.s16  q15, d15            \n"

                "5:                             \n"
                "vstm       %0!, {d16-d23}      \n"
                "vstm       %0!, {d24-d31}      \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB)      // %2
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "r"(max_kk), // %6
                "r"(k)       // %7
                : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            int32x4_t _sum0;
            int32x4_t _sum1;
            int32x4_t _sum2;
            int32x4_t _sum3;
            int32x4_t _sum4;
            int32x4_t _sum5;
            int32x4_t _sum6;
            int32x4_t _sum7;

            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
                _sum2 = vdupq_n_s32(0);
                _sum3 = vdupq_n_s32(0);
                _sum4 = vdupq_n_s32(0);
                _sum5 = vdupq_n_s32(0);
                _sum6 = vdupq_n_s32(0);
                _sum7 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
                _sum2 = vld1q_s32(outptr + 8);
                _sum3 = vld1q_s32(outptr + 12);
                _sum4 = vld1q_s32(outptr + 16);
                _sum5 = vld1q_s32(outptr + 20);
                _sum6 = vld1q_s32(outptr + 24);
                _sum7 = vld1q_s32(outptr + 28);
            }

            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_MATMUL_INT8
                int32x4_t _s0 = vdupq_n_s32(0);
                int32x4_t _s1 = vdupq_n_s32(0);
                int32x4_t _s2 = vdupq_n_s32(0);
                int32x4_t _s3 = vdupq_n_s32(0);
                int32x4_t _s4 = vdupq_n_s32(0);
                int32x4_t _s5 = vdupq_n_s32(0);
                int32x4_t _s6 = vdupq_n_s32(0);
                int32x4_t _s7 = vdupq_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA0 = vld1q_s8(pA);
                    int8x16_t _pA1 = vld1q_s8(pA + 16);
                    int8x16_t _pA2 = vld1q_s8(pA + 32);
                    int8x16_t _pA3 = vld1q_s8(pA + 48);

                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);

#if __ARM_FEATURE_MATMUL_INT8
                    // aaaaaaaa bbbbbbbb ..... hhhhhhhh
                    // 00000000 11111111 22222222 33333333

                    _s0 = vmmlaq_s32(_s0, _pA0, _pB0);
                    _s1 = vmmlaq_s32(_s1, _pA1, _pB0);
                    _s2 = vmmlaq_s32(_s2, _pA0, _pB1);
                    _s3 = vmmlaq_s32(_s3, _pA1, _pB1);
                    _s4 = vmmlaq_s32(_s4, _pA2, _pB0);
                    _s5 = vmmlaq_s32(_s5, _pA3, _pB0);
                    _s6 = vmmlaq_s32(_s6, _pA2, _pB1);
                    _s7 = vmmlaq_s32(_s7, _pA3, _pB1);
#else  // __ARM_FEATURE_MATMUL_INT8
                    _sum0 = vdotq_laneq_s32(_sum0, _pA0, _pB0, 0);
                    _sum1 = vdotq_laneq_s32(_sum1, _pA0, _pB0, 1);
                    _sum2 = vdotq_laneq_s32(_sum2, _pA0, _pB0, 2);
                    _sum3 = vdotq_laneq_s32(_sum3, _pA0, _pB0, 3);
                    _sum4 = vdotq_laneq_s32(_sum4, _pA1, _pB0, 0);
                    _sum5 = vdotq_laneq_s32(_sum5, _pA1, _pB0, 1);
                    _sum6 = vdotq_laneq_s32(_sum6, _pA1, _pB0, 2);
                    _sum7 = vdotq_laneq_s32(_sum7, _pA1, _pB0, 3);

                    _sum0 = vdotq_laneq_s32(_sum0, _pA2, _pB1, 0);
                    _sum1 = vdotq_laneq_s32(_sum1, _pA2, _pB1, 1);
                    _sum2 = vdotq_laneq_s32(_sum2, _pA2, _pB1, 2);
                    _sum3 = vdotq_laneq_s32(_sum3, _pA2, _pB1, 3);
                    _sum4 = vdotq_laneq_s32(_sum4, _pA3, _pB1, 0);
                    _sum5 = vdotq_laneq_s32(_sum5, _pA3, _pB1, 1);
                    _sum6 = vdotq_laneq_s32(_sum6, _pA3, _pB1, 2);
                    _sum7 = vdotq_laneq_s32(_sum7, _pA3, _pB1, 3);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 64;
                    pB += 32;
                }
#if __ARM_FEATURE_MATMUL_INT8
                int32x4x2_t _ss0 = vuzpq_s32(_s0, _s1);
                int32x4x2_t _ss1 = vuzpq_s32(_s2, _s3);
                int32x4x2_t _ss2 = vuzpq_s32(_s4, _s5);
                int32x4x2_t _ss3 = vuzpq_s32(_s6, _s7);
                _sum0 = vaddq_s32(_sum0, _ss0.val[0]);
                _sum1 = vaddq_s32(_sum1, _ss0.val[1]);
                _sum2 = vaddq_s32(_sum2, _ss1.val[0]);
                _sum3 = vaddq_s32(_sum3, _ss1.val[1]);
                _sum4 = vaddq_s32(_sum4, _ss2.val[0]);
                _sum5 = vaddq_s32(_sum5, _ss2.val[1]);
                _sum6 = vaddq_s32(_sum6, _ss3.val[0]);
                _sum7 = vaddq_s32(_sum7, _ss3.val[1]);
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA1 = vld1q_s8(pA + 16);
                int8x16_t _pB = vld1q_s8(pB);

                // aaaa bbbb cccc dddd   eeee ffff gggg hhhh

                // 0000 1111 2222 3333

                _sum0 = vdotq_laneq_s32(_sum0, _pA0, _pB, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA0, _pB, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA0, _pB, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA0, _pB, 3);
                _sum4 = vdotq_laneq_s32(_sum4, _pA1, _pB, 0);
                _sum5 = vdotq_laneq_s32(_sum5, _pA1, _pB, 1);
                _sum6 = vdotq_laneq_s32(_sum6, _pA1, _pB, 2);
                _sum7 = vdotq_laneq_s32(_sum7, _pA1, _pB, 3);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA2 = vld1q_s8(pA + 16);
                int8x16_t _pB02 = vld1q_s8(pB);

                // aabbccdd eeffgghh

                // ccddaabb gghheeff

                int8x16_t _pA1 = vreinterpretq_s8_s32(vrev64q_s32(vreinterpretq_s32_s8(_pA0)));
                int8x16_t _pA3 = vreinterpretq_s8_s32(vrev64q_s32(vreinterpretq_s32_s8(_pA2)));

                // 00112233 44556677

                // 33221100 77665544

                int8x16_t _pB13 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB02)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB02));
                int16x8_t _s1 = vmull_s8(vget_high_s8(_pA0), vget_low_s8(_pB02));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_pA1), vget_low_s8(_pB02));
                int16x8_t _s3 = vmull_s8(vget_high_s8(_pA1), vget_low_s8(_pB02));
                int16x8_t _s4 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB13));
                int16x8_t _s5 = vmull_s8(vget_high_s8(_pA0), vget_low_s8(_pB13));
                int16x8_t _s6 = vmull_s8(vget_low_s8(_pA1), vget_low_s8(_pB13));
                int16x8_t _s7 = vmull_s8(vget_high_s8(_pA1), vget_low_s8(_pB13));

                _s0 = vmlal_s8(_s0, vget_low_s8(_pA2), vget_high_s8(_pB02));
                _s1 = vmlal_s8(_s1, vget_high_s8(_pA2), vget_high_s8(_pB02));
                _s2 = vmlal_s8(_s2, vget_low_s8(_pA3), vget_high_s8(_pB02));
                _s3 = vmlal_s8(_s3, vget_high_s8(_pA3), vget_high_s8(_pB02));
                _s4 = vmlal_s8(_s4, vget_low_s8(_pA2), vget_high_s8(_pB13));
                _s5 = vmlal_s8(_s5, vget_high_s8(_pA2), vget_high_s8(_pB13));
                _s6 = vmlal_s8(_s6, vget_low_s8(_pA3), vget_high_s8(_pB13));
                _s7 = vmlal_s8(_s7, vget_high_s8(_pA3), vget_high_s8(_pB13));

                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
#endif // __ARM_FEATURE_DOTPROD

                pA += 32;
                pB += 16;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

                // aabbccdd eeffgghh

                // 00112233
                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 0)));
                int16x8_t _s1 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 1)));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 2)));
                int16x8_t _s3 = vmull_s8(vget_low_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 3)));
                int16x8_t _s4 = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 0)));
                int16x8_t _s5 = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 1)));
                int16x8_t _s6 = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 2)));
                int16x8_t _s7 = vmull_s8(vget_high_s8(_pA), vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 3)));

                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x8_t _pB0 = vld1_s8(pB);

                // aabbccdd eeffgghh

                // ccddaabb gghheeff

                int8x16_t _pA1 = vreinterpretq_s8_s32(vrev64q_s32(vreinterpretq_s32_s8(_pA0)));

                // 00112233

                // 33221100

                int8x8_t _pB1 = vreinterpret_s8_s16(vrev64_s16(vreinterpret_s16_s8(_pB0)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA0), _pB0);
                int16x8_t _s1 = vmull_s8(vget_high_s8(_pA0), _pB0);
                int16x8_t _s2 = vmull_s8(vget_low_s8(_pA1), _pB0);
                int16x8_t _s3 = vmull_s8(vget_high_s8(_pA1), _pB0);
                int16x8_t _s4 = vmull_s8(vget_low_s8(_pA0), _pB1);
                int16x8_t _s5 = vmull_s8(vget_high_s8(_pA0), _pB1);
                int16x8_t _s6 = vmull_s8(vget_low_s8(_pA1), _pB1);
                int16x8_t _s7 = vmull_s8(vget_high_s8(_pA1), _pB1);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
#endif // __ARM_FEATURE_DOTPROD

                pA += 16;
                pB += 8;
            }
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vld1_s8(pA);
                // int8x8_t _pB0 = vreinterpret_s32_s8(vld1_dup_s32(pB));

                // abcdefgh

                // 0123

                int16x8_t _s01 = vmull_s8(_pA0, vdup_n_s8(pB[0]));
                int16x8_t _s23 = vmull_s8(_pA0, vdup_n_s8(pB[1]));
                int16x8_t _s45 = vmull_s8(_pA0, vdup_n_s8(pB[2]));
                int16x8_t _s67 = vmull_s8(_pA0, vdup_n_s8(pB[3]));
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01));
                _sum1 = vaddw_s16(_sum1, vget_low_s16(_s23));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s45));
                _sum3 = vaddw_s16(_sum3, vget_low_s16(_s67));
                _sum4 = vaddw_s16(_sum4, vget_high_s16(_s01));
                _sum5 = vaddw_s16(_sum5, vget_high_s16(_s23));
                _sum6 = vaddw_s16(_sum6, vget_high_s16(_s45));
                _sum7 = vaddw_s16(_sum7, vget_high_s16(_s67));
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vld1_s8(pA);
                int8x8_t _pB0 = vreinterpret_s8_s32(vld1_dup_s32((const int*)pB));
                // int8x8_t _pB0 = vld1_s8(pB);
                // _pB0 = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_pB0), vreinterpret_s32_s8(_pB0)).val[0]);

                // abcdefgh  ->  cdabghef
                int8x8_t _pA1 = vreinterpret_s8_s16(vrev32_s16(vreinterpret_s16_s8(_pA0)));

                // 01230123  ->  32103210
                int8x8_t _pB1 = vrev64_s8(_pB0);

                int16x8_t _s01 = vmull_s8(_pA0, _pB0);
                int16x8_t _s23 = vmull_s8(_pA1, _pB0);
                int16x8_t _s45 = vmull_s8(_pA0, _pB1);
                int16x8_t _s67 = vmull_s8(_pA1, _pB1);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s01));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s23));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s23));
                _sum4 = vaddw_s16(_sum4, vget_low_s16(_s45));
                _sum5 = vaddw_s16(_sum5, vget_high_s16(_s45));
                _sum6 = vaddw_s16(_sum6, vget_low_s16(_s67));
                _sum7 = vaddw_s16(_sum7, vget_high_s16(_s67));
#endif // __ARM_FEATURE_DOTPROD

                pA += 8;
                pB += 4;
            }

            vst1q_s32(outptr, _sum0);
            vst1q_s32(outptr + 4, _sum1);
            vst1q_s32(outptr + 8, _sum2);
            vst1q_s32(outptr + 12, _sum3);
            vst1q_s32(outptr + 16, _sum4);
            vst1q_s32(outptr + 20, _sum5);
            vst1q_s32(outptr + 24, _sum6);
            vst1q_s32(outptr + 28, _sum7);

            outptr += 32;
#endif // NCNN_GNU_INLINE_ASM
        }
        // Tail of the jj loop that handles two jj positions per iteration.
        // Every k step consumes 8 bytes from pA and 2 bytes from pB (see the
        // pointer increments at the end of each kk loop below) and the results
        // are accumulated into four int32x4 vectors, i.e. 16 int32 values that
        // are written to outptr. pA restarts from pAT on every iteration while
        // pB is not reset here and simply keeps advancing.
        for (; jj + 1 < max_jj; jj += 2)
        {
            const signed char* pA = pAT;

            int32x4_t _sum0;
            int32x4_t _sum1;
            int32x4_t _sum2;
            int32x4_t _sum3;

            // k == 0 starts the accumulators from zero; any other k resumes
            // from the partial sums previously stored at outptr.
            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
                _sum2 = vdupq_n_s32(0);
                _sum3 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
                _sum2 = vld1q_s32(outptr + 8);
                _sum3 = vld1q_s32(outptr + 12);
            }

            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
                // 8 k steps per iteration: 64 bytes of A and 16 bytes of B.
#if __ARM_FEATURE_MATMUL_INT8
                // The i8mm path accumulates into temporaries because vmmlaq_s32
                // produces 2x2 result blocks whose lane order differs from the
                // _sumN layout; they are rearranged once after the loop.
                int32x4_t _s0 = vdupq_n_s32(0);
                int32x4_t _s1 = vdupq_n_s32(0);
                int32x4_t _s2 = vdupq_n_s32(0);
                int32x4_t _s3 = vdupq_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA0 = vld1q_s8(pA);
                    int8x16_t _pA1 = vld1q_s8(pA + 16);
                    int8x16_t _pA2 = vld1q_s8(pA + 32);
                    int8x16_t _pA3 = vld1q_s8(pA + 48);

                    int8x16_t _pB = vld1q_s8(pB);

#if __ARM_FEATURE_MATMUL_INT8
                    // aaaaaaaa bbbbbbbb ..... hhhhhhhh
                    // 00000000 11111111

                    // Each call multiplies two 8-deep A groups with the two
                    // 8-deep B groups and adds the four dot products to _sN.
                    _s0 = vmmlaq_s32(_s0, _pA0, _pB);
                    _s1 = vmmlaq_s32(_s1, _pA1, _pB);
                    _s2 = vmmlaq_s32(_s2, _pA2, _pB);
                    _s3 = vmmlaq_s32(_s3, _pA3, _pB);
#else  // __ARM_FEATURE_MATMUL_INT8
                    // First 4 k steps: _pA0/_pA1 against 32-bit lanes 0 and 1 of B.
                    _sum0 = vdotq_laneq_s32(_sum0, _pA0, _pB, 0);
                    _sum1 = vdotq_laneq_s32(_sum1, _pA0, _pB, 1);
                    _sum2 = vdotq_laneq_s32(_sum2, _pA1, _pB, 0);
                    _sum3 = vdotq_laneq_s32(_sum3, _pA1, _pB, 1);

                    // Next 4 k steps: _pA2/_pA3 against 32-bit lanes 2 and 3 of B.
                    _sum0 = vdotq_laneq_s32(_sum0, _pA2, _pB, 2);
                    _sum1 = vdotq_laneq_s32(_sum1, _pA2, _pB, 3);
                    _sum2 = vdotq_laneq_s32(_sum2, _pA3, _pB, 2);
                    _sum3 = vdotq_laneq_s32(_sum3, _pA3, _pB, 3);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 64;
                    pB += 16;
                }
#if __ARM_FEATURE_MATMUL_INT8
                // De-interleave the 2x2 blocks: even lanes belong to the first
                // B group, odd lanes to the second, matching the _sum0/_sum2
                // versus _sum1/_sum3 split used by the dot-product loops below.
                int32x4x2_t _ss0 = vuzpq_s32(_s0, _s1);
                int32x4x2_t _ss1 = vuzpq_s32(_s2, _s3);
                _sum0 = vaddq_s32(_sum0, _ss0.val[0]);
                _sum1 = vaddq_s32(_sum1, _ss0.val[1]);
                _sum2 = vaddq_s32(_sum2, _ss1.val[0]);
                _sum3 = vaddq_s32(_sum3, _ss1.val[1]);
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#endif // __ARM_FEATURE_DOTPROD
            // 4 k steps per iteration: 32 bytes of A and 8 bytes of B.
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA1 = vld1q_s8(pA + 16);
                int8x8_t _pB = vld1_s8(pB);

                // aaaa bbbb cccc dddd eeee ffff gggg hhhh

                // 0000 1111

                _sum0 = vdotq_lane_s32(_sum0, _pA0, _pB, 0);
                _sum1 = vdotq_lane_s32(_sum1, _pA0, _pB, 1);
                _sum2 = vdotq_lane_s32(_sum2, _pA1, _pB, 0);
                _sum3 = vdotq_lane_s32(_sum3, _pA1, _pB, 1);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA2 = vld1q_s8(pA + 16);
                int8x8_t _pB = vld1_s8(pB);

                // aabbccdd eeffgghh   aabbccdd eeffgghh

                // 00112233 -> 00110011 22332233

                // 11001100 33223322

                // Duplicate each 32-bit half of B so it lines up with a full
                // 8-byte half of A, then build a copy with the 16-bit pairs
                // swapped (_pB13).
                // NOTE(review): in this path each accumulator lane pairs A and B
                // differently from the dot-product path; presumably the code that
                // consumes outptr accounts for that ordering - confirm.
                int32x2x2_t _pBB = vzip_s32(vreinterpret_s32_s8(_pB), vreinterpret_s32_s8(_pB));
                int8x16_t _pB02 = vreinterpretq_s8_s32(vcombine_s32(_pBB.val[0], _pBB.val[1]));

                int8x16_t _pB13 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB02)));

                // Widening multiply of the first two k steps, then multiply-add
                // of the next two; vpadalq_s16 sums adjacent 16-bit products
                // into the 32-bit accumulators.
                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB02));
                int16x8_t _s1 = vmull_s8(vget_high_s8(_pA0), vget_low_s8(_pB02));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB13));
                int16x8_t _s3 = vmull_s8(vget_high_s8(_pA0), vget_low_s8(_pB13));
                _s0 = vmlal_s8(_s0, vget_low_s8(_pA2), vget_high_s8(_pB02));
                _s1 = vmlal_s8(_s1, vget_high_s8(_pA2), vget_high_s8(_pB02));
                _s2 = vmlal_s8(_s2, vget_low_s8(_pA2), vget_high_s8(_pB13));
                _s3 = vmlal_s8(_s3, vget_high_s8(_pA2), vget_high_s8(_pB13));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
#endif // __ARM_FEATURE_DOTPROD

                pA += 32;
                pB += 8;
            }
            // 2 k steps per iteration: 16 bytes of A and 4 bytes of B.
            for (; kk + 1 < max_kk; kk += 2)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                // Broadcast the 4 B bytes to both 32-bit halves, viewed as
                // four 16-bit pairs.
                int16x4_t _pB = vreinterpret_s16_s32(vld1_dup_s32((const int*)pB));

                // Split even and odd 16-bit pairs: _pB0 repeats the first byte
                // pair four times, _pB1 repeats the second byte pair.
                int16x4x2_t _pB01 = vuzp_s16(_pB, _pB);
                int8x8_t _pB0 = vreinterpret_s8_s16(_pB01.val[0]);
                int8x8_t _pB1 = vreinterpret_s8_s16(_pB01.val[1]);

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA), _pB0);
                int16x8_t _s1 = vmull_s8(vget_low_s8(_pA), _pB1);
                int16x8_t _s2 = vmull_s8(vget_high_s8(_pA), _pB0);
                int16x8_t _s3 = vmull_s8(vget_high_s8(_pA), _pB1);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                int8x8_t _pB0 = vreinterpret_s8_s32(vld1_dup_s32((const int*)pB));

                // aabbccdd eeffgghh

                // 00110011
                // 11001100

                // _pB1 is _pB0 with its 16-bit pairs swapped.
                int8x8_t _pB1 = vreinterpret_s8_s16(vrev64_s16(vreinterpret_s16_s8(_pB0)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA), _pB0);
                int16x8_t _s1 = vmull_s8(vget_high_s8(_pA), _pB0);
                int16x8_t _s2 = vmull_s8(vget_low_s8(_pA), _pB1);
                int16x8_t _s3 = vmull_s8(vget_high_s8(_pA), _pB1);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
#endif // __ARM_FEATURE_DOTPROD

                pA += 16;
                pB += 4;
            }
            // Final single k step: 8 bytes of A and 2 bytes of B. Products are
            // widened to 16 bit and added to the accumulators without pairing.
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB = vreinterpret_s8_s16(vld1_dup_s16((const short*)pB));

                // val[0] repeats the first B byte eight times, val[1] the second.
                int8x8x2_t _pB01 = vuzp_s8(_pB, _pB);

                int16x8_t _s0 = vmull_s8(_pA, _pB01.val[0]);
                int16x8_t _s1 = vmull_s8(_pA, _pB01.val[1]);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_low_s16(_s1));
                _sum2 = vaddw_s16(_sum2, vget_high_s16(_s0));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1));
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB0 = vreinterpret_s8_s16(vld1_dup_s16((const short*)pB));

                // abcdefgh

                // 01010101
                // 10101010
                // Rotating by one byte swaps the two B values in every pair.
                int8x8_t _pB1 = vext_s8(_pB0, _pB0, 1);

                int16x8_t _s0 = vmull_s8(_pA, _pB0);
                int16x8_t _s1 = vmull_s8(_pA, _pB1);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s1));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1));
#endif // __ARM_FEATURE_DOTPROD

                pA += 8;
                pB += 2;
            }

            // Write the 16 accumulated int32 values back and advance outptr.
            vst1q_s32(outptr, _sum0);
            vst1q_s32(outptr + 4, _sum1);
            vst1q_s32(outptr + 8, _sum2);
            vst1q_s32(outptr + 12, _sum3);

            outptr += 16;
        }
        // Last tail of the jj loop: one jj position per iteration. Every k step
        // consumes 8 bytes from pA and 1 byte from pB (see the pointer
        // increments at the end of each kk loop below); the 8 int32 results
        // live in _sum0/_sum1 and are written to outptr. pA restarts from pAT
        // on every iteration while pB is not reset here.
        for (; jj < max_jj; jj += 1)
        {
            const signed char* pA = pAT;

            int32x4_t _sum0;
            int32x4_t _sum1;

            // k == 0 starts from zero; otherwise resume from the partial sums
            // previously stored at outptr.
            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
            }

            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
                // 8 k steps per iteration: 64 bytes of A and 8 bytes of B.
#if __ARM_FEATURE_MATMUL_INT8
                // With the i8mm data layout each 16-byte A vector holds two
                // 8-deep groups, so a 4-byte dot product only yields half of
                // each result; the halves are kept in temporaries and combined
                // after the loop.
                int32x4_t _s0 = vdupq_n_s32(0);
                int32x4_t _s1 = vdupq_n_s32(0);
                int32x4_t _s2 = vdupq_n_s32(0);
                int32x4_t _s3 = vdupq_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA0 = vld1q_s8(pA);
                    int8x16_t _pA1 = vld1q_s8(pA + 16);
                    int8x16_t _pA2 = vld1q_s8(pA + 32);
                    int8x16_t _pA3 = vld1q_s8(pA + 48);

                    int8x8_t _pB = vld1_s8(pB);

#if __ARM_FEATURE_MATMUL_INT8
                    // aaaaaaaa bbbbbbbb ..... hhhhhhhh
                    // 00000000
                    // Repeat the 8 B bytes in both halves so they line up with
                    // both 8-deep A groups of each vector.
                    int8x16_t _pBB = vcombine_s8(_pB, _pB);

                    _s0 = vdotq_s32(_s0, _pA0, _pBB);
                    _s1 = vdotq_s32(_s1, _pA1, _pBB);
                    _s2 = vdotq_s32(_s2, _pA2, _pBB);
                    _s3 = vdotq_s32(_s3, _pA3, _pBB);
#else  // __ARM_FEATURE_MATMUL_INT8
                    // First 4 k steps use 32-bit lane 0 of B, the next 4 use lane 1.
                    _sum0 = vdotq_lane_s32(_sum0, _pA0, _pB, 0);
                    _sum1 = vdotq_lane_s32(_sum1, _pA1, _pB, 0);
                    _sum0 = vdotq_lane_s32(_sum0, _pA2, _pB, 1);
                    _sum1 = vdotq_lane_s32(_sum1, _pA3, _pB, 1);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 64;
                    pB += 8;
                }
#if __ARM_FEATURE_MATMUL_INT8
                // Pairwise add joins the two half dot products of each group.
                _sum0 = vaddq_s32(_sum0, vpaddq_s32(_s0, _s1));
                _sum1 = vaddq_s32(_sum1, vpaddq_s32(_s2, _s3));
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#endif // __ARM_FEATURE_DOTPROD
            // 4 k steps per iteration: 32 bytes of A and 4 bytes of B.
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA1 = vld1q_s8(pA + 16);

                // Broadcast the 4 B bytes into both 32-bit lanes.
                int8x8_t _pB = vreinterpret_s8_s32(vld1_dup_s32((const int*)pB));

                // aaaa bbbb cccc dddd eeee ffff gggg hhhh

                // 0000 0000

                _sum0 = vdotq_lane_s32(_sum0, _pA0, _pB, 0);
                _sum1 = vdotq_lane_s32(_sum1, _pA1, _pB, 0);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA2 = vld1q_s8(pA + 16);
                // _pB0 repeats B bytes 0-1 across the vector, _pB1 repeats
                // B bytes 2-3.
                int8x8_t _pB0 = vreinterpret_s8_s16(vld1_dup_s16((const short*)pB));
                int8x8_t _pB1 = vreinterpret_s8_s16(vld1_dup_s16((const short*)(pB + 2)));

                // Multiply the first 16 A bytes by the first B pair, multiply-add
                // the next 16 A bytes by the second pair, then sum adjacent
                // 16-bit products into the 32-bit accumulators.
                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA0), _pB0);
                int16x8_t _s1 = vmull_s8(vget_high_s8(_pA0), _pB0);
                _s0 = vmlal_s8(_s0, vget_low_s8(_pA2), _pB1);
                _s1 = vmlal_s8(_s1, vget_high_s8(_pA2), _pB1);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
#endif // __ARM_FEATURE_DOTPROD

                pA += 32;
                pB += 4;
            }
            // 2 k steps per iteration: 16 bytes of A and 2 bytes of B, which are
            // repeated across the whole B vector. Same code for all feature levels.
            for (; kk + 1 < max_kk; kk += 2)
            {
                int8x16_t _pA = vld1q_s8(pA);
                int8x8_t _pB = vreinterpret_s8_s16(vld1_dup_s16((const short*)pB));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA), _pB);
                int16x8_t _s1 = vmull_s8(vget_high_s8(_pA), _pB);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);

                pA += 16;
                pB += 2;
            }
            // Final single k step: 8 A bytes times one broadcast B byte, widened
            // and added lane by lane.
            for (; kk < max_kk; kk += 1)
            {
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB = vld1_dup_s8(pB);

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));

                pA += 8;
                pB += 1;
            }

            // Write the 8 accumulated int32 values back and advance outptr.
            vst1q_s32(outptr, _sum0);
            vst1q_s32(outptr + 4, _sum1);

            outptr += 8;
        }

        pAT += max_kk * 8;
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        const signed char* pB = pBT;

        int jj = 0;
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            const signed char* pA = pAT;

#if NCNN_GNU_INLINE_ASM
            asm volatile(
                "cmp    %w7, #0                     \n"
                "beq    0f                          \n"

                "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0] \n"
                "sub    %0, %0, #64                 \n"
                "b      1f                          \n"

                "0:                                 \n"
                "eor    v16.16b, v16.16b, v16.16b   \n"
                "eor    v17.16b, v17.16b, v17.16b   \n"
                "eor    v18.16b, v18.16b, v18.16b   \n"
                "eor    v19.16b, v19.16b, v19.16b   \n"
                "eor    v20.16b, v20.16b, v20.16b   \n"
                "eor    v21.16b, v21.16b, v21.16b   \n"
                "eor    v22.16b, v22.16b, v22.16b   \n"
                "eor    v23.16b, v23.16b, v23.16b   \n"

                "1:                                 \n"
#if __ARM_FEATURE_DOTPROD
                "lsr    w4, %w6, #3                 \n" // w4 = max_kk >> 3
                "cmp    w4, #0                      \n"
                "beq    101f                        \n"

#if __ARM_FEATURE_MATMUL_INT8
                "eor    v24.16b, v24.16b, v24.16b   \n"
                "eor    v25.16b, v25.16b, v25.16b   \n"
                "eor    v26.16b, v26.16b, v26.16b   \n"
                "eor    v27.16b, v27.16b, v27.16b   \n"
                "eor    v28.16b, v28.16b, v28.16b   \n"
                "eor    v29.16b, v29.16b, v29.16b   \n"
                "eor    v30.16b, v30.16b, v30.16b   \n"
                "eor    v31.16b, v31.16b, v31.16b   \n"
#endif // __ARM_FEATURE_MATMUL_INT8

                "2:                                 \n"
                "ld1    {v0.16b, v1.16b}, [%1], #32 \n"
                "ld1    {v2.16b, v3.16b, v4.16b, v5.16b}, [%2], #64 \n"

#if __ARM_FEATURE_MATMUL_INT8
                "smmla  v24.4s, v0.16b, v2.16b      \n"
                "smmla  v25.4s, v1.16b, v2.16b      \n"
                "smmla  v26.4s, v0.16b, v3.16b      \n"
                "smmla  v27.4s, v1.16b, v3.16b      \n"
                "subs   w4, w4, #1                  \n"
                "smmla  v28.4s, v0.16b, v4.16b      \n"
                "smmla  v29.4s, v1.16b, v4.16b      \n"
                "smmla  v30.4s, v0.16b, v5.16b      \n"
                "smmla  v31.4s, v1.16b, v5.16b      \n"
#else  // __ARM_FEATURE_MATMUL_INT8
                "sdot   v16.4s, v0.16b, v2.4b[0]    \n"
                "sdot   v17.4s, v0.16b, v2.4b[1]    \n"
                "sdot   v18.4s, v0.16b, v2.4b[2]    \n"
                "sdot   v19.4s, v0.16b, v2.4b[3]    \n"
                "sdot   v20.4s, v0.16b, v3.4b[0]    \n"
                "sdot   v21.4s, v0.16b, v3.4b[1]    \n"
                "sdot   v22.4s, v0.16b, v3.4b[2]    \n"
                "sdot   v23.4s, v0.16b, v3.4b[3]    \n"
                "subs   w4, w4, #1                  \n"
                "sdot   v16.4s, v1.16b, v4.4b[0]    \n"
                "sdot   v17.4s, v1.16b, v4.4b[1]    \n"
                "sdot   v18.4s, v1.16b, v4.4b[2]    \n"
                "sdot   v19.4s, v1.16b, v4.4b[3]    \n"
                "sdot   v20.4s, v1.16b, v5.4b[0]    \n"
                "sdot   v21.4s, v1.16b, v5.4b[1]    \n"
                "sdot   v22.4s, v1.16b, v5.4b[2]    \n"
                "sdot   v23.4s, v1.16b, v5.4b[3]    \n"
#endif // __ARM_FEATURE_MATMUL_INT8
                "bne    2b                          \n"

#if __ARM_FEATURE_MATMUL_INT8
                "uzp1   v0.4s, v24.4s, v25.4s       \n"
                "uzp2   v1.4s, v24.4s, v25.4s       \n"
                "uzp1   v2.4s, v26.4s, v27.4s       \n"
                "uzp2   v3.4s, v26.4s, v27.4s       \n"
                "uzp1   v4.4s, v28.4s, v29.4s       \n"
                "uzp2   v5.4s, v28.4s, v29.4s       \n"
                "uzp1   v6.4s, v30.4s, v31.4s       \n"
                "uzp2   v7.4s, v30.4s, v31.4s       \n"

                "add    v16.4s, v16.4s, v0.4s       \n"
                "add    v17.4s, v17.4s, v1.4s       \n"
                "add    v18.4s, v18.4s, v2.4s       \n"
                "add    v19.4s, v19.4s, v3.4s       \n"
                "add    v20.4s, v20.4s, v4.4s       \n"
                "add    v21.4s, v21.4s, v5.4s       \n"
                "add    v22.4s, v22.4s, v6.4s       \n"
                "add    v23.4s, v23.4s, v7.4s       \n"
#endif // __ARM_FEATURE_MATMUL_INT8

                "101:                               \n"
                "and    w4, %w6, #4                 \n" // w4 = remain = max_kk & 4
                "cmp    w4, #0                      \n"
                "beq    3f                          \n"

                // kk += 4 part
                "ld1    {v0.16b}, [%1], #16         \n"
                "ld1    {v2.16b, v3.16b}, [%2], #32 \n"
                "sdot   v16.4s, v0.16b, v2.4b[0]    \n"
                "sdot   v17.4s, v0.16b, v2.4b[1]    \n"
                "sdot   v18.4s, v0.16b, v2.4b[2]    \n"
                "sdot   v19.4s, v0.16b, v2.4b[3]    \n"
                "sdot   v20.4s, v0.16b, v3.4b[0]    \n"
                "sdot   v21.4s, v0.16b, v3.4b[1]    \n"
                "sdot   v22.4s, v0.16b, v3.4b[2]    \n"
                "sdot   v23.4s, v0.16b, v3.4b[3]    \n"
#else  // __ARM_FEATURE_DOTPROD
                "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    3f                          \n"

                "2:                                 \n"
                "ld1    {v0.16b}, [%1], #16         \n"
                "ld1    {v4.16b, v5.16b}, [%2], #32 \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull2 v9.8h, v0.16b, v5.16b       \n"
                "rev64  v2.4s, v0.4s                \n"
                "smull  v10.8h, v2.8b, v4.8b        \n"
                "smull2 v11.8h, v2.16b, v5.16b      \n"
                "rev64  v6.8h, v4.8h                \n"
                "smull  v12.8h, v0.8b, v6.8b        \n"
                "smull  v14.8h, v2.8b, v6.8b        \n"
                "rev64  v7.8h, v5.8h                \n"
                "smull2 v13.8h, v0.16b, v7.16b      \n"
                "smull2 v15.8h, v2.16b, v7.16b      \n"
                "ext    v1.16b, v0.16b, v0.16b, #8  \n"
                "ext    v3.16b, v2.16b, v2.16b, #8  \n"
                "smlal  v8.8h, v1.8b, v5.8b         \n"
                "smlal2 v9.8h, v1.16b, v4.16b       \n"
                "smlal  v10.8h, v3.8b, v5.8b        \n"
                "smlal2 v11.8h, v3.16b, v4.16b      \n"
                "smlal  v12.8h, v1.8b, v7.8b        \n"
                "smlal  v14.8h, v3.8b, v7.8b        \n"
                "smlal2 v13.8h, v1.16b, v6.16b      \n"
                "smlal2 v15.8h, v3.16b, v6.16b      \n"
                "subs   w4, w4, #1                  \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
                "sadalp v20.4s, v12.8h              \n"
                "sadalp v22.4s, v14.8h              \n"
                "sadalp v21.4s, v13.8h              \n"
                "sadalp v23.4s, v15.8h              \n"
                "bne    2b                          \n"
#endif // __ARM_FEATURE_DOTPROD

                "3:                                 \n"
                "and    w4, %w6, #2                 \n" // w4 = remain = max_kk & 2
                "cmp    w4, #0                      \n"
                "beq    4f                          \n"

                // kk += 2 part
#if __ARM_FEATURE_DOTPROD
                "ld1    {v0.8b}, [%1], #8           \n"
                "ld1    {v1.16b}, [%2], #16         \n"
                "dup    v4.8h, v1.h[0]              \n"
                "dup    v5.8h, v1.h[1]              \n"
                "dup    v6.8h, v1.h[2]              \n"
                "dup    v7.8h, v1.h[3]              \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull  v9.8h, v0.8b, v5.8b         \n"
                "smull  v10.8h, v0.8b, v6.8b        \n"
                "smull  v11.8h, v0.8b, v7.8b        \n"
                "dup    v4.8h, v1.h[4]              \n"
                "dup    v5.8h, v1.h[5]              \n"
                "dup    v6.8h, v1.h[6]              \n"
                "dup    v7.8h, v1.h[7]              \n"
                "smull  v12.8h, v0.8b, v4.8b        \n"
                "smull  v13.8h, v0.8b, v5.8b        \n"
                "smull  v14.8h, v0.8b, v6.8b        \n"
                "smull  v15.8h, v0.8b, v7.8b        \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
                "sadalp v20.4s, v12.8h              \n"
                "sadalp v21.4s, v13.8h              \n"
                "sadalp v22.4s, v14.8h              \n"
                "sadalp v23.4s, v15.8h              \n"
#else  // __ARM_FEATURE_DOTPROD
                "ld1r   {v0.2d}, [%1]               \n"
                "add    %1, %1, #8                  \n"
                "ld1    {v2.16b}, [%2], #16         \n"
                "rev64  v1.4s, v0.4s                \n"
                "rev64  v3.8h, v2.8h                \n"
                "smull  v8.8h, v0.8b, v2.8b         \n"
                "smull2 v9.8h, v0.16b, v2.16b       \n"
                "smull  v10.8h, v1.8b, v2.8b        \n"
                "smull2 v11.8h, v1.16b, v2.16b      \n"
                "smull  v12.8h, v0.8b, v3.8b        \n"
                "smull2 v13.8h, v0.16b, v3.16b      \n"
                "smull  v14.8h, v1.8b, v3.8b        \n"
                "smull2 v15.8h, v1.16b, v3.16b      \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
                "sadalp v20.4s, v12.8h              \n"
                "sadalp v21.4s, v13.8h              \n"
                "sadalp v22.4s, v14.8h              \n"
                "sadalp v23.4s, v15.8h              \n"
#endif // __ARM_FEATURE_DOTPROD

                "4:                                 \n"
                "and    w4, %w6, #1                 \n" // w4 = remain = max_kk & 1
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // kk += 1 part
#if __ARM_FEATURE_DOTPROD
                "ld1r   {v0.2s}, [%1]               \n"
                "ld1    {v1.8b}, [%2], #8           \n"
                "add    %1, %1, #4                  \n"
                "dup    v8.8h, v1.h[0]              \n"
                "dup    v9.8h, v1.h[1]              \n"
                "dup    v10.8h, v1.h[2]             \n"
                "dup    v11.8h, v1.h[3]             \n"
                "uzp1   v2.8b, v8.8b, v9.8b         \n"
                "uzp2   v3.8b, v8.8b, v9.8b         \n"
                "uzp1   v4.8b, v10.8b, v11.8b       \n"
                "uzp2   v5.8b, v10.8b, v11.8b       \n"
                "smull  v8.8h, v0.8b, v2.8b         \n"
                "smull  v9.8h, v0.8b, v3.8b         \n"
                "smull  v10.8h, v0.8b, v4.8b        \n"
                "smull  v11.8h, v0.8b, v5.8b        \n"
                "saddw  v16.4s, v16.4s, v8.4h       \n"
                "saddw  v17.4s, v17.4s, v9.4h       \n"
                "saddw2 v18.4s, v18.4s, v8.8h       \n"
                "saddw2 v19.4s, v19.4s, v9.8h       \n"
                "saddw  v20.4s, v20.4s, v10.4h      \n"
                "saddw  v21.4s, v21.4s, v11.4h      \n"
                "saddw2 v22.4s, v22.4s, v10.8h      \n"
                "saddw2 v23.4s, v23.4s, v11.8h      \n"
#else  // __ARM_FEATURE_DOTPROD
                "ld1r   {v0.2s}, [%1]               \n"
                "ld1    {v2.8b}, [%2], #8           \n"
                "add    %1, %1, #4                  \n"
                "ext    v1.8b, v0.8b, v0.8b, #2     \n"
                "rev32  v3.8b, v2.8b                \n"
                "smull  v8.8h, v0.8b, v2.8b         \n"
                "smull  v9.8h, v1.8b, v2.8b         \n"
                "smull  v10.8h, v0.8b, v3.8b        \n"
                "smull  v11.8h, v1.8b, v3.8b        \n"
                "saddw  v16.4s, v16.4s, v8.4h       \n"
                "saddw2 v17.4s, v17.4s, v8.8h       \n"
                "saddw  v18.4s, v18.4s, v9.4h       \n"
                "saddw2 v19.4s, v19.4s, v9.8h       \n"
                "saddw  v20.4s, v20.4s, v10.4h      \n"
                "saddw2 v21.4s, v21.4s, v10.8h      \n"
                "saddw  v22.4s, v22.4s, v11.4h      \n"
                "saddw2 v23.4s, v23.4s, v11.8h      \n"
#endif // __ARM_FEATURE_DOTPROD

                "5:                                 \n"
                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
                "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB)      // %2
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "r"(max_kk), // %6
                "r"(k)       // %7
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else // NCNN_GNU_INLINE_ASM
            int32x4_t _sum0;
            int32x4_t _sum1;
            int32x4_t _sum2;
            int32x4_t _sum3;
            int32x4_t _sum4;
            int32x4_t _sum5;
            int32x4_t _sum6;
            int32x4_t _sum7;

            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
                _sum2 = vdupq_n_s32(0);
                _sum3 = vdupq_n_s32(0);
                _sum4 = vdupq_n_s32(0);
                _sum5 = vdupq_n_s32(0);
                _sum6 = vdupq_n_s32(0);
                _sum7 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
                _sum2 = vld1q_s32(outptr + 8);
                _sum3 = vld1q_s32(outptr + 12);
                _sum4 = vld1q_s32(outptr + 16);
                _sum5 = vld1q_s32(outptr + 20);
                _sum6 = vld1q_s32(outptr + 24);
                _sum7 = vld1q_s32(outptr + 28);
            }

            int kk = 0;
#if __ARM_FEATURE_MATMUL_INT8
            {
                int32x4_t _sum00 = vdupq_n_s32(0);
                int32x4_t _sum01 = vdupq_n_s32(0);
                int32x4_t _sum10 = vdupq_n_s32(0);
                int32x4_t _sum11 = vdupq_n_s32(0);
                int32x4_t _sum20 = vdupq_n_s32(0);
                int32x4_t _sum21 = vdupq_n_s32(0);
                int32x4_t _sum30 = vdupq_n_s32(0);
                int32x4_t _sum31 = vdupq_n_s32(0);
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA0 = vld1q_s8(pA);
                    int8x16_t _pA1 = vld1q_s8(pA + 16);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);
                    int8x16_t _pB2 = vld1q_s8(pB + 32);
                    int8x16_t _pB3 = vld1q_s8(pB + 48);

                    // aaaaaaaa bbbbbbbb cccccccc dddddddd

                    // 00000000 11111111 22222222 33333333
                    // 44444444 55555555 66666666 77777777

                    _sum00 = vmmlaq_s32(_sum00, _pA0, _pB0);
                    _sum01 = vmmlaq_s32(_sum01, _pA1, _pB0);
                    _sum10 = vmmlaq_s32(_sum10, _pA0, _pB1);
                    _sum11 = vmmlaq_s32(_sum11, _pA1, _pB1);
                    _sum20 = vmmlaq_s32(_sum20, _pA0, _pB2);
                    _sum21 = vmmlaq_s32(_sum21, _pA1, _pB2);
                    _sum30 = vmmlaq_s32(_sum30, _pA0, _pB3);
                    _sum31 = vmmlaq_s32(_sum31, _pA1, _pB3);

                    // a0 a1 b0 b1
                    // c0 c1 d0 d1
                    // a2 a3 b2 b3
                    // c2 c3 d2 d3
                    // a4 a5 b4 b5
                    // c4 c5 d4 d5
                    // a6 a7 b6 b7
                    // c6 c7 d6 d7

                    pA += 32;
                    pB += 64;
                }
                int32x4x2_t _ss0 = vuzpq_s32(_sum00, _sum01);
                int32x4x2_t _ss1 = vuzpq_s32(_sum10, _sum11);
                int32x4x2_t _ss2 = vuzpq_s32(_sum20, _sum21);
                int32x4x2_t _ss3 = vuzpq_s32(_sum30, _sum31);
                _sum0 = vaddq_s32(_sum0, _ss0.val[0]);
                _sum1 = vaddq_s32(_sum1, _ss0.val[1]);
                _sum2 = vaddq_s32(_sum2, _ss1.val[0]);
                _sum3 = vaddq_s32(_sum3, _ss1.val[1]);
                _sum4 = vaddq_s32(_sum4, _ss2.val[0]);
                _sum5 = vaddq_s32(_sum5, _ss2.val[1]);
                _sum6 = vaddq_s32(_sum6, _ss3.val[0]);
                _sum7 = vaddq_s32(_sum7, _ss3.val[1]);
            }
#elif __ARM_FEATURE_DOTPROD
            for (; kk + 7 < max_kk; kk += 8)
            {
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA1 = vld1q_s8(pA + 16);
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB1 = vld1q_s8(pB + 16);
                int8x16_t _pB2 = vld1q_s8(pB + 32);
                int8x16_t _pB3 = vld1q_s8(pB + 48);

                _sum0 = vdotq_laneq_s32(_sum0, _pA0, _pB0, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA0, _pB0, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA0, _pB0, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA0, _pB0, 3);
                _sum4 = vdotq_laneq_s32(_sum4, _pA0, _pB1, 0);
                _sum5 = vdotq_laneq_s32(_sum5, _pA0, _pB1, 1);
                _sum6 = vdotq_laneq_s32(_sum6, _pA0, _pB1, 2);
                _sum7 = vdotq_laneq_s32(_sum7, _pA0, _pB1, 3);

                _sum0 = vdotq_laneq_s32(_sum0, _pA1, _pB2, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA1, _pB2, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA1, _pB2, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA1, _pB2, 3);
                _sum4 = vdotq_laneq_s32(_sum4, _pA1, _pB3, 0);
                _sum5 = vdotq_laneq_s32(_sum5, _pA1, _pB3, 1);
                _sum6 = vdotq_laneq_s32(_sum6, _pA1, _pB3, 2);
                _sum7 = vdotq_laneq_s32(_sum7, _pA1, _pB3, 3);

                pA += 32;
                pB += 64;
            }
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB1 = vld1q_s8(pB + 16);

                _sum0 = vdotq_laneq_s32(_sum0, _pA, _pB0, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA, _pB0, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA, _pB0, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA, _pB0, 3);
                _sum4 = vdotq_laneq_s32(_sum4, _pA, _pB1, 0);
                _sum5 = vdotq_laneq_s32(_sum5, _pA, _pB1, 1);
                _sum6 = vdotq_laneq_s32(_sum6, _pA, _pB1, 2);
                _sum7 = vdotq_laneq_s32(_sum7, _pA, _pB1, 3);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA02 = vld1q_s8(pA);
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB2 = vld1q_s8(pB + 16);

                int8x16_t _pA13 = vreinterpretq_s8_s32(vrev64q_s32(vreinterpretq_s32_s8(_pA02)));

                int8x16_t _pB1 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB0)));
                int8x16_t _pB3 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB2)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA02), vget_low_s8(_pB0));
                int16x8_t _s1 = vmull_s8(vget_low_s8(_pA02), vget_high_s8(_pB0));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_pA13), vget_low_s8(_pB0));
                int16x8_t _s3 = vmull_s8(vget_low_s8(_pA13), vget_high_s8(_pB0));
                int16x8_t _s4 = vmull_s8(vget_low_s8(_pA02), vget_low_s8(_pB1));
                int16x8_t _s5 = vmull_s8(vget_low_s8(_pA02), vget_high_s8(_pB1));
                int16x8_t _s6 = vmull_s8(vget_low_s8(_pA13), vget_low_s8(_pB1));
                int16x8_t _s7 = vmull_s8(vget_low_s8(_pA13), vget_high_s8(_pB1));

                _s0 = vmlal_s8(_s0, vget_high_s8(_pA02), vget_low_s8(_pB2));
                _s1 = vmlal_s8(_s1, vget_high_s8(_pA02), vget_high_s8(_pB2));
                _s2 = vmlal_s8(_s2, vget_high_s8(_pA13), vget_low_s8(_pB2));
                _s3 = vmlal_s8(_s3, vget_high_s8(_pA13), vget_high_s8(_pB2));
                _s4 = vmlal_s8(_s4, vget_high_s8(_pA02), vget_low_s8(_pB3));
                _s5 = vmlal_s8(_s5, vget_high_s8(_pA02), vget_high_s8(_pB3));
                _s6 = vmlal_s8(_s6, vget_high_s8(_pA13), vget_low_s8(_pB3));
                _s7 = vmlal_s8(_s7, vget_high_s8(_pA13), vget_high_s8(_pB3));

                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
#endif // __ARM_FEATURE_DOTPROD

                pA += 16;
                pB += 32;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vld1_s8(pA);
                int8x16_t _pB01 = vld1q_s8(pB);

                // aabbccdd

                // 00112233 44556677

                int16x8_t _s0 = vmull_s8(_pA0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB01)), 0)));
                int16x8_t _s1 = vmull_s8(_pA0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB01)), 1)));
                int16x8_t _s2 = vmull_s8(_pA0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB01)), 2)));
                int16x8_t _s3 = vmull_s8(_pA0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pB01)), 3)));
                int16x8_t _s4 = vmull_s8(_pA0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB01)), 0)));
                int16x8_t _s5 = vmull_s8(_pA0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB01)), 1)));
                int16x8_t _s6 = vmull_s8(_pA0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB01)), 2)));
                int16x8_t _s7 = vmull_s8(_pA0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pB01)), 3)));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vld1_s8(pA);
                int8x16_t _pB0 = vld1q_s8(pB);

                // aabbccdd
                // ccddaabb

                int8x8_t _pA1 = vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(_pA0)));

                // 00112233 44556677
                // 33221100 77665544

                int8x16_t _pB1 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB0)));

                int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB0));
                int16x8_t _s1 = vmull_s8(_pA0, vget_high_s8(_pB0));
                int16x8_t _s2 = vmull_s8(_pA1, vget_low_s8(_pB0));
                int16x8_t _s3 = vmull_s8(_pA1, vget_high_s8(_pB0));
                int16x8_t _s4 = vmull_s8(_pA0, vget_low_s8(_pB1));
                int16x8_t _s5 = vmull_s8(_pA0, vget_high_s8(_pB1));
                int16x8_t _s6 = vmull_s8(_pA1, vget_low_s8(_pB1));
                int16x8_t _s7 = vmull_s8(_pA1, vget_high_s8(_pB1));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
                _sum4 = vpadalq_s16(_sum4, _s4);
                _sum5 = vpadalq_s16(_sum5, _s5);
                _sum6 = vpadalq_s16(_sum6, _s6);
                _sum7 = vpadalq_s16(_sum7, _s7);
#endif // __ARM_FEATURE_DOTPROD

                pA += 8;
                pB += 16;
            }
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pAA = vreinterpret_s8_s32(vld1_dup_s32((const int*)pA));
                int8x8_t _pB = vld1_s8(pB);

                // abcdabcd
                // 01234567  ->  01010101 23232323 45454545 67676767
                int8x8_t _pB0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 0));
                int8x8_t _pB2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 1));
                int8x8_t _pB4 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 2));
                int8x8_t _pB6 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 3));

                int8x8x2_t _pB0123 = vuzp_s8(_pB0, _pB2);
                int8x8x2_t _pB4567 = vuzp_s8(_pB4, _pB6);

                int16x8_t _s02 = vmull_s8(_pAA, _pB0123.val[0]);
                int16x8_t _s13 = vmull_s8(_pAA, _pB0123.val[1]);
                int16x8_t _s46 = vmull_s8(_pAA, _pB4567.val[0]);
                int16x8_t _s57 = vmull_s8(_pAA, _pB4567.val[1]);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s02));
                _sum1 = vaddw_s16(_sum1, vget_low_s16(_s13));
                _sum2 = vaddw_s16(_sum2, vget_high_s16(_s02));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s13));
                _sum4 = vaddw_s16(_sum4, vget_low_s16(_s46));
                _sum5 = vaddw_s16(_sum5, vget_low_s16(_s57));
                _sum6 = vaddw_s16(_sum6, vget_high_s16(_s46));
                _sum7 = vaddw_s16(_sum7, vget_high_s16(_s57));
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vreinterpret_s8_s32(vld1_dup_s32((const int*)pA));
                int8x8_t _pB0 = vld1_s8(pB);

                // abcd abcd
                // cdab cdab

                int8x8_t _pA1 = vext_s8(_pA0, _pA0, 2);

                // 0123 4567
                // 3210 7654

                int8x8_t _pB1 = vrev32_s8(_pB0);

                int16x8_t _s01 = vmull_s8(_pA0, _pB0);
                int16x8_t _s23 = vmull_s8(_pA1, _pB0);
                int16x8_t _s45 = vmull_s8(_pA0, _pB1);
                int16x8_t _s67 = vmull_s8(_pA1, _pB1);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s01));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s23));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s23));
                _sum4 = vaddw_s16(_sum4, vget_low_s16(_s45));
                _sum5 = vaddw_s16(_sum5, vget_high_s16(_s45));
                _sum6 = vaddw_s16(_sum6, vget_low_s16(_s67));
                _sum7 = vaddw_s16(_sum7, vget_high_s16(_s67));
#endif // __ARM_FEATURE_DOTPROD

                pA += 4;
                pB += 8;
            }

            vst1q_s32(outptr, _sum0);
            vst1q_s32(outptr + 4, _sum1);
            vst1q_s32(outptr + 8, _sum2);
            vst1q_s32(outptr + 12, _sum3);
            vst1q_s32(outptr + 16, _sum4);
            vst1q_s32(outptr + 20, _sum5);
            vst1q_s32(outptr + 24, _sum6);
            vst1q_s32(outptr + 28, _sum7);

            outptr += 32;
#endif // NCNN_GNU_INLINE_ASM
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            const signed char* pA = pAT;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                "cmp    %w7, #0                     \n"
                "beq    0f                          \n"

                "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0] \n"
                "b      1f                          \n"

                "0:                                 \n"
                "eor    v16.16b, v16.16b, v16.16b   \n"
                "eor    v17.16b, v17.16b, v17.16b   \n"
                "eor    v18.16b, v18.16b, v18.16b   \n"
                "eor    v19.16b, v19.16b, v19.16b   \n"

                "1:                                 \n"
#if __ARM_FEATURE_DOTPROD
                "lsr    w4, %w6, #3                 \n" // w4 = max_kk >> 3
                "cmp    w4, #0                      \n"
                "beq    101f                        \n"

#if __ARM_FEATURE_MATMUL_INT8
                "eor    v24.16b, v24.16b, v24.16b   \n"
                "eor    v25.16b, v25.16b, v25.16b   \n"
                "eor    v26.16b, v26.16b, v26.16b   \n"
                "eor    v27.16b, v27.16b, v27.16b   \n"
#endif // __ARM_FEATURE_MATMUL_INT8

                "2:                                 \n"
                "ld1    {v0.16b, v1.16b}, [%1], #32 \n"
                "ld1    {v4.16b, v5.16b}, [%2], #32 \n"

#if __ARM_FEATURE_MATMUL_INT8
                "smmla  v24.4s, v0.16b, v4.16b      \n"
                "smmla  v25.4s, v1.16b, v4.16b      \n"
                "subs   w4, w4, #1                  \n"
                "smmla  v26.4s, v0.16b, v5.16b      \n"
                "smmla  v27.4s, v1.16b, v5.16b      \n"
#else  // __ARM_FEATURE_MATMUL_INT8
                "sdot   v16.4s, v0.16b, v4.4b[0]    \n"
                "sdot   v17.4s, v0.16b, v4.4b[1]    \n"
                "sdot   v18.4s, v0.16b, v4.4b[2]    \n"
                "sdot   v19.4s, v0.16b, v4.4b[3]    \n"
                "subs   w4, w4, #1                  \n"
                "sdot   v16.4s, v1.16b, v5.4b[0]    \n"
                "sdot   v17.4s, v1.16b, v5.4b[1]    \n"
                "sdot   v18.4s, v1.16b, v5.4b[2]    \n"
                "sdot   v19.4s, v1.16b, v5.4b[3]    \n"
#endif // __ARM_FEATURE_MATMUL_INT8
                "bne    2b                          \n"

#if __ARM_FEATURE_MATMUL_INT8
                "uzp1   v0.4s, v24.4s, v25.4s       \n"
                "uzp2   v1.4s, v24.4s, v25.4s       \n"
                "uzp1   v2.4s, v26.4s, v27.4s       \n"
                "uzp2   v3.4s, v26.4s, v27.4s       \n"

                "add    v16.4s, v16.4s, v0.4s       \n"
                "add    v17.4s, v17.4s, v1.4s       \n"
                "add    v18.4s, v18.4s, v2.4s       \n"
                "add    v19.4s, v19.4s, v3.4s       \n"
#endif // __ARM_FEATURE_MATMUL_INT8

                "101:                               \n"
                "and    w4, %w6, #4                 \n" // w4 = remain = max_kk & 4
                "cmp    w4, #0                      \n"
                "beq    3f                          \n"

                // kk += 4 part
                "ld1    {v0.16b}, [%1], #16         \n"
                "ld1    {v2.16b}, [%2], #16         \n"
                "sdot   v16.4s, v0.16b, v2.4b[0]    \n"
                "sdot   v17.4s, v0.16b, v2.4b[1]    \n"
                "sdot   v18.4s, v0.16b, v2.4b[2]    \n"
                "sdot   v19.4s, v0.16b, v2.4b[3]    \n"
#else  // __ARM_FEATURE_DOTPROD
                "lsr    w4, %w6, #2                 \n" // w4 = max_kk >> 2
                "cmp    w4, #0                      \n"
                "beq    3f                          \n"

                "2:                                 \n"
                "ld1    {v0.16b}, [%1], #16         \n"
                "ld1    {v4.16b}, [%2], #16         \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "rev64  v1.4s, v0.4s                \n"
                "smull  v9.8h, v1.8b, v4.8b         \n"
                "rev64  v5.8h, v4.8h                \n"
                "smull  v10.8h, v0.8b, v5.8b        \n"
                "smull  v11.8h, v1.8b, v5.8b        \n"
                "smlal2 v8.8h, v0.16b, v4.16b       \n"
                "smlal2 v9.8h, v1.16b, v4.16b       \n"
                "smlal2 v10.8h, v0.16b, v5.16b      \n"
                "smlal2 v11.8h, v1.16b, v5.16b      \n"
                "subs   w4, w4, #1                  \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
                "bne    2b                          \n"
#endif // __ARM_FEATURE_DOTPROD

                "3:                                 \n"
                "and    w4, %w6, #2                 \n" // w4 = remain = max_kk & 2
                "cmp    w4, #0                      \n"
                "beq    4f                          \n"

                // kk += 2 part
#if __ARM_FEATURE_DOTPROD
                "ld1    {v0.8b}, [%1], #8           \n"
                "ld1    {v1.8b}, [%2], #8           \n"
                "dup    v4.4h, v1.h[0]              \n"
                "dup    v5.4h, v1.h[1]              \n"
                "dup    v6.4h, v1.h[2]              \n"
                "dup    v7.4h, v1.h[3]              \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull  v9.8h, v0.8b, v5.8b         \n"
                "smull  v10.8h, v0.8b, v6.8b        \n"
                "smull  v11.8h, v0.8b, v7.8b        \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
#else  // __ARM_FEATURE_DOTPROD
                "ld1    {v0.8b}, [%1], #8           \n"
                "ld1    {v2.8b}, [%2], #8           \n"
                "ext    v1.8b, v0.8b, v0.8b, #4     \n"
                "rev64  v3.4h, v2.4h                \n"
                "smull  v8.8h, v0.8b, v2.8b         \n"
                "smull  v9.8h, v1.8b, v2.8b         \n"
                "smull  v10.8h, v0.8b, v3.8b        \n"
                "smull  v11.8h, v1.8b, v3.8b        \n"
                "sadalp v16.4s, v8.8h               \n"
                "sadalp v17.4s, v9.8h               \n"
                "sadalp v18.4s, v10.8h              \n"
                "sadalp v19.4s, v11.8h              \n"
#endif // __ARM_FEATURE_DOTPROD

                "4:                                 \n"
                "and    w4, %w6, #1                 \n" // w4 = remain = max_kk & 1
                "cmp    w4, #0                      \n"
                "beq    5f                          \n"

                // kk += 1 part
#if __ARM_FEATURE_DOTPROD
                "ld1r   {v0.2s}, [%1]               \n"
                "ld1r   {v1.2s}, [%2]               \n"
                "add    %1, %1, #4                  \n"
                "add    %2, %2, #4                  \n"
                "zip1   v1.8b, v1.8b, v1.8b         \n"
                "zip1   v2.4h, v1.4h, v1.4h         \n"
                "zip2   v3.4h, v1.4h, v1.4h         \n"
                "smull  v8.8h, v0.8b, v2.8b         \n"
                "smull  v9.8h, v0.8b, v3.8b         \n"
                "saddw  v16.4s, v16.4s, v8.4h       \n"
                "saddw2 v17.4s, v17.4s, v8.8h       \n"
                "saddw  v18.4s, v18.4s, v9.4h       \n"
                "saddw2 v19.4s, v19.4s, v9.8h       \n"
#else  // __ARM_FEATURE_DOTPROD
                "ld1    {v0.8b}, [%1]               \n"
                "ld1r   {v4.2s}, [%2]               \n"
                "add    %1, %1, #4                  \n"
                "add    %2, %2, #4                  \n"
                "rev32  v1.4h, v0.4h                \n"
                "zip1   v0.2s, v0.2s, v1.2s         \n"
                "rev32  v5.8b, v4.8b                \n"
                "smull  v8.8h, v0.8b, v4.8b         \n"
                "smull  v9.8h, v0.8b, v5.8b         \n"
                "saddw  v16.4s, v16.4s, v8.4h       \n"
                "saddw2 v17.4s, v17.4s, v8.8h       \n"
                "saddw  v18.4s, v18.4s, v9.4h       \n"
                "saddw2 v19.4s, v19.4s, v9.8h       \n"
#endif // __ARM_FEATURE_DOTPROD

                "5:                                 \n"
                "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB)      // %2
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "r"(max_kk), // %6
                "r"(k)       // %7
                : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
            // ARMv7 NEON path: keeps 16 int32 partial sums in q8-q11 (d16-d23),
            // updates them over max_kk (%6) depth steps while consuming 4 bytes
            // from pA (%1) and 4 bytes from pB (%2) per step, then stores the
            // 16 sums through outptr (%0) with write-back.
            // NOTE(review): the sums are kept in an interleaved lane order
            // (products of A against B, rev64'd A against B, A against rev64'd B, ...);
            // presumably a later stage restores the final order - confirm.
            asm volatile(
                // k (%7) == 0: nothing to resume, start from zero.
                // Otherwise reload the 16 partial sums currently stored at outptr
                // (vldm without write-back, so %0 still points at the tile).
                "cmp        %7, #0              \n"
                "beq        0f                  \n"

                "vldm       %0, {d16-d23}       \n"
                "b          1f                  \n"

                "0:                             \n"
                "veor       q8, q8              \n"
                "veor       q9, q9              \n"
                "veor       q10, q10            \n"
                "veor       q11, q11            \n"

                "1:                             \n"
                "lsr        r4, %6, #2          \n" // r4 = max_kk >> 2
                "cmp        r4, #0              \n"
                "beq        3f                  \n"

                // main loop: 4 depth steps per iteration (16 bytes of A, 16 bytes of B)
                ".align 4                       \n"
                "2:                             \n"
                "pld        [%1, #256]          \n"
                "vld1.s8    {d0-d1}, [%1 :64]!  \n"
                "pld        [%2, #128]          \n"
                "vld1.s8    {d4-d5}, [%2]!      \n"
                // q1 = A with the two 32-bit words of each 64-bit half swapped
                "vrev64.32  q1, q0              \n"
                "vmull.s8   q4, d0, d4          \n"
                // q3 = B with the four 16-bit pairs of each 64-bit half reversed
                "vrev64.16  q3, q2              \n"
                "vmull.s8   q5, d2, d4          \n"
                "vmull.s8   q6, d0, d6          \n"
                "vmull.s8   q7, d2, d6          \n"
                // second half of the 4 depth steps is accumulated in int16 first
                "vmlal.s8   q4, d1, d5          \n"
                "vmlal.s8   q5, d3, d5          \n"
                "vmlal.s8   q6, d1, d7          \n"
                "vmlal.s8   q7, d3, d7          \n"
                "subs       r4, r4, #1          \n"
                // pairwise-add adjacent int16 products into the int32 accumulators
                "vpadal.s16 q8, q4              \n"
                "vpadal.s16 q9, q5              \n"
                "vpadal.s16 q10, q6             \n"
                "vpadal.s16 q11, q7             \n"
                "bne        2b                  \n"

                "3:                             \n"
                "and        r4, %6, #2          \n" // r4 = remain = max_kk & 2
                "cmp        r4, #0              \n"
                "beq        4f                  \n"

                // kk += 2 part
                // same pairing as the main loop, on 8 bytes of A and 8 bytes of B
                "vld1.s8    {d0}, [%1 :64]!     \n"
                "vld1.s8    {d4}, [%2]!         \n"
                "vext.8     d1, d0, d0, #4      \n"
                "vrev64.16  d5, d4              \n"
                "vmull.s8   q4, d0, d4          \n"
                "vmull.s8   q5, d1, d4          \n"
                "vmull.s8   q6, d0, d5          \n"
                "vmull.s8   q7, d1, d5          \n"
                "vpadal.s16 q8, q4              \n"
                "vpadal.s16 q9, q5              \n"
                "vpadal.s16 q10, q6             \n"
                "vpadal.s16 q11, q7             \n"

                "4:                             \n"
                "and        r4, %6, #1          \n" // r4 = remain = max_kk & 1
                "cmp        r4, #0              \n"
                "beq        5f                  \n"

                // kk += 1 part
                // Only lane 0 of d0 is loaded (4 bytes of A); the stale upper lane
                // ends up in d1 after vzip.32 and is not used afterwards.
                "vld1.s32   {d0[0]}, [%1]!      \n"
                "vld1.s32   {d2[]}, [%2]!       \n"
                "vrev32.16  d1, d0              \n"
                "vrev32.s8  d3, d2              \n"
                "vzip.32    d0, d1              \n"
                "vmull.s8   q4, d0, d2          \n"
                "vmull.s8   q5, d0, d3          \n"
                // single products: widen int16 -> int32 and add, no pairwise step
                "vaddw.s16  q8, d8              \n"
                "vaddw.s16  q9, d9              \n"
                "vaddw.s16  q10, d10            \n"
                "vaddw.s16  q11, d11            \n"

                "5:                             \n"
                // store the 16 sums and advance outptr by 64 bytes
                "vstm       %0!, {d16-d23}      \n"

                : "=r"(outptr), // %0
                "=r"(pA),     // %1
                "=r"(pB)      // %2
                : "0"(outptr),
                "1"(pA),
                "2"(pB),
                "r"(max_kk), // %6
                "r"(k)       // %7
                : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            // Intrinsics version of the 4x4 tile update (compiled when
            // NCNN_GNU_INLINE_ASM is off): four int32x4 accumulators (16 partial
            // sums) are zeroed or resumed from outptr, updated over max_kk depth
            // steps, stored back, and outptr is advanced by 16.
            int32x4_t _sum0;
            int32x4_t _sum1;
            int32x4_t _sum2;
            int32x4_t _sum3;

            // k == 0: nothing to resume, start from zero; otherwise continue
            // from the partial sums already stored at outptr
            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
                _sum2 = vdupq_n_s32(0);
                _sum3 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
                _sum2 = vld1q_s32(outptr + 8);
                _sum3 = vld1q_s32(outptr + 12);
            }

            int kk = 0;
#if __ARM_FEATURE_MATMUL_INT8
            {
                // i8mm: 8 depth steps per iteration, accumulated in separate
                // 2x2-block registers and folded into _sum0.._sum3 after the loop
                int32x4_t _sum00 = vdupq_n_s32(0);
                int32x4_t _sum01 = vdupq_n_s32(0);
                int32x4_t _sum10 = vdupq_n_s32(0);
                int32x4_t _sum11 = vdupq_n_s32(0);
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA0 = vld1q_s8(pA);
                    int8x16_t _pA1 = vld1q_s8(pA + 16);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);

                    // aaaaaaaa bbbbbbbb cccccccc dddddddd

                    // 00000000 11111111 22222222 33333333

                    _sum00 = vmmlaq_s32(_sum00, _pA0, _pB0);
                    _sum01 = vmmlaq_s32(_sum01, _pA1, _pB0);
                    _sum10 = vmmlaq_s32(_sum10, _pA0, _pB1);
                    _sum11 = vmmlaq_s32(_sum11, _pA1, _pB1);

                    // a0 a1 b0 b1
                    // c0 c1 d0 d1
                    // a2 a3 b2 b3
                    // c2 c3 d2 d3

                    pA += 32;
                    pB += 32;
                }
                // de-interleave the 2x2 blocks: val[0] = a0 b0 c0 d0 (resp. a2 b2 c2 d2),
                // val[1] = a1 b1 c1 d1 (resp. a3 b3 c3 d3), i.e. the same per-accumulator
                // order that the vdotq_laneq_s32 code below produces
                int32x4x2_t _ss0 = vuzpq_s32(_sum00, _sum01);
                int32x4x2_t _ss1 = vuzpq_s32(_sum10, _sum11);
                _sum0 = vaddq_s32(_sum0, _ss0.val[0]);
                _sum1 = vaddq_s32(_sum1, _ss0.val[1]);
                _sum2 = vaddq_s32(_sum2, _ss1.val[0]);
                _sum3 = vaddq_s32(_sum3, _ss1.val[1]);
            }
#elif __ARM_FEATURE_DOTPROD
            // dotprod: 8 depth steps per iteration as two groups of 4;
            // each B lane (4 bytes) is dotted against all four 4-byte groups of A
            for (; kk + 7 < max_kk; kk += 8)
            {
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA1 = vld1q_s8(pA + 16);
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB1 = vld1q_s8(pB + 16);

                _sum0 = vdotq_laneq_s32(_sum0, _pA0, _pB0, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA0, _pB0, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA0, _pB0, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA0, _pB0, 3);

                _sum0 = vdotq_laneq_s32(_sum0, _pA1, _pB1, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA1, _pB1, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA1, _pB1, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA1, _pB1, 3);

                pA += 32;
                pB += 32;
            }
#endif // __ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD
            // 4 depth steps per iteration (16 bytes of A, 16 bytes of B)
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                int8x16_t _pB = vld1q_s8(pB);

                _sum0 = vdotq_laneq_s32(_sum0, _pA, _pB, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _pA, _pB, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _pA, _pB, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _pA, _pB, 3);
#else  // __ARM_FEATURE_DOTPROD
                // No dot-product instruction: multiply byte pairs into int16 and
                // pairwise-accumulate. The accumulators hold the sums in an
                // interleaved order (A vs B, swapped A vs B, A vs reversed B,
                // swapped A vs reversed B); the kk += 2 and kk += 1 tails below
                // use the same order.
                int8x16_t _pA02 = vld1q_s8(pA);
                int8x16_t _pB02 = vld1q_s8(pB);

                // aabbccdd eeffgghh
                // ccddaabb gghheeff

                int8x16_t _pA13 = vreinterpretq_s8_s32(vrev64q_s32(vreinterpretq_s32_s8(_pA02)));

                // 00112233 44556677
                // 33221100 77665544

                int8x16_t _pB13 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB02)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA02), vget_low_s8(_pB02));
                int16x8_t _s1 = vmull_s8(vget_low_s8(_pA13), vget_low_s8(_pB02));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_pA02), vget_low_s8(_pB13));
                int16x8_t _s3 = vmull_s8(vget_low_s8(_pA13), vget_low_s8(_pB13));

                // the high halves carry the remaining two depth steps; they are
                // added in int16 before the widening pairwise accumulate
                _s0 = vmlal_s8(_s0, vget_high_s8(_pA02), vget_high_s8(_pB02));
                _s1 = vmlal_s8(_s1, vget_high_s8(_pA13), vget_high_s8(_pB02));
                _s2 = vmlal_s8(_s2, vget_high_s8(_pA02), vget_high_s8(_pB13));
                _s3 = vmlal_s8(_s3, vget_high_s8(_pA13), vget_high_s8(_pB13));

                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
#endif // __ARM_FEATURE_DOTPROD

                pA += 16;
                pB += 16;
            }
            // 2 depth steps per iteration (8 bytes of A, 8 bytes of B)
            for (; kk + 1 < max_kk; kk += 2)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

                // broadcast each 2-byte pair of B across the vector, multiply with
                // the four 2-byte pairs of A, then pairwise-add into int32
                int16x8_t _s0 = vmull_s8(_pA, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 0)));
                int16x8_t _s1 = vmull_s8(_pA, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 1)));
                int16x8_t _s2 = vmull_s8(_pA, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 2)));
                int16x8_t _s3 = vmull_s8(_pA, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 3)));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vld1_s8(pA);
                int8x8_t _pB0 = vld1_s8(pB);

                // aabbccdd
                // ccddaabb

                int8x8_t _pA1 = vext_s8(_pA0, _pA0, 4);

                // 00112233
                // 33221100

                int8x8_t _pB1 = vreinterpret_s8_s16(vrev64_s16(vreinterpret_s16_s8(_pB0)));

                int16x8_t _s0 = vmull_s8(_pA0, _pB0);
                int16x8_t _s1 = vmull_s8(_pA1, _pB0);
                int16x8_t _s2 = vmull_s8(_pA0, _pB1);
                int16x8_t _s3 = vmull_s8(_pA1, _pB1);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);
#endif // __ARM_FEATURE_DOTPROD

                pA += 8;
                pB += 8;
            }
            // last single depth step (4 bytes of A, 4 bytes of B)
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vreinterpret_s8_s32(vld1_dup_s32((const int*)pA));
                int8x8_t _pB = vreinterpret_s8_s32(vld1_dup_s32((const int*)pB));

                // 01230123 -> 00112233 -> 00001111 / 22223333
                _pB = vzip_s8(_pB, _pB).val[0];
                int16x4x2_t _pB0123 = vzip_s16(vreinterpret_s16_s8(_pB), vreinterpret_s16_s8(_pB));

                // single products, so widen-add each int16x4 half directly
                int16x8_t _s01 = vmull_s8(_pA, vreinterpret_s8_s16(_pB0123.val[0]));
                int16x8_t _s23 = vmull_s8(_pA, vreinterpret_s8_s16(_pB0123.val[1]));
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s01));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s23));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s23));
#else  // __ARM_FEATURE_DOTPROD

                // NOTE(review): vld1_s8 reads 8 bytes but only the first 4 are used
                // (pA advances by 4 below); presumably the packed A buffer is padded
                // enough for this over-read - confirm.
                int8x8_t _pA0 = vld1_s8(pA);
                int8x8_t _pB0 = vreinterpret_s8_s32(vld1_dup_s32((const int*)pB));

                // abcd.... -> cdab.... -> abcdcdab
                int8x8_t _pA1 = vreinterpret_s8_s16(vrev32_s16(vreinterpret_s16_s8(_pA0)));
                int8x8_t _pA01 = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_pA0), vreinterpret_s32_s8(_pA1)).val[0]);

                // 01230123 -> 32103210
                int8x8_t _pB1 = vrev32_s8(_pB0);

                int16x8_t _s01 = vmull_s8(_pA01, _pB0);
                int16x8_t _s23 = vmull_s8(_pA01, _pB1);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s01));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s23));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s23));
#endif // __ARM_FEATURE_DOTPROD

                pA += 4;
                pB += 4;
            }

            // write the 16 updated partial sums back and move to the next tile slot
            vst1q_s32(outptr, _sum0);
            vst1q_s32(outptr + 4, _sum1);
            vst1q_s32(outptr + 8, _sum2);
            vst1q_s32(outptr + 12, _sum3);

            outptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // Remaining pairs of jj: each iteration updates 8 int32 partial sums
        // (two int32x4 accumulators), restarting A from pAT and consuming
        // 4 bytes of A and 2 bytes of B per depth step.
        for (; jj + 1 < max_jj; jj += 2)
        {
            const signed char* pA = pAT;

            int32x4_t _sum0;
            int32x4_t _sum1;

            // k == 0: nothing to resume, start from zero; otherwise continue
            // from the partial sums already stored at outptr
            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
            }

            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_MATMUL_INT8
                // i8mm results are kept as 2x2 blocks and folded in after the loop
                int32x4_t _sum00 = vdupq_n_s32(0);
                int32x4_t _sum01 = vdupq_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                // 8 depth steps per iteration (32 bytes of A, 16 bytes of B)
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA0 = vld1q_s8(pA);
                    int8x16_t _pA1 = vld1q_s8(pA + 16);
                    int8x16_t _pB = vld1q_s8(pB);

#if __ARM_FEATURE_MATMUL_INT8
                    // aaaaaaaa bbbbbbbb cccccccc dddddddd

                    // 00000000 11111111

                    _sum00 = vmmlaq_s32(_sum00, _pA0, _pB);
                    _sum01 = vmmlaq_s32(_sum01, _pA1, _pB);
#else  // __ARM_FEATURE_MATMUL_INT8
                    // B lanes 0/1 pair with the first 16 bytes of A,
                    // lanes 2/3 with the second 16 bytes
                    _sum0 = vdotq_laneq_s32(_sum0, _pA0, _pB, 0);
                    _sum1 = vdotq_laneq_s32(_sum1, _pA0, _pB, 1);
                    _sum0 = vdotq_laneq_s32(_sum0, _pA1, _pB, 2);
                    _sum1 = vdotq_laneq_s32(_sum1, _pA1, _pB, 3);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 32;
                    pB += 16;
                }
#if __ARM_FEATURE_MATMUL_INT8
                // de-interleave: even lanes (a0 b0 c0 d0) go to _sum0,
                // odd lanes (a1 b1 c1 d1) to _sum1
                int32x4x2_t _ss = vuzpq_s32(_sum00, _sum01);
                _sum0 = vaddq_s32(_sum0, _ss.val[0]);
                _sum1 = vaddq_s32(_sum1, _ss.val[1]);
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#endif // __ARM_FEATURE_DOTPROD
            // 4 depth steps per iteration (16 bytes of A, 8 bytes of B)
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

                _sum0 = vdotq_lane_s32(_sum0, _pA, _pB, 0);
                _sum1 = vdotq_lane_s32(_sum1, _pA, _pB, 1);
#else  // __ARM_FEATURE_DOTPROD
                // Without dot-product the two accumulators hold the sums in an
                // alternating order (_sum0: A vs B, _sum1: A vs pair-swapped B);
                // the tails below follow the same order.
                int8x16_t _pA = vld1q_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

                // aabbccdd eeffgghh

                // 00112233 -> 00110011 22332233
                // 11001100 33223322

                int32x2x2_t _pBB = vzip_s32(vreinterpret_s32_s8(_pB), vreinterpret_s32_s8(_pB));
                int8x16_t _pB02 = vreinterpretq_s8_s32(vcombine_s32(_pBB.val[0], _pBB.val[1]));

                int8x16_t _pB13 = vreinterpretq_s8_s16(vrev64q_s16(vreinterpretq_s16_s8(_pB02)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA), vget_low_s8(_pB02));
                int16x8_t _s1 = vmull_s8(vget_low_s8(_pA), vget_low_s8(_pB13));
                _s0 = vmlal_s8(_s0, vget_high_s8(_pA), vget_high_s8(_pB02));
                _s1 = vmlal_s8(_s1, vget_high_s8(_pA), vget_high_s8(_pB13));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
#endif // __ARM_FEATURE_DOTPROD

                pA += 16;
                pB += 8;
            }
            // 2 depth steps per iteration (8 bytes of A, 4 bytes of B)
            for (; kk + 1 < max_kk; kk += 2)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vld1_s8(pA);
                // NOTE(review): vld1_s8 reads 8 bytes of B but only the first 4 are
                // used (s16 lanes 0 and 1; pB advances by 4 below); presumably the
                // packed B buffer is padded enough for this over-read - confirm.
                int8x8_t _pB = vld1_s8(pB);
                // aabbccdd
                // 0011....
                int16x8_t _s0 = vmull_s8(_pA, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 0)));
                int16x8_t _s1 = vmull_s8(_pA, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pB), 1)));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB0 = vreinterpret_s8_s32(vld1_dup_s32((const int*)pB));

                // aabbccdd

                // 00110011
                // 11001100
                int8x8_t _pB1 = vext_s8(_pB0, _pB0, 2);

                int16x8_t _s0 = vmull_s8(_pA, _pB0);
                int16x8_t _s1 = vmull_s8(_pA, _pB1);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
#endif // __ARM_FEATURE_DOTPROD

                pA += 8;
                pB += 4;
            }
            // last single depth step (4 bytes of A, 2 bytes of B)
            for (; kk < max_kk; kk += 1)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vreinterpret_s8_s32(vld1_dup_s32((const int*)pA));
                int8x8_t _pB = vreinterpret_s8_s16(vld1_dup_s16((const short*)pB));

                // abcdabcd

                // 01010101 -> 00001111
                _pB = vuzp_s8(_pB, vext_s8(_pB, _pB, 1)).val[0];

                // low half = products with B byte 0, high half = products with B byte 1
                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA = vreinterpret_s8_s32(vld1_dup_s32((const int*)pA));
                int8x8_t _pB0 = vreinterpret_s8_s16(vld1_dup_s16((const short*)pB));

                // abcd abcd

                // 0101 0101 -> 0101 1010

                int8x8_t _pB1 = vext_s8(_pB0, _pB0, 1);
                int8x8_t _pB = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_pB0), vreinterpret_s32_s8(_pB1)).val[0]);

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));
#endif // __ARM_FEATURE_DOTPROD

                pA += 4;
                pB += 2;
            }

            // write the 8 updated partial sums back and move to the next tile slot
            vst1q_s32(outptr, _sum0);
            vst1q_s32(outptr + 4, _sum1);

            outptr += 8;
        }
        // Last single jj: each iteration updates 4 int32 partial sums (one
        // int32x4 accumulator), restarting A from pAT and consuming 4 bytes of A
        // and 1 byte of B per depth step.
        for (; jj < max_jj; jj += 1)
        {
            const signed char* pA = pAT;

            int32x4_t _sum0;

            // k == 0: nothing to resume, start from zero; otherwise continue
            // from the partial sums already stored at outptr
            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
            }

            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_MATMUL_INT8
                // each accumulator holds two half-sums (4 depth steps each) per
                // 8-byte group of A; they are pair-added after the loop
                int32x4_t _sum01 = vdupq_n_s32(0);
                int32x4_t _sum23 = vdupq_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                // 8 depth steps per iteration (32 bytes of A, 8 bytes of B)
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA0 = vld1q_s8(pA);
                    int8x16_t _pA1 = vld1q_s8(pA + 16);
                    int8x8_t _pB = vld1_s8(pB);

#if __ARM_FEATURE_MATMUL_INT8
                    // aaaaaaaa bbbbbbbb cccccccc dddddddd

                    // 00000000

                    // duplicate the 8 bytes of B so they line up with both
                    // 8-byte groups in each A register
                    int8x16_t _pBB = vcombine_s8(_pB, _pB);

                    _sum01 = vdotq_s32(_sum01, _pA0, _pBB);
                    _sum23 = vdotq_s32(_sum23, _pA1, _pBB);
#else  // __ARM_FEATURE_MATMUL_INT8
                    _sum0 = vdotq_lane_s32(_sum0, _pA0, _pB, 0);
                    _sum0 = vdotq_lane_s32(_sum0, _pA1, _pB, 1);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 32;
                    pB += 8;
                }
#if __ARM_FEATURE_MATMUL_INT8
                // pairwise add merges the two half-sums of each group: a b c d
                _sum0 = vaddq_s32(_sum0, vpaddq_s32(_sum01, _sum23));
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#endif // __ARM_FEATURE_DOTPROD
            // 4 depth steps per iteration (16 bytes of A, 4 bytes of B)
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                // NOTE(review): vld1_s8 reads 8 bytes of B but only lane 0 (4 bytes)
                // is used and pB advances by 4 below; presumably the packed B buffer
                // is padded enough for this over-read - confirm.
                int8x8_t _pB = vld1_s8(pB);

                _sum0 = vdotq_lane_s32(_sum0, _pA, _pB, 0);
#else  // __ARM_FEATURE_DOTPROD
                int8x16_t _pA = vld1q_s8(pA);
                // B bytes 0-1 pair with the low half of _pA, bytes 2-3 with the high half
                int8x8_t _pB0 = vreinterpret_s8_s16(vld1_dup_s16((const short*)pB));
                int8x8_t _pB1 = vreinterpret_s8_s16(vld1_dup_s16((const short*)(pB + 2)));

                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA), _pB0);
                _s0 = vmlal_s8(_s0, vget_high_s8(_pA), _pB1);
                _sum0 = vpadalq_s16(_sum0, _s0);
#endif // __ARM_FEATURE_DOTPROD

                pA += 16;
                pB += 4;
            }
            // 2 depth steps per iteration (8 bytes of A, 2 bytes of B)
            for (; kk + 1 < max_kk; kk += 2)
            {
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB = vreinterpret_s8_s16(vld1_dup_s16((const short*)pB));

                // adjacent int16 products belong to the same output and are pair-added
                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vpadalq_s16(_sum0, _s0);

                pA += 8;
                pB += 2;
            }
            // last single depth step (4 bytes of A, 1 byte of B)
            for (; kk < max_kk; kk += 1)
            {
                int8x8_t _pA = vreinterpret_s8_s32(vld1_dup_s32((const int*)pA));
                int8x8_t _pB = vld1_dup_s8(pB);

                // _pA holds the 4 bytes twice, so only the low 4 products are needed
                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));

                pA += 4;
                pB += 1;
            }

            // write the 4 updated partial sums back and move to the next tile slot
            vst1q_s32(outptr, _sum0);

            outptr += 4;
        }

        pAT += max_kk * 4;
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        const signed char* pB = pBT;

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0;
            int32x4_t _sum1;
            int32x4_t _sum2;
            int32x4_t _sum3;

            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
                _sum2 = vdupq_n_s32(0);
                _sum3 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
                _sum2 = vld1q_s32(outptr + 8);
                _sum3 = vld1q_s32(outptr + 12);
            }

            const signed char* pA = pAT;
            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_MATMUL_INT8
                int32x4_t _sum01 = vdupq_n_s32(0);
                int32x4_t _sum23 = vdupq_n_s32(0);
                int32x4_t _sum45 = vdupq_n_s32(0);
                int32x4_t _sum67 = vdupq_n_s32(0);
#else  // __ARM_FEATURE_MATMUL_INT8
                int32x2_t _sum00 = vdup_n_s32(0);
                int32x2_t _sum01 = vdup_n_s32(0);
                int32x2_t _sum10 = vdup_n_s32(0);
                int32x2_t _sum11 = vdup_n_s32(0);
                int32x2_t _sum20 = vdup_n_s32(0);
                int32x2_t _sum21 = vdup_n_s32(0);
                int32x2_t _sum30 = vdup_n_s32(0);
                int32x2_t _sum31 = vdup_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA = vld1q_s8(pA);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);
                    int8x16_t _pB2 = vld1q_s8(pB + 32);
                    int8x16_t _pB3 = vld1q_s8(pB + 48);

#if __ARM_FEATURE_MATMUL_INT8
                    _sum01 = vmmlaq_s32(_sum01, _pA, _pB0);
                    _sum23 = vmmlaq_s32(_sum23, _pA, _pB1);
                    _sum45 = vmmlaq_s32(_sum45, _pA, _pB2);
                    _sum67 = vmmlaq_s32(_sum67, _pA, _pB3);
#else  // __ARM_FEATURE_MATMUL_INT8
                    _sum00 = vdot_laneq_s32(_sum00, vget_low_s8(_pA), _pB0, 0);
                    _sum01 = vdot_laneq_s32(_sum01, vget_low_s8(_pA), _pB0, 1);
                    _sum10 = vdot_laneq_s32(_sum10, vget_low_s8(_pA), _pB0, 2);
                    _sum11 = vdot_laneq_s32(_sum11, vget_low_s8(_pA), _pB0, 3);
                    _sum20 = vdot_laneq_s32(_sum20, vget_low_s8(_pA), _pB1, 0);
                    _sum21 = vdot_laneq_s32(_sum21, vget_low_s8(_pA), _pB1, 1);
                    _sum30 = vdot_laneq_s32(_sum30, vget_low_s8(_pA), _pB1, 2);
                    _sum31 = vdot_laneq_s32(_sum31, vget_low_s8(_pA), _pB1, 3);
                    _sum00 = vdot_laneq_s32(_sum00, vget_high_s8(_pA), _pB2, 0);
                    _sum01 = vdot_laneq_s32(_sum01, vget_high_s8(_pA), _pB2, 1);
                    _sum10 = vdot_laneq_s32(_sum10, vget_high_s8(_pA), _pB2, 2);
                    _sum11 = vdot_laneq_s32(_sum11, vget_high_s8(_pA), _pB2, 3);
                    _sum20 = vdot_laneq_s32(_sum20, vget_high_s8(_pA), _pB3, 0);
                    _sum21 = vdot_laneq_s32(_sum21, vget_high_s8(_pA), _pB3, 1);
                    _sum30 = vdot_laneq_s32(_sum30, vget_high_s8(_pA), _pB3, 2);
                    _sum31 = vdot_laneq_s32(_sum31, vget_high_s8(_pA), _pB3, 3);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 16;
                    pB += 64;
                }
#if __ARM_FEATURE_MATMUL_INT8
                _sum0 = vaddq_s32(_sum0, vcombine_s32(vget_low_s32(_sum01), vget_low_s32(_sum23)));
                _sum1 = vaddq_s32(_sum1, vcombine_s32(vget_low_s32(_sum45), vget_low_s32(_sum67)));
                _sum2 = vaddq_s32(_sum2, vcombine_s32(vget_high_s32(_sum01), vget_high_s32(_sum23)));
                _sum3 = vaddq_s32(_sum3, vcombine_s32(vget_high_s32(_sum45), vget_high_s32(_sum67)));
#else  // __ARM_FEATURE_MATMUL_INT8
                int32x2x2_t _sum0x = vzip_s32(_sum00, _sum01);
                int32x2x2_t _sum1x = vzip_s32(_sum10, _sum11);
                int32x2x2_t _sum2x = vzip_s32(_sum20, _sum21);
                int32x2x2_t _sum3x = vzip_s32(_sum30, _sum31);
                _sum0 = vaddq_s32(_sum0, vcombine_s32(_sum0x.val[0], _sum1x.val[0]));
                _sum1 = vaddq_s32(_sum1, vcombine_s32(_sum2x.val[0], _sum3x.val[0]));
                _sum2 = vaddq_s32(_sum2, vcombine_s32(_sum0x.val[1], _sum1x.val[1]));
                _sum3 = vaddq_s32(_sum3, vcombine_s32(_sum2x.val[1], _sum3x.val[1]));
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#endif // __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_DOTPROD
                int32x2_t _sum00 = vdup_n_s32(0);
                int32x2_t _sum01 = vdup_n_s32(0);
                int32x2_t _sum10 = vdup_n_s32(0);
                int32x2_t _sum11 = vdup_n_s32(0);
                int32x2_t _sum20 = vdup_n_s32(0);
                int32x2_t _sum21 = vdup_n_s32(0);
                int32x2_t _sum30 = vdup_n_s32(0);
                int32x2_t _sum31 = vdup_n_s32(0);
#endif // __ARM_FEATURE_DOTPROD
                for (; kk + 3 < max_kk; kk += 4)
                {
                    int8x8_t _pA = vld1_s8(pA);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);

#if __ARM_FEATURE_DOTPROD
                    _sum00 = vdot_laneq_s32(_sum00, _pA, _pB0, 0);
                    _sum01 = vdot_laneq_s32(_sum01, _pA, _pB0, 1);
                    _sum10 = vdot_laneq_s32(_sum10, _pA, _pB0, 2);
                    _sum11 = vdot_laneq_s32(_sum11, _pA, _pB0, 3);
                    _sum20 = vdot_laneq_s32(_sum20, _pA, _pB1, 0);
                    _sum21 = vdot_laneq_s32(_sum21, _pA, _pB1, 1);
                    _sum30 = vdot_laneq_s32(_sum30, _pA, _pB1, 2);
                    _sum31 = vdot_laneq_s32(_sum31, _pA, _pB1, 3);
#else  // __ARM_FEATURE_DOTPROD
                    int8x8_t _pA0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 0));
                    int8x8_t _pA1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 1));
                    int8x8_t _pA2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 2));
                    int8x8_t _pA3 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 3));

                    int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB0));
                    int16x8_t _s1 = vmull_s8(_pA0, vget_high_s8(_pB0));
                    int16x8_t _s2 = vmull_s8(_pA1, vget_low_s8(_pB0));
                    int16x8_t _s3 = vmull_s8(_pA1, vget_high_s8(_pB0));
                    _s0 = vmlal_s8(_s0, _pA2, vget_low_s8(_pB1));
                    _s1 = vmlal_s8(_s1, _pA2, vget_high_s8(_pB1));
                    _s2 = vmlal_s8(_s2, _pA3, vget_low_s8(_pB1));
                    _s3 = vmlal_s8(_s3, _pA3, vget_high_s8(_pB1));
                    _sum0 = vpadalq_s16(_sum0, _s0);
                    _sum1 = vpadalq_s16(_sum1, _s1);
                    _sum2 = vpadalq_s16(_sum2, _s2);
                    _sum3 = vpadalq_s16(_sum3, _s3);
#endif // __ARM_FEATURE_DOTPROD

                    pA += 8;
                    pB += 32;
                }
#if __ARM_FEATURE_DOTPROD
                int32x2x2_t _sum0x = vzip_s32(_sum00, _sum01);
                int32x2x2_t _sum1x = vzip_s32(_sum10, _sum11);
                int32x2x2_t _sum2x = vzip_s32(_sum20, _sum21);
                int32x2x2_t _sum3x = vzip_s32(_sum30, _sum31);
                _sum0 = vaddq_s32(_sum0, vcombine_s32(_sum0x.val[0], _sum1x.val[0]));
                _sum1 = vaddq_s32(_sum1, vcombine_s32(_sum2x.val[0], _sum3x.val[0]));
                _sum2 = vaddq_s32(_sum2, vcombine_s32(_sum0x.val[1], _sum1x.val[1]));
                _sum3 = vaddq_s32(_sum3, vcombine_s32(_sum2x.val[1], _sum3x.val[1]));
#endif // __ARM_FEATURE_DOTPROD
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                int16x4_t _pA = vreinterpret_s16_s32(vld1_dup_s32((const int*)pA));
                int8x16_t _pB = vld1q_s8(pB);

                int16x4x2_t _pA01 = vuzp_s16(_pA, _pA);
                int8x8_t _pA0 = vreinterpret_s8_s16(_pA01.val[0]);
                int8x8_t _pA1 = vreinterpret_s8_s16(_pA01.val[1]);

                int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB));
                int16x8_t _s1 = vmull_s8(_pA0, vget_high_s8(_pB));
                int16x8_t _s2 = vmull_s8(_pA1, vget_low_s8(_pB));
                int16x8_t _s3 = vmull_s8(_pA1, vget_high_s8(_pB));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);

                pA += 4;
                pB += 16;
            }
            for (; kk < max_kk; kk += 1)
            {
                int8x8_t _pA = vreinterpret_s8_s16(vld1_dup_s16((const short*)pA));
                int8x8_t _pB = vld1_s8(pB);

                int8x8x2_t _pA01 = vuzp_s8(_pA, _pA);

                int16x8_t _s0 = vmull_s8(_pA01.val[0], _pB);
                int16x8_t _s1 = vmull_s8(_pA01.val[1], _pB);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));
                _sum2 = vaddw_s16(_sum2, vget_low_s16(_s1));
                _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1));

                pA += 2;
                pB += 8;
            }

            vst1q_s32(outptr, _sum0);
            vst1q_s32(outptr + 4, _sum1);
            vst1q_s32(outptr + 8, _sum2);
            vst1q_s32(outptr + 12, _sum3);

            outptr += 16;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0;
            int32x4_t _sum1;

            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
            }

            const signed char* pA = pAT;
            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_MATMUL_INT8
                int32x4_t _sum01 = vdupq_n_s32(0);
                int32x4_t _sum23 = vdupq_n_s32(0);
#else  // __ARM_FEATURE_MATMUL_INT8
                int32x2_t _sum00 = vdup_n_s32(0);
                int32x2_t _sum01 = vdup_n_s32(0);
                int32x2_t _sum10 = vdup_n_s32(0);
                int32x2_t _sum11 = vdup_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA = vld1q_s8(pA);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);

#if __ARM_FEATURE_MATMUL_INT8
                    _sum01 = vmmlaq_s32(_sum01, _pA, _pB0);
                    _sum23 = vmmlaq_s32(_sum23, _pA, _pB1);
#else  // __ARM_FEATURE_MATMUL_INT8
                    _sum00 = vdot_laneq_s32(_sum00, vget_low_s8(_pA), _pB0, 0);
                    _sum01 = vdot_laneq_s32(_sum01, vget_low_s8(_pA), _pB0, 1);
                    _sum10 = vdot_laneq_s32(_sum10, vget_low_s8(_pA), _pB0, 2);
                    _sum11 = vdot_laneq_s32(_sum11, vget_low_s8(_pA), _pB0, 3);
                    _sum00 = vdot_laneq_s32(_sum00, vget_high_s8(_pA), _pB1, 0);
                    _sum01 = vdot_laneq_s32(_sum01, vget_high_s8(_pA), _pB1, 1);
                    _sum10 = vdot_laneq_s32(_sum10, vget_high_s8(_pA), _pB1, 2);
                    _sum11 = vdot_laneq_s32(_sum11, vget_high_s8(_pA), _pB1, 3);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 16;
                    pB += 32;
                }
#if __ARM_FEATURE_MATMUL_INT8
                _sum0 = vaddq_s32(_sum0, vcombine_s32(vget_low_s32(_sum01), vget_low_s32(_sum23)));
                _sum1 = vaddq_s32(_sum1, vcombine_s32(vget_high_s32(_sum01), vget_high_s32(_sum23)));
#else  // __ARM_FEATURE_MATMUL_INT8
                int32x2x2_t _sum0x = vzip_s32(_sum00, _sum01);
                int32x2x2_t _sum1x = vzip_s32(_sum10, _sum11);
                _sum0 = vaddq_s32(_sum0, vcombine_s32(_sum0x.val[0], _sum1x.val[0]));
                _sum1 = vaddq_s32(_sum1, vcombine_s32(_sum0x.val[1], _sum1x.val[1]));
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#endif // __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_DOTPROD
                int32x2_t _sum00 = vdup_n_s32(0);
                int32x2_t _sum01 = vdup_n_s32(0);
                int32x2_t _sum10 = vdup_n_s32(0);
                int32x2_t _sum11 = vdup_n_s32(0);
#endif // __ARM_FEATURE_DOTPROD
                for (; kk + 3 < max_kk; kk += 4)
                {
                    int8x8_t _pA = vld1_s8(pA);
                    int8x16_t _pB = vld1q_s8(pB);

#if __ARM_FEATURE_DOTPROD
                    _sum00 = vdot_laneq_s32(_sum00, _pA, _pB, 0);
                    _sum01 = vdot_laneq_s32(_sum01, _pA, _pB, 1);
                    _sum10 = vdot_laneq_s32(_sum10, _pA, _pB, 2);
                    _sum11 = vdot_laneq_s32(_sum11, _pA, _pB, 3);
#else  // __ARM_FEATURE_DOTPROD
                    int8x8_t _pA0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 0));
                    int8x8_t _pA1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 1));
                    int8x8_t _pA2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 2));
                    int8x8_t _pA3 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 3));

                    int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB));
                    int16x8_t _s1 = vmull_s8(_pA1, vget_low_s8(_pB));
                    _s0 = vmlal_s8(_s0, _pA2, vget_high_s8(_pB));
                    _s1 = vmlal_s8(_s1, _pA3, vget_high_s8(_pB));
                    _sum0 = vpadalq_s16(_sum0, _s0);
                    _sum1 = vpadalq_s16(_sum1, _s1);
#endif // __ARM_FEATURE_DOTPROD

                    pA += 8;
                    pB += 16;
                }
#if __ARM_FEATURE_DOTPROD
                int32x2x2_t _sum0x = vzip_s32(_sum00, _sum01);
                int32x2x2_t _sum1x = vzip_s32(_sum10, _sum11);
                _sum0 = vaddq_s32(_sum0, vcombine_s32(_sum0x.val[0], _sum1x.val[0]));
                _sum1 = vaddq_s32(_sum1, vcombine_s32(_sum0x.val[1], _sum1x.val[1]));
#endif // __ARM_FEATURE_DOTPROD
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                int16x4_t _pA = vreinterpret_s16_s32(vdup_lane_s32(vreinterpret_s32_s8(vld1_s8(pA)), 0));
                int8x8_t _pB = vld1_s8(pB);

                int16x4x2_t _pA01 = vuzp_s16(_pA, _pA);
                int8x8_t _pA0 = vreinterpret_s8_s16(_pA01.val[0]);
                int8x8_t _pA1 = vreinterpret_s8_s16(_pA01.val[1]);

                int16x8_t _s0 = vmull_s8(_pA0, _pB);
                int16x8_t _s1 = vmull_s8(_pA1, _pB);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);

                pA += 4;
                pB += 8;
            }
            for (; kk < max_kk; kk += 1)
            {
                int8x8_t _pA = vreinterpret_s8_s16(vld1_dup_s16((const short*)pA));
                int8x8_t _pB = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(vld1_s8(pB)), 0));

                _pA = vzip_s8(_pA, _pA).val[0];
                _pA = vreinterpret_s8_s16(vzip_s16(vreinterpret_s16_s8(_pA), vreinterpret_s16_s8(_pA)).val[0]);

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));

                pA += 2;
                pB += 4;
            }

            vst1q_s32(outptr, _sum0);
            vst1q_s32(outptr + 4, _sum1);

            outptr += 8;
        }
#endif // __ARM_NEON
        for (; jj + 1 < max_jj; jj += 2)
        {
#if __ARM_NEON
            int32x4_t _sum;

            if (k == 0)
            {
                _sum = vdupq_n_s32(0);
            }
            else
            {
                _sum = vld1q_s32(outptr);
            }

            const signed char* pA = pAT;
            int kk = 0;

#if __ARM_FEATURE_DOTPROD
            for (; kk + 7 < max_kk; kk += 8)
            {
                int8x16_t _pA = vld1q_s8(pA);
                int8x16_t _pB = vld1q_s8(pB);

#if __ARM_FEATURE_MATMUL_INT8
                _sum = vmmlaq_s32(_sum, _pA, _pB);
#else  // __ARM_FEATURE_MATMUL_INT8
                int32x4x2_t _pAA = vzipq_s32(vreinterpretq_s32_s8(_pA), vreinterpretq_s32_s8(_pA));
                int8x16_t _pA01 = vreinterpretq_s8_s32(_pAA.val[0]);
                int8x16_t _pA23 = vreinterpretq_s8_s32(_pAA.val[1]);
                int8x16_t _pB01 = vcombine_s8(vget_low_s8(_pB), vget_low_s8(_pB));
                int8x16_t _pB23 = vcombine_s8(vget_high_s8(_pB), vget_high_s8(_pB));

                _sum = vdotq_s32(_sum, _pA01, _pB01);
                _sum = vdotq_s32(_sum, _pA23, _pB23);
#endif // __ARM_FEATURE_MATMUL_INT8

                pA += 16;
                pB += 16;
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; kk + 3 < max_kk; kk += 4)
            {
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

#if __ARM_FEATURE_DOTPROD
                int32x2x2_t _pAA = vzip_s32(vreinterpret_s32_s8(_pA), vreinterpret_s32_s8(_pA));
                int8x16_t _pA01 = vreinterpretq_s8_s32(vcombine_s32(_pAA.val[0], _pAA.val[1]));

                int8x16_t _pB01 = vcombine_s8(_pB, _pB);

                _sum = vdotq_s32(_sum, _pA01, _pB01);
#else  // __ARM_FEATURE_DOTPROD
                int16x4x2_t _pA01 = vzip_s16(vreinterpret_s16_s8(_pA), vreinterpret_s16_s8(_pA));
                int32x2x2_t _pB01 = vzip_s32(vreinterpret_s32_s8(_pB), vreinterpret_s32_s8(_pB));

                int16x8_t _s0 = vmull_s8(vreinterpret_s8_s16(_pA01.val[0]), vreinterpret_s8_s32(_pB01.val[0]));
                _s0 = vmlal_s8(_s0, vreinterpret_s8_s16(_pA01.val[1]), vreinterpret_s8_s32(_pB01.val[1]));
                _sum = vpadalq_s16(_sum, _s0);
#endif // __ARM_FEATURE_DOTPROD

                pA += 8;
                pB += 8;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

                _pA = vreinterpret_s8_s16(vzip_s16(vreinterpret_s16_s8(_pA), vreinterpret_s16_s8(_pA)).val[0]);
                _pB = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_pB), vreinterpret_s32_s8(_pB)).val[0]);

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum = vpadalq_s16(_sum, _s0);

                // A0 A1 A2 A3
                // B0 B1 B2 B3

                // A0 A1 A0 A1 A2 A3 A2 A3
                // B0 B1 B2 B3 B0 B1 B2 B3

                pA += 4;
                pB += 4;
            }
            for (; kk < max_kk; kk += 1)
            {
                int8x8_t _pA = vreinterpret_s8_s16(vld1_dup_s16((const short*)pA));
                int8x8_t _pB = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vld1_s8(pB)), 0));

                _pA = vzip_s8(_pA, _pA).val[0];

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum = vaddw_s16(_sum, vget_low_s16(_s0));

                // A0 A1 A0 A1
                // B0 B1 B0 B1

                // A0 A0 A1 A1

                pA += 2;
                pB += 2;
            }

            vst1q_s32(outptr, _sum);

            outptr += 4;
#else // __ARM_NEON
            int sum00;
            int sum10;
            int sum01;
            int sum11;

            if (k == 0)
            {
                sum00 = 0;
                sum10 = 0;
                sum01 = 0;
                sum11 = 0;
            }
            else
            {
                sum00 = outptr[0];
                sum10 = outptr[1];
                sum01 = outptr[2];
                sum11 = outptr[3];
            }

            const signed char* pA = pAT;
            int kk = 0;
#if __ARM_FEATURE_SIMD32 && NCNN_GNU_INLINE_ASM
            for (; kk + 1 < max_kk; kk += 2)
            {
                // -fomit-frame-pointer, which is implied by the optimization flags, frees up one more register;
                // that lets us stay away from "error: 'asm' operand has impossible constraints"   --- nihui
#if __OPTIMIZE__
                asm volatile(
                    "ldr    r2, [%0], #4    \n" // int8x4_t _pA = *((int8x4_t*)pA); pA += 4;
                    "ldr    r4, [%1], #4    \n" // int8x4_t _pB = *((int8x4_t*)pB); pB += 4;
                    "ror    r3, r2, #8      \n" // int8x4_t _pA_r8 = __ror(_pA, 8);
                    "ror    r5, r4, #8      \n" // int8x4_t _pB_r8 = __ror(_pB, 8);
                    "sxtb16 r2, r2          \n" // int16x2_t _pA0 = __sxtb16(_pA);
                    "sxtb16 r4, r4          \n" // int16x2_t _pB0 = __sxtb16(_pB);
                    "sxtb16 r3, r3          \n" // int16x2_t _pA1 = __sxtb16(_pA_r8);
                    "sxtb16 r5, r5          \n" // int16x2_t _pB1 = __sxtb16(_pB_r8);
                    "smlad  %2, r2, r4, %2  \n" // sum00 = __smlad(_pA0, _pB0, sum00);
                    "smlad  %3, r3, r4, %3  \n" // sum10 = __smlad(_pA1, _pB0, sum10);
                    "smlad  %4, r2, r5, %4  \n" // sum01 = __smlad(_pA0, _pB1, sum01);
                    "smlad  %5, r3, r5, %5  \n" // sum11 = __smlad(_pA1, _pB1, sum11);
                    : "=r"(pA),
                    "=r"(pB),
                    "=r"(sum00),
                    "=r"(sum10),
                    "=r"(sum01),
                    "=r"(sum11)
                    : "0"(pA),
                    "1"(pB),
                    "2"(sum00),
                    "3"(sum10),
                    "4"(sum01),
                    "5"(sum11)
                    : "memory", "r2", "r3", "r4", "r5");
#else
                int _pA0 = *((int*)pA);
                int _pB0 = *((int*)pB);
                int _pA1;
                int _pB1;
                asm volatile("ror %0, %1, #8"
                             : "=r"(_pA1)
                             : "r"(_pA0)
                             :);
                asm volatile("ror %0, %1, #8"
                             : "=r"(_pB1)
                             : "r"(_pB0)
                             :);
                asm volatile("sxtb16 %0, %0"
                             : "=r"(_pA0)
                             : "0"(_pA0)
                             :);
                asm volatile("sxtb16 %0, %0"
                             : "=r"(_pA1)
                             : "0"(_pA1)
                             :);
                asm volatile("sxtb16 %0, %0"
                             : "=r"(_pB0)
                             : "0"(_pB0)
                             :);
                asm volatile("sxtb16 %0, %0"
                             : "=r"(_pB1)
                             : "0"(_pB1)
                             :);
                asm volatile("smlad %0, %2, %3, %0"
                             : "=r"(sum00)
                             : "0"(sum00), "r"(_pA0), "r"(_pB0)
                             :);
                asm volatile("smlad %0, %2, %3, %0"
                             : "=r"(sum10)
                             : "0"(sum10), "r"(_pA1), "r"(_pB0)
                             :);
                asm volatile("smlad %0, %2, %3, %0"
                             : "=r"(sum01)
                             : "0"(sum01), "r"(_pA0), "r"(_pB1)
                             :);
                asm volatile("smlad %0, %2, %3, %0"
                             : "=r"(sum11)
                             : "0"(sum11), "r"(_pA1), "r"(_pB1)
                             :);
                pA += 4;
                pB += 4;
#endif
            }
#endif // __ARM_FEATURE_SIMD32 && NCNN_GNU_INLINE_ASM
            for (; kk < max_kk; kk += 1)
            {
                sum00 += pA[0] * pB[0];
                sum10 += pA[1] * pB[0];
                sum01 += pA[0] * pB[1];
                sum11 += pA[1] * pB[1];

                pA += 2;
                pB += 2;
            }

            outptr[0] = sum00;
            outptr[1] = sum10;
            outptr[2] = sum01;
            outptr[3] = sum11;

            outptr += 4;
#endif // __ARM_NEON
        }
        for (; jj < max_jj; jj += 1)
        {
#if __ARM_NEON
            int32x2_t _sum;

            if (k == 0)
            {
                _sum = vdup_n_s32(0);
            }
            else
            {
                _sum = vld1_s32(outptr);
            }
#else  // __ARM_NEON
            int sum0;
            int sum1;

            if (k == 0)
            {
                sum0 = 0;
                sum1 = 0;
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }
#endif // __ARM_NEON

            const signed char* pA = pAT;
            int kk = 0;
#if __ARM_NEON
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            {
                int32x4_t _sum0 = vdupq_n_s32(0);
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x16_t _pA = vld1q_s8(pA);
                    int8x8_t _pB = vld1_s8(pB);

                    int8x16_t _pBB = vcombine_s8(_pB, _pB);

                    _sum0 = vdotq_s32(_sum0, _pA, _pBB);

                    pA += 16;
                    pB += 8;
                }
                int32x2_t _ss = vpadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0));
                _sum = vadd_s32(_sum, _ss);
            }
#else  // __ARM_FEATURE_MATMUL_INT8
            for (; kk + 7 < max_kk; kk += 8)
            {
                int8x16_t _pA = vld1q_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

                _sum = vdot_lane_s32(_sum, vget_low_s8(_pA), _pB, 0);
                _sum = vdot_lane_s32(_sum, vget_high_s8(_pA), _pB, 1);

                pA += 16;
                pB += 8;
            }
#endif // __ARM_FEATURE_MATMUL_INT8
            for (; kk + 3 < max_kk; kk += 4)
            {
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB = vreinterpret_s8_s32(vld1_dup_s32((const int*)pB));

                _sum = vdot_s32(_sum, _pA, _pB);

                pA += 8;
                pB += 4;
            }
#else  // __ARM_FEATURE_DOTPROD
            {
                int32x4_t _sum0 = vdupq_n_s32(0);
                for (; kk + 3 < max_kk; kk += 4)
                {
                    int8x8_t _pA = vld1_s8(pA);
                    int8x8_t _pB = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(vld1_s8(pB)), 0));

                    _pB = vreinterpret_s8_s16(vzip_s16(vreinterpret_s16_s8(_pB), vreinterpret_s16_s8(_pB)).val[0]);

                    int16x8_t _s0 = vmull_s8(_pA, _pB);
                    _sum0 = vpadalq_s16(_sum0, _s0);

                    pA += 8;
                    pB += 4;
                }
                int32x2_t _ss = vadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0));
                _sum = vadd_s32(_sum, _ss);
            }
#endif // __ARM_FEATURE_DOTPROD
            int sum0 = vget_lane_s32(_sum, 0);
            int sum1 = vget_lane_s32(_sum, 1);
            for (; kk + 1 < max_kk; kk += 2)
            {
                sum0 += pA[0] * pB[0];
                sum0 += pA[1] * pB[1];
                sum1 += pA[2] * pB[0];
                sum1 += pA[3] * pB[1];
                pA += 4;
                pB += 2;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk += 1)
            {
                sum0 += pA[0] * pB[0];
                sum1 += pA[1] * pB[0];
                pA += 2;
                pB += 1;
            }

            outptr[0] = sum0;
            outptr[1] = sum1;

            outptr += 2;
        }

        pAT += max_kk * 2;
    }
    for (; ii < max_ii; ii += 1)
    {
        const signed char* pB = pBT;

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0;
            int32x4_t _sum1;

            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
                _sum1 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
                _sum1 = vld1q_s32(outptr + 4);
            }

            const signed char* pA = pAT;
            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_MATMUL_INT8
                int32x4_t _sum00 = vdupq_n_s32(0);
                int32x4_t _sum01 = vdupq_n_s32(0);
                int32x4_t _sum10 = vdupq_n_s32(0);
                int32x4_t _sum11 = vdupq_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x8_t _pA = vld1_s8(pA);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);
                    int8x16_t _pB2 = vld1q_s8(pB + 32);
                    int8x16_t _pB3 = vld1q_s8(pB + 48);

#if __ARM_FEATURE_MATMUL_INT8
                    int8x16_t _pAA = vcombine_s8(_pA, _pA);
                    _sum00 = vdotq_s32(_sum00, _pAA, _pB0);
                    _sum01 = vdotq_s32(_sum01, _pAA, _pB1);
                    _sum10 = vdotq_s32(_sum10, _pAA, _pB2);
                    _sum11 = vdotq_s32(_sum11, _pAA, _pB3);
#else  // __ARM_FEATURE_MATMUL_INT8
                    _sum0 = vdotq_lane_s32(_sum0, _pB0, _pA, 0);
                    _sum1 = vdotq_lane_s32(_sum1, _pB1, _pA, 0);
                    _sum0 = vdotq_lane_s32(_sum0, _pB2, _pA, 1);
                    _sum1 = vdotq_lane_s32(_sum1, _pB3, _pA, 1);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 8;
                    pB += 64;
                }
#if __ARM_FEATURE_MATMUL_INT8
                _sum0 = vaddq_s32(_sum0, vpaddq_s32(_sum00, _sum01));
                _sum1 = vaddq_s32(_sum1, vpaddq_s32(_sum10, _sum11));
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#else  // __ARM_FEATURE_DOTPROD
            {
                int32x4_t _sum2 = vdupq_n_s32(0);
                int32x4_t _sum3 = vdupq_n_s32(0);
                int32x4_t _sum4 = vdupq_n_s32(0);
                int32x4_t _sum5 = vdupq_n_s32(0);
                int32x4_t _sum6 = vdupq_n_s32(0);
                int32x4_t _sum7 = vdupq_n_s32(0);
                for (; kk + 15 < max_kk; kk += 16)
                {
                    // TODO
                    // __builtin_prefetch(pA + 16);
                    // __builtin_prefetch(pB + 128);
                    int8x16_t _pA = vld1q_s8(pA);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);
                    int8x16_t _pB2 = vld1q_s8(pB + 32);
                    int8x16_t _pB3 = vld1q_s8(pB + 48);
                    int8x16_t _pB4 = vld1q_s8(pB + 64);
                    int8x16_t _pB5 = vld1q_s8(pB + 80);
                    int8x16_t _pB6 = vld1q_s8(pB + 96);
                    int8x16_t _pB7 = vld1q_s8(pB + 112);

                    int8x8_t _pA0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pA)), 0));
                    int8x8_t _pA1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pA)), 1));
                    int8x8_t _pA2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pA)), 2));
                    int8x8_t _pA3 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pA)), 3));
                    int8x8_t _pA4 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pA)), 0));
                    int8x8_t _pA5 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pA)), 1));
                    int8x8_t _pA6 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pA)), 2));
                    int8x8_t _pA7 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pA)), 3));
                    int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB0));
                    int16x8_t _s1 = vmull_s8(_pA0, vget_high_s8(_pB0));
                    int16x8_t _s2 = vmull_s8(_pA2, vget_low_s8(_pB2));
                    int16x8_t _s3 = vmull_s8(_pA2, vget_high_s8(_pB2));
                    int16x8_t _s4 = vmull_s8(_pA4, vget_low_s8(_pB4));
                    int16x8_t _s5 = vmull_s8(_pA4, vget_high_s8(_pB4));
                    int16x8_t _s6 = vmull_s8(_pA6, vget_low_s8(_pB6));
                    int16x8_t _s7 = vmull_s8(_pA6, vget_high_s8(_pB6));
                    _s0 = vmlal_s8(_s0, _pA1, vget_low_s8(_pB1));
                    _s1 = vmlal_s8(_s1, _pA1, vget_high_s8(_pB1));
                    _s2 = vmlal_s8(_s2, _pA3, vget_low_s8(_pB3));
                    _s3 = vmlal_s8(_s3, _pA3, vget_high_s8(_pB3));
                    _s4 = vmlal_s8(_s4, _pA5, vget_low_s8(_pB5));
                    _s5 = vmlal_s8(_s5, _pA5, vget_high_s8(_pB5));
                    _s6 = vmlal_s8(_s6, _pA7, vget_low_s8(_pB7));
                    _s7 = vmlal_s8(_s7, _pA7, vget_high_s8(_pB7));
                    _sum0 = vpadalq_s16(_sum0, _s0);
                    _sum1 = vpadalq_s16(_sum1, _s1);
                    _sum2 = vpadalq_s16(_sum2, _s2);
                    _sum3 = vpadalq_s16(_sum3, _s3);
                    _sum4 = vpadalq_s16(_sum4, _s4);
                    _sum5 = vpadalq_s16(_sum5, _s5);
                    _sum6 = vpadalq_s16(_sum6, _s6);
                    _sum7 = vpadalq_s16(_sum7, _s7);

                    pA += 16;
                    pB += 128;
                }
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x8_t _pA = vld1_s8(pA);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);
                    int8x16_t _pB2 = vld1q_s8(pB + 32);
                    int8x16_t _pB3 = vld1q_s8(pB + 48);

                    int8x8_t _pA0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 0));
                    int8x8_t _pA1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 1));
                    int8x8_t _pA2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 2));
                    int8x8_t _pA3 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 3));
                    int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB0));
                    int16x8_t _s1 = vmull_s8(_pA0, vget_high_s8(_pB0));
                    int16x8_t _s2 = vmull_s8(_pA2, vget_low_s8(_pB2));
                    int16x8_t _s3 = vmull_s8(_pA2, vget_high_s8(_pB2));
                    _s0 = vmlal_s8(_s0, _pA1, vget_low_s8(_pB1));
                    _s1 = vmlal_s8(_s1, _pA1, vget_high_s8(_pB1));
                    _s2 = vmlal_s8(_s2, _pA3, vget_low_s8(_pB3));
                    _s3 = vmlal_s8(_s3, _pA3, vget_high_s8(_pB3));
                    _sum0 = vpadalq_s16(_sum0, _s0);
                    _sum1 = vpadalq_s16(_sum1, _s1);
                    _sum2 = vpadalq_s16(_sum2, _s2);
                    _sum3 = vpadalq_s16(_sum3, _s3);

                    pA += 8;
                    pB += 64;
                }
                _sum0 = vaddq_s32(_sum0, _sum2);
                _sum1 = vaddq_s32(_sum1, _sum3);
                _sum0 = vaddq_s32(_sum0, _sum4);
                _sum1 = vaddq_s32(_sum1, _sum5);
                _sum0 = vaddq_s32(_sum0, _sum6);
                _sum1 = vaddq_s32(_sum1, _sum7);
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; kk + 3 < max_kk; kk += 4)
            {
                int8x8_t _pA = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(vld1_s8(pA)), 0));
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB1 = vld1q_s8(pB + 16);

#if __ARM_FEATURE_DOTPROD
                _sum0 = vdotq_lane_s32(_sum0, _pB0, _pA, 0);
                _sum1 = vdotq_lane_s32(_sum1, _pB1, _pA, 0);
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 0));
                int8x8_t _pA1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 1));
                int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB0));
                int16x8_t _s1 = vmull_s8(_pA0, vget_high_s8(_pB0));
                _s0 = vmlal_s8(_s0, _pA1, vget_low_s8(_pB1));
                _s1 = vmlal_s8(_s1, _pA1, vget_high_s8(_pB1));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
#endif // __ARM_FEATURE_DOTPROD

                pA += 4;
                pB += 32;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                int8x8_t _pA = vreinterpret_s8_s16(vld1_dup_s16((const short*)pA));
                int8x16_t _pB = vld1q_s8(pB);

                int16x8_t _s0 = vmull_s8(_pA, vget_low_s8(_pB));
                int16x8_t _s1 = vmull_s8(_pA, vget_high_s8(_pB));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);

                pA += 2;
                pB += 16;
            }
            for (; kk < max_kk; kk += 1)
            {
                int8x8_t _pA = vld1_dup_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));

                pA += 1;
                pB += 8;
            }

            vst1q_s32(outptr, _sum0);
            vst1q_s32(outptr + 4, _sum1);

            outptr += 8;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0;

            if (k == 0)
            {
                _sum0 = vdupq_n_s32(0);
            }
            else
            {
                _sum0 = vld1q_s32(outptr);
            }

            const signed char* pA = pAT;
            int kk = 0;
#if __ARM_FEATURE_DOTPROD
            {
#if __ARM_FEATURE_MATMUL_INT8
                int32x4_t _sum00 = vdupq_n_s32(0);
                int32x4_t _sum01 = vdupq_n_s32(0);
#endif // __ARM_FEATURE_MATMUL_INT8
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x8_t _pA = vld1_s8(pA);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);

#if __ARM_FEATURE_MATMUL_INT8
                    int8x16_t _pAA = vcombine_s8(_pA, _pA);
                    _sum00 = vdotq_s32(_sum00, _pAA, _pB0);
                    _sum01 = vdotq_s32(_sum01, _pAA, _pB1);
#else  // __ARM_FEATURE_MATMUL_INT8
                    _sum0 = vdotq_lane_s32(_sum0, _pB0, _pA, 0);
                    _sum0 = vdotq_lane_s32(_sum0, _pB1, _pA, 1);
#endif // __ARM_FEATURE_MATMUL_INT8

                    pA += 8;
                    pB += 32;
                }
#if __ARM_FEATURE_MATMUL_INT8
                _sum0 = vaddq_s32(_sum0, vpaddq_s32(_sum00, _sum01));
#endif // __ARM_FEATURE_MATMUL_INT8
            }
#else  // __ARM_FEATURE_DOTPROD
            {
                int32x4_t _sum1 = vdupq_n_s32(0);
                int32x4_t _sum2 = vdupq_n_s32(0);
                int32x4_t _sum3 = vdupq_n_s32(0);
                for (; kk + 15 < max_kk; kk += 16)
                {
                    // TODO
                    // __builtin_prefetch(pA + 16);
                    // __builtin_prefetch(pB + 64);
                    int8x16_t _pA = vld1q_s8(pA);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);
                    int8x16_t _pB2 = vld1q_s8(pB + 32);
                    int8x16_t _pB3 = vld1q_s8(pB + 48);

                    int8x8_t _pA0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pA)), 0));
                    int8x8_t _pA1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pA)), 1));
                    int8x8_t _pA2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pA)), 2));
                    int8x8_t _pA3 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_low_s8(_pA)), 3));
                    int8x8_t _pA4 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pA)), 0));
                    int8x8_t _pA5 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pA)), 1));
                    int8x8_t _pA6 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pA)), 2));
                    int8x8_t _pA7 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vget_high_s8(_pA)), 3));
                    int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB0));
                    int16x8_t _s1 = vmull_s8(_pA2, vget_low_s8(_pB1));
                    int16x8_t _s2 = vmull_s8(_pA4, vget_low_s8(_pB2));
                    int16x8_t _s3 = vmull_s8(_pA6, vget_low_s8(_pB3));
                    _s0 = vmlal_s8(_s0, _pA1, vget_high_s8(_pB0));
                    _s1 = vmlal_s8(_s1, _pA3, vget_high_s8(_pB1));
                    _s2 = vmlal_s8(_s2, _pA5, vget_high_s8(_pB2));
                    _s3 = vmlal_s8(_s3, _pA7, vget_high_s8(_pB3));
                    _sum0 = vpadalq_s16(_sum0, _s0);
                    _sum1 = vpadalq_s16(_sum1, _s1);
                    _sum2 = vpadalq_s16(_sum2, _s2);
                    _sum3 = vpadalq_s16(_sum3, _s3);

                    pA += 16;
                    pB += 64;
                }
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x8_t _pA = vld1_s8(pA);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);

                    int8x8_t _pA0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 0));
                    int8x8_t _pA1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 1));
                    int8x8_t _pA2 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 2));
                    int8x8_t _pA3 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 3));
                    int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB0));
                    int16x8_t _s1 = vmull_s8(_pA2, vget_low_s8(_pB1));
                    _s0 = vmlal_s8(_s0, _pA1, vget_high_s8(_pB0));
                    _s1 = vmlal_s8(_s1, _pA3, vget_high_s8(_pB1));
                    _sum0 = vpadalq_s16(_sum0, _s0);
                    _sum1 = vpadalq_s16(_sum1, _s1);

                    pA += 8;
                    pB += 32;
                }
                _sum0 = vaddq_s32(_sum0, _sum1);
                _sum0 = vaddq_s32(_sum0, _sum2);
                _sum0 = vaddq_s32(_sum0, _sum3);
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; kk + 3 < max_kk; kk += 4)
            {
                int8x8_t _pA = vld1_s8(pA);
                int8x16_t _pB = vld1q_s8(pB);

#if __ARM_FEATURE_DOTPROD
                _sum0 = vdotq_lane_s32(_sum0, _pB, _pA, 0);
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _pA0 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 0));
                int8x8_t _pA1 = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(_pA), 1));
                int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB));
                _s0 = vmlal_s8(_s0, _pA1, vget_high_s8(_pB));
                _sum0 = vpadalq_s16(_sum0, _s0);
#endif // __ARM_FEATURE_DOTPROD

                pA += 4;
                pB += 16;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                int8x8_t _pA = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vld1_s8(pA)), 0));
                int8x8_t _pB = vld1_s8(pB);

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vpadalq_s16(_sum0, _s0);

                pA += 2;
                pB += 8;
            }
            for (; kk < max_kk; kk += 1)
            {
                int8x8_t _pA = vld1_dup_s8(pA);
                int8x8_t _pB = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(vld1_s8(pB)), 0));

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));

                pA += 1;
                pB += 4;
            }

            vst1q_s32(outptr, _sum0);

            outptr += 4;
        }
#endif // __ARM_NEON
        for (; jj + 1 < max_jj; jj += 2)
        {
#if __ARM_NEON
            int32x2_t _sum;

            if (k == 0)
            {
                _sum = vdup_n_s32(0);
            }
            else
            {
                _sum = vld1_s32(outptr);
            }
#else  // __ARM_NEON
            int sum0;
            int sum1;

            if (k == 0)
            {
                sum0 = 0;
                sum1 = 0;
            }
            else
            {
                sum0 = outptr[0];
                sum1 = outptr[1];
            }
#endif // __ARM_NEON

            const signed char* pA = pAT;
            int kk = 0;
#if __ARM_NEON
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
            {
                int32x4_t _sum0 = vdupq_n_s32(0);
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x8_t _pA = vld1_s8(pA);
                    int8x16_t _pB = vld1q_s8(pB);

                    int8x16_t _pAA = vcombine_s8(_pA, _pA);

                    _sum0 = vdotq_s32(_sum0, _pAA, _pB);

                    pA += 8;
                    pB += 16;
                }
                int32x2_t _ss = vpadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0));
                _sum = vadd_s32(_sum, _ss);
            }
#else  // __ARM_FEATURE_MATMUL_INT8
            for (; kk + 7 < max_kk; kk += 8)
            {
                int8x8_t _pA = vld1_s8(pA);
                int8x16_t _pB = vld1q_s8(pB);

                _sum = vdot_lane_s32(_sum, vget_low_s8(_pB), _pA, 0);
                _sum = vdot_lane_s32(_sum, vget_high_s8(_pB), _pA, 1);

                pA += 8;
                pB += 16;
            }
#endif // __ARM_FEATURE_MATMUL_INT8
            for (; kk + 3 < max_kk; kk += 4)
            {
                int8x8_t _pA = vreinterpret_s8_s32(vld1_dup_s32((const int*)pA));
                int8x8_t _pB = vld1_s8(pB);

                _sum = vdot_s32(_sum, _pA, _pB);

                pA += 4;
                pB += 8;
            }
#else  // __ARM_FEATURE_DOTPROD
            {
                int32x4_t _sum0 = vdupq_n_s32(0);
                int32x4_t _sum1 = vdupq_n_s32(0);
                for (; kk + 15 < max_kk; kk += 16)
                {
                    int8x16_t _pA = vld1q_s8(pA);
                    int8x16_t _pB0 = vld1q_s8(pB);
                    int8x16_t _pB1 = vld1q_s8(pB + 16);

                    int16x8x2_t _pAA = vzipq_s16(vreinterpretq_s16_s8(_pA), vreinterpretq_s16_s8(_pA));

                    int8x8_t _pA0 = vreinterpret_s8_s16(vget_low_s16(_pAA.val[0]));
                    int8x8_t _pA1 = vreinterpret_s8_s16(vget_high_s16(_pAA.val[0]));
                    int8x8_t _pA2 = vreinterpret_s8_s16(vget_low_s16(_pAA.val[1]));
                    int8x8_t _pA3 = vreinterpret_s8_s16(vget_high_s16(_pAA.val[1]));

                    int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB0));
                    int16x8_t _s1 = vmull_s8(_pA2, vget_low_s8(_pB1));
                    _s0 = vmlal_s8(_s0, _pA1, vget_high_s8(_pB0));
                    _s1 = vmlal_s8(_s1, _pA3, vget_high_s8(_pB1));
                    _sum0 = vpadalq_s16(_sum0, _s0);
                    _sum1 = vpadalq_s16(_sum1, _s1);

                    pA += 16;
                    pB += 32;
                }
                _sum0 = vaddq_s32(_sum0, _sum1);
                for (; kk + 7 < max_kk; kk += 8)
                {
                    int8x8_t _pA = vld1_s8(pA);
                    int8x16_t _pB = vld1q_s8(pB);

                    int16x4x2_t _pAA = vzip_s16(vreinterpret_s16_s8(_pA), vreinterpret_s16_s8(_pA));

                    int8x8_t _pA0 = vreinterpret_s8_s16(_pAA.val[0]);
                    int8x8_t _pA1 = vreinterpret_s8_s16(_pAA.val[1]);

                    int16x8_t _s0 = vmull_s8(_pA0, vget_low_s8(_pB));
                    _s0 = vmlal_s8(_s0, _pA1, vget_high_s8(_pB));
                    _sum0 = vpadalq_s16(_sum0, _s0);

                    pA += 8;
                    pB += 16;
                }
                for (; kk + 3 < max_kk; kk += 4)
                {
                    int8x8_t _pA = vreinterpret_s8_s32(vdup_lane_s32(vreinterpret_s32_s8(vld1_s8(pA)), 0));
                    int8x8_t _pB = vld1_s8(pB);

                    _pA = vreinterpret_s8_s16(vzip_s16(vreinterpret_s16_s8(_pA), vreinterpret_s16_s8(_pA)).val[0]);

                    int16x8_t _s0 = vmull_s8(_pA, _pB);
                    _sum0 = vpadalq_s16(_sum0, _s0);

                    pA += 4;
                    pB += 8;
                }
                int32x2_t _ss = vadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0));
                _sum = vadd_s32(_sum, _ss);
            }
#endif // __ARM_FEATURE_DOTPROD
            int sum0 = vget_lane_s32(_sum, 0);
            int sum1 = vget_lane_s32(_sum, 1);
            for (; kk + 1 < max_kk; kk += 2)
            {
                sum0 += pA[0] * pB[0];
                sum0 += pA[1] * pB[1];
                sum1 += pA[0] * pB[2];
                sum1 += pA[1] * pB[3];
                pA += 2;
                pB += 4;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk += 1)
            {
                sum0 += pA[0] * pB[0];
                sum1 += pA[0] * pB[1];
                pA += 1;
                pB += 2;
            }

            outptr[0] = sum0;
            outptr[1] = sum1;

            outptr += 2;
        }
        for (; jj < max_jj; jj += 1)
        {
            int sum;

            if (k == 0)
            {
                sum = 0;
            }
            else
            {
                sum = outptr[0];
            }

            const signed char* pA = pAT;
            int kk = 0;
#if __ARM_NEON
            int32x4_t _sum = vdupq_n_s32(0);
            int32x4_t _sum1 = vdupq_n_s32(0);
            for (; kk + 31 < max_kk; kk += 32)
            {
                int8x16_t _pA0 = vld1q_s8(pA);
                int8x16_t _pA1 = vld1q_s8(pA + 16);
                int8x16_t _pB0 = vld1q_s8(pB);
                int8x16_t _pB1 = vld1q_s8(pB + 16);

#if __ARM_FEATURE_DOTPROD
                _sum = vdotq_s32(_sum, _pA0, _pB0);
                _sum1 = vdotq_s32(_sum1, _pA1, _pB1);
#else  // __ARM_FEATURE_DOTPROD
                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA0), vget_low_s8(_pB0));
                int16x8_t _s1 = vmull_s8(vget_low_s8(_pA1), vget_low_s8(_pB1));
                _s0 = vmlal_s8(_s0, vget_high_s8(_pA0), vget_high_s8(_pB0));
                _s1 = vmlal_s8(_s1, vget_high_s8(_pA1), vget_high_s8(_pB1));
                _sum = vpadalq_s16(_sum, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
#endif // __ARM_FEATURE_DOTPROD

                pA += 32;
                pB += 32;
            }
            _sum = vaddq_s32(_sum, _sum1);
            for (; kk + 15 < max_kk; kk += 16)
            {
                int8x16_t _pA = vld1q_s8(pA);
                int8x16_t _pB = vld1q_s8(pB);

#if __ARM_FEATURE_DOTPROD
                _sum = vdotq_s32(_sum, _pA, _pB);
#else  // __ARM_FEATURE_DOTPROD
                int16x8_t _s0 = vmull_s8(vget_low_s8(_pA), vget_low_s8(_pB));
                _s0 = vmlal_s8(_s0, vget_high_s8(_pA), vget_high_s8(_pB));
                _sum = vpadalq_s16(_sum, _s0);
#endif // __ARM_FEATURE_DOTPROD

                pA += 16;
                pB += 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                int8x8_t _pA = vld1_s8(pA);
                int8x8_t _pB = vld1_s8(pB);

                int16x8_t _s0 = vmull_s8(_pA, _pB);
                _sum = vpadalq_s16(_sum, _s0);

                pA += 8;
                pB += 8;
            }
#if __aarch64__
            sum += vaddvq_s32(_sum);
#else
            int32x2_t _ss = vadd_s32(vget_low_s32(_sum), vget_high_s32(_sum));
            _ss = vpadd_s32(_ss, _ss);
            sum += vget_lane_s32(_ss, 0);
#endif
#endif // __ARM_NEON
            for (; kk < max_kk; kk += 1)
            {
                sum += pA[0] * pB[0];
                pA += 1;
                pB += 1;
            }

            outptr[0] = sum;

            outptr += 1;
        }

        pAT += max_kk;
    }
}

static void get_optimal_tile_mnk_int8(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT)
{
    // Chooses the int8 gemm tile sizes TILE_M / TILE_N / TILE_K.
    //
    //   M, N, K            problem extents; an extent <= 0 skips the balancing step for that axis
    //   constant_TILE_*    when > 0, replaces the computed tile (after rounding up to the axis alignment)
    //   nT                 thread count; 0 is replaced by get_physical_big_cpu_count()
    //
    // TILE_M and TILE_K are always multiples of 8. TILE_N is a multiple of 8
    // on aarch64 and a multiple of 4 on every other target.
#if __aarch64__
    const int align_n = 8;
#else
    const int align_n = 4;
#endif

    // the level-2 cache size is the budget every estimate below starts from
    const size_t cache_bytes = get_cpu_level2_cache_size();

    if (nT == 0)
    {
        nT = get_physical_big_cpu_count();
    }

    // initial square edge: two signed char operands plus one int accumulator per element
    int edge = (int)sqrtf((float)cache_bytes / (2 * sizeof(signed char) + sizeof(int)));

    TILE_M = std::max(8, edge / 8 * 8);
    TILE_N = std::max(align_n, edge / align_n * align_n);
    TILE_K = std::max(8, edge / 8 * 8);

    if (K > 0)
    {
        // spread K evenly over the number of tiles it needs, rounded up to 8
        const int k_tiles = (K + TILE_K - 1) / TILE_K;
        const int k_even = ((K + k_tiles - 1) / k_tiles + 7) / 8 * 8;
        TILE_K = std::min(TILE_K, k_even);

        if (k_tiles == 1)
        {
            // a single K tile covers everything, so re-derive the M/N edge from
            // the cache size and the final TILE_K
            edge = (int)((float)cache_bytes / 2 / sizeof(signed char) / TILE_K);

            TILE_M = std::max(8, edge / 8 * 8);
            TILE_N = std::max(align_n, edge / align_n * align_n);
        }
    }

    // widen TILE_M by the number of threads, capped at the physical cpu count
    TILE_M *= std::min(nT, get_physical_cpu_count());

    if (M > 0)
    {
        const int m_tiles = (M + TILE_M - 1) / TILE_M;
        const int m_even = ((M + m_tiles - 1) / m_tiles + 7) / 8 * 8;
        TILE_M = std::min(TILE_M, m_even);
    }

    if (N > 0)
    {
        const int n_tiles = (N + TILE_N - 1) / TILE_N;
        const int n_even = ((N + n_tiles - 1) / n_tiles + align_n - 1) / align_n * align_n;
        TILE_N = std::min(TILE_N, n_even);
    }

    if (nT > 1)
    {
        // give each thread its own share of TILE_M, at least 1 before rounding up to 8
        const int m_share = (std::max(1, TILE_M / nT) + 7) / 8 * 8;
        TILE_M = std::min(TILE_M, m_share);
    }

    // caller supplied tile sizes always win over the computed ones
    if (constant_TILE_M > 0)
    {
        TILE_M = (constant_TILE_M + 7) / 8 * 8;
    }

    if (constant_TILE_N > 0)
    {
        TILE_N = (constant_TILE_N + align_n - 1) / align_n * align_n;
    }

    if (constant_TILE_K > 0)
    {
        TILE_K = (constant_TILE_K + 7) / 8 * 8;
    }
}


================================================
FILE: src/layer/arm/gemm_int8_bf16s.h
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Forward declarations of the *_i8mm variants of the bf16 -> int8 packing
// routines. They are only declared when runtime cpu dispatch is enabled
// (NCNN_RUNTIME_CPU && NCNN_ARM84I8MM) on aarch64 and this translation unit is
// itself built without __ARM_FEATURE_MATMUL_INT8.
// pack_A_tile_bf16_to_int8() in this header forwards to
// pack_A_tile_bf16_to_int8_i8mm() when ncnn::cpu_support_arm_i8mm() is true.
// NOTE(review): the definitions are not part of this header; presumably they
// come from a source file compiled with i8mm enabled - confirm.
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
void pack_A_tile_bf16_to_int8_i8mm(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales);
void transpose_pack_A_tile_bf16_to_int8_i8mm(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales);
void pack_B_tile_bf16_to_int8_i8mm(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale);
void transpose_pack_B_tile_bf16_to_int8_i8mm(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale);
#endif

// Forward declarations of the *_asimddp variants. They are only declared when
// runtime cpu dispatch is enabled (NCNN_RUNTIME_CPU && NCNN_ARM82DOT) on
// aarch64 and this translation unit is built with neither
// __ARM_FEATURE_DOTPROD nor __ARM_FEATURE_MATMUL_INT8.
// pack_A_tile_bf16_to_int8() in this header forwards to
// pack_A_tile_bf16_to_int8_asimddp() when ncnn::cpu_support_arm_asimddp() is true.
// Unlike the i8mm group, this group also declares the two int32 -> bf16
// output unpack routines.
// NOTE(review): the definitions are not part of this header; presumably they
// come from a source file compiled with dotprod enabled - confirm.
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
void pack_A_tile_bf16_to_int8_asimddp(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales);
void transpose_pack_A_tile_bf16_to_int8_asimddp(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales);
void pack_B_tile_bf16_to_int8_asimddp(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale);
void transpose_pack_B_tile_bf16_to_int8_asimddp(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale);
void unpack_output_tile_int32_to_bf16_asimddp(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta);
void transpose_unpack_output_tile_int32_to_bf16_asimddp(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta);
#endif

static void compute_A_tile_bf16_int8_scales(const Mat& A, Mat& scales, float B_scale, Mat& out_descales, int i, int max_ii)
{
    // For the max_ii entries starting at index i, scan the K bf16 values that
    // belong to each entry for the largest magnitude (absmax) and store
    //   scales[i + ii]       = 127 / absmax
    //   out_descales[i + ii] = absmax / (127 * B_scale)
    // The elempack == 4 path (NEON only) handles four entries per iteration
    // and only covers complete groups of four. An absmax of zero is not
    // special-cased, so the divisions are performed as-is.
    const int elempack = A.elempack;
    const size_t row_stride = A.dims == 3 ? A.cstep : (size_t)A.w;
    const int K = A.w;

    const float descale_div = 127.f * B_scale;

    const unsigned short* A_base = (const unsigned short*)A;
    float* scale_out = (float*)scales + i;
    float* descale_out = (float*)out_descales + i;

#if __ARM_NEON
    if (elempack == 4)
    {
#if __aarch64__
        float32x4_t _c127 = vdupq_n_f32(127.f);
        float32x4_t _cdiv = vdupq_n_f32(descale_div);
#endif

        for (int ii = 0; ii + 3 < max_ii; ii += 4)
        {
            const unsigned short* src = A_base + (i + ii) * row_stride;

            // four independent running maxima, one lane per packed entry
            float32x4_t _m0 = vdupq_n_f32(0.f);
            float32x4_t _m1 = vdupq_n_f32(0.f);
            float32x4_t _m2 = vdupq_n_f32(0.f);
            float32x4_t _m3 = vdupq_n_f32(0.f);

            int kk = 0;
            for (; kk + 3 < K; kk += 4)
            {
                // 4 k-steps x 4 packed lanes = 16 bf16 values
                uint16x8_t _a = vld1q_u16(src);
                uint16x8_t _b = vld1q_u16(src + 8);
                _m0 = vmaxq_f32(_m0, vabsq_f32(bfloat2float(vget_low_u16(_a))));
                _m1 = vmaxq_f32(_m1, vabsq_f32(bfloat2float(vget_high_u16(_a))));
                _m2 = vmaxq_f32(_m2, vabsq_f32(bfloat2float(vget_low_u16(_b))));
                _m3 = vmaxq_f32(_m3, vabsq_f32(bfloat2float(vget_high_u16(_b))));
                src += 16;
            }
            _m0 = vmaxq_f32(_m0, _m2);
            _m1 = vmaxq_f32(_m1, _m3);
            for (; kk + 1 < K; kk += 2)
            {
                uint16x8_t _a = vld1q_u16(src);
                _m0 = vmaxq_f32(_m0, vabsq_f32(bfloat2float(vget_low_u16(_a))));
                _m1 = vmaxq_f32(_m1, vabsq_f32(bfloat2float(vget_high_u16(_a))));
                src += 8;
            }
            _m0 = vmaxq_f32(_m0, _m1);
            for (; kk < K; kk++)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(bfloat2float(vld1_u16(src))));
                src += 4;
            }

#if __aarch64__
            vst1q_f32(scale_out + ii, vdivq_f32(_c127, _m0));
            vst1q_f32(descale_out + ii, vdivq_f32(_m0, _cdiv));
#else
            // no vector divide is used on this target: spill the four maxima
            // and divide lane by lane
            float lanes[4];
            vst1q_f32(lanes, _m0);

            for (int l = 0; l < 4; l++)
            {
                scale_out[ii + l] = 127.f / lanes[l];
            }
            for (int l = 0; l < 4; l++)
            {
                descale_out[ii + l] = lanes[l] / descale_div;
            }
#endif
        }
    }
#endif // __ARM_NEON
    if (elempack == 1)
    {
        for (int ii = 0; ii < max_ii; ii++)
        {
            const unsigned short* src = A_base + (i + ii) * row_stride;

            float absmax = 0.f;
            int kk = 0;
#if __ARM_NEON
            float32x4_t _m0 = vdupq_n_f32(0.f);
            float32x4_t _m1 = vdupq_n_f32(0.f);
            float32x4_t _m2 = vdupq_n_f32(0.f);
            float32x4_t _m3 = vdupq_n_f32(0.f);
            for (; kk + 15 < K; kk += 16)
            {
                uint16x8_t _a = vld1q_u16(src);
                uint16x8_t _b = vld1q_u16(src + 8);
                _m0 = vmaxq_f32(_m0, vabsq_f32(bfloat2float(vget_low_u16(_a))));
                _m1 = vmaxq_f32(_m1, vabsq_f32(bfloat2float(vget_high_u16(_a))));
                _m2 = vmaxq_f32(_m2, vabsq_f32(bfloat2float(vget_low_u16(_b))));
                _m3 = vmaxq_f32(_m3, vabsq_f32(bfloat2float(vget_high_u16(_b))));
                src += 16;
            }
            _m0 = vmaxq_f32(_m0, _m2);
            _m1 = vmaxq_f32(_m1, _m3);
            for (; kk + 7 < K; kk += 8)
            {
                uint16x8_t _a = vld1q_u16(src);
                _m0 = vmaxq_f32(_m0, vabsq_f32(bfloat2float(vget_low_u16(_a))));
                _m1 = vmaxq_f32(_m1, vabsq_f32(bfloat2float(vget_high_u16(_a))));
                src += 8;
            }
            _m0 = vmaxq_f32(_m0, _m1);
            for (; kk + 3 < K; kk += 4)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(bfloat2float(vld1_u16(src))));
                src += 4;
            }
            // fold the four lanes down to a single scalar maximum
            float32x2_t _half = vmax_f32(vget_low_f32(_m0), vget_high_f32(_m0));
            absmax = std::max(absmax, std::max(vget_lane_f32(_half, 0), vget_lane_f32(_half, 1)));
#endif // __ARM_NEON
            // scalar tail (the whole row when NEON is unavailable)
            for (; kk < K; kk++)
            {
                absmax = std::max(absmax, (float)fabsf(bfloat16_to_float32(src[0])));
                src++;
            }

            scale_out[ii] = 127.f / absmax;
            descale_out[ii] = absmax / descale_div;
        }
    }
}

static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        pack_A_tile_bf16_to_int8_i8mm(A, AT, i, max_ii, k, max_kk, scales);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        pack_A_tile_bf16_to_int8_asimddp(A, AT, i, max_ii, k, max_kk, scales);
        return;
    }
#endif

    const int elempack = A.elempack;
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;

    // NCNN_LOGE("pack_A_tile_bf16_to_int8 %d %d", max_ii, elempack);

    signed char* pp = AT;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k * elempack;

        float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii);
        float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4);

        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x8x4_t _p = vld4q_u16(p0);
                uint16x8x4_t _q = vld4q_u16(p0 + A_hstep * 4);

                float32x4_t _p0 = vmulq_laneq_f32(bfloat2float(vget_low_u16(_p.val[0])), _scale0, 0);
                float32x4_t _p1 = vmulq_laneq_f32(bfloat2float(vget_low_u16(_p.val[1])), _scale0, 1);
                float32x4_t _p2 = vmulq_laneq_f32(bfloat2float(vget_low_u16(_p.val[2])), _scale0, 2);
                float32x4_t _p3 = vmulq_laneq_f32(bfloat2float(vget_low_u16(_p.val[3])), _scale0, 3);
                float32x4_t _p4 = vmulq_laneq_f32(bfloat2float(vget_high_u16(_p.val[0])), _scale0, 0);
                float32x4_t _p5 = vmulq_laneq_f32(bfloat2float(vget_high_u16(_p.val[1])), _scale0, 1);
                float32x4_t _p6 = vmulq_laneq_f32(bfloat2float(vget_high_u16(_p.val[2])), _scale0, 2);
                float32x4_t _p7 = vmulq_laneq_f32(bfloat2float(vget_high_u16(_p.val[3])), _scale0, 3);
                float32x4_t _p8 = vmulq_laneq_f32(bfloat2float(vget_low_u16(_q.val[0])), _scale1, 0);
                float32x4_t _p9 = vmulq_laneq_f32(bfloat2float(vget_low_u16(_q.val[1])), _scale1, 1);
                float32x4_t _pa = vmulq_laneq_f32(bfloat2float(vget_low_u16(_q.val[2])), _scale1, 2);
                float32x4_t _pb = vmulq_laneq_f32(bfloat2float(vget_low_u16(_q.val[3])), _scale1, 3);
                float32x4_t _pc = vmulq_laneq_f32(bfloat2float(vget_high_u16(_q.val[0])), _scale1, 0);
                float32x4_t _pd = vmulq_laneq_f32(bfloat2float(vget_high_u16(_q.val[1])), _scale1, 1);
                float32x4_t _pe = vmulq_laneq_f32(bfloat2float(vget_high_u16(_q.val[2])), _scale1, 2);
                float32x4_t _pf = vmulq_laneq_f32(bfloat2float(vget_high_u16(_q.val[3])), _scale1, 3);

#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
                int8x8_t _r4 = float2int8(_p8, _pc);
                int8x8_t _r5 = float2int8(_p9, _pd);
                int8x8_t _r6 = float2int8(_pa, _pe);
                int8x8_t _r7 = float2int8(_pb, _pf);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p8, _p9);
                int8x8_t _r3 = float2int8(_pa, _pb);
                int8x8_t _r4 = float2int8(_p4, _p5);
                int8x8_t _r5 = float2int8(_p6, _p7);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);
#endif // __ARM_FEATURE_MATMUL_INT8

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                uint16x8_t _t = vld1q_u16(p0 + A_hstep * 4);
                uint16x8_t _u = vld1q_u16(p0 + A_hstep * 4 + 8);
                uint16x8_t _v = vld1q_u16(p0 + A_hstep * 4 + 16);
                uint16x8_t _w = vld1q_u16(p0 + A_hstep * 4 + 24);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));
                float32x4_t _p8 = bfloat2float(vget_low_u16(_t));
                float32x4_t _p9 = bfloat2float(vget_high_u16(_t));
                float32x4_t _pa = bfloat2float(vget_low_u16(_u));
                float32x4_t _pb = bfloat2float(vget_high_u16(_u));
                float32x4_t _pc = bfloat2float(vget_low_u16(_v));
                float32x4_t _pd = bfloat2float(vget_high_u16(_v));
                float32x4_t _pe = bfloat2float(vget_low_u16(_w));
                float32x4_t _pf = bfloat2float(vget_high_u16(_w));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale0);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale0);
                _p4 = vmulq_f32(_p4, _scale0);
                _p5 = vmulq_f32(_p5, _scale0);
                _p6 = vmulq_f32(_p6, _scale0);
                _p7 = vmulq_f32(_p7, _scale0);
                _p8 = vmulq_f32(_p8, _scale1);
                _p9 = vmulq_f32(_p9, _scale1);
                _pa = vmulq_f32(_pa, _scale1);
                _pb = vmulq_f32(_pb, _scale1);
                _pc = vmulq_f32(_pc, _scale1);
                _pd = vmulq_f32(_pd, _scale1);
                _pe = vmulq_f32(_pe, _scale1);
                _pf = vmulq_f32(_pf, _scale1);

                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p8), float2int8(_p2, _pa));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p9), float2int8(_p3, _pb));
                int8x16x2_t _r23;
                _r23.val[0] = vcombine_s8(float2int8(_p4, _pc), float2int8(_p6, _pe));
                _r23.val[1] = vcombine_s8(float2int8(_p5, _pd), float2int8(_p7, _pf));

                vst2q_s8(pp, _r01);
                vst2q_s8(pp + 32, _r23);
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += 32;
            }
            // K tail, 4 k-steps per iteration: reads 16 values at p0 and 16 at
            // p0 + A_hstep * 4 (4 values per k-step from each group) and writes
            // 32 int8 values to pp.
            // NOTE(review): layout remarks below assume float2int8(a, b) returns the
            // 4 values of a in lanes 0-3 and those of b in lanes 4-7 - confirm
            // against its definition.
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                // vld4_u16 de-interleaves with stride 4: val[n] gathers element n of
                // each of the four consecutive 4-element groups.
                uint16x4x4_t _p = vld4_u16(p0);
                uint16x4x4_t _q = vld4_u16(p0 + A_hstep * 4);

                // val[n] is multiplied by lane n of the matching scale vector
                // (_scale0 for the data at p0, _scale1 for p0 + A_hstep * 4).
                float32x4_t _p0 = vmulq_laneq_f32(bfloat2float(_p.val[0]), _scale0, 0);
                float32x4_t _p1 = vmulq_laneq_f32(bfloat2float(_p.val[1]), _scale0, 1);
                float32x4_t _p2 = vmulq_laneq_f32(bfloat2float(_p.val[2]), _scale0, 2);
                float32x4_t _p3 = vmulq_laneq_f32(bfloat2float(_p.val[3]), _scale0, 3);
                float32x4_t _p4 = vmulq_laneq_f32(bfloat2float(_q.val[0]), _scale1, 0);
                float32x4_t _p5 = vmulq_laneq_f32(bfloat2float(_q.val[1]), _scale1, 1);
                float32x4_t _p6 = vmulq_laneq_f32(bfloat2float(_q.val[2]), _scale1, 2);
                float32x4_t _p7 = vmulq_laneq_f32(bfloat2float(_q.val[3]), _scale1, 3);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);

                // Contiguous stores: the 4 results of each val[n] stay adjacent.
                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                // Contiguous loads: every 4-value vector is one k-step of a group
                // and is multiplied element-wise by that group's scale vector.
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + A_hstep * 4);
                uint16x8_t _s = vld1q_u16(p0 + A_hstep * 4 + 8);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale0);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale0);
                _p4 = vmulq_f32(_p4, _scale1);
                _p5 = vmulq_f32(_p5, _scale1);
                _p6 = vmulq_f32(_p6, _scale1);
                _p7 = vmulq_f32(_p7, _scale1);

                // val[0] carries the even k-steps (0 and 2), val[1] the odd ones
                // (1 and 3); vst2q_s8 then interleaves the two byte streams.
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p4), float2int8(_p2, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p5), float2int8(_p3, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += 16;
            }
            // K tail, 2 k-steps per iteration: 8 values per group in, 16 int8 out.
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + A_hstep * 4);

                // _p0/_p1 are the first k-step of the two groups, _p0n/_p1n the next.
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p0n = bfloat2float(vget_high_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p1n = bfloat2float(vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale0);
                _p0n = vmulq_f32(_p0n, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p1n = vmulq_f32(_p1n, _scale1);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p1);
                _r01.val[1] = float2int8(_p0n, _p1n);

                // vst2_s8 interleaves val[0] and val[1] byte by byte, pairing the
                // two k-steps of each element.
                vst2_s8(pp, _r01);

                pp += 16;
                p0 += 8;
            }
            // K tail, single k-step: 4 values per group in, 8 int8 out.
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + A_hstep * 4));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += 4;
            }
        }
        // Unpacked source (elempack == 1): 8 rows, A_hstep elements apart.
        // Row n (0-3) is scaled by lane n of _scale0, row 4 + n by lane n of _scale1.
        // NOTE(review): layout remarks below assume float2int8(a, b) returns the
        // 4 values of a in lanes 0-3 and those of b in lanes 4-7 - confirm
        // against its definition.
        if (elempack == 1)
        {
            int kk = 0;
            // 8 k-steps per iteration: 8 values from each of 8 rows -> 64 int8.
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + A_hstep);
                uint16x8_t _r = vld1q_u16(p0 + A_hstep * 2);
                uint16x8_t _s = vld1q_u16(p0 + A_hstep * 3);
                uint16x8_t _t = vld1q_u16(p0 + A_hstep * 4);
                uint16x8_t _u = vld1q_u16(p0 + A_hstep * 5);
                uint16x8_t _v = vld1q_u16(p0 + A_hstep * 6);
                uint16x8_t _w = vld1q_u16(p0 + A_hstep * 7);
                // Each row yields two vectors: k 0-3 (even name) and k 4-7 (odd name).
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));
                float32x4_t _p8 = bfloat2float(vget_low_u16(_t));
                float32x4_t _p9 = bfloat2float(vget_high_u16(_t));
                float32x4_t _pa = bfloat2float(vget_low_u16(_u));
                float32x4_t _pb = bfloat2float(vget_high_u16(_u));
                float32x4_t _pc = bfloat2float(vget_low_u16(_v));
                float32x4_t _pd = bfloat2float(vget_high_u16(_v));
                float32x4_t _pe = bfloat2float(vget_low_u16(_w));
                float32x4_t _pf = bfloat2float(vget_high_u16(_w));

                // Multiply every row by its own scalar scale. vmulq_laneq_f32 exists
                // only on AArch64; the 32-bit path picks the same lane through
                // vget_low_f32 / vget_high_f32 + vmulq_lane_f32.
#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale0, 0);
                _p2 = vmulq_laneq_f32(_p2, _scale0, 1);
                _p3 = vmulq_laneq_f32(_p3, _scale0, 1);
                _p4 = vmulq_laneq_f32(_p4, _scale0, 2);
                _p5 = vmulq_laneq_f32(_p5, _scale0, 2);
                _p6 = vmulq_laneq_f32(_p6, _scale0, 3);
                _p7 = vmulq_laneq_f32(_p7, _scale0, 3);
                _p8 = vmulq_laneq_f32(_p8, _scale1, 0);
                _p9 = vmulq_laneq_f32(_p9, _scale1, 0);
                _pa = vmulq_laneq_f32(_pa, _scale1, 1);
                _pb = vmulq_laneq_f32(_pb, _scale1, 1);
                _pc = vmulq_laneq_f32(_pc, _scale1, 2);
                _pd = vmulq_laneq_f32(_pd, _scale1, 2);
                _pe = vmulq_laneq_f32(_pe, _scale1, 3);
                _pf = vmulq_laneq_f32(_pf, _scale1, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 0);
                _p2 = vmulq_lane_f32(_p2, vget_low_f32(_scale0), 1);
                _p3 = vmulq_lane_f32(_p3, vget_low_f32(_scale0), 1);
                _p4 = vmulq_lane_f32(_p4, vget_high_f32(_scale0), 0);
                _p5 = vmulq_lane_f32(_p5, vget_high_f32(_scale0), 0);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale0), 1);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale0), 1);
                _p8 = vmulq_lane_f32(_p8, vget_low_f32(_scale1), 0);
                _p9 = vmulq_lane_f32(_p9, vget_low_f32(_scale1), 0);
                _pa = vmulq_lane_f32(_pa, vget_low_f32(_scale1), 1);
                _pb = vmulq_lane_f32(_pb, vget_low_f32(_scale1), 1);
                _pc = vmulq_lane_f32(_pc, vget_high_f32(_scale1), 0);
                _pd = vmulq_lane_f32(_pd, vget_high_f32(_scale1), 0);
                _pe = vmulq_lane_f32(_pe, vget_high_f32(_scale1), 1);
                _pf = vmulq_lane_f32(_pf, vget_high_f32(_scale1), 1);
#endif

                // The output ordering depends on the instruction set targeted at
                // compile time; three layouts are produced below.
#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                // 8 consecutive k values of one row per 8-byte group, rows 0..7.
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);
#else  // __ARM_FEATURE_MATMUL_INT8
                // 4 consecutive k values per row: k 0-3 of rows 0..7 first
                // (_r0.._r3), then k 4-7 of rows 0..7 (_r4.._r7).
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p8, _pa);
                int8x8_t _r3 = float2int8(_pc, _pe);
                int8x8_t _r4 = float2int8(_p1, _p3);
                int8x8_t _r5 = float2int8(_p5, _p7);
                int8x8_t _r6 = float2int8(_p9, _pb);
                int8x8_t _r7 = float2int8(_pd, _pf);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                // Treat each pair of adjacent bytes as one 16-bit lane and use
                // vuzp_s16 to split even/odd lanes, so the output is grouped as
                // k-pairs: (k0,k1) for rows 0..7, then (k2,k3), (k4,k5), (k6,k7).
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p4, _p6));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p8, _pa));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_pc, _pe));
                int16x4_t _t4 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4_t _t5 = vreinterpret_s16_s8(float2int8(_p5, _p7));
                int16x4_t _t6 = vreinterpret_s16_s8(float2int8(_p9, _pb));
                int16x4_t _t7 = vreinterpret_s16_s8(float2int8(_pd, _pf));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int16x4x2_t _t45 = vuzp_s16(_t4, _t5);
                int16x4x2_t _t67 = vuzp_s16(_t6, _t7);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
                int8x8_t _r4 = vreinterpret_s8_s16(_t45.val[0]);
                int8x8_t _r5 = vreinterpret_s8_s16(_t67.val[0]);
                int8x8_t _r6 = vreinterpret_s8_s16(_t45.val[1]);
                int8x8_t _r7 = vreinterpret_s8_s16(_t67.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));

                pp += 64;
                p0 += 8;
            }
            // 4 k-steps per iteration: 4 values from each of 8 rows -> 32 int8.
            for (; kk + 3 < max_kk; kk += 4)
            {
                // _pN holds k 0-3 of row N.
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + A_hstep));
                float32x4_t _p2 = bfloat2float(vld1_u16(p0 + A_hstep * 2));
                float32x4_t _p3 = bfloat2float(vld1_u16(p0 + A_hstep * 3));
                float32x4_t _p4 = bfloat2float(vld1_u16(p0 + A_hstep * 4));
                float32x4_t _p5 = bfloat2float(vld1_u16(p0 + A_hstep * 5));
                float32x4_t _p6 = bfloat2float(vld1_u16(p0 + A_hstep * 6));
                float32x4_t _p7 = bfloat2float(vld1_u16(p0 + A_hstep * 7));

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale0, 3);
                _p4 = vmulq_laneq_f32(_p4, _scale1, 0);
                _p5 = vmulq_laneq_f32(_p5, _scale1, 1);
                _p6 = vmulq_laneq_f32(_p6, _scale1, 2);
                _p7 = vmulq_laneq_f32(_p7, _scale1, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
                _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
                _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
#endif

#if __ARM_FEATURE_DOTPROD
                // 4 consecutive k values per row, rows 0..7.
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#else  // __ARM_FEATURE_DOTPROD
                // Regroup as k-pairs: (k0,k1) of rows 0..7, then (k2,k3) of rows 0..7.
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p4, _p5));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p6, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += 4;
            }
            // 2 k-steps per iteration: the row stride rules out a vector load, so
            // the two values of each row are gathered lane by lane.
            for (; kk + 1 < max_kk; kk += 2)
            {
                // _p: rows 0-3 as (k0,k1) pairs; _q: rows 4-7 as (k0,k1) pairs.
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep + 1], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 2 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 3 + 1], _p, 7);
                uint16x8_t _q = uint16x8_t();
                _q = vsetq_lane_u16(p0[A_hstep * 4], _q, 0);
                _q = vsetq_lane_u16(p0[A_hstep * 4 + 1], _q, 1);
                _q = vsetq_lane_u16(p0[A_hstep * 5], _q, 2);
                _q = vsetq_lane_u16(p0[A_hstep * 5 + 1], _q, 3);
                _q = vsetq_lane_u16(p0[A_hstep * 6], _q, 4);
                _q = vsetq_lane_u16(p0[A_hstep * 6 + 1], _q, 5);
                _q = vsetq_lane_u16(p0[A_hstep * 7], _q, 6);
                _q = vsetq_lane_u16(p0[A_hstep * 7 + 1], _q, 7);
                float32x4_t _p01 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p23 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p45 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p67 = bfloat2float(vget_high_u16(_q));

                // Zipping a vector with itself duplicates every lane
                // (s0,s0,s1,s1 / s2,s2,s3,s3) to match the (k0,k1)-per-row data.
                float32x4x2_t _scale01 = vzipq_f32(_scale0, _scale0);
                float32x4x2_t _scale23 = vzipq_f32(_scale1, _scale1);

                _p01 = vmulq_f32(_p01, _scale01.val[0]);
                _p23 = vmulq_f32(_p23, _scale01.val[1]);
                _p45 = vmulq_f32(_p45, _scale23.val[0]);
                _p67 = vmulq_f32(_p67, _scale23.val[1]);

                int8x8_t _r0 = float2int8(_p01, _p23);
                int8x8_t _r1 = float2int8(_p45, _p67);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 2;
            }
            // Single k-step: gather one value from each of the 8 rows.
            for (; kk < max_kk; kk++)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 5], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 7], _p, 7);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));

                // Lane n already corresponds to row n, so a plain element-wise
                // multiply applies the per-row scales.
                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0++;
            }
        }
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k * elempack;

        float32x4_t _scale = vld1q_f32((const float*)scales + i + ii);

        // Packed source (elempack == 4): every k-step is 4 contiguous values, one
        // per row, so lane n of _scale lines up with element n of each group.
        // NOTE(review): layout remarks below assume float2int8(a, b) returns the
        // 4 values of a in lanes 0-3 and those of b in lanes 4-7 - confirm
        // against its definition.
        if (elempack == 4)
        {
            int kk = 0;
            // 8 k-steps per iteration: 32 values in, 32 int8 out.
            for (; kk + 7 < max_kk; kk += 8)
            {
#if __ARM_FEATURE_DOTPROD
                // vld4q_u16 de-interleaves with stride 4: val[n] holds element n
                // (row n) of the 8 consecutive k-steps.
                uint16x8x4_t _p = vld4q_u16(p0);

                // _p0.._p3: k 0-3 of rows 0..3; _p4.._p7: k 4-7 of rows 0..3.
                float32x4_t _p0 = vmulq_laneq_f32(bfloat2float(vget_low_u16(_p.val[0])), _scale, 0);
                float32x4_t _p1 = vmulq_laneq_f32(bfloat2float(vget_low_u16(_p.val[1])), _scale, 1);
                float32x4_t _p2 = vmulq_laneq_f32(bfloat2float(vget_low_u16(_p.val[2])), _scale, 2);
                float32x4_t _p3 = vmulq_laneq_f32(bfloat2float(vget_low_u16(_p.val[3])), _scale, 3);
                float32x4_t _p4 = vmulq_laneq_f32(bfloat2float(vget_high_u16(_p.val[0])), _scale, 0);
                float32x4_t _p5 = vmulq_laneq_f32(bfloat2float(vget_high_u16(_p.val[1])), _scale, 1);
                float32x4_t _p6 = vmulq_laneq_f32(bfloat2float(vget_high_u16(_p.val[2])), _scale, 2);
                float32x4_t _p7 = vmulq_laneq_f32(bfloat2float(vget_high_u16(_p.val[3])), _scale, 3);

#if __ARM_FEATURE_MATMUL_INT8
                // 8 consecutive k values of one row per 8-byte group.
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                // 4 consecutive k values per row: k 0-3 of all rows, then k 4-7.
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                // Contiguous loads: _pN is k-step N (4 rows), scaled element-wise.
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

                // Even k-steps go to val[0], odd ones to val[1]; vst2q_s8 then
                // interleaves them so each row's (k, k+1) pair becomes adjacent.
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p2), float2int8(_p4, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p3), float2int8(_p5, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += 32;
            }
            // 4 k-steps per iteration: 16 values in, 16 int8 out.
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                // val[n] = row n across the 4 k-steps, scaled by lane n of _scale.
                uint16x4x4_t _p = vld4_u16(p0);

                float32x4_t _p0 = vmulq_laneq_f32(bfloat2float(_p.val[0]), _scale, 0);
                float32x4_t _p1 = vmulq_laneq_f32(bfloat2float(_p.val[1]), _scale, 1);
                float32x4_t _p2 = vmulq_laneq_f32(bfloat2float(_p.val[2]), _scale, 2);
                float32x4_t _p3 = vmulq_laneq_f32(bfloat2float(_p.val[3]), _scale, 3);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                // Same even/odd split as the 8-step loop, on 8-byte vectors.
                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p2);
                _r01.val[1] = float2int8(_p1, _p3);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += 16;
            }
            // 2 k-steps per iteration: 8 values in, 8 int8 out.
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                // Interleave the two k-steps so each row's pair is adjacent.
                float32x4x2_t _p01 = vzipq_f32(_p0, _p1);

                int8x8_t _r01 = float2int8(_p01.val[0], _p01.val[1]);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += 8;
            }
            // Single k-step: 4 values in, 4 int8 out.
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                _p0 = vmulq_f32(_p0, _scale);
                // float2int8 takes two vectors; _p0 is passed twice and only
                // lanes 0-3 of the result are stored.
                int8x8_t _r0 = float2int8(_p0, _p0);

                pp[0] = vget_lane_s8(_r0, 0);
                pp[1] = vget_lane_s8(_r0, 1);
                pp[2] = vget_lane_s8(_r0, 2);
                pp[3] = vget_lane_s8(_r0, 3);

                pp += 4;
                p0 += 4;
            }
        }
        // Unpacked source (elempack == 1): 4 rows, A_hstep elements apart; row n
        // is scaled by lane n of _scale.
        // NOTE(review): layout remarks below assume float2int8(a, b) returns the
        // 4 values of a in lanes 0-3 and those of b in lanes 4-7 - confirm
        // against its definition.
        if (elempack == 1)
        {
            int kk = 0;
            // 8 k-steps per iteration: 8 values from each of 4 rows -> 32 int8.
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + A_hstep);
                uint16x8_t _r = vld1q_u16(p0 + A_hstep * 2);
                uint16x8_t _s = vld1q_u16(p0 + A_hstep * 3);
                // Each row yields two vectors: k 0-3 (even name) and k 4-7 (odd name).
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));

                // vmulq_laneq_f32 exists only on AArch64; the 32-bit path picks the
                // same lane through vget_low_f32 / vget_high_f32 + vmulq_lane_f32.
#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale, 0);
                _p2 = vmulq_laneq_f32(_p2, _scale, 1);
                _p3 = vmulq_laneq_f32(_p3, _scale, 1);
                _p4 = vmulq_laneq_f32(_p4, _scale, 2);
                _p5 = vmulq_laneq_f32(_p5, _scale, 2);
                _p6 = vmulq_laneq_f32(_p6, _scale, 3);
                _p7 = vmulq_laneq_f32(_p7, _scale, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 0);
                _p2 = vmulq_lane_f32(_p2, vget_low_f32(_scale), 1);
                _p3 = vmulq_lane_f32(_p3, vget_low_f32(_scale), 1);
                _p4 = vmulq_lane_f32(_p4, vget_high_f32(_scale), 0);
                _p5 = vmulq_lane_f32(_p5, vget_high_f32(_scale), 0);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale), 1);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale), 1);
#endif

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                // 8 consecutive k values of one row per 8-byte group.
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                // 4 consecutive k values per row: k 0-3 of rows 0..3, then k 4-7.
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p1, _p3);
                int8x8_t _r3 = float2int8(_p5, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                // View adjacent byte pairs as 16-bit lanes and split even/odd lanes
                // with vuzp_s16: output is (k0,k1) of rows 0..3, then (k2,k3),
                // (k4,k5), (k6,k7).
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p4, _p6));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p5, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += 8;
            }
            // 4 k-steps per iteration: 4 values from each of 4 rows -> 16 int8.
            for (; kk + 3 < max_kk; kk += 4)
            {
                // _pN holds k 0-3 of row N.
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + A_hstep));
                float32x4_t _p2 = bfloat2float(vld1_u16(p0 + A_hstep * 2));
                float32x4_t _p3 = bfloat2float(vld1_u16(p0 + A_hstep * 3));

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
#endif

#if __ARM_FEATURE_DOTPROD
                // 4 consecutive k values per row, rows 0..3.
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_DOTPROD
                // Regroup as k-pairs: (k0,k1) of rows 0..3, then (k2,k3).
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 4;
            }
            // 2 k-steps per iteration: gather (k0,k1) of each row lane by lane.
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep + 1], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 2 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 3 + 1], _p, 7);
                float32x4_t _p01 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p23 = bfloat2float(vget_high_u16(_p));

                // Zipping _scale with itself duplicates every lane
                // (s0,s0,s1,s1 / s2,s2,s3,s3) to match the (k0,k1)-per-row data.
                float32x4x2_t _scale01 = vzipq_f32(_scale, _scale);

                _p01 = vmulq_f32(_p01, _scale01.val[0]);
                _p23 = vmulq_f32(_p23, _scale01.val[1]);

                int8x8_t _r0 = float2int8(_p01, _p23);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 2;
            }
            // Single k-step: gather one value from each of the 4 rows.
            for (; kk < max_kk; kk++)
            {
                uint16x4_t _p = uint16x4_t();
                _p = vset_lane_u16(p0[0], _p, 0);
                _p = vset_lane_u16(p0[A_hstep], _p, 1);
                _p = vset_lane_u16(p0[A_hstep * 2], _p, 2);
                _p = vset_lane_u16(p0[A_hstep * 3], _p, 3);
                float32x4_t _p0 = bfloat2float(_p);

                _p0 = vmulq_f32(_p0, _scale);
                // float2int8 takes two vectors; _p0 is passed twice and only
                // lanes 0-3 of the result are stored.
                int8x8_t _r0 = float2int8(_p0, _p0);

                pp[0] = vget_lane_s8(_r0, 0);
                pp[1] = vget_lane_s8(_r0, 1);
                pp[2] = vget_lane_s8(_r0, 2);
                pp[3] = vget_lane_s8(_r0, 3);

                pp += 4;
                p0++;
            }
        }
    }
#endif // __ARM_NEON
    // Remaining rows, two at a time. This loop sits outside the preceding
    // NEON-only section, so it is also compiled for non-NEON builds, where only
    // the scalar one-k-step tail at the bottom runs.
    for (; ii + 1 < max_ii; ii += 2)
    {
        // Source is indexed with "+ k" (no elempack factor); the two rows are
        // A_hstep elements apart.
        const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k;

        // One scalar scale per row.
        const float scale0 = scales[i + ii];
        const float scale1 = scales[i + ii + 1];

        // elempack is not consulted here; the check below is left commented out.
        // if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            // NOTE(review): layout remarks below assume the vector float2int8(a, b)
            // returns the 4 values of a in lanes 0-3 and those of b in lanes 4-7 -
            // confirm against its definition.
            float32x4_t _scale0 = vdupq_n_f32(scale0);
            float32x4_t _scale1 = vdupq_n_f32(scale1);
            // 8 k-steps per iteration: 8 values from each row -> 16 int8.
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + A_hstep);
                // _p0/_p1: k 0-3 / k 4-7 of row 0; _p2/_p3: the same for row 1.
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale0);
                _p2 = vmulq_f32(_p2, _scale1);
                _p3 = vmulq_f32(_p3, _scale1);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                // 8 consecutive k values of row 0, then of row 1.
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_MATMUL_INT8
                // k 0-3 of row 0 and row 1, then k 4-7 of row 0 and row 1.
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p1, _p3);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                // Pair up two k values per row: each _tN is
                // (row0 k, row0 k+1, row1 k, row1 k+1) for k = 0, 2, 4, 6.
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p2));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p2));
                float32x4_t _t2 = vcombine_f32(vget_low_f32(_p1), vget_low_f32(_p3));
                float32x4_t _t3 = vcombine_f32(vget_high_f32(_p1), vget_high_f32(_p3));
                int8x8_t _r0 = float2int8(_t0, _t1);
                int8x8_t _r1 = float2int8(_t2, _t3);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r0);
                vst1_s8(pp + 8, _r1);

                pp += 16;
                p0 += 8;
            }
            // 4 k-steps per iteration: 4 values from each row -> 8 int8.
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + A_hstep));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);

#if __ARM_FEATURE_DOTPROD
                // k 0-3 of row 0, then k 0-3 of row 1.
                int8x8_t _r0 = float2int8(_p0, _p1);
#else  // __ARM_FEATURE_DOTPROD
                // (k0,k1) of row 0 and row 1, then (k2,k3) of row 0 and row 1.
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p1));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p1));
                int8x8_t _r0 = float2int8(_t0, _t1);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 4;
            }
            // 2 k-steps per iteration, scalar: (k0,k1) of row 0, then of row 1.
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = float2int8(bfloat16_to_float32(p0[0]) * scale0);
                pp[1] = float2int8(bfloat16_to_float32(p0[1]) * scale0);
                pp[2] = float2int8(bfloat16_to_float32(p0[A_hstep]) * scale1);
                pp[3] = float2int8(bfloat16_to_float32(p0[A_hstep + 1]) * scale1);
                pp += 4;
                p0 += 2;
            }
#endif // __ARM_NEON
            // Single k-step: one value per row. Without NEON this is the only
            // K loop, so it then handles every k.
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(bfloat16_to_float32(p0[0]) * scale0);
                pp[1] = float2int8(bfloat16_to_float32(p0[A_hstep]) * scale1);
                pp += 2;
                p0++;
            }
        }
    }
    for (; ii < max_ii; ii += 1)
    {
        const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k;

        const float scale = scales[i + ii];

        // if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            float32x4_t _scale = vdupq_n_f32(scale);
            for (; kk + 15 < max_kk; kk += 16)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 8;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(bfloat16_to_float32(p0[0]) * scale);
                pp += 1;
                p0++;
            }
        }
    }
}

// Compute per-column dynamic int8 quantization parameters for a bf16 matrix A
// that is consumed in transposed form.
//
// For every column c in [i, i + max_ii) the routine scans K steps down the
// matrix (stepping A_hstep * elempack bf16 elements per step), tracks the
// largest absolute value seen, and writes
//     scales[c]       = 127 / absmax
//     out_descales[c] = absmax / (127 * B_scale)
//
// Only elempack 4 (NEON builds) and elempack 1 are handled; any other
// elempack leaves both outputs untouched.
//
// NOTE(review): a column whose absmax is 0 yields 127.f / 0.f for its scale;
// no guard exists here, so callers presumably tolerate that - confirm.
static void transpose_compute_A_tile_bf16_int8_scales(const Mat& A, Mat& scales, float B_scale, Mat& out_descales, int i, int max_ii)
{
    const int elempack = A.elempack;
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;
    const int K = A.dims == 3 ? A.c : A.h;

    // NCNN_LOGE("transpose_compute_A_tile_bf16_int8_scales %d %d", max_ii, elempack);

    const float v127_B_scale = 127.f * B_scale;

#if __ARM_NEON && __aarch64__
    // vector divide is only used on aarch64, so the splatted constants live there too
    const float32x4_t _v127 = vdupq_n_f32(127.f);
    const float32x4_t _v127_B_scale = vdupq_n_f32(v127_B_scale);
#endif

    // outputs are addressed as base[ii] rather than through running pointers
    float* const scale_out = (float*)scales + i;
    float* const descale_out = (float*)out_descales + i;

#if __ARM_NEON
    if (elempack == 4)
    {
        // distance in bf16 elements between two consecutive packed rows
        const size_t row_stride = A_hstep * 4;

        int ii = 0;

        // four columns at once: each accumulator holds the 4 packed lanes of one column
        while (ii + 4 <= max_ii)
        {
            const unsigned short* src = (const unsigned short*)A + (i + ii) * 4;

            float32x4_t _m0 = vdupq_n_f32(0.f);
            float32x4_t _m1 = vdupq_n_f32(0.f);
            float32x4_t _m2 = vdupq_n_f32(0.f);
            float32x4_t _m3 = vdupq_n_f32(0.f);
            for (int kk = 0; kk < K; kk++)
            {
                const uint16x8_t _c01 = vld1q_u16(src);
                const uint16x8_t _c23 = vld1q_u16(src + 8);
                _m0 = vmaxq_f32(_m0, vabsq_f32(bfloat2float(vget_low_u16(_c01))));
                _m1 = vmaxq_f32(_m1, vabsq_f32(bfloat2float(vget_high_u16(_c01))));
                _m2 = vmaxq_f32(_m2, vabsq_f32(bfloat2float(vget_low_u16(_c23))));
                _m3 = vmaxq_f32(_m3, vabsq_f32(bfloat2float(vget_high_u16(_c23))));
                src += row_stride;
            }

            // horizontal max of every accumulator, gathered into one vector (lane n = column n)
            const float32x2_t _h0 = vmax_f32(vget_low_f32(_m0), vget_high_f32(_m0));
            const float32x2_t _h1 = vmax_f32(vget_low_f32(_m1), vget_high_f32(_m1));
            const float32x2_t _h2 = vmax_f32(vget_low_f32(_m2), vget_high_f32(_m2));
            const float32x2_t _h3 = vmax_f32(vget_low_f32(_m3), vget_high_f32(_m3));
            const float32x4_t _absmax = vcombine_f32(vpmax_f32(_h0, _h1), vpmax_f32(_h2, _h3));

#if __aarch64__
            vst1q_f32(scale_out + ii, vdivq_f32(_v127, _absmax));
            vst1q_f32(descale_out + ii, vdivq_f32(_absmax, _v127_B_scale));
#else
            // no vector divide used on 32-bit arm: spill the lanes and divide in scalar code
            float lanes[4];
            vst1q_f32(lanes, _absmax);

            float* const s4 = scale_out + ii;
            float* const d4 = descale_out + ii;
            for (int j = 0; j < 4; j++)
            {
                s4[j] = 127.f / lanes[j];
            }
            for (int j = 0; j < 4; j++)
            {
                d4[j] = lanes[j] / v127_B_scale;
            }
#endif

            ii += 4;
        }

        // leftover columns, one at a time, with the K walk unrolled by 4 / 2 / 1
        for (; ii < max_ii; ii++)
        {
            const unsigned short* src = (const unsigned short*)A + (i + ii) * 4;

            float32x4_t _m0 = vdupq_n_f32(0.f);
            float32x4_t _m1 = vdupq_n_f32(0.f);
            float32x4_t _m2 = vdupq_n_f32(0.f);
            float32x4_t _m3 = vdupq_n_f32(0.f);

            int kk = 0;
            while (kk + 4 <= K)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(bfloat2float(vld1_u16(src))));
                _m1 = vmaxq_f32(_m1, vabsq_f32(bfloat2float(vld1_u16(src + row_stride))));
                _m2 = vmaxq_f32(_m2, vabsq_f32(bfloat2float(vld1_u16(src + row_stride * 2))));
                _m3 = vmaxq_f32(_m3, vabsq_f32(bfloat2float(vld1_u16(src + row_stride * 3))));
                src += row_stride * 4;
                kk += 4;
            }
            _m0 = vmaxq_f32(_m0, _m2);
            _m1 = vmaxq_f32(_m1, _m3);
            while (kk + 2 <= K)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(bfloat2float(vld1_u16(src))));
                _m1 = vmaxq_f32(_m1, vabsq_f32(bfloat2float(vld1_u16(src + row_stride))));
                src += row_stride * 2;
                kk += 2;
            }
            _m0 = vmaxq_f32(_m0, _m1);
            while (kk < K)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(bfloat2float(vld1_u16(src))));
                src += row_stride;
                kk++;
            }

            const float32x2_t _half = vmax_f32(vget_low_f32(_m0), vget_high_f32(_m0));
            const float absmax = std::max(vget_lane_f32(_half, 0), vget_lane_f32(_half, 1));

            scale_out[ii] = 127.f / absmax;
            descale_out[ii] = absmax / v127_B_scale;
        }
    }
#endif // __ARM_NEON
    if (elempack == 1)
    {
        int ii = 0;
#if __ARM_NEON
        // four adjacent columns per iteration: one 4-lane load covers them, lane n = column n
        while (ii + 4 <= max_ii)
        {
            const unsigned short* src = (const unsigned short*)A + (i + ii);

            float32x4_t _m0 = vdupq_n_f32(0.f);
            float32x4_t _m1 = vdupq_n_f32(0.f);
            float32x4_t _m2 = vdupq_n_f32(0.f);
            float32x4_t _m3 = vdupq_n_f32(0.f);

            int kk = 0;
            while (kk + 4 <= K)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(bfloat2float(vld1_u16(src))));
                _m1 = vmaxq_f32(_m1, vabsq_f32(bfloat2float(vld1_u16(src + A_hstep))));
                _m2 = vmaxq_f32(_m2, vabsq_f32(bfloat2float(vld1_u16(src + A_hstep * 2))));
                _m3 = vmaxq_f32(_m3, vabsq_f32(bfloat2float(vld1_u16(src + A_hstep * 3))));
                src += A_hstep * 4;
                kk += 4;
            }
            _m0 = vmaxq_f32(_m0, _m2);
            _m1 = vmaxq_f32(_m1, _m3);
            while (kk + 2 <= K)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(bfloat2float(vld1_u16(src))));
                _m1 = vmaxq_f32(_m1, vabsq_f32(bfloat2float(vld1_u16(src + A_hstep))));
                src += A_hstep * 2;
                kk += 2;
            }
            _m0 = vmaxq_f32(_m0, _m1);
            while (kk < K)
            {
                _m0 = vmaxq_f32(_m0, vabsq_f32(bfloat2float(vld1_u16(src))));
                src += A_hstep;
                kk++;
            }

            // _m0 already holds one absmax per column, no horizontal reduction needed
#if __aarch64__
            vst1q_f32(scale_out + ii, vdivq_f32(_v127, _m0));
            vst1q_f32(descale_out + ii, vdivq_f32(_m0, _v127_B_scale));
#else
            // no vector divide used on 32-bit arm: spill the lanes and divide in scalar code
            float lanes[4];
            vst1q_f32(lanes, _m0);

            float* const s4 = scale_out + ii;
            float* const d4 = descale_out + ii;
            for (int j = 0; j < 4; j++)
            {
                s4[j] = 127.f / lanes[j];
            }
            for (int j = 0; j < 4; j++)
            {
                d4[j] = lanes[j] / v127_B_scale;
            }
#endif

            ii += 4;
        }
#endif // __ARM_NEON
        // scalar path: remaining columns, or every column when NEON is unavailable
        for (; ii < max_ii; ii++)
        {
            const unsigned short* src = (const unsigned short*)A + (i + ii);

            float absmax = 0.f;
            for (int kk = 0; kk < K; kk++)
            {
                const float magnitude = (float)fabsf(bfloat16_to_float32(src[0]));
                absmax = std::max(absmax, magnitude);
                src += A_hstep;
            }

            scale_out[ii] = 127.f / absmax;
            descale_out[ii] = absmax / v127_B_scale;
        }
    }
}

static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        transpose_pack_A_tile_bf16_to_int8_i8mm(A, AT, i, max_ii, k, max_kk, scales);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        transpose_pack_A_tile_bf16_to_int8_asimddp(A, AT, i, max_ii, k, max_kk, scales);
        return;
    }
#endif

    const int elempack = A.elempack;
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;

    // NCNN_LOGE("transpose_pack_A_tile_bf16_to_int8 %d %d", max_ii, elempack);

    signed char* pp = AT;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack;

        float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii);
        float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4);

        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                uint16x8_t _t = vld1q_u16(p0 + A_hstep * 4);
                uint16x8_t _u = vld1q_u16(p0 + A_hstep * 4 + 8);
                uint16x8_t _v = vld1q_u16(p0 + A_hstep * 4 + 16);
                uint16x8_t _w = vld1q_u16(p0 + A_hstep * 4 + 24);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));
                float32x4_t _p8 = bfloat2float(vget_low_u16(_t));
                float32x4_t _p9 = bfloat2float(vget_high_u16(_t));
                float32x4_t _pa = bfloat2float(vget_low_u16(_u));
                float32x4_t _pb = bfloat2float(vget_high_u16(_u));
                float32x4_t _pc = bfloat2float(vget_low_u16(_v));
                float32x4_t _pd = bfloat2float(vget_high_u16(_v));
                float32x4_t _pe = bfloat2float(vget_low_u16(_w));
                float32x4_t _pf = bfloat2float(vget_high_u16(_w));

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale0, 3);
                _p4 = vmulq_laneq_f32(_p4, _scale1, 0);
                _p5 = vmulq_laneq_f32(_p5, _scale1, 1);
                _p6 = vmulq_laneq_f32(_p6, _scale1, 2);
                _p7 = vmulq_laneq_f32(_p7, _scale1, 3);
                _p8 = vmulq_laneq_f32(_p8, _scale0, 0);
                _p9 = vmulq_laneq_f32(_p9, _scale0, 1);
                _pa = vmulq_laneq_f32(_pa, _scale0, 2);
                _pb = vmulq_laneq_f32(_pb, _scale0, 3);
                _pc = vmulq_laneq_f32(_pc, _scale1, 0);
                _pd = vmulq_laneq_f32(_pd, _scale1, 1);
                _pe = vmulq_laneq_f32(_pe, _scale1, 2);
                _pf = vmulq_laneq_f32(_pf, _scale1, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
                _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
                _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
                _p8 = vmulq_lane_f32(_p8, vget_low_f32(_scale0), 0);
                _p9 = vmulq_lane_f32(_p9, vget_low_f32(_scale0), 1);
                _pa = vmulq_lane_f32(_pa, vget_high_f32(_scale0), 0);
                _pb = vmulq_lane_f32(_pb, vget_high_f32(_scale0), 1);
                _pc = vmulq_lane_f32(_pc, vget_low_f32(_scale1), 0);
                _pd = vmulq_lane_f32(_pd, vget_low_f32(_scale1), 1);
                _pe = vmulq_lane_f32(_pe, vget_high_f32(_scale1), 0);
                _pf = vmulq_lane_f32(_pf, vget_high_f32(_scale1), 1);
#endif

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p8);
                int8x8_t _r1 = float2int8(_p1, _p9);
                int8x8_t _r2 = float2int8(_p2, _pa);
                int8x8_t _r3 = float2int8(_p3, _pb);
                int8x8_t _r4 = float2int8(_p4, _pc);
                int8x8_t _r5 = float2int8(_p5, _pd);
                int8x8_t _r6 = float2int8(_p6, _pe);
                int8x8_t _r7 = float2int8(_p7, _pf);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

                int16x8_t _r01 = vreinterpretq_s16_s8(vcombine_s8(_r0, _r1));
                int16x8_t _r23 = vreinterpretq_s16_s8(vcombine_s8(_r2, _r3));
                int16x8_t _r45 = vreinterpretq_s16_s8(vcombine_s8(_r4, _r5));
                int16x8_t _r67 = vreinterpretq_s16_s8(vcombine_s8(_r6, _r7));
                int16x8x2_t _rr0 = vuzpq_s16(_r01, _r23);
                int16x8x2_t _rr1 = vuzpq_s16(_r45, _r67);

                vst1q_s8(pp, vreinterpretq_s8_s16(_rr0.val[0]));
                vst1q_s8(pp + 16, vreinterpretq_s8_s16(_rr0.val[1]));
                vst1q_s8(pp + 32, vreinterpretq_s8_s16(_rr1.val[0]));
                vst1q_s8(pp + 48, vreinterpretq_s8_s16(_rr1.val[1]));
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale0, 3);
                _p4 = vmulq_laneq_f32(_p4, _scale1, 0);
                _p5 = vmulq_laneq_f32(_p5, _scale1, 1);
                _p6 = vmulq_laneq_f32(_p6, _scale1, 2);
                _p7 = vmulq_laneq_f32(_p7, _scale1, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
                _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
                _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
#endif

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);

#if __ARM_FEATURE_DOTPROD
                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                int16x8_t _r01 = vreinterpretq_s16_s8(vcombine_s8(_r0, _r1));
                int16x8_t _r23 = vreinterpretq_s16_s8(vcombine_s8(_r2, _r3));
                int16x8x2_t _rr = vuzpq_s16(_r01, _r23);

                vst1q_s8(pp, vreinterpretq_s8_s16(_rr.val[0]));
                vst1q_s8(pp + 16, vreinterpretq_s8_s16(_rr.val[1]));
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += A_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + A_hstep);
                uint16x8_t _r = vld1q_u16(p0 + A_hstep * 2);
                uint16x8_t _s = vld1q_u16(p0 + A_hstep * 3);
                uint16x8_t _t = vld1q_u16(p0 + A_hstep * 4);
                uint16x8_t _u = vld1q_u16(p0 + A_hstep * 5);
                uint16x8_t _v = vld1q_u16(p0 + A_hstep * 6);
                uint16x8_t _w = vld1q_u16(p0 + A_hstep * 7);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));
                float32x4_t _p8 = bfloat2float(vget_low_u16(_t));
                float32x4_t _p9 = bfloat2float(vget_high_u16(_t));
                float32x4_t _pa = bfloat2float(vget_low_u16(_u));
                float32x4_t _pb = bfloat2float(vget_high_u16(_u));
                float32x4_t _pc = bfloat2float(vget_low_u16(_v));
                float32x4_t _pd = bfloat2float(vget_high_u16(_v));
                float32x4_t _pe = bfloat2float(vget_low_u16(_w));
                float32x4_t _pf = bfloat2float(vget_high_u16(_w));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale1);
                _p4 = vmulq_f32(_p4, _scale0);
                _p5 = vmulq_f32(_p5, _scale1);
                _p6 = vmulq_f32(_p6, _scale0);
                _p7 = vmulq_f32(_p7, _scale1);
                _p8 = vmulq_f32(_p8, _scale0);
                _p9 = vmulq_f32(_p9, _scale1);
                _pa = vmulq_f32(_pa, _scale0);
                _pb = vmulq_f32(_pb, _scale1);
                _pc = vmulq_f32(_pc, _scale0);
                _pd = vmulq_f32(_pd, _scale1);
                _pe = vmulq_f32(_pe, _scale0);
                _pf = vmulq_f32(_pf, _scale1);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _r04 = vzip_s8(_r0, _r4);
                int8x8x2_t _r15 = vzip_s8(_r1, _r5);
                int8x8x2_t _r26 = vzip_s8(_r2, _r6);
                int8x8x2_t _r37 = vzip_s8(_r3, _r7);
                int8x16x4_t _r0123;
                _r0123.val[0] = vcombine_s8(_r04.val[0], _r04.val[1]);
                _r0123.val[1] = vcombine_s8(_r15.val[0], _r15.val[1]);
                _r0123.val[2] = vcombine_s8(_r26.val[0], _r26.val[1]);
                _r0123.val[3] = vcombine_s8(_r37.val[0], _r37.val[1]);

                vst4q_s8(pp, _r0123);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8x4_t _r0123;
                _r0123.val[0] = _r0;
                _r0123.val[1] = _r1;
                _r0123.val[2] = _r2;
                _r0123.val[3] = _r3;
                int8x8x4_t _r4567;
                _r4567.val[0] = _r4;
                _r4567.val[1] = _r5;
                _r4567.val[2] = _r6;
                _r4567.val[3] = _r7;

                vst4_s8(pp, _r0123);
                vst4_s8(pp + 32, _r4567);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(_r0, _r2);
                _r01.val[1] = vcombine_s8(_r1, _r3);
                int8x16x2_t _r23;
                _r23.val[0] = vcombine_s8(_r4, _r6);
                _r23.val[1] = vcombine_s8(_r5, _r7);

                vst2q_s8(pp, _r01);
                vst2q_s8(pp + 32, _r23);
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + A_hstep);
                uint16x8_t _r = vld1q_u16(p0 + A_hstep * 2);
                uint16x8_t _s = vld1q_u16(p0 + A_hstep * 3);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale1);
                _p4 = vmulq_f32(_p4, _scale0);
                _p5 = vmulq_f32(_p5, _scale1);
                _p6 = vmulq_f32(_p6, _scale0);
                _p7 = vmulq_f32(_p7, _scale1);

#if __ARM_FEATURE_DOTPROD
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p0, _p1);
                _r0123.val[1] = float2int8(_p2, _p3);
                _r0123.val[2] = float2int8(_p4, _p5);
                _r0123.val[3] = float2int8(_p6, _p7);

                vst4_s8(pp, _r0123);
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p1), float2int8(_p4, _p5));
                _r01.val[1] = vcombine_s8(float2int8(_p2, _p3), float2int8(_p6, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += A_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + A_hstep);

                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale1);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p1);
                _r01.val[1] = float2int8(_p2, _p3);

                vst2_s8(pp, _r01);

                pp += 16;
                p0 += A_hstep * 2;
            }
            for (; kk < max_kk; kk++)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep;
            }
        }
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack;

        float32x4_t _scale = vld1q_f32((const float*)scales + i + ii);

        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + A_hstep * 4);
                uint16x8_t _s = vld1q_u16(p0 + A_hstep * 4 + 8);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale, 3);
                _p4 = vmulq_laneq_f32(_p4, _scale, 0);
                _p5 = vmulq_laneq_f32(_p5, _scale, 1);
                _p6 = vmulq_laneq_f32(_p6, _scale, 2);
                _p7 = vmulq_laneq_f32(_p7, _scale, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
                _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale), 0);
                _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale), 1);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale), 0);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale), 1);
#endif

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p4, _p5));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p6, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
#endif

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += A_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + A_hstep));
                float32x4_t _p2 = bfloat2float(vld1_u16(p0 + A_hstep * 2));
                float32x4_t _p3 = bfloat2float(vld1_u16(p0 + A_hstep * 3));
                float32x4_t _p4 = bfloat2float(vld1_u16(p0 + A_hstep * 4));
                float32x4_t _p5 = bfloat2float(vld1_u16(p0 + A_hstep * 5));
                float32x4_t _p6 = bfloat2float(vld1_u16(p0 + A_hstep * 6));
                float32x4_t _p7 = bfloat2float(vld1_u16(p0 + A_hstep * 7));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                float32x4x2_t _p04 = vzipq_f32(_p0, _p4);
                float32x4x2_t _p15 = vzipq_f32(_p1, _p5);
                float32x4x2_t _p26 = vzipq_f32(_p2, _p6);
                float32x4x2_t _p37 = vzipq_f32(_p3, _p7);
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p04.val[0], _p04.val[1]);
                _r0123.val[1] = float2int8(_p15.val[0], _p15.val[1]);
                _r0123.val[2] = float2int8(_p26.val[0], _p26.val[1]);
                _r0123.val[3] = float2int8(_p37.val[0], _p37.val[1]);

                vst4_s8(pp, _r0123);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p0, _p4);
                _r0123.val[1] = float2int8(_p1, _p5);
                _r0123.val[2] = float2int8(_p2, _p6);
                _r0123.val[3] = float2int8(_p3, _p7);

                vst4_s8(pp, _r0123);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p2), float2int8(_p4, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p3), float2int8(_p5, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + A_hstep));
                float32x4_t _p2 = bfloat2float(vld1_u16(p0 + A_hstep * 2));
                float32x4_t _p3 = bfloat2float(vld1_u16(p0 + A_hstep * 3));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
                transpose4x4_ps(_p0, _p1, _p2, _p3);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));
#else  // __ARM_FEATURE_DOTPROD
                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p2);
                _r01.val[1] = float2int8(_p1, _p3);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += A_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + A_hstep));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                float32x4x2_t _p01 = vzipq_f32(_p0, _p1);

                int8x8_t _r01 = float2int8(_p01.val[0], _p01.val[1]);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep * 2;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                _p0 = vmulq_f32(_p0, _scale);
                int8x8_t _r0 = float2int8(_p0, _p0);

                pp[0] = vget_lane_s8(_r0, 0);
                pp[1] = vget_lane_s8(_r0, 1);
                pp[2] = vget_lane_s8(_r0, 2);
                pp[3] = vget_lane_s8(_r0, 3);
                pp += 4;
                p0 += A_hstep;
            }
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack;

        const float scale0 = scales[i + ii];
        const float scale1 = scales[i + ii + 1];

#if __ARM_NEON
        float32x4_t _scale0 = vdupq_n_f32(scale0);
        float32x4_t _scale1 = vdupq_n_f32(scale1);
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + A_hstep * 4);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale1);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p1, _p3);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4x2_t _t01 = vzip_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r01 = float2int8(_p0, _p1);
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p1));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p1));
                int8x8_t _r01 = float2int8(_t0, _t1);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            float32x4_t _scale = vzipq_f32(_scale0, _scale1).val[0];
            for (; kk + 7 < max_kk; kk += 8)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep + 1], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 2 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 3 + 1], _p, 7);
                uint16x8_t _q = uint16x8_t();
                _q = vsetq_lane_u16(p0[A_hstep * 4], _q, 0);
                _q = vsetq_lane_u16(p0[A_hstep * 4 + 1], _q, 1);
                _q = vsetq_lane_u16(p0[A_hstep * 5], _q, 2);
                _q = vsetq_lane_u16(p0[A_hstep * 5 + 1], _q, 3);
                _q = vsetq_lane_u16(p0[A_hstep * 6], _q, 4);
                _q = vsetq_lane_u16(p0[A_hstep * 6 + 1], _q, 5);
                _q = vsetq_lane_u16(p0[A_hstep * 7], _q, 6);
                _q = vsetq_lane_u16(p0[A_hstep * 7 + 1], _q, 7);
                float32x4_t _p01 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p23 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p45 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p67 = bfloat2float(vget_high_u16(_q));

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);
                _p45 = vmulq_f32(_p45, _scale);
                _p67 = vmulq_f32(_p67, _scale);

                int8x8_t _r0 = float2int8(_p01, _p23);
                int8x8_t _r1 = float2int8(_p45, _p67);

#if __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _r01 = vuzp_s8(_r0, _r1);

                vst1q_s8(pp, vcombine_s8(_r01.val[0], _r01.val[1]));
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _r01 = vtrn_s8(_r0, _r1);
                int8x8x2_t _rr01 = vuzp_s8(_r01.val[0], _r01.val[1]);

                vst1q_s8(pp, vcombine_s8(_rr01.val[0], _rr01.val[1]));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep * 2 + 1], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 4 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 6 + 1], _p, 7);
                uint16x8_t _q = uint16x8_t();
                _q = vsetq_lane_u16(p0[A_hstep], _q, 0);
                _q = vsetq_lane_u16(p0[A_hstep + 1], _q, 1);
                _q = vsetq_lane_u16(p0[A_hstep * 3], _q, 2);
                _q = vsetq_lane_u16(p0[A_hstep * 3 + 1], _q, 3);
                _q = vsetq_lane_u16(p0[A_hstep * 5], _q, 4);
                _q = vsetq_lane_u16(p0[A_hstep * 5 + 1], _q, 5);
                _q = vsetq_lane_u16(p0[A_hstep * 7], _q, 6);
                _q = vsetq_lane_u16(p0[A_hstep * 7 + 1], _q, 7);
                float32x4_t _p02 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p46 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p13 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p57 = bfloat2float(vget_high_u16(_q));

                _p02 = vmulq_f32(_p02, _scale);
                _p46 = vmulq_f32(_p46, _scale);
                _p13 = vmulq_f32(_p13, _scale);
                _p57 = vmulq_f32(_p57, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p02, _p46);
                _r01.val[1] = float2int8(_p13, _p57);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep + 1], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 2 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 3 + 1], _p, 7);
                float32x4_t _p01 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p23 = bfloat2float(vget_high_u16(_p));

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);

                float32x4x2_t _pp = vuzpq_f32(_p01, _p23);
                int8x8_t _r01 = float2int8(_pp.val[0], _pp.val[1]);
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep * 2 + 1], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep + 1], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 3 + 1], _p, 7);
                float32x4_t _p02 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p13 = bfloat2float(vget_high_u16(_p));

                _p02 = vmulq_f32(_p02, _scale);
                _p13 = vmulq_f32(_p13, _scale);

                float32x4x2_t _pp = vzipq_f32(_p02, _p13);
                int8x8_t _r01 = float2int8(_pp.val[0], _pp.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = float2int8(bfloat16_to_float32(p0[0]) * scale0);
                pp[1] = float2int8(bfloat16_to_float32(p0[A_hstep + 0]) * scale0);
                pp[2] = float2int8(bfloat16_to_float32(p0[1]) * scale1);
                pp[3] = float2int8(bfloat16_to_float32(p0[A_hstep + 1]) * scale1);
                pp += 4;
                p0 += A_hstep * 2;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(bfloat16_to_float32(p0[0]) * scale0);
                pp[1] = float2int8(bfloat16_to_float32(p0[1]) * scale1);
                pp += 2;
                p0 += A_hstep;
            }
        }
    }
    for (; ii < max_ii; ii += 1)
    {
        const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack;

        const float scale = scales[i + ii];

#if __ARM_NEON
        float32x4_t _scale = vdupq_n_f32(scale);
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 15 < max_kk; kk += 16)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + A_hstep * 4));
                float32x4_t _p2 = bfloat2float(vld1_u16(p0 + A_hstep * 8));
                float32x4_t _p3 = bfloat2float(vld1_u16(p0 + A_hstep * 12));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));

                pp += 16;
                p0 += A_hstep * 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + A_hstep * 4));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                pp[0] = float2int8(bfloat16_to_float32(p0[0]) * scale);
                pp[1] = float2int8(bfloat16_to_float32(p0[1]) * scale);
                pp[2] = float2int8(bfloat16_to_float32(p0[2]) * scale);
                pp[3] = float2int8(bfloat16_to_float32(p0[3]) * scale);
                pp += 4;
                p0 += A_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            for (; kk + 15 < max_kk; kk += 16)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 5], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 7], _p, 7);
                uint16x8_t _q = uint16x8_t();
                _q = vsetq_lane_u16(p0[A_hstep * 8], _q, 0);
                _q = vsetq_lane_u16(p0[A_hstep * 9], _q, 1);
                _q = vsetq_lane_u16(p0[A_hstep * 10], _q, 2);
                _q = vsetq_lane_u16(p0[A_hstep * 11], _q, 3);
                _q = vsetq_lane_u16(p0[A_hstep * 12], _q, 4);
                _q = vsetq_lane_u16(p0[A_hstep * 13], _q, 5);
                _q = vsetq_lane_u16(p0[A_hstep * 14], _q, 6);
                _q = vsetq_lane_u16(p0[A_hstep * 15], _q, 7);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));

                pp += 16;
                p0 += A_hstep * 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 5], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 7], _p, 7);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep * 8;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(bfloat16_to_float32(p0[0]) * scale);
                pp += 1;
                p0 += A_hstep;
            }
        }
    }
}

// Derive one int8 quantization scale for the whole bf16 matrix B.
//
// Every row of B (every channel when B.dims == 3) is scanned for its largest
// absolute value; the value written to `scale` is 127 / absmax, or 1 when the
// scan finds no non-zero magnitude.
static void compute_B_bf16_int8_scale(const Mat& B, float& scale)
{
    const bool is_3d = B.dims == 3;
    const int outer = is_3d ? B.c : B.h;
    const size_t B_hstep = is_3d ? B.cstep : (size_t)B.w;
    const int size = B.w * B.elempack;

    // scalar running maximum, fed by the leftover elements of each row
    float absmax = 0.f;
#if __ARM_NEON
    // vector running maximum, folded into absmax once the scan is finished
    float32x4_t _absmax = vdupq_n_f32(0.f);
#endif

    for (int i = 0; i < outer; i++)
    {
        const unsigned short* ptr = (const unsigned short*)B + i * B_hstep * B.elempack;

        int remain = size;
#if __ARM_NEON
        // eight bf16 values per step, widened to two float vectors
        while (remain >= 8)
        {
            const uint16x8_t _v = vld1q_u16(ptr);
            const float32x4_t _lo = vabsq_f32(bfloat2float(vget_low_u16(_v)));
            const float32x4_t _hi = vabsq_f32(bfloat2float(vget_high_u16(_v)));
            _absmax = vmaxq_f32(_absmax, _lo);
            _absmax = vmaxq_f32(_absmax, _hi);
            ptr += 8;
            remain -= 8;
        }
        // fewer than eight values are left here, so at most one group of four
        if (remain >= 4)
        {
            const float32x4_t _v = vabsq_f32(bfloat2float(vld1_u16(ptr)));
            _absmax = vmaxq_f32(_absmax, _v);
            ptr += 4;
            remain -= 4;
        }
#endif
        // scalar tail (the whole row when NEON is unavailable)
        for (; remain > 0; remain--)
        {
            const float v = (float)fabsf(bfloat16_to_float32(*ptr++));
            if (absmax < v)
                absmax = v;
        }
    }

#if __ARM_NEON
    // horizontal reduction: 4 lanes -> 2 lanes -> scalar, then merge with the tail maximum
    const float32x2_t _half = vmax_f32(vget_low_f32(_absmax), vget_high_f32(_absmax));
    const float vecmax = std::max(vget_lane_f32(_half, 0), vget_lane_f32(_half, 1));
    absmax = std::max(absmax, vecmax);
#endif

    // avoid dividing by zero for an all-zero matrix
    if (absmax == 0.f)
        scale = 1.f;
    else
        scale = 127.f / absmax;
}

static void pack_B_tile_bf16_to_int8(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        pack_B_tile_bf16_to_int8_i8mm(B, BT, j, max_jj, k, max_kk, scale);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        pack_B_tile_bf16_to_int8_asimddp(B, BT, j, max_jj, k, max_kk, scale);
        return;
    }
#endif

    const int elempack = B.elempack;
    const size_t B_hstep = B.dims == 3 ? B.cstep : (size_t)B.w;

    // NCNN_LOGE("pack_B_tile_bf16_to_int8 %d %d", max_jj, elempack);

    signed char* pp = BT;

#if __ARM_NEON
    float32x4_t _scale = vdupq_n_f32(scale);
#endif

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    for (; jj + 7 < max_jj; jj += 8)
    {
        const unsigned short* p0 = (const unsigned short*)B + (j + jj) * B_hstep + k * elempack;

        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x8x4_t _p = vld4q_u16(p0);
                uint16x8x4_t _q = vld4q_u16(p0 + B_hstep * 4);

                float32x4_t _p0 = vmulq_f32(bfloat2float(vget_low_u16(_p.val[0])), _scale);
                float32x4_t _p1 = vmulq_f32(bfloat2float(vget_low_u16(_p.val[1])), _scale);
                float32x4_t _p2 = vmulq_f32(bfloat2float(vget_low_u16(_p.val[2])), _scale);
                float32x4_t _p3 = vmulq_f32(bfloat2float(vget_low_u16(_p.val[3])), _scale);
                float32x4_t _p4 = vmulq_f32(bfloat2float(vget_high_u16(_p.val[0])), _scale);
                float32x4_t _p5 = vmulq_f32(bfloat2float(vget_high_u16(_p.val[1])), _scale);
                float32x4_t _p6 = vmulq_f32(bfloat2float(vget_high_u16(_p.val[2])), _scale);
                float32x4_t _p7 = vmulq_f32(bfloat2float(vget_high_u16(_p.val[3])), _scale);
                float32x4_t _p8 = vmulq_f32(bfloat2float(vget_low_u16(_q.val[0])), _scale);
                float32x4_t _p9 = vmulq_f32(bfloat2float(vget_low_u16(_q.val[1])), _scale);
                float32x4_t _pa = vmulq_f32(bfloat2float(vget_low_u16(_q.val[2])), _scale);
                float32x4_t _pb = vmulq_f32(bfloat2float(vget_low_u16(_q.val[3])), _scale);
                float32x4_t _pc = vmulq_f32(bfloat2float(vget_high_u16(_q.val[0])), _scale);
                float32x4_t _pd = vmulq_f32(bfloat2float(vget_high_u16(_q.val[1])), _scale);
                float32x4_t _pe = vmulq_f32(bfloat2float(vget_high_u16(_q.val[2])), _scale);
                float32x4_t _pf = vmulq_f32(bfloat2float(vget_high_u16(_q.val[3])), _scale);

#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
                int8x8_t _r4 = float2int8(_p8, _pc);
                int8x8_t _r5 = float2int8(_p9, _pd);
                int8x8_t _r6 = float2int8(_pa, _pe);
                int8x8_t _r7 = float2int8(_pb, _pf);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p8, _p9);
                int8x8_t _r3 = float2int8(_pa, _pb);
                int8x8_t _r4 = float2int8(_p4, _p5);
                int8x8_t _r5 = float2int8(_p6, _p7);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);
#endif // __ARM_FEATURE_MATMUL_INT8

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                uint16x8_t _t = vld1q_u16(p0 + B_hstep * 4);
                uint16x8_t _u = vld1q_u16(p0 + B_hstep * 4 + 8);
                uint16x8_t _v = vld1q_u16(p0 + B_hstep * 4 + 16);
                uint16x8_t _w = vld1q_u16(p0 + B_hstep * 4 + 24);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));
                float32x4_t _p8 = bfloat2float(vget_low_u16(_t));
                float32x4_t _p9 = bfloat2float(vget_high_u16(_t));
                float32x4_t _pa = bfloat2float(vget_low_u16(_u));
                float32x4_t _pb = bfloat2float(vget_high_u16(_u));
                float32x4_t _pc = bfloat2float(vget_low_u16(_v));
                float32x4_t _pd = bfloat2float(vget_high_u16(_v));
                float32x4_t _pe = bfloat2float(vget_low_u16(_w));
                float32x4_t _pf = bfloat2float(vget_high_u16(_w));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);
                _p8 = vmulq_f32(_p8, _scale);
                _p9 = vmulq_f32(_p9, _scale);
                _pa = vmulq_f32(_pa, _scale);
                _pb = vmulq_f32(_pb, _scale);
                _pc = vmulq_f32(_pc, _scale);
                _pd = vmulq_f32(_pd, _scale);
                _pe = vmulq_f32(_pe, _scale);
                _pf = vmulq_f32(_pf, _scale);

                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p8), float2int8(_p2, _pa));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p9), float2int8(_p3, _pb));
                int8x16x2_t _r23;
                _r23.val[0] = vcombine_s8(float2int8(_p4, _pc), float2int8(_p6, _pe));
                _r23.val[1] = vcombine_s8(float2int8(_p5, _pd), float2int8(_p7, _pf));

                vst2q_s8(pp, _r01);
                vst2q_s8(pp + 32, _r23);
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += 32;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x4x4_t _p = vld4_u16(p0);
                uint16x4x4_t _q = vld4_u16(p0 + B_hstep * 4);

                float32x4_t _p0 = vmulq_f32(bfloat2float(_p.val[0]), _scale);
                float32x4_t _p1 = vmulq_f32(bfloat2float(_p.val[1]), _scale);
                float32x4_t _p2 = vmulq_f32(bfloat2float(_p.val[2]), _scale);
                float32x4_t _p3 = vmulq_f32(bfloat2float(_p.val[3]), _scale);
                float32x4_t _p4 = vmulq_f32(bfloat2float(_q.val[0]), _scale);
                float32x4_t _p5 = vmulq_f32(bfloat2float(_q.val[1]), _scale);
                float32x4_t _p6 = vmulq_f32(bfloat2float(_q.val[2]), _scale);
                float32x4_t _p7 = vmulq_f32(bfloat2float(_q.val[3]), _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + B_hstep * 4);
                uint16x8_t _s = vld1q_u16(p0 + B_hstep * 4 + 8);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p4), float2int8(_p2, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p5), float2int8(_p3, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += 16;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + B_hstep * 4);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p2);
                _r01.val[1] = float2int8(_p1, _p3);

                vst2_s8(pp, _r01);

                pp += 16;
                p0 += 8;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + B_hstep * 4));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + B_hstep);
                uint16x8_t _r = vld1q_u16(p0 + B_hstep * 2);
                uint16x8_t _s = vld1q_u16(p0 + B_hstep * 3);
                uint16x8_t _t = vld1q_u16(p0 + B_hstep * 4);
                uint16x8_t _u = vld1q_u16(p0 + B_hstep * 5);
                uint16x8_t _v = vld1q_u16(p0 + B_hstep * 6);
                uint16x8_t _w = vld1q_u16(p0 + B_hstep * 7);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));
                float32x4_t _p8 = bfloat2float(vget_low_u16(_t));
                float32x4_t _p9 = bfloat2float(vget_high_u16(_t));
                float32x4_t _pa = bfloat2float(vget_low_u16(_u));
                float32x4_t _pb = bfloat2float(vget_high_u16(_u));
                float32x4_t _pc = bfloat2float(vget_low_u16(_v));
                float32x4_t _pd = bfloat2float(vget_high_u16(_v));
                float32x4_t _pe = bfloat2float(vget_low_u16(_w));
                float32x4_t _pf = bfloat2float(vget_high_u16(_w));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);
                _p8 = vmulq_f32(_p8, _scale);
                _p9 = vmulq_f32(_p9, _scale);
                _pa = vmulq_f32(_pa, _scale);
                _pb = vmulq_f32(_pb, _scale);
                _pc = vmulq_f32(_pc, _scale);
                _pd = vmulq_f32(_pd, _scale);
                _pe = vmulq_f32(_pe, _scale);
                _pf = vmulq_f32(_pf, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p8, _pa);
                int8x8_t _r3 = float2int8(_pc, _pe);
                int8x8_t _r4 = float2int8(_p1, _p3);
                int8x8_t _r5 = float2int8(_p5, _p7);
                int8x8_t _r6 = float2int8(_p9, _pb);
                int8x8_t _r7 = float2int8(_pd, _pf);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p4, _p6));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p8, _pa));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_pc, _pe));
                int16x4_t _t4 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4_t _t5 = vreinterpret_s16_s8(float2int8(_p5, _p7));
                int16x4_t _t6 = vreinterpret_s16_s8(float2int8(_p9, _pb));
                int16x4_t _t7 = vreinterpret_s16_s8(float2int8(_pd, _pf));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int16x4x2_t _t45 = vuzp_s16(_t4, _t5);
                int16x4x2_t _t67 = vuzp_s16(_t6, _t7);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
                int8x8_t _r4 = vreinterpret_s8_s16(_t45.val[0]);
                int8x8_t _r5 = vreinterpret_s8_s16(_t67.val[0]);
                int8x8_t _r6 = vreinterpret_s8_s16(_t45.val[1]);
                int8x8_t _r7 = vreinterpret_s8_s16(_t67.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));

                pp += 64;
                p0 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + B_hstep));
                float32x4_t _p2 = bfloat2float(vld1_u16(p0 + B_hstep * 2));
                float32x4_t _p3 = bfloat2float(vld1_u16(p0 + B_hstep * 3));
                float32x4_t _p4 = bfloat2float(vld1_u16(p0 + B_hstep * 4));
                float32x4_t _p5 = bfloat2float(vld1_u16(p0 + B_hstep * 5));
                float32x4_t _p6 = bfloat2float(vld1_u16(p0 + B_hstep * 6));
                float32x4_t _p7 = bfloat2float(vld1_u16(p0 + B_hstep * 7));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p4, _p5));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p6, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep + 1], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep * 2 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 3 + 1], _p, 7);
                uint16x8_t _q = uint16x8_t();
                _q = vsetq_lane_u16(p0[B_hstep * 4], _q, 0);
                _q = vsetq_lane_u16(p0[B_hstep * 4 + 1], _q, 1);
                _q = vsetq_lane_u16(p0[B_hstep * 5], _q, 2);
                _q = vsetq_lane_u16(p0[B_hstep * 5 + 1], _q, 3);
                _q = vsetq_lane_u16(p0[B_hstep * 6], _q, 4);
                _q = vsetq_lane_u16(p0[B_hstep * 6 + 1], _q, 5);
                _q = vsetq_lane_u16(p0[B_hstep * 7], _q, 6);
                _q = vsetq_lane_u16(p0[B_hstep * 7 + 1], _q, 7);
                float32x4_t _p01 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p23 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p45 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p67 = bfloat2float(vget_high_u16(_q));

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);
                _p45 = vmulq_f32(_p45, _scale);
                _p67 = vmulq_f32(_p67, _scale);

                int8x8_t _r0 = float2int8(_p01, _p23);
                int8x8_t _r1 = float2int8(_p45, _p67);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 2;
            }
            for (; kk < max_kk; kk++)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[B_hstep], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep * 3], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep * 5], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 7], _p, 7);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);

                vst1_s8(pp, _r0);

                pp += 8;
                p0++;
            }
        }
    }
#endif // __aarch64__
    // Handles four jj positions per iteration: each bf16 input is widened with bfloat2float,
    // multiplied by _scale, narrowed with float2int8 and appended at pp.
    // NOTE(review): the ordering remarks below assume float2int8(a, b) puts a's four values in
    // lanes 0-3 and b's in lanes 4-7; that helper is defined outside this span - confirm.
    for (; jj + 3 < max_jj; jj += 4)
    {
        const unsigned short* p0 = (const unsigned short*)B + (j + jj) * B_hstep + k * elempack;

        // elempack == 4: one contiguous stream, 4 bf16 values ("slots" 0-3) consumed per kk step.
        if (elempack == 4)
        {
            int kk = 0;
            // 8 kk steps per iteration: 32 inputs in, 32 bytes out.
            for (; kk + 7 < max_kk; kk += 8)
            {
#if __ARM_FEATURE_DOTPROD
                // vld4q_u16 de-interleaves with stride 4: val[i] collects elements i, i+4, i+8, ...
                // i.e. slot i for the 8 consecutive kk steps.
                uint16x8x4_t _p = vld4q_u16(p0);

                // _p0.._p3: slots 0..3 for kk 0-3; _p4.._p7: slots 0..3 for kk 4-7.
                float32x4_t _p0 = vmulq_f32(bfloat2float(vget_low_u16(_p.val[0])), _scale);
                float32x4_t _p1 = vmulq_f32(bfloat2float(vget_low_u16(_p.val[1])), _scale);
                float32x4_t _p2 = vmulq_f32(bfloat2float(vget_low_u16(_p.val[2])), _scale);
                float32x4_t _p3 = vmulq_f32(bfloat2float(vget_low_u16(_p.val[3])), _scale);
                float32x4_t _p4 = vmulq_f32(bfloat2float(vget_high_u16(_p.val[0])), _scale);
                float32x4_t _p5 = vmulq_f32(bfloat2float(vget_high_u16(_p.val[1])), _scale);
                float32x4_t _p6 = vmulq_f32(bfloat2float(vget_high_u16(_p.val[2])), _scale);
                float32x4_t _p7 = vmulq_f32(bfloat2float(vget_high_u16(_p.val[3])), _scale);

#if __ARM_FEATURE_MATMUL_INT8
                // Each slot contributes its 8 consecutive kk values as one 8-byte group.
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                // 4-byte groups: kk 0-3 for slots 0..3 first, then kk 4-7 for slots 0..3.
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                // Plain loads: _p0.._p7 each hold the 4 slots of one kk step (kk 0..7).
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

                // val[0] carries the even kk steps and val[1] the odd ones; vst2q_s8 interleaves
                // them bytewise, so the output is (kk, kk+1) byte pairs per slot.
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p2), float2int8(_p4, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p3), float2int8(_p5, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += 32;
            }
            // 4 kk steps per iteration: 16 inputs in, 16 bytes out.
            // There is no separate __ARM_FEATURE_MATMUL_INT8 variant for this remainder.
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                // vld4_u16 de-interleaves with stride 4: val[i] is slot i for kk 0-3.
                uint16x4x4_t _p = vld4_u16(p0);

                float32x4_t _p0 = vmulq_f32(bfloat2float(_p.val[0]), _scale);
                float32x4_t _p1 = vmulq_f32(bfloat2float(_p.val[1]), _scale);
                float32x4_t _p2 = vmulq_f32(bfloat2float(_p.val[2]), _scale);
                float32x4_t _p3 = vmulq_f32(bfloat2float(_p.val[3]), _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
#else  // __ARM_FEATURE_DOTPROD
                // _p0.._p3 each hold the 4 slots of one kk step (kk 0..3).
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                // Even kk in val[0], odd kk in val[1]; vst2_s8 interleaves them into byte pairs.
                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p2);
                _r01.val[1] = float2int8(_p1, _p3);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += 16;
            }
            // 2 kk steps per iteration: 8 inputs in, 8 bytes out (same code for every ISA variant).
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                // vzipq_f32 interleaves the two kk steps lane by lane: (_p0[0], _p1[0], _p0[1], _p1[1], ...).
                float32x4x2_t _p01 = vzipq_f32(_p0, _p1);

                int8x8_t _r01 = float2int8(_p01.val[0], _p01.val[1]);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += 8;
            }
            // Final single kk step: 4 inputs in, 4 bytes out.
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                _p0 = vmulq_f32(_p0, _scale);
                // The same vector is passed twice only to satisfy the two-argument helper;
                // just lanes 0-3 of the result are written below.
                int8x8_t _r0 = float2int8(_p0, _p0);

                pp[0] = vget_lane_s8(_r0, 0);
                pp[1] = vget_lane_s8(_r0, 1);
                pp[2] = vget_lane_s8(_r0, 2);
                pp[3] = vget_lane_s8(_r0, 3);

                pp += 4;
                p0 += 4;
            }
        }
        // elempack == 1: four separate rows, B_hstep elements apart (p0, p0 + B_hstep, ...).
        if (elempack == 1)
        {
            int kk = 0;
            // 8 kk values from each of the 4 rows per iteration: 32 bytes out.
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + B_hstep);
                uint16x8_t _r = vld1q_u16(p0 + B_hstep * 2);
                uint16x8_t _s = vld1q_u16(p0 + B_hstep * 3);
                // Even-numbered _pN: kk 0-3 of row N/2; odd-numbered _pN: kk 4-7 of row N/2.
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                // One 8-byte group per row: kk 0-7 of row 0, then row 1, row 2, row 3.
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                // 4-byte groups: kk 0-3 of rows 0..3 first, then kk 4-7 of rows 0..3.
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p1, _p3);
                int8x8_t _r3 = float2int8(_p5, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                // Viewed as int16, every lane is a (kk, kk+1) byte pair of one row.
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p4, _p6));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p5, _p7));
                // vuzp_s16 splits even-indexed pairs (val[0]) from odd-indexed pairs (val[1]),
                // so each result vector holds one kk pair for all 4 rows.
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += 8;
            }
            // 4 kk values from each of the 4 rows per iteration: 16 bytes out.
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + B_hstep));
                float32x4_t _p2 = bfloat2float(vld1_u16(p0 + B_hstep * 2));
                float32x4_t _p3 = bfloat2float(vld1_u16(p0 + B_hstep * 3));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
                // Row-major 4-byte groups: kk 0-3 of row 0, row 1, row 2, row 3.
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_DOTPROD
                // Regroup into (kk, kk+1) byte pairs: first kk 0-1 of all rows, then kk 2-3.
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 4;
            }
            // 2 kk values from each of the 4 rows, gathered lane by lane
            // (row 0: lanes 0-1, row 1: lanes 2-3, row 2: lanes 4-5, row 3: lanes 6-7).
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep + 1], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep * 2 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 3 + 1], _p, 7);
                float32x4_t _p01 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p23 = bfloat2float(vget_high_u16(_p));

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);

                int8x8_t _r0 = float2int8(_p01, _p23);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 2;
            }
            // Final single kk value from each of the 4 rows: 4 bytes out.
            for (; kk < max_kk; kk++)
            {
                uint16x4_t _p = uint16x4_t();
                _p = vset_lane_u16(p0[0], _p, 0);
                _p = vset_lane_u16(p0[B_hstep], _p, 1);
                _p = vset_lane_u16(p0[B_hstep * 2], _p, 2);
                _p = vset_lane_u16(p0[B_hstep * 3], _p, 3);
                float32x4_t _p0 = bfloat2float(_p);

                _p0 = vmulq_f32(_p0, _scale);
                // Same vector passed twice; only lanes 0-3 of the result are written below.
                int8x8_t _r0 = float2int8(_p0, _p0);

                pp[0] = vget_lane_s8(_r0, 0);
                pp[1] = vget_lane_s8(_r0, 1);
                pp[2] = vget_lane_s8(_r0, 2);
                pp[3] = vget_lane_s8(_r0, 3);

                pp += 4;
                p0++;
            }
        }
    }
#endif // __ARM_NEON
    // Handles two jj positions per iteration (rows at p0 and p0 + B_hstep): every bf16 input is
    // converted to float, multiplied by the scale, converted with float2int8 and appended at pp.
    // This loop sits outside the __ARM_NEON guard that closes just above, so it also has a
    // scalar-only path.
    // NOTE(review): the ordering remarks below assume the two-vector float2int8(a, b) puts a's
    // four values in lanes 0-3 and b's in lanes 4-7; helper defined outside this span - confirm.
    for (; jj + 1 < max_jj; jj += 2)
    {
        // The offset uses k without an elempack factor; presumably only elempack == 1 reaches
        // this loop, as the commented-out condition below suggests - confirm against the caller.
        const unsigned short* p0 = (const unsigned short*)B + (j + jj) * B_hstep + k;

        // if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            // 8 kk values from each of the 2 rows per iteration: 16 bytes out.
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + B_hstep);
                // _p0/_p1: kk 0-3 / 4-7 of row 0; _p2/_p3: kk 0-3 / 4-7 of row 1.
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                // 8-byte groups: kk 0-7 of row 0, then kk 0-7 of row 1.
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_MATMUL_INT8
                // 4-byte groups: kk 0-3 of row 0, row 1, then kk 4-7 of row 0, row 1.
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p1, _p3);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                // 2-value groups built in float before narrowing: each _tN holds
                // (row 0 kk pair, row 1 kk pair) for one pair of kk steps.
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p2));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p2));
                float32x4_t _t2 = vcombine_f32(vget_low_f32(_p1), vget_low_f32(_p3));
                float32x4_t _t3 = vcombine_f32(vget_high_f32(_p1), vget_high_f32(_p3));
                int8x8_t _r0 = float2int8(_t0, _t1);
                int8x8_t _r1 = float2int8(_t2, _t3);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r0);
                vst1_s8(pp + 8, _r1);

                pp += 16;
                p0 += 8;
            }
            // 4 kk values from each of the 2 rows per iteration: 8 bytes out.
            // There is no separate __ARM_FEATURE_MATMUL_INT8 variant for this remainder.
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + B_hstep));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

#if __ARM_FEATURE_DOTPROD
                // kk 0-3 of row 0 followed by kk 0-3 of row 1.
                int8x8_t _r0 = float2int8(_p0, _p1);
#else  // __ARM_FEATURE_DOTPROD
                // (row 0 kk 0-1, row 1 kk 0-1) followed by (row 0 kk 2-3, row 1 kk 2-3).
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p1));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p1));
                int8x8_t _r0 = float2int8(_t0, _t1);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 4;
            }
            // 2 kk values from each row, scalar conversion: row 0 pair first, then row 1 pair.
            // Only compiled when __ARM_NEON is set.
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = float2int8(bfloat16_to_float32(p0[0]) * scale);
                pp[1] = float2int8(bfloat16_to_float32(p0[1]) * scale);
                pp[2] = float2int8(bfloat16_to_float32(p0[B_hstep]) * scale);
                pp[3] = float2int8(bfloat16_to_float32(p0[B_hstep + 1]) * scale);
                pp += 4;
                p0 += 2;
            }
#endif // __ARM_NEON
            // Remaining kk steps one at a time (all kk steps when __ARM_NEON is not set):
            // one byte from row 0, one from row 1.
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(bfloat16_to_float32(p0[0]) * scale);
                pp[1] = float2int8(bfloat16_to_float32(p0[B_hstep]) * scale);
                pp += 2;
                p0++;
            }
        }
    }
    // Handles the remaining jj positions one at a time: the row at p0 is read contiguously along
    // kk, each bf16 value is converted to float, multiplied by the scale, converted with
    // float2int8 and appended at pp in the same order it was read.
    for (; jj < max_jj; jj += 1)
    {
        // The offset uses k without an elempack factor; presumably only elempack == 1 reaches
        // this loop, as the commented-out condition below suggests - confirm against the caller.
        const unsigned short* p0 = (const unsigned short*)B + (j + jj) * B_hstep + k;

        // if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            // 16 values per iteration: two 8-lane loads, four float vectors, one 16-byte store.
            for (; kk + 15 < max_kk; kk += 16)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 16;
            }
            // 8 values per iteration: one 8-lane load, one 8-byte store.
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 8;
            }
#endif // __ARM_NEON
            // Scalar remainder (every kk step when __ARM_NEON is not set).
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(bfloat16_to_float32(p0[0]) * scale);
                pp += 1;
                p0++;
            }
        }
    }
}

static void transpose_pack_B_tile_bf16_to_int8(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        transpose_pack_B_tile_bf16_to_int8_i8mm(B, BT, j, max_jj, k, max_kk, scale);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        transpose_pack_B_tile_bf16_to_int8_asimddp(B, BT, j, max_jj, k, max_kk, scale);
        return;
    }
#endif

    const int elempack = B.elempack;
    const size_t B_hstep = B.dims == 3 ? B.cstep : (size_t)B.w;

    // NCNN_LOGE("transpose_pack_B_tile_bf16_to_int8 %d %d", max_jj, elempack);

    signed char* pp = BT;

#if __ARM_NEON
    float32x4_t _scale = vdupq_n_f32(scale);
#endif

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    for (; jj + 7 < max_jj; jj += 8)
    {
        const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * elempack;

        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                uint16x8_t _t = vld1q_u16(p0 + B_hstep * 4);
                uint16x8_t _u = vld1q_u16(p0 + B_hstep * 4 + 8);
                uint16x8_t _v = vld1q_u16(p0 + B_hstep * 4 + 16);
                uint16x8_t _w = vld1q_u16(p0 + B_hstep * 4 + 24);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));
                float32x4_t _p8 = bfloat2float(vget_low_u16(_t));
                float32x4_t _p9 = bfloat2float(vget_high_u16(_t));
                float32x4_t _pa = bfloat2float(vget_low_u16(_u));
                float32x4_t _pb = bfloat2float(vget_high_u16(_u));
                float32x4_t _pc = bfloat2float(vget_low_u16(_v));
                float32x4_t _pd = bfloat2float(vget_high_u16(_v));
                float32x4_t _pe = bfloat2float(vget_low_u16(_w));
                float32x4_t _pf = bfloat2float(vget_high_u16(_w));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);
                _p8 = vmulq_f32(_p8, _scale);
                _p9 = vmulq_f32(_p9, _scale);
                _pa = vmulq_f32(_pa, _scale);
                _pb = vmulq_f32(_pb, _scale);
                _pc = vmulq_f32(_pc, _scale);
                _pd = vmulq_f32(_pd, _scale);
                _pe = vmulq_f32(_pe, _scale);
                _pf = vmulq_f32(_pf, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p8);
                int8x8_t _r1 = float2int8(_p1, _p9);
                int8x8_t _r2 = float2int8(_p2, _pa);
                int8x8_t _r3 = float2int8(_p3, _pb);
                int8x8_t _r4 = float2int8(_p4, _pc);
                int8x8_t _r5 = float2int8(_p5, _pd);
                int8x8_t _r6 = float2int8(_p6, _pe);
                int8x8_t _r7 = float2int8(_p7, _pf);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

                int16x8_t _r01 = vreinterpretq_s16_s8(vcombine_s8(_r0, _r1));
                int16x8_t _r23 = vreinterpretq_s16_s8(vcombine_s8(_r2, _r3));
                int16x8_t _r45 = vreinterpretq_s16_s8(vcombine_s8(_r4, _r5));
                int16x8_t _r67 = vreinterpretq_s16_s8(vcombine_s8(_r6, _r7));
                int16x8x2_t _rr0 = vuzpq_s16(_r01, _r23);
                int16x8x2_t _rr1 = vuzpq_s16(_r45, _r67);

                vst1q_s8(pp, vreinterpretq_s8_s16(_rr0.val[0]));
                vst1q_s8(pp + 16, vreinterpretq_s8_s16(_rr0.val[1]));
                vst1q_s8(pp + 32, vreinterpretq_s8_s16(_rr1.val[0]));
                vst1q_s8(pp + 48, vreinterpretq_s8_s16(_rr1.val[1]));
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);

#if __ARM_FEATURE_DOTPROD
                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                int16x8_t _r01 = vreinterpretq_s16_s8(vcombine_s8(_r0, _r1));
                int16x8_t _r23 = vreinterpretq_s16_s8(vcombine_s8(_r2, _r3));
                int16x8x2_t _rr = vuzpq_s16(_r01, _r23);

                vst1q_s8(pp, vreinterpretq_s8_s16(_rr.val[0]));
                vst1q_s8(pp + 16, vreinterpretq_s8_s16(_rr.val[1]));
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += B_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + B_hstep);
                uint16x8_t _r = vld1q_u16(p0 + B_hstep * 2);
                uint16x8_t _s = vld1q_u16(p0 + B_hstep * 3);
                uint16x8_t _t = vld1q_u16(p0 + B_hstep * 4);
                uint16x8_t _u = vld1q_u16(p0 + B_hstep * 5);
                uint16x8_t _v = vld1q_u16(p0 + B_hstep * 6);
                uint16x8_t _w = vld1q_u16(p0 + B_hstep * 7);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));
                float32x4_t _p8 = bfloat2float(vget_low_u16(_t));
                float32x4_t _p9 = bfloat2float(vget_high_u16(_t));
                float32x4_t _pa = bfloat2float(vget_low_u16(_u));
                float32x4_t _pb = bfloat2float(vget_high_u16(_u));
                float32x4_t _pc = bfloat2float(vget_low_u16(_v));
                float32x4_t _pd = bfloat2float(vget_high_u16(_v));
                float32x4_t _pe = bfloat2float(vget_low_u16(_w));
                float32x4_t _pf = bfloat2float(vget_high_u16(_w));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);
                _p8 = vmulq_f32(_p8, _scale);
                _p9 = vmulq_f32(_p9, _scale);
                _pa = vmulq_f32(_pa, _scale);
                _pb = vmulq_f32(_pb, _scale);
                _pc = vmulq_f32(_pc, _scale);
                _pd = vmulq_f32(_pd, _scale);
                _pe = vmulq_f32(_pe, _scale);
                _pf = vmulq_f32(_pf, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _r04 = vzip_s8(_r0, _r4);
                int8x8x2_t _r15 = vzip_s8(_r1, _r5);
                int8x8x2_t _r26 = vzip_s8(_r2, _r6);
                int8x8x2_t _r37 = vzip_s8(_r3, _r7);
                int8x16x4_t _r0123;
                _r0123.val[0] = vcombine_s8(_r04.val[0], _r04.val[1]);
                _r0123.val[1] = vcombine_s8(_r15.val[0], _r15.val[1]);
                _r0123.val[2] = vcombine_s8(_r26.val[0], _r26.val[1]);
                _r0123.val[3] = vcombine_s8(_r37.val[0], _r37.val[1]);

                vst4q_s8(pp, _r0123);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8x4_t _r0123;
                _r0123.val[0] = _r0;
                _r0123.val[1] = _r1;
                _r0123.val[2] = _r2;
                _r0123.val[3] = _r3;
                int8x8x4_t _r4567;
                _r4567.val[0] = _r4;
                _r4567.val[1] = _r5;
                _r4567.val[2] = _r6;
                _r4567.val[3] = _r7;

                vst4_s8(pp, _r0123);
                vst4_s8(pp + 32, _r4567);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(_r0, _r2);
                _r01.val[1] = vcombine_s8(_r1, _r3);
                int8x16x2_t _r23;
                _r23.val[0] = vcombine_s8(_r4, _r6);
                _r23.val[1] = vcombine_s8(_r5, _r7);

                vst2q_s8(pp, _r01);
                vst2q_s8(pp + 32, _r23);
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + B_hstep);
                uint16x8_t _r = vld1q_u16(p0 + B_hstep * 2);
                uint16x8_t _s = vld1q_u16(p0 + B_hstep * 3);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p0, _p1);
                _r0123.val[1] = float2int8(_p2, _p3);
                _r0123.val[2] = float2int8(_p4, _p5);
                _r0123.val[3] = float2int8(_p6, _p7);

                vst4_s8(pp, _r0123);
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p1), float2int8(_p4, _p5));
                _r01.val[1] = vcombine_s8(float2int8(_p2, _p3), float2int8(_p6, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += B_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + B_hstep);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p1);
                _r01.val[1] = float2int8(_p2, _p3);

                vst2_s8(pp, _r01);

                pp += 16;
                p0 += B_hstep * 2;
            }
            for (; kk < max_kk; kk++)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += B_hstep;
            }
        }
    }
#endif // __aarch64__
    // Pack four columns (jj .. jj+3) per iteration: every bf16 value is widened to float,
    // multiplied by _scale, converted with float2int8 and appended to pp. The byte order
    // written to pp depends on the instruction-set branch (int8 matmul / dot product /
    // plain NEON).
    // NOTE(review): _scale, scale, pp, B, B_hstep, j, k, max_jj and max_kk are set up before
    // this excerpt; _scale is presumably scale broadcast to all lanes - confirm.
    for (; jj + 3 < max_jj; jj += 4)
    {
        // raw 16-bit view of B at row offset k, column j + jj
        const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * elempack;

        if (elempack == 4)
        {
            // columns are elempack (4) values apart, so one 8-lane load spans two columns
            int kk = 0;
            // 8 kk steps per iteration: _p/_q cover the first 4 steps of the 4 columns,
            // _r/_s (B_hstep * 4 further on) the next 4 steps
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + B_hstep * 4);
                uint16x8_t _s = vld1q_u16(p0 + B_hstep * 4 + 8);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                float32x4_t _p4 = bfloat2float(vget_low_u16(_r));
                float32x4_t _p5 = bfloat2float(vget_high_u16(_r));
                float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
                float32x4_t _p7 = bfloat2float(vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                // join the first and second quad of the same column: 8 values of one column per int8x8
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                // keep load order: quads of all 4 columns for the first 4 steps, then for the next 4
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                // view int8 pairs as 16-bit lanes and de-interleave them: _r0/_r2 collect the
                // first pair of each column's quad, _r1/_r3 the second pair
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p4, _p5));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p6, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                // 4 columns x 8 kk steps written
                pp += 32;
                p0 += B_hstep * 8;
            }
            // 4 kk steps per iteration: one quad for each of the 4 columns
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
                // load order is already the stored order
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_DOTPROD
                // de-interleave 16-bit pairs: first pair of every column, then second pair
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += B_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            // consecutive kk steps are B_hstep apart; each 4-lane load is one step's 4 columns
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + B_hstep));
                float32x4_t _p2 = bfloat2float(vld1_u16(p0 + B_hstep * 2));
                float32x4_t _p3 = bfloat2float(vld1_u16(p0 + B_hstep * 3));
                float32x4_t _p4 = bfloat2float(vld1_u16(p0 + B_hstep * 4));
                float32x4_t _p5 = bfloat2float(vld1_u16(p0 + B_hstep * 5));
                float32x4_t _p6 = bfloat2float(vld1_u16(p0 + B_hstep * 6));
                float32x4_t _p7 = bfloat2float(vld1_u16(p0 + B_hstep * 7));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                // zip step n with step n+4 lane by lane; the 4-way interleaving store then
                // places steps 0..7 of one column in 8 consecutive bytes
                float32x4x2_t _p04 = vzipq_f32(_p0, _p4);
                float32x4x2_t _p15 = vzipq_f32(_p1, _p5);
                float32x4x2_t _p26 = vzipq_f32(_p2, _p6);
                float32x4x2_t _p37 = vzipq_f32(_p3, _p7);
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p04.val[0], _p04.val[1]);
                _r0123.val[1] = float2int8(_p15.val[0], _p15.val[1]);
                _r0123.val[2] = float2int8(_p26.val[0], _p26.val[1]);
                _r0123.val[3] = float2int8(_p37.val[0], _p37.val[1]);

                vst4_s8(pp, _r0123);
#else  // __ARM_FEATURE_MATMUL_INT8
                // 4-way interleaving store: steps 0..3 of each column fill the first 16 bytes,
                // steps 4..7 of each column the last 16
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p0, _p4);
                _r0123.val[1] = float2int8(_p1, _p5);
                _r0123.val[2] = float2int8(_p2, _p6);
                _r0123.val[3] = float2int8(_p3, _p7);

                vst4_s8(pp, _r0123);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                // even steps in val[0], odd steps in val[1]; the 2-way interleaving store
                // writes step pairs (0,1), (2,3), (4,5), (6,7) column by column
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p2), float2int8(_p4, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p3), float2int8(_p5, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + B_hstep));
                float32x4_t _p2 = bfloat2float(vld1_u16(p0 + B_hstep * 2));
                float32x4_t _p3 = bfloat2float(vld1_u16(p0 + B_hstep * 3));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
                // after the 4x4 transpose each register holds the 4 steps of one column
                transpose4x4_ps(_p0, _p1, _p2, _p3);
                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));
#else  // __ARM_FEATURE_DOTPROD
                // interleave steps (0,1) per column, followed by steps (2,3) per column
                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p2);
                _r01.val[1] = float2int8(_p1, _p3);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += B_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + B_hstep));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                // zip the two steps so each column's pair is adjacent
                float32x4x2_t _p01 = vzipq_f32(_p0, _p1);
                int8x8_t _r01 = float2int8(_p01.val[0], _p01.val[1]);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += B_hstep * 2;
            }
            // leftover single step: the 4 columns are converted with the scalar helpers
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(bfloat16_to_float32(p0[0]) * scale);
                pp[1] = float2int8(bfloat16_to_float32(p0[1]) * scale);
                pp[2] = float2int8(bfloat16_to_float32(p0[2]) * scale);
                pp[3] = float2int8(bfloat16_to_float32(p0[3]) * scale);
                pp += 4;
                p0 += B_hstep;
            }
        }
    }
#endif // __ARM_NEON
    // Pack two columns (jj, jj+1) per iteration: every bf16 value is widened to float,
    // multiplied by the scale, converted with float2int8 and appended to pp. The NEON paths
    // are compiled only under __ARM_NEON; without it only the scalar single-step loop of
    // the elempack == 1 case remains.
    // NOTE(review): _scale, scale, pp, B, B_hstep, j, k, max_jj and max_kk are set up before
    // this excerpt; _scale is presumably scale broadcast to all lanes - confirm.
    for (; jj + 1 < max_jj; jj += 2)
    {
        // raw 16-bit view of B at row offset k, column j + jj
        const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * elempack;

#if __ARM_NEON
        if (elempack == 4)
        {
            // columns are elempack (4) values apart, so one 8-lane load is a quad for each
            // of the two columns
            int kk = 0;
            // 8 kk steps per iteration: _p is the first 4 steps, _q (B_hstep * 4 further on) the next 4
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + B_hstep * 4);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                // 8 values of the first column, then 8 values of the second column
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p1, _p3);
#else  // __ARM_FEATURE_MATMUL_INT8
                // keep load order: both columns' first quad, then both columns' second quad
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                // _t0/_t1 hold one column each as four 16-bit pairs; zipping alternates the
                // pairs of the two columns
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4x2_t _t01 = vzip_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                // 2 columns x 8 kk steps written
                pp += 16;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r01 = float2int8(_p0, _p1);
#else  // __ARM_FEATURE_DOTPROD
                // regroup float halves: first two values of each column, then the last two of each
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p1));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p1));
                int8x8_t _r01 = float2int8(_t0, _t1);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += B_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            // consecutive kk steps are B_hstep apart; each step contributes p0[..] and p0[.. + 1]
            int kk = 0;
#if __ARM_NEON
            for (; kk + 7 < max_kk; kk += 8)
            {
#if __ARM_FEATURE_DOTPROD
                // gather lane by lane in step order: _p = steps 0..3, _q = steps 4..7,
                // two columns per step
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep + 1], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep * 2 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 3 + 1], _p, 7);
                uint16x8_t _q = uint16x8_t();
                _q = vsetq_lane_u16(p0[B_hstep * 4], _q, 0);
                _q = vsetq_lane_u16(p0[B_hstep * 4 + 1], _q, 1);
                _q = vsetq_lane_u16(p0[B_hstep * 5], _q, 2);
                _q = vsetq_lane_u16(p0[B_hstep * 5 + 1], _q, 3);
                _q = vsetq_lane_u16(p0[B_hstep * 6], _q, 4);
                _q = vsetq_lane_u16(p0[B_hstep * 6 + 1], _q, 5);
                _q = vsetq_lane_u16(p0[B_hstep * 7], _q, 6);
                _q = vsetq_lane_u16(p0[B_hstep * 7 + 1], _q, 7);
                float32x4_t _p01 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p23 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p45 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p67 = bfloat2float(vget_high_u16(_q));

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);
                _p45 = vmulq_f32(_p45, _scale);
                _p67 = vmulq_f32(_p67, _scale);

                int8x8_t _r0 = float2int8(_p01, _p23);
                int8x8_t _r1 = float2int8(_p45, _p67);

#if __ARM_FEATURE_MATMUL_INT8
                // split even/odd bytes: all 8 steps of the first column, then of the second
                int8x8x2_t _r01 = vuzp_s8(_r0, _r1);

                vst1q_s8(pp, vcombine_s8(_r01.val[0], _r01.val[1]));
#else  // __ARM_FEATURE_MATMUL_INT8
                // vtrn + vuzp: steps 0..3 of the first column, steps 0..3 of the second,
                // then steps 4..7 of each
                int8x8x2_t _r01 = vtrn_s8(_r0, _r1);
                int8x8x2_t _rr01 = vuzp_s8(_r01.val[0], _r01.val[1]);

                vst1q_s8(pp, vcombine_s8(_rr01.val[0], _rr01.val[1]));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                // even steps (0,2,4,6) are gathered into _p and odd steps (1,3,5,7) into _q
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep * 2 + 1], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep * 4 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 6 + 1], _p, 7);
                uint16x8_t _q = uint16x8_t();
                _q = vsetq_lane_u16(p0[B_hstep], _q, 0);
                _q = vsetq_lane_u16(p0[B_hstep + 1], _q, 1);
                _q = vsetq_lane_u16(p0[B_hstep * 3], _q, 2);
                _q = vsetq_lane_u16(p0[B_hstep * 3 + 1], _q, 3);
                _q = vsetq_lane_u16(p0[B_hstep * 5], _q, 4);
                _q = vsetq_lane_u16(p0[B_hstep * 5 + 1], _q, 5);
                _q = vsetq_lane_u16(p0[B_hstep * 7], _q, 6);
                _q = vsetq_lane_u16(p0[B_hstep * 7 + 1], _q, 7);
                float32x4_t _p02 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p46 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p13 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p57 = bfloat2float(vget_high_u16(_q));

                _p02 = vmulq_f32(_p02, _scale);
                _p46 = vmulq_f32(_p46, _scale);
                _p13 = vmulq_f32(_p13, _scale);
                _p57 = vmulq_f32(_p57, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p02, _p46);
                _r01.val[1] = float2int8(_p13, _p57);

                // 2-way interleaving store pairs each even step with the following odd step
                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                // gather steps 0..3 in order, two columns per step
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep + 1], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep * 2 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 3 + 1], _p, 7);
                float32x4_t _p01 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p23 = bfloat2float(vget_high_u16(_p));

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);

                // de-interleave: 4 steps of the first column, then 4 steps of the second
                float32x4x2_t _pp = vuzpq_f32(_p01, _p23);
                int8x8_t _r01 = float2int8(_pp.val[0], _pp.val[1]);
#else  // __ARM_FEATURE_DOTPROD
                // gather in step order 0, 2, 1, 3 so the zip below pairs step 0 with 1 and 2 with 3
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep * 2 + 1], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep + 1], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 3 + 1], _p, 7);
                float32x4_t _p02 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p13 = bfloat2float(vget_high_u16(_p));

                _p02 = vmulq_f32(_p02, _scale);
                _p13 = vmulq_f32(_p13, _scale);

                float32x4x2_t _pp = vzipq_f32(_p02, _p13);
                int8x8_t _r01 = float2int8(_pp.val[0], _pp.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += B_hstep * 4;
            }
            // two leftover steps: written as the first column's pair, then the second column's pair
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = float2int8(bfloat16_to_float32(p0[0]) * scale);
                pp[1] = float2int8(bfloat16_to_float32(p0[B_hstep + 0]) * scale);
                pp[2] = float2int8(bfloat16_to_float32(p0[1]) * scale);
                pp[3] = float2int8(bfloat16_to_float32(p0[B_hstep + 1]) * scale);
                pp += 4;
                p0 += B_hstep * 2;
            }
#endif // __ARM_NEON
            // scalar loop: one step of two columns per iteration (handles every step when
            // __ARM_NEON is not defined)
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(bfloat16_to_float32(p0[0]) * scale);
                pp[1] = float2int8(bfloat16_to_float32(p0[1]) * scale);
                pp += 2;
                p0 += B_hstep;
            }
        }
    }
    for (; jj < max_jj; jj += 1)
    {
        const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * elempack;

#if __ARM_NEON
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 15 < max_kk; kk += 16)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + B_hstep * 4));
                float32x4_t _p2 = bfloat2float(vld1_u16(p0 + B_hstep * 8));
                float32x4_t _p3 = bfloat2float(vld1_u16(p0 + B_hstep * 12));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));

                pp += 16;
                p0 += B_hstep * 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = bfloat2float(vld1_u16(p0));
                float32x4_t _p1 = bfloat2float(vld1_u16(p0 + B_hstep * 4));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                pp[0] = float2int8(bfloat16_to_float32(p0[0]) * scale);
                pp[1] = float2int8(bfloat16_to_float32(p0[1]) * scale);
                pp[2] = float2int8(bfloat16_to_float32(p0[2]) * scale);
                pp[3] = float2int8(bfloat16_to_float32(p0[3]) * scale);
                pp += 4;
                p0 += B_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            for (; kk + 15 < max_kk; kk += 16)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[B_hstep], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep * 3], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep * 5], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 7], _p, 7);
                uint16x8_t _q = uint16x8_t();
                _q = vsetq_lane_u16(p0[B_hstep * 8], _q, 0);
                _q = vsetq_lane_u16(p0[B_hstep * 9], _q, 1);
                _q = vsetq_lane_u16(p0[B_hstep * 10], _q, 2);
                _q = vsetq_lane_u16(p0[B_hstep * 11], _q, 3);
                _q = vsetq_lane_u16(p0[B_hstep * 12], _q, 4);
                _q = vsetq_lane_u16(p0[B_hstep * 13], _q, 5);
                _q = vsetq_lane_u16(p0[B_hstep * 14], _q, 6);
                _q = vsetq_lane_u16(p0[B_hstep * 15], _q, 7);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));

                pp += 16;
                p0 += B_hstep * 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[B_hstep], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep * 3], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep * 5], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 7], _p, 7);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += B_hstep * 8;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(bfloat16_to_float32(p0[0]) * scale);
                pp += 1;
                p0 += B_hstep;
            }
        }
    }
}

static void unpack_output_tile_int32_to_bf16(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        unpack_output_tile_int32_to_bf16_asimddp(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
        return;
    }
#endif

    const int out_elempack = top_blob.elempack;
    const size_t out_hstep = top_blob.dims == 3 ? top_blob.cstep : (size_t)top_blob.w;

    const size_t c_hstep = C.dims == 3 ? C.cstep : (size_t)C.w;
    const int c_elempack = C.elempack;
    const unsigned short* pC = C;

    // NCNN_LOGE("unpack_output_tile_int32_to_bf16  %d %d %d %d  %d  %d  %d", i, max_ii, j, max_jj, out_elempack, broadcast_type_C, c_elempack);

    const int* pp = topT;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii);
        float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4);

        float32x4_t _c0;
        float32x4_t _c1;
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                _c0 = vdupq_n_f32(bfloat16_to_float32(pC[0]) * beta);
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const unsigned short*)C + i + ii;
                uint16x8_t _c = vld1q_u16(pC);
                _c0 = bfloat2float(vget_low_u16(_c));
                _c1 = bfloat2float(vget_high_u16(_c));
                _c0 = vmulq_n_f32(_c0, beta);
                _c1 = vmulq_n_f32(_c1, beta);
            }
            if (broadcast_type_C == 3)
            {
                pC = (const unsigned short*)C + (i + ii) * c_hstep + j * c_elempack;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const unsigned short*)C + j;
            }
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);
            int32x4_t _sum8 = vld1q_s32(pp + 32);
            int32x4_t _sum9 = vld1q_s32(pp + 36);
            int32x4_t _suma = vld1q_s32(pp + 40);
            int32x4_t _sumb = vld1q_s32(pp + 44);
            int32x4_t _sumc = vld1q_s32(pp + 48);
            int32x4_t _sumd = vld1q_s32(pp + 52);
            int32x4_t _sume = vld1q_s32(pp + 56);
            int32x4_t _sumf = vld1q_s32(pp + 60);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            //      e4 f4 g4 h4
            //      e5 f5 g5 h5
            //      e6 f6 g6 h6
            //      e7 f7 g7 h7
#else
            // from
            //      a0 b1 c2 d3
            //      e4 f5 g6 h7
            //      e0 f1 g2 h3
            //      a4 b5 c6 d7
            //      c0 d1 a2 b3
            //      g4 h5 e6 f7
            //      g0 h1 e2 f3
            //      c4 d5 a6 b7
            //      a3 b2 c1 d0
            //      e7 f6 g5 h4
            //      e3 f2 g1 h0
            //      a7 b6 c5 d4
            //      c3 d2 a1 b0
            //      g7 h6 e5 f4
            //      g3 h2 e1 f0
            //      c7 d6 a5 b4

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            //      e4 f4 g4 h4
            //      e5 f5 g5 h5
            //      e6 f6 g6 h6
            //      e7 f7 g7 h7
            {
                _sum8 = vrev64q_s32(_sum8);
                _sum9 = vrev64q_s32(_sum9);
                _suma = vrev64q_s32(_suma);
                _sumb = vrev64q_s32(_sumb);
                _sumc = vrev64q_s32(_sumc);
                _sumd = vrev64q_s32(_sumd);
                _sume = vrev64q_s32(_sume);
                _sumf = vrev64q_s32(_sumf);
                _sum8 = vextq_s32(_sum8, _sum8, 2);
                _sum9 = vextq_s32(_sum9, _sum9, 2);
                _suma = vextq_s32(_suma, _suma, 2);
                _sumb = vextq_s32(_sumb, _sumb, 2);
                _sumc = vextq_s32(_sumc, _sumc, 2);
                _sumd = vextq_s32(_sumd, _sumd, 2);
                _sume = vextq_s32(_sume, _sume, 2);
                _sumf = vextq_s32(_sumf, _sumf, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sumc);
                int32x4x2_t _t1 = vzipq_s32(_sum4, _sum8);
                int32x4x2_t _t2 = vzipq_s32(_sum2, _sume);
                int32x4x2_t _t3 = vzipq_s32(_sum6, _suma);
                int32x4x2_t _t4 = vzipq_s32(_sum3, _sumf);
                int32x4x2_t _t5 = vzipq_s32(_sum7, _sumb);
                int32x4x2_t _t6 = vzipq_s32(_sum1, _sumd);
                int32x4x2_t _t7 = vzipq_s32(_sum5, _sum9);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum8 = vcombine_s32(vget_low_s32(_t4.val[0]), vget_low_s32(_t5.val[0]));
                _sum9 = vcombine_s32(vget_high_s32(_t4.val[0]), vget_high_s32(_t5.val[0]));
                _suma = vcombine_s32(vget_low_s32(_t5.val[1]), vget_low_s32(_t4.val[1]));
                _sumb = vcombine_s32(vget_high_s32(_t5.val[1]), vget_high_s32(_t4.val[1]));
                _sumc = vcombine_s32(vget_low_s32(_t6.val[0]), vget_low_s32(_t7.val[0]));
                _sumd = vcombine_s32(vget_high_s32(_t6.val[0]), vget_high_s32(_t7.val[0]));
                _sume = vcombine_s32(vget_low_s32(_t7.val[1]), vget_low_s32(_t6.val[1]));
                _sumf = vcombine_s32(vget_high_s32(_t7.val[1]), vget_high_s32(_t6.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
                _sum9 = vrev64q_s32(_sum9);
                _sumb = vrev64q_s32(_sumb);
                _sumd = vrev64q_s32(_sumd);
                _sumf = vrev64q_s32(_sumf);
            }
#endif

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale0);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale0);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum8), _descale0);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum9), _descale0);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_suma), _descale0);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sumb), _descale0);
            float32x4_t _f8 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale1);
            float32x4_t _f9 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale1);
            float32x4_t _fa = vmulq_f32(vcvtq_f32_s32(_sum6), _descale1);
            float32x4_t _fb = vmulq_f32(vcvtq_f32_s32(_sum7), _descale1);
            float32x4_t _fc = vmulq_f32(vcvtq_f32_s32(_sumc), _descale1);
            float32x4_t _fd = vmulq_f32(vcvtq_f32_s32(_sumd), _descale1);
            float32x4_t _fe = vmulq_f32(vcvtq_f32_s32(_sume), _descale1);
            float32x4_t _ff = vmulq_f32(vcvtq_f32_s32(_sumf), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                    _f8 = vaddq_f32(_f8, _c0);
                    _f9 = vaddq_f32(_f9, _c0);
                    _fa = vaddq_f32(_fa, _c0);
                    _fb = vaddq_f32(_fb, _c0);
                    _fc = vaddq_f32(_fc, _c0);
                    _fd = vaddq_f32(_fd, _c0);
                    _fe = vaddq_f32(_fe, _c0);
                    _ff = vaddq_f32(_ff, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                    _f8 = vaddq_f32(_f8, _c1);
                    _f9 = vaddq_f32(_f9, _c1);
                    _fa = vaddq_f32(_fa, _c1);
                    _fb = vaddq_f32(_fb, _c1);
                    _fc = vaddq_f32(_fc, _c1);
                    _fd = vaddq_f32(_fd, _c1);
                    _fe = vaddq_f32(_fe, _c1);
                    _ff = vaddq_f32(_ff, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    if (c_elempack == 4)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + 8);
                        uint16x8_t _c45 = vld1q_u16(pC + 16);
                        uint16x8_t _c67 = vld1q_u16(pC + 24);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        float32x4_t _c2 = bfloat2float(vget_low_u16(_c23));
                        float32x4_t _c3 = bfloat2float(vget_high_u16(_c23));
                        float32x4_t _c4 = bfloat2float(vget_low_u16(_c45));
                        float32x4_t _c5 = bfloat2float(vget_high_u16(_c45));
                        float32x4_t _c6 = bfloat2float(vget_low_u16(_c67));
                        float32x4_t _c7 = bfloat2float(vget_high_u16(_c67));
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                            _f4 = vaddq_f32(_f4, _c4);
                            _f5 = vaddq_f32(_f5, _c5);
                            _f6 = vaddq_f32(_f6, _c6);
                            _f7 = vaddq_f32(_f7, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                            _f4 = vmlaq_f32(_f4, _c4, _beta);
                            _f5 = vmlaq_f32(_f5, _c5, _beta);
                            _f6 = vmlaq_f32(_f6, _c6, _beta);
                            _f7 = vmlaq_f32(_f7, _c7, _beta);
                        }
                        _c01 = vld1q_u16(pC + c_hstep * 4);
                        _c23 = vld1q_u16(pC + c_hstep * 4 + 8);
                        _c45 = vld1q_u16(pC + c_hstep * 4 + 16);
                        _c67 = vld1q_u16(pC + c_hstep * 4 + 24);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        _c2 = bfloat2float(vget_low_u16(_c23));
                        _c3 = bfloat2float(vget_high_u16(_c23));
                        _c4 = bfloat2float(vget_low_u16(_c45));
                        _c5 = bfloat2float(vget_high_u16(_c45));
                        _c6 = bfloat2float(vget_low_u16(_c67));
                        _c7 = bfloat2float(vget_high_u16(_c67));
                        if (beta == 1.f)
                        {
                            _f8 = vaddq_f32(_f8, _c0);
                            _f9 = vaddq_f32(_f9, _c1);
                            _fa = vaddq_f32(_fa, _c2);
                            _fb = vaddq_f32(_fb, _c3);
                            _fc = vaddq_f32(_fc, _c4);
                            _fd = vaddq_f32(_fd, _c5);
                            _fe = vaddq_f32(_fe, _c6);
                            _ff = vaddq_f32(_ff, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f8 = vmlaq_f32(_f8, _c0, _beta);
                            _f9 = vmlaq_f32(_f9, _c1, _beta);
                            _fa = vmlaq_f32(_fa, _c2, _beta);
                            _fb = vmlaq_f32(_fb, _c3, _beta);
                            _fc = vmlaq_f32(_fc, _c4, _beta);
                            _fd = vmlaq_f32(_fd, _c5, _beta);
                            _fe = vmlaq_f32(_fe, _c6, _beta);
                            _ff = vmlaq_f32(_ff, _c7, _beta);
                        }
                        pC += 32;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + c_hstep);
                        uint16x8_t _c45 = vld1q_u16(pC + c_hstep * 2);
                        uint16x8_t _c67 = vld1q_u16(pC + c_hstep * 3);
                        transpose8x4_u16(_c01, _c23, _c45, _c67);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        float32x4_t _c2 = bfloat2float(vget_low_u16(_c23));
                        float32x4_t _c3 = bfloat2float(vget_high_u16(_c23));
                        float32x4_t _c4 = bfloat2float(vget_low_u16(_c45));
                        float32x4_t _c5 = bfloat2float(vget_high_u16(_c45));
                        float32x4_t _c6 = bfloat2float(vget_low_u16(_c67));
                        float32x4_t _c7 = bfloat2float(vget_high_u16(_c67));
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                            _f4 = vaddq_f32(_f4, _c4);
                            _f5 = vaddq_f32(_f5, _c5);
                            _f6 = vaddq_f32(_f6, _c6);
                            _f7 = vaddq_f32(_f7, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                            _f4 = vmlaq_f32(_f4, _c4, _beta);
                            _f5 = vmlaq_f32(_f5, _c5, _beta);
                            _f6 = vmlaq_f32(_f6, _c6, _beta);
                            _f7 = vmlaq_f32(_f7, _c7, _beta);
                        }
                        _c01 = vld1q_u16(pC + c_hstep * 4);
                        _c23 = vld1q_u16(pC + c_hstep * 5);
                        _c45 = vld1q_u16(pC + c_hstep * 6);
                        _c67 = vld1q_u16(pC + c_hstep * 7);
                        transpose8x4_u16(_c01, _c23, _c45, _c67);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        _c2 = bfloat2float(vget_low_u16(_c23));
                        _c3 = bfloat2float(vget_high_u16(_c23));
                        _c4 = bfloat2float(vget_low_u16(_c45));
                        _c5 = bfloat2float(vget_high_u16(_c45));
                        _c6 = bfloat2float(vget_low_u16(_c67));
                        _c7 = bfloat2float(vget_high_u16(_c67));
                        if (beta == 1.f)
                        {
                            _f8 = vaddq_f32(_f8, _c0);
                            _f9 = vaddq_f32(_f9, _c1);
                            _fa = vaddq_f32(_fa, _c2);
                            _fb = vaddq_f32(_fb, _c3);
                            _fc = vaddq_f32(_fc, _c4);
                            _fd = vaddq_f32(_fd, _c5);
                            _fe = vaddq_f32(_fe, _c6);
                            _ff = vaddq_f32(_ff, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f8 = vmlaq_f32(_f8, _c0, _beta);
                            _f9 = vmlaq_f32(_f9, _c1, _beta);
                            _fa = vmlaq_f32(_fa, _c2, _beta);
                            _fb = vmlaq_f32(_fb, _c3, _beta);
                            _fc = vmlaq_f32(_fc, _c4, _beta);
                            _fd = vmlaq_f32(_fd, _c5, _beta);
                            _fe = vmlaq_f32(_fe, _c6, _beta);
                            _ff = vmlaq_f32(_ff, _c7, _beta);
                        }
                        pC += 8;
                    }
                }
                if (broadcast_type_C == 4)
                {
                    uint16x8_t _cc = vld1q_u16(pC);
                    float32x4_t _cc0 = bfloat2float(vget_low_u16(_cc));
                    float32x4_t _cc1 = bfloat2float(vget_high_u16(_cc));
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _cc0 = vmulq_f32(_cc0, _beta);
                        _cc1 = vmulq_f32(_cc1, _beta);
                    }
                    _c0 = vdupq_laneq_f32(_cc0, 0);
                    _c1 = vdupq_laneq_f32(_cc0, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_cc0, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_cc0, 3);
                    float32x4_t _c4 = vdupq_laneq_f32(_cc1, 0);
                    float32x4_t _c5 = vdupq_laneq_f32(_cc1, 1);
                    float32x4_t _c6 = vdupq_laneq_f32(_cc1, 2);
                    float32x4_t _c7 = vdupq_laneq_f32(_cc1, 3);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c4);
                    _f5 = vaddq_f32(_f5, _c5);
                    _f6 = vaddq_f32(_f6, _c6);
                    _f7 = vaddq_f32(_f7, _c7);
                    _f8 = vaddq_f32(_f8, _c0);
                    _f9 = vaddq_f32(_f9, _c1);
                    _fa = vaddq_f32(_fa, _c2);
                    _fb = vaddq_f32(_fb, _c3);
                    _fc = vaddq_f32(_fc, _c4);
                    _fd = vaddq_f32(_fd, _c5);
                    _fe = vaddq_f32(_fe, _c6);
                    _ff = vaddq_f32(_ff, _c7);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
                _f8 = vmulq_f32(_f8, _alpha);
                _f9 = vmulq_f32(_f9, _alpha);
                _fa = vmulq_f32(_fa, _alpha);
                _fb = vmulq_f32(_fb, _alpha);
                _fc = vmulq_f32(_fc, _alpha);
                _fd = vmulq_f32(_fd, _alpha);
                _fe = vmulq_f32(_fe, _alpha);
                _ff = vmulq_f32(_ff, _alpha);
            }

            uint16x4_t _bf0 = float2bfloat(_f0);
            uint16x4_t _bf1 = float2bfloat(_f1);
            uint16x4_t _bf2 = float2bfloat(_f2);
            uint16x4_t _bf3 = float2bfloat(_f3);
            uint16x4_t _bf4 = float2bfloat(_f4);
            uint16x4_t _bf5 = float2bfloat(_f5);
            uint16x4_t _bf6 = float2bfloat(_f6);
            uint16x4_t _bf7 = float2bfloat(_f7);
            uint16x4_t _bf8 = float2bfloat(_f8);
            uint16x4_t _bf9 = float2bfloat(_f9);
            uint16x4_t _bfa = float2bfloat(_fa);
            uint16x4_t _bfb = float2bfloat(_fb);
            uint16x4_t _bfc = float2bfloat(_fc);
            uint16x4_t _bfd = float2bfloat(_fd);
            uint16x4_t _bfe = float2bfloat(_fe);
            uint16x4_t _bff = float2bfloat(_ff);

            if (out_elempack == 4)
            {
                vst1q_u16(p0, vcombine_u16(_bf0, _bf1));
                vst1q_u16(p0 + 8, vcombine_u16(_bf2, _bf3));
                vst1q_u16(p0 + 16, vcombine_u16(_bf4, _bf5));
                vst1q_u16(p0 + 24, vcombine_u16(_bf6, _bf7));
                vst1q_u16(p0 + out_hstep * 4, vcombine_u16(_bf8, _bf9));
                vst1q_u16(p0 + out_hstep * 4 + 8, vcombine_u16(_bfa, _bfb));
                vst1q_u16(p0 + out_hstep * 4 + 16, vcombine_u16(_bfc, _bfd));
                vst1q_u16(p0 + out_hstep * 4 + 24, vcombine_u16(_bfe, _bff));
                p0 += 32;
            }
            if (out_elempack == 1)
            {
                transpose4x4_u16(_bf0, _bf1, _bf2, _bf3);
                transpose4x4_u16(_bf4, _bf5, _bf6, _bf7);
                transpose4x4_u16(_bf8, _bf9, _bfa, _bfb);
                transpose4x4_u16(_bfc, _bfd, _bfe, _bff);
                vst1q_u16(p0, vcombine_u16(_bf0, _bf4));
                vst1q_u16(p0 + out_hstep, vcombine_u16(_bf1, _bf5));
                vst1q_u16(p0 + out_hstep * 2, vcombine_u16(_bf2, _bf6));
                vst1q_u16(p0 + out_hstep * 3, vcombine_u16(_bf3, _bf7));
                vst1q_u16(p0 + out_hstep * 4, vcombine_u16(_bf8, _bfc));
                vst1q_u16(p0 + out_hstep * 5, vcombine_u16(_bf9, _bfd));
                vst1q_u16(p0 + out_hstep * 6, vcombine_u16(_bfa, _bfe));
                vst1q_u16(p0 + out_hstep * 7, vcombine_u16(_bfb, _bff));
                p0 += 8;
            }

            pp += 64;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
#else
            // from
            //      a0 b1 c2 d3
            //      e0 f1 g2 h3
            //      c0 d1 a2 b3
            //      g0 h1 e2 f3
            //      a3 b2 c1 d0
            //      e3 f2 g1 h0
            //      c3 d2 a1 b0
            //      g3 h2 e1 f0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
            {
                _sum4 = vrev64q_s32(_sum4);
                _sum5 = vrev64q_s32(_sum5);
                _sum6 = vrev64q_s32(_sum6);
                _sum7 = vrev64q_s32(_sum7);
                _sum4 = vextq_s32(_sum4, _sum4, 2);
                _sum5 = vextq_s32(_sum5, _sum5, 2);
                _sum6 = vextq_s32(_sum6, _sum6, 2);
                _sum7 = vextq_s32(_sum7, _sum7, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale0);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale0);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale1);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale1);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_sum6), _descale1);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sum7), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c1);
                    _f5 = vaddq_f32(_f5, _c1);
                    _f6 = vaddq_f32(_f6, _c1);
                    _f7 = vaddq_f32(_f7, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    if (c_elempack == 4)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + 8);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        float32x4_t _c2 = bfloat2float(vget_low_u16(_c23));
                        float32x4_t _c3 = bfloat2float(vget_high_u16(_c23));
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                        }
                        _c01 = vld1q_u16(pC + c_hstep * 4);
                        _c23 = vld1q_u16(pC + c_hstep * 4 + 8);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        _c2 = bfloat2float(vget_low_u16(_c23));
                        _c3 = bfloat2float(vget_high_u16(_c23));
                        if (beta == 1.f)
                        {
                            _f4 = vaddq_f32(_f4, _c0);
                            _f5 = vaddq_f32(_f5, _c1);
                            _f6 = vaddq_f32(_f6, _c2);
                            _f7 = vaddq_f32(_f7, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f4 = vmlaq_f32(_f4, _c0, _beta);
                            _f5 = vmlaq_f32(_f5, _c1, _beta);
                            _f6 = vmlaq_f32(_f6, _c2, _beta);
                            _f7 = vmlaq_f32(_f7, _c3, _beta);
                        }
                        pC += 16;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x4_t _cc0 = vld1_u16(pC);
                        uint16x4_t _cc1 = vld1_u16(pC + c_hstep);
                        uint16x4_t _cc2 = vld1_u16(pC + c_hstep * 2);
                        uint16x4_t _cc3 = vld1_u16(pC + c_hstep * 3);
                        transpose4x4_u16(_cc0, _cc1, _cc2, _cc3);
                        _c0 = bfloat2float(_cc0);
                        _c1 = bfloat2float(_cc1);
                        float32x4_t _c2 = bfloat2float(_cc2);
                        float32x4_t _c3 = bfloat2float(_cc3);
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                        }
                        _cc0 = vld1_u16(pC + c_hstep * 4);
                        _cc1 = vld1_u16(pC + c_hstep * 5);
                        _cc2 = vld1_u16(pC + c_hstep * 6);
                        _cc3 = vld1_u16(pC + c_hstep * 7);
                        transpose4x4_u16(_cc0, _cc1, _cc2, _cc3);
                        _c0 = bfloat2float(_cc0);
                        _c1 = bfloat2float(_cc1);
                        _c2 = bfloat2float(_cc2);
                        _c3 = bfloat2float(_cc3);
                        if (beta == 1.f)
                        {
                            _f4 = vaddq_f32(_f4, _c0);
                            _f5 = vaddq_f32(_f5, _c1);
                            _f6 = vaddq_f32(_f6, _c2);
                            _f7 = vaddq_f32(_f7, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f4 = vmlaq_f32(_f4, _c0, _beta);
                            _f5 = vmlaq_f32(_f5, _c1, _beta);
                            _f6 = vmlaq_f32(_f6, _c2, _beta);
                            _f7 = vmlaq_f32(_f7, _c3, _beta);
                        }
                        pC += 4;
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x4_t _c = bfloat2float(vld1_u16(pC));
                    _c = vmulq_n_f32(_c, beta);
#if __aarch64__
                    _c0 = vdupq_laneq_f32(_c, 0);
                    _c1 = vdupq_laneq_f32(_c, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_c, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_c, 3);
#else
                    _c0 = vdupq_lane_f32(vget_low_f32(_c), 0);
                    _c1 = vdupq_lane_f32(vget_low_f32(_c), 1);
                    float32x4_t _c2 = vdupq_lane_f32(vget_high_f32(_c), 0);
                    float32x4_t _c3 = vdupq_lane_f32(vget_high_f32(_c), 1);
#endif
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c1);
                    _f6 = vaddq_f32(_f6, _c2);
                    _f7 = vaddq_f32(_f7, _c3);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
            }

            uint16x4_t _bf0 = float2bfloat(_f0);
            uint16x4_t _bf1 = float2bfloat(_f1);
            uint16x4_t _bf2 = float2bfloat(_f2);
            uint16x4_t _bf3 = float2bfloat(_f3);
            uint16x4_t _bf4 = float2bfloat(_f4);
            uint16x4_t _bf5 = float2bfloat(_f5);
            uint16x4_t _bf6 = float2bfloat(_f6);
            uint16x4_t _bf7 = float2bfloat(_f7);

            if (out_elempack == 4)
            {
                vst1q_u16(p0, vcombine_u16(_bf0, _bf1));
                vst1q_u16(p0 + 8, vcombine_u16(_bf2, _bf3));
                vst1q_u16(p0 + out_hstep * 4, vcombine_u16(_bf4, _bf5));
                vst1q_u16(p0 + out_hstep * 4 + 8, vcombine_u16(_bf6, _bf7));
                p0 += 16;
            }
            if (out_elempack == 1)
            {
                transpose4x4_u16(_bf0, _bf1, _bf2, _bf3);
                transpose4x4_u16(_bf4, _bf5, _bf6, _bf7);
                vst1_u16(p0, _bf0);
                vst1_u16(p0 + out_hstep, _bf1);
                vst1_u16(p0 + out_hstep * 2, _bf2);
                vst1_u16(p0 + out_hstep * 3, _bf3);
                vst1_u16(p0 + out_hstep * 4, _bf4);
                vst1_u16(p0 + out_hstep * 5, _bf5);
                vst1_u16(p0 + out_hstep * 6, _bf6);
                vst1_u16(p0 + out_hstep * 7, _bf7);
                p0 += 4;
            }

            pp += 32;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
#else
            // from
            //      a0 b1 c0 d1
            //      e0 f1 g0 h1
            //      a1 b0 c1 d0
            //      e1 f0 g1 h0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            {
                _sum2 = vrev64q_s32(_sum2);
                _sum3 = vrev64q_s32(_sum3);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum2);
                int32x4x2_t _t1 = vzipq_s32(_sum1, _sum3);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[0]), vget_low_s32(_t1.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[0]), vget_high_s32(_t1.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale1);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c1);
                    _f3 = vaddq_f32(_f3, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    uint16x8_t _c01;
                    uint16x8_t _c23;
                    if (c_elempack == 4)
                    {
                        _c01 = vld1q_u16(pC);
                        _c23 = vld1q_u16(pC + c_hstep * 4);
                        pC += 8;
                    }
                    if (c_elempack == 1)
                    {
                        _c01 = uint16x8_t();
                        _c01 = vsetq_lane_u16(pC[0], _c01, 0);
                        _c01 = vsetq_lane_u16(pC[c_hstep], _c01, 1);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 2], _c01, 2);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 3], _c01, 3);
                        _c01 = vsetq_lane_u16(pC[1], _c01, 4);
                        _c01 = vsetq_lane_u16(pC[c_hstep + 1], _c01, 5);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 2 + 1], _c01, 6);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 3 + 1], _c01, 7);
                        _c23 = uint16x8_t();
                        _c23 = vsetq_lane_u16(pC[c_hstep * 4], _c23, 0);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 5], _c23, 1);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 6], _c23, 2);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 7], _c23, 3);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 4 + 1], _c23, 4);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 5 + 1], _c23, 5);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 6 + 1], _c23, 6);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 7 + 1], _c23, 7);
                        pC += 2;
                    }
                    _c0 = bfloat2float(vget_low_u16(_c01));
                    _c1 = bfloat2float(vget_high_u16(_c01));
                    float32x4_t _c2 = bfloat2float(vget_low_u16(_c23));
                    float32x4_t _c3 = bfloat2float(vget_high_u16(_c23));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(bfloat16_to_float32(pC[0]) * beta);
                    _c1 = vdupq_n_f32(bfloat16_to_float32(pC[1]) * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c1);
                    pC += 2;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            uint16x4_t _bf0 = float2bfloat(_f0);
            uint16x4_t _bf1 = float2bfloat(_f1);
            uint16x4_t _bf2 = float2bfloat(_f2);
            uint16x4_t _bf3 = float2bfloat(_f3);

            if (out_elempack == 4)
            {
                vst1q_u16(p0, vcombine_u16(_bf0, _bf1));
                vst1q_u16(p0 + out_hstep * 4, vcombine_u16(_bf2, _bf3));
                p0 += 8;
            }
            if (out_elempack == 1)
            {
                p0[0] = vget_lane_u16(_bf0, 0);
                p0[1] = vget_lane_u16(_bf1, 0);
                p0[out_hstep] = vget_lane_u16(_bf0, 1);
                p0[out_hstep + 1] = vget_lane_u16(_bf1, 1);
                p0[out_hstep * 2] = vget_lane_u16(_bf0, 2);
                p0[out_hstep * 2 + 1] = vget_lane_u16(_bf1, 2);
                p0[out_hstep * 3] = vget_lane_u16(_bf0, 3);
                p0[out_hstep * 3 + 1] = vget_lane_u16(_bf1, 3);
                p0[out_hstep * 4] = vget_lane_u16(_bf2, 0);
                p0[out_hstep * 4 + 1] = vget_lane_u16(_bf3, 0);
                p0[out_hstep * 5] = vget_lane_u16(_bf2, 1);
                p0[out_hstep * 5 + 1] = vget_lane_u16(_bf3, 1);
                p0[out_hstep * 6] = vget_lane_u16(_bf2, 2);
                p0[out_hstep * 6 + 1] = vget_lane_u16(_bf3, 2);
                p0[out_hstep * 7] = vget_lane_u16(_bf2, 3);
                p0[out_hstep * 7 + 1] = vget_lane_u16(_bf3, 3);
                p0 += 2;
            }

            pp += 16;
        }
        for (; jj < max_jj; jj++)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    if (c_elempack == 4)
                    {
                        _c0 = bfloat2float(vld1_u16(pC));
                        _c1 = bfloat2float(vld1_u16(pC + c_hstep * 4));
                        pC += 4;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x8_t _c01 = uint16x8_t();
                        _c01 = vsetq_lane_u16(pC[0], _c01, 0);
                        _c01 = vsetq_lane_u16(pC[c_hstep], _c01, 1);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 2], _c01, 2);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 3], _c01, 3);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 4], _c01, 4);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 5], _c01, 5);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 6], _c01, 6);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 7], _c01, 7);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        pC += 1;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(bfloat16_to_float32(pC[0]) * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    pC += 1;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            uint16x4_t _bf0 = float2bfloat(_f0);
            uint16x4_t _bf1 = float2bfloat(_f1);

            if (out_elempack == 4)
            {
                vst1_u16(p0, _bf0);
                vst1_u16(p0 + out_hstep * 4, _bf1);
                p0 += 4;
            }
            if (out_elempack == 1)
            {
                p0[0] = vget_lane_u16(_bf0, 0);
                p0[out_hstep] = vget_lane_u16(_bf0, 1);
                p0[out_hstep * 2] = vget_lane_u16(_bf0, 2);
                p0[out_hstep * 3] = vget_lane_u16(_bf0, 3);
                p0[out_hstep * 4] = vget_lane_u16(_bf1, 0);
                p0[out_hstep * 5] = vget_lane_u16(_bf1, 1);
                p0[out_hstep * 6] = vget_lane_u16(_bf1, 2);
                p0[out_hstep * 7] = vget_lane_u16(_bf1, 3);
                p0++;
            }

            pp += 8;
        }
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        float32x4_t _descale = vld1q_f32((const float*)descales + i + ii);

        float32x4_t _c0;
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                _c0 = vdupq_n_f32(bfloat16_to_float32(pC[0]) * beta);
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const unsigned short*)C + i + ii;
                _c0 = bfloat2float(vld1_u16(pC));
                _c0 = vmulq_n_f32(_c0, beta);
            }
            if (broadcast_type_C == 3)
            {
                pC = (const unsigned short*)C + (i + ii) * c_hstep + j * c_elempack;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const unsigned short*)C + j;
            }
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
#else
            // from
            //      a0 b1 c2 d3
            //      a4 b5 c6 d7
            //      c0 d1 a2 b3
            //      c4 d5 a6 b7
            //      a3 b2 c1 d0
            //      a7 b6 c5 d4
            //      c3 d2 a1 b0
            //      c7 d6 a5 b4

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            {
                _sum4 = vrev64q_s32(_sum4);
                _sum5 = vrev64q_s32(_sum5);
                _sum6 = vrev64q_s32(_sum6);
                _sum7 = vrev64q_s32(_sum7);
                _sum4 = vextq_s32(_sum4, _sum4, 2);
                _sum5 = vextq_s32(_sum5, _sum5, 2);
                _sum6 = vextq_s32(_sum6, _sum6, 2);
                _sum7 = vextq_s32(_sum7, _sum7, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_sum6), _descale);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sum7), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    uint16x8_t _c01;
                    uint16x8_t _c23;
                    uint16x8_t _c45;
                    uint16x8_t _c67;
                    if (c_elempack == 4)
                    {
                        _c01 = vld1q_u16(pC);
                        _c23 = vld1q_u16(pC + 8);
                        _c45 = vld1q_u16(pC + 16);
                        _c67 = vld1q_u16(pC + 24);
                        pC += 32;
                    }
                    if (c_elempack == 1)
                    {
                        _c01 = vld1q_u16(pC);
                        _c23 = vld1q_u16(pC + c_hstep);
                        _c45 = vld1q_u16(pC + c_hstep * 2);
                        _c67 = vld1q_u16(pC + c_hstep * 3);
                        transpose8x4_u16(_c01, _c23, _c45, _c67);
                        pC += 8;
                    }
                    _c0 = bfloat2float(vget_low_u16(_c01));
                    float32x4_t _c1 = bfloat2float(vget_high_u16(_c01));
                    float32x4_t _c2 = bfloat2float(vget_low_u16(_c23));
                    float32x4_t _c3 = bfloat2float(vget_high_u16(_c23));
                    float32x4_t _c4 = bfloat2float(vget_low_u16(_c45));
                    float32x4_t _c5 = bfloat2float(vget_high_u16(_c45));
                    float32x4_t _c6 = bfloat2float(vget_low_u16(_c67));
                    float32x4_t _c7 = bfloat2float(vget_high_u16(_c67));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                        _f4 = vaddq_f32(_f4, _c4);
                        _f5 = vaddq_f32(_f5, _c5);
                        _f6 = vaddq_f32(_f6, _c6);
                        _f7 = vaddq_f32(_f7, _c7);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                        _f4 = vmlaq_f32(_f4, _c4, _beta);
                        _f5 = vmlaq_f32(_f5, _c5, _beta);
                        _f6 = vmlaq_f32(_f6, _c6, _beta);
                        _f7 = vmlaq_f32(_f7, _c7, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    uint16x8_t _c = vld1q_u16(pC);
                    float32x4_t _cc0 = bfloat2float(vget_low_u16(_c));
                    float32x4_t _cc1 = bfloat2float(vget_high_u16(_c));
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _cc0 = vmulq_f32(_cc0, _beta);
                        _cc1 = vmulq_f32(_cc1, _beta);
                    }
                    _c0 = vdupq_laneq_f32(_cc0, 0);
                    float32x4_t _c1 = vdupq_laneq_f32(_cc0, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_cc0, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_cc0, 3);
                    float32x4_t _c4 = vdupq_laneq_f32(_cc1, 0);
                    float32x4_t _c5 = vdupq_laneq_f32(_cc1, 1);
                    float32x4_t _c6 = vdupq_laneq_f32(_cc1, 2);
                    float32x4_t _c7 = vdupq_laneq_f32(_cc1, 3);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c4);
                    _f5 = vaddq_f32(_f5, _c5);
                    _f6 = vaddq_f32(_f6, _c6);
                    _f7 = vaddq_f32(_f7, _c7);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
            }

            uint16x4_t _bf0 = float2bfloat(_f0);
            uint16x4_t _bf1 = float2bfloat(_f1);
            uint16x4_t _bf2 = float2bfloat(_f2);
            uint16x4_t _bf3 = float2bfloat(_f3);
            uint16x4_t _bf4 = float2bfloat(_f4);
            uint16x4_t _bf5 = float2bfloat(_f5);
            uint16x4_t _bf6 = float2bfloat(_f6);
            uint16x4_t _bf7 = float2bfloat(_f7);

            if (out_elempack == 4)
            {
                vst1q_u16(p0, vcombine_u16(_bf0, _bf1));
                vst1q_u16(p0 + 8, vcombine_u16(_bf2, _bf3));
                vst1q_u16(p0 + 16, vcombine_u16(_bf4, _bf5));
                vst1q_u16(p0 + 24, vcombine_u16(_bf6, _bf7));
                p0 += 32;
            }
            if (out_elempack == 1)
            {
                transpose4x4_u16(_bf0, _bf1, _bf2, _bf3);
                transpose4x4_u16(_bf4, _bf5, _bf6, _bf7);
                vst1q_u16(p0, vcombine_u16(_bf0, _bf4));
                vst1q_u16(p0 + out_hstep, vcombine_u16(_bf1, _bf5));
                vst1q_u16(p0 + out_hstep * 2, vcombine_u16(_bf2, _bf6));
                vst1q_u16(p0 + out_hstep * 3, vcombine_u16(_bf3, _bf7));
                p0 += 8;
            }

            pp += 32;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
#else
            // from
            //      a0 b1 c2 d3
            //      c0 d1 a2 b3
            //      a3 b2 c1 d0
            //      c3 d2 a1 b0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            {
                _sum2 = vrev64q_s32(_sum2);
                _sum3 = vrev64q_s32(_sum3);
                _sum2 = vextq_s32(_sum2, _sum2, 2);
                _sum3 = vextq_s32(_sum3, _sum3, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum3);
                int32x4x2_t _t1 = vzipq_s32(_sum1, _sum2);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    float32x4_t _c1;
                    float32x4_t _c2;
                    float32x4_t _c3;
                    if (c_elempack == 4)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + 8);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        _c2 = bfloat2float(vget_low_u16(_c23));
                        _c3 = bfloat2float(vget_high_u16(_c23));
                        pC += 16;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x4_t _cc0 = vld1_u16(pC);
                        uint16x4_t _cc1 = vld1_u16(pC + c_hstep * 1);
                        uint16x4_t _cc2 = vld1_u16(pC + c_hstep * 2);
                        uint16x4_t _cc3 = vld1_u16(pC + c_hstep * 3);
                        transpose4x4_u16(_cc0, _cc1, _cc2, _cc3);
                        _c0 = bfloat2float(_cc0);
                        _c1 = bfloat2float(_cc1);
                        _c2 = bfloat2float(_cc2);
                        _c3 = bfloat2float(_cc3);
                        pC += 4;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x4_t _c = bfloat2float(vld1_u16(pC));
                    _c = vmulq_n_f32(_c, beta);
#if __aarch64__
                    _c0 = vdupq_laneq_f32(_c, 0);
                    float32x4_t _c1 = vdupq_laneq_f32(_c, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_c, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_c, 3);
#else
                    _c0 = vdupq_lane_f32(vget_low_f32(_c), 0);
                    float32x4_t _c1 = vdupq_lane_f32(vget_low_f32(_c), 1);
                    float32x4_t _c2 = vdupq_lane_f32(vget_high_f32(_c), 0);
                    float32x4_t _c3 = vdupq_lane_f32(vget_high_f32(_c), 1);
#endif
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            uint16x4_t _bf0 = float2bfloat(_f0);
            uint16x4_t _bf1 = float2bfloat(_f1);
            uint16x4_t _bf2 = float2bfloat(_f2);
            uint16x4_t _bf3 = float2bfloat(_f3);

            if (out_elempack == 4)
            {
                vst1q_u16(p0, vcombine_u16(_bf0, _bf1));
                vst1q_u16(p0 + 8, vcombine_u16(_bf2, _bf3));
                p0 += 16;
            }
            if (out_elempack == 1)
            {
                transpose4x4_u16(_bf0, _bf1, _bf2, _bf3);
                vst1_u16(p0, _bf0);
                vst1_u16(p0 + out_hstep, _bf1);
                vst1_u16(p0 + out_hstep * 2, _bf2);
                vst1_u16(p0 + out_hstep * 3, _bf3);
                p0 += 4;
            }

            pp += 16;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
#else
            // from
            //      a0 b1 c0 d1
            //      a1 b0 c1 d0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            {
                _sum1 = vrev64q_s32(_sum1);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum1);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                _sum1 = vrev64q_s32(_sum1);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    uint16x8_t _c;
                    if (c_elempack == 4)
                    {
                        _c = vld1q_u16(pC);
                        pC += 8;
                    }
                    if (c_elempack == 1)
                    {
                        _c = uint16x8_t();
                        _c = vsetq_lane_u16(pC[0], _c, 0);
                        _c = vsetq_lane_u16(pC[c_hstep], _c, 1);
                        _c = vsetq_lane_u16(pC[c_hstep * 2], _c, 2);
                        _c = vsetq_lane_u16(pC[c_hstep * 3], _c, 3);
                        _c = vsetq_lane_u16(pC[1], _c, 4);
                        _c = vsetq_lane_u16(pC[c_hstep + 1], _c, 5);
                        _c = vsetq_lane_u16(pC[c_hstep * 2 + 1], _c, 6);
                        _c = vsetq_lane_u16(pC[c_hstep * 3 + 1], _c, 7);
                        pC += 2;
                    }
                    _c0 = bfloat2float(vget_low_u16(_c));
                    float32x4_t _c1 = bfloat2float(vget_high_u16(_c));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(bfloat16_to_float32(pC[0]) * beta);
                    float32x4_t _c1 = vdupq_n_f32(bfloat16_to_float32(pC[1]) * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    pC += 2;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            uint16x4_t _bf0 = float2bfloat(_f0);
            uint16x4_t _bf1 = float2bfloat(_f1);

            if (out_elempack == 4)
            {
                vst1q_u16(p0, vcombine_u16(_bf0, _bf1));
                p0 += 8;
            }
            if (out_elempack == 1)
            {
                p0[0] = vget_lane_u16(_bf0, 0);
                p0[1] = vget_lane_u16(_bf1, 0);
                p0[out_hstep] = vget_lane_u16(_bf0, 1);
                p0[out_hstep + 1] = vget_lane_u16(_bf1, 1);
                p0[out_hstep * 2] = vget_lane_u16(_bf0, 2);
                p0[out_hstep * 2 + 1] = vget_lane_u16(_bf1, 2);
                p0[out_hstep * 3] = vget_lane_u16(_bf0, 3);
                p0[out_hstep * 3 + 1] = vget_lane_u16(_bf1, 3);
                p0 += 2;
            }

            pp += 8;
        }
        for (; jj < max_jj; jj++)
        {
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp)), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    uint16x4_t _c;
                    if (c_elempack == 4)
                    {
                        _c = vld1_u16(pC);
                        pC += 4;
                    }
                    if (c_elempack == 1)
                    {
                        _c = uint16x4_t();
                        _c = vset_lane_u16(pC[0], _c, 0);
                        _c = vset_lane_u16(pC[c_hstep], _c, 1);
                        _c = vset_lane_u16(pC[c_hstep * 2], _c, 2);
                        _c = vset_lane_u16(pC[c_hstep * 3], _c, 3);
                        pC += 1;
                    }
                    _c0 = bfloat2float(_c);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(bfloat16_to_float32(pC[0]) * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    pC += 1;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            uint16x4_t _bf0 = float2bfloat(_f0);

            if (out_elempack == 4)
            {
                vst1_u16(p0, _bf0);
                p0 += 4;
            }
            if (out_elempack == 1)
            {
                p0[0] = vget_lane_u16(_bf0, 0);
                p0[out_hstep] = vget_lane_u16(_bf0, 1);
                p0[out_hstep * 2] = vget_lane_u16(_bf0, 2);
                p0[out_hstep * 3] = vget_lane_u16(_bf0, 3);
                p0++;
            }

            pp += 4;
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        // out_elempack == 1
        unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j;

        const float descale0 = descales[i + ii];
        const float descale1 = descales[i + ii + 1];
#if __ARM_NEON
        float32x2_t _descale = vld1_f32((const float*)descales + i + ii);
#endif

        float c0;
        float c1;
#if __ARM_NEON
        float32x4_t _c0;
        float32x4_t _c1;
#endif
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                c0 = bfloat16_to_float32(pC[0]) * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const unsigned short*)C + i + ii;
                c0 = bfloat16_to_float32(pC[0]) * beta;
                c1 = bfloat16_to_float32(pC[1]) * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
                _c1 = vdupq_n_f32(c1);
#endif
            }
            if (broadcast_type_C == 3)
            {
                // c_elempack == 1
                pC = (const unsigned short*)C + (i + ii) * c_hstep + j;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const unsigned short*)C + j;
            }
        }

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

            float32x4_t _f0 = vmulq_lane_f32(vcvtq_f32_s32(_sum0), _descale, 0);
            float32x4_t _f1 = vmulq_lane_f32(vcvtq_f32_s32(_sum1), _descale, 0);
            float32x4_t _f2 = vmulq_lane_f32(vcvtq_f32_s32(_sum2), _descale, 1);
            float32x4_t _f3 = vmulq_lane_f32(vcvtq_f32_s32(_sum3), _descale, 1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c1);
                    _f3 = vaddq_f32(_f3, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    uint16x8_t _c01 = vld1q_u16(pC);
                    uint16x8_t _c23 = vld1q_u16(pC + c_hstep);
                    _c0 = bfloat2float(vget_low_u16(_c01));
                    float32x4_t _c1 = bfloat2float(vget_high_u16(_c01));
                    float32x4_t _c2 = bfloat2float(vget_low_u16(_c23));
                    float32x4_t _c3 = bfloat2float(vget_high_u16(_c23));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                    pC += 8;
                }
                if (broadcast_type_C == 4)
                {
                    uint16x8_t _c = vld1q_u16(pC);
                    _c0 = bfloat2float(vget_low_u16(_c));
                    float32x4_t _c1 = bfloat2float(vget_high_u16(_c));
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _c0 = vmulq_f32(_c0, _beta);
                        _c1 = vmulq_f32(_c1, _beta);
                    }
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c1);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            vst1q_u16(p0, vcombine_u16(float2bfloat(_f0), float2bfloat(_f1)));
            vst1q_u16(p0 + out_hstep, vcombine_u16(float2bfloat(_f2), float2bfloat(_f3)));

            pp += 16;
            p0 += 8;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

            float32x4_t _f0 = vmulq_lane_f32(vcvtq_f32_s32(_sum0), _descale, 0);
            float32x4_t _f1 = vmulq_lane_f32(vcvtq_f32_s32(_sum1), _descale, 1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    _c0 = bfloat2float(vld1_u16(pC));
                    float32x4_t _c1 = bfloat2float(vld1_u16(pC + c_hstep));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                    pC += 4;
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = bfloat2float(vld1_u16(pC));
                    _c0 = vmulq_n_f32(_c0, beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            vst1_u16(p0, float2bfloat(_f0));
            vst1_u16(p0 + out_hstep, float2bfloat(_f1));

            pp += 8;
            p0 += 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            int32x4_t _sum0 = vld1q_s32(pp);

            float32x2x2_t _descale01 = vzip_f32(_descale, _descale);
            float32x4_t _descale0011 = vcombine_f32(_descale01.val[0], _descale01.val[1]);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0011);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    float32x4_t _c0011 = vcombine_f32(vget_low_f32(_c0), vget_high_f32(_c1));
                    _f0 = vaddq_f32(_f0, _c0011);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    uint16x4_t _c = uint16x4_t();
                    _c = vset_lane_u16(pC[0], _c, 0);
                    _c = vset_lane_u16(pC[1], _c, 1);
                    _c = vset_lane_u16(pC[c_hstep], _c, 2);
                    _c = vset_lane_u16(pC[c_hstep + 1], _c, 3);
                    _c0 = bfloat2float(_c);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 2;
                }
                if (broadcast_type_C == 4)
                {
                    uint16x4_t _c = uint16x4_t();
                    _c = vset_lane_u16(pC[0], _c, 0);
                    _c = vset_lane_u16(pC[1], _c, 1);
                    _c = vset_lane_u16(pC[0], _c, 2);
                    _c = vset_lane_u16(pC[1], _c, 3);
                    _c0 = bfloat2float(_c);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 2;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            uint16x4_t _bf0 = float2bfloat(_f0);

            p0[0] = vget_lane_u16(_bf0, 0);
            p0[1] = vget_lane_u16(_bf0, 1);
            p0[out_hstep] = vget_lane_u16(_bf0, 2);
            p0[out_hstep + 1] = vget_lane_u16(_bf0, 3);

            pp += 4;
            p0 += 2;
        }
#endif // __ARM_NEON
        for (; jj < max_jj; jj++)
        {
            float f0 = pp[0] * descale0;
            float f1 = pp[1] * descale1;

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    f0 += c0;
                    f1 += c0;
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    f0 += c0;
                    f1 += c1;
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    f0 += bfloat16_to_float32(pC[0]) * beta;
                    f1 += bfloat16_to_float32(pC[c_hstep]) * beta;
                    pC += 1;
                }
                if (broadcast_type_C == 4)
                {
                    f0 += bfloat16_to_float32(pC[0]) * beta;
                    f1 += bfloat16_to_float32(pC[0]) * beta;
                    pC += 1;
                }
            }

            if (alpha != 1.f)
            {
                f0 *= alpha;
                f1 *= alpha;
            }

            p0[0] = float32_to_bfloat16(f0);
            p0[out_hstep] = float32_to_bfloat16(f1);

            pp += 2;
            p0++;
        }
    }
    for (; ii < max_ii; ii += 1)
    {
        // out_elempack == 1
        unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j;

        const float descale = descales[i + ii];
#if __ARM_NEON
        float32x4_t _descale = vdupq_n_f32(descale);
#endif

        float c0;
#if __ARM_NEON
        float32x4_t _c0;
#endif
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                c0 = bfloat16_to_float32(pC[0]) * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const unsigned short*)C + i + ii;
                c0 = bfloat16_to_float32(pC[0]) * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 3)
            {
                // c_elempack == 1
                pC = (const unsigned short*)C + (i + ii) * c_hstep + j;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const unsigned short*)C + j;
            }
        }

        int jj = 0;
#if __ARM_NEON
        for (; jj + 15 < max_jj; jj += 16)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    uint16x8_t _c01 = vld1q_u16(pC);
                    uint16x8_t _c23 = vld1q_u16(pC + 8);
                    _c0 = bfloat2float(vget_low_u16(_c01));
                    float32x4_t _c1 = bfloat2float(vget_high_u16(_c01));
                    float32x4_t _c2 = bfloat2float(vget_low_u16(_c23));
                    float32x4_t _c3 = bfloat2float(vget_high_u16(_c23));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                    pC += 16;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            vst1q_u16(p0, vcombine_u16(float2bfloat(_f0), float2bfloat(_f1)));
            vst1q_u16(p0 + 8, vcombine_u16(float2bfloat(_f2), float2bfloat(_f3)));

            pp += 16;
            p0 += 16;
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    uint16x8_t _c01 = vld1q_u16(pC);
                    _c0 = bfloat2float(vget_low_u16(_c01));
                    float32x4_t _c1 = bfloat2float(vget_high_u16(_c01));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            vst1q_u16(p0, vcombine_u16(float2bfloat(_f0), float2bfloat(_f1)));

            pp += 8;
            p0 += 8;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp)), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    _c0 = bfloat2float(vld1_u16(pC));
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 4;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            vst1_u16(p0, float2bfloat(_f0));

            pp += 4;
            p0 += 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            float32x2_t _f0 = vmul_f32(vcvt_f32_s32(vld1_s32(pp)), vget_low_f32(_descale));

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vadd_f32(_f0, vget_low_f32(_c0));
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    float32x2_t _cc = float32x2_t();
                    _cc = vset_lane_f32(bfloat16_to_float32(pC[0]), _cc, 0);
                    _cc = vset_lane_f32(bfloat16_to_float32(pC[1]), _cc, 1);
                    _f0 = vmla_n_f32(_f0, _cc, beta);
                    pC += 2;
                }
            }

            _f0 = vmul_n_f32(_f0, alpha);

            p0[0] = float32_to_bfloat16(vget_lane_f32(_f0, 0));
            p0[1] = float32_to_bfloat16(vget_lane_f32(_f0, 1));

            pp += 2;
            p0 += 2;
        }
#endif // __ARM_NEON
        for (; jj < max_jj; jj++)
        {
            float f0 = pp[0] * descale;

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    f0 += c0;
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    f0 += bfloat16_to_float32(pC[0]) * beta;
                    pC += 1;
                }
            }

            f0 *= alpha;

            p0[0] = float32_to_bfloat16(f0);

            pp += 1;
            p0++;
        }
    }
}

static void transpose_unpack_output_tile_int32_to_bf16(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        transpose_unpack_output_tile_int32_to_bf16_asimddp(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
        return;
    }
#endif

    const int out_elempack = top_blob.elempack;
    const size_t out_hstep = top_blob.dims == 3 ? top_blob.cstep : (size_t)top_blob.w;

    const size_t c_hstep = C.dims == 3 ? C.cstep : (size_t)C.w;
    const int c_elempack = C.elempack;
    const unsigned short* pC = C;

    // NCNN_LOGE("transpose_unpack_output_tile_int32_to_bf16  %d %d %d %d  %d  %d  %d", i, max_ii, j, max_jj, out_elempack, broadcast_type_C, c_elempack);

    const int* pp = topT;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack;

        float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii);
        float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4);

        float32x4_t _c0;
        float32x4_t _c1;
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                _c0 = vdupq_n_f32(bfloat16_to_float32(pC[0]) * beta);
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const unsigned short*)C + i + ii;
                uint16x8_t _c = vld1q_u16(pC);
                _c0 = bfloat2float(vget_low_u16(_c));
                _c1 = bfloat2float(vget_high_u16(_c));
                _c0 = vmulq_n_f32(_c0, beta);
                _c1 = vmulq_n_f32(_c1, beta);
            }
            if (broadcast_type_C == 3)
            {
                pC = (const unsigned short*)C + (i + ii) * c_hstep + j * c_elempack;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const unsigned short*)C + j;
            }
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);
            int32x4_t _sum8 = vld1q_s32(pp + 32);
            int32x4_t _sum9 = vld1q_s32(pp + 36);
            int32x4_t _suma = vld1q_s32(pp + 40);
            int32x4_t _sumb = vld1q_s32(pp + 44);
            int32x4_t _sumc = vld1q_s32(pp + 48);
            int32x4_t _sumd = vld1q_s32(pp + 52);
            int32x4_t _sume = vld1q_s32(pp + 56);
            int32x4_t _sumf = vld1q_s32(pp + 60);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            //      e4 f4 g4 h4
            //      e5 f5 g5 h5
            //      e6 f6 g6 h6
            //      e7 f7 g7 h7
#else
            // from
            //      a0 b1 c2 d3
            //      e4 f5 g6 h7
            //      e0 f1 g2 h3
            //      a4 b5 c6 d7
            //      c0 d1 a2 b3
            //      g4 h5 e6 f7
            //      g0 h1 e2 f3
            //      c4 d5 a6 b7
            //      a3 b2 c1 d0
            //      e7 f6 g5 h4
            //      e3 f2 g1 h0
            //      a7 b6 c5 d4
            //      c3 d2 a1 b0
            //      g7 h6 e5 f4
            //      g3 h2 e1 f0
            //      c7 d6 a5 b4

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            //      e4 f4 g4 h4
            //      e5 f5 g5 h5
            //      e6 f6 g6 h6
            //      e7 f7 g7 h7
            {
                _sum8 = vrev64q_s32(_sum8);
                _sum9 = vrev64q_s32(_sum9);
                _suma = vrev64q_s32(_suma);
                _sumb = vrev64q_s32(_sumb);
                _sumc = vrev64q_s32(_sumc);
                _sumd = vrev64q_s32(_sumd);
                _sume = vrev64q_s32(_sume);
                _sumf = vrev64q_s32(_sumf);
                _sum8 = vextq_s32(_sum8, _sum8, 2);
                _sum9 = vextq_s32(_sum9, _sum9, 2);
                _suma = vextq_s32(_suma, _suma, 2);
                _sumb = vextq_s32(_sumb, _sumb, 2);
                _sumc = vextq_s32(_sumc, _sumc, 2);
                _sumd = vextq_s32(_sumd, _sumd, 2);
                _sume = vextq_s32(_sume, _sume, 2);
                _sumf = vextq_s32(_sumf, _sumf, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sumc);
                int32x4x2_t _t1 = vzipq_s32(_sum4, _sum8);
                int32x4x2_t _t2 = vzipq_s32(_sum2, _sume);
                int32x4x2_t _t3 = vzipq_s32(_sum6, _suma);
                int32x4x2_t _t4 = vzipq_s32(_sum3, _sumf);
                int32x4x2_t _t5 = vzipq_s32(_sum7, _sumb);
                int32x4x2_t _t6 = vzipq_s32(_sum1, _sumd);
                int32x4x2_t _t7 = vzipq_s32(_sum5, _sum9);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum8 = vcombine_s32(vget_low_s32(_t4.val[0]), vget_low_s32(_t5.val[0]));
                _sum9 = vcombine_s32(vget_high_s32(_t4.val[0]), vget_high_s32(_t5.val[0]));
                _suma = vcombine_s32(vget_low_s32(_t5.val[1]), vget_low_s32(_t4.val[1]));
                _sumb = vcombine_s32(vget_high_s32(_t5.val[1]), vget_high_s32(_t4.val[1]));
                _sumc = vcombine_s32(vget_low_s32(_t6.val[0]), vget_low_s32(_t7.val[0]));
                _sumd = vcombine_s32(vget_high_s32(_t6.val[0]), vget_high_s32(_t7.val[0]));
                _sume = vcombine_s32(vget_low_s32(_t7.val[1]), vget_low_s32(_t6.val[1]));
                _sumf = vcombine_s32(vget_high_s32(_t7.val[1]), vget_high_s32(_t6.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
                _sum9 = vrev64q_s32(_sum9);
                _sumb = vrev64q_s32(_sumb);
                _sumd = vrev64q_s32(_sumd);
                _sumf = vrev64q_s32(_sumf);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale0);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale0);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum8), _descale0);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum9), _descale0);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_suma), _descale0);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sumb), _descale0);
            float32x4_t _f8 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale1);
            float32x4_t _f9 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale1);
            float32x4_t _fa = vmulq_f32(vcvtq_f32_s32(_sum6), _descale1);
            float32x4_t _fb = vmulq_f32(vcvtq_f32_s32(_sum7), _descale1);
            float32x4_t _fc = vmulq_f32(vcvtq_f32_s32(_sumc), _descale1);
            float32x4_t _fd = vmulq_f32(vcvtq_f32_s32(_sumd), _descale1);
            float32x4_t _fe = vmulq_f32(vcvtq_f32_s32(_sume), _descale1);
            float32x4_t _ff = vmulq_f32(vcvtq_f32_s32(_sumf), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                    _f8 = vaddq_f32(_f8, _c0);
                    _f9 = vaddq_f32(_f9, _c0);
                    _fa = vaddq_f32(_fa, _c0);
                    _fb = vaddq_f32(_fb, _c0);
                    _fc = vaddq_f32(_fc, _c0);
                    _fd = vaddq_f32(_fd, _c0);
                    _fe = vaddq_f32(_fe, _c0);
                    _ff = vaddq_f32(_ff, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                    _f8 = vaddq_f32(_f8, _c1);
                    _f9 = vaddq_f32(_f9, _c1);
                    _fa = vaddq_f32(_fa, _c1);
                    _fb = vaddq_f32(_fb, _c1);
                    _fc = vaddq_f32(_fc, _c1);
                    _fd = vaddq_f32(_fd, _c1);
                    _fe = vaddq_f32(_fe, _c1);
                    _ff = vaddq_f32(_ff, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    if (c_elempack == 4)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + 8);
                        uint16x8_t _c45 = vld1q_u16(pC + 16);
                        uint16x8_t _c67 = vld1q_u16(pC + 24);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        float32x4_t _c2 = bfloat2float(vget_low_u16(_c23));
                        float32x4_t _c3 = bfloat2float(vget_high_u16(_c23));
                        float32x4_t _c4 = bfloat2float(vget_low_u16(_c45));
                        float32x4_t _c5 = bfloat2float(vget_high_u16(_c45));
                        float32x4_t _c6 = bfloat2float(vget_low_u16(_c67));
                        float32x4_t _c7 = bfloat2float(vget_high_u16(_c67));
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                            _f4 = vaddq_f32(_f4, _c4);
                            _f5 = vaddq_f32(_f5, _c5);
                            _f6 = vaddq_f32(_f6, _c6);
                            _f7 = vaddq_f32(_f7, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                            _f4 = vmlaq_f32(_f4, _c4, _beta);
                            _f5 = vmlaq_f32(_f5, _c5, _beta);
                            _f6 = vmlaq_f32(_f6, _c6, _beta);
                            _f7 = vmlaq_f32(_f7, _c7, _beta);
                        }
                        _c01 = vld1q_u16(pC + c_hstep * 4);
                        _c23 = vld1q_u16(pC + c_hstep * 4 + 8);
                        _c45 = vld1q_u16(pC + c_hstep * 4 + 16);
                        _c67 = vld1q_u16(pC + c_hstep * 4 + 24);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        _c2 = bfloat2float(vget_low_u16(_c23));
                        _c3 = bfloat2float(vget_high_u16(_c23));
                        _c4 = bfloat2float(vget_low_u16(_c45));
                        _c5 = bfloat2float(vget_high_u16(_c45));
                        _c6 = bfloat2float(vget_low_u16(_c67));
                        _c7 = bfloat2float(vget_high_u16(_c67));
                        if (beta == 1.f)
                        {
                            _f8 = vaddq_f32(_f8, _c0);
                            _f9 = vaddq_f32(_f9, _c1);
                            _fa = vaddq_f32(_fa, _c2);
                            _fb = vaddq_f32(_fb, _c3);
                            _fc = vaddq_f32(_fc, _c4);
                            _fd = vaddq_f32(_fd, _c5);
                            _fe = vaddq_f32(_fe, _c6);
                            _ff = vaddq_f32(_ff, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f8 = vmlaq_f32(_f8, _c0, _beta);
                            _f9 = vmlaq_f32(_f9, _c1, _beta);
                            _fa = vmlaq_f32(_fa, _c2, _beta);
                            _fb = vmlaq_f32(_fb, _c3, _beta);
                            _fc = vmlaq_f32(_fc, _c4, _beta);
                            _fd = vmlaq_f32(_fd, _c5, _beta);
                            _fe = vmlaq_f32(_fe, _c6, _beta);
                            _ff = vmlaq_f32(_ff, _c7, _beta);
                        }
                        pC += 32;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + c_hstep);
                        uint16x8_t _c45 = vld1q_u16(pC + c_hstep * 2);
                        uint16x8_t _c67 = vld1q_u16(pC + c_hstep * 3);
                        transpose8x4_u16(_c01, _c23, _c45, _c67);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        float32x4_t _c2 = bfloat2float(vget_low_u16(_c23));
                        float32x4_t _c3 = bfloat2float(vget_high_u16(_c23));
                        float32x4_t _c4 = bfloat2float(vget_low_u16(_c45));
                        float32x4_t _c5 = bfloat2float(vget_high_u16(_c45));
                        float32x4_t _c6 = bfloat2float(vget_low_u16(_c67));
                        float32x4_t _c7 = bfloat2float(vget_high_u16(_c67));
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                            _f4 = vaddq_f32(_f4, _c4);
                            _f5 = vaddq_f32(_f5, _c5);
                            _f6 = vaddq_f32(_f6, _c6);
                            _f7 = vaddq_f32(_f7, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                            _f4 = vmlaq_f32(_f4, _c4, _beta);
                            _f5 = vmlaq_f32(_f5, _c5, _beta);
                            _f6 = vmlaq_f32(_f6, _c6, _beta);
                            _f7 = vmlaq_f32(_f7, _c7, _beta);
                        }
                        _c01 = vld1q_u16(pC + c_hstep * 4);
                        _c23 = vld1q_u16(pC + c_hstep * 5);
                        _c45 = vld1q_u16(pC + c_hstep * 6);
                        _c67 = vld1q_u16(pC + c_hstep * 7);
                        transpose8x4_u16(_c01, _c23, _c45, _c67);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        _c2 = bfloat2float(vget_low_u16(_c23));
                        _c3 = bfloat2float(vget_high_u16(_c23));
                        _c4 = bfloat2float(vget_low_u16(_c45));
                        _c5 = bfloat2float(vget_high_u16(_c45));
                        _c6 = bfloat2float(vget_low_u16(_c67));
                        _c7 = bfloat2float(vget_high_u16(_c67));
                        if (beta == 1.f)
                        {
                            _f8 = vaddq_f32(_f8, _c0);
                            _f9 = vaddq_f32(_f9, _c1);
                            _fa = vaddq_f32(_fa, _c2);
                            _fb = vaddq_f32(_fb, _c3);
                            _fc = vaddq_f32(_fc, _c4);
                            _fd = vaddq_f32(_fd, _c5);
                            _fe = vaddq_f32(_fe, _c6);
                            _ff = vaddq_f32(_ff, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f8 = vmlaq_f32(_f8, _c0, _beta);
                            _f9 = vmlaq_f32(_f9, _c1, _beta);
                            _fa = vmlaq_f32(_fa, _c2, _beta);
                            _fb = vmlaq_f32(_fb, _c3, _beta);
                            _fc = vmlaq_f32(_fc, _c4, _beta);
                            _fd = vmlaq_f32(_fd, _c5, _beta);
                            _fe = vmlaq_f32(_fe, _c6, _beta);
                            _ff = vmlaq_f32(_ff, _c7, _beta);
                        }
                        pC += 8;
                    }
                }
                if (broadcast_type_C == 4)
                {
                    uint16x8_t _c = vld1q_u16(pC);
                    float32x4_t _cc0 = bfloat2float(vget_low_u16(_c));
                    float32x4_t _cc1 = bfloat2float(vget_high_u16(_c));
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _cc0 = vmulq_f32(_cc0, _beta);
                        _cc1 = vmulq_f32(_cc1, _beta);
                    }
                    _c0 = vdupq_laneq_f32(_cc0, 0);
                    _c1 = vdupq_laneq_f32(_cc0, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_cc0, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_cc0, 3);
                    float32x4_t _c4 = vdupq_laneq_f32(_cc1, 0);
                    float32x4_t _c5 = vdupq_laneq_f32(_cc1, 1);
                    float32x4_t _c6 = vdupq_laneq_f32(_cc1, 2);
                    float32x4_t _c7 = vdupq_laneq_f32(_cc1, 3);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c4);
                    _f5 = vaddq_f32(_f5, _c5);
                    _f6 = vaddq_f32(_f6, _c6);
                    _f7 = vaddq_f32(_f7, _c7);
                    _f8 = vaddq_f32(_f8, _c0);
                    _f9 = vaddq_f32(_f9, _c1);
                    _fa = vaddq_f32(_fa, _c2);
                    _fb = vaddq_f32(_fb, _c3);
                    _fc = vaddq_f32(_fc, _c4);
                    _fd = vaddq_f32(_fd, _c5);
                    _fe = vaddq_f32(_fe, _c6);
                    _ff = vaddq_f32(_ff, _c7);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
                _f8 = vmulq_f32(_f8, _alpha);
                _f9 = vmulq_f32(_f9, _alpha);
                _fa = vmulq_f32(_fa, _alpha);
                _fb = vmulq_f32(_fb, _alpha);
                _fc = vmulq_f32(_fc, _alpha);
                _fd = vmulq_f32(_fd, _alpha);
                _fe = vmulq_f32(_fe, _alpha);
                _ff = vmulq_f32(_ff, _alpha);
            }

            uint16x8_t _bf0 = vcombine_u16(float2bfloat(_f0), float2bfloat(_f8));
            uint16x8_t _bf1 = vcombine_u16(float2bfloat(_f1), float2bfloat(_f9));
            uint16x8_t _bf2 = vcombine_u16(float2bfloat(_f2), float2bfloat(_fa));
            uint16x8_t _bf3 = vcombine_u16(float2bfloat(_f3), float2bfloat(_fb));
            uint16x8_t _bf4 = vcombine_u16(float2bfloat(_f4), float2bfloat(_fc));
            uint16x8_t _bf5 = vcombine_u16(float2bfloat(_f5), float2bfloat(_fd));
            uint16x8_t _bf6 = vcombine_u16(float2bfloat(_f6), float2bfloat(_fe));
            uint16x8_t _bf7 = vcombine_u16(float2bfloat(_f7), float2bfloat(_ff));

            if (out_elempack == 4)
            {
                uint16x8x4_t _bfa;
                uint16x8x4_t _bfb;
                _bfa.val[0] = _bf0;
                _bfa.val[1] = _bf1;
                _bfa.val[2] = _bf2;
                _bfa.val[3] = _bf3;
                _bfb.val[0] = _bf4;
                _bfb.val[1] = _bf5;
                _bfb.val[2] = _bf6;
                _bfb.val[3] = _bf7;
                vst4q_u16(p0, _bfa);
                vst4q_u16(p0 + out_hstep * 4, _bfb);
            }
            if (out_elempack == 1)
            {
                vst1q_u16(p0, _bf0);
                vst1q_u16(p0 + out_hstep, _bf1);
                vst1q_u16(p0 + out_hstep * 2, _bf2);
                vst1q_u16(p0 + out_hstep * 3, _bf3);
                vst1q_u16(p0 + out_hstep * 4, _bf4);
                vst1q_u16(p0 + out_hstep * 5, _bf5);
                vst1q_u16(p0 + out_hstep * 6, _bf6);
                vst1q_u16(p0 + out_hstep * 7, _bf7);
            }

            pp += 64;
            p0 += out_hstep * 8;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3

#else
            // from
            //      a0 b1 c2 d3
            //      e0 f1 g2 h3
            //      c0 d1 a2 b3
            //      g0 h1 e2 f3
            //      a3 b2 c1 d0
            //      e3 f2 g1 h0
            //      c3 d2 a1 b0
            //      g3 h2 e1 f0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3

            {
                _sum4 = vrev64q_s32(_sum4);
                _sum5 = vrev64q_s32(_sum5);
                _sum6 = vrev64q_s32(_sum6);
                _sum7 = vrev64q_s32(_sum7);
                _sum4 = vextq_s32(_sum4, _sum4, 2);
                _sum5 = vextq_s32(_sum5, _sum5, 2);
                _sum6 = vextq_s32(_sum6, _sum6, 2);
                _sum7 = vextq_s32(_sum7, _sum7, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale0);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale0);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale1);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale1);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_sum6), _descale1);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sum7), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c1);
                    _f5 = vaddq_f32(_f5, _c1);
                    _f6 = vaddq_f32(_f6, _c1);
                    _f7 = vaddq_f32(_f7, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    if (c_elempack == 4)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + 8);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        float32x4_t _c2 = bfloat2float(vget_low_u16(_c23));
                        float32x4_t _c3 = bfloat2float(vget_high_u16(_c23));
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                        }
                        _c01 = vld1q_u16(pC + c_hstep * 4);
                        _c23 = vld1q_u16(pC + c_hstep * 4 + 8);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        _c2 = bfloat2float(vget_low_u16(_c23));
                        _c3 = bfloat2float(vget_high_u16(_c23));
                        if (beta == 1.f)
                        {
                            _f4 = vaddq_f32(_f4, _c0);
                            _f5 = vaddq_f32(_f5, _c1);
                            _f6 = vaddq_f32(_f6, _c2);
                            _f7 = vaddq_f32(_f7, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f4 = vmlaq_f32(_f4, _c0, _beta);
                            _f5 = vmlaq_f32(_f5, _c1, _beta);
                            _f6 = vmlaq_f32(_f6, _c2, _beta);
                            _f7 = vmlaq_f32(_f7, _c3, _beta);
                        }
                        pC += 16;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x4_t _cc0 = vld1_u16(pC);
                        uint16x4_t _cc1 = vld1_u16(pC + c_hstep);
                        uint16x4_t _cc2 = vld1_u16(pC + c_hstep * 2);
                        uint16x4_t _cc3 = vld1_u16(pC + c_hstep * 3);
                        transpose4x4_u16(_cc0, _cc1, _cc2, _cc3);
                        _c0 = bfloat2float(_cc0);
                        _c1 = bfloat2float(_cc1);
                        float32x4_t _c2 = bfloat2float(_cc2);
                        float32x4_t _c3 = bfloat2float(_cc3);
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                        }
                        _cc0 = vld1_u16(pC + c_hstep * 4);
                        _cc1 = vld1_u16(pC + c_hstep * 5);
                        _cc2 = vld1_u16(pC + c_hstep * 6);
                        _cc3 = vld1_u16(pC + c_hstep * 7);
                        transpose4x4_u16(_cc0, _cc1, _cc2, _cc3);
                        _c0 = bfloat2float(_cc0);
                        _c1 = bfloat2float(_cc1);
                        _c2 = bfloat2float(_cc2);
                        _c3 = bfloat2float(_cc3);
                        if (beta == 1.f)
                        {
                            _f4 = vaddq_f32(_f4, _c0);
                            _f5 = vaddq_f32(_f5, _c1);
                            _f6 = vaddq_f32(_f6, _c2);
                            _f7 = vaddq_f32(_f7, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f4 = vmlaq_f32(_f4, _c0, _beta);
                            _f5 = vmlaq_f32(_f5, _c1, _beta);
                            _f6 = vmlaq_f32(_f6, _c2, _beta);
                            _f7 = vmlaq_f32(_f7, _c3, _beta);
                        }
                        pC += 4;
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x4_t _c = bfloat2float(vld1_u16(pC));
                    _c = vmulq_n_f32(_c, beta);
#if __aarch64__
                    _c0 = vdupq_laneq_f32(_c, 0);
                    _c1 = vdupq_laneq_f32(_c, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_c, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_c, 3);
#else
                    _c0 = vdupq_lane_f32(vget_low_f32(_c), 0);
                    _c1 = vdupq_lane_f32(vget_low_f32(_c), 1);
                    float32x4_t _c2 = vdupq_lane_f32(vget_high_f32(_c), 0);
                    float32x4_t _c3 = vdupq_lane_f32(vget_high_f32(_c), 1);
#endif
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c1);
                    _f6 = vaddq_f32(_f6, _c2);
                    _f7 = vaddq_f32(_f7, _c3);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
            }

            uint16x8_t _bf0 = vcombine_u16(float2bfloat(_f0), float2bfloat(_f4));
            uint16x8_t _bf1 = vcombine_u16(float2bfloat(_f1), float2bfloat(_f5));
            uint16x8_t _bf2 = vcombine_u16(float2bfloat(_f2), float2bfloat(_f6));
            uint16x8_t _bf3 = vcombine_u16(float2bfloat(_f3), float2bfloat(_f7));

            if (out_elempack == 4)
            {
                uint16x8x4_t _bf;
                _bf.val[0] = _bf0;
                _bf.val[1] = _bf1;
                _bf.val[2] = _bf2;
                _bf.val[3] = _bf3;
                vst4q_u16(p0, _bf);
            }
            if (out_elempack == 1)
            {
                vst1q_u16(p0, _bf0);
                vst1q_u16(p0 + out_hstep, _bf1);
                vst1q_u16(p0 + out_hstep * 2, _bf2);
                vst1q_u16(p0 + out_hstep * 3, _bf3);
            }

            pp += 32;
            p0 += out_hstep * 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
#else
            // from
            //      a0 b1 c0 d1
            //      e0 f1 g0 h1
            //      a1 b0 c1 d0
            //      e1 f0 g1 h0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            {
                _sum2 = vrev64q_s32(_sum2);
                _sum3 = vrev64q_s32(_sum3);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum2);
                int32x4x2_t _t1 = vzipq_s32(_sum1, _sum3);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[0]), vget_low_s32(_t1.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[0]), vget_high_s32(_t1.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale1);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c1);
                    _f3 = vaddq_f32(_f3, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    float32x4_t _c2;
                    float32x4_t _c3;
                    if (c_elempack == 4)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + c_hstep * 4);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        _c2 = bfloat2float(vget_low_u16(_c23));
                        _c3 = bfloat2float(vget_high_u16(_c23));
                        pC += 8;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x8_t _c01 = uint16x8_t();
                        _c01 = vsetq_lane_u16(pC[0], _c01, 0);
                        _c01 = vsetq_lane_u16(pC[c_hstep], _c01, 1);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 2], _c01, 2);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 3], _c01, 3);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 4], _c01, 4);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 5], _c01, 5);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 6], _c01, 6);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 7], _c01, 7);

                        uint16x8_t _c23 = uint16x8_t();
                        _c23 = vsetq_lane_u16(pC[1], _c23, 0);
                        _c23 = vsetq_lane_u16(pC[c_hstep + 1], _c23, 1);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 2 + 1], _c23, 2);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 3 + 1], _c23, 3);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 4 + 1], _c23, 4);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 5 + 1], _c23, 5);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 6 + 1], _c23, 6);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 7 + 1], _c23, 7);

                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_low_u16(_c23));
                        _c2 = bfloat2float(vget_high_u16(_c01));
                        _c3 = bfloat2float(vget_high_u16(_c23));
                        pC += 2;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(bfloat16_to_float32(pC[0]) * beta);
                    _c1 = vdupq_n_f32(bfloat16_to_float32(pC[1]) * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c1);
                    pC += 2;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            vst1q_u16(p0, vcombine_u16(float2bfloat(_f0), float2bfloat(_f2)));
            vst1q_u16(p0 + out_hstep, vcombine_u16(float2bfloat(_f1), float2bfloat(_f3)));

            pp += 16;
            p0 += out_hstep * 2;
        }
        for (; jj < max_jj; jj += 1)
        {
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp)), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp + 4)), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    if (c_elempack == 4)
                    {
                        _c0 = bfloat2float(vld1_u16(pC));
                        _c1 = bfloat2float(vld1_u16(pC + c_hstep * 4));
                        pC += 4;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x8_t _c01 = uint16x8_t();
                        _c01 = vsetq_lane_u16(pC[0], _c01, 0);
                        _c01 = vsetq_lane_u16(pC[c_hstep], _c01, 1);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 2], _c01, 2);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 3], _c01, 3);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 4], _c01, 4);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 5], _c01, 5);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 6], _c01, 6);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 7], _c01, 7);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        pC += 1;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(bfloat16_to_float32(pC[0]) * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    pC += 1;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            vst1q_u16(p0, vcombine_u16(float2bfloat(_f0), float2bfloat(_f1)));
            pp += 8;
            p0 += out_hstep;
        }
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack;

        float32x4_t _descale = vld1q_f32((const float*)descales + i + ii);

        float32x4_t _c0;
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                _c0 = vdupq_n_f32(bfloat16_to_float32(pC[0]) * beta);
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const unsigned short*)C + i + ii;
                _c0 = bfloat2float(vld1_u16(pC));
                _c0 = vmulq_n_f32(_c0, beta);
            }
            if (broadcast_type_C == 3)
            {
                pC = (const unsigned short*)C + (i + ii) * c_hstep + j * c_elempack;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const unsigned short*)C + j;
            }
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
#else
            // from
            //      a0 b1 c2 d3
            //      a4 b5 c6 d7
            //      c0 d1 a2 b3
            //      c4 d5 a6 b7
            //      a3 b2 c1 d0
            //      a7 b6 c5 d4
            //      c3 d2 a1 b0
            //      c7 d6 a5 b4

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            {
                _sum4 = vrev64q_s32(_sum4);
                _sum5 = vrev64q_s32(_sum5);
                _sum6 = vrev64q_s32(_sum6);
                _sum7 = vrev64q_s32(_sum7);
                _sum4 = vextq_s32(_sum4, _sum4, 2);
                _sum5 = vextq_s32(_sum5, _sum5, 2);
                _sum6 = vextq_s32(_sum6, _sum6, 2);
                _sum7 = vextq_s32(_sum7, _sum7, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_sum6), _descale);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sum7), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    uint16x8_t _c01;
                    uint16x8_t _c23;
                    uint16x8_t _c45;
                    uint16x8_t _c67;
                    if (c_elempack == 4)
                    {
                        _c01 = vld1q_u16(pC);
                        _c23 = vld1q_u16(pC + 8);
                        _c45 = vld1q_u16(pC + 16);
                        _c67 = vld1q_u16(pC + 24);
                        pC += 32;
                    }
                    if (c_elempack == 1)
                    {
                        _c01 = vld1q_u16(pC);
                        _c23 = vld1q_u16(pC + c_hstep);
                        _c45 = vld1q_u16(pC + c_hstep * 2);
                        _c67 = vld1q_u16(pC + c_hstep * 3);
                        transpose8x4_u16(_c01, _c23, _c45, _c67);
                        pC += 8;
                    }
                    _c0 = bfloat2float(vget_low_u16(_c01));
                    float32x4_t _c1 = bfloat2float(vget_high_u16(_c01));
                    float32x4_t _c2 = bfloat2float(vget_low_u16(_c23));
                    float32x4_t _c3 = bfloat2float(vget_high_u16(_c23));
                    float32x4_t _c4 = bfloat2float(vget_low_u16(_c45));
                    float32x4_t _c5 = bfloat2float(vget_high_u16(_c45));
                    float32x4_t _c6 = bfloat2float(vget_low_u16(_c67));
                    float32x4_t _c7 = bfloat2float(vget_high_u16(_c67));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                        _f4 = vaddq_f32(_f4, _c4);
                        _f5 = vaddq_f32(_f5, _c5);
                        _f6 = vaddq_f32(_f6, _c6);
                        _f7 = vaddq_f32(_f7, _c7);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                        _f4 = vmlaq_f32(_f4, _c4, _beta);
                        _f5 = vmlaq_f32(_f5, _c5, _beta);
                        _f6 = vmlaq_f32(_f6, _c6, _beta);
                        _f7 = vmlaq_f32(_f7, _c7, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    uint16x8_t _c = vld1q_u16(pC);
                    float32x4_t _cc0 = bfloat2float(vget_low_u16(_c));
                    float32x4_t _cc1 = bfloat2float(vget_high_u16(_c));
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _cc0 = vmulq_f32(_cc0, _beta);
                        _cc1 = vmulq_f32(_cc1, _beta);
                    }
                    _c0 = vdupq_laneq_f32(_cc0, 0);
                    float32x4_t _c1 = vdupq_laneq_f32(_cc0, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_cc0, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_cc0, 3);
                    float32x4_t _c4 = vdupq_laneq_f32(_cc1, 0);
                    float32x4_t _c5 = vdupq_laneq_f32(_cc1, 1);
                    float32x4_t _c6 = vdupq_laneq_f32(_cc1, 2);
                    float32x4_t _c7 = vdupq_laneq_f32(_cc1, 3);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c4);
                    _f5 = vaddq_f32(_f5, _c5);
                    _f6 = vaddq_f32(_f6, _c6);
                    _f7 = vaddq_f32(_f7, _c7);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
            }

            uint16x4_t _bf0 = float2bfloat(_f0);
            uint16x4_t _bf1 = float2bfloat(_f1);
            uint16x4_t _bf2 = float2bfloat(_f2);
            uint16x4_t _bf3 = float2bfloat(_f3);
            uint16x4_t _bf4 = float2bfloat(_f4);
            uint16x4_t _bf5 = float2bfloat(_f5);
            uint16x4_t _bf6 = float2bfloat(_f6);
            uint16x4_t _bf7 = float2bfloat(_f7);

            if (out_elempack == 4)
            {
                uint16x4x4_t _bfa;
                uint16x4x4_t _bfb;
                _bfa.val[0] = _bf0;
                _bfa.val[1] = _bf1;
                _bfa.val[2] = _bf2;
                _bfa.val[3] = _bf3;
                _bfb.val[0] = _bf4;
                _bfb.val[1] = _bf5;
                _bfb.val[2] = _bf6;
                _bfb.val[3] = _bf7;
                vst4_u16(p0, _bfa);
                vst4_u16(p0 + out_hstep * 4, _bfb);
            }
            if (out_elempack == 1)
            {
                vst1_u16(p0, _bf0);
                vst1_u16(p0 + out_hstep, _bf1);
                vst1_u16(p0 + out_hstep * 2, _bf2);
                vst1_u16(p0 + out_hstep * 3, _bf3);
                vst1_u16(p0 + out_hstep * 4, _bf4);
                vst1_u16(p0 + out_hstep * 5, _bf5);
                vst1_u16(p0 + out_hstep * 6, _bf6);
                vst1_u16(p0 + out_hstep * 7, _bf7);
            }

            pp += 32;
            p0 += out_hstep * 8;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
#else
            // from
            //      a0 b1 c2 d3
            //      c0 d1 a2 b3
            //      a3 b2 c1 d0
            //      c3 d2 a1 b0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            {
                _sum2 = vrev64q_s32(_sum2);
                _sum3 = vrev64q_s32(_sum3);
                _sum2 = vextq_s32(_sum2, _sum2, 2);
                _sum3 = vextq_s32(_sum3, _sum3, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum3);
                int32x4x2_t _t1 = vzipq_s32(_sum1, _sum2);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    float32x4_t _c1;
                    float32x4_t _c2;
                    float32x4_t _c3;
                    if (c_elempack == 4)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + 8);
                        _c0 = bfloat2float(vget_low_u16(_c01));
                        _c1 = bfloat2float(vget_high_u16(_c01));
                        _c2 = bfloat2float(vget_low_u16(_c23));
                        _c3 = bfloat2float(vget_high_u16(_c23));
                        pC += 16;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x4_t _cc0 = vld1_u16(pC);
                        uint16x4_t _cc1 = vld1_u16(pC + c_hstep);
                        uint16x4_t _cc2 = vld1_u16(pC + c_hstep * 2);
                        uint16x4_t _cc3 = vld1_u16(pC + c_hstep * 3);
                        transpose4x4_u16(_cc0, _cc1, _cc2, _cc3);
                        _c0 = bfloat2float(_cc0);
                        _c1 = bfloat2float(_cc1);
                        _c2 = bfloat2float(_cc2);
                        _c3 = bfloat2float(_cc3);
                        pC += 4;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x4_t _c = bfloat2float(vld1_u16(pC));
                    _c = vmulq_n_f32(_c, beta);
#if __aarch64__
                    _c0 = vdupq_laneq_f32(_c, 0);
                    float32x4_t _c1 = vdupq_laneq_f32(_c, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_c, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_c, 3);
#else
                    _c0 = vdupq_lane_f32(vget_low_f32(_c), 0);
                    float32x4_t _c1 = vdupq_lane_f32(vget_low_f32(_c), 1);
                    float32x4_t _c2 = vdupq_lane_f32(vget_high_f32(_c), 0);
                    float32x4_t _c3 = vdupq_lane_f32(vget_high_f32(_c), 1);
#endif
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            uint16x4_t _bf0 = float2bfloat(_f0);
            uint16x4_t _bf1 = float2bfloat(_f1);
            uint16x4_t _bf2 = float2bfloat(_f2);
            uint16x4_t _bf3 = float2bfloat(_f3);

            if (out_elempack == 4)
            {
                uint16x4x4_t _bf;
                _bf.val[0] = _bf0;
                _bf.val[1] = _bf1;
                _bf.val[2] = _bf2;
                _bf.val[3] = _bf3;
                vst4_u16(p0, _bf);
            }
            if (out_elempack == 1)
            {
                vst1_u16(p0, _bf0);
                vst1_u16(p0 + out_hstep, _bf1);
                vst1_u16(p0 + out_hstep * 2, _bf2);
                vst1_u16(p0 + out_hstep * 3, _bf3);
            }

            pp += 16;
            p0 += out_hstep * 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
#else
            // from
            //      a0 b1 c0 d1
            //      a1 b0 c1 d0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            {
                _sum1 = vrev64q_s32(_sum1);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum1);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                _sum1 = vrev64q_s32(_sum1);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    uint16x8_t _c;
                    if (c_elempack == 4)
                    {
                        _c = vld1q_u16(pC);
                        pC += 8;
                    }
                    if (c_elempack == 1)
                    {
                        _c = uint16x8_t();
                        _c = vsetq_lane_u16(pC[0], _c, 0);
                        _c = vsetq_lane_u16(pC[c_hstep], _c, 1);
                        _c = vsetq_lane_u16(pC[c_hstep * 2], _c, 2);
                        _c = vsetq_lane_u16(pC[c_hstep * 3], _c, 3);
                        _c = vsetq_lane_u16(pC[1], _c, 4);
                        _c = vsetq_lane_u16(pC[c_hstep + 1], _c, 5);
                        _c = vsetq_lane_u16(pC[c_hstep * 2 + 1], _c, 6);
                        _c = vsetq_lane_u16(pC[c_hstep * 3 + 1], _c, 7);
                        pC += 2;
                    }
                    _c0 = bfloat2float(vget_low_u16(_c));
                    float32x4_t _c1 = bfloat2float(vget_high_u16(_c));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(bfloat16_to_float32(pC[0]) * beta);
                    float32x4_t _c1 = vdupq_n_f32(bfloat16_to_float32(pC[1]) * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    pC += 2;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            vst1_u16(p0, float2bfloat(_f0));
            vst1_u16(p0 + out_hstep, float2bfloat(_f1));

            pp += 8;
            p0 += out_hstep * 2;
        }
        for (; jj < max_jj; jj += 1)
        {
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp)), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    uint16x4_t _c;
                    if (c_elempack == 4)
                    {
                        _c = vld1_u16(pC);
                        pC += 4;
                    }
                    if (c_elempack == 1)
                    {
                        _c = uint16x4_t();
                        _c = vset_lane_u16(pC[0], _c, 0);
                        _c = vset_lane_u16(pC[c_hstep], _c, 1);
                        _c = vset_lane_u16(pC[c_hstep * 2], _c, 2);
                        _c = vset_lane_u16(pC[c_hstep * 3], _c, 3);
                        pC += 1;
                    }
                    _c0 = bfloat2float(_c);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(bfloat16_to_float32(pC[0]) * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    pC += 1;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            vst1_u16(p0, float2bfloat(_f0));
            pp += 4;
            p0 += out_hstep;
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack;

        const float descale0 = descales[i + ii];
        const float descale1 = descales[i + ii + 1];
#if __ARM_NEON
        float32x2_t _descale01 = vld1_f32((const float*)descales + i + ii);
#endif

        float c0;
        float c1;
#if __ARM_NEON
        float32x4_t _c0;
        float32x4_t _c1;
#endif
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                c0 = bfloat16_to_float32(pC[0]) * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const unsigned short*)C + i + ii;
                c0 = bfloat16_to_float32(pC[0]) * beta;
                c1 = bfloat16_to_float32(pC[1]) * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
                _c1 = vdupq_n_f32(c1);
#endif
            }
            if (broadcast_type_C == 3)
            {
                // c_elempack == 1
                pC = (const unsigned short*)C + (i + ii) * c_hstep + j;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const unsigned short*)C + j;
            }
        }

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

            float32x4_t _f0 = vmulq_lane_f32(vcvtq_f32_s32(_sum0), _descale01, 0);
            float32x4_t _f1 = vmulq_lane_f32(vcvtq_f32_s32(_sum1), _descale01, 0);
            float32x4_t _f2 = vmulq_lane_f32(vcvtq_f32_s32(_sum2), _descale01, 1);
            float32x4_t _f3 = vmulq_lane_f32(vcvtq_f32_s32(_sum3), _descale01, 1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c1);
                    _f3 = vaddq_f32(_f3, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    uint16x8_t _c01 = vld1q_u16(pC);
                    uint16x8_t _c23 = vld1q_u16(pC + c_hstep);
                    _c0 = bfloat2float(vget_low_u16(_c01));
                    _c1 = bfloat2float(vget_high_u16(_c01));
                    float32x4_t _c2 = bfloat2float(vget_low_u16(_c23));
                    float32x4_t _c3 = bfloat2float(vget_high_u16(_c23));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                    pC += 8;
                }
                if (broadcast_type_C == 4)
                {
                    uint16x8_t _c = vld1q_u16(pC);
                    _c0 = bfloat2float(vget_low_u16(_c));
                    _c1 = bfloat2float(vget_high_u16(_c));
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _c0 = vmulq_f32(_c0, _beta);
                        _c1 = vmulq_f32(_c1, _beta);
                    }
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c1);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            uint16x4_t _bf0 = float2bfloat(_f0);
            uint16x4_t _bf1 = float2bfloat(_f1);
            uint16x4_t _bf2 = float2bfloat(_f2);
            uint16x4_t _bf3 = float2bfloat(_f3);

            if (out_elempack == 4)
            {
                vst1q_u16(p0, vcombine_u16(_bf0, _bf2));
                vst1q_u16(p0 + out_hstep * 4, vcombine_u16(_bf1, _bf3));
            }
            if (out_elempack == 1)
            {
                p0[0] = vget_lane_u16(_bf0, 0);
                p0[1] = vget_lane_u16(_bf2, 0);
                p0[out_hstep] = vget_lane_u16(_bf0, 1);
                p0[out_hstep + 1] = vget_lane_u16(_bf2, 1);
                p0[out_hstep * 2] = vget_lane_u16(_bf0, 2);
                p0[out_hstep * 2 + 1] = vget_lane_u16(_bf2, 2);
                p0[out_hstep * 3] = vget_lane_u16(_bf0, 3);
                p0[out_hstep * 3 + 1] = vget_lane_u16(_bf2, 3);
                p0[out_hstep * 4] = vget_lane_u16(_bf1, 0);
                p0[out_hstep * 4 + 1] = vget_lane_u16(_bf3, 0);
                p0[out_hstep * 5] = vget_lane_u16(_bf1, 1);
                p0[out_hstep * 5 + 1] = vget_lane_u16(_bf3, 1);
                p0[out_hstep * 6] = vget_lane_u16(_bf1, 2);
                p0[out_hstep * 6 + 1] = vget_lane_u16(_bf3, 2);
                p0[out_hstep * 7] = vget_lane_u16(_bf1, 3);
                p0[out_hstep * 7 + 1] = vget_lane_u16(_bf3, 3);
            }

            pp += 16;
            p0 += out_hstep * 8;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            // a0 a1 a2 a3
            // b0 b1 b2 b3

            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

            float32x4_t _f0 = vmulq_lane_f32(vcvtq_f32_s32(_sum0), _descale01, 0);
            float32x4_t _f1 = vmulq_lane_f32(vcvtq_f32_s32(_sum1), _descale01, 1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    _c0 = bfloat2float(vld1_u16(pC));
                    _c1 = bfloat2float(vld1_u16(pC + c_hstep));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                    pC += 4;
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = bfloat2float(vld1_u16(pC));
                    _c0 = vmulq_n_f32(_c0, beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            uint16x4_t _bf0 = float2bfloat(_f0);
            uint16x4_t _bf1 = float2bfloat(_f1);

            if (out_elempack == 4)
            {
                vst1q_u16(p0, vcombine_u16(_bf0, _bf1));
            }
            if (out_elempack == 1)
            {
                p0[0] = vget_lane_u16(_bf0, 0);
                p0[1] = vget_lane_u16(_bf1, 0);
                p0[out_hstep] = vget_lane_u16(_bf0, 1);
                p0[out_hstep + 1] = vget_lane_u16(_bf1, 1);
                p0[out_hstep * 2] = vget_lane_u16(_bf0, 2);
                p0[out_hstep * 2 + 1] = vget_lane_u16(_bf1, 2);
                p0[out_hstep * 3] = vget_lane_u16(_bf0, 3);
                p0[out_hstep * 3 + 1] = vget_lane_u16(_bf1, 3);
            }

            pp += 8;
            p0 += out_hstep * 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            // a0 a1 b0 b1
            int32x2x2_t _sum0 = vld2_s32(pp);

            float32x4_t _descale = vcombine_f32(_descale01, _descale01);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vcombine_s32(_sum0.val[0], _sum0.val[1])), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    float32x4_t _cc = vzipq_f32(_c0, _c1).val[0];
                    _f0 = vaddq_f32(_f0, _cc);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    uint16x4_t _c = uint16x4_t();
                    _c = vset_lane_u16(pC[0], _c, 0);
                    _c = vset_lane_u16(pC[c_hstep], _c, 1);
                    _c = vset_lane_u16(pC[1], _c, 2);
                    _c = vset_lane_u16(pC[c_hstep + 1], _c, 3);
                    _c0 = bfloat2float(_c);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 2;
                }
                if (broadcast_type_C == 4)
                {
                    uint16x4_t _c = uint16x4_t();
                    _c = vset_lane_u16(pC[0], _c, 0);
                    _c = vset_lane_u16(pC[0], _c, 1);
                    _c = vset_lane_u16(pC[1], _c, 2);
                    _c = vset_lane_u16(pC[1], _c, 3);
                    _c0 = bfloat2float(_c);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 2;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            uint16x4_t _bf0 = float2bfloat(_f0);

            p0[0] = vget_lane_u16(_bf0, 0);
            p0[1] = vget_lane_u16(_bf0, 1);
            p0[out_hstep] = vget_lane_u16(_bf0, 2);
            p0[out_hstep + 1] = vget_lane_u16(_bf0, 3);

            pp += 4;
            p0 += out_hstep * 2;
        }
#endif // __ARM_NEON
        for (; jj < max_jj; jj += 1)
        {
            float f0 = pp[0] * descale0;
            float f1 = pp[1] * descale1;

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    f0 += c0;
                    f1 += c0;
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    f0 += c0;
                    f1 += c1;
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    f0 += bfloat16_to_float32(pC[0]) * beta;
                    f1 += bfloat16_to_float32(pC[c_hstep]) * beta;
                    pC += 1;
                }
                if (broadcast_type_C == 4)
                {
                    c0 = bfloat16_to_float32(pC[0]) * beta;
                    f0 += c0;
                    f1 += c0;
                    pC += 1;
                }
            }

            if (alpha != 1.f)
            {
                f0 *= alpha;
                f1 *= alpha;
            }

            p0[0] = float32_to_bfloat16(f0);
            p0[1] = float32_to_bfloat16(f1);
            pp += 2;
            p0 += out_hstep;
        }
    }
    for (; ii < max_ii; ii += 1)
    {
        unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack;

        const float descale = descales[i + ii];
#if __ARM_NEON
        float32x4_t _descale = vdupq_n_f32(descale);
#endif

        float c0;
#if __ARM_NEON
        float32x4_t _c0;
#endif
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                c0 = bfloat16_to_float32(pC[0]) * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const unsigned short*)C + i + ii;
                c0 = bfloat16_to_float32(pC[0]) * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 3)
            {
                // c_elempack == 1
                pC = (const unsigned short*)C + (i + ii) * c_hstep + j;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const unsigned short*)C + j;
            }
        }

        int jj = 0;
#if __ARM_NEON
        for (; jj + 15 < max_jj; jj += 16)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    uint16x8_t _c01 = vld1q_u16(pC);
                    uint16x8_t _c23 = vld1q_u16(pC + 8);
                    _c0 = bfloat2float(vget_low_u16(_c01));
                    float32x4_t _c1 = bfloat2float(vget_high_u16(_c01));
                    float32x4_t _c2 = bfloat2float(vget_low_u16(_c23));
                    float32x4_t _c3 = bfloat2float(vget_high_u16(_c23));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                    pC += 16;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            uint16x4_t _bf0 = float2bfloat(_f0);
            uint16x4_t _bf1 = float2bfloat(_f1);
            uint16x4_t _bf2 = float2bfloat(_f2);
            uint16x4_t _bf3 = float2bfloat(_f3);

            if (out_hstep == 1)
            {
                vst1q_u16(p0, vcombine_u16(_bf0, _bf1));
                vst1q_u16(p0 + 8, vcombine_u16(_bf2, _bf3));
            }
            else
            {
                if (out_elempack == 4)
                {
                    vst1_u16(p0, _bf0);
                    vst1_u16(p0 + out_hstep * 4, _bf1);
                    vst1_u16(p0 + out_hstep * 8, _bf2);
                    vst1_u16(p0 + out_hstep * 12, _bf3);
                }
                if (out_elempack == 1)
                {
                    p0[0] = vget_lane_u16(_bf0, 0);
                    p0[out_hstep] = vget_lane_u16(_bf0, 1);
                    p0[out_hstep * 2] = vget_lane_u16(_bf0, 2);
                    p0[out_hstep * 3] = vget_lane_u16(_bf0, 3);
                    p0[out_hstep * 4] = vget_lane_u16(_bf1, 0);
                    p0[out_hstep * 5] = vget_lane_u16(_bf1, 1);
                    p0[out_hstep * 6] = vget_lane_u16(_bf1, 2);
                    p0[out_hstep * 7] = vget_lane_u16(_bf1, 3);
                    p0[out_hstep * 8] = vget_lane_u16(_bf2, 0);
                    p0[out_hstep * 9] = vget_lane_u16(_bf2, 1);
                    p0[out_hstep * 10] = vget_lane_u16(_bf2, 2);
                    p0[out_hstep * 11] = vget_lane_u16(_bf2, 3);
                    p0[out_hstep * 12] = vget_lane_u16(_bf3, 0);
                    p0[out_hstep * 13] = vget_lane_u16(_bf3, 1);
                    p0[out_hstep * 14] = vget_lane_u16(_bf3, 2);
                    p0[out_hstep * 15] = vget_lane_u16(_bf3, 3);
                }
            }

            pp += 16;
            p0 += out_hstep * 16;
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    uint16x8_t _c = vld1q_u16(pC);
                    _c0 = bfloat2float(vget_low_u16(_c));
                    float32x4_t _c1 = bfloat2float(vget_high_u16(_c));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            uint16x4_t _bf0 = float2bfloat(_f0);
            uint16x4_t _bf1 = float2bfloat(_f1);

            if (out_hstep == 1)
            {
                vst1q_u16(p0, vcombine_u16(_bf0, _bf1));
            }
            else
            {
                if (out_elempack == 4)
                {
                    vst1_u16(p0, _bf0);
                    vst1_u16(p0 + out_hstep * 4, _bf1);
                }
                if (out_elempack == 1)
                {
                    p0[0] = vget_lane_u16(_bf0, 0);
                    p0[out_hstep] = vget_lane_u16(_bf0, 1);
                    p0[out_hstep * 2] = vget_lane_u16(_bf0, 2);
                    p0[out_hstep * 3] = vget_lane_u16(_bf0, 3);
                    p0[out_hstep * 4] = vget_lane_u16(_bf1, 0);
                    p0[out_hstep * 5] = vget_lane_u16(_bf1, 1);
                    p0[out_hstep * 6] = vget_lane_u16(_bf1, 2);
                    p0[out_hstep * 7] = vget_lane_u16(_bf1, 3);
                }
            }

            pp += 8;
            p0 += out_hstep * 8;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp)), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    _c0 = bfloat2float(vld1_u16(pC));
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 4;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            uint16x4_t _bf0 = float2bfloat(_f0);

            if (out_hstep == 1)
            {
                vst1_u16(p0, _bf0);
            }
            else
            {
                if (out_elempack == 4)
                {
                    vst1_u16(p0, _bf0);
                }
                if (out_elempack == 1)
                {
                    p0[0] = vget_lane_u16(_bf0, 0);
                    p0[out_hstep] = vget_lane_u16(_bf0, 1);
                    p0[out_hstep * 2] = vget_lane_u16(_bf0, 2);
                    p0[out_hstep * 3] = vget_lane_u16(_bf0, 3);
                }
            }

            pp += 4;
            p0 += out_hstep * 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            float32x2_t _f0 = vmul_f32(vcvt_f32_s32(vld1_s32(pp)), vget_low_f32(_descale));

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vadd_f32(_f0, vget_low_f32(_c0));
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    float32x2_t _c = float32x2_t();
                    _c = vset_lane_f32(bfloat16_to_float32(pC[0]), _c, 0);
                    _c = vset_lane_f32(bfloat16_to_float32(pC[1]), _c, 1);
                    _f0 = vmla_n_f32(_f0, _c, beta);
                    pC += 2;
                }
            }

            _f0 = vmul_n_f32(_f0, alpha);

            p0[0] = float32_to_bfloat16(vget_lane_f32(_f0, 0));
            p0[out_hstep] = float32_to_bfloat16(vget_lane_f32(_f0, 1));

            pp += 2;
            p0 += out_hstep * 2;
        }
#endif // __ARM_NEON
        for (; jj < max_jj; jj += 1)
        {
            float f0 = pp[0] * descale;

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    f0 += c0;
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    f0 += bfloat16_to_float32(pC[0]) * beta;
                    pC += 1;
                }
            }

            f0 *= alpha;

            p0[0] = float32_to_bfloat16(f0);

            pp += 1;
            p0 += out_hstep;
        }
    }
}


================================================
FILE: src/layer/arm/gemm_int8_fp16s.h
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Forward declarations of ISA-specific variants of the fp16/int8 gemm tile
// helpers. Only the prototypes are given here; each group is visible only when
// its preprocessor guard holds.
// NOTE(review): the definitions are presumably provided by separately compiled
// sources built with the matching ARM extension enabled -- confirm against the
// build setup.

// *_i8mm variants (pack / transpose-pack of A and B tiles from fp16 to int8).
// Declared only when NCNN_RUNTIME_CPU and NCNN_ARM84I8MM are set, the target is
// aarch64, and this translation unit is NOT itself compiled with
// __ARM_FEATURE_MATMUL_INT8.
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
void pack_A_tile_fp16_to_int8_i8mm(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales);
void transpose_pack_A_tile_fp16_to_int8_i8mm(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales);
void pack_B_tile_fp16_to_int8_i8mm(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale);
void transpose_pack_B_tile_fp16_to_int8_i8mm(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale);
#endif

// *_asimddp variants. Besides the four pack helpers, this group also declares
// the int32 -> fp16 output unpack helpers (plain and transposed).
// Declared only when NCNN_RUNTIME_CPU and NCNN_ARM82DOT are set, the target is
// aarch64, and this translation unit is compiled with neither
// __ARM_FEATURE_DOTPROD nor __ARM_FEATURE_MATMUL_INT8.
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
void pack_A_tile_fp16_to_int8_asimddp(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales);
void transpose_pack_A_tile_fp16_to_int8_asimddp(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales);
void pack_B_tile_fp16_to_int8_asimddp(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale);
void transpose_pack_B_tile_fp16_to_int8_asimddp(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale);
void unpack_output_tile_int32_to_fp16_asimddp(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta);
void transpose_unpack_output_tile_int32_to_fp16_asimddp(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta);
#endif

// *_asimdhp variants of the quantization scale computation for A tiles and for B.
// Declared only when NCNN_RUNTIME_CPU and NCNN_ARM82 are set, the target is
// aarch64, and this translation unit is NOT compiled with
// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC.
// compute_A_tile_fp16_int8_scales() below forwards to its _asimdhp counterpart
// under this same guard when ncnn::cpu_support_arm_asimdhp() reports support.
#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
void compute_A_tile_fp16_int8_scales_asimdhp(const Mat& A, Mat& scales, float B_scale, Mat& out_descales, int i, int max_ii);
void transpose_compute_A_tile_fp16_int8_scales_asimdhp(const Mat& A, Mat& scales, float B_scale, Mat& out_descales, int i, int max_ii);
void compute_B_fp16_int8_scale_asimdhp(const Mat& B, float& scale);
#endif

// Compute the per-row int8 quantization scales for a tile of the fp16 matrix A.
//
// For each of the max_ii rows starting at row i, the absolute maximum over all
// K = A.w columns is searched and two floats are written:
//   scales[i + r]       = 127 / absmax
//   out_descales[i + r] = absmax / (127 * B_scale)
//
// A            fp16 input, read with elempack 1, 4 (NEON builds) or 8 (builds
//              with fp16 vector arithmetic)
// scales       float output, written at indices [i, i + max_ii)
// B_scale      quantization scale of the other operand, folded into out_descales
// out_descales float output, written at indices [i, i + max_ii)
// i, max_ii    first row and number of rows to process
//
// The packed branches step in whole groups of elempack rows; rows past the last
// full group are not touched.
// NOTE(review): a row whose values are all zero gives absmax == 0, so the
// divisions below produce inf for scales; there is no guard here - confirm the
// consumers tolerate that.
static void compute_A_tile_fp16_int8_scales(const Mat& A, Mat& scales, float B_scale, Mat& out_descales, int i, int max_ii)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    // runtime dispatch: this build has no fp16 vector arithmetic, hand over to
    // the asimdhp variant when the running cpu supports it
    if (ncnn::cpu_support_arm_asimdhp())
    {
        compute_A_tile_fp16_int8_scales_asimdhp(A, scales, B_scale, out_descales, i, max_ii);
        return;
    }
#endif

    const int elempack = A.elempack;
    // element distance between two consecutive row indices: cstep for 3-d mats, w otherwise
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;
    // the whole width of A is scanned, there is no k tiling in this function
    const int K = A.w;

    // NCNN_LOGE("compute_A_tile_fp16_int8_scales %d %d", max_ii, elempack);

    // common divisor of every out_descale value
    const float v127_B_scale = 127.f * B_scale;

    // output cursors, advanced by the number of rows handled per iteration
    float* ps = (float*)scales + i;
    float* pods = (float*)out_descales + i;

#if __ARM_NEON
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (elempack == 8)
    {
        float32x4_t _v127 = vdupq_n_f32(127.f);
        float32x4_t _v127_B_scale = vdupq_n_f32(v127_B_scale);

        // 8 rows per iteration, one 8-lane fp16 vector holds the 8 rows of a single column
        for (int ii = 0; ii + 7 < max_ii; ii += 8)
        {
            const __fp16* p0 = (const __fp16*)A + (i + ii) * A_hstep;

            // four independent running maxima, kept in fp16
            float16x8_t _amax0 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _amax1 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _amax2 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _amax3 = vdupq_n_f16((__fp16)0.f);
            int kk = 0;
            // 4 columns per step
            for (; kk + 3 < K; kk += 4)
            {
                float16x8_t _p0 = vld1q_f16(p0);
                float16x8_t _p1 = vld1q_f16(p0 + 8);
                float16x8_t _p2 = vld1q_f16(p0 + 16);
                float16x8_t _p3 = vld1q_f16(p0 + 24);
                _amax0 = vmaxq_f16(_amax0, vabsq_f16(_p0));
                _amax1 = vmaxq_f16(_amax1, vabsq_f16(_p1));
                _amax2 = vmaxq_f16(_amax2, vabsq_f16(_p2));
                _amax3 = vmaxq_f16(_amax3, vabsq_f16(_p3));
                p0 += 32;
            }
            // fold 4 accumulators into 2
            _amax0 = vmaxq_f16(_amax0, _amax2);
            _amax1 = vmaxq_f16(_amax1, _amax3);
            // 2 columns per step
            for (; kk + 1 < K; kk += 2)
            {
                float16x8_t _p0 = vld1q_f16(p0);
                float16x8_t _p1 = vld1q_f16(p0 + 8);
                _amax0 = vmaxq_f16(_amax0, vabsq_f16(_p0));
                _amax1 = vmaxq_f16(_amax1, vabsq_f16(_p1));
                p0 += 16;
            }
            // fold 2 accumulators into 1
            _amax0 = vmaxq_f16(_amax0, _amax1);
            // remaining single column
            for (; kk < K; kk++)
            {
                float16x8_t _p = vld1q_f16(p0);
                _amax0 = vmaxq_f16(_amax0, vabsq_f16(_p));
                p0 += 8;
            }
            // widen the 8 per-row maxima to fp32 before dividing
            float32x4_t _absmax0 = vcvt_f32_f16(vget_low_f16(_amax0));
            float32x4_t _absmax1 = vcvt_f32_f16(vget_high_f16(_amax0));

            float32x4_t _scale0 = vdivq_f32(_v127, _absmax0);
            float32x4_t _scale1 = vdivq_f32(_v127, _absmax1);
            float32x4_t _out_descale0 = vdivq_f32(_absmax0, _v127_B_scale);
            float32x4_t _out_descale1 = vdivq_f32(_absmax1, _v127_B_scale);

            vst1q_f32(ps, _scale0);
            vst1q_f32(ps + 4, _scale1);
            vst1q_f32(pods, _out_descale0);
            vst1q_f32(pods + 4, _out_descale1);

            ps += 8;
            pods += 8;
        }
    }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (elempack == 4)
    {
#if __aarch64__
        // only needed by the vdivq_f32 path below, the armv7 path divides in scalar code
        float32x4_t _v127 = vdupq_n_f32(127.f);
        float32x4_t _v127_B_scale = vdupq_n_f32(v127_B_scale);
#endif

        // 4 rows per iteration, 4 consecutive fp16 values hold the 4 rows of a single column
        for (int ii = 0; ii + 3 < max_ii; ii += 4)
        {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            const __fp16* p0 = (const __fp16*)A + (i + ii) * A_hstep;

            // each 8-lane accumulator covers two columns of the 4 rows
            float16x8_t _amax0 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _amax1 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _amax2 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _amax3 = vdupq_n_f16((__fp16)0.f);
            int kk = 0;
            // 8 columns per step
            for (; kk + 7 < K; kk += 8)
            {
                float16x8_t _p0 = vld1q_f16(p0);
                float16x8_t _p1 = vld1q_f16(p0 + 8);
                float16x8_t _p2 = vld1q_f16(p0 + 16);
                float16x8_t _p3 = vld1q_f16(p0 + 24);
                _amax0 = vmaxq_f16(_amax0, vabsq_f16(_p0));
                _amax1 = vmaxq_f16(_amax1, vabsq_f16(_p1));
                _amax2 = vmaxq_f16(_amax2, vabsq_f16(_p2));
                _amax3 = vmaxq_f16(_amax3, vabsq_f16(_p3));
                p0 += 32;
            }
            _amax0 = vmaxq_f16(_amax0, _amax2);
            _amax1 = vmaxq_f16(_amax1, _amax3);
            // 4 columns per step
            for (; kk + 3 < K; kk += 4)
            {
                float16x8_t _p0 = vld1q_f16(p0);
                float16x8_t _p1 = vld1q_f16(p0 + 8);
                _amax0 = vmaxq_f16(_amax0, vabsq_f16(_p0));
                _amax1 = vmaxq_f16(_amax1, vabsq_f16(_p1));
                p0 += 16;
            }
            _amax0 = vmaxq_f16(_amax0, _amax1);
            // 2 columns per step
            for (; kk + 1 < K; kk += 2)
            {
                float16x8_t _p = vld1q_f16(p0);
                _amax0 = vmaxq_f16(_amax0, vabsq_f16(_p));
                p0 += 8;
            }
            // merge the two column halves so that one lane is left per row
            float16x4_t _amax = vmax_f16(vget_low_f16(_amax0), vget_high_f16(_amax0));
            // remaining single column
            for (; kk < K; kk++)
            {
                float16x4_t _p = vld1_f16(p0);
                _amax = vmax_f16(_amax, vabs_f16(_p));
                p0 += 4;
            }
            float32x4_t _absmax0 = vcvt_f32_f16(_amax);
#else  // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            // no fp16 arithmetic: load raw 16-bit patterns, convert to fp32 and reduce in fp32
            const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep;

            // each 4-lane accumulator covers one column of the 4 rows
            float32x4_t _absmax0 = vdupq_n_f32(0.f);
            float32x4_t _absmax1 = vdupq_n_f32(0.f);
            float32x4_t _absmax2 = vdupq_n_f32(0.f);
            float32x4_t _absmax3 = vdupq_n_f32(0.f);
            int kk = 0;
            // 4 columns per step
            for (; kk + 3 < K; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                _absmax0 = vmaxq_f32(_absmax0, vabsq_f32(_p0));
                _absmax1 = vmaxq_f32(_absmax1, vabsq_f32(_p1));
                _absmax2 = vmaxq_f32(_absmax2, vabsq_f32(_p2));
                _absmax3 = vmaxq_f32(_absmax3, vabsq_f32(_p3));
                p0 += 16;
            }
            _absmax0 = vmaxq_f32(_absmax0, _absmax2);
            _absmax1 = vmaxq_f32(_absmax1, _absmax3);
            // 2 columns per step
            for (; kk + 1 < K; kk += 2)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                _absmax0 = vmaxq_f32(_absmax0, vabsq_f32(_p0));
                _absmax1 = vmaxq_f32(_absmax1, vabsq_f32(_p1));
                p0 += 8;
            }
            _absmax0 = vmaxq_f32(_absmax0, _absmax1);
            // remaining single column
            for (; kk < K; kk++)
            {
                float32x4_t _p = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                _absmax0 = vmaxq_f32(_absmax0, vabsq_f32(_p));
                p0 += 4;
            }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

            // _absmax0 now holds one absolute maximum per row
#if __aarch64__
            float32x4_t _scale = vdivq_f32(_v127, _absmax0);
            float32x4_t _out_descale = vdivq_f32(_absmax0, _v127_B_scale);

            vst1q_f32(ps, _scale);
            vst1q_f32(pods, _out_descale);
#else
            // disabled variant based on reciprocal estimate + refinement steps,
            // the scalar divisions below are used instead
            // float32x4_t _recp_absmax = vrecpeq_f32(_absmax0);
            // _recp_absmax = vmulq_f32(vrecpsq_f32(_absmax0, _recp_absmax), _recp_absmax);
            // _recp_absmax = vmulq_f32(vrecpsq_f32(_absmax0, _recp_absmax), _recp_absmax);
            // _recp_absmax = vmulq_f32(vrecpsq_f32(_absmax0, _recp_absmax), _recp_absmax);
            // float32x4_t _scale = vmulq_f32(_v127, _recp_absmax);
            // float32x4_t _out_descale = vmulq_f32(_absmax0, _recp_v127_B_scale);

            // spill the vector and divide lane by lane
            float tmp[4];
            vst1q_f32(tmp, _absmax0);

            ps[0] = 127.f / tmp[0];
            ps[1] = 127.f / tmp[1];
            ps[2] = 127.f / tmp[2];
            ps[3] = 127.f / tmp[3];

            pods[0] = tmp[0] / v127_B_scale;
            pods[1] = tmp[1] / v127_B_scale;
            pods[2] = tmp[2] / v127_B_scale;
            pods[3] = tmp[3] / v127_B_scale;

#endif
            ps += 4;
            pods += 4;
        }
    }
#endif // __ARM_NEON
    if (elempack == 1)
    {
        // one row per iteration, the K values of a row are contiguous
        for (int ii = 0; ii < max_ii; ii++)
        {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            const __fp16* p0 = (const __fp16*)A + (i + ii) * A_hstep;

            float absmax = 0.f;
            float16x8_t _amax0 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _amax1 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _amax2 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _amax3 = vdupq_n_f16((__fp16)0.f);
            int kk = 0;
            // 32 values per step
            for (; kk + 31 < K; kk += 32)
            {
                float16x8_t _p0 = vld1q_f16(p0);
                float16x8_t _p1 = vld1q_f16(p0 + 8);
                float16x8_t _p2 = vld1q_f16(p0 + 16);
                float16x8_t _p3 = vld1q_f16(p0 + 24);
                _amax0 = vmaxq_f16(_amax0, vabsq_f16(_p0));
                _amax1 = vmaxq_f16(_amax1, vabsq_f16(_p1));
                _amax2 = vmaxq_f16(_amax2, vabsq_f16(_p2));
                _amax3 = vmaxq_f16(_amax3, vabsq_f16(_p3));
                p0 += 32;
            }
            _amax0 = vmaxq_f16(_amax0, _amax2);
            _amax1 = vmaxq_f16(_amax1, _amax3);
            // 16 values per step
            for (; kk + 15 < K; kk += 16)
            {
                float16x8_t _p0 = vld1q_f16(p0);
                float16x8_t _p1 = vld1q_f16(p0 + 8);
                _amax0 = vmaxq_f16(_amax0, vabsq_f16(_p0));
                _amax1 = vmaxq_f16(_amax1, vabsq_f16(_p1));
                p0 += 16;
            }
            _amax0 = vmaxq_f16(_amax0, _amax1);
            // 8 values per step
            for (; kk + 7 < K; kk += 8)
            {
                float16x8_t _p = vld1q_f16(p0);
                _amax0 = vmaxq_f16(_amax0, vabsq_f16(_p));
                p0 += 8;
            }
            // horizontal max across the 8 lanes
            absmax = (float)vmaxvq_f16(_amax0);
            // scalar tail
            for (; kk < K; kk++)
            {
                absmax = std::max(absmax, (float)fabsf(p0[0]));
                p0++;
            }
#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            // fp16 values are read as raw 16-bit patterns and converted to fp32
            const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep;

            float absmax = 0.f;
            int kk = 0;
#if __ARM_NEON
            float32x4_t _absmax0 = vdupq_n_f32(0.f);
            float32x4_t _absmax1 = vdupq_n_f32(0.f);
            float32x4_t _absmax2 = vdupq_n_f32(0.f);
            float32x4_t _absmax3 = vdupq_n_f32(0.f);
            // 16 values per step
            for (; kk + 15 < K; kk += 16)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                _absmax0 = vmaxq_f32(_absmax0, vabsq_f32(_p0));
                _absmax1 = vmaxq_f32(_absmax1, vabsq_f32(_p1));
                _absmax2 = vmaxq_f32(_absmax2, vabsq_f32(_p2));
                _absmax3 = vmaxq_f32(_absmax3, vabsq_f32(_p3));
                p0 += 16;
            }
            _absmax0 = vmaxq_f32(_absmax0, _absmax2);
            _absmax1 = vmaxq_f32(_absmax1, _absmax3);
            // 8 values per step
            for (; kk + 7 < K; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                _absmax0 = vmaxq_f32(_absmax0, vabsq_f32(_p0));
                _absmax1 = vmaxq_f32(_absmax1, vabsq_f32(_p1));
                p0 += 8;
            }
            _absmax0 = vmaxq_f32(_absmax0, _absmax1);
            // 4 values per step
            for (; kk + 3 < K; kk += 4)
            {
                float32x4_t _p = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                _absmax0 = vmaxq_f32(_absmax0, vabsq_f32(_p));
                p0 += 4;
            }
            // horizontal max across the 4 lanes
#if __aarch64__
            absmax = vmaxvq_f32(_absmax0);
#else
            // armv7: reduce 4 -> 2 lanes with vmax, then compare the two lanes in scalar code
            float32x2_t _aa = vmax_f32(vget_low_f32(_absmax0), vget_high_f32(_absmax0));
            absmax = std::max(absmax, std::max(vget_lane_f32(_aa, 0), vget_lane_f32(_aa, 1)));
#endif
#endif // __ARM_NEON
            // scalar tail, this is also the whole loop when NEON is not available
            for (; kk < K; kk++)
            {
                absmax = std::max(absmax, (float)fabsf(float16_to_float32(p0[0])));
                p0++;
            }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

            ps[0] = 127.f / absmax;
            pods[0] = absmax / v127_B_scale;
            ps++;
            pods++;
        }
    }
}

static void pack_A_tile_fp16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        pack_A_tile_fp16_to_int8_i8mm(A, AT, i, max_ii, k, max_kk, scales);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        pack_A_tile_fp16_to_int8_asimddp(A, AT, i, max_ii, k, max_kk, scales);
        return;
    }
#endif

    const int elempack = A.elempack;
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;

    // NCNN_LOGE("pack_A_tile_fp16_to_int8 %d %d", max_ii, elempack);

    signed char* pp = AT;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k * elempack;

        float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii);
        float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4);

#if __aarch64__
        if (elempack == 8)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                uint16x8_t _t = vld1q_u16(p0 + 32);
                uint16x8_t _u = vld1q_u16(p0 + 40);
                uint16x8_t _v = vld1q_u16(p0 + 48);
                uint16x8_t _w = vld1q_u16(p0 + 56);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));
                float32x4_t _p8 = vcvt_f32_f16((float16x4_t)vget_low_u16(_t));
                float32x4_t _p9 = vcvt_f32_f16((float16x4_t)vget_high_u16(_t));
                float32x4_t _pa = vcvt_f32_f16((float16x4_t)vget_low_u16(_u));
                float32x4_t _pb = vcvt_f32_f16((float16x4_t)vget_high_u16(_u));
                float32x4_t _pc = vcvt_f32_f16((float16x4_t)vget_low_u16(_v));
                float32x4_t _pd = vcvt_f32_f16((float16x4_t)vget_high_u16(_v));
                float32x4_t _pe = vcvt_f32_f16((float16x4_t)vget_low_u16(_w));
                float32x4_t _pf = vcvt_f32_f16((float16x4_t)vget_high_u16(_w));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale1);
                _p4 = vmulq_f32(_p4, _scale0);
                _p5 = vmulq_f32(_p5, _scale1);
                _p6 = vmulq_f32(_p6, _scale0);
                _p7 = vmulq_f32(_p7, _scale1);
                _p8 = vmulq_f32(_p8, _scale0);
                _p9 = vmulq_f32(_p9, _scale1);
                _pa = vmulq_f32(_pa, _scale0);
                _pb = vmulq_f32(_pb, _scale1);
                _pc = vmulq_f32(_pc, _scale0);
                _pd = vmulq_f32(_pd, _scale1);
                _pe = vmulq_f32(_pe, _scale0);
                _pf = vmulq_f32(_pf, _scale1);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _p04 = vzip_s8(float2int8(_p0, _p1), float2int8(_p8, _p9));
                int8x8x2_t _p15 = vzip_s8(float2int8(_p2, _p3), float2int8(_pa, _pb));
                int8x8x2_t _p26 = vzip_s8(float2int8(_p4, _p5), float2int8(_pc, _pd));
                int8x8x2_t _p37 = vzip_s8(float2int8(_p6, _p7), float2int8(_pe, _pf));

                int8x16x4_t _rr;
                _rr.val[0] = vcombine_s8(_p04.val[0], _p04.val[1]);
                _rr.val[1] = vcombine_s8(_p15.val[0], _p15.val[1]);
                _rr.val[2] = vcombine_s8(_p26.val[0], _p26.val[1]);
                _rr.val[3] = vcombine_s8(_p37.val[0], _p37.val[1]);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x16x4_t _rr;
                _rr.val[0] = vcombine_s8(float2int8(_p0, _p1), float2int8(_p8, _p9));
                _rr.val[1] = vcombine_s8(float2int8(_p2, _p3), float2int8(_pa, _pb));
                _rr.val[2] = vcombine_s8(float2int8(_p4, _p5), float2int8(_pc, _pd));
                _rr.val[3] = vcombine_s8(float2int8(_p6, _p7), float2int8(_pe, _pf));
#endif // __ARM_FEATURE_MATMUL_INT8

                vst4q_s8(pp, _rr);
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p1), float2int8(_p4, _p5));
                _r01.val[1] = vcombine_s8(float2int8(_p2, _p3), float2int8(_p6, _p7));
                int8x16x2_t _r23;
                _r23.val[0] = vcombine_s8(float2int8(_p8, _p9), float2int8(_pc, _pd));
                _r23.val[1] = vcombine_s8(float2int8(_pa, _pb), float2int8(_pe, _pf));

                vst2q_s8(pp, _r01);
                vst2q_s8(pp + 32, _r23);
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += 64;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale1);
                _p4 = vmulq_f32(_p4, _scale0);
                _p5 = vmulq_f32(_p5, _scale1);
                _p6 = vmulq_f32(_p6, _scale0);
                _p7 = vmulq_f32(_p7, _scale1);

#if __ARM_FEATURE_DOTPROD
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p0, _p1);
                _r0123.val[1] = float2int8(_p2, _p3);
                _r0123.val[2] = float2int8(_p4, _p5);
                _r0123.val[3] = float2int8(_p6, _p7);

                vst4_s8(pp, _r0123);
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p1), float2int8(_p4, _p5));
                _r01.val[1] = vcombine_s8(float2int8(_p2, _p3), float2int8(_p6, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += 32;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p01 = vld1q_u16(p0);
                uint16x8_t _p23 = vld1q_u16(p0 + 8);

                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p01));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p01));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p23));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p23));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale1);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p1);
                _r01.val[1] = float2int8(_p2, _p3);

                vst2_s8(pp, _r01);

                pp += 16;
                p0 += 16;
            }
            for (; kk < max_kk; kk++)
            {
                uint16x8_t _p01 = vld1q_u16(p0);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p01));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p01));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += 8;
            }
        }
#endif // __aarch64__
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x8x4_t _p = vld4q_u16(p0);
                uint16x8x4_t _q = vld4q_u16(p0 + A_hstep * 4);

                float32x4_t _p0 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_p.val[0])), _scale0, 0);
                float32x4_t _p1 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_p.val[1])), _scale0, 1);
                float32x4_t _p2 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_p.val[2])), _scale0, 2);
                float32x4_t _p3 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_p.val[3])), _scale0, 3);
                float32x4_t _p4 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_p.val[0])), _scale0, 0);
                float32x4_t _p5 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_p.val[1])), _scale0, 1);
                float32x4_t _p6 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_p.val[2])), _scale0, 2);
                float32x4_t _p7 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_p.val[3])), _scale0, 3);
                float32x4_t _p8 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_q.val[0])), _scale1, 0);
                float32x4_t _p9 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_q.val[1])), _scale1, 1);
                float32x4_t _pa = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_q.val[2])), _scale1, 2);
                float32x4_t _pb = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_q.val[3])), _scale1, 3);
                float32x4_t _pc = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_q.val[0])), _scale1, 0);
                float32x4_t _pd = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_q.val[1])), _scale1, 1);
                float32x4_t _pe = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_q.val[2])), _scale1, 2);
                float32x4_t _pf = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_q.val[3])), _scale1, 3);

#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
                int8x8_t _r4 = float2int8(_p8, _pc);
                int8x8_t _r5 = float2int8(_p9, _pd);
                int8x8_t _r6 = float2int8(_pa, _pe);
                int8x8_t _r7 = float2int8(_pb, _pf);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p8, _p9);
                int8x8_t _r3 = float2int8(_pa, _pb);
                int8x8_t _r4 = float2int8(_p4, _p5);
                int8x8_t _r5 = float2int8(_p6, _p7);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);
#endif // __ARM_FEATURE_MATMUL_INT8

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                uint16x8_t _t = vld1q_u16(p0 + A_hstep * 4);
                uint16x8_t _u = vld1q_u16(p0 + A_hstep * 4 + 8);
                uint16x8_t _v = vld1q_u16(p0 + A_hstep * 4 + 16);
                uint16x8_t _w = vld1q_u16(p0 + A_hstep * 4 + 24);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));
                float32x4_t _p8 = vcvt_f32_f16((float16x4_t)vget_low_u16(_t));
                float32x4_t _p9 = vcvt_f32_f16((float16x4_t)vget_high_u16(_t));
                float32x4_t _pa = vcvt_f32_f16((float16x4_t)vget_low_u16(_u));
                float32x4_t _pb = vcvt_f32_f16((float16x4_t)vget_high_u16(_u));
                float32x4_t _pc = vcvt_f32_f16((float16x4_t)vget_low_u16(_v));
                float32x4_t _pd = vcvt_f32_f16((float16x4_t)vget_high_u16(_v));
                float32x4_t _pe = vcvt_f32_f16((float16x4_t)vget_low_u16(_w));
                float32x4_t _pf = vcvt_f32_f16((float16x4_t)vget_high_u16(_w));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale0);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale0);
                _p4 = vmulq_f32(_p4, _scale0);
                _p5 = vmulq_f32(_p5, _scale0);
                _p6 = vmulq_f32(_p6, _scale0);
                _p7 = vmulq_f32(_p7, _scale0);
                _p8 = vmulq_f32(_p8, _scale1);
                _p9 = vmulq_f32(_p9, _scale1);
                _pa = vmulq_f32(_pa, _scale1);
                _pb = vmulq_f32(_pb, _scale1);
                _pc = vmulq_f32(_pc, _scale1);
                _pd = vmulq_f32(_pd, _scale1);
                _pe = vmulq_f32(_pe, _scale1);
                _pf = vmulq_f32(_pf, _scale1);

                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p8), float2int8(_p2, _pa));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p9), float2int8(_p3, _pb));
                int8x16x2_t _r23;
                _r23.val[0] = vcombine_s8(float2int8(_p4, _pc), float2int8(_p6, _pe));
                _r23.val[1] = vcombine_s8(float2int8(_p5, _pd), float2int8(_p7, _pf));

                vst2q_s8(pp, _r01);
                vst2q_s8(pp + 32, _r23);
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += 32;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x4x4_t _p = vld4_u16(p0);
                uint16x4x4_t _q = vld4_u16(p0 + A_hstep * 4);

                float32x4_t _p0 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)_p.val[0]), _scale0, 0);
                float32x4_t _p1 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)_p.val[1]), _scale0, 1);
                float32x4_t _p2 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)_p.val[2]), _scale0, 2);
                float32x4_t _p3 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)_p.val[3]), _scale0, 3);
                float32x4_t _p4 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)_q.val[0]), _scale1, 0);
                float32x4_t _p5 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)_q.val[1]), _scale1, 1);
                float32x4_t _p6 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)_q.val[2]), _scale1, 2);
                float32x4_t _p7 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)_q.val[3]), _scale1, 3);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + A_hstep * 4);
                uint16x8_t _s = vld1q_u16(p0 + A_hstep * 4 + 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale0);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale0);
                _p4 = vmulq_f32(_p4, _scale1);
                _p5 = vmulq_f32(_p5, _scale1);
                _p6 = vmulq_f32(_p6, _scale1);
                _p7 = vmulq_f32(_p7, _scale1);

                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p4), float2int8(_p2, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p5), float2int8(_p3, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += 16;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + A_hstep * 4);

                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p0n = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p1n = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale0);
                _p0n = vmulq_f32(_p0n, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p1n = vmulq_f32(_p1n, _scale1);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p1);
                _r01.val[1] = float2int8(_p0n, _p1n);

                vst2_s8(pp, _r01);

                pp += 16;
                p0 += 8;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 4));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + A_hstep);
                uint16x8_t _r = vld1q_u16(p0 + A_hstep * 2);
                uint16x8_t _s = vld1q_u16(p0 + A_hstep * 3);
                uint16x8_t _t = vld1q_u16(p0 + A_hstep * 4);
                uint16x8_t _u = vld1q_u16(p0 + A_hstep * 5);
                uint16x8_t _v = vld1q_u16(p0 + A_hstep * 6);
                uint16x8_t _w = vld1q_u16(p0 + A_hstep * 7);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));
                float32x4_t _p8 = vcvt_f32_f16((float16x4_t)vget_low_u16(_t));
                float32x4_t _p9 = vcvt_f32_f16((float16x4_t)vget_high_u16(_t));
                float32x4_t _pa = vcvt_f32_f16((float16x4_t)vget_low_u16(_u));
                float32x4_t _pb = vcvt_f32_f16((float16x4_t)vget_high_u16(_u));
                float32x4_t _pc = vcvt_f32_f16((float16x4_t)vget_low_u16(_v));
                float32x4_t _pd = vcvt_f32_f16((float16x4_t)vget_high_u16(_v));
                float32x4_t _pe = vcvt_f32_f16((float16x4_t)vget_low_u16(_w));
                float32x4_t _pf = vcvt_f32_f16((float16x4_t)vget_high_u16(_w));

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale0, 0);
                _p2 = vmulq_laneq_f32(_p2, _scale0, 1);
                _p3 = vmulq_laneq_f32(_p3, _scale0, 1);
                _p4 = vmulq_laneq_f32(_p4, _scale0, 2);
                _p5 = vmulq_laneq_f32(_p5, _scale0, 2);
                _p6 = vmulq_laneq_f32(_p6, _scale0, 3);
                _p7 = vmulq_laneq_f32(_p7, _scale0, 3);
                _p8 = vmulq_laneq_f32(_p8, _scale1, 0);
                _p9 = vmulq_laneq_f32(_p9, _scale1, 0);
                _pa = vmulq_laneq_f32(_pa, _scale1, 1);
                _pb = vmulq_laneq_f32(_pb, _scale1, 1);
                _pc = vmulq_laneq_f32(_pc, _scale1, 2);
                _pd = vmulq_laneq_f32(_pd, _scale1, 2);
                _pe = vmulq_laneq_f32(_pe, _scale1, 3);
                _pf = vmulq_laneq_f32(_pf, _scale1, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 0);
                _p2 = vmulq_lane_f32(_p2, vget_low_f32(_scale0), 1);
                _p3 = vmulq_lane_f32(_p3, vget_low_f32(_scale0), 1);
                _p4 = vmulq_lane_f32(_p4, vget_high_f32(_scale0), 0);
                _p5 = vmulq_lane_f32(_p5, vget_high_f32(_scale0), 0);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale0), 1);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale0), 1);
                _p8 = vmulq_lane_f32(_p8, vget_low_f32(_scale1), 0);
                _p9 = vmulq_lane_f32(_p9, vget_low_f32(_scale1), 0);
                _pa = vmulq_lane_f32(_pa, vget_low_f32(_scale1), 1);
                _pb = vmulq_lane_f32(_pb, vget_low_f32(_scale1), 1);
                _pc = vmulq_lane_f32(_pc, vget_high_f32(_scale1), 0);
                _pd = vmulq_lane_f32(_pd, vget_high_f32(_scale1), 0);
                _pe = vmulq_lane_f32(_pe, vget_high_f32(_scale1), 1);
                _pf = vmulq_lane_f32(_pf, vget_high_f32(_scale1), 1);
#endif

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p8, _pa);
                int8x8_t _r3 = float2int8(_pc, _pe);
                int8x8_t _r4 = float2int8(_p1, _p3);
                int8x8_t _r5 = float2int8(_p5, _p7);
                int8x8_t _r6 = float2int8(_p9, _pb);
                int8x8_t _r7 = float2int8(_pd, _pf);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p4, _p6));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p8, _pa));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_pc, _pe));
                int16x4_t _t4 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4_t _t5 = vreinterpret_s16_s8(float2int8(_p5, _p7));
                int16x4_t _t6 = vreinterpret_s16_s8(float2int8(_p9, _pb));
                int16x4_t _t7 = vreinterpret_s16_s8(float2int8(_pd, _pf));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int16x4x2_t _t45 = vuzp_s16(_t4, _t5);
                int16x4x2_t _t67 = vuzp_s16(_t6, _t7);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
                int8x8_t _r4 = vreinterpret_s8_s16(_t45.val[0]);
                int8x8_t _r5 = vreinterpret_s8_s16(_t67.val[0]);
                int8x8_t _r6 = vreinterpret_s8_s16(_t45.val[1]);
                int8x8_t _r7 = vreinterpret_s8_s16(_t67.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));

                pp += 64;
                p0 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 2));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 3));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 4));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 5));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 6));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 7));

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale0, 3);
                _p4 = vmulq_laneq_f32(_p4, _scale1, 0);
                _p5 = vmulq_laneq_f32(_p5, _scale1, 1);
                _p6 = vmulq_laneq_f32(_p6, _scale1, 2);
                _p7 = vmulq_laneq_f32(_p7, _scale1, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
                _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
                _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
#endif

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p4, _p5));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p6, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep + 1], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 2 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 3 + 1], _p, 7);
                uint16x8_t _q = uint16x8_t();
                _q = vsetq_lane_u16(p0[A_hstep * 4], _q, 0);
                _q = vsetq_lane_u16(p0[A_hstep * 4 + 1], _q, 1);
                _q = vsetq_lane_u16(p0[A_hstep * 5], _q, 2);
                _q = vsetq_lane_u16(p0[A_hstep * 5 + 1], _q, 3);
                _q = vsetq_lane_u16(p0[A_hstep * 6], _q, 4);
                _q = vsetq_lane_u16(p0[A_hstep * 6 + 1], _q, 5);
                _q = vsetq_lane_u16(p0[A_hstep * 7], _q, 6);
                _q = vsetq_lane_u16(p0[A_hstep * 7 + 1], _q, 7);
                float32x4_t _p01 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p23 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p45 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p67 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                float32x4x2_t _scale01 = vzipq_f32(_scale0, _scale0);
                float32x4x2_t _scale23 = vzipq_f32(_scale1, _scale1);

                _p01 = vmulq_f32(_p01, _scale01.val[0]);
                _p23 = vmulq_f32(_p23, _scale01.val[1]);
                _p45 = vmulq_f32(_p45, _scale23.val[0]);
                _p67 = vmulq_f32(_p67, _scale23.val[1]);

                int8x8_t _r0 = float2int8(_p01, _p23);
                int8x8_t _r1 = float2int8(_p45, _p67);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 2;
            }
            for (; kk < max_kk; kk++)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 5], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 7], _p, 7);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0++;
            }
        }
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k * elempack;

        float32x4_t _scale = vld1q_f32((const float*)scales + i + ii);

        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x8x4_t _p = vld4q_u16(p0);

                float32x4_t _p0 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_p.val[0])), _scale, 0);
                float32x4_t _p1 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_p.val[1])), _scale, 1);
                float32x4_t _p2 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_p.val[2])), _scale, 2);
                float32x4_t _p3 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_p.val[3])), _scale, 3);
                float32x4_t _p4 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_p.val[0])), _scale, 0);
                float32x4_t _p5 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_p.val[1])), _scale, 1);
                float32x4_t _p6 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_p.val[2])), _scale, 2);
                float32x4_t _p7 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_p.val[3])), _scale, 3);

#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p2), float2int8(_p4, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p3), float2int8(_p5, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += 32;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x4x4_t _p = vld4_u16(p0);

                float32x4_t _p0 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)_p.val[0]), _scale, 0);
                float32x4_t _p1 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)_p.val[1]), _scale, 1);
                float32x4_t _p2 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)_p.val[2]), _scale, 2);
                float32x4_t _p3 = vmulq_laneq_f32(vcvt_f32_f16((float16x4_t)_p.val[3]), _scale, 3);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p2);
                _r01.val[1] = float2int8(_p1, _p3);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += 16;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                float32x4x2_t _p01 = vzipq_f32(_p0, _p1);

                int8x8_t _r01 = float2int8(_p01.val[0], _p01.val[1]);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += 8;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                _p0 = vmulq_f32(_p0, _scale);
                int8x8_t _r0 = float2int8(_p0, _p0);

                pp[0] = vget_lane_s8(_r0, 0);
                pp[1] = vget_lane_s8(_r0, 1);
                pp[2] = vget_lane_s8(_r0, 2);
                pp[3] = vget_lane_s8(_r0, 3);

                pp += 4;
                p0 += 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + A_hstep);
                uint16x8_t _r = vld1q_u16(p0 + A_hstep * 2);
                uint16x8_t _s = vld1q_u16(p0 + A_hstep * 3);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale, 0);
                _p2 = vmulq_laneq_f32(_p2, _scale, 1);
                _p3 = vmulq_laneq_f32(_p3, _scale, 1);
                _p4 = vmulq_laneq_f32(_p4, _scale, 2);
                _p5 = vmulq_laneq_f32(_p5, _scale, 2);
                _p6 = vmulq_laneq_f32(_p6, _scale, 3);
                _p7 = vmulq_laneq_f32(_p7, _scale, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 0);
                _p2 = vmulq_lane_f32(_p2, vget_low_f32(_scale), 1);
                _p3 = vmulq_lane_f32(_p3, vget_low_f32(_scale), 1);
                _p4 = vmulq_lane_f32(_p4, vget_high_f32(_scale), 0);
                _p5 = vmulq_lane_f32(_p5, vget_high_f32(_scale), 0);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale), 1);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale), 1);
#endif

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p1, _p3);
                int8x8_t _r3 = float2int8(_p5, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p4, _p6));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p5, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 2));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 3));

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
#endif

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep + 1], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 2 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 3 + 1], _p, 7);
                float32x4_t _p01 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p23 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                float32x4x2_t _scale01 = vzipq_f32(_scale, _scale);

                _p01 = vmulq_f32(_p01, _scale01.val[0]);
                _p23 = vmulq_f32(_p23, _scale01.val[1]);

                int8x8_t _r0 = float2int8(_p01, _p23);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 2;
            }
            for (; kk < max_kk; kk++)
            {
                uint16x4_t _p = uint16x4_t();
                _p = vset_lane_u16(p0[0], _p, 0);
                _p = vset_lane_u16(p0[A_hstep], _p, 1);
                _p = vset_lane_u16(p0[A_hstep * 2], _p, 2);
                _p = vset_lane_u16(p0[A_hstep * 3], _p, 3);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)_p);

                _p0 = vmulq_f32(_p0, _scale);
                int8x8_t _r0 = float2int8(_p0, _p0);

                pp[0] = vget_lane_s8(_r0, 0);
                pp[1] = vget_lane_s8(_r0, 1);
                pp[2] = vget_lane_s8(_r0, 2);
                pp[3] = vget_lane_s8(_r0, 3);

                pp += 4;
                p0++;
            }
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k;

        const float scale0 = scales[i + ii];
        const float scale1 = scales[i + ii + 1];

        // if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            float32x4_t _scale0 = vdupq_n_f32(scale0);
            float32x4_t _scale1 = vdupq_n_f32(scale1);
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + A_hstep);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale0);
                _p2 = vmulq_f32(_p2, _scale1);
                _p3 = vmulq_f32(_p3, _scale1);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p1, _p3);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p2));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p2));
                float32x4_t _t2 = vcombine_f32(vget_low_f32(_p1), vget_low_f32(_p3));
                float32x4_t _t3 = vcombine_f32(vget_high_f32(_p1), vget_high_f32(_p3));
                int8x8_t _r0 = float2int8(_t0, _t1);
                int8x8_t _r1 = float2int8(_t2, _t3);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r0);
                vst1_s8(pp + 8, _r1);

                pp += 16;
                p0 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p1));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p1));
                int8x8_t _r0 = float2int8(_t0, _t1);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = float2int8(float16_to_float32(p0[0]) * scale0);
                pp[1] = float2int8(float16_to_float32(p0[1]) * scale0);
                pp[2] = float2int8(float16_to_float32(p0[A_hstep]) * scale1);
                pp[3] = float2int8(float16_to_float32(p0[A_hstep + 1]) * scale1);
                pp += 4;
                p0 += 2;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(float16_to_float32(p0[0]) * scale0);
                pp[1] = float2int8(float16_to_float32(p0[A_hstep]) * scale1);
                pp += 2;
                p0++;
            }
        }
    }
    for (; ii < max_ii; ii += 1)
    {
        const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k;

        const float scale = scales[i + ii];

        // if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            float32x4_t _scale = vdupq_n_f32(scale);
            for (; kk + 15 < max_kk; kk += 16)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 8;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(float16_to_float32(p0[0]) * scale);
                pp += 1;
                p0++;
            }
        }
    }
}

// Compute per-position int8 quantization scales for a tile of the fp16 matrix A,
// reducing along the A_hstep (K) direction.
//
// For each of the max_ii positions starting at index i, the maximum absolute
// value (absmax) over all K steps is gathered, then two floats are written:
//   scales[i + ii]       = 127 / absmax
//   out_descales[i + ii] = absmax / (127 * B_scale)
//
// Parameters
//   A            fp16 input; its elempack selects the code path below
//                (8 only in fp16-arithmetic builds, 4 only in NEON builds, 1 always)
//   scales       output, written as float starting at element i
//   B_scale      scale of the other operand, folded into the output descale
//   out_descales output, written as float starting at element i
//   i, max_ii    first position and number of positions to process
//
// With elempack 8 or 4 every packed lane of a position is folded into the same
// absmax, i.e. the packed lanes belong to the reduction dimension.
//
// NOTE(review): nothing here guards absmax == 0; an all-zero position makes
// 127.f / absmax a division by zero. Confirm that downstream packing tolerates
// the resulting scale.
// NOTE(review): when elempack is 8 and this file is built without
// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC, no branch below matches and nothing is
// written; presumably callers never request that combination - confirm.
static void transpose_compute_A_tile_fp16_int8_scales(const Mat& A, Mat& scales, float B_scale, Mat& out_descales, int i, int max_ii)
{
    // Runtime dispatch: if this translation unit was built without fp16 vector
    // arithmetic but the CPU reports asimdhp support, forward to the _asimdhp variant.
#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (ncnn::cpu_support_arm_asimdhp())
    {
        transpose_compute_A_tile_fp16_int8_scales_asimdhp(A, scales, B_scale, out_descales, i, max_ii);
        return;
    }
#endif

    const int elempack = A.elempack;
    // Stride (in packed elements) between consecutive K steps, and the number of K steps.
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;
    const int K = A.dims == 3 ? A.c : A.h;

    // NCNN_LOGE("transpose_compute_A_tile_fp16_int8_scales %d %d", max_ii, elempack);

    // Denominator of every output descale, hoisted out of the loops.
    const float v127_B_scale = 127.f * B_scale;

#if __ARM_NEON
#if __aarch64__
    // Vector constants are only needed on aarch64, where vdivq_f32 is used.
    float32x4_t _v127 = vdupq_n_f32(127.f);
    float32x4_t _v127_B_scale = vdupq_n_f32(v127_B_scale);
#endif
#endif

    // Output cursors, advanced as positions are processed.
    float* ps = (float*)scales + i;
    float* pods = (float*)out_descales + i;

#if __ARM_NEON
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (elempack == 8)
    {
        int ii = 0;
        // Two positions per iteration; each position holds 8 contiguous fp16 lanes per K step.
        for (; ii + 1 < max_ii; ii += 2)
        {
            const __fp16* p0 = (const __fp16*)A + (i + ii) * 8;

            // _absmax0/_absmax2 accumulate position ii, _absmax1/_absmax3 accumulate position ii+1.
            float16x8_t _absmax0 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _absmax1 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _absmax2 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _absmax3 = vdupq_n_f16((__fp16)0.f);
            int kk = 0;
            // Two K steps per iteration.
            for (; kk + 1 < K; kk += 2)
            {
                float16x8_t _p0 = vld1q_f16(p0);
                float16x8_t _p1 = vld1q_f16(p0 + 8);
                float16x8_t _p2 = vld1q_f16(p0 + A_hstep * 8);
                float16x8_t _p3 = vld1q_f16(p0 + A_hstep * 8 + 8);
                _absmax0 = vmaxq_f16(_absmax0, vabsq_f16(_p0));
                _absmax1 = vmaxq_f16(_absmax1, vabsq_f16(_p1));
                _absmax2 = vmaxq_f16(_absmax2, vabsq_f16(_p2));
                _absmax3 = vmaxq_f16(_absmax3, vabsq_f16(_p3));
                p0 += A_hstep * 16;
            }
            // Merge the accumulators of the second K step into the first.
            _absmax0 = vmaxq_f16(_absmax0, _absmax2);
            _absmax1 = vmaxq_f16(_absmax1, _absmax3);
            // Remaining single K step.
            for (; kk < K; kk++)
            {
                float16x8_t _p0 = vld1q_f16(p0);
                float16x8_t _p1 = vld1q_f16(p0 + 8);
                _absmax0 = vmaxq_f16(_absmax0, vabsq_f16(_p0));
                _absmax1 = vmaxq_f16(_absmax1, vabsq_f16(_p1));
                p0 += A_hstep * 8;
            }
            // Horizontal max across the 8 lanes of each position.
            float absmax0 = (float)vmaxvq_f16(_absmax0);
            float absmax1 = (float)vmaxvq_f16(_absmax1);

            ps[0] = 127.f / absmax0;
            ps[1] = 127.f / absmax1;
            pods[0] = absmax0 / v127_B_scale;
            pods[1] = absmax1 / v127_B_scale;
            ps += 2;
            pods += 2;
        }
        // Leftover single position: unroll over K instead (4, then 2, then 1 step).
        for (; ii < max_ii; ii++)
        {
            const __fp16* p0 = (const __fp16*)A + (i + ii) * 8;

            float16x8_t _absmax0 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _absmax1 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _absmax2 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _absmax3 = vdupq_n_f16((__fp16)0.f);
            int kk = 0;
            for (; kk + 3 < K; kk += 4)
            {
                float16x8_t _p0 = vld1q_f16(p0);
                float16x8_t _p1 = vld1q_f16(p0 + A_hstep * 8);
                float16x8_t _p2 = vld1q_f16(p0 + A_hstep * 16);
                float16x8_t _p3 = vld1q_f16(p0 + A_hstep * 24);
                _absmax0 = vmaxq_f16(_absmax0, vabsq_f16(_p0));
                _absmax1 = vmaxq_f16(_absmax1, vabsq_f16(_p1));
                _absmax2 = vmaxq_f16(_absmax2, vabsq_f16(_p2));
                _absmax3 = vmaxq_f16(_absmax3, vabsq_f16(_p3));
                p0 += A_hstep * 32;
            }
            // Fold four accumulators down to two before the 2-step loop.
            _absmax0 = vmaxq_f16(_absmax0, _absmax2);
            _absmax1 = vmaxq_f16(_absmax1, _absmax3);
            for (; kk + 1 < K; kk += 2)
            {
                float16x8_t _p0 = vld1q_f16(p0);
                float16x8_t _p1 = vld1q_f16(p0 + A_hstep * 8);
                _absmax0 = vmaxq_f16(_absmax0, vabsq_f16(_p0));
                _absmax1 = vmaxq_f16(_absmax1, vabsq_f16(_p1));
                p0 += A_hstep * 16;
            }
            // Fold two accumulators down to one before the 1-step loop.
            _absmax0 = vmaxq_f16(_absmax0, _absmax1);
            for (; kk < K; kk++)
            {
                float16x8_t _p = vld1q_f16(p0);
                _absmax0 = vmaxq_f16(_absmax0, vabsq_f16(_p));
                p0 += A_hstep * 8;
            }
            float absmax = (float)vmaxvq_f16(_absmax0);

            ps[0] = 127.f / absmax;
            pods[0] = absmax / v127_B_scale;
            ps++;
            pods++;
        }
    }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (elempack == 4)
    {
        int ii = 0;
        // Four positions per iteration; each position holds 4 contiguous fp16 lanes per K step.
        for (; ii + 3 < max_ii; ii += 4)
        {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            const __fp16* p0 = (const __fp16*)A + (i + ii) * 4;

            // Each 8-lane vector covers two positions:
            // _absmax0/_absmax2 -> positions ii, ii+1; _absmax1/_absmax3 -> positions ii+2, ii+3.
            float16x8_t _absmax0 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _absmax1 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _absmax2 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _absmax3 = vdupq_n_f16((__fp16)0.f);
            int kk = 0;
            for (; kk + 1 < K; kk += 2)
            {
                float16x8_t _p0 = vld1q_f16(p0);
                float16x8_t _p1 = vld1q_f16(p0 + 8);
                float16x8_t _p2 = vld1q_f16(p0 + A_hstep * 4);
                float16x8_t _p3 = vld1q_f16(p0 + A_hstep * 4 + 8);
                _absmax0 = vmaxq_f16(_absmax0, vabsq_f16(_p0));
                _absmax1 = vmaxq_f16(_absmax1, vabsq_f16(_p1));
                _absmax2 = vmaxq_f16(_absmax2, vabsq_f16(_p2));
                _absmax3 = vmaxq_f16(_absmax3, vabsq_f16(_p3));
                p0 += A_hstep * 8;
            }
            _absmax0 = vmaxq_f16(_absmax0, _absmax2);
            _absmax1 = vmaxq_f16(_absmax1, _absmax3);
            for (; kk < K; kk++)
            {
                float16x8_t _p0 = vld1q_f16(p0);
                float16x8_t _p1 = vld1q_f16(p0 + 8);
                _absmax0 = vmaxq_f16(_absmax0, vabsq_f16(_p0));
                _absmax1 = vmaxq_f16(_absmax1, vabsq_f16(_p1));
                p0 += A_hstep * 4;
            }
            // Two rounds of pairwise max reduce each group of 4 lanes to one value,
            // leaving the four per-position maxima in order, then widen to fp32.
            float16x8_t _aa0123 = vpmaxq_f16(_absmax0, _absmax1);
            float32x4_t _absmax = vcvt_f32_f16(vpmax_f16(vget_low_f16(_aa0123), vget_high_f16(_aa0123)));
#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            // No fp16 arithmetic: load raw fp16 bits and widen to fp32 before comparing.
            const unsigned short* p0 = (const unsigned short*)A + (i + ii) * 4;

            // One fp32 accumulator per position (ii .. ii+3).
            float32x4_t _absmax0 = vdupq_n_f32(0.f);
            float32x4_t _absmax1 = vdupq_n_f32(0.f);
            float32x4_t _absmax2 = vdupq_n_f32(0.f);
            float32x4_t _absmax3 = vdupq_n_f32(0.f);
            for (int kk = 0; kk < K; kk++)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                _absmax0 = vmaxq_f32(_absmax0, vabsq_f32(_p0));
                _absmax1 = vmaxq_f32(_absmax1, vabsq_f32(_p1));
                _absmax2 = vmaxq_f32(_absmax2, vabsq_f32(_p2));
                _absmax3 = vmaxq_f32(_absmax3, vabsq_f32(_p3));
                p0 += A_hstep * 4;
            }
            // Reduce each accumulator to a scalar and pack the four results into one vector.
#if __aarch64__
            float32x4_t _aa01 = vpmaxq_f32(_absmax0, _absmax1);
            float32x4_t _aa23 = vpmaxq_f32(_absmax2, _absmax3);
            float32x4_t _absmax = vpmaxq_f32(_aa01, _aa23);
#else
            float32x2_t _aa0 = vmax_f32(vget_low_f32(_absmax0), vget_high_f32(_absmax0));
            float32x2_t _aa1 = vmax_f32(vget_low_f32(_absmax1), vget_high_f32(_absmax1));
            float32x2_t _aa2 = vmax_f32(vget_low_f32(_absmax2), vget_high_f32(_absmax2));
            float32x2_t _aa3 = vmax_f32(vget_low_f32(_absmax3), vget_high_f32(_absmax3));
            float32x2_t _aa01 = vpmax_f32(_aa0, _aa1);
            float32x2_t _aa23 = vpmax_f32(_aa2, _aa3);
            float32x4_t _absmax = vcombine_f32(_aa01, _aa23);
#endif
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if __aarch64__
            // aarch64 has a vector divide; compute all four scales/descales at once.
            float32x4_t _scale = vdivq_f32(_v127, _absmax);
            float32x4_t _out_descale = vdivq_f32(_absmax, _v127_B_scale);

            vst1q_f32(ps, _scale);
            vst1q_f32(pods, _out_descale);
#else
            // armv7 path: spill to a temporary and use scalar division.
            float tmp[4];
            vst1q_f32(tmp, _absmax);

            ps[0] = 127.f / tmp[0];
            ps[1] = 127.f / tmp[1];
            ps[2] = 127.f / tmp[2];
            ps[3] = 127.f / tmp[3];

            pods[0] = tmp[0] / v127_B_scale;
            pods[1] = tmp[1] / v127_B_scale;
            pods[2] = tmp[2] / v127_B_scale;
            pods[3] = tmp[3] / v127_B_scale;

            // float32x4_t _recp_absmax = vrecpeq_f32(_absmax);
            // _recp_absmax = vmulq_f32(vrecpsq_f32(_absmax, _recp_absmax), _recp_absmax);
            // _recp_absmax = vmulq_f32(vrecpsq_f32(_absmax, _recp_absmax), _recp_absmax);
            // _recp_absmax = vmulq_f32(vrecpsq_f32(_absmax, _recp_absmax), _recp_absmax);
            // float32x4_t _scale = vmulq_f32(_v127, _recp_absmax);
            // float32x4_t _out_descale = vmulq_f32(_absmax, _recp_v127_B_scale);
#endif

            ps += 4;
            pods += 4;
        }
        // Leftover single position with 4 lanes per K step.
        for (; ii < max_ii; ii++)
        {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            const __fp16* p0 = (const __fp16*)A + (i + ii) * 4;

            // Two K steps are combined into each 8-lane vector; unroll 8, 4, 2, then 1 step.
            float16x8_t _amax0 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _amax1 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _amax2 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _amax3 = vdupq_n_f16((__fp16)0.f);
            int kk = 0;
            for (; kk + 7 < K; kk += 8)
            {
                float16x4_t _p0 = vld1_f16(p0);
                float16x4_t _p1 = vld1_f16(p0 + A_hstep * 4);
                float16x4_t _p2 = vld1_f16(p0 + A_hstep * 8);
                float16x4_t _p3 = vld1_f16(p0 + A_hstep * 12);
                float16x4_t _p4 = vld1_f16(p0 + A_hstep * 16);
                float16x4_t _p5 = vld1_f16(p0 + A_hstep * 20);
                float16x4_t _p6 = vld1_f16(p0 + A_hstep * 24);
                float16x4_t _p7 = vld1_f16(p0 + A_hstep * 28);
                _amax0 = vmaxq_f16(_amax0, vabsq_f16(vcombine_f16(_p0, _p1)));
                _amax1 = vmaxq_f16(_amax1, vabsq_f16(vcombine_f16(_p2, _p3)));
                _amax2 = vmaxq_f16(_amax2, vabsq_f16(vcombine_f16(_p4, _p5)));
                _amax3 = vmaxq_f16(_amax3, vabsq_f16(vcombine_f16(_p6, _p7)));
                p0 += A_hstep * 32;
            }
            _amax0 = vmaxq_f16(_amax0, _amax2);
            _amax1 = vmaxq_f16(_amax1, _amax3);
            for (; kk + 3 < K; kk += 4)
            {
                float16x4_t _p0 = vld1_f16(p0);
                float16x4_t _p1 = vld1_f16(p0 + A_hstep * 4);
                float16x4_t _p2 = vld1_f16(p0 + A_hstep * 8);
                float16x4_t _p3 = vld1_f16(p0 + A_hstep * 12);
                _amax0 = vmaxq_f16(_amax0, vabsq_f16(vcombine_f16(_p0, _p1)));
                _amax1 = vmaxq_f16(_amax1, vabsq_f16(vcombine_f16(_p2, _p3)));
                p0 += A_hstep * 16;
            }
            _amax0 = vmaxq_f16(_amax0, _amax1);
            for (; kk + 1 < K; kk += 2)
            {
                float16x4_t _p0 = vld1_f16(p0);
                float16x4_t _p1 = vld1_f16(p0 + A_hstep * 4);
                _amax0 = vmaxq_f16(_amax0, vabsq_f16(vcombine_f16(_p0, _p1)));
                p0 += A_hstep * 8;
            }
            // Narrow to 4 lanes so the final single-step loop can use 64-bit loads directly.
            float16x4_t _amax01 = vmax_f16(vget_low_f16(_amax0), vget_high_f16(_amax0));
            for (; kk < K; kk++)
            {
                float16x4_t _p = vld1_f16(p0);
                _amax01 = vmax_f16(_amax01, vabs_f16(_p));
                p0 += A_hstep * 4;
            }
            float absmax = (float)vmaxv_f16(_amax01);
#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            // No fp16 arithmetic: widen each 4-lane load to fp32; unroll 4, 2, then 1 step.
            const unsigned short* p0 = (const unsigned short*)A + (i + ii) * 4;

            float32x4_t _absmax0 = vdupq_n_f32(0.f);
            float32x4_t _absmax1 = vdupq_n_f32(0.f);
            float32x4_t _absmax2 = vdupq_n_f32(0.f);
            float32x4_t _absmax3 = vdupq_n_f32(0.f);
            int kk = 0;
            for (; kk + 3 < K; kk += 4)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 4));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 8));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 12));
                _absmax0 = vmaxq_f32(_absmax0, vabsq_f32(_p0));
                _absmax1 = vmaxq_f32(_absmax1, vabsq_f32(_p1));
                _absmax2 = vmaxq_f32(_absmax2, vabsq_f32(_p2));
                _absmax3 = vmaxq_f32(_absmax3, vabsq_f32(_p3));
                p0 += A_hstep * 16;
            }
            _absmax0 = vmaxq_f32(_absmax0, _absmax2);
            _absmax1 = vmaxq_f32(_absmax1, _absmax3);
            for (; kk + 1 < K; kk += 2)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 4));
                _absmax0 = vmaxq_f32(_absmax0, vabsq_f32(_p0));
                _absmax1 = vmaxq_f32(_absmax1, vabsq_f32(_p1));
                p0 += A_hstep * 8;
            }
            _absmax0 = vmaxq_f32(_absmax0, _absmax1);
            for (; kk < K; kk++)
            {
                float32x4_t _p = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                _absmax0 = vmaxq_f32(_absmax0, vabsq_f32(_p));
                p0 += A_hstep * 4;
            }
            // Horizontal max of the four lanes.
#if __aarch64__
            float absmax = vmaxvq_f32(_absmax0);
#else
            float32x2_t _aa = vmax_f32(vget_low_f32(_absmax0), vget_high_f32(_absmax0));
            float absmax = std::max(vget_lane_f32(_aa, 0), vget_lane_f32(_aa, 1));
#endif
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

            ps[0] = 127.f / absmax;
            pods[0] = absmax / v127_B_scale;
            ps++;
            pods++;
        }
    }
#endif // __ARM_NEON
    if (elempack == 1)
    {
        int ii = 0;
#if __ARM_NEON
        // Four adjacent positions per iteration: one 4-lane load per K step,
        // and lane j always tracks position ii + j (no horizontal reduction needed).
        for (; ii + 3 < max_ii; ii += 4)
        {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            const __fp16* p0 = (const __fp16*)A + (i + ii);

            // Two K steps are combined into each 8-lane vector; unroll 8, 4, 2, then 1 step.
            float16x8_t _amax0 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _amax1 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _amax2 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _amax3 = vdupq_n_f16((__fp16)0.f);
            int kk = 0;
            for (; kk + 7 < K; kk += 8)
            {
                float16x4_t _p0 = vld1_f16(p0);
                float16x4_t _p1 = vld1_f16(p0 + A_hstep);
                float16x4_t _p2 = vld1_f16(p0 + A_hstep * 2);
                float16x4_t _p3 = vld1_f16(p0 + A_hstep * 3);
                float16x4_t _p4 = vld1_f16(p0 + A_hstep * 4);
                float16x4_t _p5 = vld1_f16(p0 + A_hstep * 5);
                float16x4_t _p6 = vld1_f16(p0 + A_hstep * 6);
                float16x4_t _p7 = vld1_f16(p0 + A_hstep * 7);
                _amax0 = vmaxq_f16(_amax0, vabsq_f16(vcombine_f16(_p0, _p1)));
                _amax1 = vmaxq_f16(_amax1, vabsq_f16(vcombine_f16(_p2, _p3)));
                _amax2 = vmaxq_f16(_amax2, vabsq_f16(vcombine_f16(_p4, _p5)));
                _amax3 = vmaxq_f16(_amax3, vabsq_f16(vcombine_f16(_p6, _p7)));
                p0 += A_hstep * 8;
            }
            _amax0 = vmaxq_f16(_amax0, _amax2);
            _amax1 = vmaxq_f16(_amax1, _amax3);
            for (; kk + 3 < K; kk += 4)
            {
                float16x4_t _p0 = vld1_f16(p0);
                float16x4_t _p1 = vld1_f16(p0 + A_hstep);
                float16x4_t _p2 = vld1_f16(p0 + A_hstep * 2);
                float16x4_t _p3 = vld1_f16(p0 + A_hstep * 3);
                _amax0 = vmaxq_f16(_amax0, vabsq_f16(vcombine_f16(_p0, _p1)));
                _amax1 = vmaxq_f16(_amax1, vabsq_f16(vcombine_f16(_p2, _p3)));
                p0 += A_hstep * 4;
            }
            _amax0 = vmaxq_f16(_amax0, _amax1);
            for (; kk + 1 < K; kk += 2)
            {
                float16x4_t _p0 = vld1_f16(p0);
                float16x4_t _p1 = vld1_f16(p0 + A_hstep);
                _amax0 = vmaxq_f16(_amax0, vabsq_f16(vcombine_f16(_p0, _p1)));
                p0 += A_hstep * 2;
            }
            // Fold the two K-step halves together; lanes remain per-position.
            float16x4_t _amax = vmax_f16(vget_low_f16(_amax0), vget_high_f16(_amax0));
            for (; kk < K; kk++)
            {
                float16x4_t _p = vld1_f16(p0);
                _amax = vmax_f16(_amax, vabs_f16(_p));
                p0 += A_hstep;
            }
            float32x4_t _absmax0 = vcvt_f32_f16(_amax);
#else  // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            // No fp16 arithmetic: widen each 4-lane load to fp32; unroll 4, 2, then 1 step.
            const unsigned short* p0 = (const unsigned short*)A + (i + ii);

            float32x4_t _absmax0 = vdupq_n_f32(0.f);
            float32x4_t _absmax1 = vdupq_n_f32(0.f);
            float32x4_t _absmax2 = vdupq_n_f32(0.f);
            float32x4_t _absmax3 = vdupq_n_f32(0.f);
            int kk = 0;
            for (; kk + 3 < K; kk += 4)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 2));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 3));
                _absmax0 = vmaxq_f32(_absmax0, vabsq_f32(_p0));
                _absmax1 = vmaxq_f32(_absmax1, vabsq_f32(_p1));
                _absmax2 = vmaxq_f32(_absmax2, vabsq_f32(_p2));
                _absmax3 = vmaxq_f32(_absmax3, vabsq_f32(_p3));
                p0 += A_hstep * 4;
            }
            _absmax0 = vmaxq_f32(_absmax0, _absmax2);
            _absmax1 = vmaxq_f32(_absmax1, _absmax3);
            for (; kk + 1 < K; kk += 2)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep));
                _absmax0 = vmaxq_f32(_absmax0, vabsq_f32(_p0));
                _absmax1 = vmaxq_f32(_absmax1, vabsq_f32(_p1));
                p0 += A_hstep * 2;
            }
            _absmax0 = vmaxq_f32(_absmax0, _absmax1);
            for (; kk < K; kk++)
            {
                float32x4_t _p = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                _absmax0 = vmaxq_f32(_absmax0, vabsq_f32(_p));
                p0 += A_hstep;
            }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

#if __aarch64__
            // aarch64 has a vector divide; compute all four scales/descales at once.
            float32x4_t _scale = vdivq_f32(_v127, _absmax0);
            float32x4_t _out_descale = vdivq_f32(_absmax0, _v127_B_scale);

            vst1q_f32(ps, _scale);
            vst1q_f32(pods, _out_descale);
#else
            // armv7 path: spill to a temporary and use scalar division.
            float tmp[4];
            vst1q_f32(tmp, _absmax0);

            ps[0] = 127.f / tmp[0];
            ps[1] = 127.f / tmp[1];
            ps[2] = 127.f / tmp[2];
            ps[3] = 127.f / tmp[3];

            pods[0] = tmp[0] / v127_B_scale;
            pods[1] = tmp[1] / v127_B_scale;
            pods[2] = tmp[2] / v127_B_scale;
            pods[3] = tmp[3] / v127_B_scale;

            // float32x4_t _recp_absmax = vrecpeq_f32(_absmax0);
            // _recp_absmax = vmulq_f32(vrecpsq_f32(_absmax0, _recp_absmax), _recp_absmax);
            // _recp_absmax = vmulq_f32(vrecpsq_f32(_absmax0, _recp_absmax), _recp_absmax);
            // _recp_absmax = vmulq_f32(vrecpsq_f32(_absmax0, _recp_absmax), _recp_absmax);
            // float32x4_t _scale = vmulq_f32(_v127, _recp_absmax);
            // float32x4_t _out_descale = vmulq_f32(_absmax0, _recp_v127_B_scale);
#endif

            ps += 4;
            pods += 4;
        }
#endif // __ARM_NEON
        // Leftover single position: values are A_hstep apart, so lanes are
        // gathered one scalar at a time before the vector max.
        for (; ii < max_ii; ii++)
        {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            const __fp16* p0 = (const __fp16*)A + (i + ii);

            float absmax = 0.f;
            int kk = 0;
            float16x8_t _absmax0 = vdupq_n_f16((__fp16)0.f);
            // Gather 8 strided values per iteration.
            for (; kk + 7 < K; kk += 8)
            {
                float16x8_t _p = float16x8_t();
                _p = vsetq_lane_f16(p0[0], _p, 0);
                _p = vsetq_lane_f16(p0[A_hstep], _p, 1);
                _p = vsetq_lane_f16(p0[A_hstep * 2], _p, 2);
                _p = vsetq_lane_f16(p0[A_hstep * 3], _p, 3);
                _p = vsetq_lane_f16(p0[A_hstep * 4], _p, 4);
                _p = vsetq_lane_f16(p0[A_hstep * 5], _p, 5);
                _p = vsetq_lane_f16(p0[A_hstep * 6], _p, 6);
                _p = vsetq_lane_f16(p0[A_hstep * 7], _p, 7);
                _absmax0 = vmaxq_f16(_absmax0, vabsq_f16(_p));
                p0 += A_hstep * 8;
            }
            float16x4_t _amax0 = vmax_f16(vget_low_f16(_absmax0), vget_high_f16(_absmax0));
            // Gather 4 strided values per iteration.
            for (; kk + 3 < K; kk += 4)
            {
                float16x4_t _p = float16x4_t();
                _p = vset_lane_f16(p0[0], _p, 0);
                _p = vset_lane_f16(p0[A_hstep], _p, 1);
                _p = vset_lane_f16(p0[A_hstep * 2], _p, 2);
                _p = vset_lane_f16(p0[A_hstep * 3], _p, 3);
                _amax0 = vmax_f16(_amax0, vabs_f16(_p));
                p0 += A_hstep * 4;
            }
            // Collapse the vector partial result, then finish with a scalar loop.
            absmax = (float)vmaxv_f16(_amax0);
            for (; kk < K; kk++)
            {
                absmax = std::max(absmax, (float)fabsf((float)p0[0]));
                p0 += A_hstep;
            }
#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            // No fp16 arithmetic: operate on raw fp16 bits and convert to fp32.
            const unsigned short* p0 = (const unsigned short*)A + (i + ii);

            float absmax = 0.f;
            int kk = 0;
#if __ARM_NEON
            float32x4_t _absmax0 = vdupq_n_f32(0.f);
            float32x4_t _absmax1 = vdupq_n_f32(0.f);
            // Gather 8 strided values per iteration, widened as two fp32 vectors.
            for (; kk + 7 < K; kk += 8)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 5], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 7], _p, 7);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                _absmax0 = vmaxq_f32(_absmax0, vabsq_f32(_p0));
                _absmax1 = vmaxq_f32(_absmax1, vabsq_f32(_p1));
                p0 += A_hstep * 8;
            }
            _absmax0 = vmaxq_f32(_absmax0, _absmax1);
            // Gather 4 strided values per iteration.
            for (; kk + 3 < K; kk += 4)
            {
                uint16x4_t _p = uint16x4_t();
                _p = vset_lane_u16(p0[0], _p, 0);
                _p = vset_lane_u16(p0[A_hstep], _p, 1);
                _p = vset_lane_u16(p0[A_hstep * 2], _p, 2);
                _p = vset_lane_u16(p0[A_hstep * 3], _p, 3);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)_p);
                _absmax0 = vmaxq_f32(_absmax0, vabsq_f32(_p0));
                p0 += A_hstep * 4;
            }
            // Collapse the vector partial result into the scalar accumulator.
#if __aarch64__
            absmax = vmaxvq_f32(_absmax0);
#else
            float32x2_t _aa = vmax_f32(vget_low_f32(_absmax0), vget_high_f32(_absmax0));
            absmax = std::max(vget_lane_f32(_aa, 0), vget_lane_f32(_aa, 1));
#endif
#endif // __ARM_NEON
            // Scalar tail (and the whole reduction when NEON is unavailable).
            for (; kk < K; kk++)
            {
                absmax = std::max(absmax, (float)fabsf(float16_to_float32(p0[0])));
                p0 += A_hstep;
            }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

            ps[0] = 127.f / absmax;
            pods[0] = absmax / v127_B_scale;
            ps++;
            pods++;
        }
    }
}

static void transpose_pack_A_tile_fp16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, const Mat& scales)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        transpose_pack_A_tile_fp16_to_int8_i8mm(A, AT, i, max_ii, k, max_kk, scales);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        transpose_pack_A_tile_fp16_to_int8_asimddp(A, AT, i, max_ii, k, max_kk, scales);
        return;
    }
#endif

    const int elempack = A.elempack;
    const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;

    // NCNN_LOGE("transpose_pack_A_tile_fp16_to_int8 %d %d", max_ii, elempack);

    signed char* pp = AT;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack;

        float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii);
        float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4);

#if __aarch64__
        if (elempack == 8)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                uint16x8_t _t = vld1q_u16(p0 + 32);
                uint16x8_t _u = vld1q_u16(p0 + 40);
                uint16x8_t _v = vld1q_u16(p0 + 48);
                uint16x8_t _w = vld1q_u16(p0 + 56);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));
                float32x4_t _p8 = vcvt_f32_f16((float16x4_t)vget_low_u16(_t));
                float32x4_t _p9 = vcvt_f32_f16((float16x4_t)vget_high_u16(_t));
                float32x4_t _pa = vcvt_f32_f16((float16x4_t)vget_low_u16(_u));
                float32x4_t _pb = vcvt_f32_f16((float16x4_t)vget_high_u16(_u));
                float32x4_t _pc = vcvt_f32_f16((float16x4_t)vget_low_u16(_v));
                float32x4_t _pd = vcvt_f32_f16((float16x4_t)vget_high_u16(_v));
                float32x4_t _pe = vcvt_f32_f16((float16x4_t)vget_low_u16(_w));
                float32x4_t _pf = vcvt_f32_f16((float16x4_t)vget_high_u16(_w));

                _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale0, 0);
                _p2 = vmulq_laneq_f32(_p2, _scale0, 1);
                _p3 = vmulq_laneq_f32(_p3, _scale0, 1);
                _p4 = vmulq_laneq_f32(_p4, _scale0, 2);
                _p5 = vmulq_laneq_f32(_p5, _scale0, 2);
                _p6 = vmulq_laneq_f32(_p6, _scale0, 3);
                _p7 = vmulq_laneq_f32(_p7, _scale0, 3);
                _p8 = vmulq_laneq_f32(_p8, _scale1, 0);
                _p9 = vmulq_laneq_f32(_p9, _scale1, 0);
                _pa = vmulq_laneq_f32(_pa, _scale1, 1);
                _pb = vmulq_laneq_f32(_pb, _scale1, 1);
                _pc = vmulq_laneq_f32(_pc, _scale1, 2);
                _pd = vmulq_laneq_f32(_pd, _scale1, 2);
                _pe = vmulq_laneq_f32(_pe, _scale1, 3);
                _pf = vmulq_laneq_f32(_pf, _scale1, 3);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p8, _pa);
                int8x8_t _r3 = float2int8(_pc, _pe);
                int8x8_t _r4 = float2int8(_p1, _p3);
                int8x8_t _r5 = float2int8(_p5, _p7);
                int8x8_t _r6 = float2int8(_p9, _pb);
                int8x8_t _r7 = float2int8(_pd, _pf);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p8, _pa);
                int8x8_t _r3 = float2int8(_pc, _pe);
                int8x8_t _r4 = float2int8(_p1, _p3);
                int8x8_t _r5 = float2int8(_p5, _p7);
                int8x8_t _r6 = float2int8(_p9, _pb);
                int8x8_t _r7 = float2int8(_pd, _pf);

                int16x8_t _r01 = vreinterpretq_s16_s8(vcombine_s8(_r0, _r1));
                int16x8_t _r23 = vreinterpretq_s16_s8(vcombine_s8(_r2, _r3));
                int16x8_t _r45 = vreinterpretq_s16_s8(vcombine_s8(_r4, _r5));
                int16x8_t _r67 = vreinterpretq_s16_s8(vcombine_s8(_r6, _r7));
                int16x8x2_t _rr0 = vuzpq_s16(_r01, _r23);
                int16x8x2_t _rr1 = vuzpq_s16(_r45, _r67);

                vst1q_s8(pp, vreinterpretq_s8_s16(_rr0.val[0]));
                vst1q_s8(pp + 16, vreinterpretq_s8_s16(_rr0.val[1]));
                vst1q_s8(pp + 32, vreinterpretq_s8_s16(_rr1.val[0]));
                vst1q_s8(pp + 48, vreinterpretq_s8_s16(_rr1.val[1]));
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += A_hstep * 8;
            }
        }
#endif // __aarch64__
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                uint16x8_t _t = vld1q_u16(p0 + A_hstep * 4);
                uint16x8_t _u = vld1q_u16(p0 + A_hstep * 4 + 8);
                uint16x8_t _v = vld1q_u16(p0 + A_hstep * 4 + 16);
                uint16x8_t _w = vld1q_u16(p0 + A_hstep * 4 + 24);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));
                float32x4_t _p8 = vcvt_f32_f16((float16x4_t)vget_low_u16(_t));
                float32x4_t _p9 = vcvt_f32_f16((float16x4_t)vget_high_u16(_t));
                float32x4_t _pa = vcvt_f32_f16((float16x4_t)vget_low_u16(_u));
                float32x4_t _pb = vcvt_f32_f16((float16x4_t)vget_high_u16(_u));
                float32x4_t _pc = vcvt_f32_f16((float16x4_t)vget_low_u16(_v));
                float32x4_t _pd = vcvt_f32_f16((float16x4_t)vget_high_u16(_v));
                float32x4_t _pe = vcvt_f32_f16((float16x4_t)vget_low_u16(_w));
                float32x4_t _pf = vcvt_f32_f16((float16x4_t)vget_high_u16(_w));

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale0, 3);
                _p4 = vmulq_laneq_f32(_p4, _scale1, 0);
                _p5 = vmulq_laneq_f32(_p5, _scale1, 1);
                _p6 = vmulq_laneq_f32(_p6, _scale1, 2);
                _p7 = vmulq_laneq_f32(_p7, _scale1, 3);
                _p8 = vmulq_laneq_f32(_p8, _scale0, 0);
                _p9 = vmulq_laneq_f32(_p9, _scale0, 1);
                _pa = vmulq_laneq_f32(_pa, _scale0, 2);
                _pb = vmulq_laneq_f32(_pb, _scale0, 3);
                _pc = vmulq_laneq_f32(_pc, _scale1, 0);
                _pd = vmulq_laneq_f32(_pd, _scale1, 1);
                _pe = vmulq_laneq_f32(_pe, _scale1, 2);
                _pf = vmulq_laneq_f32(_pf, _scale1, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
                _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
                _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
                _p8 = vmulq_lane_f32(_p8, vget_low_f32(_scale0), 0);
                _p9 = vmulq_lane_f32(_p9, vget_low_f32(_scale0), 1);
                _pa = vmulq_lane_f32(_pa, vget_high_f32(_scale0), 0);
                _pb = vmulq_lane_f32(_pb, vget_high_f32(_scale0), 1);
                _pc = vmulq_lane_f32(_pc, vget_low_f32(_scale1), 0);
                _pd = vmulq_lane_f32(_pd, vget_low_f32(_scale1), 1);
                _pe = vmulq_lane_f32(_pe, vget_high_f32(_scale1), 0);
                _pf = vmulq_lane_f32(_pf, vget_high_f32(_scale1), 1);
#endif

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p8);
                int8x8_t _r1 = float2int8(_p1, _p9);
                int8x8_t _r2 = float2int8(_p2, _pa);
                int8x8_t _r3 = float2int8(_p3, _pb);
                int8x8_t _r4 = float2int8(_p4, _pc);
                int8x8_t _r5 = float2int8(_p5, _pd);
                int8x8_t _r6 = float2int8(_p6, _pe);
                int8x8_t _r7 = float2int8(_p7, _pf);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

                int16x8_t _r01 = vreinterpretq_s16_s8(vcombine_s8(_r0, _r1));
                int16x8_t _r23 = vreinterpretq_s16_s8(vcombine_s8(_r2, _r3));
                int16x8_t _r45 = vreinterpretq_s16_s8(vcombine_s8(_r4, _r5));
                int16x8_t _r67 = vreinterpretq_s16_s8(vcombine_s8(_r6, _r7));
                int16x8x2_t _rr0 = vuzpq_s16(_r01, _r23);
                int16x8x2_t _rr1 = vuzpq_s16(_r45, _r67);

                vst1q_s8(pp, vreinterpretq_s8_s16(_rr0.val[0]));
                vst1q_s8(pp + 16, vreinterpretq_s8_s16(_rr0.val[1]));
                vst1q_s8(pp + 32, vreinterpretq_s8_s16(_rr1.val[0]));
                vst1q_s8(pp + 48, vreinterpretq_s8_s16(_rr1.val[1]));
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale0, 3);
                _p4 = vmulq_laneq_f32(_p4, _scale1, 0);
                _p5 = vmulq_laneq_f32(_p5, _scale1, 1);
                _p6 = vmulq_laneq_f32(_p6, _scale1, 2);
                _p7 = vmulq_laneq_f32(_p7, _scale1, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
                _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
                _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
#endif

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);

#if __ARM_FEATURE_DOTPROD
                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                int16x8_t _r01 = vreinterpretq_s16_s8(vcombine_s8(_r0, _r1));
                int16x8_t _r23 = vreinterpretq_s16_s8(vcombine_s8(_r2, _r3));
                int16x8x2_t _rr = vuzpq_s16(_r01, _r23);

                vst1q_s8(pp, vreinterpretq_s8_s16(_rr.val[0]));
                vst1q_s8(pp + 16, vreinterpretq_s8_s16(_rr.val[1]));
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += A_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + A_hstep);
                uint16x8_t _r = vld1q_u16(p0 + A_hstep * 2);
                uint16x8_t _s = vld1q_u16(p0 + A_hstep * 3);
                uint16x8_t _t = vld1q_u16(p0 + A_hstep * 4);
                uint16x8_t _u = vld1q_u16(p0 + A_hstep * 5);
                uint16x8_t _v = vld1q_u16(p0 + A_hstep * 6);
                uint16x8_t _w = vld1q_u16(p0 + A_hstep * 7);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));
                float32x4_t _p8 = vcvt_f32_f16((float16x4_t)vget_low_u16(_t));
                float32x4_t _p9 = vcvt_f32_f16((float16x4_t)vget_high_u16(_t));
                float32x4_t _pa = vcvt_f32_f16((float16x4_t)vget_low_u16(_u));
                float32x4_t _pb = vcvt_f32_f16((float16x4_t)vget_high_u16(_u));
                float32x4_t _pc = vcvt_f32_f16((float16x4_t)vget_low_u16(_v));
                float32x4_t _pd = vcvt_f32_f16((float16x4_t)vget_high_u16(_v));
                float32x4_t _pe = vcvt_f32_f16((float16x4_t)vget_low_u16(_w));
                float32x4_t _pf = vcvt_f32_f16((float16x4_t)vget_high_u16(_w));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale1);
                _p4 = vmulq_f32(_p4, _scale0);
                _p5 = vmulq_f32(_p5, _scale1);
                _p6 = vmulq_f32(_p6, _scale0);
                _p7 = vmulq_f32(_p7, _scale1);
                _p8 = vmulq_f32(_p8, _scale0);
                _p9 = vmulq_f32(_p9, _scale1);
                _pa = vmulq_f32(_pa, _scale0);
                _pb = vmulq_f32(_pb, _scale1);
                _pc = vmulq_f32(_pc, _scale0);
                _pd = vmulq_f32(_pd, _scale1);
                _pe = vmulq_f32(_pe, _scale0);
                _pf = vmulq_f32(_pf, _scale1);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _r04 = vzip_s8(_r0, _r4);
                int8x8x2_t _r15 = vzip_s8(_r1, _r5);
                int8x8x2_t _r26 = vzip_s8(_r2, _r6);
                int8x8x2_t _r37 = vzip_s8(_r3, _r7);
                int8x16x4_t _r0123;
                _r0123.val[0] = vcombine_s8(_r04.val[0], _r04.val[1]);
                _r0123.val[1] = vcombine_s8(_r15.val[0], _r15.val[1]);
                _r0123.val[2] = vcombine_s8(_r26.val[0], _r26.val[1]);
                _r0123.val[3] = vcombine_s8(_r37.val[0], _r37.val[1]);

                vst4q_s8(pp, _r0123);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8x4_t _r0123;
                _r0123.val[0] = _r0;
                _r0123.val[1] = _r1;
                _r0123.val[2] = _r2;
                _r0123.val[3] = _r3;
                int8x8x4_t _r4567;
                _r4567.val[0] = _r4;
                _r4567.val[1] = _r5;
                _r4567.val[2] = _r6;
                _r4567.val[3] = _r7;

                vst4_s8(pp, _r0123);
                vst4_s8(pp + 32, _r4567);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(_r0, _r2);
                _r01.val[1] = vcombine_s8(_r1, _r3);
                int8x16x2_t _r23;
                _r23.val[0] = vcombine_s8(_r4, _r6);
                _r23.val[1] = vcombine_s8(_r5, _r7);

                vst2q_s8(pp, _r01);
                vst2q_s8(pp + 32, _r23);
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + A_hstep);
                uint16x8_t _r = vld1q_u16(p0 + A_hstep * 2);
                uint16x8_t _s = vld1q_u16(p0 + A_hstep * 3);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale1);
                _p4 = vmulq_f32(_p4, _scale0);
                _p5 = vmulq_f32(_p5, _scale1);
                _p6 = vmulq_f32(_p6, _scale0);
                _p7 = vmulq_f32(_p7, _scale1);

#if __ARM_FEATURE_DOTPROD
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p0, _p1);
                _r0123.val[1] = float2int8(_p2, _p3);
                _r0123.val[2] = float2int8(_p4, _p5);
                _r0123.val[3] = float2int8(_p6, _p7);

                vst4_s8(pp, _r0123);
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p1), float2int8(_p4, _p5));
                _r01.val[1] = vcombine_s8(float2int8(_p2, _p3), float2int8(_p6, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += A_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + A_hstep);

                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale1);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p1);
                _r01.val[1] = float2int8(_p2, _p3);

                vst2_s8(pp, _r01);

                pp += 16;
                p0 += A_hstep * 2;
            }
            for (; kk < max_kk; kk++)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep;
            }
        }
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack;

        float32x4_t _scale = vld1q_f32((const float*)scales + i + ii);

#if __aarch64__
        if (elempack == 8)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));

                _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale, 0);
                _p2 = vmulq_laneq_f32(_p2, _scale, 1);
                _p3 = vmulq_laneq_f32(_p3, _scale, 1);
                _p4 = vmulq_laneq_f32(_p4, _scale, 2);
                _p5 = vmulq_laneq_f32(_p5, _scale, 2);
                _p6 = vmulq_laneq_f32(_p6, _scale, 3);
                _p7 = vmulq_laneq_f32(_p7, _scale, 3);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p1, _p3);
                int8x8_t _r3 = float2int8(_p5, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p4, _p6));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p5, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += A_hstep * 8;
            }
        }
#endif // __aarch64__
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + A_hstep * 4);
                uint16x8_t _s = vld1q_u16(p0 + A_hstep * 4 + 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale, 3);
                _p4 = vmulq_laneq_f32(_p4, _scale, 0);
                _p5 = vmulq_laneq_f32(_p5, _scale, 1);
                _p6 = vmulq_laneq_f32(_p6, _scale, 2);
                _p7 = vmulq_laneq_f32(_p7, _scale, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
                _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale), 0);
                _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale), 1);
                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale), 0);
                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale), 1);
#endif

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p4, _p5));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p6, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

#if __aarch64__
                _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                _p1 = vmulq_laneq_f32(_p1, _scale, 1);
                _p2 = vmulq_laneq_f32(_p2, _scale, 2);
                _p3 = vmulq_laneq_f32(_p3, _scale, 3);
#else
                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
#endif

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += A_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 2));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 3));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 4));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 5));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 6));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 7));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                float32x4x2_t _p04 = vzipq_f32(_p0, _p4);
                float32x4x2_t _p15 = vzipq_f32(_p1, _p5);
                float32x4x2_t _p26 = vzipq_f32(_p2, _p6);
                float32x4x2_t _p37 = vzipq_f32(_p3, _p7);
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p04.val[0], _p04.val[1]);
                _r0123.val[1] = float2int8(_p15.val[0], _p15.val[1]);
                _r0123.val[2] = float2int8(_p26.val[0], _p26.val[1]);
                _r0123.val[3] = float2int8(_p37.val[0], _p37.val[1]);

                vst4_s8(pp, _r0123);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p0, _p4);
                _r0123.val[1] = float2int8(_p1, _p5);
                _r0123.val[2] = float2int8(_p2, _p6);
                _r0123.val[3] = float2int8(_p3, _p7);

                vst4_s8(pp, _r0123);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p2), float2int8(_p4, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p3), float2int8(_p5, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 2));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 3));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
                transpose4x4_ps(_p0, _p1, _p2, _p3);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));
#else  // __ARM_FEATURE_DOTPROD
                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p2);
                _r01.val[1] = float2int8(_p1, _p3);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += A_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                float32x4x2_t _p01 = vzipq_f32(_p0, _p1);

                int8x8_t _r01 = float2int8(_p01.val[0], _p01.val[1]);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep * 2;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                _p0 = vmulq_f32(_p0, _scale);
                int8x8_t _r0 = float2int8(_p0, _p0);

                pp[0] = vget_lane_s8(_r0, 0);
                pp[1] = vget_lane_s8(_r0, 1);
                pp[2] = vget_lane_s8(_r0, 2);
                pp[3] = vget_lane_s8(_r0, 3);
                pp += 4;
                p0 += A_hstep;
            }
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack;

        const float scale0 = scales[i + ii];
        const float scale1 = scales[i + ii + 1];

#if __ARM_NEON
        float32x4_t _scale0 = vdupq_n_f32(scale0);
        float32x4_t _scale1 = vdupq_n_f32(scale1);
#if __aarch64__
        if (elempack == 8)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale0);
                _p2 = vmulq_f32(_p2, _scale1);
                _p3 = vmulq_f32(_p3, _scale1);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p1, _p3);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4x2_t _t01 = vzip_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += A_hstep * 8;
            }
        }
#endif // __aarch64__
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + A_hstep * 4);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);
                _p2 = vmulq_f32(_p2, _scale0);
                _p3 = vmulq_f32(_p3, _scale1);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p1, _p3);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4x2_t _t01 = vzip_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale0);
                _p1 = vmulq_f32(_p1, _scale1);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r01 = float2int8(_p0, _p1);
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p1));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p1));
                int8x8_t _r01 = float2int8(_t0, _t1);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            float32x4_t _scale = vzipq_f32(_scale0, _scale1).val[0];
            for (; kk + 7 < max_kk; kk += 8)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep + 1], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 2 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 3 + 1], _p, 7);
                uint16x8_t _q = uint16x8_t();
                _q = vsetq_lane_u16(p0[A_hstep * 4], _q, 0);
                _q = vsetq_lane_u16(p0[A_hstep * 4 + 1], _q, 1);
                _q = vsetq_lane_u16(p0[A_hstep * 5], _q, 2);
                _q = vsetq_lane_u16(p0[A_hstep * 5 + 1], _q, 3);
                _q = vsetq_lane_u16(p0[A_hstep * 6], _q, 4);
                _q = vsetq_lane_u16(p0[A_hstep * 6 + 1], _q, 5);
                _q = vsetq_lane_u16(p0[A_hstep * 7], _q, 6);
                _q = vsetq_lane_u16(p0[A_hstep * 7 + 1], _q, 7);
                float32x4_t _p01 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p23 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p45 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p67 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);
                _p45 = vmulq_f32(_p45, _scale);
                _p67 = vmulq_f32(_p67, _scale);

                int8x8_t _r0 = float2int8(_p01, _p23);
                int8x8_t _r1 = float2int8(_p45, _p67);

#if __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _r01 = vuzp_s8(_r0, _r1);

                vst1q_s8(pp, vcombine_s8(_r01.val[0], _r01.val[1]));
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _r01 = vtrn_s8(_r0, _r1);
                int8x8x2_t _rr01 = vuzp_s8(_r01.val[0], _r01.val[1]);

                vst1q_s8(pp, vcombine_s8(_rr01.val[0], _rr01.val[1]));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep * 2 + 1], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 4 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 6 + 1], _p, 7);
                uint16x8_t _q = uint16x8_t();
                _q = vsetq_lane_u16(p0[A_hstep], _q, 0);
                _q = vsetq_lane_u16(p0[A_hstep + 1], _q, 1);
                _q = vsetq_lane_u16(p0[A_hstep * 3], _q, 2);
                _q = vsetq_lane_u16(p0[A_hstep * 3 + 1], _q, 3);
                _q = vsetq_lane_u16(p0[A_hstep * 5], _q, 4);
                _q = vsetq_lane_u16(p0[A_hstep * 5 + 1], _q, 5);
                _q = vsetq_lane_u16(p0[A_hstep * 7], _q, 6);
                _q = vsetq_lane_u16(p0[A_hstep * 7 + 1], _q, 7);
                float32x4_t _p02 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p46 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p13 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p57 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p02 = vmulq_f32(_p02, _scale);
                _p46 = vmulq_f32(_p46, _scale);
                _p13 = vmulq_f32(_p13, _scale);
                _p57 = vmulq_f32(_p57, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p02, _p46);
                _r01.val[1] = float2int8(_p13, _p57);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep + 1], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 2 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 3 + 1], _p, 7);
                float32x4_t _p01 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p23 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);

                float32x4x2_t _pp = vuzpq_f32(_p01, _p23);
                int8x8_t _r01 = float2int8(_pp.val[0], _pp.val[1]);
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep * 2 + 1], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep + 1], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 3 + 1], _p, 7);
                float32x4_t _p02 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p13 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p02 = vmulq_f32(_p02, _scale);
                _p13 = vmulq_f32(_p13, _scale);

                float32x4x2_t _pp = vzipq_f32(_p02, _p13);
                int8x8_t _r01 = float2int8(_pp.val[0], _pp.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = float2int8(float16_to_float32(p0[0]) * scale0);
                pp[1] = float2int8(float16_to_float32(p0[A_hstep + 0]) * scale0);
                pp[2] = float2int8(float16_to_float32(p0[1]) * scale1);
                pp[3] = float2int8(float16_to_float32(p0[A_hstep + 1]) * scale1);
                pp += 4;
                p0 += A_hstep * 2;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(float16_to_float32(p0[0]) * scale0);
                pp[1] = float2int8(float16_to_float32(p0[1]) * scale1);
                pp += 2;
                p0 += A_hstep;
            }
        }
    }
    for (; ii < max_ii; ii += 1)
    {
        const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack;

        const float scale = scales[i + ii];

#if __ARM_NEON
        float32x4_t _scale = vdupq_n_f32(scale);
#if __aarch64__
        if (elempack == 8)
        {
            int kk = 0;
            for (; kk + 15 < max_kk; kk += 16)
            {
                uint16x8_t _p01 = vld1q_u16(p0);
                uint16x8_t _p23 = vld1q_u16(p0 + A_hstep * 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p01));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p01));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p23));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p23));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));

                pp += 16;
                p0 += A_hstep * 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p01 = vld1q_u16(p0);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p01));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p01));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);
                pp += 8;
                p0 += A_hstep * 8;
            }
        }
#endif // __aarch64__
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 15 < max_kk; kk += 16)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 4));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 8));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 12));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));

                pp += 16;
                p0 += A_hstep * 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + A_hstep * 4));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                pp[0] = float2int8(float16_to_float32(p0[0]) * scale);
                pp[1] = float2int8(float16_to_float32(p0[1]) * scale);
                pp[2] = float2int8(float16_to_float32(p0[2]) * scale);
                pp[3] = float2int8(float16_to_float32(p0[3]) * scale);
                pp += 4;
                p0 += A_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            for (; kk + 15 < max_kk; kk += 16)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 5], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 7], _p, 7);
                uint16x8_t _q = uint16x8_t();
                _q = vsetq_lane_u16(p0[A_hstep * 8], _q, 0);
                _q = vsetq_lane_u16(p0[A_hstep * 9], _q, 1);
                _q = vsetq_lane_u16(p0[A_hstep * 10], _q, 2);
                _q = vsetq_lane_u16(p0[A_hstep * 11], _q, 3);
                _q = vsetq_lane_u16(p0[A_hstep * 12], _q, 4);
                _q = vsetq_lane_u16(p0[A_hstep * 13], _q, 5);
                _q = vsetq_lane_u16(p0[A_hstep * 14], _q, 6);
                _q = vsetq_lane_u16(p0[A_hstep * 15], _q, 7);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));

                pp += 16;
                p0 += A_hstep * 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[A_hstep], _p, 1);
                _p = vsetq_lane_u16(p0[A_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[A_hstep * 3], _p, 3);
                _p = vsetq_lane_u16(p0[A_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[A_hstep * 5], _p, 5);
                _p = vsetq_lane_u16(p0[A_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[A_hstep * 7], _p, 7);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += A_hstep * 8;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(float16_to_float32(p0[0]) * scale);
                pp += 1;
                p0 += A_hstep;
            }
        }
    }
}

// Derive the single int8 quantization scale used for the fp16 matrix B.
//
// Every row of B is scanned for the largest absolute value (absmax), then
//   scale = 127 / absmax, or 1 when absmax is exactly zero.
//
// B     : source matrix; its storage is read as 16-bit half floats.
//         Rows are B.c (dims == 3) or B.h otherwise, each row covering
//         B.w * B.elempack consecutive elements.
// scale : output, receives the computed scale.
static void compute_B_fp16_int8_scale(const Mat& B, float& scale)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    // This build lacks fp16 vector arithmetic; defer to the asimdhp variant
    // when the running cpu reports support for it.
    if (ncnn::cpu_support_arm_asimdhp())
    {
        compute_B_fp16_int8_scale_asimdhp(B, scale);
        return;
    }
#endif

    // The row geometry is constant for the whole scan, resolve it once.
    const int rows = B.dims == 3 ? B.c : B.h;
    const size_t B_hstep = B.dims == 3 ? B.cstep : (size_t)B.w;
    const size_t row_stride = B_hstep * B.elempack;
    const int size = B.w * B.elempack;

    // Scalar running maximum: fed by the loop tails and by the final
    // horizontal reduction of the vector accumulators.
    float absmax = 0.f;

#if __ARM_NEON
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    // Four independent 8-lane half accumulators plus one 4-lane accumulator
    // for the 4-element remainder step.
    float16x8_t _vmax0 = vdupq_n_f16((__fp16)0.f);
    float16x8_t _vmax1 = vdupq_n_f16((__fp16)0.f);
    float16x8_t _vmax2 = vdupq_n_f16((__fp16)0.f);
    float16x8_t _vmax3 = vdupq_n_f16((__fp16)0.f);
    float16x4_t _vmax_lo = vdup_n_f16((__fp16)0.f);
#else  // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    // Four independent 4-lane float accumulators; halves are widened first.
    float32x4_t _vmax0 = vdupq_n_f32(0.f);
    float32x4_t _vmax1 = vdupq_n_f32(0.f);
    float32x4_t _vmax2 = vdupq_n_f32(0.f);
    float32x4_t _vmax3 = vdupq_n_f32(0.f);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#endif

    for (int row = 0; row < rows; row++)
    {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        const __fp16* ptr = (const __fp16*)B + row * row_stride;

        // number of elements of this row still to be visited
        int remain = size;

        while (remain >= 32)
        {
            _vmax0 = vmaxq_f16(_vmax0, vabsq_f16(vld1q_f16(ptr)));
            _vmax1 = vmaxq_f16(_vmax1, vabsq_f16(vld1q_f16(ptr + 8)));
            _vmax2 = vmaxq_f16(_vmax2, vabsq_f16(vld1q_f16(ptr + 16)));
            _vmax3 = vmaxq_f16(_vmax3, vabsq_f16(vld1q_f16(ptr + 24)));
            ptr += 32;
            remain -= 32;
        }
        while (remain >= 16)
        {
            _vmax0 = vmaxq_f16(_vmax0, vabsq_f16(vld1q_f16(ptr)));
            _vmax1 = vmaxq_f16(_vmax1, vabsq_f16(vld1q_f16(ptr + 8)));
            ptr += 16;
            remain -= 16;
        }
        while (remain >= 8)
        {
            _vmax0 = vmaxq_f16(_vmax0, vabsq_f16(vld1q_f16(ptr)));
            ptr += 8;
            remain -= 8;
        }
        while (remain >= 4)
        {
            _vmax_lo = vmax_f16(_vmax_lo, vabs_f16(vld1_f16(ptr)));
            ptr += 4;
            remain -= 4;
        }
        while (remain > 0)
        {
            absmax = std::max(absmax, (float)fabsf((float)ptr[0]));
            ptr++;
            remain--;
        }
#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        const unsigned short* ptr = (const unsigned short*)B + row * row_stride;

        // number of elements of this row still to be visited
        int remain = size;
#if __ARM_NEON
        while (remain >= 16)
        {
            const uint16x8_t _h0 = vld1q_u16(ptr);
            const uint16x8_t _h1 = vld1q_u16(ptr + 8);
            _vmax0 = vmaxq_f32(_vmax0, vabsq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_h0))));
            _vmax1 = vmaxq_f32(_vmax1, vabsq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_h0))));
            _vmax2 = vmaxq_f32(_vmax2, vabsq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_h1))));
            _vmax3 = vmaxq_f32(_vmax3, vabsq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_h1))));
            ptr += 16;
            remain -= 16;
        }
        while (remain >= 8)
        {
            const uint16x8_t _h0 = vld1q_u16(ptr);
            _vmax0 = vmaxq_f32(_vmax0, vabsq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_h0))));
            _vmax1 = vmaxq_f32(_vmax1, vabsq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_h0))));
            ptr += 8;
            remain -= 8;
        }
        while (remain >= 4)
        {
            _vmax0 = vmaxq_f32(_vmax0, vabsq_f32(vcvt_f32_f16((float16x4_t)vld1_u16(ptr))));
            ptr += 4;
            remain -= 4;
        }
#endif // __ARM_NEON
        while (remain > 0)
        {
            absmax = std::max(absmax, (float)fabsf(float16_to_float32(ptr[0])));
            ptr++;
            remain--;
        }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    }

#if __ARM_NEON
    // Fold the four vector accumulators pairwise, then reduce across lanes
    // into the scalar maximum.
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    _vmax0 = vmaxq_f16(_vmax0, _vmax2);
    _vmax1 = vmaxq_f16(_vmax1, _vmax3);
    _vmax0 = vmaxq_f16(_vmax0, _vmax1);
    absmax = std::max(absmax, (float)vmaxvq_f16(_vmax0));
    absmax = std::max(absmax, (float)vmaxv_f16(_vmax_lo));
#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    _vmax0 = vmaxq_f32(_vmax0, _vmax2);
    _vmax1 = vmaxq_f32(_vmax1, _vmax3);
    _vmax0 = vmaxq_f32(_vmax0, _vmax1);
#if __aarch64__
    absmax = std::max(absmax, vmaxvq_f32(_vmax0));
#else
    // no across-vector max instruction here: fold to two lanes, then compare them
    const float32x2_t _pair = vmax_f32(vget_low_f32(_vmax0), vget_high_f32(_vmax0));
    const float lane_max = std::max(vget_lane_f32(_pair, 0), vget_lane_f32(_pair, 1));
    absmax = std::max(absmax, lane_max);
#endif
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#endif // __ARM_NEON

    // An all-zero matrix would divide by zero; fall back to a unit scale.
    if (absmax == 0.f)
    {
        scale = 1.f;
    }
    else
    {
        scale = 127.f / absmax;
    }
}

static void pack_B_tile_fp16_to_int8(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        pack_B_tile_fp16_to_int8_i8mm(B, BT, j, max_jj, k, max_kk, scale);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        pack_B_tile_fp16_to_int8_asimddp(B, BT, j, max_jj, k, max_kk, scale);
        return;
    }
#endif

    const int elempack = B.elempack;
    const size_t B_hstep = B.dims == 3 ? B.cstep : (size_t)B.w;

    // NCNN_LOGE("pack_B_tile_fp16_to_int8 %d %d", max_jj, elempack);

    signed char* pp = BT;

#if __ARM_NEON
    float32x4_t _scale = vdupq_n_f32(scale);
#endif

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    for (; jj + 7 < max_jj; jj += 8)
    {
        const unsigned short* p0 = (const unsigned short*)B + (j + jj) * B_hstep + k * elempack;

#if __aarch64__
        if (elempack == 8)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                uint16x8_t _t = vld1q_u16(p0 + 32);
                uint16x8_t _u = vld1q_u16(p0 + 40);
                uint16x8_t _v = vld1q_u16(p0 + 48);
                uint16x8_t _w = vld1q_u16(p0 + 56);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));
                float32x4_t _p8 = vcvt_f32_f16((float16x4_t)vget_low_u16(_t));
                float32x4_t _p9 = vcvt_f32_f16((float16x4_t)vget_high_u16(_t));
                float32x4_t _pa = vcvt_f32_f16((float16x4_t)vget_low_u16(_u));
                float32x4_t _pb = vcvt_f32_f16((float16x4_t)vget_high_u16(_u));
                float32x4_t _pc = vcvt_f32_f16((float16x4_t)vget_low_u16(_v));
                float32x4_t _pd = vcvt_f32_f16((float16x4_t)vget_high_u16(_v));
                float32x4_t _pe = vcvt_f32_f16((float16x4_t)vget_low_u16(_w));
                float32x4_t _pf = vcvt_f32_f16((float16x4_t)vget_high_u16(_w));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);
                _p8 = vmulq_f32(_p8, _scale);
                _p9 = vmulq_f32(_p9, _scale);
                _pa = vmulq_f32(_pa, _scale);
                _pb = vmulq_f32(_pb, _scale);
                _pc = vmulq_f32(_pc, _scale);
                _pd = vmulq_f32(_pd, _scale);
                _pe = vmulq_f32(_pe, _scale);
                _pf = vmulq_f32(_pf, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _p04 = vzip_s8(float2int8(_p0, _p1), float2int8(_p8, _p9));
                int8x8x2_t _p15 = vzip_s8(float2int8(_p2, _p3), float2int8(_pa, _pb));
                int8x8x2_t _p26 = vzip_s8(float2int8(_p4, _p5), float2int8(_pc, _pd));
                int8x8x2_t _p37 = vzip_s8(float2int8(_p6, _p7), float2int8(_pe, _pf));

                int8x16x4_t _rr;
                _rr.val[0] = vcombine_s8(_p04.val[0], _p04.val[1]);
                _rr.val[1] = vcombine_s8(_p15.val[0], _p15.val[1]);
                _rr.val[2] = vcombine_s8(_p26.val[0], _p26.val[1]);
                _rr.val[3] = vcombine_s8(_p37.val[0], _p37.val[1]);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x16x4_t _rr;
                _rr.val[0] = vcombine_s8(float2int8(_p0, _p1), float2int8(_p8, _p9));
                _rr.val[1] = vcombine_s8(float2int8(_p2, _p3), float2int8(_pa, _pb));
                _rr.val[2] = vcombine_s8(float2int8(_p4, _p5), float2int8(_pc, _pd));
                _rr.val[3] = vcombine_s8(float2int8(_p6, _p7), float2int8(_pe, _pf));
#endif // __ARM_FEATURE_MATMUL_INT8

                vst4q_s8(pp, _rr);
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p1), float2int8(_p4, _p5));
                _r01.val[1] = vcombine_s8(float2int8(_p2, _p3), float2int8(_p6, _p7));
                int8x16x2_t _r23;
                _r23.val[0] = vcombine_s8(float2int8(_p8, _p9), float2int8(_pc, _pd));
                _r23.val[1] = vcombine_s8(float2int8(_pa, _pb), float2int8(_pe, _pf));

                vst2q_s8(pp, _r01);
                vst2q_s8(pp + 32, _r23);
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += 64;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p0, _p1);
                _r0123.val[1] = float2int8(_p2, _p3);
                _r0123.val[2] = float2int8(_p4, _p5);
                _r0123.val[3] = float2int8(_p6, _p7);

                vst4_s8(pp, _r0123);
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p1), float2int8(_p4, _p5));
                _r01.val[1] = vcombine_s8(float2int8(_p2, _p3), float2int8(_p6, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += 32;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p01 = vld1q_u16(p0);
                uint16x8_t _p23 = vld1q_u16(p0 + 8);

                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p01));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p01));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p23));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p23));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p1);
                _r01.val[1] = float2int8(_p2, _p3);

                vst2_s8(pp, _r01);

                pp += 16;
                p0 += 16;
            }
            for (; kk < max_kk; kk++)
            {
                uint16x8_t _p01 = vld1q_u16(p0);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p01));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p01));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += 8;
            }
        }
#endif // __aarch64__
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x8x4_t _p = vld4q_u16(p0);
                uint16x8x4_t _q = vld4q_u16(p0 + B_hstep * 4);

                float32x4_t _p0 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_p.val[0])), _scale);
                float32x4_t _p1 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_p.val[1])), _scale);
                float32x4_t _p2 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_p.val[2])), _scale);
                float32x4_t _p3 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_p.val[3])), _scale);
                float32x4_t _p4 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_p.val[0])), _scale);
                float32x4_t _p5 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_p.val[1])), _scale);
                float32x4_t _p6 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_p.val[2])), _scale);
                float32x4_t _p7 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_p.val[3])), _scale);
                float32x4_t _p8 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_q.val[0])), _scale);
                float32x4_t _p9 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_q.val[1])), _scale);
                float32x4_t _pa = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_q.val[2])), _scale);
                float32x4_t _pb = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_q.val[3])), _scale);
                float32x4_t _pc = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_q.val[0])), _scale);
                float32x4_t _pd = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_q.val[1])), _scale);
                float32x4_t _pe = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_q.val[2])), _scale);
                float32x4_t _pf = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_q.val[3])), _scale);

#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
                int8x8_t _r4 = float2int8(_p8, _pc);
                int8x8_t _r5 = float2int8(_p9, _pd);
                int8x8_t _r6 = float2int8(_pa, _pe);
                int8x8_t _r7 = float2int8(_pb, _pf);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p8, _p9);
                int8x8_t _r3 = float2int8(_pa, _pb);
                int8x8_t _r4 = float2int8(_p4, _p5);
                int8x8_t _r5 = float2int8(_p6, _p7);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);
#endif // __ARM_FEATURE_MATMUL_INT8

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                uint16x8_t _t = vld1q_u16(p0 + B_hstep * 4);
                uint16x8_t _u = vld1q_u16(p0 + B_hstep * 4 + 8);
                uint16x8_t _v = vld1q_u16(p0 + B_hstep * 4 + 16);
                uint16x8_t _w = vld1q_u16(p0 + B_hstep * 4 + 24);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));
                float32x4_t _p8 = vcvt_f32_f16((float16x4_t)vget_low_u16(_t));
                float32x4_t _p9 = vcvt_f32_f16((float16x4_t)vget_high_u16(_t));
                float32x4_t _pa = vcvt_f32_f16((float16x4_t)vget_low_u16(_u));
                float32x4_t _pb = vcvt_f32_f16((float16x4_t)vget_high_u16(_u));
                float32x4_t _pc = vcvt_f32_f16((float16x4_t)vget_low_u16(_v));
                float32x4_t _pd = vcvt_f32_f16((float16x4_t)vget_high_u16(_v));
                float32x4_t _pe = vcvt_f32_f16((float16x4_t)vget_low_u16(_w));
                float32x4_t _pf = vcvt_f32_f16((float16x4_t)vget_high_u16(_w));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);
                _p8 = vmulq_f32(_p8, _scale);
                _p9 = vmulq_f32(_p9, _scale);
                _pa = vmulq_f32(_pa, _scale);
                _pb = vmulq_f32(_pb, _scale);
                _pc = vmulq_f32(_pc, _scale);
                _pd = vmulq_f32(_pd, _scale);
                _pe = vmulq_f32(_pe, _scale);
                _pf = vmulq_f32(_pf, _scale);

                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p8), float2int8(_p2, _pa));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p9), float2int8(_p3, _pb));
                int8x16x2_t _r23;
                _r23.val[0] = vcombine_s8(float2int8(_p4, _pc), float2int8(_p6, _pe));
                _r23.val[1] = vcombine_s8(float2int8(_p5, _pd), float2int8(_p7, _pf));

                vst2q_s8(pp, _r01);
                vst2q_s8(pp + 32, _r23);
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += 32;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x4x4_t _p = vld4_u16(p0);
                uint16x4x4_t _q = vld4_u16(p0 + B_hstep * 4);

                float32x4_t _p0 = vmulq_f32(vcvt_f32_f16((float16x4_t)_p.val[0]), _scale);
                float32x4_t _p1 = vmulq_f32(vcvt_f32_f16((float16x4_t)_p.val[1]), _scale);
                float32x4_t _p2 = vmulq_f32(vcvt_f32_f16((float16x4_t)_p.val[2]), _scale);
                float32x4_t _p3 = vmulq_f32(vcvt_f32_f16((float16x4_t)_p.val[3]), _scale);
                float32x4_t _p4 = vmulq_f32(vcvt_f32_f16((float16x4_t)_q.val[0]), _scale);
                float32x4_t _p5 = vmulq_f32(vcvt_f32_f16((float16x4_t)_q.val[1]), _scale);
                float32x4_t _p6 = vmulq_f32(vcvt_f32_f16((float16x4_t)_q.val[2]), _scale);
                float32x4_t _p7 = vmulq_f32(vcvt_f32_f16((float16x4_t)_q.val[3]), _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + B_hstep * 4);
                uint16x8_t _s = vld1q_u16(p0 + B_hstep * 4 + 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p4), float2int8(_p2, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p5), float2int8(_p3, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += 16;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + B_hstep * 4);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p2);
                _r01.val[1] = float2int8(_p1, _p3);

                vst2_s8(pp, _r01);

                pp += 16;
                p0 += 8;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 4));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + B_hstep);
                uint16x8_t _r = vld1q_u16(p0 + B_hstep * 2);
                uint16x8_t _s = vld1q_u16(p0 + B_hstep * 3);
                uint16x8_t _t = vld1q_u16(p0 + B_hstep * 4);
                uint16x8_t _u = vld1q_u16(p0 + B_hstep * 5);
                uint16x8_t _v = vld1q_u16(p0 + B_hstep * 6);
                uint16x8_t _w = vld1q_u16(p0 + B_hstep * 7);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));
                float32x4_t _p8 = vcvt_f32_f16((float16x4_t)vget_low_u16(_t));
                float32x4_t _p9 = vcvt_f32_f16((float16x4_t)vget_high_u16(_t));
                float32x4_t _pa = vcvt_f32_f16((float16x4_t)vget_low_u16(_u));
                float32x4_t _pb = vcvt_f32_f16((float16x4_t)vget_high_u16(_u));
                float32x4_t _pc = vcvt_f32_f16((float16x4_t)vget_low_u16(_v));
                float32x4_t _pd = vcvt_f32_f16((float16x4_t)vget_high_u16(_v));
                float32x4_t _pe = vcvt_f32_f16((float16x4_t)vget_low_u16(_w));
                float32x4_t _pf = vcvt_f32_f16((float16x4_t)vget_high_u16(_w));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);
                _p8 = vmulq_f32(_p8, _scale);
                _p9 = vmulq_f32(_p9, _scale);
                _pa = vmulq_f32(_pa, _scale);
                _pb = vmulq_f32(_pb, _scale);
                _pc = vmulq_f32(_pc, _scale);
                _pd = vmulq_f32(_pd, _scale);
                _pe = vmulq_f32(_pe, _scale);
                _pf = vmulq_f32(_pf, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p8, _pa);
                int8x8_t _r3 = float2int8(_pc, _pe);
                int8x8_t _r4 = float2int8(_p1, _p3);
                int8x8_t _r5 = float2int8(_p5, _p7);
                int8x8_t _r6 = float2int8(_p9, _pb);
                int8x8_t _r7 = float2int8(_pd, _pf);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p4, _p6));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p8, _pa));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_pc, _pe));
                int16x4_t _t4 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4_t _t5 = vreinterpret_s16_s8(float2int8(_p5, _p7));
                int16x4_t _t6 = vreinterpret_s16_s8(float2int8(_p9, _pb));
                int16x4_t _t7 = vreinterpret_s16_s8(float2int8(_pd, _pf));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int16x4x2_t _t45 = vuzp_s16(_t4, _t5);
                int16x4x2_t _t67 = vuzp_s16(_t6, _t7);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
                int8x8_t _r4 = vreinterpret_s8_s16(_t45.val[0]);
                int8x8_t _r5 = vreinterpret_s8_s16(_t67.val[0]);
                int8x8_t _r6 = vreinterpret_s8_s16(_t45.val[1]);
                int8x8_t _r7 = vreinterpret_s8_s16(_t67.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));

                pp += 64;
                p0 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 2));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 3));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 4));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 5));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 6));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 7));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p4, _p5));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p6, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep + 1], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep * 2 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 3 + 1], _p, 7);
                uint16x8_t _q = uint16x8_t();
                _q = vsetq_lane_u16(p0[B_hstep * 4], _q, 0);
                _q = vsetq_lane_u16(p0[B_hstep * 4 + 1], _q, 1);
                _q = vsetq_lane_u16(p0[B_hstep * 5], _q, 2);
                _q = vsetq_lane_u16(p0[B_hstep * 5 + 1], _q, 3);
                _q = vsetq_lane_u16(p0[B_hstep * 6], _q, 4);
                _q = vsetq_lane_u16(p0[B_hstep * 6 + 1], _q, 5);
                _q = vsetq_lane_u16(p0[B_hstep * 7], _q, 6);
                _q = vsetq_lane_u16(p0[B_hstep * 7 + 1], _q, 7);
                float32x4_t _p01 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p23 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p45 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p67 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);
                _p45 = vmulq_f32(_p45, _scale);
                _p67 = vmulq_f32(_p67, _scale);

                int8x8_t _r0 = float2int8(_p01, _p23);
                int8x8_t _r1 = float2int8(_p45, _p67);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 2;
            }
            for (; kk < max_kk; kk++)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[B_hstep], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep * 3], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep * 5], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 7], _p, 7);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);

                vst1_s8(pp, _r0);

                pp += 8;
                p0++;
            }
        }
    }
#endif // __aarch64__
    for (; jj + 3 < max_jj; jj += 4)
    {
        const unsigned short* p0 = (const unsigned short*)B + (j + jj) * B_hstep + k * elempack;

        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x8x4_t _p = vld4q_u16(p0);

                float32x4_t _p0 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_p.val[0])), _scale);
                float32x4_t _p1 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_p.val[1])), _scale);
                float32x4_t _p2 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_p.val[2])), _scale);
                float32x4_t _p3 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_low_u16(_p.val[3])), _scale);
                float32x4_t _p4 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_p.val[0])), _scale);
                float32x4_t _p5 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_p.val[1])), _scale);
                float32x4_t _p6 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_p.val[2])), _scale);
                float32x4_t _p7 = vmulq_f32(vcvt_f32_f16((float16x4_t)vget_high_u16(_p.val[3])), _scale);

#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p2), float2int8(_p4, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p3), float2int8(_p5, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += 32;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x4x4_t _p = vld4_u16(p0);

                float32x4_t _p0 = vmulq_f32(vcvt_f32_f16((float16x4_t)_p.val[0]), _scale);
                float32x4_t _p1 = vmulq_f32(vcvt_f32_f16((float16x4_t)_p.val[1]), _scale);
                float32x4_t _p2 = vmulq_f32(vcvt_f32_f16((float16x4_t)_p.val[2]), _scale);
                float32x4_t _p3 = vmulq_f32(vcvt_f32_f16((float16x4_t)_p.val[3]), _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p2);
                _r01.val[1] = float2int8(_p1, _p3);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += 16;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                float32x4x2_t _p01 = vzipq_f32(_p0, _p1);

                int8x8_t _r01 = float2int8(_p01.val[0], _p01.val[1]);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += 8;
            }
            for (; kk < max_kk; kk++)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                _p0 = vmulq_f32(_p0, _scale);
                int8x8_t _r0 = float2int8(_p0, _p0);

                pp[0] = vget_lane_s8(_r0, 0);
                pp[1] = vget_lane_s8(_r0, 1);
                pp[2] = vget_lane_s8(_r0, 2);
                pp[3] = vget_lane_s8(_r0, 3);

                pp += 4;
                p0 += 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + B_hstep);
                uint16x8_t _r = vld1q_u16(p0 + B_hstep * 2);
                uint16x8_t _s = vld1q_u16(p0 + B_hstep * 3);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p1, _p3);
                int8x8_t _r3 = float2int8(_p5, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p4, _p6));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p5, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 2));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 3));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep + 1], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep * 2 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 3 + 1], _p, 7);
                float32x4_t _p01 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p23 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);

                int8x8_t _r0 = float2int8(_p01, _p23);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 2;
            }
            for (; kk < max_kk; kk++)
            {
                uint16x4_t _p = uint16x4_t();
                _p = vset_lane_u16(p0[0], _p, 0);
                _p = vset_lane_u16(p0[B_hstep], _p, 1);
                _p = vset_lane_u16(p0[B_hstep * 2], _p, 2);
                _p = vset_lane_u16(p0[B_hstep * 3], _p, 3);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)_p);

                _p0 = vmulq_f32(_p0, _scale);
                int8x8_t _r0 = float2int8(_p0, _p0);

                pp[0] = vget_lane_s8(_r0, 0);
                pp[1] = vget_lane_s8(_r0, 1);
                pp[2] = vget_lane_s8(_r0, 2);
                pp[3] = vget_lane_s8(_r0, 3);

                pp += 4;
                p0++;
            }
        }
    }
#endif // __ARM_NEON
    for (; jj + 1 < max_jj; jj += 2)
    {
        const unsigned short* p0 = (const unsigned short*)B + (j + jj) * B_hstep + k;

        // if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + B_hstep);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p1, _p3);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p2));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p2));
                float32x4_t _t2 = vcombine_f32(vget_low_f32(_p1), vget_low_f32(_p3));
                float32x4_t _t3 = vcombine_f32(vget_high_f32(_p1), vget_high_f32(_p3));
                int8x8_t _r0 = float2int8(_t0, _t1);
                int8x8_t _r1 = float2int8(_t2, _t3);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r0);
                vst1_s8(pp + 8, _r1);

                pp += 16;
                p0 += 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p1));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p1));
                int8x8_t _r0 = float2int8(_t0, _t1);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = float2int8(float16_to_float32(p0[0]) * scale);
                pp[1] = float2int8(float16_to_float32(p0[1]) * scale);
                pp[2] = float2int8(float16_to_float32(p0[B_hstep]) * scale);
                pp[3] = float2int8(float16_to_float32(p0[B_hstep + 1]) * scale);
                pp += 4;
                p0 += 2;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(float16_to_float32(p0[0]) * scale);
                pp[1] = float2int8(float16_to_float32(p0[B_hstep]) * scale);
                pp += 2;
                p0++;
            }
        }
    }
    for (; jj < max_jj; jj += 1)
    {
        const unsigned short* p0 = (const unsigned short*)B + (j + jj) * B_hstep + k;

        // if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            for (; kk + 15 < max_kk; kk += 16)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += 8;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(float16_to_float32(p0[0]) * scale);
                pp += 1;
                p0++;
            }
        }
    }
}

static void transpose_pack_B_tile_fp16_to_int8(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, float scale)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_i8mm())
    {
        transpose_pack_B_tile_fp16_to_int8_i8mm(B, BT, j, max_jj, k, max_kk, scale);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        transpose_pack_B_tile_fp16_to_int8_asimddp(B, BT, j, max_jj, k, max_kk, scale);
        return;
    }
#endif

    const int elempack = B.elempack;
    const size_t B_hstep = B.dims == 3 ? B.cstep : (size_t)B.w;

    // NCNN_LOGE("transpose_pack_B_tile_fp16_to_int8 %d %d", max_jj, elempack);

    signed char* pp = BT;

#if __ARM_NEON
    float32x4_t _scale = vdupq_n_f32(scale);
#endif

    int jj = 0;
#if __ARM_NEON
#if __aarch64__
    for (; jj + 7 < max_jj; jj += 8)
    {
        const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * elempack;

#if __aarch64__
        if (elempack == 8)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                uint16x8_t _t = vld1q_u16(p0 + 32);
                uint16x8_t _u = vld1q_u16(p0 + 40);
                uint16x8_t _v = vld1q_u16(p0 + 48);
                uint16x8_t _w = vld1q_u16(p0 + 56);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));
                float32x4_t _p8 = vcvt_f32_f16((float16x4_t)vget_low_u16(_t));
                float32x4_t _p9 = vcvt_f32_f16((float16x4_t)vget_high_u16(_t));
                float32x4_t _pa = vcvt_f32_f16((float16x4_t)vget_low_u16(_u));
                float32x4_t _pb = vcvt_f32_f16((float16x4_t)vget_high_u16(_u));
                float32x4_t _pc = vcvt_f32_f16((float16x4_t)vget_low_u16(_v));
                float32x4_t _pd = vcvt_f32_f16((float16x4_t)vget_high_u16(_v));
                float32x4_t _pe = vcvt_f32_f16((float16x4_t)vget_low_u16(_w));
                float32x4_t _pf = vcvt_f32_f16((float16x4_t)vget_high_u16(_w));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);
                _p8 = vmulq_f32(_p8, _scale);
                _p9 = vmulq_f32(_p9, _scale);
                _pa = vmulq_f32(_pa, _scale);
                _pb = vmulq_f32(_pb, _scale);
                _pc = vmulq_f32(_pc, _scale);
                _pd = vmulq_f32(_pd, _scale);
                _pe = vmulq_f32(_pe, _scale);
                _pf = vmulq_f32(_pf, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p8, _pa);
                int8x8_t _r3 = float2int8(_pc, _pe);
                int8x8_t _r4 = float2int8(_p1, _p3);
                int8x8_t _r5 = float2int8(_p5, _p7);
                int8x8_t _r6 = float2int8(_p9, _pb);
                int8x8_t _r7 = float2int8(_pd, _pf);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p8, _pa);
                int8x8_t _r3 = float2int8(_pc, _pe);
                int8x8_t _r4 = float2int8(_p1, _p3);
                int8x8_t _r5 = float2int8(_p5, _p7);
                int8x8_t _r6 = float2int8(_p9, _pb);
                int8x8_t _r7 = float2int8(_pd, _pf);

                int16x8_t _r01 = vreinterpretq_s16_s8(vcombine_s8(_r0, _r1));
                int16x8_t _r23 = vreinterpretq_s16_s8(vcombine_s8(_r2, _r3));
                int16x8_t _r45 = vreinterpretq_s16_s8(vcombine_s8(_r4, _r5));
                int16x8_t _r67 = vreinterpretq_s16_s8(vcombine_s8(_r6, _r7));
                int16x8x2_t _rr0 = vuzpq_s16(_r01, _r23);
                int16x8x2_t _rr1 = vuzpq_s16(_r45, _r67);

                vst1q_s8(pp, vreinterpretq_s8_s16(_rr0.val[0]));
                vst1q_s8(pp + 16, vreinterpretq_s8_s16(_rr0.val[1]));
                vst1q_s8(pp + 32, vreinterpretq_s8_s16(_rr1.val[0]));
                vst1q_s8(pp + 48, vreinterpretq_s8_s16(_rr1.val[1]));
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += B_hstep * 8;
            }
        }
#endif // __aarch64__
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                uint16x8_t _t = vld1q_u16(p0 + B_hstep * 4);
                uint16x8_t _u = vld1q_u16(p0 + B_hstep * 4 + 8);
                uint16x8_t _v = vld1q_u16(p0 + B_hstep * 4 + 16);
                uint16x8_t _w = vld1q_u16(p0 + B_hstep * 4 + 24);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));
                float32x4_t _p8 = vcvt_f32_f16((float16x4_t)vget_low_u16(_t));
                float32x4_t _p9 = vcvt_f32_f16((float16x4_t)vget_high_u16(_t));
                float32x4_t _pa = vcvt_f32_f16((float16x4_t)vget_low_u16(_u));
                float32x4_t _pb = vcvt_f32_f16((float16x4_t)vget_high_u16(_u));
                float32x4_t _pc = vcvt_f32_f16((float16x4_t)vget_low_u16(_v));
                float32x4_t _pd = vcvt_f32_f16((float16x4_t)vget_high_u16(_v));
                float32x4_t _pe = vcvt_f32_f16((float16x4_t)vget_low_u16(_w));
                float32x4_t _pf = vcvt_f32_f16((float16x4_t)vget_high_u16(_w));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);
                _p8 = vmulq_f32(_p8, _scale);
                _p9 = vmulq_f32(_p9, _scale);
                _pa = vmulq_f32(_pa, _scale);
                _pb = vmulq_f32(_pb, _scale);
                _pc = vmulq_f32(_pc, _scale);
                _pd = vmulq_f32(_pd, _scale);
                _pe = vmulq_f32(_pe, _scale);
                _pf = vmulq_f32(_pf, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p8);
                int8x8_t _r1 = float2int8(_p1, _p9);
                int8x8_t _r2 = float2int8(_p2, _pa);
                int8x8_t _r3 = float2int8(_p3, _pb);
                int8x8_t _r4 = float2int8(_p4, _pc);
                int8x8_t _r5 = float2int8(_p5, _pd);
                int8x8_t _r6 = float2int8(_p6, _pe);
                int8x8_t _r7 = float2int8(_p7, _pf);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
                vst1q_s8(pp + 32, vcombine_s8(_r4, _r5));
                vst1q_s8(pp + 48, vcombine_s8(_r6, _r7));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

                int16x8_t _r01 = vreinterpretq_s16_s8(vcombine_s8(_r0, _r1));
                int16x8_t _r23 = vreinterpretq_s16_s8(vcombine_s8(_r2, _r3));
                int16x8_t _r45 = vreinterpretq_s16_s8(vcombine_s8(_r4, _r5));
                int16x8_t _r67 = vreinterpretq_s16_s8(vcombine_s8(_r6, _r7));
                int16x8x2_t _rr0 = vuzpq_s16(_r01, _r23);
                int16x8x2_t _rr1 = vuzpq_s16(_r45, _r67);

                vst1q_s8(pp, vreinterpretq_s8_s16(_rr0.val[0]));
                vst1q_s8(pp + 16, vreinterpretq_s8_s16(_rr0.val[1]));
                vst1q_s8(pp + 32, vreinterpretq_s8_s16(_rr1.val[0]));
                vst1q_s8(pp + 48, vreinterpretq_s8_s16(_rr1.val[1]));
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);

#if __ARM_FEATURE_DOTPROD
                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));
#else  // __ARM_FEATURE_DOTPROD
                int16x8_t _r01 = vreinterpretq_s16_s8(vcombine_s8(_r0, _r1));
                int16x8_t _r23 = vreinterpretq_s16_s8(vcombine_s8(_r2, _r3));
                int16x8x2_t _rr = vuzpq_s16(_r01, _r23);

                vst1q_s8(pp, vreinterpretq_s8_s16(_rr.val[0]));
                vst1q_s8(pp + 16, vreinterpretq_s8_s16(_rr.val[1]));
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += B_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + B_hstep);
                uint16x8_t _r = vld1q_u16(p0 + B_hstep * 2);
                uint16x8_t _s = vld1q_u16(p0 + B_hstep * 3);
                uint16x8_t _t = vld1q_u16(p0 + B_hstep * 4);
                uint16x8_t _u = vld1q_u16(p0 + B_hstep * 5);
                uint16x8_t _v = vld1q_u16(p0 + B_hstep * 6);
                uint16x8_t _w = vld1q_u16(p0 + B_hstep * 7);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));
                float32x4_t _p8 = vcvt_f32_f16((float16x4_t)vget_low_u16(_t));
                float32x4_t _p9 = vcvt_f32_f16((float16x4_t)vget_high_u16(_t));
                float32x4_t _pa = vcvt_f32_f16((float16x4_t)vget_low_u16(_u));
                float32x4_t _pb = vcvt_f32_f16((float16x4_t)vget_high_u16(_u));
                float32x4_t _pc = vcvt_f32_f16((float16x4_t)vget_low_u16(_v));
                float32x4_t _pd = vcvt_f32_f16((float16x4_t)vget_high_u16(_v));
                float32x4_t _pe = vcvt_f32_f16((float16x4_t)vget_low_u16(_w));
                float32x4_t _pf = vcvt_f32_f16((float16x4_t)vget_high_u16(_w));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);
                _p8 = vmulq_f32(_p8, _scale);
                _p9 = vmulq_f32(_p9, _scale);
                _pa = vmulq_f32(_pa, _scale);
                _pb = vmulq_f32(_pb, _scale);
                _pc = vmulq_f32(_pc, _scale);
                _pd = vmulq_f32(_pd, _scale);
                _pe = vmulq_f32(_pe, _scale);
                _pf = vmulq_f32(_pf, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
                int8x8_t _r4 = float2int8(_p8, _p9);
                int8x8_t _r5 = float2int8(_pa, _pb);
                int8x8_t _r6 = float2int8(_pc, _pd);
                int8x8_t _r7 = float2int8(_pe, _pf);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _r04 = vzip_s8(_r0, _r4);
                int8x8x2_t _r15 = vzip_s8(_r1, _r5);
                int8x8x2_t _r26 = vzip_s8(_r2, _r6);
                int8x8x2_t _r37 = vzip_s8(_r3, _r7);
                int8x16x4_t _r0123;
                _r0123.val[0] = vcombine_s8(_r04.val[0], _r04.val[1]);
                _r0123.val[1] = vcombine_s8(_r15.val[0], _r15.val[1]);
                _r0123.val[2] = vcombine_s8(_r26.val[0], _r26.val[1]);
                _r0123.val[3] = vcombine_s8(_r37.val[0], _r37.val[1]);

                vst4q_s8(pp, _r0123);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8x4_t _r0123;
                _r0123.val[0] = _r0;
                _r0123.val[1] = _r1;
                _r0123.val[2] = _r2;
                _r0123.val[3] = _r3;
                int8x8x4_t _r4567;
                _r4567.val[0] = _r4;
                _r4567.val[1] = _r5;
                _r4567.val[2] = _r6;
                _r4567.val[3] = _r7;

                vst4_s8(pp, _r0123);
                vst4_s8(pp + 32, _r4567);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(_r0, _r2);
                _r01.val[1] = vcombine_s8(_r1, _r3);
                int8x16x2_t _r23;
                _r23.val[0] = vcombine_s8(_r4, _r6);
                _r23.val[1] = vcombine_s8(_r5, _r7);

                vst2q_s8(pp, _r01);
                vst2q_s8(pp + 32, _r23);
#endif // __ARM_FEATURE_DOTPROD

                pp += 64;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + B_hstep);
                uint16x8_t _r = vld1q_u16(p0 + B_hstep * 2);
                uint16x8_t _s = vld1q_u16(p0 + B_hstep * 3);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p0, _p1);
                _r0123.val[1] = float2int8(_p2, _p3);
                _r0123.val[2] = float2int8(_p4, _p5);
                _r0123.val[3] = float2int8(_p6, _p7);

                vst4_s8(pp, _r0123);
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p1), float2int8(_p4, _p5));
                _r01.val[1] = vcombine_s8(float2int8(_p2, _p3), float2int8(_p6, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += B_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + B_hstep);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p1);
                _r01.val[1] = float2int8(_p2, _p3);

                vst2_s8(pp, _r01);

                pp += 16;
                p0 += B_hstep * 2;
            }
            for (; kk < max_kk; kk++)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r0 = float2int8(_p0, _p1);

                vst1_s8(pp, _r0);

                pp += 8;
                p0 += B_hstep;
            }
        }
    }
#endif // __aarch64__
    for (; jj + 3 < max_jj; jj += 4)
    {
        const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * elempack;

#if __aarch64__
        if (elempack == 8)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + 16);
                uint16x8_t _s = vld1q_u16(p0 + 24);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p4, _p6);
                int8x8_t _r2 = float2int8(_p1, _p3);
                int8x8_t _r3 = float2int8(_p5, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p4, _p6));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p5, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += B_hstep * 8;
            }
        }
#endif // __aarch64__
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                uint16x8_t _r = vld1q_u16(p0 + B_hstep * 4);
                uint16x8_t _s = vld1q_u16(p0 + B_hstep * 4 + 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_r));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_r));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_s));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_s));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p4);
                int8x8_t _r1 = float2int8(_p1, _p5);
                int8x8_t _r2 = float2int8(_p2, _p6);
                int8x8_t _r3 = float2int8(_p3, _p7);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
                int8x8_t _r2 = float2int8(_p4, _p5);
                int8x8_t _r3 = float2int8(_p6, _p7);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4_t _t2 = vreinterpret_s16_s8(float2int8(_p4, _p5));
                int16x4_t _t3 = vreinterpret_s16_s8(float2int8(_p6, _p7));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int16x4x2_t _t23 = vuzp_s16(_t2, _t3);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
                int8x8_t _r2 = vreinterpret_s8_s16(_t23.val[0]);
                int8x8_t _r3 = vreinterpret_s8_s16(_t23.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));
                vst1q_s8(pp + 16, vcombine_s8(_r2, _r3));

                pp += 32;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4x2_t _t01 = vuzp_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += B_hstep * 4;
            }
        }
        if (elempack == 1)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 2));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 3));
                float32x4_t _p4 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 4));
                float32x4_t _p5 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 5));
                float32x4_t _p6 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 6));
                float32x4_t _p7 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 7));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);
                _p4 = vmulq_f32(_p4, _scale);
                _p5 = vmulq_f32(_p5, _scale);
                _p6 = vmulq_f32(_p6, _scale);
                _p7 = vmulq_f32(_p7, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                float32x4x2_t _p04 = vzipq_f32(_p0, _p4);
                float32x4x2_t _p15 = vzipq_f32(_p1, _p5);
                float32x4x2_t _p26 = vzipq_f32(_p2, _p6);
                float32x4x2_t _p37 = vzipq_f32(_p3, _p7);
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p04.val[0], _p04.val[1]);
                _r0123.val[1] = float2int8(_p15.val[0], _p15.val[1]);
                _r0123.val[2] = float2int8(_p26.val[0], _p26.val[1]);
                _r0123.val[3] = float2int8(_p37.val[0], _p37.val[1]);

                vst4_s8(pp, _r0123);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8x4_t _r0123;
                _r0123.val[0] = float2int8(_p0, _p4);
                _r0123.val[1] = float2int8(_p1, _p5);
                _r0123.val[2] = float2int8(_p2, _p6);
                _r0123.val[3] = float2int8(_p3, _p7);

                vst4_s8(pp, _r0123);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int8x16x2_t _r01;
                _r01.val[0] = vcombine_s8(float2int8(_p0, _p2), float2int8(_p4, _p6));
                _r01.val[1] = vcombine_s8(float2int8(_p1, _p3), float2int8(_p5, _p7));

                vst2q_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 32;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 2));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 3));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
                transpose4x4_ps(_p0, _p1, _p2, _p3);
                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));
#else  // __ARM_FEATURE_DOTPROD
                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p0, _p2);
                _r01.val[1] = float2int8(_p1, _p3);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += B_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                float32x4x2_t _p01 = vzipq_f32(_p0, _p1);
                int8x8_t _r01 = float2int8(_p01.val[0], _p01.val[1]);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += B_hstep * 2;
            }
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(float16_to_float32(p0[0]) * scale);
                pp[1] = float2int8(float16_to_float32(p0[1]) * scale);
                pp[2] = float2int8(float16_to_float32(p0[2]) * scale);
                pp[3] = float2int8(float16_to_float32(p0[3]) * scale);
                pp += 4;
                p0 += B_hstep;
            }
        }
    }
#endif // __ARM_NEON
    for (; jj + 1 < max_jj; jj += 2)
    {
        const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * elempack;

#if __ARM_NEON
#if __aarch64__
        if (elempack == 8)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p1, _p3);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p1));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p2, _p3));
                int16x4x2_t _t01 = vzip_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += B_hstep * 8;
            }
        }
#endif // __aarch64__
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = vld1q_u16(p0);
                uint16x8_t _q = vld1q_u16(p0 + B_hstep * 4);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p2);
                int8x8_t _r1 = float2int8(_p1, _p3);
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8_t _r0 = float2int8(_p0, _p1);
                int8x8_t _r1 = float2int8(_p2, _p3);
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                int16x4_t _t0 = vreinterpret_s16_s8(float2int8(_p0, _p2));
                int16x4_t _t1 = vreinterpret_s16_s8(float2int8(_p1, _p3));
                int16x4x2_t _t01 = vzip_s16(_t0, _t1);
                int8x8_t _r0 = vreinterpret_s8_s16(_t01.val[0]);
                int8x8_t _r1 = vreinterpret_s8_s16(_t01.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1q_s8(pp, vcombine_s8(_r0, _r1));

                pp += 16;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                uint16x8_t _p = vld1q_u16(p0);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

#if __ARM_FEATURE_DOTPROD
                int8x8_t _r01 = float2int8(_p0, _p1);
#else  // __ARM_FEATURE_DOTPROD
                float32x4_t _t0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p1));
                float32x4_t _t1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p1));
                int8x8_t _r01 = float2int8(_t0, _t1);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += B_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            for (; kk + 7 < max_kk; kk += 8)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep + 1], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep * 2 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 3 + 1], _p, 7);
                uint16x8_t _q = uint16x8_t();
                _q = vsetq_lane_u16(p0[B_hstep * 4], _q, 0);
                _q = vsetq_lane_u16(p0[B_hstep * 4 + 1], _q, 1);
                _q = vsetq_lane_u16(p0[B_hstep * 5], _q, 2);
                _q = vsetq_lane_u16(p0[B_hstep * 5 + 1], _q, 3);
                _q = vsetq_lane_u16(p0[B_hstep * 6], _q, 4);
                _q = vsetq_lane_u16(p0[B_hstep * 6 + 1], _q, 5);
                _q = vsetq_lane_u16(p0[B_hstep * 7], _q, 6);
                _q = vsetq_lane_u16(p0[B_hstep * 7 + 1], _q, 7);
                float32x4_t _p01 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p23 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p45 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p67 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);
                _p45 = vmulq_f32(_p45, _scale);
                _p67 = vmulq_f32(_p67, _scale);

                int8x8_t _r0 = float2int8(_p01, _p23);
                int8x8_t _r1 = float2int8(_p45, _p67);

#if __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _r01 = vuzp_s8(_r0, _r1);

                vst1q_s8(pp, vcombine_s8(_r01.val[0], _r01.val[1]));
#else  // __ARM_FEATURE_MATMUL_INT8
                int8x8x2_t _r01 = vtrn_s8(_r0, _r1);
                int8x8x2_t _rr01 = vuzp_s8(_r01.val[0], _r01.val[1]);

                vst1q_s8(pp, vcombine_s8(_rr01.val[0], _rr01.val[1]));
#endif // __ARM_FEATURE_MATMUL_INT8
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep * 2 + 1], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep * 4 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 6 + 1], _p, 7);
                uint16x8_t _q = uint16x8_t();
                _q = vsetq_lane_u16(p0[B_hstep], _q, 0);
                _q = vsetq_lane_u16(p0[B_hstep + 1], _q, 1);
                _q = vsetq_lane_u16(p0[B_hstep * 3], _q, 2);
                _q = vsetq_lane_u16(p0[B_hstep * 3 + 1], _q, 3);
                _q = vsetq_lane_u16(p0[B_hstep * 5], _q, 4);
                _q = vsetq_lane_u16(p0[B_hstep * 5 + 1], _q, 5);
                _q = vsetq_lane_u16(p0[B_hstep * 7], _q, 6);
                _q = vsetq_lane_u16(p0[B_hstep * 7 + 1], _q, 7);
                float32x4_t _p02 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p46 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p13 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p57 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p02 = vmulq_f32(_p02, _scale);
                _p46 = vmulq_f32(_p46, _scale);
                _p13 = vmulq_f32(_p13, _scale);
                _p57 = vmulq_f32(_p57, _scale);

                int8x8x2_t _r01;
                _r01.val[0] = float2int8(_p02, _p46);
                _r01.val[1] = float2int8(_p13, _p57);

                vst2_s8(pp, _r01);
#endif // __ARM_FEATURE_DOTPROD

                pp += 16;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
#if __ARM_FEATURE_DOTPROD
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep + 1], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep * 2 + 1], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 3 + 1], _p, 7);
                float32x4_t _p01 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p23 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p01 = vmulq_f32(_p01, _scale);
                _p23 = vmulq_f32(_p23, _scale);

                float32x4x2_t _pp = vuzpq_f32(_p01, _p23);
                int8x8_t _r01 = float2int8(_pp.val[0], _pp.val[1]);
#else  // __ARM_FEATURE_DOTPROD
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[1], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep * 2 + 1], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep + 1], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 3], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 3 + 1], _p, 7);
                float32x4_t _p02 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p13 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p02 = vmulq_f32(_p02, _scale);
                _p13 = vmulq_f32(_p13, _scale);

                float32x4x2_t _pp = vzipq_f32(_p02, _p13);
                int8x8_t _r01 = float2int8(_pp.val[0], _pp.val[1]);
#endif // __ARM_FEATURE_DOTPROD

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += B_hstep * 4;
            }
            for (; kk + 1 < max_kk; kk += 2)
            {
                pp[0] = float2int8(float16_to_float32(p0[0]) * scale);
                pp[1] = float2int8(float16_to_float32(p0[B_hstep + 0]) * scale);
                pp[2] = float2int8(float16_to_float32(p0[1]) * scale);
                pp[3] = float2int8(float16_to_float32(p0[B_hstep + 1]) * scale);
                pp += 4;
                p0 += B_hstep * 2;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(float16_to_float32(p0[0]) * scale);
                pp[1] = float2int8(float16_to_float32(p0[1]) * scale);
                pp += 2;
                p0 += B_hstep;
            }
        }
    }
    for (; jj < max_jj; jj += 1)
    {
        const unsigned short* p0 = (const unsigned short*)B + k * B_hstep + (j + jj) * elempack;

#if __ARM_NEON
#if __aarch64__
        if (elempack == 8)
        {
            int kk = 0;
            for (; kk + 15 < max_kk; kk += 16)
            {
                uint16x8_t _p01 = vld1q_u16(p0);
                uint16x8_t _p23 = vld1q_u16(p0 + B_hstep * 8);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p01));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p01));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p23));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p23));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));

                pp += 16;
                p0 += B_hstep * 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p01 = vld1q_u16(p0);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p01));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p01));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);
                pp += 8;
                p0 += B_hstep * 8;
            }
        }
#endif // __aarch64__
        if (elempack == 4)
        {
            int kk = 0;
            for (; kk + 15 < max_kk; kk += 16)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 4));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 8));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 12));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));

                pp += 16;
                p0 += B_hstep * 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vld1_u16(p0));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vld1_u16(p0 + B_hstep * 4));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += B_hstep * 8;
            }
            for (; kk + 3 < max_kk; kk += 4)
            {
                pp[0] = float2int8(float16_to_float32(p0[0]) * scale);
                pp[1] = float2int8(float16_to_float32(p0[1]) * scale);
                pp[2] = float2int8(float16_to_float32(p0[2]) * scale);
                pp[3] = float2int8(float16_to_float32(p0[3]) * scale);
                pp += 4;
                p0 += B_hstep * 4;
            }
        }
#endif // __ARM_NEON
        if (elempack == 1)
        {
            int kk = 0;
#if __ARM_NEON
            for (; kk + 15 < max_kk; kk += 16)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[B_hstep], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep * 3], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep * 5], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 7], _p, 7);
                uint16x8_t _q = uint16x8_t();
                _q = vsetq_lane_u16(p0[B_hstep * 8], _q, 0);
                _q = vsetq_lane_u16(p0[B_hstep * 9], _q, 1);
                _q = vsetq_lane_u16(p0[B_hstep * 10], _q, 2);
                _q = vsetq_lane_u16(p0[B_hstep * 11], _q, 3);
                _q = vsetq_lane_u16(p0[B_hstep * 12], _q, 4);
                _q = vsetq_lane_u16(p0[B_hstep * 13], _q, 5);
                _q = vsetq_lane_u16(p0[B_hstep * 14], _q, 6);
                _q = vsetq_lane_u16(p0[B_hstep * 15], _q, 7);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));
                float32x4_t _p2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_q));
                float32x4_t _p3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_q));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);
                _p2 = vmulq_f32(_p2, _scale);
                _p3 = vmulq_f32(_p3, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);
                int8x8_t _r23 = float2int8(_p2, _p3);

                vst1q_s8(pp, vcombine_s8(_r01, _r23));

                pp += 16;
                p0 += B_hstep * 16;
            }
            for (; kk + 7 < max_kk; kk += 8)
            {
                uint16x8_t _p = uint16x8_t();
                _p = vsetq_lane_u16(p0[0], _p, 0);
                _p = vsetq_lane_u16(p0[B_hstep], _p, 1);
                _p = vsetq_lane_u16(p0[B_hstep * 2], _p, 2);
                _p = vsetq_lane_u16(p0[B_hstep * 3], _p, 3);
                _p = vsetq_lane_u16(p0[B_hstep * 4], _p, 4);
                _p = vsetq_lane_u16(p0[B_hstep * 5], _p, 5);
                _p = vsetq_lane_u16(p0[B_hstep * 6], _p, 6);
                _p = vsetq_lane_u16(p0[B_hstep * 7], _p, 7);
                float32x4_t _p0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_p));
                float32x4_t _p1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_p));

                _p0 = vmulq_f32(_p0, _scale);
                _p1 = vmulq_f32(_p1, _scale);

                int8x8_t _r01 = float2int8(_p0, _p1);

                vst1_s8(pp, _r01);

                pp += 8;
                p0 += B_hstep * 8;
            }
#endif // __ARM_NEON
            for (; kk < max_kk; kk++)
            {
                pp[0] = float2int8(float16_to_float32(p0[0]) * scale);
                pp += 1;
                p0 += B_hstep;
            }
        }
    }
}

static void unpack_output_tile_int32_to_fp16(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        unpack_output_tile_int32_to_fp16_asimddp(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
        return;
    }
#endif

    const int out_elempack = top_blob.elempack;
    const size_t out_hstep = top_blob.dims == 3 ? top_blob.cstep : (size_t)top_blob.w;

    const size_t c_hstep = C.dims == 3 ? C.cstep : (size_t)C.w;
    const int c_elempack = C.elempack;
    const unsigned short* pC = C;

    // NCNN_LOGE("unpack_output_tile_int32_to_fp16  %d %d %d %d  %d  %d  %d", i, max_ii, j, max_jj, out_elempack, broadcast_type_C, c_elempack);

    const int* pp = topT;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii);
        float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4);

        float32x4_t _c0;
        float32x4_t _c1;
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                _c0 = vdupq_n_f32(float16_to_float32(pC[0]) * beta);
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const unsigned short*)C + i + ii;
                uint16x8_t _c = vld1q_u16(pC);
                _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c));
                _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c));
                _c0 = vmulq_n_f32(_c0, beta);
                _c1 = vmulq_n_f32(_c1, beta);
            }
            if (broadcast_type_C == 3)
            {
                pC = (const unsigned short*)C + (i + ii) * c_hstep + j * c_elempack;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const unsigned short*)C + j;
            }
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);
            int32x4_t _sum8 = vld1q_s32(pp + 32);
            int32x4_t _sum9 = vld1q_s32(pp + 36);
            int32x4_t _suma = vld1q_s32(pp + 40);
            int32x4_t _sumb = vld1q_s32(pp + 44);
            int32x4_t _sumc = vld1q_s32(pp + 48);
            int32x4_t _sumd = vld1q_s32(pp + 52);
            int32x4_t _sume = vld1q_s32(pp + 56);
            int32x4_t _sumf = vld1q_s32(pp + 60);

#if __ARM_FEATURE_DOTPROD
            // from/to (source and destination layouts are identical; no reordering is done in this branch)
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            //      e4 f4 g4 h4
            //      e5 f5 g5 h5
            //      e6 f6 g6 h6
            //      e7 f7 g7 h7
#else
            // from
            //      a0 b1 c2 d3
            //      e4 f5 g6 h7
            //      e0 f1 g2 h3
            //      a4 b5 c6 d7
            //      c0 d1 a2 b3
            //      g4 h5 e6 f7
            //      g0 h1 e2 f3
            //      c4 d5 a6 b7
            //      a3 b2 c1 d0
            //      e7 f6 g5 h4
            //      e3 f2 g1 h0
            //      a7 b6 c5 d4
            //      c3 d2 a1 b0
            //      g7 h6 e5 f4
            //      g3 h2 e1 f0
            //      c7 d6 a5 b4

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            //      e4 f4 g4 h4
            //      e5 f5 g5 h5
            //      e6 f6 g6 h6
            //      e7 f7 g7 h7
            {
                _sum8 = vrev64q_s32(_sum8);
                _sum9 = vrev64q_s32(_sum9);
                _suma = vrev64q_s32(_suma);
                _sumb = vrev64q_s32(_sumb);
                _sumc = vrev64q_s32(_sumc);
                _sumd = vrev64q_s32(_sumd);
                _sume = vrev64q_s32(_sume);
                _sumf = vrev64q_s32(_sumf);
                _sum8 = vextq_s32(_sum8, _sum8, 2);
                _sum9 = vextq_s32(_sum9, _sum9, 2);
                _suma = vextq_s32(_suma, _suma, 2);
                _sumb = vextq_s32(_sumb, _sumb, 2);
                _sumc = vextq_s32(_sumc, _sumc, 2);
                _sumd = vextq_s32(_sumd, _sumd, 2);
                _sume = vextq_s32(_sume, _sume, 2);
                _sumf = vextq_s32(_sumf, _sumf, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sumc);
                int32x4x2_t _t1 = vzipq_s32(_sum4, _sum8);
                int32x4x2_t _t2 = vzipq_s32(_sum2, _sume);
                int32x4x2_t _t3 = vzipq_s32(_sum6, _suma);
                int32x4x2_t _t4 = vzipq_s32(_sum3, _sumf);
                int32x4x2_t _t5 = vzipq_s32(_sum7, _sumb);
                int32x4x2_t _t6 = vzipq_s32(_sum1, _sumd);
                int32x4x2_t _t7 = vzipq_s32(_sum5, _sum9);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum8 = vcombine_s32(vget_low_s32(_t4.val[0]), vget_low_s32(_t5.val[0]));
                _sum9 = vcombine_s32(vget_high_s32(_t4.val[0]), vget_high_s32(_t5.val[0]));
                _suma = vcombine_s32(vget_low_s32(_t5.val[1]), vget_low_s32(_t4.val[1]));
                _sumb = vcombine_s32(vget_high_s32(_t5.val[1]), vget_high_s32(_t4.val[1]));
                _sumc = vcombine_s32(vget_low_s32(_t6.val[0]), vget_low_s32(_t7.val[0]));
                _sumd = vcombine_s32(vget_high_s32(_t6.val[0]), vget_high_s32(_t7.val[0]));
                _sume = vcombine_s32(vget_low_s32(_t7.val[1]), vget_low_s32(_t6.val[1]));
                _sumf = vcombine_s32(vget_high_s32(_t7.val[1]), vget_high_s32(_t6.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
                _sum9 = vrev64q_s32(_sum9);
                _sumb = vrev64q_s32(_sumb);
                _sumd = vrev64q_s32(_sumd);
                _sumf = vrev64q_s32(_sumf);
            }
#endif

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale0);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale0);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum8), _descale0);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum9), _descale0);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_suma), _descale0);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sumb), _descale0);
            float32x4_t _f8 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale1);
            float32x4_t _f9 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale1);
            float32x4_t _fa = vmulq_f32(vcvtq_f32_s32(_sum6), _descale1);
            float32x4_t _fb = vmulq_f32(vcvtq_f32_s32(_sum7), _descale1);
            float32x4_t _fc = vmulq_f32(vcvtq_f32_s32(_sumc), _descale1);
            float32x4_t _fd = vmulq_f32(vcvtq_f32_s32(_sumd), _descale1);
            float32x4_t _fe = vmulq_f32(vcvtq_f32_s32(_sume), _descale1);
            float32x4_t _ff = vmulq_f32(vcvtq_f32_s32(_sumf), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                    _f8 = vaddq_f32(_f8, _c0);
                    _f9 = vaddq_f32(_f9, _c0);
                    _fa = vaddq_f32(_fa, _c0);
                    _fb = vaddq_f32(_fb, _c0);
                    _fc = vaddq_f32(_fc, _c0);
                    _fd = vaddq_f32(_fd, _c0);
                    _fe = vaddq_f32(_fe, _c0);
                    _ff = vaddq_f32(_ff, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                    _f8 = vaddq_f32(_f8, _c1);
                    _f9 = vaddq_f32(_f9, _c1);
                    _fa = vaddq_f32(_fa, _c1);
                    _fb = vaddq_f32(_fb, _c1);
                    _fc = vaddq_f32(_fc, _c1);
                    _fd = vaddq_f32(_fd, _c1);
                    _fe = vaddq_f32(_fe, _c1);
                    _ff = vaddq_f32(_ff, _c1);
                }
                if (broadcast_type_C == 3)
                {
#if __aarch64__
                    if (c_elempack == 8)
                    {
                        uint16x8_t _c08 = vld1q_u16(pC);
                        uint16x8_t _c19 = vld1q_u16(pC + 8);
                        uint16x8_t _c2a = vld1q_u16(pC + 16);
                        uint16x8_t _c3b = vld1q_u16(pC + 24);
                        uint16x8_t _c4c = vld1q_u16(pC + 32);
                        uint16x8_t _c5d = vld1q_u16(pC + 40);
                        uint16x8_t _c6e = vld1q_u16(pC + 48);
                        uint16x8_t _c7f = vld1q_u16(pC + 56);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c08));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c19));
                        float32x4_t _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c2a));
                        float32x4_t _c3 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c3b));
                        float32x4_t _c4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c4c));
                        float32x4_t _c5 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c5d));
                        float32x4_t _c6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c6e));
                        float32x4_t _c7 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c7f));
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                            _f4 = vaddq_f32(_f4, _c4);
                            _f5 = vaddq_f32(_f5, _c5);
                            _f6 = vaddq_f32(_f6, _c6);
                            _f7 = vaddq_f32(_f7, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                            _f4 = vmlaq_f32(_f4, _c4, _beta);
                            _f5 = vmlaq_f32(_f5, _c5, _beta);
                            _f6 = vmlaq_f32(_f6, _c6, _beta);
                            _f7 = vmlaq_f32(_f7, _c7, _beta);
                        }
                        _c0 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c08));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c19));
                        _c2 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c2a));
                        _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c3b));
                        _c4 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c4c));
                        _c5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c5d));
                        _c6 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c6e));
                        _c7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c7f));
                        if (beta == 1.f)
                        {
                            _f8 = vaddq_f32(_f8, _c0);
                            _f9 = vaddq_f32(_f9, _c1);
                            _fa = vaddq_f32(_fa, _c2);
                            _fb = vaddq_f32(_fb, _c3);
                            _fc = vaddq_f32(_fc, _c4);
                            _fd = vaddq_f32(_fd, _c5);
                            _fe = vaddq_f32(_fe, _c6);
                            _ff = vaddq_f32(_ff, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f8 = vmlaq_f32(_f8, _c0, _beta);
                            _f9 = vmlaq_f32(_f9, _c1, _beta);
                            _fa = vmlaq_f32(_fa, _c2, _beta);
                            _fb = vmlaq_f32(_fb, _c3, _beta);
                            _fc = vmlaq_f32(_fc, _c4, _beta);
                            _fd = vmlaq_f32(_fd, _c5, _beta);
                            _fe = vmlaq_f32(_fe, _c6, _beta);
                            _ff = vmlaq_f32(_ff, _c7, _beta);
                        }
                        pC += 64;
                    }
#endif // __aarch64__
                    if (c_elempack == 4)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + 8);
                        uint16x8_t _c45 = vld1q_u16(pC + 16);
                        uint16x8_t _c67 = vld1q_u16(pC + 24);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        float32x4_t _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        float32x4_t _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        float32x4_t _c4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c45));
                        float32x4_t _c5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c45));
                        float32x4_t _c6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c67));
                        float32x4_t _c7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c67));
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                            _f4 = vaddq_f32(_f4, _c4);
                            _f5 = vaddq_f32(_f5, _c5);
                            _f6 = vaddq_f32(_f6, _c6);
                            _f7 = vaddq_f32(_f7, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                            _f4 = vmlaq_f32(_f4, _c4, _beta);
                            _f5 = vmlaq_f32(_f5, _c5, _beta);
                            _f6 = vmlaq_f32(_f6, _c6, _beta);
                            _f7 = vmlaq_f32(_f7, _c7, _beta);
                        }
                        _c01 = vld1q_u16(pC + c_hstep * 4);
                        _c23 = vld1q_u16(pC + c_hstep * 4 + 8);
                        _c45 = vld1q_u16(pC + c_hstep * 4 + 16);
                        _c67 = vld1q_u16(pC + c_hstep * 4 + 24);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        _c4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c45));
                        _c5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c45));
                        _c6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c67));
                        _c7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c67));
                        if (beta == 1.f)
                        {
                            _f8 = vaddq_f32(_f8, _c0);
                            _f9 = vaddq_f32(_f9, _c1);
                            _fa = vaddq_f32(_fa, _c2);
                            _fb = vaddq_f32(_fb, _c3);
                            _fc = vaddq_f32(_fc, _c4);
                            _fd = vaddq_f32(_fd, _c5);
                            _fe = vaddq_f32(_fe, _c6);
                            _ff = vaddq_f32(_ff, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f8 = vmlaq_f32(_f8, _c0, _beta);
                            _f9 = vmlaq_f32(_f9, _c1, _beta);
                            _fa = vmlaq_f32(_fa, _c2, _beta);
                            _fb = vmlaq_f32(_fb, _c3, _beta);
                            _fc = vmlaq_f32(_fc, _c4, _beta);
                            _fd = vmlaq_f32(_fd, _c5, _beta);
                            _fe = vmlaq_f32(_fe, _c6, _beta);
                            _ff = vmlaq_f32(_ff, _c7, _beta);
                        }
                        pC += 32;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + c_hstep);
                        uint16x8_t _c45 = vld1q_u16(pC + c_hstep * 2);
                        uint16x8_t _c67 = vld1q_u16(pC + c_hstep * 3);
                        transpose8x4_u16(_c01, _c23, _c45, _c67);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        float32x4_t _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        float32x4_t _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        float32x4_t _c4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c45));
                        float32x4_t _c5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c45));
                        float32x4_t _c6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c67));
                        float32x4_t _c7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c67));
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                            _f4 = vaddq_f32(_f4, _c4);
                            _f5 = vaddq_f32(_f5, _c5);
                            _f6 = vaddq_f32(_f6, _c6);
                            _f7 = vaddq_f32(_f7, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                            _f4 = vmlaq_f32(_f4, _c4, _beta);
                            _f5 = vmlaq_f32(_f5, _c5, _beta);
                            _f6 = vmlaq_f32(_f6, _c6, _beta);
                            _f7 = vmlaq_f32(_f7, _c7, _beta);
                        }
                        _c01 = vld1q_u16(pC + c_hstep * 4);
                        _c23 = vld1q_u16(pC + c_hstep * 5);
                        _c45 = vld1q_u16(pC + c_hstep * 6);
                        _c67 = vld1q_u16(pC + c_hstep * 7);
                        transpose8x4_u16(_c01, _c23, _c45, _c67);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        _c4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c45));
                        _c5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c45));
                        _c6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c67));
                        _c7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c67));
                        if (beta == 1.f)
                        {
                            _f8 = vaddq_f32(_f8, _c0);
                            _f9 = vaddq_f32(_f9, _c1);
                            _fa = vaddq_f32(_fa, _c2);
                            _fb = vaddq_f32(_fb, _c3);
                            _fc = vaddq_f32(_fc, _c4);
                            _fd = vaddq_f32(_fd, _c5);
                            _fe = vaddq_f32(_fe, _c6);
                            _ff = vaddq_f32(_ff, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f8 = vmlaq_f32(_f8, _c0, _beta);
                            _f9 = vmlaq_f32(_f9, _c1, _beta);
                            _fa = vmlaq_f32(_fa, _c2, _beta);
                            _fb = vmlaq_f32(_fb, _c3, _beta);
                            _fc = vmlaq_f32(_fc, _c4, _beta);
                            _fd = vmlaq_f32(_fd, _c5, _beta);
                            _fe = vmlaq_f32(_fe, _c6, _beta);
                            _ff = vmlaq_f32(_ff, _c7, _beta);
                        }
                        pC += 8;
                    }
                }
                if (broadcast_type_C == 4)
                {
                    uint16x8_t _cc = vld1q_u16(pC);
                    float32x4_t _cc0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_cc));
                    float32x4_t _cc1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_cc));
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _cc0 = vmulq_f32(_cc0, _beta);
                        _cc1 = vmulq_f32(_cc1, _beta);
                    }
                    _c0 = vdupq_laneq_f32(_cc0, 0);
                    _c1 = vdupq_laneq_f32(_cc0, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_cc0, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_cc0, 3);
                    float32x4_t _c4 = vdupq_laneq_f32(_cc1, 0);
                    float32x4_t _c5 = vdupq_laneq_f32(_cc1, 1);
                    float32x4_t _c6 = vdupq_laneq_f32(_cc1, 2);
                    float32x4_t _c7 = vdupq_laneq_f32(_cc1, 3);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c4);
                    _f5 = vaddq_f32(_f5, _c5);
                    _f6 = vaddq_f32(_f6, _c6);
                    _f7 = vaddq_f32(_f7, _c7);
                    _f8 = vaddq_f32(_f8, _c0);
                    _f9 = vaddq_f32(_f9, _c1);
                    _fa = vaddq_f32(_fa, _c2);
                    _fb = vaddq_f32(_fb, _c3);
                    _fc = vaddq_f32(_fc, _c4);
                    _fd = vaddq_f32(_fd, _c5);
                    _fe = vaddq_f32(_fe, _c6);
                    _ff = vaddq_f32(_ff, _c7);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
                _f8 = vmulq_f32(_f8, _alpha);
                _f9 = vmulq_f32(_f9, _alpha);
                _fa = vmulq_f32(_fa, _alpha);
                _fb = vmulq_f32(_fb, _alpha);
                _fc = vmulq_f32(_fc, _alpha);
                _fd = vmulq_f32(_fd, _alpha);
                _fe = vmulq_f32(_fe, _alpha);
                _ff = vmulq_f32(_ff, _alpha);
            }

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);
            uint16x4_t _hf1 = (uint16x4_t)vcvt_f16_f32(_f1);
            uint16x4_t _hf2 = (uint16x4_t)vcvt_f16_f32(_f2);
            uint16x4_t _hf3 = (uint16x4_t)vcvt_f16_f32(_f3);
            uint16x4_t _hf4 = (uint16x4_t)vcvt_f16_f32(_f4);
            uint16x4_t _hf5 = (uint16x4_t)vcvt_f16_f32(_f5);
            uint16x4_t _hf6 = (uint16x4_t)vcvt_f16_f32(_f6);
            uint16x4_t _hf7 = (uint16x4_t)vcvt_f16_f32(_f7);
            uint16x4_t _hf8 = (uint16x4_t)vcvt_f16_f32(_f8);
            uint16x4_t _hf9 = (uint16x4_t)vcvt_f16_f32(_f9);
            uint16x4_t _hfa = (uint16x4_t)vcvt_f16_f32(_fa);
            uint16x4_t _hfb = (uint16x4_t)vcvt_f16_f32(_fb);
            uint16x4_t _hfc = (uint16x4_t)vcvt_f16_f32(_fc);
            uint16x4_t _hfd = (uint16x4_t)vcvt_f16_f32(_fd);
            uint16x4_t _hfe = (uint16x4_t)vcvt_f16_f32(_fe);
            uint16x4_t _hff = (uint16x4_t)vcvt_f16_f32(_ff);

#if __aarch64__
            if (out_elempack == 8)
            {
                vst1q_u16(p0, vcombine_u16(_hf0, _hf8));
                vst1q_u16(p0 + 8, vcombine_u16(_hf1, _hf9));
                vst1q_u16(p0 + 16, vcombine_u16(_hf2, _hfa));
                vst1q_u16(p0 + 24, vcombine_u16(_hf3, _hfb));
                vst1q_u16(p0 + 32, vcombine_u16(_hf4, _hfc));
                vst1q_u16(p0 + 40, vcombine_u16(_hf5, _hfd));
                vst1q_u16(p0 + 48, vcombine_u16(_hf6, _hfe));
                vst1q_u16(p0 + 56, vcombine_u16(_hf7, _hff));
                p0 += 64;
            }
#endif // __aarch64__
            if (out_elempack == 4)
            {
                vst1q_u16(p0, vcombine_u16(_hf0, _hf1));
                vst1q_u16(p0 + 8, vcombine_u16(_hf2, _hf3));
                vst1q_u16(p0 + 16, vcombine_u16(_hf4, _hf5));
                vst1q_u16(p0 + 24, vcombine_u16(_hf6, _hf7));
                vst1q_u16(p0 + out_hstep * 4, vcombine_u16(_hf8, _hf9));
                vst1q_u16(p0 + out_hstep * 4 + 8, vcombine_u16(_hfa, _hfb));
                vst1q_u16(p0 + out_hstep * 4 + 16, vcombine_u16(_hfc, _hfd));
                vst1q_u16(p0 + out_hstep * 4 + 24, vcombine_u16(_hfe, _hff));
                p0 += 32;
            }
            if (out_elempack == 1)
            {
                transpose4x4_u16(_hf0, _hf1, _hf2, _hf3);
                transpose4x4_u16(_hf4, _hf5, _hf6, _hf7);
                vst1q_u16(p0, vcombine_u16(_hf0, _hf4));
                vst1q_u16(p0 + out_hstep, vcombine_u16(_hf1, _hf5));
                vst1q_u16(p0 + out_hstep * 2, vcombine_u16(_hf2, _hf6));
                vst1q_u16(p0 + out_hstep * 3, vcombine_u16(_hf3, _hf7));
                transpose4x4_u16(_hf8, _hf9, _hfa, _hfb);
                transpose4x4_u16(_hfc, _hfd, _hfe, _hff);
                vst1q_u16(p0 + out_hstep * 4, vcombine_u16(_hf8, _hfc));
                vst1q_u16(p0 + out_hstep * 5, vcombine_u16(_hf9, _hfd));
                vst1q_u16(p0 + out_hstep * 6, vcombine_u16(_hfa, _hfe));
                vst1q_u16(p0 + out_hstep * 7, vcombine_u16(_hfb, _hff));
                p0 += 8;
            }

            pp += 64;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);

#if __ARM_FEATURE_DOTPROD
            // from/to (the dotprod path needs no shuffle: the layout is the same before and after)
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
#else
            // from
            //      a0 b1 c2 d3
            //      e0 f1 g2 h3
            //      c0 d1 a2 b3
            //      g0 h1 e2 f3
            //      a3 b2 c1 d0
            //      e3 f2 g1 h0
            //      c3 d2 a1 b0
            //      g3 h2 e1 f0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
            {
                _sum4 = vrev64q_s32(_sum4);
                _sum5 = vrev64q_s32(_sum5);
                _sum6 = vrev64q_s32(_sum6);
                _sum7 = vrev64q_s32(_sum7);
                _sum4 = vextq_s32(_sum4, _sum4, 2);
                _sum5 = vextq_s32(_sum5, _sum5, 2);
                _sum6 = vextq_s32(_sum6, _sum6, 2);
                _sum7 = vextq_s32(_sum7, _sum7, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale0);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale0);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale1);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale1);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_sum6), _descale1);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sum7), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c1);
                    _f5 = vaddq_f32(_f5, _c1);
                    _f6 = vaddq_f32(_f6, _c1);
                    _f7 = vaddq_f32(_f7, _c1);
                }
                if (broadcast_type_C == 3)
                {
#if __aarch64__
                    if (c_elempack == 8)
                    {
                        uint16x8_t _c04 = vld1q_u16(pC);
                        uint16x8_t _c15 = vld1q_u16(pC + 8);
                        uint16x8_t _c26 = vld1q_u16(pC + 16);
                        uint16x8_t _c37 = vld1q_u16(pC + 24);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c04));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c15));
                        float32x4_t _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c26));
                        float32x4_t _c3 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c37));
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                        }
                        _c0 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c04));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c15));
                        _c2 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c26));
                        _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c37));
                        if (beta == 1.f)
                        {
                            _f4 = vaddq_f32(_f4, _c0);
                            _f5 = vaddq_f32(_f5, _c1);
                            _f6 = vaddq_f32(_f6, _c2);
                            _f7 = vaddq_f32(_f7, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f4 = vmlaq_f32(_f4, _c0, _beta);
                            _f5 = vmlaq_f32(_f5, _c1, _beta);
                            _f6 = vmlaq_f32(_f6, _c2, _beta);
                            _f7 = vmlaq_f32(_f7, _c3, _beta);
                        }
                        pC += 32;
                    }
#endif // __aarch64__
                    if (c_elempack == 4)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + 8);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        float32x4_t _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        float32x4_t _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                        }
                        _c01 = vld1q_u16(pC + c_hstep * 4);
                        _c23 = vld1q_u16(pC + c_hstep * 4 + 8);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        if (beta == 1.f)
                        {
                            _f4 = vaddq_f32(_f4, _c0);
                            _f5 = vaddq_f32(_f5, _c1);
                            _f6 = vaddq_f32(_f6, _c2);
                            _f7 = vaddq_f32(_f7, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f4 = vmlaq_f32(_f4, _c0, _beta);
                            _f5 = vmlaq_f32(_f5, _c1, _beta);
                            _f6 = vmlaq_f32(_f6, _c2, _beta);
                            _f7 = vmlaq_f32(_f7, _c3, _beta);
                        }
                        pC += 16;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x4_t _cc0 = vld1_u16(pC);
                        uint16x4_t _cc1 = vld1_u16(pC + c_hstep);
                        uint16x4_t _cc2 = vld1_u16(pC + c_hstep * 2);
                        uint16x4_t _cc3 = vld1_u16(pC + c_hstep * 3);
                        transpose4x4_u16(_cc0, _cc1, _cc2, _cc3);
                        _c0 = vcvt_f32_f16((float16x4_t)_cc0);
                        _c1 = vcvt_f32_f16((float16x4_t)_cc1);
                        float32x4_t _c2 = vcvt_f32_f16((float16x4_t)_cc2);
                        float32x4_t _c3 = vcvt_f32_f16((float16x4_t)_cc3);
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                        }
                        _cc0 = vld1_u16(pC + c_hstep * 4);
                        _cc1 = vld1_u16(pC + c_hstep * 5);
                        _cc2 = vld1_u16(pC + c_hstep * 6);
                        _cc3 = vld1_u16(pC + c_hstep * 7);
                        transpose4x4_u16(_cc0, _cc1, _cc2, _cc3);
                        _c0 = vcvt_f32_f16((float16x4_t)_cc0);
                        _c1 = vcvt_f32_f16((float16x4_t)_cc1);
                        _c2 = vcvt_f32_f16((float16x4_t)_cc2);
                        _c3 = vcvt_f32_f16((float16x4_t)_cc3);
                        if (beta == 1.f)
                        {
                            _f4 = vaddq_f32(_f4, _c0);
                            _f5 = vaddq_f32(_f5, _c1);
                            _f6 = vaddq_f32(_f6, _c2);
                            _f7 = vaddq_f32(_f7, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f4 = vmlaq_f32(_f4, _c0, _beta);
                            _f5 = vmlaq_f32(_f5, _c1, _beta);
                            _f6 = vmlaq_f32(_f6, _c2, _beta);
                            _f7 = vmlaq_f32(_f7, _c3, _beta);
                        }
                        pC += 4;
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x4_t _c = vcvt_f32_f16((float16x4_t)vld1_u16(pC));
                    _c = vmulq_n_f32(_c, beta);
#if __aarch64__
                    _c0 = vdupq_laneq_f32(_c, 0);
                    _c1 = vdupq_laneq_f32(_c, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_c, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_c, 3);
#else
                    _c0 = vdupq_lane_f32(vget_low_f32(_c), 0);
                    _c1 = vdupq_lane_f32(vget_low_f32(_c), 1);
                    float32x4_t _c2 = vdupq_lane_f32(vget_high_f32(_c), 0);
                    float32x4_t _c3 = vdupq_lane_f32(vget_high_f32(_c), 1);
#endif
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c1);
                    _f6 = vaddq_f32(_f6, _c2);
                    _f7 = vaddq_f32(_f7, _c3);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
            }

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);
            uint16x4_t _hf1 = (uint16x4_t)vcvt_f16_f32(_f1);
            uint16x4_t _hf2 = (uint16x4_t)vcvt_f16_f32(_f2);
            uint16x4_t _hf3 = (uint16x4_t)vcvt_f16_f32(_f3);
            uint16x4_t _hf4 = (uint16x4_t)vcvt_f16_f32(_f4);
            uint16x4_t _hf5 = (uint16x4_t)vcvt_f16_f32(_f5);
            uint16x4_t _hf6 = (uint16x4_t)vcvt_f16_f32(_f6);
            uint16x4_t _hf7 = (uint16x4_t)vcvt_f16_f32(_f7);

#if __aarch64__
            if (out_elempack == 8)
            {
                vst1q_u16(p0, vcombine_u16(_hf0, _hf4));
                vst1q_u16(p0 + 8, vcombine_u16(_hf1, _hf5));
                vst1q_u16(p0 + 16, vcombine_u16(_hf2, _hf6));
                vst1q_u16(p0 + 24, vcombine_u16(_hf3, _hf7));
                p0 += 32;
            }
#endif // __aarch64__
            if (out_elempack == 4)
            {
                vst1q_u16(p0, vcombine_u16(_hf0, _hf1));
                vst1q_u16(p0 + 8, vcombine_u16(_hf2, _hf3));
                vst1q_u16(p0 + out_hstep * 4, vcombine_u16(_hf4, _hf5));
                vst1q_u16(p0 + out_hstep * 4 + 8, vcombine_u16(_hf6, _hf7));
                p0 += 16;
            }
            if (out_elempack == 1)
            {
                transpose4x4_u16(_hf0, _hf1, _hf2, _hf3);
                transpose4x4_u16(_hf4, _hf5, _hf6, _hf7);
                vst1_u16(p0, _hf0);
                vst1_u16(p0 + out_hstep, _hf1);
                vst1_u16(p0 + out_hstep * 2, _hf2);
                vst1_u16(p0 + out_hstep * 3, _hf3);
                vst1_u16(p0 + out_hstep * 4, _hf4);
                vst1_u16(p0 + out_hstep * 5, _hf5);
                vst1_u16(p0 + out_hstep * 6, _hf6);
                vst1_u16(p0 + out_hstep * 7, _hf7);
                p0 += 4;
            }

            pp += 32;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
#else
            // from
            //      a0 b1 c0 d1
            //      e0 f1 g0 h1
            //      a1 b0 c1 d0
            //      e1 f0 g1 h0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            {
                _sum2 = vrev64q_s32(_sum2);
                _sum3 = vrev64q_s32(_sum3);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum2);
                int32x4x2_t _t1 = vzipq_s32(_sum1, _sum3);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[0]), vget_low_s32(_t1.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[0]), vget_high_s32(_t1.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale1);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c1);
                    _f3 = vaddq_f32(_f3, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    float32x4_t _c2;
                    float32x4_t _c3;
#if __aarch64__
                    if (c_elempack == 8)
                    {
                        uint16x8_t _cc0 = vld1q_u16(pC);
                        uint16x8_t _cc1 = vld1q_u16(pC + 8);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_cc0));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_low_u16(_cc1));
                        _c2 = vcvt_f32_f16((float16x4_t)vget_high_u16(_cc0));
                        _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_cc1));
                        pC += 16;
                    }
#endif // __aarch64__
                    if (c_elempack == 4)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + c_hstep * 4);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        pC += 8;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x8_t _c01 = uint16x8_t();
                        _c01 = vsetq_lane_u16(pC[0], _c01, 0);
                        _c01 = vsetq_lane_u16(pC[c_hstep], _c01, 1);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 2], _c01, 2);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 3], _c01, 3);
                        _c01 = vsetq_lane_u16(pC[1], _c01, 4);
                        _c01 = vsetq_lane_u16(pC[c_hstep + 1], _c01, 5);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 2 + 1], _c01, 6);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 3 + 1], _c01, 7);
                        uint16x8_t _c23 = uint16x8_t();
                        _c23 = vsetq_lane_u16(pC[c_hstep * 4], _c23, 0);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 5], _c23, 1);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 6], _c23, 2);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 7], _c23, 3);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 4 + 1], _c23, 4);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 5 + 1], _c23, 5);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 6 + 1], _c23, 6);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 7 + 1], _c23, 7);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        pC += 2;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(float16_to_float32(pC[0]) * beta);
                    _c1 = vdupq_n_f32(float16_to_float32(pC[1]) * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c1);
                    pC += 2;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);
            uint16x4_t _hf1 = (uint16x4_t)vcvt_f16_f32(_f1);
            uint16x4_t _hf2 = (uint16x4_t)vcvt_f16_f32(_f2);
            uint16x4_t _hf3 = (uint16x4_t)vcvt_f16_f32(_f3);

#if __aarch64__
            if (out_elempack == 8)
            {
                vst1q_u16(p0, vcombine_u16(_hf0, _hf2));
                vst1q_u16(p0 + 8, vcombine_u16(_hf1, _hf3));
                p0 += 16;
            }
#endif // __aarch64__
            if (out_elempack == 4)
            {
                vst1q_u16(p0, vcombine_u16(_hf0, _hf1));
                vst1q_u16(p0 + out_hstep * 4, vcombine_u16(_hf2, _hf3));
                p0 += 8;
            }
            if (out_elempack == 1)
            {
                p0[0] = vget_lane_u16(_hf0, 0);
                p0[1] = vget_lane_u16(_hf1, 0);
                p0[out_hstep] = vget_lane_u16(_hf0, 1);
                p0[out_hstep + 1] = vget_lane_u16(_hf1, 1);
                p0[out_hstep * 2] = vget_lane_u16(_hf0, 2);
                p0[out_hstep * 2 + 1] = vget_lane_u16(_hf1, 2);
                p0[out_hstep * 3] = vget_lane_u16(_hf0, 3);
                p0[out_hstep * 3 + 1] = vget_lane_u16(_hf1, 3);
                p0[out_hstep * 4] = vget_lane_u16(_hf2, 0);
                p0[out_hstep * 4 + 1] = vget_lane_u16(_hf3, 0);
                p0[out_hstep * 5] = vget_lane_u16(_hf2, 1);
                p0[out_hstep * 5 + 1] = vget_lane_u16(_hf3, 1);
                p0[out_hstep * 6] = vget_lane_u16(_hf2, 2);
                p0[out_hstep * 6 + 1] = vget_lane_u16(_hf3, 2);
                p0[out_hstep * 7] = vget_lane_u16(_hf2, 3);
                p0[out_hstep * 7 + 1] = vget_lane_u16(_hf3, 3);
                p0 += 2;
            }

            pp += 16;
        }
        for (; jj < max_jj; jj++)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                }
                if (broadcast_type_C == 3)
                {
#if __aarch64__
                    if (c_elempack == 8)
                    {
                        uint16x8_t _c = vld1q_u16(pC);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c));
                        pC += 8;
                    }
#endif // __aarch64__
                    if (c_elempack == 4)
                    {
                        _c0 = vcvt_f32_f16((float16x4_t)vld1_u16(pC));
                        _c1 = vcvt_f32_f16((float16x4_t)vld1_u16(pC + c_hstep * 4));
                        pC += 4;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x8_t _c01 = uint16x8_t();
                        _c01 = vsetq_lane_u16(pC[0], _c01, 0);
                        _c01 = vsetq_lane_u16(pC[c_hstep], _c01, 1);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 2], _c01, 2);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 3], _c01, 3);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 4], _c01, 4);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 5], _c01, 5);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 6], _c01, 6);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 7], _c01, 7);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        pC += 1;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(float16_to_float32(pC[0]) * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    pC += 1;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);
            uint16x4_t _hf1 = (uint16x4_t)vcvt_f16_f32(_f1);

#if __aarch64__
            if (out_elempack == 8)
            {
                vst1q_u16(p0, vcombine_u16(_hf0, _hf1));
                p0 += 8;
            }
#endif // __aarch64__
            if (out_elempack == 4)
            {
                vst1_u16(p0, _hf0);
                vst1_u16(p0 + out_hstep * 4, _hf1);
                p0 += 4;
            }
            if (out_elempack == 1)
            {
                p0[0] = vget_lane_u16(_hf0, 0);
                p0[out_hstep] = vget_lane_u16(_hf0, 1);
                p0[out_hstep * 2] = vget_lane_u16(_hf0, 2);
                p0[out_hstep * 3] = vget_lane_u16(_hf0, 3);
                p0[out_hstep * 4] = vget_lane_u16(_hf1, 0);
                p0[out_hstep * 5] = vget_lane_u16(_hf1, 1);
                p0[out_hstep * 6] = vget_lane_u16(_hf1, 2);
                p0[out_hstep * 7] = vget_lane_u16(_hf1, 3);
                p0++;
            }

            pp += 8;
        }
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack;

        float32x4_t _descale = vld1q_f32((const float*)descales + i + ii);

        float32x4_t _c0;
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                _c0 = vdupq_n_f32(float16_to_float32(pC[0]) * beta);
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const unsigned short*)C + i + ii;
                _c0 = vcvt_f32_f16((float16x4_t)vld1_u16(pC));
                _c0 = vmulq_n_f32(_c0, beta);
            }
            if (broadcast_type_C == 3)
            {
                pC = (const unsigned short*)C + (i + ii) * c_hstep + j * c_elempack;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const unsigned short*)C + j;
            }
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
#else
            // from
            //      a0 b1 c2 d3
            //      a4 b5 c6 d7
            //      c0 d1 a2 b3
            //      c4 d5 a6 b7
            //      a3 b2 c1 d0
            //      a7 b6 c5 d4
            //      c3 d2 a1 b0
            //      c7 d6 a5 b4

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            {
                _sum4 = vrev64q_s32(_sum4);
                _sum5 = vrev64q_s32(_sum5);
                _sum6 = vrev64q_s32(_sum6);
                _sum7 = vrev64q_s32(_sum7);
                _sum4 = vextq_s32(_sum4, _sum4, 2);
                _sum5 = vextq_s32(_sum5, _sum5, 2);
                _sum6 = vextq_s32(_sum6, _sum6, 2);
                _sum7 = vextq_s32(_sum7, _sum7, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_sum6), _descale);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sum7), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    uint16x8_t _c01;
                    uint16x8_t _c23;
                    uint16x8_t _c45;
                    uint16x8_t _c67;
                    if (c_elempack == 4)
                    {
                        _c01 = vld1q_u16(pC);
                        _c23 = vld1q_u16(pC + 8);
                        _c45 = vld1q_u16(pC + 16);
                        _c67 = vld1q_u16(pC + 24);
                        pC += 32;
                    }
                    if (c_elempack == 1)
                    {
                        _c01 = vld1q_u16(pC);
                        _c23 = vld1q_u16(pC + c_hstep);
                        _c45 = vld1q_u16(pC + c_hstep * 2);
                        _c67 = vld1q_u16(pC + c_hstep * 3);
                        transpose8x4_u16(_c01, _c23, _c45, _c67);
                        pC += 8;
                    }
                    _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                    float32x4_t _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                    float32x4_t _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                    float32x4_t _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                    float32x4_t _c4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c45));
                    float32x4_t _c5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c45));
                    float32x4_t _c6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c67));
                    float32x4_t _c7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c67));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                        _f4 = vaddq_f32(_f4, _c4);
                        _f5 = vaddq_f32(_f5, _c5);
                        _f6 = vaddq_f32(_f6, _c6);
                        _f7 = vaddq_f32(_f7, _c7);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                        _f4 = vmlaq_f32(_f4, _c4, _beta);
                        _f5 = vmlaq_f32(_f5, _c5, _beta);
                        _f6 = vmlaq_f32(_f6, _c6, _beta);
                        _f7 = vmlaq_f32(_f7, _c7, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    uint16x8_t _c = vld1q_u16(pC);
                    float32x4_t _cc0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c));
                    float32x4_t _cc1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c));
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _cc0 = vmulq_f32(_cc0, _beta);
                        _cc1 = vmulq_f32(_cc1, _beta);
                    }
                    _c0 = vdupq_laneq_f32(_cc0, 0);
                    float32x4_t _c1 = vdupq_laneq_f32(_cc0, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_cc0, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_cc0, 3);
                    float32x4_t _c4 = vdupq_laneq_f32(_cc1, 0);
                    float32x4_t _c5 = vdupq_laneq_f32(_cc1, 1);
                    float32x4_t _c6 = vdupq_laneq_f32(_cc1, 2);
                    float32x4_t _c7 = vdupq_laneq_f32(_cc1, 3);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c4);
                    _f5 = vaddq_f32(_f5, _c5);
                    _f6 = vaddq_f32(_f6, _c6);
                    _f7 = vaddq_f32(_f7, _c7);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
            }

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);
            uint16x4_t _hf1 = (uint16x4_t)vcvt_f16_f32(_f1);
            uint16x4_t _hf2 = (uint16x4_t)vcvt_f16_f32(_f2);
            uint16x4_t _hf3 = (uint16x4_t)vcvt_f16_f32(_f3);
            uint16x4_t _hf4 = (uint16x4_t)vcvt_f16_f32(_f4);
            uint16x4_t _hf5 = (uint16x4_t)vcvt_f16_f32(_f5);
            uint16x4_t _hf6 = (uint16x4_t)vcvt_f16_f32(_f6);
            uint16x4_t _hf7 = (uint16x4_t)vcvt_f16_f32(_f7);

            if (out_elempack == 4)
            {
                vst1q_u16(p0, vcombine_u16(_hf0, _hf1));
                vst1q_u16(p0 + 8, vcombine_u16(_hf2, _hf3));
                vst1q_u16(p0 + 16, vcombine_u16(_hf4, _hf5));
                vst1q_u16(p0 + 24, vcombine_u16(_hf6, _hf7));
                p0 += 32;
            }
            if (out_elempack == 1)
            {
                transpose4x4_u16(_hf0, _hf1, _hf2, _hf3);
                transpose4x4_u16(_hf4, _hf5, _hf6, _hf7);
                vst1q_u16(p0, vcombine_u16(_hf0, _hf4));
                vst1q_u16(p0 + out_hstep, vcombine_u16(_hf1, _hf5));
                vst1q_u16(p0 + out_hstep * 2, vcombine_u16(_hf2, _hf6));
                vst1q_u16(p0 + out_hstep * 3, vcombine_u16(_hf3, _hf7));
                p0 += 8;
            }

            pp += 32;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
#else
            // from
            //      a0 b1 c2 d3
            //      c0 d1 a2 b3
            //      a3 b2 c1 d0
            //      c3 d2 a1 b0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            {
                _sum2 = vrev64q_s32(_sum2);
                _sum3 = vrev64q_s32(_sum3);
                _sum2 = vextq_s32(_sum2, _sum2, 2);
                _sum3 = vextq_s32(_sum3, _sum3, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum3);
                int32x4x2_t _t1 = vzipq_s32(_sum1, _sum2);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    float32x4_t _c1;
                    float32x4_t _c2;
                    float32x4_t _c3;
                    if (c_elempack == 4)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + 8);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        pC += 16;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x4_t _cc0 = vld1_u16(pC);
                        uint16x4_t _cc1 = vld1_u16(pC + c_hstep * 1);
                        uint16x4_t _cc2 = vld1_u16(pC + c_hstep * 2);
                        uint16x4_t _cc3 = vld1_u16(pC + c_hstep * 3);
                        transpose4x4_u16(_cc0, _cc1, _cc2, _cc3);
                        _c0 = vcvt_f32_f16((float16x4_t)_cc0);
                        _c1 = vcvt_f32_f16((float16x4_t)_cc1);
                        _c2 = vcvt_f32_f16((float16x4_t)_cc2);
                        _c3 = vcvt_f32_f16((float16x4_t)_cc3);
                        pC += 4;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x4_t _c = vcvt_f32_f16((float16x4_t)vld1_u16(pC));
                    _c = vmulq_n_f32(_c, beta);
#if __aarch64__
                    _c0 = vdupq_laneq_f32(_c, 0);
                    float32x4_t _c1 = vdupq_laneq_f32(_c, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_c, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_c, 3);
#else
                    _c0 = vdupq_lane_f32(vget_low_f32(_c), 0);
                    float32x4_t _c1 = vdupq_lane_f32(vget_low_f32(_c), 1);
                    float32x4_t _c2 = vdupq_lane_f32(vget_high_f32(_c), 0);
                    float32x4_t _c3 = vdupq_lane_f32(vget_high_f32(_c), 1);
#endif
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);
            uint16x4_t _hf1 = (uint16x4_t)vcvt_f16_f32(_f1);
            uint16x4_t _hf2 = (uint16x4_t)vcvt_f16_f32(_f2);
            uint16x4_t _hf3 = (uint16x4_t)vcvt_f16_f32(_f3);

            if (out_elempack == 4)
            {
                vst1q_u16(p0, vcombine_u16(_hf0, _hf1));
                vst1q_u16(p0 + 8, vcombine_u16(_hf2, _hf3));
                p0 += 16;
            }
            if (out_elempack == 1)
            {
                transpose4x4_u16(_hf0, _hf1, _hf2, _hf3);
                vst1_u16(p0, _hf0);
                vst1_u16(p0 + out_hstep, _hf1);
                vst1_u16(p0 + out_hstep * 2, _hf2);
                vst1_u16(p0 + out_hstep * 3, _hf3);
                p0 += 4;
            }

            pp += 16;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
#else
            // from
            //      a0 b1 c0 d1
            //      a1 b0 c1 d0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            {
                _sum1 = vrev64q_s32(_sum1);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum1);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                _sum1 = vrev64q_s32(_sum1);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    uint16x8_t _c;
                    if (c_elempack == 4)
                    {
                        _c = vld1q_u16(pC);
                        pC += 8;
                    }
                    if (c_elempack == 1)
                    {
                        _c = uint16x8_t();
                        _c = vsetq_lane_u16(pC[0], _c, 0);
                        _c = vsetq_lane_u16(pC[c_hstep], _c, 1);
                        _c = vsetq_lane_u16(pC[c_hstep * 2], _c, 2);
                        _c = vsetq_lane_u16(pC[c_hstep * 3], _c, 3);
                        _c = vsetq_lane_u16(pC[1], _c, 4);
                        _c = vsetq_lane_u16(pC[c_hstep + 1], _c, 5);
                        _c = vsetq_lane_u16(pC[c_hstep * 2 + 1], _c, 6);
                        _c = vsetq_lane_u16(pC[c_hstep * 3 + 1], _c, 7);
                        pC += 2;
                    }
                    _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c));
                    float32x4_t _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(float16_to_float32(pC[0]) * beta);
                    float32x4_t _c1 = vdupq_n_f32(float16_to_float32(pC[1]) * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    pC += 2;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);
            uint16x4_t _hf1 = (uint16x4_t)vcvt_f16_f32(_f1);

            if (out_elempack == 4)
            {
                vst1q_u16(p0, vcombine_u16(_hf0, _hf1));
                p0 += 8;
            }
            if (out_elempack == 1)
            {
                p0[0] = vget_lane_u16(_hf0, 0);
                p0[1] = vget_lane_u16(_hf1, 0);
                p0[out_hstep] = vget_lane_u16(_hf0, 1);
                p0[out_hstep + 1] = vget_lane_u16(_hf1, 1);
                p0[out_hstep * 2] = vget_lane_u16(_hf0, 2);
                p0[out_hstep * 2 + 1] = vget_lane_u16(_hf1, 2);
                p0[out_hstep * 3] = vget_lane_u16(_hf0, 3);
                p0[out_hstep * 3 + 1] = vget_lane_u16(_hf1, 3);
                p0 += 2;
            }

            pp += 8;
        }
        for (; jj < max_jj; jj++)
        {
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp)), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    uint16x4_t _c;
                    if (c_elempack == 4)
                    {
                        _c = vld1_u16(pC);
                        pC += 4;
                    }
                    if (c_elempack == 1)
                    {
                        _c = uint16x4_t();
                        _c = vset_lane_u16(pC[0], _c, 0);
                        _c = vset_lane_u16(pC[c_hstep], _c, 1);
                        _c = vset_lane_u16(pC[c_hstep * 2], _c, 2);
                        _c = vset_lane_u16(pC[c_hstep * 3], _c, 3);
                        pC += 1;
                    }
                    _c0 = vcvt_f32_f16((float16x4_t)_c);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(float16_to_float32(pC[0]) * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    pC += 1;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);

            if (out_elempack == 4)
            {
                vst1_u16(p0, _hf0);
                p0 += 4;
            }
            if (out_elempack == 1)
            {
                p0[0] = vget_lane_u16(_hf0, 0);
                p0[out_hstep] = vget_lane_u16(_hf0, 1);
                p0[out_hstep * 2] = vget_lane_u16(_hf0, 2);
                p0[out_hstep * 3] = vget_lane_u16(_hf0, 3);
                p0++;
            }

            pp += 4;
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        // out_elempack == 1
        unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j;

        const float descale0 = descales[i + ii];
        const float descale1 = descales[i + ii + 1];
#if __ARM_NEON
        float32x2_t _descale = vld1_f32((const float*)descales + i + ii);
#endif

        float c0;
        float c1;
#if __ARM_NEON
        float32x4_t _c0;
        float32x4_t _c1;
#endif
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                c0 = float16_to_float32(pC[0]) * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const unsigned short*)C + i + ii;
                c0 = float16_to_float32(pC[0]) * beta;
                c1 = float16_to_float32(pC[1]) * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
                _c1 = vdupq_n_f32(c1);
#endif
            }
            if (broadcast_type_C == 3)
            {
                // c_elempack == 1
                pC = (const unsigned short*)C + (i + ii) * c_hstep + j;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const unsigned short*)C + j;
            }
        }

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

            float32x4_t _f0 = vmulq_lane_f32(vcvtq_f32_s32(_sum0), _descale, 0);
            float32x4_t _f1 = vmulq_lane_f32(vcvtq_f32_s32(_sum1), _descale, 0);
            float32x4_t _f2 = vmulq_lane_f32(vcvtq_f32_s32(_sum2), _descale, 1);
            float32x4_t _f3 = vmulq_lane_f32(vcvtq_f32_s32(_sum3), _descale, 1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c1);
                    _f3 = vaddq_f32(_f3, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    uint16x8_t _c01 = vld1q_u16(pC);
                    uint16x8_t _c23 = vld1q_u16(pC + c_hstep);
                    _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                    float32x4_t _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                    float32x4_t _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                    float32x4_t _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                    pC += 8;
                }
                if (broadcast_type_C == 4)
                {
                    uint16x8_t _c = vld1q_u16(pC);
                    _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c));
                    float32x4_t _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c));
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _c0 = vmulq_f32(_c0, _beta);
                        _c1 = vmulq_f32(_c1, _beta);
                    }
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c1);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            vst1q_u16(p0, vcombine_u16((uint16x4_t)vcvt_f16_f32(_f0), (uint16x4_t)vcvt_f16_f32(_f1)));
            vst1q_u16(p0 + out_hstep, vcombine_u16((uint16x4_t)vcvt_f16_f32(_f2), (uint16x4_t)vcvt_f16_f32(_f3)));

            pp += 16;
            p0 += 8;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

            float32x4_t _f0 = vmulq_lane_f32(vcvtq_f32_s32(_sum0), _descale, 0);
            float32x4_t _f1 = vmulq_lane_f32(vcvtq_f32_s32(_sum1), _descale, 1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    _c0 = vcvt_f32_f16((float16x4_t)vld1_u16(pC));
                    float32x4_t _c1 = vcvt_f32_f16((float16x4_t)vld1_u16(pC + c_hstep));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                    pC += 4;
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vcvt_f32_f16((float16x4_t)vld1_u16(pC));
                    _c0 = vmulq_n_f32(_c0, beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            vst1_u16(p0, (uint16x4_t)vcvt_f16_f32(_f0));
            vst1_u16(p0 + out_hstep, (uint16x4_t)vcvt_f16_f32(_f1));

            pp += 8;
            p0 += 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            int32x4_t _sum0 = vld1q_s32(pp);

            float32x2x2_t _descale01 = vzip_f32(_descale, _descale);
            float32x4_t _descale0011 = vcombine_f32(_descale01.val[0], _descale01.val[1]);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0011);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    float32x4_t _c0011 = vcombine_f32(vget_low_f32(_c0), vget_high_f32(_c1));
                    _f0 = vaddq_f32(_f0, _c0011);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    uint16x4_t _c = uint16x4_t();
                    _c = vset_lane_u16(pC[0], _c, 0);
                    _c = vset_lane_u16(pC[1], _c, 1);
                    _c = vset_lane_u16(pC[c_hstep], _c, 2);
                    _c = vset_lane_u16(pC[c_hstep + 1], _c, 3);
                    _c0 = vcvt_f32_f16((float16x4_t)_c);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 2;
                }
                if (broadcast_type_C == 4)
                {
                    uint16x4_t _c = uint16x4_t();
                    _c = vset_lane_u16(pC[0], _c, 0);
                    _c = vset_lane_u16(pC[1], _c, 1);
                    _c = vset_lane_u16(pC[0], _c, 2);
                    _c = vset_lane_u16(pC[1], _c, 3);
                    _c0 = vcvt_f32_f16((float16x4_t)_c);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 2;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);

            p0[0] = vget_lane_u16(_hf0, 0);
            p0[1] = vget_lane_u16(_hf0, 1);
            p0[out_hstep] = vget_lane_u16(_hf0, 2);
            p0[out_hstep + 1] = vget_lane_u16(_hf0, 3);

            pp += 4;
            p0 += 2;
        }
#endif // __ARM_NEON
        for (; jj < max_jj; jj++)
        {
            float f0 = pp[0] * descale0;
            float f1 = pp[1] * descale1;

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    f0 += c0;
                    f1 += c0;
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    f0 += c0;
                    f1 += c1;
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    f0 += float16_to_float32(pC[0]) * beta;
                    f1 += float16_to_float32(pC[c_hstep]) * beta;
                    pC += 1;
                }
                if (broadcast_type_C == 4)
                {
                    f0 += float16_to_float32(pC[0]) * beta;
                    f1 += float16_to_float32(pC[0]) * beta;
                    pC += 1;
                }
            }

            if (alpha != 1.f)
            {
                f0 *= alpha;
                f1 *= alpha;
            }

            p0[0] = float32_to_float16(f0);
            p0[out_hstep] = float32_to_float16(f1);

            pp += 2;
            p0++;
        }
    }
    for (; ii < max_ii; ii += 1)
    {
        // out_elempack == 1
        unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j;

        const float descale = descales[i + ii];
#if __ARM_NEON
        float32x4_t _descale = vdupq_n_f32(descale);
#endif

        float c0;
#if __ARM_NEON
        float32x4_t _c0;
#endif
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                c0 = float16_to_float32(pC[0]) * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const unsigned short*)C + i + ii;
                c0 = float16_to_float32(pC[0]) * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 3)
            {
                // c_elempack == 1
                pC = (const unsigned short*)C + (i + ii) * c_hstep + j;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const unsigned short*)C + j;
            }
        }

        int jj = 0;
#if __ARM_NEON
        for (; jj + 15 < max_jj; jj += 16)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    uint16x8_t _c01 = vld1q_u16(pC);
                    uint16x8_t _c23 = vld1q_u16(pC + 8);
                    _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                    float32x4_t _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                    float32x4_t _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                    float32x4_t _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                    pC += 16;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            vst1q_u16(p0, vcombine_u16((uint16x4_t)vcvt_f16_f32(_f0), (uint16x4_t)vcvt_f16_f32(_f1)));
            vst1q_u16(p0 + 8, vcombine_u16((uint16x4_t)vcvt_f16_f32(_f2), (uint16x4_t)vcvt_f16_f32(_f3)));

            pp += 16;
            p0 += 16;
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    uint16x8_t _c01 = vld1q_u16(pC);
                    _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                    float32x4_t _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            vst1q_u16(p0, vcombine_u16((uint16x4_t)vcvt_f16_f32(_f0), (uint16x4_t)vcvt_f16_f32(_f1)));

            pp += 8;
            p0 += 8;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp)), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    _c0 = vcvt_f32_f16((float16x4_t)vld1_u16(pC));
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 4;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            vst1_u16(p0, (uint16x4_t)vcvt_f16_f32(_f0));

            pp += 4;
            p0 += 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            float32x2_t _f0 = vmul_f32(vcvt_f32_s32(vld1_s32(pp)), vget_low_f32(_descale));

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vadd_f32(_f0, vget_low_f32(_c0));
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    float32x2_t _cc = float32x2_t();
                    _cc = vset_lane_f32(float16_to_float32(pC[0]), _cc, 0);
                    _cc = vset_lane_f32(float16_to_float32(pC[1]), _cc, 1);
                    _f0 = vmla_n_f32(_f0, _cc, beta);
                    pC += 2;
                }
            }

            _f0 = vmul_n_f32(_f0, alpha);

            p0[0] = float32_to_float16(vget_lane_f32(_f0, 0));
            p0[1] = float32_to_float16(vget_lane_f32(_f0, 1));

            pp += 2;
            p0 += 2;
        }
#endif // __ARM_NEON
        for (; jj < max_jj; jj++)
        {
            float f0 = pp[0] * descale;

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    f0 += c0;
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    f0 += float16_to_float32(pC[0]) * beta;
                    pC += 1;
                }
            }

            f0 *= alpha;

            p0[0] = float32_to_float16(f0);

            pp += 1;
            p0++;
        }
    }
}

static void transpose_unpack_output_tile_int32_to_fp16(const Mat& topT, const Mat& C, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, const Mat& descales, float alpha, float beta)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD && !__ARM_FEATURE_MATMUL_INT8
    if (ncnn::cpu_support_arm_asimddp())
    {
        transpose_unpack_output_tile_int32_to_fp16_asimddp(topT, C, top_blob, broadcast_type_C, i, max_ii, j, max_jj, descales, alpha, beta);
        return;
    }
#endif

    const int out_elempack = top_blob.elempack;
    const size_t out_hstep = top_blob.dims == 3 ? top_blob.cstep : (size_t)top_blob.w;

    const size_t c_hstep = C.dims == 3 ? C.cstep : (size_t)C.w;
    const int c_elempack = C.elempack;
    const unsigned short* pC = C;

    // NCNN_LOGE("transpose_unpack_output_tile_int32_to_fp16  %d %d %d %d  %d  %d  %d", i, max_ii, j, max_jj, out_elempack, broadcast_type_C, c_elempack);

    const int* pp = topT;

    int ii = 0;
#if __ARM_NEON
    for (; ii + 7 < max_ii; ii += 8)
    {
        unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack;

        float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii);
        float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4);

        float32x4_t _c0;
        float32x4_t _c1;
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                _c0 = vdupq_n_f32(float16_to_float32(pC[0]) * beta);
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const unsigned short*)C + i + ii;
                uint16x8_t _c = vld1q_u16(pC);
                _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c));
                _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c));
                _c0 = vmulq_n_f32(_c0, beta);
                _c1 = vmulq_n_f32(_c1, beta);
            }
            if (broadcast_type_C == 3)
            {
                pC = (const unsigned short*)C + (i + ii) * c_hstep + j * c_elempack;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const unsigned short*)C + j;
            }
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);
            int32x4_t _sum8 = vld1q_s32(pp + 32);
            int32x4_t _sum9 = vld1q_s32(pp + 36);
            int32x4_t _suma = vld1q_s32(pp + 40);
            int32x4_t _sumb = vld1q_s32(pp + 44);
            int32x4_t _sumc = vld1q_s32(pp + 48);
            int32x4_t _sumd = vld1q_s32(pp + 52);
            int32x4_t _sume = vld1q_s32(pp + 56);
            int32x4_t _sumf = vld1q_s32(pp + 60);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            //      e4 f4 g4 h4
            //      e5 f5 g5 h5
            //      e6 f6 g6 h6
            //      e7 f7 g7 h7
#else
            // from
            //      a0 b1 c2 d3
            //      e4 f5 g6 h7
            //      e0 f1 g2 h3
            //      a4 b5 c6 d7
            //      c0 d1 a2 b3
            //      g4 h5 e6 f7
            //      g0 h1 e2 f3
            //      c4 d5 a6 b7
            //      a3 b2 c1 d0
            //      e7 f6 g5 h4
            //      e3 f2 g1 h0
            //      a7 b6 c5 d4
            //      c3 d2 a1 b0
            //      g7 h6 e5 f4
            //      g3 h2 e1 f0
            //      c7 d6 a5 b4

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            //      e4 f4 g4 h4
            //      e5 f5 g5 h5
            //      e6 f6 g6 h6
            //      e7 f7 g7 h7
            {
                _sum8 = vrev64q_s32(_sum8);
                _sum9 = vrev64q_s32(_sum9);
                _suma = vrev64q_s32(_suma);
                _sumb = vrev64q_s32(_sumb);
                _sumc = vrev64q_s32(_sumc);
                _sumd = vrev64q_s32(_sumd);
                _sume = vrev64q_s32(_sume);
                _sumf = vrev64q_s32(_sumf);
                _sum8 = vextq_s32(_sum8, _sum8, 2);
                _sum9 = vextq_s32(_sum9, _sum9, 2);
                _suma = vextq_s32(_suma, _suma, 2);
                _sumb = vextq_s32(_sumb, _sumb, 2);
                _sumc = vextq_s32(_sumc, _sumc, 2);
                _sumd = vextq_s32(_sumd, _sumd, 2);
                _sume = vextq_s32(_sume, _sume, 2);
                _sumf = vextq_s32(_sumf, _sumf, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sumc);
                int32x4x2_t _t1 = vzipq_s32(_sum4, _sum8);
                int32x4x2_t _t2 = vzipq_s32(_sum2, _sume);
                int32x4x2_t _t3 = vzipq_s32(_sum6, _suma);
                int32x4x2_t _t4 = vzipq_s32(_sum3, _sumf);
                int32x4x2_t _t5 = vzipq_s32(_sum7, _sumb);
                int32x4x2_t _t6 = vzipq_s32(_sum1, _sumd);
                int32x4x2_t _t7 = vzipq_s32(_sum5, _sum9);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum8 = vcombine_s32(vget_low_s32(_t4.val[0]), vget_low_s32(_t5.val[0]));
                _sum9 = vcombine_s32(vget_high_s32(_t4.val[0]), vget_high_s32(_t5.val[0]));
                _suma = vcombine_s32(vget_low_s32(_t5.val[1]), vget_low_s32(_t4.val[1]));
                _sumb = vcombine_s32(vget_high_s32(_t5.val[1]), vget_high_s32(_t4.val[1]));
                _sumc = vcombine_s32(vget_low_s32(_t6.val[0]), vget_low_s32(_t7.val[0]));
                _sumd = vcombine_s32(vget_high_s32(_t6.val[0]), vget_high_s32(_t7.val[0]));
                _sume = vcombine_s32(vget_low_s32(_t7.val[1]), vget_low_s32(_t6.val[1]));
                _sumf = vcombine_s32(vget_high_s32(_t7.val[1]), vget_high_s32(_t6.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
                _sum9 = vrev64q_s32(_sum9);
                _sumb = vrev64q_s32(_sumb);
                _sumd = vrev64q_s32(_sumd);
                _sumf = vrev64q_s32(_sumf);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale0);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale0);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum8), _descale0);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum9), _descale0);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_suma), _descale0);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sumb), _descale0);
            float32x4_t _f8 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale1);
            float32x4_t _f9 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale1);
            float32x4_t _fa = vmulq_f32(vcvtq_f32_s32(_sum6), _descale1);
            float32x4_t _fb = vmulq_f32(vcvtq_f32_s32(_sum7), _descale1);
            float32x4_t _fc = vmulq_f32(vcvtq_f32_s32(_sumc), _descale1);
            float32x4_t _fd = vmulq_f32(vcvtq_f32_s32(_sumd), _descale1);
            float32x4_t _fe = vmulq_f32(vcvtq_f32_s32(_sume), _descale1);
            float32x4_t _ff = vmulq_f32(vcvtq_f32_s32(_sumf), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                    _f8 = vaddq_f32(_f8, _c0);
                    _f9 = vaddq_f32(_f9, _c0);
                    _fa = vaddq_f32(_fa, _c0);
                    _fb = vaddq_f32(_fb, _c0);
                    _fc = vaddq_f32(_fc, _c0);
                    _fd = vaddq_f32(_fd, _c0);
                    _fe = vaddq_f32(_fe, _c0);
                    _ff = vaddq_f32(_ff, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                    _f8 = vaddq_f32(_f8, _c1);
                    _f9 = vaddq_f32(_f9, _c1);
                    _fa = vaddq_f32(_fa, _c1);
                    _fb = vaddq_f32(_fb, _c1);
                    _fc = vaddq_f32(_fc, _c1);
                    _fd = vaddq_f32(_fd, _c1);
                    _fe = vaddq_f32(_fe, _c1);
                    _ff = vaddq_f32(_ff, _c1);
                }
                if (broadcast_type_C == 3)
                {
#if __aarch64__
                    if (c_elempack == 8)
                    {
                        uint16x8_t _c08 = vld1q_u16(pC);
                        uint16x8_t _c19 = vld1q_u16(pC + 8);
                        uint16x8_t _c2a = vld1q_u16(pC + 16);
                        uint16x8_t _c3b = vld1q_u16(pC + 24);
                        uint16x8_t _c4c = vld1q_u16(pC + 32);
                        uint16x8_t _c5d = vld1q_u16(pC + 40);
                        uint16x8_t _c6e = vld1q_u16(pC + 48);
                        uint16x8_t _c7f = vld1q_u16(pC + 56);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c08));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c19));
                        float32x4_t _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c2a));
                        float32x4_t _c3 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c3b));
                        float32x4_t _c4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c4c));
                        float32x4_t _c5 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c5d));
                        float32x4_t _c6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c6e));
                        float32x4_t _c7 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c7f));
                        float32x4_t _c8 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c08));
                        float32x4_t _c9 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c19));
                        float32x4_t _ca = vcvt_f32_f16((float16x4_t)vget_high_u16(_c2a));
                        float32x4_t _cb = vcvt_f32_f16((float16x4_t)vget_high_u16(_c3b));
                        float32x4_t _cc = vcvt_f32_f16((float16x4_t)vget_high_u16(_c4c));
                        float32x4_t _cd = vcvt_f32_f16((float16x4_t)vget_high_u16(_c5d));
                        float32x4_t _ce = vcvt_f32_f16((float16x4_t)vget_high_u16(_c6e));
                        float32x4_t _cf = vcvt_f32_f16((float16x4_t)vget_high_u16(_c7f));

                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                            _f4 = vaddq_f32(_f4, _c4);
                            _f5 = vaddq_f32(_f5, _c5);
                            _f6 = vaddq_f32(_f6, _c6);
                            _f7 = vaddq_f32(_f7, _c7);
                            _f8 = vaddq_f32(_f8, _c8);
                            _f9 = vaddq_f32(_f9, _c9);
                            _fa = vaddq_f32(_fa, _ca);
                            _fb = vaddq_f32(_fb, _cb);
                            _fc = vaddq_f32(_fc, _cc);
                            _fd = vaddq_f32(_fd, _cd);
                            _fe = vaddq_f32(_fe, _ce);
                            _ff = vaddq_f32(_ff, _cf);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                            _f4 = vmlaq_f32(_f4, _c4, _beta);
                            _f5 = vmlaq_f32(_f5, _c5, _beta);
                            _f6 = vmlaq_f32(_f6, _c6, _beta);
                            _f7 = vmlaq_f32(_f7, _c7, _beta);
                            _f8 = vmlaq_f32(_f8, _c8, _beta);
                            _f9 = vmlaq_f32(_f9, _c9, _beta);
                            _fa = vmlaq_f32(_fa, _ca, _beta);
                            _fb = vmlaq_f32(_fb, _cb, _beta);
                            _fc = vmlaq_f32(_fc, _cc, _beta);
                            _fd = vmlaq_f32(_fd, _cd, _beta);
                            _fe = vmlaq_f32(_fe, _ce, _beta);
                            _ff = vmlaq_f32(_ff, _cf, _beta);
                        }
                        pC += 64;
                    }
#endif // __aarch64__
                    if (c_elempack == 4)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + 8);
                        uint16x8_t _c45 = vld1q_u16(pC + 16);
                        uint16x8_t _c67 = vld1q_u16(pC + 24);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        float32x4_t _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        float32x4_t _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        float32x4_t _c4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c45));
                        float32x4_t _c5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c45));
                        float32x4_t _c6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c67));
                        float32x4_t _c7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c67));
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                            _f4 = vaddq_f32(_f4, _c4);
                            _f5 = vaddq_f32(_f5, _c5);
                            _f6 = vaddq_f32(_f6, _c6);
                            _f7 = vaddq_f32(_f7, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                            _f4 = vmlaq_f32(_f4, _c4, _beta);
                            _f5 = vmlaq_f32(_f5, _c5, _beta);
                            _f6 = vmlaq_f32(_f6, _c6, _beta);
                            _f7 = vmlaq_f32(_f7, _c7, _beta);
                        }
                        _c01 = vld1q_u16(pC + c_hstep * 4);
                        _c23 = vld1q_u16(pC + c_hstep * 4 + 8);
                        _c45 = vld1q_u16(pC + c_hstep * 4 + 16);
                        _c67 = vld1q_u16(pC + c_hstep * 4 + 24);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        _c4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c45));
                        _c5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c45));
                        _c6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c67));
                        _c7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c67));
                        if (beta == 1.f)
                        {
                            _f8 = vaddq_f32(_f8, _c0);
                            _f9 = vaddq_f32(_f9, _c1);
                            _fa = vaddq_f32(_fa, _c2);
                            _fb = vaddq_f32(_fb, _c3);
                            _fc = vaddq_f32(_fc, _c4);
                            _fd = vaddq_f32(_fd, _c5);
                            _fe = vaddq_f32(_fe, _c6);
                            _ff = vaddq_f32(_ff, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f8 = vmlaq_f32(_f8, _c0, _beta);
                            _f9 = vmlaq_f32(_f9, _c1, _beta);
                            _fa = vmlaq_f32(_fa, _c2, _beta);
                            _fb = vmlaq_f32(_fb, _c3, _beta);
                            _fc = vmlaq_f32(_fc, _c4, _beta);
                            _fd = vmlaq_f32(_fd, _c5, _beta);
                            _fe = vmlaq_f32(_fe, _c6, _beta);
                            _ff = vmlaq_f32(_ff, _c7, _beta);
                        }
                        pC += 32;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + c_hstep);
                        uint16x8_t _c45 = vld1q_u16(pC + c_hstep * 2);
                        uint16x8_t _c67 = vld1q_u16(pC + c_hstep * 3);
                        transpose8x4_u16(_c01, _c23, _c45, _c67);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        float32x4_t _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        float32x4_t _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        float32x4_t _c4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c45));
                        float32x4_t _c5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c45));
                        float32x4_t _c6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c67));
                        float32x4_t _c7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c67));
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                            _f4 = vaddq_f32(_f4, _c4);
                            _f5 = vaddq_f32(_f5, _c5);
                            _f6 = vaddq_f32(_f6, _c6);
                            _f7 = vaddq_f32(_f7, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                            _f4 = vmlaq_f32(_f4, _c4, _beta);
                            _f5 = vmlaq_f32(_f5, _c5, _beta);
                            _f6 = vmlaq_f32(_f6, _c6, _beta);
                            _f7 = vmlaq_f32(_f7, _c7, _beta);
                        }
                        _c01 = vld1q_u16(pC + c_hstep * 4);
                        _c23 = vld1q_u16(pC + c_hstep * 5);
                        _c45 = vld1q_u16(pC + c_hstep * 6);
                        _c67 = vld1q_u16(pC + c_hstep * 7);
                        transpose8x4_u16(_c01, _c23, _c45, _c67);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        _c4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c45));
                        _c5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c45));
                        _c6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c67));
                        _c7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c67));
                        if (beta == 1.f)
                        {
                            _f8 = vaddq_f32(_f8, _c0);
                            _f9 = vaddq_f32(_f9, _c1);
                            _fa = vaddq_f32(_fa, _c2);
                            _fb = vaddq_f32(_fb, _c3);
                            _fc = vaddq_f32(_fc, _c4);
                            _fd = vaddq_f32(_fd, _c5);
                            _fe = vaddq_f32(_fe, _c6);
                            _ff = vaddq_f32(_ff, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f8 = vmlaq_f32(_f8, _c0, _beta);
                            _f9 = vmlaq_f32(_f9, _c1, _beta);
                            _fa = vmlaq_f32(_fa, _c2, _beta);
                            _fb = vmlaq_f32(_fb, _c3, _beta);
                            _fc = vmlaq_f32(_fc, _c4, _beta);
                            _fd = vmlaq_f32(_fd, _c5, _beta);
                            _fe = vmlaq_f32(_fe, _c6, _beta);
                            _ff = vmlaq_f32(_ff, _c7, _beta);
                        }
                        pC += 8;
                    }
                }
                if (broadcast_type_C == 4)
                {
                    uint16x8_t _c = vld1q_u16(pC);
                    float32x4_t _cc0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c));
                    float32x4_t _cc1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c));
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _cc0 = vmulq_f32(_cc0, _beta);
                        _cc1 = vmulq_f32(_cc1, _beta);
                    }
                    _c0 = vdupq_laneq_f32(_cc0, 0);
                    _c1 = vdupq_laneq_f32(_cc0, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_cc0, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_cc0, 3);
                    float32x4_t _c4 = vdupq_laneq_f32(_cc1, 0);
                    float32x4_t _c5 = vdupq_laneq_f32(_cc1, 1);
                    float32x4_t _c6 = vdupq_laneq_f32(_cc1, 2);
                    float32x4_t _c7 = vdupq_laneq_f32(_cc1, 3);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c4);
                    _f5 = vaddq_f32(_f5, _c5);
                    _f6 = vaddq_f32(_f6, _c6);
                    _f7 = vaddq_f32(_f7, _c7);
                    _f8 = vaddq_f32(_f8, _c0);
                    _f9 = vaddq_f32(_f9, _c1);
                    _fa = vaddq_f32(_fa, _c2);
                    _fb = vaddq_f32(_fb, _c3);
                    _fc = vaddq_f32(_fc, _c4);
                    _fd = vaddq_f32(_fd, _c5);
                    _fe = vaddq_f32(_fe, _c6);
                    _ff = vaddq_f32(_ff, _c7);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
                _f8 = vmulq_f32(_f8, _alpha);
                _f9 = vmulq_f32(_f9, _alpha);
                _fa = vmulq_f32(_fa, _alpha);
                _fb = vmulq_f32(_fb, _alpha);
                _fc = vmulq_f32(_fc, _alpha);
                _fd = vmulq_f32(_fd, _alpha);
                _fe = vmulq_f32(_fe, _alpha);
                _ff = vmulq_f32(_ff, _alpha);
            }

            uint16x8_t _hf0 = vcombine_u16((uint16x4_t)vcvt_f16_f32(_f0), (uint16x4_t)vcvt_f16_f32(_f8));
            uint16x8_t _hf1 = vcombine_u16((uint16x4_t)vcvt_f16_f32(_f1), (uint16x4_t)vcvt_f16_f32(_f9));
            uint16x8_t _hf2 = vcombine_u16((uint16x4_t)vcvt_f16_f32(_f2), (uint16x4_t)vcvt_f16_f32(_fa));
            uint16x8_t _hf3 = vcombine_u16((uint16x4_t)vcvt_f16_f32(_f3), (uint16x4_t)vcvt_f16_f32(_fb));
            uint16x8_t _hf4 = vcombine_u16((uint16x4_t)vcvt_f16_f32(_f4), (uint16x4_t)vcvt_f16_f32(_fc));
            uint16x8_t _hf5 = vcombine_u16((uint16x4_t)vcvt_f16_f32(_f5), (uint16x4_t)vcvt_f16_f32(_fd));
            uint16x8_t _hf6 = vcombine_u16((uint16x4_t)vcvt_f16_f32(_f6), (uint16x4_t)vcvt_f16_f32(_fe));
            uint16x8_t _hf7 = vcombine_u16((uint16x4_t)vcvt_f16_f32(_f7), (uint16x4_t)vcvt_f16_f32(_ff));

#if __aarch64__
            if (out_elempack == 8)
            {
                transpose8x8_u16(_hf0, _hf1, _hf2, _hf3, _hf4, _hf5, _hf6, _hf7);
                vst1q_u16(p0, _hf0);
                vst1q_u16(p0 + 8, _hf1);
                vst1q_u16(p0 + 16, _hf2);
                vst1q_u16(p0 + 24, _hf3);
                vst1q_u16(p0 + 32, _hf4);
                vst1q_u16(p0 + 40, _hf5);
                vst1q_u16(p0 + 48, _hf6);
                vst1q_u16(p0 + 56, _hf7);
            }
#endif // __aarch64__
            if (out_elempack == 4)
            {
                uint16x8x4_t _hfa;
                uint16x8x4_t _hfb;
                _hfa.val[0] = _hf0;
                _hfa.val[1] = _hf1;
                _hfa.val[2] = _hf2;
                _hfa.val[3] = _hf3;
                _hfb.val[0] = _hf4;
                _hfb.val[1] = _hf5;
                _hfb.val[2] = _hf6;
                _hfb.val[3] = _hf7;
                vst4q_u16(p0, _hfa);
                vst4q_u16(p0 + out_hstep * 4, _hfb);
            }
            if (out_elempack == 1)
            {
                vst1q_u16(p0, _hf0);
                vst1q_u16(p0 + out_hstep, _hf1);
                vst1q_u16(p0 + out_hstep * 2, _hf2);
                vst1q_u16(p0 + out_hstep * 3, _hf3);
                vst1q_u16(p0 + out_hstep * 4, _hf4);
                vst1q_u16(p0 + out_hstep * 5, _hf5);
                vst1q_u16(p0 + out_hstep * 6, _hf6);
                vst1q_u16(p0 + out_hstep * 7, _hf7);
            }

            pp += 64;
            p0 += out_hstep * 8;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3

#else
            // from
            //      a0 b1 c2 d3
            //      e0 f1 g2 h3
            //      c0 d1 a2 b3
            //      g0 h1 e2 f3
            //      a3 b2 c1 d0
            //      e3 f2 g1 h0
            //      c3 d2 a1 b0
            //      g3 h2 e1 f0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            //      e2 f2 g2 h2
            //      e3 f3 g3 h3

            {
                _sum4 = vrev64q_s32(_sum4);
                _sum5 = vrev64q_s32(_sum5);
                _sum6 = vrev64q_s32(_sum6);
                _sum7 = vrev64q_s32(_sum7);
                _sum4 = vextq_s32(_sum4, _sum4, 2);
                _sum5 = vextq_s32(_sum5, _sum5, 2);
                _sum6 = vextq_s32(_sum6, _sum6, 2);
                _sum7 = vextq_s32(_sum7, _sum7, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale0);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale0);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale1);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale1);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_sum6), _descale1);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sum7), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c1);
                    _f5 = vaddq_f32(_f5, _c1);
                    _f6 = vaddq_f32(_f6, _c1);
                    _f7 = vaddq_f32(_f7, _c1);
                }
                if (broadcast_type_C == 3)
                {
#if __aarch64__
                    if (c_elempack == 8)
                    {
                        uint16x8_t _c04 = vld1q_u16(pC);
                        uint16x8_t _c15 = vld1q_u16(pC + 8);
                        uint16x8_t _c26 = vld1q_u16(pC + 16);
                        uint16x8_t _c37 = vld1q_u16(pC + 24);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c04));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c15));
                        float32x4_t _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c26));
                        float32x4_t _c3 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c37));
                        float32x4_t _c4 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c04));
                        float32x4_t _c5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c15));
                        float32x4_t _c6 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c26));
                        float32x4_t _c7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c37));
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                            _f4 = vaddq_f32(_f4, _c4);
                            _f5 = vaddq_f32(_f5, _c5);
                            _f6 = vaddq_f32(_f6, _c6);
                            _f7 = vaddq_f32(_f7, _c7);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                            _f4 = vmlaq_f32(_f4, _c4, _beta);
                            _f5 = vmlaq_f32(_f5, _c5, _beta);
                            _f6 = vmlaq_f32(_f6, _c6, _beta);
                            _f7 = vmlaq_f32(_f7, _c7, _beta);
                        }
                        pC += 32;
                    }
#endif // __aarch64__
                    if (c_elempack == 4)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + 8);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        float32x4_t _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        float32x4_t _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                        }
                        _c01 = vld1q_u16(pC + c_hstep * 4);
                        _c23 = vld1q_u16(pC + c_hstep * 4 + 8);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        if (beta == 1.f)
                        {
                            _f4 = vaddq_f32(_f4, _c0);
                            _f5 = vaddq_f32(_f5, _c1);
                            _f6 = vaddq_f32(_f6, _c2);
                            _f7 = vaddq_f32(_f7, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f4 = vmlaq_f32(_f4, _c0, _beta);
                            _f5 = vmlaq_f32(_f5, _c1, _beta);
                            _f6 = vmlaq_f32(_f6, _c2, _beta);
                            _f7 = vmlaq_f32(_f7, _c3, _beta);
                        }
                        pC += 16;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x4_t _cc0 = vld1_u16(pC);
                        uint16x4_t _cc1 = vld1_u16(pC + c_hstep);
                        uint16x4_t _cc2 = vld1_u16(pC + c_hstep * 2);
                        uint16x4_t _cc3 = vld1_u16(pC + c_hstep * 3);
                        transpose4x4_u16(_cc0, _cc1, _cc2, _cc3);
                        _c0 = vcvt_f32_f16((float16x4_t)_cc0);
                        _c1 = vcvt_f32_f16((float16x4_t)_cc1);
                        float32x4_t _c2 = vcvt_f32_f16((float16x4_t)_cc2);
                        float32x4_t _c3 = vcvt_f32_f16((float16x4_t)_cc3);
                        if (beta == 1.f)
                        {
                            _f0 = vaddq_f32(_f0, _c0);
                            _f1 = vaddq_f32(_f1, _c1);
                            _f2 = vaddq_f32(_f2, _c2);
                            _f3 = vaddq_f32(_f3, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f0 = vmlaq_f32(_f0, _c0, _beta);
                            _f1 = vmlaq_f32(_f1, _c1, _beta);
                            _f2 = vmlaq_f32(_f2, _c2, _beta);
                            _f3 = vmlaq_f32(_f3, _c3, _beta);
                        }
                        _cc0 = vld1_u16(pC + c_hstep * 4);
                        _cc1 = vld1_u16(pC + c_hstep * 5);
                        _cc2 = vld1_u16(pC + c_hstep * 6);
                        _cc3 = vld1_u16(pC + c_hstep * 7);
                        transpose4x4_u16(_cc0, _cc1, _cc2, _cc3);
                        _c0 = vcvt_f32_f16((float16x4_t)_cc0);
                        _c1 = vcvt_f32_f16((float16x4_t)_cc1);
                        _c2 = vcvt_f32_f16((float16x4_t)_cc2);
                        _c3 = vcvt_f32_f16((float16x4_t)_cc3);
                        if (beta == 1.f)
                        {
                            _f4 = vaddq_f32(_f4, _c0);
                            _f5 = vaddq_f32(_f5, _c1);
                            _f6 = vaddq_f32(_f6, _c2);
                            _f7 = vaddq_f32(_f7, _c3);
                        }
                        else
                        {
                            float32x4_t _beta = vdupq_n_f32(beta);
                            _f4 = vmlaq_f32(_f4, _c0, _beta);
                            _f5 = vmlaq_f32(_f5, _c1, _beta);
                            _f6 = vmlaq_f32(_f6, _c2, _beta);
                            _f7 = vmlaq_f32(_f7, _c3, _beta);
                        }
                        pC += 4;
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x4_t _c = vcvt_f32_f16((float16x4_t)vld1_u16(pC));
                    _c = vmulq_n_f32(_c, beta);
#if __aarch64__
                    _c0 = vdupq_laneq_f32(_c, 0);
                    _c1 = vdupq_laneq_f32(_c, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_c, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_c, 3);
#else
                    _c0 = vdupq_lane_f32(vget_low_f32(_c), 0);
                    _c1 = vdupq_lane_f32(vget_low_f32(_c), 1);
                    float32x4_t _c2 = vdupq_lane_f32(vget_high_f32(_c), 0);
                    float32x4_t _c3 = vdupq_lane_f32(vget_high_f32(_c), 1);
#endif
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c1);
                    _f6 = vaddq_f32(_f6, _c2);
                    _f7 = vaddq_f32(_f7, _c3);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
            }

            uint16x8_t _hf0 = vcombine_u16((uint16x4_t)vcvt_f16_f32(_f0), (uint16x4_t)vcvt_f16_f32(_f4));
            uint16x8_t _hf1 = vcombine_u16((uint16x4_t)vcvt_f16_f32(_f1), (uint16x4_t)vcvt_f16_f32(_f5));
            uint16x8_t _hf2 = vcombine_u16((uint16x4_t)vcvt_f16_f32(_f2), (uint16x4_t)vcvt_f16_f32(_f6));
            uint16x8_t _hf3 = vcombine_u16((uint16x4_t)vcvt_f16_f32(_f3), (uint16x4_t)vcvt_f16_f32(_f7));

            if (out_elempack == 4)
            {
                uint16x8x4_t _hf;
                _hf.val[0] = _hf0;
                _hf.val[1] = _hf1;
                _hf.val[2] = _hf2;
                _hf.val[3] = _hf3;
                vst4q_u16(p0, _hf);
            }
            if (out_elempack == 1)
            {
                vst1q_u16(p0, _hf0);
                vst1q_u16(p0 + out_hstep, _hf1);
                vst1q_u16(p0 + out_hstep * 2, _hf2);
                vst1q_u16(p0 + out_hstep * 3, _hf3);
            }

            pp += 32;
            p0 += out_hstep * 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
#else
            // from
            //      a0 b1 c0 d1
            //      e0 f1 g0 h1
            //      a1 b0 c1 d0
            //      e1 f0 g1 h0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      e0 f0 g0 h0
            //      e1 f1 g1 h1
            {
                _sum2 = vrev64q_s32(_sum2);
                _sum3 = vrev64q_s32(_sum3);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum2);
                int32x4x2_t _t1 = vzipq_s32(_sum1, _sum3);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[0]), vget_low_s32(_t1.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[0]), vget_high_s32(_t1.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale0);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale1);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c1);
                    _f3 = vaddq_f32(_f3, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    float32x4_t _c2;
                    float32x4_t _c3;
#if __aarch64__
                    if (c_elempack == 8)
                    {
                        uint16x8_t _c02 = vld1q_u16(pC);
                        uint16x8_t _c13 = vld1q_u16(pC + 8);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c02));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c13));
                        _c2 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c02));
                        _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c13));
                        pC += 16;
                    }
#endif // __aarch64__
                    if (c_elempack == 4)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + c_hstep * 4);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        pC += 8;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x8_t _c01 = uint16x8_t();
                        _c01 = vsetq_lane_u16(pC[0], _c01, 0);
                        _c01 = vsetq_lane_u16(pC[c_hstep], _c01, 1);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 2], _c01, 2);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 3], _c01, 3);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 4], _c01, 4);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 5], _c01, 5);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 6], _c01, 6);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 7], _c01, 7);

                        uint16x8_t _c23 = uint16x8_t();
                        _c23 = vsetq_lane_u16(pC[1], _c23, 0);
                        _c23 = vsetq_lane_u16(pC[c_hstep + 1], _c23, 1);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 2 + 1], _c23, 2);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 3 + 1], _c23, 3);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 4 + 1], _c23, 4);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 5 + 1], _c23, 5);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 6 + 1], _c23, 6);
                        _c23 = vsetq_lane_u16(pC[c_hstep * 7 + 1], _c23, 7);

                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        _c2 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        pC += 2;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(float16_to_float32(pC[0]) * beta);
                    _c1 = vdupq_n_f32(float16_to_float32(pC[1]) * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c1);
                    pC += 2;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            vst1q_u16(p0, vcombine_u16((uint16x4_t)vcvt_f16_f32(_f0), (uint16x4_t)vcvt_f16_f32(_f2)));
            vst1q_u16(p0 + out_hstep, vcombine_u16((uint16x4_t)vcvt_f16_f32(_f1), (uint16x4_t)vcvt_f16_f32(_f3)));

            pp += 16;
            p0 += out_hstep * 2;
        }
        for (; jj < max_jj; jj += 1)
        {
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp)), _descale0);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp + 4)), _descale1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                }
                if (broadcast_type_C == 3)
                {
#if __aarch64__
                    if (c_elempack == 8)
                    {
                        uint16x8_t _c = vld1q_u16(pC);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c));
                        pC += 8;
                    }
#endif // __aarch64__
                    if (c_elempack == 4)
                    {
                        _c0 = vcvt_f32_f16((float16x4_t)vld1_u16(pC));
                        _c1 = vcvt_f32_f16((float16x4_t)vld1_u16(pC + c_hstep * 4));
                        pC += 4;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x8_t _c01 = uint16x8_t();
                        _c01 = vsetq_lane_u16(pC[0], _c01, 0);
                        _c01 = vsetq_lane_u16(pC[c_hstep], _c01, 1);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 2], _c01, 2);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 3], _c01, 3);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 4], _c01, 4);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 5], _c01, 5);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 6], _c01, 6);
                        _c01 = vsetq_lane_u16(pC[c_hstep * 7], _c01, 7);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        pC += 1;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(float16_to_float32(pC[0]) * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    pC += 1;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            vst1q_u16(p0, vcombine_u16((uint16x4_t)vcvt_f16_f32(_f0), (uint16x4_t)vcvt_f16_f32(_f1)));
            pp += 8;
            p0 += out_hstep;
        }
    }
    for (; ii + 3 < max_ii; ii += 4)
    {
        unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack;

        float32x4_t _descale = vld1q_f32((const float*)descales + i + ii);

        float32x4_t _c0;
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                _c0 = vdupq_n_f32(float16_to_float32(pC[0]) * beta);
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const unsigned short*)C + i + ii;
                _c0 = vcvt_f32_f16((float16x4_t)vld1_u16(pC));
                _c0 = vmulq_n_f32(_c0, beta);
            }
            if (broadcast_type_C == 3)
            {
                pC = (const unsigned short*)C + (i + ii) * c_hstep + j * c_elempack;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const unsigned short*)C + j;
            }
        }

        int jj = 0;
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);
            int32x4_t _sum4 = vld1q_s32(pp + 16);
            int32x4_t _sum5 = vld1q_s32(pp + 20);
            int32x4_t _sum6 = vld1q_s32(pp + 24);
            int32x4_t _sum7 = vld1q_s32(pp + 28);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
#else
            // from
            //      a0 b1 c2 d3
            //      a4 b5 c6 d7
            //      c0 d1 a2 b3
            //      c4 d5 a6 b7
            //      a3 b2 c1 d0
            //      a7 b6 c5 d4
            //      c3 d2 a1 b0
            //      c7 d6 a5 b4

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            //      a4 b4 c4 d4
            //      a5 b5 c5 d5
            //      a6 b6 c6 d6
            //      a7 b7 c7 d7
            {
                _sum4 = vrev64q_s32(_sum4);
                _sum5 = vrev64q_s32(_sum5);
                _sum6 = vrev64q_s32(_sum6);
                _sum7 = vrev64q_s32(_sum7);
                _sum4 = vextq_s32(_sum4, _sum4, 2);
                _sum5 = vextq_s32(_sum5, _sum5, 2);
                _sum6 = vextq_s32(_sum6, _sum6, 2);
                _sum7 = vextq_s32(_sum7, _sum7, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum6);
                int32x4x2_t _t1 = vzipq_s32(_sum2, _sum4);
                int32x4x2_t _t2 = vzipq_s32(_sum1, _sum7);
                int32x4x2_t _t3 = vzipq_s32(_sum3, _sum5);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum4 = vcombine_s32(vget_low_s32(_t2.val[0]), vget_low_s32(_t3.val[0]));
                _sum5 = vcombine_s32(vget_high_s32(_t2.val[0]), vget_high_s32(_t3.val[0]));
                _sum6 = vcombine_s32(vget_low_s32(_t3.val[1]), vget_low_s32(_t2.val[1]));
                _sum7 = vcombine_s32(vget_high_s32(_t3.val[1]), vget_high_s32(_t2.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
                _sum5 = vrev64q_s32(_sum5);
                _sum7 = vrev64q_s32(_sum7);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);
            float32x4_t _f4 = vmulq_f32(vcvtq_f32_s32(_sum4), _descale);
            float32x4_t _f5 = vmulq_f32(vcvtq_f32_s32(_sum5), _descale);
            float32x4_t _f6 = vmulq_f32(vcvtq_f32_s32(_sum6), _descale);
            float32x4_t _f7 = vmulq_f32(vcvtq_f32_s32(_sum7), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                    _f4 = vaddq_f32(_f4, _c0);
                    _f5 = vaddq_f32(_f5, _c0);
                    _f6 = vaddq_f32(_f6, _c0);
                    _f7 = vaddq_f32(_f7, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    uint16x8_t _c01;
                    uint16x8_t _c23;
                    uint16x8_t _c45;
                    uint16x8_t _c67;
                    if (c_elempack == 4)
                    {
                        _c01 = vld1q_u16(pC);
                        _c23 = vld1q_u16(pC + 8);
                        _c45 = vld1q_u16(pC + 16);
                        _c67 = vld1q_u16(pC + 24);
                        pC += 32;
                    }
                    if (c_elempack == 1)
                    {
                        _c01 = vld1q_u16(pC);
                        _c23 = vld1q_u16(pC + c_hstep);
                        _c45 = vld1q_u16(pC + c_hstep * 2);
                        _c67 = vld1q_u16(pC + c_hstep * 3);
                        transpose8x4_u16(_c01, _c23, _c45, _c67);
                        pC += 8;
                    }
                    _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                    float32x4_t _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                    float32x4_t _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                    float32x4_t _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                    float32x4_t _c4 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c45));
                    float32x4_t _c5 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c45));
                    float32x4_t _c6 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c67));
                    float32x4_t _c7 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c67));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                        _f4 = vaddq_f32(_f4, _c4);
                        _f5 = vaddq_f32(_f5, _c5);
                        _f6 = vaddq_f32(_f6, _c6);
                        _f7 = vaddq_f32(_f7, _c7);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                        _f4 = vmlaq_f32(_f4, _c4, _beta);
                        _f5 = vmlaq_f32(_f5, _c5, _beta);
                        _f6 = vmlaq_f32(_f6, _c6, _beta);
                        _f7 = vmlaq_f32(_f7, _c7, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    uint16x8_t _c = vld1q_u16(pC);
                    float32x4_t _cc0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c));
                    float32x4_t _cc1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c));
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _cc0 = vmulq_f32(_cc0, _beta);
                        _cc1 = vmulq_f32(_cc1, _beta);
                    }
                    _c0 = vdupq_laneq_f32(_cc0, 0);
                    float32x4_t _c1 = vdupq_laneq_f32(_cc0, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_cc0, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_cc0, 3);
                    float32x4_t _c4 = vdupq_laneq_f32(_cc1, 0);
                    float32x4_t _c5 = vdupq_laneq_f32(_cc1, 1);
                    float32x4_t _c6 = vdupq_laneq_f32(_cc1, 2);
                    float32x4_t _c7 = vdupq_laneq_f32(_cc1, 3);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    _f4 = vaddq_f32(_f4, _c4);
                    _f5 = vaddq_f32(_f5, _c5);
                    _f6 = vaddq_f32(_f6, _c6);
                    _f7 = vaddq_f32(_f7, _c7);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
                _f4 = vmulq_f32(_f4, _alpha);
                _f5 = vmulq_f32(_f5, _alpha);
                _f6 = vmulq_f32(_f6, _alpha);
                _f7 = vmulq_f32(_f7, _alpha);
            }

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);
            uint16x4_t _hf1 = (uint16x4_t)vcvt_f16_f32(_f1);
            uint16x4_t _hf2 = (uint16x4_t)vcvt_f16_f32(_f2);
            uint16x4_t _hf3 = (uint16x4_t)vcvt_f16_f32(_f3);
            uint16x4_t _hf4 = (uint16x4_t)vcvt_f16_f32(_f4);
            uint16x4_t _hf5 = (uint16x4_t)vcvt_f16_f32(_f5);
            uint16x4_t _hf6 = (uint16x4_t)vcvt_f16_f32(_f6);
            uint16x4_t _hf7 = (uint16x4_t)vcvt_f16_f32(_f7);

#if __aarch64__
            if (out_elempack == 8)
            {
                transpose4x4_u16(_hf0, _hf1, _hf2, _hf3);
                transpose4x4_u16(_hf4, _hf5, _hf6, _hf7);
                vst1q_u16(p0, vcombine_u16(_hf0, _hf4));
                vst1q_u16(p0 + 8, vcombine_u16(_hf1, _hf5));
                vst1q_u16(p0 + 16, vcombine_u16(_hf2, _hf6));
                vst1q_u16(p0 + 24, vcombine_u16(_hf3, _hf7));
            }
#endif // __aarch64__
            if (out_elempack == 4)
            {
                uint16x4x4_t _hfa;
                uint16x4x4_t _hfb;
                _hfa.val[0] = _hf0;
                _hfa.val[1] = _hf1;
                _hfa.val[2] = _hf2;
                _hfa.val[3] = _hf3;
                _hfb.val[0] = _hf4;
                _hfb.val[1] = _hf5;
                _hfb.val[2] = _hf6;
                _hfb.val[3] = _hf7;
                vst4_u16(p0, _hfa);
                vst4_u16(p0 + out_hstep * 4, _hfb);
            }
            if (out_elempack == 1)
            {
                vst1_u16(p0, _hf0);
                vst1_u16(p0 + out_hstep, _hf1);
                vst1_u16(p0 + out_hstep * 2, _hf2);
                vst1_u16(p0 + out_hstep * 3, _hf3);
                vst1_u16(p0 + out_hstep * 4, _hf4);
                vst1_u16(p0 + out_hstep * 5, _hf5);
                vst1_u16(p0 + out_hstep * 6, _hf6);
                vst1_u16(p0 + out_hstep * 7, _hf7);
            }

            pp += 32;
            p0 += out_hstep * 8;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
#else
            // from
            //      a0 b1 c2 d3
            //      c0 d1 a2 b3
            //      a3 b2 c1 d0
            //      c3 d2 a1 b0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            //      a2 b2 c2 d2
            //      a3 b3 c3 d3
            {
                _sum2 = vrev64q_s32(_sum2);
                _sum3 = vrev64q_s32(_sum3);
                _sum2 = vextq_s32(_sum2, _sum2, 2);
                _sum3 = vextq_s32(_sum3, _sum3, 2);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum3);
                int32x4x2_t _t1 = vzipq_s32(_sum1, _sum2);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_t1.val[1]), vget_low_s32(_t0.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_t1.val[1]), vget_high_s32(_t0.val[1]));
                _sum1 = vrev64q_s32(_sum1);
                _sum3 = vrev64q_s32(_sum3);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    float32x4_t _c1;
                    float32x4_t _c2;
                    float32x4_t _c3;
                    if (c_elempack == 4)
                    {
                        uint16x8_t _c01 = vld1q_u16(pC);
                        uint16x8_t _c23 = vld1q_u16(pC + 8);
                        _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                        _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                        _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                        _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                        pC += 16;
                    }
                    if (c_elempack == 1)
                    {
                        uint16x4_t _cc0 = vld1_u16(pC);
                        uint16x4_t _cc1 = vld1_u16(pC + c_hstep);
                        uint16x4_t _cc2 = vld1_u16(pC + c_hstep * 2);
                        uint16x4_t _cc3 = vld1_u16(pC + c_hstep * 3);
                        transpose4x4_u16(_cc0, _cc1, _cc2, _cc3);
                        _c0 = vcvt_f32_f16((float16x4_t)_cc0);
                        _c1 = vcvt_f32_f16((float16x4_t)_cc1);
                        _c2 = vcvt_f32_f16((float16x4_t)_cc2);
                        _c3 = vcvt_f32_f16((float16x4_t)_cc3);
                        pC += 4;
                    }
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    float32x4_t _c = vcvt_f32_f16((float16x4_t)vld1_u16(pC));
                    _c = vmulq_n_f32(_c, beta);
#if __aarch64__
                    _c0 = vdupq_laneq_f32(_c, 0);
                    float32x4_t _c1 = vdupq_laneq_f32(_c, 1);
                    float32x4_t _c2 = vdupq_laneq_f32(_c, 2);
                    float32x4_t _c3 = vdupq_laneq_f32(_c, 3);
#else
                    _c0 = vdupq_lane_f32(vget_low_f32(_c), 0);
                    float32x4_t _c1 = vdupq_lane_f32(vget_low_f32(_c), 1);
                    float32x4_t _c2 = vdupq_lane_f32(vget_high_f32(_c), 0);
                    float32x4_t _c3 = vdupq_lane_f32(vget_high_f32(_c), 1);
#endif
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c2);
                    _f3 = vaddq_f32(_f3, _c3);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);
            uint16x4_t _hf1 = (uint16x4_t)vcvt_f16_f32(_f1);
            uint16x4_t _hf2 = (uint16x4_t)vcvt_f16_f32(_f2);
            uint16x4_t _hf3 = (uint16x4_t)vcvt_f16_f32(_f3);

            if (out_elempack == 4)
            {
                uint16x4x4_t _hf;
                _hf.val[0] = _hf0;
                _hf.val[1] = _hf1;
                _hf.val[2] = _hf2;
                _hf.val[3] = _hf3;
                vst4_u16(p0, _hf);
            }
            if (out_elempack == 1)
            {
                vst1_u16(p0, _hf0);
                vst1_u16(p0 + out_hstep, _hf1);
                vst1_u16(p0 + out_hstep * 2, _hf2);
                vst1_u16(p0 + out_hstep * 3, _hf3);
            }

            pp += 16;
            p0 += out_hstep * 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

#if __ARM_FEATURE_DOTPROD
            // from/to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
#else
            // from
            //      a0 b1 c0 d1
            //      a1 b0 c1 d0

            // to
            //      a0 b0 c0 d0
            //      a1 b1 c1 d1
            {
                _sum1 = vrev64q_s32(_sum1);
                int32x4x2_t _t0 = vzipq_s32(_sum0, _sum1);
                _sum0 = vcombine_s32(vget_low_s32(_t0.val[0]), vget_low_s32(_t0.val[1]));
                _sum1 = vcombine_s32(vget_high_s32(_t0.val[0]), vget_high_s32(_t0.val[1]));
                _sum1 = vrev64q_s32(_sum1);
            }
#endif // __ARM_FEATURE_DOTPROD

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    uint16x8_t _c;
                    if (c_elempack == 4)
                    {
                        _c = vld1q_u16(pC);
                        pC += 8;
                    }
                    if (c_elempack == 1)
                    {
                        _c = uint16x8_t();
                        _c = vsetq_lane_u16(pC[0], _c, 0);
                        _c = vsetq_lane_u16(pC[c_hstep], _c, 1);
                        _c = vsetq_lane_u16(pC[c_hstep * 2], _c, 2);
                        _c = vsetq_lane_u16(pC[c_hstep * 3], _c, 3);
                        _c = vsetq_lane_u16(pC[1], _c, 4);
                        _c = vsetq_lane_u16(pC[c_hstep + 1], _c, 5);
                        _c = vsetq_lane_u16(pC[c_hstep * 2 + 1], _c, 6);
                        _c = vsetq_lane_u16(pC[c_hstep * 3 + 1], _c, 7);
                        pC += 2;
                    }
                    _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c));
                    float32x4_t _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(float16_to_float32(pC[0]) * beta);
                    float32x4_t _c1 = vdupq_n_f32(float16_to_float32(pC[1]) * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    pC += 2;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            vst1_u16(p0, (uint16x4_t)vcvt_f16_f32(_f0));
            vst1_u16(p0 + out_hstep, (uint16x4_t)vcvt_f16_f32(_f1));

            pp += 8;
            p0 += out_hstep * 2;
        }
        for (; jj < max_jj; jj += 1)
        {
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp)), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 3)
                {
                    uint16x4_t _c;
                    if (c_elempack == 4)
                    {
                        _c = vld1_u16(pC);
                        pC += 4;
                    }
                    if (c_elempack == 1)
                    {
                        _c = uint16x4_t();
                        _c = vset_lane_u16(pC[0], _c, 0);
                        _c = vset_lane_u16(pC[c_hstep], _c, 1);
                        _c = vset_lane_u16(pC[c_hstep * 2], _c, 2);
                        _c = vset_lane_u16(pC[c_hstep * 3], _c, 3);
                        pC += 1;
                    }
                    _c0 = vcvt_f32_f16((float16x4_t)_c);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vdupq_n_f32(float16_to_float32(pC[0]) * beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    pC += 1;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            vst1_u16(p0, (uint16x4_t)vcvt_f16_f32(_f0));
            pp += 4;
            p0 += out_hstep;
        }
    }
#endif // __ARM_NEON
    for (; ii + 1 < max_ii; ii += 2)
    {
        unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack;

        const float descale0 = descales[i + ii];
        const float descale1 = descales[i + ii + 1];
#if __ARM_NEON
        float32x2_t _descale01 = vld1_f32((const float*)descales + i + ii);
#endif

        float c0;
        float c1;
#if __ARM_NEON
        float32x4_t _c0;
        float32x4_t _c1;
#endif
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                c0 = float16_to_float32(pC[0]) * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const unsigned short*)C + i + ii;
                c0 = float16_to_float32(pC[0]) * beta;
                c1 = float16_to_float32(pC[1]) * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
                _c1 = vdupq_n_f32(c1);
#endif
            }
            if (broadcast_type_C == 3)
            {
                // c_elempack == 1
                pC = (const unsigned short*)C + (i + ii) * c_hstep + j;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const unsigned short*)C + j;
            }
        }

        int jj = 0;
#if __ARM_NEON
#if __aarch64__
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

            float32x4_t _f0 = vmulq_lane_f32(vcvtq_f32_s32(_sum0), _descale01, 0);
            float32x4_t _f1 = vmulq_lane_f32(vcvtq_f32_s32(_sum1), _descale01, 0);
            float32x4_t _f2 = vmulq_lane_f32(vcvtq_f32_s32(_sum2), _descale01, 1);
            float32x4_t _f3 = vmulq_lane_f32(vcvtq_f32_s32(_sum3), _descale01, 1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c1);
                    _f3 = vaddq_f32(_f3, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    uint16x8_t _c01 = vld1q_u16(pC);
                    uint16x8_t _c23 = vld1q_u16(pC + c_hstep);
                    _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                    _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                    float32x4_t _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                    float32x4_t _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                    pC += 8;
                }
                if (broadcast_type_C == 4)
                {
                    uint16x8_t _c = vld1q_u16(pC);
                    _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c));
                    _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c));
                    if (beta != 1.f)
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _c0 = vmulq_f32(_c0, _beta);
                        _c1 = vmulq_f32(_c1, _beta);
                    }
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c1);
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);
            uint16x4_t _hf1 = (uint16x4_t)vcvt_f16_f32(_f1);
            uint16x4_t _hf2 = (uint16x4_t)vcvt_f16_f32(_f2);
            uint16x4_t _hf3 = (uint16x4_t)vcvt_f16_f32(_f3);

#if __aarch64__
            if (out_elempack == 8)
            {
                vst1q_u16(p0, vcombine_u16(_hf0, _hf1));
                vst1q_u16(p0 + 8, vcombine_u16(_hf2, _hf3));
            }
#endif // __aarch64__
            if (out_elempack == 4)
            {
                vst1q_u16(p0, vcombine_u16(_hf0, _hf2));
                vst1q_u16(p0 + out_hstep * 4, vcombine_u16(_hf1, _hf3));
            }
            if (out_elempack == 1)
            {
                p0[0] = vget_lane_u16(_hf0, 0);
                p0[1] = vget_lane_u16(_hf2, 0);
                p0[out_hstep] = vget_lane_u16(_hf0, 1);
                p0[out_hstep + 1] = vget_lane_u16(_hf2, 1);
                p0[out_hstep * 2] = vget_lane_u16(_hf0, 2);
                p0[out_hstep * 2 + 1] = vget_lane_u16(_hf2, 2);
                p0[out_hstep * 3] = vget_lane_u16(_hf0, 3);
                p0[out_hstep * 3 + 1] = vget_lane_u16(_hf2, 3);
                p0[out_hstep * 4] = vget_lane_u16(_hf1, 0);
                p0[out_hstep * 4 + 1] = vget_lane_u16(_hf3, 0);
                p0[out_hstep * 5] = vget_lane_u16(_hf1, 1);
                p0[out_hstep * 5 + 1] = vget_lane_u16(_hf3, 1);
                p0[out_hstep * 6] = vget_lane_u16(_hf1, 2);
                p0[out_hstep * 6 + 1] = vget_lane_u16(_hf3, 2);
                p0[out_hstep * 7] = vget_lane_u16(_hf1, 3);
                p0[out_hstep * 7 + 1] = vget_lane_u16(_hf3, 3);
            }

            pp += 16;
            p0 += out_hstep * 8;
        }
#endif // __aarch64__
        for (; jj + 3 < max_jj; jj += 4)
        {
            // a0 a1 a2 a3
            // b0 b1 b2 b3

            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

            float32x4_t _f0 = vmulq_lane_f32(vcvtq_f32_s32(_sum0), _descale01, 0);
            float32x4_t _f1 = vmulq_lane_f32(vcvtq_f32_s32(_sum1), _descale01, 1);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c1);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    _c0 = vcvt_f32_f16((float16x4_t)vld1_u16(pC));
                    _c1 = vcvt_f32_f16((float16x4_t)vld1_u16(pC + c_hstep));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                    pC += 4;
                }
                if (broadcast_type_C == 4)
                {
                    _c0 = vcvt_f32_f16((float16x4_t)vld1_u16(pC));
                    _c0 = vmulq_n_f32(_c0, beta);
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    pC += 4;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);
            uint16x4_t _hf1 = (uint16x4_t)vcvt_f16_f32(_f1);

            if (out_elempack == 4)
            {
                vst1q_u16(p0, vcombine_u16(_hf0, _hf1));
            }
            if (out_elempack == 1)
            {
                p0[0] = vget_lane_u16(_hf0, 0);
                p0[1] = vget_lane_u16(_hf1, 0);
                p0[out_hstep] = vget_lane_u16(_hf0, 1);
                p0[out_hstep + 1] = vget_lane_u16(_hf1, 1);
                p0[out_hstep * 2] = vget_lane_u16(_hf0, 2);
                p0[out_hstep * 2 + 1] = vget_lane_u16(_hf1, 2);
                p0[out_hstep * 3] = vget_lane_u16(_hf0, 3);
                p0[out_hstep * 3 + 1] = vget_lane_u16(_hf1, 3);
            }

            pp += 8;
            p0 += out_hstep * 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            // a0 a1 b0 b1
            int32x2x2_t _sum0 = vld2_s32(pp);

            float32x4_t _descale = vcombine_f32(_descale01, _descale01);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vcombine_s32(_sum0.val[0], _sum0.val[1])), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    float32x4_t _cc = vzipq_f32(_c0, _c1).val[0];
                    _f0 = vaddq_f32(_f0, _cc);
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    uint16x4_t _c = uint16x4_t();
                    _c = vset_lane_u16(pC[0], _c, 0);
                    _c = vset_lane_u16(pC[c_hstep], _c, 1);
                    _c = vset_lane_u16(pC[1], _c, 2);
                    _c = vset_lane_u16(pC[c_hstep + 1], _c, 3);
                    _c0 = vcvt_f32_f16((float16x4_t)_c);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 2;
                }
                if (broadcast_type_C == 4)
                {
                    uint16x4_t _c = uint16x4_t();
                    _c = vset_lane_u16(pC[0], _c, 0);
                    _c = vset_lane_u16(pC[0], _c, 1);
                    _c = vset_lane_u16(pC[1], _c, 2);
                    _c = vset_lane_u16(pC[1], _c, 3);
                    _c0 = vcvt_f32_f16((float16x4_t)_c);
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 2;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);

            p0[0] = vget_lane_u16(_hf0, 0);
            p0[1] = vget_lane_u16(_hf0, 1);
            p0[out_hstep] = vget_lane_u16(_hf0, 2);
            p0[out_hstep + 1] = vget_lane_u16(_hf0, 3);

            pp += 4;
            p0 += out_hstep * 2;
        }
#endif // __ARM_NEON
        for (; jj < max_jj; jj += 1)
        {
            float f0 = pp[0] * descale0;
            float f1 = pp[1] * descale1;

            if (pC)
            {
                if (broadcast_type_C == 0)
                {
                    f0 += c0;
                    f1 += c0;
                }
                if (broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    f0 += c0;
                    f1 += c1;
                }
                if (broadcast_type_C == 3)
                {
                    // c_elempack == 1
                    f0 += float16_to_float32(pC[0]) * beta;
                    f1 += float16_to_float32(pC[c_hstep]) * beta;
                    pC += 1;
                }
                if (broadcast_type_C == 4)
                {
                    c0 = float16_to_float32(pC[0]) * beta;
                    f0 += c0;
                    f1 += c0;
                    pC += 1;
                }
            }

            if (alpha != 1.f)
            {
                f0 *= alpha;
                f1 *= alpha;
            }

            p0[0] = float32_to_float16(f0);
            p0[1] = float32_to_float16(f1);
            pp += 2;
            p0 += out_hstep;
        }
    }
    for (; ii < max_ii; ii += 1)
    {
        unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack;

        const float descale = descales[i + ii];
#if __ARM_NEON
        float32x4_t _descale = vdupq_n_f32(descale);
#endif

        float c0;
#if __ARM_NEON
        float32x4_t _c0;
#endif
        if (pC)
        {
            if (broadcast_type_C == 0)
            {
                c0 = float16_to_float32(pC[0]) * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 1 || broadcast_type_C == 2)
            {
                pC = (const unsigned short*)C + i + ii;
                c0 = float16_to_float32(pC[0]) * beta;
#if __ARM_NEON
                _c0 = vdupq_n_f32(c0);
#endif
            }
            if (broadcast_type_C == 3)
            {
                // c_elempack == 1
                pC = (const unsigned short*)C + (i + ii) * c_hstep + j;
            }
            if (broadcast_type_C == 4)
            {
                pC = (const unsigned short*)C + j;
            }
        }

        int jj = 0;
#if __ARM_NEON
        for (; jj + 15 < max_jj; jj += 16)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);
            int32x4_t _sum2 = vld1q_s32(pp + 8);
            int32x4_t _sum3 = vld1q_s32(pp + 12);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);
            float32x4_t _f2 = vmulq_f32(vcvtq_f32_s32(_sum2), _descale);
            float32x4_t _f3 = vmulq_f32(vcvtq_f32_s32(_sum3), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                    _f2 = vaddq_f32(_f2, _c0);
                    _f3 = vaddq_f32(_f3, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    uint16x8_t _c01 = vld1q_u16(pC);
                    uint16x8_t _c23 = vld1q_u16(pC + 8);
                    _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c01));
                    float32x4_t _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c01));
                    float32x4_t _c2 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c23));
                    float32x4_t _c3 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c23));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                        _f2 = vaddq_f32(_f2, _c2);
                        _f3 = vaddq_f32(_f3, _c3);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                        _f2 = vmlaq_f32(_f2, _c2, _beta);
                        _f3 = vmlaq_f32(_f3, _c3, _beta);
                    }
                    pC += 16;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
                _f2 = vmulq_f32(_f2, _alpha);
                _f3 = vmulq_f32(_f3, _alpha);
            }

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);
            uint16x4_t _hf1 = (uint16x4_t)vcvt_f16_f32(_f1);
            uint16x4_t _hf2 = (uint16x4_t)vcvt_f16_f32(_f2);
            uint16x4_t _hf3 = (uint16x4_t)vcvt_f16_f32(_f3);

            if (out_hstep == 1)
            {
                vst1q_u16(p0, vcombine_u16(_hf0, _hf1));
                vst1q_u16(p0 + 8, vcombine_u16(_hf2, _hf3));
            }
            else
            {
#if __aarch64__
                if (out_elempack == 8)
                {
                    vst1q_u16(p0, vcombine_u16(_hf0, _hf1));
                    vst1q_u16(p0 + out_hstep * 8, vcombine_u16(_hf2, _hf3));
                }
#endif // __aarch64__
                if (out_elempack == 4)
                {
                    vst1_u16(p0, _hf0);
                    vst1_u16(p0 + out_hstep * 4, _hf1);
                    vst1_u16(p0 + out_hstep * 8, _hf2);
                    vst1_u16(p0 + out_hstep * 12, _hf3);
                }
                if (out_elempack == 1)
                {
                    p0[0] = vget_lane_u16(_hf0, 0);
                    p0[out_hstep] = vget_lane_u16(_hf0, 1);
                    p0[out_hstep * 2] = vget_lane_u16(_hf0, 2);
                    p0[out_hstep * 3] = vget_lane_u16(_hf0, 3);
                    p0[out_hstep * 4] = vget_lane_u16(_hf1, 0);
                    p0[out_hstep * 5] = vget_lane_u16(_hf1, 1);
                    p0[out_hstep * 6] = vget_lane_u16(_hf1, 2);
                    p0[out_hstep * 7] = vget_lane_u16(_hf1, 3);
                    p0[out_hstep * 8] = vget_lane_u16(_hf2, 0);
                    p0[out_hstep * 9] = vget_lane_u16(_hf2, 1);
                    p0[out_hstep * 10] = vget_lane_u16(_hf2, 2);
                    p0[out_hstep * 11] = vget_lane_u16(_hf2, 3);
                    p0[out_hstep * 12] = vget_lane_u16(_hf3, 0);
                    p0[out_hstep * 13] = vget_lane_u16(_hf3, 1);
                    p0[out_hstep * 14] = vget_lane_u16(_hf3, 2);
                    p0[out_hstep * 15] = vget_lane_u16(_hf3, 3);
                }
            }

            pp += 16;
            p0 += out_hstep * 16;
        }
        for (; jj + 7 < max_jj; jj += 8)
        {
            int32x4_t _sum0 = vld1q_s32(pp);
            int32x4_t _sum1 = vld1q_s32(pp + 4);

            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(_sum0), _descale);
            float32x4_t _f1 = vmulq_f32(vcvtq_f32_s32(_sum1), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                    _f1 = vaddq_f32(_f1, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    uint16x8_t _c = vld1q_u16(pC);
                    _c0 = vcvt_f32_f16((float16x4_t)vget_low_u16(_c));
                    float32x4_t _c1 = vcvt_f32_f16((float16x4_t)vget_high_u16(_c));
                    if (beta == 1.f)
                    {
                        _f0 = vaddq_f32(_f0, _c0);
                        _f1 = vaddq_f32(_f1, _c1);
                    }
                    else
                    {
                        float32x4_t _beta = vdupq_n_f32(beta);
                        _f0 = vmlaq_f32(_f0, _c0, _beta);
                        _f1 = vmlaq_f32(_f1, _c1, _beta);
                    }
                    pC += 8;
                }
            }

            if (alpha != 1.f)
            {
                float32x4_t _alpha = vdupq_n_f32(alpha);
                _f0 = vmulq_f32(_f0, _alpha);
                _f1 = vmulq_f32(_f1, _alpha);
            }

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);
            uint16x4_t _hf1 = (uint16x4_t)vcvt_f16_f32(_f1);

            if (out_hstep == 1)
            {
                vst1q_u16(p0, vcombine_u16(_hf0, _hf1));
            }
            else
            {
#if __aarch64__
                if (out_elempack == 8)
                {
                    vst1q_u16(p0, vcombine_u16(_hf0, _hf1));
                }
#endif // __aarch64__
                if (out_elempack == 4)
                {
                    vst1_u16(p0, _hf0);
                    vst1_u16(p0 + out_hstep * 4, _hf1);
                }
                if (out_elempack == 1)
                {
                    p0[0] = vget_lane_u16(_hf0, 0);
                    p0[out_hstep] = vget_lane_u16(_hf0, 1);
                    p0[out_hstep * 2] = vget_lane_u16(_hf0, 2);
                    p0[out_hstep * 3] = vget_lane_u16(_hf0, 3);
                    p0[out_hstep * 4] = vget_lane_u16(_hf1, 0);
                    p0[out_hstep * 5] = vget_lane_u16(_hf1, 1);
                    p0[out_hstep * 6] = vget_lane_u16(_hf1, 2);
                    p0[out_hstep * 7] = vget_lane_u16(_hf1, 3);
                }
            }

            pp += 8;
            p0 += out_hstep * 8;
        }
        for (; jj + 3 < max_jj; jj += 4)
        {
            float32x4_t _f0 = vmulq_f32(vcvtq_f32_s32(vld1q_s32(pp)), _descale);

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vaddq_f32(_f0, _c0);
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // out_elempack == 1
                    _c0 = vcvt_f32_f16((float16x4_t)vld1_u16(pC));
                    _f0 = vmlaq_n_f32(_f0, _c0, beta);
                    pC += 4;
                }
            }

            _f0 = vmulq_n_f32(_f0, alpha);

            uint16x4_t _hf0 = (uint16x4_t)vcvt_f16_f32(_f0);

            if (out_hstep == 1)
            {
                vst1_u16(p0, _hf0);
            }
            else
            {
                if (out_elempack == 4)
                {
                    vst1_u16(p0, _hf0);
                }
                if (out_elempack == 1)
                {
                    p0[0] = vget_lane_u16(_hf0, 0);
                    p0[out_hstep] = vget_lane_u16(_hf0, 1);
                    p0[out_hstep * 2] = vget_lane_u16(_hf0, 2);
                    p0[out_hstep * 3] = vget_lane_u16(_hf0, 3);
                }
            }

            pp += 4;
            p0 += out_hstep * 4;
        }
        for (; jj + 1 < max_jj; jj += 2)
        {
            float32x2_t _f0 = vmul_f32(vcvt_f32_s32(vld1_s32(pp)), vget_low_f32(_descale));

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    _f0 = vadd_f32(_f0, vget_low_f32(_c0));
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    float32x2_t _c = float32x2_t();
                    _c = vset_lane_f32(float16_to_float32(pC[0]), _c, 0);
                    _c = vset_lane_f32(float16_to_float32(pC[1]), _c, 1);
                    _f0 = vmla_n_f32(_f0, _c, beta);
                    pC += 2;
                }
            }

            _f0 = vmul_n_f32(_f0, alpha);

            p0[0] = float32_to_float16(vget_lane_f32(_f0, 0));
            p0[out_hstep] = float32_to_float16(vget_lane_f32(_f0, 1));

            pp += 2;
            p0 += out_hstep * 2;
        }
#endif // __ARM_NEON
        for (; jj < max_jj; jj += 1)
        {
            float f0 = pp[0] * descale;

            if (pC)
            {
                if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2)
                {
                    f0 += c0;
                }
                if (broadcast_type_C == 3 || broadcast_type_C == 4)
                {
                    // c_elempack == 1
                    f0 += float16_to_float32(pC[0]) * beta;
                    pC += 1;
                }
            }

            f0 *= alpha;

            p0[0] = float32_to_float16(f0);

            pp += 1;
            p0 += out_hstep;
        }
    }
}


================================================
FILE: src/layer/arm/groupnorm_arm.cpp
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "groupnorm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

GroupNorm_arm::GroupNorm_arm()
{
    // Packed layouts are only handled by the NEON code paths in this file.
#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON

    // fp16 storage needs an ARMv8.2-enabled NEON build and is then decided
    // at run time by cpu_support_arm_asimdhp().
#if __ARM_NEON && NCNN_ARM82
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82

    // The bf16 kernel has a scalar tail path, so it is enabled whenever bf16
    // support is compiled in.
#if NCNN_BF16
    support_bf16_storage = true;
#endif // NCNN_BF16
}

// Normalize one group of fp32 data in place: y = (x - mean) / sqrt(var + eps),
// optionally followed by the per-channel affine transform y * gamma + beta.
//
//   ptr        first element of the group
//   gamma_ptr  scale values for this group; the affine step is applied only
//   beta_ptr   when both gamma_ptr and beta_ptr are non-null
//   eps        added to the variance before taking the square root
//   channels   number of rows visited (one per packed channel)
//   size       number of floats in each row
//   elempack   1 or 4; selects how gamma/beta are matched to the data
//   cstep      row stride, in units of elempack floats
//
// Mean and variance are computed over all channels * size values.
static void groupnorm(float* ptr, const float* gamma_ptr, const float* beta_ptr, float eps, int channels, int size, int elempack, size_t cstep)
{
    // ---- pass 1: sum of all elements --------------------------------------
#if __ARM_NEON
    float32x4_t _sum = vdupq_n_f32(0.f);
#endif // __ARM_NEON
    float sum = 0.f;
    for (int q = 0; q < channels; q++)
    {
        const float* row = ptr + cstep * q * elempack;

        int i = 0;
#if __ARM_NEON
        for (; i + 3 < size; i += 4)
        {
            _sum = vaddq_f32(_sum, vld1q_f32(row + i));
        }
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            sum += row[i];
        }
    }

    // fold the four vector lanes into the scalar accumulator
#if __ARM_NEON
#if __aarch64__
    sum += vaddvq_f32(_sum);
#else
    {
        float32x2_t _half = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
        _half = vpadd_f32(_half, _half);
        sum += vget_lane_f32(_half, 0);
    }
#endif
#endif // __ARM_NEON

    const float mean = sum / (channels * size);

    // ---- pass 2: sum of squared deviations from the mean ------------------
#if __ARM_NEON
    const float32x4_t _mean = vdupq_n_f32(mean);
    float32x4_t _sqsum = vdupq_n_f32(0.f);
#endif // __ARM_NEON
    float sqsum = 0.f;
    for (int q = 0; q < channels; q++)
    {
        const float* row = ptr + cstep * q * elempack;

        int i = 0;
#if __ARM_NEON
        for (; i + 3 < size; i += 4)
        {
            const float32x4_t _d = vsubq_f32(vld1q_f32(row + i), _mean);
            _sqsum = vmlaq_f32(_sqsum, _d, _d);
        }
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            const float d = row[i] - mean;
            sqsum += d * d;
        }
    }

#if __ARM_NEON
#if __aarch64__
    sqsum += vaddvq_f32(_sqsum);
#else
    {
        float32x2_t _half = vadd_f32(vget_low_f32(_sqsum), vget_high_f32(_sqsum));
        _half = vpadd_f32(_half, _half);
        sqsum += vget_lane_f32(_half, 0);
    }
#endif
#endif // __ARM_NEON

    // Normalization rewritten as x * scale + shift so the output pass is a
    // single multiply-add per element.
    const float scale = 1.f / sqrtf(sqsum / (channels * size) + eps);
    const float shift = -mean * scale;
#if __ARM_NEON
    const float32x4_t _scale = vdupq_n_f32(scale);
    const float32x4_t _shift = vdupq_n_f32(shift);
#endif // __ARM_NEON

    // ---- pass 3: write y = x * a + b --------------------------------------
    const bool has_affine = gamma_ptr && beta_ptr;

    for (int q = 0; q < channels; q++)
    {
        float* row = ptr + cstep * q * elempack;

        // without affine parameters the coefficients are the plain
        // normalization terms
        float a = scale;
        float b = shift;
#if __ARM_NEON
        float32x4_t _a = _scale;
        float32x4_t _b = _shift;
#endif // __ARM_NEON

        if (has_affine)
        {
            // coefficients stay zero for any elempack not handled below
            a = 0.f;
            b = 0.f;
#if __ARM_NEON
            _a = vdupq_n_f32(0.f);
            _b = vdupq_n_f32(0.f);

            if (elempack == 4)
            {
                // each lane belongs to a different channel
                const float32x4_t _gamma = vld1q_f32(gamma_ptr + q * elempack);
                const float32x4_t _beta = vld1q_f32(beta_ptr + q * elempack);

                _a = vmulq_f32(_scale, _gamma);
                _b = vmlaq_f32(_beta, _shift, _gamma);
            }
#endif // __ARM_NEON
            if (elempack == 1)
            {
                const float gamma = gamma_ptr[q];
                const float beta = beta_ptr[q];

                a = scale * gamma;
                b = shift * gamma + beta;
#if __ARM_NEON
                _a = vdupq_n_f32(a);
                _b = vdupq_n_f32(b);
#endif // __ARM_NEON
            }
        }

        int i = 0;
#if __ARM_NEON
        for (; i + 3 < size; i += 4)
        {
            vst1q_f32(row + i, vmlaq_f32(_b, vld1q_f32(row + i), _a));
        }
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            row[i] = row[i] * a + b;
        }
    }
}

// Apply group normalization to bottom_top_blob in place.
//
// 16-bit blobs are forwarded to the fp16 / bf16 implementations when the
// corresponding storage option is enabled; everything else is handled here as
// fp32. The blob is repacked to a layout where one group never shares a packed
// element with another group, normalized group by group, and repacked back.
//
// Returns 0 on success and -100 when a repacking allocation fails.
int GroupNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int elembits = bottom_top_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_inplace_fp16s(bottom_top_blob, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

    const int dims = bottom_top_blob.dims;
    const int elempack = bottom_top_blob.elempack;
    const int channels_g = channels / group;

    // widest packing that keeps every packed element inside a single group
    int g_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        g_elempack = channels_g % 4 == 0 ? 4 : 1;
    }
#endif // __ARM_NEON

    // The blob is only ever converted to a narrower packing below, so never
    // ask the kernel for a wider layout than the data actually has; otherwise
    // the ranges and strides passed to groupnorm() would not match the blob.
    if (g_elempack > elempack)
        g_elempack = elempack;

    Mat bottom_top_blob_unpacked = bottom_top_blob;
    if (elempack > g_elempack)
    {
        // temporary copy lives in the workspace allocator
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_top_blob, bottom_top_blob_unpacked, g_elempack, opt_p);
        if (bottom_top_blob_unpacked.empty())
            return -100;
    }

    if (dims == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            Mat bottom_top_blob_g = bottom_top_blob_unpacked.range(g * channels_g / g_elempack, channels_g / g_elempack);
            const float* gamma_ptr = affine ? (const float*)gamma_data + g * channels_g : 0;
            const float* beta_ptr = affine ? (const float*)beta_data + g * channels_g : 0;
            groupnorm(bottom_top_blob_g, gamma_ptr, beta_ptr, eps, channels_g / g_elempack, 1 * g_elempack, g_elempack, 1);
        }
    }

    if (dims == 2)
    {
        const int w = bottom_top_blob_unpacked.w;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            Mat bottom_top_blob_g = bottom_top_blob_unpacked.row_range(g * channels_g / g_elempack, channels_g / g_elempack);
            const float* gamma_ptr = affine ? (const float*)gamma_data + g * channels_g : 0;
            const float* beta_ptr = affine ? (const float*)beta_data + g * channels_g : 0;
            groupnorm(bottom_top_blob_g, gamma_ptr, beta_ptr, eps, channels_g / g_elempack, w * g_elempack, g_elempack, w);
        }
    }

    if (dims == 3 || dims == 4)
    {
        const int size = bottom_top_blob_unpacked.w * bottom_top_blob_unpacked.h * bottom_top_blob_unpacked.d;
        const size_t cstep = bottom_top_blob_unpacked.cstep;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            Mat bottom_top_blob_g = bottom_top_blob_unpacked.channel_range(g * channels_g / g_elempack, channels_g / g_elempack);
            const float* gamma_ptr = affine ? (const float*)gamma_data + g * channels_g : 0;
            const float* beta_ptr = affine ? (const float*)beta_data + g * channels_g : 0;
            groupnorm(bottom_top_blob_g, gamma_ptr, beta_ptr, eps, channels_g / g_elempack, size * g_elempack, g_elempack, cstep);
        }
    }

    if (g_elempack != elempack)
    {
        // restore the caller's packing; same failure check as the first
        // conversion so an allocation failure is not reported as success
        convert_packing(bottom_top_blob_unpacked, bottom_top_blob, elempack, opt);
        if (bottom_top_blob.empty())
            return -100;
    }

    return 0;
}

#if NCNN_BF16
// Normalize one group of bf16 data in place. Values are widened to fp32 for
// all arithmetic and narrowed back to bf16 on store.
//
//   ptr        first element of the group (bf16 stored as unsigned short)
//   gamma_ptr  fp32 scale values for this group; the affine step is applied
//   beta_ptr   only when both gamma_ptr and beta_ptr are non-null
//   eps        added to the variance before taking the square root
//   channels   number of rows visited (one per packed channel)
//   size       number of bf16 values in each row
//   elempack   1 or 4; selects how gamma/beta are matched to the data
//   cstep      row stride, in units of elempack values
static void groupnorm_bf16s(unsigned short* ptr, const float* gamma_ptr, const float* beta_ptr, float eps, int channels, int size, int elempack, size_t cstep)
{
    // ---- pass 1: sum of all elements --------------------------------------
#if __ARM_NEON
    float32x4_t _sum = vdupq_n_f32(0.f);
#endif // __ARM_NEON
    float sum = 0.f;
    for (int q = 0; q < channels; q++)
    {
        const unsigned short* row = ptr + cstep * q * elempack;

        int i = 0;
#if __ARM_NEON
        for (; i + 3 < size; i += 4)
        {
            _sum = vaddq_f32(_sum, bfloat2float(vld1_u16(row + i)));
        }
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            sum += bfloat16_to_float32(row[i]);
        }
    }

    // fold the four vector lanes into the scalar accumulator
#if __ARM_NEON
#if __aarch64__
    sum += vaddvq_f32(_sum);
#else
    {
        float32x2_t _half = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
        _half = vpadd_f32(_half, _half);
        sum += vget_lane_f32(_half, 0);
    }
#endif
#endif // __ARM_NEON

    const float mean = sum / (channels * size);

    // ---- pass 2: sum of squared deviations from the mean ------------------
#if __ARM_NEON
    const float32x4_t _mean = vdupq_n_f32(mean);
    float32x4_t _sqsum = vdupq_n_f32(0.f);
#endif // __ARM_NEON
    float sqsum = 0.f;
    for (int q = 0; q < channels; q++)
    {
        const unsigned short* row = ptr + cstep * q * elempack;

        int i = 0;
#if __ARM_NEON
        for (; i + 3 < size; i += 4)
        {
            const float32x4_t _d = vsubq_f32(bfloat2float(vld1_u16(row + i)), _mean);
            _sqsum = vmlaq_f32(_sqsum, _d, _d);
        }
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            const float d = bfloat16_to_float32(row[i]) - mean;
            sqsum += d * d;
        }
    }

#if __ARM_NEON
#if __aarch64__
    sqsum += vaddvq_f32(_sqsum);
#else
    {
        float32x2_t _half = vadd_f32(vget_low_f32(_sqsum), vget_high_f32(_sqsum));
        _half = vpadd_f32(_half, _half);
        sqsum += vget_lane_f32(_half, 0);
    }
#endif
#endif // __ARM_NEON

    // Normalization rewritten as x * scale + shift so the output pass is a
    // single multiply-add per element.
    const float scale = 1.f / sqrtf(sqsum / (channels * size) + eps);
    const float shift = -mean * scale;
#if __ARM_NEON
    const float32x4_t _scale = vdupq_n_f32(scale);
    const float32x4_t _shift = vdupq_n_f32(shift);
#endif // __ARM_NEON

    // ---- pass 3: write y = x * a + b --------------------------------------
    const bool has_affine = gamma_ptr && beta_ptr;

    for (int q = 0; q < channels; q++)
    {
        unsigned short* row = ptr + cstep * q * elempack;

        // without affine parameters the coefficients are the plain
        // normalization terms
        float a = scale;
        float b = shift;
#if __ARM_NEON
        float32x4_t _a = _scale;
        float32x4_t _b = _shift;
#endif // __ARM_NEON

        if (has_affine)
        {
            // coefficients stay zero for any elempack not handled below
            a = 0.f;
            b = 0.f;
#if __ARM_NEON
            _a = vdupq_n_f32(0.f);
            _b = vdupq_n_f32(0.f);

            if (elempack == 4)
            {
                // each lane belongs to a different channel
                const float32x4_t _gamma = vld1q_f32(gamma_ptr + q * elempack);
                const float32x4_t _beta = vld1q_f32(beta_ptr + q * elempack);

                _a = vmulq_f32(_scale, _gamma);
                _b = vmlaq_f32(_beta, _shift, _gamma);
            }
#endif // __ARM_NEON
            if (elempack == 1)
            {
                const float gamma = gamma_ptr[q];
                const float beta = beta_ptr[q];

                a = scale * gamma;
                b = shift * gamma + beta;
#if __ARM_NEON
                _a = vdupq_n_f32(a);
                _b = vdupq_n_f32(b);
#endif // __ARM_NEON
            }
        }

        int i = 0;
#if __ARM_NEON
        for (; i + 3 < size; i += 4)
        {
            const float32x4_t _x = bfloat2float(vld1_u16(row + i));
            vst1_u16(row + i, float2bfloat(vmlaq_f32(_b, _x, _a)));
        }
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            row[i] = float32_to_bfloat16(bfloat16_to_float32(row[i]) * a + b);
        }
    }
}

// bf16-storage variant of forward_inplace(): repack the blob so that no packed
// element spans two groups, normalize each group with groupnorm_bf16s(), then
// restore the original packing.
//
// Returns 0 on success and -100 when a repacking allocation fails.
int GroupNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int dims = bottom_top_blob.dims;
    const int elempack = bottom_top_blob.elempack;
    const int channels_g = channels / group;

    // widest packing that keeps every packed element inside a single group
    int g_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        g_elempack = channels_g % 4 == 0 ? 4 : 1;
    }
#endif // __ARM_NEON

    // The blob is only ever converted to a narrower packing below, so never
    // ask the kernel for a wider layout than the data actually has; otherwise
    // the ranges and strides passed to groupnorm_bf16s() would not match.
    if (g_elempack > elempack)
        g_elempack = elempack;

    Mat bottom_top_blob_unpacked = bottom_top_blob;
    if (elempack > g_elempack)
    {
        // temporary copy lives in the workspace allocator
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_top_blob, bottom_top_blob_unpacked, g_elempack, opt_p);
        if (bottom_top_blob_unpacked.empty())
            return -100;
    }

    if (dims == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            Mat bottom_top_blob_g = bottom_top_blob_unpacked.range(g * channels_g / g_elempack, channels_g / g_elempack);
            const float* gamma_ptr = affine ? (const float*)gamma_data + g * channels_g : 0;
            const float* beta_ptr = affine ? (const float*)beta_data + g * channels_g : 0;
            groupnorm_bf16s(bottom_top_blob_g, gamma_ptr, beta_ptr, eps, channels_g / g_elempack, 1 * g_elempack, g_elempack, 1);
        }
    }

    if (dims == 2)
    {
        const int w = bottom_top_blob_unpacked.w;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            Mat bottom_top_blob_g = bottom_top_blob_unpacked.row_range(g * channels_g / g_elempack, channels_g / g_elempack);
            const float* gamma_ptr = affine ? (const float*)gamma_data + g * channels_g : 0;
            const float* beta_ptr = affine ? (const float*)beta_data + g * channels_g : 0;
            groupnorm_bf16s(bottom_top_blob_g, gamma_ptr, beta_ptr, eps, channels_g / g_elempack, w * g_elempack, g_elempack, w);
        }
    }

    if (dims == 3 || dims == 4)
    {
        const int size = bottom_top_blob_unpacked.w * bottom_top_blob_unpacked.h * bottom_top_blob_unpacked.d;
        const size_t cstep = bottom_top_blob_unpacked.cstep;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            Mat bottom_top_blob_g = bottom_top_blob_unpacked.channel_range(g * channels_g / g_elempack, channels_g / g_elempack);
            const float* gamma_ptr = affine ? (const float*)gamma_data + g * channels_g : 0;
            const float* beta_ptr = affine ? (const float*)beta_data + g * channels_g : 0;
            groupnorm_bf16s(bottom_top_blob_g, gamma_ptr, beta_ptr, eps, channels_g / g_elempack, size * g_elempack, g_elempack, cstep);
        }
    }

    if (g_elempack != elempack)
    {
        // restore the caller's packing; same failure check as the first
        // conversion so an allocation failure is not reported as success
        convert_packing(bottom_top_blob_unpacked, bottom_top_blob, elempack, opt);
        if (bottom_top_blob.empty())
            return -100;
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/groupnorm_arm.h
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_GROUPNORM_ARM_H
#define LAYER_GROUPNORM_ARM_H

#include "groupnorm.h"

namespace ncnn {

// ARM-specific GroupNorm layer. It overrides the in-place forward pass of
// GroupNorm and adds storage-type specific variants for 16-bit blobs.
class GroupNorm_arm : public GroupNorm
{
public:
    // Sets the packing / fp16 / bf16 capability flags according to the build
    // configuration (implemented in groupnorm_arm.cpp).
    GroupNorm_arm();

    // Normalizes bottom_top_blob in place. Handles fp32 blobs directly and
    // dispatches 16-bit blobs to the fp16s / bf16s variants below.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16-storage implementation (defined in groupnorm_arm_asimdhp.cpp).
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16-storage implementation (defined in groupnorm_arm.cpp).
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_GROUPNORM_ARM_H


================================================
FILE: src/layer/arm/groupnorm_arm_asimdhp.cpp
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "groupnorm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Normalize one group of fp16 data in place. Values are widened to fp32 for
// all arithmetic and narrowed back to fp16 on store.
//
//   ptr        first element of the group
//   gamma_ptr  fp32 scale values for this group; the affine step is applied
//   beta_ptr   only when both gamma_ptr and beta_ptr are non-null
//   eps        added to the variance before taking the square root
//   channels   number of rows visited (one per packed channel)
//   size       number of fp16 values in each row
//   elempack   1, 4 or 8; selects how gamma/beta are matched to the data
//   cstep      row stride, in units of elempack values
static void groupnorm_fp16s(__fp16* ptr, const float* gamma_ptr, const float* beta_ptr, float eps, int channels, int size, int elempack, size_t cstep)
{
    // ---- pass 1: sum of all elements --------------------------------------
    // two vector accumulators serve the 8-wide loop; the 4-wide remainder
    // only feeds the first one
    float32x4_t _sum0 = vdupq_n_f32(0.f);
    float32x4_t _sum1 = vdupq_n_f32(0.f);
    float sum = 0.f;
    for (int q = 0; q < channels; q++)
    {
        const __fp16* row = ptr + cstep * q * elempack;

        int i = 0;
        for (; i + 7 < size; i += 8)
        {
            const float16x8_t _x = vld1q_f16(row + i);
            _sum0 = vaddq_f32(_sum0, vcvt_f32_f16(vget_low_f16(_x)));
            _sum1 = vaddq_f32(_sum1, vcvt_f32_f16(vget_high_f16(_x)));
        }
        for (; i + 3 < size; i += 4)
        {
            _sum0 = vaddq_f32(_sum0, vcvt_f32_f16(vld1_f16(row + i)));
        }
        for (; i < size; i++)
        {
            sum += (float)row[i];
        }
    }

    sum += vaddvq_f32(vaddq_f32(_sum0, _sum1));

    const float mean = sum / (channels * size);
    const float32x4_t _mean = vdupq_n_f32(mean);

    // ---- pass 2: sum of squared deviations from the mean ------------------
    float32x4_t _sqsum0 = vdupq_n_f32(0.f);
    float32x4_t _sqsum1 = vdupq_n_f32(0.f);
    float sqsum = 0.f;
    for (int q = 0; q < channels; q++)
    {
        const __fp16* row = ptr + cstep * q * elempack;

        int i = 0;
        for (; i + 7 < size; i += 8)
        {
            const float16x8_t _x = vld1q_f16(row + i);
            const float32x4_t _d0 = vsubq_f32(vcvt_f32_f16(vget_low_f16(_x)), _mean);
            const float32x4_t _d1 = vsubq_f32(vcvt_f32_f16(vget_high_f16(_x)), _mean);
            _sqsum0 = vfmaq_f32(_sqsum0, _d0, _d0);
            _sqsum1 = vfmaq_f32(_sqsum1, _d1, _d1);
        }
        for (; i + 3 < size; i += 4)
        {
            const float32x4_t _d = vsubq_f32(vcvt_f32_f16(vld1_f16(row + i)), _mean);
            _sqsum0 = vfmaq_f32(_sqsum0, _d, _d);
        }
        for (; i < size; i++)
        {
            const float d = (float)row[i] - mean;
            sqsum += d * d;
        }
    }

    sqsum += vaddvq_f32(vaddq_f32(_sqsum0, _sqsum1));

    // Normalization rewritten as x * scale + shift so the output pass is a
    // single fused multiply-add per element.
    const float scale = 1.f / sqrtf(sqsum / (channels * size) + eps);
    const float shift = -mean * scale;
    const float32x4_t _scale = vdupq_n_f32(scale);
    const float32x4_t _shift = vdupq_n_f32(shift);

    // ---- pass 3: write y = x * a + b --------------------------------------
    const bool has_affine = gamma_ptr && beta_ptr;

    for (int q = 0; q < channels; q++)
    {
        __fp16* row = ptr + cstep * q * elempack;

        // without affine parameters the coefficients are the plain
        // normalization terms; *0 applies to the low four lanes of an 8-wide
        // step (and to 4-wide steps), *1 to the high four lanes
        float a = scale;
        float b = shift;
        float32x4_t _a0 = _scale;
        float32x4_t _a1 = _scale;
        float32x4_t _b0 = _shift;
        float32x4_t _b1 = _shift;

        if (has_affine)
        {
            // coefficients stay zero for any elempack not handled below
            a = 0.f;
            b = 0.f;
            _a0 = vdupq_n_f32(0.f);
            _a1 = vdupq_n_f32(0.f);
            _b0 = vdupq_n_f32(0.f);
            _b1 = vdupq_n_f32(0.f);

            switch (elempack)
            {
            case 8:
            {
                // eight consecutive channels per packed element
                const float32x4_t _gamma0 = vld1q_f32(gamma_ptr + q * elempack);
                const float32x4_t _gamma1 = vld1q_f32(gamma_ptr + q * elempack + 4);
                const float32x4_t _beta0 = vld1q_f32(beta_ptr + q * elempack);
                const float32x4_t _beta1 = vld1q_f32(beta_ptr + q * elempack + 4);

                _a0 = vmulq_f32(_scale, _gamma0);
                _a1 = vmulq_f32(_scale, _gamma1);
                _b0 = vfmaq_f32(_beta0, _shift, _gamma0);
                _b1 = vfmaq_f32(_beta1, _shift, _gamma1);
                break;
            }
            case 4:
            {
                // four channels per packed element; an 8-wide step covers two
                // packed elements, so both halves share the coefficients
                const float32x4_t _gamma = vld1q_f32(gamma_ptr + q * elempack);
                const float32x4_t _beta = vld1q_f32(beta_ptr + q * elempack);

                _a0 = vmulq_f32(_scale, _gamma);
                _b0 = vfmaq_f32(_beta, _shift, _gamma);
                _a1 = _a0;
                _b1 = _b0;
                break;
            }
            case 1:
            {
                const float gamma = gamma_ptr[q];
                const float beta = beta_ptr[q];

                a = scale * gamma;
                b = shift * gamma + beta;
                _a0 = vdupq_n_f32(a);
                _b0 = vdupq_n_f32(b);
                _a1 = _a0;
                _b1 = _b0;
                break;
            }
            default:
                break;
            }
        }

        int i = 0;
        for (; i + 7 < size; i += 8)
        {
            const float16x8_t _x = vld1q_f16(row + i);
            const float32x4_t _y0 = vfmaq_f32(_b0, vcvt_f32_f16(vget_low_f16(_x)), _a0);
            const float32x4_t _y1 = vfmaq_f32(_b1, vcvt_f32_f16(vget_high_f16(_x)), _a1);
            vst1q_f16(row + i, vcombine_f16(vcvt_f16_f32(_y0), vcvt_f16_f32(_y1)));
        }
        for (; i + 3 < size; i += 4)
        {
            const float32x4_t _x = vcvt_f32_f16(vld1_f16(row + i));
            vst1_f16(row + i, vcvt_f16_f32(vfmaq_f32(_b0, _x, _a0)));
        }
        for (; i < size; i++)
        {
            row[i] = (__fp16)((float)row[i] * a + b);
        }
    }
}

// fp16-storage variant of forward_inplace(): repack the blob so that no packed
// element spans two groups, normalize each group with groupnorm_fp16s(), then
// restore the original packing.
//
// Returns 0 on success and -100 when a repacking allocation fails.
int GroupNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int dims = bottom_top_blob.dims;
    const int elempack = bottom_top_blob.elempack;
    const int channels_g = channels / group;

    // widest packing that keeps every packed element inside a single group;
    // pack 8 is only considered when fp16 arithmetic is enabled
    int g_elempack = 1;
    if (opt.use_packing_layout)
    {
        if (opt.use_fp16_arithmetic)
            g_elempack = channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
        else
            g_elempack = channels_g % 4 == 0 ? 4 : 1;
    }

    // The blob is only ever converted to a narrower packing below, so never
    // ask the kernel for a wider layout than the data actually has; otherwise
    // the ranges and strides passed to groupnorm_fp16s() would not match.
    if (g_elempack > elempack)
        g_elempack = elempack;

    Mat bottom_top_blob_unpacked = bottom_top_blob;
    if (elempack > g_elempack)
    {
        // temporary copy lives in the workspace allocator
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_top_blob, bottom_top_blob_unpacked, g_elempack, opt_p);
        if (bottom_top_blob_unpacked.empty())
            return -100;
    }

    if (dims == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            Mat bottom_top_blob_g = bottom_top_blob_unpacked.range(g * channels_g / g_elempack, channels_g / g_elempack);
            const float* gamma_ptr = affine ? (const float*)gamma_data + g * channels_g : 0;
            const float* beta_ptr = affine ? (const float*)beta_data + g * channels_g : 0;
            groupnorm_fp16s(bottom_top_blob_g, gamma_ptr, beta_ptr, eps, channels_g / g_elempack, 1 * g_elempack, g_elempack, 1);
        }
    }

    if (dims == 2)
    {
        const int w = bottom_top_blob_unpacked.w;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            Mat bottom_top_blob_g = bottom_top_blob_unpacked.row_range(g * channels_g / g_elempack, channels_g / g_elempack);
            const float* gamma_ptr = affine ? (const float*)gamma_data + g * channels_g : 0;
            const float* beta_ptr = affine ? (const float*)beta_data + g * channels_g : 0;
            groupnorm_fp16s(bottom_top_blob_g, gamma_ptr, beta_ptr, eps, channels_g / g_elempack, w * g_elempack, g_elempack, w);
        }
    }

    if (dims == 3 || dims == 4)
    {
        const int size = bottom_top_blob_unpacked.w * bottom_top_blob_unpacked.h * bottom_top_blob_unpacked.d;
        const size_t cstep = bottom_top_blob_unpacked.cstep;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            Mat bottom_top_blob_g = bottom_top_blob_unpacked.channel_range(g * channels_g / g_elempack, channels_g / g_elempack);
            const float* gamma_ptr = affine ? (const float*)gamma_data + g * channels_g : 0;
            const float* beta_ptr = affine ? (const float*)beta_data + g * channels_g : 0;
            groupnorm_fp16s(bottom_top_blob_g, gamma_ptr, beta_ptr, eps, channels_g / g_elempack, size * g_elempack, g_elempack, cstep);
        }
    }

    if (g_elempack != elempack)
    {
        // restore the caller's packing; same failure check as the first
        // conversion so an allocation failure is not reported as success
        convert_packing(bottom_top_blob_unpacked, bottom_top_blob, elempack, opt);
        if (bottom_top_blob.empty())
            return -100;
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/gru_arm.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "gru_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

#include "cpu.h"

namespace ncnn {

#if NCNN_INT8
#include "gru_int8.h"
#endif

GRU_arm::GRU_arm()
{
    // fp16 storage needs an ARMv8.2-enabled NEON build and is then decided
    // at run time by cpu_support_arm_asimdhp().
#if __ARM_NEON && NCNN_ARM82
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82

    // bf16 storage is enabled whenever bf16 support is compiled in.
#if NCNN_BF16
    support_bf16_storage = true;
#endif // NCNN_BF16
}

// Prepare the layer for inference.
//
// Dispatches to the int8 / fp16s / bf16s pipeline builders when the matching
// condition holds; otherwise repacks the fp32 weights and biases into
// weight_xc_data_packed, bias_c_data_packed and weight_hc_data_packed, using
// the layout that the fp32 gru() kernel in this file reads.
//
// Packed layout, per direction (one channel per direction):
//   * NEON build, groups of 4 outputs (row index q / 4):
//       weight row = [ per input i: R0..R3, U0..U3 ] followed by
//                    [ per input i: N0..N3 ]
//       bias       = R0..R3, U0..U3, BN0..BN3, WN0..WN3 (16 floats)
//   * leftover outputs (NEON: row index q / 4 + q % 4, non-NEON: row q):
//       weight row = [ per input i: R, U ] followed by [ per input i: N ]
//       bias       = R, U, BN, WN (4 floats)
//
// Returns 0 on this path, or whatever the dispatched builder returns.
int GRU_arm::create_pipeline(const Option& opt)
{
#if NCNN_INT8
    if (int8_scale_term)
    {
        return create_pipeline_int8(opt);
    }
#endif

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage)
    {
        return create_pipeline_fp16s(opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        return create_pipeline_bf16s(opt);
    }
#endif

    // pack RUN
    const int num_directions = direction == 2 ? 2 : 1;
    // number of input-to-hidden weights per output and per gate (3 gates: R, U, N)
    const int size = weight_data_size / num_directions / num_output / 3;

#if __ARM_NEON
    // one row per group of 4 outputs plus one row per leftover output;
    // each row is wide enough for 4 outputs x 3 gates
    weight_xc_data_packed.create(size * 12, num_output / 4 + num_output % 4, num_directions);
    bias_c_data_packed.create(num_output, 1, num_directions, 16u, 4);
    weight_hc_data_packed.create(num_output * 12, num_output / 4 + num_output % 4, num_directions);
#else
    // one row per output, 3 gates per row
    weight_xc_data_packed.create(size * 3, num_output, num_directions);
    bias_c_data_packed.create(num_output, 1, num_directions, 16u, 4);
    weight_hc_data_packed.create(num_output * 3, num_output, num_directions);
#endif

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int dr = 0; dr < num_directions; dr++)
    {
        const Mat weight_xc = weight_xc_data.channel(dr);
        const Mat bias_c = bias_c_data.channel(dr);
        const Mat weight_hc = weight_hc_data.channel(dr);

        Mat weight_xc_data_packed_dr = weight_xc_data_packed.channel(dr);
        Mat bias_c_data_packed_dr = bias_c_data_packed.channel(dr);
        Mat weight_hc_data_packed_dr = weight_hc_data_packed.channel(dr);

        // source bias rows are ordered R, U, WN, BN
        const float* bias_c_R = bias_c.row(0);
        const float* bias_c_U = bias_c.row(1);
        const float* bias_c_WN = bias_c.row(2);
        const float* bias_c_BN = bias_c.row(3);

        // packed bias is written in the order R, U, BN, WN
        float* bias_c_RUBNWN = bias_c_data_packed_dr.row(0);

        int q = 0;
#if __ARM_NEON
        // groups of 4 consecutive outputs
        for (; q + 3 < num_output; q += 4)
        {
            vst1q_f32(bias_c_RUBNWN, vld1q_f32(bias_c_R + q));
            vst1q_f32(bias_c_RUBNWN + 4, vld1q_f32(bias_c_U + q));
            vst1q_f32(bias_c_RUBNWN + 8, vld1q_f32(bias_c_BN + q));
            vst1q_f32(bias_c_RUBNWN + 12, vld1q_f32(bias_c_WN + q));

            bias_c_RUBNWN += 16;

            // source weights: the R rows come first, then the U rows, then the N rows
            const float* weight_xc_R = weight_xc.row(num_output * 0 + q);
            const float* weight_xc_U = weight_xc.row(num_output * 1 + q);
            const float* weight_xc_N = weight_xc.row(num_output * 2 + q);

            const float* weight_xc_R_1 = weight_xc.row(num_output * 0 + q + 1);
            const float* weight_xc_U_1 = weight_xc.row(num_output * 1 + q + 1);
            const float* weight_xc_N_1 = weight_xc.row(num_output * 2 + q + 1);

            const float* weight_xc_R_2 = weight_xc.row(num_output * 0 + q + 2);
            const float* weight_xc_U_2 = weight_xc.row(num_output * 1 + q + 2);
            const float* weight_xc_N_2 = weight_xc.row(num_output * 2 + q + 2);

            const float* weight_xc_R_3 = weight_xc.row(num_output * 0 + q + 3);
            const float* weight_xc_U_3 = weight_xc.row(num_output * 1 + q + 3);
            const float* weight_xc_N_3 = weight_xc.row(num_output * 2 + q + 3);

            const float* weight_hc_R = weight_hc.row(num_output * 0 + q);
            const float* weight_hc_U = weight_hc.row(num_output * 1 + q);
            const float* weight_hc_N = weight_hc.row(num_output * 2 + q);

            const float* weight_hc_R_1 = weight_hc.row(num_output * 0 + q + 1);
            const float* weight_hc_U_1 = weight_hc.row(num_output * 1 + q + 1);
            const float* weight_hc_N_1 = weight_hc.row(num_output * 2 + q + 1);

            const float* weight_hc_R_2 = weight_hc.row(num_output * 0 + q + 2);
            const float* weight_hc_U_2 = weight_hc.row(num_output * 1 + q + 2);
            const float* weight_hc_N_2 = weight_hc.row(num_output * 2 + q + 2);

            const float* weight_hc_R_3 = weight_hc.row(num_output * 0 + q + 3);
            const float* weight_hc_U_3 = weight_hc.row(num_output * 1 + q + 3);
            const float* weight_hc_N_3 = weight_hc.row(num_output * 2 + q + 3);

            float* weight_xc_RUN = weight_xc_data_packed_dr.row(q / 4);
            float* weight_hc_RUN = weight_hc_data_packed_dr.row(q / 4);

            // first part of the row: R and U interleaved, 8 floats per input element
            for (int i = 0; i < size; i++)
            {
                weight_xc_RUN[0] = weight_xc_R[i];
                weight_xc_RUN[1] = weight_xc_R_1[i];
                weight_xc_RUN[2] = weight_xc_R_2[i];
                weight_xc_RUN[3] = weight_xc_R_3[i];
                weight_xc_RUN[4] = weight_xc_U[i];
                weight_xc_RUN[5] = weight_xc_U_1[i];
                weight_xc_RUN[6] = weight_xc_U_2[i];
                weight_xc_RUN[7] = weight_xc_U_3[i];

                weight_xc_RUN += 8;
            }

            for (int i = 0; i < num_output; i++)
            {
                weight_hc_RUN[0] = weight_hc_R[i];
                weight_hc_RUN[1] = weight_hc_R_1[i];
                weight_hc_RUN[2] = weight_hc_R_2[i];
                weight_hc_RUN[3] = weight_hc_R_3[i];
                weight_hc_RUN[4] = weight_hc_U[i];
                weight_hc_RUN[5] = weight_hc_U_1[i];
                weight_hc_RUN[6] = weight_hc_U_2[i];
                weight_hc_RUN[7] = weight_hc_U_3[i];

                weight_hc_RUN += 8;
            }

            // second part of the row: N only, 4 floats per input element,
            // appended right after the R/U part (the pointers keep advancing)
            for (int i = 0; i < size; i++)
            {
                weight_xc_RUN[0] = weight_xc_N[i];
                weight_xc_RUN[1] = weight_xc_N_1[i];
                weight_xc_RUN[2] = weight_xc_N_2[i];
                weight_xc_RUN[3] = weight_xc_N_3[i];

                weight_xc_RUN += 4;
            }

            for (int i = 0; i < num_output; i++)
            {
                weight_hc_RUN[0] = weight_hc_N[i];
                weight_hc_RUN[1] = weight_hc_N_1[i];
                weight_hc_RUN[2] = weight_hc_N_2[i];
                weight_hc_RUN[3] = weight_hc_N_3[i];

                weight_hc_RUN += 4;
            }
        }
#endif // __ARM_NEON
        // leftover outputs (all outputs when NEON is not available), one at a time
        for (; q < num_output; q++)
        {
            bias_c_RUBNWN[0] = bias_c_R[q];
            bias_c_RUBNWN[1] = bias_c_U[q];
            bias_c_RUBNWN[2] = bias_c_BN[q];
            bias_c_RUBNWN[3] = bias_c_WN[q];

            bias_c_RUBNWN += 4;

            const float* weight_xc_R = weight_xc.row(num_output * 0 + q);
            const float* weight_xc_U = weight_xc.row(num_output * 1 + q);
            const float* weight_xc_N = weight_xc.row(num_output * 2 + q);

            const float* weight_hc_R = weight_hc.row(num_output * 0 + q);
            const float* weight_hc_U = weight_hc.row(num_output * 1 + q);
            const float* weight_hc_N = weight_hc.row(num_output * 2 + q);

#if __ARM_NEON
            // leftover rows follow the rows of the 4-output groups
            float* weight_xc_RUN = weight_xc_data_packed_dr.row(q / 4 + q % 4);
            float* weight_hc_RUN = weight_hc_data_packed_dr.row(q / 4 + q % 4);
#else
            float* weight_xc_RUN = weight_xc_data_packed_dr.row(q);
            float* weight_hc_RUN = weight_hc_data_packed_dr.row(q);
#endif // __ARM_NEON

            // R and U interleaved first ...
            for (int i = 0; i < size; i++)
            {
                weight_xc_RUN[0] = weight_xc_R[i];
                weight_xc_RUN[1] = weight_xc_U[i];

                weight_xc_RUN += 2;
            }

            for (int i = 0; i < num_output; i++)
            {
                weight_hc_RUN[0] = weight_hc_R[i];
                weight_hc_RUN[1] = weight_hc_U[i];

                weight_hc_RUN += 2;
            }

            // ... then N appended after them
            for (int i = 0; i < size; i++)
            {
                weight_xc_RUN[0] = weight_xc_N[i];

                weight_xc_RUN += 1;
            }

            for (int i = 0; i < num_output; i++)
            {
                weight_hc_RUN[0] = weight_hc_N[i];

                weight_hc_RUN += 1;
            }
        }
    }

    // the unpacked weights are not read again on this path; drop them in light mode
    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
}

// fp32 GRU kernel for one direction.
//
// bottom_blob  : input sequence, read as T = bottom_blob.h rows of size = bottom_blob.w floats
// top_blob     : output sequence, one row of num_output = top_blob.w floats written per time step
// reverse      : when non-zero the time steps are visited from T - 1 down to 0
// weight_xc    : packed input-to-hidden weights (layout produced by GRU_arm::create_pipeline)
// bias_c       : packed biases, 4 floats per output in the order R, U, BN, WN
// weight_hc    : packed hidden-to-hidden weights (same layout as weight_xc)
// hidden_state : num_output floats, read as h_{t-1} and overwritten with h_t at every step
//
// Per time step, for every output:
//   R = sigmoid(bias_R + Wx_R * x + Wh_R * h)
//   U = sigmoid(bias_U + Wx_U * x + Wh_U * h)
//   N = tanh(bias_WN + R * (bias_BN + Wh_N * h) + Wx_N * x)
//   h = (1 - U) * N + U * h
//
// Returns 0 on success, -100 when the gates scratch buffer cannot be allocated.
static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
{
    int size = bottom_blob.w;
    int T = bottom_blob.h;

    int num_output = top_blob.w;

    // 2 x num_output
    // scratch for the U and N gates of the current step; on NEON one row holds
    // 4 U values followed by 4 N values, leftover outputs get a row each
#if __ARM_NEON
    Mat gates(4 * 2, num_output / 4 + num_output % 4, 4u, opt.workspace_allocator);
#else
    Mat gates(2, num_output, 4u, opt.workspace_allocator);
#endif
    if (gates.empty())
        return -100;

    // unroll
    for (int t = 0; t < T; t++)
    {
        int ti = reverse ? T - 1 - t : t;

        int remain_num_output_start = 0;
#if __ARM_NEON
        int nn_num_output = num_output >> 2;
        remain_num_output_start = nn_num_output << 2;

        // phase 1 (vectorised): compute U and N for 4 outputs at a time
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            const float* x = bottom_blob.row(ti);

            // gate reset update
            const float* bias_c_RUBNWN = (const float*)bias_c + q * 4;

            const float* weight_xc_RUN = weight_xc.row(q / 4);
            const float* weight_hc_RUN = weight_hc.row(q / 4);

            // R and U start from their biases; _sum1.._sum6 are extra
            // accumulators used by the 4x unrolled loops and folded in afterwards
            float32x4_t _gru_R = vld1q_f32(bias_c_RUBNWN);
            float32x4_t _gru_U = vld1q_f32(bias_c_RUBNWN + 4);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);
            float32x4_t _sum4 = vdupq_n_f32(0.f);
            float32x4_t _sum5 = vdupq_n_f32(0.f);
            float32x4_t _sum6 = vdupq_n_f32(0.f);

            // input contribution to R and U: each input element consumes
            // 8 packed weights (4 R then 4 U), so 4 inputs consume 32
            int i = 0;
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _xi = vld1q_f32(x + i);
                float32x4_t _weight_xc_R = vld1q_f32(weight_xc_RUN);
                float32x4_t _weight_xc_U = vld1q_f32(weight_xc_RUN + 4);
                float32x4_t _weight_xc_R_1 = vld1q_f32(weight_xc_RUN + 8);
                float32x4_t _weight_xc_U_1 = vld1q_f32(weight_xc_RUN + 12);
                float32x4_t _weight_xc_R_2 = vld1q_f32(weight_xc_RUN + 16);
                float32x4_t _weight_xc_U_2 = vld1q_f32(weight_xc_RUN + 20);
                float32x4_t _weight_xc_R_3 = vld1q_f32(weight_xc_RUN + 24);
                float32x4_t _weight_xc_U_3 = vld1q_f32(weight_xc_RUN + 28);
#if __aarch64__
                _gru_R = vfmaq_laneq_f32(_gru_R, _weight_xc_R, _xi, 0);
                _gru_U = vfmaq_laneq_f32(_gru_U, _weight_xc_U, _xi, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_R_1, _xi, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_U_1, _xi, 1);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_R_2, _xi, 2);
                _sum4 = vfmaq_laneq_f32(_sum4, _weight_xc_U_2, _xi, 2);
                _sum5 = vfmaq_laneq_f32(_sum5, _weight_xc_R_3, _xi, 3);
                _sum6 = vfmaq_laneq_f32(_sum6, _weight_xc_U_3, _xi, 3);
#else
                _gru_R = vmlaq_lane_f32(_gru_R, _weight_xc_R, vget_low_f32(_xi), 0);
                _gru_U = vmlaq_lane_f32(_gru_U, _weight_xc_U, vget_low_f32(_xi), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _weight_xc_R_1, vget_low_f32(_xi), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _weight_xc_U_1, vget_low_f32(_xi), 1);
                _sum3 = vmlaq_lane_f32(_sum3, _weight_xc_R_2, vget_high_f32(_xi), 0);
                _sum4 = vmlaq_lane_f32(_sum4, _weight_xc_U_2, vget_high_f32(_xi), 0);
                _sum5 = vmlaq_lane_f32(_sum5, _weight_xc_R_3, vget_high_f32(_xi), 1);
                _sum6 = vmlaq_lane_f32(_sum6, _weight_xc_U_3, vget_high_f32(_xi), 1);
#endif

                weight_xc_RUN += 32;
            }
            for (; i < size; i++)
            {
                float xi = x[i];

                float32x4_t _xi = vdupq_n_f32(xi);
                float32x4_t _weight_xc_R = vld1q_f32(weight_xc_RUN);
                float32x4_t _weight_xc_U = vld1q_f32(weight_xc_RUN + 4);
                _gru_R = vmlaq_f32(_gru_R, _weight_xc_R, _xi);
                _gru_U = vmlaq_f32(_gru_U, _weight_xc_U, _xi);

                weight_xc_RUN += 8;
            }

            // hidden-state contribution to R and U, same packing as above
            i = 0;
            for (; i + 3 < num_output; i += 4)
            {
                float32x4_t _h_cont = vld1q_f32((const float*)hidden_state + i);
                float32x4_t _weight_hc_R = vld1q_f32(weight_hc_RUN);
                float32x4_t _weight_hc_U = vld1q_f32(weight_hc_RUN + 4);
                float32x4_t _weight_hc_R_1 = vld1q_f32(weight_hc_RUN + 8);
                float32x4_t _weight_hc_U_1 = vld1q_f32(weight_hc_RUN + 12);
                float32x4_t _weight_hc_R_2 = vld1q_f32(weight_hc_RUN + 16);
                float32x4_t _weight_hc_U_2 = vld1q_f32(weight_hc_RUN + 20);
                float32x4_t _weight_hc_R_3 = vld1q_f32(weight_hc_RUN + 24);
                float32x4_t _weight_hc_U_3 = vld1q_f32(weight_hc_RUN + 28);
#if __aarch64__
                _gru_R = vfmaq_laneq_f32(_gru_R, _weight_hc_R, _h_cont, 0);
                _gru_U = vfmaq_laneq_f32(_gru_U, _weight_hc_U, _h_cont, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_R_1, _h_cont, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_U_1, _h_cont, 1);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_R_2, _h_cont, 2);
                _sum4 = vfmaq_laneq_f32(_sum4, _weight_hc_U_2, _h_cont, 2);
                _sum5 = vfmaq_laneq_f32(_sum5, _weight_hc_R_3, _h_cont, 3);
                _sum6 = vfmaq_laneq_f32(_sum6, _weight_hc_U_3, _h_cont, 3);
#else
                _gru_R = vmlaq_lane_f32(_gru_R, _weight_hc_R, vget_low_f32(_h_cont), 0);
                _gru_U = vmlaq_lane_f32(_gru_U, _weight_hc_U, vget_low_f32(_h_cont), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _weight_hc_R_1, vget_low_f32(_h_cont), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _weight_hc_U_1, vget_low_f32(_h_cont), 1);
                _sum3 = vmlaq_lane_f32(_sum3, _weight_hc_R_2, vget_high_f32(_h_cont), 0);
                _sum4 = vmlaq_lane_f32(_sum4, _weight_hc_U_2, vget_high_f32(_h_cont), 0);
                _sum5 = vmlaq_lane_f32(_sum5, _weight_hc_R_3, vget_high_f32(_h_cont), 1);
                _sum6 = vmlaq_lane_f32(_sum6, _weight_hc_U_3, vget_high_f32(_h_cont), 1);
#endif

                weight_hc_RUN += 32;
            }
            for (; i < num_output; i++)
            {
                float h_cont = hidden_state[i];

                float32x4_t _h_cont = vdupq_n_f32(h_cont);
                float32x4_t _weight_hc_R = vld1q_f32(weight_hc_RUN);
                float32x4_t _weight_hc_U = vld1q_f32(weight_hc_RUN + 4);
                _gru_R = vmlaq_f32(_gru_R, _weight_hc_R, _h_cont);
                _gru_U = vmlaq_f32(_gru_U, _weight_hc_U, _h_cont);

                weight_hc_RUN += 8;
            }

            // fold the partial sums of the unrolled loops into R and U
            _gru_R = vaddq_f32(_gru_R, _sum1);
            _gru_U = vaddq_f32(_gru_U, _sum2);
            _sum3 = vaddq_f32(_sum3, _sum5);
            _sum4 = vaddq_f32(_sum4, _sum6);
            _gru_R = vaddq_f32(_gru_R, _sum3);
            _gru_U = vaddq_f32(_gru_U, _sum4);

            // sigmoid(R)
            // sigmoid(U)
            _gru_R = sigmoid_ps(_gru_R);
            _gru_U = sigmoid_ps(_gru_U);

            // gate new
            // starts from the BN bias and accumulates the hidden-state part;
            // weight_hc_RUN now points at the N section of the packed row
            float32x4_t _gru_N = vld1q_f32(bias_c_RUBNWN + 8);
            _sum1 = vdupq_n_f32(0.f);
            _sum2 = vdupq_n_f32(0.f);
            _sum3 = vdupq_n_f32(0.f);

            i = 0;
            for (; i + 3 < num_output; i += 4)
            {
                float32x4_t _h_cont = vld1q_f32((const float*)hidden_state + i);
                float32x4_t _weight_hc_N = vld1q_f32(weight_hc_RUN);
                float32x4_t _weight_hc_N_1 = vld1q_f32(weight_hc_RUN + 4);
                float32x4_t _weight_hc_N_2 = vld1q_f32(weight_hc_RUN + 8);
                float32x4_t _weight_hc_N_3 = vld1q_f32(weight_hc_RUN + 12);
#if __aarch64__
                _gru_N = vfmaq_laneq_f32(_gru_N, _weight_hc_N, _h_cont, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_N_1, _h_cont, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_N_2, _h_cont, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_N_3, _h_cont, 3);
#else
                _gru_N = vmlaq_lane_f32(_gru_N, _weight_hc_N, vget_low_f32(_h_cont), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _weight_hc_N_1, vget_low_f32(_h_cont), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _weight_hc_N_2, vget_high_f32(_h_cont), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _weight_hc_N_3, vget_high_f32(_h_cont), 1);
#endif

                weight_hc_RUN += 16;
            }
            for (; i < num_output; i++)
            {
                float h_cont = hidden_state[i];

                float32x4_t _h_cont = vdupq_n_f32(h_cont);
                float32x4_t _weight_hc_N = vld1q_f32(weight_hc_RUN);
                _gru_N = vmlaq_f32(_gru_N, _weight_hc_N, _h_cont);

                weight_hc_RUN += 4;
            }

            _gru_N = vaddq_f32(_gru_N, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _gru_N = vaddq_f32(_gru_N, _sum2);

            // N = bias_WN + R * (bias_BN + Wh_N * h); the input part is added next
            _gru_N = vmlaq_f32(vld1q_f32(bias_c_RUBNWN + 12), _gru_R, _gru_N);
            _sum1 = vdupq_n_f32(0.f);
            _sum2 = vdupq_n_f32(0.f);
            _sum3 = vdupq_n_f32(0.f);

            i = 0;
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _xi = vld1q_f32(x + i);
                float32x4_t _weight_xc_N = vld1q_f32(weight_xc_RUN);
                float32x4_t _weight_xc_N_1 = vld1q_f32(weight_xc_RUN + 4);
                float32x4_t _weight_xc_N_2 = vld1q_f32(weight_xc_RUN + 8);
                float32x4_t _weight_xc_N_3 = vld1q_f32(weight_xc_RUN + 12);
#if __aarch64__
                _gru_N = vfmaq_laneq_f32(_gru_N, _weight_xc_N, _xi, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_N_1, _xi, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_N_2, _xi, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_N_3, _xi, 3);
#else
                _gru_N = vmlaq_lane_f32(_gru_N, _weight_xc_N, vget_low_f32(_xi), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _weight_xc_N_1, vget_low_f32(_xi), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _weight_xc_N_2, vget_high_f32(_xi), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _weight_xc_N_3, vget_high_f32(_xi), 1);
#endif

                weight_xc_RUN += 16;
            }
            for (; i < size; i++)
            {
                float xi = x[i];

                float32x4_t _xi = vdupq_n_f32(xi);
                float32x4_t _weight_xc_N = vld1q_f32(weight_xc_RUN);
                _gru_N = vmlaq_f32(_gru_N, _weight_xc_N, _xi);

                weight_xc_RUN += 4;
            }

            _gru_N = vaddq_f32(_gru_N, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _gru_N = vaddq_f32(_gru_N, _sum2);

            // tanh(N)
            _gru_N = tanh_ps(_gru_N);

            // stash U and N; R is not needed for the state update
            float* gates_data = gates.row(q / 4);

            vst1q_f32(gates_data, _gru_U);
            vst1q_f32(gates_data + 4, _gru_N);
        }
#endif // __ARM_NEON
        // phase 1 (scalar): leftover outputs, or all outputs without NEON
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
            const float* x = bottom_blob.row(ti);

            // gate reset update
            const float* bias_c_RUBNWN = (const float*)bias_c + q * 4;

#if __ARM_NEON
            // leftover rows follow the rows of the 4-output groups
            const float* weight_xc_RUN = weight_xc.row(q / 4 + q % 4);
            const float* weight_hc_RUN = weight_hc.row(q / 4 + q % 4);
#else
            const float* weight_xc_RUN = weight_xc.row(q);
            const float* weight_hc_RUN = weight_hc.row(q);
#endif

            float R = bias_c_RUBNWN[0];
            float U = bias_c_RUBNWN[1];

            // packed as (R, U) pairs per input element
            for (int i = 0; i < size; i++)
            {
                float xi = x[i];

                R += weight_xc_RUN[0] * xi;
                U += weight_xc_RUN[1] * xi;

                weight_xc_RUN += 2;
            }

            for (int i = 0; i < num_output; i++)
            {
                float h_cont = hidden_state[i];

                R += weight_hc_RUN[0] * h_cont;
                U += weight_hc_RUN[1] * h_cont;

                weight_hc_RUN += 2;
            }

            // sigmoid(R)
            // sigmoid(U)
            R = 1.f / (1.f + expf(-R));
            U = 1.f / (1.f + expf(-U));

            // gate new
            // starts from the BN bias; the N weights follow the (R, U) pairs in the row
            float N = bias_c_RUBNWN[2];

            for (int i = 0; i < num_output; i++)
            {
                float h_cont = hidden_state[i];

                N += weight_hc_RUN[0] * h_cont;

                weight_hc_RUN += 1;
            }

            // N = bias_WN + R * (bias_BN + Wh_N * h); the input part is added next
            N = bias_c_RUBNWN[3] + R * N;

            for (int i = 0; i < size; i++)
            {
                float xi = x[i];

                N += weight_xc_RUN[0] * xi;

                weight_xc_RUN += 1;
            }

            // tanh(N)
            N = tanhf(N);

#if __ARM_NEON
            float* gates_data = gates.row(q / 4 + q % 4);
#else
            float* gates_data = gates.row(q);
#endif

            gates_data[0] = U;
            gates_data[1] = N;
        }

        // h_t := (1 - update) .* new + update .* h_{t-1}
        // phase 2 runs only after every gate has been computed, because phase 1
        // reads the whole previous hidden state
        float* output_data = top_blob.row(ti);

        float* hidden_ptr = hidden_state;

#if __ARM_NEON
        nn_num_output = num_output >> 2;
        remain_num_output_start = nn_num_output << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            const float* gates_data = gates.row(q / 4);

            float32x4_t _gru_U = vld1q_f32(gates_data);
            float32x4_t _gru_N = vld1q_f32(gates_data + 4);

            float32x4_t _gru_H = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _gru_U), _gru_N), vmulq_f32(_gru_U, vld1q_f32(hidden_ptr + q)));

            // the new state is both carried to the next step and emitted as output
            vst1q_f32(hidden_ptr + q, _gru_H);
            vst1q_f32(output_data + q, _gru_H);
        }
#endif // __ARM_NEON
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
#if __ARM_NEON
            const float* gates_data = gates.row(q / 4 + q % 4);
#else
            const float* gates_data = gates.row(q);
#endif

            float U = gates_data[0];
            float N = gates_data[1];

            float H = (1 - U) * N + U * hidden_ptr[q];

            hidden_ptr[q] = H;
            output_data[q] = H;
        }
    }

    return 0;
}

int GRU_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (int8_scale_term)
    {
        return forward_int8(bottom_blob, top_blob, opt);
    }
#endif

    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_fp16s(bottom_blob, top_blob, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
#endif

    int T = bottom_blob.h;

    int num_directions = direction == 2 ? 2 : 1;

    // initial hidden state
    Mat hidden(num_output, 4u, opt.workspace_allocator);
    if (hidden.empty())
        return -100;
    hidden.fill(0.f);

    top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = gru(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        {
            int ret = gru(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
            if (ret != 0)
                return ret;
        }

        hidden.fill(0.0f);

        {
            int ret = gru(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const float* pf = top_blob_forward.row(i);
            const float* pr = top_blob_reverse.row(i);
            float* ptr = top_blob.row(i);

            memcpy(ptr, pf, num_output * sizeof(float));
            memcpy(ptr + num_output, pr, num_output * sizeof(float));
        }
    }

    return 0;
}

// Multi-input / multi-output forward pass.
//
// bottom_blobs[0] is the input sequence; when a second bottom blob is present
// it supplies the initial hidden state (one row per direction), otherwise the
// hidden state starts at zero. top_blobs[0] receives the output sequence and,
// when a second top blob is requested, top_blobs[1] receives the final hidden
// state.
//
// Hands off to the int8 / fp16s / bf16s implementations when their conditions
// hold; otherwise runs the fp32 gru() kernel.
//
// Returns 0 on success, -100 on allocation failure, or the non-zero code
// returned by gru().
int GRU_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
#if NCNN_INT8
    if (int8_scale_term)
    {
        return forward_int8(bottom_blobs, top_blobs, opt);
    }
#endif

    const Mat& bottom_blob = bottom_blobs[0];
    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_fp16s(bottom_blobs, top_blobs, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blobs, top_blobs, opt);
#endif

    int T = bottom_blob.h;
    int num_directions = direction == 2 ? 2 : 1;

    Mat hidden;
    // the hidden state is handed out as top_blobs[1] when requested, so it must
    // then come from the blob allocator rather than the workspace allocator
    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
    if (bottom_blobs.size() == 2)
    {
        // work on a private copy so the caller's initial state is left untouched
        hidden = bottom_blobs[1].clone(hidden_allocator);
        // the copy can come back empty (allocation failure or empty input);
        // gru() would dereference it, so fail like every other allocation here
        if (hidden.empty())
            return -100;
    }
    else
    {
        hidden.create(num_output, num_directions, 4u, hidden_allocator);
        if (hidden.empty())
            return -100;
        hidden.fill(0.f);
    }

    Mat& top_blob = top_blobs[0];
    top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = gru(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        // row 0 of hidden is the forward state, row 1 the reverse state;
        // each gru() call updates its row in place
        Mat hidden0 = hidden.row_range(0, 1);
        {
            int ret = gru(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt);
            if (ret != 0)
                return ret;
        }

        Mat hidden1 = hidden.row_range(1, 1);
        {
            int ret = gru(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        // each output row is [forward | reverse]
        for (int i = 0; i < T; i++)
        {
            const float* pf = top_blob_forward.row(i);
            const float* pr = top_blob_reverse.row(i);
            float* ptr = top_blob.row(i);

            memcpy(ptr, pf, num_output * sizeof(float));
            memcpy(ptr + num_output, pr, num_output * sizeof(float));
        }
    }

    if (top_blobs.size() == 2)
    {
        top_blobs[1] = hidden;
    }

    return 0;
}

#if NCNN_BF16
static int gru_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
{
    int size = bottom_blob.w;
    int T = bottom_blob.h;

    int num_output = top_blob.w;

    // 2 x num_output
#if __ARM_NEON
    Mat gates(4 * 2, num_output / 4 + num_output % 4, 4u, opt.workspace_allocator);
#else
    Mat gates(2, num_output, 4u, opt.workspace_allocator);
#endif
    if (gates.empty())
        return -100;

    // unroll
    for (int t = 0; t < T; t++)
    {
        int ti = reverse ? T - 1 - t : t;

        int remain_num_output_start = 0;
#if __ARM_NEON
        int nn_num_output = num_output >> 2;
        remain_num_output_start = nn_num_output << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            const unsigned short* x = bottom_blob.row<const unsigned short>(ti);

            // gate reset update
            const unsigned short* bias_c_RUBNWN = (const unsigned short*)bias_c + q * 4;

            const unsigned short* weight_xc_RUN = weight_xc.row<const unsigned short>(q / 4);
            const unsigned short* weight_hc_RUN = weight_hc.row<const unsigned short>(q / 4);

            float32x4_t _gru_R = bfloat2float(vld1_u16(bias_c_RUBNWN));
            float32x4_t _gru_U = bfloat2float(vld1_u16(bias_c_RUBNWN + 4));
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);
            float32x4_t _sum4 = vdupq_n_f32(0.f);
            float32x4_t _sum5 = vdupq_n_f32(0.f);
            float32x4_t _sum6 = vdupq_n_f32(0.f);

            int i = 0;
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _xi = bfloat2float(vld1_u16(x + i));
                float32x4_t _weight_xc_R = bfloat2float(vld1_u16(weight_xc_RUN));
                float32x4_t _weight_xc_U = bfloat2float(vld1_u16(weight_xc_RUN + 4));
                float32x4_t _weight_xc_R_1 = bfloat2float(vld1_u16(weight_xc_RUN + 8));
                float32x4_t _weight_xc_U_1 = bfloat2float(vld1_u16(weight_xc_RUN + 12));
                float32x4_t _weight_xc_R_2 = bfloat2float(vld1_u16(weight_xc_RUN + 16));
                float32x4_t _weight_xc_U_2 = bfloat2float(vld1_u16(weight_xc_RUN + 20));
                float32x4_t _weight_xc_R_3 = bfloat2float(vld1_u16(weight_xc_RUN + 24));
                float32x4_t _weight_xc_U_3 = bfloat2float(vld1_u16(weight_xc_RUN + 28));
#if __aarch64__
                _gru_R = vfmaq_laneq_f32(_gru_R, _weight_xc_R, _xi, 0);
                _gru_U = vfmaq_laneq_f32(_gru_U, _weight_xc_U, _xi, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_R_1, _xi, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_U_1, _xi, 1);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_R_2, _xi, 2);
                _sum4 = vfmaq_laneq_f32(_sum4, _weight_xc_U_2, _xi, 2);
                _sum5 = vfmaq_laneq_f32(_sum5, _weight_xc_R_3, _xi, 3);
                _sum6 = vfmaq_laneq_f32(_sum6, _weight_xc_U_3, _xi, 3);
#else
                _gru_R = vmlaq_lane_f32(_gru_R, _weight_xc_R, vget_low_f32(_xi), 0);
                _gru_U = vmlaq_lane_f32(_gru_U, _weight_xc_U, vget_low_f32(_xi), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _weight_xc_R_1, vget_low_f32(_xi), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _weight_xc_U_1, vget_low_f32(_xi), 1);
                _sum3 = vmlaq_lane_f32(_sum3, _weight_xc_R_2, vget_high_f32(_xi), 0);
                _sum4 = vmlaq_lane_f32(_sum4, _weight_xc_U_2, vget_high_f32(_xi), 0);
                _sum5 = vmlaq_lane_f32(_sum5, _weight_xc_R_3, vget_high_f32(_xi), 1);
                _sum6 = vmlaq_lane_f32(_sum6, _weight_xc_U_3, vget_high_f32(_xi), 1);
#endif

                weight_xc_RUN += 32;
            }
            for (; i < size; i++)
            {
                unsigned short xi = x[i];

                float32x4_t _xi = bfloat2float(vdup_n_u16(xi));
                float32x4_t _weight_xc_R = bfloat2float(vld1_u16(weight_xc_RUN));
                float32x4_t _weight_xc_U = bfloat2float(vld1_u16(weight_xc_RUN + 4));
                _gru_R = vmlaq_f32(_gru_R, _weight_xc_R, _xi);
                _gru_U = vmlaq_f32(_gru_U, _weight_xc_U, _xi);

                weight_xc_RUN += 8;
            }

            i = 0;
            for (; i + 3 < num_output; i += 4)
            {
                float32x4_t _h_cont = vld1q_f32((const float*)hidden_state + i);
                float32x4_t _weight_hc_R = bfloat2float(vld1_u16(weight_hc_RUN));
                float32x4_t _weight_hc_U = bfloat2float(vld1_u16(weight_hc_RUN + 4));
                float32x4_t _weight_hc_R_1 = bfloat2float(vld1_u16(weight_hc_RUN + 8));
                float32x4_t _weight_hc_U_1 = bfloat2float(vld1_u16(weight_hc_RUN + 12));
                float32x4_t _weight_hc_R_2 = bfloat2float(vld1_u16(weight_hc_RUN + 16));
                float32x4_t _weight_hc_U_2 = bfloat2float(vld1_u16(weight_hc_RUN + 20));
                float32x4_t _weight_hc_R_3 = bfloat2float(vld1_u16(weight_hc_RUN + 24));
                float32x4_t _weight_hc_U_3 = bfloat2float(vld1_u16(weight_hc_RUN + 28));
#if __aarch64__
                _gru_R = vfmaq_laneq_f32(_gru_R, _weight_hc_R, _h_cont, 0);
                _gru_U = vfmaq_laneq_f32(_gru_U, _weight_hc_U, _h_cont, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_R_1, _h_cont, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_U_1, _h_cont, 1);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_R_2, _h_cont, 2);
                _sum4 = vfmaq_laneq_f32(_sum4, _weight_hc_U_2, _h_cont, 2);
                _sum5 = vfmaq_laneq_f32(_sum5, _weight_hc_R_3, _h_cont, 3);
                _sum6 = vfmaq_laneq_f32(_sum6, _weight_hc_U_3, _h_cont, 3);
#else
                _gru_R = vmlaq_lane_f32(_gru_R, _weight_hc_R, vget_low_f32(_h_cont), 0);
                _gru_U = vmlaq_lane_f32(_gru_U, _weight_hc_U, vget_low_f32(_h_cont), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _weight_hc_R_1, vget_low_f32(_h_cont), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _weight_hc_U_1, vget_low_f32(_h_cont), 1);
                _sum3 = vmlaq_lane_f32(_sum3, _weight_hc_R_2, vget_high_f32(_h_cont), 0);
                _sum4 = vmlaq_lane_f32(_sum4, _weight_hc_U_2, vget_high_f32(_h_cont), 0);
                _sum5 = vmlaq_lane_f32(_sum5, _weight_hc_R_3, vget_high_f32(_h_cont), 1);
                _sum6 = vmlaq_lane_f32(_sum6, _weight_hc_U_3, vget_high_f32(_h_cont), 1);
#endif

                weight_hc_RUN += 32;
            }
            for (; i < num_output; i++)
            {
                float h_cont = hidden_state[i];

                float32x4_t _h_cont = vdupq_n_f32(h_cont);
                float32x4_t _weight_hc_R = bfloat2float(vld1_u16(weight_hc_RUN));
                float32x4_t _weight_hc_U = bfloat2float(vld1_u16(weight_hc_RUN + 4));
                _gru_R = vmlaq_f32(_gru_R, _weight_hc_R, _h_cont);
                _gru_U = vmlaq_f32(_gru_U, _weight_hc_U, _h_cont);

                weight_hc_RUN += 8;
            }

            _gru_R = vaddq_f32(_gru_R, _sum1);
            _gru_U = vaddq_f32(_gru_U, _sum2);
            _sum3 = vaddq_f32(_sum3, _sum5);
            _sum4 = vaddq_f32(_sum4, _sum6);
            _gru_R = vaddq_f32(_gru_R, _sum3);
            _gru_U = vaddq_f32(_gru_U, _sum4);

            // sigmoid(R)
            // sigmoid(U)
            _gru_R = sigmoid_ps(_gru_R);
            _gru_U = sigmoid_ps(_gru_U);

            // gate new
            float32x4_t _gru_N = bfloat2float(vld1_u16(bias_c_RUBNWN + 8));
            _sum1 = vdupq_n_f32(0.f);
            _sum2 = vdupq_n_f32(0.f);
            _sum3 = vdupq_n_f32(0.f);

            i = 0;
            for (; i + 3 < num_output; i += 4)
            {
                float32x4_t _h_cont = vld1q_f32((const float*)hidden_state + i);
                float32x4_t _weight_hc_N = bfloat2float(vld1_u16(weight_hc_RUN));
                float32x4_t _weight_hc_N_1 = bfloat2float(vld1_u16(weight_hc_RUN + 4));
                float32x4_t _weight_hc_N_2 = bfloat2float(vld1_u16(weight_hc_RUN + 8));
                float32x4_t _weight_hc_N_3 = bfloat2float(vld1_u16(weight_hc_RUN + 12));
#if __aarch64__
                _gru_N = vfmaq_laneq_f32(_gru_N, _weight_hc_N, _h_cont, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_N_1, _h_cont, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_N_2, _h_cont, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_N_3, _h_cont, 3);
#else
                _gru_N = vmlaq_lane_f32(_gru_N, _weight_hc_N, vget_low_f32(_h_cont), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _weight_hc_N_1, vget_low_f32(_h_cont), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _weight_hc_N_2, vget_high_f32(_h_cont), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _weight_hc_N_3, vget_high_f32(_h_cont), 1);
#endif

                weight_hc_RUN += 16;
            }
            for (; i < num_output; i++)
            {
                float h_cont = hidden_state[i];

                float32x4_t _h_cont = vdupq_n_f32(h_cont);
                float32x4_t _weight_hc_N = bfloat2float(vld1_u16(weight_hc_RUN));
                _gru_N = vmlaq_f32(_gru_N, _weight_hc_N, _h_cont);

                weight_hc_RUN += 4;
            }

            _gru_N = vaddq_f32(_gru_N, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _gru_N = vaddq_f32(_gru_N, _sum2);

            _gru_N = vmlaq_f32(bfloat2float(vld1_u16(bias_c_RUBNWN + 12)), _gru_R, _gru_N);
            _sum1 = vdupq_n_f32(0.f);
            _sum2 = vdupq_n_f32(0.f);
            _sum3 = vdupq_n_f32(0.f);

            i = 0;
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _xi = bfloat2float(vld1_u16(x + i));
                float32x4_t _weight_xc_N = bfloat2float(vld1_u16(weight_xc_RUN));
                float32x4_t _weight_xc_N_1 = bfloat2float(vld1_u16(weight_xc_RUN + 4));
                float32x4_t _weight_xc_N_2 = bfloat2float(vld1_u16(weight_xc_RUN + 8));
                float32x4_t _weight_xc_N_3 = bfloat2float(vld1_u16(weight_xc_RUN + 12));
#if __aarch64__
                _gru_N = vfmaq_laneq_f32(_gru_N, _weight_xc_N, _xi, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_N_1, _xi, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_N_2, _xi, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_N_3, _xi, 3);
#else
                _gru_N = vmlaq_lane_f32(_gru_N, _weight_xc_N, vget_low_f32(_xi), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _weight_xc_N_1, vget_low_f32(_xi), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _weight_xc_N_2, vget_high_f32(_xi), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _weight_xc_N_3, vget_high_f32(_xi), 1);
#endif

                weight_xc_RUN += 16;
            }
            for (; i < size; i++)
            {
                unsigned short xi = x[i];

                float32x4_t _xi = bfloat2float(vdup_n_u16(xi));
                float32x4_t _weight_xc_N = bfloat2float(vld1_u16(weight_xc_RUN));
                _gru_N = vmlaq_f32(_gru_N, _weight_xc_N, _xi);

                weight_xc_RUN += 4;
            }

            _gru_N = vaddq_f32(_gru_N, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _gru_N = vaddq_f32(_gru_N, _sum2);

            // tanh(N)
            _gru_N = tanh_ps(_gru_N);

            float* gates_data = gates.row(q / 4);

            vst1q_f32(gates_data, _gru_U);
            vst1q_f32(gates_data + 4, _gru_N);
        }
#endif // __ARM_NEON
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
            const unsigned short* x = bottom_blob.row<const unsigned short>(ti);

            // gate reset update
            const unsigned short* bias_c_RUBNWN = (const unsigned short*)bias_c + q * 4;

#if __ARM_NEON
            const unsigned short* weight_xc_RUN = weight_xc.row<const unsigned short>(q / 4 + q % 4);
            const unsigned short* weight_hc_RUN = weight_hc.row<const unsigned short>(q / 4 + q % 4);
#else
            const unsigned short* weight_xc_RUN = weight_xc.row<const unsigned short>(q);
            const unsigned short* weight_hc_RUN = weight_hc.row<const unsigned short>(q);
#endif

            float R = bfloat16_to_float32(bias_c_RUBNWN[0]);
            float U = bfloat16_to_float32(bias_c_RUBNWN[1]);

            for (int i = 0; i < size; i++)
            {
                float xi = bfloat16_to_float32(x[i]);

                R += bfloat16_to_float32(weight_xc_RUN[0]) * xi;
                U += bfloat16_to_float32(weight_xc_RUN[1]) * xi;

                weight_xc_RUN += 2;
            }

            for (int i = 0; i < num_output; i++)
            {
                float h_cont = hidden_state[i];

                R += bfloat16_to_float32(weight_hc_RUN[0]) * h_cont;
                U += bfloat16_to_float32(weight_hc_RUN[1]) * h_cont;

                weight_hc_RUN += 2;
            }

            // sigmoid(R)
            // sigmoid(U)
            R = 1.f / (1.f + expf(-R));
            U = 1.f / (1.f + expf(-U));

            // gate new
            float N = bfloat16_to_float32(bias_c_RUBNWN[2]);

            for (int i = 0; i < num_output; i++)
            {
                float h_cont = hidden_state[i];

                N += bfloat16_to_float32(weight_hc_RUN[0]) * h_cont;

                weight_hc_RUN += 1;
            }

            N = bfloat16_to_float32(bias_c_RUBNWN[3]) + R * N;

            for (int i = 0; i < size; i++)
            {
                float xi = bfloat16_to_float32(x[i]);

                N += bfloat16_to_float32(weight_xc_RUN[0]) * xi;

                weight_xc_RUN += 1;
            }

            // tanh(N)
            N = tanhf(N);

#if __ARM_NEON
            float* gates_data = gates.row(q / 4 + q % 4);
#else
            float* gates_data = gates.row(q);
#endif

            gates_data[0] = U;
            gates_data[1] = N;
        }

        // h_t := (1 - update) .* new + update .* h_{t-1}
        unsigned short* output_data = top_blob.row<unsigned short>(ti);

        float* hidden_ptr = hidden_state;

#if __ARM_NEON
        nn_num_output = num_output >> 2;
        remain_num_output_start = nn_num_output << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            const float* gates_data = gates.row(q / 4);

            float32x4_t _gru_U = vld1q_f32(gates_data);
            float32x4_t _gru_N = vld1q_f32(gates_data + 4);

            float32x4_t _gru_H = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _gru_U), _gru_N), vmulq_f32(_gru_U, vld1q_f32(hidden_ptr + q)));

            vst1q_f32(hidden_ptr + q, _gru_H);
            vst1_u16(output_data + q, float2bfloat(_gru_H));
        }
#endif // __ARM_NEON
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
#if __ARM_NEON
            const float* gates_data = gates.row(q / 4 + q % 4);
#else
            const float* gates_data = gates.row(q);
#endif

            float U = gates_data[0];
            float N = gates_data[1];

            float H = (1 - U) * N + U * hidden_ptr[q];

            hidden_ptr[q] = H;
            output_data[q] = float32_to_bfloat16(H);
        }
    }

    return 0;
}

// Convert the fp32 GRU weights and biases to bfloat16 and repack them into the
// interleaved layout written below.
//
// Packed layout, per direction (one channel each):
//   * NEON build: every group of 4 consecutive outputs shares one row (row q / 4).
//     The xc row stores, for each input index i, 4 R weights followed by 4 U
//     weights (8 values per i), and after all of those 4 N weights per i.
//     The hc row uses the same scheme over the num_output hidden inputs.
//     The bias stores 16 values per group: R[4], U[4], BN[4], WN[4].
//     Left-over outputs (num_output % 4) each get their own row at q / 4 + q % 4
//     using the scalar layout described next.
//   * Scalar layout: one row per output holding interleaved (R, U) pairs for
//     every input, followed by the N weights; bias is R, U, BN, WN per output.
//
// Returns 0 on success, -100 if any packed buffer could not be allocated.
// When opt.lightmode is set the original fp32 weights are released afterwards.
int GRU_arm::create_pipeline_bf16s(const Option& opt)
{
    // pack RUN
    int num_directions = direction == 2 ? 2 : 1;
    int size = weight_data_size / num_directions / num_output / 3;

#if __ARM_NEON
    weight_xc_data_packed.create(size * 12, num_output / 4 + num_output % 4, num_directions, 2u, 1);
    bias_c_data_packed.create(num_output, 1, num_directions, 8u, 4);
    weight_hc_data_packed.create(num_output * 12, num_output / 4 + num_output % 4, num_directions, 2u, 1);
#else
    weight_xc_data_packed.create(size * 3, num_output, num_directions, 2u, 1);
    bias_c_data_packed.create(num_output, 1, num_directions, 8u, 4);
    weight_hc_data_packed.create(num_output * 3, num_output, num_directions, 2u, 1);
#endif

    // bail out before the packing loops write through a null data pointer
    if (weight_xc_data_packed.empty() || bias_c_data_packed.empty() || weight_hc_data_packed.empty())
        return -100;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int dr = 0; dr < num_directions; dr++)
    {
        const Mat weight_xc = weight_xc_data.channel(dr);
        const Mat bias_c = bias_c_data.channel(dr);
        const Mat weight_hc = weight_hc_data.channel(dr);

        Mat weight_xc_data_packed_dr = weight_xc_data_packed.channel(dr);
        Mat bias_c_data_packed_dr = bias_c_data_packed.channel(dr);
        Mat weight_hc_data_packed_dr = weight_hc_data_packed.channel(dr);

        // source bias rows are read as R, U, WN, BN in that order
        const float* bias_c_R = bias_c.row(0);
        const float* bias_c_U = bias_c.row(1);
        const float* bias_c_WN = bias_c.row(2);
        const float* bias_c_BN = bias_c.row(3);

        unsigned short* bias_c_RUBNWN = bias_c_data_packed_dr.row<unsigned short>(0);

        int q = 0;
#if __ARM_NEON
        // groups of 4 outputs packed together
        for (; q + 3 < num_output; q += 4)
        {
            // packed bias order is R, U, BN, WN (BN before WN)
            vst1_u16(bias_c_RUBNWN, float2bfloat(vld1q_f32(bias_c_R + q)));
            vst1_u16(bias_c_RUBNWN + 4, float2bfloat(vld1q_f32(bias_c_U + q)));
            vst1_u16(bias_c_RUBNWN + 8, float2bfloat(vld1q_f32(bias_c_BN + q)));
            vst1_u16(bias_c_RUBNWN + 12, float2bfloat(vld1q_f32(bias_c_WN + q)));

            bias_c_RUBNWN += 16;

            // source weights: gate g of output q lives in row num_output * g + q
            const float* weight_xc_R = weight_xc.row(num_output * 0 + q);
            const float* weight_xc_U = weight_xc.row(num_output * 1 + q);
            const float* weight_xc_N = weight_xc.row(num_output * 2 + q);

            const float* weight_xc_R_1 = weight_xc.row(num_output * 0 + q + 1);
            const float* weight_xc_U_1 = weight_xc.row(num_output * 1 + q + 1);
            const float* weight_xc_N_1 = weight_xc.row(num_output * 2 + q + 1);

            const float* weight_xc_R_2 = weight_xc.row(num_output * 0 + q + 2);
            const float* weight_xc_U_2 = weight_xc.row(num_output * 1 + q + 2);
            const float* weight_xc_N_2 = weight_xc.row(num_output * 2 + q + 2);

            const float* weight_xc_R_3 = weight_xc.row(num_output * 0 + q + 3);
            const float* weight_xc_U_3 = weight_xc.row(num_output * 1 + q + 3);
            const float* weight_xc_N_3 = weight_xc.row(num_output * 2 + q + 3);

            const float* weight_hc_R = weight_hc.row(num_output * 0 + q);
            const float* weight_hc_U = weight_hc.row(num_output * 1 + q);
            const float* weight_hc_N = weight_hc.row(num_output * 2 + q);

            const float* weight_hc_R_1 = weight_hc.row(num_output * 0 + q + 1);
            const float* weight_hc_U_1 = weight_hc.row(num_output * 1 + q + 1);
            const float* weight_hc_N_1 = weight_hc.row(num_output * 2 + q + 1);

            const float* weight_hc_R_2 = weight_hc.row(num_output * 0 + q + 2);
            const float* weight_hc_U_2 = weight_hc.row(num_output * 1 + q + 2);
            const float* weight_hc_N_2 = weight_hc.row(num_output * 2 + q + 2);

            const float* weight_hc_R_3 = weight_hc.row(num_output * 0 + q + 3);
            const float* weight_hc_U_3 = weight_hc.row(num_output * 1 + q + 3);
            const float* weight_hc_N_3 = weight_hc.row(num_output * 2 + q + 3);

            unsigned short* weight_xc_RUN = weight_xc_data_packed_dr.row<unsigned short>(q / 4);
            unsigned short* weight_hc_RUN = weight_hc_data_packed_dr.row<unsigned short>(q / 4);

            // input weights, reset + update gates: 4 R then 4 U per input
            for (int i = 0; i < size; i++)
            {
                weight_xc_RUN[0] = float32_to_bfloat16(weight_xc_R[i]);
                weight_xc_RUN[1] = float32_to_bfloat16(weight_xc_R_1[i]);
                weight_xc_RUN[2] = float32_to_bfloat16(weight_xc_R_2[i]);
                weight_xc_RUN[3] = float32_to_bfloat16(weight_xc_R_3[i]);
                weight_xc_RUN[4] = float32_to_bfloat16(weight_xc_U[i]);
                weight_xc_RUN[5] = float32_to_bfloat16(weight_xc_U_1[i]);
                weight_xc_RUN[6] = float32_to_bfloat16(weight_xc_U_2[i]);
                weight_xc_RUN[7] = float32_to_bfloat16(weight_xc_U_3[i]);

                weight_xc_RUN += 8;
            }

            // hidden weights, reset + update gates: 4 R then 4 U per hidden input
            for (int i = 0; i < num_output; i++)
            {
                weight_hc_RUN[0] = float32_to_bfloat16(weight_hc_R[i]);
                weight_hc_RUN[1] = float32_to_bfloat16(weight_hc_R_1[i]);
                weight_hc_RUN[2] = float32_to_bfloat16(weight_hc_R_2[i]);
                weight_hc_RUN[3] = float32_to_bfloat16(weight_hc_R_3[i]);
                weight_hc_RUN[4] = float32_to_bfloat16(weight_hc_U[i]);
                weight_hc_RUN[5] = float32_to_bfloat16(weight_hc_U_1[i]);
                weight_hc_RUN[6] = float32_to_bfloat16(weight_hc_U_2[i]);
                weight_hc_RUN[7] = float32_to_bfloat16(weight_hc_U_3[i]);

                weight_hc_RUN += 8;
            }

            // input weights, new gate: appended after the R/U section of the row
            for (int i = 0; i < size; i++)
            {
                weight_xc_RUN[0] = float32_to_bfloat16(weight_xc_N[i]);
                weight_xc_RUN[1] = float32_to_bfloat16(weight_xc_N_1[i]);
                weight_xc_RUN[2] = float32_to_bfloat16(weight_xc_N_2[i]);
                weight_xc_RUN[3] = float32_to_bfloat16(weight_xc_N_3[i]);

                weight_xc_RUN += 4;
            }

            // hidden weights, new gate: appended after the R/U section of the row
            for (int i = 0; i < num_output; i++)
            {
                weight_hc_RUN[0] = float32_to_bfloat16(weight_hc_N[i]);
                weight_hc_RUN[1] = float32_to_bfloat16(weight_hc_N_1[i]);
                weight_hc_RUN[2] = float32_to_bfloat16(weight_hc_N_2[i]);
                weight_hc_RUN[3] = float32_to_bfloat16(weight_hc_N_3[i]);

                weight_hc_RUN += 4;
            }
        }
#endif // __ARM_NEON
        // remaining outputs (all of them in a non-NEON build), one per row
        for (; q < num_output; q++)
        {
            bias_c_RUBNWN[0] = float32_to_bfloat16(bias_c_R[q]);
            bias_c_RUBNWN[1] = float32_to_bfloat16(bias_c_U[q]);
            bias_c_RUBNWN[2] = float32_to_bfloat16(bias_c_BN[q]);
            bias_c_RUBNWN[3] = float32_to_bfloat16(bias_c_WN[q]);

            bias_c_RUBNWN += 4;

            const float* weight_xc_R = weight_xc.row(num_output * 0 + q);
            const float* weight_xc_U = weight_xc.row(num_output * 1 + q);
            const float* weight_xc_N = weight_xc.row(num_output * 2 + q);

            const float* weight_hc_R = weight_hc.row(num_output * 0 + q);
            const float* weight_hc_U = weight_hc.row(num_output * 1 + q);
            const float* weight_hc_N = weight_hc.row(num_output * 2 + q);

#if __ARM_NEON
            // rows for the tail outputs follow the num_output / 4 grouped rows
            unsigned short* weight_xc_RUN = weight_xc_data_packed_dr.row<unsigned short>(q / 4 + q % 4);
            unsigned short* weight_hc_RUN = weight_hc_data_packed_dr.row<unsigned short>(q / 4 + q % 4);
#else
            unsigned short* weight_xc_RUN = weight_xc_data_packed_dr.row<unsigned short>(q);
            unsigned short* weight_hc_RUN = weight_hc_data_packed_dr.row<unsigned short>(q);
#endif // __ARM_NEON

            for (int i = 0; i < size; i++)
            {
                weight_xc_RUN[0] = float32_to_bfloat16(weight_xc_R[i]);
                weight_xc_RUN[1] = float32_to_bfloat16(weight_xc_U[i]);

                weight_xc_RUN += 2;
            }

            for (int i = 0; i < num_output; i++)
            {
                weight_hc_RUN[0] = float32_to_bfloat16(weight_hc_R[i]);
                weight_hc_RUN[1] = float32_to_bfloat16(weight_hc_U[i]);

                weight_hc_RUN += 2;
            }

            for (int i = 0; i < size; i++)
            {
                weight_xc_RUN[0] = float32_to_bfloat16(weight_xc_N[i]);

                weight_xc_RUN += 1;
            }

            for (int i = 0; i < num_output; i++)
            {
                weight_hc_RUN[0] = float32_to_bfloat16(weight_hc_N[i]);

                weight_hc_RUN += 1;
            }
        }
    }

    // the fp32 originals are no longer needed once packed
    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
}

int GRU_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int T = bottom_blob.h;

    int num_directions = direction == 2 ? 2 : 1;

    // initial hidden state
    Mat hidden(num_output, 4u, opt.workspace_allocator);
    if (hidden.empty())
        return -100;
    hidden.fill(0.f);

    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = gru_bf16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        {
            int ret = gru_bf16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
            if (ret != 0)
                return ret;
        }

        hidden.fill(0.f);

        {
            int ret = gru_bf16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const unsigned short* pf = top_blob_forward.row<const unsigned short>(i);
            const unsigned short* pr = top_blob_reverse.row<const unsigned short>(i);
            unsigned short* ptr = top_blob.row<unsigned short>(i);

            memcpy(ptr, pf, num_output * sizeof(unsigned short));
            memcpy(ptr + num_output, pr, num_output * sizeof(unsigned short));
        }
    }

    return 0;
}

int GRU_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    int T = bottom_blob.h;
    int num_directions = direction == 2 ? 2 : 1;

    Mat hidden;
    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
    if (bottom_blobs.size() == 2)
    {
        Option opt_cast = opt;
        opt_cast.blob_allocator = hidden_allocator;
        cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt_cast);
    }
    else
    {
        hidden.create(num_output, num_directions, 4u, hidden_allocator);
        if (hidden.empty())
            return -100;
        hidden.fill(0.f);
    }

    Mat& top_blob = top_blobs[0];
    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = gru_bf16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        Mat hidden0 = hidden.row_range(0, 1);
        {
            int ret = gru_bf16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt);
            if (ret != 0)
                return ret;
        }

        Mat hidden1 = hidden.row_range(1, 1);
        {
            int ret = gru_bf16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const unsigned short* pf = top_blob_forward.row<const unsigned short>(i);
            const unsigned short* pr = top_blob_reverse.row<const unsigned short>(i);
            unsigned short* ptr = top_blob.row<unsigned short>(i);

            memcpy(ptr, pf, num_output * sizeof(unsigned short));
            memcpy(ptr + num_output, pr, num_output * sizeof(unsigned short));
        }
    }

    if (top_blobs.size() == 2)
    {
        cast_float32_to_bfloat16(hidden, top_blobs[1], opt);
    }

    return 0;
}
#endif // NCNN_BF16

#if NCNN_INT8
int GRU_arm::create_pipeline_int8(const Option& opt)
{
    const int num_directions = direction == 2 ? 2 : 1;
    const int size = weight_data_size / num_directions / num_output / 3;

    gru_transform_weight_int8(weight_xc_data, weight_xc_data_int8_scales, weight_hc_data, weight_hc_data_int8_scales, bias_c_data, weight_data_tm, weight_data_tm_int8_descales, bias_c_data_packed, size, num_output, num_directions, opt);

    if (opt.lightmode)
    {
        weight_xc_data.release();
        weight_hc_data.release();
        bias_c_data.release();
        weight_xc_data_int8_scales.release();
        weight_hc_data_int8_scales.release();
    }

    return 0;
}

void GRU_arm::dynamic_quantize(const Mat& bottom_blob, int elemtype, Mat& bottom_blob_int8, Mat& bottom_blob_int8_descales, const Option& opt) const
{
    int size = bottom_blob.w;
    int T = bottom_blob.h;

    // dynamic quantize bottom_blob
    bottom_blob_int8_descales.create(T, (size_t)4u, 1, opt.blob_allocator);

    Mat bottom_blob_int8_scales(T, (size_t)4u, 1, opt.blob_allocator);

    if (elemtype == 1)
    {
        // fp32
        for (int t = 0; t < T; t++)
        {
            const float* x = bottom_blob.row(t);

            float absmax = 0.f;
            for (int i = 0; i < size; i++)
            {
                absmax = std::max(absmax, (float)fabs(x[i]));
            }

            bottom_blob_int8_scales[t] = 127.f / absmax;
            bottom_blob_int8_descales[t] = absmax / 127.f;
        }
    }
    if (elemtype == 2)
    {
        // fp16
        for (int t = 0; t < T; t++)
        {
            const unsigned short* x = bottom_blob.row<const unsigned short>(t);

            float absmax = 0.f;
            for (int i = 0; i < size; i++)
            {
                absmax = std::max(absmax, (float)fabs(float16_to_float32(x[i])));
            }

            bottom_blob_int8_scales[t] = 127.f / absmax;
            bottom_blob_int8_descales[t] = absmax / 127.f;
        }
    }
    if (elemtype == 4)
    {
        // bf16
        for (int t = 0; t < T; t++)
        {
            const unsigned short* x = bottom_blob.row<const unsigned short>(t);

            float absmax = 0.f;
            for (int i = 0; i < size; i++)
            {
                absmax = std::max(absmax, (float)fabs(bfloat16_to_float32(x[i])));
            }

            bottom_blob_int8_scales[t] = 127.f / absmax;
            bottom_blob_int8_descales[t] = absmax / 127.f;
        }
    }

    quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt);
}

int GRU_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elemtype = 1; // fp32
    {
        int elembits = bottom_blob.elembits();

        // clang-format off
        // *INDENT-OFF*

#if NCNN_ARM82
        if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        {
            elemtype = 2; // fp16
        }
        else
#endif
#if NCNN_BF16
        if (opt.use_bf16_storage && elembits == 16)
        {
            elemtype = 4; // bf16
        }
        else
#endif
        {
            // fp32
        }

        // *INDENT-ON*
        // clang-format on
    }

    int T = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;

    int num_directions = direction == 2 ? 2 : 1;

    // initial hidden state
    Mat hidden(num_output, 4u, opt.workspace_allocator);
    if (hidden.empty())
        return -100;
    hidden.fill(0.f);

    top_blob.create(num_output * num_directions, T, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // dynamic quantize bottom_blob
    Mat bottom_blob_int8;
    Mat bottom_blob_int8_descales;
    {
        Option opt_quant = opt;
        opt_quant.blob_allocator = opt.workspace_allocator;
        opt_quant.use_packing_layout = false;
        dynamic_quantize(bottom_blob, elemtype, bottom_blob_int8, bottom_blob_int8_descales, opt_quant);
    }

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        gru_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob, elemtype, direction, weight_data_tm.channel(0), weight_data_tm_int8_descales.channel(0), bias_c_data_packed.channel(0), hidden, opt);
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, elemsize, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, elemsize, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        {
            gru_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob_forward, elemtype, 0, weight_data_tm.channel(0), weight_data_tm_int8_descales.channel(0), bias_c_data_packed.channel(0), hidden, opt);
        }

        hidden.fill(0.f);

        {
            gru_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob_reverse, elemtype, 1, weight_data_tm.channel(1), weight_data_tm_int8_descales.channel(1), bias_c_data_packed.channel(1), hidden, opt);
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const unsigned char* pf = top_blob_forward.row<const unsigned char>(i);
            const unsigned char* pr = top_blob_reverse.row<const unsigned char>(i);
            unsigned char* ptr = top_blob.row<unsigned char>(i);

            memcpy(ptr, pf, num_output * elemsize);
            memcpy(ptr + num_output * elemsize, pr, num_output * elemsize);
        }
    }

    return 0;
}

// int8-weight forward pass for the multi-blob case.
//
// bottom_blobs[0] is the input sequence; its storage type (fp32 / fp16 /
// bf16) is detected from the element width and the enabled options.
// An optional bottom_blobs[1] supplies the initial hidden state in the same
// storage type; it is copied / converted to fp32. Without it the state
// starts at zero.
// top_blobs[0] receives the output sequence with the input's element size;
// an optional top_blobs[1] receives the final hidden state converted back to
// the input storage type.
// Returns 0 on success, -100 on allocation failure.
int GRU_arm::forward_int8(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];

    int elemtype = 1; // fp32
    {
        int elembits = bottom_blob.elembits();

        // clang-format off
        // *INDENT-OFF*

#if NCNN_ARM82
        if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        {
            elemtype = 2; // fp16
        }
        else
#endif
#if NCNN_BF16
        if (opt.use_bf16_storage && elembits == 16)
        {
            elemtype = 4; // bf16
        }
        else
#endif
        {
            // fp32
        }

        // *INDENT-ON*
        // clang-format on
    }

    int T = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;
    int num_directions = direction == 2 ? 2 : 1;

    Mat hidden;
    // the state is allocated from the blob allocator only when it is also an output
    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
    if (bottom_blobs.size() == 2)
    {
        if (elemtype == 1)
        {
            hidden = bottom_blobs[1].clone(hidden_allocator);
        }
        if (elemtype == 2)
        {
            Option opt_cast = opt;
            opt_cast.blob_allocator = hidden_allocator;
            cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
        }
        if (elemtype == 4)
        {
            Option opt_cast = opt;
            opt_cast.blob_allocator = hidden_allocator;
            cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt_cast);
        }
        // clone / cast leave hidden empty when the fp32 copy cannot be allocated
        if (hidden.empty())
            return -100;
    }
    else
    {
        hidden.create(num_output, num_directions, 4u, hidden_allocator);
        if (hidden.empty())
            return -100;
        hidden.fill(0.f);
    }

    Mat& top_blob = top_blobs[0];
    top_blob.create(num_output * num_directions, T, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // dynamic quantize bottom_blob
    Mat bottom_blob_int8;
    Mat bottom_blob_int8_descales;
    {
        Option opt_quant = opt;
        opt_quant.blob_allocator = opt.workspace_allocator;
        opt_quant.use_packing_layout = false;
        dynamic_quantize(bottom_blob, elemtype, bottom_blob_int8, bottom_blob_int8_descales, opt_quant);
    }
    // quantization reports failure only by leaving its outputs empty
    if (bottom_blob_int8.empty() || bottom_blob_int8_descales.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        gru_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob, elemtype, direction, weight_data_tm.channel(0), weight_data_tm_int8_descales.channel(0), bias_c_data_packed.channel(0), hidden, opt);
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, elemsize, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, elemsize, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        // row 0 of hidden is the forward state, row 1 the reverse state
        Mat hidden0 = hidden.row_range(0, 1);
        {
            gru_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob_forward, elemtype, 0, weight_data_tm.channel(0), weight_data_tm_int8_descales.channel(0), bias_c_data_packed.channel(0), hidden0, opt);
        }

        Mat hidden1 = hidden.row_range(1, 1);
        {
            gru_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob_reverse, elemtype, 1, weight_data_tm.channel(1), weight_data_tm_int8_descales.channel(1), bias_c_data_packed.channel(1), hidden1, opt);
        }

        // concat w
        // byte pointers are used because the element size depends on elemtype
        for (int i = 0; i < T; i++)
        {
            const unsigned char* pf = top_blob_forward.row<const unsigned char>(i);
            const unsigned char* pr = top_blob_reverse.row<const unsigned char>(i);
            unsigned char* ptr = top_blob.row<unsigned char>(i);

            memcpy(ptr, pf, num_output * elemsize);
            memcpy(ptr + num_output * elemsize, pr, num_output * elemsize);
        }
    }

    // export the final state in the same storage type as the input
    if (top_blobs.size() == 2)
    {
        if (elemtype == 1)
        {
            top_blobs[1] = hidden;
        }
        if (elemtype == 2)
        {
            cast_float32_to_float16(hidden, top_blobs[1], opt);
        }
        if (elemtype == 4)
        {
            cast_float32_to_bfloat16(hidden, top_blobs[1], opt);
        }
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn


================================================
FILE: src/layer/arm/gru_arm.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_GRU_ARM_H
#define LAYER_GRU_ARM_H

#include "gru.h"

namespace ncnn {

// ARM-specific GRU layer.
// Derives from GRU and overrides create_pipeline() plus both forward()
// overloads. Depending on build-time switches (NCNN_ARM82, NCNN_BF16,
// NCNN_INT8) additional per-precision helpers are declared below.
class GRU_arm : public GRU
{
public:
    GRU_arm();

    virtual int create_pipeline(const Option& opt);
    // Single-input / single-output variant.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    // Multi-blob variant.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 storage path; only declared when NCNN_ARM82 is enabled.
    int create_pipeline_fp16s(const Option& opt);
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 storage path; only declared when NCNN_BF16 is enabled.
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
#if NCNN_INT8
    // int8 path; only declared when NCNN_INT8 is enabled.
    int create_pipeline_int8(const Option& opt);
    // Produces bottom_blob_int8 and bottom_blob_int8_descales from bottom_blob;
    // elemtype selects the input element type (forward_int8 uses 1, 2 and 4
    // for float32, float16 and bfloat16 respectively).
    void dynamic_quantize(const Mat& bottom_blob, int elemtype, Mat& bottom_blob_int8, Mat& bottom_blob_int8_descales, const Option& opt) const;
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_int8(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif

public:
    // Repacked weights and biases, one channel per direction.
    // create_pipeline_fp16s() allocates these three with num_directions channels.
    Mat weight_xc_data_packed;
    Mat bias_c_data_packed;
    Mat weight_hc_data_packed;

    // Transformed weights consumed by the int8 forward path (indexed per
    // direction through channel()).
    Mat weight_data_tm;

#if NCNN_INT8
    // Descale factors that accompany weight_data_tm in the int8 forward path.
    Mat weight_data_tm_int8_descales;
#endif
};

} // namespace ncnn

#endif // LAYER_GRU_ARM_H


================================================
FILE: src/layer/arm/gru_arm_asimddp.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cpu.h"
#include "mat.h"
#include "layer.h"
#include "arm_activation.h"
#include "arm_usability.h"

namespace ncnn {

#include "gru_int8.h"

// Externally visible entry point that forwards every argument unchanged to
// gru_transform_weight_int8(), which comes from "gru_int8.h" included inside
// this namespace above.
// NOTE(review): presumably this translation unit is compiled with the
// dot-product ISA enabled so the included implementation is specialized for
// it - confirm against the build configuration.
void gru_transform_weight_int8_asimddp(const Mat& weight_xc, const Mat& weight_xc_int8_scales, const Mat& weight_hc, const Mat& weight_hc_int8_scales, const Mat& bias_c, Mat& weight_data_tm, Mat& weight_data_tm_int8_descales, Mat& bias_c_tm, int size, int num_output, int num_directions, const Option& opt)
{
    gru_transform_weight_int8(weight_xc, weight_xc_int8_scales, weight_hc, weight_hc_int8_scales, bias_c, weight_data_tm, weight_data_tm_int8_descales, bias_c_tm, size, num_output, num_directions, opt);
}

// Externally visible entry point that forwards every argument unchanged to
// gru_int8(), which comes from "gru_int8.h" included inside this namespace
// above. Any status returned by gru_int8() is not propagated because this
// wrapper returns void.
// NOTE(review): presumably this translation unit is compiled with the
// dot-product ISA enabled so the included implementation is specialized for
// it - confirm against the build configuration.
void gru_int8_asimddp(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_descales, Mat& top_blob, int elemtype, int reverse, const Mat& weight_data_tm, const Mat& weight_data_tm_int8_descales, const Mat& bias_c, Mat& hidden_state, const Option& opt)
{
    gru_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob, elemtype, reverse, weight_data_tm, weight_data_tm_int8_descales, bias_c, hidden_state, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/arm/gru_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "gru_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// GRU over all time steps with fp16 storage AND fp16 arithmetic.
//
// bottom_blob   : rows are read as __fp16; w = input size, h = number of steps T.
// top_blob      : rows are written as __fp16; w = num_output.
// reverse       : when non-zero, time steps are visited from T-1 down to 0.
// weight_xc/hc  : packed __fp16 weights, one row per block of 4 outputs and one
//                 row per leftover output. Within a row the reset/update weights
//                 come first and the new-gate weights follow (the same pointer
//                 keeps advancing across both phases).
// bias_c        : packed __fp16 biases, 16 values per block of 4 outputs
//                 (R, U, BN, WN as named by create_pipeline_fp16s) and 4 values
//                 per leftover output.
// hidden_state  : float buffer of num_output values, read and updated in place.
//
// Returns 0 on success, -100 if the scratch "gates" Mat cannot be allocated.
static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
{
    int size = bottom_blob.w;
    int T = bottom_blob.h;

    int num_output = top_blob.w;

    // 2 x num_output
    // Scratch for the update (U) and new (N) gate values, stored as float.
    // One row of 8 floats per block of 4 outputs (4 x U then 4 x N) plus one
    // row per leftover output (only the first two floats are used there).
    Mat gates(4 * 2, num_output / 4 + num_output % 4, 4u, opt.workspace_allocator);
    if (gates.empty())
        return -100;

    // unroll
    for (int t = 0; t < T; t++)
    {
        int ti = reverse ? T - 1 - t : t;

        // Phase 1: compute the gates. Every iteration writes only its own
        // gates row; hidden_state is only read here and is updated in phase 2.
        int nn_num_output = num_output >> 2;
        int remain_num_output_start = nn_num_output << 2;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            const __fp16* x = bottom_blob.row<const __fp16>(ti);

            // gate reset update
            const __fp16* bias_c_RUBNWN = (const __fp16*)bias_c + q * 4;

            const __fp16* weight_xc_RUN = weight_xc.row<const __fp16>(q / 4);
            const __fp16* weight_hc_RUN = weight_hc.row<const __fp16>(q / 4);

            // _RU holds 4 reset values in the low half and 4 update values in
            // the high half, seeded with the first 8 packed biases.
            // _sum1.._sum3 are extra accumulators for elements 1..3 of each
            // group of 4 inputs; they are folded into _RU after the loops.
            float16x8_t _RU = vld1q_f16(bias_c_RUBNWN);
            float16x8_t _sum1 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _sum2 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _sum3 = vdupq_n_f16((__fp16)0.f);

            // Input contribution to R/U, 4 inputs per iteration. Both branches
            // consume 4 values of x and 32 weights and advance both pointers.
            int i = 0;
            for (; i + 3 < size; i += 4)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "ld1    {v4.4h}, [%0], #8       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n"
                    "fmla   %2.8h, v0.8h, v4.h[0]   \n"
                    "fmla   %3.8h, v1.8h, v4.h[1]   \n"
                    "fmla   %4.8h, v2.8h, v4.h[2]   \n"
                    "fmla   %5.8h, v3.8h, v4.h[3]   \n"
                    : "=r"(x),
                    "=r"(weight_xc_RUN),
                    "=w"(_RU),
                    "=w"(_sum1),
                    "=w"(_sum2),
                    "=w"(_sum3)
                    : "0"(x),
                    "1"(weight_xc_RUN),
                    "2"(_RU),
                    "3"(_sum1),
                    "4"(_sum2),
                    "5"(_sum3)
                    : "memory", "v0", "v1", "v2", "v3", "v4");
#else  // NCNN_GNU_INLINE_ASM
                float16x4_t _x = vld1_f16(x);
                float16x8_t _w0 = vld1q_f16(weight_xc_RUN);
                float16x8_t _w1 = vld1q_f16(weight_xc_RUN + 8);
                float16x8_t _w2 = vld1q_f16(weight_xc_RUN + 16);
                float16x8_t _w3 = vld1q_f16(weight_xc_RUN + 24);
                _RU = vfmaq_lane_f16(_RU, _w0, _x, 0);
                _sum1 = vfmaq_lane_f16(_sum1, _w1, _x, 1);
                _sum2 = vfmaq_lane_f16(_sum2, _w2, _x, 2);
                _sum3 = vfmaq_lane_f16(_sum3, _w3, _x, 3);

                x += 4;
                weight_xc_RUN += 32;
#endif // NCNN_GNU_INLINE_ASM
            }
            // Leftover inputs, one at a time (8 weights each).
            for (; i < size; i++)
            {
                __fp16 xi = *x++;

                float16x8_t _xi = vdupq_n_f16(xi);
                float16x8_t _weight_xc_RU = vld1q_f16(weight_xc_RUN);
                _RU = vfmaq_f16(_RU, _weight_xc_RU, _xi);

                weight_xc_RUN += 8;
            }

            const float* hidden_ptr = hidden_state;

            // Hidden-state contribution to R/U. The float hidden values are
            // narrowed to fp16 (fcvtn / vcvt_f16_f32) before the multiply-add.
            i = 0;
            for (; i + 3 < num_output; i += 4)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "ld1    {v4.4s}, [%0], #16      \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n"
                    "fcvtn  v4.4h, v4.4s            \n"
                    "fmla   %2.8h, v0.8h, v4.h[0]   \n"
                    "fmla   %3.8h, v1.8h, v4.h[1]   \n"
                    "fmla   %4.8h, v2.8h, v4.h[2]   \n"
                    "fmla   %5.8h, v3.8h, v4.h[3]   \n"
                    : "=r"(hidden_ptr),
                    "=r"(weight_hc_RUN),
                    "=w"(_RU),
                    "=w"(_sum1),
                    "=w"(_sum2),
                    "=w"(_sum3)
                    : "0"(hidden_ptr),
                    "1"(weight_hc_RUN),
                    "2"(_RU),
                    "3"(_sum1),
                    "4"(_sum2),
                    "5"(_sum3)
                    : "memory", "v0", "v1", "v2", "v3", "v4");
#else  // NCNN_GNU_INLINE_ASM
                float16x4_t _h_cont = vcvt_f16_f32(vld1q_f32(hidden_ptr));
                float16x8_t _w0 = vld1q_f16(weight_hc_RUN);
                float16x8_t _w1 = vld1q_f16(weight_hc_RUN + 8);
                float16x8_t _w2 = vld1q_f16(weight_hc_RUN + 16);
                float16x8_t _w3 = vld1q_f16(weight_hc_RUN + 24);
                _RU = vfmaq_lane_f16(_RU, _w0, _h_cont, 0);
                _sum1 = vfmaq_lane_f16(_sum1, _w1, _h_cont, 1);
                _sum2 = vfmaq_lane_f16(_sum2, _w2, _h_cont, 2);
                _sum3 = vfmaq_lane_f16(_sum3, _w3, _h_cont, 3);

                hidden_ptr += 4;
                weight_hc_RUN += 32;
#endif // NCNN_GNU_INLINE_ASM
            }
            for (; i < num_output; i++)
            {
                float h_cont = *hidden_ptr++;

                float16x8_t _h_cont = vdupq_n_f16((__fp16)h_cont);
                float16x8_t _weight_hc_RU = vld1q_f16(weight_hc_RUN);
                _RU = vfmaq_f16(_RU, _weight_hc_RU, _h_cont);

                weight_hc_RUN += 8;
            }

            // Fold the partial accumulators into _RU.
            _RU = vaddq_f16(_RU, _sum1);
            _sum2 = vaddq_f16(_sum2, _sum3);
            _RU = vaddq_f16(_RU, _sum2);

            // sigmoid(R)
            // sigmoid(U)
            // The activation is evaluated in fp32: low half is R, high half is U.
            float32x4_t _R32 = sigmoid_ps(vcvt_f32_f16(vget_low_f16(_RU)));
            float32x4_t _U32 = sigmoid_ps(vcvt_f32_f16(vget_high_f16(_RU)));

            // Rewind both input cursors: x was advanced by exactly `size`
            // elements above, and the new-gate phase walks the same data again.
            x -= size;
            hidden_ptr = hidden_state;

            // gate new
            // Seeded with the 4 biases at offset 8; hidden contribution first.
            float16x4_t _gru_N = vld1_f16(bias_c_RUBNWN + 8);
            float16x4_t _sum4 = vdup_n_f16((__fp16)0.f);
            float16x4_t _sum5 = vdup_n_f16((__fp16)0.f);
            float16x4_t _sum6 = vdup_n_f16((__fp16)0.f);

            // weight_hc_RUN now points at the new-gate weights (4 per hidden
            // element, 16 per iteration of this loop).
            i = 0;
            for (; i + 3 < num_output; i += 4)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "ld1    {v4.4s}, [%0], #16      \n"
                    "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n"
                    "fcvtn  v4.4h, v4.4s            \n"
                    "fmla   %2.4h, v0.4h, v4.h[0]   \n"
                    "fmla   %3.4h, v1.4h, v4.h[1]   \n"
                    "fmla   %4.4h, v2.4h, v4.h[2]   \n"
                    "fmla   %5.4h, v3.4h, v4.h[3]   \n"
                    : "=r"(hidden_ptr),
                    "=r"(weight_hc_RUN),
                    "=w"(_gru_N),
                    "=w"(_sum4),
                    "=w"(_sum5),
                    "=w"(_sum6)
                    : "0"(hidden_ptr),
                    "1"(weight_hc_RUN),
                    "2"(_gru_N),
                    "3"(_sum4),
                    "4"(_sum5),
                    "5"(_sum6)
                    : "memory", "v0", "v1", "v2", "v3", "v4");
#else  // NCNN_GNU_INLINE_ASM
                float16x4_t _h_cont = vcvt_f16_f32(vld1q_f32(hidden_ptr));
                float16x4_t _w0 = vld1_f16(weight_hc_RUN);
                float16x4_t _w1 = vld1_f16(weight_hc_RUN + 4);
                float16x4_t _w2 = vld1_f16(weight_hc_RUN + 8);
                float16x4_t _w3 = vld1_f16(weight_hc_RUN + 12);
                _gru_N = vfma_lane_f16(_gru_N, _w0, _h_cont, 0);
                _sum4 = vfma_lane_f16(_sum4, _w1, _h_cont, 1);
                _sum5 = vfma_lane_f16(_sum5, _w2, _h_cont, 2);
                _sum6 = vfma_lane_f16(_sum6, _w3, _h_cont, 3);

                hidden_ptr += 4;
                weight_hc_RUN += 16;
#endif // NCNN_GNU_INLINE_ASM
            }
            for (; i < num_output; i++)
            {
                float h_cont = *hidden_ptr++;

                float16x4_t _h_cont = vdup_n_f16((__fp16)h_cont);
                float16x4_t _weight_hc_N = vld1_f16(weight_hc_RUN);
                _gru_N = vfma_f16(_gru_N, _weight_hc_N, _h_cont);

                weight_hc_RUN += 4;
            }

            _gru_N = vadd_f16(_gru_N, _sum4);
            _sum5 = vadd_f16(_sum5, _sum6);
            _gru_N = vadd_f16(_gru_N, _sum5);

            // N = bias[12..15] + sigmoid(R) * (hidden part); the input part is
            // accumulated on top of this below. Accumulators are reset first.
            _gru_N = vfma_f16(vld1_f16(bias_c_RUBNWN + 12), vcvt_f16_f32(_R32), _gru_N);
            _sum4 = vdup_n_f16((__fp16)0.f);
            _sum5 = vdup_n_f16((__fp16)0.f);
            _sum6 = vdup_n_f16((__fp16)0.f);

            // Input contribution to N; weight_xc_RUN now points at the
            // new-gate weights (4 per input element).
            i = 0;
            for (; i + 3 < size; i += 4)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "ld1    {v4.4h}, [%0], #8       \n"
                    "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n"
                    "fmla   %2.4h, v0.4h, v4.h[0]   \n"
                    "fmla   %3.4h, v1.4h, v4.h[1]   \n"
                    "fmla   %4.4h, v2.4h, v4.h[2]   \n"
                    "fmla   %5.4h, v3.4h, v4.h[3]   \n"
                    : "=r"(x),
                    "=r"(weight_xc_RUN),
                    "=w"(_gru_N),
                    "=w"(_sum4),
                    "=w"(_sum5),
                    "=w"(_sum6)
                    : "0"(x),
                    "1"(weight_xc_RUN),
                    "2"(_gru_N),
                    "3"(_sum4),
                    "4"(_sum5),
                    "5"(_sum6)
                    : "memory", "v0", "v1", "v2", "v3", "v4");
#else  // NCNN_GNU_INLINE_ASM
                float16x4_t _x = vld1_f16(x);
                float16x4_t _w0 = vld1_f16(weight_xc_RUN);
                float16x4_t _w1 = vld1_f16(weight_xc_RUN + 4);
                float16x4_t _w2 = vld1_f16(weight_xc_RUN + 8);
                float16x4_t _w3 = vld1_f16(weight_xc_RUN + 12);
                _gru_N = vfma_lane_f16(_gru_N, _w0, _x, 0);
                _sum4 = vfma_lane_f16(_sum4, _w1, _x, 1);
                _sum5 = vfma_lane_f16(_sum5, _w2, _x, 2);
                _sum6 = vfma_lane_f16(_sum6, _w3, _x, 3);

                x += 4;
                weight_xc_RUN += 16;
#endif // NCNN_GNU_INLINE_ASM
            }
            for (; i < size; i++)
            {
                __fp16 xi = *x++;

                float16x4_t _xi = vdup_n_f16(xi);
                float16x4_t _weight_xc_N = vld1_f16(weight_xc_RUN);
                _gru_N = vfma_f16(_gru_N, _weight_xc_N, _xi);

                weight_xc_RUN += 4;
            }

            _gru_N = vadd_f16(_gru_N, _sum4);
            _sum5 = vadd_f16(_sum5, _sum6);
            _gru_N = vadd_f16(_gru_N, _sum5);

            // tanh(N)
            float32x4_t _N32 = tanh_ps(vcvt_f32_f16(_gru_N));

            // Store U then N as float for the hidden-state update below.
            float* gates_data = gates.row(q / 4);

            vst1q_f32(gates_data, _U32);
            vst1q_f32(gates_data + 4, _N32);
        }
        // Scalar path for the outputs that do not fill a block of 4. These use
        // rows after the blocked ones (row index q / 4 + q % 4) and a packing
        // of 2 R/U weights, then 1 N weight, per input element.
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
            const __fp16* x = bottom_blob.row<const __fp16>(ti);

            // gate reset update
            const __fp16* bias_c_RUBNWN = (const __fp16*)bias_c + q * 4;

            const __fp16* weight_xc_RUN = weight_xc.row<const __fp16>(q / 4 + q % 4);
            const __fp16* weight_hc_RUN = weight_hc.row<const __fp16>(q / 4 + q % 4);

            __fp16 R = bias_c_RUBNWN[0];
            __fp16 U = bias_c_RUBNWN[1];

            for (int i = 0; i < size; i++)
            {
                __fp16 xi = x[i];

                R += weight_xc_RUN[0] * xi;
                U += weight_xc_RUN[1] * xi;

                weight_xc_RUN += 2;
            }

            for (int i = 0; i < num_output; i++)
            {
                __fp16 h_cont = (__fp16)hidden_state[i];

                R += weight_hc_RUN[0] * h_cont;
                U += weight_hc_RUN[1] * h_cont;

                weight_hc_RUN += 2;
            }

            // sigmoid(R)
            // sigmoid(U)
            // Evaluated in float via expf.
            float R32 = 1.f / (1.f + expf((float)-R));
            float U32 = 1.f / (1.f + expf((float)-U));

            // gate new
            __fp16 N = bias_c_RUBNWN[2];

            for (int i = 0; i < num_output; i++)
            {
                __fp16 h_cont = (__fp16)hidden_state[i];

                N += weight_hc_RUN[0] * h_cont;

                weight_hc_RUN += 1;
            }

            // Apply the reset gate to the hidden part, then add the input part.
            N = bias_c_RUBNWN[3] + (__fp16)R32 * N;

            for (int i = 0; i < size; i++)
            {
                __fp16 xi = x[i];

                N += weight_xc_RUN[0] * xi;

                weight_xc_RUN += 1;
            }

            // tanh(N)
            float N32 = tanhf((float)N);

            float* gates_data = gates.row(q / 4 + q % 4);

            gates_data[0] = U32;
            gates_data[1] = N32;
        }

        // h_t := (1 - update) .* new + update .* h_{t-1}
        // Phase 2: update hidden_state (float) in place and emit the fp16 output row.
        __fp16* output_data = top_blob.row<__fp16>(ti);

        float* hidden_ptr = hidden_state;

        // Same values as computed at the top of the time step (num_output is unchanged).
        nn_num_output = num_output >> 2;
        remain_num_output_start = nn_num_output << 2;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            const float* gates_data = gates.row(q / 4);

            float32x4_t _gru_U = vld1q_f32(gates_data);
            float32x4_t _gru_N = vld1q_f32(gates_data + 4);

            float32x4_t _gru_H = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _gru_U), _gru_N), vmulq_f32(_gru_U, vld1q_f32(hidden_ptr + q)));

            vst1q_f32(hidden_ptr + q, _gru_H);
            vst1_f16(output_data + q, vcvt_f16_f32(_gru_H));
        }
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
            const float* gates_data = gates.row(q / 4 + q % 4);

            float U = gates_data[0];
            float N = gates_data[1];

            float H = (1 - U) * N + U * hidden_ptr[q];

            hidden_ptr[q] = H;
            output_data[q] = (__fp16)H;
        }
    }

    return 0;
}

// GRU over all time steps with fp16 storage and fp32 arithmetic.
//
// When opt.use_fp16_arithmetic is set the work is delegated to gru_fp16sa().
// Otherwise inputs, weights and biases are read as __fp16, widened to float
// for every multiply-accumulate, and the output rows are written as __fp16.
//
// bottom_blob   : w = input size, h = number of steps T.
// top_blob      : w = num_output.
// reverse       : when non-zero, time steps are visited from T-1 down to 0.
// weight_xc/hc  : packed __fp16 weights, one row per block of 4 outputs and one
//                 row per leftover output; reset/update weights come first in a
//                 row, new-gate weights follow.
// bias_c        : packed __fp16 biases, 16 values per block of 4 outputs and
//                 4 values per leftover output.
// hidden_state  : float buffer of num_output values, read and updated in place.
//
// Returns 0 on success, -100 if the scratch "gates" Mat cannot be allocated
// (or whatever gru_fp16sa() returns when delegating).
static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
{
    if (opt.use_fp16_arithmetic)
        return gru_fp16sa(bottom_blob, top_blob, reverse, weight_xc, bias_c, weight_hc, hidden_state, opt);

    int size = bottom_blob.w;
    int T = bottom_blob.h;

    int num_output = top_blob.w;

    // 2 x num_output
    // Scratch for the update (U) and new (N) gate values, stored as float.
    // One row of 8 floats per block of 4 outputs (4 x U then 4 x N) plus one
    // row per leftover output (only the first two floats are used there).
    Mat gates(4 * 2, num_output / 4 + num_output % 4, 4u, opt.workspace_allocator);
    if (gates.empty())
        return -100;

    // unroll
    for (int t = 0; t < T; t++)
    {
        int ti = reverse ? T - 1 - t : t;

        // Phase 1: compute the gates. Every iteration writes only its own
        // gates row; hidden_state is only read here and is updated in phase 2.
        int nn_num_output = num_output >> 2;
        int remain_num_output_start = nn_num_output << 2;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            const __fp16* x = bottom_blob.row<const __fp16>(ti);

            // gate reset update
            const __fp16* bias_c_RUBNWN = (const __fp16*)bias_c + q * 4;

            const __fp16* weight_xc_RUN = weight_xc.row<const __fp16>(q / 4);
            const __fp16* weight_hc_RUN = weight_hc.row<const __fp16>(q / 4);

            // R and U start from their biases. The six _sum accumulators hold
            // the contributions of elements 1, 2 and 3 of every group of 4
            // inputs (odd-numbered sums feed R, even-numbered sums feed U).
            float32x4_t _gru_R = vcvt_f32_f16(vld1_f16(bias_c_RUBNWN));
            float32x4_t _gru_U = vcvt_f32_f16(vld1_f16(bias_c_RUBNWN + 4));
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);
            float32x4_t _sum4 = vdupq_n_f32(0.f);
            float32x4_t _sum5 = vdupq_n_f32(0.f);
            float32x4_t _sum6 = vdupq_n_f32(0.f);

            // Input contribution to R/U: 4 inputs and 32 weights per iteration.
            int i = 0;
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _xi = vcvt_f32_f16(vld1_f16(x + i));
                float32x4_t _weight_xc_R = vcvt_f32_f16(vld1_f16(weight_xc_RUN));
                float32x4_t _weight_xc_U = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 4));
                float32x4_t _weight_xc_R_1 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 8));
                float32x4_t _weight_xc_U_1 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 12));
                float32x4_t _weight_xc_R_2 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 16));
                float32x4_t _weight_xc_U_2 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 20));
                float32x4_t _weight_xc_R_3 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 24));
                float32x4_t _weight_xc_U_3 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 28));
                _gru_R = vfmaq_laneq_f32(_gru_R, _weight_xc_R, _xi, 0);
                _gru_U = vfmaq_laneq_f32(_gru_U, _weight_xc_U, _xi, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_R_1, _xi, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_U_1, _xi, 1);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_R_2, _xi, 2);
                _sum4 = vfmaq_laneq_f32(_sum4, _weight_xc_U_2, _xi, 2);
                _sum5 = vfmaq_laneq_f32(_sum5, _weight_xc_R_3, _xi, 3);
                _sum6 = vfmaq_laneq_f32(_sum6, _weight_xc_U_3, _xi, 3);

                weight_xc_RUN += 32;
            }
            // Leftover inputs, one at a time (8 weights each).
            for (; i < size; i++)
            {
                __fp16 xi = x[i];

                float32x4_t _xi = vcvt_f32_f16(vdup_n_f16(xi));
                float32x4_t _weight_xc_R = vcvt_f32_f16(vld1_f16(weight_xc_RUN));
                float32x4_t _weight_xc_U = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 4));
                _gru_R = vmlaq_f32(_gru_R, _weight_xc_R, _xi);
                _gru_U = vmlaq_f32(_gru_U, _weight_xc_U, _xi);

                weight_xc_RUN += 8;
            }

            // Hidden-state contribution to R/U; hidden values are already float.
            i = 0;
            for (; i + 3 < num_output; i += 4)
            {
                float32x4_t _h_cont = vld1q_f32((const float*)hidden_state + i);
                float32x4_t _weight_hc_R = vcvt_f32_f16(vld1_f16(weight_hc_RUN));
                float32x4_t _weight_hc_U = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 4));
                float32x4_t _weight_hc_R_1 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 8));
                float32x4_t _weight_hc_U_1 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 12));
                float32x4_t _weight_hc_R_2 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 16));
                float32x4_t _weight_hc_U_2 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 20));
                float32x4_t _weight_hc_R_3 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 24));
                float32x4_t _weight_hc_U_3 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 28));
                _gru_R = vfmaq_laneq_f32(_gru_R, _weight_hc_R, _h_cont, 0);
                _gru_U = vfmaq_laneq_f32(_gru_U, _weight_hc_U, _h_cont, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_R_1, _h_cont, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_U_1, _h_cont, 1);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_R_2, _h_cont, 2);
                _sum4 = vfmaq_laneq_f32(_sum4, _weight_hc_U_2, _h_cont, 2);
                _sum5 = vfmaq_laneq_f32(_sum5, _weight_hc_R_3, _h_cont, 3);
                _sum6 = vfmaq_laneq_f32(_sum6, _weight_hc_U_3, _h_cont, 3);

                weight_hc_RUN += 32;
            }
            for (; i < num_output; i++)
            {
                float h_cont = hidden_state[i];

                float32x4_t _h_cont = vdupq_n_f32(h_cont);
                float32x4_t _weight_hc_R = vcvt_f32_f16(vld1_f16(weight_hc_RUN));
                float32x4_t _weight_hc_U = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 4));
                _gru_R = vmlaq_f32(_gru_R, _weight_hc_R, _h_cont);
                _gru_U = vmlaq_f32(_gru_U, _weight_hc_U, _h_cont);

                weight_hc_RUN += 8;
            }

            // Fold the partial sums: R gets _sum1, _sum3, _sum5; U gets _sum2, _sum4, _sum6.
            _gru_R = vaddq_f32(_gru_R, _sum1);
            _gru_U = vaddq_f32(_gru_U, _sum2);
            _sum3 = vaddq_f32(_sum3, _sum5);
            _sum4 = vaddq_f32(_sum4, _sum6);
            _gru_R = vaddq_f32(_gru_R, _sum3);
            _gru_U = vaddq_f32(_gru_U, _sum4);

            // sigmoid(R)
            // sigmoid(U)
            _gru_R = sigmoid_ps(_gru_R);
            _gru_U = sigmoid_ps(_gru_U);

            // gate new
            // Seeded with the 4 biases at offset 8; _sum1.._sum3 are reused as
            // partial accumulators for the new gate.
            float32x4_t _gru_N = vcvt_f32_f16(vld1_f16(bias_c_RUBNWN + 8));
            _sum1 = vdupq_n_f32(0.f);
            _sum2 = vdupq_n_f32(0.f);
            _sum3 = vdupq_n_f32(0.f);

            // Hidden contribution to N; weight_hc_RUN now points at the
            // new-gate weights (4 per hidden element).
            i = 0;
            for (; i + 3 < num_output; i += 4)
            {
                float32x4_t _h_cont = vld1q_f32((const float*)hidden_state + i);
                float32x4_t _weight_hc_N = vcvt_f32_f16(vld1_f16(weight_hc_RUN));
                float32x4_t _weight_hc_N_1 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 4));
                float32x4_t _weight_hc_N_2 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 8));
                float32x4_t _weight_hc_N_3 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 12));
                _gru_N = vfmaq_laneq_f32(_gru_N, _weight_hc_N, _h_cont, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_N_1, _h_cont, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_N_2, _h_cont, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_N_3, _h_cont, 3);

                weight_hc_RUN += 16;
            }
            for (; i < num_output; i++)
            {
                float h_cont = hidden_state[i];

                float32x4_t _h_cont = vdupq_n_f32(h_cont);
                float32x4_t _weight_hc_N = vcvt_f32_f16(vld1_f16(weight_hc_RUN));
                _gru_N = vmlaq_f32(_gru_N, _weight_hc_N, _h_cont);

                weight_hc_RUN += 4;
            }

            _gru_N = vaddq_f32(_gru_N, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _gru_N = vaddq_f32(_gru_N, _sum2);

            // N = bias[12..15] + sigmoid(R) * (hidden part); the input part is
            // accumulated on top of this below. Accumulators are reset first.
            _gru_N = vmlaq_f32(vcvt_f32_f16(vld1_f16(bias_c_RUBNWN + 12)), _gru_R, _gru_N);
            _sum1 = vdupq_n_f32(0.f);
            _sum2 = vdupq_n_f32(0.f);
            _sum3 = vdupq_n_f32(0.f);

            // Input contribution to N; weight_xc_RUN now points at the
            // new-gate weights (4 per input element).
            i = 0;
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _xi = vcvt_f32_f16(vld1_f16(x + i));
                float32x4_t _weight_xc_N = vcvt_f32_f16(vld1_f16(weight_xc_RUN));
                float32x4_t _weight_xc_N_1 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 4));
                float32x4_t _weight_xc_N_2 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 8));
                float32x4_t _weight_xc_N_3 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 12));
                _gru_N = vfmaq_laneq_f32(_gru_N, _weight_xc_N, _xi, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_N_1, _xi, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_N_2, _xi, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_N_3, _xi, 3);

                weight_xc_RUN += 16;
            }
            for (; i < size; i++)
            {
                __fp16 xi = x[i];

                float32x4_t _xi = vcvt_f32_f16(vdup_n_f16(xi));
                float32x4_t _weight_xc_N = vcvt_f32_f16(vld1_f16(weight_xc_RUN));
                _gru_N = vmlaq_f32(_gru_N, _weight_xc_N, _xi);

                weight_xc_RUN += 4;
            }

            _gru_N = vaddq_f32(_gru_N, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _gru_N = vaddq_f32(_gru_N, _sum2);

            // tanh(N)
            _gru_N = tanh_ps(_gru_N);

            // Store U then N as float for the hidden-state update below.
            float* gates_data = gates.row(q / 4);

            vst1q_f32(gates_data, _gru_U);
            vst1q_f32(gates_data + 4, _gru_N);
        }
        // Scalar path for the outputs that do not fill a block of 4. These use
        // rows after the blocked ones (row index q / 4 + q % 4) and a packing
        // of 2 R/U weights, then 1 N weight, per input element.
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
            const __fp16* x = bottom_blob.row<const __fp16>(ti);

            // gate reset update
            const __fp16* bias_c_RUBNWN = (const __fp16*)bias_c + q * 4;

            const __fp16* weight_xc_RUN = weight_xc.row<const __fp16>(q / 4 + q % 4);
            const __fp16* weight_hc_RUN = weight_hc.row<const __fp16>(q / 4 + q % 4);

            float R = (float)bias_c_RUBNWN[0];
            float U = (float)bias_c_RUBNWN[1];

            for (int i = 0; i < size; i++)
            {
                float xi = (float)x[i];

                R += (float)weight_xc_RUN[0] * xi;
                U += (float)weight_xc_RUN[1] * xi;

                weight_xc_RUN += 2;
            }

            for (int i = 0; i < num_output; i++)
            {
                float h_cont = hidden_state[i];

                R += (float)weight_hc_RUN[0] * h_cont;
                U += (float)weight_hc_RUN[1] * h_cont;

                weight_hc_RUN += 2;
            }

            // sigmoid(R)
            // sigmoid(U)
            R = 1.f / (1.f + expf(-R));
            U = 1.f / (1.f + expf(-U));

            // gate new
            float N = (float)bias_c_RUBNWN[2];

            for (int i = 0; i < num_output; i++)
            {
                float h_cont = hidden_state[i];

                N += (float)weight_hc_RUN[0] * h_cont;

                weight_hc_RUN += 1;
            }

            // Apply the reset gate to the hidden part, then add the input part.
            N = (float)bias_c_RUBNWN[3] + R * N;

            for (int i = 0; i < size; i++)
            {
                float xi = (float)x[i];

                N += (float)weight_xc_RUN[0] * xi;

                weight_xc_RUN += 1;
            }

            // tanh(N)
            N = tanhf(N);

            float* gates_data = gates.row(q / 4 + q % 4);

            gates_data[0] = U;
            gates_data[1] = N;
        }

        // h_t := (1 - update) .* new + update .* h_{t-1}
        // Phase 2: update hidden_state (float) in place and emit the fp16 output row.
        __fp16* output_data = top_blob.row<__fp16>(ti);

        float* hidden_ptr = hidden_state;

        // Same values as computed at the top of the time step (num_output is unchanged).
        nn_num_output = num_output >> 2;
        remain_num_output_start = nn_num_output << 2;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            const float* gates_data = gates.row(q / 4);

            float32x4_t _gru_U = vld1q_f32(gates_data);
            float32x4_t _gru_N = vld1q_f32(gates_data + 4);

            float32x4_t _gru_H = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _gru_U), _gru_N), vmulq_f32(_gru_U, vld1q_f32(hidden_ptr + q)));

            vst1q_f32(hidden_ptr + q, _gru_H);
            vst1_f16(output_data + q, vcvt_f16_f32(_gru_H));
        }
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
            const float* gates_data = gates.row(q / 4 + q % 4);

            float U = gates_data[0];
            float N = gates_data[1];

            float H = (1 - U) * N + U * hidden_ptr[q];

            hidden_ptr[q] = H;
            output_data[q] = (__fp16)H;
        }
    }

    return 0;
}

int GRU_arm::create_pipeline_fp16s(const Option& opt)
{
    // pack RUN
    int num_directions = direction == 2 ? 2 : 1;
    int size = weight_data_size / num_directions / num_output / 3;

    weight_xc_data_packed.create(size * 12, num_output / 4 + num_output % 4, num_directions, 2u, 1);
    bias_c_data_packed.create(num_output, 1, num_directions, 8u, 4);
    weight_hc_data_packed.create(num_output * 12, num_output / 4 + num_output % 4, num_directions, 2u, 1);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int dr = 0; dr < num_directions; dr++)
    {
        const Mat weight_xc = weight_xc_data.channel(dr);
        const Mat bias_c = bias_c_data.channel(dr);
        const Mat weight_hc = weight_hc_data.channel(dr);

        Mat weight_xc_data_packed_dr = weight_xc_data_packed.channel(dr);
        Mat bias_c_data_packed_dr = bias_c_data_packed.channel(dr);
        Mat weight_hc_data_packed_dr = weight_hc_data_packed.channel(dr);

        const float* bias_c_R = bias_c.row(0);
        const float* bias_c_U = bias_c.row(1);
        const float* bias_c_WN = bias_c.row(2);
        const float* bias_c_BN = bias_c.row(3);

        __fp16* bias_c_RUBNWN = bias_c_data_packed_dr.row<__fp16>(0);

        int q = 0;
        for (; q + 3 < num_output; q += 4)
        {
            bias_c_RUBNWN[0] = (__fp16)bias_c_R[q];
            bias_c_RUBNWN[1] = (__fp16)bias_c_R[q + 1];
            bias_c_RUBNWN[2] = (__fp16)bias_c_R[q + 2];
            bias_c_RUBNWN[3] = (__fp16)bias_c_R[q + 3];
            bias_c_RUBNWN[4] = (__fp16)bias_c_U[q];
            bias_c_RUBNWN[5] = (__fp16)bias_c_U[q + 1];
            bias_c_RUBNWN[6] = (__fp16)bias_c_U[q + 2];
            bias_c_RUBNWN[7] = (__fp16)bias_c_U[q + 3];
            bias_c_RUBNWN[8] = (__fp16)bias_c_BN[q];
            bias_c_RUBNWN[9] = (__fp16)bias_c_BN[q + 1];
            bias_c_RUBNWN[10] = (__fp16)bias_c_BN[q + 2];
            bias_c_RUBNWN[11] = (__fp16)bias_c_BN[q + 3];
            bias_c_RUBNWN[12] = (__fp16)bias_c_WN[q];
            bias_c_RUBNWN[13] = (__fp16)bias_c_WN[q + 1];
            bias_c_RUBNWN[14] = (__fp16)bias_c_WN[q + 2];
            bias_c_RUBNWN[15] = (__fp16)bias_c_WN[q + 3];

            bias_c_RUBNWN += 16;

            const float* weight_xc_R = weight_xc.row(num_output * 0 + q);
            const float* weight_xc_U = weight_xc.row(num_output * 1 + q);
            const float* weight_xc_N = weight_xc.row(num_output * 2 + q);

            const float* weight_xc_R_1 = weight_xc.row(num_output * 0 + q + 1);
            const float* weight_xc_U_1 = weight_xc.row(num_output * 1 + q + 1);
            const float* weight_xc_N_1 = weight_xc.row(num_output * 2 + q + 1);

            const float* weight_xc_R_2 = weight_xc.row(num_output * 0 + q + 2);
            const float* weight_xc_U_2 = weight_xc.row(num_output * 1 + q + 2);
            const float* weight_xc_N_2 = weight_xc.row(num_output * 2 + q + 2);

            const float* weight_xc_R_3 = weight_xc.row(num_output * 0 + q + 3);
            const float* weight_xc_U_3 = weight_xc.row(num_output * 1 + q + 3);
            const float* weight_xc_N_3 = weight_xc.row(num_output * 2 + q + 3);

            const float* weight_hc_R = weight_hc.row(num_output * 0 + q);
            const float* weight_hc_U = weight_hc.row(num_output * 1 + q);
            const float* weight_hc_N = weight_hc.row(num_output * 2 + q);

            const float* weight_hc_R_1 = weight_hc.row(num_output * 0 + q + 1);
            const float* weight_hc_U_1 = weight_hc.row(num_output * 1 + q + 1);
            const float* weight_hc_N_1 = weight_hc.row(num_output * 2 + q + 1);

            const float* weight_hc_R_2 = weight_hc.row(num_output * 0 + q + 2);
            const float* weight_hc_U_2 = weight_hc.row(num_output * 1 + q + 2);
            const float* weight_hc_N_2 = weight_hc.row(num_output * 2 + q + 2);

            const float* weight_hc_R_3 = weight_hc.row(num_output * 0 + q + 3);
            const float* weight_hc_U_3 = weight_hc.row(num_output * 1 + q + 3);
            const float* weight_hc_N_3 = weight_hc.row(num_output * 2 + q + 3);

            __fp16* weight_xc_RUN = weight_xc_data_packed_dr.row<__fp16>(q / 4);
            __fp16* weight_hc_RUN = weight_hc_data_packed_dr.row<__fp16>(q / 4);

            for (int i = 0; i < size; i++)
            {
                weight_xc_RUN[0] = (__fp16)weight_xc_R[i];
                weight_xc_RUN[1] = (__fp16)weight_xc_R_1[i];
                weight_xc_RUN[2] = (__fp16)weight_xc_R_2[i];
                weight_xc_RUN[3] = (__fp16)weight_xc_R_3[i];
                weight_xc_RUN[4] = (__fp16)weight_xc_U[i];
                weight_xc_RUN[5] = (__fp16)weight_xc_U_1[i];
                weight_xc_RUN[6] = (__fp16)weight_xc_U_2[i];
                weight_xc_RUN[7] = (__fp16)weight_xc_U_3[i];

                weight_xc_RUN += 8;
            }

            for (int i = 0; i < num_output; i++)
            {
                weight_hc_RUN[0] = (__fp16)weight_hc_R[i];
                weight_hc_RUN[1] = (__fp16)weight_hc_R_1[i];
                weight_hc_RUN[2] = (__fp16)weight_hc_R_2[i];
                weight_hc_RUN[3] = (__fp16)weight_hc_R_3[i];
                weight_hc_RUN[4] = (__fp16)weight_hc_U[i];
                weight_hc_RUN[5] = (__fp16)weight_hc_U_1[i];
                weight_hc_RUN[6] = (__fp16)weight_hc_U_2[i];
                weight_hc_RUN[7] = (__fp16)weight_hc_U_3[i];

                weight_hc_RUN += 8;
            }

            for (int i = 0; i < size; i++)
            {
                weight_xc_RUN[0] = (__fp16)weight_xc_N[i];
                weight_xc_RUN[1] = (__fp16)weight_xc_N_1[i];
                weight_xc_RUN[2] = (__fp16)weight_xc_N_2[i];
                weight_xc_RUN[3] = (__fp16)weight_xc_N_3[i];

                weight_xc_RUN += 4;
            }

            for (int i = 0; i < num_output; i++)
            {
                weight_hc_RUN[0] = (__fp16)weight_hc_N[i];
                weight_hc_RUN[1] = (__fp16)weight_hc_N_1[i];
                weight_hc_RUN[2] = (__fp16)weight_hc_N_2[i];
                weight_hc_RUN[3] = (__fp16)weight_hc_N_3[i];

                weight_hc_RUN += 4;
            }
        }
        for (; q < num_output; q++)
        {
            bias_c_RUBNWN[0] = (__fp16)bias_c_R[q];
            bias_c_RUBNWN[1] = (__fp16)bias_c_U[q];
            bias_c_RUBNWN[2] = (__fp16)bias_c_BN[q];
            bias_c_RUBNWN[3] = (__fp16)bias_c_WN[q];

            bias_c_RUBNWN += 4;

            const float* weight_xc_R = weight_xc.row(num_output * 0 + q);
            const float* weight_xc_U = weight_xc.row(num_output * 1 + q);
            const float* weight_xc_N = weight_xc.row(num_output * 2 + q);

            const float* weight_hc_R = weight_hc.row(num_output * 0 + q);
            const float* weight_hc_U = weight_hc.row(num_output * 1 + q);
            const float* weight_hc_N = weight_hc.row(num_output * 2 + q);

            __fp16* weight_xc_RUN = weight_xc_data_packed_dr.row<__fp16>(q / 4 + q % 4);
            __fp16* weight_hc_RUN = weight_hc_data_packed_dr.row<__fp16>(q / 4 + q % 4);

            for (int i = 0; i < size; i++)
            {
                weight_xc_RUN[0] = (__fp16)weight_xc_R[i];
                weight_xc_RUN[1] = (__fp16)weight_xc_U[i];

                weight_xc_RUN += 2;
            }

            for (int i = 0; i < num_output; i++)
            {
                weight_hc_RUN[0] = (__fp16)weight_hc_R[i];
                weight_hc_RUN[1] = (__fp16)weight_hc_U[i];

                weight_hc_RUN += 2;
            }

            for (int i = 0; i < size; i++)
            {
                weight_xc_RUN[0] = (__fp16)weight_xc_N[i];

                weight_xc_RUN += 1;
            }

            for (int i = 0; i < num_output; i++)
            {
                weight_hc_RUN[0] = (__fp16)weight_hc_N[i];

                weight_hc_RUN += 1;
            }
        }
    }

    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
}

int GRU_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int T = bottom_blob.h;

    int num_directions = direction == 2 ? 2 : 1;

    // initial hidden state
    Mat hidden(num_output, 4u, opt.workspace_allocator);
    if (hidden.empty())
        return -100;
    hidden.fill(0.f);

    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        {
            int ret = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
            if (ret != 0)
                return ret;
        }

        hidden.fill(0.f);

        {
            int ret = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
            __fp16* ptr = top_blob.row<__fp16>(i);

            memcpy(ptr, pf, num_output * sizeof(__fp16));
            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
        }
    }

    return 0;
}

int GRU_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    int T = bottom_blob.h;
    int num_directions = direction == 2 ? 2 : 1;

    Mat hidden;
    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
    if (bottom_blobs.size() == 2)
    {
        Option opt_cast = opt;
        opt_cast.blob_allocator = hidden_allocator;
        cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
    }
    else
    {
        hidden.create(num_output, num_directions, 4u, hidden_allocator);
        if (hidden.empty())
            return -100;
        hidden.fill(0.f);
    }

    Mat& top_blob = top_blobs[0];
    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        Mat hidden0 = hidden.row_range(0, 1);
        {
            int ret = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt);
            if (ret != 0)
                return ret;
        }

        Mat hidden1 = hidden.row_range(1, 1);
        {
            int ret = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
            __fp16* ptr = top_blob.row<__fp16>(i);

            memcpy(ptr, pf, num_output * sizeof(__fp16));
            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
        }
    }

    if (top_blobs.size() == 2)
    {
        cast_float32_to_float16(hidden, top_blobs[1], opt);
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/gru_arm_vfpv4.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cpu.h"
#include "mat.h"
#include "layer.h"
#include "arm_activation.h"
#include "arm_usability.h"

namespace ncnn {

#include "gru_int8.h"

// vfpv4-suffixed entry point for the int8 GRU gate-output step.
// It forwards all arguments unchanged to gru_int8_gate_output, which is expected
// to come from the gru_int8.h include above (its definition is not shown here).
// NOTE(review): presumably this file is built with VFPv4 flags so the shared
// implementation is compiled for that target - confirm against the build setup.
void gru_int8_gate_output_vfpv4(const Mat& gates, Mat& hidden_state, Mat& top_blob, int ti, int elemtype, const Option& opt)
{
    gru_int8_gate_output(gates, hidden_state, top_blob, ti, elemtype, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/arm/gru_int8.h
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD
void gru_transform_weight_int8_asimddp(const Mat& weight_xc, const Mat& weight_xc_int8_scales, const Mat& weight_hc, const Mat& weight_hc_int8_scales, const Mat& bias_c, Mat& weight_data_tm, Mat& weight_data_tm_int8_descales, Mat& bias_c_tm, int size, int num_output, int num_directions, const Option& opt);
void gru_int8_asimddp(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_descales, Mat& top_blob, int elemtype, int reverse, const Mat& weight_data_tm, const Mat& weight_data_tm_int8_descales, const Mat& bias_c, Mat& hidden_state, const Option& opt);
#endif

#if NCNN_RUNTIME_CPU && NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2)
void gru_int8_gate_output_vfpv4(const Mat& gates, Mat& hidden_state, Mat& top_blob, int ti, int elemtype, const Option& opt);
#endif

static void gru_transform_weight_int8(const Mat& weight_xc, const Mat& weight_xc_int8_scales, const Mat& weight_hc, const Mat& weight_hc_int8_scales, const Mat& bias_c, Mat& weight_data_tm, Mat& weight_data_tm_int8_descales, Mat& bias_c_tm, int size, int num_output, int num_directions, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD
    if (ncnn::cpu_support_arm_asimddp())
    {
        gru_transform_weight_int8_asimddp(weight_xc, weight_xc_int8_scales, weight_hc, weight_hc_int8_scales, bias_c, weight_data_tm, weight_data_tm_int8_descales, bias_c_tm, size, num_output, num_directions, opt);
        return;
    }
#endif

#if __ARM_NEON
    weight_data_tm.create(size * 12 + num_output * 12, num_output / 4 + num_output % 4, num_directions, 1u, 1);
    weight_data_tm_int8_descales.create(12 + 12, num_output / 4 + num_output % 4, num_directions);
#else
    weight_data_tm.create(size * 3 + num_output * 3, num_output, num_directions, 1u, 1);
    weight_data_tm_int8_descales.create(3 + 3, num_output, num_directions);
#endif
    bias_c_tm.create(num_output, 1, num_directions, 16u, 4);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int dr = 0; dr < num_directions; dr++)
    {
        const Mat weight_xc_dr = weight_xc.channel(dr);
        const Mat weight_hc_dr = weight_hc.channel(dr);
        const Mat bias_c_dr = bias_c.channel(dr);
        const float* weight_xc_int8_scales_ptr = weight_xc_int8_scales.row(dr);
        const float* weight_hc_int8_scales_ptr = weight_hc_int8_scales.row(dr);

        Mat weight_data_tm_dr = weight_data_tm.channel(dr);
        Mat bias_c_tm_dr = bias_c_tm.channel(dr);
        Mat weight_data_tm_int8_descales_dr = weight_data_tm_int8_descales.channel(dr);

        const float* bias_c_R = bias_c_dr.row(0);
        const float* bias_c_U = bias_c_dr.row(1);
        const float* bias_c_WN = bias_c_dr.row(2);
        const float* bias_c_BN = bias_c_dr.row(3);

        float* bias_c_RUBNWN = bias_c_tm_dr.row(0);

        int q = 0;
#if __ARM_NEON
        for (; q + 3 < num_output; q += 4)
        {
            vst1q_f32(bias_c_RUBNWN, vld1q_f32(bias_c_R + q));
            vst1q_f32(bias_c_RUBNWN + 4, vld1q_f32(bias_c_U + q));
            vst1q_f32(bias_c_RUBNWN + 8, vld1q_f32(bias_c_BN + q));
            vst1q_f32(bias_c_RUBNWN + 12, vld1q_f32(bias_c_WN + q));

            bias_c_RUBNWN += 16;

            const signed char* weight_xc_R_0 = weight_xc_dr.row<const signed char>(num_output * 0 + q);
            const signed char* weight_xc_U_0 = weight_xc_dr.row<const signed char>(num_output * 1 + q);
            const signed char* weight_xc_N_0 = weight_xc_dr.row<const signed char>(num_output * 2 + q);

            const signed char* weight_xc_R_1 = weight_xc_dr.row<const signed char>(num_output * 0 + q + 1);
            const signed char* weight_xc_U_1 = weight_xc_dr.row<const signed char>(num_output * 1 + q + 1);
            const signed char* weight_xc_N_1 = weight_xc_dr.row<const signed char>(num_output * 2 + q + 1);

            const signed char* weight_xc_R_2 = weight_xc_dr.row<const signed char>(num_output * 0 + q + 2);
            const signed char* weight_xc_U_2 = weight_xc_dr.row<const signed char>(num_output * 1 + q + 2);
            const signed char* weight_xc_N_2 = weight_xc_dr.row<const signed char>(num_output * 2 + q + 2);

            const signed char* weight_xc_R_3 = weight_xc_dr.row<const signed char>(num_output * 0 + q + 3);
            const signed char* weight_xc_U_3 = weight_xc_dr.row<const signed char>(num_output * 1 + q + 3);
            const signed char* weight_xc_N_3 = weight_xc_dr.row<const signed char>(num_output * 2 + q + 3);

            const signed char* weight_hc_R_0 = weight_hc_dr.row<const signed char>(num_output * 0 + q);
            const signed char* weight_hc_U_0 = weight_hc_dr.row<const signed char>(num_output * 1 + q);
            const signed char* weight_hc_N_0 = weight_hc_dr.row<const signed char>(num_output * 2 + q);

            const signed char* weight_hc_R_1 = weight_hc_dr.row<const signed char>(num_output * 0 + q + 1);
            const signed char* weight_hc_U_1 = weight_hc_dr.row<const signed char>(num_output * 1 + q + 1);
            const signed char* weight_hc_N_1 = weight_hc_dr.row<const signed char>(num_output * 2 + q + 1);

            const signed char* weight_hc_R_2 = weight_hc_dr.row<const signed char>(num_output * 0 + q + 2);
            const signed char* weight_hc_U_2 = weight_hc_dr.row<const signed char>(num_output * 1 + q + 2);
            const signed char* weight_hc_N_2 = weight_hc_dr.row<const signed char>(num_output * 2 + q + 2);

            const signed char* weight_hc_R_3 = weight_hc_dr.row<const signed char>(num_output * 0 + q + 3);
            const signed char* weight_hc_U_3 = weight_hc_dr.row<const signed char>(num_output * 1 + q + 3);
            const signed char* weight_hc_N_3 = weight_hc_dr.row<const signed char>(num_output * 2 + q + 3);

            signed char* kptr = weight_data_tm_dr.row<signed char>(q / 4);
            float* descales_ptr = weight_data_tm_int8_descales_dr.row(q / 4);

            int i = 0;
#if __ARM_FEATURE_DOTPROD
            for (; i + 3 < size; i += 4)
            {
                kptr[0] = weight_xc_R_0[i];
                kptr[1] = weight_xc_R_0[i + 1];
                kptr[2] = weight_xc_R_0[i + 2];
                kptr[3] = weight_xc_R_0[i + 3];
                kptr[4] = weight_xc_R_1[i];
                kptr[5] = weight_xc_R_1[i + 1];
                kptr[6] = weight_xc_R_1[i + 2];
                kptr[7] = weight_xc_R_1[i + 3];
                kptr[8 + 0] = weight_xc_R_2[i];
                kptr[8 + 1] = weight_xc_R_2[i + 1];
                kptr[8 + 2] = weight_xc_R_2[i + 2];
                kptr[8 + 3] = weight_xc_R_2[i + 3];
                kptr[8 + 4] = weight_xc_R_3[i];
                kptr[8 + 5] = weight_xc_R_3[i + 1];
                kptr[8 + 6] = weight_xc_R_3[i + 2];
                kptr[8 + 7] = weight_xc_R_3[i + 3];
                kptr[16 + 0] = weight_xc_U_0[i];
                kptr[16 + 1] = weight_xc_U_0[i + 1];
                kptr[16 + 2] = weight_xc_U_0[i + 2];
                kptr[16 + 3] = weight_xc_U_0[i + 3];
                kptr[16 + 4] = weight_xc_U_1[i];
                kptr[16 + 5] = weight_xc_U_1[i + 1];
                kptr[16 + 6] = weight_xc_U_1[i + 2];
                kptr[16 + 7] = weight_xc_U_1[i + 3];
                kptr[24 + 0] = weight_xc_U_2[i];
                kptr[24 + 1] = weight_xc_U_2[i + 1];
                kptr[24 + 2] = weight_xc_U_2[i + 2];
                kptr[24 + 3] = weight_xc_U_2[i + 3];
                kptr[24 + 4] = weight_xc_U_3[i];
                kptr[24 + 5] = weight_xc_U_3[i + 1];
                kptr[24 + 6] = weight_xc_U_3[i + 2];
                kptr[24 + 7] = weight_xc_U_3[i + 3];

                kptr += 32;
            }
#else
            for (; i + 7 < size; i += 8)
            {
                int8x8_t _w0 = vld1_s8(weight_xc_R_0 + i);
                int8x8_t _w1 = vld1_s8(weight_xc_R_1 + i);
                int8x8_t _w2 = vld1_s8(weight_xc_R_2 + i);
                int8x8_t _w3 = vld1_s8(weight_xc_R_3 + i);
                int8x8_t _w4 = vld1_s8(weight_xc_U_0 + i);
                int8x8_t _w5 = vld1_s8(weight_xc_U_1 + i);
                int8x8_t _w6 = vld1_s8(weight_xc_U_2 + i);
                int8x8_t _w7 = vld1_s8(weight_xc_U_3 + i);

                int32x2x2_t _t0 = vtrn_s32(vreinterpret_s32_s8(_w0), vreinterpret_s32_s8(_w4));
                int32x2x2_t _t1 = vtrn_s32(vreinterpret_s32_s8(_w1), vreinterpret_s32_s8(_w5));
                int32x2x2_t _t2 = vtrn_s32(vreinterpret_s32_s8(_w2), vreinterpret_s32_s8(_w6));
                int32x2x2_t _t3 = vtrn_s32(vreinterpret_s32_s8(_w3), vreinterpret_s32_s8(_w7));

                int32x4x4_t _w;
                _w.val[0] = vcombine_s32(_t0.val[0], _t0.val[1]);
                _w.val[1] = vcombine_s32(_t1.val[0], _t1.val[1]);
                _w.val[2] = vcombine_s32(_t2.val[0], _t2.val[1]);
                _w.val[3] = vcombine_s32(_t3.val[0], _t3.val[1]);

                vst4q_s32((int*)kptr, _w);

                kptr += 64;
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; i + 1 < size; i += 2)
            {
                kptr[0] = weight_xc_R_0[i];
                kptr[1] = weight_xc_R_0[i + 1];
                kptr[2] = weight_xc_R_1[i];
                kptr[3] = weight_xc_R_1[i + 1];
                kptr[4] = weight_xc_R_2[i];
                kptr[5] = weight_xc_R_2[i + 1];
                kptr[6] = weight_xc_R_3[i];
                kptr[7] = weight_xc_R_3[i + 1];
                kptr[8 + 0] = weight_xc_U_0[i];
                kptr[8 + 1] = weight_xc_U_0[i + 1];
                kptr[8 + 2] = weight_xc_U_1[i];
                kptr[8 + 3] = weight_xc_U_1[i + 1];
                kptr[8 + 4] = weight_xc_U_2[i];
                kptr[8 + 5] = weight_xc_U_2[i + 1];
                kptr[8 + 6] = weight_xc_U_3[i];
                kptr[8 + 7] = weight_xc_U_3[i + 1];

                kptr += 16;
            }
            for (; i < size; i++)
            {
                kptr[0] = weight_xc_R_0[i];
                kptr[1] = weight_xc_R_1[i];
                kptr[2] = weight_xc_R_2[i];
                kptr[3] = weight_xc_R_3[i];
                kptr[4] = weight_xc_U_0[i];
                kptr[5] = weight_xc_U_1[i];
                kptr[6] = weight_xc_U_2[i];
                kptr[7] = weight_xc_U_3[i];

                kptr += 8;
            }

            i = 0;
#if __ARM_FEATURE_DOTPROD
            for (; i + 3 < num_output; i += 4)
            {
                kptr[0] = weight_hc_R_0[i];
                kptr[1] = weight_hc_R_0[i + 1];
                kptr[2] = weight_hc_R_0[i + 2];
                kptr[3] = weight_hc_R_0[i + 3];
                kptr[4] = weight_hc_R_1[i];
                kptr[5] = weight_hc_R_1[i + 1];
                kptr[6] = weight_hc_R_1[i + 2];
                kptr[7] = weight_hc_R_1[i + 3];
                kptr[8 + 0] = weight_hc_R_2[i];
                kptr[8 + 1] = weight_hc_R_2[i + 1];
                kptr[8 + 2] = weight_hc_R_2[i + 2];
                kptr[8 + 3] = weight_hc_R_2[i + 3];
                kptr[8 + 4] = weight_hc_R_3[i];
                kptr[8 + 5] = weight_hc_R_3[i + 1];
                kptr[8 + 6] = weight_hc_R_3[i + 2];
                kptr[8 + 7] = weight_hc_R_3[i + 3];
                kptr[16 + 0] = weight_hc_U_0[i];
                kptr[16 + 1] = weight_hc_U_0[i + 1];
                kptr[16 + 2] = weight_hc_U_0[i + 2];
                kptr[16 + 3] = weight_hc_U_0[i + 3];
                kptr[16 + 4] = weight_hc_U_1[i];
                kptr[16 + 5] = weight_hc_U_1[i + 1];
                kptr[16 + 6] = weight_hc_U_1[i + 2];
                kptr[16 + 7] = weight_hc_U_1[i + 3];
                kptr[24 + 0] = weight_hc_U_2[i];
                kptr[24 + 1] = weight_hc_U_2[i + 1];
                kptr[24 + 2] = weight_hc_U_2[i + 2];
                kptr[24 + 3] = weight_hc_U_2[i + 3];
                kptr[24 + 4] = weight_hc_U_3[i];
                kptr[24 + 5] = weight_hc_U_3[i + 1];
                kptr[24 + 6] = weight_hc_U_3[i + 2];
                kptr[24 + 7] = weight_hc_U_3[i + 3];

                kptr += 32;
            }
#else
            for (; i + 7 < num_output; i += 8)
            {
                int8x8_t _w0 = vld1_s8(weight_hc_R_0 + i);
                int8x8_t _w1 = vld1_s8(weight_hc_R_1 + i);
                int8x8_t _w2 = vld1_s8(weight_hc_R_2 + i);
                int8x8_t _w3 = vld1_s8(weight_hc_R_3 + i);
                int8x8_t _w4 = vld1_s8(weight_hc_U_0 + i);
                int8x8_t _w5 = vld1_s8(weight_hc_U_1 + i);
                int8x8_t _w6 = vld1_s8(weight_hc_U_2 + i);
                int8x8_t _w7 = vld1_s8(weight_hc_U_3 + i);

                int32x2x2_t _t0 = vtrn_s32(vreinterpret_s32_s8(_w0), vreinterpret_s32_s8(_w4));
                int32x2x2_t _t1 = vtrn_s32(vreinterpret_s32_s8(_w1), vreinterpret_s32_s8(_w5));
                int32x2x2_t _t2 = vtrn_s32(vreinterpret_s32_s8(_w2), vreinterpret_s32_s8(_w6));
                int32x2x2_t _t3 = vtrn_s32(vreinterpret_s32_s8(_w3), vreinterpret_s32_s8(_w7));

                int32x4x4_t _w;
                _w.val[0] = vcombine_s32(_t0.val[0], _t0.val[1]);
                _w.val[1] = vcombine_s32(_t1.val[0], _t1.val[1]);
                _w.val[2] = vcombine_s32(_t2.val[0], _t2.val[1]);
                _w.val[3] = vcombine_s32(_t3.val[0], _t3.val[1]);

                vst4q_s32((int*)kptr, _w);

                kptr += 64;
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; i + 1 < num_output; i += 2)
            {
                kptr[0] = weight_hc_R_0[i];
                kptr[1] = weight_hc_R_0[i + 1];
                kptr[2] = weight_hc_R_1[i];
                kptr[3] = weight_hc_R_1[i + 1];
                kptr[4] = weight_hc_R_2[i];
                kptr[5] = weight_hc_R_2[i + 1];
                kptr[6] = weight_hc_R_3[i];
                kptr[7] = weight_hc_R_3[i + 1];
                kptr[8 + 0] = weight_hc_U_0[i];
                kptr[8 + 1] = weight_hc_U_0[i + 1];
                kptr[8 + 2] = weight_hc_U_1[i];
                kptr[8 + 3] = weight_hc_U_1[i + 1];
                kptr[8 + 4] = weight_hc_U_2[i];
                kptr[8 + 5] = weight_hc_U_2[i + 1];
                kptr[8 + 6] = weight_hc_U_3[i];
                kptr[8 + 7] = weight_hc_U_3[i + 1];

                kptr += 16;
            }
            for (; i < num_output; i++)
            {
                kptr[0] = weight_hc_R_0[i];
                kptr[1] = weight_hc_R_1[i];
                kptr[2] = weight_hc_R_2[i];
                kptr[3] = weight_hc_R_3[i];
                kptr[4] = weight_hc_U_0[i];
                kptr[5] = weight_hc_U_1[i];
                kptr[6] = weight_hc_U_2[i];
                kptr[7] = weight_hc_U_3[i];

                kptr += 8;
            }

            i = 0;
#if __ARM_FEATURE_DOTPROD
            for (; i + 3 < num_output; i += 4)
            {
                kptr[0] = weight_hc_N_0[i];
                kptr[1] = weight_hc_N_0[i + 1];
                kptr[2] = weight_hc_N_0[i + 2];
                kptr[3] = weight_hc_N_0[i + 3];
                kptr[4] = weight_hc_N_1[i];
                kptr[5] = weight_hc_N_1[i + 1];
                kptr[6] = weight_hc_N_1[i + 2];
                kptr[7] = weight_hc_N_1[i + 3];
                kptr[8 + 0] = weight_hc_N_2[i];
                kptr[8 + 1] = weight_hc_N_2[i + 1];
                kptr[8 + 2] = weight_hc_N_2[i + 2];
                kptr[8 + 3] = weight_hc_N_2[i + 3];
                kptr[8 + 4] = weight_hc_N_3[i];
                kptr[8 + 5] = weight_hc_N_3[i + 1];
                kptr[8 + 6] = weight_hc_N_3[i + 2];
                kptr[8 + 7] = weight_hc_N_3[i + 3];

                kptr += 16;
            }
#else
            for (; i + 7 < num_output; i += 8)
            {
                vst1_s8(kptr, vld1_s8(weight_hc_N_0 + i));
                vst1_s8(kptr + 8, vld1_s8(weight_hc_N_1 + i));
                vst1_s8(kptr + 16, vld1_s8(weight_hc_N_2 + i));
                vst1_s8(kptr + 24, vld1_s8(weight_hc_N_3 + i));
                kptr += 32;
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; i + 1 < num_output; i += 2)
            {
                kptr[0] = weight_hc_N_0[i];
                kptr[1] = weight_hc_N_0[i + 1];
                kptr[2] = weight_hc_N_1[i];
                kptr[3] = weight_hc_N_1[i + 1];
                kptr[4] = weight_hc_N_2[i];
                kptr[5] = weight_hc_N_2[i + 1];
                kptr[6] = weight_hc_N_3[i];
                kptr[7] = weight_hc_N_3[i + 1];

                kptr += 8;
            }
            for (; i < num_output; i++)
            {
                kptr[0] = weight_hc_N_0[i];
                kptr[1] = weight_hc_N_1[i];
                kptr[2] = weight_hc_N_2[i];
                kptr[3] = weight_hc_N_3[i];

                kptr += 4;
            }

            i = 0;
#if __ARM_FEATURE_DOTPROD
            for (; i + 3 < size; i += 4)
            {
                kptr[0] = weight_xc_N_0[i];
                kptr[1] = weight_xc_N_0[i + 1];
                kptr[2] = weight_xc_N_0[i + 2];
                kptr[3] = weight_xc_N_0[i + 3];
                kptr[4] = weight_xc_N_1[i];
                kptr[5] = weight_xc_N_1[i + 1];
                kptr[6] = weight_xc_N_1[i + 2];
                kptr[7] = weight_xc_N_1[i + 3];
                kptr[8 + 0] = weight_xc_N_2[i];
                kptr[8 + 1] = weight_xc_N_2[i + 1];
                kptr[8 + 2] = weight_xc_N_2[i + 2];
                kptr[8 + 3] = weight_xc_N_2[i + 3];
                kptr[8 + 4] = weight_xc_N_3[i];
                kptr[8 + 5] = weight_xc_N_3[i + 1];
                kptr[8 + 6] = weight_xc_N_3[i + 2];
                kptr[8 + 7] = weight_xc_N_3[i + 3];

                kptr += 16;
            }
#else
            for (; i + 7 < size; i += 8)
            {
                vst1_s8(kptr, vld1_s8(weight_xc_N_0 + i));
                vst1_s8(kptr + 8, vld1_s8(weight_xc_N_1 + i));
                vst1_s8(kptr + 16, vld1_s8(weight_xc_N_2 + i));
                vst1_s8(kptr + 24, vld1_s8(weight_xc_N_3 + i));
                kptr += 32;
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; i + 1 < size; i += 2)
            {
                kptr[0] = weight_xc_N_0[i];
                kptr[1] = weight_xc_N_0[i + 1];
                kptr[2] = weight_xc_N_1[i];
                kptr[3] = weight_xc_N_1[i + 1];
                kptr[4] = weight_xc_N_2[i];
                kptr[5] = weight_xc_N_2[i + 1];
                kptr[6] = weight_xc_N_3[i];
                kptr[7] = weight_xc_N_3[i + 1];

                kptr += 8;
            }
            for (; i < size; i++)
            {
                kptr[0] = weight_xc_N_0[i];
                kptr[1] = weight_xc_N_1[i];
                kptr[2] = weight_xc_N_2[i];
                kptr[3] = weight_xc_N_3[i];

                kptr += 4;
            }

            float32x4_t _xc_R0 = vld1q_f32(weight_xc_int8_scales_ptr + q);
            float32x4_t _xc_U0 = vld1q_f32(weight_xc_int8_scales_ptr + num_output + q);
            float32x4_t _xc_N0 = vld1q_f32(weight_xc_int8_scales_ptr + num_output * 2 + q);
            float32x4_t _hc_R0 = vld1q_f32(weight_hc_int8_scales_ptr + q);
            float32x4_t _hc_U0 = vld1q_f32(weight_hc_int8_scales_ptr + num_output + q);
            float32x4_t _hc_N0 = vld1q_f32(weight_hc_int8_scales_ptr + num_output * 2 + q);

#if __aarch64__
            float32x4_t _one = vdupq_n_f32(1.f);
            float32x4_t _reciprocal_xc_R0 = vdivq_f32(_one, _xc_R0);
            float32x4_t _reciprocal_xc_U0 = vdivq_f32(_one, _xc_U0);
            float32x4_t _reciprocal_xc_N0 = vdivq_f32(_one, _xc_N0);
            float32x4_t _reciprocal_hc_R0 = vdivq_f32(_one, _hc_R0);
            float32x4_t _reciprocal_hc_U0 = vdivq_f32(_one, _hc_U0);
            float32x4_t _reciprocal_hc_N0 = vdivq_f32(_one, _hc_N0);
#else
            float32x4_t _reciprocal_xc_R0 = vrecpeq_f32(_xc_R0);
            float32x4_t _reciprocal_xc_U0 = vrecpeq_f32(_xc_U0);
            float32x4_t _reciprocal_xc_N0 = vrecpeq_f32(_xc_N0);
            _reciprocal_xc_R0 = vmulq_f32(vrecpsq_f32(_xc_R0, _reciprocal_xc_R0), _reciprocal_xc_R0);
            _reciprocal_xc_U0 = vmulq_f32(vrecpsq_f32(_xc_U0, _reciprocal_xc_U0), _reciprocal_xc_U0);
            _reciprocal_xc_N0 = vmulq_f32(vrecpsq_f32(_xc_N0, _reciprocal_xc_N0), _reciprocal_xc_N0);
            _reciprocal_xc_R0 = vmulq_f32(vrecpsq_f32(_xc_R0, _reciprocal_xc_R0), _reciprocal_xc_R0);
            _reciprocal_xc_U0 = vmulq_f32(vrecpsq_f32(_xc_U0, _reciprocal_xc_U0), _reciprocal_xc_U0);
            _reciprocal_xc_N0 = vmulq_f32(vrecpsq_f32(_xc_N0, _reciprocal_xc_N0), _reciprocal_xc_N0);
            float32x4_t _reciprocal_hc_R0 = vrecpeq_f32(_hc_R0);
            float32x4_t _reciprocal_hc_U0 = vrecpeq_f32(_hc_U0);
            float32x4_t _reciprocal_hc_N0 = vrecpeq_f32(_hc_N0);
            _reciprocal_hc_R0 = vmulq_f32(vrecpsq_f32(_hc_R0, _reciprocal_hc_R0), _reciprocal_hc_R0);
            _reciprocal_hc_U0 = vmulq_f32(vrecpsq_f32(_hc_U0, _reciprocal_hc_U0), _reciprocal_hc_U0);
            _reciprocal_hc_N0 = vmulq_f32(vrecpsq_f32(_hc_N0, _reciprocal_hc_N0), _reciprocal_hc_N0);
            _reciprocal_hc_R0 = vmulq_f32(vrecpsq_f32(_hc_R0, _reciprocal_hc_R0), _reciprocal_hc_R0);
            _reciprocal_hc_U0 = vmulq_f32(vrecpsq_f32(_hc_U0, _reciprocal_hc_U0), _reciprocal_hc_U0);
            _reciprocal_hc_N0 = vmulq_f32(vrecpsq_f32(_hc_N0, _reciprocal_hc_N0), _reciprocal_hc_N0);
#endif

            vst1q_f32(descales_ptr, _reciprocal_xc_R0);
            vst1q_f32(descales_ptr + 4, _reciprocal_xc_U0);
            vst1q_f32(descales_ptr + 8, _reciprocal_hc_R0);
            vst1q_f32(descales_ptr + 12, _reciprocal_hc_U0);
            vst1q_f32(descales_ptr + 16, _reciprocal_hc_N0);
            vst1q_f32(descales_ptr + 20, _reciprocal_xc_N0);
        }
#endif // __ARM_NEON
        for (; q < num_output; q++)
        {
            bias_c_RUBNWN[0] = bias_c_R[q];
            bias_c_RUBNWN[1] = bias_c_U[q];
            bias_c_RUBNWN[2] = bias_c_BN[q];
            bias_c_RUBNWN[3] = bias_c_WN[q];

            bias_c_RUBNWN += 4;

            const signed char* weight_xc_R = weight_xc_dr.row<const signed char>(num_output * 0 + q);
            const signed char* weight_xc_U = weight_xc_dr.row<const signed char>(num_output * 1 + q);
            const signed char* weight_xc_N = weight_xc_dr.row<const signed char>(num_output * 2 + q);

            const signed char* weight_hc_R = weight_hc_dr.row<const signed char>(num_output * 0 + q);
            const signed char* weight_hc_U = weight_hc_dr.row<const signed char>(num_output * 1 + q);
            const signed char* weight_hc_N = weight_hc_dr.row<const signed char>(num_output * 2 + q);

#if __ARM_NEON
            signed char* kptr = weight_data_tm_dr.row<signed char>(q / 4 + q % 4);
            float* descales_ptr = weight_data_tm_int8_descales_dr.row(q / 4 + q % 4);
#else
            signed char* kptr = weight_data_tm_dr.row<signed char>(q);
            float* descales_ptr = weight_data_tm_int8_descales_dr.row(q);
#endif // __ARM_NEON

            for (int i = 0; i < size; i++)
            {
                kptr[0] = weight_xc_R[i];
                kptr[1] = weight_xc_U[i];
                kptr += 2;
            }

            for (int i = 0; i < num_output; i++)
            {
                kptr[0] = weight_hc_R[i];
                kptr[1] = weight_hc_U[i];
                kptr += 2;
            }

            for (int i = 0; i < num_output; i++)
            {
                kptr[0] = weight_hc_N[i];
                kptr += 1;
            }

            for (int i = 0; i < size; i++)
            {
                kptr[0] = weight_xc_N[i];
                kptr += 1;
            }

            descales_ptr[0] = 1.f / weight_xc_int8_scales_ptr[num_output * 0 + q];
            descales_ptr[1] = 1.f / weight_xc_int8_scales_ptr[num_output * 1 + q];
            descales_ptr[2] = 1.f / weight_hc_int8_scales_ptr[num_output * 0 + q];
            descales_ptr[3] = 1.f / weight_hc_int8_scales_ptr[num_output * 1 + q];
            descales_ptr[4] = 1.f / weight_hc_int8_scales_ptr[num_output * 2 + q];
            descales_ptr[5] = 1.f / weight_xc_int8_scales_ptr[num_output * 2 + q];
        }
    }
}

// Combine the update gate U and the new gate N with the previous hidden state
// to produce the hidden state for time step ti:
//
//     h_t := (1 - U) .* N + U .* h_{t-1}
//
// gates         per-output gate values. On NEON builds, row q/4 holds four U
//               values followed by four N values for outputs q..q+3; leftover
//               outputs live in row (q/4 + q%4) as { U, N }. On non-NEON
//               builds, row q holds { U, N } for output q.
// hidden_state  read as h_{t-1} and overwritten in place with h_t (fp32).
// top_blob      row ti receives h_t, stored according to elemtype.
// ti            destination row index in top_blob.
// elemtype      1 = fp32, 2 = fp16, 4 = bf16 storage for top_blob.
// opt           supplies num_threads for the OpenMP loops.
static void gru_int8_gate_output(const Mat& gates, Mat& hidden_state, Mat& top_blob, int ti, int elemtype, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2)
    // Built without hardware fp16 conversion: defer to the vfpv4 variant when
    // the running CPU supports it.
    if (ncnn::cpu_support_arm_vfpv4())
    {
        gru_int8_gate_output_vfpv4(gates, hidden_state, top_blob, ti, elemtype, opt);
        return;
    }
#endif

    const int num_output = top_blob.w;

    // h_t := (1 - update) .* new + update .* h_{t-1}
    float* output_data = top_blob.row(ti);

    float* hidden_ptr = hidden_state;

    int remain_num_output_start = 0;
#if __ARM_NEON
    // Process four outputs per iteration.
    int nn_num_output = num_output >> 2;
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int qq = 0; qq < nn_num_output; qq++)
    {
        int q = qq * 4;

        const float* gates_data = gates.row(q / 4);

        float32x4_t _gru_U0 = vld1q_f32(gates_data);
        float32x4_t _gru_N0 = vld1q_f32(gates_data + 4);

        float32x4_t _gru_H0 = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _gru_U0), _gru_N0), vmulq_f32(_gru_U0, vld1q_f32(hidden_ptr + q)));

        // The hidden state is always kept in fp32.
        vst1q_f32(hidden_ptr + q, _gru_H0);

        if (elemtype == 1)
        {
            // fp32
            vst1q_f32(output_data + q, _gru_H0);
        }
        if (elemtype == 2)
        {
            // fp16
            // outptr already points at element q of the fp16 output row.
            unsigned short* outptr = (unsigned short*)output_data + q;
#if (__ARM_FP & 2)
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                "fcvtn  v0.4h, %2.4s        \n"
                "st1    {v0.4h}, [%0]       \n"
                : "=r"(outptr) // %0
                : "0"(outptr),
                "w"(_gru_H0)
                : "memory", "v0");
#else  // __aarch64__
            asm volatile(
                "vcvt.f16.f32 d0, %q2       \n"
                "vst1.u16   {d0}, [%0]      \n"
                : "=r"(outptr) // %0
                : "0"(outptr),
                "w"(_gru_H0)
                : "memory", "q0");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            vst1_u16(outptr, (uint16x4_t)vcvt_f16_f32(_gru_H0));
#endif // NCNN_GNU_INLINE_ASM
#else
            // Software conversion fallback. Index relative to outptr (which
            // already includes the +q offset) so the four values land at
            // elements q..q+3, matching the hardware paths above.
            outptr[0] = float32_to_float16(hidden_ptr[q]);
            outptr[1] = float32_to_float16(hidden_ptr[q + 1]);
            outptr[2] = float32_to_float16(hidden_ptr[q + 2]);
            outptr[3] = float32_to_float16(hidden_ptr[q + 3]);
#endif // (__ARM_FP & 2)
        }
        if (elemtype == 4)
        {
            // bf16
            vst1_u16((unsigned short*)output_data + q, float2bfloat(_gru_H0));
        }
    }
    remain_num_output_start += nn_num_output << 2;
#endif // __ARM_NEON
    // Scalar path: all outputs on non-NEON builds, or the num_output % 4
    // leftovers on NEON builds.
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = remain_num_output_start; q < num_output; q++)
    {
#if __ARM_NEON
        // Leftover outputs are stored one per row after the packed rows.
        const float* gates_data = gates.row(q / 4 + q % 4);
#else
        const float* gates_data = gates.row(q);
#endif

        float U = gates_data[0];
        float N = gates_data[1];

        float H = (1 - U) * N + U * hidden_ptr[q];

        hidden_ptr[q] = H;

        if (elemtype == 1)
        {
            // fp32
            output_data[q] = H;
        }
        if (elemtype == 2)
        {
            // fp16
            ((unsigned short*)output_data)[q] = float32_to_float16(H);
        }
        if (elemtype == 4)
        {
            // bf16
            ((unsigned short*)output_data)[q] = float32_to_bfloat16(H);
        }
    }
}

static void gru_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_descales, Mat& top_blob, int elemtype, int reverse, const Mat& weight_data_tm, const Mat& weight_data_tm_int8_descales, const Mat& bias_c, Mat& hidden_state, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD
    if (ncnn::cpu_support_arm_asimddp())
    {
        gru_int8_asimddp(bottom_blob_int8, bottom_blob_int8_descales, top_blob, elemtype, reverse, weight_data_tm, weight_data_tm_int8_descales, bias_c, hidden_state, opt);
        return;
    }
#endif

    int size = bottom_blob_int8.w;
    int T = bottom_blob_int8.h;

    int num_output = top_blob.w;

    // 2 x num_output
#if __ARM_NEON
    Mat gates(4 * 2, num_output / 4 + num_output % 4, 4u, opt.workspace_allocator);
#else
    Mat gates(2, num_output, 4u, opt.workspace_allocator);
#endif

    Mat hidden_state_int8(num_output, (size_t)1u, 1, opt.workspace_allocator);
    float hidden_state_int8_scale = 1.f;
    float hidden_state_int8_descale = 1.f;

    // unroll
    for (int t = 0; t < T; t++)
    {
        int ti = reverse ? T - 1 - t : t;

        // dynamic quantize hidden_state
        {
            float absmax = 0.f;
            for (int i = 0; i < num_output; i++)
            {
                absmax = std::max(absmax, (float)fabs(hidden_state[i]));
            }

            if (absmax == 0.f)
            {
                hidden_state_int8.fill<signed char>(0);
            }
            else
            {
                hidden_state_int8_scale = 127.f / absmax;
                hidden_state_int8_descale = absmax / 127.f;

                signed char* hs = hidden_state_int8;
                for (int i = 0; i < num_output; i++)
                {
                    hs[i] = float2int8(hidden_state[i] * hidden_state_int8_scale);
                }
            }
        }

        int remain_num_output_start = 0;
#if __ARM_NEON
        int nn_num_output = num_output >> 2;
        remain_num_output_start = nn_num_output << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            const signed char* x = bottom_blob_int8.row<const signed char>(ti);
            const signed char* hs = hidden_state_int8;
            const float descale_x = bottom_blob_int8_descales[ti];
            const float descale_h = hidden_state_int8_descale;

            // gate reset update
            const float* bias_c_RUBNWN = (const float*)bias_c + q * 4;

            const signed char* kptr = weight_data_tm.row<const signed char>(q / 4);

            const float* descales_ptr = weight_data_tm_int8_descales.row(q / 4);

            int32x4_t _gru_Rx0 = vdupq_n_s32(0);
            int32x4_t _gru_Ux0 = vdupq_n_s32(0);
            int i = 0;
#if __ARM_FEATURE_DOTPROD
            int32x4_t _sum1 = vdupq_n_s32(0);
            int32x4_t _sum2 = vdupq_n_s32(0);
            for (; i + 7 < size; i += 8)
            {
                int8x8_t _xi = vld1_s8(x + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                int8x16_t _w2 = vld1q_s8(kptr + 32);
                int8x16_t _w3 = vld1q_s8(kptr + 48);
                _gru_Rx0 = vdotq_lane_s32(_gru_Rx0, _w0, _xi, 0);
                _gru_Ux0 = vdotq_lane_s32(_gru_Ux0, _w1, _xi, 0);
                _sum1 = vdotq_lane_s32(_sum1, _w2, _xi, 1);
                _sum2 = vdotq_lane_s32(_sum2, _w3, _xi, 1);

                kptr += 64;
            }
            _gru_Rx0 = vaddq_s32(_gru_Rx0, _sum1);
            _gru_Ux0 = vaddq_s32(_gru_Ux0, _sum2);
#else
            int32x4_t _sum0 = vdupq_n_s32(0);
            int32x4_t _sum1 = vdupq_n_s32(0);
            int32x4_t _sum2 = vdupq_n_s32(0);
            int32x4_t _sum3 = vdupq_n_s32(0);
            for (; i + 7 < size; i += 8)
            {
#if NCNN_GNU_INLINE_ASM && !__aarch64__
                const signed char* xptr = x + i;

                asm volatile(
                    "vldm       %1!, {d0-d7}        \n"
                    "vld1.s8    {d16}, [%0]         \n"
                    "vdup.32    d17, d16[0]         \n"
                    "vdup.32    d16, d16[1]         \n"
                    "vmull.s8   q4, d0, d17         \n"
                    "vmull.s8   q5, d1, d17         \n"
                    "vmull.s8   q6, d2, d17         \n"
                    "vmull.s8   q7, d3, d17         \n"
                    "vmlal.s8   q4, d4, d16         \n"
                    "vmlal.s8   q5, d5, d16         \n"
                    "vmlal.s8   q6, d6, d16         \n"
                    "vmlal.s8   q7, d7, d16         \n"
                    "vpadal.s16 %q2, q4             \n"
                    "vpadal.s16 %q3, q5             \n"
                    "vpadal.s16 %q4, q6             \n"
                    "vpadal.s16 %q5, q7             \n"
                    : "=r"(xptr), "=r"(kptr), "=w"(_sum0), "=w"(_sum1), "=w"(_sum2), "=w"(_sum3)
                    : "0"(xptr), "1"(kptr), "2"(_sum0), "3"(_sum1), "4"(_sum2), "5"(_sum3)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8");
#else
                int32x2_t _xi01 = vreinterpret_s32_s8(vld1_s8(x + i));
                int8x8_t _xi0 = vreinterpret_s8_s32(vdup_lane_s32(_xi01, 0));
                int8x8_t _xi1 = vreinterpret_s8_s32(vdup_lane_s32(_xi01, 1));
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                int8x16_t _w2 = vld1q_s8(kptr + 32);
                int8x16_t _w3 = vld1q_s8(kptr + 48);

                int16x8_t _s0 = vmull_s8(vget_low_s8(_w0), _xi0);
                int16x8_t _s1 = vmull_s8(vget_high_s8(_w0), _xi0);
                int16x8_t _s2 = vmull_s8(vget_low_s8(_w1), _xi0);
                int16x8_t _s3 = vmull_s8(vget_high_s8(_w1), _xi0);
                _s0 = vmlal_s8(_s0, vget_low_s8(_w2), _xi1);
                _s1 = vmlal_s8(_s1, vget_high_s8(_w2), _xi1);
                _s2 = vmlal_s8(_s2, vget_low_s8(_w3), _xi1);
                _s3 = vmlal_s8(_s3, vget_high_s8(_w3), _xi1);

                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);

                kptr += 64;
#endif
            }
            {
                int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0));
                int32x2_t _s1 = vpadd_s32(vget_low_s32(_sum1), vget_high_s32(_sum1));
                int32x2_t _s2 = vpadd_s32(vget_low_s32(_sum2), vget_high_s32(_sum2));
                int32x2_t _s3 = vpadd_s32(vget_low_s32(_sum3), vget_high_s32(_sum3));
                _gru_Rx0 = vaddq_s32(_gru_Rx0, vcombine_s32(_s0, _s1));
                _gru_Ux0 = vaddq_s32(_gru_Ux0, vcombine_s32(_s2, _s3));
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; i + 3 < size; i += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _xi = vld1_s8(x + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                _gru_Rx0 = vdotq_lane_s32(_gru_Rx0, _w0, _xi, 0);
                _gru_Ux0 = vdotq_lane_s32(_gru_Ux0, _w1, _xi, 0);
#else
                int16x4_t _xi01 = vreinterpret_s16_s8(vld1_s8(x + i));
                int8x8_t _xi0 = vreinterpret_s8_s16(vdup_lane_s16(_xi01, 0));
                int8x8_t _xi1 = vreinterpret_s8_s16(vdup_lane_s16(_xi01, 1));
                int8x16_t _weight_xc_RU0 = vld1q_s8(kptr);
                int8x16_t _weight_xc_RU1 = vld1q_s8(kptr + 16);

                int16x8_t _gru_Rx = vmull_s8(vget_low_s8(_weight_xc_RU0), _xi0);
                int16x8_t _gru_Ux = vmull_s8(vget_high_s8(_weight_xc_RU0), _xi0);
                _gru_Rx = vmlal_s8(_gru_Rx, vget_low_s8(_weight_xc_RU1), _xi1);
                _gru_Ux = vmlal_s8(_gru_Ux, vget_high_s8(_weight_xc_RU1), _xi1);

                _gru_Rx0 = vpadalq_s16(_gru_Rx0, _gru_Rx);
                _gru_Ux0 = vpadalq_s16(_gru_Ux0, _gru_Ux);
#endif // __ARM_FEATURE_DOTPROD

                kptr += 32;
            }
            for (; i + 1 < size; i += 2)
            {
                int8x8_t _xi = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vld1_s8(x + i)), 0));
                int8x16_t _weight_xc_RU = vld1q_s8(kptr);

                int16x8_t _gru_Rx = vmull_s8(vget_low_s8(_weight_xc_RU), _xi);
                int16x8_t _gru_Ux = vmull_s8(vget_high_s8(_weight_xc_RU), _xi);

                _gru_Rx0 = vpadalq_s16(_gru_Rx0, _gru_Rx);
                _gru_Ux0 = vpadalq_s16(_gru_Ux0, _gru_Ux);

                kptr += 16;
            }
            for (; i < size; i++)
            {
                int8x8_t _xi = vdup_n_s8(x[i]);
                int8x8_t _weight_xc_RU = vld1_s8(kptr);

                int16x8_t _gru_RxUx = vmull_s8(_weight_xc_RU, _xi);
                _gru_Rx0 = vaddw_s16(_gru_Rx0, vget_low_s16(_gru_RxUx));
                _gru_Ux0 = vaddw_s16(_gru_Ux0, vget_high_s16(_gru_RxUx));

                kptr += 8;
            }

            int32x4_t _gru_Rh0 = vdupq_n_s32(0);
            int32x4_t _gru_Uh0 = vdupq_n_s32(0);
            i = 0;
#if __ARM_FEATURE_DOTPROD
            _sum1 = vdupq_n_s32(0);
            _sum2 = vdupq_n_s32(0);
            for (; i + 7 < num_output; i += 8)
            {
                int8x8_t _h_cont = vld1_s8(hs + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                int8x16_t _w2 = vld1q_s8(kptr + 32);
                int8x16_t _w3 = vld1q_s8(kptr + 48);
                _gru_Rh0 = vdotq_lane_s32(_gru_Rh0, _w0, _h_cont, 0);
                _gru_Uh0 = vdotq_lane_s32(_gru_Uh0, _w1, _h_cont, 0);
                _sum1 = vdotq_lane_s32(_sum1, _w2, _h_cont, 1);
                _sum2 = vdotq_lane_s32(_sum2, _w3, _h_cont, 1);

                kptr += 64;
            }
            _gru_Rh0 = vaddq_s32(_gru_Rh0, _sum1);
            _gru_Uh0 = vaddq_s32(_gru_Uh0, _sum2);
#else
            _sum0 = vdupq_n_s32(0);
            _sum1 = vdupq_n_s32(0);
            _sum2 = vdupq_n_s32(0);
            _sum3 = vdupq_n_s32(0);
            for (; i + 7 < num_output; i += 8)
            {
#if NCNN_GNU_INLINE_ASM && !__aarch64__
                const signed char* hsptr = hs + i;

                asm volatile(
                    "vldm       %1!, {d0-d7}        \n"
                    "vld1.s8    {d16}, [%0]         \n"
                    "vdup.32    d17, d16[0]         \n"
                    "vdup.32    d16, d16[1]         \n"
                    "vmull.s8   q4, d0, d17         \n"
                    "vmull.s8   q5, d1, d17         \n"
                    "vmull.s8   q6, d2, d17         \n"
                    "vmull.s8   q7, d3, d17         \n"
                    "vmlal.s8   q4, d4, d16         \n"
                    "vmlal.s8   q5, d5, d16         \n"
                    "vmlal.s8   q6, d6, d16         \n"
                    "vmlal.s8   q7, d7, d16         \n"
                    "vpadal.s16 %q2, q4             \n"
                    "vpadal.s16 %q3, q5             \n"
                    "vpadal.s16 %q4, q6             \n"
                    "vpadal.s16 %q5, q7             \n"
                    : "=r"(hsptr), "=r"(kptr), "=w"(_sum0), "=w"(_sum1), "=w"(_sum2), "=w"(_sum3)
                    : "0"(hsptr), "1"(kptr), "2"(_sum0), "3"(_sum1), "4"(_sum2), "5"(_sum3)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8");
#else
                int32x2_t _h_cont01 = vreinterpret_s32_s8(vld1_s8(hs + i));
                int8x8_t _h_cont0 = vreinterpret_s8_s32(vdup_lane_s32(_h_cont01, 0));
                int8x8_t _h_cont1 = vreinterpret_s8_s32(vdup_lane_s32(_h_cont01, 1));
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                int8x16_t _w2 = vld1q_s8(kptr + 32);
                int8x16_t _w3 = vld1q_s8(kptr + 48);

                int16x8_t _s0 = vmull_s8(vget_low_s8(_w0), _h_cont0);
                int16x8_t _s1 = vmull_s8(vget_high_s8(_w0), _h_cont0);
                int16x8_t _s2 = vmull_s8(vget_low_s8(_w1), _h_cont0);
                int16x8_t _s3 = vmull_s8(vget_high_s8(_w1), _h_cont0);
                _s0 = vmlal_s8(_s0, vget_low_s8(_w2), _h_cont1);
                _s1 = vmlal_s8(_s1, vget_high_s8(_w2), _h_cont1);
                _s2 = vmlal_s8(_s2, vget_low_s8(_w3), _h_cont1);
                _s3 = vmlal_s8(_s3, vget_high_s8(_w3), _h_cont1);

                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);

                kptr += 64;
#endif
            }
            {
                int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0));
                int32x2_t _s1 = vpadd_s32(vget_low_s32(_sum1), vget_high_s32(_sum1));
                int32x2_t _s2 = vpadd_s32(vget_low_s32(_sum2), vget_high_s32(_sum2));
                int32x2_t _s3 = vpadd_s32(vget_low_s32(_sum3), vget_high_s32(_sum3));
                _gru_Rh0 = vaddq_s32(_gru_Rh0, vcombine_s32(_s0, _s1));
                _gru_Uh0 = vaddq_s32(_gru_Uh0, vcombine_s32(_s2, _s3));
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; i + 3 < num_output; i += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _h_cont = vld1_s8(hs + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                _gru_Rh0 = vdotq_lane_s32(_gru_Rh0, _w0, _h_cont, 0);
                _gru_Uh0 = vdotq_lane_s32(_gru_Uh0, _w1, _h_cont, 0);
#else
                int16x4_t _h_cont01 = vreinterpret_s16_s8(vld1_s8(hs + i));
                int8x8_t _h_cont0 = vreinterpret_s8_s16(vdup_lane_s16(_h_cont01, 0));
                int8x8_t _h_cont1 = vreinterpret_s8_s16(vdup_lane_s16(_h_cont01, 1));
                int8x16_t _weight_hc_RU0 = vld1q_s8(kptr);
                int8x16_t _weight_hc_RU1 = vld1q_s8(kptr + 16);

                int16x8_t _gru_Rh = vmull_s8(vget_low_s8(_weight_hc_RU0), _h_cont0);
                int16x8_t _gru_Uh = vmull_s8(vget_high_s8(_weight_hc_RU0), _h_cont0);
                _gru_Rh = vmlal_s8(_gru_Rh, vget_low_s8(_weight_hc_RU1), _h_cont1);
                _gru_Uh = vmlal_s8(_gru_Uh, vget_high_s8(_weight_hc_RU1), _h_cont1);

                _gru_Rh0 = vpadalq_s16(_gru_Rh0, _gru_Rh);
                _gru_Uh0 = vpadalq_s16(_gru_Uh0, _gru_Uh);
#endif // __ARM_FEATURE_DOTPROD

                kptr += 32;
            }
            for (; i + 1 < num_output; i += 2)
            {
                int8x8_t _h_cont = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vld1_s8(hs + i)), 0));
                int8x16_t _weight_hc_RU = vld1q_s8(kptr);

                int16x8_t _gru_Rh = vmull_s8(vget_low_s8(_weight_hc_RU), _h_cont);
                int16x8_t _gru_Uh = vmull_s8(vget_high_s8(_weight_hc_RU), _h_cont);

                _gru_Rh0 = vpadalq_s16(_gru_Rh0, _gru_Rh);
                _gru_Uh0 = vpadalq_s16(_gru_Uh0, _gru_Uh);

                kptr += 16;
            }
            for (; i < num_output; i++)
            {
                int8x8_t _h_cont = vdup_n_s8(hs[i]);
                int8x8_t _weight_hc_RU = vld1_s8(kptr);

                int16x8_t _gru_RhUh = vmull_s8(_weight_hc_RU, _h_cont);
                _gru_Rh0 = vaddw_s16(_gru_Rh0, vget_low_s16(_gru_RhUh));
                _gru_Uh0 = vaddw_s16(_gru_Uh0, vget_high_s16(_gru_RhUh));

                kptr += 8;
            }

            float32x4_t _descale_x = vdupq_n_f32(descale_x);
            float32x4_t _descale_h = vdupq_n_f32(descale_h);

            float32x4_t _gru_R0 = vld1q_f32(bias_c_RUBNWN);
            float32x4_t _gru_U0 = vld1q_f32(bias_c_RUBNWN + 4);

            float32x4_t _descale_xc_R0 = vld1q_f32(descales_ptr);
            float32x4_t _descale_xc_U0 = vld1q_f32(descales_ptr + 4);

            _gru_R0 = vmlaq_f32(_gru_R0, vcvtq_f32_s32(_gru_Rx0), vmulq_f32(_descale_x, _descale_xc_R0));
            _gru_U0 = vmlaq_f32(_gru_U0, vcvtq_f32_s32(_gru_Ux0), vmulq_f32(_descale_x, _descale_xc_U0));

            float32x4_t _descale_hc_R0 = vld1q_f32(descales_ptr + 8);
            float32x4_t _descale_hc_U0 = vld1q_f32(descales_ptr + 12);

            _gru_R0 = vmlaq_f32(_gru_R0, vcvtq_f32_s32(_gru_Rh0), vmulq_f32(_descale_h, _descale_hc_R0));
            _gru_U0 = vmlaq_f32(_gru_U0, vcvtq_f32_s32(_gru_Uh0), vmulq_f32(_descale_h, _descale_hc_U0));

            // sigmoid(R)
            // sigmoid(U)
            _gru_R0 = sigmoid_ps(_gru_R0);
            _gru_U0 = sigmoid_ps(_gru_U0);

            // gate new

            int32x4_t _gru_Nh0 = vdupq_n_s32(0);
            i = 0;
#if __ARM_FEATURE_DOTPROD
            _sum1 = vdupq_n_s32(0);
            for (; i + 7 < num_output; i += 8)
            {
                int8x8_t _h_cont = vld1_s8(hs + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                _gru_Nh0 = vdotq_lane_s32(_gru_Nh0, _w0, _h_cont, 0);
                _sum1 = vdotq_lane_s32(_sum1, _w1, _h_cont, 1);

                kptr += 32;
            }
            _gru_Nh0 = vaddq_s32(_gru_Nh0, _sum1);
#else
            _sum0 = vdupq_n_s32(0);
            _sum1 = vdupq_n_s32(0);
            _sum2 = vdupq_n_s32(0);
            _sum3 = vdupq_n_s32(0);
            for (; i + 15 < num_output; i += 16)
            {
#if NCNN_GNU_INLINE_ASM && !__aarch64__
                const signed char* hsptr = hs + i;

                asm volatile(
                    "vldm       %1!, {d0-d7}        \n"
                    "vld1.s8    {d16-d17}, [%0]     \n"
                    "vmull.s8   q4, d0, d16         \n"
                    "vmull.s8   q5, d1, d16         \n"
                    "vmull.s8   q6, d2, d16         \n"
                    "vmull.s8   q7, d3, d16         \n"
                    "vmlal.s8   q4, d4, d17         \n"
                    "vmlal.s8   q5, d5, d17         \n"
                    "vmlal.s8   q6, d6, d17         \n"
                    "vmlal.s8   q7, d7, d17         \n"
                    "vpadal.s16 %q2, q4             \n"
                    "vpadal.s16 %q3, q5             \n"
                    "vpadal.s16 %q4, q6             \n"
                    "vpadal.s16 %q5, q7             \n"
                    : "=r"(hsptr), "=r"(kptr), "=w"(_sum0), "=w"(_sum1), "=w"(_sum2), "=w"(_sum3)
                    : "0"(hsptr), "1"(kptr), "2"(_sum0), "3"(_sum1), "4"(_sum2), "5"(_sum3)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8");
#else
                int8x16_t _h_cont = vld1q_s8(hs + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                int8x16_t _w2 = vld1q_s8(kptr + 32);
                int8x16_t _w3 = vld1q_s8(kptr + 48);

                int16x8_t _s0 = vmull_s8(vget_low_s8(_w0), vget_low_s8(_h_cont));
                int16x8_t _s1 = vmull_s8(vget_high_s8(_w0), vget_low_s8(_h_cont));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_w1), vget_low_s8(_h_cont));
                int16x8_t _s3 = vmull_s8(vget_high_s8(_w1), vget_low_s8(_h_cont));
                _s0 = vmlal_s8(_s0, vget_low_s8(_w2), vget_high_s8(_h_cont));
                _s1 = vmlal_s8(_s1, vget_high_s8(_w2), vget_high_s8(_h_cont));
                _s2 = vmlal_s8(_s2, vget_low_s8(_w3), vget_high_s8(_h_cont));
                _s3 = vmlal_s8(_s3, vget_high_s8(_w3), vget_high_s8(_h_cont));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);

                kptr += 64;
#endif
            }
            for (; i + 7 < num_output; i += 8)
            {
                int8x8_t _h_cont = vld1_s8(hs + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);

                int16x8_t _s0 = vmull_s8(vget_low_s8(_w0), _h_cont);
                int16x8_t _s1 = vmull_s8(vget_high_s8(_w0), _h_cont);
                int16x8_t _s2 = vmull_s8(vget_low_s8(_w1), _h_cont);
                int16x8_t _s3 = vmull_s8(vget_high_s8(_w1), _h_cont);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);

                kptr += 32;
            }
            {
                int32x4x2_t _tmp0 = vzipq_s32(_sum0, _sum1);
                int32x4x2_t _tmp1 = vzipq_s32(_sum2, _sum3);
                _sum0 = vcombine_s32(vget_low_s32(_tmp0.val[0]), vget_low_s32(_tmp1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_tmp0.val[0]), vget_high_s32(_tmp1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_tmp0.val[1]), vget_low_s32(_tmp1.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_tmp0.val[1]), vget_high_s32(_tmp1.val[1]));
            }
            _gru_Nh0 = vaddq_s32(_gru_Nh0, _sum0);
            _gru_Nh0 = vaddq_s32(_gru_Nh0, _sum1);
            _gru_Nh0 = vaddq_s32(_gru_Nh0, _sum2);
            _gru_Nh0 = vaddq_s32(_gru_Nh0, _sum3);
#endif // __ARM_FEATURE_DOTPROD
            for (; i + 3 < num_output; i += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _h_cont = vld1_s8(hs + i);
                int8x16_t _w = vld1q_s8(kptr);
                _gru_Nh0 = vdotq_lane_s32(_gru_Nh0, _w, _h_cont, 0);
#else
                int16x4_t _h_cont01 = vreinterpret_s16_s8(vld1_s8(hs + i));
                int8x8_t _h_cont0 = vreinterpret_s8_s16(vdup_lane_s16(_h_cont01, 0));
                int8x8_t _h_cont1 = vreinterpret_s8_s16(vdup_lane_s16(_h_cont01, 1));
                int8x16_t _w01 = vld1q_s8(kptr);

                int16x8_t _gru_Nh = vmull_s8(vget_low_s8(_w01), _h_cont0);
                _gru_Nh = vmlal_s8(_gru_Nh, vget_high_s8(_w01), _h_cont1);
                _gru_Nh0 = vpadalq_s16(_gru_Nh0, _gru_Nh);
#endif // __ARM_FEATURE_DOTPROD

                kptr += 16;
            }
            for (; i + 1 < num_output; i += 2)
            {
                int8x8_t _h_cont = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vld1_s8(hs + i)), 0));
                int8x8_t _w = vld1_s8(kptr);

                int16x8_t _gru_Nh = vmull_s8(_w, _h_cont);
                _gru_Nh0 = vpadalq_s16(_gru_Nh0, _gru_Nh);

                kptr += 8;
            }
            for (; i < num_output; i++)
            {
                int8x8_t _h_cont = vdup_n_s8(hs[i]);
                int8x8_t _w = vld1_s8(kptr);

                int16x8_t _gru_Nh = vmull_s8(_w, _h_cont);
                _gru_Nh0 = vaddw_s16(_gru_Nh0, vget_low_s16(_gru_Nh));

                kptr += 4;
            }

            int32x4_t _gru_Nx0 = vdupq_n_s32(0);
            i = 0;
#if __ARM_FEATURE_DOTPROD
            _sum1 = vdupq_n_s32(0);
            for (; i + 7 < size; i += 8)
            {
                int8x8_t _xi = vld1_s8(x + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                _gru_Nx0 = vdotq_lane_s32(_gru_Nx0, _w0, _xi, 0);
                _sum1 = vdotq_lane_s32(_sum1, _w1, _xi, 1);

                kptr += 32;
            }
            _gru_Nx0 = vaddq_s32(_gru_Nx0, _sum1);
#else
            _sum0 = vdupq_n_s32(0);
            _sum1 = vdupq_n_s32(0);
            _sum2 = vdupq_n_s32(0);
            _sum3 = vdupq_n_s32(0);
            for (; i + 15 < size; i += 16)
            {
#if NCNN_GNU_INLINE_ASM && !__aarch64__
                const signed char* xptr = x + i;

                asm volatile(
                    "vldm       %1!, {d0-d7}        \n"
                    "vld1.s8    {d16-d17}, [%0]     \n"
                    "vmull.s8   q4, d0, d16         \n"
                    "vmull.s8   q5, d1, d16         \n"
                    "vmull.s8   q6, d2, d16         \n"
                    "vmull.s8   q7, d3, d16         \n"
                    "vmlal.s8   q4, d4, d17         \n"
                    "vmlal.s8   q5, d5, d17         \n"
                    "vmlal.s8   q6, d6, d17         \n"
                    "vmlal.s8   q7, d7, d17         \n"
                    "vpadal.s16 %q2, q4             \n"
                    "vpadal.s16 %q3, q5             \n"
                    "vpadal.s16 %q4, q6             \n"
                    "vpadal.s16 %q5, q7             \n"
                    : "=r"(xptr), "=r"(kptr), "=w"(_sum0), "=w"(_sum1), "=w"(_sum2), "=w"(_sum3)
                    : "0"(xptr), "1"(kptr), "2"(_sum0), "3"(_sum1), "4"(_sum2), "5"(_sum3)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8");
#else
                int8x16_t _xi = vld1q_s8(x + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                int8x16_t _w2 = vld1q_s8(kptr + 32);
                int8x16_t _w3 = vld1q_s8(kptr + 48);

                int16x8_t _s0 = vmull_s8(vget_low_s8(_w0), vget_low_s8(_xi));
                int16x8_t _s1 = vmull_s8(vget_high_s8(_w0), vget_low_s8(_xi));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_w1), vget_low_s8(_xi));
                int16x8_t _s3 = vmull_s8(vget_high_s8(_w1), vget_low_s8(_xi));
                _s0 = vmlal_s8(_s0, vget_low_s8(_w2), vget_high_s8(_xi));
                _s1 = vmlal_s8(_s1, vget_high_s8(_w2), vget_high_s8(_xi));
                _s2 = vmlal_s8(_s2, vget_low_s8(_w3), vget_high_s8(_xi));
                _s3 = vmlal_s8(_s3, vget_high_s8(_w3), vget_high_s8(_xi));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);

                kptr += 64;
#endif
            }
            for (; i + 7 < size; i += 8)
            {
                int8x8_t _xi = vld1_s8(x + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);

                int16x8_t _s0 = vmull_s8(vget_low_s8(_w0), _xi);
                int16x8_t _s1 = vmull_s8(vget_high_s8(_w0), _xi);
                int16x8_t _s2 = vmull_s8(vget_low_s8(_w1), _xi);
                int16x8_t _s3 = vmull_s8(vget_high_s8(_w1), _xi);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);

                kptr += 32;
            }
            {
                int32x4x2_t _tmp0 = vzipq_s32(_sum0, _sum1);
                int32x4x2_t _tmp1 = vzipq_s32(_sum2, _sum3);
                _sum0 = vcombine_s32(vget_low_s32(_tmp0.val[0]), vget_low_s32(_tmp1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_tmp0.val[0]), vget_high_s32(_tmp1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_tmp0.val[1]), vget_low_s32(_tmp1.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_tmp0.val[1]), vget_high_s32(_tmp1.val[1]));
            }
            _gru_Nx0 = vaddq_s32(_gru_Nx0, _sum0);
            _gru_Nx0 = vaddq_s32(_gru_Nx0, _sum1);
            _gru_Nx0 = vaddq_s32(_gru_Nx0, _sum2);
            _gru_Nx0 = vaddq_s32(_gru_Nx0, _sum3);
#endif // __ARM_FEATURE_DOTPROD
            for (; i + 3 < size; i += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _xi = vld1_s8(x + i);
                int8x16_t _w = vld1q_s8(kptr);
                _gru_Nx0 = vdotq_lane_s32(_gru_Nx0, _w, _xi, 0);
#else
                int16x4_t _xi01 = vreinterpret_s16_s8(vld1_s8(x + i));
                int8x8_t _xi0 = vreinterpret_s8_s16(vdup_lane_s16(_xi01, 0));
                int8x8_t _xi1 = vreinterpret_s8_s16(vdup_lane_s16(_xi01, 1));
                int8x16_t _w01 = vld1q_s8(kptr);

                int16x8_t _gru_Nx = vmull_s8(vget_low_s8(_w01), _xi0);
                _gru_Nx = vmlal_s8(_gru_Nx, vget_high_s8(_w01), _xi1);
                _gru_Nx0 = vpadalq_s16(_gru_Nx0, _gru_Nx);
#endif // __ARM_FEATURE_DOTPROD

                kptr += 16;
            }
            for (; i + 1 < size; i += 2)
            {
                int8x8_t _xi = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vld1_s8(x + i)), 0));
                int8x8_t _w = vld1_s8(kptr);

                int16x8_t _gru_Nx = vmull_s8(_w, _xi);
                _gru_Nx0 = vpadalq_s16(_gru_Nx0, _gru_Nx);

                kptr += 8;
            }
            for (; i < size; i++)
            {
                int8x8_t _xi = vdup_n_s8(x[i]);
                int8x8_t _w = vld1_s8(kptr);

                int16x8_t _gru_Nx = vmull_s8(_w, _xi);
                _gru_Nx0 = vaddw_s16(_gru_Nx0, vget_low_s16(_gru_Nx));

                kptr += 4;
            }

            float32x4_t _gru_N0 = vld1q_f32(bias_c_RUBNWN + 8);

            float32x4_t _descale_hc_N0 = vld1q_f32(descales_ptr + 16);

            _gru_N0 = vmlaq_f32(_gru_N0, vcvtq_f32_s32(_gru_Nh0), vmulq_f32(_descale_h, _descale_hc_N0));

            _gru_N0 = vmlaq_f32(vld1q_f32(bias_c_RUBNWN + 12), _gru_R0, _gru_N0);

            float32x4_t _descale_xc_N0 = vld1q_f32(descales_ptr + 20);

            _gru_N0 = vmlaq_f32(_gru_N0, vcvtq_f32_s32(_gru_Nx0), vmulq_f32(_descale_x, _descale_xc_N0));

            // tanh(N)
            _gru_N0 = tanh_ps(_gru_N0);

            float* gates_data = gates.row(q / 4);

            vst1q_f32(gates_data, _gru_U0);
            vst1q_f32(gates_data + 4, _gru_N0);
        }
#endif // __ARM_NEON
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
            const signed char* x = bottom_blob_int8.row<const signed char>(ti);
            const signed char* hs = hidden_state_int8;
            const float descale_x = bottom_blob_int8_descales[ti];
            const float descale_h = hidden_state_int8_descale;

            // gate reset update
            const float* bias_c_RUBNWN = (const float*)bias_c + q * 4;

#if __ARM_NEON
            const signed char* kptr = weight_data_tm.row<const signed char>(q / 4 + q % 4);
            const float* descales_ptr = weight_data_tm_int8_descales.row(q / 4 + q % 4);
#else
            const signed char* kptr = weight_data_tm.row<const signed char>(q);
            const float* descales_ptr = weight_data_tm_int8_descales.row(q);
#endif

            const float descale_xc_R = descales_ptr[0];
            const float descale_xc_U = descales_ptr[1];
            const float descale_hc_R = descales_ptr[2];
            const float descale_hc_U = descales_ptr[3];
            const float descale_hc_N = descales_ptr[4];
            const float descale_xc_N = descales_ptr[5];

            int Rx = 0;
            int Ux = 0;
            for (int i = 0; i < size; i++)
            {
                signed char xi = x[i];

                Rx += kptr[0] * xi;
                Ux += kptr[1] * xi;

                kptr += 2;
            }

            int Rh = 0;
            int Uh = 0;
            for (int i = 0; i < num_output; i++)
            {
                signed char h_cont = hs[i];

                Rh += kptr[0] * h_cont;
                Uh += kptr[1] * h_cont;

                kptr += 2;
            }

            float R = bias_c_RUBNWN[0] + Rx * (descale_x * descale_xc_R) + Rh * (descale_h * descale_hc_R);
            float U = bias_c_RUBNWN[1] + Ux * (descale_x * descale_xc_U) + Uh * (descale_h * descale_hc_U);

            // sigmoid(R)
            // sigmoid(U)
            R = 1.f / (1.f + expf(-R));
            U = 1.f / (1.f + expf(-U));

            // gate new

            int Nh = 0;
            for (int i = 0; i < num_output; i++)
            {
                Nh += kptr[0] * hs[i];
                kptr += 1;
            }

            int Nx = 0;
            for (int i = 0; i < size; i++)
            {
                Nx += kptr[0] * x[i];
                kptr += 1;
            }

            float N = bias_c_RUBNWN[2] + Nh * (descale_h * descale_hc_N);
            N = bias_c_RUBNWN[3] + R * N + Nx * (descale_x * descale_xc_N);

            // tanh(N)
            N = tanhf(N);

#if __ARM_NEON
            float* gates_data = gates.row(q / 4 + q % 4);
#else
            float* gates_data = gates.row(q);
#endif

            gates_data[0] = U;
            gates_data[1] = N;
        }

        gru_int8_gate_output(gates, hidden_state, top_blob, ti, elemtype, opt);
    }
}


================================================
FILE: src/layer/arm/hardsigmoid_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "hardsigmoid_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

HardSigmoid_arm::HardSigmoid_arm()
{
    // bf16 storage is gated only by the build option, not by NEON.
#if NCNN_BF16
    support_bf16_storage = true;
#endif // NCNN_BF16

    // NEON builds enable the packing flag.
#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON

    // fp16 storage additionally requires the running CPU to report asimdhp.
#if __ARM_NEON && NCNN_ARM82
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82
}

// Apply the hard sigmoid activation to bottom_top_blob in place.
//
// 16-bit blobs are handed to the fp16 or bf16 variants when the corresponding
// storage option is enabled; every other blob is processed here as float.
// The NEON code computes min(max(x * alpha + beta, 0), 1), while the scalar
// tail picks 0, 1 or x * alpha + beta by comparing x against lower / upper.
// NOTE(review): alpha, beta, lower and upper are not declared in this file;
// presumably they are members inherited from HardSigmoid - confirm.
//
// Returns 0 from the float path, otherwise whatever the selected variant returns.
int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int elembits = bottom_top_blob.elembits();

#if NCNN_ARM82
    // fp16 storage: pick the fp16-arithmetic or the fp32-arithmetic variant
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_inplace_fp16sa(bottom_top_blob, opt);
        else
            return forward_inplace_fp16s(bottom_top_blob, opt);
    }
#endif

#if NCNN_BF16
    // bf16 storage
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    // each channel is walked as one flat run of `size` floats
    int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        int i = 0;
#if __ARM_NEON
        // broadcast constants shared by all vector loops below
        float32x4_t _zero = vdupq_n_f32(0.f);
        float32x4_t _one = vdupq_n_f32(1.f);
        float32x4_t _alpha = vdupq_n_f32(alpha);
        float32x4_t _beta = vdupq_n_f32(beta);
        // 16 floats per iteration
        for (; i + 15 < size; i += 16)
        {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            // load 16 floats, compute beta + x * alpha, clamp to [0, 1],
            // store back and post-increment ptr by 64 bytes
            asm volatile(
                "prfm   pldl1keep, [%0, #512]   \n"
                "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
                "mov    v4.16b, %5.16b          \n"
                "mov    v5.16b, %5.16b          \n"
                "mov    v6.16b, %5.16b          \n"
                "mov    v7.16b, %5.16b          \n"
                "fmla   v4.4s, v0.4s, %4.4s     \n"
                "fmla   v5.4s, v1.4s, %4.4s     \n"
                "fmla   v6.4s, v2.4s, %4.4s     \n"
                "fmla   v7.4s, v3.4s, %4.4s     \n"
                "fmax   v0.4s, v4.4s, %2.4s     \n"
                "fmax   v1.4s, v5.4s, %2.4s     \n"
                "fmax   v2.4s, v6.4s, %2.4s     \n"
                "fmax   v3.4s, v7.4s, %2.4s     \n"
                "fmin   v0.4s, v0.4s, %3.4s     \n"
                "fmin   v1.4s, v1.4s, %3.4s     \n"
                "fmin   v2.4s, v2.4s, %3.4s     \n"
                "fmin   v3.4s, v3.4s, %3.4s     \n"
                "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_zero),  // %2
                "w"(_one),   // %3
                "w"(_alpha), // %4
                "w"(_beta)   // %5
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else  // __aarch64__
            // same computation for 32-bit ARM; the load leaves ptr untouched
            // and the store writes back the advanced address
            asm volatile(
                "pld        [%0, #512]      \n"
                "vldm       %0, {d0-d7}     \n"
                "vmov       q4, %q5         \n"
                "vmov       q5, %q5         \n"
                "vmov       q6, %q5         \n"
                "vmov       q7, %q5         \n"
                "vmla.f32   q4, q0, %q4     \n"
                "vmla.f32   q5, q1, %q4     \n"
                "vmla.f32   q6, q2, %q4     \n"
                "vmla.f32   q7, q3, %q4     \n"
                "vmax.f32   q0, q4, %q2     \n"
                "vmax.f32   q1, q5, %q2     \n"
                "vmax.f32   q2, q6, %q2     \n"
                "vmax.f32   q3, q7, %q2     \n"
                "vmin.f32   q0, q0, %q3     \n"
                "vmin.f32   q1, q1, %q3     \n"
                "vmin.f32   q2, q2, %q3     \n"
                "vmin.f32   q3, q3, %q3     \n"
                "vstm       %0!, {d0-d7}    \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_zero),  // %2
                "w"(_one),   // %3
                "w"(_alpha), // %4
                "w"(_beta)   // %5
                : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            // intrinsics fallback used when inline asm is disabled
            float32x4_t _p0 = vld1q_f32(ptr);
            float32x4_t _p1 = vld1q_f32(ptr + 4);
            float32x4_t _p2 = vld1q_f32(ptr + 8);
            float32x4_t _p3 = vld1q_f32(ptr + 12);
            _p0 = vmlaq_f32(_beta, _p0, _alpha);
            _p1 = vmlaq_f32(_beta, _p1, _alpha);
            _p2 = vmlaq_f32(_beta, _p2, _alpha);
            _p3 = vmlaq_f32(_beta, _p3, _alpha);
            _p0 = vmaxq_f32(_p0, _zero);
            _p1 = vmaxq_f32(_p1, _zero);
            _p2 = vmaxq_f32(_p2, _zero);
            _p3 = vmaxq_f32(_p3, _zero);
            _p0 = vminq_f32(_p0, _one);
            _p1 = vminq_f32(_p1, _one);
            _p2 = vminq_f32(_p2, _one);
            _p3 = vminq_f32(_p3, _one);
            vst1q_f32(ptr, _p0);
            vst1q_f32(ptr + 4, _p1);
            vst1q_f32(ptr + 8, _p2);
            vst1q_f32(ptr + 12, _p3);
            ptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 8 floats per iteration
        for (; i + 7 < size; i += 8)
        {
            float32x4_t _p0 = vld1q_f32(ptr);
            float32x4_t _p1 = vld1q_f32(ptr + 4);
            _p0 = vmlaq_f32(_beta, _p0, _alpha);
            _p1 = vmlaq_f32(_beta, _p1, _alpha);
            _p0 = vmaxq_f32(_p0, _zero);
            _p1 = vmaxq_f32(_p1, _zero);
            _p0 = vminq_f32(_p0, _one);
            _p1 = vminq_f32(_p1, _one);
            vst1q_f32(ptr, _p0);
            vst1q_f32(ptr + 4, _p1);
            ptr += 8;
        }
        // 4 floats per iteration
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = vld1q_f32(ptr);
            _p = vmlaq_f32(_beta, _p, _alpha);
            _p = vmaxq_f32(_p, _zero);
            _p = vminq_f32(_p, _one);
            vst1q_f32(ptr, _p);
            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar tail; handles every element when NEON is not compiled in
        for (; i < size; i++)
        {
            if (*ptr < lower)
                *ptr = 0.f;
            else if (*ptr > upper)
                *ptr = 1.f;
            else
                *ptr = *ptr * alpha + beta;

            ptr++;
        }
    }

    return 0;
}

#if NCNN_BF16
// Hard sigmoid applied in place to a blob whose elements are 16-bit bfloat16 values.
//
// Each value is widened to float, transformed with
// min(max(x * alpha + beta, 0), 1) in the NEON paths (or the lower / upper
// comparison in the scalar tail), and narrowed back to 16 bits.
// NOTE(review): bfloat2float, float2bfloat, bfloat16_to_float32 and
// float32_to_bfloat16 are defined outside this file; their exact rounding
// behaviour is not visible here - confirm before relying on it.
//
// Always returns 0.
int HardSigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    // each channel is walked as one flat run of `size` 16-bit values
    int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        unsigned short* ptr = bottom_top_blob.channel(q);

        int i = 0;
#if __ARM_NEON
        // broadcast constants shared by all vector loops below
        float32x4_t _zero = vdupq_n_f32(0.f);
        float32x4_t _one = vdupq_n_f32(1.f);
        float32x4_t _alpha = vdupq_n_f32(alpha);
        float32x4_t _beta = vdupq_n_f32(beta);
        // 16 values per iteration
        for (; i + 15 < size; i += 16)
        {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            // load 16 halfwords, shift left by 16 to form floats, compute
            // beta + x * alpha, clamp to [0, 1], keep the upper 16 bits of
            // each result, store and post-increment ptr by 32 bytes
            asm volatile(
                "prfm   pldl1keep, [%0, #256]   \n"
                "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%0] \n"
                "shll   v0.4s, v0.4h, #16       \n"
                "shll   v1.4s, v1.4h, #16       \n"
                "shll   v2.4s, v2.4h, #16       \n"
                "shll   v3.4s, v3.4h, #16       \n"
                "mov    v4.16b, %5.16b          \n"
                "mov    v5.16b, %5.16b          \n"
                "mov    v6.16b, %5.16b          \n"
                "mov    v7.16b, %5.16b          \n"
                "fmla   v4.4s, v0.4s, %4.4s     \n"
                "fmla   v5.4s, v1.4s, %4.4s     \n"
                "fmla   v6.4s, v2.4s, %4.4s     \n"
                "fmla   v7.4s, v3.4s, %4.4s     \n"
                "fmax   v0.4s, v4.4s, %2.4s     \n"
                "fmax   v1.4s, v5.4s, %2.4s     \n"
                "fmax   v2.4s, v6.4s, %2.4s     \n"
                "fmax   v3.4s, v7.4s, %2.4s     \n"
                "fmin   v0.4s, v0.4s, %3.4s     \n"
                "fmin   v1.4s, v1.4s, %3.4s     \n"
                "fmin   v2.4s, v2.4s, %3.4s     \n"
                "fmin   v3.4s, v3.4s, %3.4s     \n"
                "shrn   v0.4h, v0.4s, #16       \n"
                "shrn   v1.4h, v1.4s, #16       \n"
                "shrn   v2.4h, v2.4s, #16       \n"
                "shrn   v3.4h, v3.4s, #16       \n"
                "st1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%0], #32 \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_zero),  // %2
                "w"(_one),   // %3
                "w"(_alpha), // %4
                "w"(_beta)   // %5
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else  // __aarch64__
            // same computation for 32-bit ARM; the input is loaded into
            // d4-d7 and widened into q0-q3, the store writes back ptr
            asm volatile(
                "pld        [%0, #256]      \n"
                "vld1.u16   {d4-d7}, [%0]   \n"
                "vshll.u16  q0, d4, #16     \n"
                "vshll.u16  q1, d5, #16     \n"
                "vshll.u16  q2, d6, #16     \n"
                "vshll.u16  q3, d7, #16     \n"
                "vmov       q4, %q5         \n"
                "vmov       q5, %q5         \n"
                "vmov       q6, %q5         \n"
                "vmov       q7, %q5         \n"
                "vmla.f32   q4, q0, %q4     \n"
                "vmla.f32   q5, q1, %q4     \n"
                "vmla.f32   q6, q2, %q4     \n"
                "vmla.f32   q7, q3, %q4     \n"
                "vmax.f32   q0, q4, %q2     \n"
                "vmax.f32   q1, q5, %q2     \n"
                "vmax.f32   q2, q6, %q2     \n"
                "vmax.f32   q3, q7, %q2     \n"
                "vmin.f32   q0, q0, %q3     \n"
                "vmin.f32   q1, q1, %q3     \n"
                "vmin.f32   q2, q2, %q3     \n"
                "vmin.f32   q3, q3, %q3     \n"
                "vshrn.u32  d0, q0, #16     \n"
                "vshrn.u32  d1, q1, #16     \n"
                "vshrn.u32  d2, q2, #16     \n"
                "vshrn.u32  d3, q3, #16     \n"
                "vst1.u16   {d0-d3}, [%0]!  \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_zero),  // %2
                "w"(_one),   // %3
                "w"(_alpha), // %4
                "w"(_beta)   // %5
                : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            // intrinsics fallback used when inline asm is disabled
            uint16x8_t _p = vld1q_u16(ptr);
            uint16x8_t _q = vld1q_u16(ptr + 8);
            float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
            float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
            float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
            float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
            _p0 = vmlaq_f32(_beta, _p0, _alpha);
            _p1 = vmlaq_f32(_beta, _p1, _alpha);
            _p2 = vmlaq_f32(_beta, _p2, _alpha);
            _p3 = vmlaq_f32(_beta, _p3, _alpha);
            _p0 = vmaxq_f32(_p0, _zero);
            _p1 = vmaxq_f32(_p1, _zero);
            _p2 = vmaxq_f32(_p2, _zero);
            _p3 = vmaxq_f32(_p3, _zero);
            _p0 = vminq_f32(_p0, _one);
            _p1 = vminq_f32(_p1, _one);
            _p2 = vminq_f32(_p2, _one);
            _p3 = vminq_f32(_p3, _one);
            _p = vcombine_u16(float2bfloat(_p0), float2bfloat(_p1));
            _q = vcombine_u16(float2bfloat(_p2), float2bfloat(_p3));
            vst1q_u16(ptr, _p);
            vst1q_u16(ptr + 8, _q);
            ptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 8 values per iteration
        for (; i + 7 < size; i += 8)
        {
            uint16x8_t _p = vld1q_u16(ptr);
            float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
            float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
            _p0 = vmlaq_f32(_beta, _p0, _alpha);
            _p1 = vmlaq_f32(_beta, _p1, _alpha);
            _p0 = vmaxq_f32(_p0, _zero);
            _p1 = vmaxq_f32(_p1, _zero);
            _p0 = vminq_f32(_p0, _one);
            _p1 = vminq_f32(_p1, _one);
            _p = vcombine_u16(float2bfloat(_p0), float2bfloat(_p1));
            vst1q_u16(ptr, _p);
            ptr += 8;
        }
        // 4 values per iteration
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = bfloat2float(vld1_u16(ptr));
            _p = vmlaq_f32(_beta, _p, _alpha);
            _p = vmaxq_f32(_p, _zero);
            _p = vminq_f32(_p, _one);
            vst1_u16(ptr, float2bfloat(_p));
            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar tail; handles every element when NEON is not compiled in
        for (; i < size; i++)
        {
            float v = bfloat16_to_float32(*ptr);
            if (v < lower)
                v = 0.f;
            else if (v > upper)
                v = 1.f;
            else
                v = v * alpha + beta;
            *ptr = float32_to_bfloat16(v);

            ptr++;
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/hardsigmoid_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_HARDSIGMOID_ARM_H
#define LAYER_HARDSIGMOID_ARM_H

#include "hardsigmoid.h"

namespace ncnn {

// ARM-specific HardSigmoid layer.
// Adds reduced-precision in-place forward variants next to the float path.
class HardSigmoid_arm : public HardSigmoid
{
public:
    // Sets the layer's support_* capability flags according to the build
    // configuration (see the definition for the exact conditions).
    HardSigmoid_arm();

    // In-place forward pass; dispatches to one of the protected variants
    // below for 16-bit blobs, otherwise processes the blob as float.
    // NOTE(review): presumably overrides the virtual in HardSigmoid - confirm.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 storage, arithmetic carried out in fp32
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    // fp16 storage, arithmetic carried out in fp16
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 storage, arithmetic carried out in fp32
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_HARDSIGMOID_ARM_H


================================================
FILE: src/layer/arm/hardsigmoid_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "hardsigmoid_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int HardSigmoid_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    // fp16 storage with fp32 arithmetic: every value is widened to float,
    // transformed, then narrowed back to __fp16 in place.
    const int channels = bottom_top_blob.c;
    // number of fp16 scalars in one channel
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

        const float32x4_t _valpha = vdupq_n_f32(alpha);
        const float32x4_t _vbeta = vdupq_n_f32(beta);
        const float32x4_t _vlo = vdupq_n_f32(0.f);
        const float32x4_t _vhi = vdupq_n_f32(1.f);

        // count of elements still to be processed in this channel
        int remain = size;

        // eight values per step
        while (remain >= 8)
        {
            const float16x8_t _src = vld1q_f16(ptr);

            float32x4_t _a = vfmaq_f32(_vbeta, vcvt_f32_f16(vget_low_f16(_src)), _valpha);
            float32x4_t _b = vfmaq_f32(_vbeta, vcvt_f32_f16(vget_high_f16(_src)), _valpha);

            // clamp to [0, 1]
            _a = vminq_f32(vmaxq_f32(_a, _vlo), _vhi);
            _b = vminq_f32(vmaxq_f32(_b, _vlo), _vhi);

            vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_a), vcvt_f16_f32(_b)));

            ptr += 8;
            remain -= 8;
        }

        // four values per step
        while (remain >= 4)
        {
            float32x4_t _a = vfmaq_f32(_vbeta, vcvt_f32_f16(vld1_f16(ptr)), _valpha);
            _a = vminq_f32(vmaxq_f32(_a, _vlo), _vhi);
            vst1_f16(ptr, vcvt_f16_f32(_a));

            ptr += 4;
            remain -= 4;
        }

        // scalar leftovers use the lower / upper thresholds
        for (; remain > 0; remain--)
        {
            const float x = (float)*ptr;
            const float y = x < lower ? 0.f : (x > upper ? 1.f : x * alpha + beta);
            *ptr = (__fp16)y;

            ptr++;
        }
    }

    return 0;
}

// Hard sigmoid applied in place to an fp16 blob using fp16 arithmetic.
//
// alpha and beta are converted to __fp16 for each channel and every
// multiply-add, max and min is carried out on half-precision lanes. The
// vector loops compute min(max(x * alpha + beta, 0), 1); the scalar tail
// compares x against lower / upper (cast to __fp16) instead.
//
// Always returns 0.
int HardSigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    // each channel is walked as one flat run of `size` fp16 values
    int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

        // half-precision copies of the coefficients, reused by the scalar tail
        __fp16 alpha_fp16 = (__fp16)alpha;
        __fp16 beta_fp16 = (__fp16)beta;

        float16x8_t _zero = vdupq_n_f16((__fp16)0.f);
        float16x8_t _one = vdupq_n_f16((__fp16)1.f);
        float16x8_t _alpha = vdupq_n_f16(alpha_fp16);
        float16x8_t _beta = vdupq_n_f16(beta_fp16);

        int i = 0;
        // 32 values per iteration
        for (; i + 31 < size; i += 32)
        {
#if NCNN_GNU_INLINE_ASM
            // load 32 halves, compute beta + x * alpha, clamp to [0, 1],
            // store back and post-increment ptr by 64 bytes
            asm volatile(
                "prfm   pldl1keep, [%0, #512]   \n"
                "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0] \n"
                "mov    v4.16b, %5.16b          \n"
                "mov    v5.16b, %5.16b          \n"
                "mov    v6.16b, %5.16b          \n"
                "mov    v7.16b, %5.16b          \n"
                "fmla   v4.8h, v0.8h, %4.8h     \n"
                "fmla   v5.8h, v1.8h, %4.8h     \n"
                "fmla   v6.8h, v2.8h, %4.8h     \n"
                "fmla   v7.8h, v3.8h, %4.8h     \n"
                "fmax   v0.8h, v4.8h, %2.8h     \n"
                "fmax   v1.8h, v5.8h, %2.8h     \n"
                "fmax   v2.8h, v6.8h, %2.8h     \n"
                "fmax   v3.8h, v7.8h, %2.8h     \n"
                "fmin   v0.8h, v0.8h, %3.8h     \n"
                "fmin   v1.8h, v1.8h, %3.8h     \n"
                "fmin   v2.8h, v2.8h, %3.8h     \n"
                "fmin   v3.8h, v3.8h, %3.8h     \n"
                "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_zero),  // %2
                "w"(_one),   // %3
                "w"(_alpha), // %4
                "w"(_beta)   // %5
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else  // NCNN_GNU_INLINE_ASM
            // intrinsics fallback used when inline asm is disabled
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            float16x8_t _p2 = vld1q_f16(ptr + 16);
            float16x8_t _p3 = vld1q_f16(ptr + 24);
            _p0 = vfmaq_f16(_beta, _p0, _alpha);
            _p1 = vfmaq_f16(_beta, _p1, _alpha);
            _p2 = vfmaq_f16(_beta, _p2, _alpha);
            _p3 = vfmaq_f16(_beta, _p3, _alpha);
            _p0 = vmaxq_f16(_p0, _zero);
            _p1 = vmaxq_f16(_p1, _zero);
            _p2 = vmaxq_f16(_p2, _zero);
            _p3 = vmaxq_f16(_p3, _zero);
            _p0 = vminq_f16(_p0, _one);
            _p1 = vminq_f16(_p1, _one);
            _p2 = vminq_f16(_p2, _one);
            _p3 = vminq_f16(_p3, _one);
            vst1q_f16(ptr, _p0);
            vst1q_f16(ptr + 8, _p1);
            vst1q_f16(ptr + 16, _p2);
            vst1q_f16(ptr + 24, _p3);
            ptr += 32;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 16 values per iteration
        for (; i + 15 < size; i += 16)
        {
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            _p0 = vfmaq_f16(_beta, _p0, _alpha);
            _p1 = vfmaq_f16(_beta, _p1, _alpha);
            _p0 = vmaxq_f16(_p0, _zero);
            _p1 = vmaxq_f16(_p1, _zero);
            _p0 = vminq_f16(_p0, _one);
            _p1 = vminq_f16(_p1, _one);
            vst1q_f16(ptr, _p0);
            vst1q_f16(ptr + 8, _p1);
            ptr += 16;
        }
        // 8 values per iteration
        for (; i + 7 < size; i += 8)
        {
            float16x8_t _p = vld1q_f16(ptr);
            _p = vfmaq_f16(_beta, _p, _alpha);
            _p = vmaxq_f16(_p, _zero);
            _p = vminq_f16(_p, _one);
            vst1q_f16(ptr, _p);
            ptr += 8;
        }
        // 4 values per iteration, using the low halves of the broadcast constants
        for (; i + 3 < size; i += 4)
        {
            float16x4_t _p = vld1_f16(ptr);
            _p = vfma_f16(vget_low_f16(_beta), _p, vget_low_f16(_alpha));
            _p = vmax_f16(_p, vget_low_f16(_zero));
            _p = vmin_f16(_p, vget_low_f16(_one));
            vst1_f16(ptr, _p);
            ptr += 4;
        }
        // scalar tail
        for (; i < size; i++)
        {
            __fp16 v = *ptr;
            if (v < (__fp16)lower)
                v = (__fp16)0.f;
            else if (v > (__fp16)upper)
                v = (__fp16)1.f;
            else
                v = v * alpha_fp16 + beta_fp16;
            *ptr = v;

            ptr++;
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/hardswish_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "hardswish_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

HardSwish_arm::HardSwish_arm()
{
    // bf16 storage is gated only by the build option, not by NEON.
#if NCNN_BF16
    support_bf16_storage = true;
#endif // NCNN_BF16

    // NEON builds enable the packing flag.
#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON

    // fp16 storage additionally requires the running CPU to report asimdhp.
#if __ARM_NEON && NCNN_ARM82
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82
}

// In-place HardSwish over every channel of bottom_top_blob.
//
// 16-bit blobs are forwarded to the fp16 / bf16 specializations when the
// corresponding option is enabled; everything else is treated as float32 here.
//
// The vector paths compute x * min(max(alpha * x + beta, 0), 1).
// The scalar tail uses the lower / upper thresholds instead: 0 below lower,
// x unchanged above upper, x * (alpha * x + beta) in between.
// alpha, beta, lower and upper are not declared in this file
// (presumably members inherited from HardSwish -- confirm in hardswish.h).
//
// Returns 0, or whatever the specialization it forwards to returns.
int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int elembits = bottom_top_blob.elembits();

#if NCNN_ARM82
    // fp16 storage: pick fp16 arithmetic or fp32 arithmetic on fp16 data
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_inplace_fp16sa(bottom_top_blob, opt);
        else
            return forward_inplace_fp16s(bottom_top_blob, opt);
    }
#endif

#if NCNN_BF16
    // bf16 storage
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    // number of scalar floats per channel; packing is ignored because the op is elementwise
    int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        // i counts processed elements; ptr is advanced alongside it in every loop below
        int i = 0;
#if __ARM_NEON
        float32x4_t _zero = vdupq_n_f32(0.f);
        float32x4_t _one = vdupq_n_f32(1.f);
        float32x4_t _alpha = vdupq_n_f32(alpha);
        float32x4_t _beta = vdupq_n_f32(beta);
        // 16 floats per iteration
        for (; i + 15 < size; i += 16)
        {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            // v0-v3: inputs, v4-v7: gate = clamp(beta + x * alpha, 0, 1)
            // %0 is ptr, tied as input and output; the st1 post-index advances it by 64 bytes
            asm volatile(
                "prfm   pldl1keep, [%0, #512]   \n"
                "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
                "mov    v4.16b, %5.16b          \n"
                "mov    v5.16b, %5.16b          \n"
                "mov    v6.16b, %5.16b          \n"
                "mov    v7.16b, %5.16b          \n"
                "fmla   v4.4s, v0.4s, %4.4s     \n"
                "fmla   v5.4s, v1.4s, %4.4s     \n"
                "fmla   v6.4s, v2.4s, %4.4s     \n"
                "fmla   v7.4s, v3.4s, %4.4s     \n"
                "fmax   v4.4s, v4.4s, %2.4s     \n"
                "fmax   v5.4s, v5.4s, %2.4s     \n"
                "fmax   v6.4s, v6.4s, %2.4s     \n"
                "fmax   v7.4s, v7.4s, %2.4s     \n"
                "fmin   v4.4s, v4.4s, %3.4s     \n"
                "fmin   v5.4s, v5.4s, %3.4s     \n"
                "fmin   v6.4s, v6.4s, %3.4s     \n"
                "fmin   v7.4s, v7.4s, %3.4s     \n"
                "fmul   v0.4s, v4.4s, v0.4s     \n"
                "fmul   v1.4s, v5.4s, v1.4s     \n"
                "fmul   v2.4s, v6.4s, v2.4s     \n"
                "fmul   v3.4s, v7.4s, v3.4s     \n"
                "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_zero),  // %2
                "w"(_one),   // %3
                "w"(_alpha), // %4
                "w"(_beta)   // %5
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else  // __aarch64__
            // q0-q3: inputs, q4-q7: gate = clamp(beta + x * alpha, 0, 1)
            // vldm reads without writeback; vstm with '!' writes back and advances ptr
            asm volatile(
                "pld        [%0, #512]      \n"
                "vldm       %0, {d0-d7}     \n"
                "vmov       q4, %q5         \n"
                "vmov       q5, %q5         \n"
                "vmov       q6, %q5         \n"
                "vmov       q7, %q5         \n"
                "vmla.f32   q4, q0, %q4     \n"
                "vmla.f32   q5, q1, %q4     \n"
                "vmla.f32   q6, q2, %q4     \n"
                "vmla.f32   q7, q3, %q4     \n"
                "vmax.f32   q4, q4, %q2     \n"
                "vmax.f32   q5, q5, %q2     \n"
                "vmax.f32   q6, q6, %q2     \n"
                "vmax.f32   q7, q7, %q2     \n"
                "vmin.f32   q4, q4, %q3     \n"
                "vmin.f32   q5, q5, %q3     \n"
                "vmin.f32   q6, q6, %q3     \n"
                "vmin.f32   q7, q7, %q3     \n"
                "vmul.f32   q0, q4, q0      \n"
                "vmul.f32   q1, q5, q1      \n"
                "vmul.f32   q2, q6, q2      \n"
                "vmul.f32   q3, q7, q3      \n"
                "vstm       %0!, {d0-d7}    \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_zero),  // %2
                "w"(_one),   // %3
                "w"(_alpha), // %4
                "w"(_beta)   // %5
                : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            // intrinsics version of the same 16-float step
            float32x4_t _p0 = vld1q_f32(ptr);
            float32x4_t _p1 = vld1q_f32(ptr + 4);
            float32x4_t _p2 = vld1q_f32(ptr + 8);
            float32x4_t _p3 = vld1q_f32(ptr + 12);
            float32x4_t _ans0 = vmlaq_f32(_beta, _p0, _alpha);
            float32x4_t _ans1 = vmlaq_f32(_beta, _p1, _alpha);
            float32x4_t _ans2 = vmlaq_f32(_beta, _p2, _alpha);
            float32x4_t _ans3 = vmlaq_f32(_beta, _p3, _alpha);
            _ans0 = vmaxq_f32(_ans0, _zero);
            _ans1 = vmaxq_f32(_ans1, _zero);
            _ans2 = vmaxq_f32(_ans2, _zero);
            _ans3 = vmaxq_f32(_ans3, _zero);
            _ans0 = vminq_f32(_ans0, _one);
            _ans1 = vminq_f32(_ans1, _one);
            _ans2 = vminq_f32(_ans2, _one);
            _ans3 = vminq_f32(_ans3, _one);
            _p0 = vmulq_f32(_ans0, _p0);
            _p1 = vmulq_f32(_ans1, _p1);
            _p2 = vmulq_f32(_ans2, _p2);
            _p3 = vmulq_f32(_ans3, _p3);
            vst1q_f32(ptr, _p0);
            vst1q_f32(ptr + 4, _p1);
            vst1q_f32(ptr + 8, _p2);
            vst1q_f32(ptr + 12, _p3);
            ptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 8 floats per iteration
        for (; i + 7 < size; i += 8)
        {
            float32x4_t _p0 = vld1q_f32(ptr);
            float32x4_t _p1 = vld1q_f32(ptr + 4);
            float32x4_t _ans0 = vmlaq_f32(_beta, _p0, _alpha);
            float32x4_t _ans1 = vmlaq_f32(_beta, _p1, _alpha);
            _ans0 = vmaxq_f32(_ans0, _zero);
            _ans1 = vmaxq_f32(_ans1, _zero);
            _ans0 = vminq_f32(_ans0, _one);
            _ans1 = vminq_f32(_ans1, _one);
            _p0 = vmulq_f32(_ans0, _p0);
            _p1 = vmulq_f32(_ans1, _p1);
            vst1q_f32(ptr, _p0);
            vst1q_f32(ptr + 4, _p1);
            ptr += 8;
        }
        // 4 floats per iteration
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _ans = vmlaq_f32(_beta, _p, _alpha);
            _ans = vmaxq_f32(_ans, _zero);
            _ans = vminq_f32(_ans, _one);
            _p = vmulq_f32(_ans, _p);
            vst1q_f32(ptr, _p);
            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar remainder (and the whole channel when NEON is unavailable)
        for (; i < size; i++)
        {
            if (*ptr < lower)
                *ptr = 0.f;
            else if (*ptr > upper)
                ; // above upper the value is left untouched
            else
                *ptr = *ptr * (*ptr * alpha + beta);

            ptr++;
        }
    }

    return 0;
}

#if NCNN_BF16
// In-place HardSwish on a blob stored as bfloat16 (unsigned short elements).
//
// Every element is widened to float32, run through
// x * min(max(alpha * x + beta, 0), 1) (vector paths) or the equivalent
// lower / upper threshold form (scalar tail), then narrowed back to bf16.
// alpha, beta, lower and upper are not declared in this file
// (presumably members inherited from HardSwish -- confirm in hardswish.h).
//
// Always returns 0.
int HardSwish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    // number of scalar bf16 values per channel; packing is ignored because the op is elementwise
    int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        unsigned short* ptr = bottom_top_blob.channel(q);

        // i counts processed elements; ptr is advanced alongside it in every loop below
        int i = 0;
#if __ARM_NEON
        float32x4_t _zero = vdupq_n_f32(0.f);
        float32x4_t _one = vdupq_n_f32(1.f);
        float32x4_t _alpha = vdupq_n_f32(alpha);
        float32x4_t _beta = vdupq_n_f32(beta);
        // 16 bf16 values per iteration
        for (; i + 15 < size; i += 16)
        {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            // shll #16 widens each 16-bit value into the upper half of a 32-bit lane (bf16 -> fp32),
            // shrn #16 keeps only the upper 16 bits of each result lane (fp32 -> bf16).
            // %0 is ptr, tied as input and output; the st1 post-index advances it by 32 bytes
            asm volatile(
                "prfm   pldl1keep, [%0, #256]   \n"
                "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%0] \n"
                "shll   v0.4s, v0.4h, #16       \n"
                "shll   v1.4s, v1.4h, #16       \n"
                "shll   v2.4s, v2.4h, #16       \n"
                "shll   v3.4s, v3.4h, #16       \n"
                "mov    v4.16b, %5.16b          \n"
                "mov    v5.16b, %5.16b          \n"
                "mov    v6.16b, %5.16b          \n"
                "mov    v7.16b, %5.16b          \n"
                "fmla   v4.4s, v0.4s, %4.4s     \n"
                "fmla   v5.4s, v1.4s, %4.4s     \n"
                "fmla   v6.4s, v2.4s, %4.4s     \n"
                "fmla   v7.4s, v3.4s, %4.4s     \n"
                "fmax   v4.4s, v4.4s, %2.4s     \n"
                "fmax   v5.4s, v5.4s, %2.4s     \n"
                "fmax   v6.4s, v6.4s, %2.4s     \n"
                "fmax   v7.4s, v7.4s, %2.4s     \n"
                "fmin   v4.4s, v4.4s, %3.4s     \n"
                "fmin   v5.4s, v5.4s, %3.4s     \n"
                "fmin   v6.4s, v6.4s, %3.4s     \n"
                "fmin   v7.4s, v7.4s, %3.4s     \n"
                "fmul   v0.4s, v4.4s, v0.4s     \n"
                "fmul   v1.4s, v5.4s, v1.4s     \n"
                "fmul   v2.4s, v6.4s, v2.4s     \n"
                "fmul   v3.4s, v7.4s, v3.4s     \n"
                "shrn   v0.4h, v0.4s, #16       \n"
                "shrn   v1.4h, v1.4s, #16       \n"
                "shrn   v2.4h, v2.4s, #16       \n"
                "shrn   v3.4h, v3.4s, #16       \n"
                "st1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%0], #32 \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_zero),  // %2
                "w"(_one),   // %3
                "w"(_alpha), // %4
                "w"(_beta)   // %5
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else  // __aarch64__
            // The raw data is loaded into d4-d7 (aliases of q2/q3) and widened into q0-q3 in
            // ascending order, so each source d register is consumed before it is overwritten.
            // The narrowing back into d0-d3 relies on the same ordering.
            asm volatile(
                "pld        [%0, #256]      \n"
                "vld1.u16   {d4-d7}, [%0]   \n"
                "vshll.u16  q0, d4, #16     \n"
                "vshll.u16  q1, d5, #16     \n"
                "vshll.u16  q2, d6, #16     \n"
                "vshll.u16  q3, d7, #16     \n"
                "vmov       q4, %q5         \n"
                "vmov       q5, %q5         \n"
                "vmov       q6, %q5         \n"
                "vmov       q7, %q5         \n"
                "vmla.f32   q4, q0, %q4     \n"
                "vmla.f32   q5, q1, %q4     \n"
                "vmla.f32   q6, q2, %q4     \n"
                "vmla.f32   q7, q3, %q4     \n"
                "vmax.f32   q4, q4, %q2     \n"
                "vmax.f32   q5, q5, %q2     \n"
                "vmax.f32   q6, q6, %q2     \n"
                "vmax.f32   q7, q7, %q2     \n"
                "vmin.f32   q4, q4, %q3     \n"
                "vmin.f32   q5, q5, %q3     \n"
                "vmin.f32   q6, q6, %q3     \n"
                "vmin.f32   q7, q7, %q3     \n"
                "vmul.f32   q0, q4, q0      \n"
                "vmul.f32   q1, q5, q1      \n"
                "vmul.f32   q2, q6, q2      \n"
                "vmul.f32   q3, q7, q3      \n"
                "vshrn.u32  d0, q0, #16     \n"
                "vshrn.u32  d1, q1, #16     \n"
                "vshrn.u32  d2, q2, #16     \n"
                "vshrn.u32  d3, q3, #16     \n"
                "vst1.u16   {d0-d3}, [%0]!  \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_zero),  // %2
                "w"(_one),   // %3
                "w"(_alpha), // %4
                "w"(_beta)   // %5
                : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            // intrinsics version of the same 16-element step;
            // bfloat2float / float2bfloat are defined outside this file
            uint16x8_t _p = vld1q_u16(ptr);
            uint16x8_t _q = vld1q_u16(ptr + 8);
            float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
            float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
            float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
            float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
            float32x4_t _ans0 = vmlaq_f32(_beta, _p0, _alpha);
            float32x4_t _ans1 = vmlaq_f32(_beta, _p1, _alpha);
            float32x4_t _ans2 = vmlaq_f32(_beta, _p2, _alpha);
            float32x4_t _ans3 = vmlaq_f32(_beta, _p3, _alpha);
            _ans0 = vmaxq_f32(_ans0, _zero);
            _ans1 = vmaxq_f32(_ans1, _zero);
            _ans2 = vmaxq_f32(_ans2, _zero);
            _ans3 = vmaxq_f32(_ans3, _zero);
            _ans0 = vminq_f32(_ans0, _one);
            _ans1 = vminq_f32(_ans1, _one);
            _ans2 = vminq_f32(_ans2, _one);
            _ans3 = vminq_f32(_ans3, _one);
            _p0 = vmulq_f32(_ans0, _p0);
            _p1 = vmulq_f32(_ans1, _p1);
            _p2 = vmulq_f32(_ans2, _p2);
            _p3 = vmulq_f32(_ans3, _p3);
            _p = vcombine_u16(float2bfloat(_p0), float2bfloat(_p1));
            _q = vcombine_u16(float2bfloat(_p2), float2bfloat(_p3));
            vst1q_u16(ptr, _p);
            vst1q_u16(ptr + 8, _q);
            ptr += 16;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 8 bf16 values per iteration
        for (; i + 7 < size; i += 8)
        {
            uint16x8_t _p = vld1q_u16(ptr);
            float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
            float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
            float32x4_t _ans0 = vmlaq_f32(_beta, _p0, _alpha);
            float32x4_t _ans1 = vmlaq_f32(_beta, _p1, _alpha);
            _ans0 = vmaxq_f32(_ans0, _zero);
            _ans1 = vmaxq_f32(_ans1, _zero);
            _ans0 = vminq_f32(_ans0, _one);
            _ans1 = vminq_f32(_ans1, _one);
            _p0 = vmulq_f32(_ans0, _p0);
            _p1 = vmulq_f32(_ans1, _p1);
            _p = vcombine_u16(float2bfloat(_p0), float2bfloat(_p1));
            vst1q_u16(ptr, _p);
            ptr += 8;
        }
        // 4 bf16 values per iteration
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = bfloat2float(vld1_u16(ptr));
            float32x4_t _ans = vmlaq_f32(_beta, _p, _alpha);
            _ans = vmaxq_f32(_ans, _zero);
            _ans = vminq_f32(_ans, _one);
            _p = vmulq_f32(_ans, _p);
            vst1_u16(ptr, float2bfloat(_p));
            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar remainder (and the whole channel when NEON is unavailable)
        for (; i < size; i++)
        {
            float v = bfloat16_to_float32(*ptr);
            if (v < lower)
                v = 0.f;
            else if (v > upper)
                ; // above upper the value is left untouched
            else
                v = v * (v * alpha + beta);
            *ptr = float32_to_bfloat16(v);

            ptr++;
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/hardswish_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_HARDSWISH_ARM_H
#define LAYER_HARDSWISH_ARM_H

#include "hardswish.h"

namespace ncnn {

// ARM implementation of the HardSwish layer.
// Adds optional fp16-storage and bf16-storage code paths on top of the
// float32 in-place forward.
class HardSwish_arm : public HardSwish
{
public:
    // Sets the support_* capability flags for this build / cpu.
    HardSwish_arm();

    // Applies the activation in place on bottom_top_blob.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 storage, arithmetic carried out in float32
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    // fp16 storage, arithmetic carried out in fp16
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 storage, arithmetic carried out in float32
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_HARDSWISH_ARM_H


================================================
FILE: src/layer/arm/hardswish_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "hardswish_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// In-place HardSwish on fp16-stored data using float32 arithmetic.
//
// Each element is widened to float32, multiplied by
// min(max(alpha * x + beta, 0), 1) in the vector loops (or handled by the
// equivalent lower / upper threshold form in the scalar tail) and narrowed
// back to fp16. Always returns 0.
int HardSwish_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    // flat element count per channel, packing included
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

        const float32x4_t _lo = vdupq_n_f32(0.f);
        const float32x4_t _hi = vdupq_n_f32(1.f);
        const float32x4_t _a = vdupq_n_f32(alpha);
        const float32x4_t _b = vdupq_n_f32(beta);

        int i = 0;

        // eight halves at a time, split into two float32 quads
        for (; i + 7 < size; i += 8)
        {
            __fp16* p = ptr + i;

            const float16x8_t _h = vld1q_f16(p);
            const float32x4_t _x0 = vcvt_f32_f16(vget_low_f16(_h));
            const float32x4_t _x1 = vcvt_f32_f16(vget_high_f16(_h));

            const float32x4_t _g0 = vminq_f32(vmaxq_f32(vfmaq_f32(_b, _x0, _a), _lo), _hi);
            const float32x4_t _g1 = vminq_f32(vmaxq_f32(vfmaq_f32(_b, _x1, _a), _lo), _hi);

            const float16x4_t _y0 = vcvt_f16_f32(vmulq_f32(_g0, _x0));
            const float16x4_t _y1 = vcvt_f16_f32(vmulq_f32(_g1, _x1));

            vst1q_f16(p, vcombine_f16(_y0, _y1));
        }

        // four halves at a time
        for (; i + 3 < size; i += 4)
        {
            __fp16* p = ptr + i;

            const float32x4_t _x = vcvt_f32_f16(vld1_f16(p));
            const float32x4_t _g = vminq_f32(vmaxq_f32(vfmaq_f32(_b, _x, _a), _lo), _hi);

            vst1_f16(p, vcvt_f16_f32(vmulq_f32(_g, _x)));
        }

        // leftovers, one element at a time
        for (; i < size; i++)
        {
            const float x = (float)ptr[i];
            float y = x;

            if (x < lower)
            {
                y = 0.f;
            }
            else if (!(x > upper))
            {
                y = x * (x * alpha + beta);
            }
            // else: above upper the value passes through unchanged

            ptr[i] = (__fp16)y;
        }
    }

    return 0;
}

// In-place HardSwish on fp16-stored data using fp16 arithmetic throughout.
//
// alpha and beta are converted to __fp16 once per channel; the vector loops
// compute x * min(max(alpha * x + beta, 0), 1) on 8-lane (or 4-lane) fp16
// vectors, and the scalar tail uses the lower / upper threshold form.
// alpha, beta, lower and upper are not declared in this file
// (presumably members inherited from HardSwish -- confirm in hardswish.h).
//
// Always returns 0.
int HardSwish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    // number of scalar fp16 values per channel; packing is ignored because the op is elementwise
    int size = w * h * d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

        // narrowed copies of the float parameters, reused by the scalar tail
        __fp16 alpha_fp16 = (__fp16)alpha;
        __fp16 beta_fp16 = (__fp16)beta;

        float16x8_t _zero = vdupq_n_f16((__fp16)0.f);
        float16x8_t _one = vdupq_n_f16((__fp16)1.f);
        float16x8_t _alpha = vdupq_n_f16(alpha_fp16);
        float16x8_t _beta = vdupq_n_f16(beta_fp16);

        // i counts processed elements; ptr is advanced alongside it in every loop below
        int i = 0;
        // 32 halves per iteration
        for (; i + 31 < size; i += 32)
        {
#if NCNN_GNU_INLINE_ASM
            // v0-v3: inputs, v4-v7: gate = clamp(beta + x * alpha, 0, 1), all in .8h lanes
            // %0 is ptr, tied as input and output; the st1 post-index advances it by 64 bytes
            asm volatile(
                "prfm   pldl1keep, [%0, #512]   \n"
                "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0] \n"
                "mov    v4.16b, %5.16b          \n"
                "mov    v5.16b, %5.16b          \n"
                "mov    v6.16b, %5.16b          \n"
                "mov    v7.16b, %5.16b          \n"
                "fmla   v4.8h, v0.8h, %4.8h     \n"
                "fmla   v5.8h, v1.8h, %4.8h     \n"
                "fmla   v6.8h, v2.8h, %4.8h     \n"
                "fmla   v7.8h, v3.8h, %4.8h     \n"
                "fmax   v4.8h, v4.8h, %2.8h     \n"
                "fmax   v5.8h, v5.8h, %2.8h     \n"
                "fmax   v6.8h, v6.8h, %2.8h     \n"
                "fmax   v7.8h, v7.8h, %2.8h     \n"
                "fmin   v4.8h, v4.8h, %3.8h     \n"
                "fmin   v5.8h, v5.8h, %3.8h     \n"
                "fmin   v6.8h, v6.8h, %3.8h     \n"
                "fmin   v7.8h, v7.8h, %3.8h     \n"
                "fmul   v0.8h, v4.8h, v0.8h     \n"
                "fmul   v1.8h, v5.8h, v1.8h     \n"
                "fmul   v2.8h, v6.8h, v2.8h     \n"
                "fmul   v3.8h, v7.8h, v3.8h     \n"
                "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
                : "=r"(ptr) // %0
                : "0"(ptr),
                "w"(_zero),  // %2
                "w"(_one),   // %3
                "w"(_alpha), // %4
                "w"(_beta)   // %5
                : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else  // NCNN_GNU_INLINE_ASM
            // intrinsics version of the same 32-element step
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            float16x8_t _p2 = vld1q_f16(ptr + 16);
            float16x8_t _p3 = vld1q_f16(ptr + 24);
            float16x8_t _ans0 = vfmaq_f16(_beta, _p0, _alpha);
            float16x8_t _ans1 = vfmaq_f16(_beta, _p1, _alpha);
            float16x8_t _ans2 = vfmaq_f16(_beta, _p2, _alpha);
            float16x8_t _ans3 = vfmaq_f16(_beta, _p3, _alpha);
            _ans0 = vmaxq_f16(_ans0, _zero);
            _ans1 = vmaxq_f16(_ans1, _zero);
            _ans2 = vmaxq_f16(_ans2, _zero);
            _ans3 = vmaxq_f16(_ans3, _zero);
            _ans0 = vminq_f16(_ans0, _one);
            _ans1 = vminq_f16(_ans1, _one);
            _ans2 = vminq_f16(_ans2, _one);
            _ans3 = vminq_f16(_ans3, _one);
            _p0 = vmulq_f16(_ans0, _p0);
            _p1 = vmulq_f16(_ans1, _p1);
            _p2 = vmulq_f16(_ans2, _p2);
            _p3 = vmulq_f16(_ans3, _p3);
            vst1q_f16(ptr, _p0);
            vst1q_f16(ptr + 8, _p1);
            vst1q_f16(ptr + 16, _p2);
            vst1q_f16(ptr + 24, _p3);
            ptr += 32;
#endif // NCNN_GNU_INLINE_ASM
        }
        // 16 halves per iteration
        for (; i + 15 < size; i += 16)
        {
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            float16x8_t _ans0 = vfmaq_f16(_beta, _p0, _alpha);
            float16x8_t _ans1 = vfmaq_f16(_beta, _p1, _alpha);
            _ans0 = vmaxq_f16(_ans0, _zero);
            _ans1 = vmaxq_f16(_ans1, _zero);
            _ans0 = vminq_f16(_ans0, _one);
            _ans1 = vminq_f16(_ans1, _one);
            _p0 = vmulq_f16(_ans0, _p0);
            _p1 = vmulq_f16(_ans1, _p1);
            vst1q_f16(ptr, _p0);
            vst1q_f16(ptr + 8, _p1);
            ptr += 16;
        }
        // 8 halves per iteration
        for (; i + 7 < size; i += 8)
        {
            float16x8_t _p = vld1q_f16(ptr);
            float16x8_t _ans = vfmaq_f16(_beta, _p, _alpha);
            _ans = vmaxq_f16(_ans, _zero);
            _ans = vminq_f16(_ans, _one);
            _p = vmulq_f16(_ans, _p);
            vst1q_f16(ptr, _p);
            ptr += 8;
        }
        // 4 halves per iteration, using the low halves of the broadcast constants
        for (; i + 3 < size; i += 4)
        {
            float16x4_t _p = vld1_f16(ptr);
            float16x4_t _ans = vfma_f16(vget_low_f16(_beta), _p, vget_low_f16(_alpha));
            _ans = vmax_f16(_ans, vget_low_f16(_zero));
            _ans = vmin_f16(_ans, vget_low_f16(_one));
            _p = vmul_f16(_ans, _p);
            vst1_f16(ptr, _p);
            ptr += 4;
        }
        // scalar remainder; thresholds are narrowed to __fp16 before comparing
        for (; i < size; i++)
        {
            __fp16 v = *ptr;
            if (v < (__fp16)lower)
                v = (__fp16)0.f;
            else if (v > (__fp16)upper)
                ; // above upper the value is left untouched
            else
                v = v * (v * alpha_fp16 + beta_fp16);
            *ptr = v;

            ptr++;
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/innerproduct_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "innerproduct_arm.h"

#include "layer_type.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

#include "cpu.h"

namespace ncnn {

// Set capability flags and start without a helper flatten layer.
InnerProduct_arm::InnerProduct_arm()
{
    // created in create_pipeline, released in destroy_pipeline
    flatten = 0;

#if NCNN_BF16
    support_bf16_storage = true;
#endif

#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage is only enabled when the running cpu reports asimdhp
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif
}

// Prepare the layer for inference.
//
// Creates the helper Flatten layer, then hands off to the int8 / fp16 / bf16
// pipeline builders when the options select them. Otherwise the float32
// weights are either repacked so that four consecutive output rows are
// interleaved per input index (when num_output is a multiple of 4 and packing
// is enabled on NEON builds) or used as-is.
//
// Returns 0 on the float32 path, or the result of the delegated builder.
int InnerProduct_arm::create_pipeline(const Option& opt)
{
    {
        ncnn::ParamDict pd;

        flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten);
        flatten->load_param(pd);
        flatten->create_pipeline(opt);
    }

#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
        return create_pipeline_int8_arm(opt);
#endif

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage)
        return create_pipeline_fp16s(opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage)
        return create_pipeline_bf16s(opt);
#endif

#if NCNN_VFPV4
    if (cpu_support_arm_vfpv4() && opt.use_fp16_storage)
        return create_pipeline_fp16s(opt);
#endif

    const int num_input = weight_data_size / num_output;

    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout && num_output % 4 == 0)
        out_elempack = 4;
#endif // __ARM_NEON

    if (out_elempack != 4)
    {
        // no repacking needed, share the original weights
        weight_data_tm = weight_data;
    }
    else
    {
        // src = inch-outch
        // dst = pb-inch-outch/pb
        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

        weight_data_tm.create(num_input, num_output / out_elempack, (size_t)4u * out_elempack, out_elempack);

        const int groups = num_output / out_elempack;
        for (int g = 0; g < groups; g++)
        {
            float* dst = weight_data_tm.row(g);

            // the four source rows that get interleaved into this packed row
            const float* k0 = weight_data_r2.row(g * 4 + 0);
            const float* k1 = weight_data_r2.row(g * 4 + 1);
            const float* k2 = weight_data_r2.row(g * 4 + 2);
            const float* k3 = weight_data_r2.row(g * 4 + 3);

            for (int p = 0; p < num_input; p++)
            {
                dst[0] = k0[p];
                dst[1] = k1[p];
                dst[2] = k2[p];
                dst[3] = k3[p];
                dst += 4;
            }
        }
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

// Tear down and free the helper flatten layer, if one exists. Always returns 0.
int InnerProduct_arm::destroy_pipeline(const Option& opt)
{
    if (!flatten)
        return 0;

    flatten->destroy_pipeline(opt);
    delete flatten;
    flatten = 0;

    return 0;
}

int InnerProduct_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (opt.use_int8_inference && int8_scale_term)
    {
        return forward_int8_arm(bottom_blob, top_blob, opt);
    }
#endif

    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);
        else
            return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
#endif

#if NCNN_VFPV4
    if (cpu_support_arm_vfpv4() && opt.use_fp16_storage)
    {
        return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

    const int num_input = weight_data_size / num_output;

    if (bottom_blob.dims == 2 && bottom_blob.w == num_input)
    {
        // gemm
        int h = bottom_blob.h;
        size_t elemsize = bottom_blob.elemsize;
        int elempack = bottom_blob.elempack;

        top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        int num_output_elempack = 1;
#if __ARM_NEON
        if (opt.use_packing_layout)
        {
            num_output_elempack = num_output % 4 == 0 ? 4 : 1;
        }
#endif

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int j = 0; j < h; j++)
        {
#if __ARM_NEON
            if (elempack == 4 && num_output_elempack == 4)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const float* kptr = weight_data_tm.row(p);
                    const float* m = bottom_blob.row(j);

                    float32x4_t _sum0 = vdupq_n_f32(0.f);
                    float32x4_t _sum1 = vdupq_n_f32(0.f);
                    float32x4_t _sum2 = vdupq_n_f32(0.f);
                    float32x4_t _sum3 = vdupq_n_f32(0.f);

                    if (bias_term)
                    {
                        _sum0 = vdupq_n_f32(bias_data[p * 4 + 0]);
                        _sum1 = vdupq_n_f32(bias_data[p * 4 + 1]);
                        _sum2 = vdupq_n_f32(bias_data[p * 4 + 2]);
                        _sum3 = vdupq_n_f32(bias_data[p * 4 + 3]);
                    }

                    int i = 0;
                    for (; i < num_input; i++)
                    {
                        float32x4_t _val = vld1q_f32(m);
                        float32x4_t _w = vld1q_f32(kptr);
#if __aarch64__
                        _sum0 = vfmaq_laneq_f32(_sum0, _val, _w, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _val, _w, 1);
                        _sum2 = vfmaq_laneq_f32(_sum2, _val, _w, 2);
                        _sum3 = vfmaq_laneq_f32(_sum3, _val, _w, 3);
#else
                        _sum0 = vmlaq_lane_f32(_sum0, _val, vget_low_f32(_w), 0);
                        _sum1 = vmlaq_lane_f32(_sum1, _val, vget_low_f32(_w), 1);
                        _sum2 = vmlaq_lane_f32(_sum2, _val, vget_high_f32(_w), 0);
                        _sum3 = vmlaq_lane_f32(_sum3, _val, vget_high_f32(_w), 1);
#endif
                        m += 4;
                        kptr += 4;
                    }

                    _sum0 = activation_ps(_sum0, activation_type, activation_params);
                    _sum1 = activation_ps(_sum1, activation_type, activation_params);
                    _sum2 = activation_ps(_sum2, activation_type, activation_params);
                    _sum3 = activation_ps(_sum3, activation_type, activation_params);

                    vst1q_f32(outptr, _sum0);
                    vst1q_f32(outptr + 4, _sum1);
                    vst1q_f32(outptr + 8, _sum2);
                    vst1q_f32(outptr + 12, _sum3);
                    outptr += 16;
                }
            }

            if (elempack == 1 && num_output_elempack == 4)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const float* kptr = weight_data_tm.row(p);
                    const float* m = bottom_blob.row(j);

                    float32x4_t _sum0 = vdupq_n_f32(0.f);
                    float32x4_t _sum1 = vdupq_n_f32(0.f);
                    float32x4_t _sum2 = vdupq_n_f32(0.f);
                    float32x4_t _sum3 = vdupq_n_f32(0.f);

                    if (bias_term)
                    {
                        _sum0 = vld1q_f32((const float*)bias_data + p * 4);
                    }

                    int i = 0;
                    for (; i + 3 < num_input; i += 4)
                    {
                        float32x4_t _val = vld1q_f32(m);

                        float32x4_t _w0 = vld1q_f32(kptr);
                        float32x4_t _w1 = vld1q_f32(kptr + 4);
                        float32x4_t _w2 = vld1q_f32(kptr + 8);
                        float32x4_t _w3 = vld1q_f32(kptr + 12);

#if __aarch64__
                        _sum0 = vfmaq_laneq_f32(_sum0, _w0, _val, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _w1, _val, 1);
                        _sum2 = vfmaq_laneq_f32(_sum2, _w2, _val, 2);
                        _sum3 = vfmaq_laneq_f32(_sum3, _w3, _val, 3);
#else
                        _sum0 = vmlaq_lane_f32(_sum0, _w0, vget_low_f32(_val), 0);
                        _sum1 = vmlaq_lane_f32(_sum1, _w1, vget_low_f32(_val), 1);
                        _sum2 = vmlaq_lane_f32(_sum2, _w2, vget_high_f32(_val), 0);
                        _sum3 = vmlaq_lane_f32(_sum3, _w3, vget_high_f32(_val), 1);
#endif

                        m += 4;
                        kptr += 16;
                    }
                    for (; i < num_input; i++)
                    {
                        float32x4_t _val = vld1q_dup_f32(m);
                        float32x4_t _k = vld1q_f32(kptr);
                        _sum0 = vmlaq_f32(_sum0, _val, _k);

                        m += 1;
                        kptr += 4;
                    }

                    _sum0 = vaddq_f32(_sum0, _sum1);
                    _sum2 = vaddq_f32(_sum2, _sum3);
                    _sum0 = vaddq_f32(_sum0, _sum2);

                    _sum0 = activation_ps(_sum0, activation_type, activation_params);

                    vst1q_f32(outptr, _sum0);
                    outptr += 4;
                }
            }

            if (elempack == 4 && num_output_elempack == 1)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output; p++)
                {
                    const float* kptr = (const float*)weight_data_tm + num_input * p;
                    const float* m = bottom_blob.row(j);

                    float32x4_t _sum = vdupq_n_f32(0.f);

                    if (bias_term)
                    {
                        _sum = vdupq_n_f32(bias_data[p]);
                    }

                    for (int i = 0; i < num_input; i++)
                    {
                        float32x4_t _val = vld1q_f32(m);
                        float32x4_t _k = vdupq_n_f32(kptr[0]);
                        _sum = vmlaq_f32(_sum, _val, _k);

                        m += 4;
                        kptr += 1;
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1q_f32(outptr, _sum);
                    outptr += 4;
                }
            }
#endif // __ARM_NEON

            if (elempack == 1 && num_output_elempack == 1)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output; p++)
                {
                    const float* kptr = (const float*)weight_data_tm + num_input * p;
                    const float* m = bottom_blob.row(j);

                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    int i = 0;
#if __ARM_NEON
                    float32x4_t _sum = vdupq_n_f32(0.f);
                    for (; i + 3 < num_input; i += 4)
                    {
                        float32x4_t _val = vld1q_f32(m);
                        float32x4_t _k = vld1q_f32(kptr);
                        _sum = vmlaq_f32(_sum, _val, _k);

                        m += 4;
                        kptr += 4;
                    }
#if __aarch64__
                    sum += vaddvq_f32(_sum);
#else
                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                    _ss = vpadd_f32(_ss, _ss);
                    sum += vget_lane_f32(_ss, 0);
#endif
#endif // __ARM_NEON
                    for (; i < num_input; i++)
                    {
                        sum += *m * *kptr;

                        m += 1;
                        kptr += 1;
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[0] = sum;
                    outptr += 1;
                }
            }
        }

        return 0;
    }

    // flatten
    Mat bottom_blob_flattened = bottom_blob;
    if (bottom_blob.dims != 1)
    {
        Option opt_flatten = opt;
        opt_flatten.blob_allocator = opt.workspace_allocator;

        flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
        if (bottom_blob_flattened.empty())
            return -100;
    }

    size_t elemsize = bottom_blob_flattened.elemsize;
    int elempack = bottom_blob_flattened.elempack;

    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __ARM_NEON
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

#if __ARM_NEON
    if (out_elempack == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            float32x4_t _sum0 = bias_term ? vld1q_f32((const float*)bias_data + p * 4) : vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);

            const float* kptr = weight_data_tm.row(p);

            const float* sptr = bottom_blob_flattened;

            int i = 0;
#if NCNN_GNU_INLINE_ASM
            for (; i + 7 < num_input; i += 8)
            {
#if __aarch64__
                asm volatile(
                    "prfm       pldl1keep, [%0, #256]     \n"
                    "ld1        {v0.4s, v1.4s}, [%0], #32 \n"
                    "prfm       pldl1keep, [%1, #512]     \n"
                    "ld1        {v2.4s, v3.4s, v4.4s, v5.4s}, [%1], #64 \n"
                    "prfm       pldl1keep, [%1, #512]     \n"
                    "ld1        {v6.4s, v7.4s, v8.4s, v9.4s}, [%1], #64 \n"
                    "fmla       %2.4s, v2.4s, v0.s[0]     \n"
                    "fmla       %3.4s, v3.4s, v0.s[1]     \n"
                    "fmla       %4.4s, v4.4s, v0.s[2]     \n"
                    "fmla       %5.4s, v5.4s, v0.s[3]     \n"
                    "fmla       %2.4s, v6.4s, v1.s[0]     \n"
                    "fmla       %3.4s, v7.4s, v1.s[1]     \n"
                    "fmla       %4.4s, v8.4s, v1.s[2]     \n"
                    "fmla       %5.4s, v9.4s, v1.s[3]     \n"
                    : "=r"(sptr),  // %0
                    "=r"(kptr),  // %1
                    "=w"(_sum0), // %2
                    "=w"(_sum1), // %3
                    "=w"(_sum2), // %4
                    "=w"(_sum3)  // %5
                    : "0"(sptr),
                    "1"(kptr),
                    "2"(_sum0),
                    "3"(_sum1),
                    "4"(_sum2),
                    "5"(_sum3)
                    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9");
#else
                asm volatile(
                    "pld        [%0, #256]          \n"
                    "vld1.f32   {d0-d3}, [%0 :128]! \n"
                    "pld        [%1, #512]          \n"
                    "vldm       %1!, {d4-d11}       \n"
                    "pld        [%1, #512]          \n"
                    "vldm       %1!, {d12-d19}      \n"
                    "vmla.f32   %q2, q2, d0[0]      \n"
                    "vmla.f32   %q3, q3, d0[1]      \n"
                    "vmla.f32   %q4, q4, d1[0]      \n"
                    "vmla.f32   %q5, q5, d1[1]      \n"
                    "vmla.f32   %q2, q6, d2[0]      \n"
                    "vmla.f32   %q3, q7, d2[1]      \n"
                    "vmla.f32   %q4, q8, d3[0]      \n"
                    "vmla.f32   %q5, q9, d3[1]      \n"
                    : "=r"(sptr),  // %0
                    "=r"(kptr),  // %1
                    "=w"(_sum0), // %2
                    "=w"(_sum1), // %3
                    "=w"(_sum2), // %4
                    "=w"(_sum3)  // %5
                    : "0"(sptr),
                    "1"(kptr),
                    "2"(_sum0),
                    "3"(_sum1),
                    "4"(_sum2),
                    "5"(_sum3)
                    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9");
#endif
            }
#endif // NCNN_GNU_INLINE_ASM
            for (; i + 3 < num_input; i += 4)
            {
                float32x4_t _val = vld1q_f32(sptr);

                float32x4_t _w0 = vld1q_f32(kptr);
                float32x4_t _w1 = vld1q_f32(kptr + 4);
                float32x4_t _w2 = vld1q_f32(kptr + 8);
                float32x4_t _w3 = vld1q_f32(kptr + 12);

#if __aarch64__
                _sum0 = vfmaq_laneq_f32(_sum0, _w0, _val, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _w1, _val, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _w2, _val, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _w3, _val, 3);
#else
                _sum0 = vmlaq_lane_f32(_sum0, _w0, vget_low_f32(_val), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _w1, vget_low_f32(_val), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _w2, vget_high_f32(_val), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _w3, vget_high_f32(_val), 1);
#endif

                sptr += 4;
                kptr += 16;
            }
            for (; i < num_input; i++)
            {
                float32x4_t _val = vld1q_dup_f32(sptr);
                float32x4_t _w = vld1q_f32(kptr);
                _sum0 = vmlaq_f32(_sum0, _val, _w);

                sptr += 1;
                kptr += 4;
            }

            _sum0 = vaddq_f32(_sum0, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _sum0 = vaddq_f32(_sum0, _sum2);

            _sum0 = activation_ps(_sum0, activation_type, activation_params);

            float* outptr = top_blob;
            vst1q_f32(outptr + p * 4, _sum0);
        }
    }
#endif // __ARM_NEON

    if (out_elempack == 1)
    {
        const float* weight_data_ptr = weight_data_tm;

        int nn_num_output = num_output >> 2;
        int remain_num_output_start = nn_num_output << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int pp = 0; pp < nn_num_output; pp++)
        {
            int p = pp * 4;

            float sum0 = 0.f;
            float sum1 = 0.f;
            float sum2 = 0.f;
            float sum3 = 0.f;

            if (bias_term)
            {
                sum0 = bias_data[p];
                sum1 = bias_data[p + 1];
                sum2 = bias_data[p + 2];
                sum3 = bias_data[p + 3];
            }

            const float* w0 = weight_data_ptr + num_input * p;
            const float* w1 = weight_data_ptr + num_input * (p + 1);
            const float* w2 = weight_data_ptr + num_input * (p + 2);
            const float* w3 = weight_data_ptr + num_input * (p + 3);

            const float* m = bottom_blob_flattened;

            int i = 0;
#if __ARM_NEON
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);
#if NCNN_GNU_INLINE_ASM
            for (; i + 7 < num_input; i += 8)
            {
#if __aarch64__
                asm volatile(
                    "prfm       pldl1keep, [%0, #256]     \n"
                    "ld1        {v0.4s, v1.4s}, [%0], #32 \n"
                    "prfm       pldl1keep, [%1, #256]     \n"
                    "ld1        {v2.4s, v3.4s}, [%1], #32 \n"
                    "prfm       pldl1keep, [%2, #256]     \n"
                    "ld1        {v4.4s, v5.4s}, [%2], #32 \n"
                    "prfm       pldl1keep, [%3, #256]     \n"
                    "ld1        {v6.4s, v7.4s}, [%3], #32 \n"
                    "prfm       pldl1keep, [%4, #256]     \n"
                    "ld1        {v8.4s, v9.4s}, [%4], #32 \n"
                    "fmla       %5.4s, v0.4s, v2.4s       \n"
                    "fmla       %6.4s, v0.4s, v4.4s       \n"
                    "fmla       %7.4s, v0.4s, v6.4s       \n"
                    "fmla       %8.4s, v0.4s, v8.4s       \n"
                    "fmla       %5.4s, v1.4s, v3.4s       \n"
                    "fmla       %6.4s, v1.4s, v5.4s       \n"
                    "fmla       %7.4s, v1.4s, v7.4s       \n"
                    "fmla       %8.4s, v1.4s, v9.4s       \n"
                    : "=r"(m),     // %0
                    "=r"(w0),    // %1
                    "=r"(w1),    // %2
                    "=r"(w2),    // %3
                    "=r"(w3),    // %4
                    "=w"(_sum0), // %5
                    "=w"(_sum1), // %6
                    "=w"(_sum2), // %7
                    "=w"(_sum3)  // %8
                    : "0"(m),
                    "1"(w0),
                    "2"(w1),
                    "3"(w2),
                    "4"(w3),
                    "5"(_sum0),
                    "6"(_sum1),
                    "7"(_sum2),
                    "8"(_sum3)
                    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9");
#else
                asm volatile(
                    "pld        [%0, #256]          \n"
                    "vld1.f32   {d0-d3}, [%0 :128]! \n"
                    "pld        [%1, #256]          \n"
                    "vld1.f32   {d4-d7}, [%1]!      \n"
                    "pld        [%2, #256]          \n"
                    "vld1.f32   {d8-d11}, [%2]!     \n"
                    "pld        [%3, #256]          \n"
                    "vld1.f32   {d12-d15}, [%3]!    \n"
                    "pld        [%4, #256]          \n"
                    "vld1.f32   {d16-d19}, [%4]!    \n"
                    "vmla.f32   %q5, q0, q2         \n"
                    "vmla.f32   %q6, q0, q4         \n"
                    "vmla.f32   %q7, q0, q6         \n"
                    "vmla.f32   %q8, q0, q8         \n"
                    "vmla.f32   %q5, q1, q3         \n"
                    "vmla.f32   %q6, q1, q5         \n"
                    "vmla.f32   %q7, q1, q7         \n"
                    "vmla.f32   %q8, q1, q9         \n"
                    : "=r"(m),     // %0
                    "=r"(w0),    // %1
                    "=r"(w1),    // %2
                    "=r"(w2),    // %3
                    "=r"(w3),    // %4
                    "=w"(_sum0), // %5
                    "=w"(_sum1), // %6
                    "=w"(_sum2), // %7
                    "=w"(_sum3)  // %8
                    : "0"(m),
                    "1"(w0),
                    "2"(w1),
                    "3"(w2),
                    "4"(w3),
                    "5"(_sum0),
                    "6"(_sum1),
                    "7"(_sum2),
                    "8"(_sum3)
                    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9");
#endif // __aarch64__
            }
#endif // NCNN_GNU_INLINE_ASM
            for (; i + 3 < num_input; i += 4)
            {
                float32x4_t _val = vld1q_f32(m);

                float32x4_t _w0 = vld1q_f32(w0);
                float32x4_t _w1 = vld1q_f32(w1);
                float32x4_t _w2 = vld1q_f32(w2);
                float32x4_t _w3 = vld1q_f32(w3);

                _sum0 = vmlaq_f32(_sum0, _val, _w0);
                _sum1 = vmlaq_f32(_sum1, _val, _w1);
                _sum2 = vmlaq_f32(_sum2, _val, _w2);
                _sum3 = vmlaq_f32(_sum3, _val, _w3);

                m += 4;
                w0 += 4;
                w1 += 4;
                w2 += 4;
                w3 += 4;
            }

            float32x2_t _sum0ss = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
            float32x2_t _sum1ss = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
            float32x2_t _sum2ss = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
            float32x2_t _sum3ss = vadd_f32(vget_low_f32(_sum3), vget_high_f32(_sum3));

            float32x2_t _sum01ss = vpadd_f32(_sum0ss, _sum1ss);
            float32x2_t _sum23ss = vpadd_f32(_sum2ss, _sum3ss);

            sum0 += vget_lane_f32(_sum01ss, 0);
            sum1 += vget_lane_f32(_sum01ss, 1);
            sum2 += vget_lane_f32(_sum23ss, 0);
            sum3 += vget_lane_f32(_sum23ss, 1);
#endif // __ARM_NEON
            for (; i < num_input; i++)
            {
                sum0 += *m * *w0;
                sum1 += *m * *w1;
                sum2 += *m * *w2;
                sum3 += *m * *w3;

                m++;
                w0++;
                w1++;
                w2++;
                w3++;
            }

            sum0 = activation_ss(sum0, activation_type, activation_params);
            sum1 = activation_ss(sum1, activation_type, activation_params);
            sum2 = activation_ss(sum2, activation_type, activation_params);
            sum3 = activation_ss(sum3, activation_type, activation_params);

            top_blob[p] = sum0;
            top_blob[p + 1] = sum1;
            top_blob[p + 2] = sum2;
            top_blob[p + 3] = sum3;
        }

        // num_output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = remain_num_output_start; p < num_output; p++)
        {
            float sum = 0.f;

            if (bias_term)
                sum = bias_data[p];

            const float* w = weight_data_ptr + num_input * p;

            const float* m = bottom_blob_flattened;

            int i = 0;
#if __ARM_NEON
            float32x4_t _sum = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
#if NCNN_GNU_INLINE_ASM
            for (; i + 7 < num_input; i += 8)
            {
#if __aarch64__
                asm volatile(
                    "prfm       pldl1keep, [%0, #256]     \n"
                    "ld1        {v0.4s, v1.4s}, [%0], #32 \n"
                    "prfm       pldl1keep, [%1, #256]     \n"
                    "ld1        {v2.4s, v3.4s}, [%1], #32 \n"
                    "fmla       %2.4s, v0.4s, v2.4s       \n"
                    "fmla       %3.4s, v1.4s, v3.4s       \n"
                    : "=r"(m),    // %0
                    "=r"(w),    // %1
                    "=w"(_sum), // %2
                    "=w"(_sum2) // %3
                    : "0"(m),
                    "1"(w),
                    "2"(_sum),
                    "3"(_sum2)
                    : "cc", "memory", "v0", "v1", "v2", "v3");
#else
                asm volatile(
                    "pld        [%0, #256]          \n"
                    "vld1.f32   {d0-d3}, [%0 :128]! \n"
                    "pld        [%1, #256]          \n"
                    "vld1.f32   {d4-d7}, [%1]!      \n"
                    "vmla.f32   %q2, q0, q2         \n"
                    "vmla.f32   %q3, q1, q3         \n"
                    : "=r"(m),    // %0
                    "=r"(w),    // %1
                    "=w"(_sum), // %2
                    "=w"(_sum2) // %3
                    : "0"(m),
                    "1"(w),
                    "2"(_sum),
                    "3"(_sum2)
                    : "cc", "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
            }
#endif // NCNN_GNU_INLINE_ASM
            for (; i + 3 < num_input; i += 4)
            {
                float32x4_t _val = vld1q_f32(m);
                float32x4_t _w = vld1q_f32(w);
                _sum = vmlaq_f32(_sum, _val, _w);
                m += 4;
                w += 4;
            }

            _sum = vaddq_f32(_sum, _sum2);
#if __aarch64__
            sum += vaddvq_f32(_sum);
#else
            float32x2_t _sumss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
            _sumss = vpadd_f32(_sumss, _sumss);
            sum += vget_lane_f32(_sumss, 0);
#endif // __aarch64__
#endif // __ARM_NEON
            for (; i < num_input; i++)
            {
                sum += *m * *w;

                m++;
                w++;
            }

            sum = activation_ss(sum, activation_type, activation_params);

            top_blob[p] = sum;
        }
    }

    return 0;
}

#if NCNN_BF16
// Build weight_data_tm, the bf16 weight matrix read by forward_bf16s.
//
// The fp32 weights are viewed as a (num_input x num_output) matrix and
// rewritten so that, for every group of out_elempack output rows, the
// out_elempack weights belonging to the same input index are stored next to
// each other. Each value is converted with float32_to_bfloat16.
//
// out_elempack is 4 when NEON is available, opt.use_packing_layout is set and
// num_output is a multiple of 4; otherwise it is 1.
//
// Returns 0 on success and -100 when the reshaped view or the destination
// matrix could not be obtained (same code the forward paths use for a failed
// allocation). When opt.lightmode is set the original weight_data is released
// after the conversion.
int InnerProduct_arm::create_pipeline_bf16s(const Option& opt)
{
    const int num_input = weight_data_size / num_output;

    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __ARM_NEON

    // src = inch-outch
    // dst = pb-inch-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);
        // Bail out instead of reading rows of an empty view below.
        if (weight_data_r2.empty())
            return -100;

        weight_data_tm.create(num_input, num_output / out_elempack, (size_t)2u * out_elempack, out_elempack);
        // Bail out instead of writing through the row pointer of an empty Mat.
        if (weight_data_tm.empty())
            return -100;

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            // One destination row holds out_elempack source rows interleaved.
            unsigned short* g0 = weight_data_tm.row<unsigned short>(q / out_elempack);

            for (int p = 0; p < num_input; p++)
            {
                for (int j = 0; j < out_elempack; j++)
                {
                    *g0++ = float32_to_bfloat16(weight_data_r2.row(q + j)[p]);
                }
            }
        }
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

// Inner product (fully connected) forward pass on bf16 storage.
//
// Inputs and weights are read as unsigned short and widened with
// bfloat2float / bfloat16_to_float32; all accumulation happens in float
// variables or float32x4_t registers; results are narrowed again with
// float2bfloat / float32_to_bfloat16 before being stored. bias_data is read
// as float.
//
// Two code paths exist:
//   * gemm path    - taken when bottom_blob is 2-D and its width equals
//                    num_input; every row j of bottom_blob yields row j of
//                    top_blob, and top_blob keeps the input elemsize/elempack.
//   * flatten path - any other input is flattened first (unless it is already
//                    1-D) and a single output vector of num_output values is
//                    produced.
//
// The weight layout expected here (weight_data_tm) matches what
// create_pipeline_bf16s writes: groups of 4 outputs interleaved per input
// index when the output pack size is 4, plain rows otherwise.
//
// Returns 0 on success, -100 when top_blob or the flattened input is empty
// after creation.
int InnerProduct_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int num_input = weight_data_size / num_output;

    if (bottom_blob.dims == 2 && bottom_blob.w == num_input)
    {
        // gemm: one independent inner product per input row
        int h = bottom_blob.h;
        size_t elemsize = bottom_blob.elemsize;
        int elempack = bottom_blob.elempack;

        top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // Pack size of the weight matrix; same selection rule as in
        // create_pipeline_bf16s.
        int num_output_elempack = 1;
#if __ARM_NEON
        if (opt.use_packing_layout)
        {
            num_output_elempack = num_output % 4 == 0 ? 4 : 1;
        }
#endif // __ARM_NEON

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int j = 0; j < h; j++)
        {
#if __ARM_NEON
            // Input packs 4 values per element, weights pack 4 outputs.
            // _sumN accumulates output (p * 4 + N) for the 4 packed input lanes.
            if (elempack == 4 && num_output_elempack == 4)
            {
                unsigned short* outptr = top_blob.row<unsigned short>(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const unsigned short* kptr = (const unsigned short*)weight_data_tm + num_input * p * 4;
                    const unsigned short* m = bottom_blob.row<const unsigned short>(j);

                    float32x4_t _sum0 = vdupq_n_f32(0.f);
                    float32x4_t _sum1 = vdupq_n_f32(0.f);
                    float32x4_t _sum2 = vdupq_n_f32(0.f);
                    float32x4_t _sum3 = vdupq_n_f32(0.f);

                    if (bias_term)
                    {
                        // Each output has one bias, broadcast to all 4 lanes.
                        _sum0 = vdupq_n_f32(bias_data[p * 4 + 0]);
                        _sum1 = vdupq_n_f32(bias_data[p * 4 + 1]);
                        _sum2 = vdupq_n_f32(bias_data[p * 4 + 2]);
                        _sum3 = vdupq_n_f32(bias_data[p * 4 + 3]);
                    }

                    for (int i = 0; i < num_input; i++)
                    {
                        // _val: 4 packed input values, _k: weights of 4 outputs
                        // for input index i. Lane N of _k scales _val into _sumN.
                        float32x4_t _val = bfloat2float(vld1_u16(m));
                        float32x4_t _k = bfloat2float(vld1_u16(kptr));
#if __aarch64__
                        _sum0 = vfmaq_laneq_f32(_sum0, _val, _k, 0);
                        _sum1 = vfmaq_laneq_f32(_sum1, _val, _k, 1);
                        _sum2 = vfmaq_laneq_f32(_sum2, _val, _k, 2);
                        _sum3 = vfmaq_laneq_f32(_sum3, _val, _k, 3);
#else
                        _sum0 = vmlaq_lane_f32(_sum0, _val, vget_low_f32(_k), 0);
                        _sum1 = vmlaq_lane_f32(_sum1, _val, vget_low_f32(_k), 1);
                        _sum2 = vmlaq_lane_f32(_sum2, _val, vget_high_f32(_k), 0);
                        _sum3 = vmlaq_lane_f32(_sum3, _val, vget_high_f32(_k), 1);
#endif

                        m += 4;
                        kptr += 4;
                    }

                    _sum0 = activation_ps(_sum0, activation_type, activation_params);
                    _sum1 = activation_ps(_sum1, activation_type, activation_params);
                    _sum2 = activation_ps(_sum2, activation_type, activation_params);
                    _sum3 = activation_ps(_sum3, activation_type, activation_params);

                    // 4 outputs x 4 packed lanes = 16 bf16 values per iteration.
                    vst1_u16(outptr, float2bfloat(_sum0));
                    vst1_u16(outptr + 4, float2bfloat(_sum1));
                    vst1_u16(outptr + 8, float2bfloat(_sum2));
                    vst1_u16(outptr + 12, float2bfloat(_sum3));
                    outptr += 16;
                }
            }

            // Unpacked input, weights pack 4 outputs: one input scalar is
            // broadcast and multiplied with the 4 interleaved output weights.
            if (elempack == 1 && num_output_elempack == 4)
            {
                unsigned short* outptr = top_blob.row<unsigned short>(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const unsigned short* kptr = (const unsigned short*)weight_data_tm + num_input * p * 4;
                    const unsigned short* m = bottom_blob.row<const unsigned short>(j);

                    float32x4_t _sum = vdupq_n_f32(0.f);

                    if (bias_term)
                    {
                        // 4 consecutive biases, one per output lane.
                        _sum = vld1q_f32((const float*)bias_data + p * 4);
                    }

                    for (int i = 0; i < num_input; i++)
                    {
                        float32x4_t _val = vdupq_n_f32(bfloat16_to_float32(m[0]));
                        float32x4_t _k = bfloat2float(vld1_u16(kptr));
                        _sum = vmlaq_f32(_sum, _val, _k);

                        m += 1;
                        kptr += 4;
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1_u16(outptr, float2bfloat(_sum));
                    outptr += 4;
                }
            }

            // Input packs 4 values per element, weights unpacked: one weight
            // scalar is broadcast across the 4 packed input lanes.
            if (elempack == 4 && num_output_elempack == 1)
            {
                unsigned short* outptr = top_blob.row<unsigned short>(j);

                for (int p = 0; p < num_output; p++)
                {
                    const unsigned short* kptr = (const unsigned short*)weight_data_tm + num_input * p;
                    const unsigned short* m = bottom_blob.row<const unsigned short>(j);

                    float32x4_t _sum = vdupq_n_f32(0.f);

                    if (bias_term)
                    {
                        _sum = vdupq_n_f32(bias_data[p]);
                    }

                    for (int i = 0; i < num_input; i++)
                    {
                        float32x4_t _val = bfloat2float(vld1_u16(m));
                        float32x4_t _k = vdupq_n_f32(bfloat16_to_float32(kptr[0]));
                        _sum = vmlaq_f32(_sum, _val, _k);

                        m += 4;
                        kptr += 1;
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    vst1_u16(outptr, float2bfloat(_sum));
                    outptr += 4;
                }
            }
#endif // __ARM_NEON

            // Fully scalar case: plain dot product per output. This is the
            // only gemm branch compiled when NEON is unavailable.
            if (elempack == 1 && num_output_elempack == 1)
            {
                unsigned short* outptr = top_blob.row<unsigned short>(j);

                for (int p = 0; p < num_output; p++)
                {
                    const unsigned short* kptr = (const unsigned short*)weight_data_tm + num_input * p;
                    const unsigned short* m = bottom_blob.row<const unsigned short>(j);

                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    for (int i = 0; i < num_input; i++)
                    {
                        sum += bfloat16_to_float32(*m) * bfloat16_to_float32(*kptr);

                        m += 1;
                        kptr += 1;
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[0] = float32_to_bfloat16(sum);
                    outptr += 1;
                }
            }
        }

        return 0;
    }

    // flatten: anything that is not already 1-D goes through the flatten
    // layer; the temporary is allocated from the workspace allocator.
    Mat bottom_blob_flattened = bottom_blob;
    if (bottom_blob.dims != 1)
    {
        Option opt_flatten = opt;
        opt_flatten.blob_allocator = opt.workspace_allocator;

        flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
        if (bottom_blob_flattened.empty())
            return -100;
    }

    size_t elemsize = bottom_blob_flattened.elemsize;
    int elempack = bottom_blob_flattened.elempack;

    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __ARM_NEON
    // Per-scalar size of the input times the output pack size.
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

#if __ARM_NEON
    if (out_elempack == 4)
    {
        // One iteration produces 4 consecutive outputs (one packed element).
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            // Four independent accumulators, summed together after the loops.
            // Only _sum0 is seeded with the bias so it is counted once.
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);

            if (bias_term)
            {
                _sum0 = vld1q_f32(((const float*)bias_data) + p * 4);
            }

            const unsigned short* kptr = weight_data_tm.row<const unsigned short>(p);

            const unsigned short* sptr = bottom_blob_flattened;

            int i = 0;
            // Main loop: 4 input values against 4x4 interleaved weights.
            for (; i + 3 < num_input; i += 4)
            {
                float32x4_t _val = bfloat2float(vld1_u16(sptr));

                float32x4_t _w0 = bfloat2float(vld1_u16(kptr));
                float32x4_t _w1 = bfloat2float(vld1_u16(kptr + 4));
                float32x4_t _w2 = bfloat2float(vld1_u16(kptr + 8));
                float32x4_t _w3 = bfloat2float(vld1_u16(kptr + 12));

                // NOTE(review): the aarch64 branch here uses vmlaq_laneq_f32
                // while the packed gemm branch above uses vfmaq_laneq_f32 -
                // confirm whether the difference is intentional.
#if __aarch64__
                _sum0 = vmlaq_laneq_f32(_sum0, _w0, _val, 0);
                _sum1 = vmlaq_laneq_f32(_sum1, _w1, _val, 1);
                _sum2 = vmlaq_laneq_f32(_sum2, _w2, _val, 2);
                _sum3 = vmlaq_laneq_f32(_sum3, _w3, _val, 3);
#else
                _sum0 = vmlaq_lane_f32(_sum0, _w0, vget_low_f32(_val), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _w1, vget_low_f32(_val), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _w2, vget_high_f32(_val), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _w3, vget_high_f32(_val), 1);
#endif

                sptr += 4;
                kptr += 16;
            }
            // Tail: remaining inputs one at a time.
            for (; i < num_input; i++)
            {
                float32x4_t _val = vdupq_n_f32(bfloat16_to_float32(sptr[0]));

                float32x4_t _w = bfloat2float(vld1_u16(kptr));

                _sum0 = vmlaq_f32(_sum0, _val, _w);

                sptr += 1;
                kptr += 4;
            }

            // Combine the four partial accumulators into _sum0.
            _sum0 = vaddq_f32(_sum0, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _sum0 = vaddq_f32(_sum0, _sum2);

            _sum0 = activation_ps(_sum0, activation_type, activation_params);

            unsigned short* outptr = (unsigned short*)top_blob;
            vst1_u16(outptr + p * 4, float2bfloat(_sum0));
        }
    }
#endif // __ARM_NEON

    if (out_elempack == 1)
    {
        // One iteration produces a single output value.
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output; p++)
        {
            float sum = 0.f;

            if (bias_term)
                sum = bias_data[p];

            const unsigned short* kptr = weight_data_tm.row<unsigned short>(p);

            const unsigned short* sptr = bottom_blob_flattened;

            int i = 0;
#if __ARM_NEON
            // Vector part: 4 products per step, reduced to a scalar after the
            // scalar tail below.
            float32x4_t _sum = vdupq_n_f32(0.f);
            for (; i + 3 < num_input; i += 4)
            {
                float32x4_t _m = bfloat2float(vld1_u16(sptr));
                float32x4_t _w = bfloat2float(vld1_u16(kptr));

                _sum = vmlaq_f32(_sum, _m, _w);

                sptr += 4;
                kptr += 4;
            }
#endif // __ARM_NEON
            // Scalar tail (or the whole dot product without NEON).
            for (; i < num_input; i++)
            {
                float v = bfloat16_to_float32(*sptr);
                float k = bfloat16_to_float32(*kptr);

                sum += v * k;

                sptr++;
                kptr++;
            }

#if __ARM_NEON
            // Horizontal add of the vector accumulator into the scalar sum.
#if __aarch64__
            sum += vaddvq_f32(_sum);
#else
            float32x2_t _sumss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
            _sumss = vpadd_f32(_sumss, _sumss);
            sum += vget_lane_f32(_sumss, 0);
#endif // __aarch64__
#endif // __ARM_NEON

            sum = activation_ss(sum, activation_type, activation_params);

            unsigned short* outptr = (unsigned short*)top_blob;
            outptr[p] = float32_to_bfloat16(sum);
        }
    }

    return 0;
}
#endif // NCNN_BF16

#if NCNN_INT8
// Prepare the int8 inference path.
//
// Repacks the int8 weights into weight_data_tm so that, for every group of
// out_elempack output channels, the weights of those channels for a given
// input index are stored contiguously, and precomputes the per-output
// dequantization scale in scale_in_data.
//
// Returns 0 on success, or -100 when one of the working buffers could not be
// obtained (same error code the forward path uses for empty blobs).
int InnerProduct_arm::create_pipeline_int8_arm(const Option& opt)
{
    const int num_input = weight_data_size / num_output;

    // pack 8 output channels together only when NEON is available,
    // packing is requested and num_output is a multiple of 8
    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 8 == 0 ? 8 : 1;
    }
#endif

    // src = inch-outch
    // dst = pb-inch-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);
        if (weight_data_r2.empty())
            return -100;

        weight_data_tm.create(num_input, num_output / out_elempack, (size_t)out_elempack, out_elempack);
        if (weight_data_tm.empty())
            return -100;

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            signed char* g0 = weight_data_tm.row<signed char>(q / out_elempack);

            for (int p = 0; p < num_input; p++)
            {
                // interleave the out_elempack channels for input index p
                for (int j = 0; j < out_elempack; j++)
                {
                    *g0++ = weight_data_r2.row<signed char>(q + j)[p];
                }
            }
        }
    }

    scale_in_data.create(num_output);
    if (scale_in_data.empty())
        return -100;

    for (int p = 0; p < num_output; p++)
    {
        // dequantize
        // a zero weight scale maps to a zero dequantization scale instead of
        // dividing by zero
        float scale_in;
        if (weight_data_int8_scales[p] == 0)
            scale_in = 0;
        else
            scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

        scale_in_data[p] = scale_in;
    }

    // the original weights are no longer needed once repacked
    if (opt.lightmode)
        weight_data.release();

    return 0;
}

// Int8 inner product forward pass producing an fp32 output.
//
// The input is quantized to int8 with bottom_blob_int8_scales when its element
// width is not already 8 bits. Two paths follow:
//   - gemm path: a 2-D input whose width equals num_input is unpacked to
//     elempack 1 and every row is multiplied with the weight matrix;
//   - vector path: any other input is flattened (when not already 1-D) and
//     treated as a single vector.
// In both paths the int32 accumulators are converted to float, multiplied by
// scale_in_data, optionally offset by bias_data, and passed through the
// configured activation.
//
// Returns 0 on success, or -100 when an intermediate or output blob is empty
// after creation.
int InnerProduct_arm::forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int num_input = weight_data_size / num_output;

    int elembits = bottom_blob.elembits();

    // quantize into workspace memory unless the input is already 8-bit
    Mat bottom_blob_int8 = bottom_blob;
    if (elembits != 8)
    {
        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
        if (bottom_blob_int8.empty())
            return -100;
    }

    if (bottom_blob_int8.dims == 2 && bottom_blob_int8.w == num_input)
    {
        // gemm
        // work on an elempack-1 copy so each row can be addressed directly
        Mat bottom_blob_int8_unpacked;
        Option opt_unpack = opt;
        opt_unpack.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob_int8, bottom_blob_int8_unpacked, 1, opt_unpack);
        if (bottom_blob_int8_unpacked.empty())
            return -100;

        int h = bottom_blob_int8_unpacked.h;

        // output rows are packed by 4 when possible
        int out_elempack = 1;
#if __ARM_NEON
        if (opt.use_packing_layout)
        {
            out_elempack = h % 4 == 0 ? 4 : 1;
        }
#endif

        int outh = h / out_elempack;

        // 4 bytes per scalar: the output is fp32
        top_blob.create(num_output, outh, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // same rule as the one used to pack weight_data_tm in create_pipeline_int8_arm
        int num_output_elempack = 1;
#if __ARM_NEON
        if (opt.use_packing_layout)
        {
            num_output_elempack = num_output % 8 == 0 ? 8 : 1;
        }
#endif

#if __ARM_NEON
        // weights packed by 8 output channels, output packed by 4 rows
        if (num_output_elempack == 8 && out_elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int j = 0; j < outh; j++)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const signed char* kptr = weight_data_tm.row<const signed char>(p);
                    const signed char* m0 = bottom_blob_int8_unpacked.row<const signed char>(j * 4);
                    const signed char* m1 = bottom_blob_int8_unpacked.row<const signed char>(j * 4 + 1);
                    const signed char* m2 = bottom_blob_int8_unpacked.row<const signed char>(j * 4 + 2);
                    const signed char* m3 = bottom_blob_int8_unpacked.row<const signed char>(j * 4 + 3);

                    // _sumRC: R = input row 0..3, C = low (0) / high (1) half of the 8 output channels
                    int32x4_t _sum00 = vdupq_n_s32(0);
                    int32x4_t _sum01 = vdupq_n_s32(0);
                    int32x4_t _sum10 = vdupq_n_s32(0);
                    int32x4_t _sum11 = vdupq_n_s32(0);
                    int32x4_t _sum20 = vdupq_n_s32(0);
                    int32x4_t _sum21 = vdupq_n_s32(0);
                    int32x4_t _sum30 = vdupq_n_s32(0);
                    int32x4_t _sum31 = vdupq_n_s32(0);

                    int i = 0;
                    for (; i < num_input; i++)
                    {
                        // broadcast one input value per row against 8 channel weights
                        int8x8_t _val0 = vld1_dup_s8(m0);
                        int8x8_t _val1 = vld1_dup_s8(m1);
                        int8x8_t _val2 = vld1_dup_s8(m2);
                        int8x8_t _val3 = vld1_dup_s8(m3);

                        int8x8_t _w = vld1_s8(kptr);

                        // widen int8 products to int16, then accumulate into int32
                        int16x8_t _s0 = vmull_s8(_val0, _w);
                        int16x8_t _s1 = vmull_s8(_val1, _w);
                        int16x8_t _s2 = vmull_s8(_val2, _w);
                        int16x8_t _s3 = vmull_s8(_val3, _w);
                        _sum00 = vaddw_s16(_sum00, vget_low_s16(_s0));
                        _sum01 = vaddw_s16(_sum01, vget_high_s16(_s0));
                        _sum10 = vaddw_s16(_sum10, vget_low_s16(_s1));
                        _sum11 = vaddw_s16(_sum11, vget_high_s16(_s1));
                        _sum20 = vaddw_s16(_sum20, vget_low_s16(_s2));
                        _sum21 = vaddw_s16(_sum21, vget_high_s16(_s2));
                        _sum30 = vaddw_s16(_sum30, vget_low_s16(_s3));
                        _sum31 = vaddw_s16(_sum31, vget_high_s16(_s3));

                        m0++;
                        m1++;
                        m2++;
                        m3++;
                        kptr += 8;
                    }

                    // dequantize, add bias and apply the configured activation
                    float32x4_t _scale_in0 = vld1q_f32((const float*)scale_in_data + p * 8);
                    float32x4_t _scale_in1 = vld1q_f32((const float*)scale_in_data + p * 8 + 4);

                    float32x4_t _sumfp32_00 = vcvtq_f32_s32(_sum00);
                    float32x4_t _sumfp32_01 = vcvtq_f32_s32(_sum01);
                    float32x4_t _sumfp32_10 = vcvtq_f32_s32(_sum10);
                    float32x4_t _sumfp32_11 = vcvtq_f32_s32(_sum11);
                    float32x4_t _sumfp32_20 = vcvtq_f32_s32(_sum20);
                    float32x4_t _sumfp32_21 = vcvtq_f32_s32(_sum21);
                    float32x4_t _sumfp32_30 = vcvtq_f32_s32(_sum30);
                    float32x4_t _sumfp32_31 = vcvtq_f32_s32(_sum31);
                    if (bias_term)
                    {
                        // bias + sum * scale
                        float32x4_t _bias0 = vld1q_f32((const float*)bias_data + p * 8);
                        float32x4_t _bias1 = vld1q_f32((const float*)bias_data + p * 8 + 4);
                        _sumfp32_00 = vmlaq_f32(_bias0, _sumfp32_00, _scale_in0);
                        _sumfp32_01 = vmlaq_f32(_bias1, _sumfp32_01, _scale_in1);
                        _sumfp32_10 = vmlaq_f32(_bias0, _sumfp32_10, _scale_in0);
                        _sumfp32_11 = vmlaq_f32(_bias1, _sumfp32_11, _scale_in1);
                        _sumfp32_20 = vmlaq_f32(_bias0, _sumfp32_20, _scale_in0);
                        _sumfp32_21 = vmlaq_f32(_bias1, _sumfp32_21, _scale_in1);
                        _sumfp32_30 = vmlaq_f32(_bias0, _sumfp32_30, _scale_in0);
                        _sumfp32_31 = vmlaq_f32(_bias1, _sumfp32_31, _scale_in1);
                    }
                    else
                    {
                        _sumfp32_00 = vmulq_f32(_sumfp32_00, _scale_in0);
                        _sumfp32_01 = vmulq_f32(_sumfp32_01, _scale_in1);
                        _sumfp32_10 = vmulq_f32(_sumfp32_10, _scale_in0);
                        _sumfp32_11 = vmulq_f32(_sumfp32_11, _scale_in1);
                        _sumfp32_20 = vmulq_f32(_sumfp32_20, _scale_in0);
                        _sumfp32_21 = vmulq_f32(_sumfp32_21, _scale_in1);
                        _sumfp32_30 = vmulq_f32(_sumfp32_30, _scale_in0);
                        _sumfp32_31 = vmulq_f32(_sumfp32_31, _scale_in1);
                    }

                    _sumfp32_00 = activation_ps(_sumfp32_00, activation_type, activation_params);
                    _sumfp32_01 = activation_ps(_sumfp32_01, activation_type, activation_params);
                    _sumfp32_10 = activation_ps(_sumfp32_10, activation_type, activation_params);
                    _sumfp32_11 = activation_ps(_sumfp32_11, activation_type, activation_params);
                    _sumfp32_20 = activation_ps(_sumfp32_20, activation_type, activation_params);
                    _sumfp32_21 = activation_ps(_sumfp32_21, activation_type, activation_params);
                    _sumfp32_30 = activation_ps(_sumfp32_30, activation_type, activation_params);
                    _sumfp32_31 = activation_ps(_sumfp32_31, activation_type, activation_params);

                    // transpose 4x8
                    // the interleaving store writes, for each output channel,
                    // the values of rows 0..3 next to each other
                    float32x4x4_t _sumfp32_0;
                    _sumfp32_0.val[0] = _sumfp32_00;
                    _sumfp32_0.val[1] = _sumfp32_10;
                    _sumfp32_0.val[2] = _sumfp32_20;
                    _sumfp32_0.val[3] = _sumfp32_30;
                    float32x4x4_t _sumfp32_1;
                    _sumfp32_1.val[0] = _sumfp32_01;
                    _sumfp32_1.val[1] = _sumfp32_11;
                    _sumfp32_1.val[2] = _sumfp32_21;
                    _sumfp32_1.val[3] = _sumfp32_31;

                    vst4q_f32(outptr, _sumfp32_0);
                    vst4q_f32(outptr + 16, _sumfp32_1);

                    outptr += 32;
                }
            }
        }

        // weights unpacked, output packed by 4 rows
        if (num_output_elempack == 1 && out_elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int j = 0; j < outh; j++)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output; p++)
                {
                    const signed char* kptr = weight_data_tm.row<const signed char>(p);
                    const signed char* m0 = bottom_blob_int8_unpacked.row<const signed char>(j * 4);
                    const signed char* m1 = bottom_blob_int8_unpacked.row<const signed char>(j * 4 + 1);
                    const signed char* m2 = bottom_blob_int8_unpacked.row<const signed char>(j * 4 + 2);
                    const signed char* m3 = bottom_blob_int8_unpacked.row<const signed char>(j * 4 + 3);

                    int sum0 = 0;
                    int sum1 = 0;
                    int sum2 = 0;
                    int sum3 = 0;

                    int i = 0;

                    // one vector accumulator per input row, 8 inputs per iteration
                    int32x4_t _sum0 = vdupq_n_s32(0);
                    int32x4_t _sum1 = vdupq_n_s32(0);
                    int32x4_t _sum2 = vdupq_n_s32(0);
                    int32x4_t _sum3 = vdupq_n_s32(0);
                    for (; i + 7 < num_input; i += 8)
                    {
                        int8x8_t _val0 = vld1_s8(m0);
                        int8x8_t _val1 = vld1_s8(m1);
                        int8x8_t _val2 = vld1_s8(m2);
                        int8x8_t _val3 = vld1_s8(m3);
                        int8x8_t _w = vld1_s8(kptr);

                        int16x8_t _s0 = vmull_s8(_val0, _w);
                        int16x8_t _s1 = vmull_s8(_val1, _w);
                        int16x8_t _s2 = vmull_s8(_val2, _w);
                        int16x8_t _s3 = vmull_s8(_val3, _w);
                        _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                        _sum1 = vaddw_s16(_sum1, vget_low_s16(_s1));
                        _sum2 = vaddw_s16(_sum2, vget_low_s16(_s2));
                        _sum3 = vaddw_s16(_sum3, vget_low_s16(_s3));
                        _sum0 = vaddw_s16(_sum0, vget_high_s16(_s0));
                        _sum1 = vaddw_s16(_sum1, vget_high_s16(_s1));
                        _sum2 = vaddw_s16(_sum2, vget_high_s16(_s2));
                        _sum3 = vaddw_s16(_sum3, vget_high_s16(_s3));

                        m0 += 8;
                        m1 += 8;
                        m2 += 8;
                        m3 += 8;
                        kptr += 8;
                    }
                    // horizontal reduction of each accumulator to a scalar
#if __aarch64__
                    sum0 = vaddvq_s32(_sum0);
                    sum1 = vaddvq_s32(_sum1);
                    sum2 = vaddvq_s32(_sum2);
                    sum3 = vaddvq_s32(_sum3);
#else
                    int32x2_t _s20 = vadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0));
                    int32x2_t _s21 = vadd_s32(vget_low_s32(_sum1), vget_high_s32(_sum1));
                    int32x2_t _s22 = vadd_s32(vget_low_s32(_sum2), vget_high_s32(_sum2));
                    int32x2_t _s23 = vadd_s32(vget_low_s32(_sum3), vget_high_s32(_sum3));
                    int32x2_t _s201 = vpadd_s32(_s20, _s21);
                    int32x2_t _s223 = vpadd_s32(_s22, _s23);
                    sum0 = vget_lane_s32(_s201, 0);
                    sum1 = vget_lane_s32(_s201, 1);
                    sum2 = vget_lane_s32(_s223, 0);
                    sum3 = vget_lane_s32(_s223, 1);
#endif
                    // scalar tail for the remaining inputs
                    for (; i < num_input; i++)
                    {
                        sum0 += *m0++ * kptr[0];
                        sum1 += *m1++ * kptr[0];
                        sum2 += *m2++ * kptr[0];
                        sum3 += *m3++ * kptr[0];
                        kptr += 1;
                    }

                    // dequantize, add bias and apply the configured activation
                    float sumfp32_0 = sum0 * scale_in_data[p];
                    float sumfp32_1 = sum1 * scale_in_data[p];
                    float sumfp32_2 = sum2 * scale_in_data[p];
                    float sumfp32_3 = sum3 * scale_in_data[p];

                    if (bias_term)
                    {
                        sumfp32_0 += bias_data[p];
                        sumfp32_1 += bias_data[p];
                        sumfp32_2 += bias_data[p];
                        sumfp32_3 += bias_data[p];
                    }

                    // the 4 row results of output channel p are stored contiguously
                    outptr[0] = activation_ss(sumfp32_0, activation_type, activation_params);
                    outptr[1] = activation_ss(sumfp32_1, activation_type, activation_params);
                    outptr[2] = activation_ss(sumfp32_2, activation_type, activation_params);
                    outptr[3] = activation_ss(sumfp32_3, activation_type, activation_params);
                    outptr += 4;
                }
            }
        }

        // weights packed by 8 output channels, output rows unpacked
        if (num_output_elempack == 8 && out_elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int j = 0; j < outh; j++)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const signed char* kptr = weight_data_tm.row<const signed char>(p);
                    const signed char* m = bottom_blob_int8_unpacked.row<const signed char>(j);

                    // _sum0 / _sum1: low / high 4 of the 8 output channels
                    int32x4_t _sum0 = vdupq_n_s32(0);
                    int32x4_t _sum1 = vdupq_n_s32(0);

                    int i = 0;
                    for (; i + 3 < num_input; i += 4)
                    {
                        int8x8_t _val0 = vdup_n_s8(m[0]);
                        int8x8_t _val1 = vdup_n_s8(m[1]);
                        int8x8_t _val2 = vdup_n_s8(m[2]);
                        int8x8_t _val3 = vdup_n_s8(m[3]);

                        // 4 inputs x 8 channels = 32 weights
                        int8x16_t _w0 = vld1q_s8(kptr);
                        int8x16_t _w1 = vld1q_s8(kptr + 16);

                        // two products are summed in int16 before widening to int32
                        int16x8_t _s0 = vmull_s8(_val0, vget_low_s8(_w0));
                        int16x8_t _s1 = vmull_s8(_val2, vget_low_s8(_w1));
                        _s0 = vmlal_s8(_s0, _val1, vget_high_s8(_w0));
                        _s1 = vmlal_s8(_s1, _val3, vget_high_s8(_w1));

                        _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                        _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));
                        _sum0 = vaddw_s16(_sum0, vget_low_s16(_s1));
                        _sum1 = vaddw_s16(_sum1, vget_high_s16(_s1));

                        m += 4;
                        kptr += 32;
                    }
                    // remaining inputs, one at a time
                    for (; i < num_input; i++)
                    {
                        int8x8_t _val = vld1_dup_s8(m);
                        int8x8_t _w = vld1_s8(kptr);

                        int16x8_t _s0 = vmull_s8(_val, _w);
                        _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                        _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));

                        m++;
                        kptr += 8;
                    }

                    // dequantize, add bias and apply the configured activation
                    float32x4_t _scale_in0 = vld1q_f32((const float*)scale_in_data + p * 8);
                    float32x4_t _scale_in1 = vld1q_f32((const float*)scale_in_data + p * 8 + 4);

                    float32x4_t _sumfp32_0 = vcvtq_f32_s32(_sum0);
                    float32x4_t _sumfp32_1 = vcvtq_f32_s32(_sum1);

                    if (bias_term)
                    {
                        float32x4_t _bias0 = vld1q_f32((const float*)bias_data + p * 8);
                        float32x4_t _bias1 = vld1q_f32((const float*)bias_data + p * 8 + 4);
                        _sumfp32_0 = vmlaq_f32(_bias0, _sumfp32_0, _scale_in0);
                        _sumfp32_1 = vmlaq_f32(_bias1, _sumfp32_1, _scale_in1);
                    }
                    else
                    {
                        _sumfp32_0 = vmulq_f32(_sumfp32_0, _scale_in0);
                        _sumfp32_1 = vmulq_f32(_sumfp32_1, _scale_in1);
                    }

                    _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params);
                    _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params);

                    vst1q_f32(outptr, _sumfp32_0);
                    vst1q_f32(outptr + 4, _sumfp32_1);
                    outptr += 8;
                }
            }
        }
#endif // __ARM_NEON

        // weights unpacked, output rows unpacked
        if (num_output_elempack == 1 && out_elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int j = 0; j < outh; j++)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output; p++)
                {
                    const signed char* kptr = weight_data_tm.row<const signed char>(p);
                    const signed char* m = bottom_blob_int8_unpacked.row<const signed char>(j);

                    int sum = 0;

                    int i = 0;
#if __ARM_NEON
                    int32x4_t _sum0 = vdupq_n_s32(0);
                    int32x4_t _sum1 = vdupq_n_s32(0);
                    for (; i + 7 < num_input; i += 8)
                    {
                        int8x8_t _val = vld1_s8(m);
                        int8x8_t _w = vld1_s8(kptr);

                        int16x8_t _s0 = vmull_s8(_val, _w);
                        _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                        _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));

                        m += 8;
                        kptr += 8;
                    }

                    // fold both accumulators into one scalar
                    _sum0 = vaddq_s32(_sum0, _sum1);
#if __aarch64__
                    sum = vaddvq_s32(_sum0);
#else
                    int32x2_t _s2 = vadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0));
                    _s2 = vpadd_s32(_s2, _s2);
                    sum = vget_lane_s32(_s2, 0);
#endif
#endif // __ARM_NEON
                    // scalar tail (or the whole dot product without NEON)
                    for (; i < num_input; i++)
                    {
                        sum += *m++ * *kptr++;
                    }

                    // dequantize, add bias and apply the configured activation
                    float sumfp32 = sum * scale_in_data[p];

                    if (bias_term)
                        sumfp32 += bias_data[p];

                    outptr[0] = activation_ss(sumfp32, activation_type, activation_params);
                    outptr += 1;
                }
            }
        }

        return 0;
    }

    // vector path: collapse the input to 1-D through the flatten layer
    Mat bottom_blob_int8_flattened = bottom_blob_int8;
    if (bottom_blob_int8.dims != 1)
    {
        Option opt_flatten = opt;
        opt_flatten.blob_allocator = opt.workspace_allocator;
        flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten);
        if (bottom_blob_int8_flattened.empty())
            return -100;
    }

    //     int elempack = bottom_blob_int8_flattened.elempack;

    // same rule as the one used to pack weight_data_tm in create_pipeline_int8_arm
    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 8 == 0 ? 8 : 1;
    }
#endif

    // 4 bytes per scalar: the output is fp32
    top_blob.create(num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

#if __ARM_NEON
    if (out_elempack == 8)
    {
        // num_output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            const signed char* kptr = weight_data_tm.row<const signed char>(p);
            const signed char* sptr = bottom_blob_int8_flattened;

            // _sum0 / _sum1: low / high 4 of the 8 output channels
            int32x4_t _sum0 = vdupq_n_s32(0);
            int32x4_t _sum1 = vdupq_n_s32(0);

            int i = 0;
            for (; i + 1 < num_input; i += 2)
            {
                int8x8_t _val0 = vdup_n_s8(sptr[0]);
                int8x8_t _val1 = vdup_n_s8(sptr[1]);

                int8x8_t _w0 = vld1_s8(kptr);
                int8x8_t _w1 = vld1_s8(kptr + 8);

                // two products are summed in int16 before widening to int32
                int16x8_t _s0 = vmull_s8(_val0, _w0);
                _s0 = vmlal_s8(_s0, _val1, _w1);

                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));

                sptr += 2;
                kptr += 16;
            }
            // odd trailing input
            for (; i < num_input; i++)
            {
                int8x8_t _val = vdup_n_s8(sptr[0]);

                int8x8_t _w = vld1_s8(kptr);

                int16x8_t _s0 = vmull_s8(_val, _w);
                _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0));
                _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0));

                sptr += 1;
                kptr += 8;
            }

            // dequantize, add bias and apply the configured activation
            float32x4_t _scale_in0 = vld1q_f32((const float*)scale_in_data + p * 8);
            float32x4_t _scale_in1 = vld1q_f32((const float*)scale_in_data + p * 8 + 4);

            float32x4_t _sumfp32_0 = vcvtq_f32_s32(_sum0);
            float32x4_t _sumfp32_1 = vcvtq_f32_s32(_sum1);

            if (bias_term)
            {
                float32x4_t _bias0 = vld1q_f32((const float*)bias_data + p * 8);
                float32x4_t _bias1 = vld1q_f32((const float*)bias_data + p * 8 + 4);
                _sumfp32_0 = vmlaq_f32(_bias0, _sumfp32_0, _scale_in0);
                _sumfp32_1 = vmlaq_f32(_bias1, _sumfp32_1, _scale_in1);
            }
            else
            {
                _sumfp32_0 = vmulq_f32(_sumfp32_0, _scale_in0);
                _sumfp32_1 = vmulq_f32(_sumfp32_1, _scale_in1);
            }

            _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params);
            _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params);

            float* outptr = (float*)top_blob + p * 8;
            vst1q_f32(outptr, _sumfp32_0);
            vst1q_f32(outptr + 4, _sumfp32_1);
        }
    }
#endif // __ARM_NEON

    if (out_elempack == 1)
    {
        // num_output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            const signed char* kptr = weight_data_tm.row<const signed char>(p);
            const signed char* sptr = bottom_blob_int8_flattened;

            int sum = 0;

            // plain scalar dot product
            int i = 0;
            for (; i < num_input; i++)
            {
                signed char val = sptr[0];

                signed char w = kptr[0];

                sum += val * w;

                sptr += 1;
                kptr += 1;
            }

            // dequantize, add bias and apply the configured activation
            float sumfp32 = sum * scale_in_data[p];

            if (bias_term)
                sumfp32 += bias_data[p];

            sumfp32 = activation_ss(sumfp32, activation_type, activation_params);

            top_blob[p] = sumfp32;
        }
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn


================================================
FILE: src/layer/arm/innerproduct_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_INNERPRODUCT_ARM_H
#define LAYER_INNERPRODUCT_ARM_H

#include "innerproduct.h"

namespace ncnn {

// ARM-specific InnerProduct layer. Extends the generic InnerProduct with
// precision-specific pipeline setup and forward routines that are compiled in
// according to the NCNN_VFPV4 / NCNN_ARM82 / NCNN_BF16 / NCNN_INT8 switches.
class InnerProduct_arm : public InnerProduct
{
public:
    InnerProduct_arm();

    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
#if NCNN_VFPV4
    // fp16 storage path
    int create_pipeline_fp16s(const Option& opt);
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_ARM82
    // fp16 storage + arithmetic path
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 storage path
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_INT8
    // int8 path: weight repacking / scale precomputation, and quantized forward
    int create_pipeline_int8_arm(const Option& opt);
    int forward_int8_arm(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif

public:
    // helper layer whose forward() is used to collapse non 1-D input into a vector
    // NOTE(review): presumably created in create_pipeline and released in
    // destroy_pipeline - confirm in innerproduct_arm.cpp
    Layer* flatten;

    // weights repacked for the selected precision and packing layout
    Mat weight_data_tm;

    // fp16
    // bias values read as __fp16 by the fp16 arithmetic forward path
    Mat bias_data_fp16;

#if NCNN_INT8
    // per-output dequantization scale: 1 / (input scale * weight scale),
    // or 0 when the weight scale is 0
    Mat scale_in_data;
#endif
};

} // namespace ncnn

#endif // LAYER_INNERPRODUCT_ARM_H


================================================
FILE: src/layer/arm/innerproduct_arm_asimdfhm.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "innerproduct_arm.h"

#include "cpu.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

namespace ncnn {

#include "innerproduct_fp16s.h"
#include "innerproduct_gemm_fp16s.h"

// asimdfhm-suffixed entry point: passes every argument through unchanged to
// innerproduct_pack4_fp16s_neon.
// NOTE(review): the callee is presumably defined in one of the headers
// included inside this namespace, so that this translation unit gets its own
// copy built with the asimdfhm compiler flags - confirm against the build setup.
void innerproduct_pack4_fp16s_neon_asimdfhm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt)
{
    innerproduct_pack4_fp16s_neon(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
}

// asimdfhm-suffixed entry point: passes every argument through unchanged to
// innerproduct_fp16s_neon.
// NOTE(review): the callee is presumably defined in one of the headers
// included inside this namespace - confirm.
void innerproduct_fp16s_neon_asimdfhm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt)
{
    innerproduct_fp16s_neon(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
}

// asimdfhm-suffixed entry point: passes every argument through unchanged to
// innerproduct_gemm_fp16s_neon.
// NOTE(review): the callee is presumably defined in one of the headers
// included inside this namespace - confirm.
void innerproduct_gemm_fp16s_neon_asimdfhm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt)
{
    innerproduct_gemm_fp16s_neon(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
}

// asimdfhm-suffixed entry point: passes every argument through unchanged to
// innerproduct_transform_kernel_fp16s_neon, which fills weight_data_tm from
// weight_data.
// NOTE(review): the callee is presumably defined in one of the headers
// included inside this namespace - confirm.
void innerproduct_transform_kernel_fp16s_neon_asimdfhm(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt)
{
    innerproduct_transform_kernel_fp16s_neon(weight_data, weight_data_tm, num_input, num_output, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/arm/innerproduct_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "innerproduct_arm.h"

#include "cpu.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

namespace ncnn {

#include "innerproduct_fp16s.h"
#include "innerproduct_gemm_fp16s.h"

// asimdhp-suffixed entry point: passes every argument through unchanged to
// innerproduct_pack4_fp16s_neon.
// NOTE(review): the callee is presumably defined in one of the headers
// included inside this namespace, so that this translation unit gets its own
// copy built with the asimdhp compiler flags - confirm against the build setup.
void innerproduct_pack4_fp16s_neon_asimdhp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt)
{
    innerproduct_pack4_fp16s_neon(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
}

// asimdhp-suffixed entry point: passes every argument through unchanged to
// innerproduct_fp16s_neon.
// NOTE(review): the callee is presumably defined in one of the headers
// included inside this namespace - confirm.
void innerproduct_fp16s_neon_asimdhp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt)
{
    innerproduct_fp16s_neon(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
}

// asimdhp-suffixed entry point: passes every argument through unchanged to
// innerproduct_gemm_fp16s_neon.
// NOTE(review): the callee is presumably defined in one of the headers
// included inside this namespace - confirm.
void innerproduct_gemm_fp16s_neon_asimdhp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt)
{
    innerproduct_gemm_fp16s_neon(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
}

// asimdhp-suffixed entry point: passes every argument through unchanged to
// innerproduct_transform_kernel_fp16s_neon, which fills weight_data_tm from
// weight_data.
// NOTE(review): the callee is presumably defined in one of the headers
// included inside this namespace - confirm.
void innerproduct_transform_kernel_fp16s_neon_asimdhp(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt)
{
    innerproduct_transform_kernel_fp16s_neon(weight_data, weight_data_tm, num_input, num_output, opt);
}

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int InnerProduct_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int num_input = weight_data_size / num_output;

    if (bottom_blob.dims == 2 && bottom_blob.w == num_input)
    {
        // gemm
        int h = bottom_blob.h;
        size_t elemsize = bottom_blob.elemsize;
        int elempack = bottom_blob.elempack;

        top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        int num_output_elempack = 1;
        if (opt.use_packing_layout)
        {
            num_output_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int j = 0; j < h; j++)
        {
            if (elempack == 8 && num_output_elempack == 8)
            {
                __fp16* outptr = top_blob.row<__fp16>(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * 8;
                    const __fp16* m = bottom_blob.row<const __fp16>(j);

                    float16x8_t _sum0 = vdupq_n_f16((__fp16)0.f);
                    float16x8_t _sum1 = vdupq_n_f16((__fp16)0.f);
                    float16x8_t _sum2 = vdupq_n_f16((__fp16)0.f);
                    float16x8_t _sum3 = vdupq_n_f16((__fp16)0.f);
                    float16x8_t _sum4 = vdupq_n_f16((__fp16)0.f);
                    float16x8_t _sum5 = vdupq_n_f16((__fp16)0.f);
                    float16x8_t _sum6 = vdupq_n_f16((__fp16)0.f);
                    float16x8_t _sum7 = vdupq_n_f16((__fp16)0.f);

                    if (bias_term)
                    {
                        _sum0 = vdupq_n_f16(((const __fp16*)bias_data_fp16)[p * 8 + 0]);
                        _sum1 = vdupq_n_f16(((const __fp16*)bias_data_fp16)[p * 8 + 1]);
                        _sum2 = vdupq_n_f16(((const __fp16*)bias_data_fp16)[p * 8 + 2]);
                        _sum3 = vdupq_n_f16(((const __fp16*)bias_data_fp16)[p * 8 + 3]);
                        _sum4 = vdupq_n_f16(((const __fp16*)bias_data_fp16)[p * 8 + 4]);
                        _sum5 = vdupq_n_f16(((const __fp16*)bias_data_fp16)[p * 8 + 5]);
                        _sum6 = vdupq_n_f16(((const __fp16*)bias_data_fp16)[p * 8 + 6]);
                        _sum7 = vdupq_n_f16(((const __fp16*)bias_data_fp16)[p * 8 + 7]);
                    }

                    for (int i = 0; i < num_input; i++)
                    {
                        float16x8_t _val = vld1q_f16(m);
                        float16x8_t _k = vld1q_f16(kptr);
                        _sum0 = vfmaq_laneq_f16(_sum0, _val, _k, 0);
                        _sum1 = vfmaq_laneq_f16(_sum1, _val, _k, 1);
                        _sum2 = vfmaq_laneq_f16(_sum2, _val, _k, 2);
                        _sum3 = vfmaq_laneq_f16(_sum3, _val, _k, 3);
                        _sum4 = vfmaq_laneq_f16(_sum4, _val, _k, 4);
                        _sum5 = vfmaq_laneq_f16(_sum5, _val, _k, 5);
                        _sum6 = vfmaq_laneq_f16(_sum6, _val, _k, 6);
                        _sum7 = vfmaq_laneq_f16(_sum7, _val, _k, 7);

                        m += 8;
                        kptr += 8;
                    }

                    _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
                    _sum1 = activation_ps_f16(_sum1, activation_type, activation_params);
                    _sum2 = activation_ps_f16(_sum2, activation_type, activation_params);
                    _sum3 = activation_ps_f16(_sum3, activation_type, activation_params);
                    _sum4 = activation_ps_f16(_sum4, activation_type, activation_params);
                    _sum5 = activation_ps_f16(_sum5, activation_type, activation_params);
                    _sum6 = activation_ps_f16(_sum6, activation_type, activation_params);
                    _sum7 = activation_ps_f16(_sum7, activation_type, activation_params);

                    vst1q_f16(outptr, _sum0);
                    vst1q_f16(outptr + 8, _sum1);
                    vst1q_f16(outptr + 16, _sum2);
                    vst1q_f16(outptr + 24, _sum3);
                    vst1q_f16(outptr + 32, _sum4);
                    vst1q_f16(outptr + 40, _sum5);
                    vst1q_f16(outptr + 48, _sum6);
                    vst1q_f16(outptr + 56, _sum7);
                    outptr += 64;
                }
            }

            if (elempack == 1 && num_output_elempack == 8)
            {
                __fp16* outptr = top_blob.row<__fp16>(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * 8;
                    const __fp16* m = bottom_blob.row<const __fp16>(j);

                    float16x8_t _sum = vdupq_n_f16(0.f);

                    if (bias_term)
                    {
                        _sum = vld1q_f16((const __fp16*)bias_data_fp16 + p * 8);
                    }

                    for (int i = 0; i < num_input; i++)
                    {
                        float16x8_t _val = vdupq_n_f16(m[0]);
                        float16x8_t _k = vld1q_f16(kptr);
                        _sum = vfmaq_f16(_sum, _val, _k);

                        m += 1;
                        kptr += 8;
                    }

                    _sum = activation_ps_f16(_sum, activation_type, activation_params);

                    vst1q_f16(outptr, _sum);
                    outptr += 8;
                }
            }

            if (elempack == 4 && num_output_elempack == 8)
            {
                __fp16* outptr = top_blob.row<__fp16>(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * 8;
                    const __fp16* m = bottom_blob.row<const __fp16>(j);

                    float16x4_t _sum0 = vdup_n_f16(0.f);
                    float16x4_t _sum1 = vdup_n_f16(0.f);
                    float16x4_t _sum2 = vdup_n_f16(0.f);
                    float16x4_t _sum3 = vdup_n_f16(0.f);
                    float16x4_t _sum4 = vdup_n_f16(0.f);
                    float16x4_t _sum5 = vdup_n_f16(0.f);
                    float16x4_t _sum6 = vdup_n_f16(0.f);
                    float16x4_t _sum7 = vdup_n_f16(0.f);

                    if (bias_term)
                    {
                        _sum0 = vdup_n_f16(((const __fp16*)bias_data_fp16)[p * 8 + 0]);
                        _sum1 = vdup_n_f16(((const __fp16*)bias_data_fp16)[p * 8 + 1]);
                        _sum2 = vdup_n_f16(((const __fp16*)bias_data_fp16)[p * 8 + 2]);
                        _sum3 = vdup_n_f16(((const __fp16*)bias_data_fp16)[p * 8 + 3]);
                        _sum4 = vdup_n_f16(((const __fp16*)bias_data_fp16)[p * 8 + 4]);
                        _sum5 = vdup_n_f16(((const __fp16*)bias_data_fp16)[p * 8 + 5]);
                        _sum6 = vdup_n_f16(((const __fp16*)bias_data_fp16)[p * 8 + 6]);
                        _sum7 = vdup_n_f16(((const __fp16*)bias_data_fp16)[p * 8 + 7]);
                    }

                    for (int i = 0; i < num_input; i++)
                    {
                        float16x4_t _val = vld1_f16(m);
                        float16x8_t _k = vld1q_f16(kptr);
                        _sum0 = vfma_laneq_f16(_sum0, _val, _k, 0);
                        _sum1 = vfma_laneq_f16(_sum1, _val, _k, 1);
                        _sum2 = vfma_laneq_f16(_sum2, _val, _k, 2);
                        _sum3 = vfma_laneq_f16(_sum3, _val, _k, 3);
                        _sum4 = vfma_laneq_f16(_sum4, _val, _k, 4);
                        _sum5 = vfma_laneq_f16(_sum5, _val, _k, 5);
                        _sum6 = vfma_laneq_f16(_sum6, _val, _k, 6);
                        _sum7 = vfma_laneq_f16(_sum7, _val, _k, 7);

                        m += 4;
                        kptr += 8;
                    }

                    _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
                    _sum1 = activation_ps_f16(_sum1, activation_type, activation_params);
                    _sum2 = activation_ps_f16(_sum2, activation_type, activation_params);
                    _sum3 = activation_ps_f16(_sum3, activation_type, activation_params);
                    _sum4 = activation_ps_f16(_sum4, activation_type, activation_params);
                    _sum5 = activation_ps_f16(_sum5, activation_type, activation_params);
                    _sum6 = activation_ps_f16(_sum6, activation_type, activation_params);
                    _sum7 = activation_ps_f16(_sum7, activation_type, activation_params);

                    vst1_f16(outptr, _sum0);
                    vst1_f16(outptr + 4, _sum1);
                    vst1_f16(outptr + 8, _sum2);
                    vst1_f16(outptr + 12, _sum3);
                    vst1_f16(outptr + 16, _sum4);
                    vst1_f16(outptr + 20, _sum5);
                    vst1_f16(outptr + 24, _sum6);
                    vst1_f16(outptr + 28, _sum7);
                    outptr += 32;
                }
            }

            if (elempack == 8 && num_output_elempack == 1)
            {
                __fp16* outptr = top_blob.row<__fp16>(j);

                for (int p = 0; p < num_output; p++)
                {
                    const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p;
                    const __fp16* m = bottom_blob.row<const __fp16>(j);

                    float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                    if (bias_term)
                    {
                        _sum = vdupq_n_f16(((const __fp16*)bias_data_fp16)[p]);
                    }

                    for (int i = 0; i < num_input; i++)
                    {
                        float16x8_t _val = vld1q_f16(m);
                        float16x8_t _k = vdupq_n_f16(kptr[0]);
                        _sum = vfmaq_f16(_sum, _val, _k);

                        m += 8;
                        kptr += 1;
                    }

                    _sum = activation_ps_f16(_sum, activation_type, activation_params);

                    vst1q_f16(outptr, _sum);
                    outptr += 8;
                }
            }

            if (elempack == 8 && num_output_elempack == 4)
            {
                __fp16* outptr = top_blob.row<__fp16>(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * 4;
                    const __fp16* m = bottom_blob.row<const __fp16>(j);

                    float16x8_t _sum0 = vdupq_n_f16((__fp16)0.f);
                    float16x8_t _sum1 = vdupq_n_f16((__fp16)0.f);
                    float16x8_t _sum2 = vdupq_n_f16((__fp16)0.f);
                    float16x8_t _sum3 = vdupq_n_f16((__fp16)0.f);

                    if (bias_term)
                    {
                        _sum0 = vdupq_n_f16(((const __fp16*)bias_data_fp16)[p * 4 + 0]);
                        _sum1 = vdupq_n_f16(((const __fp16*)bias_data_fp16)[p * 4 + 1]);
                        _sum2 = vdupq_n_f16(((const __fp16*)bias_data_fp16)[p * 4 + 2]);
                        _sum3 = vdupq_n_f16(((const __fp16*)bias_data_fp16)[p * 4 + 3]);
                    }

                    for (int i = 0; i < num_input; i++)
                    {
                        float16x8_t _val = vld1q_f16(m);
                        float16x4_t _k = vld1_f16(kptr);
                        _sum0 = vfmaq_lane_f16(_sum0, _val, _k, 0);
                        _sum1 = vfmaq_lane_f16(_sum1, _val, _k, 1);
                        _sum2 = vfmaq_lane_f16(_sum2, _val, _k, 2);
                        _sum3 = vfmaq_lane_f16(_sum3, _val, _k, 3);

                        m += 8;
                        kptr += 4;
                    }

                    _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
                    _sum1 = activation_ps_f16(_sum1, activation_type, activation_params);
                    _sum2 = activation_ps_f16(_sum2, activation_type, activation_params);
                    _sum3 = activation_ps_f16(_sum3, activation_type, activation_params);

                    vst1q_f16(outptr, _sum0);
                    vst1q_f16(outptr + 8, _sum1);
                    vst1q_f16(outptr + 16, _sum2);
                    vst1q_f16(outptr + 24, _sum3);
                    outptr += 32;
                }
            }

            if (elempack == 4 && num_output_elempack == 4)
            {
                __fp16* outptr = top_blob.row<__fp16>(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * 4;
                    const __fp16* m = bottom_blob.row<const __fp16>(j);

                    float16x4_t _sum0 = vdup_n_f16(0.f);
                    float16x4_t _sum1 = vdup_n_f16(0.f);
                    float16x4_t _sum2 = vdup_n_f16(0.f);
                    float16x4_t _sum3 = vdup_n_f16(0.f);

                    if (bias_term)
                    {
                        _sum0 = vdup_n_f16(((const __fp16*)bias_data_fp16)[p * 4 + 0]);
                        _sum1 = vdup_n_f16(((const __fp16*)bias_data_fp16)[p * 4 + 1]);
                        _sum2 = vdup_n_f16(((const __fp16*)bias_data_fp16)[p * 4 + 2]);
                        _sum3 = vdup_n_f16(((const __fp16*)bias_data_fp16)[p * 4 + 3]);
                    }

                    for (int i = 0; i < num_input; i++)
                    {
                        float16x4_t _val = vld1_f16(m);
                        float16x4_t _k = vld1_f16(kptr);
                        _sum0 = vfma_lane_f16(_sum0, _val, _k, 0);
                        _sum1 = vfma_lane_f16(_sum1, _val, _k, 1);
                        _sum2 = vfma_lane_f16(_sum2, _val, _k, 2);
                        _sum3 = vfma_lane_f16(_sum3, _val, _k, 3);

                        m += 4;
                        kptr += 4;
                    }

                    _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);
                    _sum1 = activation_ps_f16(_sum1, activation_type, activation_params);
                    _sum2 = activation_ps_f16(_sum2, activation_type, activation_params);
                    _sum3 = activation_ps_f16(_sum3, activation_type, activation_params);

                    vst1_f16(outptr, _sum0);
                    vst1_f16(outptr + 4, _sum1);
                    vst1_f16(outptr + 8, _sum2);
                    vst1_f16(outptr + 12, _sum3);
                    outptr += 16;
                }
            }

            if (elempack == 1 && num_output_elempack == 4)
            {
                __fp16* outptr = top_blob.row<__fp16>(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p * 4;
                    const __fp16* m = bottom_blob.row<const __fp16>(j);

                    float16x4_t _sum = vdup_n_f16(0.f);

                    if (bias_term)
                    {
                        _sum = vld1_f16((const __fp16*)bias_data_fp16 + p * 4);
                    }

                    for (int i = 0; i < num_input; i++)
                    {
                        float16x4_t _val = vdup_n_f16(m[0]);
                        float16x4_t _k = vld1_f16(kptr);
                        _sum = vfma_f16(_sum, _val, _k);

                        m += 1;
                        kptr += 4;
                    }

                    _sum = activation_ps_f16(_sum, activation_type, activation_params);

                    vst1_f16(outptr, _sum);
                    outptr += 4;
                }
            }

            if (elempack == 4 && num_output_elempack == 1)
            {
                __fp16* outptr = top_blob.row<__fp16>(j);

                for (int p = 0; p < num_output; p++)
                {
                    const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p;
                    const __fp16* m = bottom_blob.row<const __fp16>(j);

                    float16x4_t _sum = vdup_n_f16(0.f);

                    if (bias_term)
                    {
                        _sum = vdup_n_f16(((const __fp16*)bias_data_fp16)[p]);
                    }

                    for (int i = 0; i < num_input; i++)
                    {
                        float16x4_t _val = vld1_f16(m);
                        float16x4_t _k = vdup_n_f16(kptr[0]);
                        _sum = vfma_f16(_sum, _val, _k);

                        m += 4;
                        kptr += 1;
                    }

                    _sum = activation_ps_f16(_sum, activation_type, activation_params);

                    vst1_f16(outptr, _sum);
                    outptr += 4;
                }
            }

            if (elempack == 1 && num_output_elempack == 1)
            {
                __fp16* outptr = top_blob.row<__fp16>(j);

                for (int p = 0; p < num_output; p++)
                {
                    const __fp16* kptr = (const __fp16*)weight_data_tm + num_input * p;
                    const __fp16* m = bottom_blob.row<const __fp16>(j);

                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    for (int i = 0; i < num_input; i++)
                    {
                        sum += (float)(*m * *kptr);

                        m += 1;
                        kptr += 1;
                    }

                    sum = activation_ss_f16(sum, activation_type, activation_params);

                    outptr[0] = (__fp16)sum;
                    outptr += 1;
                }
            }
        }

        return 0;
    }

    // flatten
    Mat bottom_blob_flattened = bottom_blob;
    if (bottom_blob.dims != 1)
    {
        Option opt_flatten = opt;
        opt_flatten.blob_allocator = opt.workspace_allocator;

        flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
        if (bottom_blob_flattened.empty())
            return -100;
    }

    size_t elemsize = bottom_blob_flattened.elemsize;
    int elempack = bottom_blob_flattened.elempack;

    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
    }
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (out_elempack == 8)
    {
        // num_output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            float16x8_t _sum0 = vdupq_n_f16(0.f);
            float16x8_t _sum1 = vdupq_n_f16(0.f);
            float16x8_t _sum2 = vdupq_n_f16(0.f);
            float16x8_t _sum3 = vdupq_n_f16(0.f);
            float16x8_t _sum4 = vdupq_n_f16(0.f);
            float16x8_t _sum5 = vdupq_n_f16(0.f);
            float16x8_t _sum6 = vdupq_n_f16(0.f);
            float16x8_t _sum7 = vdupq_n_f16(0.f);

            if (bias_term)
            {
                _sum0 = vld1q_f16((const __fp16*)bias_data_fp16 + p * 8);
            }

            const __fp16* kptr = weight_data_tm.row<const __fp16>(p);

            const __fp16* sptr = bottom_blob_flattened;

            int i = 0;
#if NCNN_GNU_INLINE_ASM
            for (; i + 7 < num_input; i += 8)
            {
                asm volatile(
                    "prfm   pldl1keep, [%8, #128]       \n"
                    "ld1    {v0.8h}, [%8], #16          \n" // _val

                    "prfm   pldl1keep, [%9, #512]       \n"
                    "ld1    {v8.8h, v9.8h, v10.8h, v11.8h}, [%9], #64 \n" // w0123

                    "prfm   pldl1keep, [%9, #512]       \n"
                    "ld1    {v12.8h, v13.8h, v14.8h, v15.8h}, [%9], #64 \n" // w4567

                    "fmla   %0.8h, v8.8h, v0.h[0]       \n"
                    "fmla   %1.8h, v9.8h, v0.h[1]       \n"
                    "fmla   %2.8h, v10.8h, v0.h[2]      \n"
                    "fmla   %3.8h, v11.8h, v0.h[3]      \n"
                    "fmla   %4.8h, v12.8h, v0.h[4]      \n"
                    "fmla   %5.8h, v13.8h, v0.h[5]      \n"
                    "fmla   %6.8h, v14.8h, v0.h[6]      \n"
                    "fmla   %7.8h, v15.8h, v0.h[7]      \n"

                    : "=w"(_sum0), // %0
                    "=w"(_sum1), // %1
                    "=w"(_sum2), // %2
                    "=w"(_sum3), // %3
                    "=w"(_sum4), // %4
                    "=w"(_sum5), // %5
                    "=w"(_sum6), // %6
                    "=w"(_sum7), // %7
                    "=r"(sptr),  // %8
                    "=r"(kptr)   // %9
                    : "0"(_sum0),
                    "1"(_sum1),
                    "2"(_sum2),
                    "3"(_sum3),
                    "4"(_sum4),
                    "5"(_sum5),
                    "6"(_sum6),
                    "7"(_sum7),
                    "8"(sptr),
                    "9"(kptr)
                    : "cc", "memory", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
            }
            for (; i + 3 < num_input; i += 4)
            {
                asm volatile(
                    "prfm   pldl1keep, [%4, #128]       \n"
                    "ld1    {v0.4h}, [%4], #8           \n" // _val

                    "prfm   pldl1keep, [%5, #512]       \n"
                    "ld1    {v8.8h, v9.8h, v10.8h, v11.8h}, [%5], #64 \n" // w0123

                    "fmla   %0.8h, v8.8h, v0.h[0]       \n"
                    "fmla   %1.8h, v9.8h, v0.h[1]       \n"
                    "fmla   %2.8h, v10.8h, v0.h[2]      \n"
                    "fmla   %3.8h, v11.8h, v0.h[3]      \n"

                    : "=w"(_sum0), // %0
                    "=w"(_sum1), // %1
                    "=w"(_sum2), // %2
                    "=w"(_sum3), // %3
                    "=r"(sptr),  // %4
                    "=r"(kptr)   // %5
                    : "0"(_sum0),
                    "1"(_sum1),
                    "2"(_sum2),
                    "3"(_sum3),
                    "4"(sptr),
                    "5"(kptr)
                    : "cc", "memory", "v0", "v8", "v9", "v10", "v11");
            }
#endif // NCNN_GNU_INLINE_ASM
            for (; i < num_input; i++)
            {
                float16x8_t _val = vdupq_n_f16(sptr[0]);

                float16x8_t _w = vld1q_f16(kptr);

                _sum0 = vfmaq_f16(_sum0, _val, _w);

                sptr += 1;
                kptr += 8;
            }

            _sum0 = vaddq_f16(_sum0, _sum1);
            _sum2 = vaddq_f16(_sum2, _sum3);
            _sum4 = vaddq_f16(_sum4, _sum5);
            _sum6 = vaddq_f16(_sum6, _sum7);
            _sum0 = vaddq_f16(_sum0, _sum2);
            _sum4 = vaddq_f16(_sum4, _sum6);
            _sum0 = vaddq_f16(_sum0, _sum4);

            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);

            __fp16* outptr = (__fp16*)top_blob;
            vst1q_f16(outptr + p * 8, _sum0);
        }
    }

    if (out_elempack == 4)
    {
        // num_output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            float16x4_t _sum0 = vdup_n_f16(0.f);
            float16x4_t _sum1 = vdup_n_f16(0.f);
            float16x4_t _sum2 = vdup_n_f16(0.f);
            float16x4_t _sum3 = vdup_n_f16(0.f);
            float16x4_t _sum4 = vdup_n_f16(0.f);
            float16x4_t _sum5 = vdup_n_f16(0.f);
            float16x4_t _sum6 = vdup_n_f16(0.f);
            float16x4_t _sum7 = vdup_n_f16(0.f);

            if (bias_term)
            {
                _sum0 = vld1_f16((const __fp16*)bias_data_fp16 + p * 4);
            }

            const __fp16* kptr = weight_data_tm.row<const __fp16>(p);

            const __fp16* sptr = bottom_blob_flattened;

            int i = 0;
#if NCNN_GNU_INLINE_ASM
            for (; i + 7 < num_input; i += 8)
            {
                asm volatile(
                    "prfm   pldl1keep, [%8, #128]       \n"
                    "ld1    {v0.8h}, [%8], #16          \n" // _val

                    "prfm   pldl1keep, [%9, #256]       \n"
                    "ld1    {v8.4h, v9.4h, v10.4h, v11.4h}, [%9], #32 \n" // w0123

                    "prfm   pldl1keep, [%9, #256]       \n"
                    "ld1    {v12.4h, v13.4h, v14.4h, v15.4h}, [%9], #32 \n" // w4567

                    "fmla   %0.4h, v8.4h, v0.h[0]       \n"
                    "fmla   %1.4h, v9.4h, v0.h[1]       \n"
                    "fmla   %2.4h, v10.4h, v0.h[2]      \n"
                    "fmla   %3.4h, v11.4h, v0.h[3]      \n"
                    "fmla   %4.4h, v12.4h, v0.h[4]      \n"
                    "fmla   %5.4h, v13.4h, v0.h[5]      \n"
                    "fmla   %6.4h, v14.4h, v0.h[6]      \n"
                    "fmla   %7.4h, v15.4h, v0.h[7]      \n"

                    : "=w"(_sum0), // %0
                    "=w"(_sum1), // %1
                    "=w"(_sum2), // %2
                    "=w"(_sum3), // %3
                    "=w"(_sum4), // %4
                    "=w"(_sum5), // %5
                    "=w"(_sum6), // %6
                    "=w"(_sum7), // %7
                    "=r"(sptr),  // %8
                    "=r"(kptr)   // %9
                    : "0"(_sum0),
                    "1"(_sum1),
                    "2"(_sum2),
                    "3"(_sum3),
                    "4"(_sum4),
                    "5"(_sum5),
                    "6"(_sum6),
                    "7"(_sum7),
                    "8"(sptr),
                    "9"(kptr)
                    : "cc", "memory", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
            }
            for (; i + 3 < num_input; i += 4)
            {
                asm volatile(
                    "prfm   pldl1keep, [%4, #128]       \n"
                    "ld1    {v0.4h}, [%4], #8           \n" // _val

                    "prfm   pldl1keep, [%5, #256]       \n"
                    "ld1    {v8.4h, v9.4h, v10.4h, v11.4h}, [%5], #32 \n" // w0123

                    "fmla   %0.4h, v8.4h, v0.h[0]       \n"
                    "fmla   %1.4h, v9.4h, v0.h[1]       \n"
                    "fmla   %2.4h, v10.4h, v0.h[2]      \n"
                    "fmla   %3.4h, v11.4h, v0.h[3]      \n"

                    : "=w"(_sum0), // %0
                    "=w"(_sum1), // %1
                    "=w"(_sum2), // %2
                    "=w"(_sum3), // %3
                    "=r"(sptr),  // %4
                    "=r"(kptr)   // %5
                    : "0"(_sum0),
                    "1"(_sum1),
                    "2"(_sum2),
                    "3"(_sum3),
                    "4"(sptr),
                    "5"(kptr)
                    : "cc", "memory", "v0", "v8", "v9", "v10", "v11");
            }
#endif // NCNN_GNU_INLINE_ASM
            for (; i < num_input; i++)
            {
                float16x4_t _val = vdup_n_f16(sptr[0]);

                float16x4_t _w = vld1_f16(kptr);

                _sum0 = vfma_f16(_sum0, _val, _w);

                sptr += 1;
                kptr += 4;
            }

            _sum0 = vadd_f16(_sum0, _sum1);
            _sum2 = vadd_f16(_sum2, _sum3);
            _sum4 = vadd_f16(_sum4, _sum5);
            _sum6 = vadd_f16(_sum6, _sum7);
            _sum0 = vadd_f16(_sum0, _sum2);
            _sum4 = vadd_f16(_sum4, _sum6);
            _sum0 = vadd_f16(_sum0, _sum4);

            _sum0 = activation_ps_f16(_sum0, activation_type, activation_params);

            __fp16* outptr = (__fp16*)top_blob;
            vst1_f16(outptr + p * 4, _sum0);
        }
    }

    if (out_elempack == 1)
    {
        // num_output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output; p++)
        {
            float sum = 0.f;

            if (bias_term)
                sum = bias_data[p];

            const __fp16* kptr = weight_data_tm.row<__fp16>(p);

            const __fp16* sptr = bottom_blob_flattened;

            float16x8_t _sum = vdupq_n_f16(0.f);
            int i = 0;
            for (; i + 7 < num_input; i += 8)
            {
                float16x8_t _m = vld1q_f16(sptr);
                float16x8_t _w = vld1q_f16(kptr);

                _sum = vfmaq_f16(_sum, _m, _w);

                sptr += 8;
                kptr += 8;
            }
            for (; i < num_input; i++)
            {
                __fp16 v = *sptr;
                __fp16 k = *kptr;

                sum += (float)(v * k);

                sptr++;
                kptr++;
            }

            float16x4_t _s4 = vadd_f16(vget_low_f16(_sum), vget_high_f16(_sum));
            sum += vaddvq_f32(vcvt_f32_f16(_s4)); // dot

            sum = activation_ss_f16(sum, activation_type, activation_params);

            __fp16* outptr = (__fp16*)top_blob;
            outptr[p] = (__fp16)sum;
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/innerproduct_arm_vfpv4.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "innerproduct_arm.h"

#include "cpu.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

namespace ncnn {

#include "innerproduct_fp16s.h"
#include "innerproduct_gemm_fp16s.h"

int InnerProduct_arm::create_pipeline_fp16s(const Option& opt)
{
    const int num_input = weight_data_size / num_output;

    innerproduct_transform_kernel_fp16s_neon(weight_data, weight_data_tm, num_input, num_output, opt);

#if NCNN_ARM82
    if (ncnn::cpu_support_arm_asimdhp() && opt.use_fp16_arithmetic)
    {
        ncnn::cast_float32_to_float16(bias_data, bias_data_fp16, opt);
    }
#endif

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int InnerProduct_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int num_input = weight_data_size / num_output;

    if (bottom_blob.dims == 2 && bottom_blob.w == num_input)
    {
        // gemm
        int h = bottom_blob.h;
        size_t elemsize = bottom_blob.elemsize;
        int elempack = bottom_blob.elempack;

        top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        innerproduct_gemm_fp16s_neon(bottom_blob, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt);

        return 0;
    }

    // flatten
    Mat bottom_blob_flattened = bottom_blob;
    if (bottom_blob.dims != 1)
    {
        Option opt_flatten = opt;
        opt_flatten.blob_allocator = opt.workspace_allocator;

        flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
        if (bottom_blob_flattened.empty())
            return -100;
    }

    size_t elemsize = bottom_blob_flattened.elemsize;
    int elempack = bottom_blob_flattened.elempack;

    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (out_elempack == 4)
    {
        innerproduct_pack4_fp16s_neon(bottom_blob_flattened, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt);
    }

    if (out_elempack == 1)
    {
        innerproduct_fp16s_neon(bottom_blob_flattened, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt);
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/innerproduct_fp16s.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Prototypes of the "_asimdfhm" variants of the kernels in this header.
// They are declared only when runtime cpu dispatch is enabled on aarch64 and
// this translation unit is itself built without __ARM_FEATURE_FP16_FML; the
// generic entry points below call them when ncnn::cpu_support_arm_asimdfhm()
// reports support.
// NOTE(review): the definitions are not visible in this header - presumably
// they live in a source file compiled with fp16fml enabled; confirm.
#if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
void innerproduct_pack4_fp16s_neon_asimdfhm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
void innerproduct_fp16s_neon_asimdfhm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
void innerproduct_transform_kernel_fp16s_neon_asimdfhm(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt);
#endif

// Prototypes of the "_asimdhp" variants, declared only when this translation
// unit is built with neither fp16 vector arithmetic nor fp16fml; the generic
// entry points below call them when ncnn::cpu_support_arm_asimdhp() reports
// support.
#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
void innerproduct_pack4_fp16s_neon_asimdhp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
void innerproduct_fp16s_neon_asimdhp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
void innerproduct_transform_kernel_fp16s_neon_asimdhp(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt);
#endif

// Inner product kernel for pack-4 output with fp16-stored weights.
//
// bottom_blob      input vector; read as __fp16 when built with
//                  __ARM_FEATURE_FP16_VECTOR_ARITHMETIC, as float otherwise
// top_blob         pre-created output; top_blob.w is the number of 4-lane
//                  output groups, each stored at outptr + p * 4
// weight_data_fp16 fp16 weights; row p holds 4 consecutive weights (one per
//                  output lane) for every input element
// bias_data        optional fp32 bias, 4 values per output group (may be empty)
//
// All products are accumulated in fp32. The activation selected by
// activation_type / activation_params is applied before the result is stored.
static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt)
{
    // runtime dispatch: hand over to the fp16fml variant when this unit was
    // built without it but the running cpu reports support
#if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
    if (ncnn::cpu_support_arm_asimdfhm())
    {
        innerproduct_pack4_fp16s_neon_asimdfhm(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
        return;
    }
#endif

    // runtime dispatch: same idea for the asimdhp variant
#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
    if (ncnn::cpu_support_arm_asimdhp())
    {
        innerproduct_pack4_fp16s_neon_asimdhp(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
        return;
    }
#endif

    // total number of scalar inputs, regardless of how the input is packed
    const int num_input = bottom_blob.w * bottom_blob.elempack;
    // number of 4-lane output groups
    const int num_output = top_blob.w;

    const float* bias_data_ptr = bias_data;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < num_output; p++)
    {
        // _sum0 starts from the bias (if any); _sum1.._sum3 are extra fp32
        // accumulators that are folded into _sum0 after the loops
        float32x4_t _sum0 = vdupq_n_f32(0.f);

        if (bias_data_ptr)
        {
            _sum0 = vld1q_f32(bias_data_ptr + p * 4);
        }

        float32x4_t _sum1 = vdupq_n_f32(0.f);
        float32x4_t _sum2 = vdupq_n_f32(0.f);
        float32x4_t _sum3 = vdupq_n_f32(0.f);

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        const __fp16* sptr = bottom_blob;
        const __fp16* kptr = weight_data_fp16.row<const __fp16>(p);
#else
        const float* sptr = bottom_blob;
        const unsigned short* kptr = weight_data_fp16.row<const unsigned short>(p);
#endif

        int i = 0;
#if NCNN_GNU_INLINE_ASM
        // 8 inputs per iteration: each asm variant loads 8 inputs and 32 fp16
        // weights, post-increments both pointers itself, and spreads the 8
        // lane-multiplies over the four accumulators
        for (; i + 7 < num_input; i += 8)
        {
#if __aarch64__
#if __ARM_FEATURE_FP16_FML
            // fp16 input and weights, widening multiply-accumulate into fp32
            asm volatile(
                "prfm   pldl1keep, [%0, #128]       \n"
                "ld1    {v0.8h}, [%0], #16          \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n"
                "fmlal  %2.4s, v2.4h, v0.h[0]       \n"
                "fmlal2 %3.4s, v2.4h, v0.h[1]       \n"
                "fmlal  %4.4s, v3.4h, v0.h[2]       \n"
                "fmlal2 %5.4s, v3.4h, v0.h[3]       \n"
                "fmlal  %2.4s, v4.4h, v0.h[4]       \n"
                "fmlal2 %3.4s, v4.4h, v0.h[5]       \n"
                "fmlal  %4.4s, v5.4h, v0.h[6]       \n"
                "fmlal2 %5.4s, v5.4h, v0.h[7]       \n"
                : "=r"(sptr),  // %0
                "=r"(kptr),  // %1
                "=w"(_sum0), // %2
                "=w"(_sum1), // %3
                "=w"(_sum2), // %4
                "=w"(_sum3)  // %5
                : "0"(sptr),
                "1"(kptr),
                "2"(_sum0),
                "3"(_sum1),
                "4"(_sum2),
                "5"(_sum3)
                : "cc", "memory", "v0", "v2", "v3", "v4", "v5");
#elif __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            // fp16 input and weights, both converted to fp32 before the fmla
            asm volatile(
                "prfm   pldl1keep, [%0, #128]       \n"
                "ld1    {v1.8h}, [%0], #16          \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v6.8h, v7.8h, v8.8h, v9.8h}, [%1], #64 \n"
                "fcvtl  v0.4s, v1.4h                \n"
                "fcvtl2 v1.4s, v1.8h                \n"
                "fcvtl  v2.4s, v6.4h                \n"
                "fcvtl2 v3.4s, v6.8h                \n"
                "fcvtl  v4.4s, v7.4h                \n"
                "fcvtl2 v5.4s, v7.8h                \n"
                "fcvtl  v6.4s, v8.4h                \n"
                "fcvtl2 v7.4s, v8.8h                \n"
                "fcvtl  v8.4s, v9.4h                \n"
                "fcvtl2 v9.4s, v9.8h                \n"
                "fmla   %2.4s, v2.4s, v0.s[0]       \n"
                "fmla   %3.4s, v3.4s, v0.s[1]       \n"
                "fmla   %4.4s, v4.4s, v0.s[2]       \n"
                "fmla   %5.4s, v5.4s, v0.s[3]       \n"
                "fmla   %2.4s, v6.4s, v1.s[0]       \n"
                "fmla   %3.4s, v7.4s, v1.s[1]       \n"
                "fmla   %4.4s, v8.4s, v1.s[2]       \n"
                "fmla   %5.4s, v9.4s, v1.s[3]       \n"
                : "=r"(sptr),  // %0
                "=r"(kptr),  // %1
                "=w"(_sum0), // %2
                "=w"(_sum1), // %3
                "=w"(_sum2), // %4
                "=w"(_sum3)  // %5
                : "0"(sptr),
                "1"(kptr),
                "2"(_sum0),
                "3"(_sum1),
                "4"(_sum2),
                "5"(_sum3)
                : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9");
#else  // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            // fp32 input, fp16 weights converted to fp32 before the fmla
            asm volatile(
                "prfm   pldl1keep, [%0, #256]       \n"
                "ld1    {v0.4s, v1.4s}, [%0], #32   \n"
                "prfm   pldl1keep, [%1, #512]       \n"
                "ld1    {v6.8h, v7.8h, v8.8h, v9.8h}, [%1], #64 \n"
                "fcvtl  v2.4s, v6.4h                \n"
                "fcvtl2 v3.4s, v6.8h                \n"
                "fcvtl  v4.4s, v7.4h                \n"
                "fcvtl2 v5.4s, v7.8h                \n"
                "fcvtl  v6.4s, v8.4h                \n"
                "fcvtl2 v7.4s, v8.8h                \n"
                "fcvtl  v8.4s, v9.4h                \n"
                "fcvtl2 v9.4s, v9.8h                \n"
                "fmla   %2.4s, v2.4s, v0.s[0]       \n"
                "fmla   %3.4s, v3.4s, v0.s[1]       \n"
                "fmla   %4.4s, v4.4s, v0.s[2]       \n"
                "fmla   %5.4s, v5.4s, v0.s[3]       \n"
                "fmla   %2.4s, v6.4s, v1.s[0]       \n"
                "fmla   %3.4s, v7.4s, v1.s[1]       \n"
                "fmla   %4.4s, v8.4s, v1.s[2]       \n"
                "fmla   %5.4s, v9.4s, v1.s[3]       \n"
                : "=r"(sptr),  // %0
                "=r"(kptr),  // %1
                "=w"(_sum0), // %2
                "=w"(_sum1), // %3
                "=w"(_sum2), // %4
                "=w"(_sum3)  // %5
                : "0"(sptr),
                "1"(kptr),
                "2"(_sum0),
                "3"(_sum1),
                "4"(_sum2),
                "5"(_sum3)
                : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9");
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#else  // __aarch64__
            // 32-bit arm: fp32 input, fp16 weights loaded into d12-d19 and
            // converted to fp32 in q2-q9 before the vmla
            asm volatile(
                "pld        [%0, #256]          \n"
                "vld1.f32   {d0-d3}, [%0 :128]! \n"
                "pld        [%1, #512]          \n"
                "vldm       %1!, {d12-d19}      \n"
                "vcvt.f32.f16 q2, d12           \n"
                "vcvt.f32.f16 q3, d13           \n"
                "vcvt.f32.f16 q4, d14           \n"
                "vcvt.f32.f16 q5, d15           \n"
                "vcvt.f32.f16 q6, d16           \n"
                "vcvt.f32.f16 q7, d17           \n"
                "vcvt.f32.f16 q8, d18           \n"
                "vcvt.f32.f16 q9, d19           \n"
                "vmla.f32   %q2, q2, d0[0]      \n"
                "vmla.f32   %q3, q3, d0[1]      \n"
                "vmla.f32   %q4, q4, d1[0]      \n"
                "vmla.f32   %q5, q5, d1[1]      \n"
                "vmla.f32   %q2, q6, d2[0]      \n"
                "vmla.f32   %q3, q7, d2[1]      \n"
                "vmla.f32   %q4, q8, d3[0]      \n"
                "vmla.f32   %q5, q9, d3[1]      \n"
                : "=r"(sptr),  // %0
                "=r"(kptr),  // %1
                "=w"(_sum0), // %2
                "=w"(_sum1), // %3
                "=w"(_sum2), // %4
                "=w"(_sum3)  // %5
                : "0"(sptr),
                "1"(kptr),
                "2"(_sum0),
                "3"(_sum1),
                "4"(_sum2),
                "5"(_sum3)
                : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9");
#endif // __aarch64__
        }
#endif // NCNN_GNU_INLINE_ASM
        // 4 inputs per iteration: 16 weights are consumed, input lane n is
        // multiplied with the n-th group of 4 weights and added to _sum<n>
        for (; i + 3 < num_input; i += 4)
        {
#if __ARM_FEATURE_FP16_FML
            float16x4_t _val = vld1_f16(sptr);
            float16x8_t _w01 = vld1q_f16(kptr);
            float16x8_t _w23 = vld1q_f16(kptr + 8);

            _sum0 = vfmlalq_lane_low_f16(_sum0, _w01, _val, 0);
            _sum1 = vfmlalq_lane_high_f16(_sum1, _w01, _val, 1);
            _sum2 = vfmlalq_lane_low_f16(_sum2, _w23, _val, 2);
            _sum3 = vfmlalq_lane_high_f16(_sum3, _w23, _val, 3);
#else // __ARM_FEATURE_FP16_FML
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr));
            float16x8_t _w01 = vld1q_f16(kptr);
            float16x8_t _w23 = vld1q_f16(kptr + 8);
            float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w01));
            float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w01));
            float32x4_t _w2 = vcvt_f32_f16(vget_low_f16(_w23));
            float32x4_t _w3 = vcvt_f32_f16(vget_high_f16(_w23));
#else
            // weights are loaded as raw 16-bit words and reinterpreted as fp16
            float32x4_t _val = vld1q_f32(sptr);
            uint16x8_t _w01 = vld1q_u16(kptr);
            uint16x8_t _w23 = vld1q_u16(kptr + 8);
            float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01)));
            float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01)));
            float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w23)));
            float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w23)));
#endif

#if __aarch64__
            _sum0 = vfmaq_laneq_f32(_sum0, _w0, _val, 0);
            _sum1 = vfmaq_laneq_f32(_sum1, _w1, _val, 1);
            _sum2 = vfmaq_laneq_f32(_sum2, _w2, _val, 2);
            _sum3 = vfmaq_laneq_f32(_sum3, _w3, _val, 3);
#else
            _sum0 = vmlaq_lane_f32(_sum0, _w0, vget_low_f32(_val), 0);
            _sum1 = vmlaq_lane_f32(_sum1, _w1, vget_low_f32(_val), 1);
            _sum2 = vmlaq_lane_f32(_sum2, _w2, vget_high_f32(_val), 0);
            _sum3 = vmlaq_lane_f32(_sum3, _w3, vget_high_f32(_val), 1);
#endif
#endif // __ARM_FEATURE_FP16_FML

            sptr += 4;
            kptr += 16;
        }
        // leftover inputs, one at a time: broadcast the input and multiply it
        // with its 4 weights
        for (; i < num_input; i++)
        {
            float32x4_t _val = vdupq_n_f32((float)sptr[0]);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
#else
            float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
#endif
            _sum0 = vfmaq_f32(_sum0, _val, _w);

            sptr += 1;
            kptr += 4;
        }

        // fold the four partial accumulators into _sum0
        _sum0 = vaddq_f32(_sum0, _sum1);
        _sum2 = vaddq_f32(_sum2, _sum3);
        _sum0 = vaddq_f32(_sum0, _sum2);

        _sum0 = activation_ps(_sum0, activation_type, activation_params);

        // the output element type follows the input element type of this build
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        __fp16* outptr = (__fp16*)top_blob;
        vst1_f16(outptr + p * 4, vcvt_f16_f32(_sum0));
#else
        float* outptr = top_blob;
        vst1q_f32(outptr + p * 4, _sum0);
#endif
    }
}

static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
    if (ncnn::cpu_support_arm_asimdfhm())
    {
        innerproduct_fp16s_neon_asimdfhm(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
    if (ncnn::cpu_support_arm_asimdhp())
    {
        innerproduct_fp16s_neon_asimdhp(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
        return;
    }
#endif

    const int num_input = bottom_blob.w * bottom_blob.elempack;
    const int num_output = top_blob.w;

    const float* bias_data_ptr = bias_data;

    int nn_num_output = num_output >> 2;
    int remain_num_output_start = nn_num_output << 2;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_num_output; pp++)
    {
        int p = pp * 4;

        float sums[4] = {0.0f};
        if (bias_data_ptr)
        {
            sums[0] = bias_data_ptr[p];
            sums[1] = bias_data_ptr[p + 1];
            sums[2] = bias_data_ptr[p + 2];
            sums[3] = bias_data_ptr[p + 3];
        }

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        const __fp16* sptr = bottom_blob;
        const __fp16* kptr0 = weight_data_fp16.row<const __fp16>(p);
        const __fp16* kptr1 = weight_data_fp16.row<const __fp16>(p + 1);
        const __fp16* kptr2 = weight_data_fp16.row<const __fp16>(p + 2);
        const __fp16* kptr3 = weight_data_fp16.row<const __fp16>(p + 3);
#else
        const float* sptr = bottom_blob;
        const unsigned short* kptr0 = weight_data_fp16.row<const unsigned short>(p);
        const unsigned short* kptr1 = weight_data_fp16.row<const unsigned short>(p + 1);
        const unsigned short* kptr2 = weight_data_fp16.row<const unsigned short>(p + 2);
        const unsigned short* kptr3 = weight_data_fp16.row<const unsigned short>(p + 3);
#endif

        int i = 0;

        float32x4_t _sum0 = vdupq_n_f32(0.f);
        float32x4_t _sum1 = vdupq_n_f32(0.f);
        float32x4_t _sum2 = vdupq_n_f32(0.f);
        float32x4_t _sum3 = vdupq_n_f32(0.f);
        for (; i + 3 < num_input; i += 4)
        {
#if __ARM_FEATURE_FP16_FML
            float16x4_t _val = vld1_f16(sptr);
            float16x4_t _w0 = vld1_f16(kptr0);
            float16x4_t _w1 = vld1_f16(kptr1);
            float16x4_t _w2 = vld1_f16(kptr2);
            float16x4_t _w3 = vld1_f16(kptr3);
            float16x8_t _w01 = vcombine_f16(_w0, _w1);
            float16x8_t _w23 = vcombine_f16(_w2, _w3);
            float16x8_t _valval = vcombine_f16(_val, _val);

            _sum0 = vfmlalq_low_f16(_sum0, _w01, _valval);
            _sum1 = vfmlalq_high_f16(_sum1, _w01, _valval);
            _sum2 = vfmlalq_low_f16(_sum2, _w23, _valval);
            _sum3 = vfmlalq_high_f16(_sum3, _w23, _valval);
#else // __ARM_FEATURE_FP16_FML
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr));
            float32x4_t _w0 = vcvt_f32_f16(vld1_f16(kptr0));
            float32x4_t _w1 = vcvt_f32_f16(vld1_f16(kptr1));
            float32x4_t _w2 = vcvt_f32_f16(vld1_f16(kptr2));
            float32x4_t _w3 = vcvt_f32_f16(vld1_f16(kptr3));
#else
            float32x4_t _val = vld1q_f32(sptr);
            float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr0)));
            float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr1)));
            float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr2)));
            float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr3)));
#endif

            _sum0 = vfmaq_f32(_sum0, _val, _w0);
            _sum1 = vfmaq_f32(_sum1, _val, _w1);
            _sum2 = vfmaq_f32(_sum2, _val, _w2);
            _sum3 = vfmaq_f32(_sum3, _val, _w3);
#endif // __ARM_FEATURE_FP16_FML

            sptr += 4;
            kptr0 += 4;
            kptr1 += 4;
            kptr2 += 4;
            kptr3 += 4;
        }

#if __aarch64__
        sums[0] += vaddvq_f32(_sum0);
        sums[1] += vaddvq_f32(_sum1);
        sums[2] += vaddvq_f32(_sum2);
        sums[3] += vaddvq_f32(_sum3);
#else
        float32x2_t _sum0ss = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
        float32x2_t _sum1ss = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
        float32x2_t _sum2ss = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
        float32x2_t _sum3ss = vadd_f32(vget_low_f32(_sum3), vget_high_f32(_sum3));
        float32x2_t _sum01ss = vpadd_f32(_sum0ss, _sum1ss);
        float32x2_t _sum23ss = vpadd_f32(_sum2ss, _sum3ss);
        sums[0] += vget_lane_f32(_sum01ss, 0);
        sums[1] += vget_lane_f32(_sum01ss, 1);
        sums[2] += vget_lane_f32(_sum23ss, 0);
        sums[3] += vget_lane_f32(_sum23ss, 1);
#endif

        for (; i < num_input; i++)
        {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            sums[0] += (float)(*sptr) * (float)(*kptr0);
            sums[1] += (float)(*sptr) * (float)(*kptr1);
            sums[2] += (float)(*sptr) * (float)(*kptr2);
            sums[3] += (float)(*sptr) * (float)(*kptr3);
#else
            sums[0] += *sptr * float16_to_float32(*kptr0);
            sums[1] += *sptr * float16_to_float32(*kptr1);
            sums[2] += *sptr * float16_to_float32(*kptr2);
            sums[3] += *sptr * float16_to_float32(*kptr3);
#endif

            sptr++;
            kptr0++;
            kptr1++;
            kptr2++;
            kptr3++;
        }

        float32x4_t _sum = vld1q_f32(sums);

        _sum = activation_ps(_sum, activation_type, activation_params);

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        __fp16* outptr = (__fp16*)top_blob;
        vst1_f16(outptr + p, vcvt_f16_f32(_sum));
#else
        float* outptr = top_blob;
        vst1q_f32(outptr + p, _sum);
#endif
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_num_output_start; p < num_output; p++)
    {
        float sum = 0.f;

        if (bias_data_ptr)
            sum = bias_data_ptr[p];

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        const __fp16* sptr = bottom_blob;
        const __fp16* kptr = weight_data_fp16.row<const __fp16>(p);
#else
        const float* sptr = bottom_blob;
        const unsigned short* kptr = weight_data_fp16.row<const unsigned short>(p);
#endif

        int i = 0;

        float32x4_t _sum = vdupq_n_f32(0.f);
        for (; i + 3 < num_input; i += 4)
        {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr));
            float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
#else
            float32x4_t _val = vld1q_f32(sptr);
            float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
#endif
            _sum = vfmaq_f32(_sum, _val, _w);

            sptr += 4;
            kptr += 4;
        }
        for (; i < num_input; i++)
        {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            sum += (float)(*sptr) * (float)(*kptr);
#else
            sum += *sptr * float16_to_float32(*kptr);
#endif
            sptr++;
            kptr++;
        }

#if __aarch64__
        sum += vaddvq_f32(_sum);
#else
        float32x2_t _sumss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
        _sumss = vpadd_f32(_sumss, _sumss);
        sum += vget_lane_f32(_sumss, 0);
#endif // __aarch64__

        sum = activation_ss(sum, activation_type, activation_params);

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        __fp16* outptr = (__fp16*)top_blob;
        outptr[p] = (__fp16)sum;
#else
        float* outptr = top_blob;
        outptr[p] = sum;
#endif
    }
}

static void innerproduct_transform_kernel_fp16s_neon(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
    if (ncnn::cpu_support_arm_asimdfhm())
    {
        innerproduct_transform_kernel_fp16s_neon_asimdfhm(weight_data, weight_data_tm, num_input, num_output, opt);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
    if (ncnn::cpu_support_arm_asimdhp())
    {
        innerproduct_transform_kernel_fp16s_neon_asimdhp(weight_data, weight_data_tm, num_input, num_output, opt);
        return;
    }
#endif

    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        out_elempack = opt.use_fp16_arithmetic && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
#else
        out_elempack = num_output % 4 == 0 ? 4 : 1;
#endif
    }

    // src = inch-outch
    // dst = pb-inch-outch/pb
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    if (out_elempack == 8)
    {
        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

        weight_data_tm.create(num_input, num_output / 8, (size_t)16u, 8);

        for (int q = 0; q + 7 < num_output; q += 8)
        {
            unsigned short* g0 = weight_data_tm.row<unsigned short>(q / 8);

            const float* k0 = weight_data_r2.row(q);
            const float* k1 = weight_data_r2.row(q + 1);
            const float* k2 = weight_data_r2.row(q + 2);
            const float* k3 = weight_data_r2.row(q + 3);
            const float* k4 = weight_data_r2.row(q + 4);
            const float* k5 = weight_data_r2.row(q + 5);
            const float* k6 = weight_data_r2.row(q + 6);
            const float* k7 = weight_data_r2.row(q + 7);

            int p = 0;
#if NCNN_GNU_INLINE_ASM
            for (; p + 7 < num_input; p += 8)
            {
                // transpose 8x8
                asm volatile(
                    "ld1    {v0.4s, v1.4s}, [%0], #32   \n"
                    "ld1    {v2.4s, v3.4s}, [%1], #32   \n"
                    "ld1    {v4.4s, v5.4s}, [%2], #32   \n"
                    "ld1    {v6.4s, v7.4s}, [%3], #32   \n"
                    "ld1    {v8.4s, v9.4s}, [%4], #32   \n"
                    "ld1    {v10.4s, v11.4s}, [%5], #32 \n"
                    "ld1    {v12.4s, v13.4s}, [%6], #32 \n"
                    "ld1    {v14.4s, v15.4s}, [%7], #32 \n"

                    "fcvtn  v0.4h, v0.4s            \n"
                    "fcvtn2 v0.8h, v1.4s            \n"
                    "fcvtn  v1.4h, v2.4s            \n"
                    "fcvtn2 v1.8h, v3.4s            \n"
                    "fcvtn  v2.4h, v4.4s            \n"
                    "fcvtn2 v2.8h, v5.4s            \n"
                    "fcvtn  v3.4h, v6.4s            \n"
                    "fcvtn2 v3.8h, v7.4s            \n"
                    "fcvtn  v4.4h, v8.4s            \n"
                    "fcvtn2 v4.8h, v9.4s            \n"
                    "fcvtn  v5.4h, v10.4s           \n"
                    "fcvtn2 v5.8h, v11.4s           \n"
                    "fcvtn  v6.4h, v12.4s           \n"
                    "fcvtn2 v6.8h, v13.4s           \n"
                    "fcvtn  v7.4h, v14.4s           \n"
                    "fcvtn2 v7.8h, v15.4s           \n"

                    "zip1   v16.8h, v0.8h, v4.8h    \n"
                    "zip2   v20.8h, v0.8h, v4.8h    \n"
                    "zip1   v17.8h, v1.8h, v5.8h    \n"
                    "zip2   v21.8h, v1.8h, v5.8h    \n"
                    "zip1   v18.8h, v2.8h, v6.8h    \n"
                    "zip2   v22.8h, v2.8h, v6.8h    \n"
                    "zip1   v19.8h, v3.8h, v7.8h    \n"
                    "zip2   v23.8h, v3.8h, v7.8h    \n"

                    "st4    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"
                    "st4    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"
                    : "=r"(k0), // %0
                    "=r"(k1), // %1
                    "=r"(k2), // %2
                    "=r"(k3), // %3
                    "=r"(k4), // %4
                    "=r"(k5), // %5
                    "=r"(k6), // %6
                    "=r"(k7), // %7
                    "=r"(g0)  // %8
                    : "0"(k0),
                    "1"(k1),
                    "2"(k2),
                    "3"(k3),
                    "4"(k4),
                    "5"(k5),
                    "6"(k6),
                    "7"(k7),
                    "8"(g0)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
            }
#endif // NCNN_GNU_INLINE_ASM
            for (; p < num_input; p++)
            {
                g0[0] = float32_to_float16(*k0++);
                g0[1] = float32_to_float16(*k1++);
                g0[2] = float32_to_float16(*k2++);
                g0[3] = float32_to_float16(*k3++);
                g0[4] = float32_to_float16(*k4++);
                g0[5] = float32_to_float16(*k5++);
                g0[6] = float32_to_float16(*k6++);
                g0[7] = float32_to_float16(*k7++);
                g0 += 8;
            }
        }
    }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

    if (out_elempack == 4)
    {
        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

        weight_data_tm.create(num_input, num_output / 4, (size_t)8u, 4);

        for (int q = 0; q + 3 < num_output; q += 4)
        {
            unsigned short* g0 = weight_data_tm.row<unsigned short>(q / 4);

            const float* k0 = weight_data_r2.row(q);
            const float* k1 = weight_data_r2.row(q + 1);
            const float* k2 = weight_data_r2.row(q + 2);
            const float* k3 = weight_data_r2.row(q + 3);

            int p = 0;
            for (; p + 3 < num_input; p += 4)
            {
                // transpose 4x4
                uint16x4x4_t _p;
                _p.val[0] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k0)));
                _p.val[1] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k1)));
                _p.val[2] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k2)));
                _p.val[3] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k3)));
                vst4_u16(g0, _p);

                k0 += 4;
                k1 += 4;
                k2 += 4;
                k3 += 4;
                g0 += 16;
            }
            for (; p < num_input; p++)
            {
                g0[0] = float32_to_float16(*k0++);
                g0[1] = float32_to_float16(*k1++);
                g0[2] = float32_to_float16(*k2++);
                g0[3] = float32_to_float16(*k3++);
                g0 += 4;
            }
        }
    }

    if (out_elempack == 1)
    {
        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);
        ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt);
    }
}


================================================
FILE: src/layer/arm/innerproduct_gemm_fp16s.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Prototype of the "_asimdfhm" variant of the gemm kernel below, declared only
// when runtime cpu dispatch is enabled on aarch64 and this translation unit is
// built without __ARM_FEATURE_FP16_FML. innerproduct_gemm_fp16s_neon calls it
// when ncnn::cpu_support_arm_asimdfhm() reports support.
// NOTE(review): the definition is not visible in this header - confirm where
// it is compiled.
#if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
void innerproduct_gemm_fp16s_neon_asimdfhm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
#endif

// Prototype of the "_asimdhp" variant, declared only when this translation
// unit is built with neither fp16 vector arithmetic nor fp16fml; called when
// ncnn::cpu_support_arm_asimdhp() reports support.
#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
void innerproduct_gemm_fp16s_neon_asimdhp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt);
#endif

static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82FP16FML && __aarch64__ && !__ARM_FEATURE_FP16_FML
    if (ncnn::cpu_support_arm_asimdfhm())
    {
        innerproduct_gemm_fp16s_neon_asimdfhm(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
        return;
    }
#endif

#if NCNN_RUNTIME_CPU && NCNN_ARM82 && __aarch64__ && !__ARM_FEATURE_FP16_VECTOR_ARITHMETIC && !__ARM_FEATURE_FP16_FML
    if (ncnn::cpu_support_arm_asimdhp())
    {
        innerproduct_gemm_fp16s_neon_asimdhp(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt);
        return;
    }
#endif

    const int num_input = bottom_blob.w;
    const int elempack = bottom_blob.elempack;
    const int num_output = top_blob.w;
    const int h = bottom_blob.h;

    const float* bias_data_ptr = bias_data;

    int num_output_elempack = 1;
    if (opt.use_packing_layout)
    {
        num_output_elempack = num_output % 4 == 0 ? 4 : 1;
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int j = 0; j < h; j++)
    {
        if (elempack == 4 && num_output_elempack == 4)
        {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            __fp16* outptr = top_blob.row<__fp16>(j);
#else
            float* outptr = top_blob.row(j);
#endif

            for (int p = 0; p < num_output / num_output_elempack; p++)
            {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                const __fp16* m = bottom_blob.row<const __fp16>(j);
                const __fp16* kptr = weight_data_fp16.row<const __fp16>(p);
#else
                const float* m = bottom_blob.row(j);
                const unsigned short* kptr = weight_data_fp16.row<const unsigned short>(p);
#endif

                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
                float32x4_t _sum2 = vdupq_n_f32(0.f);
                float32x4_t _sum3 = vdupq_n_f32(0.f);

                if (bias_data_ptr)
                {
                    _sum0 = vdupq_n_f32(bias_data_ptr[p * 4 + 0]);
                    _sum1 = vdupq_n_f32(bias_data_ptr[p * 4 + 1]);
                    _sum2 = vdupq_n_f32(bias_data_ptr[p * 4 + 2]);
                    _sum3 = vdupq_n_f32(bias_data_ptr[p * 4 + 3]);
                }

                int i = 0;
                for (; i < num_input; i++)
                {
#if __ARM_FEATURE_FP16_FML
                    float16x4_t _val = vld1_f16(m);
                    float16x4_t _w = vld1_f16(kptr);
                    float16x8_t _valval = vcombine_f16(_val, _val);

                    _sum0 = vfmlalq_lane_low_f16(_sum0, _valval, _w, 0);
                    _sum1 = vfmlalq_lane_low_f16(_sum1, _valval, _w, 1);
                    _sum2 = vfmlalq_lane_low_f16(_sum2, _valval, _w, 2);
                    _sum3 = vfmlalq_lane_low_f16(_sum3, _valval, _w, 3);
#else // __ARM_FEATURE_FP16_FML
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                    float32x4_t _val = vcvt_f32_f16(vld1_f16(m));
                    float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
#else
                    float32x4_t _val = vld1q_f32(m);
                    float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
#endif

#if __aarch64__
                    _sum0 = vfmaq_laneq_f32(_sum0, _val, _w, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _val, _w, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _val, _w, 2);
                    _sum3 = vfmaq_laneq_f32(_sum3, _val, _w, 3);
#else
                    _sum0 = vmlaq_lane_f32(_sum0, _val, vget_low_f32(_w), 0);
                    _sum1 = vmlaq_lane_f32(_sum1, _val, vget_low_f32(_w), 1);
                    _sum2 = vmlaq_lane_f32(_sum2, _val, vget_high_f32(_w), 0);
                    _sum3 = vmlaq_lane_f32(_sum3, _val, vget_high_f32(_w), 1);
#endif
#endif // __ARM_FEATURE_FP16_FML

                    m += 4;
                    kptr += 4;
                }

                _sum0 = activation_ps(_sum0, activation_type, activation_params);
                _sum1 = activation_ps(_sum1, activation_type, activation_params);
                _sum2 = activation_ps(_sum2, activation_type, activation_params);
                _sum3 = activation_ps(_sum3, activation_type, activation_params);

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                vst1_f16(outptr, vcvt_f16_f32(_sum0));
                vst1_f16(outptr + 4, vcvt_f16_f32(_sum1));
                vst1_f16(outptr + 8, vcvt_f16_f32(_sum2));
                vst1_f16(outptr + 12, vcvt_f16_f32(_sum3));
#else
                vst1q_f32(outptr, _sum0);
                vst1q_f32(outptr + 4, _sum1);
                vst1q_f32(outptr + 8, _sum2);
                vst1q_f32(outptr + 12, _sum3);
#endif
                outptr += 16;
            }
        }

        if (elempack == 1 && num_output_elempack == 4)
        {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            __fp16* outptr = top_blob.row<__fp16>(j);
#else
            float* outptr = top_blob.row(j);
#endif

            for (int p = 0; p < num_output / num_output_elempack; p++)
            {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                const __fp16* m = bottom_blob.row<const __fp16>(j);
                const __fp16* kptr = weight_data_fp16.row<const __fp16>(p);
#else
                const float* m = bottom_blob.row(j);
                const unsigned short* kptr = weight_data_fp16.row<const unsigned short>(p);
#endif

                float32x4_t _sum0 = vdupq_n_f32(0.f);

                if (bias_data_ptr)
                {
                    _sum0 = vld1q_f32(bias_data_ptr + p * 4);
                }

                float32x4_t _sum1 = vdupq_n_f32(0.f);
                float32x4_t _sum2 = vdupq_n_f32(0.f);
                float32x4_t _sum3 = vdupq_n_f32(0.f);

                int i = 0;
                for (; i + 3 < num_input; i += 4)
                {
#if __ARM_FEATURE_FP16_FML
                    float16x4_t _val = vld1_f16(m);
                    float16x8_t _w01 = vld1q_f16(kptr);
                    float16x8_t _w23 = vld1q_f16(kptr + 8);

                    _sum0 = vfmlalq_lane_low_f16(_sum0, _w01, _val, 0);
                    _sum1 = vfmlalq_lane_high_f16(_sum1, _w01, _val, 1);
                    _sum2 = vfmlalq_lane_low_f16(_sum2, _w23, _val, 2);
                    _sum3 = vfmlalq_lane_high_f16(_sum3, _w23, _val, 3);
#else // __ARM_FEATURE_FP16_FML
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                    float32x4_t _val = vcvt_f32_f16(vld1_f16(m));
                    float16x8_t _w01 = vld1q_f16(kptr);
                    float16x8_t _w23 = vld1q_f16(kptr + 8);
                    float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w01));
                    float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w01));
                    float32x4_t _w2 = vcvt_f32_f16(vget_low_f16(_w23));
                    float32x4_t _w3 = vcvt_f32_f16(vget_high_f16(_w23));
#else
                    float32x4_t _val = vld1q_f32(m);
                    uint16x8_t _w01 = vld1q_u16(kptr);
                    uint16x8_t _w23 = vld1q_u16(kptr + 8);
                    float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01)));
                    float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01)));
                    float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w23)));
                    float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w23)));
#endif

#if __aarch64__
                    _sum0 = vfmaq_laneq_f32(_sum0, _w0, _val, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _w1, _val, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _w2, _val, 2);
                    _sum3 = vfmaq_laneq_f32(_sum3, _w3, _val, 3);
#else
                    _sum0 = vmlaq_lane_f32(_sum0, _w0, vget_low_f32(_val), 0);
                    _sum1 = vmlaq_lane_f32(_sum1, _w1, vget_low_f32(_val), 1);
                    _sum2 = vmlaq_lane_f32(_sum2, _w2, vget_high_f32(_val), 0);
                    _sum3 = vmlaq_lane_f32(_sum3, _w3, vget_high_f32(_val), 1);
#endif
#endif // __ARM_FEATURE_FP16_FML

                    m += 4;
                    kptr += 16;
                }
                for (; i < num_input; i++)
                {
                    float32x4_t _val = vdupq_n_f32((float)m[0]);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                    float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
#else
                    float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
#endif
                    _sum0 = vfmaq_f32(_sum0, _val, _w);

                    m += 1;
                    kptr += 4;
                }

                _sum0 = vaddq_f32(_sum0, _sum1);
                _sum2 = vaddq_f32(_sum2, _sum3);
                _sum0 = vaddq_f32(_sum0, _sum2);

                _sum0 = activation_ps(_sum0, activation_type, activation_params);

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                vst1_f16(outptr, vcvt_f16_f32(_sum0));
#else
                vst1q_f32(outptr, _sum0);
#endif
                outptr += 4;
            }
        }

        if (elempack == 4 && num_output_elempack == 1)
        {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            __fp16* outptr = top_blob.row<__fp16>(j);
#else
            float* outptr = top_blob.row(j);
#endif

            for (int p = 0; p < num_output; p++)
            {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                const __fp16* m = bottom_blob.row<const __fp16>(j);
                const __fp16* kptr = weight_data_fp16.row<const __fp16>(p);
#else
                const float* m = bottom_blob.row(j);
                const unsigned short* kptr = weight_data_fp16.row<const unsigned short>(p);
#endif

                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
                float32x4_t _sum2 = vdupq_n_f32(0.f);
                float32x4_t _sum3 = vdupq_n_f32(0.f);

                if (bias_data_ptr)
                {
                    _sum0 = vdupq_n_f32(bias_data_ptr[p]);
                }

                int i = 0;
                for (; i + 3 < num_input; i += 4)
                {
#if __ARM_FEATURE_FP16_FML
                    float16x8_t _val01 = vld1q_f16(m);
                    float16x8_t _val23 = vld1q_f16(m + 8);
                    float16x4_t _w = vld1_f16(kptr);

                    _sum0 = vfmlalq_lane_low_f16(_sum0, _val01, _w, 0);
                    _sum1 = vfmlalq_lane_high_f16(_sum1, _val01, _w, 1);
                    _sum2 = vfmlalq_lane_low_f16(_sum2, _val23, _w, 2);
                    _sum3 = vfmlalq_lane_high_f16(_sum3, _val23, _w, 3);
#else // __ARM_FEATURE_FP16_FML
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                    float32x4_t _val0 = vcvt_f32_f16(vld1_f16(m));
                    float32x4_t _val1 = vcvt_f32_f16(vld1_f16(m + 4));
                    float32x4_t _val2 = vcvt_f32_f16(vld1_f16(m + 8));
                    float32x4_t _val3 = vcvt_f32_f16(vld1_f16(m + 12));
                    float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
#else
                    float32x4_t _val0 = vld1q_f32(m);
                    float32x4_t _val1 = vld1q_f32(m + 4);
                    float32x4_t _val2 = vld1q_f32(m + 8);
                    float32x4_t _val3 = vld1q_f32(m + 12);
                    float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
#endif

#if __aarch64__
                    _sum0 = vfmaq_laneq_f32(_sum0, _val0, _w, 0);
                    _sum1 = vfmaq_laneq_f32(_sum1, _val1, _w, 1);
                    _sum2 = vfmaq_laneq_f32(_sum2, _val2, _w, 2);
                    _sum3 = vfmaq_laneq_f32(_sum3, _val3, _w, 3);
#else
                    _sum0 = vmlaq_lane_f32(_sum0, _val0, vget_low_f32(_w), 0);
                    _sum1 = vmlaq_lane_f32(_sum1, _val1, vget_low_f32(_w), 1);
                    _sum2 = vmlaq_lane_f32(_sum2, _val2, vget_high_f32(_w), 0);
                    _sum3 = vmlaq_lane_f32(_sum3, _val3, vget_high_f32(_w), 1);
#endif
#endif // __ARM_FEATURE_FP16_FML

                    m += 16;
                    kptr += 4;
                }
                for (; i < num_input; i++)
                {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                    float32x4_t _val = vcvt_f32_f16(vld1_f16(m));
                    float32x4_t _k = vdupq_n_f32((float)(kptr[0]));
#else
                    float32x4_t _val = vld1q_f32(m);
                    float32x4_t _k = vdupq_n_f32(float16_to_float32(kptr[0]));
#endif
                    _sum0 = vfmaq_f32(_sum0, _val, _k);

                    m += 4;
                    kptr += 1;
                }

                _sum0 = vaddq_f32(_sum0, _sum1);
                _sum2 = vaddq_f32(_sum2, _sum3);
                _sum0 = vaddq_f32(_sum0, _sum2);

                _sum0 = activation_ps(_sum0, activation_type, activation_params);

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                vst1_f16(outptr, vcvt_f16_f32(_sum0));
#else
                vst1q_f32(outptr, _sum0);
#endif
                outptr += 4;
            }
        }

        if (elempack == 1 && num_output_elempack == 1)
        {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            __fp16* outptr = top_blob.row<__fp16>(j);
#else
            float* outptr = top_blob.row(j);
#endif

            for (int p = 0; p < num_output; p++)
            {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                const __fp16* m = bottom_blob.row<const __fp16>(j);
                const __fp16* kptr = weight_data_fp16.row<const __fp16>(p);
#else
                const float* m = bottom_blob.row(j);
                const unsigned short* kptr = weight_data_fp16.row<const unsigned short>(p);
#endif

                float sum = 0.f;

                if (bias_data_ptr)
                {
                    sum = bias_data_ptr[p];
                }

                int i = 0;
                float32x4_t _sum0 = vdupq_n_f32(0.f);
                float32x4_t _sum1 = vdupq_n_f32(0.f);
                for (; i + 7 < num_input; i += 8)
                {
#if __ARM_FEATURE_FP16_FML
                    float16x8_t _val01 = vld1q_f16(m);
                    float16x8_t _w01 = vld1q_f16(kptr);

                    _sum0 = vfmlalq_low_f16(_sum0, _val01, _w01);
                    _sum1 = vfmlalq_high_f16(_sum1, _val01, _w01);
#else // __ARM_FEATURE_FP16_FML
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                    float16x8_t _val01 = vld1q_f16(m);
                    float16x8_t _w01 = vld1q_f16(kptr);
                    float32x4_t _val0 = vcvt_f32_f16(vget_low_f16(_val01));
                    float32x4_t _val1 = vcvt_f32_f16(vget_high_f16(_val01));
                    float32x4_t _w0 = vcvt_f32_f16(vget_low_f16(_w01));
                    float32x4_t _w1 = vcvt_f32_f16(vget_high_f16(_w01));
#else
                    float32x4_t _val0 = vld1q_f32(m);
                    float32x4_t _val1 = vld1q_f32(m + 4);
                    uint16x8_t _w01 = vld1q_u16(kptr);
                    float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01)));
                    float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01)));
#endif

                    _sum0 = vfmaq_f32(_sum0, _val0, _w0);
                    _sum1 = vfmaq_f32(_sum1, _val1, _w1);
#endif // __ARM_FEATURE_FP16_FML

                    m += 8;
                    kptr += 8;
                }
                _sum0 = vaddq_f32(_sum0, _sum1);
                for (; i + 3 < num_input; i += 4)
                {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                    float32x4_t _val = vcvt_f32_f16(vld1_f16(m));
                    float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr));
#else
                    float32x4_t _val = vld1q_f32(m);
                    float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr)));
#endif

                    _sum0 = vfmaq_f32(_sum0, _val, _w);

                    m += 4;
                    kptr += 4;
                }
#if __aarch64__
                sum += vaddvq_f32(_sum0);
#else
                float32x2_t _ss = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
                _ss = vpadd_f32(_ss, _ss);
                sum += vget_lane_f32(_ss, 0);
#endif
                for (; i < num_input; i++)
                {
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                    sum += (float)(*m++) * (float)(*kptr++);
#else
                    sum += *m++ * float16_to_float32(*kptr++);
#endif
                }

                sum = activation_ss(sum, activation_type, activation_params);

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                outptr[0] = (__fp16)sum;
#else
                outptr[0] = sum;
#endif
                outptr += 1;
            }
        }
    }
}


================================================
FILE: src/layer/arm/instancenorm_arm.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "instancenorm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

// Publishes which storage formats / layouts this ARM implementation accepts.
// Every flag is gated by the build configuration it depends on.
InstanceNorm_arm::InstanceNorm_arm()
{
#if NCNN_BF16
    // bf16 storage is accepted whenever bf16 support is compiled in
    support_bf16_storage = true;
#endif // NCNN_BF16

#if __ARM_NEON
    // packed layouts are only enabled in NEON builds
    support_packing = true;
#if NCNN_ARM82
    // fp16 storage follows whatever cpu_support_arm_asimdhp() reports for this cpu
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // NCNN_ARM82
#endif // __ARM_NEON
}

// Instance normalization applied in place on bottom_top_blob.
//
// 16-bit blobs are handed over to forward_inplace_fp16s / forward_inplace_bf16s when
// those variants are compiled in and the matching storage option is set in opt;
// everything else is processed here as fp32.
//
// For every channel q the w * h values are rewritten as x * a + b, with
//   affine:     a = gamma[q] / sqrt(var + eps),  b = beta[q] - mean * a
//   otherwise:  a = 1 / sqrt(var + eps),         b = -mean * a
// mean and var are computed with two passes over the channel (sum first, then the
// sum of squared deviations), so var cannot become negative through cancellation.
//
// The fp32 path always returns 0.
int InstanceNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int elembits = bottom_top_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        return forward_inplace_fp16s(bottom_top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
    {
        return forward_inplace_bf16s(bottom_top_blob, opt);
    }
#endif

    const int channels = bottom_top_blob.c;
    const int elempack = bottom_top_blob.elempack;
    const int size = bottom_top_blob.w * bottom_top_blob.h;

#if __ARM_NEON
    if (elempack == 4)
    {
        // pack4: each vector holds four independent statistics, one per lane
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* dst = bottom_top_blob.channel(q);
            const float* src = dst;

            const float32x4_t _inv_size = vdupq_n_f32(1.f / size);

            // pass 1: lane-wise sum -> mean
            float32x4_t _acc = vdupq_n_f32(0.f);
            for (int i = 0; i < size; i++)
            {
                _acc = vaddq_f32(_acc, vld1q_f32(src));
                src += 4;
            }
            const float32x4_t _mu = vmulq_f32(_acc, _inv_size);

            // pass 2: lane-wise sum of squared deviations from the mean
            float32x4_t _sq_acc = vdupq_n_f32(0.f);
            src = dst;
            for (int i = 0; i < size; i++)
            {
                const float32x4_t _diff = vsubq_f32(vld1q_f32(src), _mu);
                _sq_acc = vmlaq_f32(_sq_acc, _diff, _diff);
                src += 4;
            }

            // var + eps, evaluated as eps + sqsum * (1 / size)
            const float32x4_t _var_plus_eps = vmlaq_f32(vdupq_n_f32(eps), _sq_acc, _inv_size);

            // 1 / sqrt(var + eps): vrsqrte estimate refined by exactly one vrsqrts step
            const float32x4_t _est = vrsqrteq_f32(_var_plus_eps);
            const float32x4_t _inv_std = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var_plus_eps, _est), _est), _est);

            float32x4_t _scale;
            float32x4_t _shift;
            if (affine)
            {
                const float32x4_t _gamma = vld1q_f32((const float*)gamma_data + q * 4);
                const float32x4_t _beta = vld1q_f32((const float*)beta_data + q * 4);

                _scale = vmulq_f32(_gamma, _inv_std);
                // beta - mean * scale
                _shift = vmlsq_f32(_beta, _mu, _scale);
            }
            else
            {
                _scale = _inv_std;
                _shift = vnegq_f32(vmulq_f32(_mu, _scale));
            }

            // pass 3: x = x * scale + shift
            for (int i = 0; i < size; i++)
            {
                vst1q_f32(dst, vmlaq_f32(_shift, vld1q_f32(dst), _scale));
                dst += 4;
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    // any other layout: one scalar statistic per channel, vectorized over the spatial axis
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* dst = bottom_top_blob.channel(q);
        const float* src = dst;

        // ---- mean ----
        float sum = 0.f;
        int i = 0;
#if __ARM_NEON
        float32x4_t _acc = vdupq_n_f32(0.f);
        for (; i + 3 < size; i += 4)
        {
            _acc = vaddq_f32(_acc, vld1q_f32(src));
            src += 4;
        }
        // horizontal add of the four partial sums
#if __aarch64__
        sum = vaddvq_f32(_acc);
#else
        float32x2_t _acc_pair = vadd_f32(vget_low_f32(_acc), vget_high_f32(_acc));
        _acc_pair = vpadd_f32(_acc_pair, _acc_pair);
        sum = vget_lane_f32(_acc_pair, 0);
#endif
#endif // __ARM_NEON
        // leftover elements (or the whole channel when NEON is unavailable)
        for (; i < size; i++)
        {
            sum += *src++;
        }
        const float mean = sum / size;

        // ---- variance ----
        float sqsum = 0.f;
        src = dst;
        i = 0;
#if __ARM_NEON
        float32x4_t _sq_acc = vdupq_n_f32(0.f);
        const float32x4_t _mu = vdupq_n_f32(mean);
        for (; i + 3 < size; i += 4)
        {
            const float32x4_t _diff = vsubq_f32(vld1q_f32(src), _mu);
            _sq_acc = vmlaq_f32(_sq_acc, _diff, _diff);
            src += 4;
        }
#if __aarch64__
        sqsum = vaddvq_f32(_sq_acc);
#else
        float32x2_t _sq_pair = vadd_f32(vget_low_f32(_sq_acc), vget_high_f32(_sq_acc));
        _sq_pair = vpadd_f32(_sq_pair, _sq_pair);
        sqsum = vget_lane_f32(_sq_pair, 0);
#endif
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            const float diff = *src++ - mean;
            sqsum += diff * diff;
        }
        const float var = sqsum / size;

        // ---- fold normalization and affine transform into x * a + b ----
        float a;
        float b;
        if (affine)
        {
            const float gamma = gamma_data[q];
            const float beta = beta_data[q];

            a = (float)(gamma / (sqrtf(var + eps)));
            b = (float)(-mean * a + beta);
        }
        else
        {
            a = (float)(1.f / (sqrtf(var + eps)));
            b = (float)(-mean * a);
        }

        // ---- apply ----
        i = 0;
#if __ARM_NEON
        const float32x4_t _scale = vdupq_n_f32(a);
        const float32x4_t _shift = vdupq_n_f32(b);
        for (; i + 3 < size; i += 4)
        {
            vst1q_f32(dst, vmlaq_f32(_shift, vld1q_f32(dst), _scale));
            dst += 4;
        }
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            *dst = *dst * a + b;
            dst++;
        }
    }

    return 0;
}

#if NCNN_BF16
// Instance normalization applied in place on a blob stored as 16-bit values
// (read and written through unsigned short pointers).
//
// Elements are widened with bfloat2float / bfloat16_to_float32 on load, all
// statistics and the final x * a + b are evaluated in fp32, and results are narrowed
// with float2bfloat / float32_to_bfloat16 on store. Per channel q:
//   affine:     a = gamma[q] / sqrt(var + eps),  b = beta[q] - mean * a
//   otherwise:  a = 1 / sqrt(var + eps),         b = -mean * a
// mean and var are taken over the w * h values of the channel using two passes.
//
// Always returns 0.
int InstanceNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int elempack = bottom_top_blob.elempack;
    const int size = bottom_top_blob.w * bottom_top_blob.h;

#if __ARM_NEON
    if (elempack == 4)
    {
        // pack4: each vector holds four independent statistics, one per lane
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned short* dst = bottom_top_blob.channel(q);
            const unsigned short* src = dst;

            const float32x4_t _inv_size = vdupq_n_f32(1.f / size);

            // pass 1: lane-wise sum -> mean
            float32x4_t _acc = vdupq_n_f32(0.f);
            for (int i = 0; i < size; i++)
            {
                _acc = vaddq_f32(_acc, bfloat2float(vld1_u16(src)));
                src += 4;
            }
            const float32x4_t _mu = vmulq_f32(_acc, _inv_size);

            // pass 2: lane-wise sum of squared deviations from the mean
            float32x4_t _sq_acc = vdupq_n_f32(0.f);
            src = dst;
            for (int i = 0; i < size; i++)
            {
                const float32x4_t _diff = vsubq_f32(bfloat2float(vld1_u16(src)), _mu);
                _sq_acc = vmlaq_f32(_sq_acc, _diff, _diff);
                src += 4;
            }

            // var + eps, evaluated as eps + sqsum * (1 / size)
            const float32x4_t _var_plus_eps = vmlaq_f32(vdupq_n_f32(eps), _sq_acc, _inv_size);

            // 1 / sqrt(var + eps): vrsqrte estimate refined by exactly one vrsqrts step
            const float32x4_t _est = vrsqrteq_f32(_var_plus_eps);
            const float32x4_t _inv_std = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var_plus_eps, _est), _est), _est);

            float32x4_t _scale;
            float32x4_t _shift;
            if (affine)
            {
                const float32x4_t _gamma = vld1q_f32((const float*)gamma_data + q * 4);
                const float32x4_t _beta = vld1q_f32((const float*)beta_data + q * 4);

                _scale = vmulq_f32(_gamma, _inv_std);
                // beta - mean * scale
                _shift = vmlsq_f32(_beta, _mu, _scale);
            }
            else
            {
                _scale = _inv_std;
                _shift = vnegq_f32(vmulq_f32(_mu, _scale));
            }

            // pass 3: x = x * scale + shift, narrowed back on store
            for (int i = 0; i < size; i++)
            {
                const float32x4_t _res = vmlaq_f32(_shift, bfloat2float(vld1_u16(dst)), _scale);
                vst1_u16(dst, float2bfloat(_res));
                dst += 4;
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    // any other layout: one scalar statistic per channel, vectorized over the spatial axis
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        unsigned short* dst = bottom_top_blob.channel(q);
        const unsigned short* src = dst;

        // ---- mean ----
        float sum = 0.f;
        int i = 0;
#if __ARM_NEON
        float32x4_t _acc = vdupq_n_f32(0.f);
        for (; i + 3 < size; i += 4)
        {
            _acc = vaddq_f32(_acc, bfloat2float(vld1_u16(src)));
            src += 4;
        }
        // horizontal add of the four partial sums
#if __aarch64__
        sum = vaddvq_f32(_acc);
#else
        float32x2_t _acc_pair = vadd_f32(vget_low_f32(_acc), vget_high_f32(_acc));
        _acc_pair = vpadd_f32(_acc_pair, _acc_pair);
        sum = vget_lane_f32(_acc_pair, 0);
#endif
#endif // __ARM_NEON
        // leftover elements (or the whole channel when NEON is unavailable)
        for (; i < size; i++)
        {
            sum += bfloat16_to_float32(*src++);
        }
        const float mean = sum / size;

        // ---- variance ----
        float sqsum = 0.f;
        src = dst;
        i = 0;
#if __ARM_NEON
        float32x4_t _sq_acc = vdupq_n_f32(0.f);
        const float32x4_t _mu = vdupq_n_f32(mean);
        for (; i + 3 < size; i += 4)
        {
            const float32x4_t _diff = vsubq_f32(bfloat2float(vld1_u16(src)), _mu);
            _sq_acc = vmlaq_f32(_sq_acc, _diff, _diff);
            src += 4;
        }
#if __aarch64__
        sqsum = vaddvq_f32(_sq_acc);
#else
        float32x2_t _sq_pair = vadd_f32(vget_low_f32(_sq_acc), vget_high_f32(_sq_acc));
        _sq_pair = vpadd_f32(_sq_pair, _sq_pair);
        sqsum = vget_lane_f32(_sq_pair, 0);
#endif
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            const float diff = bfloat16_to_float32(*src++) - mean;
            sqsum += diff * diff;
        }
        const float var = sqsum / size;

        // ---- fold normalization and affine transform into x * a + b ----
        float a;
        float b;
        if (affine)
        {
            const float gamma = gamma_data[q];
            const float beta = beta_data[q];

            a = (float)(gamma / (sqrtf(var + eps)));
            b = (float)(-mean * a + beta);
        }
        else
        {
            a = (float)(1.f / (sqrtf(var + eps)));
            b = (float)(-mean * a);
        }

        // ---- apply ----
        i = 0;
#if __ARM_NEON
        const float32x4_t _scale = vdupq_n_f32(a);
        const float32x4_t _shift = vdupq_n_f32(b);
        for (; i + 3 < size; i += 4)
        {
            const float32x4_t _res = vmlaq_f32(_shift, bfloat2float(vld1_u16(dst)), _scale);
            vst1_u16(dst, float2bfloat(_res));
            dst += 4;
        }
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            *dst = float32_to_bfloat16(bfloat16_to_float32(*dst) * a + b);
            dst++;
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/instancenorm_arm.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_INSTANCENORM_ARM_H
#define LAYER_INSTANCENORM_ARM_H

#include "instancenorm.h"

namespace ncnn {

// ARM-specific InstanceNorm layer. Provides the virtual forward_inplace entry point
// plus the reduced-precision storage variants, each of which is only declared when
// the corresponding build option is enabled.
class InstanceNorm_arm : public InstanceNorm
{
public:
    // Sets the layer capability flags according to the build configuration.
    InstanceNorm_arm();

    // Normalizes bottom_top_blob in place; opt supplies the thread count and the
    // fp16/bf16 storage switches. Returns an int status code.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16-storage variant; declared only when NCNN_ARM82 is enabled.
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16-storage variant; declared only when NCNN_BF16 is enabled.
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_INSTANCENORM_ARM_H


================================================
FILE: src/layer/arm/instancenorm_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "instancenorm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int InstanceNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;
    int elempack = bottom_top_blob.elempack;

    if (elempack == 8)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* ptr0 = bottom_top_blob.channel(q);

            float32x4_t _div_size = vdupq_n_f32(1.f / size);
            float32x4_t _eps = vdupq_n_f32(eps);

            // mean and var
            float32x4_t _sum0 = vdupq_n_f32(0.f);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sqsum0 = vdupq_n_f32(0.f);
            float32x4_t _sqsum1 = vdupq_n_f32(0.f);
            const __fp16* ptr = ptr0;
            for (int i = 0; i < size; i++)
            {
                float16x8_t _p = vld1q_f16(ptr);
                float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p));
                float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p));
                _sum0 = vaddq_f32(_sum0, _p0);
                _sum1 = vaddq_f32(_sum1, _p1);
                ptr += 8;
                //sqsum += ptr[i] * ptr[i];
            }
            float32x4_t _mean0 = vmulq_f32(_sum0, _div_size);
            float32x4_t _mean1 = vmulq_f32(_sum1, _div_size);
            ptr = ptr0;
            for (int i = 0; i < size; i++)
            {
                float16x8_t _p = vld1q_f16(ptr);
                float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p));
                float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p));
                float32x4_t _tmp0 = vsubq_f32(_p0, _mean0);
                float32x4_t _tmp1 = vsubq_f32(_p1, _mean1);
                _sqsum0 = vfmaq_f32(_sqsum0, _tmp0, _tmp0);
                _sqsum1 = vfmaq_f32(_sqsum1, _tmp1, _tmp1);
                ptr += 8;
            }
            float32x4_t _var_eps0 = vfmaq_f32(_eps, _sqsum0, _div_size);
            float32x4_t _var_eps1 = vfmaq_f32(_eps, _sqsum1, _div_size);
            // the var maybe minus due to accuracy
            //float var = sqsum / size - mean * mean;

            float32x4_t _reciprocal0 = vrsqrteq_f32(_var_eps0);
            float32x4_t _reciprocal1 = vrsqrteq_f32(_var_eps1);
            _reciprocal0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var_eps0, _reciprocal0), _reciprocal0), _reciprocal0);
            _reciprocal1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var_eps1, _reciprocal1), _reciprocal1), _reciprocal1);
            // _reciprocal0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var_eps0, _reciprocal0), _reciprocal0), _reciprocal0);
            // _reciprocal1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var_eps1, _reciprocal1), _reciprocal1), _reciprocal1);

            float16x8_t _a;
            float16x8_t _b;
            if (affine)
            {
                float32x4_t _gamma0 = vld1q_f32((const float*)gamma_data + q * 8);
                float32x4_t _gamma1 = vld1q_f32((const float*)gamma_data + q * 8 + 4);
                float32x4_t _beta0 = vld1q_f32((const float*)beta_data + q * 8);
                float32x4_t _beta1 = vld1q_f32((const float*)beta_data + q * 8 + 4);

                float32x4_t _a320 = vmulq_f32(_gamma0, _reciprocal0);
                float32x4_t _a321 = vmulq_f32(_gamma1, _reciprocal1);
                float16x4_t _a0 = vcvt_f16_f32(_a320);
                float16x4_t _a1 = vcvt_f16_f32(_a321);
                float16x4_t _b0 = vcvt_f16_f32(vmlsq_f32(_beta0, _mean0, _a320));
                float16x4_t _b1 = vcvt_f16_f32(vmlsq_f32(_beta1, _mean1, _a321));

                _a = vcombine_f16(_a0, _a1);
                _b = vcombine_f16(_b0, _b1);
            }
            else
            {
                float16x4_t _a0 = vcvt_f16_f32(_reciprocal0);
                float16x4_t _a1 = vcvt_f16_f32(_reciprocal1);
                float16x4_t _b0 = vcvt_f16_f32(vnegq_f32(vmulq_f32(_mean0, _reciprocal0)));
                float16x4_t _b1 = vcvt_f16_f32(vnegq_f32(vmulq_f32(_mean1, _reciprocal1)));

                _a = vcombine_f16(_a0, _a1);
                _b = vcombine_f16(_b0, _b1);
            }

            for (int i = 0; i < size; i++)
            {
                float16x8_t _p = vld1q_f16(ptr0);
                _p = vfmaq_f16(_b, _p, _a);
                vst1q_f16(ptr0, _p);
                ptr0 += 8;
            }
        }

        return 0;
    }

    if (elempack == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* ptr0 = bottom_top_blob.channel(q);

            float32x4_t _div_size = vdupq_n_f32(1.f / size);

            // mean and var
            float32x4_t _sum = vdupq_n_f32(0.f);
            float32x4_t _sqsum = vdupq_n_f32(0.f);
            const __fp16* ptr = ptr0;
            for (int i = 0; i < size; i++)
            {
                float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
                _sum = vaddq_f32(_sum, _p);
                ptr += 4;
                //sqsum += ptr[i] * ptr[i];
            }
            float32x4_t _mean = vmulq_f32(_sum, _div_size);
            ptr = ptr0;
            for (int i = 0; i < size; i++)
            {
                float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
                float32x4_t _tmp = vsubq_f32(_p, _mean);
                _sqsum = vfmaq_f32(_sqsum, _tmp, _tmp);
                ptr += 4;
            }
            float32x4_t _var_eps = vfmaq_f32(vdupq_n_f32(eps), _sqsum, _div_size);
            // the var maybe minus due to accuracy
            //float var = sqsum / size - mean * mean;

            float32x4_t _reciprocal = vrsqrteq_f32(_var_eps);
            _reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var_eps, _reciprocal), _reciprocal), _reciprocal);
            // _reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var_eps, _reciprocal), _reciprocal), _reciprocal);

            float16x4_t _a;
            float16x4_t _b;
            if (affine)
            {
                float32x4_t _gamma = vld1q_f32((const float*)gamma_data + q * 4);
                float32x4_t _beta = vld1q_f32((const float*)beta_data + q * 4);

                float32x4_t _a32 = vmulq_f32(_gamma, _reciprocal);
                _a = vcvt_f16_f32(_a32);
                _b = vcvt_f16_f32(vmlsq_f32(_beta, _mean, _a32));
            }
            else
            {
                _a = vcvt_f16_f32(_reciprocal);
                _b = vcvt_f16_f32(vnegq_f32(vmulq_f32(_mean, _reciprocal)));
            }

            for (int i = 0; i < size; i++)
            {
                float16x4_t _p = vld1_f16(ptr0);
                _p = vfma_f16(_b, _p, _a);
                vst1_f16(ptr0, _p);
                ptr0 += 4;
            }
        }

        return 0;
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr0 = bottom_top_blob.channel(q);

        // mean and var
        float sum = 0.f;
        float sqsum = 0.f;
        const __fp16* ptr = ptr0;
        int i = 0;
#if __ARM_NEON
        float32x4_t _sum = vdupq_n_f32(0.f);
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
            _sum = vaddq_f32(_sum, _p);
            ptr += 4;
        }
        sum = vaddvq_f32(_sum);
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            sum += *ptr++;
            //sqsum += ptr[i] * ptr[i];
        }
        float mean = sum / size;
        ptr = ptr0;
        i = 0;
#if __ARM_NEON
        float32x4_t _sqsum = vdupq_n_f32(0.f);
        float32x4_t _mean = vdupq_n_f32(mean);
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
            float32x4_t _tmp = vsubq_f32(_p, _mean);
            _sqsum = vmlaq_f32(_sqsum, _tmp, _tmp);
            ptr += 4;
        }
        sqsum = vaddvq_f32(_sqsum);
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            float tmp = (float)*ptr - mean;
            sqsum += tmp * tmp;
            ptr++;
        }
        float var = sqsum / size;
        // the var may be negative due to accuracy loss when computed with the formula below
        //float var = sqsum / size - mean * mean;

        __fp16 a;
        __fp16 b;
        if (affine)
        {
            float gamma = gamma_data[q];
            float beta = beta_data[q];

            float a_fp32 = gamma / (sqrtf(var + eps));
            a = (__fp16)(a_fp32);
            b = (__fp16)(-mean * a_fp32 + beta);
        }
        else
        {
            float a_fp32 = 1.f / (sqrtf(var + eps));
            a = (__fp16)(a_fp32);
            b = (__fp16)(-mean * a_fp32);
        }

        i = 0;
#if __ARM_NEON
        float16x8_t _a = vdupq_n_f16(a);
        float16x8_t _b = vdupq_n_f16(b);
        for (; i + 7 < size; i += 8)
        {
            float16x8_t _p = vld1q_f16(ptr0);
            _p = vfmaq_f16(_b, _p, _a);
            vst1q_f16(ptr0, _p);
            ptr0 += 8;
        }
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            *ptr0 = *ptr0 * a + b;
            ptr0++;
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/interp_arm.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "interp_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

#include "interp_bicubic.h"
#include "interp_bilinear.h"

#if NCNN_BF16
#include "interp_bicubic_bf16s.h"
#include "interp_bilinear_bf16s.h"
#endif

#if __ARM_NEON
#include "interp_bicubic_pack4.h"
#include "interp_bilinear_pack4.h"
#if NCNN_BF16
#include "interp_bicubic_pack4_bf16s.h"
#include "interp_bilinear_pack4_bf16s.h"
#endif
#endif

// Set the capability flags of the ARM Interp implementation according to the
// features enabled at build time.
Interp_arm::Interp_arm()
{
#if NCNN_BF16
    // bf16 storage is advertised whether or not NEON is enabled
    support_bf16_storage = true;
#endif // NCNN_BF16

#if __ARM_NEON
    // packed layouts are only advertised in NEON builds
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage follows whatever cpu_support_arm_asimdhp() reports
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82
}

// Resize bottom_blobs[0] to a target size and store the result in top_blobs[0].
//
// The target width/height are read from bottom_blobs[1] (only its w and h are
// used here) and are overridden through eval_size_expr() when size_expr is not
// empty. For 2-D and 3-D inputs resize_type selects the kernel:
// 1 = nearest, 2 = bilinear, 3 = bicubic; no other value is handled, so the
// created output is left unwritten in that case.
//
// Blobs with 16-bit elements are forwarded to forward_fp16s / forward_fp16sa or
// forward_bf16s depending on build flags and opt; the remainder of this
// function treats the data as float.
//
// Returns 0 on success, -100 when the output blob could not be allocated, or
// whatever the fp16 / bf16 implementation returns when dispatched.
int Interp_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    // only w and h of the second blob are read below; it supplies the default target size
    const Mat& reference_blob = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    // 16-bit elements with fp16 storage enabled: hand over to the fp16 implementations
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blobs, top_blobs, opt);
        else
            return forward_fp16s(bottom_blobs, top_blobs, opt);
    }
#endif

#if NCNN_BF16
    // 16-bit elements with bf16 storage enabled: hand over to the bf16 implementation
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blobs, top_blobs, opt);
#endif

    int h = bottom_blob.h;
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = reference_blob.w;
    int outh = reference_blob.h;

    if (!size_expr.empty())
    {
        // a size expression is present: let it compute outw/outh from the shapes of all inputs
        std::vector<Mat> bottom_blob_shapes(bottom_blobs.size());
        for (size_t i = 0; i < bottom_blobs.size(); i++)
        {
            bottom_blob_shapes[i] = bottom_blobs[i].shape();
        }
        eval_size_expr(bottom_blob_shapes, outw, outh);
    }

    if (dims == 1)
    {
        // 1-D input: each of the w input elements becomes one output channel of
        // size outw x outh filled with that element; resize_type is not consulted
        top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __ARM_NEON
        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < w; q++)
            {
                Mat top_blob_c = top_blob.channel(q);
                // load the 4 floats of pack q and fill the whole channel with that vector
                float32x4_t _v = vld1q_f32((const float*)bottom_blob + q * 4);
                top_blob_c.fill(_v);
            }

            return 0;
        }
#endif // __ARM_NEON

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < w; q++)
        {
            Mat top_blob_c = top_blob.channel(q);
            const float v = bottom_blob[q];
            top_blob_c.fill(v);
        }

        return 0;
    }

    if (dims == 2)
    {
        // 2-D input: only the width is resized, the h rows are kept as they are
        if (outw == w)
        {
            // same width: hand the input through as the output
            top_blob = bottom_blob;
            return 0;
        }

        top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __ARM_NEON
        if (elempack == 4)
        {
            // packed rows: every pixel is a group of 4 floats, so source offsets are scaled by 4
            if (resize_type == 1) // nearest
            {
                // source step per output pixel: taken from the sizes when an explicit
                // output width or a size expression is in use, otherwise from width_scale
                const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const float* ptr = bottom_blob.row(y);
                    float* outptr = top_blob.row(y);
                    for (int x = 0; x < outw; x++)
                    {
                        // truncate to a source column and clamp to the last valid one
                        int in_x = std::min((int)(x * ws), (w - 1));

                        float32x4_t _p = vld1q_f32(ptr + in_x * 4);
                        vst1q_f32(outptr, _p);

                        outptr += 4;
                    }
                }
            }

            if (resize_type == 2) // bilinear
            {
                // one allocation: outw ints for xofs followed by outw * 2 slots used as float weights
                // NOTE(review): the weights live in int-typed storage reinterpreted as float;
                // this assumes sizeof(float) == sizeof(int)
                int* buf = new int[outw + outw * 2];

                int* xofs = buf;
                float* alpha = (float*)(buf + outw);

                // linear_coeffs is defined outside this block; it is expected to fill xofs and alpha
                linear_coeffs(w, outw, xofs, alpha, align_corner);

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const float* ptr = bottom_blob.row(y);
                    float* outptr = top_blob.row(y);
                    // every row restarts at the beginning of the shared weight table
                    const float* alphap = alpha;

                    for (int x = 0; x < outw; x++)
                    {
                        int sx = xofs[x] * 4;
                        const float* Sp = ptr + sx;

                        float32x2_t _a01 = vld1_f32(alphap);

                        // weighted sum of the pixel at sx and its right neighbour
                        float32x4_t _S0 = vld1q_f32(Sp);
                        float32x4_t _S1 = vld1q_f32(Sp + 4);
                        float32x4_t _p = vmulq_lane_f32(_S0, _a01, 0);
                        _p = vmlaq_lane_f32(_p, _S1, _a01, 1);
                        vst1q_f32(outptr, _p);

                        alphap += 2;
                        outptr += 4;
                    }
                }

                delete[] buf;
            }

            if (resize_type == 3) // bicubic
            {
                // one allocation: outw ints for xofs followed by outw * 4 slots used as float weights
                int* buf = new int[outw + outw * 4];

                int* xofs = buf;
                float* alpha = (float*)(buf + outw);

                // cubic_coeffs is defined outside this block; it is expected to fill xofs and alpha
                cubic_coeffs(w, outw, xofs, alpha, align_corner);

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const float* ptr = bottom_blob.row(y);
                    float* outptr = top_blob.row(y);
                    const float* alphap = alpha;

                    for (int x = 0; x < outw; x++)
                    {
                        int sx = xofs[x] * 4;
                        const float* Sp = ptr + sx;

                        float32x4_t _a0123 = vld1q_f32(alphap);

                        // 4 taps: the pixel before sx, sx itself and the two after it
                        // NOTE(review): relies on cubic_coeffs producing offsets that keep
                        // these reads inside the row - confirm
                        float32x4_t _S0 = vld1q_f32(Sp - 4);
                        float32x4_t _S1 = vld1q_f32(Sp + 0);
                        float32x4_t _S2 = vld1q_f32(Sp + 4);
                        float32x4_t _S3 = vld1q_f32(Sp + 8);
                        float32x4_t _p = vmulq_lane_f32(_S0, vget_low_f32(_a0123), 0);
                        _p = vmlaq_lane_f32(_p, _S1, vget_low_f32(_a0123), 1);
                        _p = vmlaq_lane_f32(_p, _S2, vget_high_f32(_a0123), 0);
                        _p = vmlaq_lane_f32(_p, _S3, vget_high_f32(_a0123), 1);
                        vst1q_f32(outptr, _p);

                        alphap += 4;
                        outptr += 4;
                    }
                }

                delete[] buf;
            }

            return 0;
        }
#endif // __ARM_NEON

        // scalar (non-packed) rows: same three kernels, one float per pixel
        if (resize_type == 1) // nearest
        {
            const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const float* ptr = bottom_blob.row(y);
                float* outptr = top_blob.row(y);
                for (int x = 0; x < outw; x++)
                {
                    // truncate to a source column and clamp to the last valid one
                    int in_x = std::min((int)(x * ws), (w - 1));
                    *outptr++ = ptr[in_x];
                }
            }
        }

        if (resize_type == 2) // bilinear
        {
            // outw ints for xofs followed by outw * 2 slots used as float weights
            int* buf = new int[outw + outw * 2];

            int* xofs = buf;
            float* alpha = (float*)(buf + outw);

            linear_coeffs(w, outw, xofs, alpha, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const float* ptr = bottom_blob.row(y);
                float* outptr = top_blob.row(y);
                const float* alphap = alpha;

                for (int x = 0; x < outw; x++)
                {
                    int sx = xofs[x];
                    const float* Sp = ptr + sx;
                    float a0 = alphap[0];
                    float a1 = alphap[1];
                    *outptr++ = Sp[0] * a0 + Sp[1] * a1;
                    alphap += 2;
                }
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
            // outw ints for xofs followed by outw * 4 slots used as float weights
            int* buf = new int[outw + outw * 4];

            int* xofs = buf;
            float* alpha = (float*)(buf + outw);

            cubic_coeffs(w, outw, xofs, alpha, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const float* ptr = bottom_blob.row(y);
                float* outptr = top_blob.row(y);
                const float* alphap = alpha;

                for (int x = 0; x < outw; x++)
                {
                    int sx = xofs[x];
                    const float* Sp = ptr + sx;
                    float a0 = alphap[0];
                    float a1 = alphap[1];
                    float a2 = alphap[2];
                    float a3 = alphap[3];
                    // 4 taps around sx: one element before and two after
                    *outptr++ = Sp[-1] * a0 + Sp[0] * a1 + Sp[1] * a2 + Sp[2] * a3;
                    alphap += 4;
                }
            }

            delete[] buf;
        }

        return 0;
    }

    // remaining case: per-channel resize in both width and height
    if (outw == w && outh == h)
    {
        // same size: hand the input through as the output
        top_blob = bottom_blob;
        return 0;
    }

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

#if __ARM_NEON
    if (elempack == 4)
    {
        if (resize_type == 1) // nearest
        {
            // source steps per output pixel, chosen the same way for both axes
            const float hs = (output_height || !size_expr.empty()) ? h / (float)outh : 1.f / height_scale;
            const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                for (int y = 0; y < outh; y++)
                {
                    // source row for this output row, clamped to the last valid row
                    int in_y = std::min((int)(y * hs), (h - 1));

                    const float* ptr = src.row(in_y);
                    float* outptr = dst.row(y);
                    for (int x = 0; x < outw; x++)
                    {
                        int in_x = std::min((int)(x * ws), (w - 1));

                        float32x4_t _p = vld1q_f32(ptr + in_x * 4);
                        vst1q_f32(outptr, _p);

                        outptr += 4;
                    }
                }
            }
        }

        if (resize_type == 2) // bilinear
        {
            // single allocation carved into: xofs | yofs | alpha (x weights) | beta (y weights)
            int* buf = new int[outw + outh + outw * 2 + outh * 2];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
            float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];

            linear_coeffs(w, outw, xofs, alpha, align_corner);
            linear_coeffs(h, outh, yofs, beta, align_corner);

            // the same coefficient tables are passed for every channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bilinear_image_pack4(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
            // single allocation carved into: xofs | yofs | alpha (x weights) | beta (y weights)
            int* buf = new int[outw + outh + outw * 4 + outh * 4];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
            float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];

            cubic_coeffs(w, outw, xofs, alpha, align_corner);
            cubic_coeffs(h, outh, yofs, beta, align_corner);

            // the same coefficient tables are passed for every channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bicubic_image_pack4(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        return 0;
    }
#endif // __ARM_NEON

    // scalar (non-packed) channels
    if (resize_type == 1) // nearest
    {
        const float hs = (output_height || !size_expr.empty()) ? h / (float)outh : 1.f / height_scale;
        const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            for (int y = 0; y < outh; y++)
            {
                // source row for this output row, clamped to the last valid row
                int in_y = std::min((int)(y * hs), (h - 1));

                const float* ptr = src.row(in_y);
                float* outptr = dst.row(y);
                for (int x = 0; x < outw; x++)
                {
                    int in_x = std::min((int)(x * ws), (w - 1));
                    *outptr++ = ptr[in_x];
                }
            }
        }
    }

    if (resize_type == 2) // bilinear
    {
        // single allocation carved into: xofs | yofs | alpha (x weights) | beta (y weights)
        int* buf = new int[outw + outh + outw * 2 + outh * 2];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
        float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];

        linear_coeffs(w, outw, xofs, alpha, align_corner);
        linear_coeffs(h, outh, yofs, beta, align_corner);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bilinear_image(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    if (resize_type == 3) // bicubic
    {
        // single allocation carved into: xofs | yofs | alpha (x weights) | beta (y weights)
        int* buf = new int[outw + outh + outw * 4 + outh * 4];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
        float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];

        cubic_coeffs(w, outw, xofs, alpha, align_corner);
        cubic_coeffs(h, outh, yofs, beta, align_corner);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bicubic_image(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    return 0;
}

#if NCNN_BF16
// bf16-storage variant of the resize: elements are handled as 16-bit unsigned
// values, copied as-is for nearest / fill and converted to float for the
// bilinear and bicubic arithmetic, then converted back on store.
//
// The target width/height are read from bottom_blobs[1] (only its w and h are
// used here) and are overridden through eval_size_expr() when size_expr is not
// empty. For 2-D and 3-D inputs resize_type selects the kernel:
// 1 = nearest, 2 = bilinear, 3 = bicubic.
//
// Returns 0 on success, or -100 when the output blob could not be allocated.
int Interp_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    // only w and h of the second blob are read below; it supplies the default target size
    const Mat& reference_blob = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    int h = bottom_blob.h;
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = reference_blob.w;
    int outh = reference_blob.h;

    if (!size_expr.empty())
    {
        // a size expression is present: let it compute outw/outh from the shapes of all inputs
        std::vector<Mat> bottom_blob_shapes(bottom_blobs.size());
        for (size_t i = 0; i < bottom_blobs.size(); i++)
        {
            bottom_blob_shapes[i] = bottom_blobs[i].shape();
        }
        eval_size_expr(bottom_blob_shapes, outw, outh);
    }

    if (dims == 1)
    {
        // 1-D input: each of the w input elements becomes one output channel of
        // size outw x outh filled with that element; resize_type is not consulted
        top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __ARM_NEON
        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < w; q++)
            {
                Mat top_blob_c = top_blob.channel(q);
                // load the four 16-bit values of pack q and fill the whole channel with them
                uint16x4_t _v = vld1_u16((const unsigned short*)bottom_blob + q * 4);
                top_blob_c.fill(_v);
            }

            return 0;
        }
#endif // __ARM_NEON

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < w; q++)
        {
            Mat top_blob_c = top_blob.channel(q);
            const unsigned short* ptr = bottom_blob;
            top_blob_c.fill(ptr[q]);
        }

        return 0;
    }

    if (dims == 2)
    {
        // 2-D input: only the width is resized, the h rows are kept as they are
        if (outw == w)
        {
            // same width: hand the input through as the output
            top_blob = bottom_blob;
            return 0;
        }

        top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __ARM_NEON
        if (elempack == 4)
        {
            // packed rows: every pixel is a group of 4 values, so source offsets are scaled by 4
            if (resize_type == 1) // nearest
            {
                // source step per output pixel: taken from the sizes when an explicit
                // output width or a size expression is in use, otherwise from width_scale
                const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const unsigned short* ptr = bottom_blob.row<const unsigned short>(y);
                    unsigned short* outptr = top_blob.row<unsigned short>(y);
                    for (int x = 0; x < outw; x++)
                    {
                        // truncate to a source column and clamp to the last valid one
                        int in_x = std::min((int)(x * ws), (w - 1));

                        // pure copy of the 16-bit values, no conversion involved
                        uint16x4_t _p = vld1_u16(ptr + in_x * 4);
                        vst1_u16(outptr, _p);

                        outptr += 4;
                    }
                }
            }

            if (resize_type == 2) // bilinear
            {
                // one allocation: outw ints for xofs followed by outw * 2 slots used as float weights
                // NOTE(review): the weights live in int-typed storage reinterpreted as float;
                // this assumes sizeof(float) == sizeof(int)
                int* buf = new int[outw + outw * 2];

                int* xofs = buf;
                float* alpha = (float*)(buf + outw);

                // linear_coeffs is defined outside this block; it is expected to fill xofs and alpha
                linear_coeffs(w, outw, xofs, alpha, align_corner);

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const unsigned short* ptr = bottom_blob.row<const unsigned short>(y);
                    unsigned short* outptr = top_blob.row<unsigned short>(y);
                    // every row restarts at the beginning of the shared weight table
                    const float* alphap = alpha;

                    for (int x = 0; x < outw; x++)
                    {
                        int sx = xofs[x] * 4;
                        const unsigned short* Sp = ptr + sx;

                        float32x2_t _a01 = vld1_f32(alphap);

                        // convert both taps to float, blend, convert the result back for the store
                        float32x4_t _S0 = bfloat2float(vld1_u16(Sp));
                        float32x4_t _S1 = bfloat2float(vld1_u16(Sp + 4));
                        float32x4_t _p = vmulq_lane_f32(_S0, _a01, 0);
                        _p = vmlaq_lane_f32(_p, _S1, _a01, 1);
                        vst1_u16(outptr, float2bfloat(_p));

                        alphap += 2;
                        outptr += 4;
                    }
                }

                delete[] buf;
            }

            if (resize_type == 3) // bicubic
            {
                // one allocation: outw ints for xofs followed by outw * 4 slots used as float weights
                int* buf = new int[outw + outw * 4];

                int* xofs = buf;
                float* alpha = (float*)(buf + outw);

                // cubic_coeffs is defined outside this block; it is expected to fill xofs and alpha
                cubic_coeffs(w, outw, xofs, alpha, align_corner);

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const unsigned short* ptr = bottom_blob.row<const unsigned short>(y);
                    unsigned short* outptr = top_blob.row<unsigned short>(y);
                    const float* alphap = alpha;

                    for (int x = 0; x < outw; x++)
                    {
                        int sx = xofs[x] * 4;
                        const unsigned short* Sp = ptr + sx;

                        float32x4_t _a0123 = vld1q_f32(alphap);

                        // 4 taps: the pixel before sx, sx itself and the two after it
                        // NOTE(review): relies on cubic_coeffs producing offsets that keep
                        // these reads inside the row - confirm
                        float32x4_t _S0 = bfloat2float(vld1_u16(Sp - 4));
                        float32x4_t _S1 = bfloat2float(vld1_u16(Sp + 0));
                        float32x4_t _S2 = bfloat2float(vld1_u16(Sp + 4));
                        float32x4_t _S3 = bfloat2float(vld1_u16(Sp + 8));
                        float32x4_t _p = vmulq_lane_f32(_S0, vget_low_f32(_a0123), 0);
                        _p = vmlaq_lane_f32(_p, _S1, vget_low_f32(_a0123), 1);
                        _p = vmlaq_lane_f32(_p, _S2, vget_high_f32(_a0123), 0);
                        _p = vmlaq_lane_f32(_p, _S3, vget_high_f32(_a0123), 1);
                        vst1_u16(outptr, float2bfloat(_p));

                        alphap += 4;
                        outptr += 4;
                    }
                }

                delete[] buf;
            }

            return 0;
        }
#endif // __ARM_NEON

        // scalar (non-packed) rows: same three kernels, one 16-bit value per pixel
        if (resize_type == 1) // nearest
        {
            const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const unsigned short* ptr = bottom_blob.row<const unsigned short>(y);
                unsigned short* outptr = top_blob.row<unsigned short>(y);
                for (int x = 0; x < outw; x++)
                {
                    // truncate to a source column and clamp to the last valid one
                    int in_x = std::min((int)(x * ws), (w - 1));
                    *outptr++ = ptr[in_x];
                }
            }
        }

        if (resize_type == 2) // bilinear
        {
            // outw ints for xofs followed by outw * 2 slots used as float weights
            int* buf = new int[outw + outw * 2];

            int* xofs = buf;
            float* alpha = (float*)(buf + outw);

            linear_coeffs(w, outw, xofs, alpha, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const unsigned short* ptr = bottom_blob.row<const unsigned short>(y);
                unsigned short* outptr = top_blob.row<unsigned short>(y);
                const float* alphap = alpha;

                for (int x = 0; x < outw; x++)
                {
                    int sx = xofs[x];
                    const unsigned short* Sp = ptr + sx;
                    float a0 = alphap[0];
                    float a1 = alphap[1];
                    // blend in float, store converted back to the 16-bit representation
                    *outptr++ = float32_to_bfloat16(bfloat16_to_float32(Sp[0]) * a0 + bfloat16_to_float32(Sp[1]) * a1);
                    alphap += 2;
                }
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
            // outw ints for xofs followed by outw * 4 slots used as float weights
            int* buf = new int[outw + outw * 4];

            int* xofs = buf;
            float* alpha = (float*)(buf + outw);

            cubic_coeffs(w, outw, xofs, alpha, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const unsigned short* ptr = bottom_blob.row<const unsigned short>(y);
                unsigned short* outptr = top_blob.row<unsigned short>(y);
                const float* alphap = alpha;

                for (int x = 0; x < outw; x++)
                {
                    int sx = xofs[x];
                    const unsigned short* Sp = ptr + sx;
                    float a0 = alphap[0];
                    float a1 = alphap[1];
                    float a2 = alphap[2];
                    float a3 = alphap[3];
                    // 4 taps around sx: one element before and two after
                    *outptr++ = float32_to_bfloat16(bfloat16_to_float32(Sp[-1]) * a0 + bfloat16_to_float32(Sp[0]) * a1 + bfloat16_to_float32(Sp[1]) * a2 + bfloat16_to_float32(Sp[2]) * a3);
                    alphap += 4;
                }
            }

            delete[] buf;
        }

        return 0;
    }

    // remaining case: per-channel resize in both width and height
    if (outw == w && outh == h)
    {
        // same size: hand the input through as the output
        top_blob = bottom_blob;
        return 0;
    }

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

#if __ARM_NEON
    if (elempack == 4)
    {
        if (resize_type == 1) // nearest
        {
            // source steps per output pixel, chosen the same way for both axes
            const float hs = (output_height || !size_expr.empty()) ? h / (float)outh : 1.f / height_scale;
            const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                for (int y = 0; y < outh; y++)
                {
                    // source row for this output row, clamped to the last valid row
                    int in_y = std::min((int)(y * hs), (h - 1));

                    const unsigned short* ptr = src.row<const unsigned short>(in_y);
                    unsigned short* outptr = dst.row<unsigned short>(y);
                    for (int x = 0; x < outw; x++)
                    {
                        int in_x = std::min((int)(x * ws), (w - 1));

                        uint16x4_t _p = vld1_u16(ptr + in_x * 4);
                        vst1_u16(outptr, _p);

                        outptr += 4;
                    }
                }
            }
        }

        if (resize_type == 2) // bilinear
        {
            // single allocation carved into: xofs | yofs | alpha (x weights) | beta (y weights)
            int* buf = new int[outw + outh + outw * 2 + outh * 2];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
            float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];

            linear_coeffs(w, outw, xofs, alpha, align_corner);
            linear_coeffs(h, outh, yofs, beta, align_corner);

            // the same coefficient tables are passed for every channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bilinear_image_pack4_bf16s(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
            // single allocation carved into: xofs | yofs | alpha (x weights) | beta (y weights)
            int* buf = new int[outw + outh + outw * 4 + outh * 4];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
            float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];

            cubic_coeffs(w, outw, xofs, alpha, align_corner);
            cubic_coeffs(h, outh, yofs, beta, align_corner);

            // the same coefficient tables are passed for every channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bicubic_image_pack4_bf16s(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        return 0;
    }
#endif // __ARM_NEON

    // scalar (non-packed) channels
    if (resize_type == 1) // nearest
    {
        const float hs = (output_height || !size_expr.empty()) ? h / (float)outh : 1.f / height_scale;
        const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            for (int y = 0; y < outh; y++)
            {
                // source row for this output row, clamped to the last valid row
                int in_y = std::min((int)(y * hs), (h - 1));

                const unsigned short* ptr = src.row<const unsigned short>(in_y);
                unsigned short* outptr = dst.row<unsigned short>(y);
                for (int x = 0; x < outw; x++)
                {
                    int in_x = std::min((int)(x * ws), (w - 1));
                    *outptr++ = ptr[in_x];
                }
            }
        }
    }

    if (resize_type == 2) // bilinear
    {
        // single allocation carved into: xofs | yofs | alpha (x weights) | beta (y weights)
        int* buf = new int[outw + outh + outw * 2 + outh * 2];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
        float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];

        linear_coeffs(w, outw, xofs, alpha, align_corner);
        linear_coeffs(h, outh, yofs, beta, align_corner);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bilinear_image_bf16s(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    if (resize_type == 3) // bicubic
    {
        // single allocation carved into: xofs | yofs | alpha (x weights) | beta (y weights)
        int* buf = new int[outw + outh + outw * 4 + outh * 4];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
        float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];

        cubic_coeffs(w, outw, xofs, alpha, align_corner);
        cubic_coeffs(h, outh, yofs, beta, align_corner);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bicubic_image_bf16s(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/interp_arm.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_INTERP_ARM_H
#define LAYER_INTERP_ARM_H

#include "interp.h"

namespace ncnn {

// ARM-specific subclass of the Interp layer.
// Besides the generic multi-input forward(), it declares reduced-precision
// entry points that only exist when the matching build option is enabled.
class Interp_arm : public Interp
{
public:
    Interp_arm();

    // Multi-blob forward entry point; overrides the declaration inherited from Interp.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 paths, declared only when NCNN_ARM82 is set.
    // NOTE(review): by naming, "fp16s" = fp16 storage and "fp16sa" = fp16 storage + arithmetic;
    // confirm against the implementations.
    int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
    int forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 path, declared only when NCNN_BF16 is set.
    int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_INTERP_ARM_H


================================================
FILE: src/layer/arm/interp_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "interp_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#endif // __ARM_NEON

namespace ncnn {

#include "interp_bicubic.h"
#include "interp_bilinear.h"

#if __ARM_NEON
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "interp_bicubic_fp16s.h"
#include "interp_bicubic_pack4_fp16s.h"
#include "interp_bicubic_pack8_fp16s.h"
#include "interp_bilinear_fp16s.h"
#include "interp_bilinear_pack4_fp16s.h"
#include "interp_bilinear_pack8_fp16s.h"
#endif
#endif

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Resize an __fp16 blob; the interpolation arithmetic written out in this function is
// carried out in fp32 and converted back to __fp16 on store.
//
// Inputs:
//   bottom_blobs[0]  blob to resize (its rows are read as __fp16)
//   bottom_blobs[1]  reference blob; its w/h are taken as the requested output size
//   top_blobs[0]     receives the result
//   opt              supplies blob_allocator and num_threads
//
// When size_expr is non-empty, eval_size_expr() is called with the shapes of all inputs
// and outw/outh (presumably overwriting them - its signature is not visible here).
//
// Behaviour by input rank:
//   dims == 1  every input element q becomes one outw x outh channel filled with that value
//   dims == 2  only the width is resized; the row count h is kept
//   otherwise  width and height are resized independently for each channel
//
// resize_type selects the filter: 1 nearest, 2 bilinear, 3 bicubic.
// elempack == 4 is handled with 4-lane NEON loads/stores; any other elempack falls
// through to the scalar code.
//
// Returns 0 on success and -100 when the output blob is empty after create().
int Interp_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& reference_blob = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    int h = bottom_blob.h;
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // default target size comes from the reference blob
    int outw = reference_blob.w;
    int outh = reference_blob.h;

    if (!size_expr.empty())
    {
        // collect the shape of every input for the size expression
        std::vector<Mat> bottom_blob_shapes(bottom_blobs.size());
        for (size_t i = 0; i < bottom_blobs.size(); i++)
        {
            bottom_blob_shapes[i] = bottom_blobs[i].shape();
        }
        eval_size_expr(bottom_blob_shapes, outw, outh);
    }

    if (dims == 1)
    {
        // the w input elements become w output channels of size outw x outh
        top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < w; q++)
            {
                Mat top_blob_c = top_blob.channel(q);
                // one packed element = 4 consecutive __fp16 values
                float16x4_t _v = vld1_f16((const __fp16*)bottom_blob + q * 4);
                top_blob_c.fill(_v);
            }

            return 0;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < w; q++)
        {
            Mat top_blob_c = top_blob.channel(q);
            const __fp16* ptr = bottom_blob;
            top_blob_c.fill(ptr[q]);
        }

        return 0;
    }

    if (dims == 2)
    {
        // nothing to do when the width is unchanged
        if (outw == w)
        {
            top_blob = bottom_blob;
            return 0;
        }

        // only the width changes; each of the h rows is resampled on its own
        top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (elempack == 4)
        {
            if (resize_type == 1) // nearest
            {
                // source step per output pixel: derived from the sizes when an explicit
                // output width or a size expression is in use, else from width_scale
                const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const __fp16* ptr = bottom_blob.row<const __fp16>(y);
                    __fp16* outptr = top_blob.row<__fp16>(y);
                    for (int x = 0; x < outw; x++)
                    {
                        // truncate to the source column and clamp to the last one
                        int in_x = std::min((int)(x * ws), (w - 1));

                        float16x4_t _p = vld1_f16(ptr + in_x * 4);
                        vst1_f16(outptr, _p);

                        outptr += 4;
                    }
                }
            }

            if (resize_type == 2) // bilinear
            {
                // single allocation: outw ints for xofs followed by outw * 2 float weights
                // (the float table is carved out of int storage)
                int* buf = new int[outw + outw * 2];

                int* xofs = buf;
                float* alpha = (float*)(buf + outw);

                linear_coeffs(w, outw, xofs, alpha, align_corner);

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const __fp16* ptr = bottom_blob.row<const __fp16>(y);
                    __fp16* outptr = top_blob.row<__fp16>(y);
                    const float* alphap = alpha;

                    for (int x = 0; x < outw; x++)
                    {
                        // xofs is in packed elements; scale to __fp16 units
                        int sx = xofs[x] * 4;
                        const __fp16* Sp = ptr + sx;

                        float32x2_t _a01 = vld1_f32(alphap);

                        // widen both taps to fp32, blend, narrow back to fp16
                        float32x4_t _S0 = vcvt_f32_f16(vld1_f16(Sp));
                        float32x4_t _S1 = vcvt_f32_f16(vld1_f16(Sp + 4));
                        float32x4_t _p = vmulq_lane_f32(_S0, _a01, 0);
                        _p = vmlaq_lane_f32(_p, _S1, _a01, 1);
                        vst1_f16(outptr, vcvt_f16_f32(_p));

                        alphap += 2;
                        outptr += 4;
                    }
                }

                delete[] buf;
            }

            if (resize_type == 3) // bicubic
            {
                // single allocation: outw ints for xofs followed by outw * 4 float weights
                int* buf = new int[outw + outw * 4];

                int* xofs = buf;
                float* alpha = (float*)(buf + outw);

                cubic_coeffs(w, outw, xofs, alpha, align_corner);

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const __fp16* ptr = bottom_blob.row<const __fp16>(y);
                    __fp16* outptr = top_blob.row<__fp16>(y);
                    const float* alphap = alpha;

                    for (int x = 0; x < outw; x++)
                    {
                        int sx = xofs[x] * 4;
                        const __fp16* Sp = ptr + sx;

                        float32x4_t _a0123 = vld1q_f32(alphap);

                        // four taps at packed offsets -1, 0, +1, +2 around xofs[x];
                        // relies on cubic_coeffs() producing offsets that keep them in the row
                        float32x4_t _S0 = vcvt_f32_f16(vld1_f16(Sp - 4));
                        float32x4_t _S1 = vcvt_f32_f16(vld1_f16(Sp + 0));
                        float32x4_t _S2 = vcvt_f32_f16(vld1_f16(Sp + 4));
                        float32x4_t _S3 = vcvt_f32_f16(vld1_f16(Sp + 8));
                        float32x4_t _p = vmulq_laneq_f32(_S0, _a0123, 0);
                        _p = vfmaq_laneq_f32(_p, _S1, _a0123, 1);
                        _p = vfmaq_laneq_f32(_p, _S2, _a0123, 2);
                        _p = vfmaq_laneq_f32(_p, _S3, _a0123, 3);
                        vst1_f16(outptr, vcvt_f16_f32(_p));

                        alphap += 4;
                        outptr += 4;
                    }
                }

                delete[] buf;
            }

            return 0;
        }

        // scalar variants of the same three filters (reached when elempack != 4)
        if (resize_type == 1) // nearest
        {
            const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const __fp16* ptr = bottom_blob.row<const __fp16>(y);
                __fp16* outptr = top_blob.row<__fp16>(y);
                for (int x = 0; x < outw; x++)
                {
                    int in_x = std::min((int)(x * ws), (w - 1));
                    *outptr++ = ptr[in_x];
                }
            }
        }

        if (resize_type == 2) // bilinear
        {
            // xofs (outw ints) followed by alpha (outw * 2 floats) in one allocation
            int* buf = new int[outw + outw * 2];

            int* xofs = buf;
            float* alpha = (float*)(buf + outw);

            linear_coeffs(w, outw, xofs, alpha, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const __fp16* ptr = bottom_blob.row<const __fp16>(y);
                __fp16* outptr = top_blob.row<__fp16>(y);
                const float* alphap = alpha;

                for (int x = 0; x < outw; x++)
                {
                    int sx = xofs[x];
                    const __fp16* Sp = ptr + sx;
                    float a0 = alphap[0];
                    float a1 = alphap[1];
                    // blend in fp32, narrow once on store
                    *outptr++ = (__fp16)((float)Sp[0] * a0 + (float)Sp[1] * a1);
                    alphap += 2;
                }
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
            // xofs (outw ints) followed by alpha (outw * 4 floats) in one allocation
            int* buf = new int[outw + outw * 4];

            int* xofs = buf;
            float* alpha = (float*)(buf + outw);

            cubic_coeffs(w, outw, xofs, alpha, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const __fp16* ptr = bottom_blob.row<const __fp16>(y);
                __fp16* outptr = top_blob.row<__fp16>(y);
                const float* alphap = alpha;

                for (int x = 0; x < outw; x++)
                {
                    int sx = xofs[x];
                    const __fp16* Sp = ptr + sx;
                    float a0 = alphap[0];
                    float a1 = alphap[1];
                    float a2 = alphap[2];
                    float a3 = alphap[3];
                    // taps at -1, 0, +1, +2 relative to xofs[x]
                    *outptr++ = (__fp16)((float)Sp[-1] * a0 + (float)Sp[0] * a1 + (float)Sp[1] * a2 + (float)Sp[2] * a3);
                    alphap += 4;
                }
            }

            delete[] buf;
        }

        return 0;
    }

    // remaining ranks: resize width and height per channel
    if (outw == w && outh == h)
    {
        top_blob = bottom_blob;
        return 0;
    }

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (elempack == 4)
    {
        if (resize_type == 1) // nearest
        {
            // per-axis source step, same selection rule as in the 2-D path
            const float hs = (output_height || !size_expr.empty()) ? h / (float)outh : 1.f / height_scale;
            const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                for (int y = 0; y < outh; y++)
                {
                    int in_y = std::min((int)(y * hs), (h - 1));

                    const __fp16* ptr = src.row<const __fp16>(in_y);
                    __fp16* outptr = dst.row<__fp16>(y);
                    for (int x = 0; x < outw; x++)
                    {
                        int in_x = std::min((int)(x * ws), (w - 1));

                        float16x4_t _p = vld1_f16(ptr + in_x * 4);
                        vst1_f16(outptr, _p);

                        outptr += 4;
                    }
                }
            }
        }

        if (resize_type == 2) // bilinear
        {
            // one allocation laid out as: xofs | yofs | alpha | beta
            int* buf = new int[outw + outh + outw * 2 + outh * 2];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
            float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];

            linear_coeffs(w, outw, xofs, alpha, align_corner);
            linear_coeffs(h, outh, yofs, beta, align_corner);

            // the tables are shared read-only by all channels
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bilinear_image_pack4_fp16s(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
            // one allocation laid out as: xofs | yofs | alpha | beta
            int* buf = new int[outw + outh + outw * 4 + outh * 4];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
            float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];

            cubic_coeffs(w, outw, xofs, alpha, align_corner);
            cubic_coeffs(h, outh, yofs, beta, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bicubic_image_pack4_fp16s(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        return 0;
    }

    // scalar variants (reached when elempack != 4)
    if (resize_type == 1) // nearest
    {
        const float hs = (output_height || !size_expr.empty()) ? h / (float)outh : 1.f / height_scale;
        const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            for (int y = 0; y < outh; y++)
            {
                int in_y = std::min((int)(y * hs), (h - 1));

                const __fp16* ptr = src.row<const __fp16>(in_y);
                __fp16* outptr = dst.row<__fp16>(y);
                for (int x = 0; x < outw; x++)
                {
                    int in_x = std::min((int)(x * ws), (w - 1));
                    *outptr++ = ptr[in_x];
                }
            }
        }
    }

    if (resize_type == 2) // bilinear
    {
        // one allocation laid out as: xofs | yofs | alpha | beta
        int* buf = new int[outw + outh + outw * 2 + outh * 2];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
        float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];

        linear_coeffs(w, outw, xofs, alpha, align_corner);
        linear_coeffs(h, outh, yofs, beta, align_corner);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bilinear_image_fp16s(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    if (resize_type == 3) // bicubic
    {
        // one allocation laid out as: xofs | yofs | alpha | beta
        int* buf = new int[outw + outh + outw * 4 + outh * 4];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
        float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];

        cubic_coeffs(w, outw, xofs, alpha, align_corner);
        cubic_coeffs(h, outh, yofs, beta, align_corner);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bicubic_image_fp16s(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    return 0;
}

// Resize an __fp16 blob with the interpolation arithmetic itself done in __fp16
// (weights are __fp16 tables and the NEON code uses f16 multiply / fused multiply-add).
//
// Inputs:
//   bottom_blobs[0]  blob to resize (its rows are read as __fp16)
//   bottom_blobs[1]  reference blob; its w/h are taken as the requested output size
//   top_blobs[0]     receives the result
//   opt              supplies blob_allocator and num_threads
//
// Pure copy cases (dims == 1 broadcast, nearest) with elempack 1 or 4 involve no
// arithmetic and are delegated to forward_fp16s(). Everything else is handled here with
// elempack 8 (8-lane), elempack 4 (4-lane) and scalar variants.
//
// resize_type selects the filter: 1 nearest, 2 bilinear, 3 bicubic.
// Returns 0 on success and -100 when the output blob is empty after create().
int Interp_arm::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& reference_blob = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    int h = bottom_blob.h;
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // default target size comes from the reference blob
    int outw = reference_blob.w;
    int outh = reference_blob.h;

    if (!size_expr.empty())
    {
        // collect the shape of every input for the size expression
        std::vector<Mat> bottom_blob_shapes(bottom_blobs.size());
        for (size_t i = 0; i < bottom_blobs.size(); i++)
        {
            bottom_blob_shapes[i] = bottom_blobs[i].shape();
        }
        eval_size_expr(bottom_blob_shapes, outw, outh);
    }

    // Broadcast (dims == 1) and nearest need no arithmetic, so for elempack 1/4 reuse
    // the fp16s implementation.
    // NOTE(review): forward_fp16s() evaluates size_expr again, so the evaluation above
    // is repeated on this path - confirm whether that is intended.
    if ((elempack == 1 || elempack == 4) && (dims == 1 || resize_type == 1)) // nearest
    {
        return forward_fp16s(bottom_blobs, top_blobs, opt);
    }

    if (dims == 1)
    {
        // the w input elements become w output channels of size outw x outh
        top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (elempack == 8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < w; q++)
            {
                Mat top_blob_c = top_blob.channel(q);
                // one packed element = 8 consecutive __fp16 values
                float16x8_t _v = vld1q_f16((const __fp16*)bottom_blob + q * 8);
                top_blob_c.fill(_v);
            }

            return 0;
        }

        // NOTE(review): elempack 1 and 4 never get here (delegated above); any other
        // elempack would return the freshly created blob without filling it.
        return 0;
    }

    if (dims == 2)
    {
        // nothing to do when the width is unchanged
        if (outw == w)
        {
            top_blob = bottom_blob;
            return 0;
        }

        // only the width changes; each of the h rows is resampled on its own
        top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (elempack == 8)
        {
            if (resize_type == 1) // nearest
            {
                // source step per output pixel: derived from the sizes when an explicit
                // output width or a size expression is in use, else from width_scale
                const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const __fp16* ptr = bottom_blob.row<const __fp16>(y);
                    __fp16* outptr = top_blob.row<__fp16>(y);
                    for (int x = 0; x < outw; x++)
                    {
                        // truncate to the source column and clamp to the last one
                        int in_x = std::min((int)(x * ws), (w - 1));

                        float16x8_t _p = vld1q_f16(ptr + in_x * 8);
                        vst1q_f16(outptr, _p);

                        outptr += 8;
                    }
                }
            }

            if (resize_type == 2) // bilinear
            {
                // outw ints for xofs, then the weight table; the table area is counted in
                // ints (outw * 2) but holds outw * 2 __fp16 values
                int* buf = new int[outw + outw * 2];

                int* xofs = buf;
                __fp16* alpha = (__fp16*)(buf + outw);

                linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const __fp16* ptr = bottom_blob.row<const __fp16>(y);
                    __fp16* outptr = top_blob.row<__fp16>(y);
                    const __fp16* alphap = alpha;

                    for (int x = 0; x < outw; x++)
                    {
                        // xofs is in packed elements; scale to __fp16 units
                        int sx = xofs[x] * 8;
                        const __fp16* Sp = ptr + sx;

                        // loads 4 halves, only lanes 0 and 1 are used below
                        float16x4_t _a01 = vld1_f16(alphap);

                        float16x8_t _S0 = vld1q_f16(Sp);
                        float16x8_t _S1 = vld1q_f16(Sp + 8);
                        float16x8_t _p = vmulq_lane_f16(_S0, _a01, 0);
                        _p = vfmaq_lane_f16(_p, _S1, _a01, 1);
                        vst1q_f16(outptr, _p);

                        alphap += 2;
                        outptr += 8;
                    }
                }

                delete[] buf;
            }

            if (resize_type == 3) // bicubic
            {
                // outw ints for xofs, then a table area holding outw * 4 __fp16 weights
                int* buf = new int[outw + outw * 4];

                int* xofs = buf;
                __fp16* alpha = (__fp16*)(buf + outw);

                cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const __fp16* ptr = bottom_blob.row<const __fp16>(y);
                    __fp16* outptr = top_blob.row<__fp16>(y);
                    const __fp16* alphap = alpha;

                    for (int x = 0; x < outw; x++)
                    {
                        int sx = xofs[x] * 8;
                        const __fp16* Sp = ptr + sx;

                        float16x4_t _a0123 = vld1_f16(alphap);

                        // four taps at packed offsets -1, 0, +1, +2 around xofs[x];
                        // relies on cubic_coeffs_fp16sa() keeping them inside the row
                        float16x8_t _S0 = vld1q_f16(Sp - 8);
                        float16x8_t _S1 = vld1q_f16(Sp + 0);
                        float16x8_t _S2 = vld1q_f16(Sp + 8);
                        float16x8_t _S3 = vld1q_f16(Sp + 16);
                        float16x8_t _p = vmulq_lane_f16(_S0, _a0123, 0);
                        _p = vfmaq_lane_f16(_p, _S1, _a0123, 1);
                        _p = vfmaq_lane_f16(_p, _S2, _a0123, 2);
                        _p = vfmaq_lane_f16(_p, _S3, _a0123, 3);
                        vst1q_f16(outptr, _p);

                        alphap += 4;
                        outptr += 8;
                    }
                }

                delete[] buf;
            }

            return 0;
        }

        if (elempack == 4)
        {
            // nearest for elempack 4 was delegated to forward_fp16s() above
            if (resize_type == 2) // bilinear
            {
                int* buf = new int[outw + outw * 2];

                int* xofs = buf;
                __fp16* alpha = (__fp16*)(buf + outw);

                linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const __fp16* ptr = bottom_blob.row<const __fp16>(y);
                    __fp16* outptr = top_blob.row<__fp16>(y);
                    const __fp16* alphap = alpha;

                    for (int x = 0; x < outw; x++)
                    {
                        int sx = xofs[x] * 4;
                        const __fp16* Sp = ptr + sx;

                        // loads 4 halves, only lanes 0 and 1 are used below
                        float16x4_t _a01 = vld1_f16(alphap);

                        float16x4_t _S0 = vld1_f16(Sp);
                        float16x4_t _S1 = vld1_f16(Sp + 4);
                        float16x4_t _p = vmul_lane_f16(_S0, _a01, 0);
                        _p = vfma_lane_f16(_p, _S1, _a01, 1);
                        vst1_f16(outptr, _p);

                        alphap += 2;
                        outptr += 4;
                    }
                }

                delete[] buf;
            }

            if (resize_type == 3) // bicubic
            {
                int* buf = new int[outw + outw * 4];

                int* xofs = buf;
                __fp16* alpha = (__fp16*)(buf + outw);

                cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const __fp16* ptr = bottom_blob.row<const __fp16>(y);
                    __fp16* outptr = top_blob.row<__fp16>(y);
                    const __fp16* alphap = alpha;

                    for (int x = 0; x < outw; x++)
                    {
                        int sx = xofs[x] * 4;
                        const __fp16* Sp = ptr + sx;

                        float16x4_t _a0123 = vld1_f16(alphap);

                        // taps at packed offsets -1, 0, +1, +2
                        float16x4_t _S0 = vld1_f16(Sp - 4);
                        float16x4_t _S1 = vld1_f16(Sp + 0);
                        float16x4_t _S2 = vld1_f16(Sp + 4);
                        float16x4_t _S3 = vld1_f16(Sp + 8);
                        float16x4_t _p = vmul_lane_f16(_S0, _a0123, 0);
                        _p = vfma_lane_f16(_p, _S1, _a0123, 1);
                        _p = vfma_lane_f16(_p, _S2, _a0123, 2);
                        _p = vfma_lane_f16(_p, _S3, _a0123, 3);
                        vst1_f16(outptr, _p);

                        alphap += 4;
                        outptr += 4;
                    }
                }

                delete[] buf;
            }

            return 0;
        }

        // scalar variants (elempack is neither 8 nor 4); nearest was delegated above
        if (resize_type == 2) // bilinear
        {
            int* buf = new int[outw + outw * 2];

            int* xofs = buf;
            __fp16* alpha = (__fp16*)(buf + outw);

            linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const __fp16* ptr = bottom_blob.row<const __fp16>(y);
                __fp16* outptr = top_blob.row<__fp16>(y);
                const __fp16* alphap = alpha;

                for (int x = 0; x < outw; x++)
                {
                    int sx = xofs[x];
                    const __fp16* Sp = ptr + sx;
                    __fp16 a0 = alphap[0];
                    __fp16 a1 = alphap[1];
                    // all operands are __fp16 here, unlike the fp32 blend in forward_fp16s()
                    *outptr++ = Sp[0] * a0 + Sp[1] * a1;
                    alphap += 2;
                }
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
            int* buf = new int[outw + outw * 4];

            int* xofs = buf;
            __fp16* alpha = (__fp16*)(buf + outw);

            cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const __fp16* ptr = bottom_blob.row<const __fp16>(y);
                __fp16* outptr = top_blob.row<__fp16>(y);
                const __fp16* alphap = alpha;

                for (int x = 0; x < outw; x++)
                {
                    int sx = xofs[x];
                    const __fp16* Sp = ptr + sx;
                    __fp16 a0 = alphap[0];
                    __fp16 a1 = alphap[1];
                    __fp16 a2 = alphap[2];
                    __fp16 a3 = alphap[3];
                    // taps at -1, 0, +1, +2 relative to xofs[x]
                    *outptr++ = Sp[-1] * a0 + Sp[0] * a1 + Sp[1] * a2 + Sp[2] * a3;
                    alphap += 4;
                }
            }

            delete[] buf;
        }

        return 0;
    }

    // remaining ranks: resize width and height per channel
    if (outw == w && outh == h)
    {
        top_blob = bottom_blob;
        return 0;
    }

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (elempack == 8)
    {
        if (resize_type == 1) // nearest
        {
            // per-axis source step, same selection rule as in the 2-D path
            const float hs = (output_height || !size_expr.empty()) ? h / (float)outh : 1.f / height_scale;
            const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                for (int y = 0; y < outh; y++)
                {
                    int in_y = std::min((int)(y * hs), (h - 1));

                    const __fp16* ptr = src.row<const __fp16>(in_y);
                    __fp16* outptr = dst.row<__fp16>(y);
                    for (int x = 0; x < outw; x++)
                    {
                        int in_x = std::min((int)(x * ws), (w - 1));

                        float16x8_t _p = vld1q_f16(ptr + in_x * 8);
                        vst1q_f16(outptr, _p);

                        outptr += 8;
                    }
                }
            }
        }

        if (resize_type == 2) // bilinear
        {
            // one allocation laid out as: xofs | yofs | alpha | beta; the weight areas
            // are sized in ints but hold __fp16 values
            int* buf = new int[outw + outh + outw * 2 + outh * 2];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 2];
            __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2];

            linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
            linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner);

            // the tables are shared read-only by all channels
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bilinear_image_pack8_fp16sa(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
            // one allocation laid out as: xofs | yofs | alpha | beta
            int* buf = new int[outw + outh + outw * 4 + outh * 4];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 4];
            __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];

            cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
            cubic_coeffs_fp16sa(h, outh, yofs, beta, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bicubic_image_pack8_fp16sa(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        return 0;
    }

    if (elempack == 4)
    {
        // nearest for elempack 4 was delegated to forward_fp16s() above
        if (resize_type == 2) // bilinear
        {
            // one allocation laid out as: xofs | yofs | alpha | beta
            int* buf = new int[outw + outh + outw * 2 + outh * 2];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 2];
            __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2];

            linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
            linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bilinear_image_pack4_fp16sa(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
            // one allocation laid out as: xofs | yofs | alpha | beta
            int* buf = new int[outw + outh + outw * 4 + outh * 4];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 4];
            __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];

            cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
            cubic_coeffs_fp16sa(h, outh, yofs, beta, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bicubic_image_pack4_fp16sa(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        return 0;
    }

    // scalar variants (elempack is neither 8 nor 4); nearest was delegated above
    if (resize_type == 2) // bilinear
    {
        // one allocation laid out as: xofs | yofs | alpha | beta
        int* buf = new int[outw + outh + outw * 2 + outh * 2];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 2];
        __fp16* beta = (__fp16*)(buf + outw + outh + outw * 2); //new __fp16[outh * 2];

        linear_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
        linear_coeffs_fp16sa(h, outh, yofs, beta, align_corner);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bilinear_image_fp16sa(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    if (resize_type == 3) // bicubic
    {
        // one allocation laid out as: xofs | yofs | alpha | beta
        int* buf = new int[outw + outh + outw * 4 + outh * 4];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        __fp16* alpha = (__fp16*)(buf + outw + outh);           //new __fp16[outw * 4];
        __fp16* beta = (__fp16*)(buf + outw + outh + outw * 4); //new __fp16[outh * 4];

        cubic_coeffs_fp16sa(w, outw, xofs, alpha, align_corner);
        cubic_coeffs_fp16sa(h, outh, yofs, beta, align_corner);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bicubic_image_fp16sa(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/interp_bicubic.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Evaluate the four cubic interpolation weights for a fractional position.
//
// fx      fractional offset of the sample relative to tap 1
// coeffs  out: 4 weights for taps 0..3; the last one is derived so that the
//         four weights add up to 1
static inline void interpolate_cubic(float fx, float* coeffs)
{
    // kernel shape parameter of the piecewise cubic below
    const float A = -0.75f;

    // distance from the sample position to taps 0, 1 and 2
    const float d0 = fx + 1;
    const float d1 = fx;
    const float d2 = 1 - fx;

    // outer tap uses the far-segment polynomial, inner taps the near-segment one
    const float c0 = A * d0 * d0 * d0 - 5 * A * d0 * d0 + 8 * A * d0 - 4 * A;
    const float c1 = (A + 2) * d1 * d1 * d1 - (A + 3) * d1 * d1 + 1;
    const float c2 = (A + 2) * d2 * d2 * d2 - (A + 3) * d2 * d2 + 1;

    coeffs[0] = c0;
    coeffs[1] = c1;
    coeffs[2] = c2;
    // remaining weight, instead of evaluating the polynomial at 2 - fx
    coeffs[3] = 1.f - c0 - c1 - c2;
}

// Precompute, for every output position along one axis, the source offset and
// the four cubic weights consumed by the bicubic resize kernels.
//
// w             source length along this axis
// outw          destination length along this axis
// xofs          out: outw source offsets; the kernels read taps
//               xofs - 1 .. xofs + 2, and for w >= 4 the value is kept in
//               [1, w - 3] so all four taps stay inside the source
// alpha         out: outw * 4 weights, 4 consecutive values per output position
// align_corner  non-zero: output 0 maps to source 0 and output outw - 1 maps
//               to source w - 1; zero: half-pixel-centre mapping
static void cubic_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner)
{
    double scale = (double)w / outw;
    if (align_corner)
    {
        // A single output sample has no span to align. (w - 1) / 0 would give
        // inf/NaN, turning the floor()-to-int conversion below into undefined
        // behaviour and the weights into NaN, so sample source position 0.
        scale = outw > 1 ? (double)(w - 1) / (outw - 1) : 0.0;
    }

    for (int dx = 0; dx < outw; dx++)
    {
        float fx = (float)((dx + 0.5) * scale - 0.5);
        if (align_corner)
        {
            fx = (float)(dx * scale);
        }

        int sx = static_cast<int>(floor(fx));
        fx -= sx;

        float* a = alpha + dx * 4;
        interpolate_cubic(fx, a);

        // Border handling: move the 4-tap window back inside the source and
        // fold the weights of the taps that fell outside onto the border tap.
        // The checks run in sequence (not else-if), so a later check can act
        // on the sx written by an earlier one.
        if (sx <= -1)
        {
            sx = 1;
            a[0] = 1.f - a[3];
            a[1] = a[3];
            a[2] = 0.f;
            a[3] = 0.f;
        }
        if (sx == 0)
        {
            sx = 1;
            a[0] = a[0] + a[1];
            a[1] = a[2];
            a[2] = a[3];
            a[3] = 0.f;
        }
        if (sx == w - 2)
        {
            sx = w - 3;
            a[3] = a[2] + a[3];
            a[2] = a[1];
            a[1] = a[0];
            a[0] = 0.f;
        }
        if (sx >= w - 1)
        {
            sx = w - 3;
            a[3] = 1.f - a[0];
            a[2] = a[0];
            a[1] = 0.f;
            a[0] = 0.f;
        }

        xofs[dx] = sx;
    }
}

static void resize_bicubic_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w);
    Mat rowsbuf1(w);
    Mat rowsbuf2(w);
    Mat rowsbuf3(w);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;
    float* rows2 = rowsbuf2;
    float* rows3 = rowsbuf3;

    int prev_sy1 = -3;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            float* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows2;
            rows2 = rows3;
            rows3 = rows0_old;
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const float* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 2)
        {
            // hresize two rows
            float* rows0_old = rows0;
            float* rows1_old = rows1;
            rows0 = rows2;
            rows1 = rows3;
            rows2 = rows0_old;
            rows3 = rows1_old;
            const float* S2 = src.row(sy + 1);
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const float* S2p = S2 + sx;
                const float* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3;
                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 3)
        {
            // hresize three rows
            float* rows0_old = rows0;
            float* rows1_old = rows1;
            float* rows2_old = rows2;
            rows0 = rows3;
            rows1 = rows0_old;
            rows2 = rows1_old;
            rows3 = rows2_old;
            const float* S1 = src.row(sy);
            const float* S2 = src.row(sy + 1);
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* rows1p = rows1;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const float* S1p = S1 + sx;
                const float* S2p = S2 + sx;
                const float* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3;
                rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3;
                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;

                alphap += 4;
            }
        }
        else
        {
            // hresize four rows
            const float* S0 = src.row(sy - 1);
            const float* S1 = src.row(sy);
            const float* S2 = src.row(sy + 1);
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* rows0p = rows0;
            float* rows1p = rows1;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const float* S0p = S0 + sx;
                const float* S1p = S1 + sx;
                const float* S2p = S2 + sx;
                const float* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows0p[dx] = S0p[-1] * a0 + S0p[0] * a1 + S0p[1] * a2 + S0p[2] * a3;
                rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3;
                rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3;
                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;

                alphap += 4;
            }
        }

        prev_sy1 = sy;

        // vresize
        float b0 = beta[0];
        float b1 = beta[1];
        float b2 = beta[2];
        float b3 = beta[3];

        float* rows0p = rows0;
        float* rows1p = rows1;
        float* rows2p = rows2;
        float* rows3p = rows3;
        float* Dp = dst.row(dy);
        for (int dx = 0; dx < w; dx++)
        {
            //             D[x] = rows0[x]*b0 + rows1[x]*b1 + rows2[x]*b2 + rows3[x]*b3;
            *Dp++ = *rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3;
        }

        beta += 4;
    }
}


================================================
FILE: src/layer/arm/interp_bicubic_bf16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void resize_bicubic_image_bf16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w);
    Mat rowsbuf1(w);
    Mat rowsbuf2(w);
    Mat rowsbuf3(w);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;
    float* rows2 = rowsbuf2;
    float* rows3 = rowsbuf3;

    int prev_sy1 = -3;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            float* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows2;
            rows2 = rows3;
            rows3 = rows0_old;
            const unsigned short* S3 = src.row<const unsigned short>(sy + 2);

            const float* alphap = alpha;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const unsigned short* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows3p[dx] = bfloat16_to_float32(S3p[-1]) * a0 + bfloat16_to_float32(S3p[0]) * a1 + bfloat16_to_float32(S3p[1]) * a2 + bfloat16_to_float32(S3p[2]) * a3;

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 2)
        {
            // hresize two rows
            float* rows0_old = rows0;
            float* rows1_old = rows1;
            rows0 = rows2;
            rows1 = rows3;
            rows2 = rows0_old;
            rows3 = rows1_old;
            const unsigned short* S2 = src.row<const unsigned short>(sy + 1);
            const unsigned short* S3 = src.row<const unsigned short>(sy + 2);

            const float* alphap = alpha;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const unsigned short* S2p = S2 + sx;
                const unsigned short* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows2p[dx] = bfloat16_to_float32(S2p[-1]) * a0 + bfloat16_to_float32(S2p[0]) * a1 + bfloat16_to_float32(S2p[1]) * a2 + bfloat16_to_float32(S2p[2]) * a3;
                rows3p[dx] = bfloat16_to_float32(S3p[-1]) * a0 + bfloat16_to_float32(S3p[0]) * a1 + bfloat16_to_float32(S3p[1]) * a2 + bfloat16_to_float32(S3p[2]) * a3;

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 3)
        {
            // hresize three rows
            float* rows0_old = rows0;
            float* rows1_old = rows1;
            float* rows2_old = rows2;
            rows0 = rows3;
            rows1 = rows0_old;
            rows2 = rows1_old;
            rows3 = rows2_old;
            const unsigned short* S1 = src.row<const unsigned short>(sy);
            const unsigned short* S2 = src.row<const unsigned short>(sy + 1);
            const unsigned short* S3 = src.row<const unsigned short>(sy + 2);

            const float* alphap = alpha;
            float* rows1p = rows1;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const unsigned short* S1p = S1 + sx;
                const unsigned short* S2p = S2 + sx;
                const unsigned short* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows1p[dx] = bfloat16_to_float32(S1p[-1]) * a0 + bfloat16_to_float32(S1p[0]) * a1 + bfloat16_to_float32(S1p[1]) * a2 + bfloat16_to_float32(S1p[2]) * a3;
                rows2p[dx] = bfloat16_to_float32(S2p[-1]) * a0 + bfloat16_to_float32(S2p[0]) * a1 + bfloat16_to_float32(S2p[1]) * a2 + bfloat16_to_float32(S2p[2]) * a3;
                rows3p[dx] = bfloat16_to_float32(S3p[-1]) * a0 + bfloat16_to_float32(S3p[0]) * a1 + bfloat16_to_float32(S3p[1]) * a2 + bfloat16_to_float32(S3p[2]) * a3;

                alphap += 4;
            }
        }
        else
        {
            // hresize four rows
            const unsigned short* S0 = src.row<const unsigned short>(sy - 1);
            const unsigned short* S1 = src.row<const unsigned short>(sy);
            const unsigned short* S2 = src.row<const unsigned short>(sy + 1);
            const unsigned short* S3 = src.row<const unsigned short>(sy + 2);

            const float* alphap = alpha;
            float* rows0p = rows0;
            float* rows1p = rows1;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const unsigned short* S0p = S0 + sx;
                const unsigned short* S1p = S1 + sx;
                const unsigned short* S2p = S2 + sx;
                const unsigned short* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows0p[dx] = bfloat16_to_float32(S0p[-1]) * a0 + bfloat16_to_float32(S0p[0]) * a1 + bfloat16_to_float32(S0p[1]) * a2 + bfloat16_to_float32(S0p[2]) * a3;
                rows1p[dx] = bfloat16_to_float32(S1p[-1]) * a0 + bfloat16_to_float32(S1p[0]) * a1 + bfloat16_to_float32(S1p[1]) * a2 + bfloat16_to_float32(S1p[2]) * a3;
                rows2p[dx] = bfloat16_to_float32(S2p[-1]) * a0 + bfloat16_to_float32(S2p[0]) * a1 + bfloat16_to_float32(S2p[1]) * a2 + bfloat16_to_float32(S2p[2]) * a3;
                rows3p[dx] = bfloat16_to_float32(S3p[-1]) * a0 + bfloat16_to_float32(S3p[0]) * a1 + bfloat16_to_float32(S3p[1]) * a2 + bfloat16_to_float32(S3p[2]) * a3;

                alphap += 4;
            }
        }

        prev_sy1 = sy;

        // vresize
        float b0 = beta[0];
        float b1 = beta[1];
        float b2 = beta[2];
        float b3 = beta[3];

        float* rows0p = rows0;
        float* rows1p = rows1;
        float* rows2p = rows2;
        float* rows3p = rows3;
        unsigned short* Dp = dst.row<unsigned short>(dy);
        for (int dx = 0; dx < w; dx++)
        {
            //             D[x] = rows0[x]*b0 + rows1[x]*b1 + rows2[x]*b2 + rows3[x]*b3;
            *Dp++ = float32_to_bfloat16(*rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3);
        }

        beta += 4;
    }
}


================================================
FILE: src/layer/arm/interp_bicubic_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static inline void interpolate_cubic_fp16sa(float fx, __fp16* coeffs)
{
    const float A = -0.75f;

    float fx0 = fx + 1;
    float fx1 = fx;
    float fx2 = 1 - fx;
    // float fx3 = 2 - fx;

    coeffs[0] = (__fp16)(A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A);
    coeffs[1] = (__fp16)((A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1);
    coeffs[2] = (__fp16)((A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1);
    coeffs[3] = (__fp16)((__fp16)1.f - coeffs[0] - coeffs[1] - coeffs[2]);
}

static void cubic_coeffs_fp16sa(int w, int outw, int* xofs, __fp16* alpha, int align_corner)
{
    double scale = (double)w / outw;
    if (align_corner)
    {
        scale = (double)(w - 1) / (outw - 1);
    }

    for (int dx = 0; dx < outw; dx++)
    {
        float fx = (float)((dx + 0.5) * scale - 0.5);
        if (align_corner)
        {
            fx = static_cast<float>(dx * scale);
        }

        int sx = static_cast<int>(floor(fx));
        fx -= sx;

        interpolate_cubic_fp16sa(fx, alpha + dx * 4);

        if (sx <= -1)
        {
            sx = 1;
            alpha[dx * 4 + 0] = (__fp16)((__fp16)1.f - alpha[dx * 4 + 3]);
            alpha[dx * 4 + 1] = (__fp16)alpha[dx * 4 + 3];
            alpha[dx * 4 + 2] = (__fp16)0.f;
            alpha[dx * 4 + 3] = (__fp16)0.f;
        }
        if (sx == 0)
        {
            sx = 1;
            alpha[dx * 4 + 0] = (__fp16)(alpha[dx * 4 + 0] + alpha[dx * 4 + 1]);
            alpha[dx * 4 + 1] = (__fp16)alpha[dx * 4 + 2];
            alpha[dx * 4 + 2] = (__fp16)alpha[dx * 4 + 3];
            alpha[dx * 4 + 3] = (__fp16)0.f;
        }
        if (sx == w - 2)
        {
            sx = w - 3;
            alpha[dx * 4 + 3] = (__fp16)(alpha[dx * 4 + 2] + alpha[dx * 4 + 3]);
            alpha[dx * 4 + 2] = (__fp16)alpha[dx * 4 + 1];
            alpha[dx * 4 + 1] = (__fp16)alpha[dx * 4 + 0];
            alpha[dx * 4 + 0] = (__fp16)0.f;
        }
        if (sx >= w - 1)
        {
            sx = w - 3;
            alpha[dx * 4 + 3] = (__fp16)((__fp16)1.f - alpha[dx * 4 + 0]);
            alpha[dx * 4 + 2] = (__fp16)(alpha[dx * 4 + 0]);
            alpha[dx * 4 + 1] = (__fp16)0.f;
            alpha[dx * 4 + 0] = (__fp16)0.f;
        }

        xofs[dx] = sx;
    }
}

static void resize_bicubic_image_fp16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w);
    Mat rowsbuf1(w);
    Mat rowsbuf2(w);
    Mat rowsbuf3(w);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;
    float* rows2 = rowsbuf2;
    float* rows3 = rowsbuf3;

    int prev_sy1 = -3;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            float* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows2;
            rows2 = rows3;
            rows3 = rows0_old;
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const float* alphap = alpha;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const __fp16* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows3p[dx] = (float)S3p[-1] * a0 + (float)S3p[0] * a1 + (float)S3p[1] * a2 + (float)S3p[2] * a3;

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 2)
        {
            // hresize two rows
            float* rows0_old = rows0;
            float* rows1_old = rows1;
            rows0 = rows2;
            rows1 = rows3;
            rows2 = rows0_old;
            rows3 = rows1_old;
            const __fp16* S2 = src.row<const __fp16>(sy + 1);
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const float* alphap = alpha;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const __fp16* S2p = S2 + sx;
                const __fp16* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows2p[dx] = (float)S2p[-1] * a0 + (float)S2p[0] * a1 + (float)S2p[1] * a2 + (float)S2p[2] * a3;
                rows3p[dx] = (float)S3p[-1] * a0 + (float)S3p[0] * a1 + (float)S3p[1] * a2 + (float)S3p[2] * a3;

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 3)
        {
            // hresize three rows
            float* rows0_old = rows0;
            float* rows1_old = rows1;
            float* rows2_old = rows2;
            rows0 = rows3;
            rows1 = rows0_old;
            rows2 = rows1_old;
            rows3 = rows2_old;
            const __fp16* S1 = src.row<const __fp16>(sy);
            const __fp16* S2 = src.row<const __fp16>(sy + 1);
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const float* alphap = alpha;
            float* rows1p = rows1;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const __fp16* S1p = S1 + sx;
                const __fp16* S2p = S2 + sx;
                const __fp16* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows1p[dx] = (float)S1p[-1] * a0 + (float)S1p[0] * a1 + (float)S1p[1] * a2 + (float)S1p[2] * a3;
                rows2p[dx] = (float)S2p[-1] * a0 + (float)S2p[0] * a1 + (float)S2p[1] * a2 + (float)S2p[2] * a3;
                rows3p[dx] = (float)S3p[-1] * a0 + (float)S3p[0] * a1 + (float)S3p[1] * a2 + (float)S3p[2] * a3;

                alphap += 4;
            }
        }
        else
        {
            // hresize four rows
            const __fp16* S0 = src.row<const __fp16>(sy - 1);
            const __fp16* S1 = src.row<const __fp16>(sy);
            const __fp16* S2 = src.row<const __fp16>(sy + 1);
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const float* alphap = alpha;
            float* rows0p = rows0;
            float* rows1p = rows1;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const __fp16* S0p = S0 + sx;
                const __fp16* S1p = S1 + sx;
                const __fp16* S2p = S2 + sx;
                const __fp16* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows0p[dx] = (float)S0p[-1] * a0 + (float)S0p[0] * a1 + (float)S0p[1] * a2 + (float)S0p[2] * a3;
                rows1p[dx] = (float)S1p[-1] * a0 + (float)S1p[0] * a1 + (float)S1p[1] * a2 + (float)S1p[2] * a3;
                rows2p[dx] = (float)S2p[-1] * a0 + (float)S2p[0] * a1 + (float)S2p[1] * a2 + (float)S2p[2] * a3;
                rows3p[dx] = (float)S3p[-1] * a0 + (float)S3p[0] * a1 + (float)S3p[1] * a2 + (float)S3p[2] * a3;

                alphap += 4;
            }
        }

        prev_sy1 = sy;

        // vresize
        float b0 = beta[0];
        float b1 = beta[1];
        float b2 = beta[2];
        float b3 = beta[3];

        float* rows0p = rows0;
        float* rows1p = rows1;
        float* rows2p = rows2;
        float* rows3p = rows3;
        __fp16* Dp = dst.row<__fp16>(dy);
        for (int dx = 0; dx < w; dx++)
        {
            // D[x] = rows0[x]*b0 + rows1[x]*b1 + rows2[x]*b2 + rows3[x]*b3;
            *Dp++ = (__fp16)(*rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3);
        }

        beta += 4;
    }
}

static void resize_bicubic_image_fp16sa(const Mat& src, Mat& dst, __fp16* alpha, int* xofs, __fp16* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w, (size_t)2u);
    Mat rowsbuf1(w, (size_t)2u);
    Mat rowsbuf2(w, (size_t)2u);
    Mat rowsbuf3(w, (size_t)2u);
    __fp16* rows0 = rowsbuf0;
    __fp16* rows1 = rowsbuf1;
    __fp16* rows2 = rowsbuf2;
    __fp16* rows3 = rowsbuf3;

    int prev_sy1 = -3;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            __fp16* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows2;
            rows2 = rows3;
            rows3 = rows0_old;
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const __fp16* alphap = alpha;
            __fp16* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const __fp16* S3p = S3 + sx;

                __fp16 a0 = alphap[0];
                __fp16 a1 = alphap[1];
                __fp16 a2 = alphap[2];
                __fp16 a3 = alphap[3];
                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 2)
        {
            // hresize two rows
            __fp16* rows0_old = rows0;
            __fp16* rows1_old = rows1;
            rows0 = rows2;
            rows1 = rows3;
            rows2 = rows0_old;
            rows3 = rows1_old;
            const __fp16* S2 = src.row<const __fp16>(sy + 1);
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const __fp16* alphap = alpha;
            __fp16* rows2p = rows2;
            __fp16* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const __fp16* S2p = S2 + sx;
                const __fp16* S3p = S3 + sx;

                __fp16 a0 = alphap[0];
                __fp16 a1 = alphap[1];
                __fp16 a2 = alphap[2];
                __fp16 a3 = alphap[3];
                rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3;
                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 3)
        {
            // hresize three rows
            __fp16* rows0_old = rows0;
            __fp16* rows1_old = rows1;
            __fp16* rows2_old = rows2;
            rows0 = rows3;
            rows1 = rows0_old;
            rows2 = rows1_old;
            rows3 = rows2_old;
            const __fp16* S1 = src.row<const __fp16>(sy);
            const __fp16* S2 = src.row<const __fp16>(sy + 1);
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const __fp16* alphap = alpha;
            __fp16* rows1p = rows1;
            __fp16* rows2p = rows2;
            __fp16* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const __fp16* S1p = S1 + sx;
                const __fp16* S2p = S2 + sx;
                const __fp16* S3p = S3 + sx;

                __fp16 a0 = alphap[0];
                __fp16 a1 = alphap[1];
                __fp16 a2 = alphap[2];
                __fp16 a3 = alphap[3];
                rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3;
                rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3;
                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;

                alphap += 4;
            }
        }
        else
        {
            // hresize four rows
            const __fp16* S0 = src.row<const __fp16>(sy - 1);
            const __fp16* S1 = src.row<const __fp16>(sy);
            const __fp16* S2 = src.row<const __fp16>(sy + 1);
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const __fp16* alphap = alpha;
            __fp16* rows0p = rows0;
            __fp16* rows1p = rows1;
            __fp16* rows2p = rows2;
            __fp16* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const __fp16* S0p = S0 + sx;
                const __fp16* S1p = S1 + sx;
                const __fp16* S2p = S2 + sx;
                const __fp16* S3p = S3 + sx;

                __fp16 a0 = alphap[0];
                __fp16 a1 = alphap[1];
                __fp16 a2 = alphap[2];
                __fp16 a3 = alphap[3];
                rows0p[dx] = S0p[-1] * a0 + S0p[0] * a1 + S0p[1] * a2 + S0p[2] * a3;
                rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3;
                rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3;
                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;

                alphap += 4;
            }
        }

        prev_sy1 = sy;

        // vresize
        __fp16 b0 = beta[0];
        __fp16 b1 = beta[1];
        __fp16 b2 = beta[2];
        __fp16 b3 = beta[3];

        __fp16* rows0p = rows0;
        __fp16* rows1p = rows1;
        __fp16* rows2p = rows2;
        __fp16* rows3p = rows3;
        __fp16* Dp = dst.row<__fp16>(dy);
        for (int dx = 0; dx < w; dx++)
        {
            // D[x] = rows0[x]*b0 + rows1[x]*b1 + rows2[x]*b2 + rows3[x]*b3;
            *Dp++ = (*rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3);
        }

        beta += 4;
    }
}


================================================
FILE: src/layer/arm/interp_bicubic_pack4.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Horizontal 4-tap filter for a single pack4 fp32 pixel.
// Sp addresses the pack4 pixel selected by xofs; the taps are the pixel to its
// left, the pixel itself and the two pixels to its right (4 floats each).
// _a01 carries the weights of taps 0/1, _a23 the weights of taps 2/3.
static inline float32x4_t bicubic_hfilter_pack4(const float* Sp, float32x2_t _a01, float32x2_t _a23)
{
    float32x4_t _acc = vmulq_lane_f32(vld1q_f32(Sp - 4), _a01, 0);
    _acc = vmlaq_lane_f32(_acc, vld1q_f32(Sp), _a01, 1);
    _acc = vmlaq_lane_f32(_acc, vld1q_f32(Sp + 4), _a23, 0);
    _acc = vmlaq_lane_f32(_acc, vld1q_f32(Sp + 8), _a23, 1);
    return _acc;
}

// Bicubic resize of one pack4 fp32 plane.
//
// src    source plane; for every output row the source rows yofs[dy]-1 .. yofs[dy]+2
//        are read, and for every output column the pack4 pixels xofs[dx]-1 .. xofs[dx]+2,
//        so the caller has to guarantee that those rows/pixels exist
// dst    destination plane; dst.w x dst.h pack4 pixels are written
// alpha  horizontal weights, 4 floats per output column
// xofs   source column index per output column
// beta   vertical weights, 4 floats per output row
// yofs   source row index per output row
//
// The horizontally filtered versions of the four source rows of the current
// window are cached in rows0..rows3. When yofs advances by 0..3 rows compared
// with the previous output row, the still valid lines are kept (by rotating
// the pointers) and only the missing ones are filtered again.
static void resize_bicubic_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    const int outw = dst.w;
    const int outh = dst.h;

    // cache of horizontally filtered lines, one pack4 fp32 pixel per output column
    Mat rowsbuf0(outw, (size_t)4 * 4u, 4);
    Mat rowsbuf1(outw, (size_t)4 * 4u, 4);
    Mat rowsbuf2(outw, (size_t)4 * 4u, 4);
    Mat rowsbuf3(outw, (size_t)4 * 4u, 4);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;
    float* rows2 = rowsbuf2;
    float* rows3 = rowsbuf3;

    int prev_sy1 = -3;

    for (int dy = 0; dy < outh; dy++)
    {
        const int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // same window as the previous output row, the cache is still valid
        }
        else if (sy == prev_sy1 + 1)
        {
            // window moved by one row: keep three lines, filter the new last one
            float* recycled = rows0;
            rows0 = rows1;
            rows1 = rows2;
            rows2 = rows3;
            rows3 = recycled;

            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* out3 = rows3;
            for (int dx = 0; dx < outw; dx++, alphap += 4, out3 += 4)
            {
                const int sx = xofs[dx] * 4;

                const float32x4_t _a = vld1q_f32(alphap);
                const float32x2_t _a01 = vget_low_f32(_a);
                const float32x2_t _a23 = vget_high_f32(_a);

                vst1q_f32(out3, bicubic_hfilter_pack4(S3 + sx, _a01, _a23));
            }
        }
        else if (sy == prev_sy1 + 2)
        {
            // window moved by two rows: keep two lines, filter the last two
            float* recycled0 = rows0;
            float* recycled1 = rows1;
            rows0 = rows2;
            rows1 = rows3;
            rows2 = recycled0;
            rows3 = recycled1;

            const float* S2 = src.row(sy + 1);
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* out2 = rows2;
            float* out3 = rows3;
            for (int dx = 0; dx < outw; dx++, alphap += 4, out2 += 4, out3 += 4)
            {
                const int sx = xofs[dx] * 4;

                const float32x4_t _a = vld1q_f32(alphap);
                const float32x2_t _a01 = vget_low_f32(_a);
                const float32x2_t _a23 = vget_high_f32(_a);

                vst1q_f32(out2, bicubic_hfilter_pack4(S2 + sx, _a01, _a23));
                vst1q_f32(out3, bicubic_hfilter_pack4(S3 + sx, _a01, _a23));
            }
        }
        else if (sy == prev_sy1 + 3)
        {
            // window moved by three rows: keep one line, filter the last three
            float* kept = rows3;
            rows3 = rows2;
            rows2 = rows1;
            rows1 = rows0;
            rows0 = kept;

            const float* S1 = src.row(sy);
            const float* S2 = src.row(sy + 1);
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* out1 = rows1;
            float* out2 = rows2;
            float* out3 = rows3;
            for (int dx = 0; dx < outw; dx++, alphap += 4, out1 += 4, out2 += 4, out3 += 4)
            {
                const int sx = xofs[dx] * 4;

                const float32x4_t _a = vld1q_f32(alphap);
                const float32x2_t _a01 = vget_low_f32(_a);
                const float32x2_t _a23 = vget_high_f32(_a);

                vst1q_f32(out1, bicubic_hfilter_pack4(S1 + sx, _a01, _a23));
                vst1q_f32(out2, bicubic_hfilter_pack4(S2 + sx, _a01, _a23));
                vst1q_f32(out3, bicubic_hfilter_pack4(S3 + sx, _a01, _a23));
            }
        }
        else
        {
            // nothing reusable: filter all four lines of the window
            const float* S0 = src.row(sy - 1);
            const float* S1 = src.row(sy);
            const float* S2 = src.row(sy + 1);
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* out0 = rows0;
            float* out1 = rows1;
            float* out2 = rows2;
            float* out3 = rows3;
            for (int dx = 0; dx < outw; dx++, alphap += 4, out0 += 4, out1 += 4, out2 += 4, out3 += 4)
            {
                const int sx = xofs[dx] * 4;

                const float32x4_t _a = vld1q_f32(alphap);
                const float32x2_t _a01 = vget_low_f32(_a);
                const float32x2_t _a23 = vget_high_f32(_a);

                vst1q_f32(out0, bicubic_hfilter_pack4(S0 + sx, _a01, _a23));
                vst1q_f32(out1, bicubic_hfilter_pack4(S1 + sx, _a01, _a23));
                vst1q_f32(out2, bicubic_hfilter_pack4(S2 + sx, _a01, _a23));
                vst1q_f32(out3, bicubic_hfilter_pack4(S3 + sx, _a01, _a23));
            }
        }

        prev_sy1 = sy;

        // vertical pass: blend the four cached lines with this row's beta weights
        const float32x4_t _b = vld1q_f32(beta);
        const float32x2_t _b01 = vget_low_f32(_b);
        const float32x2_t _b23 = vget_high_f32(_b);

        float* Dp = dst.row(dy);

        for (int dx = 0; dx < outw; dx++)
        {
            const int ofs = dx * 4;

            float32x4_t _d = vmulq_lane_f32(vld1q_f32(rows0 + ofs), _b01, 0);
            _d = vmlaq_lane_f32(_d, vld1q_f32(rows1 + ofs), _b01, 1);
            _d = vmlaq_lane_f32(_d, vld1q_f32(rows2 + ofs), _b23, 0);
            _d = vmlaq_lane_f32(_d, vld1q_f32(rows3 + ofs), _b23, 1);
            vst1q_f32(Dp + ofs, _d);
        }

        beta += 4;
    }
}


================================================
FILE: src/layer/arm/interp_bicubic_pack4_bf16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Horizontal 4-tap filter for a single pack4 pixel stored as bf16.
// Sp addresses the pack4 pixel selected by xofs; the taps are the pixel to its
// left, the pixel itself and the two pixels to its right (4 x 16 bit each).
// Every tap is widened with bfloat2float before it is weighted; _a01 carries
// the weights of taps 0/1, _a23 the weights of taps 2/3.
static inline float32x4_t bicubic_hfilter_pack4_bf16s(const unsigned short* Sp, float32x2_t _a01, float32x2_t _a23)
{
    float32x4_t _acc = vmulq_lane_f32(bfloat2float(vld1_u16(Sp - 4)), _a01, 0);
    _acc = vmlaq_lane_f32(_acc, bfloat2float(vld1_u16(Sp)), _a01, 1);
    _acc = vmlaq_lane_f32(_acc, bfloat2float(vld1_u16(Sp + 4)), _a23, 0);
    _acc = vmlaq_lane_f32(_acc, bfloat2float(vld1_u16(Sp + 8)), _a23, 1);
    return _acc;
}

// Bicubic resize of one pack4 plane with bf16 storage and fp32 arithmetic.
//
// src    source plane, accessed as unsigned short; for every output row the
//        source rows yofs[dy]-1 .. yofs[dy]+2 are read, and for every output
//        column the pack4 pixels xofs[dx]-1 .. xofs[dx]+2, so the caller has
//        to guarantee that those rows/pixels exist
// dst    destination plane; dst.w x dst.h pack4 pixels are written through
//        float2bfloat
// alpha  horizontal weights, 4 floats per output column
// xofs   source column index per output column
// beta   vertical weights, 4 floats per output row
// yofs   source row index per output row
//
// The horizontally filtered lines are cached as fp32 in rows0..rows3. When
// yofs advances by 0..3 rows compared with the previous output row, the still
// valid lines are kept (by rotating the pointers) and only the missing ones
// are filtered again.
static void resize_bicubic_image_pack4_bf16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    const int outw = dst.w;
    const int outh = dst.h;

    // cache of horizontally filtered lines, one pack4 fp32 pixel per output column
    Mat rowsbuf0(outw, (size_t)4 * 4u, 4);
    Mat rowsbuf1(outw, (size_t)4 * 4u, 4);
    Mat rowsbuf2(outw, (size_t)4 * 4u, 4);
    Mat rowsbuf3(outw, (size_t)4 * 4u, 4);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;
    float* rows2 = rowsbuf2;
    float* rows3 = rowsbuf3;

    int prev_sy1 = -3;

    for (int dy = 0; dy < outh; dy++)
    {
        const int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // same window as the previous output row, the cache is still valid
        }
        else if (sy == prev_sy1 + 1)
        {
            // window moved by one row: keep three lines, filter the new last one
            float* recycled = rows0;
            rows0 = rows1;
            rows1 = rows2;
            rows2 = rows3;
            rows3 = recycled;

            const unsigned short* S3 = src.row<const unsigned short>(sy + 2);

            const float* alphap = alpha;
            float* out3 = rows3;
            for (int dx = 0; dx < outw; dx++, alphap += 4, out3 += 4)
            {
                const int sx = xofs[dx] * 4;

                const float32x4_t _a = vld1q_f32(alphap);
                const float32x2_t _a01 = vget_low_f32(_a);
                const float32x2_t _a23 = vget_high_f32(_a);

                vst1q_f32(out3, bicubic_hfilter_pack4_bf16s(S3 + sx, _a01, _a23));
            }
        }
        else if (sy == prev_sy1 + 2)
        {
            // window moved by two rows: keep two lines, filter the last two
            float* recycled0 = rows0;
            float* recycled1 = rows1;
            rows0 = rows2;
            rows1 = rows3;
            rows2 = recycled0;
            rows3 = recycled1;

            const unsigned short* S2 = src.row<const unsigned short>(sy + 1);
            const unsigned short* S3 = src.row<const unsigned short>(sy + 2);

            const float* alphap = alpha;
            float* out2 = rows2;
            float* out3 = rows3;
            for (int dx = 0; dx < outw; dx++, alphap += 4, out2 += 4, out3 += 4)
            {
                const int sx = xofs[dx] * 4;

                const float32x4_t _a = vld1q_f32(alphap);
                const float32x2_t _a01 = vget_low_f32(_a);
                const float32x2_t _a23 = vget_high_f32(_a);

                vst1q_f32(out2, bicubic_hfilter_pack4_bf16s(S2 + sx, _a01, _a23));
                vst1q_f32(out3, bicubic_hfilter_pack4_bf16s(S3 + sx, _a01, _a23));
            }
        }
        else if (sy == prev_sy1 + 3)
        {
            // window moved by three rows: keep one line, filter the last three
            float* kept = rows3;
            rows3 = rows2;
            rows2 = rows1;
            rows1 = rows0;
            rows0 = kept;

            const unsigned short* S1 = src.row<const unsigned short>(sy);
            const unsigned short* S2 = src.row<const unsigned short>(sy + 1);
            const unsigned short* S3 = src.row<const unsigned short>(sy + 2);

            const float* alphap = alpha;
            float* out1 = rows1;
            float* out2 = rows2;
            float* out3 = rows3;
            for (int dx = 0; dx < outw; dx++, alphap += 4, out1 += 4, out2 += 4, out3 += 4)
            {
                const int sx = xofs[dx] * 4;

                const float32x4_t _a = vld1q_f32(alphap);
                const float32x2_t _a01 = vget_low_f32(_a);
                const float32x2_t _a23 = vget_high_f32(_a);

                vst1q_f32(out1, bicubic_hfilter_pack4_bf16s(S1 + sx, _a01, _a23));
                vst1q_f32(out2, bicubic_hfilter_pack4_bf16s(S2 + sx, _a01, _a23));
                vst1q_f32(out3, bicubic_hfilter_pack4_bf16s(S3 + sx, _a01, _a23));
            }
        }
        else
        {
            // nothing reusable: filter all four lines of the window
            const unsigned short* S0 = src.row<const unsigned short>(sy - 1);
            const unsigned short* S1 = src.row<const unsigned short>(sy);
            const unsigned short* S2 = src.row<const unsigned short>(sy + 1);
            const unsigned short* S3 = src.row<const unsigned short>(sy + 2);

            const float* alphap = alpha;
            float* out0 = rows0;
            float* out1 = rows1;
            float* out2 = rows2;
            float* out3 = rows3;
            for (int dx = 0; dx < outw; dx++, alphap += 4, out0 += 4, out1 += 4, out2 += 4, out3 += 4)
            {
                const int sx = xofs[dx] * 4;

                const float32x4_t _a = vld1q_f32(alphap);
                const float32x2_t _a01 = vget_low_f32(_a);
                const float32x2_t _a23 = vget_high_f32(_a);

                vst1q_f32(out0, bicubic_hfilter_pack4_bf16s(S0 + sx, _a01, _a23));
                vst1q_f32(out1, bicubic_hfilter_pack4_bf16s(S1 + sx, _a01, _a23));
                vst1q_f32(out2, bicubic_hfilter_pack4_bf16s(S2 + sx, _a01, _a23));
                vst1q_f32(out3, bicubic_hfilter_pack4_bf16s(S3 + sx, _a01, _a23));
            }
        }

        prev_sy1 = sy;

        // vertical pass: blend the four cached fp32 lines with this row's beta
        // weights and narrow the result to bf16 on store
        const float32x4_t _b = vld1q_f32(beta);
        const float32x2_t _b01 = vget_low_f32(_b);
        const float32x2_t _b23 = vget_high_f32(_b);

        unsigned short* Dp = dst.row<unsigned short>(dy);

        for (int dx = 0; dx < outw; dx++)
        {
            const int ofs = dx * 4;

            float32x4_t _d = vmulq_lane_f32(vld1q_f32(rows0 + ofs), _b01, 0);
            _d = vmlaq_lane_f32(_d, vld1q_f32(rows1 + ofs), _b01, 1);
            _d = vmlaq_lane_f32(_d, vld1q_f32(rows2 + ofs), _b23, 0);
            _d = vmlaq_lane_f32(_d, vld1q_f32(rows3 + ofs), _b23, 1);
            vst1_u16(Dp + ofs, float2bfloat(_d));
        }

        beta += 4;
    }
}


================================================
FILE: src/layer/arm/interp_bicubic_pack4_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Horizontal 4-tap filter for a single pack4 pixel stored as fp16.
// Sp addresses the pack4 pixel selected by xofs; the taps are the pixel to its
// left, the pixel itself and the two pixels to its right (4 halves each).
// Every tap is widened to fp32 before it is weighted; lane k of _a0123 is the
// weight of tap k.
static inline float32x4_t bicubic_hfilter_pack4_fp16s(const __fp16* Sp, float32x4_t _a0123)
{
    float32x4_t _acc = vmulq_laneq_f32(vcvt_f32_f16(vld1_f16(Sp - 4)), _a0123, 0);
    _acc = vfmaq_laneq_f32(_acc, vcvt_f32_f16(vld1_f16(Sp)), _a0123, 1);
    _acc = vfmaq_laneq_f32(_acc, vcvt_f32_f16(vld1_f16(Sp + 4)), _a0123, 2);
    _acc = vfmaq_laneq_f32(_acc, vcvt_f32_f16(vld1_f16(Sp + 8)), _a0123, 3);
    return _acc;
}

// Bicubic resize of one pack4 plane with fp16 storage and fp32 arithmetic.
//
// src    source plane, accessed as __fp16; for every output row the source
//        rows yofs[dy]-1 .. yofs[dy]+2 are read, and for every output column
//        the pack4 pixels xofs[dx]-1 .. xofs[dx]+2, so the caller has to
//        guarantee that those rows/pixels exist
// dst    destination plane; dst.w x dst.h pack4 pixels are written as __fp16
// alpha  horizontal weights, 4 floats per output column
// xofs   source column index per output column
// beta   vertical weights, 4 floats per output row
// yofs   source row index per output row
//
// The horizontally filtered lines are cached as fp32 in rows0..rows3. When
// yofs advances by 0..3 rows compared with the previous output row, the still
// valid lines are kept (by rotating the pointers) and only the missing ones
// are filtered again.
static void resize_bicubic_image_pack4_fp16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    const int outw = dst.w;
    const int outh = dst.h;

    // cache of horizontally filtered lines, one pack4 fp32 pixel per output column
    Mat rowsbuf0(outw, (size_t)4 * 4u, 4);
    Mat rowsbuf1(outw, (size_t)4 * 4u, 4);
    Mat rowsbuf2(outw, (size_t)4 * 4u, 4);
    Mat rowsbuf3(outw, (size_t)4 * 4u, 4);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;
    float* rows2 = rowsbuf2;
    float* rows3 = rowsbuf3;

    int prev_sy1 = -3;

    for (int dy = 0; dy < outh; dy++)
    {
        const int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // same window as the previous output row, the cache is still valid
        }
        else if (sy == prev_sy1 + 1)
        {
            // window moved by one row: keep three lines, filter the new last one
            float* recycled = rows0;
            rows0 = rows1;
            rows1 = rows2;
            rows2 = rows3;
            rows3 = recycled;

            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const float* alphap = alpha;
            float* out3 = rows3;
            for (int dx = 0; dx < outw; dx++, alphap += 4, out3 += 4)
            {
                const int sx = xofs[dx] * 4;
                const float32x4_t _a = vld1q_f32(alphap);

                vst1q_f32(out3, bicubic_hfilter_pack4_fp16s(S3 + sx, _a));
            }
        }
        else if (sy == prev_sy1 + 2)
        {
            // window moved by two rows: keep two lines, filter the last two
            float* recycled0 = rows0;
            float* recycled1 = rows1;
            rows0 = rows2;
            rows1 = rows3;
            rows2 = recycled0;
            rows3 = recycled1;

            const __fp16* S2 = src.row<const __fp16>(sy + 1);
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const float* alphap = alpha;
            float* out2 = rows2;
            float* out3 = rows3;
            for (int dx = 0; dx < outw; dx++, alphap += 4, out2 += 4, out3 += 4)
            {
                const int sx = xofs[dx] * 4;
                const float32x4_t _a = vld1q_f32(alphap);

                vst1q_f32(out2, bicubic_hfilter_pack4_fp16s(S2 + sx, _a));
                vst1q_f32(out3, bicubic_hfilter_pack4_fp16s(S3 + sx, _a));
            }
        }
        else if (sy == prev_sy1 + 3)
        {
            // window moved by three rows: keep one line, filter the last three
            float* kept = rows3;
            rows3 = rows2;
            rows2 = rows1;
            rows1 = rows0;
            rows0 = kept;

            const __fp16* S1 = src.row<const __fp16>(sy);
            const __fp16* S2 = src.row<const __fp16>(sy + 1);
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const float* alphap = alpha;
            float* out1 = rows1;
            float* out2 = rows2;
            float* out3 = rows3;
            for (int dx = 0; dx < outw; dx++, alphap += 4, out1 += 4, out2 += 4, out3 += 4)
            {
                const int sx = xofs[dx] * 4;
                const float32x4_t _a = vld1q_f32(alphap);

                vst1q_f32(out1, bicubic_hfilter_pack4_fp16s(S1 + sx, _a));
                vst1q_f32(out2, bicubic_hfilter_pack4_fp16s(S2 + sx, _a));
                vst1q_f32(out3, bicubic_hfilter_pack4_fp16s(S3 + sx, _a));
            }
        }
        else
        {
            // nothing reusable: filter all four lines of the window
            const __fp16* S0 = src.row<const __fp16>(sy - 1);
            const __fp16* S1 = src.row<const __fp16>(sy);
            const __fp16* S2 = src.row<const __fp16>(sy + 1);
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const float* alphap = alpha;
            float* out0 = rows0;
            float* out1 = rows1;
            float* out2 = rows2;
            float* out3 = rows3;
            for (int dx = 0; dx < outw; dx++, alphap += 4, out0 += 4, out1 += 4, out2 += 4, out3 += 4)
            {
                const int sx = xofs[dx] * 4;
                const float32x4_t _a = vld1q_f32(alphap);

                vst1q_f32(out0, bicubic_hfilter_pack4_fp16s(S0 + sx, _a));
                vst1q_f32(out1, bicubic_hfilter_pack4_fp16s(S1 + sx, _a));
                vst1q_f32(out2, bicubic_hfilter_pack4_fp16s(S2 + sx, _a));
                vst1q_f32(out3, bicubic_hfilter_pack4_fp16s(S3 + sx, _a));
            }
        }

        prev_sy1 = sy;

        // vertical pass: blend the four cached fp32 lines with this row's beta
        // weights and narrow the result to fp16 on store
        const float32x4_t _b = vld1q_f32(beta);

        __fp16* Dp = dst.row<__fp16>(dy);

        for (int dx = 0; dx < outw; dx++)
        {
            const int ofs = dx * 4;

            float32x4_t _d = vmulq_laneq_f32(vld1q_f32(rows0 + ofs), _b, 0);
            _d = vfmaq_laneq_f32(_d, vld1q_f32(rows1 + ofs), _b, 1);
            _d = vfmaq_laneq_f32(_d, vld1q_f32(rows2 + ofs), _b, 2);
            _d = vfmaq_laneq_f32(_d, vld1q_f32(rows3 + ofs), _b, 3);
            vst1_f16(Dp + ofs, vcvt_f16_f32(_d));
        }

        beta += 4;
    }
}

static void resize_bicubic_image_pack4_fp16sa(const Mat& src, Mat& dst, __fp16* alpha, int* xofs, __fp16* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w, (size_t)4 * 2u, 4);
    Mat rowsbuf1(w, (size_t)4 * 2u, 4);
    Mat rowsbuf2(w, (size_t)4 * 2u, 4);
    Mat rowsbuf3(w, (size_t)4 * 2u, 4);
    __fp16* rows0 = rowsbuf0;
    __fp16* rows1 = rowsbuf1;
    __fp16* rows2 = rowsbuf2;
    __fp16* rows3 = rowsbuf3;

    int prev_sy1 = -3;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            __fp16* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows2;
            rows2 = rows3;
            rows3 = rows0_old;
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const __fp16* alphap = alpha;
            __fp16* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const __fp16* S3p = S3 + sx;

                float16x4_t _a0123 = vld1_f16(alphap);

                float16x4_t _S30 = vld1_f16(S3p - 4);
                float16x4_t _S31 = vld1_f16(S3p + 0);
                float16x4_t _S32 = vld1_f16(S3p + 4);
                float16x4_t _S33 = vld1_f16(S3p + 8);
                float16x4_t _rows3 = vmul_lane_f16(_S30, _a0123, 0);
                _rows3 = vfma_lane_f16(_rows3, _S31, _a0123, 1);
                _rows3 = vfma_lane_f16(_rows3, _S32, _a0123, 2);
                _rows3 = vfma_lane_f16(_rows3, _S33, _a0123, 3);
                vst1_f16(rows3p + dx * 4, _rows3);

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 2)
        {
            // hresize two rows
            __fp16* rows0_old = rows0;
            __fp16* rows1_old = rows1;
            rows0 = rows2;
            rows1 = rows3;
            rows2 = rows0_old;
            rows3 = rows1_old;
            const __fp16* S2 = src.row<const __fp16>(sy + 1);
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const __fp16* alphap = alpha;
            __fp16* rows2p = rows2;
            __fp16* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const __fp16* S2p = S2 + sx;
                const __fp16* S3p = S3 + sx;

                float16x4_t _a0123 = vld1_f16(alphap);

                float16x4_t _S20 = vld1_f16(S2p - 4);
                float16x4_t _S21 = vld1_f16(S2p + 0);
                float16x4_t _S22 = vld1_f16(S2p + 4);
                float16x4_t _S23 = vld1_f16(S2p + 8);
                float16x4_t _S30 = vld1_f16(S3p - 4);
                float16x4_t _S31 = vld1_f16(S3p + 0);
                float16x4_t _S32 = vld1_f16(S3p + 4);
                float16x4_t _S33 = vld1_f16(S3p + 8);
                float16x4_t _rows2 = vmul_lane_f16(_S20, _a0123, 0);
                float16x4_t _rows3 = vmul_lane_f16(_S30, _a0123, 0);
                _rows2 = vfma_lane_f16(_rows2, _S21, _a0123, 1);
                _rows3 = vfma_lane_f16(_rows3, _S31, _a0123, 1);
                _rows2 = vfma_lane_f16(_rows2, _S22, _a0123, 2);
                _rows3 = vfma_lane_f16(_rows3, _S32, _a0123, 2);
                _rows2 = vfma_lane_f16(_rows2, _S23, _a0123, 3);
                _rows3 = vfma_lane_f16(_rows3, _S33, _a0123, 3);
                vst1_f16(rows2p + dx * 4, _rows2);
                vst1_f16(rows3p + dx * 4, _rows3);

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 3)
        {
            // hresize three rows
            __fp16* rows0_old = rows0;
            __fp16* rows1_old = rows1;
            __fp16* rows2_old = rows2;
            rows0 = rows3;
            rows1 = rows0_old;
            rows2 = rows1_old;
            rows3 = rows2_old;
            const __fp16* S1 = src.row<const __fp16>(sy);
            const __fp16* S2 = src.row<const __fp16>(sy + 1);
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const __fp16* alphap = alpha;
            __fp16* rows1p = rows1;
            __fp16* rows2p = rows2;
            __fp16* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const __fp16* S1p = S1 + sx;
                const __fp16* S2p = S2 + sx;
                const __fp16* S3p = S3 + sx;

                float16x4_t _a0123 = vld1_f16(alphap);

                float16x4_t _S10 = vld1_f16(S1p - 4);
                float16x4_t _S11 = vld1_f16(S1p + 0);
                float16x4_t _S12 = vld1_f16(S1p + 4);
                float16x4_t _S13 = vld1_f16(S1p + 8);
                float16x4_t _S20 = vld1_f16(S2p - 4);
                float16x4_t _S21 = vld1_f16(S2p + 0);
                float16x4_t _S22 = vld1_f16(S2p + 4);
                float16x4_t _S23 = vld1_f16(S2p + 8);
                float16x4_t _S30 = vld1_f16(S3p - 4);
                float16x4_t _S31 = vld1_f16(S3p + 0);
                float16x4_t _S32 = vld1_f16(S3p + 4);
                float16x4_t _S33 = vld1_f16(S3p + 8);
                float16x4_t _rows1 = vmul_lane_f16(_S10, _a0123, 0);
                float16x4_t _rows2 = vmul_lane_f16(_S20, _a0123, 0);
                float16x4_t _rows3 = vmul_lane_f16(_S30, _a0123, 0);
                _rows1 = vfma_lane_f16(_rows1, _S11, _a0123, 1);
                _rows2 = vfma_lane_f16(_rows2, _S21, _a0123, 1);
                _rows3 = vfma_lane_f16(_rows3, _S31, _a0123, 1);
                _rows1 = vfma_lane_f16(_rows1, _S12, _a0123, 2);
                _rows2 = vfma_lane_f16(_rows2, _S22, _a0123, 2);
                _rows3 = vfma_lane_f16(_rows3, _S32, _a0123, 2);
                _rows1 = vfma_lane_f16(_rows1, _S13, _a0123, 3);
                _rows2 = vfma_lane_f16(_rows2, _S23, _a0123, 3);
                _rows3 = vfma_lane_f16(_rows3, _S33, _a0123, 3);
                vst1_f16(rows1p + dx * 4, _rows1);
                vst1_f16(rows2p + dx * 4, _rows2);
                vst1_f16(rows3p + dx * 4, _rows3);

                alphap += 4;
            }
        }
        else
        {
            // hresize four rows
            const __fp16* S0 = src.row<const __fp16>(sy - 1);
            const __fp16* S1 = src.row<const __fp16>(sy);
            const __fp16* S2 = src.row<const __fp16>(sy + 1);
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const __fp16* alphap = alpha;
            __fp16* rows0p = rows0;
            __fp16* rows1p = rows1;
            __fp16* rows2p = rows2;
            __fp16* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const __fp16* S0p = S0 + sx;
                const __fp16* S1p = S1 + sx;
                const __fp16* S2p = S2 + sx;
                const __fp16* S3p = S3 + sx;

                float16x4_t _a0123 = vld1_f16(alphap);

                float16x4_t _S00 = vld1_f16(S0p - 4);
                float16x4_t _S01 = vld1_f16(S0p + 0);
                float16x4_t _S02 = vld1_f16(S0p + 4);
                float16x4_t _S03 = vld1_f16(S0p + 8);
                float16x4_t _S10 = vld1_f16(S1p - 4);
                float16x4_t _S11 = vld1_f16(S1p + 0);
                float16x4_t _S12 = vld1_f16(S1p + 4);
                float16x4_t _S13 = vld1_f16(S1p + 8);
                float16x4_t _S20 = vld1_f16(S2p - 4);
                float16x4_t _S21 = vld1_f16(S2p + 0);
                float16x4_t _S22 = vld1_f16(S2p + 4);
                float16x4_t _S23 = vld1_f16(S2p + 8);
                float16x4_t _S30 = vld1_f16(S3p - 4);
                float16x4_t _S31 = vld1_f16(S3p + 0);
                float16x4_t _S32 = vld1_f16(S3p + 4);
                float16x4_t _S33 = vld1_f16(S3p + 8);
                float16x4_t _rows0 = vmul_lane_f16(_S00, _a0123, 0);
                float16x4_t _rows1 = vmul_lane_f16(_S10, _a0123, 0);
                float16x4_t _rows2 = vmul_lane_f16(_S20, _a0123, 0);
                float16x4_t _rows3 = vmul_lane_f16(_S30, _a0123, 0);
                _rows0 = vfma_lane_f16(_rows0, _S01, _a0123, 1);
                _rows1 = vfma_lane_f16(_rows1, _S11, _a0123, 1);
                _rows2 = vfma_lane_f16(_rows2, _S21, _a0123, 1);
                _rows3 = vfma_lane_f16(_rows3, _S31, _a0123, 1);
                _rows0 = vfma_lane_f16(_rows0, _S02, _a0123, 2);
                _rows1 = vfma_lane_f16(_rows1, _S12, _a0123, 2);
                _rows2 = vfma_lane_f16(_rows2, _S22, _a0123, 2);
                _rows3 = vfma_lane_f16(_rows3, _S32, _a0123, 2);
                _rows0 = vfma_lane_f16(_rows0, _S03, _a0123, 3);
                _rows1 = vfma_lane_f16(_rows1, _S13, _a0123, 3);
                _rows2 = vfma_lane_f16(_rows2, _S23, _a0123, 3);
                _rows3 = vfma_lane_f16(_rows3, _S33, _a0123, 3);
                vst1_f16(rows0p + dx * 4, _rows0);
                vst1_f16(rows1p + dx * 4, _rows1);
                vst1_f16(rows2p + dx * 4, _rows2);
                vst1_f16(rows3p + dx * 4, _rows3);

                alphap += 4;
            }
        }

        prev_sy1 = sy;

        // vresize
        float16x4_t _b0123 = vld1_f16(beta);

        __fp16* rows0p = rows0;
        __fp16* rows1p = rows1;
        __fp16* rows2p = rows2;
        __fp16* rows3p = rows3;
        __fp16* Dp = dst.row<__fp16>(dy);

        for (int dx = 0; dx < w; dx++)
        {
            float16x4_t _rows0 = vld1_f16(rows0p);
            float16x4_t _rows1 = vld1_f16(rows1p);
            float16x4_t _rows2 = vld1_f16(rows2p);
            float16x4_t _rows3 = vld1_f16(rows3p);
            float16x4_t _Dp = vmul_lane_f16(_rows0, _b0123, 0);
            _Dp = vfma_lane_f16(_Dp, _rows1, _b0123, 1);
            _Dp = vfma_lane_f16(_Dp, _rows2, _b0123, 2);
            _Dp = vfma_lane_f16(_Dp, _rows3, _b0123, 3);
            vst1_f16(Dp, _Dp);

            Dp += 4;
            rows0p += 4;
            rows1p += 4;
            rows2p += 4;
            rows3p += 4;
        }

        beta += 4;
    }
}


================================================
FILE: src/layer/arm/interp_bicubic_pack8_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void resize_bicubic_image_pack8_fp16sa(const Mat& src, Mat& dst, __fp16* alpha, int* xofs, __fp16* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w, (size_t)8 * 2u, 8);
    Mat rowsbuf1(w, (size_t)8 * 2u, 8);
    Mat rowsbuf2(w, (size_t)8 * 2u, 8);
    Mat rowsbuf3(w, (size_t)8 * 2u, 8);
    __fp16* rows0 = rowsbuf0;
    __fp16* rows1 = rowsbuf1;
    __fp16* rows2 = rowsbuf2;
    __fp16* rows3 = rowsbuf3;

    int prev_sy1 = -3;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            __fp16* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows2;
            rows2 = rows3;
            rows3 = rows0_old;
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const __fp16* alphap = alpha;
            __fp16* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx] * 8;
                const __fp16* S3p = S3 + sx;

                float16x4_t _a0123 = vld1_f16(alphap);

                float16x8_t _S30 = vld1q_f16(S3p - 8);
                float16x8_t _S31 = vld1q_f16(S3p + 0);
                float16x8_t _S32 = vld1q_f16(S3p + 8);
                float16x8_t _S33 = vld1q_f16(S3p + 16);
                float16x8_t _rows3 = vmulq_lane_f16(_S30, _a0123, 0);
                _rows3 = vfmaq_lane_f16(_rows3, _S31, _a0123, 1);
                _rows3 = vfmaq_lane_f16(_rows3, _S32, _a0123, 2);
                _rows3 = vfmaq_lane_f16(_rows3, _S33, _a0123, 3);
                vst1q_f16(rows3p + dx * 8, _rows3);

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 2)
        {
            // hresize two rows
            __fp16* rows0_old = rows0;
            __fp16* rows1_old = rows1;
            rows0 = rows2;
            rows1 = rows3;
            rows2 = rows0_old;
            rows3 = rows1_old;
            const __fp16* S2 = src.row<const __fp16>(sy + 1);
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const __fp16* alphap = alpha;
            __fp16* rows2p = rows2;
            __fp16* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx] * 8;
                const __fp16* S2p = S2 + sx;
                const __fp16* S3p = S3 + sx;

                float16x4_t _a0123 = vld1_f16(alphap);

                float16x8_t _S20 = vld1q_f16(S2p - 8);
                float16x8_t _S21 = vld1q_f16(S2p + 0);
                float16x8_t _S22 = vld1q_f16(S2p + 8);
                float16x8_t _S23 = vld1q_f16(S2p + 16);
                float16x8_t _S30 = vld1q_f16(S3p - 8);
                float16x8_t _S31 = vld1q_f16(S3p + 0);
                float16x8_t _S32 = vld1q_f16(S3p + 8);
                float16x8_t _S33 = vld1q_f16(S3p + 16);
                float16x8_t _rows2 = vmulq_lane_f16(_S20, _a0123, 0);
                float16x8_t _rows3 = vmulq_lane_f16(_S30, _a0123, 0);
                _rows2 = vfmaq_lane_f16(_rows2, _S21, _a0123, 1);
                _rows3 = vfmaq_lane_f16(_rows3, _S31, _a0123, 1);
                _rows2 = vfmaq_lane_f16(_rows2, _S22, _a0123, 2);
                _rows3 = vfmaq_lane_f16(_rows3, _S32, _a0123, 2);
                _rows2 = vfmaq_lane_f16(_rows2, _S23, _a0123, 3);
                _rows3 = vfmaq_lane_f16(_rows3, _S33, _a0123, 3);
                vst1q_f16(rows2p + dx * 8, _rows2);
                vst1q_f16(rows3p + dx * 8, _rows3);

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 3)
        {
            // hresize three rows
            __fp16* rows0_old = rows0;
            __fp16* rows1_old = rows1;
            __fp16* rows2_old = rows2;
            rows0 = rows3;
            rows1 = rows0_old;
            rows2 = rows1_old;
            rows3 = rows2_old;
            const __fp16* S1 = src.row<const __fp16>(sy);
            const __fp16* S2 = src.row<const __fp16>(sy + 1);
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const __fp16* alphap = alpha;
            __fp16* rows1p = rows1;
            __fp16* rows2p = rows2;
            __fp16* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx] * 8;
                const __fp16* S1p = S1 + sx;
                const __fp16* S2p = S2 + sx;
                const __fp16* S3p = S3 + sx;

                float16x4_t _a0123 = vld1_f16(alphap);

                float16x8_t _S10 = vld1q_f16(S1p - 8);
                float16x8_t _S11 = vld1q_f16(S1p + 0);
                float16x8_t _S12 = vld1q_f16(S1p + 8);
                float16x8_t _S13 = vld1q_f16(S1p + 16);
                float16x8_t _S20 = vld1q_f16(S2p - 8);
                float16x8_t _S21 = vld1q_f16(S2p + 0);
                float16x8_t _S22 = vld1q_f16(S2p + 8);
                float16x8_t _S23 = vld1q_f16(S2p + 16);
                float16x8_t _S30 = vld1q_f16(S3p - 8);
                float16x8_t _S31 = vld1q_f16(S3p + 0);
                float16x8_t _S32 = vld1q_f16(S3p + 8);
                float16x8_t _S33 = vld1q_f16(S3p + 16);
                float16x8_t _rows1 = vmulq_lane_f16(_S10, _a0123, 0);
                float16x8_t _rows2 = vmulq_lane_f16(_S20, _a0123, 0);
                float16x8_t _rows3 = vmulq_lane_f16(_S30, _a0123, 0);
                _rows1 = vfmaq_lane_f16(_rows1, _S11, _a0123, 1);
                _rows2 = vfmaq_lane_f16(_rows2, _S21, _a0123, 1);
                _rows3 = vfmaq_lane_f16(_rows3, _S31, _a0123, 1);
                _rows1 = vfmaq_lane_f16(_rows1, _S12, _a0123, 2);
                _rows2 = vfmaq_lane_f16(_rows2, _S22, _a0123, 2);
                _rows3 = vfmaq_lane_f16(_rows3, _S32, _a0123, 2);
                _rows1 = vfmaq_lane_f16(_rows1, _S13, _a0123, 3);
                _rows2 = vfmaq_lane_f16(_rows2, _S23, _a0123, 3);
                _rows3 = vfmaq_lane_f16(_rows3, _S33, _a0123, 3);
                vst1q_f16(rows1p + dx * 8, _rows1);
                vst1q_f16(rows2p + dx * 8, _rows2);
                vst1q_f16(rows3p + dx * 8, _rows3);

                alphap += 4;
            }
        }
        else
        {
            // hresize four rows
            const __fp16* S0 = src.row<const __fp16>(sy - 1);
            const __fp16* S1 = src.row<const __fp16>(sy);
            const __fp16* S2 = src.row<const __fp16>(sy + 1);
            const __fp16* S3 = src.row<const __fp16>(sy + 2);

            const __fp16* alphap = alpha;
            __fp16* rows0p = rows0;
            __fp16* rows1p = rows1;
            __fp16* rows2p = rows2;
            __fp16* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx] * 8;
                const __fp16* S0p = S0 + sx;
                const __fp16* S1p = S1 + sx;
                const __fp16* S2p = S2 + sx;
                const __fp16* S3p = S3 + sx;

                float16x4_t _a0123 = vld1_f16(alphap);

                float16x8_t _S00 = vld1q_f16(S0p - 8);
                float16x8_t _S01 = vld1q_f16(S0p + 0);
                float16x8_t _S02 = vld1q_f16(S0p + 8);
                float16x8_t _S03 = vld1q_f16(S0p + 16);
                float16x8_t _S10 = vld1q_f16(S1p - 8);
                float16x8_t _S11 = vld1q_f16(S1p + 0);
                float16x8_t _S12 = vld1q_f16(S1p + 8);
                float16x8_t _S13 = vld1q_f16(S1p + 16);
                float16x8_t _S20 = vld1q_f16(S2p - 8);
                float16x8_t _S21 = vld1q_f16(S2p + 0);
                float16x8_t _S22 = vld1q_f16(S2p + 8);
                float16x8_t _S23 = vld1q_f16(S2p + 16);
                float16x8_t _S30 = vld1q_f16(S3p - 8);
                float16x8_t _S31 = vld1q_f16(S3p + 0);
                float16x8_t _S32 = vld1q_f16(S3p + 8);
                float16x8_t _S33 = vld1q_f16(S3p + 16);
                float16x8_t _rows0 = vmulq_lane_f16(_S00, _a0123, 0);
                float16x8_t _rows1 = vmulq_lane_f16(_S10, _a0123, 0);
                float16x8_t _rows2 = vmulq_lane_f16(_S20, _a0123, 0);
                float16x8_t _rows3 = vmulq_lane_f16(_S30, _a0123, 0);
                _rows0 = vfmaq_lane_f16(_rows0, _S01, _a0123, 1);
                _rows1 = vfmaq_lane_f16(_rows1, _S11, _a0123, 1);
                _rows2 = vfmaq_lane_f16(_rows2, _S21, _a0123, 1);
                _rows3 = vfmaq_lane_f16(_rows3, _S31, _a0123, 1);
                _rows0 = vfmaq_lane_f16(_rows0, _S02, _a0123, 2);
                _rows1 = vfmaq_lane_f16(_rows1, _S12, _a0123, 2);
                _rows2 = vfmaq_lane_f16(_rows2, _S22, _a0123, 2);
                _rows3 = vfmaq_lane_f16(_rows3, _S32, _a0123, 2);
                _rows0 = vfmaq_lane_f16(_rows0, _S03, _a0123, 3);
                _rows1 = vfmaq_lane_f16(_rows1, _S13, _a0123, 3);
                _rows2 = vfmaq_lane_f16(_rows2, _S23, _a0123, 3);
                _rows3 = vfmaq_lane_f16(_rows3, _S33, _a0123, 3);
                vst1q_f16(rows0p + dx * 8, _rows0);
                vst1q_f16(rows1p + dx * 8, _rows1);
                vst1q_f16(rows2p + dx * 8, _rows2);
                vst1q_f16(rows3p + dx * 8, _rows3);

                alphap += 4;
            }
        }

        prev_sy1 = sy;

        // vresize
        float16x4_t _b0123 = vld1_f16(beta);

        __fp16* rows0p = rows0;
        __fp16* rows1p = rows1;
        __fp16* rows2p = rows2;
        __fp16* rows3p = rows3;
        __fp16* Dp = dst.row<__fp16>(dy);

        for (int dx = 0; dx < w; dx++)
        {
            float16x8_t _rows0 = vld1q_f16(rows0p);
            float16x8_t _rows1 = vld1q_f16(rows1p);
            float16x8_t _rows2 = vld1q_f16(rows2p);
            float16x8_t _rows3 = vld1q_f16(rows3p);
            float16x8_t _Dp = vmulq_lane_f16(_rows0, _b0123, 0);
            _Dp = vfmaq_lane_f16(_Dp, _rows1, _b0123, 1);
            _Dp = vfmaq_lane_f16(_Dp, _rows2, _b0123, 2);
            _Dp = vfmaq_lane_f16(_Dp, _rows3, _b0123, 3);
            vst1q_f16(Dp, _Dp);

            Dp += 8;
            rows0p += 8;
            rows1p += 8;
            rows2p += 8;
            rows3p += 8;
        }

        beta += 4;
    }
}


================================================
FILE: src/layer/arm/interp_bilinear.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Compute the source offsets and 2-tap weights for linear interpolation
// along one axis.
//
// w             source extent along the axis
// outw          destination extent along the axis
// xofs          [out] outw entries, index of the first of the two source taps
// alpha         [out] outw * 2 entries, weights of taps xofs[dx] and xofs[dx] + 1
// align_corner  non-zero: sample positions are dx * (w - 1) / (outw - 1);
//               zero: pixel-centre mapping (dx + 0.5) * w / outw - 0.5
//
// Positions outside the source are clamped so that the tap pair stays at
// [0, 1] on the low side and [w - 2, w - 1] on the high side.
static void linear_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner)
{
    double scale = (double)w / outw;
    if (align_corner)
    {
        // a single output sample has no second corner to align with; without
        // this guard (w - 1) / 0 makes fx NaN, and floor(NaN) converted to int
        // is undefined (the weights become NaN when the conversion yields 0)
        scale = outw > 1 ? (double)(w - 1) / (outw - 1) : 0.0;
    }

    for (int dx = 0; dx < outw; dx++)
    {
        float fx = (float)((dx + 0.5) * scale - 0.5);
        if (align_corner)
        {
            fx = (float)(dx * scale);
        }

        // split into integer tap index and fractional weight
        int sx = (int)floor(fx);
        fx -= sx;

        if (sx < 0)
        {
            // left of the first sample: take sample 0 only
            sx = 0;
            fx = 0.f;
        }
        if (sx >= w - 1)
        {
            // at or beyond the last sample: take sample w - 1 only, expressed
            // as the second tap of the pair starting at w - 2
            sx = w - 2;
            fx = 1.f;
        }

        xofs[dx] = sx;

        alpha[dx * 2] = 1.f - fx;
        alpha[dx * 2 + 1] = fx;
    }
}

// Bilinear resize of one fp32 image.
//
// src    source image, read through src.row()
// dst    destination image, dst.w x dst.h values are written
// alpha  2 horizontal weights per output column (dst.w * 2 values)
// xofs   per output column, index of the first of the two source columns
// beta   2 vertical weights per output row (dst.h * 2 values)
// yofs   per output row, index of the first of the two source rows
//
// Source rows yofs[dy] and yofs[dy] + 1 are interpolated horizontally into
// two line buffers, which are then blended vertically. A line buffer is
// reused whenever the next output row needs the same source row again.
static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    const int outw = dst.w;
    const int outh = dst.h;

    // horizontally interpolated source rows sy (rows0) and sy + 1 (rows1)
    Mat rowsbuf0(outw);
    Mat rowsbuf1(outw);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;

    // NOTE(review): -2 sends any first row with yofs[0] >= 0 to the two-row
    // path; a first offset of -1 would take the one-row path with rows0
    // unfilled - confirm whether the caller can produce that
    int prev_sy1 = -2;

    for (int dy = 0; dy < outh; dy++)
    {
        const int sy = yofs[dy];

        if (sy == prev_sy1 + 1)
        {
            // moved down by one source row: the old second row becomes the
            // first one, only row sy + 1 is interpolated
            float* recycled = rows0;
            rows0 = rows1;
            rows1 = recycled;

            const float* S1 = src.row(sy + 1);

            int dx = 0;
#if __ARM_NEON
            for (; dx + 1 < outw; dx += 2)
            {
                // weights of two adjacent output columns: a0 a1 a0n a1n
                float32x4_t _a = vld1q_f32(alpha + dx * 2);
                float32x4_t _S1 = vcombine_f32(vld1_f32(S1 + xofs[dx]), vld1_f32(S1 + xofs[dx + 1]));
                float32x4_t _m1 = vmulq_f32(_S1, _a);

                // pairwise add produces both interpolated values at once
                vst1_f32(rows1 + dx, vpadd_f32(vget_low_f32(_m1), vget_high_f32(_m1)));
            }
#endif // __ARM_NEON
            for (; dx < outw; dx++)
            {
                const float* S1p = S1 + xofs[dx];
                const float a0 = alpha[dx * 2];
                const float a1 = alpha[dx * 2 + 1];

                rows1[dx] = S1p[0] * a0 + S1p[1] * a1;
            }
        }
        else if (sy != prev_sy1)
        {
            // no reusable row: interpolate both source rows
            const float* S0 = src.row(sy);
            const float* S1 = src.row(sy + 1);

            int dx = 0;
#if __ARM_NEON
            for (; dx + 1 < outw; dx += 2)
            {
                const int sx = xofs[dx];
                const int sxn = xofs[dx + 1];

                // weights of two adjacent output columns: a0 a1 a0n a1n
                float32x4_t _a = vld1q_f32(alpha + dx * 2);
                float32x4_t _S0 = vcombine_f32(vld1_f32(S0 + sx), vld1_f32(S0 + sxn));
                float32x4_t _S1 = vcombine_f32(vld1_f32(S1 + sx), vld1_f32(S1 + sxn));
                float32x4_t _m0 = vmulq_f32(_S0, _a);
                float32x4_t _m1 = vmulq_f32(_S1, _a);

                vst1_f32(rows0 + dx, vpadd_f32(vget_low_f32(_m0), vget_high_f32(_m0)));
                vst1_f32(rows1 + dx, vpadd_f32(vget_low_f32(_m1), vget_high_f32(_m1)));
            }
#endif // __ARM_NEON
            for (; dx < outw; dx++)
            {
                const float* S0p = S0 + xofs[dx];
                const float* S1p = S1 + xofs[dx];
                const float a0 = alpha[dx * 2];
                const float a1 = alpha[dx * 2 + 1];

                rows0[dx] = S0p[0] * a0 + S0p[1] * a1;
                rows1[dx] = S1p[0] * a0 + S1p[1] * a1;
            }
        }
        // else: same source rows as the previous output row, both buffers are reused

        prev_sy1 = sy;

        // vertical pass: D[x] = rows0[x] * b0 + rows1[x] * b1
        const float b0 = beta[0];
        const float b1 = beta[1];
        float* outptr = dst.row(dy);

        int x = 0;
#if __ARM_NEON
        float32x4_t _b0 = vdupq_n_f32(b0);
        float32x4_t _b1 = vdupq_n_f32(b1);
        for (; x + 7 < outw; x += 8)
        {
            float32x4_t _lo = vmulq_f32(vld1q_f32(rows0 + x), _b0);
            _lo = vmlaq_f32(_lo, vld1q_f32(rows1 + x), _b1);
            vst1q_f32(outptr + x, _lo);

            float32x4_t _hi = vmulq_f32(vld1q_f32(rows0 + x + 4), _b0);
            _hi = vmlaq_f32(_hi, vld1q_f32(rows1 + x + 4), _b1);
            vst1q_f32(outptr + x + 4, _hi);
        }
#endif // __ARM_NEON
        for (; x < outw; x++)
        {
            outptr[x] = rows0[x] * b0 + rows1[x] * b1;
        }

        beta += 2;
    }
}


================================================
FILE: src/layer/arm/interp_bilinear_bf16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Bilinear resize of one bf16 image; arithmetic is carried out in fp32.
//
// src    source image, read as rows of unsigned short (bf16 bit patterns)
// dst    destination image, dst.w x dst.h bf16 values are written
// alpha  2 horizontal weights per output column (dst.w * 2 values)
// xofs   per output column, index of the first of the two source columns
// beta   2 vertical weights per output row (dst.h * 2 values)
// yofs   per output row, index of the first of the two source rows
//
// Source rows yofs[dy] and yofs[dy] + 1 are interpolated horizontally into
// two fp32 line buffers, which are then blended vertically and converted
// back to bf16. A line buffer is reused whenever the next output row needs
// the same source row again.
static void resize_bilinear_image_bf16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    const int outw = dst.w;
    const int outh = dst.h;

    // horizontally interpolated source rows sy (rows0) and sy + 1 (rows1), kept in fp32
    Mat rowsbuf0(outw);
    Mat rowsbuf1(outw);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;

    // NOTE(review): -2 sends any first row with yofs[0] >= 0 to the two-row
    // path; a first offset of -1 would take the one-row path with rows0
    // unfilled - confirm whether the caller can produce that
    int prev_sy1 = -2;

    for (int dy = 0; dy < outh; dy++)
    {
        const int sy = yofs[dy];

        if (sy == prev_sy1 + 1)
        {
            // moved down by one source row: the old second row becomes the
            // first one, only row sy + 1 is interpolated
            float* recycled = rows0;
            rows0 = rows1;
            rows1 = recycled;

            const unsigned short* S1 = src.row<const unsigned short>(sy + 1);

            for (int dx = 0; dx < outw; dx++)
            {
                const unsigned short* S1p = S1 + xofs[dx];
                const float a0 = alpha[dx * 2];
                const float a1 = alpha[dx * 2 + 1];

                rows1[dx] = bfloat16_to_float32(S1p[0]) * a0 + bfloat16_to_float32(S1p[1]) * a1;
            }
        }
        else if (sy != prev_sy1)
        {
            // no reusable row: interpolate both source rows
            const unsigned short* S0 = src.row<const unsigned short>(sy);
            const unsigned short* S1 = src.row<const unsigned short>(sy + 1);

            for (int dx = 0; dx < outw; dx++)
            {
                const unsigned short* S0p = S0 + xofs[dx];
                const unsigned short* S1p = S1 + xofs[dx];
                const float a0 = alpha[dx * 2];
                const float a1 = alpha[dx * 2 + 1];

                rows0[dx] = bfloat16_to_float32(S0p[0]) * a0 + bfloat16_to_float32(S0p[1]) * a1;
                rows1[dx] = bfloat16_to_float32(S1p[0]) * a0 + bfloat16_to_float32(S1p[1]) * a1;
            }
        }
        // else: same source rows as the previous output row, both buffers are reused

        prev_sy1 = sy;

        // vertical pass: D[x] = rows0[x] * b0 + rows1[x] * b1, stored as bf16
        const float b0 = beta[0];
        const float b1 = beta[1];
        unsigned short* outptr = dst.row<unsigned short>(dy);

        int x = 0;
#if __ARM_NEON
        float32x4_t _b0 = vdupq_n_f32(b0);
        float32x4_t _b1 = vdupq_n_f32(b1);
        for (; x + 7 < outw; x += 8)
        {
            float32x4_t _lo = vmulq_f32(vld1q_f32(rows0 + x), _b0);
            _lo = vmlaq_f32(_lo, vld1q_f32(rows1 + x), _b1);
            vst1_u16(outptr + x, float2bfloat(_lo));

            float32x4_t _hi = vmulq_f32(vld1q_f32(rows0 + x + 4), _b0);
            _hi = vmlaq_f32(_hi, vld1q_f32(rows1 + x + 4), _b1);
            vst1_u16(outptr + x + 4, float2bfloat(_hi));
        }
#endif // __ARM_NEON
        for (; x < outw; x++)
        {
            outptr[x] = float32_to_bfloat16(rows0[x] * b0 + rows1[x] * b1);
        }

        beta += 2;
    }
}


================================================
FILE: src/layer/arm/interp_bilinear_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Compute the source offsets and 2-tap weights for linear interpolation
// along one axis, storing the weights as __fp16.
//
// w             source extent along the axis
// outw          destination extent along the axis
// xofs          [out] outw entries, index of the first of the two source taps
// alpha         [out] outw * 2 entries, weights of taps xofs[dx] and xofs[dx] + 1
// align_corner  non-zero: sample positions are dx * (w - 1) / (outw - 1);
//               zero: pixel-centre mapping (dx + 0.5) * w / outw - 0.5
//
// Positions outside the source are clamped so that the tap pair stays at
// [0, 1] on the low side and [w - 2, w - 1] on the high side. The position
// itself is computed in double/float; only the final weights are narrowed.
static void linear_coeffs_fp16sa(int w, int outw, int* xofs, __fp16* alpha, int align_corner)
{
    double scale = (double)w / outw;
    if (align_corner)
    {
        // a single output sample has no second corner to align with; without
        // this guard (w - 1) / 0 makes fx NaN, and floor(NaN) converted to int
        // is undefined (the weights become NaN when the conversion yields 0)
        scale = outw > 1 ? (double)(w - 1) / (outw - 1) : 0.0;
    }

    for (int dx = 0; dx < outw; dx++)
    {
        float fx = (float)((dx + 0.5) * scale - 0.5);
        if (align_corner)
        {
            fx = static_cast<float>(dx * scale);
        }

        // split into integer tap index and fractional weight
        int sx = (int)floor(fx);
        fx -= sx;

        if (sx < 0)
        {
            // left of the first sample: take sample 0 only
            sx = 0;
            fx = 0.f;
        }
        if (sx >= w - 1)
        {
            // at or beyond the last sample: take sample w - 1 only, expressed
            // as the second tap of the pair starting at w - 2
            sx = w - 2;
            fx = 1.f;
        }

        xofs[dx] = sx;

        alpha[dx * 2] = (__fp16)(1.f - fx);
        alpha[dx * 2 + 1] = (__fp16)fx;
    }
}

static void resize_bilinear_image_fp16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w);
    Mat rowsbuf1(w);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;

    int prev_sy1 = -2;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            float* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows0_old;
            const __fp16* S1 = src.row<const __fp16>(sy + 1);

            const float* alphap = alpha;
            float* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx];
                const __fp16* S1p = S1 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                rows1p[dx] = (float)S1p[0] * a0 + (float)S1p[1] * a1;

                alphap += 2;
            }
        }
        else
        {
            // hresize two rows
            const __fp16* S0 = src.row<const __fp16>(sy);
            const __fp16* S1 = src.row<const __fp16>(sy + 1);

            const float* alphap = alpha;
            float* rows0p = rows0;
            float* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx];
                const __fp16* S0p = S0 + sx;
                const __fp16* S1p = S1 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                rows0p[dx] = (float)S0p[0] * a0 + (float)S0p[1] * a1;
                rows1p[dx] = (float)S1p[0] * a0 + (float)S1p[1] * a1;

                alphap += 2;
            }
        }

        prev_sy1 = sy;

        // vresize
        float b0 = beta[0];
        float b1 = beta[1];

        float* rows0p = rows0;
        float* rows1p = rows1;
        __fp16* Dp = dst.row<__fp16>(dy);

        int nn = w >> 3;
        int remain = w - (nn << 3);

        float32x4_t _b0 = vdupq_n_f32(b0);
        float32x4_t _b1 = vdupq_n_f32(b1);
        for (; nn > 0; nn--)
        {
            float32x4_t _rows0 = vld1q_f32(rows0p);
            float32x4_t _rows1 = vld1q_f32(rows1p);

            float32x4_t _Dp = vmulq_f32(_rows0, _b0);
            _Dp = vfmaq_f32(_Dp, _rows1, _b1);

            vst1_f16(Dp, vcvt_f16_f32(_Dp));

            float32x4_t _rows0n = vld1q_f32(rows0p + 4);
            float32x4_t _rows1n = vld1q_f32(rows1p + 4);

            float32x4_t _Dn = vmulq_f32(_rows0n, _b0);
            _Dn = vfmaq_f32(_Dn, _rows1n, _b1);

            vst1_f16(Dp + 4, vcvt_f16_f32(_Dn));

            Dp += 8;
            rows0p += 8;
            rows1p += 8;
        }
        for (; remain; --remain)
        {
            // D[x] = rows0[x]*b0 + rows1[x]*b1;
            *Dp++ = (__fp16)(*rows0p++ * b0 + *rows1p++ * b1);
        }

        beta += 2;
    }
}

static void resize_bilinear_image_fp16sa(const Mat& src, Mat& dst, __fp16* alpha, int* xofs, __fp16* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w, (size_t)2u);
    Mat rowsbuf1(w, (size_t)2u);
    __fp16* rows0 = rowsbuf0;
    __fp16* rows1 = rowsbuf1;

    int prev_sy1 = -2;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            __fp16* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows0_old;
            const __fp16* S1 = src.row<const __fp16>(sy + 1);

            const __fp16* alphap = alpha;
            __fp16* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx];
                const __fp16* S1p = S1 + sx;

                __fp16 a0 = alphap[0];
                __fp16 a1 = alphap[1];
                rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;

                alphap += 2;
            }
        }
        else
        {
            // hresize two rows
            const __fp16* S0 = src.row<const __fp16>(sy);
            const __fp16* S1 = src.row<const __fp16>(sy + 1);

            const __fp16* alphap = alpha;
            __fp16* rows0p = rows0;
            __fp16* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx];
                const __fp16* S0p = S0 + sx;
                const __fp16* S1p = S1 + sx;

                __fp16 a0 = alphap[0];
                __fp16 a1 = alphap[1];
                rows0p[dx] = S0p[0] * a0 + S0p[1] * a1;
                rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;

                alphap += 2;
            }
        }

        prev_sy1 = sy;

        // vresize
        __fp16 b0 = beta[0];
        __fp16 b1 = beta[1];

        __fp16* rows0p = rows0;
        __fp16* rows1p = rows1;
        __fp16* Dp = dst.row<__fp16>(dy);

        int nn = w >> 3;
        int remain = w - (nn << 3);

        float16x8_t _b0 = vdupq_n_f16(b0);
        float16x8_t _b1 = vdupq_n_f16(b1);
        for (; nn > 0; nn--)
        {
            float16x8_t _rows0 = vld1q_f16(rows0p);
            float16x8_t _rows1 = vld1q_f16(rows1p);

            float16x8_t _Dp = vmulq_f16(_rows0, _b0);
            _Dp = vfmaq_f16(_Dp, _rows1, _b1);

            vst1q_f16(Dp, _Dp);

            Dp += 8;
            rows0p += 8;
            rows1p += 8;
        }
        for (; remain; --remain)
        {
            // D[x] = rows0[x]*b0 + rows1[x]*b1;
            *Dp++ = (__fp16)(*rows0p++ * b0 + *rows1p++ * b1);
        }

        beta += 2;
    }
}


================================================
FILE: src/layer/arm/interp_bilinear_pack4.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void resize_bilinear_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w, (size_t)4 * 4u, 4);
    Mat rowsbuf1(w, (size_t)4 * 4u, 4);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;

    int prev_sy1 = -2;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            float* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows0_old;
            const float* S1 = src.row(sy + 1);

            const float* alphap = alpha;
            float* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const float* S1p = S1 + sx;

                float32x2_t _a01 = vld1_f32(alphap);

                float32x4_t _S10 = vld1q_f32(S1p);
                float32x4_t _S11 = vld1q_f32(S1p + 4);
                float32x4_t _rows1 = vmulq_lane_f32(_S10, _a01, 0);
                _rows1 = vmlaq_lane_f32(_rows1, _S11, _a01, 1);
                vst1q_f32(rows1p + dx * 4, _rows1);

                alphap += 2;
            }
        }
        else
        {
            // hresize two rows
            const float* S0 = src.row(sy);
            const float* S1 = src.row(sy + 1);

            const float* alphap = alpha;
            float* rows0p = rows0;
            float* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const float* S0p = S0 + sx;
                const float* S1p = S1 + sx;

                float32x2_t _a01 = vld1_f32(alphap);

                float32x4_t _S00 = vld1q_f32(S0p);
                float32x4_t _S01 = vld1q_f32(S0p + 4);
                float32x4_t _S10 = vld1q_f32(S1p);
                float32x4_t _S11 = vld1q_f32(S1p + 4);
                float32x4_t _rows0 = vmulq_lane_f32(_S00, _a01, 0);
                float32x4_t _rows1 = vmulq_lane_f32(_S10, _a01, 0);
                _rows0 = vmlaq_lane_f32(_rows0, _S01, _a01, 1);
                _rows1 = vmlaq_lane_f32(_rows1, _S11, _a01, 1);
                vst1q_f32(rows0p + dx * 4, _rows0);
                vst1q_f32(rows1p + dx * 4, _rows1);

                alphap += 2;
            }
        }

        prev_sy1 = sy;

        // vresize
        float32x2_t _b01 = vld1_f32(beta);

        float* rows0p = rows0;
        float* rows1p = rows1;
        float* Dp = dst.row(dy);

        for (int dx = 0; dx < w; dx++)
        {
            float32x4_t _rows0 = vld1q_f32(rows0p);
            float32x4_t _rows1 = vld1q_f32(rows1p);
            float32x4_t _Dp = vmulq_lane_f32(_rows0, _b01, 0);
            _Dp = vmlaq_lane_f32(_Dp, _rows1, _b01, 1);
            vst1q_f32(Dp, _Dp);

            Dp += 4;
            rows0p += 4;
            rows1p += 4;
        }

        beta += 2;
    }
}


================================================
FILE: src/layer/arm/interp_bilinear_pack4_bf16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void resize_bilinear_image_pack4_bf16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w, (size_t)4 * 4u, 4);
    Mat rowsbuf1(w, (size_t)4 * 4u, 4);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;

    int prev_sy1 = -2;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            float* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows0_old;
            const unsigned short* S1 = src.row<const unsigned short>(sy + 1);

            const float* alphap = alpha;
            float* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const unsigned short* S1p = S1 + sx;

                float32x2_t _a01 = vld1_f32(alphap);

                float32x4_t _S10 = bfloat2float(vld1_u16(S1p));
                float32x4_t _S11 = bfloat2float(vld1_u16(S1p + 4));
                float32x4_t _rows1 = vmulq_lane_f32(_S10, _a01, 0);
                _rows1 = vmlaq_lane_f32(_rows1, _S11, _a01, 1);
                vst1q_f32(rows1p + dx * 4, _rows1);

                alphap += 2;
            }
        }
        else
        {
            // hresize two rows
            const unsigned short* S0 = src.row<const unsigned short>(sy);
            const unsigned short* S1 = src.row<const unsigned short>(sy + 1);

            const float* alphap = alpha;
            float* rows0p = rows0;
            float* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const unsigned short* S0p = S0 + sx;
                const unsigned short* S1p = S1 + sx;

                float32x2_t _a01 = vld1_f32(alphap);

                float32x4_t _S00 = bfloat2float(vld1_u16(S0p));
                float32x4_t _S01 = bfloat2float(vld1_u16(S0p + 4));
                float32x4_t _S10 = bfloat2float(vld1_u16(S1p));
                float32x4_t _S11 = bfloat2float(vld1_u16(S1p + 4));
                float32x4_t _rows0 = vmulq_lane_f32(_S00, _a01, 0);
                float32x4_t _rows1 = vmulq_lane_f32(_S10, _a01, 0);
                _rows0 = vmlaq_lane_f32(_rows0, _S01, _a01, 1);
                _rows1 = vmlaq_lane_f32(_rows1, _S11, _a01, 1);
                vst1q_f32(rows0p + dx * 4, _rows0);
                vst1q_f32(rows1p + dx * 4, _rows1);

                alphap += 2;
            }
        }

        prev_sy1 = sy;

        // vresize
        float32x2_t _b01 = vld1_f32(beta);

        float* rows0p = rows0;
        float* rows1p = rows1;
        unsigned short* Dp = dst.row<unsigned short>(dy);

        for (int dx = 0; dx < w; dx++)
        {
            float32x4_t _rows0 = vld1q_f32(rows0p);
            float32x4_t _rows1 = vld1q_f32(rows1p);
            float32x4_t _Dp = vmulq_lane_f32(_rows0, _b01, 0);
            _Dp = vmlaq_lane_f32(_Dp, _rows1, _b01, 1);
            vst1_u16(Dp, float2bfloat(_Dp));

            Dp += 4;
            rows0p += 4;
            rows1p += 4;
        }

        beta += 2;
    }
}


================================================
FILE: src/layer/arm/interp_bilinear_pack4_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void resize_bilinear_image_pack4_fp16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w, (size_t)4 * 4u, 4);
    Mat rowsbuf1(w, (size_t)4 * 4u, 4);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;

    int prev_sy1 = -2;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            float* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows0_old;
            const __fp16* S1 = src.row<const __fp16>(sy + 1);

            const float* alphap = alpha;
            float* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const __fp16* S1p = S1 + sx;

                float32x2_t _a01 = vld1_f32(alphap);

                float32x4_t _S10 = vcvt_f32_f16(vld1_f16(S1p));
                float32x4_t _S11 = vcvt_f32_f16(vld1_f16(S1p + 4));
                float32x4_t _rows1 = vmulq_lane_f32(_S10, _a01, 0);
                _rows1 = vmlaq_lane_f32(_rows1, _S11, _a01, 1);
                vst1q_f32(rows1p + dx * 4, _rows1);

                alphap += 2;
            }
        }
        else
        {
            // hresize two rows
            const __fp16* S0 = src.row<const __fp16>(sy);
            const __fp16* S1 = src.row<const __fp16>(sy + 1);

            const float* alphap = alpha;
            float* rows0p = rows0;
            float* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const __fp16* S0p = S0 + sx;
                const __fp16* S1p = S1 + sx;

                float32x2_t _a01 = vld1_f32(alphap);

                float32x4_t _S00 = vcvt_f32_f16(vld1_f16(S0p));
                float32x4_t _S01 = vcvt_f32_f16(vld1_f16(S0p + 4));
                float32x4_t _S10 = vcvt_f32_f16(vld1_f16(S1p));
                float32x4_t _S11 = vcvt_f32_f16(vld1_f16(S1p + 4));
                float32x4_t _rows0 = vmulq_lane_f32(_S00, _a01, 0);
                float32x4_t _rows1 = vmulq_lane_f32(_S10, _a01, 0);
                _rows0 = vmlaq_lane_f32(_rows0, _S01, _a01, 1);
                _rows1 = vmlaq_lane_f32(_rows1, _S11, _a01, 1);
                vst1q_f32(rows0p + dx * 4, _rows0);
                vst1q_f32(rows1p + dx * 4, _rows1);

                alphap += 2;
            }
        }

        prev_sy1 = sy;

        // vresize
        float32x2_t _b01 = vld1_f32(beta);

        float* rows0p = rows0;
        float* rows1p = rows1;
        __fp16* Dp = dst.row<__fp16>(dy);

        for (int dx = 0; dx < w; dx++)
        {
            float32x4_t _rows0 = vld1q_f32(rows0p);
            float32x4_t _rows1 = vld1q_f32(rows1p);
            float32x4_t _Dp = vmulq_lane_f32(_rows0, _b01, 0);
            _Dp = vmlaq_lane_f32(_Dp, _rows1, _b01, 1);
            vst1_f16(Dp, vcvt_f16_f32(_Dp));

            Dp += 4;
            rows0p += 4;
            rows1p += 4;
        }

        beta += 2;
    }
}

static void resize_bilinear_image_pack4_fp16sa(const Mat& src, Mat& dst, __fp16* alpha, int* xofs, __fp16* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w, (size_t)4 * 2u, 4);
    Mat rowsbuf1(w, (size_t)4 * 2u, 4);
    __fp16* rows0 = rowsbuf0;
    __fp16* rows1 = rowsbuf1;

    int prev_sy1 = -2;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            __fp16* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows0_old;
            const __fp16* S1 = src.row<const __fp16>(sy + 1);

            const __fp16* alphap = alpha;
            __fp16* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const __fp16* S1p = S1 + sx;

                float16x4_t _a01 = vld1_f16(alphap);

                float16x4_t _S10 = vld1_f16(S1p);
                float16x4_t _S11 = vld1_f16(S1p + 4);
                float16x4_t _rows1 = vmul_lane_f16(_S10, _a01, 0);
                _rows1 = vfma_lane_f16(_rows1, _S11, _a01, 1);
                vst1_f16(rows1p + dx * 4, _rows1);

                alphap += 2;
            }
        }
        else
        {
            // hresize two rows
            const __fp16* S0 = src.row<const __fp16>(sy);
            const __fp16* S1 = src.row<const __fp16>(sy + 1);

            const __fp16* alphap = alpha;
            __fp16* rows0p = rows0;
            __fp16* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const __fp16* S0p = S0 + sx;
                const __fp16* S1p = S1 + sx;

                float16x4_t _a01 = vld1_f16(alphap);

                float16x4_t _S00 = vld1_f16(S0p);
                float16x4_t _S01 = vld1_f16(S0p + 4);
                float16x4_t _S10 = vld1_f16(S1p);
                float16x4_t _S11 = vld1_f16(S1p + 4);
                float16x4_t _rows0 = vmul_lane_f16(_S00, _a01, 0);
                float16x4_t _rows1 = vmul_lane_f16(_S10, _a01, 0);
                _rows0 = vfma_lane_f16(_rows0, _S01, _a01, 1);
                _rows1 = vfma_lane_f16(_rows1, _S11, _a01, 1);
                vst1_f16(rows0p + dx * 4, _rows0);
                vst1_f16(rows1p + dx * 4, _rows1);

                alphap += 2;
            }
        }

        prev_sy1 = sy;

        // vresize
        float16x4_t _b01 = vld1_f16(beta);

        __fp16* rows0p = rows0;
        __fp16* rows1p = rows1;
        __fp16* Dp = dst.row<__fp16>(dy);

        for (int dx = 0; dx < w; dx++)
        {
            float16x4_t _rows0 = vld1_f16(rows0p);
            float16x4_t _rows1 = vld1_f16(rows1p);
            float16x4_t _Dp = vmul_lane_f16(_rows0, _b01, 0);
            _Dp = vfma_lane_f16(_Dp, _rows1, _b01, 1);
            vst1_f16(Dp, _Dp);

            Dp += 4;
            rows0p += 4;
            rows1p += 4;
        }

        beta += 2;
    }
}


================================================
FILE: src/layer/arm/interp_bilinear_pack8_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void resize_bilinear_image_pack8_fp16sa(const Mat& src, Mat& dst, __fp16* alpha, int* xofs, __fp16* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w, (size_t)8 * 2u, 8);
    Mat rowsbuf1(w, (size_t)8 * 2u, 8);
    __fp16* rows0 = rowsbuf0;
    __fp16* rows1 = rowsbuf1;

    int prev_sy1 = -2;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            __fp16* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows0_old;
            const __fp16* S1 = src.row<const __fp16>(sy + 1);

            const __fp16* alphap = alpha;
            __fp16* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx] * 8;
                const __fp16* S1p = S1 + sx;

                float16x4_t _a01 = vld1_f16(alphap);

                float16x8_t _S10 = vld1q_f16(S1p);
                float16x8_t _S11 = vld1q_f16(S1p + 8);
                float16x8_t _rows1 = vmulq_lane_f16(_S10, _a01, 0);
                _rows1 = vfmaq_lane_f16(_rows1, _S11, _a01, 1);
                vst1q_f16(rows1p + dx * 8, _rows1);

                alphap += 2;
            }
        }
        else
        {
            // hresize two rows
            const __fp16* S0 = src.row<const __fp16>(sy);
            const __fp16* S1 = src.row<const __fp16>(sy + 1);

            const __fp16* alphap = alpha;
            __fp16* rows0p = rows0;
            __fp16* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx] * 8;
                const __fp16* S0p = S0 + sx;
                const __fp16* S1p = S1 + sx;

                float16x4_t _a01 = vld1_f16(alphap);

                float16x8_t _S00 = vld1q_f16(S0p);
                float16x8_t _S01 = vld1q_f16(S0p + 8);
                float16x8_t _S10 = vld1q_f16(S1p);
                float16x8_t _S11 = vld1q_f16(S1p + 8);
                float16x8_t _rows0 = vmulq_lane_f16(_S00, _a01, 0);
                float16x8_t _rows1 = vmulq_lane_f16(_S10, _a01, 0);
                _rows0 = vfmaq_lane_f16(_rows0, _S01, _a01, 1);
                _rows1 = vfmaq_lane_f16(_rows1, _S11, _a01, 1);
                vst1q_f16(rows0p + dx * 8, _rows0);
                vst1q_f16(rows1p + dx * 8, _rows1);

                alphap += 2;
            }
        }

        prev_sy1 = sy;

        // vresize
        float16x4_t _b01 = vld1_f16(beta);

        __fp16* rows0p = rows0;
        __fp16* rows1p = rows1;
        __fp16* Dp = dst.row<__fp16>(dy);

        for (int dx = 0; dx < w; dx++)
        {
            float16x8_t _rows0 = vld1q_f16(rows0p);
            float16x8_t _rows1 = vld1q_f16(rows1p);
            float16x8_t _Dp = vmulq_lane_f16(_rows0, _b01, 0);
            _Dp = vfmaq_lane_f16(_Dp, _rows1, _b01, 1);
            vst1q_f16(Dp, _Dp);

            Dp += 8;
            rows0p += 8;
            rows1p += 8;
        }

        beta += 2;
    }
}


================================================
FILE: src/layer/arm/layernorm_arm.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "layernorm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

// Advertise the storage / layout capabilities of the ARM LayerNorm
// implementation, depending on what the build enables.
LayerNorm_arm::LayerNorm_arm()
{
#if NCNN_BF16
    // bf16 storage is handled whenever the build enables it
    support_bf16_storage = true;
#endif // NCNN_BF16

#if __ARM_NEON
    // packed layouts are handled by the NEON code paths
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage additionally depends on a runtime CPU feature query
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82
}

// Normalize one contiguous fp32 run in place.
//
//   ptr        elemcount * elempack floats, overwritten with the result
//   gamma_ptr  optional scale; applied only when beta_ptr is also non-null
//   beta_ptr   optional shift; applied only when gamma_ptr is also non-null
//   eps        added to the variance before taking the reciprocal square root
//   elemcount  number of elements per normalized sequence
//   elempack   1: ptr is a single sequence, gamma / beta hold one value per float
//              4: (NEON builds) four sequences interleaved lane-wise, each lane
//                 gets its own mean / variance and gamma / beta hold one value
//                 per group of four floats
//
// Each value v becomes (v - mean) / sqrt(var + eps), evaluated as
// v * a + b with a = 1 / sqrt(var + eps) and b = -mean * a, optionally
// followed by * gamma + beta.
static void layernorm(float* ptr, const float* gamma_ptr, const float* beta_ptr, float eps, int elemcount, int elempack)
{
    const int size = elemcount * elempack;

    // ---- pass 1: sum of all values ----
    float mean = 0.f;
#if __ARM_NEON
    float32x4_t _mean = vdupq_n_f32(0.f);
#endif // __ARM_NEON
    {
        const float* src = ptr;
        int n = 0;
#if __ARM_NEON
        for (; n + 4 <= size; n += 4)
        {
            _mean = vaddq_f32(_mean, vld1q_f32(src));
            src += 4;
        }
#endif // __ARM_NEON
        for (; n < size; n++)
        {
            mean += *src++;
        }
    }

    if (elempack == 1)
    {
#if __ARM_NEON
        // fold the four lane-wise partial sums into the scalar sum
#if __aarch64__
        mean += vaddvq_f32(_mean);
#else
        float32x2_t _sum2 = vadd_f32(vget_low_f32(_mean), vget_high_f32(_mean));
        _sum2 = vpadd_f32(_sum2, _sum2);
        mean += vget_lane_f32(_sum2, 0);
#endif
#endif // __ARM_NEON

        mean = mean / elemcount;
#if __ARM_NEON
        _mean = vdupq_n_f32(mean);
#endif // __ARM_NEON
    }
#if __ARM_NEON
    if (elempack == 4)
    {
        // every lane is its own sequence: per-lane mean
        _mean = div_ps(_mean, vdupq_n_f32(elemcount));
    }
#endif // __ARM_NEON

    // ---- pass 2: sum of squared deviations from the mean ----
    float var = 0.f;
#if __ARM_NEON
    float32x4_t _var = vdupq_n_f32(0.f);
#endif // __ARM_NEON
    {
        const float* src = ptr;
        int n = 0;
#if __ARM_NEON
        for (; n + 4 <= size; n += 4)
        {
            const float32x4_t _d = vsubq_f32(vld1q_f32(src), _mean);
            _var = vmlaq_f32(_var, _d, _d);
            src += 4;
        }
#endif // __ARM_NEON
        for (; n < size; n++)
        {
            const float d = *src++ - mean;
            var += d * d;
        }
    }

    // ---- turn (mean, var) into the affine pair (a, b); stored back in var / mean ----
    if (elempack == 1)
    {
#if __ARM_NEON
#if __aarch64__
        var += vaddvq_f32(_var);
#else
        float32x2_t _sum2 = vadd_f32(vget_low_f32(_var), vget_high_f32(_var));
        _sum2 = vpadd_f32(_sum2, _sum2);
        var += vget_lane_f32(_sum2, 0);
#endif
#endif // __ARM_NEON

        var = 1.f / sqrtf(var / elemcount + eps);
        mean = -mean * var;
#if __ARM_NEON
        _var = vdupq_n_f32(var);
        _mean = vdupq_n_f32(mean);
#endif // __ARM_NEON
    }
#if __ARM_NEON
    if (elempack == 4)
    {
        _var = div_ps(_var, vdupq_n_f32(elemcount));
        _var = vaddq_f32(_var, vdupq_n_f32(eps));
        // reciprocal square root: estimate followed by two refinement steps
        float32x4_t _rsqrt = vrsqrteq_f32(_var);
        _rsqrt = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var, _rsqrt), _rsqrt), _rsqrt);
        _var = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var, _rsqrt), _rsqrt), _rsqrt);
        _mean = vnegq_f32(vmulq_f32(_mean, _var));
    }
#endif // __ARM_NEON

    // ---- pass 3: apply v * a + b, optionally followed by * gamma + beta ----
    int n = 0;
    if (gamma_ptr && beta_ptr)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
            // one gamma / beta value per group of four lanes
            for (; n + 4 <= size; n += 4)
            {
                const float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]);
                const float32x4_t _beta = vdupq_n_f32(beta_ptr[0]);
                const float32x4_t _norm = vmlaq_f32(_mean, vld1q_f32(ptr), _var);
                vst1q_f32(ptr, vmlaq_f32(_beta, _norm, _gamma));
                ptr += 4;
                gamma_ptr += 1;
                beta_ptr += 1;
            }
        }
        else if (elempack == 1)
        {
            // one gamma / beta value per float
            for (; n + 4 <= size; n += 4)
            {
                const float32x4_t _gamma = vld1q_f32(gamma_ptr);
                const float32x4_t _beta = vld1q_f32(beta_ptr);
                const float32x4_t _norm = vmlaq_f32(_mean, vld1q_f32(ptr), _var);
                vst1q_f32(ptr, vmlaq_f32(_beta, _norm, _gamma));
                ptr += 4;
                gamma_ptr += 4;
                beta_ptr += 4;
            }
        }
#endif // __ARM_NEON
        for (; n < size; n++)
        {
            *ptr = (*ptr * var + mean) * (*gamma_ptr) + *beta_ptr;
            ptr++;
            gamma_ptr++;
            beta_ptr++;
        }
    }
    else
    {
#if __ARM_NEON
        for (; n + 4 <= size; n += 4)
        {
            vst1q_f32(ptr, vmlaq_f32(_mean, vld1q_f32(ptr), _var));
            ptr += 4;
        }
#endif // __ARM_NEON
        for (; n < size; n++)
        {
            *ptr = *ptr * var + mean;
            ptr++;
        }
    }
}

// Apply layer normalization to bottom_top_blob in place and return 0.
//
// 16-bit element blobs are forwarded to the fp16 / bf16 implementations when
// the build and the options enable them. Otherwise the blob is processed
// according to its dimensionality:
//   dims 1: the whole blob (w * elempack floats) is one sequence
//   dims 2: every row is normalized on its own
//   dims 3: every row when affine_size == w, otherwise every channel (w * h)
// Blobs with any other dims value are left untouched.
int LayerNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int elembits = bottom_top_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_inplace_fp16s(bottom_top_blob, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

    const int dims = bottom_top_blob.dims;
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int channels = bottom_top_blob.c;
    const int elempack = bottom_top_blob.elempack;

    if (dims == 1)
    {
        // assert affine_size == w
        // the packed lanes are consecutive elements here, so treat the blob as unpacked
        float* ptr = bottom_top_blob;
        layernorm(ptr, gamma_data, beta_data, eps, w * elempack, 1);
        return 0;
    }

    if (dims == 2)
    {
        // assert affine_size == w
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            float* ptr = bottom_top_blob.row(i);
            layernorm(ptr, gamma_data, beta_data, eps, w, elempack);
        }
        return 0;
    }

    if (dims != 3)
        return 0;

    const bool normalize_rows = (affine_size == w);
    if (normalize_rows)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            for (int i = 0; i < h; i++)
            {
                float* ptr = bottom_top_blob.channel(q).row(i);
                layernorm(ptr, gamma_data, beta_data, eps, w, elempack);
            }
        }
    }
    else // if (affine_size == w * h)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
            layernorm(ptr, gamma_data, beta_data, eps, w * h, elempack);
        }
    }

    return 0;
}

#if NCNN_BF16
static void layernorm_bf16s(unsigned short* ptr, const float* gamma_ptr, const float* beta_ptr, float eps, int elemcount, int elempack)
{
    const int size = elemcount * elempack;

#if __ARM_NEON
    float32x4_t _mean = vdupq_n_f32(0.f);
#endif // __ARM_NEON
    float mean = 0.f;
    {
        const unsigned short* ptr0 = ptr;

        int i = 0;
#if __ARM_NEON
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = bfloat2float(vld1_u16(ptr0));
            _mean = vaddq_f32(_mean, _p);
            ptr0 += 4;
        }
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            mean += bfloat16_to_float32(ptr0[0]);
            ptr0++;
        }
    }

#if __ARM_NEON
    if (elempack == 4)
    {
        float32x4_t _elemcount = vdupq_n_f32(elemcount);
        _mean = div_ps(_mean, _elemcount);
    }
#endif // __ARM_NEON
    if (elempack == 1)
    {
#if __ARM_NEON
#if __aarch64__
        mean += vaddvq_f32(_mean);
#else
        float32x2_t _s2 = vadd_f32(vget_low_f32(_mean), vget_high_f32(_mean));
        _s2 = vpadd_f32(_s2, _s2);
        mean += vget_lane_f32(_s2, 0);
#endif
#endif // __ARM_NEON

        mean = mean / elemcount;
#if __ARM_NEON
        _mean = vdupq_n_f32(mean);
#endif // __ARM_NEON
    }

#if __ARM_NEON
    float32x4_t _var = vdupq_n_f32(0.f);
#endif // __ARM_NEON
    float var = 0.f;
    {
        const unsigned short* ptr0 = ptr;

        int i = 0;
#if __ARM_NEON
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = bfloat2float(vld1_u16(ptr0));
            _p = vsubq_f32(_p, _mean);
            _var = vmlaq_f32(_var, _p, _p);
            ptr0 += 4;
        }
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            float v = bfloat16_to_float32(ptr0[0]) - mean;
            var += v * v;
            ptr0++;
        }
    }

#if __ARM_NEON
    if (elempack == 4)
    {
        float32x4_t _elemcount = vdupq_n_f32(elemcount);
        float32x4_t _eps = vdupq_n_f32(eps);
        _var = div_ps(_var, _elemcount);
        _var = vaddq_f32(_var, _eps);
        float32x4_t _rsqrt_var = vrsqrteq_f32(_var);
        _rsqrt_var = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var, _rsqrt_var), _rsqrt_var), _rsqrt_var);
        _var = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var, _rsqrt_var), _rsqrt_var), _rsqrt_var);
        _mean = vmulq_f32(_mean, _var);
        _mean = vnegq_f32(_mean);
    }
#endif // __ARM_NEON
    if (elempack == 1)
    {
#if __ARM_NEON
#if __aarch64__
        var += vaddvq_f32(_var);
#else
        float32x2_t _s2 = vadd_f32(vget_low_f32(_var), vget_high_f32(_var));
        _s2 = vpadd_f32(_s2, _s2);
        var += vget_lane_f32(_s2, 0);
#endif
#endif // __ARM_NEON

        var = 1.f / sqrtf(var / elemcount + eps);
        mean = -mean * var;
#if __ARM_NEON
        _var = vdupq_n_f32(var);
        _mean = vdupq_n_f32(mean);
#endif // __ARM_NEON
    }

    if (gamma_ptr && beta_ptr)
    {
        int i = 0;
#if __ARM_NEON
        if (elempack == 4)
        {
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _p = bfloat2float(vld1_u16(ptr));
                float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]);
                float32x4_t _beta = vdupq_n_f32(beta_ptr[0]);
                _p = vmlaq_f32(_mean, _p, _var);
                _p = vmlaq_f32(_beta, _p, _gamma);
                vst1_u16(ptr, float2bfloat(_p));
                ptr += 4;
                gamma_ptr += 1;
                beta_ptr += 1;
            }
        }
        if (elempack == 1)
        {
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _p = bfloat2float(vld1_u16(ptr));
                float32x4_t _gamma = vld1q_f32(gamma_ptr);
                float32x4_t _beta = vld1q_f32(beta_ptr);
                _p = vmlaq_f32(_mean, _p, _var);
                _p = vmlaq_f32(_beta, _p, _gamma);
                vst1_u16(ptr, float2bfloat(_p));
                ptr += 4;
                gamma_ptr += 4;
                beta_ptr += 4;
            }
        }
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            float v = bfloat16_to_float32(ptr[0]);
            ptr[0] = float32_to_bfloat16((v * var + mean) * gamma_ptr[0] + beta_ptr[0]);
            ptr++;
            gamma_ptr++;
            beta_ptr++;
        }
    }
    else
    {
        int i = 0;
#if __ARM_NEON
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = bfloat2float(vld1_u16(ptr));
            _p = vmlaq_f32(_mean, _p, _var);
            vst1_u16(ptr, float2bfloat(_p));
            ptr += 4;
        }
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            float v = bfloat16_to_float32(ptr[0]);
            ptr[0] = float32_to_bfloat16(v * var + mean);
            ptr++;
        }
    }
}

// Layer-normalize a blob stored as bf16 (unsigned short) in place.
// The blob rank selects how the data is split into normalization groups;
// every group is handed to layernorm_bf16s(). Always returns 0.
int LayerNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int rank = bottom_top_blob.dims;
    const int width = bottom_top_blob.w;
    const int height = bottom_top_blob.h;
    const int pack = bottom_top_blob.elempack;

    if (rank == 1)
    {
        // assert affine_size == w
        // the whole vector forms a single group, packed lanes are passed as plain scalars
        unsigned short* data = bottom_top_blob;
        layernorm_bf16s(data, gamma_data, beta_data, eps, width * pack, 1);
        return 0;
    }

    if (rank == 2)
    {
        // assert affine_size == w
        // one group per row
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < height; y++)
        {
            unsigned short* line = bottom_top_blob.row<unsigned short>(y);
            layernorm_bf16s(line, gamma_data, beta_data, eps, width, pack);
        }
        return 0;
    }

    if (rank == 3)
    {
        const int planes = bottom_top_blob.c;

        if (affine_size == width)
        {
            // one group per row of every channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < planes; q++)
            {
                Mat plane = bottom_top_blob.channel(q);
                for (int y = 0; y < height; y++)
                {
                    unsigned short* line = plane.row<unsigned short>(y);
                    layernorm_bf16s(line, gamma_data, beta_data, eps, width, pack);
                }
            }
        }
        else // if (affine_size == w * h)
        {
            // one group per channel
            const int plane_size = width * height;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < planes; q++)
            {
                unsigned short* data = bottom_top_blob.channel(q);
                layernorm_bf16s(data, gamma_data, beta_data, eps, plane_size, pack);
            }
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/layernorm_arm.h
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_LAYERNORM_ARM_H
#define LAYER_LAYERNORM_ARM_H

#include "layernorm.h"

namespace ncnn {

// ARM-specific LayerNorm layer, derived from the generic LayerNorm.
class LayerNorm_arm : public LayerNorm
{
public:
    LayerNorm_arm();

    // normalizes bottom_top_blob in place
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // in-place normalization of a blob holding __fp16 data;
    // only declared when NCNN_ARM82 is enabled
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // in-place normalization of a blob holding bf16 data (stored as unsigned short);
    // only declared when NCNN_BF16 is enabled
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_LAYERNORM_ARM_H


================================================
FILE: src/layer/arm/layernorm_arm_asimdhp.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "layernorm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Layer-normalize one group of fp16 values in place; all arithmetic is done in fp32.
//
// ptr        fp16 data; elemcount * elempack values are read and then overwritten
// gamma_ptr  fp32 scale factors, consumed one per (packed) element
// beta_ptr   fp32 offsets, consumed one per (packed) element
//            the affine step is applied only when both gamma_ptr and beta_ptr are non-null
// eps        added to the variance before taking the reciprocal square root
// elemcount  number of (packed) elements in the group, used as divisor for mean and variance
// elempack   1, 4 and 8 have dedicated handling: with 8 or 4 every lane gets its own
//            statistics, with 1 a single mean/variance covers the whole group
//
// Each value x becomes x * a + b with a = 1 / sqrt(var + eps) and b = -mean * a,
// followed by * gamma + beta when the affine parameters are present.
static void layernorm_fp16s(__fp16* ptr, const float* gamma_ptr, const float* beta_ptr, float eps, int elemcount, int elempack)
{
    // total number of fp16 scalars in the group
    const int size = elemcount * elempack;

    // pass 1: sum of the inputs
    // _mean0 collects the low half of every 8-wide load plus every 4-wide load,
    // _mean1 the high half of every 8-wide load, mean the scalar tail
    float32x4_t _mean0 = vdupq_n_f32(0.f);
    float32x4_t _mean1 = vdupq_n_f32(0.f);
    float mean = 0.f;
    {
        const __fp16* ptr0 = ptr;

        int i = 0;
        for (; i + 7 < size; i += 8)
        {
            float16x8_t _p = vld1q_f16(ptr0);
            float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p));
            float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p));
            _mean0 = vaddq_f32(_mean0, _p0);
            _mean1 = vaddq_f32(_mean1, _p1);
            ptr0 += 8;
        }
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr0));
            _mean0 = vaddq_f32(_mean0, _p);
            ptr0 += 4;
        }
        for (; i < size; i++)
        {
            mean += (float)ptr0[0];
            ptr0++;
        }
    }

    if (elempack == 8)
    {
        // the 8 accumulator lanes line up with the 8 packed lanes, only the division is left
        float32x4_t _elemcount = vdupq_n_f32(elemcount);
        _mean0 = vdivq_f32(_mean0, _elemcount);
        _mean1 = vdivq_f32(_mean1, _elemcount);
    }
    if (elempack == 4)
    {
        // every 8-wide load covered two packed elements, fold both halves onto the 4 lanes
        _mean0 = vaddq_f32(_mean0, _mean1);

        float32x4_t _elemcount = vdupq_n_f32(elemcount);
        _mean0 = vdivq_f32(_mean0, _elemcount);
        _mean1 = _mean0;
    }
    if (elempack == 1)
    {
        // reduce all lanes and the scalar tail to one mean, then broadcast it
        _mean0 = vaddq_f32(_mean0, _mean1);
        mean += vaddvq_f32(_mean0);

        mean = mean / elemcount;
        _mean0 = vdupq_n_f32(mean);
        _mean1 = _mean0;
    }

    // pass 2: sum of squared deviations from the mean, same accumulator layout as pass 1
    float32x4_t _var0 = vdupq_n_f32(0.f);
    float32x4_t _var1 = vdupq_n_f32(0.f);
    float var = 0.f;
    {
        const __fp16* ptr0 = ptr;

        int i = 0;
        for (; i + 7 < size; i += 8)
        {
            float16x8_t _p = vld1q_f16(ptr0);
            float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p));
            float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p));
            _p0 = vsubq_f32(_p0, _mean0);
            _p1 = vsubq_f32(_p1, _mean1);
            _var0 = vmlaq_f32(_var0, _p0, _p0);
            _var1 = vmlaq_f32(_var1, _p1, _p1);
            ptr0 += 8;
        }
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr0));
            _p = vsubq_f32(_p, _mean0);
            _var0 = vmlaq_f32(_var0, _p, _p);
            ptr0 += 4;
        }
        for (; i < size; i++)
        {
            float v = (float)ptr0[0] - mean;
            var += v * v;
            ptr0++;
        }
    }

    // turn the statistics into the scale a (kept in _var*/var) and the offset b (kept in _mean*/mean)
    if (elempack == 8)
    {
        float32x4_t _elemcount = vdupq_n_f32(elemcount);
        float32x4_t _eps = vdupq_n_f32(eps);
        _var0 = vdivq_f32(_var0, _elemcount);
        _var1 = vdivq_f32(_var1, _elemcount);
        _var0 = vaddq_f32(_var0, _eps);
        _var1 = vaddq_f32(_var1, _eps);
        // reciprocal square root: vrsqrte estimate followed by two vrsqrts refinement steps
        float32x4_t _rsqrt_var0 = vrsqrteq_f32(_var0);
        float32x4_t _rsqrt_var1 = vrsqrteq_f32(_var1);
        _rsqrt_var0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var0, _rsqrt_var0), _rsqrt_var0), _rsqrt_var0);
        _rsqrt_var1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var1, _rsqrt_var1), _rsqrt_var1), _rsqrt_var1);
        _var0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var0, _rsqrt_var0), _rsqrt_var0), _rsqrt_var0);
        _var1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var1, _rsqrt_var1), _rsqrt_var1), _rsqrt_var1);
        // b = -mean * a
        _mean0 = vmulq_f32(_mean0, _var0);
        _mean1 = vmulq_f32(_mean1, _var1);
        _mean0 = vnegq_f32(_mean0);
        _mean1 = vnegq_f32(_mean1);
    }
    if (elempack == 4)
    {
        // fold the two halves onto the 4 packed lanes before dividing
        _var0 = vaddq_f32(_var0, _var1);

        float32x4_t _elemcount = vdupq_n_f32(elemcount);
        float32x4_t _eps = vdupq_n_f32(eps);
        _var0 = vdivq_f32(_var0, _elemcount);
        _var0 = vaddq_f32(_var0, _eps);
        // reciprocal square root: vrsqrte estimate followed by two vrsqrts refinement steps
        float32x4_t _rsqrt_var = vrsqrteq_f32(_var0);
        _rsqrt_var = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var0, _rsqrt_var), _rsqrt_var), _rsqrt_var);
        _var0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_var0, _rsqrt_var), _rsqrt_var), _rsqrt_var);
        _var1 = _var0;
        // b = -mean * a
        _mean0 = vmulq_f32(_mean0, _var0);
        _mean0 = vnegq_f32(_mean0);
        _mean1 = _mean0;
    }
    if (elempack == 1)
    {
        // reduce all lanes and the scalar tail to one variance sum
        _var0 = vaddq_f32(_var0, _var1);
        var += vaddvq_f32(_var0);

        // scalar path uses sqrtf instead of the estimate/refine sequence
        var = 1.f / sqrtf(var / elemcount + eps);
        mean = -mean * var;
        _var0 = vdupq_n_f32(var);
        _var1 = _var0;
        _mean0 = vdupq_n_f32(mean);
        _mean1 = _mean0;
    }

    // pass 3: write x * a + b back, with the affine transform when gamma and beta are both given
    if (gamma_ptr && beta_ptr)
    {
        int i = 0;
        if (elempack == 8)
        {
            // one gamma/beta per packed element, broadcast to all 8 lanes
            for (; i + 7 < size; i += 8)
            {
                float16x8_t _p = vld1q_f16(ptr);
                float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p));
                float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p));
                float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]);
                float32x4_t _beta = vdupq_n_f32(beta_ptr[0]);
                _p0 = vmlaq_f32(_mean0, _p0, _var0);
                _p1 = vmlaq_f32(_mean1, _p1, _var1);
                _p0 = vmlaq_f32(_beta, _p0, _gamma);
                _p1 = vmlaq_f32(_beta, _p1, _gamma);
                _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1));
                vst1q_f16(ptr, _p);
                ptr += 8;
                gamma_ptr += 1;
                beta_ptr += 1;
            }
        }
        if (elempack == 4)
        {
            // two packed elements per iteration, each with its own broadcast gamma/beta
            for (; i + 7 < size; i += 8)
            {
                float16x8_t _p = vld1q_f16(ptr);
                float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p));
                float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p));
                float32x4_t _gamma0 = vdupq_n_f32(gamma_ptr[0]);
                float32x4_t _gamma1 = vdupq_n_f32(gamma_ptr[1]);
                float32x4_t _beta0 = vdupq_n_f32(beta_ptr[0]);
                float32x4_t _beta1 = vdupq_n_f32(beta_ptr[1]);
                _p0 = vmlaq_f32(_mean0, _p0, _var0);
                _p1 = vmlaq_f32(_mean1, _p1, _var1);
                _p0 = vmlaq_f32(_beta0, _p0, _gamma0);
                _p1 = vmlaq_f32(_beta1, _p1, _gamma1);
                _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1));
                vst1q_f16(ptr, _p);
                ptr += 8;
                gamma_ptr += 2;
                beta_ptr += 2;
            }
            // remaining single packed element
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
                float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]);
                float32x4_t _beta = vdupq_n_f32(beta_ptr[0]);
                _p = vmlaq_f32(_mean0, _p, _var0);
                _p = vmlaq_f32(_beta, _p, _gamma);
                vst1_f16(ptr, vcvt_f16_f32(_p));
                ptr += 4;
                gamma_ptr += 1;
                beta_ptr += 1;
            }
        }
        if (elempack == 1)
        {
            // one gamma/beta per scalar, loaded as vectors
            for (; i + 7 < size; i += 8)
            {
                float16x8_t _p = vld1q_f16(ptr);
                float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p));
                float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p));
                float32x4_t _gamma0 = vld1q_f32(gamma_ptr);
                float32x4_t _gamma1 = vld1q_f32(gamma_ptr + 4);
                float32x4_t _beta0 = vld1q_f32(beta_ptr);
                float32x4_t _beta1 = vld1q_f32(beta_ptr + 4);
                _p0 = vmlaq_f32(_mean0, _p0, _var0);
                _p1 = vmlaq_f32(_mean1, _p1, _var1);
                _p0 = vmlaq_f32(_beta0, _p0, _gamma0);
                _p1 = vmlaq_f32(_beta1, _p1, _gamma1);
                _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1));
                vst1q_f16(ptr, _p);
                ptr += 8;
                gamma_ptr += 8;
                beta_ptr += 8;
            }
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
                float32x4_t _gamma = vld1q_f32(gamma_ptr);
                float32x4_t _beta = vld1q_f32(beta_ptr);
                _p = vmlaq_f32(_mean0, _p, _var0);
                _p = vmlaq_f32(_beta, _p, _gamma);
                vst1_f16(ptr, vcvt_f16_f32(_p));
                ptr += 4;
                gamma_ptr += 4;
                beta_ptr += 4;
            }
        }
        // scalar tail; the scalar var/mean are finalized only by the elempack == 1 branch above
        for (; i < size; i++)
        {
            float v = (float)ptr[0];
            ptr[0] = (__fp16)((v * var + mean) * gamma_ptr[0] + beta_ptr[0]);
            ptr++;
            gamma_ptr++;
            beta_ptr++;
        }
    }
    else
    {
        // no affine parameters: the same loops serve every elempack because
        // _var0/_var1 and _mean0/_mean1 were laid out per lane above
        int i = 0;
        for (; i + 7 < size; i += 8)
        {
            float16x8_t _p = vld1q_f16(ptr);
            float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p));
            float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p));
            _p0 = vmlaq_f32(_mean0, _p0, _var0);
            _p1 = vmlaq_f32(_mean1, _p1, _var1);
            _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1));
            vst1q_f16(ptr, _p);
            ptr += 8;
        }
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
            _p = vmlaq_f32(_mean0, _p, _var0);
            vst1_f16(ptr, vcvt_f16_f32(_p));
            ptr += 4;
        }
        // scalar tail; the scalar var/mean are finalized only by the elempack == 1 branch above
        for (; i < size; i++)
        {
            float v = (float)ptr[0];
            ptr[0] = (__fp16)(v * var + mean);
            ptr++;
        }
    }
}

// Layer-normalize a blob stored as __fp16 in place.
// The blob rank selects how the data is split into normalization groups;
// every group is handed to layernorm_fp16s(). Always returns 0.
int LayerNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int rank = bottom_top_blob.dims;
    const int width = bottom_top_blob.w;
    const int height = bottom_top_blob.h;
    const int pack = bottom_top_blob.elempack;

    if (rank == 1)
    {
        // assert affine_size == w
        // the whole vector forms a single group, packed lanes are passed as plain scalars
        __fp16* data = bottom_top_blob;
        layernorm_fp16s(data, gamma_data, beta_data, eps, width * pack, 1);
        return 0;
    }

    if (rank == 2)
    {
        // assert affine_size == w
        // one group per row
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < height; y++)
        {
            __fp16* line = bottom_top_blob.row<__fp16>(y);
            layernorm_fp16s(line, gamma_data, beta_data, eps, width, pack);
        }
        return 0;
    }

    if (rank == 3)
    {
        const int planes = bottom_top_blob.c;

        if (affine_size == width)
        {
            // one group per row of every channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < planes; q++)
            {
                Mat plane = bottom_top_blob.channel(q);
                for (int y = 0; y < height; y++)
                {
                    __fp16* line = plane.row<__fp16>(y);
                    layernorm_fp16s(line, gamma_data, beta_data, eps, width, pack);
                }
            }
        }
        else // if (affine_size == w * h)
        {
            // one group per channel
            const int plane_size = width * height;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < planes; q++)
            {
                __fp16* data = bottom_top_blob.channel(q);
                layernorm_fp16s(data, gamma_data, beta_data, eps, plane_size, pack);
            }
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/lrn_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "lrn_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#endif // __ARM_NEON

namespace ncnn {

// Local response normalization applied in place to a float blob.
//
// Every element x is replaced by x * (bias + alpha / N * S)^(-beta), where S is the
// sum of squared inputs over the normalization window and N is the window size:
// local_size neighbouring channels for NormRegion_ACROSS_CHANNELS, a
// local_size x local_size spatial window for NormRegion_WITHIN_CHANNEL.
//
// Returns 0 on success and -100 when a workspace blob cannot be allocated.
int LRN_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    const int channels = bottom_top_blob.c;
    const size_t elemsize = bottom_top_blob.elemsize;
    const int size = w * h;

    // element-wise square of the input
    Mat square_blob;
    square_blob.create(w, h, channels, elemsize, opt.workspace_allocator);
    if (square_blob.empty())
        return -100;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        const float* src = bottom_top_blob.channel(q);
        float* dst = square_blob.channel(q);

        int i = 0;
#if __ARM_NEON
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _x = vld1q_f32(src + i);
            float32x4_t _xx = vmulq_f32(_x, _x);
            vst1q_f32(dst + i, _xx);
        }
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            dst[i] = src[i] * src[i];
        }
    }

    if (region_type == NormRegion_ACROSS_CHANNELS)
    {
        Mat square_sum;
        square_sum.create(w, h, channels, elemsize, opt.workspace_allocator);
        if (square_sum.empty())
            return -100;
        square_sum.fill(0.f);

        const float alpha_div_size = alpha / local_size;
        const int half_window = local_size / 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* acc = square_sum.channel(q);

            // accumulate the squares of the neighbouring channels that exist
            for (int p = q - half_window; p <= q + half_window; p++)
            {
                if (p < 0 || p >= channels)
                    continue;

                const float* sq = square_blob.channel(p);

                int i = 0;
#if __ARM_NEON
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _sq = vld1q_f32(sq + i);
                    float32x4_t _acc = vld1q_f32(acc + i);
                    _acc = vaddq_f32(_acc, _sq);
                    vst1q_f32(acc + i, _acc);
                }
#endif // __ARM_NEON
                for (; i < size; i++)
                {
                    acc[i] += sq[i];
                }
            }

            // scale the input by (bias + alpha / local_size * sum)^(-beta)
            float* data = bottom_top_blob.channel(q);

            int i = 0;
#if __ARM_NEON
            float32x4_t _bias = vdupq_n_f32(bias);
            float32x4_t _ads = vdupq_n_f32(alpha_div_size);
            float32x4_t _mb = vdupq_n_f32(-beta);
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _x = vld1q_f32(data + i);
                float32x4_t _s = vld1q_f32(acc + i);
                _s = vmulq_f32(_s, _ads);
                _s = vaddq_f32(_s, _bias);
                _s = pow_ps(_s, _mb);
                _x = vmulq_f32(_x, _s);
                vst1q_f32(data + i, _x);
            }
#endif // __ARM_NEON
            for (; i < size; i++)
            {
                data[i] = data[i] * powf(bias + alpha_div_size * acc[i], -beta);
            }
        }
    }
    else if (region_type == NormRegion_WITHIN_CHANNEL)
    {
        const int outw = w;
        const int outh = h;

        // zero-pad the squared blob so every output pixel has a full window
        Mat square_blob_bordered = square_blob;
        const int pad = local_size / 2;
        if (pad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f, opt_b);
            if (square_blob_bordered.empty())
                return -100;

            w = square_blob_bordered.w;
            h = square_blob_bordered.h;
        }

        const int maxk = local_size * local_size;

        const float alpha_div_size = alpha / maxk;

        // offsets of the window elements relative to its top-left corner,
        // expressed in elements of the (possibly bordered) row width w
        std::vector<int> _space_ofs(maxk);
        int* space_ofs = &_space_ofs[0];
        for (int i = 0; i < local_size; i++)
        {
            for (int j = 0; j < local_size; j++)
            {
                space_ofs[i * local_size + j] = i * w + j;
            }
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* data = bottom_top_blob.channel(q);
            const Mat m = square_blob_bordered.channel(q);

            for (int i = 0; i < outh; i++)
            {
                float* outrow = data + i * outw;
                const float* window_row = m.row(i);

                for (int j = 0; j < outw; j++)
                {
                    const float* window = window_row + j;

                    float ss = 0.f;
                    for (int k = 0; k < maxk; k++)
                    {
                        ss += window[space_ofs[k]];
                    }

                    outrow[j] = outrow[j] * powf(bias + alpha_div_size * ss, -beta);
                }
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/lrn_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_LRN_ARM_H
#define LAYER_LRN_ARM_H

#include "lrn.h"

namespace ncnn {

// ARM-specific LRN layer, derived from the generic LRN.
class LRN_arm : public LRN
{
public:
    // in-place local response normalization; the implementation in lrn_arm.cpp
    // uses NEON code paths when __ARM_NEON is defined
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_LRN_ARM_H


================================================
FILE: src/layer/arm/lstm_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "lstm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

#include "cpu.h"

namespace ncnn {

#include "lstm_int8.h"

// Sets the flags that advertise which reduced-precision storage formats this layer accepts.
LSTM_arm::LSTM_arm()
{
#if NCNN_BF16
    // bf16 storage is always accepted when compiled in
    support_bf16_storage = true;
#endif

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage is accepted only if cpu_support_arm_asimdhp() reports the feature
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif
}

// Prepare the layer weights for inference.
//
// int8, fp16-storage and bf16-storage configurations are handed to their own
// create_pipeline_* routines. Otherwise the fp32 weights and biases are repacked so
// that the I, F, O and G gate values belonging to the same position sit next to each
// other in one 4-float packed element.
//
// Returns 0 on success, -100 when a packed blob cannot be allocated, or the result of
// the delegated create_pipeline_* routine.
int LSTM_arm::create_pipeline(const Option& opt)
{
#if NCNN_INT8
    if (int8_scale_term)
    {
        return create_pipeline_int8(opt);
    }
#endif

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage)
    {
        return create_pipeline_fp16s(opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        return create_pipeline_bf16s(opt);
    }
#endif

    // pack IFOG
    int num_directions = direction == 2 ? 2 : 1;
    // input feature count per gate row
    int size = weight_data_size / num_directions / hidden_size / 4;

    // one 16-byte element (elempack 4) holds the I, F, O, G values of a single weight
    weight_xc_data_packed.create(size, hidden_size, num_directions, 16u, 4);
    bias_c_data_packed.create(hidden_size, 1, num_directions, 16u, 4);
    weight_hc_data_packed.create(num_output, hidden_size, num_directions, 16u, 4);

    // the packing loops below write straight into these blobs, so bail out
    // instead of dereferencing a null buffer when an allocation failed
    if (weight_xc_data_packed.empty() || bias_c_data_packed.empty() || weight_hc_data_packed.empty())
        return -100;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int dr = 0; dr < num_directions; dr++)
    {
        const Mat weight_xc = weight_xc_data.channel(dr);
        const Mat bias_c = bias_c_data.channel(dr);
        const Mat weight_hc = weight_hc_data.channel(dr);

        Mat weight_xc_data_packed_dr = weight_xc_data_packed.channel(dr);
        Mat bias_c_data_packed_dr = bias_c_data_packed.channel(dr);
        Mat weight_hc_data_packed_dr = weight_hc_data_packed.channel(dr);

        // the source bias has one row per gate
        const float* bias_c_I = bias_c.row(0);
        const float* bias_c_F = bias_c.row(1);
        const float* bias_c_O = bias_c.row(2);
        const float* bias_c_G = bias_c.row(3);

        float* bias_c_IFOG = bias_c_data_packed_dr.row(0);

        for (int q = 0; q < hidden_size; q++)
        {
            bias_c_IFOG[0] = bias_c_I[q];
            bias_c_IFOG[1] = bias_c_F[q];
            bias_c_IFOG[2] = bias_c_O[q];
            bias_c_IFOG[3] = bias_c_G[q];

            bias_c_IFOG += 4;

            // the source weights store the gates as four consecutive groups of hidden_size rows
            const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q);
            const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q);
            const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q);
            const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q);

            const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q);
            const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q);
            const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q);
            const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q);

            float* weight_xc_IFOG = weight_xc_data_packed_dr.row(q);
            float* weight_hc_IFOG = weight_hc_data_packed_dr.row(q);

            // interleave the input weights of hidden unit q
            for (int i = 0; i < size; i++)
            {
                weight_xc_IFOG[0] = weight_xc_I[i];
                weight_xc_IFOG[1] = weight_xc_F[i];
                weight_xc_IFOG[2] = weight_xc_O[i];
                weight_xc_IFOG[3] = weight_xc_G[i];

                weight_xc_IFOG += 4;
            }

            // interleave the recurrent weights of hidden unit q
            for (int i = 0; i < num_output; i++)
            {
                weight_hc_IFOG[0] = weight_hc_I[i];
                weight_hc_IFOG[1] = weight_hc_F[i];
                weight_hc_IFOG[2] = weight_hc_O[i];
                weight_hc_IFOG[3] = weight_hc_G[i];

                weight_hc_IFOG += 4;
            }
        }
    }

    // the unpacked originals are no longer read by this function; drop them in light mode
    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
}

static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt)
{
    int size = bottom_blob.w;
    int T = bottom_blob.h;

    int num_output = top_blob.w;
    int hidden_size = cell_state.w;

    // 4 x hidden_size
    Mat gates(4, hidden_size, 4u, opt.workspace_allocator);
    if (gates.empty())
        return -100;

    Mat tmp_hidden_state;
    if (num_output != hidden_size)
    {
        tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator);
        if (tmp_hidden_state.empty())
            return -100;
    }

    // unroll
    for (int t = 0; t < T; t++)
    {
        // clip hidden by continuation indicator
        // h_cont_{t-1} = cont_t * h_{t-1}
        // h_cont_{t-1} = h_{t-1} if cont_t == 1
        //                0       otherwise
        // calculate hidden
        // gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c

        int ti = reverse ? T - 1 - t : t;

        const float* x = bottom_blob.row(ti);
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < hidden_size; q++)
        {
            const float* bias_c_IFOG = (const float*)bias_c + q * 4;

            // gate I F O G
            const float* weight_xc_IFOG = weight_xc.row(q);

            const float* weight_hc_IFOG = weight_hc.row(q);

#if __ARM_NEON
            float32x4_t _IFOG = vld1q_f32(bias_c_IFOG);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);
#else
            float I = bias_c_IFOG[0];
            float F = bias_c_IFOG[1];
            float O = bias_c_IFOG[2];
            float G = bias_c_IFOG[3];
#endif // __ARM_NEON

            int i = 0;
#if __ARM_NEON
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _xi = vld1q_f32(x + i);

                float32x4_t _weight_xc_IFOG_0 = vld1q_f32(weight_xc_IFOG);
                float32x4_t _weight_xc_IFOG_1 = vld1q_f32(weight_xc_IFOG + 4);
                float32x4_t _weight_xc_IFOG_2 = vld1q_f32(weight_xc_IFOG + 8);
                float32x4_t _weight_xc_IFOG_3 = vld1q_f32(weight_xc_IFOG + 12);

#if __aarch64__
                _IFOG = vfmaq_laneq_f32(_IFOG, _weight_xc_IFOG_0, _xi, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_IFOG_1, _xi, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_IFOG_2, _xi, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_IFOG_3, _xi, 3);
#else
                _IFOG = vmlaq_lane_f32(_IFOG, _weight_xc_IFOG_0, vget_low_f32(_xi), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _weight_xc_IFOG_1, vget_low_f32(_xi), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _weight_xc_IFOG_2, vget_high_f32(_xi), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _weight_xc_IFOG_3, vget_high_f32(_xi), 1);
#endif

                weight_xc_IFOG += 16;
            }
#endif // __ARM_NEON
            for (; i < size; i++)
            {
                float xi = x[i];

#if __ARM_NEON
                float32x4_t _xi = vdupq_n_f32(xi);
                float32x4_t _weight_xc_IFOG = vld1q_f32(weight_xc_IFOG);
                _IFOG = vmlaq_f32(_IFOG, _weight_xc_IFOG, _xi);
#else
                I += weight_xc_IFOG[0] * xi;
                F += weight_xc_IFOG[1] * xi;
                O += weight_xc_IFOG[2] * xi;
                G += weight_xc_IFOG[3] * xi;
#endif // __ARM_NEON

                weight_xc_IFOG += 4;
            }

            i = 0;
#if __ARM_NEON
            for (; i + 3 < num_output; i += 4)
            {
                float32x4_t _h_cont = vld1q_f32((const float*)hidden_state + i);

                float32x4_t _weight_hc_IFOG_0 = vld1q_f32(weight_hc_IFOG);
                float32x4_t _weight_hc_IFOG_1 = vld1q_f32(weight_hc_IFOG + 4);
                float32x4_t _weight_hc_IFOG_2 = vld1q_f32(weight_hc_IFOG + 8);
                float32x4_t _weight_hc_IFOG_3 = vld1q_f32(weight_hc_IFOG + 12);

#if __aarch64__
                _IFOG = vfmaq_laneq_f32(_IFOG, _weight_hc_IFOG_0, _h_cont, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_IFOG_1, _h_cont, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_IFOG_2, _h_cont, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_IFOG_3, _h_cont, 3);
#else
                _IFOG = vmlaq_lane_f32(_IFOG, _weight_hc_IFOG_0, vget_low_f32(_h_cont), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _weight_hc_IFOG_1, vget_low_f32(_h_cont), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _weight_hc_IFOG_2, vget_high_f32(_h_cont), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _weight_hc_IFOG_3, vget_high_f32(_h_cont), 1);
#endif

                weight_hc_IFOG += 16;
            }
#endif // __ARM_NEON
            for (; i < num_output; i++)
            {
                float h_cont = hidden_state[i];

#if __ARM_NEON
                float32x4_t _h_cont = vdupq_n_f32(h_cont);
                float32x4_t _weight_hc_IFOG = vld1q_f32(weight_hc_IFOG);
                _IFOG = vmlaq_f32(_IFOG, _weight_hc_IFOG, _h_cont);
#else
                I += weight_hc_IFOG[0] * h_cont;
                F += weight_hc_IFOG[1] * h_cont;
                O += weight_hc_IFOG[2] * h_cont;
                G += weight_hc_IFOG[3] * h_cont;
#endif // __ARM_NEON

                weight_hc_IFOG += 4;
            }

            float* gates_data = gates.row(q);

#if __ARM_NEON
            _IFOG = vaddq_f32(_IFOG, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _IFOG = vaddq_f32(_IFOG, _sum2);

            vst1q_f32(gates_data, _IFOG);
#else
            gates_data[0] = I;
            gates_data[1] = F;
            gates_data[2] = O;
            gates_data[3] = G;
#endif // __ARM_NEON
        }

        // lstm unit
        // sigmoid(I)
        // sigmoid(F)
        // sigmoid(O)
        // tanh(G)
        // c_t := f_t .* c_{t-1} + i_t .* g_t
        // h_t := o_t .* tanh[c_t]
        float* output_data = top_blob.row(ti);

        float* cell_ptr = cell_state;
        float* hidden_ptr = hidden_state;
        float* tmp_hidden_ptr = tmp_hidden_state;

        int remain_hidden_size_start = 0;
#if __ARM_NEON
        int nn_hidden_size = hidden_size >> 2;
        remain_hidden_size_start = nn_hidden_size << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_hidden_size; qq++)
        {
            int q = qq * 4;

            const float* gates_data = gates.row(q);

            float32x4x4_t _IFOG_4x4 = vld4q_f32(gates_data);

            float32x4_t _lstm_I = sigmoid_ps(_IFOG_4x4.val[0]);
            float32x4_t _lstm_F = sigmoid_ps(_IFOG_4x4.val[1]);
            float32x4_t _lstm_O = sigmoid_ps(_IFOG_4x4.val[2]);
            float32x4_t _lstm_G = tanh_ps(_IFOG_4x4.val[3]);

            float32x4_t _cell2 = vaddq_f32(vmulq_f32(_lstm_F, vld1q_f32(cell_ptr + q)), vmulq_f32(_lstm_I, _lstm_G));
            float32x4_t _lstm_H = vmulq_f32(_lstm_O, tanh_ps(_cell2));

            vst1q_f32(cell_ptr + q, _cell2);

            if (num_output == hidden_size)
            {
                vst1q_f32(hidden_ptr + q, _lstm_H);
                vst1q_f32(output_data + q, _lstm_H);
            }
            else
            {
                vst1q_f32(tmp_hidden_ptr + q, _lstm_H);
            }
        }
#endif // __ARM_NEON
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_hidden_size_start; q < hidden_size; q++)
        {
            const float* gates_data = gates.row(q);

            float I = gates_data[0];
            float F = gates_data[1];
            float O = gates_data[2];
            float G = gates_data[3];

            I = 1.f / (1.f + expf(-I));
            F = 1.f / (1.f + expf(-F));
            O = 1.f / (1.f + expf(-O));
            G = tanhf(G);

            float cell2 = F * cell_ptr[q] + I * G;
            float H = O * tanhf(cell2);

            cell_ptr[q] = cell2;
            if (num_output == hidden_size)
            {
                hidden_ptr[q] = H;
                output_data[q] = H;
            }
            else
            {
                tmp_hidden_ptr[q] = H;
            }
        }

        if (num_output != hidden_size)
        {
            // int nn_num_output = num_output >> 2;
            // int remain_num_output_start = nn_num_output << 2;
            // #pragma omp parallel for num_threads(opt.num_threads)
            // for (int qq = 0; qq < nn_num_output; qq++)
            // {
            //     int q = qq * 4;
            //
            // }
            int remain_num_output_start = 0;
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = remain_num_output_start; q < num_output; q++)
            {
                const float* hr = weight_hr.row(q);
                const float* tmp_hidden_ptr = tmp_hidden_state;

                float H = 0;
                for (int i = 0; i < hidden_size; i++)
                {
                    H += tmp_hidden_ptr[i] * hr[i];
                }

                hidden_ptr[q] = H;
                output_data[q] = H;
            }
        }
    }

    return 0;
}

int LSTM_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (int8_scale_term)
    {
        return forward_int8(bottom_blob, top_blob, opt);
    }
#endif

    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_fp16s(bottom_blob, top_blob, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
#endif

    int T = bottom_blob.h;

    int num_directions = direction == 2 ? 2 : 1;

    // initial hidden state
    Mat hidden(num_output, 4u, opt.workspace_allocator);
    if (hidden.empty())
        return -100;
    hidden.fill(0.f);

    Mat cell(hidden_size, 4u, opt.workspace_allocator);
    if (cell.empty())
        return -100;
    cell.fill(0.f);

    top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        {
            int ret = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt);
            if (ret != 0)
                return ret;
        }

        hidden.fill(0.0f);
        cell.fill(0.0f);

        {
            int ret = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden, cell, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const float* pf = top_blob_forward.row(i);
            const float* pr = top_blob_reverse.row(i);
            float* ptr = top_blob.row(i);

            memcpy(ptr, pf, num_output * sizeof(float));
            memcpy(ptr + num_output, pr, num_output * sizeof(float));
        }
    }

    return 0;
}

// Multi-blob forward pass (fp32 path, after int8 / fp16 / bf16 dispatch).
//   bottom_blobs[0]      input sequence
//   bottom_blobs[1], [2] initial hidden / cell state, used only when exactly three inputs are given;
//                        otherwise both states start at zero
//   top_blobs[0]         output sequence
//   top_blobs[1], [2]    final hidden / cell state, written only when three outputs are requested
// Returns 0 on success, -100 when an allocation (or state copy) fails, or the kernel's error code.
int LSTM_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
#if NCNN_INT8
    if (int8_scale_term)
    {
        return forward_int8(bottom_blobs, top_blobs, opt);
    }
#endif

    const Mat& bottom_blob = bottom_blobs[0];
    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_fp16s(bottom_blobs, top_blobs, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blobs, top_blobs, opt);
#endif

    int T = bottom_blob.h;
    int num_directions = direction == 2 ? 2 : 1;

    Mat hidden;
    Mat cell;
    // the states are handed back to the caller when three outputs are requested,
    // so in that case they are taken from the blob allocator instead of the workspace
    Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator;
    if (bottom_blobs.size() == 3)
    {
        // work on copies: the kernel updates the states in place
        hidden = bottom_blobs[1].clone(hidden_cell_allocator);
        cell = bottom_blobs[2].clone(hidden_cell_allocator);

        // an empty copy (failed allocation or empty state input) cannot be fed to the kernel,
        // which reads and writes these buffers unconditionally
        if (hidden.empty() || cell.empty())
            return -100;
    }
    else
    {
        hidden.create(num_output, num_directions, 4u, hidden_cell_allocator);
        if (hidden.empty())
            return -100;
        hidden.fill(0.f);

        cell.create(hidden_size, num_directions, 4u, hidden_cell_allocator);
        if (cell.empty())
            return -100;
        cell.fill(0.f);
    }

    Mat& top_blob = top_blobs[0];
    top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        // row 0 of each state blob belongs to the forward pass
        Mat hidden0 = hidden.row_range(0, 1);
        Mat cell0 = cell.row_range(0, 1);
        {
            int ret = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden0, cell0, opt);
            if (ret != 0)
                return ret;
        }

        // row 1 of each state blob belongs to the reverse pass
        Mat hidden1 = hidden.row_range(1, 1);
        Mat cell1 = cell.row_range(1, 1);
        {
            int ret = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden1, cell1, opt);
            if (ret != 0)
                return ret;
        }

        // concat w: each output row is [ forward | reverse ]
        for (int i = 0; i < T; i++)
        {
            const float* pf = top_blob_forward.row(i);
            const float* pr = top_blob_reverse.row(i);
            float* ptr = top_blob.row(i);

            memcpy(ptr, pf, num_output * sizeof(float));
            memcpy(ptr + num_output, pr, num_output * sizeof(float));
        }
    }

    if (top_blobs.size() == 3)
    {
        top_blobs[1] = hidden;
        top_blobs[2] = cell;
    }

    return 0;
}

#if NCNN_BF16
// Run one direction of the LSTM recurrence over a bf16 sequence.
//   bottom_blob   input, one row per time step, elements read as bf16 (unsigned short)
//   top_blob      output, one row per time step, elements written as bf16
//   reverse       non-zero walks the time steps from last to first
//   weight_xc     input weights, row q holds the bf16 I/F/O/G values for hidden unit q, 4 per input element
//   bias_c        bf16 bias, 4 values (I, F, O, G) per hidden unit
//   weight_hc     recurrent weights, same bf16 I/F/O/G interleaving as weight_xc
//   weight_hr     fp32 projection weights, only read when num_output != hidden_size
//   hidden_state  fp32 hidden state, read each step and overwritten with the new state
//   cell_state    fp32 cell state, read each step and overwritten with the new state
// All arithmetic is carried out in fp32; only loads of x / weights / bias and stores to top_blob convert.
// Returns 0 on success, -100 if a workspace allocation fails.
static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt)
{
    int size = bottom_blob.w;
    int T = bottom_blob.h;

    int num_output = top_blob.w;
    int hidden_size = cell_state.w;

    // 4 x hidden_size
    // fp32 scratch: row q holds the pre-activation I, F, O, G sums of hidden unit q
    Mat gates(4, hidden_size, 4u, opt.workspace_allocator);
    if (gates.empty())
        return -100;

    // with a projection the raw hidden vector (hidden_size wide) is staged here
    // before being reduced to num_output values
    Mat tmp_hidden_state;
    if (num_output != hidden_size)
    {
        tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator);
        if (tmp_hidden_state.empty())
            return -100;
    }

    // unroll
    for (int t = 0; t < T; t++)
    {
        // clip hidden by continuation indicator
        // h_cont_{t-1} = cont_t * h_{t-1}
        // h_cont_{t-1} = h_{t-1} if cont_t == 1
        //                0       otherwise
        // calculate hidden
        // gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c

        // time index actually processed in this iteration
        int ti = reverse ? T - 1 - t : t;

        const unsigned short* x = bottom_blob.row<const unsigned short>(ti);
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < hidden_size; q++)
        {
            const unsigned short* bias_c_IFOG = (const unsigned short*)bias_c + q * 4;

            // gate I F O G
            const unsigned short* weight_xc_IFOG = weight_xc.row<const unsigned short>(q);

            const unsigned short* weight_hc_IFOG = weight_hc.row<const unsigned short>(q);

#if __ARM_NEON
            // _IFOG starts from the bias; _sum1.._sum3 are extra accumulators for the
            // 4-way unrolled loops and are folded into _IFOG before the store
            float32x4_t _IFOG = bfloat2float(vld1_u16(bias_c_IFOG));
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);
#else
            float I = bfloat16_to_float32(bias_c_IFOG[0]);
            float F = bfloat16_to_float32(bias_c_IFOG[1]);
            float O = bfloat16_to_float32(bias_c_IFOG[2]);
            float G = bfloat16_to_float32(bias_c_IFOG[3]);
#endif // __ARM_NEON

            // W_xc * x_t: four input elements per iteration, each scaling its own IFOG weight vector
            int i = 0;
#if __ARM_NEON
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _xi = bfloat2float(vld1_u16(x + i));

                float32x4_t _weight_xc_IFOG_0 = bfloat2float(vld1_u16(weight_xc_IFOG));
                float32x4_t _weight_xc_IFOG_1 = bfloat2float(vld1_u16(weight_xc_IFOG + 4));
                float32x4_t _weight_xc_IFOG_2 = bfloat2float(vld1_u16(weight_xc_IFOG + 8));
                float32x4_t _weight_xc_IFOG_3 = bfloat2float(vld1_u16(weight_xc_IFOG + 12));

#if __aarch64__
                _IFOG = vfmaq_laneq_f32(_IFOG, _weight_xc_IFOG_0, _xi, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_IFOG_1, _xi, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_IFOG_2, _xi, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_IFOG_3, _xi, 3);
#else
                _IFOG = vmlaq_lane_f32(_IFOG, _weight_xc_IFOG_0, vget_low_f32(_xi), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _weight_xc_IFOG_1, vget_low_f32(_xi), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _weight_xc_IFOG_2, vget_high_f32(_xi), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _weight_xc_IFOG_3, vget_high_f32(_xi), 1);
#endif

                weight_xc_IFOG += 16;
            }
#endif // __ARM_NEON
            // leftover input elements, one at a time
            for (; i < size; i++)
            {
#if __ARM_NEON
                unsigned short xi = x[i];

                float32x4_t _xi = bfloat2float(vdup_n_u16(xi));
                float32x4_t _weight_xc_IFOG = bfloat2float(vld1_u16(weight_xc_IFOG));
                _IFOG = vmlaq_f32(_IFOG, _weight_xc_IFOG, _xi);
#else
                float xi = bfloat16_to_float32(x[i]);

                I += bfloat16_to_float32(weight_xc_IFOG[0]) * xi;
                F += bfloat16_to_float32(weight_xc_IFOG[1]) * xi;
                O += bfloat16_to_float32(weight_xc_IFOG[2]) * xi;
                G += bfloat16_to_float32(weight_xc_IFOG[3]) * xi;
#endif // __ARM_NEON

                weight_xc_IFOG += 4;
            }

            // W_hc * h_{t-1}: the hidden state is already fp32, only the weights are converted
            i = 0;
#if __ARM_NEON
            for (; i + 3 < num_output; i += 4)
            {
                float32x4_t _h_cont = vld1q_f32((const float*)hidden_state + i);

                float32x4_t _weight_hc_IFOG_0 = bfloat2float(vld1_u16(weight_hc_IFOG));
                float32x4_t _weight_hc_IFOG_1 = bfloat2float(vld1_u16(weight_hc_IFOG + 4));
                float32x4_t _weight_hc_IFOG_2 = bfloat2float(vld1_u16(weight_hc_IFOG + 8));
                float32x4_t _weight_hc_IFOG_3 = bfloat2float(vld1_u16(weight_hc_IFOG + 12));

#if __aarch64__
                _IFOG = vfmaq_laneq_f32(_IFOG, _weight_hc_IFOG_0, _h_cont, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_IFOG_1, _h_cont, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_IFOG_2, _h_cont, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_IFOG_3, _h_cont, 3);
#else
                _IFOG = vmlaq_lane_f32(_IFOG, _weight_hc_IFOG_0, vget_low_f32(_h_cont), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _weight_hc_IFOG_1, vget_low_f32(_h_cont), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _weight_hc_IFOG_2, vget_high_f32(_h_cont), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _weight_hc_IFOG_3, vget_high_f32(_h_cont), 1);
#endif

                weight_hc_IFOG += 16;
            }
#endif // __ARM_NEON
            // leftover hidden elements, one at a time
            for (; i < num_output; i++)
            {
                float h_cont = hidden_state[i];

#if __ARM_NEON
                float32x4_t _h_cont = vdupq_n_f32(h_cont);
                float32x4_t _weight_hc_IFOG = bfloat2float(vld1_u16(weight_hc_IFOG));
                _IFOG = vmlaq_f32(_IFOG, _weight_hc_IFOG, _h_cont);
#else
                I += bfloat16_to_float32(weight_hc_IFOG[0]) * h_cont;
                F += bfloat16_to_float32(weight_hc_IFOG[1]) * h_cont;
                O += bfloat16_to_float32(weight_hc_IFOG[2]) * h_cont;
                G += bfloat16_to_float32(weight_hc_IFOG[3]) * h_cont;
#endif // __ARM_NEON

                weight_hc_IFOG += 4;
            }

            float* gates_data = gates.row(q);

#if __ARM_NEON
            // fold the partial accumulators and store I, F, O, G for this hidden unit
            _IFOG = vaddq_f32(_IFOG, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _IFOG = vaddq_f32(_IFOG, _sum2);

            vst1q_f32(gates_data, _IFOG);
#else
            gates_data[0] = I;
            gates_data[1] = F;
            gates_data[2] = O;
            gates_data[3] = G;
#endif // __ARM_NEON
        }

        // lstm unit
        // sigmoid(I)
        // sigmoid(F)
        // sigmoid(O)
        // tanh(G)
        // c_t := f_t .* c_{t-1} + i_t .* g_t
        // h_t := o_t .* tanh[c_t]
        unsigned short* output_data = top_blob.row<unsigned short>(ti);

        float* cell_ptr = cell_state;
        float* hidden_ptr = hidden_state;
        float* tmp_hidden_ptr = tmp_hidden_state;

        int remain_hidden_size_start = 0;
#if __ARM_NEON
        int nn_hidden_size = hidden_size >> 2;
        remain_hidden_size_start = nn_hidden_size << 2;

        // four hidden units per iteration
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_hidden_size; qq++)
        {
            int q = qq * 4;

            const float* gates_data = gates.row(q);

            // de-interleaving load: val[0..3] hold the I, F, O, G values of four consecutive units
            float32x4x4_t _IFOG_4x4 = vld4q_f32(gates_data);

            float32x4_t _lstm_I = sigmoid_ps(_IFOG_4x4.val[0]);
            float32x4_t _lstm_F = sigmoid_ps(_IFOG_4x4.val[1]);
            float32x4_t _lstm_O = sigmoid_ps(_IFOG_4x4.val[2]);
            float32x4_t _lstm_G = tanh_ps(_IFOG_4x4.val[3]);

            float32x4_t _cell2 = vaddq_f32(vmulq_f32(_lstm_F, vld1q_f32(cell_ptr + q)), vmulq_f32(_lstm_I, _lstm_G));
            float32x4_t _lstm_H = vmulq_f32(_lstm_O, tanh_ps(_cell2));

            vst1q_f32(cell_ptr + q, _cell2);

            if (num_output == hidden_size)
            {
                // no projection: H is both the next hidden state (fp32) and the output (bf16)
                vst1q_f32(hidden_ptr + q, _lstm_H);
                vst1_u16(output_data + q, float2bfloat(_lstm_H));
            }
            else
            {
                // projection pending: stage H for the weight_hr product below
                vst1q_f32(tmp_hidden_ptr + q, _lstm_H);
            }
        }
#endif // __ARM_NEON
        // scalar path: all units without NEON, otherwise the hidden_size % 4 tail
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_hidden_size_start; q < hidden_size; q++)
        {
            const float* gates_data = gates.row(q);

            float I = gates_data[0];
            float F = gates_data[1];
            float O = gates_data[2];
            float G = gates_data[3];

            I = 1.f / (1.f + expf(-I));
            F = 1.f / (1.f + expf(-F));
            O = 1.f / (1.f + expf(-O));
            G = tanhf(G);

            float cell2 = F * cell_ptr[q] + I * G;
            float H = O * tanhf(cell2);

            cell_ptr[q] = cell2;
            if (num_output == hidden_size)
            {
                hidden_ptr[q] = H;
                output_data[q] = float32_to_bfloat16(H);
            }
            else
            {
                tmp_hidden_ptr[q] = H;
            }
        }

        // projection: hidden[q] = dot(tmp_hidden, weight_hr.row(q)) for each of the num_output outputs
        if (num_output != hidden_size)
        {
            // int nn_num_output = num_output >> 2;
            // int remain_num_output_start = nn_num_output << 2;
            // #pragma omp parallel for num_threads(opt.num_threads)
            // for (int qq = 0; qq < nn_num_output; qq++)
            // {
            //     int q = qq * 4;
            //
            // }
            int remain_num_output_start = 0;
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = remain_num_output_start; q < num_output; q++)
            {
                const float* hr = weight_hr.row(q);
                // shadows the outer tmp_hidden_ptr with a read-only view of the same buffer
                const float* tmp_hidden_ptr = tmp_hidden_state;

                float H = 0;
                for (int i = 0; i < hidden_size; i++)
                {
                    H += tmp_hidden_ptr[i] * hr[i];
                }

                hidden_ptr[q] = H;
                output_data[q] = float32_to_bfloat16(H);
            }
        }
    }

    return 0;
}

int LSTM_arm::create_pipeline_bf16s(const Option& opt)
{
    // pack IFOG
    int num_directions = direction == 2 ? 2 : 1;
    int size = weight_data_size / num_directions / hidden_size / 4;

    weight_xc_data_packed.create(size, hidden_size, num_directions, 8u, 4);
    bias_c_data_packed.create(hidden_size, 1, num_directions, 8u, 4);
    weight_hc_data_packed.create(num_output, hidden_size, num_directions, 8u, 4);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int dr = 0; dr < num_directions; dr++)
    {
        const Mat weight_xc = weight_xc_data.channel(dr);
        const Mat bias_c = bias_c_data.channel(dr);
        const Mat weight_hc = weight_hc_data.channel(dr);

        Mat weight_xc_data_packed_dr = weight_xc_data_packed.channel(dr);
        Mat bias_c_data_packed_dr = bias_c_data_packed.channel(dr);
        Mat weight_hc_data_packed_dr = weight_hc_data_packed.channel(dr);

        const float* bias_c_I = bias_c.row(0);
        const float* bias_c_F = bias_c.row(1);
        const float* bias_c_O = bias_c.row(2);
        const float* bias_c_G = bias_c.row(3);

        unsigned short* bias_c_IFOG = bias_c_data_packed_dr.row<unsigned short>(0);

        for (int q = 0; q < hidden_size; q++)
        {
            bias_c_IFOG[0] = float32_to_bfloat16(bias_c_I[q]);
            bias_c_IFOG[1] = float32_to_bfloat16(bias_c_F[q]);
            bias_c_IFOG[2] = float32_to_bfloat16(bias_c_O[q]);
            bias_c_IFOG[3] = float32_to_bfloat16(bias_c_G[q]);

            bias_c_IFOG += 4;

            const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q);
            const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q);
            const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q);
            const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q);

            const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q);
            const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q);
            const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q);
            const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q);

            unsigned short* weight_xc_IFOG = weight_xc_data_packed_dr.row<unsigned short>(q);
            unsigned short* weight_hc_IFOG = weight_hc_data_packed_dr.row<unsigned short>(q);

            for (int i = 0; i < size; i++)
            {
                weight_xc_IFOG[0] = float32_to_bfloat16(weight_xc_I[i]);
                weight_xc_IFOG[1] = float32_to_bfloat16(weight_xc_F[i]);
                weight_xc_IFOG[2] = float32_to_bfloat16(weight_xc_O[i]);
                weight_xc_IFOG[3] = float32_to_bfloat16(weight_xc_G[i]);

                weight_xc_IFOG += 4;
            }

            for (int i = 0; i < num_output; i++)
            {
                weight_hc_IFOG[0] = float32_to_bfloat16(weight_hc_I[i]);
                weight_hc_IFOG[1] = float32_to_bfloat16(weight_hc_F[i]);
                weight_hc_IFOG[2] = float32_to_bfloat16(weight_hc_O[i]);
                weight_hc_IFOG[3] = float32_to_bfloat16(weight_hc_G[i]);

                weight_hc_IFOG += 4;
            }
        }
    }

    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
}

int LSTM_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int T = bottom_blob.h;

    int num_directions = direction == 2 ? 2 : 1;

    // initial hidden state
    Mat hidden(num_output, 4u, opt.workspace_allocator);
    if (hidden.empty())
        return -100;
    hidden.fill(0.f);

    Mat cell(hidden_size, 4u, opt.workspace_allocator);
    if (cell.empty())
        return -100;
    cell.fill(0.f);

    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = lstm_bf16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        {
            int ret = lstm_bf16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt);
            if (ret != 0)
                return ret;
        }

        hidden.fill(0.f);
        cell.fill(0.f);

        {
            int ret = lstm_bf16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden, cell, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const unsigned short* pf = top_blob_forward.row<const unsigned short>(i);
            const unsigned short* pr = top_blob_reverse.row<const unsigned short>(i);
            unsigned short* ptr = top_blob.row<unsigned short>(i);

            memcpy(ptr, pf, num_output * sizeof(unsigned short));
            memcpy(ptr + num_output, pr, num_output * sizeof(unsigned short));
        }
    }

    return 0;
}

// Multi-blob bf16 forward pass.
//   bottom_blobs[0]      bf16 input sequence
//   bottom_blobs[1], [2] initial hidden / cell state, used only when exactly three inputs are given;
//                        they are converted to fp32 for the recurrence, otherwise both start at zero
//   top_blobs[0]         bf16 output sequence
//   top_blobs[1], [2]    final hidden / cell state converted back to bf16, written only when three
//                        outputs are requested
// Returns 0 on success, -100 when an allocation or state conversion yields an empty blob,
// or the kernel's non-zero error code.
int LSTM_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    int T = bottom_blob.h;
    int num_directions = direction == 2 ? 2 : 1;

    Mat hidden;
    Mat cell;
    Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator;
    if (bottom_blobs.size() == 3)
    {
        // the cast helpers allocate from opt.blob_allocator, so redirect it to the chosen allocator
        Option opt_cast = opt;
        opt_cast.blob_allocator = hidden_cell_allocator;
        cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt_cast);
        cast_bfloat16_to_float32(bottom_blobs[2], cell, opt_cast);

        // an empty result (failed allocation or empty state input) cannot be fed to the kernel,
        // which reads and writes these buffers unconditionally
        if (hidden.empty() || cell.empty())
            return -100;
    }
    else
    {
        hidden.create(num_output, num_directions, 4u, hidden_cell_allocator);
        if (hidden.empty())
            return -100;
        hidden.fill(0.f);

        cell.create(hidden_size, num_directions, 4u, hidden_cell_allocator);
        if (cell.empty())
            return -100;
        cell.fill(0.f);
    }

    Mat& top_blob = top_blobs[0];
    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = lstm_bf16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        // row 0 of each state blob belongs to the forward pass
        Mat hidden0 = hidden.row_range(0, 1);
        Mat cell0 = cell.row_range(0, 1);
        {
            int ret = lstm_bf16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden0, cell0, opt);
            if (ret != 0)
                return ret;
        }

        // row 1 of each state blob belongs to the reverse pass
        Mat hidden1 = hidden.row_range(1, 1);
        Mat cell1 = cell.row_range(1, 1);
        {
            int ret = lstm_bf16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden1, cell1, opt);
            if (ret != 0)
                return ret;
        }

        // concat w: each output row is [ forward | reverse ]
        for (int i = 0; i < T; i++)
        {
            const unsigned short* pf = top_blob_forward.row<const unsigned short>(i);
            const unsigned short* pr = top_blob_reverse.row<const unsigned short>(i);
            unsigned short* ptr = top_blob.row<unsigned short>(i);

            memcpy(ptr, pf, num_output * sizeof(unsigned short));
            memcpy(ptr + num_output, pr, num_output * sizeof(unsigned short));
        }
    }

    if (top_blobs.size() == 3)
    {
        cast_float32_to_bfloat16(hidden, top_blobs[1], opt);
        cast_float32_to_bfloat16(cell, top_blobs[2], opt);

        // hidden and cell are non-empty here, so an empty output means the conversion failed
        if (top_blobs[1].empty() || top_blobs[2].empty())
            return -100;
    }

    return 0;
}
#endif // NCNN_BF16

#if NCNN_INT8
int LSTM_arm::create_pipeline_int8(const Option& opt)
{
    // pack IFOG
    const int num_directions = direction == 2 ? 2 : 1;
    const int size = weight_data_size / num_directions / hidden_size / 4;

    lstm_transform_weight_int8(weight_xc_data, weight_xc_data_int8_scales, weight_hc_data, weight_hc_data_int8_scales, bias_c_data, weight_data_tm, weight_data_tm_int8_descales, bias_c_data_packed, size, num_output, num_directions, hidden_size, opt);

    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
        weight_xc_data_int8_scales.release();
        weight_hc_data_int8_scales.release();
    }

    return 0;
}

// Quantize the input sequence to int8 with one scale per time step (row).
//   bottom_blob                input sequence, one row per time step
//   elemtype                   storage type of bottom_blob: 1 = fp32, 2 = fp16, 4 = bf16
//   bottom_blob_int8           receives the quantized sequence (produced by quantize_to_int8)
//   bottom_blob_int8_descales  receives absmax / 127 for every row
//   opt                        opt.blob_allocator is used for the scale / descale blobs
// For each row the scale is 127 / absmax, where absmax is the largest absolute value in the row.
// A row that is entirely zero has absmax == 0; its scale is set to 1 instead of dividing by zero,
// so the quantizer never multiplies 0 by infinity. Such a row still quantizes to all zeros and its
// descale stays 0.
void LSTM_arm::dynamic_quantize(const Mat& bottom_blob, int elemtype, Mat& bottom_blob_int8, Mat& bottom_blob_int8_descales, const Option& opt) const
{
    const int size = bottom_blob.w;
    const int T = bottom_blob.h;

    // dynamic quantize bottom_blob
    bottom_blob_int8_descales.create(T, (size_t)4u, 1, opt.blob_allocator);

    Mat bottom_blob_int8_scales(T, (size_t)4u, 1, opt.blob_allocator);

    for (int t = 0; t < T; t++)
    {
        float absmax = 0.f;

        if (elemtype == 1)
        {
            // fp32
            const float* x = bottom_blob.row(t);
            for (int i = 0; i < size; i++)
            {
                absmax = std::max(absmax, (float)fabs(x[i]));
            }
        }
        else if (elemtype == 2)
        {
            // fp16
            const unsigned short* x = bottom_blob.row<const unsigned short>(t);
            for (int i = 0; i < size; i++)
            {
                absmax = std::max(absmax, (float)fabs(float16_to_float32(x[i])));
            }
        }
        else if (elemtype == 4)
        {
            // bf16
            const unsigned short* x = bottom_blob.row<const unsigned short>(t);
            for (int i = 0; i < size; i++)
            {
                absmax = std::max(absmax, (float)fabs(bfloat16_to_float32(x[i])));
            }
        }

        // keep the scale finite for an all-zero row
        bottom_blob_int8_scales[t] = absmax == 0.f ? 1.f : 127.f / absmax;
        bottom_blob_int8_descales[t] = absmax / 127.f;
    }

    quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt);
}

// Int8 LSTM forward pass for a single input blob with zero initial state.
//
//   bottom_blob  input sequence, T = bottom_blob.h timesteps; may be stored as
//                fp32, fp16 or bf16 (detected from elembits and the options)
//   top_blob     created here as (num_output * num_directions) x T with the
//                same elemsize as bottom_blob
//   opt          blob_allocator backs top_blob, workspace_allocator backs all
//                temporaries
//
// Returns 0 on success and -100 when any allocation fails.
int LSTM_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // storage type of the input: 1 = fp32, 2 = fp16, 4 = bf16
    int elemtype = 1; // fp32
    {
        int elembits = bottom_blob.elembits();

        // clang-format off
        // *INDENT-OFF*

#if NCNN_ARM82
        if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        {
            elemtype = 2; // fp16
        }
        else
#endif
#if NCNN_BF16
        if (opt.use_bf16_storage && elembits == 16)
        {
            elemtype = 4; // bf16
        }
        else
#endif
        {
            // fp32
        }

        // *INDENT-ON*
        // clang-format on
    }

    int T = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;

    int num_directions = direction == 2 ? 2 : 1;

    // initial hidden state
    Mat hidden(num_output, 4u, opt.workspace_allocator);
    if (hidden.empty())
        return -100;
    hidden.fill(0.f);

    Mat cell(hidden_size, 4u, opt.workspace_allocator);
    if (cell.empty())
        return -100;
    cell.fill(0.f);

    top_blob.create(num_output * num_directions, T, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // dynamic quantize bottom_blob
    Mat bottom_blob_int8;
    Mat bottom_blob_int8_descales;
    {
        // quantized input is a temporary, so it goes to the workspace allocator
        Option opt_quant = opt;
        opt_quant.blob_allocator = opt.workspace_allocator;
        opt_quant.use_packing_layout = false;
        dynamic_quantize(bottom_blob, elemtype, bottom_blob_int8, bottom_blob_int8_descales, opt_quant);
    }

    // dynamic_quantize() has no return value; detect a failed allocation here
    // instead of handing empty buffers to the kernel
    if (bottom_blob_int8.empty() || bottom_blob_int8_descales.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        lstm_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob, elemtype, direction, weight_data_tm.channel(0), weight_data_tm_int8_descales.channel(0), bias_c_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt);
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, elemsize, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, elemsize, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        {
            lstm_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob_forward, elemtype, 0, weight_data_tm.channel(0), weight_data_tm_int8_descales.channel(0), bias_c_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt);
        }

        // the reverse pass starts again from a zero state
        hidden.fill(0.f);
        cell.fill(0.0f);

        {
            lstm_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob_reverse, elemtype, 1, weight_data_tm.channel(1), weight_data_tm_int8_descales.channel(1), bias_c_data_packed.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden, cell, opt);
        }

        // concat w
        // each output row is [forward | reverse], copied as raw bytes so the
        // same code serves every storage type
        for (int i = 0; i < T; i++)
        {
            const unsigned char* pf = top_blob_forward.row<const unsigned char>(i);
            const unsigned char* pr = top_blob_reverse.row<const unsigned char>(i);
            unsigned char* ptr = top_blob.row<unsigned char>(i);

            memcpy(ptr, pf, num_output * elemsize);
            memcpy(ptr + num_output * elemsize, pr, num_output * elemsize);
        }
    }

    return 0;
}

// Int8 LSTM forward pass with optional explicit initial / final states.
//
//   bottom_blobs  [0] is the input sequence (T = h timesteps). When exactly
//                 three blobs are given, [1] and [2] are the initial hidden
//                 and cell states in the same storage type as the input.
//   top_blobs     [0] receives the (num_output * num_directions) x T output.
//                 When exactly three blobs are present, [1] and [2] receive
//                 the final hidden and cell states, converted back to the
//                 input storage type.
//   opt           blob_allocator backs outputs, workspace_allocator backs
//                 temporaries
//
// Hidden and cell states are kept as fp32 while the kernel runs.
// Returns 0 on success and -100 when any allocation fails.
int LSTM_arm::forward_int8(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];

    // storage type of the input: 1 = fp32, 2 = fp16, 4 = bf16
    int elemtype = 1; // fp32
    {
        int elembits = bottom_blob.elembits();

        // clang-format off
        // *INDENT-OFF*

#if NCNN_ARM82
        if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        {
            elemtype = 2; // fp16
        }
        else
#endif
#if NCNN_BF16
        if (opt.use_bf16_storage && elembits == 16)
        {
            elemtype = 4; // bf16
        }
        else
#endif
        {
            // fp32
        }

        // *INDENT-ON*
        // clang-format on
    }

    int T = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;
    int num_directions = direction == 2 ? 2 : 1;

    Mat hidden;
    Mat cell;
    // states that are handed back to the caller must outlive the workspace
    Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator;
    if (bottom_blobs.size() == 3)
    {
        if (elemtype == 1)
        {
            // clone so the caller's input states are not modified in place
            hidden = bottom_blobs[1].clone(hidden_cell_allocator);
            cell = bottom_blobs[2].clone(hidden_cell_allocator);
        }
        if (elemtype == 2)
        {
            Option opt_cast = opt;
            opt_cast.blob_allocator = hidden_cell_allocator;
            cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
            cast_float16_to_float32(bottom_blobs[2], cell, opt_cast);
        }
        if (elemtype == 4)
        {
            Option opt_cast = opt;
            opt_cast.blob_allocator = hidden_cell_allocator;
            cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt_cast);
            cast_bfloat16_to_float32(bottom_blobs[2], cell, opt_cast);
        }

        // same failure contract as the zero-initialised branch below
        if (hidden.empty() || cell.empty())
            return -100;
    }
    else
    {
        hidden.create(num_output, num_directions, 4u, hidden_cell_allocator);
        if (hidden.empty())
            return -100;
        hidden.fill(0.f);

        cell.create(hidden_size, num_directions, 4u, hidden_cell_allocator);
        if (cell.empty())
            return -100;
        cell.fill(0.f);
    }

    Mat& top_blob = top_blobs[0];
    top_blob.create(num_output * num_directions, T, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // dynamic quantize bottom_blob
    Mat bottom_blob_int8;
    Mat bottom_blob_int8_descales;
    {
        // quantized input is a temporary, so it goes to the workspace allocator
        Option opt_quant = opt;
        opt_quant.blob_allocator = opt.workspace_allocator;
        opt_quant.use_packing_layout = false;
        dynamic_quantize(bottom_blob, elemtype, bottom_blob_int8, bottom_blob_int8_descales, opt_quant);
    }

    // dynamic_quantize() has no return value; detect a failed allocation here
    // instead of handing empty buffers to the kernel
    if (bottom_blob_int8.empty() || bottom_blob_int8_descales.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        lstm_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob, elemtype, direction, weight_data_tm.channel(0), weight_data_tm_int8_descales.channel(0), bias_c_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt);
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, elemsize, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, elemsize, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        // row 0 of the state blobs belongs to the forward direction
        Mat hidden0 = hidden.row_range(0, 1);
        Mat cell0 = cell.row_range(0, 1);
        {
            lstm_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob_forward, elemtype, 0, weight_data_tm.channel(0), weight_data_tm_int8_descales.channel(0), bias_c_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden0, cell0, opt);
        }

        // row 1 of the state blobs belongs to the reverse direction
        Mat hidden1 = hidden.row_range(1, 1);
        Mat cell1 = cell.row_range(1, 1);
        {
            lstm_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob_reverse, elemtype, 1, weight_data_tm.channel(1), weight_data_tm_int8_descales.channel(1), bias_c_data_packed.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden1, cell1, opt);
        }

        // concat w
        // each output row is [forward | reverse], copied as raw bytes so the
        // same code serves every storage type
        for (int i = 0; i < T; i++)
        {
            const unsigned char* pf = top_blob_forward.row<const unsigned char>(i);
            const unsigned char* pr = top_blob_reverse.row<const unsigned char>(i);
            unsigned char* ptr = top_blob.row<unsigned char>(i);

            memcpy(ptr, pf, num_output * elemsize);
            memcpy(ptr + num_output * elemsize, pr, num_output * elemsize);
        }
    }

    if (top_blobs.size() == 3)
    {
        if (elemtype == 1)
        {
            top_blobs[1] = hidden;
            top_blobs[2] = cell;
        }
        if (elemtype == 2)
        {
            cast_float32_to_float16(hidden, top_blobs[1], opt);
            cast_float32_to_float16(cell, top_blobs[2], opt);
            if (top_blobs[1].empty() || top_blobs[2].empty())
                return -100;
        }
        if (elemtype == 4)
        {
            cast_float32_to_bfloat16(hidden, top_blobs[1], opt);
            cast_float32_to_bfloat16(cell, top_blobs[2], opt);
            if (top_blobs[1].empty() || top_blobs[2].empty())
                return -100;
        }
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn


================================================
FILE: src/layer/arm/lstm_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_LSTM_ARM_H
#define LAYER_LSTM_ARM_H

#include "lstm.h"

namespace ncnn {

// ARM-specific LSTM layer. Extends LSTM with packed weight storage and
// per-storage-type pipelines that are compiled in only when the matching
// NCNN_ARM82 / NCNN_BF16 / NCNN_INT8 switch is enabled.
class LSTM_arm : public LSTM
{
public:
    LSTM_arm();

    virtual int create_pipeline(const Option& opt);
    // single input / single output forward
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    // multi-blob forward
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 storage variants
    int create_pipeline_fp16s(const Option& opt);
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 storage variants
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
#if NCNN_INT8
    // int8 variants
    int create_pipeline_int8(const Option& opt);
    // quantizes bottom_blob to int8 and outputs the descales alongside it;
    // elemtype selects the input storage type (1 = fp32, 2 = fp16, 4 = bf16)
    void dynamic_quantize(const Mat& bottom_blob, int elemtype, Mat& bottom_blob_int8, Mat& bottom_blob_int8_descales, const Option& opt) const;
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_int8(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif

public:
    // gate-interleaved (I F O G) copies of the weights / bias, one channel per direction
    Mat weight_xc_data_packed;
    Mat bias_c_data_packed;
    Mat weight_hc_data_packed;

    // transformed weights consumed by the int8 forward path, one channel per direction
    Mat weight_data_tm;

#if NCNN_INT8
    // descales paired with weight_data_tm, one channel per direction
    Mat weight_data_tm_int8_descales;
#endif
};

} // namespace ncnn

#endif // LAYER_LSTM_ARM_H


================================================
FILE: src/layer/arm/lstm_arm_asimddp.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cpu.h"
#include "mat.h"
#include "layer.h"
#include "arm_activation.h"
#include "arm_usability.h"

namespace ncnn {

#include "lstm_int8.h"

// Externally visible entry point that forwards every argument unchanged to
// lstm_transform_weight_int8() as compiled in this translation unit.
// NOTE(review): presumably lstm_transform_weight_int8() comes from the
// lstm_int8.h included inside this namespace, and this file is built with the
// ARM dot-product feature enabled so that the included code is specialised
// for it - confirm against the header and the build scripts.
void lstm_transform_weight_int8_asimddp(const Mat& weight_xc, const Mat& weight_xc_int8_scales, const Mat& weight_hc, const Mat& weight_hc_int8_scales, const Mat& bias_c, Mat& weight_data_tm, Mat& weight_data_tm_int8_descales, Mat& bias_c_tm, int size, int num_output, int num_directions, int hidden_size, const Option& opt)
{
    lstm_transform_weight_int8(weight_xc, weight_xc_int8_scales, weight_hc, weight_hc_int8_scales, bias_c, weight_data_tm, weight_data_tm_int8_descales, bias_c_tm, size, num_output, num_directions, hidden_size, opt);
}

// Externally visible entry point that forwards every argument unchanged to
// lstm_int8() as compiled in this translation unit.
// NOTE(review): presumably lstm_int8() comes from the lstm_int8.h included
// inside this namespace, and this file is built with the ARM dot-product
// feature enabled - confirm against the header and the build scripts.
void lstm_int8_asimddp(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_descales, Mat& top_blob, int elemtype, int reverse, const Mat& weight_data_tm, const Mat& weight_data_tm_int8_descales, const Mat& bias_c, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt)
{
    lstm_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob, elemtype, reverse, weight_data_tm, weight_data_tm_int8_descales, bias_c, weight_hr, hidden_state, cell_state, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/arm/lstm_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "lstm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// LSTM time-step loop that accumulates the gate pre-activations with fp16
// vector arithmetic.
//
// Reads size = bottom_blob.w inputs for each of T = bottom_blob.h steps as
// __fp16, writes num_output = top_blob.w values per step to top_blob as
// __fp16, and updates hidden_state / cell_state (accessed as float) in place.
//
//   reverse    non-zero walks the steps from T - 1 down to 0
//   weight_xc  packed input weights, read as __fp16 rows
//   bias_c     packed bias, read as __fp16, 4 values (I F O G) per hidden unit
//   weight_hc  packed recurrent weights, read as __fp16 rows
//   weight_hr  projection weights, read as float rows; only touched when
//              num_output != hidden_size
//
// Returns 0 on success, -100 if a workspace allocation fails.
static int lstm_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt)
{
    int size = bottom_blob.w;
    int T = bottom_blob.h;

    int num_output = top_blob.w;
    int hidden_size = cell_state.w;

    // 4 x hidden_size
    // (2-byte elements: the gate pre-activations are kept in fp16 here)
    Mat gates(4, hidden_size, 2u, opt.workspace_allocator);
    if (gates.empty())
        return -100;

    // scratch for the un-projected hidden values, needed only with projection
    Mat tmp_hidden_state;
    if (num_output != hidden_size)
    {
        tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator);
        if (tmp_hidden_state.empty())
            return -100;
    }

    // unroll
    for (int t = 0; t < T; t++)
    {
        // clip hidden by continuation indicator
        // h_cont_{t-1} = cont_t * h_{t-1}
        // h_cont_{t-1} = h_{t-1} if cont_t == 1
        //                0       otherwise
        // calculate hidden
        // gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c

        int ti = reverse ? T - 1 - t : t;

        // main loop handles two hidden units per iteration: the 8 fp16 lanes
        // carry I F O G of unit q followed by I F O G of unit q + 1
        int nn_hidden_size = hidden_size >> 1;
        int remain_hidden_size_start = nn_hidden_size << 1;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_hidden_size; qq++)
        {
            int q = qq * 2;

            const __fp16* bias_c_IFOG = (const __fp16*)bias_c + q * 4;

            // gate I F O G
            const __fp16* weight_xc_IFOG = weight_xc.row<const __fp16>(q / 2);

            const __fp16* weight_hc_IFOG = weight_hc.row<const __fp16>(q / 2);

            // accumulation starts from the bias; three extra accumulators keep
            // the four multiply-adds of an unrolled step independent
            float16x8_t _IFOG = vld1q_f16(bias_c_IFOG);
            float16x8_t _sum1 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _sum2 = vdupq_n_f16((__fp16)0.f);
            float16x8_t _sum3 = vdupq_n_f16((__fp16)0.f);

            const __fp16* x = bottom_blob.row<const __fp16>(ti);

            // W_xc * x_t, four inputs per iteration
            int i = 0;
            for (; i + 3 < size; i += 4)
            {
#if NCNN_GNU_INLINE_ASM
                // loads 4 inputs and 4 x 8 weights, post-incrementing both pointers
                asm volatile(
                    "ld1    {v4.4h}, [%0], #8       \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n"
                    "fmla   %2.8h, v0.8h, v4.h[0]   \n"
                    "fmla   %3.8h, v1.8h, v4.h[1]   \n"
                    "fmla   %4.8h, v2.8h, v4.h[2]   \n"
                    "fmla   %5.8h, v3.8h, v4.h[3]   \n"
                    : "=r"(x),
                    "=r"(weight_xc_IFOG),
                    "=w"(_IFOG),
                    "=w"(_sum1),
                    "=w"(_sum2),
                    "=w"(_sum3)
                    : "0"(x),
                    "1"(weight_xc_IFOG),
                    "2"(_IFOG),
                    "3"(_sum1),
                    "4"(_sum2),
                    "5"(_sum3)
                    : "memory", "v0", "v1", "v2", "v3", "v4");
#else  // NCNN_GNU_INLINE_ASM
                float16x4_t _x = vld1_f16(x);
                float16x8_t _w0 = vld1q_f16(weight_xc_IFOG);
                float16x8_t _w1 = vld1q_f16(weight_xc_IFOG + 8);
                float16x8_t _w2 = vld1q_f16(weight_xc_IFOG + 16);
                float16x8_t _w3 = vld1q_f16(weight_xc_IFOG + 24);
                _IFOG = vfmaq_lane_f16(_IFOG, _w0, _x, 0);
                _sum1 = vfmaq_lane_f16(_sum1, _w1, _x, 1);
                _sum2 = vfmaq_lane_f16(_sum2, _w2, _x, 2);
                _sum3 = vfmaq_lane_f16(_sum3, _w3, _x, 3);

                x += 4;
                weight_xc_IFOG += 32;
#endif // NCNN_GNU_INLINE_ASM
            }
            // leftover inputs, one at a time
            for (; i < size; i++)
            {
                __fp16 xi = *x++;

                float16x8_t _xi = vdupq_n_f16(xi);
                float16x8_t _weight_xc_IFOG = vld1q_f16(weight_xc_IFOG);
                _IFOG = vfmaq_f16(_IFOG, _weight_xc_IFOG, _xi);

                weight_xc_IFOG += 8;
            }

            // W_hc * h_{t-1}; the hidden state is stored as float and is
            // narrowed to fp16 before the multiply
            const float* hidden_ptr = hidden_state;

            i = 0;
            for (; i + 3 < num_output; i += 4)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "ld1    {v4.4s}, [%0], #16      \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%1], #64 \n"
                    "fcvtn  v4.4h, v4.4s            \n"
                    "fmla   %2.8h, v0.8h, v4.h[0]   \n"
                    "fmla   %3.8h, v1.8h, v4.h[1]   \n"
                    "fmla   %4.8h, v2.8h, v4.h[2]   \n"
                    "fmla   %5.8h, v3.8h, v4.h[3]   \n"
                    : "=r"(hidden_ptr),
                    "=r"(weight_hc_IFOG),
                    "=w"(_IFOG),
                    "=w"(_sum1),
                    "=w"(_sum2),
                    "=w"(_sum3)
                    : "0"(hidden_ptr),
                    "1"(weight_hc_IFOG),
                    "2"(_IFOG),
                    "3"(_sum1),
                    "4"(_sum2),
                    "5"(_sum3)
                    : "memory", "v0", "v1", "v2", "v3", "v4");
#else  // NCNN_GNU_INLINE_ASM
                float16x4_t _h_cont = vcvt_f16_f32(vld1q_f32(hidden_ptr));
                float16x8_t _w0 = vld1q_f16(weight_hc_IFOG);
                float16x8_t _w1 = vld1q_f16(weight_hc_IFOG + 8);
                float16x8_t _w2 = vld1q_f16(weight_hc_IFOG + 16);
                float16x8_t _w3 = vld1q_f16(weight_hc_IFOG + 24);
                _IFOG = vfmaq_lane_f16(_IFOG, _w0, _h_cont, 0);
                _sum1 = vfmaq_lane_f16(_sum1, _w1, _h_cont, 1);
                _sum2 = vfmaq_lane_f16(_sum2, _w2, _h_cont, 2);
                _sum3 = vfmaq_lane_f16(_sum3, _w3, _h_cont, 3);

                hidden_ptr += 4;
                weight_hc_IFOG += 32;
#endif // NCNN_GNU_INLINE_ASM
            }
            for (; i < num_output; i++)
            {
                float h_cont = *hidden_ptr++;

                float16x8_t _h_cont = vdupq_n_f16((__fp16)h_cont);
                float16x8_t _weight_hc_IFOG = vld1q_f16(weight_hc_IFOG);
                _IFOG = vfmaq_f16(_IFOG, _weight_hc_IFOG, _h_cont);

                weight_hc_IFOG += 8;
            }

            __fp16* gates_data = gates.row<__fp16>(q);

            // fold the four partial accumulators together
            _IFOG = vaddq_f16(_IFOG, _sum1);
            _sum2 = vaddq_f16(_sum2, _sum3);
            _IFOG = vaddq_f16(_IFOG, _sum2);

            // 8 values: fills gate rows q and q + 1
            vst1q_f16(gates_data, _IFOG);
        }
        // odd hidden_size leaves one unit, handled with 4-lane vectors; its
        // weights are read from row q / 2 + q % 2
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_hidden_size_start; q < hidden_size; q++)
        {
            const __fp16* bias_c_IFOG = (const __fp16*)bias_c + q * 4;

            // gate I F O G
            const __fp16* weight_xc_IFOG = weight_xc.row<const __fp16>(q / 2 + q % 2);

            const __fp16* weight_hc_IFOG = weight_hc.row<const __fp16>(q / 2 + q % 2);

            float16x4_t _IFOG = vld1_f16(bias_c_IFOG);
            float16x4_t _sum1 = vdup_n_f16((__fp16)0.f);
            float16x4_t _sum2 = vdup_n_f16((__fp16)0.f);
            float16x4_t _sum3 = vdup_n_f16((__fp16)0.f);

            const __fp16* x = bottom_blob.row<const __fp16>(ti);

            int i = 0;
            for (; i + 3 < size; i += 4)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "ld1    {v4.4h}, [%0], #8       \n"
                    "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n"
                    "fmla   %2.4h, v0.4h, v4.h[0]   \n"
                    "fmla   %3.4h, v1.4h, v4.h[1]   \n"
                    "fmla   %4.4h, v2.4h, v4.h[2]   \n"
                    "fmla   %5.4h, v3.4h, v4.h[3]   \n"
                    : "=r"(x),
                    "=r"(weight_xc_IFOG),
                    "=w"(_IFOG),
                    "=w"(_sum1),
                    "=w"(_sum2),
                    "=w"(_sum3)
                    : "0"(x),
                    "1"(weight_xc_IFOG),
                    "2"(_IFOG),
                    "3"(_sum1),
                    "4"(_sum2),
                    "5"(_sum3)
                    : "memory", "v0", "v1", "v2", "v3", "v4");
#else  // NCNN_GNU_INLINE_ASM
                float16x4_t _x = vld1_f16(x);
                float16x4_t _w0 = vld1_f16(weight_xc_IFOG);
                float16x4_t _w1 = vld1_f16(weight_xc_IFOG + 4);
                float16x4_t _w2 = vld1_f16(weight_xc_IFOG + 8);
                float16x4_t _w3 = vld1_f16(weight_xc_IFOG + 12);
                _IFOG = vfma_lane_f16(_IFOG, _w0, _x, 0);
                _sum1 = vfma_lane_f16(_sum1, _w1, _x, 1);
                _sum2 = vfma_lane_f16(_sum2, _w2, _x, 2);
                _sum3 = vfma_lane_f16(_sum3, _w3, _x, 3);

                x += 4;
                weight_xc_IFOG += 16;
#endif // NCNN_GNU_INLINE_ASM
            }
            for (; i < size; i++)
            {
                __fp16 xi = *x++;

                float16x4_t _xi = vdup_n_f16(xi);
                float16x4_t _weight_xc_IFOG = vld1_f16(weight_xc_IFOG);
                _IFOG = vfma_f16(_IFOG, _weight_xc_IFOG, _xi);

                weight_xc_IFOG += 4;
            }

            const float* hidden_ptr = hidden_state;

            i = 0;
            for (; i + 3 < num_output; i += 4)
            {
#if NCNN_GNU_INLINE_ASM
                asm volatile(
                    "ld1    {v4.4s}, [%0], #16      \n"
                    "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n"
                    "fcvtn  v4.4h, v4.4s            \n"
                    "fmla   %2.4h, v0.4h, v4.h[0]   \n"
                    "fmla   %3.4h, v1.4h, v4.h[1]   \n"
                    "fmla   %4.4h, v2.4h, v4.h[2]   \n"
                    "fmla   %5.4h, v3.4h, v4.h[3]   \n"
                    : "=r"(hidden_ptr),
                    "=r"(weight_hc_IFOG),
                    "=w"(_IFOG),
                    "=w"(_sum1),
                    "=w"(_sum2),
                    "=w"(_sum3)
                    : "0"(hidden_ptr),
                    "1"(weight_hc_IFOG),
                    "2"(_IFOG),
                    "3"(_sum1),
                    "4"(_sum2),
                    "5"(_sum3)
                    : "memory", "v0", "v1", "v2", "v3", "v4");
#else  // NCNN_GNU_INLINE_ASM
                float16x4_t _h_cont = vcvt_f16_f32(vld1q_f32(hidden_ptr));
                float16x4_t _w0 = vld1_f16(weight_hc_IFOG);
                float16x4_t _w1 = vld1_f16(weight_hc_IFOG + 4);
                float16x4_t _w2 = vld1_f16(weight_hc_IFOG + 8);
                float16x4_t _w3 = vld1_f16(weight_hc_IFOG + 12);
                _IFOG = vfma_lane_f16(_IFOG, _w0, _h_cont, 0);
                _sum1 = vfma_lane_f16(_sum1, _w1, _h_cont, 1);
                _sum2 = vfma_lane_f16(_sum2, _w2, _h_cont, 2);
                _sum3 = vfma_lane_f16(_sum3, _w3, _h_cont, 3);

                hidden_ptr += 4;
                weight_hc_IFOG += 16;
#endif // NCNN_GNU_INLINE_ASM
            }
            for (; i < num_output; i++)
            {
                float h_cont = *hidden_ptr++;

                float16x4_t _h_cont = vdup_n_f16((__fp16)h_cont);
                float16x4_t _weight_hc_IFOG = vld1_f16(weight_hc_IFOG);
                _IFOG = vfma_f16(_IFOG, _weight_hc_IFOG, _h_cont);

                weight_hc_IFOG += 4;
            }

            __fp16* gates_data = gates.row<__fp16>(q);

            _IFOG = vadd_f16(_IFOG, _sum1);
            _sum2 = vadd_f16(_sum2, _sum3);
            _IFOG = vadd_f16(_IFOG, _sum2);

            vst1_f16(gates_data, _IFOG);
        }

        // lstm unit
        // sigmoid(I)
        // sigmoid(F)
        // sigmoid(O)
        // tanh(G)
        // c_t := f_t .* c_{t-1} + i_t .* g_t
        // h_t := o_t .* tanh[c_t]
        __fp16* output_data = top_blob.row<__fp16>(ti);

        float* cell_ptr = cell_state;
        float* hidden_ptr = hidden_state;
        float* tmp_hidden_ptr = tmp_hidden_state;

        // activations and the cell update run in fp32, four units at a time
        nn_hidden_size = hidden_size >> 2;
        remain_hidden_size_start = nn_hidden_size << 2;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_hidden_size; qq++)
        {
            int q = qq * 4;

            const __fp16* gates_data = gates.row<const __fp16>(q);

            // de-interleaving load of the 16 values starting at gate row q;
            // val[0..3] are used as I, F, O, G of units q .. q + 3
            float16x4x4_t _IFOG_4x4 = vld4_f16(gates_data);

            float32x4_t _lstm_I = sigmoid_ps(vcvt_f32_f16(_IFOG_4x4.val[0]));
            float32x4_t _lstm_F = sigmoid_ps(vcvt_f32_f16(_IFOG_4x4.val[1]));
            float32x4_t _lstm_O = sigmoid_ps(vcvt_f32_f16(_IFOG_4x4.val[2]));
            float32x4_t _lstm_G = tanh_ps(vcvt_f32_f16(_IFOG_4x4.val[3]));

            float32x4_t _cell2 = vaddq_f32(vmulq_f32(_lstm_F, vld1q_f32(cell_ptr + q)), vmulq_f32(_lstm_I, _lstm_G));
            float32x4_t _lstm_H = vmulq_f32(_lstm_O, tanh_ps(_cell2));

            vst1q_f32(cell_ptr + q, _cell2);

            if (num_output == hidden_size)
            {
                // no projection: H is both the next hidden state and the output
                vst1q_f32(hidden_ptr + q, _lstm_H);
                vst1_f16(output_data + q, vcvt_f16_f32(_lstm_H));
            }
            else
            {
                // projection follows below, park H in the scratch buffer
                vst1q_f32(tmp_hidden_ptr + q, _lstm_H);
            }
        }
        // scalar tail for the units not covered by the 4-wide loop
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_hidden_size_start; q < hidden_size; q++)
        {
            const __fp16* gates_data = gates.row<const __fp16>(q);

            float I = (float)gates_data[0];
            float F = (float)gates_data[1];
            float O = (float)gates_data[2];
            float G = (float)gates_data[3];

            I = 1.f / (1.f + expf(-I));
            F = 1.f / (1.f + expf(-F));
            O = 1.f / (1.f + expf(-O));
            G = tanhf(G);

            float cell2 = F * cell_ptr[q] + I * G;
            float H = O * tanhf(cell2);

            cell_ptr[q] = cell2;
            if (num_output == hidden_size)
            {
                hidden_ptr[q] = H;
                output_data[q] = (__fp16)H;
            }
            else
            {
                tmp_hidden_ptr[q] = H;
            }
        }

        // projection: each output is the dot product of one weight_hr row with
        // the un-projected hidden values
        if (num_output != hidden_size)
        {
            // int nn_num_output = num_output >> 2;
            // int remain_num_output_start = nn_num_output << 2;
            // #pragma omp parallel for num_threads(opt.num_threads)
            // for (int qq = 0; qq < nn_num_output; qq++)
            // {
            //     int q = qq * 4;
            //
            // }
            int remain_num_output_start = 0;
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = remain_num_output_start; q < num_output; q++)
            {
                const float* hr = weight_hr.row(q);
                const float* tmp_hidden_ptr = tmp_hidden_state;

                float H = 0;
                for (int i = 0; i < hidden_size; i++)
                {
                    H += tmp_hidden_ptr[i] * hr[i];
                }

                hidden_ptr[q] = H;
                output_data[q] = (__fp16)H;
            }
        }
    }

    return 0;
}

// LSTM time-step loop for fp16 storage.
//
// When opt.use_fp16_arithmetic is set the call is handed to lstm_fp16sa().
// Otherwise inputs, weights and bias are read as __fp16, widened to fp32 for
// every multiply-add, and the result is written back to top_blob as __fp16.
// hidden_state / cell_state are accessed as float and updated in place.
//
//   reverse    non-zero walks the steps from T - 1 down to 0
//   weight_xc  packed input weights, one __fp16 row per hidden unit
//   bias_c     packed bias, read as __fp16, 4 values (I F O G) per hidden unit
//   weight_hc  packed recurrent weights, one __fp16 row per hidden unit
//   weight_hr  projection weights, read as float rows; only touched when
//              num_output != hidden_size
//
// Returns 0 on success, -100 if a workspace allocation fails.
static int lstm_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt)
{
    if (opt.use_fp16_arithmetic)
        return lstm_fp16sa(bottom_blob, top_blob, reverse, weight_xc, bias_c, weight_hc, weight_hr, hidden_state, cell_state, opt);

    int size = bottom_blob.w;
    int T = bottom_blob.h;

    int num_output = top_blob.w;
    int hidden_size = cell_state.w;

    // 4 x hidden_size
    // (4-byte elements: the gate pre-activations are kept in fp32 here)
    Mat gates(4, hidden_size, 4u, opt.workspace_allocator);
    if (gates.empty())
        return -100;

    // scratch for the un-projected hidden values, needed only with projection
    Mat tmp_hidden_state;
    if (num_output != hidden_size)
    {
        tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator);
        if (tmp_hidden_state.empty())
            return -100;
    }

    // unroll
    for (int t = 0; t < T; t++)
    {
        // clip hidden by continuation indicator
        // h_cont_{t-1} = cont_t * h_{t-1}
        // h_cont_{t-1} = h_{t-1} if cont_t == 1
        //                0       otherwise
        // calculate hidden
        // gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c

        int ti = reverse ? T - 1 - t : t;

        const __fp16* x = bottom_blob.row<const __fp16>(ti);
        // one hidden unit per iteration; the 4 fp32 lanes carry its I F O G
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < hidden_size; q++)
        {
            const __fp16* bias_c_IFOG = (const __fp16*)bias_c + q * 4;

            // gate I F O G
            const __fp16* weight_xc_IFOG = weight_xc.row<const __fp16>(q);

            const __fp16* weight_hc_IFOG = weight_hc.row<const __fp16>(q);

            // accumulation starts from the bias; three extra accumulators keep
            // the four multiply-adds of an unrolled step independent
            float32x4_t _IFOG = vcvt_f32_f16(vld1_f16(bias_c_IFOG));
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);

            // W_xc * x_t, four inputs per iteration
            int i = 0;
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _xi = vcvt_f32_f16(vld1_f16(x + i));

                float32x4_t _weight_xc_IFOG_0 = vcvt_f32_f16(vld1_f16(weight_xc_IFOG));
                float32x4_t _weight_xc_IFOG_1 = vcvt_f32_f16(vld1_f16(weight_xc_IFOG + 4));
                float32x4_t _weight_xc_IFOG_2 = vcvt_f32_f16(vld1_f16(weight_xc_IFOG + 8));
                float32x4_t _weight_xc_IFOG_3 = vcvt_f32_f16(vld1_f16(weight_xc_IFOG + 12));

                _IFOG = vfmaq_laneq_f32(_IFOG, _weight_xc_IFOG_0, _xi, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_IFOG_1, _xi, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_IFOG_2, _xi, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_IFOG_3, _xi, 3);

                weight_xc_IFOG += 16;
            }
            // leftover inputs, one at a time
            for (; i < size; i++)
            {
                __fp16 xi = x[i];

                float32x4_t _xi = vcvt_f32_f16(vdup_n_f16(xi));
                float32x4_t _weight_xc_IFOG = vcvt_f32_f16(vld1_f16(weight_xc_IFOG));
                _IFOG = vfmaq_f32(_IFOG, _weight_xc_IFOG, _xi);

                weight_xc_IFOG += 4;
            }

            // W_hc * h_{t-1}; the hidden state is already float
            i = 0;
            for (; i + 3 < num_output; i += 4)
            {
                float32x4_t _h_cont = vld1q_f32((const float*)hidden_state + i);

                float32x4_t _weight_hc_IFOG_0 = vcvt_f32_f16(vld1_f16(weight_hc_IFOG));
                float32x4_t _weight_hc_IFOG_1 = vcvt_f32_f16(vld1_f16(weight_hc_IFOG + 4));
                float32x4_t _weight_hc_IFOG_2 = vcvt_f32_f16(vld1_f16(weight_hc_IFOG + 8));
                float32x4_t _weight_hc_IFOG_3 = vcvt_f32_f16(vld1_f16(weight_hc_IFOG + 12));

                _IFOG = vfmaq_laneq_f32(_IFOG, _weight_hc_IFOG_0, _h_cont, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_IFOG_1, _h_cont, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_IFOG_2, _h_cont, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_IFOG_3, _h_cont, 3);

                weight_hc_IFOG += 16;
            }
            for (; i < num_output; i++)
            {
                float h_cont = hidden_state[i];

                float32x4_t _h_cont = vdupq_n_f32(h_cont);
                float32x4_t _weight_hc_IFOG = vcvt_f32_f16(vld1_f16(weight_hc_IFOG));
                _IFOG = vfmaq_f32(_IFOG, _weight_hc_IFOG, _h_cont);

                weight_hc_IFOG += 4;
            }

            float* gates_data = gates.row(q);

            // fold the four partial accumulators together
            _IFOG = vaddq_f32(_IFOG, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _IFOG = vaddq_f32(_IFOG, _sum2);

            vst1q_f32(gates_data, _IFOG);
        }

        // lstm unit
        // sigmoid(I)
        // sigmoid(F)
        // sigmoid(O)
        // tanh(G)
        // c_t := f_t .* c_{t-1} + i_t .* g_t
        // h_t := o_t .* tanh[c_t]
        __fp16* output_data = top_blob.row<__fp16>(ti);

        float* cell_ptr = cell_state;
        float* hidden_ptr = hidden_state;
        float* tmp_hidden_ptr = tmp_hidden_state;

        // activations and the cell update, four units at a time
        int nn_hidden_size = hidden_size >> 2;
        int remain_hidden_size_start = nn_hidden_size << 2;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_hidden_size; qq++)
        {
            int q = qq * 4;

            const float* gates_data = gates.row(q);

            // de-interleaving load of the 16 values starting at gate row q;
            // val[0..3] are used as I, F, O, G of units q .. q + 3
            float32x4x4_t _IFOG_4x4 = vld4q_f32(gates_data);

            float32x4_t _lstm_I = sigmoid_ps(_IFOG_4x4.val[0]);
            float32x4_t _lstm_F = sigmoid_ps(_IFOG_4x4.val[1]);
            float32x4_t _lstm_O = sigmoid_ps(_IFOG_4x4.val[2]);
            float32x4_t _lstm_G = tanh_ps(_IFOG_4x4.val[3]);

            float32x4_t _cell2 = vaddq_f32(vmulq_f32(_lstm_F, vld1q_f32(cell_ptr + q)), vmulq_f32(_lstm_I, _lstm_G));
            float32x4_t _lstm_H = vmulq_f32(_lstm_O, tanh_ps(_cell2));

            vst1q_f32(cell_ptr + q, _cell2);

            if (num_output == hidden_size)
            {
                // no projection: H is both the next hidden state and the output
                vst1q_f32(hidden_ptr + q, _lstm_H);
                vst1_f16(output_data + q, vcvt_f16_f32(_lstm_H));
            }
            else
            {
                // projection follows below, park H in the scratch buffer
                vst1q_f32(tmp_hidden_ptr + q, _lstm_H);
            }
        }
        // scalar tail for the units not covered by the 4-wide loop
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_hidden_size_start; q < hidden_size; q++)
        {
            const float* gates_data = gates.row(q);

            float I = gates_data[0];
            float F = gates_data[1];
            float O = gates_data[2];
            float G = gates_data[3];

            I = 1.f / (1.f + expf(-I));
            F = 1.f / (1.f + expf(-F));
            O = 1.f / (1.f + expf(-O));
            G = tanhf(G);

            float cell2 = F * cell_ptr[q] + I * G;
            float H = O * tanhf(cell2);

            cell_ptr[q] = cell2;
            if (num_output == hidden_size)
            {
                hidden_ptr[q] = H;
                output_data[q] = (__fp16)H;
            }
            else
            {
                tmp_hidden_ptr[q] = H;
            }
        }

        // projection: each output is the dot product of one weight_hr row with
        // the un-projected hidden values
        if (num_output != hidden_size)
        {
            // int nn_num_output = num_output >> 2;
            // int remain_num_output_start = nn_num_output << 2;
            // #pragma omp parallel for num_threads(opt.num_threads)
            // for (int qq = 0; qq < nn_num_output; qq++)
            // {
            //     int q = qq * 4;
            //
            // }
            int remain_num_output_start = 0;
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = remain_num_output_start; q < num_output; q++)
            {
                const float* hr = weight_hr.row(q);
                const float* tmp_hidden_ptr = tmp_hidden_state;

                float H = 0;
                for (int i = 0; i < hidden_size; i++)
                {
                    H += tmp_hidden_ptr[i] * hr[i];
                }

                hidden_ptr[q] = H;
                output_data[q] = (__fp16)H;
            }
        }
    }

    return 0;
}

int LSTM_arm::create_pipeline_fp16s(const Option& opt)
{
    // pack IFOG
    const int num_directions = direction == 2 ? 2 : 1;
    const int size = weight_data_size / num_directions / hidden_size / 4;

    if (opt.use_fp16_arithmetic)
    {
        weight_xc_data_packed.create(size, hidden_size / 2 + hidden_size % 2, num_directions, 16u, 8);
        bias_c_data_packed.create(hidden_size, 1, num_directions, 8u, 4);
        weight_hc_data_packed.create(num_output, hidden_size / 2 + hidden_size % 2, num_directions, 16u, 8);
    }
    else
    {
        weight_xc_data_packed.create(size, hidden_size, num_directions, 8u, 4);
        bias_c_data_packed.create(hidden_size, 1, num_directions, 8u, 4);
        weight_hc_data_packed.create(num_output, hidden_size, num_directions, 8u, 4);
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int dr = 0; dr < num_directions; dr++)
    {
        const Mat weight_xc = weight_xc_data.channel(dr);
        const Mat bias_c = bias_c_data.channel(dr);
        const Mat weight_hc = weight_hc_data.channel(dr);

        Mat weight_xc_data_packed_dr = weight_xc_data_packed.channel(dr);
        Mat bias_c_data_packed_dr = bias_c_data_packed.channel(dr);
        Mat weight_hc_data_packed_dr = weight_hc_data_packed.channel(dr);

        const float* bias_c_I = bias_c.row(0);
        const float* bias_c_F = bias_c.row(1);
        const float* bias_c_O = bias_c.row(2);
        const float* bias_c_G = bias_c.row(3);

        __fp16* bias_c_IFOG = bias_c_data_packed_dr.row<__fp16>(0);

        int q = 0;
        if (opt.use_fp16_arithmetic)
        {
            for (; q + 1 < hidden_size; q += 2)
            {
                bias_c_IFOG[0] = (__fp16)bias_c_I[q];
                bias_c_IFOG[1] = (__fp16)bias_c_F[q];
                bias_c_IFOG[2] = (__fp16)bias_c_O[q];
                bias_c_IFOG[3] = (__fp16)bias_c_G[q];
                bias_c_IFOG[4] = (__fp16)bias_c_I[q + 1];
                bias_c_IFOG[5] = (__fp16)bias_c_F[q + 1];
                bias_c_IFOG[6] = (__fp16)bias_c_O[q + 1];
                bias_c_IFOG[7] = (__fp16)bias_c_G[q + 1];

                bias_c_IFOG += 8;

                const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q);
                const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q);
                const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q);
                const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q);
                const float* weight_xc_I_1 = weight_xc.row(hidden_size * 0 + q + 1);
                const float* weight_xc_F_1 = weight_xc.row(hidden_size * 1 + q + 1);
                const float* weight_xc_O_1 = weight_xc.row(hidden_size * 2 + q + 1);
                const float* weight_xc_G_1 = weight_xc.row(hidden_size * 3 + q + 1);

                const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q);
                const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q);
                const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q);
                const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q);
                const float* weight_hc_I_1 = weight_hc.row(hidden_size * 0 + q + 1);
                const float* weight_hc_F_1 = weight_hc.row(hidden_size * 1 + q + 1);
                const float* weight_hc_O_1 = weight_hc.row(hidden_size * 2 + q + 1);
                const float* weight_hc_G_1 = weight_hc.row(hidden_size * 3 + q + 1);

                __fp16* weight_xc_IFOG = weight_xc_data_packed_dr.row<__fp16>(q / 2);
                __fp16* weight_hc_IFOG = weight_hc_data_packed_dr.row<__fp16>(q / 2);

                for (int i = 0; i < size; i++)
                {
                    weight_xc_IFOG[0] = (__fp16)weight_xc_I[i];
                    weight_xc_IFOG[1] = (__fp16)weight_xc_F[i];
                    weight_xc_IFOG[2] = (__fp16)weight_xc_O[i];
                    weight_xc_IFOG[3] = (__fp16)weight_xc_G[i];
                    weight_xc_IFOG[4] = (__fp16)weight_xc_I_1[i];
                    weight_xc_IFOG[5] = (__fp16)weight_xc_F_1[i];
                    weight_xc_IFOG[6] = (__fp16)weight_xc_O_1[i];
                    weight_xc_IFOG[7] = (__fp16)weight_xc_G_1[i];

                    weight_xc_IFOG += 8;
                }

                for (int i = 0; i < num_output; i++)
                {
                    weight_hc_IFOG[0] = (__fp16)weight_hc_I[i];
                    weight_hc_IFOG[1] = (__fp16)weight_hc_F[i];
                    weight_hc_IFOG[2] = (__fp16)weight_hc_O[i];
                    weight_hc_IFOG[3] = (__fp16)weight_hc_G[i];
                    weight_hc_IFOG[4] = (__fp16)weight_hc_I_1[i];
                    weight_hc_IFOG[5] = (__fp16)weight_hc_F_1[i];
                    weight_hc_IFOG[6] = (__fp16)weight_hc_O_1[i];
                    weight_hc_IFOG[7] = (__fp16)weight_hc_G_1[i];

                    weight_hc_IFOG += 8;
                }
            }
        }
        for (; q < hidden_size; q++)
        {
            bias_c_IFOG[0] = (__fp16)bias_c_I[q];
            bias_c_IFOG[1] = (__fp16)bias_c_F[q];
            bias_c_IFOG[2] = (__fp16)bias_c_O[q];
            bias_c_IFOG[3] = (__fp16)bias_c_G[q];

            bias_c_IFOG += 4;

            const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q);
            const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q);
            const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q);
            const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q);

            const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q);
            const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q);
            const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q);
            const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q);

            const int qq = opt.use_fp16_arithmetic ? q / 2 + q % 2 : q;
            __fp16* weight_xc_IFOG = weight_xc_data_packed_dr.row<__fp16>(qq);
            __fp16* weight_hc_IFOG = weight_hc_data_packed_dr.row<__fp16>(qq);

            for (int i = 0; i < size; i++)
            {
                weight_xc_IFOG[0] = (__fp16)weight_xc_I[i];
                weight_xc_IFOG[1] = (__fp16)weight_xc_F[i];
                weight_xc_IFOG[2] = (__fp16)weight_xc_O[i];
                weight_xc_IFOG[3] = (__fp16)weight_xc_G[i];

                weight_xc_IFOG += 4;
            }

            for (int i = 0; i < num_output; i++)
            {
                weight_hc_IFOG[0] = (__fp16)weight_hc_I[i];
                weight_hc_IFOG[1] = (__fp16)weight_hc_F[i];
                weight_hc_IFOG[2] = (__fp16)weight_hc_O[i];
                weight_hc_IFOG[3] = (__fp16)weight_hc_G[i];

                weight_hc_IFOG += 4;
            }
        }
    }

    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
}

int LSTM_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // Single-input fp16-storage forward: the recurrent state starts at zero and
    // is not exposed to the caller. Returns -100 when an allocation fails,
    // otherwise whatever lstm_fp16s reports (0 on success).
    const int T = bottom_blob.h;
    const int num_directions = direction == 2 ? 2 : 1;

    // hidden / cell state are kept as 4-byte elements and zero-initialised
    Mat hidden(num_output, 4u, opt.workspace_allocator);
    if (hidden.empty())
        return -100;
    hidden.fill(0.f);

    Mat cell(hidden_size, 4u, opt.workspace_allocator);
    if (cell.empty())
        return -100;
    cell.fill(0.f);

    // output rows are 2-byte elements, num_output values per direction
    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // the projection weight is only handed over when num_output differs from hidden_size
    const bool has_projection = num_output != hidden_size;

    if (direction == 0 || direction == 1)
    {
        // uni-directional: write straight into top_blob
        return lstm_fp16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), has_projection ? weight_hr_data.channel(0) : Mat(), hidden, cell, opt);
    }

    if (direction != 2)
        return 0;

    // bi-directional: run both passes into scratch blobs, then concatenate per row
    Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
    if (top_blob_forward.empty())
        return -100;

    Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
    if (top_blob_reverse.empty())
        return -100;

    const int ret_forward = lstm_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), has_projection ? weight_hr_data.channel(0) : Mat(), hidden, cell, opt);
    if (ret_forward != 0)
        return ret_forward;

    // the reverse pass starts from a fresh zero state
    hidden.fill(0.f);
    cell.fill(0.f);

    const int ret_reverse = lstm_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), has_projection ? weight_hr_data.channel(1) : Mat(), hidden, cell, opt);
    if (ret_reverse != 0)
        return ret_reverse;

    // concat w: [forward | reverse] for every time step
    const size_t half_row_bytes = num_output * sizeof(__fp16);
    for (int t = 0; t < T; t++)
    {
        const __fp16* fwd_row = top_blob_forward.row<const __fp16>(t);
        const __fp16* rev_row = top_blob_reverse.row<const __fp16>(t);
        __fp16* out_row = top_blob.row<__fp16>(t);

        memcpy(out_row, fwd_row, half_row_bytes);
        memcpy(out_row + num_output, rev_row, half_row_bytes);
    }

    return 0;
}

int LSTM_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    // Multi-blob fp16-storage forward.
    //   bottom_blobs[0]            input sequence
    //   bottom_blobs[1], [2]       initial hidden / cell state, read only when bottom_blobs.size() == 3
    //   top_blobs[0]               output sequence
    //   top_blobs[1], [2]          final hidden / cell state, written only when top_blobs.size() == 3
    // Returns 0 on success, -100 when a blob could not be allocated, or the
    // non-zero status reported by lstm_fp16s.
    const Mat& bottom_blob = bottom_blobs[0];
    int T = bottom_blob.h;
    int num_directions = direction == 2 ? 2 : 1;

    Mat hidden;
    Mat cell;
    // the state lives in the blob allocator only when it is handed back to the caller
    Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator;
    if (bottom_blobs.size() == 3)
    {
        Option opt_cast = opt;
        opt_cast.blob_allocator = hidden_cell_allocator;
        cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
        cast_float16_to_float32(bottom_blobs[2], cell, opt_cast);

        // the outcome of the casts is only observable through the destination
        // blobs, so treat an empty state like any other failed allocation
        if (hidden.empty() || cell.empty())
            return -100;
    }
    else
    {
        // no initial state supplied: one zeroed row per direction
        hidden.create(num_output, num_directions, 4u, hidden_cell_allocator);
        if (hidden.empty())
            return -100;
        hidden.fill(0.f);

        cell.create(hidden_size, num_directions, 4u, hidden_cell_allocator);
        if (cell.empty())
            return -100;
        cell.fill(0.f);
    }

    Mat& top_blob = top_blobs[0];
    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = lstm_fp16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        // row 0 of the state blobs belongs to the forward pass
        Mat hidden0 = hidden.row_range(0, 1);
        Mat cell0 = cell.row_range(0, 1);
        {
            int ret = lstm_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden0, cell0, opt);
            if (ret != 0)
                return ret;
        }

        // row 1 of the state blobs belongs to the reverse pass
        Mat hidden1 = hidden.row_range(1, 1);
        Mat cell1 = cell.row_range(1, 1);
        {
            int ret = lstm_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden1, cell1, opt);
            if (ret != 0)
                return ret;
        }

        // concat w: [forward | reverse] for every time step
        for (int i = 0; i < T; i++)
        {
            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
            __fp16* ptr = top_blob.row<__fp16>(i);

            memcpy(ptr, pf, num_output * sizeof(__fp16));
            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
        }
    }

    if (top_blobs.size() == 3)
    {
        // hand the final state back in fp16
        cast_float32_to_float16(hidden, top_blobs[1], opt);
        cast_float32_to_float16(cell, top_blobs[2], opt);

        // same rule as for the inputs: an empty destination means the cast failed
        if (top_blobs[1].empty() || top_blobs[2].empty())
            return -100;
    }

    return 0;
}
#endif

} // namespace ncnn


================================================
FILE: src/layer/arm/lstm_arm_vfpv4.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cpu.h"
#include "mat.h"
#include "layer.h"
#include "arm_activation.h"
#include "arm_usability.h"

namespace ncnn {

#include "lstm_int8.h"

// vfpv4 entry point for the int8 LSTM gate activation / output step.
// It only forwards every argument unchanged to lstm_int8_gate_output(), i.e. to
// the copy of that function pulled into this translation unit by the
// lstm_int8.h include above.
// NOTE(review): presumably this file is built with vfpv4 flags so that the
// (__ARM_FP & 2) paths of the header are active here and the header's runtime
// dispatch back to this function is compiled out; confirm against the build
// configuration.
void lstm_int8_gate_output_vfpv4(const Mat& gates, const Mat& weight_hr, Mat& hidden_state, Mat& tmp_hidden_state, Mat& cell_state, Mat& top_blob, int ti, int elemtype, const Option& opt)
{
    lstm_int8_gate_output(gates, weight_hr, hidden_state, tmp_hidden_state, cell_state, top_blob, ti, elemtype, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/arm/lstm_int8.h
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Runtime-dispatch targets. Each prototype is visible only when runtime CPU
// dispatch is enabled and the including translation unit itself is compiled
// without the corresponding feature; the generic functions below call them
// after a cpu_support_arm_*() check.

// asimddp variants, declared for aarch64 builds lacking __ARM_FEATURE_DOTPROD.
// NOTE(review): their definitions are not in this header; presumably they live
// in a separate asimddp translation unit.
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD
void lstm_transform_weight_int8_asimddp(const Mat& weight_xc, const Mat& weight_xc_int8_scales, const Mat& weight_hc, const Mat& weight_hc_int8_scales, const Mat& bias_c, Mat& weight_data_tm, Mat& weight_data_tm_int8_descales, Mat& bias_c_tm, int size, int num_output, int num_directions, int hidden_size, const Option& opt);
void lstm_int8_asimddp(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_descales, Mat& top_blob, int elemtype, int reverse, const Mat& weight_data_tm, const Mat& weight_data_tm_int8_descales, const Mat& bias_c, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt);
#endif

// vfpv4 variant of the gate/output step, declared for NEON builds where
// (__ARM_FP & 2) is not set.
#if NCNN_RUNTIME_CPU && NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2)
void lstm_int8_gate_output_vfpv4(const Mat& gates, const Mat& weight_hr, Mat& hidden_state, Mat& tmp_hidden_state, Mat& cell_state, Mat& top_blob, int ti, int elemtype, const Option& opt);
#endif

// Repack the int8 LSTM weights into the layout consumed by lstm_int8().
//
// For every direction and every hidden unit q this writes:
//   - bias_c_tm:                    the four gate biases of q, interleaved as I, F, O, G
//   - weight_data_tm row q:         the input weights (size columns) followed by the
//                                   recurrent weights (num_output columns), with the
//                                   four gates interleaved in blocks whose width
//                                   depends on the instruction set (see below)
//   - weight_data_tm_int8_descales: 8 floats, the reciprocals of the four input-weight
//                                   scales followed by the four recurrent-weight scales
//
// The three output Mats are (re)created here. When runtime dispatch is enabled
// and the CPU reports asimddp support, the whole job is delegated to
// lstm_transform_weight_int8_asimddp() instead.
static void lstm_transform_weight_int8(const Mat& weight_xc, const Mat& weight_xc_int8_scales, const Mat& weight_hc, const Mat& weight_hc_int8_scales, const Mat& bias_c, Mat& weight_data_tm, Mat& weight_data_tm_int8_descales, Mat& bias_c_tm, int size, int num_output, int num_directions, int hidden_size, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD
    if (ncnn::cpu_support_arm_asimddp())
    {
        lstm_transform_weight_int8_asimddp(weight_xc, weight_xc_int8_scales, weight_hc, weight_hc_int8_scales, bias_c, weight_data_tm, weight_data_tm_int8_descales, bias_c_tm, size, num_output, num_directions, hidden_size, opt);
        return;
    }
#endif

    // one row per hidden unit; every column is a 4-byte element holding one int8 per gate
    weight_data_tm.create(size + num_output, hidden_size, num_directions, 4u, 4);
    // 4 input-weight descales + 4 recurrent-weight descales per hidden unit
    weight_data_tm_int8_descales.create(4 + 4, hidden_size, num_directions);
    // 4 floats (I, F, O, G) per hidden unit
    bias_c_tm.create(hidden_size, 1, num_directions, 16u, 4);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int dr = 0; dr < num_directions; dr++)
    {
        const Mat weight_xc_dr = weight_xc.channel(dr);
        const Mat weight_hc_dr = weight_hc.channel(dr);
        const Mat bias_c_dr = bias_c.channel(dr);
        const float* weight_xc_int8_scales_ptr = weight_xc_int8_scales.row(dr);
        const float* weight_hc_int8_scales_ptr = weight_hc_int8_scales.row(dr);

        Mat weight_data_tm_dr = weight_data_tm.channel(dr);
        Mat bias_c_tm_dr = bias_c_tm.channel(dr);
        Mat weight_data_tm_int8_descales_dr = weight_data_tm_int8_descales.channel(dr);

        // source biases: one row per gate
        const float* bias_c_I = bias_c_dr.row(0);
        const float* bias_c_F = bias_c_dr.row(1);
        const float* bias_c_O = bias_c_dr.row(2);
        const float* bias_c_G = bias_c_dr.row(3);

        float* bias_c_IFOG = bias_c_tm_dr.row(0);

        int q = 0;
        for (; q < hidden_size; q++)
        {
            bias_c_IFOG[0] = bias_c_I[q];
            bias_c_IFOG[1] = bias_c_F[q];
            bias_c_IFOG[2] = bias_c_O[q];
            bias_c_IFOG[3] = bias_c_G[q];

            bias_c_IFOG += 4;

            // source weights: the rows of gate g for unit q are at hidden_size * g + q
            const signed char* weight_xc_I = weight_xc_dr.row<const signed char>(hidden_size * 0 + q);
            const signed char* weight_xc_F = weight_xc_dr.row<const signed char>(hidden_size * 1 + q);
            const signed char* weight_xc_O = weight_xc_dr.row<const signed char>(hidden_size * 2 + q);
            const signed char* weight_xc_G = weight_xc_dr.row<const signed char>(hidden_size * 3 + q);

            const signed char* weight_hc_I = weight_hc_dr.row<const signed char>(hidden_size * 0 + q);
            const signed char* weight_hc_F = weight_hc_dr.row<const signed char>(hidden_size * 1 + q);
            const signed char* weight_hc_O = weight_hc_dr.row<const signed char>(hidden_size * 2 + q);
            const signed char* weight_hc_G = weight_hc_dr.row<const signed char>(hidden_size * 3 + q);

            // kptr walks the packed row: first all input weights, then all recurrent weights
            signed char* kptr = weight_data_tm_dr.row<signed char>(q);
            float* descales_ptr = weight_data_tm_int8_descales_dr.row(q);

            // ---- input weights ----
            int i = 0;
#if __ARM_NEON
#if __ARM_FEATURE_DOTPROD
            // dot-product layout: 4 consecutive columns of I, then F, then O, then G
            for (; i + 3 < size; i += 4)
            {
                kptr[0] = weight_xc_I[i];
                kptr[1] = weight_xc_I[i + 1];
                kptr[2] = weight_xc_I[i + 2];
                kptr[3] = weight_xc_I[i + 3];
                kptr[4] = weight_xc_F[i];
                kptr[5] = weight_xc_F[i + 1];
                kptr[6] = weight_xc_F[i + 2];
                kptr[7] = weight_xc_F[i + 3];
                kptr[8 + 0] = weight_xc_O[i];
                kptr[8 + 1] = weight_xc_O[i + 1];
                kptr[8 + 2] = weight_xc_O[i + 2];
                kptr[8 + 3] = weight_xc_O[i + 3];
                kptr[8 + 4] = weight_xc_G[i];
                kptr[8 + 5] = weight_xc_G[i + 1];
                kptr[8 + 6] = weight_xc_G[i + 2];
                kptr[8 + 7] = weight_xc_G[i + 3];
                kptr += 16;
            }
#else
            // plain NEON layout: 8 consecutive columns of I, then F, then O, then G
            for (; i + 7 < size; i += 8)
            {
                vst1_s8(kptr, vld1_s8(weight_xc_I + i));
                vst1_s8(kptr + 8, vld1_s8(weight_xc_F + i));
                vst1_s8(kptr + 16, vld1_s8(weight_xc_O + i));
                vst1_s8(kptr + 24, vld1_s8(weight_xc_G + i));
                kptr += 32;
            }
#endif // __ARM_FEATURE_DOTPROD
            // leftover columns in pairs: 2 of I, 2 of F, 2 of O, 2 of G
            for (; i + 1 < size; i += 2)
            {
                kptr[0] = weight_xc_I[i];
                kptr[1] = weight_xc_I[i + 1];
                kptr[2] = weight_xc_F[i];
                kptr[3] = weight_xc_F[i + 1];
                kptr[4] = weight_xc_O[i];
                kptr[5] = weight_xc_O[i + 1];
                kptr[6] = weight_xc_G[i];
                kptr[7] = weight_xc_G[i + 1];
                kptr += 8;
            }
#endif // __ARM_NEON
            // last columns (every column without NEON): I, F, O, G of a single column
            for (; i < size; i++)
            {
                kptr[0] = weight_xc_I[i];
                kptr[1] = weight_xc_F[i];
                kptr[2] = weight_xc_O[i];
                kptr[3] = weight_xc_G[i];
                kptr += 4;
            }

            // ---- recurrent weights, same blocking as above, appended to the same row ----
            i = 0;
#if __ARM_NEON
#if __ARM_FEATURE_DOTPROD
            for (; i + 3 < num_output; i += 4)
            {
                kptr[0] = weight_hc_I[i];
                kptr[1] = weight_hc_I[i + 1];
                kptr[2] = weight_hc_I[i + 2];
                kptr[3] = weight_hc_I[i + 3];
                kptr[4] = weight_hc_F[i];
                kptr[5] = weight_hc_F[i + 1];
                kptr[6] = weight_hc_F[i + 2];
                kptr[7] = weight_hc_F[i + 3];
                kptr[8 + 0] = weight_hc_O[i];
                kptr[8 + 1] = weight_hc_O[i + 1];
                kptr[8 + 2] = weight_hc_O[i + 2];
                kptr[8 + 3] = weight_hc_O[i + 3];
                kptr[8 + 4] = weight_hc_G[i];
                kptr[8 + 5] = weight_hc_G[i + 1];
                kptr[8 + 6] = weight_hc_G[i + 2];
                kptr[8 + 7] = weight_hc_G[i + 3];
                kptr += 16;
            }
#else
            for (; i + 7 < num_output; i += 8)
            {
                vst1_s8(kptr, vld1_s8(weight_hc_I + i));
                vst1_s8(kptr + 8, vld1_s8(weight_hc_F + i));
                vst1_s8(kptr + 16, vld1_s8(weight_hc_O + i));
                vst1_s8(kptr + 24, vld1_s8(weight_hc_G + i));
                kptr += 32;
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; i + 1 < num_output; i += 2)
            {
                kptr[0] = weight_hc_I[i];
                kptr[1] = weight_hc_I[i + 1];
                kptr[2] = weight_hc_F[i];
                kptr[3] = weight_hc_F[i + 1];
                kptr[4] = weight_hc_O[i];
                kptr[5] = weight_hc_O[i + 1];
                kptr[6] = weight_hc_G[i];
                kptr[7] = weight_hc_G[i + 1];
                kptr += 8;
            }
#endif // __ARM_NEON
            for (; i < num_output; i++)
            {
                kptr[0] = weight_hc_I[i];
                kptr[1] = weight_hc_F[i];
                kptr[2] = weight_hc_O[i];
                kptr[3] = weight_hc_G[i];
                kptr += 4;
            }

            // store reciprocal scales so the consumer can multiply instead of divide:
            // [0..3] input weights I, F, O, G   [4..7] recurrent weights I, F, O, G
            descales_ptr[0] = 1.f / weight_xc_int8_scales_ptr[hidden_size * 0 + q];
            descales_ptr[1] = 1.f / weight_xc_int8_scales_ptr[hidden_size * 1 + q];
            descales_ptr[2] = 1.f / weight_xc_int8_scales_ptr[hidden_size * 2 + q];
            descales_ptr[3] = 1.f / weight_xc_int8_scales_ptr[hidden_size * 3 + q];
            descales_ptr[4] = 1.f / weight_hc_int8_scales_ptr[hidden_size * 0 + q];
            descales_ptr[5] = 1.f / weight_hc_int8_scales_ptr[hidden_size * 1 + q];
            descales_ptr[6] = 1.f / weight_hc_int8_scales_ptr[hidden_size * 2 + q];
            descales_ptr[7] = 1.f / weight_hc_int8_scales_ptr[hidden_size * 3 + q];
        }
    }
}

// Apply the LSTM cell non-linearities for one time step and emit the output row.
//
//   gates            hidden_size rows of 4 floats: pre-activation I, F, O, G of each unit
//   weight_hr        projection weights, one row per output; read only when
//                    num_output != hidden_size
//   hidden_state     receives the new hidden state (num_output floats)
//   tmp_hidden_state scratch for the un-projected hidden state (hidden_size floats),
//                    used only when num_output != hidden_size
//   cell_state       updated in place (hidden_size floats)
//   top_blob         row ti receives the output, stored according to elemtype
//   elemtype         1 = fp32, 2 = fp16, 4 = bf16
//
// num_output is taken from top_blob.w and hidden_size from cell_state.w.
// When runtime dispatch is enabled and the CPU reports vfpv4 support, the work
// is delegated to lstm_int8_gate_output_vfpv4().
static void lstm_int8_gate_output(const Mat& gates, const Mat& weight_hr, Mat& hidden_state, Mat& tmp_hidden_state, Mat& cell_state, Mat& top_blob, int ti, int elemtype, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2)
    if (ncnn::cpu_support_arm_vfpv4())
    {
        lstm_int8_gate_output_vfpv4(gates, weight_hr, hidden_state, tmp_hidden_state, cell_state, top_blob, ti, elemtype, opt);
        return;
    }
#endif

    const int num_output = top_blob.w;
    const int hidden_size = cell_state.w;

    // lstm unit
    // sigmoid(I)
    // sigmoid(F)
    // sigmoid(O)
    // tanh(G)
    // c_t := f_t .* c_{t-1} + i_t .* g_t
    // h_t := o_t .* tanh[c_t]
    float* output_data = top_blob.row(ti);

    float* cell_ptr = cell_state;
    float* hidden_ptr = hidden_state;
    float* tmp_hidden_ptr = tmp_hidden_state;

    int remain_hidden_size_start = 0;
#if __ARM_NEON
    // four hidden units per iteration
    int nn_hidden_size = hidden_size >> 2;
    remain_hidden_size_start = nn_hidden_size << 2;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int qq = 0; qq < nn_hidden_size; qq++)
    {
        int q = qq * 4;

        const float* gates_data = gates.row(q);

        // de-interleave 4 rows of IFOG into one vector per gate
        float32x4x4_t _IFOG_4x4 = vld4q_f32(gates_data);

        float32x4_t _lstm_I = sigmoid_ps(_IFOG_4x4.val[0]);
        float32x4_t _lstm_F = sigmoid_ps(_IFOG_4x4.val[1]);
        float32x4_t _lstm_O = sigmoid_ps(_IFOG_4x4.val[2]);
        float32x4_t _lstm_G = tanh_ps(_IFOG_4x4.val[3]);

        float32x4_t _cell2 = vaddq_f32(vmulq_f32(_lstm_F, vld1q_f32(cell_ptr + q)), vmulq_f32(_lstm_I, _lstm_G));
        float32x4_t _lstm_H = vmulq_f32(_lstm_O, tanh_ps(_cell2));

        vst1q_f32(cell_ptr + q, _cell2);

        if (num_output == hidden_size)
        {
            // no projection: H is both the new hidden state and the output
            vst1q_f32(hidden_ptr + q, _lstm_H);

            if (elemtype == 1)
            {
                // fp32
                vst1q_f32(output_data + q, _lstm_H);
            }
            if (elemtype == 2)
            {
                // fp16
                // outptr already points at element q of the fp16 output row
                unsigned short* outptr = (unsigned short*)output_data + q;
#if (__ARM_FP & 2)
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                asm volatile(
                    "fcvtn  v0.4h, %2.4s        \n"
                    "st1    {v0.4h}, [%0]       \n"
                    : "=r"(outptr) // %0
                    : "0"(outptr),
                    "w"(_lstm_H)
                    : "memory", "v0");
#else  // __aarch64__
                asm volatile(
                    "vcvt.f16.f32 d0, %q2       \n"
                    "vst1.u16   {d0}, [%0]      \n"
                    : "=r"(outptr) // %0
                    : "0"(outptr),
                    "w"(_lstm_H)
                    : "memory", "q0");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                vst1_u16(outptr, (uint16x4_t)vcvt_f16_f32(_lstm_H));
#endif // NCNN_GNU_INLINE_ASM
#else
                // software conversion; index relative to outptr, which is
                // already offset by q (indexing with q again would write to 2 * q)
                outptr[0] = float32_to_float16(hidden_ptr[q]);
                outptr[1] = float32_to_float16(hidden_ptr[q + 1]);
                outptr[2] = float32_to_float16(hidden_ptr[q + 2]);
                outptr[3] = float32_to_float16(hidden_ptr[q + 3]);
#endif // (__ARM_FP & 2)
            }
            if (elemtype == 4)
            {
                // bf16
                vst1_u16((unsigned short*)output_data + q, float2bfloat(_lstm_H));
            }
        }
        else
        {
            // projection pending: keep H aside until every unit is done
            vst1q_f32(tmp_hidden_ptr + q, _lstm_H);
        }
    }
#endif // __ARM_NEON
    // scalar path for the remaining units (all units without NEON)
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = remain_hidden_size_start; q < hidden_size; q++)
    {
        const float* gates_data = gates.row(q);

        float I = gates_data[0];
        float F = gates_data[1];
        float O = gates_data[2];
        float G = gates_data[3];

        I = 1.f / (1.f + expf(-I));
        F = 1.f / (1.f + expf(-F));
        O = 1.f / (1.f + expf(-O));
        G = tanhf(G);

        float cell2 = F * cell_ptr[q] + I * G;
        float H = O * tanhf(cell2);

        cell_ptr[q] = cell2;
        if (num_output == hidden_size)
        {
            hidden_ptr[q] = H;

            if (elemtype == 1)
            {
                output_data[q] = H;
            }
            if (elemtype == 2)
            {
                ((unsigned short*)output_data)[q] = float32_to_float16(H);
            }
            if (elemtype == 4)
            {
                ((unsigned short*)output_data)[q] = float32_to_bfloat16(H);
            }
        }
        else
        {
            tmp_hidden_ptr[q] = H;
        }
    }

    if (num_output != hidden_size)
    {
        // project the hidden_size-wide state down to num_output:
        // hidden[q] = dot(tmp_hidden, weight_hr.row(q))
        // int nn_num_output = num_output >> 2;
        // int remain_num_output_start = nn_num_output << 2;
        // #pragma omp parallel for num_threads(opt.num_threads)
        // for (int qq = 0; qq < nn_num_output; qq++)
        // {
        //     int q = qq * 4;
        //
        // }
        int remain_num_output_start = 0;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
            const float* hr = weight_hr.row(q);
            const float* tmp_hidden_ptr = tmp_hidden_state;

            float H = 0;
            for (int i = 0; i < hidden_size; i++)
            {
                H += tmp_hidden_ptr[i] * hr[i];
            }

            hidden_ptr[q] = H;

            if (elemtype == 1)
            {
                output_data[q] = H;
            }
            if (elemtype == 2)
            {
                ((unsigned short*)output_data)[q] = float32_to_float16(H);
            }
            if (elemtype == 4)
            {
                ((unsigned short*)output_data)[q] = float32_to_bfloat16(H);
            }
        }
    }
}

static void lstm_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_descales, Mat& top_blob, int elemtype, int reverse, const Mat& weight_data_tm, const Mat& weight_data_tm_int8_descales, const Mat& bias_c, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD
    if (ncnn::cpu_support_arm_asimddp())
    {
        lstm_int8_asimddp(bottom_blob_int8, bottom_blob_int8_descales, top_blob, elemtype, reverse, weight_data_tm, weight_data_tm_int8_descales, bias_c, weight_hr, hidden_state, cell_state, opt);
        return;
    }
#endif

    int size = bottom_blob_int8.w;
    int T = bottom_blob_int8.h;

    int num_output = top_blob.w;
    int hidden_size = cell_state.w;

    // 4 x hidden_size
    Mat gates(4, hidden_size, 4u, opt.workspace_allocator);

    Mat tmp_hidden_state;
    if (num_output != hidden_size)
    {
        tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator);
    }

    Mat hidden_state_int8(num_output, (size_t)1u, 1, opt.workspace_allocator);
    float hidden_state_int8_scale = 1.f;
    float hidden_state_int8_descale = 1.f;

    // unroll
    for (int t = 0; t < T; t++)
    {
        int ti = reverse ? T - 1 - t : t;

        // dynamic quantize hidden_state
        {
            float absmax = 0.f;
            for (int i = 0; i < num_output; i++)
            {
                absmax = std::max(absmax, (float)fabs(hidden_state[i]));
            }

            if (absmax == 0.f)
            {
                hidden_state_int8.fill<signed char>(0);
            }
            else
            {
                hidden_state_int8_scale = 127.f / absmax;
                hidden_state_int8_descale = absmax / 127.f;

                signed char* hs = hidden_state_int8;
                for (int i = 0; i < num_output; i++)
                {
                    hs[i] = float2int8(hidden_state[i] * hidden_state_int8_scale);
                }
            }
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < hidden_size; q++)
        {
            const signed char* x = bottom_blob_int8.row<const signed char>(ti);
            const signed char* hs = hidden_state_int8;
            const float descale_x = bottom_blob_int8_descales[ti];
            const float descale_h = hidden_state_int8_descale;

            // gate reset update
            const float* bias_c_IFOG = (const float*)bias_c + q * 4;

            const signed char* kptr = weight_data_tm.row<const signed char>(q);
            const float* descales_ptr = weight_data_tm_int8_descales.row(q);

            float* gates_data = gates.row(q);

#if __ARM_NEON
            int32x4_t _lstm_IFOGx0 = vdupq_n_s32(0);
            int i = 0;
#if __ARM_FEATURE_DOTPROD
            int32x4_t _sum1 = vdupq_n_s32(0);
            int32x4_t _sum2 = vdupq_n_s32(0);
            int32x4_t _sum3 = vdupq_n_s32(0);
            for (; i + 15 < size; i += 16)
            {
                int8x16_t _xi = vld1q_s8(x + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                int8x16_t _w2 = vld1q_s8(kptr + 32);
                int8x16_t _w3 = vld1q_s8(kptr + 48);
                _lstm_IFOGx0 = vdotq_laneq_s32(_lstm_IFOGx0, _w0, _xi, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _w1, _xi, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _w2, _xi, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _w3, _xi, 3);

                kptr += 64;
            }
            for (; i + 7 < size; i += 8)
            {
                int8x8_t _xi = vld1_s8(x + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                _lstm_IFOGx0 = vdotq_lane_s32(_lstm_IFOGx0, _w0, _xi, 0);
                _sum1 = vdotq_lane_s32(_sum1, _w1, _xi, 1);

                kptr += 32;
            }
            _lstm_IFOGx0 = vaddq_s32(_lstm_IFOGx0, _sum1);
            _lstm_IFOGx0 = vaddq_s32(_lstm_IFOGx0, _sum2);
            _lstm_IFOGx0 = vaddq_s32(_lstm_IFOGx0, _sum3);
#else
            int32x4_t _sum0 = vdupq_n_s32(0);
            int32x4_t _sum1 = vdupq_n_s32(0);
            int32x4_t _sum2 = vdupq_n_s32(0);
            int32x4_t _sum3 = vdupq_n_s32(0);
            for (; i + 15 < size; i += 16)
            {
#if NCNN_GNU_INLINE_ASM && !__aarch64__
                const signed char* xptr = x + i;

                asm volatile(
                    "vldm       %1!, {d0-d7}        \n"
                    "vld1.s8    {d16-d17}, [%0]     \n"
                    "vmull.s8   q4, d0, d16         \n"
                    "vmull.s8   q5, d1, d16         \n"
                    "vmull.s8   q6, d2, d16         \n"
                    "vmull.s8   q7, d3, d16         \n"
                    "vmlal.s8   q4, d4, d17         \n"
                    "vmlal.s8   q5, d5, d17         \n"
                    "vmlal.s8   q6, d6, d17         \n"
                    "vmlal.s8   q7, d7, d17         \n"
                    "vpadal.s16 %q2, q4             \n"
                    "vpadal.s16 %q3, q5             \n"
                    "vpadal.s16 %q4, q6             \n"
                    "vpadal.s16 %q5, q7             \n"
                    : "=r"(xptr), "=r"(kptr), "=w"(_sum0), "=w"(_sum1), "=w"(_sum2), "=w"(_sum3)
                    : "0"(xptr), "1"(kptr), "2"(_sum0), "3"(_sum1), "4"(_sum2), "5"(_sum3)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8");
#else
                int8x16_t _xi = vld1q_s8(x + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                int8x16_t _w2 = vld1q_s8(kptr + 32);
                int8x16_t _w3 = vld1q_s8(kptr + 48);

                int16x8_t _s0 = vmull_s8(vget_low_s8(_w0), vget_low_s8(_xi));
                int16x8_t _s1 = vmull_s8(vget_high_s8(_w0), vget_low_s8(_xi));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_w1), vget_low_s8(_xi));
                int16x8_t _s3 = vmull_s8(vget_high_s8(_w1), vget_low_s8(_xi));
                _s0 = vmlal_s8(_s0, vget_low_s8(_w2), vget_high_s8(_xi));
                _s1 = vmlal_s8(_s1, vget_high_s8(_w2), vget_high_s8(_xi));
                _s2 = vmlal_s8(_s2, vget_low_s8(_w3), vget_high_s8(_xi));
                _s3 = vmlal_s8(_s3, vget_high_s8(_w3), vget_high_s8(_xi));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);

                kptr += 64;
#endif
            }
            for (; i + 7 < size; i += 8)
            {
                int8x8_t _xi = vld1_s8(x + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);

                int16x8_t _s0 = vmull_s8(vget_low_s8(_w0), _xi);
                int16x8_t _s1 = vmull_s8(vget_high_s8(_w0), _xi);
                int16x8_t _s2 = vmull_s8(vget_low_s8(_w1), _xi);
                int16x8_t _s3 = vmull_s8(vget_high_s8(_w1), _xi);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);

                kptr += 32;
            }
            {
                int32x4x2_t _tmp0 = vzipq_s32(_sum0, _sum1);
                int32x4x2_t _tmp1 = vzipq_s32(_sum2, _sum3);
                _sum0 = vcombine_s32(vget_low_s32(_tmp0.val[0]), vget_low_s32(_tmp1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_tmp0.val[0]), vget_high_s32(_tmp1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_tmp0.val[1]), vget_low_s32(_tmp1.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_tmp0.val[1]), vget_high_s32(_tmp1.val[1]));
            }
            _lstm_IFOGx0 = vaddq_s32(_lstm_IFOGx0, _sum0);
            _lstm_IFOGx0 = vaddq_s32(_lstm_IFOGx0, _sum1);
            _lstm_IFOGx0 = vaddq_s32(_lstm_IFOGx0, _sum2);
            _lstm_IFOGx0 = vaddq_s32(_lstm_IFOGx0, _sum3);
#endif // __ARM_FEATURE_DOTPROD
            for (; i + 3 < size; i += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _xi = vld1_s8(x + i);
                int8x16_t _w = vld1q_s8(kptr);
                _lstm_IFOGx0 = vdotq_lane_s32(_lstm_IFOGx0, _w, _xi, 0);
#else
                int16x4_t _xi01 = vreinterpret_s16_s8(vld1_s8(x + i));
                int8x8_t _xi0 = vreinterpret_s8_s16(vdup_lane_s16(_xi01, 0));
                int8x8_t _xi1 = vreinterpret_s8_s16(vdup_lane_s16(_xi01, 1));
                int8x16_t _w01 = vld1q_s8(kptr);

                int16x8_t _lstm_IFOGx = vmull_s8(vget_low_s8(_w01), _xi0);
                _lstm_IFOGx = vmlal_s8(_lstm_IFOGx, vget_high_s8(_w01), _xi1);
                _lstm_IFOGx0 = vpadalq_s16(_lstm_IFOGx0, _lstm_IFOGx);
#endif // __ARM_FEATURE_DOTPROD

                kptr += 16;
            }
            for (; i + 1 < size; i += 2)
            {
                int8x8_t _xi = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vld1_s8(x + i)), 0));
                int8x8_t _w = vld1_s8(kptr);

                int16x8_t _lstm_IFOGx = vmull_s8(_w, _xi);
                _lstm_IFOGx0 = vpadalq_s16(_lstm_IFOGx0, _lstm_IFOGx);

                kptr += 8;
            }
            for (; i < size; i++)
            {
                int8x8_t _xi = vdup_n_s8(x[i]);
                int8x8_t _w = vld1_s8(kptr);

                int16x8_t _lstm_IFOGx = vmull_s8(_w, _xi);
                _lstm_IFOGx0 = vaddw_s16(_lstm_IFOGx0, vget_low_s16(_lstm_IFOGx));

                kptr += 4;
            }

            int32x4_t _lstm_IFOGh0 = vdupq_n_s32(0);
            i = 0;
#if __ARM_FEATURE_DOTPROD
            _sum1 = vdupq_n_s32(0);
            _sum2 = vdupq_n_s32(0);
            _sum3 = vdupq_n_s32(0);
            for (; i + 15 < num_output; i += 16)
            {
                int8x16_t _h_cont = vld1q_s8(hs + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                int8x16_t _w2 = vld1q_s8(kptr + 32);
                int8x16_t _w3 = vld1q_s8(kptr + 48);
                _lstm_IFOGh0 = vdotq_laneq_s32(_lstm_IFOGh0, _w0, _h_cont, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _w1, _h_cont, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _w2, _h_cont, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _w3, _h_cont, 3);

                kptr += 64;
            }
            for (; i + 7 < num_output; i += 8)
            {
                int8x8_t _h_cont = vld1_s8(hs + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                _lstm_IFOGh0 = vdotq_lane_s32(_lstm_IFOGh0, _w0, _h_cont, 0);
                _sum1 = vdotq_lane_s32(_sum1, _w1, _h_cont, 1);

                kptr += 32;
            }
            _lstm_IFOGh0 = vaddq_s32(_lstm_IFOGh0, _sum1);
            _lstm_IFOGh0 = vaddq_s32(_lstm_IFOGh0, _sum2);
            _lstm_IFOGh0 = vaddq_s32(_lstm_IFOGh0, _sum3);
#else
            _sum0 = vdupq_n_s32(0);
            _sum1 = vdupq_n_s32(0);
            _sum2 = vdupq_n_s32(0);
            _sum3 = vdupq_n_s32(0);
            for (; i + 15 < num_output; i += 16)
            {
#if NCNN_GNU_INLINE_ASM && !__aarch64__
                const signed char* hsptr = hs + i;

                asm volatile(
                    "vldm       %1!, {d0-d7}        \n"
                    "vld1.s8    {d16-d17}, [%0]     \n"
                    "vmull.s8   q4, d0, d16         \n"
                    "vmull.s8   q5, d1, d16         \n"
                    "vmull.s8   q6, d2, d16         \n"
                    "vmull.s8   q7, d3, d16         \n"
                    "vmlal.s8   q4, d4, d17         \n"
                    "vmlal.s8   q5, d5, d17         \n"
                    "vmlal.s8   q6, d6, d17         \n"
                    "vmlal.s8   q7, d7, d17         \n"
                    "vpadal.s16 %q2, q4             \n"
                    "vpadal.s16 %q3, q5             \n"
                    "vpadal.s16 %q4, q6             \n"
                    "vpadal.s16 %q5, q7             \n"
                    : "=r"(hsptr), "=r"(kptr), "=w"(_sum0), "=w"(_sum1), "=w"(_sum2), "=w"(_sum3)
                    : "0"(hsptr), "1"(kptr), "2"(_sum0), "3"(_sum1), "4"(_sum2), "5"(_sum3)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8");
#else
                int8x16_t _h_cont = vld1q_s8(hs + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                int8x16_t _w2 = vld1q_s8(kptr + 32);
                int8x16_t _w3 = vld1q_s8(kptr + 48);

                int16x8_t _s0 = vmull_s8(vget_low_s8(_w0), vget_low_s8(_h_cont));
                int16x8_t _s1 = vmull_s8(vget_high_s8(_w0), vget_low_s8(_h_cont));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_w1), vget_low_s8(_h_cont));
                int16x8_t _s3 = vmull_s8(vget_high_s8(_w1), vget_low_s8(_h_cont));
                _s0 = vmlal_s8(_s0, vget_low_s8(_w2), vget_high_s8(_h_cont));
                _s1 = vmlal_s8(_s1, vget_high_s8(_w2), vget_high_s8(_h_cont));
                _s2 = vmlal_s8(_s2, vget_low_s8(_w3), vget_high_s8(_h_cont));
                _s3 = vmlal_s8(_s3, vget_high_s8(_w3), vget_high_s8(_h_cont));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);

                kptr += 64;
#endif
            }
            for (; i + 7 < num_output; i += 8)
            {
                int8x8_t _h_cont = vld1_s8(hs + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);

                int16x8_t _s0 = vmull_s8(vget_low_s8(_w0), _h_cont);
                int16x8_t _s1 = vmull_s8(vget_high_s8(_w0), _h_cont);
                int16x8_t _s2 = vmull_s8(vget_low_s8(_w1), _h_cont);
                int16x8_t _s3 = vmull_s8(vget_high_s8(_w1), _h_cont);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);

                kptr += 32;
            }
            {
                int32x4x2_t _tmp0 = vzipq_s32(_sum0, _sum1);
                int32x4x2_t _tmp1 = vzipq_s32(_sum2, _sum3);
                _sum0 = vcombine_s32(vget_low_s32(_tmp0.val[0]), vget_low_s32(_tmp1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_tmp0.val[0]), vget_high_s32(_tmp1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_tmp0.val[1]), vget_low_s32(_tmp1.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_tmp0.val[1]), vget_high_s32(_tmp1.val[1]));
            }
            _lstm_IFOGh0 = vaddq_s32(_lstm_IFOGh0, _sum0);
            _lstm_IFOGh0 = vaddq_s32(_lstm_IFOGh0, _sum1);
            _lstm_IFOGh0 = vaddq_s32(_lstm_IFOGh0, _sum2);
            _lstm_IFOGh0 = vaddq_s32(_lstm_IFOGh0, _sum3);
#endif // __ARM_FEATURE_DOTPROD
            for (; i + 3 < num_output; i += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _h_cont = vld1_s8(hs + i);
                int8x16_t _w = vld1q_s8(kptr);
                _lstm_IFOGh0 = vdotq_lane_s32(_lstm_IFOGh0, _w, _h_cont, 0);
#else
                int16x4_t _h_cont01 = vreinterpret_s16_s8(vld1_s8(hs + i));
                int8x8_t _h_cont0 = vreinterpret_s8_s16(vdup_lane_s16(_h_cont01, 0));
                int8x8_t _h_cont1 = vreinterpret_s8_s16(vdup_lane_s16(_h_cont01, 1));
                int8x16_t _w01 = vld1q_s8(kptr);

                int16x8_t _lstm_IFOGh = vmull_s8(vget_low_s8(_w01), _h_cont0);
                _lstm_IFOGh = vmlal_s8(_lstm_IFOGh, vget_high_s8(_w01), _h_cont1);
                _lstm_IFOGh0 = vpadalq_s16(_lstm_IFOGh0, _lstm_IFOGh);
#endif // __ARM_FEATURE_DOTPROD

                kptr += 16;
            }
            for (; i + 1 < num_output; i += 2)
            {
                int8x8_t _h_cont = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vld1_s8(hs + i)), 0));
                int8x8_t _w = vld1_s8(kptr);

                int16x8_t _lstm_IFOGh = vmull_s8(_w, _h_cont);
                _lstm_IFOGh0 = vpadalq_s16(_lstm_IFOGh0, _lstm_IFOGh);

                kptr += 8;
            }
            for (; i < num_output; i++)
            {
                int8x8_t _h_cont = vdup_n_s8(hs[i]);
                int8x8_t _w = vld1_s8(kptr);

                int16x8_t _lstm_IFOGh = vmull_s8(_w, _h_cont);
                _lstm_IFOGh0 = vaddw_s16(_lstm_IFOGh0, vget_low_s16(_lstm_IFOGh));

                kptr += 4;
            }

            float32x4_t _descale_x = vdupq_n_f32(descale_x);
            float32x4_t _descale_h = vdupq_n_f32(descale_h);

            float32x4_t _lstm_IFOG0 = vld1q_f32(bias_c_IFOG);

            float32x4_t _descale_xc_IFOG = vld1q_f32(descales_ptr);

            _lstm_IFOG0 = vmlaq_f32(_lstm_IFOG0, vcvtq_f32_s32(_lstm_IFOGx0), vmulq_f32(_descale_x, _descale_xc_IFOG));

            float32x4_t _descale_hc_IFOG = vld1q_f32(descales_ptr + 4);

            _lstm_IFOG0 = vmlaq_f32(_lstm_IFOG0, vcvtq_f32_s32(_lstm_IFOGh0), vmulq_f32(_descale_h, _descale_hc_IFOG));

            vst1q_f32(gates_data, _lstm_IFOG0);
#else
            int Ix = 0;
            int Fx = 0;
            int Ox = 0;
            int Gx = 0;
            for (int i = 0; i < size; i++)
            {
                signed char xi = x[i];

                Ix += kptr[0] * xi;
                Fx += kptr[1] * xi;
                Ox += kptr[2] * xi;
                Gx += kptr[3] * xi;

                kptr += 4;
            }

            int Ih = 0;
            int Fh = 0;
            int Oh = 0;
            int Gh = 0;
            for (int i = 0; i < num_output; i++)
            {
                signed char h_cont = hs[i];

                Ih += kptr[0] * h_cont;
                Fh += kptr[1] * h_cont;
                Oh += kptr[2] * h_cont;
                Gh += kptr[3] * h_cont;

                kptr += 4;
            }

            const float descale_xc_I = descales_ptr[0];
            const float descale_xc_F = descales_ptr[1];
            const float descale_xc_O = descales_ptr[2];
            const float descale_xc_G = descales_ptr[3];
            const float descale_hc_I = descales_ptr[4];
            const float descale_hc_F = descales_ptr[5];
            const float descale_hc_O = descales_ptr[6];
            const float descale_hc_G = descales_ptr[7];

            float I = bias_c_IFOG[0] + Ix * (descale_x * descale_xc_I) + Ih * (descale_h * descale_hc_I);
            float F = bias_c_IFOG[1] + Fx * (descale_x * descale_xc_F) + Fh * (descale_h * descale_hc_F);
            float O = bias_c_IFOG[2] + Ox * (descale_x * descale_xc_O) + Oh * (descale_h * descale_hc_O);
            float G = bias_c_IFOG[3] + Gx * (descale_x * descale_xc_G) + Gh * (descale_h * descale_hc_G);

            gates_data[0] = I;
            gates_data[1] = F;
            gates_data[2] = O;
            gates_data[3] = G;
#endif // __ARM_NEON
        }

        lstm_int8_gate_output(gates, weight_hr, hidden_state, tmp_hidden_state, cell_state, top_blob, ti, elemtype, opt);
    }
}


================================================
FILE: src/layer/arm/matmul_arm.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "matmul_arm.h"

#include "layer_type.h"

#include "cpu.h"

namespace ncnn {

// Declare which storage types this layer accepts; the inner gemm layer starts
// out null and is only created by create_pipeline().
MatMul_arm::MatMul_arm()
    : gemm(0)
{
#if __ARM_NEON && NCNN_ARM82
    // fp16 storage support follows whatever the running cpu reports for asimdhp
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82

#if NCNN_BF16
    support_bf16_storage = true;
#endif // NCNN_BF16
}

// Create and configure the inner Gemm layer that forward() delegates to.
// Returns 0 on success, -1 when the Gemm layer cannot be created, or the first
// non-zero code reported by the inner layer while it is being set up.
// On failure the partially set up layer stays in `gemm`; destroy_pipeline()
// releases it.
int MatMul_arm::create_pipeline(const Option& opt)
{
    gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
    if (!gemm)
        return -1;

    // NOTE(review): the meaning of each id below follows the original inline
    // comments; a 0 for M/N/K presumably means "not fixed, taken from the
    // inputs at run time" - confirm against the Gemm parameter table.
    ncnn::ParamDict pd;
    pd.set(2, 0);      // transA
    pd.set(3, transB); // transB
    pd.set(4, 0);      // constantA
    pd.set(5, 0);      // constantB
    pd.set(6, 1);      // constantC
    pd.set(7, 0);      // M
    pd.set(8, 0);      // N
    pd.set(9, 0);      // K
    pd.set(10, -1);    // constant_broadcast_type_C = null
    pd.set(11, 0);     // output_N1M
    pd.set(12, 1);     // output_elempack

    int ret = gemm->load_param(pd);
    if (ret != 0)
        return ret;

    // no weights are handed to the inner layer
    ret = gemm->load_model(ModelBinFromMatArray(0));
    if (ret != 0)
        return ret;

    return gemm->create_pipeline(opt);
}

// Tear down and free the inner gemm layer, if one exists. Always returns 0.
int MatMul_arm::destroy_pipeline(const Option& opt)
{
    // nothing to release when no inner layer is held
    if (!gemm)
        return 0;

    gemm->destroy_pipeline(opt);
    delete gemm;

    // reset so a repeated call is a no-op
    gemm = 0;

    return 0;
}

// Multiply bottom_blobs[0] (A) by bottom_blobs[1] (B) into top_blobs[0].
// The work is dispatched on the dims of A and B; every individual 2-D product
// is delegated to the inner gemm layer, and batched inputs are handled by
// looping over channels (and depths for 4-D blobs).
// Returns 0 on success, -100 when an output blob comes back empty, -1 for an
// unsupported dims combination, or the first non-zero code from gemm->forward().
int MatMul_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& A = bottom_blobs[0];
    const Mat& B = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int Adims = A.dims;
    const int Bdims = B.dims;
    const int max_ABdims = std::max(Adims, Bdims);
    const size_t elemsize = A.elemsize;

    if (Adims == 1 && Bdims == 1)
    {
        // dot product
        std::vector<Mat> _bottom_blobs(2);
        _bottom_blobs[0] = A.reshape(A.w, 1);
        _bottom_blobs[1] = transB ? B.reshape(B.w, 1) : B.reshape(1, B.w);

        int ret = gemm->forward(_bottom_blobs, top_blobs, opt);
        if (ret != 0)
            return ret;

        // collapse the gemm result to a 1-D blob of one element
        top_blob = top_blob.reshape(1, opt.blob_allocator);
        if (top_blob.empty())
            return -100;
    }
    else if (Adims == 2 && Bdims == 2)
    {
        // matrix multiply
        int ret = gemm->forward(bottom_blobs, top_blobs, opt);
        if (ret != 0)
            return ret;
    }
    else if (Adims == 1 && Bdims == 2)
    {
        // matrix multiply
        std::vector<Mat> _bottom_blobs(2);
        _bottom_blobs[0] = A.reshape(A.w, 1);
        _bottom_blobs[1] = B;

        int ret = gemm->forward(_bottom_blobs, top_blobs, opt);
        if (ret != 0)
            return ret;

        // drop the extra dimension that was added to A
        top_blob = top_blob.reshape(top_blob.w, opt.blob_allocator);
        if (top_blob.empty())
            return -100;
    }
    else if (Adims == 2 && Bdims == 1)
    {
        // matrix multiply
        std::vector<Mat> _bottom_blobs(2);
        _bottom_blobs[0] = A;
        _bottom_blobs[1] = transB ? B.reshape(B.w, 1) : B.reshape(1, B.w);

        int ret = gemm->forward(_bottom_blobs, top_blobs, opt);
        if (ret != 0)
            return ret;

        // drop the extra dimension that was added to B
        top_blob = top_blob.reshape(top_blob.h, opt.blob_allocator);
        if (top_blob.empty())
            return -100;
    }
    else if (Adims == 1 && Bdims > 2)
    {
        // batched matrix multiply
        const int N = transB == 0 ? B.w : B.h;
        const int batch_size = B.d * B.c;

        Mat top_blob1(N, 1, batch_size, elemsize, opt.blob_allocator);
        if (top_blob1.empty())
            return -100;

        Mat A1 = A.reshape(A.w, 1);
        Mat B1 = B.reshape(B.w, B.h, batch_size);

        // the same A1 is multiplied with every batch slice of B
        for (int p = 0; p < batch_size; p++)
        {
            std::vector<Mat> _bottom_blobs(2);
            _bottom_blobs[0] = A1;
            _bottom_blobs[1] = B1.channel(p);
            std::vector<Mat> _top_blobs(1);
            _top_blobs[0] = top_blob1.channel(p);

            int ret = gemm->forward(_bottom_blobs, _top_blobs, opt);
            if (ret != 0)
                return ret;
        }

        if (Bdims == 3)
            top_blob = top_blob1.reshape(N, B.d * B.c, opt.blob_allocator);
        else
            top_blob = top_blob1.reshape(N, B.d, B.c, opt.blob_allocator);

        if (top_blob.empty())
            return -100;
    }
    else if (Adims > 2 && Bdims == 1)
    {
        // batched matrix multiply
        const int M = A.h;
        const int batch_size = A.d * A.c;

        Mat top_blob1(1, M, batch_size, elemsize, opt.blob_allocator);
        if (top_blob1.empty())
            return -100;

        Mat A1 = A.reshape(A.w, A.h, batch_size);
        Mat BT = transB ? B.reshape(B.w, 1) : B.reshape(1, B.w);

        // every batch slice of A is multiplied with the same BT
        for (int p = 0; p < batch_size; p++)
        {
            std::vector<Mat> _bottom_blobs(2);
            _bottom_blobs[0] = A1.channel(p);
            _bottom_blobs[1] = BT;
            std::vector<Mat> _top_blobs(1);
            _top_blobs[0] = top_blob1.channel(p);

            int ret = gemm->forward(_bottom_blobs, _top_blobs, opt);
            if (ret != 0)
                return ret;
        }

        if (Adims == 3)
            top_blob = top_blob1.reshape(M, A.d * A.c, opt.blob_allocator);
        else
            top_blob = top_blob1.reshape(M, A.d, A.c, opt.blob_allocator);

        if (top_blob.empty())
            return -100;
    }
    else if (max_ABdims == 3)
    {
        // promote a 2-D operand to 3-D with a single channel
        Mat A1 = Adims == 2 ? A.reshape(A.w, A.h, 1) : A;
        Mat B1 = Bdims == 2 ? B.reshape(B.w, B.h, 1) : B;

        const int M = A1.h;
        const int N = transB == 0 ? B1.w : B1.h;
        const int batch_size = std::max(A1.c, B1.c);

        top_blob.create(N, M, batch_size, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        for (int p = 0; p < batch_size; p++)
        {
            // an operand with a single channel is reused for every batch
            int Ap = A1.c == 1 ? 0 : p;
            int Bp = B1.c == 1 ? 0 : p;

            std::vector<Mat> _bottom_blobs(2);
            _bottom_blobs[0] = A1.channel(Ap);
            _bottom_blobs[1] = B1.channel(Bp);
            std::vector<Mat> _top_blobs(1);
            _top_blobs[0] = top_blob.channel(p);

            int ret = gemm->forward(_bottom_blobs, _top_blobs, opt);
            if (ret != 0)
                return ret;
        }
    }
    else if (max_ABdims == 4)
    {
        // promote a 3-D operand to 4-D with a single channel
        Mat A1 = Adims == 3 ? A.reshape(A.w, A.h, A.c, 1) : A;
        Mat B1 = Bdims == 3 ? B.reshape(B.w, B.h, B.c, 1) : B;

        const int M = A1.h;
        const int N = transB == 0 ? B1.w : B1.h;
        const int batch_size_d = std::max(A1.d, B1.d);
        const int batch_size_c = std::max(A1.c, B1.c);

        top_blob.create(N, M, batch_size_d, batch_size_c, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        for (int p = 0; p < batch_size_c; p++)
        {
            // an operand with a single channel is reused for every channel batch
            int Ap = A1.c == 1 ? 0 : p;
            int Bp = B1.c == 1 ? 0 : p;

            for (int q = 0; q < batch_size_d; q++)
            {
                // likewise for an operand with a single depth slice
                int Ad = A1.d == 1 ? 0 : q;
                int Bd = B1.d == 1 ? 0 : q;

                std::vector<Mat> _bottom_blobs(2);
                _bottom_blobs[0] = A1.channel(Ap).depth(Ad);
                _bottom_blobs[1] = B1.channel(Bp).depth(Bd);
                std::vector<Mat> _top_blobs(1);
                _top_blobs[0] = top_blob.channel(p).depth(q);

                int ret = gemm->forward(_bottom_blobs, _top_blobs, opt);
                if (ret != 0)
                    return ret;
            }
        }
    }
    else
    {
        NCNN_LOGE("impossible matmul %d %d", Adims, Bdims);
        return -1;
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/matmul_arm.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_MATMUL_ARM_H
#define LAYER_MATMUL_ARM_H

#include "matmul.h"

namespace ncnn {

// ARM flavour of MatMul; the multiplication itself is carried out by an inner
// Gemm layer that this class owns.
class MatMul_arm : public MatMul
{
public:
    MatMul_arm();

    // create / release the inner gemm layer
    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

    // inner layer used by forward(); null until create_pipeline() has run and
    // again after destroy_pipeline()
    Layer* gemm;
};

} // namespace ncnn

#endif // LAYER_MATMUL_ARM_H


================================================
FILE: src/layer/arm/mish_arm.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "mish_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

// Declare the packing and reduced-precision storage modes this layer accepts.
Mish_arm::Mish_arm()
{
#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage support follows whatever the running cpu reports for asimdhp
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82

#if NCNN_BF16
    support_bf16_storage = true;
#endif // NCNN_BF16
}

// Apply mish(x) = x * tanh(ln(1 + exp(x))) to every element of the blob in place.
// 16-bit blobs are handed to the fp16 / bf16 variants when the matching option
// is enabled; everything else is processed here as fp32. Always returns 0 on
// the fp32 path.
int Mish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int elembits = bottom_top_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        return opt.use_fp16_arithmetic ? forward_inplace_fp16sa(bottom_top_blob, opt)
               : forward_inplace_fp16s(bottom_top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;
    const int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
    if (elempack == 4)
    {
        // each of the `size` positions holds 4 packed floats, i.e. one vector
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++, ptr += 4)
            {
                const float32x4_t _x = vld1q_f32(ptr);
                const float32x4_t _softplus = log_ps(vaddq_f32(exp_ps(_x), vdupq_n_f32(1.f)));
                vst1q_f32(ptr, vmulq_f32(_x, tanh_ps(_softplus)));
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        int remaining = size;

#if __ARM_NEON
        // vector body: 4 floats per step
        for (; remaining >= 4; remaining -= 4)
        {
            const float32x4_t _x = vld1q_f32(ptr);
            const float32x4_t _softplus = log_ps(vaddq_f32(exp_ps(_x), vdupq_n_f32(1.f)));
            vst1q_f32(ptr, vmulq_f32(_x, tanh_ps(_softplus)));
            ptr += 4;
        }
#endif // __ARM_NEON

        // scalar tail (or the whole channel when NEON is unavailable)
        for (; remaining > 0; remaining--)
        {
            const float x = *ptr;
            *ptr = x * tanhf(logf(expf(x) + 1.f));
            ptr++;
        }
    }

    return 0;
}

#if NCNN_BF16
// bf16-storage variant of mish: every value is widened to fp32, run through
// x * tanh(ln(1 + exp(x))), and narrowed back to bf16 in place. Always returns 0.
int Mish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;
    const int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
    if (elempack == 4)
    {
        // each of the `size` positions holds 4 packed bf16 values
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned short* ptr = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++, ptr += 4)
            {
                const float32x4_t _x = bfloat2float(vld1_u16(ptr));
                const float32x4_t _softplus = log_ps(vaddq_f32(exp_ps(_x), vdupq_n_f32(1.f)));
                vst1_u16(ptr, float2bfloat(vmulq_f32(_x, tanh_ps(_softplus))));
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        unsigned short* ptr = bottom_top_blob.channel(q);

        int remaining = size;

#if __ARM_NEON
        // vector body: 4 bf16 values per step
        for (; remaining >= 4; remaining -= 4)
        {
            const float32x4_t _x = bfloat2float(vld1_u16(ptr));
            const float32x4_t _softplus = log_ps(vaddq_f32(exp_ps(_x), vdupq_n_f32(1.f)));
            vst1_u16(ptr, float2bfloat(vmulq_f32(_x, tanh_ps(_softplus))));
            ptr += 4;
        }
#endif // __ARM_NEON

        // scalar tail (or the whole channel when NEON is unavailable)
        for (; remaining > 0; remaining--)
        {
            const float x = bfloat16_to_float32(*ptr);
            const float y = x * tanhf(logf(expf(x) + 1.f));
            *ptr = float32_to_bfloat16(y);
            ptr++;
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/mish_arm.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_MISH_ARM_H
#define LAYER_MISH_ARM_H

#include "mish.h"

namespace ncnn {

// ARM flavour of Mish with optional reduced-precision code paths.
class Mish_arm : public Mish
{
public:
    Mish_arm();

    // entry point; picks one of the variants below for 16-bit blobs
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_BF16
    // bf16 storage, fp32 arithmetic
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif // NCNN_BF16
#if NCNN_ARM82
    // fp16 storage with fp32 arithmetic (fp16s) or fp16 arithmetic (fp16sa)
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif // NCNN_ARM82
};

} // namespace ncnn

#endif // LAYER_MISH_ARM_H


================================================
FILE: src/layer/arm/mish_arm_asimdhp.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "mish_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#include "neon_mathfun.h"
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "neon_mathfun_fp16s.h"
#endif
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// fp16-storage variant of mish with fp32 arithmetic: values are widened to
// fp32, run through x * tanh(ln(1 + exp(x))), and narrowed back to fp16 in
// place. Always returns 0.
int Mish_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;
    const int elempack = bottom_top_blob.elempack;

    if (elempack == 4)
    {
        // each of the `size` positions holds 4 packed halves
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* ptr = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++, ptr += 4)
            {
                const float32x4_t _x = vcvt_f32_f16(vld1_f16(ptr));
                const float32x4_t _softplus = log_ps(vaddq_f32(exp_ps(_x), vdupq_n_f32(1.f)));
                vst1_f16(ptr, vcvt_f16_f32(vmulq_f32(_x, tanh_ps(_softplus))));
            }
        }

        return 0;
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

        int remaining = size;

        // vector body: 4 halves per step
        for (; remaining >= 4; remaining -= 4)
        {
            const float32x4_t _x = vcvt_f32_f16(vld1_f16(ptr));
            const float32x4_t _softplus = log_ps(vaddq_f32(exp_ps(_x), vdupq_n_f32(1.f)));
            vst1_f16(ptr, vcvt_f16_f32(vmulq_f32(_x, tanh_ps(_softplus))));
            ptr += 4;
        }

        // scalar tail
        for (; remaining > 0; remaining--)
        {
            const float x = (float)*ptr;
            const float y = x * tanhf(logf(expf(x) + 1.f));
            *ptr = (__fp16)y;
            ptr++;
        }
    }

    return 0;
}

// fp16-storage variant of mish with fp16 vector arithmetic: the vector paths
// evaluate x * tanh(ln(1 + exp(x))) with the *_f16 math helpers, in place.
// Always returns 0.
int Mish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;
    const int elempack = bottom_top_blob.elempack;

    if (elempack == 8)
    {
        // each of the `size` positions holds 8 packed halves
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* ptr = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++, ptr += 8)
            {
                const float16x8_t _x = vld1q_f16(ptr);
                const float16x8_t _softplus = log_ps_f16(vaddq_f16(exp_ps_f16(_x), vdupq_n_f16(1.f)));
                vst1q_f16(ptr, vmulq_f16(_x, tanh_ps_f16(_softplus)));
            }
        }

        return 0;
    }

    if (elempack == 4)
    {
        // each of the `size` positions holds 4 packed halves
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* ptr = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++, ptr += 4)
            {
                const float16x4_t _x = vld1_f16(ptr);
                const float16x4_t _softplus = log_ps_f16(vadd_f16(exp_ps_f16(_x), vdup_n_f16(1.f)));
                vst1_f16(ptr, vmul_f16(_x, tanh_ps_f16(_softplus)));
            }
        }

        return 0;
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

        int remaining = size;

        // vector body: 4 halves per step
        for (; remaining >= 4; remaining -= 4)
        {
            const float16x4_t _x = vld1_f16(ptr);
            const float16x4_t _softplus = log_ps_f16(vadd_f16(exp_ps_f16(_x), vdup_n_f16(1.f)));
            vst1_f16(ptr, vmul_f16(_x, tanh_ps_f16(_softplus)));
            ptr += 4;
        }

        // scalar tail: the tanh factor is rounded to fp16 before the multiply
        for (; remaining > 0; remaining--)
        {
            const __fp16 x = *ptr;
            const __fp16 t = (__fp16)tanhf(logf(expf(x) + 1.f));
            *ptr = x * t;
            ptr++;
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/multiheadattention_arm.cpp
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "multiheadattention_arm.h"

#include "cpu.h"
#include "layer_type.h"

namespace ncnn {

// Set the capability flags for the ARM backend and mark every owned sub-layer
// as not yet created (they are instantiated in create_pipeline()).
MultiHeadAttention_arm::MultiHeadAttention_arm()
{
    // owned sub-layers start out unset
    qk_softmax = 0;

    qk_gemm = 0;
    qkv_gemm = 0;

    q_gemm = 0;
    k_gemm = 0;
    v_gemm = 0;
    o_gemm = 0;

    // TODO enable bf16 when gemm has proper out_elemtype support
    support_bf16_storage = false;

#if __ARM_NEON
    support_packing = true;
#if NCNN_ARM82
    // fp16 storage follows whatever cpu_support_arm_asimdhp() reports at runtime
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // NCNN_ARM82
#endif // __ARM_NEON
}

int MultiHeadAttention_arm::create_pipeline(const Option& _opt)
{
    Option opt = _opt;
    opt.use_fp16_storage &= support_fp16_storage;
    opt.use_bf16_storage &= support_bf16_storage;

    {
        qk_softmax = ncnn::create_layer_cpu(ncnn::LayerType::Softmax);
        ncnn::ParamDict pd;
        pd.set(0, -1);
        pd.set(1, 1);
        qk_softmax->load_param(pd);
        qk_softmax->load_model(ModelBinFromMatArray(0));
        qk_softmax->create_pipeline(opt);
    }

    const int qdim = weight_data_size / embed_dim;

    {
        q_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
        ncnn::ParamDict pd;
        pd.set(0, scale);
        pd.set(1, 1.f);
        pd.set(2, 0);         // transA
        pd.set(3, 1);         // transB
        pd.set(4, 1);         // constantA
        pd.set(5, 0);         // constantB
        pd.set(6, 1);         // constantC
        pd.set(7, embed_dim); // M
        pd.set(8, 0);         // N
        pd.set(9, qdim);      // K
        pd.set(10, 1);        // constant_broadcast_type_C
        pd.set(11, 0);        // output_N1M
        pd.set(12, 1);        // output_elempack
        pd.set(14, 0);        // output_transpose
#if NCNN_INT8
        pd.set(18, int8_scale_term);
#endif
        q_gemm->load_param(pd);
        Mat weights[3];
        weights[0] = q_weight_data;
        weights[1] = q_bias_data;
#if NCNN_INT8
        weights[2] = q_weight_data_int8_scales;
#endif
        q_gemm->load_model(ModelBinFromMatArray(weights));
        q_gemm->create_pipeline(opt);

        if (opt.lightmode)
        {
            q_weight_data.release();
            q_bias_data.release();
        }
    }

    {
        k_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
        ncnn::ParamDict pd;
        pd.set(2, 0);         // transA
        pd.set(3, 1);         // transB
        pd.set(4, 1);         // constantA
        pd.set(5, 0);         // constantB
        pd.set(6, 1);         // constantC
        pd.set(7, embed_dim); // M
        pd.set(8, 0);         // N
        pd.set(9, kdim);      // K
        pd.set(10, 1);        // constant_broadcast_type_C
        pd.set(11, 0);        // output_N1M
        pd.set(12, 1);        // output_elempack
        pd.set(14, 0);        // output_transpose
#if NCNN_INT8
        pd.set(18, int8_scale_term);
#endif
        k_gemm->load_param(pd);
        Mat weights[3];
        weights[0] = k_weight_data;
        weights[1] = k_bias_data;
#if NCNN_INT8
        weights[2] = k_weight_data_int8_scales;
#endif
        k_gemm->load_model(ModelBinFromMatArray(weights));
        k_gemm->create_pipeline(opt);

        if (opt.lightmode)
        {
            k_weight_data.release();
            k_bias_data.release();
        }
    }

    {
        v_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
        ncnn::ParamDict pd;
        pd.set(2, 0);         // transA
        pd.set(3, 1);         // transB
        pd.set(4, 1);         // constantA
        pd.set(5, 0);         // constantB
        pd.set(6, 1);         // constantC
        pd.set(7, embed_dim); // M
        pd.set(8, 0);         // N
        pd.set(9, vdim);      // K
        pd.set(10, 1);        // constant_broadcast_type_C
        pd.set(11, 0);        // output_N1M
        pd.set(12, 1);        // output_elempack
        pd.set(14, 0);        // output_transpose
#if NCNN_INT8
        pd.set(18, int8_scale_term);
#endif
        v_gemm->load_param(pd);
        Mat weights[3];
        weights[0] = v_weight_data;
        weights[1] = v_bias_data;
#if NCNN_INT8
        weights[2] = v_weight_data_int8_scales;
#endif
        v_gemm->load_model(ModelBinFromMatArray(weights));
        v_gemm->create_pipeline(opt);

        if (opt.lightmode)
        {
            v_weight_data.release();
            v_bias_data.release();
        }
    }

    {
        o_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
        ncnn::ParamDict pd;
        pd.set(2, 1);         // transA
        pd.set(3, 1);         // transB
        pd.set(4, 0);         // constantA
        pd.set(5, 1);         // constantB
        pd.set(6, 1);         // constantC
        pd.set(7, 0);         // M = outch
        pd.set(8, qdim);      // N = size
        pd.set(9, embed_dim); // K = maxk*inch
        pd.set(10, 4);        // constant_broadcast_type_C = null
        pd.set(11, 0);        // output_N1M
#if NCNN_INT8
        pd.set(18, int8_scale_term);
#endif
        o_gemm->load_param(pd);
        Mat weights[3];
        weights[0] = out_weight_data;
        weights[1] = out_bias_data;
#if NCNN_INT8
        Mat out_weight_data_int8_scales(1);
        out_weight_data_int8_scales[0] = out_weight_data_int8_scale;
        weights[2] = out_weight_data_int8_scales;
#endif
        o_gemm->load_model(ModelBinFromMatArray(weights));
        o_gemm->create_pipeline(opt);

        if (opt.lightmode)
        {
            out_weight_data.release();
            out_bias_data.release();
        }
    }

    {
        qk_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
        ncnn::ParamDict pd;
        pd.set(2, 1);                   // transA
        pd.set(3, 0);                   // transB
        pd.set(4, 0);                   // constantA
        pd.set(5, 0);                   // constantB
        pd.set(6, attn_mask ? 0 : 1);   // constantC
        pd.set(7, 0);                   // M
        pd.set(8, 0);                   // N
        pd.set(9, 0);                   // K
        pd.set(10, attn_mask ? 3 : -1); // constant_broadcast_type_C
        pd.set(11, 0);                  // output_N1M
        pd.set(12, 1);                  // output_elempack
#if NCNN_INT8
        pd.set(18, int8_scale_term);
#endif
        qk_gemm->load_param(pd);
        qk_gemm->load_model(ModelBinFromMatArray(0));
        Option opt1 = opt;
        opt1.num_threads = 1;
        qk_gemm->create_pipeline(opt1);
    }

    {
        qkv_gemm = ncnn::create_layer_cpu(ncnn::LayerType::Gemm);
        ncnn::ParamDict pd;
        pd.set(2, 0);   // transA
        pd.set(3, 1);   // transB
        pd.set(4, 0);   // constantA
        pd.set(5, 0);   // constantB
        pd.set(6, 1);   // constantC
        pd.set(7, 0);   // M
        pd.set(8, 0);   // N
        pd.set(9, 0);   // K
        pd.set(10, -1); // constant_broadcast_type_C
        pd.set(11, 0);  // output_N1M
        pd.set(12, 1);  // output_elempack
        pd.set(14, 1);  // output_transpose
#if NCNN_INT8
        pd.set(18, int8_scale_term);
#endif
        qkv_gemm->load_param(pd);
        qkv_gemm->load_model(ModelBinFromMatArray(0));
        Option opt1 = opt;
        opt1.num_threads = 1;
        qkv_gemm->create_pipeline(opt1);
    }

    return 0;
}

int MultiHeadAttention_arm::destroy_pipeline(const Option& _opt)
{
    Option opt = _opt;
    opt.use_fp16_storage &= support_fp16_storage;
    opt.use_bf16_storage &= support_bf16_storage;

    if (qk_softmax)
    {
        qk_softmax->destroy_pipeline(opt);
        delete qk_softmax;
        qk_softmax = 0;
    }

    if (q_gemm)
    {
        q_gemm->destroy_pipeline(opt);
        delete q_gemm;
        q_gemm = 0;
    }

    if (k_gemm)
    {
        k_gemm->destroy_pipeline(opt);
        delete k_gemm;
        k_gemm = 0;
    }

    if (v_gemm)
    {
        v_gemm->destroy_pipeline(opt);
        delete v_gemm;
        v_gemm = 0;
    }

    if (o_gemm)
    {
        o_gemm->destroy_pipeline(opt);
        delete o_gemm;
        o_gemm = 0;
    }

    if (qk_gemm)
    {
        qk_gemm->destroy_pipeline(opt);
        delete qk_gemm;
        qk_gemm = 0;
    }

    if (qkv_gemm)
    {
        qkv_gemm->destroy_pipeline(opt);
        delete qkv_gemm;
        qkv_gemm = 0;
    }

    return 0;
}

int MultiHeadAttention_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& _opt) const
{
    int q_blob_i = 0;
    int k_blob_i = 0;
    int v_blob_i = 0;
    int attn_mask_i = 0;
    int cached_xk_i = 0;
    int cached_xv_i = 0;
    resolve_bottom_blob_index((int)bottom_blobs.size(), q_blob_i, k_blob_i, v_blob_i, attn_mask_i, cached_xk_i, cached_xv_i);

    const Mat& q_blob = bottom_blobs[q_blob_i];
    const Mat& k_blob = bottom_blobs[k_blob_i];
    const Mat& v_blob = bottom_blobs[v_blob_i];
    const Mat& attn_mask_blob = attn_mask ? bottom_blobs[attn_mask_i] : Mat();
    const Mat& cached_xk_blob = kv_cache ? bottom_blobs[cached_xk_i] : Mat();
    const Mat& cached_xv_blob = kv_cache ? bottom_blobs[cached_xv_i] : Mat();

    Option opt = _opt;
    opt.use_fp16_storage &= support_fp16_storage;
    opt.use_bf16_storage &= support_bf16_storage;

    Mat attn_mask_blob_unpacked;
    if (attn_mask && attn_mask_blob.elempack != 1)
    {
        convert_packing(attn_mask_blob, attn_mask_blob_unpacked, 1, opt);
        if (attn_mask_blob_unpacked.empty())
            return -100;
    }
    else
    {
        attn_mask_blob_unpacked = attn_mask_blob;
    }

    Mat cached_xk_blob_unpacked;
    if (kv_cache && !cached_xk_blob.empty() && cached_xk_blob.elempack != 1)
    {
        convert_packing(cached_xk_blob, cached_xk_blob_unpacked, 1, opt);
        if (cached_xk_blob_unpacked.empty())
            return -100;
    }
    else
    {
        cached_xk_blob_unpacked = cached_xk_blob;
    }

    Mat cached_xv_blob_unpacked;
    if (kv_cache && !cached_xv_blob.empty() && cached_xv_blob.elempack != 1)
    {
        convert_packing(cached_xv_blob, cached_xv_blob_unpacked, 1, opt);
        if (cached_xv_blob_unpacked.empty())
            return -100;
    }
    else
    {
        cached_xv_blob_unpacked = cached_xv_blob;
    }

    const int embed_dim_per_head = embed_dim / num_heads;
    const int src_seqlen = q_blob.h * q_blob.elempack;
    const int cur_seqlen = k_blob.h * k_blob.elempack;
    const int past_seqlen = kv_cache && !cached_xk_blob_unpacked.empty() ? cached_xk_blob_unpacked.w : 0;
    const int dst_seqlen = past_seqlen > 0 ? (q_blob_i == k_blob_i ? (past_seqlen + cur_seqlen) : past_seqlen) : cur_seqlen;

    // const int elembits = q_blob.elembits();

    size_t elemsize = q_blob.elemsize / q_blob.elempack;

    Mat q_affine;
    int retq = q_gemm->forward(q_blob, q_affine, opt);
    if (retq != 0)
        return retq;

    Mat k_affine;
    if (past_seqlen > 0)
    {
        if (q_blob_i == k_blob_i)
        {
            Mat k_affine_q;
            int retk = k_gemm->forward(q_blob, k_affine_q, opt);
            if (retk != 0)
                return retk;

            // assert dst_seqlen == cached_xk_blob_unpacked.w + k_affine_q.w

            // merge cached_xk_blob_unpacked and k_affine_q
            k_affine.create(dst_seqlen, embed_dim, k_affine_q.elemsize);
            if (k_affine.empty())
                return -100;

            for (int i = 0; i < embed_dim; i++)
            {
                const unsigned char* ptr = cached_xk_blob_unpacked.row<const unsigned char>(i);
                const unsigned char* ptrq = k_affine_q.row<const unsigned char>(i);
                unsigned char* outptr = k_affine.row<unsigned char>(i);

                memcpy(outptr, ptr, past_seqlen * k_affine.elemsize);
                memcpy(outptr + past_seqlen * k_affine.elemsize, ptrq, cur_seqlen * k_affine.elemsize);
            }
        }
        else
        {
            k_affine = cached_xk_blob_unpacked;
        }
    }
    else
    {
        int retk = k_gemm->forward(k_blob, k_affine, opt);
        if (retk != 0)
            return retk;
    }

    Mat qk_cross(dst_seqlen, src_seqlen * num_heads, elemsize, opt.blob_allocator);
    if (qk_cross.empty())
        return -100;

    std::vector<int> retqks;
    retqks.resize(num_heads);
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int i = 0; i < num_heads; i++)
    {
        std::vector<Mat> qk_bottom_blobs(2);
        qk_bottom_blobs[0] = q_affine.row_range(i * embed_dim_per_head, embed_dim_per_head);
        qk_bottom_blobs[1] = k_affine.row_range(i * embed_dim_per_head, embed_dim_per_head);
        if (attn_mask)
        {
            const Mat& maskm = attn_mask_blob_unpacked.dims == 3 ? attn_mask_blob_unpacked.channel(i) : attn_mask_blob_unpacked;
            qk_bottom_blobs.push_back(maskm);
        }
        std::vector<Mat> qk_top_blobs(1);
        qk_top_blobs[0] = qk_cross.row_range(i * src_seqlen, src_seqlen);
        Option opt1 = opt;
        opt1.num_threads = 1;
        retqks[i] = qk_gemm->forward(qk_bottom_blobs, qk_top_blobs, opt1);
    }
    for (int i = 0; i < num_heads; i++)
    {
        if (retqks[i] != 0)
            return retqks[i];
    }

    q_affine.release();

    if (!kv_cache)
    {
        k_affine.release();
    }

    int retqk = qk_softmax->forward_inplace(qk_cross, opt);
    if (retqk != 0)
        return retqk;

    Mat v_affine;
    if (past_seqlen > 0)
    {
        if (q_blob_i == v_blob_i)
        {
            Mat v_affine_q;
            int retk = v_gemm->forward(v_blob, v_affine_q, opt);
            if (retk != 0)
                return retk;

            // assert dst_seqlen == cached_xv_blob_unpacked.w + v_affine_q.w

            // merge cached_xv_blob_unpacked and v_affine_q
            v_affine.create(dst_seqlen, embed_dim, v_affine_q.elemsize);
            if (v_affine.empty())
                return -100;

            for (int i = 0; i < embed_dim; i++)
            {
                const unsigned char* ptr = cached_xv_blob_unpacked.row<const unsigned char>(i);
                const unsigned char* ptrq = v_affine_q.row<const unsigned char>(i);
                unsigned char* outptr = v_affine.row<unsigned char>(i);

                memcpy(outptr, ptr, past_seqlen * v_affine.elemsize);
                memcpy(outptr + past_seqlen * v_affine.elemsize, ptrq, cur_seqlen * v_affine.elemsize);
            }
        }
        else
        {
            v_affine = cached_xv_blob_unpacked;
        }
    }
    else
    {
        int retv = v_gemm->forward(v_blob, v_affine, opt);
        if (retv != 0)
            return retv;
    }

    Mat qkv_cross(src_seqlen, embed_dim_per_head * num_heads, elemsize, opt.blob_allocator);
    if (qkv_cross.empty())
        return -100;

    std::vector<int> retqkvs;
    retqkvs.resize(num_heads);
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int i = 0; i < num_heads; i++)
    {
        std::vector<Mat> qkv_bottom_blobs(2);
        qkv_bottom_blobs[0] = qk_cross.row_range(i * src_seqlen, src_seqlen);
        qkv_bottom_blobs[1] = v_affine.row_range(i * embed_dim_per_head, embed_dim_per_head);
        std::vector<Mat> qkv_top_blobs(1);
        qkv_top_blobs[0] = qkv_cross.row_range(i * embed_dim_per_head, embed_dim_per_head);
        Option opt1 = opt;
        opt1.num_threads = 1;
        retqkvs[i] = qkv_gemm->forward(qkv_bottom_blobs, qkv_top_blobs, opt1);
    }
    for (int i = 0; i < num_heads; i++)
    {
        if (retqkvs[i] != 0)
            return retqkvs[i];
    }

    if (!kv_cache)
    {
        v_affine.release();
    }

    int reto = o_gemm->forward(qkv_cross, top_blobs[0], opt);
    if (reto != 0)
        return reto;

    if (kv_cache)
    {
        // assert top_blobs.size() == 3
        top_blobs[1] = k_affine;
        top_blobs[2] = v_affine;
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/multiheadattention_arm.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_MULTIHEADATTENTION_ARM_H
#define LAYER_MULTIHEADATTENTION_ARM_H

#include "multiheadattention.h"

namespace ncnn {

// ARM implementation of MultiHeadAttention that delegates the arithmetic to
// Gemm and Softmax sub-layers owned by this object.
class MultiHeadAttention_arm : public MultiHeadAttention
{
public:
    // Sets the storage/packing capability flags and zeroes the sub-layer pointers.
    MultiHeadAttention_arm();

    // Creates and initializes all sub-layers below.
    virtual int create_pipeline(const Option& opt);
    // Destroys and deletes every sub-layer that is currently set.
    virtual int destroy_pipeline(const Option& opt);

    // Runs attention over bottom_blobs and writes the result to top_blobs.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

public:
    // projection layers: query, key, value and output
    Layer* q_gemm;
    Layer* k_gemm;
    Layer* v_gemm;
    Layer* o_gemm;

    // per-head products: q x k, then softmax(q x k) x v
    Layer* qk_gemm;
    Layer* qkv_gemm;

    // softmax applied to the q x k result
    Layer* qk_softmax;
};

} // namespace ncnn

#endif // LAYER_MULTIHEADATTENTION_ARM_H


================================================
FILE: src/layer/arm/neon_mathfun.h
================================================
/* NEON implementation of sin, cos, exp and log
 *
 *   Inspired by Intel Approximate Math library, and based on the
 *   corresponding algorithms of the cephes math library
 */

/* Copyright (C) 2011  Julien Pommier
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty.  In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgment in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 *
 *  (this is the zlib license)
 */

#ifndef NEON_MATHFUN_H
#define NEON_MATHFUN_H

#include <arm_neon.h>

// Portable FMA macros: use hardware FMA on AArch64, fall back to MLA on AArch32
// VFMAQ_F32(a, b, c) yields a + b * c and VFMSQ_F32(a, b, c) yields a - b * c, per lane.
// NOTE(review): the AArch32 MLA/MLS forms may round the product separately, so the
// low-order bits can differ between the two targets - confirm if bit-exactness matters.
// Both macros are #undef'd again at the end of this header.
#if defined(__aarch64__)
#define VFMAQ_F32(a, b, c) vfmaq_f32(a, b, c)
#define VFMSQ_F32(a, b, c) vfmsq_f32(a, b, c)
#else
#define VFMAQ_F32(a, b, c) vmlaq_f32(a, b, c)
#define VFMSQ_F32(a, b, c) vmlsq_f32(a, b, c)
#endif

#define c_inv_mant_mask ~0x7f800000u
#define c_cephes_SQRTHF 0.707106781186547524
#define c_cephes_log_p0 7.0376836292E-2
#define c_cephes_log_p1 -1.1514610310E-1
#define c_cephes_log_p2 1.1676998740E-1
#define c_cephes_log_p3 -1.2420140846E-1
#define c_cephes_log_p4 +1.4249322787E-1
#define c_cephes_log_p5 -1.6668057665E-1
#define c_cephes_log_p6 +2.0000714765E-1
#define c_cephes_log_p7 -2.4999993993E-1
#define c_cephes_log_p8 +3.3333331174E-1
#define c_cephes_log_q1 -2.12194440e-4
#define c_cephes_log_q2 0.693359375

/* Natural logarithm of 4 packed floats.
 *   Lanes holding x <= 0 are returned with all bits set (a NaN pattern).
 */
static inline float32x4_t log_ps(float32x4_t x)
{
    const float32x4_t _one = vdupq_n_f32(1);
    const float32x4_t _zero = vdupq_n_f32(0);

    /* negative lanes are raised to zero; every lane that ends up <= 0 is flagged */
    x = vmaxq_f32(x, _zero);
    const uint32x4_t _not_positive = vcleq_f32(x, _zero);

    const int32x4_t _raw = vreinterpretq_s32_f32(x);

    /* exponent field with the bias removed, converted to float, plus one */
    const int32x4_t _exp_i = vsubq_s32(vshrq_n_s32(_raw, 23), vdupq_n_s32(0x7f));
    float32x4_t _e = vaddq_f32(vcvtq_f32_s32(_exp_i), _one);

    /* drop the exponent field and substitute the one of 0.5f */
    const int32x4_t _frac = vorrq_s32(vandq_s32(_raw, vdupq_n_s32(c_inv_mant_mask)), vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
    x = vreinterpretq_f32_s32(_frac);

    /* lanes below SQRTHF: e -= 1 and x = x + x - 1
     * all other lanes:    x = x - 1
     */
    const uint32x4_t _below = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
    const float32x4_t _x_if_below = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), _below));
    const float32x4_t _one_if_below = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(_one), _below));
    x = vsubq_f32(x, _one);
    _e = vsubq_f32(_e, _one_if_below);
    x = vaddq_f32(x, _x_if_below);

    const float32x4_t _xx = vmulq_f32(x, x);

    /* Horner evaluation of the polynomial with coefficients p0 .. p8 */
    const float poly_tail[8] = {
        c_cephes_log_p1,
        c_cephes_log_p2,
        c_cephes_log_p3,
        c_cephes_log_p4,
        c_cephes_log_p5,
        c_cephes_log_p6,
        c_cephes_log_p7,
        c_cephes_log_p8
    };
    float32x4_t _p = vdupq_n_f32(c_cephes_log_p0);
    for (int k = 0; k < 8; k++)
    {
        _p = VFMAQ_F32(vdupq_n_f32(poly_tail[k]), _p, x);
    }
    _p = vmulq_f32(_p, x);
    _p = vmulq_f32(_p, _xx);

    /* fold in the exponent contribution and the -x^2/2 term */
    _p = VFMAQ_F32(_p, _e, vdupq_n_f32(c_cephes_log_q1));
    _p = VFMSQ_F32(_p, _xx, vdupq_n_f32(0.5f));

    x = vaddq_f32(x, _p);
    x = VFMAQ_F32(x, _e, vdupq_n_f32(c_cephes_log_q2));

    /* force all bits on in the flagged lanes: non-positive arguments yield NaN */
    return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), _not_positive));
}

#define c_exp_hi 88.3762626647949f
#define c_exp_lo -88.3762626647949f

#define c_cephes_LOG2EF 1.44269504088896341
#define c_cephes_exp_C1 0.693359375
#define c_cephes_exp_C2 -2.12194440e-4

#define c_cephes_exp_p0 1.9875691500E-4
#define c_cephes_exp_p1 1.3981999507E-3
#define c_cephes_exp_p2 8.3334519073E-3
#define c_cephes_exp_p3 4.1665795894E-2
#define c_cephes_exp_p4 1.6666665459E-1
#define c_cephes_exp_p5 5.0000001201E-1

/* exp() of 4 packed floats; the argument is clamped to [c_exp_lo, c_exp_hi] first */
static inline float32x4_t exp_ps(float32x4_t x)
{
    const float32x4_t _one = vdupq_n_f32(1);

    x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
    x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));

    /* express exp(x) as exp(g + n*log(2)); start from x * log2(e) + 0.5 */
    float32x4_t _n = VFMAQ_F32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));

    /* floorf: truncate, then subtract 1 in lanes where truncation ended up above */
    const float32x4_t _truncated = vcvtq_f32_s32(vcvtq_s32_f32(_n));
    const uint32x4_t _overshoot = vcgtq_f32(_truncated, _n);
    const uint32x4_t _one_if_overshoot = vandq_u32(_overshoot, vreinterpretq_u32_f32(_one));
    _n = vsubq_f32(_truncated, vreinterpretq_f32_u32(_one_if_overshoot));

    /* g = x - n*C1 - n*C2, the two constants together make up log(2) */
    const float32x4_t _n_c1 = vmulq_f32(_n, vdupq_n_f32(c_cephes_exp_C1));
    const float32x4_t _n_c2 = vmulq_f32(_n, vdupq_n_f32(c_cephes_exp_C2));
    x = vsubq_f32(x, _n_c1);
    x = vsubq_f32(x, _n_c2);

    const float32x4_t _xx = vmulq_f32(x, x);

    /* Horner evaluation of the polynomial with coefficients p0 .. p5 */
    const float poly_tail[5] = {
        c_cephes_exp_p1,
        c_cephes_exp_p2,
        c_cephes_exp_p3,
        c_cephes_exp_p4,
        c_cephes_exp_p5
    };
    float32x4_t _p = vdupq_n_f32(c_cephes_exp_p0);
    for (int k = 0; k < 5; k++)
    {
        _p = VFMAQ_F32(vdupq_n_f32(poly_tail[k]), _p, x);
    }

    /* exp(g) = 1 + g + g^2 * poly(g) */
    _p = VFMAQ_F32(x, _p, _xx);
    _p = vaddq_f32(_p, _one);

    /* build 2^n by writing n + 127 into the exponent field */
    int32x4_t _biased = vaddq_s32(vcvtq_s32_f32(_n), vdupq_n_s32(0x7f));
    _biased = vshlq_n_s32(_biased, 23);
    const float32x4_t _pow2n = vreinterpretq_f32_s32(_biased);

    return vmulq_f32(_p, _pow2n);
}

#define c_minus_cephes_DP1 -0.78515625
#define c_minus_cephes_DP2 -2.4187564849853515625e-4
#define c_minus_cephes_DP3 -3.77489497744594108e-8
#define c_sincof_p0        -1.9515295891E-4
#define c_sincof_p1        8.3321608736E-3
#define c_sincof_p2        -1.6666654611E-1
#define c_coscof_p0        2.443315711809948E-005
#define c_coscof_p1        -1.388731625493765E-003
#define c_coscof_p2        4.166664568298827E-002
#define c_cephes_FOPI      1.27323954473516 // 4 / M_PI

/* Sine and cosine of 4 packed floats in one pass.
 *
 *   This is a rewrite of the cephes sinf function. Precision is excellent
 *   as long as x < 8192; the special handling cephes has for larger values
 *   is not implemented, so beyond that the result is not garbage but the
 *   extra precision is missing.
 *
 *   Note that sinf((float)M_PI) = 8.74e-8, which is the surprising but
 *   correct result.
 *
 *   Both results come out of the same computation, which is why sin_ps
 *   and cos_ps are thin wrappers around sincos_ps.
 */
static inline void sincos_ps(float32x4_t x, float32x4_t* ysin, float32x4_t* ycos)
{
    // any x
    /* sine is odd: remember the sign and continue with |x| */
    uint32x4_t _flip_sin = vcltq_f32(x, vdupq_n_f32(0));
    x = vabsq_f32(x);

    /* scale by 4/Pi and take the integer part */
    const float32x4_t _scaled = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));
    uint32x4_t _j = vcvtq_u32_f32(_scaled);

    /* j=(j+1) & (~1) (see the cephes sources) */
    _j = vaddq_u32(_j, vdupq_n_u32(1));
    _j = vandq_u32(_j, vdupq_n_u32(~1));
    const float32x4_t _jf = vcvtq_f32_u32(_j);

    /* bit 1 of j decides which of the two polynomials feeds which output;
     * both polynomials are always evaluated
     */
    const uint32x4_t _swap_polys = vtstq_u32(_j, vdupq_n_u32(2));

    /* The magic pass: "Extended precision modular arithmetic"
     *     x = ((x - j * DP1) - j * DP2) - j * DP3; */
    x = VFMAQ_F32(x, _jf, vdupq_n_f32(c_minus_cephes_DP1));
    x = VFMAQ_F32(x, _jf, vdupq_n_f32(c_minus_cephes_DP2));
    x = VFMAQ_F32(x, _jf, vdupq_n_f32(c_minus_cephes_DP3));

    /* sign selection derived from bit 2 of j (and of j - 2 for the cosine) */
    _flip_sin = veorq_u32(_flip_sin, vtstq_u32(_j, vdupq_n_u32(4)));
    const uint32x4_t _keep_cos = vtstq_u32(vsubq_u32(_j, vdupq_n_u32(2)), vdupq_n_u32(4));

    const float32x4_t _xx = vmulq_f32(x, x);

    /* cosine polynomial (coscof coefficients): 1 - x^2/2 + x^4 * poly(x^2) */
    float32x4_t _pc = VFMAQ_F32(vdupq_n_f32(c_coscof_p1), _xx, vdupq_n_f32(c_coscof_p0));
    _pc = VFMAQ_F32(vdupq_n_f32(c_coscof_p2), _pc, _xx);
    _pc = vmulq_f32(_pc, _xx);
    _pc = vmulq_f32(_pc, _xx);
    _pc = VFMSQ_F32(_pc, _xx, vdupq_n_f32(0.5f));
    _pc = vaddq_f32(_pc, vdupq_n_f32(1));

    /* sine polynomial (sincof coefficients): x + x * x^2 * poly(x^2) */
    float32x4_t _ps = VFMAQ_F32(vdupq_n_f32(c_sincof_p1), _xx, vdupq_n_f32(c_sincof_p0));
    _ps = VFMAQ_F32(vdupq_n_f32(c_sincof_p2), _ps, _xx);
    _ps = vmulq_f32(_ps, _xx);
    _ps = VFMAQ_F32(x, _ps, x);

    /* route the polynomials to the outputs, then apply the signs */
    const float32x4_t _sin_mag = vbslq_f32(_swap_polys, _pc, _ps);
    const float32x4_t _cos_mag = vbslq_f32(_swap_polys, _ps, _pc);
    *ysin = vbslq_f32(_flip_sin, vnegq_f32(_sin_mag), _sin_mag);
    *ycos = vbslq_f32(_keep_cos, _cos_mag, vnegq_f32(_cos_mag));
}

// Sine of 4 packed floats; the cosine computed alongside is discarded.
static inline float32x4_t sin_ps(float32x4_t x)
{
    float32x4_t _sine;
    float32x4_t _cosine_unused;
    sincos_ps(x, &_sine, &_cosine_unused);
    return _sine;
}

// Cosine of 4 packed floats; the sine computed alongside is discarded.
static inline float32x4_t cos_ps(float32x4_t x)
{
    float32x4_t _sine_unused;
    float32x4_t _cosine;
    sincos_ps(x, &_sine_unused, &_cosine);
    return _cosine;
}

// Per-lane a / b.
static inline float32x4_t div_ps(float32x4_t a, float32x4_t b)
{
#if __aarch64__
    // native vector division
    return vdivq_f32(a, b);
#else
    // estimate 1/b, refine it with two Newton-Raphson steps, then multiply by a
    float32x4_t _inv_b = vrecpeq_f32(b);
    for (int step = 0; step < 2; step++)
    {
        _inv_b = vmulq_f32(vrecpsq_f32(b, _inv_b), _inv_b);
    }
    return vmulq_f32(a, _inv_b);
#endif
}

// Tangent of 4 packed floats, computed as sin(x) / cos(x).
static inline float32x4_t tan_ps(float32x4_t x)
{
    float32x4_t _sine;
    float32x4_t _cosine;
    sincos_ps(x, &_sine, &_cosine);
    return div_ps(_sine, _cosine);
}

// Per-lane power a^b via pow(x, m) = exp(m * log(x)).
static inline float32x4_t pow_ps(float32x4_t a, float32x4_t b)
{
    const float32x4_t _log_a = log_ps(a);
    const float32x4_t _exponent = vmulq_f32(b, _log_a);
    return exp_ps(_exponent);
}

// Logistic function 1 / (1 + exp(-v)) of 4 packed floats.
static inline float32x4_t sigmoid_ps(float32x4_t _v)
{
    // denominator 1 + exp(-v)
    const float32x4_t _den = vaddq_f32(exp_ps(vnegq_f32(_v)), vdupq_n_f32(1.f));

    // reciprocal estimate refined by two Newton-Raphson steps
    float32x4_t _inv = vrecpeq_f32(_den);
    for (int step = 0; step < 2; step++)
    {
        _inv = vmulq_f32(vrecpsq_f32(_den, _inv), _inv);
    }
    return _inv;
}

// polynomial coefficients used by asincos_ps, lowest power first
static const float asinf_lut[7] = {
    1.5707961728,
    -0.2145852647,
    0.0887556286,
    -0.0488025043,
    0.0268999482,
    -0.0111462294,
    0.0022959648
};

// Arcsine and arccosine of 4 packed floats.
// With t = |x| the code evaluates asin(t) = pi/2 - sqrt(1 - t) * P(t), where P
// is the degree-6 polynomial in asinf_lut, restores the sign of x for the
// arcsine and derives the arccosine as pi/2 - asin(x).
// Lanes with |x| >= 1 are evaluated at 0.9999999f instead.
static inline void asincos_ps(float32x4_t x, float32x4_t* yasin, float32x4_t* yacos)
{
    const float32x4_t _one = vdupq_n_f32(1);
    const float32x4_t _half_pi = vdupq_n_f32(1.570796326);

    // continue with |x|, remembering which lanes were negative
    const uint32x4_t _was_negative = vcltq_f32(x, vdupq_n_f32(0));
    x = vabsq_f32(x);

    // saturate lanes at or above one
    const uint32x4_t _at_or_above_one = vcgeq_f32(x, _one);
    x = vbslq_f32(_at_or_above_one, vdupq_n_f32(0.9999999f), x);

    // root = sqrt(1 - |x|)
    float32x4_t _root = vsubq_f32(_one, x);
#if __aarch64__
    _root = vsqrtq_f32(_root);
#else
    // reciprocal square root estimate with one refinement step, times the operand
    float32x4_t _rsqrt = vrsqrteq_f32(_root);
    _rsqrt = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_root, _rsqrt), _rsqrt), _rsqrt);
    _root = vmulq_f32(_root, _rsqrt);
#endif

    // powers |x|^2 .. |x|^6, each obtained from the previous one
    const float32x4_t _x2 = vmulq_f32(x, x);
    const float32x4_t _x3 = vmulq_f32(_x2, x);
    const float32x4_t _x4 = vmulq_f32(_x3, x);
    const float32x4_t _x5 = vmulq_f32(_x4, x);
    const float32x4_t _x6 = vmulq_f32(_x5, x);

    // polynomial terms grouped in pairs and summed as (t01 + (t23 + t45)) + t6
    const float32x4_t _t01 = vaddq_f32(vdupq_n_f32(asinf_lut[0]), vmulq_f32(vdupq_n_f32(asinf_lut[1]), x));
    const float32x4_t _t23 = vaddq_f32(vmulq_f32(vdupq_n_f32(asinf_lut[2]), _x2), vmulq_f32(vdupq_n_f32(asinf_lut[3]), _x3));
    const float32x4_t _t45 = vaddq_f32(vmulq_f32(vdupq_n_f32(asinf_lut[4]), _x4), vmulq_f32(vdupq_n_f32(asinf_lut[5]), _x5));
    const float32x4_t _t6 = vmulq_f32(vdupq_n_f32(asinf_lut[6]), _x6);
    const float32x4_t _poly = vaddq_f32(vaddq_f32(_t01, vaddq_f32(_t23, _t45)), _t6);

    // asin(|x|), then mirrored for the lanes that were negative
    const float32x4_t _asin_abs = vsubq_f32(_half_pi, vmulq_f32(_root, _poly));
    const float32x4_t _asin_neg = vmulq_f32(vdupq_n_f32(-1), _asin_abs);
    const float32x4_t _asin = vbslq_f32(_was_negative, _asin_neg, _asin_abs);

    *yasin = _asin;
    *yacos = vsubq_f32(_half_pi, _asin);
}

// Arcsine of 4 packed floats; the arccosine computed alongside is discarded.
static inline float32x4_t asin_ps(float32x4_t x)
{
    float32x4_t _arcsine;
    float32x4_t _arccosine_unused;
    asincos_ps(x, &_arcsine, &_arccosine_unused);
    return _arcsine;
}

// Arccosine of 4 packed floats; the arcsine computed alongside is discarded.
static inline float32x4_t acos_ps(float32x4_t x)
{
    float32x4_t _arcsine_unused;
    float32x4_t _arccosine;
    asincos_ps(x, &_arcsine_unused, &_arccosine);
    return _arccosine;
}

// Per-lane atan2f(a, b): a supplies the first (y) argument, b the second (x).
static inline float32x4_t atan2_ps(float32x4_t a, float32x4_t b)
{
    //TODO neon optimize
    // spill both vectors and evaluate lane by lane with the scalar routine
    float lanes_y[4];
    float lanes_x[4];
    vst1q_f32(lanes_y, a);
    vst1q_f32(lanes_x, b);

    float result[4];
    for (int lane = 0; lane < 4; lane++)
    {
        result[lane] = atan2f(lanes_y[lane], lanes_x[lane]);
    }
    return vld1q_f32(result);
}

// Round each lane toward zero.
static inline float32x4_t trunc_ps(const float32x4_t& x)
{
    // truncate toward zero
#if __aarch64__
    return vrndq_f32(x);
#else
    // Every float with |x| >= 2^23 is already integral. Passing those lanes
    // (and NaN, which fails the comparison) through unchanged keeps them away
    // from the int32 round trip, which would saturate at +-2^31 and turn NaN
    // into 0.
    uint32x4_t in_range = vcltq_f32(vabsq_f32(x), vdupq_n_f32(8388608.f));
    int32x4_t xi = vcvtq_s32_f32(x);
    float32x4_t truncated = vcvtq_f32_s32(xi);
    return vbslq_f32(in_range, truncated, x);
#endif
}

// Per-lane remainder with the sign convention of a truncated quotient.
static inline float32x4_t fmod_ps(const float32x4_t& x, const float32x4_t& y)
{
    // fmod(x,y) = x - trunc(x/y) * y
    // div_ps() picks vdivq_f32 on AArch64 and the reciprocal path otherwise
    const float32x4_t _quotient = trunc_ps(div_ps(x, y));
    return vsubq_f32(x, vmulq_f32(_quotient, y));
}

// Round each lane to the nearest integer, ties going to the even neighbour.
static inline float32x4_t round_ps(const float32x4_t& x)
{
#if __aarch64__
    return vrndnq_f32(x);
#else
    float32x4_t half = vdupq_n_f32(0.5f);
    float32x4_t one = vdupq_n_f32(1.0f);

    // work on |x| and restore the sign at the end
    uint32x4_t sign_mask = vcltq_f32(x, vdupq_n_f32(0));
    float32x4_t abs_x = vabsq_f32(x);

    // Every float with |x| >= 2^23 is already integral. Those lanes (and NaN,
    // which fails the comparison) bypass the int32 round trip below, which
    // would otherwise saturate at 2^31.
    uint32x4_t in_range = vcltq_f32(abs_x, vdupq_n_f32(8388608.f));

    int32x4_t xi = vcvtq_s32_f32(abs_x);
    float32x4_t truncated = vcvtq_f32_s32(xi);
    float32x4_t diff = vsubq_f32(abs_x, truncated);

    // round up when the fraction exceeds one half, or equals it and the
    // truncated value is odd
    uint32x4_t diff_gt_half = vcgtq_f32(diff, half);
    uint32x4_t diff_eq_half = vceqq_f32(diff, half);
    int32x4_t xi_and_1 = vandq_s32(xi, vdupq_n_s32(1));
    uint32x4_t is_odd = vcgtq_s32(xi_and_1, vdupq_n_s32(0));
    uint32x4_t round_up = vorrq_u32(diff_gt_half, vandq_u32(diff_eq_half, is_odd));

    float32x4_t rounded = vaddq_f32(truncated, vreinterpretq_f32_u32(vandq_u32(round_up, vreinterpretq_u32_f32(one))));
    float32x4_t signed_rounded = vbslq_f32(sign_mask, vnegq_f32(rounded), rounded);
    return vbslq_f32(in_range, signed_rounded, x);
#endif
}

// Per-lane log(exp(x) + exp(y)), evaluated as max + log(1 + exp(min - max)).
static inline float32x4_t logaddexp_ps(const float32x4_t& x, const float32x4_t& y)
{
    const float32x4_t _hi = vmaxq_f32(x, y);
    const float32x4_t _lo = vminq_f32(x, y);

    // exp(lo - hi); the exponent is never positive
    const float32x4_t _ratio = exp_ps(vsubq_f32(_lo, _hi));

    const float32x4_t _log1p = log_ps(vaddq_f32(vdupq_n_f32(1.0f), _ratio));
    return vaddq_f32(_hi, _log1p);
}

// Round each lane toward negative infinity.
static inline float32x4_t floor_ps(const float32x4_t& x)
{
#if __aarch64__
    return vrndmq_f32(x);
#else
    // Every float with |x| >= 2^23 is already integral. Those lanes (and NaN,
    // which fails the comparison) bypass the int32 round trip below, which
    // would otherwise saturate at +-2^31 and turn NaN into 0.
    uint32x4_t in_range = vcltq_f32(vabsq_f32(x), vdupq_n_f32(8388608.f));

    // truncate toward zero, then step down by one where that went above x
    float32x4_t truncated = vcvtq_f32_s32(vcvtq_s32_f32(x));
    uint32x4_t need_adjust = vcltq_f32(x, truncated);
    float32x4_t adjusted = vsubq_f32(truncated, vdupq_n_f32(1.0f));
    float32x4_t floored = vbslq_f32(need_adjust, adjusted, truncated);
    return vbslq_f32(in_range, floored, x);
#endif
}

static inline float32x4_t floor_divide_ps(const float32x4_t& x, const float32x4_t& y)
{
    // Lane-wise floor(x / y).
#if __aarch64__
    float32x4_t quotient = vdivq_f32(x, y);
#else
    // the non-aarch64 build goes through the div_ps helper instead of vdivq_f32
    float32x4_t quotient = div_ps(x, y);
#endif

    float32x4_t floored = floor_ps(quotient);
    return floored;
}

static inline float32x4_t remainder_ps(const float32x4_t& x, const float32x4_t& y)
{
    // Lane-wise remainder with a rounded quotient:
    //   remainder(x, y) = x - round(x / y) * y
#if __aarch64__
    float32x4_t quotient = vdivq_f32(x, y);
#else
    // the non-aarch64 build goes through the div_ps helper instead of vdivq_f32
    float32x4_t quotient = div_ps(x, y);
#endif

    // snap the quotient to an integer with round_ps, scale back by y and subtract
    float32x4_t nearest = round_ps(quotient);
    float32x4_t nearest_times_y = vmulq_f32(nearest, y);
    return vsubq_f32(x, nearest_times_y);
}

#include "neon_mathfun_tanh.h"

// Clean up macros
#undef VFMAQ_F32
#undef VFMSQ_F32

#endif // NEON_MATHFUN_H


================================================
FILE: src/layer/arm/neon_mathfun_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

/* NEON implementation of sin, cos, exp and log
 *
 *   Inspired by Intel Approximate Math library, and based on the
 *   corresponding algorithms of the cephes math library
 */

/* Copyright (C) 2011  Julien Pommier
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty.  In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgment in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 *
 *  (this is the zlib license)
 */

#ifndef NEON_MATHFUN_FP16S_H
#define NEON_MATHFUN_FP16S_H

#include <arm_neon.h>

#define c_inv_mant_mask_f16 -31745 // ~0x7c00u
#define c_cephes_SQRTHF     0.707106781186547524
#define c_cephes_log_p0     7.0376836292E-2
#define c_cephes_log_p1     -1.1514610310E-1
#define c_cephes_log_p2     1.1676998740E-1
#define c_cephes_log_p3     -1.2420140846E-1
#define c_cephes_log_p4     +1.4249322787E-1
#define c_cephes_log_p5     -1.6668057665E-1
#define c_cephes_log_p6     +2.0000714765E-1
#define c_cephes_log_p7     -2.4999993993E-1
#define c_cephes_log_p8     +3.3333331174E-1
#define c_cephes_log_q1     -2.12194440e-4
#define c_cephes_log_q2     0.693359375

/* natural logarithm computed for 4 simultaneous fp16 lanes
 *   return NaN for x <= 0
 */
static inline float16x4_t log_ps_f16(float16x4_t x)
{
    float16x4_t _one = vdup_n_f16(1);

    /* clamp negative lanes to zero, then flag every lane that is not positive */
    float16x4_t v = vmax_f16(x, vdup_n_f16(0));
    uint16x4_t not_positive = vcle_f16(v, vdup_n_f16(0));

    int16x4_t bits = vreinterpret_s16_f16(v);

    /* shift out the 10 mantissa bits to get the raw exponent field */
    int16x4_t exp_field = vshr_n_s16(bits, 10);

    /* keep only the fractional part and give it the exponent of 0.5 */
    bits = vand_s16(bits, vdup_n_s16(c_inv_mant_mask_f16));
    bits = vorr_s16(bits, vreinterpret_s16_f16(vdup_n_f16(0.5f)));
    float16x4_t m = vreinterpret_f16_s16(bits);

    /* subtract 0xf from the exponent field, convert to fp16 and add one */
    exp_field = vsub_s16(exp_field, vdup_n_s16(0xf));
    float16x4_t e = vcvt_f16_s16(exp_field);
    e = vadd_f16(e, _one);

    /* part2:
     *     if( m < SQRTHF ) {
     *       e -= 1;
     *       m = m + m - 1.0;
     *     } else { m = m - 1.0; }
     * done branch-free by masking the conditional operands
     */
    uint16x4_t below = vclt_f16(m, vdup_n_f16(c_cephes_SQRTHF));
    float16x4_t m_if_below = vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(m), below));
    float16x4_t one_if_below = vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(_one), below));
    m = vsub_f16(m, _one);
    e = vsub_f16(e, one_if_below);
    m = vadd_f16(m, m_if_below);

    float16x4_t m2 = vmul_f16(m, m);

    /* Horner evaluation of the p0..p8 polynomial in m */
    float16x4_t poly = vdup_n_f16(c_cephes_log_p0);
    poly = vfma_f16(vdup_n_f16(c_cephes_log_p1), poly, m);
    poly = vfma_f16(vdup_n_f16(c_cephes_log_p2), poly, m);
    poly = vfma_f16(vdup_n_f16(c_cephes_log_p3), poly, m);
    poly = vfma_f16(vdup_n_f16(c_cephes_log_p4), poly, m);
    poly = vfma_f16(vdup_n_f16(c_cephes_log_p5), poly, m);
    poly = vfma_f16(vdup_n_f16(c_cephes_log_p6), poly, m);
    poly = vfma_f16(vdup_n_f16(c_cephes_log_p7), poly, m);
    poly = vfma_f16(vdup_n_f16(c_cephes_log_p8), poly, m);
    poly = vmul_f16(poly, m);
    poly = vmul_f16(poly, m2);

    /* poly += e * q1, then poly -= 0.5 * m^2 */
    poly = vfma_f16(poly, e, vdup_n_f16(c_cephes_log_q1));
    poly = vfms_f16(poly, m2, vdup_n_f16(0.5f));

    /* result = m + poly + e * q2 */
    float16x4_t result = vadd_f16(m, poly);
    result = vfma_f16(result, e, vdup_n_f16(c_cephes_log_q2));

    /* OR-ing the all-ones mask makes every non-positive lane NaN */
    return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(result), not_positive));
}

static inline float16x8_t log_ps_f16(float16x8_t x)
{
    /* natural logarithm computed for 8 simultaneous fp16 lanes
     *   return NaN for x <= 0
     */
    float16x8_t _one = vdupq_n_f16(1);

    /* clamp negative lanes to zero, then flag every lane that is not positive */
    float16x8_t v = vmaxq_f16(x, vdupq_n_f16(0));
    uint16x8_t not_positive = vcleq_f16(v, vdupq_n_f16(0));

    int16x8_t bits = vreinterpretq_s16_f16(v);

    /* shift out the 10 mantissa bits to get the raw exponent field */
    int16x8_t exp_field = vshrq_n_s16(bits, 10);

    /* keep only the fractional part and give it the exponent of 0.5 */
    bits = vandq_s16(bits, vdupq_n_s16(c_inv_mant_mask_f16));
    bits = vorrq_s16(bits, vreinterpretq_s16_f16(vdupq_n_f16(0.5f)));
    float16x8_t m = vreinterpretq_f16_s16(bits);

    /* subtract 0xf from the exponent field, convert to fp16 and add one */
    exp_field = vsubq_s16(exp_field, vdupq_n_s16(0xf));
    float16x8_t e = vcvtq_f16_s16(exp_field);
    e = vaddq_f16(e, _one);

    /* part2:
     *     if( m < SQRTHF ) {
     *       e -= 1;
     *       m = m + m - 1.0;
     *     } else { m = m - 1.0; }
     * done branch-free by masking the conditional operands
     */
    uint16x8_t below = vcltq_f16(m, vdupq_n_f16(c_cephes_SQRTHF));
    float16x8_t m_if_below = vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(m), below));
    float16x8_t one_if_below = vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(_one), below));
    m = vsubq_f16(m, _one);
    e = vsubq_f16(e, one_if_below);
    m = vaddq_f16(m, m_if_below);

    float16x8_t m2 = vmulq_f16(m, m);

    /* Horner evaluation of the p0..p8 polynomial in m */
    float16x8_t poly = vdupq_n_f16(c_cephes_log_p0);
    poly = vfmaq_f16(vdupq_n_f16(c_cephes_log_p1), poly, m);
    poly = vfmaq_f16(vdupq_n_f16(c_cephes_log_p2), poly, m);
    poly = vfmaq_f16(vdupq_n_f16(c_cephes_log_p3), poly, m);
    poly = vfmaq_f16(vdupq_n_f16(c_cephes_log_p4), poly, m);
    poly = vfmaq_f16(vdupq_n_f16(c_cephes_log_p5), poly, m);
    poly = vfmaq_f16(vdupq_n_f16(c_cephes_log_p6), poly, m);
    poly = vfmaq_f16(vdupq_n_f16(c_cephes_log_p7), poly, m);
    poly = vfmaq_f16(vdupq_n_f16(c_cephes_log_p8), poly, m);
    poly = vmulq_f16(poly, m);
    poly = vmulq_f16(poly, m2);

    /* poly += e * q1, then poly -= 0.5 * m^2 */
    poly = vfmaq_f16(poly, e, vdupq_n_f16(c_cephes_log_q1));
    poly = vfmsq_f16(poly, m2, vdupq_n_f16(0.5f));

    /* result = m + poly + e * q2 */
    float16x8_t result = vaddq_f16(m, poly);
    result = vfmaq_f16(result, e, vdupq_n_f16(c_cephes_log_q2));

    /* OR-ing the all-ones mask makes every non-positive lane NaN */
    return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(result), not_positive));
}

#define c_exp_hi_f16 10.7421875f
#define c_exp_lo_f16 -10.7421875f

#define c_cephes_LOG2EF 1.44269504088896341
#define c_cephes_exp_C1 0.693359375
#define c_cephes_exp_C2 -2.12194440e-4

#define c_cephes_exp_p0 1.9875691500E-4
#define c_cephes_exp_p1 1.3981999507E-3
#define c_cephes_exp_p2 8.3334519073E-3
#define c_cephes_exp_p3 4.1665795894E-2
#define c_cephes_exp_p4 1.6666665459E-1
#define c_cephes_exp_p5 5.0000001201E-1

/* exp() computed for 4 fp16 lanes at once */
static inline float16x4_t exp_ps_f16(float16x4_t x)
{
    float16x4_t _one = vdup_n_f16(1);

    /* clamp the argument to [c_exp_lo_f16, c_exp_hi_f16] */
    float16x4_t r = vmin_f16(x, vdup_n_f16(c_exp_hi_f16));
    r = vmax_f16(r, vdup_n_f16(c_exp_lo_f16));

    /* express exp(x) as exp(g + n*log(2)), starting from n = x * log2(e) + 0.5 */
#if defined(_MSC_VER) && !defined(__clang__)
    float16x4_t n = vfma_f16(vdup_n_f16(0.5f), r, vcvt_f16_f32(vdupq_n_f32(c_cephes_LOG2EF)));
#else
    float16x4_t n = vfma_f16(vdup_n_f16(0.5f), r, vdup_n_f16(c_cephes_LOG2EF));
#endif

    /* perform a floorf: truncate through int16 ... */
    float16x4_t n_trunc = vcvt_f16_s16(vcvt_s16_f16(n));

    /* ... and subtract 1 wherever truncation ended up greater than the input */
    uint16x4_t overshoot = vcgt_f16(n_trunc, n);
    overshoot = vand_u16(overshoot, vreinterpret_u16_f16(_one));
    n = vsub_f16(n_trunc, vreinterpret_f16_u16(overshoot));

    /* g = x - n * C1 - n * C2 */
#if defined(_MSC_VER) && !defined(__clang__)
    float16x4_t n_c1 = vmul_f16(n, vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1)));
    float16x4_t n_c2 = vmul_f16(n, vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C2)));
#else
    float16x4_t n_c1 = vmul_f16(n, vdup_n_f16(c_cephes_exp_C1));
    float16x4_t n_c2 = vmul_f16(n, vdup_n_f16(c_cephes_exp_C2));
#endif
    r = vsub_f16(r, n_c1);
    r = vsub_f16(r, n_c2);

    float16x4_t r2 = vmul_f16(r, r);

    /* Horner evaluation of the p0..p5 polynomial in g */
    float16x4_t poly = vdup_n_f16(c_cephes_exp_p0);
    poly = vfma_f16(vdup_n_f16(c_cephes_exp_p1), poly, r);
    poly = vfma_f16(vdup_n_f16(c_cephes_exp_p2), poly, r);
    poly = vfma_f16(vdup_n_f16(c_cephes_exp_p3), poly, r);
    poly = vfma_f16(vdup_n_f16(c_cephes_exp_p4), poly, r);
    poly = vfma_f16(vdup_n_f16(c_cephes_exp_p5), poly, r);

    /* exp(g) ~= 1 + g + poly * g^2 */
    poly = vfma_f16(r, poly, r2);
    poly = vadd_f16(poly, _one);

    /* build 2^n: add 0xf to n and shift it into the exponent field */
    int16x4_t biased = vcvt_s16_f16(n);
    biased = vadd_s16(biased, vdup_n_s16(0xf));
    biased = vshl_n_s16(biased, 10);
    float16x4_t two_pow_n = vreinterpret_f16_s16(biased);

    return vmul_f16(poly, two_pow_n);
}

static inline float16x8_t exp_ps_f16(float16x8_t x)
{
    /* exp() computed for 8 fp16 lanes at once */
    float16x8_t _one = vdupq_n_f16(1);

    /* clamp the argument to [c_exp_lo_f16, c_exp_hi_f16] */
    float16x8_t r = vminq_f16(x, vdupq_n_f16(c_exp_hi_f16));
    r = vmaxq_f16(r, vdupq_n_f16(c_exp_lo_f16));

    /* express exp(x) as exp(g + n*log(2)), starting from n = x * log2(e) + 0.5 */
#if defined(_MSC_VER) && !defined(__clang__)
    float16x4_t _c_cephes_LOG2EF = vcvt_f16_f32(vdupq_n_f32(c_cephes_LOG2EF));
    float16x8_t n = vfmaq_f16(vdupq_n_f16(0.5f), r, vcombine_f16(_c_cephes_LOG2EF, _c_cephes_LOG2EF));
#else
    float16x8_t n = vfmaq_f16(vdupq_n_f16(0.5f), r, vdupq_n_f16(c_cephes_LOG2EF));
#endif

    /* perform a floorf: truncate through int16 ... */
    float16x8_t n_trunc = vcvtq_f16_s16(vcvtq_s16_f16(n));

    /* ... and subtract 1 wherever truncation ended up greater than the input */
    uint16x8_t overshoot = vcgtq_f16(n_trunc, n);
    overshoot = vandq_u16(overshoot, vreinterpretq_u16_f16(_one));
    n = vsubq_f16(n_trunc, vreinterpretq_f16_u16(overshoot));

    /* g = x - n * C1 - n * C2 */
#if defined(_MSC_VER) && !defined(__clang__)
    float16x4_t _c_cephes_exp_C1 = vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C1));
    float16x8_t n_c1 = vmulq_f16(n, vcombine_f16(_c_cephes_exp_C1, _c_cephes_exp_C1));
    float16x4_t _c_cephes_exp_C2 = vcvt_f16_f32(vdupq_n_f32(c_cephes_exp_C2));
    float16x8_t n_c2 = vmulq_f16(n, vcombine_f16(_c_cephes_exp_C2, _c_cephes_exp_C2));
#else
    float16x8_t n_c1 = vmulq_f16(n, vdupq_n_f16(c_cephes_exp_C1));
    float16x8_t n_c2 = vmulq_f16(n, vdupq_n_f16(c_cephes_exp_C2));
#endif
    r = vsubq_f16(r, n_c1);
    r = vsubq_f16(r, n_c2);

    float16x8_t r2 = vmulq_f16(r, r);

    /* Horner evaluation of the p0..p5 polynomial in g */
    float16x8_t poly = vdupq_n_f16(c_cephes_exp_p0);
    poly = vfmaq_f16(vdupq_n_f16(c_cephes_exp_p1), poly, r);
    poly = vfmaq_f16(vdupq_n_f16(c_cephes_exp_p2), poly, r);
    poly = vfmaq_f16(vdupq_n_f16(c_cephes_exp_p3), poly, r);
    poly = vfmaq_f16(vdupq_n_f16(c_cephes_exp_p4), poly, r);
    poly = vfmaq_f16(vdupq_n_f16(c_cephes_exp_p5), poly, r);

    /* exp(g) ~= 1 + g + poly * g^2 */
    poly = vfmaq_f16(r, poly, r2);
    poly = vaddq_f16(poly, _one);

    /* build 2^n: add 0xf to n and shift it into the exponent field */
    int16x8_t biased = vcvtq_s16_f16(n);
    biased = vaddq_s16(biased, vdupq_n_s16(0xf));
    biased = vshlq_n_s16(biased, 10);
    float16x8_t two_pow_n = vreinterpretq_f16_s16(biased);

    return vmulq_f16(poly, two_pow_n);
}

#define c_minus_cephes_DP1 -0.78515625
#define c_minus_cephes_DP2 -2.4187564849853515625e-4
#define c_minus_cephes_DP3 -3.77489497744594108e-8
#define c_sincof_p0        -1.9515295891E-4
#define c_sincof_p1        8.3321608736E-3
#define c_sincof_p2        -1.6666654611E-1
#define c_coscof_p0        2.443315711809948E-005
#define c_coscof_p1        -1.388731625493765E-003
#define c_coscof_p2        4.166664568298827E-002
#define c_cephes_FOPI      1.27323954473516 // 4 / M_PI

/* evaluation of 4 sines & cosines at once.
 *
 *   The code is the exact rewriting of the cephes sinf function.
 *   Precision is excellent as long as x < 8192 (I did not bother to
 *   take into account the special handling they have for greater values
 *   -- it does not return garbage for arguments over 8192, though, but
 *   the extra precision is missing).
 *
 *   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
 *   surprising but correct result.
 *
 *   Note also that when you compute sin(x), cos(x) is available at
 *   almost no extra price so both sin_ps and cos_ps make use of
 *   sincos_ps..
 */
static inline void sincos_ps_f16(float16x4_t x, float16x4_t* ysin, float16x4_t* ycos)
{
    // Sine and cosine of 4 fp16 lanes in one pass; the results are stored
    // through ysin and ycos (ysin is written first).

    /* remember which lanes were negative, then continue with |x| */
    uint16x4_t flip_sin = vclt_f16(x, vdup_n_f16(0));
    float16x4_t r = vabs_f16(x);

    /* scale by 4/Pi */
#if defined(_MSC_VER) && !defined(__clang__)
    float16x4_t _c_cephes_FOPI = vcvt_f16_f32(vdupq_n_f32(c_cephes_FOPI));
    float16x4_t yq = vmul_f16(r, _c_cephes_FOPI);
#else
    float16x4_t yq = vmul_f16(r, vdup_n_f16(c_cephes_FOPI));
#endif

    /* take the integer part, then j = (j + 1) & (~1) (see the cephes sources) */
    uint16x4_t j = vcvt_u16_f16(yq);
    j = vadd_u16(j, vdup_n_u16(1));
    j = vand_u16(j, vdup_n_u16(~1));
    yq = vcvt_f16_u16(j);

    /* polynomial selection mask: bit 1 of j decides which of the two
     * polynomials below feeds the sine and which feeds the cosine.
     * Both polynomials are always computed.
     */
    uint16x4_t swap_poly = vtst_u16(j, vdup_n_u16(2));

    /* The magic pass: "Extended precision modular arithmetic"
     *     r = ((r - yq * DP1) - yq * DP2) - yq * DP3; */
#if defined(_MSC_VER) && !defined(__clang__)
    float16x4_t _c_minus_cephes_DP1 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP1));
    float16x4_t _c_minus_cephes_DP2 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP2));
    float16x4_t _c_minus_cephes_DP3 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP3));
    r = vfma_f16(r, yq, _c_minus_cephes_DP1);
    r = vfma_f16(r, yq, _c_minus_cephes_DP2);
    r = vfma_f16(r, yq, _c_minus_cephes_DP3);
#else
    r = vfma_f16(r, yq, vdup_n_f16(c_minus_cephes_DP1));
    r = vfma_f16(r, yq, vdup_n_f16(c_minus_cephes_DP2));
    r = vfma_f16(r, yq, vdup_n_f16(c_minus_cephes_DP3));
#endif

    /* sign masks derived from bit 2 of j (sine) and of j - 2 (cosine) */
    flip_sin = veor_u16(flip_sin, vtst_u16(j, vdup_n_u16(4)));
    uint16x4_t keep_cos = vtst_u16(vsub_u16(j, vdup_n_u16(2)), vdup_n_u16(4));

    float16x4_t r2 = vmul_f16(r, r);

    /* polynomial built from the coscof coefficients:
     *     1 - r2 / 2 + r2 * r2 * ((p0 * r2 + p1) * r2 + p2) */
    float16x4_t poly_c = vfma_f16(vdup_n_f16(c_coscof_p1), r2, vdup_n_f16(c_coscof_p0));
    poly_c = vfma_f16(vdup_n_f16(c_coscof_p2), poly_c, r2);
    poly_c = vmul_f16(poly_c, r2);
    poly_c = vmul_f16(poly_c, r2);
    poly_c = vfms_f16(poly_c, r2, vdup_n_f16(0.5f));
    poly_c = vadd_f16(poly_c, vdup_n_f16(1));

    /* polynomial built from the sincof coefficients:
     *     r + r * r2 * ((p0 * r2 + p1) * r2 + p2) */
    float16x4_t poly_s = vfma_f16(vdup_n_f16(c_sincof_p1), r2, vdup_n_f16(c_sincof_p0));
    poly_s = vfma_f16(vdup_n_f16(c_sincof_p2), poly_s, r2);
    poly_s = vmul_f16(poly_s, r2);
    poly_s = vfma_f16(r, poly_s, r);

    /* select the correct result from the two polynomials, then apply the signs */
    float16x4_t s = vbsl_f16(swap_poly, poly_c, poly_s);
    float16x4_t c = vbsl_f16(swap_poly, poly_s, poly_c);
    *ysin = vbsl_f16(flip_sin, vneg_f16(s), s);
    *ycos = vbsl_f16(keep_cos, c, vneg_f16(c));
}

static inline void sincos_ps_f16(float16x8_t x, float16x8_t* ysin, float16x8_t* ycos)
{
    // Sine and cosine of 8 fp16 lanes in one pass; the results are stored
    // through ysin and ycos (ysin is written first).

    /* remember which lanes were negative, then continue with |x| */
    uint16x8_t flip_sin = vcltq_f16(x, vdupq_n_f16(0));
    float16x8_t r = vabsq_f16(x);

    /* scale by 4/Pi */
#if defined(_MSC_VER) && !defined(__clang__)
    float16x4_t _c_cephes_FOPI = vcvt_f16_f32(vdupq_n_f32(c_cephes_FOPI));
    float16x8_t yq = vmulq_f16(r, vcombine_f16(_c_cephes_FOPI, _c_cephes_FOPI));
#else
    float16x8_t yq = vmulq_f16(r, vdupq_n_f16(c_cephes_FOPI));
#endif

    /* take the integer part, then j = (j + 1) & (~1) (see the cephes sources) */
    uint16x8_t j = vcvtq_u16_f16(yq);
    j = vaddq_u16(j, vdupq_n_u16(1));
    j = vandq_u16(j, vdupq_n_u16(~1));
    yq = vcvtq_f16_u16(j);

    /* polynomial selection mask: bit 1 of j decides which of the two
     * polynomials below feeds the sine and which feeds the cosine.
     * Both polynomials are always computed.
     */
    uint16x8_t swap_poly = vtstq_u16(j, vdupq_n_u16(2));

    /* The magic pass: "Extended precision modular arithmetic"
     *     r = ((r - yq * DP1) - yq * DP2) - yq * DP3; */
#if defined(_MSC_VER) && !defined(__clang__)
    float16x4_t _c_minus_cephes_DP1 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP1));
    float16x4_t _c_minus_cephes_DP2 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP2));
    float16x4_t _c_minus_cephes_DP3 = vcvt_f16_f32(vdupq_n_f32(c_minus_cephes_DP3));
    r = vfmaq_f16(r, yq, vcombine_f16(_c_minus_cephes_DP1, _c_minus_cephes_DP1));
    r = vfmaq_f16(r, yq, vcombine_f16(_c_minus_cephes_DP2, _c_minus_cephes_DP2));
    r = vfmaq_f16(r, yq, vcombine_f16(_c_minus_cephes_DP3, _c_minus_cephes_DP3));
#else
    r = vfmaq_f16(r, yq, vdupq_n_f16(c_minus_cephes_DP1));
    r = vfmaq_f16(r, yq, vdupq_n_f16(c_minus_cephes_DP2));
    r = vfmaq_f16(r, yq, vdupq_n_f16(c_minus_cephes_DP3));
#endif

    /* sign masks derived from bit 2 of j (sine) and of j - 2 (cosine) */
    flip_sin = veorq_u16(flip_sin, vtstq_u16(j, vdupq_n_u16(4)));
    uint16x8_t keep_cos = vtstq_u16(vsubq_u16(j, vdupq_n_u16(2)), vdupq_n_u16(4));

    float16x8_t r2 = vmulq_f16(r, r);

    /* polynomial built from the coscof coefficients:
     *     1 - r2 / 2 + r2 * r2 * ((p0 * r2 + p1) * r2 + p2) */
    float16x8_t poly_c = vfmaq_f16(vdupq_n_f16(c_coscof_p1), r2, vdupq_n_f16(c_coscof_p0));
    poly_c = vfmaq_f16(vdupq_n_f16(c_coscof_p2), poly_c, r2);
    poly_c = vmulq_f16(poly_c, r2);
    poly_c = vmulq_f16(poly_c, r2);
    poly_c = vfmsq_f16(poly_c, r2, vdupq_n_f16(0.5f));
    poly_c = vaddq_f16(poly_c, vdupq_n_f16(1));

    /* polynomial built from the sincof coefficients:
     *     r + r * r2 * ((p0 * r2 + p1) * r2 + p2) */
    float16x8_t poly_s = vfmaq_f16(vdupq_n_f16(c_sincof_p1), r2, vdupq_n_f16(c_sincof_p0));
    poly_s = vfmaq_f16(vdupq_n_f16(c_sincof_p2), poly_s, r2);
    poly_s = vmulq_f16(poly_s, r2);
    poly_s = vfmaq_f16(r, poly_s, r);

    /* select the correct result from the two polynomials, then apply the signs */
    float16x8_t s = vbslq_f16(swap_poly, poly_c, poly_s);
    float16x8_t c = vbslq_f16(swap_poly, poly_s, poly_c);
    *ysin = vbslq_f16(flip_sin, vnegq_f16(s), s);
    *ycos = vbslq_f16(keep_cos, c, vnegq_f16(c));
}

static inline float16x4_t sin_ps_f16(float16x4_t x)
{
    // Sine of 4 fp16 lanes: run sincos_ps_f16 and keep only the sine output.
    float16x4_t sine;
    float16x4_t cosine_unused;
    sincos_ps_f16(x, &sine, &cosine_unused);
    return sine;
}

static inline float16x8_t sin_ps_f16(float16x8_t x)
{
    // Sine of 8 fp16 lanes: run sincos_ps_f16 and keep only the sine output.
    float16x8_t sine;
    float16x8_t cosine_unused;
    sincos_ps_f16(x, &sine, &cosine_unused);
    return sine;
}

static inline float16x4_t cos_ps_f16(float16x4_t x)
{
    // Cosine of 4 fp16 lanes: run sincos_ps_f16 and keep only the cosine output.
    float16x4_t sine_unused;
    float16x4_t cosine;
    sincos_ps_f16(x, &sine_unused, &cosine);
    return cosine;
}

static inline float16x8_t cos_ps_f16(float16x8_t x)
{
    // Cosine of 8 fp16 lanes: run sincos_ps_f16 and keep only the cosine output.
    float16x8_t sine_unused;
    float16x8_t cosine;
    sincos_ps_f16(x, &sine_unused, &cosine);
    return cosine;
}

#define c_tanh_tiny 1e-4f
#define c_tanh_hi   9.0f
// The monomial coefficients of the numerator polynomial (odd).
#define c_tanh_alpha_1  4.89352455891786e-3f
#define c_tanh_alpha_3  6.37261928875436e-4f
#define c_tanh_alpha_5  1.48572235717979e-5f
#define c_tanh_alpha_7  5.12229709037114e-8f
#define c_tanh_alpha_9  -8.60467152213735e-11f
#define c_tanh_alpha_11 2.00018790482477e-13f
#define c_tanh_alpha_13 -2.76076847742355e-16f
// The monomial coefficients of the denominator polynomial (even).
#define c_tanh_beta_0 4.89352518554385e-3f
#define c_tanh_beta_2 2.26843463243900e-3f
#define c_tanh_beta_4 1.18534705686654e-4f
#define c_tanh_beta_6 1.19825839466702e-6f

/* Hyperbolic tangent computed for 4 simultaneous fp16 lanes */
static inline float16x4_t tanh_ps_f16(float16x4_t x)
{
    float16x4_t mag = vabs_f16(x);

    // lanes whose magnitude reaches c_tanh_tiny use the rational
    // approximation; smaller lanes return x itself (see the final select)
    uint16x4_t use_approx = vcge_f16(mag, vdup_n_f16(c_tanh_tiny));

    // clamp the magnitude to c_tanh_hi
    uint16x4_t within_hi = vcge_f16(vdup_n_f16(c_tanh_hi), mag);
    mag = vreinterpret_f16_u16(vbsl_u16(within_hi, vreinterpret_u16_f16(mag), vreinterpret_u16_f16(vdup_n_f16(c_tanh_hi))));

    // since the polynomials are odd/even, we need x**2.
    float16x4_t sq = vmul_f16(mag, mag);

    // numerator: odd polynomial, Horner in x**2 then one multiply by |x|
    float16x4_t num = vdup_n_f16(c_tanh_alpha_13);
    num = vfma_f16(vdup_n_f16(c_tanh_alpha_11), num, sq);
    num = vfma_f16(vdup_n_f16(c_tanh_alpha_9), num, sq);
    num = vfma_f16(vdup_n_f16(c_tanh_alpha_7), num, sq);
    num = vfma_f16(vdup_n_f16(c_tanh_alpha_5), num, sq);
    num = vfma_f16(vdup_n_f16(c_tanh_alpha_3), num, sq);
    num = vfma_f16(vdup_n_f16(c_tanh_alpha_1), num, sq);
    num = vmul_f16(num, mag);

    // denominator: even polynomial, Horner in x**2
    float16x4_t den = vdup_n_f16(c_tanh_beta_6);
    den = vfma_f16(vdup_n_f16(c_tanh_beta_4), den, sq);
    den = vfma_f16(vdup_n_f16(c_tanh_beta_2), den, sq);
    den = vfma_f16(vdup_n_f16(c_tanh_beta_0), den, sq);

    // divide the numerator by the denominator.
    float16x4_t result = vdiv_f16(num, den);

    // reinstate the sign: take bit 15 from x and everything else from the quotient
    result = vreinterpret_f16_u16(vbsl_u16(vdup_n_u16(1u << 15), vreinterpret_u16_f16(x), vreinterpret_u16_f16(result)));

    // when the argument is very small in magnitude it's more accurate to just return it.
    result = vreinterpret_f16_u16(vbsl_u16(use_approx, vreinterpret_u16_f16(result), vreinterpret_u16_f16(x)));

    return result;
}

static inline float16x8_t tanh_ps_f16(float16x8_t x)
{
    // Hyperbolic tangent computed for 8 simultaneous fp16 lanes.
    float16x8_t mag = vabsq_f16(x);

    // lanes whose magnitude reaches c_tanh_tiny use the rational
    // approximation; smaller lanes return x itself (see the final select)
    uint16x8_t use_approx = vcgeq_f16(mag, vdupq_n_f16(c_tanh_tiny));

    // clamp the magnitude to c_tanh_hi
    uint16x8_t within_hi = vcgeq_f16(vdupq_n_f16(c_tanh_hi), mag);
    mag = vreinterpretq_f16_u16(vbslq_u16(within_hi, vreinterpretq_u16_f16(mag), vreinterpretq_u16_f16(vdupq_n_f16(c_tanh_hi))));

    // since the polynomials are odd/even, we need x**2.
    float16x8_t sq = vmulq_f16(mag, mag);

    // numerator: odd polynomial, Horner in x**2 then one multiply by |x|
    float16x8_t num = vdupq_n_f16(c_tanh_alpha_13);
    num = vfmaq_f16(vdupq_n_f16(c_tanh_alpha_11), num, sq);
    num = vfmaq_f16(vdupq_n_f16(c_tanh_alpha_9), num, sq);
    num = vfmaq_f16(vdupq_n_f16(c_tanh_alpha_7), num, sq);
    num = vfmaq_f16(vdupq_n_f16(c_tanh_alpha_5), num, sq);
    num = vfmaq_f16(vdupq_n_f16(c_tanh_alpha_3), num, sq);
    num = vfmaq_f16(vdupq_n_f16(c_tanh_alpha_1), num, sq);
    num = vmulq_f16(num, mag);

    // denominator: even polynomial, Horner in x**2
    float16x8_t den = vdupq_n_f16(c_tanh_beta_6);
    den = vfmaq_f16(vdupq_n_f16(c_tanh_beta_4), den, sq);
    den = vfmaq_f16(vdupq_n_f16(c_tanh_beta_2), den, sq);
    den = vfmaq_f16(vdupq_n_f16(c_tanh_beta_0), den, sq);

    // divide the numerator by the denominator.
    float16x8_t result = vdivq_f16(num, den);

    // reinstate the sign: take bit 15 from x and everything else from the quotient
    result = vreinterpretq_f16_u16(vbslq_u16(vdupq_n_u16(1u << 15), vreinterpretq_u16_f16(x), vreinterpretq_u16_f16(result)));

    // when the argument is very small in magnitude it's more accurate to just return it.
    result = vreinterpretq_f16_u16(vbslq_u16(use_approx, vreinterpretq_u16_f16(result), vreinterpretq_u16_f16(x)));

    return result;
}

static inline float16x4_t sigmoid_ps_f16(float16x4_t _v)
{
    // Logistic function on 4 fp16 lanes: 1 / (1 + exp(-v))
    float16x4_t _unit = vdup_n_f16(1.f);
    float16x4_t _exp_neg = exp_ps_f16(vneg_f16(_v));
    float16x4_t _denom = vadd_f16(_exp_neg, _unit);
    return vdiv_f16(_unit, _denom);
}

static inline float16x8_t sigmoid_ps_f16(float16x8_t _v)
{
    // Logistic function on 8 fp16 lanes: 1 / (1 + exp(-v))
    float16x8_t _unit = vdupq_n_f16(1.f);
    float16x8_t _exp_neg = exp_ps_f16(vnegq_f16(_v));
    float16x8_t _denom = vaddq_f16(_exp_neg, _unit);
    return vdivq_f16(_unit, _denom);
}

#endif // NEON_MATHFUN_FP16S_H


================================================
FILE: src/layer/arm/neon_mathfun_tanh.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef NEON_MATHFUN_TANH_H
#define NEON_MATHFUN_TANH_H

#include <arm_neon.h>

#define c_tanh_tiny 1e-4f
#define c_tanh_hi   9.0f
// The monomial coefficients of the numerator polynomial (odd).
#define c_tanh_alpha_1  4.89352455891786e-3f
#define c_tanh_alpha_3  6.37261928875436e-4f
#define c_tanh_alpha_5  1.48572235717979e-5f
#define c_tanh_alpha_7  5.12229709037114e-8f
#define c_tanh_alpha_9  -8.60467152213735e-11f
#define c_tanh_alpha_11 2.00018790482477e-13f
#define c_tanh_alpha_13 -2.76076847742355e-16f
// The monomial coefficients of the denominator polynomial (even).
#define c_tanh_beta_0 4.89352518554385e-3f
#define c_tanh_beta_2 2.26843463243900e-3f
#define c_tanh_beta_4 1.18534705686654e-4f
#define c_tanh_beta_6 1.19825839466702e-6f

/* Single precision hyperbolic tangent computed for 4 simultaneous float,
 * evaluated as a ratio of an odd and an even polynomial on the clamped magnitude */
static inline float32x4_t tanh_ps(float32x4_t x)
{
    float32x4_t mag = vabsq_f32(x);

    // lanes whose magnitude reaches c_tanh_tiny use the rational
    // approximation; smaller lanes return x itself (see the final select)
    uint32x4_t use_approx = vcgeq_f32(mag, vdupq_n_f32(c_tanh_tiny));

    // clamp the inputs to the range [-9, 9] since anything outside
    // this range is -/+1.0f in single-precision.
    uint32x4_t within_hi = vcgeq_f32(vdupq_n_f32(c_tanh_hi), mag);
    mag = vreinterpretq_f32_u32(vbslq_u32(within_hi, vreinterpretq_u32_f32(mag), vreinterpretq_u32_f32(vdupq_n_f32(c_tanh_hi))));

    // since the polynomials are odd/even, we need x**2.
    float32x4_t sq = vmulq_f32(mag, mag);

    // numerator: odd polynomial, Horner in x**2 then one multiply by |x|
    float32x4_t num = vdupq_n_f32(c_tanh_alpha_13);
    num = vmlaq_f32(vdupq_n_f32(c_tanh_alpha_11), num, sq);
    num = vmlaq_f32(vdupq_n_f32(c_tanh_alpha_9), num, sq);
    num = vmlaq_f32(vdupq_n_f32(c_tanh_alpha_7), num, sq);
    num = vmlaq_f32(vdupq_n_f32(c_tanh_alpha_5), num, sq);
    num = vmlaq_f32(vdupq_n_f32(c_tanh_alpha_3), num, sq);
    num = vmlaq_f32(vdupq_n_f32(c_tanh_alpha_1), num, sq);
    num = vmulq_f32(num, mag);

    // denominator: even polynomial, Horner in x**2
    float32x4_t den = vdupq_n_f32(c_tanh_beta_6);
    den = vmlaq_f32(vdupq_n_f32(c_tanh_beta_4), den, sq);
    den = vmlaq_f32(vdupq_n_f32(c_tanh_beta_2), den, sq);
    den = vmlaq_f32(vdupq_n_f32(c_tanh_beta_0), den, sq);

    // divide the numerator by the denominator.
#if __aarch64__
    float32x4_t result = vdivq_f32(num, den);
#else
    // the non-aarch64 build goes through the div_ps helper instead of vdivq_f32
    float32x4_t result = div_ps(num, den);
#endif

    // reinstate the sign: take bit 31 from x and everything else from the quotient
    result = vreinterpretq_f32_u32(vbslq_u32(vdupq_n_u32(1u << 31), vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(result)));

    // when the argument is very small in magnitude it's more accurate to just return it.
    result = vreinterpretq_f32_u32(vbslq_u32(use_approx, vreinterpretq_u32_f32(result), vreinterpretq_u32_f32(x)));

    return result;
}

#endif // NEON_MATHFUN_TANH_H


================================================
FILE: src/layer/arm/packing_arm.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "packing_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "cpu.h"

namespace ncnn {

Packing_arm::Packing_arm()
{
    // Capability flags set by the ARM implementation of the packing layer.

    // packed element layouts are accepted
    support_packing = true;

    // bf16 storage is always enabled
    support_bf16_storage = true;

#if NCNN_ARM82
    // fp16 storage follows what cpu_support_arm_asimdhp() reports
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif
}

int Packing_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

    if (elembits == 8)
        return forward_int8(bottom_blob, top_blob, opt);

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);

    if (use_padding)
    {
        return Packing::forward(bottom_blob, top_blob, opt);
    }

    if (elembits != 32)
    {
        // non-fp32 type
        return Packing::forward(bottom_blob, top_blob, opt);
    }

    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    if (elempack == out_elempack)
    {
        top_blob = bottom_blob;
        return 0;
    }

    bool pack1to4 = elempack == 1 && out_elempack == 4;
    bool pack4to1 = elempack == 4 && out_elempack == 1;

    if (!pack1to4 && !pack4to1)
    {
        return Packing::forward(bottom_blob, top_blob, opt);
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;

    if (!use_padding)
    {
        // identity if use_padding not allowed
        if (dims == 1 && w * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
        if (dims == 2 && h * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
        if ((dims == 3 || dims == 4) && channels * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
    }

    if (dims == 1)
    {
        top_blob = bottom_blob;
        top_blob.w = w * elempack / out_elempack;
        top_blob.cstep = bottom_blob.cstep * elempack / out_elempack;
        top_blob.elemsize = elemsize / elempack * out_elempack;
        top_blob.elempack = out_elempack;
        return 0;
    }

    if (dims == 2)
    {
        int outh = h * elempack / out_elempack;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (pack1to4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < outh; i++)
            {
                const float* r0 = bottom_blob.row(i * 4);
                const float* r1 = bottom_blob.row(i * 4 + 1);
                const float* r2 = bottom_blob.row(i * 4 + 2);
                const float* r3 = bottom_blob.row(i * 4 + 3);

                float* outptr = top_blob.row(i);

                int j = 0;
#if __ARM_NEON
                for (; j + 3 < w; j += 4)
                {
                    float32x4x4_t _p;
                    _p.val[0] = vld1q_f32(r0);
                    _p.val[1] = vld1q_f32(r1);
                    _p.val[2] = vld1q_f32(r2);
                    _p.val[3] = vld1q_f32(r3);
                    vst4q_f32(outptr, _p);

                    r0 += 4;
                    r1 += 4;
                    r2 += 4;
                    r3 += 4;
                    outptr += 16;
                }
#endif
                for (; j < w; j++)
                {
                    outptr[0] = *r0++;
                    outptr[1] = *r1++;
                    outptr[2] = *r2++;
                    outptr[3] = *r3++;

                    outptr += 4;
                }
            }
        }
        if (pack4to1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const float* r0 = bottom_blob.row(i);

                float* outptr0 = top_blob.row(i * 4);
                float* outptr1 = top_blob.row(i * 4 + 1);
                float* outptr2 = top_blob.row(i * 4 + 2);
                float* outptr3 = top_blob.row(i * 4 + 3);

                int j = 0;
#if __ARM_NEON
                for (; j + 3 < w; j += 4)
                {
                    float32x4x4_t _p = vld4q_f32(r0);
                    vst1q_f32(outptr0, _p.val[0]);
                    vst1q_f32(outptr1, _p.val[1]);
                    vst1q_f32(outptr2, _p.val[2]);
                    vst1q_f32(outptr3, _p.val[3]);

                    r0 += 16;
                    outptr0 += 4;
                    outptr1 += 4;
                    outptr2 += 4;
                    outptr3 += 4;
                }
#endif
                for (; j < w; j++)
                {
                    *outptr0++ = r0[0];
                    *outptr1++ = r0[1];
                    *outptr2++ = r0[2];
                    *outptr3++ = r0[3];

                    r0 += 4;
                }
            }
        }

        return 0;
    }

    if (dims == 3 || dims == 4)
    {
        int size = w * h * d;
        int outc = channels * elempack / out_elempack;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        if (dims == 3)
            top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
        else // if (dims == 4)
            top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (pack1to4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < outc; q++)
            {
                const float* r0 = bottom_blob.channel(q * 4);
                const float* r1 = bottom_blob.channel(q * 4 + 1);
                const float* r2 = bottom_blob.channel(q * 4 + 2);
                const float* r3 = bottom_blob.channel(q * 4 + 3);

                float* outptr = top_blob.channel(q);

                int i = 0;
#if __ARM_NEON
                for (; i + 3 < size; i += 4)
                {
                    float32x4x4_t _p;
                    _p.val[0] = vld1q_f32(r0);
                    _p.val[1] = vld1q_f32(r1);
                    _p.val[2] = vld1q_f32(r2);
                    _p.val[3] = vld1q_f32(r3);
                    vst4q_f32(outptr, _p);

                    r0 += 4;
                    r1 += 4;
                    r2 += 4;
                    r3 += 4;
                    outptr += 16;
                }
#endif
                for (; i < size; i++)
                {
                    outptr[0] = *r0++;
                    outptr[1] = *r1++;
                    outptr[2] = *r2++;
                    outptr[3] = *r3++;

                    outptr += 4;
                }
            }
        }
        if (pack4to1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* r0 = bottom_blob.channel(q);

                float* outptr0 = top_blob.channel(q * 4);
                float* outptr1 = top_blob.channel(q * 4 + 1);
                float* outptr2 = top_blob.channel(q * 4 + 2);
                float* outptr3 = top_blob.channel(q * 4 + 3);

                int i = 0;
#if __ARM_NEON
                for (; i + 3 < size; i += 4)
                {
                    float32x4x4_t _p = vld4q_f32(r0);
                    vst1q_f32(outptr0, _p.val[0]);
                    vst1q_f32(outptr1, _p.val[1]);
                    vst1q_f32(outptr2, _p.val[2]);
                    vst1q_f32(outptr3, _p.val[3]);

                    r0 += 16;
                    outptr0 += 4;
                    outptr1 += 4;
                    outptr2 += 4;
                    outptr3 += 4;
                }
#endif
                for (; i < size; i++)
                {
                    *outptr0++ = r0[0];
                    *outptr1++ = r0[1];
                    *outptr2++ = r0[2];
                    *outptr3++ = r0[3];

                    r0 += 4;
                }
            }
        }

        return 0;
    }

    return 0;
}

int Packing_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    if (use_padding)
    {
        return Packing::forward(bottom_blob, top_blob, opt);
    }

    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    if (elempack == out_elempack)
    {
        top_blob = bottom_blob;
        return 0;
    }

    bool pack1to4 = elempack == 1 && out_elempack == 4;
    bool pack4to1 = elempack == 4 && out_elempack == 1;
    bool pack1to8 = elempack == 1 && out_elempack == 8;
    bool pack8to1 = elempack == 8 && out_elempack == 1;
    bool pack4to8 = elempack == 4 && out_elempack == 8;
    bool pack8to4 = elempack == 8 && out_elempack == 4;

    if (!pack1to4 && !pack4to1 && !pack1to8 && !pack8to1 && !pack4to8 && !pack8to4)
    {
        return Packing::forward(bottom_blob, top_blob, opt);
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;

    if (!use_padding)
    {
        // identity if use_padding not allowed
        if (dims == 1 && w * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
        if (dims == 2 && h * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
        if ((dims == 3 || dims == 4) && channels * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
    }

    if (dims == 1)
    {
        top_blob = bottom_blob;
        top_blob.w = w * elempack / out_elempack;
        top_blob.cstep = bottom_blob.cstep * elempack / out_elempack;
        top_blob.elemsize = elemsize / elempack * out_elempack;
        top_blob.elempack = out_elempack;
        return 0;
    }

    if (dims == 2)
    {
        int outh = h * elempack / out_elempack;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (pack1to4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < outh; i++)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(i * 4);
                const unsigned short* r1 = bottom_blob.row<const unsigned short>(i * 4 + 1);
                const unsigned short* r2 = bottom_blob.row<const unsigned short>(i * 4 + 2);
                const unsigned short* r3 = bottom_blob.row<const unsigned short>(i * 4 + 3);

                unsigned short* outptr = top_blob.row<unsigned short>(i);

                int j = 0;
#if __ARM_NEON
                for (; j + 3 < w; j += 4)
                {
                    uint16x4x4_t _p;
                    _p.val[0] = vld1_u16(r0);
                    _p.val[1] = vld1_u16(r1);
                    _p.val[2] = vld1_u16(r2);
                    _p.val[3] = vld1_u16(r3);
                    vst4_u16(outptr, _p);

                    r0 += 4;
                    r1 += 4;
                    r2 += 4;
                    r3 += 4;
                    outptr += 16;
                }
#endif
                for (; j < w; j++)
                {
                    outptr[0] = *r0++;
                    outptr[1] = *r1++;
                    outptr[2] = *r2++;
                    outptr[3] = *r3++;

                    outptr += 4;
                }
            }
        }
        if (pack4to1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);

                unsigned short* outptr0 = top_blob.row<unsigned short>(i * 4);
                unsigned short* outptr1 = top_blob.row<unsigned short>(i * 4 + 1);
                unsigned short* outptr2 = top_blob.row<unsigned short>(i * 4 + 2);
                unsigned short* outptr3 = top_blob.row<unsigned short>(i * 4 + 3);

                int j = 0;
#if __ARM_NEON
                for (; j + 3 < w; j += 4)
                {
                    uint16x4x4_t _p = vld4_u16(r0);
                    vst1_u16(outptr0, _p.val[0]);
                    vst1_u16(outptr1, _p.val[1]);
                    vst1_u16(outptr2, _p.val[2]);
                    vst1_u16(outptr3, _p.val[3]);

                    r0 += 16;
                    outptr0 += 4;
                    outptr1 += 4;
                    outptr2 += 4;
                    outptr3 += 4;
                }
#endif
                for (; j < w; j++)
                {
                    *outptr0++ = r0[0];
                    *outptr1++ = r0[1];
                    *outptr2++ = r0[2];
                    *outptr3++ = r0[3];

                    r0 += 4;
                }
            }
        }
        if (pack1to8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < outh; i++)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(i * 8);
                const unsigned short* r1 = bottom_blob.row<const unsigned short>(i * 8 + 1);
                const unsigned short* r2 = bottom_blob.row<const unsigned short>(i * 8 + 2);
                const unsigned short* r3 = bottom_blob.row<const unsigned short>(i * 8 + 3);
                const unsigned short* r4 = bottom_blob.row<const unsigned short>(i * 8 + 4);
                const unsigned short* r5 = bottom_blob.row<const unsigned short>(i * 8 + 5);
                const unsigned short* r6 = bottom_blob.row<const unsigned short>(i * 8 + 6);
                const unsigned short* r7 = bottom_blob.row<const unsigned short>(i * 8 + 7);

                unsigned short* outptr = top_blob.row<unsigned short>(i);

                int j = 0;
#if __ARM_NEON
                for (; j + 7 < w; j += 8)
                {
                    // transpose 8x8
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                    asm volatile(
                        "ld1    {v0.8h}, [%0], #16      \n"
                        "ld1    {v1.8h}, [%1], #16      \n"
                        "ld1    {v2.8h}, [%2], #16      \n"
                        "ld1    {v3.8h}, [%3], #16      \n"
                        "ld1    {v4.8h}, [%4], #16      \n"
                        "ld1    {v5.8h}, [%5], #16      \n"
                        "ld1    {v6.8h}, [%6], #16      \n"
                        "ld1    {v7.8h}, [%7], #16      \n"

                        "zip1   v16.8h, v0.8h, v4.8h    \n"
                        "zip2   v20.8h, v0.8h, v4.8h    \n"
                        "zip1   v17.8h, v1.8h, v5.8h    \n"
                        "zip2   v21.8h, v1.8h, v5.8h    \n"
                        "zip1   v18.8h, v2.8h, v6.8h    \n"
                        "zip2   v22.8h, v2.8h, v6.8h    \n"
                        "zip1   v19.8h, v3.8h, v7.8h    \n"
                        "zip2   v23.8h, v3.8h, v7.8h    \n"

                        "st4    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"
                        "st4    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"
                        : "=r"(r0),    // %0
                        "=r"(r1),    // %1
                        "=r"(r2),    // %2
                        "=r"(r3),    // %3
                        "=r"(r4),    // %4
                        "=r"(r5),    // %5
                        "=r"(r6),    // %6
                        "=r"(r7),    // %7
                        "=r"(outptr) // %8
                        : "0"(r0),
                        "1"(r1),
                        "2"(r2),
                        "3"(r3),
                        "4"(r4),
                        "5"(r5),
                        "6"(r6),
                        "7"(r7),
                        "8"(outptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else
                    asm volatile(
                        "vld1.u16   {d16-d17}, [%0]!    \n"
                        "vld1.u16   {d18-d19}, [%1]!    \n"
                        "vld1.u16   {d20-d21}, [%2]!    \n"
                        "vld1.u16   {d22-d23}, [%3]!    \n"
                        "vld1.u16   {d24-d25}, [%4]!    \n"
                        "vld1.u16   {d26-d27}, [%5]!    \n"
                        "vld1.u16   {d28-d29}, [%6]!    \n"
                        "vld1.u16   {d30-d31}, [%7]!    \n"

                        "vtrn.u16   q8, q9              \n"
                        "vtrn.u16   q10, q11            \n"
                        "vtrn.u16   q12, q13            \n"
                        "vtrn.u16   q14, q15            \n"

                        "vtrn.u32   q8, q10             \n"
                        "vtrn.u32   q9, q11             \n"
                        "vtrn.u32   q12, q14            \n"
                        "vtrn.u32   q13, q15            \n"

                        "vswp       d17, d24            \n"
                        "vswp       d19, d26            \n"
                        "vswp       d21, d28            \n"
                        "vswp       d23, d30            \n"

                        "vstm       %8!, {d16-d23}      \n"
                        "vstm       %8!, {d24-d31}      \n"
                        : "=r"(r0),    // %0
                        "=r"(r1),    // %1
                        "=r"(r2),    // %2
                        "=r"(r3),    // %3
                        "=r"(r4),    // %4
                        "=r"(r5),    // %5
                        "=r"(r6),    // %6
                        "=r"(r7),    // %7
                        "=r"(outptr) // %8
                        : "0"(r0),
                        "1"(r1),
                        "2"(r2),
                        "3"(r3),
                        "4"(r4),
                        "5"(r5),
                        "6"(r6),
                        "7"(r7),
                        "8"(outptr)
                        : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif
#else  // NCNN_GNU_INLINE_ASM
                    uint16x8_t _r0 = vld1q_u16(r0);
                    uint16x8_t _r1 = vld1q_u16(r1);
                    uint16x8_t _r2 = vld1q_u16(r2);
                    uint16x8_t _r3 = vld1q_u16(r3);
                    uint16x8_t _r4 = vld1q_u16(r4);
                    uint16x8_t _r5 = vld1q_u16(r5);
                    uint16x8_t _r6 = vld1q_u16(r6);
                    uint16x8_t _r7 = vld1q_u16(r7);
                    uint16x8x2_t _r04 = vzipq_u16(_r0, _r4);
                    uint16x8x2_t _r15 = vzipq_u16(_r1, _r5);
                    uint16x8x2_t _r26 = vzipq_u16(_r2, _r6);
                    uint16x8x2_t _r37 = vzipq_u16(_r3, _r7);
                    uint16x8x4_t _r0123;
                    _r0123.val[0] = _r04.val[0];
                    _r0123.val[1] = _r15.val[0];
                    _r0123.val[2] = _r26.val[0];
                    _r0123.val[3] = _r37.val[0];
                    uint16x8x4_t _r4567;
                    _r4567.val[0] = _r04.val[1];
                    _r4567.val[1] = _r15.val[1];
                    _r4567.val[2] = _r26.val[1];
                    _r4567.val[3] = _r37.val[1];
                    vst4q_u16(outptr, _r0123);
                    vst4q_u16(outptr + 32, _r4567);

                    r0 += 8;
                    r1 += 8;
                    r2 += 8;
                    r3 += 8;
                    r4 += 8;
                    r5 += 8;
                    r6 += 8;
                    r7 += 8;
                    outptr += 64;
#endif // NCNN_GNU_INLINE_ASM
                }
#endif
                for (; j < w; j++)
                {
                    outptr[0] = *r0++;
                    outptr[1] = *r1++;
                    outptr[2] = *r2++;
                    outptr[3] = *r3++;
                    outptr[4] = *r4++;
                    outptr[5] = *r5++;
                    outptr[6] = *r6++;
                    outptr[7] = *r7++;

                    outptr += 8;
                }
            }
        }
        if (pack8to1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);

                unsigned short* outptr0 = top_blob.row<unsigned short>(i * 8);
                unsigned short* outptr1 = top_blob.row<unsigned short>(i * 8 + 1);
                unsigned short* outptr2 = top_blob.row<unsigned short>(i * 8 + 2);
                unsigned short* outptr3 = top_blob.row<unsigned short>(i * 8 + 3);
                unsigned short* outptr4 = top_blob.row<unsigned short>(i * 8 + 4);
                unsigned short* outptr5 = top_blob.row<unsigned short>(i * 8 + 5);
                unsigned short* outptr6 = top_blob.row<unsigned short>(i * 8 + 6);
                unsigned short* outptr7 = top_blob.row<unsigned short>(i * 8 + 7);

                int j = 0;
#if __ARM_NEON
                for (; j + 7 < w; j += 8)
                {
                    // transpose 8x8
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                    asm volatile(
                        "ld4    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
                        "ld4    {v4.8h, v5.8h, v6.8h, v7.8h}, [%0], #64 \n"

                        "uzp1   v16.8h, v0.8h, v4.8h    \n"
                        "uzp2   v20.8h, v0.8h, v4.8h    \n"
                        "uzp1   v17.8h, v1.8h, v5.8h    \n"
                        "uzp2   v21.8h, v1.8h, v5.8h    \n"
                        "uzp1   v18.8h, v2.8h, v6.8h    \n"
                        "uzp2   v22.8h, v2.8h, v6.8h    \n"
                        "uzp1   v19.8h, v3.8h, v7.8h    \n"
                        "uzp2   v23.8h, v3.8h, v7.8h    \n"

                        "st1    {v16.8h}, [%1], #16      \n"
                        "st1    {v17.8h}, [%2], #16      \n"
                        "st1    {v18.8h}, [%3], #16      \n"
                        "st1    {v19.8h}, [%4], #16      \n"
                        "st1    {v20.8h}, [%5], #16      \n"
                        "st1    {v21.8h}, [%6], #16      \n"
                        "st1    {v22.8h}, [%7], #16      \n"
                        "st1    {v23.8h}, [%8], #16      \n"
                        : "=r"(r0),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1), // %2
                        "=r"(outptr2), // %3
                        "=r"(outptr3), // %4
                        "=r"(outptr4), // %5
                        "=r"(outptr5), // %6
                        "=r"(outptr6), // %7
                        "=r"(outptr7)  // %8
                        : "0"(r0),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr2),
                        "4"(outptr3),
                        "5"(outptr4),
                        "6"(outptr5),
                        "7"(outptr6),
                        "8"(outptr7)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else
                    asm volatile(
                        "vldm       %0!, {d16-d23}      \n"
                        "vldm       %0!, {d24-d31}      \n"

                        "vtrn.u16   q8, q9              \n"
                        "vtrn.u16   q10, q11            \n"
                        "vtrn.u16   q12, q13            \n"
                        "vtrn.u16   q14, q15            \n"

                        "vtrn.u32   q8, q10             \n"
                        "vtrn.u32   q9, q11             \n"
                        "vtrn.u32   q12, q14            \n"
                        "vtrn.u32   q13, q15            \n"

                        "vswp       d17, d24            \n"
                        "vswp       d19, d26            \n"
                        "vswp       d21, d28            \n"
                        "vswp       d23, d30            \n"

                        "vst1.u16   {d16-d17}, [%1]!    \n"
                        "vst1.u16   {d18-d19}, [%2]!    \n"
                        "vst1.u16   {d20-d21}, [%3]!    \n"
                        "vst1.u16   {d22-d23}, [%4]!    \n"
                        "vst1.u16   {d24-d25}, [%5]!    \n"
                        "vst1.u16   {d26-d27}, [%6]!    \n"
                        "vst1.u16   {d28-d29}, [%7]!    \n"
                        "vst1.u16   {d30-d31}, [%8]!    \n"
                        : "=r"(r0),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1), // %2
                        "=r"(outptr2), // %3
                        "=r"(outptr3), // %4
                        "=r"(outptr4), // %5
                        "=r"(outptr5), // %6
                        "=r"(outptr6), // %7
                        "=r"(outptr7)  // %8
                        : "0"(r0),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr2),
                        "4"(outptr3),
                        "5"(outptr4),
                        "6"(outptr5),
                        "7"(outptr6),
                        "8"(outptr7)
                        : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif
#else  // NCNN_GNU_INLINE_ASM
                    uint16x8x4_t _r0246 = vld4q_u16(r0);
                    uint16x8x4_t _r1357 = vld4q_u16(r0 + 32);
                    uint16x8x2_t _r04 = vuzpq_u16(_r0246.val[0], _r1357.val[0]);
                    uint16x8x2_t _r15 = vuzpq_u16(_r0246.val[1], _r1357.val[1]);
                    uint16x8x2_t _r26 = vuzpq_u16(_r0246.val[2], _r1357.val[2]);
                    uint16x8x2_t _r37 = vuzpq_u16(_r0246.val[3], _r1357.val[3]);
                    vst1q_u16(outptr0, _r04.val[0]);
                    vst1q_u16(outptr1, _r15.val[0]);
                    vst1q_u16(outptr2, _r26.val[0]);
                    vst1q_u16(outptr3, _r37.val[0]);
                    vst1q_u16(outptr4, _r04.val[1]);
                    vst1q_u16(outptr5, _r15.val[1]);
                    vst1q_u16(outptr6, _r26.val[1]);
                    vst1q_u16(outptr7, _r37.val[1]);

                    r0 += 64;
                    outptr0 += 8;
                    outptr1 += 8;
                    outptr2 += 8;
                    outptr3 += 8;
                    outptr4 += 8;
                    outptr5 += 8;
                    outptr6 += 8;
                    outptr7 += 8;
#endif // NCNN_GNU_INLINE_ASM
                }
#endif
                for (; j < w; j++)
                {
                    *outptr0++ = r0[0];
                    *outptr1++ = r0[1];
                    *outptr2++ = r0[2];
                    *outptr3++ = r0[3];
                    *outptr4++ = r0[4];
                    *outptr5++ = r0[5];
                    *outptr6++ = r0[6];
                    *outptr7++ = r0[7];

                    r0 += 8;
                }
            }
        }
        if (pack4to8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < outh; i++)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(i * 2);
                const unsigned short* r1 = bottom_blob.row<const unsigned short>(i * 2 + 1);

                unsigned short* outptr = top_blob.row<unsigned short>(i);

                int j = 0;
#if NCNN_GNU_INLINE_ASM
#if __ARM_NEON
                for (; j + 1 < w; j += 2)
                {
#if __aarch64__
                    asm volatile(
                        "ld1    {v0.8h}, [%0], #16      \n"
                        "ld1    {v1.8h}, [%1], #16      \n"

                        "zip1   v2.2d, v0.2d, v1.2d     \n"
                        "zip2   v3.2d, v0.2d, v1.2d     \n"

                        "st1    {v2.8h, v3.8h}, [%2], #32\n"
                        : "=r"(r0),    // %0
                        "=r"(r1),    // %1
                        "=r"(outptr) // %2
                        : "0"(r0),
                        "1"(r1),
                        "2"(outptr)
                        : "memory", "v0", "v1", "v2", "v3");
#else
                    asm volatile(
                        "vld1.u16   {d0-d1}, [%0 :64]!  \n"
                        "vld1.u16   {d2-d3}, [%1 :64]!  \n"

                        "vswp       d1, d2              \n"

                        "vst1.u16   {d0-d3}, [%2 :128]! \n"
                        : "=r"(r0),    // %0
                        "=r"(r1),    // %1
                        "=r"(outptr) // %2
                        : "0"(r0),
                        "1"(r1),
                        "2"(outptr)
                        : "memory", "q0", "q1");
#endif
                }
#endif
#endif // NCNN_GNU_INLINE_ASM
                for (; j < w; j++)
                {
                    outptr[0] = r0[0];
                    outptr[1] = r0[1];
                    outptr[2] = r0[2];
                    outptr[3] = r0[3];
                    outptr[4] = r1[0];
                    outptr[5] = r1[1];
                    outptr[6] = r1[2];
                    outptr[7] = r1[3];

                    r0 += 4;
                    r1 += 4;
                    outptr += 8;
                }
            }
        }
        if (pack8to4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);

                unsigned short* outptr0 = top_blob.row<unsigned short>(i * 2);
                unsigned short* outptr1 = top_blob.row<unsigned short>(i * 2 + 1);

                int j = 0;
#if NCNN_GNU_INLINE_ASM
#if __ARM_NEON
                for (; j + 1 < w; j += 2)
                {
#if __aarch64__
                    asm volatile(
                        "ld1    {v0.8h, v1.8h}, [%0], #32 \n"

                        "uzp1   v2.2d, v0.2d, v1.2d     \n"
                        "uzp2   v3.2d, v0.2d, v1.2d     \n"

                        "st1    {v2.8h}, [%1], #16      \n"
                        "st1    {v3.8h}, [%2], #16      \n"
                        : "=r"(r0),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1)  // %2
                        : "0"(r0),
                        "1"(outptr0),
                        "2"(outptr1)
                        : "memory", "v0", "v1", "v2", "v3");
#else
                    asm volatile(
                        "vld1.u16   {d0-d3}, [%0 :128]! \n"

                        "vswp       d1, d2              \n"

                        "vst1.u16   {d0-d1}, [%1 :64]!  \n"
                        "vst1.u16   {d2-d3}, [%2 :64]!  \n"
                        : "=r"(r0),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1)  // %2
                        : "0"(r0),
                        "1"(outptr0),
                        "2"(outptr1)
                        : "memory", "q0", "q1");
#endif
                }
#endif
#endif // NCNN_GNU_INLINE_ASM
                for (; j < w; j++)
                {
                    outptr0[0] = r0[0];
                    outptr0[1] = r0[1];
                    outptr0[2] = r0[2];
                    outptr0[3] = r0[3];
                    outptr1[0] = r0[4];
                    outptr1[1] = r0[5];
                    outptr1[2] = r0[6];
                    outptr1[3] = r0[7];

                    r0 += 8;
                    outptr0 += 4;
                    outptr1 += 4;
                }
            }
        }

        return 0;
    }

    if (dims == 3 || dims == 4)
    {
        int size = w * h * d;
        int outc = channels * elempack / out_elempack;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        if (dims == 3)
            top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
        else // if (dims == 4)
            top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (pack1to4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < outc; q++)
            {
                const unsigned short* r0 = bottom_blob.channel(q * 4);
                const unsigned short* r1 = bottom_blob.channel(q * 4 + 1);
                const unsigned short* r2 = bottom_blob.channel(q * 4 + 2);
                const unsigned short* r3 = bottom_blob.channel(q * 4 + 3);

                unsigned short* outptr = top_blob.channel(q);

                int i = 0;
#if __ARM_NEON
                for (; i + 3 < size; i += 4)
                {
                    uint16x4x4_t _p;
                    _p.val[0] = vld1_u16(r0);
                    _p.val[1] = vld1_u16(r1);
                    _p.val[2] = vld1_u16(r2);
                    _p.val[3] = vld1_u16(r3);
                    vst4_u16(outptr, _p);

                    r0 += 4;
                    r1 += 4;
                    r2 += 4;
                    r3 += 4;
                    outptr += 16;
                }
#endif
                for (; i < size; i++)
                {
                    outptr[0] = *r0++;
                    outptr[1] = *r1++;
                    outptr[2] = *r2++;
                    outptr[3] = *r3++;

                    outptr += 4;
                }
            }
        }
        if (pack4to1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* r0 = bottom_blob.channel(q);

                unsigned short* outptr0 = top_blob.channel(q * 4);
                unsigned short* outptr1 = top_blob.channel(q * 4 + 1);
                unsigned short* outptr2 = top_blob.channel(q * 4 + 2);
                unsigned short* outptr3 = top_blob.channel(q * 4 + 3);

                int i = 0;
#if __ARM_NEON
                for (; i + 3 < size; i += 4)
                {
                    uint16x4x4_t _p = vld4_u16(r0);
                    vst1_u16(outptr0, _p.val[0]);
                    vst1_u16(outptr1, _p.val[1]);
                    vst1_u16(outptr2, _p.val[2]);
                    vst1_u16(outptr3, _p.val[3]);

                    r0 += 16;
                    outptr0 += 4;
                    outptr1 += 4;
                    outptr2 += 4;
                    outptr3 += 4;
                }
#endif
                for (; i < size; i++)
                {
                    *outptr0++ = r0[0];
                    *outptr1++ = r0[1];
                    *outptr2++ = r0[2];
                    *outptr3++ = r0[3];

                    r0 += 4;
                }
            }
        }
        if (pack1to8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < outc; q++)
            {
                const unsigned short* r0 = bottom_blob.channel(q * 8);
                const unsigned short* r1 = bottom_blob.channel(q * 8 + 1);
                const unsigned short* r2 = bottom_blob.channel(q * 8 + 2);
                const unsigned short* r3 = bottom_blob.channel(q * 8 + 3);
                const unsigned short* r4 = bottom_blob.channel(q * 8 + 4);
                const unsigned short* r5 = bottom_blob.channel(q * 8 + 5);
                const unsigned short* r6 = bottom_blob.channel(q * 8 + 6);
                const unsigned short* r7 = bottom_blob.channel(q * 8 + 7);

                unsigned short* outptr = top_blob.channel(q);

                int i = 0;
#if __ARM_NEON
                for (; i + 7 < size; i += 8)
                {
                    // transpose 8x8
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                    asm volatile(
                        "ld1    {v0.8h}, [%0], #16      \n"
                        "ld1    {v1.8h}, [%1], #16      \n"
                        "ld1    {v2.8h}, [%2], #16      \n"
                        "ld1    {v3.8h}, [%3], #16      \n"
                        "ld1    {v4.8h}, [%4], #16      \n"
                        "ld1    {v5.8h}, [%5], #16      \n"
                        "ld1    {v6.8h}, [%6], #16      \n"
                        "ld1    {v7.8h}, [%7], #16      \n"

                        "zip1   v16.8h, v0.8h, v4.8h    \n"
                        "zip2   v20.8h, v0.8h, v4.8h    \n"
                        "zip1   v17.8h, v1.8h, v5.8h    \n"
                        "zip2   v21.8h, v1.8h, v5.8h    \n"
                        "zip1   v18.8h, v2.8h, v6.8h    \n"
                        "zip2   v22.8h, v2.8h, v6.8h    \n"
                        "zip1   v19.8h, v3.8h, v7.8h    \n"
                        "zip2   v23.8h, v3.8h, v7.8h    \n"

                        "st4    {v16.8h, v17.8h, v18.8h, v19.8h}, [%8], #64 \n"
                        "st4    {v20.8h, v21.8h, v22.8h, v23.8h}, [%8], #64 \n"
                        : "=r"(r0),    // %0
                        "=r"(r1),    // %1
                        "=r"(r2),    // %2
                        "=r"(r3),    // %3
                        "=r"(r4),    // %4
                        "=r"(r5),    // %5
                        "=r"(r6),    // %6
                        "=r"(r7),    // %7
                        "=r"(outptr) // %8
                        : "0"(r0),
                        "1"(r1),
                        "2"(r2),
                        "3"(r3),
                        "4"(r4),
                        "5"(r5),
                        "6"(r6),
                        "7"(r7),
                        "8"(outptr)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else
                    asm volatile(
                        "vld1.u16   {d16-d17}, [%0 : 128]! \n"
                        "vld1.u16   {d18-d19}, [%1 : 128]! \n"
                        "vld1.u16   {d20-d21}, [%2 : 128]! \n"
                        "vld1.u16   {d22-d23}, [%3 : 128]! \n"
                        "vld1.u16   {d24-d25}, [%4 : 128]! \n"
                        "vld1.u16   {d26-d27}, [%5 : 128]! \n"
                        "vld1.u16   {d28-d29}, [%6 : 128]! \n"
                        "vld1.u16   {d30-d31}, [%7 : 128]! \n"

                        "vtrn.u16   q8, q9              \n"
                        "vtrn.u16   q10, q11            \n"
                        "vtrn.u16   q12, q13            \n"
                        "vtrn.u16   q14, q15            \n"

                        "vtrn.u32   q8, q10             \n"
                        "vtrn.u32   q9, q11             \n"
                        "vtrn.u32   q12, q14            \n"
                        "vtrn.u32   q13, q15            \n"

                        "vswp       d17, d24            \n"
                        "vswp       d19, d26            \n"
                        "vswp       d21, d28            \n"
                        "vswp       d23, d30            \n"

                        "vstm       %8!, {d16-d23}      \n"
                        "vstm       %8!, {d24-d31}      \n"
                        : "=r"(r0),    // %0
                        "=r"(r1),    // %1
                        "=r"(r2),    // %2
                        "=r"(r3),    // %3
                        "=r"(r4),    // %4
                        "=r"(r5),    // %5
                        "=r"(r6),    // %6
                        "=r"(r7),    // %7
                        "=r"(outptr) // %8
                        : "0"(r0),
                        "1"(r1),
                        "2"(r2),
                        "3"(r3),
                        "4"(r4),
                        "5"(r5),
                        "6"(r6),
                        "7"(r7),
                        "8"(outptr)
                        : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif
#else  // NCNN_GNU_INLINE_ASM
                    uint16x8_t _r0 = vld1q_u16(r0);
                    uint16x8_t _r1 = vld1q_u16(r1);
                    uint16x8_t _r2 = vld1q_u16(r2);
                    uint16x8_t _r3 = vld1q_u16(r3);
                    uint16x8_t _r4 = vld1q_u16(r4);
                    uint16x8_t _r5 = vld1q_u16(r5);
                    uint16x8_t _r6 = vld1q_u16(r6);
                    uint16x8_t _r7 = vld1q_u16(r7);
                    uint16x8x2_t _r04 = vzipq_u16(_r0, _r4);
                    uint16x8x2_t _r15 = vzipq_u16(_r1, _r5);
                    uint16x8x2_t _r26 = vzipq_u16(_r2, _r6);
                    uint16x8x2_t _r37 = vzipq_u16(_r3, _r7);
                    uint16x8x4_t _r0123;
                    _r0123.val[0] = _r04.val[0];
                    _r0123.val[1] = _r15.val[0];
                    _r0123.val[2] = _r26.val[0];
                    _r0123.val[3] = _r37.val[0];
                    uint16x8x4_t _r4567;
                    _r4567.val[0] = _r04.val[1];
                    _r4567.val[1] = _r15.val[1];
                    _r4567.val[2] = _r26.val[1];
                    _r4567.val[3] = _r37.val[1];
                    vst4q_u16(outptr, _r0123);
                    vst4q_u16(outptr + 32, _r4567);

                    r0 += 8;
                    r1 += 8;
                    r2 += 8;
                    r3 += 8;
                    r4 += 8;
                    r5 += 8;
                    r6 += 8;
                    r7 += 8;
                    outptr += 64;
#endif // NCNN_GNU_INLINE_ASM
                }
#endif
                for (; i < size; i++)
                {
                    outptr[0] = *r0++;
                    outptr[1] = *r1++;
                    outptr[2] = *r2++;
                    outptr[3] = *r3++;
                    outptr[4] = *r4++;
                    outptr[5] = *r5++;
                    outptr[6] = *r6++;
                    outptr[7] = *r7++;

                    outptr += 8;
                }
            }
        }
        if (pack8to1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* r0 = bottom_blob.channel(q);

                unsigned short* outptr0 = top_blob.channel(q * 8);
                unsigned short* outptr1 = top_blob.channel(q * 8 + 1);
                unsigned short* outptr2 = top_blob.channel(q * 8 + 2);
                unsigned short* outptr3 = top_blob.channel(q * 8 + 3);
                unsigned short* outptr4 = top_blob.channel(q * 8 + 4);
                unsigned short* outptr5 = top_blob.channel(q * 8 + 5);
                unsigned short* outptr6 = top_blob.channel(q * 8 + 6);
                unsigned short* outptr7 = top_blob.channel(q * 8 + 7);

                int i = 0;
#if __ARM_NEON
                for (; i + 7 < size; i += 8)
                {
                    // transpose 8x8
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                    asm volatile(
                        "ld4    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
                        "ld4    {v4.8h, v5.8h, v6.8h, v7.8h}, [%0], #64 \n"

                        "uzp1   v16.8h, v0.8h, v4.8h    \n"
                        "uzp2   v20.8h, v0.8h, v4.8h    \n"
                        "uzp1   v17.8h, v1.8h, v5.8h    \n"
                        "uzp2   v21.8h, v1.8h, v5.8h    \n"
                        "uzp1   v18.8h, v2.8h, v6.8h    \n"
                        "uzp2   v22.8h, v2.8h, v6.8h    \n"
                        "uzp1   v19.8h, v3.8h, v7.8h    \n"
                        "uzp2   v23.8h, v3.8h, v7.8h    \n"

                        "st1    {v16.8h}, [%1], #16      \n"
                        "st1    {v17.8h}, [%2], #16      \n"
                        "st1    {v18.8h}, [%3], #16      \n"
                        "st1    {v19.8h}, [%4], #16      \n"
                        "st1    {v20.8h}, [%5], #16      \n"
                        "st1    {v21.8h}, [%6], #16      \n"
                        "st1    {v22.8h}, [%7], #16      \n"
                        "st1    {v23.8h}, [%8], #16      \n"
                        : "=r"(r0),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1), // %2
                        "=r"(outptr2), // %3
                        "=r"(outptr3), // %4
                        "=r"(outptr4), // %5
                        "=r"(outptr5), // %6
                        "=r"(outptr6), // %7
                        "=r"(outptr7)  // %8
                        : "0"(r0),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr2),
                        "4"(outptr3),
                        "5"(outptr4),
                        "6"(outptr5),
                        "7"(outptr6),
                        "8"(outptr7)
                        : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else
                    asm volatile(
                        "vldm       %0!, {d16-d23}      \n"
                        "vldm       %0!, {d24-d31}      \n"

                        "vtrn.u16   q8, q9              \n"
                        "vtrn.u16   q10, q11            \n"
                        "vtrn.u16   q12, q13            \n"
                        "vtrn.u16   q14, q15            \n"

                        "vtrn.u32   q8, q10             \n"
                        "vtrn.u32   q9, q11             \n"
                        "vtrn.u32   q12, q14            \n"
                        "vtrn.u32   q13, q15            \n"

                        "vswp       d17, d24            \n"
                        "vswp       d19, d26            \n"
                        "vswp       d21, d28            \n"
                        "vswp       d23, d30            \n"

                        "vst1.u16   {d16-d17}, [%1 : 128]! \n"
                        "vst1.u16   {d18-d19}, [%2 : 128]! \n"
                        "vst1.u16   {d20-d21}, [%3 : 128]! \n"
                        "vst1.u16   {d22-d23}, [%4 : 128]! \n"
                        "vst1.u16   {d24-d25}, [%5 : 128]! \n"
                        "vst1.u16   {d26-d27}, [%6 : 128]! \n"
                        "vst1.u16   {d28-d29}, [%7 : 128]! \n"
                        "vst1.u16   {d30-d31}, [%8 : 128]! \n"
                        : "=r"(r0),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1), // %2
                        "=r"(outptr2), // %3
                        "=r"(outptr3), // %4
                        "=r"(outptr4), // %5
                        "=r"(outptr5), // %6
                        "=r"(outptr6), // %7
                        "=r"(outptr7)  // %8
                        : "0"(r0),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr2),
                        "4"(outptr3),
                        "5"(outptr4),
                        "6"(outptr5),
                        "7"(outptr6),
                        "8"(outptr7)
                        : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif
#else  // NCNN_GNU_INLINE_ASM
                    uint16x8x4_t _r0246 = vld4q_u16(r0);
                    uint16x8x4_t _r1357 = vld4q_u16(r0 + 32);
                    uint16x8x2_t _r04 = vuzpq_u16(_r0246.val[0], _r1357.val[0]);
                    uint16x8x2_t _r15 = vuzpq_u16(_r0246.val[1], _r1357.val[1]);
                    uint16x8x2_t _r26 = vuzpq_u16(_r0246.val[2], _r1357.val[2]);
                    uint16x8x2_t _r37 = vuzpq_u16(_r0246.val[3], _r1357.val[3]);
                    vst1q_u16(outptr0, _r04.val[0]);
                    vst1q_u16(outptr1, _r15.val[0]);
                    vst1q_u16(outptr2, _r26.val[0]);
                    vst1q_u16(outptr3, _r37.val[0]);
                    vst1q_u16(outptr4, _r04.val[1]);
                    vst1q_u16(outptr5, _r15.val[1]);
                    vst1q_u16(outptr6, _r26.val[1]);
                    vst1q_u16(outptr7, _r37.val[1]);

                    r0 += 64;
                    outptr0 += 8;
                    outptr1 += 8;
                    outptr2 += 8;
                    outptr3 += 8;
                    outptr4 += 8;
                    outptr5 += 8;
                    outptr6 += 8;
                    outptr7 += 8;
#endif // NCNN_GNU_INLINE_ASM
                }
#endif
                for (; i < size; i++)
                {
                    *outptr0++ = r0[0];
                    *outptr1++ = r0[1];
                    *outptr2++ = r0[2];
                    *outptr3++ = r0[3];
                    *outptr4++ = r0[4];
                    *outptr5++ = r0[5];
                    *outptr6++ = r0[6];
                    *outptr7++ = r0[7];

                    r0 += 8;
                }
            }
        }
        if (pack4to8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < outc; q++)
            {
                const unsigned short* r0 = bottom_blob.channel(q * 2);
                const unsigned short* r1 = bottom_blob.channel(q * 2 + 1);

                unsigned short* outptr = top_blob.channel(q);

                int i = 0;
#if NCNN_GNU_INLINE_ASM
#if __ARM_NEON
                for (; i + 1 < size; i += 2)
                {
#if __aarch64__
                    asm volatile(
                        "ld1    {v0.8h}, [%0], #16      \n"
                        "ld1    {v1.8h}, [%1], #16      \n"

                        "zip1   v2.2d, v0.2d, v1.2d     \n"
                        "zip2   v3.2d, v0.2d, v1.2d     \n"

                        "st1    {v2.8h, v3.8h}, [%2], #32\n"
                        : "=r"(r0),    // %0
                        "=r"(r1),    // %1
                        "=r"(outptr) // %2
                        : "0"(r0),
                        "1"(r1),
                        "2"(outptr)
                        : "memory", "v0", "v1", "v2", "v3");
#else
                    asm volatile(
                        "vld1.u16   {d0-d1}, [%0 :128]! \n"
                        "vld1.u16   {d2-d3}, [%1 :128]! \n"

                        "vswp       d1, d2              \n"

                        "vst1.u16   {d0-d3}, [%2 :128]! \n"
                        : "=r"(r0),    // %0
                        "=r"(r1),    // %1
                        "=r"(outptr) // %2
                        : "0"(r0),
                        "1"(r1),
                        "2"(outptr)
                        : "memory", "q0", "q1");
#endif
                }
#endif
#endif // NCNN_GNU_INLINE_ASM
                for (; i < size; i++)
                {
                    outptr[0] = r0[0];
                    outptr[1] = r0[1];
                    outptr[2] = r0[2];
                    outptr[3] = r0[3];
                    outptr[4] = r1[0];
                    outptr[5] = r1[1];
                    outptr[6] = r1[2];
                    outptr[7] = r1[3];

                    r0 += 4;
                    r1 += 4;
                    outptr += 8;
                }
            }
        }
        if (pack8to4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* r0 = bottom_blob.channel(q);

                unsigned short* outptr0 = top_blob.channel(q * 2);
                unsigned short* outptr1 = top_blob.channel(q * 2 + 1);

                int i = 0;
#if NCNN_GNU_INLINE_ASM
#if __ARM_NEON
                for (; i + 1 < size; i += 2)
                {
#if __aarch64__
                    asm volatile(
                        "ld1    {v0.8h, v1.8h}, [%0], #32 \n"

                        "uzp1   v2.2d, v0.2d, v1.2d     \n"
                        "uzp2   v3.2d, v0.2d, v1.2d     \n"

                        "st1    {v2.8h}, [%1], #16      \n"
                        "st1    {v3.8h}, [%2], #16      \n"
                        : "=r"(r0),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1)  // %2
                        : "0"(r0),
                        "1"(outptr0),
                        "2"(outptr1)
                        : "memory", "v0", "v1", "v2", "v3");
#else
                    asm volatile(
                        "vld1.u16   {d0-d3}, [%0 :128]! \n"

                        "vswp       d1, d2              \n"

                        "vst1.u16   {d0-d1}, [%1 :128]! \n"
                        "vst1.u16   {d2-d3}, [%2 :128]! \n"
                        : "=r"(r0),      // %0
                        "=r"(outptr0), // %1
                        "=r"(outptr1)  // %2
                        : "0"(r0),
                        "1"(outptr0),
                        "2"(outptr1)
                        : "memory", "q0", "q1");
#endif
                }
#endif
#endif // NCNN_GNU_INLINE_ASM
                for (; i < size; i++)
                {
                    outptr0[0] = r0[0];
                    outptr0[1] = r0[1];
                    outptr0[2] = r0[2];
                    outptr0[3] = r0[3];
                    outptr1[0] = r0[4];
                    outptr1[1] = r0[5];
                    outptr1[2] = r0[6];
                    outptr1[3] = r0[7];

                    r0 += 8;
                    outptr0 += 4;
                    outptr1 += 4;
                }
            }
        }

        return 0;
    }

    return 0;
}

int Packing_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    if (use_padding)
    {
        return Packing::forward(bottom_blob, top_blob, opt);
    }

    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    if (elempack == out_elempack)
    {
        top_blob = bottom_blob;
        return 0;
    }

    bool pack1to8 = elempack == 1 && out_elempack == 8;
    bool pack8to1 = elempack == 8 && out_elempack == 1;

    if (!pack1to8 && !pack8to1)
    {
        return Packing::forward(bottom_blob, top_blob, opt);
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;

    if (!use_padding)
    {
        // identity if use_padding not allowed
        if (dims == 1 && w * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
        if (dims == 2 && h * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
        if ((dims == 3 || dims == 4) && channels * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
    }

    if (dims == 1)
    {
        top_blob = bottom_blob;
        top_blob.w = w * elempack / out_elempack;
        top_blob.cstep = bottom_blob.cstep * elempack / out_elempack;
        top_blob.elemsize = elemsize / elempack * out_elempack;
        top_blob.elempack = out_elempack;
        return 0;
    }

    if (dims == 2)
    {
        int outh = h * elempack / out_elempack;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (pack1to8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < outh; i++)
            {
                const signed char* r0 = bottom_blob.row<const signed char>(i * 8);
                const signed char* r1 = bottom_blob.row<const signed char>(i * 8 + 1);
                const signed char* r2 = bottom_blob.row<const signed char>(i * 8 + 2);
                const signed char* r3 = bottom_blob.row<const signed char>(i * 8 + 3);
                const signed char* r4 = bottom_blob.row<const signed char>(i * 8 + 4);
                const signed char* r5 = bottom_blob.row<const signed char>(i * 8 + 5);
                const signed char* r6 = bottom_blob.row<const signed char>(i * 8 + 6);
                const signed char* r7 = bottom_blob.row<const signed char>(i * 8 + 7);

                signed char* outptr = top_blob.row<signed char>(i);

                int j = 0;
                for (; j < w; j++)
                {
                    outptr[0] = *r0++;
                    outptr[1] = *r1++;
                    outptr[2] = *r2++;
                    outptr[3] = *r3++;
                    outptr[4] = *r4++;
                    outptr[5] = *r5++;
                    outptr[6] = *r6++;
                    outptr[7] = *r7++;

                    outptr += 8;
                }
            }
        }
        if (pack8to1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const signed char* r0 = bottom_blob.row<const signed char>(i);

                signed char* outptr0 = top_blob.row<signed char>(i * 8);
                signed char* outptr1 = top_blob.row<signed char>(i * 8 + 1);
                signed char* outptr2 = top_blob.row<signed char>(i * 8 + 2);
                signed char* outptr3 = top_blob.row<signed char>(i * 8 + 3);
                signed char* outptr4 = top_blob.row<signed char>(i * 8 + 4);
                signed char* outptr5 = top_blob.row<signed char>(i * 8 + 5);
                signed char* outptr6 = top_blob.row<signed char>(i * 8 + 6);
                signed char* outptr7 = top_blob.row<signed char>(i * 8 + 7);

                int j = 0;
                for (; j < w; j++)
                {
                    *outptr0++ = r0[0];
                    *outptr1++ = r0[1];
                    *outptr2++ = r0[2];
                    *outptr3++ = r0[3];
                    *outptr4++ = r0[4];
                    *outptr5++ = r0[5];
                    *outptr6++ = r0[6];
                    *outptr7++ = r0[7];

                    r0 += 8;
                }
            }
        }

        return 0;
    }

    if (dims == 3 || dims == 4)
    {
        int size = w * h * d;
        int outc = channels * elempack / out_elempack;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        if (dims == 3)
            top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
        else // if (dims == 4)
            top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (pack1to8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < outc; q++)
            {
                const signed char* r0 = bottom_blob.channel(q * 8);
                const signed char* r1 = bottom_blob.channel(q * 8 + 1);
                const signed char* r2 = bottom_blob.channel(q * 8 + 2);
                const signed char* r3 = bottom_blob.channel(q * 8 + 3);
                const signed char* r4 = bottom_blob.channel(q * 8 + 4);
                const signed char* r5 = bottom_blob.channel(q * 8 + 5);
                const signed char* r6 = bottom_blob.channel(q * 8 + 6);
                const signed char* r7 = bottom_blob.channel(q * 8 + 7);

                signed char* outptr = top_blob.channel(q);

                int i = 0;
                for (; i < size; i++)
                {
                    outptr[0] = *r0++;
                    outptr[1] = *r1++;
                    outptr[2] = *r2++;
                    outptr[3] = *r3++;
                    outptr[4] = *r4++;
                    outptr[5] = *r5++;
                    outptr[6] = *r6++;
                    outptr[7] = *r7++;

                    outptr += 8;
                }
            }
        }
        if (pack8to1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const signed char* r0 = bottom_blob.channel(q);

                signed char* outptr0 = top_blob.channel(q * 8);
                signed char* outptr1 = top_blob.channel(q * 8 + 1);
                signed char* outptr2 = top_blob.channel(q * 8 + 2);
                signed char* outptr3 = top_blob.channel(q * 8 + 3);
                signed char* outptr4 = top_blob.channel(q * 8 + 4);
                signed char* outptr5 = top_blob.channel(q * 8 + 5);
                signed char* outptr6 = top_blob.channel(q * 8 + 6);
                signed char* outptr7 = top_blob.channel(q * 8 + 7);

                int i = 0;
                for (; i < size; i++)
                {
                    *outptr0++ = r0[0];
                    *outptr1++ = r0[1];
                    *outptr2++ = r0[2];
                    *outptr3++ = r0[3];
                    *outptr4++ = r0[4];
                    *outptr5++ = r0[5];
                    *outptr6++ = r0[6];
                    *outptr7++ = r0[7];

                    r0 += 8;
                }
            }
        }

        return 0;
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/packing_arm.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_PACKING_ARM_H
#define LAYER_PACKING_ARM_H

#include "packing.h"

namespace ncnn {

// Packing layer variant for ARM targets, derived from the generic Packing layer.
class Packing_arm : public Packing
{
public:
    Packing_arm();

    // Layer entry point: produces top_blob from bottom_blob.
    // NOTE(review): presumably dispatches to the helpers below based on the
    // element width of bottom_blob - confirm against packing_arm.cpp.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
    // Repacking path for 16-bit element storage (bf16 / fp16), operating on
    // the data as unsigned short values.
    int forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    // Repacking path for 8-bit data: handles elempack 1 <-> 8 without padding
    // and forwards every other request to Packing::forward.
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_PACKING_ARM_H


================================================
FILE: src/layer/arm/padding_arm.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "padding_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "cpu.h"

namespace ncnn {

#if __ARM_NEON
#include "padding_pack4.h"
#include "padding_pack4_bf16s_fp16s.h"
#include "padding_pack8_int8.h"
#if NCNN_ARM82
#include "padding_pack8_fp16s.h"
#endif
#endif // __ARM_NEON

// Advertise the storage layouts this ARM implementation can work with.
Padding_arm::Padding_arm()
{
    // packed layouts need NEON
#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON

    // fp16 storage additionally depends on a runtime CPU capability query
#if __ARM_NEON && NCNN_ARM82
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82

    // bf16 storage is enabled whenever it is compiled in
#if NCNN_BF16
    support_bf16_storage = true;
#endif // NCNN_BF16
}

// Prepare reduced-precision copies of the pad constants.
//
// For each 16-bit storage mode that is compiled in and enabled, the scalar
// pad value and the per-channel pad data are converted from float32 once so
// that they are ready when forward runs. Always returns 0.
int Padding_arm::create_pipeline(const Option& opt)
{
#if NCNN_ARM82
    const bool fp16_storage_enabled = support_fp16_storage && opt.use_fp16_storage;
    if (fp16_storage_enabled)
    {
        // scalar constant first, then the per-channel table
        value_fp16 = float32_to_float16(value);

        ncnn::cast_float32_to_float16(per_channel_pad_data, per_channel_pad_data_fp16, opt);
    }
#endif // NCNN_ARM82

#if NCNN_BF16
    const bool bf16_storage_enabled = opt.use_bf16_storage;
    if (bf16_storage_enabled)
    {
        // scalar constant first, then the per-channel table
        value_bf16 = float32_to_bfloat16(value);

        ncnn::cast_float32_to_bfloat16(per_channel_pad_data, per_channel_pad_data_bf16, opt);
    }
#endif // NCNN_BF16

    return 0;
}

// Counterpart of create_pipeline. This implementation performs no work and
// ignores its Option argument; it always returns 0.
int Padding_arm::destroy_pipeline(const Option& /*opt*/)
{
    return 0;
}

// Pad bottom_blob into top_blob.
//
// Dispatch order:
//   - all six pad amounts zero: top_blob is set to bottom_blob;
//   - 8-bit elements: forward_int8;
//   - 16-bit elements with fp16 or bf16 storage enabled: forward_bf16s_fp16s;
//   - otherwise, with NEON and elempack 4, a packed fast path is used when
//     the padded extent of the packed axis and the leading pad on that axis
//     are both multiples of 4 (see the per-dims conditions below);
//   - everything else is unpacked to elempack 1 and handed to Padding::forward.
//
// In the packed 3-D path type 0 / 1 / 2 select the constant / replicate /
// reflect helpers. Returns 0 on success and -100 on allocation failure.
int Padding_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const bool nothing_to_pad = top == 0 && bottom == 0 && left == 0 && right == 0 && front == 0 && behind == 0;
    if (nothing_to_pad)
    {
        top_blob = bottom_blob;
        return 0;
    }

    const int elembits = bottom_blob.elembits();

    if (elembits == 8)
        return forward_int8(bottom_blob, top_blob, opt);

#if NCNN_ARM82
    if (elembits == 16 && support_fp16_storage && opt.use_fp16_storage)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

#if NCNN_BF16
    if (elembits == 16 && opt.use_bf16_storage)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int channels = bottom_blob.c;
    const int dims = bottom_blob.dims;
    const size_t elemsize = bottom_blob.elemsize;
    const int elempack = bottom_blob.elempack;

#if __ARM_NEON
    if (elempack == 4)
    {
        if (dims == 1)
        {
            const int outw = w * elempack + left + right;

            // constant padding only, and the result must still be pack4
            if (outw % 4 == 0 && left % 4 == 0 && type == 0)
            {
                const int out_elempack = 4;
                const size_t out_elemsize = elemsize / elempack * out_elempack;

                top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                const float32x4_t pad_value = vdupq_n_f32(value);
                padding_constant_pack4_neon(bottom_blob, top_blob, 0, 0, left / 4, right / 4, pad_value);

                return 0;
            }
        }
        else if (dims == 2)
        {
            const int outw = w + left + right;
            const int outh = h * elempack + top + bottom;

            // constant padding only, and the result must still be pack4
            if (outh % 4 == 0 && top % 4 == 0 && type == 0)
            {
                const int out_elempack = 4;
                const size_t out_elemsize = elemsize / elempack * out_elempack;

                top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                const float32x4_t pad_value = vdupq_n_f32(value);
                padding_constant_pack4_neon(bottom_blob, top_blob, top / 4, bottom / 4, left, right, pad_value);

                return 0;
            }
        }
        else if (dims == 3)
        {
            const int outw = w + left + right;
            const int outh = h + top + bottom;
            const int outc = channels * elempack + front + behind;

            // non-constant border types are only taken here when the channel
            // count is left unchanged
            const bool channels_unchanged = (outc == channels * elempack);

            if (outc % 4 == 0 && front % 4 == 0 && (channels_unchanged || type == 0))
            {
                const int out_elempack = 4;
                const size_t out_elemsize = elemsize / elempack * out_elempack;
                const int outc_packed = outc / out_elempack;

                top_blob.create(outw, outh, outc_packed, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                // leading channel pad expressed in packed channels
                const int front_packed = front / elempack;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < outc_packed; q++)
                {
                    Mat borderm = top_blob.channel(q);

                    const float32x4_t pad_value = per_channel_pad_data_size ? vld1q_f32((const float*)per_channel_pad_data + q * 4) : vdupq_n_f32(value);

                    const int src_q = q - front_packed;
                    if (src_q < 0 || src_q >= channels)
                    {
                        // this output channel lies entirely in the channel padding
                        borderm.fill(pad_value);
                        continue;
                    }

                    const Mat m = bottom_blob.channel(src_q);
                    switch (type)
                    {
                    case 0:
                        padding_constant_pack4_neon(m, borderm, top, bottom, left, right, pad_value);
                        break;
                    case 1:
                        padding_replicate_pack4_neon(m, borderm, top, bottom, left, right);
                        break;
                    case 2:
                        padding_reflect_pack4_neon(m, borderm, top, bottom, left, right);
                        break;
                    default:
                        break;
                    }
                }

                return 0;
            }
        }
        else if (dims == 4)
        {
            const int outw = w + left + right;
            const int outh = h + top + bottom;
            const int outd = d + front + behind;

            if (type == 0)
            {
                // front / behind pad the depth axis here, so channels and packing stay as they are
                top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float32x4_t pad_value = per_channel_pad_data_size ? vld1q_f32((const float*)per_channel_pad_data + q * 4) : vdupq_n_f32(value);

                    for (int z = 0; z < outd; z++)
                    {
                        Mat borderm = top_blob.channel(q).depth(z);

                        const int src_z = z - front;
                        if (src_z < 0 || src_z >= d)
                        {
                            // this depth slice lies entirely in the depth padding
                            borderm.fill(pad_value);
                            continue;
                        }

                        const Mat m = bottom_blob.channel(q).depth(src_z);
                        padding_constant_pack4_neon(m, borderm, top, bottom, left, right, pad_value);
                    }
                }

                return 0;
            }
        }
    }
#endif // __ARM_NEON

    // generic fallback: unpack to elempack 1 (using the workspace allocator)
    // and let the base layer do the padding
    Mat bottom_blob_unpacked = bottom_blob;
    if (elempack != 1)
    {
        Option opt_pack1 = opt;
        opt_pack1.blob_allocator = opt.workspace_allocator;

        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
        if (bottom_blob_unpacked.empty())
            return -100;
    }

    return Padding::forward(bottom_blob_unpacked, top_blob, opt);
}

int Padding_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

#if __ARM_NEON
#if NCNN_ARM82
    if (elempack == 8)
    {
        if (dims == 1)
        {
            int outw = w * elempack + left + right;

            int out_elempack = outw % 8 == 0 ? 8 : outw % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (left % 8 == 0 && out_elempack == 8 && type == 0)
            {
                top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                uint16x8_t pad_value = vdupq_n_u16(value_fp16);
                padding_constant_pack8_fp16s_neon(bottom_blob, top_blob, 0, 0, left / 8, right / 8, pad_value);

                return 0;
            }
        }

        if (dims == 2)
        {
            int outw = w + left + right;
            int outh = h * elempack + top + bottom;

            int out_elempack = outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (top % 8 == 0 && out_elempack == 8 && type == 0)
            {
                top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                uint16x8_t pad_value = vdupq_n_u16(value_fp16);
                padding_constant_pack8_fp16s_neon(bottom_blob, top_blob, top / 8, bottom / 8, left, right, pad_value);

                return 0;
            }
        }

        if (dims == 3)
        {
            int outw = w + left + right;
            int outh = h + top + bottom;
            int outc = channels * elempack + front + behind;

            int out_elempack = outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (front % 8 == 0 && out_elempack == 8 && !(outc != channels * elempack && type != 0))
            {
                top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                int front_ = front / elempack;
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < outc / out_elempack; q++)
                {
                    Mat borderm = top_blob.channel(q);

                    uint16x8_t pad_value = per_channel_pad_data_size ? vld1q_u16((const unsigned short*)per_channel_pad_data_fp16 + q * 8) : vdupq_n_u16(value_fp16);

                    //Channel padding
                    if ((q - front_) < 0 || (q - front_) >= channels)
                    {
                        borderm.fill(pad_value);
                    }
                    else
                    {
                        const Mat m = bottom_blob.channel(q - front_);
                        if (type == 0)
                            padding_constant_pack8_fp16s_neon(m, borderm, top, bottom, left, right, pad_value);
                        if (type == 1)
                            padding_replicate_pack8_fp16s_neon(m, borderm, top, bottom, left, right);
                        if (type == 2)
                            padding_reflect_pack8_fp16s_neon(m, borderm, top, bottom, left, right);
                    }
                }

                return 0;
            }
        }

        if (dims == 4)
        {
            int outw = w + left + right;
            int outh = h + top + bottom;
            int outd = d + front + behind;

            if (type == 0)
            {
                top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    uint16x8_t pad_value = per_channel_pad_data_size ? vld1q_u16((const unsigned short*)per_channel_pad_data_fp16 + q * 8) : vdupq_n_u16(value_fp16);

                    for (int z = 0; z < outd; z++)
                    {
                        Mat borderm = top_blob.channel(q).depth(z);

                        // depth padding
                        if ((z - front) < 0 || (z - front) >= d)
                        {
                            borderm.fill(pad_value);
                        }
                        else
                        {
                            const Mat m = bottom_blob.channel(q).depth(z - front);
                            padding_constant_pack8_fp16s_neon(m, borderm, top, bottom, left, right, pad_value);
                        }
                    }
                }

                return 0;
            }
        }
    }
#endif

    if (elempack == 4)
    {
        if (dims == 1)
        {
            int outw = w * elempack + left + right;

#if NCNN_ARM82
            int out_elempack = support_fp16_storage && opt.use_fp16_arithmetic && outw % 8 == 0 ? 8 : outw % 4 == 0 ? 4 : 1;
#else
            int out_elempack = outw % 4 == 0 ? 4 : 1;
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (left % 4 == 0 && out_elempack == 4 && type == 0)
            {
                top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                // clang-format off
                // *INDENT-OFF*
                uint16x4_t pad_value;
#if NCNN_ARM82
                if (support_fp16_storage && opt.use_fp16_storage)
                {
                    pad_value = vdup_n_u16(value_fp16);
                }
                else
#endif
#if NCNN_BF16
                if (opt.use_bf16_storage)
                {
                    pad_value = vdup_n_u16(value_bf16);
                }
                else
#endif
                {
                    // shall never reach here
                    pad_value = vdup_n_u16(0);
                }
                // *INDENT-ON*
                // clang-format on
                padding_constant_pack4_bf16_fp16s_neon(bottom_blob, top_blob, 0, 0, left / 4, right / 4, vcombine_u16(pad_value, pad_value));

                return 0;
            }
        }

        if (dims == 2)
        {
            int outw = w + left + right;
            int outh = h * elempack + top + bottom;

#if NCNN_ARM82
            int out_elempack = support_fp16_storage && opt.use_fp16_arithmetic && outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1;
#else
            int out_elempack = outh % 4 == 0 ? 4 : 1;
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (top % 4 == 0 && out_elempack == 4 && type == 0)
            {
                top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                // clang-format off
                // *INDENT-OFF*
                uint16x4_t pad_value;
#if NCNN_ARM82
                if (support_fp16_storage && opt.use_fp16_storage)
                {
                    pad_value = vdup_n_u16(value_fp16);
                }
                else
#endif
#if NCNN_BF16
                if (opt.use_bf16_storage)
                {
                    pad_value = vdup_n_u16(value_bf16);
                }
                else
#endif
                {
                    // shall never reach here
                    pad_value = vdup_n_u16(0);
                }
                // *INDENT-ON*
                // clang-format on
                padding_constant_pack4_bf16_fp16s_neon(bottom_blob, top_blob, top / 4, bottom / 4, left, right, vcombine_u16(pad_value, pad_value));

                return 0;
            }
        }

        if (dims == 3)
        {
            int outw = w + left + right;
            int outh = h + top + bottom;
            int outc = channels * elempack + front + behind;

#if NCNN_ARM82
            int out_elempack = support_fp16_storage && opt.use_fp16_arithmetic && outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 : 1;
#else
            int out_elempack = outc % 4 == 0 ? 4 : 1;
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (front % 4 == 0 && out_elempack == 4 && !(outc != channels * elempack && type != 0))
            {
                top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                int front_ = front / elempack;
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < outc / out_elempack; q++)
                {
                    Mat borderm = top_blob.channel(q);

                    // clang-format off
                    // *INDENT-OFF*
                    uint16x4_t pad_value;
#if NCNN_ARM82
                    if (support_fp16_storage && opt.use_fp16_storage)
                    {
                        pad_value = per_channel_pad_data_size ? vld1_u16((const unsigned short*)per_channel_pad_data_fp16 + q * 4) : vdup_n_u16(value_fp16);
                    }
                    else
#endif
#if NCNN_BF16
                    if (opt.use_bf16_storage)
                    {
                        pad_value = per_channel_pad_data_size ? vld1_u16((const unsigned short*)per_channel_pad_data_bf16 + q * 4) : vdup_n_u16(value_bf16);
                    }
                    else
#endif
                    {
                        // shall never reach here
                        pad_value = vdup_n_u16(0);
                    }
                    // *INDENT-ON*
                    // clang-format on

                    //Channel padding
                    if ((q - front_) < 0 || (q - front_) >= channels)
                    {
                        borderm.fill(pad_value);
                    }
                    else
                    {
                        const Mat m = bottom_blob.channel(q - front_);
                        if (type == 0)
                            padding_constant_pack4_bf16_fp16s_neon(m, borderm, top, bottom, left, right, vcombine_u16(pad_value, pad_value));
                        if (type == 1)
                            padding_replicate_pack4_bf16_fp16s_neon(m, borderm, top, bottom, left, right);
                        if (type == 2)
                            padding_reflect_pack4_bf16_fp16s_neon(m, borderm, top, bottom, left, right);
                    }
                }

                return 0;
            }
        }

        if (dims == 4)
        {
            int outw = w + left + right;
            int outh = h + top + bottom;
            int outd = d + front + behind;

            if (type == 0)
            {
                top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    // clang-format off
                    // *INDENT-OFF*
                    uint16x4_t pad_value;
#if NCNN_ARM82
                    if (support_fp16_storage && opt.use_fp16_storage)
                    {
                        pad_value = per_channel_pad_data_size ? vld1_u16((const unsigned short*)per_channel_pad_data_fp16 + q * 4) : vdup_n_u16(value_fp16);
                    }
                    else
#endif
#if NCNN_BF16
                    if (opt.use_bf16_storage)
                    {
                        pad_value = per_channel_pad_data_size ? vld1_u16((const unsigned short*)per_channel_pad_data_bf16 + q * 4) : vdup_n_u16(value_bf16);
                    }
                    else
#endif
                    {
                        // shall never reach here
                        pad_value = vdup_n_u16(0);
                    }
                    // *INDENT-ON*
                    // clang-format on

                    for (int z = 0; z < outd; z++)
                    {
                        Mat borderm = top_blob.channel(q).depth(z);

                        // depth padding
                        if ((z - front) < 0 || (z - front) >= d)
                        {
                            borderm.fill(pad_value);
                        }
                        else
                        {
                            const Mat m = bottom_blob.channel(q).depth(z - front);
                            padding_constant_pack4_bf16_fp16s_neon(m, borderm, top, bottom, left, right, vcombine_u16(pad_value, pad_value));
                        }
                    }
                }

                return 0;
            }
        }
    }
#endif // __ARM_NEON

    Mat bottom_blob_unpacked = bottom_blob;
    if (elempack != 1)
    {
        Option opt_pack1 = opt;
        opt_pack1.blob_allocator = opt.workspace_allocator;

        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
        if (bottom_blob_unpacked.empty())
            return -100;
    }

    return Padding::forward(bottom_blob_unpacked, top_blob, opt);
}

// Pads a blob of int8 elements.
//
// With NEON and elempack == 8, constant padding (and, for 3-D blobs whose
// channel count is unchanged, replicate/reflect padding) is done directly on
// the packed layout as long as the padding on the packed axis is a multiple
// of 8. All other cases unpack the input to elempack 1 (using the workspace
// allocator) and defer to Padding::forward.
//
// top_blob is only allocated once a fast path is known to apply, so the
// fallback never pays for (or fails on) an allocation it would discard.
//
// Returns 0 after a fast path, -100 when an allocation leaves a blob empty,
// and otherwise whatever Padding::forward returns.
int Padding_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

#if __ARM_NEON
    if (elempack == 8)
    {
        if (dims == 1)
        {
            int outw = w * elempack + left + right;

            int out_elempack = outw % 8 == 0 ? 8 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (left % 8 == 0 && out_elempack == 8 && type == 0)
            {
                top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                int8x8_t pad_value = vdup_n_s8((signed char)value);
                padding_constant_pack8_int8_neon(bottom_blob, top_blob, 0, 0, left / 8, right / 8, pad_value);

                return 0;
            }
        }

        if (dims == 2)
        {
            int outw = w + left + right;
            int outh = h * elempack + top + bottom;

            int out_elempack = outh % 8 == 0 ? 8 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (top % 8 == 0 && out_elempack == 8 && type == 0)
            {
                top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                // the packed axis is h, so top/bottom are counted in groups of 8
                int8x8_t pad_value = vdup_n_s8((signed char)value);
                padding_constant_pack8_int8_neon(bottom_blob, top_blob, top / 8, bottom / 8, left, right, pad_value);

                return 0;
            }
        }

        if (dims == 3)
        {
            int outw = w + left + right;
            int outh = h + top + bottom;
            int outc = channels * elempack + front + behind;

            int out_elempack = outc % 8 == 0 ? 8 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            // replicate/reflect are only handled here when no channel is added
            if (front % 8 == 0 && out_elempack == 8 && !(outc != channels * elempack && type != 0))
            {
                // allocate only once the packed path is known to apply
                top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                int front_ = front / elempack;
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < outc / out_elempack; q++)
                {
                    Mat borderm = top_blob.channel(q);

                    // TODO perchannel
                    //                     int8x8_t pad_value = per_channel_pad_data_size ? vld1_s8(per_channel_pad_data + q * 8) : vdup_n_s8((signed char)value);
                    int8x8_t pad_value = vdup_n_s8((signed char)value);

                    //Channel padding
                    if ((q - front_) < 0 || (q - front_) >= channels)
                    {
                        borderm.fill<int8x8_t>(pad_value);
                    }
                    else
                    {
                        const Mat m = bottom_blob.channel(q - front_);
                        if (type == 0)
                            padding_constant_pack8_int8_neon(m, borderm, top, bottom, left, right, pad_value);
                        if (type == 1)
                            padding_replicate_pack8_int8_neon(m, borderm, top, bottom, left, right);
                        if (type == 2)
                            padding_reflect_pack8_int8_neon(m, borderm, top, bottom, left, right);
                    }
                }

                return 0;
            }
        }

        if (dims == 4)
        {
            int outw = w + left + right;
            int outh = h + top + bottom;
            int outd = d + front + behind;

            // only constant padding has a packed 4-D implementation
            if (type == 0)
            {
                // allocate only once the packed path is known to apply
                top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    // TODO perchannel
                    //                     int8x8_t pad_value = per_channel_pad_data_size ? vld1_s8(per_channel_pad_data + q * 8) : vdup_n_s8((signed char)value);
                    int8x8_t pad_value = vdup_n_s8((signed char)value);

                    for (int z = 0; z < outd; z++)
                    {
                        Mat borderm = top_blob.channel(q).depth(z);

                        // depth padding
                        if ((z - front) < 0 || (z - front) >= d)
                        {
                            borderm.fill<int8x8_t>(pad_value);
                        }
                        else
                        {
                            const Mat m = bottom_blob.channel(q).depth(z - front);
                            padding_constant_pack8_int8_neon(m, borderm, top, bottom, left, right, pad_value);
                        }
                    }
                }

                return 0;
            }
        }
    }
#endif // __ARM_NEON

    // No packed fast path applied: unpack into workspace memory and let the
    // generic implementation allocate top_blob and do the padding.
    Mat bottom_blob_unpacked = bottom_blob;
    if (elempack != 1)
    {
        Option opt_pack1 = opt;
        opt_pack1.blob_allocator = opt.workspace_allocator;

        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
        if (bottom_blob_unpacked.empty())
            return -100;
    }

    return Padding::forward(bottom_blob_unpacked, top_blob, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/arm/padding_arm.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_PADDING_ARM_H
#define LAYER_PADDING_ARM_H

#include "padding.h"

namespace ncnn {

// ARM-specific Padding layer. Overrides the generic Padding entry points and
// adds helpers for 16-bit (fp16/bf16) and int8 element storage.
class Padding_arm : public Padding
{
public:
    Padding_arm();

    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // Pads bottom_blob into top_blob.
    // NOTE(review): presumably dispatches to the helpers below based on the
    // element type of bottom_blob - confirm against the implementation.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
    // Padding for blobs stored as 16-bit elements (fp16 or bf16).
    int forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    // Padding for blobs stored as int8 elements.
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
#if NCNN_BF16
    // bf16
    // Constant pad value and per-channel pad values as raw 16-bit bf16 patterns.
    // NOTE(review): presumably filled in by create_pipeline - confirm.
    unsigned short value_bf16;
    Mat per_channel_pad_data_bf16;
#endif

    // fp16
    // Constant pad value and per-channel pad values as raw 16-bit fp16 patterns.
    // NOTE(review): presumably filled in by create_pipeline - confirm.
    unsigned short value_fp16;
    Mat per_channel_pad_data_fp16;
};

} // namespace ncnn

#endif // LAYER_PADDING_ARM_H


================================================
FILE: src/layer/arm/padding_pack4.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Constant padding of one 2-D plane whose elements are groups of 4 floats.
//
// Writes, in order and without gaps, to dst: top * dst.w pad elements, then for
// each of the src.h source rows `left` pad elements + src.w copied elements +
// `right` pad elements, then bottom * dst.w pad elements. Every pad element is
// the 4-float vector v.
//
// src and dst are walked as flat float arrays, so both are treated as
// contiguous. Assumes dst.w == left + src.w + right and
// dst.h == top + src.h + bottom - TODO confirm at the call sites.
//
// Three equivalent implementations are selected at compile time: aarch64 inline
// asm, armv7 inline asm, and a NEON-intrinsics fallback.
static void padding_constant_pack4_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right, float32x4_t v)
{
    const float* ptr = src;
    float* outptr = dst;

    int w = src.w;
    int h = src.h;

    // the top and bottom borders are full output rows, so they can be filled
    // as one flat run of top_size / bottom_size elements
    int top_size = top * dst.w;
    int bottom_size = bottom * dst.w;

    // NOTE(review): in both asm variants operand %5 (h) is declared as an
    // input-only "r" operand but is decremented by the row loop (subs on %5).
    // h is not read again after the asm, but GCC documents that input-only
    // operands must not be modified - consider making it a read-write operand.
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
    asm volatile(
        "mov    v0.16b, %10.16b         \n"
        "mov    v1.16b, %10.16b         \n"
        "mov    v2.16b, %10.16b         \n"
        "mov    v3.16b, %10.16b         \n"

        // fill top
        "lsr    w4, %w8, #3             \n" // w4 = nn = top_size >> 3
        "cmp    w4, #0                  \n"
        "beq    1f                      \n"

        "0:                             \n"
        "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
        "subs   w4, w4, #1              \n"
        "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
        "bne    0b                      \n"

        "1:                             \n"

        // fill top remain
        "and    w4, %w8, #7             \n" // w4 = remain = top_size & 7

        "cmp    w4, #4                  \n" // w4 >= 4
        "blt    2f                      \n"
        "sub    w4, w4, #4              \n"
        "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
        "2:                             \n"

        "cmp    w4, #2                  \n" // w4 >= 2
        "blt    3f                      \n"
        "sub    w4, w4, #2              \n"
        "st1    {v0.4s, v1.4s}, [%0], #32 \n"
        "3:                             \n"

        "cmp    w4, #0                  \n" // w4 > 0
        "beq    4f                      \n"
        "st1    {v0.4s}, [%0], #16      \n"
        "4:                             \n"

        // fill center h loop
        "cmp    %w5, #0                 \n"
        "beq    15f                     \n"
        "5:                             \n"

        // fill left
        "mov    w4, %w6                 \n" // w4 = left
        "cmp    w4, #0                  \n"
        "beq    7f                      \n"

        "6:                             \n"
        "st1    {v0.4s}, [%0], #16      \n"
        "subs   w4, w4, #1              \n"
        "bne    6b                      \n"

        "7:                             \n"

        // fill middle
        "lsr    w4, %w4, #3             \n" // w4 = nn = w >> 3
        "cmp    w4, #0                  \n"
        "beq    9f                      \n"

        "8:                             \n"
        "prfm   pldl1keep, [%1, #512]   \n"
        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
        "prfm   pldl1keep, [%1, #512]   \n"
        "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"
        "subs   w4, w4, #1              \n"
        "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
        "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
        "bne    8b                      \n"

        "9:                             \n"

        "and    w4, %w4, #7             \n" // w4 = remain = w & 7

        "cmp    w4, #4                  \n" // w4 >= 4
        "blt    10f                     \n"
        "prfm   pldl1keep, [%1, #512]   \n"
        "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
        "sub    w4, w4, #4              \n"
        "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
        "10:                            \n"

        "cmp    w4, #2                  \n" // w4 >= 2
        "blt    11f                     \n"
        "prfm   pldl1keep, [%1, #256]   \n"
        "ld1    {v16.4s, v17.4s}, [%1], #32 \n"
        "sub    w4, w4, #2              \n"
        "st1    {v16.4s, v17.4s}, [%0], #32 \n"
        "11:                            \n"

        "cmp    w4, #0                  \n" // w4 > 0
        "beq    12f                     \n"
        "prfm   pldl1keep, [%1, #128]   \n"
        "ld1    {v16.4s}, [%1], #16     \n"
        "st1    {v16.4s}, [%0], #16     \n"
        "12:                            \n"

        // fill right
        "mov    w4, %w7                 \n" // w4 = right
        "cmp    w4, #0                  \n"
        "beq    14f                     \n"

        "13:                            \n"
        "subs   w4, w4, #1              \n"
        "st1    {v0.4s}, [%0], #16      \n"
        "bne    13b                     \n"
        "14:                            \n"

        "subs   %w5, %w5, #1            \n"
        "bne    5b                      \n"

        "15:                            \n"

        // fill bottom
        "lsr    w4, %w9, #3             \n" // w4 = nn = bottom_size >> 3
        "cmp    w4, #0                  \n"
        "beq    17f                     \n"

        "16:                            \n"
        "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
        "subs   w4, w4, #1              \n"
        "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
        "bne    16b                     \n"
        "17:                            \n"

        // fill bottom remain
        "and    w4, %w9, #7             \n" // w4 = remain = bottom_size & 7

        "cmp    w4, #4                  \n" // w4 >= 4
        "blt    18f                     \n"
        "sub    w4, w4, #4              \n"
        "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
        "18:                            \n"

        "cmp    w4, #2                  \n" // w4 >= 2
        "blt    19f                     \n"
        "sub    w4, w4, #2              \n"
        "st1    {v0.4s, v1.4s}, [%0], #32 \n"
        "19:                            \n"

        "cmp    w4, #0                  \n" // w4 > 0
        "beq    20f                     \n"
        "st1    {v0.4s}, [%0], #16      \n"
        "20:                            \n"

        : "=r"(outptr), // %0
        "=r"(ptr)     // %1
        : "0"(outptr),
        "1"(ptr),
        "r"(w),           // %4
        "r"(h),           // %5
        "r"(left),        // %6
        "r"(right),       // %7
        "r"(top_size),    // %8
        "r"(bottom_size), // %9
        "w"(v)            // %10
        : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else  // __aarch64__
    asm volatile(
        "vmov       q0, %q10            \n"
        "vmov       q1, %q10            \n"
        "vmov       q2, %q10            \n"
        "vmov       q3, %q10            \n"

        // fill top
        "lsr        r4, %8, #3          \n" // r4 = nn = top_size >> 3
        "cmp        r4, #0              \n"
        "beq        1f                  \n"

        "0:                             \n"
        "vstm       %0!, {d0-d7}        \n"
        "subs       r4, r4, #1          \n"
        "vstm       %0!, {d0-d7}        \n"
        "bne        0b                  \n"

        "1:                             \n"

        // fill top remain
        "and        r4, %8, #7          \n" // r4 = remain = top_size & 7

        "cmp        r4, #4              \n" // r4 >= 4
        "blt        2f                  \n"
        "sub        r4, r4, #4          \n"
        "vstm       %0!, {d0-d7}        \n"
        "2:                             \n"

        "cmp        r4, #2              \n" // r4 >= 2
        "blt        3f                  \n"
        "sub        r4, r4, #2          \n"
        "vst1.f32   {d0-d3}, [%0 :128]! \n"
        "3:                             \n"

        "cmp        r4, #0              \n" // r4 > 0
        "beq        4f                  \n"
        "vst1.f32   {d0-d1}, [%0 :128]! \n"
        "4:                             \n"

        // fill center h loop
        "cmp        %5, #0              \n"
        "beq        15f                 \n"
        "5:                             \n"

        // fill left
        "mov        r4, %6              \n" // r4 = left
        "cmp        r4, #0              \n"
        "beq        7f                  \n"

        "6:                             \n"
        "vst1.f32   {d0-d1}, [%0 :128]! \n"
        "subs       r4, r4, #1          \n"
        "bne        6b                  \n"

        "7:                             \n"

        // fill middle
        "lsr        r4, %4, #3          \n" // r4 = nn = w >> 3
        "cmp        r4, #0              \n"
        "beq        9f                  \n"

        "8:                             \n"
        "pld        [%1, #512]          \n"
        "vldm       %1!, {d16-d23}      \n"
        "pld        [%1, #512]          \n"
        "vldm       %1!, {d24-d31}      \n"
        "subs       r4, r4, #1          \n"
        "vstm       %0!, {d16-d23}      \n"
        "vstm       %0!, {d24-d31}      \n"
        "bne        8b                  \n"

        "9:                             \n"

        "and        r4, %4, #7          \n" // r4 = remain = w & 7

        "cmp        r4, #4              \n" // r4 >= 4
        "blt        10f                 \n"
        "pld        [%1, #512]          \n"
        "vldm       %1!, {d16-d23}      \n"
        "sub        r4, r4, #4          \n"
        "vstm       %0!, {d16-d23}      \n"
        "10:                            \n"

        "cmp        r4, #2              \n" // r4 >= 2
        "blt        11f                 \n"
        "pld        [%1, #256]          \n"
        "vld1.f32   {d16-d19}, [%1 :128]! \n"
        "sub        r4, r4, #2          \n"
        "vst1.f32   {d16-d19}, [%0 :128]! \n"
        "11:                            \n"

        "cmp        r4, #0              \n" // r4 > 0
        "beq        12f                 \n"
        "pld        [%1, #128]          \n"
        "vld1.f32   {d16-d17}, [%1 :128]! \n"
        "vst1.f32   {d16-d17}, [%0 :128]! \n"
        "12:                            \n"

        // fill right
        "mov        r4, %7              \n" // r4 = right
        "cmp        r4, #0              \n"
        "beq        14f                 \n"

        "13:                            \n"
        "subs       r4, r4, #1          \n"
        "vst1.f32   {d0-d1}, [%0 :128]! \n"
        "bne        13b                 \n"
        "14:                            \n"

        "subs       %5, %5, #1          \n"
        "bne        5b                  \n"

        "15:                            \n"

        // fill bottom
        "lsr        r4, %9, #3          \n" // r4 = nn = bottom_size >> 3
        "cmp        r4, #0              \n"
        "beq        17f                 \n"

        "16:                            \n"
        "vstm       %0!, {d0-d7}        \n"
        "subs       r4, r4, #1          \n"
        "vstm       %0!, {d0-d7}        \n"
        "bne        16b                 \n"
        "17:                            \n"

        // fill bottom remain
        "and        r4, %9, #7          \n" // r4 = remain = bottom_size & 7

        "cmp        r4, #4              \n" // r4 >= 4
        "blt        18f                 \n"
        "sub        r4, r4, #4          \n"
        "vstm       %0!, {d0-d7}        \n"
        "18:                            \n"

        "cmp        r4, #2              \n" // r4 >= 2
        "blt        19f                 \n"
        "sub        r4, r4, #2          \n"
        "vst1.f32   {d0-d3}, [%0 :128]! \n"
        "19:                            \n"

        "cmp        r4, #0              \n" // r4 > 0
        "beq        20f                 \n"
        "vst1.f32   {d0-d1}, [%0 :128]! \n"
        "20:                            \n"

        : "=r"(outptr), // %0
        "=r"(ptr)     // %1
        : "0"(outptr),
        "1"(ptr),
        "r"(w),           // %4
        "r"(h),           // %5
        "r"(left),        // %6
        "r"(right),       // %7
        "r"(top_size),    // %8
        "r"(bottom_size), // %9
        "w"(v)            // %10
        : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM

    // fill top
    {
        int x = 0;
        // four pad elements (16 floats) per iteration, then the leftovers
        for (; x + 3 < top_size; x += 4)
        {
            vst1q_f32(outptr, v);
            vst1q_f32(outptr + 4, v);
            vst1q_f32(outptr + 8, v);
            vst1q_f32(outptr + 12, v);
            outptr += 16;
        }
        for (; x < top_size; x++)
        {
            vst1q_f32(outptr, v);
            outptr += 4;
        }
    }
    // fill center
    for (int y = 0; y < src.h; y++)
    {
        // left border of this row
        for (int x = 0; x < left; x++)
        {
            vst1q_f32(outptr, v);
            outptr += 4;
        }
        // copy the source row; ptr keeps advancing across rows
        for (int x = 0; x < src.w; x++)
        {
            float32x4_t _p = vld1q_f32(ptr);
            vst1q_f32(outptr, _p);
            ptr += 4;
            outptr += 4;
        }
        // right border of this row
        for (int x = 0; x < right; x++)
        {
            vst1q_f32(outptr, v);
            outptr += 4;
        }
    }
    // fill bottom
    {
        int x = 0;
        // four pad elements (16 floats) per iteration, then the leftovers
        for (; x + 3 < bottom_size; x += 4)
        {
            vst1q_f32(outptr, v);
            vst1q_f32(outptr + 4, v);
            vst1q_f32(outptr + 8, v);
            vst1q_f32(outptr + 12, v);
            outptr += 16;
        }
        for (; x < bottom_size; x++)
        {
            vst1q_f32(outptr, v);
            outptr += 4;
        }
    }
#endif // NCNN_GNU_INLINE_ASM
}

static void padding_replicate_pack4_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
{
    // Replicate (edge) padding for pack4 fp32 data, one element being 4 floats.
    // Output rows are written back to back: `top` copies of the first source
    // row, the src.h source rows, then `bottom` copies of the last source row.
    // Every row is extended by `left` copies of its first element and `right`
    // copies of its last element.
    const int w = src.w;
    const int h = src.h;

    const float* srcp = src;
    float* dstp = dst;

    // top border: srcp stays on the first source row
    for (int i = 0; i < top; i++)
    {
        const float* row = srcp;
        float32x4_t _edge = vld1q_f32(row);

        for (int j = 0; j < left; j++, dstp += 4)
        {
            vst1q_f32(dstp, _edge);
        }
        for (int j = 0; j < w; j++, row += 4, dstp += 4)
        {
            _edge = vld1q_f32(row);
            vst1q_f32(dstp, _edge);
        }
        // _edge is now the last element copied above
        for (int j = 0; j < right; j++, dstp += 4)
        {
            vst1q_f32(dstp, _edge);
        }
    }

    // center: srcp advances by one source row per iteration
    for (int i = 0; i < h; i++)
    {
        float32x4_t _edge = vld1q_f32(srcp);

        for (int j = 0; j < left; j++, dstp += 4)
        {
            vst1q_f32(dstp, _edge);
        }
        for (int j = 0; j < w; j++, srcp += 4, dstp += 4)
        {
            _edge = vld1q_f32(srcp);
            vst1q_f32(dstp, _edge);
        }
        for (int j = 0; j < right; j++, dstp += 4)
        {
            vst1q_f32(dstp, _edge);
        }
    }

    // bottom border: step back onto the last source row and repeat it
    srcp -= w * 4;
    for (int i = 0; i < bottom; i++)
    {
        const float* row = srcp;
        float32x4_t _edge = vld1q_f32(row);

        for (int j = 0; j < left; j++, dstp += 4)
        {
            vst1q_f32(dstp, _edge);
        }
        for (int j = 0; j < w; j++, row += 4, dstp += 4)
        {
            _edge = vld1q_f32(row);
            vst1q_f32(dstp, _edge);
        }
        for (int j = 0; j < right; j++, dstp += 4)
        {
            vst1q_f32(dstp, _edge);
        }
    }
}

static void padding_reflect_pack4_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
{
    // Reflect padding for pack4 fp32 data, one element being 4 floats.
    // Border rows and columns mirror the source around its edge row/column
    // without repeating the edge itself.
    const int w = src.w;
    const int h = src.h;

    const float* srcp = src;
    float* dstp = dst;

    // top border: begin at source row `top` and move up one row per output row
    srcp += top * w * 4;
    for (int i = 0; i < top; i++)
    {
        const float* row = srcp;

        // left border reads elements left, left-1, ..., 1 of the row
        for (int j = left; j > 0; j--, dstp += 4)
        {
            vst1q_f32(dstp, vld1q_f32(row + j * 4));
        }
        for (int j = 0; j < w; j++, row += 4, dstp += 4)
        {
            vst1q_f32(dstp, vld1q_f32(row));
        }
        // row is now one past the end; right border reads elements w-2, w-3, ...
        for (int j = 0; j < right; j++, dstp += 4)
        {
            vst1q_f32(dstp, vld1q_f32(row - (j + 2) * 4));
        }

        srcp -= w * 4;
    }

    // center: srcp walks forward through every source row
    for (int i = 0; i < h; i++)
    {
        for (int j = left; j > 0; j--, dstp += 4)
        {
            vst1q_f32(dstp, vld1q_f32(srcp + j * 4));
        }
        for (int j = 0; j < w; j++, srcp += 4, dstp += 4)
        {
            vst1q_f32(dstp, vld1q_f32(srcp));
        }
        for (int j = 0; j < right; j++, dstp += 4)
        {
            vst1q_f32(dstp, vld1q_f32(srcp - (j + 2) * 4));
        }
    }

    // bottom border: srcp is one past the last row, so step back two rows
    // and keep moving up one row per output row
    srcp -= 2 * w * 4;
    for (int i = 0; i < bottom; i++)
    {
        const float* row = srcp;

        for (int j = left; j > 0; j--, dstp += 4)
        {
            vst1q_f32(dstp, vld1q_f32(row + j * 4));
        }
        for (int j = 0; j < w; j++, row += 4, dstp += 4)
        {
            vst1q_f32(dstp, vld1q_f32(row));
        }
        for (int j = 0; j < right; j++, dstp += 4)
        {
            vst1q_f32(dstp, vld1q_f32(row - (j + 2) * 4));
        }

        srcp -= w * 4;
    }
}


================================================
FILE: src/layer/arm/padding_pack4_bf16s_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Copy src into dst surrounded by a constant border, for pack4 16-bit data.
//
// One element is 4 packed 16-bit values (8 bytes). dst receives, in order:
//   - top * dst.w padding elements,
//   - for each of the src.h source rows: `left` padding elements, the src.w
//     source elements, then `right` padding elements,
//   - bottom * dst.w padding elements.
//
// v is the padding value. Single elements are stored from its low 4 lanes;
// runs of elements are stored from the whole vector, so the high 4 lanes must
// hold a valid padding element as well.
// NOTE(review): assumes dst.w == left + src.w + right and that dst is already
// allocated - confirm against the caller.
//
// The inline assembly decrements the row counter in place. h is therefore
// passed as an output operand (%2) with the %5 input tied to it, because
// input-only operands must not be modified by an asm statement.
static void padding_constant_pack4_bf16_fp16s_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right, uint16x8_t v)
{
    const unsigned short* ptr = src;
    unsigned short* outptr = dst;

    int w = src.w;
    int h = src.h;

    // border sizes in elements, rows are contiguous in dst
    int top_size = top * dst.w;
    int bottom_size = bottom * dst.w;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
    asm volatile(
        // v0-v3 all hold the padding value so up to 8 elements go out per store
        "mov    v0.16b, %10.16b         \n"
        "mov    v1.16b, %10.16b         \n"
        "mov    v2.16b, %10.16b         \n"
        "mov    v3.16b, %10.16b         \n"

        // fill top
        "lsr    w4, %w8, #3             \n" // w4 = nn = top_size >> 3
        "cmp    w4, #0                  \n"
        "beq    1f                      \n"

        "0:                             \n"
        "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
        "subs   w4, w4, #1              \n"
        "bne    0b                      \n"

        "1:                             \n"

        // fill top remain
        "and    w4, %w8, #7             \n" // w4 = remain = top_size & 7

        "cmp    w4, #4                  \n" // w4 >= 4
        "blt    2f                      \n"
        "sub    w4, w4, #4              \n"
        "st1    {v0.8h, v1.8h}, [%0], #32 \n"
        "2:                             \n"

        "cmp    w4, #2                  \n" // w4 >= 2
        "blt    3f                      \n"
        "sub    w4, w4, #2              \n"
        "st1    {v0.8h}, [%0], #16      \n"
        "3:                             \n"

        "cmp    w4, #0                  \n" // w4 > 0
        "beq    4f                      \n"
        "st1    {v0.4h}, [%0], #8       \n"
        "4:                             \n"

        // fill center h loop
        "cmp    %w5, #0                 \n"
        "beq    15f                     \n"
        "5:                             \n"

        // fill left
        "mov    w4, %w6                 \n" // w4 = left
        "cmp    w4, #0                  \n"
        "beq    7f                      \n"

        "6:                             \n"
        "st1    {v0.4h}, [%0], #8       \n"
        "subs   w4, w4, #1              \n"
        "bne    6b                      \n"

        "7:                             \n"

        // fill middle
        "lsr    w4, %w4, #3             \n" // w4 = nn = w >> 3
        "cmp    w4, #0                  \n"
        "beq    9f                      \n"

        "8:                             \n"
        "prfm   pldl1keep, [%1, #256]   \n"
        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%1], #64 \n"
        "subs   w4, w4, #1              \n"
        "st1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%0], #64 \n"
        "bne    8b                      \n"

        "9:                             \n"

        "and    w4, %w4, #7             \n" // w4 = remain = w & 7

        "cmp    w4, #4                  \n" // w4 >= 4
        "blt    10f                     \n"
        "prfm   pldl1keep, [%1, #256]   \n"
        "ld1    {v16.8h, v17.8h}, [%1], #32 \n"
        "sub    w4, w4, #4              \n"
        "st1    {v16.8h, v17.8h}, [%0], #32 \n"
        "10:                            \n"

        "cmp    w4, #2                  \n" // w4 >= 2
        "blt    11f                     \n"
        "prfm   pldl1keep, [%1, #128]   \n"
        "ld1    {v16.8h}, [%1], #16     \n"
        "sub    w4, w4, #2              \n"
        "st1    {v16.8h}, [%0], #16     \n"
        "11:                            \n"

        "cmp    w4, #0                  \n" // w4 > 0
        "beq    12f                     \n"
        "prfm   pldl1keep, [%1, #64]    \n"
        "ld1    {v16.4h}, [%1], #8      \n"
        "st1    {v16.4h}, [%0], #8      \n"
        "12:                            \n"

        // fill right
        "mov    w4, %w7                 \n" // w4 = right
        "cmp    w4, #0                  \n"
        "beq    14f                     \n"

        "13:                            \n"
        "subs   w4, w4, #1              \n"
        "st1    {v0.4h}, [%0], #8       \n"
        "bne    13b                     \n"
        "14:                            \n"

        "subs   %w5, %w5, #1            \n" // one source row done
        "bne    5b                      \n"

        "15:                            \n"

        // fill bottom
        "lsr    w4, %w9, #3             \n" // w4 = nn = bottom_size >> 3
        "cmp    w4, #0                  \n"
        "beq    17f                     \n"

        "16:                            \n"
        "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
        "subs   w4, w4, #1              \n"
        "bne    16b                     \n"
        "17:                            \n"

        // fill bottom remain
        "and    w4, %w9, #7             \n" // w4 = remain = bottom_size & 7

        "cmp    w4, #4                  \n" // w4 >= 4
        "blt    18f                     \n"
        "sub    w4, w4, #4              \n"
        "st1    {v0.8h, v1.8h}, [%0], #32 \n"
        "18:                            \n"

        "cmp    w4, #2                  \n" // w4 >= 2
        "blt    19f                     \n"
        "sub    w4, w4, #2              \n"
        "st1    {v0.8h}, [%0], #16      \n"
        "19:                            \n"

        "cmp    w4, #0                  \n" // w4 > 0
        "beq    20f                     \n"
        "st1    {v0.4h}, [%0], #8       \n"
        "20:                            \n"

        : "=r"(outptr), // %0
        "=r"(ptr),    // %1
        "=r"(h)       // %2 row counter, written by the asm
        : "0"(outptr),    // %3
        "r"(w),           // %4
        "2"(h),           // %5 tied to %2
        "r"(left),        // %6
        "r"(right),       // %7
        "r"(top_size),    // %8
        "r"(bottom_size), // %9
        "w"(v),           // %10
        "1"(ptr)          // %11
        : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
#else  // __aarch64__
    asm volatile(
        // q0-q3 all hold the padding value so up to 8 elements go out per store
        "vmov       q0, %q10            \n"
        "vmov       q1, %q10            \n"
        "vmov       q2, %q10            \n"
        "vmov       q3, %q10            \n"

        // fill top
        "lsr        r4, %8, #3          \n" // r4 = nn = top_size >> 3
        "cmp        r4, #0              \n"
        "beq        1f                  \n"

        "0:                             \n"
        "vstm       %0!, {d0-d7}        \n"
        "subs       r4, r4, #1          \n"
        "bne        0b                  \n"

        "1:                             \n"

        // fill top remain
        "and        r4, %8, #7          \n" // r4 = remain = top_size & 7

        "cmp        r4, #4              \n" // r4 >= 4
        "blt        2f                  \n"
        "sub        r4, r4, #4          \n"
        "vst1.u16   {d0-d3}, [%0 :64]!  \n"
        "2:                             \n"

        "cmp        r4, #2              \n" // r4 >= 2
        "blt        3f                  \n"
        "sub        r4, r4, #2          \n"
        "vst1.u16   {d0-d1}, [%0 :64]!  \n"
        "3:                             \n"

        "cmp        r4, #0              \n" // r4 > 0
        "beq        4f                  \n"
        "vst1.u16   {d0}, [%0 :64]!     \n"
        "4:                             \n"

        // fill center h loop
        "cmp        %5, #0              \n"
        "beq        15f                 \n"
        "5:                             \n"

        // fill left
        "mov        r4, %6              \n" // r4 = left
        "cmp        r4, #0              \n"
        "beq        7f                  \n"

        "6:                             \n"
        "vst1.u16   {d0}, [%0 :64]!     \n"
        "subs       r4, r4, #1          \n"
        "bne        6b                  \n"

        "7:                             \n"

        // fill middle
        "lsr        r4, %4, #3          \n" // r4 = nn = w >> 3
        "cmp        r4, #0              \n"
        "beq        9f                  \n"

        "8:                             \n"
        "pld        [%1, #512]          \n"
        "vldm       %1!, {d16-d23}      \n"
        "subs       r4, r4, #1          \n"
        "vstm       %0!, {d16-d23}      \n"
        "bne        8b                  \n"

        "9:                             \n"

        "and        r4, %4, #7          \n" // r4 = remain = w & 7

        "cmp        r4, #4              \n" // r4 >= 4
        "blt        10f                 \n"
        "pld        [%1, #256]          \n"
        "vld1.u16   {d16-d19}, [%1 :64]! \n"
        "sub        r4, r4, #4          \n"
        "vst1.u16   {d16-d19}, [%0 :64]! \n"
        "10:                            \n"

        "cmp        r4, #2              \n" // r4 >= 2
        "blt        11f                 \n"
        "pld        [%1, #128]          \n"
        "vld1.u16   {d16-d17}, [%1 :64]! \n"
        "sub        r4, r4, #2          \n"
        "vst1.u16   {d16-d17}, [%0 :64]! \n"
        "11:                            \n"

        "cmp        r4, #0              \n" // r4 > 0
        "beq        12f                 \n"
        "pld        [%1, #64]           \n"
        "vld1.u16   {d16}, [%1 :64]!    \n"
        "vst1.u16   {d16}, [%0 :64]!    \n"
        "12:                            \n"

        // fill right
        "mov        r4, %7              \n" // r4 = right
        "cmp        r4, #0              \n"
        "beq        14f                 \n"

        "13:                            \n"
        "subs       r4, r4, #1          \n"
        "vst1.u16   {d0}, [%0 :64]!     \n"
        "bne        13b                 \n"
        "14:                            \n"

        "subs       %5, %5, #1          \n" // one source row done
        "bne        5b                  \n"

        "15:                            \n"

        // fill bottom
        "lsr        r4, %9, #3          \n" // r4 = nn = bottom_size >> 3
        "cmp        r4, #0              \n"
        "beq        17f                 \n"

        "16:                            \n"
        "vstm       %0!, {d0-d7}        \n"
        "subs       r4, r4, #1          \n"
        "bne        16b                 \n"
        "17:                            \n"

        // fill bottom remain
        "and        r4, %9, #7          \n" // r4 = remain = bottom_size & 7

        "cmp        r4, #4              \n" // r4 >= 4
        "blt        18f                 \n"
        "sub        r4, r4, #4          \n"
        "vst1.u16   {d0-d3}, [%0 :64]!  \n"
        "18:                            \n"

        "cmp        r4, #2              \n" // r4 >= 2
        "blt        19f                 \n"
        "sub        r4, r4, #2          \n"
        "vst1.u16   {d0-d1}, [%0 :64]!  \n"
        "19:                            \n"

        "cmp        r4, #0              \n" // r4 > 0
        "beq        20f                 \n"
        "vst1.u16   {d0}, [%0 :64]!     \n"
        "20:                            \n"

        : "=r"(outptr), // %0
        "=r"(ptr),    // %1
        "=r"(h)       // %2 row counter, written by the asm
        : "0"(outptr),    // %3
        "r"(w),           // %4
        "2"(h),           // %5 tied to %2
        "r"(left),        // %6
        "r"(right),       // %7
        "r"(top_size),    // %8
        "r"(bottom_size), // %9
        "w"(v),           // %10
        "1"(ptr)          // %11
        : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM

    // fill top
    {
        int x = 0;
        // 4 elements per iteration, written as two full 8-lane vectors
        for (; x + 3 < top_size; x += 4)
        {
            vst1q_u16(outptr, v);
            vst1q_u16(outptr + 8, v);
            outptr += 16;
        }
        for (; x < top_size; x++)
        {
            vst1_u16(outptr, vget_low_u16(v));
            outptr += 4;
        }
    }
    // fill center
    for (int y = 0; y < src.h; y++)
    {
        for (int x = 0; x < left; x++)
        {
            vst1_u16(outptr, vget_low_u16(v));
            outptr += 4;
        }
        for (int x = 0; x < src.w; x++)
        {
            uint16x4_t _p = vld1_u16(ptr);
            vst1_u16(outptr, _p);
            ptr += 4;
            outptr += 4;
        }
        for (int x = 0; x < right; x++)
        {
            vst1_u16(outptr, vget_low_u16(v));
            outptr += 4;
        }
    }
    // fill bottom
    {
        int x = 0;
        // 4 elements per iteration, written as two full 8-lane vectors
        for (; x + 3 < bottom_size; x += 4)
        {
            vst1q_u16(outptr, v);
            vst1q_u16(outptr + 8, v);
            outptr += 16;
        }
        for (; x < bottom_size; x++)
        {
            vst1_u16(outptr, vget_low_u16(v));
            outptr += 4;
        }
    }
#endif // NCNN_GNU_INLINE_ASM
}

static void padding_replicate_pack4_bf16_fp16s_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
{
    // Replicate (edge) padding for pack4 16-bit data, one element being
    // 4 packed 16-bit values. Output rows are written back to back: `top`
    // copies of the first source row, the src.h source rows, then `bottom`
    // copies of the last source row. Every row is extended by `left` copies of
    // its first element and `right` copies of its last element.
    const int w = src.w;
    const int h = src.h;

    const unsigned short* srcp = src;
    unsigned short* dstp = dst;

    // top border: srcp stays on the first source row
    for (int i = 0; i < top; i++)
    {
        const unsigned short* row = srcp;
        uint16x4_t _edge = vld1_u16(row);

        for (int j = 0; j < left; j++, dstp += 4)
        {
            vst1_u16(dstp, _edge);
        }
        for (int j = 0; j < w; j++, row += 4, dstp += 4)
        {
            _edge = vld1_u16(row);
            vst1_u16(dstp, _edge);
        }
        // _edge is now the last element copied above
        for (int j = 0; j < right; j++, dstp += 4)
        {
            vst1_u16(dstp, _edge);
        }
    }

    // center: srcp advances by one source row per iteration
    for (int i = 0; i < h; i++)
    {
        uint16x4_t _edge = vld1_u16(srcp);

        for (int j = 0; j < left; j++, dstp += 4)
        {
            vst1_u16(dstp, _edge);
        }
        for (int j = 0; j < w; j++, srcp += 4, dstp += 4)
        {
            _edge = vld1_u16(srcp);
            vst1_u16(dstp, _edge);
        }
        for (int j = 0; j < right; j++, dstp += 4)
        {
            vst1_u16(dstp, _edge);
        }
    }

    // bottom border: step back onto the last source row and repeat it
    srcp -= w * 4;
    for (int i = 0; i < bottom; i++)
    {
        const unsigned short* row = srcp;
        uint16x4_t _edge = vld1_u16(row);

        for (int j = 0; j < left; j++, dstp += 4)
        {
            vst1_u16(dstp, _edge);
        }
        for (int j = 0; j < w; j++, row += 4, dstp += 4)
        {
            _edge = vld1_u16(row);
            vst1_u16(dstp, _edge);
        }
        for (int j = 0; j < right; j++, dstp += 4)
        {
            vst1_u16(dstp, _edge);
        }
    }
}

static void padding_reflect_pack4_bf16_fp16s_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
{
    // Reflect padding for pack4 16-bit data, one element being 4 packed
    // 16-bit values. Border rows and columns mirror the source around its
    // edge row/column without repeating the edge itself.
    const int w = src.w;
    const int h = src.h;

    const unsigned short* srcp = src;
    unsigned short* dstp = dst;

    // top border: begin at source row `top` and move up one row per output row
    srcp += top * w * 4;
    for (int i = 0; i < top; i++)
    {
        const unsigned short* row = srcp;

        // left border reads elements left, left-1, ..., 1 of the row
        for (int j = left; j > 0; j--, dstp += 4)
        {
            vst1_u16(dstp, vld1_u16(row + j * 4));
        }
        for (int j = 0; j < w; j++, row += 4, dstp += 4)
        {
            vst1_u16(dstp, vld1_u16(row));
        }
        // row is now one past the end; right border reads elements w-2, w-3, ...
        for (int j = 0; j < right; j++, dstp += 4)
        {
            vst1_u16(dstp, vld1_u16(row - (j + 2) * 4));
        }

        srcp -= w * 4;
    }

    // center: srcp walks forward through every source row
    for (int i = 0; i < h; i++)
    {
        for (int j = left; j > 0; j--, dstp += 4)
        {
            vst1_u16(dstp, vld1_u16(srcp + j * 4));
        }
        for (int j = 0; j < w; j++, srcp += 4, dstp += 4)
        {
            vst1_u16(dstp, vld1_u16(srcp));
        }
        for (int j = 0; j < right; j++, dstp += 4)
        {
            vst1_u16(dstp, vld1_u16(srcp - (j + 2) * 4));
        }
    }

    // bottom border: srcp is one past the last row, so step back two rows
    // and keep moving up one row per output row
    srcp -= 2 * w * 4;
    for (int i = 0; i < bottom; i++)
    {
        const unsigned short* row = srcp;

        for (int j = left; j > 0; j--, dstp += 4)
        {
            vst1_u16(dstp, vld1_u16(row + j * 4));
        }
        for (int j = 0; j < w; j++, row += 4, dstp += 4)
        {
            vst1_u16(dstp, vld1_u16(row));
        }
        for (int j = 0; j < right; j++, dstp += 4)
        {
            vst1_u16(dstp, vld1_u16(row - (j + 2) * 4));
        }

        srcp -= w * 4;
    }
}


================================================
FILE: src/layer/arm/padding_pack8_fp16s.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Copy src into dst surrounded by a constant border, for pack8 16-bit data.
//
// One element is 8 packed 16-bit values (16 bytes, one full vector). dst
// receives, in order:
//   - top * dst.w padding elements,
//   - for each of the src.h source rows: `left` padding elements, the src.w
//     source elements, then `right` padding elements,
//   - bottom * dst.w padding elements.
//
// v is the padding element, stored as a whole vector.
// NOTE(review): assumes dst.w == left + src.w + right and that dst is already
// allocated - confirm against the caller.
// NOTE(review): the inline assembly is AArch64 syntax with no __aarch64__
// guard; presumably this header is only built for aarch64 - confirm.
//
// The inline assembly decrements the row counter in place. h is therefore
// passed as an output operand (%2) with the %5 input tied to it, because
// input-only operands must not be modified by an asm statement.
static void padding_constant_pack8_fp16s_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right, uint16x8_t v)
{
    const unsigned short* ptr = src;
    unsigned short* outptr = dst;

    int w = src.w;
    int h = src.h;

    // border sizes in elements, rows are contiguous in dst
    int top_size = top * dst.w;
    int bottom_size = bottom * dst.w;

#if NCNN_GNU_INLINE_ASM
    asm volatile(
        // v0-v3 all hold the padding value so up to 4 elements go out per store
        "mov    v0.16b, %10.16b         \n"
        "mov    v1.16b, %10.16b         \n"
        "mov    v2.16b, %10.16b         \n"
        "mov    v3.16b, %10.16b         \n"

        // fill top
        "lsr    w4, %w8, #2             \n" // w4 = nn = top_size >> 2
        "cmp    w4, #0                  \n"
        "beq    1f                      \n"

        "0:                             \n"
        "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
        "subs   w4, w4, #1              \n"
        "bne    0b                      \n"

        "1:                             \n"

        // fill top remain
        "and    w4, %w8, #3             \n" // w4 = remain = top_size & 3

        "cmp    w4, #2                  \n" // w4 >= 2
        "blt    2f                      \n"
        "sub    w4, w4, #2              \n"
        "st1    {v0.8h, v1.8h}, [%0], #32 \n"
        "2:                             \n"

        "cmp    w4, #0                  \n" // w4 > 0
        "beq    3f                      \n"
        "st1    {v0.8h}, [%0], #16      \n"
        "3:                             \n"

        // fill center h loop
        "cmp    %w5, #0                 \n"
        "beq    13f                     \n"
        "4:                             \n"

        // fill left
        "mov    w4, %w6                 \n" // w4 = left
        "cmp    w4, #0                  \n"
        "beq    6f                      \n"

        "5:                             \n"
        "st1    {v0.8h}, [%0], #16      \n"
        "subs   w4, w4, #1              \n"
        "bne    5b                      \n"

        "6:                             \n"

        // fill middle
        "lsr    w4, %w4, #2             \n" // w4 = nn = w >> 2
        "cmp    w4, #0                  \n"
        "beq    8f                      \n"

        "7:                             \n"
        "prfm   pldl1keep, [%1, #512]   \n"
        "ld1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%1], #64 \n"
        "subs   w4, w4, #1              \n"
        "st1    {v16.8h, v17.8h, v18.8h, v19.8h}, [%0], #64 \n"
        "bne    7b                      \n"

        "8:                             \n"

        "and    w4, %w4, #3             \n" // w4 = remain = w & 3

        "cmp    w4, #2                  \n" // w4 >= 2
        "blt    9f                      \n"
        "prfm   pldl1keep, [%1, #256]   \n"
        "ld1    {v16.8h, v17.8h}, [%1], #32 \n"
        "sub    w4, w4, #2              \n"
        "st1    {v16.8h, v17.8h}, [%0], #32 \n"
        "9:                             \n"

        "cmp    w4, #0                  \n" // w4 > 0
        "beq    10f                     \n"
        "prfm   pldl1keep, [%1, #128]   \n"
        "ld1    {v16.8h}, [%1], #16     \n"
        "st1    {v16.8h}, [%0], #16     \n"
        "10:                            \n"

        // fill right
        "mov    w4, %w7                 \n" // w4 = right
        "cmp    w4, #0                  \n"
        "beq    12f                     \n"

        "11:                            \n"
        "subs   w4, w4, #1              \n"
        "st1    {v0.8h}, [%0], #16      \n"
        "bne    11b                     \n"
        "12:                            \n"

        "subs   %w5, %w5, #1            \n" // one source row done
        "bne    4b                      \n"

        "13:                            \n"

        // fill bottom
        "lsr    w4, %w9, #2             \n" // w4 = nn = bottom_size >> 2
        "cmp    w4, #0                  \n"
        "beq    15f                     \n"

        "14:                            \n"
        "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
        "subs   w4, w4, #1              \n"
        "bne    14b                     \n"
        "15:                            \n"

        // fill bottom remain
        "and    w4, %w9, #3             \n" // w4 = remain = bottom_size & 3

        "cmp    w4, #2                  \n" // w4 >= 2
        "blt    16f                     \n"
        "sub    w4, w4, #2              \n"
        "st1    {v0.8h, v1.8h}, [%0], #32 \n"
        "16:                            \n"

        "cmp    w4, #0                  \n" // w4 > 0
        "beq    17f                     \n"
        "st1    {v0.8h}, [%0], #16      \n"
        "17:                            \n"

        : "=r"(outptr), // %0
        "=r"(ptr),    // %1
        "=r"(h)       // %2 row counter, written by the asm
        : "0"(outptr),    // %3
        "r"(w),           // %4
        "2"(h),           // %5 tied to %2
        "r"(left),        // %6
        "r"(right),       // %7
        "r"(top_size),    // %8
        "r"(bottom_size), // %9
        "w"(v),           // %10
        "1"(ptr)          // %11
        : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
#else  // NCNN_GNU_INLINE_ASM

    // fill top
    {
        int x = 0;
        // 4 elements per iteration
        for (; x + 3 < top_size; x += 4)
        {
            vst1q_u16(outptr, v);
            vst1q_u16(outptr + 8, v);
            vst1q_u16(outptr + 16, v);
            vst1q_u16(outptr + 24, v);
            outptr += 32;
        }
        for (; x < top_size; x++)
        {
            vst1q_u16(outptr, v);
            outptr += 8;
        }
    }
    // fill center
    for (int y = 0; y < src.h; y++)
    {
        for (int x = 0; x < left; x++)
        {
            vst1q_u16(outptr, v);
            outptr += 8;
        }
        for (int x = 0; x < src.w; x++)
        {
            uint16x8_t _p = vld1q_u16(ptr);
            vst1q_u16(outptr, _p);
            ptr += 8;
            outptr += 8;
        }
        for (int x = 0; x < right; x++)
        {
            vst1q_u16(outptr, v);
            outptr += 8;
        }
    }
    // fill bottom
    {
        int x = 0;
        // 4 elements per iteration
        for (; x + 3 < bottom_size; x += 4)
        {
            vst1q_u16(outptr, v);
            vst1q_u16(outptr + 8, v);
            vst1q_u16(outptr + 16, v);
            vst1q_u16(outptr + 24, v);
            outptr += 32;
        }
        for (; x < bottom_size; x++)
        {
            vst1q_u16(outptr, v);
            outptr += 8;
        }
    }
#endif // NCNN_GNU_INLINE_ASM
}

static void padding_replicate_pack8_fp16s_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
{
    // Replicate (edge) padding for pack8 16-bit data, one element being
    // 8 packed 16-bit values. Output rows are written back to back: `top`
    // copies of the first source row, the src.h source rows, then `bottom`
    // copies of the last source row. Every row is extended by `left` copies of
    // its first element and `right` copies of its last element.
    const int w = src.w;
    const int h = src.h;

    const unsigned short* srcp = src;
    unsigned short* dstp = dst;

    // top border: srcp stays on the first source row
    for (int i = 0; i < top; i++)
    {
        const unsigned short* row = srcp;
        uint16x8_t _edge = vld1q_u16(row);

        for (int j = 0; j < left; j++, dstp += 8)
        {
            vst1q_u16(dstp, _edge);
        }
        for (int j = 0; j < w; j++, row += 8, dstp += 8)
        {
            _edge = vld1q_u16(row);
            vst1q_u16(dstp, _edge);
        }
        // _edge is now the last element copied above
        for (int j = 0; j < right; j++, dstp += 8)
        {
            vst1q_u16(dstp, _edge);
        }
    }

    // center: srcp advances by one source row per iteration
    for (int i = 0; i < h; i++)
    {
        uint16x8_t _edge = vld1q_u16(srcp);

        for (int j = 0; j < left; j++, dstp += 8)
        {
            vst1q_u16(dstp, _edge);
        }
        for (int j = 0; j < w; j++, srcp += 8, dstp += 8)
        {
            _edge = vld1q_u16(srcp);
            vst1q_u16(dstp, _edge);
        }
        for (int j = 0; j < right; j++, dstp += 8)
        {
            vst1q_u16(dstp, _edge);
        }
    }

    // bottom border: step back onto the last source row and repeat it
    srcp -= w * 8;
    for (int i = 0; i < bottom; i++)
    {
        const unsigned short* row = srcp;
        uint16x8_t _edge = vld1q_u16(row);

        for (int j = 0; j < left; j++, dstp += 8)
        {
            vst1q_u16(dstp, _edge);
        }
        for (int j = 0; j < w; j++, row += 8, dstp += 8)
        {
            _edge = vld1q_u16(row);
            vst1q_u16(dstp, _edge);
        }
        for (int j = 0; j < right; j++, dstp += 8)
        {
            vst1q_u16(dstp, _edge);
        }
    }
}

static void padding_reflect_pack8_fp16s_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
{
    const unsigned short* ptr = src;
    unsigned short* outptr = dst;

    // fill top
    ptr += top * src.w * 8;
    for (int y = 0; y < top; y++)
    {
        const unsigned short* ptr0 = ptr;
        for (int x = 0; x < left; x++)
        {
            uint16x8_t _p = vld1q_u16(ptr0 + (left - x) * 8);
            vst1q_u16(outptr, _p);
            outptr += 8;
        }
        for (int x = 0; x < src.w; x++)
        {
            uint16x8_t _p = vld1q_u16(ptr0);
            vst1q_u16(outptr, _p);
            ptr0 += 8;
            outptr += 8;
        }
        for (int x = 0; x < right; x++)
        {
            uint16x8_t _p = vld1q_u16(ptr0 - 16 - x * 8);
            vst1q_u16(outptr, _p);
            outptr += 8;
        }
        ptr -= src.w * 8;
    }
    // fill center
    for (int y = 0; y < src.h; y++)
    {
        for (int x = 0; x < left; x++)
        {
            uint16x8_t _p = vld1q_u16(ptr + (left - x) * 8);
            vst1q_u16(outptr, _p);
            outptr += 8;
        }
        for (int x = 0; x < src.w; x++)
        {
            uint16x8_t _p = vld1q_u16(ptr);
            vst1q_u16(outptr, _p);
            ptr += 8;
            outptr += 8;
        }
        for (int x = 0; x < right; x++)
        {
            uint16x8_t _p = vld1q_u16(ptr - 16 - x * 8);
            vst1q_u16(outptr, _p);
            outptr += 8;
        }
    }
    // fill bottom
    ptr -= 2 * src.w * 8;
    for (int y = 0; y < bottom; y++)
    {
        const unsigned short* ptr0 = ptr;
        for (int x = 0; x < left; x++)
        {
            uint16x8_t _p = vld1q_u16(ptr0 + (left - x) * 8);
            vst1q_u16(outptr, _p);
            outptr += 8;
        }
        for (int x = 0; x < src.w; x++)
        {
            uint16x8_t _p = vld1q_u16(ptr0);
            vst1q_u16(outptr, _p);
            ptr0 += 8;
            outptr += 8;
        }
        for (int x = 0; x < right; x++)
        {
            uint16x8_t _p = vld1q_u16(ptr0 - 16 - x * 8);
            vst1q_u16(outptr, _p);
            outptr += 8;
        }
        ptr -= src.w * 8;
    }
}


================================================
FILE: src/layer/arm/padding_pack8_int8.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Constant-value padding for a 2D plane whose elements are packs of 8 int8 lanes
// (8 bytes per element).
//
// src    : input plane, src.w x src.h elements
// dst    : output plane, written sequentially from its start
// top    : number of full output rows (dst.w elements each) filled with v before the source rows
// bottom : number of full output rows filled with v after the source rows
// left   : number of v elements emitted before each copied source row
// right  : number of v elements emitted after each copied source row
// v      : the 8-lane fill value
//
// Three implementations are selected at compile time: aarch64 inline asm,
// 32-bit ARM inline asm, and a NEON-intrinsics fallback when
// NCNN_GNU_INLINE_ASM is disabled. All three emit the same byte stream.
//
// NOTE(review): dst.w is presumably left + src.w + right; nothing here checks it.
static void padding_constant_pack8_int8_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int8x8_t v)
{
    const signed char* ptr = src;
    signed char* outptr = dst;

    int w = src.w;
    int h = src.h;

    // border sizes counted in 8-byte elements
    int top_size = top * dst.w;
    int bottom_size = bottom * dst.w;

#if NCNN_GNU_INLINE_ASM
#if __aarch64__
    asm volatile(
        // broadcast the fill element into v0..v3 (two elements per 128-bit register)
        "mov    v0.8b, %10.8b           \n"
        "mov    v0.d[1], v0.d[0]        \n"
        "mov    v1.16b, v0.16b          \n"
        "mov    v2.16b, v0.16b          \n"
        "mov    v3.16b, v0.16b          \n"

        // fill top
        "lsr    w4, %w8, #3             \n" // w4 = nn = top_size >> 3
        "cmp    w4, #0                  \n"
        "beq    1f                      \n"

        "0:                             \n"
        "st1    {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], #64 \n"
        "subs   w4, w4, #1              \n"
        "bne    0b                      \n"

        "1:                             \n"

        // fill top remain
        "and    w4, %w8, #7             \n" // w4 = remain = top_size & 7

        "cmp    w4, #4                  \n" // w4 >= 4
        "blt    2f                      \n"
        "sub    w4, w4, #4              \n"
        "st1    {v0.16b, v1.16b}, [%0], #32 \n"
        "2:                             \n"

        "cmp    w4, #2                  \n" // w4 >= 2
        "blt    3f                      \n"
        "sub    w4, w4, #2              \n"
        "st1    {v0.16b}, [%0], #16     \n"
        "3:                             \n"

        "cmp    w4, #0                  \n" // w4 > 0
        "beq    4f                      \n"
        "st1    {v0.8b}, [%0], #8       \n"
        "4:                             \n"

        // fill center h loop
        // %5 (h) is an input-only operand and must not be written, so the
        // row counter lives in the clobbered scratch register w3 instead.
        "mov    w3, %w5                 \n" // w3 = h
        "cmp    w3, #0                  \n"
        "beq    15f                     \n"
        "5:                             \n"

        // fill left
        "mov    w4, %w6                 \n" // w4 = left
        "cmp    w4, #0                  \n"
        "beq    7f                      \n"

        "6:                             \n"
        "st1    {v0.8b}, [%0], #8       \n"
        "subs   w4, w4, #1              \n"
        "bne    6b                      \n"

        "7:                             \n"

        // fill middle
        "lsr    w4, %w4, #3             \n" // w4 = nn = w >> 3
        "cmp    w4, #0                  \n"
        "beq    9f                      \n"

        "8:                             \n"
        "prfm   pldl1keep, [%1, #512]   \n"
        "ld1    {v16.16b, v17.16b, v18.16b, v19.16b}, [%1], #64 \n"
        "subs   w4, w4, #1              \n"
        "st1    {v16.16b, v17.16b, v18.16b, v19.16b}, [%0], #64 \n"
        "bne    8b                      \n"

        "9:                             \n"

        "and    w4, %w4, #7             \n" // w4 = remain = w & 7

        "cmp    w4, #4                  \n" // w4 >= 4
        "blt    10f                     \n"
        "prfm   pldl1keep, [%1, #256]   \n"
        "ld1    {v16.16b, v17.16b}, [%1], #32 \n"
        "sub    w4, w4, #4              \n"
        "st1    {v16.16b, v17.16b}, [%0], #32 \n"
        "10:                            \n"

        "cmp    w4, #2                  \n" // w4 >= 2
        "blt    11f                     \n"
        "prfm   pldl1keep, [%1, #128]   \n"
        "ld1    {v16.16b}, [%1], #16    \n"
        "sub    w4, w4, #2              \n"
        "st1    {v16.16b}, [%0], #16    \n"
        "11:                            \n"

        "cmp    w4, #0                  \n" // w4 > 0
        "beq    12f                     \n"
        "prfm   pldl1keep, [%1, #64]    \n"
        "ld1    {v16.8b}, [%1], #8      \n"
        "st1    {v16.8b}, [%0], #8      \n"
        "12:                            \n"

        // fill right
        "mov    w4, %w7                 \n" // w4 = right
        "cmp    w4, #0                  \n"
        "beq    14f                     \n"

        "13:                            \n"
        "subs   w4, w4, #1              \n"
        "st1    {v0.8b}, [%0], #8       \n"
        "bne    13b                     \n"
        "14:                            \n"

        "subs   w3, w3, #1              \n" // next source row
        "bne    5b                      \n"

        "15:                            \n"

        // fill bottom
        "lsr    w4, %w9, #3             \n" // w4 = nn = bottom_size >> 3
        "cmp    w4, #0                  \n"
        "beq    17f                     \n"

        "16:                            \n"
        "st1    {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], #64 \n"
        "subs   w4, w4, #1              \n"
        "bne    16b                     \n"
        "17:                            \n"

        // fill bottom remain
        "and    w4, %w9, #7             \n" // w4 = remain = bottom_size & 7

        "cmp    w4, #4                  \n" // w4 >= 4
        "blt    18f                     \n"
        "sub    w4, w4, #4              \n"
        "st1    {v0.16b, v1.16b}, [%0], #32 \n"
        "18:                            \n"

        "cmp    w4, #2                  \n" // w4 >= 2
        "blt    19f                     \n"
        "sub    w4, w4, #2              \n"
        "st1    {v0.16b}, [%0], #16     \n"
        "19:                            \n"

        "cmp    w4, #0                  \n" // w4 > 0
        "beq    20f                     \n"
        "st1    {v0.8b}, [%0], #8       \n"
        "20:                            \n"

        : "=r"(outptr), // %0
        "=r"(ptr)     // %1
        : "0"(outptr),
        "1"(ptr),
        "r"(w),           // %4
        "r"(h),           // %5
        "r"(left),        // %6
        "r"(right),       // %7
        "r"(top_size),    // %8
        "r"(bottom_size), // %9
        "w"(v)            // %10
        : "cc", "memory", "x3", "x4", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
#else  // __aarch64__
    asm volatile(
        // broadcast the fill element into q0..q3 (d0..d7)
        "vmov       d0, %P10            \n"
        "vmov       d1, d0              \n"
        "vmov       q1, q0              \n"
        "vmov       q2, q0              \n"
        "vmov       q3, q0              \n"

        // fill top
        "lsr        r4, %8, #3          \n" // r4 = nn = top_size >> 3
        "cmp        r4, #0              \n"
        "beq        1f                  \n"

        "0:                             \n"
        "vstm       %0!, {d0-d7}        \n"
        "subs       r4, r4, #1          \n"
        "bne        0b                  \n"

        "1:                             \n"

        // fill top remain
        "and        r4, %8, #7          \n" // r4 = remain = top_size & 7

        "cmp        r4, #4              \n" // r4 >= 4
        "blt        2f                  \n"
        "sub        r4, r4, #4          \n"
        "vst1.s8    {d0-d3}, [%0 :128]! \n"
        "2:                             \n"

        "cmp        r4, #2              \n" // r4 >= 2
        "blt        3f                  \n"
        "sub        r4, r4, #2          \n"
        "vst1.s8    {d0-d1}, [%0 :128]! \n"
        "3:                             \n"

        "cmp        r4, #0              \n" // r4 > 0
        "beq        4f                  \n"
        "vst1.s8    {d0}, [%0 :64]!     \n"
        "4:                             \n"

        // fill center h loop
        // %5 (h) is an input-only operand and must not be written, so the
        // row counter lives in the clobbered scratch register r5 instead.
        "mov        r5, %5              \n" // r5 = h
        "cmp        r5, #0              \n"
        "beq        15f                 \n"
        "5:                             \n"

        // fill left
        "mov        r4, %6              \n" // r4 = left
        "cmp        r4, #0              \n"
        "beq        7f                  \n"

        "6:                             \n"
        "vst1.s8    {d0}, [%0 :64]!     \n"
        "subs       r4, r4, #1          \n"
        "bne        6b                  \n"

        "7:                             \n"

        // fill middle
        "lsr        r4, %4, #3          \n" // r4 = nn = w >> 3
        "cmp        r4, #0              \n"
        "beq        9f                  \n"

        "8:                             \n"
        "pld        [%1, #512]          \n"
        "vldm       %1!, {d16-d23}      \n"
        "subs       r4, r4, #1          \n"
        "vstm       %0!, {d16-d23}      \n"
        "bne        8b                  \n"

        "9:                             \n"

        "and        r4, %4, #7          \n" // r4 = remain = w & 7

        "cmp        r4, #4              \n" // r4 >= 4
        "blt        10f                 \n"
        "pld        [%1, #256]          \n"
        "vld1.s8    {d16-d19}, [%1 :64]! \n"
        "sub        r4, r4, #4          \n"
        "vst1.s8    {d16-d19}, [%0 :64]! \n"
        "10:                            \n"

        "cmp        r4, #2              \n" // r4 >= 2
        "blt        11f                 \n"
        "pld        [%1, #128]          \n"
        "vld1.s8    {d16-d17}, [%1 :64]! \n"
        "sub        r4, r4, #2          \n"
        "vst1.s8    {d16-d17}, [%0 :64]! \n"
        "11:                            \n"

        "cmp        r4, #0              \n" // r4 > 0
        "beq        12f                 \n"
        "pld        [%1, #64]           \n"
        "vld1.s8    {d16}, [%1 :64]!    \n"
        "vst1.s8    {d16}, [%0 :64]!    \n"
        "12:                            \n"

        // fill right
        "mov        r4, %7              \n" // r4 = right
        "cmp        r4, #0              \n"
        "beq        14f                 \n"

        "13:                            \n"
        "subs       r4, r4, #1          \n"
        "vst1.s8    {d0}, [%0 :64]!     \n"
        "bne        13b                 \n"
        "14:                            \n"

        "subs       r5, r5, #1          \n" // next source row
        "bne        5b                  \n"

        "15:                            \n"

        // fill bottom
        "lsr        r4, %9, #3          \n" // r4 = nn = bottom_size >> 3
        "cmp        r4, #0              \n"
        "beq        17f                 \n"

        "16:                            \n"
        "vstm       %0!, {d0-d7}        \n"
        "subs       r4, r4, #1          \n"
        "bne        16b                 \n"
        "17:                            \n"

        // fill bottom remain
        "and        r4, %9, #7          \n" // r4 = remain = bottom_size & 7

        "cmp        r4, #4              \n" // r4 >= 4
        "blt        18f                 \n"
        "sub        r4, r4, #4          \n"
        "vst1.s8    {d0-d3}, [%0 :64]!  \n"
        "18:                            \n"

        "cmp        r4, #2              \n" // r4 >= 2
        "blt        19f                 \n"
        "sub        r4, r4, #2          \n"
        "vst1.s8    {d0-d1}, [%0 :64]!  \n"
        "19:                            \n"

        "cmp        r4, #0              \n" // r4 > 0
        "beq        20f                 \n"
        "vst1.s8    {d0}, [%0 :64]!     \n"
        "20:                            \n"

        : "=r"(outptr), // %0
        "=r"(ptr)     // %1
        : "0"(outptr),
        "1"(ptr),
        "r"(w),           // %4
        "r"(h),           // %5
        "r"(left),        // %6
        "r"(right),       // %7
        "r"(top_size),    // %8
        "r"(bottom_size), // %9
        "w"(v)            // %10
        : "cc", "memory", "r4", "r5", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM

    // fill top
    {
        int x = 0;
        for (; x + 3 < top_size; x += 4)
        {
            vst1_s8(outptr, v);
            vst1_s8(outptr + 8, v);
            vst1_s8(outptr + 16, v);
            vst1_s8(outptr + 24, v);
            outptr += 32;
        }
        for (; x < top_size; x++)
        {
            vst1_s8(outptr, v);
            outptr += 8;
        }
    }
    // fill center
    for (int y = 0; y < h; y++)
    {
        for (int x = 0; x < left; x++)
        {
            vst1_s8(outptr, v);
            outptr += 8;
        }
        for (int x = 0; x < w; x++)
        {
            int8x8_t _p = vld1_s8(ptr);
            vst1_s8(outptr, _p);
            ptr += 8;
            outptr += 8;
        }
        for (int x = 0; x < right; x++)
        {
            vst1_s8(outptr, v);
            outptr += 8;
        }
    }
    // fill bottom
    {
        int x = 0;
        for (; x + 3 < bottom_size; x += 4)
        {
            vst1_s8(outptr, v);
            vst1_s8(outptr + 8, v);
            vst1_s8(outptr + 16, v);
            vst1_s8(outptr + 24, v);
            outptr += 32;
        }
        for (; x < bottom_size; x++)
        {
            vst1_s8(outptr, v);
            outptr += 8;
        }
    }
#endif // NCNN_GNU_INLINE_ASM
}

static void padding_replicate_pack8_int8_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
{
    const signed char* ptr = src;
    signed char* outptr = dst;

    // fill top
    for (int y = 0; y < top; y++)
    {
        const signed char* ptr0 = ptr;
        int8x8_t _p = vld1_s8(ptr0);
        for (int x = 0; x < left; x++)
        {
            vst1_s8(outptr, _p);
            outptr += 8;
        }
        for (int x = 0; x < src.w; x++)
        {
            _p = vld1_s8(ptr0);
            vst1_s8(outptr, _p);
            ptr0 += 8;
            outptr += 8;
        }
        for (int x = 0; x < right; x++)
        {
            vst1_s8(outptr, _p);
            outptr += 8;
        }
    }
    // fill center
    for (int y = 0; y < src.h; y++)
    {
        int8x8_t _p = vld1_s8(ptr);
        for (int x = 0; x < left; x++)
        {
            vst1_s8(outptr, _p);
            outptr += 8;
        }
        for (int x = 0; x < src.w; x++)
        {
            _p = vld1_s8(ptr);
            vst1_s8(outptr, _p);
            ptr += 8;
            outptr += 8;
        }
        for (int x = 0; x < right; x++)
        {
            vst1_s8(outptr, _p);
            outptr += 8;
        }
    }
    // fill bottom
    ptr -= src.w * 8;
    for (int y = 0; y < bottom; y++)
    {
        const signed char* ptr0 = ptr;
        int8x8_t _p = vld1_s8(ptr0);
        for (int x = 0; x < left; x++)
        {
            vst1_s8(outptr, _p);
            outptr += 8;
        }
        for (int x = 0; x < src.w; x++)
        {
            _p = vld1_s8(ptr0);
            vst1_s8(outptr, _p);
            ptr0 += 8;
            outptr += 8;
        }
        for (int x = 0; x < right; x++)
        {
            vst1_s8(outptr, _p);
            outptr += 8;
        }
    }
}

static void padding_reflect_pack8_int8_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
{
    const signed char* ptr = src;
    signed char* outptr = dst;

    // fill top
    ptr += top * src.w * 8;
    for (int y = 0; y < top; y++)
    {
        const signed char* ptr0 = ptr;
        for (int x = 0; x < left; x++)
        {
            int8x8_t _p = vld1_s8(ptr0 + (left - x) * 8);
            vst1_s8(outptr, _p);
            outptr += 8;
        }
        for (int x = 0; x < src.w; x++)
        {
            int8x8_t _p = vld1_s8(ptr0);
            vst1_s8(outptr, _p);
            ptr0 += 8;
            outptr += 8;
        }
        for (int x = 0; x < right; x++)
        {
            int8x8_t _p = vld1_s8(ptr0 - 16 - x * 8);
            vst1_s8(outptr, _p);
            outptr += 8;
        }
        ptr -= src.w * 8;
    }
    // fill center
    for (int y = 0; y < src.h; y++)
    {
        for (int x = 0; x < left; x++)
        {
            int8x8_t _p = vld1_s8(ptr + (left - x) * 8);
            vst1_s8(outptr, _p);
            outptr += 8;
        }
        for (int x = 0; x < src.w; x++)
        {
            int8x8_t _p = vld1_s8(ptr);
            vst1_s8(outptr, _p);
            ptr += 8;
            outptr += 8;
        }
        for (int x = 0; x < right; x++)
        {
            int8x8_t _p = vld1_s8(ptr - 16 - x * 8);
            vst1_s8(outptr, _p);
            outptr += 8;
        }
    }
    // fill bottom
    ptr -= 2 * src.w * 8;
    for (int y = 0; y < bottom; y++)
    {
        const signed char* ptr0 = ptr;
        for (int x = 0; x < left; x++)
        {
            int8x8_t _p = vld1_s8(ptr0 + (left - x) * 8);
            vst1_s8(outptr, _p);
            outptr += 8;
        }
        for (int x = 0; x < src.w; x++)
        {
            int8x8_t _p = vld1_s8(ptr0);
            vst1_s8(outptr, _p);
            ptr0 += 8;
            outptr += 8;
        }
        for (int x = 0; x < right; x++)
        {
            int8x8_t _p = vld1_s8(ptr0 - 16 - x * 8);
            vst1_s8(outptr, _p);
            outptr += 8;
        }
        ptr -= src.w * 8;
    }
}


================================================
FILE: src/layer/arm/pixelshuffle_arm.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "pixelshuffle_arm.h"

#include "layer_type.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "cpu.h"

namespace ncnn {

// Sets the layer capability flags for the ARM build configuration.
PixelShuffle_arm::PixelShuffle_arm()
{
    // packed layouts are only handled by the NEON code paths
#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON

    // fp16 storage needs both a NEON build with NCNN_ARM82 and a positive
    // answer from cpu_support_arm_asimdhp() at construction time
#if __ARM_NEON && NCNN_ARM82
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif

    // bf16 storage is enabled whenever the build has NCNN_BF16
#if NCNN_BF16
    support_bf16_storage = true;
#endif
}

int PixelShuffle_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = w * upscale_factor;
    int outh = h * upscale_factor;
    int outc = channels * elempack / (upscale_factor * upscale_factor);

    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
        out_elempack = outc % 4 == 0 ? 4 : 1;
    }
#endif
    size_t out_elemsize = elemsize / elempack * out_elempack;

    if (upscale_factor != 2 || mode != 0)
    {
        Option opt_pack = opt;
        opt_pack.blob_allocator = opt.workspace_allocator;

        Mat bottom_blob_unpacked;
        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack);

        return PixelShuffle::forward(bottom_blob_unpacked, top_blob, opt);
    }

    top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

#if __ARM_NEON
    if (elempack == 4 && out_elempack == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < outc / out_elempack; p++)
        {
            Mat m = top_blob.channel(p);

            const float* sptr0 = bottom_blob.channel(p * 4);
            const float* sptr1 = bottom_blob.channel(p * 4 + 1);
            const float* sptr2 = bottom_blob.channel(p * 4 + 2);
            const float* sptr3 = bottom_blob.channel(p * 4 + 3);

            for (int i = 0; i < h; i++)
            {
                float* outptr0 = m.row(i * 2);
                float* outptr1 = m.row(i * 2 + 1);

                int j = 0;
                for (; j + 1 < w; j += 2)
                {
                    float32x4_t _p00 = vld1q_f32(sptr0);
                    float32x4_t _p10 = vld1q_f32(sptr1);
                    float32x4_t _p20 = vld1q_f32(sptr2);
                    float32x4_t _p30 = vld1q_f32(sptr3);

                    float32x4_t _p01 = vld1q_f32(sptr0 + 4);
                    float32x4_t _p11 = vld1q_f32(sptr1 + 4);
                    float32x4_t _p21 = vld1q_f32(sptr2 + 4);
                    float32x4_t _p31 = vld1q_f32(sptr3 + 4);

                    float32x4x4_t _s0;
                    _s0.val[0] = vcombine_f32(vget_low_f32(_p00), vget_low_f32(_p01));
                    _s0.val[1] = vcombine_f32(vget_low_f32(_p10), vget_low_f32(_p11));
                    _s0.val[2] = vcombine_f32(vget_low_f32(_p20), vget_low_f32(_p21));
                    _s0.val[3] = vcombine_f32(vget_low_f32(_p30), vget_low_f32(_p31));

                    float32x4x4_t _s1;
                    _s1.val[0] = vcombine_f32(vget_high_f32(_p00), vget_high_f32(_p01));
                    _s1.val[1] = vcombine_f32(vget_high_f32(_p10), vget_high_f32(_p11));
                    _s1.val[2] = vcombine_f32(vget_high_f32(_p20), vget_high_f32(_p21));
                    _s1.val[3] = vcombine_f32(vget_high_f32(_p30), vget_high_f32(_p31));

                    vst4q_f32(outptr0, _s0);
                    vst4q_f32(outptr1, _s1);

                    sptr0 += 8;
                    sptr1 += 8;
                    sptr2 += 8;
                    sptr3 += 8;
                    outptr0 += 16;
                    outptr1 += 16;
                }
                for (; j < w; j++)
                {
                    outptr0[0] = sptr0[0];
                    outptr0[1] = sptr1[0];
                    outptr0[2] = sptr2[0];
                    outptr0[3] = sptr3[0];

                    outptr0[4] = sptr0[1];
                    outptr0[5] = sptr1[1];
                    outptr0[6] = sptr2[1];
                    outptr0[7] = sptr3[1];

                    outptr1[0] = sptr0[2];
                    outptr1[1] = sptr1[2];
                    outptr1[2] = sptr2[2];
                    outptr1[3] = sptr3[2];

                    outptr1[4] = sptr0[3];
                    outptr1[5] = sptr1[3];
                    outptr1[6] = sptr2[3];
                    outptr1[7] = sptr3[3];

                    sptr0 += 4;
                    sptr1 += 4;
                    sptr2 += 4;
                    sptr3 += 4;
                    outptr0 += 8;
                    outptr1 += 8;
                }
            }
        }

        return 0;
    }

    if (elempack == 4 && out_elempack == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < outc / out_elempack; p++)
        {
            Mat m = top_blob.channel(p);

            const float* sptr = bottom_blob.channel(p);

            for (int i = 0; i < h; i++)
            {
                float* outptr0 = m.row(i * 2);
                float* outptr1 = m.row(i * 2 + 1);

                int j = 0;
                for (; j + 1 < w; j += 2)
                {
                    float32x4_t _p0 = vld1q_f32(sptr);
                    float32x4_t _p1 = vld1q_f32(sptr + 4);

                    float32x4_t _s0 = vcombine_f32(vget_low_f32(_p0), vget_low_f32(_p1));
                    float32x4_t _s1 = vcombine_f32(vget_high_f32(_p0), vget_high_f32(_p1));

                    vst1q_f32(outptr0, _s0);
                    vst1q_f32(outptr1, _s1);

                    sptr += 8;
                    outptr0 += 4;
                    outptr1 += 4;
                }
                for (; j < w; j++)
                {
                    outptr0[0] = sptr[0];
                    outptr0[1] = sptr[1];
                    outptr1[0] = sptr[2];
                    outptr1[1] = sptr[3];

                    sptr += 4;
                    outptr0 += 2;
                    outptr1 += 2;
                }
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    return PixelShuffle::forward(bottom_blob, top_blob, opt);
}

int PixelShuffle_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = w * upscale_factor;
    int outh = h * upscale_factor;
    int outc = channels * elempack / (upscale_factor * upscale_factor);

    int out_elempack = 1;
#if __ARM_NEON
    if (opt.use_packing_layout)
    {
#if NCNN_ARM82
        out_elempack = support_fp16_storage && opt.use_fp16_arithmetic && outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 : 1;
#else
        out_elempack = outc % 4 == 0 ? 4 : 1;
#endif
    }
#endif
    size_t out_elemsize = elemsize / elempack * out_elempack;

    if (upscale_factor != 2 || mode != 0)
    {
        Option opt_pack = opt;
        opt_pack.blob_allocator = opt.workspace_allocator;

        Mat bottom_blob_unpacked;
        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack);

        top_blob.create(outw, outh, outc, 2u, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < outc; p++)
        {
            Mat m = top_blob.channel(p);

            for (int sh = 0; sh < upscale_factor; sh++)
            {
                for (int sw = 0; sw < upscale_factor; sw++)
                {
                    int q;
                    if (mode == 0)
                        q = p * upscale_factor * upscale_factor + sh * upscale_factor + sw;
                    else // if (mode == 1)
                        q = (sh * upscale_factor + sw) * outc + p;

                    const unsigned short* sptr = bottom_blob_unpacked.channel(q);

                    for (int i = 0; i < h; i++)
                    {
                        unsigned short* outptr = m.row<unsigned short>(i * upscale_factor + sh) + sw;
                        for (int j = 0; j < w; j++)
                        {
                            outptr[0] = sptr[0];

                            sptr++;
                            outptr += upscale_factor;
                        }
                    }
                }
            }
        }

        return 0;
    }

    top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

#if __ARM_NEON
    if (elempack == 8 && out_elempack == 8)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < outc / out_elempack; p++)
        {
            Mat m = top_blob.channel(p);

            const unsigned short* sptr0 = bottom_blob.channel(p * 4);
            const unsigned short* sptr1 = bottom_blob.channel(p * 4 + 1);
            const unsigned short* sptr2 = bottom_blob.channel(p * 4 + 2);
            const unsigned short* sptr3 = bottom_blob.channel(p * 4 + 3);

            for (int i = 0; i < h; i++)
            {
                unsigned short* outptr0 = m.row<unsigned short>(i * 2);
                unsigned short* outptr1 = m.row<unsigned short>(i * 2 + 1);

                int j = 0;
                for (; j + 3 < w; j += 4)
                {
                    uint16x8x4_t _p0 = vld4q_u16(sptr0);
                    uint16x8x4_t _p1 = vld4q_u16(sptr1);
                    uint16x8x4_t _p2 = vld4q_u16(sptr2);
                    uint16x8x4_t _p3 = vld4q_u16(sptr3);

                    uint32x4x2_t _s04_0 = vzipq_u32(vreinterpretq_u32_u16(_p0.val[0]), vreinterpretq_u32_u16(_p1.val[0]));
                    uint32x4x2_t _s15_0 = vzipq_u32(vreinterpretq_u32_u16(_p0.val[1]), vreinterpretq_u32_u16(_p1.val[1]));
                    uint32x4x2_t _s26_0 = vzipq_u32(vreinterpretq_u32_u16(_p0.val[2]), vreinterpretq_u32_u16(_p1.val[2]));
                    uint32x4x2_t _s37_0 = vzipq_u32(vreinterpretq_u32_u16(_p0.val[3]), vreinterpretq_u32_u16(_p1.val[3]));
                    uint32x4x2_t _s04_1 = vzipq_u32(vreinterpretq_u32_u16(_p2.val[0]), vreinterpretq_u32_u16(_p3.val[0]));
                    uint32x4x2_t _s15_1 = vzipq_u32(vreinterpretq_u32_u16(_p2.val[1]), vreinterpretq_u32_u16(_p3.val[1]));
                    uint32x4x2_t _s26_1 = vzipq_u32(vreinterpretq_u32_u16(_p2.val[2]), vreinterpretq_u32_u16(_p3.val[2]));
                    uint32x4x2_t _s37_1 = vzipq_u32(vreinterpretq_u32_u16(_p2.val[3]), vreinterpretq_u32_u16(_p3.val[3]));

                    uint16x8_t _s0_0 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_s04_0.val[0]), vget_low_u32(_s04_1.val[0])));
                    uint16x8_t _s0_1 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_s15_0.val[0]), vget_low_u32(_s15_1.val[0])));
                    uint16x8_t _s0_2 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_s04_0.val[0]), vget_high_u32(_s04_1.val[0])));
                    uint16x8_t _s0_3 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_s15_0.val[0]), vget_high_u32(_s15_1.val[0])));
                    uint16x8_t _s0_4 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_s04_0.val[1]), vget_low_u32(_s04_1.val[1])));
                    uint16x8_t _s0_5 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_s15_0.val[1]), vget_low_u32(_s15_1.val[1])));
                    uint16x8_t _s0_6 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_s04_0.val[1]), vget_high_u32(_s04_1.val[1])));
                    uint16x8_t _s0_7 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_s15_0.val[1]), vget_high_u32(_s15_1.val[1])));
                    uint16x8_t _s1_0 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_s26_0.val[0]), vget_low_u32(_s26_1.val[0])));
                    uint16x8_t _s1_1 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_s37_0.val[0]), vget_low_u32(_s37_1.val[0])));
                    uint16x8_t _s1_2 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_s26_0.val[0]), vget_high_u32(_s26_1.val[0])));
                    uint16x8_t _s1_3 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_s37_0.val[0]), vget_high_u32(_s37_1.val[0])));
                    uint16x8_t _s1_4 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_s26_0.val[1]), vget_low_u32(_s26_1.val[1])));
                    uint16x8_t _s1_5 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_s37_0.val[1]), vget_low_u32(_s37_1.val[1])));
                    uint16x8_t _s1_6 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_s26_0.val[1]), vget_high_u32(_s26_1.val[1])));
                    uint16x8_t _s1_7 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_s37_0.val[1]), vget_high_u32(_s37_1.val[1])));

                    vst1q_u16(outptr0, _s0_0);
                    vst1q_u16(outptr0 + 8, _s0_1);
                    vst1q_u16(outptr0 + 16, _s0_2);
                    vst1q_u16(outptr0 + 24, _s0_3);
                    vst1q_u16(outptr0 + 32, _s0_4);
                    vst1q_u16(outptr0 + 40, _s0_5);
                    vst1q_u16(outptr0 + 48, _s0_6);
                    vst1q_u16(outptr0 + 56, _s0_7);
                    vst1q_u16(outptr1, _s1_0);
                    vst1q_u16(outptr1 + 8, _s1_1);
                    vst1q_u16(outptr1 + 16, _s1_2);
                    vst1q_u16(outptr1 + 24, _s1_3);
                    vst1q_u16(outptr1 + 32, _s1_4);
                    vst1q_u16(outptr1 + 40, _s1_5);
                    vst1q_u16(outptr1 + 48, _s1_6);
                    vst1q_u16(outptr1 + 56, _s1_7);

                    sptr0 += 32;
                    sptr1 += 32;
                    sptr2 += 32;
                    sptr3 += 32;
                    outptr0 += 64;
                    outptr1 += 64;
                }
                for (; j < w; j++)
                {
                    outptr0[0] = sptr0[0];
                    outptr0[1] = sptr0[4];
                    outptr0[2] = sptr1[0];
                    outptr0[3] = sptr1[4];
                    outptr0[4] = sptr2[0];
                    outptr0[5] = sptr2[4];
                    outptr0[6] = sptr3[0];
                    outptr0[7] = sptr3[4];

                    outptr0[8] = sptr0[1];
                    outptr0[9] = sptr0[5];
                    outptr0[10] = sptr1[1];
                    outptr0[11] = sptr1[5];
                    outptr0[12] = sptr2[1];
                    outptr0[13] = sptr2[5];
                    outptr0[14] = sptr3[1];
                    outptr0[15] = sptr3[5];

                    outptr1[0] = sptr0[2];
                    outptr1[1] = sptr0[6];
                    outptr1[2] = sptr1[2];
                    outptr1[3] = sptr1[6];
                    outptr1[4] = sptr2[2];
                    outptr1[5] = sptr2[6];
                    outptr1[6] = sptr3[2];
                    outptr1[7] = sptr3[6];

                    outptr1[8] = sptr0[3];
                    outptr1[9] = sptr0[7];
                    outptr1[10] = sptr1[3];
                    outptr1[11] = sptr1[7];
                    outptr1[12] = sptr2[3];
                    outptr1[13] = sptr2[7];
                    outptr1[14] = sptr3[3];
                    outptr1[15] = sptr3[7];

                    sptr0 += 8;
                    sptr1 += 8;
                    sptr2 += 8;
                    sptr3 += 8;
                    outptr0 += 16;
                    outptr1 += 16;
                }
            }
        }

        return 0;
    }

    if (elempack == 8 && out_elempack == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < outc / out_elempack; p++)
        {
            Mat m = top_blob.channel(p);

            const unsigned short* sptr0 = bottom_blob.channel(p * 2);
            const unsigned short* sptr1 = bottom_blob.channel(p * 2 + 1);

            for (int i = 0; i < h; i++)
            {
                unsigned short* outptr0 = m.row<unsigned short>(i * 2);
                unsigned short* outptr1 = m.row<unsigned short>(i * 2 + 1);

                int j = 0;
                for (; j + 3 < w; j += 4)
                {
                    uint16x8x4_t _p0 = vld4q_u16(sptr0);
                    uint16x8x4_t _p1 = vld4q_u16(sptr1);

                    uint32x4x2_t _s04 = vzipq_u32(vreinterpretq_u32_u16(_p0.val[0]), vreinterpretq_u32_u16(_p1.val[0]));
                    uint32x4x2_t _s15 = vzipq_u32(vreinterpretq_u32_u16(_p0.val[1]), vreinterpretq_u32_u16(_p1.val[1]));
                    uint32x4x2_t _s26 = vzipq_u32(vreinterpretq_u32_u16(_p0.val[2]), vreinterpretq_u32_u16(_p1.val[2]));
                    uint32x4x2_t _s37 = vzipq_u32(vreinterpretq_u32_u16(_p0.val[3]), vreinterpretq_u32_u16(_p1.val[3]));

                    uint16x8_t _s0_0 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_s04.val[0]), vget_low_u32(_s15.val[0])));
                    uint16x8_t _s0_1 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_s04.val[0]), vget_high_u32(_s15.val[0])));
                    uint16x8_t _s0_2 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_s04.val[1]), vget_low_u32(_s15.val[1])));
                    uint16x8_t _s0_3 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_s04.val[1]), vget_high_u32(_s15.val[1])));
                    uint16x8_t _s1_0 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_s26.val[0]), vget_low_u32(_s37.val[0])));
                    uint16x8_t _s1_1 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_s26.val[0]), vget_high_u32(_s37.val[0])));
                    uint16x8_t _s1_2 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_s26.val[1]), vget_low_u32(_s37.val[1])));
                    uint16x8_t _s1_3 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_s26.val[1]), vget_high_u32(_s37.val[1])));

                    vst1q_u16(outptr0, _s0_0);
                    vst1q_u16(outptr0 + 8, _s0_1);
                    vst1q_u16(outptr0 + 16, _s0_2);
                    vst1q_u16(outptr0 + 24, _s0_3);
                    vst1q_u16(outptr1, _s1_0);
                    vst1q_u16(outptr1 + 8, _s1_1);
                    vst1q_u16(outptr1 + 16, _s1_2);
                    vst1q_u16(outptr1 + 24, _s1_3);

                    sptr0 += 32;
                    sptr1 += 32;
                    outptr0 += 32;
                    outptr1 += 32;
                }
                for (; j < w; j++)
                {
                    outptr0[0] = sptr0[0];
                    outptr0[1] = sptr0[4];
                    outptr0[2] = sptr1[0];
                    outptr0[3] = sptr1[4];

                    outptr0[4] = sptr0[1];
                    outptr0[5] = sptr0[5];
                    outptr0[6] = sptr1[1];
                    outptr0[7] = sptr1[5];

                    outptr1[0] = sptr0[2];
                    outptr1[1] = sptr0[6];
                    outptr1[2] = sptr1[2];
                    outptr1[3] = sptr1[6];

                    outptr1[4] = sptr0[3];
                    outptr1[5] = sptr0[7];
                    outptr1[6] = sptr1[3];
                    outptr1[7] = sptr1[7];

                    sptr0 += 8;
                    sptr1 += 8;
                    outptr0 += 8;
                    outptr1 += 8;
                }
            }
        }

        return 0;
    }

    if (elempack == 8 && out_elempack == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < outc / out_elempack / 2; p++)
        {
            Mat m0 = top_blob.channel(p * 2);
            Mat m1 = top_blob.channel(p * 2 + 1);

            const unsigned short* sptr = bottom_blob.channel(p);

            for (int i = 0; i < h; i++)
            {
                unsigned short* outptr00 = m0.row<unsigned short>(i * 2);
                unsigned short* outptr01 = m0.row<unsigned short>(i * 2 + 1);
                unsigned short* outptr10 = m1.row<unsigned short>(i * 2);
                unsigned short* outptr11 = m1.row<unsigned short>(i * 2 + 1);

                int j = 0;
                for (; j + 3 < w; j += 4)
                {
                    uint32x4x4_t _p = vld4q_u32((unsigned int*)sptr);

                    uint16x8_t _s0 = vreinterpretq_u16_u32(_p.val[0]);
                    uint16x8_t _s1 = vreinterpretq_u16_u32(_p.val[1]);
                    uint16x8_t _s2 = vreinterpretq_u16_u32(_p.val[2]);
                    uint16x8_t _s3 = vreinterpretq_u16_u32(_p.val[3]);

                    vst1q_u16(outptr00, _s0);
                    vst1q_u16(outptr01, _s1);
                    vst1q_u16(outptr10, _s2);
                    vst1q_u16(outptr11, _s3);

                    sptr += 32;
                    outptr00 += 8;
                    outptr01 += 8;
                    outptr10 += 8;
                    outptr11 += 8;
                }
                for (; j < w; j++)
                {
                    outptr00[0] = sptr[0];
                    outptr00[1] = sptr[1];
                    outptr01[0] = sptr[2];
                    outptr01[1] = sptr[3];

                    outptr10[0] = sptr[4];
                    outptr10[1] = sptr[5];
                    outptr11[0] = sptr[6];
                    outptr11[1] = sptr[7];

                    sptr += 8;
                    outptr00 += 2;
                    outptr01 += 2;
                    outptr10 += 2;
                    outptr11 += 2;
                }
            }
        }

        return 0;
    }

    if (elempack == 4 && out_elempack == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < outc / out_elempack; p++)
        {
            Mat m = top_blob.channel(p);

            const unsigned short* sptr0 = bottom_blob.channel(p * 4);
            const unsigned short* sptr1 = bottom_blob.channel(p * 4 + 1);
            const unsigned short* sptr2 = bottom_blob.channel(p * 4 + 2);
            const unsigned short* sptr3 = bottom_blob.channel(p * 4 + 3);

            for (int i = 0; i < h; i++)
            {
                unsigned short* outptr0 = m.row<unsigned short>(i * 2);
                unsigned short* outptr1 = m.row<unsigned short>(i * 2 + 1);

                int j = 0;
                for (; j + 3 < w; j += 4)
                {
                    uint16x8_t _p00 = vld1q_u16(sptr0);
                    uint16x8_t _p10 = vld1q_u16(sptr1);
                    uint16x8_t _p20 = vld1q_u16(sptr2);
                    uint16x8_t _p30 = vld1q_u16(sptr3);
                    uint16x8_t _p01 = vld1q_u16(sptr0 + 8);
                    uint16x8_t _p11 = vld1q_u16(sptr1 + 8);
                    uint16x8_t _p21 = vld1q_u16(sptr2 + 8);
                    uint16x8_t _p31 = vld1q_u16(sptr3 + 8);

                    uint32x4x2_t _p0 = vuzpq_u32(vreinterpretq_u32_u16(_p00), vreinterpretq_u32_u16(_p01));
                    uint32x4x2_t _p1 = vuzpq_u32(vreinterpretq_u32_u16(_p10), vreinterpretq_u32_u16(_p11));
                    uint32x4x2_t _p2 = vuzpq_u32(vreinterpretq_u32_u16(_p20), vreinterpretq_u32_u16(_p21));
                    uint32x4x2_t _p3 = vuzpq_u32(vreinterpretq_u32_u16(_p30), vreinterpretq_u32_u16(_p31));

                    uint16x8x4_t _s0;
                    _s0.val[0] = vreinterpretq_u16_u32(_p0.val[0]);
                    _s0.val[1] = vreinterpretq_u16_u32(_p1.val[0]);
                    _s0.val[2] = vreinterpretq_u16_u32(_p2.val[0]);
                    _s0.val[3] = vreinterpretq_u16_u32(_p3.val[0]);

                    uint16x8x4_t _s1;
                    _s1.val[0] = vreinterpretq_u16_u32(_p0.val[1]);
                    _s1.val[1] = vreinterpretq_u16_u32(_p1.val[1]);
                    _s1.val[2] = vreinterpretq_u16_u32(_p2.val[1]);
                    _s1.val[3] = vreinterpretq_u16_u32(_p3.val[1]);

                    vst4q_u16(outptr0, _s0);
                    vst4q_u16(outptr1, _s1);

                    sptr0 += 16;
                    sptr1 += 16;
                    sptr2 += 16;
                    sptr3 += 16;
                    outptr0 += 32;
                    outptr1 += 32;
                }
                for (; j < w; j++)
                {
                    outptr0[0] = sptr0[0];
                    outptr0[1] = sptr1[0];
                    outptr0[2] = sptr2[0];
                    outptr0[3] = sptr3[0];

                    outptr0[4] = sptr0[1];
                    outptr0[5] = sptr1[1];
                    outptr0[6] = sptr2[1];
                    outptr0[7] = sptr3[1];

                    outptr1[0] = sptr0[2];
                    outptr1[1] = sptr1[2];
                    outptr1[2] = sptr2[2];
                    outptr1[3] = sptr3[2];

                    outptr1[4] = sptr0[3];
                    outptr1[5] = sptr1[3];
                    outptr1[6] = sptr2[3];
                    outptr1[7] = sptr3[3];

                    sptr0 += 4;
                    sptr1 += 4;
                    sptr2 += 4;
                    sptr3 += 4;
                    outptr0 += 8;
                    outptr1 += 8;
                }
            }
        }

        return 0;
    }

    if (elempack == 4 && out_elempack == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < outc / out_elempack; p++)
        {
            Mat m = top_blob.channel(p);

            const unsigned short* sptr = bottom_blob.channel(p);

            for (int i = 0; i < h; i++)
            {
                unsigned short* outptr0 = m.row<unsigned short>(i * 2);
                unsigned short* outptr1 = m.row<unsigned short>(i * 2 + 1);

                int j = 0;
                for (; j + 3 < w; j += 4)
                {
                    uint16x8_t _p0 = vld1q_u16(sptr);
                    uint16x8_t _p1 = vld1q_u16(sptr + 8);

                    uint32x4x2_t _s01 = vuzpq_u32(vreinterpretq_u32_u16(_p0), vreinterpretq_u32_u16(_p1));

                    uint16x8_t _s0 = vreinterpretq_u16_u32(_s01.val[0]);
                    uint16x8_t _s1 = vreinterpretq_u16_u32(_s01.val[1]);

                    vst1q_u16(outptr0, _s0);
                    vst1q_u16(outptr1, _s1);

                    sptr += 16;
                    outptr0 += 8;
                    outptr1 += 8;
                }
                for (; j < w; j++)
                {
                    outptr0[0] = sptr[0];
                    outptr0[1] = sptr[1];
                    outptr1[0] = sptr[2];
                    outptr1[1] = sptr[3];

                    sptr += 4;
                    outptr0 += 2;
                    outptr1 += 2;
                }
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outc; p++)
    {
        Mat m = top_blob.channel(p);

        for (int sh = 0; sh < upscale_factor; sh++)
        {
            for (int sw = 0; sw < upscale_factor; sw++)
            {
                int q = p * upscale_factor * upscale_factor + sh * upscale_factor + sw;

                const unsigned short* sptr = bottom_blob.channel(q);

                for (int i = 0; i < h; i++)
                {
                    unsigned short* outptr = m.row<unsigned short>(i * upscale_factor + sh) + sw;
                    for (int j = 0; j < w; j++)
                    {
                        outptr[0] = sptr[0];

                        sptr++;
                        outptr += upscale_factor;
                    }
                }
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/pixelshuffle_arm.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_PIXELSHUFFLE_ARM_H
#define LAYER_PIXELSHUFFLE_ARM_H

#include "pixelshuffle.h"

namespace ncnn {

// ARM-specific variant of the PixelShuffle layer.
// Derives from PixelShuffle and overrides forward(); the member definitions
// are not in this header.
class PixelShuffle_arm : public PixelShuffle
{
public:
    PixelShuffle_arm();

    // Layer entry point: consumes bottom_blob and produces top_blob using the
    // settings in opt. Returns an int status code.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
    // Internal forward path; same signature as forward().
    // NOTE(review): judging by the name this handles 16-bit (bf16 / fp16)
    // storage blobs - confirm against the definition in the .cpp file.
    int forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_PIXELSHUFFLE_ARM_H


================================================
FILE: src/layer/arm/pooling_2x2.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 2x2 max pooling with stride 2 over fp32 data, one channel at a time.
//
// bottom_blob  input; its width (w) and channel count (c) are read here
// top_blob     output; only its w / h are read and its channels are written.
//              It is not allocated in this function, so it is presumably
//              created by the caller with the right shape - TODO confirm.
// opt          only opt.num_threads is used (OpenMP thread count)
//
// Each output value is the max of a 2x2 window taken from two consecutive
// input rows. With __ARM_NEON, outputs are produced four at a time by inline
// assembly and the remainder by scalar code; without it everything is scalar.
static void pooling2x2s2_max_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;

    // After one output row each row pointer has advanced 2 * outw floats.
    // (w - 2 * outw) skips the unused tail of that input row and the extra w
    // skips the row that the other pointer has just consumed.
    const int tailstep = w - 2 * outw + w;

    // channels are independent, so they are distributed across threads
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < inch; q++)
    {
        const float* img0 = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);

        // r0 / r1 walk the top and bottom input rows of the current window row;
        // the second row is addressed as img0 + w, i.e. rows are taken to be
        // w floats apart
        const float* r0 = img0;
        const float* r1 = img0 + w;

        for (int i = 0; i < outh; i++)
        {
#if __ARM_NEON
            // nn = number of 4-output groups handled by the asm loop,
            // remain = leftover outputs handled by the scalar loop
            int nn = outw >> 2;
            int remain = outw - (nn << 2);
#else
            int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
            // Per iteration: load 8 floats from each row, take the element-wise
            // max of the two rows, then a pairwise max (fmaxp) folds adjacent
            // columns, giving 4 outputs that are stored to outptr.
            // nn, r0, r1 and outptr are tied in/out operands, so the asm leaves
            // the pointers advanced and nn at zero. The loop body runs before
            // the counter is tested, hence the nn > 0 guard.
            if (nn > 0)
            {
                asm volatile(
                    "0:                                   \n"
                    "prfm       pldl1keep, [%1, #256]     \n"
                    "prfm       pldl1keep, [%2, #256]     \n"
                    "ld1        {v0.4s, v1.4s}, [%1], #32 \n"
                    "ld1        {v2.4s, v3.4s}, [%2], #32 \n"
                    "fmax       v0.4s, v0.4s, v2.4s       \n"
                    "fmax       v1.4s, v1.4s, v3.4s       \n"
                    "fmaxp      v2.4s, v0.4s, v1.4s       \n"
                    "subs       %w0, %w0, #1              \n"
                    "st1        {v2.4s}, [%3], #16        \n"
                    "bne        0b                        \n"
                    : "=r"(nn),    // %0
                    "=r"(r0),    // %1
                    "=r"(r1),    // %2
                    "=r"(outptr) // %3
                    : "0"(nn),
                    "1"(r0),
                    "2"(r1),
                    "3"(outptr)
                    : "cc", "memory", "v0", "v1", "v2", "v3");
            }
#else
            // 32-bit ARM version of the same loop: vmax combines the two rows,
            // vpmax folds adjacent columns into the 4 outputs in d4-d5.
            if (nn > 0)
            {
                asm volatile(
                    "0:                             \n"
                    "pld        [%1, #256]          \n"
                    "pld        [%2, #256]          \n"
                    "vld1.f32   {d0-d3}, [%1]!      \n"
                    "vld1.f32   {d4-d7}, [%2]!      \n"
                    "vmax.f32   q0, q0, q2          \n"
                    "vmax.f32   q1, q1, q3          \n"
                    "vpmax.f32  d4, d0, d1          \n"
                    "vpmax.f32  d5, d2, d3          \n"
                    "subs       %0, #1              \n"
                    "vst1.f32   {d4-d5}, [%3]!      \n"
                    "bne        0b                  \n"
                    : "=r"(nn),    // %0
                    "=r"(r0),    // %1
                    "=r"(r1),    // %2
                    "=r"(outptr) // %3
                    : "0"(nn),
                    "1"(r0),
                    "2"(r1),
                    "3"(outptr)
                    : "cc", "memory", "q0", "q1", "q2", "q3");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            // scalar path: one output per iteration from a 2x2 window
            for (; remain > 0; remain--)
            {
                float max0 = std::max(r0[0], r0[1]);
                float max1 = std::max(r1[0], r1[1]);

                *outptr = std::max(max0, max1);

                r0 += 2;
                r1 += 2;
                outptr++;
            }

            // move both row pointers to the start of the next pair of input rows
            r0 += tailstep;
            r1 += tailstep;
        }
    }
}


================================================
FILE: src/layer/arm/pooling_2x2_pack4.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// 2x2 max pooling with stride 2 over fp32 data where every pixel is a group
// of 4 floats (the code consumes 8 floats per input row and writes 4 floats
// per output pixel).
//
// bottom_blob  input; its width (w) and channel count (c) are read here
// top_blob     output; only its w / h are read and its channels are written.
//              It is not allocated in this function, so it is presumably
//              created by the caller with the right shape - TODO confirm.
// opt          only opt.num_threads is used (OpenMP thread count)
//
// The max is taken lane-wise, so the 4 lanes of a pixel never mix. NEON
// assembly and intrinsics are used unconditionally; there is no scalar
// fallback in this function.
static void pooling2x2s2_max_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;

    // Same row-skip distance as the unpacked kernel (rest of the current row
    // plus one whole row), scaled by 4 because every pixel holds 4 floats.
    const int tailstep = (w - 2 * outw + w) * 4;

    // channels are independent, so they are distributed across threads
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < inch; q++)
    {
        const Mat img0 = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);

        // r0 / r1 walk the top and bottom input rows of the current window row
        const float* r0 = img0.row(0);
        const float* r1 = img0.row(1);

        for (int i = 0; i < outh; i++)
        {
            int j = 0;

            // main loop: 4 output pixels per iteration, i.e. 8 input pixels
            // (32 floats) from each of the two rows
            for (; j + 3 < outw; j += 4)
            {
#if __aarch64__
                // Load 8 vectors from each row, max adjacent vectors to fold
                // the two columns of each window, then max the two rows and
                // store 4 result vectors. outptr, r0 and r1 are tied in/out
                // operands and come back post-incremented.
                asm volatile(
                    "prfm   pldl1keep, [%1, #512]   \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"

                    "fmax   v0.4s, v0.4s, v1.4s     \n"
                    "fmax   v2.4s, v2.4s, v3.4s     \n"

                    "prfm   pldl1keep, [%1, #512]   \n"
                    "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"

                    "fmax   v4.4s, v4.4s, v5.4s     \n"
                    "fmax   v6.4s, v6.4s, v7.4s     \n"

                    "prfm   pldl1keep, [%2, #512]   \n"
                    "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%2], #64 \n"

                    "fmax   v16.4s, v16.4s, v17.4s  \n"
                    "fmax   v18.4s, v18.4s, v19.4s  \n"

                    "prfm   pldl1keep, [%2, #512]   \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"

                    "fmax   v20.4s, v20.4s, v21.4s  \n"
                    "fmax   v22.4s, v22.4s, v23.4s  \n"

                    "fmax   v0.4s, v0.4s, v16.4s    \n"
                    "fmax   v1.4s, v2.4s, v18.4s    \n"
                    "fmax   v2.4s, v4.4s, v20.4s    \n"
                    "fmax   v3.4s, v6.4s, v22.4s    \n"

                    "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1)      // %2
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else  // __aarch64__
                // 32-bit ARM version of the same sequence; it uses all of
                // q0-q15, which are declared as clobbers below.
                asm volatile(
                    "pld        [%1, #512]      \n"
                    "vldm       %1!, {d0-d7}    \n"

                    "vmax.f32   q0, q0, q1      \n"
                    "vmax.f32   q2, q2, q3      \n"

                    "pld        [%1, #512]      \n"
                    "vldm       %1!, {d8-d15}   \n"

                    "vmax.f32   q4, q4, q5      \n"
                    "vmax.f32   q6, q6, q7      \n"

                    "pld        [%2, #512]      \n"
                    "vldm       %2!, {d16-d23}  \n"

                    "vmax.f32   q8, q8, q9      \n"
                    "vmax.f32   q10, q10, q11   \n"

                    "pld        [%2, #512]      \n"
                    "vldm       %2!, {d24-d31}  \n"

                    "vmax.f32   q12, q12, q13   \n"
                    "vmax.f32   q14, q14, q15   \n"

                    "vmax.f32   q0, q0, q8      \n"
                    "vmax.f32   q1, q2, q10     \n"
                    "vmax.f32   q2, q4, q12     \n"
                    "vmax.f32   q3, q6, q14     \n"

                    "vstm       %0!, {d0-d7}    \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1)      // %2
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
            }
            // tail: one output pixel per iteration using intrinsics
            for (; j < outw; j++)
            {
                // two horizontally adjacent pixels from each of the two rows
                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);

                // fold columns first, then rows
                float32x4_t _max0 = vmaxq_f32(_r00, _r01);
                float32x4_t _max1 = vmaxq_f32(_r10, _r11);
                float32x4_t _max = vmaxq_f32(_max0, _max1);

                vst1q_f32(outptr, _max);

                r0 += 8;
                r1 += 8;
                outptr += 4;
            }

            // move both row pointers to the start of the next pair of input rows
            r0 += tailstep;
            r1 += tailstep;
        }
    }
}


================================================
FILE: src/layer/arm/pooling_2x2_pack4_bf16s.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void pooling2x2s2_max_pack4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int tailstep = (w - 2 * outw + w) * 4;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < inch; q++)
    {
        const Mat img0 = bottom_blob.channel(q);
        unsigned short* outptr = top_blob.channel(q);

        const unsigned short* r0 = img0.row<const unsigned short>(0);
        const unsigned short* r1 = img0.row<const unsigned short>(1);

        for (int i = 0; i < outh; i++)
        {
            int j = 0;

            for (; j + 3 < outw; j += 4)
            {
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%1, #256]   \n"
                    "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n"

                    "shll   v0.4s, v0.4h, #16       \n"
                    "shll   v1.4s, v1.4h, #16       \n"
                    "shll   v2.4s, v2.4h, #16       \n"
                    "shll   v3.4s, v3.4h, #16       \n"

                    "fmax   v0.4s, v0.4s, v1.4s     \n"
                    "fmax   v2.4s, v2.4s, v3.4s     \n"

                    "prfm   pldl1keep, [%1, #256]   \n"
                    "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%1], #32 \n"

                    "shll   v4.4s, v4.4h, #16       \n"
                    "shll   v5.4s, v5.4h, #16       \n"
                    "shll   v6.4s, v6.4h, #16       \n"
                    "shll   v7.4s, v7.4h, #16       \n"

                    "fmax   v4.4s, v4.4s, v5.4s     \n"
                    "fmax   v6.4s, v6.4s, v7.4s     \n"

                    "prfm   pldl1keep, [%2, #256]   \n"
                    "ld1    {v16.4h, v17.4h, v18.4h, v19.4h}, [%2], #32 \n"

                    "shll   v16.4s, v16.4h, #16     \n"
                    "shll   v17.4s, v17.4h, #16     \n"
                    "shll   v18.4s, v18.4h, #16     \n"
                    "shll   v19.4s, v19.4h, #16     \n"

                    "fmax   v16.4s, v16.4s, v17.4s  \n"
                    "fmax   v18.4s, v18.4s, v19.4s  \n"

                    "prfm   pldl1keep, [%2, #256]   \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%2], #32 \n"

                    "shll   v20.4s, v20.4h, #16     \n"
                    "shll   v21.4s, v21.4h, #16     \n"
                    "shll   v22.4s, v22.4h, #16     \n"
                    "shll   v23.4s, v23.4h, #16     \n"

                    "fmax   v20.4s, v20.4s, v21.4s  \n"
                    "fmax   v22.4s, v22.4s, v23.4s  \n"

                    "fmax   v0.4s, v0.4s, v16.4s    \n"
                    "fmax   v1.4s, v2.4s, v18.4s    \n"
                    "fmax   v2.4s, v4.4s, v20.4s    \n"
                    "fmax   v3.4s, v6.4s, v22.4s    \n"

                    "shrn   v0.4h, v0.4s, #16       \n"
                    "shrn   v1.4h, v1.4s, #16       \n"
                    "shrn   v2.4h, v2.4s, #16       \n"
                    "shrn   v3.4h, v3.4s, #16       \n"

                    "st1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%0], #32 \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1)      // %2
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else  // __aarch64__
                asm volatile(
                    "pld        [%1, #256]      \n"
                    "vld1.u16   {d4-d7}, [%1]!  \n"

                    "vshll.u16  q0, d4, #16     \n"
                    "vshll.u16  q1, d5, #16     \n"
                    "vshll.u16  q2, d6, #16     \n"
                    "vshll.u16  q3, d7, #16     \n"

                    "vmax.f32   q0, q0, q1      \n"
                    "vmax.f32   q2, q2, q3      \n"

                    "pld        [%1, #256]      \n"
                    "vld1.u16   {d12-d15}, [%1]! \n"

                    "vshll.u16  q4, d12, #16    \n"
                    "vshll.u16  q5, d13, #16    \n"
                    "vshll.u16  q6, d14, #16    \n"
                    "vshll.u16  q7, d15, #16    \n"

                    "vmax.f32   q4, q4, q5      \n"
                    "vmax.f32   q6, q6, q7      \n"

                    "pld        [%2, #256]      \n"
                    "vld1.u16   {d20-d23}, [%2]! \n"

                    "vshll.u16  q8, d20, #16    \n"
                    "vshll.u16  q9, d21, #16    \n"
                    "vshll.u16  q10, d22, #16   \n"
                    "vshll.u16  q11, d23, #16   \n"

                    "vmax.f32   q8, q8, q9      \n"
                    "vmax.f32   q10, q10, q11   \n"

                    "pld        [%2, #256]      \n"
                    "vld1.u16   {d28-d31}, [%2]! \n"

                    "vshll.u16  q12, d28, #16   \n"
                    "vshll.u16  q13, d29, #16   \n"
                    "vshll.u16  q14, d30, #16   \n"
                    "vshll.u16  q15, d31, #16   \n"

                    "vmax.f32   q12, q12, q13   \n"
                    "vmax.f32   q14, q14, q15   \n"

                    "vmax.f32   q0, q0, q8      \n"
                    "vmax.f32   q1, q2, q10     \n"
                    "vmax.f32   q2, q4, q12     \n"
                    "vmax.f32   q3, q6, q14     \n"

                    "vshrn.u32  d0, q0, #16     \n"
                    "vshrn.u32  d1, q1, #16     \n"
                    "vshrn.u32  d2, q2, #16     \n"
                    "vshrn.u32  d3, q3, #16     \n"

                    "vst1.u16   {d0-d3}, [%0]!  \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1)      // %2
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
            }
            for (; j < outw; j++)
            {
                float32x4_t _r00 = bfloat2float(vld1_u16(r0));
                float32x4_t _r01 = bfloat2float(vld1_u16(r0 + 4));
                float32x4_t _r10 = bfloat2float(vld1_u16(r1));
                float32x4_t _r11 = bfloat2float(vld1_u16(r1 + 4));

                float32x4_t _max0 = vmaxq_f32(_r00, _r01);
                float32x4_t _max1 = vmaxq_f32(_r10, _r11);
                float32x4_t _max = vmaxq_f32(_max0, _max1);

                vst1_u16(outptr, float2bfloat(_max));

                r0 += 8;
                r1 += 8;
                outptr += 4;
            }

            r0 += tailstep;
            r1 += tailstep;
        }
    }
}


================================================
FILE: src/layer/arm/pooling_3x3.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Max pooling with a 3x3 window and a stride of 2, applied to every channel of a float blob.
//
// bottom_blob: input. Each channel is walked through a const float* whose rows are
//              w floats apart.
// top_blob:    output. outw x outh floats are written per channel. Its dimensions are
//              only read here, never set, so it must be sized before the call.
// opt:         only opt.num_threads is used (OpenMP loop over channels).
//
// Output (x, y) is the maximum of input columns 2*x .. 2*x+2 on input rows 2*y .. 2*y+2.
// No bounds checks are performed.
// NOTE(review): presumably the caller pads the input so that w >= 2 * outw + 1 and at
// least 2 * outh + 1 rows exist - confirm against the call site.
static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;

    // One output row moves the row pointers forward by 2 * outw floats.
    // (w - 2 * outw) brings them to the end of that input row and the extra w
    // skips the next row entirely, which gives the vertical stride of 2.
    const int tailstep = w - 2 * outw + w;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < inch; q++)
    {
        const float* img0 = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);

        // the three input rows covered by the current output row
        const float* r0 = img0;
        const float* r1 = img0 + w;
        const float* r2 = img0 + w * 2;

        for (int i = 0; i < outh; i++)
        {
            // nn     = number of groups of 4 outputs produced by the assembly loop
            // remain = outputs left over for the scalar loop below
#if __ARM_NEON
            int nn = outw >> 2;
            int remain = outw - (nn << 2);
#else
            int remain = outw;
#endif // __ARM_NEON

            // Both assembly variants are software pipelined: 8 floats per row are
            // loaded before the loop and every iteration loads the following 8
            // before it stores its 4 results. Of the block loaded by the last
            // iteration only the first float per row is used, and the pointers
            // are rewound by 32 bytes afterwards.
            // NOTE(review): this means up to 7 floats beyond the last column the
            // vector loop needs are read from each row - presumably covered by
            // padding / allocation slack, confirm.
#if __ARM_NEON
#if __aarch64__
            if (nn > 0)
            {
                asm volatile(
                    // prologue: ld2 de-interleaves, so v0/v2/v4 = columns 0 2 4 6
                    // and v1/v3/v5 = columns 1 3 5 7 of r0/r1/r2
                    "prfm       pldl1keep, [%1, #256]       \n"
                    "ld2        {v0.4s, v1.4s}, [%1], #32   \n"
                    "prfm       pldl1keep, [%2, #256]       \n"
                    "ld2        {v2.4s, v3.4s}, [%2], #32   \n"
                    "prfm       pldl1keep, [%3, #256]       \n"
                    "ld2        {v4.4s, v5.4s}, [%3], #32   \n"
                    "0:                                     \n"

                    // look-ahead for r0: v6 = columns 8 10 12 14, v7 = 9 11 13 15
                    "prfm       pldl1keep, [%1, #256]       \n"
                    "ld2        {v6.4s, v7.4s}, [%1], #32   \n"

                    // max of the first two window columns for r0 and r1
                    "fmax       v12.4s, v0.4s, v1.4s        \n"
                    "fmax       v13.4s, v2.4s, v3.4s        \n"

                    // look-ahead for r1
                    "prfm       pldl1keep, [%2, #256]       \n"
                    "ld2        {v8.4s, v9.4s}, [%2], #32   \n"

                    // same for r2; ext shifts the even lanes by one float so that
                    // v0 = columns 2 4 6 8, the third column of each window
                    "fmax       v14.4s, v4.4s, v5.4s        \n"
                    "ext        v0.16b, v0.16b, v6.16b, #4  \n"

                    // look-ahead for r2
                    "prfm       pldl1keep, [%3, #256]       \n"
                    "ld2        {v10.4s, v11.4s}, [%3], #32 \n"

                    "ext        v2.16b, v2.16b, v8.16b, #4  \n"

                    // fold the third column into each row maximum
                    "fmax       v12.4s, v12.4s, v0.4s       \n"
                    "ext        v4.16b, v4.16b, v10.16b, #4 \n"

                    "fmax       v13.4s, v13.4s, v2.4s       \n"
                    "fmax       v14.4s, v14.4s, v4.4s       \n"
                    // combine the three rows into v12
                    "fmax       v12.4s, v12.4s, v13.4s      \n"

                    // orr x, y, y is a register move: the look-ahead data becomes
                    // the current data of the next iteration
                    "orr        v0.16b, v6.16b, v6.16b      \n"
                    "orr        v1.16b, v7.16b, v7.16b      \n"
                    "fmax       v12.4s, v12.4s, v14.4s      \n"

                    "orr        v2.16b, v8.16b, v8.16b      \n"
                    "orr        v3.16b, v9.16b, v9.16b      \n"
                    "orr        v4.16b, v10.16b, v10.16b    \n"
                    "orr        v5.16b, v11.16b, v11.16b    \n"

                    // store 4 results and loop while nn != 0
                    "subs       %w0, %w0, #1                \n"
                    "st1        {v12.4s}, [%4], #16         \n"
                    "bne        0b                          \n"
                    // undo the last look-ahead so r0/r1/r2 point at the first
                    // column that has not been turned into an output yet
                    "sub        %1, %1, #32                 \n"
                    "sub        %2, %2, #32                 \n"
                    "sub        %3, %3, #32                 \n"
                    // every operand is read and written: the inputs are tied to
                    // the outputs through the "0".."4" matching constraints
                    : "=r"(nn),    // %0
                    "=r"(r0),    // %1
                    "=r"(r1),    // %2
                    "=r"(r2),    // %3
                    "=r"(outptr) // %4
                    : "0"(nn),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "4"(outptr)
                    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14");
            }
#else
            if (nn > 0)
            {
                // ARMv7 version of the loop above; q0..q14 play the roles of v0..v14
                asm volatile(
                    "pld        [%1, #256]          \n"
                    "vld2.f32   {d0-d3}, [%1]!      \n" // q0 = 0 2 4 6  q1 = 1 3 5 7
                    "pld        [%2, #256]          \n"
                    "vld2.f32   {d4-d7}, [%2]!      \n"
                    "pld        [%3, #256]          \n"
                    "vld2.f32   {d8-d11}, [%3]!     \n"
                    "0:                             \n"
                    "pld        [%1, #256]          \n"
                    "vld2.f32   {d12-d15}, [%1]!    \n" // q6 = 8 10 12 14  q7 = 9 11 13 15

                    // max of the first two window columns for r0 and r1
                    "vmax.f32   q12, q0, q1         \n"
                    "vmax.f32   q13, q2, q3         \n"

                    // look-ahead for r1
                    "pld        [%2, #256]          \n"
                    "vld2.f32   {d16-d19}, [%2]!    \n"

                    // vext shifts the even lanes by one float: q0 = 2 4 6 8
                    "vmax.f32   q14, q4, q5         \n"
                    "vext.32    q0, q0, q6, #1      \n"

                    // look-ahead for r2
                    "pld        [%3, #256]          \n"
                    "vld2.f32   {d20-d23}, [%3]!    \n"

                    "vext.32    q2, q2, q8, #1      \n"

                    // fold the third column into each row maximum
                    "vmax.f32   q12, q12, q0        \n"
                    "vext.32    q4, q4, q10, #1     \n"

                    "vmax.f32   q13, q13, q2        \n"
                    "vmax.f32   q14, q14, q4        \n"
                    // combine the three rows into q12
                    "vmax.f32   q12, q12, q13       \n"

                    // vorr x, y, y is a register move: look-ahead -> current
                    "vorr       q0, q6, q6          \n"
                    "vorr       q1, q7, q7          \n"
                    "vmax.f32   q12, q12, q14       \n"

                    "vorr       q2, q8, q8          \n"
                    "vorr       q3, q9, q9          \n"
                    "vorr       q4, q10, q10        \n"
                    "vorr       q5, q11, q11        \n"

                    // store 4 results (q12 = d24-d25) and loop while nn != 0
                    "subs       %0, #1              \n"
                    "vst1.f32   {d24-d25}, [%4]!    \n"
                    "bne        0b                  \n"
                    // undo the last look-ahead
                    "sub        %1, #32             \n"
                    "sub        %2, #32             \n"
                    "sub        %3, #32             \n"
                    : "=r"(nn),    // %0
                    "=r"(r0),    // %1
                    "=r"(r1),    // %2
                    "=r"(r2),    // %3
                    "=r"(outptr) // %4
                    : "0"(nn),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2),
                    "4"(outptr)
                    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14");
            }
#endif // __aarch64__
#endif // __ARM_NEON
            // scalar path: the leftover outputs, or the whole row without NEON
            for (; remain > 0; remain--)
            {
                // horizontal maximum of the three window columns, per row
                float max0 = std::max(std::max(r0[0], r0[1]), r0[2]);
                float max1 = std::max(std::max(r1[0], r1[1]), r1[2]);
                float max2 = std::max(std::max(r2[0], r2[1]), r2[2]);

                *outptr = std::max(std::max(max0, max1), max2);

                // horizontal stride of 2
                r0 += 2;
                r1 += 2;
                r2 += 2;
                outptr++;
            }

            // move all three row pointers to the start of the row two below
            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
        }
    }
}


================================================
FILE: src/layer/arm/pooling_3x3_pack4.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Max pooling with a 3x3 window and a stride of 2 for float data stored as
// 4 floats per pixel (one 128-bit vector per pixel, see the pointer strides below).
//
// bottom_blob: input. w is counted in pixels; rows are fetched with img0.row().
// top_blob:    output. outw x outh pixels are written per channel. Its dimensions are
//              only read here, never set, so it must be sized before the call.
// opt:         only opt.num_threads is used (OpenMP loop over channels).
//
// The maximum is taken lane by lane, so the 4 floats of a pixel never mix.
// Output pixel (x, y) covers input pixels 2*x .. 2*x+2 on input rows 2*y .. 2*y+2.
// No bounds checks are performed.
// NOTE(review): presumably the caller pads the input so that w >= 2 * outw + 1 and at
// least 2 * outh + 1 rows exist - confirm against the call site.
static void pooling3x3s2_max_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;

    // One output row consumes 2 * outw pixels. (w - 2 * outw) finishes the input
    // row and the extra w skips the next one (vertical stride 2). The factor 4
    // converts pixels to floats.
    const int tailstep = (w - 2 * outw + w) * 4;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < inch; q++)
    {
        const Mat img0 = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);

        // the three input rows covered by the current output row
        const float* r0 = img0.row(0);
        const float* r1 = img0.row(1);
        const float* r2 = img0.row(2);

        for (int i = 0; i < outh; i++)
        {
            int j = 0;

            // 4 output pixels per step: reads input pixels 0..8 of each row and
            // leaves the row pointers on pixel 8, the first pixel of the next step
            for (; j + 3 < outw; j += 4)
            {
#if __aarch64__
                // Per row: horizontal 3-max for the windows starting at pixels
                // 0, 2, 4 and 6. The three rows are then combined vertically.
                asm volatile(
                    // r0 pixels 0..3
                    "prfm   pldl1keep, [%1, #512]   \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"

                    "fmax   v16.4s, v0.4s, v1.4s    \n"
                    "fmax   v17.4s, v2.4s, v3.4s    \n"

                    // r0 pixels 4..7
                    "prfm   pldl1keep, [%1, #512]   \n"
                    "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"

                    "fmax   v18.4s, v4.4s, v5.4s    \n"
                    "fmax   v19.4s, v6.4s, v7.4s    \n"

                    // r0 pixel 8, no post-increment
                    "ld1    {v8.4s}, [%1]           \n"

                    // v20..v23 = r0 maxima of pixels {0,1,2} {2,3,4} {4,5,6} {6,7,8}
                    "fmax   v20.4s, v16.4s, v2.4s   \n"
                    "fmax   v21.4s, v17.4s, v4.4s   \n"

                    // v0..v3 are reloaded with r1 pixels 0..3 while v6/v8 still
                    // hold r0 data for the two fmax that follow
                    "prfm   pldl1keep, [%2, #512]   \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"

                    "fmax   v22.4s, v18.4s, v6.4s   \n"
                    "fmax   v23.4s, v19.4s, v8.4s   \n"

                    // r1 pixels 4..7
                    "prfm   pldl1keep, [%2, #512]   \n"
                    "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n"

                    "fmax   v16.4s, v0.4s, v1.4s    \n"
                    "fmax   v17.4s, v2.4s, v3.4s    \n"

                    "fmax   v18.4s, v4.4s, v5.4s    \n"
                    "fmax   v19.4s, v6.4s, v7.4s    \n"

                    // r1 pixel 8
                    "ld1    {v8.4s}, [%2]           \n"

                    // v24..v27 = the same four window maxima for r1
                    "fmax   v24.4s, v16.4s, v2.4s   \n"
                    "fmax   v25.4s, v17.4s, v4.4s   \n"

                    // r2 pixels 0..3
                    "prfm   pldl1keep, [%3, #512]   \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"

                    "fmax   v26.4s, v18.4s, v6.4s   \n"
                    "fmax   v27.4s, v19.4s, v8.4s   \n"

                    // r2 pixels 4..7
                    "prfm   pldl1keep, [%3, #512]   \n"
                    "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n"

                    "fmax   v16.4s, v0.4s, v1.4s    \n"
                    "fmax   v17.4s, v2.4s, v3.4s    \n"

                    "fmax   v18.4s, v4.4s, v5.4s    \n"
                    "fmax   v19.4s, v6.4s, v7.4s    \n"

                    // r2 pixel 8
                    "ld1    {v8.4s}, [%3]           \n"

                    // v28..v31 = the same four window maxima for r2
                    "fmax   v28.4s, v16.4s, v2.4s   \n"
                    "fmax   v29.4s, v17.4s, v4.4s   \n"
                    "fmax   v30.4s, v18.4s, v6.4s   \n"
                    "fmax   v31.4s, v19.4s, v8.4s   \n"

                    // vertical combine: r0 with r1 ...
                    "fmax   v20.4s, v20.4s, v24.4s  \n"
                    "fmax   v21.4s, v21.4s, v25.4s  \n"
                    "fmax   v22.4s, v22.4s, v26.4s  \n"
                    "fmax   v23.4s, v23.4s, v27.4s  \n"

                    // ... and with r2
                    "fmax   v20.4s, v20.4s, v28.4s  \n"
                    "fmax   v21.4s, v21.4s, v29.4s  \n"
                    "fmax   v22.4s, v22.4s, v30.4s  \n"
                    "fmax   v23.4s, v23.4s, v31.4s  \n"

                    // store 4 output pixels
                    "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"

                    // every pointer is read and written: the inputs are tied to
                    // the outputs through the "0".."3" matching constraints
                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1),     // %2
                    "=r"(r2)      // %3
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
                // ARMv7 does the reduction in the opposite order: first the
                // vertical max over the three rows for each of the 9 input pixels,
                // then the horizontal 3-max per window.
                // NOTE(review): the ":128" qualifiers assert 16-byte aligned
                // addresses - presumably guaranteed by the blob layout, confirm.
                asm volatile(
                    // q0..q3 = r0 pixels 0..3
                    "pld        [%1, #512]      \n"
                    "vldm       %1!, {d0-d7}    \n"

                    // q4..q7 = r1 pixels 0..3
                    "pld        [%2, #512]      \n"
                    "vldm       %2!, {d8-d15}   \n"

                    "vmax.f32   q0, q0, q4      \n"
                    "vmax.f32   q1, q1, q5      \n"

                    // q8..q11 = r2 pixels 0..3
                    "pld        [%3, #512]      \n"
                    "vldm       %3!, {d16-d23}  \n"

                    "vmax.f32   q2, q2, q6      \n"
                    "vmax.f32   q3, q3, q7      \n"

                    "vmax.f32   q0, q0, q8      \n"
                    "vmax.f32   q1, q1, q9      \n"

                    // q4..q7 = r0 pixels 4..7 (the r1 data they held is consumed)
                    "pld        [%1, #512]      \n"
                    "vldm       %1!, {d8-d15}   \n"

                    // q0..q3 now hold the 3-row maxima of pixels 0..3
                    "vmax.f32   q2, q2, q10     \n"
                    "vmax.f32   q3, q3, q11     \n"

                    // q8..q11 = r1 pixels 4..7
                    "pld        [%2, #512]      \n"
                    "vldm       %2!, {d16-d23}  \n"

                    "vmax.f32   q4, q4, q8      \n"
                    "vmax.f32   q5, q5, q9      \n"

                    // q12..q15 = r2 pixels 4..7
                    "pld        [%3, #512]      \n"
                    "vldm       %3!, {d24-d31}  \n"

                    "vmax.f32   q6, q6, q10     \n"
                    "vmax.f32   q7, q7, q11     \n"

                    "vmax.f32   q4, q4, q12     \n"
                    "vmax.f32   q5, q5, q13     \n"

                    // q12/q13 are free again: pixel 8 of r0 and r1, no writeback
                    "vld1.f32   {d24-d25}, [%1 :128] \n"
                    "vld1.f32   {d26-d27}, [%2 :128] \n"

                    // q14/q15 (r2 pixels 6 and 7) have to be consumed here, before
                    // q14 is overwritten by the pixel 8 load just below;
                    // q4..q7 then hold the 3-row maxima of pixels 4..7
                    "vmax.f32   q6, q6, q14     \n"
                    "vmax.f32   q7, q7, q15     \n"

                    // pixel 8 of r2
                    "vld1.f32   {d28-d29}, [%3 :128] \n"

                    // q8 = 3-row maximum of pixel 8
                    "vmax.f32   q8, q12, q13    \n"
                    "vmax.f32   q8, q8, q14     \n"

                    // horizontal step: pairs {0,1} {2,3} {4,5} {6,7} ...
                    "vmax.f32   q12, q0, q1     \n"
                    "vmax.f32   q13, q2, q3     \n"
                    "vmax.f32   q14, q4, q5     \n"
                    "vmax.f32   q15, q6, q7     \n"

                    // ... extended to windows {0,1,2} {2,3,4} {4,5,6} {6,7,8}
                    "vmax.f32   q12, q12, q2    \n"
                    "vmax.f32   q13, q13, q4    \n"
                    "vmax.f32   q14, q14, q6    \n"
                    "vmax.f32   q15, q15, q8    \n"

                    // store 4 output pixels
                    "vstm       %0!, {d24-d31}  \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1),     // %2
                    "=r"(r2)      // %3
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
            }
            // 2 output pixels per step: reads input pixels 0..4 of each row and
            // leaves the row pointers on pixel 4
            for (; j + 1 < outw; j += 2)
            {
#if __aarch64__
                // vertical max over the three rows first, then horizontal 3-max
                asm volatile(
                    // r0 pixels 0..3
                    "prfm   pldl1keep, [%1, #512]   \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"

                    // r1 pixels 0..3
                    "prfm   pldl1keep, [%2, #512]   \n"
                    "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n"

                    "fmax   v16.4s, v0.4s, v4.4s    \n"
                    "fmax   v17.4s, v1.4s, v5.4s    \n"

                    // r2 pixels 0..3
                    "prfm   pldl1keep, [%3, #512]   \n"
                    "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%3], #64 \n"

                    "fmax   v18.4s, v2.4s, v6.4s    \n"
                    "fmax   v19.4s, v3.4s, v7.4s    \n"

                    // pixel 4 of r0 (v0 is free again), no post-increment
                    "ld1    {v0.4s}, [%1]           \n"

                    // v16..v19 become the 3-row maxima of pixels 0..3
                    "fmax   v16.4s, v16.4s, v20.4s  \n"
                    "fmax   v17.4s, v17.4s, v21.4s  \n"

                    // pixel 4 of r1
                    "ld1    {v1.4s}, [%2]           \n"

                    "fmax   v18.4s, v18.4s, v22.4s  \n"
                    "fmax   v19.4s, v19.4s, v23.4s  \n"

                    // pixel 4 of r2
                    "ld1    {v2.4s}, [%3]           \n"

                    "fmax   v3.4s, v0.4s, v1.4s     \n"

                    // pairs {0,1} and {2,3}
                    "fmax   v20.4s, v16.4s, v17.4s  \n"
                    "fmax   v21.4s, v18.4s, v19.4s  \n"

                    // v3 = 3-row maximum of pixel 4
                    "fmax   v3.4s, v3.4s, v2.4s     \n"

                    // windows {0,1,2} and {2,3,4}
                    "fmax   v20.4s, v20.4s, v18.4s  \n"
                    "fmax   v21.4s, v21.4s, v3.4s   \n"

                    // store 2 output pixels
                    "st1    {v20.4s, v21.4s}, [%0], #32 \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1),     // %2
                    "=r"(r2)      // %3
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else  // __aarch64__
                // same scheme as the AArch64 variant above
                asm volatile(
                    // q0..q3 = r0 pixels 0..3
                    "pld        [%1, #512]      \n"
                    "vldm       %1!, {d0-d7}    \n"

                    // q4..q7 = r1 pixels 0..3
                    "pld        [%2, #512]      \n"
                    "vldm       %2!, {d8-d15}   \n"

                    "vmax.f32   q12, q0, q4     \n"
                    "vmax.f32   q13, q1, q5     \n"

                    // q8..q11 = r2 pixels 0..3
                    "pld        [%3, #512]      \n"
                    "vldm       %3!, {d16-d23}  \n"

                    "vmax.f32   q14, q2, q6     \n"
                    "vmax.f32   q15, q3, q7     \n"

                    // pixel 4 of r0 (q0 is free again), no writeback
                    "vld1.f32   {d0-d1}, [%1 :128] \n"

                    // q12..q15 become the 3-row maxima of pixels 0..3
                    "vmax.f32   q12, q12, q8    \n"
                    "vmax.f32   q13, q13, q9    \n"

                    // pixel 4 of r1
                    "vld1.f32   {d2-d3}, [%2 :128] \n"

                    "vmax.f32   q14, q14, q10   \n"
                    "vmax.f32   q15, q15, q11   \n"

                    // pixel 4 of r2
                    "vld1.f32   {d4-d5}, [%3 :128] \n"

                    "vmax.f32   q3, q0, q1      \n"

                    // pairs {0,1} and {2,3}
                    "vmax.f32   q4, q12, q13    \n"
                    "vmax.f32   q5, q14, q15    \n"

                    // q3 = 3-row maximum of pixel 4
                    "vmax.f32   q3, q3, q2      \n"

                    // windows {0,1,2} and {2,3,4}
                    "vmax.f32   q4, q4, q14     \n"
                    "vmax.f32   q5, q5, q3      \n"

                    // store 2 output pixels (q4, q5 = d8-d11)
                    "vst1.f32   {d8-d11}, [%0 :128]! \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1),     // %2
                    "=r"(r2)      // %3
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
            }
            // last output pixel of the row (if any), written with intrinsics
            for (; j < outw; j++)
            {
                // _rYX = pixel X of row Y inside the 3x3 window
                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r02 = vld1q_f32(r0 + 8);
                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r12 = vld1q_f32(r1 + 8);
                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r22 = vld1q_f32(r2 + 8);

                // horizontal maximum per row
                float32x4_t _max0 = vmaxq_f32(vmaxq_f32(_r00, _r01), _r02);
                float32x4_t _max1 = vmaxq_f32(vmaxq_f32(_r10, _r11), _r12);
                float32x4_t _max2 = vmaxq_f32(vmaxq_f32(_r20, _r21), _r22);

                // vertical maximum over the three rows
                float32x4_t _max = vmaxq_f32(vmaxq_f32(_max0, _max1), _max2);

                vst1q_f32(outptr, _max);

                // stride 2 in pixels = 8 floats on the input, 1 pixel on the output
                r0 += 8;
                r1 += 8;
                r2 += 8;
                outptr += 4;
            }

            // move all three row pointers to the start of the row two below
            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
        }
    }
}


================================================
FILE: src/layer/arm/pooling_3x3_pack4_bf16s.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void pooling3x3s2_max_pack4_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int tailstep = (w - 2 * outw + w) * 4;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < inch; q++)
    {
        const Mat img0 = bottom_blob.channel(q);
        unsigned short* outptr = top_blob.channel(q);

        const unsigned short* r0 = img0.row<const unsigned short>(0);
        const unsigned short* r1 = img0.row<const unsigned short>(1);
        const unsigned short* r2 = img0.row<const unsigned short>(2);

        for (int i = 0; i < outh; i++)
        {
            int j = 0;

            for (; j + 3 < outw; j += 4)
            {
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%1, #256]   \n"
                    "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n"

                    "shll   v0.4s, v0.4h, #16       \n"
                    "shll   v1.4s, v1.4h, #16       \n"
                    "shll   v2.4s, v2.4h, #16       \n"
                    "shll   v3.4s, v3.4h, #16       \n"

                    "fmax   v16.4s, v0.4s, v1.4s    \n"
                    "fmax   v17.4s, v2.4s, v3.4s    \n"

                    "prfm   pldl1keep, [%1, #256]   \n"
                    "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%1], #32 \n"

                    "shll   v4.4s, v4.4h, #16       \n"
                    "shll   v5.4s, v5.4h, #16       \n"
                    "shll   v6.4s, v6.4h, #16       \n"
                    "shll   v7.4s, v7.4h, #16       \n"

                    "fmax   v18.4s, v4.4s, v5.4s    \n"
                    "fmax   v19.4s, v6.4s, v7.4s    \n"

                    "ld1    {v8.4h}, [%1]           \n"
                    "shll   v8.4s, v8.4h, #16       \n"

                    "fmax   v20.4s, v16.4s, v2.4s   \n"
                    "fmax   v21.4s, v17.4s, v4.4s   \n"

                    "prfm   pldl1keep, [%2, #256]   \n"
                    "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%2], #32 \n"

                    "shll   v0.4s, v0.4h, #16       \n"
                    "shll   v1.4s, v1.4h, #16       \n"
                    "shll   v2.4s, v2.4h, #16       \n"
                    "shll   v3.4s, v3.4h, #16       \n"

                    "fmax   v22.4s, v18.4s, v6.4s   \n"
                    "fmax   v23.4s, v19.4s, v8.4s   \n"

                    "prfm   pldl1keep, [%2, #256]   \n"
                    "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%2], #32 \n"

                    "shll   v4.4s, v4.4h, #16       \n"
                    "shll   v5.4s, v5.4h, #16       \n"
                    "shll   v6.4s, v6.4h, #16       \n"
                    "shll   v7.4s, v7.4h, #16       \n"

                    "fmax   v16.4s, v0.4s, v1.4s    \n"
                    "fmax   v17.4s, v2.4s, v3.4s    \n"

                    "fmax   v18.4s, v4.4s, v5.4s    \n"
                    "fmax   v19.4s, v6.4s, v7.4s    \n"

                    "ld1    {v8.4h}, [%2]           \n"
                    "shll   v8.4s, v8.4h, #16       \n"

                    "fmax   v24.4s, v16.4s, v2.4s   \n"
                    "fmax   v25.4s, v17.4s, v4.4s   \n"

                    "prfm   pldl1keep, [%3, #256]   \n"
                    "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%3], #32 \n"

                    "shll   v0.4s, v0.4h, #16       \n"
                    "shll   v1.4s, v1.4h, #16       \n"
                    "shll   v2.4s, v2.4h, #16       \n"
                    "shll   v3.4s, v3.4h, #16       \n"

                    "fmax   v26.4s, v18.4s, v6.4s   \n"
                    "fmax   v27.4s, v19.4s, v8.4s   \n"

                    "prfm   pldl1keep, [%3, #256]   \n"
                    "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%3], #32 \n"

                    "shll   v4.4s, v4.4h, #16       \n"
                    "shll   v5.4s, v5.4h, #16       \n"
                    "shll   v6.4s, v6.4h, #16       \n"
                    "shll   v7.4s, v7.4h, #16       \n"

                    "fmax   v16.4s, v0.4s, v1.4s    \n"
                    "fmax   v17.4s, v2.4s, v3.4s    \n"

                    "fmax   v18.4s, v4.4s, v5.4s    \n"
                    "fmax   v19.4s, v6.4s, v7.4s    \n"

                    "ld1    {v8.4h}, [%3]           \n"
                    "shll   v8.4s, v8.4h, #16       \n"

                    "fmax   v28.4s, v16.4s, v2.4s   \n"
                    "fmax   v29.4s, v17.4s, v4.4s   \n"
                    "fmax   v30.4s, v18.4s, v6.4s   \n"
                    "fmax   v31.4s, v19.4s, v8.4s   \n"

                    "fmax   v20.4s, v20.4s, v24.4s  \n"
                    "fmax   v21.4s, v21.4s, v25.4s  \n"
                    "fmax   v22.4s, v22.4s, v26.4s  \n"
                    "fmax   v23.4s, v23.4s, v27.4s  \n"

                    "fmax   v20.4s, v20.4s, v28.4s  \n"
                    "fmax   v21.4s, v21.4s, v29.4s  \n"
                    "fmax   v22.4s, v22.4s, v30.4s  \n"
                    "fmax   v23.4s, v23.4s, v31.4s  \n"

                    "shrn   v20.4h, v20.4s, #16     \n"
                    "shrn   v21.4h, v21.4s, #16     \n"
                    "shrn   v22.4h, v22.4s, #16     \n"
                    "shrn   v23.4h, v23.4s, #16     \n"

                    "st1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%0], #32 \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1),     // %2
                    "=r"(r2)      // %3
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else  // __aarch64__
                asm volatile(
                    "pld        [%1, #256]      \n"
                    "vld1.u16   {d4-d7}, [%1]!  \n"

                    "pld        [%2, #256]      \n"
                    "vld1.u16   {d12-d15}, [%2]! \n"

                    "vshll.u16  q0, d4, #16     \n"
                    "vshll.u16  q1, d5, #16     \n"
                    "vshll.u16  q2, d6, #16     \n"
                    "vshll.u16  q3, d7, #16     \n"

                    "vshll.u16  q4, d12, #16    \n"
                    "vshll.u16  q5, d13, #16    \n"
                    "vshll.u16  q6, d14, #16    \n"
                    "vshll.u16  q7, d15, #16    \n"

                    "vmax.f32   q0, q0, q4      \n"
                    "vmax.f32   q1, q1, q5      \n"

                    "pld        [%3, #256]      \n"
                    "vld1.u16   {d20-d23}, [%3]! \n"

                    "vshll.u16  q8, d20, #16    \n"
                    "vshll.u16  q9, d21, #16    \n"
                    "vshll.u16  q10, d22, #16   \n"
                    "vshll.u16  q11, d23, #16   \n"

                    "vmax.f32   q2, q2, q6      \n"
                    "vmax.f32   q3, q3, q7      \n"

                    "vmax.f32   q0, q0, q8      \n"
                    "vmax.f32   q1, q1, q9      \n"

                    "pld        [%1, #256]      \n"
                    "vld1.u16   {d12-d15}, [%1]! \n"

                    "vshll.u16  q4, d12, #16    \n"
                    "vshll.u16  q5, d13, #16    \n"
                    "vshll.u16  q6, d14, #16    \n"
                    "vshll.u16  q7, d15, #16    \n"

                    "vmax.f32   q2, q2, q10     \n"
                    "vmax.f32   q3, q3, q11     \n"

                    "pld        [%2, #256]      \n"
                    "vld1.u16   {d20-d23}, [%2]! \n"

                    "vshll.u16  q8, d20, #16    \n"
                    "vshll.u16  q9, d21, #16    \n"
                    "vshll.u16  q10, d22, #16   \n"
                    "vshll.u16  q11, d23, #16   \n"

                    "vmax.f32   q4, q4, q8      \n"
                    "vmax.f32   q5, q5, q9      \n"

                    "pld        [%3, #256]      \n"
                    "vld1.u16   {d28-d31}, [%3]! \n"

                    "vshll.u16  q12, d28, #16   \n"
                    "vshll.u16  q13, d29, #16   \n"
                    "vshll.u16  q14, d30, #16   \n"
                    "vshll.u16  q15, d31, #16   \n"

                    "vmax.f32   q6, q6, q10     \n"
                    "vmax.f32   q7, q7, q11     \n"

                    "vmax.f32   q4, q4, q12     \n"
                    "vmax.f32   q5, q5, q13     \n"

                    "vld1.u16   {d25}, [%1]     \n"
                    "vld1.u16   {d27}, [%2]     \n"
                    "vshll.u16  q12, d25, #16   \n"
                    "vshll.u16  q13, d27, #16   \n"

                    "vmax.f32   q6, q6, q14     \n"
                    "vmax.f32   q7, q7, q15     \n"

                    "vld1.u16   {d29}, [%3]     \n"
                    "vshll.u16  q14, d29, #16   \n"

                    "vmax.f32   q8, q12, q13    \n"
                    "vmax.f32   q8, q8, q14     \n"

                    "vmax.f32   q12, q0, q1     \n"
                    "vmax.f32   q13, q2, q3     \n"
                    "vmax.f32   q14, q4, q5     \n"
                    "vmax.f32   q15, q6, q7     \n"

                    "vmax.f32   q12, q12, q2    \n"
                    "vmax.f32   q13, q13, q4    \n"
                    "vmax.f32   q14, q14, q6    \n"
                    "vmax.f32   q15, q15, q8    \n"

                    "vshrn.u32  d24, q12, #16   \n"
                    "vshrn.u32  d25, q13, #16   \n"
                    "vshrn.u32  d26, q14, #16   \n"
                    "vshrn.u32  d27, q15, #16   \n"

                    "vst1.u16   {d24-d27}, [%0]! \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1),     // %2
                    "=r"(r2)      // %3
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
            }
            for (; j + 1 < outw; j += 2)
            {
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%1, #256]   \n"
                    "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n"

                    "prfm   pldl1keep, [%2, #256]   \n"
                    "ld1    {v4.4h, v5.4h, v6.4h, v7.4h}, [%2], #32 \n"

                    "shll   v0.4s, v0.4h, #16       \n"
                    "shll   v1.4s, v1.4h, #16       \n"
                    "shll   v2.4s, v2.4h, #16       \n"
                    "shll   v3.4s, v3.4h, #16       \n"

                    "shll   v4.4s, v4.4h, #16       \n"
                    "shll   v5.4s, v5.4h, #16       \n"
                    "shll   v6.4s, v6.4h, #16       \n"
                    "shll   v7.4s, v7.4h, #16       \n"

                    "fmax   v16.4s, v0.4s, v4.4s    \n"
                    "fmax   v17.4s, v1.4s, v5.4s    \n"

                    "prfm   pldl1keep, [%3, #256]   \n"
                    "ld1    {v20.4h, v21.4h, v22.4h, v23.4h}, [%3], #32 \n"

                    "shll   v20.4s, v20.4h, #16     \n"
                    "shll   v21.4s, v21.4h, #16     \n"
                    "shll   v22.4s, v22.4h, #16     \n"
                    "shll   v23.4s, v23.4h, #16     \n"

                    "fmax   v18.4s, v2.4s, v6.4s    \n"
                    "fmax   v19.4s, v3.4s, v7.4s    \n"

                    "ld1    {v0.4s}, [%1]           \n"

                    "fmax   v16.4s, v16.4s, v20.4s  \n"
                    "fmax   v17.4s, v17.4s, v21.4s  \n"

                    "ld1    {v1.4s}, [%2]           \n"

                    "fmax   v18.4s, v18.4s, v22.4s  \n"
                    "fmax   v19.4s, v19.4s, v23.4s  \n"

                    "ld1    {v2.4s}, [%3]           \n"

                    "fmax   v3.4s, v0.4s, v1.4s     \n"

                    "fmax   v20.4s, v16.4s, v17.4s  \n"
                    "fmax   v21.4s, v18.4s, v19.4s  \n"

                    "fmax   v3.4s, v3.4s, v2.4s     \n"

                    "fmax   v20.4s, v20.4s, v18.4s  \n"
                    "fmax   v21.4s, v21.4s, v3.4s   \n"

                    "shrn   v20.4h, v20.4s, #16     \n"
                    "shrn   v21.4h, v21.4s, #16     \n"

                    "st1    {v20.4h, v21.4h}, [%0], #16 \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1),     // %2
                    "=r"(r2)      // %3
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else  // __aarch64__
                asm volatile(
                    "pld        [%1, #256]      \n"
                    "vld1.u16   {d4-d7}, [%1]!  \n"

                    "pld        [%2, #256]      \n"
                    "vld1.u16   {d12-d15}, [%2]! \n"

                    "vshll.u16  q0, d4, #16     \n"
                    "vshll.u16  q1, d5, #16     \n"
                    "vshll.u16  q2, d6, #16     \n"
                    "vshll.u16  q3, d7, #16     \n"

                    "vshll.u16  q4, d12, #16    \n"
                    "vshll.u16  q5, d13, #16    \n"
                    "vshll.u16  q6, d14, #16    \n"
                    "vshll.u16  q7, d15, #16    \n"

                    "vmax.f32   q12, q0, q4     \n"
                    "vmax.f32   q13, q1, q5     \n"

                    "pld        [%3, #256]      \n"
                    "vld1.u16   {d20-d23}, [%3]! \n"

                    "vshll.u16  q8, d20, #16    \n"
                    "vshll.u16  q9, d21, #16    \n"
                    "vshll.u16  q10, d22, #16   \n"
                    "vshll.u16  q11, d23, #16   \n"

                    "vmax.f32   q14, q2, q6     \n"
                    "vmax.f32   q15, q3, q7     \n"

                    "vld1.u16   {d1}, [%1]      \n"
                    "vshll.u16  q0, d1, #16     \n"

                    "vmax.f32   q12, q12, q8    \n"
                    "vmax.f32   q13, q13, q9    \n"

                    "vld1.u16   {d3}, [%2]      \n"
                    "vshll.u16  q1, d3, #16     \n"

                    "vmax.f32   q14, q14, q10   \n"
                    "vmax.f32   q15, q15, q11   \n"

                    "vld1.u16   {d5}, [%3]      \n"
                    "vshll.u16  q2, d5, #16     \n"

                    "vmax.f32   q3, q0, q1      \n"

                    "vmax.f32   q4, q12, q13    \n"
                    "vmax.f32   q5, q14, q15    \n"

                    "vmax.f32   q3, q3, q2      \n"

                    "vmax.f32   q4, q4, q14     \n"
                    "vmax.f32   q5, q5, q3      \n"

                    "vshrn.u32  d8, q4, #16     \n"
                    "vshrn.u32  d9, q5, #16     \n"

                    "vst1.u16   {d8-d9}, [%0]!  \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1),     // %2
                    "=r"(r2)      // %3
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
            }
            for (; j < outw; j++)
            {
                float32x4_t _r00 = bfloat2float(vld1_u16(r0));
                float32x4_t _r01 = bfloat2float(vld1_u16(r0 + 4));
                float32x4_t _r02 = bfloat2float(vld1_u16(r0 + 8));
                float32x4_t _r10 = bfloat2float(vld1_u16(r1));
                float32x4_t _r11 = bfloat2float(vld1_u16(r1 + 4));
                float32x4_t _r12 = bfloat2float(vld1_u16(r1 + 8));
                float32x4_t _r20 = bfloat2float(vld1_u16(r2));
                float32x4_t _r21 = bfloat2float(vld1_u16(r2 + 4));
                float32x4_t _r22 = bfloat2float(vld1_u16(r2 + 8));

                float32x4_t _max0 = vmaxq_f32(vmaxq_f32(_r00, _r01), _r02);
                float32x4_t _max1 = vmaxq_f32(vmaxq_f32(_r10, _r11), _r12);
                float32x4_t _max2 = vmaxq_f32(vmaxq_f32(_r20, _r21), _r22);

                float32x4_t _max = vmaxq_f32(vmaxq_f32(_max0, _max1), _max2);

                vst1_u16(outptr, float2bfloat(_max));

                r0 += 8;
                r1 += 8;
                r2 += 8;
                outptr += 4;
            }

            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
        }
    }
}


================================================
FILE: src/layer/arm/pooling_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "pooling_arm.h"

#include <float.h>

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"

#include "cpu.h"

namespace ncnn {

#if NCNN_GNU_INLINE_ASM
#include "pooling_2x2.h"
#include "pooling_3x3.h"

#if __ARM_NEON
#include "pooling_2x2_pack4.h"
#include "pooling_3x3_pack4.h"
#include "pooling_2x2_pack4_bf16s.h"
#include "pooling_3x3_pack4_bf16s.h"
#endif
#endif // NCNN_GNU_INLINE_ASM

// Advertise which storage layouts this ARM pooling layer accepts.
// Each capability is gated by the corresponding build-time feature macro.
Pooling_arm::Pooling_arm()
{
#if __ARM_NEON
    // NEON builds can consume packed blobs.
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage is decided at runtime by querying the CPU.
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82

#if NCNN_BF16
    // bf16 storage is handled by forward_bf16s().
    support_bf16_storage = true;
#endif // NCNN_BF16
}

// Adjust the advertised storage capabilities once the layer parameters are known.
// The Option argument is unused. Always returns 0.
int Pooling_arm::create_pipeline(const Option& /*opt*/)
{
    // Nothing to change unless adaptive pooling was requested.
    if (!adaptive_pooling)
        return 0;

    // forward() hands adaptive pooling to the generic Pooling::forward, so all
    // packed and reduced-precision storage capabilities are switched off.
    support_packing = false;

    support_bf16_storage = false;
    support_fp16_storage = false;
    support_int8_storage = false;
    support_tensor_storage = false;

    return 0;
}

// ARM entry point for pooling.
//
// Dispatch order:
//   1. adaptive pooling            -> generic Pooling::forward
//   2. 16-bit elements + fp16 opts -> forward_fp16sa / forward_fp16s (NCNN_ARM82 builds)
//   3. 16-bit elements + bf16 opt  -> forward_bf16s (NCNN_BF16 builds)
//   4. elempack == 4 (NEON builds) -> fp32 code below, one float32x4_t per element
//   5. otherwise                   -> inline-asm 2x2s2 / 3x3s2 max kernels when the
//                                     configuration matches, else Pooling::forward
//
// bottom_blob  input blob
// top_blob     output blob, created here (or by the function delegated to)
// opt          supplies the fp16/bf16 switches, thread count and blob allocator
//
// Returns 0 on success, -100 when the padded input or the output blob is empty
// after creation, or whatever the delegated forward function returns.
int Pooling_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // Adaptive pooling has no ARM-specific implementation in this function.
    if (adaptive_pooling)
    {
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    // 16-bit elements with fp16 storage enabled: pick the fp16-arithmetic
    // variant when requested, otherwise the storage-only variant.
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);
        else
            return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

#if NCNN_BF16
    // Remaining 16-bit inputs are treated as bf16 when bf16 storage is enabled.
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
#endif

    // max value in NxN window
    // avg value in NxN window

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

#if __ARM_NEON
    //     NCNN_LOGE("Pooling     input %d x %d  pad = %d %d %d %d  ksize=%d %d  stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);

    // Packed path: every element is 4 floats, handled as one NEON vector.
    if (elempack == 4)
    {
        if (global_pooling)
        {
            // Output is a 1-D blob with one packed element per channel.
            top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            // Number of packed elements reduced per channel.
            int size = w * h;

            if (pooling_type == PoolMethod_MAX)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = bottom_blob.channel(q);

                    // Seed with the first element; the loop visits it again at
                    // i == 0, which is harmless for a max reduction.
                    float32x4_t _max = vld1q_f32(ptr);
                    for (int i = 0; i < size; i++)
                    {
                        float32x4_t _val = vld1q_f32(ptr);
                        _max = vmaxq_f32(_max, _val);
                        ptr += 4;
                    }

                    float* outptr = top_blob;
                    vst1q_f32(outptr + q * 4, _max);
                }
            }
            else if (pooling_type == PoolMethod_AVE)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = bottom_blob.channel(q);

                    float32x4_t _sum = vdupq_n_f32(0.f);
                    for (int i = 0; i < size; i++)
                    {
                        float32x4_t _val = vld1q_f32(ptr);
                        _sum = vaddq_f32(_sum, _val);
                        ptr += 4;
                    }

                    // Average = sum * (1 / size), computed lane-wise.
                    float32x4_t _inv_size = vdupq_n_f32(1.f / size);
                    float32x4_t _avg = vmulq_f32(_sum, _inv_size);

                    float* outptr = top_blob;
                    vst1q_f32(outptr + q * 4, _avg);
                }
            }

            return 0;
        }

        // Windowed pooling works on a padded copy of the input.
        Mat bottom_blob_bordered;
        make_padding(bottom_blob, bottom_blob_bordered, opt);
        if (bottom_blob_bordered.empty())
            return -100;

        // From here on w/h refer to the padded blob.
        w = bottom_blob_bordered.w;
        h = bottom_blob_bordered.h;

        int outw = (w - kernel_w) / stride_w + 1;
        int outh = (h - kernel_h) / stride_h + 1;

        top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int maxk = kernel_w * kernel_h;

        // kernel offsets
        // space_ofs[k] is the offset, in packed elements, of the k-th kernel tap
        // relative to the window's top-left element. `gap` jumps from the end of
        // one kernel row to the start of the next, i.e. this presumes a row
        // pitch of w elements inside a channel.
        std::vector<int> _space_ofs(maxk);
        int* space_ofs = &_space_ofs[0];
        {
            int p1 = 0;
            int p2 = 0;
            int gap = w - kernel_w;
            for (int i = 0; i < kernel_h; i++)
            {
                for (int j = 0; j < kernel_w; j++)
                {
                    space_ofs[p1] = p2;
                    p1++;
                    p2++;
                }
                p2 += gap;
            }
        }

        if (pooling_type == PoolMethod_MAX)
        {
#if NCNN_GNU_INLINE_ASM
            // Dedicated kernels for the two stride-2 configurations below.
            if (kernel_w == 2 && kernel_h == 2 && stride_w == 2 && stride_h == 2)
            {
                pooling2x2s2_max_pack4_neon(bottom_blob_bordered, top_blob, opt);

                return 0;
            }

            if (kernel_w == 3 && kernel_h == 3 && stride_w == 2 && stride_h == 2)
            {
                pooling3x3s2_max_pack4_neon(bottom_blob_bordered, top_blob, opt);

                return 0;
            }
#endif // NCNN_GNU_INLINE_ASM

            // Generic max pooling for any other kernel/stride combination.
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat m = bottom_blob_bordered.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        // Top-left element of the window for output (i, j).
                        const float* sptr = m.row(i * stride_h) + j * stride_w * 4;

                        // Seeded with tap 0, which the loop below loads again.
                        float32x4_t _max = vld1q_f32(sptr);

                        for (int k = 0; k < maxk; k++)
                        {
                            float32x4_t _val = vld1q_f32(sptr + space_ofs[k] * 4);
                            _max = vmaxq_f32(_max, _val);
                        }

                        vst1q_f32(outptr + j * 4, _max);
                    }

                    outptr += outw * 4;
                }
            }
        }
        else if (pooling_type == PoolMethod_AVE)
        {
            if (avgpool_count_include_pad == 0)
            {
                // Padding is excluded from both the sum and the divisor.
                // wtailpad/htailpad are whatever the padded blob has beyond
                // pad_left+pad_right / pad_top+pad_bottom; only computed for
                // pad_mode 0 (presumably extra tail padding from make_padding).
                int wtailpad = 0;
                int htailpad = 0;

                if (pad_mode == 0) // full padding
                {
                    wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
                    htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    float* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            float32x4_t _sum = vdupq_n_f32(0.f);
                            // Number of taps that fall inside the unpadded region.
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                // Row is in the top padding: skip it.
                                if (sy < pad_top)
                                    continue;

                                // Row is in the bottom padding: all later rows are too.
                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    // Column is in the left padding: skip it.
                                    if (sx < pad_left)
                                        continue;

                                    // Column is in the right padding: stop this row.
                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    float32x4_t _val = vld1q_f32(m.row(sy) + sx * 4);
                                    _sum = vaddq_f32(_sum, _val);
                                    area += 1;
                                }
                            }

                            // NOTE(review): area stays 0 if a window lies entirely in
                            // padding, making 1.f / area a division by zero — confirm
                            // such configurations cannot reach this point.
                            float32x4_t _inv_area = vdupq_n_f32(1.f / area);
                            float32x4_t _avg = vmulq_f32(_sum, _inv_area);
                            vst1q_f32(outptr + j * 4, _avg);
                        }

                        outptr += outw * 4;
                    }
                }
            }
            else // if (avgpool_count_include_pad == 1)
            {
                // Padding counts: every window is divided by the full kernel area.
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    float* outptr = top_blob.channel(q);

                    float32x4_t _inv_maxk = vdupq_n_f32(1.f / maxk);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const float* sptr = m.row(i * stride_h) + j * stride_w * 4;

                            float32x4_t _sum = vdupq_n_f32(0.f);

                            for (int k = 0; k < maxk; k++)
                            {
                                float32x4_t _val = vld1q_f32(sptr + space_ofs[k] * 4);
                                _sum = vaddq_f32(_sum, _val);
                            }

                            float32x4_t _avg = vmulq_f32(_sum, _inv_maxk);
                            vst1q_f32(outptr + j * 4, _avg);
                        }

                        outptr += outw * 4;
                    }
                }
            }
        }

        return 0;
    }
#endif // __ARM_NEON

#if NCNN_GNU_INLINE_ASM
    // Non-packed path. Only square-kernel, stride-2, non-global max pooling with
    // kernel size 2 or 3 is handled here; everything else uses the generic
    // implementation.
    if (kernel_w != kernel_h || stride_w != stride_h)
    {
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

    const int kernel_size = kernel_w;
    const int stride = stride_w;

    if (pooling_type != PoolMethod_MAX || stride != 2 || global_pooling == 1)
    {
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

    if (kernel_size != 2 && kernel_size != 3)
    {
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    // w/h now describe the padded blob.
    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_w) / stride_w + 1;
    int outh = (h - kernel_h) / stride_h + 1;

    // Created without an elempack argument, unlike the packed path above.
    top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // kernel_size is 2 or 3 here, so exactly one of these kernels runs.
    if (kernel_size == 2)
        pooling2x2s2_max_neon(bottom_blob_bordered, top_blob, opt);
    if (kernel_size == 3)
        pooling3x3s2_max_neon(bottom_blob_bordered, top_blob, opt);

    return 0;
#else  // NCNN_GNU_INLINE_ASM
    // No inline-asm kernels in this build: use the generic implementation.
    return Pooling::forward(bottom_blob, top_blob, opt);
#endif // NCNN_GNU_INLINE_ASM
}

#if NCNN_BF16
// Pooling for blobs stored as bfloat16 (elements read/written as unsigned short).
//
// Values are widened to fp32 (bfloat2float for vectors, bfloat16_to_float32 for
// scalars), reduced in fp32, and narrowed back to bf16 on store.
// elempack == 4 is handled with NEON (NEON builds only) and elempack == 1 with
// scalar code.
// NOTE(review): no other elempack value is handled; in that case top_blob is
// created but never written — presumably callers never pass one, confirm.
//
// bottom_blob  input blob
// top_blob     output blob, created here
// opt          supplies the thread count and blob allocator
//
// Returns 0 on success, -100 when the padded input or the output blob is empty
// after creation.
int Pooling_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // max value in NxN window
    // avg value in NxN window

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Pooling     input %d x %d  pad = %d %d %d %d  ksize=%d %d  stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);

    if (global_pooling)
    {
        // Output is a 1-D blob with one (possibly packed) element per channel.
        top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // Number of elements reduced per channel.
        int size = w * h;

        if (pooling_type == PoolMethod_MAX)
        {
#if __ARM_NEON
            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const unsigned short* ptr = bottom_blob.channel(q);

                    // Start from the lowest finite float so any input wins.
                    float32x4_t _max = vdupq_n_f32(-FLT_MAX);
                    for (int i = 0; i < size; i++)
                    {
                        float32x4_t _val = bfloat2float(vld1_u16(ptr));
                        _max = vmaxq_f32(_max, _val);
                        ptr += 4;
                    }

                    unsigned short* outptr = top_blob;
                    vst1_u16(outptr + q * 4, float2bfloat(_max));
                }
            }
#endif // __ARM_NEON

            if (elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const unsigned short* ptr = bottom_blob.channel(q);

                    float max = -FLT_MAX;
                    for (int i = 0; i < size; i++)
                    {
                        max = std::max(max, bfloat16_to_float32(ptr[i]));
                    }

                    unsigned short* outptr = top_blob;
                    outptr[q] = float32_to_bfloat16(max);
                }
            }
        }

        if (pooling_type == PoolMethod_AVE)
        {
#if __ARM_NEON
            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const unsigned short* ptr = bottom_blob.channel(q);

                    float32x4_t _sum = vdupq_n_f32(0.f);
                    for (int i = 0; i < size; i++)
                    {
                        float32x4_t _val = bfloat2float(vld1_u16(ptr));
                        _sum = vaddq_f32(_sum, _val);
                        ptr += 4;
                    }

                    // Average = sum * (1 / size), computed lane-wise in fp32.
                    float32x4_t _inv_size = vdupq_n_f32(1.f / size);
                    float32x4_t _avg = vmulq_f32(_sum, _inv_size);

                    unsigned short* outptr = top_blob;
                    vst1_u16(outptr + q * 4, float2bfloat(_avg));
                }
            }
#endif // __ARM_NEON

            if (elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const unsigned short* ptr = bottom_blob.channel(q);

                    float sum = 0.f;
                    for (int i = 0; i < size; i++)
                    {
                        sum += bfloat16_to_float32(ptr[i]);
                    }

                    unsigned short* outptr = top_blob;
                    outptr[q] = float32_to_bfloat16(sum / size);
                }
            }
        }

        return 0;
    }

    // Windowed pooling works on a padded copy of the input.
    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    // From here on w/h refer to the padded blob.
    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_w) / stride_w + 1;
    int outh = (h - kernel_h) / stride_h + 1;

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k] is the offset, in elements, of the k-th kernel tap relative
    // to the window's top-left element. `gap` jumps from the end of one kernel
    // row to the start of the next, i.e. this presumes a row pitch of w
    // elements inside a channel.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w - kernel_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2++;
            }
            p2 += gap;
        }
    }

    if (pooling_type == PoolMethod_MAX)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
#if NCNN_GNU_INLINE_ASM
            // Dedicated kernels for the two stride-2 configurations below.
            if (kernel_w == 2 && kernel_h == 2 && stride_w == 2 && stride_h == 2)
            {
                pooling2x2s2_max_pack4_bf16s_neon(bottom_blob_bordered, top_blob, opt);

                return 0;
            }

            if (kernel_w == 3 && kernel_h == 3 && stride_w == 2 && stride_h == 2)
            {
                pooling3x3s2_max_pack4_bf16s_neon(bottom_blob_bordered, top_blob, opt);

                return 0;
            }
#endif // NCNN_GNU_INLINE_ASM

            // Generic packed max pooling for any other kernel/stride combination.
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat m = bottom_blob_bordered.channel(q);
                unsigned short* outptr = top_blob.channel(q);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        // Top-left element of the window for output (i, j).
                        const unsigned short* sptr = m.row<const unsigned short>(i * stride_h) + j * stride_w * 4;

                        float32x4_t _max = vdupq_n_f32(-FLT_MAX);

                        for (int k = 0; k < maxk; k++)
                        {
                            float32x4_t _val = bfloat2float(vld1_u16(sptr + space_ofs[k] * 4));
                            _max = vmaxq_f32(_max, _val);
                        }

                        vst1_u16(outptr + j * 4, float2bfloat(_max));
                    }

                    outptr += outw * 4;
                }
            }
        }
#endif // __ARM_NEON

        if (elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat m = bottom_blob_bordered.channel(q);
                unsigned short* outptr = top_blob.channel(q);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        const unsigned short* sptr = m.row<const unsigned short>(i * stride_h) + j * stride_w;

                        float max = -FLT_MAX;

                        for (int k = 0; k < maxk; k++)
                        {
                            float val = bfloat16_to_float32(sptr[space_ofs[k]]);
                            max = std::max(max, val);
                        }

                        outptr[j] = float32_to_bfloat16(max);
                    }

                    outptr += outw;
                }
            }
        }
    }

    if (pooling_type == PoolMethod_AVE)
    {
        if (avgpool_count_include_pad == 0)
        {
            // Padding is excluded from both the sum and the divisor.
            // wtailpad/htailpad are whatever the padded blob has beyond
            // pad_left+pad_right / pad_top+pad_bottom; only computed for
            // pad_mode 0 (presumably extra tail padding from make_padding).
            int wtailpad = 0;
            int htailpad = 0;

            if (pad_mode == 0) // full padding
            {
                wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
                htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
            }

#if __ARM_NEON
            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    unsigned short* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            float32x4_t _sum = vdupq_n_f32(0.f);
                            // Number of taps that fall inside the unpadded region.
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                // Row is in the top padding: skip it.
                                if (sy < pad_top)
                                    continue;

                                // Row is in the bottom padding: all later rows are too.
                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    // Column is in the left padding: skip it.
                                    if (sx < pad_left)
                                        continue;

                                    // Column is in the right padding: stop this row.
                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    float32x4_t _val = bfloat2float(vld1_u16(m.row<const unsigned short>(sy) + sx * 4));
                                    _sum = vaddq_f32(_sum, _val);
                                    area += 1;
                                }
                            }

                            // NOTE(review): area stays 0 if a window lies entirely in
                            // padding, making 1.f / area a division by zero — confirm
                            // such configurations cannot reach this point.
                            float32x4_t _inv_area = vdupq_n_f32(1.f / area);
                            float32x4_t _avg = vmulq_f32(_sum, _inv_area);
                            vst1_u16(outptr + j * 4, float2bfloat(_avg));
                        }

                        outptr += outw * 4;
                    }
                }
            }
#endif // __ARM_NEON

            if (elempack == 1)
            {
                // Scalar counterpart of the packed loop above.
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    unsigned short* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            float sum = 0;
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                if (sy < pad_top)
                                    continue;

                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    float val = bfloat16_to_float32(m.row<const unsigned short>(sy)[sx]);
                                    sum += val;
                                    area += 1;
                                }
                            }

                            // Same area == 0 caveat as in the packed loop applies here.
                            outptr[j] = float32_to_bfloat16(sum / area);
                        }

                        outptr += outw;
                    }
                }
            }
        }

        if (avgpool_count_include_pad == 1)
        {
            // Padding counts: every window is divided by the full kernel area.
#if __ARM_NEON
            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    unsigned short* outptr = top_blob.channel(q);

                    float32x4_t _inv_maxk = vdupq_n_f32(1.f / maxk);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const unsigned short* sptr = m.row<const unsigned short>(i * stride_h) + j * stride_w * 4;

                            float32x4_t _sum = vdupq_n_f32(0.f);

                            for (int k = 0; k < maxk; k++)
                            {
                                float32x4_t _val = bfloat2float(vld1_u16(sptr + space_ofs[k] * 4));
                                _sum = vaddq_f32(_sum, _val);
                            }

                            float32x4_t _avg = vmulq_f32(_sum, _inv_maxk);
                            vst1_u16(outptr + j * 4, float2bfloat(_avg));
                        }

                        outptr += outw * 4;
                    }
                }
            }
#endif // __ARM_NEON

            if (elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    unsigned short* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const unsigned short* sptr = m.row<const unsigned short>(i * stride_h) + j * stride_w;

                            float sum = 0.f;

                            for (int k = 0; k < maxk; k++)
                            {
                                float val = bfloat16_to_float32(sptr[space_ofs[k]]);
                                sum += val;
                            }

                            outptr[j] = float32_to_bfloat16(sum / maxk);
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/pooling_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_POOLING_ARM_H
#define LAYER_POOLING_ARM_H

#include "pooling.h"

namespace ncnn {

// ARM specialization of the Pooling layer.
// Overrides create_pipeline()/forward() and declares reduced-precision forward
// helpers that are only compiled in when the matching build macro is enabled.
class Pooling_arm : public Pooling
{
public:
    Pooling_arm();

    virtual int create_pipeline(const Option& opt);
    // Pools bottom_blob into top_blob.
    // NOTE(review): presumably selects one of the protected helpers below based
    // on opt and the blob element width - confirm against the implementation file.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 storage: max is taken directly in fp16, averages are accumulated in
    // fp32 and narrowed back to fp16 (see pooling_arm_asimdhp.cpp).
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    // fp16 storage + fp16 arithmetic: sliding-window averages are accumulated in
    // fp16; max pooling and global pooling are forwarded to forward_fp16s.
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bfloat16 storage variant.
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_POOLING_ARM_H


================================================
FILE: src/layer/arm/pooling_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "pooling_arm.h"

#include <float.h>

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Pooling forward pass for blobs whose elements are stored as fp16.
//
// Max pooling is evaluated directly in fp16 (elempack 8, 4 and 1).
// Average pooling widens every value to fp32, accumulates in fp32 and narrows
// the final average back to fp16.
//
// bottom_blob  input blob, read as __fp16 data
// top_blob     output blob, created here with the input's elemsize/elempack
// opt          supplies the blob allocator and the OpenMP thread count
//
// Returns 0 on success, -100 when the output blob or the padded input blob is
// empty after creation.
int Pooling_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // max value in NxN window
    // avg value in NxN window

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Pooling     input %d x %d  pad = %d %d %d %d  ksize=%d %d  stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);

    // Global pooling: every channel is reduced over its whole w*h plane into a
    // single (packed) element, so the output is a 1-D blob of `channels` entries.
    if (global_pooling)
    {
        top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        int size = w * h;

        if (pooling_type == PoolMethod_MAX)
        {
            if (elempack == 8)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    // running max is seeded with -FLT_MAX narrowed to fp16
                    float16x8_t _max = vdupq_n_f16((__fp16)-FLT_MAX);
                    for (int i = 0; i < size; i++)
                    {
                        float16x8_t _val = vld1q_f16(ptr);
                        _max = vmaxq_f16(_max, _val);
                        ptr += 8;
                    }

                    __fp16* outptr = top_blob;
                    vst1q_f16(outptr + q * 8, _max);
                }
            }

            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    float16x4_t _max = vdup_n_f16((__fp16)-FLT_MAX);
                    for (int i = 0; i < size; i++)
                    {
                        float16x4_t _val = vld1_f16(ptr);
                        _max = vmax_f16(_max, _val);
                        ptr += 4;
                    }

                    __fp16* outptr = top_blob;
                    vst1_f16(outptr + q * 4, _max);
                }
            }

            if (elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    __fp16 max = (__fp16)-FLT_MAX;
                    for (int i = 0; i < size; i++)
                    {
                        max = std::max(max, ptr[i]);
                    }

                    __fp16* outptr = top_blob;
                    outptr[q] = max;
                }
            }
        }

        if (pooling_type == PoolMethod_AVE)
        {
            if (elempack == 8)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    // the 8 fp16 lanes are split into two fp32x4 accumulators
                    float32x4_t _sum0 = vdupq_n_f32(0.f);
                    float32x4_t _sum1 = vdupq_n_f32(0.f);
                    for (int i = 0; i < size; i++)
                    {
                        float16x8_t _val = vld1q_f16(ptr);
                        _sum0 = vaddq_f32(_sum0, vcvt_f32_f16(vget_low_f16(_val)));
                        _sum1 = vaddq_f32(_sum1, vcvt_f32_f16(vget_high_f16(_val)));
                        ptr += 8;
                    }

                    // multiply by the reciprocal instead of dividing per lane
                    float32x4_t _inv_size = vdupq_n_f32(1.f / size);
                    float32x4_t _avg0 = vmulq_f32(_sum0, _inv_size);
                    float32x4_t _avg1 = vmulq_f32(_sum1, _inv_size);

                    __fp16* outptr = top_blob;
                    vst1q_f16(outptr + q * 8, vcombine_f16(vcvt_f16_f32(_avg0), vcvt_f16_f32(_avg1)));
                }
            }

            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    float32x4_t _sum = vdupq_n_f32(0.f);
                    for (int i = 0; i < size; i++)
                    {
                        float32x4_t _val = vcvt_f32_f16(vld1_f16(ptr));
                        _sum = vaddq_f32(_sum, _val);
                        ptr += 4;
                    }

                    float32x4_t _inv_size = vdupq_n_f32(1.f / size);
                    float32x4_t _avg = vmulq_f32(_sum, _inv_size);

                    __fp16* outptr = top_blob;
                    vst1_f16(outptr + q * 4, vcvt_f16_f32(_avg));
                }
            }

            if (elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const __fp16* ptr = bottom_blob.channel(q);

                    float sum = 0.f;
                    for (int i = 0; i < size; i++)
                    {
                        sum += (float)ptr[i];
                    }

                    __fp16* outptr = top_blob;
                    outptr[q] = (__fp16)(sum / size);
                }
            }
        }

        return 0;
    }

    // Sliding-window pooling operates on a padded copy of the input.
    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    // from here on w/h describe the padded input
    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_w) / stride_w + 1;
    int outh = (h - kernel_h) / stride_h + 1;

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k] is the offset (counted in packed elements) of kernel tap k
    // relative to the window's first tap; `gap` jumps from the end of one
    // kernel row to the start of the next row of the padded input.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w - kernel_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2++;
            }
            p2 += gap;
        }
    }

    if (pooling_type == PoolMethod_MAX)
    {
        if (elempack == 8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat m = bottom_blob_bordered.channel(q);
                __fp16* outptr = top_blob.channel(q);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        // first tap of the window for output (i, j)
                        const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 8;

                        float16x8_t _max = vdupq_n_f16((__fp16)-FLT_MAX);

                        for (int k = 0; k < maxk; k++)
                        {
                            float16x8_t _val = vld1q_f16(sptr + space_ofs[k] * 8);
                            _max = vmaxq_f16(_max, _val);
                        }

                        vst1q_f16(outptr + j * 8, _max);
                    }

                    outptr += outw * 8;
                }
            }
        }

        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat m = bottom_blob_bordered.channel(q);
                __fp16* outptr = top_blob.channel(q);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 4;

                        float16x4_t _max = vdup_n_f16((__fp16)-FLT_MAX);

                        for (int k = 0; k < maxk; k++)
                        {
                            float16x4_t _val = vld1_f16(sptr + space_ofs[k] * 4);
                            _max = vmax_f16(_max, _val);
                        }

                        vst1_f16(outptr + j * 4, _max);
                    }

                    outptr += outw * 4;
                }
            }
        }

        if (elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat m = bottom_blob_bordered.channel(q);
                __fp16* outptr = top_blob.channel(q);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;

                        __fp16 max = (__fp16)-FLT_MAX;

                        for (int k = 0; k < maxk; k++)
                        {
                            __fp16 val = sptr[space_ofs[k]];
                            max = std::max(max, val);
                        }

                        outptr[j] = max;
                    }

                    outptr += outw;
                }
            }
        }
    }

    // NOTE(review): the sliding-window average paths below only handle
    // elempack 4 and 1; an elempack == 8 input would leave top_blob unwritten.
    // forward_fp16sa has its own pack-8 average branches, so presumably pack-8
    // blobs only reach average pooling through that function - confirm against
    // the dispatcher.
    if (pooling_type == PoolMethod_AVE)
    {
        // Exclude-padding mode: only taps that fall inside the original input
        // region are summed, and the divisor is the number of such taps.
        if (avgpool_count_include_pad == 0)
        {
            // extra trailing columns/rows present in the padded blob beyond the
            // explicitly configured pad_left/right/top/bottom
            int wtailpad = 0;
            int htailpad = 0;

            if (pad_mode == 0) // full padding
            {
                wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
                htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
            }

            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            float32x4_t _sum = vdupq_n_f32(0.f);
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                // rows inside the top padding are skipped
                                if (sy < pad_top)
                                    continue;

                                // once in the bottom padding no later row can be valid
                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    float32x4_t _val = vcvt_f32_f16(vld1_f16(m.row<const __fp16>(sy) + sx * 4));
                                    _sum = vaddq_f32(_sum, _val);
                                    area += 1;
                                }
                            }

                            // NOTE(review): area stays 0 if the whole window lies in
                            // padding, making this a division by zero - confirm that
                            // configuration cannot occur.
                            float32x4_t _inv_area = vdupq_n_f32(1.f / area);
                            float32x4_t _avg = vmulq_f32(_sum, _inv_area);
                            vst1_f16(outptr + j * 4, vcvt_f16_f32(_avg));
                        }

                        outptr += outw * 4;
                    }
                }
            }

            if (elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            float sum = 0.f;
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                if (sy < pad_top)
                                    continue;

                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    float val = (float)(m.row<const __fp16>(sy)[sx]);
                                    sum += val;
                                    area += 1;
                                }
                            }

                            outptr[j] = (__fp16)(sum / area);
                        }

                        outptr += outw;
                    }
                }
            }
        }

        // Include-padding mode: every tap of the window counts, so the divisor
        // is always kernel_w * kernel_h and the precomputed offsets can be used.
        if (avgpool_count_include_pad == 1)
        {
            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    float32x4_t _inv_maxk = vdupq_n_f32(1.f / maxk);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 4;

                            float32x4_t _sum = vdupq_n_f32(0.f);

                            for (int k = 0; k < maxk; k++)
                            {
                                float32x4_t _val = vcvt_f32_f16(vld1_f16(sptr + space_ofs[k] * 4));
                                _sum = vaddq_f32(_sum, _val);
                            }

                            float32x4_t _avg = vmulq_f32(_sum, _inv_maxk);
                            vst1_f16(outptr + j * 4, vcvt_f16_f32(_avg));
                        }

                        outptr += outw * 4;
                    }
                }
            }

            if (elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;

                            float sum = 0.f;

                            for (int k = 0; k < maxk; k++)
                            {
                                float val = (float)(sptr[space_ofs[k]]);
                                sum += val;
                            }

                            outptr[j] = (__fp16)(sum / maxk);
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }

    return 0;
}

// Pooling forward pass for fp16-stored blobs using fp16 arithmetic.
//
// Max pooling and global pooling are forwarded unchanged to forward_fp16s.
// Sliding-window average pooling is handled here for elempack 8, 4 and 1, with
// both the accumulation and the final scaling carried out in fp16.
//
// bottom_blob  input blob, read as __fp16 data
// top_blob     output blob, created here with the input's elemsize/elempack
// opt          supplies the blob allocator and the OpenMP thread count
//
// Returns 0 on success, -100 when the padded input blob or the output blob is
// empty after creation (or whatever forward_fp16s returns when delegating).
int Pooling_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // max value in NxN window
    // avg value in NxN window

    // only sliding-window average pooling has a dedicated fp16-arithmetic path
    if (pooling_type == PoolMethod_MAX || global_pooling)
    {
        return forward_fp16s(bottom_blob, top_blob, opt);
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Pooling     input %d x %d  pad = %d %d %d %d  ksize=%d %d  stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    // from here on w/h describe the padded input
    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_w) / stride_w + 1;
    int outh = (h - kernel_h) / stride_h + 1;

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k] is the offset (counted in packed elements) of kernel tap k
    // relative to the window's first tap; `gap` jumps from the end of one
    // kernel row to the start of the next row of the padded input.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w - kernel_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2++;
            }
            p2 += gap;
        }
    }

    // always true at this point: the MAX case returned early above
    if (pooling_type == PoolMethod_AVE)
    {
        // Exclude-padding mode: only taps that fall inside the original input
        // region are summed, and the divisor is the number of such taps.
        if (avgpool_count_include_pad == 0)
        {
            // extra trailing columns/rows present in the padded blob beyond the
            // explicitly configured pad_left/right/top/bottom
            int wtailpad = 0;
            int htailpad = 0;

            if (pad_mode == 0) // full padding
            {
                wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
                htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
            }

            if (elempack == 8)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            float16x8_t _sum = vdupq_n_f16((__fp16)0.f);
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                // rows inside the top padding are skipped
                                if (sy < pad_top)
                                    continue;

                                // once in the bottom padding no later row can be valid
                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    float16x8_t _val = vld1q_f16(m.row<const __fp16>(sy) + sx * 8);
                                    _sum = vaddq_f16(_sum, _val);
                                    area += 1;
                                }
                            }

                            // NOTE(review): area stays 0 if the whole window lies in
                            // padding, making 1.f / area a division by zero - confirm
                            // that configuration cannot occur.
                            // The MSVC branch builds the fp16 reciprocal through a
                            // vector fp32->fp16 conversion instead of a scalar __fp16
                            // cast (presumably a compiler limitation - confirm).
#if defined(_MSC_VER) && !defined(__clang__)
                            float16x4_t _inv_area0 = vcvt_f16_f32(vdupq_n_f32(1.f / area));
                            float16x8_t _inv_area = vcombine_f16(_inv_area0, _inv_area0);
#else
                            float16x8_t _inv_area = vdupq_n_f16((__fp16)(1.f / area));
#endif
                            float16x8_t _avg = vmulq_f16(_sum, _inv_area);
                            vst1q_f16(outptr + j * 8, _avg);
                        }

                        outptr += outw * 8;
                    }
                }
            }

            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            float16x4_t _sum = vdup_n_f16((__fp16)0.f);
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                if (sy < pad_top)
                                    continue;

                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    float16x4_t _val = vld1_f16(m.row<const __fp16>(sy) + sx * 4);
                                    _sum = vadd_f16(_sum, _val);
                                    area += 1;
                                }
                            }

#if defined(_MSC_VER) && !defined(__clang__)
                            float16x4_t _inv_area = vcvt_f16_f32(vdupq_n_f32(1.f / area));
#else
                            float16x4_t _inv_area = vdup_n_f16((__fp16)(1.f / area));
#endif
                            float16x4_t _avg = vmul_f16(_sum, _inv_area);
                            vst1_f16(outptr + j * 4, _avg);
                        }

                        outptr += outw * 4;
                    }
                }
            }

            if (elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            __fp16 sum = (__fp16)0.f;
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                if (sy < pad_top)
                                    continue;

                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    if (sx < pad_left)
                                        continue;

                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    __fp16 val = m.row<const __fp16>(sy)[sx];
                                    sum += val;
                                    area += 1;
                                }
                            }

                            // scalar path divides in fp16 rather than multiplying by a reciprocal
                            outptr[j] = sum / (__fp16)area;
                        }

                        outptr += outw;
                    }
                }
            }
        }

        // Include-padding mode: every tap of the window counts, so the divisor
        // is always kernel_w * kernel_h and the precomputed offsets can be used.
        if (avgpool_count_include_pad == 1)
        {
            if (elempack == 8)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    // reciprocal of the window size, computed once per channel
#if defined(_MSC_VER) && !defined(__clang__)
                    float16x4_t _inv_maxk0 = vcvt_f16_f32(vdupq_n_f32(1.f / maxk));
                    float16x8_t _inv_maxk = vcombine_f16(_inv_maxk0, _inv_maxk0);
#else
                    float16x8_t _inv_maxk = vdupq_n_f16((__fp16)(1.f / maxk));
#endif

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            // first tap of the window for output (i, j)
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 8;

                            float16x8_t _sum = vdupq_n_f16((__fp16)0.f);

                            for (int k = 0; k < maxk; k++)
                            {
                                float16x8_t _val = vld1q_f16(sptr + space_ofs[k] * 8);
                                _sum = vaddq_f16(_sum, _val);
                            }

                            float16x8_t _avg = vmulq_f16(_sum, _inv_maxk);
                            vst1q_f16(outptr + j * 8, _avg);
                        }

                        outptr += outw * 8;
                    }
                }
            }

            if (elempack == 4)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

#if defined(_MSC_VER) && !defined(__clang__)
                    float16x4_t _inv_maxk = vcvt_f16_f32(vdupq_n_f32(1.f / maxk));
#else
                    float16x4_t _inv_maxk = vdup_n_f16((__fp16)(1.f / maxk));
#endif

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w * 4;

                            float16x4_t _sum = vdup_n_f16((__fp16)0.f);

                            for (int k = 0; k < maxk; k++)
                            {
                                float16x4_t _val = vld1_f16(sptr + space_ofs[k] * 4);
                                _sum = vadd_f16(_sum, _val);
                            }

                            float16x4_t _avg = vmul_f16(_sum, _inv_maxk);
                            vst1_f16(outptr + j * 4, _avg);
                        }

                        outptr += outw * 4;
                    }
                }
            }

            if (elempack == 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    __fp16* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const __fp16* sptr = m.row<const __fp16>(i * stride_h) + j * stride_w;

                            __fp16 sum = (__fp16)0.f;

                            for (int k = 0; k < maxk; k++)
                            {
                                __fp16 val = sptr[space_ofs[k]];
                                sum += val;
                            }

                            outptr[j] = sum / (__fp16)maxk;
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/prelu_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "prelu_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

// Declares which optional storage layouts this ARM PReLU layer accepts.
// Each capability flag is set under its own build-time guard.
PReLU_arm::PReLU_arm()
{
#if __ARM_NEON
    // packed layouts are only enabled when NEON is available
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage follows the runtime asimdhp capability query
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82

#if NCNN_BF16
    // bf16 storage is enabled whenever it is compiled in
    support_bf16_storage = true;
#endif // NCNN_BF16
}

#if __ARM_NEON
// PReLU on four fp32 lanes held in a register: lanes where x <= 0 are
// replaced by x * k, all other lanes are returned unchanged.
static inline float32x4_t prelu_f32x4(float32x4_t x, float32x4_t k)
{
    const uint32x4_t nonpositive = vcleq_f32(x, vdupq_n_f32(0.f));
    return vbslq_f32(nonpositive, vmulq_f32(x, k), x);
}

// Loads four floats from p, applies prelu_f32x4 with slope vector k and
// writes the result back to the same address.
static inline void prelu_f32x4_inplace(float* p, float32x4_t k)
{
    vst1q_f32(p, prelu_f32x4(vld1q_f32(p), k));
}
#endif // __ARM_NEON

// In-place PReLU on bottom_top_blob.
//
// 16-bit blobs are forwarded to the fp16 / bf16 variants when the matching
// storage option is enabled. Otherwise the blob is processed as fp32:
//   - elempack == 4 (NEON builds only): one slope vector per packed element
//     (dims 1) or per row / channel (dims 2 / 3) when num_slope > 1,
//     otherwise slope_data[0] broadcast to all lanes.
//   - any other elempack: one scalar slope per element / row / channel,
//     vectorised four floats at a time on NEON builds with a scalar tail.
// Note: the vector code selects on x <= 0 while the scalar code tests x < 0;
// for a finite slope a zero input still yields zero (its sign may differ).
// Always returns 0, or the return value of the variant it forwards to.
int PReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int elembits = bottom_top_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        return opt.use_fp16_arithmetic ? forward_inplace_fp16sa(bottom_top_blob, opt) : forward_inplace_fp16s(bottom_top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
    {
        return forward_inplace_bf16s(bottom_top_blob, opt);
    }
#endif

    const int dims = bottom_top_blob.dims;
    const int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
    if (elempack == 4)
    {
        if (dims == 1)
        {
            const int w = bottom_top_blob.w;
            float* base = (float*)bottom_top_blob;

            if (num_slope > 1)
            {
                const float* slopes = slope_data;

                // every packed element has its own four slopes
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    prelu_f32x4_inplace(base + i * 4, vld1q_f32(slopes + i * 4));
                }
            }
            else
            {
                float32x4_t shared_k = vdupq_n_f32(slope_data[0]);

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    prelu_f32x4_inplace(base + i * 4, shared_k);
                }
            }
        }
        else if (dims == 2)
        {
            const int w = bottom_top_blob.w;
            const int h = bottom_top_blob.h;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                float32x4_t k;
                if (num_slope > 1)
                    k = vld1q_f32((const float*)slope_data + y * 4);
                else
                    k = vdupq_n_f32(slope_data[0]);

                float* p = bottom_top_blob.row(y);
                for (int x = 0; x < w; x++, p += 4)
                {
                    prelu_f32x4_inplace(p, k);
                }
            }
        }
        else if (dims == 3)
        {
            const int channels = bottom_top_blob.c;
            const int size = bottom_top_blob.w * bottom_top_blob.h;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                float32x4_t k;
                if (num_slope > 1)
                    k = vld1q_f32((const float*)slope_data + q * 4);
                else
                    k = vdupq_n_f32(slope_data[0]);

                float* p = bottom_top_blob.channel(q);
                for (int n = 0; n < size; n++, p += 4)
                {
                    prelu_f32x4_inplace(p, k);
                }
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    if (dims == 1)
    {
        const int w = bottom_top_blob.w;
        float* data = bottom_top_blob;

        if (num_slope > 1)
        {
            const float* slopes = slope_data;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < w; i++)
            {
                if (data[i] < 0.f)
                    data[i] *= slopes[i];
            }
        }
        else
        {
            const float k = slope_data[0];

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < w; i++)
            {
                if (data[i] < 0.f)
                    data[i] *= k;
            }
        }
    }
    else if (dims == 2)
    {
        const int w = bottom_top_blob.w;
        const int h = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            float* p = bottom_top_blob.row(y);
            const float k = num_slope > 1 ? slope_data[y] : slope_data[0];

            // number of floats of this row still to be processed
            int remain = w;
#if __ARM_NEON
            float32x4_t vk = vdupq_n_f32(k);

            for (; remain >= 4; remain -= 4)
            {
                prelu_f32x4_inplace(p, vk);
                p += 4;
            }
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
                if (*p < 0.f)
                    *p *= k;

                p++;
            }
        }
    }
    else if (dims == 3)
    {
        const int channels = bottom_top_blob.c;
        const int size = bottom_top_blob.w * bottom_top_blob.h;

        const float* slopes = slope_data;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* p = bottom_top_blob.channel(q);
            const float k = num_slope > 1 ? slopes[q] : slopes[0];

            // number of floats of this channel still to be processed
            int remain = size;
#if __ARM_NEON
            float32x4_t vk = vdupq_n_f32(k);

            // 16, then 8, then 4 floats per iteration
            for (; remain >= 16; remain -= 16)
            {
                prelu_f32x4_inplace(p, vk);
                prelu_f32x4_inplace(p + 4, vk);
                prelu_f32x4_inplace(p + 8, vk);
                prelu_f32x4_inplace(p + 12, vk);
                p += 16;
            }
            for (; remain >= 8; remain -= 8)
            {
                prelu_f32x4_inplace(p, vk);
                prelu_f32x4_inplace(p + 4, vk);
                p += 8;
            }
            for (; remain >= 4; remain -= 4)
            {
                prelu_f32x4_inplace(p, vk);
                p += 4;
            }
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
                if (*p < 0)
                    *p *= k;
                p++;
            }
        }
    }

    return 0;
}

#if NCNN_BF16
#if __ARM_NEON
// Loads four bf16 values from p, widens them to fp32, replaces lanes where
// x <= 0 by x * k, narrows back to bf16 and stores to the same address.
static inline void prelu_bf16x4_inplace(unsigned short* p, float32x4_t k)
{
    float32x4_t x = bfloat2float(vld1_u16(p));
    const uint32x4_t nonpositive = vcleq_f32(x, vdupq_n_f32(0.f));
    x = vbslq_f32(nonpositive, vmulq_f32(x, k), x);
    vst1_u16(p, float2bfloat(x));
}
#endif // __ARM_NEON

// In-place PReLU for blobs stored as bf16 (accessed as unsigned short).
//
// Arithmetic is done in fp32; slopes are read from slope_data as fp32.
//   - elempack == 4 (NEON builds only): one slope vector per packed element
//     (dims 1) or per row / channel (dims 2 / 3) when num_slope > 1,
//     otherwise slope_data[0] broadcast to all lanes.
//   - any other elempack: one scalar slope per element / row / channel,
//     four values at a time on NEON builds with a scalar tail.
// Note: the vector code selects on x <= 0 while the scalar code tests x < 0.
// Always returns 0.
int PReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int dims = bottom_top_blob.dims;
    const int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
    if (elempack == 4)
    {
        if (dims == 1)
        {
            const int w = bottom_top_blob.w;
            unsigned short* base = (unsigned short*)bottom_top_blob;

            if (num_slope > 1)
            {
                const float* slopes = slope_data;

                // every packed element has its own four slopes
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    prelu_bf16x4_inplace(base + i * 4, vld1q_f32(slopes + i * 4));
                }
            }
            else
            {
                float32x4_t shared_k = vdupq_n_f32(slope_data[0]);

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    prelu_bf16x4_inplace(base + i * 4, shared_k);
                }
            }
        }
        else if (dims == 2)
        {
            const int w = bottom_top_blob.w;
            const int h = bottom_top_blob.h;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                float32x4_t k;
                if (num_slope > 1)
                    k = vld1q_f32((const float*)slope_data + y * 4);
                else
                    k = vdupq_n_f32(slope_data[0]);

                unsigned short* p = bottom_top_blob.row<unsigned short>(y);
                for (int x = 0; x < w; x++, p += 4)
                {
                    prelu_bf16x4_inplace(p, k);
                }
            }
        }
        else if (dims == 3)
        {
            const int channels = bottom_top_blob.c;
            const int size = bottom_top_blob.w * bottom_top_blob.h;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                float32x4_t k;
                if (num_slope > 1)
                    k = vld1q_f32((const float*)slope_data + q * 4);
                else
                    k = vdupq_n_f32(slope_data[0]);

                unsigned short* p = bottom_top_blob.channel(q);
                for (int n = 0; n < size; n++, p += 4)
                {
                    prelu_bf16x4_inplace(p, k);
                }
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    if (dims == 1)
    {
        const int w = bottom_top_blob.w;
        unsigned short* data = bottom_top_blob;

        if (num_slope > 1)
        {
            const float* slopes = slope_data;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < w; i++)
            {
                const float v = bfloat16_to_float32(data[i]);
                if (v < 0.f)
                    data[i] = float32_to_bfloat16(v * slopes[i]);
            }
        }
        else
        {
            const float k = slope_data[0];

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < w; i++)
            {
                const float v = bfloat16_to_float32(data[i]);
                if (v < 0.f)
                    data[i] = float32_to_bfloat16(v * k);
            }
        }
    }
    else if (dims == 2)
    {
        const int w = bottom_top_blob.w;
        const int h = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            unsigned short* p = bottom_top_blob.row<unsigned short>(y);
            const float k = num_slope > 1 ? slope_data[y] : slope_data[0];

            // number of values of this row still to be processed
            int remain = w;
#if __ARM_NEON
            float32x4_t vk = vdupq_n_f32(k);

            for (; remain >= 4; remain -= 4)
            {
                prelu_bf16x4_inplace(p, vk);
                p += 4;
            }
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
                const float v = bfloat16_to_float32(*p);
                if (v < 0.f)
                    *p = float32_to_bfloat16(v * k);

                p++;
            }
        }
    }
    else if (dims == 3)
    {
        const int channels = bottom_top_blob.c;
        const int size = bottom_top_blob.w * bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned short* p = bottom_top_blob.channel(q);
            const float k = num_slope > 1 ? slope_data[q] : slope_data[0];

            // number of values of this channel still to be processed
            int remain = size;
#if __ARM_NEON
            float32x4_t vk = vdupq_n_f32(k);

            for (; remain >= 4; remain -= 4)
            {
                prelu_bf16x4_inplace(p, vk);
                p += 4;
            }
#endif // __ARM_NEON
            for (; remain > 0; remain--)
            {
                const float v = bfloat16_to_float32(*p);
                if (v < 0.f)
                    *p = float32_to_bfloat16(v * k);

                p++;
            }
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/prelu_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_PRELU_ARM_H
#define LAYER_PRELU_ARM_H

#include "prelu.h"

namespace ncnn {

// ARM implementation of the PReLU layer, derived from the generic PReLU.
// The fp16 and bf16 variants are only declared when the corresponding
// build option (NCNN_ARM82 / NCNN_BF16) is enabled.
class PReLU_arm : public PReLU
{
public:
    // Sets the layer's capability flags.
    PReLU_arm();

    // Applies PReLU to bottom_top_blob in place; returns an int status code.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // In-place variant named for fp16 storage ("fp16s").
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    // In-place variant named for fp16 storage + arithmetic ("fp16sa").
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // In-place variant named for bf16 storage ("bf16s").
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_PRELU_ARM_H


================================================
FILE: src/layer/arm/prelu_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "prelu_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int PReLU_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    int dims = bottom_top_blob.dims;
    int elempack = bottom_top_blob.elempack;

    if (elempack == 4)
    {
        float32x4_t _zero = vdupq_n_f32(0.f);

        if (dims == 1)
        {
            int w = bottom_top_blob.w;

            if (num_slope > 1)
            {
                const float* slope = slope_data;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    __fp16* ptr = (__fp16*)bottom_top_blob + i * 4;

                    float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
                    float32x4_t _slope = vld1q_f32(slope + i * 4);
                    uint32x4_t _lemask = vcleq_f32(_p, _zero);
                    float32x4_t _ps = vmulq_f32(_p, _slope);
                    _p = vbslq_f32(_lemask, _ps, _p);
                    vst1_f16(ptr, vcvt_f16_f32(_p));
                }
            }
            else
            {
                float32x4_t _slope = vdupq_n_f32(slope_data[0]);

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    __fp16* ptr = (__fp16*)bottom_top_blob + i * 4;

                    float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
                    uint32x4_t _lemask = vcleq_f32(_p, _zero);
                    float32x4_t _ps = vmulq_f32(_p, _slope);
                    _p = vbslq_f32(_lemask, _ps, _p);
                    vst1_f16(ptr, vcvt_f16_f32(_p));
                }
            }
        }

        if (dims == 2)
        {
            int w = bottom_top_blob.w;
            int h = bottom_top_blob.h;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                __fp16* ptr = bottom_top_blob.row<__fp16>(i);
                float32x4_t _slope = num_slope > 1 ? vld1q_f32((const float*)slope_data + i * 4) : vdupq_n_f32(slope_data[0]);

                for (int j = 0; j < w; j++)
                {
                    float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
                    uint32x4_t _lemask = vcleq_f32(_p, _zero);
                    float32x4_t _ps = vmulq_f32(_p, _slope);
                    _p = vbslq_f32(_lemask, _ps, _p);
                    vst1_f16(ptr, vcvt_f16_f32(_p));

                    ptr += 4;
                }
            }
        }

        if (dims == 3)
        {
            int w = bottom_top_blob.w;
            int h = bottom_top_blob.h;
            int channels = bottom_top_blob.c;
            int size = w * h;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                __fp16* ptr = bottom_top_blob.channel(q);
                float32x4_t _slope = num_slope > 1 ? vld1q_f32((const float*)slope_data + q * 4) : vdupq_n_f32(slope_data[0]);

                for (int i = 0; i < size; i++)
                {
                    float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
                    uint32x4_t _lemask = vcleq_f32(_p, _zero);
                    float32x4_t _ps = vmulq_f32(_p, _slope);
                    _p = vbslq_f32(_lemask, _ps, _p);
                    vst1_f16(ptr, vcvt_f16_f32(_p));

                    ptr += 4;
                }
            }
        }

        return 0;
    }

    if (dims == 1)
    {
        int w = bottom_top_blob.w;

        __fp16* ptr = bottom_top_blob;

        if (num_slope > 1)
        {
            const float* slope = slope_data;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < w; i++)
            {
                float v = (float)ptr[i];
                if (v < 0.f)
                    ptr[i] = (__fp16)(v * slope[i]);
            }
        }
        else
        {
            const float slope = slope_data[0];

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < w; i++)
            {
                float v = (float)ptr[i];
                if (v < 0.f)
                    ptr[i] = (__fp16)(v * slope);
            }
        }
    }

    if (dims == 2)
    {
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            __fp16* ptr = bottom_top_blob.row<__fp16>(i);

            const float slope = num_slope > 1 ? slope_data[i] : slope_data[0];

            float32x4_t _zero = vdupq_n_f32(0.f);
            float32x4_t _slope = vdupq_n_f32(slope);

            int j = 0;
            for (; j + 3 < w; j += 4)
            {
                float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
                uint32x4_t _lemask = vcleq_f32(_p, _zero);
                float32x4_t _ps = vmulq_f32(_p, _slope);
                _p = vbslq_f32(_lemask, _ps, _p);
                vst1_f16(ptr, vcvt_f16_f32(_p));

                ptr += 4;
            }
            for (; j < w; j++)
            {
                float v = (float)*ptr;
                if (v < 0.f)
                    *ptr = (__fp16)(v * slope);

                ptr++;
            }
        }
    }

    if (dims == 3)
    {
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;
        int channels = bottom_top_blob.c;
        int size = w * h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* ptr = bottom_top_blob.channel(q);

            const float slope = num_slope > 1 ? slope_data[q] : slope_data[0];

            float32x4_t _zero = vdupq_n_f32(0.f);
            float32x4_t _slope = vdupq_n_f32(slope);

            int j = 0;
            for (; j + 3 < size; j += 4)
            {
                float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
                uint32x4_t _lemask = vcleq_f32(_p, _zero);
                float32x4_t _ps = vmulq_f32(_p, _slope);
                _p = vbslq_f32(_lemask, _ps, _p);
                vst1_f16(ptr, vcvt_f16_f32(_p));

                ptr += 4;
            }
            for (; j < size; j++)
            {
                float v = (float)*ptr;
                if (v < 0.f)
                    *ptr = (__fp16)(v * slope);

                ptr++;
            }
        }
    }

    return 0;
}

// In-place PReLU for fp16 blobs using fp16 vector arithmetic.
//
// Three layouts are handled, selected by elempack:
//   - elempack == 8: eight fp16 lanes per element (float16x8_t)
//   - elempack == 4: four fp16 lanes per element (float16x4_t)
//   - otherwise:     one slope per element / row / channel, processed four
//                    values at a time with a scalar tail
// Slopes are read from slope_data as fp32 and converted to fp16 where a
// vector multiply is used. When num_slope > 1 a distinct slope (or slope
// vector) is used per element (dims 1), row (dims 2) or channel (dims 3);
// otherwise slope_data[0] is used everywhere.
// The vector paths select x * slope on lanes where x <= 0, while the scalar
// code tests v < 0.
// NOTE(review): the `_MSC_VER && !__clang__` branches build the same values
// through different intrinsics; the reason is not visible in this file --
// presumably an MSVC compiler workaround, confirm before changing them.
// Always returns 0.
int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
{
    int dims = bottom_top_blob.dims;
    int elempack = bottom_top_blob.elempack;

    // ---- packed layout, 8 x fp16 per element ----
    if (elempack == 8)
    {
        float16x8_t _zero = vdupq_n_f16(0.f);

        if (dims == 1)
        {
            int w = bottom_top_blob.w;

            if (num_slope > 1)
            {
                const float* slope = slope_data;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    __fp16* ptr = (__fp16*)bottom_top_blob + i * 8;

                    float16x8_t _p = vld1q_f16(ptr);
                    // eight fp32 slopes for this element, narrowed to fp16 in two halves
                    float16x8_t _slope = vcombine_f16(vcvt_f16_f32(vld1q_f32((const float*)slope + i * 8)), vcvt_f16_f32(vld1q_f32((const float*)slope + i * 8 + 4)));
                    uint16x8_t _lemask = vcleq_f16(_p, _zero);
                    float16x8_t _ps = vmulq_f16(_p, _slope);
                    // take the scaled value where the mask is set, the input elsewhere
                    _p = vbslq_f16(_lemask, _ps, _p);
                    vst1q_f16(ptr, _p);
                }
            }
            else
            {
                // broadcast the single slope to all eight lanes
#if defined(_MSC_VER) && !defined(__clang__)
                float16x4_t _slope0 = vcvt_f16_f32(vdupq_n_f32(slope_data[0]));
                float16x8_t _slope = vcombine_f16(_slope0, _slope0);
#else
                float16x8_t _slope = vdupq_n_f16((__fp16)slope_data[0]);
#endif

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    __fp16* ptr = (__fp16*)bottom_top_blob + i * 8;

                    float16x8_t _p = vld1q_f16(ptr);
                    uint16x8_t _lemask = vcleq_f16(_p, _zero);
                    float16x8_t _ps = vmulq_f16(_p, _slope);
                    _p = vbslq_f16(_lemask, _ps, _p);
                    vst1q_f16(ptr, _p);
                }
            }
        }

        if (dims == 2)
        {
            int w = bottom_top_blob.w;
            int h = bottom_top_blob.h;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                __fp16* ptr = bottom_top_blob.row<__fp16>(i);
                // one slope vector per row, or the broadcast single slope
                float16x8_t _slope = num_slope > 1 ? vcombine_f16(vcvt_f16_f32(vld1q_f32((const float*)slope_data + i * 8)), vcvt_f16_f32(vld1q_f32((const float*)slope_data + i * 8 + 4))) : vdupq_n_f16((__fp16)slope_data[0]);

                for (int j = 0; j < w; j++)
                {
                    float16x8_t _p = vld1q_f16(ptr);
                    uint16x8_t _lemask = vcleq_f16(_p, _zero);
                    float16x8_t _ps = vmulq_f16(_p, _slope);
                    _p = vbslq_f16(_lemask, _ps, _p);
                    vst1q_f16(ptr, _p);

                    ptr += 8;
                }
            }
        }

        if (dims == 3)
        {
            int w = bottom_top_blob.w;
            int h = bottom_top_blob.h;
            int channels = bottom_top_blob.c;
            int size = w * h;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                __fp16* ptr = bottom_top_blob.channel(q);
                // one slope vector per channel, or the broadcast single slope
                float16x8_t _slope = num_slope > 1 ? vcombine_f16(vcvt_f16_f32(vld1q_f32((const float*)slope_data + q * 8)), vcvt_f16_f32(vld1q_f32((const float*)slope_data + q * 8 + 4))) : vdupq_n_f16((__fp16)slope_data[0]);

                for (int i = 0; i < size; i++)
                {
                    float16x8_t _p = vld1q_f16(ptr);
                    uint16x8_t _lemask = vcleq_f16(_p, _zero);
                    float16x8_t _ps = vmulq_f16(_p, _slope);
                    _p = vbslq_f16(_lemask, _ps, _p);
                    vst1q_f16(ptr, _p);

                    ptr += 8;
                }
            }
        }

        return 0;
    }

    // ---- packed layout, 4 x fp16 per element ----
    if (elempack == 4)
    {
        if (dims == 1)
        {
            // On MSVC the zero constant is kept as a 128-bit vector and its low
            // half is extracted inside the loops below.
#if defined(_MSC_VER) && !defined(__clang__)
            float16x8_t _zero = vdupq_n_f16(0.f);
#else
            float16x4_t _zero = vdup_n_f16(0.f);
#endif

            int w = bottom_top_blob.w;

            if (num_slope > 1)
            {
                const float* slope = slope_data;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    __fp16* ptr = (__fp16*)bottom_top_blob + i * 4;

                    float16x4_t _p = vld1_f16(ptr);
                    // four fp32 slopes for this element, narrowed to fp16
                    float16x4_t _slope = vcvt_f16_f32(vld1q_f32(slope + i * 4));
#if defined(_MSC_VER) && !defined(__clang__)
                    uint16x4_t _lemask = vcle_f16(_p, vget_low_f16(_zero));
#else
                    uint16x4_t _lemask = vcle_f16(_p, _zero);
#endif
                    float16x4_t _ps = vmul_f16(_p, _slope);
                    _p = vbsl_f16(_lemask, _ps, _p);
                    vst1_f16(ptr, _p);
                }
            }
            else
            {
                // broadcast single slope; 128-bit on MSVC, low half used in the loop
#if defined(_MSC_VER) && !defined(__clang__)
                float16x8_t _slope = vdupq_n_f16((__fp16)slope_data[0]);
#else
                float16x4_t _slope = vdup_n_f16((__fp16)slope_data[0]);
#endif

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    __fp16* ptr = (__fp16*)bottom_top_blob + i * 4;

                    float16x4_t _p = vld1_f16(ptr);
#if defined(_MSC_VER) && !defined(__clang__)
                    uint16x4_t _lemask = vcle_f16(_p, vget_low_f16(_zero));
                    float16x4_t _ps = vmul_f16(_p, vget_low_f16(_slope));
#else
                    uint16x4_t _lemask = vcle_f16(_p, _zero);
                    float16x4_t _ps = vmul_f16(_p, _slope);
#endif
                    _p = vbsl_f16(_lemask, _ps, _p);
                    vst1_f16(ptr, _p);
                }
            }
        }

        if (dims == 2)
        {
            int w = bottom_top_blob.w;
            int h = bottom_top_blob.h;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                __fp16* ptr = bottom_top_blob.row<__fp16>(i);
                float16x4_t _zero = vdup_n_f16(0.f);
                // one slope vector per row, or the broadcast single slope
                float16x4_t _slope = num_slope > 1 ? vcvt_f16_f32(vld1q_f32((const float*)slope_data + i * 4)) : vdup_n_f16((__fp16)slope_data[0]);

                for (int j = 0; j < w; j++)
                {
                    float16x4_t _p = vld1_f16(ptr);
                    uint16x4_t _lemask = vcle_f16(_p, _zero);
                    float16x4_t _ps = vmul_f16(_p, _slope);
                    _p = vbsl_f16(_lemask, _ps, _p);
                    vst1_f16(ptr, _p);

                    ptr += 4;
                }
            }
        }

        if (dims == 3)
        {
            int w = bottom_top_blob.w;
            int h = bottom_top_blob.h;
            int channels = bottom_top_blob.c;
            int size = w * h;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                __fp16* ptr = bottom_top_blob.channel(q);
                float16x4_t _zero = vdup_n_f16(0.f);
                // one slope vector per channel, or the broadcast single slope
                float16x4_t _slope = num_slope > 1 ? vcvt_f16_f32(vld1q_f32((const float*)slope_data + q * 4)) : vdup_n_f16((__fp16)slope_data[0]);

                for (int i = 0; i < size; i++)
                {
                    float16x4_t _p = vld1_f16(ptr);
                    uint16x4_t _lemask = vcle_f16(_p, _zero);
                    float16x4_t _ps = vmul_f16(_p, _slope);
                    _p = vbsl_f16(_lemask, _ps, _p);
                    vst1_f16(ptr, _p);

                    ptr += 4;
                }
            }
        }

        return 0;
    }

    // ---- remaining layouts: one scalar slope per element / row / channel ----
    if (dims == 1)
    {
        int w = bottom_top_blob.w;

        __fp16* ptr = bottom_top_blob;

        if (num_slope > 1)
        {
            const float* slope = slope_data;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < w; i++)
            {
                // value is widened to float; the product is narrowed back on store
                float v = (float)ptr[i];
                if (v < (__fp16)0.f)
                    ptr[i] = (__fp16)(v * slope[i]);
            }
        }
        else
        {
            const float slope = slope_data[0];

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < w; i++)
            {
                float v = (float)ptr[i];
                if (v < (__fp16)0.f)
                    ptr[i] = (__fp16)(v * slope);
            }
        }
    }

    if (dims == 2)
    {
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            __fp16* ptr = bottom_top_blob.row<__fp16>(i);

            const float slope = num_slope > 1 ? slope_data[i] : slope_data[0];

            float16x4_t _zero = vdup_n_f16(0.f);
            // row slope broadcast to four fp16 lanes
#if defined(_MSC_VER) && !defined(__clang__)
            float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(slope));
#else
            float16x4_t _slope = vdup_n_f16((__fp16)slope);
#endif

            int j = 0;
            // four values per iteration
            for (; j + 3 < w; j += 4)
            {
                float16x4_t _p = vld1_f16(ptr);
                uint16x4_t _lemask = vcle_f16(_p, _zero);
                float16x4_t _ps = vmul_f16(_p, _slope);
                _p = vbsl_f16(_lemask, _ps, _p);
                vst1_f16(ptr, _p);

                ptr += 4;
            }
            // scalar tail for the remaining values of the row
            for (; j < w; j++)
            {
                float v = (float)*ptr;
                if (v < (__fp16)0.f)
                    *ptr = (__fp16)(v * slope);

                ptr++;
            }
        }
    }

    if (dims == 3)
    {
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;
        int channels = bottom_top_blob.c;
        int size = w * h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* ptr = bottom_top_blob.channel(q);

            const float slope = num_slope > 1 ? slope_data[q] : slope_data[0];

            float16x4_t _zero = vdup_n_f16(0.f);
            // channel slope broadcast to four fp16 lanes
#if defined(_MSC_VER) && !defined(__clang__)
            float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(slope));
#else
            float16x4_t _slope = vdup_n_f16((__fp16)slope);
#endif

            int j = 0;
            // four values per iteration
            for (; j + 3 < size; j += 4)
            {
                float16x4_t _p = vld1_f16(ptr);
                uint16x4_t _lemask = vcle_f16(_p, _zero);
                float16x4_t _ps = vmul_f16(_p, _slope);
                _p = vbsl_f16(_lemask, _ps, _p);
                vst1_f16(ptr, _p);

                ptr += 4;
            }
            // scalar tail for the remaining values of the channel
            for (; j < size; j++)
            {
                float v = (float)*ptr;
                if (v < (__fp16)0.f)
                    *ptr = (__fp16)(v * slope);

                ptr++;
            }
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/quantize_arm.cpp
================================================
// Copyright 2018 Tencent
// Copyright 2019 BUG1989
// SPDX-License-Identifier: BSD-3-Clause

#include "quantize_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"

#include "cpu.h"

namespace ncnn {

// Set the capability flags of the ARM quantize layer according to the
// features that were compiled in.
Quantize_arm::Quantize_arm()
{
#if NCNN_BF16
    // bf16 storage is accepted whenever NCNN_BF16 is compiled in
    support_bf16_storage = true;
#endif // NCNN_BF16

#if __ARM_NEON
    // packed layouts are only advertised on NEON builds
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage follows the result of cpu_support_arm_asimdhp()
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82
}

// Multiply fp32 values by a scale and convert them to int8 with float2int8().
//
// ptr        - input, elemcount * elempack contiguous floats
// s8ptr      - output, one signed char per input float
// scale_data - scale_data[0] applies to every value, except that the NEON
//              loops use four per-lane scales when scale_data.w > 1 and
//              elempack == 4
// elemcount  - number of packed elements
// elempack   - number of lanes per packed element
static void quantize(const float* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack)
{
    const int total = elemcount * elempack;

    // first scale: broadcast default of the vector loops and the only scale
    // used by the scalar remainder
    const float scale0 = scale_data[0];

    int n = 0;
#if __ARM_NEON
    // distinct lane scales are only loaded for pack4 data with more than one scale
    const bool per_lane = scale_data.w > 1 && elempack == 4;
    const float32x4_t _s = per_lane ? vld1q_f32((const float*)scale_data) : vdupq_n_f32(scale0);

    // 16 values per iteration, stored as one 128-bit vector
    for (; n + 15 < total; n += 16)
    {
        const float* src = ptr + n;
        const float32x4_t _a = vmulq_f32(vld1q_f32(src), _s);
        const float32x4_t _b = vmulq_f32(vld1q_f32(src + 4), _s);
        const float32x4_t _c = vmulq_f32(vld1q_f32(src + 8), _s);
        const float32x4_t _d = vmulq_f32(vld1q_f32(src + 12), _s);
        vst1q_s8(s8ptr + n, vcombine_s8(float2int8(_a, _b), float2int8(_c, _d)));
    }

    // 8 values per iteration, stored as one 64-bit vector
    for (; n + 7 < total; n += 8)
    {
        const float* src = ptr + n;
        const float32x4_t _a = vmulq_f32(vld1q_f32(src), _s);
        const float32x4_t _b = vmulq_f32(vld1q_f32(src + 4), _s);
        vst1_s8(s8ptr + n, float2int8(_a, _b));
    }

    // 4 values per iteration: the same vector feeds both halves of the
    // conversion and only lanes 0-3 are written out
    for (; n + 3 < total; n += 4)
    {
        const float32x4_t _v = vmulq_f32(vld1q_f32(ptr + n), _s);
        const int8x8_t _q = float2int8(_v, _v);
        signed char* dst = s8ptr + n;
        dst[0] = vget_lane_s8(_q, 0);
        dst[1] = vget_lane_s8(_q, 1);
        dst[2] = vget_lane_s8(_q, 2);
        dst[3] = vget_lane_s8(_q, 3);
    }
#endif // __ARM_NEON

    // scalar remainder (and the whole range on non-NEON builds)
    for (; n < total; n++)
    {
        s8ptr[n] = float2int8(ptr[n] * scale0);
    }
}

#if __ARM_NEON
static void quantize_pack4to8(const float* ptr0, const float* ptr1, signed char* s8ptr, const Mat& scale_data, int elemcount)
{
    const int scale_data_size = scale_data.w;

    // NCNN_LOGE("quantize_pack4to8 %d   %d", scale_data_size, elemcount);

    float scale = scale_data[0];
    float32x4_t _scale0 = vdupq_n_f32(scale);
    float32x4_t _scale1 = _scale0;
    if (scale_data_size > 1)
    {
        _scale0 = vld1q_f32((const float*)scale_data);
        _scale1 = vld1q_f32((const float*)scale_data + 4);
    }

    int i = 0;
    for (; i + 1 < elemcount; i += 2)
    {
        float32x4_t _v0 = vld1q_f32(ptr0);
        float32x4_t _v1 = vld1q_f32(ptr1);
        float32x4_t _v2 = vld1q_f32(ptr0 + 4);
        float32x4_t _v3 = vld1q_f32(ptr1 + 4);
        _v0 = vmulq_f32(_v0, _scale0);
        _v1 = vmulq_f32(_v1, _scale1);
        _v2 = vmulq_f32(_v2, _scale0);
        _v3 = vmulq_f32(_v3, _scale1);
        vst1q_s8(s8ptr, vcombine_s8(float2int8(_v0, _v1), float2int8(_v2, _v3)));
        ptr0 += 8;
        ptr1 += 8;
        s8ptr += 16;
    }
    for (; i < elemcount; i++)
    {
        float32x4_t _v0 = vld1q_f32(ptr0);
        float32x4_t _v1 = vld1q_f32(ptr1);
        _v0 = vmulq_f32(_v0, _scale0);
        _v1 = vmulq_f32(_v1, _scale1);
        vst1_s8(s8ptr, float2int8(_v0, _v1));
        ptr0 += 4;
        ptr1 += 4;
        s8ptr += 8;
    }
}

static void quantize_pack4to1(const float* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount)
{
    const int scale_data_size = scale_data.w;

    // NCNN_LOGE("quantize_pack4to1 %d   %d", scale_data_size, elemcount);

    float scale = scale_data[0];
    float32x4_t _scale = vdupq_n_f32(scale);
    if (scale_data_size > 1)
    {
        _scale = vld1q_f32((const float*)scale_data);
    }

    int i = 0;
    for (; i + 7 < elemcount; i += 8)
    {
        float32x4_t _v0 = vld1q_f32(ptr);
        float32x4_t _v1 = vld1q_f32(ptr + 4);
        float32x4_t _v2 = vld1q_f32(ptr + 8);
        float32x4_t _v3 = vld1q_f32(ptr + 12);
        float32x4_t _v4 = vld1q_f32(ptr + 16);
        float32x4_t _v5 = vld1q_f32(ptr + 20);
        float32x4_t _v6 = vld1q_f32(ptr + 24);
        float32x4_t _v7 = vld1q_f32(ptr + 28);
        _v0 = vmulq_f32(_v0, _scale);
        _v1 = vmulq_f32(_v1, _scale);
        _v2 = vmulq_f32(_v2, _scale);
        _v3 = vmulq_f32(_v3, _scale);
        _v4 = vmulq_f32(_v4, _scale);
        _v5 = vmulq_f32(_v5, _scale);
        _v6 = vmulq_f32(_v6, _scale);
        _v7 = vmulq_f32(_v7, _scale);
        int8x8_t v0 = float2int8(_v0, _v1);
        int8x8_t v1 = float2int8(_v2, _v3);
        int8x8_t v2 = float2int8(_v4, _v5);
        int8x8_t v3 = float2int8(_v6, _v7);
        int8x16_t v01 = vcombine_s8(v0, v1);
        int8x16_t v23 = vcombine_s8(v2, v3);
        int8x16x2_t v0213 = vuzpq_s8(v01, v23);
        int8x16x2_t v0123 = vuzpq_s8(v0213.val[0], v0213.val[1]);
        vst1_s8(s8ptr0, vget_low_s8(v0123.val[0]));
        vst1_s8(s8ptr1, vget_high_s8(v0123.val[0]));
        vst1_s8(s8ptr2, vget_low_s8(v0123.val[1]));
        vst1_s8(s8ptr3, vget_high_s8(v0123.val[1]));
        ptr += 32;
        s8ptr0 += 8;
        s8ptr1 += 8;
        s8ptr2 += 8;
        s8ptr3 += 8;
    }
    for (; i < elemcount; i++)
    {
        float32x4_t _v = vld1q_f32(ptr);
        _v = vmulq_f32(_v, _scale);
        int8x8_t v = float2int8(_v, _v);
        s8ptr0[0] = vget_lane_s8(v, 0);
        s8ptr1[0] = vget_lane_s8(v, 1);
        s8ptr2[0] = vget_lane_s8(v, 2);
        s8ptr3[0] = vget_lane_s8(v, 3);
        ptr += 4;
        s8ptr0 += 1;
        s8ptr1 += 1;
        s8ptr2 += 1;
        s8ptr3 += 1;
    }
}
#endif // __ARM_NEON

int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        if (opt.use_fp16_arithmetic)
            return forward_fp16sa(bottom_blob, top_blob, opt);
        else
            return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
#endif

    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;

    if (dims == 1)
    {
        int out_elempack = 1;
#if __ARM_NEON
        if (opt.use_packing_layout)
        {
            out_elempack = w * elempack % 8 == 0 ? 8 : 1;
        }
#endif
        const int outw = w * elempack / out_elempack;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int wp = std::max(1, w / opt.num_threads);
        const int nn_w = (w + wp - 1) / wp;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_w; ii++)
        {
            const int i = ii * wp;

            const float* ptr = (const float*)bottom_blob + i * elempack;
            signed char* s8ptr = (signed char*)top_blob + i * elempack;

            // assert scale_data_size == 1

            const int size = std::min(w - i, wp) * elempack;

            quantize(ptr, s8ptr, scale_data, size, 1);
        }
    }

    if (dims == 2)
    {
        int out_elempack = 1;
#if __ARM_NEON
        if (opt.use_packing_layout)
        {
            out_elempack = h * elempack % 8 == 0 ? 8 : 1;
        }
#endif
        const int outh = h * elempack / out_elempack;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __ARM_NEON
        if (elempack == 4 && out_elempack == 8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < outh; i++)
            {
                const float* ptr0 = bottom_blob.row(i * 2);
                const float* ptr1 = bottom_blob.row(i * 2 + 1);
                signed char* s8ptr = top_blob.row<signed char>(i);

                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * out_elempack, out_elempack) : scale_data;

                quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_i, w);
            }
        }
        if (elempack == 4 && out_elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const float* ptr = bottom_blob.row(i);
                signed char* s8ptr0 = top_blob.row<signed char>(i * 4);
                signed char* s8ptr1 = top_blob.row<signed char>(i * 4 + 1);
                signed char* s8ptr2 = top_blob.row<signed char>(i * 4 + 2);
                signed char* s8ptr3 = top_blob.row<signed char>(i * 4 + 3);

                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;

                quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w);
            }
        }
#endif // __ARM_NEON
        if (elempack == out_elempack)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const float* ptr = bottom_blob.row(i);
                signed char* s8ptr = top_blob.row<signed char>(i);

                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;

                quantize(ptr, s8ptr, scale_data_i, w, elempack);
            }
        }
    }

    if (dims == 3)
    {
        int out_elempack = 1;
#if __ARM_NEON
        if (opt.use_packing_layout)
        {
            out_elempack = channels * elempack % 8 == 0 ? 8 : 1;
        }
#endif
        const int outc = channels * elempack / out_elempack;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __ARM_NEON
        if (elempack == 4 && out_elempack == 8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < outc; q++)
            {
                const float* ptr0 = bottom_blob.channel(q * 2);
                const float* ptr1 = bottom_blob.channel(q * 2 + 1);
                signed char* s8ptr = top_blob.channel(q);

                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * out_elempack, out_elempack) : scale_data;

                quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_q, w * h);
            }
        }
        if (elempack == 4 && out_elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                signed char* s8ptr0 = top_blob.channel(q * 4);
                signed char* s8ptr1 = top_blob.channel(q * 4 + 1);
                signed char* s8ptr2 = top_blob.channel(q * 4 + 2);
                signed char* s8ptr3 = top_blob.channel(q * 4 + 3);

                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;

                quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h);
            }
        }
#endif // __ARM_NEON
        if (elempack == out_elempack)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                signed char* s8ptr = top_blob.channel(q);

                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;

                quantize(ptr, s8ptr, scale_data_q, w * h, elempack);
            }
        }
    }

    return 0;
}

#if NCNN_BF16
// Convert bf16 values to fp32, multiply them by a scale and convert the
// result to int8 with float2int8().
//
// ptr        - input, elemcount * elempack contiguous bf16 values
// s8ptr      - output, one signed char per input value
// scale_data - scale_data[0] applies to every value, except that the NEON
//              loops use four per-lane scales when scale_data.w > 1 and
//              elempack == 4
// elemcount  - number of packed elements
// elempack   - number of lanes per packed element
static void quantize_bf16s(const unsigned short* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack)
{
    const int total = elemcount * elempack;

    // first scale: broadcast default of the vector loops and the only scale
    // used by the scalar remainder
    const float scale0 = scale_data[0];

    int n = 0;
#if __ARM_NEON
    // distinct lane scales are only loaded for pack4 data with more than one scale
    const bool per_lane = scale_data.w > 1 && elempack == 4;
    const float32x4_t _s = per_lane ? vld1q_f32((const float*)scale_data) : vdupq_n_f32(scale0);

    // 16 values per iteration, stored as one 128-bit vector
    for (; n + 15 < total; n += 16)
    {
        const uint16x8_t _raw_lo = vld1q_u16(ptr + n);
        const uint16x8_t _raw_hi = vld1q_u16(ptr + n + 8);
        const float32x4_t _a = vmulq_f32(bfloat2float(vget_low_u16(_raw_lo)), _s);
        const float32x4_t _b = vmulq_f32(bfloat2float(vget_high_u16(_raw_lo)), _s);
        const float32x4_t _c = vmulq_f32(bfloat2float(vget_low_u16(_raw_hi)), _s);
        const float32x4_t _d = vmulq_f32(bfloat2float(vget_high_u16(_raw_hi)), _s);
        vst1q_s8(s8ptr + n, vcombine_s8(float2int8(_a, _b), float2int8(_c, _d)));
    }

    // 8 values per iteration, stored as one 64-bit vector
    for (; n + 7 < total; n += 8)
    {
        const uint16x8_t _raw = vld1q_u16(ptr + n);
        const float32x4_t _a = vmulq_f32(bfloat2float(vget_low_u16(_raw)), _s);
        const float32x4_t _b = vmulq_f32(bfloat2float(vget_high_u16(_raw)), _s);
        vst1_s8(s8ptr + n, float2int8(_a, _b));
    }

    // 4 values per iteration: the same vector feeds both halves of the
    // conversion and only lanes 0-3 are written out
    for (; n + 3 < total; n += 4)
    {
        const float32x4_t _v = vmulq_f32(bfloat2float(vld1_u16(ptr + n)), _s);
        const int8x8_t _q = float2int8(_v, _v);
        signed char* dst = s8ptr + n;
        dst[0] = vget_lane_s8(_q, 0);
        dst[1] = vget_lane_s8(_q, 1);
        dst[2] = vget_lane_s8(_q, 2);
        dst[3] = vget_lane_s8(_q, 3);
    }
#endif // __ARM_NEON

    // scalar remainder (and the whole range on non-NEON builds)
    for (; n < total; n++)
    {
        s8ptr[n] = float2int8(bfloat16_to_float32(ptr[n]) * scale0);
    }
}

#if __ARM_NEON
static void quantize_pack4to8_bf16s(const unsigned short* ptr0, const unsigned short* ptr1, signed char* s8ptr, const Mat& scale_data, int elemcount)
{
    const int scale_data_size = scale_data.w;

    // NCNN_LOGE("quantize_pack4to8_bf16s %d   %d", scale_data_size, elemcount);

    float scale = scale_data[0];
    float32x4_t _scale0 = vdupq_n_f32(scale);
    float32x4_t _scale1 = _scale0;
    if (scale_data_size > 1)
    {
        _scale0 = vld1q_f32((const float*)scale_data);
        _scale1 = vld1q_f32((const float*)scale_data + 4);
    }

    int i = 0;
    for (; i + 1 < elemcount; i += 2)
    {
        uint16x8_t _v02 = vld1q_u16(ptr0);
        uint16x8_t _v13 = vld1q_u16(ptr1);
        float32x4_t _v0 = bfloat2float(vget_low_u16(_v02));
        float32x4_t _v1 = bfloat2float(vget_low_u16(_v13));
        float32x4_t _v2 = bfloat2float(vget_high_u16(_v02));
        float32x4_t _v3 = bfloat2float(vget_high_u16(_v13));
        _v0 = vmulq_f32(_v0, _scale0);
        _v1 = vmulq_f32(_v1, _scale1);
        _v2 = vmulq_f32(_v2, _scale0);
        _v3 = vmulq_f32(_v3, _scale1);
        vst1q_s8(s8ptr, vcombine_s8(float2int8(_v0, _v1), float2int8(_v2, _v3)));
        ptr0 += 8;
        ptr1 += 8;
        s8ptr += 16;
    }
    for (; i < elemcount; i++)
    {
        float32x4_t _v0 = bfloat2float(vld1_u16(ptr0));
        float32x4_t _v1 = bfloat2float(vld1_u16(ptr1));
        _v0 = vmulq_f32(_v0, _scale0);
        _v1 = vmulq_f32(_v1, _scale1);
        vst1_s8(s8ptr, float2int8(_v0, _v1));
        ptr0 += 4;
        ptr1 += 4;
        s8ptr += 8;
    }
}

static void quantize_pack4to1_bf16s(const unsigned short* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount)
{
    const int scale_data_size = scale_data.w;

    // NCNN_LOGE("quantize_pack4to1_bf16s %d   %d", scale_data_size, elemcount);

    float scale = scale_data[0];
    float32x4_t _scale = vdupq_n_f32(scale);
    if (scale_data_size > 1)
    {
        _scale = vld1q_f32((const float*)scale_data);
    }

    int i = 0;
    for (; i + 7 < elemcount; i += 8)
    {
        uint16x8_t _v01 = vld1q_u16(ptr);
        uint16x8_t _v23 = vld1q_u16(ptr + 8);
        uint16x8_t _v45 = vld1q_u16(ptr + 16);
        uint16x8_t _v67 = vld1q_u16(ptr + 24);
        float32x4_t _v0 = bfloat2float(vget_low_u16(_v01));
        float32x4_t _v1 = bfloat2float(vget_high_u16(_v01));
        float32x4_t _v2 = bfloat2float(vget_low_u16(_v23));
        float32x4_t _v3 = bfloat2float(vget_high_u16(_v23));
        float32x4_t _v4 = bfloat2float(vget_low_u16(_v45));
        float32x4_t _v5 = bfloat2float(vget_high_u16(_v45));
        float32x4_t _v6 = bfloat2float(vget_low_u16(_v67));
        float32x4_t _v7 = bfloat2float(vget_high_u16(_v67));
        _v0 = vmulq_f32(_v0, _scale);
        _v1 = vmulq_f32(_v1, _scale);
        _v2 = vmulq_f32(_v2, _scale);
        _v3 = vmulq_f32(_v3, _scale);
        _v4 = vmulq_f32(_v4, _scale);
        _v5 = vmulq_f32(_v5, _scale);
        _v6 = vmulq_f32(_v6, _scale);
        _v7 = vmulq_f32(_v7, _scale);
        int8x8_t v0 = float2int8(_v0, _v1);
        int8x8_t v1 = float2int8(_v2, _v3);
        int8x8_t v2 = float2int8(_v4, _v5);
        int8x8_t v3 = float2int8(_v6, _v7);
        int8x16_t v01 = vcombine_s8(v0, v1);
        int8x16_t v23 = vcombine_s8(v2, v3);
        int8x16x2_t v0213 = vuzpq_s8(v01, v23);
        int8x16x2_t v0123 = vuzpq_s8(v0213.val[0], v0213.val[1]);
        vst1_s8(s8ptr0, vget_low_s8(v0123.val[0]));
        vst1_s8(s8ptr1, vget_high_s8(v0123.val[0]));
        vst1_s8(s8ptr2, vget_low_s8(v0123.val[1]));
        vst1_s8(s8ptr3, vget_high_s8(v0123.val[1]));
        ptr += 32;
        s8ptr0 += 8;
        s8ptr1 += 8;
        s8ptr2 += 8;
        s8ptr3 += 8;
    }
    for (; i < elemcount; i++)
    {
        float32x4_t _v = bfloat2float(vld1_u16(ptr));
        _v = vmulq_f32(_v, _scale);
        int8x8_t v = float2int8(_v, _v);
        s8ptr0[0] = vget_lane_s8(v, 0);
        s8ptr1[0] = vget_lane_s8(v, 1);
        s8ptr2[0] = vget_lane_s8(v, 2);
        s8ptr3[0] = vget_lane_s8(v, 3);
        ptr += 4;
        s8ptr0 += 1;
        s8ptr1 += 1;
        s8ptr2 += 1;
        s8ptr3 += 1;
    }
}
#endif // __ARM_NEON

int Quantize_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;

    if (dims == 1)
    {
        int out_elempack = 1;
#if __ARM_NEON
        if (opt.use_packing_layout)
        {
            out_elempack = w * elempack % 8 == 0 ? 8 : 1;
        }
#endif
        const int outw = w * elempack / out_elempack;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int wp = std::max(1, w / opt.num_threads);
        const int nn_w = (w + wp - 1) / wp;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_w; ii++)
        {
            const int i = ii * wp;

            const unsigned short* ptr = (const unsigned short*)bottom_blob + i * elempack;
            signed char* s8ptr = (signed char*)top_blob + i * elempack;

            // assert scale_data_size == 1

            const int size = std::min(w - i, wp) * elempack;

            quantize_bf16s(ptr, s8ptr, scale_data, size, 1);
        }
    }

    if (dims == 2)
    {
        int out_elempack = 1;
#if __ARM_NEON
        if (opt.use_packing_layout)
        {
            out_elempack = h * elempack % 8 == 0 ? 8 : 1;
        }
#endif
        const int outh = h * elempack / out_elempack;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __ARM_NEON
        if (elempack == 4 && out_elempack == 8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < outh; i++)
            {
                const unsigned short* ptr0 = bottom_blob.row<const unsigned short>(i * 2);
                const unsigned short* ptr1 = bottom_blob.row<const unsigned short>(i * 2 + 1);
                signed char* s8ptr = top_blob.row<signed char>(i);

                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * out_elempack, out_elempack) : scale_data;

                quantize_pack4to8_bf16s(ptr0, ptr1, s8ptr, scale_data_i, w);
            }
        }
        if (elempack == 4 && out_elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const unsigned short* ptr = bottom_blob.row<const unsigned short>(i);
                signed char* s8ptr0 = top_blob.row<signed char>(i * 4);
                signed char* s8ptr1 = top_blob.row<signed char>(i * 4 + 1);
                signed char* s8ptr2 = top_blob.row<signed char>(i * 4 + 2);
                signed char* s8ptr3 = top_blob.row<signed char>(i * 4 + 3);

                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;

                quantize_pack4to1_bf16s(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w);
            }
        }
#endif // __ARM_NEON
        if (elempack == out_elempack)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const unsigned short* ptr = bottom_blob.row<const unsigned short>(i);
                signed char* s8ptr = top_blob.row<signed char>(i);

                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;

                quantize_bf16s(ptr, s8ptr, scale_data_i, w, elempack);
            }
        }
    }

    if (dims == 3)
    {
        int out_elempack = 1;
#if __ARM_NEON
        if (opt.use_packing_layout)
        {
            out_elempack = channels * elempack % 8 == 0 ? 8 : 1;
        }
#endif
        const int outc = channels * elempack / out_elempack;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __ARM_NEON
        if (elempack == 4 && out_elempack == 8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < outc; q++)
            {
                const unsigned short* ptr0 = bottom_blob.channel(q * 2);
                const unsigned short* ptr1 = bottom_blob.channel(q * 2 + 1);
                signed char* s8ptr = top_blob.channel(q);

                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * out_elempack, out_elempack) : scale_data;

                quantize_pack4to8_bf16s(ptr0, ptr1, s8ptr, scale_data_q, w * h);
            }
        }
        if (elempack == 4 && out_elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* ptr = bottom_blob.channel(q);
                signed char* s8ptr0 = top_blob.channel(q * 4);
                signed char* s8ptr1 = top_blob.channel(q * 4 + 1);
                signed char* s8ptr2 = top_blob.channel(q * 4 + 2);
                signed char* s8ptr3 = top_blob.channel(q * 4 + 3);

                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;

                quantize_pack4to1_bf16s(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h);
            }
        }
#endif // __ARM_NEON
        if (elempack == out_elempack)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* ptr = bottom_blob.channel(q);
                signed char* s8ptr = top_blob.channel(q);

                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;

                quantize_bf16s(ptr, s8ptr, scale_data_q, w * h, elempack);
            }
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/quantize_arm.h
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_QUANTIZE_ARM_H
#define LAYER_QUANTIZE_ARM_H

#include "quantize.h"

namespace ncnn {

// ARM implementation of the Quantize layer (float -> int8).
class Quantize_arm : public Quantize
{
public:
    // Sets the layer capability flags according to the compiled-in features.
    Quantize_arm();

    // Quantizes bottom_blob into top_blob. 16-bit inputs are forwarded to the
    // fp16 / bf16 variants below when the matching storage option is enabled;
    // other inputs are processed as fp32.
    // Returns 0 on success, -100 when top_blob could not be allocated.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 input paths: forward() picks forward_fp16sa when
    // opt.use_fp16_arithmetic is set and forward_fp16s otherwise.
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 input path, used by forward() when opt.use_bf16_storage is set.
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_QUANTIZE_ARM_H


================================================
FILE: src/layer/arm/quantize_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "quantize_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
static void quantize_fp16s(const __fp16* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack)
{
    const int scale_data_size = scale_data.w;
    const int size = elemcount * elempack;

    // NCNN_LOGE("quantize_fp16s %d   %d %d", scale_data_size, elemcount, elempack);

    float scale = scale_data[0];
    float32x4_t _scale = vdupq_n_f32(scale);
    if (scale_data_size > 1)
    {
        if (elempack == 4)
        {
            _scale = vld1q_f32((const float*)scale_data);
        }
    }

    int i = 0;
    for (; i + 15 < size; i += 16)
    {
        float16x8_t _v01 = vld1q_f16(ptr);
        float16x8_t _v23 = vld1q_f16(ptr + 8);
        float32x4_t _v0 = vcvt_f32_f16(vget_low_f16(_v01));
        float32x4_t _v1 = vcvt_f32_f16(vget_high_f16(_v01));
        float32x4_t _v2 = vcvt_f32_f16(vget_low_f16(_v23));
        float32x4_t _v3 = vcvt_f32_f16(vget_high_f16(_v23));
        _v0 = vmulq_f32(_v0, _scale);
        _v1 = vmulq_f32(_v1, _scale);
        _v2 = vmulq_f32(_v2, _scale);
        _v3 = vmulq_f32(_v3, _scale);
        vst1q_s8(s8ptr, vcombine_s8(float2int8(_v0, _v1), float2int8(_v2, _v3)));
        ptr += 16;
        s8ptr += 16;
    }
    for (; i + 7 < size; i += 8)
    {
        float16x8_t _v01 = vld1q_f16(ptr);
        float32x4_t _v0 = vcvt_f32_f16(vget_low_f16(_v01));
        float32x4_t _v1 = vcvt_f32_f16(vget_high_f16(_v01));
        _v0 = vmulq_f32(_v0, _scale);
        _v1 = vmulq_f32(_v1, _scale);
        vst1_s8(s8ptr, float2int8(_v0, _v1));
        ptr += 8;
        s8ptr += 8;
    }
    for (; i + 3 < size; i += 4)
    {
        float32x4_t _v = vcvt_f32_f16(vld1_f16(ptr));
        _v = vmulq_f32(_v, _scale);
        int8x8_t v = float2int8(_v, _v);
        s8ptr[0] = vget_lane_s8(v, 0);
        s8ptr[1] = vget_lane_s8(v, 1);
        s8ptr[2] = vget_lane_s8(v, 2);
        s8ptr[3] = vget_lane_s8(v, 3);
        ptr += 4;
        s8ptr += 4;
    }
    for (; i < size; i++)
    {
        float v = (float)(*ptr) * scale;
        *s8ptr = float2int8(v);
        ptr++;
        s8ptr++;
    }
}

static void quantize_pack4to8_fp16s(const __fp16* ptr0, const __fp16* ptr1, signed char* s8ptr, const Mat& scale_data, int elemcount)
{
    const int scale_data_size = scale_data.w;

    // NCNN_LOGE("quantize_pack4to8_fp16s %d   %d", scale_data_size, elemcount);

    float scale = scale_data[0];
    float32x4_t _scale0 = vdupq_n_f32(scale);
    float32x4_t _scale1 = _scale0;
    if (scale_data_size > 1)
    {
        _scale0 = vld1q_f32((const float*)scale_data);
        _scale1 = vld1q_f32((const float*)scale_data + 4);
    }

    int i = 0;
    for (; i + 1 < elemcount; i += 2)
    {
        float16x8_t _v02 = vld1q_f16(ptr0);
        float16x8_t _v13 = vld1q_f16(ptr1);
        float32x4_t _v0 = vcvt_f32_f16(vget_low_f16(_v02));
        float32x4_t _v1 = vcvt_f32_f16(vget_low_f16(_v13));
        float32x4_t _v2 = vcvt_f32_f16(vget_high_f16(_v02));
        float32x4_t _v3 = vcvt_f32_f16(vget_high_f16(_v13));
        _v0 = vmulq_f32(_v0, _scale0);
        _v1 = vmulq_f32(_v1, _scale1);
        _v2 = vmulq_f32(_v2, _scale0);
        _v3 = vmulq_f32(_v3, _scale1);
        vst1q_s8(s8ptr, vcombine_s8(float2int8(_v0, _v1), float2int8(_v2, _v3)));
        ptr0 += 8;
        ptr1 += 8;
        s8ptr += 16;
    }
    for (; i < elemcount; i++)
    {
        float32x4_t _v0 = vcvt_f32_f16(vld1_f16(ptr0));
        float32x4_t _v1 = vcvt_f32_f16(vld1_f16(ptr1));
        _v0 = vmulq_f32(_v0, _scale0);
        _v1 = vmulq_f32(_v1, _scale1);
        vst1_s8(s8ptr, float2int8(_v0, _v1));
        ptr0 += 4;
        ptr1 += 4;
        s8ptr += 8;
    }
}

// Quantize one pack4 fp16 stream into four separate planar int8 streams.
//
//   ptr              elemcount groups of 4 fp16 values (lane 0..3 interleaved)
//   s8ptr0..s8ptr3   destination for lane 0, 1, 2 and 3 respectively; each
//                    receives elemcount int8 values
//   scale_data       a single scale for all lanes when scale_data.w == 1,
//                    otherwise 4 per-lane scales from its first 4 floats
//   elemcount        number of pack4 groups in ptr
//
// Values are widened to fp32, multiplied by the scale and narrowed with
// float2int8().
static void quantize_pack4to1_fp16s(const __fp16* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount)
{
    float32x4_t _scale = vdupq_n_f32(scale_data[0]);
    if (scale_data.w > 1)
    {
        _scale = vld1q_f32((const float*)scale_data);
    }

    int remain = elemcount;

    // main loop: 8 groups (32 fp16 values) in, 8 bytes out per destination
    for (; remain >= 8; remain -= 8)
    {
        const float16x8_t _h01 = vld1q_f16(ptr);
        const float16x8_t _h23 = vld1q_f16(ptr + 8);
        const float16x8_t _h45 = vld1q_f16(ptr + 16);
        const float16x8_t _h67 = vld1q_f16(ptr + 24);

        // one fp32 vector per group, already scaled
        const float32x4_t _g0 = vmulq_f32(vcvt_f32_f16(vget_low_f16(_h01)), _scale);
        const float32x4_t _g1 = vmulq_f32(vcvt_f32_f16(vget_high_f16(_h01)), _scale);
        const float32x4_t _g2 = vmulq_f32(vcvt_f32_f16(vget_low_f16(_h23)), _scale);
        const float32x4_t _g3 = vmulq_f32(vcvt_f32_f16(vget_high_f16(_h23)), _scale);
        const float32x4_t _g4 = vmulq_f32(vcvt_f32_f16(vget_low_f16(_h45)), _scale);
        const float32x4_t _g5 = vmulq_f32(vcvt_f32_f16(vget_high_f16(_h45)), _scale);
        const float32x4_t _g6 = vmulq_f32(vcvt_f32_f16(vget_low_f16(_h67)), _scale);
        const float32x4_t _g7 = vmulq_f32(vcvt_f32_f16(vget_high_f16(_h67)), _scale);

        // 32 interleaved int8 values: groups 0-3 and groups 4-7
        const int8x16_t _q0123 = vcombine_s8(float2int8(_g0, _g1), float2int8(_g2, _g3));
        const int8x16_t _q4567 = vcombine_s8(float2int8(_g4, _g5), float2int8(_g6, _g7));

        // first unzip separates even lanes (0,2) from odd lanes (1,3);
        // second unzip separates them fully:
        //   val[0] = lane 0 x8 | lane 1 x8,  val[1] = lane 2 x8 | lane 3 x8
        const int8x16x2_t _even_odd = vuzpq_s8(_q0123, _q4567);
        const int8x16x2_t _planar = vuzpq_s8(_even_odd.val[0], _even_odd.val[1]);

        vst1_s8(s8ptr0, vget_low_s8(_planar.val[0]));
        vst1_s8(s8ptr1, vget_high_s8(_planar.val[0]));
        vst1_s8(s8ptr2, vget_low_s8(_planar.val[1]));
        vst1_s8(s8ptr3, vget_high_s8(_planar.val[1]));

        ptr += 32;
        s8ptr0 += 8;
        s8ptr1 += 8;
        s8ptr2 += 8;
        s8ptr3 += 8;
    }

    // tail: one group at a time, scattering the four lanes by hand
    for (; remain > 0; remain--)
    {
        const float32x4_t _g = vmulq_f32(vcvt_f32_f16(vld1_f16(ptr)), _scale);
        // only the low 4 bytes of the result are consumed
        const int8x8_t _q = float2int8(_g, _g);

        *s8ptr0++ = vget_lane_s8(_q, 0);
        *s8ptr1++ = vget_lane_s8(_q, 1);
        *s8ptr2++ = vget_lane_s8(_q, 2);
        *s8ptr3++ = vget_lane_s8(_q, 3);

        ptr += 4;
    }
}

// Quantize an fp16-storage blob to int8, doing the arithmetic in fp32.
//
// The output pack size is 8 when opt.use_packing_layout is set and the
// unpacked extent of the packed axis (w, h or c times elempack) is a multiple
// of 8; otherwise it is 1. Work is dispatched to the helper matching the
// input/output pack pair:
//   elempack 4 -> 8           quantize_pack4to8_fp16s
//   elempack 4 -> 1           quantize_pack4to1_fp16s
//   elempack == out_elempack  quantize_fp16s
//
// Returns -100 if top_blob is empty after create(), 0 otherwise.
//
// NOTE(review): blobs whose dims is not 1, 2 or 3 fall through and return 0
// without touching top_blob. For dims 2/3, any pack pair not listed above
// (e.g. elempack 1 -> 8) allocates top_blob but writes nothing into it;
// presumably callers never produce those layouts - confirm.
int Quantize_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int elempack = bottom_blob.elempack;
    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;

    if (dims == 1)
    {
        const int total = w * elempack;
        const int out_elempack = (opt.use_packing_layout && total % 8 == 0) ? 8 : 1;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // split w into contiguous chunks, roughly one per thread
        const int chunk = std::max(1, w / opt.num_threads);
        const int nchunks = (w + chunk - 1) / chunk;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int k = 0; k < nchunks; k++)
        {
            const int begin = k * chunk;
            const int count = std::min(w - begin, chunk) * elempack;

            const __fp16* src = (const __fp16*)bottom_blob + begin * elempack;
            signed char* dst = (signed char*)top_blob + begin * elempack;

            // assert scale_data_size == 1
            // the data is treated as a flat array here, hence elempack 1
            quantize_fp16s(src, dst, scale_data, count, 1);
        }

        return 0;
    }

    if (dims == 2)
    {
        const int out_elempack = (opt.use_packing_layout && h * elempack % 8 == 0) ? 8 : 1;
        const int outh = h * elempack / out_elempack;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (elempack == 4 && out_elempack == 8)
        {
            // two pack4 input rows merge into one pack8 output row
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < outh; y++)
            {
                const __fp16* src_lo = bottom_blob.row<const __fp16>(y * 2);
                const __fp16* src_hi = bottom_blob.row<const __fp16>(y * 2 + 1);
                signed char* dst = top_blob.row<signed char>(y);

                const Mat row_scales = scale_data_size > 1 ? scale_data.range(y * out_elempack, out_elempack) : scale_data;

                quantize_pack4to8_fp16s(src_lo, src_hi, dst, row_scales, w);
            }
        }
        else if (elempack == 4 && out_elempack == 1)
        {
            // one pack4 input row fans out into four planar output rows
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const __fp16* src = bottom_blob.row<const __fp16>(y);
                signed char* dst0 = top_blob.row<signed char>(y * 4);
                signed char* dst1 = top_blob.row<signed char>(y * 4 + 1);
                signed char* dst2 = top_blob.row<signed char>(y * 4 + 2);
                signed char* dst3 = top_blob.row<signed char>(y * 4 + 3);

                const Mat row_scales = scale_data_size > 1 ? scale_data.range(y * elempack, elempack) : scale_data;

                quantize_pack4to1_fp16s(src, dst0, dst1, dst2, dst3, row_scales, w);
            }
        }
        else if (elempack == out_elempack)
        {
            // same packing on both sides: row maps to row
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const __fp16* src = bottom_blob.row<const __fp16>(y);
                signed char* dst = top_blob.row<signed char>(y);

                const Mat row_scales = scale_data_size > 1 ? scale_data.range(y * elempack, elempack) : scale_data;

                quantize_fp16s(src, dst, row_scales, w, elempack);
            }
        }

        return 0;
    }

    if (dims == 3)
    {
        const int out_elempack = (opt.use_packing_layout && channels * elempack % 8 == 0) ? 8 : 1;
        const int outc = channels * elempack / out_elempack;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int plane = w * h;

        if (elempack == 4 && out_elempack == 8)
        {
            // two pack4 input channels merge into one pack8 output channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int c = 0; c < outc; c++)
            {
                const __fp16* src_lo = bottom_blob.channel(c * 2);
                const __fp16* src_hi = bottom_blob.channel(c * 2 + 1);
                signed char* dst = top_blob.channel(c);

                const Mat ch_scales = scale_data_size > 1 ? scale_data.range(c * out_elempack, out_elempack) : scale_data;

                quantize_pack4to8_fp16s(src_lo, src_hi, dst, ch_scales, plane);
            }
        }
        else if (elempack == 4 && out_elempack == 1)
        {
            // one pack4 input channel fans out into four planar output channels
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int c = 0; c < channels; c++)
            {
                const __fp16* src = bottom_blob.channel(c);
                signed char* dst0 = top_blob.channel(c * 4);
                signed char* dst1 = top_blob.channel(c * 4 + 1);
                signed char* dst2 = top_blob.channel(c * 4 + 2);
                signed char* dst3 = top_blob.channel(c * 4 + 3);

                const Mat ch_scales = scale_data_size > 1 ? scale_data.range(c * elempack, elempack) : scale_data;

                quantize_pack4to1_fp16s(src, dst0, dst1, dst2, dst3, ch_scales, plane);
            }
        }
        else if (elempack == out_elempack)
        {
            // same packing on both sides: channel maps to channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int c = 0; c < channels; c++)
            {
                const __fp16* src = bottom_blob.channel(c);
                signed char* dst = top_blob.channel(c);

                const Mat ch_scales = scale_data_size > 1 ? scale_data.range(c * elempack, elempack) : scale_data;

                quantize_fp16s(src, dst, ch_scales, plane, elempack);
            }
        }

        return 0;
    }

    return 0;
}

// Quantize a contiguous run of fp16 values to int8 using fp16 arithmetic.
//
//   ptr         elemcount * elempack fp16 input values
//   s8ptr       receives the same number of int8 values
//   scale_data  scale_data[0] is used for every value unless scale_data.w > 1
//               and elempack is 8 or 4, in which case the first elempack
//               floats are converted to fp16 and applied per lane
//   elemcount   number of packed elements
//   elempack    lanes per packed element
//
// The scalar tail always multiplies by scale_data[0].
static void quantize_fp16sa(const __fp16* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack)
{
    const __fp16 scale = (__fp16)scale_data[0];

    float16x4_t _scale_lo = vdup_n_f16(scale);
    float16x4_t _scale_hi = _scale_lo;
    if (scale_data.w > 1)
    {
        const float* scales = (const float*)scale_data;
        if (elempack == 8)
        {
            _scale_lo = vcvt_f16_f32(vld1q_f32(scales));
            _scale_hi = vcvt_f16_f32(vld1q_f32(scales + 4));
        }
        else if (elempack == 4)
        {
            // the 4 scales repeat for both halves of an 8-wide vector
            _scale_lo = vcvt_f16_f32(vld1q_f32(scales));
            _scale_hi = _scale_lo;
        }
    }
    const float16x8_t _scale_full = vcombine_f16(_scale_lo, _scale_hi);

    int remain = elemcount * elempack;

    // 8 values per iteration
    for (; remain >= 8; remain -= 8)
    {
        const float16x8_t _scaled = vmulq_f16(vld1q_f16(ptr), _scale_full);
        vst1_s8(s8ptr, float2int8(_scaled));
        ptr += 8;
        s8ptr += 8;
    }

    // 4 values per iteration; only the low 4 result bytes are stored
    for (; remain >= 4; remain -= 4)
    {
        const float16x4_t _scaled = vmul_f16(vld1_f16(ptr), _scale_lo);
        const int8x8_t _q = float2int8(vcombine_f16(_scaled, _scaled));
        s8ptr[0] = vget_lane_s8(_q, 0);
        s8ptr[1] = vget_lane_s8(_q, 1);
        s8ptr[2] = vget_lane_s8(_q, 2);
        s8ptr[3] = vget_lane_s8(_q, 3);
        ptr += 4;
        s8ptr += 4;
    }

    // scalar leftovers
    for (; remain > 0; remain--)
    {
        __fp16 v = *ptr * scale;
        *s8ptr = float2int8(v);
        ptr++;
        s8ptr++;
    }
}

// Quantize one pack4 fp16 stream into four planar int8 streams using fp16
// arithmetic.
//
//   ptr              elemcount groups of 4 fp16 values (lane 0..3 interleaved)
//   s8ptr0..s8ptr3   destination for lane 0, 1, 2 and 3 respectively; each
//                    receives elemcount int8 values
//   scale_data       a single scale for all lanes when scale_data.w == 1,
//                    otherwise 4 per-lane scales from its first 4 floats,
//                    converted to fp16
//   elemcount        number of pack4 groups in ptr
static void quantize_pack4to1_fp16sa(const __fp16* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount)
{
    float16x4_t _scale4 = vdup_n_f16((__fp16)scale_data[0]);
    if (scale_data.w > 1)
    {
        _scale4 = vcvt_f16_f32(vld1q_f32((const float*)scale_data));
    }
    // the same 4 lane scales repeated, so one 8-wide multiply covers 2 groups
    const float16x8_t _scale8 = vcombine_f16(_scale4, _scale4);

    int remain = elemcount;

    // main loop: 8 groups (32 fp16 values) in, 8 bytes out per destination
    for (; remain >= 8; remain -= 8)
    {
        const float16x8_t _g01 = vmulq_f16(vld1q_f16(ptr), _scale8);
        const float16x8_t _g23 = vmulq_f16(vld1q_f16(ptr + 8), _scale8);
        const float16x8_t _g45 = vmulq_f16(vld1q_f16(ptr + 16), _scale8);
        const float16x8_t _g67 = vmulq_f16(vld1q_f16(ptr + 24), _scale8);

        // 32 interleaved int8 values: groups 0-3 and groups 4-7
        const int8x16_t _q0123 = vcombine_s8(float2int8(_g01), float2int8(_g23));
        const int8x16_t _q4567 = vcombine_s8(float2int8(_g45), float2int8(_g67));

        // first unzip separates even lanes (0,2) from odd lanes (1,3);
        // second unzip separates them fully:
        //   val[0] = lane 0 x8 | lane 1 x8,  val[1] = lane 2 x8 | lane 3 x8
        const int8x16x2_t _even_odd = vuzpq_s8(_q0123, _q4567);
        const int8x16x2_t _planar = vuzpq_s8(_even_odd.val[0], _even_odd.val[1]);

        vst1_s8(s8ptr0, vget_low_s8(_planar.val[0]));
        vst1_s8(s8ptr1, vget_high_s8(_planar.val[0]));
        vst1_s8(s8ptr2, vget_low_s8(_planar.val[1]));
        vst1_s8(s8ptr3, vget_high_s8(_planar.val[1]));

        ptr += 32;
        s8ptr0 += 8;
        s8ptr1 += 8;
        s8ptr2 += 8;
        s8ptr3 += 8;
    }

    // tail: one group at a time, scattering the four lanes by hand
    for (; remain > 0; remain--)
    {
        const float16x4_t _g = vmul_f16(vld1_f16(ptr), _scale4);
        // only the low 4 bytes of the result are consumed
        const int8x8_t _q = float2int8(vcombine_f16(_g, _g));

        *s8ptr0++ = vget_lane_s8(_q, 0);
        *s8ptr1++ = vget_lane_s8(_q, 1);
        *s8ptr2++ = vget_lane_s8(_q, 2);
        *s8ptr3++ = vget_lane_s8(_q, 3);

        ptr += 4;
    }
}

// Quantize an fp16-storage blob to int8, doing the arithmetic in fp16.
//
// The output pack size is 8 when opt.use_packing_layout is set and the
// unpacked extent of the packed axis (w, h or c times elempack) is a multiple
// of 8; otherwise it is 1. Work is dispatched to the helper matching the
// input/output pack pair:
//   elempack 4 -> 1           quantize_pack4to1_fp16sa
//   elempack == out_elempack  quantize_fp16sa
//
// Returns -100 if top_blob is empty after create(), 0 otherwise.
//
// NOTE(review): blobs whose dims is not 1, 2 or 3 fall through and return 0
// without touching top_blob. For dims 2/3, any pack pair not listed above
// (e.g. elempack 4 -> 8 or 1 -> 8) allocates top_blob but writes nothing into
// it; presumably callers never produce those layouts - confirm.
int Quantize_arm::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int elempack = bottom_blob.elempack;
    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;

    if (dims == 1)
    {
        const int total = w * elempack;
        const int out_elempack = (opt.use_packing_layout && total % 8 == 0) ? 8 : 1;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // split w into contiguous chunks, roughly one per thread
        const int chunk = std::max(1, w / opt.num_threads);
        const int nchunks = (w + chunk - 1) / chunk;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int k = 0; k < nchunks; k++)
        {
            const int begin = k * chunk;
            const int count = std::min(w - begin, chunk) * elempack;

            const __fp16* src = (const __fp16*)bottom_blob + begin * elempack;
            signed char* dst = (signed char*)top_blob + begin * elempack;

            // assert scale_data_size == 1
            // the data is treated as a flat array here, hence elempack 1
            quantize_fp16sa(src, dst, scale_data, count, 1);
        }

        return 0;
    }

    if (dims == 2)
    {
        const int out_elempack = (opt.use_packing_layout && h * elempack % 8 == 0) ? 8 : 1;
        const int outh = h * elempack / out_elempack;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (elempack == 4 && out_elempack == 1)
        {
            // one pack4 input row fans out into four planar output rows
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const __fp16* src = bottom_blob.row<const __fp16>(y);
                signed char* dst0 = top_blob.row<signed char>(y * 4);
                signed char* dst1 = top_blob.row<signed char>(y * 4 + 1);
                signed char* dst2 = top_blob.row<signed char>(y * 4 + 2);
                signed char* dst3 = top_blob.row<signed char>(y * 4 + 3);

                const Mat row_scales = scale_data_size > 1 ? scale_data.range(y * elempack, elempack) : scale_data;

                quantize_pack4to1_fp16sa(src, dst0, dst1, dst2, dst3, row_scales, w);
            }
        }
        else if (elempack == out_elempack)
        {
            // same packing on both sides: row maps to row
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const __fp16* src = bottom_blob.row<const __fp16>(y);
                signed char* dst = top_blob.row<signed char>(y);

                const Mat row_scales = scale_data_size > 1 ? scale_data.range(y * elempack, elempack) : scale_data;

                quantize_fp16sa(src, dst, row_scales, w, elempack);
            }
        }

        return 0;
    }

    if (dims == 3)
    {
        const int out_elempack = (opt.use_packing_layout && channels * elempack % 8 == 0) ? 8 : 1;
        const int outc = channels * elempack / out_elempack;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int plane = w * h;

        if (elempack == 4 && out_elempack == 1)
        {
            // one pack4 input channel fans out into four planar output channels
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int c = 0; c < channels; c++)
            {
                const __fp16* src = bottom_blob.channel(c);
                signed char* dst0 = top_blob.channel(c * 4);
                signed char* dst1 = top_blob.channel(c * 4 + 1);
                signed char* dst2 = top_blob.channel(c * 4 + 2);
                signed char* dst3 = top_blob.channel(c * 4 + 3);

                const Mat ch_scales = scale_data_size > 1 ? scale_data.range(c * elempack, elempack) : scale_data;

                quantize_pack4to1_fp16sa(src, dst0, dst1, dst2, dst3, ch_scales, plane);
            }
        }
        else if (elempack == out_elempack)
        {
            // same packing on both sides: channel maps to channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int c = 0; c < channels; c++)
            {
                const __fp16* src = bottom_blob.channel(c);
                signed char* dst = top_blob.channel(c);

                const Mat ch_scales = scale_data_size > 1 ? scale_data.range(c * elempack, elempack) : scale_data;

                quantize_fp16sa(src, dst, ch_scales, plane, elempack);
            }
        }

        return 0;
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/relu_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "relu_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

// Set the layout/storage capability flags of the ARM ReLU layer according to
// the compile-time configuration.
ReLU_arm::ReLU_arm()
{
#if __ARM_NEON
    // packed layouts are accepted whenever NEON is compiled in
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage follows whatever cpu_support_arm_asimdhp() reports at runtime
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82

#if NCNN_BF16
    // bf16 storage does not depend on NEON being available
    support_bf16_storage = true;
#endif // NCNN_BF16
}

// Apply ReLU (slope == 0) or leaky ReLU (slope != 0) in place.
//
// Dispatch, in order:
//   - 8-bit elements            -> forward_inplace_int8
//   - 16-bit elements, fp16     -> forward_inplace_fp16s (NCNN_ARM82 builds,
//                                  when both the layer and opt enable fp16 storage)
//   - 16-bit elements, bf16     -> forward_inplace_bf16s (NCNN_BF16 builds,
//                                  when opt enables bf16 storage)
//   - otherwise the blob is processed here as float.
//
// Each channel is handled as a flat run of w * h * d * elempack floats, in
// blocks of 16, 8 and 4 with NEON and then one value at a time.
//
// Returns 0 from the float path; the other paths return whatever the
// delegated function returns.
int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int elembits = bottom_top_blob.elembits();

    if (elembits == 8)
        return forward_inplace_int8(bottom_top_blob, opt);

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_inplace_fp16s(bottom_top_blob, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    // number of floats per channel, packing lanes included
    int size = w * h * d * elempack;

    if (slope == 0.f)
    {
        // plain ReLU: x = max(x, 0)
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            int i = 0;
#if __ARM_NEON
            float32x4_t _zero = vdupq_n_f32(0.f);
            // 16 floats per iteration
            for (; i + 15 < size; i += 16)
            {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                // load 4 vectors, max each against _zero (%2), store back;
                // the store post-increments ptr (%0) by 64 bytes
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]   \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
                    "fmax   v0.4s, v0.4s, %2.4s     \n"
                    "fmax   v1.4s, v1.4s, %2.4s     \n"
                    "fmax   v2.4s, v2.4s, %2.4s     \n"
                    "fmax   v3.4s, v3.4s, %2.4s     \n"
                    "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                    : "=r"(ptr) // %0
                    : "0"(ptr),
                    "w"(_zero) // %2
                    : "memory", "v0", "v1", "v2", "v3");
#else  // __aarch64__
                // same operation for 32-bit ARM; vstm with writeback advances ptr (%0)
                asm volatile(
                    "pld        [%0, #512]      \n"
                    "vldm       %0, {d0-d7}     \n"
                    "vmax.f32   q0, q0, %q2     \n"
                    "vmax.f32   q1, q1, %q2     \n"
                    "vmax.f32   q2, q2, %q2     \n"
                    "vmax.f32   q3, q3, %q2     \n"
                    "vstm       %0!, {d0-d7}    \n"
                    : "=r"(ptr) // %0
                    : "0"(ptr),
                    "w"(_zero) // %2
                    : "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                // intrinsics version of the 16-float block
                float32x4_t _p0 = vld1q_f32(ptr);
                float32x4_t _p1 = vld1q_f32(ptr + 4);
                float32x4_t _p2 = vld1q_f32(ptr + 8);
                float32x4_t _p3 = vld1q_f32(ptr + 12);
                _p0 = vmaxq_f32(_p0, _zero);
                _p1 = vmaxq_f32(_p1, _zero);
                _p2 = vmaxq_f32(_p2, _zero);
                _p3 = vmaxq_f32(_p3, _zero);
                vst1q_f32(ptr, _p0);
                vst1q_f32(ptr + 4, _p1);
                vst1q_f32(ptr + 8, _p2);
                vst1q_f32(ptr + 12, _p3);
                ptr += 16;
#endif // NCNN_GNU_INLINE_ASM
            }
            // 8 floats per iteration
            for (; i + 7 < size; i += 8)
            {
                float32x4_t _p0 = vld1q_f32(ptr);
                float32x4_t _p1 = vld1q_f32(ptr + 4);
                _p0 = vmaxq_f32(_p0, _zero);
                _p1 = vmaxq_f32(_p1, _zero);
                vst1q_f32(ptr, _p0);
                vst1q_f32(ptr + 4, _p1);
                ptr += 8;
            }
            // 4 floats per iteration
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _ptr = vld1q_f32(ptr);
                _ptr = vmaxq_f32(_ptr, _zero);
                vst1q_f32(ptr, _ptr);
                ptr += 4;
            }
#endif // __ARM_NEON
            // scalar remainder (and the whole channel when NEON is not compiled in)
            for (; i < size; i++)
            {
                *ptr = std::max(*ptr, 0.f);
                ptr++;
            }
        }
    }
    else
    {
        // leaky ReLU: x = x * slope where x is not positive, x otherwise.
        // The vector paths select on x <= 0 while the scalar tail tests x < 0;
        // the two only differ for x == 0, where the result is a zero either way.
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            int i = 0;
#if __ARM_NEON
            float32x4_t _zero = vdupq_n_f32(0.f);
            float32x4_t _slope = vdupq_n_f32(slope);
            // 16 floats per iteration
            for (; i + 15 < size; i += 16)
            {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                // v4-v7: mask of lanes <= 0 (compare against immediate zero)
                // v8-v11: x * _slope (%2)
                // bit inserts the scaled value into v0-v3 where the mask is set;
                // the store post-increments ptr (%0) by 64 bytes
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]   \n"
                    "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
                    "fcmle  v4.4s, v0.4s, #0        \n"
                    "fcmle  v5.4s, v1.4s, #0        \n"
                    "fcmle  v6.4s, v2.4s, #0        \n"
                    "fcmle  v7.4s, v3.4s, #0        \n"
                    "fmul   v8.4s, v0.4s, %2.4s     \n"
                    "fmul   v9.4s, v1.4s, %2.4s     \n"
                    "fmul   v10.4s, v2.4s, %2.4s    \n"
                    "fmul   v11.4s, v3.4s, %2.4s    \n"
                    "bit    v0.16b, v8.16b, v4.16b  \n"
                    "bit    v1.16b, v9.16b, v5.16b  \n"
                    "bit    v2.16b, v10.16b, v6.16b \n"
                    "bit    v3.16b, v11.16b, v7.16b \n"
                    "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                    : "=r"(ptr) // %0
                    : "0"(ptr),
                    "w"(_slope) // %2
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
#else  // __aarch64__
                // 32-bit ARM variant: the compare is against the _zero register (%2)
                // and the multiply uses _slope (%3)
                asm volatile(
                    "pld        [%0, #512]      \n"
                    "vldm       %0, {d0-d7}     \n"
                    "vcle.f32   q4, q0, %q2     \n"
                    "vcle.f32   q5, q1, %q2     \n"
                    "vcle.f32   q6, q2, %q2     \n"
                    "vcle.f32   q7, q3, %q2     \n"
                    "vmul.f32   q8, q0, %q3     \n"
                    "vmul.f32   q9, q1, %q3     \n"
                    "vmul.f32   q10, q2, %q3    \n"
                    "vmul.f32   q11, q3, %q3    \n"
                    "vbit.32    q0, q8, q4      \n"
                    "vbit.32    q1, q9, q5      \n"
                    "vbit.32    q2, q10, q6     \n"
                    "vbit.32    q3, q11, q7     \n"
                    "vstm       %0!, {d0-d7}    \n"
                    : "=r"(ptr) // %0
                    : "0"(ptr),
                    "w"(_zero), // %2
                    "w"(_slope) // %3
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                // intrinsics version: build the <= 0 mask, scale, then select
                float32x4_t _p0 = vld1q_f32(ptr);
                float32x4_t _p1 = vld1q_f32(ptr + 4);
                float32x4_t _p2 = vld1q_f32(ptr + 8);
                float32x4_t _p3 = vld1q_f32(ptr + 12);
                uint32x4_t _lemask0 = vcleq_f32(_p0, _zero);
                uint32x4_t _lemask1 = vcleq_f32(_p1, _zero);
                uint32x4_t _lemask2 = vcleq_f32(_p2, _zero);
                uint32x4_t _lemask3 = vcleq_f32(_p3, _zero);
                float32x4_t _ps0 = vmulq_f32(_p0, _slope);
                float32x4_t _ps1 = vmulq_f32(_p1, _slope);
                float32x4_t _ps2 = vmulq_f32(_p2, _slope);
                float32x4_t _ps3 = vmulq_f32(_p3, _slope);
                _p0 = vbslq_f32(_lemask0, _ps0, _p0);
                _p1 = vbslq_f32(_lemask1, _ps1, _p1);
                _p2 = vbslq_f32(_lemask2, _ps2, _p2);
                _p3 = vbslq_f32(_lemask3, _ps3, _p3);
                vst1q_f32(ptr, _p0);
                vst1q_f32(ptr + 4, _p1);
                vst1q_f32(ptr + 8, _p2);
                vst1q_f32(ptr + 12, _p3);
                ptr += 16;
#endif // NCNN_GNU_INLINE_ASM
            }
            // 8 floats per iteration
            for (; i + 7 < size; i += 8)
            {
                float32x4_t _p0 = vld1q_f32(ptr);
                float32x4_t _p1 = vld1q_f32(ptr + 4);
                uint32x4_t _lemask0 = vcleq_f32(_p0, _zero);
                uint32x4_t _lemask1 = vcleq_f32(_p1, _zero);
                float32x4_t _ps0 = vmulq_f32(_p0, _slope);
                float32x4_t _ps1 = vmulq_f32(_p1, _slope);
                _p0 = vbslq_f32(_lemask0, _ps0, _p0);
                _p1 = vbslq_f32(_lemask1, _ps1, _p1);
                vst1q_f32(ptr, _p0);
                vst1q_f32(ptr + 4, _p1);
                ptr += 8;
            }
            // 4 floats per iteration
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _p = vld1q_f32(ptr);
                uint32x4_t _lemask = vcleq_f32(_p, _zero);
                float32x4_t _ps = vmulq_f32(_p, _slope);
                _p = vbslq_f32(_lemask, _ps, _p);
                vst1q_f32(ptr, _p);
                ptr += 4;
            }
#endif // __ARM_NEON
            // scalar remainder (and the whole channel when NEON is not compiled in)
            for (; i < size; i++)
            {
                if (*ptr < 0)
                    *ptr *= slope;
                ptr++;
            }
        }
    }

    return 0;
}

#if NCNN_BF16
int ReLU_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    int size = w * h * d * elempack;

    if (slope == 0.f)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned short* ptr = bottom_top_blob.channel(q);

            int i = 0;
#if __ARM_NEON
            float32x4_t _zero = vdupq_n_f32(0.f);
            for (; i + 15 < size; i += 16)
            {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%0, #256]   \n"
                    "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%0] \n"
                    "shll   v0.4s, v0.4h, #16       \n"
                    "shll   v1.4s, v1.4h, #16       \n"
                    "shll   v2.4s, v2.4h, #16       \n"
                    "shll   v3.4s, v3.4h, #16       \n"
                    "fmax   v0.4s, v0.4s, %2.4s     \n"
                    "fmax   v1.4s, v1.4s, %2.4s     \n"
                    "fmax   v2.4s, v2.4s, %2.4s     \n"
                    "fmax   v3.4s, v3.4s, %2.4s     \n"
                    "shrn   v0.4h, v0.4s, #16       \n"
                    "shrn   v1.4h, v1.4s, #16       \n"
                    "shrn   v2.4h, v2.4s, #16       \n"
                    "shrn   v3.4h, v3.4s, #16       \n"
                    "st1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%0], #32 \n"
                    : "=r"(ptr) // %0
                    : "0"(ptr),
                    "w"(_zero) // %2
                    : "memory", "v0", "v1", "v2", "v3");
#else  // __aarch64__
                asm volatile(
                    "pld        [%0, #256]      \n"
                    "vld1.u16   {d4-d7}, [%0]   \n"
                    "vshll.u16  q0, d4, #16     \n"
                    "vshll.u16  q1, d5, #16     \n"
                    "vshll.u16  q2, d6, #16     \n"
                    "vshll.u16  q3, d7, #16     \n"
                    "vmax.f32   q0, q0, %q2     \n"
                    "vmax.f32   q1, q1, %q2     \n"
                    "vmax.f32   q2, q2, %q2     \n"
                    "vmax.f32   q3, q3, %q2     \n"
                    "vshrn.u32  d0, q0, #16     \n"
                    "vshrn.u32  d1, q1, #16     \n"
                    "vshrn.u32  d2, q2, #16     \n"
                    "vshrn.u32  d3, q3, #16     \n"
                    "vst1.u16   {d0-d3}, [%0]!  \n"
                    : "=r"(ptr) // %0
                    : "0"(ptr),
                    "w"(_zero) // %2
                    : "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                uint16x8_t _p = vld1q_u16(ptr);
                uint16x8_t _q = vld1q_u16(ptr + 8);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                _p0 = vmaxq_f32(_p0, _zero);
                _p1 = vmaxq_f32(_p1, _zero);
                _p2 = vmaxq_f32(_p2, _zero);
                _p3 = vmaxq_f32(_p3, _zero);
                _p = vcombine_u16(float2bfloat(_p0), float2bfloat(_p1));
                _q = vcombine_u16(float2bfloat(_p2), float2bfloat(_p3));
                vst1q_u16(ptr, _p);
                vst1q_u16(ptr + 8, _q);
                ptr += 16;
#endif // NCNN_GNU_INLINE_ASM
            }
            for (; i + 7 < size; i += 8)
            {
                uint16x8_t _p = vld1q_u16(ptr);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                _p0 = vmaxq_f32(_p0, _zero);
                _p1 = vmaxq_f32(_p1, _zero);
                _p = vcombine_u16(float2bfloat(_p0), float2bfloat(_p1));
                vst1q_u16(ptr, _p);
                ptr += 8;
            }
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _p = bfloat2float(vld1_u16(ptr));
                _p = vmaxq_f32(_p, _zero);
                vst1_u16(ptr, float2bfloat(_p));
                ptr += 4;
            }
#endif // __ARM_NEON
            for (; i < size; i++)
            {
                float v = bfloat16_to_float32(ptr[0]);
                if (v < 0.f)
                    ptr[0] = float32_to_bfloat16(0.f);
                ptr += 1;
            }
        }
    }
    else
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned short* ptr = bottom_top_blob.channel(q);

            int i = 0;
#if __ARM_NEON
            float32x4_t _zero = vdupq_n_f32(0.f);
            float32x4_t _slope = vdupq_n_f32(slope);
            for (; i + 15 < size; i += 16)
            {
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
                asm volatile(
                    "prfm   pldl1keep, [%0, #256]   \n"
                    "ld1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%0] \n"
                    "shll   v0.4s, v0.4h, #16       \n"
                    "shll   v1.4s, v1.4h, #16       \n"
                    "shll   v2.4s, v2.4h, #16       \n"
                    "shll   v3.4s, v3.4h, #16       \n"
                    "fcmle  v4.4s, v0.4s, #0        \n"
                    "fcmle  v5.4s, v1.4s, #0        \n"
                    "fcmle  v6.4s, v2.4s, #0        \n"
                    "fcmle  v7.4s, v3.4s, #0        \n"
                    "fmul   v8.4s, v0.4s, %2.4s     \n"
                    "fmul   v9.4s, v1.4s, %2.4s     \n"
                    "fmul   v10.4s, v2.4s, %2.4s    \n"
                    "fmul   v11.4s, v3.4s, %2.4s    \n"
                    "bit    v0.16b, v8.16b, v4.16b  \n"
                    "bit    v1.16b, v9.16b, v5.16b  \n"
                    "bit    v2.16b, v10.16b, v6.16b \n"
                    "bit    v3.16b, v11.16b, v7.16b \n"
                    "shrn   v0.4h, v0.4s, #16       \n"
                    "shrn   v1.4h, v1.4s, #16       \n"
                    "shrn   v2.4h, v2.4s, #16       \n"
                    "shrn   v3.4h, v3.4s, #16       \n"
                    "st1    {v0.4h, v1.4h, v2.4h, v3.4h}, [%0], #32 \n"
                    : "=r"(ptr) // %0
                    : "0"(ptr),
                    "w"(_slope) // %2
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
#else  // __aarch64__
                asm volatile(
                    "pld        [%0, #256]      \n"
                    "vld1.u16   {d4-d7}, [%0]   \n"
                    "vshll.u16  q0, d4, #16     \n"
                    "vshll.u16  q1, d5, #16     \n"
                    "vshll.u16  q2, d6, #16     \n"
                    "vshll.u16  q3, d7, #16     \n"
                    "vcle.f32   q4, q0, %q2     \n"
                    "vcle.f32   q5, q1, %q2     \n"
                    "vcle.f32   q6, q2, %q2     \n"
                    "vcle.f32   q7, q3, %q2     \n"
                    "vmul.f32   q8, q0, %q3     \n"
                    "vmul.f32   q9, q1, %q3     \n"
                    "vmul.f32   q10, q2, %q3    \n"
                    "vmul.f32   q11, q3, %q3    \n"
                    "vbit.32    q0, q8, q4      \n"
                    "vbit.32    q1, q9, q5      \n"
                    "vbit.32    q2, q10, q6     \n"
                    "vbit.32    q3, q11, q7     \n"
                    "vshrn.u32  d0, q0, #16     \n"
                    "vshrn.u32  d1, q1, #16     \n"
                    "vshrn.u32  d2, q2, #16     \n"
                    "vshrn.u32  d3, q3, #16     \n"
                    "vst1.u16   {d0-d3}, [%0]!  \n"
                    : "=r"(ptr) // %0
                    : "0"(ptr),
                    "w"(_zero), // %2
                    "w"(_slope) // %3
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
                uint16x8_t _p = vld1q_u16(ptr);
                uint16x8_t _q = vld1q_u16(ptr + 8);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
                float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
                uint32x4_t _lemask0 = vcleq_f32(_p0, _zero);
                uint32x4_t _lemask1 = vcleq_f32(_p1, _zero);
                uint32x4_t _lemask2 = vcleq_f32(_p2, _zero);
                uint32x4_t _lemask3 = vcleq_f32(_p3, _zero);
                float32x4_t _ps0 = vmulq_f32(_p0, _slope);
                float32x4_t _ps1 = vmulq_f32(_p1, _slope);
                float32x4_t _ps2 = vmulq_f32(_p2, _slope);
                float32x4_t _ps3 = vmulq_f32(_p3, _slope);
                _p0 = vbslq_f32(_lemask0, _ps0, _p0);
                _p1 = vbslq_f32(_lemask1, _ps1, _p1);
                _p2 = vbslq_f32(_lemask2, _ps2, _p2);
                _p3 = vbslq_f32(_lemask3, _ps3, _p3);
                _p = vcombine_u16(float2bfloat(_p0), float2bfloat(_p1));
                _q = vcombine_u16(float2bfloat(_p2), float2bfloat(_p3));
                vst1q_u16(ptr, _p);
                vst1q_u16(ptr + 8, _q);
                ptr += 16;
#endif // NCNN_GNU_INLINE_ASM
            }
            for (; i + 7 < size; i += 8)
            {
                uint16x8_t _p = vld1q_u16(ptr);
                float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
                float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
                uint32x4_t _lemask0 = vcleq_f32(_p0, _zero);
                uint32x4_t _lemask1 = vcleq_f32(_p1, _zero);
                float32x4_t _ps0 = vmulq_f32(_p0, _slope);
                float32x4_t _ps1 = vmulq_f32(_p1, _slope);
                _p0 = vbslq_f32(_lemask0, _ps0, _p0);
                _p1 = vbslq_f32(_lemask1, _ps1, _p1);
                _p = vcombine_u16(float2bfloat(_p0), float2bfloat(_p1));
                vst1q_u16(ptr, _p);
                ptr += 8;
            }
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _p = bfloat2float(vld1_u16(ptr));
                uint32x4_t _lemask = vcleq_f32(_p, _zero);
                float32x4_t _ps = vmulq_f32(_p, _slope);
                _p = vbslq_f32(_lemask, _ps, _p);
                vst1_u16(ptr, float2bfloat(_p));
                ptr += 4;
            }
#endif // __ARM_NEON
            for (; i < size; i++)
            {
                float v = bfloat16_to_float32(ptr[0]);
                if (v < 0.f)
                    ptr[0] = float32_to_bfloat16(v * slope);
                ptr += 1;
            }
        }
    }

    return 0;
}
#endif // NCNN_BF16

// In-place ReLU on a signed 8-bit blob.
//
// Negative values are replaced by zero when slope == 0. A non-zero slope
// (leaky ReLU) is not implemented for int8 yet: the blob is left untouched
// and 0 is still returned.
//
// bottom_top_blob  blob that is read and overwritten
// opt              supplies the OpenMP thread count
// returns          always 0
int ReLU_arm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int elempack = bottom_top_blob.elempack;
    // number of (possibly packed) elements stored in one channel
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

#if __ARM_NEON
    if (elempack == 8)
    {
        if (slope != 0.f)
        {
            // TODO leakyrelu
            return 0;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            signed char* p = bottom_top_blob.channel(q);

            const int8x16_t _zero16 = vdupq_n_s8(0);
            const int8x8_t _zero8 = vdup_n_s8(0);

            int remain = size;

            // two pack8 elements (16 bytes) per iteration
            while (remain >= 2)
            {
                vst1q_s8(p, vmaxq_s8(vld1q_s8(p), _zero16));
                p += 16;
                remain -= 2;
            }

            // at most one pack8 element (8 bytes) is left
            while (remain > 0)
            {
                vst1_s8(p, vmax_s8(vld1_s8(p), _zero8));
                p += 8;
                remain -= 1;
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    if (slope != 0.f)
    {
        // TODO leakyrelu
        return 0;
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        signed char* p = bottom_top_blob.channel(q);

        int remain = size;
#if __ARM_NEON
        const int8x16_t _zero16 = vdupq_n_s8(0);

        // 16 values per iteration
        while (remain >= 16)
        {
            vst1q_s8(p, vmaxq_s8(vld1q_s8(p), _zero16));
            p += 16;
            remain -= 16;
        }
#endif // __ARM_NEON

        // scalar tail
        while (remain > 0)
        {
            if (p[0] < 0)
                p[0] = 0;

            p += 1;
            remain -= 1;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/relu_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_RELU_ARM_H
#define LAYER_RELU_ARM_H

#include "relu.h"

namespace ncnn {

// ARM specialisation of the ReLU layer.
// The storage-specific helpers below are implemented in relu_arm.cpp
// (bf16 / int8) and relu_arm_asimdhp.cpp (fp16).
class ReLU_arm : public ReLU
{
public:
    ReLU_arm();

    // Applies the activation to bottom_top_blob in place.
    // NOTE(review): presumably dispatches to the helpers below depending on the
    // blob element type - the definition is not part of this header.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 storage + arithmetic variant; its definition is guarded by
    // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC in relu_arm_asimdhp.cpp.
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 storage variant (values are widened to fp32 for the computation).
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
    // int8 variant; only slope == 0 is implemented, a non-zero slope is a no-op.
    int forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_RELU_ARM_H


================================================
FILE: src/layer/arm/relu_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "relu_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// In-place ReLU / leaky ReLU on a blob stored as __fp16, computed with
// half-precision vector arithmetic.
//
// Each channel is processed as a flat array of w * h * d * elempack halves, so
// the same code path serves every element packing.
//   slope == 0 : x = max(x, 0)
//   slope != 0 : x = x * slope for the lanes that are <= 0, unchanged otherwise
//
// bottom_top_blob  blob that is read and overwritten
// opt              supplies the OpenMP thread count
// returns          always 0
int ReLU_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    int elempack = bottom_top_blob.elempack;
    // number of scalar fp16 values in one channel
    int size = w * h * d * elempack;

    if (slope == 0.f)
    {
        // plain relu
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* ptr = bottom_top_blob.channel(q);

            float16x8_t _zero = vdupq_n_f16((__fp16)0.f);

            int i = 0;
            // 32 halves (4 q-registers) per iteration
            for (; i + 31 < size; i += 32)
            {
#if NCNN_GNU_INLINE_ASM
                // load 64 bytes, fmax against zero, store back;
                // the store post-increments ptr (%0) by 64 bytes
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]   \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0] \n"
                    "fmax   v0.8h, v0.8h, %2.8h     \n"
                    "fmax   v1.8h, v1.8h, %2.8h     \n"
                    "fmax   v2.8h, v2.8h, %2.8h     \n"
                    "fmax   v3.8h, v3.8h, %2.8h     \n"
                    "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
                    : "=r"(ptr) // %0
                    : "0"(ptr),
                    "w"(_zero) // %2
                    : "memory", "v0", "v1", "v2", "v3");
#else  // NCNN_GNU_INLINE_ASM
                // intrinsics version of the asm block above
                float16x8_t _p0 = vld1q_f16(ptr);
                float16x8_t _p1 = vld1q_f16(ptr + 8);
                float16x8_t _p2 = vld1q_f16(ptr + 16);
                float16x8_t _p3 = vld1q_f16(ptr + 24);
                _p0 = vmaxq_f16(_p0, _zero);
                _p1 = vmaxq_f16(_p1, _zero);
                _p2 = vmaxq_f16(_p2, _zero);
                _p3 = vmaxq_f16(_p3, _zero);
                vst1q_f16(ptr, _p0);
                vst1q_f16(ptr + 8, _p1);
                vst1q_f16(ptr + 16, _p2);
                vst1q_f16(ptr + 24, _p3);
                ptr += 32;
#endif // NCNN_GNU_INLINE_ASM
            }
            // 16 halves per iteration
            for (; i + 15 < size; i += 16)
            {
                float16x8_t _p0 = vld1q_f16(ptr);
                float16x8_t _p1 = vld1q_f16(ptr + 8);
                _p0 = vmaxq_f16(_p0, _zero);
                _p1 = vmaxq_f16(_p1, _zero);
                vst1q_f16(ptr, _p0);
                vst1q_f16(ptr + 8, _p1);
                ptr += 16;
            }
            // 8 halves per iteration
            for (; i + 7 < size; i += 8)
            {
                float16x8_t _p = vld1q_f16(ptr);
                _p = vmaxq_f16(_p, _zero);
                vst1q_f16(ptr, _p);
                ptr += 8;
            }
            // 4 halves per iteration
            for (; i + 3 < size; i += 4)
            {
                float16x4_t _p = vld1_f16(ptr);
                _p = vmax_f16(_p, vget_low_f16(_zero));
                vst1_f16(ptr, _p);
                ptr += 4;
            }
            // scalar tail
            for (; i < size; i++)
            {
                __fp16 v = ptr[0];
                if (v < (__fp16)0.f)
                    ptr[0] = (__fp16)0.f;

                ptr += 1;
            }
        }
    }
    else
    {
        // leaky relu
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* ptr = bottom_top_blob.channel(q);

            float16x8_t _zero = vdupq_n_f16((__fp16)0.f);
            // slope is narrowed to half precision once per channel
            float16x8_t _slope = vdupq_n_f16((__fp16)slope);

            int i = 0;
            // 32 halves (4 q-registers) per iteration
            for (; i + 31 < size; i += 32)
            {
#if NCNN_GNU_INLINE_ASM
                // v4-v7  : mask of lanes <= 0
                // v8-v11 : x * slope
                // bit inserts the scaled value into the masked lanes;
                // the store post-increments ptr (%0) by 64 bytes
                asm volatile(
                    "prfm   pldl1keep, [%0, #512]   \n"
                    "ld1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0] \n"
                    "fcmle  v4.8h, v0.8h, #0        \n"
                    "fcmle  v5.8h, v1.8h, #0        \n"
                    "fcmle  v6.8h, v2.8h, #0        \n"
                    "fcmle  v7.8h, v3.8h, #0        \n"
                    "fmul   v8.8h, v0.8h, %2.8h     \n"
                    "fmul   v9.8h, v1.8h, %2.8h     \n"
                    "fmul   v10.8h, v2.8h, %2.8h    \n"
                    "fmul   v11.8h, v3.8h, %2.8h    \n"
                    "bit    v0.16b, v8.16b, v4.16b  \n"
                    "bit    v1.16b, v9.16b, v5.16b  \n"
                    "bit    v2.16b, v10.16b, v6.16b \n"
                    "bit    v3.16b, v11.16b, v7.16b \n"
                    "st1    {v0.8h, v1.8h, v2.8h, v3.8h}, [%0], #64 \n"
                    : "=r"(ptr) // %0
                    : "0"(ptr),
                    "w"(_slope) // %2
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
#else  // NCNN_GNU_INLINE_ASM
                // intrinsics version of the asm block above
                float16x8_t _p0 = vld1q_f16(ptr);
                float16x8_t _p1 = vld1q_f16(ptr + 8);
                float16x8_t _p2 = vld1q_f16(ptr + 16);
                float16x8_t _p3 = vld1q_f16(ptr + 24);
                uint16x8_t _lemask0 = vcleq_f16(_p0, _zero);
                uint16x8_t _lemask1 = vcleq_f16(_p1, _zero);
                uint16x8_t _lemask2 = vcleq_f16(_p2, _zero);
                uint16x8_t _lemask3 = vcleq_f16(_p3, _zero);
                float16x8_t _ps0 = vmulq_f16(_p0, _slope);
                float16x8_t _ps1 = vmulq_f16(_p1, _slope);
                float16x8_t _ps2 = vmulq_f16(_p2, _slope);
                float16x8_t _ps3 = vmulq_f16(_p3, _slope);
                _p0 = vbslq_f16(_lemask0, _ps0, _p0);
                _p1 = vbslq_f16(_lemask1, _ps1, _p1);
                _p2 = vbslq_f16(_lemask2, _ps2, _p2);
                _p3 = vbslq_f16(_lemask3, _ps3, _p3);
                vst1q_f16(ptr, _p0);
                vst1q_f16(ptr + 8, _p1);
                vst1q_f16(ptr + 16, _p2);
                vst1q_f16(ptr + 24, _p3);
                ptr += 32;
#endif // NCNN_GNU_INLINE_ASM
            }
            // 16 halves per iteration
            for (; i + 15 < size; i += 16)
            {
                float16x8_t _p0 = vld1q_f16(ptr);
                float16x8_t _p1 = vld1q_f16(ptr + 8);
                uint16x8_t _lemask0 = vcleq_f16(_p0, _zero);
                uint16x8_t _lemask1 = vcleq_f16(_p1, _zero);
                float16x8_t _ps0 = vmulq_f16(_p0, _slope);
                float16x8_t _ps1 = vmulq_f16(_p1, _slope);
                _p0 = vbslq_f16(_lemask0, _ps0, _p0);
                _p1 = vbslq_f16(_lemask1, _ps1, _p1);
                vst1q_f16(ptr, _p0);
                vst1q_f16(ptr + 8, _p1);
                ptr += 16;
            }
            // 8 halves per iteration
            for (; i + 7 < size; i += 8)
            {
                float16x8_t _p = vld1q_f16(ptr);
                uint16x8_t _lemask = vcleq_f16(_p, _zero);
                float16x8_t _ps = vmulq_f16(_p, _slope);
                _p = vbslq_f16(_lemask, _ps, _p);
                vst1q_f16(ptr, _p);
                ptr += 8;
            }
            // 4 halves per iteration
            for (; i + 3 < size; i += 4)
            {
                float16x4_t _p = vld1_f16(ptr);
                uint16x4_t _lemask = vcle_f16(_p, vget_low_f16(_zero));
                float16x4_t _ps = vmul_f16(_p, vget_low_f16(_slope));
                _p = vbsl_f16(_lemask, _ps, _p);
                vst1_f16(ptr, _p);
                ptr += 4;
            }
            // scalar tail
            for (; i < size; i++)
            {
                __fp16 v = ptr[0];
                if (v < (__fp16)0.f)
                    ptr[0] = v * (__fp16)slope;

                ptr += 1;
            }
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/requantize_arm.cpp
================================================
// Copyright 2019 BUG1989
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "requantize_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

namespace ncnn {

// Enables packed layouts for this layer, but only when the build has NEON;
// without NEON support_packing keeps whatever the base class set.
Requantize_arm::Requantize_arm()
{
#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON
}

// Requantize int32 values to int8 with a fused ReLU.
//
// The two scales are folded into one multiplier so that only a single
// multiply (or multiply-add when a bias is present) is needed per value:
//   int8(relu(v * scale_in) * scale_out)        == int8_relu(v * (scale_in * scale_out))
//   int8(relu(v * scale_in + bias) * scale_out) == int8_relu(v * (scale_in * scale_out) + bias * scale_out)
//
// intptr          int32 input, elemcount * elempack values
// ptr             int8 output, elemcount * elempack values
// scale_in_data   1 value (shared) or elempack per-channel values
// bias_data       empty, 1 value (shared) or elempack per-channel values
// scale_out_data  1 value (shared) or elempack per-channel values
// elemcount       number of packed elements
// elempack        number of channels interleaved in each element
//
// Per-channel parameters are honoured for elempack 8 and elempack 4; for any
// other packing element 0 of each parameter is applied to every value.
static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int elemcount, int elempack)
{
    const int scale_in_data_size = scale_in_data.w;
    const int bias_data_size = bias_data.w;
    const int scale_out_data_size = scale_out_data.w;
    const int size = elemcount * elempack;

    // NCNN_LOGE("requantize_relu %d %d %d   %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount, elempack);

    float scale_in = scale_in_data[0];
#if __ARM_NEON
    // _xxx0 is applied to values 0..3 and _xxx1 to values 4..7 of every group of 8
    float32x4_t _scale_in0 = vdupq_n_f32(scale_in);
    float32x4_t _scale_in1 = _scale_in0;
    if (scale_in_data_size > 1)
    {
        if (elempack == 8)
        {
            _scale_in0 = vld1q_f32((const float*)scale_in_data);
            _scale_in1 = vld1q_f32((const float*)scale_in_data + 4);
        }
        if (elempack == 4)
        {
            // pack4: both halves of a group of 8 cover the same 4 channels
            _scale_in0 = vld1q_f32((const float*)scale_in_data);
            _scale_in1 = _scale_in0;
        }
    }
#endif // __ARM_NEON

    float scale_out = scale_out_data[0];
#if __ARM_NEON
    float32x4_t _scale_out0 = vdupq_n_f32(scale_out);
    float32x4_t _scale_out1 = _scale_out0;
    if (scale_out_data_size > 1)
    {
        if (elempack == 8)
        {
            _scale_out0 = vld1q_f32((const float*)scale_out_data);
            _scale_out1 = vld1q_f32((const float*)scale_out_data + 4);
        }
        if (elempack == 4)
        {
            _scale_out0 = vld1q_f32((const float*)scale_out_data);
            _scale_out1 = _scale_out0;
        }
    }
#endif // __ARM_NEON

    // folded multiplier scale_in * scale_out
    float scale = scale_in * scale_out;
#if __ARM_NEON
    float32x4_t _scale0 = vmulq_f32(_scale_in0, _scale_out0);
    float32x4_t _scale1 = vmulq_f32(_scale_in1, _scale_out1);
#endif // __ARM_NEON

    if (bias_data_size == 0)
    {
        int i = 0;
#if __ARM_NEON
        // 8 values per iteration
        for (; i + 7 < size; i += 8)
        {
            float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
            float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
            _v0 = vmulq_f32(_v0, _scale0);
            _v1 = vmulq_f32(_v1, _scale1);
            vst1_s8(ptr, float2int8relu(_v0, _v1));
            intptr += 8;
            ptr += 8;
        }
        // 4 values per iteration; only the low 4 lanes of the result are stored
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
            _v = vmulq_f32(_v, _scale0);
            int8x8_t v = float2int8relu(_v, _v);
            ptr[0] = vget_lane_s8(v, 0);
            ptr[1] = vget_lane_s8(v, 1);
            ptr[2] = vget_lane_s8(v, 2);
            ptr[3] = vget_lane_s8(v, 3);
            intptr += 4;
            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar tail
        for (; i < size; i++)
        {
            float v = *intptr * scale;
            if (v < 0) v = 0;
            *ptr = float2int8(v);
            intptr++;
            ptr++;
        }
    }
    else
    {
        float bias = bias_data[0];
#if __ARM_NEON
        float32x4_t _bias0 = vdupq_n_f32(bias);
        float32x4_t _bias1 = _bias0;
        if (bias_data_size > 1)
        {
            if (elempack == 8)
            {
                _bias0 = vld1q_f32((const float*)bias_data);
                _bias1 = vld1q_f32((const float*)bias_data + 4);
            }
            if (elempack == 4)
            {
                _bias0 = vld1q_f32((const float*)bias_data);
                _bias1 = _bias0;
            }
        }
#endif // __ARM_NEON

        // folded bias: bias * scale_out
        bias = bias * scale_out;
#if __ARM_NEON
        _bias0 = vmulq_f32(_bias0, _scale_out0);
        _bias1 = vmulq_f32(_bias1, _scale_out1);
#endif // __ARM_NEON

        int i = 0;
#if __ARM_NEON
        // 8 values per iteration
        for (; i + 7 < size; i += 8)
        {
            float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
            float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
#if __aarch64__
            _v0 = vfmaq_f32(_bias0, _v0, _scale0);
            _v1 = vfmaq_f32(_bias1, _v1, _scale1);
#else  // __aarch64__
            _v0 = vmlaq_f32(_bias0, _v0, _scale0);
            _v1 = vmlaq_f32(_bias1, _v1, _scale1);
#endif // __aarch64__
            vst1_s8(ptr, float2int8relu(_v0, _v1));
            intptr += 8;
            ptr += 8;
        }
        // 4 values per iteration; only the low 4 lanes of the result are stored
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
#if __aarch64__
            _v = vfmaq_f32(_bias0, _v, _scale0);
#else  // __aarch64__
            _v = vmlaq_f32(_bias0, _v, _scale0);
#endif // __aarch64__
            int8x8_t v = float2int8relu(_v, _v);
            ptr[0] = vget_lane_s8(v, 0);
            ptr[1] = vget_lane_s8(v, 1);
            ptr[2] = vget_lane_s8(v, 2);
            ptr[3] = vget_lane_s8(v, 3);
            intptr += 4;
            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar tail
        for (; i < size; i++)
        {
            float v = *intptr * scale + bias;
            if (v < 0) v = 0;
            *ptr = float2int8(v);
            intptr++;
            ptr++;
        }
    }
}

// Requantize int32 values to int8 with a fused leaky ReLU.
//
// The two scales are folded into one multiplier so that only a single
// multiply (or multiply-add when a bias is present) is needed per value:
//   int8(leakyrelu(v * scale_in, slope) * scale_out)        == int8_leakyrelu(v * (scale_in * scale_out), slope)
//   int8(leakyrelu(v * scale_in + bias, slope) * scale_out) == int8_leakyrelu(v * (scale_in * scale_out) + bias * scale_out, slope)
//
// intptr          int32 input, elemcount * elempack values
// ptr             int8 output, elemcount * elempack values
// scale_in_data   1 value (shared) or elempack per-channel values
// bias_data       empty, 1 value (shared) or elempack per-channel values
// scale_out_data  1 value (shared) or elempack per-channel values
// slope           multiplier applied to negative values
// elemcount       number of packed elements
// elempack        number of channels interleaved in each element
//
// Per-channel parameters are honoured for elempack 8 and elempack 4; for any
// other packing element 0 of each parameter is applied to every value.
static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, float slope, int elemcount, int elempack)
{
    const int scale_in_data_size = scale_in_data.w;
    const int bias_data_size = bias_data.w;
    const int scale_out_data_size = scale_out_data.w;
    const int size = elemcount * elempack;

    // NCNN_LOGE("requantize_leakyrelu %d %d %d   %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount, elempack);

    float scale_in = scale_in_data[0];
#if __ARM_NEON
    // _xxx0 is applied to values 0..3 and _xxx1 to values 4..7 of every group of 8
    float32x4_t _scale_in0 = vdupq_n_f32(scale_in);
    float32x4_t _scale_in1 = _scale_in0;
    if (scale_in_data_size > 1)
    {
        if (elempack == 8)
        {
            _scale_in0 = vld1q_f32((const float*)scale_in_data);
            _scale_in1 = vld1q_f32((const float*)scale_in_data + 4);
        }
        if (elempack == 4)
        {
            // pack4: both halves of a group of 8 cover the same 4 channels
            _scale_in0 = vld1q_f32((const float*)scale_in_data);
            _scale_in1 = _scale_in0;
        }
    }
#endif // __ARM_NEON

    float scale_out = scale_out_data[0];
#if __ARM_NEON
    float32x4_t _scale_out0 = vdupq_n_f32(scale_out);
    float32x4_t _scale_out1 = _scale_out0;
    if (scale_out_data_size > 1)
    {
        if (elempack == 8)
        {
            _scale_out0 = vld1q_f32((const float*)scale_out_data);
            _scale_out1 = vld1q_f32((const float*)scale_out_data + 4);
        }
        if (elempack == 4)
        {
            _scale_out0 = vld1q_f32((const float*)scale_out_data);
            _scale_out1 = _scale_out0;
        }
    }
#endif // __ARM_NEON

    // folded multiplier scale_in * scale_out
    float scale = scale_in * scale_out;
#if __ARM_NEON
    float32x4_t _scale0 = vmulq_f32(_scale_in0, _scale_out0);
    float32x4_t _scale1 = vmulq_f32(_scale_in1, _scale_out1);
    float32x4_t _slope = vdupq_n_f32(slope);
#endif // __ARM_NEON

    if (bias_data_size == 0)
    {
        int i = 0;
#if __ARM_NEON
        // 8 values per iteration
        for (; i + 7 < size; i += 8)
        {
            float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
            float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
            _v0 = vmulq_f32(_v0, _scale0);
            _v1 = vmulq_f32(_v1, _scale1);
            vst1_s8(ptr, float2int8leakyrelu(_v0, _v1, _slope));
            intptr += 8;
            ptr += 8;
        }
        // 4 values per iteration; only the low 4 lanes of the result are stored
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
            _v = vmulq_f32(_v, _scale0);
            int8x8_t v = float2int8leakyrelu(_v, _v, _slope);
            ptr[0] = vget_lane_s8(v, 0);
            ptr[1] = vget_lane_s8(v, 1);
            ptr[2] = vget_lane_s8(v, 2);
            ptr[3] = vget_lane_s8(v, 3);
            intptr += 4;
            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar tail
        for (; i < size; i++)
        {
            float v = *intptr * scale;
            if (v < 0) v *= slope;
            *ptr = float2int8(v);
            intptr++;
            ptr++;
        }
    }
    else
    {
        float bias = bias_data[0];
#if __ARM_NEON
        float32x4_t _bias0 = vdupq_n_f32(bias);
        float32x4_t _bias1 = _bias0;
        if (bias_data_size > 1)
        {
            if (elempack == 8)
            {
                _bias0 = vld1q_f32((const float*)bias_data);
                _bias1 = vld1q_f32((const float*)bias_data + 4);
            }
            if (elempack == 4)
            {
                _bias0 = vld1q_f32((const float*)bias_data);
                _bias1 = _bias0;
            }
        }
#endif // __ARM_NEON

        // folded bias: bias * scale_out
        bias = bias * scale_out;
#if __ARM_NEON
        _bias0 = vmulq_f32(_bias0, _scale_out0);
        _bias1 = vmulq_f32(_bias1, _scale_out1);
#endif // __ARM_NEON

        int i = 0;
#if __ARM_NEON
        // 8 values per iteration
        for (; i + 7 < size; i += 8)
        {
            float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
            float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
#if __aarch64__
            _v0 = vfmaq_f32(_bias0, _v0, _scale0);
            _v1 = vfmaq_f32(_bias1, _v1, _scale1);
#else  // __aarch64__
            _v0 = vmlaq_f32(_bias0, _v0, _scale0);
            _v1 = vmlaq_f32(_bias1, _v1, _scale1);
#endif // __aarch64__
            vst1_s8(ptr, float2int8leakyrelu(_v0, _v1, _slope));
            intptr += 8;
            ptr += 8;
        }
        // 4 values per iteration; only the low 4 lanes of the result are stored
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
#if __aarch64__
            _v = vfmaq_f32(_bias0, _v, _scale0);
#else  // __aarch64__
            _v = vmlaq_f32(_bias0, _v, _scale0);
#endif // __aarch64__
            int8x8_t v = float2int8leakyrelu(_v, _v, _slope);
            ptr[0] = vget_lane_s8(v, 0);
            ptr[1] = vget_lane_s8(v, 1);
            ptr[2] = vget_lane_s8(v, 2);
            ptr[3] = vget_lane_s8(v, 3);
            intptr += 4;
            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar tail
        for (; i < size; i++)
        {
            float v = *intptr * scale + bias;
            if (v < 0) v *= slope;
            *ptr = float2int8(v);
            intptr++;
            ptr++;
        }
    }
}

// Requantize int32 values to int8 with an optional activation in between:
//   out = int8(activation(v * scale_in [+ bias]) * scale_out)
//
// activation_type 1 is forwarded to requantize_relu, and activation_type 2
// with activation_params[0] > 0 to requantize_leakyrelu; both fold the two
// scales into one multiplier. Every other case takes the generic path below,
// which applies scale_in, the activation and scale_out as separate steps.
//
// intptr             int32 input, elemcount * elempack values
// ptr                int8 output, elemcount * elempack values
// scale_in_data      1 value (shared) or elempack per-channel values
// bias_data          empty, 1 value (shared) or elempack per-channel values
// scale_out_data     1 value (shared) or elempack per-channel values
// activation_type    activation selector handed to activation_ps / activation_ss
// activation_params  parameters of that activation
// elemcount          number of packed elements
// elempack           number of channels interleaved in each element
//
// Per-channel parameters are honoured for elempack 8 and elempack 4; for any
// other packing element 0 of each parameter is applied to every value.
static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount, int elempack)
{
    if (activation_type == 1)
    {
        requantize_relu(intptr, ptr, scale_in_data, bias_data, scale_out_data, elemcount, elempack);
        return;
    }

    if (activation_type == 2 && activation_params[0] > 0.f)
    {
        const float slope = activation_params[0];
        requantize_leakyrelu(intptr, ptr, scale_in_data, bias_data, scale_out_data, slope, elemcount, elempack);
        return;
    }

    const int scale_in_data_size = scale_in_data.w;
    const int bias_data_size = bias_data.w;
    const int scale_out_data_size = scale_out_data.w;
    const int size = elemcount * elempack;

    // NCNN_LOGE("requantize %d %d %d   %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount, elempack);

    float scale_in = scale_in_data[0];
#if __ARM_NEON
    // _xxx0 is applied to values 0..3 and _xxx1 to values 4..7 of every group of 8
    float32x4_t _scale_in0 = vdupq_n_f32(scale_in);
    float32x4_t _scale_in1 = _scale_in0;
    if (scale_in_data_size > 1)
    {
        if (elempack == 8)
        {
            _scale_in0 = vld1q_f32((const float*)scale_in_data);
            _scale_in1 = vld1q_f32((const float*)scale_in_data + 4);
        }
        if (elempack == 4)
        {
            // pack4: both halves of a group of 8 cover the same 4 channels
            _scale_in0 = vld1q_f32((const float*)scale_in_data);
            _scale_in1 = _scale_in0;
        }
    }
#endif // __ARM_NEON

    float scale_out = scale_out_data[0];
#if __ARM_NEON
    float32x4_t _scale_out0 = vdupq_n_f32(scale_out);
    float32x4_t _scale_out1 = _scale_out0;
    if (scale_out_data_size > 1)
    {
        if (elempack == 8)
        {
            _scale_out0 = vld1q_f32((const float*)scale_out_data);
            _scale_out1 = vld1q_f32((const float*)scale_out_data + 4);
        }
        if (elempack == 4)
        {
            _scale_out0 = vld1q_f32((const float*)scale_out_data);
            _scale_out1 = _scale_out0;
        }
    }
#endif // __ARM_NEON

    if (bias_data_size == 0)
    {
        int i = 0;
#if __ARM_NEON
        // 8 values per iteration
        for (; i + 7 < size; i += 8)
        {
            float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
            float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
            _v0 = vmulq_f32(_v0, _scale_in0);
            _v1 = vmulq_f32(_v1, _scale_in1);
            _v0 = activation_ps(_v0, activation_type, activation_params);
            _v1 = activation_ps(_v1, activation_type, activation_params);
            _v0 = vmulq_f32(_v0, _scale_out0);
            _v1 = vmulq_f32(_v1, _scale_out1);
            vst1_s8(ptr, float2int8(_v0, _v1));
            intptr += 8;
            ptr += 8;
        }
        // 4 values per iteration; only the low 4 lanes of the result are stored
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
            _v = vmulq_f32(_v, _scale_in0);
            _v = activation_ps(_v, activation_type, activation_params);
            _v = vmulq_f32(_v, _scale_out0);
            int8x8_t v = float2int8(_v, _v);
            ptr[0] = vget_lane_s8(v, 0);
            ptr[1] = vget_lane_s8(v, 1);
            ptr[2] = vget_lane_s8(v, 2);
            ptr[3] = vget_lane_s8(v, 3);
            intptr += 4;
            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar tail
        for (; i < size; i++)
        {
            float v = *intptr * scale_in;
            v = activation_ss(v, activation_type, activation_params);
            *ptr = float2int8(v * scale_out);
            intptr++;
            ptr++;
        }
    }
    else
    {
        float bias = bias_data[0];
#if __ARM_NEON
        float32x4_t _bias0 = vdupq_n_f32(bias);
        float32x4_t _bias1 = _bias0;
        if (bias_data_size > 1)
        {
            if (elempack == 8)
            {
                _bias0 = vld1q_f32((const float*)bias_data);
                _bias1 = vld1q_f32((const float*)bias_data + 4);
            }
            if (elempack == 4)
            {
                _bias0 = vld1q_f32((const float*)bias_data);
                _bias1 = _bias0;
            }
        }
#endif // __ARM_NEON

        int i = 0;
#if __ARM_NEON
        // 8 values per iteration
        for (; i + 7 < size; i += 8)
        {
            float32x4_t _v0 = vcvtq_f32_s32(vld1q_s32(intptr));
            float32x4_t _v1 = vcvtq_f32_s32(vld1q_s32(intptr + 4));
#if __aarch64__
            _v0 = vfmaq_f32(_bias0, _v0, _scale_in0);
            _v1 = vfmaq_f32(_bias1, _v1, _scale_in1);
#else  // __aarch64__
            _v0 = vmlaq_f32(_bias0, _v0, _scale_in0);
            _v1 = vmlaq_f32(_bias1, _v1, _scale_in1);
#endif // __aarch64__
            _v0 = activation_ps(_v0, activation_type, activation_params);
            _v1 = activation_ps(_v1, activation_type, activation_params);
            _v0 = vmulq_f32(_v0, _scale_out0);
            _v1 = vmulq_f32(_v1, _scale_out1);
            vst1_s8(ptr, float2int8(_v0, _v1));
            intptr += 8;
            ptr += 8;
        }
        // 4 values per iteration; only the low 4 lanes of the result are stored
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _v = vcvtq_f32_s32(vld1q_s32(intptr));
#if __aarch64__
            _v = vfmaq_f32(_bias0, _v, _scale_in0);
#else  // __aarch64__
            _v = vmlaq_f32(_bias0, _v, _scale_in0);
#endif // __aarch64__
            _v = activation_ps(_v, activation_type, activation_params);
            _v = vmulq_f32(_v, _scale_out0);
            int8x8_t v = float2int8(_v, _v);
            ptr[0] = vget_lane_s8(v, 0);
            ptr[1] = vget_lane_s8(v, 1);
            ptr[2] = vget_lane_s8(v, 2);
            ptr[3] = vget_lane_s8(v, 3);
            intptr += 4;
            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar tail
        for (; i < size; i++)
        {
            float v = *intptr * scale_in + bias;
            v = activation_ss(v, activation_type, activation_params);
            *ptr = float2int8(v * scale_out);
            intptr++;
            ptr++;
        }
    }
}

int Requantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;
    const size_t out_elemsize = elempack * 1u;

    if (dims == 1)
    {
        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int wp = std::max(1, w / opt.num_threads);
        const int nn_w = (w + wp - 1) / wp;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_w; ii++)
        {
            const int i = ii * wp;

            const int* intptr = (const int*)bottom_blob + i * elempack;
            signed char* ptr = (signed char*)top_blob + i * elempack;

            // assert scale_in_data_size == 1
            // assert bias_data_size == 0 || bias_data_size == 1
            // assert scale_out_data_size == 1

            const int size = std::min(w - i, wp) * elempack;

            requantize(intptr, ptr, scale_in_data, bias_data, scale_out_data, activation_type, activation_params, size, 1);
        }
    }

    if (dims == 2)
    {
        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            const int* intptr = bottom_blob.row<const int>(i);
            signed char* ptr = top_blob.row<signed char>(i);

            const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data;
            const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;
            const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data;

            requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack);
        }
    }

    if (dims == 3)
    {
        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const int* intptr = bottom_blob.channel(q);
            signed char* ptr = top_blob.channel(q);

            const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data;
            const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;
            const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data;

            requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack);
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/requantize_arm.h
================================================
// Copyright 2019 BUG1989
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_REQUANTIZE_ARM_H
#define LAYER_REQUANTIZE_ARM_H

#include "requantize.h"

namespace ncnn {

// ARM specialization of the Requantize layer.
// Only forward() is overridden; everything else comes from the Requantize base class.
class Requantize_arm : public Requantize
{
public:
    Requantize_arm();

    // Requantizes bottom_blob (read as int32) into top_blob (written as int8),
    // keeping the input's dims and elempack.
    // Returns 0 on success and -100 if top_blob could not be allocated.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_REQUANTIZE_ARM_H


================================================
FILE: src/layer/arm/reshape_arm.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "reshape_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "cpu.h"

namespace ncnn {

Reshape_arm::Reshape_arm()
{
    // Advertise the storage/layout capabilities of this build and CPU.

    // packed layouts are only handled when NEON is available
#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON

    // fp16 storage needs an ARMv8.2 build plus runtime half-precision support
#if __ARM_NEON && NCNN_ARM82
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82

    // bf16 storage is enabled whenever the build has it
#if NCNN_BF16
    support_bf16_storage = true;
#endif // NCNN_BF16
}

// Gather four consecutive rows of n floats (row k starts at src + k * n) and
// store them element-interleaved as pack-4: outptr[j * 4 + k] = src[k * n + j].
static void reshape_pack1to4_fp32(const float* src, float* outptr, int n)
{
    const float* ptr0 = src;
    const float* ptr1 = src + n;
    const float* ptr2 = src + n * 2;
    const float* ptr3 = src + n * 3;

    int j = 0;
#if __ARM_NEON
    for (; j + 3 < n; j += 4)
    {
        float32x4x4_t _v4;
        _v4.val[0] = vld1q_f32(ptr0);
        _v4.val[1] = vld1q_f32(ptr1);
        _v4.val[2] = vld1q_f32(ptr2);
        _v4.val[3] = vld1q_f32(ptr3);

        // vst4q interleaves the four registers element by element on store
        vst4q_f32(outptr, _v4);

        ptr0 += 4;
        ptr1 += 4;
        ptr2 += 4;
        ptr3 += 4;
        outptr += 16;
    }
#endif // __ARM_NEON
    for (; j < n; j++)
    {
        outptr[0] = *ptr0++;
        outptr[1] = *ptr1++;
        outptr[2] = *ptr2++;
        outptr[3] = *ptr3++;

        outptr += 4;
    }
}

// Reshape bottom_blobs[0] into top_blobs[0] using 32-bit float elements.
//
// 16-bit inputs are forwarded to forward_bf16s_fp16s() when the matching
// storage option is enabled. The target shape comes from the layer's
// w/h/d/c/ndim members, optionally overridden by eval_shape_expr(); a target
// dimension of 0 copies the corresponding input dimension and -1 is inferred
// from the total element count.
//
// Returns 0 on success, -1 if the shape expression fails to evaluate and
// -100 if an intermediate or output blob could not be allocated.
int Reshape_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    Mat& top_blob = top_blobs[0];

    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
#endif

    // resolve out shape
    int outw = w;
    int outh = h;
    int outd = d;
    int outc = c;

    if (!shape_expr.empty())
    {
        int er = eval_shape_expr(bottom_blobs, outw, outh, outd, outc);
        if (er != 0)
            return -1;
    }

    if (ndim == 1)
    {
        // a 1-d target is exactly the flattened input
        flatten(bottom_blob, top_blob, opt);
        if (top_blob.empty())
            return -100;

        return 0;
    }

    const int dims = bottom_blob.dims;
    const int elempack = bottom_blob.elempack;
    const size_t elemsize = bottom_blob.elemsize;

    // number of scalar elements in the input, counting packed lanes
    const int total = bottom_blob.w * bottom_blob.h * bottom_blob.d * bottom_blob.c * elempack;

    if (ndim == 2)
    {
        // 0 -> take the input dimension (unpacked if that axis carries the packing)
        if (outw == 0)
            outw = dims == 1 ? bottom_blob.w * elempack : bottom_blob.w;
        if (outh == 0)
            outh = dims == 2 ? bottom_blob.h * elempack : bottom_blob.h;

        // -1 -> infer from the element count
        if (outw == -1)
            outw = total / outh;
        if (outh == -1)
            outh = total / outw;

        int out_elempack = opt.use_packing_layout && outh % 4 == 0 ? 4 : 1;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        if (dims == 2 && bottom_blob.h * elempack == outh && elempack == out_elempack)
        {
            // same row count and packing: hand the input through unchanged
            top_blob = bottom_blob;
            return 0;
        }

        if (out_elempack == 1)
        {
            // flatten, then relabel the flat buffer as an unpacked 2-d blob
            flatten(bottom_blob, top_blob, opt);
            if (top_blob.empty())
                return -100;

            top_blob.dims = 2;
            top_blob.w = outw;
            top_blob.h = outh;
            top_blob.cstep = top_blob.cstep * top_blob.elempack;
            top_blob.elemsize = out_elemsize;
            top_blob.elempack = out_elempack;

            return 0;
        }

        // flatten into workspace memory, then repack into the output
        Mat bottom_blob_flattened = bottom_blob;
        {
            Option opt_flatten = opt;
            opt_flatten.blob_allocator = opt.workspace_allocator;

            flatten(bottom_blob, bottom_blob_flattened, opt_flatten);
            if (bottom_blob_flattened.empty())
                return -100;
        }

        top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // assert out_elempack == 4

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < top_blob.h; i++)
        {
            // packed output row i is built from flat rows i*4 .. i*4+3
            const float* ptr = (const float*)bottom_blob_flattened + outw * i * 4;
            float* outptr = top_blob.row(i);

            reshape_pack1to4_fp32(ptr, outptr, outw);
        }
    }

    if (ndim == 3 || ndim == 4)
    {
        if (ndim == 3)
        {
            if (outw == 0)
                outw = dims == 1 ? bottom_blob.w * elempack : bottom_blob.w;
            if (outh == 0)
                outh = dims == 2 ? bottom_blob.h * elempack : bottom_blob.h;
            if (outc == 0)
                outc = dims == 3 ? bottom_blob.c * elempack : bottom_blob.c;

            if (outw == -1)
                outw = total / outc / outh;
            if (outh == -1)
                outh = total / outc / outw;
            if (outc == -1)
                outc = total / outh / outw;

            outd = 1;
        }
        else // if (ndim == 4)
        {
            if (outw == 0)
                outw = dims == 1 ? bottom_blob.w * elempack : bottom_blob.w;
            if (outh == 0)
                outh = dims == 2 ? bottom_blob.h * elempack : bottom_blob.h;
            if (outd == 0)
                outd = bottom_blob.d;
            if (outc == 0)
                outc = (dims == 3 || dims == 4) ? bottom_blob.c * elempack : bottom_blob.c;

            if (outw == -1)
                outw = total / outc / outd / outh;
            if (outh == -1)
                outh = total / outc / outd / outw;
            if (outd == -1)
                outd = total / outc / outh / outw;
            if (outc == -1)
                outc = total / outd / outh / outw;
        }

        int out_elempack = opt.use_packing_layout && outc % 4 == 0 ? 4 : 1;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        if ((dims == 3 || dims == 4) && bottom_blob.c * elempack == outc && elempack == out_elempack)
        {
            // channel count and packing are unchanged: only the per-channel
            // shape differs, so reuse the input and relabel its dimensions
            top_blob = bottom_blob;
            top_blob.dims = ndim;
            top_blob.w = outw;
            top_blob.h = outh;
            top_blob.d = outd;
            return 0;
        }

        // flatten into workspace memory, then repack into the output
        Mat bottom_blob_flattened = bottom_blob;
        {
            Option opt_flatten = opt;
            opt_flatten.blob_allocator = opt.workspace_allocator;

            flatten(bottom_blob, bottom_blob_flattened, opt_flatten);
            if (bottom_blob_flattened.empty())
                return -100;
        }

        if (ndim == 3)
        {
            top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        }
        else // if (ndim == 4)
        {
            top_blob.create(outw, outh, outd, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        }
        if (top_blob.empty())
            return -100;

        // packed elements per output channel
        int size = top_blob.w * top_blob.h * top_blob.d;

        if (out_elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < top_blob.c; q++)
            {
                // packed output channel q is built from flat planes q*4 .. q*4+3
                const float* ptr = (const float*)bottom_blob_flattened + size * q * 4;
                float* outptr = top_blob.channel(q);

                reshape_pack1to4_fp32(ptr, outptr, size);
            }
        }

        if (out_elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < top_blob.c; q++)
            {
                // straight copy of one plane from the flat buffer
                const float* ptr = (const float*)bottom_blob_flattened + size * q;
                float* outptr = top_blob.channel(q);

                int i = 0;
#if __ARM_NEON
                for (; i + 3 < size; i += 4)
                {
                    float32x4_t _v = vld1q_f32(ptr);
                    vst1q_f32(outptr, _v);
                    ptr += 4;
                    outptr += 4;
                }
#endif
                for (; i < size; i++)
                {
                    *outptr++ = *ptr++;
                }
            }
        }
    }

    return 0;
}

#if NCNN_ARM82
// Gather eight consecutive rows of n 16-bit elements (row k starts at
// src + k * n) and store them element-interleaved as pack-8:
// outptr[j * 8 + k] = src[k * n + j].
static void reshape_pack1to8_bf16s_fp16s(const unsigned short* src, unsigned short* outptr, int n)
{
    const unsigned short* ptr0 = src;
    const unsigned short* ptr1 = src + n;
    const unsigned short* ptr2 = src + n * 2;
    const unsigned short* ptr3 = src + n * 3;
    const unsigned short* ptr4 = src + n * 4;
    const unsigned short* ptr5 = src + n * 5;
    const unsigned short* ptr6 = src + n * 6;
    const unsigned short* ptr7 = src + n * 7;

    int j = 0;
    for (; j + 3 < n; j += 4)
    {
        uint16x8_t _p01 = vcombine_u16(vld1_u16(ptr0), vld1_u16(ptr1));
        uint16x8_t _p23 = vcombine_u16(vld1_u16(ptr2), vld1_u16(ptr3));
        uint16x8_t _p45 = vcombine_u16(vld1_u16(ptr4), vld1_u16(ptr5));
        uint16x8_t _p67 = vcombine_u16(vld1_u16(ptr6), vld1_u16(ptr7));

        // pair row k with row k+4 so that the 4-way interleaving store below
        // emits rows 0..7 of one column contiguously
        uint16x8x2_t _p0415 = vzipq_u16(_p01, _p45);
        uint16x8x2_t _p2637 = vzipq_u16(_p23, _p67);

        uint16x8x4_t _v4;
        _v4.val[0] = _p0415.val[0];
        _v4.val[1] = _p0415.val[1];
        _v4.val[2] = _p2637.val[0];
        _v4.val[3] = _p2637.val[1];

        vst4q_u16(outptr, _v4);

        ptr0 += 4;
        ptr1 += 4;
        ptr2 += 4;
        ptr3 += 4;
        ptr4 += 4;
        ptr5 += 4;
        ptr6 += 4;
        ptr7 += 4;
        outptr += 32;
    }
    for (; j < n; j++)
    {
        outptr[0] = *ptr0++;
        outptr[1] = *ptr1++;
        outptr[2] = *ptr2++;
        outptr[3] = *ptr3++;
        outptr[4] = *ptr4++;
        outptr[5] = *ptr5++;
        outptr[6] = *ptr6++;
        outptr[7] = *ptr7++;

        outptr += 8;
    }
}
#endif // NCNN_ARM82

// Gather four consecutive rows of n 16-bit elements (row k starts at
// src + k * n) and store them element-interleaved as pack-4:
// outptr[j * 4 + k] = src[k * n + j].
static void reshape_pack1to4_bf16s_fp16s(const unsigned short* src, unsigned short* outptr, int n)
{
    const unsigned short* ptr0 = src;
    const unsigned short* ptr1 = src + n;
    const unsigned short* ptr2 = src + n * 2;
    const unsigned short* ptr3 = src + n * 3;

    int j = 0;
#if __ARM_NEON
    for (; j + 3 < n; j += 4)
    {
        uint16x4x4_t _v4;
        _v4.val[0] = vld1_u16(ptr0);
        _v4.val[1] = vld1_u16(ptr1);
        _v4.val[2] = vld1_u16(ptr2);
        _v4.val[3] = vld1_u16(ptr3);

        // vst4 interleaves the four registers element by element on store
        vst4_u16(outptr, _v4);

        ptr0 += 4;
        ptr1 += 4;
        ptr2 += 4;
        ptr3 += 4;
        outptr += 16;
    }
#endif // __ARM_NEON
    for (; j < n; j++)
    {
        outptr[0] = *ptr0++;
        outptr[1] = *ptr1++;
        outptr[2] = *ptr2++;
        outptr[3] = *ptr3++;

        outptr += 4;
    }
}

// Reshape bottom_blobs[0] into top_blobs[0], moving elements as raw 16-bit
// words (unsigned short), so no arithmetic is done on the values.
//
// The target shape comes from the layer's w/h/d/c/ndim members, optionally
// overridden by eval_shape_expr(); a target dimension of 0 copies the
// corresponding input dimension and -1 is inferred from the total element
// count. With packing enabled the output uses pack-8 (NCNN_ARM82 builds with
// fp16 storage support and fp16 arithmetic enabled), pack-4, or no packing,
// depending on the divisibility of the packed axis.
//
// Returns 0 on success, -1 if the shape expression fails to evaluate and
// -100 if an intermediate or output blob could not be allocated.
int Reshape_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    Mat& top_blob = top_blobs[0];

    // resolve out shape
    int outw = w;
    int outh = h;
    int outd = d;
    int outc = c;

    if (!shape_expr.empty())
    {
        int er = eval_shape_expr(bottom_blobs, outw, outh, outd, outc);
        if (er != 0)
            return -1;
    }

    if (ndim == 1)
    {
        // a 1-d target is exactly the flattened input
        flatten(bottom_blob, top_blob, opt);
        if (top_blob.empty())
            return -100;

        return 0;
    }

    const int dims = bottom_blob.dims;
    const int elempack = bottom_blob.elempack;
    const size_t elemsize = bottom_blob.elemsize;

    // number of scalar elements in the input, counting packed lanes
    const int total = bottom_blob.w * bottom_blob.h * bottom_blob.d * bottom_blob.c * elempack;

    if (ndim == 2)
    {
        // 0 -> take the input dimension (unpacked if that axis carries the packing)
        if (outw == 0)
            outw = dims == 1 ? bottom_blob.w * elempack : bottom_blob.w;
        if (outh == 0)
            outh = dims == 2 ? bottom_blob.h * elempack : bottom_blob.h;

        // -1 -> infer from the element count
        if (outw == -1)
            outw = total / outh;
        if (outh == -1)
            outh = total / outw;

        int out_elempack = 1;
        if (opt.use_packing_layout)
        {
#if NCNN_ARM82
            out_elempack = support_fp16_storage && opt.use_fp16_arithmetic && outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1;
#else
            out_elempack = outh % 4 == 0 ? 4 : 1;
#endif
        }
        size_t out_elemsize = elemsize / elempack * out_elempack;

        if (dims == 2 && bottom_blob.h * elempack == outh && elempack == out_elempack)
        {
            // same row count and packing: hand the input through unchanged
            top_blob = bottom_blob;
            return 0;
        }

        if (out_elempack == 1)
        {
            // flatten, then relabel the flat buffer as an unpacked 2-d blob
            flatten(bottom_blob, top_blob, opt);
            if (top_blob.empty())
                return -100;

            top_blob.dims = 2;
            top_blob.w = outw;
            top_blob.h = outh;
            top_blob.cstep = top_blob.cstep * top_blob.elempack;
            top_blob.elemsize = out_elemsize;
            top_blob.elempack = out_elempack;

            return 0;
        }

        // flatten into workspace memory, then repack into the output
        Mat bottom_blob_flattened = bottom_blob;
        {
            Option opt_flatten = opt;
            opt_flatten.blob_allocator = opt.workspace_allocator;

            flatten(bottom_blob, bottom_blob_flattened, opt_flatten);
            if (bottom_blob_flattened.empty())
                return -100;
        }

        top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if NCNN_ARM82
        if (out_elempack == 8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < top_blob.h; i++)
            {
                // packed output row i is built from flat rows i*8 .. i*8+7
                const unsigned short* ptr = (const unsigned short*)bottom_blob_flattened + outw * i * 8;
                unsigned short* outptr = top_blob.row<unsigned short>(i);

                reshape_pack1to8_bf16s_fp16s(ptr, outptr, outw);
            }
        }
#endif // NCNN_ARM82

        if (out_elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < top_blob.h; i++)
            {
                // packed output row i is built from flat rows i*4 .. i*4+3
                const unsigned short* ptr = (const unsigned short*)bottom_blob_flattened + outw * i * 4;
                unsigned short* outptr = top_blob.row<unsigned short>(i);

                reshape_pack1to4_bf16s_fp16s(ptr, outptr, outw);
            }
        }
    }

    if (ndim == 3 || ndim == 4)
    {
        if (ndim == 3)
        {
            if (outw == 0)
                outw = dims == 1 ? bottom_blob.w * elempack : bottom_blob.w;
            if (outh == 0)
                outh = dims == 2 ? bottom_blob.h * elempack : bottom_blob.h;
            if (outc == 0)
                outc = dims == 3 ? bottom_blob.c * elempack : bottom_blob.c;

            if (outw == -1)
                outw = total / outc / outh;
            if (outh == -1)
                outh = total / outc / outw;
            if (outc == -1)
                outc = total / outh / outw;

            outd = 1;
        }
        else // if (ndim == 4)
        {
            if (outw == 0)
                outw = dims == 1 ? bottom_blob.w * elempack : bottom_blob.w;
            if (outh == 0)
                outh = dims == 2 ? bottom_blob.h * elempack : bottom_blob.h;
            if (outd == 0)
                outd = bottom_blob.d;
            if (outc == 0)
                outc = (dims == 3 || dims == 4) ? bottom_blob.c * elempack : bottom_blob.c;

            if (outw == -1)
                outw = total / outc / outd / outh;
            if (outh == -1)
                outh = total / outc / outd / outw;
            if (outd == -1)
                outd = total / outc / outh / outw;
            if (outc == -1)
                outc = total / outd / outh / outw;
        }

        int out_elempack = 1;
        if (opt.use_packing_layout)
        {
#if NCNN_ARM82
            out_elempack = support_fp16_storage && opt.use_fp16_arithmetic && outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 : 1;
#else
            out_elempack = outc % 4 == 0 ? 4 : 1;
#endif
        }
        size_t out_elemsize = elemsize / elempack * out_elempack;

        if ((dims == 3 || dims == 4) && bottom_blob.c * elempack == outc && elempack == out_elempack)
        {
            // channel count and packing are unchanged: only the per-channel
            // shape differs, so reuse the input and relabel its dimensions
            top_blob = bottom_blob;
            top_blob.dims = ndim;
            top_blob.w = outw;
            top_blob.h = outh;
            top_blob.d = outd;
            return 0;
        }

        // flatten into workspace memory, then repack into the output
        Mat bottom_blob_flattened = bottom_blob;
        {
            Option opt_flatten = opt;
            opt_flatten.blob_allocator = opt.workspace_allocator;

            flatten(bottom_blob, bottom_blob_flattened, opt_flatten);
            if (bottom_blob_flattened.empty())
                return -100;
        }

        if (ndim == 3)
        {
            top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        }
        else // if (ndim == 4)
        {
            top_blob.create(outw, outh, outd, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        }
        if (top_blob.empty())
            return -100;

        // packed elements per output channel
        int size = top_blob.w * top_blob.h * top_blob.d;

#if NCNN_ARM82
        if (out_elempack == 8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < top_blob.c; q++)
            {
                // packed output channel q is built from flat planes q*8 .. q*8+7
                const unsigned short* ptr = (const unsigned short*)bottom_blob_flattened + size * q * 8;
                unsigned short* outptr = top_blob.channel(q);

                reshape_pack1to8_bf16s_fp16s(ptr, outptr, size);
            }
        }
#endif // NCNN_ARM82

        if (out_elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < top_blob.c; q++)
            {
                // packed output channel q is built from flat planes q*4 .. q*4+3
                const unsigned short* ptr = (const unsigned short*)bottom_blob_flattened + size * q * 4;
                unsigned short* outptr = top_blob.channel(q);

                reshape_pack1to4_bf16s_fp16s(ptr, outptr, size);
            }
        }

        if (out_elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < top_blob.c; q++)
            {
                // straight copy of one plane from the flat buffer
                const unsigned short* ptr = (const unsigned short*)bottom_blob_flattened + size * q;
                unsigned short* outptr = top_blob.channel(q);

                int i = 0;
#if __ARM_NEON
                for (; i + 3 < size; i += 4)
                {
                    uint16x4_t _v = vld1_u16(ptr);
                    vst1_u16(outptr, _v);
                    ptr += 4;
                    outptr += 4;
                }
#endif
                for (; i < size; i++)
                {
                    *outptr++ = *ptr++;
                }
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/reshape_arm.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_RESHAPE_ARM_H
#define LAYER_RESHAPE_ARM_H

#include "reshape.h"

namespace ncnn {

// ARM specialization of the Reshape layer.
class Reshape_arm : public Reshape
{
public:
    Reshape_arm();

    // Reshapes bottom_blobs[0] into top_blobs[0] on the 32-bit float path.
    // 16-bit inputs are handed to forward_bf16s_fp16s() when the matching
    // fp16/bf16 storage option is enabled.
    // Returns 0 on success, -1 if the shape expression cannot be evaluated,
    // -100 on allocation failure.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // Same reshape for blobs with 16-bit elements; data is moved as raw
    // unsigned short words. Same return codes as forward().
    int forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_RESHAPE_ARM_H


================================================
FILE: src/layer/arm/rmsnorm_arm.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "rmsnorm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

RMSNorm_arm::RMSNorm_arm()
{
    // Advertise the storage/layout capabilities of this build and CPU.

    // packed layouts are only handled when NEON is available
#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON

    // fp16 storage needs an ARMv8.2 build plus runtime half-precision support
#if __ARM_NEON && NCNN_ARM82
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82

    // bf16 storage is enabled whenever the build has it
#if NCNN_BF16
    support_bf16_storage = true;
#endif // NCNN_BF16
}

// In-place RMS normalization of elemcount * elempack floats at ptr.
//
// Each value is multiplied by 1 / sqrt(mean(x^2) + eps), and additionally by
// gamma when gamma_ptr is non-null.
//   elempack == 1: one statistic over all elemcount values, one gamma per value.
//   elempack == 4 (NEON only): each of the four lanes is normalized
//                  independently over its elemcount values; one gamma per
//                  packed group of four.
static void rmsnorm(float* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack)
{
    const int size = elemcount * elempack;

    // ---- step 1: accumulate the sum of squares ----
    float rms = 0.f;
#if __ARM_NEON
    float32x4_t _rms = vdupq_n_f32(0.f);
#endif // __ARM_NEON
    {
        const float* src = ptr;

        int n = 0;
#if __ARM_NEON
        while (n + 3 < size)
        {
            float32x4_t _x = vld1q_f32(src);
            _rms = vmlaq_f32(_rms, _x, _x);
            src += 4;
            n += 4;
        }
#endif // __ARM_NEON
        while (n < size)
        {
            rms += src[0] * src[0];
            src++;
            n++;
        }
    }

    // ---- step 2: turn the sum into the scale 1 / sqrt(mean + eps) ----
#if __ARM_NEON
    if (elempack == 4)
    {
        // per-lane statistics: every lane keeps its own sum
        float32x4_t _count = vdupq_n_f32(elemcount);
        float32x4_t _veps = vdupq_n_f32(eps);

#if __aarch64__
        _rms = vdivq_f32(_rms, _count);
        _rms = vaddq_f32(_rms, _veps);
#else
        // no vector divide here: reciprocal estimate plus two refinement steps
        float32x4_t _recip = vrecpeq_f32(_count);
        _recip = vmulq_f32(vrecpsq_f32(_count, _recip), _recip);
        _recip = vmulq_f32(vrecpsq_f32(_count, _recip), _recip);
        _rms = vmlaq_f32(_veps, _rms, _recip);
#endif

        // reciprocal square root estimate plus two refinement steps
        float32x4_t _est = vrsqrteq_f32(_rms);
        _est = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _est), _est), _est);
        _rms = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms, _est), _est), _est);
    }
#endif // __ARM_NEON
    if (elempack == 1)
    {
#if __ARM_NEON
        // fold the vector partial sums into the scalar sum
#if __aarch64__
        rms += vaddvq_f32(_rms);
#else
        float32x2_t _half = vadd_f32(vget_low_f32(_rms), vget_high_f32(_rms));
        _half = vpadd_f32(_half, _half);
        rms += vget_lane_f32(_half, 0);
#endif
#endif // __ARM_NEON

        rms = 1.f / sqrtf(rms / elemcount + eps);
#if __ARM_NEON
        _rms = vdupq_n_f32(rms);
#endif // __ARM_NEON
    }

    // ---- step 3: apply the scale (and gamma, when given) ----
    if (!gamma_ptr)
    {
        int n = 0;
#if __ARM_NEON
        for (; n + 3 < size; n += 4)
        {
            float32x4_t _x = vld1q_f32(ptr);
            _x = vmulq_f32(_x, _rms);
            vst1q_f32(ptr, _x);
            ptr += 4;
        }
#endif // __ARM_NEON
        for (; n < size; n++)
        {
            ptr[0] = ptr[0] * rms;
            ptr++;
        }
        return;
    }

    int n = 0;
#if __ARM_NEON
    if (elempack == 4)
    {
        // one gamma value is broadcast across each packed group of four
        for (; n + 3 < size; n += 4)
        {
            float32x4_t _x = vld1q_f32(ptr);
            float32x4_t _g = vdupq_n_f32(gamma_ptr[0]);
            _x = vmulq_f32(_x, _rms);
            _x = vmulq_f32(_x, _g);
            vst1q_f32(ptr, _x);
            ptr += 4;
            gamma_ptr += 1;
        }
    }
    else if (elempack == 1)
    {
        // one gamma value per element
        for (; n + 3 < size; n += 4)
        {
            float32x4_t _x = vld1q_f32(ptr);
            float32x4_t _g = vld1q_f32(gamma_ptr);
            _x = vmulq_f32(_x, _rms);
            _x = vmulq_f32(_x, _g);
            vst1q_f32(ptr, _x);
            ptr += 4;
            gamma_ptr += 4;
        }
    }
#endif // __ARM_NEON
    for (; n < size; n++)
    {
        ptr[0] = (ptr[0] * rms) * gamma_ptr[0];
        ptr++;
        gamma_ptr++;
    }
}

// fp32 entry point of the ARM RMSNorm layer; normalizes bottom_top_blob in place.
// Blobs with 16-bit elements are forwarded to the fp16 / bf16 implementations
// when the matching storage mode is compiled in and enabled.
// The fp32 path itself always returns 0.
int RMSNorm_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int elembits = bottom_top_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_inplace_fp16s(bottom_top_blob, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

    const int elempack = bottom_top_blob.elempack;
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;

    switch (bottom_top_blob.dims)
    {
    case 1:
    {
        // assert affine_size == w
        // the whole blob is one group, handled as w * elempack plain scalars
        float* data = bottom_top_blob;
        rmsnorm(data, gamma_data, eps, w * elempack, 1);
        break;
    }
    case 2:
    {
        // assert affine_size == w
        // each row is normalized on its own
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            float* rowptr = bottom_top_blob.row(y);
            rmsnorm(rowptr, gamma_data, eps, w, elempack);
        }
        break;
    }
    case 3:
    {
        const int channels = bottom_top_blob.c;

        // affine_size selects the group: a single row (w) or a whole channel (w * h)
        const bool normalize_rows = (affine_size == w);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            if (normalize_rows)
            {
                for (int y = 0; y < h; y++)
                {
                    float* rowptr = bottom_top_blob.channel(q).row(y);
                    rmsnorm(rowptr, gamma_data, eps, w, elempack);
                }
            }
            else // if (affine_size == w * h)
            {
                float* chptr = bottom_top_blob.channel(q);
                rmsnorm(chptr, gamma_data, eps, w * h, elempack);
            }
        }
        break;
    }
    default:
        // other ranks are left untouched
        break;
    }

    return 0;
}

#if NCNN_BF16
// In-place RMS normalization of one group stored as bfloat16.
//
//   ptr       - size = elemcount * elempack bf16 values, overwritten with the result
//   gamma_ptr - optional fp32 scale, one value per logical element; may be null
//   eps       - added to the mean of squares before the reciprocal square root
//   elemcount - number of logical elements in the group
//   elempack  - 1 (plain) or, with NEON, 4 (four independent groups interleaved)
//
// All arithmetic is done in fp32; values are converted from/to bf16 at load/store.
static void rmsnorm_bf16s(unsigned short* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack)
{
    const int size = elemcount * elempack;

    // ---- pass 1: sum of squares ----
#if __ARM_NEON
    float32x4_t _sqsum = vdupq_n_f32(0.f);
#endif // __ARM_NEON
    float sqsum = 0.f;
    {
        const unsigned short* src = ptr;
        int remain = size;
#if __ARM_NEON
        while (remain >= 4)
        {
            const float32x4_t _v = bfloat2float(vld1_u16(src));
            _sqsum = vmlaq_f32(_sqsum, _v, _v);
            src += 4;
            remain -= 4;
        }
#endif // __ARM_NEON
        while (remain > 0)
        {
            const float v = bfloat16_to_float32(src[0]);
            sqsum += v * v;
            src++;
            remain--;
        }
    }

    // ---- turn the sums into the multiplier 1 / sqrt(mean + eps) ----
    // The multipliers start out as the raw sums, which is what gets applied
    // for any elempack other than the two handled below.
    float scale = sqsum;
#if __ARM_NEON
    float32x4_t _scale = _sqsum;

    if (elempack == 4)
    {
        // every lane belongs to a different group, so stay vectorized
        const float32x4_t _count = vdupq_n_f32(elemcount);
        const float32x4_t _epsilon = vdupq_n_f32(eps);

#if __aarch64__
        float32x4_t _mean = vdivq_f32(_sqsum, _count);
        _mean = vaddq_f32(_mean, _epsilon);
#else
        // no vector divide here: reciprocal estimate refined by two Newton-Raphson steps
        float32x4_t _recip = vrecpeq_f32(_count);
        _recip = vmulq_f32(vrecpsq_f32(_count, _recip), _recip);
        _recip = vmulq_f32(vrecpsq_f32(_count, _recip), _recip);
        float32x4_t _mean = vmlaq_f32(_epsilon, _sqsum, _recip);
#endif

        // reciprocal square root: estimate refined by two Newton-Raphson steps
        float32x4_t _est = vrsqrteq_f32(_mean);
        _est = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_mean, _est), _est), _est);
        _scale = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_mean, _est), _est), _est);
    }
#endif // __ARM_NEON
    if (elempack == 1)
    {
        // a single group: fold the vector partial sums into the scalar one
#if __ARM_NEON
#if __aarch64__
        sqsum += vaddvq_f32(_sqsum);
#else
        float32x2_t _half = vadd_f32(vget_low_f32(_sqsum), vget_high_f32(_sqsum));
        _half = vpadd_f32(_half, _half);
        sqsum += vget_lane_f32(_half, 0);
#endif
#endif // __ARM_NEON

        scale = 1.f / sqrtf(sqsum / elemcount + eps);
#if __ARM_NEON
        _scale = vdupq_n_f32(scale);
#endif // __ARM_NEON
    }

    // ---- pass 2: apply the multiplier (and gamma, if given) ----
    int remain = size;
    if (gamma_ptr)
    {
#if __ARM_NEON
        if (elempack == 4)
        {
            // the four packed lanes share one gamma value
            while (remain >= 4)
            {
                const float32x4_t _g = vdupq_n_f32(gamma_ptr[0]);
                float32x4_t _v = bfloat2float(vld1_u16(ptr));
                _v = vmulq_f32(vmulq_f32(_v, _scale), _g);
                vst1_u16(ptr, float2bfloat(_v));
                ptr += 4;
                gamma_ptr += 1;
                remain -= 4;
            }
        }
        else if (elempack == 1)
        {
            while (remain >= 4)
            {
                const float32x4_t _g = vld1q_f32(gamma_ptr);
                float32x4_t _v = bfloat2float(vld1_u16(ptr));
                _v = vmulq_f32(vmulq_f32(_v, _scale), _g);
                vst1_u16(ptr, float2bfloat(_v));
                ptr += 4;
                gamma_ptr += 4;
                remain -= 4;
            }
        }
#endif // __ARM_NEON
        while (remain > 0)
        {
            const float v = bfloat16_to_float32(ptr[0]);
            ptr[0] = float32_to_bfloat16((v * scale) * gamma_ptr[0]);
            ptr++;
            gamma_ptr++;
            remain--;
        }
    }
    else
    {
#if __ARM_NEON
        while (remain >= 4)
        {
            float32x4_t _v = bfloat2float(vld1_u16(ptr));
            _v = vmulq_f32(_v, _scale);
            vst1_u16(ptr, float2bfloat(_v));
            ptr += 4;
            remain -= 4;
        }
#endif // __ARM_NEON
        while (remain > 0)
        {
            const float v = bfloat16_to_float32(ptr[0]);
            ptr[0] = float32_to_bfloat16(v * scale);
            ptr++;
            remain--;
        }
    }
}

// bf16-storage variant of forward_inplace: walks the blob group by group and
// lets rmsnorm_bf16s() normalize each group in place. Always returns 0.
int RMSNorm_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int elempack = bottom_top_blob.elempack;
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;

    switch (bottom_top_blob.dims)
    {
    case 1:
    {
        // assert affine_size == w
        // the whole blob is one group, handled as w * elempack plain scalars
        unsigned short* data = bottom_top_blob;
        rmsnorm_bf16s(data, gamma_data, eps, w * elempack, 1);
        break;
    }
    case 2:
    {
        // assert affine_size == w
        // each row is normalized on its own
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            unsigned short* rowptr = bottom_top_blob.row<unsigned short>(y);
            rmsnorm_bf16s(rowptr, gamma_data, eps, w, elempack);
        }
        break;
    }
    case 3:
    {
        const int channels = bottom_top_blob.c;

        // affine_size selects the group: a single row (w) or a whole channel (w * h)
        const bool normalize_rows = (affine_size == w);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            if (normalize_rows)
            {
                for (int y = 0; y < h; y++)
                {
                    unsigned short* rowptr = bottom_top_blob.channel(q).row<unsigned short>(y);
                    rmsnorm_bf16s(rowptr, gamma_data, eps, w, elempack);
                }
            }
            else // if (affine_size == w * h)
            {
                unsigned short* chptr = bottom_top_blob.channel(q);
                rmsnorm_bf16s(chptr, gamma_data, eps, w * h, elempack);
            }
        }
        break;
    }
    default:
        // other ranks are left untouched
        break;
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/rmsnorm_arm.h
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_RMSNORM_ARM_H
#define LAYER_RMSNORM_ARM_H

#include "rmsnorm.h"

namespace ncnn {

// ARM-specific RMSNorm layer.
// Overrides the in-place forward pass of RMSNorm and, depending on build
// options, adds reduced-precision variants that forward_inplace() dispatches to
// for blobs with 16-bit elements.
class RMSNorm_arm : public RMSNorm
{
public:
    RMSNorm_arm();

    // Normalizes bottom_top_blob in place. Returns 0 on success.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16-storage implementation; only declared when NCNN_ARM82 is enabled.
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16-storage implementation; only declared when NCNN_BF16 is enabled.
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_RMSNORM_ARM_H


================================================
FILE: src/layer/arm/rmsnorm_arm_asimdhp.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "rmsnorm_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// In-place RMS normalization of one group stored as fp16.
//
//   ptr       - size = elemcount * elempack fp16 values, overwritten with the result
//   gamma_ptr - optional fp32 scale, one value per logical element; may be null
//   eps       - added to the mean of squares before the reciprocal square root
//   elemcount - number of logical elements in the group
//   elempack  - 1, 4 or 8; with 4 or 8 each packed lane is normalized independently
//
// Values are widened to fp32 for all arithmetic and narrowed back to fp16 on store.
static void rmsnorm_fp16s(__fp16* ptr, const float* gamma_ptr, float eps, int elemcount, int elempack)
{
    const int size = elemcount * elempack;

    // Pass 1: sum of squares.
    // _rms0 collects lanes 0-3 and _rms1 lanes 4-7 of every 8-wide load;
    // the 4-wide loop adds to _rms0 and the scalar tail to rms.
    float32x4_t _rms0 = vdupq_n_f32(0.f);
    float32x4_t _rms1 = vdupq_n_f32(0.f);
    float rms = 0.f;
    {
        const __fp16* ptr0 = ptr;

        int i = 0;
        for (; i + 7 < size; i += 8)
        {
            float16x8_t _p = vld1q_f16(ptr0);
            float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p));
            float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p));
            _rms0 = vmlaq_f32(_rms0, _p0, _p0);
            _rms1 = vmlaq_f32(_rms1, _p1, _p1);
            ptr0 += 8;
        }
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr0));
            _rms0 = vmlaq_f32(_rms0, _p, _p);
            ptr0 += 4;
        }
        for (; i < size; i++)
        {
            rms += (float)ptr0[0] * (float)ptr0[0];
            ptr0++;
        }
    }

    // From here on _rms0 / _rms1 / rms are reused: each branch below replaces
    // the sums with the multiplier 1 / sqrt(sum / elemcount + eps).
    if (elempack == 8)
    {
        // eight independent groups: the two accumulators are finished separately
        float32x4_t _elemcount = vdupq_n_f32(elemcount);
        float32x4_t _eps = vdupq_n_f32(eps);

        _rms0 = vdivq_f32(_rms0, _elemcount);
        _rms1 = vdivq_f32(_rms1, _elemcount);
        _rms0 = vaddq_f32(_rms0, _eps);
        _rms1 = vaddq_f32(_rms1, _eps);

        // reciprocal square root: estimate refined by two Newton-Raphson steps
        float32x4_t _rsqrt_rms0 = vrsqrteq_f32(_rms0);
        float32x4_t _rsqrt_rms1 = vrsqrteq_f32(_rms1);
        _rsqrt_rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0);
        _rsqrt_rms1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms1, _rsqrt_rms1), _rsqrt_rms1), _rsqrt_rms1);
        _rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0);
        _rms1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms1, _rsqrt_rms1), _rsqrt_rms1), _rsqrt_rms1);
    }
    if (elempack == 4)
    {
        // four independent groups: both halves of an 8-wide load hold the same
        // four lanes, so the two accumulators are merged first
        _rms0 = vaddq_f32(_rms0, _rms1);

        float32x4_t _elemcount = vdupq_n_f32(elemcount);
        float32x4_t _eps = vdupq_n_f32(eps);

        _rms0 = vdivq_f32(_rms0, _elemcount);
        _rms0 = vaddq_f32(_rms0, _eps);

        // reciprocal square root: estimate refined by two Newton-Raphson steps
        float32x4_t _rsqrt_rms0 = vrsqrteq_f32(_rms0);
        _rsqrt_rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0);
        _rms0 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(_rms0, _rsqrt_rms0), _rsqrt_rms0), _rsqrt_rms0);
        // the 8-wide apply loops multiply the high half by _rms1
        _rms1 = _rms0;
    }
    if (elempack == 1)
    {
        // one group: reduce every partial sum to a single scalar
        _rms0 = vaddq_f32(_rms0, _rms1);
        rms += vaddvq_f32(_rms0);

        rms = 1.f / sqrtf(rms / elemcount + eps);
        _rms0 = vdupq_n_f32(rms);
        _rms1 = _rms0;
    }

    // Pass 2: multiply by the normalization factor and, if present, by gamma.
    if (gamma_ptr)
    {
        int i = 0;
        if (elempack == 8)
        {
            // all eight packed lanes share one gamma value
            for (; i + 7 < size; i += 8)
            {
                float16x8_t _p = vld1q_f16(ptr);
                float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p));
                float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p));
                float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]);
                _p0 = vmulq_f32(_p0, _rms0);
                _p1 = vmulq_f32(_p1, _rms1);
                _p0 = vmulq_f32(_p0, _gamma);
                _p1 = vmulq_f32(_p1, _gamma);
                _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1));
                vst1q_f16(ptr, _p);
                ptr += 8;
                gamma_ptr += 1;
            }
        }
        if (elempack == 4)
        {
            // an 8-wide load covers two logical elements, hence two gamma values
            for (; i + 7 < size; i += 8)
            {
                float16x8_t _p = vld1q_f16(ptr);
                float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p));
                float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p));
                float32x4_t _gamma0 = vdupq_n_f32(gamma_ptr[0]);
                float32x4_t _gamma1 = vdupq_n_f32(gamma_ptr[1]);
                _p0 = vmulq_f32(_p0, _rms0);
                _p1 = vmulq_f32(_p1, _rms1);
                _p0 = vmulq_f32(_p0, _gamma0);
                _p1 = vmulq_f32(_p1, _gamma1);
                _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1));
                vst1q_f16(ptr, _p);
                ptr += 8;
                gamma_ptr += 2;
            }
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
                float32x4_t _gamma = vdupq_n_f32(gamma_ptr[0]);
                _p = vmulq_f32(_p, _rms0);
                _p = vmulq_f32(_p, _gamma);
                vst1_f16(ptr, vcvt_f16_f32(_p));
                ptr += 4;
                gamma_ptr += 1;
            }
        }
        if (elempack == 1)
        {
            // unpacked data: gamma is read element by element
            for (; i + 7 < size; i += 8)
            {
                float16x8_t _p = vld1q_f16(ptr);
                float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p));
                float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p));
                float32x4_t _gamma0 = vld1q_f32(gamma_ptr);
                float32x4_t _gamma1 = vld1q_f32(gamma_ptr + 4);
                _p0 = vmulq_f32(_p0, _rms0);
                _p1 = vmulq_f32(_p1, _rms1);
                _p0 = vmulq_f32(_p0, _gamma0);
                _p1 = vmulq_f32(_p1, _gamma1);
                _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1));
                vst1q_f16(ptr, _p);
                ptr += 8;
                gamma_ptr += 8;
            }
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
                float32x4_t _gamma = vld1q_f32(gamma_ptr);
                _p = vmulq_f32(_p, _rms0);
                _p = vmulq_f32(_p, _gamma);
                vst1_f16(ptr, vcvt_f16_f32(_p));
                ptr += 4;
                gamma_ptr += 4;
            }
        }
        // scalar tail; with elempack 4 or 8 size is a multiple of the pack
        // width, so the vector loops above have already consumed everything
        for (; i < size; i++)
        {
            ptr[0] = (__fp16)(((float)ptr[0] * rms) * gamma_ptr[0]);
            ptr++;
            gamma_ptr++;
        }
    }
    else
    {
        // no gamma: the same loops serve every elempack because _rms0 / _rms1
        // already hold the per-lane factors
        int i = 0;
        for (; i + 7 < size; i += 8)
        {
            float16x8_t _p = vld1q_f16(ptr);
            float32x4_t _p0 = vcvt_f32_f16(vget_low_f16(_p));
            float32x4_t _p1 = vcvt_f32_f16(vget_high_f16(_p));
            _p0 = vmulq_f32(_p0, _rms0);
            _p1 = vmulq_f32(_p1, _rms1);
            _p = vcombine_f16(vcvt_f16_f32(_p0), vcvt_f16_f32(_p1));
            vst1q_f16(ptr, _p);
            ptr += 8;
        }
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = vcvt_f32_f16(vld1_f16(ptr));
            _p = vmulq_f32(_p, _rms0);
            vst1_f16(ptr, vcvt_f16_f32(_p));
            ptr += 4;
        }
        for (; i < size; i++)
        {
            ptr[0] = (__fp16)((float)ptr[0] * rms);
            ptr++;
        }
    }
}

// fp16-storage variant of forward_inplace: walks the blob group by group and
// lets rmsnorm_fp16s() normalize each group in place. Always returns 0.
int RMSNorm_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int elempack = bottom_top_blob.elempack;
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;

    switch (bottom_top_blob.dims)
    {
    case 1:
    {
        // assert affine_size == w
        // the whole blob is one group, handled as w * elempack plain scalars
        __fp16* data = bottom_top_blob;
        rmsnorm_fp16s(data, gamma_data, eps, w * elempack, 1);
        break;
    }
    case 2:
    {
        // assert affine_size == w
        // each row is normalized on its own
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            __fp16* rowptr = bottom_top_blob.row<__fp16>(y);
            rmsnorm_fp16s(rowptr, gamma_data, eps, w, elempack);
        }
        break;
    }
    case 3:
    {
        const int channels = bottom_top_blob.c;

        // affine_size selects the group: a single row (w) or a whole channel (w * h)
        const bool normalize_rows = (affine_size == w);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            if (normalize_rows)
            {
                for (int y = 0; y < h; y++)
                {
                    __fp16* rowptr = bottom_top_blob.channel(q).row<__fp16>(y);
                    rmsnorm_fp16s(rowptr, gamma_data, eps, w, elempack);
                }
            }
            else // if (affine_size == w * h)
            {
                __fp16* chptr = bottom_top_blob.channel(q);
                rmsnorm_fp16s(chptr, gamma_data, eps, w * h, elempack);
            }
        }
        break;
    }
    default:
        // other ranks are left untouched
        break;
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/rnn_arm.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "rnn_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"
#include "arm_usability.h"

#include "cpu.h"

namespace ncnn {

#if NCNN_INT8
#include "rnn_int8.h"
#endif

// Sets the flags announcing which reduced-precision storage modes this layer supports.
RNN_arm::RNN_arm()
{
#if NCNN_BF16
    support_bf16_storage = true;
#endif

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage is only advertised when cpu_support_arm_asimdhp() reports support
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif
}

int RNN_arm::create_pipeline(const Option& opt)
{
#if NCNN_INT8
    if (int8_scale_term)
    {
        return create_pipeline_int8(opt);
    }
#endif

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage)
    {
        return create_pipeline_fp16s(opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage)
    {
        return create_pipeline_bf16s(opt);
    }
#endif

    const int num_directions = direction == 2 ? 2 : 1;
    const int size = weight_data_size / num_directions / num_output;

#if __ARM_NEON
    weight_xc_data_packed.create(size * 4, num_output / 4 + num_output % 4, num_directions);
    weight_hc_data_packed.create(num_output * 4, num_output / 4 + num_output % 4, num_directions);
#else
    weight_xc_data_packed.create(size, num_output, num_directions);
    weight_hc_data_packed.create(num_output, num_output, num_directions);
#endif

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int dr = 0; dr < num_directions; dr++)
    {
        const Mat weight_xc = weight_xc_data.channel(dr);
        const Mat weight_hc = weight_hc_data.channel(dr);

        Mat weight_xc_data_packed_dr = weight_xc_data_packed.channel(dr);
        Mat weight_hc_data_packed_dr = weight_hc_data_packed.channel(dr);

        int q = 0;
#if __ARM_NEON
        for (; q + 3 < num_output; q += 4)
        {
            const float* weight_xc_0 = weight_xc.row(q);
            const float* weight_xc_1 = weight_xc.row(q + 1);
            const float* weight_xc_2 = weight_xc.row(q + 2);
            const float* weight_xc_3 = weight_xc.row(q + 3);

            const float* weight_hc_0 = weight_hc.row(q);
            const float* weight_hc_1 = weight_hc.row(q + 1);
            const float* weight_hc_2 = weight_hc.row(q + 2);
            const float* weight_hc_3 = weight_hc.row(q + 3);

            float* weight_xc = weight_xc_data_packed_dr.row(q / 4);
            float* weight_hc = weight_hc_data_packed_dr.row(q / 4);

            for (int i = 0; i < size; i++)
            {
                weight_xc[0] = weight_xc_0[i];
                weight_xc[1] = weight_xc_1[i];
                weight_xc[2] = weight_xc_2[i];
                weight_xc[3] = weight_xc_3[i];

                weight_xc += 4;
            }

            for (int i = 0; i < num_output; i++)
            {
                weight_hc[0] = weight_hc_0[i];
                weight_hc[1] = weight_hc_1[i];
                weight_hc[2] = weight_hc_2[i];
                weight_hc[3] = weight_hc_3[i];

                weight_hc += 4;
            }
        }
#endif // __ARM_NEON
        for (; q < num_output; q++)
        {
            const float* weight_xc_0 = weight_xc.row(q);
            const float* weight_hc_0 = weight_hc.row(q);

#if __ARM_NEON
            float* weight_xc = weight_xc_data_packed_dr.row(q / 4 + q % 4);
            float* weight_hc = weight_hc_data_packed_dr.row(q / 4 + q % 4);
#else
            float* weight_xc = weight_xc_data_packed_dr.row(q);
            float* weight_hc = weight_hc_data_packed_dr.row(q);
#endif // __ARM_NEON

            for (int i = 0; i < size; i++)
            {
                weight_xc[i] = weight_xc_0[i];
            }

            for (int i = 0; i < num_output; i++)
            {
                weight_hc[i] = weight_hc_0[i];
            }
        }
    }

    bias_c_data_packed = bias_c_data;

    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
}

// Runs one direction of a plain RNN over the whole sequence in fp32.
//
// For every time step the new state is
//     H = tanh(bias_c + weight_xc * x + weight_hc * hidden_state)
// and is written both to hidden_state and to the matching row of top_blob.
//
//   bottom_blob  - input sequence: h time steps of w values each
//   top_blob     - output, one row per time step; its w is taken as num_output
//   reverse      - non-zero walks the time steps from last to first
//   weight_xc    - input weights; with NEON, rows are read in groups of 4
//                  outputs interleaved element by element, with leftover
//                  outputs in plain rows at index q / 4 + q % 4
//   bias_c       - one bias per output
//   weight_hc    - recurrent weights, laid out like weight_xc
//   hidden_state - num_output values; read as the previous state and updated
//                  in place after each step
//
// Returns 0 on success, -100 if the scratch buffer cannot be allocated.
static int rnn(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
{
    int size = bottom_blob.w;
    int T = bottom_blob.h;

    int num_output = top_blob.w;

    // num_output
    // scratch buffer for the new state: hidden_state is only overwritten after
    // every output of the current step has been computed from the old state
    Mat gates(num_output, 4u, opt.workspace_allocator);
    if (gates.empty())
        return -100;

    // unroll
    for (int t = 0; t < T; t++)
    {
        int ti = reverse ? T - 1 - t : t;

        const float* x = bottom_blob.row(ti);

        int remain_num_output_start = 0;
#if __ARM_NEON
        // outputs are processed four at a time; the rest go through the scalar loop
        int nn_num_output = num_output >> 2;
        remain_num_output_start = nn_num_output << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            const float* weight_xc_ptr = weight_xc.row(q / 4);
            const float* weight_hc_ptr = weight_hc.row(q / 4);

            // four accumulators: _rnn_H starts at the bias, the others at zero;
            // they are summed once both products are done
            float32x4_t _rnn_H = vld1q_f32((const float*)bias_c + q);
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);

            // input contribution: four inputs per iteration, each multiplied
            // with the four-output weight vector that belongs to it
            int i = 0;
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _x = vld1q_f32(x + i);
                float32x4_t _weight_xc = vld1q_f32(weight_xc_ptr);
                float32x4_t _weight_xc_1 = vld1q_f32(weight_xc_ptr + 4);
                float32x4_t _weight_xc_2 = vld1q_f32(weight_xc_ptr + 8);
                float32x4_t _weight_xc_3 = vld1q_f32(weight_xc_ptr + 12);
#if __aarch64__
                _rnn_H = vfmaq_laneq_f32(_rnn_H, _weight_xc, _x, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_1, _x, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_2, _x, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_3, _x, 3);
#else
                _rnn_H = vmlaq_lane_f32(_rnn_H, _weight_xc, vget_low_f32(_x), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _weight_xc_1, vget_low_f32(_x), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _weight_xc_2, vget_high_f32(_x), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _weight_xc_3, vget_high_f32(_x), 1);
#endif

                weight_xc_ptr += 16;
            }
            for (; i < size; i++)
            {
                float32x4_t _x = vdupq_n_f32(x[i]);
                float32x4_t _weight_xc = vld1q_f32(weight_xc_ptr);
                _rnn_H = vmlaq_f32(_rnn_H, _weight_xc, _x);

                weight_xc_ptr += 4;
            }

            // recurrent contribution, same scheme over the previous hidden state
            i = 0;
            for (; i + 3 < num_output; i += 4)
            {
                float32x4_t _hidden_state = vld1q_f32((const float*)hidden_state + i);
                float32x4_t _weight_hc = vld1q_f32(weight_hc_ptr);
                float32x4_t _weight_hc_1 = vld1q_f32(weight_hc_ptr + 4);
                float32x4_t _weight_hc_2 = vld1q_f32(weight_hc_ptr + 8);
                float32x4_t _weight_hc_3 = vld1q_f32(weight_hc_ptr + 12);
#if __aarch64__
                _rnn_H = vfmaq_laneq_f32(_rnn_H, _weight_hc, _hidden_state, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_1, _hidden_state, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_2, _hidden_state, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_3, _hidden_state, 3);
#else
                _rnn_H = vmlaq_lane_f32(_rnn_H, _weight_hc, vget_low_f32(_hidden_state), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _weight_hc_1, vget_low_f32(_hidden_state), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _weight_hc_2, vget_high_f32(_hidden_state), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _weight_hc_3, vget_high_f32(_hidden_state), 1);
#endif

                weight_hc_ptr += 16;
            }
            for (; i < num_output; i++)
            {
                float32x4_t _hidden_state = vdupq_n_f32(hidden_state[i]);
                float32x4_t _weight_hc = vld1q_f32(weight_hc_ptr);
                _rnn_H = vmlaq_f32(_rnn_H, _weight_hc, _hidden_state);

                weight_hc_ptr += 4;
            }

            // combine the partial sums, then apply the activation
            _rnn_H = vaddq_f32(_rnn_H, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _rnn_H = vaddq_f32(_rnn_H, _sum2);

            _rnn_H = tanh_ps(_rnn_H);

            vst1q_f32((float*)gates + q, _rnn_H);
        }
#endif // __ARM_NEON
        // scalar path: all outputs without NEON, otherwise only the leftover ones
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
#if __ARM_NEON
            // leftover rows sit after the interleaved groups in the packed weights
            const float* weight_xc_ptr = weight_xc.row(q / 4 + q % 4);
            const float* weight_hc_ptr = weight_hc.row(q / 4 + q % 4);
#else
            const float* weight_xc_ptr = weight_xc.row(q);
            const float* weight_hc_ptr = weight_hc.row(q);
#endif // __ARM_NEON

            float H = bias_c[q];

            for (int i = 0; i < size; i++)
            {
                H += weight_xc_ptr[i] * x[i];
            }

            for (int i = 0; i < num_output; i++)
            {
                H += weight_hc_ptr[i] * hidden_state[i];
            }

            H = tanhf(H);

            gates[q] = H;
        }

        // publish the new state: copy gates into hidden_state and the output row
        float* output_data = top_blob.row(ti);

        float* hidden_ptr = hidden_state;

#if __ARM_NEON
        nn_num_output = num_output >> 2;
        remain_num_output_start = nn_num_output << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            float32x4_t _rnn_H = vld1q_f32((float*)gates + q);

            vst1q_f32(hidden_ptr + q, _rnn_H);
            vst1q_f32(output_data + q, _rnn_H);
        }
#endif // __ARM_NEON
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
            float H = gates[q];

            hidden_ptr[q] = H;
            output_data[q] = H;
        }
    }

    return 0;
}

int RNN_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (int8_scale_term)
    {
        return forward_int8(bottom_blob, top_blob, opt);
    }
#endif

    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_fp16s(bottom_blob, top_blob, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blob, top_blob, opt);
#endif

    int T = bottom_blob.h;

    int num_directions = direction == 2 ? 2 : 1;

    // initial hidden state
    Mat hidden(num_output, 4u, opt.workspace_allocator);
    if (hidden.empty())
        return -100;
    hidden.fill(0.f);

    top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = rnn(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        {
            int ret = rnn(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
            if (ret != 0)
                return ret;
        }

        hidden.fill(0.0f);

        {
            int ret = rnn(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const float* pf = top_blob_forward.row(i);
            const float* pr = top_blob_reverse.row(i);
            float* ptr = top_blob.row(i);

            memcpy(ptr, pf, num_output * sizeof(float));
            memcpy(ptr + num_output, pr, num_output * sizeof(float));
        }
    }

    return 0;
}

// Multi-blob forward pass.
//
//   bottom_blobs[0] - input sequence
//   bottom_blobs[1] - optional initial hidden state (used when exactly two
//                     inputs are given); otherwise the state starts at zero
//   top_blobs[0]    - output sequence, one row per time step
//   top_blobs[1]    - optional final hidden state (written when exactly two
//                     outputs are requested)
//
// int8 and 16-bit storage configurations are delegated to their own paths.
// For direction == 2 row 0 of the hidden state drives the forward pass and
// row 1 the reverse pass; the two outputs are concatenated along the width.
// Returns 0 on success, -100 on allocation failure (including a failed or
// empty copy of the initial hidden state), or the error from rnn().
int RNN_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
#if NCNN_INT8
    if (int8_scale_term)
    {
        return forward_int8(bottom_blobs, top_blobs, opt);
    }
#endif

    const Mat& bottom_blob = bottom_blobs[0];
    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_fp16s(bottom_blobs, top_blobs, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s(bottom_blobs, top_blobs, opt);
#endif

    int T = bottom_blob.h;
    int num_directions = direction == 2 ? 2 : 1;

    // the hidden state is handed out as top_blobs[1] when two outputs are
    // requested, so in that case it must come from the blob allocator
    Mat hidden;
    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
    if (bottom_blobs.size() == 2)
    {
        // work on a private copy so the caller's initial state is not modified
        hidden = bottom_blobs[1].clone(hidden_allocator);
        // an empty copy would be dereferenced inside rnn(), so report it like
        // every other allocation failure in this function
        if (hidden.empty())
            return -100;
    }
    else
    {
        hidden.create(num_output, num_directions, 4u, hidden_allocator);
        if (hidden.empty())
            return -100;
        hidden.fill(0.f);
    }

    Mat& top_blob = top_blobs[0];
    top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = rnn(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        // row 0 of the hidden state belongs to the forward direction
        Mat hidden0 = hidden.row_range(0, 1);
        {
            int ret = rnn(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt);
            if (ret != 0)
                return ret;
        }

        // row 1 of the hidden state belongs to the reverse direction
        Mat hidden1 = hidden.row_range(1, 1);
        {
            int ret = rnn(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const float* pf = top_blob_forward.row(i);
            const float* pr = top_blob_reverse.row(i);
            float* ptr = top_blob.row(i);

            memcpy(ptr, pf, num_output * sizeof(float));
            memcpy(ptr + num_output, pr, num_output * sizeof(float));
        }
    }

    if (top_blobs.size() == 2)
    {
        top_blobs[1] = hidden;
    }

    return 0;
}

#if NCNN_BF16
// Run one direction of a tanh RNN over every time step of bottom_blob with
// bfloat16 storage and fp32 accumulation.
//
// bottom_blob  : input sequence read as bf16; w is the input size, h is the number of steps T
// top_blob     : output sequence written as bf16; w is num_output, one row per step
// reverse      : when non-zero the steps are visited from T - 1 down to 0
// weight_xc    : bf16 input->hidden weights, laid out as produced by create_pipeline_bf16s
// bias_c       : bf16 bias, one value per output
// weight_hc    : bf16 hidden->hidden weights, same layout as weight_xc
// hidden_state : fp32 recurrent state, read at each step and overwritten in place
//
// Returns 0 on success, -100 if the fp32 scratch buffer cannot be allocated.
static int rnn_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
{
    int size = bottom_blob.w;
    int T = bottom_blob.h;

    int num_output = top_blob.w;

    // num_output
    // fp32 scratch holding the new activations of the current step, so that
    // hidden_state stays untouched while every output is still reading it
    Mat gates(num_output, 4u, opt.workspace_allocator);
    if (gates.empty())
        return -100;

    // unroll
    for (int t = 0; t < T; t++)
    {
        // time index actually processed at this iteration
        int ti = reverse ? T - 1 - t : t;

        const unsigned short* x = bottom_blob.row<const unsigned short>(ti);

        int remain_num_output_start = 0;
#if __ARM_NEON
        // vector path: 4 outputs per iteration, leftovers handled by the scalar loop below
        int nn_num_output = num_output >> 2;
        remain_num_output_start = nn_num_output << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            // one packed row per group of 4 outputs, weights interleaved 4 at a time
            const unsigned short* weight_xc_ptr = weight_xc.row<const unsigned short>(q / 4);
            const unsigned short* weight_hc_ptr = weight_hc.row<const unsigned short>(q / 4);

            // accumulator 0 starts from the bias; the three extra accumulators are
            // summed into it after both products
            float32x4_t _rnn_H = bfloat2float(vld1_u16((const unsigned short*)bias_c + q));
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);

            // input contribution: 4 input elements per iteration, each lane of _x
            // scales the 4 output weights that belong to that input element
            int i = 0;
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _x = bfloat2float(vld1_u16(x + i));
                float32x4_t _weight_xc = bfloat2float(vld1_u16(weight_xc_ptr));
                float32x4_t _weight_xc_1 = bfloat2float(vld1_u16(weight_xc_ptr + 4));
                float32x4_t _weight_xc_2 = bfloat2float(vld1_u16(weight_xc_ptr + 8));
                float32x4_t _weight_xc_3 = bfloat2float(vld1_u16(weight_xc_ptr + 12));
#if __aarch64__
                _rnn_H = vfmaq_laneq_f32(_rnn_H, _weight_xc, _x, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_1, _x, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_2, _x, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_3, _x, 3);
#else
                _rnn_H = vmlaq_lane_f32(_rnn_H, _weight_xc, vget_low_f32(_x), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _weight_xc_1, vget_low_f32(_x), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _weight_xc_2, vget_high_f32(_x), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _weight_xc_3, vget_high_f32(_x), 1);
#endif

                weight_xc_ptr += 16;
            }
            // leftover input elements, one at a time
            for (; i < size; i++)
            {
                float32x4_t _x = bfloat2float(vdup_n_u16(x[i]));
                float32x4_t _weight_xc = bfloat2float(vld1_u16(weight_xc_ptr));
                _rnn_H = vmlaq_f32(_rnn_H, _weight_xc, _x);

                weight_xc_ptr += 4;
            }

            // recurrent contribution: same scheme, the hidden state is already fp32
            i = 0;
            for (; i + 3 < num_output; i += 4)
            {
                float32x4_t _hidden_state = vld1q_f32((const float*)hidden_state + i);
                float32x4_t _weight_hc = bfloat2float(vld1_u16(weight_hc_ptr));
                float32x4_t _weight_hc_1 = bfloat2float(vld1_u16(weight_hc_ptr + 4));
                float32x4_t _weight_hc_2 = bfloat2float(vld1_u16(weight_hc_ptr + 8));
                float32x4_t _weight_hc_3 = bfloat2float(vld1_u16(weight_hc_ptr + 12));
#if __aarch64__
                _rnn_H = vfmaq_laneq_f32(_rnn_H, _weight_hc, _hidden_state, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_1, _hidden_state, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_2, _hidden_state, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_3, _hidden_state, 3);
#else
                _rnn_H = vmlaq_lane_f32(_rnn_H, _weight_hc, vget_low_f32(_hidden_state), 0);
                _sum1 = vmlaq_lane_f32(_sum1, _weight_hc_1, vget_low_f32(_hidden_state), 1);
                _sum2 = vmlaq_lane_f32(_sum2, _weight_hc_2, vget_high_f32(_hidden_state), 0);
                _sum3 = vmlaq_lane_f32(_sum3, _weight_hc_3, vget_high_f32(_hidden_state), 1);
#endif

                weight_hc_ptr += 16;
            }
            for (; i < num_output; i++)
            {
                float32x4_t _hidden_state = vdupq_n_f32(hidden_state[i]);
                float32x4_t _weight_hc = bfloat2float(vld1_u16(weight_hc_ptr));
                _rnn_H = vmlaq_f32(_rnn_H, _weight_hc, _hidden_state);

                weight_hc_ptr += 4;
            }

            // fold the four partial sums together before the activation
            _rnn_H = vaddq_f32(_rnn_H, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _rnn_H = vaddq_f32(_rnn_H, _sum2);

            _rnn_H = tanh_ps(_rnn_H);

            vst1q_f32((float*)gates + q, _rnn_H);
        }
#endif // __ARM_NEON
        // scalar path: every output without NEON, or the num_output % 4 leftovers with NEON
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
#if __ARM_NEON
            // leftover outputs sit in their own rows after the packed groups,
            // matching the row index used by create_pipeline_bf16s
            const unsigned short* weight_xc_ptr = weight_xc.row<const unsigned short>(q / 4 + q % 4);
            const unsigned short* weight_hc_ptr = weight_hc.row<const unsigned short>(q / 4 + q % 4);
#else
            const unsigned short* weight_xc_ptr = weight_xc.row<const unsigned short>(q);
            const unsigned short* weight_hc_ptr = weight_hc.row<const unsigned short>(q);
#endif // __ARM_NEON

            float H = bfloat16_to_float32(((const unsigned short*)bias_c)[q]);

            for (int i = 0; i < size; i++)
            {
                H += bfloat16_to_float32(weight_xc_ptr[i]) * bfloat16_to_float32(x[i]);
            }

            for (int i = 0; i < num_output; i++)
            {
                H += bfloat16_to_float32(weight_hc_ptr[i]) * hidden_state[i];
            }

            H = tanhf(H);

            gates[q] = H;
        }

        // second pass: publish the new activations as both the fp32 hidden state
        // and the bf16 output row for this time step
        unsigned short* output_data = top_blob.row<unsigned short>(ti);

        float* hidden_ptr = hidden_state;

#if __ARM_NEON
        // same split as above (the values are simply recomputed)
        nn_num_output = num_output >> 2;
        remain_num_output_start = nn_num_output << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            float32x4_t _rnn_H = vld1q_f32((float*)gates + q);

            vst1q_f32(hidden_ptr + q, _rnn_H);
            vst1_u16(output_data + q, float2bfloat(_rnn_H));
        }
#endif // __ARM_NEON
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
            float H = gates[q];

            hidden_ptr[q] = H;
            output_data[q] = float32_to_bfloat16(H);
        }
    }

    return 0;
}

// Build the bfloat16 copies of the RNN weights and bias used by the bf16 forward path.
//
// With NEON the input->hidden and hidden->hidden matrices are repacked: every group
// of 4 consecutive outputs shares one row with their weights interleaved
// (w0[i], w1[i], w2[i], w3[i], w0[i+1], ...), and each leftover output
// (num_output % 4 of them) gets a row of its own holding its weights contiguously.
// rnn_bf16s addresses the packed data with the same q / 4 and q / 4 + q % 4 rows.
// Without NEON the matrices are only cast to bfloat16.
//
// When opt.lightmode is set the original fp32 weights are released afterwards.
//
// Returns 0 on success, -100 if any of the bf16 buffers could not be allocated.
int RNN_arm::create_pipeline_bf16s(const Option& opt)
{
#if __ARM_NEON
    const int num_directions = direction == 2 ? 2 : 1;
    const int size = weight_data_size / num_directions / num_output;

    // one row per full group of 4 outputs plus one row per leftover output
    const int packed_rows = num_output / 4 + num_output % 4;

    weight_xc_data_packed.create(size * 4, packed_rows, num_directions, 2u, 1);
    weight_hc_data_packed.create(num_output * 4, packed_rows, num_directions, 2u, 1);

    // bail out before the packing loops write through a null data pointer
    if (weight_xc_data_packed.empty() || weight_hc_data_packed.empty())
        return -100;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int dr = 0; dr < num_directions; dr++)
    {
        const Mat weight_xc = weight_xc_data.channel(dr);
        const Mat weight_hc = weight_hc_data.channel(dr);

        Mat weight_xc_data_packed_dr = weight_xc_data_packed.channel(dr);
        Mat weight_hc_data_packed_dr = weight_hc_data_packed.channel(dr);

        int q = 0;
        // full groups: interleave the weights of outputs q .. q + 3
        for (; q + 3 < num_output; q += 4)
        {
            const float* weight_xc_0 = weight_xc.row(q);
            const float* weight_xc_1 = weight_xc.row(q + 1);
            const float* weight_xc_2 = weight_xc.row(q + 2);
            const float* weight_xc_3 = weight_xc.row(q + 3);

            const float* weight_hc_0 = weight_hc.row(q);
            const float* weight_hc_1 = weight_hc.row(q + 1);
            const float* weight_hc_2 = weight_hc.row(q + 2);
            const float* weight_hc_3 = weight_hc.row(q + 3);

            unsigned short* packed_xc = weight_xc_data_packed_dr.row<unsigned short>(q / 4);
            unsigned short* packed_hc = weight_hc_data_packed_dr.row<unsigned short>(q / 4);

            for (int i = 0; i < size; i++)
            {
                packed_xc[0] = float32_to_bfloat16(weight_xc_0[i]);
                packed_xc[1] = float32_to_bfloat16(weight_xc_1[i]);
                packed_xc[2] = float32_to_bfloat16(weight_xc_2[i]);
                packed_xc[3] = float32_to_bfloat16(weight_xc_3[i]);

                packed_xc += 4;
            }

            for (int i = 0; i < num_output; i++)
            {
                packed_hc[0] = float32_to_bfloat16(weight_hc_0[i]);
                packed_hc[1] = float32_to_bfloat16(weight_hc_1[i]);
                packed_hc[2] = float32_to_bfloat16(weight_hc_2[i]);
                packed_hc[3] = float32_to_bfloat16(weight_hc_3[i]);

                packed_hc += 4;
            }
        }
        // leftover outputs: plain contiguous copy into the rows after the groups
        for (; q < num_output; q++)
        {
            const float* weight_xc_0 = weight_xc.row(q);
            const float* weight_hc_0 = weight_hc.row(q);

            unsigned short* packed_xc = weight_xc_data_packed_dr.row<unsigned short>(q / 4 + q % 4);
            unsigned short* packed_hc = weight_hc_data_packed_dr.row<unsigned short>(q / 4 + q % 4);

            for (int i = 0; i < size; i++)
            {
                packed_xc[i] = float32_to_bfloat16(weight_xc_0[i]);
            }

            for (int i = 0; i < num_output; i++)
            {
                packed_hc[i] = float32_to_bfloat16(weight_hc_0[i]);
            }
        }
    }
#else
    cast_float32_to_bfloat16(weight_xc_data, weight_xc_data_packed, opt);
    cast_float32_to_bfloat16(weight_hc_data, weight_hc_data_packed, opt);

    if (weight_xc_data_packed.empty() || weight_hc_data_packed.empty())
        return -100;
#endif

    cast_float32_to_bfloat16(bias_c_data, bias_c_data_packed, opt);

    // rnn_bf16s reads the bias unconditionally, so an empty result is an error
    if (bias_c_data_packed.empty())
        return -100;

    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
}

int RNN_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int T = bottom_blob.h;

    int num_directions = direction == 2 ? 2 : 1;

    // initial hidden state
    Mat hidden(num_output, 4u, opt.workspace_allocator);
    if (hidden.empty())
        return -100;
    hidden.fill(0.f);

    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = rnn_bf16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        {
            int ret = rnn_bf16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
            if (ret != 0)
                return ret;
        }

        hidden.fill(0.f);

        {
            int ret = rnn_bf16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const unsigned short* pf = top_blob_forward.row<const unsigned short>(i);
            const unsigned short* pr = top_blob_reverse.row<const unsigned short>(i);
            unsigned short* ptr = top_blob.row<unsigned short>(i);

            memcpy(ptr, pf, num_output * sizeof(unsigned short));
            memcpy(ptr + num_output, pr, num_output * sizeof(unsigned short));
        }
    }

    return 0;
}

int RNN_arm::forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    int T = bottom_blob.h;
    int num_directions = direction == 2 ? 2 : 1;

    Mat hidden;
    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
    if (bottom_blobs.size() == 2)
    {
        Option opt_cast = opt;
        opt_cast.blob_allocator = hidden_allocator;
        cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt_cast);
    }
    else
    {
        hidden.create(num_output, num_directions, 4u, hidden_allocator);
        if (hidden.empty())
            return -100;
        hidden.fill(0.f);
    }

    Mat& top_blob = top_blobs[0];
    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = rnn_bf16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        Mat hidden0 = hidden.row_range(0, 1);
        {
            int ret = rnn_bf16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt);
            if (ret != 0)
                return ret;
        }

        Mat hidden1 = hidden.row_range(1, 1);
        {
            int ret = rnn_bf16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const unsigned short* pf = top_blob_forward.row<const unsigned short>(i);
            const unsigned short* pr = top_blob_reverse.row<const unsigned short>(i);
            unsigned short* ptr = top_blob.row<unsigned short>(i);

            memcpy(ptr, pf, num_output * sizeof(unsigned short));
            memcpy(ptr + num_output, pr, num_output * sizeof(unsigned short));
        }
    }

    if (top_blobs.size() == 2)
    {
        cast_float32_to_bfloat16(hidden, top_blobs[1], opt);
    }

    return 0;
}
#endif // NCNN_BF16

#if NCNN_INT8
int RNN_arm::create_pipeline_int8(const Option& opt)
{
    const int num_directions = direction == 2 ? 2 : 1;
    const int size = weight_data_size / num_directions / num_output;

    rnn_transform_weight_int8(weight_xc_data, weight_xc_data_int8_scales, weight_hc_data, weight_hc_data_int8_scales, bias_c_data, weight_data_tm, weight_data_tm_int8_descales, bias_c_data_packed, size, num_output, num_directions, opt);

    if (opt.lightmode)
    {
        weight_xc_data.release();
        weight_hc_data.release();
        bias_c_data.release();
        weight_xc_data_int8_scales.release();
        weight_hc_data_int8_scales.release();
    }

    return 0;
}

void RNN_arm::dynamic_quantize(const Mat& bottom_blob, int elemtype, Mat& bottom_blob_int8, Mat& bottom_blob_int8_descales, const Option& opt) const
{
    int size = bottom_blob.w;
    int T = bottom_blob.h;

    // dynamic quantize bottom_blob
    bottom_blob_int8_descales.create(T, (size_t)4u, 1, opt.blob_allocator);

    Mat bottom_blob_int8_scales(T, (size_t)4u, 1, opt.blob_allocator);

    if (elemtype == 1)
    {
        // fp32
        for (int t = 0; t < T; t++)
        {
            const float* x = bottom_blob.row(t);

            float absmax = 0.f;
            for (int i = 0; i < size; i++)
            {
                absmax = std::max(absmax, (float)fabs(x[i]));
            }

            bottom_blob_int8_scales[t] = 127.f / absmax;
            bottom_blob_int8_descales[t] = absmax / 127.f;
        }
    }
    if (elemtype == 2)
    {
        // fp16
        for (int t = 0; t < T; t++)
        {
            const unsigned short* x = bottom_blob.row<const unsigned short>(t);

            float absmax = 0.f;
            for (int i = 0; i < size; i++)
            {
                absmax = std::max(absmax, (float)fabs(float16_to_float32(x[i])));
            }

            bottom_blob_int8_scales[t] = 127.f / absmax;
            bottom_blob_int8_descales[t] = absmax / 127.f;
        }
    }
    if (elemtype == 4)
    {
        // bf16
        for (int t = 0; t < T; t++)
        {
            const unsigned short* x = bottom_blob.row<const unsigned short>(t);

            float absmax = 0.f;
            for (int i = 0; i < size; i++)
            {
                absmax = std::max(absmax, (float)fabs(bfloat16_to_float32(x[i])));
            }

            bottom_blob_int8_scales[t] = 127.f / absmax;
            bottom_blob_int8_descales[t] = absmax / 127.f;
        }
    }

    quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt);
}

int RNN_arm::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elemtype = 1; // fp32
    {
        int elembits = bottom_blob.elembits();

        // clang-format off
        // *INDENT-OFF*

#if NCNN_ARM82
        if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        {
            elemtype = 2; // fp16
        }
        else
#endif
#if NCNN_BF16
        if (opt.use_bf16_storage && elembits == 16)
        {
            elemtype = 4; // bf16
        }
        else
#endif
        {
            // fp32
        }

        // *INDENT-ON*
        // clang-format on
    }

    int T = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;

    int num_directions = direction == 2 ? 2 : 1;

    // initial hidden state
    Mat hidden(num_output, 4u, opt.workspace_allocator);
    if (hidden.empty())
        return -100;
    hidden.fill(0.f);

    top_blob.create(num_output * num_directions, T, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // dynamic quantize bottom_blob
    Mat bottom_blob_int8;
    Mat bottom_blob_int8_descales;
    {
        Option opt_quant = opt;
        opt_quant.blob_allocator = opt.workspace_allocator;
        opt_quant.use_packing_layout = false;
        dynamic_quantize(bottom_blob, elemtype, bottom_blob_int8, bottom_blob_int8_descales, opt_quant);
    }

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        rnn_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob, elemtype, direction, weight_data_tm.channel(0), weight_data_tm_int8_descales.channel(0), bias_c_data_packed.channel(0), hidden, opt);
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, elemsize, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, elemsize, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        {
            rnn_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob_forward, elemtype, 0, weight_data_tm.channel(0), weight_data_tm_int8_descales.channel(0), bias_c_data_packed.channel(0), hidden, opt);
        }

        hidden.fill(0.0f);

        {
            rnn_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob_reverse, elemtype, 1, weight_data_tm.channel(1), weight_data_tm_int8_descales.channel(1), bias_c_data_packed.channel(1), hidden, opt);
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const unsigned char* pf = top_blob_forward.row<const unsigned char>(i);
            const unsigned char* pr = top_blob_reverse.row<const unsigned char>(i);
            unsigned char* ptr = top_blob.row<unsigned char>(i);

            memcpy(ptr, pf, num_output * elemsize);
            memcpy(ptr + num_output * elemsize, pr, num_output * elemsize);
        }
    }

    return 0;
}

// int8 forward pass with optional hidden-state input and output.
//
// bottom_blobs[0] : input sequence stored as fp32, fp16 or bf16
// bottom_blobs[1] : optional initial hidden state in the same storage type; it is
//                   cloned or converted to fp32. When absent the state starts at zero.
// top_blobs[0]    : output sequence with the input's element size
// top_blobs[1]    : optional final hidden state, converted back to the input's storage type
//
// Returns 0 on success, -100 on allocation failure.
int RNN_arm::forward_int8(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];

    int elemtype = 1; // fp32
    {
        int elembits = bottom_blob.elembits();

        // clang-format off
        // *INDENT-OFF*

#if NCNN_ARM82
        if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        {
            elemtype = 2; // fp16
        }
        else
#endif
#if NCNN_BF16
        if (opt.use_bf16_storage && elembits == 16)
        {
            elemtype = 4; // bf16
        }
        else
#endif
        {
            // fp32
        }

        // *INDENT-ON*
        // clang-format on
    }

    int T = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;
    int num_directions = direction == 2 ? 2 : 1;

    Mat hidden;
    // the state outlives this call only when it is also returned to the caller
    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
    if (bottom_blobs.size() == 2)
    {
        if (elemtype == 1)
        {
            hidden = bottom_blobs[1].clone(hidden_allocator);
        }
        if (elemtype == 2)
        {
            Option opt_cast = opt;
            opt_cast.blob_allocator = hidden_allocator;
            cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
        }
        if (elemtype == 4)
        {
            Option opt_cast = opt;
            opt_cast.blob_allocator = hidden_allocator;
            cast_bfloat16_to_float32(bottom_blobs[1], hidden, opt_cast);
        }

        // the kernel dereferences the state unconditionally
        if (hidden.empty())
            return -100;
    }
    else
    {
        hidden.create(num_output, num_directions, 4u, hidden_allocator);
        if (hidden.empty())
            return -100;
        hidden.fill(0.f);
    }

    Mat& top_blob = top_blobs[0];
    top_blob.create(num_output * num_directions, T, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // dynamic quantize bottom_blob
    Mat bottom_blob_int8;
    Mat bottom_blob_int8_descales;
    {
        // the quantized copy is scratch data, so it lives in the workspace allocator
        Option opt_quant = opt;
        opt_quant.blob_allocator = opt.workspace_allocator;
        opt_quant.use_packing_layout = false;
        dynamic_quantize(bottom_blob, elemtype, bottom_blob_int8, bottom_blob_int8_descales, opt_quant);
    }

    // dynamic_quantize cannot report errors itself; an empty result means an allocation failed
    if (bottom_blob_int8.empty() || bottom_blob_int8_descales.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        rnn_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob, elemtype, direction, weight_data_tm.channel(0), weight_data_tm_int8_descales.channel(0), bias_c_data_packed.channel(0), hidden, opt);
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, elemsize, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, elemsize, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        // row 0 of the state belongs to the forward pass, row 1 to the reverse pass
        Mat hidden0 = hidden.row_range(0, 1);
        {
            rnn_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob_forward, elemtype, 0, weight_data_tm.channel(0), weight_data_tm_int8_descales.channel(0), bias_c_data_packed.channel(0), hidden0, opt);
        }

        Mat hidden1 = hidden.row_range(1, 1);
        {
            rnn_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob_reverse, elemtype, 1, weight_data_tm.channel(1), weight_data_tm_int8_descales.channel(1), bias_c_data_packed.channel(1), hidden1, opt);
        }

        // concat w: byte-wise copy because the element size depends on the storage type
        for (int i = 0; i < T; i++)
        {
            const unsigned char* pf = top_blob_forward.row<const unsigned char>(i);
            const unsigned char* pr = top_blob_reverse.row<const unsigned char>(i);
            unsigned char* ptr = top_blob.row<unsigned char>(i);

            memcpy(ptr, pf, num_output * elemsize);
            memcpy(ptr + num_output * elemsize, pr, num_output * elemsize);
        }
    }

    if (top_blobs.size() == 2)
    {
        if (elemtype == 1)
        {
            top_blobs[1] = hidden;
        }
        if (elemtype == 2)
        {
            cast_float32_to_float16(hidden, top_blobs[1], opt);
        }
        if (elemtype == 4)
        {
            cast_float32_to_bfloat16(hidden, top_blobs[1], opt);
        }

        // report a failed conversion instead of handing back an empty blob
        if (top_blobs[1].empty())
            return -100;
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn


================================================
FILE: src/layer/arm/rnn_arm.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_RNN_ARM_H
#define LAYER_RNN_ARM_H

#include "rnn.h"

namespace ncnn {

// ARM-specific implementation of the RNN layer.
// Besides the public create_pipeline / forward entry points it declares separate
// pipeline-setup and forward routines for fp16, bf16 and int8 storage, each compiled
// only when the matching NCNN_* feature macro is enabled.
class RNN_arm : public RNN
{
public:
    RNN_arm();

    virtual int create_pipeline(const Option& opt);
    // single-blob forward: input sequence in, output sequence out
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    // multi-blob forward: an optional second blob carries the hidden state in and out
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 storage path
    int create_pipeline_fp16s(const Option& opt);
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 storage path
    int create_pipeline_bf16s(const Option& opt);
    int forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_bf16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif
#if NCNN_INT8
    // int8 weight path; the input is quantized per time step by dynamic_quantize
    int create_pipeline_int8(const Option& opt);
    // elemtype selects the storage of bottom_blob: 1 = fp32, 2 = fp16, 4 = bf16
    void dynamic_quantize(const Mat& bottom_blob, int elemtype, Mat& bottom_blob_int8, Mat& bottom_blob_int8_descales, const Option& opt) const;
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    int forward_int8(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif

public:
    // weights and bias converted/repacked by the create_pipeline* routines;
    // one channel per direction
    Mat weight_xc_data_packed;
    Mat bias_c_data_packed;
    Mat weight_hc_data_packed;

    // transformed int8 weights filled by rnn_transform_weight_int8
    Mat weight_data_tm;

#if NCNN_INT8
    // descale factors that accompany weight_data_tm
    Mat weight_data_tm_int8_descales;
#endif
};

} // namespace ncnn

#endif // LAYER_RNN_ARM_H


================================================
FILE: src/layer/arm/rnn_arm_asimddp.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cpu.h"
#include "mat.h"
#include "layer.h"
#include "arm_activation.h"
#include "arm_usability.h"

namespace ncnn {

#include "rnn_int8.h"

// Externally visible entry point that forwards, argument for argument, to the
// rnn_transform_weight_int8 pulled into this translation unit by "rnn_int8.h".
// NOTE(review): presumably this file is built with the dot-product extension enabled
// so the included code is compiled for it — confirm against the build configuration.
void rnn_transform_weight_int8_asimddp(const Mat& weight_xc, const Mat& weight_xc_int8_scales, const Mat& weight_hc, const Mat& weight_hc_int8_scales, const Mat& bias_c, Mat& weight_data_tm, Mat& weight_data_tm_int8_descales, Mat& bias_c_tm, int size, int num_output, int num_directions, const Option& opt)
{
    rnn_transform_weight_int8(weight_xc, weight_xc_int8_scales, weight_hc, weight_hc_int8_scales, bias_c, weight_data_tm, weight_data_tm_int8_descales, bias_c_tm, size, num_output, num_directions, opt);
}

// Externally visible entry point that forwards, argument for argument, to the
// rnn_int8 kernel pulled into this translation unit by "rnn_int8.h".
// NOTE(review): presumably this file is built with the dot-product extension enabled
// so the included kernel is compiled for it — confirm against the build configuration.
void rnn_int8_asimddp(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_descales, Mat& top_blob, int elemtype, int reverse, const Mat& weight_data_tm, const Mat& weight_data_tm_int8_descales, const Mat& bias_c, Mat& hidden_state, const Option& opt)
{
    rnn_int8(bottom_blob_int8, bottom_blob_int8_descales, top_blob, elemtype, reverse, weight_data_tm, weight_data_tm_int8_descales, bias_c, hidden_state, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/arm/rnn_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "rnn_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "arm_activation.h"

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Run one direction of the RNN over all timesteps using fp16 arithmetic.
//
// For every timestep and every output q this evaluates
//   H[q] = tanh(bias_c[q] + sum_i weight_xc[q][i] * x[i] + sum_i weight_hc[q][i] * hidden_state[i])
// with the multiply-accumulate done in __fp16 and tanh done in fp32.
//
// bottom_blob   input, read as __fp16, one row of bottom_blob.w values per timestep
// top_blob      output, written as __fp16, one row of top_blob.w (= num_output) values per timestep
// reverse       non-zero walks the timesteps from last to first
// weight_xc     packed __fp16 input weights, laid out as produced by create_pipeline_fp16s
//               for opt.use_fp16_arithmetic (8-output rows, then 4-output rows, then single rows)
// bias_c        bias, read as __fp16
// weight_hc     packed __fp16 recurrent weights, same row layout as weight_xc
// hidden_state  fp32 recurrent state, read during a step and overwritten at the end of it
//
// Returns 0 on success, -100 when the scratch buffer cannot be allocated.
static int rnn_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
{
    int size = bottom_blob.w;
    int T = bottom_blob.h;

    int num_output = top_blob.w;

    // fp32 scratch holding the activated outputs of the current step, so that
    // hidden_state stays intact while other outputs are still reading it
    Mat gates(num_output, 4u, opt.workspace_allocator);
    if (gates.empty())
        return -100;

    // unroll over time
    for (int t = 0; t < T; t++)
    {
        int ti = reverse ? T - 1 - t : t;

        const __fp16* x = bottom_blob.row<const __fp16>(ti);

        // phase 1a: outputs in groups of 8, one float16x8 lane per output
        int nn_num_output = num_output >> 3;
        int remain_num_output_start = nn_num_output << 3;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 8;

            // packed row q/8 stores the 8 outputs interleaved per input element
            const __fp16* weight_xc_ptr = weight_xc.row<const __fp16>(q / 8);
            const __fp16* weight_hc_ptr = weight_hc.row<const __fp16>(q / 8);

            // four independent accumulators, the first one seeded with the bias
            float16x8_t _rnn_H = vld1q_f16((const __fp16*)bias_c + q);
            float16x8_t _sum1 = vdupq_n_f16(0.f);
            float16x8_t _sum2 = vdupq_n_f16(0.f);
            float16x8_t _sum3 = vdupq_n_f16(0.f);

            // input contribution, 4 input elements per iteration
            int i = 0;
            for (; i + 3 < size; i += 4)
            {
                float16x4_t _x = vld1_f16(x + i);
                float16x8_t _weight_xc = vld1q_f16(weight_xc_ptr);
                float16x8_t _weight_xc_1 = vld1q_f16(weight_xc_ptr + 8);
                float16x8_t _weight_xc_2 = vld1q_f16(weight_xc_ptr + 16);
                float16x8_t _weight_xc_3 = vld1q_f16(weight_xc_ptr + 24);
                _rnn_H = vfmaq_lane_f16(_rnn_H, _weight_xc, _x, 0);
                _sum1 = vfmaq_lane_f16(_sum1, _weight_xc_1, _x, 1);
                _sum2 = vfmaq_lane_f16(_sum2, _weight_xc_2, _x, 2);
                _sum3 = vfmaq_lane_f16(_sum3, _weight_xc_3, _x, 3);

                weight_xc_ptr += 32;
            }
            // leftover input elements, one at a time
            for (; i < size; i++)
            {
                float16x8_t _x = vdupq_n_f16(x[i]);
                float16x8_t _weight_xc = vld1q_f16(weight_xc_ptr);
                _rnn_H = vfmaq_f16(_rnn_H, _weight_xc, _x);

                weight_xc_ptr += 8;
            }

            // recurrent contribution, hidden state is converted fp32 -> fp16 on load
            i = 0;
            for (; i + 3 < num_output; i += 4)
            {
                float16x4_t _hidden_state = vcvt_f16_f32(vld1q_f32((const float*)hidden_state + i));
                float16x8_t _weight_hc = vld1q_f16(weight_hc_ptr);
                float16x8_t _weight_hc_1 = vld1q_f16(weight_hc_ptr + 8);
                float16x8_t _weight_hc_2 = vld1q_f16(weight_hc_ptr + 16);
                float16x8_t _weight_hc_3 = vld1q_f16(weight_hc_ptr + 24);
                _rnn_H = vfmaq_lane_f16(_rnn_H, _weight_hc, _hidden_state, 0);
                _sum1 = vfmaq_lane_f16(_sum1, _weight_hc_1, _hidden_state, 1);
                _sum2 = vfmaq_lane_f16(_sum2, _weight_hc_2, _hidden_state, 2);
                _sum3 = vfmaq_lane_f16(_sum3, _weight_hc_3, _hidden_state, 3);

                weight_hc_ptr += 32;
            }
            for (; i < num_output; i++)
            {
                float16x8_t _hidden_state = vdupq_n_f16((__fp16)hidden_state[i]);
                float16x8_t _weight_hc = vld1q_f16(weight_hc_ptr);
                _rnn_H = vfmaq_f16(_rnn_H, _weight_hc, _hidden_state);

                weight_hc_ptr += 8;
            }

            // fold the four accumulators together
            _rnn_H = vaddq_f16(_rnn_H, _sum1);
            _sum2 = vaddq_f16(_sum2, _sum3);
            _rnn_H = vaddq_f16(_rnn_H, _sum2);

            // activation is evaluated in fp32, one half of the vector at a time
            float32x4_t _H32low = tanh_ps(vcvt_f32_f16(vget_low_f16(_rnn_H)));
            float32x4_t _H32high = tanh_ps(vcvt_f32_f16(vget_high_f16(_rnn_H)));

            vst1q_f32((float*)gates + q, _H32low);
            vst1q_f32((float*)gates + q + 4, _H32high);
        }
        // phase 1b: remaining outputs in groups of 4 (at most one group after the 8-wide pass)
        nn_num_output = (num_output - remain_num_output_start) >> 2;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = remain_num_output_start + qq * 4;

            // 4-output rows follow the 8-output rows in the packed weights
            const __fp16* weight_xc_ptr = weight_xc.row<const __fp16>(q / 8 + (q % 8) / 4);
            const __fp16* weight_hc_ptr = weight_hc.row<const __fp16>(q / 8 + (q % 8) / 4);

            float16x4_t _rnn_H = vld1_f16((const __fp16*)bias_c + q);
            float16x4_t _sum1 = vdup_n_f16(0.f);
            float16x4_t _sum2 = vdup_n_f16(0.f);
            float16x4_t _sum3 = vdup_n_f16(0.f);

            int i = 0;
            for (; i + 3 < size; i += 4)
            {
                float16x4_t _x = vld1_f16(x + i);
                float16x4_t _weight_xc = vld1_f16(weight_xc_ptr);
                float16x4_t _weight_xc_1 = vld1_f16(weight_xc_ptr + 4);
                float16x4_t _weight_xc_2 = vld1_f16(weight_xc_ptr + 8);
                float16x4_t _weight_xc_3 = vld1_f16(weight_xc_ptr + 12);
                _rnn_H = vfma_lane_f16(_rnn_H, _weight_xc, _x, 0);
                _sum1 = vfma_lane_f16(_sum1, _weight_xc_1, _x, 1);
                _sum2 = vfma_lane_f16(_sum2, _weight_xc_2, _x, 2);
                _sum3 = vfma_lane_f16(_sum3, _weight_xc_3, _x, 3);

                weight_xc_ptr += 16;
            }
            for (; i < size; i++)
            {
                float16x4_t _x = vdup_n_f16(x[i]);
                float16x4_t _weight_xc = vld1_f16(weight_xc_ptr);
                _rnn_H = vfma_f16(_rnn_H, _weight_xc, _x);

                weight_xc_ptr += 4;
            }

            i = 0;
            for (; i + 3 < num_output; i += 4)
            {
                float16x4_t _hidden_state = vcvt_f16_f32(vld1q_f32((const float*)hidden_state + i));
                float16x4_t _weight_hc = vld1_f16(weight_hc_ptr);
                float16x4_t _weight_hc_1 = vld1_f16(weight_hc_ptr + 4);
                float16x4_t _weight_hc_2 = vld1_f16(weight_hc_ptr + 8);
                float16x4_t _weight_hc_3 = vld1_f16(weight_hc_ptr + 12);
                _rnn_H = vfma_lane_f16(_rnn_H, _weight_hc, _hidden_state, 0);
                _sum1 = vfma_lane_f16(_sum1, _weight_hc_1, _hidden_state, 1);
                _sum2 = vfma_lane_f16(_sum2, _weight_hc_2, _hidden_state, 2);
                _sum3 = vfma_lane_f16(_sum3, _weight_hc_3, _hidden_state, 3);

                weight_hc_ptr += 16;
            }
            for (; i < num_output; i++)
            {
                float16x4_t _hidden_state = vdup_n_f16((__fp16)hidden_state[i]);
                float16x4_t _weight_hc = vld1_f16(weight_hc_ptr);
                _rnn_H = vfma_f16(_rnn_H, _weight_hc, _hidden_state);

                weight_hc_ptr += 4;
            }

            _rnn_H = vadd_f16(_rnn_H, _sum1);
            _sum2 = vadd_f16(_sum2, _sum3);
            _rnn_H = vadd_f16(_rnn_H, _sum2);

            float32x4_t _H32 = tanh_ps(vcvt_f32_f16(_rnn_H));

            vst1q_f32((float*)gates + q, _H32);
        }
        // phase 1c: leftover outputs one by one, each has its own unpacked weight row
        remain_num_output_start += nn_num_output << 2;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
            const __fp16* weight_xc_ptr = weight_xc.row<const __fp16>(q / 8 + (q % 8) / 4 + q % 4);
            const __fp16* weight_hc_ptr = weight_hc.row<const __fp16>(q / 8 + (q % 8) / 4 + q % 4);

            // scalar accumulation stays in __fp16, matching the vector paths
            __fp16 H = ((const __fp16*)bias_c)[q];

            for (int i = 0; i < size; i++)
            {
                H += weight_xc_ptr[i] * x[i];
            }

            for (int i = 0; i < num_output; i++)
            {
                H += weight_hc_ptr[i] * (__fp16)hidden_state[i];
            }

            float H32 = tanhf((float)H);

            gates[q] = H32;
        }

        // phase 2: publish the step result, fp32 into hidden_state and fp16 into the output row
        __fp16* output_data = top_blob.row<__fp16>(ti);

        float* hidden_ptr = hidden_state;

        nn_num_output = num_output >> 2;
        remain_num_output_start = nn_num_output << 2;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            float32x4_t _rnn_H = vld1q_f32((float*)gates + q);

            vst1q_f32(hidden_ptr + q, _rnn_H);
            vst1_f16(output_data + q, vcvt_f16_f32(_rnn_H));
        }
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
            float H = gates[q];

            hidden_ptr[q] = H;
            output_data[q] = (__fp16)H;
        }
    }

    return 0;
}

// Run one direction of the RNN over all timesteps with fp16 storage.
//
// When opt.use_fp16_arithmetic is set the whole call is delegated to rnn_fp16sa.
// Otherwise, for every timestep and every output q this evaluates
//   H[q] = tanh(bias_c[q] + sum_i weight_xc[q][i] * x[i] + sum_i weight_hc[q][i] * hidden_state[i])
// loading inputs, weights and bias as __fp16 but accumulating in fp32.
//
// bottom_blob   input, read as __fp16, one row of bottom_blob.w values per timestep
// top_blob      output, written as __fp16, one row of top_blob.w (= num_output) values per timestep
// reverse       non-zero walks the timesteps from last to first
// weight_xc     packed __fp16 input weights, laid out as produced by create_pipeline_fp16s
//               (for this path: 4-output rows, then single rows)
// bias_c        bias, read as __fp16
// weight_hc     packed __fp16 recurrent weights, same row layout as weight_xc
// hidden_state  fp32 recurrent state, read during a step and overwritten at the end of it
//
// Returns 0 on success, -100 when the scratch buffer cannot be allocated.
static int rnn_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
{
    if (opt.use_fp16_arithmetic)
        return rnn_fp16sa(bottom_blob, top_blob, reverse, weight_xc, bias_c, weight_hc, hidden_state, opt);

    int size = bottom_blob.w;
    int T = bottom_blob.h;

    int num_output = top_blob.w;

    // fp32 scratch holding the activated outputs of the current step, so that
    // hidden_state stays intact while other outputs are still reading it
    Mat gates(num_output, 4u, opt.workspace_allocator);
    if (gates.empty())
        return -100;

    // unroll over time
    for (int t = 0; t < T; t++)
    {
        int ti = reverse ? T - 1 - t : t;

        const __fp16* x = bottom_blob.row<const __fp16>(ti);

        // phase 1a: outputs in groups of 4, one float32x4 lane per output
        int nn_num_output = num_output >> 2;
        int remain_num_output_start = nn_num_output << 2;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            // packed row q/4 stores the 4 outputs interleaved per input element
            const __fp16* weight_xc_ptr = weight_xc.row<const __fp16>(q / 4);
            const __fp16* weight_hc_ptr = weight_hc.row<const __fp16>(q / 4);

            // four independent fp32 accumulators, the first one seeded with the bias
            float32x4_t _rnn_H = vcvt_f32_f16(vld1_f16((const __fp16*)bias_c + q));
            float32x4_t _sum1 = vdupq_n_f32(0.f);
            float32x4_t _sum2 = vdupq_n_f32(0.f);
            float32x4_t _sum3 = vdupq_n_f32(0.f);

            // input contribution, 4 input elements per iteration
            int i = 0;
            for (; i + 3 < size; i += 4)
            {
                float32x4_t _x = vcvt_f32_f16(vld1_f16(x + i));
                float32x4_t _weight_xc = vcvt_f32_f16(vld1_f16(weight_xc_ptr));
                float32x4_t _weight_xc_1 = vcvt_f32_f16(vld1_f16(weight_xc_ptr + 4));
                float32x4_t _weight_xc_2 = vcvt_f32_f16(vld1_f16(weight_xc_ptr + 8));
                float32x4_t _weight_xc_3 = vcvt_f32_f16(vld1_f16(weight_xc_ptr + 12));
                _rnn_H = vfmaq_laneq_f32(_rnn_H, _weight_xc, _x, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_1, _x, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_2, _x, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_3, _x, 3);

                weight_xc_ptr += 16;
            }
            // leftover input elements, one at a time
            for (; i < size; i++)
            {
                float32x4_t _x = vcvt_f32_f16(vdup_n_f16(x[i]));
                float32x4_t _weight_xc = vcvt_f32_f16(vld1_f16(weight_xc_ptr));
                _rnn_H = vfmaq_f32(_rnn_H, _weight_xc, _x);

                weight_xc_ptr += 4;
            }

            // recurrent contribution, hidden state is already fp32
            i = 0;
            for (; i + 3 < num_output; i += 4)
            {
                float32x4_t _hidden_state = vld1q_f32((const float*)hidden_state + i);
                float32x4_t _weight_hc = vcvt_f32_f16(vld1_f16(weight_hc_ptr));
                float32x4_t _weight_hc_1 = vcvt_f32_f16(vld1_f16(weight_hc_ptr + 4));
                float32x4_t _weight_hc_2 = vcvt_f32_f16(vld1_f16(weight_hc_ptr + 8));
                float32x4_t _weight_hc_3 = vcvt_f32_f16(vld1_f16(weight_hc_ptr + 12));
                _rnn_H = vfmaq_laneq_f32(_rnn_H, _weight_hc, _hidden_state, 0);
                _sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_1, _hidden_state, 1);
                _sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_2, _hidden_state, 2);
                _sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_3, _hidden_state, 3);

                weight_hc_ptr += 16;
            }
            for (; i < num_output; i++)
            {
                float32x4_t _hidden_state = vdupq_n_f32(hidden_state[i]);
                float32x4_t _weight_hc = vcvt_f32_f16(vld1_f16(weight_hc_ptr));
                _rnn_H = vfmaq_f32(_rnn_H, _weight_hc, _hidden_state);

                weight_hc_ptr += 4;
            }

            // fold the four accumulators together
            _rnn_H = vaddq_f32(_rnn_H, _sum1);
            _sum2 = vaddq_f32(_sum2, _sum3);
            _rnn_H = vaddq_f32(_rnn_H, _sum2);

            _rnn_H = tanh_ps(_rnn_H);

            vst1q_f32((float*)gates + q, _rnn_H);
        }
        // phase 1b: leftover outputs one by one, each has its own unpacked weight row
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
            const __fp16* weight_xc_ptr = weight_xc.row<const __fp16>(q / 4 + q % 4);
            const __fp16* weight_hc_ptr = weight_hc.row<const __fp16>(q / 4 + q % 4);

            float H = (float)(((const __fp16*)bias_c)[q]);

            for (int i = 0; i < size; i++)
            {
                H += (float)weight_xc_ptr[i] * (float)x[i];
            }

            for (int i = 0; i < num_output; i++)
            {
                H += (float)weight_hc_ptr[i] * hidden_state[i];
            }

            H = tanhf(H);

            gates[q] = H;
        }

        // phase 2: publish the step result, fp32 into hidden_state and fp16 into the output row
        __fp16* output_data = top_blob.row<__fp16>(ti);

        float* hidden_ptr = hidden_state;

        nn_num_output = num_output >> 2;
        remain_num_output_start = nn_num_output << 2;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            float32x4_t _rnn_H = vld1q_f32((float*)gates + q);

            vst1q_f32(hidden_ptr + q, _rnn_H);
            vst1_f16(output_data + q, vcvt_f16_f32(_rnn_H));
        }
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
            float H = gates[q];

            hidden_ptr[q] = H;
            output_data[q] = (__fp16)H;
        }
    }

    return 0;
}

int RNN_arm::create_pipeline_fp16s(const Option& opt)
{
    int num_directions = direction == 2 ? 2 : 1;
    int size = weight_data_size / num_directions / num_output;

    if (opt.use_fp16_arithmetic)
    {
        weight_xc_data_packed.create(size * 8, num_output / 8 + (num_output % 8) / 4 + num_output % 4, num_directions, 2u, 1);
        weight_hc_data_packed.create(num_output * 8, num_output / 8 + (num_output % 8) / 4 + num_output % 4, num_directions, 2u, 1);
    }
    else
    {
        weight_xc_data_packed.create(size * 4, num_output / 4 + num_output % 4, num_directions, 2u, 1);
        weight_hc_data_packed.create(num_output * 4, num_output / 4 + num_output % 4, num_directions, 2u, 1);
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int dr = 0; dr < num_directions; dr++)
    {
        const Mat weight_xc = weight_xc_data.channel(dr);
        const Mat weight_hc = weight_hc_data.channel(dr);

        Mat weight_xc_data_packed_dr = weight_xc_data_packed.channel(dr);
        Mat weight_hc_data_packed_dr = weight_hc_data_packed.channel(dr);

        int q = 0;
        if (opt.use_fp16_arithmetic)
        {
            for (; q + 7 < num_output; q += 8)
            {
                const float* weight_xc_0 = weight_xc.row(q);
                const float* weight_xc_1 = weight_xc.row(q + 1);
                const float* weight_xc_2 = weight_xc.row(q + 2);
                const float* weight_xc_3 = weight_xc.row(q + 3);
                const float* weight_xc_4 = weight_xc.row(q + 4);
                const float* weight_xc_5 = weight_xc.row(q + 5);
                const float* weight_xc_6 = weight_xc.row(q + 6);
                const float* weight_xc_7 = weight_xc.row(q + 7);

                const float* weight_hc_0 = weight_hc.row(q);
                const float* weight_hc_1 = weight_hc.row(q + 1);
                const float* weight_hc_2 = weight_hc.row(q + 2);
                const float* weight_hc_3 = weight_hc.row(q + 3);
                const float* weight_hc_4 = weight_hc.row(q + 4);
                const float* weight_hc_5 = weight_hc.row(q + 5);
                const float* weight_hc_6 = weight_hc.row(q + 6);
                const float* weight_hc_7 = weight_hc.row(q + 7);

                __fp16* weight_xc = weight_xc_data_packed_dr.row<__fp16>(q / 8);
                __fp16* weight_hc = weight_hc_data_packed_dr.row<__fp16>(q / 8);

                for (int i = 0; i < size; i++)
                {
                    weight_xc[0] = (__fp16)weight_xc_0[i];
                    weight_xc[1] = (__fp16)weight_xc_1[i];
                    weight_xc[2] = (__fp16)weight_xc_2[i];
                    weight_xc[3] = (__fp16)weight_xc_3[i];
                    weight_xc[4] = (__fp16)weight_xc_4[i];
                    weight_xc[5] = (__fp16)weight_xc_5[i];
                    weight_xc[6] = (__fp16)weight_xc_6[i];
                    weight_xc[7] = (__fp16)weight_xc_7[i];

                    weight_xc += 8;
                }

                for (int i = 0; i < num_output; i++)
                {
                    weight_hc[0] = (__fp16)weight_hc_0[i];
                    weight_hc[1] = (__fp16)weight_hc_1[i];
                    weight_hc[2] = (__fp16)weight_hc_2[i];
                    weight_hc[3] = (__fp16)weight_hc_3[i];
                    weight_hc[4] = (__fp16)weight_hc_4[i];
                    weight_hc[5] = (__fp16)weight_hc_5[i];
                    weight_hc[6] = (__fp16)weight_hc_6[i];
                    weight_hc[7] = (__fp16)weight_hc_7[i];

                    weight_hc += 8;
                }
            }
        }
        for (; q + 3 < num_output; q += 4)
        {
            const float* weight_xc_0 = weight_xc.row(q);
            const float* weight_xc_1 = weight_xc.row(q + 1);
            const float* weight_xc_2 = weight_xc.row(q + 2);
            const float* weight_xc_3 = weight_xc.row(q + 3);

            const float* weight_hc_0 = weight_hc.row(q);
            const float* weight_hc_1 = weight_hc.row(q + 1);
            const float* weight_hc_2 = weight_hc.row(q + 2);
            const float* weight_hc_3 = weight_hc.row(q + 3);

            __fp16* weight_xc = opt.use_fp16_arithmetic ? weight_xc_data_packed_dr.row<__fp16>(q / 8 + (q % 8) / 4) : weight_xc_data_packed_dr.row<__fp16>(q / 4);
            __fp16* weight_hc = opt.use_fp16_arithmetic ? weight_hc_data_packed_dr.row<__fp16>(q / 8 + (q % 8) / 4) : weight_hc_data_packed_dr.row<__fp16>(q / 4);

            for (int i = 0; i < size; i++)
            {
                weight_xc[0] = (__fp16)weight_xc_0[i];
                weight_xc[1] = (__fp16)weight_xc_1[i];
                weight_xc[2] = (__fp16)weight_xc_2[i];
                weight_xc[3] = (__fp16)weight_xc_3[i];

                weight_xc += 4;
            }

            for (int i = 0; i < num_output; i++)
            {
                weight_hc[0] = (__fp16)weight_hc_0[i];
                weight_hc[1] = (__fp16)weight_hc_1[i];
                weight_hc[2] = (__fp16)weight_hc_2[i];
                weight_hc[3] = (__fp16)weight_hc_3[i];

                weight_hc += 4;
            }
        }
        for (; q < num_output; q++)
        {
            const float* weight_xc_0 = weight_xc.row(q);
            const float* weight_hc_0 = weight_hc.row(q);

            __fp16* weight_xc = opt.use_fp16_arithmetic ? weight_xc_data_packed_dr.row<__fp16>(q / 8 + (q % 8) / 4 + q % 4) : weight_xc_data_packed_dr.row<__fp16>(q / 4 + q % 4);
            __fp16* weight_hc = opt.use_fp16_arithmetic ? weight_hc_data_packed_dr.row<__fp16>(q / 8 + (q % 8) / 4 + q % 4) : weight_hc_data_packed_dr.row<__fp16>(q / 4 + q % 4);

            for (int i = 0; i < size; i++)
            {
                weight_xc[i] = (__fp16)weight_xc_0[i];
            }

            for (int i = 0; i < num_output; i++)
            {
                weight_hc[i] = (__fp16)weight_hc_0[i];
            }
        }
    }

    cast_float32_to_float16(bias_c_data, bias_c_data_packed, opt);

    if (opt.lightmode)
    {
        weight_xc_data.release();
        bias_c_data.release();
        weight_hc_data.release();
    }

    return 0;
}

int RNN_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int T = bottom_blob.h;

    int num_directions = direction == 2 ? 2 : 1;

    // initial hidden state
    Mat hidden(num_output, 4u, opt.workspace_allocator);
    if (hidden.empty())
        return -100;
    hidden.fill(0.f);

    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = rnn_fp16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        {
            int ret = rnn_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
            if (ret != 0)
                return ret;
        }

        hidden.fill(0.f);

        {
            int ret = rnn_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
            __fp16* ptr = top_blob.row<__fp16>(i);

            memcpy(ptr, pf, num_output * sizeof(__fp16));
            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
        }
    }

    return 0;
}

int RNN_arm::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    int T = bottom_blob.h;
    int num_directions = direction == 2 ? 2 : 1;

    Mat hidden;
    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
    if (bottom_blobs.size() == 2)
    {
        Option opt_cast = opt;
        opt_cast.blob_allocator = hidden_allocator;
        cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
    }
    else
    {
        hidden.create(num_output, num_directions, 4u, hidden_allocator);
        if (hidden.empty())
            return -100;
        hidden.fill(0.f);
    }

    Mat& top_blob = top_blobs[0];
    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
        int ret = rnn_fp16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, opt);
        if (ret != 0)
            return ret;
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        Mat hidden0 = hidden.row_range(0, 1);
        {
            int ret = rnn_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, opt);
            if (ret != 0)
                return ret;
        }

        Mat hidden1 = hidden.row_range(1, 1);
        {
            int ret = rnn_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
            __fp16* ptr = top_blob.row<__fp16>(i);

            memcpy(ptr, pf, num_output * sizeof(__fp16));
            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
        }
    }

    if (top_blobs.size() == 2)
    {
        cast_float32_to_float16(hidden, top_blobs[1], opt);
    }

    return 0;
}
#endif

} // namespace ncnn


================================================
FILE: src/layer/arm/rnn_arm_vfpv4.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cpu.h"
#include "mat.h"
#include "layer.h"
#include "arm_activation.h"
#include "arm_usability.h"

namespace ncnn {

#include "rnn_int8.h"

// Externally visible entry point for the int8 RNN gate output step.
// It forwards every argument unchanged to rnn_int8_gate_output() from
// rnn_int8.h, which this translation unit includes inside namespace ncnn.
// rnn_int8.h declares this symbol only for runtime-cpu-dispatch NEON builds
// where NCNN_VFPV4 is on and the including translation unit lacks
// half-precision support (!(__ARM_FP & 2)).
// NOTE(review): presumably this file is compiled with vfpv4 enabled so the
// forwarded call can use fp16 conversion instructions - confirm in the build setup.
void rnn_int8_gate_output_vfpv4(const Mat& gates, Mat& hidden_state, Mat& top_blob, int ti, int elemtype, const Option& opt)
{
    rnn_int8_gate_output(gates, hidden_state, top_blob, ti, elemtype, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/arm/rnn_int8.h
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD
void rnn_transform_weight_int8_asimddp(const Mat& weight_xc, const Mat& weight_xc_int8_scales, const Mat& weight_hc, const Mat& weight_hc_int8_scales, const Mat& bias_c, Mat& weight_data_tm, Mat& weight_data_tm_int8_descales, Mat& bias_c_tm, int size, int num_output, int num_directions, const Option& opt);
void rnn_int8_asimddp(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_descales, Mat& top_blob, int elemtype, int reverse, const Mat& weight_data_tm, const Mat& weight_data_tm_int8_descales, const Mat& bias_c, Mat& hidden_state, const Option& opt);
#endif

#if NCNN_RUNTIME_CPU && NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2)
void rnn_int8_gate_output_vfpv4(const Mat& gates, Mat& hidden_state, Mat& top_blob, int ti, int elemtype, const Option& opt);
#endif

static void rnn_transform_weight_int8(const Mat& weight_xc, const Mat& weight_xc_int8_scales, const Mat& weight_hc, const Mat& weight_hc_int8_scales, const Mat& bias_c, Mat& weight_data_tm, Mat& weight_data_tm_int8_descales, Mat& bias_c_tm, int size, int num_output, int num_directions, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD
    if (ncnn::cpu_support_arm_asimddp())
    {
        rnn_transform_weight_int8_asimddp(weight_xc, weight_xc_int8_scales, weight_hc, weight_hc_int8_scales, bias_c, weight_data_tm, weight_data_tm_int8_descales, bias_c_tm, size, num_output, num_directions, opt);
        return;
    }
#endif

#if __ARM_NEON
    weight_data_tm.create(size * 4 + num_output * 4, num_output / 4 + num_output % 4, num_directions, 1u, 1);
    weight_data_tm_int8_descales.create(4 + 4, num_output / 4 + num_output % 4, num_directions);
#else
    weight_data_tm.create(size + num_output, num_output, num_directions, 1u, 1);
    weight_data_tm_int8_descales.create(1 + 1, num_output, num_directions);
#endif
    bias_c_tm = bias_c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int dr = 0; dr < num_directions; dr++)
    {
        const Mat weight_xc_dr = weight_xc.channel(dr);
        const Mat weight_hc_dr = weight_hc.channel(dr);
        const float* weight_xc_int8_scales_ptr = weight_xc_int8_scales.row(dr);
        const float* weight_hc_int8_scales_ptr = weight_hc_int8_scales.row(dr);

        Mat weight_data_tm_dr = weight_data_tm.channel(dr);
        Mat weight_data_tm_int8_descales_dr = weight_data_tm_int8_descales.channel(dr);

        int q = 0;
#if __ARM_NEON
        for (; q + 3 < num_output; q += 4)
        {
            const signed char* weight_xc_0 = weight_xc_dr.row<const signed char>(q);
            const signed char* weight_xc_1 = weight_xc_dr.row<const signed char>(q + 1);
            const signed char* weight_xc_2 = weight_xc_dr.row<const signed char>(q + 2);
            const signed char* weight_xc_3 = weight_xc_dr.row<const signed char>(q + 3);

            const signed char* weight_hc_0 = weight_hc_dr.row<const signed char>(q);
            const signed char* weight_hc_1 = weight_hc_dr.row<const signed char>(q + 1);
            const signed char* weight_hc_2 = weight_hc_dr.row<const signed char>(q + 2);
            const signed char* weight_hc_3 = weight_hc_dr.row<const signed char>(q + 3);

            signed char* kptr = weight_data_tm_dr.row<signed char>(q / 4);
            float* descales_ptr = weight_data_tm_int8_descales_dr.row(q / 4);

            int i = 0;
#if __ARM_FEATURE_DOTPROD
            for (; i + 3 < size; i += 4)
            {
                kptr[0] = weight_xc_0[i];
                kptr[1] = weight_xc_0[i + 1];
                kptr[2] = weight_xc_0[i + 2];
                kptr[3] = weight_xc_0[i + 3];
                kptr[4] = weight_xc_1[i];
                kptr[5] = weight_xc_1[i + 1];
                kptr[6] = weight_xc_1[i + 2];
                kptr[7] = weight_xc_1[i + 3];
                kptr[8 + 0] = weight_xc_2[i];
                kptr[8 + 1] = weight_xc_2[i + 1];
                kptr[8 + 2] = weight_xc_2[i + 2];
                kptr[8 + 3] = weight_xc_2[i + 3];
                kptr[8 + 4] = weight_xc_3[i];
                kptr[8 + 5] = weight_xc_3[i + 1];
                kptr[8 + 6] = weight_xc_3[i + 2];
                kptr[8 + 7] = weight_xc_3[i + 3];

                kptr += 16;
            }
#else
            for (; i + 7 < size; i += 8)
            {
                vst1_s8(kptr, vld1_s8(weight_xc_0 + i));
                vst1_s8(kptr + 8, vld1_s8(weight_xc_1 + i));
                vst1_s8(kptr + 16, vld1_s8(weight_xc_2 + i));
                vst1_s8(kptr + 24, vld1_s8(weight_xc_3 + i));
                kptr += 32;
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; i + 1 < size; i += 2)
            {
                kptr[0] = weight_xc_0[i];
                kptr[1] = weight_xc_0[i + 1];
                kptr[2] = weight_xc_1[i];
                kptr[3] = weight_xc_1[i + 1];
                kptr[4] = weight_xc_2[i];
                kptr[5] = weight_xc_2[i + 1];
                kptr[6] = weight_xc_3[i];
                kptr[7] = weight_xc_3[i + 1];

                kptr += 8;
            }
            for (; i < size; i++)
            {
                kptr[0] = weight_xc_0[i];
                kptr[1] = weight_xc_1[i];
                kptr[2] = weight_xc_2[i];
                kptr[3] = weight_xc_3[i];

                kptr += 4;
            }

            i = 0;
#if __ARM_FEATURE_DOTPROD
            for (; i + 3 < num_output; i += 4)
            {
                kptr[0] = weight_hc_0[i];
                kptr[1] = weight_hc_0[i + 1];
                kptr[2] = weight_hc_0[i + 2];
                kptr[3] = weight_hc_0[i + 3];
                kptr[4] = weight_hc_1[i];
                kptr[5] = weight_hc_1[i + 1];
                kptr[6] = weight_hc_1[i + 2];
                kptr[7] = weight_hc_1[i + 3];
                kptr[8 + 0] = weight_hc_2[i];
                kptr[8 + 1] = weight_hc_2[i + 1];
                kptr[8 + 2] = weight_hc_2[i + 2];
                kptr[8 + 3] = weight_hc_2[i + 3];
                kptr[8 + 4] = weight_hc_3[i];
                kptr[8 + 5] = weight_hc_3[i + 1];
                kptr[8 + 6] = weight_hc_3[i + 2];
                kptr[8 + 7] = weight_hc_3[i + 3];

                kptr += 16;
            }
#else
            for (; i + 7 < num_output; i += 8)
            {
                vst1_s8(kptr, vld1_s8(weight_hc_0 + i));
                vst1_s8(kptr + 8, vld1_s8(weight_hc_1 + i));
                vst1_s8(kptr + 16, vld1_s8(weight_hc_2 + i));
                vst1_s8(kptr + 24, vld1_s8(weight_hc_3 + i));
                kptr += 32;
            }
#endif // __ARM_FEATURE_DOTPROD
            for (; i + 1 < num_output; i += 2)
            {
                kptr[0] = weight_hc_0[i];
                kptr[1] = weight_hc_0[i + 1];
                kptr[2] = weight_hc_1[i];
                kptr[3] = weight_hc_1[i + 1];
                kptr[4] = weight_hc_2[i];
                kptr[5] = weight_hc_2[i + 1];
                kptr[6] = weight_hc_3[i];
                kptr[7] = weight_hc_3[i + 1];

                kptr += 8;
            }
            for (; i < num_output; i++)
            {
                kptr[0] = weight_hc_0[i];
                kptr[1] = weight_hc_1[i];
                kptr[2] = weight_hc_2[i];
                kptr[3] = weight_hc_3[i];

                kptr += 4;
            }

            float32x4_t _xc = vld1q_f32(weight_xc_int8_scales_ptr + q);
            float32x4_t _hc = vld1q_f32(weight_hc_int8_scales_ptr + q);

#if __aarch64__
            float32x4_t _one = vdupq_n_f32(1.f);
            float32x4_t _reciprocal_xc = vdivq_f32(_one, _xc);
            float32x4_t _reciprocal_hc = vdivq_f32(_one, _hc);
#else
            float32x4_t _reciprocal_xc = vrecpeq_f32(_xc);
            _reciprocal_xc = vmulq_f32(vrecpsq_f32(_xc, _reciprocal_xc), _reciprocal_xc);
            _reciprocal_xc = vmulq_f32(vrecpsq_f32(_xc, _reciprocal_xc), _reciprocal_xc);
            float32x4_t _reciprocal_hc = vrecpeq_f32(_hc);
            _reciprocal_hc = vmulq_f32(vrecpsq_f32(_hc, _reciprocal_hc), _reciprocal_hc);
            _reciprocal_hc = vmulq_f32(vrecpsq_f32(_hc, _reciprocal_hc), _reciprocal_hc);
#endif

            vst1q_f32(descales_ptr, _reciprocal_xc);
            vst1q_f32(descales_ptr + 4, _reciprocal_hc);
        }
#endif // __ARM_NEON
        for (; q < num_output; q++)
        {
            const signed char* weight_xc_0 = weight_xc_dr.row<const signed char>(q);
            const signed char* weight_hc_0 = weight_hc_dr.row<const signed char>(q);

#if __ARM_NEON
            signed char* kptr = weight_data_tm_dr.row<signed char>(q / 4 + q % 4);
            float* descales_ptr = weight_data_tm_int8_descales_dr.row(q / 4 + q % 4);
#else
            signed char* kptr = weight_data_tm_dr.row<signed char>(q);
            float* descales_ptr = weight_data_tm_int8_descales_dr.row(q);
#endif // __ARM_NEON

            for (int i = 0; i < size; i++)
            {
                kptr[0] = weight_xc_0[i];
                kptr += 1;
            }

            for (int i = 0; i < num_output; i++)
            {
                kptr[0] = weight_hc_0[i];
                kptr += 1;
            }

            descales_ptr[0] = 1.f / weight_xc_int8_scales_ptr[q];
            descales_ptr[1] = 1.f / weight_hc_int8_scales_ptr[q];
        }
    }
}

// Publishes the activations of one time step.
//
// Every value in gates is copied unchanged (as float) into hidden_state and is
// also written to row ti of top_blob, converted to the storage type selected by
// elemtype.
//
//   gates        - read as an array of top_blob.w floats
//   hidden_state - written as an array of top_blob.w floats
//   top_blob     - row ti is written; its width defines the number of outputs
//   ti           - row index inside top_blob
//   elemtype     - 1 = fp32, 2 = fp16, 4 = bf16; any other value leaves
//                  top_blob untouched (hidden_state is still updated)
//   opt          - supplies num_threads for the OpenMP loops
static void rnn_int8_gate_output(const Mat& gates, Mat& hidden_state, Mat& top_blob, int ti, int elemtype, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2)
    // This translation unit was built without hardware fp16 conversion;
    // hand over to the vfpv4 build when the running CPU supports it.
    if (ncnn::cpu_support_arm_vfpv4())
    {
        rnn_int8_gate_output_vfpv4(gates, hidden_state, top_blob, ti, elemtype, opt);
        return;
    }
#endif

    const int num_output = top_blob.w;

    float* output_data = top_blob.row(ti);

    float* hidden_ptr = hidden_state;

    int remain_num_output_start = 0;
#if __ARM_NEON
    // vector part: four outputs per iteration
    int nn_num_output = num_output >> 2;
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int qq = 0; qq < nn_num_output; qq++)
    {
        int q = qq * 4;

        float32x4_t _rnn_H = vld1q_f32((const float*)gates + q);

        vst1q_f32(hidden_ptr + q, _rnn_H);

        if (elemtype == 1)
        {
            // fp32
            vst1q_f32(output_data + q, _rnn_H);
        }
        if (elemtype == 2)
        {
            // fp16
            // outptr already addresses element q of the fp16 output row
            unsigned short* outptr = (unsigned short*)output_data + q;
#if (__ARM_FP & 2)
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
            asm volatile(
                "fcvtn  v0.4h, %2.4s        \n"
                "st1    {v0.4h}, [%0]       \n"
                : "=r"(outptr) // %0
                : "0"(outptr),
                "w"(_rnn_H)
                : "memory", "v0");
#else  // __aarch64__
            asm volatile(
                "vcvt.f16.f32 d0, %q2       \n"
                "vst1.u16   {d0}, [%0]      \n"
                : "=r"(outptr) // %0
                : "0"(outptr),
                "w"(_rnn_H)
                : "memory", "q0");
#endif // __aarch64__
#else  // NCNN_GNU_INLINE_ASM
            vst1_u16(outptr, (uint16x4_t)vcvt_f16_f32(_rnn_H));
#endif // NCNN_GNU_INLINE_ASM
#else
            // Software conversion. outptr is already offset by q, so the four
            // results go to outptr[0..3]; indexing with q again would write to
            // elements 2q..2q+3 of the row.
            outptr[0] = float32_to_float16(hidden_ptr[q]);
            outptr[1] = float32_to_float16(hidden_ptr[q + 1]);
            outptr[2] = float32_to_float16(hidden_ptr[q + 2]);
            outptr[3] = float32_to_float16(hidden_ptr[q + 3]);
#endif // (__ARM_FP & 2)
        }
        if (elemtype == 4)
        {
            // bf16
            vst1_u16((unsigned short*)output_data + q, float2bfloat(_rnn_H));
        }
    }
    remain_num_output_start += nn_num_output << 2;
#endif // __ARM_NEON
    // scalar part: everything the vector loop did not cover
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = remain_num_output_start; q < num_output; q++)
    {
        float H = gates[q];

        hidden_ptr[q] = H;

        if (elemtype == 1)
        {
            output_data[q] = H;
        }
        if (elemtype == 2)
        {
            ((unsigned short*)output_data)[q] = float32_to_float16(H);
        }
        if (elemtype == 4)
        {
            ((unsigned short*)output_data)[q] = float32_to_bfloat16(H);
        }
    }
}

// Int8 forward pass of a plain tanh RNN over all T time steps.
//
// For every step (visited in reverse order when `reverse` is non-zero) the
// function
//   1. quantizes the float hidden_state to int8 with a per-step absmax scale,
//   2. computes, for each output q,
//        H = tanh(bias_c[q] + Hx * descale_x * descale_xc + Hh * descale_h * descale_hc)
//      where Hx / Hh are int32 dot products of the packed weights with the
//      int8 input row and the int8 hidden state,
//   3. stores H into the `gates` scratch buffer and calls
//      rnn_int8_gate_output() to update hidden_state and row ti of top_blob.
//
//   bottom_blob_int8           - int8 input, one row (width = size) per time step
//   bottom_blob_int8_descales  - one float descale per time step
//   top_blob                   - output; its width is the number of outputs
//   elemtype                   - forwarded unchanged to rnn_int8_gate_output()
//   weight_data_tm             - packed int8 weights; each row holds the input
//                                weights followed directly by the hidden weights
//   weight_data_tm_int8_descales - per-row float descales; with NEON a 4-output
//                                row stores 4 input descales then 4 hidden descales
//   bias_c                     - read as one float per output
//   hidden_state               - float state, read at the start of each step and
//                                rewritten by rnn_int8_gate_output()
static void rnn_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_descales, Mat& top_blob, int elemtype, int reverse, const Mat& weight_data_tm, const Mat& weight_data_tm_int8_descales, const Mat& bias_c, Mat& hidden_state, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD
    // built without dot-product instructions: use the asimddp build when the CPU has them
    if (ncnn::cpu_support_arm_asimddp())
    {
        rnn_int8_asimddp(bottom_blob_int8, bottom_blob_int8_descales, top_blob, elemtype, reverse, weight_data_tm, weight_data_tm_int8_descales, bias_c, hidden_state, opt);
        return;
    }
#endif

    int size = bottom_blob_int8.w;
    int T = bottom_blob_int8.h;

    int num_output = top_blob.w;

    // num_output
    // float scratch buffer holding the activations of the current step
    Mat gates(num_output, 4u, opt.workspace_allocator);

    // int8 copy of hidden_state, refreshed at the start of every step
    Mat hidden_state_int8(num_output, (size_t)1u, 1, opt.workspace_allocator);
    float hidden_state_int8_scale = 1.f;
    float hidden_state_int8_descale = 1.f;

    // unroll
    for (int t = 0; t < T; t++)
    {
        int ti = reverse ? T - 1 - t : t;

        // dynamic quantize hidden_state
        {
            float absmax = 0.f;
            for (int i = 0; i < num_output; i++)
            {
                absmax = std::max(absmax, (float)fabs(hidden_state[i]));
            }

            if (absmax == 0.f)
            {
                // all-zero state: the scale/descale values are left as they were,
                // which is harmless because every quantized value is 0
                hidden_state_int8.fill<signed char>(0);
            }
            else
            {
                hidden_state_int8_scale = 127.f / absmax;
                hidden_state_int8_descale = absmax / 127.f;

                signed char* hs = hidden_state_int8;
                for (int i = 0; i < num_output; i++)
                {
                    hs[i] = float2int8(hidden_state[i] * hidden_state_int8_scale);
                }
            }
        }

        int remain_num_output_start = 0;
#if __ARM_NEON
        // vector part: four outputs per iteration, one packed weight row each
        int nn_num_output = num_output >> 2;
        remain_num_output_start = nn_num_output << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int qq = 0; qq < nn_num_output; qq++)
        {
            int q = qq * 4;

            const signed char* x = bottom_blob_int8.row<const signed char>(ti);
            const signed char* hs = hidden_state_int8;
            const float descale_x = bottom_blob_int8_descales[ti];
            const float descale_h = hidden_state_int8_descale;

            // kptr walks the row sequentially: input weights first, hidden weights after
            const signed char* kptr = weight_data_tm.row<const signed char>(q / 4);

            const float* descales_ptr = weight_data_tm_int8_descales.row(q / 4);

            // ---- input contribution: lane k accumulates the dot product for output q+k ----
            int32x4_t _rnn_Hx0 = vdupq_n_s32(0);
            int i = 0;
#if __ARM_FEATURE_DOTPROD
            // each 16-byte weight vector is dotted with one 4-byte group of x,
            // giving one partial sum per lane; extra accumulators are merged afterwards
            int32x4_t _sum1 = vdupq_n_s32(0);
            int32x4_t _sum2 = vdupq_n_s32(0);
            int32x4_t _sum3 = vdupq_n_s32(0);
            for (; i + 15 < size; i += 16)
            {
                int8x16_t _xi = vld1q_s8(x + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                int8x16_t _w2 = vld1q_s8(kptr + 32);
                int8x16_t _w3 = vld1q_s8(kptr + 48);
                _rnn_Hx0 = vdotq_laneq_s32(_rnn_Hx0, _w0, _xi, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _w1, _xi, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _w2, _xi, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _w3, _xi, 3);

                kptr += 64;
            }
            for (; i + 7 < size; i += 8)
            {
                int8x8_t _xi = vld1_s8(x + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                _rnn_Hx0 = vdotq_lane_s32(_rnn_Hx0, _w0, _xi, 0);
                _sum1 = vdotq_lane_s32(_sum1, _w1, _xi, 1);

                kptr += 32;
            }
            _rnn_Hx0 = vaddq_s32(_rnn_Hx0, _sum1);
            _rnn_Hx0 = vaddq_s32(_rnn_Hx0, _sum2);
            _rnn_Hx0 = vaddq_s32(_rnn_Hx0, _sum3);
#else
            // without dot-product: widen-multiply to int16, then pairwise-accumulate to int32.
            // Here each _sumN collects four partial sums of ONE 8-byte weight group
            // (presumably one output per group, matching the packing done at load time - confirm).
            int32x4_t _sum0 = vdupq_n_s32(0);
            int32x4_t _sum1 = vdupq_n_s32(0);
            int32x4_t _sum2 = vdupq_n_s32(0);
            int32x4_t _sum3 = vdupq_n_s32(0);
            for (; i + 15 < size; i += 16)
            {
#if NCNN_GNU_INLINE_ASM && !__aarch64__
                const signed char* xptr = x + i;

                // armv7 assembly version of the intrinsic code in the #else branch;
                // vldm post-increments kptr by the 64 bytes it loads
                asm volatile(
                    "vldm       %1!, {d0-d7}        \n"
                    "vld1.s8    {d16-d17}, [%0]     \n"
                    "vmull.s8   q4, d0, d16         \n"
                    "vmull.s8   q5, d1, d16         \n"
                    "vmull.s8   q6, d2, d16         \n"
                    "vmull.s8   q7, d3, d16         \n"
                    "vmlal.s8   q4, d4, d17         \n"
                    "vmlal.s8   q5, d5, d17         \n"
                    "vmlal.s8   q6, d6, d17         \n"
                    "vmlal.s8   q7, d7, d17         \n"
                    "vpadal.s16 %q2, q4             \n"
                    "vpadal.s16 %q3, q5             \n"
                    "vpadal.s16 %q4, q6             \n"
                    "vpadal.s16 %q5, q7             \n"
                    : "=r"(xptr), "=r"(kptr), "=w"(_sum0), "=w"(_sum1), "=w"(_sum2), "=w"(_sum3)
                    : "0"(xptr), "1"(kptr), "2"(_sum0), "3"(_sum1), "4"(_sum2), "5"(_sum3)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8");
#else
                int8x16_t _xi = vld1q_s8(x + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                int8x16_t _w2 = vld1q_s8(kptr + 32);
                int8x16_t _w3 = vld1q_s8(kptr + 48);

                // first 32 weight bytes pair with x[i..i+7], the next 32 with x[i+8..i+15]
                int16x8_t _s0 = vmull_s8(vget_low_s8(_w0), vget_low_s8(_xi));
                int16x8_t _s1 = vmull_s8(vget_high_s8(_w0), vget_low_s8(_xi));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_w1), vget_low_s8(_xi));
                int16x8_t _s3 = vmull_s8(vget_high_s8(_w1), vget_low_s8(_xi));
                _s0 = vmlal_s8(_s0, vget_low_s8(_w2), vget_high_s8(_xi));
                _s1 = vmlal_s8(_s1, vget_high_s8(_w2), vget_high_s8(_xi));
                _s2 = vmlal_s8(_s2, vget_low_s8(_w3), vget_high_s8(_xi));
                _s3 = vmlal_s8(_s3, vget_high_s8(_w3), vget_high_s8(_xi));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);

                kptr += 64;
#endif
            }
            for (; i + 7 < size; i += 8)
            {
                int8x8_t _xi = vld1_s8(x + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);

                int16x8_t _s0 = vmull_s8(vget_low_s8(_w0), _xi);
                int16x8_t _s1 = vmull_s8(vget_high_s8(_w0), _xi);
                int16x8_t _s2 = vmull_s8(vget_low_s8(_w1), _xi);
                int16x8_t _s3 = vmull_s8(vget_high_s8(_w1), _xi);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);

                kptr += 32;
            }
            {
                // 4x4 transpose: afterwards lane k of every _sumN came from the old _sumk,
                // so adding the four vectors yields one total per lane
                int32x4x2_t _tmp0 = vzipq_s32(_sum0, _sum1);
                int32x4x2_t _tmp1 = vzipq_s32(_sum2, _sum3);
                _sum0 = vcombine_s32(vget_low_s32(_tmp0.val[0]), vget_low_s32(_tmp1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_tmp0.val[0]), vget_high_s32(_tmp1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_tmp0.val[1]), vget_low_s32(_tmp1.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_tmp0.val[1]), vget_high_s32(_tmp1.val[1]));
            }
            _rnn_Hx0 = vaddq_s32(_rnn_Hx0, _sum0);
            _rnn_Hx0 = vaddq_s32(_rnn_Hx0, _sum1);
            _rnn_Hx0 = vaddq_s32(_rnn_Hx0, _sum2);
            _rnn_Hx0 = vaddq_s32(_rnn_Hx0, _sum3);
#endif // __ARM_FEATURE_DOTPROD
            // tail of 4 inputs: 16 weight bytes
            // NOTE(review): vld1_s8 loads 8 bytes of x although only the first 4 are
            // consumed - relies on the bytes after the row end being readable; confirm.
            for (; i + 3 < size; i += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _xi = vld1_s8(x + i);
                int8x16_t _w = vld1q_s8(kptr);
                _rnn_Hx0 = vdotq_lane_s32(_rnn_Hx0, _w, _xi, 0);
#else
                // broadcast the input pairs (x[i],x[i+1]) and (x[i+2],x[i+3]) across the vector
                int16x4_t _xi01 = vreinterpret_s16_s8(vld1_s8(x + i));
                int8x8_t _xi0 = vreinterpret_s8_s16(vdup_lane_s16(_xi01, 0));
                int8x8_t _xi1 = vreinterpret_s8_s16(vdup_lane_s16(_xi01, 1));
                int8x16_t _w01 = vld1q_s8(kptr);

                int16x8_t _rnn_Hx = vmull_s8(vget_low_s8(_w01), _xi0);
                _rnn_Hx = vmlal_s8(_rnn_Hx, vget_high_s8(_w01), _xi1);
                _rnn_Hx0 = vpadalq_s16(_rnn_Hx0, _rnn_Hx);
#endif // __ARM_FEATURE_DOTPROD

                kptr += 16;
            }
            // tail of 2 inputs: 8 weight bytes, two per lane
            for (; i + 1 < size; i += 2)
            {
                int8x8_t _xi = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vld1_s8(x + i)), 0));
                int8x8_t _w = vld1_s8(kptr);

                int16x8_t _rnn_Hx = vmull_s8(_w, _xi);
                _rnn_Hx0 = vpadalq_s16(_rnn_Hx0, _rnn_Hx);

                kptr += 8;
            }
            // last single input: only the low 4 products are used and kptr advances by 4
            for (; i < size; i++)
            {
                int8x8_t _xi = vdup_n_s8(x[i]);
                int8x8_t _w = vld1_s8(kptr);

                int16x8_t _rnn_Hx = vmull_s8(_w, _xi);
                _rnn_Hx0 = vaddw_s16(_rnn_Hx0, vget_low_s16(_rnn_Hx));

                kptr += 4;
            }

            // ---- hidden-state contribution: same scheme, iterating over num_output values of hs ----
            int32x4_t _rnn_Hh0 = vdupq_n_s32(0);
            i = 0;
#if __ARM_FEATURE_DOTPROD
            _sum1 = vdupq_n_s32(0);
            _sum2 = vdupq_n_s32(0);
            _sum3 = vdupq_n_s32(0);
            for (; i + 15 < num_output; i += 16)
            {
                int8x16_t _h_cont = vld1q_s8(hs + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                int8x16_t _w2 = vld1q_s8(kptr + 32);
                int8x16_t _w3 = vld1q_s8(kptr + 48);
                _rnn_Hh0 = vdotq_laneq_s32(_rnn_Hh0, _w0, _h_cont, 0);
                _sum1 = vdotq_laneq_s32(_sum1, _w1, _h_cont, 1);
                _sum2 = vdotq_laneq_s32(_sum2, _w2, _h_cont, 2);
                _sum3 = vdotq_laneq_s32(_sum3, _w3, _h_cont, 3);

                kptr += 64;
            }
            for (; i + 7 < num_output; i += 8)
            {
                int8x8_t _h_cont = vld1_s8(hs + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                _rnn_Hh0 = vdotq_lane_s32(_rnn_Hh0, _w0, _h_cont, 0);
                _sum1 = vdotq_lane_s32(_sum1, _w1, _h_cont, 1);

                kptr += 32;
            }
            _rnn_Hh0 = vaddq_s32(_rnn_Hh0, _sum1);
            _rnn_Hh0 = vaddq_s32(_rnn_Hh0, _sum2);
            _rnn_Hh0 = vaddq_s32(_rnn_Hh0, _sum3);
#else
            _sum0 = vdupq_n_s32(0);
            _sum1 = vdupq_n_s32(0);
            _sum2 = vdupq_n_s32(0);
            _sum3 = vdupq_n_s32(0);
            for (; i + 15 < num_output; i += 16)
            {
#if NCNN_GNU_INLINE_ASM && !__aarch64__
                const signed char* hsptr = hs + i;

                // same armv7 kernel as for the input part, reading hs instead of x
                asm volatile(
                    "vldm       %1!, {d0-d7}        \n"
                    "vld1.s8    {d16-d17}, [%0]     \n"
                    "vmull.s8   q4, d0, d16         \n"
                    "vmull.s8   q5, d1, d16         \n"
                    "vmull.s8   q6, d2, d16         \n"
                    "vmull.s8   q7, d3, d16         \n"
                    "vmlal.s8   q4, d4, d17         \n"
                    "vmlal.s8   q5, d5, d17         \n"
                    "vmlal.s8   q6, d6, d17         \n"
                    "vmlal.s8   q7, d7, d17         \n"
                    "vpadal.s16 %q2, q4             \n"
                    "vpadal.s16 %q3, q5             \n"
                    "vpadal.s16 %q4, q6             \n"
                    "vpadal.s16 %q5, q7             \n"
                    : "=r"(hsptr), "=r"(kptr), "=w"(_sum0), "=w"(_sum1), "=w"(_sum2), "=w"(_sum3)
                    : "0"(hsptr), "1"(kptr), "2"(_sum0), "3"(_sum1), "4"(_sum2), "5"(_sum3)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8");
#else
                int8x16_t _h_cont = vld1q_s8(hs + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);
                int8x16_t _w2 = vld1q_s8(kptr + 32);
                int8x16_t _w3 = vld1q_s8(kptr + 48);

                int16x8_t _s0 = vmull_s8(vget_low_s8(_w0), vget_low_s8(_h_cont));
                int16x8_t _s1 = vmull_s8(vget_high_s8(_w0), vget_low_s8(_h_cont));
                int16x8_t _s2 = vmull_s8(vget_low_s8(_w1), vget_low_s8(_h_cont));
                int16x8_t _s3 = vmull_s8(vget_high_s8(_w1), vget_low_s8(_h_cont));
                _s0 = vmlal_s8(_s0, vget_low_s8(_w2), vget_high_s8(_h_cont));
                _s1 = vmlal_s8(_s1, vget_high_s8(_w2), vget_high_s8(_h_cont));
                _s2 = vmlal_s8(_s2, vget_low_s8(_w3), vget_high_s8(_h_cont));
                _s3 = vmlal_s8(_s3, vget_high_s8(_w3), vget_high_s8(_h_cont));
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);

                kptr += 64;
#endif
            }
            for (; i + 7 < num_output; i += 8)
            {
                int8x8_t _h_cont = vld1_s8(hs + i);
                int8x16_t _w0 = vld1q_s8(kptr);
                int8x16_t _w1 = vld1q_s8(kptr + 16);

                int16x8_t _s0 = vmull_s8(vget_low_s8(_w0), _h_cont);
                int16x8_t _s1 = vmull_s8(vget_high_s8(_w0), _h_cont);
                int16x8_t _s2 = vmull_s8(vget_low_s8(_w1), _h_cont);
                int16x8_t _s3 = vmull_s8(vget_high_s8(_w1), _h_cont);
                _sum0 = vpadalq_s16(_sum0, _s0);
                _sum1 = vpadalq_s16(_sum1, _s1);
                _sum2 = vpadalq_s16(_sum2, _s2);
                _sum3 = vpadalq_s16(_sum3, _s3);

                kptr += 32;
            }
            {
                // 4x4 transpose of the partial sums before the final reduction
                int32x4x2_t _tmp0 = vzipq_s32(_sum0, _sum1);
                int32x4x2_t _tmp1 = vzipq_s32(_sum2, _sum3);
                _sum0 = vcombine_s32(vget_low_s32(_tmp0.val[0]), vget_low_s32(_tmp1.val[0]));
                _sum1 = vcombine_s32(vget_high_s32(_tmp0.val[0]), vget_high_s32(_tmp1.val[0]));
                _sum2 = vcombine_s32(vget_low_s32(_tmp0.val[1]), vget_low_s32(_tmp1.val[1]));
                _sum3 = vcombine_s32(vget_high_s32(_tmp0.val[1]), vget_high_s32(_tmp1.val[1]));
            }
            _rnn_Hh0 = vaddq_s32(_rnn_Hh0, _sum0);
            _rnn_Hh0 = vaddq_s32(_rnn_Hh0, _sum1);
            _rnn_Hh0 = vaddq_s32(_rnn_Hh0, _sum2);
            _rnn_Hh0 = vaddq_s32(_rnn_Hh0, _sum3);
#endif // __ARM_FEATURE_DOTPROD
            for (; i + 3 < num_output; i += 4)
            {
#if __ARM_FEATURE_DOTPROD
                int8x8_t _h_cont = vld1_s8(hs + i);
                int8x16_t _w = vld1q_s8(kptr);
                _rnn_Hh0 = vdotq_lane_s32(_rnn_Hh0, _w, _h_cont, 0);
#else
                int16x4_t _h_cont01 = vreinterpret_s16_s8(vld1_s8(hs + i));
                int8x8_t _h_cont0 = vreinterpret_s8_s16(vdup_lane_s16(_h_cont01, 0));
                int8x8_t _h_cont1 = vreinterpret_s8_s16(vdup_lane_s16(_h_cont01, 1));
                int8x16_t _w01 = vld1q_s8(kptr);

                int16x8_t _rnn_Hh = vmull_s8(vget_low_s8(_w01), _h_cont0);
                _rnn_Hh = vmlal_s8(_rnn_Hh, vget_high_s8(_w01), _h_cont1);
                _rnn_Hh0 = vpadalq_s16(_rnn_Hh0, _rnn_Hh);
#endif // __ARM_FEATURE_DOTPROD

                kptr += 16;
            }
            for (; i + 1 < num_output; i += 2)
            {
                int8x8_t _h_cont = vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(vld1_s8(hs + i)), 0));
                int8x8_t _w = vld1_s8(kptr);

                int16x8_t _rnn_Hh = vmull_s8(_w, _h_cont);
                _rnn_Hh0 = vpadalq_s16(_rnn_Hh0, _rnn_Hh);

                kptr += 8;
            }
            for (; i < num_output; i++)
            {
                int8x8_t _h_cont = vdup_n_s8(hs[i]);
                int8x8_t _w = vld1_s8(kptr);

                int16x8_t _rnn_Hh = vmull_s8(_w, _h_cont);
                _rnn_Hh0 = vaddw_s16(_rnn_Hh0, vget_low_s16(_rnn_Hh));

                kptr += 4;
            }

            // ---- dequantize, add bias, apply tanh ----
            float32x4_t _descale_x = vdupq_n_f32(descale_x);
            float32x4_t _descale_h = vdupq_n_f32(descale_h);

            float32x4_t _rnn_H = vld1q_f32((const float*)bias_c + q);

            // first 4 floats of the descale row belong to the input weights
            float32x4_t _descale_xc = vld1q_f32(descales_ptr);

            _rnn_H = vmlaq_f32(_rnn_H, vcvtq_f32_s32(_rnn_Hx0), vmulq_f32(_descale_x, _descale_xc));

            // next 4 floats belong to the hidden weights
            float32x4_t _descale_hc = vld1q_f32(descales_ptr + 4);

            _rnn_H = vmlaq_f32(_rnn_H, vcvtq_f32_s32(_rnn_Hh0), vmulq_f32(_descale_h, _descale_hc));

            _rnn_H = tanh_ps(_rnn_H);

            vst1q_f32((float*)gates + q, _rnn_H);
        }
#endif // __ARM_NEON
        // scalar part: outputs not covered by the 4-wide loop, one weight row per output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = remain_num_output_start; q < num_output; q++)
        {
            const signed char* x = bottom_blob_int8.row<const signed char>(ti);
            const signed char* hs = hidden_state_int8;
            const float descale_x = bottom_blob_int8_descales[ti];
            const float descale_h = hidden_state_int8_descale;

#if __ARM_NEON
            // leftover rows are stored after the q / 4 packed rows
            const signed char* kptr = weight_data_tm.row<const signed char>(q / 4 + q % 4);
            const float* descales_ptr = weight_data_tm_int8_descales.row(q / 4 + q % 4);
#else
            const signed char* kptr = weight_data_tm.row<const signed char>(q);
            const float* descales_ptr = weight_data_tm_int8_descales.row(q);
#endif // __ARM_NEON

            const float descale_xc = descales_ptr[0];
            const float descale_hc = descales_ptr[1];

            // int32 dot product of the input weights with the int8 input row
            int Hx = 0;
            for (int i = 0; i < size; i++)
            {
                Hx += kptr[0] * x[i];
                kptr += 1;
            }

            // int32 dot product of the hidden weights with the int8 hidden state
            int Hh = 0;
            for (int i = 0; i < num_output; i++)
            {
                Hh += kptr[0] * hs[i];
                kptr += 1;
            }

            float H = bias_c[q] + Hx * (descale_x * descale_xc) + Hh * (descale_h * descale_hc);

            H = tanhf(H);

            gates[q] = H;
        }

        // write the step result to hidden_state and to row ti of top_blob
        rnn_int8_gate_output(gates, hidden_state, top_blob, ti, elemtype, opt);
    }
}


================================================
FILE: src/layer/arm/scale_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "scale_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

// Sets support_packing when the build targets NEON; Scale_arm::forward_inplace
// has a dedicated elempack == 4 code path that is compiled under the same guard.
Scale_arm::Scale_arm()
{
#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON
}

int Scale_arm::forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const
{
    Mat& bottom_top_blob = bottom_top_blobs[0];
    const Mat& scale_blob = bottom_top_blobs[1];

    int dims = bottom_top_blob.dims;
    int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
    if (elempack == 4)
    {
        if (dims == 1)
        {
            int w = bottom_top_blob.w;

            const float* scale = scale_blob;
            if (bias_term)
            {
                const float* bias = bias_data;
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float* ptr = (float*)bottom_top_blob + i * 4;

                    float32x4_t _p = vld1q_f32(ptr);
                    float32x4_t _s = vld1q_f32(scale + i * 4);
                    float32x4_t _bias = vld1q_f32(bias + i * 4);
                    _p = vmlaq_f32(_bias, _p, _s);
                    vst1q_f32(ptr, _p);
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < w; i++)
                {
                    float* ptr = (float*)bottom_top_blob + i * 4;

                    float32x4_t _p = vld1q_f32(ptr);
                    float32x4_t _s = vld1q_f32(scale + i * 4);
                    _p = vmulq_f32(_p, _s);
                    vst1q_f32(ptr, _p);
                }
            }
        }

        if (dims == 2)
        {
            int w = bottom_top_blob.w;
            int h = bottom_top_blob.h;

            if (bias_term)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < h; i++)
                {
                    float* ptr = bottom_top_blob.row(i);
                    float32x4_t _s = vld1q_f32((const float*)scale_blob + i * 4);
                    float32x4_t _bias = vld1q_f32((const float*)bias_data + i * 4);

                    for (int j = 0; j < w; j++)
                    {
                        float32x4_t _p = vld1q_f32(ptr);
                        _p = vmlaq_f32(_bias, _p, _s);
                        vst1q_f32(ptr, _p);

                        ptr += 4;
                    }
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i = 0; i < h; i++)
                {
                    float* ptr = bottom_top_blob.row(i);
                    float32x4_t _s = vld1q_f32((const float*)scale_blob + i * 4);

                    for (int j = 0; j < w; j++)
                    {
                        float32x4_t _p = vld1q_f32(ptr);
                        _p = vmulq_f32(_p, _s);
                        vst1q_f32(ptr, _p);

                        ptr += 4;
                    }
                }
            }
        }

        if (dims == 3)
        {
            int w = bottom_top_blob.w;
            int h = bottom_top_blob.h;
            int channels = bottom_top_blob.c;
            int size = w * h;

            if (bias_term)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    float* ptr = bottom_top_blob.channel(q);
                    float32x4_t _s = vld1q_f32((const float*)scale_blob + q * 4);
                    float32x4_t _bias = vld1q_f32((const float*)bias_data + q * 4);

                    for (int i = 0; i < size; i++)
                    {
                        float32x4_t _p = vld1q_f32(ptr);
                        _p = vmlaq_f32(_bias, _p, _s);
                        vst1q_f32(ptr, _p);

                        ptr += 4;
                    }
                }
            }
            else
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    float* ptr = bottom_top_blob.channel(q);
                    float32x4_t _s = vld1q_f32((const float*)scale_blob + q * 4);

                    for (int i = 0; i < size; i++)
                    {
                        float32x4_t _p = vld1q_f32(ptr);
                        _p = vmulq_f32(_p, _s);
                        vst1q_f32(ptr, _p);

                        ptr += 4;
                    }
                }
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    if (dims != 3)
        return Scale::forward_inplace(bottom_top_blobs, opt);

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    if (bias_term)
    {
        const float* scale_ptr = scale_blob;
        const float* bias_ptr = bias_data;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            float s = scale_ptr[q];
            float bias = bias_ptr[q];

#if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _s = vdupq_n_f32(s);
            float32x4_t _bias = vdupq_n_f32(bias);
            for (; nn > 0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                _p = vmlaq_f32(_bias, _p, _s);
                vst1q_f32(ptr, _p);

                ptr += 4;
            }
#endif // __ARM_NEON

            for (; remain > 0; remain--)
            {
                *ptr = *ptr * s + bias;

                ptr++;
            }
        }
    }
    else
    {
        const float* scale_ptr = scale_blob;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            float s = scale_ptr[q];

#if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _s = vdupq_n_f32(s);
            for (; nn > 0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                _p = vmulq_f32(_p, _s);
                vst1q_f32(ptr, _p);

                ptr += 4;
            }
#endif // __ARM_NEON

            for (; remain > 0; remain--)
            {
                *ptr *= s;

                ptr++;
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/scale_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_SCALE_ARM_H
#define LAYER_SCALE_ARM_H

#include "scale.h"

namespace ncnn {

// ARM specialisation of the Scale layer: declares its own constructor and
// overrides the multi-blob forward_inplace of the Scale base class.
class Scale_arm : public Scale
{
public:
    Scale_arm();

    // Scales bottom_top_blobs[0] in place using bottom_top_blobs[1] as the scale blob.
    virtual int forward_inplace(std::vector<Mat>& bottom_top_blobs, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_SCALE_ARM_H


================================================
FILE: src/layer/arm/selu_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "selu_arm.h"

#if __ARM_NEON
#include "neon_mathfun.h"

#include <arm_neon.h>
#endif // __ARM_NEON

namespace ncnn {

// Apply SELU in place to every channel of bottom_top_blob:
//   x <  0 : (exp(x) - 1) * alpha * lambda
//   x >= 0 : x * lambda
// Channels are processed in parallel with opt.num_threads threads.
// Always returns 0.
int SELU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

    // scale factor of the negative branch, computed once
    const float alphaxlambda = alpha * lambda;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* data = bottom_top_blob.channel(q);

        int i = 0;
#if __ARM_NEON
        const float32x4_t _one = vdupq_n_f32(1.f);
        const float32x4_t _zero = vdupq_n_f32(0.f);
        const float32x4_t _alphaxlambda = vdupq_n_f32(alphaxlambda);
        const float32x4_t _lambda = vdupq_n_f32(lambda);

        // four elements per iteration; both branches are evaluated and the
        // per-lane mask (x <= 0) picks the result
        for (; i + 3 < size; i += 4)
        {
            const float32x4_t _x = vld1q_f32(data + i);
            const uint32x4_t _lemask = vcleq_f32(_x, _zero);

            const float32x4_t _neg = vmulq_f32(vsubq_f32(exp_ps(_x), _one), _alphaxlambda);
            const float32x4_t _pos = vmulq_f32(_x, _lambda);

            vst1q_f32(data + i, vbslq_f32(_lemask, _neg, _pos));
        }
#endif // __ARM_NEON

        // scalar tail (or the whole channel when NEON is unavailable)
        for (; i < size; i++)
        {
            const float x = data[i];
            if (x < 0.f)
                data[i] = (expf(x) - 1.f) * alphaxlambda;
            else
                data[i] = x * lambda;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/selu_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_SELU_ARM_H
#define LAYER_SELU_ARM_H

#include "selu.h"

namespace ncnn {

// ARM-specific specialization of the SELU layer.
// Only the single-blob in-place forward is overridden.
class SELU_arm : public SELU
{
public:
    // Applies SELU to bottom_top_blob in place; returns an int status code.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_SELU_ARM_H


================================================
FILE: src/layer/arm/shufflechannel_arm.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "shufflechannel_arm.h"

#include "layer_type.h"

#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

#include "cpu.h"

namespace ncnn {

// Declare which blob layouts / storage types this ARM implementation accepts.
ShuffleChannel_arm::ShuffleChannel_arm()
{
#if __ARM_NEON
    // the packed (elempack > 1) kernels are written with NEON intrinsics
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage is enabled only when the CPU reports asimdhp at runtime
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82

#if NCNN_BF16
    support_bf16_storage = true;
#endif // NCNN_BF16
}

// Channel shuffle for 32-bit float blobs.
//
// Blobs with 16-bit elements are forwarded to forward_bf16s_fp16s() when the
// matching storage option is enabled. For elempack == 4 the shuffle is done
// with NEON for group counts 2, 3 and 4; every other configuration is
// delegated to ShuffleChannel::forward (after unpacking when necessary).
//
// Returns 0 on success, -100 when an output/intermediate blob could not be
// allocated, or the error code of the base implementation.
int ShuffleChannel_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
#endif

    int channels = bottom_blob.c;
    int elempack = bottom_blob.elempack;

    // in reverse mode the effective group count is channels-per-group of the forward shuffle
    int _group = reverse ? channels * elempack / group : group;

    if (_group == 1)
    {
        // shuffling a single group is the identity
        top_blob = bottom_blob;
        return 0;
    }

#if __ARM_NEON
    if (elempack == 4)
    {
        if (_group == 2 && channels % _group != 0)
        {
            // Odd number of packed channels split into two groups: the group
            // boundary falls in the middle of a packed channel, so the second
            // group's data starts at lane 2 of packed channel channels_per_group.
            int w = bottom_blob.w;
            int h = bottom_blob.h;
            int size = w * h;
            size_t elemsize = bottom_blob.elemsize;

            top_blob.create(w, h, channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            int channels_per_group = channels / _group;

            // TODO unroll me
            for (int q = 0; q < channels_per_group; q++)
            {
                const float* ptr0 = bottom_blob.channel(q);
                const float* ptr1 = bottom_blob.channel(channels_per_group + q);
                const float* ptr2 = bottom_blob.channel(channels_per_group + q + 1);
                float* outptr0 = top_blob.channel(q * 2);
                float* outptr1 = top_blob.channel(q * 2 + 1);

                for (int i = 0; i < size; i++)
                {
                    float32x4_t _p0 = vld1q_f32(ptr0);
                    float32x4_t _p1 = vld1q_f32(ptr1);
                    float32x4_t _p2 = vld1q_f32(ptr2);

                    // upper half of _p1 followed by lower half of _p2
                    float32x4_t _p12 = vextq_f32(_p1, _p2, 2);

                    float32x4x2_t _p01 = vzipq_f32(_p0, _p12);

                    vst1q_f32(outptr0, _p01.val[0]);
                    vst1q_f32(outptr1, _p01.val[1]);

                    ptr0 += 4;
                    ptr1 += 4;
                    ptr2 += 4;
                    outptr0 += 4;
                    outptr1 += 4;
                }
            }

            // handle the last channel
            {
                const float* ptr0 = bottom_blob.channel(channels_per_group);
                const float* ptr1 = bottom_blob.channel(channels_per_group + channels_per_group);
                float* outptr0 = top_blob.channel(channels_per_group * 2);

                // second group continues at lanes 2..3 of the last packed channel
                ptr1 += 2;

                for (int i = 0; i < size; i++)
                {
                    // Only two lanes of each source feed the output, so load
                    // exactly two floats. A 4-float load from ptr1 would read
                    // two floats past the end of the last channel on the final
                    // pixel.
                    float32x2_t _p0 = vld1_f32(ptr0);
                    float32x2_t _p1 = vld1_f32(ptr1);

                    float32x2x2_t _p01 = vzip_f32(_p0, _p1);

                    vst1q_f32(outptr0, vcombine_f32(_p01.val[0], _p01.val[1]));

                    ptr0 += 4;
                    ptr1 += 4;
                    outptr0 += 4;
                }
            }

            return 0;
        }

        if (_group > 4 || channels % _group != 0)
        {
            // slow path for too large group or shuffle inside elempack
            Option opt_pack = opt;
            opt_pack.blob_allocator = opt.workspace_allocator;

            Mat bottom_blob_unpacked;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack);
            if (bottom_blob_unpacked.empty())
                return -100;

            Mat top_blob_unpacked;
            int ret = ShuffleChannel::forward(bottom_blob_unpacked, top_blob_unpacked, opt_pack);
            if (ret != 0)
                return ret;

            convert_packing(top_blob_unpacked, top_blob, elempack, opt);
            // report a failed repack instead of returning success with an empty blob
            if (top_blob.empty())
                return -100;

            return 0;
        }

        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int size = w * h;
        size_t elemsize = bottom_blob.elemsize;

        top_blob.create(w, h, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        int channels_per_group = channels / _group;

        if (_group == 2)
        {
            for (int q = 0; q < channels_per_group; q++)
            {
                const float* ptr0 = bottom_blob.channel(q);
                const float* ptr1 = bottom_blob.channel(channels_per_group + q);
                float* outptr0 = top_blob.channel(q * 2);
                float* outptr1 = top_blob.channel(q * 2 + 1);

                for (int i = 0; i < size; i++)
                {
                    float32x4_t _p0 = vld1q_f32(ptr0);
                    float32x4_t _p1 = vld1q_f32(ptr1);

                    // interleave the lanes of the two groups
                    float32x4x2_t _p01 = vzipq_f32(_p0, _p1);

                    vst1q_f32(outptr0, _p01.val[0]);
                    vst1q_f32(outptr1, _p01.val[1]);

                    ptr0 += 4;
                    ptr1 += 4;
                    outptr0 += 4;
                    outptr1 += 4;
                }
            }
        }

        if (_group == 3)
        {
            for (int q = 0; q < channels_per_group; q++)
            {
                const float* ptr0 = bottom_blob.channel(q);
                const float* ptr1 = bottom_blob.channel(channels_per_group + q);
                const float* ptr2 = bottom_blob.channel(channels_per_group * 2 + q);
                float* outptr0 = top_blob.channel(q * 3);
                float* outptr1 = top_blob.channel(q * 3 + 1);
                float* outptr2 = top_blob.channel(q * 3 + 2);

                for (int i = 0; i < size; i++)
                {
                    // lanes are named 0123 / 4567 / 89xy for _p0 / _p1 / _p2;
                    // the variable names below spell out the lane contents
                    float32x4_t _p0 = vld1q_f32(ptr0);
                    float32x4_t _p1 = vld1q_f32(ptr1);
                    float32x4_t _p2 = vld1q_f32(ptr2);

                    float32x4x2_t _p01 = vzipq_f32(_p0, _p1);
                    float32x4x2_t _p12 = vzipq_f32(_p1, _p2);

                    float32x4_t _0415 = _p01.val[0];
                    float32x4_t _2637 = _p01.val[1];
                    float32x4_t _4859 = _p12.val[0];
                    float32x4_t _6x7y = _p12.val[1];

                    float32x2_t _15 = vget_high_f32(_0415);
                    float32x2_t _37 = vget_high_f32(_2637);
                    float32x2_t _48 = vget_low_f32(_4859);
                    float32x2_t _6x = vget_low_f32(_6x7y);

                    float32x2_t _81 = vext_f32(_48, _15, 1);
                    float32x2_t _x3 = vext_f32(_6x, _37, 1);

                    float32x4_t _0481 = vcombine_f32(vget_low_f32(_0415), _81);
                    float32x4_t _5926 = vextq_f32(_4859, _2637, 2);
                    float32x4_t _x37y = vcombine_f32(_x3, vget_high_f32(_6x7y));

                    vst1q_f32(outptr0, _0481);
                    vst1q_f32(outptr1, _5926);
                    vst1q_f32(outptr2, _x37y);

                    ptr0 += 4;
                    ptr1 += 4;
                    ptr2 += 4;
                    outptr0 += 4;
                    outptr1 += 4;
                    outptr2 += 4;
                }
            }
        }

        if (_group == 4)
        {
            for (int q = 0; q < channels_per_group; q++)
            {
                const float* ptr0 = bottom_blob.channel(q);
                const float* ptr1 = bottom_blob.channel(channels_per_group + q);
                const float* ptr2 = bottom_blob.channel(channels_per_group * 2 + q);
                const float* ptr3 = bottom_blob.channel(channels_per_group * 3 + q);
                float* outptr0 = top_blob.channel(q * 4);
                float* outptr1 = top_blob.channel(q * 4 + 1);
                float* outptr2 = top_blob.channel(q * 4 + 2);
                float* outptr3 = top_blob.channel(q * 4 + 3);

                for (int i = 0; i < size; i++)
                {
                    float32x4_t _p0 = vld1q_f32(ptr0);
                    float32x4_t _p1 = vld1q_f32(ptr1);
                    float32x4_t _p2 = vld1q_f32(ptr2);
                    float32x4_t _p3 = vld1q_f32(ptr3);

                    // transpose 4x4
                    float32x4x2_t _p01 = vtrnq_f32(_p0, _p1);
                    float32x4x2_t _p23 = vtrnq_f32(_p2, _p3);
                    _p0 = vcombine_f32(vget_low_f32(_p01.val[0]), vget_low_f32(_p23.val[0]));
                    _p1 = vcombine_f32(vget_low_f32(_p01.val[1]), vget_low_f32(_p23.val[1]));
                    _p2 = vcombine_f32(vget_high_f32(_p01.val[0]), vget_high_f32(_p23.val[0]));
                    _p3 = vcombine_f32(vget_high_f32(_p01.val[1]), vget_high_f32(_p23.val[1]));

                    vst1q_f32(outptr0, _p0);
                    vst1q_f32(outptr1, _p1);
                    vst1q_f32(outptr2, _p2);
                    vst1q_f32(outptr3, _p3);

                    ptr0 += 4;
                    ptr1 += 4;
                    ptr2 += 4;
                    ptr3 += 4;
                    outptr0 += 4;
                    outptr1 += 4;
                    outptr2 += 4;
                    outptr3 += 4;
                }
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    return ShuffleChannel::forward(bottom_blob, top_blob, opt);
}

// Channel shuffle for blobs with 16-bit elements (called from forward() for
// fp16 / bf16 storage). Elements are only moved, never interpreted, so they
// are handled as unsigned short.
//
// elempack == 8 (NCNN_ARM82 builds) and elempack == 4 (NEON builds) are
// shuffled with NEON for group counts 2, 3 and 4; every other configuration
// is delegated to ShuffleChannel::forward (after unpacking when necessary).
//
// Returns 0 on success, -100 when an output/intermediate blob could not be
// allocated, or the error code of the base implementation.
int ShuffleChannel_arm::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int channels = bottom_blob.c;
    int elempack = bottom_blob.elempack;

    // in reverse mode the effective group count is channels-per-group of the forward shuffle
    int _group = reverse ? channels * elempack / group : group;

    if (_group == 1)
    {
        // shuffling a single group is the identity
        top_blob = bottom_blob;
        return 0;
    }

#if NCNN_ARM82
    if (elempack == 8)
    {
        if (_group == 2 && channels % _group != 0)
        {
            // Odd number of packed channels split into two groups: the second
            // group's data starts at lane 4 of packed channel channels_per_group.
            int w = bottom_blob.w;
            int h = bottom_blob.h;
            int size = w * h;
            size_t elemsize = bottom_blob.elemsize;

            top_blob.create(w, h, channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            int channels_per_group = channels / _group;

            // TODO unroll me
            for (int q = 0; q < channels_per_group; q++)
            {
                const unsigned short* ptr0 = bottom_blob.channel(q);
                const unsigned short* ptr1 = bottom_blob.channel(channels_per_group + q);
                const unsigned short* ptr2 = bottom_blob.channel(channels_per_group + q + 1);
                unsigned short* outptr0 = top_blob.channel(q * 2);
                unsigned short* outptr1 = top_blob.channel(q * 2 + 1);

                for (int i = 0; i < size; i++)
                {
                    uint16x8_t _p0 = vld1q_u16(ptr0);
                    uint16x8_t _p1 = vld1q_u16(ptr1);
                    uint16x8_t _p2 = vld1q_u16(ptr2);

                    // upper half of _p1 followed by lower half of _p2
                    uint16x8_t _p12 = vextq_u16(_p1, _p2, 4);

                    uint16x8x2_t _p01 = vzipq_u16(_p0, _p12);

                    vst1q_u16(outptr0, _p01.val[0]);
                    vst1q_u16(outptr1, _p01.val[1]);

                    ptr0 += 8;
                    ptr1 += 8;
                    ptr2 += 8;
                    outptr0 += 8;
                    outptr1 += 8;
                }
            }

            // handle the last channel
            {
                const unsigned short* ptr0 = bottom_blob.channel(channels_per_group);
                const unsigned short* ptr1 = bottom_blob.channel(channels_per_group + channels_per_group);
                unsigned short* outptr0 = top_blob.channel(channels_per_group * 2);

                // second group continues at lanes 4..7 of the last packed channel
                ptr1 += 4;

                for (int i = 0; i < size; i++)
                {
                    // 4-lane loads: exactly the lanes that are consumed
                    uint16x4_t _p0 = vld1_u16(ptr0);
                    uint16x4_t _p1 = vld1_u16(ptr1);

                    uint16x4x2_t _p01 = vzip_u16(_p0, _p1);

                    vst1_u16(outptr0, _p01.val[0]);
                    vst1_u16(outptr0 + 4, _p01.val[1]);

                    ptr0 += 8;
                    ptr1 += 8;
                    outptr0 += 8;
                }
            }

            return 0;
        }

        if (_group > 4 || channels % _group != 0)
        {
            // slow path for too large group or shuffle inside elempack
            Option opt_pack = opt;
            opt_pack.blob_allocator = opt.workspace_allocator;

            Mat bottom_blob_unpacked;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack);
            if (bottom_blob_unpacked.empty())
                return -100;

            Mat top_blob_unpacked;
            int ret = ShuffleChannel::forward(bottom_blob_unpacked, top_blob_unpacked, opt_pack);
            if (ret != 0)
                return ret;

            convert_packing(top_blob_unpacked, top_blob, elempack, opt);
            if (top_blob.empty())
                return -100;

            return 0;
        }

        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int size = w * h;
        size_t elemsize = bottom_blob.elemsize;

        top_blob.create(w, h, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        int channels_per_group = channels / _group;

        if (_group == 2)
        {
            for (int q = 0; q < channels_per_group; q++)
            {
                const unsigned short* ptr0 = bottom_blob.channel(q);
                const unsigned short* ptr1 = bottom_blob.channel(channels_per_group + q);
                unsigned short* outptr0 = top_blob.channel(q * 2);
                unsigned short* outptr1 = top_blob.channel(q * 2 + 1);

                for (int i = 0; i < size; i++)
                {
                    uint16x8_t _p0 = vld1q_u16(ptr0);
                    uint16x8_t _p1 = vld1q_u16(ptr1);

                    // interleave the lanes of the two groups
                    uint16x8x2_t _p01 = vzipq_u16(_p0, _p1);

                    vst1q_u16(outptr0, _p01.val[0]);
                    vst1q_u16(outptr1, _p01.val[1]);

                    ptr0 += 8;
                    ptr1 += 8;
                    outptr0 += 8;
                    outptr1 += 8;
                }
            }
        }

        if (_group == 3)
        {
            for (int q = 0; q < channels_per_group; q++)
            {
                const unsigned short* ptr0 = bottom_blob.channel(q);
                const unsigned short* ptr1 = bottom_blob.channel(channels_per_group + q);
                const unsigned short* ptr2 = bottom_blob.channel(channels_per_group * 2 + q);
                unsigned short* outptr0 = top_blob.channel(q * 3);
                unsigned short* outptr1 = top_blob.channel(q * 3 + 1);
                unsigned short* outptr2 = top_blob.channel(q * 3 + 2);

                for (int i = 0; i < size; i++)
                {
                    uint16x8_t _p0 = vld1q_u16(ptr0);
                    uint16x8_t _p1 = vld1q_u16(ptr1);
                    uint16x8_t _p2 = vld1q_u16(ptr2);

                    // TODO figure out a faster way

                    // 01234567        08g19h2a
                    // 89abcdef   ->   i3bj4ck5
                    // ghijklmn        dl6em7fn

                    uint16x8x3_t _p012;
                    _p012.val[0] = _p0;
                    _p012.val[1] = _p1;
                    _p012.val[2] = _p2;

                    // the 3-way interleaving store performs the shuffle; the
                    // result is bounced through a stack buffer
                    unsigned short tmp[24];
                    vst3q_u16(&tmp[0], _p012);

                    _p0 = vld1q_u16(&tmp[0]);
                    _p1 = vld1q_u16(&tmp[8]);
                    _p2 = vld1q_u16(&tmp[16]);

                    vst1q_u16(outptr0, _p0);
                    vst1q_u16(outptr1, _p1);
                    vst1q_u16(outptr2, _p2);

                    ptr0 += 8;
                    ptr1 += 8;
                    ptr2 += 8;
                    outptr0 += 8;
                    outptr1 += 8;
                    outptr2 += 8;
                }
            }
        }

        if (_group == 4)
        {
            for (int q = 0; q < channels_per_group; q++)
            {
                const unsigned short* ptr0 = bottom_blob.channel(q);
                const unsigned short* ptr1 = bottom_blob.channel(channels_per_group + q);
                const unsigned short* ptr2 = bottom_blob.channel(channels_per_group * 2 + q);
                const unsigned short* ptr3 = bottom_blob.channel(channels_per_group * 3 + q);
                unsigned short* outptr0 = top_blob.channel(q * 4);
                unsigned short* outptr1 = top_blob.channel(q * 4 + 1);
                unsigned short* outptr2 = top_blob.channel(q * 4 + 2);
                unsigned short* outptr3 = top_blob.channel(q * 4 + 3);

                for (int i = 0; i < size; i++)
                {
                    uint16x8_t _p0 = vld1q_u16(ptr0);
                    uint16x8_t _p1 = vld1q_u16(ptr1);
                    uint16x8_t _p2 = vld1q_u16(ptr2);
                    uint16x8_t _p3 = vld1q_u16(ptr3);

                    // transpose 4x4
                    uint16x8x2_t _p01 = vtrnq_u16(_p0, _p1);
                    uint16x8x2_t _p23 = vtrnq_u16(_p2, _p3);
                    uint32x4x2_t _p02 = vtrnq_u32(vreinterpretq_u32_u16(_p01.val[0]), vreinterpretq_u32_u16(_p23.val[0]));
                    uint32x4x2_t _p13 = vtrnq_u32(vreinterpretq_u32_u16(_p01.val[1]), vreinterpretq_u32_u16(_p23.val[1]));
                    _p0 = vreinterpretq_u16_u32(_p02.val[0]);
                    _p1 = vreinterpretq_u16_u32(_p13.val[0]);
                    _p2 = vreinterpretq_u16_u32(_p02.val[1]);
                    _p3 = vreinterpretq_u16_u32(_p13.val[1]);

                    // each register holds two transposed 4-lane rows; pair up the halves
                    vst1q_u16(outptr0, vcombine_u16(vget_low_u16(_p0), vget_low_u16(_p1)));
                    vst1q_u16(outptr1, vcombine_u16(vget_low_u16(_p2), vget_low_u16(_p3)));
                    vst1q_u16(outptr2, vcombine_u16(vget_high_u16(_p0), vget_high_u16(_p1)));
                    vst1q_u16(outptr3, vcombine_u16(vget_high_u16(_p2), vget_high_u16(_p3)));

                    ptr0 += 8;
                    ptr1 += 8;
                    ptr2 += 8;
                    ptr3 += 8;
                    outptr0 += 8;
                    outptr1 += 8;
                    outptr2 += 8;
                    outptr3 += 8;
                }
            }
        }

        return 0;
    }
#endif // NCNN_ARM82

#if __ARM_NEON
    if (elempack == 4)
    {
        if (_group == 2 && channels % _group != 0)
        {
            // Odd number of packed channels split into two groups: the second
            // group's data starts at lane 2 of packed channel channels_per_group.
            int w = bottom_blob.w;
            int h = bottom_blob.h;
            int size = w * h;
            size_t elemsize = bottom_blob.elemsize;

            top_blob.create(w, h, channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            int channels_per_group = channels / _group;

            // TODO unroll me
            for (int q = 0; q < channels_per_group; q++)
            {
                const unsigned short* ptr0 = bottom_blob.channel(q);
                const unsigned short* ptr1 = bottom_blob.channel(channels_per_group + q);
                const unsigned short* ptr2 = bottom_blob.channel(channels_per_group + q + 1);
                unsigned short* outptr0 = top_blob.channel(q * 2);
                unsigned short* outptr1 = top_blob.channel(q * 2 + 1);

                for (int i = 0; i < size; i++)
                {
                    uint16x4_t _p0 = vld1_u16(ptr0);
                    uint16x4_t _p1 = vld1_u16(ptr1);
                    uint16x4_t _p2 = vld1_u16(ptr2);

                    // upper half of _p1 followed by lower half of _p2
                    uint16x4_t _p12 = vext_u16(_p1, _p2, 2);

                    uint16x4x2_t _p01 = vzip_u16(_p0, _p12);

                    vst1_u16(outptr0, _p01.val[0]);
                    vst1_u16(outptr1, _p01.val[1]);

                    ptr0 += 4;
                    ptr1 += 4;
                    ptr2 += 4;
                    outptr0 += 4;
                    outptr1 += 4;
                }
            }

            // handle the last channel
            {
                const unsigned short* ptr0 = bottom_blob.channel(channels_per_group);
                const unsigned short* ptr1 = bottom_blob.channel(channels_per_group + channels_per_group);
                unsigned short* outptr0 = top_blob.channel(channels_per_group * 2);

                // second group continues at lanes 2..3 of the last packed channel
                ptr1 += 2;

                for (int i = 0; i < size; i++)
                {
                    uint16x4_t _p0 = vld1_u16(ptr0);

                    // Only lanes 0..1 of the second source are consumed. Load
                    // them individually: a 4-lane load from ptr1 would read
                    // two elements past the end of the last channel on the
                    // final pixel.
                    uint16x4_t _p1 = vdup_n_u16(0);
                    _p1 = vld1_lane_u16(ptr1, _p1, 0);
                    _p1 = vld1_lane_u16(ptr1 + 1, _p1, 1);

                    uint16x4x2_t _p01 = vzip_u16(_p0, _p1);

                    vst1_u16(outptr0, _p01.val[0]);

                    ptr0 += 4;
                    ptr1 += 4;
                    outptr0 += 4;
                }
            }

            return 0;
        }

        if (_group > 4 || channels % _group != 0)
        {
            // slow path for too large group or shuffle inside elempack
            Option opt_pack = opt;
            opt_pack.blob_allocator = opt.workspace_allocator;

            Mat bottom_blob_unpacked;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack);
            if (bottom_blob_unpacked.empty())
                return -100;

            Mat top_blob_unpacked;
            int ret = ShuffleChannel::forward(bottom_blob_unpacked, top_blob_unpacked, opt_pack);
            if (ret != 0)
                return ret;

            convert_packing(top_blob_unpacked, top_blob, elempack, opt);
            if (top_blob.empty())
                return -100;

            return 0;
        }

        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int size = w * h;
        size_t elemsize = bottom_blob.elemsize;

        top_blob.create(w, h, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        int channels_per_group = channels / _group;

        if (_group == 2)
        {
            for (int q = 0; q < channels_per_group; q++)
            {
                const unsigned short* ptr0 = bottom_blob.channel(q);
                const unsigned short* ptr1 = bottom_blob.channel(channels_per_group + q);
                unsigned short* outptr0 = top_blob.channel(q * 2);
                unsigned short* outptr1 = top_blob.channel(q * 2 + 1);

                for (int i = 0; i < size; i++)
                {
                    uint16x4_t _p0 = vld1_u16(ptr0);
                    uint16x4_t _p1 = vld1_u16(ptr1);

                    // interleave the lanes of the two groups
                    uint16x4x2_t _p01 = vzip_u16(_p0, _p1);

                    vst1_u16(outptr0, _p01.val[0]);
                    vst1_u16(outptr1, _p01.val[1]);

                    ptr0 += 4;
                    ptr1 += 4;
                    outptr0 += 4;
                    outptr1 += 4;
                }
            }
        }

        if (_group == 3)
        {
            for (int q = 0; q < channels_per_group; q++)
            {
                const unsigned short* ptr0 = bottom_blob.channel(q);
                const unsigned short* ptr1 = bottom_blob.channel(channels_per_group + q);
                const unsigned short* ptr2 = bottom_blob.channel(channels_per_group * 2 + q);
                unsigned short* outptr0 = top_blob.channel(q * 3);
                unsigned short* outptr1 = top_blob.channel(q * 3 + 1);
                unsigned short* outptr2 = top_blob.channel(q * 3 + 2);

                for (int i = 0; i < size; i++)
                {
                    // lanes are named 0123 / 4567 / 89xy for _p0 / _p1 / _p2;
                    // the variable names below spell out the lane contents
                    uint16x4_t _p0 = vld1_u16(ptr0);
                    uint16x4_t _p1 = vld1_u16(ptr1);
                    uint16x4_t _p2 = vld1_u16(ptr2);

                    // TODO figure out a faster way
                    uint16x4x2_t _p01 = vzip_u16(_p0, _p1);
                    uint16x4x2_t _p12 = vzip_u16(_p1, _p2);

                    uint32x2_t _0415 = vreinterpret_u32_u16(_p01.val[0]);
                    uint16x4_t _2637 = _p01.val[1];
                    uint16x4_t _4859 = _p12.val[0];
                    uint32x2_t _6x7y = vreinterpret_u32_u16(_p12.val[1]);

                    uint16x4_t _98yx = vrev32_u16(_p2);
                    uint16x4x2_t _90y281x3 = vtrn_u16(_98yx, _p0);

                    uint32x2_t _81x3 = vreinterpret_u32_u16(_90y281x3.val[1]);

                    uint32x2x2_t _048115x3 = vtrn_u32(_0415, _81x3);
                    uint32x2x2_t _816xx37y = vtrn_u32(_81x3, _6x7y);

                    uint16x4_t _0481 = vreinterpret_u16_u32(_048115x3.val[0]);
                    uint16x4_t _5926 = vext_u16(_4859, _2637, 2);
                    uint16x4_t _x37y = vreinterpret_u16_u32(_816xx37y.val[1]);

                    vst1_u16(outptr0, _0481);
                    vst1_u16(outptr1, _5926);
                    vst1_u16(outptr2, _x37y);

                    ptr0 += 4;
                    ptr1 += 4;
                    ptr2 += 4;
                    outptr0 += 4;
                    outptr1 += 4;
                    outptr2 += 4;
                }
            }
        }

        if (_group == 4)
        {
            for (int q = 0; q < channels_per_group; q++)
            {
                const unsigned short* ptr0 = bottom_blob.channel(q);
                const unsigned short* ptr1 = bottom_blob.channel(channels_per_group + q);
                const unsigned short* ptr2 = bottom_blob.channel(channels_per_group * 2 + q);
                const unsigned short* ptr3 = bottom_blob.channel(channels_per_group * 3 + q);
                unsigned short* outptr0 = top_blob.channel(q * 4);
                unsigned short* outptr1 = top_blob.channel(q * 4 + 1);
                unsigned short* outptr2 = top_blob.channel(q * 4 + 2);
                unsigned short* outptr3 = top_blob.channel(q * 4 + 3);

                for (int i = 0; i < size; i++)
                {
                    uint16x4_t _p0 = vld1_u16(ptr0);
                    uint16x4_t _p1 = vld1_u16(ptr1);
                    uint16x4_t _p2 = vld1_u16(ptr2);
                    uint16x4_t _p3 = vld1_u16(ptr3);

                    // transpose 4x4
                    uint16x4x2_t _p01 = vtrn_u16(_p0, _p1);
                    uint16x4x2_t _p23 = vtrn_u16(_p2, _p3);
                    uint32x2x2_t _p02 = vtrn_u32(vreinterpret_u32_u16(_p01.val[0]), vreinterpret_u32_u16(_p23.val[0]));
                    uint32x2x2_t _p13 = vtrn_u32(vreinterpret_u32_u16(_p01.val[1]), vreinterpret_u32_u16(_p23.val[1]));
                    _p0 = vreinterpret_u16_u32(_p02.val[0]);
                    _p1 = vreinterpret_u16_u32(_p13.val[0]);
                    _p2 = vreinterpret_u16_u32(_p02.val[1]);
                    _p3 = vreinterpret_u16_u32(_p13.val[1]);

                    vst1_u16(outptr0, _p0);
                    vst1_u16(outptr1, _p1);
                    vst1_u16(outptr2, _p2);
                    vst1_u16(outptr3, _p3);

                    ptr0 += 4;
                    ptr1 += 4;
                    ptr2 += 4;
                    ptr3 += 4;
                    outptr0 += 4;
                    outptr1 += 4;
                    outptr2 += 4;
                    outptr3 += 4;
                }
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    return ShuffleChannel::forward(bottom_blob, top_blob, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/arm/shufflechannel_arm.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_SHUFFLECHANNEL_ARM_H
#define LAYER_SHUFFLECHANNEL_ARM_H

#include "shufflechannel.h"

namespace ncnn {

// ARM-specific specialization of the ShuffleChannel layer.
class ShuffleChannel_arm : public ShuffleChannel
{
public:
    ShuffleChannel_arm();

    // Shuffles the channels of bottom_blob into top_blob; returns an int
    // status code (0 on success).
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
    // Variant of forward() for blobs stored with 16-bit elements
    // (shared by the fp16 and bf16 storage paths).
    int forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_SHUFFLECHANNEL_ARM_H


================================================
FILE: src/layer/arm/sigmoid_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "sigmoid_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#endif // __ARM_NEON

#include "arm_usability.h"

#include "cpu.h"

namespace ncnn {

// Declare which blob layouts / storage types this ARM implementation accepts.
Sigmoid_arm::Sigmoid_arm()
{
#if __ARM_NEON
    support_packing = true;
#endif // __ARM_NEON

#if __ARM_NEON && NCNN_ARM82
    // fp16 storage is enabled only when the CPU reports asimdhp at runtime
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // __ARM_NEON && NCNN_ARM82

#if NCNN_BF16
    support_bf16_storage = true;
#endif // NCNN_BF16
}

// Apply sigmoid, 1 / (1 + exp(-x)), in place to every element of bottom_top_blob.
//
// Blobs with 16-bit elements are handed to the fp16 / bf16 specific routines
// when the matching storage option is enabled; otherwise the data is treated
// as 32-bit float. Channels are processed in parallel with opt.num_threads
// threads. The fp32 path always returns 0.
int Sigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int elembits = bottom_top_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        // choose between fp16 arithmetic and fp16 storage-only kernels
        return opt.use_fp16_arithmetic ? forward_inplace_fp16sa(bottom_top_blob, opt)
               : forward_inplace_fp16s(bottom_top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

    const int channels = bottom_top_blob.c;

    // number of scalar floats per channel, packed lanes included
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* data = bottom_top_blob.channel(q);

        int i = 0;
#if __ARM_NEON
#if __aarch64__
        // 16 floats per iteration
        for (; i + 15 < size; i += 16)
        {
            float* p = data + i;

            float32x4_t _v0 = vld1q_f32(p);
            float32x4_t _v1 = vld1q_f32(p + 4);
            float32x4_t _v2 = vld1q_f32(p + 8);
            float32x4_t _v3 = vld1q_f32(p + 12);

            vst1q_f32(p, sigmoid_ps(_v0));
            vst1q_f32(p + 4, sigmoid_ps(_v1));
            vst1q_f32(p + 8, sigmoid_ps(_v2));
            vst1q_f32(p + 12, sigmoid_ps(_v3));
        }
#endif // __aarch64__
        // 8 floats per iteration
        for (; i + 7 < size; i += 8)
        {
            float* p = data + i;

            float32x4_t _v0 = vld1q_f32(p);
            float32x4_t _v1 = vld1q_f32(p + 4);

            vst1q_f32(p, sigmoid_ps(_v0));
            vst1q_f32(p + 4, sigmoid_ps(_v1));
        }
        // 4 floats per iteration
        for (; i + 3 < size; i += 4)
        {
            float* p = data + i;

            vst1q_f32(p, sigmoid_ps(vld1q_f32(p)));
        }
#endif // __ARM_NEON
        // scalar tail (or the whole channel when NEON is unavailable)
        for (; i < size; i++)
        {
            data[i] = 1.f / (1.f + expf(-data[i]));
        }
    }

    return 0;
}

#if NCNN_BF16
// In-place sigmoid for blobs stored as bfloat16.
//
// Every element is widened to fp32, passed through sigmoid, and narrowed back
// to bfloat16 before being written to the same location. Always returns 0.
int Sigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;

    // 16-bit elements per channel, packed lanes included
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        unsigned short* ptr = bottom_top_blob.channel(q);

        // elements of this channel that still have to be processed
        int remain = size;
#if __ARM_NEON
#if __aarch64__
        // 16 elements per iteration
        for (; remain >= 16; remain -= 16)
        {
            uint16x8_t _a = vld1q_u16(ptr);
            uint16x8_t _b = vld1q_u16(ptr + 8);
            float32x4_t _f0 = sigmoid_ps(bfloat2float(vget_low_u16(_a)));
            float32x4_t _f1 = sigmoid_ps(bfloat2float(vget_high_u16(_a)));
            float32x4_t _f2 = sigmoid_ps(bfloat2float(vget_low_u16(_b)));
            float32x4_t _f3 = sigmoid_ps(bfloat2float(vget_high_u16(_b)));
            vst1q_u16(ptr, vcombine_u16(float2bfloat(_f0), float2bfloat(_f1)));
            vst1q_u16(ptr + 8, vcombine_u16(float2bfloat(_f2), float2bfloat(_f3)));
            ptr += 16;
        }
#endif // __aarch64__
        // 8 elements per iteration
        for (; remain >= 8; remain -= 8)
        {
            uint16x8_t _a = vld1q_u16(ptr);
            float32x4_t _f0 = sigmoid_ps(bfloat2float(vget_low_u16(_a)));
            float32x4_t _f1 = sigmoid_ps(bfloat2float(vget_high_u16(_a)));
            vst1q_u16(ptr, vcombine_u16(float2bfloat(_f0), float2bfloat(_f1)));
            ptr += 8;
        }
        // 4 elements per iteration
        for (; remain >= 4; remain -= 4)
        {
            float32x4_t _f = sigmoid_ps(bfloat2float(vld1_u16(ptr)));
            vst1_u16(ptr, float2bfloat(_f));
            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar tail
        for (; remain > 0; remain--)
        {
            const float x = bfloat16_to_float32(*ptr);
            const float y = 1.f / (1.f + expf(-x));
            *ptr = float32_to_bfloat16(y);

            ptr++;
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/sigmoid_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_SIGMOID_ARM_H
#define LAYER_SIGMOID_ARM_H

#include "sigmoid.h"

namespace ncnn {

// ARM-specific subclass of the Sigmoid layer.
class Sigmoid_arm : public Sigmoid
{
public:
    Sigmoid_arm();

    // Applies sigmoid to bottom_top_blob in place.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 storage variants: "fp16s" converts to fp32 for the math,
    // "fp16sa" evaluates sigmoid directly on fp16 vectors.
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 storage variant: converts to fp32 for the math.
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_SIGMOID_ARM_H


================================================
FILE: src/layer/arm/sigmoid_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "sigmoid_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#include "neon_mathfun.h"
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "neon_mathfun_fp16s.h"
#endif
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// In-place sigmoid for blobs stored as fp16, computed in fp32.
//
// Each element is widened to fp32, passed through sigmoid, and converted back
// to fp16 before being stored to the same location. Always returns 0.
int Sigmoid_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;

    // fp16 elements per channel, packed lanes included
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

        // elements of this channel that still have to be processed
        int remain = size;

        // 16 elements per iteration
        for (; remain >= 16; remain -= 16)
        {
            float16x8_t _a = vld1q_f16(ptr);
            float16x8_t _b = vld1q_f16(ptr + 8);
            float32x4_t _f0 = sigmoid_ps(vcvt_f32_f16(vget_low_f16(_a)));
            float32x4_t _f1 = sigmoid_ps(vcvt_f32_f16(vget_high_f16(_a)));
            float32x4_t _f2 = sigmoid_ps(vcvt_f32_f16(vget_low_f16(_b)));
            float32x4_t _f3 = sigmoid_ps(vcvt_f32_f16(vget_high_f16(_b)));
            vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_f0), vcvt_f16_f32(_f1)));
            vst1q_f16(ptr + 8, vcombine_f16(vcvt_f16_f32(_f2), vcvt_f16_f32(_f3)));
            ptr += 16;
        }
        // 8 elements per iteration
        for (; remain >= 8; remain -= 8)
        {
            float16x8_t _a = vld1q_f16(ptr);
            float32x4_t _f0 = sigmoid_ps(vcvt_f32_f16(vget_low_f16(_a)));
            float32x4_t _f1 = sigmoid_ps(vcvt_f32_f16(vget_high_f16(_a)));
            vst1q_f16(ptr, vcombine_f16(vcvt_f16_f32(_f0), vcvt_f16_f32(_f1)));
            ptr += 8;
        }
        // 4 elements per iteration
        for (; remain >= 4; remain -= 4)
        {
            float32x4_t _f = sigmoid_ps(vcvt_f32_f16(vld1_f16(ptr)));
            vst1_f16(ptr, vcvt_f16_f32(_f));
            ptr += 4;
        }
        // scalar tail
        for (; remain > 0; remain--)
        {
            const float x = (float)*ptr;
            const float y = 1.f / (1.f + expf(-x));
            *ptr = (__fp16)y;

            ptr++;
        }
    }

    return 0;
}

// In-place sigmoid for fp16 blobs using fp16 vector arithmetic.
//
// The vector loops call sigmoid_ps_f16 directly on fp16 registers; only the
// scalar tail goes through expf. Always returns 0.
int Sigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;

    // fp16 elements per channel, packed lanes included
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* ptr = bottom_top_blob.channel(q);

        // elements of this channel that still have to be processed
        int remain = size;

        // 32 elements per iteration
        for (; remain >= 32; remain -= 32)
        {
            float16x8_t _h0 = sigmoid_ps_f16(vld1q_f16(ptr));
            float16x8_t _h1 = sigmoid_ps_f16(vld1q_f16(ptr + 8));
            float16x8_t _h2 = sigmoid_ps_f16(vld1q_f16(ptr + 16));
            float16x8_t _h3 = sigmoid_ps_f16(vld1q_f16(ptr + 24));
            vst1q_f16(ptr, _h0);
            vst1q_f16(ptr + 8, _h1);
            vst1q_f16(ptr + 16, _h2);
            vst1q_f16(ptr + 24, _h3);
            ptr += 32;
        }
        // 16 elements per iteration
        for (; remain >= 16; remain -= 16)
        {
            float16x8_t _h0 = sigmoid_ps_f16(vld1q_f16(ptr));
            float16x8_t _h1 = sigmoid_ps_f16(vld1q_f16(ptr + 8));
            vst1q_f16(ptr, _h0);
            vst1q_f16(ptr + 8, _h1);
            ptr += 16;
        }
        // 8 elements per iteration
        for (; remain >= 8; remain -= 8)
        {
            float16x8_t _h = sigmoid_ps_f16(vld1q_f16(ptr));
            vst1q_f16(ptr, _h);
            ptr += 8;
        }
        // 4 elements per iteration
        for (; remain >= 4; remain -= 4)
        {
            float16x4_t _h = sigmoid_ps_f16(vld1_f16(ptr));
            vst1_f16(ptr, _h);
            ptr += 4;
        }
        // scalar tail; the math is carried out in float and narrowed on store
        for (; remain > 0; remain--)
        {
            const __fp16 x = *ptr;
            *ptr = (__fp16)(1.f / (1.f + expf(-x)));

            ptr++;
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/slice_arm.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "slice_arm.h"

#include "cpu.h"

namespace ncnn {

// Set the capability flags that describe which blob layouts / storage types
// this ARM implementation accepts.
Slice_arm::Slice_arm()
{
#if __ARM_NEON
    // forward() can emit and consume packed (elempack 4) blobs
    support_packing = true;
#if NCNN_ARM82
    // fp16 storage is enabled only when the runtime CPU query reports support
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif
#endif // __ARM_NEON

#if NCNN_BF16
    // 16-bit blobs are routed to forward_bf16s_fp16s by forward()
    support_bf16_storage = true;
#endif
}

int Slice_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    int elembits = bottom_blobs[0].elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
#endif

    const Mat& bottom_blob = bottom_blobs[0];
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;
    const int* slices_ptr = slices;
    const int* indices_ptr = indices;
    int positive_axis = axis < 0 ? dims + axis : axis;

    if (dims == 1) // positive_axis == 0
    {
        // slice vector
        int w = bottom_blob.w * elempack;
        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = w - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? w + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((w - q) / (top_blobs.size() - i));
                }
            }

            int out_elempack = 1;
#if __ARM_NEON
            if (opt.use_packing_layout)
            {
                out_elempack = slice % 4 == 0 ? 4 : 1;
            }
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            Mat& top_blob = top_blobs[i];
            top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            const float* ptr = (const float*)bottom_blob + q;
            float* outptr = top_blob;
            memcpy(outptr, ptr, top_blob.w * top_blob.elemsize);

            q += slice;
        }
    }

    if (dims == 2 && positive_axis == 0)
    {
        // slice image height
        int w = bottom_blob.w;
        int h = bottom_blob.h * elempack;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = h - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? h + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((h - q) / (top_blobs.size() - i));
                }
            }

            int out_elempack = 1;
#if __ARM_NEON
            if (opt.use_packing_layout)
            {
                out_elempack = slice % 4 == 0 ? 4 : 1;
            }
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            Mat& top_blob = top_blobs[i];
            top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            q += slice;
        }

        size_t out_elemsize = top_blobs[0].elemsize;
        int out_elempack = top_blobs[0].elempack;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize);
            out_elempack = std::min(out_elempack, top_blobs[i].elempack);
        }

        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack > out_elempack)
        {
            convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt);
            if (bottom_blob_unpacked.empty())
                return -100;
        }

        const float* ptr = bottom_blob_unpacked;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            Mat& top_blob = top_blobs[i];

            if (out_elempack == 1 && top_blob.elempack == 4)
            {
                for (int j = 0; j < top_blob.h; j++)
                {
                    const float* r0 = ptr;
                    const float* r1 = ptr + w;
                    const float* r2 = ptr + w * 2;
                    const float* r3 = ptr + w * 3;

                    float* outptr0 = top_blob.row(j);

                    for (int j = 0; j < w; j++)
                    {
                        outptr0[0] = *r0++;
                        outptr0[1] = *r1++;
                        outptr0[2] = *r2++;
                        outptr0[3] = *r3++;

                        outptr0 += 4;
                    }

                    ptr += w * 4;
                }
            }
            else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4)
            {
                int size = w * top_blob.h;

                float* outptr = top_blob;
                memcpy(outptr, ptr, size * top_blob.elemsize);

                ptr += size * top_blob.elempack;
            }
        }
    }

    if (dims == 2 && positive_axis == 1)
    {
        // slice image width
        int w = bottom_blob.w;
        int h = bottom_blob.h;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = w - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? w + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((w - q) / (top_blobs.size() - i));
                }
            }

            Mat& top_blob = top_blobs[i];
            top_blob.create(slice, h, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            q += slice;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int j = 0; j < h; j++)
        {
            const float* ptr = bottom_blob.row(j);
            for (size_t i = 0; i < top_blobs.size(); i++)
            {
                Mat& top_blob = top_blobs[i];

                float* outptr = top_blob.row(j);
                memcpy(outptr, ptr, top_blob.w * elemsize);

                ptr += top_blob.w * elempack;
            }
        }
    }

    if ((dims == 3 || dims == 4) && positive_axis == 0)
    {
        // slice dim channel
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int d = bottom_blob.d;
        int channels = bottom_blob.c * elempack;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = channels - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? channels + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((channels - q) / (top_blobs.size() - i));
                }
            }

            int out_elempack = 1;
#if __ARM_NEON
            if (opt.use_packing_layout)
            {
                out_elempack = slice % 4 == 0 ? 4 : 1;
            }
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            Mat& top_blob = top_blobs[i];
            top_blob.create(w, h, d, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            top_blob.dims = dims;

            q += slice;
        }

        size_t out_elemsize = top_blobs[0].elemsize;
        int out_elempack = top_blobs[0].elempack;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize);
            out_elempack = std::min(out_elempack, top_blobs[i].elempack);
        }

        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack > out_elempack)
        {
            convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt);
            if (bottom_blob_unpacked.empty())
                return -100;
        }

        int p = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            Mat& top_blob = top_blobs[i];

            if (out_elempack == 1 && top_blob.elempack == 4)
            {
                int size = top_blob.w * top_blob.h * top_blob.d;

                for (int q = 0; q < top_blob.c; q++)
                {
                    const float* r0 = bottom_blob_unpacked.channel(p);
                    const float* r1 = bottom_blob_unpacked.channel(p + 1);
                    const float* r2 = bottom_blob_unpacked.channel(p + 2);
                    const float* r3 = bottom_blob_unpacked.channel(p + 3);

                    float* outptr0 = top_blob.channel(q);

                    for (int j = 0; j < size; j++)
                    {
                        outptr0[0] = *r0++;
                        outptr0[1] = *r1++;
                        outptr0[2] = *r2++;
                        outptr0[3] = *r3++;

                        outptr0 += 4;
                    }

                    p += 4;
                }
            }
            else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4)
            {
                int size = top_blob.total();

                const float* ptr = bottom_blob_unpacked.channel(p);
                float* outptr = top_blob;
                memcpy(outptr, ptr, size * top_blob.elemsize);

                p += top_blob.c;
            }
        }
    }

    if ((dims == 3 && positive_axis == 1) || (dims == 4 && positive_axis == 2))
    {
        // slice dim height
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int d = bottom_blob.d;
        int channels = bottom_blob.c;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = h - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? h + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((h - q) / (top_blobs.size() - i));
                }
            }

            Mat& top_blob = top_blobs[i];
            top_blob.create(w, slice, d, channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            top_blob.dims = dims;

            q += slice;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < channels; p++)
        {
            const float* ptr = bottom_blob.channel(p);

            for (int j = 0; j < d; j++)
            {
                for (size_t i = 0; i < top_blobs.size(); i++)
                {
                    Mat& top_blob = top_blobs[i];

                    int size = top_blob.w * top_blob.h;

                    float* outptr = top_blob.channel(p).depth(j);
                    memcpy(outptr, ptr, size * elemsize);

                    ptr += size * elempack;
                }
            }
        }
    }

    if ((dims == 3 && positive_axis == 2) || (dims == 4 && positive_axis == 3))
    {
        // slice dim width
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int d = bottom_blob.d;
        int channels = bottom_blob.c;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = w - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? w + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((w - q) / (top_blobs.size() - i));
                }
            }

            Mat& top_blob = top_blobs[i];
            top_blob.create(slice, h, d, channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            top_blob.dims = dims;

            q += slice;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < channels; p++)
        {
            const float* ptr = bottom_blob.channel(p);

            for (int j = 0; j < d; j++)
            {
                for (int k = 0; k < h; k++)
                {
                    for (size_t i = 0; i < top_blobs.size(); i++)
                    {
                        Mat& top_blob = top_blobs[i];

                        float* outptr = top_blob.channel(p).depth(j).row(k);
                        memcpy(outptr, ptr, top_blob.w * elemsize);

                        ptr += top_blob.w * elempack;
                    }
                }
            }
        }
    }

    if (dims == 4 && positive_axis == 1)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int d = bottom_blob.d;
        int channels = bottom_blob.c;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = d - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? d + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((d - q) / (top_blobs.size() - i));
                }
            }

            Mat& top_blob = top_blobs[i];
            top_blob.create(w, h, slice, channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            q += slice;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < channels; p++)
        {
            const float* ptr = bottom_blob.channel(p);

            for (size_t i = 0; i < top_blobs.size(); i++)
            {
                Mat& top_blob = top_blobs[i];

                int size = top_blob.w * top_blob.h * top_blob.d;

                float* outptr = top_blob.channel(p);
                memcpy(outptr, ptr, size * elemsize);

                ptr += size * elempack;
            }
        }
    }

    return 0;
}

int Slice_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;
    const int* slices_ptr = slices;
    const int* indices_ptr = indices;
    int positive_axis = axis < 0 ? dims + axis : axis;

    if (dims == 1) // positive_axis == 0
    {
        // slice vector
        int w = bottom_blob.w * elempack;
        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = w - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? w + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((w - q) / (top_blobs.size() - i));
                }
            }

            int out_elempack = 1;
#if __ARM_NEON
            if (opt.use_packing_layout)
            {
#if NCNN_ARM82
                out_elempack = support_fp16_storage && opt.use_fp16_arithmetic && slice % 8 == 0 ? 8 : slice % 4 == 0 ? 4 : 1;
#else
                out_elempack = slice % 4 == 0 ? 4 : 1;
#endif
            }
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            Mat& top_blob = top_blobs[i];
            top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            const unsigned short* ptr = (const unsigned short*)bottom_blob + q;
            unsigned short* outptr = top_blob;
            memcpy(outptr, ptr, top_blob.w * top_blob.elemsize);

            q += slice;
        }
    }

    if (dims == 2 && positive_axis == 0)
    {
        // slice image height
        int w = bottom_blob.w;
        int h = bottom_blob.h * elempack;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = h - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? h + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((h - q) / (top_blobs.size() - i));
                }
            }

            int out_elempack = 1;
#if __ARM_NEON
            if (opt.use_packing_layout)
            {
#if NCNN_ARM82
                out_elempack = support_fp16_storage && opt.use_fp16_arithmetic && slice % 8 == 0 ? 8 : slice % 4 == 0 ? 4 : 1;
#else
                out_elempack = slice % 4 == 0 ? 4 : 1;
#endif
            }
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            Mat& top_blob = top_blobs[i];
            top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            q += slice;
        }

        size_t out_elemsize = top_blobs[0].elemsize;
        int out_elempack = top_blobs[0].elempack;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize);
            out_elempack = std::min(out_elempack, top_blobs[i].elempack);
        }

        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack > out_elempack)
        {
            convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt);
            if (bottom_blob_unpacked.empty())
                return -100;
        }

        const unsigned short* ptr = bottom_blob_unpacked;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            Mat& top_blob = top_blobs[i];

#if NCNN_ARM82
            if (out_elempack == 4 && top_blob.elempack == 8)
            {
                for (int j = 0; j < top_blob.h; j++)
                {
                    const unsigned short* r0 = ptr;
                    const unsigned short* r1 = ptr + w * 4;

                    unsigned short* outptr0 = top_blob.row<unsigned short>(j);

                    for (int j = 0; j < w; j++)
                    {
                        outptr0[0] = r0[0];
                        outptr0[1] = r0[1];
                        outptr0[2] = r0[2];
                        outptr0[3] = r0[3];
                        outptr0[4] = r1[0];
                        outptr0[5] = r1[1];
                        outptr0[6] = r1[2];
                        outptr0[7] = r1[3];

                        r0 += 4;
                        r1 += 4;
                        outptr0 += 8;
                    }

                    ptr += w * 8;
                }
            }
            if (out_elempack == 1 && top_blob.elempack == 8)
            {
                for (int j = 0; j < top_blob.h; j++)
                {
                    const unsigned short* r0 = ptr;
                    const unsigned short* r1 = ptr + w;
                    const unsigned short* r2 = ptr + w * 2;
                    const unsigned short* r3 = ptr + w * 3;
                    const unsigned short* r4 = ptr + w * 4;
                    const unsigned short* r5 = ptr + w * 5;
                    const unsigned short* r6 = ptr + w * 6;
                    const unsigned short* r7 = ptr + w * 7;

                    unsigned short* outptr0 = top_blob.row<unsigned short>(j);

                    for (int j = 0; j < w; j++)
                    {
                        outptr0[0] = *r0++;
                        outptr0[1] = *r1++;
                        outptr0[2] = *r2++;
                        outptr0[3] = *r3++;
                        outptr0[4] = *r4++;
                        outptr0[5] = *r5++;
                        outptr0[6] = *r6++;
                        outptr0[7] = *r7++;

                        outptr0 += 8;
                    }

                    ptr += w * 8;
                }
            }
#endif // NCNN_ARM82
            if (out_elempack == 1 && top_blob.elempack == 4)
            {
                for (int j = 0; j < top_blob.h; j++)
                {
                    const unsigned short* r0 = ptr;
                    const unsigned short* r1 = ptr + w;
                    const unsigned short* r2 = ptr + w * 2;
                    const unsigned short* r3 = ptr + w * 3;

                    unsigned short* outptr0 = top_blob.row<unsigned short>(j);

                    for (int j = 0; j < w; j++)
                    {
                        outptr0[0] = *r0++;
                        outptr0[1] = *r1++;
                        outptr0[2] = *r2++;
                        outptr0[3] = *r3++;

                        outptr0 += 4;
                    }

                    ptr += w * 4;
                }
            }
            if (out_elempack == top_blob.elempack) // 1-1 4-4 8-8
            {
                int size = w * top_blob.h;

                unsigned short* outptr = top_blob;
                memcpy(outptr, ptr, size * top_blob.elemsize);

                ptr += size * top_blob.elempack;
            }
        }
    }

    if (dims == 2 && positive_axis == 1)
    {
        // slice image width
        int w = bottom_blob.w;
        int h = bottom_blob.h;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = w - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? w + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((w - q) / (top_blobs.size() - i));
                }
            }

            Mat& top_blob = top_blobs[i];
            top_blob.create(slice, h, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            q += slice;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int j = 0; j < h; j++)
        {
            const unsigned short* ptr = bottom_blob.row<const unsigned short>(j);
            for (size_t i = 0; i < top_blobs.size(); i++)
            {
                Mat& top_blob = top_blobs[i];

                unsigned short* outptr = top_blob.row<unsigned short>(j);
                memcpy(outptr, ptr, top_blob.w * elemsize);

                ptr += top_blob.w * elempack;
            }
        }
    }

    if ((dims == 3 || dims == 4) && positive_axis == 0)
    {
        // slice dim channel
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int d = bottom_blob.d;
        int channels = bottom_blob.c * elempack;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = channels - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? channels + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((channels - q) / (top_blobs.size() - i));
                }
            }

            int out_elempack = 1;
#if __ARM_NEON
            if (opt.use_packing_layout)
            {
#if NCNN_ARM82
                out_elempack = support_fp16_storage && opt.use_fp16_arithmetic && slice % 8 == 0 ? 8 : slice % 4 == 0 ? 4 : 1;
#else
                out_elempack = slice % 4 == 0 ? 4 : 1;
#endif
            }
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            Mat& top_blob = top_blobs[i];
            top_blob.create(w, h, d, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            top_blob.dims = dims;

            q += slice;
        }

        size_t out_elemsize = top_blobs[0].elemsize;
        int out_elempack = top_blobs[0].elempack;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize);
            out_elempack = std::min(out_elempack, top_blobs[i].elempack);
        }

        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack > out_elempack)
        {
            convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt);
            if (bottom_blob_unpacked.empty())
                return -100;
        }

        int p = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            Mat& top_blob = top_blobs[i];

#if NCNN_ARM82
            if (out_elempack == 4 && top_blob.elempack == 8)
            {
                int size = top_blob.w * top_blob.h * top_blob.d;

                for (int q = 0; q < top_blob.c; q++)
                {
                    const unsigned short* r0 = bottom_blob_unpacked.channel(p);
                    const unsigned short* r1 = bottom_blob_unpacked.channel(p + 1);

                    unsigned short* outptr0 = top_blob.channel(q);

                    for (int j = 0; j < size; j++)
                    {
                        outptr0[0] = r0[0];
                        outptr0[1] = r0[1];
                        outptr0[2] = r0[2];
                        outptr0[3] = r0[3];
                        outptr0[4] = r1[0];
                        outptr0[5] = r1[1];
                        outptr0[6] = r1[2];
                        outptr0[7] = r1[3];

                        r0 += 4;
                        r1 += 4;
                        outptr0 += 8;
                    }

                    p += 2;
                }
            }
            if (out_elempack == 1 && top_blob.elempack == 8)
            {
                int size = top_blob.w * top_blob.h * top_blob.d;

                for (int q = 0; q < top_blob.c; q++)
                {
                    const unsigned short* r0 = bottom_blob_unpacked.channel(p);
                    const unsigned short* r1 = bottom_blob_unpacked.channel(p + 1);
                    const unsigned short* r2 = bottom_blob_unpacked.channel(p + 2);
                    const unsigned short* r3 = bottom_blob_unpacked.channel(p + 3);
                    const unsigned short* r4 = bottom_blob_unpacked.channel(p + 4);
                    const unsigned short* r5 = bottom_blob_unpacked.channel(p + 5);
                    const unsigned short* r6 = bottom_blob_unpacked.channel(p + 6);
                    const unsigned short* r7 = bottom_blob_unpacked.channel(p + 7);

                    unsigned short* outptr0 = top_blob.channel(q);

                    for (int j = 0; j < size; j++)
                    {
                        outptr0[0] = *r0++;
                        outptr0[1] = *r1++;
                        outptr0[2] = *r2++;
                        outptr0[3] = *r3++;
                        outptr0[4] = *r4++;
                        outptr0[5] = *r5++;
                        outptr0[6] = *r6++;
                        outptr0[7] = *r7++;

                        outptr0 += 8;
                    }

                    p += 8;
                }
            }
#endif // NCNN_ARM82
            if (out_elempack == 1 && top_blob.elempack == 4)
            {
                int size = top_blob.w * top_blob.h * top_blob.d;

                for (int q = 0; q < top_blob.c; q++)
                {
                    const unsigned short* r0 = bottom_blob_unpacked.channel(p);
                    const unsigned short* r1 = bottom_blob_unpacked.channel(p + 1);
                    const unsigned short* r2 = bottom_blob_unpacked.channel(p + 2);
                    const unsigned short* r3 = bottom_blob_unpacked.channel(p + 3);

                    unsigned short* outptr0 = top_blob.channel(q);

                    for (int j = 0; j < size; j++)
                    {
                        outptr0[0] = *r0++;
                        outptr0[1] = *r1++;
                        outptr0[2] = *r2++;
                        outptr0[3] = *r3++;

                        outptr0 += 4;
                    }

                    p += 4;
                }
            }
            if (out_elempack == top_blob.elempack) // 1-1 4-4 8-8
            {
                int size = top_blob.total();

                const unsigned short* ptr = bottom_blob_unpacked.channel(p);
                unsigned short* outptr = top_blob;
                memcpy(outptr, ptr, size * top_blob.elemsize);

                p += top_blob.c;
            }
        }
    }

    if ((dims == 3 && positive_axis == 1) || (dims == 4 && positive_axis == 2))
    {
        // slice dim height
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int d = bottom_blob.d;
        int channels = bottom_blob.c;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = h - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? h + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((h - q) / (top_blobs.size() - i));
                }
            }

            Mat& top_blob = top_blobs[i];
            top_blob.create(w, slice, d, channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            top_blob.dims = dims;

            q += slice;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < channels; p++)
        {
            const unsigned short* ptr = bottom_blob.channel(p);

            for (int j = 0; j < d; j++)
            {
                for (size_t i = 0; i < top_blobs.size(); i++)
                {
                    Mat& top_blob = top_blobs[i];

                    int size = top_blob.w * top_blob.h;

                    unsigned short* outptr = top_blob.channel(p).depth(j);
                    memcpy(outptr, ptr, size * elemsize);

                    ptr += size * elempack;
                }
            }
        }
    }

    if ((dims == 3 && positive_axis == 2) || (dims == 4 && positive_axis == 3))
    {
        // slice dim width
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int d = bottom_blob.d;
        int channels = bottom_blob.c;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = w - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? w + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((w - q) / (top_blobs.size() - i));
                }
            }

            Mat& top_blob = top_blobs[i];
            top_blob.create(slice, h, d, channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            top_blob.dims = dims;

            q += slice;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < channels; p++)
        {
            const unsigned short* ptr = bottom_blob.channel(p);

            for (int j = 0; j < d; j++)
            {
                for (int k = 0; k < h; k++)
                {
                    for (size_t i = 0; i < top_blobs.size(); i++)
                    {
                        Mat& top_blob = top_blobs[i];

                        unsigned short* outptr = top_blob.channel(p).depth(j).row<unsigned short>(k);
                        memcpy(outptr, ptr, top_blob.w * elemsize);

                        ptr += top_blob.w * elempack;
                    }
                }
            }
        }
    }

    if (dims == 4 && positive_axis == 1)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int d = bottom_blob.d;
        int channels = bottom_blob.c;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = d - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? d + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((d - q) / (top_blobs.size() - i));
                }
            }

            Mat& top_blob = top_blobs[i];
            top_blob.create(w, h, slice, channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            q += slice;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < channels; p++)
        {
            const unsigned short* ptr = bottom_blob.channel(p);

            for (size_t i = 0; i < top_blobs.size(); i++)
            {
                Mat& top_blob = top_blobs[i];

                int size = top_blob.w * top_blob.h * top_blob.d;

                unsigned short* outptr = top_blob.channel(p);
                memcpy(outptr, ptr, size * elemsize);

                ptr += size * elempack;
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/arm/slice_arm.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_SLICE_ARM_H
#define LAYER_SLICE_ARM_H

#include "slice.h"

namespace ncnn {

// Slice layer variant that lives under src/layer/arm. It derives from the
// generic Slice layer and declares its own multi-blob forward pass.
class Slice_arm : public Slice
{
public:
    Slice_arm();

    // Forward pass over a list of input blobs (bottom_blobs, read-only) and a
    // list of output blobs (top_blobs, passed by mutable reference).
    // Returns an int status code.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // Same parameter list as forward(). Judging by the name this is the path
    // used for 16-bit (bf16 / fp16) storage -- NOTE(review): confirm against
    // the definition in slice_arm.cpp.
    int forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_SLICE_ARM_H


================================================
FILE: src/layer/arm/softmax_arm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "softmax_arm.h"

#include <float.h>

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

// Sets the layer capability flags according to the build configuration.
Softmax_arm::Softmax_arm()
{
#if __ARM_NEON
    // packed layouts are only advertised when the build has NEON
    support_packing = true;
#if NCNN_ARM82
    // fp16 storage is taken from whatever cpu_support_arm_asimdhp() reports
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif
#endif // __ARM_NEON

#if NCNN_BF16
    // bf16 storage is advertised whenever NCNN_BF16 is compiled in
    support_bf16_storage = true;
#endif
}

// In-place softmax over a contiguous run of elemcount * elempack floats.
//
// With elempack == 1 the whole run is normalized as a single vector.
// In NEON builds with elempack != 1 the horizontal lane reduction is skipped,
// so each of the four vector lanes keeps its own maximum and its own sum.
static void softmax(float* _ptr, int elemcount, int elempack)
{
    const int total = elemcount * elempack;

    // pass 1: find the maximum
    float maxval = -FLT_MAX;
#if __ARM_NEON
    float32x4_t _vmax = vdupq_n_f32(-FLT_MAX);
#endif // __ARM_NEON
    {
        int k = 0;
#if __ARM_NEON
        for (; k + 3 < total; k += 4)
        {
            _vmax = vmaxq_f32(_vmax, vld1q_f32(_ptr + k));
        }
#endif // __ARM_NEON
        for (; k < total; k++)
        {
            maxval = std::max(maxval, _ptr[k]);
        }
    }

#if __ARM_NEON
    if (elempack == 1)
    {
        // fold the four lanes into the scalar maximum, then broadcast it back
#if __aarch64__
        maxval = std::max(maxval, vmaxvq_f32(_vmax));
#else
        float32x2_t _half = vmax_f32(vget_low_f32(_vmax), vget_high_f32(_vmax));
        float32x2_t _pair = vpmax_f32(_half, _half);
        maxval = std::max(maxval, vget_lane_f32(_pair, 0));
#endif

        _vmax = vdupq_n_f32(maxval);
    }
#endif // __ARM_NEON

    // pass 2: replace every value by exp(x - max) and accumulate the sum
    float expsum = 0.f;
#if __ARM_NEON
    float32x4_t _vsum = vdupq_n_f32(0.f);
#endif // __ARM_NEON
    {
        int k = 0;
#if __ARM_NEON
        for (; k + 3 < total; k += 4)
        {
            float32x4_t _e = exp_ps(vsubq_f32(vld1q_f32(_ptr + k), _vmax));
            vst1q_f32(_ptr + k, _e);
            _vsum = vaddq_f32(_vsum, _e);
        }
#endif // __ARM_NEON
        for (; k < total; k++)
        {
            const float e = expf(_ptr[k] - maxval);
            _ptr[k] = e;
            expsum += e;
        }
    }

#if __ARM_NEON
    if (elempack == 1)
    {
        // fold the four partial sums into the scalar sum, then broadcast it back
#if __aarch64__
        expsum += vaddvq_f32(_vsum);
#else
        float32x2_t _half = vadd_f32(vget_low_f32(_vsum), vget_high_f32(_vsum));
        float32x2_t _pair = vpadd_f32(_half, _half);
        expsum += vget_lane_f32(_pair, 0);
#endif

        _vsum = vdupq_n_f32(expsum);
    }
#endif // __ARM_NEON

    // turn the sums into reciprocals so the last pass is a plain multiply
#if __ARM_NEON
    const float32x4_t _vscale = div_ps(vdupq_n_f32(1.f), _vsum);
#endif // __ARM_NEON
    const float scale = 1.f / expsum;

    // pass 3: normalize
    {
        int k = 0;
#if __ARM_NEON
        for (; k + 3 < total; k += 4)
        {
            vst1q_f32(_ptr + k, vmulq_f32(vld1q_f32(_ptr + k), _vscale));
        }
#endif // __ARM_NEON
        for (; k < total; k++)
        {
            _ptr[k] *= scale;
        }
    }
}

#if __ARM_NEON
static void softmax_pack4(float* _ptr, int elemcount, size_t stride, int size1, float* _maxptr, float* _sumptr)
{
    // reduce max
    for (int i = 0; i < elemcount; i++)
    {
        const float* ptr = _ptr + i * stride;
        float* maxptr = _maxptr;

        int j = 0;
        for (; j + 3 < size1; j += 4)
        {
            float32x4x4_t _p = vld4q_f32(ptr);
            float32x4_t _max = vld1q_f32(maxptr);
            float32x4_t _max2 = vmaxq_f32(_p.val[0], _p.val[1]);
            float32x4_t _max4 = vmaxq_f32(_p.val[2], _p.val[3]);
            _max = vmaxq_f32(_max, vmaxq_f32(_max2, _max4));
            vst1q_f32(maxptr, _max);
            ptr += 16;
            maxptr += 4;
        }
        for (; j < size1; j++)
        {
            float32x4_t _p = vld1q_f32(ptr);
#if __aarch64__
            float max0 = vmaxvq_f32(_p);
#else
            float32x2_t _max2 = vmax_f32(vget_low_f32(_p), vget_high_f32(_p));
            float32x2_t _mm2 = vpmax_f32(_max2, _max2);
            float max0 = vget_lane_f32(_mm2, 0);
#endif
            *maxptr = std::max(*maxptr, max0);
            ptr += 4;
            maxptr++;
        }
    }

    // reduce exp(x - max)
    for (int i = 0; i < elemcount; i++)
    {
        float* ptr = _ptr + i * stride;
        const float* maxptr = _maxptr;
        float* sumptr = _sumptr;

        int j = 0;
        for (; j + 3 < size1; j += 4)
        {
            float32x4x4_t _p = vld4q_f32(ptr);
            float32x4_t _max = vld1q_f32(maxptr);
            float32x4_t _p0 = vsubq_f32(_p.val[0], _max);
            float32x4_t _p1 = vsubq_f32(_p.val[1], _max);
            float32x4_t _p2 = vsubq_f32(_p.val[2], _max);
            float32x4_t _p3 = vsubq_f32(_p.val[3], _max);
            _p.val[0] = exp_ps(_p0);
            _p.val[1] = exp_ps(_p1);
            _p.val[2] = exp_ps(_p2);
            _p.val[3] = exp_ps(_p3);
            vst4q_f32(ptr, _p);
            float32x4_t _sum = vld1q_f32(sumptr);
            float32x4_t _ss2 = vaddq_f32(_p.val[0], _p.val[1]);
            float32x4_t _ss4 = vaddq_f32(_p.val[2], _p.val[3]);
            _sum = vaddq_f32(_sum, vaddq_f32(_ss2, _ss4));
            vst1q_f32(sumptr, _sum);
            ptr += 16;
            maxptr += 4;
            sumptr += 4;
        }
        for (; j < size1; j++)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _max = vdupq_n_f32(*maxptr);
            _p = exp_ps(vsubq_f32(_p, _max));
            vst1q_f32(ptr, _p);
#if __aarch64__
            float sum0 = vaddvq_f32(_p);
#else
            float32x2_t _sum2 = vadd_f32(vget_low_f32(_p), vget_high_f32(_p));
            float32x2_t _ss2 = vpadd_f32(_sum2, _sum2);
            float sum0 = vget_lane_f32(_ss2, 0);
#endif
            *sumptr += sum0;
            ptr += 4;
            maxptr++;
            sumptr++;
        }
    }

    {
        float32x4_t _one = vdupq_n_f32(1.f);
        float* sumptr = _sumptr;
        int j = 0;
        for (; j + 3 < size1; j += 4)
        {
            float32x4_t _sum = vld1q_f32(sumptr);
            _sum = div_ps(_one, _sum);
            vst1q_f32(sumptr, _sum);
            sumptr += 4;
        }
        for (; j < size1; j++)
        {
            *sumptr = 1.f / *sumptr;
            sumptr++;
        }
    }

    // div sum
    for (int i = 0; i < elemcount; i++)
    {
        float* ptr = _ptr + i * stride;
        const float* sumptr = _sumptr;

        int j = 0;
        for (; j + 3 < size1; j += 4)
        {
            float32x4x4_t _p = vld4q_f32(ptr);
            float32x4_t _sum = vld1q_f32(sumptr);
            _p.val[0] = vmulq_f32(_p.val[0], _sum);
            _p.val[1] = vmulq_f32(_p.val[1], _sum);
            _p.val[2] = vmulq_f32(_p.val[2], _sum);
            _p.val[3] = vmulq_f32(_p.val[3], _sum);
            vst4q_f32(ptr, _p);
            ptr += 16;
            sumptr += 4;
        }
        for (; j < size1; j++)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _sum = vld1q_dup_f32(sumptr);
            _p = vmulq_f32(_p, _sum);
            vst1q_f32(ptr, _p);
            ptr += 4;
            sumptr++;
        }
    }
}
#endif // __ARM_NEON

// In-place softmax across elemcount rows for size1 independent columns.
// Row r starts at _ptr + r * stride; only the first size1 floats of each row
// are touched.
//
// _maxptr and _sumptr are size1-long scratch buffers that are read and
// updated in place, so they must already hold the starting maxima and sums.
// On return _sumptr holds the reciprocal of each column sum.
static void softmax_pack1(float* _ptr, int elemcount, size_t stride, int size1, float* _maxptr, float* _sumptr)
{
    // pass 1: column-wise running maximum
    for (int r = 0; r < elemcount; r++)
    {
        const float* line = _ptr + r * stride;

        int k = 0;
#if __ARM_NEON
        for (; k + 3 < size1; k += 4)
        {
            const float32x4_t _best = vmaxq_f32(vld1q_f32(_maxptr + k), vld1q_f32(line + k));
            vst1q_f32(_maxptr + k, _best);
        }
#endif // __ARM_NEON
        for (; k < size1; k++)
        {
            _maxptr[k] = std::max(_maxptr[k], line[k]);
        }
    }

    // pass 2: store exp(x - max) and accumulate the column sums
    for (int r = 0; r < elemcount; r++)
    {
        float* line = _ptr + r * stride;

        int k = 0;
#if __ARM_NEON
        for (; k + 3 < size1; k += 4)
        {
            const float32x4_t _acc = vld1q_f32(_sumptr + k);
            const float32x4_t _e = exp_ps(vsubq_f32(vld1q_f32(line + k), vld1q_f32(_maxptr + k)));
            vst1q_f32(line + k, _e);
            vst1q_f32(_sumptr + k, vaddq_f32(_acc, _e));
        }
#endif // __ARM_NEON
        for (; k < size1; k++)
        {
            const float e = expf(line[k] - _maxptr[k]);
            line[k] = e;
            _sumptr[k] += e;
        }
    }

    // pass 3: turn every sum into its reciprocal
    {
        int k = 0;
#if __ARM_NEON
        const float32x4_t _ones = vdupq_n_f32(1.f);
        for (; k + 3 < size1; k += 4)
        {
            vst1q_f32(_sumptr + k, div_ps(_ones, vld1q_f32(_sumptr + k)));
        }
#endif // __ARM_NEON
        for (; k < size1; k++)
        {
            _sumptr[k] = 1.f / _sumptr[k];
        }
    }

    // pass 4: scale every value by the reciprocal sum of its column
    for (int r = 0; r < elemcount; r++)
    {
        float* line = _ptr + r * stride;

        int k = 0;
#if __ARM_NEON
        for (; k + 3 < size1; k += 4)
        {
            vst1q_f32(line + k, vmulq_f32(vld1q_f32(line + k), vld1q_f32(_sumptr + k)));
        }
#endif // __ARM_NEON
        for (; k < size1; k++)
        {
            line[k] *= _sumptr[k];
        }
    }
}

// Strided softmax entry point: resets the two size1-long scratch buffers and
// hands the work to the kernel matching elempack (4 in NEON builds, or 1).
// Any other elempack value leaves the data untouched.
static void softmax(float* _ptr, int elemcount, int elempack, size_t stride, int size1, float* _maxptr, float* _sumptr)
{
    // start every running maximum at the lowest finite float
    {
        int k = 0;
#if __ARM_NEON
        const float32x4_t _lowest = vdupq_n_f32(-FLT_MAX);
        for (; k + 3 < size1; k += 4)
        {
            vst1q_f32(_maxptr + k, _lowest);
        }
#endif // __ARM_NEON
        for (; k < size1; k++)
        {
            _maxptr[k] = -FLT_MAX;
        }
    }

    // start every running sum at zero
    {
        int k = 0;
#if __ARM_NEON
        const float32x4_t _nil = vdupq_n_f32(0.f);
        for (; k + 3 < size1; k += 4)
        {
            vst1q_f32(_sumptr + k, _nil);
        }
#endif // __ARM_NEON
        for (; k < size1; k++)
        {
            _sumptr[k] = 0.f;
        }
    }

#if __ARM_NEON
    if (elempack == 4)
    {
        softmax_pack4(_ptr, elemcount, stride, size1, _maxptr, _sumptr);
        return;
    }
#endif // __ARM_NEON
    if (elempack == 1)
    {
        softmax_pack1(_ptr, elemcount, stride, size1, _maxptr, _sumptr);
    }
}

// Applies softmax in place along `axis` (negative values count from the last
// dimension). Blobs with 16-bit elements are forwarded to the fp16 / bf16
// implementations when those are enabled.
// Returns 0 on success and -100 when a scratch buffer cannot be allocated.
int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int elembits = bottom_top_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        return forward_inplace_fp16s(bottom_top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
    {
        return forward_inplace_bf16s(bottom_top_blob, opt);
    }
#endif

    const int dims = bottom_top_blob.dims;
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int d = bottom_top_blob.d;
    const int channels = bottom_top_blob.c;
    const int elempack = bottom_top_blob.elempack;
    const int positive_axis = axis >= 0 ? axis : dims + axis;

    if (dims == 1) // positive_axis == 0
    {
        // the packed vector is handled as one flat run of scalars
        softmax((float*)bottom_top_blob, w * elempack, 1);
    }

    if (dims == 2 && positive_axis == 0)
    {
        // reduce over rows; the columns are split into per-thread chunks
        const int columns = w;
        const int chunk = (columns + (opt.num_threads - 1)) / opt.num_threads;
        const size_t row_stride = (size_t)w * elempack;

        // one channel per thread: row 0 holds maxima, row 1 holds sums
        Mat scratch(chunk, 2, opt.num_threads, 4u, opt.workspace_allocator);
        if (scratch.empty())
            return -100;

        const int chunk_count = (columns + chunk - 1) / chunk;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ci = 0; ci < chunk_count; ci++)
        {
            const int begin = ci * chunk;
            const int count = std::min(chunk, columns - begin);

            float* maxbuf = scratch.channel(get_omp_thread_num());
            float* sumbuf = maxbuf + chunk;

            float* first = (float*)bottom_top_blob + begin * elempack;

            softmax(first, h, elempack, row_stride, count, maxbuf, sumbuf);
        }
    }

    if (dims == 2 && positive_axis == 1)
    {
        // every row is an independent contiguous softmax
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            softmax(bottom_top_blob.row(y), w, elempack);
        }
    }

    if ((dims == 3 || dims == 4) && positive_axis == 0)
    {
        // reduce over channels; the positions inside a channel are split
        // into per-thread chunks
        const int positions = w * h * d;
        const int chunk = (positions + (opt.num_threads - 1)) / opt.num_threads;
        const size_t channel_stride = bottom_top_blob.cstep * elempack;

        Mat scratch(chunk, 2, opt.num_threads, 4u, opt.workspace_allocator);
        if (scratch.empty())
            return -100;

        const int chunk_count = (positions + chunk - 1) / chunk;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ci = 0; ci < chunk_count; ci++)
        {
            const int begin = ci * chunk;
            const int count = std::min(chunk, positions - begin);

            float* maxbuf = scratch.channel(get_omp_thread_num());
            float* sumbuf = maxbuf + chunk;

            float* first = (float*)bottom_top_blob + begin * elempack;

            softmax(first, channels, elempack, channel_stride, count, maxbuf, sumbuf);
        }
    }

    if ((dims == 3 && positive_axis == 1) || (dims == 4 && positive_axis == 2))
    {
        // reduce over h; a packed row is treated as w * elempack scalars
        const int row_len = w * elempack;

        Mat scratch(row_len, 2, opt.num_threads, 4u, opt.workspace_allocator);
        if (scratch.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            for (int z = 0; z < d; z++)
            {
                float* plane = bottom_top_blob.channel(q).depth(z);

                float* maxbuf = scratch.channel(get_omp_thread_num());
                float* sumbuf = maxbuf + row_len;

                softmax(plane, h, 1, row_len, row_len, maxbuf, sumbuf);
            }
        }
    }

    if (dims == 3 && positive_axis == 2)
    {
        // every row of every channel is an independent contiguous softmax
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* line = bottom_top_blob.channel(q);

            for (int y = 0; y < h; y++)
            {
                softmax(line, w, elempack);
                line += w * elempack;
            }
        }
    }

    if (dims == 4 && positive_axis == 1)
    {
        // reduce over d; a packed plane is treated as w * h * elempack scalars
        const int plane_len = w * h * elempack;

        Mat scratch(plane_len, 2, opt.num_threads, 4u, opt.workspace_allocator);
        if (scratch.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* volume = bottom_top_blob.channel(q);

            float* maxbuf = scratch.channel(get_omp_thread_num());
            float* sumbuf = maxbuf + plane_len;

            softmax(volume, d, 1, plane_len, plane_len, maxbuf, sumbuf);
        }
    }

    if (dims == 4 && positive_axis == 3)
    {
        // every row of every depth slice is an independent contiguous softmax
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* line = bottom_top_blob.channel(q);

            for (int z = 0; z < d; z++)
            {
                for (int y = 0; y < h; y++)
                {
                    softmax(line, w, elempack);
                    line += w * elempack;
                }
            }
        }
    }

    return 0;
}

#if NCNN_BF16
// In-place softmax over one contiguous run of bf16 values (stored as unsigned short).
//
//   _ptr      first of elemcount * elempack values; overwritten with the result
//   elemcount number of packed elements in the run
//   elempack  number of values per packed element
//
// All arithmetic is done in fp32; values are converted from / to bf16 on every
// load and store. In the NEON build the four vector lanes are collapsed into a
// single max / sum only when elempack == 1. For any other elempack the lanes are
// left separate, so values whose flat index is equal modulo 4 are normalized
// together (for elempack == 4 the total size is a multiple of 4, so the scalar
// tail loops below do not execute).
static void softmax_bf16s(unsigned short* _ptr, int elemcount, int elempack)
{
    const int size = elemcount * elempack;

    // pass 1: reduce max
#if __ARM_NEON
    float32x4_t _max = vdupq_n_f32(-FLT_MAX);
#endif // __ARM_NEON
    float max = -FLT_MAX;
    {
        const unsigned short* ptr = _ptr;

        int i = 0;
#if __ARM_NEON
        // second accumulator so the 8-wide loop keeps two independent max chains
        float32x4_t _max1 = vdupq_n_f32(-FLT_MAX);
        for (; i + 7 < size; i += 8)
        {
            uint16x8_t _p = vld1q_u16(ptr);
            float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
            float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
            _max = vmaxq_f32(_max, _p0);
            _max1 = vmaxq_f32(_max1, _p1);
            ptr += 8;
        }
        _max = vmaxq_f32(_max, _max1);
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = bfloat2float(vld1_u16(ptr));
            _max = vmaxq_f32(_max, _p);
            ptr += 4;
        }
#endif // __ARM_NEON
        // scalar tail (and the whole input when NEON is unavailable)
        for (; i < size; i++)
        {
            max = std::max(max, bfloat16_to_float32(*ptr++));
        }
    }

#if __ARM_NEON
    if (elempack == 1)
    {
        // fold the vector lanes and the scalar tail into one maximum,
        // then broadcast it so the vector and scalar paths agree
#if __aarch64__
        max = std::max(max, vmaxvq_f32(_max));
#else
        float32x2_t _max2 = vmax_f32(vget_low_f32(_max), vget_high_f32(_max));
        float32x2_t _mm2 = vpmax_f32(_max2, _max2);
        max = std::max(max, vget_lane_f32(_mm2, 0));
#endif

        _max = vdupq_n_f32(max);
    }
#endif // __ARM_NEON

    // pass 2: x = exp(x - max), written back as bf16, while accumulating the sum
#if __ARM_NEON
    float32x4_t _sum = vdupq_n_f32(0.f);
#endif // __ARM_NEON
    float sum = 0.f;
    {
        unsigned short* ptr = _ptr;

        int i = 0;
#if __ARM_NEON
        float32x4_t _sum1 = vdupq_n_f32(0.f);
        for (; i + 7 < size; i += 8)
        {
            uint16x8_t _p = vld1q_u16(ptr);
            float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
            float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
            _p0 = vsubq_f32(_p0, _max);
            _p1 = vsubq_f32(_p1, _max);
            _p0 = exp_ps(_p0);
            _p1 = exp_ps(_p1);
            _p = vcombine_u16(float2bfloat(_p0), float2bfloat(_p1));
            vst1q_u16(ptr, _p);
            // the sum is accumulated from the fp32 values, not from the stored bf16 ones
            _sum = vaddq_f32(_sum, _p0);
            _sum1 = vaddq_f32(_sum1, _p1);
            ptr += 8;
        }
        _sum = vaddq_f32(_sum, _sum1);
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = bfloat2float(vld1_u16(ptr));
            _p = vsubq_f32(_p, _max);
            _p = exp_ps(_p);
            vst1_u16(ptr, float2bfloat(_p));
            _sum = vaddq_f32(_sum, _p);
            ptr += 4;
        }
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            float v = expf(bfloat16_to_float32(*ptr) - max);
            *ptr = float32_to_bfloat16(v);
            sum += v;
            ptr++;
        }
    }

#if __ARM_NEON
    if (elempack == 1)
    {
        // fold the vector lanes into the scalar sum and broadcast the total
#if __aarch64__
        sum += vaddvq_f32(_sum);
#else
        float32x2_t _sum2 = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
        float32x2_t _ss2 = vpadd_f32(_sum2, _sum2);
        sum += vget_lane_f32(_ss2, 0);
#endif

        _sum = vdupq_n_f32(sum);
    }
#endif // __ARM_NEON

    // turn the sums into reciprocals so pass 3 can multiply instead of divide
#if __ARM_NEON
    _sum = div_ps(vdupq_n_f32(1.f), _sum);
#endif // __ARM_NEON
    sum = 1.f / sum;

    // pass 3: scale every value by 1 / sum
    {
        unsigned short* ptr = _ptr;

        int i = 0;
#if __ARM_NEON
        for (; i + 7 < size; i += 8)
        {
            uint16x8_t _p = vld1q_u16(ptr);
            float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
            float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
            _p0 = vmulq_f32(_p0, _sum);
            _p1 = vmulq_f32(_p1, _sum);
            _p = vcombine_u16(float2bfloat(_p0), float2bfloat(_p1));
            vst1q_u16(ptr, _p);
            ptr += 8;
        }
        for (; i + 3 < size; i += 4)
        {
            float32x4_t _p = bfloat2float(vld1_u16(ptr));
            _p = vmulq_f32(_p, _sum);
            vst1_u16(ptr, float2bfloat(_p));
            ptr += 4;
        }
#endif // __ARM_NEON
        for (; i < size; i++)
        {
            *ptr = float32_to_bfloat16(bfloat16_to_float32(*ptr) * sum);
            ptr++;
        }
    }
}

#if __ARM_NEON
// Strided in-place softmax for bf16 data with elempack == 4, where the reduction
// runs over elemcount packed elements AND over the 4 values inside each pack.
//
//   _ptr      first packed element; element i of position j lives at
//             _ptr + i * stride + j * 4 (all in unsigned short units)
//   elemcount number of packed elements along the reduced axis
//   stride    distance in unsigned shorts between consecutive reduced elements
//   size1     number of independent positions processed by this call
//   _maxptr   size1 floats of running maxima; read-modify-written here, so the
//             caller must initialise them (the 7-argument softmax_bf16s stores
//             -FLT_MAX before calling)
//   _sumptr   size1 floats of running sums; likewise initialised by the caller
//             (to 0) and left holding 1 / sum on return
static void softmax_bf16s_pack4(unsigned short* _ptr, int elemcount, size_t stride, int size1, float* _maxptr, float* _sumptr)
{
    // pass 1: reduce max
    for (int i = 0; i < elemcount; i++)
    {
        const unsigned short* ptr = _ptr + i * stride;
        float* maxptr = _maxptr;

        int j = 0;
        for (; j + 3 < size1; j += 4)
        {
            // vld4 de-interleaves 4 positions x 4 pack lanes: val[k] holds pack
            // lane k of the four positions, so a lane-wise max across val[0..3]
            // yields one maximum per position
            uint16x4x4_t _p = vld4_u16(ptr);
            float32x4_t _p0 = bfloat2float(_p.val[0]);
            float32x4_t _p1 = bfloat2float(_p.val[1]);
            float32x4_t _p2 = bfloat2float(_p.val[2]);
            float32x4_t _p3 = bfloat2float(_p.val[3]);
            float32x4_t _max = vld1q_f32(maxptr);
            float32x4_t _max2 = vmaxq_f32(_p0, _p1);
            float32x4_t _max4 = vmaxq_f32(_p2, _p3);
            _max = vmaxq_f32(_max, vmaxq_f32(_max2, _max4));
            vst1q_f32(maxptr, _max);
            ptr += 16;
            maxptr += 4;
        }
        for (; j < size1; j++)
        {
            // one position at a time: horizontal max over its 4 pack lanes
            float32x4_t _p = bfloat2float(vld1_u16(ptr));
#if __aarch64__
            float max0 = vmaxvq_f32(_p);
#else
            float32x2_t _max2 = vmax_f32(vget_low_f32(_p), vget_high_f32(_p));
            float32x2_t _mm2 = vpmax_f32(_max2, _max2);
            float max0 = vget_lane_f32(_mm2, 0);
#endif
            *maxptr = std::max(*maxptr, max0);
            ptr += 4;
            maxptr++;
        }
    }

    // pass 2: x = exp(x - max), written back as bf16, while accumulating the sums
    for (int i = 0; i < elemcount; i++)
    {
        unsigned short* ptr = _ptr + i * stride;
        const float* maxptr = _maxptr;
        float* sumptr = _sumptr;

        int j = 0;
        for (; j + 3 < size1; j += 4)
        {
            uint16x4x4_t _p = vld4_u16(ptr);
            float32x4_t _p0 = bfloat2float(_p.val[0]);
            float32x4_t _p1 = bfloat2float(_p.val[1]);
            float32x4_t _p2 = bfloat2float(_p.val[2]);
            float32x4_t _p3 = bfloat2float(_p.val[3]);
            float32x4_t _max = vld1q_f32(maxptr);
            _p0 = vsubq_f32(_p0, _max);
            _p1 = vsubq_f32(_p1, _max);
            _p2 = vsubq_f32(_p2, _max);
            _p3 = vsubq_f32(_p3, _max);
            _p0 = exp_ps(_p0);
            _p1 = exp_ps(_p1);
            _p2 = exp_ps(_p2);
            _p3 = exp_ps(_p3);
            _p.val[0] = float2bfloat(_p0);
            _p.val[1] = float2bfloat(_p1);
            _p.val[2] = float2bfloat(_p2);
            _p.val[3] = float2bfloat(_p3);
            // vst4 re-interleaves the lanes back into the packed layout
            vst4_u16(ptr, _p);
            float32x4_t _sum = vld1q_f32(sumptr);
            float32x4_t _ss2 = vaddq_f32(_p0, _p1);
            float32x4_t _ss4 = vaddq_f32(_p2, _p3);
            _sum = vaddq_f32(_sum, vaddq_f32(_ss2, _ss4));
            vst1q_f32(sumptr, _sum);
            ptr += 16;
            maxptr += 4;
            sumptr += 4;
        }
        for (; j < size1; j++)
        {
            float32x4_t _p = bfloat2float(vld1_u16(ptr));
            float32x4_t _max = vdupq_n_f32(*maxptr);
            _p = exp_ps(vsubq_f32(_p, _max));
            vst1_u16(ptr, float2bfloat(_p));
            // horizontal add of the 4 pack lanes of this position
#if __aarch64__
            float sum0 = vaddvq_f32(_p);
#else
            float32x2_t _sum2 = vadd_f32(vget_low_f32(_p), vget_high_f32(_p));
            float32x2_t _ss2 = vpadd_f32(_sum2, _sum2);
            float sum0 = vget_lane_f32(_ss2, 0);
#endif
            *sumptr += sum0;
            ptr += 4;
            maxptr++;
            sumptr++;
        }
    }

    // replace each sum by its reciprocal so pass 3 can multiply
    {
        float32x4_t _one = vdupq_n_f32(1.f);
        float* sumptr = _sumptr;
        int j = 0;
        for (; j + 3 < size1; j += 4)
        {
            float32x4_t _sum = vld1q_f32(sumptr);
            _sum = div_ps(_one, _sum);
            vst1q_f32(sumptr, _sum);
            sumptr += 4;
        }
        for (; j < size1; j++)
        {
            *sumptr = 1.f / *sumptr;
            sumptr++;
        }
    }

    // pass 3: scale every value by 1 / sum of its position
    for (int i = 0; i < elemcount; i++)
    {
        unsigned short* ptr = _ptr + i * stride;
        const float* sumptr = _sumptr;

        int j = 0;
        for (; j + 3 < size1; j += 4)
        {
            uint16x4x4_t _p = vld4_u16(ptr);
            float32x4_t _p0 = bfloat2float(_p.val[0]);
            float32x4_t _p1 = bfloat2float(_p.val[1]);
            float32x4_t _p2 = bfloat2float(_p.val[2]);
            float32x4_t _p3 = bfloat2float(_p.val[3]);
            float32x4_t _sum = vld1q_f32(sumptr);
            _p0 = vmulq_f32(_p0, _sum);
            _p1 = vmulq_f32(_p1, _sum);
            _p2 = vmulq_f32(_p2, _sum);
            _p3 = vmulq_f32(_p3, _sum);
            _p.val[0] = float2bfloat(_p0);
            _p.val[1] = float2bfloat(_p1);
            _p.val[2] = float2bfloat(_p2);
            _p.val[3] = float2bfloat(_p3);
            vst4_u16(ptr, _p);
            ptr += 16;
            sumptr += 4;
        }
        for (; j < size1; j++)
        {
            float32x4_t _p = bfloat2float(vld1_u16(ptr));
            // broadcast this position's reciprocal to all 4 pack lanes
            float32x4_t _sum = vld1q_dup_f32(sumptr);
            _p = vmulq_f32(_p, _sum);
            vst1_u16(ptr, float2bfloat(_p));
            ptr += 4;
            sumptr++;
        }
    }
}
#endif // __ARM_NEON

// Strided in-place softmax for unpacked bf16 data (stored as unsigned short).
//
// The data is viewed as elemcount rows of size1 columns, row r starting at
// _ptr + r * stride. Each column is normalized independently across the rows.
// _maxptr and _sumptr each provide size1 floats that are read-modify-written:
// the caller must have preset them (maxima to a very small value, sums to 0).
// On return _sumptr holds the reciprocal of each column sum.
static void softmax_bf16s_pack1(unsigned short* _ptr, int elemcount, size_t stride, int size1, float* _maxptr, float* _sumptr)
{
    // pass 1: fold every row into the running column maxima
    for (int row = 0; row < elemcount; row++)
    {
        const unsigned short* src = _ptr + row * stride;

        int k = 0;
#if __ARM_NEON
        for (; k + 7 < size1; k += 8)
        {
            const uint16x8_t bits = vld1q_u16(src + k);
            const float32x4_t lo = bfloat2float(vget_low_u16(bits));
            const float32x4_t hi = bfloat2float(vget_high_u16(bits));
            const float32x4_t maxlo = vmaxq_f32(vld1q_f32(_maxptr + k), lo);
            const float32x4_t maxhi = vmaxq_f32(vld1q_f32(_maxptr + k + 4), hi);
            vst1q_f32(_maxptr + k, maxlo);
            vst1q_f32(_maxptr + k + 4, maxhi);
        }
        for (; k + 3 < size1; k += 4)
        {
            const float32x4_t x = bfloat2float(vld1_u16(src + k));
            const float32x4_t best = vmaxq_f32(vld1q_f32(_maxptr + k), x);
            vst1q_f32(_maxptr + k, best);
        }
#endif // __ARM_NEON
        for (; k < size1; k++)
        {
            _maxptr[k] = std::max(_maxptr[k], bfloat16_to_float32(src[k]));
        }
    }

    // pass 2: overwrite each value with exp(value - column max) and add the
    // fp32 exponentials into the column sums
    for (int row = 0; row < elemcount; row++)
    {
        unsigned short* io = _ptr + row * stride;

        int k = 0;
#if __ARM_NEON
        for (; k + 7 < size1; k += 8)
        {
            const uint16x8_t bits = vld1q_u16(io + k);
            float32x4_t lo = bfloat2float(vget_low_u16(bits));
            float32x4_t hi = bfloat2float(vget_high_u16(bits));
            const float32x4_t acclo = vld1q_f32(_sumptr + k);
            const float32x4_t acchi = vld1q_f32(_sumptr + k + 4);
            lo = exp_ps(vsubq_f32(lo, vld1q_f32(_maxptr + k)));
            hi = exp_ps(vsubq_f32(hi, vld1q_f32(_maxptr + k + 4)));
            vst1q_u16(io + k, vcombine_u16(float2bfloat(lo), float2bfloat(hi)));
            vst1q_f32(_sumptr + k, vaddq_f32(acclo, lo));
            vst1q_f32(_sumptr + k + 4, vaddq_f32(acchi, hi));
        }
        for (; k + 3 < size1; k += 4)
        {
            float32x4_t x = bfloat2float(vld1_u16(io + k));
            const float32x4_t acc = vld1q_f32(_sumptr + k);
            x = exp_ps(vsubq_f32(x, vld1q_f32(_maxptr + k)));
            vst1_u16(io + k, float2bfloat(x));
            vst1q_f32(_sumptr + k, vaddq_f32(acc, x));
        }
#endif // __ARM_NEON
        for (; k < size1; k++)
        {
            const float e = expf(bfloat16_to_float32(io[k]) - _maxptr[k]);
            io[k] = float32_to_bfloat16(e);
            _sumptr[k] += e;
        }
    }

    // invert the column sums once so the last pass is a plain multiply
    {
        int k = 0;
#if __ARM_NEON
        const float32x4_t ones = vdupq_n_f32(1.f);
        for (; k + 3 < size1; k += 4)
        {
            vst1q_f32(_sumptr + k, div_ps(ones, vld1q_f32(_sumptr + k)));
        }
#endif // __ARM_NEON
        for (; k < size1; k++)
        {
            _sumptr[k] = 1.f / _sumptr[k];
        }
    }

    // pass 3: scale each value by the reciprocal sum of its column
    for (int row = 0; row < elemcount; row++)
    {
        unsigned short* io = _ptr + row * stride;

        int k = 0;
#if __ARM_NEON
        for (; k + 7 < size1; k += 8)
        {
            const uint16x8_t bits = vld1q_u16(io + k);
            const float32x4_t lo = vmulq_f32(bfloat2float(vget_low_u16(bits)), vld1q_f32(_sumptr + k));
            const float32x4_t hi = vmulq_f32(bfloat2float(vget_high_u16(bits)), vld1q_f32(_sumptr + k + 4));
            vst1q_u16(io + k, vcombine_u16(float2bfloat(lo), float2bfloat(hi)));
        }
        for (; k + 3 < size1; k += 4)
        {
            const float32x4_t x = vmulq_f32(bfloat2float(vld1_u16(io + k)), vld1q_f32(_sumptr + k));
            vst1_u16(io + k, float2bfloat(x));
        }
#endif // __ARM_NEON
        for (; k < size1; k++)
        {
            io[k] = float32_to_bfloat16(bfloat16_to_float32(io[k]) * _sumptr[k]);
        }
    }
}

// Strided in-place bf16 softmax: prepares the scratch accumulators and hands
// the work to the kernel matching elempack.
//
//   _ptr      first value of the region to normalize
//   elemcount number of elements along the reduced axis
//   elempack  1, or 4 in NEON builds; other values leave the data untouched
//   stride    distance in unsigned shorts between consecutive reduced elements
//   size1     number of independent positions handled by this call
//   _maxptr   scratch for size1 running maxima, overwritten here
//   _sumptr   scratch for size1 running sums, overwritten here
static void softmax_bf16s(unsigned short* _ptr, int elemcount, int elempack, size_t stride, int size1, float* _maxptr, float* _sumptr)
{
    // seed the running maxima with the lowest finite float
    {
        int k = 0;
#if __ARM_NEON
        const float32x4_t lowest = vdupq_n_f32(-FLT_MAX);
        for (; k + 3 < size1; k += 4)
        {
            vst1q_f32(_maxptr + k, lowest);
        }
#endif // __ARM_NEON
        for (; k < size1; k++)
        {
            _maxptr[k] = -FLT_MAX;
        }
    }

    // clear the running sums
    {
        int k = 0;
#if __ARM_NEON
        const float32x4_t zeros = vdupq_n_f32(0.f);
        for (; k + 3 < size1; k += 4)
        {
            vst1q_f32(_sumptr + k, zeros);
        }
#endif // __ARM_NEON
        for (; k < size1; k++)
        {
            _sumptr[k] = 0.f;
        }
    }

    // pick the kernel for this packing
    switch (elempack)
    {
#if __ARM_NEON
    case 4:
        softmax_bf16s_pack4(_ptr, elemcount, stride, size1, _maxptr, _sumptr);
        break;
#endif // __ARM_NEON
    case 1:
        softmax_bf16s_pack1(_ptr, elemcount, stride, size1, _maxptr, _sumptr);
        break;
    default:
        break;
    }
}

// Applies softmax along `axis` to a bf16 blob, in place.
//
// Returns 0 on success and -100 when the per-thread scratch buffer could not be
// allocated. Each (dims, axis) combination is handled by exactly one branch.
int Softmax_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int elempack = bottom_top_blob.elempack;
    const int channels = bottom_top_blob.c;
    const int d = bottom_top_blob.d;
    const int h = bottom_top_blob.h;
    const int w = bottom_top_blob.w;
    const int dims = bottom_top_blob.dims;

    // a negative axis counts back from the last dimension
    const int positive_axis = axis < 0 ? dims + axis : axis;

    // 1-D: the only axis is 0; treat the blob as one unpacked run
    if (dims == 1)
    {
        unsigned short* data = bottom_top_blob;

        softmax_bf16s(data, w * elempack, 1);
        return 0;
    }

    // outermost axis: reduce across rows (2-D) or across channels (3-D / 4-D).
    // The independent positions are split into one chunk per thread, each
    // thread using its own max / sum scratch rows.
    const bool outer_2d = (dims == 2 && positive_axis == 0);
    const bool outer_nd = ((dims == 3 || dims == 4) && positive_axis == 0);
    if (outer_2d || outer_nd)
    {
        const int total = outer_2d ? w : w * h * d;
        const int reduce_len = outer_2d ? h : channels;
        const size_t step = outer_2d ? (size_t)w * elempack : bottom_top_blob.cstep * elempack;
        const int chunk = (total + (opt.num_threads - 1)) / opt.num_threads;

        Mat scratch(chunk, 2, opt.num_threads, 4u, opt.workspace_allocator);
        if (scratch.empty())
            return -100;

        const int chunk_count = (total + chunk - 1) / chunk;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int n = 0; n < chunk_count; n++)
        {
            const int begin = n * chunk;
            const int len = std::min(chunk, total - begin);

            float* maxptr = scratch.channel(get_omp_thread_num());
            float* sumptr = maxptr + chunk;

            unsigned short* ptr = (unsigned short*)bottom_top_blob + begin * elempack;

            softmax_bf16s(ptr, reduce_len, elempack, step, len, maxptr, sumptr);
        }
        return 0;
    }

    // 2-D, innermost axis: every row is an independent contiguous run
    if (dims == 2 && positive_axis == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            softmax_bf16s(bottom_top_blob.row<unsigned short>(y), w, elempack);
        }
        return 0;
    }

    // reduce over h inside each depth slice of each channel
    if ((dims == 3 && positive_axis == 1) || (dims == 4 && positive_axis == 2))
    {
        const int row_len = w * elempack;

        Mat scratch(row_len, 2, opt.num_threads, 4u, opt.workspace_allocator);
        if (scratch.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* maxptr = scratch.channel(get_omp_thread_num());
            float* sumptr = maxptr + row_len;

            for (int z = 0; z < d; z++)
            {
                unsigned short* ptr = bottom_top_blob.channel(q).depth(z);

                softmax_bf16s(ptr, h, 1, row_len, row_len, maxptr, sumptr);
            }
        }
        return 0;
    }

    // 4-D, reduce over d inside each channel
    if (dims == 4 && positive_axis == 1)
    {
        const int plane_len = w * h * elempack;

        Mat scratch(plane_len, 2, opt.num_threads, 4u, opt.workspace_allocator);
        if (scratch.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned short* ptr = bottom_top_blob.channel(q);

            float* maxptr = scratch.channel(get_omp_thread_num());
            float* sumptr = maxptr + plane_len;

            softmax_bf16s(ptr, d, 1, plane_len, plane_len, maxptr, sumptr);
        }
        return 0;
    }

    // innermost axis of a 3-D / 4-D blob: walk the consecutive rows of a channel
    const bool inner_3d = (dims == 3 && positive_axis == 2);
    const bool inner_4d = (dims == 4 && positive_axis == 3);
    if (inner_3d || inner_4d)
    {
        const int rows = inner_3d ? h : d * h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned short* ptr = bottom_top_blob.channel(q);

            for (int r = 0; r < rows; r++)
            {
                softmax_bf16s(ptr, w, elempack);
                ptr += w * elempack;
            }
        }
        return 0;
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/softmax_arm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_SOFTMAX_ARM_H
#define LAYER_SOFTMAX_ARM_H

#include "softmax.h"

namespace ncnn {

// ARM-specific variant of the Softmax layer. It adds optional in-place code
// paths for reduced-precision storage on top of the base class interface.
class Softmax_arm : public Softmax
{
public:
    Softmax_arm();

    // In-place softmax on bottom_top_blob.
    // NOTE(review): presumably overrides the base-class virtual and dispatches
    // to the fp16 / bf16 helpers below depending on the blob's storage type —
    // confirm against the definition in softmax_arm.cpp.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 path; only declared when NCNN_ARM82 is enabled
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 path; only declared when NCNN_BF16 is enabled.
    // Returns 0 on success, -100 if its scratch buffer cannot be allocated.
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_SOFTMAX_ARM_H


================================================
FILE: src/layer/arm/softmax_arm_asimdhp.cpp
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "softmax_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#include "neon_mathfun_fp16s.h"
#endif // __ARM_NEON

#include "cpu.h"

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// In-place softmax over one contiguous run of fp16 values, computed entirely
// in fp16 arithmetic (accumulators included).
//
//   _ptr      first of elemcount * elempack values; overwritten with the result
//   elemcount number of packed elements in the run
//   elempack  number of values per packed element
//
// Lane handling after each reduction pass:
//   elempack == 1  all lanes and the scalar tail collapse into a single value
//   elempack == 4  lanes are folded modulo 4, giving 4 independent results
//   otherwise      the 8-lane accumulator is used as is (8 independent results;
//                  presumably elempack == 8, where size is a multiple of 8 and
//                  the 4-wide and scalar loops do not run)
static void softmax_fp16s(__fp16* _ptr, int elemcount, int elempack)
{
    const int size = elemcount * elempack;

    // pass 1: reduce max; -65504 is the lowest finite fp16 value
    float16x8_t _max8 = vdupq_n_f16(-65504.f);
    float16x4_t _max4 = vdup_n_f16(-65504.f);
    __fp16 max = -65504.f;
    {
        const __fp16* ptr = _ptr;

        int i = 0;
        for (; i + 7 < size; i += 8)
        {
            float16x8_t _p = vld1q_f16(ptr);
            _max8 = vmaxq_f16(_max8, _p);
            ptr += 8;
        }
        for (; i + 3 < size; i += 4)
        {
            float16x4_t _p = vld1_f16(ptr);
            _max4 = vmax_f16(_max4, _p);
            ptr += 4;
        }
        for (; i < size; i++)
        {
            max = std::max(max, *ptr++);
        }
    }

    if (elempack == 4)
    {
        // fold the two halves of the 8-lane maximum onto the 4-lane one,
        // then mirror it so both vector widths see the same per-lane maxima
        _max4 = vmax_f16(_max4, vget_low_f16(_max8));
        _max4 = vmax_f16(_max4, vget_high_f16(_max8));

        _max8 = vcombine_f16(_max4, _max4);
    }
    if (elempack == 1)
    {
        // collapse everything into one scalar maximum and broadcast it
        max = std::max(max, vmaxvq_f16(_max8));
        max = std::max(max, vmaxv_f16(_max4));

        _max4 = vdup_n_f16(max);
        _max8 = vdupq_n_f16(max);
    }

    // pass 2: x = exp(x - max), stored back, while accumulating the sum
    float16x8_t _sum8 = vdupq_n_f16(0.f);
    float16x4_t _sum4 = vdup_n_f16(0.f);
    __fp16 sum = 0.f;
    {
        __fp16* ptr = _ptr;

        int i = 0;
        for (; i + 7 < size; i += 8)
        {
            float16x8_t _p = vld1q_f16(ptr);
            _p = exp_ps_f16(vsubq_f16(_p, _max8));
            vst1q_f16(ptr, _p);
            _sum8 = vaddq_f16(_sum8, _p);
            ptr += 8;
        }
        for (; i + 3 < size; i += 4)
        {
            float16x4_t _p = vld1_f16(ptr);
            _p = exp_ps_f16(vsub_f16(_p, _max4));
            vst1_f16(ptr, _p);
            _sum4 = vadd_f16(_sum4, _p);
            ptr += 4;
        }
        for (; i < size; i++)
        {
            // the scalar tail evaluates expf and narrows the result to fp16
            __fp16 v = (__fp16)expf(*ptr - max);
            *ptr = v;
            sum += v;
            ptr++;
        }
    }

    if (elempack == 4)
    {
        _sum4 = vadd_f16(_sum4, vget_low_f16(_sum8));
        _sum4 = vadd_f16(_sum4, vget_high_f16(_sum8));

        _sum8 = vcombine_f16(_sum4, _sum4);
    }
    if (elempack == 1)
    {
        _sum4 = vadd_f16(_sum4, vget_low_f16(_sum8));
        _sum4 = vadd_f16(_sum4, vget_high_f16(_sum8));
        // two pairwise adds leave the total of all 4 lanes in every lane
        _sum4 = vpadd_f16(_sum4, _sum4);
        _sum4 = vpadd_f16(_sum4, _sum4);
        sum += vget_lane_f16(_sum4, 0);

        _sum4 = vdup_n_f16(sum);
        _sum8 = vdupq_n_f16(sum);
    }

    // turn the sums into reciprocals so pass 3 can multiply instead of divide
    _sum8 = vdivq_f16(vdupq_n_f16(1.f), _sum8);
    _sum4 = vdiv_f16(vdup_n_f16(1.f), _sum4);
    sum = (__fp16)1.f / sum;

    // pass 3: scale every value by 1 / sum
    {
        __fp16* ptr = _ptr;

        int i = 0;
        for (; i + 7 < size; i += 8)
        {
            float16x8_t _p = vld1q_f16(ptr);
            _p = vmulq_f16(_p, _sum8);
            vst1q_f16(ptr, _p);
            ptr += 8;
        }
        for (; i + 3 < size; i += 4)
        {
            float16x4_t _p = vld1_f16(ptr);
            _p = vmul_f16(_p, _sum4);
            vst1_f16(ptr, _p);
            ptr += 4;
        }
        for (; i < size; i++)
        {
            *ptr++ *= sum;
        }
    }
}

// Strided in-place softmax for fp16 data with elempack == 8, where the
// reduction runs over elemcount packed elements AND over the 8 values inside
// each pack. All arithmetic, including the accumulators, is fp16.
//
//   _ptr      first packed element; element i of position j lives at
//             _ptr + i * stride + j * 8 (all in __fp16 units)
//   elemcount number of packed elements along the reduced axis
//   stride    distance in __fp16 values between consecutive reduced elements
//   size1     number of independent positions processed by this call
//   _maxptr   size1 running maxima; read-modify-written here, so they are
//             expected to be initialised by the caller
//   _sumptr   size1 running sums; likewise expected to be initialised by the
//             caller, and left holding 1 / sum on return
static void softmax_fp16s_pack8(__fp16* _ptr, int elemcount, size_t stride, int size1, __fp16* _maxptr, __fp16* _sumptr)
{
    // pass 1: reduce max
    for (int i = 0; i < elemcount; i++)
    {
        const __fp16* ptr = _ptr + i * stride;
        __fp16* maxptr = _maxptr;

        int j = 0;
        for (; j + 7 < size1; j += 8)
        {
            // one vector per position; three levels of pairwise max reduce each
            // 8-lane vector to a single value, ending with lane k = max of _pk
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            float16x8_t _p2 = vld1q_f16(ptr + 16);
            float16x8_t _p3 = vld1q_f16(ptr + 24);
            float16x8_t _p4 = vld1q_f16(ptr + 32);
            float16x8_t _p5 = vld1q_f16(ptr + 40);
            float16x8_t _p6 = vld1q_f16(ptr + 48);
            float16x8_t _p7 = vld1q_f16(ptr + 56);
            float16x8_t _max = vld1q_f16(maxptr);
            float16x8_t _max01 = vpmaxq_f16(_p0, _p1);
            float16x8_t _max23 = vpmaxq_f16(_p2, _p3);
            float16x8_t _max45 = vpmaxq_f16(_p4, _p5);
            float16x8_t _max67 = vpmaxq_f16(_p6, _p7);
            float16x8_t _max2 = vpmaxq_f16(_max01, _max23);
            float16x8_t _max4 = vpmaxq_f16(_max45, _max67);
            _max = vmaxq_f16(_max, vpmaxq_f16(_max2, _max4));
            vst1q_f16(maxptr, _max);
            ptr += 64;
            maxptr += 8;
        }
        for (; j + 3 < size1; j += 4)
        {
            // same pairwise tree for 4 positions; the last level is 4 lanes wide
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            float16x8_t _p2 = vld1q_f16(ptr + 16);
            float16x8_t _p3 = vld1q_f16(ptr + 24);
            float16x4_t _max = vld1_f16(maxptr);
            float16x8_t _max01 = vpmaxq_f16(_p0, _p1);
            float16x8_t _max23 = vpmaxq_f16(_p2, _p3);
            float16x8_t _max2 = vpmaxq_f16(_max01, _max23);
            _max = vmax_f16(_max, vpmax_f16(vget_low_f16(_max2), vget_high_f16(_max2)));
            vst1_f16(maxptr, _max);
            ptr += 32;
            maxptr += 4;
        }
        for (; j < size1; j++)
        {
            // one position at a time: horizontal max over its 8 pack lanes
            float16x8_t _p = vld1q_f16(ptr);
            __fp16 max0 = vmaxvq_f16(_p);
            *maxptr = std::max(*maxptr, max0);
            ptr += 8;
            maxptr++;
        }
    }

    // pass 2: x = exp(x - max), stored back, while accumulating the sums
    for (int i = 0; i < elemcount; i++)
    {
        __fp16* ptr = _ptr + i * stride;
        const __fp16* maxptr = _maxptr;
        __fp16* sumptr = _sumptr;

        int j = 0;
        for (; j + 7 < size1; j += 8)
        {
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            float16x8_t _p2 = vld1q_f16(ptr + 16);
            float16x8_t _p3 = vld1q_f16(ptr + 24);
            float16x8_t _p4 = vld1q_f16(ptr + 32);
            float16x8_t _p5 = vld1q_f16(ptr + 40);
            float16x8_t _p6 = vld1q_f16(ptr + 48);
            float16x8_t _p7 = vld1q_f16(ptr + 56);
            float16x8_t _max = vld1q_f16(maxptr);
            // lane k of _max is the maximum of position k: broadcast it to _pk
            _p0 = exp_ps_f16(vsubq_f16(_p0, vdupq_laneq_f16(_max, 0)));
            _p1 = exp_ps_f16(vsubq_f16(_p1, vdupq_laneq_f16(_max, 1)));
            _p2 = exp_ps_f16(vsubq_f16(_p2, vdupq_laneq_f16(_max, 2)));
            _p3 = exp_ps_f16(vsubq_f16(_p3, vdupq_laneq_f16(_max, 3)));
            _p4 = exp_ps_f16(vsubq_f16(_p4, vdupq_laneq_f16(_max, 4)));
            _p5 = exp_ps_f16(vsubq_f16(_p5, vdupq_laneq_f16(_max, 5)));
            _p6 = exp_ps_f16(vsubq_f16(_p6, vdupq_laneq_f16(_max, 6)));
            _p7 = exp_ps_f16(vsubq_f16(_p7, vdupq_laneq_f16(_max, 7)));
            vst1q_f16(ptr, _p0);
            vst1q_f16(ptr + 8, _p1);
            vst1q_f16(ptr + 16, _p2);
            vst1q_f16(ptr + 24, _p3);
            vst1q_f16(ptr + 32, _p4);
            vst1q_f16(ptr + 40, _p5);
            vst1q_f16(ptr + 48, _p6);
            vst1q_f16(ptr + 56, _p7);
            // pairwise-add tree mirroring the max tree: lane k = sum of _pk
            float16x8_t _sum = vld1q_f16(sumptr);
            float16x8_t _ss01 = vpaddq_f16(_p0, _p1);
            float16x8_t _ss23 = vpaddq_f16(_p2, _p3);
            float16x8_t _ss45 = vpaddq_f16(_p4, _p5);
            float16x8_t _ss67 = vpaddq_f16(_p6, _p7);
            float16x8_t _ss2 = vpaddq_f16(_ss01, _ss23);
            float16x8_t _ss4 = vpaddq_f16(_ss45, _ss67);
            _sum = vaddq_f16(_sum, vpaddq_f16(_ss2, _ss4));
            vst1q_f16(sumptr, _sum);
            ptr += 64;
            maxptr += 8;
            sumptr += 8;
        }
        for (; j + 3 < size1; j += 4)
        {
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            float16x8_t _p2 = vld1q_f16(ptr + 16);
            float16x8_t _p3 = vld1q_f16(ptr + 24);
            float16x4_t _max = vld1_f16(maxptr);
            _p0 = exp_ps_f16(vsubq_f16(_p0, vdupq_lane_f16(_max, 0)));
            _p1 = exp_ps_f16(vsubq_f16(_p1, vdupq_lane_f16(_max, 1)));
            _p2 = exp_ps_f16(vsubq_f16(_p2, vdupq_lane_f16(_max, 2)));
            _p3 = exp_ps_f16(vsubq_f16(_p3, vdupq_lane_f16(_max, 3)));
            vst1q_f16(ptr, _p0);
            vst1q_f16(ptr + 8, _p1);
            vst1q_f16(ptr + 16, _p2);
            vst1q_f16(ptr + 24, _p3);
            float16x4_t _sum = vld1_f16(sumptr);
            float16x8_t _ss01 = vpaddq_f16(_p0, _p1);
            float16x8_t _ss23 = vpaddq_f16(_p2, _p3);
            float16x8_t _ss2 = vpaddq_f16(_ss01, _ss23);
            _sum = vadd_f16(_sum, vpadd_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
            vst1_f16(sumptr, _sum);
            ptr += 32;
            maxptr += 4;
            sumptr += 4;
        }
        for (; j < size1; j++)
        {
            float16x8_t _p = vld1q_f16(ptr);
            float16x8_t _max = vdupq_n_f16(*maxptr);
            _p = exp_ps_f16(vsubq_f16(_p, _max));
            vst1q_f16(ptr, _p);
            // 8 -> 4 lanes, one pairwise add to 2 partial sums, then add those
            float16x4_t _sum2 = vadd_f16(vget_low_f16(_p), vget_high_f16(_p));
            float16x4_t _ss2 = vpadd_f16(_sum2, _sum2);
            __fp16 sum0 = vget_lane_f16(_ss2, 0) + vget_lane_f16(_ss2, 1);
            *sumptr += sum0;
            ptr += 8;
            maxptr++;
            sumptr++;
        }
    }

    // replace each sum by its reciprocal so pass 3 can multiply
    {
        float16x8_t _one = vdupq_n_f16(1.f);
        __fp16* sumptr = _sumptr;
        int j = 0;
        for (; j + 7 < size1; j += 8)
        {
            float16x8_t _sum = vld1q_f16(sumptr);
            _sum = vdivq_f16(_one, _sum);
            vst1q_f16(sumptr, _sum);
            sumptr += 8;
        }
        for (; j + 3 < size1; j += 4)
        {
            float16x4_t _sum = vld1_f16(sumptr);
            _sum = vdiv_f16(vget_low_f16(_one), _sum);
            vst1_f16(sumptr, _sum);
            sumptr += 4;
        }
        for (; j < size1; j++)
        {
            *sumptr = (__fp16)1.f / *sumptr;
            sumptr++;
        }
    }

    // pass 3: scale every value by 1 / sum of its position
    for (int i = 0; i < elemcount; i++)
    {
        __fp16* ptr = _ptr + i * stride;
        const __fp16* sumptr = _sumptr;

        int j = 0;
        for (; j + 7 < size1; j += 8)
        {
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            float16x8_t _p2 = vld1q_f16(ptr + 16);
            float16x8_t _p3 = vld1q_f16(ptr + 24);
            float16x8_t _p4 = vld1q_f16(ptr + 32);
            float16x8_t _p5 = vld1q_f16(ptr + 40);
            float16x8_t _p6 = vld1q_f16(ptr + 48);
            float16x8_t _p7 = vld1q_f16(ptr + 56);
            float16x8_t _sum = vld1q_f16(sumptr);
            // multiply position k by lane k of the reciprocal sums
            _p0 = vmulq_laneq_f16(_p0, _sum, 0);
            _p1 = vmulq_laneq_f16(_p1, _sum, 1);
            _p2 = vmulq_laneq_f16(_p2, _sum, 2);
            _p3 = vmulq_laneq_f16(_p3, _sum, 3);
            _p4 = vmulq_laneq_f16(_p4, _sum, 4);
            _p5 = vmulq_laneq_f16(_p5, _sum, 5);
            _p6 = vmulq_laneq_f16(_p6, _sum, 6);
            _p7 = vmulq_laneq_f16(_p7, _sum, 7);
            vst1q_f16(ptr, _p0);
            vst1q_f16(ptr + 8, _p1);
            vst1q_f16(ptr + 16, _p2);
            vst1q_f16(ptr + 24, _p3);
            vst1q_f16(ptr + 32, _p4);
            vst1q_f16(ptr + 40, _p5);
            vst1q_f16(ptr + 48, _p6);
            vst1q_f16(ptr + 56, _p7);
            ptr += 64;
            sumptr += 8;
        }
        for (; j + 3 < size1; j += 4)
        {
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            float16x8_t _p2 = vld1q_f16(ptr + 16);
            float16x8_t _p3 = vld1q_f16(ptr + 24);
            float16x4_t _sum = vld1_f16(sumptr);
            _p0 = vmulq_lane_f16(_p0, _sum, 0);
            _p1 = vmulq_lane_f16(_p1, _sum, 1);
            _p2 = vmulq_lane_f16(_p2, _sum, 2);
            _p3 = vmulq_lane_f16(_p3, _sum, 3);
            vst1q_f16(ptr, _p0);
            vst1q_f16(ptr + 8, _p1);
            vst1q_f16(ptr + 16, _p2);
            vst1q_f16(ptr + 24, _p3);
            ptr += 32;
            sumptr += 4;
        }
        for (; j < size1; j++)
        {
            float16x8_t _p = vld1q_f16(ptr);
            // broadcast this position's reciprocal to all 8 pack lanes
            float16x8_t _sum = vld1q_dup_f16(sumptr);
            _p = vmulq_f16(_p, _sum);
            vst1q_f16(ptr, _p);
            ptr += 8;
            sumptr++;
        }
    }
}

// Softmax kernel for fp16 data packed four values per element, using fp16 arithmetic.
//
// _ptr      in/out data, processed in place: elemcount rows, consecutive rows being
//           `stride` __fp16 values apart, each row holding size1 groups of 4 values
// _maxptr   running maximum, one value per group position (size1 values); it is only
//           combined with max() here, so it must already hold a lower bound on entry
// _sumptr   running sum, one value per group position (size1 values); it is only
//           added to here, so it must hold the starting sum on entry; it holds the
//           reciprocal of the sum on return
//
// For group position j the reduction covers the 4 lanes of group j in every row.
static void softmax_fp16s_pack4(__fp16* _ptr, int elemcount, size_t stride, int size1, __fp16* _maxptr, __fp16* _sumptr)
{
    // reduce max
    for (int i = 0; i < elemcount; i++)
    {
        const __fp16* ptr = _ptr + i * stride;
        __fp16* maxptr = _maxptr;

        int j = 0;
        // 8 groups (32 values) per iteration
        for (; j + 7 < size1; j += 8)
        {
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            float16x8_t _p2 = vld1q_f16(ptr + 16);
            float16x8_t _p3 = vld1q_f16(ptr + 24);
            float16x8_t _max = vld1q_f16(maxptr);
            // two rounds of pairwise max collapse every group of 4 lanes into one lane,
            // leaving the 8 group maxima in group order
            float16x8_t _max2 = vpmaxq_f16(_p0, _p1);
            float16x8_t _max4 = vpmaxq_f16(_p2, _p3);
            _max = vmaxq_f16(_max, vpmaxq_f16(_max2, _max4));
            vst1q_f16(maxptr, _max);
            ptr += 32;
            maxptr += 8;
        }
        // 4 groups (16 values) per iteration
        for (; j + 3 < size1; j += 4)
        {
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            float16x4_t _max = vld1_f16(maxptr);
            float16x8_t _max2 = vpmaxq_f16(_p0, _p1);
            _max = vmax_f16(_max, vpmax_f16(vget_low_f16(_max2), vget_high_f16(_max2)));
            vst1_f16(maxptr, _max);
            ptr += 16;
            maxptr += 4;
        }
        // one group at a time
        for (; j < size1; j++)
        {
            float16x4_t _p = vld1_f16(ptr);
            // horizontal max across the 4 lanes of the group
            __fp16 max0 = vmaxv_f16(_p);
            *maxptr = std::max(*maxptr, max0);
            ptr += 4;
            maxptr++;
        }
    }

    // reduce exp(x - max)
    // the data is overwritten with exp(x - max) and the per-group sums are accumulated
    for (int i = 0; i < elemcount; i++)
    {
        __fp16* ptr = _ptr + i * stride;
        const __fp16* maxptr = _maxptr;
        __fp16* sumptr = _sumptr;

        int j = 0;
        for (; j + 7 < size1; j += 8)
        {
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            float16x8_t _p2 = vld1q_f16(ptr + 16);
            float16x8_t _p3 = vld1q_f16(ptr + 24);
            float16x8_t _max = vld1q_f16(maxptr);
            // each 8-lane vector spans two groups, so broadcast one maximum into the
            // low half and the next one into the high half
            float16x8_t _max0 = vcombine_f16(vdup_laneq_f16(_max, 0), vdup_laneq_f16(_max, 1));
            float16x8_t _max1 = vcombine_f16(vdup_laneq_f16(_max, 2), vdup_laneq_f16(_max, 3));
            float16x8_t _max2 = vcombine_f16(vdup_laneq_f16(_max, 4), vdup_laneq_f16(_max, 5));
            float16x8_t _max3 = vcombine_f16(vdup_laneq_f16(_max, 6), vdup_laneq_f16(_max, 7));
            _p0 = exp_ps_f16(vsubq_f16(_p0, _max0));
            _p1 = exp_ps_f16(vsubq_f16(_p1, _max1));
            _p2 = exp_ps_f16(vsubq_f16(_p2, _max2));
            _p3 = exp_ps_f16(vsubq_f16(_p3, _max3));
            vst1q_f16(ptr, _p0);
            vst1q_f16(ptr + 8, _p1);
            vst1q_f16(ptr + 16, _p2);
            vst1q_f16(ptr + 24, _p3);
            float16x8_t _sum = vld1q_f16(sumptr);
            // two rounds of pairwise add give the sum of each group of 4 lanes
            float16x8_t _ss2 = vpaddq_f16(_p0, _p1);
            float16x8_t _ss4 = vpaddq_f16(_p2, _p3);
            _sum = vaddq_f16(_sum, vpaddq_f16(_ss2, _ss4));
            vst1q_f16(sumptr, _sum);
            ptr += 32;
            maxptr += 8;
            sumptr += 8;
        }
        for (; j + 3 < size1; j += 4)
        {
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            float16x4_t _max = vld1_f16(maxptr);
            float16x8_t _max0 = vcombine_f16(vdup_lane_f16(_max, 0), vdup_lane_f16(_max, 1));
            float16x8_t _max1 = vcombine_f16(vdup_lane_f16(_max, 2), vdup_lane_f16(_max, 3));
            _p0 = exp_ps_f16(vsubq_f16(_p0, _max0));
            _p1 = exp_ps_f16(vsubq_f16(_p1, _max1));
            vst1q_f16(ptr, _p0);
            vst1q_f16(ptr + 8, _p1);
            float16x4_t _sum = vld1_f16(sumptr);
            float16x8_t _ss2 = vpaddq_f16(_p0, _p1);
            _sum = vadd_f16(_sum, vpadd_f16(vget_low_f16(_ss2), vget_high_f16(_ss2)));
            vst1_f16(sumptr, _sum);
            ptr += 16;
            maxptr += 4;
            sumptr += 4;
        }
        for (; j < size1; j++)
        {
            float16x4_t _p = vld1_f16(ptr);
            float16x4_t _max = vdup_n_f16(*maxptr);
            _p = exp_ps_f16(vsub_f16(_p, _max));
            vst1_f16(ptr, _p);
            // pairwise add, then add the two partial sums to get the group total
            float16x4_t _ss2 = vpadd_f16(_p, _p);
            __fp16 sum0 = vget_lane_f16(_ss2, 0) + vget_lane_f16(_ss2, 1);
            *sumptr += sum0;
            ptr += 4;
            maxptr++;
            sumptr++;
        }
    }

    // replace every sum by its reciprocal so the final pass can multiply instead of divide
    {
        float16x8_t _one = vdupq_n_f16(1.f);
        __fp16* sumptr = _sumptr;
        int j = 0;
        for (; j + 7 < size1; j += 8)
        {
            float16x8_t _sum = vld1q_f16(sumptr);
            _sum = vdivq_f16(_one, _sum);
            vst1q_f16(sumptr, _sum);
            sumptr += 8;
        }
        for (; j + 3 < size1; j += 4)
        {
            float16x4_t _sum = vld1_f16(sumptr);
            _sum = vdiv_f16(vget_low_f16(_one), _sum);
            vst1_f16(sumptr, _sum);
            sumptr += 4;
        }
        for (; j < size1; j++)
        {
            *sumptr = (__fp16)1.f / *sumptr;
            sumptr++;
        }
    }

    // div sum
    // implemented as a multiplication by the reciprocal stored in _sumptr above
    for (int i = 0; i < elemcount; i++)
    {
        __fp16* ptr = _ptr + i * stride;
        const __fp16* sumptr = _sumptr;

        int j = 0;
        for (; j + 7 < size1; j += 8)
        {
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            float16x8_t _p2 = vld1q_f16(ptr + 16);
            float16x8_t _p3 = vld1q_f16(ptr + 24);
            float16x8_t _sum = vld1q_f16(sumptr);
            // broadcast one reciprocal per group, two groups per 8-lane vector
            float16x8_t _sum0 = vcombine_f16(vdup_laneq_f16(_sum, 0), vdup_laneq_f16(_sum, 1));
            float16x8_t _sum1 = vcombine_f16(vdup_laneq_f16(_sum, 2), vdup_laneq_f16(_sum, 3));
            float16x8_t _sum2 = vcombine_f16(vdup_laneq_f16(_sum, 4), vdup_laneq_f16(_sum, 5));
            float16x8_t _sum3 = vcombine_f16(vdup_laneq_f16(_sum, 6), vdup_laneq_f16(_sum, 7));
            _p0 = vmulq_f16(_p0, _sum0);
            _p1 = vmulq_f16(_p1, _sum1);
            _p2 = vmulq_f16(_p2, _sum2);
            _p3 = vmulq_f16(_p3, _sum3);
            vst1q_f16(ptr, _p0);
            vst1q_f16(ptr + 8, _p1);
            vst1q_f16(ptr + 16, _p2);
            vst1q_f16(ptr + 24, _p3);
            ptr += 32;
            sumptr += 8;
        }
        for (; j + 3 < size1; j += 4)
        {
            float16x8_t _p0 = vld1q_f16(ptr);
            float16x8_t _p1 = vld1q_f16(ptr + 8);
            float16x4_t _sum = vld1_f16(sumptr);
            float16x8_t _sum0 = vcombine_f16(vdup_lane_f16(_sum, 0), vdup_lane_f16(_sum, 1));
            float16x8_t _sum1 = vcombine_f16(vdup_lane_f16(_sum, 2), vdup_lane_f16(_sum, 3));
            _p0 = vmulq_f16(_p0, _sum0);
            _p1 = vmulq_f16(_p1, _sum1);
            vst1q_f16(ptr, _p0);
            vst1q_f16(ptr + 8, _p1);
            ptr += 16;
            sumptr += 4;
        }
        for (; j < size1; j++)
        {
            float16x4_t _p = vld1_f16(ptr);
            float16x4_t _sum = vld1_dup_f16(sumptr);
            _p = vmul_f16(_p, _sum);
            vst1_f16(ptr, _p);
            ptr += 4;
            sumptr++;
        }
    }
}

// Softmax kernel for unpacked fp16 data (one value per element), using fp16 arithmetic.
//
// The data at _ptr is treated as elemcount rows of size1 values, consecutive rows being
// `stride` __fp16 values apart. The reduction runs down each column, i.e. over the
// elemcount values that share the same position inside a row. The result is written
// back in place.
//
// _maxptr and _sumptr each provide size1 scratch values. Both are only accumulated into
// (max() and += respectively), so the caller has to initialise them. On return _sumptr
// holds the reciprocal of each column sum.
static void softmax_fp16s_pack1(__fp16* _ptr, int elemcount, size_t stride, int size1, __fp16* _maxptr, __fp16* _sumptr)
{
    // pass 1: per-column running maximum
    for (int row = 0; row < elemcount; row++)
    {
        const __fp16* src = _ptr + row * stride;
        __fp16* colmax = _maxptr;

        int col = 0;
        for (; col + 7 < size1; col += 8)
        {
            const float16x8_t _v = vld1q_f16(src);
            const float16x8_t _m = vld1q_f16(colmax);
            vst1q_f16(colmax, vmaxq_f16(_m, _v));
            src += 8;
            colmax += 8;
        }
        for (; col + 3 < size1; col += 4)
        {
            const float16x4_t _v = vld1_f16(src);
            const float16x4_t _m = vld1_f16(colmax);
            vst1_f16(colmax, vmax_f16(_m, _v));
            src += 4;
            colmax += 4;
        }
        for (; col < size1; col++)
        {
            *colmax = std::max(*colmax, *src);
            src++;
            colmax++;
        }
    }

    // pass 2: x <- exp(x - max), accumulating the per-column sum of the results
    for (int row = 0; row < elemcount; row++)
    {
        __fp16* data = _ptr + row * stride;
        const __fp16* colmax = _maxptr;
        __fp16* colsum = _sumptr;

        int col = 0;
        for (; col + 7 < size1; col += 8)
        {
            float16x8_t _v = vld1q_f16(data);
            const float16x8_t _m = vld1q_f16(colmax);
            const float16x8_t _s = vld1q_f16(colsum);
            _v = exp_ps_f16(vsubq_f16(_v, _m));
            vst1q_f16(data, _v);
            vst1q_f16(colsum, vaddq_f16(_s, _v));
            data += 8;
            colmax += 8;
            colsum += 8;
        }
        for (; col + 3 < size1; col += 4)
        {
            float16x4_t _v = vld1_f16(data);
            const float16x4_t _m = vld1_f16(colmax);
            const float16x4_t _s = vld1_f16(colsum);
            _v = exp_ps_f16(vsub_f16(_v, _m));
            vst1_f16(data, _v);
            vst1_f16(colsum, vadd_f16(_s, _v));
            data += 4;
            colmax += 4;
            colsum += 4;
        }
        for (; col < size1; col++)
        {
            const __fp16 e = expf(*data - *colmax);
            *data = e;
            *colsum += e;
            data++;
            colmax++;
            colsum++;
        }
    }

    // pass 3: turn every column sum into its reciprocal
    {
        const float16x8_t _ones = vdupq_n_f16(1.f);
        __fp16* colsum = _sumptr;

        int col = 0;
        for (; col + 7 < size1; col += 8)
        {
            const float16x8_t _s = vld1q_f16(colsum);
            vst1q_f16(colsum, vdivq_f16(_ones, _s));
            colsum += 8;
        }
        for (; col + 3 < size1; col += 4)
        {
            const float16x4_t _s = vld1_f16(colsum);
            vst1_f16(colsum, vdiv_f16(vget_low_f16(_ones), _s));
            colsum += 4;
        }
        for (; col < size1; col++)
        {
            *colsum = (__fp16)1.f / *colsum;
            colsum++;
        }
    }

    // pass 4: normalise, multiplying by the reciprocal instead of dividing by the sum
    for (int row = 0; row < elemcount; row++)
    {
        __fp16* data = _ptr + row * stride;
        const __fp16* colrcp = _sumptr;

        int col = 0;
        for (; col + 7 < size1; col += 8)
        {
            const float16x8_t _v = vld1q_f16(data);
            const float16x8_t _r = vld1q_f16(colrcp);
            vst1q_f16(data, vmulq_f16(_v, _r));
            data += 8;
            colrcp += 8;
        }
        for (; col + 3 < size1; col += 4)
        {
            const float16x4_t _v = vld1_f16(data);
            const float16x4_t _r = vld1_f16(colrcp);
            vst1_f16(data, vmul_f16(_v, _r));
            data += 4;
            colrcp += 4;
        }
        for (; col < size1; col++)
        {
            *data *= *colrcp;
            data++;
            colrcp++;
        }
    }
}

// Strided softmax entry point: prepares the scratch buffers, then runs the kernel that
// matches the packing of the data.
//
// _maxptr and _sumptr each receive size1 values. The maxima start at -65504 (the lowest
// finite fp16 value) and the sums at zero, which is the state the pack kernels
// accumulate into. An elempack other than 8, 4 or 1 leaves the data untouched.
static void softmax_fp16s(__fp16* _ptr, int elemcount, int elempack, size_t stride, int size1, __fp16* _maxptr, __fp16* _sumptr)
{
    // seed the running maxima
    {
        const float16x8_t _lowest = vdupq_n_f16(-65504.f);

        __fp16* dst = _maxptr;

        int k = 0;
        for (; k + 7 < size1; k += 8, dst += 8)
        {
            vst1q_f16(dst, _lowest);
        }
        for (; k + 3 < size1; k += 4, dst += 4)
        {
            vst1_f16(dst, vget_low_f16(_lowest));
        }
        for (; k < size1; k++)
        {
            *dst++ = -65504.f;
        }
    }

    // clear the running sums
    {
        const float16x8_t _zeros = vdupq_n_f16(0.f);

        __fp16* dst = _sumptr;

        int k = 0;
        for (; k + 7 < size1; k += 8, dst += 8)
        {
            vst1q_f16(dst, _zeros);
        }
        for (; k + 3 < size1; k += 4, dst += 4)
        {
            vst1_f16(dst, vget_low_f16(_zeros));
        }
        for (; k < size1; k++)
        {
            *dst++ = 0.f;
        }
    }

    // run the kernel written for this packing
    switch (elempack)
    {
    case 8:
        softmax_fp16s_pack8(_ptr, elemcount, stride, size1, _maxptr, _sumptr);
        break;
    case 4:
        softmax_fp16s_pack4(_ptr, elemcount, stride, size1, _maxptr, _sumptr);
        break;
    case 1:
        softmax_fp16s_pack1(_ptr, elemcount, stride, size1, _maxptr, _sumptr);
        break;
    default:
        break;
    }
}

// In-place softmax along `axis` for blobs stored as fp16.
//
// Reductions along the innermost dimension use the contiguous three-argument
// softmax_fp16s overload. Every other axis uses the strided overload together with a
// per-thread scratch Mat that holds the running maxima followed by the running sums.
//
// Returns 0 on success, -100 when the scratch Mat could not be allocated.
int Softmax_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int dims = bottom_top_blob.dims;
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int d = bottom_top_blob.d;
    const int channels = bottom_top_blob.c;
    const int elempack = bottom_top_blob.elempack;

    // a negative axis counts back from the last dimension
    const int positive_axis = axis < 0 ? dims + axis : axis;

    if (dims == 1)
    {
        // the only axis is 0: handle the blob as one flat run of scalars
        __fp16* data = bottom_top_blob;

        const int count = w * elempack;

        softmax_fp16s(data, count, 1);
        return 0;
    }

    if (dims == 2 && positive_axis == 0)
    {
        // reduce over the h rows; the w columns are split into one chunk per thread
        const int total = w;
        const int chunk = (total + (opt.num_threads - 1)) / opt.num_threads;
        const size_t row_stride = (size_t)w * elempack;

        Mat scratch(chunk, 2, opt.num_threads, 2u, opt.workspace_allocator);
        if (scratch.empty())
            return -100;

        const int nchunks = (total + chunk - 1) / chunk;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ci = 0; ci < nchunks; ci++)
        {
            const int begin = ci * chunk;
            const int count = std::min(chunk, total - begin);

            // maxima first, sums `chunk` values further on
            __fp16* scratchptr = scratch.channel(get_omp_thread_num());
            __fp16* maxbuf = scratchptr;
            __fp16* sumbuf = maxbuf + chunk;

            __fp16* data = (__fp16*)bottom_top_blob + begin * elempack;

            softmax_fp16s(data, h, elempack, row_stride, count, maxbuf, sumbuf);
        }
        return 0;
    }

    if (dims == 2 && positive_axis == 1)
    {
        // every row is an independent contiguous softmax
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            __fp16* rowptr = bottom_top_blob.row<__fp16>(y);

            softmax_fp16s(rowptr, w, elempack);
        }
        return 0;
    }

    if ((dims == 3 || dims == 4) && positive_axis == 0)
    {
        // reduce over the channels; the positions inside a channel are split into chunks
        const int total = w * h * d;
        const int chunk = (total + (opt.num_threads - 1)) / opt.num_threads;
        const size_t channel_stride = bottom_top_blob.cstep * elempack;

        Mat scratch(chunk, 2, opt.num_threads, 2u, opt.workspace_allocator);
        if (scratch.empty())
            return -100;

        const int nchunks = (total + chunk - 1) / chunk;
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ci = 0; ci < nchunks; ci++)
        {
            const int begin = ci * chunk;
            const int count = std::min(chunk, total - begin);

            __fp16* scratchptr = scratch.channel(get_omp_thread_num());
            __fp16* maxbuf = scratchptr;
            __fp16* sumbuf = maxbuf + chunk;

            __fp16* data = (__fp16*)bottom_top_blob + begin * elempack;

            softmax_fp16s(data, channels, elempack, channel_stride, count, maxbuf, sumbuf);
        }
        return 0;
    }

    if ((dims == 3 && positive_axis == 1) || (dims == 4 && positive_axis == 2))
    {
        // reduce over the h rows of each (channel, depth) slice; the packed lanes are
        // treated as independent scalar columns
        const int rowlen = w * elempack;

        Mat scratch(rowlen, 2, opt.num_threads, 2u, opt.workspace_allocator);
        if (scratch.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            for (int z = 0; z < d; z++)
            {
                __fp16* data = bottom_top_blob.channel(q).depth(z);

                __fp16* scratchptr = scratch.channel(get_omp_thread_num());
                __fp16* maxbuf = scratchptr;
                __fp16* sumbuf = maxbuf + rowlen;

                softmax_fp16s(data, h, 1, rowlen, rowlen, maxbuf, sumbuf);
            }
        }
        return 0;
    }

    if (dims == 3 && positive_axis == 2)
    {
        // innermost axis: one contiguous softmax per row of each channel
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* rowptr = bottom_top_blob.channel(q);

            for (int y = 0; y < h; y++)
            {
                softmax_fp16s(rowptr, w, elempack);
                rowptr += w * elempack;
            }
        }
        return 0;
    }

    if (dims == 4 && positive_axis == 1)
    {
        // reduce over the d slices of each channel
        const int slicelen = w * h * elempack;

        Mat scratch(slicelen, 2, opt.num_threads, 2u, opt.workspace_allocator);
        if (scratch.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* data = bottom_top_blob.channel(q);

            __fp16* scratchptr = scratch.channel(get_omp_thread_num());
            __fp16* maxbuf = scratchptr;
            __fp16* sumbuf = maxbuf + slicelen;

            softmax_fp16s(data, d, 1, slicelen, slicelen, maxbuf, sumbuf);
        }
        return 0;
    }

    if (dims == 4 && positive_axis == 3)
    {
        // innermost axis: one contiguous softmax per row of each depth slice
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* rowptr = bottom_top_blob.channel(q);

            for (int z = 0; z < d; z++)
            {
                for (int y = 0; y < h; y++)
                {
                    softmax_fp16s(rowptr, w, elempack);
                    rowptr += w * elempack;
                }
            }
        }
        return 0;
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/swish_arm.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "swish_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

// Sets the capability flags of the ARM swish layer according to the build configuration.
Swish_arm::Swish_arm()
{
#if __ARM_NEON
    // the NEON kernels fold elempack into the element count, so packed blobs are accepted
    support_packing = true;
#if NCNN_ARM82
    // fp16 storage is only enabled when the running cpu reports asimdhp support;
    // forward_inplace checks this flag before taking the fp16 path
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif
#endif // __ARM_NEON

#if NCNN_BF16
    // the bf16 kernel has a scalar tail and is compiled with or without NEON
    support_bf16_storage = true;
#endif
}

// Applies swish, x / (1 + exp(-x)), to every value of the blob in place.
//
// Blobs with 16-bit elements are forwarded to the fp16 or bf16 kernels when the matching
// storage option is enabled; everything else is processed here as fp32.
// Always returns 0 on the fp32 path.
int Swish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int elembits = bottom_top_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        return opt.use_fp16_arithmetic ? forward_inplace_fp16sa(bottom_top_blob, opt)
               : forward_inplace_fp16s(bottom_top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

    const int channels = bottom_top_blob.c;
    // swish is element-wise, so a packed channel is simply a longer run of floats
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* data = bottom_top_blob.channel(q);

        int i = 0;
#if __ARM_NEON
        const float32x4_t _one = vdupq_n_f32(1.f);
#if __aarch64__
        // 16 floats per iteration
        for (; i + 15 < size; i += 16)
        {
            float* p = data + i;
            const float32x4_t _x0 = vld1q_f32(p);
            const float32x4_t _x1 = vld1q_f32(p + 4);
            const float32x4_t _x2 = vld1q_f32(p + 8);
            const float32x4_t _x3 = vld1q_f32(p + 12);
            const float32x4_t _y0 = div_ps(_x0, vaddq_f32(_one, exp_ps(vnegq_f32(_x0))));
            const float32x4_t _y1 = div_ps(_x1, vaddq_f32(_one, exp_ps(vnegq_f32(_x1))));
            const float32x4_t _y2 = div_ps(_x2, vaddq_f32(_one, exp_ps(vnegq_f32(_x2))));
            const float32x4_t _y3 = div_ps(_x3, vaddq_f32(_one, exp_ps(vnegq_f32(_x3))));
            vst1q_f32(p, _y0);
            vst1q_f32(p + 4, _y1);
            vst1q_f32(p + 8, _y2);
            vst1q_f32(p + 12, _y3);
        }
#endif // __aarch64__
        // 8 floats per iteration
        for (; i + 7 < size; i += 8)
        {
            float* p = data + i;
            const float32x4_t _x0 = vld1q_f32(p);
            const float32x4_t _x1 = vld1q_f32(p + 4);
            const float32x4_t _y0 = div_ps(_x0, vaddq_f32(_one, exp_ps(vnegq_f32(_x0))));
            const float32x4_t _y1 = div_ps(_x1, vaddq_f32(_one, exp_ps(vnegq_f32(_x1))));
            vst1q_f32(p, _y0);
            vst1q_f32(p + 4, _y1);
        }
        // 4 floats per iteration
        for (; i + 3 < size; i += 4)
        {
            float* p = data + i;
            const float32x4_t _x = vld1q_f32(p);
            vst1q_f32(p, div_ps(_x, vaddq_f32(_one, exp_ps(vnegq_f32(_x)))));
        }
#endif // __ARM_NEON
        // scalar tail
        for (; i < size; i++)
        {
            const float x = data[i];
            data[i] = x / (1.f + expf(-x));
        }
    }

    return 0;
}

#if NCNN_BF16
// Swish for blobs stored as bfloat16: each value is widened to fp32, x / (1 + exp(-x))
// is evaluated in fp32, and the result is narrowed back to bfloat16 in place.
// Always returns 0.
int Swish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    // element-wise op: a packed channel is handled as one flat run of values
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        unsigned short* data = bottom_top_blob.channel(q);

        int i = 0;
#if __ARM_NEON
        const float32x4_t _one = vdupq_n_f32(1.f);
#if __aarch64__
        // 16 values per iteration
        for (; i + 15 < size; i += 16)
        {
            unsigned short* p = data + i;
            const uint16x8_t _raw01 = vld1q_u16(p);
            const uint16x8_t _raw23 = vld1q_u16(p + 8);
            float32x4_t _x0 = bfloat2float(vget_low_u16(_raw01));
            float32x4_t _x1 = bfloat2float(vget_high_u16(_raw01));
            float32x4_t _x2 = bfloat2float(vget_low_u16(_raw23));
            float32x4_t _x3 = bfloat2float(vget_high_u16(_raw23));
            _x0 = div_ps(_x0, vaddq_f32(_one, exp_ps(vnegq_f32(_x0))));
            _x1 = div_ps(_x1, vaddq_f32(_one, exp_ps(vnegq_f32(_x1))));
            _x2 = div_ps(_x2, vaddq_f32(_one, exp_ps(vnegq_f32(_x2))));
            _x3 = div_ps(_x3, vaddq_f32(_one, exp_ps(vnegq_f32(_x3))));
            vst1q_u16(p, vcombine_u16(float2bfloat(_x0), float2bfloat(_x1)));
            vst1q_u16(p + 8, vcombine_u16(float2bfloat(_x2), float2bfloat(_x3)));
        }
#endif // __aarch64__
        // 8 values per iteration
        for (; i + 7 < size; i += 8)
        {
            unsigned short* p = data + i;
            const uint16x8_t _raw = vld1q_u16(p);
            float32x4_t _x0 = bfloat2float(vget_low_u16(_raw));
            float32x4_t _x1 = bfloat2float(vget_high_u16(_raw));
            _x0 = div_ps(_x0, vaddq_f32(_one, exp_ps(vnegq_f32(_x0))));
            _x1 = div_ps(_x1, vaddq_f32(_one, exp_ps(vnegq_f32(_x1))));
            vst1q_u16(p, vcombine_u16(float2bfloat(_x0), float2bfloat(_x1)));
        }
        // 4 values per iteration
        for (; i + 3 < size; i += 4)
        {
            unsigned short* p = data + i;
            float32x4_t _x = bfloat2float(vld1_u16(p));
            _x = div_ps(_x, vaddq_f32(_one, exp_ps(vnegq_f32(_x))));
            vst1_u16(p, float2bfloat(_x));
        }
#endif // __ARM_NEON
        // scalar tail
        for (; i < size; i++)
        {
            const float x = bfloat16_to_float32(data[i]);
            const float y = x / (1.f + expf(-x));
            data[i] = float32_to_bfloat16(y);
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/swish_arm.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_SWISH_ARM_H
#define LAYER_SWISH_ARM_H

#include "swish.h"

namespace ncnn {

// ARM implementation of the Swish layer, x / (1 + exp(-x)), applied in place.
class Swish_arm : public Swish
{
public:
    // sets the packing / fp16 / bf16 capability flags for this build and cpu
    Swish_arm();

    // fp32 kernel; also forwards 16-bit blobs to the protected variants below
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 storage, computation carried out in fp32
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    // fp16 storage, computation carried out in fp16
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bfloat16 storage, computation carried out in fp32
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_SWISH_ARM_H


================================================
FILE: src/layer/arm/swish_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "swish_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#include "neon_mathfun.h"
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "neon_mathfun_fp16s.h"
#endif
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Swish for blobs stored as fp16 while computing in fp32: values are widened to fp32,
// x / (1 + exp(-x)) is evaluated, and the result is narrowed back to fp16 in place.
// Always returns 0.
int Swish_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    // element-wise op: a packed channel is handled as one flat run of values
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* data = bottom_top_blob.channel(q);

        const float32x4_t _one = vdupq_n_f32(1.f);

        int i = 0;
        // 16 values per iteration
        for (; i + 15 < size; i += 16)
        {
            __fp16* p = data + i;
            const float16x8_t _h01 = vld1q_f16(p);
            const float16x8_t _h23 = vld1q_f16(p + 8);
            float32x4_t _x0 = vcvt_f32_f16(vget_low_f16(_h01));
            float32x4_t _x1 = vcvt_f32_f16(vget_high_f16(_h01));
            float32x4_t _x2 = vcvt_f32_f16(vget_low_f16(_h23));
            float32x4_t _x3 = vcvt_f32_f16(vget_high_f16(_h23));
            _x0 = vdivq_f32(_x0, vaddq_f32(_one, exp_ps(vnegq_f32(_x0))));
            _x1 = vdivq_f32(_x1, vaddq_f32(_one, exp_ps(vnegq_f32(_x1))));
            _x2 = vdivq_f32(_x2, vaddq_f32(_one, exp_ps(vnegq_f32(_x2))));
            _x3 = vdivq_f32(_x3, vaddq_f32(_one, exp_ps(vnegq_f32(_x3))));
            vst1q_f16(p, vcombine_f16(vcvt_f16_f32(_x0), vcvt_f16_f32(_x1)));
            vst1q_f16(p + 8, vcombine_f16(vcvt_f16_f32(_x2), vcvt_f16_f32(_x3)));
        }
        // 8 values per iteration
        for (; i + 7 < size; i += 8)
        {
            __fp16* p = data + i;
            const float16x8_t _h = vld1q_f16(p);
            float32x4_t _x0 = vcvt_f32_f16(vget_low_f16(_h));
            float32x4_t _x1 = vcvt_f32_f16(vget_high_f16(_h));
            _x0 = vdivq_f32(_x0, vaddq_f32(_one, exp_ps(vnegq_f32(_x0))));
            _x1 = vdivq_f32(_x1, vaddq_f32(_one, exp_ps(vnegq_f32(_x1))));
            vst1q_f16(p, vcombine_f16(vcvt_f16_f32(_x0), vcvt_f16_f32(_x1)));
        }
        // 4 values per iteration
        for (; i + 3 < size; i += 4)
        {
            __fp16* p = data + i;
            float32x4_t _x = vcvt_f32_f16(vld1_f16(p));
            _x = vdivq_f32(_x, vaddq_f32(_one, exp_ps(vnegq_f32(_x))));
            vst1_f16(p, vcvt_f16_f32(_x));
        }
        // scalar tail
        for (; i < size; i++)
        {
            const float x = (float)data[i];
            const float y = x / (1.f + expf(-x));
            data[i] = (__fp16)y;
        }
    }

    return 0;
}

// Swish for blobs stored as fp16 with the vector arithmetic carried out in fp16 as well:
// x / (1 + exp(-x)) is evaluated with the fp16 NEON intrinsics and exp_ps_f16, in place.
// Always returns 0.
int Swish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    // element-wise op: a packed channel is handled as one flat run of values
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* data = bottom_top_blob.channel(q);

        const float16x8_t _one = vdupq_n_f16(1.f);

        int i = 0;
        // 32 values per iteration
        for (; i + 31 < size; i += 32)
        {
            __fp16* p = data + i;
            const float16x8_t _x0 = vld1q_f16(p);
            const float16x8_t _x1 = vld1q_f16(p + 8);
            const float16x8_t _x2 = vld1q_f16(p + 16);
            const float16x8_t _x3 = vld1q_f16(p + 24);
            const float16x8_t _y0 = vdivq_f16(_x0, vaddq_f16(_one, exp_ps_f16(vnegq_f16(_x0))));
            const float16x8_t _y1 = vdivq_f16(_x1, vaddq_f16(_one, exp_ps_f16(vnegq_f16(_x1))));
            const float16x8_t _y2 = vdivq_f16(_x2, vaddq_f16(_one, exp_ps_f16(vnegq_f16(_x2))));
            const float16x8_t _y3 = vdivq_f16(_x3, vaddq_f16(_one, exp_ps_f16(vnegq_f16(_x3))));
            vst1q_f16(p, _y0);
            vst1q_f16(p + 8, _y1);
            vst1q_f16(p + 16, _y2);
            vst1q_f16(p + 24, _y3);
        }
        // 16 values per iteration
        for (; i + 15 < size; i += 16)
        {
            __fp16* p = data + i;
            const float16x8_t _x0 = vld1q_f16(p);
            const float16x8_t _x1 = vld1q_f16(p + 8);
            const float16x8_t _y0 = vdivq_f16(_x0, vaddq_f16(_one, exp_ps_f16(vnegq_f16(_x0))));
            const float16x8_t _y1 = vdivq_f16(_x1, vaddq_f16(_one, exp_ps_f16(vnegq_f16(_x1))));
            vst1q_f16(p, _y0);
            vst1q_f16(p + 8, _y1);
        }
        // 8 values per iteration
        for (; i + 7 < size; i += 8)
        {
            __fp16* p = data + i;
            const float16x8_t _x = vld1q_f16(p);
            vst1q_f16(p, vdivq_f16(_x, vaddq_f16(_one, exp_ps_f16(vnegq_f16(_x)))));
        }
        // 4 values per iteration
        for (; i + 3 < size; i += 4)
        {
            __fp16* p = data + i;
            const float16x4_t _x = vld1_f16(p);
            vst1_f16(p, vdiv_f16(_x, vadd_f16(vget_low_f16(_one), exp_ps_f16(vneg_f16(_x)))));
        }
        // scalar tail; the denominator is rounded to fp16 before the division
        for (; i < size; i++)
        {
            const __fp16 x = data[i];
            data[i] = x / (__fp16)(1.f + expf(-x));
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/tanh_arm.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "tanh_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#endif // __ARM_NEON

#include "arm_usability.h"
#include "cpu.h"

namespace ncnn {

// Sets the capability flags of the ARM tanh layer according to the build configuration.
TanH_arm::TanH_arm()
{
#if __ARM_NEON
    // forward_inplace has a dedicated elempack == 4 path when NEON is available
    support_packing = true;
#if NCNN_ARM82
    // fp16 storage is only enabled when the running cpu reports asimdhp support;
    // forward_inplace checks this flag before taking the fp16 path
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif
#endif // __ARM_NEON

#if NCNN_BF16
    // the bf16 kernel has a scalar fallback and is compiled with or without NEON
    support_bf16_storage = true;
#endif
}

// Applies tanh to every value of the blob in place.
//
// Blobs with 16-bit elements are forwarded to the fp16 or bf16 kernels when the matching
// storage option is enabled; everything else is processed here as fp32.
// Always returns 0 on the fp32 path.
int TanH_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int elembits = bottom_top_blob.elembits();

#if NCNN_ARM82
    if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
    {
        return opt.use_fp16_arithmetic ? forward_inplace_fp16sa(bottom_top_blob, opt)
               : forward_inplace_fp16s(bottom_top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (opt.use_bf16_storage && elembits == 16)
        return forward_inplace_bf16s(bottom_top_blob, opt);
#endif

    const int channels = bottom_top_blob.c;
    // number of elements per channel; each element holds elempack floats
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;
    const int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
    if (elempack == 4)
    {
        // one element is exactly one 4-lane vector
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* data = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                float* p = data + i * 4;
                vst1q_f32(p, tanh_ps(vld1q_f32(p)));
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* data = bottom_top_blob.channel(q);

        int i = 0;
#if __ARM_NEON
        // four floats at a time while at least four remain
        for (; i + 3 < size; i += 4)
        {
            float* p = data + i;
            vst1q_f32(p, tanh_ps(vld1q_f32(p)));
        }
#endif // __ARM_NEON
        // scalar tail (or the whole channel without NEON)
        for (; i < size; i++)
        {
            data[i] = tanhf(data[i]);
        }
    }

    return 0;
}

#if NCNN_BF16
// Tanh for blobs stored as bfloat16: each value is widened to fp32, tanh is evaluated in
// fp32, and the result is narrowed back to bfloat16 in place. Always returns 0.
int TanH_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    // number of elements per channel; each element holds elempack values
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;
    const int elempack = bottom_top_blob.elempack;

#if __ARM_NEON
    if (elempack == 4)
    {
        // one element is exactly one 4-lane vector
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned short* data = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                unsigned short* p = data + i * 4;
                const float32x4_t _y = tanh_ps(bfloat2float(vld1_u16(p)));
                vst1_u16(p, float2bfloat(_y));
            }
        }

        return 0;
    }
#endif // __ARM_NEON

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        unsigned short* data = bottom_top_blob.channel(q);

        int i = 0;
#if __ARM_NEON
        // four values at a time while at least four remain
        for (; i + 3 < size; i += 4)
        {
            unsigned short* p = data + i;
            const float32x4_t _y = tanh_ps(bfloat2float(vld1_u16(p)));
            vst1_u16(p, float2bfloat(_y));
        }
#endif // __ARM_NEON
        // scalar tail (or the whole channel without NEON)
        for (; i < size; i++)
        {
            const float x = bfloat16_to_float32(data[i]);
            data[i] = float32_to_bfloat16(tanhf(x));
        }
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/tanh_arm.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_TANH_ARM_H
#define LAYER_TANH_ARM_H

#include "tanh.h"

namespace ncnn {

// ARM specialization of the TanH layer.
// forward_inplace() is the public entry point; the protected helpers are the
// per-storage-type implementations and only exist when the matching build
// option (NCNN_ARM82 / NCNN_BF16) is enabled.
class TanH_arm : public TanH
{
public:
    TanH_arm();

    // Applies tanh to every element of bottom_top_blob in place.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 storage, arithmetic carried out in fp32
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
    // fp16 storage, arithmetic carried out in fp16
    int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 storage, arithmetic carried out in fp32
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_TANH_ARM_H


================================================
FILE: src/layer/arm/tanh_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "tanh_arm.h"

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#include "neon_mathfun.h"
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "neon_mathfun_fp16s.h"
#endif
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
int TanH_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    // In-place tanh for fp16 storage: values are widened to fp32 for the
    // computation and narrowed back to fp16 when stored.
    const int channels = bottom_top_blob.c;
    const int elempack = bottom_top_blob.elempack;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

    // elempack == 4: each of the `size` elements is one full 4-lane vector, no tail.
    // otherwise: vectorize size / 4 groups and finish the remainder one value at a time.
    const bool packed4 = (elempack == 4);
    const int vec_count = packed4 ? size : size / 4;
    const int tail_count = packed4 ? 0 : size % 4;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* p = bottom_top_blob.channel(q);

        for (int k = 0; k < vec_count; k++)
        {
            float32x4_t _v = vcvt_f32_f16(vld1_f16(p));
            _v = tanh_ps(_v);
            vst1_f16(p, vcvt_f16_f32(_v));

            p += 4;
        }
        for (int k = 0; k < tail_count; k++)
        {
            float v = (float)*p;
            v = tanhf(v);
            *p = (__fp16)v;
            p++;
        }
    }

    return 0;
}

int TanH_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const
{
    // In-place tanh for fp16 storage with the computation itself done in fp16.
    const int channels = bottom_top_blob.c;
    const int elempack = bottom_top_blob.elempack;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

    if (elempack == 8)
    {
        // every element is one full 8-lane half vector
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            __fp16* p = bottom_top_blob.channel(q);

            for (int k = 0; k < size; k++)
            {
                float16x8_t _v = vld1q_f16(p);
                _v = tanh_ps_f16(_v);
                vst1q_f16(p, _v);

                p += 8;
            }
        }

        return 0;
    }

    // elempack == 4: each of the `size` elements is one full 4-lane vector, no tail.
    // otherwise: vectorize size / 4 groups and finish the remainder one value at a time.
    const bool packed4 = (elempack == 4);
    const int vec_count = packed4 ? size : size / 4;
    const int tail_count = packed4 ? 0 : size % 4;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* p = bottom_top_blob.channel(q);

        for (int k = 0; k < vec_count; k++)
        {
            float16x4_t _v = vld1_f16(p);
            _v = tanh_ps_f16(_v);
            vst1_f16(p, _v);

            p += 4;
        }
        for (int k = 0; k < tail_count; k++)
        {
            __fp16 v = *p;
            v = tanhf(v);
            *p = v;
            p++;
        }
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/arm/unaryop_arm.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "unaryop_arm.h"

// #include <fenv.h>
#include <float.h>

#if __ARM_NEON
#include <arm_neon.h>
#include "neon_mathfun.h"
#endif // __ARM_NEON

#include "arm_usability.h"

#include "cpu.h"

namespace ncnn {

UnaryOp_arm::UnaryOp_arm()
{
    // Advertise the storage/layout capabilities of this implementation.
#if NCNN_BF16
    support_bf16_storage = true;
#endif // NCNN_BF16

#if __ARM_NEON
#if NCNN_ARM82
    // fp16 storage is only offered when the running cpu reports asimdhp
    support_fp16_storage = cpu_support_arm_asimdhp();
#endif // NCNN_ARM82
    support_packing = true;
#endif // __ARM_NEON
}

// Applies the element-wise functor Op to every fp32 value of `a`, in place.
// Op must provide func(float) and, when NEON is available, func_pack4(float32x4_t).
// Always returns 0.
template<typename Op>
static int unary_op_inplace(Mat& a, const Option& opt)
{
    Op op;

    const int channels = a.c;
    // scalar count per channel; the packed lanes are folded in so that every
    // elempack goes through the same loops
    const int size = a.w * a.h * a.d * a.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* p = a.channel(q);
        int left = size;

#if __ARM_NEON
        // two vectors per iteration
        while (left >= 8)
        {
            float32x4_t _a = vld1q_f32(p);
            float32x4_t _b = vld1q_f32(p + 4);
            _a = op.func_pack4(_a);
            _b = op.func_pack4(_b);
            vst1q_f32(p, _a);
            vst1q_f32(p + 4, _b);
            p += 8;
            left -= 8;
        }
        // one vector per iteration
        while (left >= 4)
        {
            float32x4_t _a = vld1q_f32(p);
            _a = op.func_pack4(_a);
            vst1q_f32(p, _a);
            p += 4;
            left -= 4;
        }
#endif // __ARM_NEON
        // scalar tail
        while (left > 0)
        {
            *p = op.func(*p);
            p++;
            left--;
        }
    }

    return 0;
}

namespace UnaryOp_arm_functor {

// Absolute value.
struct unary_op_abs
{
    // scalar path
    float func(const float& v) const
    {
        const float magnitude = fabsf(v);
        return magnitude;
    }
#if __ARM_NEON
    // 4-lane path
    float32x4_t func_pack4(const float32x4_t& v) const
    {
        float32x4_t _abs = vabsq_f32(v);
        return _abs;
    }
#endif // __ARM_NEON
};

// Sign flip.
struct unary_op_neg
{
    // scalar path
    float func(const float& v) const
    {
        const float flipped = -v;
        return flipped;
    }
#if __ARM_NEON
    // 4-lane path
    float32x4_t func_pack4(const float32x4_t& v) const
    {
        float32x4_t _neg = vnegq_f32(v);
        return _neg;
    }
#endif // __ARM_NEON
};

// Round toward negative infinity.
struct unary_op_floor
{
    // scalar path
    float func(const float& x) const
    {
        return (float)floorf(x);
    }
#if __ARM_NEON
    // 4-lane path
    float32x4_t func_pack4(const float32x4_t& x) const
    {
#if __aarch64__
        return vrndmq_f32(x);
#else  // __aarch64__
        // armv7 has no vector round-to-integral instruction, so go through int32:
        // truncate toward zero, then subtract one in the lanes where truncation
        // moved the value upwards (negative non-integers). _mask is all-ones (-1)
        // in exactly those lanes.
        int32x4_t _xi = vcvtq_s32_f32(x);
        uint32x4_t _mask = vcgtq_f32(vcvtq_f32_s32(_xi), x);
        float32x4_t _floor = vcvtq_f32_s32(vaddq_s32(_xi, vreinterpretq_s32_u32(_mask)));

        // The int32 round trip is only valid for small magnitudes: the conversion
        // saturates beyond the int32 range and turns NaN into 0. Every float with
        // |x| >= 2^23 is already an integer, so those lanes - and NaN, which fails
        // the comparison - keep the input value, matching floorf() in func().
        uint32x4_t _small = vcaltq_f32(x, vdupq_n_f32(8388608.f));
        return vbslq_f32(_small, _floor, x);
#endif // __aarch64__
    }
#endif // __ARM_NEON
};

// Round toward positive infinity.
struct unary_op_ceil
{
    // scalar path
    float func(const float& x) const
    {
        return (float)ceilf(x);
    }
#if __ARM_NEON
    // 4-lane path
    float32x4_t func_pack4(const float32x4_t& x) const
    {
#if __aarch64__
        return vrndpq_f32(x);
#else  // __aarch64__
        // armv7 has no vector round-to-integral instruction, so go through int32:
        // truncate toward zero, then add one in the lanes where truncation moved
        // the value downwards (positive non-integers). _mask is all-ones (-1) in
        // exactly those lanes, hence the subtraction.
        int32x4_t _xi = vcvtq_s32_f32(x);
        uint32x4_t _mask = vcgtq_f32(x, vcvtq_f32_s32(_xi));
        float32x4_t _ceil = vcvtq_f32_s32(vsubq_s32(_xi, vreinterpretq_s32_u32(_mask)));

        // The int32 round trip is only valid for small magnitudes: the conversion
        // saturates beyond the int32 range and turns NaN into 0. Every float with
        // |x| >= 2^23 is already an integer, so those lanes - and NaN, which fails
        // the comparison - keep the input value, matching ceilf() in func().
        uint32x4_t _small = vcaltq_f32(x, vdupq_n_f32(8388608.f));
        return vbslq_f32(_small, _ceil, x);
#endif // __aarch64__
    }
#endif // __ARM_NEON
};

// Element squared.
struct unary_op_square
{
    // scalar path
    float func(const float& v) const
    {
        const float squared = v * v;
        return squared;
    }
#if __ARM_NEON
    // 4-lane path
    float32x4_t func_pack4(const float32x4_t& v) const
    {
        float32x4_t _sq = vmulq_f32(v, v);
        return _sq;
    }
#endif // __ARM_NEON
};

// Square root.
struct unary_op_sqrt
{
    // scalar path
    float func(const float& x) const
    {
        return (float)sqrtf(x);
    }
#if __ARM_NEON
    // 4-lane path
    float32x4_t func_pack4(const float32x4_t& x) const
    {
#if __aarch64__
        return vsqrtq_f32(x);
#else
        // armv7 has no vector sqrt: compute x * rsqrt(x) from the hardware
        // estimate refined by one Newton-Raphson step.
        float32x4_t _reciprocal = vrsqrteq_f32(x);
        _reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, _reciprocal), _reciprocal), _reciprocal);
        // _reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, _reciprocal), _reciprocal), _reciprocal);
        float32x4_t _sqrt = vmulq_f32(x, _reciprocal);

        // For x == 0 the estimate is inf and for x == +inf it is 0, so the
        // x * estimate products above are 0 * inf = NaN. sqrt(x) == x for both
        // inputs, so pass x through in those lanes, matching sqrtf() in func().
        uint32x4_t _is_zero = vceqq_f32(x, vdupq_n_f32(0.f));
        uint32x4_t _is_inf = vcgtq_f32(x, vdupq_n_f32(FLT_MAX));
        return vbslq_f32(vorrq_u32(_is_zero, _is_inf), x, _sqrt);
#endif
    }
#endif // __ARM_NEON
};

// Reciprocal square root.
struct unary_op_rsqrt
{
    // scalar path
    float func(const float& x) const
    {
        return (float)(1.f / sqrtf(x));
    }
#if __ARM_NEON
    // 4-lane path: hardware estimate refined by one Newton-Raphson step
    float32x4_t func_pack4(const float32x4_t& x) const
    {
        float32x4_t _estimate = vrsqrteq_f32(x);
        float32x4_t _reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, _estimate), _estimate), _estimate);
        // _reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, _reciprocal), _reciprocal), _reciprocal);

        // For x == 0 the estimate is inf and for x == +inf it is 0, so the
        // x * estimate product in the refinement is 0 * inf = NaN. The raw
        // estimate is already the exact answer for those inputs, so keep it in
        // those lanes, matching 1.f / sqrtf(x) in func().
        uint32x4_t _is_zero = vceqq_f32(x, vdupq_n_f32(0.f));
        uint32x4_t _is_inf = vcgtq_f32(x, vdupq_n_f32(FLT_MAX));
        return vbslq_f32(vorrq_u32(_is_zero, _is_inf), _estimate, _reciprocal);
    }
#endif // __ARM_NEON
};

// Natural exponential.
struct unary_op_exp
{
    // scalar path
    float func(const float& v) const
    {
        const float result = expf(v);
        return result;
    }
#if __ARM_NEON
    // 4-lane path
    float32x4_t func_pack4(const float32x4_t& v) const
    {
        float32x4_t _exp = exp_ps(v);
        return _exp;
    }
#endif // __ARM_NEON
};

// Natural logarithm.
struct unary_op_log
{
    // scalar path
    float func(const float& v) const
    {
        const float result = logf(v);
        return result;
    }
#if __ARM_NEON
    // 4-lane path
    float32x4_t func_pack4(const float32x4_t& v) const
    {
        float32x4_t _log = log_ps(v);
        return _log;
    }
#endif // __ARM_NEON
};

// Sine.
struct unary_op_sin
{
    // scalar path
    float func(const float& v) const
    {
        const float result = sinf(v);
        return result;
    }
#if __ARM_NEON
    // 4-lane path
    float32x4_t func_pack4(const float32x4_t& v) const
    {
        float32x4_t _sin = sin_ps(v);
        return _sin;
    }
#endif // __ARM_NEON
};

// Cosine.
struct unary_op_cos
{
    // scalar path
    float func(const float& v) const
    {
        const float result = cosf(v);
        return result;
    }
#if __ARM_NEON
    // 4-lane path
    float32x4_t func_pack4(const float32x4_t& v) const
    {
        float32x4_t _cos = cos_ps(v);
        return _cos;
    }
#endif // __ARM_NEON
};

// Tangent.
struct unary_op_tan
{
    // scalar path
    float func(const float& v) const
    {
        const float result = tanf(v);
        return result;
    }
#if __ARM_NEON
    // 4-lane path
    float32x4_t func_pack4(const float32x4_t& v) const
    {
        float32x4_t _tan = tan_ps(v);
        return _tan;
    }
#endif // __ARM_NEON
};

// Arc sine.
struct unary_op_asin
{
    // scalar path
    float func(const float& v) const
    {
        const float result = asinf(v);
        return result;
    }
#if __ARM_NEON
    // 4-lane path
    float32x4_t func_pack4(const float32x4_t& v) const
    {
        float32x4_t _asin = asin_ps(v);
        return _asin;
    }
#endif // __ARM_NEON
};

// Arc cosine.
struct unary_op_acos
{
    // scalar path
    float func(const float& v) const
    {
        const float result = acosf(v);
        return result;
    }
#if __ARM_NEON
    // 4-lane path
    float32x4_t func_pack4(const float32x4_t& v) const
    {
        float32x4_t _acos = acos_ps(v);
        return _acos;
    }
#endif // __ARM_NEON
};

// Arc tangent.
struct unary_op_atan
{
    // scalar path
    float func(const float& v) const
    {
        const float result = atanf(v);
        return result;
    }
#if __ARM_NEON
    // 4-lane path: no vector atan is used here, so the lanes are spilled to
    // memory and evaluated one by one with atanf.
    float32x4_t func_pack4(const float32x4_t& v) const
    {
        // TODO neon optimize
        float lanes[4];
        vst1q_f32(lanes, v);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = atanf(lanes[k]);
        }
        return vld1q_f32(lanes);
    }
#endif // __ARM_NEON
};

// Reciprocal, 1 / x.
struct unary_op_reciprocal
{
    // scalar path
    float func(const float& v) const
    {
        const float inverse = 1.f / v;
        return inverse;
    }
#if __ARM_NEON
    // 4-lane path: hardware estimate refined by one Newton-Raphson step
    float32x4_t func_pack4(const float32x4_t& v) const
    {
        const float32x4_t _estimate = vrecpeq_f32(v);
        const float32x4_t _correction = vrecpsq_f32(v, _estimate);
        // a second refinement step is intentionally not applied
        return vmulq_f32(_correction, _estimate);
    }
#endif // __ARM_NEON
};

// Hyperbolic tangent.
struct unary_op_tanh
{
    // scalar path
    float func(const float& v) const
    {
        const float result = tanhf(v);
        return result;
    }
#if __ARM_NEON
    // 4-lane path
    float32x4_t func_pack4(const float32x4_t& v) const
    {
        float32x4_t _tanh = tanh_ps(v);
        return _tanh;
    }
#endif // __ARM_NEON
};

// Base-10 logarithm.
struct unary_op_log10
{
    // scalar path
    float func(const float& v) const
    {
        const float result = log10f(v);
        return result;
    }
#if __ARM_NEON
    // 4-lane path: natural log scaled by the constant 0.434294481903
    float32x4_t func_pack4(const float32x4_t& v) const
    {
        const float32x4_t _ln = log_ps(v);
        const float32x4_t _scale = vdupq_n_f32(0.434294481903);
        return vmulq_f32(_ln, _scale);
    }
#endif // __ARM_NEON
};

// Round to the nearest integer, ties going to the even neighbour.
// Three implementations are selected at compile time:
//   - aarch64 vector path: vrndnq_f32
//   - inline-asm path: add and then subtract the constant 12582912.f (1.5 * 2^23)
//   - portable fallback: nearbyintf, with the rounding mode forced to
//     FE_TONEAREST when that macro is defined
struct unary_op_round
{
    // scalar path
    float func(const float& x) const
    {
        // round to nearest even
#if NCNN_GNU_INLINE_ASM && __ARM_NEON
        // return (x + 12582912.f) - 12582912.f;
        // NOTE(review): written as asm volatile rather than the C expression above,
        // presumably so the compiler cannot simplify the add/sub pair away - confirm.
        // NOTE(review): the magic-constant trick presumably only holds while |x| is
        // small enough for x + magic to stay exactly representable - confirm the bound.
        float y;
        const float magic = 12582912.f;
#if __aarch64__
        asm volatile(
            "fadd   %s0, %s1, %s2   \n"
            "fsub   %s0, %s0, %s2   \n"
            : "=w"(y)
            : "w"(x), "w"(magic)
            :);
#else
        asm volatile(
            "vadd.f32   %0, %1, %2  \n"
            "vsub.f32   %0, %0, %2  \n"
            : "=t"(y)
            : "t"(x), "t"(magic)
            :);
#endif
        return y;
#else
        // portable fallback: temporarily force round-to-nearest when the
        // rounding-mode macros are available, then restore the previous mode
#ifdef FE_TONEAREST
        int old_rm = fegetround();
        fesetround(FE_TONEAREST);
#endif
        float y = nearbyintf(x);
#ifdef FE_TONEAREST
        fesetround(old_rm);
#endif
        return y;
#endif
    }
#if __ARM_NEON
#if __aarch64__
    // 4-lane path, aarch64: single round-to-nearest-even instruction
    float32x4_t func_pack4(const float32x4_t& x) const
    {
        return vrndnq_f32(x);
    }
#else
    // 4-lane path, armv7: same magic-constant add/sub as the scalar path when
    // inline asm is enabled, otherwise the lanes are rounded one by one
    float32x4_t func_pack4(const float32x4_t& x) const
    {
#if NCNN_GNU_INLINE_ASM
        float32x4_t y;
        float32x4_t _magic = vdupq_n_f32(12582912.f); // 1.5 * 2^23
        asm volatile(
            "vadd.f32   %q0, %q1, %q2   \n"
            "vsub.f32   %q0, %q0, %q2   \n"
            : "=w"(y)
            : "w"(x), "w"(_magic)
            :);
        return y;
#else
        // spill the lanes, round each with nearbyintf, reload
        float tmp[4];
        vst1q_f32(tmp, x);
#ifdef FE_TONEAREST
        int old_rm = fegetround();
        fesetround(FE_TONEAREST);
#endif
        tmp[0] = nearbyintf(tmp[0]);
        tmp[1] = nearbyintf(tmp[1]);
        tmp[2] = nearbyintf(tmp[2]);
        tmp[3] = nearbyintf(tmp[3]);
#ifdef FE_TONEAREST
        fesetround(old_rm);
#endif
        float32x4_t y = vld1q_f32(tmp);
        return y;
#endif
    }
#endif
#endif // __ARM_NEON
};

// Round toward zero.
struct unary_op_trunc
{
    // scalar path
    float func(const float& x) const
    {
        return (float)truncf(x);
    }
#if __ARM_NEON
    // 4-lane path
    float32x4_t func_pack4(const float32x4_t& x) const
    {
#if __aarch64__
        return vrndq_f32(x);
#else
        // armv7 has no vector round-to-integral instruction, so truncate by
        // converting to int32 and back.
        float32x4_t _trunc = vcvtq_f32_s32(vcvtq_s32_f32(x));

        // The int32 round trip is only valid for small magnitudes: the conversion
        // saturates beyond the int32 range and turns NaN into 0. Every float with
        // |x| >= 2^23 is already an integer, so those lanes - and NaN, which fails
        // the comparison - keep the input value, matching truncf() in func().
        uint32x4_t _small = vcaltq_f32(x, vdupq_n_f32(8388608.f));
        return vbslq_f32(_small, _trunc, x);
#endif
    }
#endif // __ARM_NEON
};

} // namespace UnaryOp_arm_functor

int UnaryOp_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    // 16-bit blobs are handed to the fp16 / bf16 implementations when those are
    // compiled in and enabled; everything else takes the fp32 kernels below.
    const int elembits = bottom_top_blob.elembits();

#if NCNN_ARM82
    if (elembits == 16 && opt.use_fp16_storage && support_fp16_storage)
    {
        return forward_inplace_fp16s(bottom_top_blob, opt);
    }
#endif

#if NCNN_BF16
    if (elembits == 16 && opt.use_bf16_storage)
    {
        return forward_inplace_bf16s(bottom_top_blob, opt);
    }
#endif

    using namespace UnaryOp_arm_functor;

    // one fp32 kernel instantiation per operation; unknown op types are a no-op
    switch (op_type)
    {
    case Operation_ABS:
        return unary_op_inplace<unary_op_abs>(bottom_top_blob, opt);
    case Operation_NEG:
        return unary_op_inplace<unary_op_neg>(bottom_top_blob, opt);
    case Operation_FLOOR:
        return unary_op_inplace<unary_op_floor>(bottom_top_blob, opt);
    case Operation_CEIL:
        return unary_op_inplace<unary_op_ceil>(bottom_top_blob, opt);
    case Operation_SQUARE:
        return unary_op_inplace<unary_op_square>(bottom_top_blob, opt);
    case Operation_SQRT:
        return unary_op_inplace<unary_op_sqrt>(bottom_top_blob, opt);
    case Operation_RSQRT:
        return unary_op_inplace<unary_op_rsqrt>(bottom_top_blob, opt);
    case Operation_EXP:
        return unary_op_inplace<unary_op_exp>(bottom_top_blob, opt);
    case Operation_LOG:
        return unary_op_inplace<unary_op_log>(bottom_top_blob, opt);
    case Operation_SIN:
        return unary_op_inplace<unary_op_sin>(bottom_top_blob, opt);
    case Operation_COS:
        return unary_op_inplace<unary_op_cos>(bottom_top_blob, opt);
    case Operation_TAN:
        return unary_op_inplace<unary_op_tan>(bottom_top_blob, opt);
    case Operation_ASIN:
        return unary_op_inplace<unary_op_asin>(bottom_top_blob, opt);
    case Operation_ACOS:
        return unary_op_inplace<unary_op_acos>(bottom_top_blob, opt);
    case Operation_ATAN:
        return unary_op_inplace<unary_op_atan>(bottom_top_blob, opt);
    case Operation_RECIPROCAL:
        return unary_op_inplace<unary_op_reciprocal>(bottom_top_blob, opt);
    case Operation_TANH:
        return unary_op_inplace<unary_op_tanh>(bottom_top_blob, opt);
    case Operation_LOG10:
        return unary_op_inplace<unary_op_log10>(bottom_top_blob, opt);
    case Operation_ROUND:
        return unary_op_inplace<unary_op_round>(bottom_top_blob, opt);
    case Operation_TRUNC:
        return unary_op_inplace<unary_op_trunc>(bottom_top_blob, opt);
    default:
        break;
    }

    return 0;
}

#if NCNN_BF16
// Applies the element-wise functor Op to every bf16 value of `a`, in place.
// Values are widened to fp32, run through the fp32 functor and narrowed back.
// Always returns 0.
template<typename Op>
static int unary_op_inplace_bf16s(Mat& a, const Option& opt)
{
    Op op;

    const int channels = a.c;
    // scalar count per channel; the packed lanes are folded in so that every
    // elempack goes through the same loops
    const int size = a.w * a.h * a.d * a.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        unsigned short* p = a.channel(q);
        int left = size;

#if __ARM_NEON
#if __aarch64__
        // sixteen values per iteration
        while (left >= 16)
        {
            uint16x8_t _lo = vld1q_u16(p);
            uint16x8_t _hi = vld1q_u16(p + 8);
            float32x4_t _f0 = bfloat2float(vget_low_u16(_lo));
            float32x4_t _f1 = bfloat2float(vget_high_u16(_lo));
            float32x4_t _f2 = bfloat2float(vget_low_u16(_hi));
            float32x4_t _f3 = bfloat2float(vget_high_u16(_hi));
            _f0 = op.func_pack4(_f0);
            _f1 = op.func_pack4(_f1);
            _f2 = op.func_pack4(_f2);
            _f3 = op.func_pack4(_f3);
            _lo = vcombine_u16(float2bfloat(_f0), float2bfloat(_f1));
            _hi = vcombine_u16(float2bfloat(_f2), float2bfloat(_f3));
            vst1q_u16(p, _lo);
            vst1q_u16(p + 8, _hi);
            p += 16;
            left -= 16;
        }
#endif // __aarch64__
        // eight values per iteration
        while (left >= 8)
        {
            uint16x8_t _raw = vld1q_u16(p);
            float32x4_t _f0 = bfloat2float(vget_low_u16(_raw));
            float32x4_t _f1 = bfloat2float(vget_high_u16(_raw));
            _f0 = op.func_pack4(_f0);
            _f1 = op.func_pack4(_f1);
            _raw = vcombine_u16(float2bfloat(_f0), float2bfloat(_f1));
            vst1q_u16(p, _raw);
            p += 8;
            left -= 8;
        }
        // four values per iteration
        while (left >= 4)
        {
            float32x4_t _f = bfloat2float(vld1_u16(p));
            _f = op.func_pack4(_f);
            vst1_u16(p, float2bfloat(_f));
            p += 4;
            left -= 4;
        }
#endif // __ARM_NEON
        // scalar tail
        while (left > 0)
        {
            *p = float32_to_bfloat16(op.func(bfloat16_to_float32(*p)));
            p++;
            left--;
        }
    }

    return 0;
}

int UnaryOp_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const
{
    using namespace UnaryOp_arm_functor;

    // one bf16 kernel instantiation per operation, reusing the fp32 functors;
    // unknown op types are a no-op
    switch (op_type)
    {
    case Operation_ABS:
        return unary_op_inplace_bf16s<unary_op_abs>(bottom_top_blob, opt);
    case Operation_NEG:
        return unary_op_inplace_bf16s<unary_op_neg>(bottom_top_blob, opt);
    case Operation_FLOOR:
        return unary_op_inplace_bf16s<unary_op_floor>(bottom_top_blob, opt);
    case Operation_CEIL:
        return unary_op_inplace_bf16s<unary_op_ceil>(bottom_top_blob, opt);
    case Operation_SQUARE:
        return unary_op_inplace_bf16s<unary_op_square>(bottom_top_blob, opt);
    case Operation_SQRT:
        return unary_op_inplace_bf16s<unary_op_sqrt>(bottom_top_blob, opt);
    case Operation_RSQRT:
        return unary_op_inplace_bf16s<unary_op_rsqrt>(bottom_top_blob, opt);
    case Operation_EXP:
        return unary_op_inplace_bf16s<unary_op_exp>(bottom_top_blob, opt);
    case Operation_LOG:
        return unary_op_inplace_bf16s<unary_op_log>(bottom_top_blob, opt);
    case Operation_SIN:
        return unary_op_inplace_bf16s<unary_op_sin>(bottom_top_blob, opt);
    case Operation_COS:
        return unary_op_inplace_bf16s<unary_op_cos>(bottom_top_blob, opt);
    case Operation_TAN:
        return unary_op_inplace_bf16s<unary_op_tan>(bottom_top_blob, opt);
    case Operation_ASIN:
        return unary_op_inplace_bf16s<unary_op_asin>(bottom_top_blob, opt);
    case Operation_ACOS:
        return unary_op_inplace_bf16s<unary_op_acos>(bottom_top_blob, opt);
    case Operation_ATAN:
        return unary_op_inplace_bf16s<unary_op_atan>(bottom_top_blob, opt);
    case Operation_RECIPROCAL:
        return unary_op_inplace_bf16s<unary_op_reciprocal>(bottom_top_blob, opt);
    case Operation_TANH:
        return unary_op_inplace_bf16s<unary_op_tanh>(bottom_top_blob, opt);
    case Operation_LOG10:
        return unary_op_inplace_bf16s<unary_op_log10>(bottom_top_blob, opt);
    case Operation_ROUND:
        return unary_op_inplace_bf16s<unary_op_round>(bottom_top_blob, opt);
    case Operation_TRUNC:
        return unary_op_inplace_bf16s<unary_op_trunc>(bottom_top_blob, opt);
    default:
        break;
    }

    return 0;
}
#endif // NCNN_BF16

} // namespace ncnn


================================================
FILE: src/layer/arm/unaryop_arm.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_UNARYOP_ARM_H
#define LAYER_UNARYOP_ARM_H

#include "unaryop.h"

namespace ncnn {

// ARM specialization of the UnaryOp layer.
// forward_inplace() is the public entry point; the protected helpers are the
// per-storage-type implementations and only exist when the matching build
// option (NCNN_ARM82 / NCNN_BF16) is enabled.
class UnaryOp_arm : public UnaryOp
{
public:
    UnaryOp_arm();

    // Applies the configured unary operation to every element of bottom_top_blob in place.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

protected:
#if NCNN_ARM82
    // fp16 storage implementation
    int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
#if NCNN_BF16
    // bf16 storage implementation
    int forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn

#endif // LAYER_UNARYOP_ARM_H


================================================
FILE: src/layer/arm/unaryop_arm_asimdhp.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "unaryop_arm.h"

// #include <fenv.h>
#include <float.h>

#if __ARM_NEON
#include <arm_neon.h>
#include "arm_usability.h"
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "neon_mathfun_fp16s.h"
#endif
#endif // __ARM_NEON

namespace ncnn {

#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Applies the element-wise functor Op to every fp16 value of `a`, in place.
// Op must provide func(__fp16), func_pack4(float16x4_t) and func_pack8(float16x8_t).
// Always returns 0.
template<typename Op>
static int unary_op_inplace_fp16s(Mat& a, const Option& opt)
{
    Op op;

    const int channels = a.c;
    // scalar count per channel; the packed lanes are folded in so that every
    // elempack goes through the same loops
    const int size = a.w * a.h * a.d * a.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        __fp16* p = a.channel(q);
        int left = size;

        // sixteen values per iteration
        while (left >= 16)
        {
            float16x8_t _a = vld1q_f16(p);
            float16x8_t _b = vld1q_f16(p + 8);
            _a = op.func_pack8(_a);
            _b = op.func_pack8(_b);
            vst1q_f16(p, _a);
            vst1q_f16(p + 8, _b);
            p += 16;
            left -= 16;
        }
        // eight values per iteration
        while (left >= 8)
        {
            float16x8_t _a = vld1q_f16(p);
            _a = op.func_pack8(_a);
            vst1q_f16(p, _a);
            p += 8;
            left -= 8;
        }
        // four values per iteration
        while (left >= 4)
        {
            float16x4_t _a = vld1_f16(p);
            _a = op.func_pack4(_a);
            vst1_f16(p, _a);
            p += 4;
            left -= 4;
        }
        // scalar tail
        while (left > 0)
        {
            *p = op.func(*p);
            p++;
            left--;
        }
    }

    return 0;
}

namespace UnaryOp_arm_functor {

// Absolute value, fp16.
struct unary_op_abs_fp16s
{
    // scalar path: evaluated through fabsf and narrowed back to half precision
    __fp16 func(const __fp16& v) const
    {
        const float wide = v;
        return (__fp16)fabsf(wide);
    }
    // 4-lane path
    float16x4_t func_pack4(const float16x4_t& v) const
    {
        float16x4_t _abs = vabs_f16(v);
        return _abs;
    }
    // 8-lane path
    float16x8_t func_pack8(const float16x8_t& v) const
    {
        float16x8_t _abs = vabsq_f16(v);
        return _abs;
    }
};

// Sign flip, fp16.
struct unary_op_neg_fp16s
{
    // scalar path
    __fp16 func(const __fp16& v) const
    {
        return -v;
    }
    // 4-lane path
    float16x4_t func_pack4(const float16x4_t& v) const
    {
        float16x4_t _neg = vneg_f16(v);
        return _neg;
    }
    // 8-lane path
    float16x8_t func_pack8(const float16x8_t& v) const
    {
        float16x8_t _neg = vnegq_f16(v);
        return _neg;
    }
};

// Round toward negative infinity, fp16.
struct unary_op_floor_fp16s
{
    // scalar path
    __fp16 func(const __fp16& x) const
    {
        return (__fp16)floorf(x);
    }
    // 4-lane path.
    // Uses the round-to-integral instruction directly instead of converting
    // through int16: the int16 round trip saturates at +-32767 although finite
    // fp16 values reach 65504, and it maps NaN / inf to integers, so the vector
    // lanes disagreed with floorf() in func() for those inputs.
    float16x4_t func_pack4(const float16x4_t& x) const
    {
        return vrndm_f16(x);
    }
    // 8-lane path, same reasoning as func_pack4
    float16x8_t func_pack8(const float16x8_t& x) const
    {
        return vrndmq_f16(x);
    }
};

// Round toward positive infinity, fp16.
struct unary_op_ceil_fp16s
{
    // scalar path
    __fp16 func(const __fp16& x) const
    {
        return (__fp16)ceilf(x);
    }
    // 4-lane path.
    // Uses the round-to-integral instruction directly instead of converting
    // through int16: the int16 round trip saturates at +-32767 although finite
    // fp16 values reach 65504, and it maps NaN / inf to integers, so the vector
    // lanes disagreed with ceilf() in func() for those inputs.
    float16x4_t func_pack4(const float16x4_t& x) const
    {
        return vrndp_f16(x);
    }
    // 8-lane path, same reasoning as func_pack4
    float16x8_t func_pack8(const float16x8_t& x) const
    {
        return vrndpq_f16(x);
    }
};

// Element squared, fp16.
struct unary_op_square_fp16s
{
    // scalar path
    __fp16 func(const __fp16& v) const
    {
        return v * v;
    }
    // 4-lane path
    float16x4_t func_pack4(const float16x4_t& v) const
    {
        float16x4_t _sq = vmul_f16(v, v);
        return _sq;
    }
    // 8-lane path
    float16x8_t func_pack8(const float16x8_t& v) const
    {
        float16x8_t _sq = vmulq_f16(v, v);
        return _sq;
    }
};

// Square root, fp16.
struct unary_op_sqrt_fp16s
{
    // scalar path: evaluated through sqrtf and narrowed back to half precision
    __fp16 func(const __fp16& v) const
    {
        const float wide = v;
        return (__fp16)sqrtf(wide);
    }
    // 4-lane path
    float16x4_t func_pack4(const float16x4_t& v) const
    {
        float16x4_t _root = vsqrt_f16(v);
        return _root;
    }
    // 8-lane path
    float16x8_t func_pack8(const float16x8_t& v) const
    {
        float16x8_t _root = vsqrtq_f16(v);
        return _root;
    }
};

// Reciprocal square root, fp16.
struct unary_op_rsqrt_fp16s
{
    // scalar path: the square root is narrowed to half precision before the division
    __fp16 func(const __fp16& v) const
    {
        const __fp16 one = (__fp16)1.f;
        const __fp16 root = (__fp16)sqrtf(v);
        return one / root;
    }
    // 4-lane path: hardware estimate refined by one Newton-Raphson step
    float16x4_t func_pack4(const float16x4_t& v) const
    {
        const float16x4_t _estimate = vrsqrte_f16(v);
        const float16x4_t _correction = vrsqrts_f16(vmul_f16(v, _estimate), _estimate);
        // a second refinement step is intentionally not applied
        return vmul_f16(_correction, _estimate);
    }
    // 8-lane path: hardware estimate refined by one Newton-Raphson step
    float16x8_t func_pack8(const float16x8_t& v) const
    {
        const float16x8_t _estimate = vrsqrteq_f16(v);
        const float16x8_t _correction = vrsqrtsq_f16(vmulq_f16(v, _estimate), _estimate);
        // a second refinement step is intentionally not applied
        return vmulq_f16(_correction, _estimate);
    }
};

// Natural exponential, fp16.
struct unary_op_exp_fp16s
{
    // scalar path: evaluated through expf and narrowed back to half precision
    __fp16 func(const __fp16& v) const
    {
        const float wide = v;
        return (__fp16)expf(wide);
    }
    // 4-lane path
    float16x4_t func_pack4(const float16x4_t& v) const
    {
        float16x4_t _exp = exp_ps_f16(v);
        return _exp;
    }
    // 8-lane path
    float16x8_t func_pack8(const float16x8_t& v) const
    {
        float16x8_t _exp = exp_ps_f16(v);
        return _exp;
    }
};

// Natural logarithm, fp16.
struct unary_op_log_fp16s
{
    // scalar path: evaluated through logf and narrowed back to half precision
    __fp16 func(const __fp16& v) const
    {
        const float wide = v;
        return (__fp16)logf(wide);
    }
    // 4-lane path
    float16x4_t func_pack4(const float16x4_t& v) const
    {
        float16x4_t _log = log_ps_f16(v);
        return _log;
    }
    // 8-lane path
    float16x8_t func_pack8(const float16x8_t& v) const
    {
        float16x8_t _log = log_ps_f16(v);
        return _log;
    }
};

// Sine, fp16.
struct unary_op_sin_fp16s
{
    // scalar path: evaluated through sinf and narrowed back to half precision
    __fp16 func(const __fp16& v) const
    {
        const float wide = v;
        return (__fp16)sinf(wide);
    }
    // 4-lane path
    float16x4_t func_pack4(const float16x4_t& v) const
    {
        float16x4_t _sin = sin_ps_f16(v);
        return _sin;
    }
    // 8-lane path
    float16x8_t func_pack8(const float16x8_t& v) const
    {
        float16x8_t _sin = sin_ps_f16(v);
        return _sin;
    }
};

// Cosine, fp16.
struct unary_op_cos_fp16s
{
    // scalar path: evaluated through cosf and narrowed back to half precision
    __fp16 func(const __fp16& v) const
    {
        const float wide = v;
        return (__fp16)cosf(wide);
    }
    // 4-lane path
    float16x4_t func_pack4(const float16x4_t& v) const
    {
        float16x4_t _cos = cos_ps_f16(v);
        return _cos;
    }
    // 8-lane path
    float16x8_t func_pack8(const float16x8_t& v) const
    {
        float16x8_t _cos = cos_ps_f16(v);
        return _cos;
    }
};

// Tangent, fp16.
struct unary_op_tan_fp16s
{
    // scalar path: evaluated through tanf and narrowed back to half precision
    __fp16 func(const __fp16& v) const
    {
        const float wide = v;
        return (__fp16)tanf(wide);
    }
    // 4-lane path: lanes are spilled to memory and evaluated one by one with tanf
    float16x4_t func_pack4(const float16x4_t& v) const
    {
        // TODO neon optimize
        __fp16 lanes[4];
        vst1_f16(lanes, v);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = tanf(lanes[k]);
        }
        return vld1_f16(lanes);
    }
    // 8-lane path: lanes are spilled to memory and evaluated one by one with tanf
    float16x8_t func_pack8(const float16x8_t& v) const
    {
        // TODO neon optimize
        __fp16 lanes[8];
        vst1q_f16(lanes, v);
        for (int k = 0; k < 8; k++)
        {
            lanes[k] = tanf(lanes[k]);
        }
        return vld1q_f16(lanes);
    }
};

// Arc sine, fp16.
struct unary_op_asin_fp16s
{
    // scalar path: evaluated through asinf and narrowed back to half precision
    __fp16 func(const __fp16& v) const
    {
        const float wide = v;
        return (__fp16)asinf(wide);
    }
    // 4-lane path: lanes are spilled to memory and evaluated one by one with asinf
    float16x4_t func_pack4(const float16x4_t& v) const
    {
        // TODO neon optimize
        __fp16 lanes[4];
        vst1_f16(lanes, v);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = asinf(lanes[k]);
        }
        return vld1_f16(lanes);
    }
    // 8-lane path: lanes are spilled to memory and evaluated one by one with asinf
    float16x8_t func_pack8(const float16x8_t& v) const
    {
        // TODO neon optimize
        __fp16 lanes[8];
        vst1q_f16(lanes, v);
        for (int k = 0; k < 8; k++)
        {
            lanes[k] = asinf(lanes[k]);
        }
        return vld1q_f16(lanes);
    }
};

// Arc cosine, fp16.
struct unary_op_acos_fp16s
{
    // scalar path: evaluated through acosf and narrowed back to half precision
    __fp16 func(const __fp16& v) const
    {
        const float wide = v;
        return (__fp16)acosf(wide);
    }
    // 4-lane path: lanes are spilled to memory and evaluated one by one with acosf
    float16x4_t func_pack4(const float16x4_t& v) const
    {
        // TODO neon optimize
        __fp16 lanes[4];
        vst1_f16(lanes, v);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = acosf(lanes[k]);
        }
        return vld1_f16(lanes);
    }
    // 8-lane path: lanes are spilled to memory and evaluated one by one with acosf
    float16x8_t func_pack8(const float16x8_t& v) const
    {
        // TODO neon optimize
        __fp16 lanes[8];
        vst1q_f16(lanes, v);
        for (int k = 0; k < 8; k++)
        {
            lanes[k] = acosf(lanes[k]);
        }
        return vld1q_f16(lanes);
    }
};

// Arc tangent, fp16.
struct unary_op_atan_fp16s
{
    // scalar path: evaluated through atanf and narrowed back to half precision
    __fp16 func(const __fp16& v) const
    {
        const float wide = v;
        return (__fp16)atanf(wide);
    }
    // 4-lane path: lanes are spilled to memory and evaluated one by one with atanf
    float16x4_t func_pack4(const float16x4_t& v) const
    {
        // TODO neon optimize
        __fp16 lanes[4];
        vst1_f16(lanes, v);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = atanf(lanes[k]);
        }
        return vld1_f16(lanes);
    }
    // 8-lane path: lanes are spilled to memory and evaluated one by one with atanf
    float16x8_t func_pack8(const float16x8_t& v) const
    {
        // TODO neon optimize
        __fp16 lanes[8];
        vst1q_f16(lanes, v);
        for (int k = 0; k < 8; k++)
        {
            lanes[k] = atanf(lanes[k]);
        }
        return vld1q_f16(lanes);
    }
};

// Reciprocal (1 / x) functor for __fp16 data.
// The vector variants start from the reciprocal-estimate instruction and apply
// a single refinement step (vrecps * estimate); no second step is applied.
struct unary_op_reciprocal_fp16s
{
    __fp16 func(const __fp16& x) const
    {
        return (__fp16)1.f / x;
    }
    float16x4_t func_pack4(const float16x4_t& x) const
    {
        const float16x4_t _estimate = vrecpe_f16(x);
        const float16x4_t _step = vrecps_f16(x, _estimate);
        return vmul_f16(_step, _estimate);
    }
    float16x8_t func_pack8(const float16x8_t& x) const
    {
        const float16x8_t _estimate = vrecpeq_f16(x);
        const float16x8_t _step = vrecpsq_f16(x, _estimate);
        return vmulq_f16(_step, _estimate);
    }
};

// Hyperbolic-tangent functor for __fp16 data.
// The scalar path goes through tanhf(); both vector paths forward to the
// tanh_ps_f16 overload matching their lane count.
struct unary_op_tanh_fp16s
{
    __fp16 func(const __fp16& x) const
    {
        const float y = tanhf(x);
        return (__fp16)y;
    }
    float16x4_t func_pack4(const float16x4_t& x) const
    {
        const float16x4_t _y = tanh_ps_f16(x);
        return _y;
    }
    float16x8_t func_pack8(const float16x8_t& x) const
    {
        const float16x8_t _y = tanh_ps_f16(x);
        return _y;
    }
};

// Base-10 logarithm functor for __fp16 data.
// The vector paths compute log_ps_f16(x) and scale it by 0.434294481903
// (the literal is log10(e)), i.e. log10(x) = ln(x) * log10(e).
struct unary_op_log10_fp16s
{
    __fp16 func(const __fp16& x) const
    {
        const float y = log10f(x);
        return (__fp16)y;
    }
    float16x4_t func_pack4(const float16x4_t& x) const
    {
        const float16x4_t _ln = log_ps_f16(x);
        return vmul_f16(_ln, vdup_n_f16(0.434294481903));
    }
    float16x8_t func_pack8(const float16x8_t& x) const
    {
        const float16x8_t _ln = log_ps_f16(x);
        return vmulq_f16(_ln, vdupq_n_f16(0.434294481903));
    }
};

// Rounding functor for __fp16 data (round to nearest, ties to even).
// Scalar path: inline-asm add/subtract of a magic constant when GNU inline
// asm is available, otherwise nearbyintf() under FE_TONEAREST.
// Vector paths: the vrndn intrinsics.
struct unary_op_round_fp16s
{
    __fp16 func(const __fp16& x) const
    {
        // round to nearest even
#if NCNN_GNU_INLINE_ASM
        // return (x + 1536.f) - 1536.f;
        // NOTE(review): presumably 1536 = 1.5 * 2^10 is chosen so that the
        // half-precision sum has a spacing of 1 and the hardware rounding of
        // the add performs the integer rounding; this would only hold for
        // small |x| -- confirm the intended input range.
        __fp16 y;
        const __fp16 magic = 1536.f;
        // The add/sub pair is written as half-precision asm (%h registers);
        // presumably this keeps the compiler from folding it away or widening
        // it to float -- TODO confirm.
        asm volatile(
            "fadd   %h0, %h1, %h2  \n"
            "fsub   %h0, %h0, %h2  \n"
            : "=w"(y)
            : "w"(x), "w"(magic)
            :);
        return y;
#else
#ifdef FE_TONEAREST
        // force round-to-nearest for nearbyintf, remembering the caller's mode
        int old_rm = fegetround();
        fesetround(FE_TONEAREST);
#endif
        __fp16 y = (__fp16)nearbyintf(x);
#ifdef FE_TONEAREST
        // restore the rounding mode that was active on entry
        fesetround(old_rm);
#endif
        return y;
#endif
    }
    float16x4_t func_pack4(const float16x4_t& x) const
    {
        return vrndn_f16(x);
    }
    float16x8_t func_pack8(const float16x8_t& x) const
    {
        return vrndnq_f16(x);
    }
};

// Truncation functor for __fp16 data.
// The scalar path uses truncf(); the vector paths use the vrnd intrinsics.
struct unary_op_trunc_fp16s
{
    __fp16 func(const __fp16& x) const
    {
        const float y = truncf(x);
        return (__fp16)y;
    }
    float16x4_t func_pack4(const float16x4_t& x) const
    {
        const float16x4_t _y = vrnd_f16(x);
        return _y;
    }
    float16x8_t func_pack8(const float16x8_t& x) const
    {
        const float16x8_t _y = vrndq_f16(x);
        return _y;
    }
};

} // namespace UnaryOp_arm_functor

// Run the configured unary operation in place on an fp16 blob.
// op_type selects the functor handed to unary_op_inplace_fp16s; that helper's
// return value is passed through. An op_type with no matching case leaves the
// blob untouched and returns 0.
int UnaryOp_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
{
    using namespace UnaryOp_arm_functor;

    switch (op_type)
    {
    case Operation_ABS:
        return unary_op_inplace_fp16s<unary_op_abs_fp16s>(bottom_top_blob, opt);
    case Operation_NEG:
        return unary_op_inplace_fp16s<unary_op_neg_fp16s>(bottom_top_blob, opt);
    case Operation_FLOOR:
        return unary_op_inplace_fp16s<unary_op_floor_fp16s>(bottom_top_blob, opt);
    case Operation_CEIL:
        return unary_op_inplace_fp16s<unary_op_ceil_fp16s>(bottom_top_blob, opt);
    case Operation_SQUARE:
        return unary_op_inplace_fp16s<unary_op_square_fp16s>(bottom_top_blob, opt);
    case Operation_SQRT:
        return unary_op_inplace_fp16s<unary_op_sqrt_fp16s>(bottom_top_blob, opt);
    case Operation_RSQRT:
        return unary_op_inplace_fp16s<unary_op_rsqrt_fp16s>(bottom_top_blob, opt);
    case Operation_EXP:
        return unary_op_inplace_fp16s<unary_op_exp_fp16s>(bottom_top_blob, opt);
    case Operation_LOG:
        return unary_op_inplace_fp16s<unary_op_log_fp16s>(bottom_top_blob, opt);
    case Operation_SIN:
        return unary_op_inplace_fp16s<unary_op_sin_fp16s>(bottom_top_blob, opt);
    case Operation_COS:
        return unary_op_inplace_fp16s<unary_op_cos_fp16s>(bottom_top_blob, opt);
    case Operation_TAN:
        return unary_op_inplace_fp16s<unary_op_tan_fp16s>(bottom_top_blob, opt);
    case Operation_ASIN:
        return unary_op_inplace_fp16s<unary_op_asin_fp16s>(bottom_top_blob, opt);
    case Operation_ACOS:
        return unary_op_inplace_fp16s<unary_op_acos_fp16s>(bottom_top_blob, opt);
    case Operation_ATAN:
        return unary_op_inplace_fp16s<unary_op_atan_fp16s>(bottom_top_blob, opt);
    case Operation_RECIPROCAL:
        return unary_op_inplace_fp16s<unary_op_reciprocal_fp16s>(bottom_top_blob, opt);
    case Operation_TANH:
        return unary_op_inplace_fp16s<unary_op_tanh_fp16s>(bottom_top_blob, opt);
    case Operation_LOG10:
        return unary_op_inplace_fp16s<unary_op_log10_fp16s>(bottom_top_blob, opt);
    case Operation_ROUND:
        return unary_op_inplace_fp16s<unary_op_round_fp16s>(bottom_top_blob, opt);
    case Operation_TRUNC:
        return unary_op_inplace_fp16s<unary_op_trunc_fp16s>(bottom_top_blob, opt);
    default:
        break;
    }

    return 0;
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

} // namespace ncnn


================================================
FILE: src/layer/batchnorm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "batchnorm.h"

namespace ncnn {

// Construct the layer with both layer flags enabled: the layer implements
// forward_inplace (support_inplace) and sets one_blob_only.
BatchNorm::BatchNorm()
{
    support_inplace = true;
    one_blob_only = true;
}

// Read the layer parameters from the dictionary.
//   id 0 -> channels (default 0)
//   id 1 -> eps      (default 0.f)
// Always returns 0.
int BatchNorm::load_param(const ParamDict& pd)
{
    eps = pd.get(1, 0.f);
    channels = pd.get(0, 0);

    return 0;
}

// Load the four per-channel weight vectors (slope, mean, variance, bias, in
// that order) and fold them into the coefficients used by forward_inplace:
//   b = slope / sqrt(var + eps)
//   a = bias - slope * mean / sqrt(var + eps)
// Returns 0 on success, -100 when any load or allocation yields an empty Mat.
int BatchNorm::load_model(const ModelBin& mb)
{
    // the loads are sequential reads, so their order must not change
    slope_data = mb.load(channels, 1);
    if (slope_data.empty())
        return -100;

    mean_data = mb.load(channels, 1);
    if (mean_data.empty())
        return -100;

    var_data = mb.load(channels, 1);
    if (var_data.empty())
        return -100;

    bias_data = mb.load(channels, 1);
    if (bias_data.empty())
        return -100;

    a_data.create(channels);
    if (a_data.empty())
        return -100;
    b_data.create(channels);
    if (b_data.empty())
        return -100;

    for (int ch = 0; ch < channels; ch++)
    {
        float denom = sqrtf(var_data[ch] + eps);
        if (denom == 0.f)
        {
            // sanitize divide by zero
            denom = 0.0001f;
        }

        const float slope = slope_data[ch];
        a_data[ch] = bias_data[ch] - slope * mean_data[ch] / denom;
        b_data[ch] = slope / denom;
    }

    return 0;
}

// Apply the folded batch-norm transform in place: value = b * value + a.
// The coefficient index is the element index for 1-D blobs, the row index for
// 2-D blobs and the channel index for 3-D / 4-D blobs. Always returns 0.
int BatchNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    // a = bias - slope * mean / sqrt(var)
    // b = slope / sqrt(var)
    // value = b * value + a

    const int dims = bottom_top_blob.dims;

    if (dims == 1)
    {
        const int count = bottom_top_blob.w;
        float* data = bottom_top_blob;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int k = 0; k < count; k++)
        {
            data[k] = b_data[k] * data[k] + a_data[k];
        }
    }
    else if (dims == 2)
    {
        const int cols = bottom_top_blob.w;
        const int rows = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int r = 0; r < rows; r++)
        {
            float* line = bottom_top_blob.row(r);
            const float shift = a_data[r];
            const float scale = b_data[r];

            for (int k = 0; k < cols; k++)
            {
                line[k] = scale * line[k] + shift;
            }
        }
    }
    else if (dims == 3 || dims == 4)
    {
        const int nchannels = bottom_top_blob.c;
        const int plane = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < nchannels; q++)
        {
            float* data = bottom_top_blob.channel(q);
            const float shift = a_data[q];
            const float scale = b_data[q];

            for (int k = 0; k < plane; k++)
            {
                data[k] = scale * data[k] + shift;
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/batchnorm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_BATCHNORM_H
#define LAYER_BATCHNORM_H

#include "layer.h"

namespace ncnn {

// Batch normalization layer.
// load_model folds slope/mean/var/bias into per-channel coefficients a_data and
// b_data; forward_inplace then applies value = b * value + a.
class BatchNorm : public Layer
{
public:
    BatchNorm();

    // reads channels (id 0) and eps (id 1)
    virtual int load_param(const ParamDict& pd);

    // loads the four weight vectors and precomputes a_data / b_data
    virtual int load_model(const ModelBin& mb);

    // applies the per-channel affine transform in place
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
    // param
    int channels; // length of every weight vector
    float eps;    // added to the variance before the square root

    // model
    Mat slope_data;
    Mat mean_data;
    Mat var_data;
    Mat bias_data;

    // folded coefficients: a = bias - slope * mean / sqrt(var + eps)
    Mat a_data;
    // folded coefficients: b = slope / sqrt(var + eps)
    Mat b_data;
};

} // namespace ncnn

#endif // LAYER_BATCHNORM_H


================================================
FILE: src/layer/bias.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "bias.h"

namespace ncnn {

// Construct the layer with both layer flags enabled: the layer implements
// forward_inplace (support_inplace) and sets one_blob_only.
Bias::Bias()
{
    support_inplace = true;
    one_blob_only = true;
}

// Read bias_data_size from parameter id 0 (default 0). Always returns 0.
int Bias::load_param(const ParamDict& pd)
{
    const int count = pd.get(0, 0);
    bias_data_size = count;

    return 0;
}

// Load bias_data_size bias values from the model.
// Returns 0 on success, -100 when the loaded Mat is empty.
int Bias::load_model(const ModelBin& mb)
{
    bias_data = mb.load(bias_data_size, 1);

    return bias_data.empty() ? -100 : 0;
}

// Add the per-channel bias to every element of the blob, in place.
// Channel q uses bias_data[q]; each channel spans w * h * d elements.
// Always returns 0.
int Bias::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int nchannels = bottom_top_blob.c;
    const int plane = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < nchannels; q++)
    {
        float* data = bottom_top_blob.channel(q);
        const float offset = bias_data[q];

        for (int k = 0; k < plane; k++)
        {
            data[k] += offset;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/bias.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_BIAS_H
#define LAYER_BIAS_H

#include "layer.h"

namespace ncnn {

// Per-channel bias layer: adds bias_data[q] to every element of channel q.
class Bias : public Layer
{
public:
    Bias();

    // reads bias_data_size (id 0)
    virtual int load_param(const ParamDict& pd);

    // loads bias_data_size bias values
    virtual int load_model(const ModelBin& mb);

    // adds the channel's bias to each element in place
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
    // param
    int bias_data_size; // number of bias values to load

    // model
    Mat bias_data; // indexed by channel in forward_inplace
};

} // namespace ncnn

#endif // LAYER_BIAS_H


================================================
FILE: src/layer/binaryop.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "binaryop.h"

namespace ncnn {

// Construct the layer in its two-operand configuration: both flags start out
// false and are switched on by load_param when a scalar operand is configured.
BinaryOp::BinaryOp()
{
    support_inplace = false;
    one_blob_only = false;
}

// Read the layer parameters.
//   id 0 -> op_type     (default 0)
//   id 1 -> with_scalar (default 0)
//   id 2 -> b           (default 0.f)
// A non-zero with_scalar switches the layer to single-blob, in-place mode.
// Always returns 0.
int BinaryOp::load_param(const ParamDict& pd)
{
    op_type = pd.get(0, 0);
    with_scalar = pd.get(1, 0);
    b = pd.get(2, 0.f);

    const bool scalar_mode = with_scalar != 0;
    if (scalar_mode)
    {
        support_inplace = true;
        one_blob_only = true;
    }

    return 0;
}

// broadcasting rule
// https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting

// Element-wise c = Op(a, b) with broadcasting.
// The shape of c drives the iteration. An operand whose w is 1 is re-read for
// every x (pointer step 0); along h, d and c the operand index is clamped to
// the operand's last valid index, so a size-1 axis is reused for the whole
// output axis.
template<typename Op>
static void binary_op_broadcast(const Mat& a, const Mat& b, Mat& c, const Option& opt)
{
    // general broadcast
    const Op op;

    const int dims = c.dims;
    const int w = c.w;
    const int h = c.h;
    const int d = c.d;
    const int channels = c.c;

    // pointer steps along w: 0 keeps re-reading the single value
    const int a_step = a.w > 1 ? 1 : 0;
    const int b_step = b.w > 1 ? 1 : 0;

    if (dims == 1)
    {
        const float* pa = a;
        const float* pb = b;
        float* pc = c;

        for (int x = 0; x < w; x++)
        {
            pc[x] = op(*pa, *pb);
            pa += a_step;
            pb += b_step;
        }
    }
    else if (dims == 2)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            const float* pa = a.row(std::min(y, a.h - 1));
            const float* pb = b.row(std::min(y, b.h - 1));
            float* pc = c.row(y);

            for (int x = 0; x < w; x++)
            {
                pc[x] = op(*pa, *pb);
                pa += a_step;
                pb += b_step;
            }
        }
    }
    else if (dims == 3 || dims == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* pc = c.channel(q);

            for (int z = 0; z < d; z++)
            {
                for (int y = 0; y < h; y++)
                {
                    const float* pa = a.channel(std::min(q, a.c - 1)).depth(std::min(z, a.d - 1)).row(std::min(y, a.h - 1));
                    const float* pb = b.channel(std::min(q, b.c - 1)).depth(std::min(z, b.d - 1)).row(std::min(y, b.h - 1));

                    for (int x = 0; x < w; x++)
                    {
                        pc[x] = op(*pa, *pb);
                        pa += a_step;
                        pb += b_step;
                    }

                    // advance to the next output row
                    pc += w;
                }
            }
        }
    }
}

// Replace every element v of a with Op(v, b), channel by channel.
// Each channel spans w * h * d elements.
template<typename Op>
static void binary_op_scalar_inplace(Mat& a, float b, const Option& opt)
{
    const Op op;

    const int plane = a.w * a.h * a.d;
    const int nchannels = a.c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < nchannels; q++)
    {
        float* data = a.channel(q);

        for (int k = 0; k < plane; k++)
        {
            const float v = data[k];
            data[k] = op(v, b);
        }
    }
}

// Functor returning x + y.
struct binary_op_add
{
    float operator()(const float& x, const float& y) const
    {
        const float sum = x + y;
        return sum;
    }
};

// Functor returning x - y.
struct binary_op_sub
{
    float operator()(const float& x, const float& y) const
    {
        const float difference = x - y;
        return difference;
    }
};

// Functor returning x * y.
struct binary_op_mul
{
    float operator()(const float& x, const float& y) const
    {
        const float product = x * y;
        return product;
    }
};

// Functor returning x / y (plain float division, no zero check).
struct binary_op_div
{
    float operator()(const float& x, const float& y) const
    {
        const float quotient = x / y;
        return quotient;
    }
};

// Functor returning the larger of x and y.
struct binary_op_max
{
    float operator()(const float& x, const float& y) const
    {
        // same selection rule as std::max: keep x unless x < y
        return x < y ? y : x;
    }
};

// Functor returning the smaller of x and y.
struct binary_op_min
{
    float operator()(const float& x, const float& y) const
    {
        // same selection rule as std::min: keep x unless y < x
        return y < x ? y : x;
    }
};

// Functor returning x raised to the power y.
struct binary_op_pow
{
    float operator()(const float& x, const float& y) const
    {
        const float result = (float)powf(x, y);
        return result;
    }
};

// Functor returning y - x (subtraction with the operands reversed).
struct binary_op_rsub
{
    float operator()(const float& x, const float& y) const
    {
        const float difference = y - x;
        return difference;
    }
};

// Functor returning y / x (division with the operands reversed).
struct binary_op_rdiv
{
    float operator()(const float& x, const float& y) const
    {
        const float quotient = y / x;
        return quotient;
    }
};

// Functor returning y raised to the power x (pow with the operands reversed).
struct binary_op_rpow
{
    float operator()(const float& x, const float& y) const
    {
        const float result = (float)powf(y, x);
        return result;
    }
};

// Functor returning atan2f(x, y).
struct binary_op_atan2
{
    float operator()(const float& x, const float& y) const
    {
        const float angle = (float)atan2f(x, y);
        return angle;
    }
};

// Functor returning atan2f(y, x) (atan2 with the operands reversed).
struct binary_op_ratan2
{
    float operator()(const float& x, const float& y) const
    {
        const float angle = (float)atan2f(y, x);
        return angle;
    }
};

// Functor returning fmodf(x, y).
struct binary_op_fmod
{
    float operator()(const float& x, const float& y) const
    {
        const float rem = (float)fmodf(x, y);
        return rem;
    }
};

// Functor returning log(exp(x) + exp(y)), evaluated in the numerically stable
// form max + log1p(exp(min - max)).
struct binary_op_logaddexp
{
    float operator()(const float& x, const float& y) const
    {
        const float max_xy = std::max(x, y);
        const float min_xy = std::min(x, y);

        // Equal operands are at distance zero. Taking that shortcut matters when
        // both are the same infinity: min - max would be inf - inf = NaN and
        // poison the result, whereas the correct answer is that infinity.
        // For finite equal operands the subtraction yields 0 anyway, so nothing
        // else changes. NaN operands still fail the comparison and propagate.
        const float diff = (min_xy == max_xy) ? 0.f : min_xy - max_xy;

        return (float)(max_xy + log1pf(expf(diff)));
    }
};

// Functor returning floor(x / y).
struct binary_op_floor_divide
{
    float operator()(const float& x, const float& y) const
    {
        const float quotient = x / y;
        return (float)floorf(quotient);
    }
};

// Functor returning remainderf(x, y).
struct binary_op_remainder
{
    float operator()(const float& x, const float& y) const
    {
        const float rem = (float)remainderf(x, y);
        return rem;
    }
};

// Functor returning fmodf(y, x) (fmod with the operands reversed).
struct binary_op_rfmod
{
    float operator()(const float& x, const float& y) const
    {
        const float rem = (float)fmodf(y, x);
        return rem;
    }
};

// Functor returning floor(y / x) (floor division with the operands reversed).
struct binary_op_rfloor_divide
{
    float operator()(const float& x, const float& y) const
    {
        const float quotient = y / x;
        return (float)floorf(quotient);
    }
};

// Functor returning remainderf(y, x) (remainder with the operands reversed).
struct binary_op_rremainder
{
    float operator()(const float& x, const float& y) const
    {
        const float rem = (float)remainderf(y, x);
        return rem;
    }
};

static void binary_op_broadcast(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt)
{
    if (op_type == BinaryOp::Operation_ADD) return binary_op_broadcast<binary_op_add>(a, b, c, opt);
    if (op_type == BinaryOp::Operation_SUB) return binary_op_broadcast<binary_op_sub>(a, b, c, opt);
    if (op_type == BinaryOp::Operation_MUL) return binary_op_broadcast<binary_op_mul>(a, b, c, opt);
    if (op_type == BinaryOp::Operation_DIV) return binary_op_broadcast<binary_op_div>(a, b, c, opt);
    if (op_type == BinaryOp::Operation_MAX) return binary_op_broadcast<binary_op_max>(a, b, c, opt);
    if (op_type == BinaryOp::Operation_MIN) return binary_op_broadcast<binary_op_min>(a, b, c, opt);
    if (op_type == BinaryOp::Operation_POW) return binary_op_broadcast<binary_op_pow>(a, b, c, opt);
    if (op_type == BinaryOp::Operation_RSUB) return binary_op_broadcast<binary_op_sub>(b, a, c, opt);
    if (op_type == BinaryOp::Operation_RDIV) return binary_op_broadcast<binary_op_div>(b, a, c, opt);
    if (op_type == BinaryOp::Operation_RPOW) return binary_op_broadcast<binary_op_pow>(b, a, c, opt);
    if (op_type == BinaryOp::Operation_ATAN2) return binary_op_broadcast<binary_op_atan2>(a, b, c, opt);
    if (op_type == BinaryOp::Operation_RATAN2) return binary_op_broadcast<binary_op_atan2>(b, a, c, opt);
    if (op_type == BinaryOp::Operation_FMOD) return binary_op_broadcast<binary_op_fmod>(a, b, c, opt);
    if (op_type == BinaryOp::Operation_RFMOD) return binary_op_broadcast<binary_op_fmod>(b, a, c, opt);
    if (op_type == BinaryOp::Operation_LOGADDEXP) return binary_op_broadcast<binary_op_logaddexp>(a, b, c, opt);
    if (op_type == BinaryOp::Operation_FLOOR_DIVIDE) return binary_op_broadcast<binary_op_floor_divide>(a, b, c, opt);
    if (op_type == BinaryOp::Operation_RFLOOR_DIVIDE) return binary_op_broadcast<binary_op_floor_divide>(b, a, c, opt);
    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_broadcast<binary_op_remainder>(a, b, c, opt);
    if (op_type == BinaryOp::Operation_RREMAINDER) return binary_op_broadcast<binary_op_remainder>(b, a, c, opt);

    // should never reach here
}

// Run the blob-with-scalar kernel for the requested op_type.
// The blob element is always the functor's first argument, so the reversed
// operations use their dedicated binary_op_r* functors here.
// An unknown op_type leaves the blob untouched.
static void binary_op_scalar_inplace(Mat& bottom_top_blob, float b, int op_type, const Option& opt)
{
    switch (op_type)
    {
    case BinaryOp::Operation_ADD:
        return binary_op_scalar_inplace<binary_op_add>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_SUB:
        return binary_op_scalar_inplace<binary_op_sub>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_MUL:
        return binary_op_scalar_inplace<binary_op_mul>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_DIV:
        return binary_op_scalar_inplace<binary_op_div>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_MAX:
        return binary_op_scalar_inplace<binary_op_max>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_MIN:
        return binary_op_scalar_inplace<binary_op_min>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_POW:
        return binary_op_scalar_inplace<binary_op_pow>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_RSUB:
        return binary_op_scalar_inplace<binary_op_rsub>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_RDIV:
        return binary_op_scalar_inplace<binary_op_rdiv>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_RPOW:
        return binary_op_scalar_inplace<binary_op_rpow>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_ATAN2:
        return binary_op_scalar_inplace<binary_op_atan2>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_RATAN2:
        return binary_op_scalar_inplace<binary_op_ratan2>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_FMOD:
        return binary_op_scalar_inplace<binary_op_fmod>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_RFMOD:
        return binary_op_scalar_inplace<binary_op_rfmod>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_LOGADDEXP:
        return binary_op_scalar_inplace<binary_op_logaddexp>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_FLOOR_DIVIDE:
        return binary_op_scalar_inplace<binary_op_floor_divide>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_RFLOOR_DIVIDE:
        return binary_op_scalar_inplace<binary_op_rfloor_divide>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_REMAINDER:
        return binary_op_scalar_inplace<binary_op_remainder>(bottom_top_blob, b, opt);
    case BinaryOp::Operation_RREMAINDER:
        return binary_op_scalar_inplace<binary_op_rremainder>(bottom_top_blob, b, opt);
    default:
        // should never reach here
        break;
    }
}

// Return `m` reshaped to `outdims` dimensions when it has fewer, otherwise `m`
// itself. A 1-D operand lands on the outermost axis when its length equals the
// partner's extent there (h for 2-D, c for 3-D / 4-D) and on the w axis
// otherwise; 2-D and 3-D operands get size-1 inner axes prepended.
static Mat binary_op_expand_inner_axes(const Mat& m, const Mat& other, int outdims)
{
    if (m.dims >= outdims)
        return m;

    // expand inner axes
    if (outdims == 2)
    {
        // m.w == other.h, else assumed m.w == other.w
        return m.w == other.h ? m.reshape(1, m.w) : m.reshape(m.w, 1);
    }
    if (outdims == 3 && m.dims == 1)
    {
        // m.w == other.c, else assumed m.w == other.w
        return m.w == other.c ? m.reshape(1, 1, m.w) : m.reshape(m.w, 1, 1);
    }
    if (outdims == 3 && m.dims == 2)
        return m.reshape(1, m.w, m.h);
    if (outdims == 4 && m.dims == 1)
    {
        // m.w == other.c, else assumed m.w == other.w
        return m.w == other.c ? m.reshape(1, 1, 1, m.w) : m.reshape(m.w, 1, 1, 1);
    }
    if (outdims == 4 && m.dims == 2)
        return m.reshape(1, 1, m.w, m.h);
    if (outdims == 4 && m.dims == 3)
        return m.reshape(1, m.w, m.h, m.c);

    return m;
}

// Two-operand forward pass: top_blobs[0] = op(bottom_blobs[0], bottom_blobs[1])
// with broadcasting. The lower-rank operand is first brought to the common
// rank; the output extent on every axis is the larger of the two operands'.
// Returns 0 on success, -100 when the output blob is empty after allocation.
int BinaryOp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& A = bottom_blobs[0];
    const Mat& B = bottom_blobs[1];
    const int outdims = std::max(A.dims, B.dims);

    // each expansion is decided against the ORIGINAL partner shape
    Mat A2 = binary_op_expand_inner_axes(A, B, outdims);
    Mat B2 = binary_op_expand_inner_axes(B, A, outdims);

    const int outw = std::max(A2.w, B2.w);
    const int outh = std::max(A2.h, B2.h);
    const int outd = std::max(A2.d, B2.d);
    const int outc = std::max(A2.c, B2.c);

    Mat& top_blob = top_blobs[0];
    switch (outdims)
    {
    case 1:
        top_blob.create(outw, 4u, opt.blob_allocator);
        break;
    case 2:
        top_blob.create(outw, outh, 4u, opt.blob_allocator);
        break;
    case 3:
        top_blob.create(outw, outh, outc, 4u, opt.blob_allocator);
        break;
    case 4:
        top_blob.create(outw, outh, outd, outc, 4u, opt.blob_allocator);
        break;
    default:
        break;
    }
    if (top_blob.empty())
        return -100;

    binary_op_broadcast(A2, B2, top_blob, op_type, opt);

    return 0;
}

// Scalar mode forward pass: combine every element of the blob with the
// constant b using the configured op_type, in place. Always returns 0.
int BinaryOp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const float scalar = b;
    binary_op_scalar_inplace(bottom_top_blob, scalar, op_type, opt);

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/binaryop.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_BINARYOP_H
#define LAYER_BINARYOP_H

#include "layer.h"

namespace ncnn {

// Element-wise binary operation layer.
// Works either on two input blobs with broadcasting (forward) or on one blob
// and the scalar b, in place (forward_inplace); load_param selects the mode
// from with_scalar.
class BinaryOp : public Layer
{
public:
    BinaryOp();

    // reads op_type (id 0), with_scalar (id 1) and b (id 2)
    virtual int load_param(const ParamDict& pd);

    using Layer::forward;
    using Layer::forward_inplace;
    // two-blob mode: top_blobs[0] = op(bottom_blobs[0], bottom_blobs[1])
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

    // scalar mode: blob = op(blob, b)
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

    // Values accepted for op_type. The R-prefixed entries are the same
    // operation with the two operands swapped.
    enum OperationType
    {
        Operation_ADD = 0,
        Operation_SUB = 1,
        Operation_MUL = 2,
        Operation_DIV = 3,
        Operation_MAX = 4,
        Operation_MIN = 5,
        Operation_POW = 6,
        Operation_RSUB = 7,
        Operation_RDIV = 8,
        Operation_RPOW = 9,
        Operation_ATAN2 = 10,
        Operation_RATAN2 = 11,
        Operation_FMOD = 12,
        Operation_RFMOD = 13,
        Operation_LOGADDEXP = 14,
        Operation_FLOOR_DIVIDE = 15,
        Operation_RFLOOR_DIVIDE = 16,
        Operation_REMAINDER = 17,
        Operation_RREMAINDER = 18
    };

public:
    // param
    int op_type;     // one of OperationType
    int with_scalar; // non-zero enables the single-blob scalar mode
    float b;         // scalar operand used by forward_inplace
};

} // namespace ncnn

#endif // LAYER_BINARYOP_H


================================================
FILE: src/layer/bnll.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "bnll.h"

namespace ncnn {

// Construct the layer with both layer flags enabled: the layer implements
// forward_inplace (support_inplace) and sets one_blob_only.
BNLL::BNLL()
{
    support_inplace = true;
    one_blob_only = true;
}

// Apply BNLL, y = log(1 + exp(x)), to every element of the blob in place.
// Positive inputs use the equivalent form x + log(1 + exp(-x)) so that expf
// is only ever called with a non-positive argument. Always returns 0.
int BNLL::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    // A channel spans w * h * d elements. Leaving out d would process only part
    // of each channel of a 4-D blob; the other element-wise layers in this
    // directory (Bias, CELU, BinaryOp scalar mode) use the same w * h * d extent.
    int size = w * h * d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i = 0; i < size; i++)
        {
            if (ptr[i] > 0)
                ptr[i] = ptr[i] + logf(1.f + expf(-ptr[i]));
            else
                ptr[i] = logf(1.f + expf(ptr[i]));
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/bnll.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_BNLL_H
#define LAYER_BNLL_H

#include "layer.h"

namespace ncnn {

// BNLL activation layer: replaces each element x with log(1 + exp(x)).
// The layer has no parameters and no weights.
class BNLL : public Layer
{
public:
    BNLL();

    // applies the activation to the blob in place
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
};

} // namespace ncnn

#endif // LAYER_BNLL_H


================================================
FILE: src/layer/cast.cpp
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cast.h"

namespace ncnn {

// Construct the layer: one_blob_only and support_packing are enabled, and
// in-place operation is disabled (forward writes a separate output blob).
Cast::Cast()
{
    support_packing = true;
    support_inplace = false;
    one_blob_only = true;
}

// Read the source and destination element types.
//   id 0 -> type_from (default 0)
//   id 1 -> type_to   (default 0)
// Always returns 0.
int Cast::load_param(const ParamDict& pd)
{
    type_to = pd.get(1, 0);
    type_from = pd.get(0, 0);

    return 0;
}

// Convert a float to int8: round to nearest with halves going away from zero,
// then saturate to the range [-128, 127].
signed char float32_to_int8(float value)
{
    // shift by half a unit towards the value's sign; the cast below truncates
    const float shifted = (value >= 0.f) ? value + 0.5f : value - 0.5f;

    if (shifted > 127)
    {
        return 127;
    }
    if (shifted < -128)
    {
        return -128;
    }

    return static_cast<signed char>(shifted);
}

int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    if (type_from == type_to)
    {
        top_blob = bottom_blob;
        return 0;
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    size_t out_elemsize = elemsize;
    if (type_to == 1)
    {
        // float32
        out_elemsize = 4 * elempack;
    }
    else if (type_to == 2)
    {
        // float16
        out_elemsize = 2 * elempack;
    }
    else if (type_to == 3)
    {
        // int8
        out_elemsize = elempack;
    }
    else if (type_to == 4)
    {
        // bfloat16
        out_elemsize = 2 * elempack;
    }

    if (dims == 1)
    {
        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
    }
    else if (dims == 2)
    {
        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
    }
    else if (dims == 3)
    {
        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
    }
    else if (dims == 4)
    {
        top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    int size = w * h * d * elempack;

    if (type_from == 1 && type_to == 2)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            unsigned short* outptr = top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                outptr[i] = float32_to_float16(ptr[i]);
            }
        }
    }

    if (type_from == 2 && type_to == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const unsigned short* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                outptr[i] = float16_to_float32(ptr[i]);
            }
        }
    }

    if (type_from == 3 && type_to == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const signed char* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                outptr[i] = (float)ptr[i];
            }
        }
    }

    if (type_from == 1 && type_to == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            unsigned short* outptr = top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                outptr[i] = float32_to_bfloat16(ptr[i]);
            }
        }
    }

    if (type_from == 4 && type_to == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const unsigned short* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                outptr[i] = bfloat16_to_float32(ptr[i]);
            }
        }
    }

    // TODO more cast type

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/cast.h
================================================
// Copyright 2019 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CAST_H
#define LAYER_CAST_H

#include "layer.h"

namespace ncnn {

// Element type conversion layer: converts a blob from type_from to type_to.
class Cast : public Layer
{
public:
    Cast();

    // reads type_from (id 0) and type_to (id 1)
    virtual int load_param(const ParamDict& pd);

    // writes the converted copy of bottom_blob into top_blob
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
    // element type
    // 0 = auto
    // 1 = float32
    // 2 = float16
    // 3 = int8
    // 4 = bfloat16
    int type_from;
    int type_to;
};

} // namespace ncnn

#endif // LAYER_CAST_H


================================================
FILE: src/layer/celu.cpp
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "celu.h"

namespace ncnn {

// Construct the layer with both layer flags enabled: the layer implements
// forward_inplace (support_inplace) and sets one_blob_only.
CELU::CELU()
{
    support_inplace = true;
    one_blob_only = true;
}

// Reads alpha from param id 0, passing 1.f as the fallback value.
// Always returns 0.
int CELU::load_param(const ParamDict& pd)
{
    const float fallback_alpha = 1.f;
    alpha = pd.get(0, fallback_alpha);

    return 0;
}

// Applies CELU in place: every element x < 0 becomes (exp(x / alpha) - 1) * alpha,
// all other elements are left untouched. Always returns 0.
int CELU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;

    // number of elements stored in one channel
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i = 0; i < size; i++)
        {
            const float x = ptr[i];

            // only strictly negative inputs are transformed (NaN falls through unchanged)
            if (!(x < 0.f))
                continue;

            ptr[i] = (expf(x / alpha) - 1.f) * alpha;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/celu.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CELU_H
#define LAYER_CELU_H

#include "layer.h"

namespace ncnn {

// CELU activation layer operating in place on a single blob.
// Negative inputs x are mapped to (exp(x / alpha) - 1) * alpha; other values are kept.
class CELU : public Layer
{
public:
    CELU();

    // Reads alpha from param id 0 (1.f is passed as the fallback value).
    virtual int load_param(const ParamDict& pd);

    // Applies the activation to every element of bottom_top_blob; returns 0.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
    // scale of the negative branch, also used as the divisor inside the exponential
    float alpha;
};

} // namespace ncnn

#endif // LAYER_CELU_H


================================================
FILE: src/layer/clip.cpp
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "clip.h"

#include <float.h>

namespace ncnn {

// Enables the one_blob_only and support_inplace Layer flags for Clip.
Clip::Clip()
{
    support_inplace = true;
    one_blob_only = true;
}

// Reads the clipping bounds: min from param id 0 and max from param id 1.
// -FLT_MAX / FLT_MAX are passed as the respective fallback values.
// Always returns 0.
int Clip::load_param(const ParamDict& pd)
{
    const float lowest = -FLT_MAX;
    const float highest = FLT_MAX;

    max = pd.get(1, highest);
    min = pd.get(0, lowest);

    return 0;
}

// Clamps every element of the blob in place: values below min are raised to min,
// then values above max are lowered to max. Always returns 0.
int Clip::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;

    // number of elements stored in one channel
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i = 0; i < size; i++)
        {
            float v = ptr[i];

            // lower bound first, then upper bound, same order as the comparisons were always made
            if (v < min)
                v = min;
            if (v > max)
                v = max;

            ptr[i] = v;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/clip.h
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CLIP_H
#define LAYER_CLIP_H

#include "layer.h"

namespace ncnn {

// Clip layer: clamps every element of a single blob, in place, to the [min, max] range.
class Clip : public Layer
{
public:
    Clip();

    // Reads min from param id 0 and max from param id 1
    // (-FLT_MAX and FLT_MAX are passed as the fallback values).
    virtual int load_param(const ParamDict& pd);

    // Clamps each element of bottom_top_blob; returns 0.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
    float min; // lower clamp bound
    float max; // upper clamp bound
};

} // namespace ncnn

#endif // LAYER_CLIP_H


================================================
FILE: src/layer/concat.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "concat.h"

namespace ncnn {

// Clears the one_blob_only and support_inplace Layer flags for Concat.
Concat::Concat()
{
    support_inplace = false;
    one_blob_only = false;
}

// Reads the concatenation axis from param id 0, passing 0 as the fallback value.
// Always returns 0.
int Concat::load_param(const ParamDict& pd)
{
    const int fallback_axis = 0;
    axis = pd.get(0, fallback_axis);

    return 0;
}

int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    int dims = bottom_blobs[0].dims;
    size_t elemsize = bottom_blobs[0].elemsize;
    int positive_axis = axis < 0 ? dims + axis : axis;

    if (dims == 1) // positive_axis == 0
    {
        // concat vector
        // total length
        int top_w = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        unsigned char* outptr = top_blob;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            int w = bottom_blob.w;

            const unsigned char* ptr = bottom_blob;
            memcpy(outptr, ptr, w * elemsize);

            outptr += w * elemsize;
        }
    }

    if (dims == 2 && positive_axis == 0)
    {
        // concat image
        int w = bottom_blobs[0].w;

        // total height
        int top_h = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_h += bottom_blob.h;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        unsigned char* outptr = top_blob;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            int size = w * bottom_blob.h;

            const unsigned char* ptr = bottom_blob;
            memcpy(outptr, ptr, size * elemsize);

            outptr += size * elemsize;
        }
    }

    if (dims == 2 && positive_axis == 1)
    {
        // interleave image row
        int h = bottom_blobs[0].h;

        // total width
        int top_w = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            unsigned char* outptr = top_blob.row<unsigned char>(i);
            for (size_t b = 0; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob = bottom_blobs[b];

                const unsigned char* ptr = bottom_blob.row<const unsigned char>(i);
                memcpy(outptr, ptr, bottom_blob.w * elemsize);

                outptr += bottom_blob.w * elemsize;
            }
        }
    }

    if ((dims == 3 || dims == 4) && positive_axis == 0)
    {
        // concat dim
        int w = bottom_blobs[0].w;
        int h = bottom_blobs[0].h;
        int d = bottom_blobs[0].d;

        // total channels
        int top_channels = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_channels += bottom_blob.c;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, h, d, top_channels, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        top_blob.dims = dims;

        int q = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            int channels = bottom_blob.c;
            size_t size = bottom_blob.cstep * channels;

            const unsigned char* ptr = bottom_blob;
            unsigned char* outptr = top_blob.channel(q);
            memcpy(outptr, ptr, size * elemsize);

            q += channels;
        }
    }

    if ((dims == 3 && positive_axis == 1) || (dims == 4 && positive_axis == 2))
    {
        // interleave dim height
        int w = bottom_blobs[0].w;
        int d = bottom_blobs[0].d;
        int channels = bottom_blobs[0].c;

        // total height
        int top_h = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_h += bottom_blob.h;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h, d, channels, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        top_blob.dims = dims;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned char* outptr = top_blob.channel(q);

            for (int i = 0; i < d; i++)
            {
                for (size_t b = 0; b < bottom_blobs.size(); b++)
                {
                    const Mat& bottom_blob = bottom_blobs[b];

                    int size = bottom_blob.w * bottom_blob.h;

                    const unsigned char* ptr = bottom_blob.channel(q).depth(i);
                    memcpy(outptr, ptr, size * elemsize);

                    outptr += size * elemsize;
                }
            }
        }
    }

    if ((dims == 3 && positive_axis == 2) || (dims == 4 && positive_axis == 3))
    {
        // interleave dim width
        int h = bottom_blobs[0].h;
        int d = bottom_blobs[0].d;
        int channels = bottom_blobs[0].c;

        // total width
        int top_w = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, d, channels, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        top_blob.dims = dims;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned char* outptr = top_blob.channel(q);

            for (int i = 0; i < d; i++)
            {
                for (int j = 0; j < h; j++)
                {
                    for (size_t b = 0; b < bottom_blobs.size(); b++)
                    {
                        const Mat& bottom_blob = bottom_blobs[b];

                        const unsigned char* ptr = bottom_blob.channel(q).depth(i).row<const unsigned char>(j);
                        memcpy(outptr, ptr, bottom_blob.w * elemsize);

                        outptr += bottom_blob.w * elemsize;
                    }
                }
            }
        }
    }

    if (dims == 4 && positive_axis == 1)
    {
        // interleave dim depth
        int w = bottom_blobs[0].w;
        int h = bottom_blobs[0].h;
        int channels = bottom_blobs[0].c;

        // total depth
        int top_d = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_d += bottom_blob.d;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, h, top_d, channels, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            unsigned char* outptr = top_blob.channel(q);

            for (size_t b = 0; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob = bottom_blobs[b];

                int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;

                const unsigned char* ptr = bottom_blob.channel(q);
                memcpy(outptr, ptr, size * elemsize);

                outptr += size * elemsize;
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/concat.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONCAT_H
#define LAYER_CONCAT_H

#include "layer.h"

namespace ncnn {

// Concat layer: joins several input blobs along one axis into a single output blob.
class Concat : public Layer
{
public:
    Concat();

    // Reads axis from param id 0 (0 is passed as the fallback value).
    virtual int load_param(const ParamDict& pd);

    // Concatenates bottom_blobs into top_blobs[0].
    // Returns -100 when the output cannot be allocated, 0 otherwise.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

public:
    // concatenation axis; a negative value is interpreted as dims + axis
    int axis;
};

} // namespace ncnn

#endif // LAYER_CONCAT_H


================================================
FILE: src/layer/convolution.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "convolution.h"

#include "layer_type.h"

#include "fused_activation.h"

namespace ncnn {

// Sets the default Layer flags: one_blob_only enabled, support_inplace disabled.
// load_param may clear one_blob_only again when dynamic weights are requested.
Convolution::Convolution()
{
    support_inplace = false;
    one_blob_only = true;
}

// Reads all convolution parameters from pd.
//
// The *_h values are queried with their *_w counterpart as the fallback value,
// pad_right and pad_top with pad_left, and pad_bottom with pad_top.
// Returns -1 when int8 scales are requested but ncnn was built without
// NCNN_INT8, 0 otherwise.
int Convolution::load_param(const ParamDict& pd)
{
    // output size and weight description
    num_output = pd.get(0, 0);
    weight_data_size = pd.get(6, 0);
    bias_term = pd.get(5, 0);

    // kernel geometry
    kernel_w = pd.get(1, 0);
    kernel_h = pd.get(11, kernel_w);

    dilation_w = pd.get(2, 1);
    dilation_h = pd.get(12, dilation_w);

    stride_w = pd.get(3, 1);
    stride_h = pd.get(13, stride_w);

    // padding
    pad_left = pd.get(4, 0);
    pad_right = pd.get(15, pad_left);
    pad_top = pd.get(14, pad_left);
    pad_bottom = pd.get(16, pad_top);
    pad_value = pd.get(18, 0.f);

    // fused activation
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());

    // with dynamic weights the layer takes more than one bottom blob
    dynamic_weight = pd.get(19, 0);
    if (dynamic_weight)
        one_blob_only = false;

    int8_scale_term = pd.get(8, 0);
    if (!int8_scale_term)
        return 0;

#if NCNN_INT8
    support_int8_storage = true;
    return 0;
#else
    NCNN_LOGE("please build ncnn with NCNN_INT8 enabled for int8 inference");
    return -1;
#endif
}

// Loads weights, bias and (when int8_scale_term is set) the int8 scales from mb.
//
// Nothing is loaded when dynamic_weight is set.
// When the loaded weights are 4-byte elements and int8 scales are present, the
// weights are quantized to int8 here, once, at load time.
//
// Returns 0 on success, -100 when any required blob fails to load or the
// quantized weights cannot be produced, and -1 when num_output or the kernel
// size is not positive on the runtime-quantize path.
int Convolution::load_model(const ModelBin& mb)
{
    if (dynamic_weight)
        return 0;

    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    if (bias_term)
    {
        bias_data = mb.load(num_output, 1);
        if (bias_data.empty())
            return -100;
    }

#if NCNN_INT8
    if (int8_scale_term)
    {
        // the scales are indexed unconditionally by forward_int8 and by the
        // runtime quantization below, so a failed load must not go unnoticed
        weight_data_int8_scales = mb.load(num_output, 1);
        if (weight_data_int8_scales.empty())
            return -100;

        bottom_blob_int8_scales = mb.load(1, 1);
        if (bottom_blob_int8_scales.empty())
            return -100;
    }

    if (int8_scale_term > 100)
    {
        // output scale, read by forward_int8 when it requantizes
        top_blob_int8_scales = mb.load(1, 1);
        if (top_blob_int8_scales.empty())
            return -100;
    }

    // runtime quantize the weight data
    if (weight_data.elemsize == (size_t)4u && int8_scale_term)
    {
        const int maxk = kernel_w * kernel_h;

        // guard the divisions below against a malformed param file
        if (num_output <= 0 || maxk <= 0)
            return -1;

        const int num_input = weight_data_size / num_output / maxk;

        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

        Mat weight_data_int8;

        Option opt_q;
        opt_q.num_threads = 1;
        opt_q.blob_allocator = weight_data.allocator;
        opt_q.use_packing_layout = false;
        quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q);
        if (weight_data_int8.empty())
            return -100;

        weight_data = weight_data_int8.reshape(weight_data_size);
    }
#endif // NCNN_INT8

    return 0;
}

// Reference float convolution.
//
// bottom_blob must already be padded and top_blob already allocated; the
// output size and channel count are taken from top_blob. weight_data is read
// as a flat float array with maxk * inch values per output channel.
// bias_data may be empty, in which case accumulation starts at 0.
// Always returns 0.
static int convolution(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int w = bottom_blob.w;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const bool has_bias = !bias_data.empty();

    const int maxk = kernel_w * kernel_h;

    // element offset of every kernel tap relative to the top-left tap,
    // measured inside one input channel
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    for (int ky = 0; ky < kernel_h; ky++)
    {
        for (int kx = 0; kx < kernel_w; kx++)
        {
            space_ofs[ky * kernel_w + kx] = ky * w * dilation_h + kx * dilation_w;
        }
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        float* outptr = top_blob.channel(p);

        // weights of output channel p, laid out as inch runs of maxk taps
        const float* kptr0 = (const float*)weight_data + maxk * inch * p;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = has_bias ? bias_data[p] : 0.f;

                for (int q = 0; q < inch; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const float* sptr = m.row(i * stride_h) + j * stride_w;
                    const float* kptr = kptr0 + q * maxk;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val = sptr[space_ofs[k]];
                        float wt = kptr[k];
                        sum += val * wt;
                    }
                }

                outptr[i * outw + j] = activation_ss(sum, activation_type, activation_params);
            }
        }
    }

    return 0;
}

int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8(bottom_blob, top_blob, opt);
    }
#endif

    // flattened blob, implement as InnerProduct
    if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
    {
        int num_input = weight_data_size / num_output;
        if (bottom_blob.w * bottom_blob.elempack == num_input)
        {
            // call InnerProduct
            ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::InnerProduct);

            // set param
            ncnn::ParamDict pd;
            pd.set(0, num_output);
            pd.set(1, bias_term);
            pd.set(2, weight_data_size);
            pd.set(8, int8_scale_term);
            pd.set(9, activation_type);
            pd.set(10, activation_params);

            op->load_param(pd);

            // set weights
            ncnn::Mat weights[4];
            weights[0] = weight_data;
            weights[1] = bias_data;

#if NCNN_INT8
            if (int8_scale_term)
            {
                weights[2] = weight_data_int8_scales;
                weights[3] = bottom_blob_int8_scales;
            }
#endif

            op->load_model(ModelBinFromMatArray(weights));

            op->create_pipeline(opt);

            // forward
            int ret = op->forward(bottom_blob, top_blob, opt);

            op->destroy_pipeline(opt);

            delete op;

            return ret;
        }
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    const int w = bottom_blob_bordered.w;
    const int h = bottom_blob_bordered.h;
    const size_t elemsize = bottom_blob_bordered.elemsize;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int outw = (w - kernel_extent_w) / stride_w + 1;
    const int outh = (h - kernel_extent_h) / stride_h + 1;

    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    int ret = convolution(bottom_blob_bordered, top_blob, weight_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, dilation_w, dilation_h, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    return 0;
}

int Convolution::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.c;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, _kernel_w, _kernel_h, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    const int w = bottom_blob_bordered.w;
    const int h = bottom_blob_bordered.h;
    const size_t elemsize = bottom_blob_bordered.elemsize;

    const int kernel_extent_w = dilation_w * (_kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (_kernel_h - 1) + 1;

    const int outw = (w - kernel_extent_w) / stride_w + 1;
    const int outh = (h - kernel_extent_h) / stride_h + 1;

    top_blob.create(outw, outh, _num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    int ret = convolution(bottom_blob_bordered, top_blob, weight_data_flattened, bias_data_flattened, _kernel_w, _kernel_h, stride_w, stride_h, dilation_w, dilation_h, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    return 0;
}

// Pads bottom_blob into bottom_blob_bordered using this layer's own
// kernel_w / kernel_h; forwards to the overload taking explicit kernel sizes.
void Convolution::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const
{
    make_padding(bottom_blob, bottom_blob_bordered, kernel_w, kernel_h, opt);
}

void Convolution::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, int _kernel_w, int _kernel_h, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;

    const int kernel_extent_w = dilation_w * (_kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (_kernel_h - 1) + 1;

    bottom_blob_bordered = bottom_blob;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
    {
        Option opt_b = opt;
        opt_b.blob_allocator = opt.workspace_allocator;
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt_b);
    }
    else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
    {
        // tensorflow padding=SAME or onnx padding=SAME_UPPER
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
        }
    }
    else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
    {
        // onnx padding=SAME_LOWER
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad - hpad / 2, hpad / 2, wpad - wpad / 2, wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
        }
    }
}

#if NCNN_INT8
// Rounds v to the nearest integer and saturates it to the symmetric int8
// range [-127, 127].
//
// Saturation happens in the float domain before the integer cast, because
// converting a float that does not fit in an int (or a NaN) is undefined
// behavior and can yield the wrong sign for large inputs. NaN maps to 0.
static inline signed char float2int8(float v)
{
    // NaN compares unequal to itself
    if (v != v) return 0;

    if (v >= 127.f) return 127;
    if (v <= -127.f) return -127;

    // here -127 < v < 127, so the rounded value always fits
    int int32 = static_cast<int>(round(v));
    return (signed char)int32;
}

int Convolution::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    //     NCNN_LOGE("Convolution input %d x %d  ksize=%d %d  stride=%d %d", w, h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_unbordered = bottom_blob;
    if (elemsize != 1)
    {
        Option opt_g = opt;
        opt_g.blob_allocator = opt.workspace_allocator;

        quantize_to_int8(bottom_blob, bottom_blob_unbordered, bottom_blob_int8_scales, opt_g);
        if (bottom_blob_unbordered.empty())
            return -100;
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob_unbordered, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // int8
    bool use_int8_requantize = int8_scale_term > 100;
    size_t out_elemsize = use_int8_requantize ? 1u : 4u;

    top_blob.create(outw, outh, num_output, out_elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < num_output; p++)
    {
        signed char* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                int sum = 0;

                const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        int val = sptr[space_ofs[k]];
                        int wt = kptr[k];
                        sum += val * wt;
                    }

                    kptr += maxk;
                }

                float scale_in;
                if (weight_data_int8_scales[p] == 0)
                    scale_in = 0;
                else
                    scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

                float sumfp32 = sum * scale_in;

                if (bias_term)
                    sumfp32 += bias_data[p];

                sumfp32 = activation_ss(sumfp32, activation_type, activation_params);

                if (use_int8_requantize)
                {
                    // requantize
                    float scale_out = top_blob_int8_scales[0];
                    signed char sums8 = float2int8(sumfp32 * scale_out);
                    outptr[0] = sums8;
                    outptr += 1;
                }
                else
                {
                    // dequantize
                    ((float*)outptr)[0] = sumfp32;
                    outptr += 4;
                }
            }
        }
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn


================================================
FILE: src/layer/convolution.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONVOLUTION_H
#define LAYER_CONVOLUTION_H

#include "layer.h"

namespace ncnn {

// 2-D convolution layer (reference implementation).
// Supports fixed weights loaded from the model, dynamic weights supplied as
// extra input blobs, an optional fused activation and, when built with
// NCNN_INT8, int8 inference.
class Convolution : public Layer
{
public:
    Convolution();

    // Reads all parameters from pd; returns -1 if int8 scales are requested
    // but ncnn was built without NCNN_INT8.
    virtual int load_param(const ParamDict& pd);

    // Loads weights, bias and int8 scales; does nothing when dynamic_weight is set.
    virtual int load_model(const ModelBin& mb);

    // Single-input forward pass (weights come from load_model).
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Multi-input forward pass: bottom_blobs[1] holds the weights and,
    // when bias_term is set, bottom_blobs[2] holds the bias.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // Pads bottom_blob according to the pad_* members; the first overload uses
    // the layer's own kernel size, the second one the kernel size passed in.
    void make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const;
    void make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, int kernel_w, int kernel_h, const Option& opt) const;

#if NCNN_INT8
    // int8 forward pass, used when int8 inference is enabled and the weights are 1-byte elements.
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif

public:
    // param
    int num_output; // param id 0
    int kernel_w;   // param id 1
    int kernel_h;   // param id 11, queried with kernel_w as the fallback value
    int dilation_w; // param id 2
    int dilation_h; // param id 12, queried with dilation_w as the fallback value
    int stride_w;   // param id 3
    int stride_h;   // param id 13, queried with stride_w as the fallback value
    int pad_left; // -233=SAME_UPPER -234=SAME_LOWER
    int pad_right;  // param id 15, queried with pad_left as the fallback value
    int pad_top;    // param id 14, queried with pad_left as the fallback value
    int pad_bottom; // param id 16, queried with pad_top as the fallback value
    float pad_value; // param id 18, constant used to fill the border
    int bias_term;   // param id 5, non-zero when a bias is present

    // param id 6, number of weight elements loaded from the model
    int weight_data_size;

    // param id 8; non-zero enables int8 scales, values above 100 also load
    // top_blob_int8_scales and make forward_int8 emit int8 output
    int int8_scale_term;

    // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid
    int activation_type;
    Mat activation_params;

    // param id 19; non-zero means weights (and bias) arrive as input blobs
    int dynamic_weight;

    // model
    Mat weight_data;
    Mat bias_data;

#if NCNN_INT8
    Mat weight_data_int8_scales; // one scale per output channel
    Mat bottom_blob_int8_scales; // input scale, element 0 is used
    Mat top_blob_int8_scales;    // output scale, element 0 is used when requantizing
#endif
};

} // namespace ncnn

#endif // LAYER_CONVOLUTION_H


================================================
FILE: src/layer/convolution1d.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "convolution1d.h"

#include "fused_activation.h"

namespace ncnn {

// Sets the default Layer flags: one_blob_only enabled, support_inplace disabled.
// load_param may clear one_blob_only again when dynamic weights are requested.
Convolution1D::Convolution1D()
{
    support_inplace = false;
    one_blob_only = true;
}

// Reads all 1-D convolution parameters from pd.
// pad_right is queried with pad_left as the fallback value. Always returns 0.
int Convolution1D::load_param(const ParamDict& pd)
{
    // output size and weight description
    num_output = pd.get(0, 0);
    weight_data_size = pd.get(6, 0);
    bias_term = pd.get(5, 0);

    // kernel geometry
    kernel_w = pd.get(1, 0);
    dilation_w = pd.get(2, 1);
    stride_w = pd.get(3, 1);

    // padding
    pad_left = pd.get(4, 0);
    pad_right = pd.get(15, pad_left);
    pad_value = pd.get(18, 0.f);

    // fused activation
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());

    // with dynamic weights the layer takes more than one bottom blob
    dynamic_weight = pd.get(19, 0);
    if (dynamic_weight)
        one_blob_only = false;

    return 0;
}

// Loads the weights and, when bias_term is set, the bias from mb.
// Nothing is loaded when dynamic_weight is set.
// Returns -100 when a required blob comes back empty, 0 otherwise.
int Convolution1D::load_model(const ModelBin& mb)
{
    if (!dynamic_weight)
    {
        weight_data = mb.load(weight_data_size, 0);
        if (weight_data.empty())
            return -100;

        if (bias_term)
        {
            bias_data = mb.load(num_output, 1);
            if (bias_data.empty())
                return -100;
        }
    }

    return 0;
}

// Reference float 1-D convolution.
//
// Rows of bottom_blob are the input channels and rows of top_blob the output
// channels. bottom_blob must already be padded and top_blob already allocated.
// weight_data is read as a flat float array with kernel_w * (input rows)
// values per output row. bias_data may be empty, in which case accumulation
// starts at 0. Always returns 0.
static int convolution1d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int stride_w, int dilation_w, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int h = bottom_blob.h;

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    const bool has_bias = !bias_data.empty();

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outh; p++)
    {
        float* outptr = top_blob.row(p);

        for (int j = 0; j < outw; j++)
        {
            float sum = has_bias ? bias_data[p] : 0.f;

            // weights of output row p, laid out as h runs of kernel_w taps
            const float* kptr = (const float*)weight_data + kernel_w * h * p;

            for (int q = 0; q < h; q++)
            {
                const float* sptr = bottom_blob.row(q) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    // consecutive taps are dilation_w input elements apart
                    float val = sptr[k * dilation_w];
                    float wt = kptr[k];
                    sum += val * wt;
                }

                kptr += kernel_w;
            }

            outptr[j] = activation_ss(sum, activation_type, activation_params);
        }
    }

    return 0;
}

int Convolution1D::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    const int w = bottom_blob_bordered.w;
    const size_t elemsize = bottom_blob_bordered.elemsize;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;

    const int outw = (w - kernel_extent_w) / stride_w + 1;

    top_blob.create(outw, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    int ret = convolution1d(bottom_blob_bordered, top_blob, weight_data, bias_data, kernel_w, stride_w, dilation_w, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    return 0;
}

int Convolution1D::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _kernel_w = _weight_data.w;
    const int _num_output = _weight_data.c;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, _kernel_w, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    const int w = bottom_blob_bordered.w;
    const size_t elemsize = bottom_blob_bordered.elemsize;

    const int kernel_extent_w = dilation_w * (_kernel_w - 1) + 1;

    const int outw = (w - kernel_extent_w) / stride_w + 1;

    top_blob.create(outw, _num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    int ret = convolution1d(bottom_blob_bordered, top_blob, weight_data_flattened, bias_data_flattened, _kernel_w, stride_w, dilation_w, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    return 0;
}

void Convolution1D::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const
{
    make_padding(bottom_blob, bottom_blob_bordered, kernel_w, opt);
}

// Produces the padded input for a kernel of width _kernel_w.
//
// bottom_blob_bordered is first set to bottom_blob; it is replaced by a
// bordered copy (allocated from opt.workspace_allocator, filled with
// pad_value) in these cases:
//   - pad_left > 0 or pad_right > 0: explicit padding
//   - both pads == -233: tensorflow SAME / onnx SAME_UPPER
//   - both pads == -234: onnx SAME_LOWER
// For the SAME modes a border is only added when the computed total is positive.
void Convolution1D::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, int _kernel_w, const Option& opt) const
{
    bottom_blob_bordered = bottom_blob;

    int border_left = 0;
    int border_right = 0;
    bool need_border = false;

    if (pad_left > 0 || pad_right > 0)
    {
        border_left = pad_left;
        border_right = pad_right;
        need_border = true;
    }
    else
    {
        const bool same_upper = pad_left == -233 && pad_right == -233;
        const bool same_lower = pad_left == -234 && pad_right == -234;

        if (same_upper || same_lower)
        {
            const int w = bottom_blob.w;
            const int kernel_extent_w = dilation_w * (_kernel_w - 1) + 1;

            const int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
            if (wpad > 0)
            {
                const int smaller_half = wpad / 2;
                const int larger_half = wpad - smaller_half;

                // SAME_UPPER puts the odd sample on the right, SAME_LOWER on the left
                border_left = same_upper ? smaller_half : larger_half;
                border_right = same_upper ? larger_half : smaller_half;
                need_border = true;
            }
        }
    }

    if (!need_border)
        return;

    Option opt_b = opt;
    opt_b.blob_allocator = opt.workspace_allocator;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, 0, border_left, border_right, BORDER_CONSTANT, pad_value, opt_b);
}

} // namespace ncnn


================================================
FILE: src/layer/convolution1d.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONVOLUTION1D_H
#define LAYER_CONVOLUTION1D_H

#include "layer.h"

namespace ncnn {

// 1D convolution layer: holds the layer parameters and model data and exposes
// a single-blob and a multi-blob forward entry point.
class Convolution1D : public Layer
{
public:
    Convolution1D();

    virtual int load_param(const ParamDict& pd);

    virtual int load_model(const ModelBin& mb);

    // forward pass using the weight_data / bias_data members
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // forward pass taking the weight from bottom_blobs[1] and, when bias_term
    // is set, the bias from bottom_blobs[2]
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // pads the input for the layer's own kernel_w
    void make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const;
    // pads the input for an explicitly supplied kernel width
    void make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, int kernel_w, const Option& opt) const;

public:
    // param
    int num_output;
    int kernel_w;
    int dilation_w;
    int stride_w;
    int pad_left; // -233=SAME_UPPER -234=SAME_LOWER
    int pad_right;
    float pad_value; // fill value used for the constant border
    int bias_term;   // non-zero when a bias is used

    int weight_data_size;

    // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid
    int activation_type;
    Mat activation_params;

    // NOTE(review): presumably selects the multi-blob forward with runtime
    // weights; the code that reads this flag is not visible here - confirm
    int dynamic_weight;

    // model
    Mat weight_data;
    Mat bias_data;
};

} // namespace ncnn

#endif // LAYER_CONVOLUTION1D_H


================================================
FILE: src/layer/convolution3d.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "convolution3d.h"

#include "fused_activation.h"

namespace ncnn {

// The layer takes exactly one input blob and writes to a separate output blob.
Convolution3D::Convolution3D()
{
    support_inplace = false;
    one_blob_only = true;
}

// Reads the layer parameters from the param dictionary. Per-axis values that
// are not present fall back to the value of a previously read axis (see the
// default argument of each get). Always returns 0.
int Convolution3D::load_param(const ParamDict& pd)
{
    // sizes of the output and of the stored weights
    num_output = pd.get(0, 0);
    bias_term = pd.get(5, 0);
    weight_data_size = pd.get(6, 0);

    // kernel size: h and d default to w
    kernel_w = pd.get(1, 0);
    kernel_h = pd.get(11, kernel_w);
    kernel_d = pd.get(21, kernel_w);

    // dilation: h and d default to w
    dilation_w = pd.get(2, 1);
    dilation_h = pd.get(12, dilation_w);
    dilation_d = pd.get(22, dilation_w);

    // stride: h and d default to w
    stride_w = pd.get(3, 1);
    stride_h = pd.get(13, stride_w);
    stride_d = pd.get(23, stride_w);

    // padding: each side defaults to its counterpart, everything roots at pad_left
    pad_left = pd.get(4, 0);
    pad_right = pd.get(15, pad_left);
    pad_top = pd.get(14, pad_left);
    pad_bottom = pd.get(16, pad_top);
    pad_front = pd.get(24, pad_left);
    pad_behind = pd.get(17, pad_front);
    pad_value = pd.get(18, 0.f);

    // fused activation
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());

    return 0;
}

// Loads weight_data_size weights and, when bias_term is set, num_output bias
// values. Returns -100 if either load yields an empty Mat, 0 otherwise.
int Convolution3D::load_model(const ModelBin& mb)
{
    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    if (!bias_term)
        return 0;

    bias_data = mb.load(num_output, 1);
    return bias_data.empty() ? -100 : 0;
}

int Convolution3D::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    const int kernel_extend_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extend_h = dilation_h * (kernel_h - 1) + 1;
    const int kernel_extend_d = dilation_d * (kernel_d - 1) + 1;

    Mat bottom_blob_bordered;
    Option opt_pad = opt;
    opt_pad.use_packing_layout = false;
    make_padding(bottom_blob, bottom_blob_bordered, opt_pad);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;
    d = bottom_blob_bordered.d;

    int outw = (w - kernel_extend_w) / stride_w + 1;
    int outh = (h - kernel_extend_h) / stride_h + 1;
    int outd = (d - kernel_extend_d) / stride_d + 1;

    const int maxk = kernel_w * kernel_h * kernel_d;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap0 = w * dilation_h - kernel_w * dilation_w;
        int gap1 = h * w * dilation_d - w * kernel_h * dilation_h;
        for (int z = 0; z < kernel_d; z++)
        {
            for (int i = 0; i < kernel_h; i++)
            {
                for (int j = 0; j < kernel_w; j++)
                {
                    space_ofs[p1] = p2;
                    p1++;
                    p2 += dilation_w;
                }
                p2 += gap0;
            }
            p2 += gap1;
        }
    }

    top_blob.create(outw, outh, outd, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < num_output; p++)
    {
        float* outptr = top_blob.channel(p);

        for (int z = 0; z < outd; z++)
        {
            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                        sum = bias_data[p];

                    const float* kptr = (const float*)weight_data + maxk * channels * p;

                    for (int q = 0; q < channels; q++)
                    {
                        const Mat m = bottom_blob_bordered.channel(q);
                        const float* sptr = m.depth(z * stride_d).row(i * stride_h) + j * stride_w;

                        for (int l = 0; l < maxk; l++)
                        {
                            float val = sptr[space_ofs[l]];

                            float wt = kptr[l];
                            sum += val * wt;
                        }

                        kptr += maxk;
                    }

                    outptr[j] = activation_ss(sum, activation_type, activation_params);
                }

                outptr += outw;
            }
        }
    }

    return 0;
}

// Produces the padded input for this layer's kernel.
//
// bottom_blob_bordered is first set to bottom_blob; it is replaced by a
// bordered copy (allocated from opt.workspace_allocator, filled with
// pad_value) in these cases:
//   - any pad_* > 0: explicit padding
//   - all pads == -233: tensorflow SAME / onnx SAME_UPPER, the odd
//     element of each axis goes to the end (bottom / right / behind)
//   - all pads == -234: onnx SAME_LOWER, the odd element of each axis goes
//     to the beginning (top / left / front)
// For the SAME modes a border is only added when at least one axis needs it.
void Convolution3D::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
    const int kernel_extent_d = dilation_d * (kernel_d - 1) + 1;

    bottom_blob_bordered = bottom_blob;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || pad_front > 0 || pad_behind > 0)
    {
        Option opt_b = opt;
        opt_b.blob_allocator = opt.workspace_allocator;
        copy_make_border_3d(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, pad_front, pad_behind, BORDER_CONSTANT, pad_value, opt_b);
    }
    else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233 && pad_front == -233 && pad_behind == -233)
    {
        // tensorflow padding=SAME or onnx padding=SAME_UPPER
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        int dpad = kernel_extent_d + (d - 1) / stride_d * stride_d - d;
        if (wpad > 0 || hpad > 0 || dpad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            copy_make_border_3d(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, dpad / 2, dpad - dpad / 2, BORDER_CONSTANT, pad_value, opt_b);
        }
    }
    else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234 && pad_front == -234 && pad_behind == -234)
    {
        // onnx padding=SAME_LOWER
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        int dpad = kernel_extent_d + (d - 1) / stride_d * stride_d - d;
        if (wpad > 0 || hpad > 0 || dpad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            // the larger half goes first on every axis, depth included
            copy_make_border_3d(bottom_blob, bottom_blob_bordered, hpad - hpad / 2, hpad / 2, wpad - wpad / 2, wpad / 2, dpad - dpad / 2, dpad / 2, BORDER_CONSTANT, pad_value, opt_b);
        }
    }
}

} // namespace ncnn


================================================
FILE: src/layer/convolution3d.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONVOLUTION3D_H
#define LAYER_CONVOLUTION3D_H

#include "layer.h"

namespace ncnn {

// 3D convolution layer: holds the layer parameters and model data and exposes
// a single-blob forward entry point.
class Convolution3D : public Layer
{
public:
    Convolution3D();

    virtual int load_param(const ParamDict& pd);

    virtual int load_model(const ModelBin& mb);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
    // pads the input according to the pad_* members
    void make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const;

public:
    // param
    int num_output;
    int kernel_w;
    int kernel_h;
    int kernel_d;
    int dilation_w;
    int dilation_h;
    int dilation_d;
    int stride_w;
    int stride_h;
    int stride_d;
    int pad_left; // -233=SAME_UPPER -234=SAME_LOWER
    int pad_right;
    int pad_top;
    int pad_bottom;
    int pad_front;
    int pad_behind;
    float pad_value; // fill value used for the constant border
    int bias_term;   // non-zero when a bias is used

    int weight_data_size;

    // fused activation applied to every output element
    int activation_type;
    Mat activation_params;

    // model
    Mat weight_data;
    Mat bias_data;
};

} // namespace ncnn

#endif //LAYER_CONVOLUTION3D_H


================================================
FILE: src/layer/convolutiondepthwise.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "convolutiondepthwise.h"

#include "layer_type.h"

#include "fused_activation.h"

namespace ncnn {

// Defaults to one input blob and a separate output blob; load_param switches
// one_blob_only off when dynamic_weight is set.
ConvolutionDepthWise::ConvolutionDepthWise()
{
    support_inplace = false;
    one_blob_only = true;
}

// Reads the layer parameters from the param dictionary.
//
// Returns 0 on success, -100 when group is not a positive divisor of
// num_output, and -1 when int8 scales are requested but ncnn was built
// without NCNN_INT8.
int ConvolutionDepthWise::load_param(const ParamDict& pd)
{
    num_output = pd.get(0, 0);
    kernel_w = pd.get(1, 0);
    kernel_h = pd.get(11, kernel_w);
    dilation_w = pd.get(2, 1);
    dilation_h = pd.get(12, dilation_w);
    stride_w = pd.get(3, 1);
    stride_h = pd.get(13, stride_w);
    pad_left = pd.get(4, 0);
    pad_right = pd.get(15, pad_left);
    pad_top = pd.get(14, pad_left);
    pad_bottom = pd.get(16, pad_top);
    pad_value = pd.get(18, 0.f);
    bias_term = pd.get(5, 0);
    weight_data_size = pd.get(6, 0);
    group = pd.get(7, 1);
    int8_scale_term = pd.get(8, 0);
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());

    dynamic_weight = pd.get(19, 0);

    if (dynamic_weight)
    {
        // weights arrive as extra input blobs
        one_blob_only = false;
    }

    // check group <= 0 first: the modulo below would divide by zero for group == 0
    if (group <= 0 || num_output % group != 0)
    {
        // reject invalid group
        return -100;
    }

    if (int8_scale_term)
    {
#if NCNN_INT8
        support_int8_storage = true;
#else
        NCNN_LOGE("please build ncnn with NCNN_INT8 enabled for int8 inference");
        return -1;
#endif
    }

    return 0;
}

// Loads weights, bias and (with NCNN_INT8) the quantization scales.
//
// Nothing is loaded when dynamic_weight is set. With int8_scale_term set and
// fp32 weights in the model, the weights are quantized to int8 here, one
// group at a time. Returns 0 on success and -100 when any load or allocation
// yields an empty Mat.
int ConvolutionDepthWise::load_model(const ModelBin& mb)
{
    if (dynamic_weight)
        return 0;

    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    if (bias_term)
    {
        bias_data = mb.load(num_output, 1);
        if (bias_data.empty())
            return -100;
    }

#if NCNN_INT8
    if (int8_scale_term == 1 || int8_scale_term == 101)
    {
        // one weight scale per group, a single input scale
        weight_data_int8_scales = mb.load(group, 1);
        bottom_blob_int8_scales = mb.load(1, 1);
        if (weight_data_int8_scales.empty() || bottom_blob_int8_scales.empty())
            return -100;

        // extend the input scale to one entry per group
        float bottom_blob_int8_scale = bottom_blob_int8_scales[0];
        bottom_blob_int8_scales = Mat(group);
        if (bottom_blob_int8_scales.empty())
            return -100;
        bottom_blob_int8_scales.fill(bottom_blob_int8_scale);
    }
    else if (int8_scale_term == 2 || int8_scale_term == 102)
    {
        // a single weight scale and a single input scale
        weight_data_int8_scales = mb.load(1, 1);
        bottom_blob_int8_scales = mb.load(1, 1);
        if (weight_data_int8_scales.empty() || bottom_blob_int8_scales.empty())
            return -100;

        // extend group if only one provided
        float weight_data_int8_scale = weight_data_int8_scales[0];
        weight_data_int8_scales = Mat(group);
        if (weight_data_int8_scales.empty())
            return -100;
        weight_data_int8_scales.fill(weight_data_int8_scale);

        float bottom_blob_int8_scale = bottom_blob_int8_scales[0];
        bottom_blob_int8_scales = Mat(group);
        if (bottom_blob_int8_scales.empty())
            return -100;
        bottom_blob_int8_scales.fill(bottom_blob_int8_scale);
    }

    if (int8_scale_term > 100)
    {
        // output scale used for requantization, extended to one entry per group
        top_blob_int8_scales = mb.load(1, 1);
        if (top_blob_int8_scales.empty())
            return -100;

        float top_blob_int8_scale = top_blob_int8_scales[0];
        top_blob_int8_scales = Mat(group);
        if (top_blob_int8_scales.empty())
            return -100;
        top_blob_int8_scales.fill(top_blob_int8_scale);
    }
#endif // NCNN_INT8

#if NCNN_INT8
    // runtime quantize the weight data
    if (weight_data.elemsize == (size_t)4u && int8_scale_term)
    {
        Mat int8_weight_data(weight_data_size, (size_t)1u);
        if (int8_weight_data.empty())
            return -100;

        const int weight_data_size_g = weight_data_size / group;

        for (int g = 0; g < group; g++)
        {
            Option opt_q;
            opt_q.num_threads = 1;
            opt_q.blob_allocator = int8_weight_data.allocator;
            opt_q.use_packing_layout = false;

            // quantize the weights of group g with that group's scale
            const Mat weight_data_g = weight_data.range(weight_data_size_g * g, weight_data_size_g);
            Mat int8_weight_data_g = int8_weight_data.range(weight_data_size_g * g, weight_data_size_g);
            const Mat weight_data_int8_scales_g = weight_data_int8_scales.range(g, 1);
            quantize_to_int8(weight_data_g, int8_weight_data_g, weight_data_int8_scales_g, opt_q);
        }

        weight_data = int8_weight_data;
    }
#endif // NCNN_INT8

    return 0;
}

// Reference fp32 depth-wise / group convolution over an already padded input.
//
// top_blob must already be allocated; its w, h and c give the output size.
// weight_data is read as a flat float array. bias_data may be empty, in which
// case accumulation starts from zero. Two paths exist:
//   - inch == group == outch: every channel is convolved with its own kernel
//   - otherwise: channels are split into `group` groups and each output
//     channel accumulates over the input channels of its group
// Always returns 0.
static int convolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, int group, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int w = bottom_blob.w;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const int bias_term = bias_data.empty() ? 0 : 1;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    // space_ofs[k] is the offset of kernel tap k from the top-left input
    // sample of the window, in elements of one input channel
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        // distance from the end of one dilated kernel row to the start of the next
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // depth-wise
    if (inch == group && group == outch)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            float* outptr = top_blob.channel(g);
            // maxk weights per channel
            const float* kptr = (const float*)weight_data + maxk * g;
            const Mat m = bottom_blob.channel(g);

            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                        sum = bias_data[g];

                    // top-left input sample of the window for output (i, j)
                    const float* sptr = m.row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val = sptr[space_ofs[k]];
                        float w = kptr[k];
                        sum += val * w;
                    }

                    outptr[j] = activation_ss(sum, activation_type, activation_params);
                }

                // advance to the next output row
                outptr += outw;
            }
        }
    }
    else
    {
        // group convolution
        const int inch_g = inch / group;
        const int outch_g = outch / group;

        // collapse(2) requires the g and p loops to stay perfectly nested:
        // no statement may be placed between the two for headers
#ifdef _WIN32
        #pragma omp parallel for num_threads(opt.num_threads)
#else
        #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
#endif
        for (int g = 0; g < group; g++)
        {
            for (int p = 0; p < outch_g; p++)
            {
                float* outptr = top_blob.channel(g * outch_g + p);
                // first weight of group g
                const float* weight_data_ptr = (const float*)weight_data + maxk * inch_g * outch_g * g;

                // shadowed variable for less openmp task args
                const int outw = top_blob.w;
                const int outh = top_blob.h;

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                            sum = bias_data[outch_g * g + p];

                        // first weight of output channel p within group g
                        const float* kptr = weight_data_ptr + maxk * inch_g * p;

                        for (int q = 0; q < inch_g; q++)
                        {
                            const Mat m = bottom_blob.channel(inch_g * g + q);
                            const float* sptr = m.row(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                float val = sptr[space_ofs[k]];
                                float w = kptr[k];
                                sum += val * w;
                            }

                            // next input channel's kernel
                            kptr += maxk;
                        }

                        outptr[j] = activation_ss(sum, activation_type, activation_params);
                    }

                    // advance to the next output row
                    outptr += outw;
                }
            }
        }
    }

    return 0;
}

int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // convolv with NxN kernel
    // value = value + bias

#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8(bottom_blob, top_blob, opt);
    }
#endif

    //     NCNN_LOGE("ConvolutionDepthWise input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    const int w = bottom_blob_bordered.w;
    const int h = bottom_blob_bordered.h;
    const size_t elemsize = bottom_blob_bordered.elemsize;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int outw = (w - kernel_extent_w) / stride_w + 1;
    const int outh = (h - kernel_extent_h) / stride_h + 1;

    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    int ret = convolutiondepthwise(bottom_blob_bordered, top_blob, weight_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, dilation_w, dilation_h, group, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    return 0;
}

int ConvolutionDepthWise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.c;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, _kernel_w, _kernel_h, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    const int w = bottom_blob_bordered.w;
    const int h = bottom_blob_bordered.h;
    const size_t elemsize = bottom_blob_bordered.elemsize;

    const int kernel_extent_w = dilation_w * (_kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (_kernel_h - 1) + 1;

    const int outw = (w - kernel_extent_w) / stride_w + 1;
    const int outh = (h - kernel_extent_h) / stride_h + 1;

    top_blob.create(outw, outh, _num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    int ret = convolutiondepthwise(bottom_blob_bordered, top_blob, weight_data_flattened, bias_data_flattened, _kernel_w, _kernel_h, stride_w, stride_h, dilation_w, dilation_h, group, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    return 0;
}

void ConvolutionDepthWise::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const
{
    make_padding(bottom_blob, bottom_blob_bordered, kernel_w, kernel_h, opt);
}

void ConvolutionDepthWise::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, int _kernel_w, int _kernel_h, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;

    const int kernel_extent_w = dilation_w * (_kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (_kernel_h - 1) + 1;

    bottom_blob_bordered = bottom_blob;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
    {
        Option opt_b = opt;
        opt_b.blob_allocator = opt.workspace_allocator;
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt_b);
    }
    else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
    {
        // tensorflow padding=SAME or onnx padding=SAME_UPPER
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
        }
    }
    else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
    {
        // onnx padding=SAME_LOWER
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad - hpad / 2, hpad / 2, wpad - wpad / 2, wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
        }
    }
}

#if NCNN_INT8
// Rounds v to the nearest integer and saturates the result to [-127, 127].
static inline signed char float2int8(float v)
{
    const int rounded = static_cast<int>(round(v));
    const int saturated = rounded > 127 ? 127 : (rounded < -127 ? -127 : rounded);
    return (signed char)saturated;
}

int ConvolutionDepthWise::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // convolv with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    if (channels % group != 0 || num_output % group != 0)
    {
        // reject invalid group
        return -100;
    }

    //     NCNN_LOGE("ConvolutionDepthWise input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_int8 = bottom_blob;
    if (elemsize != 1)
    {
        const int channels_g = channels / group;

        Mat scales(channels);
        {
            float* ps = scales;
            for (int g = 0; g < group; g++)
            {
                float scale = bottom_blob_int8_scales[g];
                for (int q = 0; q < channels_g; q++)
                {
                    *ps++ = scale;
                }
            }
        }

        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q);
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // int8
    bool use_int8_requantize = int8_scale_term > 100;
    size_t out_elemsize = use_int8_requantize ? 1u : 4u;

    top_blob.create(outw, outh, num_output, out_elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // depth-wise
    if (channels == group && group == num_output)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            signed char* outptr = top_blob.channel(g);
            const signed char* kptr = (const signed char*)weight_data + maxk * g;
            const Mat m = bottom_blob_bordered.channel(g);

            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    int sum = 0;

                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        signed char val = sptr[space_ofs[k]];
                        signed char w = kptr[k];
                        sum += val * w;
                    }

                    float scale_in;
                    if (weight_data_int8_scales[g] == 0)
                        scale_in = 0;
                    else
                        scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                    float sumfp32 = sum * scale_in;

                    if (bias_term)
                        sumfp32 += bias_data[g];

                    sumfp32 = activation_ss(sumfp32, activation_type, activation_params);

                    if (use_int8_requantize)
                    {
                        // requantize
                        float scale_out = top_blob_int8_scales[g];
                        signed char sums8 = float2int8(sumfp32 * scale_out);
                        outptr[0] = sums8;
                        outptr += 1;
                    }
                    else
                    {
                        // dequantize
                        ((float*)outptr)[0] = sumfp32;
                        outptr += 4;
                    }
                }
            }
        }
    }
    else
    {
        // group convolution
        const int channels_g = channels / group;
        const int num_output_g = num_output / group;

#ifdef _WIN32
        #pragma omp parallel for num_threads(opt.num_threads)
#else // _WIN32
        #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
#endif // _WIN32
        for (int g = 0; g < group; g++)
        {
            for (int p = 0; p < num_output_g; p++)
            {
                signed char* outptr = top_blob.channel(g * num_output_g + p);
                const signed char* weight_data_ptr = (const signed char*)weight_data + maxk * channels_g * num_output_g * g;

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        int sum = 0;

                        const signed char* kptr = weight_data_ptr + maxk * channels_g * p;

                        // channels_g
                        for (int q = 0; q < channels_g; q++)
                        {
                            const Mat m = bottom_blob_bordered.channel(channels_g * g + q);
                            const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                signed char val = sptr[space_ofs[k]];
                                signed char w = kptr[k];
                                sum += val * w;
                            }

                            kptr += maxk;
                        }

                        float scale_in;
                        if (weight_data_int8_scales[g] == 0)
                            scale_in = 0;
                        else
                            scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        float sumfp32 = sum * scale_in;

                        if (bias_term)
                            sumfp32 += bias_data[g * num_output_g + p];

                        sumfp32 = activation_ss(sumfp32, activation_type, activation_params);

                        if (use_int8_requantize)
                        {
                            // requantize
                            float scale_out = top_blob_int8_scales[g];
                            signed char sums8 = float2int8(sumfp32 * scale_out);
                            outptr[0] = sums8;
                            outptr += 1;
                        }
                        else
                        {
                            // dequantize
                            ((float*)outptr)[0] = sumfp32;
                            outptr += 4;
                        }
                    }
                }
            }
        }
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn


================================================
FILE: src/layer/convolutiondepthwise.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONVOLUTIONDEPTHWISE_H
#define LAYER_CONVOLUTIONDEPTHWISE_H

#include "layer.h"

namespace ncnn {

// Declaration of the 2D depth-wise / grouped convolution layer.
// NOTE(review): only the declaration and the tail of forward_int8() are
// visible from here; member descriptions below that are not backed by that
// code are hedged and should be confirmed against convolutiondepthwise.cpp.
class ConvolutionDepthWise : public Layer
{
public:
    ConvolutionDepthWise();

    // presumably reads the "param" members below from the param dictionary
    virtual int load_param(const ParamDict& pd);

    // presumably loads the "model" members below (weights, bias, int8 scales)
    virtual int load_model(const ModelBin& mb);

    // single-input forward pass
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // multi-input forward pass
    // NOTE(review): presumably the dynamic_weight path where weights arrive as extra input blobs - confirm
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // produce the (optionally) padded input; the second overload takes an explicit kernel size
    void make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const;
    void make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, int kernel_w, int kernel_h, const Option& opt) const;

#if NCNN_INT8
    // int8 path: accumulates signed char products into an int sum, rescales it
    // to float, adds bias, applies the activation and then either requantizes
    // to signed char or stores the float result
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif

public:
    // param
    int num_output;
    int kernel_w;
    int kernel_h;
    int dilation_w;
    int dilation_h;
    int stride_w;
    int stride_h;
    int pad_left; // -233=SAME_UPPER -234=SAME_LOWER
    int pad_right;
    int pad_top;
    int pad_bottom;
    float pad_value;
    int bias_term; // non-zero: bias_data is added to each output value

    int weight_data_size;
    int group;

    int int8_scale_term;

    // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid
    int activation_type;
    Mat activation_params;

    int dynamic_weight;

    // model
    Mat weight_data;
    Mat bias_data;

#if NCNN_INT8
    // quantization scales, indexed by group in forward_int8():
    //   sum is multiplied by 1 / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g])
    //   (or by 0 when the weight scale is 0); top_blob_int8_scales[g] is the
    //   requantization factor applied before converting back to signed char
    Mat weight_data_int8_scales;
    Mat bottom_blob_int8_scales;
    Mat top_blob_int8_scales;
#endif
};

} // namespace ncnn

#endif // LAYER_CONVOLUTIONDEPTHWISE_H


================================================
FILE: src/layer/convolutiondepthwise1d.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "convolutiondepthwise1d.h"

#include "layer_type.h"

#include "fused_activation.h"

namespace ncnn {

// Set up the blob contract of the layer.
ConvolutionDepthWise1D::ConvolutionDepthWise1D()
{
    // results are written to a separately created top blob
    support_inplace = false;

    // single input by default; load_param() clears this when dynamic_weight is set
    one_blob_only = true;
}

// Read the layer hyper-parameters from the param dictionary.
//
// Param ids: 0=num_output 1=kernel_w 2=dilation_w 3=stride_w 4=pad_left
//            15=pad_right (defaults to pad_left) 18=pad_value 5=bias_term
//            6=weight_data_size 7=group 9=activation_type
//            10=activation_params 19=dynamic_weight
//
// Returns 0 on success, -100 when the group setting cannot be used.
int ConvolutionDepthWise1D::load_param(const ParamDict& pd)
{
    num_output = pd.get(0, 0);
    kernel_w = pd.get(1, 0);
    dilation_w = pd.get(2, 1);
    stride_w = pd.get(3, 1);
    pad_left = pd.get(4, 0);
    pad_right = pd.get(15, pad_left);
    pad_value = pd.get(18, 0.f);
    bias_term = pd.get(5, 0);
    weight_data_size = pd.get(6, 0);
    group = pd.get(7, 1);
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());

    dynamic_weight = pd.get(19, 0);

    if (dynamic_weight)
    {
        // weights (and bias) are supplied as additional input blobs
        one_blob_only = false;
    }

    // group must be checked before the modulo: a zero divisor is undefined
    // behaviour, and a non-positive group can never partition the outputs
    if (group <= 0 || num_output % group != 0)
    {
        // reject invalid group
        return -100;
    }

    return 0;
}

// Load the static weights (blob type 0) and, when bias_term is set, the
// bias (blob type 1). With dynamic weights nothing is stored in the model.
// Returns 0 on success, -100 when a required blob could not be loaded.
int ConvolutionDepthWise1D::load_model(const ModelBin& mb)
{
    if (!dynamic_weight)
    {
        weight_data = mb.load(weight_data_size, 0);
        if (weight_data.empty())
            return -100;

        if (bias_term)
        {
            bias_data = mb.load(num_output, 1);
            if (bias_data.empty())
                return -100;
        }
    }

    return 0;
}

static int convolutiondepthwise1d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int stride_w, int dilation_w, int group, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int h = bottom_blob.h;

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    const int bias_term = bias_data.empty() ? 0 : 1;

    // depth-wise
    if (h == group && group == outh)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            float* outptr = top_blob.row(g);
            const float* kptr = (const float*)weight_data + kernel_w * g;

            for (int j = 0; j < outw; j++)
            {
                float sum = 0.f;

                if (bias_term)
                    sum = bias_data[g];

                const float* sptr = bottom_blob.row(g) + j * stride_w;

                for (int k = 0; k < kernel_w; k++)
                {
                    float val = *sptr;
                    float w = kptr[k];
                    sum += val * w;

                    sptr += dilation_w;
                }

                outptr[j] = activation_ss(sum, activation_type, activation_params);
            }
        }
    }
    else
    {
        // group convolution
        const int h_g = h / group;
        const int outh_g = outh / group;

#ifdef _WIN32
        #pragma omp parallel for num_threads(opt.num_threads)
#else
        #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
#endif
        for (int g = 0; g < group; g++)
        {
            for (int p = 0; p < outh_g; p++)
            {
                float* outptr = top_blob.row(g * outh_g + p);
                const float* weight_data_ptr = (const float*)weight_data + kernel_w * h_g * outh_g * g;

                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                        sum = bias_data[outh_g * g + p];

                    const float* kptr = weight_data_ptr + kernel_w * h_g * p;

                    for (int q = 0; q < h_g; q++)
                    {
                        const float* sptr = bottom_blob.row(h_g * g + q) + j * stride_w;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float val = *sptr;
                            float w = kptr[k];
                            sum += val * w;

                            sptr += dilation_w;
                        }

                        kptr += kernel_w;
                    }

                    outptr[j] = activation_ss(sum, activation_type, activation_params);
                }
            }
        }
    }

    return 0;
}

int ConvolutionDepthWise1D::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    const int w = bottom_blob_bordered.w;
    const size_t elemsize = bottom_blob.elemsize;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;

    const int outw = (w - kernel_extent_w) / stride_w + 1;

    top_blob.create(outw, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    int ret = convolutiondepthwise1d(bottom_blob_bordered, top_blob, weight_data, bias_data, kernel_w, stride_w, dilation_w, group, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    return 0;
}

int ConvolutionDepthWise1D::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _kernel_w = _weight_data.w;
    const int _num_output = _weight_data.c;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, _kernel_w, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    const int w = bottom_blob_bordered.w;
    const size_t elemsize = bottom_blob_bordered.elemsize;

    const int kernel_extent_w = dilation_w * (_kernel_w - 1) + 1;

    const int outw = (w - kernel_extent_w) / stride_w + 1;

    top_blob.create(outw, _num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    int ret = convolutiondepthwise1d(bottom_blob_bordered, top_blob, weight_data_flattened, bias_data_flattened, _kernel_w, stride_w, dilation_w, group, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    return 0;
}

// Pad the input using the layer's own kernel_w; forwards to the overload
// that takes an explicit kernel width.
void ConvolutionDepthWise1D::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const
{
    make_padding(bottom_blob, bottom_blob_bordered, kernel_w, opt);
}

// Produce the padded input for a kernel of width _kernel_w.
//
// bottom_blob_bordered is first set to bottom_blob; it is replaced by a
// bordered copy (allocated from the workspace allocator, filled with
// pad_value) only when padding is actually required:
//   - explicit padding when pad_left or pad_right is positive
//   - pad_left == pad_right == -233: SAME_UPPER, larger half on the right
//   - pad_left == pad_right == -234: SAME_LOWER, larger half on the left
void ConvolutionDepthWise1D::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, int _kernel_w, const Option& opt) const
{
    bottom_blob_bordered = bottom_blob;

    int border_left = 0;
    int border_right = 0;
    bool need_border = false;

    if (pad_left > 0 || pad_right > 0)
    {
        border_left = pad_left;
        border_right = pad_right;
        need_border = true;
    }
    else
    {
        // tensorflow padding=SAME or onnx padding=SAME_UPPER
        const bool same_upper = (pad_left == -233 && pad_right == -233);
        // onnx padding=SAME_LOWER
        const bool same_lower = (pad_left == -234 && pad_right == -234);

        if (same_upper || same_lower)
        {
            const int w = bottom_blob.w;
            const int kernel_extent_w = dilation_w * (_kernel_w - 1) + 1;
            const int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;

            if (wpad > 0)
            {
                const int small_half = wpad / 2;
                const int large_half = wpad - small_half;

                border_left = same_upper ? small_half : large_half;
                border_right = same_upper ? large_half : small_half;
                need_border = true;
            }
        }
    }

    if (!need_border)
        return;

    Option opt_b = opt;
    opt_b.blob_allocator = opt.workspace_allocator;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, 0, border_left, border_right, BORDER_CONSTANT, pad_value, opt_b);
}

} // namespace ncnn


================================================
FILE: src/layer/convolutiondepthwise1d.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONVOLUTIONDEPTHWISE1D_H
#define LAYER_CONVOLUTIONDEPTHWISE1D_H

#include "layer.h"

namespace ncnn {

// 1D depth-wise / grouped convolution layer operating along w of a 2D blob,
// one blob row per channel.
class ConvolutionDepthWise1D : public Layer
{
public:
    ConvolutionDepthWise1D();

    // read the param members below; returns -100 when num_output is not divisible by group
    virtual int load_param(const ParamDict& pd);

    // load weight_data and (if bias_term) bias_data; does nothing when dynamic_weight is set
    virtual int load_model(const ModelBin& mb);

    // forward with the weights stored in the layer
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // forward with weights (and optional bias) passed as bottom_blobs[1] (and [2])
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // pad the input along w; the second overload takes an explicit kernel width
    void make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const;
    void make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, int kernel_w, const Option& opt) const;

public:
    // param
    int num_output; // id 0
    int kernel_w;   // id 1
    int dilation_w; // id 2, default 1
    int stride_w;   // id 3, default 1
    int pad_left; // -233=SAME_UPPER -234=SAME_LOWER
    int pad_right;   // id 15, defaults to pad_left
    float pad_value; // id 18, constant used to fill the border
    int bias_term;   // id 5

    int weight_data_size; // id 6, element count requested when loading weight_data
    int group;            // id 7, default 1

    // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid
    int activation_type;   // id 9
    Mat activation_params; // id 10

    int dynamic_weight; // id 19, non-zero: weights come from input blobs

    // model
    Mat weight_data;
    Mat bias_data;
};

} // namespace ncnn

#endif // LAYER_CONVOLUTIONDEPTHWISE1D_H


================================================
FILE: src/layer/convolutiondepthwise3d.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "convolutiondepthwise3d.h"

#include "fused_activation.h"

namespace ncnn {

// Set up the blob contract of the layer: one input blob, not in-place.
ConvolutionDepthWise3D::ConvolutionDepthWise3D()
{
    // results are written to a separately created top blob
    support_inplace = false;

    one_blob_only = true;
}

// Read the layer hyper-parameters from the param dictionary.
//
// Param ids: 0=num_output 1/11/21=kernel_w/h/d 2/12/22=dilation_w/h/d
//            3/13/23=stride_w/h/d 4=pad_left 15=pad_right 14=pad_top
//            16=pad_bottom 24=pad_front 17=pad_behind 18=pad_value
//            5=bias_term 6=weight_data_size 7=group 9=activation_type
//            10=activation_params
// The h/d variants default to the w value; pad defaults chain as coded below.
//
// Returns 0 on success, -100 when the group setting cannot be used.
int ConvolutionDepthWise3D::load_param(const ParamDict& pd)
{
    num_output = pd.get(0, 0);
    kernel_w = pd.get(1, 0);
    kernel_h = pd.get(11, kernel_w);
    kernel_d = pd.get(21, kernel_w);
    dilation_w = pd.get(2, 1);
    dilation_h = pd.get(12, dilation_w);
    dilation_d = pd.get(22, dilation_w);
    stride_w = pd.get(3, 1);
    stride_h = pd.get(13, stride_w);
    stride_d = pd.get(23, stride_w);
    pad_left = pd.get(4, 0);
    pad_right = pd.get(15, pad_left);
    pad_top = pd.get(14, pad_left);
    pad_bottom = pd.get(16, pad_top);
    pad_front = pd.get(24, pad_left);
    pad_behind = pd.get(17, pad_front);
    pad_value = pd.get(18, 0.f);
    bias_term = pd.get(5, 0);
    weight_data_size = pd.get(6, 0);
    group = pd.get(7, 1);
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());

    // forward() divides by group and splits num_output evenly across groups;
    // a non-positive or non-dividing group would divide by zero or leave
    // output channels unwritten, so refuse it here like the 1D layer does
    if (group <= 0 || num_output % group != 0)
    {
        // reject invalid group
        return -100;
    }

    return 0;
}

// Load the weights (blob type 0) and, when bias_term is set, the bias
// (blob type 1). Returns 0 on success, -100 when a blob could not be loaded.
int ConvolutionDepthWise3D::load_model(const ModelBin& mb)
{
    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    if (!bias_term)
        return 0;

    bias_data = mb.load(num_output, 1);
    return bias_data.empty() ? -100 : 0;
}

// Reference forward pass of the 3D depth-wise / grouped convolution.
//
// Steps: pad the input, derive the output size from the padded size, build a
// table of element offsets for every kernel tap, create the output blob and
// then accumulate bias + sum(input * weight) per output element followed by
// the fused activation.
// Returns 0 on success, -100 when padding or output allocation fails.
int ConvolutionDepthWise3D::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    // extent covered by the dilated kernel along each axis
    const int kernel_extend_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extend_h = dilation_h * (kernel_h - 1) + 1;
    const int kernel_extend_d = dilation_d * (kernel_d - 1) + 1;

    Mat bottom_blob_bordered;
    Option opt_pad = opt;
    opt_pad.use_packing_layout = false;
    make_padding(bottom_blob, bottom_blob_bordered, opt_pad);
    if (bottom_blob_bordered.empty())
        return -100;

    // from here on w/h/d describe the padded input
    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;
    d = bottom_blob_bordered.d;

    int outw = (w - kernel_extend_w) / stride_w + 1;
    int outh = (h - kernel_extend_h) / stride_h + 1;
    int outd = (d - kernel_extend_d) / stride_d + 1;

    const int maxk = kernel_w * kernel_h * kernel_d;

    // kernel offsets
    // space_ofs[k] is the element offset of kernel tap k relative to the
    // window origin inside one channel of the padded input
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        // gap0 completes a kernel row so that consecutive rows are w * dilation_h apart
        int gap0 = w * dilation_h - kernel_w * dilation_w;
        // gap1 completes a kernel slice so that consecutive slices are h * w * dilation_d apart
        int gap1 = h * w * dilation_d - w * kernel_h * dilation_h;
        for (int z = 0; z < kernel_d; z++)
        {
            for (int i = 0; i < kernel_h; i++)
            {
                for (int j = 0; j < kernel_w; j++)
                {
                    space_ofs[p1] = p2;
                    p1++;
                    p2 += dilation_w;
                }
                p2 += gap0;
            }
            p2 += gap1;
        }
    }

    top_blob.create(outw, outh, outd, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // depth-wise
    // one input channel and one output channel per group
    if (channels == group && group == num_output)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            float* outptr = top_blob.channel(g);
            const float* kptr = (const float*)weight_data + maxk * g;
            const Mat m = bottom_blob_bordered.channel(g);

            for (int z = 0; z < outd; z++)
            {
                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                            sum = bias_data[g];

                        // origin of the input window for output (z, i, j)
                        const float* sptr = m.depth(z * stride_d).row(i * stride_h) + j * stride_w;

                        for (int k = 0; k < maxk; k++)
                        {
                            float val = sptr[space_ofs[k]];
                            // note: this local shadows the int w above
                            float w = kptr[k];
                            sum += val * w;
                        }

                        outptr[j] = activation_ss(sum, activation_type, activation_params);
                    }

                    // advance to the next output row
                    outptr += outw;
                }
            }
        }
    }
    else
    {
        // group convolution
        const int channels_g = channels / group;
        const int num_output_g = num_output / group;

        // collapse(2) is not used on _WIN32 builds
#ifdef _WIN32
        #pragma omp parallel for num_threads(opt.num_threads)
#else
        #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
#endif
        for (int g = 0; g < group; g++)
        {
            for (int p = 0; p < num_output_g; p++)
            {
                float* outptr = top_blob.channel(g * num_output_g + p);
                // start of the weights belonging to group g
                const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g;

                // shadowed variable for less openmp task args
                const int outw = top_blob.w;
                const int outh = top_blob.h;
                const int outd = top_blob.d;

                for (int z = 0; z < outd; z++)
                {
                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float sum = 0.f;

                            if (bias_term)
                                sum = bias_data[num_output_g * g + p];

                            // filter of output channel p inside the group
                            const float* kptr = weight_data_ptr + maxk * channels_g * p;

                            // accumulate over every input channel of the group
                            for (int q = 0; q < channels_g; q++)
                            {
                                const Mat m = bottom_blob_bordered.channel(channels_g * g + q);
                                const float* sptr = m.depth(z * stride_d).row(i * stride_h) + j * stride_w;

                                for (int l = 0; l < maxk; l++)
                                {
                                    float val = sptr[space_ofs[l]];

                                    float wt = kptr[l];
                                    sum += val * wt;
                                }

                                // next input channel uses the next maxk taps
                                kptr += maxk;
                            }

                            outptr[j] = activation_ss(sum, activation_type, activation_params);
                        }

                        outptr += outw;
                    }
                }
            }
        }
    }

    return 0;
}

// Produce the padded input for the 3D convolution.
//
// bottom_blob_bordered is first set to bottom_blob; it is replaced by a
// bordered copy (allocated from the workspace allocator, filled with
// pad_value) only when padding is actually required:
//   - explicit padding when any of the six pad values is positive
//   - all six pads == -233: SAME_UPPER, the larger half of an odd total goes
//     to the end of each axis (bottom / right / behind)
//   - all six pads == -234: SAME_LOWER, the larger half of an odd total goes
//     to the beginning of each axis (top / left / front)
void ConvolutionDepthWise3D::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
    const int kernel_extent_d = dilation_d * (kernel_d - 1) + 1;

    bottom_blob_bordered = bottom_blob;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || pad_front > 0 || pad_behind > 0)
    {
        Option opt_b = opt;
        opt_b.blob_allocator = opt.workspace_allocator;
        copy_make_border_3d(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, pad_front, pad_behind, BORDER_CONSTANT, pad_value, opt_b);
    }
    else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233 && pad_front == -233 && pad_behind == -233)
    {
        // tensorflow padding=SAME or onnx padding=SAME_UPPER
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        int dpad = kernel_extent_d + (d - 1) / stride_d * stride_d - d;
        if (wpad > 0 || hpad > 0 || dpad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            copy_make_border_3d(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, dpad / 2, dpad - dpad / 2, BORDER_CONSTANT, pad_value, opt_b);
        }
    }
    else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234 && pad_front == -234 && pad_behind == -234)
    {
        // onnx padding=SAME_LOWER
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        int dpad = kernel_extent_d + (d - 1) / stride_d * stride_d - d;
        if (wpad > 0 || hpad > 0 || dpad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            // the depth pair is flipped like h and w: the larger half goes in front
            copy_make_border_3d(bottom_blob, bottom_blob_bordered, hpad - hpad / 2, hpad / 2, wpad - wpad / 2, wpad / 2, dpad - dpad / 2, dpad / 2, BORDER_CONSTANT, pad_value, opt_b);
        }
    }
}

} // namespace ncnn


================================================
FILE: src/layer/convolutiondepthwise3d.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONVOLUTIONDEPTHWISE3D_H
#define LAYER_CONVOLUTIONDEPTHWISE3D_H

#include "layer.h"

namespace ncnn {

// 3D depth-wise / grouped convolution layer over blobs with w, h, d and c.
class ConvolutionDepthWise3D : public Layer
{
public:
    ConvolutionDepthWise3D();

    // read the param members below from the param dictionary
    virtual int load_param(const ParamDict& pd);

    // load weight_data and (if bias_term) bias_data; returns -100 on failure
    virtual int load_model(const ModelBin& mb);

    // pad, convolve, add bias and apply the fused activation
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
    // produce the (optionally) padded input used by forward()
    void make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const;

public:
    // param
    int num_output; // id 0
    int kernel_w;   // id 1
    int kernel_h;   // id 11, defaults to kernel_w
    int kernel_d;   // id 21, defaults to kernel_w
    int dilation_w; // id 2, default 1
    int dilation_h; // id 12, defaults to dilation_w
    int dilation_d; // id 22, defaults to dilation_w
    int stride_w;   // id 3, default 1
    int stride_h;   // id 13, defaults to stride_w
    int stride_d;   // id 23, defaults to stride_w
    int pad_left; // -233=SAME_UPPER -234=SAME_LOWER
    int pad_right;   // id 15, defaults to pad_left
    int pad_top;     // id 14, defaults to pad_left
    int pad_bottom;  // id 16, defaults to pad_top
    int pad_front;   // id 24, defaults to pad_left
    int pad_behind;  // id 17, defaults to pad_front
    float pad_value; // id 18, constant used to fill the border
    int bias_term;   // id 5

    int weight_data_size; // id 6, element count requested when loading weight_data
    int group;            // id 7, default 1

    int activation_type;   // id 9, passed to activation_ss()
    Mat activation_params; // id 10

    // model
    Mat weight_data;
    Mat bias_data;
};

} // namespace ncnn

#endif //LAYER_CONVOLUTIONDEPTHWISE3D_H


================================================
FILE: src/layer/copyto.cpp
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "copyto.h"

namespace ncnn {

// Set up the blob contract of the layer.
CopyTo::CopyTo()
{
    // the result is produced in a separate top blob
    support_inplace = false;

    // forward() consumes two input blobs: the destination and the source
    one_blob_only = false;
}

// Read the paste position from the param dictionary: either plain offsets
// (ids 0, 1, 13, 2) or numpy-style starts/axes arrays (ids 9, 11).
// Always returns 0.
int CopyTo::load_param(const ParamDict& pd)
{
    // plain per-axis offsets, all defaulting to 0
    coffset = pd.get(2, 0);
    doffset = pd.get(13, 0);
    hoffset = pd.get(1, 0);
    woffset = pd.get(0, 0);

    // numpy-style slice description, empty when not provided
    axes = pd.get(11, Mat());
    starts = pd.get(9, Mat());

    return 0;
}

// Copy the src.h rows of src (each src.w elements of type T, stored back to
// back) into self, with the first row landing at row `top`, column `left`.
template<typename T>
static void copy_to_image(const Mat& src, Mat& self, int top, int left)
{
    const int cols = src.w;
    const int rows = src.h;
    const size_t row_bytes = cols * sizeof(T);

    const T* from = src;
    T* to = self.row<T>(top) + left;

    // source rows are cols apart, destination rows are self.w apart
    for (int y = 0; y < rows; y++, from += cols, to += self.w)
    {
        memcpy(to, from, row_bytes);
    }
}

int CopyTo::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& self_blob = bottom_blobs[0];
    const Mat& src_blob = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    int w = self_blob.w;
    int h = self_blob.h;
    int d = self_blob.d;
    int channels = self_blob.c;
    int dims = self_blob.dims;
    size_t elemsize = self_blob.elemsize;

    if (src_blob.dims == dims && src_blob.w == w && src_blob.h == h && src_blob.d == d && src_blob.c == channels)
    {
        top_blob = src_blob;
        return 0;
    }

    top_blob = self_blob.clone(opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    int _woffset, _hoffset, _doffset, _coffset;
    resolve_copyto_offset(self_blob.shape(), _woffset, _hoffset, _doffset, _coffset);

    if (dims == 1)
    {
        if (elemsize == 1)
            copy_to_image<signed char>(src_blob, top_blob, 0, _woffset);
        if (elemsize == 2)
            copy_to_image<unsigned short>(src_blob, top_blob, 0, _woffset);
        if (elemsize == 4)
            copy_to_image<float>(src_blob, top_blob, 0, _woffset);
    }

    if (dims == 2)
    {
        if (elemsize == 1)
            copy_to_image<signed char>(src_blob, top_blob, _hoffset, _woffset);
        if (elemsize == 2)
            copy_to_image<unsigned short>(src_blob, top_blob, _hoffset, _woffset);
        if (elemsize == 4)
            copy_to_image<float>(src_blob, top_blob, _hoffset, _woffset);
    }

    if (dims == 3)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < src_blob.c; q++)
        {
            const Mat roim = src_blob.channel(q);
            Mat m = top_blob.channel(q + _coffset);

            if (elemsize == 1)
                copy_to_image<signed char>(roim, m, _hoffset, _woffset);
            if (elemsize == 2)
                copy_to_image<unsigned short>(roim, m, _hoffset, _woffset);
            if (elemsize == 4)
                copy_to_image<float>(roim, m, _hoffset, _woffset);
        }
    }

    if (dims == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < src_blob.c; q++)
        {
            for (int z = 0; z < src_blob.d; z++)
            {
                const Mat roim = src_blob.channel(q).depth(z);
                Mat m = top_blob.channel(q + _coffset).depth(z + _doffset);

                if (elemsize == 1)
                    copy_to_image<signed char>(roim, m, _hoffset, _woffset);
                if (elemsize == 2)
                    copy_to_image<unsigned short>(roim, m, _hoffset, _woffset);
                if (elemsize == 4)
                    copy_to_image<float>(roim, m, _hoffset, _woffset);
            }
        }
    }

    return 0;
}

// Work out the w/h/d/c offsets at which the source is pasted.
//
// Without `starts` the plain offset members are returned unchanged.
// With `starts`, every offset begins at 0 and each listed axis (all axes in
// order when `axes` is empty) takes its start value: -233 means 0, and a
// negative start counts back from the size of that axis. Axis numbers map
// to the blob as: 1-D {w}, 2-D {h, w}, 3-D {c, h, w}, 4-D {c, d, h, w}.
void CopyTo::resolve_copyto_offset(const Mat& self_blob, int& _woffset, int& _hoffset, int& _doffset, int& _coffset) const
{
    if (starts.empty())
    {
        // legacy attribute style
        _woffset = woffset;
        _hoffset = hoffset;
        _doffset = doffset;
        _coffset = coffset;
        return;
    }

    const int w = self_blob.w;
    const int h = self_blob.h;
    const int d = self_blob.d;
    const int channels = self_blob.c;
    const int dims = self_blob.dims;

    _woffset = 0;
    _hoffset = 0;
    _doffset = 0;
    _coffset = 0;

    const int* starts_ptr = starts;
    const int* axes_ptr = axes;

    int _axes[4] = {0, 1, 2, 3};
    int num_axis = axes.w;
    if (num_axis == 0)
    {
        // no explicit axes: starts apply to the leading axes in order
        num_axis = dims;
    }
    else
    {
        for (int i = 0; i < num_axis; i++)
        {
            const int axis = axes_ptr[i];
            _axes[i] = axis < 0 ? dims + axis : axis;
        }
    }

    for (int i = 0; i < num_axis; i++)
    {
        const int axis = _axes[i];

        int start = starts_ptr[i];
        if (start == -233)
            start = 0;

        // select which offset this axis controls and the matching axis size
        int* target = 0;
        int extent = 0;

        if (dims == 1) // axis == 0
        {
            target = &_woffset;
            extent = w;
        }
        else if (dims == 2)
        {
            if (axis == 0)
            {
                target = &_hoffset;
                extent = h;
            }
            else if (axis == 1)
            {
                target = &_woffset;
                extent = w;
            }
        }
        else if (dims == 3)
        {
            if (axis == 0)
            {
                target = &_coffset;
                extent = channels;
            }
            else if (axis == 1)
            {
                target = &_hoffset;
                extent = h;
            }
            else if (axis == 2)
            {
                target = &_woffset;
                extent = w;
            }
        }
        else if (dims == 4)
        {
            if (axis == 0)
            {
                target = &_coffset;
                extent = channels;
            }
            else if (axis == 1)
            {
                target = &_doffset;
                extent = d;
            }
            else if (axis == 2)
            {
                target = &_hoffset;
                extent = h;
            }
            else if (axis == 3)
            {
                target = &_woffset;
                extent = w;
            }
        }

        if (target)
            *target = start >= 0 ? start : extent + start;
    }
}

} // namespace ncnn


================================================
FILE: src/layer/copyto.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_COPYTO_H
#define LAYER_COPYTO_H

#include "layer.h"

namespace ncnn {

// Layer that pastes a source blob into a copy of a destination ("self") blob
// at a given offset.
class CopyTo : public Layer
{
public:
    CopyTo();

    // read the offsets or the numpy-style starts/axes from the param dictionary
    virtual int load_param(const ParamDict& pd);

    // bottom_blobs[0] is the destination, bottom_blobs[1] the source;
    // the result is written to top_blobs[0]
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // compute the effective w/h/d/c offsets for a destination of the given shape
    void resolve_copyto_offset(const Mat& self_blob, int& woffset, int& hoffset, int& doffset, int& coffset) const;

public:
    int woffset; // param id 0
    int hoffset; // param id 1
    int doffset; // param id 13
    int coffset; // param id 2

    // numpy-style slice
    // if provided, all the above attributes will be ignored
    Mat starts; // param id 9; -233 is treated as 0, negative values count from the end of the axis
    Mat axes;   // param id 11; negative values count from the last axis, empty means axes in order
};

} // namespace ncnn

#endif // LAYER_COPYTO_H


================================================
FILE: src/layer/crop.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "crop.h"

#include "expression.h"

namespace ncnn {

// Set the layer capability flags to their defaults.
Crop::Crop()
{
    // in-place execution is not offered by this layer
    support_inplace = false;

    // start in single-input mode; load_param() clears this flag when the
    // crop region has to be derived from additional input blobs
    one_blob_only = true;
}

// Read the crop attributes from the param dict and decide whether the layer
// runs with a single input blob or needs extra input blobs.
// Always returns 0.
int Crop::load_param(const ParamDict& pd)
{
    // leading offsets
    woffset = pd.get(0, 0);
    hoffset = pd.get(1, 0);
    doffset = pd.get(13, 0);
    coffset = pd.get(2, 0);

    // trailing offsets
    woffset2 = pd.get(6, 0);
    hoffset2 = pd.get(7, 0);
    doffset2 = pd.get(15, 0);
    coffset2 = pd.get(8, 0);

    // explicit output sizes
    outw = pd.get(3, 0);
    outh = pd.get(4, 0);
    outd = pd.get(14, 0);
    outc = pd.get(5, 0);

    // numpy-style slice arrays
    starts = pd.get(9, Mat());
    ends = pd.get(10, Mat());
    axes = pd.get(11, Mat());

    // numpy-style slice expressions
    starts_expr = pd.get(19, "");
    ends_expr = pd.get(20, "");
    axes_expr = pd.get(21, "");

    const bool has_array_slice = !starts.empty() && !ends.empty();
    const bool has_expr_slice = !starts_expr.empty() && !ends_expr.empty();
    const bool numpy_style_slice = has_array_slice || has_expr_slice;

    const bool no_output_size = outw == 0 && outh == 0 && outd == 0 && outc == 0;
    const bool no_tail_offset = woffset2 == 0 && hoffset2 == 0 && doffset2 == 0 && coffset2 == 0;

    // nothing describes the output extent, so it must come from a second blob
    if (no_output_size && no_tail_offset && !numpy_style_slice)
    {
        one_blob_only = false;
    }

    // expressions may refer to more than one input blob
    const bool any_expr = !starts_expr.empty() || !ends_expr.empty() || !axes_expr.empty();
    if (any_expr)
    {
        const int starts_blob_count = count_expression_blobs(starts_expr);
        const int ends_blob_count = count_expression_blobs(ends_expr);
        const int axes_blob_count = count_expression_blobs(axes_expr);

        const bool single_blob_exprs = starts_blob_count <= 1 && ends_blob_count <= 1 && axes_blob_count <= 1;
        if (!single_blob_exprs)
            one_blob_only = false;
    }

    return 0;
}

// Copy a dst.w x dst.h window out of src into dst.
// The window starts at row `top`, column `left` of src. Source rows are
// stepped by src.w elements, destination rows are written back to back
// (dst.w elements apart).
template<typename T>
static void copy_cut_border_image(const Mat& src, Mat& dst, int top, int left)
{
    const int cols = dst.w;
    const int rows = dst.h;
    const int src_stride = src.w;

    const T* in = src.row<T>(top) + left;
    T* out = dst; //.data;

    if (cols < 12)
    {
        // short rows are copied element by element
        for (int y = 0; y < rows; y++)
        {
            for (int x = 0; x < cols; x++)
            {
                out[x] = in[x];
            }

            out += cols;
            in += src_stride;
        }

        return;
    }

    // longer rows go through memcpy
    for (int y = 0; y < rows; y++)
    {
        memcpy(out, in, cols * sizeof(T));

        out += cols;
        in += src_stride;
    }
}

// Single-input crop.
//
// The crop region is resolved either from the starts/ends/axes expressions
// (when both starts_expr and ends_expr are set) or from the layer attributes.
// When the resolved size equals the input size, top_blob is simply assigned
// from bottom_blob. When only the channel range changes (dims 3 / 4), the
// channel range is cloned. Otherwise a new blob is created and filled row by
// row for element sizes 1, 2 and 4; other element sizes are not copied.
//
// Returns 0 on success, -1 when the slice expressions cannot be evaluated and
// -100 when the output blob is empty after allocation.
int Crop::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;

    int _woffset = 0, _hoffset = 0, _doffset = 0, _coffset = 0;
    int _outw = -1, _outh = -1, _outd = -1, _outc = -1;

    if (!starts_expr.empty() && !ends_expr.empty())
    {
        std::vector<Mat> bottom_blobs(1);
        bottom_blobs[0] = bottom_blob;

        // a failed evaluation must not be used as a crop region
        if (eval_crop_expr(bottom_blobs, _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc) != 0)
            return -1;
    }
    else
    {
        resolve_crop_roi(bottom_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
    }

    if (dims == 1)
    {
        // nothing to cut
        if (_outw == w)
        {
            top_blob = bottom_blob;
            return 0;
        }

        top_blob.create(_outw, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // a 1-D blob is handled as a single row
        if (elemsize == 1)
            copy_cut_border_image<signed char>(bottom_blob, top_blob, 0, _woffset);
        if (elemsize == 2)
            copy_cut_border_image<unsigned short>(bottom_blob, top_blob, 0, _woffset);
        if (elemsize == 4)
            copy_cut_border_image<float>(bottom_blob, top_blob, 0, _woffset);
    }

    if (dims == 2)
    {
        // nothing to cut
        if (_outw == w && _outh == h)
        {
            top_blob = bottom_blob;
            return 0;
        }

        top_blob.create(_outw, _outh, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (elemsize == 1)
            copy_cut_border_image<signed char>(bottom_blob, top_blob, _hoffset, _woffset);
        if (elemsize == 2)
            copy_cut_border_image<unsigned short>(bottom_blob, top_blob, _hoffset, _woffset);
        if (elemsize == 4)
            copy_cut_border_image<float>(bottom_blob, top_blob, _hoffset, _woffset);
    }

    if (dims == 3)
    {
        // nothing to cut
        if (_outw == w && _outh == h && _outc == channels)
        {
            top_blob = bottom_blob;
            return 0;
        }

        const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset, _outc);

        // only the channel range changes, clone it as a whole
        if (_outw == w && _outh == h)
        {
            top_blob = bottom_blob_sliced.clone(opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            return 0;
        }

        top_blob.create(_outw, _outh, _outc, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < _outc; q++)
        {
            const Mat m = bottom_blob_sliced.channel(q);
            Mat borderm = top_blob.channel(q);

            if (elemsize == 1)
                copy_cut_border_image<signed char>(m, borderm, _hoffset, _woffset);
            if (elemsize == 2)
                copy_cut_border_image<unsigned short>(m, borderm, _hoffset, _woffset);
            if (elemsize == 4)
                copy_cut_border_image<float>(m, borderm, _hoffset, _woffset);
        }
    }

    if (dims == 4)
    {
        // nothing to cut
        if (_outw == w && _outh == h && _outd == d && _outc == channels)
        {
            top_blob = bottom_blob;
            return 0;
        }

        const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset, _outc);

        // only the channel range changes, clone it as a whole
        if (_outw == w && _outh == h && _outd == d)
        {
            top_blob = bottom_blob_sliced.clone(opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            return 0;
        }

        top_blob.create(_outw, _outh, _outd, _outc, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < _outc; q++)
        {
            for (int z = 0; z < _outd; z++)
            {
                // the depth offset is applied on the source side only
                const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset);
                Mat borderm = top_blob.channel(q).depth(z);

                if (elemsize == 1)
                    copy_cut_border_image<signed char>(m, borderm, _hoffset, _woffset);
                if (elemsize == 2)
                    copy_cut_border_image<unsigned short>(m, borderm, _hoffset, _woffset);
                if (elemsize == 4)
                    copy_cut_border_image<float>(m, borderm, _hoffset, _woffset);
            }
        }
    }

    return 0;
}

int Crop::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& reference_blob = bottom_blobs[1];

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;

    Mat& top_blob = top_blobs[0];

    int _woffset, _hoffset, _doffset, _coffset = -1;
    int _outw = -1, _outh = -1, _outd = -1, _outc;

    if (!starts_expr.empty() && !ends_expr.empty())
    {
        eval_crop_expr(bottom_blobs, _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
    }
    else if (woffset == -233)
    {
        resolve_crop_roi(bottom_blob.shape(), (const int*)reference_blob, _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
    }
    else
    {
        resolve_crop_roi(bottom_blob.shape(), reference_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
    }

    if (dims == 1)
    {
        if (_outw == w)
        {
            top_blob = bottom_blob;
            return 0;
        }

        top_blob.create(_outw, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (elemsize == 1)
            copy_cut_border_image<signed char>(bottom_blob, top_blob, 0, _woffset);
        if (elemsize == 2)
            copy_cut_border_image<unsigned short>(bottom_blob, top_blob, 0, _woffset);
        if (elemsize == 4)
            copy_cut_border_image<float>(bottom_blob, top_blob, 0, _woffset);
    }

    if (dims == 2)
    {
        if (_outw == w && _outh == h)
        {
            top_blob = bottom_blob;
            return 0;
        }

        top_blob.create(_outw, _outh, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (elemsize == 1)
            copy_cut_border_image<signed char>(bottom_blob, top_blob, _hoffset, _woffset);
        if (elemsize == 2)
            copy_cut_border_image<unsigned short>(bottom_blob, top_blob, _hoffset, _woffset);
        if (elemsize == 4)
            copy_cut_border_image<float>(bottom_blob, top_blob, _hoffset, _woffset);
    }

    if (dims == 3)
    {
        if (_outw == w && _outh == h && _outc == channels)
        {
            top_blob = bottom_blob;
            return 0;
        }

        const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset, _outc);

        if (_outw == w && _outh == h)
        {
            top_blob = bottom_blob_sliced.clone(opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            return 0;
        }

        top_blob.create(_outw, _outh, _outc, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < _outc; q++)
        {
            const Mat m = bottom_blob_sliced.channel(q);
            Mat borderm = top_blob.channel(q);

            if (elemsize == 1)
                copy_cut_border_image<signed char>(m, borderm, _hoffset, _woffset);
            if (elemsize == 2)
                copy_cut_border_image<unsigned short>(m, borderm, _hoffset, _woffset);
            if (elemsize == 4)
                copy_cut_border_image<float>(m, borderm, _hoffset, _woffset);
        }
    }

    if (dims == 4)
    {
        if (_outw == w && _outh == h && _outd == d && _outc == channels)
        {
            top_blob = bottom_blob;
            return 0;
        }

        const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset, _outc);

        if (_outw == w && _outh == h && _outd == d)
        {
            top_blob = bottom_blob_sliced.clone(opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            return 0;
        }

        top_blob.create(_outw, _outh, _outd, _outc, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < _outc; q++)
        {
            for (int z = 0; z < _outd; z++)
            {
                const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset);
                Mat borderm = top_blob.channel(q).depth(z);

                if (elemsize == 1)
                    copy_cut_border_image<signed char>(m, borderm, _hoffset, _woffset);
                if (elemsize == 2)
                    copy_cut_border_image<unsigned short>(m, borderm, _hoffset, _woffset);
                if (elemsize == 4)
                    copy_cut_border_image<float>(m, borderm, _hoffset, _woffset);
            }
        }
    }

    return 0;
}

// Resolve one numpy-style [start, end) pair against an axis of length `size`.
//   start == -233 : begin of the axis
//   end   == -233 : end of the axis
//   start <  0    : counted back from the end of the axis
//   end   <= 0    : counted back from the end of the axis, so 0 means "up to the end"
// The end position is capped at `size`.
static void crop_resolve_static_slice(int size, int start, int end, int& offset, int& outsize)
{
    if (start == -233) start = 0;
    if (end == -233) end = size;

    offset = start >= 0 ? start : size + start;
    outsize = std::min(size, end > 0 ? end : size + end) - offset;
}

// Resolve the crop region from the layer attributes only.
//
// bottom_blob supplies the input shape (w, h, d, c, dims).
// With a numpy-style slice (starts and ends both non-empty) every listed axis
// is resolved against the matching dimension and unlisted axes stay uncropped.
// Axis numbering is outermost first: dims 2 -> (h, w), dims 3 -> (c, h, w),
// dims 4 -> (c, d, h, w); for dims 1 every entry applies to w.
// Without a slice the region is offset / (size - offset - offset2), capped by
// outw / outh / outd / outc unless those are -233.
//
// The number of processed axes is clamped to 4 and to the length of the
// starts / ends arrays, so malformed parameters cannot run past the local
// axis table or the parameter arrays.
void Crop::resolve_crop_roi(const Mat& bottom_blob, int& _woffset, int& _hoffset, int& _doffset, int& _coffset, int& _outw, int& _outh, int& _outd, int& _outc) const
{
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int channels = bottom_blob.c;
    const int dims = bottom_blob.dims;

    // both modes start from the full extent
    _outw = w;
    _outh = h;
    _outd = d;
    _outc = channels;

    const bool numpy_style_slice = !starts.empty() && !ends.empty();
    if (!numpy_style_slice)
    {
        _woffset = woffset;
        _hoffset = hoffset;
        _doffset = doffset;
        _coffset = coffset;

        if (dims >= 1 && dims <= 4)
        {
            _outw = w - woffset - woffset2;
            if (outw != -233)
                _outw = std::min(outw, _outw);
        }
        if (dims >= 2 && dims <= 4)
        {
            _outh = h - hoffset - hoffset2;
            if (outh != -233)
                _outh = std::min(outh, _outh);
        }
        if (dims == 3 || dims == 4)
        {
            _outc = channels - coffset - coffset2;
            if (outc != -233)
                _outc = std::min(outc, _outc);
        }
        if (dims == 4)
        {
            _outd = d - doffset - doffset2;
            if (outd != -233)
                _outd = std::min(outd, _outd);
        }

        return;
    }

    _woffset = 0;
    _hoffset = 0;
    _doffset = 0;
    _coffset = 0;

    const int* starts_ptr = starts;
    const int* ends_ptr = ends;
    const int* axes_ptr = axes;

    // without explicit axes the slice entries address the leading axes in order
    const bool explicit_axes = axes.w != 0;
    int num_axis = explicit_axes ? axes.w : dims;

    // keep every index inside _axes[4] and inside the starts / ends arrays
    num_axis = std::min(num_axis, 4);
    num_axis = std::min(num_axis, std::min(starts.w, ends.w));

    int _axes[4] = {0, 1, 2, 3};
    if (explicit_axes)
    {
        for (int i = 0; i < num_axis; i++)
        {
            int axis = axes_ptr[i];
            if (axis < 0)
                axis = dims + axis;
            _axes[i] = axis;
        }
    }

    for (int i = 0; i < num_axis; i++)
    {
        const int axis = _axes[i];
        const int start = starts_ptr[i];
        const int end = ends_ptr[i];

        if (dims == 1) // axis == 0
        {
            crop_resolve_static_slice(w, start, end, _woffset, _outw);
        }
        else if (dims == 2)
        {
            if (axis == 0)
                crop_resolve_static_slice(h, start, end, _hoffset, _outh);
            else if (axis == 1)
                crop_resolve_static_slice(w, start, end, _woffset, _outw);
        }
        else if (dims == 3)
        {
            if (axis == 0)
                crop_resolve_static_slice(channels, start, end, _coffset, _outc);
            else if (axis == 1)
                crop_resolve_static_slice(h, start, end, _hoffset, _outh);
            else if (axis == 2)
                crop_resolve_static_slice(w, start, end, _woffset, _outw);
        }
        else if (dims == 4)
        {
            if (axis == 0)
                crop_resolve_static_slice(channels, start, end, _coffset, _outc);
            else if (axis == 1)
                crop_resolve_static_slice(d, start, end, _doffset, _outd);
            else if (axis == 2)
                crop_resolve_static_slice(h, start, end, _hoffset, _outh);
            else if (axis == 3)
                crop_resolve_static_slice(w, start, end, _woffset, _outw);
        }
    }
}

// Resolve the crop region from the static offsets and the shape of a
// reference blob.
//
// Offsets are taken from the layer attributes, output sizes from
// reference_blob. The channel count follows the reference only when it has
// the same rank as the input, otherwise the input channel count is kept.
// Only the outputs that belong to the input rank are written; the others are
// left untouched.
void Crop::resolve_crop_roi(const Mat& bottom_blob, const Mat& reference_blob, int& _woffset, int& _hoffset, int& _doffset, int& _coffset, int& _outw, int& _outh, int& _outd, int& _outc) const
{
    const int dims = bottom_blob.dims;

    // ranks outside 1..4 are not handled
    if (dims < 1 || dims > 4)
        return;

    // width is present for every rank
    _woffset = woffset;
    _outw = reference_blob.w;
    if (dims == 1)
        return;

    // height from rank 2 upwards
    _hoffset = hoffset;
    _outh = reference_blob.h;
    if (dims == 2)
        return;

    // channels from rank 3 upwards
    _coffset = coffset;
    if (reference_blob.dims == dims)
        _outc = reference_blob.c;
    else
        _outc = bottom_blob.c;
    if (dims == 3)
        return;

    // depth only for rank 4
    _doffset = doffset;
    _outd = reference_blob.d;
}

// Resolve the crop region from an int array.
//
// Layout of param_data:
//   dims 1..3 : [0] woffset [1] hoffset [2] coffset [3] outw [4] outh [5] outc
//               (only the entries that belong to the rank are read)
//   dims 4    : [0] woffset [1] hoffset [2] doffset [3] coffset
//               [4] outw    [5] outh    [6] outd    [7] outc
// Outputs that do not belong to the input rank are left untouched.
void Crop::resolve_crop_roi(const Mat& bottom_blob, const int* param_data, int& _woffset, int& _hoffset, int& _doffset, int& _coffset, int& _outw, int& _outh, int& _outd, int& _outc) const
{
    const int dims = bottom_blob.dims;

    if (dims == 4)
    {
        // four offsets followed by four sizes
        _woffset = param_data[0];
        _hoffset = param_data[1];
        _doffset = param_data[2];
        _coffset = param_data[3];
        _outw = param_data[4];
        _outh = param_data[5];
        _outd = param_data[6];
        _outc = param_data[7];
        return;
    }

    if (dims < 1 || dims > 3)
        return;

    // three offsets followed by three sizes, read as far as the rank goes
    _woffset = param_data[0];
    _outw = param_data[3];

    if (dims >= 2)
    {
        _hoffset = param_data[1];
        _outh = param_data[4];
    }

    if (dims == 3)
    {
        _coffset = param_data[2];
        _outc = param_data[5];
    }
}

int Crop::eval_crop_expr(const std::vector<Mat>& bottom_blobs, int& _woffset, int& _hoffset, int& _doffset, int& _coffset, int& _outw, int& _outh, int& _outd, int& _outc) const
{
    std::vector<int> _starts;
    std::vector<int> _ends;
    std::vector<int> _axes;
    int er = eval_list_expression(starts_expr, bottom_blobs, _starts);
    if (er != 0)
        return -1;

    er = eval_list_expression(ends_expr, bottom_blobs, _ends);
    if (er != 0)
        return -1;

    er = eval_list_expression(axes_expr, bottom_blobs, _axes);
    if (er != 0)
        return -1;

    // NCNN_LOGE("%d %d %d", _starts[0], _ends[0], _axes[0]);

    const Mat& bottom_blob = bottom_blobs[0];
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int channels = bottom_blob.c;
    const int dims = bottom_blob.dims;

    _woffset = 0;
    _hoffset = 0;
    _doffset = 0;
    _coffset = 0;
    _outw = w;
    _outh = h;
    _outd = d;
    _outc = channels;

    const int* starts_ptr = _starts.data();
    const int* ends_ptr = _ends.data();
    const int* axes_ptr = _axes.data();

    int _axes4[4] = {0, 1, 2, 3};
    int num_axis = (int)_axes.size();
    if (num_axis == 0)
    {
        num_axis = dims;
    }
    else
    {
        for (int i = 0; i < num_axis; i++)
        {
            int axis = axes_ptr[i];
            if (axis < 0)
                axis = dims + axis;
            _axes4[i] = axis;
        }
    }

    for (int i = 0; i < num_axis; i++)
    {
        int axis = _axes4[i];
        int start = starts_ptr[i];
        int end = ends_ptr[i];

        if (dims == 1) // axis == 0
        {
            if (start == -233) start = 0;
            if (end == -233) end = w;
            _woffset = start >= 0 ? start : w + start;
            _outw = std::min(w, end > 0 ? end : w + end) - _woffset;
        }
        if (dims == 2)
        {
            if (axis == 0)
            {
                if (start == -233) start = 0;
                if (end == -233) end = h;
                _hoffset = start >= 0 ? start : h + start;
                _outh = std::min(h, end > 0 ? end : h + end) - _hoffset;
            }
            if (axis == 1)
            {
                if (start == -233) start = 0;
                if (end == -233) end = w;
                _woffset = start >= 0 ? start : w + start;
                _outw = std::min(w, end > 0 ? end : w + end) - _woffset;
            }
        }
        if (dims == 3)
        {
            if (axis == 0)
            {
                if (start == -233) start = 0;
                if (end == -233) end = channels;
                _coffset = start >= 0 ? start : channels + start;
                _outc = std::min(channels, end > 0 ? end : channels + end) - _coffset;
            }
            if (axis == 1)
            {
                if (start == -233) start = 0;
                if (end == -233) end = h;
                _hoffset = start >= 0 ? start : h + start;
                _outh = std::min(h, end > 0 ? end : h + end) - _hoffset;
            }
            if (axis == 2)
            {
                if (start == -233) start = 0;
                if (end == -233) end = w;
                _woffset = start >= 0 ? start : w + start;
                _outw = std::min(w, end > 0 ? end : w + end) - _woffset;
            }
        }
        if (dims == 4)
        {
            if (axis == 0)
            {
                if (start == -233) start = 0;
                if (end == -233) end = channels;
                _coffset = start >= 0 ? start : channels + start;
                _outc = std::min(channels, end > 0 ? end : channels + end) - _coffset;
            }
            if (axis == 1)
            {
                if (start == -233) start = 0;
                if (end == -233) end = d;
                _doffset = start >= 0 ? start : d + start;
                _outd = std::min(d, end > 0 ? end : d + end) - _doffset;
            }
            if (axis == 2)
            {
                if (start == -233) start = 0;
                if (end == -233) end = h;
                _hoffset = start >= 0 ? start : h + start;
                _outh = std::min(h, end > 0 ? end : h + end) - _hoffset;
            }
            if (axis == 3)
            {
                if (start == -233) start = 0;
                if (end == -233) end = w;
                _woffset = start >= 0 ? start : w + start;
                _outw = std::min(w, end > 0 ? end : w + end) - _woffset;
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/crop.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CROP_H
#define LAYER_CROP_H

#include "layer.h"

namespace ncnn {

// Crop layer: extracts a sub-region of the input blob.
// The region is described in one of these ways (see crop.cpp):
//   - static offsets, output sizes and tail offsets,
//   - numpy-style starts / ends / axes arrays,
//   - starts / ends / axes expressions,
//   - a second input blob: its shape, or its int contents when woffset is -233.
class Crop : public Layer
{
public:
    Crop();

    // read the attributes below from the param dict and decide whether the
    // layer runs with one input blob or several
    virtual int load_param(const ParamDict& pd);

    // single-input crop
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // multi-input crop, bottom_blobs[0] is the blob being cropped
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // the helpers below write the resolved offsets and output sizes to the reference arguments

    // resolve from the layer attributes only
    void resolve_crop_roi(const Mat& bottom_blob, int& woffset, int& hoffset, int& doffset, int& coffset, int& outw, int& outh, int& outd, int& outc) const;
    // resolve from the static offsets and the shape of reference_blob
    void resolve_crop_roi(const Mat& bottom_blob, const Mat& reference_blob, int& woffset, int& hoffset, int& doffset, int& coffset, int& outw, int& outh, int& outd, int& outc) const;
    // resolve from an int array holding offsets followed by sizes
    void resolve_crop_roi(const Mat& bottom_blob, const int* param_data, int& woffset, int& hoffset, int& doffset, int& coffset, int& outw, int& outh, int& outd, int& outc) const;
    // resolve from the starts / ends / axes expressions, returns 0 on success and -1 on evaluation failure
    int eval_crop_expr(const std::vector<Mat>& bottom_blobs, int& woffset, int& hoffset, int& doffset, int& coffset, int& outw, int& outh, int& outd, int& outc) const;

public:
    // leading offsets, param ids 0 / 1 / 13 / 2, default 0
    // -233 = dynamic offset from reference blob
    int woffset;
    int hoffset;
    int doffset;
    int coffset;

    // output sizes, param ids 3 / 4 / 14 / 5, default 0
    // -233 = remaining
    int outw;
    int outh;
    int outd;
    int outc;

    // tail offset for cropping, ignored if reference blob is provided
    // woffset is aka left, and woffset2 is aka right
    // param ids 6 / 7 / 15 / 8, default 0
    int woffset2;
    int hoffset2;
    int doffset2;
    int coffset2;

    // numpy-style slice, param ids 9 / 10 / 11
    // if provided, all the above attributes will be ignored
    // a start or end of -233 stands for the begin / end of the axis
    Mat starts;
    Mat ends;
    Mat axes;

    // expression form of the numpy-style slice, param ids 19 / 20 / 21
    // takes precedence in forward() when both starts_expr and ends_expr are set
    // see docs/developer-guide/expression.md
    std::string starts_expr;
    std::string ends_expr;
    std::string axes_expr;
};

} // namespace ncnn

#endif // LAYER_CROP_H


================================================
FILE: src/layer/cumulativesum.cpp
================================================
// Copyright 2023 Xiaomi Corp.   (author: Fangjun Kuang)
// SPDX-License-Identifier: BSD-3-Clause

#include "cumulativesum.h"

namespace ncnn {

// Set the layer capability flags: one input blob, processed in place.
CumulativeSum::CumulativeSum()
{
    // the running sum overwrites its input, see forward_inplace()
    support_inplace = true;

    // exactly one input blob
    one_blob_only = true;
}

// Read the accumulation axis from the param dict. Always returns 0.
int CumulativeSum::load_param(const ParamDict& pd)
{
    // param id 0, defaults to 0 when absent
    const int axis_param = pd.get(0, 0);
    axis = axis_param;

    return 0;
}

// running total along a contiguous run of n floats
static void cumulativesum_scan_run(float* ptr, int n)
{
    for (int i = 1; i < n; ++i)
    {
        ptr[i] = ptr[i] + ptr[i - 1];
    }
}

// add the already accumulated previous run onto the current one
static void cumulativesum_add_run(const float* prev, float* cur, int n)
{
    for (int k = 0; k < n; ++k)
    {
        cur[k] = cur[k] + prev[k];
    }
}

// In-place cumulative sum of float data along `axis`.
//
// A negative axis is counted from the last dimension. Handled cases:
//   dims 1 : along w, axis is ignored
//   dims 2 : axis 0 accumulates down the rows, axis 1 along each row
//   dims 3 : axis 0 accumulates across channels, axis 1 down the rows of each
//            channel, axis 2 along each row of each channel
// Returns 0 for the handled cases and -100 for every other dims / axis
// combination.
int CumulativeSum::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int dims = bottom_top_blob.dims;
    const int positive_axis = axis < 0 ? dims + axis : axis;

    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int c = bottom_top_blob.c;

    if (dims == 1)
    {
        // ignore axis
        float* ptr = bottom_top_blob;
        cumulativesum_scan_run(ptr, w);

        return 0;
    }

    if (dims == 2)
    {
        if (positive_axis == 0)
        {
            // each row depends on the one above, so this stays sequential
            for (int y = 1; y < h; ++y)
            {
                const float* above = bottom_top_blob.row(y - 1);
                float* current = bottom_top_blob.row(y);

                cumulativesum_add_run(above, current, w);
            }

            return 0;
        }

        if (positive_axis == 1)
        {
            // rows are independent of each other
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; ++y)
            {
                float* current = bottom_top_blob.row(y);

                cumulativesum_scan_run(current, w);
            }

            return 0;
        }
    }

    if (dims == 3)
    {
        if (positive_axis == 0)
        {
            // each channel depends on the previous one, so this stays sequential
            const int size = w * h;

            for (int q = 1; q < c; ++q)
            {
                const float* prev = bottom_top_blob.channel(q - 1);
                float* cur = bottom_top_blob.channel(q);

                cumulativesum_add_run(prev, cur, size);
            }

            return 0;
        }

        if (positive_axis == 1)
        {
            // channels are independent, rows inside a channel are not
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < c; ++q)
            {
                Mat plane = bottom_top_blob.channel(q);

                for (int y = 1; y < h; ++y)
                {
                    const float* above = plane.row(y - 1);
                    float* current = plane.row(y);

                    cumulativesum_add_run(above, current, w);
                }
            }

            return 0;
        }

        if (positive_axis == 2)
        {
            // every row of every channel is scanned on its own
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < c; ++q)
            {
                Mat plane = bottom_top_blob.channel(q);

                for (int y = 0; y < h; ++y)
                {
                    float* current = plane.row(y);

                    cumulativesum_scan_run(current, w);
                }
            }

            return 0;
        }
    }

    // unsupported dims / axis combination
    return -100;
}

} // namespace ncnn


================================================
FILE: src/layer/cumulativesum.h
================================================
// Copyright 2023 Xiaomi Corp.   (author: Fangjun Kuang)
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CUMULATIVESUM_H
#define LAYER_CUMULATIVESUM_H

#include "layer.h"

namespace ncnn {

// CumulativeSum layer: in-place running sum of a float blob along one axis.
class CumulativeSum : public Layer
{
public:
    CumulativeSum();

    // read `axis` from the param dict
    virtual int load_param(const ParamDict& pd);

    // accumulate bottom_top_blob in place
    // handles dims 1 (axis ignored), dims 2 and dims 3
    // returns -100 for any other dims / axis combination
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
    // axis to accumulate along, param id 0, default 0
    // a negative value is counted from the last dimension (dims + axis)
    int axis;
};

} // namespace ncnn

#endif // LAYER_CUMULATIVESUM_H


================================================
FILE: src/layer/deconvolution.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "deconvolution.h"

#include "fused_activation.h"

namespace ncnn {

// Set the layer capability flags to their defaults.
Deconvolution::Deconvolution()
{
    // in-place execution is not offered by this layer
    support_inplace = false;

    // single input blob by default; load_param() clears this flag when
    // dynamic_weight is set
    one_blob_only = true;
}

// Reads every layer parameter from pd. Values that have a "height" variant
// default to their "width" counterpart when the param file omits them.
// Always returns 0.
int Deconvolution::load_param(const ParamDict& pd)
{
    // output channels and stored weight description
    num_output = pd.get(0, 0);
    bias_term = pd.get(5, 0);
    weight_data_size = pd.get(6, 0);

    // kernel geometry
    kernel_w = pd.get(1, 0);
    kernel_h = pd.get(11, kernel_w);
    dilation_w = pd.get(2, 1);
    dilation_h = pd.get(12, dilation_w);
    stride_w = pd.get(3, 1);
    stride_h = pd.get(13, stride_w);

    // border cropped from the full result
    pad_left = pd.get(4, 0);
    pad_right = pd.get(15, pad_left);
    pad_top = pd.get(14, pad_left);
    pad_bottom = pd.get(16, pad_top);

    // extra size added on the right / bottom of the full result
    output_pad_right = pd.get(18, 0);
    output_pad_bottom = pd.get(19, output_pad_right);

    // explicit target size, 0 means "not requested"
    output_w = pd.get(20, 0);
    output_h = pd.get(21, output_w);

    // fused activation
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());

    // with dynamic weights the kernel arrives as an extra input blob, so the
    // multi-blob forward() has to be used
    dynamic_weight = pd.get(28, 0);
    if (dynamic_weight)
        one_blob_only = false;

    return 0;
}

// Loads the stored weight (and bias when bias_term is set) from the model.
// Nothing is loaded for dynamic-weight layers. Returns 0 on success and -100
// when a blob could not be loaded.
int Deconvolution::load_model(const ModelBin& mb)
{
    // kernel and bias are supplied at runtime through the input blobs
    if (dynamic_weight)
        return 0;

    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    if (!bias_term)
        return 0;

    bias_data = mb.load(num_output, 1);
    return bias_data.empty() ? -100 : 0;
}

// Reference transposed 2D convolution, written in scatter form.
//
// top_blob must already be allocated with its final w/h/c. Each output channel
// is filled with its bias (0 when bias_data is empty); every input pixel then
// adds val * kernel into the window anchored at (i * stride_h, j * stride_w);
// finally the fused activation is applied element by element.
//
// weight_data is indexed as [outch][inch][kernel_h * kernel_w].
// Always returns 0.
static int deconvolution(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int outw = top_blob.w;
    const int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // offset (in elements) of every kernel tap relative to the window origin
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    for (int ky = 0; ky < kernel_h; ky++)
    {
        for (int kx = 0; kx < kernel_w; kx++)
        {
            space_ofs[ky * kernel_w + kx] = ky * outw * dilation_h + kx * dilation_w;
        }
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        // accumulation starts from the bias value
        const float bias = bias_data.empty() ? 0.f : bias_data[p];
        out.fill(bias);

        // local copies keep the number of variables shared into the
        // parallel region small
        const int in_w = bottom_blob.w;
        const int in_h = bottom_blob.h;
        const int inch = bottom_blob.c;
        const int out_size = top_blob.w * top_blob.h;

        // first kernel belonging to output channel p
        const float* kptr0 = (const float*)weight_data + maxk * inch * p;

        for (int i = 0; i < in_h; i++)
        {
            float* outrow = out.row(i * stride_h);

            for (int j = 0; j < in_w; j++)
            {
                float* outptr = outrow + j * stride_w;
                const float* kptr = kptr0;

                for (int q = 0; q < inch; q++)
                {
                    const float val = bottom_blob.channel(q).row(i)[j];

                    for (int k = 0; k < maxk; k++)
                    {
                        outptr[space_ofs[k]] += val * kptr[k];
                    }

                    kptr += maxk;
                }
            }
        }

        // fused activation over the whole channel
        float* outptr = out;
        for (int i = 0; i < out_size; i++)
        {
            outptr[i] = activation_ss(outptr[i], activation_type, activation_params);
        }
    }

    return 0;
}

// Static-weight forward pass.
//
// Computes the full transposed-convolution result and then crops it with
// cut_padding(). Returns 0 on success, -100 when an allocation fails or the
// cropped blob ends up empty, and passes through a non-zero kernel status.
int Deconvolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const size_t elemsize = bottom_blob.elemsize;

    // size of the uncropped result
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
    const int outw = (bottom_blob.w - 1) * stride_w + kernel_extent_w + output_pad_right;
    const int outh = (bottom_blob.h - 1) * stride_h + kernel_extent_h + output_pad_bottom;

    const bool has_padding = pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0;
    const bool has_output_size = output_w > 0 && output_h > 0;

    Mat top_blob_bordered;
    if (has_padding || has_output_size)
    {
        // the result is cropped afterwards, so compute it in workspace memory
        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator);
    }
    else
    {
        // no crop needed: compute straight into the output blob
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int ret = deconvolution(bottom_blob, top_blob_bordered, weight_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, dilation_w, dilation_h, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    cut_padding(top_blob_bordered, top_blob, opt);

    return top_blob.empty() ? -100 : 0;
}

int Deconvolution::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _num_input = bottom_blob.c;
    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.d * 1;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
    Mat weight_data_transposed;
    {
        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / 1, 4u, opt.workspace_allocator);
        if (weight_data_transposed.empty())
            return -100;

        const int outch_g = _num_output / 1;
        const int inch_g = _num_input / 1;
        const int maxk = _kernel_h * _kernel_w;

        for (int g = 0; g < 1; g++)
        {
            // reorder weight from inch-outch to outch-inch
            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
            for (int i = 0; i < outch_g; i++)
            {
                for (int j = 0; j < inch_g; j++)
                {
                    for (int k = 0; k < maxk; k++)
                    {
                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
                    }
                }
            }
        }
    }

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;
    }

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;

    const int kernel_extent_w = dilation_w * (_kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (_kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, _num_output, 4u, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, _num_output, 4u, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    int ret = deconvolution(bottom_blob, top_blob_bordered, weight_data_transposed, bias_data_flattened, _kernel_w, _kernel_h, stride_w, stride_h, dilation_w, dilation_h, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

// Crops the full result into top_blob.
//
// - any positive pad: remove exactly the configured border
// - output_w/output_h set: crop down to that size, splitting the excess
//   according to the -233 / -234 pad markers; with neither marker present
//   top_blob is left untouched
// - otherwise: top_blob simply references top_blob_bordered
void Deconvolution::cut_padding(const Mat& top_blob_bordered, Mat& top_blob, const Option& opt) const
{
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
    {
        copy_cut_border(top_blob_bordered, top_blob, pad_top, pad_bottom, pad_left, pad_right, opt);
        return;
    }

    if (!(output_w > 0 && output_h > 0))
    {
        top_blob = top_blob_bordered;
        return;
    }

    const int wcut = top_blob_bordered.w - output_w;
    const int hcut = top_blob_bordered.h - output_h;
    const int wcut_half = wcut / 2;
    const int hcut_half = hcut / 2;

    if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233)
    {
        // onnx padding=SAME_UPPER
        copy_cut_border(top_blob_bordered, top_blob, hcut_half, hcut - hcut_half, wcut_half, wcut - wcut_half, opt);
    }
    else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234)
    {
        // onnx padding=SAME_LOWER
        copy_cut_border(top_blob_bordered, top_blob, hcut - hcut_half, hcut_half, wcut - wcut_half, wcut_half, opt);
    }
}

} // namespace ncnn


================================================
FILE: src/layer/deconvolution.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DECONVOLUTION_H
#define LAYER_DECONVOLUTION_H

#include "layer.h"

namespace ncnn {

// 2D transposed convolution layer (reference implementation).
class Deconvolution : public Layer
{
public:
    Deconvolution();

    // Reads the parameters below from the param dictionary; the ids are noted
    // next to each field.
    virtual int load_param(const ParamDict& pd);

    // Loads weight_data / bias_data unless dynamic_weight is set.
    virtual int load_model(const ModelBin& mb);

    // Forward pass using the weights stored in the layer.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Forward pass for dynamic weights: bottom_blobs[0] is the input,
    // bottom_blobs[1] the weight and, when bias_term is set, bottom_blobs[2]
    // the bias.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // Crops the full result according to the pad_* / output_* settings.
    void cut_padding(const Mat& top_blob_bordered, Mat& top_blob, const Option& opt) const;

public:
    // param
    int num_output; // id 0, number of output channels
    int kernel_w;   // id 1
    int kernel_h;   // id 11, defaults to kernel_w
    int dilation_w; // id 2, default 1
    int dilation_h; // id 12, defaults to dilation_w
    int stride_w;   // id 3, default 1
    int stride_h;   // id 13, defaults to stride_w
    // Border removed from the full result. The values -233 / -234 are markers
    // for onnx SAME_UPPER / SAME_LOWER and are used together with output_w/h.
    int pad_left;   // id 4
    int pad_right;  // id 15, defaults to pad_left
    int pad_top;    // id 14, defaults to pad_left
    int pad_bottom; // id 16, defaults to pad_top
    // Extra size added to the full result.
    int output_pad_right;  // id 18
    int output_pad_bottom; // id 19, defaults to output_pad_right
    // Requested output size, only honoured when both values are > 0.
    int output_w; // id 20
    int output_h; // id 21, defaults to output_w
    int bias_term; // id 5, non-zero when a bias is present

    int weight_data_size; // id 6, element count passed to the weight loader

    // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid
    int activation_type;   // id 9
    Mat activation_params; // id 10

    // id 28, non-zero when weight/bias are passed as input blobs
    int dynamic_weight;

    // model
    Mat weight_data; // indexed as outch-inch-kh-kw by the forward pass
    Mat bias_data;   // num_output values, loaded only when bias_term is set
};

} // namespace ncnn

#endif // LAYER_DECONVOLUTION_H


================================================
FILE: src/layer/deconvolution1d.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "deconvolution1d.h"

#include "fused_activation.h"

namespace ncnn {

// Defaults: the single-blob forward() is used and the layer never runs in
// place. load_param() clears one_blob_only again when dynamic_weight is set.
Deconvolution1D::Deconvolution1D()
{
    support_inplace = false;
    one_blob_only = true;
}

// Reads every layer parameter from pd. Always returns 0.
int Deconvolution1D::load_param(const ParamDict& pd)
{
    // output channels and stored weight description
    num_output = pd.get(0, 0);
    bias_term = pd.get(5, 0);
    weight_data_size = pd.get(6, 0);

    // kernel geometry
    kernel_w = pd.get(1, 0);
    dilation_w = pd.get(2, 1);
    stride_w = pd.get(3, 1);

    // border cropped from the full result; right defaults to left
    pad_left = pd.get(4, 0);
    pad_right = pd.get(15, pad_left);

    // extra size added on the right and explicit target width (0 = unset)
    output_pad_right = pd.get(18, 0);
    output_w = pd.get(20, 0);

    // fused activation
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());

    // with dynamic weights the kernel arrives as an extra input blob, so the
    // multi-blob forward() has to be used
    dynamic_weight = pd.get(28, 0);
    if (dynamic_weight)
        one_blob_only = false;

    return 0;
}

// Loads the stored weight (and bias when bias_term is set) from the model.
// Nothing is loaded for dynamic-weight layers. Returns 0 on success and -100
// when a blob could not be loaded.
int Deconvolution1D::load_model(const ModelBin& mb)
{
    // kernel and bias are supplied at runtime through the input blobs
    if (dynamic_weight)
        return 0;

    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    if (!bias_term)
        return 0;

    bias_data = mb.load(num_output, 1);
    return bias_data.empty() ? -100 : 0;
}

// Reference transposed 1D convolution, written in scatter form.
//
// Rows of the blobs act as channels: bottom_blob has h input channels of
// width w, top_blob must already be allocated with one row per output
// channel. Each output row is filled with its bias (0 when bias_data is
// empty), every input sample adds val * kernel starting at j * stride_w, and
// the fused activation is applied last.
//
// weight_data is indexed as [outch][inch][kernel_w]. Always returns 0.
static int deconvolution1d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int stride_w, int dilation_w, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int in_w = bottom_blob.w;
    const int in_h = bottom_blob.h;

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    const int bias_term = bias_data.empty() ? 0 : 1;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outh; p++)
    {
        Mat out = top_blob.row_range(p, 1);

        // accumulation starts from the bias value
        const float bias = bias_term ? bias_data[p] : 0.f;
        out.fill(bias);

        float* outrow = (float*)out;

        // first kernel belonging to output channel p
        const float* kptr0 = (const float*)weight_data + kernel_w * in_h * p;

        for (int j = 0; j < in_w; j++)
        {
            float* outptr = outrow + j * stride_w;
            const float* kptr = kptr0;

            for (int q = 0; q < in_h; q++)
            {
                const float val = bottom_blob.row(q)[j];

                for (int k = 0; k < kernel_w; k++)
                {
                    outptr[k * dilation_w] += val * kptr[k];
                }

                kptr += kernel_w;
            }
        }

        // fused activation over the whole row
        for (int i = 0; i < outw; i++)
        {
            outrow[i] = activation_ss(outrow[i], activation_type, activation_params);
        }
    }

    return 0;
}

// Static-weight forward pass.
//
// Computes the full transposed-convolution result and then crops it with
// cut_padding(). Returns 0 on success, -100 when an allocation fails or the
// cropped blob ends up empty, and passes through a non-zero kernel status.
int Deconvolution1D::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const size_t elemsize = bottom_blob.elemsize;

    // width of the uncropped result
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int outw = (bottom_blob.w - 1) * stride_w + kernel_extent_w + output_pad_right;

    const bool needs_crop = pad_left > 0 || pad_right > 0 || output_w > 0;

    Mat top_blob_bordered;
    if (needs_crop)
    {
        // the result is cropped afterwards, so compute it in workspace memory
        top_blob_bordered.create(outw, num_output, elemsize, opt.workspace_allocator);
    }
    else
    {
        // no crop needed: compute straight into the output blob
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, num_output, elemsize, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int ret = deconvolution1d(bottom_blob, top_blob_bordered, weight_data, bias_data, kernel_w, stride_w, dilation_w, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    cut_padding(top_blob_bordered, top_blob, opt);

    return top_blob.empty() ? -100 : 0;
}

int Deconvolution1D::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _num_input = bottom_blob.h;
    const int _kernel_w = _weight_data.w;
    const int _num_output = _weight_data.h * 1;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    // transpose group-inch/group-outch/group-kw to group-outch/group-inch/group-kw
    Mat weight_data_transposed;
    {
        weight_data_transposed.create(_kernel_w * _num_output * _num_input / 1, 4u, opt.workspace_allocator);
        if (weight_data_transposed.empty())
            return -100;

        const int outch_g = _num_output / 1;
        const int inch_g = _num_input / 1;
        const int maxk = _kernel_w;

        for (int g = 0; g < 1; g++)
        {
            // reorder weight from inch-outch to outch-inch
            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
            for (int i = 0; i < outch_g; i++)
            {
                for (int j = 0; j < inch_g; j++)
                {
                    for (int k = 0; k < maxk; k++)
                    {
                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
                    }
                }
            }
        }
    }

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;
    }

    const int w = bottom_blob.w;

    const int kernel_extent_w = dilation_w * (_kernel_w - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || output_w > 0)
    {
        top_blob_bordered.create(outw, _num_output, 4u, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, _num_output, 4u, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    int ret = deconvolution1d(bottom_blob, top_blob_bordered, weight_data_transposed, bias_data_flattened, _kernel_w, stride_w, dilation_w, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

// Crops the full result into top_blob.
//
// - any positive pad: remove exactly the configured left/right border
// - output_w set: crop down to that width, splitting the excess according to
//   the -233 / -234 pad markers; with neither marker present top_blob is left
//   untouched
// - otherwise: top_blob simply references top_blob_bordered
void Deconvolution1D::cut_padding(const Mat& top_blob_bordered, Mat& top_blob, const Option& opt) const
{
    if (pad_left > 0 || pad_right > 0)
    {
        copy_cut_border(top_blob_bordered, top_blob, 0, 0, pad_left, pad_right, opt);
        return;
    }

    if (!(output_w > 0))
    {
        top_blob = top_blob_bordered;
        return;
    }

    const int wcut = top_blob_bordered.w - output_w;
    const int wcut_half = wcut / 2;

    if (pad_left == -233 || pad_right == -233)
    {
        // onnx padding=SAME_UPPER
        copy_cut_border(top_blob_bordered, top_blob, 0, 0, wcut_half, wcut - wcut_half, opt);
    }
    else if (pad_left == -234 || pad_right == -234)
    {
        // onnx padding=SAME_LOWER
        copy_cut_border(top_blob_bordered, top_blob, 0, 0, wcut - wcut_half, wcut_half, opt);
    }
}

} // namespace ncnn


================================================
FILE: src/layer/deconvolution1d.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DECONVOLUTION1D_H
#define LAYER_DECONVOLUTION1D_H

#include "layer.h"

namespace ncnn {

// 1D transposed convolution layer (reference implementation). Blob rows act
// as channels.
class Deconvolution1D : public Layer
{
public:
    Deconvolution1D();

    // Reads the parameters below from the param dictionary; the ids are noted
    // next to each field.
    virtual int load_param(const ParamDict& pd);

    // Loads weight_data / bias_data unless dynamic_weight is set.
    virtual int load_model(const ModelBin& mb);

    // Forward pass using the weights stored in the layer.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Forward pass for dynamic weights: bottom_blobs[0] is the input,
    // bottom_blobs[1] the weight and, when bias_term is set, bottom_blobs[2]
    // the bias.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // Crops the full result according to the pad_* / output_w settings.
    void cut_padding(const Mat& top_blob_bordered, Mat& top_blob, const Option& opt) const;

public:
    // param
    int num_output; // id 0, number of output channels (rows)
    int kernel_w;   // id 1
    int dilation_w; // id 2, default 1
    int stride_w;   // id 3, default 1
    // Border removed from the full result. The values -233 / -234 are markers
    // for onnx SAME_UPPER / SAME_LOWER and are used together with output_w.
    int pad_left;  // id 4
    int pad_right; // id 15, defaults to pad_left
    int output_pad_right; // id 18, extra width added to the full result
    int output_w;         // id 20, requested output width, honoured when > 0
    int bias_term;        // id 5, non-zero when a bias is present

    int weight_data_size; // id 6, element count passed to the weight loader

    int activation_type;   // id 9, forwarded to activation_ss
    Mat activation_params; // id 10

    // id 28, non-zero when weight/bias are passed as input blobs
    int dynamic_weight;

    // model
    Mat weight_data; // indexed as outch-inch-kw by the forward pass
    Mat bias_data;   // num_output values, loaded only when bias_term is set
};

} // namespace ncnn

#endif // LAYER_DECONVOLUTION1D_H


================================================
FILE: src/layer/deconvolution3d.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "deconvolution3d.h"

#include "fused_activation.h"

namespace ncnn {

// Defaults: the single-blob forward() is used and the layer never runs in
// place.
Deconvolution3D::Deconvolution3D()
{
    support_inplace = false;
    one_blob_only = true;
}

int Deconvolution3D::load_param(const ParamDict& pd)
{
    num_output = pd.get(0, 0);
    kernel_w = pd.get(1, 0);
    kernel_h = pd.get(11, kernel_w);
    kernel_d = pd.get(21, kernel_w);
    dilation_w = pd.get(2, 1);
    dilation_h = pd.get(12, dilation_w);
    dilation_d = pd.get(22, dilation_w);
    stride_w = pd.get(3, 1);
    stride_h = pd.get(13, stride_w);
    stride_d = pd.get(23, stride_w);
    pad_left = pd.get(4, 0);
    pad_right = pd.get(15, pad_left);
    pad_top = pd.get(14, pad_left);
    pad_bottom = pd.get(16, pad_top);
    pad_front = pd.get(24, pad_left);
    pad_behind = pd.get(17, pad_front);
    output_pad_right = pd.get(18, 0);
    output_pad_bottom = pd.get(19, output_pad_right);
    output_pad_behind = pd.get(20, output_pad_right);
    output_w = pd.get(25, 0);
    output_h = pd.get(26, output_w);
    output_d = pd.get(27, output_w);
    bias_term = pd.get(5, 0);
    weight_data_size = pd.get(6, 0);
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());

    return 0;
}

// Loads the stored weight (and bias when bias_term is set) from the model.
// Returns 0 on success and -100 when a blob could not be loaded.
int Deconvolution3D::load_model(const ModelBin& mb)
{
    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    if (!bias_term)
        return 0;

    bias_data = mb.load(num_output, 1);
    return bias_data.empty() ? -100 : 0;
}

// Reference transposed 3D convolution, written in scatter form.
//
// top_blob must already be allocated with its final w/h/d/c. Each output
// channel is filled with its bias (0 when bias_data is empty); every input
// voxel then adds val * kernel into the window anchored at
// (z * stride_d, i * stride_h, j * stride_w); finally the fused activation is
// applied element by element.
//
// weight_data is indexed as [outch][inch][kernel_d * kernel_h * kernel_w].
// Always returns 0.
static int deconvolution3d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int kernel_d, int stride_w, int stride_h, int stride_d, int dilation_w, int dilation_h, int dilation_d, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h * kernel_d;

    // offset (in elements) of every kernel tap relative to the window origin
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    for (int kz = 0; kz < kernel_d; kz++)
    {
        for (int ky = 0; ky < kernel_h; ky++)
        {
            for (int kx = 0; kx < kernel_w; kx++)
            {
                space_ofs[(kz * kernel_h + ky) * kernel_w + kx] = kz * outh * outw * dilation_d + ky * outw * dilation_h + kx * dilation_w;
            }
        }
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        // accumulation starts from the bias value
        const float bias = bias_data.empty() ? 0.f : bias_data[p];
        out.fill(bias);

        // local copies keep the number of variables shared into the
        // parallel region small
        const int in_w = bottom_blob.w;
        const int in_h = bottom_blob.h;
        const int in_d = bottom_blob.d;
        const int inch = bottom_blob.c;
        const int out_size = top_blob.w * top_blob.h * top_blob.d;

        // first kernel belonging to output channel p
        const float* kptr0 = (const float*)weight_data + maxk * inch * p;

        for (int z = 0; z < in_d; z++)
        {
            for (int i = 0; i < in_h; i++)
            {
                float* outrow = out.depth(z * stride_d).row(i * stride_h);

                for (int j = 0; j < in_w; j++)
                {
                    float* outptr = outrow + j * stride_w;
                    const float* kptr = kptr0;

                    for (int q = 0; q < inch; q++)
                    {
                        const float val = bottom_blob.channel(q).depth(z).row(i)[j];

                        for (int k = 0; k < maxk; k++)
                        {
                            outptr[space_ofs[k]] += val * kptr[k];
                        }

                        kptr += maxk;
                    }
                }
            }
        }

        // fused activation over the whole channel
        float* outptr = out;
        for (int i = 0; i < out_size; i++)
        {
            outptr[i] = activation_ss(outptr[i], activation_type, activation_params);
        }
    }

    return 0;
}

int Deconvolution3D::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    size_t elemsize = bottom_blob.elemsize;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
    const int kernel_extent_d = dilation_d * (kernel_d - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
    int outd = (d - 1) * stride_d + kernel_extent_d + output_pad_behind;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || pad_front > 0 || pad_behind > 0 || (output_w > 0 && output_h > 0 && output_d > 0))
    {
        top_blob_bordered.create(outw, outh, outd, num_output, elemsize, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, outd, num_output, elemsize, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    int ret = deconvolution3d(bottom_blob, top_blob_bordered, weight_data, bias_data, kernel_w, kernel_h, kernel_d, stride_w, stride_h, stride_d, dilation_w, dilation_h, dilation_d, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

// Crops the full result into top_blob.
//
// - any positive pad: remove exactly the configured border
// - output_w/output_h/output_d set: crop down to that size, splitting the
//   excess according to the -233 / -234 pad markers; with neither marker
//   present top_blob is left untouched
// - otherwise: top_blob simply references top_blob_bordered
void Deconvolution3D::cut_padding(const Mat& top_blob_bordered, Mat& top_blob, const Option& opt) const
{
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || pad_front > 0 || pad_behind > 0)
    {
        copy_cut_border_3d(top_blob_bordered, top_blob, pad_top, pad_bottom, pad_left, pad_right, pad_front, pad_behind, opt);
        return;
    }

    if (!(output_w > 0 && output_h > 0 && output_d > 0))
    {
        top_blob = top_blob_bordered;
        return;
    }

    const int wcut = top_blob_bordered.w - output_w;
    const int hcut = top_blob_bordered.h - output_h;
    const int dcut = top_blob_bordered.d - output_d;
    const int wcut_half = wcut / 2;
    const int hcut_half = hcut / 2;
    const int dcut_half = dcut / 2;

    if (pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233 || pad_front == -233 || pad_behind == -233)
    {
        // onnx padding=SAME_UPPER
        copy_cut_border_3d(top_blob_bordered, top_blob, hcut_half, hcut - hcut_half, wcut_half, wcut - wcut_half, dcut_half, dcut - dcut_half, opt);
    }
    else if (pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234 || pad_front == -234 || pad_behind == -234)
    {
        // onnx padding=SAME_LOWER
        copy_cut_border_3d(top_blob_bordered, top_blob, hcut - hcut_half, hcut_half, wcut - wcut_half, wcut_half, dcut - dcut_half, dcut_half, opt);
    }
}

} // namespace ncnn


================================================
FILE: src/layer/deconvolution3d.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DECONVOLUTION3D_H
#define LAYER_DECONVOLUTION3D_H

#include "layer.h"

namespace ncnn {

// 3D transposed convolution layer (reference implementation).
class Deconvolution3D : public Layer
{
public:
    Deconvolution3D();

    // Reads the parameters below from the param dictionary; the ids are noted
    // next to each field.
    virtual int load_param(const ParamDict& pd);

    // Loads weight_data and, when bias_term is set, bias_data.
    virtual int load_model(const ModelBin& mb);

    // Forward pass using the weights stored in the layer.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
    // Crops the full result according to the pad_* / output_* settings.
    void cut_padding(const Mat& top_blob_bordered, Mat& top_blob, const Option& opt) const;

public:
    // param
    int num_output; // id 0, number of output channels
    int kernel_w;   // id 1
    int kernel_h;   // id 11, defaults to kernel_w
    int kernel_d;   // id 21, defaults to kernel_w
    int dilation_w; // id 2, default 1
    int dilation_h; // id 12, defaults to dilation_w
    int dilation_d; // id 22, defaults to dilation_w
    int stride_w;   // id 3, default 1
    int stride_h;   // id 13, defaults to stride_w
    int stride_d;   // id 23, defaults to stride_w
    // Border removed from the full result. The values -233 / -234 are markers
    // for onnx SAME_UPPER / SAME_LOWER and are used together with output_w/h/d.
    int pad_left;   // id 4
    int pad_right;  // id 15, defaults to pad_left
    int pad_top;    // id 14, defaults to pad_left
    int pad_bottom; // id 16, defaults to pad_top
    int pad_front;  // id 24, defaults to pad_left
    int pad_behind; // id 17, defaults to pad_front
    // Extra size added to the full result.
    int output_pad_right;  // id 18
    int output_pad_bottom; // id 19, defaults to output_pad_right
    int output_pad_behind; // id 20, defaults to output_pad_right
    // Requested output size, only honoured when all three values are > 0.
    int output_w; // id 25
    int output_h; // id 26, defaults to output_w
    int output_d; // id 27, defaults to output_w
    int bias_term; // id 5, non-zero when a bias is present

    int weight_data_size; // id 6, element count passed to the weight loader

    int activation_type;   // id 9, forwarded to activation_ss
    Mat activation_params; // id 10

    // model
    Mat weight_data; // indexed as outch-inch-kd-kh-kw by the forward pass
    Mat bias_data;   // num_output values, loaded only when bias_term is set
};

} // namespace ncnn

#endif // LAYER_DECONVOLUTION3D_H


================================================
FILE: src/layer/deconvolutiondepthwise.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "deconvolutiondepthwise.h"

#include "fused_activation.h"

namespace ncnn {

// Defaults: the single-blob forward() is used and the layer never runs in
// place. load_param() clears one_blob_only again when dynamic_weight is set.
DeconvolutionDepthWise::DeconvolutionDepthWise()
{
    support_inplace = false;
    one_blob_only = true;
}

// Reads every layer parameter from pd. Values that have a "height" variant
// default to their "width" counterpart when the param file omits them.
// Always returns 0.
int DeconvolutionDepthWise::load_param(const ParamDict& pd)
{
    // output channels, grouping and stored weight description
    num_output = pd.get(0, 0);
    group = pd.get(7, 1);
    bias_term = pd.get(5, 0);
    weight_data_size = pd.get(6, 0);

    // kernel geometry
    kernel_w = pd.get(1, 0);
    kernel_h = pd.get(11, kernel_w);
    dilation_w = pd.get(2, 1);
    dilation_h = pd.get(12, dilation_w);
    stride_w = pd.get(3, 1);
    stride_h = pd.get(13, stride_w);

    // border cropped from the full result
    pad_left = pd.get(4, 0);
    pad_right = pd.get(15, pad_left);
    pad_top = pd.get(14, pad_left);
    pad_bottom = pd.get(16, pad_top);

    // extra size added on the right / bottom of the full result
    output_pad_right = pd.get(18, 0);
    output_pad_bottom = pd.get(19, output_pad_right);

    // explicit target size, 0 when absent from the param file
    output_w = pd.get(20, 0);
    output_h = pd.get(21, output_w);

    // fused activation
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());

    // a dynamic-weight layer is no longer a one-blob layer
    dynamic_weight = pd.get(28, 0);
    if (dynamic_weight)
        one_blob_only = false;

    return 0;
}

// Loads the stored weight (and bias when bias_term is set) from the model.
// Nothing is loaded for dynamic-weight layers. Returns 0 on success and -100
// when a blob could not be loaded.
int DeconvolutionDepthWise::load_model(const ModelBin& mb)
{
    // dynamic-weight layers keep nothing in the model file
    if (dynamic_weight)
        return 0;

    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    if (!bias_term)
        return 0;

    bias_data = mb.load(num_output, 1);
    return bias_data.empty() ? -100 : 0;
}

// Reference grouped / depth-wise transposed 2D convolution, in scatter form.
//
// top_blob must already be allocated with its final w/h/c. Every output
// channel is filled with its bias (0 when bias_data is empty), each input
// pixel then adds val * kernel into the window anchored at
// (i * stride_h, j * stride_w), and the fused activation is applied last.
//
// Two paths:
// - inch == group == outch: one kernel of kernel_w * kernel_h taps per channel
// - otherwise: weight_data is indexed as [group][outch_g][inch_g][maxk]
//
// Always returns 0.
static int deconvolutiondepthwise(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int stride_w, int stride_h, int dilation_w, int dilation_h, int group, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets: element offset of each tap relative to the window origin
    // inside an output channel of width outw
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        // jump from the end of one kernel row to the start of the next one
        int gap = outw * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // depth-wise: every channel is its own group
    if (inch == group && group == outch)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            const float* inptr = bottom_blob.channel(g);
            // the single kernel belonging to channel g
            const float* kptr = (const float*)weight_data + maxk * g;
            Mat out = top_blob.channel(g);

            const float bias = bias_data.empty() ? 0.f : bias_data[g];

            // accumulation starts from the bias value
            out.fill(bias);

            // shadowed variable for less openmp task args
            const int w = bottom_blob.w;
            const int h = bottom_blob.h;
            const int outw = top_blob.w;
            const int outh = top_blob.h;

            for (int i = 0; i < h; i++)
            {
                for (int j = 0; j < w; j++)
                {
                    float* outptr = out.row(i * stride_h) + j * stride_w;

                    // the input channel is read as a flat h * w array
                    const float val = inptr[i * w + j];

                    for (int k = 0; k < maxk; k++)
                    {
                        float w = kptr[k];
                        outptr[space_ofs[k]] += val * w;
                    }
                }
            }

            // fused activation over the whole channel
            {
                float* outptr = out;
                int size = outw * outh;

                for (int i = 0; i < size; i++)
                {
                    outptr[i] = activation_ss(outptr[i], activation_type, activation_params);
                }
            }
        }
    }
    else
    {
        // general grouped case: channels per group on each side
        const int inch_g = inch / group;
        const int outch_g = outch / group;

        // NOTE(review): presumably collapse(2) is left out on Windows because
        // the OpenMP runtime there does not support it -- confirm.
#ifdef _WIN32
        #pragma omp parallel for num_threads(opt.num_threads)
#else
        #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
#endif
        for (int g = 0; g < group; g++)
        {
            for (int p = 0; p < outch_g; p++)
            {
                Mat out = top_blob.channel(g * outch_g + p);

                // first kernel of group g
                const float* weight_data_ptr = (const float*)weight_data + maxk * inch_g * outch_g * g;

                const float bias = bias_data.empty() ? 0.f : bias_data[g * outch_g + p];

                // accumulation starts from the bias value
                out.fill(bias);

                // shadowed variable for less openmp task args
                const int w = bottom_blob.w;
                const int h = bottom_blob.h;
                const int outw = top_blob.w;
                const int outh = top_blob.h;

                for (int i = 0; i < h; i++)
                {
                    for (int j = 0; j < w; j++)
                    {
                        float* outptr = out.row(i * stride_h) + j * stride_w;

                        // kernels of output channel p within the group
                        const float* kptr = weight_data_ptr + maxk * inch_g * p;

                        for (int q = 0; q < inch_g; q++)
                        {
                            const float val = bottom_blob.channel(inch_g * g + q).row(i)[j];

                            for (int k = 0; k < maxk; k++)
                            {
                                outptr[space_ofs[k]] += val * kptr[k];
                            }

                            kptr += maxk;
                        }
                    }
                }

                // fused activation over the whole channel
                {
                    float* outptr = out;
                    int size = outw * outh;

                    for (int i = 0; i < size; i++)
                    {
                        outptr[i] = activation_ss(outptr[i], activation_type, activation_params);
                    }
                }
            }
        }
    }

    return 0;
}

int DeconvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output, elemsize, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    int ret = deconvolutiondepthwise(bottom_blob, top_blob_bordered, weight_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, dilation_w, dilation_h, group, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

int DeconvolutionDepthWise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _num_input = bottom_blob.c;
    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.d * group;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
    Mat weight_data_transposed;
    {
        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / group, 4u, opt.workspace_allocator);
        if (weight_data_transposed.empty())
            return -100;

        const int outch_g = _num_output / group;
        const int inch_g = _num_input / group;
        const int maxk = _kernel_h * _kernel_w;

        for (int g = 0; g < group; g++)
        {
            // reorder weight from inch-outch to outch-inch
            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
            for (int i = 0; i < outch_g; i++)
            {
                for (int j = 0; j < inch_g; j++)
                {
                    for (int k = 0; k < maxk; k++)
                    {
                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
                    }
                }
            }
        }
    }

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;
    }

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;

    const int kernel_extent_w = dilation_w * (_kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (_kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, _num_output, 4u, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, _num_output, 4u, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    int ret = deconvolutiondepthwise(bottom_blob, top_blob_bordered, weight_data_transposed, bias_data_flattened, _kernel_w, _kernel_h, stride_w, stride_h, dilation_w, dilation_h, group, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

// Produces top_blob from the un-cropped result:
//  - any positive pad_* value: crop by the explicit pad amounts;
//  - otherwise, with output_w and output_h both positive: crop to that size,
//    split according to the -233 / -234 markers stored in the pad fields;
//  - otherwise: top_blob simply becomes top_blob_bordered.
void DeconvolutionDepthWise::cut_padding(const Mat& top_blob_bordered, Mat& top_blob, const Option& opt) const
{
    const bool explicit_pad = pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0;
    if (explicit_pad)
    {
        copy_cut_border(top_blob_bordered, top_blob, pad_top, pad_bottom, pad_left, pad_right, opt);
        return;
    }

    const bool fixed_output = output_w > 0 && output_h > 0;
    if (!fixed_output)
    {
        top_blob = top_blob_bordered;
        return;
    }

    // total amount to remove along each axis
    const int wcut = top_blob_bordered.w - output_w;
    const int hcut = top_blob_bordered.h - output_h;

    const int w_small = wcut / 2;
    const int w_large = wcut - wcut / 2;
    const int h_small = hcut / 2;
    const int h_large = hcut - hcut / 2;

    const bool same_upper = pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233;
    const bool same_lower = pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234;

    if (same_upper)
    {
        // onnx padding=SAME_UPPER
        copy_cut_border(top_blob_bordered, top_blob, h_small, h_large, w_small, w_large, opt);
    }
    else if (same_lower)
    {
        // onnx padding=SAME_LOWER
        copy_cut_border(top_blob_bordered, top_blob, h_large, h_small, w_large, w_small, opt);
    }
    // with neither marker present top_blob is left untouched
}

} // namespace ncnn


================================================
FILE: src/layer/deconvolutiondepthwise.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DECONVOLUTIONDEPTHWISE_H
#define LAYER_DECONVOLUTIONDEPTHWISE_H

#include "layer.h"

namespace ncnn {

// Grouped 2D transposed-convolution layer.
// Two forward() overloads exist: the single-blob one runs with the weight_data /
// bias_data members, the multi-blob one reads the weight from bottom_blobs[1]
// and, when bias_term is set, the bias from bottom_blobs[2].
class DeconvolutionDepthWise : public Layer
{
public:
    DeconvolutionDepthWise();

    // NOTE(review): presumably fills the "param" fields below from pd - confirm
    // against the definition.
    virtual int load_param(const ParamDict& pd);

    // NOTE(review): presumably loads weight_data / bias_data from mb - confirm
    // against the definition.
    virtual int load_model(const ModelBin& mb);

    // Runs the layer with the stored weights; returns 0 on success and -100 when
    // an output blob could not be produced.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Runs the layer with weight (and optional bias) supplied as input blobs;
    // same return convention as the single-blob overload.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // Crops the un-cropped result into top_blob according to pad_* / output_w / output_h.
    void cut_padding(const Mat& top_blob_bordered, Mat& top_blob, const Option& opt) const;

public:
    // param
    int num_output; // channel count of the output blob created by forward()
    int kernel_w;
    int kernel_h;
    int dilation_w;
    int dilation_h;
    int stride_w;
    int stride_h;
    // Positive values are cropped from the result; -233 / -234 are treated by
    // cut_padding() as the onnx SAME_UPPER / SAME_LOWER markers.
    int pad_left;
    int pad_right;
    int pad_top;
    int pad_bottom;
    // added to the computed output width / height in forward()
    int output_pad_right;
    int output_pad_bottom;
    // when both are positive (and no pad_* is positive) the result is cropped to this size
    int output_w;
    int output_h;
    int bias_term; // non-zero: the multi-blob forward reads a bias from bottom_blobs[2]

    int weight_data_size;
    int group;

    // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid
    int activation_type;
    Mat activation_params;

    // NOTE(review): presumably selects the multi-blob forward where the weight
    // arrives as an input - confirm in load_param.
    int dynamic_weight;

    // model
    Mat weight_data;
    Mat bias_data;
};

} // namespace ncnn

#endif // LAYER_DECONVOLUTIONDEPTHWISE_H


================================================
FILE: src/layer/deconvolutiondepthwise1d.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "deconvolutiondepthwise1d.h"

#include "fused_activation.h"

namespace ncnn {

// Starts as a single-input layer without in-place support; load_param() clears
// one_blob_only again when dynamic_weight is requested.
DeconvolutionDepthWise1D::DeconvolutionDepthWise1D()
{
    support_inplace = false;
    one_blob_only = true;
}

// Reads every layer parameter from pd by numeric id. Always returns 0.
int DeconvolutionDepthWise1D::load_param(const ParamDict& pd)
{
    // output channels, weight blob description and grouping
    num_output = pd.get(0, 0);
    weight_data_size = pd.get(6, 0);
    bias_term = pd.get(5, 0);
    group = pd.get(7, 1);

    // kernel geometry
    kernel_w = pd.get(1, 0);
    dilation_w = pd.get(2, 1);
    stride_w = pd.get(3, 1);

    // cropping; pad_left is passed as the fallback value for pad_right
    pad_left = pd.get(4, 0);
    pad_right = pd.get(15, pad_left);
    output_pad_right = pd.get(18, 0);
    output_w = pd.get(20, 0);

    // fused activation
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());

    // with dynamic weights the layer takes more than one input blob
    dynamic_weight = pd.get(28, 0);
    if (dynamic_weight != 0)
        one_blob_only = false;

    return 0;
}

// Loads weight_data and, when bias_term is set, bias_data from mb.
// Nothing is loaded when dynamic_weight is set.
// Returns 0 on success, -100 when a requested blob comes back empty.
int DeconvolutionDepthWise1D::load_model(const ModelBin& mb)
{
    if (!dynamic_weight)
    {
        weight_data = mb.load(weight_data_size, 0);
        if (weight_data.empty())
            return -100;

        if (bias_term)
        {
            bias_data = mb.load(num_output, 1);
            if (bias_data.empty())
                return -100;
        }
    }

    return 0;
}

// Grouped 1D transposed convolution on float data.
//
// Each row of bottom_blob is treated as one input channel and each row of
// top_blob as one output channel; top_blob must already be allocated by the
// caller. Every output row is first filled with its bias (0 when bias_data is
// empty), then each input sample scatters val * weight into kernel_w positions
// spaced dilation_w apart starting at j * stride_w, and finally the fused
// activation is applied to the row in place.
//
// weight_data is read as a flat float array:
//   depth-wise branch : kernel_w * g + k
//   grouped branch    : kernel_w * h_g * outh_g * g + kernel_w * h_g * p + kernel_w * q + k
//
// Always returns 0.
static int deconvolutiondepthwise1d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int stride_w, int dilation_w, int group, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    const int bias_term = bias_data.empty() ? 0 : 1;

    // depth-wise
    if (h == group && group == outh)
    {
        // one input row, one kernel and one output row per group
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            Mat out = top_blob.row_range(g, 1);

            const float* inptr = bottom_blob.row(g);
            const float* kptr = (const float*)weight_data + kernel_w * g;

            const float bias = bias_term ? bias_data[g] : 0.f;

            out.fill(bias);

            for (int j = 0; j < w; j++)
            {
                float* outptr = (float*)out + j * stride_w;

                const float val = inptr[j];

                for (int k = 0; k < kernel_w; k++)
                {
                    // note: this float w shadows the int w (input width) inside the loop body only
                    float w = kptr[k];
                    outptr[k * dilation_w] += val * w;
                }
            }

            // fused activation over the finished row
            {
                float* outptr = out;

                for (int i = 0; i < outw; i++)
                {
                    outptr[i] = activation_ss(outptr[i], activation_type, activation_params);
                }
            }
        }
    }
    else
    {
        // general grouped case: h_g input rows feed each of the outh_g output rows of a group
        const int h_g = h / group;
        const int outh_g = outh / group;

        // NOTE(review): collapse(2) is dropped on _WIN32, presumably because the
        // OpenMP implementation used there lacks it - confirm.
#ifdef _WIN32
        #pragma omp parallel for num_threads(opt.num_threads)
#else
        #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
#endif
        for (int g = 0; g < group; g++)
        {
            for (int p = 0; p < outh_g; p++)
            {
                Mat out = top_blob.row_range(g * outh_g + p, 1);

                const float* weight_data_ptr = (const float*)weight_data + kernel_w * h_g * outh_g * g;
                const float bias = bias_term ? bias_data[g * outh_g + p] : 0.f;

                out.fill(bias);

                for (int j = 0; j < w; j++)
                {
                    float* outptr = (float*)out + j * stride_w;

                    // kernels of output row p; advanced by kernel_w per input row below
                    const float* kptr = weight_data_ptr + kernel_w * h_g * p;

                    for (int q = 0; q < h_g; q++)
                    {
                        const float val = bottom_blob.row(h_g * g + q)[j];

                        for (int k = 0; k < kernel_w; k++)
                        {
                            outptr[k * dilation_w] += val * kptr[k];
                        }

                        kptr += kernel_w;
                    }
                }

                // fused activation over the finished row
                {
                    float* outptr = out;

                    for (int i = 0; i < outw; i++)
                    {
                        outptr[i] = activation_ss(outptr[i], activation_type, activation_params);
                    }
                }
            }
        }
    }

    return 0;
}

int DeconvolutionDepthWise1D::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    size_t elemsize = bottom_blob.elemsize;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || output_w > 0)
    {
        top_blob_bordered.create(outw, num_output, elemsize, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, num_output, elemsize, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    int ret = deconvolutiondepthwise1d(bottom_blob, top_blob_bordered, weight_data, bias_data, kernel_w, stride_w, dilation_w, group, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

int DeconvolutionDepthWise1D::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _num_input = bottom_blob.h;
    const int _kernel_w = _weight_data.w;
    const int _num_output = _weight_data.h * group;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    // transpose group-inch/group-outch/group-kw to group-outch/group-inch/group-kw
    Mat weight_data_transposed;
    {
        weight_data_transposed.create(_kernel_w * _num_output * _num_input / group, 4u, opt.workspace_allocator);
        if (weight_data_transposed.empty())
            return -100;

        const int outch_g = _num_output / group;
        const int inch_g = _num_input / group;
        const int maxk = _kernel_w;

        for (int g = 0; g < group; g++)
        {
            // reorder weight from inch-outch to outch-inch
            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
            for (int i = 0; i < outch_g; i++)
            {
                for (int j = 0; j < inch_g; j++)
                {
                    for (int k = 0; k < maxk; k++)
                    {
                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
                    }
                }
            }
        }
    }

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;
    }

    const int w = bottom_blob.w;

    const int kernel_extent_w = dilation_w * (_kernel_w - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || output_w > 0)
    {
        top_blob_bordered.create(outw, _num_output, 4u, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, _num_output, 4u, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    int ret = deconvolutiondepthwise1d(bottom_blob, top_blob_bordered, weight_data_transposed, bias_data_flattened, _kernel_w, stride_w, dilation_w, group, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

// Produces top_blob from the un-cropped result:
//  - positive pad_left or pad_right: crop by the explicit pad amounts;
//  - otherwise, with output_w positive: crop to that width, split according to
//    the -233 / -234 markers stored in the pad fields;
//  - otherwise: top_blob simply becomes top_blob_bordered.
void DeconvolutionDepthWise1D::cut_padding(const Mat& top_blob_bordered, Mat& top_blob, const Option& opt) const
{
    const bool explicit_pad = pad_left > 0 || pad_right > 0;
    if (explicit_pad)
    {
        copy_cut_border(top_blob_bordered, top_blob, 0, 0, pad_left, pad_right, opt);
        return;
    }

    if (!(output_w > 0))
    {
        top_blob = top_blob_bordered;
        return;
    }

    // total amount to remove along the width
    const int wcut = top_blob_bordered.w - output_w;
    const int w_small = wcut / 2;
    const int w_large = wcut - wcut / 2;

    const bool same_upper = pad_left == -233 || pad_right == -233;
    const bool same_lower = pad_left == -234 || pad_right == -234;

    if (same_upper)
    {
        // onnx padding=SAME_UPPER
        copy_cut_border(top_blob_bordered, top_blob, 0, 0, w_small, w_large, opt);
    }
    else if (same_lower)
    {
        // onnx padding=SAME_LOWER
        copy_cut_border(top_blob_bordered, top_blob, 0, 0, w_large, w_small, opt);
    }
    // with neither marker present top_blob is left untouched
}

} // namespace ncnn


================================================
FILE: src/layer/deconvolutiondepthwise1d.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DECONVOLUTIONDEPTHWISE1D_H
#define LAYER_DECONVOLUTIONDEPTHWISE1D_H

#include "layer.h"

namespace ncnn {

// Grouped 1D transposed-convolution layer.
// Two forward() overloads exist: the single-blob one runs with the weight_data /
// bias_data members, the multi-blob one reads the weight from bottom_blobs[1]
// and, when bias_term is set, the bias from bottom_blobs[2].
class DeconvolutionDepthWise1D : public Layer
{
public:
    // Starts with one_blob_only = true and support_inplace = false.
    DeconvolutionDepthWise1D();

    // Fills the "param" fields below from pd; always returns 0.
    virtual int load_param(const ParamDict& pd);

    // Loads weight_data / bias_data unless dynamic_weight is set;
    // returns -100 when a requested blob is empty.
    virtual int load_model(const ModelBin& mb);

    // Runs the layer with the stored weights; returns 0 on success and -100 when
    // an output blob could not be produced.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Runs the layer with weight (and optional bias) supplied as input blobs;
    // same return convention as the single-blob overload.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // Crops the un-cropped result into top_blob according to pad_left / pad_right / output_w.
    void cut_padding(const Mat& top_blob_bordered, Mat& top_blob, const Option& opt) const;

public:
    // param (the numeric id is the one read in load_param)
    int num_output;       // id 0
    int kernel_w;         // id 1
    int dilation_w;       // id 2, fallback 1
    int stride_w;         // id 3, fallback 1
    int pad_left;         // id 4; -233 / -234 are the onnx SAME_UPPER / SAME_LOWER markers used by cut_padding
    int pad_right;        // id 15, fallback pad_left
    int output_pad_right; // id 18; added to the computed output width
    int output_w;         // id 20; when positive (and no pad is positive) the result is cropped to this width
    int bias_term;        // id 5

    int weight_data_size; // id 6; element count requested from the ModelBin for weight_data
    int group;            // id 7, fallback 1

    int activation_type;   // id 9; passed through to activation_ss
    Mat activation_params; // id 10

    int dynamic_weight; // id 28; non-zero clears one_blob_only and skips load_model

    // model
    Mat weight_data;
    Mat bias_data;
};

} // namespace ncnn

#endif // LAYER_DECONVOLUTIONDEPTHWISE1D_H


================================================
FILE: src/layer/deconvolutiondepthwise3d.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "deconvolutiondepthwise3d.h"

#include "fused_activation.h"

namespace ncnn {

// Single-input layer without in-place support.
DeconvolutionDepthWise3D::DeconvolutionDepthWise3D()
{
    support_inplace = false;
    one_blob_only = true;
}

int DeconvolutionDepthWise3D::load_param(const ParamDict& pd)
{
    num_output = pd.get(0, 0);
    kernel_w = pd.get(1, 0);
    kernel_h = pd.get(11, kernel_w);
    kernel_d = pd.get(21, kernel_w);
    dilation_w = pd.get(2, 1);
    dilation_h = pd.get(12, dilation_w);
    dilation_d = pd.get(22, dilation_w);
    stride_w = pd.get(3, 1);
    stride_h = pd.get(13, stride_w);
    stride_d = pd.get(23, stride_w);
    pad_left = pd.get(4, 0);
    pad_right = pd.get(15, pad_left);
    pad_top = pd.get(14, pad_left);
    pad_bottom = pd.get(16, pad_top);
    pad_front = pd.get(24, pad_left);
    pad_behind = pd.get(17, pad_front);
    output_pad_right = pd.get(18, 0);
    output_pad_bottom = pd.get(19, output_pad_right);
    output_pad_behind = pd.get(20, output_pad_right);
    output_w = pd.get(25, 0);
    output_h = pd.get(26, output_w);
    output_d = pd.get(27, output_w);
    bias_term = pd.get(5, 0);
    weight_data_size = pd.get(6, 0);
    group = pd.get(7, 1);
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());

    return 0;
}

// Loads weight_data and, when bias_term is set, bias_data from mb.
// Returns 0 on success, -100 when a requested blob comes back empty.
int DeconvolutionDepthWise3D::load_model(const ModelBin& mb)
{
    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    if (!bias_term)
        return 0;

    bias_data = mb.load(num_output, 1);
    return bias_data.empty() ? -100 : 0;
}

// Grouped 3D transposed convolution on float data.
//
// top_blob must already be allocated by the caller. Every output channel is
// first filled with its bias (0 when bias_data is empty), then each input
// sample scatters val * weight into the output through a precomputed table of
// kernel-tap offsets, and finally the fused activation is applied in place.
//
// weight_data is read as a flat float array:
//   depth-wise branch : maxk * g + k
//   grouped branch    : maxk * inch_g * outch_g * g + maxk * inch_g * p + maxk * q + k
//
// Always returns 0.
static int deconvolutiondepthwise3d(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data, const Mat& bias_data, int kernel_w, int kernel_h, int kernel_d, int stride_w, int stride_h, int stride_d, int dilation_w, int dilation_h, int dilation_d, int group, int activation_type, const Mat& activation_params, const Option& opt)
{
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h * kernel_d;

    // kernel offsets
    // space_ofs[k] is the float offset of kernel tap k relative to the output
    // position of the current input sample.
    // NOTE(review): assumes rows are outw floats apart and depth slices
    // outw * outh floats apart inside one output channel - confirm against Mat.
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        // gap0: from the end of one kernel row to the start of the next dilated row
        int gap0 = outw * dilation_h - kernel_w * dilation_w;
        // gap1: from the end of one kernel slice to the start of the next dilated slice
        int gap1 = outh * outw * dilation_d - outw * kernel_h * dilation_h;
        for (int z = 0; z < kernel_d; z++)
        {
            for (int i = 0; i < kernel_h; i++)
            {
                for (int j = 0; j < kernel_w; j++)
                {
                    space_ofs[p1] = p2;
                    p1++;
                    p2 += dilation_w;
                }
                p2 += gap0;
            }
            p2 += gap1;
        }
    }

    // depth-wise
    if (inch == group && group == outch)
    {
        // one input channel, one kernel and one output channel per group
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            const float* inptr = bottom_blob.channel(g);
            const float* kptr = (const float*)weight_data + maxk * g;
            Mat out = top_blob.channel(g);

            const float bias = bias_data.empty() ? 0.f : bias_data[g];

            out.fill(bias);

            // shadowed variable for less openmp task args
            const int w = bottom_blob.w;
            const int h = bottom_blob.h;
            const int d = bottom_blob.d;
            const int outw = top_blob.w;
            const int outh = top_blob.h;
            const int outd = top_blob.d;

            for (int z = 0; z < d; z++)
            {
                for (int i = 0; i < h; i++)
                {
                    for (int j = 0; j < w; j++)
                    {
                        float* outptr = out.depth(z * stride_d).row(i * stride_h) + j * stride_w;

                        // input channel is indexed as a dense d x h x w array
                        const float val = inptr[z * w * h + i * w + j];

                        for (int k = 0; k < maxk; k++)
                        {
                            // note: this float w shadows the int w (input width) inside the loop body only
                            float w = kptr[k];
                            outptr[space_ofs[k]] += val * w;
                        }
                    }
                }
            }

            // fused activation over the finished channel
            {
                float* outptr = out;
                int size = outw * outh * outd;

                for (int i = 0; i < size; i++)
                {
                    outptr[i] = activation_ss(outptr[i], activation_type, activation_params);
                }
            }
        }
    }
    else
    {
        // general grouped case: inch_g input channels feed each of the outch_g output channels of a group
        const int inch_g = inch / group;
        const int outch_g = outch / group;

        // NOTE(review): collapse(2) is dropped on _WIN32, presumably because the
        // OpenMP implementation used there lacks it - confirm.
#ifdef _WIN32
        #pragma omp parallel for num_threads(opt.num_threads)
#else
        #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
#endif
        for (int g = 0; g < group; g++)
        {
            for (int p = 0; p < outch_g; p++)
            {
                Mat out = top_blob.channel(g * outch_g + p);

                const float* weight_data_ptr = (const float*)weight_data + maxk * inch_g * outch_g * g;

                const float bias = bias_data.empty() ? 0.f : bias_data[g * outch_g + p];

                out.fill(bias);

                // shadowed variable for less openmp task args
                const int w = bottom_blob.w;
                const int h = bottom_blob.h;
                const int d = bottom_blob.d;
                const int outw = top_blob.w;
                const int outh = top_blob.h;
                const int outd = top_blob.d;

                for (int z = 0; z < d; z++)
                {
                    for (int i = 0; i < h; i++)
                    {
                        for (int j = 0; j < w; j++)
                        {
                            float* outptr = out.depth(z * stride_d).row(i * stride_h) + j * stride_w;

                            // kernels of output channel p; advanced by maxk per input channel below
                            const float* kptr = weight_data_ptr + maxk * inch_g * p;

                            for (int q = 0; q < inch_g; q++)
                            {
                                const float val = bottom_blob.channel(inch_g * g + q).depth(z).row(i)[j];

                                for (int k = 0; k < maxk; k++)
                                {
                                    outptr[space_ofs[k]] += val * kptr[k];
                                }

                                kptr += maxk;
                            }
                        }
                    }
                }

                // fused activation over the finished channel
                {
                    float* outptr = out;
                    int size = outw * outh * outd;

                    for (int i = 0; i < size; i++)
                    {
                        outptr[i] = activation_ss(outptr[i], activation_type, activation_params);
                    }
                }
            }
        }
    }

    return 0;
}

int DeconvolutionDepthWise3D::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    size_t elemsize = bottom_blob.elemsize;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
    const int kernel_extent_d = dilation_d * (kernel_d - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
    int outd = (d - 1) * stride_d + kernel_extent_d + output_pad_behind;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || pad_front > 0 || pad_behind > 0 || (output_w > 0 && output_h > 0 && output_d > 0))
    {
        top_blob_bordered.create(outw, outh, outd, num_output, elemsize, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, outd, num_output, elemsize, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    int ret = deconvolutiondepthwise3d(bottom_blob, top_blob_bordered, weight_data, bias_data, kernel_w, kernel_h, kernel_d, stride_w, stride_h, stride_d, dilation_w, dilation_h, dilation_d, group, activation_type, activation_params, opt);
    if (ret != 0)
        return ret;

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

// Produces top_blob from the un-cropped result:
//  - any positive pad_* value: crop by the explicit pad amounts;
//  - otherwise, with output_w, output_h and output_d all positive: crop to that
//    size, split according to the -233 / -234 markers stored in the pad fields;
//  - otherwise: top_blob simply becomes top_blob_bordered.
void DeconvolutionDepthWise3D::cut_padding(const Mat& top_blob_bordered, Mat& top_blob, const Option& opt) const
{
    const bool explicit_pad = pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || pad_front > 0 || pad_behind > 0;
    if (explicit_pad)
    {
        copy_cut_border_3d(top_blob_bordered, top_blob, pad_top, pad_bottom, pad_left, pad_right, pad_front, pad_behind, opt);
        return;
    }

    const bool fixed_output = output_w > 0 && output_h > 0 && output_d > 0;
    if (!fixed_output)
    {
        top_blob = top_blob_bordered;
        return;
    }

    // total amount to remove along each axis
    const int wcut = top_blob_bordered.w - output_w;
    const int hcut = top_blob_bordered.h - output_h;
    const int dcut = top_blob_bordered.d - output_d;

    const int w_small = wcut / 2;
    const int w_large = wcut - wcut / 2;
    const int h_small = hcut / 2;
    const int h_large = hcut - hcut / 2;
    const int d_small = dcut / 2;
    const int d_large = dcut - dcut / 2;

    const bool same_upper = pad_left == -233 || pad_right == -233 || pad_top == -233 || pad_bottom == -233 || pad_front == -233 || pad_behind == -233;
    const bool same_lower = pad_left == -234 || pad_right == -234 || pad_top == -234 || pad_bottom == -234 || pad_front == -234 || pad_behind == -234;

    if (same_upper)
    {
        // onnx padding=SAME_UPPER
        copy_cut_border_3d(top_blob_bordered, top_blob, h_small, h_large, w_small, w_large, d_small, d_large, opt);
    }
    else if (same_lower)
    {
        // onnx padding=SAME_LOWER
        copy_cut_border_3d(top_blob_bordered, top_blob, h_large, h_small, w_large, w_small, d_large, d_small, opt);
    }
    // with neither marker present top_blob is left untouched
}

} // namespace ncnn


================================================
FILE: src/layer/deconvolutiondepthwise3d.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DECONVOLUTIONDEPTHWISE3D_H
#define LAYER_DECONVOLUTIONDEPTHWISE3D_H

#include "layer.h"

namespace ncnn {

// Grouped 3D transposed-convolution layer; only a single-blob forward() is
// declared, running with the weight_data / bias_data members.
class DeconvolutionDepthWise3D : public Layer
{
public:
    // Starts with one_blob_only = true and support_inplace = false.
    DeconvolutionDepthWise3D();

    // Fills the "param" fields below from pd; always returns 0.
    virtual int load_param(const ParamDict& pd);

    // Loads weight_data and, when bias_term is set, bias_data;
    // returns -100 when a requested blob is empty.
    virtual int load_model(const ModelBin& mb);

    // Runs the layer; returns 0 on success and -100 when an output blob could
    // not be produced.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
    // Crops the un-cropped result into top_blob according to pad_* / output_*.
    void cut_padding(const Mat& top_blob_bordered, Mat& top_blob, const Option& opt) const;

public:
    // param (the numeric id is the one read in load_param)
    int num_output; // id 0
    int kernel_w;   // id 1
    int kernel_h;   // id 11, fallback kernel_w
    int kernel_d;   // id 21, fallback kernel_w
    int dilation_w; // id 2, fallback 1
    int dilation_h; // id 12, fallback dilation_w
    int dilation_d; // id 22, fallback dilation_w
    int stride_w;   // id 3, fallback 1
    int stride_h;   // id 13, fallback stride_w
    int stride_d;   // id 23, fallback stride_w
    // Positive values are cropped from the result; -233 / -234 are treated by
    // cut_padding() as the onnx SAME_UPPER / SAME_LOWER markers.
    int pad_left;   // id 4
    int pad_right;  // id 15, fallback pad_left
    int pad_top;    // id 14, fallback pad_left
    int pad_bottom; // id 16, fallback pad_top
    int pad_front;  // id 24, fallback pad_left
    int pad_behind; // id 17, fallback pad_front
    // added to the computed output width / height / depth in forward()
    int output_pad_right;  // id 18
    int output_pad_bottom; // id 19, fallback output_pad_right
    int output_pad_behind; // id 20, fallback output_pad_right
    // when all three are positive (and no pad_* is positive) the result is cropped to this size
    int output_w; // id 25
    int output_h; // id 26, fallback output_w
    int output_d; // id 27, fallback output_w
    int bias_term; // id 5

    int weight_data_size; // id 6; element count requested from the ModelBin for weight_data
    int group;            // id 7, fallback 1

    int activation_type;   // id 9; passed through to activation_ss
    Mat activation_params; // id 10

    // model
    Mat weight_data;
    Mat bias_data;
};

} // namespace ncnn

#endif // LAYER_DECONVOLUTIONDEPTHWISE3D_H


================================================
FILE: src/layer/deepcopy.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "deepcopy.h"

namespace ncnn {

// Single-input layer that accepts packed blobs and does not run in place.
DeepCopy::DeepCopy()
{
    support_packing = true;
    support_inplace = false;
    one_blob_only = true;
}

int DeepCopy::forward(const Mat& bottom_blob, Mat& top_blob, const Option& /*opt*/) const
{
    top_blob = bottom_blob.clone();
    if (top_blob.empty())
        return -100;

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/deepcopy.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DEEPCOPY_H
#define LAYER_DEEPCOPY_H

#include "layer.h"

namespace ncnn {

// Layer whose forward() assigns a clone of the input blob to the output blob.
class DeepCopy : public Layer
{
public:
    // Sets one_blob_only = true, support_inplace = false, support_packing = true.
    DeepCopy();

    // Clones bottom_blob into top_blob; returns 0 on success, -100 when the
    // resulting blob is empty.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_DEEPCOPY_H


================================================
FILE: src/layer/deformableconv2d.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "deformableconv2d.h"

#include "fused_activation.h"

namespace ncnn {

// Multi-input layer (forward() reads the input, an offset blob and optionally
// a third blob) without in-place support.
DeformableConv2D::DeformableConv2D()
{
    support_inplace = false;
    one_blob_only = false;
}

// Read the layer hyper-parameters from the parameter dictionary.
// Vertical values fall back to their horizontal counterparts when absent.
// Always returns 0.
int DeformableConv2D::load_param(const ParamDict& pd)
{
    // output channels and weight bookkeeping
    num_output = pd.get(0, 0);
    bias_term = pd.get(5, 0);
    weight_data_size = pd.get(6, 0);

    // horizontal kernel geometry
    kernel_w = pd.get(1, 0);
    dilation_w = pd.get(2, 1);
    stride_w = pd.get(3, 1);

    // vertical kernel geometry, defaulting to the horizontal settings
    kernel_h = pd.get(11, kernel_w);
    dilation_h = pd.get(12, dilation_w);
    stride_h = pd.get(13, stride_w);

    // padding: right/top default to left, bottom defaults to top
    pad_left = pd.get(4, 0);
    pad_top = pd.get(14, pad_left);
    pad_right = pd.get(15, pad_left);
    pad_bottom = pd.get(16, pad_top);

    // fused activation
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());

    return 0;
}

// Load the weights and, when bias_term is set, the bias.
// Returns 0 on success, -100 if a requested blob could not be loaded.
int DeformableConv2D::load_model(const ModelBin& mb)
{
    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    // nothing more to read when the layer carries no bias
    if (!bias_term)
        return 0;

    bias_data = mb.load(num_output, 1);
    return bias_data.empty() ? -100 : 0;
}

// Reference deformable 2D convolution.
//
// bottom_blobs[0] is the input, bottom_blobs[1] holds the sampling offsets and,
// when exactly three bottom blobs are given, bottom_blobs[2] is a per-tap mask.
// For every output position and kernel tap the input is sampled with bilinear
// interpolation at the tap position shifted by the offset, multiplied by the
// mask (1 when absent) and the weight, and accumulated over all input channels.
// The fused activation is applied to each accumulated sum.
//
// Returns 0 on success, -100 if the output blob could not be created.
int DeformableConv2D::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& offset = bottom_blobs[1];

    // the mask is optional and is only present as a third bottom blob
    const bool has_mask = (bottom_blobs.size() == 3);

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int in_c = bottom_blob.c;
    const size_t elemsize = bottom_blob.elemsize;

    // footprint of the dilated kernel
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    // NOTE(review): pad values are used arithmetically here; no special handling
    // of negative sentinel pad values is visible in this function - confirm.
    const int out_w = (w + pad_left + pad_right - kernel_extent_w) / stride_w + 1;
    const int out_h = (h + pad_top + pad_bottom - kernel_extent_h) / stride_h + 1;

    // output.shape is [num_output, out_h, out_w] (in python).
    Mat& output = top_blobs[0];
    output.create(out_w, out_h, num_output, elemsize, opt.blob_allocator);
    if (output.empty())
        return -100;

    const float* weight_ptr = weight_data;
    // bias_ptr only gets dereferenced when bias_term is set; the weight pointer
    // is just a placeholder initial value
    const float* bias_ptr = weight_data;
    if (bias_term)
        bias_ptr = bias_data;

    // deformable conv
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int h_col = 0; h_col < out_h; h_col++)
    {
        for (int w_col = 0; w_col < out_w; w_col++)
        {
            // top-left corner of the (undeformed) receptive field in input coordinates
            int h_in = h_col * stride_h - pad_top;
            int w_in = w_col * stride_w - pad_left;
            for (int oc = 0; oc < num_output; oc++)
            {
                float sum = 0.f;
                if (bias_term)
                    sum = bias_ptr[oc];
                for (int i = 0; i < kernel_h; i++)
                {
                    for (int j = 0; j < kernel_w; j++)
                    {
                        // offsets are stored per kernel tap as two consecutive channels:
                        // vertical offset first, horizontal offset second
                        const float offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col];
                        const float offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col];
                        // one mask channel per kernel tap; defaults to 1 without a mask blob
                        const float mask_ = has_mask ? bottom_blobs[2].channel(i * kernel_w + j).row(h_col)[w_col] : 1.f;
                        // fractional sampling position in the input
                        const float h_im = h_in + i * dilation_h + offset_h;
                        const float w_im = w_in + j * dilation_w + offset_w;

                        // Bilinear
                        // cond: the sample lies close enough to the image to touch at least one pixel
                        const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w;
                        int h_low = 0;
                        int w_low = 0;
                        int h_high = 0;
                        int w_high = 0;
                        float w1 = 0.f;
                        float w2 = 0.f;
                        float w3 = 0.f;
                        float w4 = 0.f;
                        bool v1_cond = false;
                        bool v2_cond = false;
                        bool v3_cond = false;
                        bool v4_cond = false;
                        if (cond)
                        {
                            // integer corners surrounding the sampling position
                            h_low = (int)floorf(h_im);
                            w_low = (int)floorf(w_im);
                            h_high = h_low + 1;
                            w_high = w_low + 1;

                            // fractional distances to the low / high corners
                            float lh = h_im - h_low;
                            float lw = w_im - w_low;
                            float hh = 1 - lh;
                            float hw = 1 - lw;

                            // which of the four corners fall inside the input
                            v1_cond = (h_low >= 0 && w_low >= 0);
                            v2_cond = (h_low >= 0 && w_high <= w - 1);
                            v3_cond = (h_high <= h - 1 && w_low >= 0);
                            v4_cond = (h_high <= h - 1 && w_high <= w - 1);

                            // bilinear weights for the four corners
                            w1 = hh * hw;
                            w2 = hh * lw;
                            w3 = lh * hw;
                            w4 = lh * lw;
                        }

                        for (int c_im = 0; c_im < in_c; c_im++)
                        {
                            float val = 0.f;
                            if (cond)
                            {
                                // corners outside the input contribute zero
                                float v1 = v1_cond ? bottom_blob.channel(c_im).row(h_low)[w_low] : 0.f;
                                float v2 = v2_cond ? bottom_blob.channel(c_im).row(h_low)[w_high] : 0.f;
                                float v3 = v3_cond ? bottom_blob.channel(c_im).row(h_high)[w_low] : 0.f;
                                float v4 = v4_cond ? bottom_blob.channel(c_im).row(h_high)[w_high] : 0.f;
                                val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
                            }
                            // weights are indexed as [oc][c_im][i][j]
                            sum += val * mask_ * weight_ptr[((oc * in_c + c_im) * kernel_h + i) * kernel_w + j];
                        }
                    }
                }
                output.channel(oc).row(h_col)[w_col] = activation_ss(sum, activation_type, activation_params);
            }
        }
    }
    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/deformableconv2d.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DEFORMABLECONV2D_H
#define LAYER_DEFORMABLECONV2D_H

#include "layer.h"

namespace ncnn {

// Deformable 2D convolution layer: a convolution whose sampling positions are
// shifted by a per-position offset input, with an optional mask input.
class DeformableConv2D : public Layer
{
public:
    DeformableConv2D();

    // Reads the hyper-parameters listed below from the parameter dictionary.
    virtual int load_param(const ParamDict& pd);

    // Loads weight_data and, when bias_term is set, bias_data.
    virtual int load_model(const ModelBin& mb);

    // bottom_blobs: input, offset and optionally a mask; top_blobs[0] receives the result.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

public:
    // param
    int num_output;  // number of output channels
    int kernel_w;
    int kernel_h;    // defaults to kernel_w
    int dilation_w;
    int dilation_h;  // defaults to dilation_w
    int stride_w;
    int stride_h;    // defaults to stride_w
    int pad_left; // -233=SAME_UPPER -234=SAME_LOWER
    int pad_right;   // defaults to pad_left
    int pad_top;     // defaults to pad_left
    int pad_bottom;  // defaults to pad_top
    int bias_term;   // non-zero when a bias is loaded and added

    int weight_data_size; // number of weight elements to load

    // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid
    int activation_type;
    Mat activation_params;

    // model
    Mat weight_data;
    Mat bias_data;
};

} // namespace ncnn

#endif // LAYER_DEFORMABLECONV2D_H


================================================
FILE: src/layer/dequantize.cpp
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "dequantize.h"

namespace ncnn {

Dequantize::Dequantize()
{
    // output is written to a separate top blob
    support_inplace = false;

    // one bottom blob in, one top blob out
    one_blob_only = true;
}

// Read the element counts of the scale and bias tables. Always returns 0.
int Dequantize::load_param(const ParamDict& pd)
{
    // no bias unless the parameter says otherwise
    bias_data_size = pd.get(1, 0);

    // a single shared scale by default
    scale_data_size = pd.get(0, 1);

    return 0;
}

// Load the scale table and, when bias_data_size is non-zero, the bias table.
// Returns 0 on success, -100 if a requested table could not be loaded.
int Dequantize::load_model(const ModelBin& mb)
{
    scale_data = mb.load(scale_data_size, 1);
    if (scale_data.empty())
        return -100;

    // bias is optional
    if (bias_data_size == 0)
        return 0;

    bias_data = mb.load(bias_data_size, 1);
    return bias_data.empty() ? -100 : 0;
}

// Convert `size` integers to floats: ptr[k] = intptr[k] * scale + bias.
static void dequantize(const int* intptr, float* ptr, float scale, float bias, int size)
{
    for (int k = 0; k < size; k++)
    {
        ptr[k] = intptr[k] * scale + bias;
    }
}

// Convert an int32 blob to float: out = in * scale + bias.
// A single scale/bias entry is shared by the whole blob; otherwise the entry
// is selected per row (2-D input) or per channel (3-D input).
// Returns 0 on success, -100 if the output blob could not be created.
int Dequantize::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;

    top_blob.create_like(bottom_blob, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const bool shared_scale = (scale_data_size == 1);
    const bool without_bias = (bias_data_size == 0);
    const bool shared_bias = (bias_data_size == 1);

    if (dims == 1)
    {
        // assert scale_data_size == 1
        // assert bias_data_size == 0 || bias_data_size == 1

        float bias = 0.f;
        if (!without_bias)
            bias = bias_data[0];

        const float scale = scale_data[0];

        const int* src = bottom_blob;
        float* dst = top_blob;

        dequantize(src, dst, scale, bias, w);
    }
    else if (dims == 2)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            float scale = scale_data[0];
            if (!shared_scale)
                scale = scale_data[y];

            float bias = 0.f;
            if (!without_bias)
                bias = shared_bias ? bias_data[0] : bias_data[y];

            const int* src = bottom_blob.row<const int>(y);
            float* dst = top_blob.row(y);

            dequantize(src, dst, scale, bias, w);
        }
    }
    else if (dims == 3)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float scale = scale_data[0];
            if (!shared_scale)
                scale = scale_data[q];

            float bias = 0.f;
            if (!without_bias)
                bias = shared_bias ? bias_data[0] : bias_data[q];

            const int* src = bottom_blob.channel(q);
            float* dst = top_blob.channel(q);

            dequantize(src, dst, scale, bias, w * h);
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/dequantize.h
================================================
// Copyright 2018 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DEQUANTIZE_H
#define LAYER_DEQUANTIZE_H

#include "layer.h"

namespace ncnn {

// Layer converting int32 data to float via out = in * scale + bias.
class Dequantize : public Layer
{
public:
    Dequantize();

    // Reads scale_data_size and bias_data_size.
    virtual int load_param(const ParamDict& pd);

    // Loads scale_data and, when bias_data_size is non-zero, bias_data.
    virtual int load_model(const ModelBin& mb);

    // Dequantizes bottom_blob into top_blob; returns 0 or -100 on allocation failure.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
    int scale_data_size; // 1 = one scale shared by the whole blob, otherwise indexed per row/channel
    int bias_data_size;  // 0 = no bias, 1 = one shared bias, otherwise indexed per row/channel

    Mat scale_data;
    Mat bias_data;
};

} // namespace ncnn

#endif // LAYER_DEQUANTIZE_H


================================================
FILE: src/layer/detectionoutput.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "detectionoutput.h"

namespace ncnn {

DetectionOutput::DetectionOutput()
{
    // detections are written to a newly created top blob
    support_inplace = false;

    // consumes several bottom blobs (location, confidence, priorbox)
    one_blob_only = false;
}

// Read class count, NMS / top-k settings, the confidence threshold and the
// four box-decoding variances (param ids 5..8). Always returns 0.
int DetectionOutput::load_param(const ParamDict& pd)
{
    num_class = pd.get(0, 0);

    // non-maximum suppression settings
    nms_threshold = pd.get(1, 0.05f);
    nms_top_k = pd.get(2, 300);

    // final selection
    keep_top_k = pd.get(3, 100);
    confidence_threshold = pd.get(4, 0.5f);

    // variances live in consecutive param ids starting at 5
    static const float default_variances[4] = {0.1f, 0.1f, 0.2f, 0.2f};
    for (int k = 0; k < 4; k++)
    {
        variances[k] = pd.get(5 + k, default_variances[k]);
    }

    return 0;
}

// Axis-aligned box given by two corners plus the class label it was detected for.
// Field order matters: instances are built with aggregate initialisation.
struct BBoxRect
{
    float xmin;
    float ymin;
    float xmax;
    float ymax;
    int label; // class index
};

// Area of the overlap between two boxes, or 0 when they do not overlap.
static inline float intersection_area(const BBoxRect& a, const BBoxRect& b)
{
    const bool apart_x = a.xmin > b.xmax || a.xmax < b.xmin;
    const bool apart_y = a.ymin > b.ymax || a.ymax < b.ymin;
    if (apart_x || apart_y)
    {
        // no intersection
        return 0.f;
    }

    const float right = std::min(a.xmax, b.xmax);
    const float left = std::max(a.xmin, b.xmin);
    const float bottom = std::min(a.ymax, b.ymax);
    const float top = std::max(a.ymin, b.ymin);

    return (right - left) * (bottom - top);
}

// Sort scores[left..right] into descending order, applying every swap to
// datas as well so both vectors stay aligned. Both bounds are inclusive.
template<typename T>
static void qsort_descent_inplace(std::vector<T>& datas, std::vector<float>& scores, int left, int right)
{
    // partition around the score stored at the middle position
    const float pivot = scores[(left + right) / 2];

    int lo = left;
    int hi = right;

    while (lo <= hi)
    {
        // skip entries that are already on the correct side of the pivot
        for (; scores[lo] > pivot; ++lo)
        {
        }

        for (; scores[hi] < pivot; --hi)
        {
        }

        if (lo > hi)
            break;

        std::swap(datas[lo], datas[hi]);
        std::swap(scores[lo], scores[hi]);

        ++lo;
        --hi;
    }

    // recurse into both partitions that still hold more than one element
    if (left < hi)
        qsort_descent_inplace(datas, scores, left, hi);

    if (lo < right)
        qsort_descent_inplace(datas, scores, lo, right);
}

// Convenience overload: sort the whole range, doing nothing when either vector is empty.
template<typename T>
static void qsort_descent_inplace(std::vector<T>& datas, std::vector<float>& scores)
{
    const bool nothing_to_sort = datas.empty() || scores.empty();
    if (nothing_to_sort)
        return;

    const int last = static_cast<int>(scores.size() - 1);
    qsort_descent_inplace(datas, scores, 0, last);
}

// Greedy non-maximum suppression.
//
// bboxes is visited in order (callers pass it sorted by descending score). A box
// is kept when its intersection-over-union with every previously kept box does
// not exceed nms_threshold. The indices of the kept boxes are returned in
// `picked`, whose previous content is discarded.
static void nms_sorted_bboxes(const std::vector<BBoxRect>& bboxes, std::vector<size_t>& picked, float nms_threshold)
{
    picked.clear();

    const size_t n = bboxes.size();

    // box areas are needed for every IoU evaluation, so compute them once
    std::vector<float> areas(n);
    for (size_t i = 0; i < n; i++)
    {
        const BBoxRect& r = bboxes[i];

        float width = r.xmax - r.xmin;
        float height = r.ymax - r.ymin;

        areas[i] = width * height;
    }

    for (size_t i = 0; i < n; i++)
    {
        const BBoxRect& a = bboxes[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
        {
            const BBoxRect& b = bboxes[picked[j]];

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
            //             float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
            {
                // one sufficiently overlapping kept box is enough to suppress
                // this candidate; the remaining comparisons cannot change that
                keep = 0;
                break;
            }
        }

        if (keep)
            picked.push_back(i);
    }
}

// SSD-style detection post-processing.
//
// bottom_blobs[0] = box regression (location), bottom_blobs[1] = class
// confidences, bottom_blobs[2] = prior boxes. Steps:
//   1. decode each prior box with its regression values,
//   2. per class (background class 0 excluded): threshold by confidence,
//      sort by score, keep nms_top_k, run NMS,
//   3. merge all classes, sort globally, keep keep_top_k,
//   4. write one row per detection: label, score, xmin, ymin, xmax, ymax.
//
// num_class == -233 selects the mxnet-ssd input layout.
// Returns 0 on success (top_blobs[0] is left untouched when nothing is
// detected) and -100 on allocation failure.
int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& location = bottom_blobs[0];
    const Mat& confidence = bottom_blobs[1];
    const Mat& priorbox = bottom_blobs[2];

    bool mxnet_ssd_style = num_class == -233;

    // mxnet-ssd _contrib_MultiBoxDetection
    const int num_prior = mxnet_ssd_style ? priorbox.h : priorbox.w / 4;

    // in mxnet style the class count is taken from the confidence blob
    int num_class_copy = mxnet_ssd_style ? confidence.h : num_class;

    // apply location with priorbox
    Mat bboxes;
    bboxes.create(4, num_prior, 4u, opt.workspace_allocator);
    if (bboxes.empty())
        return -100;

    const float* location_ptr = location;
    const float* priorbox_ptr = priorbox.row(0);
    // caffe style carries per-prior variances in the second priorbox row;
    // mxnet style uses the layer-level `variances` instead
    const float* variance_ptr = mxnet_ssd_style ? 0 : priorbox.row(1);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int i = 0; i < num_prior; i++)
    {
        // if score of background class is larger than confidence threshold
        float score = mxnet_ssd_style ? confidence[i] : confidence[static_cast<size_t>(i) * static_cast<size_t>(num_class_copy)];
        if (score >= (1.0 - confidence_threshold))
        {
            // the box row is left unwritten for this prior
            continue;
        }
        const float* loc = location_ptr + i * 4;
        const float* pb = priorbox_ptr + i * 4;
        const float* var = variance_ptr ? variance_ptr + i * 4 : variances;

        float* bbox = bboxes.row(i);

        // CENTER_SIZE
        float pb_w = pb[2] - pb[0];
        float pb_h = pb[3] - pb[1];
        float pb_cx = (pb[0] + pb[2]) * 0.5f;
        float pb_cy = (pb[1] + pb[3]) * 0.5f;

        // decoded centre and size
        float bbox_cx = var[0] * loc[0] * pb_w + pb_cx;
        float bbox_cy = var[1] * loc[1] * pb_h + pb_cy;
        float bbox_w = expf(var[2] * loc[2]) * pb_w;
        float bbox_h = expf(var[3] * loc[3]) * pb_h;

        // stored as corner coordinates xmin, ymin, xmax, ymax
        bbox[0] = bbox_cx - bbox_w * 0.5f;
        bbox[1] = bbox_cy - bbox_h * 0.5f;
        bbox[2] = bbox_cx + bbox_w * 0.5f;
        bbox[3] = bbox_cy + bbox_h * 0.5f;
    }

    // sort and nms for each class
    std::vector<std::vector<BBoxRect> > all_class_bbox_rects;
    std::vector<std::vector<float> > all_class_bbox_scores;
    all_class_bbox_rects.resize(num_class_copy);
    all_class_bbox_scores.resize(num_class_copy);

    // start from 1 to ignore background class
    // each iteration only writes to its own per-class slot
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int i = 1; i < num_class_copy; i++)
    {
        // filter by confidence_threshold
        std::vector<BBoxRect> class_bbox_rects;
        std::vector<float> class_bbox_scores;

        for (int j = 0; j < num_prior; j++)
        {
            // prob data layout
            // caffe-ssd = num_class x num_prior
            // mxnet-ssd = num_prior x num_class
            // NOTE(review): the indexing below reads mxnet data class-major and
            // caffe data prior-major; the labels above presumably use
            // width x height order - confirm.
            float score = mxnet_ssd_style ? confidence[i * num_prior + j] : confidence[j * num_class_copy + i];

            if (score > confidence_threshold)
            {
                const float* bbox = bboxes.row(j);
                BBoxRect c = {bbox[0], bbox[1], bbox[2], bbox[3], i};
                class_bbox_rects.push_back(c);
                class_bbox_scores.push_back(score);
            }
        }

        // sort inplace
        qsort_descent_inplace(class_bbox_rects, class_bbox_scores);

        // keep nms_top_k
        if (nms_top_k < (int)class_bbox_rects.size())
        {
            class_bbox_rects.resize(nms_top_k);
            class_bbox_scores.resize(nms_top_k);
        }

        // apply nms
        std::vector<size_t> picked;
        nms_sorted_bboxes(class_bbox_rects, picked, nms_threshold);

        // select
        for (size_t j = 0; j < picked.size(); j++)
        {
            size_t z = picked[j];
            all_class_bbox_rects[i].push_back(class_bbox_rects[z]);
            all_class_bbox_scores[i].push_back(class_bbox_scores[z]);
        }
    }

    // gather all class
    std::vector<BBoxRect> bbox_rects;
    std::vector<float> bbox_scores;

    for (int i = 1; i < num_class_copy; i++)
    {
        const std::vector<BBoxRect>& class_bbox_rects = all_class_bbox_rects[i];
        const std::vector<float>& class_bbox_scores = all_class_bbox_scores[i];

        bbox_rects.insert(bbox_rects.end(), class_bbox_rects.begin(), class_bbox_rects.end());
        bbox_scores.insert(bbox_scores.end(), class_bbox_scores.begin(), class_bbox_scores.end());
    }

    // global sort inplace
    qsort_descent_inplace(bbox_rects, bbox_scores);

    // keep_top_k
    if (keep_top_k < (int)bbox_rects.size())
    {
        bbox_rects.resize(keep_top_k);
        bbox_scores.resize(keep_top_k);
    }

    // fill result
    int num_detected = static_cast<int>(bbox_rects.size());
    if (num_detected == 0)
        return 0;

    // one row of 6 floats per detection
    Mat& top_blob = top_blobs[0];
    top_blob.create(6, num_detected, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    for (int i = 0; i < num_detected; i++)
    {
        const BBoxRect& r = bbox_rects[i];
        float score = bbox_scores[i];
        float* outptr = top_blob.row(i);

        outptr[0] = static_cast<float>(r.label);
        outptr[1] = score;
        outptr[2] = r.xmin;
        outptr[3] = r.ymin;
        outptr[4] = r.xmax;
        outptr[5] = r.ymax;
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/detectionoutput.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DETECTIONOUTPUT_H
#define LAYER_DETECTIONOUTPUT_H

#include "layer.h"

namespace ncnn {

// SSD-style detection post-processing layer: decodes prior boxes, applies
// per-class NMS and emits the top-scoring detections.
class DetectionOutput : public Layer
{
public:
    DetectionOutput();

    // Reads the fields declared below from the parameter dictionary.
    virtual int load_param(const ParamDict& pd);

    // bottom_blobs: location, confidence, priorbox; top_blobs[0]: one row per detection.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

public:
    int num_class;              // number of classes including background; -233 selects the mxnet-ssd layout
    float nms_threshold;        // IoU above which a lower-ranked box is suppressed
    int nms_top_k;              // per-class candidate limit applied before NMS
    int keep_top_k;             // overall limit on emitted detections
    float confidence_threshold; // minimum class score for a candidate
    float variances[4];         // box decoding variances used when the priorbox blob supplies none
};

} // namespace ncnn

#endif // LAYER_DETECTIONOUTPUT_H


================================================
FILE: src/layer/diag.cpp
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "diag.h"

namespace ncnn {

Diag::Diag()
{
    // the output shape differs from the input, so a separate top blob is used
    support_inplace = false;

    // one bottom blob in, one top blob out
    one_blob_only = true;
}

int Diag::load_param(const ParamDict& pd)
{
    diagonal = pd.get(0, 0);

    return 0;
}

// 1-D input: build a square matrix with the input placed on the selected diagonal.
// 2-D input: extract the selected diagonal as a 1-D blob.
// A positive `diagonal` shifts towards higher columns, a negative one towards
// higher rows. Other ranks leave top_blob untouched.
// Returns 0 on success (including an empty extracted diagonal), -100 on
// allocation failure.
int Diag::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int dims = bottom_blob.dims;
    const size_t elemsize = bottom_blob.elemsize;

    // first row / column touched by the selected diagonal
    const int row_start = -std::min(diagonal, 0);
    const int col_start = std::max(diagonal, 0);

    if (dims == 1)
    {
        const int count = bottom_blob.w;
        const int shift = (diagonal < 0) ? -diagonal : diagonal;
        const int side = count + shift;

        top_blob.create(side, side, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // everything off the diagonal is zero
        top_blob.fill(0.0f);

        for (int k = 0; k < count; k++)
        {
            top_blob.row(k + row_start)[k + col_start] = bottom_blob[k];
        }
    }
    else if (dims == 2)
    {
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;

        // offsets within [lower, upper] yield a full-length diagonal
        const int lower = std::min(w - h, 0);
        const int upper = std::max(w - h, 0);

        int len = 0;
        if (diagonal >= lower && diagonal <= upper)
        {
            len = std::min(w, h);
        }
        else if (diagonal > -h && diagonal < lower)
        {
            len = diagonal + h;
        }
        else if (diagonal > upper && diagonal < w)
        {
            len = -diagonal + w;
        }

        top_blob.create(len, elemsize, opt.blob_allocator);
        if (top_blob.empty())
        {
            // an out-of-range diagonal legitimately produces an empty result
            return (len == 0) ? 0 : -100;
        }

        for (int k = 0; k < len; k++)
        {
            top_blob[k] = bottom_blob.row(k + row_start)[k + col_start];
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/diag.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DIAG_H
#define LAYER_DIAG_H

#include "layer.h"

namespace ncnn {

// Diagonal layer: builds a square matrix from a 1-D input, or extracts a
// diagonal from a 2-D input.
class Diag : public Layer
{
public:
    Diag();

    // Reads `diagonal` (param 0, default 0).
    virtual int load_param(const ParamDict& pd);

    // Returns 0 on success, -100 on allocation failure.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
    // diagonal offset: 0 = main diagonal, positive shifts by columns, negative by rows
    int diagonal;
};

} // namespace ncnn

#endif // LAYER_DIAG_H


================================================
FILE: src/layer/dropout.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "dropout.h"

namespace ncnn {

Dropout::Dropout()
{
    // scaling is applied directly to the input blob
    support_inplace = true;

    // one bottom blob in, one top blob out
    one_blob_only = true;
}

// Read the scale factor (param 0); defaults to 1, which makes the layer a no-op.
// Always returns 0.
int Dropout::load_param(const ParamDict& pd)
{
    const float identity_scale = 1.f;
    scale = pd.get(0, identity_scale);

    return 0;
}

// Multiply every element of the blob by `scale`, in place.
// Returns 0 in all cases.
int Dropout::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    // a scale of exactly 1 leaves every value unchanged, so skip the pass
    if (scale == 1.f)
    {
        return 0;
    }

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    // the per-channel element count must include the depth, otherwise only the
    // first depth slice of each channel of a 4-D blob would be scaled
    int size = w * h * d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i = 0; i < size; i++)
        {
            ptr[i] = ptr[i] * scale;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/dropout.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DROPOUT_H
#define LAYER_DROPOUT_H

#include "layer.h"

namespace ncnn {

// Inference-time dropout: multiplies the blob by a constant scale.
class Dropout : public Layer
{
public:
    Dropout();

    // Reads `scale` (param 0, default 1).
    virtual int load_param(const ParamDict& pd);

    // Scales the blob in place; does nothing when scale == 1.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
    // multiplier applied to every element
    float scale;
};

} // namespace ncnn

#endif // LAYER_DROPOUT_H


================================================
FILE: src/layer/einsum.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "einsum.h"
#include <string.h>

namespace ncnn {

Einsum::Einsum()
{
    // the result is written to a newly created top blob
    support_inplace = false;

    // one bottom blob per left-hand-side operand of the equation
    one_blob_only = false;
}

// Parse the einsum equation stored as an integer array in param 0.
//
// The equation "ii" (trace) is special-cased: only rhs_token is set.
// Otherwise the text must have the form "<lhs>[,<lhs>...]-><rhs>"; the operand
// subscripts are stored in lhs_tokens and the output subscripts in rhs_token.
// Output subscripts must be within 'i'..'l', operand subscripts within 'i'..'x'.
//
// Returns 0 on success, -1 (after logging) when the equation is malformed.
int Einsum::load_param(const ParamDict& pd)
{
    Mat equation_mat = pd.get(0, Mat());

    const int equation_len = equation_mat.w;

    // restore to lexical equation string
    std::string equation;
    equation.resize(equation_len);
    // NOTE(review): the buffer obtained from c_str() is written to below and
    // later modified by the tokenising code - presumably relies on the string
    // exposing its own contiguous storage; confirm for all supported STLs.
    char* equation_ptr = (char*)equation.c_str();
    {
        // one integer per character
        const int* p = equation_mat;
        for (int i = 0; i < equation_len; i++)
        {
            equation_ptr[i] = p[i];
        }
    }

    if (equation == "ii")
    {
        // trace
        rhs_token = "ii";

        return 0;
    }

    // split into tokens
    char* arrow = strstr(equation_ptr, "->");
    if (!arrow)
    {
        NCNN_LOGE("invalid equation %s", equation_ptr);
        return -1;
    }

    // overwrite "->" with terminators so that lhs becomes its own C string
    arrow[0] = '\0';
    arrow[1] = '\0';

    char* lhs = equation_ptr;
    char* rhs = arrow + 2;

    {
        // NOTE(review): strtok keeps hidden global state - presumably load_param
        // is never run concurrently from several threads; confirm.
        // lhs_tokens is appended to without being cleared first.
        char* t = strtok(lhs, ",");
        while (t)
        {
            lhs_tokens.push_back(std::string(t));
            t = strtok(NULL, ",");
        }
    }

    rhs_token = std::string(rhs);

    // check token always in ijkl
    {
        // output subscripts: at most the four letters i, j, k, l
        for (size_t i = 0; i < rhs_token.size(); i++)
        {
            if (rhs_token[i] < 'i' || rhs_token[i] > 'l')
            {
                NCNN_LOGE("invalid rhs_token %s", rhs_token.c_str());
                return -1;
            }
        }

        // operand subscripts: any of the sixteen letters i..x
        for (size_t i = 0; i < lhs_tokens.size(); i++)
        {
            const std::string& lhs_token = lhs_tokens[i];
            for (size_t j = 0; j < lhs_token.size(); j++)
            {
                if (lhs_token[j] < 'i' || lhs_token[j] > 'x')
                {
                    NCNN_LOGE("invalid lhs_token %s", lhs_token.c_str());
                    return -1;
                }
            }
        }
    }

    return 0;
}

static float get_indexed_value(const Mat& m, const std::string& token, std::vector<int>& indexes)
{
    const int dims = m.dims;

    if (dims == 1)
    {
        int x = indexes[token[0] - 'i'];
        return m[x];
    }

    if (dims == 2)
    {
        int y = indexes[token[0] - 'i'];
        int x = indexes[token[1] - 'i'];
        return m.row(y)[x];
    }

    if (dims == 3)
    {
        int c = indexes[token[0] - 'i'];
        int y = indexes[token[1] - 'i'];
        int x = indexes[token[2] - 'i'];
        return m.channel(c).row(y)[x];
    }

    if (dims == 4)
    {
        int c = indexes[token[0] - 'i'];
        int z = indexes[token[1] - 'i'];
        int y = indexes[token[2] - 'i'];
        int x = indexes[token[3] - 'i'];
        return m.channel(c).depth(z).row(y)[x];
    }

    // should never reach here
    return 0;
}

// Recursively sum over dimensions d, d+1, ... of the index space.
// Once every dimension has a fixed index, the value is the product of the
// addressed element of each bottom blob. `indexes` is used as scratch space
// for the dimensions iterated here.
static float sum_dim(const std::vector<int>& dim_sizes, int d, const std::vector<Mat>& bottom_blobs, const std::vector<std::string>& tokens, std::vector<int>& indexes)
{
    const int total_dims = (int)dim_sizes.size();

    if (d != total_dims)
    {
        // iterate the current dimension and accumulate the inner sums
        const int extent = dim_sizes[d];

        float acc = 0.f;
        for (int pos = 0; pos < extent; pos++)
        {
            indexes[d] = pos;

            acc += sum_dim(dim_sizes, d + 1, bottom_blobs, tokens, indexes);
        }

        return acc;
    }

    // all indices fixed: multiply the selected element of every operand
    float product = 1.f;

    const size_t operand_count = bottom_blobs.size();
    for (size_t b = 0; b < operand_count; b++)
    {
        product *= get_indexed_value(bottom_blobs[b], tokens[b], indexes);
    }

    return product;
}

// Evaluate the parsed einsum equation.
//
// Trace ("ii"): top_blobs[0] becomes a single element holding the sum of the
// diagonal of bottom_blobs[0].
// General case: every subscript letter is mapped to slot (letter - 'i'); slot
// sizes are taken from the operand shapes. The first rhs_token.size() slots
// form the output axes and all remaining slots are summed over by sum_dim().
//
// Returns 0 on success, -100 if the output blob could not be created.
int Einsum::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    // assert bottom_blobs.size() == lhs_tokens.size()
    // assert top_blobs.size() == 1

    size_t elemsize = bottom_blobs[0].elemsize;

    if (lhs_tokens.empty() && rhs_token == "ii")
    {
        // assert bottom_blobs.size() == 1
        // assert bottom_blob.dims == 2
        // assert bottom_blob.w == bottom_blob.h

        // trace
        Mat& top_blob = top_blobs[0];
        top_blob.create(1, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const Mat& bottom_blob = bottom_blobs[0];

        float sum = 0.f;

        // walk the main diagonal
        for (int i = 0; i < bottom_blob.h; i++)
        {
            sum += bottom_blob.row(i)[i];
        }

        top_blob[0] = sum;

        return 0;
    }

    // resolve dimension sizes
    std::vector<int> dim_sizes(16, 1); // map ijklmnopqrstuvwx -> dim_size
    int dim_sizes_count = 0;

    for (size_t b = 0; b < bottom_blobs.size(); b++)
    {
        const std::string& lhs_token = lhs_tokens[b];
        const Mat& bottom_blob = bottom_blobs[b];
        const int in_dims = bottom_blob.dims;

        for (int s = 0; s < in_dims; s++)
        {
            // size of the s-th axis, counting from the outermost axis
            int dim_size = 1;
            if (in_dims == 1) dim_size = bottom_blob.w;
            if (in_dims == 2 && s == 0) dim_size = bottom_blob.h;
            if (in_dims == 2 && s == 1) dim_size = bottom_blob.w;
            if (in_dims == 3 && s == 0) dim_size = bottom_blob.c;
            if (in_dims == 3 && s == 1) dim_size = bottom_blob.h;
            if (in_dims == 3 && s == 2) dim_size = bottom_blob.w;
            if (in_dims == 4 && s == 0) dim_size = bottom_blob.c;
            if (in_dims == 4 && s == 1) dim_size = bottom_blob.d;
            if (in_dims == 4 && s == 2) dim_size = bottom_blob.h;
            if (in_dims == 4 && s == 3) dim_size = bottom_blob.w;

            // the subscript letter selects the slot; later operands overwrite earlier sizes
            int dim_sizes_index = lhs_token[s] - 'i';
            dim_sizes[dim_sizes_index] = dim_size;
            dim_sizes_count = std::max(dim_sizes_count, dim_sizes_index + 1);
        }
    }

    // keep only the slots up to the highest subscript letter in use
    dim_sizes.resize(dim_sizes_count);

    const int out_dims = (int)rhs_token.size();

    // current index per slot; output loops set the leading entries, sum_dim() the rest
    std::vector<int> indexes(dim_sizes_count);

    if (out_dims == 1)
    {
        Mat& top_blob = top_blobs[0];
        top_blob.create(dim_sizes[0], elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        for (int i = 0; i < top_blob.w; i++)
        {
            indexes[0] = i;

            float sum = sum_dim(dim_sizes, 1, bottom_blobs, lhs_tokens, indexes);

            top_blob[i] = sum;
        }
    }

    if (out_dims == 2)
    {
        // slot 0 is the row axis, slot 1 the column axis
        Mat& top_blob = top_blobs[0];
        top_blob.create(dim_sizes[1], dim_sizes[0], elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        for (int i = 0; i < top_blob.h; i++)
        {
            indexes[0] = i;

            for (int j = 0; j < top_blob.w; j++)
            {
                indexes[1] = j;

                float sum = sum_dim(dim_sizes, 2, bottom_blobs, lhs_tokens, indexes);

                top_blob.row(i)[j] = sum;
            }
        }
    }

    if (out_dims == 3)
    {
        // slots 0, 1, 2 are the channel, row and column axes
        Mat& top_blob = top_blobs[0];
        top_blob.create(dim_sizes[2], dim_sizes[1], dim_sizes[0], elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        for (int i = 0; i < top_blob.c; i++)
        {
            indexes[0] = i;

            for (int j = 0; j < top_blob.h; j++)
            {
                indexes[1] = j;

                for (int k = 0; k < top_blob.w; k++)
                {
                    indexes[2] = k;

                    float sum = sum_dim(dim_sizes, 3, bottom_blobs, lhs_tokens, indexes);

                    top_blob.channel(i).row(j)[k] = sum;
                }
            }
        }
    }

    if (out_dims == 4)
    {
        // slots 0, 1, 2, 3 are the channel, depth, row and column axes
        Mat& top_blob = top_blobs[0];
        top_blob.create(dim_sizes[3], dim_sizes[2], dim_sizes[1], dim_sizes[0], elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        for (int i = 0; i < top_blob.c; i++)
        {
            indexes[0] = i;

            for (int j = 0; j < top_blob.d; j++)
            {
                indexes[1] = j;

                for (int k = 0; k < top_blob.h; k++)
                {
                    indexes[2] = k;

                    for (int l = 0; l < top_blob.w; l++)
                    {
                        indexes[3] = l;

                        float sum = sum_dim(dim_sizes, 4, bottom_blobs, lhs_tokens, indexes);

                        top_blob.channel(i).depth(j).row(k)[l] = sum;
                    }
                }
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/einsum.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_EINSUM_H
#define LAYER_EINSUM_H

#include "layer.h"

namespace ncnn {

// Layer declaration for Einsum. The equation is stored as already-split
// tokens (lhs_tokens / rhs_token) rather than as a single string.
class Einsum : public Layer
{
public:
    Einsum();

    // Loads this layer's parameters from pd and returns an int status code.
    virtual int load_param(const ParamDict& pd);

    // Multi-blob forward pass: reads bottom_blobs and writes into top_blobs.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

public:
    // equation tokens
    // left-hand side of the equation, one string per term
    // (presumably one term per input blob -- confirm against einsum.cpp)
    std::vector<std::string> lhs_tokens;
    // right-hand side of the equation (presumably the output indices -- confirm)
    std::string rhs_token;
};

} // namespace ncnn

#endif // LAYER_EINSUM_H


================================================
FILE: src/layer/eltwise.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "eltwise.h"

namespace ncnn {

// Eltwise consumes several input blobs and always writes a separate output.
Eltwise::Eltwise()
{
    // TODO inplace reduction
    support_inplace = false;
    one_blob_only = false;
}

// Reads the layer parameters.
//   id 0: operation selector (defaults to 0)
//   id 1: per-input coefficients (defaults to an empty Mat)
// Always returns 0.
int Eltwise::load_param(const ParamDict& pd)
{
    coeffs = pd.get(1, Mat());
    op_type = pd.get(0, 0);

    return 0;
}

int Eltwise::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int size = w * h * d;

    Mat& top_blob = top_blobs[0];
    top_blob.create_like(bottom_blob, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (op_type == Operation_PROD)
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            const float* ptr1 = bottom_blob1.channel(q);
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                outptr[i] = ptr[i] * ptr1[i];
            }
        }

        for (size_t b = 2; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i = 0; i < size; i++)
                {
                    outptr[i] *= ptr[i];
                }
            }
        }
    }
    else if (op_type == Operation_SUM)
    {
        if (coeffs.w == 0)
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                const float* ptr1 = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i = 0; i < size; i++)
                {
                    outptr[i] = ptr[i] + ptr1[i];
                }
            }

            for (size_t b = 2; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = bottom_blob1.channel(q);
                    float* outptr = top_blob.channel(q);

                    for (int i = 0; i < size; i++)
                    {
                        outptr[i] += ptr[i];
                    }
                }
            }
        }
        else
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            float coeff0 = coeffs[0];
            float coeff1 = coeffs[1];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                const float* ptr1 = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i = 0; i < size; i++)
                {
                    outptr[i] = ptr[i] * coeff0 + ptr1[i] * coeff1;
                }
            }

            for (size_t b = 2; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                float coeff = coeffs[b];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = bottom_blob1.channel(q);
                    float* outptr = top_blob.channel(q);

                    for (int i = 0; i < size; i++)
                    {
                        outptr[i] += ptr[i] * coeff;
                    }
                }
            }
        }
    }
    else if (op_type == Operation_MAX)
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            const float* ptr1 = bottom_blob1.channel(q);
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                outptr[i] = std::max(ptr[i], ptr1[i]);
            }
        }

        for (size_t b = 2; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i = 0; i < size; i++)
                {
                    outptr[i] = std::max(outptr[i], ptr[i]);
                }
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/eltwise.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_ELTWISE_H
#define LAYER_ELTWISE_H

#include "layer.h"

namespace ncnn {

// Element-wise combination of several input blobs (product, sum or maximum).
class Eltwise : public Layer
{
public:
    Eltwise();

    // Reads op_type (param id 0) and coeffs (param id 1).
    virtual int load_param(const ParamDict& pd);

    // Combines all bottom_blobs into top_blobs[0]; the output is created
    // with the same shape as bottom_blobs[0].
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

    // Values accepted for op_type.
    enum OperationType
    {
        Operation_PROD = 0,
        Operation_SUM = 1,
        Operation_MAX = 2
    };

public:
    // param
    // one of OperationType, defaults to 0 (Operation_PROD)
    int op_type;
    // per-input weights for Operation_SUM; when empty (w == 0) a plain sum is used
    Mat coeffs;
};

} // namespace ncnn

#endif // LAYER_ELTWISE_H


================================================
FILE: src/layer/elu.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "elu.h"

namespace ncnn {

// ELU works on a single blob and rewrites it in place (see forward_inplace).
ELU::ELU()
{
    support_inplace = true;
    one_blob_only = true;
}

// Reads alpha from param id 0 (defaults to 0.1f). Always returns 0.
int ELU::load_param(const ParamDict& pd)
{
    alpha = pd.get(0, 0.1f);

    return 0;
}

// In-place ELU: every negative element x becomes alpha * (expf(x) - 1);
// all other elements are left untouched. Channels are processed in
// parallel using opt.num_threads threads. Always returns 0.
int ELU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* data = bottom_top_blob.channel(q);

        for (int k = 0; k < size; k++)
        {
            const float x = data[k];

            // only strictly negative values are transformed
            if (!(x < 0.f))
                continue;

            data[k] = alpha * (expf(x) - 1.f);
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/elu.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_ELU_H
#define LAYER_ELU_H

#include "layer.h"

namespace ncnn {

// Exponential linear unit activation, applied in place.
class ELU : public Layer
{
public:
    ELU();

    // Reads alpha from param id 0.
    virtual int load_param(const ParamDict& pd);

    // Replaces each negative element x with alpha * (expf(x) - 1).
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
    // scale of the negative branch, defaults to 0.1f
    float alpha;
};

} // namespace ncnn

#endif // LAYER_ELU_H


================================================
FILE: src/layer/embed.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "embed.h"

#include <string.h>

namespace ncnn {

// Embed maps one input blob to one freshly allocated output blob.
Embed::Embed()
{
    support_inplace = false;
    one_blob_only = true;
}

// Reads the layer parameters (all default to 0):
//   id 0  -> num_output
//   id 1  -> input_dim
//   id 2  -> bias_term
//   id 3  -> weight_data_size
//   id 18 -> int8_scale_term
// Always returns 0.
int Embed::load_param(const ParamDict& pd)
{
    // table geometry
    num_output = pd.get(0, 0);
    input_dim = pd.get(1, 0);
    weight_data_size = pd.get(3, 0);

    // optional features
    bias_term = pd.get(2, 0);
    int8_scale_term = pd.get(18, 0);

    return 0;
}

// Loads the embedding table, the optional bias and (in int8 builds) the
// optional weight scale from mb, in that order.
// Returns 0 on success, -100 when the weights or the bias come back empty.
int Embed::load_model(const ModelBin& mb)
{
    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    if (bias_term)
    {
        bias_data = mb.load(num_output, 1);
        if (bias_data.empty())
            return -100;
    }

#if NCNN_INT8
    if (int8_scale_term)
    {
        // the scale is stored as a one-element blob
        const Mat scale_blob = mb.load(1, 1);
        weight_data_int8_scale = scale_blob[0];
    }
#endif // NCNN_INT8

    return 0;
}

// Float embedding lookup.
//
// For each of the top_blob.h output rows, reads an int index from
// bottom_blob, clamps it into [0, input_dim - 1] and copies the matching
// num_output-wide row of weight_data into the output row, adding bias_data
// element-wise when a bias is present.
static void embed(const Mat& bottom_blob, const Mat& weight_data, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt)
{
    const int words = top_blob.h;
    const int num_output = top_blob.w;

    const float* bias_ptr = bias_data;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < words; q++)
    {
        // clamp the requested index to the table bounds
        int word_index = ((const int*)bottom_blob)[q];
        word_index = word_index < 0 ? 0 : word_index;
        word_index = word_index >= input_dim ? input_dim - 1 : word_index;

        const float* em = (const float*)weight_data + num_output * word_index;
        float* outptr = top_blob.row(q);

        if (!bias_ptr)
        {
            // no bias: the table row is copied verbatim
            memcpy(outptr, em, num_output * sizeof(float));
            continue;
        }

        for (int p = 0; p < num_output; p++)
        {
            outptr[p] = em[p] + bias_ptr[p];
        }
    }
}

#if NCNN_INT8
// Int8 embedding lookup.
//
// Same row selection as the float path, but the table holds signed chars.
// Each value is multiplied by 1 / weight_data_int8_scale before the optional
// bias is added.
static void embed_int8(const Mat& bottom_blob, const Mat& weight_data, float weight_data_int8_scale, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt)
{
    const int words = top_blob.h;
    const int num_output = top_blob.w;

    const float* bias_ptr = bias_data;

    // dequantization factor, identical for every row
    const float descale_em = 1.f / weight_data_int8_scale;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < words; q++)
    {
        // clamp the requested index to the table bounds
        int word_index = ((const int*)bottom_blob)[q];
        word_index = word_index < 0 ? 0 : word_index;
        word_index = word_index >= input_dim ? input_dim - 1 : word_index;

        const signed char* em = (const signed char*)weight_data + num_output * word_index;
        float* outptr = top_blob.row(q);

        if (!bias_ptr)
        {
            for (int p = 0; p < num_output; p++)
            {
                outptr[p] = em[p] * descale_em;
            }
            continue;
        }

        for (int p = 0; p < num_output; p++)
        {
            outptr[p] = em[p] * descale_em + bias_ptr[p];
        }
    }
}
#endif // NCNN_INT8

// Looks up one embedding row per input word.
//
// The output is a (num_output x bottom_blob.w) blob of 4-byte elements.
// In int8 builds with int8_scale_term set, the quantized table is used;
// otherwise the float table is used.
// Returns 0 on success, -100 when the output could not be allocated.
int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    top_blob.create(num_output, bottom_blob.w, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

#if NCNN_INT8
    if (int8_scale_term)
    {
        embed_int8(bottom_blob, weight_data, weight_data_int8_scale, bias_data, top_blob, input_dim, opt);
        return 0;
    }
#endif // NCNN_INT8

    embed(bottom_blob, weight_data, bias_data, top_blob, input_dim, opt);

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/embed.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_EMBED_H
#define LAYER_EMBED_H

#include "layer.h"

namespace ncnn {

// Embedding lookup layer: turns a blob of int word indices into rows taken
// from a weight table.
class Embed : public Layer
{
public:
    Embed();

    // Reads params 0, 1, 2, 3 and 18 (see the members below).
    virtual int load_param(const ParamDict& pd);

    // Loads weight_data, then bias_data when bias_term is set, then the
    // int8 scale when int8_scale_term is set (int8 builds only).
    virtual int load_model(const ModelBin& mb);

    // Produces a (num_output x bottom_blob.w) blob of 4-byte elements.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
    // param
    // param id 0: length of each embedding row / width of the output
    int num_output;
    // param id 1: word indices are clamped into [0, input_dim - 1]
    int input_dim;
    // param id 2: non-zero when a bias vector is loaded and added
    int bias_term;

    // param id 3: number of elements requested for weight_data
    int weight_data_size;

    // param id 18: non-zero selects the int8 lookup path in int8 builds
    int int8_scale_term;

    // model
    Mat weight_data;
    Mat bias_data;

#if NCNN_INT8
    // int8 values are multiplied by 1 / weight_data_int8_scale on lookup
    float weight_data_int8_scale;
#endif
};

} // namespace ncnn

#endif // LAYER_EMBED_H


================================================
FILE: src/layer/erf.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "erf.h"

namespace ncnn {

// Erf works on a single blob and rewrites it in place (see forward_inplace).
Erf::Erf()
{
    support_inplace = true;
    one_blob_only = true;
}

// Applies erff() to every element of the blob, in place.
//
// bottom_top_blob  blob that is read and overwritten as float data
// opt              supplies num_threads for the per-channel OpenMP loop
//
// Always returns 0.
int Erf::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;

    // The per-channel element count must include the depth extent, otherwise
    // only the first w * h elements of each channel of a 4-D blob would be
    // transformed. This matches how the other element-wise layers (e.g. ELU)
    // size their loops.
    // NOTE(review): assumes d == 1 for blobs of lower rank, as those sibling
    // layers already rely on.
    int size = w * h * d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i = 0; i < size; i++)
        {
            ptr[i] = erff(ptr[i]);
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/erf.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_ERF_H
#define LAYER_ERF_H

#include "layer.h"

namespace ncnn {

// Element-wise error function, applied in place.
class Erf : public Layer
{
public:
    Erf();

    // Replaces each element x of bottom_top_blob with erff(x).
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_ERF_H


================================================
FILE: src/layer/exp.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "exp.h"

namespace ncnn {

// Exp works on a single blob and rewrites it in place (see forward_inplace).
Exp::Exp()
{
    support_inplace = true;
    one_blob_only = true;
}

// Reads the layer parameters.
//   id 0: base  (defaults to -1.f, which selects the natural exponential)
//   id 1: scale (defaults to 1.f)
//   id 2: shift (defaults to 0.f)
// Always returns 0.
int Exp::load_param(const ParamDict& pd)
{
    shift = pd.get(2, 0.f);
    scale = pd.get(1, 1.f);
    base = pd.get(0, -1.f);

    return 0;
}

// In-place exponential: each element x becomes base ^ (shift + x * scale).
//
// When base is exactly -1.f, expf() is used, i.e. the natural exponential.
// Any other base goes through powf().
//
// bottom_top_blob  blob that is read and overwritten as float data
// opt              supplies num_threads for the per-channel OpenMP loop
//
// Always returns 0.
int Exp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;

    // The per-channel element count must include the depth extent, otherwise
    // only the first w * h elements of each channel of a 4-D blob would be
    // transformed. This matches how the other element-wise layers (e.g. ELU)
    // size their loops.
    // NOTE(review): assumes d == 1 for blobs of lower rank, as those sibling
    // layers already rely on.
    int size = w * h * d;

    if (base == -1.f)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                ptr[i] = expf(shift + ptr[i] * scale);
            }
        }
    }
    else
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                ptr[i] = powf(base, (shift + ptr[i] * scale));
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/exp.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_EXP_H
#define LAYER_EXP_H

#include "layer.h"

namespace ncnn {

// Element-wise exponential y = base ^ (shift + x * scale), applied in place.
class Exp : public Layer
{
public:
    Exp();

    // Reads base (id 0), scale (id 1) and shift (id 2).
    virtual int load_param(const ParamDict& pd);

    // Overwrites every element of bottom_top_blob with its exponential.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
    // defaults to -1.f, which makes forward_inplace use expf() instead of powf()
    float base;
    // multiplier applied to the input before shifting, defaults to 1.f
    float scale;
    // offset added to the scaled input, defaults to 0.f
    float shift;
};

} // namespace ncnn

#endif // LAYER_EXP_H


================================================
FILE: src/layer/expanddims.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "expanddims.h"

namespace ncnn {

// ExpandDims maps one input blob to one separate output blob.
ExpandDims::ExpandDims()
{
    support_inplace = false;
    one_blob_only = true;
}

// Reads the list of axes to insert from param id 3 (defaults to an empty
// Mat). Always returns 0.
int ExpandDims::load_param(const ParamDict& pd)
{
    axes = pd.get(3, Mat());

    return 0;
}

// Inserts size-1 dimensions into bottom_blob at the positions listed in axes
// and stores the reshaped blob in top_blob.
//
// The output rank is the input rank plus the number of axes. Each axis is
// translated into an expand_w / expand_h / expand_d / expand_c flag according
// to the output rank, and the flag combination then selects one reshape()
// call. When no combination matches, top_blob stays equal to bottom_blob.
//
// Returns 0 on success, -100 when the resulting blob is empty.
int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int dims = bottom_blob.dims;

    // rank of the output: one extra dimension per requested axis
    const int outdims = dims + axes.w;

    bool expand_w = false;
    bool expand_h = false;
    bool expand_d = false;
    bool expand_c = false;

    {
        const int* axes_ptr = axes;
        for (int i = 0; i < axes.w; i++)
        {
            int axis = axes_ptr[i];
            // negative axes count from the end of the output shape
            if (axis < 0)
                axis = outdims + axis;

            // axis 0 is the outermost dimension of the output; the last axis
            // always maps to w
            if (outdims == 2)
            {
                if (axis == 0) expand_h = true;
                if (axis == 1) expand_w = true;
            }
            if (outdims == 3)
            {
                if (axis == 0) expand_c = true;
                if (axis == 1) expand_h = true;
                if (axis == 2) expand_w = true;
            }
            if (outdims == 4)
            {
                if (axis == 0) expand_c = true;
                if (axis == 1) expand_d = true;
                if (axis == 2) expand_h = true;
                if (axis == 3) expand_w = true;
            }
        }
    }

    // fallback when none of the branches below applies
    top_blob = bottom_blob;

    // 1-D input -> 2-D output
    if (outdims == 2)
    {
        if (expand_w)
        {
            top_blob = bottom_blob.reshape(1, w, opt.blob_allocator);
        }
        else if (expand_h)
        {
            top_blob = bottom_blob.reshape(w, 1, opt.blob_allocator);
        }
    }
    // 3-D output: two flags set means a 1-D input, one flag set means a 2-D input
    if (outdims == 3)
    {
        if (expand_w && expand_h)
        {
            top_blob = bottom_blob.reshape(1, 1, w, opt.blob_allocator);
        }
        else if (expand_w && expand_c)
        {
            top_blob = bottom_blob.reshape(1, w, 1, opt.blob_allocator);
        }
        else if (expand_h && expand_c)
        {
            top_blob = bottom_blob.reshape(w, 1, 1, opt.blob_allocator);
        }
        else if (expand_w)
        {
            top_blob = bottom_blob.reshape(1, w, h, opt.blob_allocator);
        }
        else if (expand_h)
        {
            top_blob = bottom_blob.reshape(w, 1, h, opt.blob_allocator);
        }
        else if (expand_c)
        {
            top_blob = bottom_blob.reshape(w, h, 1, opt.blob_allocator);
        }
    }
    // 4-D output: three flags -> 1-D input, two flags -> 2-D input,
    // one flag -> 3-D input. The checks run from most to fewest flags, so
    // their order matters.
    if (outdims == 4)
    {
        if (expand_w && expand_h && expand_d)
        {
            top_blob = bottom_blob.reshape(1, 1, 1, w, opt.blob_allocator);
        }
        else if (expand_w && expand_h && expand_c)
        {
            top_blob = bottom_blob.reshape(1, 1, w, 1, opt.blob_allocator);
        }
        else if (expand_w && expand_d && expand_c)
        {
            top_blob = bottom_blob.reshape(1, w, 1, 1, opt.blob_allocator);
        }
        else if (expand_h && expand_d && expand_c)
        {
            top_blob = bottom_blob.reshape(w, 1, 1, 1, opt.blob_allocator);
        }
        else if (expand_w && expand_h)
        {
            top_blob = bottom_blob.reshape(1, 1, w, h, opt.blob_allocator);
        }
        else if (expand_w && expand_c)
        {
            top_blob = bottom_blob.reshape(1, w, h, 1, opt.blob_allocator);
        }
        else if (expand_d && expand_c)
        {
            top_blob = bottom_blob.reshape(w, h, 1, 1, opt.blob_allocator);
        }
        else if (expand_w && expand_d)
        {
            top_blob = bottom_blob.reshape(1, w, 1, h, opt.blob_allocator);
        }
        else if (expand_h && expand_c)
        {
            top_blob = bottom_blob.reshape(w, 1, h, 1, opt.blob_allocator);
        }
        else if (expand_h && expand_d)
        {
            top_blob = bottom_blob.reshape(w, 1, 1, h, opt.blob_allocator);
        }
        else if (expand_w)
        {
            top_blob = bottom_blob.reshape(1, w, h, channels, opt.blob_allocator);
        }
        else if (expand_h)
        {
            top_blob = bottom_blob.reshape(w, 1, h, channels, opt.blob_allocator);
        }
        else if (expand_d)
        {
            top_blob = bottom_blob.reshape(w, h, 1, channels, opt.blob_allocator);
        }
        else if (expand_c)
        {
            top_blob = bottom_blob.reshape(w, h, channels, 1, opt.blob_allocator);
        }
    }

    if (top_blob.empty())
        return -100;

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/expanddims.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_EXPANDDIMS_H
#define LAYER_EXPANDDIMS_H

#include "layer.h"

namespace ncnn {

// Inserts size-1 dimensions into a blob at the requested axes.
class ExpandDims : public Layer
{
public:
    ExpandDims();

    // Reads axes from param id 3.
    virtual int load_param(const ParamDict& pd);

    // Writes the reshaped view of bottom_blob to top_blob.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
    // int positions (in the output shape) of the dimensions to insert;
    // negative values count from the end
    Mat axes;
};

} // namespace ncnn

#endif // LAYER_EXPANDDIMS_H


================================================
FILE: src/layer/flatten.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "flatten.h"

#include <string.h>

namespace ncnn {

// Flatten maps one input blob to one freshly allocated output blob.
Flatten::Flatten()
{
    support_inplace = false;
    one_blob_only = true;
}

// Copies all channels of bottom_blob back to back into a 1-D top_blob of
// w * h * d * c elements, keeping the element size of the input.
// Returns 0 on success, -100 when the output could not be allocated.
int Flatten::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int channels = bottom_blob.c;
    const size_t elemsize = bottom_blob.elemsize;
    const int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;

    top_blob.create(size * channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // number of bytes occupied by one channel's worth of elements
    const size_t channel_bytes = size * elemsize;
    unsigned char* const dst_base = (unsigned char*)top_blob;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        const unsigned char* src = bottom_blob.channel(q);

        memcpy(dst_base + channel_bytes * q, src, channel_bytes);
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/flatten.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_FLATTEN_H
#define LAYER_FLATTEN_H

#include "layer.h"

namespace ncnn {

// Collapses a blob into a single 1-D blob holding all of its elements.
class Flatten : public Layer
{
public:
    Flatten();

    // Copies each channel of bottom_blob, in channel order, into top_blob.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_FLATTEN_H


================================================
FILE: src/layer/flip.cpp
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "flip.h"

namespace ncnn {

// Sets one_blob_only, matching the single-blob forward() overload that this
// layer implements.
Flip::Flip()
{
    one_blob_only = true;
}

// Reads the axes to flip from param id 0 (defaults to an empty Mat).
// Returns 0 on success, -1 when more than four axes are given.
int Flip::load_param(const ParamDict& pd)
{
    axes = pd.get(0, Mat());

    // only handle up to 4-dim
    return axes.w > 4 ? -1 : 0;
}

// Reverses the element order of bottom_blob along the configured axes and
// writes the result to a new top_blob of the same shape.
//
// axes holds int axis indices in outermost-first order. Negative values have
// the rank of bottom_blob added to them. With an empty axes list the input is
// passed through unchanged. A 1-D blob is always flipped along w.
// Elements are copied as float.
//
// Returns 0 on success, -100 when the output could not be allocated, and -1
// when an axis falls outside the four supported slots after normalisation.
int Flip::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    if (axes.empty())
    {
        top_blob = bottom_blob;
        return 0;
    }

    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int channels = bottom_blob.c;

    int axes_flag[4] = {0};
    bool flip_w = false;
    bool flip_h = false;
    bool flip_d = false;
    bool flip_c = false;
    {
        const int* axes_ptr = axes;
        for (int i = 0; i < axes.w; i++)
        {
            int axis = axes_ptr[i];
            // handle negative axis
            if (axis < 0)
                axis += dims;

            // axes_flag has exactly four slots; writing through any other
            // index would corrupt the stack, so reject it instead
            if (axis < 0 || axis >= 4)
                return -1;

            axes_flag[axis] = 1;
        }

        if (dims == 1)
        {
            flip_w = true;
        }
        else if (dims == 2)
        {
            if (axes_flag[0] == 1) flip_h = true;
            if (axes_flag[1] == 1) flip_w = true;
        }
        else if (dims == 3)
        {
            if (axes_flag[0] == 1) flip_c = true;
            if (axes_flag[1] == 1) flip_h = true;
            if (axes_flag[2] == 1) flip_w = true;
        }
        else if (dims == 4)
        {
            if (axes_flag[0] == 1) flip_c = true;
            if (axes_flag[1] == 1) flip_d = true;
            if (axes_flag[2] == 1) flip_h = true;
            if (axes_flag[3] == 1) flip_w = true;
        }
    }

    top_blob.create_like(bottom_blob, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        // source channel for this output channel
        const int q2 = flip_c ? channels - 1 - q : q;

        for (int z = 0; z < d; z++)
        {
            const int z2 = flip_d ? d - 1 - z : z;

            for (int i = 0; i < h; i++)
            {
                const int i2 = flip_h ? h - 1 - i : i;

                const float* ptr = bottom_blob.channel(q2).depth(z2).row(i2);
                float* outptr = top_blob.channel(q).depth(z).row(i);

                if (flip_w)
                {
                    // indexed reverse copy: never forms a pointer in front
                    // of the source row
                    for (int j = 0; j < w; j++)
                    {
                        outptr[j] = ptr[w - 1 - j];
                    }
                }
                else
                {
                    memcpy(outptr, ptr, w * sizeof(float));
                }
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/flip.h
================================================
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_FLIP_H
#define LAYER_FLIP_H

#include "layer.h"

namespace ncnn {

// Reverses a blob along one or more axes.
class Flip : public Layer
{
public:
    Flip();

    // Reads axes from param id 0; fails with -1 when more than four are given.
    virtual int load_param(const ParamDict& pd);

    // Writes the flipped copy of bottom_blob to top_blob; with empty axes the
    // input is passed through.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
    // int axis indices to flip; negative values are offset by the blob rank
    Mat axes;
};

} // namespace ncnn

#endif // LAYER_FLIP_H


================================================
FILE: src/layer/fold.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "fold.h"

namespace ncnn {

// Sets one_blob_only, matching the single-blob forward() overload that this
// layer implements.
Fold::Fold()
{
    one_blob_only = true;
}

// Reads kernel, dilation, stride, padding and output-size parameters.
// Each height/secondary value falls back to its already-read counterpart
// when its own id is absent:
//   kernel_h   (11) <- kernel_w   (1, default 0)
//   dilation_h (12) <- dilation_w (2, default 1)
//   stride_h   (13) <- stride_w   (3, default 1)
//   pad_right  (15) <- pad_left   (4, default 0)
//   pad_top    (14) <- pad_left
//   pad_bottom (16) <- pad_top
//   output_h   (21) <- output_w   (20, default 0)
// Always returns 0.
int Fold::load_param(const ParamDict& pd)
{
    // primary values first ...
    kernel_w = pd.get(1, 0);
    dilation_w = pd.get(2, 1);
    stride_w = pd.get(3, 1);
    pad_left = pd.get(4, 0);
    output_w = pd.get(20, 0);

    // ... then the values that default to them
    kernel_h = pd.get(11, kernel_w);
    dilation_h = pd.get(12, dilation_w);
    stride_h = pd.get(13, stride_w);
    pad_right = pd.get(15, pad_left);
    pad_top = pd.get(14, pad_left);
    pad_bottom = pd.get(16, pad_top);
    output_h = pd.get(21, output_w);

    return 0;
}

int Fold::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int max_channels = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int outw = output_w + pad_left + pad_right;
    const int outh = output_h + pad_top + pad_bottom;

    const int inw = (outw - kernel_extent_w) / stride_w + 1;
    const int inh = (outh - kernel_extent_h) / stride_h + 1;

    // assert inw * inh == size

    const int maxk = kernel_w * kernel_h;
    const int channels = max_channels / maxk;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
    {
        top_blob_bordered.create(outw, outh, channels, elemsize, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, channels, elemsize, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    // col2im
    const int gap = outw * stride_h - inw * stride_w;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const float* sptr = bottom_blob.row(p * maxk);
        Mat outm = top_blob_bordered.channel(p);

        outm.fill(0.f);

        for (int u = 0; u < kernel_h; u++)
        {
            for (int v = 0; v < kernel_w; v++)
            {
                float* ptr = outm.row(dilation_h * u) + dilation_w * v;

                for (int i = 0; i < inh; i++)
                {
                    for (int j = 0; j < inw; j++)
                    {
                        ptr[0] += sptr[0];

                        ptr += stride_w;
                        sptr += 1;
                    }

                    ptr += gap;
                }
            }
        }
    }

    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
    {
        Option opt_b = opt;
        opt_b.use_packing_layout = false;
        copy_cut_border(top_blob_bordered, top_blob, pad_top, pad_bottom, pad_left, pad_right, opt_b);
        if (top_blob.empty())
            return -100;
    }
    else
    {
        top_blob = top_blob_bordered;
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/fold.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_FOLD_H
#define LAYER_FOLD_H

#include "layer.h"

namespace ncnn {

// col2im layer: accumulates sliding-window columns back into an image.
class Fold : public Layer
{
public:
    Fold();

    // Reads the members below from pd; see the ids noted on each member.
    virtual int load_param(const ParamDict& pd);

    // Builds the folded image from bottom_blob and stores it in top_blob.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
    // param id 1, defaults to 0
    int kernel_w;
    // param id 11, defaults to kernel_w
    int kernel_h;
    // param id 2, defaults to 1
    int dilation_w;
    // param id 12, defaults to dilation_w
    int dilation_h;
    // param id 3, defaults to 1
    int stride_w;
    // param id 13, defaults to stride_w
    int stride_h;
    // param id 4, defaults to 0
    int pad_left; // -233=SAME_UPPER -234=SAME_LOWER
    // param id 15, defaults to pad_left
    int pad_right;
    // param id 14, defaults to pad_left
    int pad_top;
    // param id 16, defaults to pad_top
    int pad_bottom;
    // param id 20, defaults to 0; width of the output before padding is added
    int output_w;
    // param id 21, defaults to output_w
    int output_h;
};

} // namespace ncnn

#endif // LAYER_FOLD_H


================================================
FILE: src/layer/fused_activation.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef FUSED_ACTIVATION_H
#define FUSED_ACTIVATION_H

#include "mat.h"
#include "layer_type.h"

// Applies a fused activation to a single scalar.
//
// activation_type selects the function, activation_params supplies its
// arguments:
//   1  max(v, 0)
//   2  v > 0 ? v : v * params[0]
//   3  clamp v to [params[0], params[1]]
//   4  1 / (1 + exp(-v)), with v first limited to +-88.3762626647949
//   5  v * tanh(log(exp(v) + 1))
//   6  with alpha = params[0], beta = params[1]:
//      0 below -beta/alpha, v above 1/alpha - beta/alpha,
//      v * (v * alpha + beta) in between
// Any other type returns v unchanged.
static NCNN_FORCEINLINE float activation_ss(float v, int activation_type, const ncnn::Mat& activation_params)
{
    if (activation_type == 1)
    {
        return fmaxf(v, 0.f);
    }

    if (activation_type == 2)
    {
        const float slope = activation_params[0];
        return v > 0.f ? v : v * slope;
    }

    if (activation_type == 3)
    {
        const float lo = activation_params[0];
        const float hi = activation_params[1];
        if (v < lo)
            v = lo;
        if (v > hi)
            v = hi;
        return v;
    }

    if (activation_type == 4)
    {
        // keep expf() inside its finite range
        v = std::min(v, 88.3762626647949f);
        v = std::max(v, -88.3762626647949f);
        return 1.f / (1.f + expf(-v));
    }

    if (activation_type == 5)
    {
        return v * tanhf(logf(expf(v) + 1.f));
    }

    if (activation_type == 6)
    {
        const float alpha = activation_params[0];
        const float beta = activation_params[1];
        const float lower = -beta / alpha;
        const float upper = (1.f / alpha) + lower;

        if (v < lower)
            return 0.f;
        if (v > upper)
            return v;
        return v * (v * alpha + beta);
    }

    return v;
}

// Build a standalone CPU layer for a fused activation.
//
// activation_type uses the same numbering as activation_ss():
//   1 ReLU, 2 ReLU with slope, 3 Clip, 4 Sigmoid, 5 Mish, 6 HardSwish.
// The layer gets its parameters loaded and its pipeline created before it is
// returned. Returns 0 for any other activation_type.
static ncnn::Layer* create_activation_layer(int activation_type, const ncnn::Mat& activation_params, const ncnn::Option& opt)
{
    int layer_index = 0;
    ncnn::ParamDict pd;

    switch (activation_type)
    {
    case 1:
        layer_index = ncnn::LayerType::ReLU;
        break;
    case 2:
        layer_index = ncnn::LayerType::ReLU;
        pd.set(0, activation_params[0]); // slope
        break;
    case 3:
        layer_index = ncnn::LayerType::Clip;
        pd.set(0, activation_params[0]); // min
        pd.set(1, activation_params[1]); // max
        break;
    case 4:
        layer_index = ncnn::LayerType::Sigmoid;
        break;
    case 5:
        layer_index = ncnn::LayerType::Mish;
        break;
    case 6:
        layer_index = ncnn::LayerType::HardSwish;
        pd.set(0, activation_params[0]); // alpha
        pd.set(1, activation_params[1]); // beta
        break;
    default:
        // no fused activation requested
        return 0;
    }

    ncnn::Layer* activation = ncnn::create_layer_cpu(layer_index);
    activation->load_param(pd);
    activation->create_pipeline(opt);

    return activation;
}

#endif // FUSED_ACTIVATION_H


================================================
FILE: src/layer/gelu.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "gelu.h"

namespace ncnn {

GELU::GELU()
{
    // element-wise activation: overwrites its single input blob
    support_inplace = true;
    one_blob_only = true;
}

// Read the layer configuration.
//   param 0: fast_gelu (default 0) - non-zero selects the tanh approximation
// Always returns 0.
int GELU::load_param(const ParamDict& pd)
{
    const int use_fast_path = pd.get(0, 0);
    fast_gelu = use_fast_path;

    return 0;
}

// Apply GELU to every element of bottom_top_blob, in place.
// Channels are processed in parallel; each channel is walked as a flat array
// of w * h * d floats. Always returns 0.
int GELU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

    if (!fast_gelu)
    {
        // exact form: y = x * P(X <= x) where X ~ N(0, 1)
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* data = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                const float x = data[i];
                data[i] = 0.5f * x * erfcf(-0.70710678f * x);
            }
        }

        return 0;
    }

    // tanh approximation: y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3)))
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* data = bottom_top_blob.channel(q);

        for (int i = 0; i < size; i++)
        {
            const float x = data[i];
            data[i] = 0.5f * x * (1.0f + tanhf(0.79788452f * (x + 0.044715f * x * x * x)));
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/gelu.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_GELU_H
#define LAYER_GELU_H

#include "layer.h"

namespace ncnn {

// GELU activation layer.
class GELU : public Layer
{
public:
    GELU();

    // Reads the layer configuration from the parameter dictionary.
    virtual int load_param(const ParamDict& pd);

    // Applies the activation to bottom_top_blob in place.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
    // Flag selecting the faster approximate formulation when non-zero.
    // NOTE(review): the exact formulas are in the implementation file - confirm there.
    int fast_gelu;
};

} // namespace ncnn

#endif // LAYER_GELU_H


================================================
FILE: src/layer/gemm.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "gemm.h"

namespace ncnn {

Gemm::Gemm()
{
    // defaults: several input blobs, separate output blob
    support_inplace = false;
    one_blob_only = false;
}

// Read the Gemm configuration and validate it.
//
// Parameter ids: 0 alpha, 1 beta, 2 transA, 3 transB, 4 constantA, 5 constantB,
// 6 constantC, 7 constantM, 8 constantN, 9 constantK,
// 10 constant_broadcast_type_C, 11 output_N1M, 12 output_elempack,
// 13 output_elemtype, 14 output_transpose, 18 int8_scale_term,
// 20/21/22 constant_TILE_M/N/K.
//
// Returns 0 on success, -1 when the combination is not usable (int8 requested
// without NCNN_INT8, missing constant dimensions, bad broadcast type).
// one_blob_only is switched on when exactly one of A/B/C is a runtime input.
int Gemm::load_param(const ParamDict& pd)
{
    alpha = pd.get(0, 1.f);
    beta = pd.get(1, 1.f);

    transA = pd.get(2, 0);
    transB = pd.get(3, 0);

    constantA = pd.get(4, 0);
    constantB = pd.get(5, 0);
    constantC = pd.get(6, 0);

    constantM = pd.get(7, 0);
    constantN = pd.get(8, 0);
    constantK = pd.get(9, 0);

    constant_broadcast_type_C = pd.get(10, 0);

    output_N1M = pd.get(11, 0);
    output_elempack = pd.get(12, 0);
    output_elemtype = pd.get(13, 0);
    output_transpose = pd.get(14, 0);

    int8_scale_term = pd.get(18, 0);

    constant_TILE_M = pd.get(20, 0);
    constant_TILE_N = pd.get(21, 0);
    constant_TILE_K = pd.get(22, 0);

#if !NCNN_INT8
    if (int8_scale_term)
    {
        NCNN_LOGE("please build ncnn with NCNN_INT8 enabled for int8 inference");
        return -1;
    }
#endif

    // a constant matrix needs both of its dimensions to be known up front
    const bool A_dims_missing = constantM == 0 || constantK == 0;
    if (constantA == 1 && A_dims_missing)
    {
        NCNN_LOGE("constantM and constantK must be non-zero when constantA enabled");
        return -1;
    }

    const bool B_dims_missing = constantN == 0 || constantK == 0;
    if (constantB == 1 && B_dims_missing)
    {
        NCNN_LOGE("constantN and constantK must be non-zero when constantB enabled");
        return -1;
    }

    const bool C_broadcast_valid = constant_broadcast_type_C >= -1 && constant_broadcast_type_C <= 4;
    if (constantC == 1 && !C_broadcast_valid)
    {
        NCNN_LOGE("constant_broadcast_type_C must be -1 or 0~4 when constantC enabled");
        return -1;
    }

    // exactly one of A / B / C comes from the graph -> single input blob
    const bool only_A_is_input = constantA == 0 && constantB == 1 && constantC == 1;
    const bool only_B_is_input = constantA == 1 && constantB == 0 && constantC == 1;
    const bool only_C_is_input = constantA == 1 && constantB == 1 && constantC == 0;
    if (only_A_is_input || only_B_is_input || only_C_is_input)
        one_blob_only = true;

    return 0;
}

// Load the constant operands selected by load_param().
//
//   A_data : loaded as (constantK, constantM) when transA == 0, else (constantM, constantK)
//   B_data : loaded as (constantN, constantK) when transB == 0, else (constantK, constantN)
//   C_data : shape chosen by constant_broadcast_type_C (0 scalar, 1 M, 2 1xM,
//            3 NxM, 4 Nx1); nothing is loaded when it is -1
//   int8   : per-row scales for constant A and one scale for constant B
//
// Returns 0 on success and -100 when any expected blob could not be loaded.
// The trailing 0 / 1 argument to mb.load() is the ModelBin type flag; its
// semantics are defined by ModelBin, not here.
int Gemm::load_model(const ModelBin& mb)
{
    if (constantA == 1)
    {
        if (transA == 0)
            A_data = mb.load(constantK, constantM, 0);
        else
            A_data = mb.load(constantM, constantK, 0);
        if (A_data.empty())
            return -100;
    }

    if (constantB == 1)
    {
        if (transB == 0)
            B_data = mb.load(constantN, constantK, 0);
        else
            B_data = mb.load(constantK, constantN, 0);
        if (B_data.empty())
            return -100;
    }

    if (constantC == 1 && constant_broadcast_type_C != -1)
    {
        if (constant_broadcast_type_C == 0)
            C_data = mb.load(1, 0);
        if (constant_broadcast_type_C == 1)
            C_data = mb.load(constantM, 0);
        if (constant_broadcast_type_C == 2)
            C_data = mb.load(1, constantM, 0);
        if (constant_broadcast_type_C == 3)
            C_data = mb.load(constantN, constantM, 0);
        if (constant_broadcast_type_C == 4)
            C_data = mb.load(constantN, 1, 0);
        if (C_data.empty())
            return -100;
    }

#if NCNN_INT8
    if (int8_scale_term)
    {
        if (constantA == 1)
        {
            A_data_int8_scales = mb.load(constantM, 1);
            // a truncated model would otherwise surface later as a null read
            if (A_data_int8_scales.empty())
                return -100;
        }

        if (constantB == 1)
        {
            // check the blob before indexing it: a failed load returns an
            // empty Mat and [0] would dereference a null pointer
            Mat B_scale = mb.load(1, 1);
            if (B_scale.empty())
                return -100;

            B_data_int8_scale = B_scale[0];
        }
    }
#endif // NCNN_INT8

    return 0;
}

static void gemm_transB(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob, float alpha, float beta, int broadcast_type_C, int output_transpose, const Option& opt)
{
    const int M = A.dims == 3 ? A.c : A.h;
    const int N = BT.dims == 3 ? BT.c : BT.h;
    const int K = A.w; // assert A.w == BT.w

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int i = 0; i < M; i++)
    {
        const size_t out_hstep = top_blob.dims == 3 ? top_blob.cstep : (size_t)top_blob.w;

        const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;
        const size_t BT_hstep = BT.dims == 3 ? BT.cstep : (size_t)BT.w;

        const float* ptrA = (const float*)A + i * A_hstep;
        const float* ptrC = C;

        for (int j = 0; j < N; j++)
        {
            const float* ptrBT = (const float*)BT + j * BT_hstep;

            float sum = 0.f;
            if (ptrC)
            {
                if (broadcast_type_C == 0)
                {
                    sum = ptrC[0];
                }
                if (broadcast_type_C == 1)
                {
                    sum = ptrC[i];
                }
                if (broadcast_type_C == 2)
                {
                    sum = ptrC[i];
                }
                if (broadcast_type_C == 3)
                {
                    sum = ptrC[i * N + j];
                }
                if (broadcast_type_C == 4)
                {
                    sum = ptrC[j];
                }

                sum *= beta;
            }

            for (int k = 0; k < K; k++)
            {
                sum += ptrA[k] * ptrBT[k];
            }

            sum *= alpha;

            if (output_transpose)
            {
                top_blob[j * out_hstep + i] = sum;
            }
            else
            {
                top_blob[i * out_hstep + j] = sum;
            }
        }
    }
}

#if NCNN_INT8
// Round v to the nearest integer (halves away from zero) and saturate the
// result to the symmetric int8 range [-127, 127].
//
// The range check is done on the float value *before* converting to int:
// converting an out-of-range or NaN float to int is undefined behavior, and on
// common hardware a huge positive input would otherwise end up as -127.
// NaN maps to 0.
static inline signed char float2int8(float v)
{
    if (v != v) return 0; // NaN

    if (v >= 127.f) return 127;
    if (v <= -127.f) return -127;

    // |v| < 127 here, so the rounded value is representable and within range
    return (signed char)static_cast<int>(round(v));
}

static void gemm_transB_int8(const Mat& A_int8, const Mat& BT_int8, const Mat& A_int8_scales, float BT_int8_scale, const Mat& C, Mat& top_blob, float alpha, float beta, int broadcast_type_C, int output_transpose, const Option& opt)
{
    const int M = A_int8.h;
    const int N = BT_int8.h;
    const int K = A_int8.w; // assert A_int8.w == BT_int8.w

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int i = 0; i < M; i++)
    {
        const size_t out_hstep = top_blob.dims == 3 ? top_blob.cstep : (size_t)top_blob.w;

        const signed char* ptrA = A_int8.row<const signed char>(i);
        const float* ptrC = C;

        const float descale = 1.f / (A_int8_scales[i] * BT_int8_scale);

        for (int j = 0; j < N; j++)
        {
            const signed char* ptrBT = BT_int8.row<const signed char>(j);

            int sum = 0;
            for (int k = 0; k < K; k++)
            {
                sum += ptrA[k] * ptrBT[k];
            }

            float sum_fp32 = sum * descale;

            if (ptrC)
            {
                float c = 0.f;
                if (broadcast_type_C == 0)
                {
                    c = ptrC[0];
                }
                if (broadcast_type_C == 1)
                {
                    c = ptrC[i];
                }
                if (broadcast_type_C == 2)
                {
                    c = ptrC[i];
                }
                if (broadcast_type_C == 3)
                {
                    c = ptrC[i * N + j];
                }
                if (broadcast_type_C == 4)
                {
                    c = ptrC[j];
                }

                sum_fp32 += c * beta;
            }

            sum_fp32 *= alpha;

            if (output_transpose)
            {
                top_blob[j * out_hstep + i] = sum_fp32;
            }
            else
            {
                top_blob[i * out_hstep + j] = sum_fp32;
            }
        }
    }
}
#endif // NCNN_INT8

// Single-blob entry point: wraps the blobs in one-element vectors and defers
// to the multi-blob forward(). The produced output is copied back into
// top_blob and the callee's status code is returned unchanged.
int Gemm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    std::vector<Mat> inputs(1, bottom_blob);
    std::vector<Mat> outputs(1, top_blob);

    const int status = forward(inputs, outputs, opt);

    top_blob = outputs[0];
    return status;
}

// Reference fp32 forward pass.
//
// Picks A / B / C from the constant weights or from bottom_blobs, brings A to
// row-major M x K and B to its transposed N x K form, determines how C is
// broadcast, allocates top_blobs[0] and runs gemm_transB().
// When int8_scale_term is set (and NCNN_INT8 is enabled) the call is forwarded
// to forward_int8() instead.
//
// Returns 0 on success, -100 when an allocation fails.
int Gemm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
#if NCNN_INT8
    if (int8_scale_term)
    {
        return forward_int8(bottom_blobs, top_blobs, opt);
    }
#endif // NCNN_INT8

    // runtime inputs are consumed in A, B, C order, skipping the constant ones
    const Mat& A0 = constantA ? A_data : bottom_blobs[0];
    const Mat& B0 = constantB ? B_data : constantA ? bottom_blobs[0] : bottom_blobs[1];

    size_t elemsize = A0.elemsize;

    Mat A;
    if (transA == 0)
    {
        A = A0;
    }
    else
    {
        // transpose A to row-major
        A.create((A0.dims == 3 ? A0.c : A0.h), A0.w, elemsize, opt.workspace_allocator);
        if (A.empty())
            return -100;

        // for 3-D input each "row" is a channel, so step by cstep
        const size_t A0_hstep = A0.dims == 3 ? A0.cstep : (size_t)A0.w;

        for (int i = 0; i < A.h; i++)
        {
            float* ptr = A.row(i);
            for (int j = 0; j < A.w; j++)
            {
                ptr[j] = A0[j * A0_hstep + i];
            }
        }
    }

    Mat BT;
    if (transB == 0)
    {
        // transpose B to col-major
        BT.create((B0.dims == 3 ? B0.c : B0.h), B0.w, elemsize, opt.workspace_allocator);
        if (BT.empty())
            return -100;

        const size_t B0_hstep = B0.dims == 3 ? B0.cstep : (size_t)B0.w;

        for (int i = 0; i < BT.h; i++)
        {
            float* ptr = BT.row(i);
            for (int j = 0; j < BT.w; j++)
            {
                ptr[j] = B0[j * B0_hstep + i];
            }
        }
    }
    else
    {
        // B was supplied already transposed
        BT = B0;
    }

    const int M = A.dims == 3 ? A.c : A.h;
    const int N = BT.dims == 3 ? BT.c : BT.h;

    Mat C;
    int broadcast_type_C = 0;
    if (constantC)
    {
        C = C_data;
        broadcast_type_C = constant_broadcast_type_C;
    }
    else
    {
        // C, when present, is the last runtime input; it stays empty otherwise
        if (constantA && constantB && bottom_blobs.size() == 1)
        {
            C = bottom_blobs[0];
        }
        else if ((constantA || constantB) && bottom_blobs.size() == 2)
        {
            C = bottom_blobs[1];
        }
        else if (bottom_blobs.size() == 3)
        {
            C = bottom_blobs[2];
        }

        if (!C.empty())
        {
            // the checks below are not exclusive: when several shapes match
            // (e.g. M == N), the last matching one decides the broadcast type
            if (C.dims == 1 && C.w == 1)
            {
                // scalar
                broadcast_type_C = 0;
            }
            if (C.dims == 1 && C.w == M)
            {
                // M
                // auto broadcast from h to w is the ncnn-style convention
                broadcast_type_C = 1;
            }
            if (C.dims == 1 && C.w == N)
            {
                // N
                broadcast_type_C = 4;
            }
            if (C.dims == 2 && C.w == 1 && C.h == M)
            {
                // Mx1
                broadcast_type_C = 2;
            }
            if (C.dims == 2 && C.w == N && C.h == M)
            {
                // MxN
                broadcast_type_C = 3;
            }
            if (C.dims == 2 && C.w == N && C.h == 1)
            {
                // 1xN
                broadcast_type_C = 4;
            }
        }
    }

    // output is N x M (w x h), or M x N when transposed; output_N1M inserts a
    // height of 1 and moves the row count to the channel dimension
    Mat& top_blob = top_blobs[0];
    if (output_transpose)
    {
        if (output_N1M)
            top_blob.create(M, 1, N, elemsize, opt.blob_allocator);
        else
            top_blob.create(M, N, elemsize, opt.blob_allocator);
    }
    else
    {
        if (output_N1M)
            top_blob.create(N, 1, M, elemsize, opt.blob_allocator);
        else
            top_blob.create(N, M, elemsize, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    gemm_transB(A, BT, C, top_blob, alpha, beta, broadcast_type_C, output_transpose, opt);

    return 0;
}

#if NCNN_INT8
// Reference int8 forward pass.
//
// Steps:
//   1. pick A / B from constant weights or bottom_blobs and bring A to
//      row-major M x K
//   2. quantize A per row and B per tensor when they are not already int8
//      (elemsize != 1), using scale = 127 / absmax
//   3. bring B to its transposed N x K form
//   4. determine how C is broadcast, allocate the fp32 output and run
//      gemm_transB_int8()
//
// Returns 0 on success, -100 when an allocation fails.
int Gemm::forward_int8(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    // runtime inputs are consumed in A, B, C order, skipping the constant ones
    const Mat& A0 = constantA ? A_data : bottom_blobs[0];
    const Mat& B0 = constantB ? B_data : constantA ? bottom_blobs[0] : bottom_blobs[1];

    Mat A;
    if (transA == 0)
    {
        A = A0;
    }
    else
    {
        // transpose A to row-major
        if (A0.elemsize == 1)
        {
            // A is already int8
            A.create(A0.h, A0.w, (size_t)1u, 1, opt.workspace_allocator);
            if (A.empty())
                return -100;

            for (int i = 0; i < A.h; i++)
            {
                signed char* ptr = A.row<signed char>(i);
                for (int j = 0; j < A.w; j++)
                {
                    ptr[j] = A0.row<const signed char>(j)[i];
                }
            }
        }
        else
        {
            // A is fp32; for 3-D input each "row" is a channel
            A.create(A0.dims == 3 ? A0.c : A0.h, A0.w, (size_t)4u, 1, opt.workspace_allocator);
            if (A.empty())
                return -100;

            for (int i = 0; i < A.h; i++)
            {
                float* ptr = A.row(i);
                for (int j = 0; j < A.w; j++)
                {
                    ptr[j] = A0.dims == 3 ? A0.channel(j)[i] : A0.row(j)[i];
                }
            }
        }
    }

    // dynamic quantize A
    // starts out aliasing A and the stored scales; both are replaced by fresh
    // buffers below when A still holds floats
    Mat A_int8 = A;
    Mat A_int8_scales = A_data_int8_scales;
    if (A_int8.elemsize != 1)
    {
        A_int8.create(A.w, A.dims == 3 ? A.c : A.h, (size_t)1u, 1, opt.workspace_allocator);
        if (A_int8.empty())
            return -100;
        A_int8_scales.create(A_int8.h, (size_t)4u, 1, opt.workspace_allocator);
        if (A_int8_scales.empty())
            return -100;

        for (int i = 0; i < A_int8.h; i++)
        {
            const size_t A_hstep = A.dims == 3 ? A.cstep : (size_t)A.w;
            const float* ptr = (const float*)A + i * A_hstep;

            // per-row scale
            float absmax = 0.f;
            for (int k = 0; k < A_int8.w; k++)
            {
                absmax = std::max(absmax, (float)fabs(ptr[k]));
            }

            // an all-zero row keeps scale 1 to avoid dividing by zero
            float A_int8_scale = absmax == 0.f ? 1.f : 127.f / absmax;
            A_int8_scales[i] = A_int8_scale;

            signed char* ptrAi = A_int8.row<signed char>(i);

            for (int k = 0; k < A_int8.w; k++)
            {
                ptrAi[k] = float2int8(ptr[k] * A_int8_scale);
            }
        }
    }

    // dynamic quantize B
    Mat B0_int8 = B0;
    float B_int8_scale = B_data_int8_scale;
    if (B0_int8.elemsize != 1)
    {
        B0_int8.create(B0.w, B0.dims == 3 ? B0.c : B0.h, (size_t)1u, 1, opt.workspace_allocator);
        if (B0_int8.empty())
            return -100;

        // first pass: one absmax over the whole of B (per-tensor scale)
        float absmax = 0.f;
        for (int i = 0; i < B0_int8.h; i++)
        {
            const size_t B_hstep = B0.dims == 3 ? B0.cstep : (size_t)B0.w;
            const float* ptr = (const float*)B0 + i * B_hstep;

            for (int k = 0; k < B0_int8.w; k++)
            {
                absmax = std::max(absmax, (float)fabs(ptr[k]));
            }
        }

        B_int8_scale = absmax == 0.f ? 1.f : 127.f / absmax;

        // second pass: quantize with that scale
        for (int i = 0; i < B0_int8.h; i++)
        {
            const size_t B_hstep = B0.dims == 3 ? B0.cstep : (size_t)B0.w;
            const float* ptr = (const float*)B0 + i * B_hstep;

            signed char* ptrBi = B0_int8.row<signed char>(i);

            for (int k = 0; k < B0_int8.w; k++)
            {
                ptrBi[k] = float2int8(ptr[k] * B_int8_scale);
            }
        }
    }

    Mat BT_int8;
    if (transB == 0)
    {
        // transpose B to col-major
        BT_int8.create(B0_int8.h, B0_int8.w, (size_t)1u, 1, opt.workspace_allocator);
        if (BT_int8.empty())
            return -100;

        for (int i = 0; i < BT_int8.h; i++)
        {
            signed char* ptr = BT_int8.row<signed char>(i);
            for (int j = 0; j < BT_int8.w; j++)
            {
                ptr[j] = B0_int8.row<const signed char>(j)[i];
            }
        }
    }
    else
    {
        // B was supplied already transposed
        BT_int8 = B0_int8;
    }

    const int M = A_int8.h;
    const int N = BT_int8.h;

    Mat C;
    int broadcast_type_C = 0;
    if (constantC)
    {
        C = C_data;
        broadcast_type_C = constant_broadcast_type_C;
    }
    else
    {
        // C, when present, is the last runtime input; it stays empty otherwise
        if (constantA && constantB && bottom_blobs.size() == 1)
        {
            C = bottom_blobs[0];
        }
        else if ((constantA || constantB) && bottom_blobs.size() == 2)
        {
            C = bottom_blobs[1];
        }
        else if (bottom_blobs.size() == 3)
        {
            C = bottom_blobs[2];
        }

        if (!C.empty())
        {
            // the checks below are not exclusive: when several shapes match
            // (e.g. M == N), the last matching one decides the broadcast type
            if (C.dims == 1 && C.w == 1)
            {
                // scalar
                broadcast_type_C = 0;
            }
            if (C.dims == 1 && C.w == M)
            {
                // M
                // auto broadcast from h to w is the ncnn-style convention
                broadcast_type_C = 1;
            }
            if (C.dims == 1 && C.w == N)
            {
                // N
                broadcast_type_C = 4;
            }
            if (C.dims == 2 && C.w == 1 && C.h == M)
            {
                // Mx1
                broadcast_type_C = 2;
            }
            if (C.dims == 2 && C.w == N && C.h == M)
            {
                // MxN
                broadcast_type_C = 3;
            }
            if (C.dims == 2 && C.w == N && C.h == 1)
            {
                // 1xN
                broadcast_type_C = 4;
            }
        }
    }

    // the output is always fp32 (4-byte elements)
    Mat& top_blob = top_blobs[0];
    if (output_transpose)
    {
        if (output_N1M)
            top_blob.create(M, 1, N, 4u, opt.blob_allocator);
        else
            top_blob.create(M, N, 4u, opt.blob_allocator);
    }
    else
    {
        if (output_N1M)
            top_blob.create(N, 1, M, 4u, opt.blob_allocator);
        else
            top_blob.create(N, M, 4u, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    gemm_transB_int8(A_int8, BT_int8, A_int8_scales, B_int8_scale, C, top_blob, alpha, beta, broadcast_type_C, output_transpose, opt);

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn


================================================
FILE: src/layer/gemm.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_GEMM_H
#define LAYER_GEMM_H

#include "layer.h"

namespace ncnn {

// General matrix multiplication layer. Each of A, B and C can either be a
// constant stored with the model or a runtime input blob.
class Gemm : public Layer
{
public:
    Gemm();

    // Reads and validates the layer configuration.
    virtual int load_param(const ParamDict& pd);

    // Loads the constant operands (and int8 scales) selected by the configuration.
    virtual int load_model(const ModelBin& mb);

    // Single-blob forward pass.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Multi-blob forward pass.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_INT8
    // Quantized variant of the multi-blob forward pass.
    int forward_int8(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
#endif

public:
    // scaling factors applied to the product and to C
    float alpha;
    float beta;
    // non-zero when the corresponding operand is supplied transposed
    int transA;
    int transB;

    // 1 when the operand is a constant stored with the model
    int constantA;
    int constantB;
    int constantC;
    // problem dimensions; needed for the constant operands
    int constantM;
    int constantN;
    int constantK;
    // how constant C is broadcast over the output; -1 means no C data is stored
    int constant_broadcast_type_C;
    // non-zero to emit a 3-D output with a height of 1
    int output_N1M;
    int output_elempack;
    int output_elemtype; // 0=auto 1=fp32
    // non-zero to write the output transposed
    int output_transpose;

    // non-zero enables the int8 path
    int int8_scale_term;

    // NOTE(review): tile size hints, presumably for optimized implementations - confirm
    int constant_TILE_M;
    int constant_TILE_N;
    int constant_TILE_K;

    // constant A / B / C
    Mat A_data;
    Mat B_data;
    Mat C_data;

#if NCNN_INT8
    // quantization scales for constant A (a blob of values) and constant B (a single value)
    Mat A_data_int8_scales;
    float B_data_int8_scale;
#endif
};

} // namespace ncnn

#endif // LAYER_GEMM_H


================================================
FILE: src/layer/glu.cpp
================================================
// Copyright 2022 Xiaomi Corp.   (author: Fangjun Kuang)
// SPDX-License-Identifier: BSD-3-Clause

#include "glu.h"

namespace ncnn {

GLU::GLU()
{
    // one input blob, separate output blob
    support_inplace = false;
    one_blob_only = true;
}

// Read the layer configuration.
//   param 0: axis (default 0) - the dimension that gets split in two halves
// Always returns 0.
int GLU::load_param(const ParamDict& pd)
{
    const int split_axis = pd.get(0, 0);
    axis = split_axis;

    return 0;
}

int GLU::forward(const Mat& bottom_blob, Mat& top_blob,
                 const Option& opt) const
{
    int dims = bottom_blob.dims;
    int positive_axis = axis < 0 ? dims + axis : axis;

    if (dims == 1)
    {   // ignore axis
        int w = bottom_blob.w;
        int out_w = w / 2;
        top_blob.create(out_w, sizeof(float), opt.blob_allocator);

        const float* in_ptr = bottom_blob;
        float* out_ptr = top_blob;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int x = 0; x < out_w; ++x)
        {
            float sigmoid = 1.f / (1.f + expf(-in_ptr[x + out_w]));

            out_ptr[x] = in_ptr[x] * sigmoid;
        }

        return 0;
    } // if (dims == 1)

    if (dims == 2 && positive_axis == 0)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int out_w = w;
        int out_h = h / 2;
        top_blob.create(out_w, out_h, sizeof(float), opt.blob_allocator);

        int offset = out_w * out_h;

#if 0
        // this one is equivalent to the else branch. It is more readable
        // but less efficient
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < out_h; ++y) {
            const float *in_ptr = bottom_blob.row(y);
            float *out_ptr = top_blob.row(y);

            for (int x = 0; x < w; ++x) {
                float sigmoid =
                    1.f / (1.f + expf(-in_ptr[x + offset]));

                out_ptr[x] = in_ptr[x] * sigmoid;
            }
        }
#else
        int size = offset;
        const float* in_ptr = bottom_blob;
        float* out_ptr = top_blob;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < size; ++i)
        {
            float sigmoid = 1.f / (1.f + expf(-in_ptr[i + offset]));
            out_ptr[i] = in_ptr[i] * sigmoid;
        }
#endif

        return 0;
    } // if (dims == 2 && positive_axis == 0)

    if (dims == 2 && positive_axis == 1)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int out_w = w / 2;
        int out_h = h;

        top_blob.create(out_w, out_h, sizeof(float), opt.blob_allocator);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; ++y)
        {
            const float* in_ptr = bottom_blob.row(y);
            float* out_ptr = top_blob.row(y);

            for (int x = 0; x < out_w; ++x)
            {
                float sigmoid = 1.f / (1.f + expf(-in_ptr[x + out_w]));
                out_ptr[x] = in_ptr[x] * sigmoid;
            }
        }

        return 0;
    } // if (dims == 2 && positive_axis == 1)

    if (dims == 3 && positive_axis == 0)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int c = bottom_blob.c;

        int out_w = w;
        int out_h = h;
        int out_c = c / 2;

        top_blob.create(out_w, out_h, out_c, sizeof(float), opt.blob_allocator);

        size_t offset = out_c * bottom_blob.cstep;
        int size = w * h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < out_c; ++q)
        {
            const float* in_ptr = bottom_blob.channel(q);
            float* out_ptr = top_blob.channel(q);

            for (int i = 0; i < size; ++i)
            {
                float sigmoid = 1.f / (1.f + expf(-in_ptr[i + offset]));
                out_ptr[i] = in_ptr[i] * sigmoid;
            }
        }
        return 0;
    } //   if (dims == 3 && positive_axis == 0) {

    if (dims == 3 && positive_axis == 1)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int c = bottom_blob.c;

        int out_w = w;
        int out_h = h / 2;
        int out_c = c;

        top_blob.create(out_w, out_h, out_c, sizeof(float), opt.blob_allocator);

        int offset = out_h * out_w;
        int size = offset;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < c; ++q)
        {
            const float* in_ptr = bottom_blob.channel(q);
            float* out_ptr = top_blob.channel(q);

            for (int i = 0; i < size; ++i)
            {
                float sigmoid = 1.f / (1.f + expf(-in_ptr[i + offset]));
                out_ptr[i] = in_ptr[i] * sigmoid;
            }
        }
        return 0;
    } // if (dims == 3 && positive_axis == 1)

    if (dims == 3 && positive_axis == 2)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int c = bottom_blob.c;

        int out_w = w / 2;
        int out_h = h;
        int out_c = c;

        top_blob.create(out_w, out_h, out_c, sizeof(float), opt.blob_allocator);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < c; ++q)
        {
            const float* in_ptr = bottom_blob.channel(q);
            float* out_ptr = top_blob.channel(q);
            for (int y = 0; y < h; ++y)
            {
                for (int x = 0; x < out_w; ++x)
                {
                    float sigmoid = 1.f / (1.f + expf(-in_ptr[x + out_w]));
                    out_ptr[x] = in_ptr[x] * sigmoid;
                }
                in_ptr += w;
                out_ptr += out_w;
            }
        }
        return 0;
    } // if (dims == 3 && positive_axis == 2)

    return -100;
}

} // namespace ncnn


================================================
FILE: src/layer/glu.h
================================================
// Copyright 2022 Xiaomi Corp.   (author: Fangjun Kuang)
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_GLU_H
#define LAYER_GLU_H

#include "layer.h"

namespace ncnn {

// Gated linear unit layer.
class GLU : public Layer
{
public:
    GLU();

    // Reads the layer configuration from the parameter dictionary.
    virtual int load_param(const ParamDict& pd);

    // Single-input / single-output forward pass.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob,
                        const Option& opt) const;

public:
    // dimension along which the input is split
    // NOTE(review): handling of negative values is in the implementation file - confirm there
    int axis;
};

} // namespace ncnn

#endif // LAYER_GLU_H


================================================
FILE: src/layer/gridsample.cpp
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "gridsample.h"

namespace ncnn {

GridSample::GridSample()
{
    // takes several input blobs (image and grid) and writes a separate output
    support_inplace = false;
    one_blob_only = false;
}

// Read and validate the layer configuration.
//   param 0: sample_type    (default 1, accepted range 1..3)
//   param 1: padding_mode   (default 1, accepted range 1..3)
//   param 2: align_corner   (default 0)
//   param 3: permute_fusion (default 0)
// Returns 0 on success, -1 when sample_type or padding_mode is out of range.
int GridSample::load_param(const ParamDict& pd)
{
    sample_type = pd.get(0, 1);
    padding_mode = pd.get(1, 1);
    align_corner = pd.get(2, 0);
    permute_fusion = pd.get(3, 0);

    const bool sample_type_ok = sample_type >= 1 && sample_type <= 3;
    if (!sample_type_ok)
    {
        NCNN_LOGE("unsupported sample type %d", sample_type);
        return -1;
    }

    const bool padding_mode_ok = padding_mode >= 1 && padding_mode <= 3;
    if (!padding_mode_ok)
    {
        NCNN_LOGE("unsupported padding mode %d", padding_mode);
        return -1;
    }

    return 0;
}

// Restore normalized location to actual image location
//   When align_corners is true:
//     Normalized location (-1, -1) points to the top-left pixel.
//     Normalized location (1, 1) points to the bottom-right pixel.
//   When align_corners is false [default]:
//     Normalized location (-1, -1) points to the top-left pixel minus half
//     a pixel in both directions, i.e. (-0.5, -0.5) in actual image space.
//     Normalized location (1, 1) points to the bottom-right pixel plus half
//     a pixel in both directions, i.e. (H - 0.5, W - 0.5) in actual image space.
static float grid_sample_unormalize(int w, float coordx, int align_corner)
{
    // Map a normalized coordinate in [-1, 1] onto an axis of length w.
    if (align_corner)
    {
        // -1 and 1 land on the centers of the first and last pixel
        return (coordx + 1) / 2.f * (w - 1);
    }

    // -1 and 1 land half a pixel outside the first and last pixel centers
    return ((coordx + 1) * w - 1) / 2.f;
}

// Clamp x to the closed range [0, border].
static float border_coord(float x, float border)
{
    const float not_below_zero = std::max(x, 0.0f);
    return std::min(border, not_below_zero);
}

// Mirror x once around 0 and once around high:
//   result = high - | |x| - high |
static float reflect_coord(float x, int high)
{
    const float magnitude = fabs(x);
    return high - fabs(magnitude - high);
}

// Apply the padding rule to a source coordinate along an axis of length w.
//   padding_mode 2: clamp to [0, w - 1]
//   padding_mode 3: reflect; pixel centers are the mirror when align_corner is
//                   set, otherwise the pixel edges are, followed by a clamp
//   anything else : the coordinate is returned unchanged
static float compute_coord(float sx, int w, int padding_mode, int align_corner)
{
    if (padding_mode == 2) // border
    {
        return border_coord(sx, w - 1);
    }

    if (padding_mode == 3) // reflection
    {
        if (align_corner)
        {
            return reflect_coord(sx, w - 1);
        }

        // shift by half a pixel so the reflection happens at the pixel edges
        sx = reflect_coord(sx + 0.5, w) - 0.5;
        return border_coord(sx, w - 1);
    }

    return sx;
}

// True when (x, y) addresses a pixel inside the w x h extent of image.
static bool in_bounds(const Mat& image, int x, int y)
{
    const bool x_inside = x >= 0 && x < image.w;
    const bool y_inside = y >= 0 && y < image.h;
    return x_inside && y_inside;
}

// True when (x, y, z) lies inside image; z is checked against image.c.
static bool in_bounds(const Mat& image, int x, int y, int z)
{
    const bool x_inside = x >= 0 && x < image.w;
    const bool y_inside = y >= 0 && y < image.h;
    const bool z_inside = z >= 0 && z < image.c;
    return x_inside && y_inside && z_inside;
}

// Read image(x, y); positions outside the image read as 0.
static float get_value_bounded(const Mat& image, int x, int y)
{
    if (!in_bounds(image, x, y))
        return 0.f;

    return image.row(y)[x];
}

// Read image(x, y, z) via the depth slice z; positions outside the image read as 0.
static float get_value_bounded(const Mat& image, int x, int y, int z)
{
    if (!in_bounds(image, x, y, z))
        return 0.f;

    return image.depth(z).row(y)[x];
}

// Read image(x, y) after running both integer coordinates through the padding rule.
// compute_coord works on floats; its result is converted back to int before the lookup.
static float get_value_bounded(const Mat& image, int x, int y, int padding_mode, int align_corner)
{
    const int px = (int)compute_coord(x, image.w, padding_mode, align_corner);
    const int py = (int)compute_coord(y, image.h, padding_mode, align_corner);

    return get_value_bounded(image, px, py);
}

// Compute the four cubic convolution weights (A = -0.75) for a fractional offset fx.
// coeffs[0..3] weight the taps at offsets -1, 0, +1, +2 relative to the base sample;
// the last weight is derived so that all four sum to 1.
static inline void interpolate_cubic(float fx, float* coeffs)
{
    const float A = -0.75f;

    // products of A that the polynomials need; every value is exact in float
    const float A4 = -3.0f;        // 4 * A
    const float A5 = -3.75f;       // 5 * A
    const float A8 = -6.0f;        // 8 * A
    const float A_plus_2 = 1.25f;  // A + 2
    const float A_plus_3 = 2.25f;  // A + 3

    // distances from the sample position to the first three taps
    const float t0 = fx + 1;
    const float t1 = fx;
    const float t2 = 1 - fx;

    const float w0 = A * t0 * t0 * t0 - A5 * t0 * t0 + A8 * t0 - A4;
    const float w1 = A_plus_2 * t1 * t1 * t1 - A_plus_3 * t1 * t1 + 1;
    const float w2 = A_plus_2 * t2 * t2 * t2 - A_plus_3 * t2 * t2 + 1;

    coeffs[0] = w0;
    coeffs[1] = w1;
    coeffs[2] = w2;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}

// Sample bottom_blobs[0] at the locations stored in bottom_blobs[1] (the grid)
// and write the result to top_blobs[0].
//
// Grid layout:
//   permute_fusion == 0: coordinates are interleaved along grid.w and read as
//                        (x, y) pairs when dims == 3, (x, y, z) triples when dims == 4
//   permute_fusion != 0: every coordinate has its own grid channel (0 = x, 1 = y, 2 = z)
//
// Returns 0 on success, -100 when an allocation fails and -1 for bicubic
// sampling with dims == 4. Inputs whose dims is neither 3 nor 4 fall through
// and return 0 without creating top_blob.
int GridSample::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& grid = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;

    if (dims == 3)
    {
        int outw = permute_fusion == 0 ? grid.h : grid.w;
        int outh = permute_fusion == 0 ? grid.c : grid.h;

        top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator);

        // One plane of unnormalized coordinates per axis (x, y).
        // The plane count must not be taken from grid.c: with permute_fusion == 0
        // grid.c is outh, so a single-row grid would provide only one plane while
        // channel(1) is written below.
        Mat offset_blob;
        offset_blob.create(outw, outh, 2, elemsize, opt.workspace_allocator);

        if (top_blob.empty() || offset_blob.empty())
            return -100;

        //pre-calculate all interpolation offsets for each x y, unpack grid on-the-fly
        if (permute_fusion == 0)
        {
            float* offsetptr_x = offset_blob.channel(0);
            float* offsetptr_y = offset_blob.channel(1);

            for (int y = 0; y < outh; y++)
            {
                // one grid channel per output row, holding interleaved (x, y) pairs
                const float* gridptr = grid.channel(y);
                for (int x = 0; x < outw; x++)
                {
                    float sample_x = gridptr[0];
                    float sample_y = gridptr[1];

                    sample_x = grid_sample_unormalize(w, sample_x, align_corner);
                    sample_y = grid_sample_unormalize(h, sample_y, align_corner);

                    *offsetptr_x = sample_x;
                    *offsetptr_y = sample_y;

                    gridptr += 2;
                    offsetptr_x++;
                    offsetptr_y++;
                }
            }
        }
        else
        {
            const float* gridptr_x = grid.channel(0);
            const float* gridptr_y = grid.channel(1);
            float* offsetptr_x = offset_blob.channel(0);
            float* offsetptr_y = offset_blob.channel(1);

            for (int y = 0; y < outh; y++)
            {
                for (int x = 0; x < outw; x++)
                {
                    float sample_x = *gridptr_x;
                    float sample_y = *gridptr_y;

                    sample_x = grid_sample_unormalize(w, sample_x, align_corner);
                    sample_y = grid_sample_unormalize(h, sample_y, align_corner);

                    *offsetptr_x = sample_x;
                    *offsetptr_y = sample_y;

                    gridptr_x++;
                    gridptr_y++;
                    offsetptr_x++;
                    offsetptr_y++;
                }
            }
        }

        if (sample_type == Interpolation_BILINEAR) // bilinear
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat image = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);
                const float* offsetptr_x = offset_blob.channel(0);
                const float* offsetptr_y = offset_blob.channel(1);

                for (int y = 0; y < outh; y++)
                {
                    for (int x = 0; x < outw; x++)
                    {
                        float sample_x = *offsetptr_x;
                        float sample_y = *offsetptr_y;

                        // bilinear interpolate
                        float v;
                        {
                            // padding is applied to the sample position itself;
                            // taps that still fall outside read as zero
                            sample_x = compute_coord(sample_x, w, padding_mode, align_corner);
                            sample_y = compute_coord(sample_y, h, padding_mode, align_corner);
                            int x0 = floor(sample_x);
                            int y0 = floor(sample_y);
                            int x1 = x0 + 1;
                            int y1 = y0 + 1;

                            float v00 = get_value_bounded(image, x0, y0);
                            float v01 = get_value_bounded(image, x1, y0);
                            float v10 = get_value_bounded(image, x0, y1);
                            float v11 = get_value_bounded(image, x1, y1);

                            float alpha = sample_x - x0;
                            float beta = sample_y - y0;

                            float v0 = v00 * (1 - alpha) + v01 * alpha;
                            float v1 = v10 * (1 - alpha) + v11 * alpha;

                            v = v0 * (1 - beta) + v1 * beta;
                        }

                        outptr[0] = v;
                        outptr += 1;

                        offsetptr_x++;
                        offsetptr_y++;
                    }
                }
            }
        }
        else if (sample_type == Interpolation_NEAREST) // nearest
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat image = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);
                const float* offsetptr_x = offset_blob.channel(0);
                const float* offsetptr_y = offset_blob.channel(1);

                for (int y = 0; y < outh; y++)
                {
                    for (int x = 0; x < outw; x++)
                    {
                        float sample_x = *offsetptr_x;
                        float sample_y = *offsetptr_y;
                        sample_x = compute_coord(sample_x, w, padding_mode, align_corner);
                        sample_y = compute_coord(sample_y, h, padding_mode, align_corner);

                        // round to the nearest integer position
                        int x0 = static_cast<int>(floor(sample_x + 0.5f));
                        int y0 = static_cast<int>(floor(sample_y + 0.5f));

                        float v = get_value_bounded(image, x0, y0);

                        outptr[0] = v;
                        outptr += 1;

                        offsetptr_x++;
                        offsetptr_y++;
                    }
                }
            }
        }
        else if (sample_type == Interpolation_BICUBIC) // bicubic
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat image = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);
                const float* offsetptr_x = offset_blob.channel(0);
                const float* offsetptr_y = offset_blob.channel(1);

                for (int y = 0; y < outh; y++)
                {
                    for (int x = 0; x < outw; x++)
                    {
                        float sample_x = *offsetptr_x;
                        float sample_y = *offsetptr_y;

                        // bicubic interpolate
                        float v;
                        {
                            // 4x4 neighbourhood around the sample; the padding rule is
                            // applied per tap inside get_value_bounded, not to the
                            // sample position
                            int x1 = (int)floorf(sample_x);
                            int y1 = (int)floorf(sample_y);
                            int x0 = x1 - 1;
                            int y0 = y1 - 1;
                            int x2 = x1 + 1;
                            int y2 = y1 + 1;
                            int x3 = x1 + 2;
                            int y3 = y1 + 2;

                            float v00 = get_value_bounded(image, x0, y0, padding_mode, align_corner);
                            float v01 = get_value_bounded(image, x1, y0, padding_mode, align_corner);
                            float v02 = get_value_bounded(image, x2, y0, padding_mode, align_corner);
                            float v03 = get_value_bounded(image, x3, y0, padding_mode, align_corner);
                            float v10 = get_value_bounded(image, x0, y1, padding_mode, align_corner);
                            float v11 = get_value_bounded(image, x1, y1, padding_mode, align_corner);
                            float v12 = get_value_bounded(image, x2, y1, padding_mode, align_corner);
                            float v13 = get_value_bounded(image, x3, y1, padding_mode, align_corner);
                            float v20 = get_value_bounded(image, x0, y2, padding_mode, align_corner);
                            float v21 = get_value_bounded(image, x1, y2, padding_mode, align_corner);
                            float v22 = get_value_bounded(image, x2, y2, padding_mode, align_corner);
                            float v23 = get_value_bounded(image, x3, y2, padding_mode, align_corner);
                            float v30 = get_value_bounded(image, x0, y3, padding_mode, align_corner);
                            float v31 = get_value_bounded(image, x1, y3, padding_mode, align_corner);
                            float v32 = get_value_bounded(image, x2, y3, padding_mode, align_corner);
                            float v33 = get_value_bounded(image, x3, y3, padding_mode, align_corner);

                            float x_coeffs[4];
                            float y_coeffs[4];
                            interpolate_cubic(sample_x - x1, x_coeffs);
                            interpolate_cubic(sample_y - y1, y_coeffs);

                            // blend along x for each of the four rows, then along y
                            float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3];
                            float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3];
                            float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3];
                            float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3];

                            v = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3];
                        }

                        outptr[0] = v;
                        outptr += 1;

                        offsetptr_x++;
                        offsetptr_y++;
                    }
                }
            }
        }
    }

    if (dims == 4)
    {
        int outw = permute_fusion == 0 ? grid.h : grid.w;
        int outh = permute_fusion == 0 ? grid.d : grid.h;
        int outd = permute_fusion == 0 ? grid.c : grid.d;

        top_blob.create(outw, outh, outd, channels, elemsize, opt.blob_allocator);

        // One volume of coordinates per axis (x, y, z).
        // The count must not be taken from grid.c: with permute_fusion == 0
        // grid.c is outd, which may be smaller than 3 while channel(2) is
        // written below.
        Mat offset_blob;
        offset_blob.create(outw, outh, outd, 3, elemsize, opt.workspace_allocator);

        if (top_blob.empty() || offset_blob.empty())
            return -100;

        //pre-calculate all interpolation offsets for each x y, unpack grid on-the-fly
        // (unlike the dims == 3 path, the padding rule is already applied here)
        if (permute_fusion == 0)
        {
            float* offsetptr_x = offset_blob.channel(0);
            float* offsetptr_y = offset_blob.channel(1);
            float* offsetptr_z = offset_blob.channel(2);

            for (int z = 0; z < outd; z++)
            {
                // one grid channel per output depth slice, holding interleaved (x, y, z) triples
                const float* gridptr = grid.channel(z);
                for (int y = 0; y < outh; y++)
                {
                    for (int x = 0; x < outw; x++)
                    {
                        float sample_x = gridptr[0];
                        float sample_y = gridptr[1];
                        float sample_z = gridptr[2];

                        sample_x = grid_sample_unormalize(w, sample_x, align_corner);
                        sample_x = compute_coord(sample_x, w, padding_mode, align_corner);

                        sample_y = grid_sample_unormalize(h, sample_y, align_corner);
                        sample_y = compute_coord(sample_y, h, padding_mode, align_corner);

                        sample_z = grid_sample_unormalize(d, sample_z, align_corner);
                        sample_z = compute_coord(sample_z, d, padding_mode, align_corner);

                        *offsetptr_x = sample_x;
                        *offsetptr_y = sample_y;
                        *offsetptr_z = sample_z;

                        gridptr += 3;
                        offsetptr_x++;
                        offsetptr_y++;
                        offsetptr_z++;
                    }
                }
            }
        }
        else
        {
            const float* gridptr_x = grid.channel(0);
            const float* gridptr_y = grid.channel(1);
            const float* gridptr_z = grid.channel(2);
            float* offsetptr_x = offset_blob.channel(0);
            float* offsetptr_y = offset_blob.channel(1);
            float* offsetptr_z = offset_blob.channel(2);

            for (int z = 0; z < outd; z++)
            {
                for (int y = 0; y < outh; y++)
                {
                    for (int x = 0; x < outw; x++)
                    {
                        float sample_x = *gridptr_x;
                        float sample_y = *gridptr_y;
                        float sample_z = *gridptr_z;

                        sample_x = grid_sample_unormalize(w, sample_x, align_corner);
                        sample_x = compute_coord(sample_x, w, padding_mode, align_corner);

                        sample_y = grid_sample_unormalize(h, sample_y, align_corner);
                        sample_y = compute_coord(sample_y, h, padding_mode, align_corner);

                        sample_z = grid_sample_unormalize(d, sample_z, align_corner);
                        sample_z = compute_coord(sample_z, d, padding_mode, align_corner);

                        *offsetptr_x = sample_x;
                        *offsetptr_y = sample_y;
                        *offsetptr_z = sample_z;

                        gridptr_x++;
                        gridptr_y++;
                        gridptr_z++;
                        offsetptr_x++;
                        offsetptr_y++;
                        offsetptr_z++;
                    }
                }
            }
        }

        if (sample_type == Interpolation_BILINEAR) // bilinear
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat image = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);
                const float* offsetptr_x = offset_blob.channel(0);
                const float* offsetptr_y = offset_blob.channel(1);
                const float* offsetptr_z = offset_blob.channel(2);

                for (int z = 0; z < outd; z++)
                {
                    for (int y = 0; y < outh; y++)
                    {
                        for (int x = 0; x < outw; x++)
                        {
                            float sample_x = *offsetptr_x;
                            float sample_y = *offsetptr_y;
                            float sample_z = *offsetptr_z;

                            // bilinear interpolate
                            float v;
                            {
                                int x0 = (int)floor(sample_x);
                                int y0 = (int)floor(sample_y);
                                int z0 = (int)floor(sample_z);
                                int x1 = x0 + 1;
                                int y1 = y0 + 1;
                                int z1 = z0 + 1;

                                // the eight corners of the enclosing cell, named v<z><y><x>
                                float v000 = get_value_bounded(image, x0, y0, z0);
                                float v001 = get_value_bounded(image, x1, y0, z0);
                                float v010 = get_value_bounded(image, x0, y1, z0);
                                float v011 = get_value_bounded(image, x1, y1, z0);
                                float v100 = get_value_bounded(image, x0, y0, z1);
                                float v101 = get_value_bounded(image, x1, y0, z1);
                                float v110 = get_value_bounded(image, x0, y1, z1);
                                float v111 = get_value_bounded(image, x1, y1, z1);

                                float alpha = sample_x - x0;
                                float beta = sample_y - y0;
                                float gamma = sample_z - z0;

                                // reduce along x, then y, then z
                                float v00 = v000 * (1 - alpha) + v001 * alpha;
                                float v01 = v010 * (1 - alpha) + v011 * alpha;
                                float v10 = v100 * (1 - alpha) + v101 * alpha;
                                float v11 = v110 * (1 - alpha) + v111 * alpha;

                                float v0 = v00 * (1 - beta) + v01 * beta;
                                float v1 = v10 * (1 - beta) + v11 * beta;

                                v = v0 * (1 - gamma) + v1 * gamma;
                            }

                            outptr[0] = v;
                            outptr += 1;

                            offsetptr_x++;
                            offsetptr_y++;
                            offsetptr_z++;
                        }
                    }
                }
            }
        }
        else if (sample_type == Interpolation_NEAREST) // nearest
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat image = bottom_blob.channel(q);
                float* outptr = top_blob.channel(q);
                const float* offsetptr_x = offset_blob.channel(0);
                const float* offsetptr_y = offset_blob.channel(1);
                const float* offsetptr_z = offset_blob.channel(2);

                for (int z = 0; z < outd; z++)
                {
                    for (int y = 0; y < outh; y++)
                    {
                        for (int x = 0; x < outw; x++)
                        {
                            float sample_x = *offsetptr_x;
                            float sample_y = *offsetptr_y;
                            float sample_z = *offsetptr_z;

                            // round to the nearest integer position
                            int x0 = static_cast<int>(floor(sample_x + 0.5f));
                            int y0 = static_cast<int>(floor(sample_y + 0.5f));
                            int z0 = static_cast<int>(floor(sample_z + 0.5f));

                            float v = get_value_bounded(image, x0, y0, z0);

                            outptr[0] = v;
                            outptr += 1;

                            offsetptr_x++;
                            offsetptr_y++;
                            offsetptr_z++;
                        }
                    }
                }
            }
        }
        else if (sample_type == Interpolation_BICUBIC)
        {
            NCNN_LOGE("unsupported bicubic when dims == 4");
            return -1;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/gridsample.h
================================================
// Copyright 2023 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_GRIDSAMPLE_H
#define LAYER_GRIDSAMPLE_H

#include "layer.h"

namespace ncnn {

// Layer that samples its first input blob at the positions given by its
// second input blob (the grid).
class GridSample : public Layer
{
public:
    GridSample();

    // Read sample_type, padding_mode, align_corner and permute_fusion from pd.
    virtual int load_param(const ParamDict& pd);

    // bottom_blobs[0] is the blob to sample, bottom_blobs[1] the grid;
    // the result is written to top_blobs[0].
    // Returns 0 on success, a negative value on failure.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

    // values accepted by sample_type
    enum InterpolationMode // 1=bilinear  2=nearest  3=bicubic
    {
        Interpolation_BILINEAR = 1,
        Interpolation_NEAREST = 2,
        Interpolation_BICUBIC = 3
    };

    // values accepted by padding_mode
    enum PaddingMode // 1=zeros     2=border   3=reflection
    {
        Padding_ZEROS = 1,
        Padding_BORDER = 2,
        Padding_REFLECTION = 3
    };

public:
    // param
    int sample_type;  // 1=bilinear  2=nearest  3=bicubic
    int padding_mode; // 1=zeros     2=border   3=reflection
    // non-zero: normalized -1 / 1 map to the centers of the first / last pixel;
    // zero: they map half a pixel outside of them
    int align_corner;

    // 0: grid coordinates are interleaved along the grid's w axis;
    // otherwise every coordinate is stored in its own grid channel
    int permute_fusion;
};

} // namespace ncnn

#endif // LAYER_GRIDSAMPLE_H


================================================
FILE: src/layer/groupnorm.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "groupnorm.h"

namespace ncnn {

// GroupNorm takes a single blob and normalizes it in place.
GroupNorm::GroupNorm()
{
    support_inplace = true;
    one_blob_only = true;
}

// Read the layer parameters from pd. Always returns 0.
//   id 0: group    (default 1)
//   id 1: channels (default 0)
//   id 2: eps      (default 0.001)
//   id 3: affine   (default 1)
int GroupNorm::load_param(const ParamDict& pd)
{
    // grouping
    group = pd.get(0, 1);
    channels = pd.get(1, 0);

    // normalization
    eps = pd.get(2, 0.001f);
    affine = pd.get(3, 1);

    return 0;
}

// Load the per-channel gamma and beta vectors when affine is enabled.
// Returns 0 on success and -100 when either vector could not be loaded.
int GroupNorm::load_model(const ModelBin& mb)
{
    // nothing to load without the affine transform
    if (!affine)
    {
        return 0;
    }

    // scale, one value per channel
    gamma_data = mb.load(channels, 1);
    if (gamma_data.empty())
    {
        return -100;
    }

    // shift, one value per channel
    beta_data = mb.load(channels, 1);
    if (beta_data.empty())
    {
        return -100;
    }

    return 0;
}

// Normalize one group in place.
//   ptr        first element of the group
//   gamma_ptr  per-channel scale, or 0
//   beta_ptr   per-channel shift, or 0 (scale and shift are applied only when both are given)
//   eps        added to the variance before the square root
//   channels   number of channels in the group
//   size       elements per channel
//   cstep      distance in elements between the starts of consecutive channels
static void groupnorm(float* ptr, const float* gamma_ptr, const float* beta_ptr, float eps, int channels, int size, size_t cstep)
{
    const int count = channels * size;

    // pass 1: mean over the whole group
    float total = 0.f;
    for (int c = 0; c < channels; c++)
    {
        const float* chan = ptr + cstep * c;
        for (int k = 0; k < size; k++)
        {
            total += chan[k];
        }
    }

    const float mean = total / count;

    // pass 2: variance around that mean
    float sq_total = 0.f;
    for (int c = 0; c < channels; c++)
    {
        const float* chan = ptr + cstep * c;
        for (int k = 0; k < size; k++)
        {
            const float centered = chan[k] - mean;
            sq_total += centered * centered;
        }
    }

    const float var = sq_total / count;

    // (x - mean) / sqrt(var + eps) rewritten as x * a + b
    const float a = 1.f / sqrtf(var + eps);
    const float b = -mean * a;

    const bool with_affine = gamma_ptr && beta_ptr;

    // pass 3: write the normalized values back
    for (int c = 0; c < channels; c++)
    {
        float* chan = ptr + cstep * c;

        if (with_affine)
        {
            const float gamma = gamma_ptr[c];
            const float beta = beta_ptr[c];
            for (int k = 0; k < size; k++)
            {
                chan[k] = (chan[k] * a + b) * gamma + beta;
            }
        }
        else
        {
            for (int k = 0; k < size; k++)
            {
                chan[k] = chan[k] * a + b;
            }
        }
    }
}

// Normalize bottom_top_blob in place, one group at a time.
// The blob is cut into `group` slices of channels / group entries along its
// outermost axis (elements for dims 1, rows for dims 2, channels for dims 3 and 4)
// and each slice is normalized independently. Always returns 0.
int GroupNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels_g = channels / group;

    switch (bottom_top_blob.dims)
    {
    case 1:
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            const int first = g * channels_g;
            Mat slice = bottom_top_blob.range(first, channels_g);

            const float* gamma_ptr = 0;
            const float* beta_ptr = 0;
            if (affine)
            {
                gamma_ptr = (const float*)gamma_data + first;
                beta_ptr = (const float*)beta_data + first;
            }

            // every "channel" is a single element
            groupnorm(slice, gamma_ptr, beta_ptr, eps, channels_g, 1, 1);
        }
        break;
    }
    case 2:
    {
        const int w = bottom_top_blob.w;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            const int first = g * channels_g;
            Mat slice = bottom_top_blob.row_range(first, channels_g);

            const float* gamma_ptr = 0;
            const float* beta_ptr = 0;
            if (affine)
            {
                gamma_ptr = (const float*)gamma_data + first;
                beta_ptr = (const float*)beta_data + first;
            }

            // every "channel" is one row of w elements
            groupnorm(slice, gamma_ptr, beta_ptr, eps, channels_g, w, w);
        }
        break;
    }
    case 3:
    case 4:
    {
        const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;
        const size_t cstep = bottom_top_blob.cstep;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            const int first = g * channels_g;
            Mat slice = bottom_top_blob.channel_range(first, channels_g);

            const float* gamma_ptr = 0;
            const float* beta_ptr = 0;
            if (affine)
            {
                gamma_ptr = (const float*)gamma_data + first;
                beta_ptr = (const float*)beta_data + first;
            }

            groupnorm(slice, gamma_ptr, beta_ptr, eps, channels_g, size, cstep);
        }
        break;
    }
    default:
        break;
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/groupnorm.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_GROUPNORM_H
#define LAYER_GROUPNORM_H

#include "layer.h"

namespace ncnn {

// Group normalization layer: the input is cut into `group` slices along its
// outermost axis and each slice is normalized with its own mean and variance.
class GroupNorm : public Layer
{
public:
    GroupNorm();

    // Read group, channels, eps and affine from pd.
    virtual int load_param(const ParamDict& pd);

    // Load gamma_data and beta_data when affine is non-zero.
    virtual int load_model(const ModelBin& mb);

    // Normalize bottom_top_blob in place.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
    // param
    int group;    // number of groups (param id 0, default 1)
    int channels; // total channel count, split evenly across the groups (param id 1, default 0)
    float eps;    // added to the variance before the square root (param id 2, default 0.001)
    int affine;   // non-zero: apply per-channel gamma and beta (param id 3, default 1)

    // model
    Mat gamma_data; // per-channel scale, `channels` values
    Mat beta_data;  // per-channel shift, `channels` values
};

} // namespace ncnn

#endif // LAYER_GROUPNORM_H


================================================
FILE: src/layer/gru.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "gru.h"

namespace ncnn {

// GRU works on blob vectors and never runs in place.
GRU::GRU()
{
    support_inplace = false;
    one_blob_only = false;
}

// Read the layer parameters from pd.
//   id 0: num_output
//   id 1: weight_data_size
//   id 2: direction
//   id 8: int8_scale_term
// Returns 0 on success, -1 when int8 weights are requested but the library
// was built without NCNN_INT8.
int GRU::load_param(const ParamDict& pd)
{
    num_output = pd.get(0, 0);
    weight_data_size = pd.get(1, 0);
    direction = pd.get(2, 0);
    int8_scale_term = pd.get(8, 0);

#if !NCNN_INT8
    if (int8_scale_term)
    {
        NCNN_LOGE("please build ncnn with NCNN_INT8 enabled for int8 inference");
        return -1;
    }
#endif

    return 0;
}

// Load the input weights, the biases and the hidden weights, plus the int8
// scales when int8_scale_term is set and NCNN_INT8 is enabled.
// Returns 0 on success and -100 when one of the weight blobs could not be loaded.
int GRU::load_model(const ModelBin& mb)
{
    // direction 2 stores two sets of weights, anything else one
    const int num_directions = (direction == 2) ? 2 : 1;

    // three gates per output
    const int gate_rows = num_output * 3;

    // input width, recovered from the total input weight count
    const int size = weight_data_size / num_directions / num_output / 3;

    // raw weight data
    weight_xc_data = mb.load(size, gate_rows, num_directions, 0);
    if (weight_xc_data.empty())
    {
        return -100;
    }

    // four bias rows per direction
    bias_c_data = mb.load(num_output, 4, num_directions, 0);
    if (bias_c_data.empty())
    {
        return -100;
    }

    weight_hc_data = mb.load(num_output, gate_rows, num_directions, 0);
    if (weight_hc_data.empty())
    {
        return -100;
    }

#if NCNN_INT8
    if (int8_scale_term)
    {
        // one scale per weight row and direction
        weight_xc_data_int8_scales = mb.load(gate_rows, num_directions, 1);
        weight_hc_data_int8_scales = mb.load(gate_rows, num_directions, 1);
    }
#endif // NCNN_INT8

    return 0;
}

// Run a float GRU over every row of bottom_blob for one direction.
//   bottom_blob   one input vector per row (w = input size, h = time steps)
//   top_blob      receives one hidden vector per row, at the same row index as its input
//   reverse       non-zero walks the rows from last to first
//   weight_xc     input weights, rows ordered reset | update | new
//   bias_c        rows 0..3: reset, update, new (input side), new (hidden side)
//   weight_hc     hidden weights, rows ordered reset | update | new
//   hidden_state  read as the initial state and updated after every step
// Returns 0 on success, -100 when the scratch buffer cannot be allocated.
static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
{
    const int size = bottom_blob.w;
    const int T = bottom_blob.h;
    const int num_output = top_blob.w;

    // per output: [0] = update gate, [1] = new gate
    Mat gates(2, num_output, 4u, opt.workspace_allocator);
    if (gates.empty())
        return -100;

    // unroll
    for (int step = 0; step < T; step++)
    {
        const int ti = reverse ? T - 1 - step : step;

        const float* x = bottom_blob.row(ti);

        // bias rows are shared by all outputs of this step
        const float* bias_reset = bias_c.row(0);
        const float* bias_update = bias_c.row(1);
        const float* bias_new_x = bias_c.row(2);
        const float* bias_new_h = bias_c.row(3);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < num_output; q++)
        {
            const float* wx_reset = weight_xc.row(num_output * 0 + q);
            const float* wx_update = weight_xc.row(num_output * 1 + q);
            const float* wx_new = weight_xc.row(num_output * 2 + q);
            const float* wh_reset = weight_hc.row(num_output * 0 + q);
            const float* wh_update = weight_hc.row(num_output * 1 + q);
            const float* wh_new = weight_hc.row(num_output * 2 + q);

            // gate reset update: bias, then input contribution, then hidden contribution
            float R = bias_reset[q];
            float U = bias_update[q];

            for (int i = 0; i < size; i++)
            {
                const float xi = x[i];

                R += wx_reset[i] * xi;
                U += wx_update[i] * xi;
            }

            for (int i = 0; i < num_output; i++)
            {
                const float h_prev = hidden_state[i];

                R += wh_reset[i] * h_prev;
                U += wh_update[i] * h_prev;
            }

            // sigmoid(R)
            // sigmoid(U)
            R = 1.f / (1.f + expf(-R));
            U = 1.f / (1.f + expf(-U));

            // gate new: the hidden part is accumulated first so that it can be
            // scaled by the reset gate before the input part is added
            float N = bias_new_h[q];

            for (int i = 0; i < num_output; i++)
            {
                const float h_prev = hidden_state[i];

                N += wh_new[i] * h_prev;
            }

            N = bias_new_x[q] + R * N;

            for (int i = 0; i < size; i++)
            {
                const float xi = x[i];

                N += wx_new[i] * xi;
            }

            // tanh(N)
            N = tanhf(N);

            float* gate_pair = gates.row(q);
            gate_pair[0] = U;
            gate_pair[1] = N;
        }

        // h_t := (1 - update) .* new + update .* h_{t-1}
        // done in a second loop because the gate loop reads the whole previous hidden state
        float* output_data = top_blob.row(ti);
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < num_output; q++)
        {
            const float* gate_pair = gates.row(q);

            const float U = gate_pair[0];
            const float N = gate_pair[1];

            const float H = (1 - U) * N + U * hidden_state[q];

            hidden_state[q] = H;
            output_data[q] = H;
        }
    }

    return 0;
}

#if NCNN_INT8
// Run one GRU direction over the whole sequence with int8 weights.
//
// Activations (the input rows and the hidden state) are quantized dynamically
// to int8 with a per-row absmax scale, the dot products are accumulated in
// int32, and the sums are descaled back to float before bias/activation.
//
//   bottom_blob            float input, one row of `size` values per time step
//   top_blob               float output, row ti receives the hidden state of step ti
//   reverse                non-zero walks the sequence from the last row to the first
//   weight_xc_int8         input weights, rows num_output*{0,1,2}+q = reset/update/new gate
//   weight_xc_int8_scales  per-row quantization scales of weight_xc_int8
//   bias_c                 4 rows: reset, update, new-gate input bias, new-gate hidden bias
//   weight_hc_int8         hidden weights, same row layout as weight_xc_int8
//   weight_hc_int8_scales  per-row quantization scales of weight_hc_int8
//   hidden_state           in: initial state, out: state after the last processed step
//
// Returns 0 on success, -100 when a workspace allocation fails.
static int gru_int8(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc_int8, const float* weight_xc_int8_scales, const Mat& bias_c, const Mat& weight_hc_int8, const float* weight_hc_int8_scales, Mat& hidden_state, const Option& opt)
{
    int size = bottom_blob.w;
    int T = bottom_blob.h;

    int num_output = top_blob.w;

    // 2 x num_output
    Mat gates(2, num_output, 4u, opt.workspace_allocator);
    if (gates.empty())
        return -100;

    // the same quantization options are used for the input and for every hidden state
    Option opt_quant = opt;
    opt_quant.blob_allocator = opt.workspace_allocator;
    opt_quant.use_packing_layout = false;

    // dynamic quantize bottom_blob
    Mat bottom_blob_int8(size, T, (size_t)1u, 1, opt.workspace_allocator);
    Mat bottom_blob_int8_scales(T, (size_t)4u, 1, opt.workspace_allocator);
    if (bottom_blob_int8.empty() || bottom_blob_int8_scales.empty())
        return -100;
    {
        for (int t = 0; t < T; t++)
        {
            const float* x = bottom_blob.row(t);

            float absmax = 0.f;
            for (int i = 0; i < size; i++)
            {
                absmax = std::max(absmax, (float)fabs(x[i]));
            }

            // an all-zero row would give 127 / 0 = inf and turn 0 * inf into NaN
            // during quantization; any finite scale maps such a row to zeros
            bottom_blob_int8_scales[t] = absmax == 0.f ? 1.f : 127.f / absmax;
        }

        quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_quant);
        if (bottom_blob_int8.empty())
            return -100;
    }

    Mat hidden_state_int8(num_output, (size_t)1u, 1, opt.workspace_allocator);
    Mat hidden_state_int8_scales(1, (size_t)4u, 1, opt.workspace_allocator);
    if (hidden_state_int8.empty() || hidden_state_int8_scales.empty())
        return -100;

    // unroll
    for (int t = 0; t < T; t++)
    {
        int ti = reverse ? T - 1 - t : t;

        // dynamic quantize hidden_state
        {
            float absmax = 0.f;
            for (int i = 0; i < num_output; i++)
            {
                absmax = std::max(absmax, (float)fabs(hidden_state[i]));
            }

            if (absmax == 0.f)
            {
                // nothing to scale, keep the scale finite and the data zero
                hidden_state_int8_scales[0] = 1.f;
                hidden_state_int8.fill<signed char>(0);
            }
            else
            {
                hidden_state_int8_scales[0] = 127.f / absmax;

                quantize_to_int8(hidden_state, hidden_state_int8, hidden_state_int8_scales, opt_quant);
                if (hidden_state_int8.empty())
                    return -100;
            }
        }

        const signed char* x = bottom_blob_int8.row<const signed char>(ti);
        const signed char* hs = hidden_state_int8;
        const float descale_x = 1.f / bottom_blob_int8_scales[ti];
        const float descale_h = 1.f / hidden_state_int8_scales[0];
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < num_output; q++)
        {
            float* gates_data = gates.row(q);

            // gate reset update
            const float* bias_c_R = bias_c.row(0);
            const float* bias_c_U = bias_c.row(1);

            const signed char* weight_xc_int8_R = weight_xc_int8.row<const signed char>(num_output * 0 + q);
            const signed char* weight_xc_int8_U = weight_xc_int8.row<const signed char>(num_output * 1 + q);
            const signed char* weight_hc_int8_R = weight_hc_int8.row<const signed char>(num_output * 0 + q);
            const signed char* weight_hc_int8_U = weight_hc_int8.row<const signed char>(num_output * 1 + q);

            const float descale_xc_R = 1.f / weight_xc_int8_scales[num_output * 0 + q];
            const float descale_xc_U = 1.f / weight_xc_int8_scales[num_output * 1 + q];
            const float descale_hc_R = 1.f / weight_hc_int8_scales[num_output * 0 + q];
            const float descale_hc_U = 1.f / weight_hc_int8_scales[num_output * 1 + q];

            // input contribution, accumulated in int32
            int Rx = 0;
            int Ux = 0;
            for (int i = 0; i < size; i++)
            {
                signed char xi = x[i];

                Rx += weight_xc_int8_R[i] * xi;
                Ux += weight_xc_int8_U[i] * xi;
            }

            // hidden state contribution, accumulated in int32
            int Rh = 0;
            int Uh = 0;
            for (int i = 0; i < num_output; i++)
            {
                signed char h_cont = hs[i];

                Rh += weight_hc_int8_R[i] * h_cont;
                Uh += weight_hc_int8_U[i] * h_cont;
            }

            // descale each int32 sum with the product of activation and weight descale
            float R = bias_c_R[q] + Rx * (descale_x * descale_xc_R) + Rh * (descale_h * descale_hc_R);
            float U = bias_c_U[q] + Ux * (descale_x * descale_xc_U) + Uh * (descale_h * descale_hc_U);

            // sigmoid(R)
            // sigmoid(U)
            R = 1.f / (1.f + expf(-R));
            U = 1.f / (1.f + expf(-U));

            // gate new
            const float* bias_c_WN = bias_c.row(2);
            const float* bias_c_BN = bias_c.row(3);

            const signed char* weight_xc_int8_N = weight_xc_int8.row<const signed char>(num_output * 2 + q);
            const signed char* weight_hc_int8_N = weight_hc_int8.row<const signed char>(num_output * 2 + q);

            const float descale_xc_N = 1.f / weight_xc_int8_scales[num_output * 2 + q];
            const float descale_hc_N = 1.f / weight_hc_int8_scales[num_output * 2 + q];

            int Nh = 0;
            for (int i = 0; i < num_output; i++)
            {
                signed char h_cont = hs[i];

                Nh += weight_hc_int8_N[i] * h_cont;
            }

            int Nx = 0;
            for (int i = 0; i < size; i++)
            {
                signed char xi = x[i];

                Nx += weight_xc_int8_N[i] * xi;
            }

            // the reset gate scales only the hidden part (including its bias)
            float N = bias_c_BN[q] + Nh * (descale_h * descale_hc_N);
            N = bias_c_WN[q] + R * N + Nx * (descale_x * descale_xc_N);

            // tanh(N)
            N = tanhf(N);

            gates_data[0] = U;
            gates_data[1] = N;
        }

        // h_t := (1 - update) .* new + update .* h_{t-1}
        float* output_data = top_blob.row(ti);
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < num_output; q++)
        {
            const float* gates_data = gates.row(q);

            float U = gates_data[0];
            float N = gates_data[1];

            float H = (1 - U) * N + U * hidden_state[q];

            hidden_state[q] = H;
            output_data[q] = H;
        }
    }

    return 0;
}
#endif // NCNN_INT8

int GRU::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int T = bottom_blob.h;

    int num_directions = direction == 2 ? 2 : 1;

    // initial hidden state
    Mat hidden(num_output, 4u, opt.workspace_allocator);
    if (hidden.empty())
        return -100;
    hidden.fill(0.f);

    top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
#if NCNN_INT8
        if (int8_scale_term)
        {
            int ret = gru_int8(bottom_blob, top_blob, direction, weight_xc_data.channel(0), weight_xc_data_int8_scales.row(0), bias_c_data.channel(0), weight_hc_data.channel(0), weight_hc_data_int8_scales.row(0), hidden, opt);
            if (ret != 0)
                return ret;
        }
        else
#endif
        {
            int ret = gru(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
            if (ret != 0)
                return ret;
        }
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

#if NCNN_INT8
        if (int8_scale_term)
        {
            int ret = gru_int8(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), weight_xc_data_int8_scales.row(0), bias_c_data.channel(0), weight_hc_data.channel(0), weight_hc_data_int8_scales.row(0), hidden, opt);
            if (ret != 0)
                return ret;
        }
        else
#endif
        {
            int ret = gru(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
            if (ret != 0)
                return ret;
        }

        hidden.fill(0.0f);

#if NCNN_INT8
        if (int8_scale_term)
        {
            int ret = gru_int8(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), weight_xc_data_int8_scales.row(1), bias_c_data.channel(1), weight_hc_data.channel(1), weight_hc_data_int8_scales.row(1), hidden, opt);
            if (ret != 0)
                return ret;
        }
        else
#endif
        {
            int ret = gru(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const float* pf = top_blob_forward.row(i);
            const float* pr = top_blob_reverse.row(i);
            float* ptr = top_blob.row(i);

            memcpy(ptr, pf, num_output * sizeof(float));
            memcpy(ptr + num_output, pr, num_output * sizeof(float));
        }
    }

    return 0;
}

// Multi-blob forward with optional explicit hidden state.
//
//   bottom_blobs[0]  input sequence, one row per time step
//   bottom_blobs[1]  optional initial hidden state (one row per direction)
//   top_blobs[0]     output sequence, num_output * num_directions values per row
//   top_blobs[1]     optional final hidden state
//
// The hidden state is always worked on as a private copy so the caller's input
// blob is never modified. Returns 0 on success, -100 on allocation failure, or
// the error code of the GRU pass.
int GRU::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    int T = bottom_blob.h;
    int num_directions = direction == 2 ? 2 : 1;

    Mat hidden;
    // the state outlives this call only when it is handed out as top_blobs[1]
    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
    if (bottom_blobs.size() == 2)
    {
        hidden = bottom_blobs[1].clone(hidden_allocator);
        // a failed clone would otherwise be dereferenced inside the GRU pass
        if (hidden.empty())
            return -100;
    }
    else
    {
        hidden.create(num_output, num_directions, 4u, hidden_allocator);
        if (hidden.empty())
            return -100;
        hidden.fill(0.f);
    }

    Mat& top_blob = top_blobs[0];
    top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
#if NCNN_INT8
        if (int8_scale_term)
        {
            int ret = gru_int8(bottom_blob, top_blob, direction, weight_xc_data.channel(0), weight_xc_data_int8_scales.row(0), bias_c_data.channel(0), weight_hc_data.channel(0), weight_hc_data_int8_scales.row(0), hidden, opt);
            if (ret != 0)
                return ret;
        }
        else
#endif
        {
            int ret = gru(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
            if (ret != 0)
                return ret;
        }
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        // row 0 of the hidden blob belongs to the forward direction
        Mat hidden0 = hidden.row_range(0, 1);
#if NCNN_INT8
        if (int8_scale_term)
        {
            int ret = gru_int8(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), weight_xc_data_int8_scales.row(0), bias_c_data.channel(0), weight_hc_data.channel(0), weight_hc_data_int8_scales.row(0), hidden0, opt);
            if (ret != 0)
                return ret;
        }
        else
#endif
        {
            int ret = gru(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt);
            if (ret != 0)
                return ret;
        }

        // row 1 of the hidden blob belongs to the reverse direction
        Mat hidden1 = hidden.row_range(1, 1);
#if NCNN_INT8
        if (int8_scale_term)
        {
            int ret = gru_int8(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), weight_xc_data_int8_scales.row(1), bias_c_data.channel(1), weight_hc_data.channel(1), weight_hc_data_int8_scales.row(1), hidden1, opt);
            if (ret != 0)
                return ret;
        }
        else
#endif
        {
            int ret = gru(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const float* pf = top_blob_forward.row(i);
            const float* pr = top_blob_reverse.row(i);
            float* ptr = top_blob.row(i);

            memcpy(ptr, pf, num_output * sizeof(float));
            memcpy(ptr + num_output, pr, num_output * sizeof(float));
        }
    }

    if (top_blobs.size() == 2)
    {
        // hand the updated state back to the caller
        top_blobs[1] = hidden;
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/gru.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_GRU_H
#define LAYER_GRU_H

#include "layer.h"

namespace ncnn {

// Gated recurrent unit layer (reference implementation).
// Processes a sequence stored as one row per time step and emits one hidden
// state row per step; supports forward, reverse and bidirectional operation
// and, when built with NCNN_INT8, int8-quantized weights.
class GRU : public Layer
{
public:
    GRU();

    virtual int load_param(const ParamDict& pd);

    virtual int load_model(const ModelBin& mb);

    // single input, zero initial hidden state, hidden state not returned
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // optional second input/output blob carries the initial/final hidden state
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

public:
    // hidden state size; each output row holds num_output values per direction
    int num_output;
    // NOTE(review): presumably the element count of weight_xc_data; not used by forward
    int weight_data_size;
    int direction; // 0=forward 1=reverse 2=bidirectional

    // non-zero selects the int8 code path in forward (needs NCNN_INT8)
    int int8_scale_term;

    // one channel per direction; rows num_output*{0,1,2}+q hold the reset,
    // update and new-gate weights of output q
    Mat weight_hc_data; // hidden-to-gate weights
    Mat weight_xc_data; // input-to-gate weights
    // one channel per direction with 4 rows: reset, update, new-gate input
    // bias, new-gate hidden bias
    Mat bias_c_data;

#if NCNN_INT8
    // one row per direction; entry num_output*k+q is the quantization scale of
    // the matching weight row
    Mat weight_hc_data_int8_scales;
    Mat weight_xc_data_int8_scales;
#endif
};

} // namespace ncnn

#endif // LAYER_GRU_H


================================================
FILE: src/layer/hardsigmoid.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "hardsigmoid.h"

namespace ncnn {

// Elementwise activation: takes a single blob and overwrites it in place.
HardSigmoid::HardSigmoid()
{
    support_inplace = true;
    one_blob_only = true;
}

// Reads slope (id 0) and offset (id 1) of y = alpha * x + beta and derives the
// input range outside of which the result saturates at 0 or 1.
int HardSigmoid::load_param(const ParamDict& pd)
{
    // defaults are the tensorflow values (0.2, 0.5); pytorch uses (1/6, 0.5)
    const float slope = pd.get(0, 0.2f);
    const float offset = pd.get(1, 0.5f);

    // the line crosses 0 at -beta / alpha and reaches 1 a distance of 1 / alpha later
    const float zero_crossing = -offset / slope;

    alpha = slope;
    beta = offset;
    lower = zero_crossing;
    upper = (1.f / slope) + zero_crossing;

    return 0;
}

// Applies y = clamp(alpha * x + beta, 0, 1) to every element in place.
// Inputs below `lower` map to 0, inputs above `upper` map to 1.
// Always returns 0.
int HardSigmoid::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    // include the depth axis so 4-D blobs are processed completely
    int size = w * h * d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i = 0; i < size; i++)
        {
            if (ptr[i] < lower)
                ptr[i] = 0.f;
            else if (ptr[i] > upper)
                ptr[i] = 1.f;
            else
                ptr[i] = ptr[i] * alpha + beta;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/hardsigmoid.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_HARDSIGMOID_H
#define LAYER_HARDSIGMOID_H

#include "layer.h"

namespace ncnn {

// Piecewise-linear sigmoid approximation: y = clamp(alpha * x + beta, 0, 1),
// applied elementwise and in place.
class HardSigmoid : public Layer
{
public:
    HardSigmoid();

    virtual int load_param(const ParamDict& pd);

    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
    // alpha/beta: slope and offset read from param ids 0 and 1
    // lower/upper: derived in load_param; inputs below lower give 0, above upper give 1
    float alpha, beta, lower, upper;
};

} // namespace ncnn

#endif // LAYER_HARDSIGMOID_H


================================================
FILE: src/layer/hardswish.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "hardswish.h"

namespace ncnn {

// Elementwise activation: takes a single blob and overwrites it in place.
HardSwish::HardSwish()
{
    support_inplace = true;
    one_blob_only = true;
}

// Reads slope (id 0) and offset (id 1) of the gating line alpha * x + beta and
// derives the input range in which the gate is neither 0 nor 1.
int HardSwish::load_param(const ParamDict& pd)
{
    // The defaults (0.2, 0.5) differ from tensorflow/pytorch, which use (1/6, 0.5);
    // set the values explicitly in the .param file to match those frameworks.
    const float slope = pd.get(0, 0.2f);
    const float offset = pd.get(1, 0.5f);

    // the gate crosses 0 at -beta / alpha and reaches 1 a distance of 1 / alpha later
    const float zero_crossing = -offset / slope;

    alpha = slope;
    beta = offset;
    lower = zero_crossing;
    upper = (1.f / slope) + zero_crossing;

    return 0;
}

// Applies y = x * clamp(alpha * x + beta, 0, 1) to every element in place.
// Inputs below `lower` become 0, inputs above `upper` pass through unchanged.
// Always returns 0.
int HardSwish::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    // include the depth axis so 4-D blobs are processed completely
    int size = w * h * d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i = 0; i < size; i++)
        {
            if (ptr[i] < lower)
                ptr[i] = 0.f;
            else if (ptr[i] > upper)
                ; // gate saturated at 1: the value is kept as is
            else
                ptr[i] = ptr[i] * (ptr[i] * alpha + beta);
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/hardswish.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_HARDSWISH_H
#define LAYER_HARDSWISH_H

#include "layer.h"

namespace ncnn {

// Hard swish activation: y = x * clamp(alpha * x + beta, 0, 1), applied
// elementwise and in place.
class HardSwish : public Layer
{
public:
    HardSwish();

    virtual int load_param(const ParamDict& pd);

    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
    // alpha/beta: slope and offset of the gate, read from param ids 0 and 1
    // lower/upper: derived in load_param; below lower the output is 0, above
    // upper the input passes through unchanged
    float alpha, beta, lower, upper;
};

} // namespace ncnn

#endif // LAYER_HARDSWISH_H


================================================
FILE: src/layer/innerproduct.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "innerproduct.h"

#include "layer_type.h"

#include "fused_activation.h"

namespace ncnn {

// Fully connected layer: one input blob, result written to a separate output blob.
InnerProduct::InnerProduct()
{
    support_inplace = false;
    one_blob_only = true;
}

// Reads the layer configuration. Returns 0 on success, or -1 when the model
// asks for int8 inference but the library was built without NCNN_INT8.
int InnerProduct::load_param(const ParamDict& pd)
{
    num_output = pd.get(0, 0);
    bias_term = pd.get(1, 0);
    weight_data_size = pd.get(2, 0);
    int8_scale_term = pd.get(8, 0);
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());

    // plain float model: nothing more to set up
    if (!int8_scale_term)
        return 0;

#if NCNN_INT8
    support_int8_storage = true;
    return 0;
#else
    NCNN_LOGE("please build ncnn with NCNN_INT8 enabled for int8 inference");
    return -1;
#endif
}

// Loads weights, the optional bias and, for int8 models, the quantization
// scales. Float weights of an int8 model are quantized here once.
// Returns 0 on success, -100 when a blob could not be loaded or created.
int InnerProduct::load_model(const ModelBin& mb)
{
    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    if (bias_term)
    {
        bias_data = mb.load(num_output, 1);
        if (bias_data.empty())
            return -100;
    }

#if NCNN_INT8
    if (int8_scale_term)
    {
        weight_data_int8_scales = mb.load(num_output, 1);
        bottom_blob_int8_scales = mb.load(1, 1);

        // runtime quantize the weight data when the model still stores it as fp32
        const bool stored_as_fp32 = weight_data.elemsize == (size_t)4u;
        if (stored_as_fp32)
        {
            const int num_input = weight_data_size / num_output;

            // view the flat weights as one row per output so each row gets its own scale
            Mat weights_2d = weight_data.reshape(num_input, num_output);

            Option quant_opt;
            quant_opt.num_threads = 1;
            quant_opt.use_packing_layout = false;

            Mat weights_q;
            quantize_to_int8(weights_2d, weights_q, weight_data_int8_scales, quant_opt);
            if (weights_q.empty())
                return -100;

            weight_data = weights_q.reshape(weight_data_size);
        }
    }
#endif // NCNN_INT8

    return 0;
}

int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8(bottom_blob, top_blob, opt);
    }
#endif

    const int num_input = weight_data_size / num_output;

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int size = w * h;

    if (bottom_blob.dims == 2 && w == num_input)
    {
        // gemm
        top_blob.create(num_output, h, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int j = 0; j < h; j++)
        {
            const float* m = bottom_blob.row(j);
            float* outptr = top_blob.row(j);

            for (int p = 0; p < num_output; p++)
            {
                const float* kptr = (const float*)weight_data + w * p;

                float sum = 0.f;

                if (bias_term)
                    sum = bias_data[p];

                for (int i = 0; i < w; i++)
                {
                    sum += m[i] * kptr[i];
                }

                outptr[p] = activation_ss(sum, activation_type, activation_params);
            }
        }

        return 0;
    }

    top_blob.create(num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < num_output; p++)
    {
        float sum = 0.f;

        if (bias_term)
            sum = bias_data[p];

        // channels
        for (int q = 0; q < channels; q++)
        {
            const float* w = (const float*)weight_data + size * channels * p + size * q;
            const float* m = bottom_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                sum += m[i] * w[i];
            }
        }

        top_blob[p] = activation_ss(sum, activation_type, activation_params);
    }

    return 0;
}

#if NCNN_INT8
// Int8 fully connected forward.
// The input is quantized with bottom_blob_int8_scales unless it already is
// int8, products are accumulated in int32, and each sum is descaled with the
// input scale and the per-output weight scale before bias and activation.
// The output is always float.
// Returns 0 on success, -100 when an allocation fails.
int InnerProduct::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int num_input = weight_data_size / num_output;

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int size = w * h;

    Mat bottom_blob_int8 = bottom_blob;
    if (elemsize != 1)
    {
        Option opt_g = opt;
        opt_g.blob_allocator = opt.workspace_allocator;
        opt_g.use_packing_layout = false;

        quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_g);
        // quantization allocates its output; do not read from it if that failed
        if (bottom_blob_int8.empty())
            return -100;
    }

    if (bottom_blob.dims == 2 && w == num_input)
    {
        // gemm
        top_blob.create(num_output, h, 4u, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int j = 0; j < h; j++)
        {
            const signed char* m = bottom_blob_int8.row<signed char>(j);
            float* outptr = top_blob.row(j);

            for (int p = 0; p < num_output; p++)
            {
                const signed char* kptr = (const signed char*)weight_data + w * p;
                int sum = 0;

                for (int i = 0; i < w; i++)
                {
                    sum += m[i] * kptr[i];
                }
                // dequantize; a zero weight scale yields 0 instead of a division by zero
                float scale_in;
                if (weight_data_int8_scales[p] == 0)
                    scale_in = 0;
                else
                    scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

                float sumfp32 = sum * scale_in;

                if (bias_term)
                    sumfp32 += bias_data[p];

                outptr[p] = activation_ss(sumfp32, activation_type, activation_params);
            }
        }

        return 0;
    }

    top_blob.create(num_output, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < num_output; p++)
    {
        float* outptr = top_blob;

        int sum = 0;

        int offset = size * channels * p;
        // channels
        for (int q = 0; q < channels; q++)
        {
            const signed char* w = (const signed char*)weight_data + offset + size * q;
            const signed char* m = bottom_blob_int8.channel(q);

            for (int i = 0; i < size; i++)
            {
                sum += m[i] * w[i];
            }
        }

        // dequantize; a zero weight scale yields 0 instead of a division by zero
        float scale_in;
        if (weight_data_int8_scales[p] == 0)
            scale_in = 0;
        else
            scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

        float sumfp32 = sum * scale_in;

        if (bias_term)
            sumfp32 += bias_data[p];

        outptr[p] = activation_ss(sumfp32, activation_type, activation_params);
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn


================================================
FILE: src/layer/innerproduct.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_INNERPRODUCT_H
#define LAYER_INNERPRODUCT_H

#include "layer.h"

namespace ncnn {

// Fully connected (inner product) layer with optional bias, fused activation
// and, when built with NCNN_INT8, an int8 inference path.
class InnerProduct : public Layer
{
public:
    InnerProduct();

    virtual int load_param(const ParamDict& pd);

    virtual int load_model(const ModelBin& mb);

    // dispatches to forward_int8 when int8 inference is enabled and the weights are int8
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
#if NCNN_INT8
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif

public:
    // param
    int num_output; // number of output values (param id 0)
    int bias_term;  // non-zero when bias_data is present (param id 1)

    // total number of weight elements (param id 2); the input length is
    // derived as weight_data_size / num_output
    int weight_data_size;

    // non-zero for int8 models (param id 8)
    int int8_scale_term;

    // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid
    int activation_type;
    Mat activation_params;

    // model
    Mat weight_data; // flat weights, one contiguous run per output
    Mat bias_data;   // num_output biases, only loaded when bias_term is set

#if NCNN_INT8
    Mat weight_data_int8_scales; // one quantization scale per output
    Mat bottom_blob_int8_scales; // single quantization scale for the input
#endif
};

} // namespace ncnn

#endif // LAYER_INNERPRODUCT_H


================================================
FILE: src/layer/input.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "input.h"

namespace ncnn {

// Declares the capabilities of the input layer: single blob, in place, and
// usable with vulkan, packed layouts and bf16 storage.
Input::Input()
{
    // blob handling
    support_inplace = true;
    one_blob_only = true;

    // accepted backends and storage formats
    support_bf16_storage = true;
    support_packing = true;
    support_vulkan = true;
}

// Reads the declared input shape; every dimension defaults to 0 when absent.
int Input::load_param(const ParamDict& pd)
{
    // ids 0, 1, 2 hold w, h, c
    w = pd.get(0, 0);
    h = pd.get(1, 0);
    c = pd.get(2, 0);

    // depth is stored separately under id 11
    d = pd.get(11, 0);

    return 0;
}

// No-op: the blob is left untouched and 0 (success) is returned.
// Both parameters are intentionally unused, hence the commented-out names.
int Input::forward_inplace(Mat& /*bottom_top_blob*/, const Option& /*opt*/) const
{
    return 0;
}

#if NCNN_VULKAN
// Vulkan variant of the no-op: nothing is recorded into the command and 0
// (success) is returned. All parameters are intentionally unused.
int Input::forward_inplace(VkMat& /*bottom_top_blob*/, VkCompute& /*cmd*/, const Option& /*opt*/) const
{
    return 0;
}
#endif // NCNN_VULKAN

} // namespace ncnn


================================================
FILE: src/layer/input.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_INPUT_H
#define LAYER_INPUT_H

#include "layer.h"

namespace ncnn {

// Input layer: performs no computation in forward_inplace and only records
// the shape declared in the param file.
class Input : public Layer
{
public:
    Input();

    virtual int load_param(const ParamDict& pd);

    // no-op, returns 0
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

#if NCNN_VULKAN
    // no-op, returns 0
    virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const;
#endif // NCNN_VULKAN

public:
    // declared shape, each 0 when not given in the param file
    int w; // param id 0
    int h; // param id 1
    int d; // param id 11
    int c; // param id 2
};

} // namespace ncnn

#endif // LAYER_INPUT_H


================================================
FILE: src/layer/instancenorm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "instancenorm.h"

namespace ncnn {

// Normalization layer: takes a single blob and overwrites it in place.
InstanceNorm::InstanceNorm()
{
    support_inplace = true;
    one_blob_only = true;
}

// Reads channel count (id 0), epsilon (id 1, default 0.001) and the affine
// switch (id 2, enabled by default).
int InstanceNorm::load_param(const ParamDict& pd)
{
    channels = pd.get(0, 0);

    // numerical stabilizer added to the variance
    eps = pd.get(1, 0.001f);

    // non-zero: per-channel gamma/beta are loaded and applied
    affine = pd.get(2, 1);

    return 0;
}

// Loads the per-channel scale (gamma) and shift (beta) when affine is enabled.
// Returns 0 on success, -100 when either blob could not be loaded.
int InstanceNorm::load_model(const ModelBin& mb)
{
    if (affine != 0)
    {
        // gamma comes first in the model file, then beta
        gamma_data = mb.load(channels, 1);
        if (gamma_data.empty())
            return -100;

        beta_data = mb.load(channels, 1);
        if (beta_data.empty())
            return -100;
    }

    return 0;
}

// Normalizes every channel to zero mean and unit variance over all of its
// elements, then applies the optional per-channel gamma/beta, in place.
// Always returns 0.
int InstanceNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    // x = (x - mean) / (sqrt(var + eps)) * gamma + beta

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int c = bottom_top_blob.c;
    // include the depth axis so 4-D blobs are normalized over the whole channel
    int size = w * h * d;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < c; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        // mean and var
        float sum = 0.f;
        float sqsum = 0.f;
        for (int i = 0; i < size; i++)
        {
            sum += ptr[i];
        }
        float mean = sum / size;

        // two-pass variance: E[x^2] - mean^2 can turn negative through rounding
        float tmp = 0.f;
        for (int i = 0; i < size; i++)
        {
            tmp = ptr[i] - mean;
            sqsum += tmp * tmp;
        }
        float var = sqsum / size;

        // fold normalization and affine transform into a single y = a * x + b
        float a;
        float b;
        if (affine)
        {
            float gamma = gamma_data[q];
            float beta = beta_data[q];

            a = gamma / (sqrtf(var + eps));
            b = -mean * a + beta;
        }
        else
        {
            a = 1.f / (sqrtf(var + eps));
            b = -mean * a;
        }

        for (int i = 0; i < size; i++)
        {
            ptr[i] = ptr[i] * a + b;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/instancenorm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_INSTANCENORM_H
#define LAYER_INSTANCENORM_H

#include "layer.h"

namespace ncnn {

// Instance normalization: each channel is normalized with its own mean and
// variance, optionally followed by a per-channel scale and shift.
class InstanceNorm : public Layer
{
public:
    InstanceNorm();

    virtual int load_param(const ParamDict& pd);

    virtual int load_model(const ModelBin& mb);

    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
    // param
    int channels; // number of gamma/beta entries to load (param id 0)
    float eps;    // added to the variance before the square root (param id 1)
    int affine;   // non-zero: apply gamma_data/beta_data (param id 2, default 1)

    // model
    Mat gamma_data; // per-channel scale, only loaded when affine is set
    Mat beta_data;  // per-channel shift, only loaded when affine is set
};

} // namespace ncnn

#endif // LAYER_INSTANCENORM_H


================================================
FILE: src/layer/interp.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "interp.h"

#include "expression.h"

namespace ncnn {

// Resizing changes the blob shape, so the result goes to a separate output
// blob; a single input is the default and may be relaxed in load_param.
Interp::Interp()
{
    support_inplace = false;
    one_blob_only = true;
}

// Reads the resize configuration.
// Returns 0 on success, -1 when resize_type is outside 0..3.
int Interp::load_param(const ParamDict& pd)
{
    resize_type = pd.get(0, 0);
    height_scale = pd.get(1, 1.f);
    width_scale = pd.get(2, 1.f);
    output_height = pd.get(3, 0);
    output_width = pd.get(4, 0);
    dynamic_target_size = pd.get(5, 0);
    align_corner = pd.get(6, 0);

    const bool known_type = resize_type >= 0 && resize_type <= 3;
    if (!known_type)
    {
        NCNN_LOGE("unsupported resize type %d", resize_type);
        return -1;
    }

    // the target size arrives through an additional input blob
    if (dynamic_target_size == 1)
        one_blob_only = false;

    size_expr = pd.get(9, "");

    // a size expression that refers to more than one blob needs extra inputs too
    if (!size_expr.empty() && count_expression_blobs(size_expr) > 1)
        one_blob_only = false;

    return 0;
}

#if defined(__GNUC__) && defined(__powerpc__) && defined(__ALTIVEC__)
// NOTE gcc altivec optimized version produce wrong result
// so I have to disable vectorize here  --- nihui
__attribute__((optimize("no-tree-vectorize")))
#endif
// Precompute linear interpolation taps for resizing one axis from w to outw.
//
//   w             source length
//   outw          destination length
//   xofs          out, outw entries: index of the left source sample per output
//   alpha         out, 2 * outw entries: weights of the left and right sample
//   align_corner  non-zero maps the first/last output exactly onto the
//                 first/last source sample
//
// Each output dx is later computed as
//   src[xofs[dx]] * alpha[dx * 2] + src[xofs[dx] + 1] * alpha[dx * 2 + 1].
static void
linear_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner)
{
    double scale = (double)w / outw;
    if (align_corner)
    {
        // a single output sample has nothing to span: pin it to source 0
        // instead of dividing by zero
        scale = outw > 1 ? (double)(w - 1) / (outw - 1) : 0.0;
    }

    for (int dx = 0; dx < outw; dx++)
    {
        // half-pixel centers by default, plain scaling with aligned corners
        float fx = (float)((dx + 0.5) * scale - 0.5);
        if (align_corner)
        {
            fx = static_cast<float>(dx * scale);
        }

        int sx = static_cast<int>(floor(fx));
        fx -= sx;

        // clamp to the valid tap range [0, w - 2] and replicate the border sample
        if (sx < 0)
        {
            sx = 0;
            fx = 0.f;
        }
        if (sx >= w - 1)
        {
            sx = w - 2;
            fx = 1.f;
        }

        xofs[dx] = sx;

        alpha[dx * 2] = 1.f - fx;
        alpha[dx * 2 + 1] = fx;
    }
}

static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w);
    Mat rowsbuf1(w);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;

    int prev_sy1 = -2;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            float* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows0_old;
            const float* S1 = src.row(sy + 1);

            const float* alphap = alpha;
            float* rows1p = rows1;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const float* S1p = S1 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;

                alphap += 2;
            }
        }
        else
        {
            // hresize two rows
            const float* S0 = src.row(sy);
            const float* S1 = src.row(sy + 1);

            const float* alphap = alpha;
            float* rows0p = rows0;
            float* rows1p = rows1;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const float* S0p = S0 + sx;
                const float* S1p = S1 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                rows0p[dx] = S0p[0] * a0 + S0p[1] * a1;
                rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;

                alphap += 2;
            }
        }

        prev_sy1 = sy;

        // vresize
        float b0 = beta[0];
        float b1 = beta[1];

        float* rows0p = rows0;
        float* rows1p = rows1;
        float* Dp = dst.row(dy);
        for (int dx = 0; dx < w; dx++)
        {
            //             D[x] = rows0[x]*b0 + rows1[x]*b1;
            *Dp++ = *rows0p++ * b0 + *rows1p++ * b1;
        }

        beta += 2;
    }
}

// Compute the four cubic-convolution weights (kernel parameter A = -0.75)
// for a sample at fractional offset `fx` from its left neighbour.
// coeffs[0..3] weight the source samples at relative positions -1, 0, +1, +2;
// the last weight is chosen so that the four sum to one.
static inline void interpolate_cubic(float fx, float* coeffs)
{
    const float A = -0.75f;

    // distances from the sample to the taps at -1, 0 and +1
    // (the distance to the +2 tap, 2 - fx, is not needed explicitly)
    const float dist_m1 = fx + 1;
    const float dist_0 = fx;
    const float dist_p1 = 1 - fx;

    coeffs[0] = A * dist_m1 * dist_m1 * dist_m1 - 5 * A * dist_m1 * dist_m1 + 8 * A * dist_m1 - 4 * A;
    coeffs[1] = (A + 2) * dist_0 * dist_0 * dist_0 - (A + 3) * dist_0 * dist_0 + 1;
    coeffs[2] = (A + 2) * dist_p1 * dist_p1 * dist_p1 - (A + 3) * dist_p1 * dist_p1 + 1;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}

// Precompute, for every output coordinate along one axis, the source index
// and the four cubic blend weights.
//
//   w            source length along the axis
//   outw         output length along the axis
//   xofs         [outw] receives the source index sx of each output sample
//   alpha        [outw * 4] receives the weights applied to source samples
//                sx - 1, sx, sx + 1 and sx + 2
//   align_corner non-zero maps the first/last output samples onto the first/last
//                source samples; zero uses the half-pixel-centre mapping
//
// Near the borders the index is moved inwards and the weights are shifted or
// folded so that taps that would fall outside the row get weight zero.
static void cubic_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner)
{
    double scale = (double)w / outw;
    if (align_corner)
    {
        // A single output sample has no span to align: (w - 1) / 0 would give
        // inf or NaN and turn every weight into NaN, so pin it to the first
        // source sample instead.
        scale = outw > 1 ? (double)(w - 1) / (outw - 1) : 0.0;
    }

    for (int dx = 0; dx < outw; dx++)
    {
        float fx = (float)((dx + 0.5) * scale - 0.5);
        if (align_corner)
        {
            fx = static_cast<float>(dx * scale);
        }

        // split into integer source index and fractional offset
        int sx = static_cast<int>(floor(fx));
        fx -= sx;

        interpolate_cubic(fx, alpha + dx * 4);

        // The four checks below are intentionally not chained with `else`:
        // an index moved by an earlier check may be adjusted again by a later one.

        // left of the first sample: only the last tap was inside the row
        if (sx <= -1)
        {
            sx = 1;
            alpha[dx * 4 + 0] = 1.f - alpha[dx * 4 + 3];
            alpha[dx * 4 + 1] = alpha[dx * 4 + 3];
            alpha[dx * 4 + 2] = 0.f;
            alpha[dx * 4 + 3] = 0.f;
        }
        // first sample: fold the out-of-row left tap into its neighbour and
        // shift the window one step to the right
        if (sx == 0)
        {
            sx = 1;
            alpha[dx * 4 + 0] = alpha[dx * 4 + 0] + alpha[dx * 4 + 1];
            alpha[dx * 4 + 1] = alpha[dx * 4 + 2];
            alpha[dx * 4 + 2] = alpha[dx * 4 + 3];
            alpha[dx * 4 + 3] = 0.f;
        }
        // second to last sample: fold the out-of-row right tap into its
        // neighbour and shift the window one step to the left
        if (sx == w - 2)
        {
            sx = w - 3;
            alpha[dx * 4 + 3] = alpha[dx * 4 + 2] + alpha[dx * 4 + 3];
            alpha[dx * 4 + 2] = alpha[dx * 4 + 1];
            alpha[dx * 4 + 1] = alpha[dx * 4 + 0];
            alpha[dx * 4 + 0] = 0.f;
        }
        // at or beyond the last sample: only the first tap was inside the row
        if (sx >= w - 1)
        {
            sx = w - 3;
            alpha[dx * 4 + 3] = 1.f - alpha[dx * 4 + 0];
            alpha[dx * 4 + 2] = alpha[dx * 4 + 0];
            alpha[dx * 4 + 1] = 0.f;
            alpha[dx * 4 + 0] = 0.f;
        }

        xofs[dx] = sx;
    }
}

// Bicubic resize of the 2D plane `src` into the already-sized plane `dst`.
//
//   alpha / xofs  per-output-column weights (4 each) and source column index
//   beta  / yofs  per-output-row weights (4 each) and source row index
//
// For source row index sy the four taps are source rows sy - 1 .. sy + 2.
// Their horizontally resized versions are kept in four scratch rows
// (rows0..rows3) that are rotated and partially refilled depending on how far
// sy moved since the previous output row.
static void resize_bicubic_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // scratch rows holding the horizontally resized source rows sy-1, sy, sy+1, sy+2
    Mat rowsbuf0(w);
    Mat rowsbuf1(w);
    Mat rowsbuf2(w);
    Mat rowsbuf3(w);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;
    float* rows2 = rowsbuf2;
    float* rows3 = rowsbuf3;

    // source row index the scratch rows were last built for (start value: none yet)
    int prev_sy1 = -3;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            // rotate by one: three cached rows stay valid, only sy + 2 is new
            float* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows2;
            rows2 = rows3;
            rows3 = rows0_old;
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const float* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 2)
        {
            // hresize two rows
            // rotate by two: two cached rows stay valid, sy + 1 and sy + 2 are new
            float* rows0_old = rows0;
            float* rows1_old = rows1;
            rows0 = rows2;
            rows1 = rows3;
            rows2 = rows0_old;
            rows3 = rows1_old;
            const float* S2 = src.row(sy + 1);
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const float* S2p = S2 + sx;
                const float* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3;
                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 3)
        {
            // hresize three rows
            // rotate by three: only the old last row stays valid (as the new
            // first row); sy, sy + 1 and sy + 2 are new
            float* rows0_old = rows0;
            float* rows1_old = rows1;
            float* rows2_old = rows2;
            rows0 = rows3;
            rows1 = rows0_old;
            rows2 = rows1_old;
            rows3 = rows2_old;
            const float* S1 = src.row(sy);
            const float* S2 = src.row(sy + 1);
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* rows1p = rows1;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const float* S1p = S1 + sx;
                const float* S2p = S2 + sx;
                const float* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3;
                rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3;
                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;

                alphap += 4;
            }
        }
        else
        {
            // hresize four rows
            // nothing reusable: rebuild all four scratch rows
            const float* S0 = src.row(sy - 1);
            const float* S1 = src.row(sy);
            const float* S2 = src.row(sy + 1);
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* rows0p = rows0;
            float* rows1p = rows1;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const float* S0p = S0 + sx;
                const float* S1p = S1 + sx;
                const float* S2p = S2 + sx;
                const float* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows0p[dx] = S0p[-1] * a0 + S0p[0] * a1 + S0p[1] * a2 + S0p[2] * a3;
                rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3;
                rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3;
                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;

                alphap += 4;
            }
        }

        prev_sy1 = sy;

        // vresize
        // blend the four cached rows with this output row's vertical weights
        float b0 = beta[0];
        float b1 = beta[1];
        float b2 = beta[2];
        float b3 = beta[3];

        float* rows0p = rows0;
        float* rows1p = rows1;
        float* rows2p = rows2;
        float* rows3p = rows3;
        float* Dp = dst.row(dy);
        for (int dx = 0; dx < w; dx++)
        {
            //             D[x] = rows0[x]*b0 + rows1[x]*b1 + rows2[x]*b2 + rows3[x]*b3;
            *Dp++ = *rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3;
        }

        // advance to the next output row's four weights
        beta += 4;
    }
}

int Interp::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;

    int outw = output_width;
    int outh = output_height;
    if (bottom_blob.dims == 1)
    {
        w = 1;
        h = 1;
    }
    if (outw == 0 || outh == 0)
    {
        outw = static_cast<int>(w * width_scale);
        outh = static_cast<int>(h * height_scale);
    }

    Mat reference_blob;
    reference_blob.w = outw;
    reference_blob.h = outh;

    std::vector<Mat> bottom_blobs(2);
    bottom_blobs[0] = bottom_blob;
    bottom_blobs[1] = reference_blob;

    std::vector<Mat> top_blobs(1);

    int ret = forward(bottom_blobs, top_blobs, opt);

    top_blob = top_blobs[0];

    return ret;
}

int Interp::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& reference_blob = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;

    int outw = reference_blob.w;
    int outh = reference_blob.h;

    if (!size_expr.empty())
    {
        int r = eval_size_expr(bottom_blobs, outw, outh);
        if (r != 0)
            return -1;
    }

    if (dims == 1)
    {
        // special case for 2d resize on flattened blob
        top_blob.create(outw, outh, w, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < w; q++)
        {
            Mat top_blob_c = top_blob.channel(q);
            const float v = bottom_blob[q];
            top_blob_c.fill(v);
        }

        return 0;
    }

    if (dims == 2)
    {
        if (outw == w)
        {
            top_blob = bottom_blob;
            return 0;
        }

        top_blob.create(outw, h, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (resize_type == 1) // nearest
        {
            const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const float* ptr = bottom_blob.row(y);
                float* outptr = top_blob.row(y);
                for (int x = 0; x < outw; x++)
                {
                    int in_x = std::min((int)(x * ws), (w - 1));
                    *outptr++ = ptr[in_x];
                }
            }
        }

        if (resize_type == 2) // bilinear
        {
            int* buf = new int[outw + outw * 2];

            int* xofs = buf;
            float* alpha = (float*)(buf + outw);

            linear_coeffs(w, outw, xofs, alpha, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const float* ptr = bottom_blob.row(y);
                float* outptr = top_blob.row(y);
                const float* alphap = alpha;

                for (int x = 0; x < outw; x++)
                {
                    int sx = xofs[x];
                    const float* Sp = ptr + sx;
                    float a0 = alphap[0];
                    float a1 = alphap[1];
                    *outptr++ = Sp[0] * a0 + Sp[1] * a1;
                    alphap += 2;
                }
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
            int* buf = new int[outw + outw * 4];

            int* xofs = buf;
            float* alpha = (float*)(buf + outw);

            cubic_coeffs(w, outw, xofs, alpha, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const float* ptr = bottom_blob.row(y);
                float* outptr = top_blob.row(y);
                const float* alphap = alpha;

                for (int x = 0; x < outw; x++)
                {
                    int sx = xofs[x];
                    const float* Sp = ptr + sx;
                    float a0 = alphap[0];
                    float a1 = alphap[1];
                    float a2 = alphap[2];
                    float a3 = alphap[3];
                    *outptr++ = Sp[-1] * a0 + Sp[0] * a1 + Sp[1] * a2 + Sp[2] * a3;
                    alphap += 4;
                }
            }

            delete[] buf;
        }

        return 0;
    }

    if (outw == w && outh == h)
    {
        top_blob = bottom_blob;
        return 0;
    }

    top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (resize_type == 1) // nearest
    {
        const float hs = (output_height || !size_expr.empty()) ? h / (float)outh : 1.f / height_scale;
        const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);
            for (int y = 0; y < outh; y++)
            {
                int in_y = std::min((int)(y * hs), (h - 1));
                for (int x = 0; x < outw; x++)
                {
                    int in_x = std::min((int)(x * ws), (w - 1));
                    *outptr++ = ptr[in_y * w + in_x];
                }
            }
        }
    }

    if (resize_type == 2) // bilinear
    {
        int* buf = new int[outw + outh + outw * 2 + outh * 2];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
        float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];

        linear_coeffs(w, outw, xofs, alpha, align_corner);
        linear_coeffs(h, outh, yofs, beta, align_corner);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; ++q)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bilinear_image(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    if (resize_type == 3) // bicubic
    {
        int* buf = new int[outw + outh + outw * 4 + outh * 4];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
        float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];

        cubic_coeffs(w, outw, xofs, alpha, align_corner);
        cubic_coeffs(h, outh, yofs, beta, align_corner);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bicubic_image(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    return 0;
}

// Evaluate size_expr against the input blobs and store the target size.
// The expression must yield one value (width; height is kept from the first
// input blob) or two values (width, height).
// Returns 0 on success, -1 on evaluation failure or any other value count.
int Interp::eval_size_expr(const std::vector<Mat>& bottom_blobs, int& outw, int& outh) const
{
    // e.g. [size(@0,0),size(@0,1)]
    std::vector<int> sizes;
    if (eval_list_expression(size_expr, bottom_blobs, sizes) != 0)
        return -1;

    const size_t count = sizes.size();
    if (count != 1 && count != 2)
        return -1;

    outw = sizes[0];
    if (count == 2)
        outh = sizes[1];
    else
        outh = bottom_blobs[0].h;

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/interp.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_INTERP_H
#define LAYER_INTERP_H

#include "layer.h"

namespace ncnn {

// Resize layer: nearest, bilinear or bicubic interpolation along w (2-D blobs)
// or w and h (blobs with channels).
class Interp : public Layer
{
public:
    Interp();

    // Reads the parameters below; returns -1 for a resize type outside 0..3.
    virtual int load_param(const ParamDict& pd);

    // Single-input form: the target size comes from output_width/output_height
    // or, when either of them is 0, from the scale factors.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Multi-input form: the target size comes from the w/h of bottom_blobs[1],
    // or from size_expr when that is set.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // Evaluates size_expr into outw/outh; returns 0 on success, -1 on failure.
    int eval_size_expr(const std::vector<Mat>& bottom_blobs, int& outw, int& outh) const;

public:
    // param
    // param 0: 1=nearest  2=bilinear  3=bicubic
    // (0 is accepted by load_param, but forward() has no resize branch for it)
    int resize_type;
    // param 2 / param 1: scale factors, used when no complete output size is given
    float width_scale;
    float height_scale;
    // param 4 / param 3: explicit output size, 0 = not set
    int output_width;
    int output_height;
    // param 5: 1 = take the target size from a second input blob
    int dynamic_target_size;
    // param 6: non-zero = align the corner samples of input and output
    int align_corner;

    // param 9: optional expression producing the target size
    // see docs/developer-guide/expression.md
    std::string size_expr;
};

} // namespace ncnn

#endif // LAYER_INTERP_H


================================================
FILE: src/layer/inversespectrogram.cpp
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "inversespectrogram.h"

namespace ncnn {

InverseSpectrogram::InverseSpectrogram()
{
    // the output has a different shape than the input, so not in place
    support_inplace = false;

    // exactly one input blob and one output blob
    one_blob_only = true;
}

int InverseSpectrogram::load_param(const ParamDict& pd)
{
    n_fft = pd.get(0, 0);
    returns = pd.get(1, 0);
    hoplen = pd.get(2, n_fft / 4);
    winlen = pd.get(3, n_fft);
    window_type = pd.get(4, 0);
    center = pd.get(5, 1);
    normalized = pd.get(7, 0);

    // assert winlen <= n_fft
    // generate window
    window_data.create(normalized == 2 ? n_fft + 1 : n_fft);
    {
        float* p = window_data;
        for (int i = 0; i < (n_fft - winlen) / 2; i++)
        {
            *p++ = 0.f;
        }
        if (window_type == 0)
        {
            // all ones
            for (int i = 0; i < winlen; i++)
            {
                *p++ = 1.f;
            }
        }
        if (window_type == 1)
        {
            // hann window
            for (int i = 0; i < winlen; i++)
            {
                *p++ = 0.5f * (1 - cosf(2 * 3.14159265358979323846 * i / winlen));
            }
        }
        if (window_type == 2)
        {
            // hamming window
            for (int i = 0; i < winlen; i++)
            {
                *p++ = 0.54f - 0.46f * cosf(2 * 3.14159265358979323846 * i / winlen);
            }
        }
        for (int i = 0; i < n_fft - winlen - (n_fft - winlen) / 2; i++)
        {
            *p++ = 0.f;
        }

        // pre-calculated window norm factor
        if (normalized == 2)
        {
            float sqsum = 0.f;
            for (int i = 0; i < n_fft; i++)
            {
                sqsum += window_data[i] * window_data[i];
            }
            window_data[n_fft] = sqrt(sqsum);
        }
    }

    return 0;
}

// Inverse short-time Fourier transform by overlap-add.
//
// Input indexing used below: channel = frequency bin, row = frame, and the two
// elements of a row are the real and imaginary part.
// Output: returns == 0 gives outsize rows of (re, im); otherwise a 1-D blob of
// outsize samples holding the real (returns == 1) or imaginary (returns == 2) part.
//
// Per frame: rebuild the full n_fft-bin spectrum, undo the forward
// normalisation, run a direct inverse DFT, apply the window and accumulate
// into the output. Finally each output sample is divided by the accumulated
// squared window.
//
// Returns 0 on success, -100 when an output or workspace blob cannot be allocated.
int InverseSpectrogram::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // https://github.com/librosa/librosa/blob/main/librosa/core/spectrum.py#L630

    // TODO custom window
    // TODO output length

    const int frames = bottom_blob.h;
    const int freqs = bottom_blob.c;
    // assert freqs == n_fft or freqs == n_fft / 2 + 1
    // NOTE(review): this is not checked at runtime; any other channel count is
    // treated as a full two-sided spectrum of n_fft bins

    // one-sided input carries only bins 0 .. n_fft / 2
    const int onesided = freqs == n_fft / 2 + 1 ? 1 : 0;

    // with centering the output drops n_fft / 2 samples at both ends
    const int outsize = center ? (frames - 1) * hoplen + (n_fft - n_fft / 2 * 2) : (frames - 1) * hoplen + n_fft;

    const size_t elemsize = bottom_blob.elemsize;

    if (returns == 0)
    {
        top_blob.create(2, outsize, elemsize, opt.blob_allocator);
    }
    else
    {
        top_blob.create(outsize, elemsize, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    // per-sample sum of squared window values, used for the final normalisation
    Mat window_sumsquare(outsize + n_fft, elemsize, opt.workspace_allocator);
    if (window_sumsquare.empty())
        return -100;

    top_blob.fill(0.f);
    window_sumsquare.fill(0.f);

    for (int j = 0; j < frames; j++)
    {
        // collect complex
        // NOTE(review): this scratch blob is allocated once per frame and its
        // allocation result is not checked
        Mat sp(2, n_fft);
        if (onesided == 1)
        {
            for (int k = 0; k < n_fft / 2 + 1; k++)
            {
                sp.row(k)[0] = bottom_blob.channel(k).row(j)[0];
                sp.row(k)[1] = bottom_blob.channel(k).row(j)[1];
            }
            // upper half: complex conjugate of the mirrored lower bin
            for (int k = n_fft / 2 + 1; k < n_fft; k++)
            {
                sp.row(k)[0] = bottom_blob.channel(n_fft - k).row(j)[0];
                sp.row(k)[1] = -bottom_blob.channel(n_fft - k).row(j)[1];
            }
        }
        else
        {
            for (int k = 0; k < n_fft; k++)
            {
                sp.row(k)[0] = bottom_blob.channel(k).row(j)[0];
                sp.row(k)[1] = bottom_blob.channel(k).row(j)[1];
            }
        }

        // scale the spectrum by sqrt(n_fft) (normalized == 1) or by the stored
        // window L2 norm (normalized == 2)
        if (normalized == 1)
        {
            float norm = sqrt(n_fft);
            for (int i = 0; i < 2 * n_fft; i++)
            {
                sp[i] *= norm;
            }
        }
        if (normalized == 2)
        {
            float norm = window_data[n_fft];
            for (int i = 0; i < 2 * n_fft; i++)
            {
                sp[i] *= norm;
            }
        }

        // each i writes a distinct output_index within this frame
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < n_fft; i++)
        {
            // inverse dft
            float re = 0.f;
            float im = 0.f;
            for (int k = 0; k < n_fft; k++)
            {
                double angle = 2 * 3.14159265358979323846 * i * k / n_fft;

                re += sp.row(k)[0] * cosf(angle) - sp.row(k)[1] * sinf(angle);
                im += sp.row(k)[0] * sinf(angle) + sp.row(k)[1] * cosf(angle);
            }

            re /= n_fft;
            im /= n_fft;

            // apply window
            re *= window_data[i];
            im *= window_data[i];

            // position of this sample in the overlap-added output
            int output_index = j * hoplen + i;
            if (center == 1)
            {
                output_index -= n_fft / 2;
            }
            // samples falling outside the output range are dropped
            if (output_index >= 0 && output_index < outsize)
            {
                // square window
                window_sumsquare[output_index] += window_data[i] * window_data[i];

                if (returns == 0)
                {
                    top_blob.row(output_index)[0] += re;
                    top_blob.row(output_index)[1] += im;
                }
                if (returns == 1)
                {
                    top_blob[output_index] += re;
                }
                if (returns == 2)
                {
                    top_blob[output_index] += im;
                }
            }
        }
    }

    // square window norm
    // samples whose accumulated squared window is exactly zero are left as they are
    if (returns == 0)
    {
        for (int i = 0; i < outsize; i++)
        {
            if (window_sumsquare[i] != 0.f)
            {
                top_blob.row(i)[0] /= window_sumsquare[i];
                top_blob.row(i)[1] /= window_sumsquare[i];
            }
        }
    }
    else
    {
        for (int i = 0; i < outsize; i++)
        {
            if (window_sumsquare[i] != 0.f)
                top_blob[i] /= window_sumsquare[i];
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/inversespectrogram.h
================================================
// Copyright 2024 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_INVERSESPECTROGRAM_H
#define LAYER_INVERSESPECTROGRAM_H

#include "layer.h"

namespace ncnn {

// Inverse short-time Fourier transform layer (overlap-add synthesis).
class InverseSpectrogram : public Layer
{
public:
    InverseSpectrogram();

    // Reads the parameters below and pre-computes window_data.
    virtual int load_param(const ParamDict& pd);

    // Rebuilds the signal from the per-frame complex spectrum in bottom_blob.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

public:
    // param 0: DFT size
    int n_fft;
    // param 1: 0=complex 1=real 2=imag
    int returns;
    // param 2: step between consecutive frames, default n_fft / 4
    int hoplen;
    // param 3: window length, default n_fft
    int winlen;
    // param 4: 0=ones 1=hann 2=hamming
    int window_type;
    // param 5: non-zero = frames are centred, n_fft / 2 samples are trimmed at both ends
    int center;
    // param 7: 0=disabled 1=sqrt(n_fft) 2=window-l2-energy
    int normalized;

    // n_fft window samples (window zero padded to n_fft); when normalized == 2
    // one extra element at index n_fft holds the window L2 norm
    Mat window_data;
};

} // namespace ncnn

#endif // LAYER_INVERSESPECTROGRAM_H


================================================
FILE: src/layer/layernorm.cpp
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "layernorm.h"

namespace ncnn {

LayerNorm::LayerNorm()
{
    // normalisation overwrites the input blob
    support_inplace = true;

    // exactly one input blob and one output blob
    one_blob_only = true;
}

// Read the layer parameters; always returns 0.
int LayerNorm::load_param(const ParamDict& pd)
{
    // param 2: non-zero when gamma/beta weights are stored in the model
    affine = pd.get(2, 1);

    // param 0: number of elements normalised together
    affine_size = pd.get(0, 0);

    // param 1: value added to the variance before the square root
    eps = pd.get(1, 0.001f);

    return 0;
}

// Load gamma and beta (affine_size values each, in that order) when the layer
// is affine. Returns 0 on success, -100 when either load yields an empty blob.
int LayerNorm::load_model(const ModelBin& mb)
{
    if (affine != 0)
    {
        gamma_data = mb.load(affine_size, 1);
        if (gamma_data.empty())
            return -100;

        beta_data = mb.load(affine_size, 1);
        if (beta_data.empty())
            return -100;
    }

    // non-affine layers have no weights to load
    return 0;
}

// Normalise `size` values in place to zero mean and unit variance
// (population variance, with eps added before the square root), then apply the
// per-element affine transform when both gamma_ptr and beta_ptr are non-null.
static void layernorm(float* ptr, const float* gamma_ptr, const float* beta_ptr, float eps, int size)
{
    // pass 1: mean
    float total = 0.f;
    for (int n = 0; n < size; n++)
    {
        total += ptr[n];
    }
    const float mean = total / size;

    // pass 2: variance around the mean
    float sq_total = 0.f;
    for (int n = 0; n < size; n++)
    {
        const float centered = ptr[n] - mean;
        sq_total += centered * centered;
    }
    const float variance = sq_total / size;

    // (x - mean) / std rewritten as x * inv_std + offset
    const float inv_std = 1.f / sqrtf(variance + eps);
    const float offset = -mean * inv_std;

    if (!(gamma_ptr && beta_ptr))
    {
        // plain normalisation, no affine part
        for (int n = 0; n < size; n++)
        {
            ptr[n] = ptr[n] * inv_std + offset;
        }
        return;
    }

    for (int n = 0; n < size; n++)
    {
        ptr[n] = (ptr[n] * inv_std + offset) * gamma_ptr[n] + beta_ptr[n];
    }
}

// Apply x = (x - mean) / sqrt(var + eps) * gamma + beta in place.
//
//   dims == 1  the whole blob is one group of w values
//   dims == 2  each row of w values is one group
//   dims == 3  each row is one group when affine_size == w, otherwise each
//              channel (w * h values) is one group
//
// Blobs of any other rank are left untouched. Always returns 0.
int LayerNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int channels = bottom_top_blob.c;

    switch (bottom_top_blob.dims)
    {
    case 1:
    {
        // assert affine_size == w
        float* ptr = bottom_top_blob;
        layernorm(ptr, gamma_data, beta_data, eps, w);
        break;
    }
    case 2:
    {
        // assert affine_size == w
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            float* ptr = bottom_top_blob.row(y);
            layernorm(ptr, gamma_data, beta_data, eps, w);
        }
        break;
    }
    case 3:
    {
        const bool per_row = affine_size == w;
        if (per_row)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                for (int y = 0; y < h; y++)
                {
                    float* ptr = bottom_top_blob.channel(q).row(y);
                    layernorm(ptr, gamma_data, beta_data, eps, w);
                }
            }
        }
        else // if (affine_size == size)
        {
            const int plane = w * h;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                float* ptr = bottom_top_blob.channel(q);
                layernorm(ptr, gamma_data, beta_data, eps, plane);
            }
        }
        break;
    }
    default:
        break;
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/layernorm.h
================================================
// Copyright 2020 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_LAYERNORM_H
#define LAYER_LAYERNORM_H

#include "layer.h"

namespace ncnn {

// Layer normalisation: x = (x - mean) / sqrt(var + eps) * gamma + beta,
// computed in place over groups of the innermost elements.
class LayerNorm : public Layer
{
public:
    LayerNorm();

    // Reads affine_size, eps and affine.
    virtual int load_param(const ParamDict& pd);

    // Loads gamma_data and beta_data when affine is non-zero.
    virtual int load_model(const ModelBin& mb);

    // Normalises bottom_top_blob in place.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
    // param
    // param 0: number of elements per normalised group (and per weight blob)
    int affine_size;
    // param 1: added to the variance before the square root, default 0.001
    float eps;
    // param 2: non-zero = gamma/beta are stored in the model, default 1
    int affine;

    // model
    // per-element scale and shift, affine_size values each; left empty when affine == 0
    Mat gamma_data;
    Mat beta_data;
};

} // namespace ncnn

#endif // LAYER_LAYERNORM_H


================================================
FILE: src/layer/log.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "log.h"

namespace ncnn {

Log::Log()
{
    // element-wise operation, overwrites the input blob
    support_inplace = true;

    // exactly one input blob and one output blob
    one_blob_only = true;
}

// Read the layer parameters; always returns 0.
int Log::load_param(const ParamDict& pd)
{
    // param 1 / param 2: the logarithm is taken of (shift + x * scale)
    scale = pd.get(1, 1.f);
    shift = pd.get(2, 0.f);

    // param 0: logarithm base, -1 selects the natural logarithm
    base = pd.get(0, -1.f);

    return 0;
}

// Replace every element x with log(shift + x * scale), in place.
// base == -1 selects the natural logarithm; any other base is applied by
// dividing the natural logarithm by log(base). Always returns 0.
int Log::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int d = bottom_top_blob.d;
    int channels = bottom_top_blob.c;
    // Include the depth extent so that every element of a 4-D blob's channel
    // is processed, not just its first w * h elements.
    // NOTE(review): relies on d being 1 for blobs of lower rank — confirm against Mat.
    int size = w * h * d;

    if (base == -1.f)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                ptr[i] = logf(shift + ptr[i] * scale);
            }
        }
    }
    else
    {
        // log_base(v) = ln(v) / ln(base); the reciprocal is computed once
        float log_base_inv = 1.f / logf(base);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                ptr[i] = logf(shift + ptr[i] * scale) * log_base_inv;
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/log.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_LOG_H
#define LAYER_LOG_H

#include "layer.h"

namespace ncnn {

// Element-wise logarithm layer: y = log_base(shift + x * scale), in place.
class Log : public Layer
{
public:
    Log();

    // Reads base, scale and shift.
    virtual int load_param(const ParamDict& pd);

    // Applies the logarithm to every element of bottom_top_blob.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

public:
    // param 0: logarithm base, default -1 = natural logarithm
    float base;
    // param 1: multiplier applied to the input before the logarithm, default 1
    float scale;
    // param 2: offset added to the scaled input before the logarithm, default 0
    float shift;
};

} // namespace ncnn

#endif // LAYER_LOG_H


================================================
FILE: src/layer/loongarch/absval_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "absval_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

namespace ncnn {

AbsVal_loongarch::AbsVal_loongarch()
{
#if __loongarch_sx
    // packed layouts are accepted only in builds with LSX enabled
    support_packing = true;
#endif
}

// Overwrite every float of the blob with its absolute value.
// Channels are distributed over opt.num_threads OpenMP threads. Always returns 0.
int AbsVal_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    // scalar float count handled per channel
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        int i = 0;
#if __loongarch_sx
        // four floats per step: clearing bit 31 of each 32-bit lane drops the sign
        while (i + 3 < size)
        {
            __builtin_prefetch(ptr + i + 16);
            __m128i _p = __lsx_vld(ptr + i, 0);
            __m128i _outp = __lsx_vbitclri_w(_p, 31);
            __lsx_vst(_outp, ptr + i, 0);

            i += 4;
        }
#endif // __loongarch_sx
        // scalar tail (or the whole channel when LSX is unavailable)
        while (i < size)
        {
            ptr[i] = ptr[i] > 0 ? ptr[i] : -ptr[i];
            i++;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/absval_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_ABSVAL_LOONGARCH_H
#define LAYER_ABSVAL_LOONGARCH_H

#include "absval.h"

namespace ncnn {

// LoongArch specialization of AbsVal; the implementation uses LSX intrinsics
// when __loongarch_sx is defined.
class AbsVal_loongarch : public AbsVal
{
public:
    AbsVal_loongarch();

    // Replaces every element of bottom_top_blob with its absolute value.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_ABSVAL_LOONGARCH_H


================================================
FILE: src/layer/loongarch/batchnorm_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "batchnorm_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

// Packed layouts are only advertised when the LSX code path is compiled in.
BatchNorm_loongarch::BatchNorm_loongarch()
{
#if __loongarch_sx
    this->support_packing = true;
#endif // __loongarch_sx
}

// In-place batch normalization: every value x becomes b * x + a, with the
// coefficients a / b read from a_data / b_data.
//
// dims == 1      : one coefficient pair per scalar element
// dims == 2      : one coefficient pair per row
// dims == 3 or 4 : one coefficient pair per channel
// With elempack == 4 each row / channel loads four consecutive coefficients.
// Always returns 0.
int BatchNorm_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int dims = bottom_top_blob.dims;
    const int elempack = bottom_top_blob.elempack;

    if (dims == 1)
    {
        const int total = bottom_top_blob.w * elempack;
        float* data = bottom_top_blob;

        // first index that is left for the scalar loop
        int tail_begin = 0;

#if __loongarch_sx
        const int quads = total / 4;
        tail_begin = quads * 4;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int k = 0; k < quads; k++)
        {
            const int offset = k * 4;

            __m128 _p = (__m128)__lsx_vld(data + offset, 0);
            __m128 _a = (__m128)__lsx_vld((const float*)a_data + offset, 0);
            __m128 _b = (__m128)__lsx_vld((const float*)b_data + offset, 0);
            _p = __lsx_vfmadd_s(_b, _p, _a);
            __lsx_vst(_p, data + offset, 0);
        }
#endif // __loongarch_sx

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int k = tail_begin; k < total; k++)
        {
            data[k] = b_data[k] * data[k] + a_data[k];
        }
    }

    if (dims == 2)
    {
        const int row_len = bottom_top_blob.w * elempack;
        const int rows = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < rows; y++)
        {
            float* row = bottom_top_blob.row(y);
            const float a = a_data[y];
            const float b = b_data[y];

            int x = 0;
#if __loongarch_sx
            __m128 _a;
            __m128 _b;
            if (elempack == 4)
            {
                // packed row: four distinct coefficients
                _a = (__m128)__lsx_vld((const float*)a_data + y * 4, 0);
                _b = (__m128)__lsx_vld((const float*)b_data + y * 4, 0);
            }
            else
            {
                // unpacked row: one coefficient replicated into all lanes
                _a = (__m128)__lsx_vreplfr2vr_s(a);
                _b = (__m128)__lsx_vreplfr2vr_s(b);
            }
            for (; x + 3 < row_len; x += 4)
            {
                __builtin_prefetch(row + x + 16);
                __m128 _p = (__m128)__lsx_vld(row + x, 0);
                _p = __lsx_vfmadd_s(_b, _p, _a);
                __lsx_vst(_p, row + x, 0);
            }
#endif // __loongarch_sx
            for (; x < row_len; x++)
            {
                row[x] = b * row[x] + a;
            }
        }
    }

    if (dims == 3 || dims == 4)
    {
        const int channels = bottom_top_blob.c;
        const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * elempack;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* chan = bottom_top_blob.channel(q);
            const float a = a_data[q];
            const float b = b_data[q];

            int k = 0;
#if __loongarch_sx
            __m128 _a;
            __m128 _b;
            if (elempack == 4)
            {
                // packed channel: four distinct coefficients
                _a = (__m128)__lsx_vld((const float*)a_data + q * 4, 0);
                _b = (__m128)__lsx_vld((const float*)b_data + q * 4, 0);
            }
            else
            {
                // unpacked channel: one coefficient replicated into all lanes
                _a = (__m128)__lsx_vreplfr2vr_s(a);
                _b = (__m128)__lsx_vreplfr2vr_s(b);
            }
            for (; k + 3 < size; k += 4)
            {
                __builtin_prefetch(chan + k + 16);
                __m128 _p = (__m128)__lsx_vld(chan + k, 0);
                _p = __lsx_vfmadd_s(_b, _p, _a);
                __lsx_vst(_p, chan + k, 0);
            }
#endif // __loongarch_sx
            for (; k < size; k++)
            {
                chan[k] = b * chan[k] + a;
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/batchnorm_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_BATCHNORM_LOONGARCH_H
#define LAYER_BATCHNORM_LOONGARCH_H

#include "batchnorm.h"

namespace ncnn {

// LoongArch specialization of BatchNorm; the implementation uses LSX
// intrinsics when __loongarch_sx is defined.
class BatchNorm_loongarch : public BatchNorm
{
public:
    BatchNorm_loongarch();

    // Applies b * x + a in place, with a / b taken from a_data / b_data.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_BATCHNORM_LOONGARCH_H


================================================
FILE: src/layer/loongarch/bias_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "bias_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

// Add the per-channel bias bias_data[q] to every element of channel q, in place.
// Channels are distributed over opt.num_threads OpenMP threads. Always returns 0.
int Bias_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

    const float* bias_ptr = bias_data;
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        const float bias = bias_ptr[q];

        int i = 0;
#if __loongarch_sx
        // whole groups of four floats
        __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias);
        for (; i + 3 < size; i += 4)
        {
            __m128 _p = (__m128)__lsx_vld(ptr + i, 0);
            __m128 _outp = __lsx_vfadd_s(_p, _bias);
            __lsx_vst(_outp, ptr + i, 0);
        }
#endif // __loongarch_sx

        // leftover elements (everything when LSX is unavailable)
        for (; i < size; i++)
        {
            ptr[i] = ptr[i] + bias;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/bias_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_BIAS_LOONGARCH_H
#define LAYER_BIAS_LOONGARCH_H

#include "bias.h"

namespace ncnn {

// LoongArch specialization of Bias; the implementation uses LSX intrinsics
// when __loongarch_sx is defined.
class Bias_loongarch : public Bias
{
public:
    // Adds the per-channel bias from bias_data to bottom_top_blob in place.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_BIAS_LOONGARCH_H


================================================
FILE: src/layer/loongarch/binaryop_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "binaryop_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#include "lsx_mathfun.h"
#endif // __loongarch_sx

namespace ncnn {

// Packed layouts are only advertised when the LSX code path is compiled in.
BinaryOp_loongarch::BinaryOp_loongarch()
{
#if __loongarch_sx
    this->support_packing = true;
#endif // __loongarch_sx
}

// outptr[i] = op(ptr[i], ptr1[i]) for i in [0, size).
// Op must provide a float overload of operator(), plus an __m128 overload when
// LSX is enabled.
template<typename Op>
static void binary_op_vector_no_broadcast(const float* ptr, const float* ptr1, float* outptr, int size)
{
    const Op op;

    int i = 0;
#if __loongarch_sx
    // four lanes at a time
    while (i + 3 < size)
    {
        __builtin_prefetch(ptr + i + 16);
        __builtin_prefetch(ptr1 + i + 16);
        __m128 _p = (__m128)__lsx_vld(ptr + i, 0);
        __m128 _b = (__m128)__lsx_vld(ptr1 + i, 0);
        __m128 _outp = op(_p, _b);
        __lsx_vst(_outp, outptr + i, 0);
        i += 4;
    }
#endif // __loongarch_sx
    // scalar remainder
    while (i < size)
    {
        outptr[i] = op(ptr[i], ptr1[i]);
        i++;
    }
}

// outptr[i] = op(ptr[i], B) for i in [0, size), where B is the single element
// at ptr1: one float for the scalar path, or one four-float pack in the LSX
// path when elempack == 4.
template<typename Op>
static void binary_op_vector_broadcast_b(const float* ptr, const float* ptr1, float* outptr, int size, int elempack)
{
    const Op op;

    const float b = ptr1[0];

    int i = 0;
#if __loongarch_sx
    __m128 _b_128 = (elempack == 4) ? (__m128)__lsx_vld(ptr1, 0) : __lsx_vreplfr2vr_s(b);
    while (i + 3 < size)
    {
        __builtin_prefetch(ptr + i + 16);
        __m128 _p = (__m128)__lsx_vld(ptr + i, 0);
        __m128 _outp = op(_p, _b_128);
        __lsx_vst(_outp, outptr + i, 0);
        i += 4;
    }
#endif // __loongarch_sx
    // scalar remainder
    while (i < size)
    {
        outptr[i] = op(ptr[i], b);
        i++;
    }
}

// outptr[i] = op(A, ptr1[i]) for i in [0, size), where A is the single element
// at ptr: one float for the scalar path, or one four-float pack in the LSX
// path when elempack == 4.
template<typename Op>
static void binary_op_vector_broadcast_a(const float* ptr, const float* ptr1, float* outptr, int size, int elempack)
{
    const Op op;

    const float a = ptr[0];

    int i = 0;
#if __loongarch_sx
    __m128 _a_128 = (elempack == 4) ? (__m128)__lsx_vld(ptr, 0) : __lsx_vreplfr2vr_s(a);
    while (i + 3 < size)
    {
        __builtin_prefetch(ptr1 + i + 16);
        __m128 _b = (__m128)__lsx_vld(ptr1 + i, 0);
        __m128 _outp = op(_a_128, _b);
        __lsx_vst(_outp, outptr + i, 0);
        i += 4;
    }
#endif // __loongarch_sx
    // scalar remainder
    while (i < size)
    {
        outptr[i] = op(a, ptr1[i]);
        i++;
    }
}

// a holds w packs of four floats, b holds w single floats:
// pack i of the output is op(pack i of a, b[i] replicated into all lanes).
// Only elempack == 4 with LSX is implemented; otherwise nothing is written.
template<typename Op>
static void binary_op_vector_broadcast_pb(const float* ptr, const float* ptr1, float* outptr, int w, int elempack)
{
    const Op op;

#if __loongarch_sx
    if (elempack != 4)
        return;

    for (int i = 0; i < w; i++)
    {
        const float* p = ptr + i * 4;

        __builtin_prefetch(p + 16);
        __m128 _p = (__m128)__lsx_vld(p, 0);
        __m128 _b = __lsx_vreplfr2vr_s(ptr1[i]);
        __m128 _outp = op(_p, _b);
        __lsx_vst(_outp, outptr + i * 4, 0);
    }
#endif // __loongarch_sx
}

// a holds w packs, b is one single float: every lane of the output is
// op(a lane, b[0]). Only the LSX path exists and it handles whole groups of
// four floats; there is no scalar tail.
template<typename Op>
static void binary_op_vector_broadcast_pb_b(const float* ptr, const float* ptr1, float* outptr, int w, int elempack)
{
    const Op op;

    const int size = w * elempack;

    int i = 0;
#if __loongarch_sx
    __m128 _b = __lsx_vreplfr2vr_s(*ptr1);
    while (i + 3 < size)
    {
        __builtin_prefetch(ptr + i + 16);
        __m128 _p = (__m128)__lsx_vld(ptr + i, 0);
        __m128 _outp = op(_p, _b);
        __lsx_vst(_outp, outptr + i, 0);
        i += 4;
    }
#endif // __loongarch_sx
}

// a is one pack of four floats, b holds w single floats:
// pack i of the output is op(a pack, b[i] replicated into all lanes).
// Only elempack == 4 with LSX is implemented; otherwise nothing is written.
template<typename Op>
static void binary_op_vector_broadcast_pb_a(const float* ptr, const float* ptr1, float* outptr, int w, int elempack)
{
    const Op op;

#if __loongarch_sx
    if (elempack != 4)
        return;

    // the single a pack is loaded once and reused for every b
    __m128 _p = (__m128)__lsx_vld(ptr, 0);
    for (int i = 0; i < w; i++)
    {
        __m128 _b = __lsx_vreplfr2vr_s(ptr1[i]);
        __m128 _outp = op(_p, _b);
        __lsx_vst(_outp, outptr + i * 4, 0);
    }
#endif // __loongarch_sx
}

// Pick the row kernel that matches the broadcast pattern of one row.
//
// aw / bw  element counts of a and b (in packs)
// ap / bp  elempack of a and b
// Combinations not listed below are not expected and leave outptr untouched.
template<typename Op>
static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr, int aw, int bw, int ap, int bp)
{
    const int w = std::max(aw, bw);
    const int elempack = std::max(ap, bp);
    const int size = w * elempack;

    if (ap == bp)
    {
        if (aw == bw)
        {
            // no broadcast
            binary_op_vector_no_broadcast<Op>(ptr, ptr1, outptr, size);
            return;
        }
        else if (bw == 1)
        {
            // broadcast single b
            binary_op_vector_broadcast_b<Op>(ptr, ptr1, outptr, size, elempack);
            return;
        }
        else if (aw == 1)
        {
            // broadcast single a
            binary_op_vector_broadcast_a<Op>(ptr, ptr1, outptr, size, elempack);
            return;
        }
    }

    if (bp != 1)
    {
        // shall never reach here
        return;
    }

    if (aw == bw)
    {
        // broadcast pack1 b
        binary_op_vector_broadcast_pb<Op>(ptr, ptr1, outptr, w, elempack);
    }
    else if (bw == 1)
    {
        // broadcast pack1 single b
        binary_op_vector_broadcast_pb_b<Op>(ptr, ptr1, outptr, w, elempack);
    }
    else if (aw == 1)
    {
        // broadcast single a and pack1 b
        binary_op_vector_broadcast_pb_a<Op>(ptr, ptr1, outptr, w, elempack);
    }

    // any other combination: shall never reach here
}

// a[i] = op(a[i], b) for every float of every channel of a, in place.
// Channels are distributed over opt.num_threads OpenMP threads. Always returns 0.
template<typename Op>
static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt)
{
    Op op;

    const int size = a.w * a.h * a.d * a.elempack;
    const int channels = a.c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = a.channel(q);

        int i = 0;
#if __loongarch_sx
        // four lanes at a time against the replicated scalar
        __m128 _b = __lsx_vreplfr2vr_s(b);
        while (i + 3 < size)
        {
            __builtin_prefetch(ptr + i + 16);
            __m128 _p = (__m128)__lsx_vld(ptr + i, 0);
            _p = op(_p, _b);
            __lsx_vst(_p, ptr + i, 0);
            i += 4;
        }
#endif // __loongarch_sx
        // scalar remainder
        while (i < size)
        {
            ptr[i] = op(ptr[i], b);
            i++;
        }
    }

    return 0;
}

// Functor types used as the Op template argument of the binary_op_* kernels.
// Each functor has a scalar operator()(float, float); when LSX is enabled it
// also has an operator()(__m128, __m128) handling four lanes at once.
namespace BinaryOp_loongarch_functor {

// MAKE_FUNCTION(NAME, IMPL, IMPL4) defines struct NAME whose scalar call
// operator returns IMPL and whose vector call operator (LSX only) returns
// IMPL4. Both expressions refer to the operands as x and y.
#if __loongarch_sx
#define MAKE_FUNCTION(NAME, IMPL, IMPL4)                          \
    struct NAME                                                   \
    {                                                             \
        float operator()(const float& x, const float& y) const    \
        {                                                         \
            return IMPL;                                          \
        }                                                         \
        __m128 operator()(const __m128& x, const __m128& y) const \
        {                                                         \
            return IMPL4;                                         \
        }                                                         \
    };
#else
// Without LSX the IMPL4 argument is ignored and only the scalar operator exists.
#define MAKE_FUNCTION(NAME, IMPL, IMPL4)                       \
    struct NAME                                                \
    {                                                          \
        float operator()(const float& x, const float& y) const \
        {                                                      \
            return IMPL;                                       \
        }                                                      \
    };
#endif // __loongarch_sx

// The r-prefixed functors evaluate the same operation with x and y swapped.
// NOTE(review): pow_ps, atan2_ps, fmod_ps, logaddexp_ps, floor_divide_ps and
// remainder_ps are not defined in this file - presumably they come from
// lsx_mathfun.h; confirm.
// clang-format off
// *INDENT-OFF*
MAKE_FUNCTION(binary_op_add, x + y, __lsx_vfadd_s(x, y))
MAKE_FUNCTION(binary_op_sub, x - y, __lsx_vfsub_s(x, y))
MAKE_FUNCTION(binary_op_mul, x * y, __lsx_vfmul_s(x, y))
MAKE_FUNCTION(binary_op_div, x / y, __lsx_vfdiv_s(x, y))
MAKE_FUNCTION(binary_op_max, std::max(x, y), __lsx_vfmax_s(x, y))
MAKE_FUNCTION(binary_op_min, std::min(x, y), __lsx_vfmin_s(x, y))
MAKE_FUNCTION(binary_op_pow, (float)powf(x, y), pow_ps(x, y))
MAKE_FUNCTION(binary_op_rsub, y - x, __lsx_vfsub_s(y, x))
MAKE_FUNCTION(binary_op_rdiv, y / x, __lsx_vfdiv_s(y, x))
MAKE_FUNCTION(binary_op_rpow, (float)powf(y, x), pow_ps(y, x))
MAKE_FUNCTION(binary_op_atan2, (float)atan2f(x, y), atan2_ps(x, y))
MAKE_FUNCTION(binary_op_ratan2, (float)atan2f(y, x), atan2_ps(y, x))
MAKE_FUNCTION(binary_op_fmod, (float)fmodf(x, y), fmod_ps(x, y))
MAKE_FUNCTION(binary_op_rfmod, (float)fmodf(y, x), fmod_ps(y, x))
MAKE_FUNCTION(binary_op_logaddexp, (float)(std::max(x, y) + log1pf(expf(std::min(x, y) - std::max(x, y)))), logaddexp_ps(x, y))
MAKE_FUNCTION(binary_op_floor_divide, (float)floorf(x / y), floor_divide_ps(x, y))
MAKE_FUNCTION(binary_op_rfloor_divide, (float)floorf(y / x), floor_divide_ps(y, x))
MAKE_FUNCTION(binary_op_remainder, (float)remainderf(x, y), remainder_ps(x, y))
MAKE_FUNCTION(binary_op_rremainder, (float)remainderf(y, x), remainder_ps(y, x))
// *INDENT-ON*
// clang-format on

#undef MAKE_FUNCTION

} // namespace BinaryOp_loongarch_functor

static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr, int aw, int bw, int ap, int bp, int op_type)
{
    using namespace BinaryOp_loongarch_functor;

    if (op_type == BinaryOp::Operation_ADD) return binary_op_vector<binary_op_add>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_SUB) return binary_op_vector<binary_op_sub>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_MUL) return binary_op_vector<binary_op_mul>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_DIV) return binary_op_vector<binary_op_div>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_MAX) return binary_op_vector<binary_op_max>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_MIN) return binary_op_vector<binary_op_min>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_POW) return binary_op_vector<binary_op_pow>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RSUB) return binary_op_vector<binary_op_rsub>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RDIV) return binary_op_vector<binary_op_rdiv>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector<binary_op_rpow>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector<binary_op_atan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector<binary_op_ratan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_FMOD) return binary_op_vector<binary_op_fmod>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RFMOD) return binary_op_vector<binary_op_rfmod>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_LOGADDEXP) return binary_op_vector<binary_op_logaddexp>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_FLOOR_DIVIDE) return binary_op_vector<binary_op_floor_divide>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RFLOOR_DIVIDE) return binary_op_vector<binary_op_rfloor_divide>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector<binary_op_remainder>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RREMAINDER) return binary_op_vector<binary_op_rremainder>(ptr, ptr1, outptr, aw, bw, ap, bp);

    // should never reach here
}

// c = op(a, b) with a scalar b, one channel per OpenMP iteration.
static void binary_op_scalar(const Mat& a, float b, Mat& c, int op_type, const Option& opt)
{
    // each channel is treated as one flat pack-1 row of scalar floats
    const int size = a.w * a.h * a.d * a.elempack;
    const int channels = a.c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        const float* src = a.channel(q);
        float* dst = c.channel(q);

        // bw == 1 selects the "broadcast single b" kernel
        binary_op_vector(src, &b, dst, size, 1, 1, 1, op_type);
    }
}

// c = op(a, b) element by element, one channel per OpenMP iteration.
// The per-channel element count is taken from a for both operands.
static void binary_op_no_broadcast(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt)
{
    // each channel is treated as one flat pack-1 row of scalar floats
    const int size = a.w * a.h * a.d * a.elempack;
    const int channels = a.c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        const float* lhs = a.channel(q);
        const float* rhs = b.channel(q);
        float* dst = c.channel(q);

        // aw == bw selects the "no broadcast" kernel
        binary_op_vector(lhs, rhs, dst, size, size, 1, 1, op_type);
    }
}

static void binary_op_broadcast(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt)
{
    if (b.w * b.h * b.d * b.c * b.elempack == 1)
    {
        return binary_op_scalar(a, b[0], c, op_type, opt);
    }

    if (a.dims == b.dims && a.w == b.w && a.h == b.h && a.d == b.d && a.c == b.c && a.elempack == b.elempack)
    {
        return binary_op_no_broadcast(a, b, c, op_type, opt);
    }

    const int dims = c.dims;

    if (dims == 2)
    {
        const int h = c.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            const int y0 = std::min(y, a.h - 1);
            const int y1 = std::min(y, b.h - 1);

            const float* ptr = a.row(y0);
            const float* ptr1 = b.row(y1);
            float* outptr = c.row(y);

            binary_op_vector(ptr, ptr1, outptr, a.w, b.w, a.elempack, b.elempack, op_type);
        }
    }

    if (dims == 3 || dims == 4)
    {
        const int channels = c.c;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const int q0 = std::min(q, a.c - 1);
            const int q1 = std::min(q, b.c - 1);

            if (b.d * b.h * b.w == 1)
            {
                const float* ptr = a.channel(q0);
                const float* ptr1 = b.channel(q1);
                float* outptr = c.channel(q);

                binary_op_vector(ptr, ptr1, outptr, a.w * a.h * a.d, 1, a.elempack, b.elempack, op_type);
                continue;
            }

            if (b.h * b.w == 1)
            {
                for (int z = 0; z < c.d; z++)
                {
                    const int z0 = std::min(z, a.d - 1);
                    const int z1 = std::min(z, b.d - 1);

                    const float* ptr = a.channel(q0).depth(z0);
                    const float* ptr1 = b.channel(q1).depth(z1);
                    float* outptr = c.channel(q).depth(z);

                    binary_op_vector(ptr, ptr1, outptr, a.w * a.h, 1, a.elempack, b.elempack, op_type);
                }
                continue;
            }

            for (int z = 0; z < c.d; z++)
            {
                const int z0 = std::min(z, a.d - 1);
                const int z1 = std::min(z, b.d - 1);

                for (int y = 0; y < c.h; y++)
                {
                    const int y0 = std::min(y, a.h - 1);
                    const int y1 = std::min(y, b.h - 1);

                    const float* ptr = a.channel(q0).depth(z0).row(y0);
                    const float* ptr1 = b.channel(q1).depth(z1).row(y1);
                    float* outptr = c.channel(q).depth(z).row(y);

                    binary_op_vector(ptr, ptr1, outptr, a.w, b.w, a.elempack, b.elempack, op_type);
                }
            }
        }
    }
}

// a = op(a, b) with a scalar b, in place, one channel per OpenMP iteration.
static void binary_op_scalar_inplace(Mat& a, float b, int op_type, const Option& opt)
{
    // each channel is treated as one flat pack-1 row of scalar floats
    const int size = a.w * a.h * a.d * a.elempack;
    const int channels = a.c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* data = a.channel(q);

        // input and output are the same buffer
        binary_op_vector(data, &b, data, size, 1, 1, 1, op_type);
    }
}

// Map an operation code to the code that yields the same result when the two
// operands are swapped. Codes without a listed counterpart are returned
// unchanged.
static int get_reverse_op_type(int op_type)
{
    switch (op_type)
    {
    // forward -> reversed
    case BinaryOp::Operation_SUB:
        return BinaryOp::Operation_RSUB;
    case BinaryOp::Operation_DIV:
        return BinaryOp::Operation_RDIV;
    case BinaryOp::Operation_POW:
        return BinaryOp::Operation_RPOW;
    case BinaryOp::Operation_ATAN2:
        return BinaryOp::Operation_RATAN2;
    case BinaryOp::Operation_FMOD:
        return BinaryOp::Operation_RFMOD;
    case BinaryOp::Operation_LOGADDEXP:
        return BinaryOp::Operation_LOGADDEXP;
    case BinaryOp::Operation_FLOOR_DIVIDE:
        return BinaryOp::Operation_RFLOOR_DIVIDE;
    case BinaryOp::Operation_REMAINDER:
        return BinaryOp::Operation_RREMAINDER;

    // reversed -> forward
    case BinaryOp::Operation_RSUB:
        return BinaryOp::Operation_SUB;
    case BinaryOp::Operation_RDIV:
        return BinaryOp::Operation_DIV;
    case BinaryOp::Operation_RPOW:
        return BinaryOp::Operation_POW;
    case BinaryOp::Operation_RATAN2:
        return BinaryOp::Operation_ATAN2;
    case BinaryOp::Operation_RFMOD:
        return BinaryOp::Operation_FMOD;
    case BinaryOp::Operation_RFLOOR_DIVIDE:
        return BinaryOp::Operation_FLOOR_DIVIDE;
    case BinaryOp::Operation_RREMAINDER:
        return BinaryOp::Operation_REMAINDER;

    default:
        return op_type;
    }
}

// Two-input binary operation with broadcasting.
//
// bottom_blobs[0] (A) and bottom_blobs[1] (B) are first brought to the same
// number of dims, the output blob is created with the element-wise maximum of
// their extents, and binary_op_broadcast computes the result.
//
// Returns 0 on success, -100 when the output blob is empty after create().
int BinaryOp_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& A = bottom_blobs[0];
    const Mat& B = bottom_blobs[1];
    const int outdims = std::max(A.dims, B.dims);

    // A2 / B2 are the rank-adjusted views of A / B used from here on
    Mat A2 = A;
    Mat B2 = B;
    if (A.dims < outdims)
    {
        // expand inner axes
        if (outdims == 2)
        {
            // A's scalar length matches B's scalar height: A becomes a column
            if (A.w * A.elempack == B.h * B.elempack)
                A2 = A.reshape(1, A.w, opt.workspace_allocator);
            else // if (A.w == B.w)
            {
                // otherwise only the header is rewritten: A becomes a pack-1
                // row of A.w * A.elempack scalars
                A2.dims = 2;
                A2.w = A.w * A.elempack;
                A2.elempack = 1;
                A2.elemsize = A.elemsize / A.elempack;
                A2.cstep = A.cstep * A.elempack;
            }
        }
        if (outdims == 3 && A.dims == 1)
        {
            // A's scalar length matches B's scalar channel count: A goes on the channel axis
            if (A.w * A.elempack == B.c * B.elempack)
                A2 = A.reshape(1, 1, A.w, opt.workspace_allocator);
            else // if (A.w == B.w)
            {
                // header-only rewrite to a pack-1 row, as above
                A2.dims = 3;
                A2.w = A.w * A.elempack;
                A2.elempack = 1;
                A2.elemsize = A.elemsize / A.elempack;
                A2.cstep = A.cstep * A.elempack;
            }
        }
        if (outdims == 3 && A.dims == 2)
            A2 = A.reshape(1, A.w, A.h, opt.workspace_allocator);
        if (outdims == 4 && A.dims == 1)
        {
            if (A.w * A.elempack == B.c * B.elempack)
                A2 = A.reshape(1, 1, 1, A.w, opt.workspace_allocator);
            else // if (A.w == B.w)
            {
                // header-only rewrite to a pack-1 row, as above
                A2.dims = 4;
                A2.w = A.w * A.elempack;
                A2.elempack = 1;
                A2.elemsize = A.elemsize / A.elempack;
                A2.cstep = A.cstep * A.elempack;
            }
        }
        if (outdims == 4 && A.dims == 2)
            A2 = A.reshape(1, 1, A.w, A.h, opt.workspace_allocator);
        if (outdims == 4 && A.dims == 3)
            A2 = A.reshape(1, A.w, A.h, A.c, opt.workspace_allocator);
    }
    // same rank expansion for B, mirrored
    if (B.dims < outdims)
    {
        // expand inner axes
        if (outdims == 2)
        {
            if (B.w * B.elempack == A.h * A.elempack)
                B2 = B.reshape(1, B.w, opt.workspace_allocator);
            else // if (B.w == A.w)
            {
                // header-only rewrite to a pack-1 row
                B2.dims = 2;
                B2.w = B.w * B.elempack;
                B2.elempack = 1;
                B2.elemsize = B.elemsize / B.elempack;
                B2.cstep = B.cstep * B.elempack;
            }
        }
        if (outdims == 3 && B.dims == 1)
        {
            if (B.w * B.elempack == A.c * A.elempack)
                B2 = B.reshape(1, 1, B.w, opt.workspace_allocator);
            else // if (B.w == A.w)
            {
                // header-only rewrite to a pack-1 row
                B2.dims = 3;
                B2.w = B.w * B.elempack;
                B2.elempack = 1;
                B2.elemsize = B.elemsize / B.elempack;
                B2.cstep = B.cstep * B.elempack;
            }
        }
        if (outdims == 3 && B.dims == 2)
            B2 = B.reshape(1, B.w, B.h, opt.workspace_allocator);
        if (outdims == 4 && B.dims == 1)
        {
            if (B.w * B.elempack == A.c * A.elempack)
                B2 = B.reshape(1, 1, 1, B.w, opt.workspace_allocator);
            else // if (B.w == A.w)
            {
                // header-only rewrite to a pack-1 row
                B2.dims = 4;
                B2.w = B.w * B.elempack;
                B2.elempack = 1;
                B2.elemsize = B.elemsize / B.elempack;
                B2.cstep = B.cstep * B.elempack;
            }
        }
        if (outdims == 4 && B.dims == 2)
            B2 = B.reshape(1, 1, B.w, B.h, opt.workspace_allocator);
        if (outdims == 4 && B.dims == 3)
            B2 = B.reshape(1, B.w, B.h, B.c, opt.workspace_allocator);
    }

    // output extents: the larger of the two operands along every axis
    const int outw = std::max(A2.w, B2.w);
    const int outh = std::max(A2.h, B2.h);
    const int outd = std::max(A2.d, B2.d);
    const int outc = std::max(A2.c, B2.c);
    const size_t out_elemsize = std::max(A2.elemsize, B2.elemsize);
    const int out_elempack = std::max(A2.elempack, B2.elempack);

    Mat& top_blob = top_blobs[0];
    if (outdims == 1)
    {
        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (outdims == 2)
    {
        top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (outdims == 3)
    {
        top_blob.create(outw, outh, outc, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (outdims == 4)
    {
        top_blob.create(outw, outh, outd, outc, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    // binary_op_broadcast is always called with the larger operand first (higher
    // elempack, or equal elempack and more scalars); when that is B, the
    // operands are swapped and the reversed operation code keeps the result the same
    const bool a_pack_is_lower = A2.elempack < B2.elempack;
    const bool a_pack_is_equal = A2.elempack == B2.elempack;
    const bool a_size_is_lower = A2.w * A2.h * A2.d * A2.c * A2.elempack < B2.w * B2.h * B2.d * B2.c * B2.elempack;
    if (a_pack_is_lower || (a_pack_is_equal && a_size_is_lower))
    {
        binary_op_broadcast(B2, A2, top_blob, get_reverse_op_type(op_type), opt);
    }
    else
    {
        binary_op_broadcast(A2, B2, top_blob, op_type, opt);
    }

    return 0;
}

// Single-input form: combine every element of the blob with the member
// scalar b using the member op_type, in place. Always returns 0.
int BinaryOp_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const float scalar = b;
    const int type = op_type;

    binary_op_scalar_inplace(bottom_top_blob, scalar, type, opt);

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/binaryop_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_BINARYOP_LOONGARCH_H
#define LAYER_BINARYOP_LOONGARCH_H

#include "binaryop.h"

namespace ncnn {

// LoongArch specialization of BinaryOp; the implementation uses LSX
// intrinsics when __loongarch_sx is defined.
class BinaryOp_loongarch : public BinaryOp
{
public:
    BinaryOp_loongarch();

    // Two-input form: top_blobs[0] = op(bottom_blobs[0], bottom_blobs[1]) with broadcasting.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

    // Single-input form: combines bottom_top_blob with the scalar member b in place.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_BINARYOP_LOONGARCH_H


================================================
FILE: src/layer/loongarch/cast_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "cast_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

namespace ncnn {

// Packed layouts are always advertised, independent of LSX availability.
Cast_loongarch::Cast_loongarch()
{
    this->support_packing = true;
}

int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    if (type_from == type_to)
    {
        top_blob = bottom_blob;
        return 0;
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    size_t out_elemsize = elemsize;
    if (type_to == 1)
    {
        if (type_from == 3)
        {
            Cast::forward(bottom_blob, top_blob, opt);
        }

        // float32
        out_elemsize = 4 * elempack;
    }
    else if (type_to == 2)
    {
        // float16
        out_elemsize = 2 * elempack;
    }
    else if (type_to == 3)
    {
        // int8
        out_elemsize = elempack;
    }
    else if (type_to == 4)
    {
        // bfloat16
        out_elemsize = 2 * elempack;
    }

    if (dims == 1)
    {
        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
    }
    else if (dims == 2)
    {
        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
    }
    else if (dims == 3)
    {
        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
    }
    else if (dims == 4)
    {
        top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    int size = w * h * d * elempack;

    if (type_from == 1 && type_to == 2)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            unsigned short* outptr = top_blob.channel(q);

            int i = 0;
#if __loongarch_sx
            for (; i + 7 < size; i += 8)
            {
                __builtin_prefetch(ptr + 16);
                __m128 _p0 = (__m128)__lsx_vld(ptr, 0);
                __m128 _p1 = (__m128)__lsx_vld(ptr + 4, 0);
                __m128i _p = __lsx_vfcvt_h_s(_p1, _p0);
                __lsx_vst(_p, outptr, 0);

                ptr += 8;
                outptr += 8;
            }
#endif // __loongarch_sx
            for (; i < size; i++)
            {
                *outptr = float32_to_float16(*ptr);
                outptr++;
                ptr++;
            }
        }
    }

    if (type_from == 2 && type_to == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const unsigned short* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            int i = 0;
#if __loongarch_sx
            for (; i + 7 < size; i += 8)
            {
                __builtin_prefetch(ptr + 16);
                __m128i _p = __lsx_vld(ptr, 0);
                __m128 _p0 = __lsx_vfcvtl_s_h(_p);
                __m128 _p1 = __lsx_vfcvth_s_h(_p);
                __lsx_vst(_p0, outptr, 0);
                __lsx_vst(_p1, outptr + 4, 0);

                ptr += 8;
                outptr += 8;
            }
#endif // __loongarch_sx
            for (; i < size; i++)
            {
                *outptr = float16_to_float32(*ptr);
                outptr++;
                ptr++;
            }
        }
    }

    if (type_from == 3 && type_to == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const signed char* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                outptr[i] = (float)ptr[i];
            }
        }
    }

    if (type_from == 4 && type_to == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const unsigned short* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            int i = 0;
            for (; i < size; i++)
            {
                *outptr = bfloat16_to_float32(*ptr);
                outptr++;
                ptr++;
            }
        }
    }

    if (type_from == 1 && type_to == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            unsigned short* outptr = top_blob.channel(q);

            int i = 0;
            for (; i < size; i++)
            {
                *outptr = float32_to_bfloat16(*ptr);
                outptr++;
                ptr++;
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/cast_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CAST_LOONGARCH_H
#define LAYER_CAST_LOONGARCH_H

#include "cast.h"

namespace ncnn {

// LoongArch specialisation of the Cast layer; overrides the single-blob
// forward that performs the element type conversion.
class Cast_loongarch : public Cast
{
public:
    Cast_loongarch();

    // Writes the converted copy of bottom_blob to top_blob.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_CAST_LOONGARCH_H


================================================
FILE: src/layer/loongarch/clip_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "clip_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

// Constructor: packed layout is advertised only when the build has LSX
// (__loongarch_sx) enabled.
Clip_loongarch::Clip_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif
}

// Clamp every float of the blob into [min, max], in place.
// Channels are processed in parallel; with LSX four floats are clamped per
// iteration and the remaining elements go through the scalar loop.
// Always returns 0.
int Clip_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;

    // scalar count of one channel, packed lanes included
    const int count = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* data = bottom_top_blob.channel(q);

        int n = 0;
#if __loongarch_sx
        // broadcast both bounds once per channel
        __m128 _upper = (__m128)__lsx_vreplfr2vr_s(max);
        __m128 _lower = (__m128)__lsx_vreplfr2vr_s(min);
        while (n + 3 < count)
        {
            float* cur = data + n;

            __builtin_prefetch(cur + 16);
            __m128 _v = (__m128)__lsx_vld(cur, 0);
            _v = __lsx_vfmax_s(_v, _lower);
            _v = __lsx_vfmin_s(_v, _upper);
            __lsx_vst(_v, cur, 0);

            n += 4;
        }
#endif // __loongarch_sx
        while (n < count)
        {
            // lower bound first, then upper bound, same order as the vector path
            float v = data[n];

            if (v < min)
                v = min;

            if (v > max)
                v = max;

            data[n] = v;
            n++;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/clip_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CLIP_LOONGARCH_H
#define LAYER_CLIP_LOONGARCH_H

#include "clip.h"

namespace ncnn {

// LoongArch specialisation of the Clip layer; overrides the in-place forward.
class Clip_loongarch : public Clip
{
public:
    Clip_loongarch();

    // Clamps bottom_top_blob in place.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_CLIP_LOONGARCH_H


================================================
FILE: src/layer/loongarch/concat_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "concat_loongarch.h"

namespace ncnn {

// Constructor: packed layout is advertised only when the build has LSX
// (__loongarch_sx) enabled.
Concat_loongarch::Concat_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

// Concatenate all bottom_blobs along `axis` into top_blobs[0].
//
// The dimensionality is taken from bottom_blobs[0]; a negative axis counts
// from the end (positive_axis = dims + axis). Each (dims, positive_axis)
// combination has its own branch below. Branches that concatenate along the
// packed (outermost) dimension may receive inputs with elempack 4 or 1 and
// choose the output pack from the total extent; the other branches copy rows or
// planes with memcpy and keep the element size / pack of bottom_blobs[0].
//
// Data is addressed through float pointers.
// NOTE(review): presumably only 4-byte scalar storage reaches this layer - confirm.
//
// Returns 0 on success and -100 when an output or scratch allocation fails.
int Concat_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    int dims = bottom_blobs[0].dims;
    int positive_axis = axis < 0 ? dims + axis : axis;

    if (dims == 1) // positive_axis == 0
    {
        // concat vector
        // total length
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;
        int top_w = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w * bottom_blob.elempack;
        }

        // pack by 4 only when packing is enabled and the total length divides evenly
        int out_elempack = opt.use_packing_layout && top_w % 4 == 0 ? 4 : 1;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // inputs are appended back to back; outptr advances in scalar floats
        float* outptr = top_blob;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            const float* ptr = bottom_blob;
            memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);

            outptr += bottom_blob.w * bottom_blob.elempack;
        }
    }

    if (dims == 2 && positive_axis == 0)
    {
        // concat image
        int w = bottom_blobs[0].w;

        // total height
        // elemsize / elempack end up as the smallest values among all inputs
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;
        int top_h = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            elemsize = std::min(elemsize, bottom_blob.elemsize);
            elempack = std::min(elempack, bottom_blob.elempack);
            top_h += bottom_blob.h * bottom_blob.elempack;
        }

        int out_elempack = opt.use_packing_layout && top_h % 4 == 0 ? 4 : 1;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // when some input is less packed than the output, gather into a scratch
        // blob at the smallest pack and repack into top_blob at the end
        Mat top_blob_unpacked = top_blob;
        if (elempack < out_elempack)
        {
            top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
            if (top_blob_unpacked.empty())
                return -100;
        }

        float* outptr = top_blob_unpacked;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            if (bottom_blob.elempack == 4 && elempack == 1)
            {
                // unpack: one pack-4 row becomes four consecutive pack-1 rows
                for (int i = 0; i < bottom_blob.h; i++)
                {
                    const float* r0 = bottom_blob.row(i);

                    float* outptr0 = outptr;
                    float* outptr1 = outptr + w;
                    float* outptr2 = outptr + w * 2;
                    float* outptr3 = outptr + w * 3;

                    for (int j = 0; j < w; j++)
                    {
                        *outptr0++ = r0[0];
                        *outptr1++ = r0[1];
                        *outptr2++ = r0[2];
                        *outptr3++ = r0[3];

                        r0 += 4;
                    }

                    outptr += w * 4;
                }
            }
            else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4)
            {
                // same pack on both sides: straight copy of the whole input
                int size = w * bottom_blob.h;

                const float* ptr = bottom_blob;
                memcpy(outptr, ptr, size * bottom_blob.elemsize);

                outptr += size * bottom_blob.elempack;
            }
        }

        // packing
        if (elempack < out_elempack)
        {
            convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
        }
    }

    if (dims == 2 && positive_axis == 1)
    {
        // interleave image row
        int h = bottom_blobs[0].h;
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;

        // total width
        int top_w = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // each output row is the inputs' rows of the same index placed side by side
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            float* outptr = top_blob.row(i);
            for (size_t b = 0; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob = bottom_blobs[b];

                const float* ptr = bottom_blob.row(i);
                memcpy(outptr, ptr, bottom_blob.w * elemsize);

                outptr += bottom_blob.w * elempack;
            }
        }
    }

    if ((dims == 3 || dims == 4) && positive_axis == 0)
    {
        // concat dim
        int w = bottom_blobs[0].w;
        int h = bottom_blobs[0].h;
        int d = bottom_blobs[0].d;

        // total channels
        // elemsize / elempack end up as the smallest values among all inputs
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;
        int top_channels = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            elemsize = std::min(elemsize, bottom_blob.elemsize);
            elempack = std::min(elempack, bottom_blob.elempack);
            top_channels += bottom_blob.c * bottom_blob.elempack;
        }

        int out_elempack = opt.use_packing_layout && top_channels % 4 == 0 ? 4 : 1;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, h, d, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // the 4-D create() is used for both ranks; restore the real rank afterwards
        top_blob.dims = dims;

        // scratch blob at the smallest input pack when repacking is required
        Mat top_blob_unpacked = top_blob;
        if (elempack < out_elempack)
        {
            top_blob_unpacked.create(w, h, d, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
            if (top_blob_unpacked.empty())
                return -100;

            top_blob_unpacked.dims = dims;
        }

        // p is the next output channel index to fill
        int p = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            if (bottom_blob.elempack == 4 && elempack == 1)
            {
                // unpack: one pack-4 channel is spread over four pack-1 channels
                int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;

                for (int q = 0; q < bottom_blob.c; q++)
                {
                    const float* r0 = bottom_blob.channel(q);

                    float* outptr0 = top_blob_unpacked.channel(p);
                    float* outptr1 = top_blob_unpacked.channel(p + 1);
                    float* outptr2 = top_blob_unpacked.channel(p + 2);
                    float* outptr3 = top_blob_unpacked.channel(p + 3);

                    for (int i = 0; i < size; i++)
                    {
                        *outptr0++ = r0[0];
                        *outptr1++ = r0[1];
                        *outptr2++ = r0[2];
                        *outptr3++ = r0[3];

                        r0 += 4;
                    }

                    p += 4;
                }
            }
            else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4)
            {
                // same pack on both sides: copy the whole input in one memcpy
                int size = bottom_blob.total();

                const float* ptr = bottom_blob;
                float* outptr = top_blob_unpacked.channel(p);
                memcpy(outptr, ptr, size * bottom_blob.elemsize);

                p += bottom_blob.c;
            }
        }

        // packing
        if (elempack < out_elempack)
        {
            convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
        }
    }

    if ((dims == 3 && positive_axis == 1) || (dims == 4 && positive_axis == 2))
    {
        // interleave dim height
        int w = bottom_blobs[0].w;
        int d = bottom_blobs[0].d;
        int channels = bottom_blobs[0].c;
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;

        // total height
        int top_h = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_h += bottom_blob.h;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h, d, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // the 4-D create() is used for both ranks; restore the real rank afterwards
        top_blob.dims = dims;

        // per channel and depth slice, the inputs' w*h planes are stacked in order
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < d; i++)
            {
                for (size_t b = 0; b < bottom_blobs.size(); b++)
                {
                    const Mat& bottom_blob = bottom_blobs[b];

                    int size = bottom_blob.w * bottom_blob.h;

                    const float* ptr = bottom_blob.channel(q).depth(i);
                    memcpy(outptr, ptr, size * elemsize);

                    outptr += size * elempack;
                }
            }
        }
    }

    if ((dims == 3 && positive_axis == 2) || (dims == 4 && positive_axis == 3))
    {
        // interleave dim width
        int h = bottom_blobs[0].h;
        int d = bottom_blobs[0].d;
        int channels = bottom_blobs[0].c;
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;

        // total width
        int top_w = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, d, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // the 4-D create() is used for both ranks; restore the real rank afterwards
        top_blob.dims = dims;

        // every output row is the inputs' matching rows placed side by side
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < d; i++)
            {
                for (int j = 0; j < h; j++)
                {
                    for (size_t b = 0; b < bottom_blobs.size(); b++)
                    {
                        const Mat& bottom_blob = bottom_blobs[b];

                        const float* ptr = bottom_blob.channel(q).depth(i).row(j);
                        memcpy(outptr, ptr, bottom_blob.w * elemsize);

                        outptr += bottom_blob.w * elempack;
                    }
                }
            }
        }
    }

    if (dims == 4 && positive_axis == 1)
    {
        // interleave dim depth
        int w = bottom_blobs[0].w;
        int h = bottom_blobs[0].h;
        int channels = bottom_blobs[0].c;
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;

        // total depth
        int top_d = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_d += bottom_blob.d;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, h, top_d, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // per channel, each input contributes its whole w*h*d volume in order
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* outptr = top_blob.channel(q);

            for (size_t b = 0; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob = bottom_blobs[b];

                int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;

                const float* ptr = bottom_blob.channel(q);
                memcpy(outptr, ptr, size * elemsize);

                outptr += size * elempack;
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/concat_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONCAT_LOONGARCH_H
#define LAYER_CONCAT_LOONGARCH_H

#include "concat.h"

namespace ncnn {

// LoongArch specialisation of the Concat layer; overrides the multi-blob forward.
class Concat_loongarch : public Concat
{
public:
    Concat_loongarch();

    // Concatenates bottom_blobs into top_blobs[0].
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_CONCAT_LOONGARCH_H


================================================
FILE: src/layer/loongarch/convolution1d_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "convolution1d_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_activation.h"
#include "loongarch_usability.h"

namespace ncnn {

// Constructor: packed layout is advertised only when the build has LSX
// (__loongarch_sx) enabled.
Convolution1D_loongarch::Convolution1D_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

// Pre-pack the static weights into weight_data_packed for the forward kernels.
//
// Nothing is done when dynamic_weight is set (weights then arrive as an input
// blob at forward time). Otherwise the weights are reordered from
// kw-inch-outch to pb-pa-kw-inch/pa-outch/pb, where pa / pb are the input /
// output pack sizes (4 when LSX packing is enabled and the respective channel
// count is a multiple of 4, else 1).
//
// Returns 0 on success and -100 when the packed weight blob cannot be
// allocated.
int Convolution1D_loongarch::create_pipeline(const Option& opt)
{
    if (dynamic_weight)
        return 0;

    const int num_input = weight_data_size / kernel_w / num_output;

    int elempack = 1;
    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        elempack = num_input % 4 == 0 ? 4 : 1;
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif

    // src = kw-inch-outch
    // dst = pb-pa-kw-inch/pa-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(kernel_w, num_input, num_output);

        weight_data_packed.create(kernel_w, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);
        // Without this check a failed allocation would be written through a
        // null pointer in the loops below.
        if (weight_data_packed.empty())
            return -100;

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            // g00 walks the packed output-channel group sequentially
            float* g00 = weight_data_packed.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                for (int k = 0; k < kernel_w; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        for (int j = 0; j < out_elempack; j++)
                        {
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                            g00[0] = k00[k];

                            g00++;
                        }
                    }
                }
            }
        }
    }

    // the original layout is no longer needed once the packed copy exists
    if (opt.lightmode)
        weight_data.release();

    return 0;
}

// Counterpart of create_pipeline(); performs no work here and always
// returns 0. The option argument is unused.
int Convolution1D_loongarch::destroy_pipeline(const Option& /*opt*/)
{
    return 0;
}

// 1-D convolution with the pre-packed static weights (weight_data_packed).
//
// The input is padded through make_padding(). The output has
// outw = (padded_w - kernel_extent_w) / stride_w + 1 columns and
// num_output / out_elempack rows, where out_elempack is 4 when LSX packing is
// enabled and num_output is a multiple of 4, else 1. One of four kernels runs
// depending on the (input pack, output pack) pair; each adds the optional
// bias, accumulates over all input rows and kernel taps, then applies the
// configured activation.
// NOTE(review): each row of the padded input appears to be one (packed) input
// channel, matching the kw-inch-outch weight layout - confirm.
//
// Returns 0 on success and -100 when padding or output allocation fails.
int Convolution1D_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // span covered by the dilated kernel
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    // from here on w / h describe the padded input
    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif
    size_t out_elemsize = elemsize / elempack * out_elempack;

    const int outw = (w - kernel_extent_w) / stride_w + 1;
    const int outh = num_output / out_elempack;

    top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

#if __loongarch_sx
    // pack4 input -> pack4 output: 16 weights (4x4) per kernel tap
    if (elempack == 4 && out_elempack == 4)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                float* outptr = top_blob.row(p);

                for (int j = 0; j < outw; j++)
                {
                    __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);

                    if (bias_term)
                    {
                        _sum = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0);
                    }

                    const float* kptr = weight_data_packed.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const float* sptr = bottom_blob_bordered.row(q) + j * stride_w * 4;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            // broadcast each of the four input lanes and
                            // multiply by its column of four output weights
                            __m128 _val0 = __lsx_vreplfr2vr_s(sptr[0]);
                            __m128 _val1 = __lsx_vreplfr2vr_s(sptr[1]);
                            __m128 _val2 = __lsx_vreplfr2vr_s(sptr[2]);
                            __m128 _val3 = __lsx_vreplfr2vr_s(sptr[3]);

                            __m128 _w0 = (__m128)__lsx_vld(kptr, 0);
                            __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0);
                            __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0);
                            __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0);

                            _sum = __lsx_vfmadd_s(_w0, _val0, _sum);
                            _sum = __lsx_vfmadd_s(_w1, _val1, _sum);
                            _sum = __lsx_vfmadd_s(_w2, _val2, _sum);
                            _sum = __lsx_vfmadd_s(_w3, _val3, _sum);

                            sptr += dilation_w * 4;
                            kptr += 16;
                        }
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    __lsx_vst(_sum, outptr, 0);
                    outptr += 4;
                }
            }
        }
    }

    // pack1 input -> pack4 output: 4 weights per kernel tap
    if (elempack == 1 && out_elempack == 4)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                float* outptr = top_blob.row(p);

                for (int j = 0; j < outw; j++)
                {
                    __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);

                    if (bias_term)
                    {
                        _sum = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0);
                    }

                    const float* kptr = weight_data_packed.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const float* sptr = bottom_blob_bordered.row(q) + j * stride_w;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            __m128 _val = __lsx_vreplfr2vr_s(sptr[0]);
                            __m128 _w = (__m128)__lsx_vld(kptr, 0);
                            _sum = __lsx_vfmadd_s(_w, _val, _sum);

                            sptr += dilation_w;
                            kptr += 4;
                        }
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    __lsx_vst(_sum, outptr, 0);
                    outptr += 4;
                }
            }
        }
    }

    // pack4 input -> pack1 output: lane-wise products reduced to one scalar
    if (elempack == 4 && out_elempack == 1)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                float* outptr = top_blob.row(p);

                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);

                    const float* kptr = weight_data_packed.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const float* sptr = bottom_blob_bordered.row(q) + j * stride_w * 4;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            __m128 _val = (__m128)__lsx_vld(sptr, 0);
                            __m128 _w = (__m128)__lsx_vld(kptr, 0);
                            _sum = __lsx_vfmadd_s(_w, _val, _sum);

                            sptr += dilation_w * 4;
                            kptr += 4;
                        }
                    }

                    // horizontal add of the four partial sums on top of the bias
                    sum += __lsx_reduce_fadd_s(_sum);

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[j] = sum;
                }
            }
        }
    }
#endif // __loongarch_sx

    // pack1 input -> pack1 output: plain scalar reference path
    if (elempack == 1 && out_elempack == 1)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                float* outptr = top_blob.row(p);

                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    const float* kptr = weight_data_packed.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const float* sptr = bottom_blob_bordered.row(q) + j * stride_w;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float val = sptr[0];
                            float wt = kptr[0];
                            sum += val * wt;

                            sptr += dilation_w;
                            kptr += 1;
                        }
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[j] = sum;
                }
            }
        }
    }

    return 0;
}

// Dynamic-weight 1-D convolution: weights (and the bias, when bias_term is
// set) are supplied as input blobs.
//
// bottom_blobs[0] is the data, bottom_blobs[1] the weights and, when
// bias_term is set, bottom_blobs[2] the bias. Weights and bias are flattened
// and reinterpreted as pack-1, then a temporary Convolution1D layer is built
// with matching parameters and run once.
//
// Returns 0 on success, -100 when flattening fails, -1 when the temporary
// layer cannot be created, or the first non-zero code reported by the
// temporary layer's load_param / load_model / create_pipeline / forward.
int Convolution1D_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _kernel_w = _weight_data.w;
    const int _num_output = _weight_data.c * _weight_data.elempack;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    // weight_data_flattened as pack1
    weight_data_flattened.w *= weight_data_flattened.elempack;
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
    weight_data_flattened.elempack = 1;

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;

        // bias_data_flattened as pack1
        bias_data_flattened.w *= bias_data_flattened.elempack;
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
        bias_data_flattened.elempack = 1;
    }

    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);
    if (!op)
        return -1;

    ncnn::ParamDict pd;
    pd.set(0, _num_output);
    pd.set(1, _kernel_w);
    pd.set(2, dilation_w);
    pd.set(3, stride_w);
    pd.set(4, pad_left);
    pd.set(15, pad_right);
    pd.set(18, pad_value);
    pd.set(5, bias_term);
    pd.set(6, weight_data_flattened.w);
    pd.set(9, activation_type);
    pd.set(10, activation_params);

    // Every stage runs only if the previous one succeeded; the first failure
    // code is what the caller sees.
    int ret = op->load_param(pd);

    if (ret == 0)
    {
        ncnn::Mat weights[2];
        weights[0] = weight_data_flattened;
        weights[1] = bias_data_flattened;

        ret = op->load_model(ncnn::ModelBinFromMatArray(weights));
    }

    if (ret == 0)
    {
        ret = op->create_pipeline(opt);

        if (ret == 0)
            ret = op->forward(bottom_blob, top_blob, opt);

        // release whatever create_pipeline() set up, even after a failure
        op->destroy_pipeline(opt);
    }

    delete op;

    return ret;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/convolution1d_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONVOLUTION1D_LOONGARCH_H
#define LAYER_CONVOLUTION1D_LOONGARCH_H

#include "convolution1d.h"

namespace ncnn {

// LoongArch variant of the Convolution1D layer. It derives from the generic
// Convolution1D and declares its own pipeline hooks and forward() overloads.
class Convolution1D_loongarch : public Convolution1D
{
public:
    Convolution1D_loongarch();

    // Pipeline set-up / tear-down hooks; both report a status code as int.
    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // Single-input forward: reads bottom_blob and writes top_blob.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Multi-input forward: takes its blobs from bottom_blobs and writes to top_blobs.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

public:
    // packn
    // Packed copy of the weights. NOTE(review): presumably filled in by
    // create_pipeline() and consumed by the single-input forward() - confirm
    // against the .cpp, which is not fully visible from this header.
    Mat weight_data_packed;
};

} // namespace ncnn

#endif // LAYER_CONVOLUTION1D_LOONGARCH_H


================================================
FILE: src/layer/loongarch/convolution_1x1.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void conv1x1s1_sgemm_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_lsx(bottom_im2col, top_blob, kernel, _bias, opt);
}


================================================
FILE: src/layer/loongarch/convolution_1x1_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void conv1x1s1_sgemm_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_int8_lsx(bottom_im2col, top_blob, kernel, opt);
}

static void conv1x1s2_sgemm_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int tailstep = w - 2 * outw + w;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const signed char* r0 = bottom_blob.channel(p);
        signed char* outptr = bottom_blob_shrinked.channel(p);

        for (int i = 0; i < outh; i++)
        {
            int j = 0;
            for (; j + 3 < outw; j += 4)
            {
                outptr[0] = r0[0];
                outptr[1] = r0[2];
                outptr[2] = r0[4];
                outptr[3] = r0[6];

                r0 += 8;
                outptr += 4;
            }
            for (; j + 1 < outw; j += 2)
            {
                outptr[0] = r0[0];
                outptr[1] = r0[2];

                r0 += 4;
                outptr += 2;
            }
            for (; j < outw; j++)
            {
                outptr[0] = r0[0];

                r0 += 2;
                outptr += 1;
            }

            r0 += tailstep;
        }
    }

    conv1x1s1_sgemm_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt);
}


================================================
FILE: src/layer/loongarch/convolution_1x1_pack1to4_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void conv1x1s1_sgemm_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack1to4_int8_lsx(bottom_im2col, top_blob, kernel, opt);
}

static void conv1x1s2_sgemm_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int tailstep = w - 2 * outw + w;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const signed char* r0 = bottom_blob.channel(p);
        signed char* outptr = bottom_blob_shrinked.channel(p);

        for (int i = 0; i < outh; i++)
        {
            int j = 0;
            for (; j + 3 < outw; j += 4)
            {
                outptr[0] = r0[0];
                outptr[1] = r0[2];
                outptr[2] = r0[4];
                outptr[3] = r0[6];

                r0 += 8;
                outptr += 4;
            }
            for (; j + 1 < outw; j += 2)
            {
                outptr[0] = r0[0];
                outptr[1] = r0[2];

                r0 += 4;
                outptr += 2;
            }
            for (; j < outw; j++)
            {
                outptr[0] = r0[0];

                r0 += 2;
                outptr += 1;
            }

            r0 += tailstep;
        }
    }

    conv1x1s1_sgemm_pack1to4_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt);
}


================================================
FILE: src/layer/loongarch/convolution_1x1_pack4.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void conv1x1s1_sgemm_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack4_lsx(bottom_im2col, top_blob, kernel, _bias, opt);
}

static void conv1x1s2_sgemm_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int tailstep = (w - 2 * outw + w) * 4;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const float* r0 = bottom_blob.channel(p);
        float* outptr = bottom_blob_shrinked.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                __m128 _val = (__m128)__lsx_vld(r0, 0);
                __lsx_vst(_val, outptr, 0);

                r0 += 4 * 2;
                outptr += 4;
            }

            r0 += tailstep;
        }
    }

    conv1x1s1_sgemm_pack4_lsx(bottom_blob_shrinked, top_blob, kernel, _bias, opt);
}


================================================
FILE: src/layer/loongarch/convolution_1x1_pack4to1.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void conv1x1s1_sgemm_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack4to1_lsx(bottom_im2col, top_blob, kernel, _bias, opt);
}

static void conv1x1s2_sgemm_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int tailstep = (w - 2 * outw + w) * 4;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const float* r0 = bottom_blob.channel(p);
        float* outptr = bottom_blob_shrinked.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                __m128 _val = (__m128)__lsx_vld(r0, 0);
                __lsx_vst(_val, outptr, 0);

                r0 += 4 * 2;
                outptr += 4;
            }

            r0 += tailstep;
        }
    }

    conv1x1s1_sgemm_pack4to1_lsx(bottom_blob_shrinked, top_blob, kernel, _bias, opt);
}


================================================
FILE: src/layer/loongarch/convolution_1x1_pack8to1_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void conv1x1s1_sgemm_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack8to1_int8_lsx(bottom_im2col, top_blob, kernel, opt);
}

static void conv1x1s2_sgemm_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int tailstep = w - 2 * outw + w;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const int64_t* r0 = bottom_blob.channel(p);
        int64_t* outptr = bottom_blob_shrinked.channel(p);

        for (int i = 0; i < outh; i++)
        {
            int j = 0;
            for (; j < outw; j++)
            {
                outptr[0] = r0[0];

                r0 += 2;
                outptr += 1;
            }

            r0 += tailstep;
        }
    }

    conv1x1s1_sgemm_pack8to1_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt);
}


================================================
FILE: src/layer/loongarch/convolution_1x1_pack8to4_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void conv1x1s1_sgemm_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack8to4_int8_lsx(bottom_im2col, top_blob, kernel, opt);
}

static void conv1x1s2_sgemm_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int tailstep = w - 2 * outw + w;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const int64_t* r0 = bottom_blob.channel(p);
        int64_t* outptr = bottom_blob_shrinked.channel(p);

        for (int i = 0; i < outh; i++)
        {
            int j = 0;
            for (; j < outw; j++)
            {
                outptr[0] = r0[0];

                r0 += 2;
                outptr += 1;
            }

            r0 += tailstep;
        }
    }

    conv1x1s1_sgemm_pack8to4_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt);
}


================================================
FILE: src/layer/loongarch/convolution_3x3.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// Pre-computes the winograd F(2,3) kernel transform for a 3x3 convolution.
//
// kernel      - source weights, read as floats laid out [outch][inch][9]
// kernel_tm2  - destination; created here and filled with the transformed,
//               output-channel-interleaved weights
// inch/outch  - number of input / output channels
// opt         - only opt.num_threads is used (for the OpenMP loop)
//
// Step 1 turns every 3x3 kernel into 16 coefficients (G * k * G^T).
// Step 2 regroups them so that each row k of kernel_tm2 holds coefficient k,
// ordered by input channel, with the output channels of one group stored
// next to each other.
static void conv3x3s1_winograd23_transform_kernel_lsx(const Mat& kernel, Mat& kernel_tm2, int inch, int outch, const Option& opt)
{
    // intermediate result: 16 coefficients per (outch, inch) pair
    Mat kernel_tm(4 * 4, inch, outch);

    // G
    const float ktm[4][3] = {
        {1.0f, 0.0f, 0.0f},
        {1.0f / 2, 1.0f / 2, 1.0f / 2},
        {1.0f / 2, -1.0f / 2, 1.0f / 2},
        {0.0f, 0.0f, 1.0f}
    };

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            // 9 source weights of output channel p / input channel q
            const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
            float* kernel_tm0 = kernel_tm.channel(p).row(q);

            // transform kernel
            // k0/k1/k2 are the three rows of the 3x3 kernel
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;

            // h
            // tmp[i][r] = dot(row r of the kernel, row i of G)
            float tmp[4][3];
            for (int i = 0; i < 4; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            // kernel_tm0[j * 4 + i] = dot(tmp[j], row i of G)
            for (int j = 0; j < 4; j++)
            {
                float* tmpp = &tmp[j][0];

                for (int i = 0; i < 4; i++)
                {
                    kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave
    // src = 16-inch-outch
    // dst = inch-16-outch
    // With LSX the output channels are grouped by 8, then by 4, then singly;
    // without it they are grouped by 2, then singly. One destination channel
    // is allocated per group, and every row is wide enough for the largest group.
#if __loongarch_sx
    kernel_tm2.create(8 * inch, 16, outch / 8 + (outch % 8) / 4 + outch % 4);
#else
    kernel_tm2.create(2 * inch, 16, outch / 2 + outch % 2);
#endif

    int q = 0;
#if __loongarch_sx
    // groups of 8 output channels
    for (; q + 7 < outch; q += 8)
    {
        Mat g0 = kernel_tm2.channel(q / 8);

        for (int k = 0; k < 16; k++)
        {
            float* g00 = g0.row(k);

            for (int p = 0; p < inch; p++)
            {
                for (int i = 0; i < 8; i++)
                {
                    const float* k00 = kernel_tm.channel(q + i).row(p);
                    g00[0] = k00[k];
                    g00++;
                }
            }
        }
    }
    // at most one remaining group of 4 output channels
    for (; q + 3 < outch; q += 4)
    {
        Mat g0 = kernel_tm2.channel(q / 8 + (q % 8) / 4);

        for (int k = 0; k < 16; k++)
        {
            float* g00 = g0.row(k);

            for (int p = 0; p < inch; p++)
            {
                for (int i = 0; i < 4; i++)
                {
                    const float* k00 = kernel_tm.channel(q + i).row(p);
                    g00[0] = k00[k];
                    g00++;
                }
            }
        }
    }
#else  // __loongarch_sx
    // groups of 2 output channels
    for (; q + 1 < outch; q += 2)
    {
        Mat g0 = kernel_tm2.channel(q / 2);

        for (int k = 0; k < 16; k++)
        {
            float* g00 = g0.row(k);

            for (int p = 0; p < inch; p++)
            {
                for (int i = 0; i < 2; i++)
                {
                    const float* k00 = kernel_tm.channel(q + i).row(p);
                    g00[0] = k00[k];
                    g00++;
                }
            }
        }
    }
#endif // __loongarch_sx
    // leftover output channels, one destination channel each
    for (; q < outch; q++)
    {
        // index = number of full groups already emitted + position among the leftovers
#if __loongarch_sx
        Mat g0 = kernel_tm2.channel(q / 8 + (q % 8) / 4 + q % 4);
#else
        Mat g0 = kernel_tm2.channel(q / 2 + q % 2);
#endif

        for (int k = 0; k < 16; k++)
        {
            float* g00 = g0.row(k);

            for (int p = 0; p < inch; p++)
            {
                const float* k00 = kernel_tm.channel(q).row(p);
                g00[0] = k00[k];
                g00++;
            }
        }
    }
}

// 3x3 stride-1 convolution via winograd F(2,3).
//
// bottom_blob is padded on its bottom/right edges so the output divides into
// whole 2x2 tiles, transformed, multiplied against the pre-transformed
// kernel_tm, transformed back with bias, and finally cropped into top_blob.
// top_blob's w/h/c are read as the desired output shape.
static void conv3x3s1_winograd23_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
    const int inch = bottom_blob.c;
    const int outch = top_blob.c;

    // output extent rounded up to a multiple of the 2x2 tile size
    const int tiled_outw = (top_blob.w + 1) / 2 * 2;
    const int tiled_outh = (top_blob.h + 1) / 2 * 2;

    // pad to 2n+2, winograd F(2,3)
    const int padded_w = tiled_outw + 2;
    const int padded_h = tiled_outh + 2;

    Mat bottom_blob_bordered = bottom_blob;
    {
        // the padded input is a temporary, so it is taken from the workspace allocator
        Option opt_b = opt;
        opt_b.blob_allocator = opt.workspace_allocator;
        copy_make_border(bottom_blob, bottom_blob_bordered, 0, padded_h - bottom_blob.h, 0, padded_w - bottom_blob.w, 0, 0.f, opt_b);
    }

    // ---- input transform: 16 coefficients per tile and input channel
    Mat bottom_blob_tm;
    {
        const int tiles_x = tiled_outw / 2;
        const int tiles_y = tiled_outh / 2;
        const int tiles = tiles_x * tiles_y;

        bottom_blob_tm.create(tiles, 16, inch, 4u, opt.workspace_allocator);
        conv3x3s1_winograd23_transform_input_lsx(bottom_blob_bordered, bottom_blob_tm, opt);
    }
    // the padded input is no longer needed
    bottom_blob_bordered = Mat();

    // ---- multiply with the transformed kernel
    Mat top_blob_tm;
    convolution_winograd_dot_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);

    // ---- output transform: write straight into top_blob when no cropping is
    //      required, otherwise into a tile-aligned workspace blob
    Mat top_blob_bordered;
    if (tiled_outw != top_blob.w || tiled_outh != top_blob.h)
    {
        top_blob_bordered.create(tiled_outw, tiled_outh, outch, 4u, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
    }
    conv3x3s1_winograd23_transform_output_lsx(top_blob_tm, top_blob_bordered, bias, opt);

    // ---- drop the bottom/right padding rows and columns
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}

// Pre-computes the winograd F(4,3) kernel transform for a 3x3 convolution.
//
// kernel      - source weights, read as floats laid out [outch][inch][9]
// kernel_tm2  - destination; created here and filled with the transformed,
//               output-channel-interleaved weights
// inch/outch  - number of input / output channels
// opt         - only opt.num_threads is used (for the OpenMP loop)
//
// Step 1 turns every 3x3 kernel into 36 coefficients (G * k * G^T).
// Step 2 regroups them so that each row k of kernel_tm2 holds coefficient k,
// ordered by input channel, with the output channels of one group stored
// next to each other.
static void conv3x3s1_winograd43_transform_kernel_lsx(const Mat& kernel, Mat& kernel_tm2, int inch, int outch, const Option& opt)
{
    // intermediate result: 36 coefficients per (outch, inch) pair
    Mat kernel_tm(6 * 6, inch, outch);

    // G
    const float ktm[6][3] = {
        {1.0f / 4, 0.0f, 0.0f},
        {-1.0f / 6, -1.0f / 6, -1.0f / 6},
        {-1.0f / 6, 1.0f / 6, -1.0f / 6},
        {1.0f / 24, 1.0f / 12, 1.0f / 6},
        {1.0f / 24, -1.0f / 12, 1.0f / 6},
        {0.0f, 0.0f, 1.0f}
    };

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            // 9 source weights of output channel p / input channel q
            const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
            float* kernel_tm0 = kernel_tm.channel(p).row(q);

            // transform kernel
            // k0/k1/k2 are the three rows of the 3x3 kernel
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;

            // h
            // tmp[i][r] = dot(row r of the kernel, row i of G)
            float tmp[6][3];
            for (int i = 0; i < 6; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            // kernel_tm0[j * 6 + i] = dot(tmp[j], row i of G)
            for (int j = 0; j < 6; j++)
            {
                float* tmpp = &tmp[j][0];

                for (int i = 0; i < 6; i++)
                {
                    kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave
    // src = 36-inch-outch
    // dst = inch-36-outch
    // With LSX the output channels are grouped by 8, then by 4, then singly;
    // without it they are grouped by 2, then singly. One destination channel
    // is allocated per group, and every row is wide enough for the largest group.
#if __loongarch_sx
    kernel_tm2.create(8 * inch, 36, outch / 8 + (outch % 8) / 4 + outch % 4);
#else
    kernel_tm2.create(2 * inch, 36, outch / 2 + outch % 2);
#endif

    int q = 0;
#if __loongarch_sx
    // groups of 8 output channels
    for (; q + 7 < outch; q += 8)
    {
        Mat g0 = kernel_tm2.channel(q / 8);

        for (int k = 0; k < 36; k++)
        {
            float* g00 = g0.row(k);

            for (int p = 0; p < inch; p++)
            {
                for (int i = 0; i < 8; i++)
                {
                    const float* k00 = kernel_tm.channel(q + i).row(p);
                    g00[0] = k00[k];
                    g00++;
                }
            }
        }
    }
    // at most one remaining group of 4 output channels
    for (; q + 3 < outch; q += 4)
    {
        Mat g0 = kernel_tm2.channel(q / 8 + (q % 8) / 4);

        for (int k = 0; k < 36; k++)
        {
            float* g00 = g0.row(k);

            for (int p = 0; p < inch; p++)
            {
                for (int i = 0; i < 4; i++)
                {
                    const float* k00 = kernel_tm.channel(q + i).row(p);
                    g00[0] = k00[k];
                    g00++;
                }
            }
        }
    }
#else  // __loongarch_sx
    // groups of 2 output channels
    for (; q + 1 < outch; q += 2)
    {
        Mat g0 = kernel_tm2.channel(q / 2);

        for (int k = 0; k < 36; k++)
        {
            float* g00 = g0.row(k);

            for (int p = 0; p < inch; p++)
            {
                for (int i = 0; i < 2; i++)
                {
                    const float* k00 = kernel_tm.channel(q + i).row(p);
                    g00[0] = k00[k];
                    g00++;
                }
            }
        }
    }
#endif // __loongarch_sx
    // leftover output channels, one destination channel each
    for (; q < outch; q++)
    {
        // index = number of full groups already emitted + position among the leftovers
#if __loongarch_sx
        Mat g0 = kernel_tm2.channel(q / 8 + (q % 8) / 4 + q % 4);
#else
        Mat g0 = kernel_tm2.channel(q / 2 + q % 2);
#endif

        for (int k = 0; k < 36; k++)
        {
            float* g00 = g0.row(k);

            for (int p = 0; p < inch; p++)
            {
                const float* k00 = kernel_tm.channel(q).row(p);
                g00[0] = k00[k];
                g00++;
            }
        }
    }
}

// 3x3 stride-1 convolution via winograd F(4,3).
//
// bottom_blob is padded on its bottom/right edges so the output divides into
// whole 4x4 tiles, transformed, multiplied against the pre-transformed
// kernel_tm, transformed back with bias, and finally cropped into top_blob.
// top_blob's w/h/c are read as the desired output shape.
static void conv3x3s1_winograd43_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
    const int inch = bottom_blob.c;
    const int outch = top_blob.c;

    // output extent rounded up to a multiple of the 4x4 tile size
    const int tiled_outw = (top_blob.w + 3) / 4 * 4;
    const int tiled_outh = (top_blob.h + 3) / 4 * 4;

    // pad to 4n+2, winograd F(4,3)
    const int padded_w = tiled_outw + 2;
    const int padded_h = tiled_outh + 2;

    Mat bottom_blob_bordered = bottom_blob;
    {
        // the padded input is a temporary, so it is taken from the workspace allocator
        Option opt_b = opt;
        opt_b.blob_allocator = opt.workspace_allocator;
        copy_make_border(bottom_blob, bottom_blob_bordered, 0, padded_h - bottom_blob.h, 0, padded_w - bottom_blob.w, 0, 0.f, opt_b);
    }

    // ---- input transform: 36 coefficients per tile and input channel
    Mat bottom_blob_tm;
    {
        const int tiles_x = tiled_outw / 4;
        const int tiles_y = tiled_outh / 4;
        const int tiles = tiles_x * tiles_y;

        bottom_blob_tm.create(tiles, 36, inch, 4u, opt.workspace_allocator);
        conv3x3s1_winograd43_transform_input_lsx(bottom_blob_bordered, bottom_blob_tm, opt);
    }
    // the padded input is no longer needed
    bottom_blob_bordered = Mat();

    // ---- multiply with the transformed kernel
    Mat top_blob_tm;
    convolution_winograd_dot_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);

    // ---- output transform: write straight into top_blob when no cropping is
    //      required, otherwise into a tile-aligned workspace blob
    Mat top_blob_bordered;
    if (tiled_outw != top_blob.w || tiled_outh != top_blob.h)
    {
        top_blob_bordered.create(tiled_outw, tiled_outh, outch, 4u, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
    }
    conv3x3s1_winograd43_transform_output_lsx(top_blob_tm, top_blob_bordered, bias, opt);

    // ---- drop the bottom/right padding rows and columns
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}


================================================
FILE: src/layer/loongarch/convolution_3x3_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// Pre-computes the int8 winograd kernel transform (6x6 = 36 coefficients per
// kernel) for a 3x3 convolution and packs the result for the dot kernel.
//
// kernel            - source weights, read as signed char laid out [outch][inch][9]
// kernel_tm_packed  - destination; created here, holds 16-bit coefficients
// inch/outch        - number of input / output channels
// opt               - only opt.num_threads is used (for the OpenMP loop)
//
// Step 1 applies the integer matrix ktm on both sides of every 3x3 kernel and
// stores the 36 results as shorts. Step 2 interleaves them by output-channel
// group (4 with LSX, 2 without, then singles) and, with LSX, by blocks of
// 4 input channels.
static void conv3x3s1_winograd43_transform_kernel_int8_lsx(const Mat& kernel, Mat& kernel_tm_packed, int inch, int outch, const Option& opt)
{
    // winograd43 transform kernel
    // intermediate result: 36 shorts (elemsize 2) per (outch, inch) pair
    Mat kernel_tm(6 * 6, inch, outch, (size_t)2u);

    // integer transform matrix applied to the rows and then the columns of the kernel
    const short ktm[6][3] = {
        {6, 0, 0},
        {-4, -4, -4},
        {-4, 4, -4},
        {1, 2, 4},
        {1, -2, 4},
        {0, 0, 6}
    };

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            // 9 source weights of output channel p / input channel q
            const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
            short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);

            // transform kernel
            // k0/k1/k2 are the three rows of the 3x3 kernel
            const signed char* k0 = kernel0;
            const signed char* k1 = kernel0 + 3;
            const signed char* k2 = kernel0 + 6;

            // h
            // tmp[i][r] = dot(row r of the kernel, row i of ktm)
            short tmp[6][3];
            for (int i = 0; i < 6; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            // kernel_tm0[j * 6 + i] = dot(tmp[j], row i of ktm)
            for (int j = 0; j < 6; j++)
            {
                short* tmpp = &tmp[j][0];

                for (int i = 0; i < 6; i++)
                {
                    kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave
    // src = 36-inch-outch
    // dst = 2b-inch-36-outch/2b
    // Allocate the packed blob. The element size/packing depends on whether a
    // full output-channel group exists and (with LSX) whether input channels
    // can be taken four at a time.
    // NOTE: the `else` after the #endif pairs with whichever `if (outch >= N)`
    // the preprocessor keeps.
#if __loongarch_sx
    if (outch >= 4)
    {
        if (inch >= 4)
            kernel_tm_packed.create(inch / 4 + inch % 4, 36, outch / 4 + outch % 4, (size_t)2u * 16, 16);
        else
            kernel_tm_packed.create(inch, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4);
    }
#else  // __loongarch_sx
    if (outch >= 2)
    {
        kernel_tm_packed.create(inch, 36, outch / 2 + outch % 2, (size_t)2u * 2, 2);
    }
#endif // __loongarch_sx
    else
    {
#if __loongarch_sx
        if (inch >= 4)
            kernel_tm_packed.create(inch / 4 + inch % 4, 36, outch, (size_t)2u * 4, 4);
        else
#endif // __loongarch_sx
        {
            kernel_tm_packed.create(inch, 36, outch, (size_t)2u, 1);
        }
    }

    int p = 0;
#if __loongarch_sx
    // groups of 4 output channels
    for (; p + 3 < outch; p += 4)
    {
        const Mat k0 = kernel_tm.channel(p);
        const Mat k1 = kernel_tm.channel(p + 1);
        const Mat k2 = kernel_tm.channel(p + 2);
        const Mat k3 = kernel_tm.channel(p + 3);

        Mat g0 = kernel_tm_packed.channel(p / 4);

        for (int k = 0; k < 36; k++)
        {
            short* g00 = g0.row<short>(k);

            int q = 0;
            // 4 input channels x 4 output channels = 16 shorts per step,
            // stored output-channel-major (k0's four inputs, then k1's, ...)
            for (; q + 3 < inch; q += 4)
            {
                g00[0] = k0.row<const short>(q)[k];
                g00[1] = k0.row<const short>(q + 1)[k];
                g00[2] = k0.row<const short>(q + 2)[k];
                g00[3] = k0.row<const short>(q + 3)[k];
                g00[4] = k1.row<const short>(q)[k];
                g00[5] = k1.row<const short>(q + 1)[k];
                g00[6] = k1.row<const short>(q + 2)[k];
                g00[7] = k1.row<const short>(q + 3)[k];
                g00[8] = k2.row<const short>(q)[k];
                g00[9] = k2.row<const short>(q + 1)[k];
                g00[10] = k2.row<const short>(q + 2)[k];
                g00[11] = k2.row<const short>(q + 3)[k];
                g00[12] = k3.row<const short>(q)[k];
                g00[13] = k3.row<const short>(q + 1)[k];
                g00[14] = k3.row<const short>(q + 2)[k];
                g00[15] = k3.row<const short>(q + 3)[k];
                g00 += 16;
            }
            // leftover input channels: one short per output channel
            for (; q < inch; q++)
            {
                g00[0] = k0.row<const short>(q)[k];
                g00[1] = k1.row<const short>(q)[k];
                g00[2] = k2.row<const short>(q)[k];
                g00[3] = k3.row<const short>(q)[k];
                g00 += 4;
            }
        }
    }
#else  // __loongarch_sx
    // groups of 2 output channels
    for (; p + 1 < outch; p += 2)
    {
        const Mat k0 = kernel_tm.channel(p);
        const Mat k1 = kernel_tm.channel(p + 1);

        Mat g0 = kernel_tm_packed.channel(p / 2);

        for (int k = 0; k < 36; k++)
        {
            short* g00 = g0.row<short>(k);

            int q = 0;
            for (; q < inch; q++)
            {
                g00[0] = k0.row<const short>(q)[k];
                g00[1] = k1.row<const short>(q)[k];
                g00 += 2;
            }
        }
    }
#endif // __loongarch_sx
    // leftover output channels, one destination channel each
    for (; p < outch; p++)
    {
        const Mat k0 = kernel_tm.channel(p);

        // index = number of full groups already emitted + position among the leftovers
#if __loongarch_sx
        Mat g0 = kernel_tm_packed.channel(p / 4 + p % 4);
#else
        Mat g0 = kernel_tm_packed.channel(p / 2 + p % 2);
#endif

        for (int k = 0; k < 36; k++)
        {
            short* g00 = g0.row<short>(k);

            int q = 0;
#if __loongarch_sx
            // input channels four at a time
            for (; q + 3 < inch; q += 4)
            {
                g00[0] = k0.row<const short>(q)[k];
                g00[1] = k0.row<const short>(q + 1)[k];
                g00[2] = k0.row<const short>(q + 2)[k];
                g00[3] = k0.row<const short>(q + 3)[k];
                g00 += 4;
            }
#endif // __loongarch_sx
            // remaining input channels one at a time
            for (; q < inch; q++)
            {
                g00[0] = k0.row<const short>(q)[k];
                g00 += 1;
            }
        }
    }
}

static void conv3x3s1_winograd43_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;
    //     size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // pad to 4n+2
    Mat bottom_blob_bordered = bottom_blob;

    outw = (outw + 3) / 4 * 4;
    outh = (outh + 3) / 4 * 4;

    w = outw + 2;
    h = outh + 2;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

    // BEGIN transform input
    Mat bottom_blob_tm;
    {
        int w_tiles = outw / 4;
        int h_tiles = outh / 4;
        const int tiles = w_tiles * h_tiles;

        bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator);
        conv3x3s1_winograd43_transform_input_int8_lsx(bottom_blob_bordered, bottom_blob_tm, opt);
    }
    bottom_blob_bordered = Mat();
    // END transform input

    // BEGIN dot
    Mat top_blob_tm;
    convolution_winograd_dot_int8_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
    // END dot

    // BEGIN transform output
    Mat top_blob_bordered;
    if (outw == top_blob.w && outh == top_blob.h)
    {
        top_blob_bordered = top_blob;
    }
    else
    {
        top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator);
    }
    {
        conv3x3s1_winograd43_transform_output_int8_lsx(top_blob_tm, top_blob_bordered, opt);
    }
    // END transform output

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}


================================================
FILE: src/layer/loongarch/convolution_3x3_pack1to4.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// Direct 3x3, stride-1 convolution using LoongArch LSX vectors.
//
// Layout as used by this function:
//   - bottom_blob: read one float per input pixel (pointers advance by 1 per
//     output column).
//   - top_blob:    written four floats per output pixel (pointers advance by 4
//     per output column), i.e. one 128-bit vector per pixel.
//   - kernel:      for output channel p, kernel.channel(p) holds, per input
//     channel, 9 consecutive 4-float vectors in row-major tap order
//     (k00, k01, k02, k10, ... k22); the pointer advances by 9 * 4 floats per
//     input channel.
//   - _bias:       optional; when non-null, 4 floats are loaded at bias + p * 4
//     for output channel p.
//
// Each output channel is first filled with its bias (or zero) and then every
// input channel accumulates into it in place.
//
// NOTE(review): after each output row the input row pointers skip 2 more
// floats, so the code assumes each input row is outw + 2 floats wide with no
// extra row stride - confirm against the caller.
// NOTE(review): every input access is a full 4-float vector load even when
// fewer lanes are consumed (see per-loop notes), so loads can reach past the
// last element actually needed - presumably covered by Mat allocation padding;
// confirm.
static void conv3x3s1_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float* bias = _bias;

    // One output channel per parallel iteration.
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        // Seed the accumulator plane with the bias vector, or zeros if no bias.
        __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0);
        out0.fill(_bias0);

        const float* k0 = kernel.channel(p);

        int q = 0;
        for (; q < inch; q++)
        {
            // Restart at the top-left of the output plane for every input channel.
            float* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            // Three consecutive input rows covering the 3x3 window.
            const float* r0 = img0.row(0);
            const float* r1 = img0.row(1);
            const float* r2 = img0.row(2);

            // The nine kernel taps for this (p, q) pair, 4 floats each.
            __m128 _k00 = (__m128)__lsx_vld(k0, 0);
            __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0);
            __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
            __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
            __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
            __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0);
            __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0);
            __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0);
            __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                // 8 output pixels per iteration. Uses input columns 0..9 of each
                // row; three vector loads fetch columns 0..11.
                for (; j + 7 < outw; j += 8)
                {
                    // Current partial sums for the 8 output pixels.
                    __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0);
                    __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0);
                    __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0);
                    __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0);
                    __m128 _sum4 = (__m128)__lsx_vld(outptr0 + 4 * 4, 0);
                    __m128 _sum5 = (__m128)__lsx_vld(outptr0 + 4 * 5, 0);
                    __m128 _sum6 = (__m128)__lsx_vld(outptr0 + 4 * 6, 0);
                    __m128 _sum7 = (__m128)__lsx_vld(outptr0 + 4 * 7, 0);

                    __m128i _r0 = __lsx_vld(r0, 0);
                    __m128i _r0n = __lsx_vld(r0 + 4, 0);
                    __m128i _r0nn = __lsx_vld(r0 + 8, 0);

                    // Replicate each input scalar across a vector so it can be
                    // multiplied with a 4-float kernel tap.
                    __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0);
                    __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1);
                    __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2);
                    __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3);
                    __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0);
                    __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1);
                    __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2);
                    __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3);
                    __m128 _r08 = (__m128)__lsx_vreplvei_w(_r0nn, 0);
                    __m128 _r09 = (__m128)__lsx_vreplvei_w(_r0nn, 1);

                    // Kernel row 0: output pixel n accumulates input columns
                    // n, n+1, n+2 with taps k00, k01, k02.
                    _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k00, _r01, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k00, _r02, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k00, _r03, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k00, _r04, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k00, _r05, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k00, _r06, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k00, _r07, _sum7);
                    _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k01, _r02, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k01, _r03, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k01, _r04, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k01, _r05, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k01, _r06, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k01, _r07, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k01, _r08, _sum7);
                    _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k02, _r03, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k02, _r04, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k02, _r05, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k02, _r06, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k02, _r07, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k02, _r08, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k02, _r09, _sum7);

                    __m128i _r1 = __lsx_vld(r1, 0);
                    __m128i _r1n = __lsx_vld(r1 + 4, 0);
                    __m128i _r1nn = __lsx_vld(r1 + 8, 0);

                    __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0);
                    __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1);
                    __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2);
                    __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3);
                    __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0);
                    __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1);
                    __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2);
                    __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3);
                    __m128 _r18 = (__m128)__lsx_vreplvei_w(_r1nn, 0);
                    __m128 _r19 = (__m128)__lsx_vreplvei_w(_r1nn, 1);

                    // Kernel row 1 (taps k10, k11, k12) against input row r1.
                    _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k10, _r11, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k10, _r12, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k10, _r13, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k10, _r14, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k10, _r15, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k10, _r16, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k10, _r17, _sum7);
                    _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k11, _r12, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k11, _r13, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k11, _r14, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k11, _r15, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k11, _r16, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k11, _r17, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k11, _r18, _sum7);
                    _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k12, _r13, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k12, _r14, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k12, _r15, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k12, _r16, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k12, _r17, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k12, _r18, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k12, _r19, _sum7);

                    __m128i _r2 = __lsx_vld(r2, 0);
                    __m128i _r2n = __lsx_vld(r2 + 4, 0);
                    __m128i _r2nn = __lsx_vld(r2 + 8, 0);

                    __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0);
                    __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1);
                    __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2);
                    __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3);
                    __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0);
                    __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1);
                    __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2);
                    __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3);
                    __m128 _r28 = (__m128)__lsx_vreplvei_w(_r2nn, 0);
                    __m128 _r29 = (__m128)__lsx_vreplvei_w(_r2nn, 1);

                    // Kernel row 2 (taps k20, k21, k22) against input row r2.
                    _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k20, _r21, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k20, _r22, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k20, _r23, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k20, _r24, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k20, _r25, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k20, _r26, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k20, _r27, _sum7);
                    _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k21, _r22, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k21, _r23, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k21, _r24, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k21, _r25, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k21, _r26, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k21, _r27, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k21, _r28, _sum7);
                    _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k22, _r23, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k22, _r24, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k22, _r25, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k22, _r26, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k22, _r27, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k22, _r28, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k22, _r29, _sum7);

                    // Write the updated partial sums back in place.
                    __lsx_vst(_sum0, outptr0, 0);
                    __lsx_vst(_sum1, outptr0 + 4, 0);
                    __lsx_vst(_sum2, outptr0 + 4 * 2, 0);
                    __lsx_vst(_sum3, outptr0 + 4 * 3, 0);
                    __lsx_vst(_sum4, outptr0 + 4 * 4, 0);
                    __lsx_vst(_sum5, outptr0 + 4 * 5, 0);
                    __lsx_vst(_sum6, outptr0 + 4 * 6, 0);
                    __lsx_vst(_sum7, outptr0 + 4 * 7, 0);

                    outptr0 += 4 * 8;

                    // Stride 1: 8 output pixels consume 8 input columns.
                    r0 += 8;
                    r1 += 8;
                    r2 += 8;
                }
                // 4 output pixels per iteration. Uses input columns 0..5; two
                // vector loads fetch columns 0..7.
                for (; j + 3 < outw; j += 4)
                {
                    __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0);
                    __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0);
                    __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0);
                    __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0);

                    __m128i _r0 = __lsx_vld(r0, 0);
                    __m128i _r0n = __lsx_vld(r0 + 4, 0);

                    __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0);
                    __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1);
                    __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2);
                    __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3);
                    __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0);
                    __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1);

                    // Kernel row 0.
                    _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k00, _r01, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k00, _r02, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k00, _r03, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k01, _r02, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k01, _r03, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k01, _r04, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k02, _r03, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k02, _r04, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k02, _r05, _sum3);

                    __m128i _r1 = __lsx_vld(r1, 0);
                    __m128i _r1n = __lsx_vld(r1 + 4, 0);

                    __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0);
                    __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1);
                    __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2);
                    __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3);
                    __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0);
                    __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1);

                    // Kernel row 1.
                    _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k10, _r11, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k10, _r12, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k10, _r13, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k11, _r12, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k11, _r13, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k11, _r14, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k12, _r13, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k12, _r14, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k12, _r15, _sum3);

                    __m128i _r2 = __lsx_vld(r2, 0);
                    __m128i _r2n = __lsx_vld(r2 + 4, 0);

                    __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0);
                    __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1);
                    __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2);
                    __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3);
                    __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0);
                    __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1);

                    // Kernel row 2.
                    _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k20, _r21, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k20, _r22, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k20, _r23, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k21, _r22, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k21, _r23, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k21, _r24, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k22, _r23, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k22, _r24, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k22, _r25, _sum3);

                    __lsx_vst(_sum0, outptr0, 0);
                    __lsx_vst(_sum1, outptr0 + 4, 0);
                    __lsx_vst(_sum2, outptr0 + 4 * 2, 0);
                    __lsx_vst(_sum3, outptr0 + 4 * 3, 0);

                    outptr0 += 4 * 4;

                    r0 += 4;
                    r1 += 4;
                    r2 += 4;
                }
                // 2 output pixels per iteration. Uses input columns 0..3, which
                // is exactly one vector load per row.
                for (; j + 1 < outw; j += 2)
                {
                    __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0);
                    __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0);

                    __m128i _r0 = __lsx_vld(r0, 0);
                    __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0);
                    __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1);
                    __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2);
                    __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3);

                    // Kernel row 0.
                    _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k00, _r01, _sum1);
                    _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k01, _r02, _sum1);
                    _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k02, _r03, _sum1);

                    __m128i _r1 = __lsx_vld(r1, 0);
                    __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0);
                    __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1);
                    __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2);
                    __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3);

                    // Kernel row 1.
                    _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k10, _r11, _sum1);
                    _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k11, _r12, _sum1);
                    _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k12, _r13, _sum1);

                    __m128i _r2 = __lsx_vld(r2, 0);
                    __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0);
                    __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1);
                    __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2);
                    __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3);

                    // Kernel row 2.
                    _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k20, _r21, _sum1);
                    _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k21, _r22, _sum1);
                    _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k22, _r23, _sum1);

                    __lsx_vst(_sum0, outptr0, 0);
                    __lsx_vst(_sum1, outptr0 + 4, 0);

                    outptr0 += 4 * 2;

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                }
                // Remaining single output pixels. Only lanes 0..2 of each loaded
                // vector are used; lane 3 is fetched but ignored.
                for (; j < outw; j++)
                {
                    __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0);

                    __m128i _r0 = __lsx_vld(r0, 0);
                    __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0);
                    __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1);
                    __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2);

                    _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0);
                    _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0);
                    _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0);

                    __m128i _r1 = __lsx_vld(r1, 0);
                    __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0);
                    __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1);
                    __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2);

                    _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0);
                    _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0);
                    _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0);

                    __m128i _r2 = __lsx_vld(r2, 0);
                    __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0);
                    __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1);
                    __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2);

                    _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0);
                    _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0);
                    _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0);

                    __lsx_vst(_sum0, outptr0, 0);

                    outptr0 += 4;

                    r0 += 1;
                    r1 += 1;
                    r2 += 1;
                }

                // Skip the 2 trailing input columns to land on the next input row.
                r0 += 2;
                r1 += 2;
                r2 += 2;
            }

            // Advance to the 9 taps of the next input channel.
            k0 += 9 * 4;
        }
    }
}

static void conv3x3s2_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int tailstep = w - 2 * outw + w;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0);
        out0.fill(_bias0);

        const float* k0 = kernel.channel(p);

        int q = 0;
        for (; q < inch; q++)
        {
            float* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            const float* r0 = img0.row(0);
            const float* r1 = img0.row(1);
            const float* r2 = img0.row(2);

            __m128 _k00 = (__m128)__lsx_vld(k0, 0);
            __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0);
            __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
            __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
            __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
            __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0);
            __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0);
            __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0);
            __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 7 < outw; j += 8)
                {
                    __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0);
                    __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0);
                    __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0);
                    __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0);
                    __m128 _sum4 = (__m128)__lsx_vld(outptr0 + 4 * 4, 0);
                    __m128 _sum5 = (__m128)__lsx_vld(outptr0 + 4 * 5, 0);
                    __m128 _sum6 = (__m128)__lsx_vld(outptr0 + 4 * 6, 0);
                    __m128 _sum7 = (__m128)__lsx_vld(outptr0 + 4 * 7, 0);

                    __m128i _r0 = __lsx_vld(r0, 0);
                    __m128i _r0n = __lsx_vld(r0 + 4, 0);
                    __m128i _r0nn = __lsx_vld(r0 + 8, 0);
                    __m128i _r0nnn = __lsx_vld(r0 + 12, 0);

                    __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0);
                    __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1);
                    __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2);
                    __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3);
                    __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0);
                    __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1);
                    __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2);
                    __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3);
                    __m128 _r08 = (__m128)__lsx_vreplvei_w(_r0nn, 0);
                    __m128 _r09 = (__m128)__lsx_vreplvei_w(_r0nn, 1);
                    __m128 _r0a = (__m128)__lsx_vreplvei_w(_r0nn, 2);
                    __m128 _r0b = (__m128)__lsx_vreplvei_w(_r0nn, 3);
                    __m128 _r0c = (__m128)__lsx_vreplvei_w(_r0nnn, 0);
                    __m128 _r0d = (__m128)__lsx_vreplvei_w(_r0nnn, 1);
                    __m128 _r0e = (__m128)__lsx_vreplvei_w(_r0nnn, 2);
                    __m128 _r0f = (__m128)__lsx_vreplvei_w(_r0nnn, 3);
                    __m128 _r0g = __lsx_vreplfr2vr_s(r0[16]);

                    _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k00, _r04, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k00, _r06, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k00, _r08, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k00, _r0a, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k00, _r0c, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k00, _r0e, _sum7);
                    _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k01, _r05, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k01, _r07, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k01, _r09, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k01, _r0b, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k01, _r0d, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k01, _r0f, _sum7);
                    _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k02, _r06, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k02, _r08, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k02, _r0a, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k02, _r0c, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k02, _r0e, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k02, _r0g, _sum7);

                    __m128i _r1 = __lsx_vld(r1, 0);
                    __m128i _r1n = __lsx_vld(r1 + 4, 0);
                    __m128i _r1nn = __lsx_vld(r1 + 8, 0);
                    __m128i _r1nnn = __lsx_vld(r1 + 12, 0);

                    __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0);
                    __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1);
                    __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2);
                    __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3);
                    __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0);
                    __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1);
                    __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2);
                    __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3);
                    __m128 _r18 = (__m128)__lsx_vreplvei_w(_r1nn, 0);
                    __m128 _r19 = (__m128)__lsx_vreplvei_w(_r1nn, 1);
                    __m128 _r1a = (__m128)__lsx_vreplvei_w(_r1nn, 2);
                    __m128 _r1b = (__m128)__lsx_vreplvei_w(_r1nn, 3);
                    __m128 _r1c = (__m128)__lsx_vreplvei_w(_r1nnn, 0);
                    __m128 _r1d = (__m128)__lsx_vreplvei_w(_r1nnn, 1);
                    __m128 _r1e = (__m128)__lsx_vreplvei_w(_r1nnn, 2);
                    __m128 _r1f = (__m128)__lsx_vreplvei_w(_r1nnn, 3);
                    __m128 _r1g = __lsx_vreplfr2vr_s(r1[16]);

                    _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k10, _r14, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k10, _r16, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k10, _r18, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k10, _r1a, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k10, _r1c, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k10, _r1e, _sum7);
                    _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k11, _r15, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k11, _r17, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k11, _r19, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k11, _r1b, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k11, _r1d, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k11, _r1f, _sum7);
                    _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k12, _r16, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k12, _r18, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k12, _r1a, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k12, _r1c, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k12, _r1e, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k12, _r1g, _sum7);

                    __m128i _r2 = __lsx_vld(r2, 0);
                    __m128i _r2n = __lsx_vld(r2 + 4, 0);
                    __m128i _r2nn = __lsx_vld(r2 + 8, 0);
                    __m128i _r2nnn = __lsx_vld(r2 + 12, 0);

                    __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0);
                    __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1);
                    __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2);
                    __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3);
                    __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0);
                    __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1);
                    __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2);
                    __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3);
                    __m128 _r28 = (__m128)__lsx_vreplvei_w(_r2nn, 0);
                    __m128 _r29 = (__m128)__lsx_vreplvei_w(_r2nn, 1);
                    __m128 _r2a = (__m128)__lsx_vreplvei_w(_r2nn, 2);
                    __m128 _r2b = (__m128)__lsx_vreplvei_w(_r2nn, 3);
                    __m128 _r2c = (__m128)__lsx_vreplvei_w(_r2nnn, 0);
                    __m128 _r2d = (__m128)__lsx_vreplvei_w(_r2nnn, 1);
                    __m128 _r2e = (__m128)__lsx_vreplvei_w(_r2nnn, 2);
                    __m128 _r2f = (__m128)__lsx_vreplvei_w(_r2nnn, 3);
                    __m128 _r2g = __lsx_vreplfr2vr_s(r2[16]);

                    _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k20, _r24, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k20, _r26, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k20, _r28, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k20, _r2a, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k20, _r2c, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k20, _r2e, _sum7);
                    _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k21, _r25, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k21, _r27, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k21, _r29, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k21, _r2b, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k21, _r2d, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k21, _r2f, _sum7);
                    _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k22, _r26, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k22, _r28, _sum3);
                    _sum4 = __lsx_vfmadd_s(_k22, _r2a, _sum4);
                    _sum5 = __lsx_vfmadd_s(_k22, _r2c, _sum5);
                    _sum6 = __lsx_vfmadd_s(_k22, _r2e, _sum6);
                    _sum7 = __lsx_vfmadd_s(_k22, _r2g, _sum7);

                    __lsx_vst(_sum0, outptr0, 0);
                    __lsx_vst(_sum1, outptr0 + 4, 0);
                    __lsx_vst(_sum2, outptr0 + 4 * 2, 0);
                    __lsx_vst(_sum3, outptr0 + 4 * 3, 0);
                    __lsx_vst(_sum4, outptr0 + 4 * 4, 0);
                    __lsx_vst(_sum5, outptr0 + 4 * 5, 0);
                    __lsx_vst(_sum6, outptr0 + 4 * 6, 0);
                    __lsx_vst(_sum7, outptr0 + 4 * 7, 0);

                    outptr0 += 4 * 8;

                    r0 += 16;
                    r1 += 16;
                    r2 += 16;
                }
                for (; j + 3 < outw; j += 4)
                {
                    __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0);
                    __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0);
                    __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0);
                    __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0);

                    __m128i _r0 = __lsx_vld(r0, 0);
                    __m128i _r0n = __lsx_vld(r0 + 4, 0);

                    __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0);
                    __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1);
                    __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2);
                    __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3);
                    __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0);
                    __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1);
                    __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2);
                    __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3);
                    __m128 _r08 = __lsx_vreplfr2vr_s(r0[8]);

                    _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k00, _r04, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k00, _r06, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k01, _r05, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k01, _r07, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k02, _r06, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k02, _r08, _sum3);

                    __m128i _r1 = __lsx_vld(r1, 0);
                    __m128i _r1n = __lsx_vld(r1 + 4, 0);

                    __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0);
                    __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1);
                    __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2);
                    __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3);
                    __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0);
                    __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1);
                    __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2);
                    __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3);
                    __m128 _r18 = __lsx_vreplfr2vr_s(r1[8]);

                    _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k10, _r14, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k10, _r16, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k11, _r15, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k11, _r17, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k12, _r16, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k12, _r18, _sum3);

                    __m128i _r2 = __lsx_vld(r2, 0);
                    __m128i _r2n = __lsx_vld(r2 + 4, 0);

                    __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0);
                    __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1);
                    __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2);
                    __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3);
                    __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0);
                    __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1);
                    __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2);
                    __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3);
                    __m128 _r28 = __lsx_vreplfr2vr_s(r2[8]);

                    _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k20, _r24, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k20, _r26, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k21, _r25, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k21, _r27, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k22, _r26, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k22, _r28, _sum3);

                    __lsx_vst(_sum0, outptr0, 0);
                    __lsx_vst(_sum1, outptr0 + 4, 0);
                    __lsx_vst(_sum2, outptr0 + 4 * 2, 0);
                    __lsx_vst(_sum3, outptr0 + 4 * 3, 0);

                    outptr0 += 4 * 4;

                    r0 += 8;
                    r1 += 8;
                    r2 += 8;
                }
                for (; j + 1 < outw; j += 2)
                {
                    __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0);
                    __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0);

                    __m128i _r0 = __lsx_vld(r0, 0);
                    __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0);
                    __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1);
                    __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2);
                    __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3);
                    __m128 _r04 = __lsx_vreplfr2vr_s(r0[4]);

                    _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1);
                    _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1);
                    _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1);

                    __m128i _r1 = __lsx_vld(r1, 0);
                    __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0);
                    __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1);
                    __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2);
                    __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3);
                    __m128 _r14 = __lsx_vreplfr2vr_s(r1[4]);

                    _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1);
                    _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1);
                    _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1);

                    __m128i _r2 = __lsx_vld(r2, 0);
                    __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0);
                    __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1);
                    __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2);
                    __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3);
                    __m128 _r24 = __lsx_vreplfr2vr_s(r2[4]);

                    _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1);
                    _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1);
                    _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1);

                    __lsx_vst(_sum0, outptr0, 0);
                    __lsx_vst(_sum1, outptr0 + 4, 0);

                    outptr0 += 4 * 2;

                    r0 += 4;
                    r1 += 4;
                    r2 += 4;
                }
                for (; j < outw; j++)
                {
                    __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0);

                    __m128i _r0 = __lsx_vld(r0, 0);
                    __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0);
                    __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1);
                    __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2);

                    _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0);
                    _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0);
                    _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0);

                    __m128i _r1 = __lsx_vld(r1, 0);
                    __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0);
                    __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1);
                    __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2);

                    _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0);
                    _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0);
                    _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0);

                    __m128i _r2 = __lsx_vld(r2, 0);
                    __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0);
                    __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1);
                    __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2);

                    _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0);
                    _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0);
                    _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0);

                    __lsx_vst(_sum0, outptr0, 0);

                    outptr0 += 4;

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                }

                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            k0 += 9 * 4;
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_3x3_pack4.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_winograd63_transform_kernel_pack4_lsx(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt)
{
    // winograd63 transform kernel
    Mat kernel_tm;
    kernel_tm.create(8 * 8, inch, outch);

    const float ktm[8][3] = {
        {1.0f, 0.0f, 0.0f},
        {-2.0f / 9, -2.0f / 9, -2.0f / 9},
        {-2.0f / 9, 2.0f / 9, -2.0f / 9},
        {1.0f / 90, 1.0f / 45, 2.0f / 45},
        {1.0f / 90, -1.0f / 45, 2.0f / 45},
        {1.0f / 45, 1.0f / 90, 1.0f / 180},
        {1.0f / 45, -1.0f / 90, 1.0f / 180},
        {0.0f, 0.0f, 1.0f}
    };

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
            float* kernel_tm0 = kernel_tm.channel(p).row(q);

            // transform kernel, transposed
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;

            // h
            float tmp[8][3];
            for (int i = 0; i < 8; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // v
            for (int j = 0; j < 8; j++)
            {
                float* tmpp = &tmp[j][0];

                for (int i = 0; i < 8; i++)
                {
                    kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave
    // src = 64-inch-outch
    // dst = pb-pa-inch/pa-64-outch/pb
    kernel_tm_pack4.create(inch / 4, 64, outch / 4, (size_t)4u * 4 * 4, 4 * 4);

    for (int q = 0; q + 3 < outch; q += 4)
    {
        Mat g0 = kernel_tm_pack4.channel(q / 4);

        for (int k = 0; k < 64; k++)
        {
            float* g00 = g0.row(k);

            for (int p = 0; p + 3 < inch; p += 4)
            {
                for (int i = 0; i < 4; i++)
                {
                    for (int j = 0; j < 4; j++)
                    {
                        const float* k00 = kernel_tm.channel(q + j).row(p + i);
                        g00[0] = k00[k];
                        g00++;
                    }
                }
            }
        }
    }
}

static void conv3x3s1_winograd63_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // pad to 6n+2
    Mat bottom_blob_bordered = bottom_blob;

    outw = (outw + 5) / 6 * 6;
    outh = (outh + 5) / 6 * 6;

    w = outw + 2;
    h = outh + 2;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

    // BEGIN transform input
    Mat bottom_blob_tm;
    {
        int w_tiles = outw / 6;
        int h_tiles = outh / 6;
        const int tiles = w_tiles * h_tiles;

        bottom_blob_tm.create(tiles, 64, inch, elemsize, elempack, opt.workspace_allocator);
        conv3x3s1_winograd63_transform_input_pack4_lsx(bottom_blob_bordered, bottom_blob_tm, opt);
    }
    bottom_blob_bordered = Mat();
    // END transform input

    // BEGIN dot
    Mat top_blob_tm;
    convolution_winograd_dot_pack4_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
    // END dot

    // BEGIN transform output
    Mat top_blob_bordered;
    if (outw == top_blob.w && outh == top_blob.h)
    {
        top_blob_bordered = top_blob;
    }
    else
    {
        top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator);
    }
    {
        conv3x3s1_winograd63_transform_output_pack4_lsx(top_blob_tm, top_blob_bordered, bias, opt);
    }
    // END transform output

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}

static void conv3x3s1_winograd43_transform_kernel_pack4_lsx(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt)
{
    // winograd43 transform kernel
    Mat kernel_tm(6 * 6, inch, outch);

    const float ktm[6][3] = {
        {1.0f / 4, 0.0f, 0.0f},
        {-1.0f / 6, -1.0f / 6, -1.0f / 6},
        {-1.0f / 6, 1.0f / 6, -1.0f / 6},
        {1.0f / 24, 1.0f / 12, 1.0f / 6},
        {1.0f / 24, -1.0f / 12, 1.0f / 6},
        {0.0f, 0.0f, 1.0f}
    };

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
            float* kernel_tm0 = kernel_tm.channel(p).row(q);

            // transform kernel
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;

            // h
            float tmp[6][3];
            for (int i = 0; i < 6; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            for (int j = 0; j < 6; j++)
            {
                float* tmpp = &tmp[j][0];

                for (int i = 0; i < 6; i++)
                {
                    kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave
    // src = 36-inch-outch
    // dst = pb-pa-inch/pa-36-outch/pb
    kernel_tm_pack4.create(inch / 4, 36, outch / 4, (size_t)4u * 4 * 4, 4 * 4);

    for (int q = 0; q + 3 < outch; q += 4)
    {
        Mat g0 = kernel_tm_pack4.channel(q / 4);

        for (int k = 0; k < 36; k++)
        {
            float* g00 = g0.row(k);

            for (int p = 0; p + 3 < inch; p += 4)
            {
                for (int i = 0; i < 4; i++)
                {
                    for (int j = 0; j < 4; j++)
                    {
                        const float* k00 = kernel_tm.channel(q + j).row(p + i);
                        g00[0] = k00[k];
                        g00++;
                    }
                }
            }
        }
    }
}

static void conv3x3s1_winograd43_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // pad to 4n+2
    Mat bottom_blob_bordered = bottom_blob;

    outw = (outw + 3) / 4 * 4;
    outh = (outh + 3) / 4 * 4;

    w = outw + 2;
    h = outh + 2;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

    // BEGIN transform input
    Mat bottom_blob_tm;
    {
        int w_tiles = outw / 4;
        int h_tiles = outh / 4;
        const int tiles = w_tiles * h_tiles;

        bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator);
        conv3x3s1_winograd43_transform_input_pack4_lsx(bottom_blob_bordered, bottom_blob_tm, opt);
    }
    bottom_blob_bordered = Mat();
    // END transform input

    // BEGIN dot
    Mat top_blob_tm;
    convolution_winograd_dot_pack4_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
    // END dot

    // BEGIN transform output
    Mat top_blob_bordered;
    if (outw == top_blob.w && outh == top_blob.h)
    {
        top_blob_bordered = top_blob;
    }
    else
    {
        top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator);
    }
    {
        conv3x3s1_winograd43_transform_output_pack4_lsx(top_blob_tm, top_blob_bordered, bias, opt);
    }
    // END transform output

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}

static void conv3x3s1_winograd23_transform_kernel_pack4_lsx(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt)
{
    // winograd23 transform kernel
    Mat kernel_tm(4 * 4, inch, outch);

    const float ktm[4][3] = {
        {1.0f, 0.0f, 0.0f},
        {1.0f / 2, 1.0f / 2, 1.0f / 2},
        {1.0f / 2, -1.0f / 2, 1.0f / 2},
        {0.0f, 0.0f, 1.0f}
    };

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
            float* kernel_tm0 = kernel_tm.channel(p).row(q);

            // transform kernel
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;

            // h
            float tmp[4][3];
            for (int i = 0; i < 4; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            for (int j = 0; j < 4; j++)
            {
                float* tmpp = &tmp[j][0];

                for (int i = 0; i < 4; i++)
                {
                    kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave
    // src = 16-inch-outch
    // dst = pb-pa-inch/pa-16-outch/pb
    kernel_tm_pack4.create(inch / 4, 16, outch / 4, (size_t)4u * 4 * 4, 4 * 4);

    for (int q = 0; q + 3 < outch; q += 4)
    {
        Mat g0 = kernel_tm_pack4.channel(q / 4);

        for (int k = 0; k < 16; k++)
        {
            float* g00 = g0.row(k);

            for (int p = 0; p + 3 < inch; p += 4)
            {
                for (int i = 0; i < 4; i++)
                {
                    for (int j = 0; j < 4; j++)
                    {
                        const float* k00 = kernel_tm.channel(q + j).row(p + i);
                        g00[0] = k00[k];
                        g00++;
                    }
                }
            }
        }
    }
}

static void conv3x3s1_winograd23_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // pad to 2n+2
    Mat bottom_blob_bordered = bottom_blob;

    outw = (outw + 1) / 2 * 2;
    outh = (outh + 1) / 2 * 2;

    w = outw + 2;
    h = outh + 2;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

    // BEGIN transform input
    Mat bottom_blob_tm;
    {
        int w_tiles = outw / 2;
        int h_tiles = outh / 2;
        const int tiles = w_tiles * h_tiles;

        bottom_blob_tm.create(tiles, 16, inch, elemsize, elempack, opt.workspace_allocator);
        conv3x3s1_winograd23_transform_input_pack4_lsx(bottom_blob_bordered, bottom_blob_tm, opt);
    }
    bottom_blob_bordered = Mat();
    // END transform input

    // BEGIN dot
    Mat top_blob_tm;
    convolution_winograd_dot_pack4_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
    // END dot

    // BEGIN transform output
    Mat top_blob_bordered;
    if (outw == top_blob.w && outh == top_blob.h)
    {
        top_blob_bordered = top_blob;
    }
    else
    {
        top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator);
    }
    {
        conv3x3s1_winograd23_transform_output_pack4_lsx(top_blob_tm, top_blob_bordered, bias, opt);
    }
    // END transform output

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}


================================================
FILE: src/layer/loongarch/convolution_3x3_pack8to1_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_lsx(const Mat& kernel, Mat& kernel_tm_pack8to1, int inch, int outch, const Option& opt)
{
    // winograd43 transform kernel
    Mat kernel_tm(6 * 6, inch, outch, (size_t)2u);

    const short ktm[6][3] = {
        {6, 0, 0},
        {-4, -4, -4},
        {-4, 4, -4},
        {1, 2, 4},
        {1, -2, 4},
        {0, 0, 6}
    };

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
            short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);

            // transform kernel
            const signed char* k0 = kernel0;
            const signed char* k1 = kernel0 + 3;
            const signed char* k2 = kernel0 + 6;

            // h
            short tmp[6][3];
            for (int i = 0; i < 6; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            for (int j = 0; j < 6; j++)
            {
                short* tmpp = &tmp[j][0];

                for (int i = 0; i < 6; i++)
                {
                    kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave
    // src = 36-inch-outch
    // dst = 4b-8a-inch/8a-36-outch/4b
    kernel_tm_pack8to1.create(8 * inch / 8, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4);

    int p = 0;
    for (; p + 3 < outch; p += 4)
    {
        const Mat k0 = kernel_tm.channel(p);
        const Mat k1 = kernel_tm.channel(p + 1);
        const Mat k2 = kernel_tm.channel(p + 2);
        const Mat k3 = kernel_tm.channel(p + 3);

        Mat g0 = kernel_tm_pack8to1.channel(p / 4);

        for (int k = 0; k < 36; k++)
        {
            short* g00 = g0.row<short>(k);

            for (int q = 0; q + 7 < inch; q += 8)
            {
                for (int i = 0; i < 8; i++)
                {
                    g00[0] = k0.row<const short>(q + i)[k];
                    g00[1] = k1.row<const short>(q + i)[k];
                    g00[2] = k2.row<const short>(q + i)[k];
                    g00[3] = k3.row<const short>(q + i)[k];

                    g00 += 4;
                }
            }
        }
    }
    for (; p < outch; p++)
    {
        const Mat k0 = kernel_tm.channel(p);

        Mat g0 = kernel_tm_pack8to1.channel(p / 4 + p % 4);

        for (int k = 0; k < 36; k++)
        {
            short* g00 = g0.row<short>(k);

            for (int q = 0; q + 7 < inch; q += 8)
            {
                for (int i = 0; i < 8; i++)
                {
                    g00[0] = k0.row<const short>(q + i)[k];

                    g00 += 1;
                }
            }
        }
    }
}

static void conv3x3s1_winograd43_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;
    //     size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // pad to 4n+2
    Mat bottom_blob_bordered = bottom_blob;

    outw = (outw + 3) / 4 * 4;
    outh = (outh + 3) / 4 * 4;

    w = outw + 2;
    h = outh + 2;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

    // BEGIN transform input
    Mat bottom_blob_tm;
    {
        int w_tiles = outw / 4;
        int h_tiles = outh / 4;
        const int tiles = w_tiles * h_tiles;

        bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator);
        conv3x3s1_winograd43_transform_input_pack8_int8_lsx(bottom_blob_bordered, bottom_blob_tm, opt);
    }
    bottom_blob_bordered = Mat();
    // END transform input

    // BEGIN dot
    Mat top_blob_tm;
    convolution_winograd_dot_pack8to1_int8_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
    // END dot

    // BEGIN transform output
    Mat top_blob_bordered;
    if (outw == top_blob.w && outh == top_blob.h)
    {
        top_blob_bordered = top_blob;
    }
    else
    {
        top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator);
    }
    {
        conv3x3s1_winograd43_transform_output_int8_lsx(top_blob_tm, top_blob_bordered, opt);
    }
    // END transform output

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}


================================================
FILE: src/layer/loongarch/convolution_3x3_pack8to4_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_lsx(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt)
{
    // winograd43 transform kernel
    Mat kernel_tm(6 * 6, inch, outch, (size_t)2u);

    const short ktm[6][3] = {
        {6, 0, 0},
        {-4, -4, -4},
        {-4, 4, -4},
        {1, 2, 4},
        {1, -2, 4},
        {0, 0, 6}
    };

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
            short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);

            // transform kernel
            const signed char* k0 = kernel0;
            const signed char* k1 = kernel0 + 3;
            const signed char* k2 = kernel0 + 6;

            // h
            short tmp[6][3];
            for (int i = 0; i < 6; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            for (int j = 0; j < 6; j++)
            {
                short* tmpp = &tmp[j][0];

                for (int i = 0; i < 6; i++)
                {
                    kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave
    // src = 36-inch-outch
    // dst = 4b-8a-inch/8a-36-outch/4b
    kernel_tm_pack8.create(inch / 8, 36, outch / 4, (size_t)2u * 32, 32);

    int q = 0;
    for (; q + 3 < outch; q += 4)
    {
        const Mat k0 = kernel_tm.channel(q);
        const Mat k1 = kernel_tm.channel(q + 1);
        const Mat k2 = kernel_tm.channel(q + 2);
        const Mat k3 = kernel_tm.channel(q + 3);

        Mat kernel_tm = kernel_tm_pack8.channel(q / 4);

        for (int k = 0; k < 36; k++)
        {
            short* g00 = kernel_tm.row<short>(k);

            for (int p = 0; p + 7 < inch; p += 8)
            {
                for (int i = 0; i < 8; i++)
                {
                    const short* k00 = k0.row<const short>(p + i);
                    const short* k10 = k1.row<const short>(p + i);
                    const short* k20 = k2.row<const short>(p + i);
                    const short* k30 = k3.row<const short>(p + i);

                    g00[0] = k00[k];
                    g00[1] = k10[k];
                    g00[2] = k20[k];
                    g00[3] = k30[k];

                    g00 += 4;
                }
            }
        }
    }
}

static void conv3x3s1_winograd43_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;
    //     size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // pad to 4n+2
    Mat bottom_blob_bordered = bottom_blob;

    outw = (outw + 3) / 4 * 4;
    outh = (outh + 3) / 4 * 4;

    w = outw + 2;
    h = outh + 2;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

    // BEGIN transform input
    Mat bottom_blob_tm;
    {
        int w_tiles = outw / 4;
        int h_tiles = outh / 4;
        const int tiles = w_tiles * h_tiles;

        bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator);
        conv3x3s1_winograd43_transform_input_pack8_int8_lsx(bottom_blob_bordered, bottom_blob_tm, opt);
    }
    bottom_blob_bordered = Mat();
    // END transform input

    // BEGIN dot
    Mat top_blob_tm;
    convolution_winograd_dot_pack8to4_int8_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
    // END dot

    // BEGIN transform output
    Mat top_blob_bordered;
    if (outw == top_blob.w && outh == top_blob.h)
    {
        top_blob_bordered = top_blob;
    }
    else
    {
        top_blob_bordered.create(outw, outh, outch, 4u * 4, 4, opt.workspace_allocator);
    }
    {
        conv3x3s1_winograd43_transform_output_pack4_int8_lsx(top_blob_tm, top_blob_bordered, opt);
    }
    // END transform output

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}


================================================
FILE: src/layer/loongarch/convolution_7x7_pack1to4.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// 7x7 convolution with stride 2 using LSX vectors.
//
// Input pixels are read as single floats and broadcast across a vector;
// every output pixel is one 4-float vector (outptr0 advances by 4 per pixel).
//
// bottom_blob : input; channel(q) supplies rows of floats. Rows 0..6 are read
//               up front, so it has to be at least 7 rows tall.
// top_blob    : output; its w/h/c are read here, so it must already be shaped.
//               channel(p) is first filled with the bias and then accumulated into.
// kernel      : kernel.channel(p).row(q) points at 7*7 consecutive 4-float
//               vectors, one per tap, laid out kernel-row by kernel-row.
// _bias       : 4 floats per output channel, or a null pointer for no bias.
// opt         : only num_threads is used (OpenMP split over output channels).
static void conv7x7s2_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // After one output row the input pointers have moved 2 * outw floats.
    // Adding tailstep brings the total to 2 * w, i.e. two input rows further
    // down (vertical stride 2).
    // NOTE(review): assumes consecutive input rows are exactly w floats apart - confirm against Mat::row.
    const int tailstep = w - 2 * outw + w;

    const float* bias = _bias;

    // each output channel is independent, so they are distributed over threads
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        // start every output pixel at the channel bias (all-zero bits when there is none)
        __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0);
        out0.fill(_bias0);

        // accumulate the contribution of every input channel on top of the bias
        for (int q = 0; q < inch; q++)
        {
            float* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            // the seven input rows covered by the kernel window of the current output row
            const float* r0 = img0.row(0);
            const float* r1 = img0.row(1);
            const float* r2 = img0.row(2);
            const float* r3 = img0.row(3);
            const float* r4 = img0.row(4);
            const float* r5 = img0.row(5);
            const float* r6 = img0.row(6);

            // 49 weight vectors for (output channel p, input channel q)
            const float* kptr = kernel.channel(p).row(q);

            int i = 0;

            for (; i < outh; i++)
            {
                int j = 0;
                // Four output pixels per iteration. With stride 2 and 7 taps they
                // need input columns 0..12 of every row:
                //   _sum0 <- columns 0..6, _sum1 <- 2..8, _sum2 <- 4..10, _sum3 <- 6..12
                for (; j + 3 < outw; j += 4)
                {
                    __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0);
                    __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0);
                    __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0);
                    __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0);

                    // kernel row 0 x input row r0
                    __m128 _k00 = (__m128)__lsx_vld(kptr, 0);
                    __m128 _k01 = (__m128)__lsx_vld(kptr + 4, 0);
                    __m128 _k02 = (__m128)__lsx_vld(kptr + 4 * 2, 0);
                    __m128 _k03 = (__m128)__lsx_vld(kptr + 4 * 3, 0);
                    __m128 _k04 = (__m128)__lsx_vld(kptr + 4 * 4, 0);
                    __m128 _k05 = (__m128)__lsx_vld(kptr + 4 * 5, 0);
                    __m128 _k06 = (__m128)__lsx_vld(kptr + 4 * 6, 0);

                    // step to the next kernel row (7 taps of 4 floats each)
                    kptr += 4 * 7;

                    // columns 0..11 arrive through three vector loads and are then
                    // broadcast lane by lane
                    __m128i _r0 = __lsx_vld(r0, 0);
                    __m128i _r0n = __lsx_vld(r0 + 4, 0);
                    __m128i _r0nn = __lsx_vld(r0 + 8, 0);

                    __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0);
                    __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1);
                    __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2);
                    __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3);
                    __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0);
                    __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1);
                    __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2);
                    __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3);
                    __m128 _r08 = (__m128)__lsx_vreplvei_w(_r0nn, 0);
                    __m128 _r09 = (__m128)__lsx_vreplvei_w(_r0nn, 1);
                    __m128 _r0a = (__m128)__lsx_vreplvei_w(_r0nn, 2);
                    __m128 _r0b = (__m128)__lsx_vreplvei_w(_r0nn, 3);
                    // column 12 is fetched as a scalar, so no fourth vector load is issued
                    // (__lsx_vreplfr2vr_s is defined elsewhere; presumably a float broadcast)
                    __m128 _r0c = __lsx_vreplfr2vr_s(r0[12]);

                    _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k00, _r04, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k00, _r06, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k01, _r05, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k01, _r07, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k02, _r06, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k02, _r08, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k03, _r03, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k03, _r05, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k03, _r07, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k03, _r09, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k04, _r04, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k04, _r06, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k04, _r08, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k04, _r0a, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k05, _r05, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k05, _r07, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k05, _r09, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k05, _r0b, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k06, _r06, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k06, _r08, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k06, _r0a, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k06, _r0c, _sum3);

                    // kernel row 1 x input row r1
                    __m128 _k10 = (__m128)__lsx_vld(kptr, 0);
                    __m128 _k11 = (__m128)__lsx_vld(kptr + 4, 0);
                    __m128 _k12 = (__m128)__lsx_vld(kptr + 4 * 2, 0);
                    __m128 _k13 = (__m128)__lsx_vld(kptr + 4 * 3, 0);
                    __m128 _k14 = (__m128)__lsx_vld(kptr + 4 * 4, 0);
                    __m128 _k15 = (__m128)__lsx_vld(kptr + 4 * 5, 0);
                    __m128 _k16 = (__m128)__lsx_vld(kptr + 4 * 6, 0);

                    kptr += 4 * 7;

                    __m128i _r1 = __lsx_vld(r1, 0);
                    __m128i _r1n = __lsx_vld(r1 + 4, 0);
                    __m128i _r1nn = __lsx_vld(r1 + 8, 0);

                    __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0);
                    __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1);
                    __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2);
                    __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3);
                    __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0);
                    __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1);
                    __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2);
                    __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3);
                    __m128 _r18 = (__m128)__lsx_vreplvei_w(_r1nn, 0);
                    __m128 _r19 = (__m128)__lsx_vreplvei_w(_r1nn, 1);
                    __m128 _r1a = (__m128)__lsx_vreplvei_w(_r1nn, 2);
                    __m128 _r1b = (__m128)__lsx_vreplvei_w(_r1nn, 3);
                    __m128 _r1c = __lsx_vreplfr2vr_s(r1[12]);

                    _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k10, _r14, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k10, _r16, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k11, _r15, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k11, _r17, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k12, _r16, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k12, _r18, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k13, _r13, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k13, _r15, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k13, _r17, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k13, _r19, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k14, _r14, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k14, _r16, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k14, _r18, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k14, _r1a, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k15, _r15, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k15, _r17, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k15, _r19, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k15, _r1b, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k16, _r16, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k16, _r18, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k16, _r1a, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k16, _r1c, _sum3);

                    // kernel row 2 x input row r2
                    __m128 _k20 = (__m128)__lsx_vld(kptr, 0);
                    __m128 _k21 = (__m128)__lsx_vld(kptr + 4, 0);
                    __m128 _k22 = (__m128)__lsx_vld(kptr + 4 * 2, 0);
                    __m128 _k23 = (__m128)__lsx_vld(kptr + 4 * 3, 0);
                    __m128 _k24 = (__m128)__lsx_vld(kptr + 4 * 4, 0);
                    __m128 _k25 = (__m128)__lsx_vld(kptr + 4 * 5, 0);
                    __m128 _k26 = (__m128)__lsx_vld(kptr + 4 * 6, 0);

                    kptr += 4 * 7;

                    __m128i _r2 = __lsx_vld(r2, 0);
                    __m128i _r2n = __lsx_vld(r2 + 4, 0);
                    __m128i _r2nn = __lsx_vld(r2 + 8, 0);

                    __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0);
                    __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1);
                    __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2);
                    __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3);
                    __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0);
                    __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1);
                    __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2);
                    __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3);
                    __m128 _r28 = (__m128)__lsx_vreplvei_w(_r2nn, 0);
                    __m128 _r29 = (__m128)__lsx_vreplvei_w(_r2nn, 1);
                    __m128 _r2a = (__m128)__lsx_vreplvei_w(_r2nn, 2);
                    __m128 _r2b = (__m128)__lsx_vreplvei_w(_r2nn, 3);
                    __m128 _r2c = __lsx_vreplfr2vr_s(r2[12]);

                    _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k20, _r24, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k20, _r26, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k21, _r25, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k21, _r27, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k22, _r26, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k22, _r28, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k23, _r23, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k23, _r25, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k23, _r27, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k23, _r29, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k24, _r24, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k24, _r26, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k24, _r28, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k24, _r2a, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k25, _r25, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k25, _r27, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k25, _r29, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k25, _r2b, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k26, _r26, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k26, _r28, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k26, _r2a, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k26, _r2c, _sum3);

                    // kernel row 3 x input row r3
                    __m128 _k30 = (__m128)__lsx_vld(kptr, 0);
                    __m128 _k31 = (__m128)__lsx_vld(kptr + 4, 0);
                    __m128 _k32 = (__m128)__lsx_vld(kptr + 4 * 2, 0);
                    __m128 _k33 = (__m128)__lsx_vld(kptr + 4 * 3, 0);
                    __m128 _k34 = (__m128)__lsx_vld(kptr + 4 * 4, 0);
                    __m128 _k35 = (__m128)__lsx_vld(kptr + 4 * 5, 0);
                    __m128 _k36 = (__m128)__lsx_vld(kptr + 4 * 6, 0);

                    kptr += 4 * 7;

                    __m128i _r3 = __lsx_vld(r3, 0);
                    __m128i _r3n = __lsx_vld(r3 + 4, 0);
                    __m128i _r3nn = __lsx_vld(r3 + 8, 0);

                    __m128 _r30 = (__m128)__lsx_vreplvei_w(_r3, 0);
                    __m128 _r31 = (__m128)__lsx_vreplvei_w(_r3, 1);
                    __m128 _r32 = (__m128)__lsx_vreplvei_w(_r3, 2);
                    __m128 _r33 = (__m128)__lsx_vreplvei_w(_r3, 3);
                    __m128 _r34 = (__m128)__lsx_vreplvei_w(_r3n, 0);
                    __m128 _r35 = (__m128)__lsx_vreplvei_w(_r3n, 1);
                    __m128 _r36 = (__m128)__lsx_vreplvei_w(_r3n, 2);
                    __m128 _r37 = (__m128)__lsx_vreplvei_w(_r3n, 3);
                    __m128 _r38 = (__m128)__lsx_vreplvei_w(_r3nn, 0);
                    __m128 _r39 = (__m128)__lsx_vreplvei_w(_r3nn, 1);
                    __m128 _r3a = (__m128)__lsx_vreplvei_w(_r3nn, 2);
                    __m128 _r3b = (__m128)__lsx_vreplvei_w(_r3nn, 3);
                    __m128 _r3c = __lsx_vreplfr2vr_s(r3[12]);

                    _sum0 = __lsx_vfmadd_s(_k30, _r30, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k30, _r32, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k30, _r34, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k30, _r36, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k31, _r31, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k31, _r33, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k31, _r35, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k31, _r37, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k32, _r32, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k32, _r34, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k32, _r36, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k32, _r38, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k33, _r33, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k33, _r35, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k33, _r37, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k33, _r39, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k34, _r34, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k34, _r36, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k34, _r38, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k34, _r3a, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k35, _r35, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k35, _r37, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k35, _r39, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k35, _r3b, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k36, _r36, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k36, _r38, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k36, _r3a, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k36, _r3c, _sum3);

                    // kernel row 4 x input row r4
                    __m128 _k40 = (__m128)__lsx_vld(kptr, 0);
                    __m128 _k41 = (__m128)__lsx_vld(kptr + 4, 0);
                    __m128 _k42 = (__m128)__lsx_vld(kptr + 4 * 2, 0);
                    __m128 _k43 = (__m128)__lsx_vld(kptr + 4 * 3, 0);
                    __m128 _k44 = (__m128)__lsx_vld(kptr + 4 * 4, 0);
                    __m128 _k45 = (__m128)__lsx_vld(kptr + 4 * 5, 0);
                    __m128 _k46 = (__m128)__lsx_vld(kptr + 4 * 6, 0);

                    kptr += 4 * 7;

                    __m128i _r4 = __lsx_vld(r4, 0);
                    __m128i _r4n = __lsx_vld(r4 + 4, 0);
                    __m128i _r4nn = __lsx_vld(r4 + 8, 0);

                    __m128 _r40 = (__m128)__lsx_vreplvei_w(_r4, 0);
                    __m128 _r41 = (__m128)__lsx_vreplvei_w(_r4, 1);
                    __m128 _r42 = (__m128)__lsx_vreplvei_w(_r4, 2);
                    __m128 _r43 = (__m128)__lsx_vreplvei_w(_r4, 3);
                    __m128 _r44 = (__m128)__lsx_vreplvei_w(_r4n, 0);
                    __m128 _r45 = (__m128)__lsx_vreplvei_w(_r4n, 1);
                    __m128 _r46 = (__m128)__lsx_vreplvei_w(_r4n, 2);
                    __m128 _r47 = (__m128)__lsx_vreplvei_w(_r4n, 3);
                    __m128 _r48 = (__m128)__lsx_vreplvei_w(_r4nn, 0);
                    __m128 _r49 = (__m128)__lsx_vreplvei_w(_r4nn, 1);
                    __m128 _r4a = (__m128)__lsx_vreplvei_w(_r4nn, 2);
                    __m128 _r4b = (__m128)__lsx_vreplvei_w(_r4nn, 3);
                    __m128 _r4c = __lsx_vreplfr2vr_s(r4[12]);

                    _sum0 = __lsx_vfmadd_s(_k40, _r40, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k40, _r42, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k40, _r44, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k40, _r46, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k41, _r41, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k41, _r43, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k41, _r45, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k41, _r47, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k42, _r42, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k42, _r44, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k42, _r46, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k42, _r48, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k43, _r43, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k43, _r45, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k43, _r47, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k43, _r49, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k44, _r44, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k44, _r46, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k44, _r48, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k44, _r4a, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k45, _r45, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k45, _r47, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k45, _r49, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k45, _r4b, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k46, _r46, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k46, _r48, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k46, _r4a, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k46, _r4c, _sum3);

                    // kernel row 5 x input row r5
                    __m128 _k50 = (__m128)__lsx_vld(kptr, 0);
                    __m128 _k51 = (__m128)__lsx_vld(kptr + 4, 0);
                    __m128 _k52 = (__m128)__lsx_vld(kptr + 4 * 2, 0);
                    __m128 _k53 = (__m128)__lsx_vld(kptr + 4 * 3, 0);
                    __m128 _k54 = (__m128)__lsx_vld(kptr + 4 * 4, 0);
                    __m128 _k55 = (__m128)__lsx_vld(kptr + 4 * 5, 0);
                    __m128 _k56 = (__m128)__lsx_vld(kptr + 4 * 6, 0);

                    kptr += 4 * 7;

                    __m128i _r5 = __lsx_vld(r5, 0);
                    __m128i _r5n = __lsx_vld(r5 + 4, 0);
                    __m128i _r5nn = __lsx_vld(r5 + 8, 0);

                    __m128 _r50 = (__m128)__lsx_vreplvei_w(_r5, 0);
                    __m128 _r51 = (__m128)__lsx_vreplvei_w(_r5, 1);
                    __m128 _r52 = (__m128)__lsx_vreplvei_w(_r5, 2);
                    __m128 _r53 = (__m128)__lsx_vreplvei_w(_r5, 3);
                    __m128 _r54 = (__m128)__lsx_vreplvei_w(_r5n, 0);
                    __m128 _r55 = (__m128)__lsx_vreplvei_w(_r5n, 1);
                    __m128 _r56 = (__m128)__lsx_vreplvei_w(_r5n, 2);
                    __m128 _r57 = (__m128)__lsx_vreplvei_w(_r5n, 3);
                    __m128 _r58 = (__m128)__lsx_vreplvei_w(_r5nn, 0);
                    __m128 _r59 = (__m128)__lsx_vreplvei_w(_r5nn, 1);
                    __m128 _r5a = (__m128)__lsx_vreplvei_w(_r5nn, 2);
                    __m128 _r5b = (__m128)__lsx_vreplvei_w(_r5nn, 3);
                    __m128 _r5c = __lsx_vreplfr2vr_s(r5[12]);

                    _sum0 = __lsx_vfmadd_s(_k50, _r50, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k50, _r52, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k50, _r54, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k50, _r56, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k51, _r51, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k51, _r53, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k51, _r55, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k51, _r57, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k52, _r52, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k52, _r54, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k52, _r56, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k52, _r58, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k53, _r53, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k53, _r55, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k53, _r57, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k53, _r59, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k54, _r54, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k54, _r56, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k54, _r58, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k54, _r5a, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k55, _r55, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k55, _r57, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k55, _r59, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k55, _r5b, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k56, _r56, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k56, _r58, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k56, _r5a, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k56, _r5c, _sum3);

                    // kernel row 6 x input row r6
                    __m128 _k60 = (__m128)__lsx_vld(kptr, 0);
                    __m128 _k61 = (__m128)__lsx_vld(kptr + 4, 0);
                    __m128 _k62 = (__m128)__lsx_vld(kptr + 4 * 2, 0);
                    __m128 _k63 = (__m128)__lsx_vld(kptr + 4 * 3, 0);
                    __m128 _k64 = (__m128)__lsx_vld(kptr + 4 * 4, 0);
                    __m128 _k65 = (__m128)__lsx_vld(kptr + 4 * 5, 0);
                    __m128 _k66 = (__m128)__lsx_vld(kptr + 4 * 6, 0);

                    // rewind the six row steps (6 * 4 * 7 floats) so the next pixels
                    // start again at the first kernel row
                    kptr -= 4 * 42;

                    __m128i _r6 = __lsx_vld(r6, 0);
                    __m128i _r6n = __lsx_vld(r6 + 4, 0);
                    __m128i _r6nn = __lsx_vld(r6 + 8, 0);

                    __m128 _r60 = (__m128)__lsx_vreplvei_w(_r6, 0);
                    __m128 _r61 = (__m128)__lsx_vreplvei_w(_r6, 1);
                    __m128 _r62 = (__m128)__lsx_vreplvei_w(_r6, 2);
                    __m128 _r63 = (__m128)__lsx_vreplvei_w(_r6, 3);
                    __m128 _r64 = (__m128)__lsx_vreplvei_w(_r6n, 0);
                    __m128 _r65 = (__m128)__lsx_vreplvei_w(_r6n, 1);
                    __m128 _r66 = (__m128)__lsx_vreplvei_w(_r6n, 2);
                    __m128 _r67 = (__m128)__lsx_vreplvei_w(_r6n, 3);
                    __m128 _r68 = (__m128)__lsx_vreplvei_w(_r6nn, 0);
                    __m128 _r69 = (__m128)__lsx_vreplvei_w(_r6nn, 1);
                    __m128 _r6a = (__m128)__lsx_vreplvei_w(_r6nn, 2);
                    __m128 _r6b = (__m128)__lsx_vreplvei_w(_r6nn, 3);
                    __m128 _r6c = __lsx_vreplfr2vr_s(r6[12]);

                    _sum0 = __lsx_vfmadd_s(_k60, _r60, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k60, _r62, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k60, _r64, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k60, _r66, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k61, _r61, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k61, _r63, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k61, _r65, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k61, _r67, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k62, _r62, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k62, _r64, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k62, _r66, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k62, _r68, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k63, _r63, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k63, _r65, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k63, _r67, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k63, _r69, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k64, _r64, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k64, _r66, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k64, _r68, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k64, _r6a, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k65, _r65, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k65, _r67, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k65, _r69, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k65, _r6b, _sum3);
                    _sum0 = __lsx_vfmadd_s(_k66, _r66, _sum0);
                    _sum1 = __lsx_vfmadd_s(_k66, _r68, _sum1);
                    _sum2 = __lsx_vfmadd_s(_k66, _r6a, _sum2);
                    _sum3 = __lsx_vfmadd_s(_k66, _r6c, _sum3);

                    __lsx_vst(_sum0, outptr0, 0);
                    __lsx_vst(_sum1, outptr0 + 4, 0);
                    __lsx_vst(_sum2, outptr0 + 4 * 2, 0);
                    __lsx_vst(_sum3, outptr0 + 4 * 3, 0);

                    outptr0 += 4 * 4;

                    // 4 output pixels at stride 2 consume 8 input columns
                    r0 += 8;
                    r1 += 8;
                    r2 += 8;
                    r3 += 8;
                    r4 += 8;
                    r5 += 8;
                    r6 += 8;
                }
                // Leftover output pixels, one per iteration. Two vector loads fetch
                // 8 input floats per row, of which only columns 0..6 are used.
                for (; j < outw; j++)
                {
                    __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0);

                    // kernel row 0 x input row r0
                    __m128 _k00 = (__m128)__lsx_vld(kptr, 0);
                    __m128 _k01 = (__m128)__lsx_vld(kptr + 4, 0);
                    __m128 _k02 = (__m128)__lsx_vld(kptr + 4 * 2, 0);
                    __m128 _k03 = (__m128)__lsx_vld(kptr + 4 * 3, 0);
                    __m128 _k04 = (__m128)__lsx_vld(kptr + 4 * 4, 0);
                    __m128 _k05 = (__m128)__lsx_vld(kptr + 4 * 5, 0);
                    __m128 _k06 = (__m128)__lsx_vld(kptr + 4 * 6, 0);

                    kptr += 4 * 7;

                    __m128i _r0 = __lsx_vld(r0, 0);
                    __m128i _r0n = __lsx_vld(r0 + 4, 0);

                    _sum0 = __lsx_vfmadd_s(_k00, (__m128)__lsx_vreplvei_w(_r0, 0), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k01, (__m128)__lsx_vreplvei_w(_r0, 1), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k02, (__m128)__lsx_vreplvei_w(_r0, 2), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k03, (__m128)__lsx_vreplvei_w(_r0, 3), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k04, (__m128)__lsx_vreplvei_w(_r0n, 0), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k05, (__m128)__lsx_vreplvei_w(_r0n, 1), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k06, (__m128)__lsx_vreplvei_w(_r0n, 2), _sum0);

                    // kernel row 1 x input row r1
                    __m128 _k10 = (__m128)__lsx_vld(kptr, 0);
                    __m128 _k11 = (__m128)__lsx_vld(kptr + 4, 0);
                    __m128 _k12 = (__m128)__lsx_vld(kptr + 4 * 2, 0);
                    __m128 _k13 = (__m128)__lsx_vld(kptr + 4 * 3, 0);
                    __m128 _k14 = (__m128)__lsx_vld(kptr + 4 * 4, 0);
                    __m128 _k15 = (__m128)__lsx_vld(kptr + 4 * 5, 0);
                    __m128 _k16 = (__m128)__lsx_vld(kptr + 4 * 6, 0);

                    kptr += 4 * 7;

                    __m128i _r1 = __lsx_vld(r1, 0);
                    __m128i _r1n = __lsx_vld(r1 + 4, 0);

                    _sum0 = __lsx_vfmadd_s(_k10, (__m128)__lsx_vreplvei_w(_r1, 0), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k11, (__m128)__lsx_vreplvei_w(_r1, 1), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k12, (__m128)__lsx_vreplvei_w(_r1, 2), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k13, (__m128)__lsx_vreplvei_w(_r1, 3), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k14, (__m128)__lsx_vreplvei_w(_r1n, 0), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k15, (__m128)__lsx_vreplvei_w(_r1n, 1), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k16, (__m128)__lsx_vreplvei_w(_r1n, 2), _sum0);

                    // kernel row 2 x input row r2
                    __m128 _k20 = (__m128)__lsx_vld(kptr, 0);
                    __m128 _k21 = (__m128)__lsx_vld(kptr + 4, 0);
                    __m128 _k22 = (__m128)__lsx_vld(kptr + 4 * 2, 0);
                    __m128 _k23 = (__m128)__lsx_vld(kptr + 4 * 3, 0);
                    __m128 _k24 = (__m128)__lsx_vld(kptr + 4 * 4, 0);
                    __m128 _k25 = (__m128)__lsx_vld(kptr + 4 * 5, 0);
                    __m128 _k26 = (__m128)__lsx_vld(kptr + 4 * 6, 0);

                    kptr += 4 * 7;

                    __m128i _r2 = __lsx_vld(r2, 0);
                    __m128i _r2n = __lsx_vld(r2 + 4, 0);

                    _sum0 = __lsx_vfmadd_s(_k20, (__m128)__lsx_vreplvei_w(_r2, 0), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k21, (__m128)__lsx_vreplvei_w(_r2, 1), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k22, (__m128)__lsx_vreplvei_w(_r2, 2), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k23, (__m128)__lsx_vreplvei_w(_r2, 3), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k24, (__m128)__lsx_vreplvei_w(_r2n, 0), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k25, (__m128)__lsx_vreplvei_w(_r2n, 1), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k26, (__m128)__lsx_vreplvei_w(_r2n, 2), _sum0);

                    // kernel row 3 x input row r3
                    __m128 _k30 = (__m128)__lsx_vld(kptr, 0);
                    __m128 _k31 = (__m128)__lsx_vld(kptr + 4, 0);
                    __m128 _k32 = (__m128)__lsx_vld(kptr + 4 * 2, 0);
                    __m128 _k33 = (__m128)__lsx_vld(kptr + 4 * 3, 0);
                    __m128 _k34 = (__m128)__lsx_vld(kptr + 4 * 4, 0);
                    __m128 _k35 = (__m128)__lsx_vld(kptr + 4 * 5, 0);
                    __m128 _k36 = (__m128)__lsx_vld(kptr + 4 * 6, 0);

                    kptr += 4 * 7;

                    __m128i _r3 = __lsx_vld(r3, 0);
                    __m128i _r3n = __lsx_vld(r3 + 4, 0);

                    _sum0 = __lsx_vfmadd_s(_k30, (__m128)__lsx_vreplvei_w(_r3, 0), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k31, (__m128)__lsx_vreplvei_w(_r3, 1), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k32, (__m128)__lsx_vreplvei_w(_r3, 2), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k33, (__m128)__lsx_vreplvei_w(_r3, 3), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k34, (__m128)__lsx_vreplvei_w(_r3n, 0), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k35, (__m128)__lsx_vreplvei_w(_r3n, 1), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k36, (__m128)__lsx_vreplvei_w(_r3n, 2), _sum0);

                    // kernel row 4 x input row r4
                    __m128 _k40 = (__m128)__lsx_vld(kptr, 0);
                    __m128 _k41 = (__m128)__lsx_vld(kptr + 4, 0);
                    __m128 _k42 = (__m128)__lsx_vld(kptr + 4 * 2, 0);
                    __m128 _k43 = (__m128)__lsx_vld(kptr + 4 * 3, 0);
                    __m128 _k44 = (__m128)__lsx_vld(kptr + 4 * 4, 0);
                    __m128 _k45 = (__m128)__lsx_vld(kptr + 4 * 5, 0);
                    __m128 _k46 = (__m128)__lsx_vld(kptr + 4 * 6, 0);

                    kptr += 4 * 7;

                    __m128i _r4 = __lsx_vld(r4, 0);
                    __m128i _r4n = __lsx_vld(r4 + 4, 0);

                    _sum0 = __lsx_vfmadd_s(_k40, (__m128)__lsx_vreplvei_w(_r4, 0), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k41, (__m128)__lsx_vreplvei_w(_r4, 1), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k42, (__m128)__lsx_vreplvei_w(_r4, 2), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k43, (__m128)__lsx_vreplvei_w(_r4, 3), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k44, (__m128)__lsx_vreplvei_w(_r4n, 0), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k45, (__m128)__lsx_vreplvei_w(_r4n, 1), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k46, (__m128)__lsx_vreplvei_w(_r4n, 2), _sum0);

                    // kernel row 5 x input row r5
                    __m128 _k50 = (__m128)__lsx_vld(kptr, 0);
                    __m128 _k51 = (__m128)__lsx_vld(kptr + 4, 0);
                    __m128 _k52 = (__m128)__lsx_vld(kptr + 4 * 2, 0);
                    __m128 _k53 = (__m128)__lsx_vld(kptr + 4 * 3, 0);
                    __m128 _k54 = (__m128)__lsx_vld(kptr + 4 * 4, 0);
                    __m128 _k55 = (__m128)__lsx_vld(kptr + 4 * 5, 0);
                    __m128 _k56 = (__m128)__lsx_vld(kptr + 4 * 6, 0);

                    kptr += 4 * 7;

                    __m128i _r5 = __lsx_vld(r5, 0);
                    __m128i _r5n = __lsx_vld(r5 + 4, 0);

                    _sum0 = __lsx_vfmadd_s(_k50, (__m128)__lsx_vreplvei_w(_r5, 0), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k51, (__m128)__lsx_vreplvei_w(_r5, 1), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k52, (__m128)__lsx_vreplvei_w(_r5, 2), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k53, (__m128)__lsx_vreplvei_w(_r5, 3), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k54, (__m128)__lsx_vreplvei_w(_r5n, 0), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k55, (__m128)__lsx_vreplvei_w(_r5n, 1), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k56, (__m128)__lsx_vreplvei_w(_r5n, 2), _sum0);

                    // kernel row 6 x input row r6
                    __m128 _k60 = (__m128)__lsx_vld(kptr, 0);
                    __m128 _k61 = (__m128)__lsx_vld(kptr + 4, 0);
                    __m128 _k62 = (__m128)__lsx_vld(kptr + 4 * 2, 0);
                    __m128 _k63 = (__m128)__lsx_vld(kptr + 4 * 3, 0);
                    __m128 _k64 = (__m128)__lsx_vld(kptr + 4 * 4, 0);
                    __m128 _k65 = (__m128)__lsx_vld(kptr + 4 * 5, 0);
                    __m128 _k66 = (__m128)__lsx_vld(kptr + 4 * 6, 0);

                    // back to the first kernel row for the next output pixel
                    kptr -= 4 * 42;

                    __m128i _r6 = __lsx_vld(r6, 0);
                    __m128i _r6n = __lsx_vld(r6 + 4, 0);

                    _sum0 = __lsx_vfmadd_s(_k60, (__m128)__lsx_vreplvei_w(_r6, 0), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k61, (__m128)__lsx_vreplvei_w(_r6, 1), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k62, (__m128)__lsx_vreplvei_w(_r6, 2), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k63, (__m128)__lsx_vreplvei_w(_r6, 3), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k64, (__m128)__lsx_vreplvei_w(_r6n, 0), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k65, (__m128)__lsx_vreplvei_w(_r6n, 1), _sum0);
                    _sum0 = __lsx_vfmadd_s(_k66, (__m128)__lsx_vreplvei_w(_r6n, 2), _sum0);

                    __lsx_vst(_sum0, outptr0, 0);

                    outptr0 += 4;

                    // horizontal stride 2
                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    r3 += 2;
                    r4 += 2;
                    r5 += 2;
                    r6 += 2;
                }

                // move all seven row pointers down for the next output row (see tailstep)
                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
                r3 += tailstep;
                r4 += tailstep;
                r5 += tailstep;
                r6 += tailstep;
            }
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// Naive int8 convolution reference: for every output channel and output pixel,
// accumulates the products of int8 inputs and int8 weights into a 32-bit sum and
// stores it in top_blob. The input is read as-is (no padding is applied here) and
// the weights are read as a flat array laid out kw-kh-inch-outch.
static void convolution_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    const int in_w = bottom_blob.w;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // element offset of every kernel tap relative to the top-left tap
    std::vector<int> tap_offsets(maxk);
    int* space_ofs = &tap_offsets[0];
    for (int ky = 0; ky < kernel_h; ky++)
    {
        for (int kx = 0; kx < kernel_w; kx++)
        {
            // one kernel row down is dilation_h input rows, one tap right is dilation_w elements
            space_ofs[ky * kernel_w + kx] = ky * in_w * dilation_h + kx * dilation_w;
        }
    }

    // one output channel per parallel iteration
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        // weights of output channel p: inch consecutive groups of maxk taps
        const signed char* kernel_base = (const signed char*)weight_data_int8 + maxk * inch * p;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                int acc = 0;

                const signed char* kptr = kernel_base;

                for (int q = 0; q < inch; q++, kptr += maxk)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const signed char val = sptr[space_ofs[k]];
                        const signed char wt = kptr[k];
                        acc += val * wt;
                    }
                }

                *outptr++ = acc;
            }
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "convolution_loongarch.h"

#include "benchmark.h"
#include "cpu.h"
#include "layer_type.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_activation.h"
#include "loongarch_usability.h"

#include "cpu.h"

namespace ncnn {

#include "convolution_sgemm.h"
#include "convolution_winograd_transform.h"
#include "convolution_winograd_dot.h"
#include "convolution_1x1.h"
#include "convolution_3x3.h"

#if NCNN_INT8
#include "convolution_sgemm_int8.h"
#include "convolution_winograd_transform_int8.h"
#include "convolution_winograd_dot_int8.h"
#include "convolution_1x1_int8.h"
#include "convolution_3x3_int8.h"
#include "convolution_int8.h"
#endif // NCNN_INT8

#if __loongarch_sx
#include "convolution_pack4.h"
#include "convolution_pack1to4.h"
#include "convolution_pack4to1.h"

#include "convolution_sgemm_pack4.h"
#include "convolution_sgemm_pack4to1.h"
#include "convolution_winograd_transform_pack4.h"
#include "convolution_winograd_dot_pack4.h"
#include "convolution_1x1_pack4.h"
#include "convolution_1x1_pack4to1.h"
#include "convolution_3x3_pack4.h"
#include "convolution_3x3_pack1to4.h"
#include "convolution_7x7_pack1to4.h"

#if NCNN_INT8
#include "convolution_pack8to4_int8.h"
#include "convolution_pack1to4_int8.h"
#include "convolution_pack8to1_int8.h"
#include "convolution_sgemm_pack8to4_int8.h"
#include "convolution_sgemm_pack1to4_int8.h"
#include "convolution_sgemm_pack8to1_int8.h"
#include "convolution_winograd_transform_pack4_int8.h"
#include "convolution_winograd_transform_pack8_int8.h"
#include "convolution_winograd_dot_pack8to4_int8.h"
#include "convolution_winograd_dot_pack8to1_int8.h"
#include "convolution_1x1_pack8to4_int8.h"
#include "convolution_1x1_pack1to4_int8.h"
#include "convolution_1x1_pack8to1_int8.h"
#include "convolution_3x3_pack8to4_int8.h"
#include "convolution_3x3_pack8to1_int8.h"
#endif // NCNN_INT8
#endif // __loongarch_sx

// Initializes the layer with no activation layer attached; when built with LSX
// the layer additionally advertises support for packed layouts.
Convolution_loongarch::Convolution_loongarch()
{
    // the activation layer is created later, in create_pipeline()
    activation = 0;

#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

static void convolution_transform_kernel_packed_lsx(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
{
    const int maxk = kernel_w * kernel_h;

    // src = kw-kh-inch-outch
    // dst = pb-pa-kw-kh-inch/pa-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            float* g00 = weight_data_tm.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        for (int j = 0; j < out_elempack; j++)
                        {
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                            g00[0] = k00[k];

                            g00++;
                        }
                    }
                }
            }
        }
    }
}

int Convolution_loongarch::create_pipeline(const Option& opt)
{
    if (dynamic_weight)
        return 0;

    activation = create_activation_layer(activation_type, activation_params, opt);

#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return create_pipeline_int8_loongarch(opt);
    }
#endif

    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    int elempack = 1;
    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        elempack = num_input % 4 == 0 ? 4 : 1;
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif

#if __loongarch_sx
    // pack4
    if (elempack == 4 && out_elempack == 4)
    {
        if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            if ((opt.use_winograd63_convolution && num_input >= 8 && num_output >= 8 && num_input <= 64 && num_output <= 64) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution))
                conv3x3s1_winograd63_transform_kernel_pack4_lsx(weight_data, weight_winograd63_data, num_input, num_output, opt);
            else if ((opt.use_winograd43_convolution && num_input >= 8 && num_output >= 8) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution))
                conv3x3s1_winograd43_transform_kernel_pack4_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt);
            else // if (opt.use_winograd23_convolution)
                conv3x3s1_winograd23_transform_kernel_pack4_lsx(weight_data, weight_winograd23_data, num_input, num_output, opt);
        }
        else
        {
            convolution_transform_kernel_packed_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }

    // pack1ton
    if (elempack == 1 && out_elempack == 4)
    {
        convolution_transform_kernel_packed_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
    }

    // pack4to1
    if (elempack == 4 && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }
#endif // __loongarch_sx

    // pack1
    if (elempack == 1 && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || !opt.use_winograd23_convolution)
            {
                conv3x3s1_winograd43_transform_kernel_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt);
            }
            else if (opt.use_winograd23_convolution)
            {
                conv3x3s1_winograd23_transform_kernel_lsx(weight_data, weight_winograd23_data, num_input, num_output, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            weight_data_tm = weight_data;
        }
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

// Tears down and frees the activation layer created by create_pipeline(), if any.
// Always returns 0.
int Convolution_loongarch::destroy_pipeline(const Option& opt)
{
    // nothing to release when no activation layer was created
    if (!activation)
        return 0;

    activation->destroy_pipeline(opt);
    delete activation;
    activation = 0;

    return 0;
}

// Runs the convolution on a single input blob.
//
// Dispatch order:
//   1. int8 path, when int8 inference is enabled and the layer has int8 scales
//   2. 1-D input with a 1x1 kernel: re-run on a 1x1xC view and flatten the result
//   3. pad the input, allocate top_blob, then pick a kernel by
//      (elempack, out_elempack), kernel size, dilation, stride and options
//
// Each kernel reads the weight member (weight_data_tm, weight_sgemm_data or
// weight_winograd*_data) that create_pipeline() fills for the same conditions,
// so the branch conditions here and there have to stay in sync.
//
// Returns 0 on success, -100 when the padded input or the output blob is empty,
// or the return value of the nested forward call.
int Convolution_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (opt.use_int8_inference && int8_scale_term)
    {
        return forward_int8_loongarch(bottom_blob, top_blob, opt);
    }
#endif

    // flattened blob, implement as InnerProduct
    if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
    {
        Mat bottom_blob_3d;
        if (bottom_blob.elemsize % 16 == 0)
        {
            // rewrite the shape fields in place: w elements become w channels of 1x1
            // NOTE(review): the 16-byte test presumably identifies pack4 fp32 elements - confirm
            bottom_blob_3d = bottom_blob;
            bottom_blob_3d.dims = 3;
            bottom_blob_3d.w = 1;
            bottom_blob_3d.h = 1;
            bottom_blob_3d.c = bottom_blob.w;
            bottom_blob_3d.cstep = 1;
        }
        else
        {
            bottom_blob_3d = bottom_blob.reshape(1, 1, bottom_blob.w, opt.workspace_allocator);
        }

        // recurse on the 3-D view; dims is now 3, so this branch is not taken again
        Mat top_blob_3d;
        int ret = forward(bottom_blob_3d, top_blob_3d, opt);
        if (ret != 0)
            return ret;

        // convert the 1x1xC result back into a 1-D blob, mirroring the input handling
        if (top_blob_3d.elemsize % 16 == 0)
        {
            top_blob = top_blob_3d;
            top_blob.dims = 1;
            top_blob.w = top_blob_3d.c;
            top_blob.h = 1;
            top_blob.c = 1;
            top_blob.cstep = top_blob_3d.c;
        }
        else
        {
            top_blob = top_blob_3d.reshape(top_blob_3d.c, opt.blob_allocator);
        }

        return 0;
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Convolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    // span covered by the dilated kernel along each axis
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    // from here on w/h describe the padded input
    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif
    // per-scalar size of the input, scaled to the output packing
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int num_input = channels * elempack;

#if __loongarch_sx
    // pack4 -> pack4
    if (elempack == 4 && out_elempack == 4)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            // this kernel takes no activation arguments, so the activation layer runs as a separate pass
            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            // same variant selection as in create_pipeline(), which prepared exactly one of these weight blobs
            if ((opt.use_winograd63_convolution && num_input >= 8 && num_output >= 8 && num_input <= 64 && num_output <= 64) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution))
                conv3x3s1_winograd63_pack4_lsx(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, opt);
            else if ((opt.use_winograd43_convolution && num_input >= 8 && num_output >= 8) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution))
                conv3x3s1_winograd43_pack4_lsx(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt);
            else // if (opt.use_winograd23_convolution)
                conv3x3s1_winograd23_pack4_lsx(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            // generic kernel receives activation_type/activation_params directly; no separate activation pass here
            convolution_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    // pack1 -> pack4
    if (elempack == 1 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv3x3s2_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv7x7s2_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            // generic kernel receives activation_type/activation_params directly
            convolution_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    // pack4 -> pack1
    if (elempack == 4 && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            // generic kernel receives activation_type/activation_params directly
            convolution_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }
#endif // __loongarch_sx

    // pack1 -> pack1 (also the only path compiled without LSX)
    if (elempack == 1 && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || !opt.use_winograd23_convolution)
            {
                conv3x3s1_winograd43_lsx(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt);
            }
            else if (opt.use_winograd23_convolution)
            {
                conv3x3s1_winograd23_lsx(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt);
            }

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            // naive direct convolution over the untransformed weights
            const int maxk = kernel_w * kernel_h;

            // kernel offsets
            // space_ofs[k] is the element offset of kernel tap k from the window's top-left input element
            std::vector<int> _space_ofs(maxk);
            int* space_ofs = &_space_ofs[0];
            {
                int p1 = 0;
                int p2 = 0;
                // elements to skip from the end of one kernel row to the start of the next
                int gap = w * dilation_h - kernel_w * dilation_w;
                for (int i = 0; i < kernel_h; i++)
                {
                    for (int j = 0; j < kernel_w; j++)
                    {
                        space_ofs[p1] = p2;
                        p1++;
                        p2 += dilation_w;
                    }
                    p2 += gap;
                }
            }

            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output; p++)
            {
                float* outptr = top_blob.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[p];
                        }

                        // weights of output channel p: channels groups of maxk taps each
                        const float* kptr = (const float*)weight_data_tm + maxk * channels * p;

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob_bordered.channel(q);
                            // top-left input element of the window for output pixel (i, j)
                            const float* sptr = m.row(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                float val = sptr[space_ofs[k]];
                                float wt = kptr[k];
                                sum += val * wt;
                            }

                            kptr += maxk;
                        }

                        // scalar activation applied per output element
                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = sum;
                    }

                    outptr += outw;
                }
            }
        }
    }

    return 0;
}

// Dynamic-weight forward: the weights (and, when bias_term is set, the bias)
// arrive as input blobs instead of being loaded from the model.
//
//   bottom_blobs[0]  input feature map
//   bottom_blobs[1]  weights; w/h give the kernel size, c * elempack the output count
//   bottom_blobs[2]  bias, read only when bias_term is set
//   top_blobs[0]     output
//
// A temporary Convolution layer is configured from these blobs, run once and
// destroyed again.
//
// Returns 0 on success, -100 when flattening fails or the temporary layer
// cannot be created, otherwise the first non-zero code reported by the
// temporary layer (load_param, load_model, create_pipeline or forward).
int Convolution_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.c * _weight_data.elempack;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    // weight_data_flattened as pack1
    weight_data_flattened.w *= weight_data_flattened.elempack;
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
    weight_data_flattened.elempack = 1;

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;

        // bias_data_flattened as pack1
        bias_data_flattened.w *= bias_data_flattened.elempack;
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
        bias_data_flattened.elempack = 1;
    }

    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);
    if (!op)
        return -100;

    // forward this layer's static parameters; kernel size and output count come from the weight blob
    ncnn::ParamDict pd;
    pd.set(0, _num_output);
    pd.set(1, _kernel_w);
    pd.set(11, _kernel_h);
    pd.set(2, dilation_w);
    pd.set(12, dilation_h);
    pd.set(3, stride_w);
    pd.set(13, stride_h);
    pd.set(4, pad_left);
    pd.set(15, pad_right);
    pd.set(14, pad_top);
    pd.set(16, pad_bottom);
    pd.set(18, pad_value);
    pd.set(5, bias_term);
    pd.set(6, weight_data_flattened.w);
    pd.set(8, int8_scale_term);
    pd.set(9, activation_type);
    pd.set(10, activation_params);

    ncnn::Mat weights[2];
    weights[0] = weight_data_flattened;
    weights[1] = bias_data_flattened;

    // Each step runs only if the previous one succeeded, and the first failure
    // is reported to the caller instead of being silently turned into 0.
    int ret = op->load_param(pd);

    if (ret == 0)
        ret = op->load_model(ncnn::ModelBinFromMatArray(weights));

    if (ret == 0)
    {
        ret = op->create_pipeline(opt);

        if (ret == 0)
            ret = op->forward(bottom_blob, top_blob, opt);

        // always paired with the create_pipeline() call above, even when it or forward() failed
        op->destroy_pipeline(opt);
    }

    delete op;

    return ret;
}

#if NCNN_INT8
static void convolution_transform_kernel_packed_int8_lsx(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
{
    const int maxk = kernel_w * kernel_h;

    // src = kw-kh-inch-outch
    // dst = pa-pb-kw-kh-inch/pa-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack);

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            signed char* g00 = weight_data_tm.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < out_elempack; i++)
                    {
                        for (int j = 0; j < elempack; j++)
                        {
                            const signed char* k00 = weight_data_r2.channel(q + i).row<const signed char>(p + j);

                            g00[0] = k00[k];

                            g00++;
                        }
                    }
                }
            }
        }
    }
}

int Convolution_loongarch::create_pipeline_int8_loongarch(const Option& opt)
{
    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    int elempack = 1;
    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        elempack = num_input % 8 == 0 ? 8 : 1;
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __loongarch_sx

#if __loongarch_sx
    if (elempack == 8 && out_elempack == 4)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd43_transform_kernel_pack8to4_int8_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_int8_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_int8_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }

    if (elempack == 8 && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd43_transform_kernel_pack8to1_int8_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_int8_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }
#endif // __loongarch_sx

    if (elempack == 1 && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd43_transform_kernel_int8_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            weight_data_tm = weight_data;
        }
    }

    scale_in_data.create(num_output);
    for (int p = 0; p < num_output; p++)
    {
        // requantize and relu
        float scale_in;
        if (weight_data_int8_scales[p] == 0)
            scale_in = 0;
        else
            scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

        scale_in_data[p] = scale_in;
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int Convolution_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

    Mat bottom_blob_int8 = bottom_blob;
    if (elembits != 8)
    {
        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    int w = bottom_blob_bordered.w;
    int h = bottom_blob_bordered.h;
    int channels = bottom_blob_bordered.c;
    int elempack = bottom_blob_bordered.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    bool use_int8_requantize = int8_scale_term > 100;
    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        if (use_int8_requantize)
            out_elempack = num_output % 8 == 0 ? 8 : 1;
        else
            out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __loongarch_sx
    size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int num_input = channels * elempack;

    int out_elempack_int32 = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __loongarch_sx

    Mat top_blob_int32;
    top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator);
    if (top_blob_int32.empty())
        return -100;

#if __loongarch_sx
    if (elempack == 8 && out_elempack_int32 == 4)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd43_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            convolution_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
    }

    if (elempack == 1 && out_elempack_int32 == 4)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            convolution_im2col_sgemm_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            convolution_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
    }

    if (elempack == 8 && out_elempack_int32 == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd43_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            convolution_im2col_sgemm_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            convolution_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
    }
#endif // __loongarch_sx

    if (elempack == 1 && out_elempack_int32 == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd43_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
    }

#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        // NCNN_LOGE("top_blob_int32  %d  %d", top_blob_int32.c, top_blob_int32.elempack);
        if (use_int8_requantize)
        {
            // TODO implement winograd sgemm packed int8 pack1 output
            if (top_blob_int32.elempack == 4 && top_blob_int32.c % 2 == 1)
            {
                Mat tmp;
                convert_packing(top_blob_int32, tmp, 1, opt);
                top_blob_int32 = tmp;
            }
            if (top_blob_int32.elempack == 4 && top_blob_int32.c % 2 == 0)
            {
                Mat tmp;
                convert_packing(top_blob_int32, tmp, 8, opt);
                top_blob_int32 = tmp;
            }
        }
    }
#endif

    if (use_int8_requantize)
    {
        requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
    }
    else
    {
        dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

        if (activation)
        {
            activation->forward_inplace(top_blob, opt);
        }
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn


================================================
FILE: src/layer/loongarch/convolution_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONVOLUTION_LOONGARCH_H
#define LAYER_CONVOLUTION_LOONGARCH_H

#include "convolution.h"

namespace ncnn {

// LoongArch specialisation of the Convolution layer.
// Overrides pipeline creation/destruction and the forward passes of the base
// Convolution class, and holds the transformed weights its kernels read.
class Convolution_loongarch : public Convolution
{
public:
    Convolution_loongarch();

    // Pipeline setup and teardown for the given options.
    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // Single-input forward pass.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Multi-blob forward pass.
    // NOTE(review): presumably the variant where weights arrive as an input blob - confirm against the implementation.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_INT8
    // Int8 counterparts of create_pipeline() and forward(); only built when NCNN_INT8 is enabled.
    int create_pipeline_int8_loongarch(const Option& opt);
    int forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif

public:
    // Optional activation layer; the int8 forward pass runs it in place on the
    // dequantized output when it is non-null.
    Layer* activation;

    // Transformed weights; which one gets used depends on the kernel chosen at forward time.
    Mat weight_data_tm;         // read by the direct (non-sgemm, non-winograd) convolution kernels
    Mat weight_sgemm_data;      // read by the 1x1 and im2col sgemm kernels
    Mat weight_winograd23_data; // NOTE(review): presumably for winograd F(2,3) kernels - confirm
    Mat weight_winograd43_data; // read by the 3x3s1 winograd43 kernels
    Mat weight_winograd63_data; // NOTE(review): presumably for winograd F(6,3) kernels - confirm

#if NCNN_INT8
    // One scale per output channel used when converting the int32 accumulators:
    // 1 / (input scale * weight scale), or 0 when the weight scale is 0.
    Mat scale_in_data;
#endif
};

} // namespace ncnn

#endif // LAYER_CONVOLUTION_LOONGARCH_H


================================================
FILE: src/layer/loongarch/convolution_pack1to4.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void convolution_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    const float* bias_data_ptr = bias_data;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        float* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);

                if (bias_data_ptr)
                {
                    _sum = (__m128)__lsx_vld(bias_data_ptr + p * 4, 0);
                }

                const float* kptr = (const float*)weight_data_pack1ton + maxk * channels * p * 4;

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const float* sptr = m.row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++) // 29.23
                    {
                        __m128 _val = __lsx_vreplfr2vr_s(sptr[space_ofs[k]]);
                        __m128 _w = (__m128)__lsx_vld(kptr, 0);
                        _sum = __lsx_vfmadd_s(_w, _val, _sum);

                        kptr += 4;
                    }
                }

                _sum = activation_ps(_sum, activation_type, activation_params);

                __lsx_vst(_sum, outptr + j * 4, 0);
            }

            outptr += outw * 4;
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_pack1to4_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void convolution_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                __m128i _sum = __lsx_vreplgr2vr_w(0);

                const signed char* kptr = weight_data_int8.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        __m128i _val = __lsx_vreplgr2vr_h((short)sptr[space_ofs[k]]);

                        __m128i _w = __lsx_vld(kptr, 0);
                        __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w);

                        __m128i _s0 = __lsx_vmul_h(_val, _w16);
                        __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0);

                        _sum = __lsx_vadd_w(_sum, _s032);

                        kptr += 4;
                    }
                }

                __lsx_vst(_sum, outptr + j * 4, 0);
            }

            outptr += outw * 4;
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_pack4.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void convolution_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    const float* bias_data_ptr = bias_data;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        float* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);

                if (bias_data_ptr)
                {
                    _sum = (__m128)__lsx_vld(bias_data_ptr + p * 4, 0);
                }

                const float* kptr = (const float*)weight_data_pack4.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const float* sptr = m.row(i * stride_h) + j * stride_w * 4;

                    for (int k = 0; k < maxk; k++) // 29.23
                    {
                        const float* slptr = sptr + space_ofs[k] * 4;

                        __m128 _val0 = __lsx_vreplfr2vr_s(slptr[0]);
                        __m128 _val1 = __lsx_vreplfr2vr_s(slptr[1]);
                        __m128 _val2 = __lsx_vreplfr2vr_s(slptr[2]);
                        __m128 _val3 = __lsx_vreplfr2vr_s(slptr[3]);

                        __m128 _w0 = (__m128)__lsx_vld(kptr, 0);
                        __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0);
                        __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0);
                        __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0);

                        _sum = __lsx_vfmadd_s(_w0, _val0, _sum);
                        _sum = __lsx_vfmadd_s(_w1, _val1, _sum);
                        _sum = __lsx_vfmadd_s(_w2, _val2, _sum);
                        _sum = __lsx_vfmadd_s(_w3, _val3, _sum);

                        kptr += 16;
                    }
                }

                _sum = activation_ps(_sum, activation_type, activation_params);

                __lsx_vst(_sum, outptr + j * 4, 0);
            }

            outptr += outw * 4;
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_pack4to1.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void convolution_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4to1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    const float* bias_data_ptr = bias_data;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        float* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = 0.f;

                if (bias_data_ptr)
                {
                    sum = bias_data_ptr[p];
                }

                __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);

                const float* kptr = (const float*)weight_data_pack4to1.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const float* sptr = m.row(i * stride_h) + j * stride_w * 4;

                    for (int k = 0; k < maxk; k++)
                    {
                        __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0);
                        __m128 _w = (__m128)__lsx_vld(kptr, 0);
                        _sum = __lsx_vfmadd_s(_w, _val, _sum);

                        kptr += 4;
                    }
                }

                sum += __lsx_reduce_fadd_s(_sum);

                sum = activation_ss(sum, activation_type, activation_params);

                outptr[j] = sum;
            }

            outptr += outw;
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_pack8to1_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void convolution_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                __m128i _sum = __lsx_vreplgr2vr_w(0);

                const signed char* kptr = weight_data_int8.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w * 8;

                    for (int k = 0; k < maxk; k++)
                    {
                        __m128i _val = __lsx_vld(sptr + space_ofs[k] * 8, 0);
                        __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val);

                        __m128i _w = __lsx_vld(kptr, 0);
                        __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w);

                        __m128i _s0 = __lsx_vmul_h(_val16, _w16);

                        _sum = __lsx_vadd_w(_sum, __lsx_vhaddw_w_h(_s0, _s0));

                        kptr += 8;
                    }
                }

                outptr[j] = __lsx_reduce_add_w(_sum);
            }

            outptr += outw;
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_pack8to4_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void convolution_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                __m128i _sum0 = __lsx_vreplgr2vr_w(0);
                __m128i _sum1 = __lsx_vreplgr2vr_w(0);
                __m128i _sum2 = __lsx_vreplgr2vr_w(0);
                __m128i _sum3 = __lsx_vreplgr2vr_w(0);

                const signed char* kptr = weight_data_int8.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w * 8;

                    for (int k = 0; k < maxk; k++)
                    {
                        __m128i _val = __lsx_vld(sptr + space_ofs[k] * 8, 0);
                        __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val);

                        __m128i _w01 = __lsx_vld(kptr, 0);
                        __m128i _w23 = __lsx_vld(kptr + 16, 0);
                        __m128i _extw01 = __lsx_vslti_b(_w01, 0);
                        __m128i _extw23 = __lsx_vslti_b(_w23, 0);
                        __m128i _w0 = __lsx_vilvl_b(_extw01, _w01);
                        __m128i _w1 = __lsx_vilvh_b(_extw01, _w01);
                        __m128i _w2 = __lsx_vilvl_b(_extw23, _w23);
                        __m128i _w3 = __lsx_vilvh_b(_extw23, _w23);

                        __m128i _s0 = __lsx_vmul_h(_val16, _w0);
                        __m128i _s1 = __lsx_vmul_h(_val16, _w1);
                        __m128i _s2 = __lsx_vmul_h(_val16, _w2);
                        __m128i _s3 = __lsx_vmul_h(_val16, _w3);

                        _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0));
                        _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1));
                        _sum2 = __lsx_vadd_w(_sum2, __lsx_vhaddw_w_h(_s2, _s2));
                        _sum3 = __lsx_vadd_w(_sum3, __lsx_vhaddw_w_h(_s3, _s3));

                        kptr += 32;
                    }
                }

                // transpose 4x4
                {
                    __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = __lsx_vilvl_w(_sum1, _sum0);
                    _tmp1 = __lsx_vilvl_w(_sum3, _sum2);
                    _tmp2 = __lsx_vilvh_w(_sum1, _sum0);
                    _tmp3 = __lsx_vilvh_w(_sum3, _sum2);
                    _sum0 = __lsx_vilvl_d(_tmp1, _tmp0);
                    _sum1 = __lsx_vilvh_d(_tmp1, _tmp0);
                    _sum2 = __lsx_vilvl_d(_tmp3, _tmp2);
                    _sum3 = __lsx_vilvh_d(_tmp3, _tmp2);
                }

                _sum0 = __lsx_vadd_w(_sum0, _sum1);
                _sum2 = __lsx_vadd_w(_sum2, _sum3);

                _sum0 = __lsx_vadd_w(_sum0, _sum2);

                __lsx_vst(_sum0, outptr + j * 4, 0);
            }

            outptr += outw * 4;
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_sgemm.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// Multiply an im2col-expanded input by a pre-interleaved kernel and store the
// result (plus optional bias) into top_blob.
//
// bottom_im2col  read as w = size (output positions), h = maxk, c = inch
// top_blob       written one channel per output channel (outch = top_blob.c);
//                it is not allocated here, so the caller must have created it
// kernel         interleaved weights; the channel index formulas used below
//                (p / 8, p / 8 + (p % 8) / 4, ...) have to match the layout the
//                kernel was packed with
// _bias          optional per-output-channel bias; when the converted pointer
//                is null every accumulator starts from 0.f
// opt            provides workspace_allocator for the scratch buffer and
//                num_threads for the OpenMP loops
static void im2col_sgemm_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    // Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator);

    const int size = bottom_im2col.w;
    const int maxk = bottom_im2col.h;
    const int inch = bottom_im2col.c;

    const int outch = top_blob.c;

    const float* bias = _bias;

    // permute
    // Repack the input so that 4 consecutive output positions sit next to each
    // other for every (input channel, kernel tap) step. The channel count is
    // one channel per full group of 4 plus one channel per leftover position.
    Mat tmp;
    if (size >= 4)
        tmp.create(4 * maxk, inch, size / 4 + size % 4, 4u, 1, opt.workspace_allocator);
    else
        tmp.create(maxk, inch, size, 4u, 1, opt.workspace_allocator);
    {
        int nn_size = size / 4;

        // full groups of 4 output positions
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = ii * 4;

            float* tmpptr = tmp.channel(i / 4);

            for (int q = 0; q < inch; q++)
            {
                const float* img0 = (const float*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
#if __loongarch_sx
                    // copy 4 floats with one vector load/store
                    __lsx_vst(__lsx_vld(img0, 0), tmpptr, 0);
#else
                    tmpptr[0] = img0[0];
                    tmpptr[1] = img0[1];
                    tmpptr[2] = img0[2];
                    tmpptr[3] = img0[3];
#endif
                    // next kernel tap row of the im2col matrix
                    img0 += size;
                    tmpptr += 4;
                }
            }
        }

        int remain_size_start = nn_size * 4;

        // leftover output positions, one per channel of tmp
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_size_start; i < size; i++)
        {
            // i / 4 == nn_size here, i % 4 selects the leftover slot
            float* tmpptr = tmp.channel(i / 4 + i % 4);

            for (int q = 0; q < inch; q++)
            {
                const float* img0 = (const float*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];
                    img0 += size;
                    tmpptr += 1;
                }
            }
        }
    }

#if __loongarch_sx
    // ---- 8 output channels at a time ----
    int nn_outch = outch >> 3;
    int remain_outch_start = nn_outch << 3;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 8;

        float* outptr0 = top_blob.channel(p);
        float* outptr1 = top_blob.channel(p + 1);
        float* outptr2 = top_blob.channel(p + 2);
        float* outptr3 = top_blob.channel(p + 3);
        float* outptr4 = top_blob.channel(p + 4);
        float* outptr5 = top_blob.channel(p + 5);
        float* outptr6 = top_blob.channel(p + 6);
        float* outptr7 = top_blob.channel(p + 7);

        // fall back to an all-zero bias when none was supplied
        const float zeros[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
        const float* biasptr = bias ? bias + p : zeros;

        int i = 0;
        // 4 output positions x 8 output channels per iteration
        for (; i + 3 < size; i += 4)
        {
            const float* tmpptr = tmp.channel(i / 4);
            const float* kptr = kernel.channel(p / 8);

            int nn = inch * maxk; // inch always > 0

            // _sumN accumulates 4 output positions of output channel p + N
            __m128 _sum0 = __lsx_vreplfr2vr_s(biasptr[0]);
            __m128 _sum1 = __lsx_vreplfr2vr_s(biasptr[1]);
            __m128 _sum2 = __lsx_vreplfr2vr_s(biasptr[2]);
            __m128 _sum3 = __lsx_vreplfr2vr_s(biasptr[3]);
            __m128 _sum4 = __lsx_vreplfr2vr_s(biasptr[4]);
            __m128 _sum5 = __lsx_vreplfr2vr_s(biasptr[5]);
            __m128 _sum6 = __lsx_vreplfr2vr_s(biasptr[6]);
            __m128 _sum7 = __lsx_vreplfr2vr_s(biasptr[7]);

            for (int q = 0; q < nn; q++)
            {
                __builtin_prefetch(tmpptr + 16);
                __builtin_prefetch(kptr + 32);
                // 4 input values and the 8 weights that belong to this step
                __m128 _val = (__m128)__lsx_vld(tmpptr, 0);
                __m128i _w0123 = __lsx_vld(kptr, 0);
                __m128i _w4567 = __lsx_vld(kptr + 4, 0);
                // replicate one weight element across the vector, then fused multiply-add
                _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0);
                _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1);
                _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2);
                _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3);
                _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 0), _val, _sum4);
                _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 1), _val, _sum5);
                _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 2), _val, _sum6);
                _sum7 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 3), _val, _sum7);

                tmpptr += 4;
                kptr += 8;
            }

            __lsx_vst(_sum0, outptr0, 0);
            __lsx_vst(_sum1, outptr1, 0);
            __lsx_vst(_sum2, outptr2, 0);
            __lsx_vst(_sum3, outptr3, 0);
            __lsx_vst(_sum4, outptr4, 0);
            __lsx_vst(_sum5, outptr5, 0);
            __lsx_vst(_sum6, outptr6, 0);
            __lsx_vst(_sum7, outptr7, 0);

            outptr0 += 4;
            outptr1 += 4;
            outptr2 += 4;
            outptr3 += 4;
            outptr4 += 4;
            outptr5 += 4;
            outptr6 += 4;
            outptr7 += 4;
        }
        // leftover output positions: scalar, 1 position x 8 output channels
        for (; i < size; i++)
        {
            const float* tmpptr = tmp.channel(i / 4 + i % 4);
            const float* kptr = kernel.channel(p / 8);

            int nn = inch * maxk; // inch always > 0

            float sum0 = biasptr[0];
            float sum1 = biasptr[1];
            float sum2 = biasptr[2];
            float sum3 = biasptr[3];
            float sum4 = biasptr[4];
            float sum5 = biasptr[5];
            float sum6 = biasptr[6];
            float sum7 = biasptr[7];

            for (int q = 0; q < nn; q++)
            {
                sum0 += tmpptr[0] * kptr[0];
                sum1 += tmpptr[0] * kptr[1];
                sum2 += tmpptr[0] * kptr[2];
                sum3 += tmpptr[0] * kptr[3];
                sum4 += tmpptr[0] * kptr[4];
                sum5 += tmpptr[0] * kptr[5];
                sum6 += tmpptr[0] * kptr[6];
                sum7 += tmpptr[0] * kptr[7];
                tmpptr++;
                kptr += 8;
            }

            outptr0[0] = sum0;
            outptr1[0] = sum1;
            outptr2[0] = sum2;
            outptr3[0] = sum3;
            outptr4[0] = sum4;
            outptr5[0] = sum5;
            outptr6[0] = sum6;
            outptr7[0] = sum7;

            outptr0++;
            outptr1++;
            outptr2++;
            outptr3++;
            outptr4++;
            outptr5++;
            outptr6++;
            outptr7++;
        }
    }

    // ---- 4 output channels at a time, for what the 8-wide pass left over ----
    nn_outch = (outch - remain_outch_start) >> 2;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = remain_outch_start + pp * 4;

        float* outptr0 = top_blob.channel(p);
        float* outptr1 = top_blob.channel(p + 1);
        float* outptr2 = top_blob.channel(p + 2);
        float* outptr3 = top_blob.channel(p + 3);

        // fall back to an all-zero bias when none was supplied
        const float zeros[4] = {0.f, 0.f, 0.f, 0.f};
        const float* biasptr = bias ? bias + p : zeros;

        int i = 0;
        // 4 output positions x 4 output channels per iteration
        for (; i + 3 < size; i += 4)
        {
            const float* tmpptr = tmp.channel(i / 4);
            // kernel channel of the 4-wide group that follows the 8-wide groups
            const float* kptr = kernel.channel(p / 8 + (p % 8) / 4);

            int nn = inch * maxk; // inch always > 0

            __m128 _sum0 = __lsx_vreplfr2vr_s(biasptr[0]);
            __m128 _sum1 = __lsx_vreplfr2vr_s(biasptr[1]);
            __m128 _sum2 = __lsx_vreplfr2vr_s(biasptr[2]);
            __m128 _sum3 = __lsx_vreplfr2vr_s(biasptr[3]);

            for (int q = 0; q < nn; q++)
            {
                __builtin_prefetch(tmpptr + 16);
                __builtin_prefetch(kptr + 16);
                __m128 _val = (__m128)__lsx_vld(tmpptr, 0);
                __m128i _w0123 = __lsx_vld(kptr, 0);
                _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0);
                _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1);
                _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2);
                _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3);

                tmpptr += 4;
                kptr += 4;
            }

            __lsx_vst(_sum0, outptr0, 0);
            __lsx_vst(_sum1, outptr1, 0);
            __lsx_vst(_sum2, outptr2, 0);
            __lsx_vst(_sum3, outptr3, 0);

            outptr0 += 4;
            outptr1 += 4;
            outptr2 += 4;
            outptr3 += 4;
        }
        // leftover output positions: scalar, 1 position x 4 output channels
        for (; i < size; i++)
        {
            const float* tmpptr = tmp.channel(i / 4 + i % 4);
            const float* kptr = kernel.channel(p / 8 + (p % 8) / 4);

            int nn = inch * maxk; // inch always > 0

            float sum0 = biasptr[0];
            float sum1 = biasptr[1];
            float sum2 = biasptr[2];
            float sum3 = biasptr[3];

            for (int q = 0; q < nn; q++)
            {
                sum0 += tmpptr[0] * kptr[0];
                sum1 += tmpptr[0] * kptr[1];
                sum2 += tmpptr[0] * kptr[2];
                sum3 += tmpptr[0] * kptr[3];
                tmpptr++;
                kptr += 4;
            }

            outptr0[0] = sum0;
            outptr1[0] = sum1;
            outptr2[0] = sum2;
            outptr3[0] = sum3;

            outptr0++;
            outptr1++;
            outptr2++;
            outptr3++;
        }
    }

    // skip the output channels consumed by the 4-wide pass
    remain_outch_start += nn_outch << 2;
#else // __loongarch_sx
    // ---- scalar build: 2 output channels at a time ----
    int nn_outch = outch >> 1;
    int remain_outch_start = nn_outch << 1;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 2;

        float* outptr0 = top_blob.channel(p);
        float* outptr1 = top_blob.channel(p + 1);

        // fall back to an all-zero bias when none was supplied
        const float zeros[2] = {0.f, 0.f};
        const float* biasptr = bias ? bias + p : zeros;

        int i = 0;
        // 4 output positions x 2 output channels per iteration
        for (; i + 3 < size; i += 4)
        {
            const float* tmpptr = tmp.channel(i / 4);
            const float* kptr = kernel.channel(p / 2);

            int nn = inch * maxk; // inch always > 0

            // sumCJ: output channel p + C, output position i + J
            float sum00 = biasptr[0];
            float sum01 = biasptr[0];
            float sum02 = biasptr[0];
            float sum03 = biasptr[0];
            float sum10 = biasptr[1];
            float sum11 = biasptr[1];
            float sum12 = biasptr[1];
            float sum13 = biasptr[1];

            for (int q = 0; q < nn; q++)
            {
                __builtin_prefetch(tmpptr + 16);
                __builtin_prefetch(kptr + 8);
                float k0 = kptr[0];
                float k1 = kptr[1];
                sum00 += tmpptr[0] * k0;
                sum01 += tmpptr[1] * k0;
                sum02 += tmpptr[2] * k0;
                sum03 += tmpptr[3] * k0;
                sum10 += tmpptr[0] * k1;
                sum11 += tmpptr[1] * k1;
                sum12 += tmpptr[2] * k1;
                sum13 += tmpptr[3] * k1;
                tmpptr += 4;
                kptr += 2;
            }

            outptr0[0] = sum00;
            outptr0[1] = sum01;
            outptr0[2] = sum02;
            outptr0[3] = sum03;
            outptr1[0] = sum10;
            outptr1[1] = sum11;
            outptr1[2] = sum12;
            outptr1[3] = sum13;

            outptr0 += 4;
            outptr1 += 4;
        }
        // leftover output positions: 1 position x 2 output channels
        for (; i < size; i++)
        {
            const float* tmpptr = tmp.channel(i / 4 + i % 4);
            const float* kptr = kernel.channel(p / 2);

            int nn = inch * maxk; // inch always > 0

            float sum0 = biasptr[0];
            float sum1 = biasptr[1];

            for (int q = 0; q < nn; q++)
            {
                __builtin_prefetch(tmpptr + 4);
                __builtin_prefetch(kptr + 8);
                sum0 += tmpptr[0] * kptr[0];
                sum1 += tmpptr[0] * kptr[1];
                tmpptr++;
                kptr += 2;
            }

            outptr0[0] = sum0;
            outptr1[0] = sum1;

            outptr0++;
            outptr1++;
        }
    }
#endif // __loongarch_sx

    // ---- remaining output channels, one at a time ----
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        float* outptr0 = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        int i = 0;
        // 4 output positions per iteration
        for (; i + 3 < size; i += 4)
        {
            const float* tmpptr = tmp.channel(i / 4);
            // single-channel kernel slots come after the grouped ones
#if __loongarch_sx
            const float* kptr = kernel.channel(p / 8 + (p % 8) / 4 + p % 4);
#else
            const float* kptr = kernel.channel(p / 2 + p % 2);
#endif

            int nn = inch * maxk; // inch always > 0

#if __loongarch_sx
            __m128 _sum0 = __lsx_vreplfr2vr_s(bias0);

            for (int q = 0; q < nn; q++)
            {
                _sum0 = __lsx_vfmadd_s((__m128)__lsx_vld(tmpptr, 0), __lsx_vreplfr2vr_s(kptr[0]), _sum0);
                tmpptr += 4;
                kptr++;
            }

            __lsx_vst(_sum0, outptr0, 0);

            outptr0 += 4;
#else
            float sum0 = bias0;
            float sum1 = bias0;
            float sum2 = bias0;
            float sum3 = bias0;

            for (int q = 0; q < nn; q++)
            {
                __builtin_prefetch(tmpptr + 16);
                __builtin_prefetch(kptr + 4);
                sum0 += tmpptr[0] * kptr[0];
                sum1 += tmpptr[1] * kptr[0];
                sum2 += tmpptr[2] * kptr[0];
                sum3 += tmpptr[3] * kptr[0];
                tmpptr += 4;
                kptr++;
            }

            outptr0[0] = sum0;
            outptr0[1] = sum1;
            outptr0[2] = sum2;
            outptr0[3] = sum3;

            outptr0 += 4;
#endif // __loongarch_sx
        }
        // leftover output positions: plain dot product
        for (; i < size; i++)
        {
            const float* tmpptr = tmp.channel(i / 4 + i % 4);
#if __loongarch_sx
            const float* kptr = kernel.channel(p / 8 + (p % 8) / 4 + p % 4);
#else
            const float* kptr = kernel.channel(p / 2 + p % 2);
#endif

            int nn = inch * maxk; // inch always > 0

            float sum0 = bias0;

            for (int q = 0; q < nn; q++)
            {
                sum0 += tmpptr[0] * kptr[0];
                tmpptr++;
                kptr++;
            }

            outptr0[0] = sum0;

            outptr0++;
        }
    }
}

static void convolution_im2col_sgemm_transform_kernel_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
    const int maxk = kernel_w * kernel_h;

    // interleave
    // src = maxk-inch-outch
    // dst = 8b-maxk-inch-outch/8b
    Mat kernel = _kernel.reshape(maxk, inch, outch);
#if __loongarch_sx
    kernel_tm.create(8 * maxk, inch, outch / 8 + (outch % 8) / 4 + outch % 4);
#else
    kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2);
#endif

    int q = 0;
#if __loongarch_sx
    for (; q + 7 < outch; q += 8)
    {
        const Mat k0 = kernel.channel(q);
        const Mat k1 = kernel.channel(q + 1);
        const Mat k2 = kernel.channel(q + 2);
        const Mat k3 = kernel.channel(q + 3);
        const Mat k4 = kernel.channel(q + 4);
        const Mat k5 = kernel.channel(q + 5);
        const Mat k6 = kernel.channel(q + 6);
        const Mat k7 = kernel.channel(q + 7);

        float* g00 = kernel_tm.channel(q / 8);

        for (int p = 0; p < inch; p++)
        {
            const float* k00 = k0.row(p);
            const float* k10 = k1.row(p);
            const float* k20 = k2.row(p);
            const float* k30 = k3.row(p);
            const float* k40 = k4.row(p);
            const float* k50 = k5.row(p);
            const float* k60 = k6.row(p);
            const float* k70 = k7.row(p);

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = k00[k];
                g00[1] = k10[k];
                g00[2] = k20[k];
                g00[3] = k30[k];
                g00[4] = k40[k];
                g00[5] = k50[k];
                g00[6] = k60[k];
                g00[7] = k70[k];

                g00 += 8;
            }
        }
    }
    for (; q + 3 < outch; q += 4)
    {
        const Mat k0 = kernel.channel(q);
        const Mat k1 = kernel.channel(q + 1);
        const Mat k2 = kernel.channel(q + 2);
        const Mat k3 = kernel.channel(q + 3);

        float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4);

        for (int p = 0; p < inch; p++)
        {
            const float* k00 = k0.row(p);
            const float* k10 = k1.row(p);
            const float* k20 = k2.row(p);
            const float* k30 = k3.row(p);

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = k00[k];
                g00[1] = k10[k];
                g00[2] = k20[k];
                g00[3] = k30[k];

                g00 += 4;
            }
        }
    }
#else
    for (; q + 1 < outch; q += 2)
    {
        const Mat k0 = kernel.channel(q);
        const Mat k1 = kernel.channel(q + 1);

        float* g00 = kernel_tm.channel(q / 2);

        for (int p = 0; p < inch; p++)
        {
            const float* k00 = k0.row(p);
            const float* k10 = k1.row(p);

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = k00[k];
                g00[1] = k10[k];

                g00 += 2;
            }
        }
    }
#endif // __loongarch_sx
    for (; q < outch; q++)
    {
        const Mat k0 = kernel.channel(q);

#if __loongarch_sx
        float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + q % 4);
#else
        float* g00 = kernel_tm.channel(q / 2 + q % 2);
#endif

        for (int p = 0; p < inch; p++)
        {
            const float* k00 = k0.row(p);

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = k00[k];

                g00 += 1;
            }
        }
    }
}

static void convolution_im2col_sgemm_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    const int size = outw * outh;

    const int maxk = kernel_w * kernel_h;

    // im2col
    Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator);
    {
        const int gap = w * stride_h - outw * stride_w;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < inch; p++)
        {
            const Mat img = bottom_blob.channel(p);
            float* ptr = bottom_im2col.channel(p);

            for (int u = 0; u < kernel_h; u++)
            {
                for (int v = 0; v < kernel_w; v++)
                {
                    const float* sptr = img.row<const float>(dilation_h * u) + dilation_w * v;

                    for (int i = 0; i < outh; i++)
                    {
                        int j = 0;
                        for (; j < outw; j++)
                        {
                            ptr[0] = sptr[0];

                            sptr += stride_w;
                            ptr += 1;
                        }

                        sptr += gap;
                    }
                }
            }
        }
    }

    im2col_sgemm_lsx(bottom_im2col, top_blob, kernel, _bias, opt);
}


================================================
FILE: src/layer/loongarch/convolution_sgemm_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void im2col_sgemm_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);

    const int size = bottom_im2col.w;
    const int maxk = bottom_im2col.h;
    const int inch = bottom_im2col.c;

    const int outch = top_blob.c;

    // permute
    Mat tmp;
#if __loongarch_sx
    if (inch >= 4)
    {
        if (size >= 2)
            tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator);
        else
            tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator);
    }
    else
#endif // __loongarch_sx
    {
        if (size >= 2)
            tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator);
        else
            tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator);
    }
    {
        int remain_size_start = 0;
        int nn_size = (size - remain_size_start) >> 1;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 2;

            signed char* tmpptr = tmp.channel(i / 2);

            int q = 0;
#if __loongarch_sx
            for (; q + 3 < inch; q += 4)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
                const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
                const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
                const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];
                    tmpptr[1] = img1[0];
                    tmpptr[2] = img2[0];
                    tmpptr[3] = img3[0];
                    tmpptr[4] = img0[1];
                    tmpptr[5] = img1[1];
                    tmpptr[6] = img2[1];
                    tmpptr[7] = img3[1];
                    tmpptr += 8;

                    img0 += size;
                    img1 += size;
                    img2 += size;
                    img3 += size;
                }
            }
#endif // __loongarch_sx
            for (; q < inch; q++)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];
                    tmpptr[1] = img0[1];

                    tmpptr += 2;

                    img0 += size;
                }
            }
        }

        remain_size_start += nn_size << 1;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_size_start; i < size; i++)
        {
            signed char* tmpptr = tmp.channel(i / 2 + i % 2);

            int q = 0;
#if __loongarch_sx
            for (; q + 3 < inch; q += 4)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
                const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
                const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
                const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];
                    tmpptr[1] = img1[0];
                    tmpptr[2] = img2[0];
                    tmpptr[3] = img3[0];
                    tmpptr += 4;

                    img0 += size;
                    img1 += size;
                    img2 += size;
                    img3 += size;
                }
            }
#endif // __loongarch_sx
            for (; q < inch; q++)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];

                    tmpptr += 1;

                    img0 += size;
                }
            }
        }
    }

#if __loongarch_sx
    int nn_outch = outch >> 2;
    int remain_outch_start = nn_outch << 2;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 4;

        int* outptr0 = top_blob.channel(p);
        int* outptr1 = top_blob.channel(p + 1);
        int* outptr2 = top_blob.channel(p + 2);
        int* outptr3 = top_blob.channel(p + 3);

        int i = 0;
        for (; i + 1 < size; i += 2)
        {
            const signed char* tmpptr = tmp.channel(i / 2);
            const signed char* kptr = kernel.channel(p / 4);

            int nn4 = (inch / 4) * maxk;
            int nn1 = (inch % 4) * maxk;

            __m128i _sum00 = __lsx_vreplgr2vr_w(0);
            __m128i _sum10 = __lsx_vreplgr2vr_w(0);

            if (nn4 > 0)
            {
                __m128i _sum01 = __lsx_vreplgr2vr_w(0);
                __m128i _sum02 = __lsx_vreplgr2vr_w(0);
                __m128i _sum03 = __lsx_vreplgr2vr_w(0);
                __m128i _sum11 = __lsx_vreplgr2vr_w(0);
                __m128i _sum12 = __lsx_vreplgr2vr_w(0);
                __m128i _sum13 = __lsx_vreplgr2vr_w(0);

                int j = 0;
                for (; j < nn4; j++)
                {
                    __m128i _val = __lsx_vld(tmpptr, 0);
                    __m128i _val01 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val);

                    __m128i _val0 = __lsx_vilvl_d(_val01, _val01);
                    __m128i _val1 = __lsx_vilvh_d(_val01, _val01);

                    __m128i _w01 = __lsx_vld(kptr, 0);
                    __m128i _extw01 = __lsx_vslti_b(_w01, 0);
                    __m128i _w0 = __lsx_vilvl_b(_extw01, _w01);
                    __m128i _w1 = __lsx_vilvh_b(_extw01, _w01);

                    __m128i _s00 = __lsx_vmul_h(_val0, _w0);
                    __m128i _s01 = __lsx_vmul_h(_val0, _w1);
                    __m128i _s10 = __lsx_vmul_h(_val1, _w0);
                    __m128i _s11 = __lsx_vmul_h(_val1, _w1);

                    __m128i _exts00 = __lsx_vslti_h(_s00, 0);
                    __m128i _exts01 = __lsx_vslti_h(_s01, 0);
                    __m128i _exts10 = __lsx_vslti_h(_s10, 0);
                    __m128i _exts11 = __lsx_vslti_h(_s11, 0);
                    __m128i _s00l = __lsx_vilvl_h(_exts00, _s00);
                    __m128i _s00h = __lsx_vilvh_h(_exts00, _s00);
                    __m128i _s01l = __lsx_vilvl_h(_exts01, _s01);
                    __m128i _s01h = __lsx_vilvh_h(_exts01, _s01);
                    __m128i _s10l = __lsx_vilvl_h(_exts10, _s10);
                    __m128i _s10h = __lsx_vilvh_h(_exts10, _s10);
                    __m128i _s11l = __lsx_vilvl_h(_exts11, _s11);
                    __m128i _s11h = __lsx_vilvh_h(_exts11, _s11);

                    _sum00 = __lsx_vadd_w(_sum00, _s00l);
                    _sum01 = __lsx_vadd_w(_sum01, _s00h);
                    _sum02 = __lsx_vadd_w(_sum02, _s01l);
                    _sum03 = __lsx_vadd_w(_sum03, _s01h);
                    _sum10 = __lsx_vadd_w(_sum10, _s10l);
                    _sum11 = __lsx_vadd_w(_sum11, _s10h);
                    _sum12 = __lsx_vadd_w(_sum12, _s11l);
                    _sum13 = __lsx_vadd_w(_sum13, _s11h);

                    tmpptr += 8;
                    kptr += 16;
                }

                // transpose 4x4
                {
                    __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = __lsx_vilvl_w(_sum01, _sum00);
                    _tmp1 = __lsx_vilvl_w(_sum03, _sum02);
                    _tmp2 = __lsx_vilvh_w(_sum01, _sum00);
                    _tmp3 = __lsx_vilvh_w(_sum03, _sum02);
                    _sum00 = __lsx_vilvl_d(_tmp1, _tmp0);
                    _sum01 = __lsx_vilvh_d(_tmp1, _tmp0);
                    _sum02 = __lsx_vilvl_d(_tmp3, _tmp2);
                    _sum03 = __lsx_vilvh_d(_tmp3, _tmp2);
                }
                {
                    __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = __lsx_vilvl_w(_sum11, _sum10);
                    _tmp1 = __lsx_vilvl_w(_sum13, _sum12);
                    _tmp2 = __lsx_vilvh_w(_sum11, _sum10);
                    _tmp3 = __lsx_vilvh_w(_sum13, _sum12);
                    _sum10 = __lsx_vilvl_d(_tmp1, _tmp0);
                    _sum11 = __lsx_vilvh_d(_tmp1, _tmp0);
                    _sum12 = __lsx_vilvl_d(_tmp3, _tmp2);
                    _sum13 = __lsx_vilvh_d(_tmp3, _tmp2);
                }

                _sum00 = __lsx_vadd_w(_sum00, _sum01);
                _sum02 = __lsx_vadd_w(_sum02, _sum03);
                _sum10 = __lsx_vadd_w(_sum10, _sum11);
                _sum12 = __lsx_vadd_w(_sum12, _sum13);

                _sum00 = __lsx_vadd_w(_sum00, _sum02);
                _sum10 = __lsx_vadd_w(_sum10, _sum12);
            }

            int j = 0;
            for (; j < nn1; j++)
            {
                __m128i _val0 = __lsx_vreplgr2vr_h(tmpptr[0]);
                __m128i _val1 = __lsx_vreplgr2vr_h(tmpptr[1]);
                __m128i _val = __lsx_vilvl_d(_val1, _val0);

                __m128i _w = __lsx_vld(kptr, 0);
                __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w);

                _w16 = __lsx_vilvl_d(_w16, _w16);

                __m128i _s0 = __lsx_vmul_h(_val, _w16);
                __m128i _exts0 = __lsx_vslti_h(_s0, 0);
                __m128i _s0l = __lsx_vilvl_h(_exts0, _s0);
                __m128i _s0h = __lsx_vilvh_h(_exts0, _s0);

                _sum00 = __lsx_vadd_w(_sum00, _s0l);
                _sum10 = __lsx_vadd_w(_sum10, _s0h);

                tmpptr += 2;
                kptr += 4;
            }

            int sum[8];
            __lsx_vst(_sum00, sum, 0);
            __lsx_vst(_sum10, sum + 4, 0);

            outptr0[0] = sum[0];
            outptr1[0] = sum[1];
            outptr2[0] = sum[2];
            outptr3[0] = sum[3];
            outptr0[1] = sum[4];
            outptr1[1] = sum[5];
            outptr2[1] = sum[6];
            outptr3[1] = sum[7];
            outptr0 += 2;
            outptr1 += 2;
            outptr2 += 2;
            outptr3 += 2;
        }
        for (; i < size; i++)
        {
            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
            const signed char* kptr = kernel.channel(p / 4);

            int nn4 = (inch / 4) * maxk;
            int nn1 = (inch % 4) * maxk;

            __m128i _sum0 = __lsx_vreplgr2vr_w(0);

            if (nn4 > 0)
            {
                __m128i _sum1 = __lsx_vreplgr2vr_w(0);
                __m128i _sum2 = __lsx_vreplgr2vr_w(0);
                __m128i _sum3 = __lsx_vreplgr2vr_w(0);

                int j = 0;
                for (; j < nn4; j++)
                {
                    __m128i _val = __lsx_vld(tmpptr, 0);
                    __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val);

                    _val16 = __lsx_vilvl_d(_val16, _val16);

                    __m128i _w01 = __lsx_vld(kptr, 0);
                    __m128i _extw01 = __lsx_vslti_b(_w01, 0);
                    __m128i _w0 = __lsx_vilvl_b(_extw01, _w01);
                    __m128i _w1 = __lsx_vilvh_b(_extw01, _w01);

                    __m128i _s0 = __lsx_vmul_h(_val16, _w0);
                    __m128i _s1 = __lsx_vmul_h(_val16, _w1);

                    __m128i _exts0 = __lsx_vslti_h(_s0, 0);
                    __m128i _exts1 = __lsx_vslti_h(_s1, 0);
                    __m128i _s0l = __lsx_vilvl_h(_exts0, _s0);
                    __m128i _s0h = __lsx_vilvh_h(_exts0, _s0);
                    __m128i _s1l = __lsx_vilvl_h(_exts1, _s1);
                    __m128i _s1h = __lsx_vilvh_h(_exts1, _s1);

                    _sum0 = __lsx_vadd_w(_sum0, _s0l);
                    _sum1 = __lsx_vadd_w(_sum1, _s0h);
                    _sum2 = __lsx_vadd_w(_sum2, _s1l);
                    _sum3 = __lsx_vadd_w(_sum3, _s1h);

                    tmpptr += 4;
                    kptr += 16;
                }

                // transpose 4x4
                {
                    __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = __lsx_vilvl_w(_sum1, _sum0);
                    _tmp1 = __lsx_vilvl_w(_sum3, _sum2);
                    _tmp2 = __lsx_vilvh_w(_sum1, _sum0);
                    _tmp3 = __lsx_vilvh_w(_sum3, _sum2);
                    _sum0 = __lsx_vilvl_d(_tmp1, _tmp0);
                    _sum1 = __lsx_vilvh_d(_tmp1, _tmp0);
                    _sum2 = __lsx_vilvl_d(_tmp3, _tmp2);
                    _sum3 = __lsx_vilvh_d(_tmp3, _tmp2);
                }

                _sum0 = __lsx_vadd_w(_sum0, _sum1);
                _sum2 = __lsx_vadd_w(_sum2, _sum3);
                _sum0 = __lsx_vadd_w(_sum0, _sum2);
            }
            int j = 0;
            for (; j < nn1; j++)
            {
                __m128i _val = __lsx_vreplgr2vr_h(tmpptr[0]);

                __m128i _w = __lsx_vld(kptr, 0);
                __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w);

                __m128i _s0 = __lsx_vmul_h(_val, _w16);
                __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0);

                _sum0 = __lsx_vadd_w(_sum0, _s032);

                tmpptr += 1;
                kptr += 4;
            }

            int sum[4];
            __lsx_vst(_sum0, sum, 0);

            outptr0[0] = sum[0];
            outptr1[0] = sum[1];
            outptr2[0] = sum[2];
            outptr3[0] = sum[3];
            outptr0 += 1;
            outptr1 += 1;
            outptr2 += 1;
            outptr3 += 1;
        }
    }
#else // __loongarch_sx
    int nn_outch = outch >> 1;
    int remain_outch_start = nn_outch << 1;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 2;

        int* outptr0 = top_blob.channel(p);
        int* outptr1 = top_blob.channel(p + 1);

        int i = 0;
        for (; i + 1 < size; i += 2)
        {
            const signed char* tmpptr = tmp.channel(i / 2);
            const signed char* kptr = kernel.channel(p / 2);

            int sum00 = 0;
            int sum01 = 0;
            int sum10 = 0;
            int sum11 = 0;

            int nn1 = inch * maxk;

            int j = 0;
            for (; j < nn1; j++)
            {
                signed char val0 = tmpptr[0];
                signed char val1 = tmpptr[1];
                signed char w0 = kptr[0];
                signed char w1 = kptr[1];

                sum00 += val0 * w0;
                sum01 += val1 * w0;
                sum10 += val0 * w1;
                sum11 += val1 * w1;

                tmpptr += 2;
                kptr += 2;
            }

            outptr0[0] = sum00;
            outptr0[1] = sum01;
            outptr1[0] = sum10;
            outptr1[1] = sum11;
            outptr0 += 2;
            outptr1 += 2;
        }
        for (; i < size; i++)
        {
            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
            const signed char* kptr = kernel.channel(p / 2);

            int sum00 = 0;
            int sum10 = 0;

            int nn1 = inch * maxk;

            int j = 0;
            for (; j < nn1; j++)
            {
                signed char val0 = tmpptr[0];
                signed char w0 = kptr[0];
                signed char w1 = kptr[1];

                sum00 += val0 * w0;
                sum10 += val0 * w1;

                tmpptr += 1;
                kptr += 2;
            }

            outptr0[0] = sum00;
            outptr1[0] = sum10;
            outptr0 += 1;
            outptr1 += 1;
        }
    }
#endif // __loongarch_sx

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        int* outptr0 = top_blob.channel(p);

        int i = 0;
        for (; i + 1 < size; i += 2)
        {
            const signed char* tmpptr = tmp.channel(i / 2);
#if __loongarch_sx
            const signed char* kptr = kernel.channel(p / 4 + p % 4);
#else
            const signed char* kptr = kernel.channel(p / 2 + p % 2);
#endif

            int sum0 = 0;
            int sum1 = 0;

#if __loongarch_sx
            int nn4 = (inch / 4) * maxk;
            int nn1 = (inch % 4) * maxk;

            if (nn4 > 0)
            {
                __m128i _sum0 = __lsx_vreplgr2vr_w(0);
                __m128i _sum1 = __lsx_vreplgr2vr_w(0);

                int j = 0;
                for (; j < nn4; j++)
                {
                    __m128i _val = __lsx_vld(tmpptr, 0);
                    __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val);

                    __m128i _w = __lsx_vld(kptr, 0);
                    __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w);

                    _w16 = __lsx_vilvl_d(_w16, _w16);

                    __m128i _s0 = __lsx_vmul_h(_val16, _w16);
                    __m128i _exts0 = __lsx_vslti_h(_s0, 0);
                    __m128i _s0l = __lsx_vilvl_h(_exts0, _s0);
                    __m128i _s0h = __lsx_vilvh_h(_exts0, _s0);

                    _sum0 = __lsx_vadd_w(_sum0, _s0l);
                    _sum1 = __lsx_vadd_w(_sum1, _s0h);

                    tmpptr += 8;
                    kptr += 4;
                }

                sum0 = __lsx_reduce_add_w(_sum0);
                sum1 = __lsx_reduce_add_w(_sum1);
            }
#else
            int nn1 = inch * maxk;
#endif // __loongarch_sx

            int j = 0;
            for (; j < nn1; j++)
            {
                signed char val0 = tmpptr[0];
                signed char val1 = tmpptr[1];
                signed char w = kptr[0];

                sum0 += val0 * w;
                sum1 += val1 * w;

                tmpptr += 2;
                kptr += 1;
            }

            outptr0[0] = sum0;
            outptr0[1] = sum1;
            outptr0 += 2;
        }
        for (; i < size; i++)
        {
            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
#if __loongarch_sx
            const signed char* kptr = kernel.channel(p / 4 + p % 4);
#else
            const signed char* kptr = kernel.channel(p / 2 + p % 2);
#endif

            int sum = 0;

#if __loongarch_sx
            int nn4 = (inch / 4) * maxk;
            int nn1 = (inch % 4) * maxk;

            if (nn4 > 0)
            {
                __m128i _sum = __lsx_vreplgr2vr_w(0);

                int j = 0;
                for (; j < nn4; j++)
                {
                    __m128i _val = __lsx_vld(tmpptr, 0);
                    __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val);

                    __m128i _w = __lsx_vld(kptr, 0);
                    __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w);

                    __m128i _s0 = __lsx_vmul_h(_val16, _w16);
                    __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0);

                    _sum = __lsx_vadd_w(_sum, _s032);

                    tmpptr += 4;
                    kptr += 4;
                }

                sum = __lsx_reduce_add_w(_sum);
            }
#else
            int nn1 = inch * maxk;
#endif // __loongarch_sx

            int j = 0;
            for (; j < nn1; j++)
            {
                signed char val = tmpptr[0];
                signed char w = kptr[0];

                sum += val * w;

                tmpptr += 1;
                kptr += 1;
            }

            outptr0[0] = sum;
            outptr0 += 1;
        }
    }
}

static void convolution_im2col_sgemm_transform_kernel_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
    const int maxk = kernel_w * kernel_h;

    // interleave
    // src = maxk-inch-outch
    // dst = 4a-4b-maxk-inch/4a-outch/4b
    Mat kernel = _kernel.reshape(maxk, inch, outch);
#if __loongarch_sx
    if (outch >= 4)
    {
        if (inch >= 4)
            kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4 + outch % 4, (size_t)1u);
        else
            kernel_tm.create(4 * maxk, inch, outch / 4 + outch % 4, (size_t)1u);
    }
#else
    if (outch >= 2)
    {
        kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2, (size_t)1u);
    }
#endif // __loongarch_sx
    else
    {
#if __loongarch_sx
        if (inch >= 4)
            kernel_tm.create(4 * maxk, inch / 4 + inch % 4, outch, (size_t)1u);
        else
#endif // __loongarch_sx
        {
            kernel_tm.create(1 * maxk, inch, outch, (size_t)1u);
        }
    }

    int q = 0;
#if __loongarch_sx
    for (; q + 3 < outch; q += 4)
    {
        signed char* g00 = kernel_tm.channel(q / 4);

        int p = 0;
        for (; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int i = 0; i < 4; i++)
                {
                    for (int j = 0; j < 4; j++)
                    {
                        const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j);
                        g00[0] = k00[k];
                        g00++;
                    }
                }
            }
        }
        for (; p < inch; p++)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int i = 0; i < 4; i++)
                {
                    const signed char* k00 = kernel.channel(q + i).row<const signed char>(p);
                    g00[0] = k00[k];
                    g00++;
                }
            }
        }
    }
#else  // __loongarch_sx
    for (; q + 1 < outch; q += 2)
    {
        signed char* g00 = kernel_tm.channel(q / 2);

        int p = 0;
        for (; p < inch; p++)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int i = 0; i < 2; i++)
                {
                    const signed char* k00 = kernel.channel(q + i).row<const signed char>(p);
                    g00[0] = k00[k];
                    g00++;
                }
            }
        }
    }
#endif // __loongarch_sx
    for (; q < outch; q++)
    {
#if __loongarch_sx
        signed char* g00 = kernel_tm.channel(q / 4 + q % 4);
#else
        signed char* g00 = kernel_tm.channel(q / 2 + q % 2);
#endif

        int p = 0;
#if __loongarch_sx
        for (; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int j = 0; j < 4; j++)
                {
                    const signed char* k00 = kernel.channel(q).row<const signed char>(p + j);
                    g00[0] = k00[k];
                    g00++;
                }
            }
        }
#endif // __loongarch_sx
        for (; p < inch; p++)
        {
            for (int k = 0; k < maxk; k++)
            {
                const signed char* k00 = kernel.channel(q).row<const signed char>(p);
                g00[0] = k00[k];
                g00++;
            }
        }
    }
}

static void convolution_im2col_sgemm_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    // int8 convolution: expand the input with im2col, then hand the expanded
    // matrix to im2col_sgemm_int8_lsx.
    const int w = bottom_blob.w;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    const int maxk = kernel_w * kernel_h;
    const int size = outw * outh;

    // im2col: w = output pixels, h = kernel taps, c = input channels
    Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);

    // One output row consumes outw * stride_w source elements; adding this
    // skip makes the total advance per output row equal to w * stride_h.
    const int row_skip = w * stride_h - outw * stride_w;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < inch; p++)
    {
        const Mat img = bottom_blob.channel(p);
        signed char* dst = bottom_im2col.channel(p);

        for (int ky = 0; ky < kernel_h; ky++)
        {
            for (int kx = 0; kx < kernel_w; kx++)
            {
                const signed char* src = img.row<const signed char>(dilation_h * ky) + dilation_w * kx;

                for (int oy = 0; oy < outh; oy++)
                {
                    int ox = 0;

                    // four output columns per step
                    while (ox + 3 < outw)
                    {
                        dst[0] = src[0];
                        dst[1] = src[stride_w];
                        dst[2] = src[stride_w * 2];
                        dst[3] = src[stride_w * 3];

                        src += stride_w * 4;
                        dst += 4;
                        ox += 4;
                    }

                    // up to three leftover columns
                    while (ox < outw)
                    {
                        *dst++ = *src;
                        src += stride_w;
                        ox++;
                    }

                    src += row_skip;
                }
            }
        }
    }

    im2col_sgemm_int8_lsx(bottom_im2col, top_blob, kernel, opt);
}


================================================
FILE: src/layer/loongarch/convolution_sgemm_pack1to4_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// int8 GEMM over an im2col matrix, producing 4 int32 sums per output pixel for
// every channel of top_blob.
//   bottom_im2col  w = output pixels (size), h = kernel taps (maxk), c = input channels
//   top_blob       each channel receives 4 consecutive int32 values per pixel
//   kernel         one channel per top_blob channel; 16 int8 weights are consumed per
//                  packed step (4 input channels) and 4 per leftover input channel
//   opt            supplies the thread count and the workspace allocator
static void im2col_sgemm_pack1to4_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    // The caller in this file builds bottom_im2col as
    // Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);

    const int size = bottom_im2col.w;
    const int maxk = bottom_im2col.h;
    const int inch = bottom_im2col.c;

    const int outch = top_blob.c;

    // permute
    // tmp gets one channel per pixel pair plus one for an odd last pixel.
    // When inch >= 4, four input channels are stored side by side as one 4-byte element.
    Mat tmp;
    if (inch >= 4)
    {
        if (size >= 2)
            tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator);
        else
            tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator);
    }
    else
    {
        if (size >= 2)
            tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator);
        else
            tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator);
    }
    {
        int remain_size_start = 0;
        int nn_size = (size - remain_size_start) >> 1;

        // pixel pairs
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 2;

            signed char* tmpptr = tmp.channel(i / 2);

            int q = 0;
            for (; q + 3 < inch; q += 4)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
                const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
                const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
                const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;

                for (int k = 0; k < maxk; k++)
                {
                    // 8 bytes per tap: 4 input channels of pixel i, then 4 of pixel i + 1
                    tmpptr[0] = img0[0];
                    tmpptr[1] = img1[0];
                    tmpptr[2] = img2[0];
                    tmpptr[3] = img3[0];
                    tmpptr[4] = img0[1];
                    tmpptr[5] = img1[1];
                    tmpptr[6] = img2[1];
                    tmpptr[7] = img3[1];
                    tmpptr += 8;

                    // next kernel tap row of the im2col channel
                    img0 += size;
                    img1 += size;
                    img2 += size;
                    img3 += size;
                }
            }
            // leftover input channels: 2 bytes per tap (pixel i, pixel i + 1)
            for (; q < inch; q++)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];
                    tmpptr[1] = img0[1];

                    tmpptr += 2;

                    img0 += size;
                }
            }
        }

        remain_size_start += nn_size << 1;

        // odd last pixel, if any
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_size_start; i < size; i++)
        {
            signed char* tmpptr = tmp.channel(i / 2 + i % 2);

            int q = 0;
            for (; q + 3 < inch; q += 4)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;
                const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i;
                const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i;
                const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];
                    tmpptr[1] = img1[0];
                    tmpptr[2] = img2[0];
                    tmpptr[3] = img3[0];
                    tmpptr += 4;

                    img0 += size;
                    img1 += size;
                    img2 += size;
                    img3 += size;
                }
            }
            for (; q < inch; q++)
            {
                const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];

                    tmpptr += 1;

                    img0 += size;
                }
            }
        }
    }

    // multiply-accumulate, one top_blob channel (4 outputs per pixel) per iteration
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr0 = top_blob.channel(p);

        int i = 0;
        // two pixels at a time
        for (; i + 1 < size; i += 2)
        {
            const signed char* tmpptr = tmp.channel(i / 2);
            const signed char* kptr = kernel.channel(p);

            // nn4 counts steps that cover 4 input channels, nn1 counts single-channel steps
            int nn4 = (inch / 4) * maxk;
            int nn1 = (inch % 4) * maxk;

            // _sum00 / _sum10 end up holding the 4 outputs of pixel i / pixel i + 1
            __m128i _sum00 = __lsx_vreplgr2vr_w(0);
            __m128i _sum10 = __lsx_vreplgr2vr_w(0);

            if (nn4 > 0)
            {
                // During the loop, _sumXY holds, for pixel X and output Y, one partial
                // sum per input channel lane; the lanes are folded after the loop.
                __m128i _sum01 = __lsx_vreplgr2vr_w(0);
                __m128i _sum02 = __lsx_vreplgr2vr_w(0);
                __m128i _sum03 = __lsx_vreplgr2vr_w(0);
                __m128i _sum11 = __lsx_vreplgr2vr_w(0);
                __m128i _sum12 = __lsx_vreplgr2vr_w(0);
                __m128i _sum13 = __lsx_vreplgr2vr_w(0);

                int j = 0;
                for (; j < nn4; j++)
                {
                    __builtin_prefetch(tmpptr + 32);
                    __builtin_prefetch(kptr + 64);
                    // Only the low 8 bytes are used (4 channels x 2 pixels); interleaving
                    // with the "< 0" mask sign-extends them to 16 bit.
                    __m128i _val = __lsx_vld(tmpptr, 0);
                    __m128i _val01 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val);

                    // broadcast each pixel's 4 values into both 64-bit halves
                    __m128i _val0 = __lsx_vilvl_d(_val01, _val01);
                    __m128i _val1 = __lsx_vilvh_d(_val01, _val01);

                    // 16 weights, sign-extended: _w0 = outputs 0-1, _w1 = outputs 2-3
                    __m128i _w01 = __lsx_vld(kptr, 0);
                    __m128i _extw01 = __lsx_vslti_b(_w01, 0);
                    __m128i _w0 = __lsx_vilvl_b(_extw01, _w01);
                    __m128i _w1 = __lsx_vilvh_b(_extw01, _w01);

                    // 16-bit products (an int8 x int8 product fits in int16)
                    __m128i _s00 = __lsx_vmul_h(_val0, _w0);
                    __m128i _s01 = __lsx_vmul_h(_val0, _w1);
                    __m128i _s10 = __lsx_vmul_h(_val1, _w0);
                    __m128i _s11 = __lsx_vmul_h(_val1, _w1);

                    // sign-extend the products to 32 bit before accumulating
                    __m128i _exts00 = __lsx_vslti_h(_s00, 0);
                    __m128i _exts01 = __lsx_vslti_h(_s01, 0);
                    __m128i _exts10 = __lsx_vslti_h(_s10, 0);
                    __m128i _exts11 = __lsx_vslti_h(_s11, 0);
                    __m128i _s00l = __lsx_vilvl_h(_exts00, _s00);
                    __m128i _s00h = __lsx_vilvh_h(_exts00, _s00);
                    __m128i _s01l = __lsx_vilvl_h(_exts01, _s01);
                    __m128i _s01h = __lsx_vilvh_h(_exts01, _s01);
                    __m128i _s10l = __lsx_vilvl_h(_exts10, _s10);
                    __m128i _s10h = __lsx_vilvh_h(_exts10, _s10);
                    __m128i _s11l = __lsx_vilvl_h(_exts11, _s11);
                    __m128i _s11h = __lsx_vilvh_h(_exts11, _s11);

                    _sum00 = __lsx_vadd_w(_sum00, _s00l);
                    _sum01 = __lsx_vadd_w(_sum01, _s00h);
                    _sum02 = __lsx_vadd_w(_sum02, _s01l);
                    _sum03 = __lsx_vadd_w(_sum03, _s01h);
                    _sum10 = __lsx_vadd_w(_sum10, _s10l);
                    _sum11 = __lsx_vadd_w(_sum11, _s10h);
                    _sum12 = __lsx_vadd_w(_sum12, _s11l);
                    _sum13 = __lsx_vadd_w(_sum13, _s11h);

                    tmpptr += 8;
                    kptr += 16;
                }

                // transpose 4x4
                // so that the adds below fold the input channel lanes and leave
                // one lane per output
                {
                    __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = __lsx_vilvl_w(_sum01, _sum00);
                    _tmp1 = __lsx_vilvl_w(_sum03, _sum02);
                    _tmp2 = __lsx_vilvh_w(_sum01, _sum00);
                    _tmp3 = __lsx_vilvh_w(_sum03, _sum02);
                    _sum00 = __lsx_vilvl_d(_tmp1, _tmp0);
                    _sum01 = __lsx_vilvh_d(_tmp1, _tmp0);
                    _sum02 = __lsx_vilvl_d(_tmp3, _tmp2);
                    _sum03 = __lsx_vilvh_d(_tmp3, _tmp2);
                }
                {
                    __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = __lsx_vilvl_w(_sum11, _sum10);
                    _tmp1 = __lsx_vilvl_w(_sum13, _sum12);
                    _tmp2 = __lsx_vilvh_w(_sum11, _sum10);
                    _tmp3 = __lsx_vilvh_w(_sum13, _sum12);
                    _sum10 = __lsx_vilvl_d(_tmp1, _tmp0);
                    _sum11 = __lsx_vilvh_d(_tmp1, _tmp0);
                    _sum12 = __lsx_vilvl_d(_tmp3, _tmp2);
                    _sum13 = __lsx_vilvh_d(_tmp3, _tmp2);
                }

                _sum00 = __lsx_vadd_w(_sum00, _sum01);
                _sum02 = __lsx_vadd_w(_sum02, _sum03);
                _sum10 = __lsx_vadd_w(_sum10, _sum11);
                _sum12 = __lsx_vadd_w(_sum12, _sum13);

                _sum00 = __lsx_vadd_w(_sum00, _sum02);
                _sum10 = __lsx_vadd_w(_sum10, _sum12);
            }

            // leftover input channels, one at a time
            int j = 0;
            for (; j < nn1; j++)
            {
                // low half = pixel i replicated, high half = pixel i + 1 replicated
                __m128i _val0 = __lsx_vreplgr2vr_h(tmpptr[0]);
                __m128i _val1 = __lsx_vreplgr2vr_h(tmpptr[1]);
                __m128i _val = __lsx_vilvl_d(_val1, _val0);

                // only the low 4 bytes (one weight per output) are used
                __m128i _w = __lsx_vld(kptr, 0);
                __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w);

                _w16 = __lsx_vilvl_d(_w16, _w16);

                __m128i _s0 = __lsx_vmul_h(_val, _w16);
                __m128i _exts0 = __lsx_vslti_h(_s0, 0);
                __m128i _s0l = __lsx_vilvl_h(_exts0, _s0);
                __m128i _s0h = __lsx_vilvh_h(_exts0, _s0);

                _sum00 = __lsx_vadd_w(_sum00, _s0l);
                _sum10 = __lsx_vadd_w(_sum10, _s0h);

                tmpptr += 2;
                kptr += 4;
            }

            // 4 int32 for pixel i followed by 4 int32 for pixel i + 1
            __lsx_vst(_sum00, outptr0, 0);
            __lsx_vst(_sum10, outptr0 + 4, 0);
            outptr0 += 8;
        }
        // odd last pixel
        for (; i < size; i++)
        {
            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
            const signed char* kptr = kernel.channel(p);

            int nn4 = (inch / 4) * maxk;
            int nn1 = (inch % 4) * maxk;

            __m128i _sum0 = __lsx_vreplgr2vr_w(0);

            if (nn4 > 0)
            {
                __m128i _sum1 = __lsx_vreplgr2vr_w(0);
                __m128i _sum2 = __lsx_vreplgr2vr_w(0);
                __m128i _sum3 = __lsx_vreplgr2vr_w(0);

                int j = 0;
                for (; j < nn4; j++)
                {
                    __builtin_prefetch(tmpptr + 16);
                    __builtin_prefetch(kptr + 64);
                    // only the low 4 bytes (4 input channels of this pixel) are used
                    __m128i _val = __lsx_vld(tmpptr, 0);
                    __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val);

                    _val16 = __lsx_vilvl_d(_val16, _val16);

                    __m128i _w01 = __lsx_vld(kptr, 0);
                    __m128i _extw01 = __lsx_vslti_b(_w01, 0);
                    __m128i _w0 = __lsx_vilvl_b(_extw01, _w01);
                    __m128i _w1 = __lsx_vilvh_b(_extw01, _w01);

                    __m128i _s0 = __lsx_vmul_h(_val16, _w0);
                    __m128i _s1 = __lsx_vmul_h(_val16, _w1);

                    __m128i _exts0 = __lsx_vslti_h(_s0, 0);
                    __m128i _exts1 = __lsx_vslti_h(_s1, 0);
                    __m128i _s0l = __lsx_vilvl_h(_exts0, _s0);
                    __m128i _s0h = __lsx_vilvh_h(_exts0, _s0);
                    __m128i _s1l = __lsx_vilvl_h(_exts1, _s1);
                    __m128i _s1h = __lsx_vilvh_h(_exts1, _s1);

                    _sum0 = __lsx_vadd_w(_sum0, _s0l);
                    _sum1 = __lsx_vadd_w(_sum1, _s0h);
                    _sum2 = __lsx_vadd_w(_sum2, _s1l);
                    _sum3 = __lsx_vadd_w(_sum3, _s1h);

                    tmpptr += 4;
                    kptr += 16;
                }

                // transpose 4x4
                {
                    __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = __lsx_vilvl_w(_sum1, _sum0);
                    _tmp1 = __lsx_vilvl_w(_sum3, _sum2);
                    _tmp2 = __lsx_vilvh_w(_sum1, _sum0);
                    _tmp3 = __lsx_vilvh_w(_sum3, _sum2);
                    _sum0 = __lsx_vilvl_d(_tmp1, _tmp0);
                    _sum1 = __lsx_vilvh_d(_tmp1, _tmp0);
                    _sum2 = __lsx_vilvl_d(_tmp3, _tmp2);
                    _sum3 = __lsx_vilvh_d(_tmp3, _tmp2);
                }

                _sum0 = __lsx_vadd_w(_sum0, _sum1);
                _sum2 = __lsx_vadd_w(_sum2, _sum3);
                _sum0 = __lsx_vadd_w(_sum0, _sum2);
            }

            int j = 0;
            for (; j < nn1; j++)
            {
                __m128i _val = __lsx_vreplgr2vr_h(tmpptr[0]);

                __m128i _w = __lsx_vld(kptr, 0);
                __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w);

                __m128i _s0 = __lsx_vmul_h(_val, _w16);
                __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0);

                _sum0 = __lsx_vadd_w(_sum0, _s032);

                tmpptr += 1;
                kptr += 4;
            }

            __lsx_vst(_sum0, outptr0, 0);
            outptr0 += 4;
        }
    }
}

static void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
    // Repack int8 weights so that every group of four output channels becomes
    // one channel of kernel_tm.
    //
    // src = maxk-inch-outch
    // dst = 4a-4b-maxk-inch/4a-outch/4b   (a: input channel, b: output channel)
    //
    // Output channels beyond the last full group of four are not copied.
    const int maxk = kernel_w * kernel_h;

    Mat kernel = _kernel.reshape(maxk, inch, outch);

    const bool pack_inch = inch >= 4;
    const int tm_w = (pack_inch ? 16 : 4) * maxk;
    const int tm_h = pack_inch ? inch / 4 + inch % 4 : inch;
    kernel_tm.create(tm_w, tm_h, outch / 4, (size_t)1u);

    const int groups = outch / 4;
    for (int g = 0; g < groups; g++)
    {
        const int oc = g * 4;

        signed char* dst = kernel_tm.channel(g);

        int ic = 0;
        // four input channels at a time: 16 weights per kernel tap,
        // output channel major, input channel minor
        for (; ic + 3 < inch; ic += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int n = 0; n < 16; n++)
                {
                    const int o = n >> 2;
                    const int c = n & 3;
                    *dst++ = kernel.channel(oc + o).row<const signed char>(ic + c)[k];
                }
            }
        }
        // leftover input channels: 4 weights per kernel tap
        for (; ic < inch; ic++)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int o = 0; o < 4; o++)
                {
                    *dst++ = kernel.channel(oc + o).row<const signed char>(ic)[k];
                }
            }
        }
    }
}

static void convolution_im2col_sgemm_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    // int8 convolution: expand the input with im2col, then hand the expanded
    // matrix to im2col_sgemm_pack1to4_int8_lsx.
    const int w = bottom_blob.w;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    const int maxk = kernel_w * kernel_h;
    const int size = outw * outh;

    // im2col: w = output pixels, h = kernel taps, c = input channels
    Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator);

    // One output row consumes outw * stride_w source elements; adding this
    // skip makes the total advance per output row equal to w * stride_h.
    const int row_skip = w * stride_h - outw * stride_w;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < inch; p++)
    {
        const Mat img = bottom_blob.channel(p);
        signed char* dst = bottom_im2col.channel(p);

        for (int ky = 0; ky < kernel_h; ky++)
        {
            for (int kx = 0; kx < kernel_w; kx++)
            {
                const signed char* src = img.row<const signed char>(dilation_h * ky) + dilation_w * kx;

                for (int oy = 0; oy < outh; oy++)
                {
                    int ox = 0;

                    // four output columns per step
                    while (ox + 3 < outw)
                    {
                        dst[0] = src[0];
                        dst[1] = src[stride_w];
                        dst[2] = src[stride_w * 2];
                        dst[3] = src[stride_w * 3];

                        src += stride_w * 4;
                        dst += 4;
                        ox += 4;
                    }

                    // up to three leftover columns
                    while (ox < outw)
                    {
                        *dst++ = *src;
                        src += stride_w;
                        ox++;
                    }

                    src += row_skip;
                }
            }
        }
    }

    im2col_sgemm_pack1to4_int8_lsx(bottom_im2col, top_blob, kernel, opt);
}


================================================
FILE: src/layer/loongarch/convolution_sgemm_pack4.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// GEMM stage of the im2col convolution for 4-float packed input and output (LSX).
//
// bottom_im2col  im2col matrix: w = size (number of output pixels), h = maxk,
//                c = inch; every element is one pack of 4 floats (see the
//                constructor quoted in the first comment of the body).
// top_blob       destination; channel p receives `size` consecutive 4-float packs.
// kernel         weights; channel p is read linearly, 4 floats per inner step,
//                for inch * maxk * 4 steps.
//                NOTE(review): the pre-packed weight layout is produced outside
//                this function -- confirm against the kernel transform.
// _bias          optional bias; when the pointer obtained from it is non-null,
//                the 4 floats at bias + p * 4 seed every accumulator of channel p,
//                otherwise the accumulators start at zero.
// opt            provides workspace_allocator (scratch buffer) and num_threads (OpenMP).
//
// The function works in two phases:
//   1. "permute": repack bottom_im2col into `tmp`, grouping output pixels into
//      tiles of 12, 8, 4, 2 and 1 columns so that phase 2 reads tmp sequentially.
//   2. For every output channel, walk the tiles in the same 12/8/4/2/1 order and
//      accumulate weight * input with fused multiply-add.
static void im2col_sgemm_pack4_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    // Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator);

    const int size = bottom_im2col.w;
    const int maxk = bottom_im2col.h;
    const int inch = bottom_im2col.c;

    const int outch = top_blob.c;

    const float* bias = _bias;

    // permute
    // tmp holds one channel per tile; the channel count is the number of
    // 12/8/4/2/1-column tiles needed to cover `size`, and the row width is
    // sized for the widest tile that can occur for this `size`.
    Mat tmp;
    if (size >= 12)
        tmp.create(12 * maxk, inch, size / 12 + (size % 12) / 8 + (size % 12 % 8) / 4 + (size % 12 % 4) / 2 + size % 12 % 2, 4u * 4, 4, opt.workspace_allocator);
    else if (size >= 8)
        tmp.create(8 * maxk, inch, size / 8 + (size % 8) / 4 + (size % 4) / 2 + size % 2, 4u * 4, 4, opt.workspace_allocator);
    else if (size >= 4)
        tmp.create(4 * maxk, inch, size / 4 + (size % 4) / 2 + size % 2, 4u * 4, 4, opt.workspace_allocator);
    else if (size >= 2)
        tmp.create(2 * maxk, inch, size / 2 + size % 2, 4u * 4, 4, opt.workspace_allocator);
    else
        tmp.create(maxk, inch, size, 4u * 4, 4, opt.workspace_allocator);
    {
        // remain_size_start: first column not yet covered by a wider tile.
        // nn_size: number of tiles of the current width.
        int remain_size_start = 0;
        int nn_size = size / 12;

        // ---- tiles of 12 columns ----
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 12;

            float* tmpptr = tmp.channel(i / 12);

            for (int q = 0; q < inch; q++)
            {
                // start of column i in input channel q (4 floats per column)
                const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4;

                for (int k = 0; k < maxk; k++)
                {
                    // transpose 4x12
                    // load 12 consecutive columns, one 4-float pack each
                    __m128i _r0 = __lsx_vld(img0, 0);
                    __m128i _r1 = __lsx_vld(img0 + 4, 0);
                    __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0);
                    __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0);
                    __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0);
                    __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0);
                    __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0);
                    __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0);
                    __m128i _r8 = __lsx_vld(img0 + 4 * 8, 0);
                    __m128i _r9 = __lsx_vld(img0 + 4 * 9, 0);
                    __m128i _ra = __lsx_vld(img0 + 4 * 10, 0);
                    __m128i _rb = __lsx_vld(img0 + 4 * 11, 0);

                    // interleave 32-bit words of column pairs, then 64-bit halves,
                    // so that each _rXXXX_n gathers lane n of four consecutive columns
                    __m128i _r01r = __lsx_vilvl_w(_r1, _r0);
                    __m128i _r01l = __lsx_vilvh_w(_r1, _r0);
                    __m128i _r23r = __lsx_vilvl_w(_r3, _r2);
                    __m128i _r23l = __lsx_vilvh_w(_r3, _r2);
                    __m128i _r45r = __lsx_vilvl_w(_r5, _r4);
                    __m128i _r45l = __lsx_vilvh_w(_r5, _r4);
                    __m128i _r67r = __lsx_vilvl_w(_r7, _r6);
                    __m128i _r67l = __lsx_vilvh_w(_r7, _r6);
                    __m128i _r89r = __lsx_vilvl_w(_r9, _r8);
                    __m128i _r89l = __lsx_vilvh_w(_r9, _r8);
                    __m128i _rabr = __lsx_vilvl_w(_rb, _ra);
                    __m128i _rabl = __lsx_vilvh_w(_rb, _ra);
                    __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r);
                    __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r);
                    __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l);
                    __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l);
                    __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r);
                    __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r);
                    __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l);
                    __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l);
                    __m128i _r89ab_0 = __lsx_vilvl_d(_rabr, _r89r);
                    __m128i _r89ab_1 = __lsx_vilvh_d(_rabr, _r89r);
                    __m128i _r89ab_2 = __lsx_vilvl_d(_rabl, _r89l);
                    __m128i _r89ab_3 = __lsx_vilvh_d(_rabl, _r89l);

                    // store lane by lane: 12 values of lane 0, then lane 1, lane 2, lane 3
                    __lsx_vst(_r0123_0, tmpptr, 0);
                    __lsx_vst(_r4567_0, tmpptr + 4, 0);
                    __lsx_vst(_r89ab_0, tmpptr + 4 * 2, 0);
                    __lsx_vst(_r0123_1, tmpptr + 4 * 3, 0);
                    __lsx_vst(_r4567_1, tmpptr + 4 * 4, 0);
                    __lsx_vst(_r89ab_1, tmpptr + 4 * 5, 0);
                    __lsx_vst(_r0123_2, tmpptr + 4 * 6, 0);
                    __lsx_vst(_r4567_2, tmpptr + 4 * 7, 0);
                    __lsx_vst(_r89ab_2, tmpptr + 4 * 8, 0);
                    __lsx_vst(_r0123_3, tmpptr + 4 * 9, 0);
                    __lsx_vst(_r4567_3, tmpptr + 4 * 10, 0);
                    __lsx_vst(_r89ab_3, tmpptr + 4 * 11, 0);

                    // next kernel tap: one full im2col row further (size packs of 4 floats)
                    img0 += size * 4;
                    tmpptr += 48;
                }
            }
        }

        // ---- tiles of 8 columns ----
        remain_size_start += nn_size * 12;
        nn_size = (size - remain_size_start) >> 3;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 8;

            // tile index = number of 12-column tiles before i + 8-column tiles before i
            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8);

            for (int q = 0; q < inch; q++)
            {
                const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4;

                for (int k = 0; k < maxk; k++)
                {
                    // transpose 4x8
                    __m128i _r0 = __lsx_vld(img0, 0);
                    __m128i _r1 = __lsx_vld(img0 + 4, 0);
                    __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0);
                    __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0);
                    __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0);
                    __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0);
                    __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0);
                    __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0);

                    __m128i _r01r = __lsx_vilvl_w(_r1, _r0);
                    __m128i _r01l = __lsx_vilvh_w(_r1, _r0);
                    __m128i _r23r = __lsx_vilvl_w(_r3, _r2);
                    __m128i _r23l = __lsx_vilvh_w(_r3, _r2);
                    __m128i _r45r = __lsx_vilvl_w(_r5, _r4);
                    __m128i _r45l = __lsx_vilvh_w(_r5, _r4);
                    __m128i _r67r = __lsx_vilvl_w(_r7, _r6);
                    __m128i _r67l = __lsx_vilvh_w(_r7, _r6);
                    __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r);
                    __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r);
                    __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l);
                    __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l);
                    __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r);
                    __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r);
                    __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l);
                    __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l);

                    // 8 values per lane, lanes 0..3 stored back to back
                    __lsx_vst(_r0123_0, tmpptr, 0);
                    __lsx_vst(_r4567_0, tmpptr + 4, 0);
                    __lsx_vst(_r0123_1, tmpptr + 4 * 2, 0);
                    __lsx_vst(_r4567_1, tmpptr + 4 * 3, 0);
                    __lsx_vst(_r0123_2, tmpptr + 4 * 4, 0);
                    __lsx_vst(_r4567_2, tmpptr + 4 * 5, 0);
                    __lsx_vst(_r0123_3, tmpptr + 4 * 6, 0);
                    __lsx_vst(_r4567_3, tmpptr + 4 * 7, 0);

                    img0 += size * 4;
                    tmpptr += 32;
                }
            }
        }

        // ---- tiles of 4 columns ----
        remain_size_start += nn_size << 3;
        nn_size = (size - remain_size_start) >> 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 4;

            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);

            for (int q = 0; q < inch; q++)
            {
                const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4;

                for (int k = 0; k < maxk; k++)
                {
                    // transpose 4x4
                    __m128i _r0 = __lsx_vld(img0, 0);
                    __m128i _r1 = __lsx_vld(img0 + 4, 0);
                    __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0);
                    __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0);

                    __m128i _r01r = __lsx_vilvl_w(_r1, _r0);
                    __m128i _r01l = __lsx_vilvh_w(_r1, _r0);
                    __m128i _r23r = __lsx_vilvl_w(_r3, _r2);
                    __m128i _r23l = __lsx_vilvh_w(_r3, _r2);
                    __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r);
                    __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r);
                    __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l);
                    __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l);

                    __lsx_vst(_r0123_0, tmpptr, 0);
                    __lsx_vst(_r0123_1, tmpptr + 4, 0);
                    __lsx_vst(_r0123_2, tmpptr + 4 * 2, 0);
                    __lsx_vst(_r0123_3, tmpptr + 4 * 3, 0);

                    img0 += size * 4;
                    tmpptr += 16;
                }
            }
        }

        // ---- tiles of 2 columns ----
        remain_size_start += nn_size << 2;
        nn_size = (size - remain_size_start) >> 1;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 2;

            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);

            for (int q = 0; q < inch; q++)
            {
                const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4;

                for (int k = 0; k < maxk; k++)
                {
                    // transpose 4x2
                    __m128i _r0 = __lsx_vld(img0, 0);
                    __m128i _r1 = __lsx_vld(img0 + 4, 0);

                    // word interleave of the two columns; the gemm loop below
                    // consumes the result as alternating column-0 / column-1 scalars
                    __m128i _r01_0 = __lsx_vilvl_w(_r1, _r0);
                    __m128i _r01_1 = __lsx_vilvh_w(_r1, _r0);

                    __lsx_vst(_r01_0, tmpptr, 0);
                    __lsx_vst(_r01_1, tmpptr + 4, 0);

                    img0 += size * 4;
                    tmpptr += 8;
                }
            }
        }

        // ---- leftover single columns: plain copy, no transpose needed ----
        remain_size_start += nn_size << 1;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_size_start; i < size; i++)
        {
            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);

            for (int q = 0; q < inch; q++)
            {
                const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4;

                for (int k = 0; k < maxk; k++)
                {
                    __m128i _val = __lsx_vld(img0, 0);
                    __lsx_vst(_val, tmpptr, 0);

                    img0 += size * 4;
                    tmpptr += 4;
                }
            }
        }
    }

    // gemm: one OpenMP task per output channel; the column loops below visit
    // the tiles in the same 12/8/4/2/1 order used by the permute step above
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        float* outptr0 = top_blob.channel(p);

        int i = 0;
        // 12 output pixels at a time
        for (; i + 11 < size; i += 12)
        {
            const float* tmpptr = tmp.channel(i / 12);
            const float* kptr0 = kernel.channel(p);

            // one step per input scalar: inch packs * maxk taps * 4 lanes
            int nn = inch * maxk * 4; // inch always > 0

            // every accumulator starts from the channel's bias pack, or zero
            __m128 _sum0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0);
            __m128 _sum1 = _sum0;
            __m128 _sum2 = _sum0;
            __m128 _sum3 = _sum0;
            __m128 _sum4 = _sum0;
            __m128 _sum5 = _sum0;
            __m128 _sum6 = _sum0;
            __m128 _sum7 = _sum0;
            __m128 _sum8 = _sum0;
            __m128 _sum9 = _sum0;
            __m128 _suma = _sum0;
            __m128 _sumb = _sum0;

            for (int j = 0; j < nn; j++)
            {
                // prefetch hints for data read in upcoming iterations
                __builtin_prefetch(tmpptr + 48);
                __builtin_prefetch(kptr0 + 16);
                // 12 input scalars (one per output pixel) and 4 weights
                __m128i _val0123 = __lsx_vld(tmpptr, 0);
                __m128i _val4567 = __lsx_vld(tmpptr + 4, 0);
                __m128i _val89ab = __lsx_vld(tmpptr + 8, 0);
                __m128 _w0 = (__m128)__lsx_vld(kptr0, 0);
                // broadcast each input scalar and accumulate weights * scalar
                _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0);
                _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1);
                _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2);
                _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3);
                _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4);
                _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5);
                _sum6 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6);
                _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7);
                _sum8 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 0), _sum8);
                _sum9 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 1), _sum9);
                _suma = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 2), _suma);
                _sumb = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 3), _sumb);

                tmpptr += 12;
                kptr0 += 4;
            }

            // write 12 output packs for this channel
            __lsx_vst(_sum0, outptr0, 0);
            __lsx_vst(_sum1, outptr0 + 4, 0);
            __lsx_vst(_sum2, outptr0 + 4 * 2, 0);
            __lsx_vst(_sum3, outptr0 + 4 * 3, 0);
            __lsx_vst(_sum4, outptr0 + 4 * 4, 0);
            __lsx_vst(_sum5, outptr0 + 4 * 5, 0);
            __lsx_vst(_sum6, outptr0 + 4 * 6, 0);
            __lsx_vst(_sum7, outptr0 + 4 * 7, 0);
            __lsx_vst(_sum8, outptr0 + 4 * 8, 0);
            __lsx_vst(_sum9, outptr0 + 4 * 9, 0);
            __lsx_vst(_suma, outptr0 + 4 * 10, 0);
            __lsx_vst(_sumb, outptr0 + 4 * 11, 0);

            outptr0 += 4 * 12;
        }
        // 8 output pixels at a time
        for (; i + 7 < size; i += 8)
        {
            const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8);
            const float* kptr0 = kernel.channel(p);

            int nn = inch * maxk * 4; // inch always > 0

            __m128 _sum0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0);
            __m128 _sum1 = _sum0;
            __m128 _sum2 = _sum0;
            __m128 _sum3 = _sum0;
            __m128 _sum4 = _sum0;
            __m128 _sum5 = _sum0;
            __m128 _sum6 = _sum0;
            __m128 _sum7 = _sum0;

            for (int j = 0; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 32);
                __builtin_prefetch(kptr0 + 16);
                __m128i _val0123 = __lsx_vld(tmpptr, 0);
                __m128i _val4567 = __lsx_vld(tmpptr + 4, 0);
                __m128 _w0 = (__m128)__lsx_vld(kptr0, 0);
                _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0);
                _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1);
                _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2);
                _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3);
                _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4);
                _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5);
                _sum6 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6);
                _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7);

                tmpptr += 8;
                kptr0 += 4;
            }

            __lsx_vst(_sum0, outptr0, 0);
            __lsx_vst(_sum1, outptr0 + 4, 0);
            __lsx_vst(_sum2, outptr0 + 4 * 2, 0);
            __lsx_vst(_sum3, outptr0 + 4 * 3, 0);
            __lsx_vst(_sum4, outptr0 + 4 * 4, 0);
            __lsx_vst(_sum5, outptr0 + 4 * 5, 0);
            __lsx_vst(_sum6, outptr0 + 4 * 6, 0);
            __lsx_vst(_sum7, outptr0 + 4 * 7, 0);

            outptr0 += 4 * 8;
        }
        // 4 output pixels at a time
        for (; i + 3 < size; i += 4)
        {
            const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
            const float* kptr0 = kernel.channel(p);

            int nn = inch * maxk * 4; // inch always > 0

            __m128 _sum0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0);
            __m128 _sum1 = _sum0;
            __m128 _sum2 = _sum0;
            __m128 _sum3 = _sum0;

            for (int j = 0; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 16);
                __builtin_prefetch(kptr0 + 16);
                __m128i _val0123 = __lsx_vld(tmpptr, 0);
                __m128 _w0 = (__m128)__lsx_vld(kptr0, 0);
                _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0);
                _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1);
                _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2);
                _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3);

                tmpptr += 4;
                kptr0 += 4;
            }

            __lsx_vst(_sum0, outptr0, 0);
            __lsx_vst(_sum1, outptr0 + 4, 0);
            __lsx_vst(_sum2, outptr0 + 4 * 2, 0);
            __lsx_vst(_sum3, outptr0 + 4 * 3, 0);

            outptr0 += 4 * 4;
        }
        // 2 output pixels at a time
        for (; i + 1 < size; i += 2)
        {
            const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
            const float* kptr0 = kernel.channel(p);

            int nn = inch * maxk * 4; // inch always > 0

            __m128 _sum0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0);
            __m128 _sum1 = _sum0;

            for (int j = 0; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 8);
                __builtin_prefetch(kptr0 + 16);
                // two consecutive scalars: first pixel, then second pixel
                __m128 _val0 = __lsx_vreplfr2vr_s(*tmpptr++);
                __m128 _val1 = __lsx_vreplfr2vr_s(*tmpptr++);
                __m128 _w0 = (__m128)__lsx_vld(kptr0, 0);
                _sum0 = __lsx_vfmadd_s(_w0, _val0, _sum0);
                _sum1 = __lsx_vfmadd_s(_w0, _val1, _sum1);

                kptr0 += 4;
            }

            __lsx_vst(_sum0, outptr0, 0);
            __lsx_vst(_sum1, outptr0 + 4, 0);

            outptr0 += 4 * 2;
        }
        // remaining single output pixels
        for (; i < size; i++)
        {
            const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
            const float* kptr0 = kernel.channel(p);

            int nn = inch * maxk * 4; // inch always > 0

            __m128 _sum = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0);

            for (int j = 0; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 4);
                __builtin_prefetch(kptr0 + 16);
                __m128 _val0 = __lsx_vreplfr2vr_s(*tmpptr++);
                __m128 _w0 = (__m128)__lsx_vld(kptr0, 0);
                _sum = __lsx_vfmadd_s(_w0, _val0, _sum);

                kptr0 += 4;
            }

            __lsx_vst(_sum, outptr0, 0);

            outptr0 += 4;
        }
    }
}

static void convolution_im2col_sgemm_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    const int size = outw * outh;

    const int maxk = kernel_w * kernel_h;

    // im2col
    Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator);
    {
        const int gap = (w * stride_h - outw * stride_w) * 4;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < inch; p++)
        {
            const Mat img = bottom_blob.channel(p);
            float* ptr = bottom_im2col.channel(p);

            for (int u = 0; u < kernel_h; u++)
            {
                for (int v = 0; v < kernel_w; v++)
                {
                    const float* sptr = img.row<const float>(dilation_h * u) + dilation_w * v * 4;

                    for (int i = 0; i < outh; i++)
                    {
                        int j = 0;
                        for (; j < outw; j++)
                        {
                            __m128 _val = (__m128)__lsx_vld(sptr, 0);
                            __lsx_vst(_val, ptr, 0);

                            sptr += stride_w * 4;
                            ptr += 4;
                        }

                        sptr += gap;
                    }
                }
            }
        }
    }

    im2col_sgemm_pack4_lsx(bottom_im2col, top_blob, kernel, _bias, opt);
}


================================================
FILE: src/layer/loongarch/convolution_sgemm_pack4to1.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void im2col_sgemm_pack4to1_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    // Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator);

    const int size = bottom_im2col.w;
    const int maxk = bottom_im2col.h;
    const int inch = bottom_im2col.c;

    const int outch = top_blob.c;

    const float* bias = _bias;

    Mat tmp;
    if (size >= 12)
        tmp.create(12 * maxk, inch, size / 12 + (size % 12) / 8 + (size % 12 % 8) / 4 + size % 12 % 4, 4u * 4, 4, opt.workspace_allocator);
    else if (size >= 8)
        tmp.create(8 * maxk, inch, size / 8 + (size % 8) / 4 + size % 4, 4u * 4, 4, opt.workspace_allocator);
    else if (size >= 4)
        tmp.create(4 * maxk, inch, size / 4 + size % 4, 4u * 4, 4, opt.workspace_allocator);
    else
        tmp.create(maxk, inch, size, 4u * 4, 4, opt.workspace_allocator);
    {
        int remain_size_start = 0;
        int nn_size = size / 12;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 12;

            float* tmpptr = tmp.channel(i / 12);

            for (int q = 0; q < inch; q++)
            {
                const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4;

                for (int k = 0; k < maxk; k++)
                {
                    // transpose 4x12
                    __m128i _r0 = __lsx_vld(img0, 0);
                    __m128i _r1 = __lsx_vld(img0 + 4, 0);
                    __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0);
                    __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0);
                    __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0);
                    __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0);
                    __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0);
                    __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0);
                    __m128i _r8 = __lsx_vld(img0 + 4 * 8, 0);
                    __m128i _r9 = __lsx_vld(img0 + 4 * 9, 0);
                    __m128i _ra = __lsx_vld(img0 + 4 * 10, 0);
                    __m128i _rb = __lsx_vld(img0 + 4 * 11, 0);

                    __m128i _r01r = __lsx_vilvl_w(_r1, _r0);
                    __m128i _r01l = __lsx_vilvh_w(_r1, _r0);
                    __m128i _r23r = __lsx_vilvl_w(_r3, _r2);
                    __m128i _r23l = __lsx_vilvh_w(_r3, _r2);
                    __m128i _r45r = __lsx_vilvl_w(_r5, _r4);
                    __m128i _r45l = __lsx_vilvh_w(_r5, _r4);
                    __m128i _r67r = __lsx_vilvl_w(_r7, _r6);
                    __m128i _r67l = __lsx_vilvh_w(_r7, _r6);
                    __m128i _r89r = __lsx_vilvl_w(_r9, _r8);
                    __m128i _r89l = __lsx_vilvh_w(_r9, _r8);
                    __m128i _rabr = __lsx_vilvl_w(_rb, _ra);
                    __m128i _rabl = __lsx_vilvh_w(_rb, _ra);
                    __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r);
                    __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r);
                    __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l);
                    __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l);
                    __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r);
                    __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r);
                    __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l);
                    __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l);
                    __m128i _r89ab_0 = __lsx_vilvl_d(_rabr, _r89r);
                    __m128i _r89ab_1 = __lsx_vilvh_d(_rabr, _r89r);
                    __m128i _r89ab_2 = __lsx_vilvl_d(_rabl, _r89l);
                    __m128i _r89ab_3 = __lsx_vilvh_d(_rabl, _r89l);

                    __lsx_vst(_r0123_0, tmpptr, 0);
                    __lsx_vst(_r4567_0, tmpptr + 4, 0);
                    __lsx_vst(_r89ab_0, tmpptr + 4 * 2, 0);
                    __lsx_vst(_r0123_1, tmpptr + 4 * 3, 0);
                    __lsx_vst(_r4567_1, tmpptr + 4 * 4, 0);
                    __lsx_vst(_r89ab_1, tmpptr + 4 * 5, 0);
                    __lsx_vst(_r0123_2, tmpptr + 4 * 6, 0);
                    __lsx_vst(_r4567_2, tmpptr + 4 * 7, 0);
                    __lsx_vst(_r89ab_2, tmpptr + 4 * 8, 0);
                    __lsx_vst(_r0123_3, tmpptr + 4 * 9, 0);
                    __lsx_vst(_r4567_3, tmpptr + 4 * 10, 0);
                    __lsx_vst(_r89ab_3, tmpptr + 4 * 11, 0);

                    img0 += size * 4;
                    tmpptr += 48;
                }
            }
        }

        remain_size_start += nn_size * 12;
        nn_size = (size - remain_size_start) >> 3;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 8;

            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8);

            for (int q = 0; q < inch; q++)
            {
                const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4;

                for (int k = 0; k < maxk; k++)
                {
                    // transpose 4x8
                    __m128i _r0 = __lsx_vld(img0, 0);
                    __m128i _r1 = __lsx_vld(img0 + 4, 0);
                    __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0);
                    __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0);
                    __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0);
                    __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0);
                    __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0);
                    __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0);

                    __m128i _r01r = __lsx_vilvl_w(_r1, _r0);
                    __m128i _r01l = __lsx_vilvh_w(_r1, _r0);
                    __m128i _r23r = __lsx_vilvl_w(_r3, _r2);
                    __m128i _r23l = __lsx_vilvh_w(_r3, _r2);
                    __m128i _r45r = __lsx_vilvl_w(_r5, _r4);
                    __m128i _r45l = __lsx_vilvh_w(_r5, _r4);
                    __m128i _r67r = __lsx_vilvl_w(_r7, _r6);
                    __m128i _r67l = __lsx_vilvh_w(_r7, _r6);
                    __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r);
                    __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r);
                    __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l);
                    __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l);
                    __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r);
                    __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r);
                    __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l);
                    __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l);

                    __lsx_vst(_r0123_0, tmpptr, 0);
                    __lsx_vst(_r4567_0, tmpptr + 4, 0);
                    __lsx_vst(_r0123_1, tmpptr + 4 * 2, 0);
                    __lsx_vst(_r4567_1, tmpptr + 4 * 3, 0);
                    __lsx_vst(_r0123_2, tmpptr + 4 * 4, 0);
                    __lsx_vst(_r4567_2, tmpptr + 4 * 5, 0);
                    __lsx_vst(_r0123_3, tmpptr + 4 * 6, 0);
                    __lsx_vst(_r4567_3, tmpptr + 4 * 7, 0);

                    img0 += size * 4;
                    tmpptr += 32;
                }
            }
        }

        remain_size_start += nn_size << 3;
        nn_size = (size - remain_size_start) >> 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 4;

            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);

            for (int q = 0; q < inch; q++)
            {
                const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4;

                for (int k = 0; k < maxk; k++)
                {
                    // transpose 4x4
                    __m128i _r0 = __lsx_vld(img0, 0);
                    __m128i _r1 = __lsx_vld(img0 + 4, 0);
                    __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0);
                    __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0);

                    __m128i _r01r = __lsx_vilvl_w(_r1, _r0);
                    __m128i _r01l = __lsx_vilvh_w(_r1, _r0);
                    __m128i _r23r = __lsx_vilvl_w(_r3, _r2);
                    __m128i _r23l = __lsx_vilvh_w(_r3, _r2);
                    __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r);
                    __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r);
                    __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l);
                    __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l);

                    __lsx_vst(_r0123_0, tmpptr, 0);
                    __lsx_vst(_r0123_1, tmpptr + 4, 0);
                    __lsx_vst(_r0123_2, tmpptr + 4 * 2, 0);
                    __lsx_vst(_r0123_3, tmpptr + 4 * 3, 0);

                    img0 += size * 4;
                    tmpptr += 16;
                }
            }
        }

        remain_size_start += nn_size << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_size_start; i < size; i++)
        {
            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4);

            for (int q = 0; q < inch; q++)
            {
                const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4;

                for (int k = 0; k < maxk; k++)
                {
                    __m128 _val = (__m128)__lsx_vld(img0, 0);
                    __lsx_vst(_val, tmpptr, 0);

                    img0 += size * 4;
                    tmpptr += 4;
                }
            }
        }
    }

    int nn_outch = outch / 4;
    int remain_outch_start = nn_outch * 4;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 4;

        float* outptr0 = top_blob.channel(p);
        float* outptr1 = top_blob.channel(p + 1);
        float* outptr2 = top_blob.channel(p + 2);
        float* outptr3 = top_blob.channel(p + 3);

        const float zeros[4] = {0.f};
        const float* biasptr = bias ? bias + p : zeros;

        int i = 0;
        for (; i + 11 < size; i += 12)
        {
            const float* tmpptr = tmp.channel(i / 12);
            const float* kptr0 = kernel.channel(p / 4);

            int nn = inch * maxk * 4; // inch always > 0

            __m128i _bias = __lsx_vld(biasptr, 0);
            __m128 _sum0 = (__m128)__lsx_vreplvei_w(_bias, 0);
            __m128 _sum1 = (__m128)__lsx_vreplvei_w(_bias, 0);
            __m128 _sum2 = (__m128)__lsx_vreplvei_w(_bias, 0);
            __m128 _sum3 = (__m128)__lsx_vreplvei_w(_bias, 1);
            __m128 _sum4 = (__m128)__lsx_vreplvei_w(_bias, 1);
            __m128 _sum5 = (__m128)__lsx_vreplvei_w(_bias, 1);
            __m128 _sum6 = (__m128)__lsx_vreplvei_w(_bias, 2);
            __m128 _sum7 = (__m128)__lsx_vreplvei_w(_bias, 2);
            __m128 _sum8 = (__m128)__lsx_vreplvei_w(_bias, 2);
            __m128 _sum9 = (__m128)__lsx_vreplvei_w(_bias, 3);
            __m128 _suma = (__m128)__lsx_vreplvei_w(_bias, 3);
            __m128 _sumb = (__m128)__lsx_vreplvei_w(_bias, 3);

            for (int j = 0; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 48);
                __builtin_prefetch(kptr0 + 16);
                __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0);
                __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0);
                __m128 _val2 = (__m128)__lsx_vld(tmpptr + 8, 0);
                __m128i _w0123 = __lsx_vld(kptr0, 0);
                _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val0, _sum0);
                _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val1, _sum1);
                _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val2, _sum2);
                _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val0, _sum3);
                _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val1, _sum4);
                _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val2, _sum5);
                _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val0, _sum6);
                _sum7 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val1, _sum7);
                _sum8 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val2, _sum8);
                _sum9 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val0, _sum9);
                _suma = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val1, _suma);
                _sumb = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val2, _sumb);

                tmpptr += 12;
                kptr0 += 4;
            }

            __lsx_vst(_sum0, outptr0, 0);
            __lsx_vst(_sum1, outptr0 + 4, 0);
            __lsx_vst(_sum2, outptr0 + 8, 0);
            __lsx_vst(_sum3, outptr1, 0);
            __lsx_vst(_sum4, outptr1 + 4, 0);
            __lsx_vst(_sum5, outptr1 + 8, 0);
            __lsx_vst(_sum6, outptr2, 0);
            __lsx_vst(_sum7, outptr2 + 4, 0);
            __lsx_vst(_sum8, outptr2 + 8, 0);
            __lsx_vst(_sum9, outptr3, 0);
            __lsx_vst(_suma, outptr3 + 4, 0);
            __lsx_vst(_sumb, outptr3 + 8, 0);

            outptr0 += 12;
            outptr1 += 12;
            outptr2 += 12;
            outptr3 += 12;
        }
        for (; i + 7 < size; i += 8)
        {
            const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8);
            const float* kptr0 = kernel.channel(p / 4);

            int nn = inch * maxk * 4; // inch always > 0

            __m128i _bias = __lsx_vld(biasptr, 0);
            __m128 _sum0 = (__m128)__lsx_vreplvei_w(_bias, 0);
            __m128 _sum1 = (__m128)__lsx_vreplvei_w(_bias, 0);
            __m128 _sum2 = (__m128)__lsx_vreplvei_w(_bias, 1);
            __m128 _sum3 = (__m128)__lsx_vreplvei_w(_bias, 1);
            __m128 _sum4 = (__m128)__lsx_vreplvei_w(_bias, 2);
            __m128 _sum5 = (__m128)__lsx_vreplvei_w(_bias, 2);
            __m128 _sum6 = (__m128)__lsx_vreplvei_w(_bias, 3);
            __m128 _sum7 = (__m128)__lsx_vreplvei_w(_bias, 3);

            for (int j = 0; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 32);
                __builtin_prefetch(kptr0 + 16);
                __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0);
                __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0);
                __m128i _w0123 = __lsx_vld(kptr0, 0);
                _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val0, _sum0);
                _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val1, _sum1);
                _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val0, _sum2);
                _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val1, _sum3);
                _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val0, _sum4);
                _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val1, _sum5);
                _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val0, _sum6);
                _sum7 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val1, _sum7);

                tmpptr += 8;
                kptr0 += 4;
            }

            __lsx_vst(_sum0, outptr0, 0);
            __lsx_vst(_sum1, outptr0 + 4, 0);
            __lsx_vst(_sum2, outptr1, 0);
            __lsx_vst(_sum3, outptr1 + 4, 0);
            __lsx_vst(_sum4, outptr2, 0);
            __lsx_vst(_sum5, outptr2 + 4, 0);
            __lsx_vst(_sum6, outptr3, 0);
            __lsx_vst(_sum7, outptr3 + 4, 0);

            outptr0 += 8;
            outptr1 += 8;
            outptr2 += 8;
            outptr3 += 8;
        }
        for (; i + 3 < size; i += 4)
        {
            const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
            const float* kptr0 = kernel.channel(p / 4);

            int nn = inch * maxk * 4; // inch always > 0

            __m128i _bias = __lsx_vld(biasptr, 0);
            __m128 _sum0 = (__m128)__lsx_vreplvei_w(_bias, 0);
            __m128 _sum1 = (__m128)__lsx_vreplvei_w(_bias, 1);
            __m128 _sum2 = (__m128)__lsx_vreplvei_w(_bias, 2);
            __m128 _sum3 = (__m128)__lsx_vreplvei_w(_bias, 3);

            for (int j = 0; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 16);
                __builtin_prefetch(kptr0 + 16);
                __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0);
                __m128i _w0123 = __lsx_vld(kptr0, 0);
                _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val0, _sum0);
                _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val0, _sum1);
                _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val0, _sum2);
                _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val0, _sum3);

                tmpptr += 4;
                kptr0 += 4;
            }

            __lsx_vst(_sum0, outptr0, 0);
            __lsx_vst(_sum1, outptr1, 0);
            __lsx_vst(_sum2, outptr2, 0);
            __lsx_vst(_sum3, outptr3, 0);

            outptr0 += 4;
            outptr1 += 4;
            outptr2 += 4;
            outptr3 += 4;
        }
        for (; i < size; i++)
        {
            const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4);
            const float* kptr0 = kernel.channel(p / 4);

            int nn = inch * maxk * 4; // inch always > 0

            __m128 _sum = (__m128)__lsx_vld(biasptr, 0);
            float* _sum_p = (float*)&_sum;

            for (int j = 0; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 4);
                __builtin_prefetch(kptr0 + 16);
                __m128 _val0 = __lsx_vreplfr2vr_s(*tmpptr++);
                __m128 _w0 = (__m128)__lsx_vld(kptr0, 0);
                _sum = __lsx_vfmadd_s(_w0, _val0, _sum);

                kptr0 += 4;
            }

            outptr0[0] = _sum_p[0];
            outptr1[0] = _sum_p[1];
            outptr2[0] = _sum_p[2];
            outptr3[0] = _sum_p[3];

            outptr0 += 1;
            outptr1 += 1;
            outptr2 += 1;
            outptr3 += 1;
        }
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        float* outptr0 = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        int i = 0;
        for (; i + 11 < size; i += 12)
        {
            const float* tmpptr = tmp.channel(i / 12);
            const float* kptr0 = kernel.channel(p / 4 + p % 4);

            int nn = inch * maxk * 4; // inch always > 0

            __m128 _sum0 = __lsx_vreplfr2vr_s(bias0);
            __m128 _sum1 = __lsx_vreplfr2vr_s(bias0);
            __m128 _sum2 = __lsx_vreplfr2vr_s(bias0);

            for (int j = 0; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 48);
                __builtin_prefetch(kptr0 + 4);
                __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0);
                __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0);
                __m128 _val2 = (__m128)__lsx_vld(tmpptr + 8, 0);
                __m128 _w0 = __lsx_vreplfr2vr_s(*kptr0);
                _sum0 = __lsx_vfmadd_s(_val0, _w0, _sum0);
                _sum1 = __lsx_vfmadd_s(_val1, _w0, _sum1);
                _sum2 = __lsx_vfmadd_s(_val2, _w0, _sum2);

                tmpptr += 12;
                kptr0 += 1;
            }

            __lsx_vst(_sum0, outptr0, 0);
            __lsx_vst(_sum1, outptr0 + 4, 0);
            __lsx_vst(_sum2, outptr0 + 8, 0);

            outptr0 += 12;
        }
        for (; i + 7 < size; i += 8)
        {
            const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8);
            const float* kptr0 = kernel.channel(p / 4 + p % 4);

            int nn = inch * maxk * 4; // inch always > 0

            __m128 _sum0 = __lsx_vreplfr2vr_s(bias0);
            __m128 _sum1 = __lsx_vreplfr2vr_s(bias0);

            for (int j = 0; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 32);
                __builtin_prefetch(kptr0 + 4);
                __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0);
                __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0);
                __m128 _w0 = __lsx_vreplfr2vr_s(*kptr0);
                _sum0 = __lsx_vfmadd_s(_val0, _w0, _sum0);
                _sum1 = __lsx_vfmadd_s(_val1, _w0, _sum1);

                tmpptr += 8;
                kptr0 += 1;
            }

            __lsx_vst(_sum0, outptr0, 0);
            __lsx_vst(_sum1, outptr0 + 4, 0);

            outptr0 += 8;
        }
        for (; i + 3 < size; i += 4)
        {
            const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
            const float* kptr0 = kernel.channel(p / 4 + p % 4);

            int nn = inch * maxk * 4; // inch always > 0

            __m128 _sum0 = __lsx_vreplfr2vr_s(bias0);

            for (int j = 0; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 16);
                __builtin_prefetch(kptr0 + 4);
                __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0);
                __m128 _w0 = __lsx_vreplfr2vr_s(*kptr0);
                _sum0 = __lsx_vfmadd_s(_val0, _w0, _sum0);

                tmpptr += 4;
                kptr0 += 1;
            }

            __lsx_vst(_sum0, outptr0, 0);

            outptr0 += 4;
        }
        for (; i < size; i++)
        {
            const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4);
            const float* kptr0 = kernel.channel(p / 4 + p % 4);

            int nn = inch * maxk; // inch always > 0

            float sum0 = bias0;

            __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0);

            for (int j = 0; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 16);
                __builtin_prefetch(kptr0 + 16);
                __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0);
                __m128 _w0 = (__m128)__lsx_vld(kptr0, 0);
                _sum0 = __lsx_vfmadd_s(_w0, _val0, _sum0);
                tmpptr += 4;
                kptr0 += 4;
            }

            sum0 += __lsx_reduce_fadd_s(_sum0);

            outptr0[0] = sum0;

            outptr0 += 1;
        }
    }
}

static void convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
    const int maxk = kernel_w * kernel_h;

    // interleave
    // src = maxk-inch-outch
    // dst = pb-pa-maxk-inch/pa-outch/pb
    Mat kernel = _kernel.reshape(maxk, inch, outch);
    kernel_tm.create(4 * 4 * maxk, inch / 4, outch / 4 + outch % 4);

    int q = 0;
    for (; q + 3 < outch; q += 4)
    {
        float* g00 = kernel_tm.channel(q / 4);

        for (int p = 0; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int i = 0; i < 4; i++)
                {
                    for (int j = 0; j < 4; j++)
                    {
                        const float* k00 = kernel.channel(q + j).row(p + i);

                        g00[0] = k00[k];

                        g00++;
                    }
                }
            }
        }
    }
    for (; q < outch; q++)
    {
        const Mat k0 = kernel.channel(q);

        float* g00 = kernel_tm.channel(q / 4 + q % 4);

        for (int p = 0; p + 3 < inch; p += 4)
        {
            for (int k = 0; k < maxk; k++)
            {
                for (int j = 0; j < 4; j++)
                {
                    const float* k00 = k0.row(p + j);

                    g00[0] = k00[k];

                    g00++;
                }
            }
        }
    }
}

static void convolution_im2col_sgemm_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    const int size = outw * outh;

    const int maxk = kernel_w * kernel_h;

    // im2col
    Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator);
    {
        const int gap = (w * stride_h - outw * stride_w) * 4;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < inch; p++)
        {
            const Mat img = bottom_blob.channel(p);
            float* ptr = bottom_im2col.channel(p);

            for (int u = 0; u < kernel_h; u++)
            {
                for (int v = 0; v < kernel_w; v++)
                {
                    const float* sptr = img.row(dilation_h * u) + dilation_w * v * 4;

                    for (int i = 0; i < outh; i++)
                    {
                        int j = 0;
                        for (; j < outw; j++)
                        {
                            __m128 _val = (__m128)__lsx_vld(sptr, 0);
                            __lsx_vst(_val, ptr, 0);

                            sptr += stride_w * 4;
                            ptr += 4;
                        }

                        sptr += gap;
                    }
                }
            }
        }
    }

    im2col_sgemm_pack4to1_lsx(bottom_im2col, top_blob, kernel, _bias, opt);
}


================================================
FILE: src/layer/loongarch/convolution_sgemm_pack8to1_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// int8 GEMM stage of the im2col convolution for 8-packed input and unpacked
// (one int per element) output.
//
// bottom_im2col : im2col buffer; w = output positions (size), h = kernel taps
//                 (maxk), c = input channel groups (inch), 8 bytes per element.
// top_blob      : destination; each channel is written as a flat run of `size`
//                 int values.
// kernel        : weights as int8 bytes; channel p / 4 is read for each full
//                 group of four output channels and channel p / 4 + p % 4 for
//                 each leftover output channel.
// opt           : supplies the workspace allocator and the OpenMP thread count.
static void im2col_sgemm_pack8to1_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);

    const int size = bottom_im2col.w;
    const int maxk = bottom_im2col.h;
    const int inch = bottom_im2col.c;

    const int outch = top_blob.c;

    // permute
    // Regroup the im2col data so that two neighbouring output positions sit
    // side by side for every (input channel, tap); an odd trailing position
    // gets a channel of its own at index i / 2 + i % 2.
    Mat tmp;
    if (size >= 2)
        tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator);
    else
        tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator);
    {
        int remain_size_start = 0;
        int nn_size = (size - remain_size_start) >> 1;

        // pairs of output positions
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 2;

            int64_t* tmpptr = tmp.channel(i / 2);

            for (int q = 0; q < inch; q++)
            {
                const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    // one 128-bit copy moves positions i and i + 1 (8 bytes each)
                    __m128i _v = __lsx_vld(img0, 0);
                    __lsx_vst(_v, tmpptr, 0);
                    tmpptr += 2;
                    img0 += size;
                }
            }
        }

        remain_size_start += nn_size << 1;

        // leftover single output position (at most one when size is odd)
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_size_start; i < size; i++)
        {
            int64_t* tmpptr = tmp.channel(i / 2 + i % 2);

            for (int q = 0; q < inch; q++)
            {
                const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];
                    tmpptr += 1;
                    img0 += size;
                }
            }
        }
    }

    int nn_outch = 0;
    int remain_outch_start = 0;

    nn_outch = outch >> 2;

    // four output channels per iteration
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 4;

        int* outptr0 = top_blob.channel(p);
        int* outptr1 = top_blob.channel(p + 1);
        int* outptr2 = top_blob.channel(p + 2);
        int* outptr3 = top_blob.channel(p + 3);

        int i = 0;
        // two output positions x four output channels
        for (; i + 1 < size; i += 2)
        {
            const signed char* tmpptr = tmp.channel(i / 2);
            const signed char* kptr = kernel.channel(p / 4);

            int nn = inch * maxk; // inch always > 0

            // _sumXY accumulates output position X against output channel Y
            __m128i _sum00 = __lsx_vreplgr2vr_w(0);
            __m128i _sum01 = __lsx_vreplgr2vr_w(0);
            __m128i _sum02 = __lsx_vreplgr2vr_w(0);
            __m128i _sum03 = __lsx_vreplgr2vr_w(0);
            __m128i _sum10 = __lsx_vreplgr2vr_w(0);
            __m128i _sum11 = __lsx_vreplgr2vr_w(0);
            __m128i _sum12 = __lsx_vreplgr2vr_w(0);
            __m128i _sum13 = __lsx_vreplgr2vr_w(0);

            int j = 0;
            for (; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 64);
                __builtin_prefetch(kptr + 128);
                // widen int8 to int16: the "< 0" byte mask interleaved above
                // each byte acts as its sign extension
                __m128i _val01 = __lsx_vld(tmpptr, 0);
                __m128i _extval01 = __lsx_vslti_b(_val01, 0);
                __m128i _val0 = __lsx_vilvl_b(_extval01, _val01);
                __m128i _val1 = __lsx_vilvh_b(_extval01, _val01);

                // 32 weight bytes = 8 input lanes for each of 4 output channels
                __m128i _w01 = __lsx_vld(kptr, 0);
                __m128i _w23 = __lsx_vld(kptr + 16, 0);
                __m128i _extw01 = __lsx_vslti_b(_w01, 0);
                __m128i _extw23 = __lsx_vslti_b(_w23, 0);
                __m128i _w0 = __lsx_vilvl_b(_extw01, _w01);
                __m128i _w1 = __lsx_vilvh_b(_extw01, _w01);
                __m128i _w2 = __lsx_vilvl_b(_extw23, _w23);
                __m128i _w3 = __lsx_vilvh_b(_extw23, _w23);

                // 16-bit products of sign-extended int8 operands
                __m128i _s00 = __lsx_vmul_h(_val0, _w0);
                __m128i _s01 = __lsx_vmul_h(_val0, _w1);
                __m128i _s02 = __lsx_vmul_h(_val0, _w2);
                __m128i _s03 = __lsx_vmul_h(_val0, _w3);
                __m128i _s10 = __lsx_vmul_h(_val1, _w0);
                __m128i _s11 = __lsx_vmul_h(_val1, _w1);
                __m128i _s12 = __lsx_vmul_h(_val1, _w2);
                __m128i _s13 = __lsx_vmul_h(_val1, _w3);

                // add adjacent 16-bit products into 32-bit lanes, then accumulate
                _sum00 = __lsx_vadd_w(_sum00, __lsx_vhaddw_w_h(_s00, _s00));
                _sum01 = __lsx_vadd_w(_sum01, __lsx_vhaddw_w_h(_s01, _s01));
                _sum02 = __lsx_vadd_w(_sum02, __lsx_vhaddw_w_h(_s02, _s02));
                _sum03 = __lsx_vadd_w(_sum03, __lsx_vhaddw_w_h(_s03, _s03));
                _sum10 = __lsx_vadd_w(_sum10, __lsx_vhaddw_w_h(_s10, _s10));
                _sum11 = __lsx_vadd_w(_sum11, __lsx_vhaddw_w_h(_s11, _s11));
                _sum12 = __lsx_vadd_w(_sum12, __lsx_vhaddw_w_h(_s12, _s12));
                _sum13 = __lsx_vadd_w(_sum13, __lsx_vhaddw_w_h(_s13, _s13));

                tmpptr += 16;
                kptr += 32;
            }

            // transpose 4x4
            // after the transpose, lane n of each vector belongs to output
            // channel n, so plain vector adds finish the per-channel reduction
            {
                __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                _tmp0 = __lsx_vilvl_w(_sum01, _sum00);
                _tmp1 = __lsx_vilvl_w(_sum03, _sum02);
                _tmp2 = __lsx_vilvh_w(_sum01, _sum00);
                _tmp3 = __lsx_vilvh_w(_sum03, _sum02);
                _sum00 = __lsx_vilvl_d(_tmp1, _tmp0);
                _sum01 = __lsx_vilvh_d(_tmp1, _tmp0);
                _sum02 = __lsx_vilvl_d(_tmp3, _tmp2);
                _sum03 = __lsx_vilvh_d(_tmp3, _tmp2);
            }
            {
                __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                _tmp0 = __lsx_vilvl_w(_sum11, _sum10);
                _tmp1 = __lsx_vilvl_w(_sum13, _sum12);
                _tmp2 = __lsx_vilvh_w(_sum11, _sum10);
                _tmp3 = __lsx_vilvh_w(_sum13, _sum12);
                _sum10 = __lsx_vilvl_d(_tmp1, _tmp0);
                _sum11 = __lsx_vilvh_d(_tmp1, _tmp0);
                _sum12 = __lsx_vilvl_d(_tmp3, _tmp2);
                _sum13 = __lsx_vilvh_d(_tmp3, _tmp2);
            }

            _sum00 = __lsx_vadd_w(_sum00, _sum01);
            _sum02 = __lsx_vadd_w(_sum02, _sum03);
            _sum10 = __lsx_vadd_w(_sum10, _sum11);
            _sum12 = __lsx_vadd_w(_sum12, _sum13);

            _sum00 = __lsx_vadd_w(_sum00, _sum02);
            _sum10 = __lsx_vadd_w(_sum10, _sum12);

            // sum[0..3]: position i, channels p..p+3; sum[4..7]: position i + 1
            int sum[8];
            __lsx_vst(_sum00, sum, 0);
            __lsx_vst(_sum10, sum + 4, 0);

            outptr0[0] = sum[0];
            outptr1[0] = sum[1];
            outptr2[0] = sum[2];
            outptr3[0] = sum[3];
            outptr0[1] = sum[4];
            outptr1[1] = sum[5];
            outptr2[1] = sum[6];
            outptr3[1] = sum[7];
            outptr0 += 2;
            outptr1 += 2;
            outptr2 += 2;
            outptr3 += 2;
        }
        // leftover single output position x four output channels
        for (; i < size; i++)
        {
            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
            const signed char* kptr = kernel.channel(p / 4);

            int nn = inch * maxk; // inch always > 0

            __m128i _sum0 = __lsx_vreplgr2vr_w(0);
            __m128i _sum1 = __lsx_vreplgr2vr_w(0);
            __m128i _sum2 = __lsx_vreplgr2vr_w(0);
            __m128i _sum3 = __lsx_vreplgr2vr_w(0);

            int j = 0;
            for (; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 32);
                __builtin_prefetch(kptr + 128);
                // NOTE(review): this loads 16 bytes although tmpptr advances by
                // 8 and only the low 8 bytes are used; the final iteration
                // presumably relies on padding after the buffer - confirm
                __m128i _val = __lsx_vld(tmpptr, 0);
                __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val);

                __m128i _w01 = __lsx_vld(kptr, 0);
                __m128i _w23 = __lsx_vld(kptr + 16, 0);
                __m128i _extw01 = __lsx_vslti_b(_w01, 0);
                __m128i _extw23 = __lsx_vslti_b(_w23, 0);
                __m128i _w0 = __lsx_vilvl_b(_extw01, _w01);
                __m128i _w1 = __lsx_vilvh_b(_extw01, _w01);
                __m128i _w2 = __lsx_vilvl_b(_extw23, _w23);
                __m128i _w3 = __lsx_vilvh_b(_extw23, _w23);

                __m128i _s0 = __lsx_vmul_h(_val16, _w0);
                __m128i _s1 = __lsx_vmul_h(_val16, _w1);
                __m128i _s2 = __lsx_vmul_h(_val16, _w2);
                __m128i _s3 = __lsx_vmul_h(_val16, _w3);

                _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0));
                _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1));
                _sum2 = __lsx_vadd_w(_sum2, __lsx_vhaddw_w_h(_s2, _s2));
                _sum3 = __lsx_vadd_w(_sum3, __lsx_vhaddw_w_h(_s3, _s3));

                tmpptr += 8;
                kptr += 32;
            }

            // transpose 4x4
            // same reduction as above: transpose, then add the four vectors
            {
                __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                _tmp0 = __lsx_vilvl_w(_sum1, _sum0);
                _tmp1 = __lsx_vilvl_w(_sum3, _sum2);
                _tmp2 = __lsx_vilvh_w(_sum1, _sum0);
                _tmp3 = __lsx_vilvh_w(_sum3, _sum2);
                _sum0 = __lsx_vilvl_d(_tmp1, _tmp0);
                _sum1 = __lsx_vilvh_d(_tmp1, _tmp0);
                _sum2 = __lsx_vilvl_d(_tmp3, _tmp2);
                _sum3 = __lsx_vilvh_d(_tmp3, _tmp2);
            }

            _sum0 = __lsx_vadd_w(_sum0, _sum1);
            _sum2 = __lsx_vadd_w(_sum2, _sum3);

            _sum0 = __lsx_vadd_w(_sum0, _sum2);

            int sum[4];
            __lsx_vst(_sum0, sum, 0);

            outptr0[0] = sum[0];
            outptr1[0] = sum[1];
            outptr2[0] = sum[2];
            outptr3[0] = sum[3];
            outptr0 += 1;
            outptr1 += 1;
            outptr2 += 1;
            outptr3 += 1;
        }
    }

    remain_outch_start += nn_outch << 2;

    // leftover output channels, one at a time
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        int* outptr0 = top_blob.channel(p);

        int i = 0;
        // two output positions x one output channel
        for (; i + 1 < size; i += 2)
        {
            const signed char* tmpptr = tmp.channel(i / 2);
            const signed char* kptr = kernel.channel(p / 4 + p % 4);

            int nn = inch * maxk; // inch always > 0

            __m128i _sum0 = __lsx_vreplgr2vr_w(0);
            __m128i _sum1 = __lsx_vreplgr2vr_w(0);

            int j = 0;
            for (; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 64);
                __builtin_prefetch(kptr + 32);
                __m128i _val01 = __lsx_vld(tmpptr, 0);
                __m128i _extval01 = __lsx_vslti_b(_val01, 0);
                __m128i _val0 = __lsx_vilvl_b(_extval01, _val01);
                __m128i _val1 = __lsx_vilvh_b(_extval01, _val01);

                // only the low 8 weight bytes are used (kptr advances by 8)
                __m128i _w = __lsx_vld(kptr, 0);
                __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w);

                __m128i _s0 = __lsx_vmul_h(_val0, _w16);
                __m128i _s1 = __lsx_vmul_h(_val1, _w16);

                _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0));
                _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1));

                tmpptr += 16;
                kptr += 8;
            }

            // __lsx_reduce_add_w is defined outside this view; presumably it
            // sums the four 32-bit lanes - confirm against its definition
            outptr0[0] = __lsx_reduce_add_w(_sum0);
            outptr0[1] = __lsx_reduce_add_w(_sum1);
            outptr0 += 2;
        }
        // leftover single output position x one output channel
        for (; i < size; i++)
        {
            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
            const signed char* kptr = kernel.channel(p / 4 + p % 4);

            int nn = inch * maxk; // inch always > 0

            __m128i _sum = __lsx_vreplgr2vr_w(0);

            int j = 0;
            for (; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 32);
                __builtin_prefetch(kptr + 32);
                // both loads fetch 16 bytes but only the low 8 of each are used
                __m128i _val = __lsx_vld(tmpptr, 0);
                __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val);

                __m128i _w = __lsx_vld(kptr, 0);
                __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w);

                __m128i _s0 = __lsx_vmul_h(_val16, _w16);

                _sum = __lsx_vadd_w(_sum, __lsx_vhaddw_w_h(_s0, _s0));

                tmpptr += 8;
                kptr += 8;
            }

            outptr0[0] = __lsx_reduce_add_w(_sum);
            outptr0 += 1;
        }
    }
}

// Repack int8 convolution weights into the layout read by the pack8to1 int8
// im2col-sgemm kernel.
//
// _kernel is viewed as a (w=maxk, h=inch, c=outch) Mat of bytes. Output
// channels are consumed four at a time into one kernel_tm channel per group;
// every leftover output channel gets a kernel_tm channel of its own. Input
// channels are always consumed in groups of eight (a trailing partial group
// is ignored).
static void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
    const int maxk = kernel_w * kernel_h;

    // source layout      : maxk-inch-outch
    // destination layout : 8a-4b-maxk-inch/8a-outch/4b
    Mat kernel = _kernel.reshape(maxk, inch, outch);

    // rows only need to be 32 * maxk wide when at least one full group of
    // four output channels exists
    const bool has_quad = outch >= 4;
    const int tm_w = (has_quad ? 32 : 8) * maxk;
    const int tm_c = has_quad ? outch / 4 + outch % 4 : outch;
    kernel_tm.create(tm_w, inch / 8, tm_c, (size_t)1u);

    const int quad_count = outch / 4;

    // complete groups of four output channels
    for (int qq = 0; qq < quad_count; qq++)
    {
        const int q = qq * 4;
        signed char* dst = kernel_tm.channel(qq);

        for (int p = 0; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                // 4 outputs x 8 inputs per tap, input index varying fastest
                for (int n = 0; n < 32; n++)
                {
                    const int out_lane = n / 8;
                    const int in_lane = n % 8;

                    const signed char* src = kernel.channel(q + out_lane).row<const signed char>(p + in_lane);

                    *dst++ = src[k];
                }
            }
        }
    }

    // TODO unroll 2
    // leftover output channels, one destination channel each
    for (int q = quad_count * 4; q < outch; q++)
    {
        signed char* dst = kernel_tm.channel(q / 4 + q % 4);

        for (int p = 0; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                // eight consecutive input channels for this tap
                for (int lane = 0; lane < 8; lane++)
                {
                    const signed char* src = kernel.channel(q).row<const signed char>(p + lane);

                    *dst++ = src[k];
                }
            }
        }
    }
}

// Convolution entry point for 8-packed int8 input: expands bottom_blob into an
// im2col buffer (one 8-byte element per output position and kernel tap) and
// hands it to im2col_sgemm_pack8to1_int8_lsx together with the kernel.
static void convolution_im2col_sgemm_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    const int w = bottom_blob.w;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int size = outw * outh;

    const int maxk = kernel_w * kernel_h;

    // im2col buffer: w = output positions, h = kernel taps, c = input channels
    Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);

    // strides are counted in 8-byte packed elements
    const int row_gap = w * stride_h - outw * stride_w;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < inch; p++)
    {
        const Mat img = bottom_blob.channel(p);
        int64_t* dst = bottom_im2col.channel(p);

        for (int u = 0; u < kernel_h; u++)
        {
            for (int v = 0; v < kernel_w; v++)
            {
                // first input element touched by tap (u, v)
                const int64_t* src = img.row<const int64_t>(dilation_h * u) + dilation_w * v;

                for (int y = 0; y < outh; y++)
                {
                    for (int x = 0; x < outw; x++)
                    {
                        // copy one packed element (8 int8 values) unchanged
                        *dst++ = *src;

                        src += stride_w;
                    }

                    // jump to the start of the next strided input row
                    src += row_gap;
                }
            }
        }
    }

    im2col_sgemm_pack8to1_int8_lsx(bottom_im2col, top_blob, kernel, opt);
}


================================================
FILE: src/layer/loongarch/convolution_sgemm_pack8to4_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// int8 GEMM over an im2col workspace, producing 4 int32 sums per output
// position and per top_blob channel (LSX implementation).
//
// bottom_im2col : w = number of output positions (size), h = maxk, c = inch;
//                 each element is 8 bytes (read here as int64_t / 8 x int8).
// top_blob      : written through int* per channel; 4 ints are stored for
//                 every output position, in position order.
// kernel        : read per channel as a flat signed char stream, 32 bytes
//                 per (inch, maxk) step. NOTE(review): the 4x8 ordering is
//                 presumably the one produced by
//                 convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx.
// opt           : supplies the workspace allocator and the OpenMP thread count.
static void im2col_sgemm_pack8to4_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);

    const int size = bottom_im2col.w;
    const int maxk = bottom_im2col.h;
    const int inch = bottom_im2col.c;

    const int outch = top_blob.c;

    // permute
    // Re-order the workspace so the GEMM loops below can stream it linearly:
    // output positions are grouped in pairs (one tmp channel per pair) and a
    // possible odd leftover position gets a channel of its own.
    Mat tmp;
    if (size >= 2)
        tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator);
    else
        tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator);
    {
        int remain_size_start = 0;
        int nn_size = size >> 1;

        // pairs of output positions (i, i + 1)
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 2;

            int64_t* tmpptr = tmp.channel(i / 2);

            for (int q = 0; q < inch; q++)
            {
                const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    // one 16-byte load/store moves the two adjacent 8-byte
                    // elements of positions i and i + 1
                    __m128i _v = __lsx_vld(img0, 0);
                    __lsx_vst(_v, tmpptr, 0);
                    tmpptr += 2;
                    // next kernel tap: rows of bottom_im2col are size elements apart
                    img0 += size;
                }
            }
        }

        remain_size_start += nn_size << 1;

        // leftover single position (at most one, since pairs were taken above)
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_size_start; i < size; i++)
        {
            int64_t* tmpptr = tmp.channel(i / 2 + i % 2);

            for (int q = 0; q < inch; q++)
            {
                const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];
                    tmpptr += 1;
                    img0 += size;
                }
            }
        }
    }

    // GEMM: each top_blob channel is independent, so channels are spread over threads
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr0 = top_blob.channel(p);

        int i = 0;
        // two output positions per iteration
        for (; i + 1 < size; i += 2)
        {
            const signed char* tmpptr = tmp.channel(i / 2);
            const signed char* kptr = kernel.channel(p);

            int nn = inch * maxk; // inch always > 0

            // _sumXY: position X (0/1) against weight group Y (0..3)
            __m128i _sum00 = __lsx_vreplgr2vr_w(0);
            __m128i _sum01 = __lsx_vreplgr2vr_w(0);
            __m128i _sum02 = __lsx_vreplgr2vr_w(0);
            __m128i _sum03 = __lsx_vreplgr2vr_w(0);
            __m128i _sum10 = __lsx_vreplgr2vr_w(0);
            __m128i _sum11 = __lsx_vreplgr2vr_w(0);
            __m128i _sum12 = __lsx_vreplgr2vr_w(0);
            __m128i _sum13 = __lsx_vreplgr2vr_w(0);

            int j = 0;
            for (; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 64);
                __builtin_prefetch(kptr + 128);
                // 16 input bytes = 8 int8 of position i followed by 8 int8 of position i + 1;
                // interleaving with the per-byte "< 0" mask widens them to 16-bit lanes
                __m128i _val01 = __lsx_vld(tmpptr, 0);
                __m128i _extval01 = __lsx_vslti_b(_val01, 0);
                __m128i _val0 = __lsx_vilvl_b(_extval01, _val01);
                __m128i _val1 = __lsx_vilvh_b(_extval01, _val01);

                // 32 weight bytes = 4 groups of 8 int8, widened the same way
                __m128i _w01 = __lsx_vld(kptr, 0);
                __m128i _w23 = __lsx_vld(kptr + 16, 0);
                __m128i _extw01 = __lsx_vslti_b(_w01, 0);
                __m128i _extw23 = __lsx_vslti_b(_w23, 0);
                __m128i _w0 = __lsx_vilvl_b(_extw01, _w01);
                __m128i _w1 = __lsx_vilvh_b(_extw01, _w01);
                __m128i _w2 = __lsx_vilvl_b(_extw23, _w23);
                __m128i _w3 = __lsx_vilvh_b(_extw23, _w23);

                // 16-bit lane-wise products of each position with each weight group
                __m128i _s00 = __lsx_vmul_h(_val0, _w0);
                __m128i _s01 = __lsx_vmul_h(_val0, _w1);
                __m128i _s02 = __lsx_vmul_h(_val0, _w2);
                __m128i _s03 = __lsx_vmul_h(_val0, _w3);
                __m128i _s10 = __lsx_vmul_h(_val1, _w0);
                __m128i _s11 = __lsx_vmul_h(_val1, _w1);
                __m128i _s12 = __lsx_vmul_h(_val1, _w2);
                __m128i _s13 = __lsx_vmul_h(_val1, _w3);

                // add neighbouring 16-bit products into 32-bit lanes and accumulate
                _sum00 = __lsx_vadd_w(_sum00, __lsx_vhaddw_w_h(_s00, _s00));
                _sum01 = __lsx_vadd_w(_sum01, __lsx_vhaddw_w_h(_s01, _s01));
                _sum02 = __lsx_vadd_w(_sum02, __lsx_vhaddw_w_h(_s02, _s02));
                _sum03 = __lsx_vadd_w(_sum03, __lsx_vhaddw_w_h(_s03, _s03));
                _sum10 = __lsx_vadd_w(_sum10, __lsx_vhaddw_w_h(_s10, _s10));
                _sum11 = __lsx_vadd_w(_sum11, __lsx_vhaddw_w_h(_s11, _s11));
                _sum12 = __lsx_vadd_w(_sum12, __lsx_vhaddw_w_h(_s12, _s12));
                _sum13 = __lsx_vadd_w(_sum13, __lsx_vhaddw_w_h(_s13, _s13));

                tmpptr += 16;
                kptr += 32;
            }

            // transpose 4x4
            // After the transpose, adding the four vectors sums the 4 partial
            // lanes of every accumulator, leaving one total per weight group
            // in lanes 0..3 of the result.
            {
                __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                _tmp0 = __lsx_vilvl_w(_sum01, _sum00);
                _tmp1 = __lsx_vilvl_w(_sum03, _sum02);
                _tmp2 = __lsx_vilvh_w(_sum01, _sum00);
                _tmp3 = __lsx_vilvh_w(_sum03, _sum02);
                _sum00 = __lsx_vilvl_d(_tmp1, _tmp0);
                _sum01 = __lsx_vilvh_d(_tmp1, _tmp0);
                _sum02 = __lsx_vilvl_d(_tmp3, _tmp2);
                _sum03 = __lsx_vilvh_d(_tmp3, _tmp2);
            }
            {
                __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                _tmp0 = __lsx_vilvl_w(_sum11, _sum10);
                _tmp1 = __lsx_vilvl_w(_sum13, _sum12);
                _tmp2 = __lsx_vilvh_w(_sum11, _sum10);
                _tmp3 = __lsx_vilvh_w(_sum13, _sum12);
                _sum10 = __lsx_vilvl_d(_tmp1, _tmp0);
                _sum11 = __lsx_vilvh_d(_tmp1, _tmp0);
                _sum12 = __lsx_vilvl_d(_tmp3, _tmp2);
                _sum13 = __lsx_vilvh_d(_tmp3, _tmp2);
            }

            _sum00 = __lsx_vadd_w(_sum00, _sum01);
            _sum02 = __lsx_vadd_w(_sum02, _sum03);
            _sum10 = __lsx_vadd_w(_sum10, _sum11);
            _sum12 = __lsx_vadd_w(_sum12, _sum13);

            _sum00 = __lsx_vadd_w(_sum00, _sum02);
            _sum10 = __lsx_vadd_w(_sum10, _sum12);

            // 4 ints for position i, then 4 ints for position i + 1
            __lsx_vst(_sum00, outptr0, 0);
            __lsx_vst(_sum10, outptr0 + 4, 0);
            outptr0 += 8;
        }
        // leftover single output position
        for (; i < size; i++)
        {
            const signed char* tmpptr = tmp.channel(i / 2 + i % 2);
            const signed char* kptr = kernel.channel(p);

            int nn = inch * maxk; // inch always > 0

            __m128i _sum0 = __lsx_vreplgr2vr_w(0);
            __m128i _sum1 = __lsx_vreplgr2vr_w(0);
            __m128i _sum2 = __lsx_vreplgr2vr_w(0);
            __m128i _sum3 = __lsx_vreplgr2vr_w(0);

            int j = 0;
            for (; j < nn; j++)
            {
                __builtin_prefetch(tmpptr + 32);
                __builtin_prefetch(kptr + 128);
                // NOTE(review): this is a 16-byte load although tmpptr only
                // advances by 8 per step and only the low 8 bytes are used
                // (vilvl_b); the final step therefore reads 8 bytes past the
                // last element - confirm the allocator pads for over-reads.
                __m128i _val = __lsx_vld(tmpptr, 0);
                __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val);

                __m128i _w01 = __lsx_vld(kptr, 0);
                __m128i _w23 = __lsx_vld(kptr + 16, 0);
                __m128i _extw01 = __lsx_vslti_b(_w01, 0);
                __m128i _extw23 = __lsx_vslti_b(_w23, 0);
                __m128i _w0 = __lsx_vilvl_b(_extw01, _w01);
                __m128i _w1 = __lsx_vilvh_b(_extw01, _w01);
                __m128i _w2 = __lsx_vilvl_b(_extw23, _w23);
                __m128i _w3 = __lsx_vilvh_b(_extw23, _w23);

                __m128i _s0 = __lsx_vmul_h(_val16, _w0);
                __m128i _s1 = __lsx_vmul_h(_val16, _w1);
                __m128i _s2 = __lsx_vmul_h(_val16, _w2);
                __m128i _s3 = __lsx_vmul_h(_val16, _w3);

                _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0));
                _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1));
                _sum2 = __lsx_vadd_w(_sum2, __lsx_vhaddw_w_h(_s2, _s2));
                _sum3 = __lsx_vadd_w(_sum3, __lsx_vhaddw_w_h(_s3, _s3));

                tmpptr += 8;
                kptr += 32;
            }

            // transpose 4x4
            // same lane reduction as in the paired loop above
            {
                __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                _tmp0 = __lsx_vilvl_w(_sum1, _sum0);
                _tmp1 = __lsx_vilvl_w(_sum3, _sum2);
                _tmp2 = __lsx_vilvh_w(_sum1, _sum0);
                _tmp3 = __lsx_vilvh_w(_sum3, _sum2);
                _sum0 = __lsx_vilvl_d(_tmp1, _tmp0);
                _sum1 = __lsx_vilvh_d(_tmp1, _tmp0);
                _sum2 = __lsx_vilvl_d(_tmp3, _tmp2);
                _sum3 = __lsx_vilvh_d(_tmp3, _tmp2);
            }

            _sum0 = __lsx_vadd_w(_sum0, _sum1);
            _sum2 = __lsx_vadd_w(_sum2, _sum3);

            _sum0 = __lsx_vadd_w(_sum0, _sum2);

            __lsx_vst(_sum0, outptr0, 0);
            outptr0 += 4;
        }
    }
}

// Repack int8 convolution weights for the pack8to4 im2col GEMM.
//
// _kernel is viewed as (w = maxk, h = inch, c = outch). kernel_tm is created
// as (w = 32 * maxk, h = inch / 8, c = outch / 4) with 1-byte elements. For
// each block of 4 output channels, each block of 8 input channels and each
// kernel tap, 32 consecutive bytes are emitted: output channel varies slowest
// (4), input channel fastest (8). Output/input channels that do not fill a
// complete block of 4 / 8 are not copied.
static void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
    const int maxk = kernel_w * kernel_h;

    // interleave
    // src = maxk-inch-outch
    // dst = 8a-4b-maxk-inch/8a-outch/4b
    Mat kernel = _kernel.reshape(maxk, inch, outch);
    kernel_tm.create(32 * maxk, inch / 8, outch / 4, (size_t)1u);

    for (int q = 0; q + 3 < outch; q += 4)
    {
        signed char* dst = kernel_tm.channel(q / 4);

        for (int p = 0; p + 7 < inch; p += 8)
        {
            for (int k = 0; k < maxk; k++)
            {
                // 32 bytes per tap: lane / 8 selects the output channel,
                // lane % 8 the input channel inside the current blocks
                for (int lane = 0; lane < 32; lane++)
                {
                    const int oc = q + lane / 8;
                    const int ic = p + lane % 8;

                    const signed char* src_row = kernel.channel(oc).row<const signed char>(ic);

                    *dst++ = src_row[k];
                }
            }
        }
    }
}

static void convolution_im2col_sgemm_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    const int size = outw * outh;

    const int maxk = kernel_w * kernel_h;

    // im2col
    Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator);
    {
        const int gap = w * stride_h - outw * stride_w;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < inch; p++)
        {
            const Mat img = bottom_blob.channel(p);
            int64_t* ptr = bottom_im2col.channel(p);

            for (int u = 0; u < kernel_h; u++)
            {
                for (int v = 0; v < kernel_w; v++)
                {
                    const int64_t* sptr = img.row<const int64_t>(dilation_h * u) + dilation_w * v;

                    for (int i = 0; i < outh; i++)
                    {
                        int j = 0;
                        for (; j < outw; j++)
                        {
                            ptr[0] = sptr[0];

                            sptr += stride_w;
                            ptr += 1;
                        }

                        sptr += gap;
                    }
                }
            }
        }
    }

    im2col_sgemm_pack8to4_int8_lsx(bottom_im2col, top_blob, kernel, opt);
}


================================================
FILE: src/layer/loongarch/convolution_winograd_dot.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// Batched fp32 dot-product stage of the winograd convolution.
//
// For every output channel p, every batch row r and every tile i this
// computes  sum over input channels q of  input(q, r, i) * weight(p, r, q)
// and stores it to top_blob_tm.
//
// bottom_blob_tm : in/out. Read as w = tiles, h = batch, c = inch (float).
//                  It is reset to an empty Mat once it has been permuted.
// outch          : number of output channels to produce.
// kernel_tm      : weights; channels are indexed by blocks of output
//                  channels and row r holds the weights of batch row r.
//                  NOTE(review): the block layout (8/4/1 with LSX, 2/1
//                  without) must match the kernel transform, which is not
//                  visible here.
// top_blob_tm    : created here as (tiles, batch, outch), 4-byte elements.
// opt            : supplies the workspace allocator and the OpenMP thread count.
static void convolution_winograd_dot_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt)
{
    // Mat bottom_blob_tm(tiles, 16/36/64, inch, 4u, opt.workspace_allocator);

    const int tiles = bottom_blob_tm.w;
    const int batch = bottom_blob_tm.h;
    const int inch = bottom_blob_tm.c;

    // permute
    // bottom_blob_tm2: one channel per batch row; tiles are grouped by 4 (one
    // row of 4 * inch floats per group), leftover tiles get a row of inch
    // floats each.
    Mat bottom_blob_tm2;
    if (tiles >= 4)
        bottom_blob_tm2.create(4 * inch, tiles / 4 + tiles % 4, batch, 4u, opt.workspace_allocator);
    else
        bottom_blob_tm2.create(1 * inch, tiles, batch, 4u, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int r = 0; r < batch; r++)
    {
        Mat tm2 = bottom_blob_tm2.channel(r);

        // tile
        int i = 0;
        for (; i + 3 < tiles; i += 4)
        {
            float* tmpptr = tm2.row(i / 4);

            const float* r0 = bottom_blob_tm;

            // tile i of batch row r inside channel 0
            r0 += (r * tiles + i);

            for (int q = 0; q < inch; q++)
            {
                // copy 4 consecutive tiles of input channel q
#if __loongarch_sx
                __lsx_vst(__lsx_vld(r0, 0), tmpptr, 0);
#else
                tmpptr[0] = r0[0];
                tmpptr[1] = r0[1];
                tmpptr[2] = r0[2];
                tmpptr[3] = r0[3];
#endif

                // same position in the next input channel
                r0 += bottom_blob_tm.cstep;
                tmpptr += 4;
            }
        }
        for (; i < tiles; i++)
        {
            // leftover tiles: rows after the tiles / 4 group rows
            float* tmpptr = tm2.row(i / 4 + i % 4);

            const float* r0 = bottom_blob_tm;

            r0 += (r * tiles + i);

            for (int q = 0; q < inch; q++)
            {
                tmpptr[0] = r0[0];

                r0 += bottom_blob_tm.cstep;
                tmpptr += 1;
            }
        }
    }

    // the permuted copy is used from here on; this empties the caller's Mat
    bottom_blob_tm = Mat();
    // permute end

    top_blob_tm.create(tiles, batch, outch, 4u, opt.workspace_allocator);

#if __loongarch_sx
    // LSX path, stage 1: blocks of 8 output channels
    int nn_outch = outch >> 3;
    int remain_outch_start = nn_outch << 3;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 8;

        float* output0_tm = top_blob_tm.channel(p);
        float* output1_tm = top_blob_tm.channel(p + 1);
        float* output2_tm = top_blob_tm.channel(p + 2);
        float* output3_tm = top_blob_tm.channel(p + 3);
        float* output4_tm = top_blob_tm.channel(p + 4);
        float* output5_tm = top_blob_tm.channel(p + 5);
        float* output6_tm = top_blob_tm.channel(p + 6);
        float* output7_tm = top_blob_tm.channel(p + 7);

        const Mat kernel0_tm = kernel_tm.channel(p / 8);

        // output pointers run continuously over (r, i), i.e. batch-major, tile-minor
        for (int r = 0; r < batch; r++)
        {
            const Mat bb2 = bottom_blob_tm2.channel(r);

            int i = 0;
            for (; i + 3 < tiles; i += 4)
            {
                const float* r0 = bb2.row(i / 4);
                const float* k0 = kernel0_tm.row(r);

                int nn = inch; // inch always > 0

                // _sumN: 4 tiles of output channel p + N
                __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0);
                __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0);
                __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0);
                __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0);
                __m128 _sum4 = (__m128)__lsx_vreplgr2vr_w(0);
                __m128 _sum5 = (__m128)__lsx_vreplgr2vr_w(0);
                __m128 _sum6 = (__m128)__lsx_vreplgr2vr_w(0);
                __m128 _sum7 = (__m128)__lsx_vreplgr2vr_w(0);

                int j = 0;
                for (; j < nn; j++)
                {
                    __builtin_prefetch(r0 + 16);
                    __builtin_prefetch(k0 + 32);
                    // 4 tiles of one input channel and the 8 weights of that
                    // input channel; each weight is broadcast and
                    // multiply-accumulated against the 4 tiles
                    __m128 _val = (__m128)__lsx_vld(r0, 0);
                    __m128i _w0123 = __lsx_vld(k0, 0);
                    __m128i _w4567 = __lsx_vld(k0 + 4, 0);
                    _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0);
                    _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1);
                    _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2);
                    _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3);
                    _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 0), _val, _sum4);
                    _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 1), _val, _sum5);
                    _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 2), _val, _sum6);
                    _sum7 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 3), _val, _sum7);

                    r0 += 4;
                    k0 += 8;
                }

                __lsx_vst(_sum0, output0_tm, 0);
                __lsx_vst(_sum1, output1_tm, 0);
                __lsx_vst(_sum2, output2_tm, 0);
                __lsx_vst(_sum3, output3_tm, 0);
                __lsx_vst(_sum4, output4_tm, 0);
                __lsx_vst(_sum5, output5_tm, 0);
                __lsx_vst(_sum6, output6_tm, 0);
                __lsx_vst(_sum7, output7_tm, 0);

                output0_tm += 4;
                output1_tm += 4;
                output2_tm += 4;
                output3_tm += 4;
                output4_tm += 4;
                output5_tm += 4;
                output6_tm += 4;
                output7_tm += 4;
            }
            // leftover tiles, scalar, still 8 output channels at once
            for (; i < tiles; i++)
            {
                const float* r0 = bb2.row(i / 4 + i % 4);
                const float* k0 = kernel0_tm.row(r);

                int nn = inch; // inch always > 0

                float sum0 = 0.f;
                float sum1 = 0.f;
                float sum2 = 0.f;
                float sum3 = 0.f;
                float sum4 = 0.f;
                float sum5 = 0.f;
                float sum6 = 0.f;
                float sum7 = 0.f;

                int j = 0;
                for (; j < nn; j++)
                {
                    sum0 += r0[0] * k0[0];
                    sum1 += r0[0] * k0[1];
                    sum2 += r0[0] * k0[2];
                    sum3 += r0[0] * k0[3];
                    sum4 += r0[0] * k0[4];
                    sum5 += r0[0] * k0[5];
                    sum6 += r0[0] * k0[6];
                    sum7 += r0[0] * k0[7];

                    r0 += 1;
                    k0 += 8;
                }

                output0_tm[0] = sum0;
                output1_tm[0] = sum1;
                output2_tm[0] = sum2;
                output3_tm[0] = sum3;
                output4_tm[0] = sum4;
                output5_tm[0] = sum5;
                output6_tm[0] = sum6;
                output7_tm[0] = sum7;

                output0_tm++;
                output1_tm++;
                output2_tm++;
                output3_tm++;
                output4_tm++;
                output5_tm++;
                output6_tm++;
                output7_tm++;
            }
        }
    }

    // LSX path, stage 2: blocks of 4 output channels out of what is left
    nn_outch = (outch - remain_outch_start) >> 2;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = remain_outch_start + pp * 4;

        float* output0_tm = top_blob_tm.channel(p);
        float* output1_tm = top_blob_tm.channel(p + 1);
        float* output2_tm = top_blob_tm.channel(p + 2);
        float* output3_tm = top_blob_tm.channel(p + 3);

        // kernel channel index: 8-blocks first, then the 4-block
        const Mat kernel0_tm = kernel_tm.channel(p / 8 + (p % 8) / 4);

        for (int r = 0; r < batch; r++)
        {
            const Mat bb2 = bottom_blob_tm2.channel(r);

            int i = 0;
            for (; i + 3 < tiles; i += 4)
            {
                const float* r0 = bb2.row(i / 4);
                const float* k0 = kernel0_tm.row(r);

                int nn = inch; // inch always > 0

                __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0);
                __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0);
                __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0);
                __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0);

                int j = 0;
                for (; j < nn; j++)
                {
                    __builtin_prefetch(r0 + 16);
                    __builtin_prefetch(k0 + 16);
                    __m128 _val = (__m128)__lsx_vld(r0, 0);
                    __m128i _w0123 = __lsx_vld(k0, 0);
                    _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0);
                    _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1);
                    _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2);
                    _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3);

                    r0 += 4;
                    k0 += 4;
                }

                __lsx_vst(_sum0, output0_tm, 0);
                __lsx_vst(_sum1, output1_tm, 0);
                __lsx_vst(_sum2, output2_tm, 0);
                __lsx_vst(_sum3, output3_tm, 0);

                output0_tm += 4;
                output1_tm += 4;
                output2_tm += 4;
                output3_tm += 4;
            }
            for (; i < tiles; i++)
            {
                const float* r0 = bb2.row(i / 4 + i % 4);
                const float* k0 = kernel0_tm.row(r);

                int nn = inch; // inch always > 0

                float sum0 = 0.f;
                float sum1 = 0.f;
                float sum2 = 0.f;
                float sum3 = 0.f;

                int j = 0;
                for (; j < nn; j++)
                {
                    sum0 += r0[0] * k0[0];
                    sum1 += r0[0] * k0[1];
                    sum2 += r0[0] * k0[2];
                    sum3 += r0[0] * k0[3];

                    r0 += 1;
                    k0 += 4;
                }

                output0_tm[0] = sum0;
                output1_tm[0] = sum1;
                output2_tm[0] = sum2;
                output3_tm[0] = sum3;

                output0_tm++;
                output1_tm++;
                output2_tm++;
                output3_tm++;
            }
        }
    }

    remain_outch_start += nn_outch << 2;
#else
    // scalar path: blocks of 2 output channels
    int nn_outch = outch >> 1;
    int remain_outch_start = nn_outch << 1;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 2;

        float* output0_tm = top_blob_tm.channel(p);
        float* output1_tm = top_blob_tm.channel(p + 1);

        const Mat kernel0_tm = kernel_tm.channel(p / 2);

        for (int r = 0; r < batch; r++)
        {
            const Mat bb2 = bottom_blob_tm2.channel(r);

            int i = 0;
            for (; i + 3 < tiles; i += 4)
            {
                const float* r0 = bb2.row(i / 4);
                const float* k0 = kernel0_tm.row(r);

                int nn = inch; // inch always > 0

                // sumCT: output channel p + C, tile i + T
                float sum00 = 0.f;
                float sum01 = 0.f;
                float sum02 = 0.f;
                float sum03 = 0.f;
                float sum10 = 0.f;
                float sum11 = 0.f;
                float sum12 = 0.f;
                float sum13 = 0.f;

                for (int j = 0; j < nn; j++)
                {
                    __builtin_prefetch(r0 + 16);
                    __builtin_prefetch(k0 + 8);
                    float w0 = k0[0];
                    float w1 = k0[1];
                    sum00 += r0[0] * w0;
                    sum01 += r0[1] * w0;
                    sum02 += r0[2] * w0;
                    sum03 += r0[3] * w0;
                    sum10 += r0[0] * w1;
                    sum11 += r0[1] * w1;
                    sum12 += r0[2] * w1;
                    sum13 += r0[3] * w1;

                    r0 += 4;
                    k0 += 2;
                }

                output0_tm[0] = sum00;
                output0_tm[1] = sum01;
                output0_tm[2] = sum02;
                output0_tm[3] = sum03;
                output1_tm[0] = sum10;
                output1_tm[1] = sum11;
                output1_tm[2] = sum12;
                output1_tm[3] = sum13;

                output0_tm += 4;
                output1_tm += 4;
            }
            for (; i < tiles; i++)
            {
                const float* r0 = bb2.row(i / 4 + i % 4);
                const float* k0 = kernel0_tm.row(r);

                int nn = inch; // inch always > 0

                float sum00 = 0.f;
                float sum10 = 0.f;

                for (int j = 0; j < nn; j++)
                {
                    __builtin_prefetch(r0 + 4);
                    __builtin_prefetch(k0 + 8);
                    float val0 = r0[0];
                    sum00 += val0 * k0[0];
                    sum10 += val0 * k0[1];

                    r0 += 1;
                    k0 += 2;
                }

                output0_tm[0] = sum00;
                output1_tm[0] = sum10;
                output0_tm++;
                output1_tm++;
            }
        }
    }
#endif

    // remaining output channels, one at a time (both paths)
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        float* output0_tm = top_blob_tm.channel(p);

        // kernel channel index: full blocks first, then one channel per leftover output channel
#if __loongarch_sx
        const Mat kernel0_tm = kernel_tm.channel(p / 8 + (p % 8) / 4 + p % 4);
#else
        const Mat kernel0_tm = kernel_tm.channel(p / 2 + p % 2);
#endif

        for (int r = 0; r < batch; r++)
        {
            const Mat bb2 = bottom_blob_tm2.channel(r);

            int i = 0;
            for (; i + 3 < tiles; i += 4)
            {
                const float* r0 = bb2.row(i / 4);
                const float* k0 = kernel0_tm.row(r);

                int nn = inch; // inch always > 0

                int j = 0;
#if __loongarch_sx
                __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0);

                for (; j < nn; j++)
                {
                    // 4 tiles times one broadcast weight
                    _sum0 = __lsx_vfmadd_s((__m128)__lsx_vld(r0, 0), __lsx_vreplfr2vr_s(k0[0]), _sum0);
                    r0 += 4;
                    k0++;
                }

                __lsx_vst(_sum0, output0_tm, 0);
                output0_tm += 4;
#else  // __loongarch_sx
                float sum0 = 0.f;
                float sum1 = 0.f;
                float sum2 = 0.f;
                float sum3 = 0.f;

                for (; j < nn; j++)
                {
                    __builtin_prefetch(r0 + 16);
                    __builtin_prefetch(k0 + 4);
                    float w0 = k0[0];
                    sum0 += r0[0] * w0;
                    sum1 += r0[1] * w0;
                    sum2 += r0[2] * w0;
                    sum3 += r0[3] * w0;

                    r0 += 4;
                    k0++;
                }

                output0_tm[0] = sum0;
                output0_tm[1] = sum1;
                output0_tm[2] = sum2;
                output0_tm[3] = sum3;
                output0_tm += 4;
#endif // __loongarch_sx
            }
            for (; i < tiles; i++)
            {
                const float* r0 = bb2.row(i / 4 + i % 4);
                const float* k0 = kernel0_tm.row(r);

                int nn = inch; // inch always > 0

                float sum = 0.f;

                for (int j = 0; j < nn; j++)
                {
                    float w0 = k0[0];
                    float val0 = r0[0];
                    sum += val0 * w0;

                    r0 += 1;
                    k0 += 1;
                }

                output0_tm[0] = sum;
                output0_tm += 1;
            }
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_winograd_dot_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void convolution_winograd_dot_int8_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt)
{
    // Mat bottom_blob_tm(tiles, 16/36/64, inch, 2u, 1, opt.workspace_allocator);

    const int tiles = bottom_blob_tm.w;
    const int batch = bottom_blob_tm.h;
    const int inch = bottom_blob_tm.c;

    // permute
    Mat bottom_blob_tm2;
#if __loongarch_sx
    if (inch >= 4)
    {
        if (tiles >= 2)
            bottom_blob_tm2.create(inch / 4 + inch % 4, tiles / 2 + tiles % 2, batch, 16u, 8, opt.workspace_allocator);
        else // if (tiles >= 1)
            bottom_blob_tm2.create(inch / 4 + inch % 4, tiles, batch, 8u, 4, opt.workspace_allocator);
    }
    else
#endif // __loongarch_sx
    {
        if (tiles >= 2)
            bottom_blob_tm2.create(inch, tiles / 2 + tiles % 2, batch, 4u, 2, opt.workspace_allocator);
        else // if (tiles >= 1)
            bottom_blob_tm2.create(inch, tiles, batch, 2u, 1, opt.workspace_allocator);
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int r = 0; r < batch; r++)
    {
        Mat tm2 = bottom_blob_tm2.channel(r);

        // tile
        int i = 0;
        for (; i + 1 < tiles; i += 2)
        {
            short* tmpptr = tm2.row<short>(i / 2);

            const short* r0 = (const short*)bottom_blob_tm + r * tiles + i;

            int q = 0;
#if __loongarch_sx
            const short* r1 = (const short*)bottom_blob_tm.channel(1) + r * tiles + i;
            const short* r2 = (const short*)bottom_blob_tm.channel(2) + r * tiles + i;
            const short* r3 = (const short*)bottom_blob_tm.channel(3) + r * tiles + i;
            for (; q + 3 < inch; q += 4)
            {
                tmpptr[0] = r0[0];
                tmpptr[1] = r1[0];
                tmpptr[2] = r2[0];
                tmpptr[3] = r3[0];
                tmpptr[4] = r0[1];
                tmpptr[5] = r1[1];
                tmpptr[6] = r2[1];
                tmpptr[7] = r3[1];
                r0 += bottom_blob_tm.cstep * 4;
                r1 += bottom_blob_tm.cstep * 4;
                r2 += bottom_blob_tm.cstep * 4;
                r3 += bottom_blob_tm.cstep * 4;
                tmpptr += 8;
            }
#endif // __loongarch_sx
            for (; q < inch; q++)
            {
                tmpptr[0] = r0[0];
                tmpptr[1] = r0[1];
                r0 += bottom_blob_tm.cstep;
                tmpptr += 2;
            }
        }
        for (; i < tiles; i++)
        {
            short* tmpptr = tm2.row<short>(i / 2 + i % 2);

            const short* r0 = (const short*)bottom_blob_tm + r * tiles + i;

            int q = 0;
#if __loongarch_sx
            const short* r1 = (const short*)bottom_blob_tm.channel(1) + r * tiles + i;
            const short* r2 = (const short*)bottom_blob_tm.channel(2) + r * tiles + i;
            const short* r3 = (const short*)bottom_blob_tm.channel(3) + r * tiles + i;
            for (; q + 3 < inch; q += 4)
            {
                tmpptr[0] = r0[0];
                tmpptr[1] = r1[0];
                tmpptr[2] = r2[0];
                tmpptr[3] = r3[0];
                r0 += bottom_blob_tm.cstep * 4;
                r1 += bottom_blob_tm.cstep * 4;
                r2 += bottom_blob_tm.cstep * 4;
                r3 += bottom_blob_tm.cstep * 4;
                tmpptr += 4;
            }
#endif // __loongarch_sx
            for (; q < inch; q++)
            {
                tmpptr[0] = r0[0];
                r0 += bottom_blob_tm.cstep;
                tmpptr += 1;
            }
        }
    }

    bottom_blob_tm = Mat();
    // permute end

    top_blob_tm.create(tiles, batch, outch, 4u, 1, opt.workspace_allocator);

#if __loongarch_sx
    int nn_outch = outch >> 2;
    int remain_outch_start = nn_outch << 2;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 4;

        int* output0_tm = top_blob_tm.channel(p);
        int* output1_tm = top_blob_tm.channel(p + 1);
        int* output2_tm = top_blob_tm.channel(p + 2);
        int* output3_tm = top_blob_tm.channel(p + 3);

        const Mat kernel0_tm = kernel_tm.channel(p / 4);

        for (int r = 0; r < batch; r++)
        {
            const Mat bb2 = bottom_blob_tm2.channel(r);

            int i = 0;
            for (; i + 1 < tiles; i += 2)
            {
                const short* r0 = bb2.row<const short>(i / 2);
                const short* k0 = kernel0_tm.row<const short>(r);

                int nn4 = inch / 4;
                int nn1 = inch % 4;

                __m128i _sum00 = __lsx_vreplgr2vr_w(0);
                __m128i _sum10 = __lsx_vreplgr2vr_w(0);

                if (nn4 > 0)
                {
                    __m128i _sum01 = __lsx_vreplgr2vr_w(0);
                    __m128i _sum02 = __lsx_vreplgr2vr_w(0);
                    __m128i _sum03 = __lsx_vreplgr2vr_w(0);
                    __m128i _sum11 = __lsx_vreplgr2vr_w(0);
                    __m128i _sum12 = __lsx_vreplgr2vr_w(0);
                    __m128i _sum13 = __lsx_vreplgr2vr_w(0);

                    int j = 0;
                    for (; j < nn4; j++)
                    {
                        __m128i _val01 = __lsx_vld(r0, 0);

                        __m128i _val0 = __lsx_vilvl_d(_val01, _val01);
                        __m128i _val1 = __lsx_vilvh_d(_val01, _val01);

                        __m128i _w0 = __lsx_vld(k0, 0);
                        __m128i _w1 = __lsx_vld(k0 + 8, 0);

                        __m128i _extval0 = __lsx_vslti_h(_val0, 0);
                        __m128i _extval1 = __lsx_vslti_h(_val1, 0);
                        __m128i _extw0 = __lsx_vslti_h(_w0, 0);
                        __m128i _extw1 = __lsx_vslti_h(_w1, 0);

                        __m128i _val0l = __lsx_vilvl_h(_extval0, _val0);
                        __m128i _val0h = __lsx_vilvh_h(_extval0, _val0);
                        __m128i _val1l = __lsx_vilvl_h(_extval1, _val1);
                        __m128i _val1h = __lsx_vilvh_h(_extval1, _val1);

                        __m128i _w0l = __lsx_vilvl_h(_extw0, _w0);
                        __m128i _w0h = __lsx_vilvh_h(_extw0, _w0);
                        __m128i _w1l = __lsx_vilvl_h(_extw1, _w1);
                        __m128i _w1h = __lsx_vilvh_h(_extw1, _w1);

                        _sum00 = __lsx_vmadd_w(_sum00, _val0l, _w0l);
                        _sum01 = __lsx_vmadd_w(_sum01, _val0h, _w0h);
                        _sum02 = __lsx_vmadd_w(_sum02, _val0l, _w1l);
                        _sum03 = __lsx_vmadd_w(_sum03, _val0h, _w1h);
                        _sum10 = __lsx_vmadd_w(_sum10, _val1l, _w0l);
                        _sum11 = __lsx_vmadd_w(_sum11, _val1h, _w0h);
                        _sum12 = __lsx_vmadd_w(_sum12, _val1l, _w1l);
                        _sum13 = __lsx_vmadd_w(_sum13, _val1h, _w1h);

                        r0 += 8;
                        k0 += 16;
                    }

                    // transpose 4x4
                    {
                        __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                        _tmp0 = __lsx_vilvl_w(_sum01, _sum00);
                        _tmp1 = __lsx_vilvl_w(_sum03, _sum02);
                        _tmp2 = __lsx_vilvh_w(_sum01, _sum00);
                        _tmp3 = __lsx_vilvh_w(_sum03, _sum02);
                        _sum00 = __lsx_vilvl_d(_tmp1, _tmp0);
                        _sum01 = __lsx_vilvh_d(_tmp1, _tmp0);
                        _sum02 = __lsx_vilvl_d(_tmp3, _tmp2);
                        _sum03 = __lsx_vilvh_d(_tmp3, _tmp2);
                    }
                    {
                        __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                        _tmp0 = __lsx_vilvl_w(_sum11, _sum10);
                        _tmp1 = __lsx_vilvl_w(_sum13, _sum12);
                        _tmp2 = __lsx_vilvh_w(_sum11, _sum10);
                        _tmp3 = __lsx_vilvh_w(_sum13, _sum12);
                        _sum10 = __lsx_vilvl_d(_tmp1, _tmp0);
                        _sum11 = __lsx_vilvh_d(_tmp1, _tmp0);
                        _sum12 = __lsx_vilvl_d(_tmp3, _tmp2);
                        _sum13 = __lsx_vilvh_d(_tmp3, _tmp2);
                    }

                    _sum00 = __lsx_vadd_w(_sum00, _sum01);
                    _sum02 = __lsx_vadd_w(_sum02, _sum03);
                    _sum10 = __lsx_vadd_w(_sum10, _sum11);
                    _sum12 = __lsx_vadd_w(_sum12, _sum13);

                    _sum00 = __lsx_vadd_w(_sum00, _sum02);
                    _sum10 = __lsx_vadd_w(_sum10, _sum12);
                }

                for (int j = 0; j < nn1; j++)
                {
                    __m128i _val0 = __lsx_vreplgr2vr_h(r0[0]);
                    __m128i _val1 = __lsx_vreplgr2vr_h(r0[1]);
                    __m128i _val = __lsx_vilvl_d(_val1, _val0);

                    __m128i _w16 = __lsx_vld(k0, 0);

                    _w16 = __lsx_vilvl_d(_w16, _w16);

                    __m128i _extval = __lsx_vslti_h(_val, 0);
                    __m128i _extw16 = __lsx_vslti_h(_w16, 0);

                    __m128i _vall = __lsx_vilvl_h(_extval, _val);
                    __m128i _valh = __lsx_vilvh_h(_extval, _val);
                    __m128i _w0l = __lsx_vilvl_h(_extw16, _w16);
                    __m128i _w0h = __lsx_vilvh_h(_extw16, _w16);

                    _sum00 = __lsx_vmadd_w(_sum00, _vall, _w0l);
                    _sum10 = __lsx_vmadd_w(_sum10, _valh, _w0h);

                    r0 += 2;
                    k0 += 4;
                }

                int sum[8];
                __lsx_vst(_sum00, sum, 0);
                __lsx_vst(_sum10, sum + 4, 0);

                output0_tm[0] = sum[0];
                output1_tm[0] = sum[1];
                output2_tm[0] = sum[2];
                output3_tm[0] = sum[3];
                output0_tm[1] = sum[4];
                output1_tm[1] = sum[5];
                output2_tm[1] = sum[6];
                output3_tm[1] = sum[7];
                output0_tm += 2;
                output1_tm += 2;
                output2_tm += 2;
                output3_tm += 2;
            }
            for (; i < tiles; i++)
            {
                const short* r0 = bb2.row<const short>(i / 2 + i % 2);
                const short* k0 = kernel0_tm.row<const short>(r);

                int nn4 = inch / 4;
                int nn1 = inch % 4;

                __m128i _sum0 = __lsx_vreplgr2vr_w(0);

                if (nn4 > 0)
                {
                    __m128i _sum1 = __lsx_vreplgr2vr_w(0);
                    __m128i _sum2 = __lsx_vreplgr2vr_w(0);
                    __m128i _sum3 = __lsx_vreplgr2vr_w(0);

                    int j = 0;
                    for (; j < nn4; j++)
                    {
                        __m128i _val16 = __lsx_vld(r0, 0);

                        _val16 = __lsx_vilvl_d(_val16, _val16);

                        __m128i _w0 = __lsx_vld(k0, 0);
                        __m128i _w1 = __lsx_vld(k0 + 8, 0);

                        __m128i _extval16 = __lsx_vslti_h(_val16, 0);
                        __m128i _extw0 = __lsx_vslti_h(_w0, 0);
                        __m128i _extw1 = __lsx_vslti_h(_w1, 0);

                        __m128i _val0l = __lsx_vilvl_h(_extval16, _val16);
                        __m128i _val0h = __lsx_vilvh_h(_extval16, _val16);

                        __m128i _w0l = __lsx_vilvl_h(_extw0, _w0);
                        __m128i _w0h = __lsx_vilvh_h(_extw0, _w0);
                        __m128i _w1l = __lsx_vilvl_h(_extw1, _w1);
                        __m128i _w1h = __lsx_vilvh_h(_extw1, _w1);

                        _sum0 = __lsx_vmadd_w(_sum0, _val0l, _w0l);
                        _sum1 = __lsx_vmadd_w(_sum1, _val0h, _w0h);
                        _sum2 = __lsx_vmadd_w(_sum2, _val0l, _w1l);
                        _sum3 = __lsx_vmadd_w(_sum3, _val0h, _w1h);

                        r0 += 4;
                        k0 += 16;
                    }

                    // transpose 4x4
                    {
                        __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                        _tmp0 = __lsx_vilvl_w(_sum1, _sum0);
                        _tmp1 = __lsx_vilvl_w(_sum3, _sum2);
                        _tmp2 = __lsx_vilvh_w(_sum1, _sum0);
                        _tmp3 = __lsx_vilvh_w(_sum3, _sum2);
                        _sum0 = __lsx_vilvl_d(_tmp1, _tmp0);
                        _sum1 = __lsx_vilvh_d(_tmp1, _tmp0);
                        _sum2 = __lsx_vilvl_d(_tmp3, _tmp2);
                        _sum3 = __lsx_vilvh_d(_tmp3, _tmp2);
                    }

                    _sum0 = __lsx_vadd_w(_sum0, _sum1);
                    _sum2 = __lsx_vadd_w(_sum2, _sum3);
                    _sum0 = __lsx_vadd_w(_sum0, _sum2);
                }

                for (int j = 0; j < nn1; j++)
                {
                    __m128i _val = __lsx_vreplgr2vr_w(r0[0]);
                    __m128i _w16 = __lsx_vld(k0, 0);

                    __m128i _extw16 = __lsx_vslti_h(_w16, 0);
                    __m128i _w0l = __lsx_vilvl_h(_extw16, _w16);

                    _sum0 = __lsx_vmadd_w(_sum0, _val, _w0l);

                    r0 += 1;
                    k0 += 4;
                }

                int sum[4];
                __lsx_vst(_sum0, sum, 0);

                output0_tm[0] = sum[0];
                output1_tm[0] = sum[1];
                output2_tm[0] = sum[2];
                output3_tm[0] = sum[3];
                output0_tm += 1;
                output1_tm += 1;
                output2_tm += 1;
                output3_tm += 1;
            }
        }
    }
#else // __loongarch_sx
    int nn_outch = outch >> 1;
    int remain_outch_start = nn_outch << 1;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 2;

        int* output0_tm = top_blob_tm.channel(p);
        int* output1_tm = top_blob_tm.channel(p + 1);

        const Mat kernel0_tm = kernel_tm.channel(p / 2);

        for (int r = 0; r < batch; r++)
        {
            const Mat bb2 = bottom_blob_tm2.channel(r);

            int i = 0;
            for (; i + 1 < tiles; i += 2)
            {
                const short* r0 = bb2.row<const short>(i / 2);
                const short* k0 = kernel0_tm.row<const short>(r);

                int sum00 = 0;
                int sum01 = 0;
                int sum10 = 0;
                int sum11 = 0;

                int nn1 = inch;

                for (int j = 0; j < nn1; j++)
                {
                    signed short val0 = r0[0];
                    signed short val1 = r0[1];
                    signed short w0 = k0[0];
                    signed short w1 = k0[1];

                    sum00 += val0 * w0;
                    sum01 += val1 * w0;
                    sum10 += val0 * w1;
                    sum11 += val1 * w1;

                    r0 += 2;
                    k0 += 2;
                }

                output0_tm[0] = sum00;
                output0_tm[1] = sum01;
                output1_tm[0] = sum10;
                output1_tm[1] = sum11;
                output0_tm += 2;
                output1_tm += 2;
            }
            for (; i < tiles; i++)
            {
                const short* r0 = bb2.row<const short>(i / 2 + i % 2);
                const short* k0 = kernel0_tm.row<const short>(r);

                int sum0 = 0;
                int sum1 = 0;

                int nn1 = inch;

                for (int j = 0; j < nn1; j++)
                {
                    signed short val0 = r0[0];
                    signed short w0 = k0[0];
                    signed short w1 = k0[1];

                    sum0 += val0 * w0;
                    sum1 += val0 * w1;

                    r0 += 1;
                    k0 += 2;
                }

                output0_tm[0] = sum0;
                output1_tm[0] = sum1;
                output0_tm += 1;
                output1_tm += 1;
            }
        }
    }
#endif // __loongarch_sx

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        int* output0_tm = top_blob_tm.channel(p);

#if __loongarch_sx
        const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4);
#else
        const Mat kernel0_tm = kernel_tm.channel(p / 2 + p % 2);
#endif

        for (int r = 0; r < batch; r++)
        {
            const Mat bb2 = bottom_blob_tm2.channel(r);

            int i = 0;
            for (; i + 1 < tiles; i += 2)
            {
                const short* r0 = bb2.row<const short>(i / 2);
                const short* k0 = kernel0_tm.row<const short>(r);

                int sum0 = 0;
                int sum1 = 0;

#if __loongarch_sx
                int nn4 = inch / 4;
                int nn1 = inch % 4;

                if (nn4 > 0)
                {
                    __m128i _sum0 = __lsx_vreplgr2vr_w(0);
                    __m128i _sum1 = __lsx_vreplgr2vr_w(0);

                    int j = 0;
                    for (; j < nn4; j++)
                    {
                        __m128i _val16 = __lsx_vld(r0, 0);

                        __m128i _w16 = __lsx_vld(k0, 0);

                        _w16 = __lsx_vilvl_d(_w16, _w16);

                        __m128i _extval16 = __lsx_vslti_h(_val16, 0);
                        __m128i _extw16 = __lsx_vslti_h(_w16, 0);

                        __m128i _val0l = __lsx_vilvl_h(_extval16, _val16);
                        __m128i _val0h = __lsx_vilvh_h(_extval16, _val16);

                        __m128i _w0l = __lsx_vilvl_h(_extw16, _w16);
                        __m128i _w0h = __lsx_vilvh_h(_extw16, _w16);

                        _sum0 = __lsx_vmadd_w(_sum0, _val0l, _w0l);
                        _sum1 = __lsx_vmadd_w(_sum1, _val0h, _w0h);

                        r0 += 8;
                        k0 += 4;
                    }

                    sum0 = __lsx_reduce_add_w(_sum0);
                    sum1 = __lsx_reduce_add_w(_sum1);
                }
#else  // __loongarch_sx
                int nn1 = inch;
#endif // __loongarch_sx

                for (int q = 0; q < nn1; q++)
                {
                    signed short val0 = r0[0];
                    signed short val1 = r0[1];
                    signed short w = k0[0];

                    sum0 += val0 * w;
                    sum1 += val1 * w;

                    k0 += 1;
                    r0 += 2;
                }

                output0_tm[0] = sum0;
                output0_tm[1] = sum1;
                output0_tm += 2;
            }
            for (; i < tiles; i++)
            {
                const short* r0 = bb2.row<const short>(i / 2 + i % 2);
                const short* k0 = kernel0_tm.row<const short>(r);

                int sum = 0;

#if __loongarch_sx
                int nn4 = inch / 4;
                int nn1 = inch % 4;

                if (nn4 > 0)
                {
                    __m128i _sum = __lsx_vreplgr2vr_w(0);

                    int j = 0;
                    for (; j < nn4; j++)
                    {
                        __m128i _val16 = __lsx_vld(r0, 0);
                        __m128i _w16 = __lsx_vld(k0, 0);

                        __m128i _extval16 = __lsx_vslti_h(_val16, 0);
                        __m128i _extw16 = __lsx_vslti_h(_w16, 0);

                        __m128i _val0l = __lsx_vilvl_h(_extval16, _val16);
                        __m128i _w0l = __lsx_vilvl_h(_extw16, _w16);

                        _sum = __lsx_vmadd_w(_sum, _val0l, _w0l);

                        r0 += 4;
                        k0 += 4;
                    }

                    sum = __lsx_reduce_add_w(_sum);
                }
#else  // __loongarch_sx
                int nn1 = inch;
#endif // __loongarch_sx

                for (int q = 0; q < nn1; q++)
                {
                    signed short val = r0[0];
                    signed short w = k0[0];

                    sum += val * w;

                    k0 += 1;
                    r0 += 1;
                }

                output0_tm[0] = sum;
                output0_tm++;
            }
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_winograd_dot_pack4.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void convolution_winograd_dot_pack4_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt)
{
    // Two phases:
    //   1. permute: copy bottom_blob_tm into bottom_blob_tm2, grouping tiles
    //      12 / 8 / 4 / 2 / 1 at a time, one group per row, so that the
    //      multiply phase reads its inputs sequentially.
    //   2. dot: for every output channel, batch and tile group, accumulate
    //      kernel vector * broadcast input scalar into top_blob_tm.
    // bottom_blob_tm is reset to an empty Mat once phase 1 is finished.
    //
    // Mat bottom_blob_tm(tiles, 16/36/64, inch, 16u, 4, opt.workspace_allocator);

    const int tile_count = bottom_blob_tm.w;
    const int batch_count = bottom_blob_tm.h;
    const int in_packs = bottom_blob_tm.c;

    // number of (kernel vector, input scalar) products summed per output vector
    const int nn = in_packs * 4; // inch always > 0

    // permute
    // repacked_w is the row width in pack4 elements, repacked_h is the number
    // of tile groups (one row each)
    int repacked_w;
    int repacked_h;
    if (tile_count >= 12)
    {
        repacked_w = 12 * in_packs;
        repacked_h = tile_count / 12 + (tile_count % 12) / 8 + (tile_count % 12 % 8) / 4 + (tile_count % 12 % 4) / 2 + tile_count % 12 % 2;
    }
    else if (tile_count >= 8)
    {
        repacked_w = 8 * in_packs;
        repacked_h = tile_count / 8 + (tile_count % 8) / 4 + (tile_count % 4) / 2 + tile_count % 2;
    }
    else if (tile_count >= 4)
    {
        repacked_w = 4 * in_packs;
        repacked_h = tile_count / 4 + (tile_count % 4) / 2 + tile_count % 2;
    }
    else if (tile_count >= 2)
    {
        repacked_w = 2 * in_packs;
        repacked_h = tile_count / 2 + tile_count % 2;
    }
    else // if (tile_count >= 1)
    {
        repacked_w = 1 * in_packs;
        repacked_h = tile_count;
    }

    Mat bottom_blob_tm2;
    bottom_blob_tm2.create(repacked_w, repacked_h, batch_count, 16u, 4, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int r = 0; r < batch_count; r++)
    {
        Mat tm2 = bottom_blob_tm2.channel(r);

        // destination row: equals the number of tile groups already written
        // for this batch, so it simply advances by one after every group
        int row = 0;

        // tile
        int i = 0;
        for (; i + 11 < tile_count; i += 12)
        {
            const float* src = (const float*)bottom_blob_tm + (r * tile_count + i) * 4;
            float* dst = tm2.row(row);

            for (int q = 0; q < in_packs; q++)
            {
                // transpose 4x12
                __m128i _t0 = __lsx_vld(src, 0);
                __m128i _t1 = __lsx_vld(src + 4, 0);
                __m128i _t2 = __lsx_vld(src + 8, 0);
                __m128i _t3 = __lsx_vld(src + 12, 0);
                __m128i _t4 = __lsx_vld(src + 16, 0);
                __m128i _t5 = __lsx_vld(src + 20, 0);
                __m128i _t6 = __lsx_vld(src + 24, 0);
                __m128i _t7 = __lsx_vld(src + 28, 0);
                __m128i _t8 = __lsx_vld(src + 32, 0);
                __m128i _t9 = __lsx_vld(src + 36, 0);
                __m128i _ta = __lsx_vld(src + 40, 0);
                __m128i _tb = __lsx_vld(src + 44, 0);

                // first pass: interleave 32-bit words of neighbouring tiles
                __m128i _t01lo = __lsx_vilvl_w(_t1, _t0);
                __m128i _t01hi = __lsx_vilvh_w(_t1, _t0);
                __m128i _t23lo = __lsx_vilvl_w(_t3, _t2);
                __m128i _t23hi = __lsx_vilvh_w(_t3, _t2);
                __m128i _t45lo = __lsx_vilvl_w(_t5, _t4);
                __m128i _t45hi = __lsx_vilvh_w(_t5, _t4);
                __m128i _t67lo = __lsx_vilvl_w(_t7, _t6);
                __m128i _t67hi = __lsx_vilvh_w(_t7, _t6);
                __m128i _t89lo = __lsx_vilvl_w(_t9, _t8);
                __m128i _t89hi = __lsx_vilvh_w(_t9, _t8);
                __m128i _tablo = __lsx_vilvl_w(_tb, _ta);
                __m128i _tabhi = __lsx_vilvh_w(_tb, _ta);

                // second pass: interleave 64-bit halves
                // suffix a/b/c = tiles 0-3 / 4-7 / 8-11
                __m128i _lane0_a = __lsx_vilvl_d(_t23lo, _t01lo);
                __m128i _lane1_a = __lsx_vilvh_d(_t23lo, _t01lo);
                __m128i _lane2_a = __lsx_vilvl_d(_t23hi, _t01hi);
                __m128i _lane3_a = __lsx_vilvh_d(_t23hi, _t01hi);
                __m128i _lane0_b = __lsx_vilvl_d(_t67lo, _t45lo);
                __m128i _lane1_b = __lsx_vilvh_d(_t67lo, _t45lo);
                __m128i _lane2_b = __lsx_vilvl_d(_t67hi, _t45hi);
                __m128i _lane3_b = __lsx_vilvh_d(_t67hi, _t45hi);
                __m128i _lane0_c = __lsx_vilvl_d(_tablo, _t89lo);
                __m128i _lane1_c = __lsx_vilvh_d(_tablo, _t89lo);
                __m128i _lane2_c = __lsx_vilvl_d(_tabhi, _t89hi);
                __m128i _lane3_c = __lsx_vilvh_d(_tabhi, _t89hi);

                __lsx_vst(_lane0_a, dst, 0);
                __lsx_vst(_lane0_b, dst + 4, 0);
                __lsx_vst(_lane0_c, dst + 8, 0);
                __lsx_vst(_lane1_a, dst + 12, 0);
                __lsx_vst(_lane1_b, dst + 16, 0);
                __lsx_vst(_lane1_c, dst + 20, 0);
                __lsx_vst(_lane2_a, dst + 24, 0);
                __lsx_vst(_lane2_b, dst + 28, 0);
                __lsx_vst(_lane2_c, dst + 32, 0);
                __lsx_vst(_lane3_a, dst + 36, 0);
                __lsx_vst(_lane3_b, dst + 40, 0);
                __lsx_vst(_lane3_c, dst + 44, 0);

                // next input channel of the source, next 12x4 slot of the row
                src += bottom_blob_tm.cstep * 4;
                dst += 48;
            }

            row++;
        }
        for (; i + 7 < tile_count; i += 8)
        {
            const float* src = (const float*)bottom_blob_tm + (r * tile_count + i) * 4;
            float* dst = tm2.row(row);

            for (int q = 0; q < in_packs; q++)
            {
                // transpose 4x8
                __m128i _t0 = __lsx_vld(src, 0);
                __m128i _t1 = __lsx_vld(src + 4, 0);
                __m128i _t2 = __lsx_vld(src + 8, 0);
                __m128i _t3 = __lsx_vld(src + 12, 0);
                __m128i _t4 = __lsx_vld(src + 16, 0);
                __m128i _t5 = __lsx_vld(src + 20, 0);
                __m128i _t6 = __lsx_vld(src + 24, 0);
                __m128i _t7 = __lsx_vld(src + 28, 0);

                __m128i _t01lo = __lsx_vilvl_w(_t1, _t0);
                __m128i _t01hi = __lsx_vilvh_w(_t1, _t0);
                __m128i _t23lo = __lsx_vilvl_w(_t3, _t2);
                __m128i _t23hi = __lsx_vilvh_w(_t3, _t2);
                __m128i _t45lo = __lsx_vilvl_w(_t5, _t4);
                __m128i _t45hi = __lsx_vilvh_w(_t5, _t4);
                __m128i _t67lo = __lsx_vilvl_w(_t7, _t6);
                __m128i _t67hi = __lsx_vilvh_w(_t7, _t6);

                // suffix a/b = tiles 0-3 / 4-7
                __m128i _lane0_a = __lsx_vilvl_d(_t23lo, _t01lo);
                __m128i _lane1_a = __lsx_vilvh_d(_t23lo, _t01lo);
                __m128i _lane2_a = __lsx_vilvl_d(_t23hi, _t01hi);
                __m128i _lane3_a = __lsx_vilvh_d(_t23hi, _t01hi);
                __m128i _lane0_b = __lsx_vilvl_d(_t67lo, _t45lo);
                __m128i _lane1_b = __lsx_vilvh_d(_t67lo, _t45lo);
                __m128i _lane2_b = __lsx_vilvl_d(_t67hi, _t45hi);
                __m128i _lane3_b = __lsx_vilvh_d(_t67hi, _t45hi);

                __lsx_vst(_lane0_a, dst, 0);
                __lsx_vst(_lane0_b, dst + 4, 0);
                __lsx_vst(_lane1_a, dst + 8, 0);
                __lsx_vst(_lane1_b, dst + 12, 0);
                __lsx_vst(_lane2_a, dst + 16, 0);
                __lsx_vst(_lane2_b, dst + 20, 0);
                __lsx_vst(_lane3_a, dst + 24, 0);
                __lsx_vst(_lane3_b, dst + 28, 0);

                src += bottom_blob_tm.cstep * 4;
                dst += 32;
            }

            row++;
        }
        for (; i + 3 < tile_count; i += 4)
        {
            const float* src = (const float*)bottom_blob_tm + (r * tile_count + i) * 4;
            float* dst = tm2.row(row);

            for (int q = 0; q < in_packs; q++)
            {
                // transpose 4x4
                __m128i _t0 = __lsx_vld(src, 0);
                __m128i _t1 = __lsx_vld(src + 4, 0);
                __m128i _t2 = __lsx_vld(src + 8, 0);
                __m128i _t3 = __lsx_vld(src + 12, 0);

                __m128i _t01lo = __lsx_vilvl_w(_t1, _t0);
                __m128i _t01hi = __lsx_vilvh_w(_t1, _t0);
                __m128i _t23lo = __lsx_vilvl_w(_t3, _t2);
                __m128i _t23hi = __lsx_vilvh_w(_t3, _t2);

                __m128i _lane0 = __lsx_vilvl_d(_t23lo, _t01lo);
                __m128i _lane1 = __lsx_vilvh_d(_t23lo, _t01lo);
                __m128i _lane2 = __lsx_vilvl_d(_t23hi, _t01hi);
                __m128i _lane3 = __lsx_vilvh_d(_t23hi, _t01hi);

                __lsx_vst(_lane0, dst, 0);
                __lsx_vst(_lane1, dst + 4, 0);
                __lsx_vst(_lane2, dst + 8, 0);
                __lsx_vst(_lane3, dst + 12, 0);

                src += bottom_blob_tm.cstep * 4;
                dst += 16;
            }

            row++;
        }
        for (; i + 1 < tile_count; i += 2)
        {
            const float* src = (const float*)bottom_blob_tm + (r * tile_count + i) * 4;
            float* dst = tm2.row(row);

            for (int q = 0; q < in_packs; q++)
            {
                // transpose 4x2
                __m128i _t0 = __lsx_vld(src, 0);
                __m128i _t1 = __lsx_vld(src + 4, 0);

                __m128i _lanes01 = __lsx_vilvl_w(_t1, _t0);
                __m128i _lanes23 = __lsx_vilvh_w(_t1, _t0);

                __lsx_vst(_lanes01, dst, 0);
                __lsx_vst(_lanes23, dst + 4, 0);

                src += bottom_blob_tm.cstep * 4;
                dst += 8;
            }

            row++;
        }
        for (; i < tile_count; i++)
        {
            const float* src = (const float*)bottom_blob_tm + (r * tile_count + i) * 4;
            float* dst = tm2.row(row);

            for (int q = 0; q < in_packs; q++)
            {
                // single tile: plain copy of the pack4 vector
                __m128i _t0 = __lsx_vld(src, 0);
                __lsx_vst(_t0, dst, 0);

                src += bottom_blob_tm.cstep * 4;
                dst += 4;
            }

            row++;
        }
    }

    // the source blob is no longer needed once it has been repacked
    bottom_blob_tm = Mat();
    // permute end

    top_blob_tm.create(tile_count, batch_count, outch, 16u, 4, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        // written sequentially across all batches and tiles of this channel
        float* outptr = top_blob_tm.channel(p);

        const Mat kernel0_tm = kernel_tm.channel(p);

        // starting value of every accumulator
        const __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);

        for (int r = 0; r < batch_count; r++)
        {
            const Mat bb2 = bottom_blob_tm2.channel(r);

            // row of bb2 holding the current tile group (same counting as in
            // the permute phase)
            int row = 0;

            int i = 0;
            for (; i + 11 < tile_count; i += 12)
            {
                const float* src = bb2.row(row);
                const float* kptr = kernel0_tm.row(r);

                __m128 _acc0 = _zero;
                __m128 _acc1 = _zero;
                __m128 _acc2 = _zero;
                __m128 _acc3 = _zero;
                __m128 _acc4 = _zero;
                __m128 _acc5 = _zero;
                __m128 _acc6 = _zero;
                __m128 _acc7 = _zero;
                __m128 _acc8 = _zero;
                __m128 _acc9 = _zero;
                __m128 _acca = _zero;
                __m128 _accb = _zero;

                for (int k = 0; k < nn; k++)
                {
                    __builtin_prefetch(src + 48);
                    __builtin_prefetch(kptr + 16);
                    // twelve input scalars (one per tile) and one kernel vector
                    __m128i _in0123 = __lsx_vld(src, 0);
                    __m128i _in4567 = __lsx_vld(src + 4, 0);
                    __m128i _in89ab = __lsx_vld(src + 8, 0);
                    __m128 _k = (__m128)__lsx_vld(kptr, 0);
                    _acc0 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in0123, 0), _acc0);
                    _acc1 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in0123, 1), _acc1);
                    _acc2 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in0123, 2), _acc2);
                    _acc3 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in0123, 3), _acc3);
                    _acc4 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in4567, 0), _acc4);
                    _acc5 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in4567, 1), _acc5);
                    _acc6 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in4567, 2), _acc6);
                    _acc7 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in4567, 3), _acc7);
                    _acc8 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in89ab, 0), _acc8);
                    _acc9 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in89ab, 1), _acc9);
                    _acca = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in89ab, 2), _acca);
                    _accb = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in89ab, 3), _accb);

                    src += 12;
                    kptr += 4;
                }

                __lsx_vst(_acc0, outptr, 0);
                __lsx_vst(_acc1, outptr + 4, 0);
                __lsx_vst(_acc2, outptr + 8, 0);
                __lsx_vst(_acc3, outptr + 12, 0);
                __lsx_vst(_acc4, outptr + 16, 0);
                __lsx_vst(_acc5, outptr + 20, 0);
                __lsx_vst(_acc6, outptr + 24, 0);
                __lsx_vst(_acc7, outptr + 28, 0);
                __lsx_vst(_acc8, outptr + 32, 0);
                __lsx_vst(_acc9, outptr + 36, 0);
                __lsx_vst(_acca, outptr + 40, 0);
                __lsx_vst(_accb, outptr + 44, 0);

                outptr += 48;
                row++;
            }
            for (; i + 7 < tile_count; i += 8)
            {
                const float* src = bb2.row(row);
                const float* kptr = kernel0_tm.row(r);

                __m128 _acc0 = _zero;
                __m128 _acc1 = _zero;
                __m128 _acc2 = _zero;
                __m128 _acc3 = _zero;
                __m128 _acc4 = _zero;
                __m128 _acc5 = _zero;
                __m128 _acc6 = _zero;
                __m128 _acc7 = _zero;

                for (int k = 0; k < nn; k++)
                {
                    __builtin_prefetch(src + 32);
                    __builtin_prefetch(kptr + 16);
                    __m128i _in0123 = __lsx_vld(src, 0);
                    __m128i _in4567 = __lsx_vld(src + 4, 0);
                    __m128 _k = (__m128)__lsx_vld(kptr, 0);
                    _acc0 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in0123, 0), _acc0);
                    _acc1 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in0123, 1), _acc1);
                    _acc2 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in0123, 2), _acc2);
                    _acc3 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in0123, 3), _acc3);
                    _acc4 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in4567, 0), _acc4);
                    _acc5 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in4567, 1), _acc5);
                    _acc6 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in4567, 2), _acc6);
                    _acc7 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in4567, 3), _acc7);

                    src += 8;
                    kptr += 4;
                }

                __lsx_vst(_acc0, outptr, 0);
                __lsx_vst(_acc1, outptr + 4, 0);
                __lsx_vst(_acc2, outptr + 8, 0);
                __lsx_vst(_acc3, outptr + 12, 0);
                __lsx_vst(_acc4, outptr + 16, 0);
                __lsx_vst(_acc5, outptr + 20, 0);
                __lsx_vst(_acc6, outptr + 24, 0);
                __lsx_vst(_acc7, outptr + 28, 0);

                outptr += 32;
                row++;
            }
            for (; i + 3 < tile_count; i += 4)
            {
                const float* src = bb2.row(row);
                const float* kptr = kernel0_tm.row(r);

                __m128 _acc0 = _zero;
                __m128 _acc1 = _zero;
                __m128 _acc2 = _zero;
                __m128 _acc3 = _zero;

                for (int k = 0; k < nn; k++)
                {
                    __builtin_prefetch(src + 16);
                    __builtin_prefetch(kptr + 16);
                    __m128i _in0123 = __lsx_vld(src, 0);
                    __m128 _k = (__m128)__lsx_vld(kptr, 0);
                    _acc0 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in0123, 0), _acc0);
                    _acc1 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in0123, 1), _acc1);
                    _acc2 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in0123, 2), _acc2);
                    _acc3 = __lsx_vfmadd_s(_k, (__m128)__lsx_vreplvei_w(_in0123, 3), _acc3);

                    src += 4;
                    kptr += 4;
                }

                __lsx_vst(_acc0, outptr, 0);
                __lsx_vst(_acc1, outptr + 4, 0);
                __lsx_vst(_acc2, outptr + 8, 0);
                __lsx_vst(_acc3, outptr + 12, 0);

                outptr += 16;
                row++;
            }
            for (; i + 1 < tile_count; i += 2)
            {
                const float* src = bb2.row(row);
                const float* kptr = kernel0_tm.row(r);

                __m128 _acc0 = _zero;
                __m128 _acc1 = _zero;

                for (int k = 0; k < nn; k++)
                {
                    __builtin_prefetch(src + 8);
                    __builtin_prefetch(kptr + 16);
                    // one scalar per tile, broadcast to all four lanes
                    __m128 _in0 = __lsx_vreplfr2vr_s(src[0]);
                    __m128 _in1 = __lsx_vreplfr2vr_s(src[1]);
                    __m128 _k = (__m128)__lsx_vld(kptr, 0);
                    _acc0 = __lsx_vfmadd_s(_k, _in0, _acc0);
                    _acc1 = __lsx_vfmadd_s(_k, _in1, _acc1);

                    src += 2;
                    kptr += 4;
                }

                __lsx_vst(_acc0, outptr, 0);
                __lsx_vst(_acc1, outptr + 4, 0);

                outptr += 8;
                row++;
            }
            for (; i < tile_count; i++)
            {
                const float* src = bb2.row(row);
                const float* kptr = kernel0_tm.row(r);

                __m128 _acc = _zero;

                for (int k = 0; k < nn; k++)
                {
                    __builtin_prefetch(src + 4);
                    __builtin_prefetch(kptr + 16);
                    __m128 _in0 = __lsx_vreplfr2vr_s(src[0]);
                    __m128 _k = (__m128)__lsx_vld(kptr, 0);
                    _acc = __lsx_vfmadd_s(_k, _in0, _acc);

                    src += 1;
                    kptr += 4;
                }

                __lsx_vst(_acc, outptr, 0);

                outptr += 4;
                row++;
            }
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_winograd_dot_pack8to1_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// Batched "dot" stage of the int8 winograd convolution, pack8 input -> pack1 output (LSX).
//
// For every transform position r (bottom_blob_tm.h of them) and every tile, the
// products of the transformed input (read as int16, 8 values per element) and
// the transformed kernel (read as int16) are accumulated over all input
// channels into one int32 value per (tile, r, output channel).
//
// bottom_blob_tm  transformed input; it is repacked into a temporary and then
//                 reset to an empty Mat inside this function
// outch           number of output channels
// kernel_tm       transformed kernel; channel p / 4 is read for each full group
//                 of four output channels, channel p / 4 + p % 4 for each
//                 leftover output channel, and row r for transform position r
// top_blob_tm     created here as (tiles, batch, outch) with 4-byte elements
// opt             supplies num_threads and workspace_allocator
static void convolution_winograd_dot_pack8to1_int8_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt)
{
    // Mat bottom_blob_tm(tiles, 16/36/64, inch, 16u, 8, opt.workspace_allocator);

    const int tiles = bottom_blob_tm.w;
    const int batch = bottom_blob_tm.h;
    const int inch = bottom_blob_tm.c;

    // permute
    // Repack the input so that, for one transform position r, the values of all
    // input channels belonging to a pair of tiles (or to a single leftover
    // tile) sit contiguously in one row of bottom_blob_tm2.
    Mat bottom_blob_tm2;
    if (tiles >= 2)
        bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, batch, 16u, 8, opt.workspace_allocator);
    else // if (tiles >= 1)
        bottom_blob_tm2.create(1 * inch, tiles, batch, 16u, 8, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int r = 0; r < batch; r++)
    {
        Mat tm2 = bottom_blob_tm2.channel(r);

        // tile
        int i = 0;
        // two tiles at a time: 16 shorts are copied per input channel
        for (; i + 1 < tiles; i += 2)
        {
            short* tmpptr = tm2.row<short>(i / 2);

            const short* r0 = bottom_blob_tm;

            // element (tile i, row r) of input channel 0; each element is 8 shorts
            r0 += (r * tiles + i) * 8;

            for (int q = 0; q < inch; q++)
            {
                __m128i _r0 = __lsx_vld(r0, 0);
                __m128i _r1 = __lsx_vld(r0 + 8, 0);
                __lsx_vst(_r0, tmpptr, 0);
                __lsx_vst(_r1, tmpptr + 8, 0);
                // advance to the same element of the next input channel
                r0 += bottom_blob_tm.cstep * 8;
                tmpptr += 16;
            }
        }
        // leftover single tile: stored in the row following the tile-pair rows
        for (; i < tiles; i++)
        {
            short* tmpptr = tm2.row<short>(i / 2 + i % 2);

            const short* r0 = bottom_blob_tm;

            r0 += (r * tiles + i) * 8;

            for (int q = 0; q < inch; q++)
            {
                __m128i _r0 = __lsx_vld(r0, 0);
                __lsx_vst(_r0, tmpptr, 0);
                r0 += bottom_blob_tm.cstep * 8;
                tmpptr += 8;
            }
        }
    }

    // the original layout is no longer needed once repacked
    bottom_blob_tm = Mat();
    // permute end

    top_blob_tm.create(tiles, batch, outch, 4u, 1, opt.workspace_allocator);

    int nn_outch = 0;
    int remain_outch_start = 0;

    // output channels are processed in groups of four first
    nn_outch = outch >> 2;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 4;

        int* output0_tm = top_blob_tm.channel(p);
        int* output1_tm = top_blob_tm.channel(p + 1);
        int* output2_tm = top_blob_tm.channel(p + 2);
        int* output3_tm = top_blob_tm.channel(p + 3);

        const Mat kernel0_tm = kernel_tm.channel(p / 4);

        for (int r = 0; r < batch; r++)
        {
            const Mat bb2 = bottom_blob_tm2.channel(r);

            int i = 0;
            for (; i + 1 < tiles; i += 2)
            {
                const short* r0 = bb2.row<const short>(i / 2);
                const short* k0 = kernel0_tm.row<const short>(r);

                int nn = inch; // inch always > 0

                // _sum0/_sum1 accumulate tile i, _sum2/_sum3 tile i + 1;
                // lane j of every accumulator belongs to output channel p + j
                __m128i _sum0 = __lsx_vreplgr2vr_w(0);
                __m128i _sum1 = __lsx_vreplgr2vr_w(0);
                __m128i _sum2 = __lsx_vreplgr2vr_w(0);
                __m128i _sum3 = __lsx_vreplgr2vr_w(0);

                for (int j = 0; j < nn; j++)
                {
                    __builtin_prefetch(r0 + 64);
                    __builtin_prefetch(k0 + 128);
                    // 32 kernel shorts per input channel: 4 weights (one per
                    // output channel of the group) for each of the 8 input lanes
                    __m128i _w0 = __lsx_vld(k0, 0);
                    __m128i _w1 = __lsx_vld(k0 + 8, 0);
                    __m128i _w2 = __lsx_vld(k0 + 16, 0);
                    __m128i _w3 = __lsx_vld(k0 + 24, 0);

                    // mask of the negative 16-bit lanes, used as the upper half
                    // when widening to 32 bit
                    __m128i _extw0 = __lsx_vslti_h(_w0, 0);
                    __m128i _extw1 = __lsx_vslti_h(_w1, 0);
                    __m128i _extw2 = __lsx_vslti_h(_w2, 0);
                    __m128i _extw3 = __lsx_vslti_h(_w3, 0);

                    // interleave value and sign mask: int16 -> int32 sign extension
                    __m128i _w0l = __lsx_vilvl_h(_extw0, _w0);
                    __m128i _w0h = __lsx_vilvh_h(_extw0, _w0);
                    __m128i _w1l = __lsx_vilvl_h(_extw1, _w1);
                    __m128i _w1h = __lsx_vilvh_h(_extw1, _w1);
                    __m128i _w2l = __lsx_vilvl_h(_extw2, _w2);
                    __m128i _w2h = __lsx_vilvh_h(_extw2, _w2);
                    __m128i _w3l = __lsx_vilvl_h(_extw3, _w3);
                    __m128i _w3h = __lsx_vilvh_h(_extw3, _w3);

                    // broadcast each of the 8 input lanes of tile i (r0[0..7])
                    // and of tile i + 1 (r0[8..15]) to all four 32-bit lanes
                    __m128i _val0_0 = __lsx_vreplgr2vr_w(r0[0]);
                    __m128i _val0_1 = __lsx_vreplgr2vr_w(r0[1]);
                    __m128i _val0_2 = __lsx_vreplgr2vr_w(r0[2]);
                    __m128i _val0_3 = __lsx_vreplgr2vr_w(r0[3]);
                    __m128i _val0_4 = __lsx_vreplgr2vr_w(r0[4]);
                    __m128i _val0_5 = __lsx_vreplgr2vr_w(r0[5]);
                    __m128i _val0_6 = __lsx_vreplgr2vr_w(r0[6]);
                    __m128i _val0_7 = __lsx_vreplgr2vr_w(r0[7]);
                    __m128i _val1_0 = __lsx_vreplgr2vr_w(r0[8]);
                    __m128i _val1_1 = __lsx_vreplgr2vr_w(r0[9]);
                    __m128i _val1_2 = __lsx_vreplgr2vr_w(r0[10]);
                    __m128i _val1_3 = __lsx_vreplgr2vr_w(r0[11]);
                    __m128i _val1_4 = __lsx_vreplgr2vr_w(r0[12]);
                    __m128i _val1_5 = __lsx_vreplgr2vr_w(r0[13]);
                    __m128i _val1_6 = __lsx_vreplgr2vr_w(r0[14]);
                    __m128i _val1_7 = __lsx_vreplgr2vr_w(r0[15]);

                    // even input lanes go to _sum0/_sum2, odd ones to
                    // _sum1/_sum3; the two halves are added after the loop
                    _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0_0);
                    _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val0_1);
                    _sum2 = __lsx_vmadd_w(_sum2, _w0l, _val1_0);
                    _sum3 = __lsx_vmadd_w(_sum3, _w0h, _val1_1);
                    _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val0_2);
                    _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val0_3);
                    _sum2 = __lsx_vmadd_w(_sum2, _w1l, _val1_2);
                    _sum3 = __lsx_vmadd_w(_sum3, _w1h, _val1_3);
                    _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val0_4);
                    _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val0_5);
                    _sum2 = __lsx_vmadd_w(_sum2, _w2l, _val1_4);
                    _sum3 = __lsx_vmadd_w(_sum3, _w2h, _val1_5);
                    _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val0_6);
                    _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val0_7);
                    _sum2 = __lsx_vmadd_w(_sum2, _w3l, _val1_6);
                    _sum3 = __lsx_vmadd_w(_sum3, _w3h, _val1_7);

                    r0 += 16;
                    k0 += 32;
                }

                _sum0 = __lsx_vadd_w(_sum0, _sum1);
                _sum2 = __lsx_vadd_w(_sum2, _sum3);

                // spill the vectors so each lane can be scattered to its own
                // output channel: sum[0..3] is tile i, sum[4..7] is tile i + 1
                int sum[8];
                __lsx_vst(_sum0, sum, 0);
                __lsx_vst(_sum2, sum + 4, 0);

                output0_tm[0] = sum[0];
                output1_tm[0] = sum[1];
                output2_tm[0] = sum[2];
                output3_tm[0] = sum[3];
                output0_tm[1] = sum[4];
                output1_tm[1] = sum[5];
                output2_tm[1] = sum[6];
                output3_tm[1] = sum[7];
                output0_tm += 2;
                output1_tm += 2;
                output2_tm += 2;
                output3_tm += 2;
            }
            // leftover single tile, same scheme with one tile's worth of input
            for (; i < tiles; i++)
            {
                const short* r0 = bb2.row<const short>(i / 2 + i % 2);
                const short* k0 = kernel0_tm.row<const short>(r);

                int nn = inch; // inch always > 0

                __m128i _sum0 = __lsx_vreplgr2vr_w(0);
                __m128i _sum1 = __lsx_vreplgr2vr_w(0);

                for (int j = 0; j < nn; j++)
                {
                    __builtin_prefetch(r0 + 32);
                    __builtin_prefetch(k0 + 128);
                    __m128i _w0 = __lsx_vld(k0, 0);
                    __m128i _w1 = __lsx_vld(k0 + 8, 0);
                    __m128i _w2 = __lsx_vld(k0 + 16, 0);
                    __m128i _w3 = __lsx_vld(k0 + 24, 0);

                    __m128i _extw0 = __lsx_vslti_h(_w0, 0);
                    __m128i _extw1 = __lsx_vslti_h(_w1, 0);
                    __m128i _extw2 = __lsx_vslti_h(_w2, 0);
                    __m128i _extw3 = __lsx_vslti_h(_w3, 0);

                    __m128i _w0l = __lsx_vilvl_h(_extw0, _w0);
                    __m128i _w0h = __lsx_vilvh_h(_extw0, _w0);
                    __m128i _w1l = __lsx_vilvl_h(_extw1, _w1);
                    __m128i _w1h = __lsx_vilvh_h(_extw1, _w1);
                    __m128i _w2l = __lsx_vilvl_h(_extw2, _w2);
                    __m128i _w2h = __lsx_vilvh_h(_extw2, _w2);
                    __m128i _w3l = __lsx_vilvl_h(_extw3, _w3);
                    __m128i _w3h = __lsx_vilvh_h(_extw3, _w3);

                    __m128i _val0 = __lsx_vreplgr2vr_w(r0[0]);
                    __m128i _val1 = __lsx_vreplgr2vr_w(r0[1]);
                    __m128i _val2 = __lsx_vreplgr2vr_w(r0[2]);
                    __m128i _val3 = __lsx_vreplgr2vr_w(r0[3]);
                    __m128i _val4 = __lsx_vreplgr2vr_w(r0[4]);
                    __m128i _val5 = __lsx_vreplgr2vr_w(r0[5]);
                    __m128i _val6 = __lsx_vreplgr2vr_w(r0[6]);
                    __m128i _val7 = __lsx_vreplgr2vr_w(r0[7]);

                    _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0);
                    _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val1);
                    _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val2);
                    _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val3);
                    _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val4);
                    _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val5);
                    _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val6);
                    _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val7);

                    r0 += 8;
                    k0 += 32;
                }

                _sum0 = __lsx_vadd_w(_sum0, _sum1);

                int sum[4];
                __lsx_vst(_sum0, sum, 0);

                output0_tm[0] = sum[0];
                output1_tm[0] = sum[1];
                output2_tm[0] = sum[2];
                output3_tm[0] = sum[3];
                output0_tm += 1;
                output1_tm += 1;
                output2_tm += 1;
                output3_tm += 1;
            }
        }
    }

    // first output channel not covered by a full group of four
    remain_outch_start += nn_outch << 2;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        int* output0_tm = top_blob_tm.channel(p);

        // leftover output channels are read from the kernel channels that
        // follow the four-channel groups
        const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4);

        for (int r = 0; r < batch; r++)
        {
            const Mat bb2 = bottom_blob_tm2.channel(r);

            int i = 0;
            for (; i + 1 < tiles; i += 2)
            {
                const short* r0 = bb2.row<const short>(i / 2);
                const short* k0 = kernel0_tm.row<const short>(r);

                // _sum0/_sum1: tile i, _sum2/_sum3: tile i + 1; here the lanes
                // hold partial sums of different input lanes, not output channels
                __m128i _sum0 = __lsx_vreplgr2vr_w(0);
                __m128i _sum1 = __lsx_vreplgr2vr_w(0);
                __m128i _sum2 = __lsx_vreplgr2vr_w(0);
                __m128i _sum3 = __lsx_vreplgr2vr_w(0);

                for (int q = 0; q < inch; q++)
                {
                    __builtin_prefetch(r0 + 32);
                    __builtin_prefetch(k0 + 64);
                    // 8 input lanes of tile i and of tile i + 1
                    __m128i _val0 = __lsx_vld(r0, 0);
                    __m128i _val1 = __lsx_vld(r0 + 8, 0);

                    // widen int16 -> int32 by interleaving with the sign mask
                    __m128i _extval0 = __lsx_vslti_h(_val0, 0);
                    __m128i _extval1 = __lsx_vslti_h(_val1, 0);
                    __m128i _val0l = __lsx_vilvl_h(_extval0, _val0);
                    __m128i _val0h = __lsx_vilvh_h(_extval0, _val0);
                    __m128i _val1l = __lsx_vilvl_h(_extval1, _val1);
                    __m128i _val1h = __lsx_vilvh_h(_extval1, _val1);

                    // 8 weights per input channel, one per input lane
                    __m128i _w0 = __lsx_vld(k0, 0);

                    __m128i _extw0 = __lsx_vslti_h(_w0, 0);
                    __m128i _w0l = __lsx_vilvl_h(_extw0, _w0);
                    __m128i _w0h = __lsx_vilvh_h(_extw0, _w0);

                    _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0l);
                    _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val0h);
                    _sum2 = __lsx_vmadd_w(_sum2, _w0l, _val1l);
                    _sum3 = __lsx_vmadd_w(_sum3, _w0h, _val1h);

                    k0 += 8;
                    r0 += 16;
                }

                _sum0 = __lsx_vadd_w(_sum0, _sum1);
                _sum2 = __lsx_vadd_w(_sum2, _sum3);

                // __lsx_reduce_add_w is defined outside this file; presumably it
                // returns the sum of the four int32 lanes - TODO confirm
                output0_tm[0] = __lsx_reduce_add_w(_sum0);
                output0_tm[1] = __lsx_reduce_add_w(_sum2);
                output0_tm += 2;
            }
            // leftover single tile
            for (; i < tiles; i++)
            {
                const short* r0 = bb2.row<const short>(i / 2 + i % 2);
                const short* k0 = kernel0_tm.row<const short>(r);

                __m128i _sum0 = __lsx_vreplgr2vr_w(0);
                __m128i _sum1 = __lsx_vreplgr2vr_w(0);

                for (int q = 0; q < inch; q++)
                {
                    __builtin_prefetch(r0 + 32);
                    __builtin_prefetch(k0 + 32);
                    __m128i _val = __lsx_vld(r0, 0);

                    __m128i _extval = __lsx_vslti_h(_val, 0);
                    __m128i _vall = __lsx_vilvl_h(_extval, _val);
                    __m128i _valh = __lsx_vilvh_h(_extval, _val);

                    __m128i _w0 = __lsx_vld(k0, 0);

                    __m128i _extw0 = __lsx_vslti_h(_w0, 0);
                    __m128i _w0l = __lsx_vilvl_h(_extw0, _w0);
                    __m128i _w0h = __lsx_vilvh_h(_extw0, _w0);

                    _sum0 = __lsx_vmadd_w(_sum0, _w0l, _vall);
                    _sum1 = __lsx_vmadd_w(_sum1, _w0h, _valh);

                    k0 += 8;
                    r0 += 8;
                }

                _sum0 = __lsx_vadd_w(_sum0, _sum1);

                output0_tm[0] = __lsx_reduce_add_w(_sum0);
                output0_tm++;
            }
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_winograd_dot_pack8to4_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// Batched "dot" stage of the int8 winograd convolution, pack8 input -> pack4 output (LSX).
//
// For every transform position r (bottom_blob_tm.h of them) and every tile, the
// products of the transformed input (read as int16, 8 values per element) and
// the transformed kernel (read as int16) are accumulated over all input
// channels into four int32 values, stored as one 16-byte element of
// top_blob_tm.
//
// bottom_blob_tm  transformed input; it is repacked into a temporary and then
//                 reset to an empty Mat inside this function
// outch           number of channels of top_blob_tm (each element holds 4 values)
// kernel_tm       transformed kernel; channel p is read for output channel p
//                 and row r for transform position r
// top_blob_tm     created here as (tiles, batch, outch), 16-byte elements, elempack 4
// opt             supplies num_threads and workspace_allocator
static void convolution_winograd_dot_pack8to4_int8_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt)
{
    // Mat bottom_blob_tm(tiles, 16/36/64, inch, 16u, 8, opt.workspace_allocator);

    const int tiles = bottom_blob_tm.w;
    const int batch = bottom_blob_tm.h;
    const int inch = bottom_blob_tm.c;

    // permute
    // Repack the input so that, for one transform position r, the values of all
    // input channels belonging to a pair of tiles (or to a single leftover
    // tile) sit contiguously in one row of bottom_blob_tm2.
    Mat bottom_blob_tm2;
    if (tiles >= 2)
        bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, batch, 16u, 8, opt.workspace_allocator);
    else // if (tiles >= 1)
        bottom_blob_tm2.create(1 * inch, tiles, batch, 16u, 8, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int r = 0; r < batch; r++)
    {
        Mat tm2 = bottom_blob_tm2.channel(r);

        // tile
        int i = 0;
        // two tiles at a time: 16 shorts are copied per input channel
        for (; i + 1 < tiles; i += 2)
        {
            short* tmpptr = tm2.row<short>(i / 2);

            const short* r0 = bottom_blob_tm;

            // element (tile i, row r) of input channel 0; each element is 8 shorts
            r0 += (r * tiles + i) * 8;

            for (int q = 0; q < inch; q++)
            {
                __m128i _r0 = __lsx_vld(r0, 0);
                __m128i _r1 = __lsx_vld(r0 + 8, 0);
                __lsx_vst(_r0, tmpptr, 0);
                __lsx_vst(_r1, tmpptr + 8, 0);
                // advance to the same element of the next input channel
                r0 += bottom_blob_tm.cstep * 8;
                tmpptr += 16;
            }
        }
        // leftover single tile: stored in the row following the tile-pair rows
        for (; i < tiles; i++)
        {
            short* tmpptr = tm2.row<short>(i / 2 + i % 2);

            const short* r0 = bottom_blob_tm;

            r0 += (r * tiles + i) * 8;

            for (int q = 0; q < inch; q++)
            {
                __m128i _r0 = __lsx_vld(r0, 0);
                __lsx_vst(_r0, tmpptr, 0);
                r0 += bottom_blob_tm.cstep * 8;
                tmpptr += 8;
            }
        }
    }

    // the original layout is no longer needed once repacked
    bottom_blob_tm = Mat();
    // permute end

    top_blob_tm.create(tiles, batch, outch, 16u, 4, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* output0_tm = top_blob_tm.channel(p);

        const Mat kernel0_tm = kernel_tm.channel(p);

        for (int r = 0; r < batch; r++)
        {
            const Mat bb2 = bottom_blob_tm2.channel(r);

            int i = 0;
            for (; i + 1 < tiles; i += 2)
            {
                const short* r0 = bb2.row<const short>(i / 2);
                const short* k0 = kernel0_tm.row<const short>(r);

                int nn = inch; // inch always > 0

                // _sum0/_sum1 accumulate tile i, _sum2/_sum3 tile i + 1;
                // the four lanes are the four packed output values
                __m128i _sum0 = __lsx_vreplgr2vr_w(0);
                __m128i _sum1 = __lsx_vreplgr2vr_w(0);
                __m128i _sum2 = __lsx_vreplgr2vr_w(0);
                __m128i _sum3 = __lsx_vreplgr2vr_w(0);

                for (int j = 0; j < nn; j++)
                {
                    __builtin_prefetch(r0 + 64);
                    __builtin_prefetch(k0 + 128);
                    // 32 kernel shorts per input channel: 4 weights (one per
                    // packed output value) for each of the 8 input lanes
                    __m128i _w0 = __lsx_vld(k0, 0);
                    __m128i _w1 = __lsx_vld(k0 + 8, 0);
                    __m128i _w2 = __lsx_vld(k0 + 16, 0);
                    __m128i _w3 = __lsx_vld(k0 + 24, 0);

                    // mask of the negative 16-bit lanes, used as the upper half
                    // when widening to 32 bit
                    __m128i _extw0 = __lsx_vslti_h(_w0, 0);
                    __m128i _extw1 = __lsx_vslti_h(_w1, 0);
                    __m128i _extw2 = __lsx_vslti_h(_w2, 0);
                    __m128i _extw3 = __lsx_vslti_h(_w3, 0);

                    // interleave value and sign mask: int16 -> int32 sign extension
                    __m128i _w0l = __lsx_vilvl_h(_extw0, _w0);
                    __m128i _w0h = __lsx_vilvh_h(_extw0, _w0);
                    __m128i _w1l = __lsx_vilvl_h(_extw1, _w1);
                    __m128i _w1h = __lsx_vilvh_h(_extw1, _w1);
                    __m128i _w2l = __lsx_vilvl_h(_extw2, _w2);
                    __m128i _w2h = __lsx_vilvh_h(_extw2, _w2);
                    __m128i _w3l = __lsx_vilvl_h(_extw3, _w3);
                    __m128i _w3h = __lsx_vilvh_h(_extw3, _w3);

                    // broadcast each of the 8 input lanes of tile i (r0[0..7])
                    // and of tile i + 1 (r0[8..15]) to all four 32-bit lanes
                    __m128i _val0_0 = __lsx_vreplgr2vr_w(r0[0]);
                    __m128i _val0_1 = __lsx_vreplgr2vr_w(r0[1]);
                    __m128i _val0_2 = __lsx_vreplgr2vr_w(r0[2]);
                    __m128i _val0_3 = __lsx_vreplgr2vr_w(r0[3]);
                    __m128i _val0_4 = __lsx_vreplgr2vr_w(r0[4]);
                    __m128i _val0_5 = __lsx_vreplgr2vr_w(r0[5]);
                    __m128i _val0_6 = __lsx_vreplgr2vr_w(r0[6]);
                    __m128i _val0_7 = __lsx_vreplgr2vr_w(r0[7]);
                    __m128i _val1_0 = __lsx_vreplgr2vr_w(r0[8]);
                    __m128i _val1_1 = __lsx_vreplgr2vr_w(r0[9]);
                    __m128i _val1_2 = __lsx_vreplgr2vr_w(r0[10]);
                    __m128i _val1_3 = __lsx_vreplgr2vr_w(r0[11]);
                    __m128i _val1_4 = __lsx_vreplgr2vr_w(r0[12]);
                    __m128i _val1_5 = __lsx_vreplgr2vr_w(r0[13]);
                    __m128i _val1_6 = __lsx_vreplgr2vr_w(r0[14]);
                    __m128i _val1_7 = __lsx_vreplgr2vr_w(r0[15]);

                    // even input lanes go to _sum0/_sum2, odd ones to
                    // _sum1/_sum3; the two halves are added after the loop
                    _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0_0);
                    _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val0_1);
                    _sum2 = __lsx_vmadd_w(_sum2, _w0l, _val1_0);
                    _sum3 = __lsx_vmadd_w(_sum3, _w0h, _val1_1);
                    _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val0_2);
                    _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val0_3);
                    _sum2 = __lsx_vmadd_w(_sum2, _w1l, _val1_2);
                    _sum3 = __lsx_vmadd_w(_sum3, _w1h, _val1_3);
                    _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val0_4);
                    _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val0_5);
                    _sum2 = __lsx_vmadd_w(_sum2, _w2l, _val1_4);
                    _sum3 = __lsx_vmadd_w(_sum3, _w2h, _val1_5);
                    _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val0_6);
                    _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val0_7);
                    _sum2 = __lsx_vmadd_w(_sum2, _w3l, _val1_6);
                    _sum3 = __lsx_vmadd_w(_sum3, _w3h, _val1_7);

                    r0 += 16;
                    k0 += 32;
                }

                _sum0 = __lsx_vadd_w(_sum0, _sum1);
                _sum2 = __lsx_vadd_w(_sum2, _sum3);

                // one 4-lane element per tile, written back to back
                __lsx_vst(_sum0, output0_tm, 0);
                __lsx_vst(_sum2, output0_tm + 4, 0);

                output0_tm += 8;
            }
            // leftover single tile, same scheme with one tile's worth of input
            for (; i < tiles; i++)
            {
                const short* r0 = bb2.row<const short>(i / 2 + i % 2);
                const short* k0 = kernel0_tm.row<const short>(r);

                int nn = inch; // inch always > 0

                __m128i _sum0 = __lsx_vreplgr2vr_w(0);
                __m128i _sum1 = __lsx_vreplgr2vr_w(0);

                for (int j = 0; j < nn; j++)
                {
                    __builtin_prefetch(r0 + 32);
                    __builtin_prefetch(k0 + 128);
                    __m128i _w0 = __lsx_vld(k0, 0);
                    __m128i _w1 = __lsx_vld(k0 + 8, 0);
                    __m128i _w2 = __lsx_vld(k0 + 16, 0);
                    __m128i _w3 = __lsx_vld(k0 + 24, 0);

                    __m128i _extw0 = __lsx_vslti_h(_w0, 0);
                    __m128i _extw1 = __lsx_vslti_h(_w1, 0);
                    __m128i _extw2 = __lsx_vslti_h(_w2, 0);
                    __m128i _extw3 = __lsx_vslti_h(_w3, 0);

                    __m128i _w0l = __lsx_vilvl_h(_extw0, _w0);
                    __m128i _w0h = __lsx_vilvh_h(_extw0, _w0);
                    __m128i _w1l = __lsx_vilvl_h(_extw1, _w1);
                    __m128i _w1h = __lsx_vilvh_h(_extw1, _w1);
                    __m128i _w2l = __lsx_vilvl_h(_extw2, _w2);
                    __m128i _w2h = __lsx_vilvh_h(_extw2, _w2);
                    __m128i _w3l = __lsx_vilvl_h(_extw3, _w3);
                    __m128i _w3h = __lsx_vilvh_h(_extw3, _w3);

                    __m128i _val0 = __lsx_vreplgr2vr_w(r0[0]);
                    __m128i _val1 = __lsx_vreplgr2vr_w(r0[1]);
                    __m128i _val2 = __lsx_vreplgr2vr_w(r0[2]);
                    __m128i _val3 = __lsx_vreplgr2vr_w(r0[3]);
                    __m128i _val4 = __lsx_vreplgr2vr_w(r0[4]);
                    __m128i _val5 = __lsx_vreplgr2vr_w(r0[5]);
                    __m128i _val6 = __lsx_vreplgr2vr_w(r0[6]);
                    __m128i _val7 = __lsx_vreplgr2vr_w(r0[7]);

                    _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0);
                    _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val1);
                    _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val2);
                    _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val3);
                    _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val4);
                    _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val5);
                    _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val6);
                    _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val7);

                    r0 += 8;
                    k0 += 32;
                }

                _sum0 = __lsx_vadd_w(_sum0, _sum1);

                __lsx_vst(_sum0, output0_tm, 0);
                output0_tm += 4;
            }
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_winograd_transform.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_winograd43_transform_input_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt)
{
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int inch = bottom_blob.c;

    const int w_tiles = (w - 2) / 4;
    const int h_tiles = (h - 2) / 4;
    const int tiles = w_tiles * h_tiles;

    // const float itm[6][6] = {
    //     {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f},
    //     {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f},
    //     {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f},
    //     {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f},
    //     {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f},
    //     {0.0f, 4.0f,  0.0f,-5.0f, 0.0f, 1.0f}
    // };

    // 0 =  4 * r00 - 5 * r02 + r04
    // 1 = -4 * (r01 + r02) + r04 + r03
    // 2 =  4 * (r01 - r02) + r04 - r03
    // 3 = -2 * (r01 - r03) + r04 - r02
    // 4 =  2 * (r01 - r03) + r04 - r02
    // 5 =  4 * r01 - 5 * r03 + r05

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < inch; q++)
    {
        const Mat img0 = bottom_blob.channel(q);
        Mat img0_tm = bottom_blob_tm.channel(q);

        float tmp[6][6];

        // tile
        for (int i = 0; i < h_tiles; i++)
        {
            for (int j = 0; j < w_tiles; j++)
            {
                const float* r0 = img0.row(i * 4) + (j * 4);

                for (int m = 0; m < 6; m++)
                {
                    float r00 = r0[0];
                    float r01 = r0[1];
                    float r02 = r0[2];
                    float r03 = r0[3];
                    float r04 = r0[4];
                    float r05 = r0[5];

                    float tmp0m = 4 * r00 - 5 * r02 + r04;
                    float tmp1m = -4 * (r01 + r02) + r04 + r03;
                    float tmp2m = 4 * (r01 - r02) + r04 - r03;
                    float tmp3m = -2 * (r01 - r03) + r04 - r02;
                    float tmp4m = 2 * (r01 - r03) + r04 - r02;
                    float tmp5m = 4 * r01 - 5 * r03 + r05;

                    tmp[0][m] = tmp0m;
                    tmp[1][m] = tmp1m;
                    tmp[2][m] = tmp2m;
                    tmp[3][m] = tmp3m;
                    tmp[4][m] = tmp4m;
                    tmp[5][m] = tmp5m;

                    r0 += w;
                }

                float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j);
                float* r0_tm_1 = r0_tm_0 + tiles;
                float* r0_tm_2 = r0_tm_0 + tiles * 2;
                float* r0_tm_3 = r0_tm_0 + tiles * 3;
                float* r0_tm_4 = r0_tm_0 + tiles * 4;
                float* r0_tm_5 = r0_tm_0 + tiles * 5;

                for (int m = 0; m < 6; m++)
                {
                    float tmp00 = tmp[m][0];
                    float tmp01 = tmp[m][1];
                    float tmp02 = tmp[m][2];
                    float tmp03 = tmp[m][3];
                    float tmp04 = tmp[m][4];
                    float tmp05 = tmp[m][5];

                    float r0tm0 = 4 * tmp00 - 5 * tmp02 + tmp04;
                    float r0tm1 = -4 * (tmp01 + tmp02) + tmp04 + tmp03;
                    float r0tm2 = 4 * (tmp01 - tmp02) + tmp04 - tmp03;
                    float r0tm3 = -2 * (tmp01 - tmp03) + tmp04 - tmp02;
                    float r0tm4 = 2 * (tmp01 - tmp03) + tmp04 - tmp02;
                    float r0tm5 = 4 * tmp01 - 5 * tmp03 + tmp05;

                    r0_tm_0[0] = r0tm0;
                    r0_tm_1[0] = r0tm1;
                    r0_tm_2[0] = r0tm2;
                    r0_tm_3[0] = r0tm3;
                    r0_tm_4[0] = r0tm4;
                    r0_tm_5[0] = r0tm5;

                    r0_tm_0 += tiles * 6;
                    r0_tm_1 += tiles * 6;
                    r0_tm_2 += tiles * 6;
                    r0_tm_3 += tiles * 6;
                    r0_tm_4 += tiles * 6;
                    r0_tm_5 += tiles * 6;
                }
            }
        }
    }
}

static void conv3x3s1_winograd43_transform_output_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt)
{
    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const int w_tiles = outw / 4;
    const int h_tiles = outh / 4;
    const int tiles = w_tiles * h_tiles;

    const float* biasptr = bias;

    // const float otm[4][6] = {
    //     {1.0f, 1.0f,  1.0f, 1.0f,  1.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f},
    //     {0.0f, 1.0f,  1.0f, 4.0f,  4.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f}
    // };

    // 0 = r00 + (r01 + r02) + (r03 + r04)
    // 1 =       (r01 - r02) + (r03 - r04) * 2
    // 2 =       (r01 + r02) + (r03 + r04) * 4
    // 3 = r05 + (r01 - r02) + (r03 - r04) * 8

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        const Mat out0_tm = top_blob_tm.channel(p);
        Mat out0 = top_blob.channel(p);

        float bias0 = biasptr ? biasptr[p] : 0.f;

        float tmp[4][6];

        // tile
        for (int i = 0; i < h_tiles; i++)
        {
            for (int j = 0; j < w_tiles; j++)
            {
                const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j);
                const float* output0_tm_1 = output0_tm_0 + tiles;
                const float* output0_tm_2 = output0_tm_0 + tiles * 2;
                const float* output0_tm_3 = output0_tm_0 + tiles * 3;
                const float* output0_tm_4 = output0_tm_0 + tiles * 4;
                const float* output0_tm_5 = output0_tm_0 + tiles * 5;

                float* output0 = out0.row(i * 4) + (j * 4);

                for (int m = 0; m < 6; m++)
                {
                    float out0tm0 = output0_tm_0[0];
                    float out0tm1 = output0_tm_1[0];
                    float out0tm2 = output0_tm_2[0];
                    float out0tm3 = output0_tm_3[0];
                    float out0tm4 = output0_tm_4[0];
                    float out0tm5 = output0_tm_5[0];

                    float tmp02a = out0tm1 + out0tm2;
                    float tmp13a = out0tm1 - out0tm2;

                    float tmp02b = out0tm3 + out0tm4;
                    float tmp13b = out0tm3 - out0tm4;

                    float tmp0m = out0tm0 + tmp02a + tmp02b;
                    float tmp1m = tmp13a + tmp13b * 2;
                    float tmp2m = tmp02a + tmp02b * 4;
                    float tmp3m = out0tm5 + tmp13a + tmp13b * 8;

                    tmp[0][m] = tmp0m;
                    tmp[1][m] = tmp1m;
                    tmp[2][m] = tmp2m;
                    tmp[3][m] = tmp3m;

                    output0_tm_0 += tiles * 6;
                    output0_tm_1 += tiles * 6;
                    output0_tm_2 += tiles * 6;
                    output0_tm_3 += tiles * 6;
                    output0_tm_4 += tiles * 6;
                    output0_tm_5 += tiles * 6;
                }

                for (int m = 0; m < 4; m++)
                {
                    float tmp00 = tmp[m][0];
                    float tmp01 = tmp[m][1];
                    float tmp02 = tmp[m][2];
                    float tmp03 = tmp[m][3];
                    float tmp04 = tmp[m][4];
                    float tmp05 = tmp[m][5];

                    float tmp02a = tmp01 + tmp02;
                    float tmp13a = tmp01 - tmp02;

                    float tmp02b = tmp03 + tmp04;
                    float tmp13b = tmp03 - tmp04;

                    float out00 = bias0 + tmp00 + tmp02a + tmp02b;
                    float out01 = bias0 + tmp13a + tmp13b * 2;
                    float out02 = bias0 + tmp02a + tmp02b * 4;
                    float out03 = bias0 + tmp05 + tmp13a + tmp13b * 8;

                    output0[0] = out00;
                    output0[1] = out01;
                    output0[2] = out02;
                    output0[3] = out03;

                    output0 += outw;
                }
            }
        }
    }
}

static void conv3x3s1_winograd23_transform_input_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt)
{
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int inch = bottom_blob.c;

    const int w_tiles = (w - 2) / 2;
    const int h_tiles = (h - 2) / 2;
    const int tiles = w_tiles * h_tiles;

    // const float itm[4][4] = {
    //     {1.0f,  0.0f, -1.0f,  0.0f},
    //     {0.0f,  1.0f,  1.00f, 0.0f},
    //     {0.0f, -1.0f,  1.00f, 0.0f},
    //     {0.0f, -1.0f,  0.00f, 1.0f}
    // };

    // 0 = r00 - r02
    // 1 = r01 + r02
    // 2 = r02 - r01
    // 3 = r03 - r01

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < inch; q++)
    {
        const Mat img0 = bottom_blob.channel(q);
        Mat img0_tm = bottom_blob_tm.channel(q);

        float tmp[4][4];

        // tile
        for (int i = 0; i < h_tiles; i++)
        {
            for (int j = 0; j < w_tiles; j++)
            {
                const float* r0 = img0.row(i * 2) + (j * 2);

                for (int m = 0; m < 4; m++)
                {
                    float r00 = r0[0];
                    float r01 = r0[1];
                    float r02 = r0[2];
                    float r03 = r0[3];

                    float tmp0m = r00 - r02;
                    float tmp1m = r01 + r02;
                    float tmp2m = r02 - r01;
                    float tmp3m = r03 - r01;

                    tmp[0][m] = tmp0m;
                    tmp[1][m] = tmp1m;
                    tmp[2][m] = tmp2m;
                    tmp[3][m] = tmp3m;

                    r0 += w;
                }

                float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j);
                float* r0_tm_1 = r0_tm_0 + tiles;
                float* r0_tm_2 = r0_tm_0 + tiles * 2;
                float* r0_tm_3 = r0_tm_0 + tiles * 3;

                for (int m = 0; m < 4; m++)
                {
                    float tmp00 = tmp[m][0];
                    float tmp01 = tmp[m][1];
                    float tmp02 = tmp[m][2];
                    float tmp03 = tmp[m][3];

                    float r0tm0 = tmp00 - tmp02;
                    float r0tm1 = tmp01 + tmp02;
                    float r0tm2 = tmp02 - tmp01;
                    float r0tm3 = tmp03 - tmp01;

                    r0_tm_0[0] = r0tm0;
                    r0_tm_1[0] = r0tm1;
                    r0_tm_2[0] = r0tm2;
                    r0_tm_3[0] = r0tm3;

                    r0_tm_0 += tiles * 4;
                    r0_tm_1 += tiles * 4;
                    r0_tm_2 += tiles * 4;
                    r0_tm_3 += tiles * 4;
                }
            }
        }
    }
}

static void conv3x3s1_winograd23_transform_output_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt)
{
    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const int w_tiles = outw / 2;
    const int h_tiles = outh / 2;
    const int tiles = w_tiles * h_tiles;

    const float* biasptr = bias;

    // const float otm[2][4] = {
    //     {1.0f,  1.0f,  1.0f,  0.0f},
    //     {0.0f,  1.0f, -1.0f,  1.0f}
    // };

    // 0 = r00 + r01 + r02
    // 1 = r01 - r02 + r03

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        const Mat out0_tm = top_blob_tm.channel(p);
        Mat out0 = top_blob.channel(p);

        float bias0 = biasptr ? biasptr[p] : 0.f;

        float tmp[2][4];

        // tile
        for (int i = 0; i < h_tiles; i++)
        {
            for (int j = 0; j < w_tiles; j++)
            {
                const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j);
                const float* output0_tm_1 = output0_tm_0 + tiles;
                const float* output0_tm_2 = output0_tm_0 + tiles * 2;
                const float* output0_tm_3 = output0_tm_0 + tiles * 3;

                float* output0 = out0.row(i * 2) + (j * 2);

                for (int m = 0; m < 4; m++)
                {
                    float out0tm0 = output0_tm_0[0];
                    float out0tm1 = output0_tm_1[0];
                    float out0tm2 = output0_tm_2[0];
                    float out0tm3 = output0_tm_3[0];

                    float tmp0m = out0tm0 + out0tm1 + out0tm2;
                    float tmp1m = out0tm1 - out0tm2 + out0tm3;

                    tmp[0][m] = tmp0m;
                    tmp[1][m] = tmp1m;

                    output0_tm_0 += tiles * 4;
                    output0_tm_1 += tiles * 4;
                    output0_tm_2 += tiles * 4;
                    output0_tm_3 += tiles * 4;
                }

                for (int m = 0; m < 2; m++)
                {
                    float tmp00 = tmp[m][0];
                    float tmp01 = tmp[m][1];
                    float tmp02 = tmp[m][2];
                    float tmp03 = tmp[m][3];

                    float out00 = bias0 + tmp00 + tmp01 + tmp02;
                    float out01 = bias0 + tmp01 - tmp02 + tmp03;

                    output0[0] = out00;
                    output0[1] = out01;

                    output0 += outw;
                }
            }
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_winograd_transform_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_winograd43_transform_input_int8_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt)
{
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int inch = bottom_blob.c;

    const int w_tiles = (w - 2) / 4;
    const int h_tiles = (h - 2) / 4;
    const int tiles = w_tiles * h_tiles;

    // const float itm[6][6] = {
    //     {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f},
    //     {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f},
    //     {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f},
    //     {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f},
    //     {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f},
    //     {0.0f, 4.0f,  0.0f,-5.0f, 0.0f, 1.0f}
    // };

    // 0 =  4 * r00 - 5 * r02 + r04
    // 1 = -4 * (r01 + r02) + r04 + r03
    // 2 =  4 * (r01 - r02) + r04 - r03
    // 3 = -2 * (r01 - r03) + r04 - r02
    // 4 =  2 * (r01 - r03) + r04 - r02
    // 5 =  4 * r01 - 5 * r03 + r05

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < inch; q++)
    {
        const Mat img0 = bottom_blob.channel(q);
        Mat img0_tm = bottom_blob_tm.channel(q);

        short tmp[6][6];

        // tile
        for (int i = 0; i < h_tiles; i++)
        {
            for (int j = 0; j < w_tiles; j++)
            {
                const signed char* r0 = img0.row<const signed char>(i * 4) + (j * 4);

                for (int m = 0; m < 6; m++)
                {
                    signed char r00 = r0[0];
                    signed char r01 = r0[1];
                    signed char r02 = r0[2];
                    signed char r03 = r0[3];
                    signed char r04 = r0[4];
                    signed char r05 = r0[5];

                    short tmp0m = 4 * r00 - 5 * r02 + r04;
                    short tmp1m = -4 * (r01 + r02) + r04 + r03;
                    short tmp2m = 4 * (r01 - r02) + r04 - r03;
                    short tmp3m = -2 * (r01 - r03) + r04 - r02;
                    short tmp4m = 2 * (r01 - r03) + r04 - r02;
                    short tmp5m = 4 * r01 - 5 * r03 + r05;

                    tmp[0][m] = tmp0m;
                    tmp[1][m] = tmp1m;
                    tmp[2][m] = tmp2m;
                    tmp[3][m] = tmp3m;
                    tmp[4][m] = tmp4m;
                    tmp[5][m] = tmp5m;

                    r0 += w;
                }

                short* r0_tm_0 = (short*)img0_tm + (i * w_tiles + j);
                short* r0_tm_1 = r0_tm_0 + tiles;
                short* r0_tm_2 = r0_tm_0 + tiles * 2;
                short* r0_tm_3 = r0_tm_0 + tiles * 3;
                short* r0_tm_4 = r0_tm_0 + tiles * 4;
                short* r0_tm_5 = r0_tm_0 + tiles * 5;

                for (int m = 0; m < 6; m++)
                {
                    short tmp00 = tmp[m][0];
                    short tmp01 = tmp[m][1];
                    short tmp02 = tmp[m][2];
                    short tmp03 = tmp[m][3];
                    short tmp04 = tmp[m][4];
                    short tmp05 = tmp[m][5];

                    short r0tm0 = 4 * tmp00 - 5 * tmp02 + tmp04;
                    short r0tm1 = -4 * (tmp01 + tmp02) + tmp04 + tmp03;
                    short r0tm2 = 4 * (tmp01 - tmp02) + tmp04 - tmp03;
                    short r0tm3 = -2 * (tmp01 - tmp03) + tmp04 - tmp02;
                    short r0tm4 = 2 * (tmp01 - tmp03) + tmp04 - tmp02;
                    short r0tm5 = 4 * tmp01 - 5 * tmp03 + tmp05;

                    r0_tm_0[0] = r0tm0;
                    r0_tm_1[0] = r0tm1;
                    r0_tm_2[0] = r0tm2;
                    r0_tm_3[0] = r0tm3;
                    r0_tm_4[0] = r0tm4;
                    r0_tm_5[0] = r0tm5;

                    r0_tm_0 += tiles * 6;
                    r0_tm_1 += tiles * 6;
                    r0_tm_2 += tiles * 6;
                    r0_tm_3 += tiles * 6;
                    r0_tm_4 += tiles * 6;
                    r0_tm_5 += tiles * 6;
                }
            }
        }
    }
}

static void conv3x3s1_winograd43_transform_output_int8_lsx(const Mat& top_blob_tm, Mat& top_blob, const Option& opt)
{
    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const int w_tiles = outw / 4;
    const int h_tiles = outh / 4;
    const int tiles = w_tiles * h_tiles;

    // const float otm[4][6] = {
    //     {1.0f, 1.0f,  1.0f, 1.0f,  1.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f},
    //     {0.0f, 1.0f,  1.0f, 4.0f,  4.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f}
    // };

    // 0 = r00 + (r01 + r02) + (r03 + r04)
    // 1 =       (r01 - r02) + (r03 - r04) * 2
    // 2 =       (r01 + r02) + (r03 + r04) * 4
    // 3 = r05 + (r01 - r02) + (r03 - r04) * 8

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        const Mat out0_tm = top_blob_tm.channel(p);
        Mat out0 = top_blob.channel(p);

        int tmp[4][6];

        // tile
        for (int i = 0; i < h_tiles; i++)
        {
            for (int j = 0; j < w_tiles; j++)
            {
                const int* output0_tm_0 = (const int*)out0_tm + (i * w_tiles + j) * 1;
                const int* output0_tm_1 = output0_tm_0 + tiles * 1;
                const int* output0_tm_2 = output0_tm_0 + tiles * 2;
                const int* output0_tm_3 = output0_tm_0 + tiles * 3;
                const int* output0_tm_4 = output0_tm_0 + tiles * 4;
                const int* output0_tm_5 = output0_tm_0 + tiles * 5;

                int* output0 = out0.row<int>(i * 4) + j * 4;

                for (int m = 0; m < 5; m++)
                {
                    int tmp02a = output0_tm_1[0] + output0_tm_2[0];
                    int tmp13a = output0_tm_1[0] - output0_tm_2[0];

                    int tmp02b = output0_tm_3[0] + output0_tm_4[0];
                    int tmp13b = output0_tm_3[0] - output0_tm_4[0];

                    tmp[0][m] = output0_tm_0[0] + tmp02a + tmp02b;
                    tmp[1][m] = tmp13a + tmp13b * 2;
                    tmp[2][m] = tmp02a + tmp02b * 4;
                    tmp[3][m] = output0_tm_5[0] * 4 + tmp13a + tmp13b * 8;

                    output0_tm_0 += tiles * 6;
                    output0_tm_1 += tiles * 6;
                    output0_tm_2 += tiles * 6;
                    output0_tm_3 += tiles * 6;
                    output0_tm_4 += tiles * 6;
                    output0_tm_5 += tiles * 6;
                }
                for (int m = 5; m < 6; m++)
                {
                    int tmp02a = output0_tm_1[0] + output0_tm_2[0];
                    int tmp13a = output0_tm_1[0] - output0_tm_2[0];

                    int tmp02b = output0_tm_3[0] + output0_tm_4[0];
                    int tmp13b = output0_tm_3[0] - output0_tm_4[0];

                    tmp[0][m] = (output0_tm_0[0] + tmp02a + tmp02b) * 4;
                    tmp[1][m] = (tmp13a + tmp13b * 2) * 4;
                    tmp[2][m] = (tmp02a + tmp02b * 4) * 4;
                    tmp[3][m] = (output0_tm_5[0] * 4 + tmp13a + tmp13b * 8) * 4;

                    output0_tm_0 += tiles * 6;
                    output0_tm_1 += tiles * 6;
                    output0_tm_2 += tiles * 6;
                    output0_tm_3 += tiles * 6;
                    output0_tm_4 += tiles * 6;
                    output0_tm_5 += tiles * 6;
                }

                for (int m = 0; m < 4; m++)
                {
                    const int* tmp0 = tmp[m];

                    int tmp02a = tmp0[1] + tmp0[2];
                    int tmp13a = tmp0[1] - tmp0[2];

                    int tmp02b = tmp0[3] + tmp0[4];
                    int tmp13b = tmp0[3] - tmp0[4];

                    output0[0] = (tmp0[0] + tmp02a + tmp02b) / 576;
                    output0[1] = (tmp13a + tmp13b * 2) / 576;
                    output0[2] = (tmp02a + tmp02b * 4) / 576;
                    output0[3] = (tmp0[5] + tmp13a + tmp13b * 8) / 576;

                    output0 += outw;
                }
            }
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_winograd_transform_pack4.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_winograd63_transform_input_pack4_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt)
{
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int inch = bottom_blob.c;

    const int w_tiles = (w - 2) / 6;
    const int h_tiles = (h - 2) / 6;
    const int tiles = w_tiles * h_tiles;

    // const float itm[8][8] = {
    //     {1.0f,  0.0f, -5.25f,  0.00f,  5.25f,  0.00f, -1.0f, 0.0f},
    //
    //     {0.0f,  1.0f,  1.00f, -4.25f, -4.25f,  1.00f,  1.0f, 0.0f},
    //     {0.0f, -1.0f,  1.00f,  4.25f, -4.25f, -1.00f,  1.0f, 0.0f},
    //
    //     {0.0f,  0.5f,  0.25f, -2.50f, -1.25f,  2.00f,  1.0f, 0.0f},
    //     {0.0f, -0.5f,  0.25f,  2.50f, -1.25f, -2.00f,  1.0f, 0.0f},
    //
    //     {0.0f,  2.0f,  4.00f, -2.50f, -5.00f,  0.50f,  1.0f, 0.0f},
    //     {0.0f, -2.0f,  4.00f,  2.50f, -5.00f, -0.50f,  1.0f, 0.0f},
    //
    //     {0.0f, -1.0f,  0.00f,  5.25f,  0.00f, -5.25f,  0.0f, 1.0f}
    // };

    // 0 = r00 - r06 + (r04 - r02) * 5.25
    // 7 = r07 - r01 + (r03 - r05) * 5.25

    // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05)
    // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05)

    // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2)
    // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2)

    // reuse r04 * 1.25
    // reuse r03 * 2.5
    // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
    // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < inch; q++)
    {
        const Mat img0 = bottom_blob.channel(q);
        Mat img0_tm = bottom_blob_tm.channel(q);

        float tmp[8][8][4];

        __m128 _v5_25 = __lsx_vreplfr2vr_s(5.25f);
        __m128 _vm4_25 = __lsx_vreplfr2vr_s(-4.25f);
        __m128 _vm1_25 = __lsx_vreplfr2vr_s(-1.25f);
        __m128 _v0_25 = __lsx_vreplfr2vr_s(0.25f);
        __m128 _vm2_5 = __lsx_vreplfr2vr_s(-2.5f);
        __m128 _v0_5 = __lsx_vreplfr2vr_s(0.5f);
        __m128 _v2 = __lsx_vreplfr2vr_s(2.f);
        __m128 _v4 = __lsx_vreplfr2vr_s(4.f);

        // tile
        for (int i = 0; i < h_tiles; i++)
        {
            for (int j = 0; j < w_tiles; j++)
            {
                const float* r0 = img0.row(i * 6) + (j * 6) * 4;

                for (int m = 0; m < 8; m++)
                {
                    __m128 _r00 = (__m128)__lsx_vld(r0, 0);
                    __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0);
                    __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0);
                    __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0);
                    __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0);
                    __m128 _r05 = (__m128)__lsx_vld(r0 + 4 * 5, 0);
                    __m128 _r06 = (__m128)__lsx_vld(r0 + 4 * 6, 0);
                    __m128 _r07 = (__m128)__lsx_vld(r0 + 4 * 7, 0);

                    __m128 _tmp0m = __lsx_vfmadd_s(__lsx_vfsub_s(_r04, _r02), _v5_25, __lsx_vfsub_s(_r00, _r06));
                    __m128 _tmp7m = __lsx_vfmadd_s(__lsx_vfsub_s(_r03, _r05), _v5_25, __lsx_vfsub_s(_r07, _r01));
                    __lsx_vst(_tmp0m, tmp[0][m], 0);
                    __lsx_vst(_tmp7m, tmp[7][m], 0);

                    __m128 _tmp12a = __lsx_vfmadd_s(_r04, _vm4_25, __lsx_vfadd_s(_r02, _r06));
                    __m128 _tmp12b = __lsx_vfmadd_s(_r03, _vm4_25, __lsx_vfadd_s(_r01, _r05));

                    __m128 _tmp1m = __lsx_vfadd_s(_tmp12a, _tmp12b);
                    __m128 _tmp2m = __lsx_vfsub_s(_tmp12a, _tmp12b);
                    __lsx_vst(_tmp1m, tmp[1][m], 0);
                    __lsx_vst(_tmp2m, tmp[2][m], 0);

                    __m128 _tmp34a = __lsx_vfmadd_s(_r04, _vm1_25, __lsx_vfmadd_s(_r02, _v0_25, _r06));
                    __m128 _tmp34b = __lsx_vfmadd_s(_r05, _v2, __lsx_vfmadd_s(_r03, _vm2_5, __lsx_vfmul_s(_r01, _v0_5)));

                    __m128 _tmp3m = __lsx_vfadd_s(_tmp34a, _tmp34b);
                    __m128 _tmp4m = __lsx_vfsub_s(_tmp34a, _tmp34b);
                    __lsx_vst(_tmp3m, tmp[3][m], 0);
                    __lsx_vst(_tmp4m, tmp[4][m], 0);

                    __m128 _tmp56a = __lsx_vfmadd_s(__lsx_vfmadd_s(_r04, _vm1_25, _r02), _v4, _r06);
                    __m128 _tmp56b = __lsx_vfmadd_s(_r05, _v0_5, __lsx_vfmadd_s(_r03, _vm2_5, __lsx_vfmul_s(_r01, _v2)));

                    __m128 _tmp5m = __lsx_vfadd_s(_tmp56a, _tmp56b);
                    __m128 _tmp6m = __lsx_vfsub_s(_tmp56a, _tmp56b);
                    __lsx_vst(_tmp5m, tmp[5][m], 0);
                    __lsx_vst(_tmp6m, tmp[6][m], 0);

                    r0 += w * 4;
                }

                float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j) * 4;
                float* r0_tm_1 = r0_tm_0 + tiles * 4;
                float* r0_tm_2 = r0_tm_0 + tiles * 4 * 2;
                float* r0_tm_3 = r0_tm_0 + tiles * 4 * 3;
                float* r0_tm_4 = r0_tm_0 + tiles * 4 * 4;
                float* r0_tm_5 = r0_tm_0 + tiles * 4 * 5;
                float* r0_tm_6 = r0_tm_0 + tiles * 4 * 6;
                float* r0_tm_7 = r0_tm_0 + tiles * 4 * 7;

                for (int m = 0; m < 8; m++)
                {
                    __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0);
                    __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0);
                    __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0);
                    __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0);
                    __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0);
                    __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0);
                    __m128 _tmp06 = (__m128)__lsx_vld(tmp[m][6], 0);
                    __m128 _tmp07 = (__m128)__lsx_vld(tmp[m][7], 0);

                    __m128 _r0tm0 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp04, _tmp02), _v5_25, __lsx_vfsub_s(_tmp00, _tmp06));
                    __m128 _r0tm7 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp03, _tmp05), _v5_25, __lsx_vfsub_s(_tmp07, _tmp01));

                    __m128 _tmp12a = __lsx_vfmadd_s(_tmp04, _vm4_25, __lsx_vfadd_s(_tmp02, _tmp06));
                    __m128 _tmp12b = __lsx_vfmadd_s(_tmp03, _vm4_25, __lsx_vfadd_s(_tmp01, _tmp05));

                    __m128 _r0tm1 = __lsx_vfadd_s(_tmp12a, _tmp12b);
                    __m128 _r0tm2 = __lsx_vfsub_s(_tmp12a, _tmp12b);

                    __m128 _tmp34a = __lsx_vfmadd_s(_tmp04, _vm1_25, __lsx_vfmadd_s(_tmp02, _v0_25, _tmp06));
                    __m128 _tmp34b = __lsx_vfmadd_s(_tmp05, _v2, __lsx_vfmadd_s(_tmp03, _vm2_5, __lsx_vfmul_s(_tmp01, _v0_5)));

                    __m128 _r0tm3 = __lsx_vfadd_s(_tmp34a, _tmp34b);
                    __m128 _r0tm4 = __lsx_vfsub_s(_tmp34a, _tmp34b);

                    __m128 _tmp56a = __lsx_vfmadd_s(__lsx_vfmadd_s(_tmp04, _vm1_25, _tmp02), _v4, _tmp06);
                    __m128 _tmp56b = __lsx_vfmadd_s(_tmp05, _v0_5, __lsx_vfmadd_s(_tmp03, _vm2_5, __lsx_vfmul_s(_tmp01, _v2)));

                    __m128 _r0tm5 = __lsx_vfadd_s(_tmp56a, _tmp56b);
                    __m128 _r0tm6 = __lsx_vfsub_s(_tmp56a, _tmp56b);

                    __lsx_vst(_r0tm0, r0_tm_0, 0);
                    __lsx_vst(_r0tm1, r0_tm_1, 0);
                    __lsx_vst(_r0tm2, r0_tm_2, 0);
                    __lsx_vst(_r0tm3, r0_tm_3, 0);
                    __lsx_vst(_r0tm4, r0_tm_4, 0);
                    __lsx_vst(_r0tm5, r0_tm_5, 0);
                    __lsx_vst(_r0tm6, r0_tm_6, 0);
                    __lsx_vst(_r0tm7, r0_tm_7, 0);

                    r0_tm_0 += tiles * 4 * 8;
                    r0_tm_1 += tiles * 4 * 8;
                    r0_tm_2 += tiles * 4 * 8;
                    r0_tm_3 += tiles * 4 * 8;
                    r0_tm_4 += tiles * 4 * 8;
                    r0_tm_5 += tiles * 4 * 8;
                    r0_tm_6 += tiles * 4 * 8;
                    r0_tm_7 += tiles * 4 * 8;
                }
            }
        }
    }
}

static void conv3x3s1_winograd63_transform_output_pack4_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt)
{
    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const int w_tiles = outw / 6;
    const int h_tiles = outh / 6;
    const int tiles = w_tiles * h_tiles;

    const float* biasptr = bias;

    // const float otm[6][8] = {
    //     {1.0f,  1.0f,   1.0f,   1.0f,   1.0f,  32.0f, 32.0f, 0.0f},
    //     {0.0f,  1.0f,  -1.0f,   2.0f,  -2.0f,  16.0f,-16.0f, 0.0f},
    //     {0.0f,  1.0f,   1.0f,   4.0f,   4.0f,   8.0f,  8.0f, 0.0f},
    //     {0.0f,  1.0f,  -1.0f,   8.0f,  -8.0f,   4.0f, -4.0f, 0.0f},
    //     {0.0f,  1.0f,   1.0f,  16.0f,  16.0f,   2.0f,  2.0f, 0.0f},
    //     {0.0f,  1.0f,  -1.0f,  32.0f, -32.0f,   1.0f, -1.0f, 1.0f}
    // };

    // 0 = r0 + (r1 + r2) + (r3 + r4)     + (r5 + r6) * 32
    // 1 =      (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16
    // 2 =      (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8
    // 3 =      (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4
    // 4 =      (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2
    // 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6)

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        const Mat out0_tm = top_blob_tm.channel(p);
        Mat out0 = top_blob.channel(p);

        __m128 _bias0 = biasptr ? (__m128)__lsx_vld(biasptr + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0);

        float tmp[6][8][4];

        __m128 _v32 = __lsx_vreplfr2vr_s(32.f);
        __m128 _v16 = __lsx_vreplfr2vr_s(16.f);
        __m128 _v8 = __lsx_vreplfr2vr_s(8.f);
        __m128 _v4 = __lsx_vreplfr2vr_s(4.f);
        __m128 _v2 = __lsx_vreplfr2vr_s(2.f);

        // tile
        for (int i = 0; i < h_tiles; i++)
        {
            for (int j = 0; j < w_tiles; j++)
            {
                const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j) * 4;
                const float* output0_tm_1 = output0_tm_0 + tiles * 4;
                const float* output0_tm_2 = output0_tm_0 + tiles * 4 * 2;
                const float* output0_tm_3 = output0_tm_0 + tiles * 4 * 3;
                const float* output0_tm_4 = output0_tm_0 + tiles * 4 * 4;
                const float* output0_tm_5 = output0_tm_0 + tiles * 4 * 5;
                const float* output0_tm_6 = output0_tm_0 + tiles * 4 * 6;
                const float* output0_tm_7 = output0_tm_0 + tiles * 4 * 7;

                float* output0 = out0.row<float>(i * 6) + (j * 6) * 4;

                for (int m = 0; m < 8; m++)
                {
                    __m128 _out0tm0 = (__m128)__lsx_vld(output0_tm_0, 0);
                    __m128 _out0tm1 = (__m128)__lsx_vld(output0_tm_1, 0);
                    __m128 _out0tm2 = (__m128)__lsx_vld(output0_tm_2, 0);
                    __m128 _out0tm3 = (__m128)__lsx_vld(output0_tm_3, 0);
                    __m128 _out0tm4 = (__m128)__lsx_vld(output0_tm_4, 0);
                    __m128 _out0tm5 = (__m128)__lsx_vld(output0_tm_5, 0);
                    __m128 _out0tm6 = (__m128)__lsx_vld(output0_tm_6, 0);
                    __m128 _out0tm7 = (__m128)__lsx_vld(output0_tm_7, 0);

                    __m128 _tmp024a = __lsx_vfadd_s(_out0tm1, _out0tm2);
                    __m128 _tmp135a = __lsx_vfsub_s(_out0tm1, _out0tm2);

                    __m128 _tmp024b = __lsx_vfadd_s(_out0tm3, _out0tm4);
                    __m128 _tmp135b = __lsx_vfsub_s(_out0tm3, _out0tm4);

                    __m128 _tmp024c = __lsx_vfadd_s(_out0tm5, _out0tm6);
                    __m128 _tmp135c = __lsx_vfsub_s(_out0tm5, _out0tm6);

                    __m128 _tmp0m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm0, _tmp024a), __lsx_vfmadd_s(_tmp024c, _v32, _tmp024b));
                    __m128 _tmp2m = __lsx_vfmadd_s(_tmp024c, _v8, __lsx_vfmadd_s(_tmp024b, _v4, _tmp024a));
                    __m128 _tmp4m = __lsx_vfmadd_s(_tmp024c, _v2, __lsx_vfmadd_s(_tmp024b, _v16, _tmp024a));
                    __lsx_vst(_tmp0m, tmp[0][m], 0);
                    __lsx_vst(_tmp2m, tmp[2][m], 0);
                    __lsx_vst(_tmp4m, tmp[4][m], 0);

                    __m128 _tmp1m = __lsx_vfmadd_s(_tmp135c, _v16, __lsx_vfmadd_s(_tmp135b, _v2, _tmp135a));
                    __m128 _tmp3m = __lsx_vfmadd_s(_tmp135c, _v4, __lsx_vfmadd_s(_tmp135b, _v8, _tmp135a));
                    __m128 _tmp5m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm7, _tmp135a), __lsx_vfmadd_s(_tmp135b, _v32, _tmp135c));
                    __lsx_vst(_tmp1m, tmp[1][m], 0);
                    __lsx_vst(_tmp3m, tmp[3][m], 0);
                    __lsx_vst(_tmp5m, tmp[5][m], 0);

                    output0_tm_0 += tiles * 4 * 8;
                    output0_tm_1 += tiles * 4 * 8;
                    output0_tm_2 += tiles * 4 * 8;
                    output0_tm_3 += tiles * 4 * 8;
                    output0_tm_4 += tiles * 4 * 8;
                    output0_tm_5 += tiles * 4 * 8;
                    output0_tm_6 += tiles * 4 * 8;
                    output0_tm_7 += tiles * 4 * 8;
                }

                for (int m = 0; m < 6; m++)
                {
                    __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0);
                    __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0);
                    __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0);
                    __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0);
                    __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0);
                    __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0);
                    __m128 _tmp06 = (__m128)__lsx_vld(tmp[m][6], 0);
                    __m128 _tmp07 = (__m128)__lsx_vld(tmp[m][7], 0);

                    __m128 _tmp024a = __lsx_vfadd_s(_tmp01, _tmp02);
                    __m128 _tmp135a = __lsx_vfsub_s(_tmp01, _tmp02);

                    __m128 _tmp024b = __lsx_vfadd_s(_tmp03, _tmp04);
                    __m128 _tmp135b = __lsx_vfsub_s(_tmp03, _tmp04);

                    __m128 _tmp024c = __lsx_vfadd_s(_tmp05, _tmp06);
                    __m128 _tmp135c = __lsx_vfsub_s(_tmp05, _tmp06);

                    __m128 _out00 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp00, _tmp024a), __lsx_vfmadd_s(_tmp024c, _v32, _tmp024b)));
                    __m128 _out02 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp024c, _v8, __lsx_vfmadd_s(_tmp024b, _v4, _tmp024a)));
                    __m128 _out04 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp024c, _v2, __lsx_vfmadd_s(_tmp024b, _v16, _tmp024a)));
                    __lsx_vst(_out00, output0, 0);
                    __lsx_vst(_out02, output0 + 4 * 2, 0);
                    __lsx_vst(_out04, output0 + 4 * 4, 0);

                    __m128 _out01 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp135c, _v16, __lsx_vfmadd_s(_tmp135b, _v2, _tmp135a)));
                    __m128 _out03 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp135c, _v4, __lsx_vfmadd_s(_tmp135b, _v8, _tmp135a)));
                    __m128 _out05 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp07, _tmp135a), __lsx_vfmadd_s(_tmp135b, _v32, _tmp135c)));
                    __lsx_vst(_out01, output0 + 4, 0);
                    __lsx_vst(_out03, output0 + 4 * 3, 0);
                    __lsx_vst(_out05, output0 + 4 * 5, 0);

                    output0 += outw * 4;
                }
            }
        }
    }
}

static void conv3x3s1_winograd43_transform_input_pack4_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt)
{
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int inch = bottom_blob.c;

    const int w_tiles = (w - 2) / 4;
    const int h_tiles = (h - 2) / 4;
    const int tiles = w_tiles * h_tiles;

    // const float itm[6][6] = {
    //     {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f},
    //     {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f},
    //     {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f},
    //     {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f},
    //     {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f},
    //     {0.0f, 4.0f,  0.0f,-5.0f, 0.0f, 1.0f}
    // };

    // 0 =  4 * r00 - 5 * r02 + r04
    // 1 = -4 * (r01 + r02) + r04 + r03
    // 2 =  4 * (r01 - r02) + r04 - r03
    // 3 = -2 * (r01 - r03) + r04 - r02
    // 4 =  2 * (r01 - r03) + r04 - r02
    // 5 =  4 * r01 - 5 * r03 + r05

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < inch; q++)
    {
        const Mat img0 = bottom_blob.channel(q);
        Mat img0_tm = bottom_blob_tm.channel(q);

        float tmp[6][6][4];

        __m128 _vm5 = __lsx_vreplfr2vr_s(-5.f);
        __m128 _vm4 = __lsx_vreplfr2vr_s(-4.f);
        __m128 _v4 = __lsx_vreplfr2vr_s(4.f);
        __m128 _vm2 = __lsx_vreplfr2vr_s(-2.f);
        __m128 _v2 = __lsx_vreplfr2vr_s(2.f);

        // tile
        for (int i = 0; i < h_tiles; i++)
        {
            for (int j = 0; j < w_tiles; j++)
            {
                const float* r0 = img0.row(i * 4) + (j * 4) * 4;

                for (int m = 0; m < 6; m++)
                {
                    __m128 _r00 = (__m128)__lsx_vld(r0, 0);
                    __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0);
                    __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0);
                    __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0);
                    __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0);
                    __m128 _r05 = (__m128)__lsx_vld(r0 + 4 * 5, 0);

                    __m128 _tmp0m = __lsx_vfmadd_s(_r02, _vm5, __lsx_vfmadd_s(_r00, _v4, _r04));
                    __m128 _tmp1m = __lsx_vfmadd_s(__lsx_vfadd_s(_r01, _r02), _vm4, __lsx_vfadd_s(_r04, _r03));
                    __m128 _tmp2m = __lsx_vfmadd_s(__lsx_vfsub_s(_r01, _r02), _v4, __lsx_vfsub_s(_r04, _r03));
                    __m128 _tmp3m = __lsx_vfmadd_s(__lsx_vfsub_s(_r01, _r03), _vm2, __lsx_vfsub_s(_r04, _r02));
                    __m128 _tmp4m = __lsx_vfmadd_s(__lsx_vfsub_s(_r01, _r03), _v2, __lsx_vfsub_s(_r04, _r02));
                    __m128 _tmp5m = __lsx_vfmadd_s(_r03, _vm5, __lsx_vfmadd_s(_r01, _v4, _r05));

                    __lsx_vst(_tmp0m, tmp[0][m], 0);
                    __lsx_vst(_tmp1m, tmp[1][m], 0);
                    __lsx_vst(_tmp2m, tmp[2][m], 0);
                    __lsx_vst(_tmp3m, tmp[3][m], 0);
                    __lsx_vst(_tmp4m, tmp[4][m], 0);
                    __lsx_vst(_tmp5m, tmp[5][m], 0);

                    r0 += w * 4;
                }

                float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j) * 4;
                float* r0_tm_1 = r0_tm_0 + tiles * 4;
                float* r0_tm_2 = r0_tm_0 + tiles * 4 * 2;
                float* r0_tm_3 = r0_tm_0 + tiles * 4 * 3;
                float* r0_tm_4 = r0_tm_0 + tiles * 4 * 4;
                float* r0_tm_5 = r0_tm_0 + tiles * 4 * 5;

                for (int m = 0; m < 6; m++)
                {
                    __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0);
                    __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0);
                    __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0);
                    __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0);
                    __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0);
                    __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0);

                    __m128 _r0tm0 = __lsx_vfmadd_s(_tmp02, _vm5, __lsx_vfmadd_s(_tmp00, _v4, _tmp04));
                    __m128 _r0tm1 = __lsx_vfmadd_s(__lsx_vfadd_s(_tmp01, _tmp02), _vm4, __lsx_vfadd_s(_tmp04, _tmp03));
                    __m128 _r0tm2 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp01, _tmp02), _v4, __lsx_vfsub_s(_tmp04, _tmp03));
                    __m128 _r0tm3 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp01, _tmp03), _vm2, __lsx_vfsub_s(_tmp04, _tmp02));
                    __m128 _r0tm4 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp01, _tmp03), _v2, __lsx_vfsub_s(_tmp04, _tmp02));
                    __m128 _r0tm5 = __lsx_vfmadd_s(_tmp03, _vm5, __lsx_vfmadd_s(_tmp01, _v4, _tmp05));

                    __lsx_vst(_r0tm0, r0_tm_0, 0);
                    __lsx_vst(_r0tm1, r0_tm_1, 0);
                    __lsx_vst(_r0tm2, r0_tm_2, 0);
                    __lsx_vst(_r0tm3, r0_tm_3, 0);
                    __lsx_vst(_r0tm4, r0_tm_4, 0);
                    __lsx_vst(_r0tm5, r0_tm_5, 0);

                    r0_tm_0 += tiles * 4 * 6;
                    r0_tm_1 += tiles * 4 * 6;
                    r0_tm_2 += tiles * 4 * 6;
                    r0_tm_3 += tiles * 4 * 6;
                    r0_tm_4 += tiles * 4 * 6;
                    r0_tm_5 += tiles * 4 * 6;
                }
            }
        }
    }
}

static void conv3x3s1_winograd43_transform_output_pack4_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt)
{
    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const int w_tiles = outw / 4;
    const int h_tiles = outh / 4;
    const int tiles = w_tiles * h_tiles;

    const float* biasptr = bias;

    // const float otm[4][6] = {
    //     {1.0f, 1.0f,  1.0f, 1.0f,  1.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f},
    //     {0.0f, 1.0f,  1.0f, 4.0f,  4.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f}
    // };

    // 0 = r00 + (r01 + r02) + (r03 + r04)
    // 1 =       (r01 - r02) + (r03 - r04) * 2
    // 2 =       (r01 + r02) + (r03 + r04) * 4
    // 3 = r05 + (r01 - r02) + (r03 - r04) * 8

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        const Mat out0_tm = top_blob_tm.channel(p);
        Mat out0 = top_blob.channel(p);

        __m128 _bias0 = biasptr ? (__m128)__lsx_vld(biasptr + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0);

        float tmp[4][6][4];

        __m128 _v2 = __lsx_vreplfr2vr_s(2.f);
        __m128 _v4 = __lsx_vreplfr2vr_s(4.f);
        __m128 _v8 = __lsx_vreplfr2vr_s(8.f);

        // tile
        for (int i = 0; i < h_tiles; i++)
        {
            for (int j = 0; j < w_tiles; j++)
            {
                const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j) * 4;
                const float* output0_tm_1 = output0_tm_0 + tiles * 4;
                const float* output0_tm_2 = output0_tm_0 + tiles * 4 * 2;
                const float* output0_tm_3 = output0_tm_0 + tiles * 4 * 3;
                const float* output0_tm_4 = output0_tm_0 + tiles * 4 * 4;
                const float* output0_tm_5 = output0_tm_0 + tiles * 4 * 5;

                float* output0 = out0.row<float>(i * 4) + (j * 4) * 4;

                for (int m = 0; m < 6; m++)
                {
                    __m128 _out0tm0 = (__m128)__lsx_vld(output0_tm_0, 0);
                    __m128 _out0tm1 = (__m128)__lsx_vld(output0_tm_1, 0);
                    __m128 _out0tm2 = (__m128)__lsx_vld(output0_tm_2, 0);
                    __m128 _out0tm3 = (__m128)__lsx_vld(output0_tm_3, 0);
                    __m128 _out0tm4 = (__m128)__lsx_vld(output0_tm_4, 0);
                    __m128 _out0tm5 = (__m128)__lsx_vld(output0_tm_5, 0);

                    __m128 _tmp02a = __lsx_vfadd_s(_out0tm1, _out0tm2);
                    __m128 _tmp13a = __lsx_vfsub_s(_out0tm1, _out0tm2);

                    __m128 _tmp02b = __lsx_vfadd_s(_out0tm3, _out0tm4);
                    __m128 _tmp13b = __lsx_vfsub_s(_out0tm3, _out0tm4);

                    __m128 _tmp0m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm0, _tmp02a), _tmp02b);
                    __m128 _tmp1m = __lsx_vfmadd_s(_tmp13b, _v2, _tmp13a);
                    __m128 _tmp2m = __lsx_vfmadd_s(_tmp02b, _v4, _tmp02a);
                    __m128 _tmp3m = __lsx_vfmadd_s(_tmp13b, _v8, __lsx_vfadd_s(_out0tm5, _tmp13a));

                    __lsx_vst(_tmp0m, tmp[0][m], 0);
                    __lsx_vst(_tmp1m, tmp[1][m], 0);
                    __lsx_vst(_tmp2m, tmp[2][m], 0);
                    __lsx_vst(_tmp3m, tmp[3][m], 0);

                    output0_tm_0 += tiles * 4 * 6;
                    output0_tm_1 += tiles * 4 * 6;
                    output0_tm_2 += tiles * 4 * 6;
                    output0_tm_3 += tiles * 4 * 6;
                    output0_tm_4 += tiles * 4 * 6;
                    output0_tm_5 += tiles * 4 * 6;
                }

                for (int m = 0; m < 4; m++)
                {
                    __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0);
                    __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0);
                    __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0);
                    __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0);
                    __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0);
                    __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0);

                    __m128 _tmp02a = __lsx_vfadd_s(_tmp01, _tmp02);
                    __m128 _tmp13a = __lsx_vfsub_s(_tmp01, _tmp02);

                    __m128 _tmp02b = __lsx_vfadd_s(_tmp03, _tmp04);
                    __m128 _tmp13b = __lsx_vfsub_s(_tmp03, _tmp04);

                    __m128 _out00 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp00, _tmp02a), _tmp02b));
                    __m128 _out01 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp13b, _v2, _tmp13a));
                    __m128 _out02 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp02b, _v4, _tmp02a));
                    __m128 _out03 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp13b, _v8, __lsx_vfadd_s(_tmp05, _tmp13a)));

                    __lsx_vst(_out00, output0, 0);
                    __lsx_vst(_out01, output0 + 4, 0);
                    __lsx_vst(_out02, output0 + 4 * 2, 0);
                    __lsx_vst(_out03, output0 + 4 * 3, 0);

                    output0 += outw * 4;
                }
            }
        }
    }
}

static void conv3x3s1_winograd23_transform_input_pack4_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt)
{
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int inch = bottom_blob.c;

    const int w_tiles = (w - 2) / 2;
    const int h_tiles = (h - 2) / 2;
    const int tiles = w_tiles * h_tiles;

    // const float itm[4][4] = {
    //     {1.0f,  0.0f, -1.0f,  0.0f},
    //     {0.0f,  1.0f,  1.00f, 0.0f},
    //     {0.0f, -1.0f,  1.00f, 0.0f},
    //     {0.0f, -1.0f,  0.00f, 1.0f}
    // };

    // 0 = r00 - r02
    // 1 = r01 + r02
    // 2 = r02 - r01
    // 3 = r03 - r01

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < inch; q++)
    {
        const Mat img0 = bottom_blob.channel(q);
        Mat img0_tm = bottom_blob_tm.channel(q);

        float tmp[4][4][4];

        // tile
        for (int i = 0; i < h_tiles; i++)
        {
            for (int j = 0; j < w_tiles; j++)
            {
                const float* r0 = img0.row(i * 2) + (j * 2) * 4;

                for (int m = 0; m < 4; m++)
                {
                    __m128 _r00 = (__m128)__lsx_vld(r0, 0);
                    __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0);
                    __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0);
                    __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0);

                    __m128 _tmp0m = __lsx_vfsub_s(_r00, _r02);
                    __m128 _tmp1m = __lsx_vfadd_s(_r01, _r02);
                    __m128 _tmp2m = __lsx_vfsub_s(_r02, _r01);
                    __m128 _tmp3m = __lsx_vfsub_s(_r03, _r01);

                    __lsx_vst(_tmp0m, tmp[0][m], 0);
                    __lsx_vst(_tmp1m, tmp[1][m], 0);
                    __lsx_vst(_tmp2m, tmp[2][m], 0);
                    __lsx_vst(_tmp3m, tmp[3][m], 0);

                    r0 += w * 4;
                }

                float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j) * 4;
                float* r0_tm_1 = r0_tm_0 + tiles * 4;
                float* r0_tm_2 = r0_tm_0 + tiles * 4 * 2;
                float* r0_tm_3 = r0_tm_0 + tiles * 4 * 3;

                for (int m = 0; m < 4; m++)
                {
                    __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0);
                    __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0);
                    __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0);
                    __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0);

                    __m128 _r0tm0 = __lsx_vfsub_s(_tmp00, _tmp02);
                    __m128 _r0tm1 = __lsx_vfadd_s(_tmp01, _tmp02);
                    __m128 _r0tm2 = __lsx_vfsub_s(_tmp02, _tmp01);
                    __m128 _r0tm3 = __lsx_vfsub_s(_tmp03, _tmp01);

                    __lsx_vst(_r0tm0, r0_tm_0, 0);
                    __lsx_vst(_r0tm1, r0_tm_1, 0);
                    __lsx_vst(_r0tm2, r0_tm_2, 0);
                    __lsx_vst(_r0tm3, r0_tm_3, 0);

                    r0_tm_0 += tiles * 4 * 4;
                    r0_tm_1 += tiles * 4 * 4;
                    r0_tm_2 += tiles * 4 * 4;
                    r0_tm_3 += tiles * 4 * 4;
                }
            }
        }
    }
}

static void conv3x3s1_winograd23_transform_output_pack4_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt)
{
    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const int w_tiles = outw / 2;
    const int h_tiles = outh / 2;
    const int tiles = w_tiles * h_tiles;

    const float* biasptr = bias;

    // const float otm[2][4] = {
    //     {1.0f,  1.0f,  1.0f,  0.0f},
    //     {0.0f,  1.0f, -1.0f,  1.0f}
    // };

    // 0 = r00 + r01 + r02
    // 1 = r01 - r02 + r03

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        const Mat out0_tm = top_blob_tm.channel(p);
        Mat out0 = top_blob.channel(p);

        __m128 _bias0 = biasptr ? (__m128)__lsx_vld(biasptr + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0);

        float tmp[2][4][4];

        // tile
        for (int i = 0; i < h_tiles; i++)
        {
            for (int j = 0; j < w_tiles; j++)
            {
                const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j) * 4;
                const float* output0_tm_1 = output0_tm_0 + tiles * 4;
                const float* output0_tm_2 = output0_tm_0 + tiles * 4 * 2;
                const float* output0_tm_3 = output0_tm_0 + tiles * 4 * 3;

                float* output0 = out0.row<float>(i * 2) + (j * 2) * 4;

                for (int m = 0; m < 4; m++)
                {
                    __m128 _out0tm0 = (__m128)__lsx_vld(output0_tm_0, 0);
                    __m128 _out0tm1 = (__m128)__lsx_vld(output0_tm_1, 0);
                    __m128 _out0tm2 = (__m128)__lsx_vld(output0_tm_2, 0);
                    __m128 _out0tm3 = (__m128)__lsx_vld(output0_tm_3, 0);

                    __m128 _tmp0m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm0, _out0tm1), _out0tm2);
                    __m128 _tmp1m = __lsx_vfadd_s(__lsx_vfsub_s(_out0tm1, _out0tm2), _out0tm3);

                    __lsx_vst(_tmp0m, tmp[0][m], 0);
                    __lsx_vst(_tmp1m, tmp[1][m], 0);

                    output0_tm_0 += tiles * 4 * 4;
                    output0_tm_1 += tiles * 4 * 4;
                    output0_tm_2 += tiles * 4 * 4;
                    output0_tm_3 += tiles * 4 * 4;
                }

                for (int m = 0; m < 2; m++)
                {
                    __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0);
                    __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0);
                    __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0);
                    __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0);

                    __m128 _out00 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp00, _tmp01), _tmp02));
                    __m128 _out01 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfsub_s(_tmp01, _tmp02), _tmp03));

                    __lsx_vst(_out00, output0, 0);
                    __lsx_vst(_out01, output0 + 4, 0);

                    output0 += outw * 4;
                }
            }
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_winograd_transform_pack4_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_winograd43_transform_output_pack4_int8_lsx(const Mat& top_blob_tm, Mat& top_blob, const Option& opt)
{
    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const int w_tiles = outw / 4;
    const int h_tiles = outh / 4;
    const int tiles = w_tiles * h_tiles;

    // const float otm[4][6] = {
    //     {1.0f, 1.0f,  1.0f, 1.0f,  1.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f},
    //     {0.0f, 1.0f,  1.0f, 4.0f,  4.0f, 0.0f},
    //     {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f}
    // };

    // 0 = r00 + (r01 + r02) + (r03 + r04)
    // 1 =       (r01 - r02) + (r03 - r04) * 2
    // 2 =       (r01 + r02) + (r03 + r04) * 4
    // 3 = r05 + (r01 - r02) + (r03 - r04) * 8

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        const Mat out0_tm = top_blob_tm.channel(p);
        Mat out0 = top_blob.channel(p);

        int tmp[4][6][4];

        // tile
        for (int i = 0; i < h_tiles; i++)
        {
            for (int j = 0; j < w_tiles; j++)
            {
                const int* output0_tm_0 = (const int*)out0_tm + (i * w_tiles + j) * 4;
                const int* output0_tm_1 = output0_tm_0 + tiles * 4;
                const int* output0_tm_2 = output0_tm_0 + tiles * 8;
                const int* output0_tm_3 = output0_tm_0 + tiles * 12;
                const int* output0_tm_4 = output0_tm_0 + tiles * 16;
                const int* output0_tm_5 = output0_tm_0 + tiles * 20;

                int* output0 = out0.row<int>(i * 4) + (j * 4) * 4;

                for (int m = 0; m < 5; m++)
                {
                    __m128i _out0tm0 = __lsx_vld(output0_tm_0, 0);
                    __m128i _out0tm1 = __lsx_vld(output0_tm_1, 0);
                    __m128i _out0tm2 = __lsx_vld(output0_tm_2, 0);
                    __m128i _out0tm3 = __lsx_vld(output0_tm_3, 0);
                    __m128i _out0tm4 = __lsx_vld(output0_tm_4, 0);
                    __m128i _out0tm5 = __lsx_vld(output0_tm_5, 0);

                    __m128i _tmp02a = __lsx_vadd_w(_out0tm1, _out0tm2);
                    __m128i _tmp13a = __lsx_vsub_w(_out0tm1, _out0tm2);

                    __m128i _tmp02b = __lsx_vadd_w(_out0tm3, _out0tm4);
                    __m128i _tmp13b = __lsx_vsub_w(_out0tm3, _out0tm4);

                    __m128i _tmp0m = __lsx_vadd_w(__lsx_vadd_w(_out0tm0, _tmp02a), _tmp02b);
                    __m128i _tmp1m = __lsx_vadd_w(_tmp13a, __lsx_vslli_w(_tmp13b, 1));
                    __m128i _tmp2m = __lsx_vadd_w(_tmp02a, __lsx_vslli_w(_tmp02b, 2));
                    __m128i _tmp3m = __lsx_vadd_w(__lsx_vadd_w(_tmp13a, __lsx_vslli_w(_out0tm5, 2)), __lsx_vslli_w(_tmp13b, 3));

                    __lsx_vst(_tmp0m, tmp[0][m], 0);
                    __lsx_vst(_tmp1m, tmp[1][m], 0);
                    __lsx_vst(_tmp2m, tmp[2][m], 0);
                    __lsx_vst(_tmp3m, tmp[3][m], 0);

                    output0_tm_0 += tiles * 24;
                    output0_tm_1 += tiles * 24;
                    output0_tm_2 += tiles * 24;
                    output0_tm_3 += tiles * 24;
                    output0_tm_4 += tiles * 24;
                    output0_tm_5 += tiles * 24;
                }
                for (int m = 5; m < 6; m++)
                {
                    __m128i _out0tm0 = __lsx_vld(output0_tm_0, 0);
                    __m128i _out0tm1 = __lsx_vld(output0_tm_1, 0);
                    __m128i _out0tm2 = __lsx_vld(output0_tm_2, 0);
                    __m128i _out0tm3 = __lsx_vld(output0_tm_3, 0);
                    __m128i _out0tm4 = __lsx_vld(output0_tm_4, 0);
                    __m128i _out0tm5 = __lsx_vld(output0_tm_5, 0);

                    __m128i _tmp02a = __lsx_vadd_w(_out0tm1, _out0tm2);
                    __m128i _tmp13a = __lsx_vsub_w(_out0tm1, _out0tm2);

                    __m128i _tmp02b = __lsx_vadd_w(_out0tm3, _out0tm4);
                    __m128i _tmp13b = __lsx_vsub_w(_out0tm3, _out0tm4);

                    __m128i _tmp0m = __lsx_vadd_w(__lsx_vadd_w(_out0tm0, _tmp02a), _tmp02b);
                    __m128i _tmp1m = __lsx_vadd_w(_tmp13a, __lsx_vslli_w(_tmp13b, 1));
                    __m128i _tmp2m = __lsx_vadd_w(_tmp02a, __lsx_vslli_w(_tmp02b, 2));
                    __m128i _tmp3m = __lsx_vadd_w(__lsx_vadd_w(_tmp13a, __lsx_vslli_w(_out0tm5, 2)), __lsx_vslli_w(_tmp13b, 3));

                    _tmp0m = __lsx_vslli_w(_tmp0m, 2);
                    _tmp1m = __lsx_vslli_w(_tmp1m, 2);
                    _tmp2m = __lsx_vslli_w(_tmp2m, 2);
                    _tmp3m = __lsx_vslli_w(_tmp3m, 2);

                    __lsx_vst(_tmp0m, tmp[0][m], 0);
                    __lsx_vst(_tmp1m, tmp[1][m], 0);
                    __lsx_vst(_tmp2m, tmp[2][m], 0);
                    __lsx_vst(_tmp3m, tmp[3][m], 0);

                    output0_tm_0 += tiles * 24;
                    output0_tm_1 += tiles * 24;
                    output0_tm_2 += tiles * 24;
                    output0_tm_3 += tiles * 24;
                    output0_tm_4 += tiles * 24;
                    output0_tm_5 += tiles * 24;
                }

                for (int m = 0; m < 4; m++)
                {
                    __m128i _tmp00 = __lsx_vld(tmp[m][0], 0);
                    __m128i _tmp01 = __lsx_vld(tmp[m][1], 0);
                    __m128i _tmp02 = __lsx_vld(tmp[m][2], 0);
                    __m128i _tmp03 = __lsx_vld(tmp[m][3], 0);
                    __m128i _tmp04 = __lsx_vld(tmp[m][4], 0);
                    __m128i _tmp05 = __lsx_vld(tmp[m][5], 0);

                    __m128i _tmp02a = __lsx_vadd_w(_tmp01, _tmp02);
                    __m128i _tmp13a = __lsx_vsub_w(_tmp01, _tmp02);

                    __m128i _tmp02b = __lsx_vadd_w(_tmp03, _tmp04);
                    __m128i _tmp13b = __lsx_vsub_w(_tmp03, _tmp04);

                    __m128i _out00 = __lsx_vadd_w(__lsx_vadd_w(_tmp00, _tmp02a), _tmp02b);
                    __m128i _out01 = __lsx_vadd_w(_tmp13a, __lsx_vslli_w(_tmp13b, 1));
                    __m128i _out02 = __lsx_vadd_w(_tmp02a, __lsx_vslli_w(_tmp02b, 2));
                    __m128i _out03 = __lsx_vadd_w(__lsx_vadd_w(_tmp05, _tmp13a), __lsx_vslli_w(_tmp13b, 3));

                    // TODO use integer trick for division by 576
                    __m128 _v576 = __lsx_vreplfr2vr_s(1.0 / 576);
                    _out00 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out00), _v576));
                    _out01 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out01), _v576));
                    _out02 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out02), _v576));
                    _out03 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out03), _v576));

                    __lsx_vst(_out00, output0, 0);
                    __lsx_vst(_out01, output0 + 4, 0);
                    __lsx_vst(_out02, output0 + 8, 0);
                    __lsx_vst(_out03, output0 + 12, 0);

                    output0 += outw * 4;
                }
            }
        }
    }
}


================================================
FILE: src/layer/loongarch/convolution_winograd_transform_pack8_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_winograd43_transform_input_pack8_int8_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt)
{
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int inch = bottom_blob.c;

    const int w_tiles = (w - 2) / 4;
    const int h_tiles = (h - 2) / 4;
    const int tiles = w_tiles * h_tiles;

    // const float itm[6][6] = {
    //     {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f},
    //     {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f},
    //     {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f},
    //     {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f},
    //     {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f},
    //     {0.0f, 4.0f,  0.0f,-5.0f, 0.0f, 1.0f}
    // };

    // 0 =  4 * r00 - 5 * r02 + r04
    // 1 = -4 * (r01 + r02) + r04 + r03
    // 2 =  4 * (r01 - r02) + r04 - r03
    // 3 = -2 * (r01 - r03) + r04 - r02
    // 4 =  2 * (r01 - r03) + r04 - r02
    // 5 =  4 * r01 - 5 * r03 + r05

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < inch; q++)
    {
        const Mat img0 = bottom_blob.channel(q);
        Mat img0_tm = bottom_blob_tm.channel(q);

        short tmp[6][6][8];

        // tile
        for (int i = 0; i < h_tiles; i++)
        {
            for (int j = 0; j < w_tiles; j++)
            {
                const signed char* r0 = img0.row<const signed char>(i * 4) + (j * 4) * 8;

                for (int m = 0; m < 6; m++)
                {
                    __m128i _r00_01 = __lsx_vld(r0, 0);
                    __m128i _r02_03 = __lsx_vld(r0 + 16, 0);
                    __m128i _r04_05 = __lsx_vld(r0 + 32, 0);
                    __m128i _extr0001 = __lsx_vslti_b(_r00_01, 0);
                    __m128i _extr0203 = __lsx_vslti_b(_r02_03, 0);
                    __m128i _extr0405 = __lsx_vslti_b(_r04_05, 0);
                    __m128i _r00 = __lsx_vilvl_b(_extr0001, _r00_01);
                    __m128i _r01 = __lsx_vilvh_b(_extr0001, _r00_01);
                    __m128i _r02 = __lsx_vilvl_b(_extr0203, _r02_03);
                    __m128i _r03 = __lsx_vilvh_b(_extr0203, _r02_03);
                    __m128i _r04 = __lsx_vilvl_b(_extr0405, _r04_05);
                    __m128i _r05 = __lsx_vilvh_b(_extr0405, _r04_05);

                    __m128i _v5 = __lsx_vreplgr2vr_h(5);

                    __m128i _tmp0m = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_r00, 2), _r04), __lsx_vmul_h(_r02, _v5));
                    __m128i _tmp1m = __lsx_vsub_h(__lsx_vadd_h(_r04, _r03), __lsx_vslli_h(__lsx_vadd_h(_r01, _r02), 2));
                    __m128i _tmp2m = __lsx_vadd_h(__lsx_vsub_h(_r04, _r03), __lsx_vslli_h(__lsx_vsub_h(_r01, _r02), 2));
                    __m128i _tmp3m = __lsx_vsub_h(__lsx_vsub_h(_r04, _r02), __lsx_vslli_h(__lsx_vsub_h(_r01, _r03), 1));
                    __m128i _tmp4m = __lsx_vadd_h(__lsx_vsub_h(_r04, _r02), __lsx_vslli_h(__lsx_vsub_h(_r01, _r03), 1));
                    __m128i _tmp5m = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_r01, 2), _r05), __lsx_vmul_h(_r03, _v5));

                    __lsx_vst(_tmp0m, tmp[0][m], 0);
                    __lsx_vst(_tmp1m, tmp[1][m], 0);
                    __lsx_vst(_tmp2m, tmp[2][m], 0);
                    __lsx_vst(_tmp3m, tmp[3][m], 0);
                    __lsx_vst(_tmp4m, tmp[4][m], 0);
                    __lsx_vst(_tmp5m, tmp[5][m], 0);

                    r0 += w * 8;
                }

                short* r0_tm_0 = (short*)img0_tm + (i * w_tiles + j) * 8;
                short* r0_tm_1 = r0_tm_0 + tiles * 8;
                short* r0_tm_2 = r0_tm_0 + tiles * 16;
                short* r0_tm_3 = r0_tm_0 + tiles * 24;
                short* r0_tm_4 = r0_tm_0 + tiles * 32;
                short* r0_tm_5 = r0_tm_0 + tiles * 40;

                for (int m = 0; m < 6; m++)
                {
                    __m128i _tmp00 = __lsx_vld(tmp[m][0], 0);
                    __m128i _tmp01 = __lsx_vld(tmp[m][1], 0);
                    __m128i _tmp02 = __lsx_vld(tmp[m][2], 0);
                    __m128i _tmp03 = __lsx_vld(tmp[m][3], 0);
                    __m128i _tmp04 = __lsx_vld(tmp[m][4], 0);
                    __m128i _tmp05 = __lsx_vld(tmp[m][5], 0);

                    __m128i _v5 = __lsx_vreplgr2vr_h(5);

                    __m128i _r0tm0 = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_tmp00, 2), _tmp04), __lsx_vmul_h(_tmp02, _v5));
                    __m128i _r0tm1 = __lsx_vsub_h(__lsx_vadd_h(_tmp04, _tmp03), __lsx_vslli_h(__lsx_vadd_h(_tmp01, _tmp02), 2));
                    __m128i _r0tm2 = __lsx_vadd_h(__lsx_vsub_h(_tmp04, _tmp03), __lsx_vslli_h(__lsx_vsub_h(_tmp01, _tmp02), 2));
                    __m128i _r0tm3 = __lsx_vsub_h(__lsx_vsub_h(_tmp04, _tmp02), __lsx_vslli_h(__lsx_vsub_h(_tmp01, _tmp03), 1));
                    __m128i _r0tm4 = __lsx_vadd_h(__lsx_vsub_h(_tmp04, _tmp02), __lsx_vslli_h(__lsx_vsub_h(_tmp01, _tmp03), 1));
                    __m128i _r0tm5 = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_tmp01, 2), _tmp05), __lsx_vmul_h(_tmp03, _v5));

                    __lsx_vst(_r0tm0, r0_tm_0, 0);
                    __lsx_vst(_r0tm1, r0_tm_1, 0);
                    __lsx_vst(_r0tm2, r0_tm_2, 0);
                    __lsx_vst(_r0tm3, r0_tm_3, 0);
                    __lsx_vst(_r0tm4, r0_tm_4, 0);
                    __lsx_vst(_r0tm5, r0_tm_5, 0);

                    r0_tm_0 += tiles * 48;
                    r0_tm_1 += tiles * 48;
                    r0_tm_2 += tiles * 48;
                    r0_tm_3 += tiles * 48;
                    r0_tm_4 += tiles * 48;
                    r0_tm_5 += tiles * 48;
                }
            }
        }
    }
}


================================================
FILE: src/layer/loongarch/convolutiondepthwise_3x3.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void convdw3x3s1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    // Scalar 3x3 depthwise convolution with stride 1.
    //
    // Channel g of bottom_blob is filtered with the nine weights at
    // _kernel[g * 9 ..] and written to channel g of top_blob. _bias[g] is the
    // initial value of every sum when _bias holds data, otherwise 0.
    // Output rows are produced two at a time while at least two remain,
    // then a single-row loop handles the leftover row.

    const int w = bottom_blob.w;

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    const int group = bottom_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);
        const float* img0 = bottom_blob.channel(g);

        const float bias0 = bias ? bias[g] : 0.f;

        // the three kernel rows of this channel
        const float* k0 = kernel + g * 9;
        const float* k1 = k0 + 3;
        const float* k2 = k0 + 6;

        // four consecutive input rows; r3 is only read by the two-row loop
        const float* r0 = img0;
        const float* r1 = img0 + w;
        const float* r2 = img0 + w * 2;
        const float* r3 = img0 + w * 3;

        float* outptr0 = out;
        float* outptr1 = outptr0 + outw;

        int i = 0;

        // two output rows per iteration: row i reads r0..r2, row i + 1 reads r1..r3
        for (; i + 1 < outh; i += 2)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = bias0;
                sum += r0[j] * k0[0];
                sum += r0[j + 1] * k0[1];
                sum += r0[j + 2] * k0[2];
                sum += r1[j] * k1[0];
                sum += r1[j + 1] * k1[1];
                sum += r1[j + 2] * k1[2];
                sum += r2[j] * k2[0];
                sum += r2[j + 1] * k2[1];
                sum += r2[j + 2] * k2[2];

                float sum2 = bias0;
                sum2 += r1[j] * k0[0];
                sum2 += r1[j + 1] * k0[1];
                sum2 += r1[j + 2] * k0[2];
                sum2 += r2[j] * k1[0];
                sum2 += r2[j + 1] * k1[1];
                sum2 += r2[j + 2] * k1[2];
                sum2 += r3[j] * k2[0];
                sum2 += r3[j + 1] * k2[1];
                sum2 += r3[j + 2] * k2[2];

                outptr0[j] = sum;
                outptr1[j] = sum2;
            }

            // outw columns consumed, then 2 more columns and one whole input row
            const int in_advance = outw + 2 + w;
            r0 += in_advance;
            r1 += in_advance;
            r2 += in_advance;
            r3 += in_advance;

            // jump over the row just written plus the row written by the other pointer
            outptr0 += outw + outw;
            outptr1 += outw + outw;
        }

        // leftover output row
        for (; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = bias0;
                sum += r0[j] * k0[0];
                sum += r0[j + 1] * k0[1];
                sum += r0[j + 2] * k0[2];
                sum += r1[j] * k1[0];
                sum += r1[j + 1] * k1[1];
                sum += r1[j + 2] * k1[2];
                sum += r2[j] * k2[0];
                sum += r2[j + 1] * k2[1];
                sum += r2[j + 2] * k2[2];

                outptr0[j] = sum;
            }

            r0 += outw + 2;
            r1 += outw + 2;
            r2 += outw + 2;
            outptr0 += outw;
        }
    }
}

static void convdw3x3s2_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    // Scalar 3x3 depthwise convolution with stride 2.
    //
    // Channel g of bottom_blob is filtered with the nine weights at
    // _kernel[g * 9 ..] and written to channel g of top_blob. _bias[g] is the
    // initial value of every sum when _bias holds data, otherwise 0.
    // The input window moves two columns per output column.

    const int w = bottom_blob.w;

    const int outw = top_blob.w;
    const int outh = top_blob.h;

    const int group = bottom_blob.c;

    // input floats to skip after the outw windows of one output row
    const int tailstep = w - 2 * outw + w;

    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);
        const float* img0 = bottom_blob.channel(g);

        const float bias0 = bias ? bias[g] : 0.f;

        // the three kernel rows of this channel
        const float* k0 = kernel + g * 9;
        const float* k1 = k0 + 3;
        const float* k2 = k0 + 6;

        // three consecutive input rows feeding the current output row
        const float* r0 = img0;
        const float* r1 = img0 + w;
        const float* r2 = img0 + w * 2;

        float* outptr = out;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                // top-left corner of the 3x3 window for output column j
                const float* p0 = r0 + j * 2;
                const float* p1 = r1 + j * 2;
                const float* p2 = r2 + j * 2;

                float sum = bias0;
                sum += p0[0] * k0[0];
                sum += p0[1] * k0[1];
                sum += p0[2] * k0[2];
                sum += p1[0] * k1[0];
                sum += p1[1] * k1[1];
                sum += p1[2] * k1[2];
                sum += p2[0] * k2[0];
                sum += p2[1] * k2[1];
                sum += p2[2] * k2[2];

                outptr[j] = sum;
            }

            const int in_advance = outw * 2 + tailstep;
            r0 += in_advance;
            r1 += in_advance;
            r2 += in_advance;
            outptr += outw;
        }
    }
}


================================================
FILE: src/layer/loongarch/convolutiondepthwise_3x3_pack4.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// Depthwise 3x3 convolution, stride 1, on pack4 data (4 floats per pixel),
// implemented with LSX 128-bit vectors.
//
// bottom_blob  input; channel g is the image of group g, w pixels per row
// top_blob     output; outw/outh are read from it, this function does not allocate it
// kernel       row g holds the 9 taps of group g, 4 floats per tap (36 floats are loaded)
// _bias        4 floats per group; if its float pointer is null, zero is used instead
// opt          only opt.num_threads is used (OpenMP thread count)
//
// NOTE(review): the end-of-row pointer adjustments below (2 * 4 floats) line up
// with the next input row only when w == outw + 2, i.e. the caller passes an
// already padded input - confirm at the call site.
static void convdw3x3s1_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int group = bottom_blob.c;

    const float* bias = _bias;

    // each group is independent, so groups are distributed over the OpenMP threads
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        // accumulators start from the bias of this group, or from zero when there is no bias
        __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0);

        const float* k0 = kernel.row(g);

        // outptr1 is only dereferenced inside the two-row loop below
        float* outptr0 = out.row(0);
        float* outptr1 = out.row(1);

        const Mat img0 = bottom_blob.channel(g);

        // r3 is only dereferenced inside the two-row loop below
        const float* r0 = img0.row(0);
        const float* r1 = img0.row(1);
        const float* r2 = img0.row(2);
        const float* r3 = img0.row(3);

        // the nine taps, named _k<row><col>; they stay fixed for the whole group
        __m128 _k00 = (__m128)__lsx_vld(k0, 0);
        __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0);
        __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
        __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
        __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
        __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0);
        __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0);
        __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0);
        __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0);

        int i = 0;
        // two output rows per pass: input rows r0..r2 feed output row i (outptr0),
        // input rows r1..r3 feed output row i + 1 (outptr1)
        for (; i + 1 < outh; i += 2)
        {
            int j = 0;
            // two output columns per iteration: a 2x2 output tile from a 4x4 input window;
            // _sum<row><col> is the accumulator of one tile element
            for (; j + 1 < outw; j += 2)
            {
                __builtin_prefetch(r0 + 32);
                __builtin_prefetch(r1 + 32);
                __builtin_prefetch(r2 + 32);
                __builtin_prefetch(r3 + 32);

                __m128 _sum00 = _bias0;
                __m128 _sum01 = _bias0;
                __m128 _sum10 = _bias0;
                __m128 _sum11 = _bias0;

                // input row 0: kernel row 0 of the upper output row only
                __m128 _r00 = (__m128)__lsx_vld(r0, 0);
                __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0);
                __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0);
                __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0);

                _sum00 = __lsx_vfmadd_s(_r00, _k00, _sum00);
                _sum00 = __lsx_vfmadd_s(_r01, _k01, _sum00);
                _sum00 = __lsx_vfmadd_s(_r02, _k02, _sum00);
                _sum01 = __lsx_vfmadd_s(_r01, _k00, _sum01);
                _sum01 = __lsx_vfmadd_s(_r02, _k01, _sum01);
                _sum01 = __lsx_vfmadd_s(_r03, _k02, _sum01);

                // input row 1: kernel row 1 of the upper output row, kernel row 0 of the lower one
                __m128 _r10 = (__m128)__lsx_vld(r1, 0);
                __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0);
                __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0);
                __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0);

                _sum00 = __lsx_vfmadd_s(_r10, _k10, _sum00);
                _sum00 = __lsx_vfmadd_s(_r11, _k11, _sum00);
                _sum00 = __lsx_vfmadd_s(_r12, _k12, _sum00);
                _sum01 = __lsx_vfmadd_s(_r11, _k10, _sum01);
                _sum01 = __lsx_vfmadd_s(_r12, _k11, _sum01);
                _sum01 = __lsx_vfmadd_s(_r13, _k12, _sum01);
                _sum10 = __lsx_vfmadd_s(_r10, _k00, _sum10);
                _sum10 = __lsx_vfmadd_s(_r11, _k01, _sum10);
                _sum10 = __lsx_vfmadd_s(_r12, _k02, _sum10);
                _sum11 = __lsx_vfmadd_s(_r11, _k00, _sum11);
                _sum11 = __lsx_vfmadd_s(_r12, _k01, _sum11);
                _sum11 = __lsx_vfmadd_s(_r13, _k02, _sum11);

                // input row 2: kernel row 2 of the upper output row, kernel row 1 of the lower one
                __m128 _r20 = (__m128)__lsx_vld(r2, 0);
                __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0);
                __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0);
                __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0);

                _sum00 = __lsx_vfmadd_s(_r20, _k20, _sum00);
                _sum00 = __lsx_vfmadd_s(_r21, _k21, _sum00);
                _sum00 = __lsx_vfmadd_s(_r22, _k22, _sum00);
                _sum01 = __lsx_vfmadd_s(_r21, _k20, _sum01);
                _sum01 = __lsx_vfmadd_s(_r22, _k21, _sum01);
                _sum01 = __lsx_vfmadd_s(_r23, _k22, _sum01);
                _sum10 = __lsx_vfmadd_s(_r20, _k10, _sum10);
                _sum10 = __lsx_vfmadd_s(_r21, _k11, _sum10);
                _sum10 = __lsx_vfmadd_s(_r22, _k12, _sum10);
                _sum11 = __lsx_vfmadd_s(_r21, _k10, _sum11);
                _sum11 = __lsx_vfmadd_s(_r22, _k11, _sum11);
                _sum11 = __lsx_vfmadd_s(_r23, _k12, _sum11);

                // input row 3: kernel row 2 of the lower output row only
                __m128 _r30 = (__m128)__lsx_vld(r3, 0);
                __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0);
                __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0);
                __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0);

                _sum10 = __lsx_vfmadd_s(_r30, _k20, _sum10);
                _sum10 = __lsx_vfmadd_s(_r31, _k21, _sum10);
                _sum10 = __lsx_vfmadd_s(_r32, _k22, _sum10);
                _sum11 = __lsx_vfmadd_s(_r31, _k20, _sum11);
                _sum11 = __lsx_vfmadd_s(_r32, _k21, _sum11);
                _sum11 = __lsx_vfmadd_s(_r33, _k22, _sum11);

                __lsx_vst(_sum00, outptr0, 0);
                __lsx_vst(_sum01, outptr0 + 4, 0);
                __lsx_vst(_sum10, outptr1, 0);
                __lsx_vst(_sum11, outptr1 + 4, 0);

                // two pixels (2 * 4 floats) were produced per output row
                outptr0 += 4 * 2;
                outptr1 += 4 * 2;

                // stride 1: the input window moves by the same two pixels
                r0 += 4 * 2;
                r1 += 4 * 2;
                r2 += 4 * 2;
                r3 += 4 * 2;
            }
            // leftover column when outw is odd: one pixel in each of the two output rows
            for (; j < outw; j++)
            {
                __builtin_prefetch(r0 + 16);
                __builtin_prefetch(r1 + 16);
                __builtin_prefetch(r2 + 16);
                __builtin_prefetch(r3 + 16);

                // _sum0 belongs to output row i, _sum1 to output row i + 1
                __m128 _sum0 = _bias0;
                __m128 _sum1 = _bias0;

                __m128 _r00 = (__m128)__lsx_vld(r0, 0);
                __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0);
                __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0);

                _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0);
                _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0);
                _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0);

                __m128 _r10 = (__m128)__lsx_vld(r1, 0);
                __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0);
                __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0);

                _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0);
                _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0);
                _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0);
                _sum1 = __lsx_vfmadd_s(_r10, _k00, _sum1);
                _sum1 = __lsx_vfmadd_s(_r11, _k01, _sum1);
                _sum1 = __lsx_vfmadd_s(_r12, _k02, _sum1);

                __m128 _r20 = (__m128)__lsx_vld(r2, 0);
                __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0);
                __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0);

                _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0);
                _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0);
                _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0);
                _sum1 = __lsx_vfmadd_s(_r20, _k10, _sum1);
                _sum1 = __lsx_vfmadd_s(_r21, _k11, _sum1);
                _sum1 = __lsx_vfmadd_s(_r22, _k12, _sum1);

                __m128 _r30 = (__m128)__lsx_vld(r3, 0);
                __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0);
                __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0);

                _sum1 = __lsx_vfmadd_s(_r30, _k20, _sum1);
                _sum1 = __lsx_vfmadd_s(_r31, _k21, _sum1);
                _sum1 = __lsx_vfmadd_s(_r32, _k22, _sum1);

                __lsx_vst(_sum0, outptr0, 0);
                __lsx_vst(_sum1, outptr1, 0);

                outptr0 += 4;
                outptr1 += 4;

                r0 += 4;
                r1 += 4;
                r2 += 4;
                r3 += 4;
            }

            // the column loops advanced outw pixels; add 2 more pixels plus one full
            // input row (w * 4 floats) because this pass consumed two output rows
            // NOTE(review): lands on a row start only if w == outw + 2 - confirm
            r0 += 2 * 4 + w * 4;
            r1 += 2 * 4 + w * 4;
            r2 += 2 * 4 + w * 4;
            r3 += 2 * 4 + w * 4;

            // each output pointer skips the row the other one has just written
            outptr0 += outw * 4;
            outptr1 += outw * 4;
        }
        // last output row when outh is odd: only r0..r2 and outptr0 are used
        for (; i < outh; i++)
        {
            int j = 0;
            // two output columns per iteration
            for (; j + 1 < outw; j += 2)
            {
                __builtin_prefetch(r0 + 32);
                __builtin_prefetch(r1 + 32);
                __builtin_prefetch(r2 + 32);

                __m128 _sum00 = _bias0;
                __m128 _sum01 = _bias0;

                __m128 _r00 = (__m128)__lsx_vld(r0, 0);
                __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0);
                __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0);
                __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0);

                _sum00 = __lsx_vfmadd_s(_r00, _k00, _sum00);
                _sum00 = __lsx_vfmadd_s(_r01, _k01, _sum00);
                _sum00 = __lsx_vfmadd_s(_r02, _k02, _sum00);
                _sum01 = __lsx_vfmadd_s(_r01, _k00, _sum01);
                _sum01 = __lsx_vfmadd_s(_r02, _k01, _sum01);
                _sum01 = __lsx_vfmadd_s(_r03, _k02, _sum01);

                __m128 _r10 = (__m128)__lsx_vld(r1, 0);
                __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0);
                __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0);
                __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0);

                _sum00 = __lsx_vfmadd_s(_r10, _k10, _sum00);
                _sum00 = __lsx_vfmadd_s(_r11, _k11, _sum00);
                _sum00 = __lsx_vfmadd_s(_r12, _k12, _sum00);
                _sum01 = __lsx_vfmadd_s(_r11, _k10, _sum01);
                _sum01 = __lsx_vfmadd_s(_r12, _k11, _sum01);
                _sum01 = __lsx_vfmadd_s(_r13, _k12, _sum01);

                __m128 _r20 = (__m128)__lsx_vld(r2, 0);
                __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0);
                __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0);
                __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0);

                _sum00 = __lsx_vfmadd_s(_r20, _k20, _sum00);
                _sum00 = __lsx_vfmadd_s(_r21, _k21, _sum00);
                _sum00 = __lsx_vfmadd_s(_r22, _k22, _sum00);
                _sum01 = __lsx_vfmadd_s(_r21, _k20, _sum01);
                _sum01 = __lsx_vfmadd_s(_r22, _k21, _sum01);
                _sum01 = __lsx_vfmadd_s(_r23, _k22, _sum01);

                __lsx_vst(_sum00, outptr0, 0);
                __lsx_vst(_sum01, outptr0 + 4, 0);

                outptr0 += 4 * 2;

                r0 += 4 * 2;
                r1 += 4 * 2;
                r2 += 4 * 2;
            }
            // leftover column when outw is odd
            for (; j < outw; j++)
            {
                __builtin_prefetch(r0 + 16);
                __builtin_prefetch(r1 + 16);
                __builtin_prefetch(r2 + 16);

                __m128 _sum0 = _bias0;

                __m128 _r00 = (__m128)__lsx_vld(r0, 0);
                __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0);
                __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0);

                _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0);
                _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0);
                _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0);

                __m128 _r10 = (__m128)__lsx_vld(r1, 0);
                __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0);
                __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0);

                _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0);
                _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0);
                _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0);

                __m128 _r20 = (__m128)__lsx_vld(r2, 0);
                __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0);
                __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0);

                _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0);
                _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0);
                _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0);

                __lsx_vst(_sum0, outptr0, 0);

                outptr0 += 4;

                r0 += 4;
                r1 += 4;
                r2 += 4;
            }

            // skip the 2 pixels not covered by the column loops
            // NOTE(review): reaches the next row start only if w == outw + 2 - confirm
            r0 += 2 * 4;
            r1 += 2 * 4;
            r2 += 2 * 4;
        }
    }
}

static void convdw3x3s2_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int group = bottom_blob.c;

    const int tailstep = (w - 2 * outw + w) * 4;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0);

        const float* k0 = kernel.row(g);

        float* outptr0 = out;

        const Mat img0 = bottom_blob.channel(g);

        const float* r0 = img0.row(0);
        const float* r1 = img0.row(1);
        const float* r2 = img0.row(2);

        __m128 _k00 = (__m128)__lsx_vld(k0, 0);
        __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0);
        __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
        __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
        __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
        __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0);
        __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0);
        __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0);
        __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0);

        int i = 0;
        for (; i < outh; i++)
        {
            int j = 0;
            for (; j + 1 < outw; j += 2)
            {
                __builtin_prefetch(r0 + 64);
                __builtin_prefetch(r1 + 64);
                __builtin_prefetch(r2 + 64);

                __m128 _sum00 = _bias0;
                __m128 _sum01 = _bias0;

                __m128 _r00 = (__m128)__lsx_vld(r0, 0);
                __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0);
                __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0);
                __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0);
                __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0);

                _sum00 = __lsx_vfmadd_s(_r00, _k00, _sum00);
                _sum00 = __lsx_vfmadd_s(_r01, _k01, _sum00);
                _sum00 = __lsx_vfmadd_s(_r02, _k02, _sum00);
                _sum01 = __lsx_vfmadd_s(_r02, _k00, _sum01);
                _sum01 = __lsx_vfmadd_s(_r03, _k01, _sum01);
                _sum01 = __lsx_vfmadd_s(_r04, _k02, _sum01);

                __m128 _r10 = (__m128)__lsx_vld(r1, 0);
                __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0);
                __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0);
                __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0);
                __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0);

                _sum00 = __lsx_vfmadd_s(_r10, _k10, _sum00);
                _sum00 = __lsx_vfmadd_s(_r11, _k11, _sum00);
                _sum00 = __lsx_vfmadd_s(_r12, _k12, _sum00);
                _sum01 = __lsx_vfmadd_s(_r12, _k10, _sum01);
                _sum01 = __lsx_vfmadd_s(_r13, _k11, _sum01);
                _sum01 = __lsx_vfmadd_s(_r14, _k12, _sum01);

                __m128 _r20 = (__m128)__lsx_vld(r2, 0);
                __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0);
                __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0);
                __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0);
                __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0);

                _sum00 = __lsx_vfmadd_s(_r20, _k20, _sum00);
                _sum00 = __lsx_vfmadd_s(_r21, _k21, _sum00);
                _sum00 = __lsx_vfmadd_s(_r22, _k22, _sum00);
                _sum01 = __lsx_vfmadd_s(_r22, _k20, _sum01);
                _sum01 = __lsx_vfmadd_s(_r23, _k21, _sum01);
                _sum01 = __lsx_vfmadd_s(_r24, _k22, _sum01);

                __lsx_vst(_sum00, outptr0, 0);
                __lsx_vst(_sum01, outptr0 + 4, 0);

                outptr0 += 4 * 2;

                r0 += 4 * 4;
                r1 += 4 * 4;
                r2 += 4 * 4;
            }
            for (; j < outw; j++)
            {
                __builtin_prefetch(r0 + 32);
                __builtin_prefetch(r1 + 32);
                __builtin_prefetch(r2 + 32);

                __m128 _sum0 = _bias0;

                __m128 _r00 = (__m128)__lsx_vld(r0, 0);
                __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0);
                __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0);

                _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0);
                _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0);
                _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0);

                __m128 _r10 = (__m128)__lsx_vld(r1, 0);
                __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0);
                __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0);

                _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0);
                _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0);
                _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0);

                __m128 _r20 = (__m128)__lsx_vld(r2, 0);
                __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0);
                __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0);

                _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0);
                _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0);
                _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0);

                __lsx_vst(_sum0, outptr0, 0);

                outptr0 += 4;

                r0 += 4 * 2;
                r1 += 4 * 2;
                r2 += 4 * 2;
            }

            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
        }
    }
}


================================================
FILE: src/layer/loongarch/convolutiondepthwise_5x5_pack4.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// Depthwise 5x5 convolution, stride 1, on pack4 data (4 floats per pixel),
// implemented with LSX 128-bit vectors.
//
// bottom_blob  input; channel g is the image of group g, w pixels per row
// top_blob     output; outw/outh are read from it, this function does not allocate it
// kernel       row g holds the 25 taps of group g, 4 floats per tap (100 floats are loaded)
// _bias        4 floats per group; if its float pointer is null, zero is used instead
// opt          only opt.num_threads is used (OpenMP thread count)
//
// Unlike the 3x3 kernels, the taps are reloaded from memory for every output
// pixel: k0 is walked forward one kernel row at a time and rewound at the end.
//
// NOTE(review): the end-of-row pointer adjustments below (4 * 4 floats) line up
// with the next input row only when w == outw + 4, i.e. the caller passes an
// already padded input - confirm at the call site.
static void convdw5x5s1_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int group = bottom_blob.c;

    const float* bias = _bias;

    // each group is independent, so groups are distributed over the OpenMP threads
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        // accumulators start from the bias of this group, or from zero when there is no bias
        __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0);

        const float* k0 = kernel.row(g);

        // outptr1 is only dereferenced inside the two-row loop below
        float* outptr0 = out.row(0);
        float* outptr1 = out.row(1);

        const Mat img0 = bottom_blob.channel(g);

        // r5 is only dereferenced inside the two-row loop below
        const float* r0 = img0.row(0);
        const float* r1 = img0.row(1);
        const float* r2 = img0.row(2);
        const float* r3 = img0.row(3);
        const float* r4 = img0.row(4);
        const float* r5 = img0.row(5);

        int i = 0;
        // two output rows per pass: input rows r0..r4 feed output row i (_sum0, outptr0),
        // input rows r1..r5 feed output row i + 1 (_sum1, outptr1)
        for (; i + 1 < outh; i += 2)
        {
            int j = 0;
            for (; j < outw; j++)
            {
                __builtin_prefetch(r0 + 16);
                __builtin_prefetch(r1 + 16);
                __builtin_prefetch(r2 + 16);
                __builtin_prefetch(r3 + 16);
                __builtin_prefetch(r4 + 16);
                __builtin_prefetch(r5 + 16);

                // prefetch hint only; k0 + 400 lies beyond the 100 floats loaded for this group
                // NOTE(review): intent of the 400-float distance is not visible here - confirm
                __builtin_prefetch(k0 + 400);

                __m128 _sum0 = _bias0;
                __m128 _sum1 = _bias0;

                __m128 _r00 = (__m128)__lsx_vld(r0, 0);
                __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0);
                __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0);
                __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0);
                __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0);

                // kernel row 0; k0 then moves on to kernel row 1
                __m128 _k00 = (__m128)__lsx_vld(k0, 0);
                __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0);
                __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
                __m128 _k03 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
                __m128 _k04 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0);
                _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0);
                _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0);
                _sum0 = __lsx_vfmadd_s(_r03, _k03, _sum0);
                _sum0 = __lsx_vfmadd_s(_r04, _k04, _sum0);

                __m128 _r10 = (__m128)__lsx_vld(r1, 0);
                __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0);
                __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0);
                __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0);
                __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0);

                // input row 1 is kernel row 0 for the lower output row ...
                _sum1 = __lsx_vfmadd_s(_r10, _k00, _sum1);
                _sum1 = __lsx_vfmadd_s(_r11, _k01, _sum1);
                _sum1 = __lsx_vfmadd_s(_r12, _k02, _sum1);
                _sum1 = __lsx_vfmadd_s(_r13, _k03, _sum1);
                _sum1 = __lsx_vfmadd_s(_r14, _k04, _sum1);

                __m128 _k10 = (__m128)__lsx_vld(k0, 0);
                __m128 _k11 = (__m128)__lsx_vld(k0 + 4, 0);
                __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
                __m128 _k13 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
                __m128 _k14 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                // ... and kernel row 1 for the upper output row; the same pattern repeats below
                _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0);
                _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0);
                _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0);
                _sum0 = __lsx_vfmadd_s(_r13, _k13, _sum0);
                _sum0 = __lsx_vfmadd_s(_r14, _k14, _sum0);

                __m128 _r20 = (__m128)__lsx_vld(r2, 0);
                __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0);
                __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0);
                __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0);
                __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0);

                _sum1 = __lsx_vfmadd_s(_r20, _k10, _sum1);
                _sum1 = __lsx_vfmadd_s(_r21, _k11, _sum1);
                _sum1 = __lsx_vfmadd_s(_r22, _k12, _sum1);
                _sum1 = __lsx_vfmadd_s(_r23, _k13, _sum1);
                _sum1 = __lsx_vfmadd_s(_r24, _k14, _sum1);

                __m128 _k20 = (__m128)__lsx_vld(k0, 0);
                __m128 _k21 = (__m128)__lsx_vld(k0 + 4, 0);
                __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
                __m128 _k23 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
                __m128 _k24 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0);
                _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0);
                _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0);
                _sum0 = __lsx_vfmadd_s(_r23, _k23, _sum0);
                _sum0 = __lsx_vfmadd_s(_r24, _k24, _sum0);

                __m128 _r30 = (__m128)__lsx_vld(r3, 0);
                __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0);
                __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0);
                __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0);
                __m128 _r34 = (__m128)__lsx_vld(r3 + 4 * 4, 0);

                _sum1 = __lsx_vfmadd_s(_r30, _k20, _sum1);
                _sum1 = __lsx_vfmadd_s(_r31, _k21, _sum1);
                _sum1 = __lsx_vfmadd_s(_r32, _k22, _sum1);
                _sum1 = __lsx_vfmadd_s(_r33, _k23, _sum1);
                _sum1 = __lsx_vfmadd_s(_r34, _k24, _sum1);

                __m128 _k30 = (__m128)__lsx_vld(k0, 0);
                __m128 _k31 = (__m128)__lsx_vld(k0 + 4, 0);
                __m128 _k32 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
                __m128 _k33 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
                __m128 _k34 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __lsx_vfmadd_s(_r30, _k30, _sum0);
                _sum0 = __lsx_vfmadd_s(_r31, _k31, _sum0);
                _sum0 = __lsx_vfmadd_s(_r32, _k32, _sum0);
                _sum0 = __lsx_vfmadd_s(_r33, _k33, _sum0);
                _sum0 = __lsx_vfmadd_s(_r34, _k34, _sum0);

                __m128 _r40 = (__m128)__lsx_vld(r4, 0);
                __m128 _r41 = (__m128)__lsx_vld(r4 + 4, 0);
                __m128 _r42 = (__m128)__lsx_vld(r4 + 4 * 2, 0);
                __m128 _r43 = (__m128)__lsx_vld(r4 + 4 * 3, 0);
                __m128 _r44 = (__m128)__lsx_vld(r4 + 4 * 4, 0);

                _sum1 = __lsx_vfmadd_s(_r40, _k30, _sum1);
                _sum1 = __lsx_vfmadd_s(_r41, _k31, _sum1);
                _sum1 = __lsx_vfmadd_s(_r42, _k32, _sum1);
                _sum1 = __lsx_vfmadd_s(_r43, _k33, _sum1);
                _sum1 = __lsx_vfmadd_s(_r44, _k34, _sum1);

                // last kernel row; k0 is rewound by the four 4 * 5 advances made above
                // so that it points at kernel row 0 again for the next output pixel
                __m128 _k40 = (__m128)__lsx_vld(k0, 0);
                __m128 _k41 = (__m128)__lsx_vld(k0 + 4, 0);
                __m128 _k42 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
                __m128 _k43 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
                __m128 _k44 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
                k0 -= 4 * 20;

                _sum0 = __lsx_vfmadd_s(_r40, _k40, _sum0);
                _sum0 = __lsx_vfmadd_s(_r41, _k41, _sum0);
                _sum0 = __lsx_vfmadd_s(_r42, _k42, _sum0);
                _sum0 = __lsx_vfmadd_s(_r43, _k43, _sum0);
                _sum0 = __lsx_vfmadd_s(_r44, _k44, _sum0);

                // input row 5 only contributes to the lower output row
                __m128 _r50 = (__m128)__lsx_vld(r5, 0);
                __m128 _r51 = (__m128)__lsx_vld(r5 + 4, 0);
                __m128 _r52 = (__m128)__lsx_vld(r5 + 4 * 2, 0);
                __m128 _r53 = (__m128)__lsx_vld(r5 + 4 * 3, 0);
                __m128 _r54 = (__m128)__lsx_vld(r5 + 4 * 4, 0);

                _sum1 = __lsx_vfmadd_s(_r50, _k40, _sum1);
                _sum1 = __lsx_vfmadd_s(_r51, _k41, _sum1);
                _sum1 = __lsx_vfmadd_s(_r52, _k42, _sum1);
                _sum1 = __lsx_vfmadd_s(_r53, _k43, _sum1);
                _sum1 = __lsx_vfmadd_s(_r54, _k44, _sum1);

                __lsx_vst(_sum0, outptr0, 0);
                __lsx_vst(_sum1, outptr1, 0);

                outptr0 += 4;
                outptr1 += 4;

                // stride 1: the input window moves by one pixel (4 floats)
                r0 += 4;
                r1 += 4;
                r2 += 4;
                r3 += 4;
                r4 += 4;
                r5 += 4;
            }

            // the column loop advanced outw pixels; add 4 more pixels plus one full
            // input row (w * 4 floats) because this pass consumed two output rows
            // NOTE(review): lands on a row start only if w == outw + 4 - confirm
            r0 += 4 * 4 + w * 4;
            r1 += 4 * 4 + w * 4;
            r2 += 4 * 4 + w * 4;
            r3 += 4 * 4 + w * 4;
            r4 += 4 * 4 + w * 4;
            r5 += 4 * 4 + w * 4;

            // each output pointer skips the row the other one has just written
            outptr0 += outw * 4;
            outptr1 += outw * 4;
        }
        // last output row when outh is odd: only r0..r4 and outptr0 are used
        for (; i < outh; i++)
        {
            int j = 0;
            for (; j < outw; j++)
            {
                __builtin_prefetch(r0 + 16);
                __builtin_prefetch(r1 + 16);
                __builtin_prefetch(r2 + 16);
                __builtin_prefetch(r3 + 16);
                __builtin_prefetch(r4 + 16);

                // prefetch hint only; k0 + 400 lies beyond the 100 floats loaded for this group
                __builtin_prefetch(k0 + 400);

                __m128 _sum0 = _bias0;

                __m128 _r00 = (__m128)__lsx_vld(r0, 0);
                __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0);
                __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0);
                __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0);
                __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0);

                // kernel row 0; k0 advances one kernel row (5 taps * 4 floats) after each load group
                __m128 _k00 = (__m128)__lsx_vld(k0, 0);
                __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0);
                __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
                __m128 _k03 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
                __m128 _k04 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0);
                _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0);
                _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0);
                _sum0 = __lsx_vfmadd_s(_r03, _k03, _sum0);
                _sum0 = __lsx_vfmadd_s(_r04, _k04, _sum0);

                __m128 _r10 = (__m128)__lsx_vld(r1, 0);
                __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0);
                __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0);
                __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0);
                __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0);

                __m128 _k10 = (__m128)__lsx_vld(k0, 0);
                __m128 _k11 = (__m128)__lsx_vld(k0 + 4, 0);
                __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
                __m128 _k13 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
                __m128 _k14 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0);
                _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0);
                _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0);
                _sum0 = __lsx_vfmadd_s(_r13, _k13, _sum0);
                _sum0 = __lsx_vfmadd_s(_r14, _k14, _sum0);

                __m128 _r20 = (__m128)__lsx_vld(r2, 0);
                __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0);
                __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0);
                __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0);
                __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0);

                __m128 _k20 = (__m128)__lsx_vld(k0, 0);
                __m128 _k21 = (__m128)__lsx_vld(k0 + 4, 0);
                __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
                __m128 _k23 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
                __m128 _k24 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0);
                _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0);
                _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0);
                _sum0 = __lsx_vfmadd_s(_r23, _k23, _sum0);
                _sum0 = __lsx_vfmadd_s(_r24, _k24, _sum0);

                __m128 _r30 = (__m128)__lsx_vld(r3, 0);
                __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0);
                __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0);
                __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0);
                __m128 _r34 = (__m128)__lsx_vld(r3 + 4 * 4, 0);

                __m128 _k30 = (__m128)__lsx_vld(k0, 0);
                __m128 _k31 = (__m128)__lsx_vld(k0 + 4, 0);
                __m128 _k32 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
                __m128 _k33 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
                __m128 _k34 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __lsx_vfmadd_s(_r30, _k30, _sum0);
                _sum0 = __lsx_vfmadd_s(_r31, _k31, _sum0);
                _sum0 = __lsx_vfmadd_s(_r32, _k32, _sum0);
                _sum0 = __lsx_vfmadd_s(_r33, _k33, _sum0);
                _sum0 = __lsx_vfmadd_s(_r34, _k34, _sum0);

                __m128 _r40 = (__m128)__lsx_vld(r4, 0);
                __m128 _r41 = (__m128)__lsx_vld(r4 + 4, 0);
                __m128 _r42 = (__m128)__lsx_vld(r4 + 4 * 2, 0);
                __m128 _r43 = (__m128)__lsx_vld(r4 + 4 * 3, 0);
                __m128 _r44 = (__m128)__lsx_vld(r4 + 4 * 4, 0);

                // last kernel row; rewind k0 to kernel row 0 for the next output pixel
                __m128 _k40 = (__m128)__lsx_vld(k0, 0);
                __m128 _k41 = (__m128)__lsx_vld(k0 + 4, 0);
                __m128 _k42 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
                __m128 _k43 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
                __m128 _k44 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
                k0 -= 4 * 20;

                _sum0 = __lsx_vfmadd_s(_r40, _k40, _sum0);
                _sum0 = __lsx_vfmadd_s(_r41, _k41, _sum0);
                _sum0 = __lsx_vfmadd_s(_r42, _k42, _sum0);
                _sum0 = __lsx_vfmadd_s(_r43, _k43, _sum0);
                _sum0 = __lsx_vfmadd_s(_r44, _k44, _sum0);

                __lsx_vst(_sum0, outptr0, 0);

                outptr0 += 4;

                r0 += 4;
                r1 += 4;
                r2 += 4;
                r3 += 4;
                r4 += 4;
            }

            // skip the 4 pixels not covered by the column loop
            // NOTE(review): reaches the next row start only if w == outw + 4 - confirm
            r0 += 4 * 4;
            r1 += 4 * 4;
            r2 += 4 * 4;
            r3 += 4 * 4;
            r4 += 4 * 4;
        }
    }
}

static void convdw5x5s2_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int group = bottom_blob.c;

    const int tailstep = (w - 2 * outw + w) * 4;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0);

        const float* k0 = kernel.row(g);

        float* outptr0 = out;

        const Mat img0 = bottom_blob.channel(g);

        const float* r0 = img0.row(0);
        const float* r1 = img0.row(1);
        const float* r2 = img0.row(2);
        const float* r3 = img0.row(3);
        const float* r4 = img0.row(4);

        int i = 0;
        for (; i < outh; i++)
        {
            int j = 0;
            for (; j < outw; j++)
            {
                __builtin_prefetch(r0 + 32);
                __builtin_prefetch(r1 + 32);
                __builtin_prefetch(r2 + 32);
                __builtin_prefetch(r3 + 32);
                __builtin_prefetch(r4 + 32);

                __builtin_prefetch(k0 + 400);

                __m128 _sum0 = _bias0;

                __m128 _r00 = (__m128)__lsx_vld(r0, 0);
                __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0);
                __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0);
                __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0);
                __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0);

                __m128 _k00 = (__m128)__lsx_vld(k0, 0);
                __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0);
                __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
                __m128 _k03 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
                __m128 _k04 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0);
                _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0);
                _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0);
                _sum0 = __lsx_vfmadd_s(_r03, _k03, _sum0);
                _sum0 = __lsx_vfmadd_s(_r04, _k04, _sum0);

                __m128 _r10 = (__m128)__lsx_vld(r1, 0);
                __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0);
                __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0);
                __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0);
                __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0);

                __m128 _k10 = (__m128)__lsx_vld(k0, 0);
                __m128 _k11 = (__m128)__lsx_vld(k0 + 4, 0);
                __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
                __m128 _k13 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
                __m128 _k14 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0);
                _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0);
                _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0);
                _sum0 = __lsx_vfmadd_s(_r13, _k13, _sum0);
                _sum0 = __lsx_vfmadd_s(_r14, _k14, _sum0);

                __m128 _r20 = (__m128)__lsx_vld(r2, 0);
                __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0);
                __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0);
                __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0);
                __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0);

                __m128 _k20 = (__m128)__lsx_vld(k0, 0);
                __m128 _k21 = (__m128)__lsx_vld(k0 + 4, 0);
                __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
                __m128 _k23 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
                __m128 _k24 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0);
                _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0);
                _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0);
                _sum0 = __lsx_vfmadd_s(_r23, _k23, _sum0);
                _sum0 = __lsx_vfmadd_s(_r24, _k24, _sum0);

                __m128 _r30 = (__m128)__lsx_vld(r3, 0);
                __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0);
                __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0);
                __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0);
                __m128 _r34 = (__m128)__lsx_vld(r3 + 4 * 4, 0);

                __m128 _k30 = (__m128)__lsx_vld(k0, 0);
                __m128 _k31 = (__m128)__lsx_vld(k0 + 4, 0);
                __m128 _k32 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
                __m128 _k33 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
                __m128 _k34 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __lsx_vfmadd_s(_r30, _k30, _sum0);
                _sum0 = __lsx_vfmadd_s(_r31, _k31, _sum0);
                _sum0 = __lsx_vfmadd_s(_r32, _k32, _sum0);
                _sum0 = __lsx_vfmadd_s(_r33, _k33, _sum0);
                _sum0 = __lsx_vfmadd_s(_r34, _k34, _sum0);

                __m128 _r40 = (__m128)__lsx_vld(r4, 0);
                __m128 _r41 = (__m128)__lsx_vld(r4 + 4, 0);
                __m128 _r42 = (__m128)__lsx_vld(r4 + 4 * 2, 0);
                __m128 _r43 = (__m128)__lsx_vld(r4 + 4 * 3, 0);
                __m128 _r44 = (__m128)__lsx_vld(r4 + 4 * 4, 0);

                __m128 _k40 = (__m128)__lsx_vld(k0, 0);
                __m128 _k41 = (__m128)__lsx_vld(k0 + 4, 0);
                __m128 _k42 = (__m128)__lsx_vld(k0 + 4 * 2, 0);
                __m128 _k43 = (__m128)__lsx_vld(k0 + 4 * 3, 0);
                __m128 _k44 = (__m128)__lsx_vld(k0 + 4 * 4, 0);
                k0 -= 4 * 20;

                _sum0 = __lsx_vfmadd_s(_r40, _k40, _sum0);
                _sum0 = __lsx_vfmadd_s(_r41, _k41, _sum0);
                _sum0 = __lsx_vfmadd_s(_r42, _k42, _sum0);
                _sum0 = __lsx_vfmadd_s(_r43, _k43, _sum0);
                _sum0 = __lsx_vfmadd_s(_r44, _k44, _sum0);

                __lsx_vst(_sum0, outptr0, 0);

                outptr0 += 4;

                r0 += 4 * 2;
                r1 += 4 * 2;
                r2 += 4 * 2;
                r3 += 4 * 2;
                r4 += 4 * 2;
            }

            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
            r3 += tailstep;
            r4 += tailstep;
        }
    }
}


================================================
FILE: src/layer/loongarch/convolutiondepthwise_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "convolutiondepthwise_loongarch.h"

#include "layer_type.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_activation.h"
#include "loongarch_usability.h"

namespace ncnn {

#include "convolutiondepthwise_3x3.h"

#if __loongarch_sx
#include "convolutiondepthwise_3x3_pack4.h"
#include "convolutiondepthwise_5x5_pack4.h"
#endif // __loongarch_sx

// Initialise the layer: no activation layer yet, and packed-layout support
// is advertised only when the LSX code paths are compiled in.
ConvolutionDepthWise_loongarch::ConvolutionDepthWise_loongarch()
{
    // create_pipeline() assigns the real activation layer later
    activation = 0;

#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

int ConvolutionDepthWise_loongarch::create_pipeline(const Option& opt)
{
    if (dynamic_weight)
        return 0;

    activation = create_activation_layer(activation_type, activation_params, opt);

#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return create_pipeline_int8_loongarch(opt);
    }
#endif

    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // depth-wise
    if (channels == group && group == num_output)
    {
        int elempack = 1;
#if __loongarch_sx
        if (opt.use_packing_layout)
        {
            elempack = channels % 4 == 0 ? 4 : 1;
        }
#endif

#if __loongarch_sx
        // pack4
        if (elempack == 4)
        {
            Mat weight_data_r2 = weight_data.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_tm, 4, opt);
        }
#endif // __loongarch_sx

        if (elempack == 1)
        {
            weight_data_tm = weight_data;
        }

        if (opt.lightmode)
            weight_data.release();

        return 0;
    }

    // group convolution
    create_group_ops(opt);

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int ConvolutionDepthWise_loongarch::create_group_ops(const Option& opt)
{
    // create Convolution op for each group
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    for (int i = 0; i < (int)group_ops.size(); i++)
        delete group_ops[i];

    group_ops.clear();

    const int channels_g = channels / group;
    const int num_output_g = num_output / group;

    group_ops.resize(group);

    for (int g = 0; g < group; g++)
    {
        Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone();
        Mat bias_data_g;
        if (bias_term)
            bias_data_g = bias_data.range(num_output_g * g, num_output_g);

        ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);

        // set param
        ncnn::ParamDict pd;
        pd.set(0, num_output_g); // num_output
        pd.set(1, kernel_w);
        pd.set(11, kernel_h);
        pd.set(2, dilation_w);
        pd.set(12, dilation_h);
        pd.set(3, stride_w);
        pd.set(13, stride_h);
        pd.set(4, 0);  // pad_w
        pd.set(14, 0); // pad_h
        pd.set(5, bias_term);
        pd.set(6, maxk * channels_g * num_output_g); // weight_data_size
        pd.set(8, int8_scale_term);
        pd.set(9, activation_type);
        pd.set(10, activation_params);

        op->load_param(pd);

        // set weights
        if (bias_term)
        {
            ncnn::Mat weights[5];
            weights[0] = weight_data_g;
            weights[1] = bias_data_g;

#if NCNN_INT8
            if (int8_scale_term)
            {
                Mat weight_data_int8_scales_g(num_output_g);
                weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
                weights[2] = weight_data_int8_scales_g;
                weights[3] = bottom_blob_int8_scales.range(g, 1);
            }
            if (int8_scale_term > 100)
            {
                weights[4] = top_blob_int8_scales.range(g, 1);
            }
#endif

            op->load_model(ModelBinFromMatArray(weights));
        }
        else
        {
            ncnn::Mat weights[4];
            weights[0] = weight_data_g;

#if NCNN_INT8
            if (int8_scale_term)
            {
                Mat weight_data_int8_scales_g(num_output_g);
                weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
                weights[1] = weight_data_int8_scales_g;
                weights[2] = bottom_blob_int8_scales.range(g, 1);
            }
            if (int8_scale_term > 100)
            {
                weights[3] = top_blob_int8_scales.range(g, 1);
            }
#endif

            op->load_model(ModelBinFromMatArray(weights));
        }

        op->create_pipeline(opt);

        group_ops[g] = op;
    }

    return 0;
}

int ConvolutionDepthWise_loongarch::destroy_pipeline(const Option& opt)
{
    if (activation)
    {
        activation->destroy_pipeline(opt);
        delete activation;
        activation = 0;
    }

    for (int i = 0; i < (int)group_ops.size(); i++)
    {
        group_ops[i]->destroy_pipeline(opt);
        delete group_ops[i];
    }
    group_ops.clear();

    return 0;
}

// Run the (fp32) depth-wise / group convolution on one input blob.
//
// - With int8 inference enabled and int8 scales present, the work is delegated to
//   forward_int8_loongarch().
// - Depth-wise case (one input channel per group, one output per group): handled here,
//   with dedicated LSX kernels for pack4 3x3/5x5 at stride 1/2 and generic fallbacks.
// - Otherwise each group is forwarded through its pre-built Convolution op from group_ops.
//
// Returns 0 on success, -100 when a blob allocation fails, or the non-zero code
// returned by a failing per-group op.
int ConvolutionDepthWise_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (opt.use_int8_inference && int8_scale_term)
    {
        return forward_int8_loongarch(bottom_blob, top_blob, opt);
    }
#endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // footprint of the dilated kernel on the input
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
#if __loongarch_sx
        if (elempack == 4)
        {
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw3x3s1_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw3x3s2_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw5x5s1_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw5x5s2_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets: position of every kernel tap relative to the window
                // origin, measured in (packed) pixels of the padded input
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    float* outptr = top_blob.channel(g);
                    const float* kptr = (const float*)weight_data_tm + maxk * g * 4;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);

                            if (bias_term)
                            {
                                _sum = (__m128)__lsx_vld((const float*)bias_data + g * 4, 0);
                            }

                            const float* sptr = m.row(i * stride_h) + j * stride_w * 4;

                            for (int k = 0; k < maxk; k++)
                            {
                                __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0);
                                __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0);
                                _sum = __lsx_vfmadd_s(_w, _val, _sum);
                            }

                            // the generic path applies the activation per vector
                            _sum = activation_ps(_sum, activation_type, activation_params);

                            __lsx_vst(_sum, outptr + j * 4, 0);
                        }

                        outptr += outw * 4;
                    }
                }
            }
        }
#endif // __loongarch_sx

        if (elempack == 1)
        {
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw3x3s1_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw3x3s2_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets: position of every kernel tap relative to the window origin
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < group; g++)
                {
                    float* outptr = top_blob.channel(g);
                    const float* kptr = (const float*)weight_data_tm + maxk * g;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            float sum = 0.f;

                            if (bias_term)
                                sum = bias_data[g];

                            const float* sptr = m.row(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                float val = (float)sptr[space_ofs[k]];
                                float w = (float)kptr[k];
                                sum += val * w;
                            }

                            sum = activation_ss(sum, activation_type, activation_params);

                            outptr[j] = sum;
                        }

                        outptr += outw;
                    }
                }
            }
        }

        return 0;
    }

    // group convolution
    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;

    int g_elempack = 1;
    int out_g_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        g_elempack = channels_g % 4 == 0 ? 4 : 1;
        out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
    }
#endif

    // unpacking: the per-group slices cannot be cut out of a pack4 blob
    // when a group's channel count is not a multiple of 4
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (elempack > g_elempack)
    {
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p);
        if (bottom_blob_bordered_unpacked.empty())
            return -100;
    }

    Mat top_blob_unpacked = top_blob;
    if (out_g_elempack < out_elempack)
    {
        top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
        if (top_blob_unpacked.empty())
            return -100;
    }

    for (int g = 0; g < group; g++)
    {
        const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
        Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

        const ncnn::Layer* op = group_ops[g];

        Option opt_g = opt;
        opt_g.blob_allocator = top_blob_unpacked.allocator;

        // forward; a failing group must not be reported as success
        int ret = op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
        if (ret != 0)
            return ret;
    }

    // packing
    if (out_g_elempack < out_elempack)
    {
        convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
        if (top_blob.empty())
            return -100;
    }
    else
    {
        top_blob = top_blob_unpacked;
    }

    return 0;
}

// Dynamic-weight forward: the kernel (bottom_blobs[1]) and, when bias_term is set,
// the bias (bottom_blobs[2]) arrive as input blobs next to the data (bottom_blobs[0]).
//
// The weights are flattened to pack1, a temporary ConvolutionDepthWise layer is
// configured with this layer's stride/dilation/padding/activation settings, run once
// and destroyed again.
//
// Returns 0 on success, -100 when flattening fails, or the non-zero code returned by
// the temporary layer's create_pipeline() / forward().
int ConvolutionDepthWise_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    // kernel geometry and output count are taken from the weight blob's shape
    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.c * _weight_data.elempack;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    // weight_data_flattened as pack1
    weight_data_flattened.w *= weight_data_flattened.elempack;
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
    weight_data_flattened.elempack = 1;

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;

        // bias_data_flattened as pack1
        bias_data_flattened.w *= bias_data_flattened.elempack;
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
        bias_data_flattened.elempack = 1;
    }

    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::ConvolutionDepthWise);

    ncnn::ParamDict pd;
    pd.set(0, _num_output);
    pd.set(1, _kernel_w);
    pd.set(11, _kernel_h);
    pd.set(2, dilation_w);
    pd.set(12, dilation_h);
    pd.set(3, stride_w);
    pd.set(13, stride_h);
    pd.set(4, pad_left);
    pd.set(15, pad_right);
    pd.set(14, pad_top);
    pd.set(16, pad_bottom);
    pd.set(18, pad_value);
    pd.set(5, bias_term);
    pd.set(6, weight_data_flattened.w);
    pd.set(7, group);
    pd.set(8, int8_scale_term);
    pd.set(9, activation_type);
    pd.set(10, activation_params);

    op->load_param(pd);

    ncnn::Mat weights[2];
    weights[0] = weight_data_flattened;
    weights[1] = bias_data_flattened;

    op->load_model(ncnn::ModelBinFromMatArray(weights));

    // Propagate failures instead of always reporting success; the temporary
    // layer is torn down on every path.
    int ret = op->create_pipeline(opt);
    if (ret == 0)
        ret = op->forward(bottom_blob, top_blob, opt);

    op->destroy_pipeline(opt);

    delete op;

    return ret;
}

#if NCNN_INT8
// int8 counterpart of create_pipeline().
// Depth-wise case: weight_data_tm becomes a pack8 re-layout of the weights (LSX build,
// packing enabled, channel count divisible by 8) or a plain reference to weight_data.
// Otherwise one Convolution op per group is built. Always returns 0.
int ConvolutionDepthWise_loongarch::create_pipeline_int8_loongarch(const Option& opt)
{
    const int maxk = kernel_w * kernel_h;
    const int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    const bool is_depthwise = channels == group && group == num_output;

    if (!is_depthwise)
    {
        // group convolution
        create_group_ops(opt);
    }
    else
    {
        bool use_pack8 = false;
#if __loongarch_sx
        use_pack8 = opt.use_packing_layout && channels % 8 == 0;
#endif // __loongarch_sx

        if (use_pack8)
        {
            // view the weights as (maxk, group) and repack them with elempack 8
            Mat weight_data_r2 = weight_data.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_tm, 8, opt);
        }
        else
        {
            weight_data_tm = weight_data;
        }
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int ConvolutionDepthWise_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int elempack = bottom_blob.elempack;

    int elembits = bottom_blob.elembits();

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_int8 = bottom_blob;
    if (elembits != 8)
    {
        const int channels_g = channels * elempack / group;

        Mat scales(channels * elempack);
        {
            float* ps = scales;
            for (int g = 0; g < group; g++)
            {
                float scale = bottom_blob_int8_scales[g];
                for (int q = 0; q < channels_g; q++)
                {
                    *ps++ = scale;
                }
            }
        }

        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q);
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;
    channels = bottom_blob_bordered.c;
    elempack = bottom_blob_bordered.elempack;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
        int out_elempack = 1;
#if __loongarch_sx
        if (opt.use_packing_layout)
        {
            out_elempack = num_output % 8 == 0 ? 8 : 1;
        }
#endif // __loongarch_sx
        bool use_int8_requantize = int8_scale_term > 100;
        size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;

        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __loongarch_sx
        if (elempack == 8)
        {
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    signed char* outptr_s8 = top_blob.channel(g);
                    float* outptr_f32 = top_blob.channel(g);
                    const signed char* kptr = (const signed char*)weight_data_tm + maxk * g * 8;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            __m128i _sum0 = __lsx_vreplgr2vr_w(0);
                            __m128i _sum1 = __lsx_vreplgr2vr_w(0);

                            const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w * 8;

                            for (int k = 0; k < maxk; k++)
                            {
                                __m128i _val = __lsx_vld(sptr + space_ofs[k] * 8, 0);
                                __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val);

                                __m128i _w = __lsx_vld(kptr + k * 8, 0);
                                __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w);

                                __m128i _s0 = __lsx_vmul_h(_val16, _w16);
                                __m128i _exts0 = __lsx_vslti_h(_s0, 0);
                                __m128i _s0l = __lsx_vilvl_h(_exts0, _s0);
                                __m128i _s0h = __lsx_vilvh_h(_exts0, _s0);

                                _sum0 = __lsx_vadd_w(_sum0, _s0l);
                                _sum1 = __lsx_vadd_w(_sum1, _s0h);
                            }

                            __m128 _scale_in0;
                            __m128 _scale_in1;
                            {
                                __m128 _bottom_blob_int8_scales0 = (__m128)__lsx_vld((const float*)bottom_blob_int8_scales + g * 8, 0);
                                __m128 _bottom_blob_int8_scales1 = (__m128)__lsx_vld((const float*)bottom_blob_int8_scales + g * 8 + 4, 0);
                                __m128 _weight_data_int8_scales0 = (__m128)__lsx_vld((const float*)weight_data_int8_scales + g * 8, 0);
                                __m128 _weight_data_int8_scales1 = (__m128)__lsx_vld((const float*)weight_data_int8_scales + g * 8 + 4, 0);
                                _scale_in0 = __lsx_vfrecip_s(__lsx_vfmul_s(_bottom_blob_int8_scales0, _weight_data_int8_scales0));
                                _scale_in1 = __lsx_vfrecip_s(__lsx_vfmul_s(_bottom_blob_int8_scales1, _weight_data_int8_scales1));

                                __m128i _m0 = __lsx_vfcmp_cne_s(_weight_data_int8_scales0, __lsx_vreplfr2vr_s(0.f));
                                __m128i _m1 = __lsx_vfcmp_cne_s(_weight_data_int8_scales1, __lsx_vreplfr2vr_s(0.f));
                                _scale_in0 = (__m128)__lsx_vand_v((__m128i)_scale_in0, (__m128i)_m0);
                                _scale_in1 = (__m128)__lsx_vand_v((__m128i)_scale_in1, (__m128i)_m1);
                            }

                            __m128 _sumfp32_0 = __lsx_vfmul_s(__lsx_vffint_s_w(_sum0), _scale_in0);
                            __m128 _sumfp32_1 = __lsx_vfmul_s(__lsx_vffint_s_w(_sum1), _scale_in1);

                            if (bias_term)
                            {
                                __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + g * 8, 0);
                                __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + g * 8 + 4, 0);
                                _sumfp32_0 = __lsx_vfadd_s(_sumfp32_0, _bias0);
                                _sumfp32_1 = __lsx_vfadd_s(_sumfp32_1, _bias1);
                            }

                            _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params);
                            _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params);

                            if (use_int8_requantize)
                            {
                                // requantize and relu
                                __m128 _scale_out0 = (__m128)__lsx_vld((const float*)top_blob_int8_scales + g * 8, 0);
                                __m128 _scale_out1 = (__m128)__lsx_vld((const float*)top_blob_int8_scales + g * 8 + 4, 0);
                                _sumfp32_0 = __lsx_vfmul_s(_sumfp32_0, _scale_out0);
                                _sumfp32_1 = __lsx_vfmul_s(_sumfp32_1, _scale_out1);
                                int64_t _sum8 = float2int8(_sumfp32_0, _sumfp32_1);

                                *(int64_t*)outptr_s8 = _sum8;
                                outptr_s8 += 8;
                            }
                            else
                            {
                                // dequantize and relu
                                __lsx_vst(_sumfp32_0, outptr_f32, 0);
                                __lsx_vst(_sumfp32_1, outptr_f32 + 4, 0);
                                outptr_f32 += 8;
                            }
                        }
                    }
                }
            }
        }
#endif // __loongarch_sx

        if (elempack == 1)
        {
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < group; g++)
                {
                    signed char* outptr_s8 = top_blob.channel(g);
                    float* outptr_f32 = top_blob.channel(g);
                    const signed char* kptr = (const signed char*)weight_data_tm + maxk * g;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            int sum = 0;

                            const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                signed char val = sptr[space_ofs[k]];
                                signed char w = kptr[k];
                                sum += val * w;
                            }

                            float scale_in;
                            if (weight_data_int8_scales[g] == 0)
                                scale_in = 0;
                            else
                                scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                            float sumfp32 = sum * scale_in;

                            if (bias_term)
                                sumfp32 += bias_data[g];

                            sumfp32 = activation_ss(sumfp32, activation_type, activation_params);

                            if (use_int8_requantize)
                            {
                                // requantize
                                float scale_out = top_blob_int8_scales[g];
                                signed char sums8 = float2int8(sumfp32 * scale_out);
                                outptr_s8[0] = sums8;
                                outptr_s8 += 1;
                            }
                            else
                            {
                                // dequantize
                                outptr_f32[0] = sumfp32;
                                outptr_f32 += 1;
                            }
                        }
                    }
                }
            }
        }

        return 0;
    }

    bool use_int8_requantize = int8_scale_term > 100;
    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        if (use_int8_requantize)
            out_elempack = num_output % 8 == 0 ? 8 : 1;
        else
            out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __loongarch_sx
    size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // group convolution
    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;

    int g_elempack = 1;
    int out_g_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        g_elempack = channels_g % 8 == 0 ? 8 : 1;
        if (use_int8_requantize)
            out_g_elempack = num_output_g % 8 == 0 ? 8 : 1;
        else
            out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
    }
#endif // __loongarch_sx

    // unpacking
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (elempack > g_elempack)
    {
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
    }

    Mat top_blob_unpacked = top_blob;
    if (out_g_elempack < out_elempack)
    {
        top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
        if (top_blob_unpacked.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
        Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

        const ncnn::Layer* op = group_ops[g];

        Option opt_g = opt;
        opt_g.blob_allocator = top_blob_unpacked.allocator;

        // forward
        op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
    }

    // packing
    if (out_g_elempack < out_elempack)
    {
        convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
    }
    else
    {
        top_blob = top_blob_unpacked;
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn


================================================
FILE: src/layer/loongarch/convolutiondepthwise_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONVOLUTIONDEPTHWISE_LOONGARCH_H
#define LAYER_CONVOLUTIONDEPTHWISE_LOONGARCH_H

#include "convolutiondepthwise.h"

namespace ncnn {

// LoongArch-specific subclass of ConvolutionDepthWise.
// It overrides pipeline setup/teardown and both forward() entry points and,
// when NCNN_INT8 is enabled, adds dedicated int8 pipeline/forward helpers.
class ConvolutionDepthWise_loongarch : public ConvolutionDepthWise
{
public:
    ConvolutionDepthWise_loongarch();

    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // Single-input forward: one bottom blob in, one top blob out.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Multi-input forward.
    // NOTE(review): presumably the extra inputs carry weights/bias supplied at
    // run time - confirm against the implementation.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // NOTE(review): by its name this populates group_ops - confirm in the .cpp.
    int create_group_ops(const Option& opt);
#if NCNN_INT8
    // int8 counterparts of create_pipeline() / forward(); only compiled with NCNN_INT8.
    int create_pipeline_int8_loongarch(const Option& opt);
    int forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif

public:
    // NOTE(review): how this layer is created/used is not visible from this header.
    Layer* activation;
    // One layer per group; the int8 group-convolution path calls
    // group_ops[g]->forward() on the channel slice that belongs to group g.
    std::vector<ncnn::Layer*> group_ops;

    // Kernel weights as consumed by the LoongArch kernels; the int8 depthwise
    // path reads it as signed char, maxk values per group.
    Mat weight_data_tm;
};

} // namespace ncnn

#endif // LAYER_CONVOLUTIONDEPTHWISE_LOONGARCH_H


================================================
FILE: src/layer/loongarch/crop_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "crop_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

namespace ncnn {

// Packed (elempack > 1) input is only advertised when the build targets LSX;
// otherwise the layer keeps the flag value inherited from its base class.
Crop_loongarch::Crop_loongarch()
{
#if __loongarch_sx
    this->support_packing = true;
#endif // __loongarch_sx
}

#if __loongarch_sx
static void crop_pack4_lsx(const Mat& src, Mat& dst, int top, int left)
{
    int w = dst.w;
    int h = dst.h;
    int right = src.w - dst.w - left;

    const float* ptr = src.row(top) + left * 4;
    float* outptr = dst;

    for (int y = 0; y < h; y++)
    {
        for (int x = 0; x < w; x++)
        {
            __m128 _p = (__m128)__lsx_vld(ptr, 0);
            __lsx_vst(_p, outptr, 0);

            ptr += 4;
            outptr += 4;
        }

        ptr += (left + right) * 4;
    }
}
#endif // __loongarch_sx

int Crop_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

#if __loongarch_sx
    int _woffset, _hoffset, _doffset, _coffset;
    int _outw, _outh, _outd, _outc;
    if (!starts_expr.empty() && !ends_expr.empty())
    {
        std::vector<Mat> bottom_blob_shapes(1);
        bottom_blob_shapes[0] = bottom_blob.shape();
        eval_crop_expr(bottom_blob_shapes, _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
    }
    else
    {
        resolve_crop_roi(bottom_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
    }

    if (elempack == 4)
    {
        if (dims == 1)
        {
            int out_elempack = _outw % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw / out_elempack == w && out_elempack == 4)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_woffset % 4 == 0 && out_elempack == 4)
            {
                top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                crop_pack4_lsx(bottom_blob, top_blob, 0, _woffset / elempack);

                return 0;
            }
        }

        if (dims == 2)
        {
            int out_elempack = _outh % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh / out_elempack == h && out_elempack == 4)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_hoffset % 4 == 0 && out_elempack == 4)
            {
                top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                crop_pack4_lsx(bottom_blob, top_blob, _hoffset / elempack, _woffset);

                return 0;
            }
        }

        if (dims == 3)
        {
            int out_elempack = _outc % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 4)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_coffset % 4 == 0 && out_elempack == 4)
            {
                const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);

                if (_outw == w && _outh == h)
                {
                    top_blob = bottom_blob_sliced.clone(opt.blob_allocator);
                    if (top_blob.empty())
                        return -100;
                }

                top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < top_blob.c; q++)
                {
                    const Mat m = bottom_blob_sliced.channel(q);
                    Mat borderm = top_blob.channel(q);

                    crop_pack4_lsx(m, borderm, _hoffset, _woffset);
                }

                return 0;
            }
        }

        if (dims == 4)
        {
            int out_elempack = _outc % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 4)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_coffset % 4 == 0 && out_elempack == 4)
            {
                const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);

                if (_outw == w && _outh == h && _outd == d)
                {
                    top_blob = bottom_blob_sliced.clone(opt.blob_allocator);
                    if (top_blob.empty())
                        return -100;
                }

                top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < top_blob.c; q++)
                {
                    for (int z = 0; z < _outd; z++)
                    {
                        const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset);
                        Mat borderm = top_blob.channel(q).depth(z);

                        crop_pack4_lsx(m, borderm, _hoffset, _woffset);
                    }
                }

                return 0;
            }
        }
    }
#endif // __loongarch_sx

    Mat bottom_blob_unpacked = bottom_blob;
    if (elempack != 1)
    {
        Option opt_pack1 = opt;
        opt_pack1.blob_allocator = opt.workspace_allocator;

        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
        if (bottom_blob_unpacked.empty())
            return -100;
    }

    return Crop::forward(bottom_blob_unpacked, top_blob, opt);
}

int Crop_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& reference_blob = bottom_blobs[1];

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int ref_elempack = reference_blob.elempack;

    Mat& top_blob = top_blobs[0];

#if __loongarch_sx
    int _woffset, _hoffset, _doffset, _coffset;
    int _outw, _outh, _outd, _outc;
    if (!starts_expr.empty() && !ends_expr.empty())
    {
        std::vector<Mat> bottom_blob_shapes(bottom_blobs.size());
        for (size_t i = 0; i < bottom_blobs.size(); i++)
        {
            bottom_blob_shapes[i] = bottom_blobs[i].shape();
        }
        eval_crop_expr(bottom_blob_shapes, _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
    }
    else if (woffset == -233)
    {
        resolve_crop_roi(bottom_blob.shape(), (const int*)reference_blob, _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
    }
    else
    {
        resolve_crop_roi(bottom_blob.shape(), reference_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
    }

    if (elempack == 4)
    {
        if (dims == 1)
        {
            int out_elempack = _outw % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw / out_elempack == w && out_elempack == 4)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_woffset % 4 == 0 && out_elempack == 4)
            {
                top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                crop_pack4_lsx(bottom_blob, top_blob, 0, _woffset / elempack);

                return 0;
            }
        }

        if (dims == 2)
        {
            int out_elempack = _outh % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh / out_elempack == h && out_elempack == 4)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_hoffset % 4 == 0 && out_elempack == 4)
            {
                top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                crop_pack4_lsx(bottom_blob, top_blob, _hoffset / elempack, _woffset);

                return 0;
            }
        }

        if (dims == 3)
        {
            int out_elempack = _outc % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 4)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_coffset % 4 == 0 && out_elempack == 4)
            {
                const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);

                if (_outw == w && _outh == h)
                {
                    top_blob = bottom_blob_sliced.clone(opt.blob_allocator);
                    if (top_blob.empty())
                        return -100;
                }

                top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < top_blob.c; q++)
                {
                    const Mat m = bottom_blob_sliced.channel(q);
                    Mat borderm = top_blob.channel(q);

                    crop_pack4_lsx(m, borderm, _hoffset, _woffset);
                }

                return 0;
            }
        }

        if (dims == 4)
        {
            int out_elempack = _outc % 4 == 0 ? 4 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 4)
            {
                top_blob = bottom_blob;
                return 0;
            }

            if (_coffset % 4 == 0 && out_elempack == 4)
            {
                const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);

                if (_outw == w && _outh == h && _outd == d)
                {
                    top_blob = bottom_blob_sliced.clone(opt.blob_allocator);
                    if (top_blob.empty())
                        return -100;
                }

                top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < top_blob.c; q++)
                {
                    for (int z = 0; z < _outd; z++)
                    {
                        const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset);
                        Mat borderm = top_blob.channel(q).depth(z);

                        crop_pack4_lsx(m, borderm, _hoffset, _woffset);
                    }
                }

                return 0;
            }
        }
    }
#endif // __loongarch_sx

    std::vector<Mat> bottom_blobs_unpacked(bottom_blobs.size());
    for (size_t i = 0; i < bottom_blobs.size(); i++)
    {
        Mat bottom_blob_unpacked = bottom_blobs[i];
        if (elempack != 1)
        {
            Option opt_pack1 = opt;
            opt_pack1.blob_allocator = opt.workspace_allocator;

            convert_packing(bottom_blobs[i], bottom_blob_unpacked, 1, opt_pack1);
            if (bottom_blob_unpacked.empty())
                return -100;
        }

        bottom_blobs_unpacked[i] = bottom_blob_unpacked;
    }

    return Crop::forward(bottom_blobs_unpacked, top_blobs, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/crop_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CROP_LOONGARCH_H
#define LAYER_CROP_LOONGARCH_H

#include "crop.h"

namespace ncnn {

// LoongArch-specific subclass of Crop.
// Both forward() overrides handle pack4 blobs directly when the crop region
// permits it and otherwise unpack the input and defer to Crop::forward().
class Crop_loongarch : public Crop
{
public:
    // Enables support_packing when the build targets LSX.
    Crop_loongarch();

    // Crop region comes from the layer parameters / expressions.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Crop region is derived from the additional input blob(s).
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_CROP_LOONGARCH_H


================================================
FILE: src/layer/loongarch/deconvolution_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "deconvolution_loongarch.h"

#include "layer_type.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_activation.h"
#include "loongarch_usability.h"

namespace ncnn {

#if __loongarch_sx
#include "deconvolution_pack4.h"
#include "deconvolution_pack1to4.h"
#include "deconvolution_pack4to1.h"
#endif // __loongarch_sx

// Packed (elempack > 1) blobs are only advertised when the build targets LSX;
// otherwise the flag keeps the value inherited from the base class.
Deconvolution_loongarch::Deconvolution_loongarch()
{
#if __loongarch_sx
    this->support_packing = true;
#endif // __loongarch_sx
}

// Prepare weight_data_tm for forward().
//
// The kernel taps of every (input, output) channel pair are reversed and the
// result is interleaved into the packed layout
//   pb-pa-kw-kh-inch/pa-outch/pb
// where pa / pb are the input / output elempack (4 when LSX packing is enabled
// and the channel count is a multiple of 4, otherwise 1).
//
// Nothing is prepared when dynamic_weight is set.
// Returns 0 on success and -100 when an allocation (or the reshape) fails.
int Deconvolution_loongarch::create_pipeline(const Option& opt)
{
    if (dynamic_weight)
        return 0;

    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    // reverse the taps inside every maxk-sized kernel
    Mat weight_data_transposed(weight_data.w);
    if (weight_data_transposed.empty())
        return -100;
    {
        float* pt = weight_data_transposed;
        const float* p = weight_data;

        for (int i = 0; i < num_input * num_output; i++)
        {
            for (int k = 0; k < maxk; k++)
            {
                pt[maxk - 1 - k] = p[k];
            }

            p += maxk;
            pt += maxk;
        }
    }

    int elempack = 1;
    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        elempack = num_input % 4 == 0 ? 4 : 1;
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif

    // src = kw-kh-inch-outch
    // dst = pb-pa-kw-kh-inch/pa-outch/pb
    {
        // reshape() yields an empty Mat when the element count does not match
        // or when it has to allocate and fails
        Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);
        if (weight_data_r2.empty())
            return -100;

        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);
        if (weight_data_tm.empty())
            return -100;

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            float* g00 = weight_data_tm.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        for (int j = 0; j < out_elempack; j++)
                        {
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                            g00[0] = k00[k];

                            g00++;
                        }
                    }
                }
            }
        }
    }

    // the original weights are no longer needed once weight_data_tm is built
    if (opt.lightmode)
        weight_data.release();

    return 0;
}

// Releases nothing and always reports success; the option argument is unused.
int Deconvolution_loongarch::destroy_pipeline(const Option& /*opt*/)
{
    return 0;
}

int Deconvolution_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // deconvolv with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Deconvolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

#if __loongarch_sx
    if (elempack == 4 && out_elempack == 4)
    {
        {
            deconvolution_pack4_lsx(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        {
            deconvolution_pack1to4_lsx(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 4 && out_elempack == 1)
    {
        {
            deconvolution_pack4to1_lsx(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }
#endif // __loongarch_sx

    if (elempack == 1 && out_elempack == 1)
    {
        {
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output; p++)
            {
                float* outptr = top_blob_bordered.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[p];
                        }

                        const float* kptr = (const float*)weight_data_tm.channel(p);

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob.channel(q);

                            for (int y = 0; y < kernel_h; y++)
                            {
                                int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                                if (sys < 0 || sys % stride_h != 0)
                                    continue;

                                int sy = sys / stride_h;
                                if (sy >= h)
                                    continue;

                                const float* sptr = m.row(sy);

                                for (int x = 0; x < kernel_w; x++)
                                {
                                    int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                    if (sxs < 0 || sxs % stride_w != 0)
                                        continue;

                                    int sx = sxs / stride_w;
                                    if (sx >= w)
                                        continue;

                                    float val = sptr[sx];

                                    int k = y * kernel_w + x;

                                    float w = kptr[k];

                                    sum += val * w;
                                }
                            }

                            kptr += maxk;
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = sum;
                    }

                    outptr += outw;
                }
            }
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

int Deconvolution_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _num_input = bottom_blob.c * bottom_blob.elempack;
    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.d * 1;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    // weight_data_flattened as pack1
    weight_data_flattened.w *= weight_data_flattened.elempack;
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
    weight_data_flattened.elempack = 1;

    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
    Mat weight_data_transposed;
    {
        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / 1, 4u, opt.workspace_allocator);
        if (weight_data_transposed.empty())
            return -100;

        const int outch_g = _num_output / 1;
        const int inch_g = _num_input / 1;
        const int maxk = _kernel_h * _kernel_w;

        for (int g = 0; g < 1; g++)
        {
            // reorder weight from inch-outch to outch-inch
            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
            for (int i = 0; i < outch_g; i++)
            {
                for (int j = 0; j < inch_g; j++)
                {
                    for (int k = 0; k < maxk; k++)
                    {
                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
                    }
                }
            }
        }
    }

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;

        // bias_data_flattened as pack1
        bias_data_flattened.w *= bias_data_flattened.elempack;
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
        bias_data_flattened.elempack = 1;
    }

    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);

    ncnn::ParamDict pd;
    pd.set(0, _num_output);
    pd.set(1, _kernel_w);
    pd.set(11, _kernel_h);
    pd.set(2, dilation_w);
    pd.set(12, dilation_h);
    pd.set(3, stride_w);
    pd.set(13, stride_h);
    pd.set(4, pad_left);
    pd.set(15, pad_right);
    pd.set(14, pad_top);
    pd.set(16, pad_bottom);
    pd.set(18, output_pad_right);
    pd.set(19, output_pad_bottom);
    pd.set(20, output_w);
    pd.set(21, output_h);
    pd.set(5, bias_term);
    pd.set(6, weight_data_transposed.w);
    pd.set(9, activation_type);
    pd.set(10, activation_params);

    op->load_param(pd);

    ncnn::Mat weights[2];
    weights[0] = weight_data_transposed;
    weights[1] = bias_data_flattened;

    op->load_model(ncnn::ModelBinFromMatArray(weights));

    op->create_pipeline(opt);

    op->forward(bottom_blob, top_blob, opt);

    op->destroy_pipeline(opt);

    delete op;

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/deconvolution_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DECONVOLUTION_LOONGARCH_H
#define LAYER_DECONVOLUTION_LOONGARCH_H

#include "deconvolution.h"

namespace ncnn {

// Deconvolution layer variant for LoongArch builds.
// Declares overrides for the pipeline lifecycle hooks and for both the
// single-blob and the multi-blob forward() entry points of Deconvolution.
class Deconvolution_loongarch : public Deconvolution
{
public:
    Deconvolution_loongarch();

    // pipeline setup / teardown
    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // forward pass on one input blob
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // forward pass on a list of input blobs
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

public:
    // weight storage owned by this variant
    // NOTE(review): presumably the re-laid-out copy of weight_data built in create_pipeline() - confirm in the .cpp
    Mat weight_data_tm;
};

} // namespace ncnn

#endif // LAYER_DECONVOLUTION_LOONGARCH_H


================================================
FILE: src/layer/loongarch/deconvolution_pack1to4.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void deconvolution_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        float* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);

                if (bias_data_ptr)
                {
                    _sum = (__m128)__lsx_vld((const float*)bias_data_ptr + p * 4, 0);
                }

                const float* kptr = (const float*)weight_data_pack1ton + maxk * channels * p * 4;

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);

                    for (int y = 0; y < kernel_h; y++)
                    {
                        int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                        if (sys < 0 || sys % stride_h != 0)
                            continue;

                        int sy = sys / stride_h;
                        if (sy >= h)
                            continue;

                        const float* sptr = m.row(sy);

                        for (int x = 0; x < kernel_w; x++)
                        {
                            int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                            if (sxs < 0 || sxs % stride_w != 0)
                                continue;

                            int sx = sxs / stride_w;
                            if (sx >= w)
                                continue;

                            float val = sptr[sx];

                            int k = y * kernel_w + x;

                            __m128 _val = (__m128)__lsx_vreplfr2vr_s(val);
                            __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0);
                            _sum = __lsx_vfmadd_s(_w, _val, _sum);
                        }
                    }

                    kptr += maxk * 4;
                }

                _sum = activation_ps(_sum, activation_type, activation_params);

                __lsx_vst(_sum, outptr + j * 4, 0);
            }

            outptr += outw * 4;
        }
    }
}


================================================
FILE: src/layer/loongarch/deconvolution_pack4.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void deconvolution_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        float* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);

                if (bias_data_ptr)
                {
                    _sum = (__m128)__lsx_vld((const float*)bias_data_ptr + p * 4, 0);
                }

                const float* kptr = (const float*)weight_data_pack4.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);

                    for (int y = 0; y < kernel_h; y++)
                    {
                        int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                        if (sys < 0 || sys % stride_h != 0)
                            continue;

                        int sy = sys / stride_h;
                        if (sy >= h)
                            continue;

                        for (int x = 0; x < kernel_w; x++)
                        {
                            int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                            if (sxs < 0 || sxs % stride_w != 0)
                                continue;

                            int sx = sxs / stride_w;
                            if (sx >= w)
                                continue;

                            const float* sptr = m.row(sy) + sx * 4;

                            int k = (y * kernel_w + x) * 16;

                            __m128 _val0 = (__m128)__lsx_vreplfr2vr_s(*sptr++);
                            __m128 _val1 = (__m128)__lsx_vreplfr2vr_s(*sptr++);
                            __m128 _val2 = (__m128)__lsx_vreplfr2vr_s(*sptr++);
                            __m128 _val3 = (__m128)__lsx_vreplfr2vr_s(*sptr++);
                            __m128 _w0 = (__m128)__lsx_vld(kptr + k, 0);
                            __m128 _w1 = (__m128)__lsx_vld(kptr + k + 4, 0);
                            __m128 _w2 = (__m128)__lsx_vld(kptr + k + 8, 0);
                            __m128 _w3 = (__m128)__lsx_vld(kptr + k + 12, 0);
                            _sum = __lsx_vfmadd_s(_w0, _val0, _sum);
                            _sum = __lsx_vfmadd_s(_w1, _val1, _sum);
                            _sum = __lsx_vfmadd_s(_w2, _val2, _sum);
                            _sum = __lsx_vfmadd_s(_w3, _val3, _sum);
                        }
                    }

                    kptr += maxk * 16;
                }

                _sum = activation_ps(_sum, activation_type, activation_params);

                __lsx_vst(_sum, outptr + j * 4, 0);
            }

            outptr += outw * 4;
        }
    }
}


================================================
FILE: src/layer/loongarch/deconvolution_pack4to1.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void deconvolution_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4to1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    const int maxk = kernel_w * kernel_h;

    const float* bias_data_ptr = bias_data;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        float* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = 0.f;

                if (bias_data_ptr)
                {
                    sum = bias_data_ptr[p];
                }

                __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);

                const float* kptr = (const float*)weight_data_pack4to1 + maxk * channels * p * 4;

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);

                    for (int y = 0; y < kernel_h; y++)
                    {
                        int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                        if (sys < 0 || sys % stride_h != 0)
                            continue;

                        int sy = sys / stride_h;
                        if (sy >= h)
                            continue;

                        for (int x = 0; x < kernel_w; x++)
                        {
                            int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                            if (sxs < 0 || sxs % stride_w != 0)
                                continue;

                            int sx = sxs / stride_w;
                            if (sx >= w)
                                continue;

                            const float* sptr = m.row(sy) + sx * 4;

                            int k = y * kernel_w + x;

                            __m128 _val = (__m128)__lsx_vld(sptr, 0);
                            __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0);
                            _sum = __lsx_vfmadd_s(_w, _val, _sum);
                        }
                    }

                    kptr += maxk * 4;
                }

                sum += __lsx_reduce_fadd_s(_sum);

                sum = activation_ss(sum, activation_type, activation_params);

                outptr[j] = sum;
            }

            outptr += outw;
        }
    }
}


================================================
FILE: src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "deconvolutiondepthwise_loongarch.h"

#include "layer_type.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_activation.h"
#include "loongarch_usability.h"

namespace ncnn {

// Constructor: sets support_packing when the build has LSX enabled.
DeconvolutionDepthWise_loongarch::DeconvolutionDepthWise_loongarch()
{
#if __loongarch_sx
    // forward() has an elempack == 4 code path under the same guard
    support_packing = true;
#endif // __loongarch_sx
}

// Prepares the weights used by the single-input forward().
//
// Depth-wise case (input channels == group == num_output): every per-channel
// kernel is reversed tap by tap and stored in weight_data_tm, repacked to
// elempack 4 when packing is enabled and the channel count is a multiple of 4.
// Otherwise one Deconvolution op per group is created via create_group_ops().
//
// Returns 0 on success, -100 when a weight buffer cannot be allocated, or the
// error code reported by create_group_ops().
int DeconvolutionDepthWise_loongarch::create_pipeline(const Option& opt)
{
    // nothing is prepared here when dynamic_weight is set
    if (dynamic_weight)
        return 0;

    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // depth-wise
    if (channels == group && group == num_output)
    {
        int elempack = 1;
#if __loongarch_sx
        if (opt.use_packing_layout)
        {
            elempack = channels % 4 == 0 ? 4 : 1;
        }
#endif

        Mat weight_data_transposed(weight_data.w);
        if (weight_data_transposed.empty())
            return -100;

        {
            float* pt = weight_data_transposed;
            const float* p = weight_data;

            // one kernel of maxk taps per channel in this branch
            // (channels == group == num_output, so the kernel count is group)
            for (int i = 0; i < group; i++)
            {
                // reverse the tap order of this kernel
                for (int k = 0; k < maxk; k++)
                {
                    pt[maxk - 1 - k] = p[k];
                }

                p += maxk;
                pt += maxk;
            }
        }

#if __loongarch_sx
        // pack4
        if (elempack == 4)
        {
            Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_tm, 4, opt);
            if (weight_data_tm.empty())
                return -100;
        }
#endif // __loongarch_sx

        if (elempack == 1)
        {
            weight_data_tm = weight_data_transposed;
        }

        if (opt.lightmode)
            weight_data.release();

        return 0;
    }

    // group convolution
    int ret = create_group_ops(opt);
    if (ret != 0)
        return ret;

    // the per-group ops hold their own clones of the weights
    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int DeconvolutionDepthWise_loongarch::create_group_ops(const Option& opt)
{
    // create Deconvolution op for each group
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    for (int i = 0; i < (int)group_ops.size(); i++)
        delete group_ops[i];

    group_ops.clear();

    const int channels_g = channels / group;
    const int num_output_g = num_output / group;

    group_ops.resize(group);

    for (int g = 0; g < group; g++)
    {
        Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone();
        Mat bias_data_g;
        if (bias_term)
            bias_data_g = bias_data.range(num_output_g * g, num_output_g);

        ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Deconvolution);

        // set param
        ncnn::ParamDict pd;
        pd.set(0, num_output_g); // num_output
        pd.set(1, kernel_w);
        pd.set(11, kernel_h);
        pd.set(2, dilation_w);
        pd.set(12, dilation_h);
        pd.set(3, stride_w);
        pd.set(13, stride_h);
        pd.set(4, 0);  // pad_w
        pd.set(14, 0); // pad_h
        pd.set(18, output_pad_right);
        pd.set(19, output_pad_bottom);
        pd.set(5, bias_term);
        pd.set(6, maxk * channels_g * num_output_g); // weight_data_size
        pd.set(9, activation_type);
        pd.set(10, activation_params);

        op->load_param(pd);

        // set weights
        if (bias_term)
        {
            ncnn::Mat weights[2];
            weights[0] = weight_data_g;
            weights[1] = bias_data_g;

            op->load_model(ModelBinFromMatArray(weights));
        }
        else
        {
            ncnn::Mat weights[1];
            weights[0] = weight_data_g;

            op->load_model(ModelBinFromMatArray(weights));
        }

        op->create_pipeline(opt);

        group_ops[g] = op;
    }

    return 0;
}

int DeconvolutionDepthWise_loongarch::destroy_pipeline(const Option& opt)
{
    for (int i = 0; i < (int)group_ops.size(); i++)
    {
        group_ops[i]->destroy_pipeline(opt);
        delete group_ops[i];
    }
    group_ops.clear();

    return 0;
}

// Single-input forward pass using the weights prepared by create_pipeline().
//
// The full (un-cropped) result is produced in top_blob_bordered and then passed
// to cut_padding() to obtain top_blob.
//
// Depth-wise case (input channels == group == num_output): computed inline,
// with an LSX path for elempack 4 and a scalar path for elempack 1.
// Otherwise the work is delegated to the per-group ops in group_ops, converting
// the input / output packing where the group size does not allow elempack 4.
//
// Returns 0 on success, -100 when a blob cannot be allocated or converted,
// -1 when the per-group ops are missing, or the error code of a failing
// per-group forward.
int DeconvolutionDepthWise_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // deconvolve with the kernel_w x kernel_h kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    // output extent before cut_padding()
    int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right;
    int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom;
    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif
    size_t out_elemsize = elemsize / elempack * out_elempack;

    Mat top_blob_bordered;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0))
    {
        // intermediate result, taken from the workspace allocator
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator);
    }
    else
    {
        // written straight into top_blob's storage
        top_blob_bordered = top_blob;
        top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob_bordered.empty())
        return -100;

    const int maxk = kernel_w * kernel_h;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
#if __loongarch_sx
        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                float* outptr = top_blob_bordered.channel(g);
                const float* kptr = (const float*)weight_data_tm + maxk * g * 4;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);

                        if (bias_term)
                        {
                            _sum = (__m128)__lsx_vld((const float*)bias_data + g * 4, 0);
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            // only source coordinates that are exact multiples of the stride hit an input row
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                const float* sptr = m.row(sy) + sx * 4;

                                int k = y * kernel_w + x;

                                __m128 _val = (__m128)__lsx_vld(sptr, 0);
                                __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0);
                                _sum = __lsx_vfmadd_s(_w, _val, _sum);
                            }
                        }

                        _sum = activation_ps(_sum, activation_type, activation_params);

                        __lsx_vst(_sum, outptr + j * 4, 0);
                    }

                    outptr += outw * 4;
                }
            }
        }
#endif // __loongarch_sx

        if (elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int g = 0; g < channels; g++)
            {
                float* outptr = top_blob_bordered.channel(g);
                const float* kptr = (const float*)weight_data_tm + maxk * g;
                const Mat m = bottom_blob.channel(g);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[g];
                        }

                        for (int y = 0; y < kernel_h; y++)
                        {
                            int sys = (i + y * dilation_h - (kernel_extent_h - 1));
                            if (sys < 0 || sys % stride_h != 0)
                                continue;

                            int sy = sys / stride_h;
                            if (sy >= h)
                                continue;

                            const float* sptr = m.row(sy);

                            for (int x = 0; x < kernel_w; x++)
                            {
                                int sxs = (j + x * dilation_w - (kernel_extent_w - 1));
                                if (sxs < 0 || sxs % stride_w != 0)
                                    continue;

                                int sx = sxs / stride_w;
                                if (sx >= w)
                                    continue;

                                float val = sptr[sx];

                                int k = y * kernel_w + x;

                                // named wt so it does not shadow the input width w
                                float wt = kptr[k];

                                sum += val * wt;
                            }
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = sum;
                    }

                    outptr += outw;
                }
            }
        }
    }
    else
    {
        // group deconvolution

        // the per-group ops must have been created by create_pipeline()
        if ((int)group_ops.size() < group)
            return -1;

        const int channels_g = channels * elempack / group;
        const int num_output_g = num_output / group;

        int g_elempack = 1;
        int out_g_elempack = 1;
#if __loongarch_sx
        if (opt.use_packing_layout)
        {
            g_elempack = channels_g % 4 == 0 ? 4 : 1;
            out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
        }
#endif

        // unpacking
        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack > g_elempack)
        {
            Option opt_p = opt;
            opt_p.blob_allocator = opt.workspace_allocator;
            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p);
            if (bottom_blob_unpacked.empty())
                return -100;
        }

        Mat top_blob_bordered_unpacked = top_blob_bordered;
        if (out_g_elempack < out_elempack)
        {
            top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator);
            if (top_blob_bordered_unpacked.empty())
                return -100;
        }

        for (int g = 0; g < group; g++)
        {
            const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
            Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

            const ncnn::Layer* op = group_ops[g];

            Option opt_g = opt;
            opt_g.blob_allocator = top_blob_bordered_unpacked.allocator;

            // forward; a failing group aborts the whole layer
            int ret = op->forward(bottom_blob_g, top_blob_bordered_g, opt_g);
            if (ret != 0)
                return ret;
        }

        // packing
        if (out_g_elempack < out_elempack)
        {
            convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt);
            if (top_blob_bordered.empty())
                return -100;
        }
        else
        {
            top_blob_bordered = top_blob_bordered_unpacked;
        }
    }

    cut_padding(top_blob_bordered, top_blob, opt);
    if (top_blob.empty())
        return -100;

    return 0;
}

// Multi-input forward pass: the weights (and the bias, when bias_term is set)
// are read from the input blobs instead of the layer's own storage.
//
// bottom_blobs[0] is the input, bottom_blobs[1] the weight blob and
// bottom_blobs[2] the bias blob. The weight blob is flattened, reordered per
// group from inch-outch to outch-inch order, and handed to a temporary
// DeconvolutionDepthWise op that performs the actual computation.
//
// Returns 0 on success, -100 when a buffer cannot be allocated, -1 when the
// temporary op cannot be created, or the first error code reported by the
// temporary op (load_param / load_model / create_pipeline / forward).
int DeconvolutionDepthWise_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    // layer geometry derived from the blobs themselves
    const int _num_input = bottom_blob.c * bottom_blob.elempack;
    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.d * group;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    // weight_data_flattened as pack1
    weight_data_flattened.w *= weight_data_flattened.elempack;
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
    weight_data_flattened.elempack = 1;

    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
    Mat weight_data_transposed;
    {
        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / group, 4u, opt.workspace_allocator);
        if (weight_data_transposed.empty())
            return -100;

        const int outch_g = _num_output / group;
        const int inch_g = _num_input / group;
        const int maxk = _kernel_h * _kernel_w;

        for (int g = 0; g < group; g++)
        {
            // reorder weight from inch-outch to outch-inch
            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
            for (int i = 0; i < outch_g; i++)
            {
                for (int j = 0; j < inch_g; j++)
                {
                    for (int k = 0; k < maxk; k++)
                    {
                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
                    }
                }
            }
        }
    }

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;

        // bias_data_flattened as pack1
        bias_data_flattened.w *= bias_data_flattened.elempack;
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
        bias_data_flattened.elempack = 1;
    }

    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::DeconvolutionDepthWise);
    if (!op)
        return -1;

    ncnn::ParamDict pd;
    pd.set(0, _num_output);
    pd.set(1, _kernel_w);
    pd.set(11, _kernel_h);
    pd.set(2, dilation_w);
    pd.set(12, dilation_h);
    pd.set(3, stride_w);
    pd.set(13, stride_h);
    pd.set(4, pad_left);
    pd.set(15, pad_right);
    pd.set(14, pad_top);
    pd.set(16, pad_bottom);
    pd.set(18, output_pad_right);
    pd.set(19, output_pad_bottom);
    pd.set(20, output_w);
    pd.set(21, output_h);
    pd.set(5, bias_term);
    pd.set(6, weight_data_transposed.w);
    pd.set(7, group);
    pd.set(9, activation_type);
    pd.set(10, activation_params);

    int ret = op->load_param(pd);

    if (ret == 0)
    {
        // weights[1] stays empty when there is no bias
        ncnn::Mat weights[2];
        weights[0] = weight_data_transposed;
        weights[1] = bias_data_flattened;

        ret = op->load_model(ncnn::ModelBinFromMatArray(weights));
    }

    if (ret == 0)
    {
        ret = op->create_pipeline(opt);

        if (ret == 0)
            ret = op->forward(bottom_blob, top_blob, opt);

        // also run after a failed create_pipeline so partially built state is released
        op->destroy_pipeline(opt);
    }

    delete op;

    return ret;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/deconvolutiondepthwise_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DECONVOLUTIONDEPTHWISE_LOONGARCH_H
#define LAYER_DECONVOLUTIONDEPTHWISE_LOONGARCH_H

#include "deconvolutiondepthwise.h"

namespace ncnn {

// DeconvolutionDepthWise layer variant for LoongArch builds.
// Declares overrides for the pipeline lifecycle hooks and for both the
// single-blob and the multi-blob forward() entry points.
class DeconvolutionDepthWise_loongarch : public DeconvolutionDepthWise
{
public:
    DeconvolutionDepthWise_loongarch();

    // pipeline setup / teardown
    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // forward pass on one input blob
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // forward pass on a list of input blobs
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
    // fills group_ops with one layer per group
    int create_group_ops(const Option& opt);

public:
    // per-group layers; owned by this object (deleted in destroy_pipeline())
    std::vector<ncnn::Layer*> group_ops;

    // weights prepared for the depth-wise path
    Mat weight_data_tm;
};

} // namespace ncnn

#endif // LAYER_DECONVOLUTIONDEPTHWISE_LOONGARCH_H


================================================
FILE: src/layer/loongarch/dequantize_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "dequantize_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

// Constructor: sets support_packing when the build has LSX enabled.
Dequantize_loongarch::Dequantize_loongarch()
{
#if __loongarch_sx
    // dequantize() has per-lane scale / bias handling for elempack 4 and 8 under the same guard
    support_packing = true;
#endif
}

// Converts elemcount * elempack int32 values at intptr into floats at ptr,
// computing value * scale, plus bias when bias_data is non-empty (w != 0).
//
// scale_data / bias_data are broadcast from their first entry. With LSX, when
// they hold more than one entry and elempack is 4 or 8, the first 4 or 8
// entries are used as per-lane values instead. The scalar tail always uses
// the first entry.
static void dequantize(const int* intptr, float* ptr, const Mat& scale_data, const Mat& bias_data, int elemcount, int elempack)
{
    const int scale_count = scale_data.w;
    const int bias_count = bias_data.w;
    const int total = elemcount * elempack;

    // NCNN_LOGE("dequantize %d %d   %d %d", scale_data_size, bias_data_size, elemcount, elempack);

    const float scale = scale_data[0];
#if __loongarch_sx
    // lo covers lanes 0-3, hi covers lanes 4-7 of an 8-wide step
    __m128 _scale_lo = (__m128)__lsx_vreplfr2vr_s(scale);
    __m128 _scale_hi = _scale_lo;
    if (scale_count > 1)
    {
        const float* scale_ptr = scale_data;
        if (elempack == 4)
        {
            _scale_lo = (__m128)__lsx_vld(scale_ptr, 0);
            _scale_hi = _scale_lo;
        }
        else if (elempack == 8)
        {
            _scale_lo = (__m128)__lsx_vld(scale_ptr, 0);
            _scale_hi = (__m128)__lsx_vld(scale_ptr + 4, 0);
        }
    }
#endif // __loongarch_sx

    int i = 0;

    if (bias_count == 0)
    {
        // scale only
#if __loongarch_sx
        while (i + 7 < total)
        {
            __builtin_prefetch(intptr + i + 32);
            __m128 _lo = __lsx_vffint_s_w(__lsx_vld(intptr + i, 0));
            __m128 _hi = __lsx_vffint_s_w(__lsx_vld(intptr + i + 4, 0));
            _lo = __lsx_vfmul_s(_lo, _scale_lo);
            _hi = __lsx_vfmul_s(_hi, _scale_hi);
            __lsx_vst(_lo, ptr + i, 0);
            __lsx_vst(_hi, ptr + i + 4, 0);
            i += 8;
        }
        while (i + 3 < total)
        {
            __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr + i, 0));
            _v = __lsx_vfmul_s(_v, _scale_lo);
            __lsx_vst(_v, ptr + i, 0);
            i += 4;
        }
#endif // __loongarch_sx
        while (i < total)
        {
            ptr[i] = intptr[i] * scale;
            i++;
        }

        return;
    }

    // scale and bias
    const float bias = bias_data[0];
#if __loongarch_sx
    __m128 _bias_lo = (__m128)__lsx_vreplfr2vr_s(bias);
    __m128 _bias_hi = _bias_lo;
    if (bias_count > 1)
    {
        const float* bias_ptr = bias_data;
        if (elempack == 4)
        {
            _bias_lo = (__m128)__lsx_vld(bias_ptr, 0);
            _bias_hi = _bias_lo;
        }
        else if (elempack == 8)
        {
            _bias_lo = (__m128)__lsx_vld(bias_ptr, 0);
            _bias_hi = (__m128)__lsx_vld(bias_ptr + 4, 0);
        }
    }

    while (i + 7 < total)
    {
        __builtin_prefetch(intptr + i + 32);
        __m128 _lo = __lsx_vffint_s_w(__lsx_vld(intptr + i, 0));
        __m128 _hi = __lsx_vffint_s_w(__lsx_vld(intptr + i + 4, 0));
        _lo = __lsx_vfmadd_s(_scale_lo, _lo, _bias_lo);
        _hi = __lsx_vfmadd_s(_scale_hi, _hi, _bias_hi);
        __lsx_vst(_lo, ptr + i, 0);
        __lsx_vst(_hi, ptr + i + 4, 0);
        i += 8;
    }
    while (i + 3 < total)
    {
        __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr + i, 0));
        _v = __lsx_vfmadd_s(_scale_lo, _v, _bias_lo);
        __lsx_vst(_v, ptr + i, 0);
        i += 4;
    }
#endif // __loongarch_sx
    while (i < total)
    {
        ptr[i] = intptr[i] * scale + bias;
        i++;
    }
}

int Dequantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // assert bottom_blob.elembits() == 32

    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;

    top_blob.create_like(bottom_blob, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (dims == 1)
    {
        const int wp = std::max(1, w / opt.num_threads);
        const int nn_w = (w + wp - 1) / wp;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_w; ii++)
        {
            const int i = ii * wp;

            const int* intptr = (const int*)bottom_blob + i * elempack;
            float* ptr = (float*)top_blob + i * elempack;

            // assert scale_data_size == 1
            // assert bias_data_size == 0 || bias_data_size == 1

            const int size = std::min(w - i, wp) * elempack;

            dequantize(intptr, ptr, scale_data, bias_data, size, 1);
        }
    }

    if (dims == 2)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            const int* intptr = bottom_blob.row<const int>(i);
            float* ptr = top_blob.row(i);

            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
            const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;

            dequantize(intptr, ptr, scale_data_i, bias_data_i, w, elempack);
        }
    }

    if (dims == 3)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const int* intptr = bottom_blob.channel(q);
            float* ptr = top_blob.channel(q);

            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
            const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;

            dequantize(intptr, ptr, scale_data_q, bias_data_q, w * h, elempack);
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/dequantize_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DEQUANTIZE_LOONGARCH_H
#define LAYER_DEQUANTIZE_LOONGARCH_H

#include "dequantize.h"

namespace ncnn {

// LoongArch-specific variant of the Dequantize layer.
// Derives from Dequantize and supplies its own forward() implementation
// (defined in dequantize_loongarch.cpp).
class Dequantize_loongarch : public Dequantize
{
public:
    Dequantize_loongarch();

    // Converts the integer input blob into a float output blob.
    // Returns 0 on success and a negative value on failure.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_DEQUANTIZE_LOONGARCH_H


================================================
FILE: src/layer/loongarch/dropout_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "dropout_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

// Constructor: when the code is compiled with __loongarch_sx enabled,
// mark the layer as supporting packed layouts (support_packing = true).
// Without it the flag is left at whatever the base class set.
Dropout_loongarch::Dropout_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

// Multiply every float of the blob by `scale`, in place.
//
// bottom_top_blob  blob that is both read and overwritten
// opt              supplies the thread count for the per-channel loop
//
// Always returns 0. A scale of exactly 1 short-circuits without touching
// the data.
int Dropout_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    if (scale == 1.f)
        return 0;

    const int channels = bottom_top_blob.c;
    // number of floats per channel, packed lanes included
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* data = bottom_top_blob.channel(q);

        int remain = size;
#if __loongarch_sx
        // four floats per iteration
        const __m128 _factor = (__m128)__lsx_vreplfr2vr_s(scale);
        for (; remain >= 4; remain -= 4)
        {
            __builtin_prefetch(data + 16);
            __m128 _v = (__m128)__lsx_vld(data, 0);
            _v = __lsx_vfmul_s(_v, _factor);
            __lsx_vst(_v, data, 0);

            data += 4;
        }
#endif // __loongarch_sx
        // scalar tail (or the whole channel when LSX is unavailable)
        for (; remain > 0; remain--)
        {
            *data *= scale;

            data++;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/dropout_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_DROPOUT_LOONGARCH_H
#define LAYER_DROPOUT_LOONGARCH_H

#include "dropout.h"

namespace ncnn {

// LoongArch-specific variant of the Dropout layer.
// Derives from Dropout and supplies its own in-place forward pass
// (defined in dropout_loongarch.cpp).
class Dropout_loongarch : public Dropout
{
public:
    Dropout_loongarch();

    // Scales the blob in place; returns 0.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_DROPOUT_LOONGARCH_H


================================================
FILE: src/layer/loongarch/eltwise_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "eltwise_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

// Constructor: when the code is compiled with __loongarch_sx enabled,
// mark the layer as supporting packed layouts (support_packing = true).
Eltwise_loongarch::Eltwise_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

int Eltwise_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int elempack = bottom_blob.elempack;
    int size = w * h * d * elempack;

    Mat& top_blob = top_blobs[0];
    top_blob.create_like(bottom_blob, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (op_type == Operation_PROD)
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            const float* ptr1 = bottom_blob1.channel(q);
            float* outptr = top_blob.channel(q);

            int i = 0;
#if __loongarch_sx
            for (; i + 3 < size; i += 4)
            {
                __m128 _p = (__m128)__lsx_vld(ptr, 0);
                __m128 _p1 = (__m128)__lsx_vld(ptr1, 0);
                _p = __lsx_vfmul_s(_p, _p1);
                __lsx_vst(_p, outptr, 0);

                ptr += 4;
                ptr1 += 4;
                outptr += 4;
            }
#endif // __loongarch_sx
            for (; i < size; i++)
            {
                *outptr = *ptr * *ptr1;

                ptr++;
                ptr1++;
                outptr++;
            }
        }

        for (size_t b = 2; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

                int i = 0;
#if __loongarch_sx
                for (; i + 3 < size; i += 4)
                {
                    __m128 _p = (__m128)__lsx_vld(outptr, 0);
                    __m128 _p1 = (__m128)__lsx_vld(ptr, 0);
                    _p = __lsx_vfmul_s(_p, _p1);
                    __lsx_vst(_p, outptr, 0);

                    ptr += 4;
                    outptr += 4;
                }
#endif // __loongarch_sx
                for (; i < size; i++)
                {
                    *outptr *= *ptr;

                    ptr++;
                    outptr++;
                }
            }
        }
    }
    if (op_type == Operation_SUM)
    {
        if (coeffs.w == 0)
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                const float* ptr1 = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

                int i = 0;
#if __loongarch_sx
                for (; i + 3 < size; i += 4)
                {
                    __m128 _p = (__m128)__lsx_vld(ptr, 0);
                    __m128 _p1 = (__m128)__lsx_vld(ptr1, 0);
                    _p = __lsx_vfadd_s(_p, _p1);
                    __lsx_vst(_p, outptr, 0);

                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
#endif // __loongarch_sx
                for (; i < size; i++)
                {
                    *outptr = *ptr + *ptr1;

                    ptr++;
                    ptr1++;
                    outptr++;
                }
            }

            for (size_t b = 2; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = bottom_blob1.channel(q);
                    float* outptr = top_blob.channel(q);

                    int i = 0;
#if __loongarch_sx
                    for (; i + 3 < size; i += 4)
                    {
                        __m128 _p = (__m128)__lsx_vld(outptr, 0);
                        __m128 _p1 = (__m128)__lsx_vld(ptr, 0);
                        _p = __lsx_vfadd_s(_p, _p1);
                        __lsx_vst(_p, outptr, 0);

                        ptr += 4;
                        outptr += 4;
                    }
#endif // __loongarch_sx
                    for (; i < size; i++)
                    {
                        *outptr += *ptr;

                        ptr++;
                        outptr++;
                    }
                }
            }
        }
        else
        {
            // first blob
            const Mat& bottom_blob1 = bottom_blobs[1];
            float coeff0 = coeffs[0];
            float coeff1 = coeffs[1];
#if __loongarch_sx
            __m128 _coeff0 = (__m128)__lsx_vreplfr2vr_s(coeff0);
            __m128 _coeff1 = (__m128)__lsx_vreplfr2vr_s(coeff1);
#endif // __loongarch_sx
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                const float* ptr1 = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

                int i = 0;
#if __loongarch_sx
                for (; i + 3 < size; i += 4)
                {
                    __m128 _p = (__m128)__lsx_vld(ptr, 0);
                    __m128 _p1 = (__m128)__lsx_vld(ptr1, 0);
                    _p = __lsx_vfmul_s(_p, _coeff0);
                    _p = __lsx_vfmadd_s(_coeff1, _p1, _p);
                    __lsx_vst(_p, outptr, 0);

                    ptr += 4;
                    ptr1 += 4;
                    outptr += 4;
                }
#endif // __loongarch_sx
                for (; i < size; i++)
                {
                    *outptr = *ptr * coeff0 + *ptr1 * coeff1;

                    ptr++;
                    ptr1++;
                    outptr++;
                }
            }

            for (size_t b = 2; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob1 = bottom_blobs[b];
                float coeff = coeffs[b];
#if __loongarch_sx
                __m128 _coeff = (__m128)__lsx_vreplfr2vr_s(coeff);
#endif // __loongarch_sx
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = bottom_blob1.channel(q);
                    float* outptr = top_blob.channel(q);

                    int i = 0;
#if __loongarch_sx
                    for (; i + 3 < size; i += 4)
                    {
                        __m128 _p = (__m128)__lsx_vld(outptr, 0);
                        __m128 _p1 = (__m128)__lsx_vld(ptr, 0);
                        _p = __lsx_vfmadd_s(_coeff, _p1, _p);
                        __lsx_vst(_p, outptr, 0);

                        ptr += 4;
                        outptr += 4;
                    }
#endif // __loongarch_sx
                    for (; i < size; i++)
                    {
                        *outptr += *ptr * coeff;

                        ptr++;
                        outptr++;
                    }
                }
            }
        }
    }
    if (op_type == Operation_MAX)
    {
        // first blob
        const Mat& bottom_blob1 = bottom_blobs[1];
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            const float* ptr1 = bottom_blob1.channel(q);
            float* outptr = top_blob.channel(q);

            int i = 0;
#if __loongarch_sx
            for (; i + 3 < size; i += 4)
            {
                __m128 _p = (__m128)__lsx_vld(ptr, 0);
                __m128 _p1 = (__m128)__lsx_vld(ptr1, 0);
                _p = __lsx_vfmax_s(_p, _p1);
                __lsx_vst(_p, outptr, 0);

                ptr += 4;
                ptr1 += 4;
                outptr += 4;
            }
#endif // __loongarch_sx
            for (; i < size; i++)
            {
                *outptr = std::max(*ptr, *ptr1);

                ptr++;
                ptr1++;
                outptr++;
            }
        }

        for (size_t b = 2; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob1 = bottom_blobs[b];
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob1.channel(q);
                float* outptr = top_blob.channel(q);

                int i = 0;
#if __loongarch_sx
                for (; i + 3 < size; i += 4)
                {
                    __m128 _p = (__m128)__lsx_vld(outptr, 0);
                    __m128 _p1 = (__m128)__lsx_vld(ptr, 0);
                    _p = __lsx_vfmax_s(_p, _p1);
                    __lsx_vst(_p, outptr, 0);

                    ptr += 4;
                    outptr += 4;
                }
#endif // __loongarch_sx
                for (; i < size; i++)
                {
                    *outptr = std::max(*ptr, *outptr);

                    ptr++;
                    outptr++;
                }
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/eltwise_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_ELTWISE_LOONGARCH_H
#define LAYER_ELTWISE_LOONGARCH_H

#include "eltwise.h"

namespace ncnn {

// LoongArch-specific variant of the Eltwise layer.
// Derives from Eltwise and supplies its own multi-input forward pass
// (defined in eltwise_loongarch.cpp).
class Eltwise_loongarch : public Eltwise
{
public:
    Eltwise_loongarch();

    // Combines all bottom_blobs element-wise into top_blobs[0].
    // Returns 0 on success and a negative value on failure.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_ELTWISE_LOONGARCH_H


================================================
FILE: src/layer/loongarch/flatten_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "flatten_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#include "lsx_mathfun.h"
#endif // __loongarch_sx

namespace ncnn {

// Constructor: when the code is compiled with __loongarch_sx enabled,
// mark the layer as supporting packed layouts (support_packing = true).
Flatten_loongarch::Flatten_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

#if __loongarch_sx
// De-interleave `count` pack-4 float elements read from `src` into four
// consecutive planes of `count` floats each, starting at `dst`:
// lane k of element j lands at dst[k * count + j].
static void flatten_unpack4_to_planes(const float* src, float* dst, int count)
{
    float* plane0 = dst;
    float* plane1 = dst + count;
    float* plane2 = dst + count * 2;
    float* plane3 = dst + count * 3;

    int remain = count;
    for (; remain >= 4; remain -= 4)
    {
        // 4x4 transpose of four packed elements
        __m128i _e0 = __lsx_vld(src, 0);
        __m128i _e1 = __lsx_vld(src + 4, 0);
        __m128i _e2 = __lsx_vld(src + 4 * 2, 0);
        __m128i _e3 = __lsx_vld(src + 4 * 3, 0);

        __m128i _e01_lo = __lsx_vilvl_w(_e1, _e0);
        __m128i _e01_hi = __lsx_vilvh_w(_e1, _e0);
        __m128i _e23_lo = __lsx_vilvl_w(_e3, _e2);
        __m128i _e23_hi = __lsx_vilvh_w(_e3, _e2);

        __m128i _lane0 = __lsx_vilvl_d(_e23_lo, _e01_lo);
        __m128i _lane1 = __lsx_vilvh_d(_e23_lo, _e01_lo);
        __m128i _lane2 = __lsx_vilvl_d(_e23_hi, _e01_hi);
        __m128i _lane3 = __lsx_vilvh_d(_e23_hi, _e01_hi);

        __lsx_vst(_lane0, plane0, 0);
        __lsx_vst(_lane1, plane1, 0);
        __lsx_vst(_lane2, plane2, 0);
        __lsx_vst(_lane3, plane3, 0);

        src += 16;
        plane0 += 4;
        plane1 += 4;
        plane2 += 4;
        plane3 += 4;
    }
    // leftover elements, one packed element at a time
    for (; remain > 0; remain--)
    {
        *plane0++ = src[0];
        *plane1++ = src[1];
        *plane2++ = src[2];
        *plane3++ = src[3];

        src += 4;
    }
}
#endif // __loongarch_sx

// Flatten a blob into a 1-D blob.
//
// bottom_blob  input blob
// top_blob     output; either shares the input data or is freshly allocated
// opt          supplies packing preference, blob allocator and thread count
//
// 8-bit inputs are forwarded to forward_int8(). The output uses elempack 4
// when packing is requested and the total element count is a multiple of 4;
// otherwise the work is delegated to Flatten::forward().
// Returns 0 on success, -100 when the output blob is empty after allocation.
int Flatten_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    if (bottom_blob.elembits() == 8)
        return forward_int8(bottom_blob, top_blob, opt);

    const int dims = bottom_blob.dims;

    // a 1-D input is passed through as is
    if (dims == 1)
    {
        top_blob = bottom_blob;
        return 0;
    }

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;
    const int size = w * h * bottom_blob.d;

    // scalar element count of the whole blob
    const int total = size * channels * elempack;

    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout && total % 4 == 0)
        out_elempack = 4;
#endif
    const size_t out_elemsize = bottom_blob.elemsize / elempack * out_elempack;

    if (out_elempack == 1)
        return Flatten::forward(bottom_blob, top_blob, opt);

    // from here on out_elempack == 4

    if (dims == 2 && elempack == 1)
    {
        // only the header is rewritten, the data is shared with the input
        top_blob = bottom_blob;
        top_blob.dims = 1;
        top_blob.w = total / out_elempack;
        top_blob.h = 1;
        top_blob.cstep = bottom_blob.cstep / out_elempack;
        top_blob.elemsize = out_elemsize;
        top_blob.elempack = out_elempack;
        return 0;
    }

    top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (dims == 2)
    {
#if __loongarch_sx
        if (elempack == 4)
        {
            // every packed row expands into four output rows of w floats
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const float* src = bottom_blob.row(i);
                float* dst = (float*)top_blob + w * i * 4;

                flatten_unpack4_to_planes(src, dst, w);
            }
        }
#endif // __loongarch_sx
    }

    if (dims == 3 || dims == 4)
    {
#if __loongarch_sx
        if (elempack == 4)
        {
            // every packed channel expands into four output planes of size floats
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* src = bottom_blob.channel(q);
                float* dst = (float*)top_blob + size * q * 4;

                flatten_unpack4_to_planes(src, dst, size);
            }
        }
#endif // __loongarch_sx

        if (elempack == 1)
        {
            // plain copy of each channel into its slot of the flat output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* src = bottom_blob.channel(q);
                float* dst = (float*)top_blob + size * q;

                int remain = size;
#if __loongarch_sx
                for (; remain >= 4; remain -= 4)
                {
                    __lsx_vst(__lsx_vld(src, 0), dst, 0);
                    src += 4;
                    dst += 4;
                }
#endif // __loongarch_sx
                for (; remain > 0; remain--)
                {
                    *dst++ = *src++;
                }
            }
        }
    }

    return 0;
}

#if __loongarch_sx
// De-interleave `count` pack-8 signed-char elements read from `src` into
// eight consecutive planes of `count` bytes each, starting at `dst`:
// lane k of element j lands at dst[k * count + j].
static void flatten_unpack8_to_planes_int8(const signed char* src, signed char* dst, int count)
{
    for (int j = 0; j < count; j++)
    {
        for (int k = 0; k < 8; k++)
        {
            dst[k * count + j] = src[k];
        }

        src += 8;
    }
}
#endif // __loongarch_sx

// Flatten a blob of 8-bit elements into a 1-D blob.
//
// bottom_blob  input blob, read as signed char
// top_blob     output; either shares the input data or is freshly allocated
// opt          supplies packing preference, blob allocator and thread count
//
// The output uses elempack 8 when packing is requested and the total element
// count is a multiple of 8; otherwise the work is delegated to
// Flatten::forward().
// Returns 0 on success, -100 when the output blob is empty after allocation.
int Flatten_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int dims = bottom_blob.dims;

    // a 1-D input is passed through as is
    if (dims == 1)
    {
        top_blob = bottom_blob;
        return 0;
    }

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;
    const int size = w * h * bottom_blob.d;

    // scalar element count of the whole blob
    const int total = size * channels * elempack;

    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout && total % 8 == 0)
        out_elempack = 8;
#endif
    const size_t out_elemsize = bottom_blob.elemsize / elempack * out_elempack;

    if (out_elempack == 1)
        return Flatten::forward(bottom_blob, top_blob, opt);

    // from here on out_elempack == 8

    if (dims == 2 && elempack == 1)
    {
        // only the header is rewritten, the data is shared with the input
        top_blob = bottom_blob;
        top_blob.dims = 1;
        top_blob.w = total / out_elempack;
        top_blob.h = 1;
        top_blob.cstep = bottom_blob.cstep / out_elempack;
        top_blob.elemsize = out_elemsize;
        top_blob.elempack = out_elempack;
        return 0;
    }

    top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (dims == 2)
    {
#if __loongarch_sx
        if (elempack == 8)
        {
            // every packed row expands into eight output rows of w bytes
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const signed char* src = bottom_blob.row<signed char>(i);
                signed char* dst = (signed char*)top_blob + w * i * 8;

                flatten_unpack8_to_planes_int8(src, dst, w);
            }
        }
#endif // __loongarch_sx
    }

    if (dims == 3 || dims == 4)
    {
#if __loongarch_sx
        if (elempack == 8)
        {
            // every packed channel expands into eight output planes of size bytes
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const signed char* src = bottom_blob.channel(q);
                signed char* dst = (signed char*)top_blob + size * q * 8;

                flatten_unpack8_to_planes_int8(src, dst, size);
            }
        }
#endif // __loongarch_sx

        if (elempack == 1)
        {
            // plain copy of each channel into its slot of the flat output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const signed char* src = bottom_blob.channel(q);
                signed char* dst = (signed char*)top_blob + size * q;

                for (int remain = size; remain > 0; remain--)
                {
                    *dst++ = *src++;
                }
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/flatten_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_FLATTEN_LOONGARCH_H
#define LAYER_FLATTEN_LOONGARCH_H

#include "flatten.h"

namespace ncnn {

// LoongArch-specific variant of the Flatten layer.
// Derives from Flatten and supplies its own forward pass
// (defined in flatten_loongarch.cpp).
class Flatten_loongarch : public Flatten
{
public:
    Flatten_loongarch();

    // Flattens bottom_blob into a 1-D top_blob; 8-bit inputs are routed to
    // forward_int8(). Returns 0 on success and a negative value on failure.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
    // Flatten path for blobs whose elements are 8 bits wide.
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_FLATTEN_LOONGARCH_H


================================================
FILE: src/layer/loongarch/hardsigmoid_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "hardsigmoid_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

// Constructor: when the code is compiled with __loongarch_sx enabled,
// mark the layer as supporting packed layouts (support_packing = true).
HardSigmoid_loongarch::HardSigmoid_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

// Apply the hard-sigmoid activation to every float of the blob, in place.
//
// bottom_top_blob  blob that is both read and overwritten
// opt              supplies the thread count for the per-channel loop
//
// The vector path computes min(max(alpha * x + beta, 0), 1); the scalar path
// yields 0 below `lower`, 1 above `upper` and alpha * x + beta in between.
// Always returns 0.
int HardSigmoid_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    // number of floats per channel, packed lanes included
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* data = bottom_top_blob.channel(q);

        int remain = size;
#if __loongarch_sx
        const __m128 _floor = (__m128)__lsx_vreplgr2vr_w(0);
        const __m128 _ceil = (__m128)__lsx_vreplfr2vr_s(1.f);
        const __m128 _slope = (__m128)__lsx_vreplfr2vr_s(alpha);
        const __m128 _offset = (__m128)__lsx_vreplfr2vr_s(beta);
        // four floats per iteration
        for (; remain >= 4; remain -= 4)
        {
            __builtin_prefetch(data + 16);
            __m128 _v = (__m128)__lsx_vld(data, 0);
            _v = __lsx_vfmadd_s(_slope, _v, _offset);
            _v = __lsx_vfmax_s(_v, _floor);
            _v = __lsx_vfmin_s(_v, _ceil);
            __lsx_vst(_v, data, 0);

            data += 4;
        }
#endif // __loongarch_sx
        // scalar tail (or the whole channel when LSX is unavailable)
        for (; remain > 0; remain--)
        {
            const float x = *data;
            if (x < lower)
            {
                *data = 0.f;
            }
            else if (x > upper)
            {
                *data = 1.f;
            }
            else
            {
                *data = x * alpha + beta;
            }
            ++data;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/hardsigmoid_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_HARDSIGMOID_LOONGARCH_H
#define LAYER_HARDSIGMOID_LOONGARCH_H

#include "hardsigmoid.h"

namespace ncnn {

// LoongArch-specific variant of the HardSigmoid layer.
// Derives from HardSigmoid and supplies its own in-place forward pass
// (defined in hardsigmoid_loongarch.cpp).
class HardSigmoid_loongarch : public HardSigmoid
{
public:
    HardSigmoid_loongarch();

    // Applies the activation to the blob in place; returns 0.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_HARDSIGMOID_LOONGARCH_H


================================================
FILE: src/layer/loongarch/hardswish_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "hardswish_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

// Constructor: when the code is compiled with __loongarch_sx enabled,
// mark the layer as supporting packed layouts (support_packing = true).
HardSwish_loongarch::HardSwish_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

// Apply the hard-swish activation to every float of the blob, in place.
//
// bottom_top_blob  blob that is both read and overwritten
// opt              supplies the thread count for the per-channel loop
//
// The vector path computes x * min(max(alpha * x + beta, 0), 1); the scalar
// path yields 0 below `lower`, leaves values above `upper` unchanged and
// computes x * (alpha * x + beta) in between.
// Always returns 0.
int HardSwish_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    // number of floats per channel, packed lanes included
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* data = bottom_top_blob.channel(q);

        int remain = size;
#if __loongarch_sx
        const __m128 _floor = (__m128)__lsx_vreplgr2vr_w(0);
        const __m128 _ceil = (__m128)__lsx_vreplfr2vr_s(1.f);
        const __m128 _slope = (__m128)__lsx_vreplfr2vr_s(alpha);
        const __m128 _offset = (__m128)__lsx_vreplfr2vr_s(beta);
        // four floats per iteration
        for (; remain >= 4; remain -= 4)
        {
            __builtin_prefetch(data + 16);
            __m128 _x = (__m128)__lsx_vld(data, 0);
            __m128 _gate = __lsx_vfmadd_s(_slope, _x, _offset);
            _gate = __lsx_vfmax_s(_gate, _floor);
            _gate = __lsx_vfmin_s(_gate, _ceil);
            _gate = __lsx_vfmul_s(_gate, _x);
            __lsx_vst(_gate, data, 0);

            data += 4;
        }
#endif // __loongarch_sx
        // scalar tail (or the whole channel when LSX is unavailable)
        for (; remain > 0; remain--)
        {
            const float x = *data;
            if (x < lower)
            {
                *data = 0.f;
            }
            else if (!(x > upper))
            {
                // values above `upper` pass through untouched
                *data = x * (x * alpha + beta);
            }
            ++data;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/hardswish_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_HARDSWISH_LOONGARCH_H
#define LAYER_HARDSWISH_LOONGARCH_H

#include "hardswish.h"

namespace ncnn {

// LoongArch-specific variant of the HardSwish layer.
// It derives from HardSwish and declares its own constructor and in-place
// forward pass; both are defined in hardswish_loongarch.cpp.
class HardSwish_loongarch : public HardSwish
{
public:
    HardSwish_loongarch();

    // Applies hard-swish to bottom_top_blob in place; returns an int status code.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_HARDSWISH_LOONGARCH_H


================================================
FILE: src/layer/loongarch/innerproduct_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "innerproduct_loongarch.h"

#include "layer_type.h"

#if __loongarch_sx
#include <lsxintrin.h>
#include "lsx_mathfun.h"
#endif // __loongarch_sx

#include "loongarch_activation.h"

namespace ncnn {

// Constructor: starts with no flatten helper layer (it is created later in
// create_pipeline) and sets support_packing when built with LSX.
InnerProduct_loongarch::InnerProduct_loongarch()
{
    // helper layer is allocated lazily by create_pipeline()
    flatten = 0;

#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

int InnerProduct_loongarch::create_pipeline(const Option& opt)
{
    {
        flatten = ncnn::create_layer_cpu(ncnn::LayerType::Flatten);

        ncnn::ParamDict pd;

        flatten->load_param(pd);

        flatten->create_pipeline(opt);
    }

#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return create_pipeline_int8_loongarch(opt);
    }
#endif

#if __loongarch_sx
    if (opt.use_fp16_storage)
    {
        return create_pipeline_fp16s(opt);
    }
#endif

    const int num_input = weight_data_size / num_output;

    int out_elempack = 1;

#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __loongarch_sx

    if (out_elempack == 4)
    {
        // src = inch-outch
        // dst = 4-inch-outch/4
        {
            Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

            weight_data_tm.create(num_input, num_output / 4, (size_t)4u * 4, 4);

            for (int q = 0; q + 3 < num_output; q += 4)
            {
                float* g0 = weight_data_tm.row(q / 4);

                for (int p = 0; p < num_input; p++)
                {
                    for (int j = 0; j < 4; j++)
                    {
                        *g0++ = weight_data_r2.row(q + j)[p];
                    }
                }
            }
        }
    }
    else
    {
        weight_data_tm = weight_data;
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

// Tears down the Flatten helper layer created by create_pipeline, if any.
// Safe to call when no helper exists. Always returns 0.
int InnerProduct_loongarch::destroy_pipeline(const Option& opt)
{
    // nothing to do when the helper was never created (or already destroyed)
    if (!flatten)
        return 0;

    flatten->destroy_pipeline(opt);
    delete flatten;
    flatten = 0; // guard against double destruction

    return 0;
}

// Fully-connected (inner product) forward pass with fp32 weights.
//
// Dispatch:
//   - int8 path when int8 inference is enabled and int8_scale_term is set;
//   - fp16-storage path when built with LSX and opt.use_fp16_storage is set;
//   - "gemm" path when the input is 2-D and its width equals num_input:
//     every one of the h rows is multiplied by the weight matrix and the
//     output keeps the input's elemsize / elempack;
//   - otherwise the input is flattened to 1-D (via the Flatten helper layer,
//     using the workspace allocator) and multiplied by the weight matrix.
//
// weight_data_tm is expected in the layout produced by create_pipeline:
// interleaved by 4 outputs when the output is packed, plain row-major
// (num_input weights per output) otherwise.
//
// Returns 0 on success, -100 when the output blob cannot be allocated, or the
// non-zero status reported by the Flatten helper.
int InnerProduct_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (opt.use_int8_inference && int8_scale_term)
    {
        return forward_int8_loongarch(bottom_blob, top_blob, opt);
    }
#endif

#if __loongarch_sx
    if (opt.use_fp16_storage)
    {
        return forward_fp16s(bottom_blob, top_blob, opt);
    }
#endif

    const int num_input = weight_data_size / num_output;

    if (bottom_blob.dims == 2 && bottom_blob.w == num_input)
    {
        // gemm
        int h = bottom_blob.h;
        size_t elemsize = bottom_blob.elemsize;
        int elempack = bottom_blob.elempack;

        top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // must match the decision taken in create_pipeline, since it
        // determines the layout of weight_data_tm
        int num_output_elempack = 1;
#if __loongarch_sx
        if (opt.use_packing_layout)
        {
            num_output_elempack = num_output % 4 == 0 ? 4 : 1;
        }
#endif

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int j = 0; j < h; j++)
        {
#if __loongarch_sx
            // packed input rows, interleaved weights:
            // each accumulator holds one output for the 4 packed rows
            if (elempack == 4 && num_output_elempack == 4)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const float* kptr = weight_data_tm.row(p);
                    const float* m = bottom_blob.row(j);

                    __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0);
                    __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0);
                    __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0);
                    __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0);

                    if (bias_term)
                    {
                        // the same bias applies to all 4 packed rows
                        _sum0 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 0]);
                        _sum1 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 1]);
                        _sum2 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 2]);
                        _sum3 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 3]);
                    }

                    int i = 0;
                    for (; i < num_input; i++)
                    {
                        __builtin_prefetch(m + 16);
                        __builtin_prefetch(kptr + 16);
                        // _val: input i of the 4 packed rows
                        // _w:   weights of outputs p*4..p*4+3 for input i
                        __m128 _val = (__m128)__lsx_vld(m, 0);
                        __m128i _w = __lsx_vld(kptr, 0);
                        _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 0), _val, _sum0);
                        _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 1), _val, _sum1);
                        _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 2), _val, _sum2);
                        _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 3), _val, _sum3);

                        m += 4;
                        kptr += 4;
                    }

                    _sum0 = activation_ps(_sum0, activation_type, activation_params);
                    _sum1 = activation_ps(_sum1, activation_type, activation_params);
                    _sum2 = activation_ps(_sum2, activation_type, activation_params);
                    _sum3 = activation_ps(_sum3, activation_type, activation_params);

                    __lsx_vst(_sum0, outptr, 0);
                    __lsx_vst(_sum1, outptr + 4, 0);
                    __lsx_vst(_sum2, outptr + 8, 0);
                    __lsx_vst(_sum3, outptr + 12, 0);
                    outptr += 16;
                }
            }

            // unpacked input row, interleaved weights:
            // one vector result holds 4 consecutive outputs
            if (elempack == 1 && num_output_elempack == 4)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const float* kptr = weight_data_tm.row(p);
                    const float* m = bottom_blob.row(j);

                    // four partial accumulators, summed after the loop
                    __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0);
                    __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0);
                    __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0);
                    __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0);

                    if (bias_term)
                    {
                        _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0);
                    }

                    int i = 0;
                    for (; i + 3 < num_input; i += 4)
                    {
                        __builtin_prefetch(m + 16);
                        __builtin_prefetch(kptr + 64);
                        // 4 inputs at once; each is broadcast against its own
                        // group of 4 output weights
                        __m128i _val = __lsx_vld(m, 0);
                        __m128 _w0 = (__m128)__lsx_vld(kptr, 0);
                        __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0);
                        __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0);
                        __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0);
                        _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0);
                        _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1);
                        _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2);
                        _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3);

                        m += 4;
                        kptr += 16;
                    }
                    // leftover inputs, one at a time
                    for (; i < num_input; i++)
                    {
                        __m128 _val = __lsx_vreplfr2vr_s(m[0]);
                        __m128 _w = (__m128)__lsx_vld(kptr, 0);
                        _sum0 = __lsx_vfmadd_s(_w, _val, _sum0);

                        m += 1;
                        kptr += 4;
                    }

                    _sum0 = __lsx_vfadd_s(_sum0, _sum1);
                    _sum2 = __lsx_vfadd_s(_sum2, _sum3);
                    _sum0 = __lsx_vfadd_s(_sum0, _sum2);

                    _sum0 = activation_ps(_sum0, activation_type, activation_params);

                    __lsx_vst(_sum0, outptr, 0);
                    outptr += 4;
                }
            }

            // packed input rows, plain row-major weights:
            // each weight is broadcast over the 4 packed rows
            if (elempack == 4 && num_output_elempack == 1)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output; p++)
                {
                    const float* kptr = (const float*)weight_data_tm + num_input * p;
                    const float* m = bottom_blob.row(j);

                    __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);

                    if (bias_term)
                    {
                        _sum = __lsx_vreplfr2vr_s(bias_data[p]);
                    }

                    for (int i = 0; i < num_input; i++)
                    {
                        __builtin_prefetch(m + 16);
                        __builtin_prefetch(kptr + 4);
                        __m128 _val = (__m128)__lsx_vld(m, 0);
                        __m128 _k = __lsx_vreplfr2vr_s(kptr[0]);
                        _sum = __lsx_vfmadd_s(_k, _val, _sum);

                        m += 4;
                        kptr += 1;
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    __lsx_vst(_sum, outptr, 0);
                    outptr += 4;
                }
            }
#endif // __loongarch_sx

            // unpacked input row, plain row-major weights: scalar dot product
            // per output, vectorized over the inputs when LSX is available
            if (elempack == 1 && num_output_elempack == 1)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output; p++)
                {
                    const float* kptr = (const float*)weight_data_tm + num_input * p;
                    const float* m = bottom_blob.row(j);

                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    int i = 0;
#if __loongarch_sx
                    __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);
                    for (; i + 3 < num_input; i += 4)
                    {
                        __builtin_prefetch(m + 16);
                        __builtin_prefetch(kptr + 16);
                        __m128 _m = (__m128)__lsx_vld(m, 0);
                        __m128 _w = (__m128)__lsx_vld(kptr, 0);
                        _sum = __lsx_vfmadd_s(_w, _m, _sum);

                        m += 4;
                        kptr += 4;
                    }
                    // fold the 4 lanes into the scalar accumulator
                    sum += __lsx_reduce_fadd_s(_sum);
#endif // __loongarch_sx
                    for (; i < num_input; i++)
                    {
                        sum += *m * *kptr;

                        m += 1;
                        kptr += 1;
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[0] = sum;
                    outptr += 1;
                }
            }
        }

        return 0;
    }

    // flatten
    Mat bottom_blob_flattened = bottom_blob;
    if (bottom_blob.dims != 1)
    {
        Option opt_flatten = opt;
        opt_flatten.blob_allocator = opt.workspace_allocator;

        // propagate a failure instead of reading from an empty blob below
        int ret = flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
        if (ret != 0)
            return ret;
    }

    size_t elemsize = bottom_blob_flattened.elemsize;
    int elempack = bottom_blob_flattened.elempack;

    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __loongarch_sx
    // per-scalar size of the input, scaled by the output packing
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

#if __loongarch_sx
    if (out_elempack == 4)
    {
        // interleaved weights: each iteration produces 4 consecutive outputs
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            // four partial accumulators, summed after the loop
            __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0);
            __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0);
            __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0);
            __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0);

            if (bias_term)
            {
                _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0);
            }

            const float* kptr = weight_data_tm.row(p);

            const float* sptr = bottom_blob_flattened;

            int i = 0;
            for (; i + 3 < num_input; i += 4)
            {
                __builtin_prefetch(sptr + 16);
                __builtin_prefetch(kptr + 64);
                __m128i _val = __lsx_vld(sptr, 0);
                __m128 _w0 = (__m128)__lsx_vld(kptr, 0);
                __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0);
                __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0);
                __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0);
                _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0);
                _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1);
                _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2);
                _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3);

                sptr += 4;
                kptr += 16;
            }
            // leftover inputs, one at a time
            for (; i < num_input; i++)
            {
                __m128 _val = __lsx_vreplfr2vr_s(sptr[0]);
                __m128 _w = (__m128)__lsx_vld(kptr, 0);
                _sum0 = __lsx_vfmadd_s(_w, _val, _sum0);

                sptr += 1;
                kptr += 4;
            }

            _sum0 = __lsx_vfadd_s(_sum0, _sum1);
            _sum2 = __lsx_vfadd_s(_sum2, _sum3);
            _sum0 = __lsx_vfadd_s(_sum0, _sum2);

            _sum0 = activation_ps(_sum0, activation_type, activation_params);

            float* outptr = top_blob;
            __lsx_vst(_sum0, outptr + p * 4, 0);
        }
    }
#endif // __loongarch_sx

    if (out_elempack == 1)
    {
        // plain row-major weights: outputs are processed four at a time so
        // the input vector is loaded once per group, then the remainder
        // is handled one output at a time
        int nn_num_output = num_output / 4;
        int remain_num_output_start = nn_num_output * 4;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int pp = 0; pp < nn_num_output; pp++)
        {
            int p = pp * 4;

            float sum0 = 0.f;
            float sum1 = 0.f;
            float sum2 = 0.f;
            float sum3 = 0.f;

            if (bias_term)
            {
                sum0 = bias_data[p];
                sum1 = bias_data[p + 1];
                sum2 = bias_data[p + 2];
                sum3 = bias_data[p + 3];
            }

            const float* w0 = (const float*)weight_data_tm + num_input * p;
            const float* w1 = (const float*)weight_data_tm + num_input * (p + 1);
            const float* w2 = (const float*)weight_data_tm + num_input * (p + 2);
            const float* w3 = (const float*)weight_data_tm + num_input * (p + 3);

            const float* m = bottom_blob_flattened;

            int i = 0;
#if __loongarch_sx
            __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0);
            __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0);
            __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0);
            __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0);
            for (; i + 3 < num_input; i += 4)
            {
                __builtin_prefetch(m + 16);
                __builtin_prefetch(w0 + 16);
                __builtin_prefetch(w1 + 16);
                __builtin_prefetch(w2 + 16);
                __builtin_prefetch(w3 + 16);
                __m128 _m = (__m128)__lsx_vld(m, 0);
                __m128 _w0 = (__m128)__lsx_vld(w0, 0);
                __m128 _w1 = (__m128)__lsx_vld(w1, 0);
                __m128 _w2 = (__m128)__lsx_vld(w2, 0);
                __m128 _w3 = (__m128)__lsx_vld(w3, 0);
                _sum0 = __lsx_vfmadd_s(_w0, _m, _sum0);
                _sum1 = __lsx_vfmadd_s(_w1, _m, _sum1);
                _sum2 = __lsx_vfmadd_s(_w2, _m, _sum2);
                _sum3 = __lsx_vfmadd_s(_w3, _m, _sum3);

                m += 4;
                w0 += 4;
                w1 += 4;
                w2 += 4;
                w3 += 4;
            }
#endif // __loongarch_sx
            for (; i < num_input; i++)
            {
                sum0 += *m * *w0;
                sum1 += *m * *w1;
                sum2 += *m * *w2;
                sum3 += *m * *w3;

                m++;
                w0++;
                w1++;
                w2++;
                w3++;
            }

#if __loongarch_sx
            // fold the vector partial sums into the scalar accumulators
            sum0 += __lsx_reduce_fadd_s(_sum0);
            sum1 += __lsx_reduce_fadd_s(_sum1);
            sum2 += __lsx_reduce_fadd_s(_sum2);
            sum3 += __lsx_reduce_fadd_s(_sum3);
#endif // __loongarch_sx

            sum0 = activation_ss(sum0, activation_type, activation_params);
            sum1 = activation_ss(sum1, activation_type, activation_params);
            sum2 = activation_ss(sum2, activation_type, activation_params);
            sum3 = activation_ss(sum3, activation_type, activation_params);

            top_blob[p] = sum0;
            top_blob[p + 1] = sum1;
            top_blob[p + 2] = sum2;
            top_blob[p + 3] = sum3;
        }

        // remaining outputs (num_output % 4 of them)
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = remain_num_output_start; p < num_output; p++)
        {
            float sum = 0.f;

            if (bias_term)
                sum = bias_data[p];

            const float* w = (const float*)weight_data_tm + num_input * p;

            const float* m = bottom_blob_flattened;

            int i = 0;
#if __loongarch_sx
            __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0);
            for (; i + 3 < num_input; i += 4)
            {
                __builtin_prefetch(m + 16);
                __builtin_prefetch(w + 16);
                __m128 _m = (__m128)__lsx_vld(m, 0);
                __m128 _w = (__m128)__lsx_vld(w, 0);
                _sum0 = __lsx_vfmadd_s(_w, _m, _sum0);

                m += 4;
                w += 4;
            }
            sum += __lsx_reduce_fadd_s(_sum0);
#endif // __loongarch_sx
            for (; i < num_input; i++)
            {
                sum += *m * *w;

                m++;
                w++;
            }

            sum = activation_ss(sum, activation_type, activation_params);

            top_blob[p] = sum;
        }
    }

    return 0;
}

#if __loongarch_sx
int InnerProduct_loongarch::create_pipeline_fp16s(const Option& opt)
{
    const int num_input = weight_data_size / num_output;

    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }

    // src = inch-outch
    // dst = pb-inch-outch/pb
    if (out_elempack == 4)
    {
        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

        weight_data_tm.create(num_input, num_output / 4, (size_t)8u, 4);

        for (int q = 0; q + 3 < num_output; q += 4)
        {
            unsigned short* g0 = weight_data_tm.row<unsigned short>(q / 4);

            const float* k0 = weight_data_r2.row(q);
            const float* k1 = weight_data_r2.row(q + 1);
            const float* k2 = weight_data_r2.row(q + 2);
            const float* k3 = weight_data_r2.row(q + 3);

            int p = 0;
            for (; p + 3 < num_input; p += 4)
            {
                // transpose 4x4
                __m128i _r0 = __lsx_vld(k0, 0);
                __m128i _r1 = __lsx_vld(k1, 0);
                __m128i _r2 = __lsx_vld(k2, 0);
                __m128i _r3 = __lsx_vld(k3, 0);

                __m128i _r01r = __lsx_vilvl_w(_r1, _r0);
                __m128i _r01l = __lsx_vilvh_w(_r1, _r0);
                __m128i _r23r = __lsx_vilvl_w(_r3, _r2);
                __m128i _r23l = __lsx_vilvh_w(_r3, _r2);
                __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r);
                __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r);
                __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l);
                __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l);

                __m128i _p0 = __lsx_vfcvt_h_s((__m128)_r0123_1, (__m128)_r0123_0);
                __m128i _p1 = __lsx_vfcvt_h_s((__m128)_r0123_3, (__m128)_r0123_2);

                __lsx_vst(_p0, g0, 0);
                __lsx_vst(_p1, g0 + 8, 0);

                k0 += 4;
                k1 += 4;
                k2 += 4;
                k3 += 4;
                g0 += 16;
            }
            for (; p < num_input; p++)
            {
                g0[0] = float32_to_float16(*k0++);
                g0[1] = float32_to_float16(*k1++);
                g0[2] = float32_to_float16(*k2++);
                g0[3] = float32_to_float16(*k3++);
                g0 += 4;
            }
        }
    }

    if (out_elempack == 1)
    {
        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);
        ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt);
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

int InnerProduct_loongarch::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int num_input = weight_data_size / num_output;

    if (bottom_blob.dims == 2 && bottom_blob.w == num_input)
    {
        // gemm
        int h = bottom_blob.h;
        size_t elemsize = bottom_blob.elemsize;
        int elempack = bottom_blob.elempack;

        top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        int num_output_elempack = 1;
        if (opt.use_packing_layout)
        {
            num_output_elempack = num_output % 4 == 0 ? 4 : 1;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int j = 0; j < h; j++)
        {
            if (elempack == 4 && num_output_elempack == 4)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const unsigned short* kptr = weight_data_tm.row<const unsigned short>(p);
                    const float* m = bottom_blob.row(j);

                    __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0);
                    __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0);
                    __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0);
                    __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0);

                    if (bias_term)
                    {
                        _sum0 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 0]);
                        _sum1 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 1]);
                        _sum2 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 2]);
                        _sum3 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 3]);
                    }

                    int i = 0;
                    for (; i < num_input; i++)
                    {
                        __builtin_prefetch(m + 16);
                        __builtin_prefetch(kptr + 16);
                        __m128 _val = (__m128)__lsx_vld(m, 0);
                        __m128i _w = (__m128i)__lsx_vfcvtl_s_h(__lsx_vld(kptr, 0));
                        _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 0), _val, _sum0);
                        _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 1), _val, _sum1);
                        _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 2), _val, _sum2);
                        _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 3), _val, _sum3);

                        m += 4;
                        kptr += 4;
                    }

                    _sum0 = activation_ps(_sum0, activation_type, activation_params);
                    _sum1 = activation_ps(_sum1, activation_type, activation_params);
                    _sum2 = activation_ps(_sum2, activation_type, activation_params);
                    _sum3 = activation_ps(_sum3, activation_type, activation_params);

                    __lsx_vst(_sum0, outptr, 0);
                    __lsx_vst(_sum1, outptr + 4, 0);
                    __lsx_vst(_sum2, outptr + 8, 0);
                    __lsx_vst(_sum3, outptr + 12, 0);
                    outptr += 16;
                }
            }

            if (elempack == 1 && num_output_elempack == 4)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const unsigned short* kptr = weight_data_tm.row<const unsigned short>(p);
                    const float* m = bottom_blob.row(j);

                    __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0);
                    __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0);
                    __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0);
                    __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0);

                    if (bias_term)
                    {
                        _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0);
                    }

                    int i = 0;
                    for (; i + 3 < num_input; i += 4)
                    {
                        __builtin_prefetch(m + 16);
                        __builtin_prefetch(kptr + 64);
                        __m128i _val = __lsx_vld(m, 0);
                        __m128i _w01 = __lsx_vld(kptr, 0);
                        __m128i _w23 = __lsx_vld(kptr + 8, 0);
                        __m128 _w0 = __lsx_vfcvtl_s_h(_w01);
                        __m128 _w1 = __lsx_vfcvth_s_h(_w01);
                        __m128 _w2 = __lsx_vfcvtl_s_h(_w23);
                        __m128 _w3 = __lsx_vfcvth_s_h(_w23);
                        _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0);
                        _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1);
                        _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2);
                        _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3);

                        m += 4;
                        kptr += 16;
                    }
                    for (; i < num_input; i++)
                    {
                        __m128 _val = __lsx_vreplfr2vr_s(m[0]);
                        __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(kptr, 0));
                        _sum0 = __lsx_vfmadd_s(_w, _val, _sum0);

                        m += 1;
                        kptr += 4;
                    }

                    _sum0 = __lsx_vfadd_s(_sum0, _sum1);
                    _sum2 = __lsx_vfadd_s(_sum2, _sum3);
                    _sum0 = __lsx_vfadd_s(_sum0, _sum2);

                    _sum0 = activation_ps(_sum0, activation_type, activation_params);

                    __lsx_vst(_sum0, outptr, 0);
                    outptr += 4;
                }
            }

            if (elempack == 4 && num_output_elempack == 1)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output; p++)
                {
                    const unsigned short* kptr = weight_data_tm.row<const unsigned short>(p);
                    const float* m = bottom_blob.row(j);

                    __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);

                    if (bias_term)
                    {
                        _sum = __lsx_vreplfr2vr_s(bias_data[p]);
                    }

                    for (int i = 0; i < num_input; i++)
                    {
                        __builtin_prefetch(m + 16);
                        __builtin_prefetch(kptr + 4);
                        __m128 _val = (__m128)__lsx_vld(m, 0);
                        __m128 _k = __lsx_vreplfr2vr_s(float16_to_float32(kptr[0]));
                        _sum = __lsx_vfmadd_s(_k, _val, _sum);

                        m += 4;
                        kptr += 1;
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    __lsx_vst(_sum, outptr, 0);
                    outptr += 4;
                }
            }

            if (elempack == 1 && num_output_elempack == 1)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output; p++)
                {
                    const unsigned short* kptr = weight_data_tm.row<const unsigned short>(p);
                    const float* m = bottom_blob.row(j);

                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    int i = 0;
                    __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);
                    for (; i + 3 < num_input; i += 4)
                    {
                        __builtin_prefetch(m + 16);
                        __builtin_prefetch(kptr + 16);
                        __m128 _m = (__m128)__lsx_vld(m, 0);
                        __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(kptr, 0));
                        _sum = __lsx_vfmadd_s(_w, _m, _sum);

                        m += 4;
                        kptr += 4;
                    }
                    sum += __lsx_reduce_fadd_s(_sum);
                    for (; i < num_input; i++)
                    {
                        sum += *m * float16_to_float32(*kptr);

                        m += 1;
                        kptr += 1;
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[0] = sum;
                    outptr += 1;
                }
            }
        }

        return 0;
    }

    // flatten
    Mat bottom_blob_flattened = bottom_blob;
    if (bottom_blob.dims != 1)
    {
        Option opt_flatten = opt;
        opt_flatten.blob_allocator = opt.workspace_allocator;

        flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten);
    }

    size_t elemsize = bottom_blob_flattened.elemsize;
    int elempack = bottom_blob_flattened.elempack;

    int out_elempack = 1;
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    if (out_elempack == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0);
            __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0);
            __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0);
            __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0);

            if (bias_term)
            {
                _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0);
            }

            const unsigned short* kptr = weight_data_tm.row<const unsigned short>(p);

            const float* sptr = bottom_blob_flattened;

            int i = 0;
            for (; i + 3 < num_input; i += 4)
            {
                __builtin_prefetch(sptr + 16);
                __builtin_prefetch(kptr + 64);
                __m128i _val = __lsx_vld(sptr, 0);
                __m128i _w01 = __lsx_vld(kptr, 0);
                __m128i _w23 = __lsx_vld(kptr + 8, 0);
                __m128 _w0 = __lsx_vfcvtl_s_h(_w01);
                __m128 _w1 = __lsx_vfcvth_s_h(_w01);
                __m128 _w2 = __lsx_vfcvtl_s_h(_w23);
                __m128 _w3 = __lsx_vfcvth_s_h(_w23);
                _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0);
                _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1);
                _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2);
                _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3);

                sptr += 4;
                kptr += 16;
            }
            for (; i < num_input; i++)
            {
                __m128 _val = __lsx_vreplfr2vr_s(sptr[0]);
                __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(kptr, 0));
                _sum0 = __lsx_vfmadd_s(_w, _val, _sum0);

                sptr += 1;
                kptr += 4;
            }

            _sum0 = __lsx_vfadd_s(_sum0, _sum1);
            _sum2 = __lsx_vfadd_s(_sum2, _sum3);
            _sum0 = __lsx_vfadd_s(_sum0, _sum2);

            _sum0 = activation_ps(_sum0, activation_type, activation_params);

            float* outptr = top_blob;
            __lsx_vst(_sum0, outptr + p * 4, 0);
        }
    }

    if (out_elempack == 1)
    {
        int nn_num_output = num_output / 4;
        int remain_num_output_start = nn_num_output * 4;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int pp = 0; pp < nn_num_output; pp++)
        {
            int p = pp * 4;

            float sum0 = 0.f;
            float sum1 = 0.f;
            float sum2 = 0.f;
            float sum3 = 0.f;

            if (bias_term)
            {
                sum0 = bias_data[p];
                sum1 = bias_data[p + 1];
                sum2 = bias_data[p + 2];
                sum3 = bias_data[p + 3];
            }

            const unsigned short* w0 = weight_data_tm.row<const unsigned short>(p);
            const unsigned short* w1 = weight_data_tm.row<const unsigned short>(p + 1);
            const unsigned short* w2 = weight_data_tm.row<const unsigned short>(p + 2);
            const unsigned short* w3 = weight_data_tm.row<const unsigned short>(p + 3);

            const float* m = bottom_blob_flattened;

            int i = 0;
            __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0);
            __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0);
            __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0);
            __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0);
            for (; i + 3 < num_input; i += 4)
            {
                __builtin_prefetch(m + 16);
                __builtin_prefetch(w0 + 16);
                __builtin_prefetch(w1 + 16);
                __builtin_prefetch(w2 + 16);
                __builtin_prefetch(w3 + 16);
                __m128 _m = (__m128)__lsx_vld(m, 0);
                __m128 _w0 = __lsx_vfcvtl_s_h(__lsx_vld(w0, 0));
                __m128 _w1 = __lsx_vfcvtl_s_h(__lsx_vld(w1, 0));
                __m128 _w2 = __lsx_vfcvtl_s_h(__lsx_vld(w2, 0));
                __m128 _w3 = __lsx_vfcvtl_s_h(__lsx_vld(w3, 0));
                _sum0 = __lsx_vfmadd_s(_w0, _m, _sum0);
                _sum1 = __lsx_vfmadd_s(_w1, _m, _sum1);
                _sum2 = __lsx_vfmadd_s(_w2, _m, _sum2);
                _sum3 = __lsx_vfmadd_s(_w3, _m, _sum3);

                m += 4;
                w0 += 4;
                w1 += 4;
                w2 += 4;
                w3 += 4;
            }
            for (; i < num_input; i++)
            {
                sum0 += *m * float16_to_float32(*w0);
                sum1 += *m * float16_to_float32(*w1);
                sum2 += *m * float16_to_float32(*w2);
                sum3 += *m * float16_to_float32(*w3);

                m++;
                w0++;
                w1++;
                w2++;
                w3++;
            }

            sum0 += __lsx_reduce_fadd_s(_sum0);
            sum1 += __lsx_reduce_fadd_s(_sum1);
            sum2 += __lsx_reduce_fadd_s(_sum2);
            sum3 += __lsx_reduce_fadd_s(_sum3);

            sum0 = activation_ss(sum0, activation_type, activation_params);
            sum1 = activation_ss(sum1, activation_type, activation_params);
            sum2 = activation_ss(sum2, activation_type, activation_params);
            sum3 = activation_ss(sum3, activation_type, activation_params);

            top_blob[p] = sum0;
            top_blob[p + 1] = sum1;
            top_blob[p + 2] = sum2;
            top_blob[p + 3] = sum3;
        }

        // num_output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = remain_num_output_start; p < num_output; p++)
        {
            float sum = 0.f;

            if (bias_term)
                sum = bias_data[p];

            const unsigned short* w = weight_data_tm.row<const unsigned short>(p);

            const float* m = bottom_blob_flattened;

            int i = 0;
            __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0);
            for (; i + 3 < num_input; i += 4)
            {
                __builtin_prefetch(m + 16);
                __builtin_prefetch(w + 16);
                __m128 _m = (__m128)__lsx_vld(m, 0);
                __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(w, 0));
                _sum0 = __lsx_vfmadd_s(_w, _m, _sum0);

                m += 4;
                w += 4;
            }
            sum += __lsx_reduce_fadd_s(_sum0);
            for (; i < num_input; i++)
            {
                sum += *m * float16_to_float32(*w);

                m++;
                w++;
            }

            sum = activation_ss(sum, activation_type, activation_params);

            top_blob[p] = sum;
        }
    }

    return 0;
}
#endif // __loongarch_sx

#if NCNN_INT8
// Prepare the int8 inference state: repack the int8 weights into weight_data_tm
// and precompute the per-output dequantization scales in scale_in_data.
// Always returns 0.
int InnerProduct_loongarch::create_pipeline_int8_loongarch(const Option& opt)
{
    const int num_input = weight_data_size / num_output;

    // with LSX and packing enabled, 8 output channels share one packed weight row
    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        if (num_output % 8 == 0)
            out_elempack = 8;
    }
#endif // __loongarch_sx

    // weight repack
    //   src = inch-outch
    //   dst = pb-inch-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(num_input, num_output);

        weight_data_tm.create(num_input, num_output / out_elempack, (size_t)out_elempack, out_elempack);

        const int packed_rows = num_output / out_elempack;
        for (int row = 0; row < packed_rows; row++)
        {
            // first output channel stored in this packed row
            const int q = row * out_elempack;

            signed char* dst = weight_data_tm.row<signed char>(row);

            for (int p = 0; p < num_input; p++)
            {
                // the out_elempack channels of input index p are stored next to each other
                signed char* group = dst + p * out_elempack;
                for (int j = 0; j < out_elempack; j++)
                {
                    group[j] = weight_data_r2.row<signed char>(q + j)[p];
                }
            }
        }
    }

    // dequantization scale per output channel, 0 when the weight scale is 0
    scale_in_data.create(num_output);
    for (int p = 0; p < num_output; p++)
    {
        const bool zero_scale = (weight_data_int8_scales[p] == 0);

        scale_in_data[p] = zero_scale ? 0.f : 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);
    }

    // the original weights are no longer needed once repacked
    if (opt.lightmode)
    {
        weight_data.release();
    }

    return 0;
}

// Int8 inference path of the inner product layer.
//
// bottom_blob is quantized to int8 with bottom_blob_int8_scales unless it is
// already 8-bit. Products are accumulated in 32-bit integers, converted to
// float and multiplied by scale_in_data (the dequantization scale); then the
// bias (when bias_term is set) and the activation are applied. top_blob is
// always created with 4 bytes per scalar.
//
// Two input layouts are handled:
//  - a 2-D input whose width equals num_input takes the "gemm" path, where each
//    of the h rows goes through the layer on its own
//  - any other input is flattened to 1-D and yields a single output vector
//
// Returns 0 on success and -100 when top_blob cannot be allocated.
int InnerProduct_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int num_input = weight_data_size / num_output;

    int elembits = bottom_blob.elembits();

    // quantize into a workspace blob unless the input already is 8-bit
    Mat bottom_blob_int8 = bottom_blob;
    if (elembits != 8)
    {
        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
    }

    if (bottom_blob_int8.dims == 2 && bottom_blob_int8.w == num_input)
    {
        // gemm
        // the kernels below index whole rows, so work on an elempack=1 copy of the input
        Mat bottom_blob_int8_unpacked;
        Option opt_unpack = opt;
        opt_unpack.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob_int8, bottom_blob_int8_unpacked, 1, opt_unpack);

        int h = bottom_blob_int8_unpacked.h;

        // out_elempack is the packing of the output along h: 4 input rows share one output row
        int out_elempack = 1;
#if __loongarch_sx
        if (opt.use_packing_layout)
        {
            out_elempack = h % 4 == 0 ? 4 : 1;
        }
#endif

        int outh = h / out_elempack;

        top_blob.create(num_output, outh, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // num_output_elempack is the packing of weight_data_tm along num_output,
        // derived with the same rule as in create_pipeline_int8_loongarch
        int num_output_elempack = 1;
#if __loongarch_sx
        if (opt.use_packing_layout)
        {
            num_output_elempack = num_output % 8 == 0 ? 8 : 1;
        }
#endif

#if __loongarch_sx
        // 4 input rows x 8 output channels per inner iteration
        if (num_output_elempack == 8 && out_elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int j = 0; j < outh; j++)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const signed char* kptr = weight_data_tm.row<const signed char>(p);
                    const signed char* m0 = bottom_blob_int8_unpacked.row<const signed char>(j * 4);
                    const signed char* m1 = bottom_blob_int8_unpacked.row<const signed char>(j * 4 + 1);
                    const signed char* m2 = bottom_blob_int8_unpacked.row<const signed char>(j * 4 + 2);
                    const signed char* m3 = bottom_blob_int8_unpacked.row<const signed char>(j * 4 + 3);

                    // _sumR0 / _sumR1 hold output channels 0-3 / 4-7 of input row R as int32
                    __m128i _sum00 = __lsx_vreplgr2vr_w(0);
                    __m128i _sum01 = __lsx_vreplgr2vr_w(0);
                    __m128i _sum10 = __lsx_vreplgr2vr_w(0);
                    __m128i _sum11 = __lsx_vreplgr2vr_w(0);
                    __m128i _sum20 = __lsx_vreplgr2vr_w(0);
                    __m128i _sum21 = __lsx_vreplgr2vr_w(0);
                    __m128i _sum30 = __lsx_vreplgr2vr_w(0);
                    __m128i _sum31 = __lsx_vreplgr2vr_w(0);

                    int i = 0;
                    for (; i < num_input; i++)
                    {
                        __builtin_prefetch(m0 + 4);
                        __builtin_prefetch(m1 + 4);
                        __builtin_prefetch(m2 + 4);
                        __builtin_prefetch(m3 + 4);
                        __builtin_prefetch(kptr + 32);
                        // broadcast one input value per row to all 16-bit lanes
                        __m128i _val0 = __lsx_vreplgr2vr_h((short)m0[0]);
                        __m128i _val1 = __lsx_vreplgr2vr_h((short)m1[0]);
                        __m128i _val2 = __lsx_vreplgr2vr_h((short)m2[0]);
                        __m128i _val3 = __lsx_vreplgr2vr_h((short)m3[0]);

                        // 16 bytes are loaded but only the low 8 weights are consumed (kptr advances by 8);
                        // they are sign-extended to 16 bit by interleaving with their sign mask
                        __m128i _w = __lsx_vld(kptr, 0);
                        __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w);

                        // int8 x int8 products fit in 16 bit; widen them to 32 bit the same way before accumulating
                        __m128i _s0 = __lsx_vmul_h(_val0, _w16);
                        __m128i _s1 = __lsx_vmul_h(_val1, _w16);
                        __m128i _s2 = __lsx_vmul_h(_val2, _w16);
                        __m128i _s3 = __lsx_vmul_h(_val3, _w16);
                        __m128i _exts0 = __lsx_vslti_h(_s0, 0);
                        __m128i _exts1 = __lsx_vslti_h(_s1, 0);
                        __m128i _exts2 = __lsx_vslti_h(_s2, 0);
                        __m128i _exts3 = __lsx_vslti_h(_s3, 0);
                        __m128i _s0l = __lsx_vilvl_h(_exts0, _s0);
                        __m128i _s0h = __lsx_vilvh_h(_exts0, _s0);
                        __m128i _s1l = __lsx_vilvl_h(_exts1, _s1);
                        __m128i _s1h = __lsx_vilvh_h(_exts1, _s1);
                        __m128i _s2l = __lsx_vilvl_h(_exts2, _s2);
                        __m128i _s2h = __lsx_vilvh_h(_exts2, _s2);
                        __m128i _s3l = __lsx_vilvl_h(_exts3, _s3);
                        __m128i _s3h = __lsx_vilvh_h(_exts3, _s3);

                        _sum00 = __lsx_vadd_w(_sum00, _s0l);
                        _sum01 = __lsx_vadd_w(_sum01, _s0h);
                        _sum10 = __lsx_vadd_w(_sum10, _s1l);
                        _sum11 = __lsx_vadd_w(_sum11, _s1h);
                        _sum20 = __lsx_vadd_w(_sum20, _s2l);
                        _sum21 = __lsx_vadd_w(_sum21, _s2h);
                        _sum30 = __lsx_vadd_w(_sum30, _s3l);
                        _sum31 = __lsx_vadd_w(_sum31, _s3h);

                        m0++;
                        m1++;
                        m2++;
                        m3++;
                        kptr += 8;
                    }

                    // dequantize, add bias and apply the activation
                    __m128 _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8, 0);
                    __m128 _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8 + 4, 0);

                    __m128 _sumfp32_00 = __lsx_vffint_s_w(_sum00);
                    __m128 _sumfp32_01 = __lsx_vffint_s_w(_sum01);
                    __m128 _sumfp32_10 = __lsx_vffint_s_w(_sum10);
                    __m128 _sumfp32_11 = __lsx_vffint_s_w(_sum11);
                    __m128 _sumfp32_20 = __lsx_vffint_s_w(_sum20);
                    __m128 _sumfp32_21 = __lsx_vffint_s_w(_sum21);
                    __m128 _sumfp32_30 = __lsx_vffint_s_w(_sum30);
                    __m128 _sumfp32_31 = __lsx_vffint_s_w(_sum31);
                    if (bias_term)
                    {
                        // sum * scale + bias in one fused multiply-add
                        __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + p * 8, 0);
                        __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + p * 8 + 4, 0);
                        _sumfp32_00 = __lsx_vfmadd_s(_scale_in0, _sumfp32_00, _bias0);
                        _sumfp32_01 = __lsx_vfmadd_s(_scale_in1, _sumfp32_01, _bias1);
                        _sumfp32_10 = __lsx_vfmadd_s(_scale_in0, _sumfp32_10, _bias0);
                        _sumfp32_11 = __lsx_vfmadd_s(_scale_in1, _sumfp32_11, _bias1);
                        _sumfp32_20 = __lsx_vfmadd_s(_scale_in0, _sumfp32_20, _bias0);
                        _sumfp32_21 = __lsx_vfmadd_s(_scale_in1, _sumfp32_21, _bias1);
                        _sumfp32_30 = __lsx_vfmadd_s(_scale_in0, _sumfp32_30, _bias0);
                        _sumfp32_31 = __lsx_vfmadd_s(_scale_in1, _sumfp32_31, _bias1);
                    }
                    else
                    {
                        _sumfp32_00 = __lsx_vfmul_s(_sumfp32_00, _scale_in0);
                        _sumfp32_01 = __lsx_vfmul_s(_sumfp32_01, _scale_in1);
                        _sumfp32_10 = __lsx_vfmul_s(_sumfp32_10, _scale_in0);
                        _sumfp32_11 = __lsx_vfmul_s(_sumfp32_11, _scale_in1);
                        _sumfp32_20 = __lsx_vfmul_s(_sumfp32_20, _scale_in0);
                        _sumfp32_21 = __lsx_vfmul_s(_sumfp32_21, _scale_in1);
                        _sumfp32_30 = __lsx_vfmul_s(_sumfp32_30, _scale_in0);
                        _sumfp32_31 = __lsx_vfmul_s(_sumfp32_31, _scale_in1);
                    }

                    _sumfp32_00 = activation_ps(_sumfp32_00, activation_type, activation_params);
                    _sumfp32_01 = activation_ps(_sumfp32_01, activation_type, activation_params);
                    _sumfp32_10 = activation_ps(_sumfp32_10, activation_type, activation_params);
                    _sumfp32_11 = activation_ps(_sumfp32_11, activation_type, activation_params);
                    _sumfp32_20 = activation_ps(_sumfp32_20, activation_type, activation_params);
                    _sumfp32_21 = activation_ps(_sumfp32_21, activation_type, activation_params);
                    _sumfp32_30 = activation_ps(_sumfp32_30, activation_type, activation_params);
                    _sumfp32_31 = activation_ps(_sumfp32_31, activation_type, activation_params);

                    // transpose 4x8
                    // before: one vector = 4 output channels of one input row
                    // after:  one vector = the 4 input rows of one output channel, which is the pack-4 output layout
                    __m128i _r01r = __lsx_vilvl_w((__m128i)_sumfp32_10, (__m128i)_sumfp32_00);
                    __m128i _r01l = __lsx_vilvh_w((__m128i)_sumfp32_10, (__m128i)_sumfp32_00);
                    __m128i _r23r = __lsx_vilvl_w((__m128i)_sumfp32_30, (__m128i)_sumfp32_20);
                    __m128i _r23l = __lsx_vilvh_w((__m128i)_sumfp32_30, (__m128i)_sumfp32_20);
                    __m128i _r45r = __lsx_vilvl_w((__m128i)_sumfp32_11, (__m128i)_sumfp32_01);
                    __m128i _r45l = __lsx_vilvh_w((__m128i)_sumfp32_11, (__m128i)_sumfp32_01);
                    __m128i _r67r = __lsx_vilvl_w((__m128i)_sumfp32_31, (__m128i)_sumfp32_21);
                    __m128i _r67l = __lsx_vilvh_w((__m128i)_sumfp32_31, (__m128i)_sumfp32_21);
                    _sumfp32_00 = (__m128)__lsx_vilvl_d(_r23r, _r01r);
                    _sumfp32_10 = (__m128)__lsx_vilvh_d(_r23r, _r01r);
                    _sumfp32_20 = (__m128)__lsx_vilvl_d(_r23l, _r01l);
                    _sumfp32_30 = (__m128)__lsx_vilvh_d(_r23l, _r01l);
                    _sumfp32_01 = (__m128)__lsx_vilvl_d(_r67r, _r45r);
                    _sumfp32_11 = (__m128)__lsx_vilvh_d(_r67r, _r45r);
                    _sumfp32_21 = (__m128)__lsx_vilvl_d(_r67l, _r45l);
                    _sumfp32_31 = (__m128)__lsx_vilvh_d(_r67l, _r45l);

                    // output channels p*8 .. p*8+7 in order, 4 floats each
                    __lsx_vst(_sumfp32_00, outptr, 0);
                    __lsx_vst(_sumfp32_10, outptr + 4, 0);
                    __lsx_vst(_sumfp32_20, outptr + 8, 0);
                    __lsx_vst(_sumfp32_30, outptr + 12, 0);
                    __lsx_vst(_sumfp32_01, outptr + 16, 0);
                    __lsx_vst(_sumfp32_11, outptr + 20, 0);
                    __lsx_vst(_sumfp32_21, outptr + 24, 0);
                    __lsx_vst(_sumfp32_31, outptr + 28, 0);

                    outptr += 32;
                }
            }
        }

        // 4 input rows x 1 output channel, scalar accumulation
        if (num_output_elempack == 1 && out_elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int j = 0; j < outh; j++)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output; p++)
                {
                    const signed char* kptr = weight_data_tm.row<const signed char>(p);
                    const signed char* m0 = bottom_blob_int8_unpacked.row<const signed char>(j * 4);
                    const signed char* m1 = bottom_blob_int8_unpacked.row<const signed char>(j * 4 + 1);
                    const signed char* m2 = bottom_blob_int8_unpacked.row<const signed char>(j * 4 + 2);
                    const signed char* m3 = bottom_blob_int8_unpacked.row<const signed char>(j * 4 + 3);

                    int sum0 = 0;
                    int sum1 = 0;
                    int sum2 = 0;
                    int sum3 = 0;

                    int i = 0;
                    for (; i < num_input; i++)
                    {
                        // the same weight is applied to all four input rows
                        sum0 += *m0++ * kptr[0];
                        sum1 += *m1++ * kptr[0];
                        sum2 += *m2++ * kptr[0];
                        sum3 += *m3++ * kptr[0];
                        kptr += 1;
                    }

                    // dequantize, add bias and apply the activation
                    float sumfp32_0 = sum0 * scale_in_data[p];
                    float sumfp32_1 = sum1 * scale_in_data[p];
                    float sumfp32_2 = sum2 * scale_in_data[p];
                    float sumfp32_3 = sum3 * scale_in_data[p];

                    if (bias_term)
                    {
                        sumfp32_0 += bias_data[p];
                        sumfp32_1 += bias_data[p];
                        sumfp32_2 += bias_data[p];
                        sumfp32_3 += bias_data[p];
                    }

                    // the four row results of one output channel are stored contiguously
                    outptr[0] = activation_ss(sumfp32_0, activation_type, activation_params);
                    outptr[1] = activation_ss(sumfp32_1, activation_type, activation_params);
                    outptr[2] = activation_ss(sumfp32_2, activation_type, activation_params);
                    outptr[3] = activation_ss(sumfp32_3, activation_type, activation_params);
                    outptr += 4;
                }
            }
        }

        // 1 input row x 8 output channels per inner iteration
        if (num_output_elempack == 8 && out_elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int j = 0; j < outh; j++)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output / num_output_elempack; p++)
                {
                    const signed char* kptr = weight_data_tm.row<const signed char>(p);
                    const signed char* m = bottom_blob_int8_unpacked.row<const signed char>(j);

                    // int32 accumulators for output channels 0-3 and 4-7 of this group
                    __m128i _sum0 = __lsx_vreplgr2vr_w(0);
                    __m128i _sum1 = __lsx_vreplgr2vr_w(0);

                    int i = 0;
                    for (; i < num_input; i++)
                    {
                        __builtin_prefetch(m + 4);
                        __builtin_prefetch(kptr + 32);
                        __m128i _val = __lsx_vreplgr2vr_h((short)m[0]);

                        // sign-extend the low 8 int8 weights to 16 bit
                        __m128i _w = __lsx_vld(kptr, 0);
                        __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w);

                        // 16-bit products, sign-extended to 32 bit before accumulation
                        __m128i _s0 = __lsx_vmul_h(_val, _w16);
                        __m128i _exts0 = __lsx_vslti_h(_s0, 0);
                        __m128i _s0l = __lsx_vilvl_h(_exts0, _s0);
                        __m128i _s0h = __lsx_vilvh_h(_exts0, _s0);

                        _sum0 = __lsx_vadd_w(_sum0, _s0l);
                        _sum1 = __lsx_vadd_w(_sum1, _s0h);

                        m++;
                        kptr += 8;
                    }

                    // dequantize, add bias and apply the activation
                    __m128 _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8, 0);
                    __m128 _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8 + 4, 0);

                    __m128 _sumfp32_0 = __lsx_vffint_s_w(_sum0);
                    __m128 _sumfp32_1 = __lsx_vffint_s_w(_sum1);

                    if (bias_term)
                    {
                        __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + p * 8, 0);
                        __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + p * 8 + 4, 0);
                        _sumfp32_0 = __lsx_vfmadd_s(_scale_in0, _sumfp32_0, _bias0);
                        _sumfp32_1 = __lsx_vfmadd_s(_scale_in1, _sumfp32_1, _bias1);
                    }
                    else
                    {
                        _sumfp32_0 = __lsx_vfmul_s(_sumfp32_0, _scale_in0);
                        _sumfp32_1 = __lsx_vfmul_s(_sumfp32_1, _scale_in1);
                    }

                    _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params);
                    _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params);

                    __lsx_vst(_sumfp32_0, outptr, 0);
                    __lsx_vst(_sumfp32_1, outptr + 4, 0);
                    outptr += 8;
                }
            }
        }
#endif // __loongarch_sx

        // 1 input row x 1 output channel, plain scalar fallback
        if (num_output_elempack == 1 && out_elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int j = 0; j < outh; j++)
            {
                float* outptr = top_blob.row(j);

                for (int p = 0; p < num_output; p++)
                {
                    const signed char* kptr = weight_data_tm.row<const signed char>(p);
                    const signed char* m = bottom_blob_int8_unpacked.row<const signed char>(j);

                    int sum = 0;

                    int i = 0;
                    for (; i < num_input; i++)
                    {
                        sum += *m++ * *kptr++;
                    }

                    // dequantize, add bias and apply the activation
                    float sumfp32 = sum * scale_in_data[p];

                    if (bias_term)
                        sumfp32 += bias_data[p];

                    outptr[0] = activation_ss(sumfp32, activation_type, activation_params);
                    outptr += 1;
                }
            }
        }

        return 0;
    }

    // non-gemm path: treat the whole input as one vector of num_input values
    Mat bottom_blob_int8_flattened = bottom_blob_int8;
    if (bottom_blob_int8.dims != 1)
    {
        Option opt_flatten = opt;
        opt_flatten.blob_allocator = opt.workspace_allocator;
        flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten);
    }

    //     int elempack = bottom_blob_int8_flattened.elempack;

    // here out_elempack is the packing along num_output and matches the weight_data_tm packing
    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 8 == 0 ? 8 : 1;
    }
#endif // __loongarch_sx
    //     size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

#if __loongarch_sx
    // 8 output channels per iteration
    if (out_elempack == 8)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            // int32 accumulators for output channels 0-3 and 4-7 of this group
            __m128i _sum0 = __lsx_vreplgr2vr_w(0);
            __m128i _sum1 = __lsx_vreplgr2vr_w(0);

            const signed char* kptr = weight_data_tm.row<const signed char>(p);
            const signed char* sptr = bottom_blob_int8_flattened;

            int i = 0;
            for (; i < num_input; i++)
            {
                __builtin_prefetch(sptr + 4);
                __builtin_prefetch(kptr + 32);
                __m128i _val = __lsx_vreplgr2vr_h((short)sptr[0]);

                // sign-extend the low 8 int8 weights to 16 bit
                __m128i _w = __lsx_vld(kptr, 0);
                __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w);

                // 16-bit products, sign-extended to 32 bit before accumulation
                __m128i _s0 = __lsx_vmul_h(_val, _w16);
                __m128i _exts0 = __lsx_vslti_h(_s0, 0);
                __m128i _s0l = __lsx_vilvl_h(_exts0, _s0);
                __m128i _s0h = __lsx_vilvh_h(_exts0, _s0);

                _sum0 = __lsx_vadd_w(_sum0, _s0l);
                _sum1 = __lsx_vadd_w(_sum1, _s0h);

                sptr += 1;
                kptr += 8;
            }

            // dequantize, add bias and apply the activation
            __m128 _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8, 0);
            __m128 _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8 + 4, 0);

            __m128 _sumfp32_0 = __lsx_vffint_s_w(_sum0);
            __m128 _sumfp32_1 = __lsx_vffint_s_w(_sum1);

            if (bias_term)
            {
                __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + p * 8, 0);
                __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + p * 8 + 4, 0);
                _sumfp32_0 = __lsx_vfmadd_s(_scale_in0, _sumfp32_0, _bias0);
                _sumfp32_1 = __lsx_vfmadd_s(_scale_in1, _sumfp32_1, _bias1);
            }
            else
            {
                _sumfp32_0 = __lsx_vfmul_s(_sumfp32_0, _scale_in0);
                _sumfp32_1 = __lsx_vfmul_s(_sumfp32_1, _scale_in1);
            }

            _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params);
            _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params);

            float* outptr = (float*)top_blob + p * 8;
            __lsx_vst(_sumfp32_0, outptr, 0);
            __lsx_vst(_sumfp32_1, outptr + 4, 0);
        }
    }
#endif // __loongarch_sx

    // scalar fallback, one output channel per iteration
    if (out_elempack == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < num_output / out_elempack; p++)
        {
            int sum = 0;

            const signed char* kptr = weight_data_tm.row<const signed char>(p);
            const signed char* sptr = bottom_blob_int8_flattened;

            int i = 0;
            for (; i < num_input; i++)
            {
                signed char val = sptr[0];

                signed char w = kptr[0];

                sum += val * w;

                sptr += 1;
                kptr += 1;
            }

            // dequantize, add bias and apply the activation
            float sumfp32 = sum * scale_in_data[p];

            if (bias_term)
                sumfp32 += bias_data[p];

            sumfp32 = activation_ss(sumfp32, activation_type, activation_params);

            top_blob[p] = sumfp32;
        }
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn


================================================
FILE: src/layer/loongarch/innerproduct_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_INNERPRODUCT_LOONGARCH_H
#define LAYER_INNERPRODUCT_LOONGARCH_H

#include "innerproduct.h"

namespace ncnn {

// LoongArch-specific implementation of the InnerProduct layer.
// Adds an fp16-weight-storage path (LSX builds only) and an int8 path
// (NCNN_INT8 builds only) on top of the generic InnerProduct base class.
class InnerProduct_loongarch : public InnerProduct
{
public:
    InnerProduct_loongarch();

    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
#if __loongarch_sx
    // fp16 weight storage variants; forward_fp16s reads weight_data_tm as 16-bit values
    // and converts them to fp32 on the fly
    int create_pipeline_fp16s(const Option& opt);
    int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
#if NCNN_INT8
    // int8 variants; create_pipeline_int8_loongarch fills weight_data_tm and scale_in_data,
    // forward_int8_loongarch consumes them
    int create_pipeline_int8_loongarch(const Option& opt);
    int forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif

public:
    // helper layer used by the forward paths to flatten inputs that are not 1-D
    // NOTE(review): presumably created in create_pipeline and released in destroy_pipeline - confirm in the .cpp
    Layer* flatten;

    // weights repacked for the kernels; element type and layout depend on the active path
    // (int8: num_output / out_elempack rows with out_elempack channels interleaved per input index)
    Mat weight_data_tm;

#if NCNN_INT8
    // per-output dequantization scale: 1 / (input scale * weight scale), or 0 when the weight scale is 0
    Mat scale_in_data;
#endif
};

} // namespace ncnn

#endif // LAYER_INNERPRODUCT_LOONGARCH_H


================================================
FILE: src/layer/loongarch/interp_bicubic.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// Compute the four cubic convolution weights (A = -0.75) for the fractional
// offset fx and write them to coeffs[0..3].
// The fourth weight is derived from the other three so that all four sum to one.
static inline void interpolate_cubic(float fx, float* coeffs)
{
    const float A = -0.75f;

    // distances used by the first three taps; the fourth tap (2 - fx) is not evaluated
    const float d0 = fx + 1;
    const float d1 = fx;
    const float d2 = 1 - fx;

    const float c0 = A * d0 * d0 * d0 - 5 * A * d0 * d0 + 8 * A * d0 - 4 * A;
    const float c1 = (A + 2) * d1 * d1 * d1 - (A + 3) * d1 * d1 + 1;
    const float c2 = (A + 2) * d2 * d2 * d2 - (A + 3) * d2 * d2 + 1;

    coeffs[0] = c0;
    coeffs[1] = c1;
    coeffs[2] = c2;
    coeffs[3] = 1.f - c0 - c1 - c2;
}

// Precompute, for every output column dx in [0, outw), the source column offset
// xofs[dx] and the four cubic weights alpha[dx * 4 .. dx * 4 + 3].
//
// The four weights apply to source columns xofs[dx] - 1 .. xofs[dx] + 2.
// Near the borders the window is re-anchored (sx = 1 or sx = w - 3) and the
// weights of the taps that would fall outside are folded onto the border columns.
//
//   w            source width
//   outw         destination width
//   xofs         receives outw ints
//   alpha        receives outw * 4 floats
//   align_corner non-zero maps the first/last destination columns exactly onto
//                the first/last source columns
static void cubic_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner)
{
    double scale = (double)w / outw;
    if (align_corner)
    {
        // a single output column has no (outw - 1) span to spread over; dividing by it
        // would give inf/NaN and an undefined float-to-int conversion below, so map that
        // column onto source column 0 instead
        scale = outw > 1 ? (double)(w - 1) / (outw - 1) : 0.0;
    }

    for (int dx = 0; dx < outw; dx++)
    {
        // source coordinate of the destination column centre
        float fx = (float)((dx + 0.5) * scale - 0.5);
        if (align_corner)
        {
            fx = (float)(dx * scale);
        }

        // split into integer column and fractional offset
        int sx = static_cast<int>(floor(fx));
        fx -= sx;

        interpolate_cubic(fx, alpha + dx * 4);

        // the checks below are deliberately not chained with else:
        // an sx rewritten by one of them is tested again by the following ones
        if (sx <= -1)
        {
            // first three taps collapse onto column 0, the fourth lands on column 1
            sx = 1;
            alpha[dx * 4 + 0] = 1.f - alpha[dx * 4 + 3];
            alpha[dx * 4 + 1] = alpha[dx * 4 + 3];
            alpha[dx * 4 + 2] = 0.f;
            alpha[dx * 4 + 3] = 0.f;
        }
        if (sx == 0)
        {
            // first two taps collapse onto column 0, the rest shift down by one slot
            sx = 1;
            alpha[dx * 4 + 0] = alpha[dx * 4 + 0] + alpha[dx * 4 + 1];
            alpha[dx * 4 + 1] = alpha[dx * 4 + 2];
            alpha[dx * 4 + 2] = alpha[dx * 4 + 3];
            alpha[dx * 4 + 3] = 0.f;
        }
        if (sx == w - 2)
        {
            // last two taps collapse onto column w - 1, the rest shift up by one slot
            sx = w - 3;
            alpha[dx * 4 + 3] = alpha[dx * 4 + 2] + alpha[dx * 4 + 3];
            alpha[dx * 4 + 2] = alpha[dx * 4 + 1];
            alpha[dx * 4 + 1] = alpha[dx * 4 + 0];
            alpha[dx * 4 + 0] = 0.f;
        }
        if (sx >= w - 1)
        {
            // last three taps collapse onto column w - 1, the first lands on column w - 2
            sx = w - 3;
            alpha[dx * 4 + 3] = 1.f - alpha[dx * 4 + 0];
            alpha[dx * 4 + 2] = alpha[dx * 4 + 0];
            alpha[dx * 4 + 1] = 0.f;
            alpha[dx * 4 + 0] = 0.f;
        }

        xofs[dx] = sx;
    }
}

static void resize_bicubic_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w);
    Mat rowsbuf1(w);
    Mat rowsbuf2(w);
    Mat rowsbuf3(w);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;
    float* rows2 = rowsbuf2;
    float* rows3 = rowsbuf3;

    int prev_sy1 = -3;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            float* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows2;
            rows2 = rows3;
            rows3 = rows0_old;
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const float* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 2)
        {
            // hresize two rows
            float* rows0_old = rows0;
            float* rows1_old = rows1;
            rows0 = rows2;
            rows1 = rows3;
            rows2 = rows0_old;
            rows3 = rows1_old;
            const float* S2 = src.row(sy + 1);
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const float* S2p = S2 + sx;
                const float* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3;
                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 3)
        {
            // hresize three rows
            float* rows0_old = rows0;
            float* rows1_old = rows1;
            float* rows2_old = rows2;
            rows0 = rows3;
            rows1 = rows0_old;
            rows2 = rows1_old;
            rows3 = rows2_old;
            const float* S1 = src.row(sy);
            const float* S2 = src.row(sy + 1);
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* rows1p = rows1;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const float* S1p = S1 + sx;
                const float* S2p = S2 + sx;
                const float* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3;
                rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3;
                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;

                alphap += 4;
            }
        }
        else
        {
            // hresize four rows
            const float* S0 = src.row(sy - 1);
            const float* S1 = src.row(sy);
            const float* S2 = src.row(sy + 1);
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* rows0p = rows0;
            float* rows1p = rows1;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx];
                const float* S0p = S0 + sx;
                const float* S1p = S1 + sx;
                const float* S2p = S2 + sx;
                const float* S3p = S3 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                float a2 = alphap[2];
                float a3 = alphap[3];
                rows0p[dx] = S0p[-1] * a0 + S0p[0] * a1 + S0p[1] * a2 + S0p[2] * a3;
                rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3;
                rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3;
                rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3;

                alphap += 4;
            }
        }

        prev_sy1 = sy;

        // vresize
        float b0 = beta[0];
        float b1 = beta[1];
        float b2 = beta[2];
        float b3 = beta[3];

        float* rows0p = rows0;
        float* rows1p = rows1;
        float* rows2p = rows2;
        float* rows3p = rows3;
        float* Dp = dst.row(dy);
        for (int dx = 0; dx < w; dx++)
        {
            //             D[x] = rows0[x]*b0 + rows1[x]*b1 + rows2[x]*b2 + rows3[x]*b3;
            *Dp++ = *rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3;
        }

        beta += 4;
    }
}


================================================
FILE: src/layer/loongarch/interp_bicubic_pack4.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void resize_bicubic_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w, (size_t)4 * 4u, 4);
    Mat rowsbuf1(w, (size_t)4 * 4u, 4);
    Mat rowsbuf2(w, (size_t)4 * 4u, 4);
    Mat rowsbuf3(w, (size_t)4 * 4u, 4);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;
    float* rows2 = rowsbuf2;
    float* rows3 = rowsbuf3;

    int prev_sy1 = -3;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            float* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows2;
            rows2 = rows3;
            rows3 = rows0_old;
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const float* S3p = S3 + sx;

                __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]);
                __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]);
                __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]);
                __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]);

                __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0);
                __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0);
                __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0);
                __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0);
                __m128 _rows3 = __lsx_vfmul_s(_S30, _a0);
                _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3);
                _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3);
                _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3);
                __lsx_vst(_rows3, rows3p + dx * 4, 0);

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 2)
        {
            // hresize two rows
            float* rows0_old = rows0;
            float* rows1_old = rows1;
            rows0 = rows2;
            rows1 = rows3;
            rows2 = rows0_old;
            rows3 = rows1_old;
            const float* S2 = src.row(sy + 1);
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const float* S2p = S2 + sx;
                const float* S3p = S3 + sx;

                __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]);
                __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]);
                __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]);
                __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]);

                __m128 _S20 = (__m128)__lsx_vld(S2p - 4, 0);
                __m128 _S21 = (__m128)__lsx_vld(S2p + 0, 0);
                __m128 _S22 = (__m128)__lsx_vld(S2p + 4, 0);
                __m128 _S23 = (__m128)__lsx_vld(S2p + 8, 0);
                __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0);
                __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0);
                __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0);
                __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0);
                __m128 _rows2 = __lsx_vfmul_s(_S20, _a0);
                __m128 _rows3 = __lsx_vfmul_s(_S30, _a0);
                _rows2 = __lsx_vfmadd_s(_a1, _S21, _rows2);
                _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3);
                _rows2 = __lsx_vfmadd_s(_a2, _S22, _rows2);
                _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3);
                _rows2 = __lsx_vfmadd_s(_a3, _S23, _rows2);
                _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3);
                __lsx_vst(_rows2, rows2p + dx * 4, 0);
                __lsx_vst(_rows3, rows3p + dx * 4, 0);

                alphap += 4;
            }
        }
        else if (sy == prev_sy1 + 3)
        {
            // hresize three rows
            float* rows0_old = rows0;
            float* rows1_old = rows1;
            float* rows2_old = rows2;
            rows0 = rows3;
            rows1 = rows0_old;
            rows2 = rows1_old;
            rows3 = rows2_old;
            const float* S1 = src.row(sy);
            const float* S2 = src.row(sy + 1);
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* rows1p = rows1;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const float* S1p = S1 + sx;
                const float* S2p = S2 + sx;
                const float* S3p = S3 + sx;

                __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]);
                __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]);
                __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]);
                __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]);

                __m128 _S10 = (__m128)__lsx_vld(S1p - 4, 0);
                __m128 _S11 = (__m128)__lsx_vld(S1p + 0, 0);
                __m128 _S12 = (__m128)__lsx_vld(S1p + 4, 0);
                __m128 _S13 = (__m128)__lsx_vld(S1p + 8, 0);
                __m128 _S20 = (__m128)__lsx_vld(S2p - 4, 0);
                __m128 _S21 = (__m128)__lsx_vld(S2p + 0, 0);
                __m128 _S22 = (__m128)__lsx_vld(S2p + 4, 0);
                __m128 _S23 = (__m128)__lsx_vld(S2p + 8, 0);
                __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0);
                __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0);
                __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0);
                __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0);
                __m128 _rows1 = __lsx_vfmul_s(_S10, _a0);
                __m128 _rows2 = __lsx_vfmul_s(_S20, _a0);
                __m128 _rows3 = __lsx_vfmul_s(_S30, _a0);
                _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1);
                _rows2 = __lsx_vfmadd_s(_a1, _S21, _rows2);
                _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3);
                _rows1 = __lsx_vfmadd_s(_a2, _S12, _rows1);
                _rows2 = __lsx_vfmadd_s(_a2, _S22, _rows2);
                _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3);
                _rows1 = __lsx_vfmadd_s(_a3, _S13, _rows1);
                _rows2 = __lsx_vfmadd_s(_a3, _S23, _rows2);
                _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3);
                __lsx_vst(_rows1, rows1p + dx * 4, 0);
                __lsx_vst(_rows2, rows2p + dx * 4, 0);
                __lsx_vst(_rows3, rows3p + dx * 4, 0);

                alphap += 4;
            }
        }
        else
        {
            // hresize four rows
            const float* S0 = src.row(sy - 1);
            const float* S1 = src.row(sy);
            const float* S2 = src.row(sy + 1);
            const float* S3 = src.row(sy + 2);

            const float* alphap = alpha;
            float* rows0p = rows0;
            float* rows1p = rows1;
            float* rows2p = rows2;
            float* rows3p = rows3;
            for (int dx = 0; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const float* S0p = S0 + sx;
                const float* S1p = S1 + sx;
                const float* S2p = S2 + sx;
                const float* S3p = S3 + sx;

                __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]);
                __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]);
                __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]);
                __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]);

                __m128 _S00 = (__m128)__lsx_vld(S0p - 4, 0);
                __m128 _S01 = (__m128)__lsx_vld(S0p + 0, 0);
                __m128 _S02 = (__m128)__lsx_vld(S0p + 4, 0);
                __m128 _S03 = (__m128)__lsx_vld(S0p + 8, 0);
                __m128 _S10 = (__m128)__lsx_vld(S1p - 4, 0);
                __m128 _S11 = (__m128)__lsx_vld(S1p + 0, 0);
                __m128 _S12 = (__m128)__lsx_vld(S1p + 4, 0);
                __m128 _S13 = (__m128)__lsx_vld(S1p + 8, 0);
                __m128 _S20 = (__m128)__lsx_vld(S2p - 4, 0);
                __m128 _S21 = (__m128)__lsx_vld(S2p + 0, 0);
                __m128 _S22 = (__m128)__lsx_vld(S2p + 4, 0);
                __m128 _S23 = (__m128)__lsx_vld(S2p + 8, 0);
                __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0);
                __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0);
                __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0);
                __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0);
                __m128 _rows0 = __lsx_vfmul_s(_S00, _a0);
                __m128 _rows1 = __lsx_vfmul_s(_S10, _a0);
                __m128 _rows2 = __lsx_vfmul_s(_S20, _a0);
                __m128 _rows3 = __lsx_vfmul_s(_S30, _a0);
                _rows0 = __lsx_vfmadd_s(_a1, _S01, _rows0);
                _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1);
                _rows2 = __lsx_vfmadd_s(_a1, _S21, _rows2);
                _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3);
                _rows0 = __lsx_vfmadd_s(_a2, _S02, _rows0);
                _rows1 = __lsx_vfmadd_s(_a2, _S12, _rows1);
                _rows2 = __lsx_vfmadd_s(_a2, _S22, _rows2);
                _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3);
                _rows0 = __lsx_vfmadd_s(_a3, _S03, _rows0);
                _rows1 = __lsx_vfmadd_s(_a3, _S13, _rows1);
                _rows2 = __lsx_vfmadd_s(_a3, _S23, _rows2);
                _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3);
                __lsx_vst(_rows0, rows0p + dx * 4, 0);
                __lsx_vst(_rows1, rows1p + dx * 4, 0);
                __lsx_vst(_rows2, rows2p + dx * 4, 0);
                __lsx_vst(_rows3, rows3p + dx * 4, 0);

                alphap += 4;
            }
        }

        prev_sy1 = sy;

        // vresize
        __m128 _b0 = __lsx_vreplfr2vr_s(beta[0]);
        __m128 _b1 = __lsx_vreplfr2vr_s(beta[1]);
        __m128 _b2 = __lsx_vreplfr2vr_s(beta[2]);
        __m128 _b3 = __lsx_vreplfr2vr_s(beta[3]);

        float* rows0p = rows0;
        float* rows1p = rows1;
        float* rows2p = rows2;
        float* rows3p = rows3;
        float* Dp = dst.row(dy);

        for (int dx = 0; dx < w; dx++)
        {
            __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0);
            __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0);
            __m128 _rows2 = (__m128)__lsx_vld(rows2p, 0);
            __m128 _rows3 = (__m128)__lsx_vld(rows3p, 0);
            __m128 _Dp = __lsx_vfmul_s(_rows0, _b0);
            _Dp = __lsx_vfmadd_s(_b1, _rows1, _Dp);
            _Dp = __lsx_vfmadd_s(_b2, _rows2, _Dp);
            _Dp = __lsx_vfmadd_s(_b3, _rows3, _Dp);
            __lsx_vst(_Dp, Dp, 0);

            Dp += 4;
            rows0p += 4;
            rows1p += 4;
            rows2p += 4;
            rows3p += 4;
        }

        beta += 4;
    }
}


================================================
FILE: src/layer/loongarch/interp_bilinear.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// Precompute, for every output position along one axis, the source offset and
// the two linear interpolation weights used by the bilinear resize kernels.
//
// w             source length along the axis
// outw          output length along the axis
// xofs          output, outw ints: index of the left source sample; the
//               kernels read source positions xofs and xofs + 1
// alpha         output, outw * 2 floats: weights (1 - fx, fx) for those samples
// align_corner  non-zero maps the first/last output sample exactly onto the
//               first/last source sample
//
// Positions left of the first sample are clamped to (0, weight 1/0) and
// positions at or beyond the last sample to (w - 2, weight 0/1), so that the
// pair xofs, xofs + 1 stays inside the source when w >= 2.
static void linear_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner)
{
    double scale = (double)w / outw;
    if (align_corner)
    {
        // A single output sample leaves no span to align, and
        // (w - 1) / (outw - 1) would divide by zero; the resulting inf/NaN
        // would then be converted to int below, which is undefined behaviour.
        // Use scale 0 so that the only sample maps onto source position 0.
        scale = outw > 1 ? (double)(w - 1) / (outw - 1) : 0.0;
    }

    for (int dx = 0; dx < outw; dx++)
    {
        // source coordinate of this output sample (pixel centres by default)
        float fx = (float)((dx + 0.5) * scale - 0.5);
        if (align_corner)
        {
            fx = (float)(dx * scale);
        }

        // split into integer sample index and fractional part
        int sx = static_cast<int>(floor(fx));
        fx -= sx;

        if (sx < 0)
        {
            // left of the first sample: use sample 0 only
            sx = 0;
            fx = 0.f;
        }
        if (sx >= w - 1)
        {
            // at or past the last sample: use sample w - 1 only
            sx = w - 2;
            fx = 1.f;
        }

        xofs[dx] = sx;

        alpha[dx * 2] = 1.f - fx;
        alpha[dx * 2 + 1] = fx;
    }
}

static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w);
    Mat rowsbuf1(w);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;

    int prev_sy1 = -2;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            float* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows0_old;
            const float* S1 = src.row(sy + 1);

            const float* alphap = alpha;
            float* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx];
                const float* S1p = S1 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;

                alphap += 2;
            }
        }
        else
        {
            // hresize two rows
            const float* S0 = src.row(sy);
            const float* S1 = src.row(sy + 1);

            const float* alphap = alpha;
            float* rows0p = rows0;
            float* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx];
                const float* S0p = S0 + sx;
                const float* S1p = S1 + sx;

                float a0 = alphap[0];
                float a1 = alphap[1];
                rows0p[dx] = S0p[0] * a0 + S0p[1] * a1;
                rows1p[dx] = S1p[0] * a0 + S1p[1] * a1;

                alphap += 2;
            }
        }

        prev_sy1 = sy;

        // vresize
        float b0 = beta[0];
        float b1 = beta[1];

        float* rows0p = rows0;
        float* rows1p = rows1;
        float* Dp = dst.row(dy);

#if __loongarch_sx
        int nn = w >> 3;
#else
        int nn = 0;
#endif
        int remain = w - (nn << 3);

#if __loongarch_sx
        __m128 _b0 = __lsx_vreplfr2vr_s(b0);
        __m128 _b1 = __lsx_vreplfr2vr_s(b1);
        for (; nn > 0; nn--)
        {
            __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0);
            __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0);

            __m128 _Dp = __lsx_vfmul_s(_rows0, _b0);
            _Dp = __lsx_vfmadd_s(_b1, _rows1, _Dp);

            __lsx_vst(_Dp, Dp, 0);

            __m128 _rows0n = (__m128)__lsx_vld(rows0p + 4, 0);
            __m128 _rows1n = (__m128)__lsx_vld(rows1p + 4, 0);

            __m128 _Dpn = __lsx_vfmul_s(_rows0n, _b0);
            _Dpn = __lsx_vfmadd_s(_b1, _rows1n, _Dpn);

            __lsx_vst(_Dpn, Dp + 4, 0);

            Dp += 8;
            rows0p += 8;
            rows1p += 8;
        }
#endif // __loongarch_sx
        for (; remain; --remain)
        {
            //             D[x] = rows0[x]*b0 + rows1[x]*b1;
            *Dp++ = *rows0p++ * b0 + *rows1p++ * b1;
        }

        beta += 2;
    }
}


================================================
FILE: src/layer/loongarch/interp_bilinear_pack4.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

static void resize_bilinear_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs)
{
    int w = dst.w;
    int h = dst.h;

    // loop body
    Mat rowsbuf0(w, (size_t)4 * 4u, 4);
    Mat rowsbuf1(w, (size_t)4 * 4u, 4);
    float* rows0 = rowsbuf0;
    float* rows1 = rowsbuf1;

    int prev_sy1 = -2;

    for (int dy = 0; dy < h; dy++)
    {
        int sy = yofs[dy];

        if (sy == prev_sy1)
        {
            // reuse all rows
        }
        else if (sy == prev_sy1 + 1)
        {
            // hresize one row
            float* rows0_old = rows0;
            rows0 = rows1;
            rows1 = rows0_old;
            const float* S1 = src.row(sy + 1);

            const float* alphap = alpha;
            float* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const float* S1p = S1 + sx;

                __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]);
                __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]);

                __m128 _S10 = (__m128)__lsx_vld(S1p, 0);
                __m128 _S11 = (__m128)__lsx_vld(S1p + 4, 0);
                __m128 _rows1 = __lsx_vfmul_s(_S10, _a0);
                _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1);
                __lsx_vst(_rows1, rows1p + dx * 4, 0);

                alphap += 2;
            }
        }
        else
        {
            // hresize two rows
            const float* S0 = src.row(sy);
            const float* S1 = src.row(sy + 1);

            const float* alphap = alpha;
            float* rows0p = rows0;
            float* rows1p = rows1;
            int dx = 0;
            for (; dx < w; dx++)
            {
                int sx = xofs[dx] * 4;
                const float* S0p = S0 + sx;
                const float* S1p = S1 + sx;

                __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]);
                __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]);

                __m128 _S00 = (__m128)__lsx_vld(S0p, 0);
                __m128 _S01 = (__m128)__lsx_vld(S0p + 4, 0);
                __m128 _S10 = (__m128)__lsx_vld(S1p, 0);
                __m128 _S11 = (__m128)__lsx_vld(S1p + 4, 0);
                __m128 _rows0 = __lsx_vfmul_s(_S00, _a0);
                __m128 _rows1 = __lsx_vfmul_s(_S10, _a0);
                _rows0 = __lsx_vfmadd_s(_a1, _S01, _rows0);
                _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1);
                __lsx_vst(_rows0, rows0p + dx * 4, 0);
                __lsx_vst(_rows1, rows1p + dx * 4, 0);

                alphap += 2;
            }
        }

        prev_sy1 = sy;

        // vresize
        __m128 _b0 = __lsx_vreplfr2vr_s(beta[0]);
        __m128 _b1 = __lsx_vreplfr2vr_s(beta[1]);

        float* rows0p = rows0;
        float* rows1p = rows1;
        float* Dp = dst.row(dy);

        for (int dx = 0; dx < w; dx++)
        {
            __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0);
            __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0);
            __m128 _Dp = __lsx_vfmul_s(_rows0, _b0);
            _Dp = __lsx_vfmadd_s(_b1, _rows1, _Dp);
            __lsx_vst(_Dp, Dp, 0);

            Dp += 4;
            rows0p += 4;
            rows1p += 4;
        }

        beta += 2;
    }
}


================================================
FILE: src/layer/loongarch/interp_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "interp_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

#include "interp_bicubic.h"
#include "interp_bilinear.h"

#if __loongarch_sx
#include "interp_bicubic_pack4.h"
#include "interp_bilinear_pack4.h"
#endif

// Constructor: sets the support_packing flag when the build targets LSX
// (__loongarch_sx); otherwise the flag is left untouched.
Interp_loongarch::Interp_loongarch()
{
#if __loongarch_sx
    // forward() has dedicated elempack == 4 code paths written with LSX intrinsics
    support_packing = true;
#endif // __loongarch_sx
}

int Interp_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& reference_blob = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    int h = bottom_blob.h;
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = reference_blob.w;
    int outh = reference_blob.h;

    if (!size_expr.empty())
    {
        std::vector<Mat> bottom_blob_shapes(bottom_blobs.size());
        for (size_t i = 0; i < bottom_blobs.size(); i++)
        {
            bottom_blob_shapes[i] = bottom_blobs[i].shape();
        }
        eval_size_expr(bottom_blob_shapes, outw, outh);
    }

    if (dims == 1)
    {
        top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __loongarch_sx
        if (elempack == 4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < w; q++)
            {
                Mat top_blob_c = top_blob.channel(q);
                __m128 _v = (__m128)__lsx_vld((const float*)bottom_blob + q * 4, 0);
                top_blob_c.fill(_v);
            }

            return 0;
        }
#endif // __loongarch_sx

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < w; q++)
        {
            Mat top_blob_c = top_blob.channel(q);
            const float v = bottom_blob[q];
            top_blob_c.fill(v);
        }

        return 0;
    }

    if (dims == 2)
    {
        if (outw == w)
        {
            top_blob = bottom_blob;
            return 0;
        }

        top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __loongarch_sx
        if (elempack == 4)
        {
            if (resize_type == 1) // nearest
            {
                const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const float* ptr = bottom_blob.row(y);
                    float* outptr = top_blob.row(y);
                    for (int x = 0; x < outw; x++)
                    {
                        int in_x = std::min((int)(x * ws), (w - 1));

                        __m128 _p = (__m128)__lsx_vld(ptr + in_x * 4, 0);
                        __lsx_vst(_p, outptr, 0);

                        outptr += 4;
                    }
                }
            }

            if (resize_type == 2) // bilinear
            {
                int* buf = new int[outw + outw * 2];

                int* xofs = buf;
                float* alpha = (float*)(buf + outw);

                linear_coeffs(w, outw, xofs, alpha, align_corner);

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const float* ptr = bottom_blob.row(y);
                    float* outptr = top_blob.row(y);
                    const float* alphap = alpha;

                    for (int x = 0; x < outw; x++)
                    {
                        int sx = xofs[x] * 4;
                        const float* Sp = ptr + sx;

                        __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]);
                        __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]);

                        __m128 _S0 = (__m128)__lsx_vld(Sp, 0);
                        __m128 _S1 = (__m128)__lsx_vld(Sp + 4, 0);
                        __m128 _p = __lsx_vfmul_s(_S0, _a0);
                        _p = __lsx_vfmadd_s(_a1, _S1, _p);
                        __lsx_vst(_p, outptr, 0);

                        alphap += 2;
                        outptr += 4;
                    }
                }

                delete[] buf;
            }

            if (resize_type == 3) // bicubic
            {
                int* buf = new int[outw + outw * 4];

                int* xofs = buf;
                float* alpha = (float*)(buf + outw);

                cubic_coeffs(w, outw, xofs, alpha, align_corner);

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int y = 0; y < h; y++)
                {
                    const float* ptr = bottom_blob.row(y);
                    float* outptr = top_blob.row(y);
                    const float* alphap = alpha;

                    for (int x = 0; x < outw; x++)
                    {
                        int sx = xofs[x] * 4;
                        const float* Sp = ptr + sx;

                        __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]);
                        __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]);
                        __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]);
                        __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]);

                        __m128 _S0 = (__m128)__lsx_vld(Sp - 4, 0);
                        __m128 _S1 = (__m128)__lsx_vld(Sp + 0, 0);
                        __m128 _S2 = (__m128)__lsx_vld(Sp + 4, 0);
                        __m128 _S3 = (__m128)__lsx_vld(Sp + 8, 0);
                        __m128 _p = __lsx_vfmul_s(_S0, _a0);
                        _p = __lsx_vfmadd_s(_a1, _S1, _p);
                        _p = __lsx_vfmadd_s(_a2, _S2, _p);
                        _p = __lsx_vfmadd_s(_a3, _S3, _p);
                        __lsx_vst(_p, outptr, 0);

                        alphap += 4;
                        outptr += 4;
                    }
                }

                delete[] buf;
            }

            return 0;
        }
#endif // __loongarch_sx

        if (resize_type == 1) // nearest
        {
            const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const float* ptr = bottom_blob.row(y);
                float* outptr = top_blob.row(y);
                for (int x = 0; x < outw; x++)
                {
                    int in_x = std::min((int)(x * ws), (w - 1));
                    *outptr++ = ptr[in_x];
                }
            }
        }

        if (resize_type == 2) // bilinear
        {
            int* buf = new int[outw + outw * 2];

            int* xofs = buf;
            float* alpha = (float*)(buf + outw);

            linear_coeffs(w, outw, xofs, alpha, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const float* ptr = bottom_blob.row(y);
                float* outptr = top_blob.row(y);
                const float* alphap = alpha;

                for (int x = 0; x < outw; x++)
                {
                    int sx = xofs[x];
                    const float* Sp = ptr + sx;
                    float a0 = alphap[0];
                    float a1 = alphap[1];
                    *outptr++ = Sp[0] * a0 + Sp[1] * a1;
                    alphap += 2;
                }
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
            int* buf = new int[outw + outw * 4];

            int* xofs = buf;
            float* alpha = (float*)(buf + outw);

            cubic_coeffs(w, outw, xofs, alpha, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int y = 0; y < h; y++)
            {
                const float* ptr = bottom_blob.row(y);
                float* outptr = top_blob.row(y);
                const float* alphap = alpha;

                for (int x = 0; x < outw; x++)
                {
                    int sx = xofs[x];
                    const float* Sp = ptr + sx;
                    float a0 = alphap[0];
                    float a1 = alphap[1];
                    float a2 = alphap[2];
                    float a3 = alphap[3];
                    *outptr++ = Sp[-1] * a0 + Sp[0] * a1 + Sp[1] * a2 + Sp[2] * a3;
                    alphap += 4;
                }
            }

            delete[] buf;
        }

        return 0;
    }

    if (outw == w && outh == h)
    {
        top_blob = bottom_blob;
        return 0;
    }

    top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

#if __loongarch_sx
    if (elempack == 4)
    {
        if (resize_type == 1) // nearest
        {
            const float hs = (output_height || !size_expr.empty()) ? h / (float)outh : 1.f / height_scale;
            const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                for (int y = 0; y < outh; y++)
                {
                    int in_y = std::min((int)(y * hs), (h - 1));

                    const float* ptr = src.row(in_y);
                    float* outptr = dst.row(y);
                    for (int x = 0; x < outw; x++)
                    {
                        int in_x = std::min((int)(x * ws), (w - 1));

                        __m128 _p = (__m128)__lsx_vld(ptr + in_x * 4, 0);
                        __lsx_vst(_p, outptr, 0);

                        outptr += 4;
                    }
                }
            }
        }

        if (resize_type == 2) // bilinear
        {
            int* buf = new int[outw + outh + outw * 2 + outh * 2];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
            float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];

            linear_coeffs(w, outw, xofs, alpha, align_corner);
            linear_coeffs(h, outh, yofs, beta, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bilinear_image_pack4(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        if (resize_type == 3) // bicubic
        {
            int* buf = new int[outw + outh + outw * 4 + outh * 4];

            int* xofs = buf;        //new int[outw];
            int* yofs = buf + outw; //new int[outh];

            float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
            float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];

            cubic_coeffs(w, outw, xofs, alpha, align_corner);
            cubic_coeffs(h, outh, yofs, beta, align_corner);

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat src = bottom_blob.channel(q);
                Mat dst = top_blob.channel(q);

                resize_bicubic_image_pack4(src, dst, alpha, xofs, beta, yofs);
            }

            delete[] buf;
        }

        return 0;
    }
#endif // __loongarch_sx

    if (resize_type == 1) // nearest
    {
        const float hs = (output_height || !size_expr.empty()) ? h / (float)outh : 1.f / height_scale;
        const float ws = (output_width || !size_expr.empty()) ? w / (float)outw : 1.f / width_scale;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            for (int y = 0; y < outh; y++)
            {
                int in_y = std::min((int)(y * hs), (h - 1));

                const float* ptr = src.row(in_y);
                float* outptr = dst.row(y);
                for (int x = 0; x < outw; x++)
                {
                    int in_x = std::min((int)(x * ws), (w - 1));
                    *outptr++ = ptr[in_x];
                }
            }
        }
    }

    if (resize_type == 2) // bilinear
    {
        int* buf = new int[outw + outh + outw * 2 + outh * 2];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 2];
        float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2];

        linear_coeffs(w, outw, xofs, alpha, align_corner);
        linear_coeffs(h, outh, yofs, beta, align_corner);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bilinear_image(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    if (resize_type == 3) // bicubic
    {
        int* buf = new int[outw + outh + outw * 4 + outh * 4];

        int* xofs = buf;        //new int[outw];
        int* yofs = buf + outw; //new int[outh];

        float* alpha = (float*)(buf + outw + outh);           //new float[outw * 4];
        float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4];

        cubic_coeffs(w, outw, xofs, alpha, align_corner);
        cubic_coeffs(h, outh, yofs, beta, align_corner);

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const Mat src = bottom_blob.channel(q);
            Mat dst = top_blob.channel(q);

            resize_bicubic_image(src, dst, alpha, xofs, beta, yofs);
        }

        delete[] buf;
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/interp_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_INTERP_LOONGARCH_H
#define LAYER_INTERP_LOONGARCH_H

#include "interp.h"

namespace ncnn {

// LoongArch-specific variant of the Interp layer declared in "interp.h".
// Only declarations live here; both members are defined out of line.
class Interp_loongarch : public Interp
{
public:
    // Default constructor.
    Interp_loongarch();

    // Multi-blob forward pass: bottom_blobs are read-only inputs, top_blobs
    // receives the outputs, opt carries the runtime options.
    // NOTE(review): the int return is presumably 0 on success and negative on
    // failure - confirm against the definition in the .cpp file.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_INTERP_LOONGARCH_H


================================================
FILE: src/layer/loongarch/lasx_mathfun.h
================================================
// Copyright 2025 AtomAlpaca <atal@anche.no>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LASX_MATHFUN_H
#define LASX_MATHFUN_H

#include "loongarch_usability.h"

#include <lasxintrin.h>

// Small numeric constants shared by the 256-bit (LASX) math routines below.
// NOTE(review): _LOONGARCH_FLOAT_CONST_PS256 is defined in loongarch_usability.h
// (not visible here). The routines in this file read every constant as
// _ps256_<name>.i and broadcast that 32-bit pattern with __lasx_xvreplgr2vr_w.
_LOONGARCH_FLOAT_CONST_PS256(c_0, 0.0f);
_LOONGARCH_FLOAT_CONST_PS256(c_1, 1.0f);
_LOONGARCH_FLOAT_CONST_PS256(c_2, 2.0f);
_LOONGARCH_FLOAT_CONST_PS256(c_3, 3.0f);
_LOONGARCH_FLOAT_CONST_PS256(c_4, 4.0f);
_LOONGARCH_FLOAT_CONST_PS256(c_n1, -1.0f);
_LOONGARCH_FLOAT_CONST_PS256(c_n3, -3.0f);
_LOONGARCH_FLOAT_CONST_PS256(c_0p5, 0.5f);
_LOONGARCH_FLOAT_CONST_PS256(c_eps, 1E-8f);

// Complement of the binary32 exponent-field mask (0x7f800000): AND-ing with it
// keeps the sign and mantissa bits and clears the 8 exponent bits.
#define c_inv_mant_mask ~0x7f800000u
// Constants consumed by log256_ps: the sqrt(1/2) split threshold, polynomial
// coefficients p0..p8, and q1/q2 whose sum is ln(2).
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_SQRTHF, 0.707106781186547524);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_log_p0, 7.0376836292E-2);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_log_p1, -1.1514610310E-1);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_log_p2, 1.1676998740E-1);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_log_p3, -1.2420140846E-1);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_log_p4, +1.4249322787E-1);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_log_p5, -1.6668057665E-1);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_log_p6, +2.0000714765E-1);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_log_p7, -2.4999993993E-1);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_log_p8, +3.3333331174E-1);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_log_q1, -2.12194440e-4);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_log_q2, 0.693359375);

/* Natural logarithm of every packed single-precision lane of x.
 *   An __m256 carries eight floats, so eight results are produced per call.
 *   Lanes with x <= 0 come back with all bits set (a NaN pattern).
 */
static inline __m256 log256_ps(__m256 x)
{
    const __m256 v_zero = (__m256)__lasx_xvreplgr2vr_w(0);
    const __m256 v_one = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_1.i);

    /* lane-wise max with 0, then flag every lane that is still <= 0 */
    x = __lasx_xvfmax_s(x, v_zero);
    const __m256i nonpositive = __lasx_xvfcmp_cle_s(x, v_zero);

    /* split the raw bits: shifted-down exponent field on one side ... */
    __m256i raw = (__m256i)(x);
    __m256i expo = __lasx_xvsrl_w(raw, __lasx_xvreplgr2vr_w(23));

    /* ... and the mantissa with the exponent bits of 0.5 spliced in */
    raw = __lasx_xvand_v(raw, __lasx_xvreplgr2vr_w(c_inv_mant_mask));
    raw = __lasx_xvor_v(raw, __lasx_xvreplgr2vr_w(_ps256_c_0p5.i));
    x = (__m256)(raw);

    /* remove the exponent bias, convert to float and add one */
    expo = __lasx_xvsub_w(expo, __lasx_xvreplgr2vr_w(0x7f));
    __m256 e = __lasx_xvfadd_s(__lasx_xvffint_s_w(expo), v_one);

    /* branch-free form of:
     *     if (x < SQRTHF) { e -= 1; x = x + x - 1.0; }
     *     else            { x = x - 1.0; }
     */
    const __m256i below = __lasx_xvfcmp_clt_s((__m256)x, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_SQRTHF.i));
    const __m256 x_if_below = (__m256)(__lasx_xvand_v((__m256i)(x), (__m256i)below));
    const __m256 one_if_below = (__m256)(__lasx_xvand_v((__m256i)(v_one), (__m256i)below));
    x = __lasx_xvfsub_s(x, v_one);
    e = __lasx_xvfsub_s(e, one_if_below);
    x = __lasx_xvfadd_s(x, x_if_below);

    const __m256 xx = __lasx_xvfmul_s(x, x);

    /* Horner evaluation of p0..p8 in x */
    __m256 poly = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_log_p0.i);
    poly = __lasx_xvfmadd_s(x, poly, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_log_p1.i));
    poly = __lasx_xvfmadd_s(x, poly, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_log_p2.i));
    poly = __lasx_xvfmadd_s(x, poly, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_log_p3.i));
    poly = __lasx_xvfmadd_s(x, poly, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_log_p4.i));
    poly = __lasx_xvfmadd_s(x, poly, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_log_p5.i));
    poly = __lasx_xvfmadd_s(x, poly, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_log_p6.i));
    poly = __lasx_xvfmadd_s(x, poly, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_log_p7.i));
    poly = __lasx_xvfmadd_s(x, poly, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_log_p8.i));

    /* poly * x^3 */
    poly = __lasx_xvfmul_s(poly, x);
    poly = __lasx_xvfmul_s(poly, xx);

    /* + e * q1 */
    const __m256 e_q1 = __lasx_xvfmul_s(e, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_log_q1.i));
    poly = __lasx_xvfadd_s(poly, e_q1);

    /* - 0.5 * x^2 */
    const __m256 half_xx = __lasx_xvfmul_s(xx, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_0p5.i));
    poly = __lasx_xvfsub_s(poly, half_xx);

    /* x + poly + e * q2 */
    const __m256 e_q2 = __lasx_xvfmul_s(e, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_log_q2.i));
    __m256 result = __lasx_xvfadd_s(x, poly);
    result = __lasx_xvfadd_s(result, e_q2);

    /* OR in the all-ones mask so every flagged lane becomes NaN */
    return (__m256)(__lasx_xvor_v((__m256i)(result), (__m256i)nonpositive));
}

// Upper and lower clamp applied to the argument of exp256_ps.
_LOONGARCH_FLOAT_CONST_PS256(c_exp_hi, 88.3762626647949f);
_LOONGARCH_FLOAT_CONST_PS256(c_exp_lo, -88.3762626647949f);

// log2(e), and C1/C2 whose sum is ln(2); exp256_ps subtracts n*C1 and n*C2
// from its argument in two separate steps.
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_LOG2EF, 1.44269504088896341);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_exp_C1, 0.693359375);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_exp_C2, -2.12194440e-4);

// Polynomial coefficients p0..p5 evaluated by exp256_ps on the reduced argument.
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_exp_p0, 1.9875691500E-4);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_exp_p1, 1.3981999507E-3);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_exp_p2, 8.3334519073E-3);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_exp_p3, 4.1665795894E-2);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_exp_p4, 1.6666665459E-1);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_exp_p5, 5.0000001201E-1);

/* exp() of every packed single-precision lane of x (eight floats per __m256) */
static inline __m256 exp256_ps(__m256 x)
{
    const __m256 v_one = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_1.i);

    /* clamp the argument to [c_exp_lo, c_exp_hi] */
    x = __lasx_xvfmin_s(x, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_exp_hi.i));
    x = __lasx_xvfmax_s(x, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_exp_lo.i));

    /* express exp(x) as exp(g + n*log(2)); n starts out as x * log2(e) + 0.5 */
    __m256 n = __lasx_xvfmul_s(x, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_LOG2EF.i));
    n = __lasx_xvfadd_s(n, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_0p5.i));

    /* floorf(n): go through an integer conversion and back, then subtract 1
     * in every lane where the round trip ended up above n */
    const __m256 roundtrip = __lasx_xvffint_s_w(__lasx_xvftint_w_s(n));
    __m256i overshoot = __lasx_xvfcmp_clt_s(n, roundtrip);
    overshoot = __lasx_xvand_v(overshoot, (__m256i)v_one);
    n = __lasx_xvfsub_s(roundtrip, (__m256)overshoot);

    /* g = x - n*C1 - n*C2 */
    const __m256 n_c1 = __lasx_xvfmul_s(n, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_exp_C1.i));
    const __m256 n_c2 = __lasx_xvfmul_s(n, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_exp_C2.i));
    x = __lasx_xvfsub_s(x, n_c1);
    x = __lasx_xvfsub_s(x, n_c2);

    const __m256 gg = __lasx_xvfmul_s(x, x);

    /* Horner evaluation of p0..p5 in g */
    __m256 poly = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_exp_p0.i);
    poly = __lasx_xvfmadd_s(x, poly, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_exp_p1.i));
    poly = __lasx_xvfmadd_s(x, poly, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_exp_p2.i));
    poly = __lasx_xvfmadd_s(x, poly, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_exp_p3.i));
    poly = __lasx_xvfmadd_s(x, poly, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_exp_p4.i));
    poly = __lasx_xvfmadd_s(x, poly, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_exp_p5.i));

    /* poly * g^2 + g + 1 */
    poly = __lasx_xvfmul_s(poly, gg);
    poly = __lasx_xvfadd_s(poly, x);
    poly = __lasx_xvfadd_s(poly, v_one);

    /* build 2^n by writing the biased exponent straight into the float bits */
    __m256i pow2 = __lasx_xvftintrz_w_s(n);
    pow2 = __lasx_xvadd_w(pow2, __lasx_xvreplgr2vr_w(0x7f));
    pow2 = __lasx_xvsll_w(pow2, __lasx_xvreplgr2vr_w(23));

    return __lasx_xvfmul_s(poly, (__m256)pow2);
}

// Negated three-part split of pi/4 (DP1 + DP2 + DP3); the sin/cos routines add
// j*DP1, j*DP2 and j*DP3 to |x| one after another for range reduction.
_LOONGARCH_FLOAT_CONST_PS256(c_minus_cephes_DP1, -0.78515625f);
_LOONGARCH_FLOAT_CONST_PS256(c_minus_cephes_DP2, -2.4187564849853515625e-4f);
_LOONGARCH_FLOAT_CONST_PS256(c_minus_cephes_DP3, -3.77489497744594108e-8f);
// Coefficients of the sine and cosine polynomials evaluated on the reduced argument.
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_sin_p0, -1.9515295891E-4f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_sin_p1, 8.3321608736E-3f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_sin_p2, -1.6666654611E-1f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_cos_p0, 2.443315711809948E-005f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_cos_p1, -1.388731625493765E-003f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_cos_p2, 4.166664568298827E-002f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_FOPI, 1.27323954473516f); // 4/PI

/* sin() of every packed single-precision lane of x */
static inline __m256 sin256_ps(__m256 x)
{
    const __m256 v_one = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_1.i);
    /* -0.5, built as (-1) * 0.5 */
    const __m256 minus_half = __lasx_xvfmul_s((__m256)__lasx_xvreplgr2vr_w(_ps256_c_n1.i), (__m256)__lasx_xvreplgr2vr_w(_ps256_c_0p5.i));

    /* peel the sign off and carry on with |x| */
    __m256i sign = __lasx_xvand_v((__m256i)x, __lasx_xvreplgr2vr_w(0x80000000));
    x = (__m256)__lasx_xvand_v((__m256i)x, __lasx_xvreplgr2vr_w(0x7fffffff));

    /* j = (trunc(|x| * 4/pi) + 1) & ~1, kept both as integer and as float */
    const __m256 scaled = __lasx_xvfmul_s(x, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_FOPI.i));
    __m256i j = __lasx_xvftintrz_w_s(scaled);
    j = __lasx_xvadd_w(j, __lasx_xvreplgr2vr_w(1));
    j = __lasx_xvand_v(j, __lasx_xvreplgr2vr_w(~1));
    const __m256 jf = __lasx_xvffint_s_w(j);

    /* bit 2 of j, moved into the sign position, flips the result's sign */
    __m256i flip = __lasx_xvand_v(j, __lasx_xvreplgr2vr_w(4));
    flip = __lasx_xvslli_w(flip, 29);
    sign = __lasx_xvxor_v(sign, flip);

    /* all-ones where bit 1 of j is clear: those lanes use the sine polynomial */
    __m256i use_sin = __lasx_xvand_v(j, __lasx_xvreplgr2vr_w(2));
    use_sin = __lasx_xvseq_w(use_sin, __lasx_xvreplgr2vr_w(0));

    /* range reduction: x += j*DP1, then j*DP2, then j*DP3 (all negative) */
    x = __lasx_xvfmadd_s(jf, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_minus_cephes_DP1.i), x);
    x = __lasx_xvfmadd_s(jf, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_minus_cephes_DP2.i), x);
    x = __lasx_xvfmadd_s(jf, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_minus_cephes_DP3.i), x);

    const __m256 z = __lasx_xvfmul_s(x, x);

    /* cosine polynomial: ((p0*z + p1)*z + p2)*z*z - 0.5*z + 1 */
    __m256 pc = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_cos_p0.i);
    pc = __lasx_xvfmadd_s(pc, z, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_cos_p1.i));
    pc = __lasx_xvfmadd_s(pc, z, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_cos_p2.i));
    pc = __lasx_xvfmul_s(pc, z);
    pc = __lasx_xvfmul_s(pc, z);
    pc = __lasx_xvfmadd_s(z, minus_half, pc);
    pc = __lasx_xvfadd_s(pc, v_one);

    /* sine polynomial: ((p0*z + p1)*z + p2)*z*x + x */
    __m256 ps = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_sin_p0.i);
    ps = __lasx_xvfmadd_s(ps, z, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_sin_p1.i));
    ps = __lasx_xvfmadd_s(ps, z, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_sin_p2.i));
    ps = __lasx_xvfmul_s(ps, z);
    ps = __lasx_xvfmadd_s(ps, x, x);

    /* keep one polynomial per lane, merge them, then apply the sign */
    ps = (__m256)__lasx_xvand_v((__m256i)ps, use_sin);
    pc = (__m256)__lasx_xvand_v(__lasx_xvxor_v(use_sin, __lasx_xvreplgr2vr_w(0xffffffff)), (__m256i)pc);
    const __m256 merged = __lasx_xvfadd_s(pc, ps);

    return (__m256)__lasx_xvxor_v((__m256i)merged, sign);
}

/* cos() of every packed single-precision lane of x */
static inline __m256 cos256_ps(__m256 x)
{
    const __m256 v_one = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_1.i);
    /* -0.5, built as (-1) * 0.5 */
    const __m256 minus_half = __lasx_xvfmul_s((__m256)__lasx_xvreplgr2vr_w(_ps256_c_n1.i), (__m256)__lasx_xvreplgr2vr_w(_ps256_c_0p5.i));

    /* only |x| is needed */
    x = (__m256)__lasx_xvand_v((__m256i)x, __lasx_xvreplgr2vr_w(0x7fffffff));

    /* j = (trunc(|x| * 4/pi) + 1) & ~1; the float copy is taken before j -= 2 */
    const __m256 scaled = __lasx_xvfmul_s(x, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_FOPI.i));
    __m256i j = __lasx_xvftintrz_w_s(scaled);
    j = __lasx_xvadd_w(j, __lasx_xvreplgr2vr_w(1));
    j = __lasx_xvand_v(j, __lasx_xvreplgr2vr_w(~1));
    const __m256 jf = __lasx_xvffint_s_w(j);
    j = __lasx_xvsub_w(j, __lasx_xvreplgr2vr_w(2));

    /* sign of the result: (~j & 4) moved into the sign position */
    __m256i sign = __lasx_xvandn_v(j, __lasx_xvreplgr2vr_w(4));
    sign = __lasx_xvslli_w(sign, 29);

    /* all-ones where bit 1 of j is clear: those lanes use the sine polynomial */
    __m256i use_sin = __lasx_xvand_v(j, __lasx_xvreplgr2vr_w(2));
    use_sin = __lasx_xvseq_w(use_sin, __lasx_xvreplgr2vr_w(0));

    /* range reduction: x += j*DP1, then j*DP2, then j*DP3 (all negative) */
    x = __lasx_xvfmadd_s(jf, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_minus_cephes_DP1.i), x);
    x = __lasx_xvfmadd_s(jf, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_minus_cephes_DP2.i), x);
    x = __lasx_xvfmadd_s(jf, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_minus_cephes_DP3.i), x);

    const __m256 z = __lasx_xvfmul_s(x, x);

    /* cosine polynomial: ((p0*z + p1)*z + p2)*z*z - 0.5*z + 1 */
    __m256 pc = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_cos_p0.i);
    pc = __lasx_xvfmadd_s(pc, z, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_cos_p1.i));
    pc = __lasx_xvfmadd_s(pc, z, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_cos_p2.i));
    pc = __lasx_xvfmul_s(pc, z);
    pc = __lasx_xvfmul_s(pc, z);
    pc = __lasx_xvfmadd_s(z, minus_half, pc);
    pc = __lasx_xvfadd_s(pc, v_one);

    /* sine polynomial: ((p0*z + p1)*z + p2)*z*x + x */
    __m256 ps = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_sin_p0.i);
    ps = __lasx_xvfmadd_s(ps, z, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_sin_p1.i));
    ps = __lasx_xvfmadd_s(ps, z, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_sin_p2.i));
    ps = __lasx_xvfmul_s(ps, z);
    ps = __lasx_xvfmadd_s(ps, x, x);

    /* keep one polynomial per lane, merge them, then apply the sign */
    ps = (__m256)__lasx_xvand_v((__m256i)ps, use_sin);
    pc = (__m256)__lasx_xvandn_v(use_sin, (__m256i)pc);
    const __m256 merged = __lasx_xvfadd_s(pc, ps);

    return (__m256)__lasx_xvxor_v((__m256i)merged, sign);
}

/* Computes sin() and cos() of every packed single-precision lane of x in one
 * pass; the sine is stored through s and the cosine through c. */
static inline void sincos256_ps(__m256 x, __m256* s, __m256* c)
{
    const __m256 v_one = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_1.i);
    /* -0.5, built as (-1) * 0.5 */
    const __m256 minus_half = __lasx_xvfmul_s((__m256)__lasx_xvreplgr2vr_w(_ps256_c_n1.i), (__m256)__lasx_xvreplgr2vr_w(_ps256_c_0p5.i));

    /* the sine keeps the input sign; everything else works on |x| */
    __m256i sin_sign = __lasx_xvand_v((__m256i)x, __lasx_xvreplgr2vr_w(0x80000000));
    x = (__m256)__lasx_xvand_v((__m256i)x, __lasx_xvreplgr2vr_w(0x7fffffff));

    /* j = (trunc(|x| * 4/pi) + 1) & ~1, kept both as integer and as float */
    const __m256 scaled = __lasx_xvfmul_s(x, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_FOPI.i));
    __m256i j = __lasx_xvftintrz_w_s(scaled);
    j = __lasx_xvadd_w(j, __lasx_xvreplgr2vr_w(1));
    j = __lasx_xvand_v(j, __lasx_xvreplgr2vr_w(~1));
    const __m256 jf = __lasx_xvffint_s_w(j);

    /* cosine sign: (~(j - 2) & 4) moved into the sign position */
    __m256i cos_sign = __lasx_xvsub_w(j, __lasx_xvreplgr2vr_w(2));
    cos_sign = __lasx_xvandn_v(cos_sign, __lasx_xvreplgr2vr_w(4));
    cos_sign = __lasx_xvslli_w(cos_sign, 29);

    /* sine sign flip: (j & 4) moved into the sign position */
    __m256i sin_flip = __lasx_xvand_v(j, __lasx_xvreplgr2vr_w(4));
    sin_flip = __lasx_xvslli_w(sin_flip, 29);
    sin_sign = __lasx_xvxor_v(sin_sign, sin_flip);

    /* all-ones where bit 1 of j is clear */
    __m256i sel = __lasx_xvand_v(j, __lasx_xvreplgr2vr_w(2));
    sel = __lasx_xvseq_w(sel, __lasx_xvreplgr2vr_w(0));

    /* range reduction: x += j*DP1, then j*DP2, then j*DP3 (all negative) */
    x = __lasx_xvfmadd_s(jf, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_minus_cephes_DP1.i), x);
    x = __lasx_xvfmadd_s(jf, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_minus_cephes_DP2.i), x);
    x = __lasx_xvfmadd_s(jf, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_minus_cephes_DP3.i), x);

    const __m256 z = __lasx_xvfmul_s(x, x);

    /* cosine polynomial: ((p0*z + p1)*z + p2)*z*z - 0.5*z + 1 */
    __m256 pc = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_cos_p0.i);
    pc = __lasx_xvfmadd_s(pc, z, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_cos_p1.i));
    pc = __lasx_xvfmadd_s(pc, z, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_cos_p2.i));
    pc = __lasx_xvfmul_s(pc, z);
    pc = __lasx_xvfmul_s(pc, z);
    pc = __lasx_xvfmadd_s(z, minus_half, pc);
    pc = __lasx_xvfadd_s(pc, v_one);

    /* sine polynomial: ((p0*z + p1)*z + p2)*z*x + x */
    __m256 ps = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_sin_p0.i);
    ps = __lasx_xvfmadd_s(ps, z, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_sin_p1.i));
    ps = __lasx_xvfmadd_s(ps, z, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_sin_p2.i));
    ps = __lasx_xvfmul_s(ps, z);
    ps = __lasx_xvfmadd_s(ps, x, x);

    /* per lane, one polynomial goes to the sine output and the other one to
     * the cosine output; the subtractions remove the part already claimed */
    const __m256 sin_from_pc = (__m256)__lasx_xvandn_v(sel, (__m256i)pc);
    const __m256 sin_from_ps = (__m256)__lasx_xvand_v(sel, (__m256i)ps);
    ps = __lasx_xvfsub_s(ps, sin_from_ps);
    pc = __lasx_xvfsub_s(pc, sin_from_pc);

    const __m256 sin_mag = __lasx_xvfadd_s(sin_from_pc, sin_from_ps);
    const __m256 cos_mag = __lasx_xvfadd_s(pc, ps);

    *s = (__m256)__lasx_xvxor_v((__m256i)sin_mag, sin_sign);
    *c = (__m256)__lasx_xvxor_v((__m256i)cos_mag, cos_sign);
}

/* tan() of every packed single-precision lane of x, computed as sin(x) / cos(x).
 *   Lanes whose cosine compares equal to zero get c_eps (1E-8) added to the
 *   denominator before the division, so the quotient never divides by 0.
 */
static inline __m256 tan256_ps(__m256 x)
{
    __m256 ysin, ycos;
    __m256 eps = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_eps.i);
    __m256 zero = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_0.i);
    sincos256_ps(x, &ysin, &ycos);
    /* The guard must test against zero: comparing with eps would leave an
     * exactly-zero cosine untouched and would double a cosine equal to eps. */
    __m256i mask = __lasx_xvfcmp_ceq_s(ycos, zero);
    /* eps in the flagged lanes, +0.0f everywhere else */
    mask = __lasx_xvand_v(mask, (__m256i)eps);
    ycos = __lasx_xvfadd_s(ycos, (__m256)mask);
    __m256 ytan = __lasx_xvfdiv_s(ysin, ycos);
    return ytan;
}

// Polynomial coefficients a0..a5 shared by asin256_ps and acos256_ps. They are
// listed in pairs (a4/a5, a2/a3, a0/a1), matching the order in which the two
// interleaved fused multiply-add chains consume them.
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_asin_a4, 0.023994016f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_asin_a5, 0.042417344f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_asin_a2, 0.07494697f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_asin_a3, 0.045520633f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_asin_a0, 1.0f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_asin_a1, 0.166667819f);
// pi/2, pi and -pi; also used by the atan256_ps / atan2256_ps routines below.
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_asin_half_pi, 1.5707964f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_asin_pi, 3.1415927f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_asin_npi, -3.1415927f);

/* asin() of every packed single-precision lane of x.
 *   |x| <= 0.5 : polynomial evaluated directly on |x|
 *   |x| >  0.5 : polynomial evaluated on sqrt(0.5 * (1 - |x|)), then mapped
 *                back as pi/2 - 2 * value
 *   The sign bit of the input is OR-ed back onto the result.
 */
static inline __m256 asin256_ps(__m256 x)
{
    const __m256 v_one = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_1.i);
    const __m256 v_half = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_0p5.i);

    /* split into sign bit and magnitude */
    const __m256i sign = __lasx_xvand_v((__m256i)x, __lasx_xvreplgr2vr_w(0x80000000));
    x = (__m256)__lasx_xvand_v((__m256i)x, __lasx_xvreplgr2vr_w(0x7fffffff));

    /* lane masks, plus a 1.0f / 0.0f float version of the "big" mask */
    const __m256i small = __lasx_xvfcmp_cle_s(x, v_half);
    const __m256i big = __lasx_xvxor_v(small, __lasx_xvreplgr2vr_w(0xffffffff));
    const __m256 big_as_one = (__m256)__lasx_xvand_v(__lasx_xvreplgr2vr_w(_ps256_c_1.i), big);

    /* sqrt(0.5 * (1 - |x|)) */
    __m256 root = __lasx_xvfsub_s(v_one, x);
    root = __lasx_xvfmul_s(v_half, root);
    root = __lasx_xvfsqrt_s(root);

    /* t = small ? |x| : root */
    __m256 t = (__m256)__lasx_xvand_v(small, (__m256i)x);
    t = (__m256)__lasx_xvor_v((__m256i)t, __lasx_xvand_v(big, (__m256i)root));

    const __m256 t2 = __lasx_xvfmul_s(t, t);
    const __m256 t4 = __lasx_xvfmul_s(t2, t2);

    /* two chains in t^4: (a4, a2, a0) and (a5, a3, a1), joined through t^2 */
    __m256 chain_a = __lasx_xvfmadd_s(t4, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_a4.i), (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_a2.i));
    __m256 chain_b = __lasx_xvfmadd_s(t4, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_a5.i), (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_a3.i));
    chain_a = __lasx_xvfmadd_s(t4, chain_a, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_a0.i));
    chain_b = __lasx_xvfmadd_s(t4, chain_b, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_a1.i));
    const __m256 poly = __lasx_xvfmadd_s(t2, chain_b, chain_a);

    /* offset = big ? pi/2 : 0 ; scale = big ? -2 : 1 (as 1 - 3 * big) */
    const __m256 offset = __lasx_xvfmul_s((__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_half_pi.i), big_as_one);
    const __m256 value = __lasx_xvfmul_s(poly, t);
    const __m256 scale = __lasx_xvfmadd_s((__m256)__lasx_xvreplgr2vr_w(_ps256_c_n3.i), big_as_one, v_one);

    __m256 result = __lasx_xvfmadd_s(value, scale, offset);
    result = (__m256)__lasx_xvor_v((__m256i)result, sign);

    return result;
}

/* acos() of every packed single-precision lane of x.
 *   |x| <= 0.5 : pi/2 - (value | sign)
 *   |x| >  0.5 : (x < 0 ? pi : 0) + ((value + value) | sign)
 *   where value is the shared asin polynomial times its argument.
 */
static inline __m256 acos256_ps(__m256 x)
{
    const __m256 v_one = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_1.i);
    const __m256 v_half = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_0p5.i);

    /* negative-input mask is taken before the sign is stripped */
    const __m256i negative = __lasx_xvfcmp_clt_s(x, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_0.i));
    const __m256i sign = __lasx_xvand_v((__m256i)x, __lasx_xvreplgr2vr_w(0x80000000));
    x = (__m256)__lasx_xvand_v((__m256i)x, __lasx_xvreplgr2vr_w(0x7fffffff));

    const __m256i small = __lasx_xvfcmp_cle_s(x, v_half);
    const __m256i big = __lasx_xvxor_v(small, __lasx_xvreplgr2vr_w(0xffffffff));

    /* sqrt(0.5 * (1 - |x|)) */
    __m256 root = __lasx_xvfsub_s(v_one, x);
    root = __lasx_xvfmul_s(v_half, root);
    root = __lasx_xvfsqrt_s(root);

    /* t = small ? |x| : root */
    __m256 t = (__m256)__lasx_xvand_v(small, (__m256i)x);
    t = (__m256)__lasx_xvor_v((__m256i)t, __lasx_xvand_v(big, (__m256i)root));

    const __m256 t2 = __lasx_xvfmul_s(t, t);
    const __m256 t4 = __lasx_xvfmul_s(t2, t2);

    /* two chains in t^4: (a4, a2, a0) and (a5, a3, a1), joined through t^2 */
    __m256 chain_a = __lasx_xvfmadd_s(t4, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_a4.i), (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_a2.i));
    __m256 chain_b = __lasx_xvfmadd_s(t4, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_a5.i), (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_a3.i));
    chain_a = __lasx_xvfmadd_s(t4, chain_a, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_a0.i));
    chain_b = __lasx_xvfmadd_s(t4, chain_b, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_a1.i));
    const __m256 poly = __lasx_xvfmadd_s(t2, chain_b, chain_a);

    const __m256 value = __lasx_xvfmul_s(t, poly);

    /* small-input branch: pi/2 - (value | sign) */
    __m256 small_result = (__m256)__lasx_xvor_v((__m256i)value, sign);
    small_result = __lasx_xvfsub_s((__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_half_pi.i), small_result);

    /* big-input branch: (negative ? pi : 0) + ((value + value) | sign) */
    __m256 big_result = (__m256)__lasx_xvand_v(negative, __lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_pi.i));
    __m256 doubled = __lasx_xvfadd_s(value, value);
    doubled = (__m256)__lasx_xvor_v((__m256i)doubled, sign);
    big_result = __lasx_xvfadd_s(big_result, doubled);

    /* pick the branch per lane */
    __m256 result = (__m256)__lasx_xvand_v(small, (__m256i)small_result);
    result = (__m256)__lasx_xvor_v((__m256i)result, __lasx_xvand_v(big, (__m256i)big_result));

    return result;
}

// Polynomial coefficients x0..x8 evaluated by atan256_ps on the reduced input.
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_atan_x0, 1.0f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_atan_x1, -0.33333072f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_atan_x2, 0.1999262f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_atan_x3, -0.14203644f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_atan_x4, 0.10640934f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_atan_x5, -0.07504295f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_atan_x6, 0.04269152f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_atan_x7, -0.01606863f);
_LOONGARCH_FLOAT_CONST_PS256(c_cephes_atan_x8, 0.0028498897f);

/* atan() of every packed single-precision lane of x.
 *   |x| <= 1 : polynomial on |x|
 *   |x| >  1 : polynomial on -1 / |x|, plus pi/2
 *   The sign bit of the input is XOR-ed onto the result.
 */
static inline __m256 atan256_ps(__m256 x)
{
    /* split into sign bit and magnitude */
    const __m256i sign = __lasx_xvand_v((__m256i)x, __lasx_xvreplgr2vr_w(0x80000000));
    x = (__m256)__lasx_xvand_v((__m256i)x, __lasx_xvreplgr2vr_w(0x7fffffff));

    /* all-ones where 1 < |x|, and its complement */
    const __m256i above_one = __lasx_xvfcmp_clt_s((__m256)__lasx_xvreplgr2vr_w(_ps256_c_1.i), x);
    const __m256i not_above = __lasx_xvxor_v(above_one, __lasx_xvreplgr2vr_w(0xffffffff));

    /* numerator: above_one ? -1 : |x| */
    __m256 num = (__m256)__lasx_xvand_v(above_one, __lasx_xvreplgr2vr_w(_ps256_c_n1.i));
    num = (__m256)__lasx_xvor_v(__lasx_xvand_v(not_above, (__m256i)x), (__m256i)num);

    /* denominator: above_one ? |x| : 1 */
    __m256 den = (__m256)__lasx_xvand_v(above_one, (__m256i)x);
    den = (__m256)__lasx_xvor_v(__lasx_xvand_v((__m256i)not_above, __lasx_xvreplgr2vr_w(_ps256_c_1.i)), (__m256i)den);

    const __m256 t = __lasx_xvfdiv_s(num, den);
    const __m256 t2 = __lasx_xvfmul_s(t, t);
    const __m256 t4 = __lasx_xvfmul_s(t2, t2);

    /* two chains in t^4: (x7, x5, x3, x1) and (x8, x6, x4, x2, x0), joined through t^2 */
    __m256 odd = __lasx_xvfmadd_s(t4, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_atan_x7.i), (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_atan_x5.i));
    __m256 even = __lasx_xvfmadd_s(t4, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_atan_x8.i), (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_atan_x6.i));
    odd = __lasx_xvfmadd_s(t4, odd, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_atan_x3.i));
    even = __lasx_xvfmadd_s(t4, even, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_atan_x4.i));
    odd = __lasx_xvfmadd_s(t4, odd, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_atan_x1.i));
    even = __lasx_xvfmadd_s(t4, even, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_atan_x2.i));
    even = __lasx_xvfmadd_s(t4, even, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_cephes_atan_x0.i));
    const __m256 poly = __lasx_xvfmadd_s(t2, odd, even);

    /* t * poly, plus pi/2 in the lanes where |x| > 1, then restore the sign */
    __m256 result = __lasx_xvfmul_s(t, poly);
    const __m256 offset = (__m256)__lasx_xvand_v(above_one, __lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_half_pi.i));
    result = __lasx_xvfadd_s(result, offset);
    result = (__m256)__lasx_xvxor_v(sign, (__m256i)result);
    return result;
}

/* atan2() computed for 8 float at once
 *
 *   Lanes where both x and y compare not-equal to zero use
 *   atan(y / x) plus a quadrant correction that is only applied when x < 0
 *   (the *_npi constant for y < 0, the *_pi constant otherwise).
 *
 *   Lanes where x or y is zero skip the division result and take:
 *     y != 0 (hence x == 0)  ->  pi/2 carrying the sign bit of y
 *     y == 0 and x <  0      ->  pi
 *     y == 0 and x >= 0      ->  0
 *
 *   NOTE(review): signed zeros are not distinguished in the y == 0 case
 *   (atan2(0, -0) yields 0 here); confirm that callers do not depend on it.
 */
static inline __m256 atan2256_ps(__m256 y, __m256 x)
{
    __m256i not_eq_zero_x, not_eq_zero_y, normal_mode, negative_mask_y;
    __m256i lt_zero_mask_x, lt_zero_mask_y, ge_zero_mask_y, eq_zero_y;
    __m256 pi_additions, tmp1, tmp2, normal_result, special_result, final_result;

    // per-lane masks: the general formula is only valid where both inputs are non-zero
    not_eq_zero_x = __lasx_xvfcmp_cne_s(x, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_0.i));
    not_eq_zero_y = __lasx_xvfcmp_cne_s(y, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_0.i));
    eq_zero_y = __lasx_xvxor_v(not_eq_zero_y, __lasx_xvreplgr2vr_w(0xffffffff));
    normal_mode = __lasx_xvand_v(not_eq_zero_x, not_eq_zero_y);
    // isolated sign bit of y (0x80000000 or 0); this is NOT a full-width lane mask
    negative_mask_y = __lasx_xvand_v((__m256i)y, __lasx_xvreplgr2vr_w(0x80000000));

    lt_zero_mask_x = __lasx_xvfcmp_clt_s(x, (__m256)__lasx_xvreplgr2vr_w(0));
    lt_zero_mask_y = __lasx_xvfcmp_clt_s(y, (__m256)__lasx_xvreplgr2vr_w(0));
    ge_zero_mask_y = __lasx_xvxor_v(lt_zero_mask_y, __lasx_xvreplgr2vr_w(0xffffffff));

    // quadrant correction: select npi / pi by the sign of y, then keep it only where x < 0
    pi_additions = (__m256)__lasx_xvand_v(lt_zero_mask_y, __lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_npi.i));
    pi_additions = (__m256)__lasx_xvor_v(__lasx_xvand_v(ge_zero_mask_y, __lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_pi.i)), (__m256i)pi_additions);
    pi_additions = (__m256)__lasx_xvand_v(lt_zero_mask_x, (__m256i)pi_additions);

    normal_result = __lasx_xvfdiv_s(y, x);
    normal_result = __lasx_xvfadd_s(atan256_ps(normal_result), pi_additions);

    // x == 0, y != 0: OR the sign bit of y into pi/2.
    // (AND-ing the lone sign bit with the positive constant would always give 0.)
    tmp1 = (__m256)__lasx_xvor_v(negative_mask_y, __lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_half_pi.i));
    // y == 0: pi where x < 0, else 0. A full-width compare mask is required here,
    // the isolated sign bit of x cannot gate the bits of the constant.
    tmp2 = (__m256)__lasx_xvand_v(lt_zero_mask_x, __lasx_xvreplgr2vr_w(_ps256_c_cephes_asin_pi.i));
    special_result = (__m256)__lasx_xvand_v(not_eq_zero_y, (__m256i)tmp1);
    special_result = (__m256)__lasx_xvor_v(__lasx_xvand_v(eq_zero_y, (__m256i)tmp2), (__m256i)special_result);

    // blend: general result where both inputs are non-zero, special result elsewhere
    final_result = (__m256)__lasx_xvand_v(normal_mode, (__m256i)normal_result);
    normal_mode = __lasx_xvxor_v(normal_mode, __lasx_xvreplgr2vr_w(0xffffffff));
    final_result = (__m256)__lasx_xvor_v(__lasx_xvand_v(normal_mode, (__m256i)special_result), (__m256i)final_result);

    return final_result;
}

// Constants for tanh256_ps(), which evaluates an odd numerator polynomial
// divided by an even denominator polynomial on |x|.
// c_tanh_tiny: inputs with |x| below this are returned unchanged.
// c_tanh_hi:   |x| is clamped to this value before the polynomials are evaluated.
_LOONGARCH_FLOAT_CONST_PS256(c_tanh_tiny, 1e-4f);
_LOONGARCH_FLOAT_CONST_PS256(c_tanh_hi, 9.0f);
// The monomial coefficients of the numerator polynomial (odd).
_LOONGARCH_FLOAT_CONST_PS256(c_tanh_alpha_1, 4.89352455891786e-3f);
_LOONGARCH_FLOAT_CONST_PS256(c_tanh_alpha_3, 6.37261928875436e-4f);
_LOONGARCH_FLOAT_CONST_PS256(c_tanh_alpha_5, 1.48572235717979e-5f);
_LOONGARCH_FLOAT_CONST_PS256(c_tanh_alpha_7, 5.12229709037114e-8f);
_LOONGARCH_FLOAT_CONST_PS256(c_tanh_alpha_9, -8.60467152213735e-11f);
_LOONGARCH_FLOAT_CONST_PS256(c_tanh_alpha_11, 2.00018790482477e-13f);
_LOONGARCH_FLOAT_CONST_PS256(c_tanh_alpha_13, -2.76076847742355e-16f);
// The monomial coefficients of the denominator polynomial (even).
_LOONGARCH_FLOAT_CONST_PS256(c_tanh_beta_0, 4.89352518554385e-3f);
_LOONGARCH_FLOAT_CONST_PS256(c_tanh_beta_2, 2.26843463243900e-3f);
_LOONGARCH_FLOAT_CONST_PS256(c_tanh_beta_4, 1.18534705686654e-4f);
_LOONGARCH_FLOAT_CONST_PS256(c_tanh_beta_6, 1.19825839466702e-6f);

/* tanh() computed for 8 float at once */
static inline __m256 tanh256_ps(__m256 x)
{
    // split the input into magnitude and sign bit; the approximation runs on the magnitude
    __m256 abs_x = (__m256)__lasx_xvbitclri_w((__m256i)x, 31);
    __m256i sign_bits = __lasx_xvand_v((__m256i)x, __lasx_xvreplgr2vr_w(1 << 31));

    // lanes whose magnitude is below c_tanh_tiny are passed through at the very end
    __m256i is_tiny = __lasx_xvfcmp_clt_s(abs_x, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_tanh_tiny.i));

    // clamp the inputs to the range [-9, 9] since anything outside
    // this range is -/+1.0f in single-precision.
    __m256i above_hi = __lasx_xvfcmp_clt_s((__m256)__lasx_xvreplgr2vr_w(_ps256_c_tanh_hi.i), abs_x);
    abs_x = (__m256)__lasx_xvbitsel_v((__m256i)abs_x, __lasx_xvreplgr2vr_w(_ps256_c_tanh_hi.i), above_hi);

    // both polynomials are evaluated in x**2 (Horner form)
    __m256 x_sq = __lasx_xvfmul_s(abs_x, abs_x);

    // numerator: odd polynomial, i.e. (polynomial in x**2) * x
    __m256 num = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_tanh_alpha_13.i);
    num = __lasx_xvfmadd_s(x_sq, num, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_tanh_alpha_11.i));
    num = __lasx_xvfmadd_s(x_sq, num, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_tanh_alpha_9.i));
    num = __lasx_xvfmadd_s(x_sq, num, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_tanh_alpha_7.i));
    num = __lasx_xvfmadd_s(x_sq, num, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_tanh_alpha_5.i));
    num = __lasx_xvfmadd_s(x_sq, num, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_tanh_alpha_3.i));
    num = __lasx_xvfmadd_s(x_sq, num, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_tanh_alpha_1.i));
    num = __lasx_xvfmul_s(num, abs_x);

    // denominator: even polynomial
    __m256 den = (__m256)__lasx_xvreplgr2vr_w(_ps256_c_tanh_beta_6.i);
    den = __lasx_xvfmadd_s(x_sq, den, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_tanh_beta_4.i));
    den = __lasx_xvfmadd_s(x_sq, den, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_tanh_beta_2.i));
    den = __lasx_xvfmadd_s(x_sq, den, (__m256)__lasx_xvreplgr2vr_w(_ps256_c_tanh_beta_0.i));

    // quotient of the two polynomials, with the original sign bit put back
    __m256 result = __lasx_xvfdiv_s(num, den);
    result = (__m256)__lasx_xvor_v((__m256i)result, sign_bits);

    // when the argument is very small in magnitude it's more accurate to just return it.
    return (__m256)__lasx_xvbitsel_v((__m256i)result, (__m256i)x, is_tiny);
}

static inline __m256 pow256_ps(__m256 a, __m256 b)
{
    // a^b is evaluated per lane as exp(b * log(a))
    __m256 log_a = log256_ps(a);
    __m256 exponent = __lasx_xvfmul_s(b, log_a);
    return exp256_ps(exponent);
}

static inline __m256 sigmoid256_ps(__m256 _v)
{
    // 1 / (1 + exp(-v)); -v is obtained by flipping bit 31 (the sign bit) of every lane
    const __m256 _one = __lasx_xvreplfr2vr_s(1.f);
    __m256 _neg = (__m256)__lasx_xvbitrevi_w((__m256i)_v, 31);
    __m256 _denom = __lasx_xvfadd_s(exp256_ps(_neg), _one);
    return __lasx_xvfdiv_s(_one, _denom);
}

#endif // LASX_MATHFUN_H


================================================
FILE: src/layer/loongarch/loongarch_activation.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LOONGARCH_ACTIVATION_H
#define LOONGARCH_ACTIVATION_H

#include "fused_activation.h"

#if __loongarch_sx
#include <lsxintrin.h>
#include "lsx_mathfun.h"

// Applies the fused activation selected by activation_type to the four lanes of _v.
//   1: max(v, 0)
//   2: v * params[0] where v <= 0, v otherwise
//   3: clamp v to [params[0], params[1]]
//   4: sigmoid(v)
//   5: v * tanh(log(exp(v) + 1))
//   6: v * clamp(params[0] * v + params[1], 0, 1)
// Any other activation_type returns _v unchanged.
static inline __m128 activation_ps(__m128 _v, int activation_type, const ncnn::Mat& activation_params)
{
    switch (activation_type)
    {
    case 1:
    {
        const __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
        return __lsx_vfmax_s(_v, _zero);
    }
    case 2:
    {
        const __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
        const __m128 _slope = (__m128)__lsx_vreplfr2vr_s(activation_params[0]);
        // lanes that are <= 0 take the scaled value, the rest keep the input
        __m128i _is_le_zero = __lsx_vfcmp_cle_s(_v, _zero);
        __m128 _scaled = __lsx_vfmul_s(_v, _slope);
        return (__m128)__lsx_vbitsel_v((__m128i)_v, (__m128i)_scaled, (__m128i)_is_le_zero);
    }
    case 3:
    {
        const __m128 _lower = (__m128)__lsx_vreplfr2vr_s(activation_params[0]);
        const __m128 _upper = (__m128)__lsx_vreplfr2vr_s(activation_params[1]);
        __m128 _clamped = __lsx_vfmax_s(_v, _lower);
        return __lsx_vfmin_s(_clamped, _upper);
    }
    case 4:
        return sigmoid_ps(_v);
    case 5:
    {
        __m128 _exp_plus_one = __lsx_vfadd_s(exp_ps(_v), (__m128)__lsx_vreplfr2vr_s(1.f));
        return __lsx_vfmul_s(_v, tanh_ps(log_ps(_exp_plus_one)));
    }
    case 6:
    {
        const __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(activation_params[0]);
        const __m128 _beta = (__m128)__lsx_vreplfr2vr_s(activation_params[1]);
        const __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
        const __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
        // gate = clamp(alpha * v + beta, 0, 1)
        __m128 _gate = __lsx_vfmadd_s(_alpha, _v, _beta);
        _gate = __lsx_vfmax_s(_gate, _zero);
        _gate = __lsx_vfmin_s(_gate, _one);
        return __lsx_vfmul_s(_gate, _v);
    }
    default:
        break;
    }

    return _v;
}
#endif // __loongarch_sx

#endif // LOONGARCH_ACTIVATION_H


================================================
FILE: src/layer/loongarch/loongarch_usability.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LOONGARCH_USABILITY_H
#define LOONGARCH_USABILITY_H

#if __loongarch_sx
#include <lsxintrin.h>
#if __loongarch_asx
#include <lasxintrin.h>
#endif // __loongarch_asx
#endif // __loongarch_sx

#include <stdint.h>

namespace ncnn {

// Overlays a 32-bit integer and a float on the same storage, so that a float
// value can be handed to integer-typed broadcast intrinsics as its raw bit
// pattern (write .f, read .i).
typedef union
{
    int32_t i; // raw 32-bit pattern
    float f;   // the same 32 bits viewed as a float
} FloatInt;

} // namespace ncnn

#if __loongarch_sx
/* declare some loongarch constants with union */
// _LOONGARCH_FLOAT_CONST(Name, Val) defines a file-local constant
// `static const ncnn::FloatInt Name` whose float member is Val; users read
// Name.i to obtain the bit pattern.
#define _LOONGARCH_FLOAT_CONST(Name, Val) \
    static const ncnn::FloatInt Name = {.f = Val}
#endif

#if __loongarch_asx
/* declare some loongarch constants with union */
// Same as _LOONGARCH_FLOAT_CONST, but the generated identifier is prefixed
// with `_ps256_` (e.g. Name = c_1 yields _ps256_c_1).
#define _LOONGARCH_FLOAT_CONST_PS256(Name, Val) \
    static const ncnn::FloatInt _ps256_##Name = {.f = Val}
#endif

#if __loongarch_sx
/* float type data load instructions */
static NCNN_FORCEINLINE __m128 __lsx_vreplfr2vr_s(float val)
{
    // broadcast the bit pattern of val to all four 32-bit lanes
    ncnn::FloatInt bits;
    bits.f = val;
    return (__m128)__lsx_vreplgr2vr_w(bits.i);
}

static NCNN_FORCEINLINE float __lsx_reduce_fadd_s(__m128 _v)
{
    // TODO find a more efficient way
    // horizontal sum of the four float lanes, accumulated from lane 0 upwards
    const float* lanes = (const float*)&_v;
    float sum = lanes[0];
    for (int i = 1; i < 4; i++)
    {
        sum = sum + lanes[i];
    }
    return sum;
}

static NCNN_FORCEINLINE int __lsx_reduce_add_w(__m128i _v)
{
    // TODO find a more efficient way
    // horizontal sum of the four 32-bit integer lanes, accumulated from lane 0 upwards
    const int* lanes = (const int*)&_v;
    int sum = lanes[0];
    for (int i = 1; i < 4; i++)
    {
        sum = sum + lanes[i];
    }
    return sum;
}

#endif // __loongarch_sx

#if __loongarch_asx
/* float type data load instructions */
static NCNN_FORCEINLINE __m256 __lasx_xvreplfr2vr_s(float val)
{
    // broadcast the bit pattern of val to all eight 32-bit lanes
    ncnn::FloatInt bits;
    bits.f = val;
    return (__m256)__lasx_xvreplgr2vr_w(bits.i);
}

static NCNN_FORCEINLINE float __lasx_reduce_fadd_s(__m256 _v)
{
    // TODO find a more efficient way
    // horizontal sum of the eight float lanes, accumulated from lane 0 upwards
    const float* lanes = (const float*)&_v;
    float sum = lanes[0];
    for (int i = 1; i < 8; i++)
    {
        sum = sum + lanes[i];
    }
    return sum;
}

static NCNN_FORCEINLINE int __lasx_reduce_add_w(__m256i _v)
{
    // TODO find a more efficient way
    // horizontal sum of the eight 32-bit integer lanes, accumulated from lane 0 upwards
    const int* lanes = (const int*)&_v;
    int sum = lanes[0];
    for (int i = 1; i < 8; i++)
    {
        sum = sum + lanes[i];
    }
    return sum;
}
#endif // __loongarch_asx

static NCNN_FORCEINLINE signed char float2int8(float v)
{
    // round to an integer, then clamp to the symmetric range [-127, 127]
    const int rounded = round(v);
    const int clamped = rounded > 127 ? 127 : (rounded < -127 ? -127 : rounded);
    return (signed char)clamped;
}

#if __loongarch_sx
static NCNN_FORCEINLINE __m128i round(__m128 _v)
{
    // add 0.5 carrying the sign of each lane, then convert with truncation:
    // this simulates round-to-nearest via +/-0.5
    __m128i _sign = __lsx_vand_v((__m128i)_v, __lsx_vreplgr2vr_w(1 << 31));
    __m128i _half = (__m128i)__lsx_vreplfr2vr_s(0.5f);
    __m128 _signed_half = (__m128)__lsx_vor_v(_half, _sign);
    __m128 _biased = __lsx_vfadd_s(_v, _signed_half);

    return __lsx_vftintrz_w_s(_biased);
}

static NCNN_FORCEINLINE __m128i float2int8(__m128 _v)
{
    // rounded 32-bit integers, saturated so that each fits in 16 bits
    __m128i _q32 = __lsx_vsat_w(round(_v), 15);

    // keep the even halfwords: one 16-bit value per input lane
    __m128i _q16 = __lsx_vpickev_h(_q32, _q32);

    // lower bound of -127, then saturate to 8 bits
    _q16 = __lsx_vmax_h(_q16, __lsx_vreplgr2vr_h(-127));
    _q16 = __lsx_vsat_h(_q16, 7);

    // keep the even bytes: one 8-bit value per 16-bit element
    return __lsx_vpickev_b(_q16, _q16);
}

static NCNN_FORCEINLINE int64_t float2int8(__m128 _vlow, __m128 _vhigh)
{
    // simulate round to nearest via +/-0.5 (done by round()), then saturate to 16 bits
    __m128i _q32_low = __lsx_vsat_w(round(_vlow), 15);
    __m128i _q32_high = __lsx_vsat_w(round(_vhigh), 15);

    // even halfwords of both inputs side by side: _vlow values first, _vhigh values after
    __m128i _q16 = __lsx_vpickev_h(_q32_high, _q32_low);

    // lower bound of -127, then saturate to 8 bits
    _q16 = __lsx_vmax_h(_q16, __lsx_vreplgr2vr_h(-127));
    _q16 = __lsx_vsat_h(_q16, 7);

    // the eight bytes of interest are in the low 64-bit element
    __m128i _q8 = __lsx_vpickev_b(_q16, _q16);
    return _q8[0];
}

static NCNN_FORCEINLINE __m128i float2int8relu(__m128 _v)
{
    // rounded 32-bit integers, saturated so that each fits in 16 bits
    __m128i _q32 = __lsx_vsat_w(round(_v), 15);

    // keep the even halfwords: one 16-bit value per input lane
    __m128i _q16 = __lsx_vpickev_h(_q32, _q32);

    // relu on the integers (lower bound 0), then saturate to 8 bits
    _q16 = __lsx_vmaxi_h(_q16, 0);
    _q16 = __lsx_vsat_h(_q16, 7);

    return __lsx_vpickev_b(_q16, _q16);
}

static NCNN_FORCEINLINE int64_t float2int8relu(__m128 _vlow, __m128 _vhigh)
{
    // simulate round to nearest via +/-0.5 (done by round()), then saturate to 16 bits
    __m128i _q32_low = __lsx_vsat_w(round(_vlow), 15);
    __m128i _q32_high = __lsx_vsat_w(round(_vhigh), 15);

    // even halfwords of both inputs side by side: _vlow values first, _vhigh values after
    __m128i _q16 = __lsx_vpickev_h(_q32_high, _q32_low);

    // relu on the integers (lower bound 0), then saturate to 8 bits
    _q16 = __lsx_vmaxi_h(_q16, 0);
    _q16 = __lsx_vsat_h(_q16, 7);

    // the eight bytes of interest are in the low 64-bit element
    __m128i _q8 = __lsx_vpickev_b(_q16, _q16);
    return _q8[0];
}

static NCNN_FORCEINLINE __m128i float2int8leakyrelu(__m128 _v, __m128 _slope)
{
    // quantize both the plain value and the slope-scaled value;
    // round() simulates round to nearest via +/-0.5
    __m128 _scaled = __lsx_vfmul_s(_v, _slope);
    __m128i _q32 = __lsx_vsat_w(round(_v), 15);
    __m128i _q32_scaled = __lsx_vsat_w(round(_scaled), 15);

    // narrow both to one 16-bit value per input lane
    __m128i _q16 = __lsx_vpickev_h(_q32, _q32);
    __m128i _q16_scaled = __lsx_vpickev_h(_q32_scaled, _q32_scaled);

    // the activation output is the larger of the two, saturated to 8 bits
    _q16 = __lsx_vmax_h(_q16, _q16_scaled);
    _q16 = __lsx_vsat_h(_q16, 7);

    return __lsx_vpickev_b(_q16, _q16);
}

static NCNN_FORCEINLINE int64_t float2int8leakyrelu(__m128 _vlow, __m128 _vhigh, __m128 _slope)
{
    __m128 _low_scaled = __lsx_vfmul_s(_vlow, _slope);
    __m128 _high_scaled = __lsx_vfmul_s(_vhigh, _slope);

    // plain values: round (simulated round to nearest via +/-0.5), saturate to 16 bits, pack
    __m128i _q32_low = __lsx_vsat_w(round(_vlow), 15);
    __m128i _q32_high = __lsx_vsat_w(round(_vhigh), 15);
    __m128i _q16 = __lsx_vpickev_h(_q32_high, _q32_low);

    // slope-scaled values: same treatment
    __m128i _q32_low_scaled = __lsx_vsat_w(round(_low_scaled), 15);
    __m128i _q32_high_scaled = __lsx_vsat_w(round(_high_scaled), 15);
    __m128i _q16_scaled = __lsx_vpickev_h(_q32_high_scaled, _q32_low_scaled);

    // the activation output is the larger of the two, saturated to 8 bits
    _q16 = __lsx_vmax_h(_q16, _q16_scaled);
    _q16 = __lsx_vsat_h(_q16, 7);

    // the eight bytes of interest are in the low 64-bit element
    __m128i _q8 = __lsx_vpickev_b(_q16, _q16);
    return _q8[0];
}
#endif // __loongarch_sx

#if __loongarch_asx
static NCNN_FORCEINLINE __m256i round(__m256 _v)
{
    // add 0.5 carrying the sign of each lane, then convert with truncation:
    // this simulates round-to-nearest via +/-0.5
    __m256i _sign = __lasx_xvand_v((__m256i)_v, __lasx_xvreplgr2vr_w(1 << 31));
    __m256i _half = (__m256i)__lasx_xvreplfr2vr_s(0.5f);
    __m256 _signed_half = (__m256)__lasx_xvor_v(_half, _sign);
    __m256 _biased = __lasx_xvfadd_s(_v, _signed_half);

    return __lasx_xvftintrz_w_s(_biased);
}

static NCNN_FORCEINLINE __m256i float2int8(__m256 _v)
{
    // rounded 32-bit integers, saturated so that each fits in 16 bits
    __m256i _q32 = __lasx_xvsat_w(round(_v), 15);

    // keep the even halfwords
    // NOTE(review): LASX pick operations presumably act per 128-bit half; confirm the
    // element order callers expect in the returned vector.
    __m256i _q16 = __lasx_xvpickev_h(_q32, _q32);

    // lower bound of -127, then saturate to 8 bits
    _q16 = __lasx_xvmax_h(_q16, __lasx_xvreplgr2vr_h(-127));
    _q16 = __lasx_xvsat_h(_q16, 7);

    return __lasx_xvpickev_b(_q16, _q16);
}

static NCNN_FORCEINLINE int64_t float2int8(__m256 _vlow, __m256 _vhigh)
{
    // simulate round to nearest via +/-0.5 (done by round()), then saturate to 16 bits
    __m256i _q32_low = __lasx_xvsat_w(round(_vlow), 15);
    __m256i _q32_high = __lasx_xvsat_w(round(_vhigh), 15);

    // even halfwords of both inputs
    __m256i _q16 = __lasx_xvpickev_h(_q32_high, _q32_low);

    // lower bound of -127, then saturate to 8 bits
    _q16 = __lasx_xvmax_h(_q16, __lasx_xvreplgr2vr_h(-127));
    _q16 = __lasx_xvsat_h(_q16, 7);

    // NOTE(review): only the lowest 64-bit element (8 bytes) is returned although 16
    // floats were passed in; confirm which input lanes callers expect to receive.
    __m256i _q8 = __lasx_xvpickev_b(_q16, _q16);
    return _q8[0];
}

static NCNN_FORCEINLINE __m256i float2int8relu(__m256 _v)
{
    // rounded 32-bit integers, saturated so that each fits in 16 bits
    __m256i _q32 = __lasx_xvsat_w(round(_v), 15);

    // keep the even halfwords
    __m256i _q16 = __lasx_xvpickev_h(_q32, _q32);

    // relu on the integers (lower bound 0), then saturate to 8 bits
    _q16 = __lasx_xvmaxi_h(_q16, 0);
    _q16 = __lasx_xvsat_h(_q16, 7);

    return __lasx_xvpickev_b(_q16, _q16);
}

static NCNN_FORCEINLINE int64_t float2int8relu(__m256 _vlow, __m256 _vhigh)
{
    // simulate round to nearest via +/-0.5 (done by round()), then saturate to 16 bits
    __m256i _q32_low = __lasx_xvsat_w(round(_vlow), 15);
    __m256i _q32_high = __lasx_xvsat_w(round(_vhigh), 15);

    // even halfwords of both inputs
    __m256i _q16 = __lasx_xvpickev_h(_q32_high, _q32_low);

    // relu on the integers (lower bound 0), then saturate to 8 bits
    _q16 = __lasx_xvmaxi_h(_q16, 0);
    _q16 = __lasx_xvsat_h(_q16, 7);

    // NOTE(review): only the lowest 64-bit element (8 bytes) is returned although 16
    // floats were passed in; confirm which input lanes callers expect to receive.
    __m256i _q8 = __lasx_xvpickev_b(_q16, _q16);
    return _q8[0];
}

static NCNN_FORCEINLINE __m256i float2int8leakyrelu(__m256 _v, __m256 _slope)
{
    // quantize both the plain value and the slope-scaled value;
    // round() simulates round to nearest via +/-0.5
    __m256 _scaled = __lasx_xvfmul_s(_v, _slope);
    __m256i _q32 = __lasx_xvsat_w(round(_v), 15);
    __m256i _q32_scaled = __lasx_xvsat_w(round(_scaled), 15);

    // narrow both to 16-bit values
    __m256i _q16 = __lasx_xvpickev_h(_q32, _q32);
    __m256i _q16_scaled = __lasx_xvpickev_h(_q32_scaled, _q32_scaled);

    // the activation output is the larger of the two, saturated to 8 bits
    _q16 = __lasx_xvmax_h(_q16, _q16_scaled);
    _q16 = __lasx_xvsat_h(_q16, 7);

    return __lasx_xvpickev_b(_q16, _q16);
}

static NCNN_FORCEINLINE int64_t float2int8leakyrelu(__m256 _vlow, __m256 _vhigh, __m256 _slope)
{
    __m256 _low_scaled = __lasx_xvfmul_s(_vlow, _slope);
    __m256 _high_scaled = __lasx_xvfmul_s(_vhigh, _slope);

    // plain values: round (simulated round to nearest via +/-0.5), saturate to 16 bits, pack
    __m256i _q32_low = __lasx_xvsat_w(round(_vlow), 15);
    __m256i _q32_high = __lasx_xvsat_w(round(_vhigh), 15);
    __m256i _q16 = __lasx_xvpickev_h(_q32_high, _q32_low);

    // slope-scaled values: same treatment
    __m256i _q32_low_scaled = __lasx_xvsat_w(round(_low_scaled), 15);
    __m256i _q32_high_scaled = __lasx_xvsat_w(round(_high_scaled), 15);
    __m256i _q16_scaled = __lasx_xvpickev_h(_q32_high_scaled, _q32_low_scaled);

    // the activation output is the larger of the two, saturated to 8 bits
    _q16 = __lasx_xvmax_h(_q16, _q16_scaled);
    _q16 = __lasx_xvsat_h(_q16, 7);

    // NOTE(review): only the lowest 64-bit element (8 bytes) is returned although 16
    // floats were passed in; confirm which input lanes callers expect to receive.
    __m256i _q8 = __lasx_xvpickev_b(_q16, _q16);
    return _q8[0];
}
#endif // __loongarch_asx

#endif // LOONGARCH_USABILITY_H


================================================
FILE: src/layer/loongarch/lsx_mathfun.h
================================================
/* LOONGARCH implementation of mathfun
 *
 *   Inspired by Intel Approximate Math library, and based on the
 *   corresponding algorithms of the cephes math library
 *   Copyright (C) 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>. All rights reserved.
 */

/*
 *  This software is provided 'as-is', without any express or implied
 *  warranty.  In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  1. The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgment in the product documentation would be
 *     appreciated but is not required.
 *  2. Altered source versions must be plainly marked as such, and must not be
 *     misrepresented as being the original software.
 *  3. This notice may not be removed or altered from any source distribution.
 *
 *  (this is the zlib license)
 */

#ifndef LSX_MATHFUN_H
#define LSX_MATHFUN_H

#include "loongarch_usability.h"

#include <lsxintrin.h>

// Generic scalar constants. Each is an ncnn::FloatInt; the routines below
// broadcast the bit pattern with __lsx_vreplgr2vr_w(Name.i).
_LOONGARCH_FLOAT_CONST(c_0, 0.0f);
_LOONGARCH_FLOAT_CONST(c_1, 1.0f);
_LOONGARCH_FLOAT_CONST(c_2, 2.0f);
_LOONGARCH_FLOAT_CONST(c_3, 3.0f);
_LOONGARCH_FLOAT_CONST(c_4, 4.0f);
_LOONGARCH_FLOAT_CONST(c_n1, -1.0f);
_LOONGARCH_FLOAT_CONST(c_n3, -3.0f);
_LOONGARCH_FLOAT_CONST(c_0p5, 0.5f);
_LOONGARCH_FLOAT_CONST(c_eps, 1E-8f);

// Bit mask with the exponent field (bits 23..30) of a single-precision float
// cleared; log_ps() uses it to keep only sign and mantissa bits.
#define c_inv_mant_mask ~0x7f800000u
// Constants for log_ps(): the threshold compared against the normalized
// mantissa, the polynomial coefficients p0..p8 (evaluated in Horner form,
// p0 first), and q1/q2, which are each multiplied by the exponent e and
// added to the result.
_LOONGARCH_FLOAT_CONST(c_cephes_SQRTHF, 0.707106781186547524);
_LOONGARCH_FLOAT_CONST(c_cephes_log_p0, 7.0376836292E-2);
_LOONGARCH_FLOAT_CONST(c_cephes_log_p1, -1.1514610310E-1);
_LOONGARCH_FLOAT_CONST(c_cephes_log_p2, 1.1676998740E-1);
_LOONGARCH_FLOAT_CONST(c_cephes_log_p3, -1.2420140846E-1);
_LOONGARCH_FLOAT_CONST(c_cephes_log_p4, +1.4249322787E-1);
_LOONGARCH_FLOAT_CONST(c_cephes_log_p5, -1.6668057665E-1);
_LOONGARCH_FLOAT_CONST(c_cephes_log_p6, +2.0000714765E-1);
_LOONGARCH_FLOAT_CONST(c_cephes_log_p7, -2.4999993993E-1);
_LOONGARCH_FLOAT_CONST(c_cephes_log_p8, +3.3333331174E-1);
_LOONGARCH_FLOAT_CONST(c_cephes_log_q1, -2.12194440e-4);
_LOONGARCH_FLOAT_CONST(c_cephes_log_q2, 0.693359375);

/* natural logarithm computed for 4 simultaneous float
 *   return NaN for x <= 0
 */
static inline __m128 log_ps(__m128 x)
{
    const __m128 v_one = (__m128)__lsx_vreplgr2vr_w(c_1.i);

    // Raise negative lanes to 0, then remember every lane that is <= 0 so it
    // can be overwritten with an all-ones pattern at the end.
    // NOTE(review): a max with 0 does not alter positive denormal inputs.
    x = __lsx_vfmax_s(x, (__m128)__lsx_vreplgr2vr_w(0));
    __m128i nonpositive = __lsx_vfcmp_cle_s(x, (__m128)__lsx_vreplgr2vr_w(0));

    __m128i bits = (__m128i)(x);

    // biased exponent field, moved down to bit 0
    __m128i expo = __lsx_vsrl_w(bits, __lsx_vreplgr2vr_w(23));

    /* keep only the fractional part, re-tagged with the exponent bits of 0.5 */
    bits = __lsx_vand_v(bits, __lsx_vreplgr2vr_w(c_inv_mant_mask));
    bits = __lsx_vor_v(bits, __lsx_vreplgr2vr_w(c_0p5.i));
    x = (__m128)(bits);

    // unbias the exponent, convert it to float and add one
    expo = __lsx_vsub_w(expo, __lsx_vreplgr2vr_w(0x7f));
    __m128 e = __lsx_vfadd_s(__lsx_vffint_s_w(expo), v_one);

    /* part2:
     *     if( x < SQRTHF ) {
     *       e -= 1;
     *       x = x + x - 1.0;
     *     } else { x = x - 1.0; }
     */
    __m128i below_sqrthf = __lsx_vfcmp_clt_s(x, (__m128)__lsx_vreplgr2vr_w(c_cephes_SQRTHF.i));
    __m128 x_if_below = (__m128)(__lsx_vand_v((__m128i)(x), below_sqrthf));
    __m128 one_if_below = (__m128)(__lsx_vand_v((__m128i)(v_one), below_sqrthf));
    x = __lsx_vfsub_s(x, v_one);
    e = __lsx_vfsub_s(e, one_if_below);
    x = __lsx_vfadd_s(x, x_if_below);

    __m128 x_sq = __lsx_vfmul_s(x, x);

    // polynomial p0..p8 in x (Horner form), then multiplied by x and by x*x
    __m128 poly = (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p0.i);
    poly = __lsx_vfmadd_s(x, poly, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p1.i));
    poly = __lsx_vfmadd_s(x, poly, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p2.i));
    poly = __lsx_vfmadd_s(x, poly, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p3.i));
    poly = __lsx_vfmadd_s(x, poly, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p4.i));
    poly = __lsx_vfmadd_s(x, poly, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p5.i));
    poly = __lsx_vfmadd_s(x, poly, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p6.i));
    poly = __lsx_vfmadd_s(x, poly, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p7.i));
    poly = __lsx_vfmadd_s(x, poly, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p8.i));
    poly = __lsx_vfmul_s(poly, x);
    poly = __lsx_vfmul_s(poly, x_sq);

    // poly += e * q1
    __m128 e_q1 = __lsx_vfmul_s(e, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_q1.i));
    poly = __lsx_vfadd_s(poly, e_q1);

    // poly -= 0.5 * x^2
    __m128 half_x_sq = __lsx_vfmul_s(x_sq, (__m128)__lsx_vreplgr2vr_w(c_0p5.i));
    poly = __lsx_vfsub_s(poly, half_x_sq);

    // result = x + poly + e * q2
    __m128 e_q2 = __lsx_vfmul_s(e, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_q2.i));
    __m128 result = __lsx_vfadd_s(x, poly);
    result = __lsx_vfadd_s(result, e_q2);

    // lanes flagged above become all-ones, which reads back as NaN
    return (__m128)(__lsx_vor_v((__m128i)(result), nonpositive));
}

// Clamp range applied to the argument of exp_ps().
_LOONGARCH_FLOAT_CONST(c_exp_hi, 88.3762626647949f);
_LOONGARCH_FLOAT_CONST(c_exp_lo, -88.3762626647949f);

// Range-reduction constants for exp_ps(): x is multiplied by LOG2EF to get n,
// then n * C1 and n * C2 are subtracted from x in two steps.
_LOONGARCH_FLOAT_CONST(c_cephes_LOG2EF, 1.44269504088896341);
_LOONGARCH_FLOAT_CONST(c_cephes_exp_C1, 0.693359375);
_LOONGARCH_FLOAT_CONST(c_cephes_exp_C2, -2.12194440e-4);

// Polynomial coefficients used by exp_ps() (Horner form, p0 first).
_LOONGARCH_FLOAT_CONST(c_cephes_exp_p0, 1.9875691500E-4);
_LOONGARCH_FLOAT_CONST(c_cephes_exp_p1, 1.3981999507E-3);
_LOONGARCH_FLOAT_CONST(c_cephes_exp_p2, 8.3334519073E-3);
_LOONGARCH_FLOAT_CONST(c_cephes_exp_p3, 4.1665795894E-2);
_LOONGARCH_FLOAT_CONST(c_cephes_exp_p4, 1.6666665459E-1);
_LOONGARCH_FLOAT_CONST(c_cephes_exp_p5, 5.0000001201E-1);

/* exp() computed for 4 float at once */
static inline __m128 exp_ps(__m128 x)
{
    const __m128 v_one = (__m128)__lsx_vreplgr2vr_w(c_1.i);

    // keep the argument inside [c_exp_lo, c_exp_hi]
    x = __lsx_vfmin_s(x, (__m128)__lsx_vreplgr2vr_w(c_exp_hi.i));
    x = __lsx_vfmax_s(x, (__m128)__lsx_vreplgr2vr_w(c_exp_lo.i));

    /* express exp(x) as exp(g + n*log(2)) */
    __m128 n = __lsx_vfmul_s(x, (__m128)__lsx_vreplgr2vr_w(c_cephes_LOG2EF.i));
    n = __lsx_vfadd_s(n, (__m128)__lsx_vreplgr2vr_w(c_0p5.i));

    /* perform a floorf: convert to integer and back ... */
    __m128 n_converted = __lsx_vffint_s_w(__lsx_vftint_w_s(n));

    /* ... and if the converted value is greater, subtract 1 */
    __m128i overshoot = __lsx_vfcmp_clt_s(n, n_converted);
    overshoot = __lsx_vand_v(overshoot, (__m128i)v_one);
    n = __lsx_vfsub_s(n_converted, (__m128)overshoot);

    // g = x - n*C1 - n*C2, subtracted in two steps
    __m128 n_c1 = __lsx_vfmul_s(n, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_C1.i));
    __m128 n_c2 = __lsx_vfmul_s(n, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_C2.i));
    x = __lsx_vfsub_s(x, n_c1);
    x = __lsx_vfsub_s(x, n_c2);

    __m128 g_sq = __lsx_vfmul_s(x, x);

    // polynomial p0..p5 in g (Horner form); exp(g) ~= poly * g^2 + g + 1
    __m128 poly = (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p0.i);
    poly = __lsx_vfmadd_s(x, poly, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p1.i));
    poly = __lsx_vfmadd_s(x, poly, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p2.i));
    poly = __lsx_vfmadd_s(x, poly, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p3.i));
    poly = __lsx_vfmadd_s(x, poly, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p4.i));
    poly = __lsx_vfmadd_s(x, poly, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p5.i));

    poly = __lsx_vfmul_s(poly, g_sq);
    poly = __lsx_vfadd_s(poly, x);
    poly = __lsx_vfadd_s(poly, v_one);

    /* build 2^n: (n + 0x7f) shifted into the exponent field */
    __m128i pow2n = __lsx_vftintrz_w_s(n);
    pow2n = __lsx_vadd_w(pow2n, __lsx_vreplgr2vr_w(0x7f));
    pow2n = __lsx_vsll_w(pow2n, __lsx_vreplgr2vr_w(23));

    return __lsx_vfmul_s(poly, (__m128)pow2n);
}

// Constants for tanh_ps(), which evaluates an odd numerator polynomial
// divided by an even denominator polynomial on |x|.
// c_tanh_tiny: inputs with |x| below this are returned unchanged.
// c_tanh_hi:   |x| is clamped to this value before the polynomials are evaluated.
_LOONGARCH_FLOAT_CONST(c_tanh_tiny, 1e-4f);
_LOONGARCH_FLOAT_CONST(c_tanh_hi, 9.0f);
// The monomial coefficients of the numerator polynomial (odd).
_LOONGARCH_FLOAT_CONST(c_tanh_alpha_1, 4.89352455891786e-3f);
_LOONGARCH_FLOAT_CONST(c_tanh_alpha_3, 6.37261928875436e-4f);
_LOONGARCH_FLOAT_CONST(c_tanh_alpha_5, 1.48572235717979e-5f);
_LOONGARCH_FLOAT_CONST(c_tanh_alpha_7, 5.12229709037114e-8f);
_LOONGARCH_FLOAT_CONST(c_tanh_alpha_9, -8.60467152213735e-11f);
_LOONGARCH_FLOAT_CONST(c_tanh_alpha_11, 2.00018790482477e-13f);
_LOONGARCH_FLOAT_CONST(c_tanh_alpha_13, -2.76076847742355e-16f);
// The monomial coefficients of the denominator polynomial (even).
_LOONGARCH_FLOAT_CONST(c_tanh_beta_0, 4.89352518554385e-3f);
_LOONGARCH_FLOAT_CONST(c_tanh_beta_2, 2.26843463243900e-3f);
_LOONGARCH_FLOAT_CONST(c_tanh_beta_4, 1.18534705686654e-4f);
_LOONGARCH_FLOAT_CONST(c_tanh_beta_6, 1.19825839466702e-6f);

/* tanh() evaluated for 4 packed floats at once, as a ratio of two polynomials in |x| */
static inline __m128 tanh_ps(__m128 x)
{
    __m128 clamp_hi = (__m128)__lsx_vreplgr2vr_w(c_tanh_hi.i);

    // Set the sign aside; the approximation itself runs on the magnitude.
    __m128i sign_bits = __lsx_vand_v((__m128i)x, __lsx_vreplgr2vr_w(1 << 31));
    __m128 mag = (__m128)__lsx_vbitclri_w((__m128i)x, 31);

    // Lanes whose magnitude is below c_tanh_tiny get the raw input back at the end.
    __m128i is_tiny = __lsx_vfcmp_clt_s(mag, (__m128)__lsx_vreplgr2vr_w(c_tanh_tiny.i));

    // Clamp the magnitude to c_tanh_hi: beyond that bound the result is
    // -/+1.0f in single precision anyway.
    __m128i exceeds_hi = __lsx_vfcmp_clt_s(clamp_hi, mag);
    mag = (__m128)__lsx_vbitsel_v((__m128i)mag, (__m128i)clamp_hi, exceeds_hi);

    // Both polynomials are evaluated in mag^2 (one is odd, the other even).
    __m128 mag_sq = __lsx_vfmul_s(mag, mag);

    // Numerator: Horner scheme over the alpha coefficients, then one extra
    // multiply by mag to make it odd.
    __m128 numer = (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_13.i);
    numer = __lsx_vfmadd_s(mag_sq, numer, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_11.i));
    numer = __lsx_vfmadd_s(mag_sq, numer, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_9.i));
    numer = __lsx_vfmadd_s(mag_sq, numer, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_7.i));
    numer = __lsx_vfmadd_s(mag_sq, numer, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_5.i));
    numer = __lsx_vfmadd_s(mag_sq, numer, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_3.i));
    numer = __lsx_vfmadd_s(mag_sq, numer, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_1.i));
    numer = __lsx_vfmul_s(numer, mag);

    // Denominator: Horner scheme over the beta coefficients.
    __m128 denom = (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_6.i);
    denom = __lsx_vfmadd_s(mag_sq, denom, (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_4.i));
    denom = __lsx_vfmadd_s(mag_sq, denom, (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_2.i));
    denom = __lsx_vfmadd_s(mag_sq, denom, (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_0.i));

    __m128 result = __lsx_vfdiv_s(numer, denom);

    // Put the original sign back onto the (non-negative) quotient.
    result = (__m128)__lsx_vor_v((__m128i)result, sign_bits);

    // For very small magnitudes returning the argument itself is more accurate.
    return (__m128)__lsx_vbitsel_v((__m128i)result, (__m128i)x, (__m128i)is_tiny);
}

// Constants shared by sin_ps, cos_ps and sincos_ps.
// c_minus_cephes_DP1..DP3: each is multiplied by the rounded multiple and
// accumulated onto |x| in three successive steps (range reduction).
_LOONGARCH_FLOAT_CONST(c_minus_cephes_DP1, -0.78515625f);
_LOONGARCH_FLOAT_CONST(c_minus_cephes_DP2, -2.4187564849853515625e-4f);
_LOONGARCH_FLOAT_CONST(c_minus_cephes_DP3, -3.77489497744594108e-8f);
// Coefficients of the sine polynomial evaluated on the reduced argument.
_LOONGARCH_FLOAT_CONST(c_cephes_sin_p0, -1.9515295891E-4f);
_LOONGARCH_FLOAT_CONST(c_cephes_sin_p1, 8.3321608736E-3f);
_LOONGARCH_FLOAT_CONST(c_cephes_sin_p2, -1.6666654611E-1f);
// Coefficients of the cosine polynomial evaluated on the reduced argument.
_LOONGARCH_FLOAT_CONST(c_cephes_cos_p0, 2.443315711809948E-005f);
_LOONGARCH_FLOAT_CONST(c_cephes_cos_p1, -1.388731625493765E-003f);
_LOONGARCH_FLOAT_CONST(c_cephes_cos_p2, 4.166664568298827E-002f);
_LOONGARCH_FLOAT_CONST(c_cephes_FOPI, 1.27323954473516f); // 4/PI

/* sin() evaluated for 4 packed floats at once */
static inline __m128 sin_ps(__m128 x)
{
    // c_n1 * c_0p5: the coefficient of z in the cosine polynomial
    // (both constants are defined outside this chunk).
    __m128 z_coeff = __lsx_vfmul_s((__m128)__lsx_vreplgr2vr_w(c_n1.i), (__m128)__lsx_vreplgr2vr_w(c_0p5.i));

    // Remember the sign of the input, then continue with |x|.
    __m128i result_sign = __lsx_vand_v((__m128i)x, __lsx_vreplgr2vr_w(0x80000000));
    x = (__m128)__lsx_vand_v((__m128i)x, __lsx_vreplgr2vr_w(0x7fffffff));

    // j = (trunc(|x| * 4/pi) + 1) & ~1, kept both as integer and as float.
    __m128i j = __lsx_vftintrz_w_s(__lsx_vfmul_s(x, (__m128)__lsx_vreplgr2vr_w(c_cephes_FOPI.i)));
    j = __lsx_vadd_w(j, __lsx_vreplgr2vr_w(1));
    j = __lsx_vand_v(j, __lsx_vreplgr2vr_w(~1));
    __m128 j_float = __lsx_vffint_s_w(j);

    // Bit 2 of j, moved to the float sign position, flips the result sign.
    __m128i sign_flip = __lsx_vslli_w(__lsx_vand_v(j, __lsx_vreplgr2vr_w(4)), 29);
    result_sign = __lsx_vxor_v(result_sign, sign_flip);

    // All-ones in lanes where bit 1 of j is clear; those lanes take the sine polynomial.
    __m128i take_sin_poly = __lsx_vseq_w(__lsx_vand_v(j, __lsx_vreplgr2vr_w(2)), __lsx_vreplgr2vr_w(0));

    // Range reduction: x += j * DP1, then DP2, then DP3.
    x = __lsx_vfmadd_s(j_float, (__m128)__lsx_vreplgr2vr_w(c_minus_cephes_DP1.i), x);
    x = __lsx_vfmadd_s(j_float, (__m128)__lsx_vreplgr2vr_w(c_minus_cephes_DP2.i), x);
    x = __lsx_vfmadd_s(j_float, (__m128)__lsx_vreplgr2vr_w(c_minus_cephes_DP3.i), x);

    __m128 z = __lsx_vfmul_s(x, x);

    // Cosine polynomial: ((p0*z + p1)*z + p2)*z*z + z_coeff*z + 1.
    __m128 cos_poly = (__m128)__lsx_vreplgr2vr_w(c_cephes_cos_p0.i);
    cos_poly = __lsx_vfmadd_s(cos_poly, z, (__m128)__lsx_vreplgr2vr_w(c_cephes_cos_p1.i));
    cos_poly = __lsx_vfmadd_s(cos_poly, z, (__m128)__lsx_vreplgr2vr_w(c_cephes_cos_p2.i));
    cos_poly = __lsx_vfmul_s(cos_poly, z);
    cos_poly = __lsx_vfmul_s(cos_poly, z);
    cos_poly = __lsx_vfmadd_s(z, z_coeff, cos_poly);
    cos_poly = __lsx_vfadd_s(cos_poly, (__m128)__lsx_vreplgr2vr_w(c_1.i));

    // Sine polynomial: ((p0*z + p1)*z + p2)*z*x + x.
    __m128 sin_poly = (__m128)__lsx_vreplgr2vr_w(c_cephes_sin_p0.i);
    sin_poly = __lsx_vfmadd_s(sin_poly, z, (__m128)__lsx_vreplgr2vr_w(c_cephes_sin_p1.i));
    sin_poly = __lsx_vfmadd_s(sin_poly, z, (__m128)__lsx_vreplgr2vr_w(c_cephes_sin_p2.i));
    sin_poly = __lsx_vfmul_s(sin_poly, z);
    sin_poly = __lsx_vfmadd_s(sin_poly, x, x);

    // Per lane keep exactly one of the two polynomials (the other is zeroed),
    // add them together and apply the accumulated sign.
    __m128 sin_part = (__m128)__lsx_vand_v((__m128i)sin_poly, take_sin_poly);
    __m128 cos_part = (__m128)__lsx_vand_v(__lsx_vxor_v(take_sin_poly, __lsx_vreplgr2vr_w(0xffffffff)), (__m128i)cos_poly);
    __m128 merged = __lsx_vfadd_s(cos_part, sin_part);

    return (__m128)__lsx_vxor_v((__m128i)merged, result_sign);
}

/* cos() evaluated for 4 packed floats at once */
static inline __m128 cos_ps(__m128 x)
{
    // c_n1 * c_0p5: the coefficient of z in the cosine polynomial
    // (both constants are defined outside this chunk).
    __m128 z_coeff = __lsx_vfmul_s((__m128)__lsx_vreplgr2vr_w(c_n1.i), (__m128)__lsx_vreplgr2vr_w(c_0p5.i));

    // The input sign is simply discarded: only |x| is used.
    x = (__m128)__lsx_vand_v((__m128i)x, __lsx_vreplgr2vr_w(0x7fffffff));

    // j = (trunc(|x| * 4/pi) + 1) & ~1; the float copy is taken before j is shifted by 2.
    __m128i j = __lsx_vftintrz_w_s(__lsx_vfmul_s(x, (__m128)__lsx_vreplgr2vr_w(c_cephes_FOPI.i)));
    j = __lsx_vadd_w(j, __lsx_vreplgr2vr_w(1));
    j = __lsx_vand_v(j, __lsx_vreplgr2vr_w(~1));
    __m128 j_float = __lsx_vffint_s_w(j);
    j = __lsx_vsub_w(j, __lsx_vreplgr2vr_w(2));

    // Result sign: (~j & 4) moved to the float sign position.
    __m128i result_sign = __lsx_vslli_w(__lsx_vandn_v(j, __lsx_vreplgr2vr_w(4)), 29);

    // All-ones in lanes where bit 1 of the shifted j is clear; those lanes take the sine polynomial.
    __m128i take_sin_poly = __lsx_vseq_w(__lsx_vand_v(j, __lsx_vreplgr2vr_w(2)), __lsx_vreplgr2vr_w(0));

    // Range reduction: x += j * DP1, then DP2, then DP3.
    x = __lsx_vfmadd_s(j_float, (__m128)__lsx_vreplgr2vr_w(c_minus_cephes_DP1.i), x);
    x = __lsx_vfmadd_s(j_float, (__m128)__lsx_vreplgr2vr_w(c_minus_cephes_DP2.i), x);
    x = __lsx_vfmadd_s(j_float, (__m128)__lsx_vreplgr2vr_w(c_minus_cephes_DP3.i), x);

    __m128 z = __lsx_vfmul_s(x, x);

    // Cosine polynomial: ((p0*z + p1)*z + p2)*z*z + z_coeff*z + 1.
    __m128 cos_poly = (__m128)__lsx_vreplgr2vr_w(c_cephes_cos_p0.i);
    cos_poly = __lsx_vfmadd_s(cos_poly, z, (__m128)__lsx_vreplgr2vr_w(c_cephes_cos_p1.i));
    cos_poly = __lsx_vfmadd_s(cos_poly, z, (__m128)__lsx_vreplgr2vr_w(c_cephes_cos_p2.i));
    cos_poly = __lsx_vfmul_s(cos_poly, z);
    cos_poly = __lsx_vfmul_s(cos_poly, z);
    cos_poly = __lsx_vfmadd_s(z, z_coeff, cos_poly);
    cos_poly = __lsx_vfadd_s(cos_poly, (__m128)__lsx_vreplgr2vr_w(c_1.i));

    // Sine polynomial: ((p0*z + p1)*z + p2)*z*x + x.
    __m128 sin_poly = (__m128)__lsx_vreplgr2vr_w(c_cephes_sin_p0.i);
    sin_poly = __lsx_vfmadd_s(sin_poly, z, (__m128)__lsx_vreplgr2vr_w(c_cephes_sin_p1.i));
    sin_poly = __lsx_vfmadd_s(sin_poly, z, (__m128)__lsx_vreplgr2vr_w(c_cephes_sin_p2.i));
    sin_poly = __lsx_vfmul_s(sin_poly, z);
    sin_poly = __lsx_vfmadd_s(sin_poly, x, x);

    // Per lane keep exactly one of the two polynomials (the other is zeroed),
    // add them together and apply the sign.
    __m128 sin_part = (__m128)__lsx_vand_v((__m128i)sin_poly, take_sin_poly);
    __m128 cos_part = (__m128)__lsx_vandn_v(take_sin_poly, (__m128i)cos_poly);
    __m128 merged = __lsx_vfadd_s(cos_part, sin_part);

    return (__m128)__lsx_vxor_v((__m128i)merged, result_sign);
}

/* Computes sin() and cos() of 4 packed floats in one pass.
 * x: input lanes.
 * s: receives the four sine results.
 * c: receives the four cosine results.
 * Both polynomials are evaluated for every lane; poly_mask then decides which
 * one feeds the sine output and which one feeds the cosine output. */
static inline void sincos_ps(__m128 x, __m128* s, __m128* c)
{
    __m128 y;
    __m128i swap_sign_bit_cos, swap_sign_bit_sin, poly_mask, sign_bit_sin, sign_bit_cos;
    // c_n1 * c_0p5: coefficient of z in the cosine polynomial (constants defined outside this chunk).
    __m128 n0p5 = __lsx_vfmul_s((__m128)__lsx_vreplgr2vr_w(c_n1.i), (__m128)__lsx_vreplgr2vr_w(c_0p5.i));

    // Save the input sign for the sine result, then work on |x|.
    sign_bit_sin = __lsx_vand_v((__m128i)x, __lsx_vreplgr2vr_w(0x80000000));
    x = (__m128)__lsx_vand_v((__m128i)x, __lsx_vreplgr2vr_w(0x7fffffff));

    // Scale by 4/pi.
    y = __lsx_vfmul_s(x, (__m128)__lsx_vreplgr2vr_w(c_cephes_FOPI.i));

    // j = (trunc(y) + 1) & ~1, held in poly_mask for now; y becomes float(j).
    poly_mask = __lsx_vftintrz_w_s(y);
    poly_mask = __lsx_vadd_w(poly_mask, __lsx_vreplgr2vr_w(1));
    poly_mask = __lsx_vand_v(poly_mask, __lsx_vreplgr2vr_w(~1));
    y = __lsx_vffint_s_w(poly_mask);

    // Cosine sign: (~(j - 2) & 4) moved to the float sign position.
    swap_sign_bit_cos = __lsx_vsub_w(poly_mask, __lsx_vreplgr2vr_w(2));
    swap_sign_bit_cos = __lsx_vandn_v(swap_sign_bit_cos, __lsx_vreplgr2vr_w(4));
    swap_sign_bit_cos = __lsx_vslli_w(swap_sign_bit_cos, 29);

    // Sine sign flip: (j & 4) moved to the float sign position.
    swap_sign_bit_sin = __lsx_vand_v(poly_mask, __lsx_vreplgr2vr_w(4));
    swap_sign_bit_sin = __lsx_vslli_w(swap_sign_bit_sin, 29);

    // From here on poly_mask is a lane mask: all-ones where bit 1 of j is clear.
    poly_mask = __lsx_vand_v(poly_mask, __lsx_vreplgr2vr_w(2));
    poly_mask = __lsx_vseq_w(poly_mask, __lsx_vreplgr2vr_w(0));

    sign_bit_sin = __lsx_vxor_v(sign_bit_sin, swap_sign_bit_sin);
    sign_bit_cos = swap_sign_bit_cos;

    // Range reduction: x += j * DP1, then DP2, then DP3.
    x = __lsx_vfmadd_s(y, (__m128)__lsx_vreplgr2vr_w(c_minus_cephes_DP1.i), x);
    x = __lsx_vfmadd_s(y, (__m128)__lsx_vreplgr2vr_w(c_minus_cephes_DP2.i), x);
    x = __lsx_vfmadd_s(y, (__m128)__lsx_vreplgr2vr_w(c_minus_cephes_DP3.i), x);

    // Cosine polynomial in z = x*x; the result stays in y.
    __m128 z = __lsx_vfmul_s(x, x);
    y = (__m128)__lsx_vreplgr2vr_w(c_cephes_cos_p0.i);
    y = __lsx_vfmadd_s(y, z, (__m128)__lsx_vreplgr2vr_w(c_cephes_cos_p1.i));
    y = __lsx_vfmadd_s(y, z, (__m128)__lsx_vreplgr2vr_w(c_cephes_cos_p2.i));
    y = __lsx_vfmul_s(y, z);
    y = __lsx_vfmul_s(y, z);
    y = __lsx_vfmadd_s(z, n0p5, y);
    y = __lsx_vfadd_s(y, (__m128)__lsx_vreplgr2vr_w(c_1.i));

    // Sine polynomial; the result goes to y2.
    __m128 y2 = (__m128)__lsx_vreplgr2vr_w(c_cephes_sin_p0.i);
    y2 = __lsx_vfmadd_s(y2, z, (__m128)__lsx_vreplgr2vr_w(c_cephes_sin_p1.i));
    y2 = __lsx_vfmadd_s(y2, z, (__m128)__lsx_vreplgr2vr_w(c_cephes_sin_p2.i));
    y2 = __lsx_vfmul_s(y2, z);
    y2 = __lsx_vfmadd_s(y2, x, x);

    // ysin1 = cosine polynomial in lanes where poly_mask is clear,
    // ysin2 = sine polynomial in lanes where poly_mask is set.
    // Subtracting them from their sources leaves the complementary lanes for the cosine output.
    __m128 ysin1 = (__m128)__lsx_vandn_v(poly_mask, (__m128i)y);
    __m128 ysin2 = (__m128)__lsx_vand_v(poly_mask, (__m128i)y2);
    y2 = __lsx_vfsub_s(y2, ysin2);
    y = __lsx_vfsub_s(y, ysin1);

    ysin1 = __lsx_vfadd_s(ysin1, ysin2);
    y = __lsx_vfadd_s(y, y2);

    // Apply the per-lane signs and store both results.
    *s = (__m128)__lsx_vxor_v((__m128i)ysin1, sign_bit_sin);
    *c = (__m128)__lsx_vxor_v((__m128i)y, sign_bit_cos);
}

/* tan() computed for 4 floats at once as sin(x) / cos(x).
 * Lanes whose cosine is exactly zero have c_eps added to the denominator
 * before the division, so they do not divide by zero.
 * (c_eps and c_0 are defined outside this chunk.) */
static inline __m128 tan_ps(__m128 x)
{
    __m128 ysin, ycos;
    __m128 eps = (__m128)__lsx_vreplgr2vr_w(c_eps.i);
    __m128 zero = (__m128)__lsx_vreplgr2vr_w(c_0.i);
    sincos_ps(x, &ysin, &ycos);

    // All-ones in lanes where the cosine is zero. The comparison must be
    // against zero, not against eps: comparing with eps left true zeros
    // unguarded and perturbed cosines that happened to equal eps.
    __m128i mask = __lsx_vfcmp_ceq_s(ycos, zero);

    // Turn the lane mask into "eps where cos == 0, otherwise 0" and nudge the denominator.
    mask = __lsx_vand_v(mask, (__m128i)eps);
    ycos = __lsx_vfadd_s(ycos, (__m128)mask);

    __m128 ytan = __lsx_vfdiv_s(ysin, ycos);
    return ytan;
}

/* pow(a, b) for 4 packed floats, evaluated as exp(b * log(a)) */
static inline __m128 pow_ps(__m128 a, __m128 b)
{
    __m128 log_a = log_ps(a);
    __m128 exponent = __lsx_vfmul_s(b, log_a);
    return exp_ps(exponent);
}

/* sigmoid for 4 packed floats: 1 / (1 + exp(-v)) */
static inline __m128 sigmoid_ps(__m128 _v)
{
    __m128 _unit = __lsx_vreplfr2vr_s(1.f);

    // Flipping bit 31 negates every lane.
    __m128 _neg = (__m128)__lsx_vbitrevi_w((__m128i)_v, 31);

    __m128 _denom = __lsx_vfadd_s(exp_ps(_neg), _unit);
    return __lsx_vfdiv_s(_unit, _denom);
}

// Polynomial coefficients a0..a5 used by asin_ps and acos_ps.
_LOONGARCH_FLOAT_CONST(c_cephes_asin_a4, 0.023994016f);
_LOONGARCH_FLOAT_CONST(c_cephes_asin_a5, 0.042417344f);
_LOONGARCH_FLOAT_CONST(c_cephes_asin_a2, 0.07494697f);
_LOONGARCH_FLOAT_CONST(c_cephes_asin_a3, 0.045520633f);
_LOONGARCH_FLOAT_CONST(c_cephes_asin_a0, 1.0f);
_LOONGARCH_FLOAT_CONST(c_cephes_asin_a1, 0.166667819f);
// pi/2, pi and -pi; besides asin_ps/acos_ps these are also read by atan_ps and atan2_ps.
_LOONGARCH_FLOAT_CONST(c_cephes_asin_half_pi, 1.5707964f);
_LOONGARCH_FLOAT_CONST(c_cephes_asin_pi, 3.1415927f);
_LOONGARCH_FLOAT_CONST(c_cephes_asin_npi, -3.1415927f);

/* asin() evaluated for 4 packed floats at once.
 * Lanes with |x| <= 0.5 feed |x| straight into the polynomial; the other
 * lanes feed sqrt(0.5 * (1 - |x|)) and are corrected afterwards. */
static inline __m128 asin_ps(__m128 x)
{
    __m128 one = (__m128)__lsx_vreplgr2vr_w(c_1.i);
    __m128 half = (__m128)__lsx_vreplgr2vr_w(c_0p5.i);

    // Split into sign and magnitude.
    __m128i sign_bits = __lsx_vand_v((__m128i)x, __lsx_vreplgr2vr_w(0x80000000));
    x = (__m128)__lsx_vand_v((__m128i)x, __lsx_vreplgr2vr_w(0x7fffffff));

    // Lane classification, plus a float that is 1.0 in "big" lanes and 0.0 elsewhere.
    __m128i small_lanes = __lsx_vfcmp_cle_s(x, half);
    __m128i big_lanes = __lsx_vxor_v(small_lanes, __lsx_vreplgr2vr_w(0xffffffff));
    __m128 big_flag = (__m128)__lsx_vand_v(__lsx_vreplgr2vr_w(c_1.i), big_lanes);

    // Substitute argument for the big lanes.
    __m128 reduced = __lsx_vfsqrt_s(__lsx_vfmul_s(half, __lsx_vfsub_s(one, x)));

    // t = small ? |x| : reduced
    __m128 t = (__m128)__lsx_vor_v(__lsx_vand_v(small_lanes, (__m128i)x), __lsx_vand_v(big_lanes, (__m128i)reduced));
    __m128 t2 = __lsx_vfmul_s(t, t);
    __m128 t4 = __lsx_vfmul_s(t2, t2);

    // Polynomial in t2, evaluated as two interleaved chains in t4:
    // even = (a4*t4 + a2)*t4 + a0, odd = (a5*t4 + a3)*t4 + a1, poly = odd*t2 + even.
    __m128 even = __lsx_vfmadd_s(t4, (__m128)__lsx_vreplgr2vr_w(c_cephes_asin_a4.i), (__m128)__lsx_vreplgr2vr_w(c_cephes_asin_a2.i));
    __m128 odd = __lsx_vfmadd_s(t4, (__m128)__lsx_vreplgr2vr_w(c_cephes_asin_a5.i), (__m128)__lsx_vreplgr2vr_w(c_cephes_asin_a3.i));
    even = __lsx_vfmadd_s(t4, even, (__m128)__lsx_vreplgr2vr_w(c_cephes_asin_a0.i));
    odd = __lsx_vfmadd_s(t4, odd, (__m128)__lsx_vreplgr2vr_w(c_cephes_asin_a1.i));
    __m128 poly = __lsx_vfmadd_s(t2, odd, even);

    // result = (poly * t) * scale + offset, where for small lanes scale = 1 and
    // offset = 0, and for big lanes scale = c_n3 + 1 and offset = pi/2
    // (c_n3 is defined outside this chunk).
    __m128 offset = __lsx_vfmul_s((__m128)__lsx_vreplgr2vr_w(c_cephes_asin_half_pi.i), big_flag);
    __m128 core = __lsx_vfmul_s(poly, t);
    __m128 scale = __lsx_vfmadd_s((__m128)__lsx_vreplgr2vr_w(c_n3.i), big_flag, one);
    __m128 result = __lsx_vfmadd_s(core, scale, offset);

    // OR the original sign bit back in.
    return (__m128)__lsx_vor_v((__m128i)result, sign_bits);
}

/* acos() evaluated for 4 packed floats at once.
 * Shares the polynomial of asin_ps; lanes with |x| <= 0.5 return
 * pi/2 - asin-like term, the remaining lanes use the sqrt(0.5 * (1 - |x|))
 * substitution and add pi when the input is negative. */
static inline __m128 acos_ps(__m128 x)
{
    __m128 one = (__m128)__lsx_vreplgr2vr_w(c_1.i);
    __m128 half = (__m128)__lsx_vreplgr2vr_w(c_0p5.i);

    // Classify the signed input first, then split into sign and magnitude.
    __m128i negative_lanes = __lsx_vfcmp_clt_s(x, (__m128)__lsx_vreplgr2vr_w(c_0.i));
    __m128i sign_bits = __lsx_vand_v((__m128i)x, __lsx_vreplgr2vr_w(0x80000000));
    x = (__m128)__lsx_vand_v((__m128i)x, __lsx_vreplgr2vr_w(0x7fffffff));

    __m128i small_lanes = __lsx_vfcmp_cle_s(x, half);
    __m128i big_lanes = __lsx_vxor_v(small_lanes, __lsx_vreplgr2vr_w(0xffffffff));

    // Substitute argument for the big lanes.
    __m128 reduced = __lsx_vfsqrt_s(__lsx_vfmul_s(half, __lsx_vfsub_s(one, x)));

    // t = small ? |x| : reduced
    __m128 t = (__m128)__lsx_vor_v(__lsx_vand_v(small_lanes, (__m128i)x), __lsx_vand_v(big_lanes, (__m128i)reduced));
    __m128 t2 = __lsx_vfmul_s(t, t);
    __m128 t4 = __lsx_vfmul_s(t2, t2);

    // even = (a4*t4 + a2)*t4 + a0, odd = (a5*t4 + a3)*t4 + a1, poly = odd*t2 + even.
    __m128 even = __lsx_vfmadd_s(t4, (__m128)__lsx_vreplgr2vr_w(c_cephes_asin_a4.i), (__m128)__lsx_vreplgr2vr_w(c_cephes_asin_a2.i));
    __m128 odd = __lsx_vfmadd_s(t4, (__m128)__lsx_vreplgr2vr_w(c_cephes_asin_a5.i), (__m128)__lsx_vreplgr2vr_w(c_cephes_asin_a3.i));
    even = __lsx_vfmadd_s(t4, even, (__m128)__lsx_vreplgr2vr_w(c_cephes_asin_a0.i));
    odd = __lsx_vfmadd_s(t4, odd, (__m128)__lsx_vreplgr2vr_w(c_cephes_asin_a1.i));
    __m128 poly = __lsx_vfmadd_s(t2, odd, even);

    __m128 core = __lsx_vfmul_s(t, poly);

    // Small lanes: pi/2 - (core with the input sign).
    __m128 small_result = (__m128)__lsx_vor_v((__m128i)core, sign_bits);
    small_result = __lsx_vfsub_s((__m128)__lsx_vreplgr2vr_w(c_cephes_asin_half_pi.i), small_result);

    // Big lanes: (x < 0 ? pi : 0) + (2 * core with the input sign).
    __m128 pi_if_negative = (__m128)__lsx_vand_v(negative_lanes, __lsx_vreplgr2vr_w(c_cephes_asin_pi.i));
    __m128 doubled = __lsx_vfadd_s(core, core);
    doubled = (__m128)__lsx_vor_v((__m128i)doubled, sign_bits);
    __m128 big_result = __lsx_vfadd_s(pi_if_negative, doubled);

    // Merge the two per-lane results.
    __m128i picked_small = __lsx_vand_v(small_lanes, (__m128i)small_result);
    __m128i picked_big = __lsx_vand_v(big_lanes, (__m128i)big_result);
    return (__m128)__lsx_vor_v(picked_small, picked_big);
}

// Polynomial coefficients x0..x8 used by atan_ps; x0 is the constant term and
// xk multiplies the k-th power of the squared (reduced) argument.
_LOONGARCH_FLOAT_CONST(c_cephes_atan_x0, 1.0f);
_LOONGARCH_FLOAT_CONST(c_cephes_atan_x1, -0.33333072f);
_LOONGARCH_FLOAT_CONST(c_cephes_atan_x2, 0.1999262f);
_LOONGARCH_FLOAT_CONST(c_cephes_atan_x3, -0.14203644f);
_LOONGARCH_FLOAT_CONST(c_cephes_atan_x4, 0.10640934f);
_LOONGARCH_FLOAT_CONST(c_cephes_atan_x5, -0.07504295f);
_LOONGARCH_FLOAT_CONST(c_cephes_atan_x6, 0.04269152f);
_LOONGARCH_FLOAT_CONST(c_cephes_atan_x7, -0.01606863f);
_LOONGARCH_FLOAT_CONST(c_cephes_atan_x8, 0.0028498897f);

/* atan() evaluated for 4 packed floats at once.
 * Lanes with |x| > 1 are evaluated on c_n1 / |x| and shifted by pi/2;
 * the other lanes are evaluated on |x| directly. */
static inline __m128 atan_ps(__m128 x)
{
    // Split into sign and magnitude.
    __m128i sign_bits = __lsx_vand_v((__m128i)x, __lsx_vreplgr2vr_w(0x80000000));
    x = (__m128)__lsx_vand_v((__m128i)x, __lsx_vreplgr2vr_w(0x7fffffff));

    // Lane masks: magnitude above one, and everything else.
    __m128i above_one = __lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(c_1.i), x);
    __m128i not_above_one = __lsx_vxor_v(above_one, __lsx_vreplgr2vr_w(0xffffffff));

    // numerator   = above_one ? c_n1 : |x|
    // denominator = above_one ? |x|  : 1
    __m128 numerator = (__m128)__lsx_vand_v(above_one, __lsx_vreplgr2vr_w(c_n1.i));
    numerator = (__m128)__lsx_vor_v(__lsx_vand_v(not_above_one, (__m128i)x), (__m128i)numerator);
    __m128 denominator = (__m128)__lsx_vand_v(above_one, (__m128i)x);
    denominator = (__m128)__lsx_vor_v(__lsx_vand_v((__m128i)not_above_one, __lsx_vreplgr2vr_w(c_1.i)), (__m128i)denominator);

    __m128 t = __lsx_vfdiv_s(numerator, denominator);
    __m128 t2 = __lsx_vfmul_s(t, t);
    __m128 t4 = __lsx_vfmul_s(t2, t2);

    // Polynomial in t2, split into two chains in t4:
    // odd  = ((x7*t4 + x5)*t4 + x3)*t4 + x1
    // even = (((x8*t4 + x6)*t4 + x4)*t4 + x2)*t4 + x0
    __m128 odd = __lsx_vfmadd_s(t4, (__m128)__lsx_vreplgr2vr_w(c_cephes_atan_x7.i), (__m128)__lsx_vreplgr2vr_w(c_cephes_atan_x5.i));
    __m128 even = __lsx_vfmadd_s(t4, (__m128)__lsx_vreplgr2vr_w(c_cephes_atan_x8.i), (__m128)__lsx_vreplgr2vr_w(c_cephes_atan_x6.i));
    odd = __lsx_vfmadd_s(t4, odd, (__m128)__lsx_vreplgr2vr_w(c_cephes_atan_x3.i));
    even = __lsx_vfmadd_s(t4, even, (__m128)__lsx_vreplgr2vr_w(c_cephes_atan_x4.i));
    odd = __lsx_vfmadd_s(t4, odd, (__m128)__lsx_vreplgr2vr_w(c_cephes_atan_x1.i));
    even = __lsx_vfmadd_s(t4, even, (__m128)__lsx_vreplgr2vr_w(c_cephes_atan_x2.i));
    even = __lsx_vfmadd_s(t4, even, (__m128)__lsx_vreplgr2vr_w(c_cephes_atan_x0.i));
    __m128 poly = __lsx_vfmadd_s(t2, odd, even);

    // result = t * poly, plus pi/2 in the lanes that were inverted.
    __m128 result = __lsx_vfmul_s(t, poly);
    __m128 shift = (__m128)__lsx_vand_v(above_one, __lsx_vreplgr2vr_w(c_cephes_asin_half_pi.i));
    result = __lsx_vfadd_s(result, shift);

    // Flip the sign of the result where the input was negative.
    return (__m128)__lsx_vxor_v(sign_bits, (__m128i)result);
}

/* atan2(y, x) computed for 4 floats at once.
 *
 * Normal lanes (x != 0 and y != 0):
 *     atan(y / x) + (x < 0 ? (y < 0 ? -pi : pi) : 0)
 * Special lanes (x == 0 or y == 0):
 *     y != 0  ->  pi/2 carrying the sign of y
 *     y == 0  ->  pi when the sign bit of x is set, otherwise 0
 */
static inline __m128 atan2_ps(__m128 y, __m128 x)
{
    __m128i not_eq_zero_x, not_eq_zero_y, normal_mode, negative_mask_x, negative_mask_y;
    __m128i negative_mask_full_x, lt_zero_mask_x, lt_zero_mask_y, ge_zero_mask_y, eq_zero_y;
    __m128 pi_additions, tmp1, tmp2, normal_result, special_result, final_result;

    // Lane classification.
    not_eq_zero_x = __lsx_vfcmp_cne_s(x, (__m128)__lsx_vreplgr2vr_w(c_0.i));
    not_eq_zero_y = __lsx_vfcmp_cne_s(y, (__m128)__lsx_vreplgr2vr_w(c_0.i));
    eq_zero_y = __lsx_vxor_v(not_eq_zero_y, __lsx_vreplgr2vr_w(0xffffffff));
    normal_mode = __lsx_vand_v(not_eq_zero_x, not_eq_zero_y);

    // Sign bits only (0x80000000 or 0); these are NOT full lane masks.
    negative_mask_x = __lsx_vand_v((__m128i)x, __lsx_vreplgr2vr_w(0x80000000));
    negative_mask_y = __lsx_vand_v((__m128i)y, __lsx_vreplgr2vr_w(0x80000000));

    lt_zero_mask_x = __lsx_vfcmp_clt_s(x, (__m128)__lsx_vreplgr2vr_w(0));
    lt_zero_mask_y = __lsx_vfcmp_clt_s(y, (__m128)__lsx_vreplgr2vr_w(0));
    ge_zero_mask_y = __lsx_vxor_v(lt_zero_mask_y, __lsx_vreplgr2vr_w(0xffffffff));

    // pi_additions = (x < 0) ? ((y < 0) ? -pi : pi) : 0
    pi_additions = (__m128)__lsx_vand_v(lt_zero_mask_y, __lsx_vreplgr2vr_w(c_cephes_asin_npi.i));
    pi_additions = (__m128)__lsx_vor_v(__lsx_vand_v(ge_zero_mask_y, __lsx_vreplgr2vr_w(c_cephes_asin_pi.i)), (__m128i)pi_additions);
    pi_additions = (__m128)__lsx_vand_v(lt_zero_mask_x, (__m128i)pi_additions);

    normal_result = __lsx_vfdiv_s(y, x);
    normal_result = __lsx_vfadd_s(atan_ps(normal_result), pi_additions);

    // Expand the sign bit of x into a full lane mask: (sign(x) | pi) is -pi or
    // +pi, and comparing that against zero yields all-ones exactly when the
    // sign bit is set (this also covers x == -0.0f).
    negative_mask_full_x = __lsx_vfcmp_clt_s((__m128)__lsx_vor_v(negative_mask_x, __lsx_vreplgr2vr_w(c_cephes_asin_pi.i)), (__m128)__lsx_vreplgr2vr_w(c_0.i));

    // tmp1 = copysign(pi/2, y): OR the sign bit onto the constant. ANDing the
    // sign-only mask with the constant (whose sign bit is clear) always gave 0.
    tmp1 = (__m128)__lsx_vor_v(negative_mask_y, __lsx_vreplgr2vr_w(c_cephes_asin_half_pi.i));
    // tmp2 = sign bit of x set ? pi : 0, selected with the full lane mask.
    tmp2 = (__m128)__lsx_vand_v(negative_mask_full_x, __lsx_vreplgr2vr_w(c_cephes_asin_pi.i));

    // special_result = (y != 0) ? tmp1 : tmp2
    special_result = (__m128)__lsx_vand_v(not_eq_zero_y, (__m128i)tmp1);
    special_result = (__m128)__lsx_vor_v(__lsx_vand_v(eq_zero_y, (__m128i)tmp2), (__m128i)special_result);

    // final_result = normal_mode ? normal_result : special_result
    final_result = (__m128)__lsx_vand_v(normal_mode, (__m128i)normal_result);
    normal_mode = __lsx_vxor_v(normal_mode, __lsx_vreplgr2vr_w(0xffffffff));
    final_result = (__m128)__lsx_vor_v(__lsx_vand_v(normal_mode, (__m128i)special_result), (__m128i)final_result);

    return final_result;
}

/* fmod(a, b) for 4 packed floats: a - trunc(a / b) * b.
 * The truncation goes through a float -> int32 -> float round trip. */
static inline __m128 fmod_ps(__m128 a, __m128 b)
{
    // Quotient truncated toward zero, as int32 lanes.
    __m128i quotient_i = __lsx_vftintrz_w_s(__lsx_vfdiv_s(a, b));

    // Back to float and scaled by the divisor: the whole multiples of b contained in a.
    __m128 whole_part = __lsx_vfmul_s(__lsx_vffint_s_w(quotient_i), b);

    return __lsx_vfsub_s(a, whole_part);
}

/* Rounds 4 packed floats to the nearest integer value; exact halves go to the
 * even neighbour. Works on |x| and negates the result for lanes with x < 0. */
static inline __m128 round_ps(__m128 x)
{
    __m128 half = (__m128)__lsx_vreplgr2vr_w(c_0p5.i);

    // Lanes that must be negated at the end.
    __m128i negative_lanes = __lsx_vfcmp_clt_s(x, (__m128)__lsx_vreplgr2vr_w(0));

    // Integer part of the magnitude (truncated), both as int32 and as float.
    __m128 magnitude = (__m128)__lsx_vbitclri_w((__m128i)x, 31);
    __m128i whole_i = __lsx_vftintrz_w_s(magnitude);
    __m128 whole_f = __lsx_vffint_s_w(whole_i);

    // Fractional part decides the direction.
    __m128 frac = __lsx_vfsub_s(magnitude, whole_f);
    __m128i above_half = __lsx_vfcmp_clt_s(half, frac);
    __m128i exactly_half = __lsx_vfcmp_ceq_s(frac, half);

    // A tie rounds up only when the integer part is odd.
    __m128i odd_whole = __lsx_vseq_w(__lsx_vand_v(whole_i, __lsx_vreplgr2vr_w(1)), __lsx_vreplgr2vr_w(1));
    __m128i bump = __lsx_vor_v(above_half, __lsx_vand_v(exactly_half, odd_whole));

    // Add 1.0 in the lanes selected by bump (mask AND bit pattern of 1.0f).
    __m128 increment = (__m128)__lsx_vand_v(bump, __lsx_vreplgr2vr_w(c_1.i));
    __m128 rounded = __lsx_vfadd_s(whole_f, increment);

    // Select the sign-flipped value for negative inputs.
    __m128i negated = __lsx_vbitrevi_w((__m128i)rounded, 31);
    return (__m128)__lsx_vbitsel_v((__m128i)rounded, negated, negative_lanes);
}

/* log(exp(a) + exp(b)) for 4 packed floats, evaluated as
 * max(a, b) + log(1 + exp(min(a, b) - max(a, b))). */
static inline __m128 logaddexp_ps(__m128 a, __m128 b)
{
    __m128 larger = __lsx_vfmax_s(a, b);
    __m128 smaller = __lsx_vfmin_s(a, b);

    // exp() of the (non-positive) gap between the two operands.
    __m128 ratio = exp_ps(__lsx_vfsub_s(smaller, larger));

    __m128 correction = log_ps(__lsx_vfadd_s((__m128)__lsx_vreplgr2vr_w(c_1.i), ratio));
    return __lsx_vfadd_s(larger, correction);
}

/* floor() for 4 packed floats: truncate through int32, then subtract 1 in the
 * lanes where truncation ended up above the input. */
static inline __m128 floor_ps(__m128 x)
{
    // Truncate toward zero via a float -> int32 -> float round trip.
    __m128 truncated = __lsx_vffint_s_w(__lsx_vftintrz_w_s(x));

    // Lanes where the truncated value is greater than x need the correction.
    __m128i overshoot = __lsx_vfcmp_clt_s(x, truncated);

    // Mask AND bit pattern of 1.0f gives 1.0 or 0.0 per lane.
    __m128 correction = (__m128)__lsx_vand_v(overshoot, __lsx_vreplgr2vr_w(c_1.i));
    return __lsx_vfsub_s(truncated, correction);
}

/* floor(a / b) for 4 packed floats */
static inline __m128 floor_divide_ps(__m128 a, __m128 b)
{
    return floor_ps(__lsx_vfdiv_s(a, b));
}

/* a - round(a / b) * b for 4 packed floats, with the quotient rounded by round_ps */
static inline __m128 remainder_ps(__m128 a, __m128 b)
{
    __m128 rounded_quotient = round_ps(__lsx_vfdiv_s(a, b));
    __m128 nearest_multiple = __lsx_vfmul_s(rounded_quotient, b);
    return __lsx_vfsub_s(a, nearest_multiple);
}

#endif // LSX_MATHFUN_H


================================================
FILE: src/layer/loongarch/mish_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "mish_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#include "lsx_mathfun.h"
#endif // __loongarch_sx

namespace ncnn {

// Packed layouts are only advertised when the LSX code path is compiled in.
Mish_loongarch::Mish_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif
}

// Applies x * tanh(log(exp(x) + 1)) to every float of every channel, in place.
// Channels are distributed over opt.num_threads OpenMP threads. Always returns 0.
int Mish_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    // Floats per channel, with packed lanes counted individually.
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        int remain = size;
#if __loongarch_sx
        // Vector path: four floats per iteration.
        __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
        while (remain >= 4)
        {
            __builtin_prefetch(ptr + 16);
            __m128 _p = (__m128)__lsx_vld(ptr, 0);
            __m128 _softplus = log_ps(__lsx_vfadd_s(exp_ps(_p), _one));
            _p = __lsx_vfmul_s(_p, tanh_ps(_softplus));
            __lsx_vst(_p, ptr, 0);

            ptr += 4;
            remain -= 4;
        }
#endif // __loongarch_sx
        // Scalar path: whatever the vector loop left over (or everything without LSX).
        while (remain > 0)
        {
            const float v = *ptr;
            *ptr = v * tanhf(logf(expf(v) + 1.f));
            ptr++;
            remain--;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/mish_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_MISH_LOONGARCH_H
#define LAYER_MISH_LOONGARCH_H

#include "mish.h"

namespace ncnn {

// LoongArch variant of the Mish layer. It derives from Mish and overrides the
// in-place forward pass.
class Mish_loongarch : public Mish
{
public:
    Mish_loongarch();

    // Processes bottom_top_blob in place, using the settings carried in opt.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_MISH_LOONGARCH_H


================================================
FILE: src/layer/loongarch/packing_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "packing_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

namespace ncnn {

// Sets support_packing to true unconditionally (no __loongarch_sx guard here).
Packing_loongarch::Packing_loongarch()
{
    support_packing = true;
}

int Packing_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

    if (elembits == 8)
        return forward_int8(bottom_blob, top_blob, opt);

    if (use_padding)
    {
        return Packing::forward(bottom_blob, top_blob, opt);
    }

    if (elembits != 32)
    {
        // non-fp32 type
        return Packing::forward(bottom_blob, top_blob, opt);
    }

    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    if (elempack == out_elempack)
    {
        top_blob = bottom_blob;
        return 0;
    }

    bool pack1to4 = elempack == 1 && out_elempack == 4;
    bool pack4to1 = elempack == 4 && out_elempack == 1;

    if (!pack1to4 && !pack4to1)
    {
        return Packing::forward(bottom_blob, top_blob, opt);
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;

    if (!use_padding)
    {
        // identity if use_padding not allowed
        if (dims == 1 && w * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
        if (dims == 2 && h * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
        if ((dims == 3 || dims == 4) && channels * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
    }

    if (dims == 1)
    {
        top_blob = bottom_blob;
        top_blob.w = w * elempack / out_elempack;
        top_blob.cstep = bottom_blob.cstep * elempack / out_elempack;
        top_blob.elemsize = elemsize / elempack * out_elempack;
        top_blob.elempack = out_elempack;
        return 0;
    }

    if (dims == 2)
    {
        int outh = h * elempack / out_elempack;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (pack1to4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < outh; i++)
            {
                const float* r0 = bottom_blob.row(i * 4);
                const float* r1 = bottom_blob.row(i * 4 + 1);
                const float* r2 = bottom_blob.row(i * 4 + 2);
                const float* r3 = bottom_blob.row(i * 4 + 3);

                float* outptr = top_blob.row(i);

                int j = 0;
#if __loongarch_sx
                for (; j + 3 < w; j += 4)
                {
                    // transpose 4x4
                    __m128i _r0 = __lsx_vld(r0, 0);
                    __m128i _r1 = __lsx_vld(r1, 0);
                    __m128i _r2 = __lsx_vld(r2, 0);
                    __m128i _r3 = __lsx_vld(r3, 0);

                    __m128i _r01r = __lsx_vilvl_w(_r1, _r0);
                    __m128i _r01l = __lsx_vilvh_w(_r1, _r0);
                    __m128i _r23r = __lsx_vilvl_w(_r3, _r2);
                    __m128i _r23l = __lsx_vilvh_w(_r3, _r2);
                    __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r);
                    __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r);
                    __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l);
                    __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l);

                    __lsx_vst(_r0123_0, outptr, 0);
                    __lsx_vst(_r0123_1, outptr + 4, 0);
                    __lsx_vst(_r0123_2, outptr + 4 * 2, 0);
                    __lsx_vst(_r0123_3, outptr + 4 * 3, 0);

                    r0 += 4;
                    r1 += 4;
                    r2 += 4;
                    r3 += 4;
                    outptr += 16;
                }
#endif // __loongarch_sx
                for (; j < w; j++)
                {
                    outptr[0] = *r0++;
                    outptr[1] = *r1++;
                    outptr[2] = *r2++;
                    outptr[3] = *r3++;

                    outptr += 4;
                }
            }
        }
        if (pack4to1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const float* r0 = bottom_blob.row(i);

                float* outptr0 = top_blob.row(i * 4);
                float* outptr1 = top_blob.row(i * 4 + 1);
                float* outptr2 = top_blob.row(i * 4 + 2);
                float* outptr3 = top_blob.row(i * 4 + 3);

                int j = 0;
#if __loongarch_sx
                for (; j + 3 < w; j += 4)
                {
                    // transpose 4x4
                    __m128i _r0 = __lsx_vld(r0, 0);
                    __m128i _r1 = __lsx_vld(r0 + 4, 0);
                    __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0);
                    __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0);

                    __m128i _r01r = __lsx_vilvl_w(_r1, _r0);
                    __m128i _r01l = __lsx_vilvh_w(_r1, _r0);
                    __m128i _r23r = __lsx_vilvl_w(_r3, _r2);
                    __m128i _r23l = __lsx_vilvh_w(_r3, _r2);
                    __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r);
                    __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r);
                    __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l);
                    __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l);

                    __lsx_vst(_r0123_0, outptr0, 0);
                    __lsx_vst(_r0123_1, outptr1, 0);
                    __lsx_vst(_r0123_2, outptr2, 0);
                    __lsx_vst(_r0123_3, outptr3, 0);

                    r0 += 16;
                    outptr0 += 4;
                    outptr1 += 4;
                    outptr2 += 4;
                    outptr3 += 4;
                }
#endif // __loongarch_sx
                for (; j < w; j++)
                {
                    *outptr0++ = r0[0];
                    *outptr1++ = r0[1];
                    *outptr2++ = r0[2];
                    *outptr3++ = r0[3];

                    r0 += 4;
                }
            }
        }

        return 0;
    }

    if (dims == 3 || dims == 4)
    {
        int size = w * h * d;
        int outc = channels * elempack / out_elempack;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        if (dims == 3)
            top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
        else // if (dims == 4)
            top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (pack1to4)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < outc; q++)
            {
                const float* r0 = bottom_blob.channel(q * 4);
                const float* r1 = bottom_blob.channel(q * 4 + 1);
                const float* r2 = bottom_blob.channel(q * 4 + 2);
                const float* r3 = bottom_blob.channel(q * 4 + 3);

                float* outptr = top_blob.channel(q);

                int i = 0;
#if __loongarch_sx
                for (; i + 3 < size; i += 4)
                {
                    // transpose 4x4
                    __m128i _r0 = __lsx_vld(r0, 0);
                    __m128i _r1 = __lsx_vld(r1, 0);
                    __m128i _r2 = __lsx_vld(r2, 0);
                    __m128i _r3 = __lsx_vld(r3, 0);

                    __m128i _r01r = __lsx_vilvl_w(_r1, _r0);
                    __m128i _r01l = __lsx_vilvh_w(_r1, _r0);
                    __m128i _r23r = __lsx_vilvl_w(_r3, _r2);
                    __m128i _r23l = __lsx_vilvh_w(_r3, _r2);
                    __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r);
                    __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r);
                    __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l);
                    __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l);

                    __lsx_vst(_r0123_0, outptr, 0);
                    __lsx_vst(_r0123_1, outptr + 4, 0);
                    __lsx_vst(_r0123_2, outptr + 4 * 2, 0);
                    __lsx_vst(_r0123_3, outptr + 4 * 3, 0);

                    r0 += 4;
                    r1 += 4;
                    r2 += 4;
                    r3 += 4;
                    outptr += 16;
                }
#endif // __loongarch_sx
                for (; i < size; i++)
                {
                    outptr[0] = *r0++;
                    outptr[1] = *r1++;
                    outptr[2] = *r2++;
                    outptr[3] = *r3++;

                    outptr += 4;
                }
            }
        }
        if (pack4to1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* r0 = bottom_blob.channel(q);

                float* outptr0 = top_blob.channel(q * 4);
                float* outptr1 = top_blob.channel(q * 4 + 1);
                float* outptr2 = top_blob.channel(q * 4 + 2);
                float* outptr3 = top_blob.channel(q * 4 + 3);

                int i = 0;
#if __loongarch_sx
                for (; i + 3 < size; i += 4)
                {
                    // transpose 4x4
                    __m128i _r0 = __lsx_vld(r0, 0);
                    __m128i _r1 = __lsx_vld(r0 + 4, 0);
                    __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0);
                    __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0);

                    __m128i _r01r = __lsx_vilvl_w(_r1, _r0);
                    __m128i _r01l = __lsx_vilvh_w(_r1, _r0);
                    __m128i _r23r = __lsx_vilvl_w(_r3, _r2);
                    __m128i _r23l = __lsx_vilvh_w(_r3, _r2);
                    __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r);
                    __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r);
                    __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l);
                    __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l);

                    __lsx_vst(_r0123_0, outptr0, 0);
                    __lsx_vst(_r0123_1, outptr1, 0);
                    __lsx_vst(_r0123_2, outptr2, 0);
                    __lsx_vst(_r0123_3, outptr3, 0);

                    r0 += 16;
                    outptr0 += 4;
                    outptr1 += 4;
                    outptr2 += 4;
                    outptr3 += 4;
                }
#endif // __loongarch_sx
                for (; i < size; i++)
                {
                    *outptr0++ = r0[0];
                    *outptr1++ = r0[1];
                    *outptr2++ = r0[2];
                    *outptr3++ = r0[3];

                    r0 += 4;
                }
            }
        }

        return 0;
    }

    return 0;
}

// Repack an 8-bit blob between elempack 1 and elempack 8.
//
// The packed axis is the outermost one: w for 1-D, h for 2-D and c for
// 3-D/4-D blobs. 1-D blobs are only re-described (top_blob shares the input
// data); 2-D and higher blobs are interleaved / de-interleaved into a newly
// created top_blob. Requests with use_padding set, or with any pack pair
// other than 1<->8, are delegated to Packing::forward.
//
// Returns 0 on success, -100 when top_blob could not be allocated, or
// whatever Packing::forward returns on the delegated paths.
int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // padded repacking is handled by the generic layer
    if (use_padding)
        return Packing::forward(bottom_blob, top_blob, opt);

    const size_t elemsize = bottom_blob.elemsize;
    const int elempack = bottom_blob.elempack;

    // already in the requested layout: just share the input
    if (elempack == out_elempack)
    {
        top_blob = bottom_blob;
        return 0;
    }

    const bool pack1to8 = (elempack == 1 && out_elempack == 8);
    const bool pack8to1 = (elempack == 8 && out_elempack == 1);

    // only 1<->8 is specialised here
    if (!(pack1to8 || pack8to1))
        return Packing::forward(bottom_blob, top_blob, opt);

    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int d = bottom_blob.d;
    const int channels = bottom_blob.c;
    const int dims = bottom_blob.dims;

    // use_padding is known to be false here, so a packed axis that does not
    // divide evenly into out_elempack means the blob is passed through as-is
    const bool uneven = (dims == 1 && (w * elempack) % out_elempack != 0)
                        || (dims == 2 && (h * elempack) % out_elempack != 0)
                        || ((dims == 3 || dims == 4) && (channels * elempack) % out_elempack != 0);
    if (uneven)
    {
        top_blob = bottom_blob;
        return 0;
    }

    const size_t out_elemsize = elemsize / elempack * out_elempack;

    if (dims == 1)
    {
        // the memory layout is identical, only the descriptor changes
        top_blob = bottom_blob;
        top_blob.w = w * elempack / out_elempack;
        top_blob.cstep = bottom_blob.cstep * elempack / out_elempack;
        top_blob.elemsize = out_elemsize;
        top_blob.elempack = out_elempack;
        return 0;
    }

    if (dims == 2)
    {
        const int outh = h * elempack / out_elempack;

        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (pack1to8)
        {
            // interleave eight consecutive input rows into one packed output row
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < outh; i++)
            {
                const signed char* r0 = bottom_blob.row<const signed char>(i * 8);
                const signed char* r1 = bottom_blob.row<const signed char>(i * 8 + 1);
                const signed char* r2 = bottom_blob.row<const signed char>(i * 8 + 2);
                const signed char* r3 = bottom_blob.row<const signed char>(i * 8 + 3);
                const signed char* r4 = bottom_blob.row<const signed char>(i * 8 + 4);
                const signed char* r5 = bottom_blob.row<const signed char>(i * 8 + 5);
                const signed char* r6 = bottom_blob.row<const signed char>(i * 8 + 6);
                const signed char* r7 = bottom_blob.row<const signed char>(i * 8 + 7);

                signed char* outptr = top_blob.row<signed char>(i);

                for (int j = 0; j < w; j++)
                {
                    signed char* lane = outptr + j * 8;

                    lane[0] = r0[j];
                    lane[1] = r1[j];
                    lane[2] = r2[j];
                    lane[3] = r3[j];
                    lane[4] = r4[j];
                    lane[5] = r5[j];
                    lane[6] = r6[j];
                    lane[7] = r7[j];
                }
            }
        }
        else // pack8to1, the only other combination that reaches this point
        {
            // scatter each packed input row into eight plain output rows
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const signed char* r0 = bottom_blob.row<const signed char>(i);

                signed char* outptr0 = top_blob.row<signed char>(i * 8);
                signed char* outptr1 = top_blob.row<signed char>(i * 8 + 1);
                signed char* outptr2 = top_blob.row<signed char>(i * 8 + 2);
                signed char* outptr3 = top_blob.row<signed char>(i * 8 + 3);
                signed char* outptr4 = top_blob.row<signed char>(i * 8 + 4);
                signed char* outptr5 = top_blob.row<signed char>(i * 8 + 5);
                signed char* outptr6 = top_blob.row<signed char>(i * 8 + 6);
                signed char* outptr7 = top_blob.row<signed char>(i * 8 + 7);

                for (int j = 0; j < w; j++)
                {
                    const signed char* lane = r0 + j * 8;

                    outptr0[j] = lane[0];
                    outptr1[j] = lane[1];
                    outptr2[j] = lane[2];
                    outptr3[j] = lane[3];
                    outptr4[j] = lane[4];
                    outptr5[j] = lane[5];
                    outptr6[j] = lane[6];
                    outptr7[j] = lane[7];
                }
            }
        }

        return 0;
    }

    if (dims == 3 || dims == 4)
    {
        const int size = w * h * d;
        const int outc = channels * elempack / out_elempack;

        if (dims == 3)
            top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
        else // dims == 4
            top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (pack1to8)
        {
            // interleave eight consecutive input channels into one packed channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < outc; q++)
            {
                const signed char* r0 = bottom_blob.channel(q * 8);
                const signed char* r1 = bottom_blob.channel(q * 8 + 1);
                const signed char* r2 = bottom_blob.channel(q * 8 + 2);
                const signed char* r3 = bottom_blob.channel(q * 8 + 3);
                const signed char* r4 = bottom_blob.channel(q * 8 + 4);
                const signed char* r5 = bottom_blob.channel(q * 8 + 5);
                const signed char* r6 = bottom_blob.channel(q * 8 + 6);
                const signed char* r7 = bottom_blob.channel(q * 8 + 7);

                signed char* outptr = top_blob.channel(q);

                for (int i = 0; i < size; i++)
                {
                    signed char* lane = outptr + i * 8;

                    lane[0] = r0[i];
                    lane[1] = r1[i];
                    lane[2] = r2[i];
                    lane[3] = r3[i];
                    lane[4] = r4[i];
                    lane[5] = r5[i];
                    lane[6] = r6[i];
                    lane[7] = r7[i];
                }
            }
        }
        else // pack8to1, the only other combination that reaches this point
        {
            // scatter each packed input channel into eight plain channels
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const signed char* r0 = bottom_blob.channel(q);

                signed char* outptr0 = top_blob.channel(q * 8);
                signed char* outptr1 = top_blob.channel(q * 8 + 1);
                signed char* outptr2 = top_blob.channel(q * 8 + 2);
                signed char* outptr3 = top_blob.channel(q * 8 + 3);
                signed char* outptr4 = top_blob.channel(q * 8 + 4);
                signed char* outptr5 = top_blob.channel(q * 8 + 5);
                signed char* outptr6 = top_blob.channel(q * 8 + 6);
                signed char* outptr7 = top_blob.channel(q * 8 + 7);

                for (int i = 0; i < size; i++)
                {
                    const signed char* lane = r0 + i * 8;

                    outptr0[i] = lane[0];
                    outptr1[i] = lane[1];
                    outptr2[i] = lane[2];
                    outptr3[i] = lane[3];
                    outptr4[i] = lane[4];
                    outptr5[i] = lane[5];
                    outptr6[i] = lane[6];
                    outptr7[i] = lane[7];
                }
            }
        }

        return 0;
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/packing_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_PACKING_LOONGARCH_H
#define LAYER_PACKING_LOONGARCH_H

#include "packing.h"

namespace ncnn {

// LoongArch specialisation of the Packing layer.
// Overrides forward() of the generic Packing layer and adds a dedicated
// 8-bit code path.
class Packing_loongarch : public Packing
{
public:
    Packing_loongarch();

    // Convert bottom_blob to this layer's out_elempack and store the result in top_blob.
    // Returns 0 on success and a non-zero code on failure (-100 is used for allocation failure).
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
    // 8-bit variant: handles elempack 1 <-> 8 itself and defers every other
    // case (including use_padding) to Packing::forward.
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_PACKING_LOONGARCH_H


================================================
FILE: src/layer/loongarch/padding_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "padding_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

#if __loongarch_sx
#include "padding_pack4.h"
#include "padding_pack8_int8.h"
#endif // __loongarch_sx

// Constructor: sets support_packing only when the file is built with LSX
// (__loongarch_sx), which is also the condition under which the packed
// padding kernels are compiled in.
Padding_loongarch::Padding_loongarch()
{
#if __loongarch_sx
    // NOTE(review): presumably tells the framework this layer accepts packed
    // (elempack > 1) blobs - confirm against the Layer base class.
    support_packing = true;
#endif // __loongarch_sx
}

// Pad bottom_blob by top/bottom/left/right/front/behind and write the result
// to top_blob.
//
// - No padding requested: top_blob shares bottom_blob.
// - 8-bit blobs are forwarded to forward_int8().
// - With LSX and elempack == 4, padding is done directly on packed data when
//   the packed axis stays a multiple of 4 (type 0 = constant, 1 = replicate,
//   2 = reflect, as dispatched to the padding_*_pack4_lsx kernels).
// - Everything else is unpacked to elempack 1, padded by Padding::forward and
//   repacked.
//
// Returns 0 on success, -100 when top_blob could not be allocated, or the
// error code of Padding::forward.
int Padding_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const bool has_padding = top != 0 || bottom != 0 || left != 0 || right != 0 || front != 0 || behind != 0;
    if (!has_padding)
    {
        top_blob = bottom_blob;
        return 0;
    }

    // 8-bit data has its own implementation
    if (bottom_blob.elembits() == 8)
        return forward_int8(bottom_blob, top_blob, opt);

    const int elempack = bottom_blob.elempack;

#if __loongarch_sx
    if (elempack == 4)
    {
        const int w = bottom_blob.w;
        const int h = bottom_blob.h;
        const int d = bottom_blob.d;
        const int channels = bottom_blob.c;
        const int dims = bottom_blob.dims;
        const size_t elemsize = bottom_blob.elemsize;

        // element size of a pack4 output, used by the 1-D/2-D/3-D fast paths
        const size_t pack4_elemsize = elemsize / elempack * 4;

        switch (dims)
        {
        case 1:
        {
            const int outw = w * elempack + left + right;

            // stays pack4 only if both the result and the left offset are multiples of 4
            if (outw % 4 == 0 && left % 4 == 0 && type == 0)
            {
                top_blob.create(outw / 4, pack4_elemsize, 4, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                const __m128 fill_value = __lsx_vreplfr2vr_s(value);
                padding_constant_pack4_lsx(bottom_blob, top_blob, 0, 0, left / 4, right / 4, fill_value);

                return 0;
            }
            break;
        }
        case 2:
        {
            const int outw = w + left + right;
            const int outh = h * elempack + top + bottom;

            // h is the packed axis here
            if (outh % 4 == 0 && top % 4 == 0 && type == 0)
            {
                top_blob.create(outw, outh / 4, pack4_elemsize, 4, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                const __m128 fill_value = __lsx_vreplfr2vr_s(value);
                padding_constant_pack4_lsx(bottom_blob, top_blob, top / 4, bottom / 4, left, right, fill_value);

                return 0;
            }
            break;
        }
        case 3:
        {
            const int outw = w + left + right;
            const int outh = h + top + bottom;
            const int outc = channels * elempack + front + behind;

            // non-constant modes are only handled when no channels are added
            const bool channel_mode_ok = (outc == channels * elempack) || (type == 0);

            if (front % 4 == 0 && outc % 4 == 0 && channel_mode_ok)
            {
                const int out_channels = outc / 4;

                top_blob.create(outw, outh, out_channels, pack4_elemsize, 4, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                // number of whole packed channels inserted in front
                const int front_packs = front / elempack;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < out_channels; q++)
                {
                    Mat out_plane = top_blob.channel(q);

                    __m128 fill_value;
                    if (per_channel_pad_data_size)
                        fill_value = (__m128)__lsx_vld((const float*)per_channel_pad_data + q * 4, 0);
                    else
                        fill_value = __lsx_vreplfr2vr_s(value);

                    const int src_q = q - front_packs;
                    if (src_q < 0 || src_q >= channels)
                    {
                        // this output channel is pure channel padding
                        out_plane.fill(fill_value);
                    }
                    else
                    {
                        const Mat in_plane = bottom_blob.channel(src_q);
                        if (type == 0)
                            padding_constant_pack4_lsx(in_plane, out_plane, top, bottom, left, right, fill_value);
                        else if (type == 1)
                            padding_replicate_pack4_lsx(in_plane, out_plane, top, bottom, left, right);
                        else if (type == 2)
                            padding_reflect_pack4_lsx(in_plane, out_plane, top, bottom, left, right);
                    }
                }

                return 0;
            }
            break;
        }
        case 4:
        {
            const int outw = w + left + right;
            const int outh = h + top + bottom;
            const int outd = d + front + behind;

            // for 4-D blobs front/behind pad the depth axis, so the packing is untouched
            if (type == 0)
            {
                top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    __m128 fill_value;
                    if (per_channel_pad_data_size)
                        fill_value = (__m128)__lsx_vld((const float*)per_channel_pad_data + q * 4, 0);
                    else
                        fill_value = __lsx_vreplfr2vr_s(value);

                    for (int z = 0; z < outd; z++)
                    {
                        Mat out_plane = top_blob.channel(q).depth(z);

                        const int src_z = z - front;
                        if (src_z < 0 || src_z >= d)
                        {
                            // this depth slice is pure depth padding
                            out_plane.fill(fill_value);
                        }
                        else
                        {
                            const Mat in_plane = bottom_blob.channel(q).depth(src_z);
                            padding_constant_pack4_lsx(in_plane, out_plane, top, bottom, left, right, fill_value);
                        }
                    }
                }

                return 0;
            }
            break;
        }
        default:
            break;
        }
    }
#endif // __loongarch_sx

    // generic route: unpack -> pad with the reference implementation -> repack
    Mat bottom_blob_unpacked = bottom_blob;
    if (elempack != 1)
    {
        // the unpacked copy is temporary, so take it from the workspace allocator
        Option opt_pack1 = opt;
        opt_pack1.blob_allocator = opt.workspace_allocator;

        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
    }

    Mat top_blob_unpacked;
    const int ret = Padding::forward(bottom_blob_unpacked, top_blob_unpacked, opt);
    if (ret != 0)
        return ret;

    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout && top_blob_unpacked.c % 4 == 0)
        out_elempack = 4;
#endif

    convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);

    return 0;
}

// Pad an 8-bit blob by top/bottom/left/right/front/behind.
//
// With LSX and elempack == 8, padding is performed directly on packed data
// when the packed axis stays a multiple of 8 (type 0 = constant,
// 1 = replicate, 2 = reflect, as dispatched to the padding_*_pack8_int8_lsx
// kernels). Every other case is unpacked to elempack 1, padded by
// Padding::forward and repacked.
//
// Returns 0 on success, -100 when top_blob could not be allocated, or the
// error code of Padding::forward.
int Padding_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

#if __loongarch_sx
    if (elempack == 8)
    {
        // Replicate the pad byte into all eight lanes of a 64-bit word.
        // The value is first narrowed to signed char (the element type) and
        // then viewed as an unsigned byte: OR-ing shifted copies of a
        // sign-extended negative int64_t would set lanes 1..7 to 0xFF instead
        // of the pad byte, and shifting a negative value is undefined anyway.
        // TODO: per_channel_pad_data is not honoured on the int8 path.
        const unsigned long long pad_byte = (unsigned char)(signed char)value;
        const int64_t pad_value = (int64_t)(pad_byte * 0x0101010101010101ULL);

        if (dims == 1)
        {
            int outw = w * elempack + left + right;

            int out_elempack = outw % 8 == 0 ? 8 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            // stays pack8 only if both the result and the left offset are multiples of 8
            if (left % 8 == 0 && out_elempack == 8 && type == 0)
            {
                top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                padding_constant_pack8_int8_lsx(bottom_blob, top_blob, 0, 0, left / 8, right / 8, pad_value);

                return 0;
            }
        }

        if (dims == 2)
        {
            int outw = w + left + right;
            int outh = h * elempack + top + bottom;

            int out_elempack = outh % 8 == 0 ? 8 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            // h is the packed axis here
            if (top % 8 == 0 && out_elempack == 8 && type == 0)
            {
                top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                padding_constant_pack8_int8_lsx(bottom_blob, top_blob, top / 8, bottom / 8, left, right, pad_value);

                return 0;
            }
        }

        if (dims == 3)
        {
            int outw = w + left + right;
            int outh = h + top + bottom;
            int outc = channels * elempack + front + behind;

            int out_elempack = outc % 8 == 0 ? 8 : 1;
            size_t out_elemsize = elemsize / elempack * out_elempack;

            // non-constant modes are only handled when no channels are added
            if (front % 8 == 0 && out_elempack == 8 && !(outc != channels * elempack && type != 0))
            {
                top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                // number of whole packed channels inserted in front
                int front_ = front / elempack;
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < outc / out_elempack; q++)
                {
                    Mat borderm = top_blob.channel(q);

                    // channel padding
                    if ((q - front_) < 0 || (q - front_) >= channels)
                    {
                        borderm.fill(pad_value);
                    }
                    else
                    {
                        const Mat m = bottom_blob.channel(q - front_);
                        if (type == 0)
                            padding_constant_pack8_int8_lsx(m, borderm, top, bottom, left, right, pad_value);
                        if (type == 1)
                            padding_replicate_pack8_int8_lsx(m, borderm, top, bottom, left, right);
                        if (type == 2)
                            padding_reflect_pack8_int8_lsx(m, borderm, top, bottom, left, right);
                    }
                }

                return 0;
            }
        }

        if (dims == 4)
        {
            int outw = w + left + right;
            int outh = h + top + bottom;
            int outd = d + front + behind;

            // for 4-D blobs front/behind pad the depth axis, so the packing is untouched
            if (type == 0)
            {
                top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator);
                if (top_blob.empty())
                    return -100;

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    for (int z = 0; z < outd; z++)
                    {
                        Mat borderm = top_blob.channel(q).depth(z);

                        // depth padding
                        if ((z - front) < 0 || (z - front) >= d)
                        {
                            borderm.fill(pad_value);
                        }
                        else
                        {
                            const Mat m = bottom_blob.channel(q).depth(z - front);
                            padding_constant_pack8_int8_lsx(m, borderm, top, bottom, left, right, pad_value);
                        }
                    }
                }

                return 0;
            }
        }
    }
#endif // __loongarch_sx

    // generic route: unpack -> pad with the reference implementation -> repack
    Mat bottom_blob_unpacked = bottom_blob;
    if (elempack != 1)
    {
        // the unpacked copy is temporary, so take it from the workspace allocator
        Option opt_pack1 = opt;
        opt_pack1.blob_allocator = opt.workspace_allocator;

        convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
    }

    Mat top_blob_unpacked;
    int ret = Padding::forward(bottom_blob_unpacked, top_blob_unpacked, opt);
    if (ret != 0)
        return ret;

    int out_elempack = 1;
#if __loongarch_sx
    if (opt.use_packing_layout)
    {
        out_elempack = top_blob_unpacked.c % 8 == 0 ? 8 : 1;
    }
#endif

    convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/padding_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_PADDING_LOONGARCH_H
#define LAYER_PADDING_LOONGARCH_H

#include "padding.h"

namespace ncnn {

// LoongArch specialisation of the Padding layer.
// Overrides forward() of the generic Padding layer and adds a dedicated
// 8-bit code path.
class Padding_loongarch : public Padding
{
public:
    // Sets support_packing when built with LSX (__loongarch_sx).
    Padding_loongarch();

    // Pad bottom_blob and store the result in top_blob; 8-bit blobs are routed to forward_int8().
    // Returns 0 on success and a non-zero code on failure (-100 is used for allocation failure).
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

protected:
    // 8-bit variant of forward(): pads elempack == 8 data directly when built
    // with LSX, otherwise unpacks, calls Padding::forward and repacks.
    int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_PADDING_LOONGARCH_H


================================================
FILE: src/layer/loongarch/padding_pack4.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// Constant-pad one pack4 float plane.
//
//   src         input plane, read as consecutive 4-float elements
//   dst         output plane, written consecutively from its first element
//   top/bottom  number of dst-wide rows filled with v above / below the data
//   left/right  number of elements filled with v before / after each src row
//   v           4-lane pad value
//
// NOTE(review): the output is written without any row stride, so this assumes
// dst rows are contiguous and dst.w == left + src.w + right - confirm at the callers.
static void padding_constant_pack4_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right, __m128 v)
{
    const float* in = src;
    float* out = dst;

    const int leading = top * dst.w;
    const int trailing = bottom * dst.w;

    // rows above the data
    for (int n = leading; n > 0; n--)
    {
        __lsx_vst(v, out, 0);
        out += 4;
    }

    // data rows, each framed by left/right padding
    for (int row = 0; row < src.h; row++)
    {
        for (int n = left; n > 0; n--)
        {
            __lsx_vst(v, out, 0);
            out += 4;
        }

        for (int col = 0; col < src.w; col++)
        {
            __builtin_prefetch(in + 32);
            __lsx_vst(__lsx_vld(in, 0), out, 0);
            in += 4;
            out += 4;
        }

        for (int n = right; n > 0; n--)
        {
            __lsx_vst(v, out, 0);
            out += 4;
        }
    }

    // rows below the data
    for (int n = trailing; n > 0; n--)
    {
        __lsx_vst(v, out, 0);
        out += 4;
    }
}

static void padding_replicate_pack4_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
{
    const float* ptr = src;
    float* outptr = dst;

    // fill top
    for (int y = 0; y < top; y++)
    {
        const float* ptr0 = ptr;
        __m128 _p = (__m128)__lsx_vld(ptr0, 0);
        for (int x = 0; x < left; x++)
        {
            __lsx_vst(_p, outptr, 0);
            outptr += 4;
        }
        for (int x = 0; x < src.w; x++)
        {
            _p = (__m128)__lsx_vld(ptr0, 0);
            __lsx_vst(_p, outptr, 0);
            ptr0 += 4;
            outptr += 4;
        }
        for (int x = 0; x < right; x++)
        {
            __lsx_vst(_p, outptr, 0);
            outptr += 4;
        }
    }
    // fill center
    for (int y = 0; y < src.h; y++)
    {
        __m128 _p = (__m128)__lsx_vld(ptr, 0);
        for (int x = 0; x < left; x++)
        {
            __lsx_vst(_p, outptr, 0);
            outptr += 4;
        }
        for (int x = 0; x < src.w; x++)
        {
            _p = (__m128)__lsx_vld(ptr, 0);
            __lsx_vst(_p, outptr, 0);
            ptr += 4;
            outptr += 4;
        }
        for (int x = 0; x < right; x++)
        {
            __lsx_vst(_p, outptr, 0);
            outptr += 4;
        }
    }
    // fill bottom
    ptr -= src.w * 4;
    for (int y = 0; y < bottom; y++)
    {
        const float* ptr0 = ptr;
        __m128 _p = (__m128)__lsx_vld(ptr0, 0);
        for (int x = 0; x < left; x++)
        {
            __lsx_vst(_p, outptr, 0);
            outptr += 4;
        }
        for (int x = 0; x < src.w; x++)
        {
            _p = (__m128)__lsx_vld(ptr0, 0);
            __lsx_vst(_p, outptr, 0);
            ptr0 += 4;
            outptr += 4;
        }
        for (int x = 0; x < right; x++)
        {
            __lsx_vst(_p, outptr, 0);
            outptr += 4;
        }
    }
}

// Write one reflect-padded output row for pack4 floats.
// The left border mirrors around element 0 (elements left, left-1, ..., 1),
// the right border mirrors around the last element (w-2, w-3, ...); the edge
// element itself is not repeated.
// `out` is advanced past everything written; the return value is `row`
// advanced past the `w` source elements.
static inline const float* padding_reflect_row_pack4_lsx(const float* row, int w, int left, int right, float*& out)
{
    for (int x = 0; x < left; x++)
    {
        __m128 _p = (__m128)__lsx_vld(row + (left - x) * 4, 0);
        __lsx_vst(_p, out, 0);
        out += 4;
    }

    for (int x = 0; x < w; x++)
    {
        __m128 _p = (__m128)__lsx_vld(row, 0);
        __lsx_vst(_p, out, 0);
        row += 4;
        out += 4;
    }

    // row now points one element past the end: -4 is the last element, -8 the one before it
    for (int x = 0; x < right; x++)
    {
        __m128 _p = (__m128)__lsx_vld(row - 8 - x * 4, 0);
        __lsx_vst(_p, out, 0);
        out += 4;
    }

    return row;
}

// Reflect-pad one pack4 float plane (mirror without repeating the edge).
//
//   src         input plane, read as consecutive 4-float elements
//   dst         output plane, written consecutively from its first element
//   top/bottom  number of mirrored rows added above / below
//   left/right  number of mirrored elements added before / after every row
static void padding_reflect_pack4_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
{
    const float* in = src;
    float* out = dst;

    // top border: source rows top, top-1, ..., 1
    in += top * src.w * 4;
    for (int y = 0; y < top; y++)
    {
        padding_reflect_row_pack4_lsx(in, src.w, left, right, out);
        in -= src.w * 4;
    }

    // `in` is back at row 0: emit the source rows
    for (int y = 0; y < src.h; y++)
    {
        in = padding_reflect_row_pack4_lsx(in, src.w, left, right, out);
    }

    // bottom border: `in` is one row past the end, start at the second-to-last
    // row and walk upwards
    in -= 2 * src.w * 4;
    for (int y = 0; y < bottom; y++)
    {
        padding_reflect_row_pack4_lsx(in, src.w, left, right, out);
        in -= src.w * 4;
    }
}


================================================
FILE: src/layer/loongarch/padding_pack8_int8.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

// Constant-pad one pack8 int8 plane; each packed element (8 x int8) is moved
// as a single 64-bit word.
//
//   src         input plane, read as consecutive 64-bit elements
//   dst         output plane, written consecutively from its first element
//   top/bottom  number of dst.w-wide rows filled with _v above / below the data
//   left/right  number of elements filled with _v before / after each src row
//   _v          pad value, already replicated into all eight byte lanes by the caller
static void padding_constant_pack8_int8_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int64_t _v)
{
    const int64_t* in = src;
    int64_t* out = dst;

    // rows above the data
    for (int y = 0; y < top; y++)
    {
        int n = dst.w;
        while (n-- > 0)
            *out++ = _v;
    }

    // data rows, each framed by left/right padding
    for (int y = 0; y < src.h; y++)
    {
        int n = left;
        while (n-- > 0)
            *out++ = _v;

        n = src.w;
        while (n-- > 0)
            *out++ = *in++;

        n = right;
        while (n-- > 0)
            *out++ = _v;
    }

    // rows below the data
    for (int y = 0; y < bottom; y++)
    {
        int n = dst.w;
        while (n-- > 0)
            *out++ = _v;
    }
}

// Copy src into dst with edge-replicated borders.
// Blobs are walked as int64_t words (one word per pack8 int8 element).
// Left/right borders repeat the first/last word of the row being emitted;
// top/bottom borders repeat the first/last source row.
static void padding_replicate_pack8_int8_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
{
    const int srcw = src.w;
    const int srch = src.h;

    const int64_t* in = src;
    int64_t* out = dst;

    // top border: emit the first source row `top` times
    for (int row = top; row > 0; --row)
    {
        const int64_t* line = in;

        for (int n = left; n > 0; --n)
            *out++ = line[0];

        for (int n = srcw; n > 0; --n)
            *out++ = *line++;

        // line is now one past the row end, so line[-1] is the last word
        for (int n = right; n > 0; --n)
            *out++ = line[-1];
    }

    // image rows; `in` advances through the whole source here
    for (int row = srch; row > 0; --row)
    {
        for (int n = left; n > 0; --n)
            *out++ = in[0];

        for (int n = srcw; n > 0; --n)
            *out++ = *in++;

        for (int n = right; n > 0; --n)
            *out++ = in[-1];
    }

    // bottom border: step back to the start of the last source row and
    // emit it `bottom` times
    in -= srcw;
    for (int row = bottom; row > 0; --row)
    {
        const int64_t* line = in;

        for (int n = left; n > 0; --n)
            *out++ = line[0];

        for (int n = srcw; n > 0; --n)
            *out++ = *line++;

        for (int n = right; n > 0; --n)
            *out++ = line[-1];
    }
}

// Copy src into dst with mirrored (reflect) borders that do not repeat the
// edge element itself.
// Blobs are walked as int64_t words (one word per pack8 int8 element).
// For a row starting at `line`, the left border is line[left], ..., line[1];
// after the row has been copied the right border is read backwards starting
// two words before the end-of-row position.
static void padding_reflect_pack8_int8_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
{
    const int srcw = src.w;
    const int srch = src.h;

    const int64_t* in = src;
    int64_t* out = dst;

    // top border: start at source row `top` and walk upwards to row 1
    in += top * srcw;
    for (int row = top; row > 0; --row)
    {
        const int64_t* line = in;

        for (int k = left; k > 0; --k)
            *out++ = line[k];

        for (int n = srcw; n > 0; --n)
            *out++ = *line++;

        for (int k = 0; k < right; ++k)
            *out++ = *(line - 2 - k);

        in -= srcw;
    }

    // image rows; after the top loop `in` is back at source row 0
    for (int row = srch; row > 0; --row)
    {
        for (int k = left; k > 0; --k)
            *out++ = in[k];

        for (int n = srcw; n > 0; --n)
            *out++ = *in++;

        for (int k = 0; k < right; ++k)
            *out++ = *(in - 2 - k);
    }

    // bottom border: `in` is one past the last row, so two rows back is the
    // second-to-last source row; walk upwards from there
    in -= 2 * srcw;
    for (int row = bottom; row > 0; --row)
    {
        const int64_t* line = in;

        for (int k = left; k > 0; --k)
            *out++ = line[k];

        for (int n = srcw; n > 0; --n)
            *out++ = *line++;

        for (int k = 0; k < right; ++k)
            *out++ = *(line - 2 - k);

        in -= srcw;
    }
}


================================================
FILE: src/layer/loongarch/pooling_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "pooling_loongarch.h"

#include <float.h>

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

// Constructor: sets support_packing when the build has LSX enabled
// (__loongarch_sx); without LSX the flag is left as the base class set it.
Pooling_loongarch::Pooling_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

// Adjusts the layer's capability flags once its parameters are known.
// For adaptive pooling every packing / reduced-precision storage flag is
// cleared (forward() hands adaptive pooling to the base Pooling::forward).
// The Option argument is unused. Always returns 0.
int Pooling_loongarch::create_pipeline(const Option& /*opt*/)
{
    if (!adaptive_pooling)
        return 0;

    support_packing = false;

    support_bf16_storage = false;
    support_fp16_storage = false;
    support_int8_storage = false;
    support_tensor_storage = false;

    return 0;
}

// Max / average pooling with an LSX fast path for pack4 float blobs.
//
// - adaptive_pooling: delegated to Pooling::forward.
// - elempack == 4 (LSX builds only): handled here, either as global pooling
//   (one packed value per channel) or as windowed pooling over the blob
//   produced by make_padding().
// - anything else: delegated to Pooling::forward.
//
// Returns 0 on success and -100 when an output or padded blob comes back
// empty after creation.
int Pooling_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    if (adaptive_pooling)
    {
        return Pooling::forward(bottom_blob, top_blob, opt);
    }

    // max value in NxN window
    // avg value in NxN window

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

#if __loongarch_sx
    //     NCNN_LOGE("Pooling     input %d x %d  pad = %d %d %d %d  ksize=%d %d  stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h);

    if (elempack == 4)
    {
        if (global_pooling)
        {
            // one packed (4-float) result per channel
            top_blob.create(channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            // number of packed elements per channel
            int size = w * h;

            if (pooling_type == PoolMethod_MAX)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = bottom_blob.channel(q);

                    // seed with the first element; the loop below revisits it,
                    // which is harmless for max
                    __m128 _max = (__m128)__lsx_vld(ptr, 0);
                    for (int i = 0; i < size; i++)
                    {
                        __m128 _val = (__m128)__lsx_vld(ptr, 0);
                        _max = __lsx_vfmax_s(_max, _val);
                        ptr += 4;
                    }

                    float* outptr = top_blob;
                    __lsx_vst(_max, outptr + q * 4, 0);
                }
            }
            else if (pooling_type == PoolMethod_AVE)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const float* ptr = bottom_blob.channel(q);

                    // all-zero bit pattern == 0.0f in every lane
                    __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);
                    for (int i = 0; i < size; i++)
                    {
                        __m128 _val = (__m128)__lsx_vld(ptr, 0);
                        _sum = __lsx_vfadd_s(_sum, _val);
                        ptr += 4;
                    }

                    __m128 _avg = __lsx_vfmul_s(_sum, __lsx_vreplfr2vr_s(1.f / size));

                    float* outptr = top_blob;
                    __lsx_vst(_avg, outptr + q * 4, 0);
                }
            }

            return 0;
        }

        // windowed pooling operates on the blob returned by make_padding()
        // (declared outside this file; presumably applies pad_* / pad_mode — confirm in pooling.cpp)
        Mat bottom_blob_bordered;
        make_padding(bottom_blob, bottom_blob_bordered, opt);
        if (bottom_blob_bordered.empty())
            return -100;

        // from here on w/h refer to the padded blob
        w = bottom_blob_bordered.w;
        h = bottom_blob_bordered.h;

        int outw = (w - kernel_w) / stride_w + 1;
        int outh = (h - kernel_h) / stride_h + 1;

        top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int maxk = kernel_w * kernel_h;

        // kernel offsets: space_ofs[k] is the distance, in packed elements,
        // from a window's top-left element to its k-th tap (row-major)
        std::vector<int> _space_ofs(maxk);
        int* space_ofs = &_space_ofs[0];
        {
            int p1 = 0;
            int p2 = 0;
            // elements to skip to get from the end of one kernel row to the
            // start of the next
            int gap = w - kernel_w;
            for (int i = 0; i < kernel_h; i++)
            {
                for (int j = 0; j < kernel_w; j++)
                {
                    space_ofs[p1] = p2;
                    p1++;
                    p2++;
                }
                p2 += gap;
            }
        }

        if (pooling_type == PoolMethod_MAX)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const Mat m = bottom_blob_bordered.channel(q);
                float* outptr = top_blob.channel(q);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        // top-left element of this output's window
                        const float* sptr = m.row(i * stride_h) + j * stride_w * 4;

                        __m128 _max = (__m128)__lsx_vld(sptr, 0);

                        for (int k = 0; k < maxk; k++)
                        {
                            __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0);
                            _max = __lsx_vfmax_s(_max, _val);
                        }

                        __lsx_vst(_max, outptr + j * 4, 0);
                    }

                    outptr += outw * 4;
                }
            }
        }
        else if (pooling_type == PoolMethod_AVE)
        {
            if (avgpool_count_include_pad == 0)
            {
                // average over in-image taps only: taps that fall in the
                // padded border are skipped and not counted in `area`
                int wtailpad = 0;
                int htailpad = 0;

                if (pad_mode == 0) // full padding
                {
                    // extra right/bottom columns/rows present in the padded
                    // blob beyond the explicit pad_* amounts
                    wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right;
                    htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom;
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    float* outptr = top_blob.channel(q);

                    for (int i = 0; i < outh; i++)
                    {
                        int sy0 = i * stride_h;

                        for (int j = 0; j < outw; j++)
                        {
                            int sx0 = j * stride_w;

                            __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);
                            int area = 0;

                            for (int ki = 0; ki < kernel_h; ki++)
                            {
                                int sy = sy0 + ki;

                                // row is inside the top padding
                                if (sy < pad_top)
                                    continue;

                                // row is inside the bottom padding; so are
                                // all following rows
                                if (sy >= h - pad_bottom - htailpad)
                                    break;

                                for (int kj = 0; kj < kernel_w; kj++)
                                {
                                    int sx = sx0 + kj;

                                    // column is inside the left padding
                                    if (sx < pad_left)
                                        continue;

                                    // column is inside the right padding; so
                                    // are all following columns
                                    if (sx >= w - pad_right - wtailpad)
                                        break;

                                    __m128 _val = (__m128)__lsx_vld(m.row(sy) + sx * 4, 0);
                                    _sum = __lsx_vfadd_s(_sum, _val);
                                    area += 1;
                                }
                            }

                            // NOTE(review): area stays 0 if every tap of a
                            // window lies in padding; 1.f / area is then not finite
                            __m128 _avg = __lsx_vfmul_s(_sum, __lsx_vreplfr2vr_s(1.f / area));
                            __lsx_vst(_avg, outptr + j * 4, 0);
                        }

                        outptr += outw * 4;
                    }
                }
            }
            else // if (avgpool_count_include_pad == 1)
            {
                // every tap counts, so the divisor is always the full kernel size
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    float* outptr = top_blob.channel(q);

                    const float inv_maxk = 1.f / maxk;

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            const float* sptr = m.row(i * stride_h) + j * stride_w * 4;

                            __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0);

                            for (int k = 0; k < maxk; k++)
                            {
                                __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0);
                                _sum = __lsx_vfadd_s(_sum, _val);
                            }

                            __m128 _avg = __lsx_vfmul_s(_sum, __lsx_vreplfr2vr_s(inv_maxk));
                            __lsx_vst(_avg, outptr + j * 4, 0);
                        }

                        outptr += outw * 4;
                    }
                }
            }
        }

        return 0;
    }
#endif // __loongarch_sx

    // non-pack4 input (or a build without LSX): use the base implementation
    return Pooling::forward(bottom_blob, top_blob, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/pooling_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_POOLING_LOONGARCH_H
#define LAYER_POOLING_LOONGARCH_H

#include "pooling.h"

namespace ncnn {

// LoongArch specialization of the Pooling layer.
// Overrides pipeline creation and the forward pass; the implementations
// live in pooling_loongarch.cpp.
class Pooling_loongarch : public Pooling
{
public:
    Pooling_loongarch();

    // Called with the runtime options before inference; returns 0 on success.
    virtual int create_pipeline(const Option& opt);
    // Pools bottom_blob into top_blob; returns 0 on success, non-zero on failure.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_POOLING_LOONGARCH_H


================================================
FILE: src/layer/loongarch/prelu_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "prelu_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

// Constructor: sets support_packing when the build has LSX enabled
// (__loongarch_sx); without LSX the flag is left as the base class set it.
PReLU_loongarch::PReLU_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

// In-place PReLU: negative inputs are multiplied by a slope, other inputs
// are left unchanged.
//
// The slope is slope_data[0] when num_slope <= 1; otherwise it is selected
// per element (dims == 1), per row (dims == 2) or per channel (dims == 3).
// With LSX the bulk of the data is processed four floats at a time and a
// scalar loop handles whatever is left. Always returns 0.
//
// Note: the vector paths select x * slope where x <= 0, the scalar paths
// where x < 0; the two differ only for x == 0, where x * slope is a zero
// for any finite slope.
int PReLU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int dims = bottom_top_blob.dims;
    int elempack = bottom_top_blob.elempack;

    if (dims == 1)
    {
        // total number of floats, regardless of packing
        int w = bottom_top_blob.w * elempack;

#if __loongarch_sx
        // nn_w groups of 4 floats go through the vector loop; the scalar
        // loop starts at remain_w_start
        int nn_w = w / 4;
        int remain_w_start = nn_w * 4;
#else
        int remain_w_start = 0;
#endif // __loongarch_sx

        float* ptr = bottom_top_blob;

        if (num_slope > 1)
        {
            // one slope per element, indexed like the data
            const float* slope = slope_data;

#if __loongarch_sx
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < nn_w; i++)
            {
                float* ptr0 = ptr + i * 4;

                __m128 _p = (__m128)__lsx_vld(ptr0, 0);
                __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
                __m128 _slope = (__m128)__lsx_vld(slope + i * 4, 0);
                // lanes with p <= 0 take p * slope, the others keep p
                __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero);
                __m128 _ps = __lsx_vfmul_s(_p, _slope);
                _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask);
                __lsx_vst(_p, ptr0, 0);
            }
#endif // __loongarch_sx

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = remain_w_start; i < w; i++)
            {
                float v = ptr[i];
                if (v < 0.f)
                    ptr[i] = v * slope[i];
            }
        }
        else
        {
            // single shared slope
            const float slope = slope_data[0];

#if __loongarch_sx
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < nn_w; i++)
            {
                float* ptr0 = ptr + i * 4;

                __m128 _p = (__m128)__lsx_vld(ptr0, 0);
                __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
                __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope);
                // lanes with p <= 0 take p * slope, the others keep p
                __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero);
                __m128 _ps = __lsx_vfmul_s(_p, _slope);
                _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask);
                __lsx_vst(_p, ptr0, 0);
            }
#endif // __loongarch_sx

            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = remain_w_start; i < w; i++)
            {
                float v = ptr[i];
                if (v < 0.f)
                    ptr[i] = v * slope;
            }
        }
    }

    if (dims == 2)
    {
        // floats per row, regardless of packing
        int w = bottom_top_blob.w * elempack;
        int h = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            float* ptr = bottom_top_blob.row(i);

            // scalar slope for this row (used by the scalar tail and, when
            // not pack4 with per-row slopes, broadcast for the vector loop)
            const float slope = num_slope > 1 ? slope_data[i] : slope_data[0];

            int j = 0;
#if __loongarch_sx
            __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
            // a pack4 row interleaves 4 logical rows, so it needs the 4
            // consecutive slopes starting at i * 4
            __m128 _slope = (elempack == 4 && num_slope > 1) ? (__m128)__lsx_vld((const float*)slope_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(slope);

            for (; j + 3 < w; j += 4)
            {
                __builtin_prefetch(ptr + 16);
                __m128 _p = (__m128)__lsx_vld(ptr, 0);
                // lanes with p <= 0 take p * slope, the others keep p
                __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero);
                __m128 _ps = __lsx_vfmul_s(_p, _slope);
                _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask);
                __lsx_vst(_p, ptr, 0);

                ptr += 4;
            }
#endif // __loongarch_sx
            // scalar tail (the whole row when LSX is unavailable)
            for (; j < w; j++)
            {
                float v = *ptr;
                if (v < 0.f)
                    *ptr = v * slope;

                ptr++;
            }
        }
    }

    if (dims == 3)
    {
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;
        int channels = bottom_top_blob.c;
        // floats per channel, regardless of packing
        int size = w * h * elempack;

        const float* slope_data_ptr = slope_data;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
            // scalar slope for this channel (used by the scalar tail and,
            // when not pack4 with per-channel slopes, broadcast for the vector loop)
            float slope = num_slope > 1 ? slope_data_ptr[q] : slope_data_ptr[0];

            int i = 0;
#if __loongarch_sx
            __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
            // a pack4 channel interleaves 4 logical channels, so it needs
            // the 4 consecutive slopes starting at q * 4
            __m128 _slope = (elempack == 4 && num_slope > 1) ? (__m128)__lsx_vld((const float*)slope_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(slope);

            for (; i + 3 < size; i += 4)
            {
                __builtin_prefetch(ptr + 16);
                __m128 _p = (__m128)__lsx_vld(ptr, 0);
                // lanes with p <= 0 take p * slope, the others keep p
                __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero);
                __m128 _ps = __lsx_vfmul_s(_p, _slope);
                _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask);
                __lsx_vst(_p, ptr, 0);

                ptr += 4;
            }
#endif // __loongarch_sx
            // scalar tail (the whole channel when LSX is unavailable)
            for (; i < size; i++)
            {
                if (*ptr < 0)
                    *ptr *= slope;

                ptr++;
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/prelu_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_PRELU_LOONGARCH_H
#define LAYER_PRELU_LOONGARCH_H

#include "prelu.h"

namespace ncnn {

// LoongArch specialization of the PReLU layer.
// Overrides the in-place forward pass; the implementation lives in
// prelu_loongarch.cpp.
class PReLU_loongarch : public PReLU
{
public:
    PReLU_loongarch();

    // Applies PReLU to bottom_top_blob in place; returns 0 on success.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_PRELU_LOONGARCH_H


================================================
FILE: src/layer/loongarch/quantize_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "quantize_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

// Constructor: sets support_packing when the build has LSX enabled
// (__loongarch_sx); without LSX the flag is left as the base class set it.
Quantize_loongarch::Quantize_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif
}

// Quantize elemcount * elempack consecutive floats to int8, layout preserved.
//
// Each value is multiplied by a scale and converted with float2int8().
// scale_data[0] is used for everything unless scale_data holds more than one
// value and elempack == 4, in which case the vector loops use the four
// scales at the start of scale_data (one per lane).
static void quantize(const float* ptr, signed char* s8ptr, const Mat& scale_data, int elemcount, int elempack)
{
    // NCNN_LOGE("quantize %d   %d %d", scale_data_size, elemcount, elempack);

    const float scale = scale_data[0];

    // floats still to be converted
    int remaining = elemcount * elempack;

#if __loongarch_sx
    const bool lane_scales = scale_data.w > 1 && elempack == 4;
    const __m128 _scale = lane_scales ? (__m128)__lsx_vld((const float*)scale_data, 0) : (__m128)__lsx_vreplfr2vr_s(scale);

    // 8 floats -> 8 int8, stored as one 64-bit word
    while (remaining >= 8)
    {
        __builtin_prefetch(ptr + 32);
        __m128 _lo = __lsx_vfmul_s((__m128)__lsx_vld(ptr, 0), _scale);
        __m128 _hi = __lsx_vfmul_s((__m128)__lsx_vld(ptr + 4, 0), _scale);
        *((int64_t*)s8ptr) = float2int8(_lo, _hi);

        ptr += 8;
        s8ptr += 8;
        remaining -= 8;
    }

    // 4 floats -> 4 int8, taken from the first four bytes of the result
    while (remaining >= 4)
    {
        __m128 _q = __lsx_vfmul_s((__m128)__lsx_vld(ptr, 0), _scale);
        v16i8 bytes = (v16i8)float2int8(_q);
        s8ptr[0] = bytes[0];
        s8ptr[1] = bytes[1];
        s8ptr[2] = bytes[2];
        s8ptr[3] = bytes[3];

        ptr += 4;
        s8ptr += 4;
        remaining -= 4;
    }
#endif // __loongarch_sx

    // scalar tail (everything when LSX is unavailable)
    while (remaining > 0)
    {
        float v = *ptr++ * scale;
        *s8ptr++ = float2int8(v);
        remaining--;
    }
}

#if __loongarch_sx
static void quantize_pack4to8(const float* ptr0, const float* ptr1, signed char* s8ptr, const Mat& scale_data, int elemcount)
{
    const int scale_data_size = scale_data.w;

    // NCNN_LOGE("quantize_pack4to8 %d   %d", scale_data_size, elemcount);

    float scale = scale_data[0];
    __m128 _scale0 = (__m128)__lsx_vreplfr2vr_s(scale);
    __m128 _scale1 = _scale0;
    if (scale_data_size > 1)
    {
        _scale0 = (__m128)__lsx_vld((const float*)scale_data, 0);
        _scale1 = (__m128)__lsx_vld((const float*)scale_data + 4, 0);
    }

    int i = 0;
    for (; i < elemcount; i++)
    {
        __m128 _v0 = (__m128)__lsx_vld(ptr0, 0);
        __m128 _v1 = (__m128)__lsx_vld(ptr1, 0);
        _v0 = __lsx_vfmul_s(_v0, _scale0);
        _v1 = __lsx_vfmul_s(_v1, _scale1);
        *((int64_t*)s8ptr) = float2int8(_v0, _v1);
        ptr0 += 4;
        ptr1 += 4;
        s8ptr += 8;
    }
}

static void quantize_pack4to1(const float* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount)
{
    const int scale_data_size = scale_data.w;

    // NCNN_LOGE("quantize_pack4to1 %d   %d", scale_data_size, elemcount);

    float scale = scale_data[0];
    __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale);
    if (scale_data_size > 1)
    {
        _scale = (__m128)__lsx_vld((const float*)scale_data, 0);
    }

    int i = 0;
    for (; i < elemcount; i++)
    {
        __m128 _v = (__m128)__lsx_vld(ptr, 0);
        _v = __lsx_vfmul_s(_v, _scale);
        v16i8 v = (v16i8)float2int8(_v);
        s8ptr0[0] = v[0];
        s8ptr1[0] = v[1];
        s8ptr2[0] = v[2];
        s8ptr3[0] = v[3];
        ptr += 4;
        s8ptr0 += 1;
        s8ptr1 += 1;
        s8ptr2 += 1;
        s8ptr3 += 1;
    }
}
#endif // __loongarch_sx

// Quantize a float blob to int8: out = float2int8(in * scale).
//
// scale_data holds either a single scale or one scale per logical
// row (dims == 2) / channel (dims == 3). The output packing is chosen here:
//   dims == 1: pack8 when packing layout is enabled and the flat length is a
//              multiple of 8, else pack1 (the data is contiguous, so the
//              same flat kernel serves every combination);
//   dims 2/3:  pack8 when packing layout is enabled, the input is pack4 and
//              the logical row/channel count is a multiple of 8, else pack1.
//
// Returns 0 on success and -100 when the output blob cannot be allocated.
int Quantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;

    if (dims == 1)
    {
        int out_elempack = 1;
#if __loongarch_sx
        if (opt.use_packing_layout)
        {
            out_elempack = w * elempack % 8 == 0 ? 8 : 1;
        }
#endif
        const int outw = w * elempack / out_elempack;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // split the vector into nn_w chunks of wp packed elements so that
        // each thread gets one contiguous range
        const int wp = std::max(1, w / opt.num_threads);
        const int nn_w = (w + wp - 1) / wp;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_w; ii++)
        {
            const int i = ii * wp;

            const float* ptr = (const float*)bottom_blob + i * elempack;
            signed char* s8ptr = (signed char*)top_blob + i * elempack;

            // assert scale_data_size == 1

            // number of floats in this chunk (the last chunk may be short)
            const int size = std::min(w - i, wp) * elempack;

            // flat conversion: input and output are both contiguous here
            quantize(ptr, s8ptr, scale_data, size, 1);
        }
    }

    if (dims == 2)
    {
        int out_elempack = 1;
#if __loongarch_sx
        // Only pack4 input can be repacked to pack8 below; there is no
        // pack1 -> pack8 kernel, so unpacked input must stay pack1 or the
        // output would be allocated but never written.
        if (opt.use_packing_layout && elempack == 4)
        {
            out_elempack = h * elempack % 8 == 0 ? 8 : 1;
        }
#endif
        const int outh = h * elempack / out_elempack;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __loongarch_sx
        if (elempack == 4 && out_elempack == 8)
        {
            // two pack4 input rows form one pack8 output row
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < outh; i++)
            {
                const float* ptr0 = bottom_blob.row(i * 2);
                const float* ptr1 = bottom_blob.row(i * 2 + 1);
                signed char* s8ptr = top_blob.row<signed char>(i);

                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * out_elempack, out_elempack) : scale_data;

                quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_i, w);
            }
        }
        if (elempack == 4 && out_elempack == 1)
        {
            // one pack4 input row is split into four unpacked output rows
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const float* ptr = bottom_blob.row(i);
                signed char* s8ptr0 = top_blob.row<signed char>(i * 4);
                signed char* s8ptr1 = top_blob.row<signed char>(i * 4 + 1);
                signed char* s8ptr2 = top_blob.row<signed char>(i * 4 + 2);
                signed char* s8ptr3 = top_blob.row<signed char>(i * 4 + 3);

                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;

                quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w);
            }
        }
#endif // __loongarch_sx
        if (elempack == out_elempack)
        {
            // same packing on both sides: row-by-row conversion
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const float* ptr = bottom_blob.row(i);
                signed char* s8ptr = top_blob.row<signed char>(i);

                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;

                quantize(ptr, s8ptr, scale_data_i, w, elempack);
            }
        }
    }

    if (dims == 3)
    {
        int out_elempack = 1;
#if __loongarch_sx
        // Only pack4 input can be repacked to pack8 below; there is no
        // pack1 -> pack8 kernel, so unpacked input must stay pack1 or the
        // output would be allocated but never written.
        if (opt.use_packing_layout && elempack == 4)
        {
            out_elempack = channels * elempack % 8 == 0 ? 8 : 1;
        }
#endif
        const int outc = channels * elempack / out_elempack;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __loongarch_sx
        if (elempack == 4 && out_elempack == 8)
        {
            // two pack4 input channels form one pack8 output channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < outc; q++)
            {
                const float* ptr0 = bottom_blob.channel(q * 2);
                const float* ptr1 = bottom_blob.channel(q * 2 + 1);
                signed char* s8ptr = top_blob.channel(q);

                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * out_elempack, out_elempack) : scale_data;

                quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_q, w * h);
            }
        }
        if (elempack == 4 && out_elempack == 1)
        {
            // one pack4 input channel is split into four unpacked output channels
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                signed char* s8ptr0 = top_blob.channel(q * 4);
                signed char* s8ptr1 = top_blob.channel(q * 4 + 1);
                signed char* s8ptr2 = top_blob.channel(q * 4 + 2);
                signed char* s8ptr3 = top_blob.channel(q * 4 + 3);

                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;

                quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h);
            }
        }
#endif // __loongarch_sx
        if (elempack == out_elempack)
        {
            // same packing on both sides: channel-by-channel conversion
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                signed char* s8ptr = top_blob.channel(q);

                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;

                quantize(ptr, s8ptr, scale_data_q, w * h, elempack);
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/quantize_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_QUANTIZE_LOONGARCH_H
#define LAYER_QUANTIZE_LOONGARCH_H

#include "quantize.h"

namespace ncnn {

// LoongArch specialization of the Quantize layer.
// Overrides the forward pass; the implementation lives in
// quantize_loongarch.cpp.
class Quantize_loongarch : public Quantize
{
public:
    Quantize_loongarch();

    // Quantizes bottom_blob into top_blob; returns 0 on success, non-zero on failure.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_QUANTIZE_LOONGARCH_H


================================================
FILE: src/layer/loongarch/relu_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "relu_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

// Constructor: sets support_packing when the build has LSX enabled
// (__loongarch_sx); without LSX the flag is left as the base class set it.
ReLU_loongarch::ReLU_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif
}

// In-place ReLU / leaky ReLU over every channel of the blob.
//
// slope == 0: negative values are replaced by zero.
// slope != 0: negative values are multiplied by slope.
// With LSX, four floats are handled per step and a scalar loop finishes the
// remainder; without LSX the scalar loop does everything. Always returns 0.
int ReLU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int elempack = bottom_top_blob.elempack;

    // floats per channel, regardless of packing
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* p = bottom_top_blob.channel(q);

        // floats of this channel still to be processed
        int remaining = size;

        if (slope == 0.f)
        {
#if __loongarch_sx
            const __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
            while (remaining >= 4)
            {
                __builtin_prefetch(p + 16);
                __m128 _x = __lsx_vfmax_s((__m128)__lsx_vld(p, 0), _zero);
                __lsx_vst(_x, p, 0);

                p += 4;
                remaining -= 4;
            }
#endif // __loongarch_sx
            while (remaining > 0)
            {
                if (*p < 0)
                    *p = 0;
                p++;
                remaining--;
            }
        }
        else
        {
#if __loongarch_sx
            const __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0);
            const __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope);
            while (remaining >= 4)
            {
                __builtin_prefetch(p + 16);
                __m128 _x = (__m128)__lsx_vld(p, 0);
                __m128 _scaled = __lsx_vfmul_s(_x, _slope);
                // lanes with x <= 0 take x * slope, the others keep x
                __m128i _lemask = __lsx_vfcmp_cle_s(_x, _zero);
                _x = (__m128)__lsx_vbitsel_v((__m128i)_x, (__m128i)_scaled, (__m128i)_lemask);
                __lsx_vst(_x, p, 0);

                p += 4;
                remaining -= 4;
            }
#endif // __loongarch_sx
            while (remaining > 0)
            {
                if (*p < 0)
                    *p *= slope;
                p++;
                remaining--;
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/relu_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_RELU_LOONGARCH_H
#define LAYER_RELU_LOONGARCH_H

#include "relu.h"

namespace ncnn {

// LoongArch specialization of the ReLU layer.
// Overrides the in-place forward pass; the implementation lives in
// relu_loongarch.cpp.
class ReLU_loongarch : public ReLU
{
public:
    ReLU_loongarch();

    // Applies ReLU / leaky ReLU to bottom_top_blob in place; returns 0 on success.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_RELU_LOONGARCH_H


================================================
FILE: src/layer/loongarch/requantize_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "requantize_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_activation.h"
#include "loongarch_usability.h"

namespace ncnn {

// Advertise packed-layout support only when the LSX SIMD extension is available.
Requantize_loongarch::Requantize_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

// Requantize int32 values to int8 with a fused ReLU.
//
//   intptr          source int32 values (elemcount * elempack of them)
//   ptr             destination int8 values (same count)
//   scale_in_data   input scale(s); element 0 is the scalar / broadcast value
//   bias_data       optional bias; w == 0 means "no bias"
//   scale_out_data  output scale(s); element 0 is the scalar / broadcast value
//   elemcount       number of packed elements to process
//   elempack        number of lanes per packed element
//
// The input and output scales are folded into one multiplier, and the bias is
// pre-multiplied by the output scale, as described by the formulas below.
// When a parameter Mat holds more than one value it is loaded per lane for
// elempack 8 (two vectors) and elempack 4 (one vector reused for both halves
// of the 8-wide loop).
static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int elemcount, int elempack)
{
    const int scale_in_data_size = scale_in_data.w;
    const int bias_data_size = bias_data.w;
    const int scale_out_data_size = scale_out_data.w;
    const int size = elemcount * elempack;

    // NCNN_LOGE("requantize_relu %d %d %d   %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount, elempack);

    // int8(relu(v * scale_in) * scale_out)
    // int8_relu(v * (scale_in * scale_out))

    // int8(relu(v * scale_in + bias) * scale_out)
    // int8_relu(v * (scale_in * scale_out) + (bias * scale_out))

    float scale_in = scale_in_data[0];
#if __loongarch_sx
    __m128 _scale_in0 = (__m128)__lsx_vreplfr2vr_s(scale_in);
    __m128 _scale_in1 = _scale_in0;
    if (scale_in_data_size > 1)
    {
        if (elempack == 8)
        {
            _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0);
            _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + 4, 0);
        }
        if (elempack == 4)
        {
            // four per-lane values; every group of four elements uses the same set
            _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0);
            _scale_in1 = _scale_in0;
        }
    }
#endif // __loongarch_sx

    float scale_out = scale_out_data[0];
#if __loongarch_sx
    __m128 _scale_out0 = (__m128)__lsx_vreplfr2vr_s(scale_out);
    __m128 _scale_out1 = _scale_out0;
    if (scale_out_data_size > 1)
    {
        if (elempack == 8)
        {
            _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0);
            _scale_out1 = (__m128)__lsx_vld((const float*)scale_out_data + 4, 0);
        }
        if (elempack == 4)
        {
            _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0);
            _scale_out1 = _scale_out0;
        }
    }
#endif // __loongarch_sx

    // fold both scales into a single multiplier
    float scale = scale_in * scale_out;
#if __loongarch_sx
    __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0);
    __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1);
#endif // __loongarch_sx

    if (bias_data_size == 0)
    {
        int i = 0;
#if __loongarch_sx
        // 8 values per iteration, written out as one 64-bit store
        for (; i + 7 < size; i += 8)
        {
            __builtin_prefetch(intptr + 32);
            __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0));
            __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0));
            _v0 = __lsx_vfmul_s(_v0, _scale0);
            _v1 = __lsx_vfmul_s(_v1, _scale1);
            *((int64_t*)ptr) = float2int8relu(_v0, _v1);
            intptr += 8;
            ptr += 8;
        }
        // 4 values per iteration; only the low four bytes of the result are stored
        for (; i + 3 < size; i += 4)
        {
            __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0));
            _v = __lsx_vfmul_s(_v, _scale0);
            v16i8 v = (v16i8)float2int8relu(_v);
            ptr[0] = v[0];
            ptr[1] = v[1];
            ptr[2] = v[2];
            ptr[3] = v[3];
            intptr += 4;
            ptr += 4;
        }
#endif // __loongarch_sx
        // scalar remainder
        for (; i < size; i++)
        {
            float v = *intptr * scale;
            if (v < 0) v = 0;
            *ptr = float2int8(v);
            intptr++;
            ptr++;
        }
    }
    else
    {
        float bias = bias_data[0];
#if __loongarch_sx
        __m128 _bias0 = (__m128)__lsx_vreplfr2vr_s(bias);
        __m128 _bias1 = _bias0;
        if (bias_data_size > 1)
        {
            if (elempack == 8)
            {
                _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0);
                _bias1 = (__m128)__lsx_vld((const float*)bias_data + 4, 0);
            }
            if (elempack == 4)
            {
                _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0);
                _bias1 = _bias0;
            }
        }
#endif // __loongarch_sx

        // pre-scale the bias so it can be added after the folded multiply
        bias = bias * scale_out;
#if __loongarch_sx
        _bias0 = __lsx_vfmul_s(_bias0, _scale_out0);
        _bias1 = __lsx_vfmul_s(_bias1, _scale_out1);
#endif // __loongarch_sx

        int i = 0;
#if __loongarch_sx
        for (; i + 7 < size; i += 8)
        {
            __builtin_prefetch(intptr + 32);
            __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0));
            __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0));
            _v0 = __lsx_vfmadd_s(_v0, _scale0, _bias0);
            _v1 = __lsx_vfmadd_s(_v1, _scale1, _bias1);
            *((int64_t*)ptr) = float2int8relu(_v0, _v1);
            intptr += 8;
            ptr += 8;
        }
        for (; i + 3 < size; i += 4)
        {
            __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0));
            _v = __lsx_vfmadd_s(_v, _scale0, _bias0);
            v16i8 v = (v16i8)float2int8relu(_v);
            ptr[0] = v[0];
            ptr[1] = v[1];
            ptr[2] = v[2];
            ptr[3] = v[3];
            intptr += 4;
            ptr += 4;
        }
#endif // __loongarch_sx
        for (; i < size; i++)
        {
            float v = *intptr * scale + bias;
            if (v < 0) v = 0;
            *ptr = float2int8(v);
            intptr++;
            ptr++;
        }
    }
}

// Requantize int32 values to int8 with a fused leaky ReLU.
//
//   intptr          source int32 values (elemcount * elempack of them)
//   ptr             destination int8 values (same count)
//   scale_in_data   input scale(s); element 0 is the scalar / broadcast value
//   bias_data       optional bias; w == 0 means "no bias"
//   scale_out_data  output scale(s); element 0 is the scalar / broadcast value
//   slope           multiplier applied to negative values
//   elemcount       number of packed elements to process
//   elempack        number of lanes per packed element
//
// The input and output scales are folded into one multiplier, and the bias is
// pre-multiplied by the output scale, as described by the formulas below.
// When a parameter Mat holds more than one value it is loaded per lane for
// elempack 8 (two vectors) and elempack 4 (one vector reused for both halves
// of the 8-wide loop).
static void requantize_leakyrelu(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, float slope, int elemcount, int elempack)
{
    const int scale_in_data_size = scale_in_data.w;
    const int bias_data_size = bias_data.w;
    const int scale_out_data_size = scale_out_data.w;
    const int size = elemcount * elempack;

    // NCNN_LOGE("requantize_leakyrelu %d %d %d   %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount, elempack);

    // int8(leakyrelu(v * scale_in, slope) * scale_out)
    // int8_leakyrelu(v * (scale_in * scale_out), slope)

    // int8(leakyrelu(v * scale_in + bias, slope) * scale_out)
    // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope)

    float scale_in = scale_in_data[0];
#if __loongarch_sx
    __m128 _scale_in0 = (__m128)__lsx_vreplfr2vr_s(scale_in);
    __m128 _scale_in1 = _scale_in0;
    if (scale_in_data_size > 1)
    {
        if (elempack == 8)
        {
            _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0);
            _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + 4, 0);
        }
        if (elempack == 4)
        {
            // four per-lane values; every group of four elements uses the same set
            _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0);
            _scale_in1 = _scale_in0;
        }
    }
#endif // __loongarch_sx

    float scale_out = scale_out_data[0];
#if __loongarch_sx
    __m128 _scale_out0 = (__m128)__lsx_vreplfr2vr_s(scale_out);
    __m128 _scale_out1 = _scale_out0;
    if (scale_out_data_size > 1)
    {
        if (elempack == 8)
        {
            _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0);
            _scale_out1 = (__m128)__lsx_vld((const float*)scale_out_data + 4, 0);
        }
        if (elempack == 4)
        {
            _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0);
            _scale_out1 = _scale_out0;
        }
    }
#endif // __loongarch_sx

    // fold both scales into a single multiplier
    float scale = scale_in * scale_out;
#if __loongarch_sx
    __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0);
    __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1);
    __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope);
#endif // __loongarch_sx

    if (bias_data_size == 0)
    {
        int i = 0;
#if __loongarch_sx
        // 8 values per iteration, written out as one 64-bit store
        for (; i + 7 < size; i += 8)
        {
            __builtin_prefetch(intptr + 32);
            __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0));
            __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0));
            _v0 = __lsx_vfmul_s(_v0, _scale0);
            _v1 = __lsx_vfmul_s(_v1, _scale1);
            *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope);
            intptr += 8;
            ptr += 8;
        }
        // 4 values per iteration; only the low four bytes of the result are stored
        for (; i + 3 < size; i += 4)
        {
            __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0));
            _v = __lsx_vfmul_s(_v, _scale0);
            v16i8 v = (v16i8)float2int8leakyrelu(_v, _slope);
            ptr[0] = v[0];
            ptr[1] = v[1];
            ptr[2] = v[2];
            ptr[3] = v[3];
            intptr += 4;
            ptr += 4;
        }
#endif // __loongarch_sx
        // scalar remainder
        for (; i < size; i++)
        {
            float v = *intptr * scale;
            if (v < 0) v *= slope;
            *ptr = float2int8(v);
            intptr++;
            ptr++;
        }
    }
    else
    {
        float bias = bias_data[0];
#if __loongarch_sx
        __m128 _bias0 = (__m128)__lsx_vreplfr2vr_s(bias);
        __m128 _bias1 = _bias0;
        if (bias_data_size > 1)
        {
            if (elempack == 8)
            {
                _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0);
                _bias1 = (__m128)__lsx_vld((const float*)bias_data + 4, 0);
            }
            if (elempack == 4)
            {
                _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0);
                _bias1 = _bias0;
            }
        }
#endif // __loongarch_sx

        // pre-scale the bias so it can be added after the folded multiply
        bias = bias * scale_out;
#if __loongarch_sx
        _bias0 = __lsx_vfmul_s(_bias0, _scale_out0);
        _bias1 = __lsx_vfmul_s(_bias1, _scale_out1);
#endif // __loongarch_sx

        int i = 0;
#if __loongarch_sx
        for (; i + 7 < size; i += 8)
        {
            __builtin_prefetch(intptr + 32);
            __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0));
            __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0));
            _v0 = __lsx_vfmadd_s(_v0, _scale0, _bias0);
            _v1 = __lsx_vfmadd_s(_v1, _scale1, _bias1);
            *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope);
            intptr += 8;
            ptr += 8;
        }
        for (; i + 3 < size; i += 4)
        {
            __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0));
            _v = __lsx_vfmadd_s(_v, _scale0, _bias0);
            v16i8 v = (v16i8)float2int8leakyrelu(_v, _slope);
            ptr[0] = v[0];
            ptr[1] = v[1];
            ptr[2] = v[2];
            ptr[3] = v[3];
            intptr += 4;
            ptr += 4;
        }
#endif // __loongarch_sx
        for (; i < size; i++)
        {
            float v = *intptr * scale + bias;
            if (v < 0) v *= slope;
            *ptr = float2int8(v);
            intptr++;
            ptr++;
        }
    }
}

// Requantize int32 values to int8 with an arbitrary fused activation.
//
//   intptr             source int32 values (elemcount * elempack of them)
//   ptr                destination int8 values (same count)
//   scale_in_data      input scale(s); element 0 is the scalar / broadcast value
//   bias_data          optional bias; w == 0 means "no bias"
//   scale_out_data     output scale(s); element 0 is the scalar / broadcast value
//   activation_type    activation selector; 1 and 2 (with a positive first
//                      parameter) are dispatched to the specialised
//                      relu / leakyrelu routines
//   activation_params  parameters forwarded to the activation helpers
//   elemcount          number of packed elements to process
//   elempack           number of lanes per packed element
//
// In the generic path each value is computed as
//   int8(activation(v * scale_in [+ bias]) * scale_out)
// When a parameter Mat holds more than one value it is loaded per lane for
// elempack 8 (two vectors) and elempack 4 (one vector reused for both halves
// of the 8-wide loop).
static void requantize(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount, int elempack)
{
    if (activation_type == 1)
    {
        requantize_relu(intptr, ptr, scale_in_data, bias_data, scale_out_data, elemcount, elempack);
        return;
    }

    // the fused leakyrelu routine is only used for a positive slope
    if (activation_type == 2 && activation_params[0] > 0.f)
    {
        const float slope = activation_params[0];
        requantize_leakyrelu(intptr, ptr, scale_in_data, bias_data, scale_out_data, slope, elemcount, elempack);
        return;
    }

    const int scale_in_data_size = scale_in_data.w;
    const int bias_data_size = bias_data.w;
    const int scale_out_data_size = scale_out_data.w;
    const int size = elemcount * elempack;

    // NCNN_LOGE("requantize %d %d %d   %d %d", scale_in_data_size, bias_data_size, scale_out_data_size, elemcount, elempack);

    float scale_in = scale_in_data[0];
#if __loongarch_sx
    __m128 _scale_in0 = (__m128)__lsx_vreplfr2vr_s(scale_in);
    __m128 _scale_in1 = _scale_in0;
    if (scale_in_data_size > 1)
    {
        if (elempack == 8)
        {
            _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0);
            _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + 4, 0);
        }
        if (elempack == 4)
        {
            // four per-lane values; every group of four elements uses the same set
            _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data, 0);
            _scale_in1 = _scale_in0;
        }
    }
#endif // __loongarch_sx

    float scale_out = scale_out_data[0];
#if __loongarch_sx
    __m128 _scale_out0 = (__m128)__lsx_vreplfr2vr_s(scale_out);
    __m128 _scale_out1 = _scale_out0;
    if (scale_out_data_size > 1)
    {
        if (elempack == 8)
        {
            _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0);
            _scale_out1 = (__m128)__lsx_vld((const float*)scale_out_data + 4, 0);
        }
        if (elempack == 4)
        {
            _scale_out0 = (__m128)__lsx_vld((const float*)scale_out_data, 0);
            _scale_out1 = _scale_out0;
        }
    }
#endif // __loongarch_sx

    if (bias_data_size == 0)
    {
        int i = 0;
#if __loongarch_sx
        // 8 values per iteration, written out as one 64-bit store
        for (; i + 7 < size; i += 8)
        {
            __builtin_prefetch(intptr + 32);
            __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0));
            __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0));
            _v0 = __lsx_vfmul_s(_v0, _scale_in0);
            _v1 = __lsx_vfmul_s(_v1, _scale_in1);
            _v0 = activation_ps(_v0, activation_type, activation_params);
            _v1 = activation_ps(_v1, activation_type, activation_params);
            _v0 = __lsx_vfmul_s(_v0, _scale_out0);
            _v1 = __lsx_vfmul_s(_v1, _scale_out1);
            *((int64_t*)ptr) = float2int8(_v0, _v1);
            intptr += 8;
            ptr += 8;
        }
        // 4 values per iteration; only the low four bytes of the result are stored
        for (; i + 3 < size; i += 4)
        {
            __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0));
            _v = __lsx_vfmul_s(_v, _scale_in0);
            _v = activation_ps(_v, activation_type, activation_params);
            _v = __lsx_vfmul_s(_v, _scale_out0);
            v16i8 v = (v16i8)float2int8(_v);
            ptr[0] = v[0];
            ptr[1] = v[1];
            ptr[2] = v[2];
            ptr[3] = v[3];
            intptr += 4;
            ptr += 4;
        }
#endif // __loongarch_sx
        // scalar remainder
        for (; i < size; i++)
        {
            float v = *intptr * scale_in;
            v = activation_ss(v, activation_type, activation_params);
            *ptr = float2int8(v * scale_out);
            intptr++;
            ptr++;
        }
    }
    else
    {
        float bias = bias_data[0];
#if __loongarch_sx
        __m128 _bias0 = (__m128)__lsx_vreplfr2vr_s(bias);
        __m128 _bias1 = _bias0;
        if (bias_data_size > 1)
        {
            if (elempack == 8)
            {
                _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0);
                _bias1 = (__m128)__lsx_vld((const float*)bias_data + 4, 0);
            }
            if (elempack == 4)
            {
                _bias0 = (__m128)__lsx_vld((const float*)bias_data, 0);
                _bias1 = _bias0;
            }
        }
#endif // __loongarch_sx

        int i = 0;
#if __loongarch_sx
        for (; i + 7 < size; i += 8)
        {
            __builtin_prefetch(intptr + 32);
            __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0));
            __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0));
            _v0 = __lsx_vfmadd_s(_v0, _scale_in0, _bias0);
            _v1 = __lsx_vfmadd_s(_v1, _scale_in1, _bias1);
            _v0 = activation_ps(_v0, activation_type, activation_params);
            _v1 = activation_ps(_v1, activation_type, activation_params);
            _v0 = __lsx_vfmul_s(_v0, _scale_out0);
            _v1 = __lsx_vfmul_s(_v1, _scale_out1);
            *((int64_t*)ptr) = float2int8(_v0, _v1);
            intptr += 8;
            ptr += 8;
        }
        for (; i + 3 < size; i += 4)
        {
            __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0));
            _v = __lsx_vfmadd_s(_v, _scale_in0, _bias0);
            _v = activation_ps(_v, activation_type, activation_params);
            _v = __lsx_vfmul_s(_v, _scale_out0);
            v16i8 v = (v16i8)float2int8(_v);
            ptr[0] = v[0];
            ptr[1] = v[1];
            ptr[2] = v[2];
            ptr[3] = v[3];
            intptr += 4;
            ptr += 4;
        }
#endif // __loongarch_sx
        for (; i < size; i++)
        {
            float v = *intptr * scale_in + bias;
            v = activation_ss(v, activation_type, activation_params);
            *ptr = float2int8(v * scale_out);
            intptr++;
            ptr++;
        }
    }
}

int Requantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;
    const size_t out_elemsize = elempack * 1u;

    if (dims == 1)
    {
        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int wp = std::max(1, w / opt.num_threads);
        const int nn_w = (w + wp - 1) / wp;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_w; ii++)
        {
            const int i = ii * wp;

            const int* intptr = (const int*)bottom_blob + i * elempack;
            signed char* ptr = (signed char*)top_blob + i * elempack;

            // assert scale_in_data_size == 1
            // assert bias_data_size == 0 || bias_data_size == 1
            // assert scale_out_data_size == 1

            const int size = std::min(w - i, wp) * elempack;

            requantize(intptr, ptr, scale_in_data, bias_data, scale_out_data, activation_type, activation_params, size, 1);
        }
    }

    if (dims == 2)
    {
        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            const int* intptr = bottom_blob.row<const int>(i);
            signed char* ptr = top_blob.row<signed char>(i);

            const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data;
            const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;
            const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data;

            requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack);
        }
    }

    if (dims == 3)
    {
        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const int* intptr = bottom_blob.channel(q);
            signed char* ptr = top_blob.channel(q);

            const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data;
            const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;
            const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data;

            requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h, elempack);
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/requantize_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_REQUANTIZE_LOONGARCH_H
#define LAYER_REQUANTIZE_LOONGARCH_H

#include "requantize.h"

namespace ncnn {

// LoongArch variant of the Requantize layer.
// Derives from the generic Requantize layer and overrides the forward pass.
class Requantize_loongarch : public Requantize
{
public:
    Requantize_loongarch();

    // Reads bottom_blob and writes the result into top_blob; returns 0 on success.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_REQUANTIZE_LOONGARCH_H


================================================
FILE: src/layer/loongarch/sigmoid_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "sigmoid_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#include "lsx_mathfun.h"
#if __loongarch_asx
#include <lasxintrin.h>
#include "lasx_mathfun.h"
#endif // __loongarch_asx
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

// Advertise packed-layout support only when the LSX SIMD extension is available.
Sigmoid_loongarch::Sigmoid_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

// Apply sigmoid, 1 / (1 + exp(-x)), to every float of every channel in place.
// Uses 8-wide LASX and 4-wide LSX loops when available, then a scalar loop
// for whatever remains. Always returns 0.
int Sigmoid_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int elempack = bottom_top_blob.elempack;

    // number of floats in one channel
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        int i = 0;
#if __loongarch_sx
#if __loongarch_asx
        __m256 _one256 = (__m256)__lasx_xvreplfr2vr_s(1.f);
        for (; i + 7 < size; i += 8)
        {
            __builtin_prefetch(ptr + 32);
            __m256 _x = (__m256)__lasx_xvld(ptr, 0);
            // flip bit 31 (the sign bit) to get -x
            _x = (__m256)__lasx_xvbitrevi_w((__m256i)_x, 31);
            _x = exp256_ps(_x);
            _x = __lasx_xvfadd_s(_x, _one256);
            __m256 _y = __lasx_xvfdiv_s(_one256, _x);
            __lasx_xvst(_y, ptr, 0);

            ptr += 8;
        }
#endif // __loongarch_asx
        __m128 _one128 = (__m128)__lsx_vreplfr2vr_s(1.f);
        for (; i + 3 < size; i += 4)
        {
            __builtin_prefetch(ptr + 16);
            __m128 _x = (__m128)__lsx_vld(ptr, 0);
            // flip bit 31 (the sign bit) to get -x
            _x = (__m128)__lsx_vbitrevi_w((__m128i)_x, 31);
            _x = exp_ps(_x);
            _x = __lsx_vfadd_s(_x, _one128);
            __m128 _y = __lsx_vfdiv_s(_one128, _x);
            __lsx_vst(_y, ptr, 0);

            ptr += 4;
        }
#endif // __loongarch_sx
        // scalar remainder
        while (i < size)
        {
            const float x = *ptr;
            *ptr = 1.f / (1.f + expf(-x));

            ptr++;
            i++;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/sigmoid_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_SIGMOID_LOONGARCH_H
#define LAYER_SIGMOID_LOONGARCH_H

#include "sigmoid.h"

namespace ncnn {

// LoongArch variant of the Sigmoid layer.
// Derives from the generic Sigmoid layer and overrides the in-place forward pass.
class Sigmoid_loongarch : public Sigmoid
{
public:
    Sigmoid_loongarch();

    // Applies sigmoid to bottom_top_blob in place; returns 0 on success.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_SIGMOID_LOONGARCH_H


================================================
FILE: src/layer/loongarch/slice_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "slice_loongarch.h"

namespace ncnn {

// Advertise packed-layout support only when the LSX SIMD extension is available.
Slice_loongarch::Slice_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

int Slice_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;
    const int* slices_ptr = slices;
    const int* indices_ptr = indices;
    int positive_axis = axis < 0 ? dims + axis : axis;

    if (dims == 1) // positive_axis == 0
    {
        // slice vector
        int w = bottom_blob.w * elempack;
        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = w - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? w + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((w - q) / (top_blobs.size() - i));
                }
            }

            int out_elempack = 1;
#if __loongarch_sx
            if (opt.use_packing_layout)
                out_elempack = slice % 4 == 0 ? 4 : 1;
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            Mat& top_blob = top_blobs[i];
            top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            const float* ptr = (const float*)bottom_blob + q;
            float* outptr = top_blob;
            memcpy(outptr, ptr, top_blob.w * top_blob.elemsize);

            q += slice;
        }
    }

    if (dims == 2 && positive_axis == 0)
    {
        // slice image height
        int w = bottom_blob.w;
        int h = bottom_blob.h * elempack;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = h - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? h + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((h - q) / (top_blobs.size() - i));
                }
            }

            int out_elempack = 1;
#if __loongarch_sx
            if (opt.use_packing_layout)
                out_elempack = slice % 4 == 0 ? 4 : 1;
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            Mat& top_blob = top_blobs[i];
            top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            q += slice;
        }

        size_t out_elemsize = top_blobs[0].elemsize;
        int out_elempack = top_blobs[0].elempack;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize);
            out_elempack = std::min(out_elempack, top_blobs[i].elempack);
        }

        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack > out_elempack)
        {
            convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt);
            if (bottom_blob_unpacked.empty())
                return -100;
        }

        const float* ptr = bottom_blob_unpacked;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            Mat& top_blob = top_blobs[i];

            if (out_elempack == 1 && top_blob.elempack == 4)
            {
                for (int j = 0; j < top_blob.h; j++)
                {
                    const float* r0 = ptr;
                    const float* r1 = ptr + w;
                    const float* r2 = ptr + w * 2;
                    const float* r3 = ptr + w * 3;

                    float* outptr0 = top_blob.row(j);

                    for (int j = 0; j < w; j++)
                    {
                        outptr0[0] = *r0++;
                        outptr0[1] = *r1++;
                        outptr0[2] = *r2++;
                        outptr0[3] = *r3++;

                        outptr0 += 4;
                    }

                    ptr += w * 4;
                }
            }
            else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4)
            {
                int size = w * top_blob.h;

                float* outptr = top_blob;
                memcpy(outptr, ptr, size * top_blob.elemsize);

                ptr += size * top_blob.elempack;
            }
        }
    }

    if (dims == 2 && positive_axis == 1)
    {
        // slice image width
        int w = bottom_blob.w;
        int h = bottom_blob.h;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = w - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? w + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((w - q) / (top_blobs.size() - i));
                }
            }

            Mat& top_blob = top_blobs[i];
            top_blob.create(slice, h, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            q += slice;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int j = 0; j < h; j++)
        {
            const float* ptr = bottom_blob.row(j);
            for (size_t i = 0; i < top_blobs.size(); i++)
            {
                Mat& top_blob = top_blobs[i];

                float* outptr = top_blob.row(j);
                memcpy(outptr, ptr, top_blob.w * elemsize);

                ptr += top_blob.w * elempack;
            }
        }
    }

    if ((dims == 3 || dims == 4) && positive_axis == 0)
    {
        // slice dim channel
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int d = bottom_blob.d;
        int channels = bottom_blob.c * elempack;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = channels - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? channels + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((channels - q) / (top_blobs.size() - i));
                }
            }

            int out_elempack = 1;
#if __loongarch_sx
            if (opt.use_packing_layout)
                out_elempack = slice % 4 == 0 ? 4 : 1;
#endif
            size_t out_elemsize = elemsize / elempack * out_elempack;

            Mat& top_blob = top_blobs[i];
            top_blob.create(w, h, d, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            top_blob.dims = dims;

            q += slice;
        }

        size_t out_elemsize = top_blobs[0].elemsize;
        int out_elempack = top_blobs[0].elempack;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize);
            out_elempack = std::min(out_elempack, top_blobs[i].elempack);
        }

        Mat bottom_blob_unpacked = bottom_blob;
        if (elempack > out_elempack)
        {
            convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt);
            if (bottom_blob_unpacked.empty())
                return -100;
        }

        int p = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            Mat& top_blob = top_blobs[i];

            if (out_elempack == 1 && top_blob.elempack == 4)
            {
                int size = top_blob.w * top_blob.h * top_blob.d;

                for (int q = 0; q < top_blob.c; q++)
                {
                    const float* r0 = bottom_blob_unpacked.channel(p);
                    const float* r1 = bottom_blob_unpacked.channel(p + 1);
                    const float* r2 = bottom_blob_unpacked.channel(p + 2);
                    const float* r3 = bottom_blob_unpacked.channel(p + 3);

                    float* outptr0 = top_blob.channel(q);

                    for (int j = 0; j < size; j++)
                    {
                        outptr0[0] = *r0++;
                        outptr0[1] = *r1++;
                        outptr0[2] = *r2++;
                        outptr0[3] = *r3++;

                        outptr0 += 4;
                    }

                    p += 4;
                }
            }
            else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4)
            {
                int size = top_blob.total();

                const float* ptr = bottom_blob_unpacked.channel(p);
                float* outptr = top_blob;
                memcpy(outptr, ptr, size * top_blob.elemsize);

                p += top_blob.c;
            }
        }
    }

    if ((dims == 3 && positive_axis == 1) || (dims == 4 && positive_axis == 2))
    {
        // slice dim height
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int d = bottom_blob.d;
        int channels = bottom_blob.c;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = h - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? h + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((h - q) / (top_blobs.size() - i));
                }
            }

            Mat& top_blob = top_blobs[i];
            top_blob.create(w, slice, d, channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            top_blob.dims = dims;

            q += slice;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < channels; p++)
        {
            const float* ptr = bottom_blob.channel(p);

            for (int j = 0; j < d; j++)
            {
                for (size_t i = 0; i < top_blobs.size(); i++)
                {
                    Mat& top_blob = top_blobs[i];

                    int size = top_blob.w * top_blob.h;

                    float* outptr = top_blob.channel(p).depth(j);
                    memcpy(outptr, ptr, size * elemsize);

                    ptr += size * elempack;
                }
            }
        }
    }

    if ((dims == 3 && positive_axis == 2) || (dims == 4 && positive_axis == 3))
    {
        // slice dim width
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int d = bottom_blob.d;
        int channels = bottom_blob.c;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = w - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? w + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((w - q) / (top_blobs.size() - i));
                }
            }

            Mat& top_blob = top_blobs[i];
            top_blob.create(slice, h, d, channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            top_blob.dims = dims;

            q += slice;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < channels; p++)
        {
            const float* ptr = bottom_blob.channel(p);

            for (int j = 0; j < d; j++)
            {
                for (int k = 0; k < h; k++)
                {
                    for (size_t i = 0; i < top_blobs.size(); i++)
                    {
                        Mat& top_blob = top_blobs[i];

                        float* outptr = top_blob.channel(p).depth(j).row(k);
                        memcpy(outptr, ptr, top_blob.w * elemsize);

                        ptr += top_blob.w * elempack;
                    }
                }
            }
        }
    }

    if (dims == 4 && positive_axis == 1)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int d = bottom_blob.d;
        int channels = bottom_blob.c;

        int q = 0;
        for (size_t i = 0; i < top_blobs.size(); i++)
        {
            int slice;
            if (indices_ptr)
            {
                if (i == top_blobs.size() - 1)
                {
                    slice = d - q;
                }
                else
                {
                    int indice = indices_ptr[i];
                    int positive_indice = indice < 0 ? d + indice : indice;
                    slice = positive_indice - q;
                }
            }
            else
            {
                slice = slices_ptr[i];
                if (slice == -233)
                {
                    slice = static_cast<int>((d - q) / (top_blobs.size() - i));
                }
            }

            Mat& top_blob = top_blobs[i];
            top_blob.create(w, h, slice, channels, elemsize, elempack, opt.blob_allocator);
            if (top_blob.empty())
                return -100;

            q += slice;
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < channels; p++)
        {
            const float* ptr = bottom_blob.channel(p);

            for (size_t i = 0; i < top_blobs.size(); i++)
            {
                Mat& top_blob = top_blobs[i];

                int size = top_blob.w * top_blob.h * top_blob.d;

                float* outptr = top_blob.channel(p);
                memcpy(outptr, ptr, size * elemsize);

                ptr += size * elempack;
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/slice_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_SLICE_LOONGARCH_H
#define LAYER_SLICE_LOONGARCH_H

#include "slice.h"

namespace ncnn {

// LoongArch specialization of the Slice layer.
// Declares its own constructor and overrides the multi-blob forward pass;
// everything else is inherited from Slice.
class Slice_loongarch : public Slice
{
public:
    Slice_loongarch();

    // Forward pass taking a vector of input blobs and filling a vector of
    // output blobs. Returns an int status code.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_SLICE_LOONGARCH_H


================================================
FILE: src/layer/loongarch/softmax_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "softmax_loongarch.h"

#include <float.h>

#if __loongarch_sx
#include <lsxintrin.h>
#include "lsx_mathfun.h"
#endif // __loongarch_sx

namespace ncnn {

int Softmax_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    // In-place softmax across channels of a 3-D blob:
    //   1. per-position maximum over all channels
    //   2. value = exp(value - max)
    //   3. per-position sum over all channels
    //   4. value = value / sum
    // Any other dims/axis combination is delegated to the generic Softmax.
    // Returns 0 on success, -100 when a workspace allocation fails.
    const int dims = bottom_top_blob.dims;
    const int positive_axis = axis < 0 ? dims + axis : axis;

    if (!(dims == 3 && positive_axis == 0))
        return Softmax::forward_inplace(bottom_top_blob, opt);

    const size_t elemsize = bottom_top_blob.elemsize;
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int channels = bottom_top_blob.c;
    const int size = w * h;

    // step 1: running maximum, one value per spatial position
    Mat max_plane;
    max_plane.create(w, h, elemsize, opt.workspace_allocator);
    if (max_plane.empty())
        return -100;
    max_plane.fill(-FLT_MAX);

    // sequential on purpose: every channel updates the same plane
    for (int q = 0; q < channels; q++)
    {
        const float* ptr = bottom_top_blob.channel(q);
        float* maxptr = max_plane;

        for (int i = 0; i < size; i++)
        {
            if (maxptr[i] < ptr[i])
                maxptr[i] = ptr[i];
        }
    }

    // step 2: exponentiate the shifted values, channels are independent
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
        const float* maxptr = max_plane;

        int i = 0;
#if __loongarch_sx
        // four floats per iteration
        for (; i + 3 < size; i += 4)
        {
            __m128 _p = (__m128)__lsx_vld(ptr + i, 0);
            __m128 _max = (__m128)__lsx_vld(maxptr + i, 0);

            _p = exp_ps(__lsx_vfsub_s(_p, _max));

            __lsx_vst(_p, ptr + i, 0);
        }
#endif // __loongarch_sx
        // scalar tail (or the whole plane without LSX)
        for (; i < size; i++)
        {
            ptr[i] = expf(ptr[i] - maxptr[i]);
        }
    }

    // step 3: accumulate the exponentials over channels
    Mat sum_plane;
    sum_plane.create(w, h, elemsize, opt.workspace_allocator);
    if (sum_plane.empty())
        return -100;
    sum_plane.fill(0.f);

    // sequential on purpose: every channel adds into the same plane
    for (int q = 0; q < channels; q++)
    {
        const float* ptr = bottom_top_blob.channel(q);
        float* sumptr = sum_plane;

        int i = 0;
#if __loongarch_sx
        for (; i + 3 < size; i += 4)
        {
            __m128 _p = (__m128)__lsx_vld(ptr + i, 0);
            __m128 _sum = (__m128)__lsx_vld(sumptr + i, 0);
            _sum = __lsx_vfadd_s(_sum, _p);
            __lsx_vst(_sum, sumptr + i, 0);
        }
#endif // __loongarch_sx
        for (; i < size; i++)
        {
            sumptr[i] += ptr[i];
        }
    }

    // step 4: normalize, channels are independent again
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
        const float* sumptr = sum_plane;

        int i = 0;
#if __loongarch_sx
        for (; i + 3 < size; i += 4)
        {
            __m128 _p = (__m128)__lsx_vld(ptr + i, 0);
            __m128 _sum = (__m128)__lsx_vld(sumptr + i, 0);
            _p = __lsx_vfdiv_s(_p, _sum);
            __lsx_vst(_p, ptr + i, 0);
        }
#endif // __loongarch_sx
        for (; i < size; i++)
        {
            ptr[i] /= sumptr[i];
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/softmax_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_SOFTMAX_LOONGARCH_H
#define LAYER_SOFTMAX_LOONGARCH_H

#include "softmax.h"

namespace ncnn {

// LoongArch specialization of the Softmax layer.
// Only the in-place forward pass is overridden; no constructor is declared here.
class Softmax_loongarch : public Softmax
{
public:
    // In-place forward pass on a single blob. Returns an int status code.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_SOFTMAX_LOONGARCH_H


================================================
FILE: src/layer/loongarch/swish_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "swish_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#include "lsx_mathfun.h"
#endif // __loongarch_sx

namespace ncnn {

// Constructor: enables packed-layout input only when the build has
// __loongarch_sx set; otherwise the inherited setting is left untouched.
Swish_loongarch::Swish_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

int Swish_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    // Applies x / (1 + exp(-x)) to every float of the blob, in place.
    // Each channel is treated as one flat run of w * h * d * elempack floats.
    // Always returns 0.
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        int remain = size;
#if __loongarch_sx
        const __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
        // four floats per iteration
        while (remain >= 4)
        {
            __builtin_prefetch(ptr + 16);
            __m128i _x = __lsx_vld(ptr, 0);
            // flipping bit 31 of each lane negates the float
            __m128 _negx = (__m128)__lsx_vbitrevi_w(_x, 31);
            __m128 _denom = __lsx_vfadd_s(_one, exp_ps(_negx));
            __m128i _y = (__m128i)__lsx_vfdiv_s((__m128)_x, _denom);
            __lsx_vst(_y, ptr, 0);

            ptr += 4;
            remain -= 4;
        }
#endif // __loongarch_sx
        // scalar tail (or the whole channel without LSX)
        while (remain > 0)
        {
            const float v = *ptr;
            *ptr = v / (1.f + expf(-v));
            ptr++;
            remain--;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/swish_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_SWISH_LOONGARCH_H
#define LAYER_SWISH_LOONGARCH_H

#include "swish.h"

namespace ncnn {

// LoongArch specialization of the Swish layer.
class Swish_loongarch : public Swish
{
public:
    Swish_loongarch();

    // In-place forward pass on a single blob. Returns an int status code.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_SWISH_LOONGARCH_H


================================================
FILE: src/layer/loongarch/tanh_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "tanh_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#include "lsx_mathfun.h"
#endif // __loongarch_sx

namespace ncnn {

// Constructor: enables packed-layout input only when the build has
// __loongarch_sx set; otherwise the inherited setting is left untouched.
TanH_loongarch::TanH_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif
}

int TanH_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    // Applies tanh to every float of the blob, in place.
    // Each channel is treated as one flat run of w * h * d * elempack floats.
    // Always returns 0.
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        int remain = size;
#if __loongarch_sx
        // four floats per iteration
        while (remain >= 4)
        {
            __builtin_prefetch(ptr + 16);
            __m128 _v = tanh_ps((__m128)__lsx_vld(ptr, 0));
            __lsx_vst(_v, ptr, 0);

            ptr += 4;
            remain -= 4;
        }
#endif // __loongarch_sx
        // scalar tail (or the whole channel without LSX)
        while (remain > 0)
        {
            *ptr = tanhf(*ptr);
            ptr++;
            remain--;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/tanh_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_TANH_LOONGARCH_H
#define LAYER_TANH_LOONGARCH_H

#include "tanh.h"

namespace ncnn {

// LoongArch specialization of the TanH layer.
class TanH_loongarch : public TanH
{
public:
    TanH_loongarch();

    // In-place forward pass on a single blob. Returns an int status code.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_TANH_LOONGARCH_H


================================================
FILE: src/layer/loongarch/unaryop_loongarch.cpp
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#include "unaryop_loongarch.h"

// #include <fenv.h>
#include <float.h>

#if __loongarch_sx
#include <lsxintrin.h>
#include "lsx_mathfun.h"
#endif // __loongarch_sx

namespace ncnn {

// Constructor: enables packed-layout input only when the build has
// __loongarch_sx set; otherwise the inherited setting is left untouched.
UnaryOp_loongarch::UnaryOp_loongarch()
{
#if __loongarch_sx
    support_packing = true;
#endif // __loongarch_sx
}

// Applies the element-wise functor Op to every float of `a`, in place.
// Op must provide `float func(const float&) const` and, when __loongarch_sx
// is set, `__m128 func_pack4(const __m128&) const`.
// Each channel is treated as one flat run of w * h * d * elempack floats.
// Always returns 0.
template<typename Op>
static int unary_op_inplace(Mat& a, const Option& opt)
{
    Op op;

    const int channels = a.c;
    const int size = a.w * a.h * a.d * a.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = a.channel(q);

        int remain = size;
#if __loongarch_sx
        // four floats per iteration
        while (remain >= 4)
        {
            __builtin_prefetch(ptr + 16);
            __m128 _v = (__m128)__lsx_vld(ptr, 0);
            _v = op.func_pack4(_v);
            __lsx_vst(_v, ptr, 0);

            ptr += 4;
            remain -= 4;
        }
#endif // __loongarch_sx
        // scalar tail (or the whole channel without LSX)
        while (remain > 0)
        {
            *ptr = op.func(*ptr);
            ptr++;
            remain--;
        }
    }

    return 0;
}

namespace UnaryOp_loongarch_functor {

// Absolute value functor.
struct unary_op_abs
{
    // scalar path
    float func(const float& x) const
    {
        return fabsf(x);
    }
#if __loongarch_sx
    // vector path: clear bit 31 (the sign bit) of each 32-bit lane
    __m128 func_pack4(const __m128& x) const
    {
        const __m128i _bits = (__m128i)x;
        return (__m128)__lsx_vbitclri_w(_bits, 31);
    }
#endif // __loongarch_sx
};

// Negation functor.
struct unary_op_neg
{
    // scalar path
    float func(const float& x) const
    {
        const float negated = -x;
        return negated;
    }
#if __loongarch_sx
    // vector path: flip bit 31 (the sign bit) of each 32-bit lane
    __m128 func_pack4(const __m128& x) const
    {
        const __m128i _bits = (__m128i)x;
        return (__m128)__lsx_vbitrevi_w(_bits, 31);
    }
#endif // __loongarch_sx
};

// Floor functor (round toward negative infinity).
struct unary_op_floor
{
    // scalar path
    float func(const float& x) const
    {
        return floorf(x);
    }
#if __loongarch_sx
    // vector counterpart of floorf for four lanes
    __m128 func_pack4(const __m128& x) const
    {
        const __m128 _r = (__m128)__lsx_vfrintrm_s(x);
        return _r;
    }
#endif // __loongarch_sx
};

// Ceiling functor (round toward positive infinity).
struct unary_op_ceil
{
    // scalar path
    float func(const float& x) const
    {
        return ceilf(x);
    }
#if __loongarch_sx
    // vector counterpart of ceilf for four lanes
    __m128 func_pack4(const __m128& x) const
    {
        const __m128 _r = (__m128)__lsx_vfrintrp_s(x);
        return _r;
    }
#endif // __loongarch_sx
};

// Square functor: x * x.
struct unary_op_square
{
    // scalar path
    float func(const float& x) const
    {
        const float squared = x * x;
        return squared;
    }
#if __loongarch_sx
    // vector path: lane-wise multiply of x with itself
    __m128 func_pack4(const __m128& x) const
    {
        const __m128 _sq = __lsx_vfmul_s(x, x);
        return _sq;
    }
#endif // __loongarch_sx
};

// Square-root functor.
struct unary_op_sqrt
{
    // scalar path
    float func(const float& x) const
    {
        return sqrtf(x);
    }
#if __loongarch_sx
    // vector path: lane-wise square root
    __m128 func_pack4(const __m128& x) const
    {
        const __m128 _root = __lsx_vfsqrt_s(x);
        return _root;
    }
#endif // __loongarch_sx
};

// Reciprocal square-root functor: 1 / sqrt(x).
struct unary_op_rsqrt
{
    // scalar path
    float func(const float& x) const
    {
        const float root = sqrtf(x);
        return 1.f / root;
    }
#if __loongarch_sx
    // vector path: lane-wise reciprocal square root
    __m128 func_pack4(const __m128& x) const
    {
        const __m128 _r = __lsx_vfrsqrt_s(x);
        return _r;
    }
#endif // __loongarch_sx
};

// Natural exponential functor.
struct unary_op_exp
{
    // scalar path
    float func(const float& x) const
    {
        return expf(x);
    }
#if __loongarch_sx
    // vector path: exp_ps on four lanes
    __m128 func_pack4(const __m128& x) const
    {
        const __m128 _e = exp_ps(x);
        return _e;
    }
#endif // __loongarch_sx
};

// Natural logarithm functor.
struct unary_op_log
{
    // scalar path
    float func(const float& x) const
    {
        return logf(x);
    }
#if __loongarch_sx
    // vector path: log_ps on four lanes
    __m128 func_pack4(const __m128& x) const
    {
        const __m128 _l = log_ps(x);
        return _l;
    }
#endif // __loongarch_sx
};

// Sine functor.
struct unary_op_sin
{
    // scalar path
    float func(const float& x) const
    {
        return sinf(x);
    }
#if __loongarch_sx
    // vector path: no vector sine here, so spill the four lanes to memory,
    // run the scalar routine on each, and reload
    __m128 func_pack4(const __m128& x) const
    {
        // TODO lsx optimize
        float lanes[4];
        __lsx_vst(x, lanes, 0);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = sinf(lanes[k]);
        }
        return (__m128)__lsx_vld(lanes, 0);
    }
#endif // __loongarch_sx
};

// Cosine functor.
struct unary_op_cos
{
    // scalar path
    float func(const float& x) const
    {
        return cosf(x);
    }
#if __loongarch_sx
    // vector path: no vector cosine here, so spill the four lanes to memory,
    // run the scalar routine on each, and reload
    __m128 func_pack4(const __m128& x) const
    {
        // TODO lsx optimize
        float lanes[4];
        __lsx_vst(x, lanes, 0);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = cosf(lanes[k]);
        }
        return (__m128)__lsx_vld(lanes, 0);
    }
#endif // __loongarch_sx
};

// Tangent functor.
struct unary_op_tan
{
    // scalar path
    float func(const float& x) const
    {
        return tanf(x);
    }
#if __loongarch_sx
    // vector path: no vector tangent here, so spill the four lanes to memory,
    // run the scalar routine on each, and reload
    __m128 func_pack4(const __m128& x) const
    {
        // TODO lsx optimize
        float lanes[4];
        __lsx_vst(x, lanes, 0);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = tanf(lanes[k]);
        }
        return (__m128)__lsx_vld(lanes, 0);
    }
#endif // __loongarch_sx
};

// Arc-sine functor.
struct unary_op_asin
{
    // scalar path
    float func(const float& x) const
    {
        return asinf(x);
    }
#if __loongarch_sx
    // vector path: no vector arc-sine here, so spill the four lanes to memory,
    // run the scalar routine on each, and reload
    __m128 func_pack4(const __m128& x) const
    {
        // TODO lsx optimize
        float lanes[4];
        __lsx_vst(x, lanes, 0);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = asinf(lanes[k]);
        }
        return (__m128)__lsx_vld(lanes, 0);
    }
#endif // __loongarch_sx
};

// Arc-cosine functor.
struct unary_op_acos
{
    // scalar path
    float func(const float& x) const
    {
        return acosf(x);
    }
#if __loongarch_sx
    // vector path: no vector arc-cosine here, so spill the four lanes to
    // memory, run the scalar routine on each, and reload
    __m128 func_pack4(const __m128& x) const
    {
        // TODO lsx optimize
        float lanes[4];
        __lsx_vst(x, lanes, 0);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = acosf(lanes[k]);
        }
        return (__m128)__lsx_vld(lanes, 0);
    }
#endif // __loongarch_sx
};

// Arc-tangent functor.
struct unary_op_atan
{
    // scalar path
    float func(const float& x) const
    {
        return atanf(x);
    }
#if __loongarch_sx
    // vector path: no vector arc-tangent here, so spill the four lanes to
    // memory, run the scalar routine on each, and reload
    __m128 func_pack4(const __m128& x) const
    {
        // TODO lsx optimize
        float lanes[4];
        __lsx_vst(x, lanes, 0);
        for (int k = 0; k < 4; k++)
        {
            lanes[k] = atanf(lanes[k]);
        }
        return (__m128)__lsx_vld(lanes, 0);
    }
#endif // __loongarch_sx
};

// Reciprocal functor: 1 / x.
struct unary_op_reciprocal
{
    // scalar path
    float func(const float& x) const
    {
        const float inverse = 1.f / x;
        return inverse;
    }
#if __loongarch_sx
    // vector path: lane-wise reciprocal
    __m128 func_pack4(const __m128& x) const
    {
        const __m128 _inv = __lsx_vfrecip_s(x);
        return _inv;
    }
#endif // __loongarch_sx
};

// Hyperbolic tangent functor.
struct unary_op_tanh
{
    // scalar path
    float func(const float& x) const
    {
        return tanhf(x);
    }
#if __loongarch_sx
    // vector path: tanh_ps on four lanes
    __m128 func_pack4(const __m128& x) const
    {
        const __m128 _t = tanh_ps(x);
        return _t;
    }
#endif // __loongarch_sx
};

// Base-10 logarithm functor.
struct unary_op_log10
{
    // scalar path
    float func(const float& x) const
    {
        return log10f(x);
    }
#if __loongarch_sx
    // vector path: natural log scaled by log10(e) = 0.434294481903
    __m128 func_pack4(const __m128& x) const
    {
        const __m128 _ln = log_ps(x);
        return __lsx_vfmul_s(_ln, __lsx_vreplfr2vr_s(0.434294481903));
    }
#endif // __loongarch_sx
};

// Rounding functor: round to nearest, ties to even.
struct unary_op_round
{
    // scalar path
    float func(const float& x) const
    {
        // round to nearest even
#if NCNN_GNU_INLINE_ASM
        // Computes (x + 12582912.f) - 12582912.f with two explicit
        // instructions. 12582912 = 1.5 * 2^23, so the sum has no fraction
        // bits left and the FPU rounds x to an integer on the add.
        //
        // The output is written by the first instruction while %2 (magic) is
        // still needed by the second one, so it must be an early-clobber
        // operand ("=&f"). With a plain "=f" the compiler is allowed to put
        // y in the register holding magic, which would make the result 0.
        float y;
        const float magic = 12582912.f;
        asm volatile(
            "fadd.s     %0, %1, %2  \n"
            "fsub.s     %0, %0, %2  \n"
            : "=&f"(y)
            : "f"(x), "f"(magic)
            :);
        return y;
#else
        // Portable fallback: nearbyintf under round-to-nearest. The rounding
        // mode is only forced (and restored) when FE_TONEAREST is visible,
        // i.e. when <fenv.h> has been included.
#ifdef FE_TONEAREST
        int old_rm = fegetround();
        fesetround(FE_TONEAREST);
#endif
        float y = nearbyintf(x);
#ifdef FE_TONEAREST
        fesetround(old_rm);
#endif
        return y;
#endif
    }
#if __loongarch_sx
    // vector counterpart of the scalar path for four lanes
    __m128 func_pack4(const __m128& x) const
    {
        return (__m128)__lsx_vfrintrne_s(x);
    }
#endif // __loongarch_sx
};

// Truncation functor (round toward zero).
struct unary_op_trunc
{
    // scalar path
    float func(const float& x) const
    {
        return truncf(x);
    }
#if __loongarch_sx
    // vector counterpart of truncf for four lanes
    __m128 func_pack4(const __m128& x) const
    {
        const __m128 _r = (__m128)__lsx_vfrintrz_s(x);
        return _r;
    }
#endif // __loongarch_sx
};

} // namespace UnaryOp_loongarch_functor

int UnaryOp_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    // Dispatches on op_type to the matching functor and runs it over the
    // blob in place. Returns the result of unary_op_inplace, or 0 without
    // touching the blob when op_type matches none of the cases below.
    using namespace UnaryOp_loongarch_functor;

    switch (op_type)
    {
    case Operation_ABS:
        return unary_op_inplace<unary_op_abs>(bottom_top_blob, opt);
    case Operation_NEG:
        return unary_op_inplace<unary_op_neg>(bottom_top_blob, opt);
    case Operation_FLOOR:
        return unary_op_inplace<unary_op_floor>(bottom_top_blob, opt);
    case Operation_CEIL:
        return unary_op_inplace<unary_op_ceil>(bottom_top_blob, opt);
    case Operation_SQUARE:
        return unary_op_inplace<unary_op_square>(bottom_top_blob, opt);
    case Operation_SQRT:
        return unary_op_inplace<unary_op_sqrt>(bottom_top_blob, opt);
    case Operation_RSQRT:
        return unary_op_inplace<unary_op_rsqrt>(bottom_top_blob, opt);
    case Operation_EXP:
        return unary_op_inplace<unary_op_exp>(bottom_top_blob, opt);
    case Operation_LOG:
        return unary_op_inplace<unary_op_log>(bottom_top_blob, opt);
    case Operation_SIN:
        return unary_op_inplace<unary_op_sin>(bottom_top_blob, opt);
    case Operation_COS:
        return unary_op_inplace<unary_op_cos>(bottom_top_blob, opt);
    case Operation_TAN:
        return unary_op_inplace<unary_op_tan>(bottom_top_blob, opt);
    case Operation_ASIN:
        return unary_op_inplace<unary_op_asin>(bottom_top_blob, opt);
    case Operation_ACOS:
        return unary_op_inplace<unary_op_acos>(bottom_top_blob, opt);
    case Operation_ATAN:
        return unary_op_inplace<unary_op_atan>(bottom_top_blob, opt);
    case Operation_RECIPROCAL:
        return unary_op_inplace<unary_op_reciprocal>(bottom_top_blob, opt);
    case Operation_TANH:
        return unary_op_inplace<unary_op_tanh>(bottom_top_blob, opt);
    case Operation_LOG10:
        return unary_op_inplace<unary_op_log10>(bottom_top_blob, opt);
    case Operation_ROUND:
        return unary_op_inplace<unary_op_round>(bottom_top_blob, opt);
    case Operation_TRUNC:
        return unary_op_inplace<unary_op_trunc>(bottom_top_blob, opt);
    default:
        break;
    }

    // unrecognized op_type: nothing to do
    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/loongarch/unaryop_loongarch.h
================================================
// Copyright 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_UNARYOP_LOONGARCH_H
#define LAYER_UNARYOP_LOONGARCH_H

#include "unaryop.h"

namespace ncnn {

// LoongArch specialization of the UnaryOp layer.
class UnaryOp_loongarch : public UnaryOp
{
public:
    UnaryOp_loongarch();

    // In-place forward pass on a single blob. Returns an int status code.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_UNARYOP_LOONGARCH_H


================================================
FILE: src/layer/lrn.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "lrn.h"

namespace ncnn {

// Constructor: LRN works in place on a single blob.
LRN::LRN()
{
    support_inplace = true;
    one_blob_only = true;
}

// Reads the layer parameters from the param dictionary, falling back to the
// defaults listed below when an id is absent. Always returns 0.
//   id 0: region_type (int)
//   id 1: local_size  (int)
//   id 2: alpha       (float)
//   id 3: beta        (float)
//   id 4: bias        (float)
int LRN::load_param(const ParamDict& pd)
{
    const int default_region_type = 0;
    const int default_local_size = 5;
    const float default_alpha = 1.f;
    const float default_beta = 0.75f;
    const float default_bias = 1.f;

    region_type = pd.get(0, default_region_type);
    local_size = pd.get(1, default_local_size);
    alpha = pd.get(2, default_alpha);
    beta = pd.get(3, default_beta);
    bias = pd.get(4, default_bias);

    return 0;
}

// Local response normalization, in place.
// Every value is multiplied by pow(bias + alpha / N * S, -beta), where S is
// the sum of squared values over a neighbourhood of N entries:
//   - NormRegion_ACROSS_CHANNELS: up to local_size neighbouring channels at
//     the same spatial position (N = local_size)
//   - NormRegion_WITHIN_CHANNEL: a local_size x local_size spatial window in
//     the same channel, zero padded at the borders (N = local_size^2)
// Any other region_type leaves the blob untouched.
// Returns 0 on success, -100 when a workspace allocation fails.
int LRN::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int w = bottom_top_blob.w;
    const int h = bottom_top_blob.h;
    const int channels = bottom_top_blob.c;
    const size_t elemsize = bottom_top_blob.elemsize;
    const int size = w * h;

    // element-wise squares of the input, same shape as the input
    Mat square_blob;
    square_blob.create(w, h, channels, elemsize, opt.workspace_allocator);
    if (square_blob.empty())
        return -100;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        const float* src = bottom_top_blob.channel(q);
        float* dst = square_blob.channel(q);

        for (int i = 0; i < size; i++)
        {
            dst[i] = src[i] * src[i];
        }
    }

    if (region_type == NormRegion_ACROSS_CHANNELS)
    {
        Mat square_sum;
        square_sum.create(w, h, channels, elemsize, opt.workspace_allocator);
        if (square_sum.empty())
            return -100;
        square_sum.fill(0.f);

        const float alpha_div_size = alpha / local_size;
        const int half_window = local_size / 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            // channel window [q - half, q + half] clipped to valid channels
            int p_first = q - half_window;
            if (p_first < 0)
                p_first = 0;
            int p_last = q + half_window;
            if (p_last > channels - 1)
                p_last = channels - 1;

            // square sum
            float* ssptr = square_sum.channel(q);
            for (int p = p_first; p <= p_last; p++)
            {
                const float* sptr = square_blob.channel(p);
                for (int i = 0; i < size; i++)
                {
                    ssptr[i] += sptr[i];
                }
            }

            float* ptr = bottom_top_blob.channel(q);
            for (int i = 0; i < size; i++)
            {
                ptr[i] = ptr[i] * powf(bias + alpha_div_size * ssptr[i], -beta);
            }
        }
    }
    else if (region_type == NormRegion_WITHIN_CHANNEL)
    {
        const int outw = w;
        const int outh = h;

        // zero-pad the squares so every output position has a full window
        Mat square_blob_bordered = square_blob;
        int bordered_w = w;
        const int pad = local_size / 2;
        if (pad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            opt_b.use_packing_layout = false;
            copy_make_border(square_blob, square_blob_bordered, pad, local_size - pad - 1, pad, local_size - pad - 1, BORDER_CONSTANT, 0.f, opt_b);
            if (square_blob_bordered.empty())
                return -100;

            bordered_w = square_blob_bordered.w;
        }

        const int maxk = local_size * local_size;

        const float alpha_div_size = alpha / maxk;

        // norm window offsets, relative to the window's top-left element,
        // in row-major order over the padded plane
        std::vector<int> _space_ofs(maxk);
        int* space_ofs = &_space_ofs[0];
        for (int i = 0; i < local_size; i++)
        {
            for (int j = 0; j < local_size; j++)
            {
                space_ofs[i * local_size + j] = i * bordered_w + j;
            }
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
            const Mat m = square_blob_bordered.channel(q);

            for (int i = 0; i < outh; i++)
            {
                const float* srow = m.row(i);
                float* outrow = ptr + i * outw;

                for (int j = 0; j < outw; j++)
                {
                    const float* sptr = srow + j;

                    // sum of squares over the window, in offset order
                    float ss = 0.f;
                    for (int k = 0; k < maxk; k++)
                    {
                        ss += sptr[space_ofs[k]];
                    }

                    outrow[j] = outrow[j] * powf(bias + alpha_div_size * ss, -beta);
                }
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/lrn.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_LRN_H
#define LAYER_LRN_H

#include "layer.h"

namespace ncnn {

// Local response normalization layer operating in place on its input blob.
class LRN : public Layer
{
public:
    LRN();

    // Reads the layer parameters from the param dictionary.
    virtual int load_param(const ParamDict& pd);

    // Normalizes bottom_top_blob in place; returns 0 on success and
    // -100 when an intermediate blob cannot be allocated.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

    // Values accepted by region_type.
    enum NormRegionType
    {
        NormRegion_ACROSS_CHANNELS = 0,
        NormRegion_WITHIN_CHANNEL = 1
    };

public:
    // param
    // one of NormRegionType
    int region_type;
    // window extent; the within-channel path sums a local_size x local_size window
    int local_size;
    // the within-channel path scales each value by
    // pow(bias + alpha / (local_size * local_size) * window_sum, -beta)
    float alpha;
    float beta;
    float bias;
};

} // namespace ncnn

#endif // LAYER_LRN_H


================================================
FILE: src/layer/lstm.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "lstm.h"

namespace ncnn {

LSTM::LSTM()
{
    // The layer writes to a separate output blob and has a multi-blob
    // forward() overload (input plus optional hidden/cell state), so it
    // is neither in-place nor single-blob only.
    support_inplace = false;
    one_blob_only = false;
}

// Reads the layer configuration from the param dictionary.
//   id 0: num_output        id 1: weight_data_size   id 2: direction
//   id 3: hidden_size (defaults to num_output)       id 8: int8_scale_term
// Returns 0 on success, -1 when int8 weights are requested but the
// library was built without NCNN_INT8.
int LSTM::load_param(const ParamDict& pd)
{
    num_output = pd.get(0, 0);
    weight_data_size = pd.get(1, 0);
    direction = pd.get(2, 0);
    int8_scale_term = pd.get(8, 0);

    // must be read after num_output, which is its default value
    hidden_size = pd.get(3, num_output);

#if !NCNN_INT8
    if (int8_scale_term != 0)
    {
        NCNN_LOGE("please build ncnn with NCNN_INT8 enabled for int8 inference");
        return -1;
    }
#endif

    return 0;
}

int LSTM::load_model(const ModelBin& mb)
{
    int num_directions = direction == 2 ? 2 : 1;

    int size = weight_data_size / num_directions / hidden_size / 4;

    // raw weight data
    weight_xc_data = mb.load(size, hidden_size * 4, num_directions, 0);
    if (weight_xc_data.empty())
        return -100;

    bias_c_data = mb.load(hidden_size, 4, num_directions, 0);
    if (bias_c_data.empty())
        return -100;

    weight_hc_data = mb.load(num_output, hidden_size * 4, num_directions, 0);
    if (weight_hc_data.empty())
        return -100;

    if (num_output != hidden_size)
    {
        weight_hr_data = mb.load(hidden_size, num_output, num_directions, 0);
        if (weight_hr_data.empty())
            return -100;
    }

#if NCNN_INT8
    if (int8_scale_term)
    {
        weight_xc_data_int8_scales = mb.load(hidden_size * 4, num_directions, 1);
        weight_hc_data_int8_scales = mb.load(hidden_size * 4, num_directions, 1);
    }
#endif // NCNN_INT8

    return 0;
}

// Runs one fp32 LSTM direction over the whole sequence.
//   bottom_blob   input, one row per time step
//   top_blob      output, one row of num_output values per time step
//   reverse       non-zero walks the time steps from last to first
//   weight_xc     input weights, rows grouped by gate in I, F, O, G order
//   bias_c        gate biases, one row per gate (I, F, O, G)
//   weight_hc     recurrent weights, same row grouping as weight_xc
//   weight_hr     projection weights, used only when num_output != hidden_size
//   hidden_state  / cell_state  recurrent state, updated in place
// Returns 0 on success or -100 when a workspace blob cannot be allocated.
static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt)
{
    const int input_size = bottom_blob.w;
    const int seq_len = bottom_blob.h;

    const int num_output = top_blob.w;
    const int hidden_size = cell_state.w;

    const bool has_projection = num_output != hidden_size;

    // one row of {I, F, O, G} pre-activations per hidden unit
    Mat gates(4, hidden_size, 4u, opt.workspace_allocator);
    if (gates.empty())
        return -100;

    // un-projected hidden vector, only needed with a projection
    Mat tmp_hidden_state;
    if (has_projection)
    {
        tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator);
        if (tmp_hidden_state.empty())
            return -100;
    }

    for (int step = 0; step < seq_len; step++)
    {
        // gate_input_t := W_hc * h_{t-1} + W_xc * x_t + b_c
        const int ti = reverse ? seq_len - 1 - step : step;

        const float* x = bottom_blob.row(ti);
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < hidden_size; q++)
        {
            // rows of unit q inside each gate group (I, F, O, G)
            const float* wx_i = weight_xc.row(hidden_size * 0 + q);
            const float* wx_f = weight_xc.row(hidden_size * 1 + q);
            const float* wx_o = weight_xc.row(hidden_size * 2 + q);
            const float* wx_g = weight_xc.row(hidden_size * 3 + q);

            const float* wh_i = weight_hc.row(hidden_size * 0 + q);
            const float* wh_f = weight_hc.row(hidden_size * 1 + q);
            const float* wh_o = weight_hc.row(hidden_size * 2 + q);
            const float* wh_g = weight_hc.row(hidden_size * 3 + q);

            // accumulators start from the bias
            float acc_i = bias_c.row(0)[q];
            float acc_f = bias_c.row(1)[q];
            float acc_o = bias_c.row(2)[q];
            float acc_g = bias_c.row(3)[q];

            // input contribution
            for (int k = 0; k < input_size; k++)
            {
                const float xk = x[k];

                acc_i += wx_i[k] * xk;
                acc_f += wx_f[k] * xk;
                acc_o += wx_o[k] * xk;
                acc_g += wx_g[k] * xk;
            }

            // recurrent contribution
            for (int k = 0; k < num_output; k++)
            {
                const float h_prev = hidden_state[k];

                acc_i += wh_i[k] * h_prev;
                acc_f += wh_f[k] * h_prev;
                acc_o += wh_o[k] * h_prev;
                acc_g += wh_g[k] * h_prev;
            }

            float* gate_row = gates.row(q);
            gate_row[0] = acc_i;
            gate_row[1] = acc_f;
            gate_row[2] = acc_o;
            gate_row[3] = acc_g;
        }

        // lstm unit
        // c_t := sigmoid(F) .* c_{t-1} + sigmoid(I) .* tanh(G)
        // h_t := sigmoid(O) .* tanh[c_t]
        float* output_data = top_blob.row(ti);
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < hidden_size; q++)
        {
            const float* gate_row = gates.row(q);

            const float in_gate = 1.f / (1.f + expf(-gate_row[0]));
            const float forget_gate = 1.f / (1.f + expf(-gate_row[1]));
            const float out_gate = 1.f / (1.f + expf(-gate_row[2]));
            const float candidate = tanhf(gate_row[3]);

            const float c_new = forget_gate * cell_state[q] + in_gate * candidate;
            const float h_new = out_gate * tanhf(c_new);
            cell_state[q] = c_new;

            if (has_projection)
            {
                // projected below once every unit has been computed
                tmp_hidden_state[q] = h_new;
            }
            else
            {
                hidden_state[q] = h_new;
                output_data[q] = h_new;
            }
        }

        if (has_projection)
        {
            // h_t := W_hr * h_t, mapping hidden_size values onto num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < num_output; q++)
            {
                const float* proj_row = weight_hr.row(q);

                float projected = 0;
                for (int k = 0; k < hidden_size; k++)
                {
                    projected += tmp_hidden_state[k] * proj_row[k];
                }

                hidden_state[q] = projected;
                output_data[q] = projected;
            }
        }
    }

    return 0;
}

#if NCNN_INT8
static int lstm_int8(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc_int8, const float* weight_xc_int8_scales, const Mat& bias_c, const Mat& weight_hc_int8, const float* weight_hc_int8_scales, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt)
{
    int size = bottom_blob.w;
    int T = bottom_blob.h;

    int num_output = top_blob.w;
    int hidden_size = cell_state.w;

    // 4 x hidden_size
    Mat gates(4, hidden_size, 4u, opt.workspace_allocator);
    if (gates.empty())
        return -100;

    Mat tmp_hidden_state;
    if (num_output != hidden_size)
    {
        tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator);
        if (tmp_hidden_state.empty())
            return -100;
    }

    // dynamic quantize bottom_blob
    Mat bottom_blob_int8(size, T, (size_t)1u, 1, opt.workspace_allocator);
    Mat bottom_blob_int8_scales(T, (size_t)4u, 1, opt.workspace_allocator);
    {
        for (int t = 0; t < T; t++)
        {
            const float* x = bottom_blob.row(t);

            float absmax = 0.f;
            for (int i = 0; i < size; i++)
            {
                absmax = std::max(absmax, (float)fabs(x[i]));
            }

            bottom_blob_int8_scales[t] = 127.f / absmax;
        }

        Option opt_quant = opt;
        opt_quant.blob_allocator = opt.workspace_allocator;
        opt_quant.use_packing_layout = false;
        quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_quant);
    }

    Mat hidden_state_int8(num_output, (size_t)1u, 1, opt.workspace_allocator);
    Mat hidden_state_int8_scales(1, (size_t)4u, 1, opt.workspace_allocator);

    // unroll
    for (int t = 0; t < T; t++)
    {
        // clip hidden by continuation indicator
        // h_cont_{t-1} = cont_t * h_{t-1}
        // h_cont_{t-1} = h_{t-1} if cont_t == 1
        //                0       otherwise
        // calculate hidden
        // gate_input_t := W_hc * h_conted_{t-1} + W_xc * x_t + b_c

        int ti = reverse ? T - 1 - t : t;

        // dynamic quantize hidden_state
        {
            float absmax = 0.f;
            for (int i = 0; i < num_output; i++)
            {
                absmax = std::max(absmax, (float)fabs(hidden_state[i]));
            }

            if (absmax == 0.f)
            {
                hidden_state_int8_scales[0] = 1.f;
                hidden_state_int8.fill<signed char>(0);
            }
            else
            {
                hidden_state_int8_scales[0] = 127.f / absmax;

                Option opt_quant = opt;
                opt_quant.blob_allocator = opt.workspace_allocator;
                opt_quant.use_packing_layout = false;
                quantize_to_int8(hidden_state, hidden_state_int8, hidden_state_int8_scales, opt_quant);
            }
        }

        const signed char* x = bottom_blob_int8.row<const signed char>(ti);
        const signed char* hs = hidden_state_int8;
        const float descale_x = 1.f / bottom_blob_int8_scales[ti];
        const float descale_h = 1.f / hidden_state_int8_scales[0];
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < hidden_size; q++)
        {
            const float* bias_c_I = bias_c.row(0);
            const float* bias_c_F = bias_c.row(1);
            const float* bias_c_O = bias_c.row(2);
            const float* bias_c_G = bias_c.row(3);

            float* gates_data = gates.row(q);

            // gate I F O G
            const signed char* weight_xc_int8_I = weight_xc_int8.row<const signed char>(hidden_size * 0 + q);
            const signed char* weight_xc_int8_F = weight_xc_int8.row<const signed char>(hidden_size * 1 + q);
            const signed char* weight_xc_int8_O = weight_xc_int8.row<const signed char>(hidden_size * 2 + q);
            const signed char* weight_xc_int8_G = weight_xc_int8.row<const signed char>(hidden_size * 3 + q);

            const signed char* weight_hc_int8_I = weight_hc_int8.row<const signed char>(hidden_size * 0 + q);
            const signed char* weight_hc_int8_F = weight_hc_int8.row<const signed char>(hidden_size * 1 + q);
            const signed char* weight_hc_int8_O = weight_hc_int8.row<const signed char>(hidden_size * 2 + q);
            const signed char* weight_hc_int8_G = weight_hc_int8.row<const signed char>(hidden_size * 3 + q);

            const float descale_xc_I = 1.f / weight_xc_int8_scales[hidden_size * 0 + q];
            const float descale_xc_F = 1.f / weight_xc_int8_scales[hidden_size * 1 + q];
            const float descale_xc_O = 1.f / weight_xc_int8_scales[hidden_size * 2 + q];
            const float descale_xc_G = 1.f / weight_xc_int8_scales[hidden_size * 3 + q];
            const float descale_hc_I = 1.f / weight_hc_int8_scales[hidden_size * 0 + q];
            const float descale_hc_F = 1.f / weight_hc_int8_scales[hidden_size * 1 + q];
            const float descale_hc_O = 1.f / weight_hc_int8_scales[hidden_size * 2 + q];
            const float descale_hc_G = 1.f / weight_hc_int8_scales[hidden_size * 3 + q];

            int Ix = 0;
            int Fx = 0;
            int Ox = 0;
            int Gx = 0;
            for (int i = 0; i < size; i++)
            {
                signed char xi = x[i];

                Ix += weight_xc_int8_I[i] * xi;
                Fx += weight_xc_int8_F[i] * xi;
                Ox += weight_xc_int8_O[i] * xi;
                Gx += weight_xc_int8_G[i] * xi;
            }

            int Ih = 0;
            int Fh = 0;
            int Oh = 0;
            int Gh = 0;
            for (int i = 0; i < num_output; i++)
            {
                signed char h_cont = hs[i];

                Ih += weight_hc_int8_I[i] * h_cont;
                Fh += weight_hc_int8_F[i] * h_cont;
                Oh += weight_hc_int8_O[i] * h_cont;
                Gh += weight_hc_int8_G[i] * h_cont;
            }

            float I = bias_c_I[q] + Ix * (descale_x * descale_xc_I) + Ih * (descale_h * descale_hc_I);
            float F = bias_c_F[q] + Fx * (descale_x * descale_xc_F) + Fh * (descale_h * descale_hc_F);
            float O = bias_c_O[q] + Ox * (descale_x * descale_xc_O) + Oh * (descale_h * descale_hc_O);
            float G = bias_c_G[q] + Gx * (descale_x * descale_xc_G) + Gh * (descale_h * descale_hc_G);

            gates_data[0] = I;
            gates_data[1] = F;
            gates_data[2] = O;
            gates_data[3] = G;
        }

        // lstm unit
        // sigmoid(I)
        // sigmoid(F)
        // sigmoid(O)
        // tanh(G)
        // c_t := f_t .* c_{t-1} + i_t .* g_t
        // h_t := o_t .* tanh[c_t]
        float* output_data = top_blob.row(ti);
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < hidden_size; q++)
        {
            const float* gates_data = gates.row(q);

            float I = gates_data[0];
            float F = gates_data[1];
            float O = gates_data[2];
            float G = gates_data[3];

            I = 1.f / (1.f + expf(-I));
            F = 1.f / (1.f + expf(-F));
            O = 1.f / (1.f + expf(-O));
            G = tanhf(G);

            float cell2 = F * cell_state[q] + I * G;
            float H = O * tanhf(cell2);
            cell_state[q] = cell2;

            if (num_output == hidden_size)
            {
                hidden_state[q] = H;
                output_data[q] = H;
            }
            else
            {
                tmp_hidden_state[q] = H;
            }
        }

        if (num_output != hidden_size)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < num_output; q++)
            {
                const float* hr = weight_hr.row(q);

                float H = 0;
                for (int i = 0; i < hidden_size; i++)
                {
                    H += tmp_hidden_state[i] * hr[i];
                }

                hidden_state[q] = H;
                output_data[q] = H;
            }
        }
    }

    return 0;
}
#endif // NCNN_INT8

int LSTM::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int T = bottom_blob.h;

    int num_directions = direction == 2 ? 2 : 1;

    // initial hidden state
    Mat hidden(num_output, 4u, opt.workspace_allocator);
    if (hidden.empty())
        return -100;
    hidden.fill(0.f);

    Mat cell(hidden_size, 4u, opt.workspace_allocator);
    if (cell.empty())
        return -100;
    cell.fill(0.f);

    top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
#if NCNN_INT8
        if (int8_scale_term)
        {
            int ret = lstm_int8(bottom_blob, top_blob, direction, weight_xc_data.channel(0), weight_xc_data_int8_scales.row(0), bias_c_data.channel(0), weight_hc_data.channel(0), weight_hc_data_int8_scales.row(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt);
            if (ret != 0)
                return ret;
        }
        else
#endif
        {
            int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt);
            if (ret != 0)
                return ret;
        }
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

#if NCNN_INT8
        if (int8_scale_term)
        {
            int ret = lstm_int8(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), weight_xc_data_int8_scales.row(0), bias_c_data.channel(0), weight_hc_data.channel(0), weight_hc_data_int8_scales.row(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt);
            if (ret != 0)
                return ret;
        }
        else
#endif
        {
            int ret = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt);
            if (ret != 0)
                return ret;
        }

        hidden.fill(0.0f);
        cell.fill(0.0f);

#if NCNN_INT8
        if (int8_scale_term)
        {
            int ret = lstm_int8(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), weight_xc_data_int8_scales.row(1), bias_c_data.channel(1), weight_hc_data.channel(1), weight_hc_data_int8_scales.row(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden, cell, opt);
            if (ret != 0)
                return ret;
        }
        else
#endif
        {
            int ret = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden, cell, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const float* pf = top_blob_forward.row(i);
            const float* pr = top_blob_reverse.row(i);
            float* ptr = top_blob.row(i);

            memcpy(ptr, pf, num_output * sizeof(float));
            memcpy(ptr + num_output, pr, num_output * sizeof(float));
        }
    }

    return 0;
}

// Multi-blob forward pass.
//   bottom_blobs[0]        input sequence, one row per time step
//   bottom_blobs[1], [2]   optional initial hidden / cell state (used when
//                          exactly three inputs are given); they are cloned
//                          so the caller's blobs are never modified
//   top_blobs[0]           output sequence
//   top_blobs[1], [2]      final hidden / cell state, written when exactly
//                          three outputs are requested
// Returns 0 on success, -100 on allocation failure (including a failed or
// empty state clone), or the error code of the underlying lstm kernel.
int LSTM::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    int T = bottom_blob.h;
    int num_directions = direction == 2 ? 2 : 1;

    Mat hidden;
    Mat cell;
    // the state outlives this call only when it is returned as an output
    Allocator* hidden_cell_allocator = top_blobs.size() == 3 ? opt.blob_allocator : opt.workspace_allocator;
    if (bottom_blobs.size() == 3)
    {
        // clone() yields an empty Mat when allocation fails or the source is
        // empty; the kernels index the state unconditionally, so stop here.
        hidden = bottom_blobs[1].clone(hidden_cell_allocator);
        if (hidden.empty())
            return -100;

        cell = bottom_blobs[2].clone(hidden_cell_allocator);
        if (cell.empty())
            return -100;
    }
    else
    {
        hidden.create(num_output, num_directions, 4u, hidden_cell_allocator);
        if (hidden.empty())
            return -100;
        hidden.fill(0.f);

        cell.create(hidden_size, num_directions, 4u, hidden_cell_allocator);
        if (cell.empty())
            return -100;
        cell.fill(0.f);
    }

    Mat& top_blob = top_blobs[0];
    top_blob.create(num_output * num_directions, T, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // Uni directional
    if (direction == 0 || direction == 1)
    {
#if NCNN_INT8
        if (int8_scale_term)
        {
            int ret = lstm_int8(bottom_blob, top_blob, direction, weight_xc_data.channel(0), weight_xc_data_int8_scales.row(0), bias_c_data.channel(0), weight_hc_data.channel(0), weight_hc_data_int8_scales.row(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt);
            if (ret != 0)
                return ret;
        }
        else
#endif
        {
            int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt);
            if (ret != 0)
                return ret;
        }
    }

    if (direction == 2)
    {
        Mat top_blob_forward(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_forward.empty())
            return -100;

        Mat top_blob_reverse(num_output, T, 4u, opt.workspace_allocator);
        if (top_blob_reverse.empty())
            return -100;

        // row 0 of the state blobs belongs to the forward direction
        Mat hidden0 = hidden.row_range(0, 1);
        Mat cell0 = cell.row_range(0, 1);
#if NCNN_INT8
        if (int8_scale_term)
        {
            int ret = lstm_int8(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), weight_xc_data_int8_scales.row(0), bias_c_data.channel(0), weight_hc_data.channel(0), weight_hc_data_int8_scales.row(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden0, cell0, opt);
            if (ret != 0)
                return ret;
        }
        else
#endif
        {
            int ret = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden0, cell0, opt);
            if (ret != 0)
                return ret;
        }

        // row 1 of the state blobs belongs to the reverse direction
        Mat hidden1 = hidden.row_range(1, 1);
        Mat cell1 = cell.row_range(1, 1);
#if NCNN_INT8
        if (int8_scale_term)
        {
            int ret = lstm_int8(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), weight_xc_data_int8_scales.row(1), bias_c_data.channel(1), weight_hc_data.channel(1), weight_hc_data_int8_scales.row(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden1, cell1, opt);
            if (ret != 0)
                return ret;
        }
        else
#endif
        {
            int ret = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden1, cell1, opt);
            if (ret != 0)
                return ret;
        }

        // concat w
        for (int i = 0; i < T; i++)
        {
            const float* pf = top_blob_forward.row(i);
            const float* pr = top_blob_reverse.row(i);
            float* ptr = top_blob.row(i);

            memcpy(ptr, pf, num_output * sizeof(float));
            memcpy(ptr + num_output, pr, num_output * sizeof(float));
        }
    }

    if (top_blobs.size() == 3)
    {
        top_blobs[1] = hidden;
        top_blobs[2] = cell;
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/lstm.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_LSTM_H
#define LAYER_LSTM_H

#include "layer.h"

namespace ncnn {

// Long short-term memory layer with optional reverse/bidirectional
// processing, optional output projection and optional int8 weights.
class LSTM : public Layer
{
public:
    LSTM();

    // Reads num_output (id 0), weight_data_size (1), direction (2),
    // hidden_size (3, defaults to num_output) and int8_scale_term (8).
    virtual int load_param(const ParamDict& pd);

    // Loads weight_xc, bias_c, weight_hc, the optional weight_hr and the
    // optional int8 scales, in that order.
    virtual int load_model(const ModelBin& mb);

    // Single-input variant: the recurrent state starts at zero.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Multi-blob variant: with three inputs the initial hidden/cell state is
    // taken from bottom_blobs[1]/[2]; with three outputs the final state is
    // returned in top_blobs[1]/[2].
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

public:
    // values emitted per time step and direction
    int num_output;
    // total element count of weight_xc_data; used to derive the input size
    int weight_data_size;
    int direction; // 0=forward 1=reverse 2=bidirectional
    // width of the cell state; differs from num_output when a projection is used
    int hidden_size;

    // non-zero when the weights come with int8 quantization scales
    int int8_scale_term;

    // recurrent weights, loaded as (num_output, hidden_size * 4, num_directions)
    Mat weight_hc_data;
    // input weights, loaded as (size, hidden_size * 4, num_directions)
    Mat weight_xc_data;
    // gate biases, loaded as (hidden_size, 4, num_directions)
    Mat bias_c_data;
    // projection weights, loaded as (hidden_size, num_output, num_directions)
    // only when num_output != hidden_size
    Mat weight_hr_data;

#if NCNN_INT8
    // per-row scales, loaded as (hidden_size * 4, num_directions)
    Mat weight_hc_data_int8_scales;
    Mat weight_xc_data_int8_scales;
#endif
};

} // namespace ncnn

#endif // LAYER_LSTM_H


================================================
FILE: src/layer/matmul.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "matmul.h"

namespace ncnn {

MatMul::MatMul()
{
    // Two inputs (A and B) feed a freshly created output blob, so the
    // layer is neither in-place nor single-blob.
    support_inplace = false;
    one_blob_only = false;
}

// Reads the single layer parameter: id 0 = transB (default 0).
// Always returns 0.
int MatMul::load_param(const ParamDict& pd)
{
    const int trans_b_flag = pd.get(0, 0);
    transB = trans_b_flag;

    return 0;
}

// Writes the transpose of the densely stored w x h matrix X into XT.
// XT must already be allocated with at least X.w * X.h floats; both
// matrices are addressed as flat row-major float arrays.
static void transpose(const Mat& X, Mat& XT, const Option& opt)
{
    const int cols = X.w;
    const int rows = X.h;

    const float* src = X;
    float* dst = XT;

    // each source column becomes one contiguous destination row
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int c = 0; c < cols; c++)
    {
        float* dst_row = dst + c * rows;
        for (int r = 0; r < rows; r++)
        {
            dst_row[r] = src[r * cols + c];
        }
    }
}

// Computes top_blob = A * B^T for densely stored row-major matrices:
// A is A.h x A.w, B is B.h x B.w with B.w expected to equal A.w, and
// top_blob (pre-allocated) receives A.h rows of B.h values.
static void matmul_transb(const Mat& A, const Mat& B, Mat& top_blob, const Option& opt)
{
    const int rows_a = A.h;
    const int inner = A.w; // assert A.w == B.w
    const int rows_b = B.h;

    const float* a_data = A;
    const float* b_data = B;
    float* out_data = top_blob;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int m = 0; m < rows_a; m++)
    {
        const float* a_row = a_data + m * inner;
        float* out_row = out_data + m * rows_b;

        for (int n = 0; n < rows_b; n++)
        {
            const float* b_row = b_data + n * inner;

            // dot product of row m of A with row n of B
            float acc = 0.f;
            for (int k = 0; k < inner; k++)
            {
                acc += a_row[k] * b_row[k];
            }

            out_row[n] = acc;
        }
    }
}

// Multiplies bottom_blobs[0] (A) by bottom_blobs[1] (B) and stores the
// result in top_blobs[0], dispatching on the rank of both operands:
//   1D x 1D   dot product, single-element output
//   2D x 2D   plain matrix product
//   1D x 2D / 2D x 1D   the vector is viewed as a one-row matrix
//   1D x ND / ND x 1D   the vector is applied to every matrix of the batch
//   up to 3D / up to 4D batched product where a batch extent of 1 on one
//                       side is reused for every batch entry of the other
// With transB == 0 every B matrix is transposed into workspace memory
// first so that matmul_transb can walk both operands row by row; otherwise
// B is used as stored.
// Returns 0 on success, -100 on allocation failure and -1 for a rank
// combination that matches none of the branches.
int MatMul::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& A = bottom_blobs[0];
    const Mat& B = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int Adims = A.dims;
    const int Bdims = B.dims;
    const int max_ABdims = std::max(Adims, Bdims);
    // the output reuses A's element size
    const size_t elemsize = A.elemsize;

    if (Adims == 1 && Bdims == 1)
    {
        // dot product
        top_blob.create(1, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        const int K = A.w; // assert A.w == B.w
        const float* ptrA = A;
        const float* ptrB = B;

        float sum = 0.f;
        for (int k = 0; k < K; k++)
        {
            sum += ptrA[k] * ptrB[k];
        }

        top_blob[0] = sum;
    }
    else if (Adims == 2 && Bdims == 2)
    {
        // matrix multiply
        const int M = A.h;
        const int N = transB == 0 ? B.w : B.h;

        top_blob.create(N, M, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // BT holds B with one row per output column
        Mat BT;
        if (transB == 0)
        {
            BT.create(B.h, B.w, elemsize, opt.workspace_allocator);
            if (BT.empty())
                return -100;

            transpose(B, BT, opt);
        }
        else
        {
            BT = B;
        }

        matmul_transb(A, BT, top_blob, opt);
    }
    else if (Adims == 1 && Bdims == 2)
    {
        // matrix multiply
        const int N = transB == 0 ? B.w : B.h;

        Mat top_blob1(N, 1, elemsize, opt.blob_allocator);
        if (top_blob1.empty())
            return -100;

        // view the vector as a single-row matrix
        Mat A1 = A.reshape(A.w, 1);

        Mat BT;
        if (transB == 0)
        {
            BT.create(B.h, B.w, elemsize, opt.workspace_allocator);
            if (BT.empty())
                return -100;

            transpose(B, BT, opt);
        }
        else
        {
            BT = B;
        }

        matmul_transb(A1, BT, top_blob1, opt);

        // drop the helper row dimension again
        top_blob = top_blob1.reshape(N);
    }
    else if (Adims == 2 && Bdims == 1)
    {
        // matrix multiply
        const int M = A.h;

        Mat top_blob1(1, M, elemsize, opt.blob_allocator);
        if (top_blob1.empty())
            return -100;

        // a vector B is already in the row layout matmul_transb expects,
        // so transB is not consulted here
        Mat BT = B.reshape(B.w, 1);

        matmul_transb(A, BT, top_blob1, opt);

        top_blob = top_blob1.reshape(M);
    }
    else if (Adims == 1 && Bdims > 2)
    {
        // batched matrix multiply
        const int N = transB == 0 ? B.w : B.h;
        const int batch_size = B.d * B.c;

        Mat top_blob1(N, 1, batch_size, elemsize, opt.blob_allocator);
        if (top_blob1.empty())
            return -100;

        Mat A1 = A.reshape(A.w, 1);
        // flatten the d and c batch axes of B into a single channel axis
        Mat B1 = B.reshape(B.w, B.h, batch_size);

        for (int p = 0; p < batch_size; p++)
        {
            Mat BT;
            if (transB == 0)
            {
                BT.create(B.h, B.w, elemsize, opt.workspace_allocator);
                if (BT.empty())
                    return -100;

                transpose(B1.channel(p), BT, opt);
            }
            else
            {
                BT = B1.channel(p);
            }

            Mat top_blob1_p = top_blob1.channel(p);
            matmul_transb(A1, BT, top_blob1_p, opt);
        }

        // restore the batch axes of B on the output
        if (Bdims == 3)
            top_blob = top_blob1.reshape(N, B.d * B.c);
        else
            top_blob = top_blob1.reshape(N, B.d, B.c);
    }
    else if (Adims > 2 && Bdims == 1)
    {
        // batched matrix multiply
        const int M = A.h;
        const int batch_size = A.d * A.c;

        Mat top_blob1(1, M, batch_size, elemsize, opt.blob_allocator);
        if (top_blob1.empty())
            return -100;

        // flatten the d and c batch axes of A into a single channel axis
        Mat A1 = A.reshape(A.w, A.h, batch_size);
        Mat BT = B.reshape(B.w, 1);

        for (int p = 0; p < batch_size; p++)
        {
            Mat top_blob1_p = top_blob1.channel(p);
            matmul_transb(A1.channel(p), BT, top_blob1_p, opt);
        }

        // restore the batch axes of A on the output
        if (Adims == 3)
            top_blob = top_blob1.reshape(M, A.d * A.c);
        else
            top_blob = top_blob1.reshape(M, A.d, A.c);
    }
    else if (max_ABdims == 3)
    {
        // promote a 2D operand to a batch of one
        Mat A1 = Adims == 2 ? A.reshape(A.w, A.h, 1) : A;
        Mat B1 = Bdims == 2 ? B.reshape(B.w, B.h, 1) : B;

        const int M = A1.h;
        const int N = transB == 0 ? B1.w : B1.h;
        const int batch_size = std::max(A1.c, B1.c);

        top_blob.create(N, M, batch_size, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // a single B matrix is prepared once and shared by every batch entry
        Mat BT0;
        if (B1.c == 1)
        {
            if (transB == 0)
            {
                BT0.create(B1.h, B1.w, elemsize, opt.workspace_allocator);
                if (BT0.empty())
                    return -100;

                transpose(B1.channel(0), BT0, opt);
            }
            else
            {
                BT0 = B1.channel(0);
            }
        }

        for (int p = 0; p < batch_size; p++)
        {
            // an operand with a batch of one always contributes entry 0
            int Ap = A1.c == 1 ? 0 : p;
            int Bp = B1.c == 1 ? 0 : p;

            Mat BT;
            if (B1.c == 1)
            {
                BT = BT0;
            }
            else
            {
                if (transB == 0)
                {
                    BT.create(B1.h, B1.w, elemsize, opt.workspace_allocator);
                    if (BT.empty())
                        return -100;

                    transpose(B1.channel(Bp), BT, opt);
                }
                else
                {
                    BT = B1.channel(Bp);
                }
            }

            Mat top_blob_p = top_blob.channel(p);
            matmul_transb(A1.channel(Ap), BT, top_blob_p, opt);
        }
    }
    else if (max_ABdims == 4)
    {
        // a 3D operand has its channel axis moved to d and gets c = 1
        Mat A1 = Adims == 3 ? A.reshape(A.w, A.h, A.c, 1) : A;
        Mat B1 = Bdims == 3 ? B.reshape(B.w, B.h, B.c, 1) : B;

        const int M = A1.h;
        const int N = transB == 0 ? B1.w : B1.h;
        const int batch_size_d = std::max(A1.d, B1.d);
        const int batch_size_c = std::max(A1.c, B1.c);

        top_blob.create(N, M, batch_size_d, batch_size_c, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // B has a single matrix overall: prepare it once
        Mat BT00;
        if (B1.d == 1 && B1.c == 1)
        {
            if (transB == 0)
            {
                BT00.create(B1.h, B1.w, elemsize, opt.workspace_allocator);
                if (BT00.empty())
                    return -100;

                transpose(B1.channel(0).depth(0), BT00, opt);
            }
            else
            {
                BT00 = B1.channel(0).depth(0);
            }
        }

        for (int p = 0; p < batch_size_c; p++)
        {
            int Ap = A1.c == 1 ? 0 : p;
            int Bp = B1.c == 1 ? 0 : p;

            // B has one matrix per channel: prepare it once per channel
            Mat BT0x;
            if (B1.d == 1 && B1.c != 1)
            {
                if (transB == 0)
                {
                    BT0x.create(B1.h, B1.w, elemsize, opt.workspace_allocator);
                    if (BT0x.empty())
                        return -100;

                    transpose(B1.channel(Bp).depth(0), BT0x, opt);
                }
                else
                {
                    BT0x = B1.channel(Bp).depth(0);
                }
            }

            for (int q = 0; q < batch_size_d; q++)
            {
                int Ad = A1.d == 1 ? 0 : q;
                int Bd = B1.d == 1 ? 0 : q;

                // pick the cached matrix when B does not vary along d,
                // otherwise prepare the matrix for this (channel, depth)
                Mat BT;
                if (B1.d == 1 && B1.c == 1)
                {
                    BT = BT00;
                }
                else if (B1.d == 1 && B1.c != 1)
                {
                    BT = BT0x;
                }
                else
                {
                    if (transB == 0)
                    {
                        BT.create(B1.h, B1.w, elemsize, opt.workspace_allocator);
                        if (BT.empty())
                            return -100;

                        transpose(B1.channel(Bp).depth(Bd), BT, opt);
                    }
                    else
                    {
                        BT = B1.channel(Bp).depth(Bd);
                    }
                }

                Mat top_blob_p_q = top_blob.channel(p).depth(q);
                matmul_transb(A1.channel(Ap).depth(Ad), BT, top_blob_p_q, opt);
            }
        }
    }
    else
    {
        NCNN_LOGE("impossible matmul %d %d", Adims, Bdims);
        return -1;
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/matmul.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_MATMUL_H
#define LAYER_MATMUL_H

#include "layer.h"

namespace ncnn {

// Matrix multiplication layer operating on blobs (optionally batched over the
// channel and depth axes, see forward() in matmul.cpp).
class MatMul : public Layer
{
public:
    MatMul();

    // Reads the layer parameters from pd.
    virtual int load_param(const ParamDict& pd);

    // Multiplies the input blobs and stores the product in top_blobs.
    // Returns 0 on success, -100 when a workspace allocation fails and -1 for
    // a combination of input dims that is not handled.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

public:
    // 0: the B operand is transposed into a temporary before matmul_transb() is called.
    // non-zero: the B operand is handed to matmul_transb() unchanged.
    int transB;
};

} // namespace ncnn

#endif // LAYER_MATMUL_H


================================================
FILE: src/layer/memorydata.cpp
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "memorydata.h"

namespace ncnn {

// The layer only produces data (its forward() ignores bottom_blobs), so it uses
// the multi-blob forward signature and never runs in place.
MemoryData::MemoryData()
{
    support_inplace = false;
    one_blob_only = false;
}

// Reads the blob shape and the load type from the parameter dictionary.
// Always returns 0.
int MemoryData::load_param(const ParamDict& pd)
{
    // shape of the stored blob; every dimension defaults to 0
    // (param ids: 0 = w, 1 = h, 2 = c, 11 = d)
    w = pd.get(0, 0);
    h = pd.get(1, 0);
    c = pd.get(2, 0);
    d = pd.get(11, 0);

    // passed through to ModelBin::load() by load_model(); defaults to 1
    load_type = pd.get(21, 1);

    return 0;
}

// Loads the constant blob from the model binary using the shape read by
// load_param(). Returns 0 on success and -100 when the resulting blob is empty.
int MemoryData::load_model(const ModelBin& mb)
{
    // The highest non-zero dimension (checked in the order d, c, h, w)
    // decides which ModelBin::load() overload is used.
    int rank = 0;
    if (d != 0)
        rank = 4;
    else if (c != 0)
        rank = 3;
    else if (h != 0)
        rank = 2;
    else if (w != 0)
        rank = 1;

    switch (rank)
    {
    case 4:
        data = mb.load(w, h, d, c, load_type);
        break;
    case 3:
        data = mb.load(w, h, c, load_type);
        break;
    case 2:
        data = mb.load(w, h, load_type);
        break;
    case 1:
        data = mb.load(w, load_type);
        break;
    default:
        // no shape given at all: allocate a one-element blob without reading the model
        data.create(1);
        break;
    }

    return data.empty() ? -100 : 0;
}

// Writes a copy of the stored blob, allocated with opt.blob_allocator, into
// top_blobs[0]. Returns 0 on success and -100 when the copy is empty.
int MemoryData::forward(const std::vector<Mat>& /*bottom_blobs*/, std::vector<Mat>& top_blobs, const Option& opt) const
{
    top_blobs[0] = data.clone(opt.blob_allocator);

    return top_blobs[0].empty() ? -100 : 0;
}

} // namespace ncnn


================================================
FILE: src/layer/memorydata.h
================================================
// Copyright 2017 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_MEMORYDATA_H
#define LAYER_MEMORYDATA_H

#include "layer.h"

namespace ncnn {

// Layer that outputs a constant blob loaded from the model binary.
class MemoryData : public Layer
{
public:
    MemoryData();

    // Reads w/h/d/c and load_type from pd.
    virtual int load_param(const ParamDict& pd);

    // Loads `data` from mb according to the shape read by load_param().
    virtual int load_model(const ModelBin& mb);

    // Clones `data` into top_blobs[0]; bottom_blobs is not used.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

public:
    // Shape of the stored blob (param ids 0, 1, 11 and 2); 0 means the dimension was not given.
    int w;
    int h;
    int d;
    int c;
    // Type argument forwarded to ModelBin::load() (param id 21, default 1).
    int load_type;

    // The constant blob filled in by load_model().
    Mat data;
};

} // namespace ncnn

#endif // LAYER_MEMORYDATA_H


================================================
FILE: src/layer/mips/absval_mips.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/mips/absval_mips.h
================================================
// Copyright 2019 Leo <leo@nullptr.com.cn>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_ABSVAL_MIPS_H
#define LAYER_ABSVAL_MIPS_H

#include "absval.h"

namespace ncnn {

// MIPS-specific variant of the AbsVal layer; it overrides the in-place forward pass.
class AbsVal_mips : public AbsVal
{
public:
    AbsVal_mips();

    // Processes bottom_top_blob in place (implementation in absval_mips.cpp).
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_ABSVAL_MIPS_H


================================================
FILE: src/layer/mips/batchnorm_mips.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "batchnorm_mips.h"

#if __mips_msa
#include <msa.h>
#endif // __mips_msa

#include "mips_usability.h"

namespace ncnn {

// Advertises packed-layout support only when the file is built with MSA
// enabled; forward_inplace() handles elempack == 4 in its MSA code paths.
BatchNorm_mips::BatchNorm_mips()
{
#if __mips_msa
    support_packing = true;
#endif // __mips_msa
}

// Applies x = b * x + a in place, with a/b taken from a_data/b_data.
// - dims == 1: one coefficient pair per element
// - dims == 2: one coefficient pair per row (per packed row when elempack == 4)
// - dims == 3 / 4: one coefficient pair per channel (per packed channel when elempack == 4)
// MSA builds process four floats at a time and finish with a scalar tail.
// Always returns 0.
int BatchNorm_mips::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int dims = bottom_top_blob.dims;
    const int elempack = bottom_top_blob.elempack;

    if (dims == 1)
    {
        const int total = bottom_top_blob.w * elempack;
        float* base = bottom_top_blob;

#if __mips_msa
        // whole groups of four go through the vector loop, the rest through the scalar loop
        const int quads = total / 4;
        const int tail_begin = quads * 4;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int k = 0; k < quads; k++)
        {
            const int offset = k * 4;

            v4f32 _x = (v4f32)__msa_ld_w(base + offset, 0);
            v4f32 _a = (v4f32)__msa_ld_w((const float*)a_data + offset, 0);
            v4f32 _b = (v4f32)__msa_ld_w((const float*)b_data + offset, 0);
            _x = __msa_fmadd_w(_a, _x, _b);
            __msa_st_w((v4i32)_x, base + offset, 0);
        }
#else
        const int tail_begin = 0;
#endif // __mips_msa

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int k = tail_begin; k < total; k++)
        {
            base[k] = b_data[k] * base[k] + a_data[k];
        }
    }
    else if (dims == 2)
    {
        const int row_len = bottom_top_blob.w * elempack;
        const int rows = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < rows; y++)
        {
            float* line = bottom_top_blob.row(y);
            const float a = a_data[y];
            const float b = b_data[y];

            int x = 0;
#if __mips_msa
            // packed rows carry four distinct coefficients, unpacked rows a single broadcast one
            v4f32 _a = elempack == 4 ? (v4f32)__msa_ld_w((const float*)a_data + y * 4, 0) : (v4f32)__msa_fill_w_f32(a);
            v4f32 _b = elempack == 4 ? (v4f32)__msa_ld_w((const float*)b_data + y * 4, 0) : (v4f32)__msa_fill_w_f32(b);
            for (; x + 3 < row_len; x += 4)
            {
                __builtin_prefetch(line + x + 16);
                v4f32 _v = (v4f32)__msa_ld_w(line + x, 0);
                _v = __msa_fmadd_w(_a, _v, _b);
                __msa_st_w((v4i32)_v, line + x, 0);
            }
#endif // __mips_msa
            for (; x < row_len; x++)
            {
                line[x] = b * line[x] + a;
            }
        }
    }
    else if (dims == 3 || dims == 4)
    {
        const int nch = bottom_top_blob.c;
        const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * elempack;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < nch; q++)
        {
            float* plane = bottom_top_blob.channel(q);
            const float a = a_data[q];
            const float b = b_data[q];

            int k = 0;
#if __mips_msa
            v4f32 _a = elempack == 4 ? (v4f32)__msa_ld_w((const float*)a_data + q * 4, 0) : (v4f32)__msa_fill_w_f32(a);
            v4f32 _b = elempack == 4 ? (v4f32)__msa_ld_w((const float*)b_data + q * 4, 0) : (v4f32)__msa_fill_w_f32(b);
            for (; k + 3 < size; k += 4)
            {
                __builtin_prefetch(plane + k + 16);
                v4f32 _v = (v4f32)__msa_ld_w(plane + k, 0);
                _v = __msa_fmadd_w(_a, _v, _b);
                __msa_st_w((v4i32)_v, plane + k, 0);
            }
#endif // __mips_msa
            for (; k < size; k++)
            {
                plane[k] = b * plane[k] + a;
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/mips/batchnorm_mips.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_BATCHNORM_MIPS_H
#define LAYER_BATCHNORM_MIPS_H

#include "batchnorm.h"

namespace ncnn {

// MIPS-specific variant of the BatchNorm layer with an MSA-accelerated in-place forward pass.
class BatchNorm_mips : public BatchNorm
{
public:
    BatchNorm_mips();

    // Applies b_data * x + a_data to bottom_top_blob in place; returns 0.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_BATCHNORM_MIPS_H


================================================
FILE: src/layer/mips/bias_mips.cpp
================================================
// Copyright 2019 Leo <leo@nullptr.com.cn>
// SPDX-License-Identifier: BSD-3-Clause

#include "bias_mips.h"

#if __mips_msa
#include <msa.h>
#endif // __mips_msa

#include "mips_usability.h"

namespace ncnn {

// Adds bias_data[q] to every element of channel q, in place.
// MSA builds handle four floats per step and finish with a scalar tail.
// Always returns 0.
int Bias_mips::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d;

    const float* bias_ptr = bias_data;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);
        const float bias = bias_ptr[q];

        int i = 0;
#if __mips_msa
        // same bias in all four lanes
        v4f32 _bias = (v4f32)__msa_fill_w_f32(bias);
        for (; i + 3 < size; i += 4)
        {
            v4f32 _p = (v4f32)__msa_ld_w(ptr + i, 0);
            v4f32 _sum = __msa_fadd_w(_p, _bias);
            __msa_st_w((v4i32)_sum, ptr + i, 0);
        }
#endif // __mips_msa

        // scalar tail (the whole channel when MSA is not available)
        for (; i < size; i++)
        {
            ptr[i] = ptr[i] + bias;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/mips/bias_mips.h
================================================
// Copyright 2019 Leo <leo@nullptr.com.cn>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_BIAS_MIPS_H
#define LAYER_BIAS_MIPS_H

#include "bias.h"

namespace ncnn {

// MIPS-specific variant of the Bias layer with an MSA-accelerated in-place forward pass.
class Bias_mips : public Bias
{
public:
    // Adds the per-channel bias to bottom_top_blob in place; returns 0.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_BIAS_MIPS_H


================================================
FILE: src/layer/mips/binaryop_mips.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "binaryop_mips.h"

#if __mips_msa
#include <msa.h>
#include "msa_mathfun.h"
#endif // __mips_msa

namespace ncnn {

// Advertises packed-layout support only when the file is built with MSA
// enabled; the packed broadcast helpers in this file are MSA-only.
BinaryOp_mips::BinaryOp_mips()
{
#if __mips_msa
    support_packing = true;
#endif // __mips_msa
}

// outptr[i] = op(ptr[i], ptr1[i]) for i in [0, size).
// MSA builds process four floats per step and finish with a scalar tail.
template<typename Op>
static void binary_op_vector_no_broadcast(const float* ptr, const float* ptr1, float* outptr, int size)
{
    const Op op;

    int k = 0;
#if __mips_msa
    while (k + 3 < size)
    {
        __builtin_prefetch(ptr + k + 16);
        __builtin_prefetch(ptr1 + k + 16);
        v4f32 _lhs = (v4f32)__msa_ld_w(ptr + k, 0);
        v4f32 _rhs = (v4f32)__msa_ld_w(ptr1 + k, 0);
        v4f32 _res = op(_lhs, _rhs);
        __msa_st_w((v4i32)_res, outptr + k, 0);
        k += 4;
    }
#endif // __mips_msa
    while (k < size)
    {
        outptr[k] = op(ptr[k], ptr1[k]);
        k++;
    }
}

// outptr[i] = op(ptr[i], b) for i in [0, size), where b is the single element
// (or, for elempack == 4 in MSA builds, the single pack of four) at ptr1.
template<typename Op>
static void binary_op_vector_broadcast_b(const float* ptr, const float* ptr1, float* outptr, int size, int elempack)
{
    const Op op;

    const float rhs = *ptr1;

    int k = 0;
#if __mips_msa
    // pack4: the four lanes of b differ; pack1: replicate the scalar
    v4f32 _rhs = (elempack == 4) ? (v4f32)__msa_ld_w(ptr1, 0) : __msa_fill_w_f32(rhs);
    while (k + 3 < size)
    {
        __builtin_prefetch(ptr + k + 16);
        v4f32 _lhs = (v4f32)__msa_ld_w(ptr + k, 0);
        v4f32 _res = op(_lhs, _rhs);
        __msa_st_w((v4i32)_res, outptr + k, 0);
        k += 4;
    }
#endif // __mips_msa
    while (k < size)
    {
        outptr[k] = op(ptr[k], rhs);
        k++;
    }
}

// outptr[i] = op(a, ptr1[i]) for i in [0, size), where a is the single element
// (or, for elempack == 4 in MSA builds, the single pack of four) at ptr.
template<typename Op>
static void binary_op_vector_broadcast_a(const float* ptr, const float* ptr1, float* outptr, int size, int elempack)
{
    const Op op;

    const float lhs = *ptr;

    int k = 0;
#if __mips_msa
    // pack4: the four lanes of a differ; pack1: replicate the scalar
    v4f32 _lhs = (elempack == 4) ? (v4f32)__msa_ld_w(ptr, 0) : __msa_fill_w_f32(lhs);
    while (k + 3 < size)
    {
        __builtin_prefetch(ptr1 + k + 16);
        v4f32 _rhs = (v4f32)__msa_ld_w(ptr1 + k, 0);
        v4f32 _res = op(_lhs, _rhs);
        __msa_st_w((v4i32)_res, outptr + k, 0);
        k += 4;
    }
#endif // __mips_msa
    while (k < size)
    {
        outptr[k] = op(lhs, ptr1[k]);
        k++;
    }
}

// Packed a against unpacked b: for each of the w positions, the four lanes of
// a at that position are combined with the single b value of that position.
// Only implemented for elempack == 4 in MSA builds; otherwise nothing is written.
template<typename Op>
static void binary_op_vector_broadcast_pb(const float* ptr, const float* ptr1, float* outptr, int w, int elempack)
{
    const Op op;

#if __mips_msa
    if (elempack != 4)
        return;

    for (int x = 0; x < w; x++)
    {
        const float* src = ptr + x * 4;

        __builtin_prefetch(src + 16);
        v4f32 _lhs = (v4f32)__msa_ld_w(src, 0);
        v4f32 _rhs = __msa_fill_w_f32(ptr1[x]);
        v4f32 _res = op(_lhs, _rhs);
        __msa_st_w((v4i32)_res, outptr + x * 4, 0);
    }
#endif // __mips_msa
}

// Packed a against one unpacked b value: every lane of the w * elempack floats
// of a is combined with *ptr1. Implemented for MSA builds only and only in
// whole groups of four floats; there is no scalar tail.
template<typename Op>
static void binary_op_vector_broadcast_pb_b(const float* ptr, const float* ptr1, float* outptr, int w, int elempack)
{
    const Op op;

    const int size = w * elempack;

    int k = 0;
#if __mips_msa
    v4f32 _rhs = __msa_fill_w_f32(*ptr1);
    while (k + 3 < size)
    {
        __builtin_prefetch(ptr + k + 16);
        v4f32 _lhs = (v4f32)__msa_ld_w(ptr + k, 0);
        v4f32 _res = op(_lhs, _rhs);
        __msa_st_w((v4i32)_res, outptr + k, 0);
        k += 4;
    }
#endif // __mips_msa
}

// One packed a element against w unpacked b values: the four lanes loaded once
// from ptr are combined with each b value in turn, producing w packs of four.
// Only implemented for elempack == 4 in MSA builds; otherwise nothing is written.
template<typename Op>
static void binary_op_vector_broadcast_pb_a(const float* ptr, const float* ptr1, float* outptr, int w, int elempack)
{
    const Op op;

#if __mips_msa
    if (elempack != 4)
        return;

    const v4f32 _lhs = (v4f32)__msa_ld_w(ptr, 0);
    for (int x = 0; x < w; x++)
    {
        v4f32 _rhs = __msa_fill_w_f32(ptr1[x]);
        v4f32 _res = op(_lhs, _rhs);
        __msa_st_w((v4i32)_res, outptr + x * 4, 0);
    }
#endif // __mips_msa
}

// Picks the row kernel matching the widths (aw, bw) and element packings
// (ap, bp) of the two operands. Kernels for equal packing are tried first, then
// the ones for an unpacked b. Combinations matching no kernel are silently
// ignored; callers are expected never to produce them.
template<typename Op>
static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr, int aw, int bw, int ap, int bp)
{
    const int w = std::max(aw, bw);
    const int elempack = std::max(ap, bp);
    const int size = w * elempack;

    const bool same_pack = ap == bp;
    const bool same_width = aw == bw;
    const bool b_unpacked = bp == 1;

    if (same_pack && same_width)
    {
        // no broadcast
        binary_op_vector_no_broadcast<Op>(ptr, ptr1, outptr, size);
    }
    else if (same_pack && bw == 1)
    {
        // broadcast single b
        binary_op_vector_broadcast_b<Op>(ptr, ptr1, outptr, size, elempack);
    }
    else if (same_pack && aw == 1)
    {
        // broadcast single a
        binary_op_vector_broadcast_a<Op>(ptr, ptr1, outptr, size, elempack);
    }
    else if (b_unpacked && same_width)
    {
        // broadcast pack1 b
        binary_op_vector_broadcast_pb<Op>(ptr, ptr1, outptr, w, elempack);
    }
    else if (b_unpacked && bw == 1)
    {
        // broadcast pack1 single b
        binary_op_vector_broadcast_pb_b<Op>(ptr, ptr1, outptr, w, elempack);
    }
    else if (b_unpacked && aw == 1)
    {
        // broadcast single a and pack1 b
        binary_op_vector_broadcast_pb_a<Op>(ptr, ptr1, outptr, w, elempack);
    }

    // any other combination: shall never reach here
}

// Replaces every element x of a with op(x, b), channel by channel.
// MSA builds process four floats per step and finish with a scalar tail.
// Always returns 0.
template<typename Op>
static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt)
{
    Op op;

    const int nch = a.c;
    const int count = a.w * a.h * a.d * a.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < nch; q++)
    {
        float* plane = a.channel(q);

        int k = 0;
#if __mips_msa
        v4f32 _rhs = __msa_fill_w_f32(b);
        while (k + 3 < count)
        {
            __builtin_prefetch(plane + k + 16);
            v4f32 _lhs = (v4f32)__msa_ld_w(plane + k, 0);
            _lhs = op(_lhs, _rhs);
            __msa_st_w((v4i32)_lhs, plane + k, 0);
            k += 4;
        }
#endif // __mips_msa
        while (k < count)
        {
            plane[k] = op(plane[k], b);
            k++;
        }
    }

    return 0;
}

// Stateless functors used as the Op template argument of the binary_op_*
// kernels in this file. Each functor implements one binary operation.
namespace BinaryOp_mips_functor {

// MAKE_FUNCTION(NAME, IMPL, IMPL4) defines struct NAME with
//   - a scalar operator()(x, y) returning IMPL, and
//   - in MSA builds only, a v4f32 operator()(x, y) returning IMPL4.
// Both expressions refer to their operands as x and y. Without MSA the IMPL4
// argument is accepted but not expanded.
#if __mips_msa
#define MAKE_FUNCTION(NAME, IMPL, IMPL4)                       \
    struct NAME                                                \
    {                                                          \
        float operator()(const float& x, const float& y) const \
        {                                                      \
            return IMPL;                                       \
        }                                                      \
        v4f32 operator()(const v4f32& x, const v4f32& y) const \
        {                                                      \
            return IMPL4;                                      \
        }                                                      \
    };
#else
#define MAKE_FUNCTION(NAME, IMPL, IMPL4)                       \
    struct NAME                                                \
    {                                                          \
        float operator()(const float& x, const float& y) const \
        {                                                      \
            return IMPL;                                       \
        }                                                      \
    };
#endif // __mips_msa

// The binary_op_r* variants apply the same operation with x and y swapped.
// clang-format off
// *INDENT-OFF*
MAKE_FUNCTION(binary_op_add, x + y, __msa_fadd_w(x, y))
MAKE_FUNCTION(binary_op_sub, x - y, __msa_fsub_w(x, y))
MAKE_FUNCTION(binary_op_mul, x * y, __msa_fmul_w(x, y))
MAKE_FUNCTION(binary_op_div, x / y, __msa_fdiv_w(x, y))
MAKE_FUNCTION(binary_op_max, std::max(x, y), __msa_fmax_w(x, y))
MAKE_FUNCTION(binary_op_min, std::min(x, y), __msa_fmin_w(x, y))
MAKE_FUNCTION(binary_op_pow, (float)powf(x, y), pow_ps(x, y))
MAKE_FUNCTION(binary_op_rsub, y - x, __msa_fsub_w(y, x))
MAKE_FUNCTION(binary_op_rdiv, y / x, __msa_fdiv_w(y, x))
MAKE_FUNCTION(binary_op_rpow, (float)powf(y, x), pow_ps(y, x))
MAKE_FUNCTION(binary_op_atan2, (float)atan2f(x, y), atan2_ps(x, y))
MAKE_FUNCTION(binary_op_ratan2, (float)atan2f(y, x), atan2_ps(y, x))
MAKE_FUNCTION(binary_op_fmod, (float)fmodf(x, y), fmod_ps(x, y))
MAKE_FUNCTION(binary_op_rfmod, (float)fmodf(y, x), fmod_ps(y, x))
MAKE_FUNCTION(binary_op_logaddexp, (float)(std::max(x, y) + log1pf(expf(std::min(x, y) - std::max(x, y)))), logaddexp_ps(x, y))
MAKE_FUNCTION(binary_op_floor_divide, (float)floorf(x / y), floor_divide_ps(x, y))
MAKE_FUNCTION(binary_op_rfloor_divide, (float)floorf(y / x), floor_divide_ps(y, x))
MAKE_FUNCTION(binary_op_remainder, (float)remainderf(x, y), remainder_ps(x, y))
MAKE_FUNCTION(binary_op_rremainder, (float)remainderf(y, x), remainder_ps(y, x))
// *INDENT-ON*
// clang-format on

// keep the helper macro local to this namespace block
#undef MAKE_FUNCTION

} // namespace BinaryOp_mips_functor

static void binary_op_vector(const float* ptr, const float* ptr1, float* outptr, int aw, int bw, int ap, int bp, int op_type)
{
    using namespace BinaryOp_mips_functor;

    if (op_type == BinaryOp::Operation_ADD) return binary_op_vector<binary_op_add>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_SUB) return binary_op_vector<binary_op_sub>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_MUL) return binary_op_vector<binary_op_mul>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_DIV) return binary_op_vector<binary_op_div>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_MAX) return binary_op_vector<binary_op_max>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_MIN) return binary_op_vector<binary_op_min>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_POW) return binary_op_vector<binary_op_pow>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RSUB) return binary_op_vector<binary_op_rsub>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RDIV) return binary_op_vector<binary_op_rdiv>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RPOW) return binary_op_vector<binary_op_rpow>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_ATAN2) return binary_op_vector<binary_op_atan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RATAN2) return binary_op_vector<binary_op_ratan2>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_FMOD) return binary_op_vector<binary_op_fmod>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RFMOD) return binary_op_vector<binary_op_rfmod>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_LOGADDEXP) return binary_op_vector<binary_op_logaddexp>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_FLOOR_DIVIDE) return binary_op_vector<binary_op_floor_divide>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RFLOOR_DIVIDE) return binary_op_vector<binary_op_rfloor_divide>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_REMAINDER) return binary_op_vector<binary_op_remainder>(ptr, ptr1, outptr, aw, bw, ap, bp);
    if (op_type == BinaryOp::Operation_RREMAINDER) return binary_op_vector<binary_op_rremainder>(ptr, ptr1, outptr, aw, bw, ap, bp);

    // should never reach here
}

// c = a (op) b for a scalar b. Each channel of a is treated as one flat run of
// unpacked floats so the "broadcast single b" row kernel handles it.
static void binary_op_scalar(const Mat& a, float b, Mat& c, int op_type, const Option& opt)
{
    const int floats_per_channel = a.w * a.h * a.d * a.elempack;
    const int nch = a.c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int ch = 0; ch < nch; ch++)
    {
        const float* src = a.channel(ch);
        float* dst = c.channel(ch);

        // widths (n, 1), packings (1, 1)
        binary_op_vector(src, &b, dst, floats_per_channel, 1, 1, 1, op_type);
    }
}

// c = a (op) b for operands of identical layout. Each channel is treated as one
// flat run of unpacked floats and handled by the element-wise row kernel.
static void binary_op_no_broadcast(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt)
{
    const int floats_per_channel = a.w * a.h * a.d * a.elempack;
    const int nch = a.c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int ch = 0; ch < nch; ch++)
    {
        const float* lhs = a.channel(ch);
        const float* rhs = b.channel(ch);
        float* dst = c.channel(ch);

        // equal widths, packings (1, 1)
        binary_op_vector(lhs, rhs, dst, floats_per_channel, floats_per_channel, 1, 1, op_type);
    }
}

static void binary_op_broadcast(const Mat& a, const Mat& b, Mat& c, int op_type, const Option& opt)
{
    if (b.w * b.h * b.d * b.c * b.elempack == 1)
    {
        return binary_op_scalar(a, b[0], c, op_type, opt);
    }

    if (a.dims == b.dims && a.w == b.w && a.h == b.h && a.d == b.d && a.c == b.c && a.elempack == b.elempack)
    {
        return binary_op_no_broadcast(a, b, c, op_type, opt);
    }

    const int dims = c.dims;

    if (dims == 2)
    {
        const int h = c.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int y = 0; y < h; y++)
        {
            const int y0 = std::min(y, a.h - 1);
            const int y1 = std::min(y, b.h - 1);

            const float* ptr = a.row(y0);
            const float* ptr1 = b.row(y1);
            float* outptr = c.row(y);

            binary_op_vector(ptr, ptr1, outptr, a.w, b.w, a.elempack, b.elempack, op_type);
        }
    }

    if (dims == 3 || dims == 4)
    {
        const int channels = c.c;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const int q0 = std::min(q, a.c - 1);
            const int q1 = std::min(q, b.c - 1);

            if (b.d * b.h * b.w == 1)
            {
                const float* ptr = a.channel(q0);
                const float* ptr1 = b.channel(q1);
                float* outptr = c.channel(q);

                binary_op_vector(ptr, ptr1, outptr, a.w * a.h * a.d, 1, a.elempack, b.elempack, op_type);
                continue;
            }

            if (b.h * b.w == 1)
            {
                for (int z = 0; z < c.d; z++)
                {
                    const int z0 = std::min(z, a.d - 1);
                    const int z1 = std::min(z, b.d - 1);

                    const float* ptr = a.channel(q0).depth(z0);
                    const float* ptr1 = b.channel(q1).depth(z1);
                    float* outptr = c.channel(q).depth(z);

                    binary_op_vector(ptr, ptr1, outptr, a.w * a.h, 1, a.elempack, b.elempack, op_type);
                }
                continue;
            }

            for (int z = 0; z < c.d; z++)
            {
                const int z0 = std::min(z, a.d - 1);
                const int z1 = std::min(z, b.d - 1);

                for (int y = 0; y < c.h; y++)
                {
                    const int y0 = std::min(y, a.h - 1);
                    const int y1 = std::min(y, b.h - 1);

                    const float* ptr = a.channel(q0).depth(z0).row(y0);
                    const float* ptr1 = b.channel(q1).depth(z1).row(y1);
                    float* outptr = c.channel(q).depth(z).row(y);

                    binary_op_vector(ptr, ptr1, outptr, a.w, b.w, a.elempack, b.elempack, op_type);
                }
            }
        }
    }
}

// a = a (op) b for a scalar b, in place. Input and output pointers of the row
// kernel are the same; each channel is treated as one flat run of unpacked floats.
static void binary_op_scalar_inplace(Mat& a, float b, int op_type, const Option& opt)
{
    const int floats_per_channel = a.w * a.h * a.d * a.elempack;
    const int nch = a.c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int ch = 0; ch < nch; ch++)
    {
        float* plane = a.channel(ch);

        // widths (n, 1), packings (1, 1)
        binary_op_vector(plane, &b, plane, floats_per_channel, 1, 1, 1, op_type);
    }
}

// Returns the operation code that gives the same result once the two operands
// are swapped: X <-> RX for the order-sensitive operations listed below, and
// op_type itself for every other code.
static int get_reverse_op_type(int op_type)
{
    // {operation, operation with swapped operands}
    static const int swapped[][2] = {
        {BinaryOp::Operation_SUB, BinaryOp::Operation_RSUB},
        {BinaryOp::Operation_DIV, BinaryOp::Operation_RDIV},
        {BinaryOp::Operation_POW, BinaryOp::Operation_RPOW},
        {BinaryOp::Operation_ATAN2, BinaryOp::Operation_RATAN2},
        {BinaryOp::Operation_FMOD, BinaryOp::Operation_RFMOD},
        {BinaryOp::Operation_FLOOR_DIVIDE, BinaryOp::Operation_RFLOOR_DIVIDE},
        {BinaryOp::Operation_REMAINDER, BinaryOp::Operation_RREMAINDER},
    };
    const int pair_count = (int)(sizeof(swapped) / sizeof(swapped[0]));

    // forward direction first, then the reverse one
    for (int i = 0; i < pair_count; i++)
    {
        if (op_type == swapped[i][0])
            return swapped[i][1];
    }
    for (int i = 0; i < pair_count; i++)
    {
        if (op_type == swapped[i][1])
            return swapped[i][0];
    }

    return op_type;
}

int BinaryOp_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& A = bottom_blobs[0];
    const Mat& B = bottom_blobs[1];
    const int outdims = std::max(A.dims, B.dims);

    Mat A2 = A;
    Mat B2 = B;
    if (A.dims < outdims)
    {
        // expand inner axes
        if (outdims == 2)
        {
            if (A.w * A.elempack == B.h * B.elempack)
                A2 = A.reshape(1, A.w, opt.workspace_allocator);
            else // if (A.w == B.w)
            {
                A2.dims = 2;
                A2.w = A.w * A.elempack;
                A2.elempack = 1;
                A2.elemsize = A.elemsize / A.elempack;
                A2.cstep = A.cstep * A.elempack;
            }
        }
        if (outdims == 3 && A.dims == 1)
        {
            if (A.w * A.elempack == B.c * B.elempack)
                A2 = A.reshape(1, 1, A.w, opt.workspace_allocator);
            else // if (A.w == B.w)
            {
                A2.dims = 3;
                A2.w = A.w * A.elempack;
                A2.elempack = 1;
                A2.elemsize = A.elemsize / A.elempack;
                A2.cstep = A.cstep * A.elempack;
            }
        }
        if (outdims == 3 && A.dims == 2)
            A2 = A.reshape(1, A.w, A.h, opt.workspace_allocator);
        if (outdims == 4 && A.dims == 1)
        {
            if (A.w * A.elempack == B.c * B.elempack)
                A2 = A.reshape(1, 1, 1, A.w, opt.workspace_allocator);
            else // if (A.w == B.w)
            {
                A2.dims = 4;
                A2.w = A.w * A.elempack;
                A2.elempack = 1;
                A2.elemsize = A.elemsize / A.elempack;
                A2.cstep = A.cstep * A.elempack;
            }
        }
        if (outdims == 4 && A.dims == 2)
            A2 = A.reshape(1, 1, A.w, A.h, opt.workspace_allocator);
        if (outdims == 4 && A.dims == 3)
            A2 = A.reshape(1, A.w, A.h, A.c, opt.workspace_allocator);
    }
    if (B.dims < outdims)
    {
        // expand inner axes
        if (outdims == 2)
        {
            if (B.w * B.elempack == A.h * A.elempack)
                B2 = B.reshape(1, B.w, opt.workspace_allocator);
            else // if (B.w == A.w)
            {
                B2.dims = 2;
                B2.w = B.w * B.elempack;
                B2.elempack = 1;
                B2.elemsize = B.elemsize / B.elempack;
                B2.cstep = B.cstep * B.elempack;
            }
        }
        if (outdims == 3 && B.dims == 1)
        {
            if (B.w * B.elempack == A.c * A.elempack)
                B2 = B.reshape(1, 1, B.w, opt.workspace_allocator);
            else // if (B.w == A.w)
            {
                B2.dims = 3;
                B2.w = B.w * B.elempack;
                B2.elempack = 1;
                B2.elemsize = B.elemsize / B.elempack;
                B2.cstep = B.cstep * B.elempack;
            }
        }
        if (outdims == 3 && B.dims == 2)
            B2 = B.reshape(1, B.w, B.h, opt.workspace_allocator);
        if (outdims == 4 && B.dims == 1)
        {
            if (B.w * B.elempack == A.c * A.elempack)
                B2 = B.reshape(1, 1, 1, B.w, opt.workspace_allocator);
            else // if (B.w == A.w)
            {
                B2.dims = 4;
                B2.w = B.w * B.elempack;
                B2.elempack = 1;
                B2.elemsize = B.elemsize / B.elempack;
                B2.cstep = B.cstep * B.elempack;
            }
        }
        if (outdims == 4 && B.dims == 2)
            B2 = B.reshape(1, 1, B.w, B.h, opt.workspace_allocator);
        if (outdims == 4 && B.dims == 3)
            B2 = B.reshape(1, B.w, B.h, B.c, opt.workspace_allocator);
    }

    const int outw = std::max(A2.w, B2.w);
    const int outh = std::max(A2.h, B2.h);
    const int outd = std::max(A2.d, B2.d);
    const int outc = std::max(A2.c, B2.c);
    const size_t out_elemsize = std::max(A2.elemsize, B2.elemsize);
    const int out_elempack = std::max(A2.elempack, B2.elempack);

    Mat& top_blob = top_blobs[0];
    if (outdims == 1)
    {
        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (outdims == 2)
    {
        top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (outdims == 3)
    {
        top_blob.create(outw, outh, outc, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (outdims == 4)
    {
        top_blob.create(outw, outh, outd, outc, out_elemsize, out_elempack, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    const bool a_pack_is_lower = A2.elempack < B2.elempack;
    const bool a_pack_is_equal = A2.elempack == B2.elempack;
    const bool a_size_is_lower = A2.w * A2.h * A2.d * A2.c * A2.elempack < B2.w * B2.h * B2.d * B2.c * B2.elempack;
    if (a_pack_is_lower || (a_pack_is_equal && a_size_is_lower))
    {
        binary_op_broadcast(B2, A2, top_blob, get_reverse_op_type(op_type), opt);
    }
    else
    {
        binary_op_broadcast(A2, B2, top_blob, op_type, opt);
    }

    return 0;
}

// Single-input form: applies op_type between every element of bottom_top_blob
// and the scalar member b, in place. Always returns 0.
int BinaryOp_mips::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    binary_op_scalar_inplace(bottom_top_blob, b, op_type, opt);

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/mips/binaryop_mips.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_BINARYOP_MIPS_H
#define LAYER_BINARYOP_MIPS_H

#include "binaryop.h"

namespace ncnn {

// MIPS specialization of the BinaryOp layer.
// Overrides both forward entry points of the base class.
class BinaryOp_mips : public BinaryOp
{
public:
    BinaryOp_mips();

    // Two-input form: reads bottom_blobs and writes the result to top_blobs[0].
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

    // Scalar form: applies the op with the layer's scalar operand in place.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_BINARYOP_MIPS_H


================================================
FILE: src/layer/mips/cast_mips.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cast_mips.h"

#if __mips_msa
#include <msa.h>
#endif // __mips_msa

namespace ncnn {

// Constructor: unconditionally advertises packed-layout support
// (not gated on __mips_msa, unlike some sibling layers).
Cast_mips::Cast_mips()
{
    support_packing = true;
}

// Convert bottom_blob from storage type `type_from` to `type_to`.
//
// Type codes used below: 1 = float32, 2 = float16, 3 = int8, 4 = bfloat16.
//
// - When both types are equal, top_blob simply becomes bottom_blob.
// - Otherwise top_blob is allocated with the same shape and elempack as the
//   input and an element size matching `type_to`, then filled by whichever
//   conversion loop matches the (type_from, type_to) pair. Pairs without a
//   loop below leave the freshly created top_blob unfilled.
//
// Returns 0 on success, -100 if the output blob could not be allocated.
int Cast_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    if (type_from == type_to)
    {
        top_blob = bottom_blob;
        return 0;
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int d = bottom_blob.d;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // Bytes per packed element in the output; unchanged for unknown type_to.
    size_t out_elemsize = elemsize;
    if (type_to == 1)
    {
        // float32
        // NOTE: the int8 -> float32 case used to additionally call
        // Cast::forward() here and discard its result; the dedicated loop
        // further down performs that conversion, so the duplicate pass
        // has been dropped.
        out_elemsize = 4 * elempack;
    }
    else if (type_to == 2)
    {
        // float16
        out_elemsize = 2 * elempack;
    }
    else if (type_to == 3)
    {
        // int8
        out_elemsize = elempack;
    }
    else if (type_to == 4)
    {
        // bfloat16
        out_elemsize = 2 * elempack;
    }

    if (dims == 1)
    {
        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
    }
    else if (dims == 2)
    {
        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
    }
    else if (dims == 3)
    {
        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
    }
    else if (dims == 4)
    {
        top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator);
    }
    if (top_blob.empty())
        return -100;

    // Scalar count per channel, with packed lanes expanded.
    int size = w * h * d * elempack;

    // float32 -> float16
    if (type_from == 1 && type_to == 2)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            unsigned short* outptr = top_blob.channel(q);

            int i = 0;
#if __mips_msa
            // Narrow two float32x4 vectors into one half-precision x8 vector.
            for (; i + 7 < size; i += 8)
            {
                __builtin_prefetch(ptr + 16);
                v4f32 _p0 = (v4f32)__msa_ld_w(ptr, 0);
                v4f32 _p1 = (v4f32)__msa_ld_w(ptr + 4, 0);
                v8i16 _p = __msa_fexdo_h(_p1, _p0);
                __msa_st_h(_p, outptr, 0);

                ptr += 8;
                outptr += 8;
            }
#endif // __mips_msa
            // Scalar tail (and full path without MSA).
            for (; i < size; i++)
            {
                *outptr = float32_to_float16(*ptr);
                outptr++;
                ptr++;
            }
        }
    }

    // float16 -> float32
    if (type_from == 2 && type_to == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const unsigned short* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            int i = 0;
#if __mips_msa
            // Widen one half-precision x8 vector into two float32x4 vectors.
            for (; i + 7 < size; i += 8)
            {
                __builtin_prefetch(ptr + 16);
                v8i16 _p = __msa_ld_h(ptr, 0);
                v4f32 _p0 = __msa_fexupr_w(_p);
                v4f32 _p1 = __msa_fexupl_w(_p);
                __msa_st_w((v4i32)_p0, outptr, 0);
                __msa_st_w((v4i32)_p1, outptr + 4, 0);

                ptr += 8;
                outptr += 8;
            }
#endif // __mips_msa
            for (; i < size; i++)
            {
                *outptr = float16_to_float32(*ptr);
                outptr++;
                ptr++;
            }
        }
    }

    // int8 -> float32
    if (type_from == 3 && type_to == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const signed char* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < size; i++)
            {
                outptr[i] = (float)ptr[i];
            }
        }
    }

    // bfloat16 -> float32
    if (type_from == 4 && type_to == 1)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const unsigned short* ptr = bottom_blob.channel(q);
            float* outptr = top_blob.channel(q);

            int i = 0;
            for (; i < size; i++)
            {
                *outptr = bfloat16_to_float32(*ptr);
                outptr++;
                ptr++;
            }
        }
    }

    // float32 -> bfloat16
    if (type_from == 1 && type_to == 4)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            const float* ptr = bottom_blob.channel(q);
            unsigned short* outptr = top_blob.channel(q);

            int i = 0;
            for (; i < size; i++)
            {
                *outptr = float32_to_bfloat16(*ptr);
                outptr++;
                ptr++;
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/mips/cast_mips.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CAST_MIPS_H
#define LAYER_CAST_MIPS_H

#include "cast.h"

namespace ncnn {

// MIPS specialization of the Cast layer (element type conversion).
class Cast_mips : public Cast
{
public:
    Cast_mips();

    // Converts bottom_blob into top_blob according to the layer's
    // type_from / type_to settings.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_CAST_MIPS_H


================================================
FILE: src/layer/mips/clip_mips.cpp
================================================
// Copyright 2019 Leo <leo@nullptr.com.cn>
// SPDX-License-Identifier: BSD-3-Clause

#include "clip_mips.h"

#if __mips_msa
#include <msa.h>
#endif // __mips_msa

#include "mips_usability.h"

namespace ncnn {

// Constructor: packed-layout support is only advertised when the build
// targets MIPS MSA.
Clip_mips::Clip_mips()
{
#if __mips_msa
    support_packing = true;
#endif
}

// Clamp every float of the blob into [min, max], in place.
// Channels are processed in parallel; within a channel the MSA path handles
// four floats per step and a scalar loop finishes the remainder.
// Always returns 0.
int Clip_mips::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    const int channels = bottom_top_blob.c;

    // Float count per channel, packed lanes included.
    const int size = bottom_top_blob.w * bottom_top_blob.h * bottom_top_blob.d * bottom_top_blob.elempack;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        int i = 0;
#if __mips_msa
        // Broadcast both bounds once per channel.
        const v4f32 _upper = (v4f32)__msa_fill_w_f32(max);
        const v4f32 _lower = (v4f32)__msa_fill_w_f32(min);
        while (i + 3 < size)
        {
            __builtin_prefetch(ptr + 16);
            v4f32 _p = (v4f32)__msa_ld_w(ptr, 0);
            // Lower bound first, then upper bound, same as the scalar tail.
            _p = __msa_fmin_w(__msa_fmax_w(_p, _lower), _upper);
            __msa_st_w((v4i32)_p, ptr, 0);

            ptr += 4;
            i += 4;
        }
#endif // __mips_msa
        // Scalar remainder (whole channel when MSA is unavailable).
        while (i < size)
        {
            if (*ptr < min)
            {
                *ptr = min;
            }

            if (*ptr > max)
            {
                *ptr = max;
            }

            ++ptr;
            ++i;
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/mips/clip_mips.h
================================================
// Copyright 2019 Leo <leo@nullptr.com.cn>
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CLIP_MIPS_H
#define LAYER_CLIP_MIPS_H

#include "clip.h"

namespace ncnn {

// MIPS specialization of the Clip layer (clamps values to [min, max]).
class Clip_mips : public Clip
{
public:
    Clip_mips();

    // Clamps the blob's values in place.
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_CLIP_MIPS_H


================================================
FILE: src/layer/mips/concat_mips.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "concat_mips.h"

namespace ncnn {

// Constructor: packed-layout support is only advertised when the build
// targets MIPS MSA.
Concat_mips::Concat_mips()
{
#if __mips_msa
    support_packing = true;
#endif // __mips_msa
}

// Concatenate all bottom_blobs along `axis` into top_blobs[0].
//
// The rank is taken from bottom_blobs[0]; a negative axis counts from the
// end (axis + dims). Each (dims, positive_axis) combination has its own
// branch below:
//   - concatenation along the outermost axis (dims 1, dims 2 axis 0,
//     dims 3/4 axis 0) may change the pack size: the output uses pack 4 when
//     packing is enabled and the total extent is divisible by 4. Where inputs
//     have mixed pack sizes the data is first gathered at the smallest input
//     pack and then repacked with convert_packing.
//   - concatenation along an inner axis keeps elemsize/elempack of
//     bottom_blobs[0] and copies row/plane slices with memcpy.
//
// Returns 0 on success, -100 when an output or staging blob cannot be
// allocated.
int Concat_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    int dims = bottom_blobs[0].dims;
    int positive_axis = axis < 0 ? dims + axis : axis;

    if (dims == 1) // positive_axis == 0
    {
        // concat vector
        // total length
        // NOTE: elemsize/elempack come from the first input only in this branch.
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;
        int top_w = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w * bottom_blob.elempack;
        }

        // Use pack 4 only when packing is enabled and the total length allows it.
        int out_elempack = opt.use_packing_layout && top_w % 4 == 0 ? 4 : 1;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // Append each input's raw bytes back to back.
        float* outptr = top_blob;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            const float* ptr = bottom_blob;
            memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);

            outptr += bottom_blob.w * bottom_blob.elempack;
        }
    }

    if (dims == 2 && positive_axis == 0)
    {
        // concat image
        int w = bottom_blobs[0].w;

        // total height
        // elemsize/elempack end up as the minimum over all inputs.
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;
        int top_h = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            elemsize = std::min(elemsize, bottom_blob.elemsize);
            elempack = std::min(elempack, bottom_blob.elempack);
            top_h += bottom_blob.h * bottom_blob.elempack;
        }

        int out_elempack = opt.use_packing_layout && top_h % 4 == 0 ? 4 : 1;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // When the smallest input pack is below the output pack, gather into a
        // workspace blob at that smaller pack and repack at the end; otherwise
        // top_blob_unpacked is just top_blob itself.
        Mat top_blob_unpacked = top_blob;
        if (elempack < out_elempack)
        {
            top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
            if (top_blob_unpacked.empty())
                return -100;
        }

        float* outptr = top_blob_unpacked;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            if (bottom_blob.elempack == 4 && elempack == 1)
            {
                // Unpack: each pack-4 input row becomes four consecutive pack-1 rows.
                for (int i = 0; i < bottom_blob.h; i++)
                {
                    const float* r0 = bottom_blob.row(i);

                    float* outptr0 = outptr;
                    float* outptr1 = outptr + w;
                    float* outptr2 = outptr + w * 2;
                    float* outptr3 = outptr + w * 3;

                    for (int j = 0; j < w; j++)
                    {
                        *outptr0++ = r0[0];
                        *outptr1++ = r0[1];
                        *outptr2++ = r0[2];
                        *outptr3++ = r0[3];

                        r0 += 4;
                    }

                    outptr += w * 4;
                }
            }
            else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4)
            {
                // Same pack on both sides: copy the whole input in one go.
                int size = w * bottom_blob.h;

                const float* ptr = bottom_blob;
                memcpy(outptr, ptr, size * bottom_blob.elemsize);

                outptr += size * bottom_blob.elempack;
            }
        }

        // packing
        if (elempack < out_elempack)
        {
            convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
        }
    }

    if (dims == 2 && positive_axis == 1)
    {
        // interleave image row
        int h = bottom_blobs[0].h;
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;

        // total width
        int top_w = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // For every output row, append row i of each input in order.
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = 0; i < h; i++)
        {
            float* outptr = top_blob.row(i);
            for (size_t b = 0; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob = bottom_blobs[b];

                const float* ptr = bottom_blob.row(i);
                memcpy(outptr, ptr, bottom_blob.w * elemsize);

                outptr += bottom_blob.w * elempack;
            }
        }
    }

    if ((dims == 3 || dims == 4) && positive_axis == 0)
    {
        // concat dim
        int w = bottom_blobs[0].w;
        int h = bottom_blobs[0].h;
        int d = bottom_blobs[0].d;

        // total channels
        // elemsize/elempack end up as the minimum over all inputs.
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;
        int top_channels = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            elemsize = std::min(elemsize, bottom_blob.elemsize);
            elempack = std::min(elempack, bottom_blob.elempack);
            top_channels += bottom_blob.c * bottom_blob.elempack;
        }

        int out_elempack = opt.use_packing_layout && top_channels % 4 == 0 ? 4 : 1;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, h, d, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // Created through the 4-D overload; restore the input's rank.
        top_blob.dims = dims;

        // Staging blob at the smallest input pack, used only when repacking is needed.
        Mat top_blob_unpacked = top_blob;
        if (elempack < out_elempack)
        {
            top_blob_unpacked.create(w, h, d, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
            if (top_blob_unpacked.empty())
                return -100;

            top_blob_unpacked.dims = dims;
        }

        // p is the next output channel index in top_blob_unpacked.
        int p = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];

            if (bottom_blob.elempack == 4 && elempack == 1)
            {
                // Unpack: each pack-4 input channel becomes four pack-1 output channels.
                int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;

                for (int q = 0; q < bottom_blob.c; q++)
                {
                    const float* r0 = bottom_blob.channel(q);

                    float* outptr0 = top_blob_unpacked.channel(p);
                    float* outptr1 = top_blob_unpacked.channel(p + 1);
                    float* outptr2 = top_blob_unpacked.channel(p + 2);
                    float* outptr3 = top_blob_unpacked.channel(p + 3);

                    for (int i = 0; i < size; i++)
                    {
                        *outptr0++ = r0[0];
                        *outptr1++ = r0[1];
                        *outptr2++ = r0[2];
                        *outptr3++ = r0[3];

                        r0 += 4;
                    }

                    p += 4;
                }
            }
            else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4)
            {
                // Same pack on both sides: copy the entire input starting at channel p.
                int size = bottom_blob.total();

                const float* ptr = bottom_blob;
                float* outptr = top_blob_unpacked.channel(p);
                memcpy(outptr, ptr, size * bottom_blob.elemsize);

                p += bottom_blob.c;
            }
        }

        // packing
        if (elempack < out_elempack)
        {
            convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
        }
    }

    if ((dims == 3 && positive_axis == 1) || (dims == 4 && positive_axis == 2))
    {
        // interleave dim height
        int w = bottom_blobs[0].w;
        int d = bottom_blobs[0].d;
        int channels = bottom_blobs[0].c;
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;

        // total height
        int top_h = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_h += bottom_blob.h;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h, d, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // Created through the 4-D overload; restore the input's rank.
        top_blob.dims = dims;

        // Per channel and depth slice, append each input's w*h plane in order.
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < d; i++)
            {
                for (size_t b = 0; b < bottom_blobs.size(); b++)
                {
                    const Mat& bottom_blob = bottom_blobs[b];

                    int size = bottom_blob.w * bottom_blob.h;

                    const float* ptr = bottom_blob.channel(q).depth(i);
                    memcpy(outptr, ptr, size * elemsize);

                    outptr += size * elempack;
                }
            }
        }
    }

    if ((dims == 3 && positive_axis == 2) || (dims == 4 && positive_axis == 3))
    {
        // interleave dim width
        int h = bottom_blobs[0].h;
        int d = bottom_blobs[0].d;
        int channels = bottom_blobs[0].c;
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;

        // total width
        int top_w = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, d, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // Created through the 4-D overload; restore the input's rank.
        top_blob.dims = dims;

        // Per channel, depth slice and row, append each input's row in order.
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* outptr = top_blob.channel(q);

            for (int i = 0; i < d; i++)
            {
                for (int j = 0; j < h; j++)
                {
                    for (size_t b = 0; b < bottom_blobs.size(); b++)
                    {
                        const Mat& bottom_blob = bottom_blobs[b];

                        const float* ptr = bottom_blob.channel(q).depth(i).row(j);
                        memcpy(outptr, ptr, bottom_blob.w * elemsize);

                        outptr += bottom_blob.w * elempack;
                    }
                }
            }
        }
    }

    if (dims == 4 && positive_axis == 1)
    {
        // interleave dim depth
        int w = bottom_blobs[0].w;
        int h = bottom_blobs[0].h;
        int channels = bottom_blobs[0].c;
        size_t elemsize = bottom_blobs[0].elemsize;
        int elempack = bottom_blobs[0].elempack;

        // total depth
        int top_d = 0;
        for (size_t b = 0; b < bottom_blobs.size(); b++)
        {
            const Mat& bottom_blob = bottom_blobs[b];
            top_d += bottom_blob.d;
        }

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, h, top_d, channels, elemsize, elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // Per channel, append each input's full w*h*d volume in order.
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < channels; q++)
        {
            float* outptr = top_blob.channel(q);

            for (size_t b = 0; b < bottom_blobs.size(); b++)
            {
                const Mat& bottom_blob = bottom_blobs[b];

                int size = bottom_blob.w * bottom_blob.h * bottom_blob.d;

                const float* ptr = bottom_blob.channel(q);
                memcpy(outptr, ptr, size * elemsize);

                outptr += size * elempack;
            }
        }
    }

    return 0;
}

} // namespace ncnn


================================================
FILE: src/layer/mips/concat_mips.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONCAT_MIPS_H
#define LAYER_CONCAT_MIPS_H

#include "concat.h"

namespace ncnn {

// MIPS specialization of the Concat layer.
class Concat_mips : public Concat
{
public:
    Concat_mips();

    // Concatenates all bottom_blobs along the layer's axis into top_blobs[0].
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_CONCAT_MIPS_H


================================================
FILE: src/layer/mips/convolution1d_mips.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "convolution1d_mips.h"

#if __mips_msa
#include <msa.h>
#endif // __mips_msa

#include "mips_activation.h"
#include "mips_usability.h"

namespace ncnn {

// Constructor: packed-layout support is only advertised when the build
// targets MIPS MSA.
Convolution1D_mips::Convolution1D_mips()
{
#if __mips_msa
    support_packing = true;
#endif // __mips_msa
}

// Pre-pack the convolution weights into the layout consumed by forward().
//
// Nothing is done for dynamic-weight layers (weights arrive at forward time).
// Otherwise the flat weight_data, viewed as kw-inch-outch, is rearranged into
// weight_data_packed with layout pb-pa-kw-inch/pa-outch/pb, where pa/pb are the
// input/output pack sizes (4 when MSA packing is enabled and the channel count
// is divisible by 4, else 1). In light mode the original weights are released
// afterwards.
//
// Returns 0 on success, -100 if the weights cannot be reshaped or the packed
// buffer cannot be allocated.
int Convolution1D_mips::create_pipeline(const Option& opt)
{
    if (dynamic_weight)
        return 0;

    const int num_input = weight_data_size / kernel_w / num_output;

    int elempack = 1;
    int out_elempack = 1;
#if __mips_msa
    if (opt.use_packing_layout)
    {
        elempack = num_input % 4 == 0 ? 4 : 1;
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif

    // src = kw-inch-outch
    // dst = pb-pa-kw-inch/pa-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(kernel_w, num_input, num_output);
        // Bail out instead of reading through a null pointer below.
        if (weight_data_r2.empty())
            return -100;

        weight_data_packed.create(kernel_w, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);
        // Bail out instead of writing through a null pointer below.
        if (weight_data_packed.empty())
            return -100;

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            // g00 walks linearly through one packed output-channel group.
            float* g00 = weight_data_packed.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                for (int k = 0; k < kernel_w; k++)
                {
                    for (int i = 0; i < elempack; i++)
                    {
                        for (int j = 0; j < out_elempack; j++)
                        {
                            // Weight for output channel q+j, input channel p+i, tap k.
                            const float* k00 = weight_data_r2.channel(q + j).row(p + i);

                            g00[0] = k00[k];

                            g00++;
                        }
                    }
                }
            }
        }
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

// No explicit teardown is performed here; the option argument is unused.
// Always returns 0.
int Convolution1D_mips::destroy_pipeline(const Option& /*opt*/)
{
    return 0;
}

// 1-D convolution with the pre-packed static weights (weight_data_packed).
//
// The input is padded via make_padding, then treated as a 2-D blob whose rows
// are input channel groups (h rows of w positions). The output has
// num_output / out_elempack rows of outw positions, where
// outw = (w - kernel_extent_w) / stride_w + 1.
//
// Four kernels cover the combinations of input pack (elempack) and output
// pack (out_elempack): 4->4, 1->4 and 4->1 are MSA-only, 1->1 is plain scalar.
// Each accumulates bias + sum(input * weight) and applies the configured
// activation before storing.
//
// Returns 0 on success, -100 if padding or output allocation fails.
int Convolution1D_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    // Span covered by the dilated kernel along w.
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    // From here on w/h describe the padded input.
    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int out_elempack = 1;
#if __mips_msa
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif
    size_t out_elemsize = elemsize / elempack * out_elempack;

    const int outw = (w - kernel_extent_w) / stride_w + 1;
    const int outh = num_output / out_elempack;

    top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

#if __mips_msa
    // pack4 input -> pack4 output
    if (elempack == 4 && out_elempack == 4)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                float* outptr = top_blob.row(p);

                for (int j = 0; j < outw; j++)
                {
                    v4f32 _sum = (v4f32)__msa_fill_w(0);

                    if (bias_term)
                    {
                        _sum = (v4f32)__msa_ld_w((const float*)bias_data + p * 4, 0);
                    }

                    const float* kptr = weight_data_packed.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const float* sptr = bottom_blob_bordered.row(q) + j * stride_w * 4;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            // Broadcast each of the 4 input lanes and multiply by its
                            // own vector of 4 output-lane weights (16 weights per tap).
                            v4f32 _val0 = __msa_fill_w_f32(sptr[0]);
                            v4f32 _val1 = __msa_fill_w_f32(sptr[1]);
                            v4f32 _val2 = __msa_fill_w_f32(sptr[2]);
                            v4f32 _val3 = __msa_fill_w_f32(sptr[3]);

                            v4f32 _w0 = (v4f32)__msa_ld_w(kptr, 0);
                            v4f32 _w1 = (v4f32)__msa_ld_w(kptr + 4, 0);
                            v4f32 _w2 = (v4f32)__msa_ld_w(kptr + 8, 0);
                            v4f32 _w3 = (v4f32)__msa_ld_w(kptr + 12, 0);

                            _sum = __msa_fmadd_w(_sum, _val0, _w0);
                            _sum = __msa_fmadd_w(_sum, _val1, _w1);
                            _sum = __msa_fmadd_w(_sum, _val2, _w2);
                            _sum = __msa_fmadd_w(_sum, _val3, _w3);

                            sptr += dilation_w * 4;
                            kptr += 16;
                        }
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    __msa_st_w((v4i32)_sum, outptr, 0);
                    outptr += 4;
                }
            }
        }
    }

    // pack1 input -> pack4 output
    if (elempack == 1 && out_elempack == 4)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                float* outptr = top_blob.row(p);

                for (int j = 0; j < outw; j++)
                {
                    v4f32 _sum = (v4f32)__msa_fill_w(0);

                    if (bias_term)
                    {
                        _sum = (v4f32)__msa_ld_w((const float*)bias_data + p * 4, 0);
                    }

                    const float* kptr = weight_data_packed.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const float* sptr = bottom_blob_bordered.row(q) + j * stride_w;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            // One scalar input broadcast against 4 output-lane weights.
                            v4f32 _val = __msa_fill_w_f32(sptr[0]);
                            v4f32 _w = (v4f32)__msa_ld_w(kptr, 0);
                            _sum = __msa_fmadd_w(_sum, _val, _w);

                            sptr += dilation_w;
                            kptr += 4;
                        }
                    }

                    _sum = activation_ps(_sum, activation_type, activation_params);

                    __msa_st_w((v4i32)_sum, outptr, 0);
                    outptr += 4;
                }
            }
        }
    }

    // pack4 input -> pack1 output
    if (elempack == 4 && out_elempack == 1)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                float* outptr = top_blob.row(p);

                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    v4f32 _sum = (v4f32)__msa_fill_w(0);

                    const float* kptr = weight_data_packed.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const float* sptr = bottom_blob_bordered.row(q) + j * stride_w * 4;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            // Lane-wise products; lanes are summed after the loops.
                            v4f32 _val = (v4f32)__msa_ld_w(sptr, 0);
                            v4f32 _w = (v4f32)__msa_ld_w(kptr, 0);
                            _sum = __msa_fmadd_w(_sum, _val, _w);

                            sptr += dilation_w * 4;
                            kptr += 4;
                        }
                    }

                    // Horizontal add of the 4 partial sums on top of the bias.
                    sum += __msa_reduce_fadd_w(_sum);

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[j] = sum;
                }
            }
        }
    }
#endif // __mips_msa

    // pack1 input -> pack1 output (scalar path, also used without MSA)
    if (elempack == 1 && out_elempack == 1)
    {
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < outh; p++)
            {
                float* outptr = top_blob.row(p);

                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                    {
                        sum = bias_data[p];
                    }

                    const float* kptr = weight_data_packed.channel(p);

                    for (int q = 0; q < h; q++)
                    {
                        const float* sptr = bottom_blob_bordered.row(q) + j * stride_w;

                        for (int k = 0; k < kernel_w; k++)
                        {
                            float val = sptr[0];
                            float wt = kptr[0];
                            sum += val * wt;

                            sptr += dilation_w;
                            kptr += 1;
                        }
                    }

                    sum = activation_ss(sum, activation_type, activation_params);

                    outptr[j] = sum;
                }
            }
        }
    }

    return 0;
}

// Dynamic-weight 1-D convolution.
//
// Inputs: bottom_blobs[0] is the data, bottom_blobs[1] the weights and, when
// bias_term is set, bottom_blobs[2] the bias. Kernel width and output count
// are derived from the weight blob's shape. Weights and bias are flattened
// and reinterpreted as pack-1, then a temporary Convolution1D layer is
// configured with this layer's stride/dilation/padding/activation settings,
// run once, and destroyed.
//
// Returns 0 on success, -100 if flattening fails, -1 if the temporary layer
// cannot be created, or the first non-zero code reported by the temporary
// layer's load_param / load_model / create_pipeline / forward.
int Convolution1D_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _kernel_w = _weight_data.w;
    const int _num_output = _weight_data.c * _weight_data.elempack;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    // weight_data_flattened as pack1
    weight_data_flattened.w *= weight_data_flattened.elempack;
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
    weight_data_flattened.elempack = 1;

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;

        // bias_data_flattened as pack1
        bias_data_flattened.w *= bias_data_flattened.elempack;
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
        bias_data_flattened.elempack = 1;
    }

    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution1D);
    // Guard against a missing layer implementation before dereferencing.
    if (!op)
        return -1;

    ncnn::ParamDict pd;
    pd.set(0, _num_output);
    pd.set(1, _kernel_w);
    pd.set(2, dilation_w);
    pd.set(3, stride_w);
    pd.set(4, pad_left);
    pd.set(15, pad_right);
    pd.set(18, pad_value);
    pd.set(5, bias_term);
    pd.set(6, weight_data_flattened.w);
    pd.set(9, activation_type);
    pd.set(10, activation_params);

    ncnn::Mat weights[2];
    weights[0] = weight_data_flattened;
    weights[1] = bias_data_flattened;

    // Run the setup/forward chain, stopping at the first failing step so the
    // error reaches the caller instead of being reported as success.
    int ret = op->load_param(pd);

    if (ret == 0)
        ret = op->load_model(ncnn::ModelBinFromMatArray(weights));

    if (ret == 0)
        ret = op->create_pipeline(opt);

    if (ret == 0)
        ret = op->forward(bottom_blob, top_blob, opt);

    // Always tear down and free the temporary layer, even on failure.
    op->destroy_pipeline(opt);

    delete op;

    return ret;
}

} // namespace ncnn


================================================
FILE: src/layer/mips/convolution1d_mips.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONVOLUTION1D_MIPS_H
#define LAYER_CONVOLUTION1D_MIPS_H

#include "convolution1d.h"

namespace ncnn {

// MIPS-specific specialization of the Convolution1D layer.
// It overrides the pipeline setup/teardown hooks and both forward() overloads;
// the member functions are only declared here, their definitions are outside
// this header.
class Convolution1D_mips : public Convolution1D
{
public:
    Convolution1D_mips();

    // Pipeline setup / teardown hooks.
    // NOTE(review): presumably create_pipeline() fills weight_data_packed from
    // the base-class weights - confirm against the implementation.
    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // Single-input forward: bottom_blob -> top_blob.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Multi-input forward: the weight (and optional bias) are passed as extra
    // entries of bottom_blobs after the input blob.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

public:
    // packn
    // Weight storage in the packed layout named by the note above.
    Mat weight_data_packed;
};

} // namespace ncnn

#endif // LAYER_CONVOLUTION1D_MIPS_H


================================================
FILE: src/layer/mips/convolution_1x1.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv1x1s1_sgemm_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_msa(bottom_im2col, top_blob, kernel, _bias, opt);
}


================================================
FILE: src/layer/mips/convolution_1x1_int8.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv1x1s1_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_int8_msa(bottom_im2col, top_blob, kernel, opt);
}

// int8 1x1 stride-2 convolution.
// Builds a shrunken copy of the input that keeps every second element of every
// second row (sized outw x outh from top_blob), then runs the stride-1 1x1
// int8 path on that copy.
static void conv1x1s2_sgemm_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    const int in_w = bottom_blob.w;
    const int in_channels = bottom_blob.c;

    const int out_w = top_blob.w;
    const int out_h = top_blob.h;

    // elements left to skip once 2 * out_w source elements of a row were consumed,
    // so that the source pointer lands two input rows further down
    const int row_skip = in_w - 2 * out_w + in_w;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(out_w, out_h, in_channels, bottom_blob.elemsize, bottom_blob.elempack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < in_channels; p++)
    {
        const signed char* src = bottom_blob.channel(p);
        signed char* dst = bottom_blob_shrinked.channel(p);

        for (int y = 0; y < out_h; y++)
        {
            // take every second source element of this row
            for (int x = 0; x < out_w; x++)
            {
                dst[x] = src[2 * x];
            }

            src += 2 * out_w + row_skip;
            dst += out_w;
        }
    }

    conv1x1s1_sgemm_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt);
}


================================================
FILE: src/layer/mips/convolution_1x1_pack1to4_int8.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv1x1s1_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack1to4_int8_msa(bottom_im2col, top_blob, kernel, opt);
}

// int8 1x1 stride-2 convolution (pack1to4 variant).
// Subsamples the input by two in both directions into a temporary blob of
// size outw x outh (taken from top_blob) and delegates to the stride-1 path.
static void conv1x1s2_sgemm_pack1to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    const int src_w = bottom_blob.w;
    const int channel_count = bottom_blob.c;

    const int dst_w = top_blob.w;
    const int dst_h = top_blob.h;

    // remainder of the current source row plus one full skipped row
    const int skip_after_row = src_w - 2 * dst_w + src_w;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(dst_w, dst_h, channel_count, bottom_blob.elemsize, bottom_blob.elempack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channel_count; p++)
    {
        const signed char* in_ptr = bottom_blob.channel(p);
        signed char* out_ptr = bottom_blob_shrinked.channel(p);

        for (int row = 0; row < dst_h; row++)
        {
            for (int col = 0; col < dst_w; col++)
            {
                // keep one element, step over its right-hand neighbour
                *out_ptr++ = *in_ptr;
                in_ptr += 2;
            }

            in_ptr += skip_after_row;
        }
    }

    conv1x1s1_sgemm_pack1to4_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt);
}


================================================
FILE: src/layer/mips/convolution_1x1_pack4.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv1x1s1_sgemm_pack4_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack4_msa(bottom_im2col, top_blob, kernel, _bias, opt);
}

static void conv1x1s2_sgemm_pack4_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int tailstep = (w - 2 * outw + w) * 4;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const float* r0 = bottom_blob.channel(p);
        float* outptr = bottom_blob_shrinked.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                v4f32 _val = (v4f32)__msa_ld_w(r0, 0);
                __msa_st_w((v4i32)_val, outptr, 0);

                r0 += 4 * 2;
                outptr += 4;
            }

            r0 += tailstep;
        }
    }

    conv1x1s1_sgemm_pack4_msa(bottom_blob_shrinked, top_blob, kernel, _bias, opt);
}


================================================
FILE: src/layer/mips/convolution_1x1_pack4to1.h
================================================
[File too large to display: 1.6 KB]

================================================
FILE: src/layer/mips/convolution_1x1_pack8to1_int8.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv1x1s1_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack8to1_int8_msa(bottom_im2col, top_blob, kernel, opt);
}

// int8 1x1 stride-2 convolution (pack8to1 variant).
// Each element is moved as one 64-bit word. Every second element of every
// second row is copied into a temporary blob of size outw x outh (taken from
// top_blob), which is then fed to the stride-1 path.
static void conv1x1s2_sgemm_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    const int in_w = bottom_blob.w;
    const int in_channels = bottom_blob.c;

    const int out_w = top_blob.w;
    const int out_h = top_blob.h;

    // 64-bit words to skip once 2 * out_w words of a row have been consumed
    const int row_skip = in_w - 2 * out_w + in_w;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(out_w, out_h, in_channels, bottom_blob.elemsize, bottom_blob.elempack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < in_channels; p++)
    {
        const int64_t* src = bottom_blob.channel(p);
        int64_t* dst = bottom_blob_shrinked.channel(p);

        for (int y = 0; y < out_h; y++)
        {
            for (int x = 0; x < out_w; x++)
            {
                dst[x] = src[2 * x];
            }

            src += 2 * out_w + row_skip;
            dst += out_w;
        }
    }

    conv1x1s1_sgemm_pack8to1_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt);
}


================================================
FILE: src/layer/mips/convolution_1x1_pack8to4_int8.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv1x1s1_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    const int size = w * h;

    Mat bottom_im2col = bottom_blob;
    bottom_im2col.w = size;
    bottom_im2col.h = 1;

    im2col_sgemm_pack8to4_int8_msa(bottom_im2col, top_blob, kernel, opt);
}

// int8 1x1 stride-2 convolution (pack8to4 variant).
// Each element is moved as one 64-bit word. The input is subsampled by two in
// both directions into a temporary blob of size outw x outh (taken from
// top_blob) before delegating to the stride-1 path.
static void conv1x1s2_sgemm_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    const int src_w = bottom_blob.w;
    const int channel_count = bottom_blob.c;

    const int dst_w = top_blob.w;
    const int dst_h = top_blob.h;

    // remainder of the current source row plus one full skipped row, in 64-bit words
    const int skip_after_row = src_w - 2 * dst_w + src_w;

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(dst_w, dst_h, channel_count, bottom_blob.elemsize, bottom_blob.elempack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channel_count; p++)
    {
        const int64_t* in_ptr = bottom_blob.channel(p);
        int64_t* out_ptr = bottom_blob_shrinked.channel(p);

        for (int row = 0; row < dst_h; row++)
        {
            for (int col = 0; col < dst_w; col++)
            {
                // keep one word, step over its right-hand neighbour
                *out_ptr++ = *in_ptr;
                in_ptr += 2;
            }

            in_ptr += skip_after_row;
        }
    }

    conv1x1s1_sgemm_pack8to4_int8_msa(bottom_blob_shrinked, top_blob, kernel, opt);
}


================================================
FILE: src/layer/mips/convolution_3x3.h
================================================
[File too large to display: 10.9 KB]

================================================
FILE: src/layer/mips/convolution_3x3_int8.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Forward declaration of the Loongson MMI variant of the winograd43 int8 kernel
// transform. It is only declared when runtime CPU dispatch and NCNN_MMI are
// enabled while this code itself is compiled without MSA and without MMI; the
// definition is not part of this header.
#if NCNN_RUNTIME_CPU && NCNN_MMI && !__mips_msa && !__mips_loongson_mmi
void conv3x3s1_winograd43_transform_kernel_int8_loongson_mmi(const Mat& kernel, Mat& kernel_tm_packed, int inch, int outch, const Option& opt);
#endif

// Transforms int8 3x3 convolution kernels for the winograd43 path and stores
// them interleaved in kernel_tm_packed.
//
// kernel           - read as contiguous signed char data, 9 values per
//                    (output channel, input channel) pair, output channel major
// kernel_tm_packed - created here; receives the transformed kernels as shorts,
//                    36 rows per channel, interleaved by output/input channel
//                    groups whose width depends on the MSA / MMI build flags
// inch, outch      - number of input / output channels
// opt              - num_threads drives the OpenMP loop; also forwarded to the
//                    Loongson MMI variant when that one is dispatched
static void conv3x3s1_winograd43_transform_kernel_int8_msa(const Mat& kernel, Mat& kernel_tm_packed, int inch, int outch, const Option& opt)
{
#if NCNN_RUNTIME_CPU && NCNN_MMI && !__mips_msa && !__mips_loongson_mmi
    // runtime dispatch: hand the whole job to the MMI build when the CPU supports it
    if (ncnn::cpu_support_loongson_mmi())
    {
        conv3x3s1_winograd43_transform_kernel_int8_loongson_mmi(kernel, kernel_tm_packed, inch, outch, opt);
        return;
    }
#endif

    // winograd43 transform kernel
    // intermediate result: 36 shorts per (outch, inch) pair
    Mat kernel_tm(6 * 6, inch, outch, (size_t)2u);

    // 6x3 integer transform matrix, applied to the kernel rows and then to the
    // intermediate result, turning each 3x3 kernel into 6x6 values
    const short ktm[6][3] = {
        {6, 0, 0},
        {-4, -4, -4},
        {-4, 4, -4},
        {1, 2, 4},
        {1, -2, 4},
        {0, 0, 6}
    };

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            // 9 consecutive int8 weights of output channel p, input channel q
            const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
            short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);

            // transform kernel
            // k0/k1/k2 are the three rows of the 3x3 kernel
            const signed char* k0 = kernel0;
            const signed char* k1 = kernel0 + 3;
            const signed char* k2 = kernel0 + 6;

            // h
            // tmp[i][r] = dot(kernel row r, ktm[i])
            short tmp[6][3];
            for (int i = 0; i < 6; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            // kernel_tm0[j * 6 + i] = dot(tmp[j], ktm[i])
            for (int j = 0; j < 6; j++)
            {
                short* tmpp = &tmp[j][0];

                for (int i = 0; i < 6; i++)
                {
                    kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave
    // src = 36-inch-outch
    // dst = 2b-inch-36-outch/2b
    // Allocation: one channel per full group of output channels plus one per
    // leftover output channel (outch / N + outch % N); the width follows the
    // same scheme for input channels when they are grouped by 4.
#if __mips_msa
    if (outch >= 4)
    {
        if (inch >= 4)
            kernel_tm_packed.create(inch / 4 + inch % 4, 36, outch / 4 + outch % 4, (size_t)2u * 16, 16);
        else
            kernel_tm_packed.create(inch, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4);
    }
#else // __mips_msa
    if (outch >= 2)
    {
#if __mips_loongson_mmi
        if (inch >= 4)
            kernel_tm_packed.create(inch / 4 + inch % 4, 36, outch / 2 + outch % 2, (size_t)2u * 8, 8);
        else
#endif // __mips_loongson_mmi
        {
            kernel_tm_packed.create(inch, 36, outch / 2 + outch % 2, (size_t)2u * 2, 2);
        }
    }
#endif // __mips_msa
    // this else pairs with whichever "if (outch >= ...)" the preprocessor kept above
    else
    {
#if __mips_msa || __mips_loongson_mmi
        if (inch >= 4)
            kernel_tm_packed.create(inch / 4 + inch % 4, 36, outch, (size_t)2u * 4, 4);
        else
#endif // __mips_msa || __mips_loongson_mmi
        {
            kernel_tm_packed.create(inch, 36, outch, (size_t)2u, 1);
        }
    }

    // p is shared by the grouped loop and the leftover loop below
    int p = 0;
#if __mips_msa
    // MSA: pack 4 output channels per destination channel
    for (; p + 3 < outch; p += 4)
    {
        const Mat k0 = kernel_tm.channel(p);
        const Mat k1 = kernel_tm.channel(p + 1);
        const Mat k2 = kernel_tm.channel(p + 2);
        const Mat k3 = kernel_tm.channel(p + 3);

        Mat g0 = kernel_tm_packed.channel(p / 4);

        for (int k = 0; k < 36; k++)
        {
            short* g00 = g0.row<short>(k);

            int q = 0;
            // 4 input channels x 4 output channels, input channel fastest
            for (; q + 3 < inch; q += 4)
            {
                g00[0] = k0.row<const short>(q)[k];
                g00[1] = k0.row<const short>(q + 1)[k];
                g00[2] = k0.row<const short>(q + 2)[k];
                g00[3] = k0.row<const short>(q + 3)[k];
                g00[4] = k1.row<const short>(q)[k];
                g00[5] = k1.row<const short>(q + 1)[k];
                g00[6] = k1.row<const short>(q + 2)[k];
                g00[7] = k1.row<const short>(q + 3)[k];
                g00[8] = k2.row<const short>(q)[k];
                g00[9] = k2.row<const short>(q + 1)[k];
                g00[10] = k2.row<const short>(q + 2)[k];
                g00[11] = k2.row<const short>(q + 3)[k];
                g00[12] = k3.row<const short>(q)[k];
                g00[13] = k3.row<const short>(q + 1)[k];
                g00[14] = k3.row<const short>(q + 2)[k];
                g00[15] = k3.row<const short>(q + 3)[k];
                g00 += 16;
            }
            // leftover input channels: one value per output channel
            for (; q < inch; q++)
            {
                g00[0] = k0.row<const short>(q)[k];
                g00[1] = k1.row<const short>(q)[k];
                g00[2] = k2.row<const short>(q)[k];
                g00[3] = k3.row<const short>(q)[k];
                g00 += 4;
            }
        }
    }
#else // __mips_msa
    // non-MSA: pack 2 output channels per destination channel
    for (; p + 1 < outch; p += 2)
    {
        const Mat k0 = kernel_tm.channel(p);
        const Mat k1 = kernel_tm.channel(p + 1);

        Mat g0 = kernel_tm_packed.channel(p / 2);

        for (int k = 0; k < 36; k++)
        {
            short* g00 = g0.row<short>(k);

            int q = 0;
#if __mips_loongson_mmi
            // 4 input channels x 2 output channels, stored as pairs of input channels
            for (; q + 3 < inch; q += 4)
            {
                g00[0] = k0.row<const short>(q)[k];
                g00[1] = k0.row<const short>(q + 1)[k];
                g00[2] = k1.row<const short>(q)[k];
                g00[3] = k1.row<const short>(q + 1)[k];
                g00[4] = k0.row<const short>(q + 2)[k];
                g00[5] = k0.row<const short>(q + 3)[k];
                g00[6] = k1.row<const short>(q + 2)[k];
                g00[7] = k1.row<const short>(q + 3)[k];
                g00 += 8;
            }
#endif // __mips_loongson_mmi
            for (; q < inch; q++)
            {
                g00[0] = k0.row<const short>(q)[k];
                g00[1] = k1.row<const short>(q)[k];
                g00 += 2;
            }
        }
    }
#endif // __mips_msa
    // leftover output channels, one destination channel each
    for (; p < outch; p++)
    {
        const Mat k0 = kernel_tm.channel(p);

        // p / N is the number of full groups already stored and p % N the index
        // among the leftovers, so leftovers land right after the grouped channels
#if __mips_msa
        Mat g0 = kernel_tm_packed.channel(p / 4 + p % 4);
#else
        Mat g0 = kernel_tm_packed.channel(p / 2 + p % 2);
#endif

        for (int k = 0; k < 36; k++)
        {
            short* g00 = g0.row<short>(k);

            int q = 0;
#if __mips_msa || __mips_loongson_mmi
            for (; q + 3 < inch; q += 4)
            {
                g00[0] = k0.row<const short>(q)[k];
                g00[1] = k0.row<const short>(q + 1)[k];
                g00[2] = k0.row<const short>(q + 2)[k];
                g00[3] = k0.row<const short>(q + 3)[k];
                g00 += 4;
            }
#endif // __mips_msa || __mips_loongson_mmi
            for (; q < inch; q++)
            {
                g00[0] = k0.row<const short>(q)[k];
                g00 += 1;
            }
        }
    }
}

// int8 3x3 stride-1 convolution via the winograd43 pipeline:
//   1. pad the input on the bottom/right so the output extent is a multiple of 4
//   2. transform the input into 36-row tiles
//   3. multiply with the pre-transformed kernel (kernel_tm)
//   4. transform back and cut the padding away into top_blob
// top_blob supplies the output size (w, h, c).
static void conv3x3s1_winograd43_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
{
    const int in_channels = bottom_blob.c;
    const int in_elempack = bottom_blob.elempack;
    const int out_channels = top_blob.c;

    // output extent rounded up to whole groups of 4
    const int tiled_outw = (top_blob.w + 3) / 4 * 4;
    const int tiled_outh = (top_blob.h + 3) / 4 * 4;

    // pad to 4n+2
    const int padded_w = tiled_outw + 2;
    const int padded_h = tiled_outh + 2;

    Mat bottom_blob_bordered = bottom_blob;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, padded_h - bottom_blob.h, 0, padded_w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

    // ---- input transform ----
    Mat bottom_blob_tm;
    {
        const int tile_count = (tiled_outw / 4) * (tiled_outh / 4);

        bottom_blob_tm.create(tile_count, 36, in_channels, 2u * in_elempack, in_elempack, opt.workspace_allocator);
        conv3x3s1_winograd43_transform_input_int8_msa(bottom_blob_bordered, bottom_blob_tm, opt);
    }
    // the padded input is no longer needed once it has been transformed
    bottom_blob_bordered = Mat();

    // ---- dot ----
    Mat top_blob_tm;
    convolution_winograd_dot_int8_msa(bottom_blob_tm, out_channels, kernel_tm, top_blob_tm, opt);

    // ---- output transform ----
    Mat top_blob_bordered;
    const bool needs_crop = tiled_outw != top_blob.w || tiled_outh != top_blob.h;
    if (needs_crop)
    {
        // write into a temporary of the rounded-up size and crop afterwards
        top_blob_bordered.create(tiled_outw, tiled_outh, out_channels, 4u, 1, opt.workspace_allocator);
    }
    else
    {
        // sizes already match: transform straight into top_blob
        top_blob_bordered = top_blob;
    }

    conv3x3s1_winograd43_transform_output_int8_msa(top_blob_tm, top_blob_bordered, opt);

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}


================================================
FILE: src/layer/mips/convolution_3x3_pack1to4.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_pack1to4_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        v4f32 _bias0 = bias ? (v4f32)__msa_ld_w(bias + p * 4, 0) : (v4f32)__msa_fill_w(0);
        out0.fill(_bias0);

        const float* k0 = kernel.channel(p);

        int q = 0;
        for (; q < inch; q++)
        {
            float* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            const float* r0 = img0.row(0);
            const float* r1 = img0.row(1);
            const float* r2 = img0.row(2);

            v4f32 _k00 = (v4f32)__msa_ld_w(k0, 0);
            v4f32 _k01 = (v4f32)__msa_ld_w(k0 + 4, 0);
            v4f32 _k02 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
            v4f32 _k10 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
            v4f32 _k11 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
            v4f32 _k12 = (v4f32)__msa_ld_w(k0 + 4 * 5, 0);
            v4f32 _k20 = (v4f32)__msa_ld_w(k0 + 4 * 6, 0);
            v4f32 _k21 = (v4f32)__msa_ld_w(k0 + 4 * 7, 0);
            v4f32 _k22 = (v4f32)__msa_ld_w(k0 + 4 * 8, 0);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                for (; j + 7 < outw; j += 8)
                {
                    v4f32 _sum0 = (v4f32)__msa_ld_w(outptr0, 0);
                    v4f32 _sum1 = (v4f32)__msa_ld_w(outptr0 + 4, 0);
                    v4f32 _sum2 = (v4f32)__msa_ld_w(outptr0 + 4 * 2, 0);
                    v4f32 _sum3 = (v4f32)__msa_ld_w(outptr0 + 4 * 3, 0);
                    v4f32 _sum4 = (v4f32)__msa_ld_w(outptr0 + 4 * 4, 0);
                    v4f32 _sum5 = (v4f32)__msa_ld_w(outptr0 + 4 * 5, 0);
                    v4f32 _sum6 = (v4f32)__msa_ld_w(outptr0 + 4 * 6, 0);
                    v4f32 _sum7 = (v4f32)__msa_ld_w(outptr0 + 4 * 7, 0);

                    v4i32 _r0 = __msa_ld_w(r0, 0);
                    v4i32 _r0n = __msa_ld_w(r0 + 4, 0);
                    v4i32 _r0nn = __msa_ld_w(r0 + 8, 0);

                    v4f32 _r00 = (v4f32)__msa_splati_w(_r0, 0);
                    v4f32 _r01 = (v4f32)__msa_splati_w(_r0, 1);
                    v4f32 _r02 = (v4f32)__msa_splati_w(_r0, 2);
                    v4f32 _r03 = (v4f32)__msa_splati_w(_r0, 3);
                    v4f32 _r04 = (v4f32)__msa_splati_w(_r0n, 0);
                    v4f32 _r05 = (v4f32)__msa_splati_w(_r0n, 1);
                    v4f32 _r06 = (v4f32)__msa_splati_w(_r0n, 2);
                    v4f32 _r07 = (v4f32)__msa_splati_w(_r0n, 3);
                    v4f32 _r08 = (v4f32)__msa_splati_w(_r0nn, 0);
                    v4f32 _r09 = (v4f32)__msa_splati_w(_r0nn, 1);

                    _sum0 = __msa_fmadd_w(_sum0, _r00, _k00);
                    _sum1 = __msa_fmadd_w(_sum1, _r01, _k00);
                    _sum2 = __msa_fmadd_w(_sum2, _r02, _k00);
                    _sum3 = __msa_fmadd_w(_sum3, _r03, _k00);
                    _sum4 = __msa_fmadd_w(_sum4, _r04, _k00);
                    _sum5 = __msa_fmadd_w(_sum5, _r05, _k00);
                    _sum6 = __msa_fmadd_w(_sum6, _r06, _k00);
                    _sum7 = __msa_fmadd_w(_sum7, _r07, _k00);
                    _sum0 = __msa_fmadd_w(_sum0, _r01, _k01);
                    _sum1 = __msa_fmadd_w(_sum1, _r02, _k01);
                    _sum2 = __msa_fmadd_w(_sum2, _r03, _k01);
                    _sum3 = __msa_fmadd_w(_sum3, _r04, _k01);
                    _sum4 = __msa_fmadd_w(_sum4, _r05, _k01);
                    _sum5 = __msa_fmadd_w(_sum5, _r06, _k01);
                    _sum6 = __msa_fmadd_w(_sum6, _r07, _k01);
                    _sum7 = __msa_fmadd_w(_sum7, _r08, _k01);
                    _sum0 = __msa_fmadd_w(_sum0, _r02, _k02);
                    _sum1 = __msa_fmadd_w(_sum1, _r03, _k02);
                    _sum2 = __msa_fmadd_w(_sum2, _r04, _k02);
                    _sum3 = __msa_fmadd_w(_sum3, _r05, _k02);
                    _sum4 = __msa_fmadd_w(_sum4, _r06, _k02);
                    _sum5 = __msa_fmadd_w(_sum5, _r07, _k02);
                    _sum6 = __msa_fmadd_w(_sum6, _r08, _k02);
                    _sum7 = __msa_fmadd_w(_sum7, _r09, _k02);

                    v4i32 _r1 = __msa_ld_w(r1, 0);
                    v4i32 _r1n = __msa_ld_w(r1 + 4, 0);
                    v4i32 _r1nn = __msa_ld_w(r1 + 8, 0);

                    v4f32 _r10 = (v4f32)__msa_splati_w(_r1, 0);
                    v4f32 _r11 = (v4f32)__msa_splati_w(_r1, 1);
                    v4f32 _r12 = (v4f32)__msa_splati_w(_r1, 2);
                    v4f32 _r13 = (v4f32)__msa_splati_w(_r1, 3);
                    v4f32 _r14 = (v4f32)__msa_splati_w(_r1n, 0);
                    v4f32 _r15 = (v4f32)__msa_splati_w(_r1n, 1);
                    v4f32 _r16 = (v4f32)__msa_splati_w(_r1n, 2);
                    v4f32 _r17 = (v4f32)__msa_splati_w(_r1n, 3);
                    v4f32 _r18 = (v4f32)__msa_splati_w(_r1nn, 0);
                    v4f32 _r19 = (v4f32)__msa_splati_w(_r1nn, 1);

                    _sum0 = __msa_fmadd_w(_sum0, _r10, _k10);
                    _sum1 = __msa_fmadd_w(_sum1, _r11, _k10);
                    _sum2 = __msa_fmadd_w(_sum2, _r12, _k10);
                    _sum3 = __msa_fmadd_w(_sum3, _r13, _k10);
                    _sum4 = __msa_fmadd_w(_sum4, _r14, _k10);
                    _sum5 = __msa_fmadd_w(_sum5, _r15, _k10);
                    _sum6 = __msa_fmadd_w(_sum6, _r16, _k10);
                    _sum7 = __msa_fmadd_w(_sum7, _r17, _k10);
                    _sum0 = __msa_fmadd_w(_sum0, _r11, _k11);
                    _sum1 = __msa_fmadd_w(_sum1, _r12, _k11);
                    _sum2 = __msa_fmadd_w(_sum2, _r13, _k11);
                    _sum3 = __msa_fmadd_w(_sum3, _r14, _k11);
                    _sum4 = __msa_fmadd_w(_sum4, _r15, _k11);
                    _sum5 = __msa_fmadd_w(_sum5, _r16, _k11);
                    _sum6 = __msa_fmadd_w(_sum6, _r17, _k11);
                    _sum7 = __msa_fmadd_w(_sum7, _r18, _k11);
                    _sum0 = __msa_fmadd_w(_sum0, _r12, _k12);
                    _sum1 = __msa_fmadd_w(_sum1, _r13, _k12);
                    _sum2 = __msa_fmadd_w(_sum2, _r14, _k12);
                    _sum3 = __msa_fmadd_w(_sum3, _r15, _k12);
                    _sum4 = __msa_fmadd_w(_sum4, _r16, _k12);
                    _sum5 = __msa_fmadd_w(_sum5, _r17, _k12);
                    _sum6 = __msa_fmadd_w(_sum6, _r18, _k12);
                    _sum7 = __msa_fmadd_w(_sum7, _r19, _k12);

                    v4i32 _r2 = __msa_ld_w(r2, 0);
                    v4i32 _r2n = __msa_ld_w(r2 + 4, 0);
                    v4i32 _r2nn = __msa_ld_w(r2 + 8, 0);

                    v4f32 _r20 = (v4f32)__msa_splati_w(_r2, 0);
                    v4f32 _r21 = (v4f32)__msa_splati_w(_r2, 1);
                    v4f32 _r22 = (v4f32)__msa_splati_w(_r2, 2);
                    v4f32 _r23 = (v4f32)__msa_splati_w(_r2, 3);
                    v4f32 _r24 = (v4f32)__msa_splati_w(_r2n, 0);
                    v4f32 _r25 = (v4f32)__msa_splati_w(_r2n, 1);
                    v4f32 _r26 = (v4f32)__msa_splati_w(_r2n, 2);
                    v4f32 _r27 = (v4f32)__msa_splati_w(_r2n, 3);
                    v4f32 _r28 = (v4f32)__msa_splati_w(_r2nn, 0);
                    v4f32 _r29 = (v4f32)__msa_splati_w(_r2nn, 1);

                    _sum0 = __msa_fmadd_w(_sum0, _r20, _k20);
                    _sum1 = __msa_fmadd_w(_sum1, _r21, _k20);
                    _sum2 = __msa_fmadd_w(_sum2, _r22, _k20);
                    _sum3 = __msa_fmadd_w(_sum3, _r23, _k20);
                    _sum4 = __msa_fmadd_w(_sum4, _r24, _k20);
                    _sum5 = __msa_fmadd_w(_sum5, _r25, _k20);
                    _sum6 = __msa_fmadd_w(_sum6, _r26, _k20);
                    _sum7 = __msa_fmadd_w(_sum7, _r27, _k20);
                    _sum0 = __msa_fmadd_w(_sum0, _r21, _k21);
                    _sum1 = __msa_fmadd_w(_sum1, _r22, _k21);
                    _sum2 = __msa_fmadd_w(_sum2, _r23, _k21);
                    _sum3 = __msa_fmadd_w(_sum3, _r24, _k21);
                    _sum4 = __msa_fmadd_w(_sum4, _r25, _k21);
                    _sum5 = __msa_fmadd_w(_sum5, _r26, _k21);
                    _sum6 = __msa_fmadd_w(_sum6, _r27, _k21);
                    _sum7 = __msa_fmadd_w(_sum7, _r28, _k21);
                    _sum0 = __msa_fmadd_w(_sum0, _r22, _k22);
                    _sum1 = __msa_fmadd_w(_sum1, _r23, _k22);
                    _sum2 = __msa_fmadd_w(_sum2, _r24, _k22);
                    _sum3 = __msa_fmadd_w(_sum3, _r25, _k22);
                    _sum4 = __msa_fmadd_w(_sum4, _r26, _k22);
                    _sum5 = __msa_fmadd_w(_sum5, _r27, _k22);
                    _sum6 = __msa_fmadd_w(_sum6, _r28, _k22);
                    _sum7 = __msa_fmadd_w(_sum7, _r29, _k22);

                    __msa_st_w((v4i32)_sum0, outptr0, 0);
                    __msa_st_w((v4i32)_sum1, outptr0 + 4, 0);
                    __msa_st_w((v4i32)_sum2, outptr0 + 4 * 2, 0);
                    __msa_st_w((v4i32)_sum3, outptr0 + 4 * 3, 0);
                    __msa_st_w((v4i32)_sum4, outptr0 + 4 * 4, 0);
                    __msa_st_w((v4i32)_sum5, outptr0 + 4 * 5, 0);
                    __msa_st_w((v4i32)_sum6, outptr0 + 4 * 6, 0);
                    __msa_st_w((v4i32)_sum7, outptr0 + 4 * 7, 0);

                    outptr0 += 4 * 8;

                    r0 += 8;
                    r1 += 8;
                    r2 += 8;
                }
                for (; j + 3 < outw; j += 4)
                {
                    v4f32 _sum0 = (v4f32)__msa_ld_w(outptr0, 0);
                    v4f32 _sum1 = (v4f32)__msa_ld_w(outptr0 + 4, 0);
                    v4f32 _sum2 = (v4f32)__msa_ld_w(outptr0 + 4 * 2, 0);
                    v4f32 _sum3 = (v4f32)__msa_ld_w(outptr0 + 4 * 3, 0);

                    v4i32 _r0 = __msa_ld_w(r0, 0);
                    v4i32 _r0n = __msa_ld_w(r0 + 4, 0);

                    v4f32 _r00 = (v4f32)__msa_splati_w(_r0, 0);
                    v4f32 _r01 = (v4f32)__msa_splati_w(_r0, 1);
                    v4f32 _r02 = (v4f32)__msa_splati_w(_r0, 2);
                    v4f32 _r03 = (v4f32)__msa_splati_w(_r0, 3);
                    v4f32 _r04 = (v4f32)__msa_splati_w(_r0n, 0);
                    v4f32 _r05 = (v4f32)__msa_splati_w(_r0n, 1);

                    _sum0 = __msa_fmadd_w(_sum0, _r00, _k00);
                    _sum1 = __msa_fmadd_w(_sum1, _r01, _k00);
                    _sum2 = __msa_fmadd_w(_sum2, _r02, _k00);
                    _sum3 = __msa_fmadd_w(_sum3, _r03, _k00);
                    _sum0 = __msa_fmadd_w(_sum0, _r01, _k01);
                    _sum1 = __msa_fmadd_w(_sum1, _r02, _k01);
                    _sum2 = __msa_fmadd_w(_sum2, _r03, _k01);
                    _sum3 = __msa_fmadd_w(_sum3, _r04, _k01);
                    _sum0 = __msa_fmadd_w(_sum0, _r02, _k02);
                    _sum1 = __msa_fmadd_w(_sum1, _r03, _k02);
                    _sum2 = __msa_fmadd_w(_sum2, _r04, _k02);
                    _sum3 = __msa_fmadd_w(_sum3, _r05, _k02);

                    v4i32 _r1 = __msa_ld_w(r1, 0);
                    v4i32 _r1n = __msa_ld_w(r1 + 4, 0);

                    v4f32 _r10 = (v4f32)__msa_splati_w(_r1, 0);
                    v4f32 _r11 = (v4f32)__msa_splati_w(_r1, 1);
                    v4f32 _r12 = (v4f32)__msa_splati_w(_r1, 2);
                    v4f32 _r13 = (v4f32)__msa_splati_w(_r1, 3);
                    v4f32 _r14 = (v4f32)__msa_splati_w(_r1n, 0);
                    v4f32 _r15 = (v4f32)__msa_splati_w(_r1n, 1);

                    _sum0 = __msa_fmadd_w(_sum0, _r10, _k10);
                    _sum1 = __msa_fmadd_w(_sum1, _r11, _k10);
                    _sum2 = __msa_fmadd_w(_sum2, _r12, _k10);
                    _sum3 = __msa_fmadd_w(_sum3, _r13, _k10);
                    _sum0 = __msa_fmadd_w(_sum0, _r11, _k11);
                    _sum1 = __msa_fmadd_w(_sum1, _r12, _k11);
                    _sum2 = __msa_fmadd_w(_sum2, _r13, _k11);
                    _sum3 = __msa_fmadd_w(_sum3, _r14, _k11);
                    _sum0 = __msa_fmadd_w(_sum0, _r12, _k12);
                    _sum1 = __msa_fmadd_w(_sum1, _r13, _k12);
                    _sum2 = __msa_fmadd_w(_sum2, _r14, _k12);
                    _sum3 = __msa_fmadd_w(_sum3, _r15, _k12);

                    v4i32 _r2 = __msa_ld_w(r2, 0);
                    v4i32 _r2n = __msa_ld_w(r2 + 4, 0);

                    v4f32 _r20 = (v4f32)__msa_splati_w(_r2, 0);
                    v4f32 _r21 = (v4f32)__msa_splati_w(_r2, 1);
                    v4f32 _r22 = (v4f32)__msa_splati_w(_r2, 2);
                    v4f32 _r23 = (v4f32)__msa_splati_w(_r2, 3);
                    v4f32 _r24 = (v4f32)__msa_splati_w(_r2n, 0);
                    v4f32 _r25 = (v4f32)__msa_splati_w(_r2n, 1);

                    _sum0 = __msa_fmadd_w(_sum0, _r20, _k20);
                    _sum1 = __msa_fmadd_w(_sum1, _r21, _k20);
                    _sum2 = __msa_fmadd_w(_sum2, _r22, _k20);
                    _sum3 = __msa_fmadd_w(_sum3, _r23, _k20);
                    _sum0 = __msa_fmadd_w(_sum0, _r21, _k21);
                    _sum1 = __msa_fmadd_w(_sum1, _r22, _k21);
                    _sum2 = __msa_fmadd_w(_sum2, _r23, _k21);
                    _sum3 = __msa_fmadd_w(_sum3, _r24, _k21);
                    _sum0 = __msa_fmadd_w(_sum0, _r22, _k22);
                    _sum1 = __msa_fmadd_w(_sum1, _r23, _k22);
                    _sum2 = __msa_fmadd_w(_sum2, _r24, _k22);
                    _sum3 = __msa_fmadd_w(_sum3, _r25, _k22);

                    __msa_st_w((v4i32)_sum0, outptr0, 0);
                    __msa_st_w((v4i32)_sum1, outptr0 + 4, 0);
                    __msa_st_w((v4i32)_sum2, outptr0 + 4 * 2, 0);
                    __msa_st_w((v4i32)_sum3, outptr0 + 4 * 3, 0);

                    outptr0 += 4 * 4;

                    r0 += 4;
                    r1 += 4;
                    r2 += 4;
                }
                for (; j + 1 < outw; j += 2)
                {
                    v4f32 _sum0 = (v4f32)__msa_ld_w(outptr0, 0);
                    v4f32 _sum1 = (v4f32)__msa_ld_w(outptr0 + 4, 0);

                    v4i32 _r0 = __msa_ld_w(r0, 0);
                    v4f32 _r00 = (v4f32)__msa_splati_w(_r0, 0);
                    v4f32 _r01 = (v4f32)__msa_splati_w(_r0, 1);
                    v4f32 _r02 = (v4f32)__msa_splati_w(_r0, 2);
                    v4f32 _r03 = (v4f32)__msa_splati_w(_r0, 3);

                    _sum0 = __msa_fmadd_w(_sum0, _r00, _k00);
                    _sum1 = __msa_fmadd_w(_sum1, _r01, _k00);
                    _sum0 = __msa_fmadd_w(_sum0, _r01, _k01);
                    _sum1 = __msa_fmadd_w(_sum1, _r02, _k01);
                    _sum0 = __msa_fmadd_w(_sum0, _r02, _k02);
                    _sum1 = __msa_fmadd_w(_sum1, _r03, _k02);

                    v4i32 _r1 = __msa_ld_w(r1, 0);
                    v4f32 _r10 = (v4f32)__msa_splati_w(_r1, 0);
                    v4f32 _r11 = (v4f32)__msa_splati_w(_r1, 1);
                    v4f32 _r12 = (v4f32)__msa_splati_w(_r1, 2);
                    v4f32 _r13 = (v4f32)__msa_splati_w(_r1, 3);

                    _sum0 = __msa_fmadd_w(_sum0, _r10, _k10);
                    _sum1 = __msa_fmadd_w(_sum1, _r11, _k10);
                    _sum0 = __msa_fmadd_w(_sum0, _r11, _k11);
                    _sum1 = __msa_fmadd_w(_sum1, _r12, _k11);
                    _sum0 = __msa_fmadd_w(_sum0, _r12, _k12);
                    _sum1 = __msa_fmadd_w(_sum1, _r13, _k12);

                    v4i32 _r2 = __msa_ld_w(r2, 0);
                    v4f32 _r20 = (v4f32)__msa_splati_w(_r2, 0);
                    v4f32 _r21 = (v4f32)__msa_splati_w(_r2, 1);
                    v4f32 _r22 = (v4f32)__msa_splati_w(_r2, 2);
                    v4f32 _r23 = (v4f32)__msa_splati_w(_r2, 3);

                    _sum0 = __msa_fmadd_w(_sum0, _r20, _k20);
                    _sum1 = __msa_fmadd_w(_sum1, _r21, _k20);
                    _sum0 = __msa_fmadd_w(_sum0, _r21, _k21);
                    _sum1 = __msa_fmadd_w(_sum1, _r22, _k21);
                    _sum0 = __msa_fmadd_w(_sum0, _r22, _k22);
                    _sum1 = __msa_fmadd_w(_sum1, _r23, _k22);

                    __msa_st_w((v4i32)_sum0, outptr0, 0);
                    __msa_st_w((v4i32)_sum1, outptr0 + 4, 0);

                    outptr0 += 4 * 2;

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                }
                for (; j < outw; j++)
                {
                    v4f32 _sum0 = (v4f32)__msa_ld_w(outptr0, 0);

                    v4i32 _r0 = __msa_ld_w(r0, 0);
                    v4f32 _r00 = (v4f32)__msa_splati_w(_r0, 0);
                    v4f32 _r01 = (v4f32)__msa_splati_w(_r0, 1);
                    v4f32 _r02 = (v4f32)__msa_splati_w(_r0, 2);

                    _sum0 = __msa_fmadd_w(_sum0, _r00, _k00);
                    _sum0 = __msa_fmadd_w(_sum0, _r01, _k01);
                    _sum0 = __msa_fmadd_w(_sum0, _r02, _k02);

                    v4i32 _r1 = __msa_ld_w(r1, 0);
                    v4f32 _r10 = (v4f32)__msa_splati_w(_r1, 0);
                    v4f32 _r11 = (v4f32)__msa_splati_w(_r1, 1);
                    v4f32 _r12 = (v4f32)__msa_splati_w(_r1, 2);

                    _sum0 = __msa_fmadd_w(_sum0, _r10, _k10);
                    _sum0 = __msa_fmadd_w(_sum0, _r11, _k11);
                    _sum0 = __msa_fmadd_w(_sum0, _r12, _k12);

                    v4i32 _r2 = __msa_ld_w(r2, 0);
                    v4f32 _r20 = (v4f32)__msa_splati_w(_r2, 0);
                    v4f32 _r21 = (v4f32)__msa_splati_w(_r2, 1);
                    v4f32 _r22 = (v4f32)__msa_splati_w(_r2, 2);

                    _sum0 = __msa_fmadd_w(_sum0, _r20, _k20);
                    _sum0 = __msa_fmadd_w(_sum0, _r21, _k21);
                    _sum0 = __msa_fmadd_w(_sum0, _r22, _k22);

                    __msa_st_w((v4i32)_sum0, outptr0, 0);

                    outptr0 += 4;

                    r0 += 1;
                    r1 += 1;
                    r2 += 1;
                }

                r0 += 2;
                r1 += 2;
                r2 += 2;
            }

            k0 += 9 * 4;
        }
    }
}

// 3x3 convolution with a step of 2 input columns per output pixel, reading scalar
// floats from the input rows and producing 4 floats per output pixel (pack1 -> pack4).
//
// bottom_blob : input; one float is consumed per kernel tap
// top_blob    : output; every channel is first filled with the bias (or zero) and then
//               accumulated into, once per input channel
// kernel      : weights; for each output channel, 9 taps x 4 floats per input channel
//               (k0 advances by 9 * 4 after every input channel)
// _bias       : optional bias, 4 floats per output channel (read at bias + p * 4)
// opt         : only opt.num_threads is used here, for the OpenMP loop over output channels
static void conv3x3s2_pack1to4_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;
    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // The column loops below advance r0/r1/r2 by 2 * outw floats per output row.
    // Adding tailstep makes the net advance 2 * w, i.e. two input rows
    // (assuming input rows are w floats apart -- TODO confirm against Mat layout).
    const int tailstep = w - 2 * outw + w;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out0 = top_blob.channel(p);

        // seed the whole output channel with the bias vector (or zeros) so that the
        // input-channel loop below only has to accumulate
        v4f32 _bias0 = bias ? (v4f32)__msa_ld_w(bias + p * 4, 0) : (v4f32)__msa_fill_w(0);
        out0.fill(_bias0);

        const float* k0 = kernel.channel(p);

        int q = 0;
        for (; q < inch; q++)
        {
            // restart at the first output pixel for every input channel
            float* outptr0 = out0;

            const Mat img0 = bottom_blob.channel(q);

            // three consecutive input rows feeding the current output row
            const float* r0 = img0.row(0);
            const float* r1 = img0.row(1);
            const float* r2 = img0.row(2);

            // _kRC = the 4 output-lane weights of kernel tap (row R, column C)
            v4f32 _k00 = (v4f32)__msa_ld_w(k0, 0);
            v4f32 _k01 = (v4f32)__msa_ld_w(k0 + 4, 0);
            v4f32 _k02 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
            v4f32 _k10 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
            v4f32 _k11 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
            v4f32 _k12 = (v4f32)__msa_ld_w(k0 + 4 * 5, 0);
            v4f32 _k20 = (v4f32)__msa_ld_w(k0 + 4 * 6, 0);
            v4f32 _k21 = (v4f32)__msa_ld_w(k0 + 4 * 7, 0);
            v4f32 _k22 = (v4f32)__msa_ld_w(k0 + 4 * 8, 0);

            int i = 0;
            for (; i < outh; i++)
            {
                int j = 0;
                // 8 output pixels per iteration; output x uses input columns 2x, 2x+1, 2x+2,
                // so columns 0..16 of each row are needed
                for (; j + 7 < outw; j += 8)
                {
                    // running sums for the 8 output pixels (4 lanes each)
                    v4f32 _sum0 = (v4f32)__msa_ld_w(outptr0, 0);
                    v4f32 _sum1 = (v4f32)__msa_ld_w(outptr0 + 4, 0);
                    v4f32 _sum2 = (v4f32)__msa_ld_w(outptr0 + 4 * 2, 0);
                    v4f32 _sum3 = (v4f32)__msa_ld_w(outptr0 + 4 * 3, 0);
                    v4f32 _sum4 = (v4f32)__msa_ld_w(outptr0 + 4 * 4, 0);
                    v4f32 _sum5 = (v4f32)__msa_ld_w(outptr0 + 4 * 5, 0);
                    v4f32 _sum6 = (v4f32)__msa_ld_w(outptr0 + 4 * 6, 0);
                    v4f32 _sum7 = (v4f32)__msa_ld_w(outptr0 + 4 * 7, 0);

                    // row 0: four vector loads cover input columns 0..15
                    v4i32 _r0 = __msa_ld_w(r0, 0);
                    v4i32 _r0n = __msa_ld_w(r0 + 4, 0);
                    v4i32 _r0nn = __msa_ld_w(r0 + 8, 0);
                    v4i32 _r0nnn = __msa_ld_w(r0 + 12, 0);

                    // replicate each input column across all 4 lanes (_r0X = column X, hex digit)
                    v4f32 _r00 = (v4f32)__msa_splati_w(_r0, 0);
                    v4f32 _r01 = (v4f32)__msa_splati_w(_r0, 1);
                    v4f32 _r02 = (v4f32)__msa_splati_w(_r0, 2);
                    v4f32 _r03 = (v4f32)__msa_splati_w(_r0, 3);
                    v4f32 _r04 = (v4f32)__msa_splati_w(_r0n, 0);
                    v4f32 _r05 = (v4f32)__msa_splati_w(_r0n, 1);
                    v4f32 _r06 = (v4f32)__msa_splati_w(_r0n, 2);
                    v4f32 _r07 = (v4f32)__msa_splati_w(_r0n, 3);
                    v4f32 _r08 = (v4f32)__msa_splati_w(_r0nn, 0);
                    v4f32 _r09 = (v4f32)__msa_splati_w(_r0nn, 1);
                    v4f32 _r0a = (v4f32)__msa_splati_w(_r0nn, 2);
                    v4f32 _r0b = (v4f32)__msa_splati_w(_r0nn, 3);
                    v4f32 _r0c = (v4f32)__msa_splati_w(_r0nnn, 0);
                    v4f32 _r0d = (v4f32)__msa_splati_w(_r0nnn, 1);
                    v4f32 _r0e = (v4f32)__msa_splati_w(_r0nnn, 2);
                    v4f32 _r0f = (v4f32)__msa_splati_w(_r0nnn, 3);
                    // column 16 lies beyond the four vector loads; fetched as a scalar
                    // (__msa_fill_w_f32 is defined elsewhere -- presumably a scalar broadcast)
                    v4f32 _r0g = __msa_fill_w_f32(r0[16]);

                    // kernel row 0: taps 0/1/2 applied to columns 2x / 2x+1 / 2x+2
                    _sum0 = __msa_fmadd_w(_sum0, _r00, _k00);
                    _sum1 = __msa_fmadd_w(_sum1, _r02, _k00);
                    _sum2 = __msa_fmadd_w(_sum2, _r04, _k00);
                    _sum3 = __msa_fmadd_w(_sum3, _r06, _k00);
                    _sum4 = __msa_fmadd_w(_sum4, _r08, _k00);
                    _sum5 = __msa_fmadd_w(_sum5, _r0a, _k00);
                    _sum6 = __msa_fmadd_w(_sum6, _r0c, _k00);
                    _sum7 = __msa_fmadd_w(_sum7, _r0e, _k00);
                    _sum0 = __msa_fmadd_w(_sum0, _r01, _k01);
                    _sum1 = __msa_fmadd_w(_sum1, _r03, _k01);
                    _sum2 = __msa_fmadd_w(_sum2, _r05, _k01);
                    _sum3 = __msa_fmadd_w(_sum3, _r07, _k01);
                    _sum4 = __msa_fmadd_w(_sum4, _r09, _k01);
                    _sum5 = __msa_fmadd_w(_sum5, _r0b, _k01);
                    _sum6 = __msa_fmadd_w(_sum6, _r0d, _k01);
                    _sum7 = __msa_fmadd_w(_sum7, _r0f, _k01);
                    _sum0 = __msa_fmadd_w(_sum0, _r02, _k02);
                    _sum1 = __msa_fmadd_w(_sum1, _r04, _k02);
                    _sum2 = __msa_fmadd_w(_sum2, _r06, _k02);
                    _sum3 = __msa_fmadd_w(_sum3, _r08, _k02);
                    _sum4 = __msa_fmadd_w(_sum4, _r0a, _k02);
                    _sum5 = __msa_fmadd_w(_sum5, _r0c, _k02);
                    _sum6 = __msa_fmadd_w(_sum6, _r0e, _k02);
                    _sum7 = __msa_fmadd_w(_sum7, _r0g, _k02);

                    // row 1: same pattern with kernel row 1
                    v4i32 _r1 = __msa_ld_w(r1, 0);
                    v4i32 _r1n = __msa_ld_w(r1 + 4, 0);
                    v4i32 _r1nn = __msa_ld_w(r1 + 8, 0);
                    v4i32 _r1nnn = __msa_ld_w(r1 + 12, 0);

                    v4f32 _r10 = (v4f32)__msa_splati_w(_r1, 0);
                    v4f32 _r11 = (v4f32)__msa_splati_w(_r1, 1);
                    v4f32 _r12 = (v4f32)__msa_splati_w(_r1, 2);
                    v4f32 _r13 = (v4f32)__msa_splati_w(_r1, 3);
                    v4f32 _r14 = (v4f32)__msa_splati_w(_r1n, 0);
                    v4f32 _r15 = (v4f32)__msa_splati_w(_r1n, 1);
                    v4f32 _r16 = (v4f32)__msa_splati_w(_r1n, 2);
                    v4f32 _r17 = (v4f32)__msa_splati_w(_r1n, 3);
                    v4f32 _r18 = (v4f32)__msa_splati_w(_r1nn, 0);
                    v4f32 _r19 = (v4f32)__msa_splati_w(_r1nn, 1);
                    v4f32 _r1a = (v4f32)__msa_splati_w(_r1nn, 2);
                    v4f32 _r1b = (v4f32)__msa_splati_w(_r1nn, 3);
                    v4f32 _r1c = (v4f32)__msa_splati_w(_r1nnn, 0);
                    v4f32 _r1d = (v4f32)__msa_splati_w(_r1nnn, 1);
                    v4f32 _r1e = (v4f32)__msa_splati_w(_r1nnn, 2);
                    v4f32 _r1f = (v4f32)__msa_splati_w(_r1nnn, 3);
                    v4f32 _r1g = __msa_fill_w_f32(r1[16]);

                    _sum0 = __msa_fmadd_w(_sum0, _r10, _k10);
                    _sum1 = __msa_fmadd_w(_sum1, _r12, _k10);
                    _sum2 = __msa_fmadd_w(_sum2, _r14, _k10);
                    _sum3 = __msa_fmadd_w(_sum3, _r16, _k10);
                    _sum4 = __msa_fmadd_w(_sum4, _r18, _k10);
                    _sum5 = __msa_fmadd_w(_sum5, _r1a, _k10);
                    _sum6 = __msa_fmadd_w(_sum6, _r1c, _k10);
                    _sum7 = __msa_fmadd_w(_sum7, _r1e, _k10);
                    _sum0 = __msa_fmadd_w(_sum0, _r11, _k11);
                    _sum1 = __msa_fmadd_w(_sum1, _r13, _k11);
                    _sum2 = __msa_fmadd_w(_sum2, _r15, _k11);
                    _sum3 = __msa_fmadd_w(_sum3, _r17, _k11);
                    _sum4 = __msa_fmadd_w(_sum4, _r19, _k11);
                    _sum5 = __msa_fmadd_w(_sum5, _r1b, _k11);
                    _sum6 = __msa_fmadd_w(_sum6, _r1d, _k11);
                    _sum7 = __msa_fmadd_w(_sum7, _r1f, _k11);
                    _sum0 = __msa_fmadd_w(_sum0, _r12, _k12);
                    _sum1 = __msa_fmadd_w(_sum1, _r14, _k12);
                    _sum2 = __msa_fmadd_w(_sum2, _r16, _k12);
                    _sum3 = __msa_fmadd_w(_sum3, _r18, _k12);
                    _sum4 = __msa_fmadd_w(_sum4, _r1a, _k12);
                    _sum5 = __msa_fmadd_w(_sum5, _r1c, _k12);
                    _sum6 = __msa_fmadd_w(_sum6, _r1e, _k12);
                    _sum7 = __msa_fmadd_w(_sum7, _r1g, _k12);

                    // row 2: same pattern with kernel row 2
                    v4i32 _r2 = __msa_ld_w(r2, 0);
                    v4i32 _r2n = __msa_ld_w(r2 + 4, 0);
                    v4i32 _r2nn = __msa_ld_w(r2 + 8, 0);
                    v4i32 _r2nnn = __msa_ld_w(r2 + 12, 0);

                    v4f32 _r20 = (v4f32)__msa_splati_w(_r2, 0);
                    v4f32 _r21 = (v4f32)__msa_splati_w(_r2, 1);
                    v4f32 _r22 = (v4f32)__msa_splati_w(_r2, 2);
                    v4f32 _r23 = (v4f32)__msa_splati_w(_r2, 3);
                    v4f32 _r24 = (v4f32)__msa_splati_w(_r2n, 0);
                    v4f32 _r25 = (v4f32)__msa_splati_w(_r2n, 1);
                    v4f32 _r26 = (v4f32)__msa_splati_w(_r2n, 2);
                    v4f32 _r27 = (v4f32)__msa_splati_w(_r2n, 3);
                    v4f32 _r28 = (v4f32)__msa_splati_w(_r2nn, 0);
                    v4f32 _r29 = (v4f32)__msa_splati_w(_r2nn, 1);
                    v4f32 _r2a = (v4f32)__msa_splati_w(_r2nn, 2);
                    v4f32 _r2b = (v4f32)__msa_splati_w(_r2nn, 3);
                    v4f32 _r2c = (v4f32)__msa_splati_w(_r2nnn, 0);
                    v4f32 _r2d = (v4f32)__msa_splati_w(_r2nnn, 1);
                    v4f32 _r2e = (v4f32)__msa_splati_w(_r2nnn, 2);
                    v4f32 _r2f = (v4f32)__msa_splati_w(_r2nnn, 3);
                    v4f32 _r2g = __msa_fill_w_f32(r2[16]);

                    _sum0 = __msa_fmadd_w(_sum0, _r20, _k20);
                    _sum1 = __msa_fmadd_w(_sum1, _r22, _k20);
                    _sum2 = __msa_fmadd_w(_sum2, _r24, _k20);
                    _sum3 = __msa_fmadd_w(_sum3, _r26, _k20);
                    _sum4 = __msa_fmadd_w(_sum4, _r28, _k20);
                    _sum5 = __msa_fmadd_w(_sum5, _r2a, _k20);
                    _sum6 = __msa_fmadd_w(_sum6, _r2c, _k20);
                    _sum7 = __msa_fmadd_w(_sum7, _r2e, _k20);
                    _sum0 = __msa_fmadd_w(_sum0, _r21, _k21);
                    _sum1 = __msa_fmadd_w(_sum1, _r23, _k21);
                    _sum2 = __msa_fmadd_w(_sum2, _r25, _k21);
                    _sum3 = __msa_fmadd_w(_sum3, _r27, _k21);
                    _sum4 = __msa_fmadd_w(_sum4, _r29, _k21);
                    _sum5 = __msa_fmadd_w(_sum5, _r2b, _k21);
                    _sum6 = __msa_fmadd_w(_sum6, _r2d, _k21);
                    _sum7 = __msa_fmadd_w(_sum7, _r2f, _k21);
                    _sum0 = __msa_fmadd_w(_sum0, _r22, _k22);
                    _sum1 = __msa_fmadd_w(_sum1, _r24, _k22);
                    _sum2 = __msa_fmadd_w(_sum2, _r26, _k22);
                    _sum3 = __msa_fmadd_w(_sum3, _r28, _k22);
                    _sum4 = __msa_fmadd_w(_sum4, _r2a, _k22);
                    _sum5 = __msa_fmadd_w(_sum5, _r2c, _k22);
                    _sum6 = __msa_fmadd_w(_sum6, _r2e, _k22);
                    _sum7 = __msa_fmadd_w(_sum7, _r2g, _k22);

                    // write the updated sums back in place
                    __msa_st_w((v4i32)_sum0, outptr0, 0);
                    __msa_st_w((v4i32)_sum1, outptr0 + 4, 0);
                    __msa_st_w((v4i32)_sum2, outptr0 + 4 * 2, 0);
                    __msa_st_w((v4i32)_sum3, outptr0 + 4 * 3, 0);
                    __msa_st_w((v4i32)_sum4, outptr0 + 4 * 4, 0);
                    __msa_st_w((v4i32)_sum5, outptr0 + 4 * 5, 0);
                    __msa_st_w((v4i32)_sum6, outptr0 + 4 * 6, 0);
                    __msa_st_w((v4i32)_sum7, outptr0 + 4 * 7, 0);

                    outptr0 += 4 * 8;

                    // 8 outputs x 2 input columns each
                    r0 += 16;
                    r1 += 16;
                    r2 += 16;
                }
                // 4 output pixels per iteration; needs input columns 0..8 of each row
                for (; j + 3 < outw; j += 4)
                {
                    v4f32 _sum0 = (v4f32)__msa_ld_w(outptr0, 0);
                    v4f32 _sum1 = (v4f32)__msa_ld_w(outptr0 + 4, 0);
                    v4f32 _sum2 = (v4f32)__msa_ld_w(outptr0 + 4 * 2, 0);
                    v4f32 _sum3 = (v4f32)__msa_ld_w(outptr0 + 4 * 3, 0);

                    v4i32 _r0 = __msa_ld_w(r0, 0);
                    v4i32 _r0n = __msa_ld_w(r0 + 4, 0);

                    v4f32 _r00 = (v4f32)__msa_splati_w(_r0, 0);
                    v4f32 _r01 = (v4f32)__msa_splati_w(_r0, 1);
                    v4f32 _r02 = (v4f32)__msa_splati_w(_r0, 2);
                    v4f32 _r03 = (v4f32)__msa_splati_w(_r0, 3);
                    v4f32 _r04 = (v4f32)__msa_splati_w(_r0n, 0);
                    v4f32 _r05 = (v4f32)__msa_splati_w(_r0n, 1);
                    v4f32 _r06 = (v4f32)__msa_splati_w(_r0n, 2);
                    v4f32 _r07 = (v4f32)__msa_splati_w(_r0n, 3);
                    // column 8 is outside the two vector loads; fetched as a scalar
                    v4f32 _r08 = __msa_fill_w_f32(r0[8]);

                    _sum0 = __msa_fmadd_w(_sum0, _r00, _k00);
                    _sum1 = __msa_fmadd_w(_sum1, _r02, _k00);
                    _sum2 = __msa_fmadd_w(_sum2, _r04, _k00);
                    _sum3 = __msa_fmadd_w(_sum3, _r06, _k00);
                    _sum0 = __msa_fmadd_w(_sum0, _r01, _k01);
                    _sum1 = __msa_fmadd_w(_sum1, _r03, _k01);
                    _sum2 = __msa_fmadd_w(_sum2, _r05, _k01);
                    _sum3 = __msa_fmadd_w(_sum3, _r07, _k01);
                    _sum0 = __msa_fmadd_w(_sum0, _r02, _k02);
                    _sum1 = __msa_fmadd_w(_sum1, _r04, _k02);
                    _sum2 = __msa_fmadd_w(_sum2, _r06, _k02);
                    _sum3 = __msa_fmadd_w(_sum3, _r08, _k02);

                    v4i32 _r1 = __msa_ld_w(r1, 0);
                    v4i32 _r1n = __msa_ld_w(r1 + 4, 0);

                    v4f32 _r10 = (v4f32)__msa_splati_w(_r1, 0);
                    v4f32 _r11 = (v4f32)__msa_splati_w(_r1, 1);
                    v4f32 _r12 = (v4f32)__msa_splati_w(_r1, 2);
                    v4f32 _r13 = (v4f32)__msa_splati_w(_r1, 3);
                    v4f32 _r14 = (v4f32)__msa_splati_w(_r1n, 0);
                    v4f32 _r15 = (v4f32)__msa_splati_w(_r1n, 1);
                    v4f32 _r16 = (v4f32)__msa_splati_w(_r1n, 2);
                    v4f32 _r17 = (v4f32)__msa_splati_w(_r1n, 3);
                    v4f32 _r18 = __msa_fill_w_f32(r1[8]);

                    _sum0 = __msa_fmadd_w(_sum0, _r10, _k10);
                    _sum1 = __msa_fmadd_w(_sum1, _r12, _k10);
                    _sum2 = __msa_fmadd_w(_sum2, _r14, _k10);
                    _sum3 = __msa_fmadd_w(_sum3, _r16, _k10);
                    _sum0 = __msa_fmadd_w(_sum0, _r11, _k11);
                    _sum1 = __msa_fmadd_w(_sum1, _r13, _k11);
                    _sum2 = __msa_fmadd_w(_sum2, _r15, _k11);
                    _sum3 = __msa_fmadd_w(_sum3, _r17, _k11);
                    _sum0 = __msa_fmadd_w(_sum0, _r12, _k12);
                    _sum1 = __msa_fmadd_w(_sum1, _r14, _k12);
                    _sum2 = __msa_fmadd_w(_sum2, _r16, _k12);
                    _sum3 = __msa_fmadd_w(_sum3, _r18, _k12);

                    v4i32 _r2 = __msa_ld_w(r2, 0);
                    v4i32 _r2n = __msa_ld_w(r2 + 4, 0);

                    v4f32 _r20 = (v4f32)__msa_splati_w(_r2, 0);
                    v4f32 _r21 = (v4f32)__msa_splati_w(_r2, 1);
                    v4f32 _r22 = (v4f32)__msa_splati_w(_r2, 2);
                    v4f32 _r23 = (v4f32)__msa_splati_w(_r2, 3);
                    v4f32 _r24 = (v4f32)__msa_splati_w(_r2n, 0);
                    v4f32 _r25 = (v4f32)__msa_splati_w(_r2n, 1);
                    v4f32 _r26 = (v4f32)__msa_splati_w(_r2n, 2);
                    v4f32 _r27 = (v4f32)__msa_splati_w(_r2n, 3);
                    v4f32 _r28 = __msa_fill_w_f32(r2[8]);

                    _sum0 = __msa_fmadd_w(_sum0, _r20, _k20);
                    _sum1 = __msa_fmadd_w(_sum1, _r22, _k20);
                    _sum2 = __msa_fmadd_w(_sum2, _r24, _k20);
                    _sum3 = __msa_fmadd_w(_sum3, _r26, _k20);
                    _sum0 = __msa_fmadd_w(_sum0, _r21, _k21);
                    _sum1 = __msa_fmadd_w(_sum1, _r23, _k21);
                    _sum2 = __msa_fmadd_w(_sum2, _r25, _k21);
                    _sum3 = __msa_fmadd_w(_sum3, _r27, _k21);
                    _sum0 = __msa_fmadd_w(_sum0, _r22, _k22);
                    _sum1 = __msa_fmadd_w(_sum1, _r24, _k22);
                    _sum2 = __msa_fmadd_w(_sum2, _r26, _k22);
                    _sum3 = __msa_fmadd_w(_sum3, _r28, _k22);

                    __msa_st_w((v4i32)_sum0, outptr0, 0);
                    __msa_st_w((v4i32)_sum1, outptr0 + 4, 0);
                    __msa_st_w((v4i32)_sum2, outptr0 + 4 * 2, 0);
                    __msa_st_w((v4i32)_sum3, outptr0 + 4 * 3, 0);

                    outptr0 += 4 * 4;

                    r0 += 8;
                    r1 += 8;
                    r2 += 8;
                }
                // 2 output pixels per iteration; needs input columns 0..4 of each row
                for (; j + 1 < outw; j += 2)
                {
                    v4f32 _sum0 = (v4f32)__msa_ld_w(outptr0, 0);
                    v4f32 _sum1 = (v4f32)__msa_ld_w(outptr0 + 4, 0);

                    v4i32 _r0 = __msa_ld_w(r0, 0);
                    v4f32 _r00 = (v4f32)__msa_splati_w(_r0, 0);
                    v4f32 _r01 = (v4f32)__msa_splati_w(_r0, 1);
                    v4f32 _r02 = (v4f32)__msa_splati_w(_r0, 2);
                    v4f32 _r03 = (v4f32)__msa_splati_w(_r0, 3);
                    // column 4 is outside the single vector load; fetched as a scalar
                    v4f32 _r04 = __msa_fill_w_f32(r0[4]);

                    _sum0 = __msa_fmadd_w(_sum0, _r00, _k00);
                    _sum1 = __msa_fmadd_w(_sum1, _r02, _k00);
                    _sum0 = __msa_fmadd_w(_sum0, _r01, _k01);
                    _sum1 = __msa_fmadd_w(_sum1, _r03, _k01);
                    _sum0 = __msa_fmadd_w(_sum0, _r02, _k02);
                    _sum1 = __msa_fmadd_w(_sum1, _r04, _k02);

                    v4i32 _r1 = __msa_ld_w(r1, 0);
                    v4f32 _r10 = (v4f32)__msa_splati_w(_r1, 0);
                    v4f32 _r11 = (v4f32)__msa_splati_w(_r1, 1);
                    v4f32 _r12 = (v4f32)__msa_splati_w(_r1, 2);
                    v4f32 _r13 = (v4f32)__msa_splati_w(_r1, 3);
                    v4f32 _r14 = __msa_fill_w_f32(r1[4]);

                    _sum0 = __msa_fmadd_w(_sum0, _r10, _k10);
                    _sum1 = __msa_fmadd_w(_sum1, _r12, _k10);
                    _sum0 = __msa_fmadd_w(_sum0, _r11, _k11);
                    _sum1 = __msa_fmadd_w(_sum1, _r13, _k11);
                    _sum0 = __msa_fmadd_w(_sum0, _r12, _k12);
                    _sum1 = __msa_fmadd_w(_sum1, _r14, _k12);

                    v4i32 _r2 = __msa_ld_w(r2, 0);
                    v4f32 _r20 = (v4f32)__msa_splati_w(_r2, 0);
                    v4f32 _r21 = (v4f32)__msa_splati_w(_r2, 1);
                    v4f32 _r22 = (v4f32)__msa_splati_w(_r2, 2);
                    v4f32 _r23 = (v4f32)__msa_splati_w(_r2, 3);
                    v4f32 _r24 = __msa_fill_w_f32(r2[4]);

                    _sum0 = __msa_fmadd_w(_sum0, _r20, _k20);
                    _sum1 = __msa_fmadd_w(_sum1, _r22, _k20);
                    _sum0 = __msa_fmadd_w(_sum0, _r21, _k21);
                    _sum1 = __msa_fmadd_w(_sum1, _r23, _k21);
                    _sum0 = __msa_fmadd_w(_sum0, _r22, _k22);
                    _sum1 = __msa_fmadd_w(_sum1, _r24, _k22);

                    __msa_st_w((v4i32)_sum0, outptr0, 0);
                    __msa_st_w((v4i32)_sum1, outptr0 + 4, 0);

                    outptr0 += 4 * 2;

                    r0 += 4;
                    r1 += 4;
                    r2 += 4;
                }
                // remaining output pixels, one at a time
                for (; j < outw; j++)
                {
                    v4f32 _sum0 = (v4f32)__msa_ld_w(outptr0, 0);

                    // NOTE(review): each row load fetches 4 floats although only lanes 0..2
                    // are used; this relies on one float past the 3x3 window being readable
                    // -- confirm against the Mat allocation padding.
                    v4i32 _r0 = __msa_ld_w(r0, 0);
                    v4f32 _r00 = (v4f32)__msa_splati_w(_r0, 0);
                    v4f32 _r01 = (v4f32)__msa_splati_w(_r0, 1);
                    v4f32 _r02 = (v4f32)__msa_splati_w(_r0, 2);

                    _sum0 = __msa_fmadd_w(_sum0, _r00, _k00);
                    _sum0 = __msa_fmadd_w(_sum0, _r01, _k01);
                    _sum0 = __msa_fmadd_w(_sum0, _r02, _k02);

                    v4i32 _r1 = __msa_ld_w(r1, 0);
                    v4f32 _r10 = (v4f32)__msa_splati_w(_r1, 0);
                    v4f32 _r11 = (v4f32)__msa_splati_w(_r1, 1);
                    v4f32 _r12 = (v4f32)__msa_splati_w(_r1, 2);

                    _sum0 = __msa_fmadd_w(_sum0, _r10, _k10);
                    _sum0 = __msa_fmadd_w(_sum0, _r11, _k11);
                    _sum0 = __msa_fmadd_w(_sum0, _r12, _k12);

                    v4i32 _r2 = __msa_ld_w(r2, 0);
                    v4f32 _r20 = (v4f32)__msa_splati_w(_r2, 0);
                    v4f32 _r21 = (v4f32)__msa_splati_w(_r2, 1);
                    v4f32 _r22 = (v4f32)__msa_splati_w(_r2, 2);

                    _sum0 = __msa_fmadd_w(_sum0, _r20, _k20);
                    _sum0 = __msa_fmadd_w(_sum0, _r21, _k21);
                    _sum0 = __msa_fmadd_w(_sum0, _r22, _k22);

                    __msa_st_w((v4i32)_sum0, outptr0, 0);

                    outptr0 += 4;

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                }

                // move the three row pointers to the start of the next output row's window
                r0 += tailstep;
                r1 += tailstep;
                r2 += tailstep;
            }

            // next input channel's 9 x 4 weights
            k0 += 9 * 4;
        }
    }
}


================================================
FILE: src/layer/mips/convolution_3x3_pack4.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_winograd63_transform_kernel_pack4_msa(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt)
{
    // winograd63 transform kernel
    Mat kernel_tm;
    kernel_tm.create(8 * 8, inch, outch);

    const float ktm[8][3] = {
        {1.0f, 0.0f, 0.0f},
        {-2.0f / 9, -2.0f / 9, -2.0f / 9},
        {-2.0f / 9, 2.0f / 9, -2.0f / 9},
        {1.0f / 90, 1.0f / 45, 2.0f / 45},
        {1.0f / 90, -1.0f / 45, 2.0f / 45},
        {1.0f / 45, 1.0f / 90, 1.0f / 180},
        {1.0f / 45, -1.0f / 90, 1.0f / 180},
        {0.0f, 0.0f, 1.0f}
    };

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
            float* kernel_tm0 = kernel_tm.channel(p).row(q);

            // transform kernel, transposed
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;

            // h
            float tmp[8][3];
            for (int i = 0; i < 8; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // v
            for (int j = 0; j < 8; j++)
            {
                float* tmpp = &tmp[j][0];

                for (int i = 0; i < 8; i++)
                {
                    kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave
    // src = 64-inch-outch
    // dst = pb-pa-inch/pa-64-outch/pb
    kernel_tm_pack4.create(inch / 4, 64, outch / 4, (size_t)4u * 4 * 4, 4 * 4);

    for (int q = 0; q + 3 < outch; q += 4)
    {
        Mat g0 = kernel_tm_pack4.channel(q / 4);

        for (int k = 0; k < 64; k++)
        {
            float* g00 = g0.row(k);

            for (int p = 0; p + 3 < inch; p += 4)
            {
                for (int i = 0; i < 4; i++)
                {
                    for (int j = 0; j < 4; j++)
                    {
                        const float* k00 = kernel_tm.channel(q + j).row(p + i);
                        g00[0] = k00[k];
                        g00++;
                    }
                }
            }
        }
    }
}

static void conv3x3s1_winograd63_pack4_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // pad to 6n+2
    Mat bottom_blob_bordered = bottom_blob;

    outw = (outw + 5) / 6 * 6;
    outh = (outh + 5) / 6 * 6;

    w = outw + 2;
    h = outh + 2;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

    // BEGIN transform input
    Mat bottom_blob_tm;
    {
        int w_tiles = outw / 6;
        int h_tiles = outh / 6;
        const int tiles = w_tiles * h_tiles;

        bottom_blob_tm.create(tiles, 64, inch, elemsize, elempack, opt.workspace_allocator);
        conv3x3s1_winograd63_transform_input_pack4_msa(bottom_blob_bordered, bottom_blob_tm, opt);
    }
    bottom_blob_bordered = Mat();
    // END transform input

    // BEGIN dot
    Mat top_blob_tm;
    convolution_winograd_dot_pack4_msa(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
    // END dot

    // BEGIN transform output
    Mat top_blob_bordered;
    if (outw == top_blob.w && outh == top_blob.h)
    {
        top_blob_bordered = top_blob;
    }
    else
    {
        top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator);
    }
    {
        conv3x3s1_winograd63_transform_output_pack4_msa(top_blob_tm, top_blob_bordered, bias, opt);
    }
    // END transform output

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}

static void conv3x3s1_winograd43_transform_kernel_pack4_msa(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt)
{
    // winograd43 transform kernel
    Mat kernel_tm(6 * 6, inch, outch);

    const float ktm[6][3] = {
        {1.0f / 4, 0.0f, 0.0f},
        {-1.0f / 6, -1.0f / 6, -1.0f / 6},
        {-1.0f / 6, 1.0f / 6, -1.0f / 6},
        {1.0f / 24, 1.0f / 12, 1.0f / 6},
        {1.0f / 24, -1.0f / 12, 1.0f / 6},
        {0.0f, 0.0f, 1.0f}
    };

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
            float* kernel_tm0 = kernel_tm.channel(p).row(q);

            // transform kernel
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;

            // h
            float tmp[6][3];
            for (int i = 0; i < 6; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            for (int j = 0; j < 6; j++)
            {
                float* tmpp = &tmp[j][0];

                for (int i = 0; i < 6; i++)
                {
                    kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave
    // src = 36-inch-outch
    // dst = pb-pa-inch/pa-36-outch/pb
    kernel_tm_pack4.create(inch / 4, 36, outch / 4, (size_t)4u * 4 * 4, 4 * 4);

    for (int q = 0; q + 3 < outch; q += 4)
    {
        Mat g0 = kernel_tm_pack4.channel(q / 4);

        for (int k = 0; k < 36; k++)
        {
            float* g00 = g0.row(k);

            for (int p = 0; p + 3 < inch; p += 4)
            {
                for (int i = 0; i < 4; i++)
                {
                    for (int j = 0; j < 4; j++)
                    {
                        const float* k00 = kernel_tm.channel(q + j).row(p + i);
                        g00[0] = k00[k];
                        g00++;
                    }
                }
            }
        }
    }
}

// winograd43 3x3 stride-1 convolution driver for pack4 blobs.
//
// Steps: pad the input on the bottom/right so the output splits into whole
// 4x4 tiles, transform the input, run the dot stage against the pre-transformed
// kernel, transform the output (bias is forwarded to that stage), and finally
// cut the tile padding off into top_blob.
//
// bottom_blob : input blob
// top_blob    : pre-created output blob; its w/h/c define the result extent
// kernel_tm   : kernel already transformed for winograd43
// bias        : forwarded unchanged to the output transform
// opt         : supplies the workspace allocator and is passed to every stage
static void conv3x3s1_winograd43_pack4_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
    const int inch = bottom_blob.c;
    const size_t elemsize = bottom_blob.elemsize;
    const int elempack = bottom_blob.elempack;
    const int outch = top_blob.c;

    // output extent rounded up to a multiple of 4
    const int outw_tiled = (top_blob.w + 3) / 4 * 4;
    const int outh_tiled = (top_blob.h + 3) / 4 * 4;

    // pad to 4n+2
    const int w_padded = outw_tiled + 2;
    const int h_padded = outh_tiled + 2;

    Mat bottom_blob_bordered = bottom_blob;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h_padded - bottom_blob.h, 0, w_padded - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

    // BEGIN transform input
    Mat bottom_blob_tm;
    {
        const int tiles = (outw_tiled / 4) * (outh_tiled / 4);

        // 36 transformed values are stored per tile and input channel
        bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator);
        conv3x3s1_winograd43_transform_input_pack4_msa(bottom_blob_bordered, bottom_blob_tm, opt);
    }
    // the padded copy is not needed past this point
    bottom_blob_bordered = Mat();
    // END transform input

    // BEGIN dot
    Mat top_blob_tm;
    convolution_winograd_dot_pack4_msa(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
    // END dot

    // BEGIN transform output
    const bool needs_cut = (outw_tiled != top_blob.w) || (outh_tiled != top_blob.h);

    Mat top_blob_bordered;
    if (needs_cut)
    {
        // tiled extent is larger than the real output, use a scratch blob
        top_blob_bordered.create(outw_tiled, outh_tiled, outch, elemsize, elempack, opt.workspace_allocator);
    }
    else
    {
        // sizes already match, write straight into the output blob
        top_blob_bordered = top_blob;
    }

    conv3x3s1_winograd43_transform_output_pack4_msa(top_blob_tm, top_blob_bordered, bias, opt);
    // END transform output

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}

static void conv3x3s1_winograd23_transform_kernel_pack4_msa(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt)
{
    // winograd23 transform kernel
    Mat kernel_tm(4 * 4, inch, outch);

    const float ktm[4][3] = {
        {1.0f, 0.0f, 0.0f},
        {1.0f / 2, 1.0f / 2, 1.0f / 2},
        {1.0f / 2, -1.0f / 2, 1.0f / 2},
        {0.0f, 0.0f, 1.0f}
    };

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
            float* kernel_tm0 = kernel_tm.channel(p).row(q);

            // transform kernel
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;

            // h
            float tmp[4][3];
            for (int i = 0; i < 4; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            for (int j = 0; j < 4; j++)
            {
                float* tmpp = &tmp[j][0];

                for (int i = 0; i < 4; i++)
                {
                    kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave
    // src = 16-inch-outch
    // dst = pb-pa-inch/pa-16-outch/pb
    kernel_tm_pack4.create(inch / 4, 16, outch / 4, (size_t)4u * 4 * 4, 4 * 4);

    for (int q = 0; q + 3 < outch; q += 4)
    {
        Mat g0 = kernel_tm_pack4.channel(q / 4);

        for (int k = 0; k < 16; k++)
        {
            float* g00 = g0.row(k);

            for (int p = 0; p + 3 < inch; p += 4)
            {
                for (int i = 0; i < 4; i++)
                {
                    for (int j = 0; j < 4; j++)
                    {
                        const float* k00 = kernel_tm.channel(q + j).row(p + i);
                        g00[0] = k00[k];
                        g00++;
                    }
                }
            }
        }
    }
}

// winograd23 3x3 stride-1 convolution driver for pack4 blobs.
//
// Steps: pad the input on the bottom/right so the output splits into whole
// 2x2 tiles, transform the input, run the dot stage against the pre-transformed
// kernel, transform the output (bias is forwarded to that stage), and finally
// cut the tile padding off into top_blob.
//
// bottom_blob : input blob
// top_blob    : pre-created output blob; its w/h/c define the result extent
// kernel_tm   : kernel already transformed for winograd23
// bias        : forwarded unchanged to the output transform
// opt         : supplies the workspace allocator and is passed to every stage
static void conv3x3s1_winograd23_pack4_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt)
{
    const int inch = bottom_blob.c;
    const size_t elemsize = bottom_blob.elemsize;
    const int elempack = bottom_blob.elempack;
    const int outch = top_blob.c;

    // output extent rounded up to a multiple of 2
    const int outw_tiled = (top_blob.w + 1) / 2 * 2;
    const int outh_tiled = (top_blob.h + 1) / 2 * 2;

    // pad to 2n+2
    const int w_padded = outw_tiled + 2;
    const int h_padded = outh_tiled + 2;

    Mat bottom_blob_bordered = bottom_blob;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h_padded - bottom_blob.h, 0, w_padded - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

    // BEGIN transform input
    Mat bottom_blob_tm;
    {
        const int tiles = (outw_tiled / 2) * (outh_tiled / 2);

        // 16 transformed values are stored per tile and input channel
        bottom_blob_tm.create(tiles, 16, inch, elemsize, elempack, opt.workspace_allocator);
        conv3x3s1_winograd23_transform_input_pack4_msa(bottom_blob_bordered, bottom_blob_tm, opt);
    }
    // the padded copy is not needed past this point
    bottom_blob_bordered = Mat();
    // END transform input

    // BEGIN dot
    Mat top_blob_tm;
    convolution_winograd_dot_pack4_msa(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
    // END dot

    // BEGIN transform output
    const bool needs_cut = (outw_tiled != top_blob.w) || (outh_tiled != top_blob.h);

    Mat top_blob_bordered;
    if (needs_cut)
    {
        // tiled extent is larger than the real output, use a scratch blob
        top_blob_bordered.create(outw_tiled, outh_tiled, outch, elemsize, elempack, opt.workspace_allocator);
    }
    else
    {
        // sizes already match, write straight into the output blob
        top_blob_bordered = top_blob;
    }

    conv3x3s1_winograd23_transform_output_pack4_msa(top_blob_tm, top_blob_bordered, bias, opt);
    // END transform output

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}


================================================
FILE: src/layer/mips/convolution_3x3_pack8to1_int8.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_msa(const Mat& kernel, Mat& kernel_tm_pack8to1, int inch, int outch, const Option& opt)
{
    // winograd43 transform kernel
    Mat kernel_tm(6 * 6, inch, outch, (size_t)2u);

    const short ktm[6][3] = {
        {6, 0, 0},
        {-4, -4, -4},
        {-4, 4, -4},
        {1, 2, 4},
        {1, -2, 4},
        {0, 0, 6}
    };

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
            short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);

            // transform kernel
            const signed char* k0 = kernel0;
            const signed char* k1 = kernel0 + 3;
            const signed char* k2 = kernel0 + 6;

            // h
            short tmp[6][3];
            for (int i = 0; i < 6; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            for (int j = 0; j < 6; j++)
            {
                short* tmpp = &tmp[j][0];

                for (int i = 0; i < 6; i++)
                {
                    kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave
    // src = 36-inch-outch
    // dst = 4b-8a-inch/8a-36-outch/4b
    kernel_tm_pack8to1.create(8 * inch / 8, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4);

    int p = 0;
    for (; p + 3 < outch; p += 4)
    {
        const Mat k0 = kernel_tm.channel(p);
        const Mat k1 = kernel_tm.channel(p + 1);
        const Mat k2 = kernel_tm.channel(p + 2);
        const Mat k3 = kernel_tm.channel(p + 3);

        Mat g0 = kernel_tm_pack8to1.channel(p / 4);

        for (int k = 0; k < 36; k++)
        {
            short* g00 = g0.row<short>(k);

            for (int q = 0; q + 7 < inch; q += 8)
            {
                for (int i = 0; i < 8; i++)
                {
                    g00[0] = k0.row<const short>(q + i)[k];
                    g00[1] = k1.row<const short>(q + i)[k];
                    g00[2] = k2.row<const short>(q + i)[k];
                    g00[3] = k3.row<const short>(q + i)[k];

                    g00 += 4;
                }
            }
        }
    }
    for (; p < outch; p++)
    {
        const Mat k0 = kernel_tm.channel(p);

        Mat g0 = kernel_tm_pack8to1.channel(p / 4 + p % 4);

        for (int k = 0; k < 36; k++)
        {
            short* g00 = g0.row<short>(k);

            for (int q = 0; q + 7 < inch; q += 8)
            {
                for (int i = 0; i < 8; i++)
                {
                    g00[0] = k0.row<const short>(q + i)[k];

                    g00 += 1;
                }
            }
        }
    }
}

static void conv3x3s1_winograd43_pack8to1_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;
    //     size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // pad to 4n+2
    Mat bottom_blob_bordered = bottom_blob;

    outw = (outw + 3) / 4 * 4;
    outh = (outh + 3) / 4 * 4;

    w = outw + 2;
    h = outh + 2;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

    // BEGIN transform input
    Mat bottom_blob_tm;
    {
        int w_tiles = outw / 4;
        int h_tiles = outh / 4;
        const int tiles = w_tiles * h_tiles;

        bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator);
        conv3x3s1_winograd43_transform_input_pack8_int8_msa(bottom_blob_bordered, bottom_blob_tm, opt);
    }
    bottom_blob_bordered = Mat();
    // END transform input

    // BEGIN dot
    Mat top_blob_tm;
    convolution_winograd_dot_pack8to1_int8_msa(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
    // END dot

    // BEGIN transform output
    Mat top_blob_bordered;
    if (outw == top_blob.w && outh == top_blob.h)
    {
        top_blob_bordered = top_blob;
    }
    else
    {
        top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator);
    }
    {
        conv3x3s1_winograd43_transform_output_int8_msa(top_blob_tm, top_blob_bordered, opt);
    }
    // END transform output

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}


================================================
FILE: src/layer/mips/convolution_3x3_pack8to4_int8.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_msa(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt)
{
    // winograd43 transform kernel
    Mat kernel_tm(6 * 6, inch, outch, (size_t)2u);

    const short ktm[6][3] = {
        {6, 0, 0},
        {-4, -4, -4},
        {-4, 4, -4},
        {1, 2, 4},
        {1, -2, 4},
        {0, 0, 6}
    };

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9;
            short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);

            // transform kernel
            const signed char* k0 = kernel0;
            const signed char* k1 = kernel0 + 3;
            const signed char* k2 = kernel0 + 6;

            // h
            short tmp[6][3];
            for (int i = 0; i < 6; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            for (int j = 0; j < 6; j++)
            {
                short* tmpp = &tmp[j][0];

                for (int i = 0; i < 6; i++)
                {
                    kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // interleave
    // src = 36-inch-outch
    // dst = 4b-8a-inch/8a-36-outch/4b
    kernel_tm_pack8.create(inch / 8, 36, outch / 4, (size_t)2u * 32, 32);

    int q = 0;
    for (; q + 3 < outch; q += 4)
    {
        const Mat k0 = kernel_tm.channel(q);
        const Mat k1 = kernel_tm.channel(q + 1);
        const Mat k2 = kernel_tm.channel(q + 2);
        const Mat k3 = kernel_tm.channel(q + 3);

        Mat kernel_tm = kernel_tm_pack8.channel(q / 4);

        for (int k = 0; k < 36; k++)
        {
            short* g00 = kernel_tm.row<short>(k);

            for (int p = 0; p + 7 < inch; p += 8)
            {
                for (int i = 0; i < 8; i++)
                {
                    const short* k00 = k0.row<const short>(p + i);
                    const short* k10 = k1.row<const short>(p + i);
                    const short* k20 = k2.row<const short>(p + i);
                    const short* k30 = k3.row<const short>(p + i);

                    g00[0] = k00[k];
                    g00[1] = k10[k];
                    g00[2] = k20[k];
                    g00[3] = k30[k];

                    g00 += 4;
                }
            }
        }
    }
}

// int8 winograd43 3x3 stride-1 convolution driver, pack8 input to pack4 output.
//
// Steps: pad the input on the bottom/right so the output splits into whole
// 4x4 tiles, transform the input into 2-byte-per-lane storage, run the dot
// stage against the pre-transformed kernel, transform the output into a
// pack4 blob with 4 bytes per lane, and finally cut the tile padding off into
// top_blob.
//
// bottom_blob : input blob
// top_blob    : pre-created output blob; its w/h/c define the result extent
// kernel_tm   : kernel already transformed for this path
// opt         : supplies the workspace allocator and is passed to every stage
static void conv3x3s1_winograd43_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
{
    const int inch = bottom_blob.c;
    const int elempack = bottom_blob.elempack;
    const int outch = top_blob.c;

    // output extent rounded up to a multiple of 4
    const int outw_tiled = (top_blob.w + 3) / 4 * 4;
    const int outh_tiled = (top_blob.h + 3) / 4 * 4;

    // pad to 4n+2
    const int w_padded = outw_tiled + 2;
    const int h_padded = outh_tiled + 2;

    Mat bottom_blob_bordered = bottom_blob;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h_padded - bottom_blob.h, 0, w_padded - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);

    // BEGIN transform input
    Mat bottom_blob_tm;
    {
        const int tiles = (outw_tiled / 4) * (outh_tiled / 4);

        // 36 transformed values per tile, 2 bytes per packed lane
        bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator);
        conv3x3s1_winograd43_transform_input_pack8_int8_msa(bottom_blob_bordered, bottom_blob_tm, opt);
    }
    // the padded copy is not needed past this point
    bottom_blob_bordered = Mat();
    // END transform input

    // BEGIN dot
    Mat top_blob_tm;
    convolution_winograd_dot_pack8to4_int8_msa(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
    // END dot

    // BEGIN transform output
    const bool needs_cut = (outw_tiled != top_blob.w) || (outh_tiled != top_blob.h);

    Mat top_blob_bordered;
    if (needs_cut)
    {
        // tiled extent is larger than the real output, use a scratch blob
        top_blob_bordered.create(outw_tiled, outh_tiled, outch, 4u * 4, 4, opt.workspace_allocator);
    }
    else
    {
        // sizes already match, write straight into the output blob
        top_blob_bordered = top_blob;
    }

    conv3x3s1_winograd43_transform_output_pack4_int8_msa(top_blob_tm, top_blob_bordered, opt);
    // END transform output

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
}


================================================
FILE: src/layer/mips/convolution_7x7_pack1to4.h
================================================
[File too large to display: 33.4 KB]

================================================
FILE: src/layer/mips/convolution_int8.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Direct (non-vectorised) int8 convolution with int32 accumulation.
//
// For every output position, the products of int8 inputs and int8 weights are
// summed over all input channels and kernel taps, and the int sum is stored in
// top_blob. No bias, requantisation or activation is applied here.
//
// bottom_blob      : input, read as signed char rows per channel
// top_blob         : pre-created output, written as int
// weight_data_int8 : weights, read as signed char with maxk * channels values
//                    per output channel
// kernel_*/dilation_*/stride_* : convolution geometry
// opt              : num_threads is used for the per-output-channel loop
static void convolution_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    const int in_w = bottom_blob.w;
    const int channels = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets: element offset of each tap relative to the window origin
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    for (int ky = 0; ky < kernel_h; ky++)
    {
        for (int kx = 0; kx < kernel_w; kx++)
        {
            space_ofs[ky * kernel_w + kx] = ky * in_w * dilation_h + kx * dilation_w;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        // first weight of this output channel
        const signed char* kernel_base = (const signed char*)weight_data_int8 + maxk * channels * p;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                int sum = 0;

                const signed char* kptr = kernel_base;

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        const signed char val = sptr[space_ofs[k]];
                        const signed char wt = kptr[k];
                        sum += val * wt;
                    }

                    kptr += maxk;
                }

                // rows are contiguous, so a running pointer walks the whole channel
                *outptr++ = sum;
            }
        }
    }
}


================================================
FILE: src/layer/mips/convolution_mips.cpp
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "convolution_mips.h"

#include "benchmark.h"
#include "cpu.h"
#include "layer_type.h"

#if __mips_msa
#include <msa.h>
#endif // __mips_msa

#include "mips_activation.h"
#include "mips_usability.h"

#include "cpu.h"

namespace ncnn {

#include "convolution_sgemm.h"
#include "convolution_winograd_transform.h"
#include "convolution_winograd_dot.h"
#include "convolution_1x1.h"
#include "convolution_3x3.h"

#if NCNN_INT8
#include "convolution_sgemm_int8.h"
#include "convolution_winograd_transform_int8.h"
#include "convolution_winograd_dot_int8.h"
#include "convolution_1x1_int8.h"
#include "convolution_3x3_int8.h"
#include "convolution_int8.h"
#endif // NCNN_INT8

#if __mips_msa
#include "convolution_pack4.h"
#include "convolution_pack1to4.h"
#include "convolution_pack4to1.h"

#include "convolution_sgemm_pack4.h"
#include "convolution_sgemm_pack4to1.h"
#include "convolution_winograd_transform_pack4.h"
#include "convolution_winograd_dot_pack4.h"
#include "convolution_1x1_pack4.h"
#include "convolution_1x1_pack4to1.h"
#include "convolution_3x3_pack4.h"
#include "convolution_3x3_pack1to4.h"
#include "convolution_7x7_pack1to4.h"

#if NCNN_INT8
#include "convolution_pack8to4_int8.h"
#include "convolution_pack1to4_int8.h"
#include "convolution_pack8to1_int8.h"
#include "convolution_sgemm_pack8to4_int8.h"
#include "convolution_sgemm_pack1to4_int8.h"
#include "convolution_sgemm_pack8to1_int8.h"
#include "convolution_winograd_transform_pack4_int8.h"
#include "convolution_winograd_transform_pack8_int8.h"
#include "convolution_winograd_dot_pack8to4_int8.h"
#include "convolution_winograd_dot_pack8to1_int8.h"
#include "convolution_1x1_pack8to4_int8.h"
#include "convolution_1x1_pack1to4_int8.h"
#include "convolution_1x1_pack8to1_int8.h"
#include "convolution_3x3_pack8to4_int8.h"
#include "convolution_3x3_pack8to1_int8.h"
#endif // NCNN_INT8
#endif // __mips_msa

// Default constructor: no activation layer exists until create_pipeline()
// builds one; packed layouts are advertised only when compiled with MSA.
Convolution_mips::Convolution_mips()
{
    activation = 0;

#if __mips_msa
    support_packing = true;
#endif // __mips_msa
}

// Repacks fp32 convolution weights for the packed (elempack / out_elempack)
// direct convolution kernels.
//
//   src = kw-kh-inch-outch
//   dst = pb-pa-kw-kh-inch/pa-outch/pb
//
// weight_data    : flat weights, reshaped here to (maxk, num_input, num_output)
// weight_data_tm : receives the repacked weights
// num_input / num_output : channel counts; only whole groups of
//                  elempack / out_elempack channels are copied
// kernel_w / kernel_h    : kernel size, maxk = kernel_w * kernel_h
// elempack / out_elempack: input / output packing factors
static void convolution_transform_kernel_packed_msa(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
{
    const int maxk = kernel_w * kernel_h;

    Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

    weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);

    for (int ob = 0; ob + out_elempack <= num_output; ob += out_elempack)
    {
        // one destination channel per group of out_elempack output channels
        float* dst = weight_data_tm.channel(ob / out_elempack);

        for (int ib = 0; ib + elempack <= num_input; ib += elempack)
        {
            for (int k = 0; k < maxk; k++)
            {
                // input lane i is the slow index, output lane j the fast one
                for (int i = 0; i < elempack; i++)
                {
                    for (int j = 0; j < out_elempack; j++)
                    {
                        const float* src = weight_data_r2.channel(ob + j).row(ib + i);

                        *dst++ = src[k];
                    }
                }
            }
        }
    }
}

int Convolution_mips::create_pipeline(const Option& opt)
{
    if (dynamic_weight)
        return 0;

    activation = create_activation_layer(activation_type, activation_params, opt);

#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return create_pipeline_int8_mips(opt);
    }
#endif

    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    int elempack = 1;
    int out_elempack = 1;
#if __mips_msa
    if (opt.use_packing_layout)
    {
        elempack = num_input % 4 == 0 ? 4 : 1;
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif

#if __mips_msa
    // pack4
    if (elempack == 4 && out_elempack == 4)
    {
        if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            if ((opt.use_winograd63_convolution && num_input >= 8 && num_output >= 8 && num_input <= 64 && num_output <= 64) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution))
                conv3x3s1_winograd63_transform_kernel_pack4_msa(weight_data, weight_winograd63_data, num_input, num_output, opt);
            else if ((opt.use_winograd43_convolution && num_input >= 8 && num_output >= 8) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution))
                conv3x3s1_winograd43_transform_kernel_pack4_msa(weight_data, weight_winograd43_data, num_input, num_output, opt);
            else // if (opt.use_winograd23_convolution)
                conv3x3s1_winograd23_transform_kernel_pack4_msa(weight_data, weight_winograd23_data, num_input, num_output, opt);
        }
        else
        {
            convolution_transform_kernel_packed_msa(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }

    // pack1ton
    if (elempack == 1 && out_elempack == 4)
    {
        convolution_transform_kernel_packed_msa(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
    }

    // pack4to1
    if (elempack == 4 && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_pack4to1_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack4to1_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_pack4to1_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_msa(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }
#endif // __mips_msa

    // pack1
    if (elempack == 1 && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || !opt.use_winograd23_convolution)
            {
                conv3x3s1_winograd43_transform_kernel_msa(weight_data, weight_winograd43_data, num_input, num_output, opt);
            }
            else if (opt.use_winograd23_convolution)
            {
                conv3x3s1_winograd23_transform_kernel_msa(weight_data, weight_winograd23_data, num_input, num_output, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            weight_data_tm = weight_data;
        }
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

// Tears down and frees the activation layer, if one exists, and clears the
// pointer so a repeated call is a no-op. Always returns 0.
int Convolution_mips::destroy_pipeline(const Option& opt)
{
    if (!activation)
        return 0;

    activation->destroy_pipeline(opt);
    delete activation;
    activation = 0;

    return 0;
}

int Convolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (opt.use_int8_inference && int8_scale_term)
    {
        return forward_int8_mips(bottom_blob, top_blob, opt);
    }
#endif

    // flattened blob, implement as InnerProduct
    if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
    {
        Mat bottom_blob_3d;
        if (bottom_blob.elemsize % 16 == 0)
        {
            bottom_blob_3d = bottom_blob;
            bottom_blob_3d.dims = 3;
            bottom_blob_3d.w = 1;
            bottom_blob_3d.h = 1;
            bottom_blob_3d.c = bottom_blob.w;
            bottom_blob_3d.cstep = 1;
        }
        else
        {
            bottom_blob_3d = bottom_blob.reshape(1, 1, bottom_blob.w, opt.workspace_allocator);
        }

        Mat top_blob_3d;
        int ret = forward(bottom_blob_3d, top_blob_3d, opt);
        if (ret != 0)
            return ret;

        if (top_blob_3d.elemsize % 16 == 0)
        {
            top_blob = top_blob_3d;
            top_blob.dims = 1;
            top_blob.w = top_blob_3d.c;
            top_blob.h = 1;
            top_blob.c = 1;
            top_blob.cstep = top_blob_3d.c;
        }
        else
        {
            top_blob = top_blob_3d.reshape(top_blob_3d.c, opt.blob_allocator);
        }

        return 0;
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    //     NCNN_LOGE("Convolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = 1;
#if __mips_msa
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int num_input = channels * elempack;

#if __mips_msa
    if (elempack == 4 && out_elempack == 4)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack4_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_pack4_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            if ((opt.use_winograd63_convolution && num_input >= 8 && num_output >= 8 && num_input <= 64 && num_output <= 64) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution))
                conv3x3s1_winograd63_pack4_msa(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, opt);
            else if ((opt.use_winograd43_convolution && num_input >= 8 && num_output >= 8) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution))
                conv3x3s1_winograd43_pack4_msa(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt);
            else // if (opt.use_winograd23_convolution)
                conv3x3s1_winograd23_pack4_msa(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_pack4_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_pack4_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_pack1to4_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv3x3s2_pack1to4_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv7x7s2_pack1to4_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_pack1to4_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }

    if (elempack == 4 && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack4to1_msa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_pack4to1_msa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_pack4to1_msa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            convolution_pack4to1_msa(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt);
        }
    }
#endif // __mips_msa

    if (elempack == 1 && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_msa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || !opt.use_winograd23_convolution)
            {
                conv3x3s1_winograd43_msa(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt);
            }
            else if (opt.use_winograd23_convolution)
            {
                conv3x3s1_winograd23_msa(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt);
            }

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_msa(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            const int maxk = kernel_w * kernel_h;

            // kernel offsets
            std::vector<int> _space_ofs(maxk);
            int* space_ofs = &_space_ofs[0];
            {
                int p1 = 0;
                int p2 = 0;
                int gap = w * dilation_h - kernel_w * dilation_w;
                for (int i = 0; i < kernel_h; i++)
                {
                    for (int j = 0; j < kernel_w; j++)
                    {
                        space_ofs[p1] = p2;
                        p1++;
                        p2 += dilation_w;
                    }
                    p2 += gap;
                }
            }

            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p = 0; p < num_output; p++)
            {
                float* outptr = top_blob.channel(p);

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                        {
                            sum = bias_data[p];
                        }

                        const float* kptr = (const float*)weight_data_tm + maxk * channels * p;

                        // channels
                        for (int q = 0; q < channels; q++)
                        {
                            const Mat m = bottom_blob_bordered.channel(q);
                            const float* sptr = m.row(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                float val = sptr[space_ofs[k]];
                                float wt = kptr[k];
                                sum += val * wt;
                            }

                            kptr += maxk;
                        }

                        sum = activation_ss(sum, activation_type, activation_params);

                        outptr[j] = sum;
                    }

                    outptr += outw;
                }
            }
        }
    }

    return 0;
}

int Convolution_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
    const Mat& bottom_blob = bottom_blobs[0];
    const Mat& _weight_data = bottom_blobs[1];
    Mat& top_blob = top_blobs[0];

    const int _kernel_w = _weight_data.w;
    const int _kernel_h = _weight_data.h;
    const int _num_output = _weight_data.c * _weight_data.elempack;

    Mat weight_data_flattened;
    flatten(_weight_data, weight_data_flattened, opt);
    if (weight_data_flattened.empty())
        return -100;

    // weight_data_flattened as pack1
    weight_data_flattened.w *= weight_data_flattened.elempack;
    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
    weight_data_flattened.elempack = 1;

    Mat bias_data_flattened;
    if (bias_term)
    {
        const Mat& _bias_data = bottom_blobs[2];
        flatten(_bias_data, bias_data_flattened, opt);
        if (bias_data_flattened.empty())
            return -100;

        // bias_data_flattened as pack1
        bias_data_flattened.w *= bias_data_flattened.elempack;
        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
        bias_data_flattened.elempack = 1;
    }

    ncnn::Layer* op = ncnn::create_layer_cpu(ncnn::LayerType::Convolution);

    ncnn::ParamDict pd;
    pd.set(0, _num_output);
    pd.set(1, _kernel_w);
    pd.set(11, _kernel_h);
    pd.set(2, dilation_w);
    pd.set(12, dilation_h);
    pd.set(3, stride_w);
    pd.set(13, stride_h);
    pd.set(4, pad_left);
    pd.set(15, pad_right);
    pd.set(14, pad_top);
    pd.set(16, pad_bottom);
    pd.set(18, pad_value);
    pd.set(5, bias_term);
    pd.set(6, weight_data_flattened.w);
    pd.set(8, int8_scale_term);
    pd.set(9, activation_type);
    pd.set(10, activation_params);

    op->load_param(pd);

    ncnn::Mat weights[2];
    weights[0] = weight_data_flattened;
    weights[1] = bias_data_flattened;

    op->load_model(ncnn::ModelBinFromMatArray(weights));

    op->create_pipeline(opt);

    op->forward(bottom_blob, top_blob, opt);

    op->destroy_pipeline(opt);

    delete op;

    return 0;
}

#if NCNN_INT8
static void convolution_transform_kernel_packed_int8_msa(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
{
    const int maxk = kernel_w * kernel_h;

    // src = kw-kh-inch-outch
    // dst = pa-pb-kw-kh-inch/pa-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack);

        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            signed char* g00 = weight_data_tm.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                for (int k = 0; k < maxk; k++)
                {
                    for (int i = 0; i < out_elempack; i++)
                    {
                        for (int j = 0; j < elempack; j++)
                        {
                            const signed char* k00 = weight_data_r2.channel(q + i).row<const signed char>(p + j);

                            g00[0] = k00[k];

                            g00++;
                        }
                    }
                }
            }
        }
    }
}

int Convolution_mips::create_pipeline_int8_mips(const Option& opt)
{
    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    int elempack = 1;
    int out_elempack = 1;
#if __mips_msa
    if (opt.use_packing_layout)
    {
        elempack = num_input % 8 == 0 ? 8 : 1;
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __mips_msa

#if __mips_msa
    if (elempack == 8 && out_elempack == 4)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd43_transform_kernel_pack8to4_int8_msa(weight_data, weight_winograd43_data, num_input, num_output, opt);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            convolution_im2col_sgemm_transform_kernel_pack1to4_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }

    if (elempack == 8 && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd43_transform_kernel_pack8to1_int8_msa(weight_data, weight_winograd43_data, num_input, num_output, opt);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to1_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            convolution_transform_kernel_packed_int8_msa(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
        }
    }
#endif // __mips_msa

    if (elempack == 1 && out_elempack == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd43_transform_kernel_int8_msa(weight_data, weight_winograd43_data, num_input, num_output, opt);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_int8_msa(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h);
        }
        else
        {
            weight_data_tm = weight_data;
        }
    }

    scale_in_data.create(num_output);
    for (int p = 0; p < num_output; p++)
    {
        // requantize and relu
        float scale_in;
        if (weight_data_int8_scales[p] == 0)
            scale_in = 0;
        else
            scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

        scale_in_data[p] = scale_in;
    }

    if (opt.lightmode)
        weight_data.release();

    return 0;
}

// int8 convolution forward.
//
// Steps:
//   1. quantize the input to int8 unless its element width is already 8 bits
//   2. pad the input
//   3. run the int8 kernel matching the input/output packing into an int32
//      accumulator blob (top_blob_int32)
//   4. either requantize to int8 (when int8_scale_term > 100) or dequantize
//      to fp32 and apply the activation layer, writing top_blob
//
// Returns 0 on success, -100 when any intermediate or output blob could not be
// allocated, or the non-zero status of the activation layer.
int Convolution_mips::forward_int8_mips(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

    Mat bottom_blob_int8 = bottom_blob;
    if (elembits != 8)
    {
        // the quantized input is a temporary, so take it from the workspace allocator
        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
        if (bottom_blob_int8.empty())
            return -100;
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    int w = bottom_blob_bordered.w;
    int h = bottom_blob_bordered.h;
    int channels = bottom_blob_bordered.c;
    int elempack = bottom_blob_bordered.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    // int8 output (1 byte per element) vs. fp32 output (4 bytes per element)
    bool use_int8_requantize = int8_scale_term > 100;
    int out_elempack = 1;
#if __mips_msa
    if (opt.use_packing_layout)
    {
        if (use_int8_requantize)
            out_elempack = num_output % 8 == 0 ? 8 : 1;
        else
            out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __mips_msa
    size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int num_input = channels * elempack;

    // the int32 accumulator is packed independently of the final output
    int out_elempack_int32 = 1;
#if __mips_msa
    if (opt.use_packing_layout)
    {
        out_elempack_int32 = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __mips_msa

    Mat top_blob_int32;
    top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator);
    if (top_blob_int32.empty())
        return -100;

#if __mips_msa
    if (elempack == 8 && out_elempack_int32 == 4)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd43_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            convolution_pack8to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
    }

    if (elempack == 1 && out_elempack_int32 == 4)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            convolution_im2col_sgemm_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            convolution_pack1to4_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
    }

    if (elempack == 8 && out_elempack_int32 == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd43_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
        }
        else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8)
        {
            convolution_im2col_sgemm_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            convolution_pack8to1_int8_msa(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
    }
#endif // __mips_msa

    if (elempack == 1 && out_elempack_int32 == 1)
    {
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt);
        }
        else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv3x3s1_winograd43_int8_msa(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_int8_msa(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
    }

#if __mips_msa
    if (opt.use_packing_layout)
    {
        // NCNN_LOGE("top_blob_int32  %d  %d", top_blob_int32.c, top_blob_int32.elempack);
        if (use_int8_requantize)
        {
            // The int32 accumulator is pack4 at most while the int8 output is
            // pack8 or pack1, so repack it to match top_blob before requantizing.
            // TODO implement winograd sgemm packed int8 pack1 output
            if (top_blob_int32.elempack == 4 && top_blob_int32.c % 2 == 1)
            {
                Mat tmp;
                convert_packing(top_blob_int32, tmp, 1, opt);
                if (tmp.empty())
                    return -100;
                top_blob_int32 = tmp;
            }
            if (top_blob_int32.elempack == 4 && top_blob_int32.c % 2 == 0)
            {
                Mat tmp;
                convert_packing(top_blob_int32, tmp, 8, opt);
                if (tmp.empty())
                    return -100;
                top_blob_int32 = tmp;
            }
        }
    }
#endif

    if (use_int8_requantize)
    {
        requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
    }
    else
    {
        dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

        if (activation)
        {
            int ret = activation->forward_inplace(top_blob, opt);
            if (ret != 0)
                return ret;
        }
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn


================================================
FILE: src/layer/mips/convolution_mips.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_CONVOLUTION_MIPS_H
#define LAYER_CONVOLUTION_MIPS_H

#include "convolution.h"

namespace ncnn {

// MIPS implementation of the Convolution layer.
//
// The matching .cpp selects between 1x1 sgemm, 3x3 winograd, im2col-sgemm and
// direct kernels based on the kernel shape, the input/output packing and the
// Option flags; the MSA packed variants are compiled only when __mips_msa is set.
class Convolution_mips : public Convolution
{
public:
    Convolution_mips();

    virtual int create_pipeline(const Option& opt);
    virtual int destroy_pipeline(const Option& opt);

    // Single-input forward using the weights owned by the layer.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

    // Multi-input forward: the weight (and, when bias_term is set, the bias)
    // is read from bottom_blobs[1] (and bottom_blobs[2]) at run time.
    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_INT8
    // int8 counterparts: weight repacking / scale table setup, and the
    // quantized forward pass.
    int create_pipeline_int8_mips(const Option& opt);
    int forward_int8_mips(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif

public:
    // Activation layer run in place on the output after kernels that do not
    // apply the activation themselves; skipped when null.
    // NOTE(review): presumably created in create_pipeline — confirm.
    Layer* activation;

    // Transformed weights handed to the direct kernels (and to some of the
    // packed fp32 1x1 / im2col-sgemm kernels).
    Mat weight_data_tm;
    // Transformed weights handed to the 1x1 / im2col-sgemm kernels.
    Mat weight_sgemm_data;
    // Transformed weights for the 3x3 stride-1 winograd23 / 43 / 63 kernels.
    Mat weight_winograd23_data;
    Mat weight_winograd43_data;
    Mat weight_winograd63_data;

#if NCNN_INT8
    // Per-output-channel scale filled by create_pipeline_int8_mips:
    // 1 / (input scale * weight scale), or 0 when the weight scale is 0.
    Mat scale_in_data;
#endif
};

} // namespace ncnn

#endif // LAYER_CONVOLUTION_MIPS_H


================================================
FILE: src/layer/mips/convolution_mips_mmi.cpp
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "cpu.h"
#include "mat.h"
#if __mips_loongson_mmi
#include "loongson_mmi.h"
#endif // __mips_loongson_mmi

namespace ncnn {

#include "convolution_sgemm_int8.h"
#include "convolution_winograd_transform_int8.h"
#include "convolution_winograd_dot_int8.h"
#include "convolution_3x3_int8.h"

// pack1
// Externally visible Loongson MMI entry point for the pack1 int8 im2col sgemm.
// Forwards all arguments unchanged to im2col_sgemm_int8_msa.
// NOTE(review): that function presumably comes from convolution_sgemm_int8.h
// included above, compiled in this translation unit with MMI enabled — confirm.
void im2col_sgemm_int8_loongson_mmi(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
{
    im2col_sgemm_int8_msa(bottom_im2col, top_blob, kernel, opt);
}

// Loongson MMI entry point for the int8 im2col-sgemm weight transform.
// Forwards all arguments unchanged to
// convolution_im2col_sgemm_transform_kernel_int8_msa.
void convolution_im2col_sgemm_transform_kernel_int8_loongson_mmi(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
    convolution_im2col_sgemm_transform_kernel_int8_msa(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
}

// Loongson MMI entry point for the int8 3x3s1 winograd43 weight transform.
// Forwards all arguments unchanged to
// conv3x3s1_winograd43_transform_kernel_int8_msa.
void conv3x3s1_winograd43_transform_kernel_int8_loongson_mmi(const Mat& kernel, Mat& kernel_tm_packed, int inch, int outch, const Option& opt)
{
    conv3x3s1_winograd43_transform_kernel_int8_msa(kernel, kernel_tm_packed, inch, outch, opt);
}

// Loongson MMI entry point for the int8 winograd dot stage.
// Forwards all arguments unchanged to convolution_winograd_dot_int8_msa.
void convolution_winograd_dot_int8_loongson_mmi(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt)
{
    convolution_winograd_dot_int8_msa(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt);
}

} // namespace ncnn


================================================
FILE: src/layer/mips/convolution_pack1to4.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convolution_pack1to4_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    const float* bias_data_ptr = bias_data;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        float* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                v4f32 _sum = (v4f32)__msa_fill_w(0);

                if (bias_data_ptr)
                {
                    _sum = (v4f32)__msa_ld_w(bias_data_ptr + p * 4, 0);
                }

                const float* kptr = (const float*)weight_data_pack1ton + maxk * channels * p * 4;

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const float* sptr = m.row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++) // 29.23
                    {
                        v4f32 _val = __msa_fill_w_f32(sptr[space_ofs[k]]);
                        v4f32 _w = (v4f32)__msa_ld_w(kptr, 0);
                        _sum = __msa_fmadd_w(_sum, _val, _w);

                        kptr += 4;
                    }
                }

                _sum = activation_ps(_sum, activation_type, activation_params);

                __msa_st_w((v4i32)_sum, outptr + j * 4, 0);
            }

            outptr += outw * 4;
        }
    }
}


================================================
FILE: src/layer/mips/convolution_pack1to4_int8.h
================================================
[File too large to display: 2.2 KB]

================================================
FILE: src/layer/mips/convolution_pack4.h
================================================
[File too large to display: 3.0 KB]

================================================
FILE: src/layer/mips/convolution_pack4to1.h
================================================
[File too large to display: 2.4 KB]

================================================
FILE: src/layer/mips/convolution_pack8to1_int8.h
================================================
[File too large to display: 2.2 KB]

================================================
FILE: src/layer/mips/convolution_pack8to4_int8.h
================================================
// Copyright 2022 Tencent
// SPDX-License-Identifier: BSD-3-Clause

static void convolution_pack8to4_int8_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                v4i32 _sum0 = __msa_fill_w(0);
                v4i32 _sum1 = __msa_fill_w(0);
                v4i32 _sum2 = __msa_fill_w(0);
                v4i32 _sum3 = __msa_fill_w(0);

                const signed char* kptr = weight_data_int8.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w * 8;

                    for (int k = 0; k < maxk; k++)
                    {
                        v16i8 _val = __msa_ld_b(sptr + space_ofs[k] * 8, 0);
                        v8i16 _val16 = (v8i16)__msa_ilvr_b(__msa_clti_s_b(_val, 0), _val);

                        v16i8 _w01 = __msa_ld_b(kptr, 0);
                        v16i8 _w23 = __msa_ld_b(kptr + 16, 0);
                        v16i8 _extw01 = __msa_clti_s_b(_w01, 0);
                        v16i8 _extw23 = __msa_clti_s_b(_w23, 0);
                        v8i16 _w0 = (v8i16)__msa_ilvr_b(_extw01, _w01);
                        v8i16 _w1 = (v8i16)__msa_ilvl_b(_extw01, _w01);
                        v8i16 _w2 = (v8i16)__msa_ilvr_b(_extw23, _w23);
                        v8i16 _w3 = (v8i16)__msa_ilvl_b(_extw23, _w23);

                        v8i16 _s0 = __msa_mulv_h(_val16, _w0);
                        v8i16 _s1 = __msa_mulv_h(_val16, _w1);
                        v8i16 _s2 = __msa_mulv_h(_val16, _w2);
                        v8i16 _s3 = __msa_mulv_h(_val16, _w3);

                        _sum0 = __msa_addv_w(_sum0, __msa_hadd_s_w(_s0, _s0));
                        _sum1 = __msa_addv_w(_sum1, __msa_hadd_s_w(_s1, _s1));
                        _sum2 = __msa_addv_w(_sum2, __msa_hadd_s_w(_s2, _s2));
                        _sum3 = __msa_addv_w(_sum3, __msa_hadd_s_w(_s3, _s3));

                        kptr += 32;
                    }
                }

                // transpose 4x4
                {
                    v4i32 _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = __msa_ilvr_w(_sum1, _sum0);
                    _tmp1 = __msa_ilvr_w(_sum3, _sum2);
                    _tmp2 = __msa_ilvl_w(_sum1, _sum0);
                    _tmp3 = __msa_ilvl_w(_sum3, _sum2);
                    _sum0 = (v4i32)__msa_ilvr_d((v2i64)_tmp1, (v2i64)_tmp0);
                    _sum1 = (v4i32)__msa_ilvl_d((v2i64)_tmp1, (v2i64)_tmp0);
                    _sum2 = (v4i32)__msa_ilvr_d((v2i64)_tmp3, (v2i64)_tmp2);
                    _sum3 = (v4i32)__msa_ilvl_d((v2i64)_tmp3, (v2i64)_tmp2);
                }

                _sum0 = __msa_addv_w(_sum0, _sum1);
                _sum2 = __msa_addv_w(_sum2, _sum3);

                _sum0 = __msa_addv_w(_sum0, _sum2);

                __msa_st_w(_sum0, outptr + j * 4, 0);
            }

            outptr += outw * 4;
        }
    }
}


================================================
FILE: src/layer/mips/convolution_sgemm.h
================================================
// Copyright 2021 Tencent
// SPDX-License-Identifier: BSD-3-Clause

// Matrix-multiply stage of the im2col convolution.
//   bottom_im2col : im2col buffer; w = number of output positions (size),
//                   h = maxk, c = inch (read from its w/h/c fields below)
//   top_blob      : destination, one channel per output channel, written
//                   sequentially for `size` positions
//   kernel        : interleaved weights, addressed by the same channel index
//                   formula that convolution_im2col_sgemm_transform_kernel_msa
//                   uses when writing them (groups of 8/4/1 output channels
//                   with MSA, 2/1 without)
//   _bias         : optional per-output-channel bias; when the float pointer
//                   obtained from it is null the accumulators start at 0
//   opt           : supplies num_threads and the workspace allocator
static void im2col_sgemm_msa(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    // Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator);

    const int size = bottom_im2col.w;
    const int maxk = bottom_im2col.h;
    const int inch = bottom_im2col.c;

    const int outch = top_blob.c;

    const float* bias = _bias;

    // permute
    // Repack the im2col data so the multiply loops below read it linearly:
    // every full group of 4 consecutive positions shares one tmp channel
    // (4 floats per (q, k) pair), each leftover position gets its own channel.
    Mat tmp;
    if (size >= 4)
        tmp.create(4 * maxk, inch, size / 4 + size % 4, 4u, 1, opt.workspace_allocator);
    else
        tmp.create(maxk, inch, size, 4u, 1, opt.workspace_allocator);
    {
        int nn_size = size / 4;

        // full groups of 4 positions
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = ii * 4;

            float* tmpptr = tmp.channel(i / 4);

            for (int q = 0; q < inch; q++)
            {
                const float* img0 = (const float*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
#if __mips_msa
                    __msa_st_w(__msa_ld_w(img0, 0), tmpptr, 0);
#else
                    tmpptr[0] = img0[0];
                    tmpptr[1] = img0[1];
                    tmpptr[2] = img0[2];
                    tmpptr[3] = img0[3];
#endif
                    // next kernel tap is one im2col row (size floats) further
                    img0 += size;
                    tmpptr += 4;
                }
            }
        }

        int remain_size_start = nn_size * 4;

        // leftover positions (fewer than 4), one tmp channel each
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_size_start; i < size; i++)
        {
            // channel index: number of full groups plus position within the remainder
            float* tmpptr = tmp.channel(i / 4 + i % 4);

            for (int q = 0; q < inch; q++)
            {
                const float* img0 = (const float*)bottom_im2col.channel(q) + i;

                for (int k = 0; k < maxk; k++)
                {
                    tmpptr[0] = img0[0];
                    img0 += size;
                    tmpptr += 1;
                }
            }
        }
    }

#if __mips_msa
    // MSA path, stage 1: 8 output channels per iteration
    int nn_outch = outch >> 3;
    int remain_outch_start = nn_outch << 3;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 8;

        float* outptr0 = top_blob.channel(p);
        float* outptr1 = top_blob.channel(p + 1);
        float* outptr2 = top_blob.channel(p + 2);
        float* outptr3 = top_blob.channel(p + 3);
        float* outptr4 = top_blob.channel(p + 4);
        float* outptr5 = top_blob.channel(p + 5);
        float* outptr6 = top_blob.channel(p + 6);
        float* outptr7 = top_blob.channel(p + 7);

        // fall back to an all-zero bias when none is supplied
        const float zeros[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
        const float* biasptr = bias ? bias + p : zeros;

        int i = 0;
        // 4 positions x 8 output channels per iteration
        for (; i + 3 < size; i += 4)
        {
            const float* tmpptr = tmp.channel(i / 4);
            const float* kptr = kernel.channel(p / 8);

            int nn = inch * maxk; // inch always > 0

            v4f32 _sum0 = __msa_fill_w_f32(biasptr[0]);
            v4f32 _sum1 = __msa_fill_w_f32(biasptr[1]);
            v4f32 _sum2 = __msa_fill_w_f32(biasptr[2]);
            v4f32 _sum3 = __msa_fill_w_f32(biasptr[3]);
            v4f32 _sum4 = __msa_fill_w_f32(biasptr[4]);
            v4f32 _sum5 = __msa_fill_w_f32(biasptr[5]);
            v4f32 _sum6 = __msa_fill_w_f32(biasptr[6]);
            v4f32 _sum7 = __msa_fill_w_f32(biasptr[7]);

            for (int q = 0; q < nn; q++)
            {
                __builtin_prefetch(tmpptr + 16);
                __builtin_prefetch(kptr + 32);
                // _val: the same (channel, tap) input for 4 positions;
                // each of the 8 weights is broadcast and accumulated into its own sum
                v4f32 _val = (v4f32)__msa_ld_w(tmpptr, 0);
                v4i32 _w0123 = __msa_ld_w(kptr, 0);
                v4i32 _w4567 = __msa_ld_w(kptr + 4, 0);
                _sum0 = __msa_fmadd_w(_sum0, _val, (v4f32)__msa_splati_w(_w0123, 0));
                _sum1 = __msa_fmadd_w(_sum1, _val, (v4f32)__msa_splati_w(_w0123, 1));
                _sum2 = __msa_fmadd_w(_sum2, _val, (v4f32)__msa_splati_w(_w0123, 2));
                _sum3 = __msa_fmadd_w(_sum3, _val, (v4f32)__msa_splati_w(_w0123, 3));
                _sum4 = __msa_fmadd_w(_sum4, _val, (v4f32)__msa_splati_w(_w4567, 0));
                _sum5 = __msa_fmadd_w(_sum5, _val, (v4f32)__msa_splati_w(_w4567, 1));
                _sum6 = __msa_fmadd_w(_sum6, _val, (v4f32)__msa_splati_w(_w4567, 2));
                _sum7 = __msa_fmadd_w(_sum7, _val, (v4f32)__msa_splati_w(_w4567, 3));

                tmpptr += 4;
                kptr += 8;
            }

            __msa_st_w((v4i32)_sum0, outptr0, 0);
            __msa_st_w((v4i32)_sum1, outptr1, 0);
            __msa_st_w((v4i32)_sum2, outptr2, 0);
            __msa_st_w((v4i32)_sum3, outptr3, 0);
            __msa_st_w((v4i32)_sum4, outptr4, 0);
            __msa_st_w((v4i32)_sum5, outptr5, 0);
            __msa_st_w((v4i32)_sum6, outptr6, 0);
            __msa_st_w((v4i32)_sum7, outptr7, 0);

            outptr0 += 4;
            outptr1 += 4;
            outptr2 += 4;
            outptr3 += 4;
            outptr4 += 4;
            outptr5 += 4;
            outptr6 += 4;
            outptr7 += 4;
        }
        // leftover positions: scalar, 1 position x 8 output channels
        for (; i < size; i++)
        {
            const float* tmpptr = tmp.channel(i / 4 + i % 4);
            const float* kptr = kernel.channel(p / 8);

            int nn = inch * maxk; // inch always > 0

            float sum0 = biasptr[0];
            float sum1 = biasptr[1];
            float sum2 = biasptr[2];
            float sum3 = biasptr[3];
            float sum4 = biasptr[4];
            float sum5 = biasptr[5];
            float sum6 = biasptr[6];
            float sum7 = biasptr[7];

            for (int q = 0; q < nn; q++)
            {
                sum0 += tmpptr[0] * kptr[0];
                sum1 += tmpptr[0] * kptr[1];
                sum2 += tmpptr[0] * kptr[2];
                sum3 += tmpptr[0] * kptr[3];
                sum4 += tmpptr[0] * kptr[4];
                sum5 += tmpptr[0] * kptr[5];
                sum6 += tmpptr[0] * kptr[6];
                sum7 += tmpptr[0] * kptr[7];
                tmpptr++;
                kptr += 8;
            }

            outptr0[0] = sum0;
            outptr1[0] = sum1;
            outptr2[0] = sum2;
            outptr3[0] = sum3;
            outptr4[0] = sum4;
            outptr5[0] = sum5;
            outptr6[0] = sum6;
            outptr7[0] = sum7;

            outptr0++;
            outptr1++;
            outptr2++;
            outptr3++;
            outptr4++;
            outptr5++;
            outptr6++;
            outptr7++;
        }
    }

    // MSA path, stage 2: 4 output channels per iteration for what stage 1 left over
    nn_outch = (outch - remain_outch_start) >> 2;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = remain_outch_start + pp * 4;

        float* outptr0 = top_blob.channel(p);
        float* outptr1 = top_blob.channel(p + 1);
        float* outptr2 = top_blob.channel(p + 2);
        float* outptr3 = top_blob.channel(p + 3);

        const float zeros[4] = {0.f, 0.f, 0.f, 0.f};
        const float* biasptr = bias ? bias + p : zeros;

        int i = 0;
        // 4 positions x 4 output channels per iteration
        for (; i + 3 < size; i += 4)
        {
            const float* tmpptr = tmp.channel(i / 4);
            // kernel group index: 8-wide groups first, then the 4-wide group
            const float* kptr = kernel.channel(p / 8 + (p % 8) / 4);

            int nn = inch * maxk; // inch always > 0

            v4f32 _sum0 = __msa_fill_w_f32(biasptr[0]);
            v4f32 _sum1 = __msa_fill_w_f32(biasptr[1]);
            v4f32 _sum2 = __msa_fill_w_f32(biasptr[2]);
            v4f32 _sum3 = __msa_fill_w_f32(biasptr[3]);

            for (int q = 0; q < nn; q++)
            {
                __builtin_prefetch(tmpptr + 16);
                __builtin_prefetch(kptr + 16);
                v4f32 _val = (v4f32)__msa_ld_w(tmpptr, 0);
                v4i32 _w0123 = __msa_ld_w(kptr, 0);
                _sum0 = __msa_fmadd_w(_sum0, _val, (v4f32)__msa_splati_w(_w0123, 0));
                _sum1 = __msa_fmadd_w(_sum1, _val, (v4f32)__msa_splati_w(_w0123, 1));
                _sum2 = __msa_fmadd_w(_sum2, _val, (v4f32)__msa_splati_w(_w0123, 2));
                _sum3 = __msa_fmadd_w(_sum3, _val, (v4f32)__msa_splati_w(_w0123, 3));

                tmpptr += 4;
                kptr += 4;
            }

            __msa_st_w((v4i32)_sum0, outptr0, 0);
            __msa_st_w((v4i32)_sum1, outptr1, 0);
            __msa_st_w((v4i32)_sum2, outptr2, 0);
            __msa_st_w((v4i32)_sum3, outptr3, 0);

            outptr0 += 4;
            outptr1 += 4;
            outptr2 += 4;
            outptr3 += 4;
        }
        // leftover positions: scalar, 1 position x 4 output channels
        for (; i < size; i++)
        {
            const float* tmpptr = tmp.channel(i / 4 + i % 4);
            const float* kptr = kernel.channel(p / 8 + (p % 8) / 4);

            int nn = inch * maxk; // inch always > 0

            float sum0 = biasptr[0];
            float sum1 = biasptr[1];
            float sum2 = biasptr[2];
            float sum3 = biasptr[3];

            for (int q = 0; q < nn; q++)
            {
                sum0 += tmpptr[0] * kptr[0];
                sum1 += tmpptr[0] * kptr[1];
                sum2 += tmpptr[0] * kptr[2];
                sum3 += tmpptr[0] * kptr[3];
                tmpptr++;
                kptr += 4;
            }

            outptr0[0] = sum0;
            outptr1[0] = sum1;
            outptr2[0] = sum2;
            outptr3[0] = sum3;

            outptr0++;
            outptr1++;
            outptr2++;
            outptr3++;
        }
    }

    remain_outch_start += nn_outch << 2;
#else // __mips_msa
    // scalar path: 2 output channels per iteration
    int nn_outch = outch >> 1;
    int remain_outch_start = nn_outch << 1;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 2;

        float* outptr0 = top_blob.channel(p);
        float* outptr1 = top_blob.channel(p + 1);

        const float zeros[2] = {0.f, 0.f};
        const float* biasptr = bias ? bias + p : zeros;

        int i = 0;
        // 4 positions x 2 output channels per iteration
        for (; i + 3 < size; i += 4)
        {
            const float* tmpptr = tmp.channel(i / 4);
            const float* kptr = kernel.channel(p / 2);

            int nn = inch * maxk; // inch always > 0

            // sumXY: output channel X, position Y within the group of 4
            float sum00 = biasptr[0];
            float sum01 = biasptr[0];
            float sum02 = biasptr[0];
            float sum03 = biasptr[0];
            float sum10 = biasptr[1];
            float sum11 = biasptr[1];
            float sum12 = biasptr[1];
            float sum13 = biasptr[1];

            for (int q = 0; q < nn; q++)
            {
                __builtin_prefetch(tmpptr + 16);
                __builtin_prefetch(kptr + 8);
                float k0 = kptr[0];
                float k1 = kptr[1];
                sum00 += tmpptr[0] * k0;
                sum01 += tmpptr[1] * k0;
                sum02 += tmpptr[2] * k0;
                sum03 += tmpptr[3] * k0;
                sum10 += tmpptr[0] * k1;
                sum11 += tmpptr[1] * k1;
                sum12 += tmpptr[2] * k1;
                sum13 += tmpptr[3] * k1;
                tmpptr += 4;
                kptr += 2;
            }

            outptr0[0] = sum00;
            outptr0[1] = sum01;
            outptr0[2] = sum02;
            outptr0[3] = sum03;
            outptr1[0] = sum10;
            outptr1[1] = sum11;
            outptr1[2] = sum12;
            outptr1[3] = sum13;

            outptr0 += 4;
            outptr1 += 4;
        }
        // leftover positions: 1 position x 2 output channels
        for (; i < size; i++)
        {
            const float* tmpptr = tmp.channel(i / 4 + i % 4);
            const float* kptr = kernel.channel(p / 2);

            int nn = inch * maxk; // inch always > 0

            float sum0 = biasptr[0];
            float sum1 = biasptr[1];

            for (int q = 0; q < nn; q++)
            {
                __builtin_prefetch(tmpptr + 4);
                __builtin_prefetch(kptr + 8);
                sum0 += tmpptr[0] * kptr[0];
                sum1 += tmpptr[0] * kptr[1];
                tmpptr++;
                kptr += 2;
            }

            outptr0[0] = sum0;
            outptr1[0] = sum1;

            outptr0++;
            outptr1++;
        }
    }
#endif // __mips_msa

    // final stage (both paths): remaining output channels, one at a time
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        float* outptr0 = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        int i = 0;
        // 4 positions x 1 output channel per iteration
        for (; i + 3 < size; i += 4)
        {
            const float* tmpptr = tmp.channel(i / 4);
            // kernel group index: all wider groups first, then single channels
#if __mips_msa
            const float* kptr = kernel.channel(p / 8 + (p % 8) / 4 + p % 4);
#else
            const float* kptr = kernel.channel(p / 2 + p % 2);
#endif

            int nn = inch * maxk; // inch always > 0

#if __mips_msa
            v4f32 _sum0 = __msa_fill_w_f32(bias0);

            for (int q = 0; q < nn; q++)
            {
                // broadcast one weight and accumulate against 4 positions
                _sum0 = __msa_fmadd_w(_sum0, __msa_fill_w_f32(kptr[0]), (v4f32)__msa_ld_w(tmpptr, 0));
                tmpptr += 4;
                kptr++;
            }

            __msa_st_w((v4i32)_sum0, outptr0, 0);

            outptr0 += 4;
#else
            float sum0 = bias0;
            float sum1 = bias0;
            float sum2 = bias0;
            float sum3 = bias0;

            for (int q = 0; q < nn; q++)
            {
                __builtin_prefetch(tmpptr + 16);
                __builtin_prefetch(kptr + 4);
                sum0 += tmpptr[0] * kptr[0];
                sum1 += tmpptr[1] * kptr[0];
                sum2 += tmpptr[2] * kptr[0];
                sum3 += tmpptr[3] * kptr[0];
                tmpptr += 4;
                kptr++;
            }

            outptr0[0] = sum0;
            outptr0[1] = sum1;
            outptr0[2] = sum2;
            outptr0[3] = sum3;

            outptr0 += 4;
#endif // __mips_msa
        }
        // leftover positions: plain scalar dot product
        for (; i < size; i++)
        {
            const float* tmpptr = tmp.channel(i / 4 + i % 4);
#if __mips_msa
            const float* kptr = kernel.channel(p / 8 + (p % 8) / 4 + p % 4);
#else
            const float* kptr = kernel.channel(p / 2 + p % 2);
#endif

            int nn = inch * maxk; // inch always > 0

            float sum0 = bias0;

            for (int q = 0; q < nn; q++)
            {
                sum0 += tmpptr[0] * kptr[0];
                tmpptr++;
                kptr++;
            }

            outptr0[0] = sum0;

            outptr0++;
        }
    }
}

static void convolution_im2col_sgemm_transform_kernel_msa(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
{
    const int maxk = kernel_w * kernel_h;

    // interleave
    // src = maxk-inch-outch
    // dst = 8b-maxk-inch-outch/8b
    Mat kernel = _kernel.reshape(maxk, inch, outch);
#if __mips_msa
    kernel_tm.create(8 * maxk, inch, outch / 8 + (outch % 8) / 4 + outch % 4);
#else
    kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2);
#endif

    int q = 0;
#if __mips_msa
    for (; q + 7 < outch; q += 8)
    {
        const Mat k0 = kernel.channel(q);
        const Mat k1 = kernel.channel(q + 1);
        const Mat k2 = kernel.channel(q + 2);
        const Mat k3 = kernel.channel(q + 3);
        const Mat k4 = kernel.channel(q + 4);
        const Mat k5 = kernel.channel(q + 5);
        const Mat k6 = kernel.channel(q + 6);
        const Mat k7 = kernel.channel(q + 7);

        float* g00 = kernel_tm.channel(q / 8);

        for (int p = 0; p < inch; p++)
        {
            const float* k00 = k0.row(p);
            const float* k10 = k1.row(p);
            const float* k20 = k2.row(p);
            const float* k30 = k3.row(p);
            const float* k40 = k4.row(p);
            const float* k50 = k5.row(p);
            const float* k60 = k6.row(p);
            const float* k70 = k7.row(p);

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = k00[k];
                g00[1] = k10[k];
                g00[2] = k20[k];
                g00[3] = k30[k];
                g00[4] = k40[k];
                g00[5] = k50[k];
                g00[6] = k60[k];
                g00[7] = k70[k];

                g00 += 8;
            }
        }
    }
    for (; q + 3 < outch; q += 4)
    {
        const Mat k0 = kernel.channel(q);
        const Mat k1 = kernel.channel(q + 1);
        const Mat k2 = kernel.channel(q + 2);
        const Mat k3 = kernel.channel(q + 3);

        float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4);

        for (int p = 0; p < inch; p++)
        {
            const float* k00 = k0.row(p);
            const float* k10 = k1.row(p);
            const float* k20 = k2.row(p);
            const float* k30 = k3.row(p);

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = k00[k];
                g00[1] = k10[k];
                g00[2] = k20[k];
                g00[3] = k30[k];

                g00 += 4;
            }
        }
    }
#else
    for (; q + 1 < outch; q += 2)
    {
        const Mat k0 = kernel.channel(q);
        const Mat k1 = kernel.channel(q + 1);

        float* g00 = kernel_tm.channel(q / 2);

        for (int p = 0; p < inch; p++)
        {
            const float* k00 = k0.row(p);
            const float* k10 = k1.row(p);

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = k00[k];
                g00[1] = k10[k];

                g00 += 2;
            }
        }
    }
#endif // __mips_msa
    for (; q < outch; q++)
    {
        const Mat k0 = kernel.channel(q);

#if __mips_msa
        float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + q % 4);
#else
        float* g00 = kernel_tm.channel(q / 2 + q % 2);
#endif

        for (int p = 0; p < inch; p++)
        {
            const float* k00 = k0.row(p);

            for (int k = 0; k < maxk; k++)
            {
                g00[0] = k00[k];

                g00 += 1;
            }
        }
    }
}

static void convolution_im2col_sgemm_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    const int size = outw * outh;

    const int maxk = kernel_w * kernel_h;

    // im2col
    Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator);
    {
        const int gap = w * stride_h - outw * stride_w;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p = 0; p < inch; p++)
        {
            const Mat img = bottom_blob.channel(p);
            float* ptr = bottom_im2col.channel(p);

            for (int u = 0; u < kernel_h; u++)
            {
                for (int v = 0; v < kernel_w; v++)
                {
                    const float* sptr = img.row<const float>(dilation_h * u) + dilation_w * v;

                    for (int i = 0; i < outh; i++)
                    {
                        int j = 0;
                        for (; j < outw; j++)
                        {
                            ptr[0] = sptr[0];

                            sptr += stride_w;
                            ptr += 1;
                        }

                        sptr += gap;
                    }
                }
            }
        }
    }

    im2col_sgemm_msa(bottom_im2col, top_blob, kernel, _bias, opt);
}


================================================
FILE: src/layer/mips/convolution_sgemm_int8.h
================================================
[File too large to display: 41.4 KB]

================================================
FILE: src/layer/mips/convolution_sgemm_pack1to4_int8.h
================================================
[File too large to display: 16.7 KB]

================================================
FILE: src/layer/mips/convolution_sgemm_pack4.h
================================================
[File too large to display: 21.5 KB]

================================================
FILE: src/layer/mips/convolution_sgemm_pack4to1.h
================================================
[File too large to display: 25.8 KB]

================================================
FILE: src/layer/mips/convolution_sgemm_pack8to1_int8.h
================================================
[File too large to display: 15.1 KB]

================================================
FILE: src/layer/mips/convolution_sgemm_pack8to4_int8.h
================================================
[File too large to display: 11.2 KB]

================================================
FILE: src/layer/mips/convolution_winograd_dot.h
================================================
[File too large to display: 15.0 KB]

================================================
FILE: src/layer/mips/convolution_winograd_dot_int8.h
================================================
[File too large to display: 32.4 KB]

================================================
FILE: src/layer/mips/convolution_winograd_dot_pack4.h
================================================
[File too large to display: 19.3 KB]

================================================
FILE: src/layer/mips/convolution_winograd_dot_pack8to1_int8.h
================================================
[File too large to display: 13.2 KB]

================================================
FILE: src/layer/mips/convolution_winograd_dot_pack8to4_int8.h
================================================
[File too large to display: 8.7 KB]

================================================
FILE: src/layer/mips/convolution_winograd_transform.h
================================================
[File too large to display: 12.9 KB]

================================================
FILE: src/layer/mips/convolution_winograd_transform_int8.h
================================================
[File too large to display: 7.9 KB]

================================================
FILE: src/layer/mips/convolution_winograd_transform_pack4.h
================================================
[File too large to display: 31.3 KB]

================================================
FILE: src/layer/mips/convolution_winograd_transform_pack4_int8.h
================================================
[File too large to display: 6.9 KB]

================================================
FILE: src/layer/mips/convolution_winograd_transform_pack8_int8.h
================================================
[File too large to display: 5.5 KB]

================================================
FILE: src/layer/mips/convolutiondepthwise_3x3.h
================================================
[File too large to display: 4.5 KB]

================================================
FILE: src/layer/mips/convolutiondepthwise_3x3_pack4.h
================================================
[File too large to display: 17.0 KB]

================================================
FILE: src/layer/mips/convolutiondepthwise_5x5_pack4.h
================================================
[File too large to display: 19.9 KB]

================================================
FILE: src/layer/mips/convolutiondepthwise_mips.cpp
================================================
[File too large to display: 30.9 KB]

================================================
FILE: src/layer/mips/convolutiondepthwise_mips.h
================================================
[File too large to display: 1015 B]

================================================
FILE: src/layer/mips/crop_mips.cpp
================================================
[File too large to display: 12.5 KB]

================================================
FILE: src/layer/mips/crop_mips.h
================================================
[File too large to display: 476 B]

================================================
FILE: src/layer/mips/deconvolution_mips.cpp
================================================
[File too large to display: 10.9 KB]

================================================
FILE: src/layer/mips/deconvolution_mips.h
================================================
[File too large to display: 678 B]

================================================
FILE: src/layer/mips/deconvolution_pack1to4.h
================================================
[File too large to display: 2.9 KB]

================================================
FILE: src/layer/mips/deconvolution_pack4.h
================================================
[File too large to display: 3.5 KB]

================================================
FILE: src/layer/mips/deconvolution_pack4to1.h
================================================
[File too large to display: 2.9 KB]

================================================
FILE: src/layer/mips/deconvolutiondepthwise_mips.cpp
================================================
[File too large to display: 15.3 KB]

================================================
FILE: src/layer/mips/deconvolutiondepthwise_mips.h
================================================
[File too large to display: 840 B]

================================================
FILE: src/layer/mips/dequantize_mips.cpp
================================================
[File too large to display: 5.8 KB]

================================================
FILE: src/layer/mips/dequantize_mips.h
================================================
[File too large to display: 400 B]

================================================
FILE: src/layer/mips/dropout_mips.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/mips/dropout_mips.h
================================================
[File too large to display: 370 B]

================================================
FILE: src/layer/mips/eltwise_mips.cpp
================================================
[File too large to display: 9.5 KB]

================================================
FILE: src/layer/mips/eltwise_mips.h
================================================
[File too large to display: 407 B]

================================================
FILE: src/layer/mips/elu_mips.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/mips/elu_mips.h
================================================
[File too large to display: 342 B]

================================================
FILE: src/layer/mips/erf_mips.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: src/layer/mips/erf_mips.h
================================================
[File too large to display: 352 B]

================================================
FILE: src/layer/mips/flatten_mips.cpp
================================================
[File too large to display: 11.5 KB]

================================================
FILE: src/layer/mips/flatten_mips.h
================================================
[File too large to display: 477 B]

================================================
FILE: src/layer/mips/gelu_mips.cpp
================================================
[File too large to display: 2.7 KB]

================================================
FILE: src/layer/mips/gelu_mips.h
================================================
[File too large to display: 349 B]

================================================
FILE: src/layer/mips/hardsigmoid_mips.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: src/layer/mips/hardsigmoid_mips.h
================================================
[File too large to display: 398 B]

================================================
FILE: src/layer/mips/hardswish_mips.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/mips/hardswish_mips.h
================================================
[File too large to display: 384 B]

================================================
FILE: src/layer/mips/innerproduct_mips.cpp
================================================
[File too large to display: 57.6 KB]

================================================
FILE: src/layer/mips/innerproduct_mips.h
================================================
[File too large to display: 956 B]

================================================
FILE: src/layer/mips/interp_bicubic.h
================================================
[File too large to display: 7.4 KB]

================================================
FILE: src/layer/mips/interp_bicubic_pack4.h
================================================
[File too large to display: 10.7 KB]

================================================
FILE: src/layer/mips/interp_bilinear.h
================================================
[File too large to display: 3.7 KB]

================================================
FILE: src/layer/mips/interp_bilinear_pack4.h
================================================
[File too large to display: 3.3 KB]

================================================
FILE: src/layer/mips/interp_mips.cpp
================================================
[File too large to display: 14.3 KB]

================================================
FILE: src/layer/mips/interp_mips.h
================================================
[File too large to display: 400 B]

================================================
FILE: src/layer/mips/loongson_mmi.h
================================================
[File too large to display: 8.9 KB]

================================================
FILE: src/layer/mips/mips_activation.h
================================================
[File too large to display: 2.1 KB]

================================================
FILE: src/layer/mips/mips_usability.h
================================================
[File too large to display: 7.7 KB]

================================================
FILE: src/layer/mips/mish_mips.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/mips/mish_mips.h
================================================
[File too large to display: 349 B]

================================================
FILE: src/layer/mips/msa_mathfun.h
================================================
[File too large to display: 13.7 KB]

================================================
FILE: src/layer/mips/packing_mips.cpp
================================================
[File too large to display: 18.8 KB]

================================================
FILE: src/layer/mips/packing_mips.h
================================================
[File too large to display: 477 B]

================================================
FILE: src/layer/mips/padding_mips.cpp
================================================
[File too large to display: 12.4 KB]

================================================
FILE: src/layer/mips/padding_mips.h
================================================
[File too large to display: 477 B]

================================================
FILE: src/layer/mips/padding_pack4.h
================================================
[File too large to display: 5.3 KB]

================================================
FILE: src/layer/mips/padding_pack8_int8.h
================================================
[File too large to display: 3.5 KB]

================================================
FILE: src/layer/mips/pooling_mips.cpp
================================================
[File too large to display: 8.7 KB]

================================================
FILE: src/layer/mips/pooling_mips.h
================================================
[File too large to display: 431 B]

================================================
FILE: src/layer/mips/prelu_mips.cpp
================================================
[File too large to display: 5.2 KB]

================================================
FILE: src/layer/mips/prelu_mips.h
================================================
[File too large to display: 356 B]

================================================
FILE: src/layer/mips/quantize_mips.cpp
================================================
[File too large to display: 9.3 KB]

================================================
FILE: src/layer/mips/quantize_mips.h
================================================
[File too large to display: 386 B]

================================================
FILE: src/layer/mips/relu_mips.cpp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: src/layer/mips/relu_mips.h
================================================
[File too large to display: 349 B]

================================================
FILE: src/layer/mips/requantize_mips.cpp
================================================
[File too large to display: 17.6 KB]

================================================
FILE: src/layer/mips/requantize_mips.h
================================================
[File too large to display: 400 B]

================================================
FILE: src/layer/mips/selu_mips.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: src/layer/mips/selu_mips.h
================================================
[File too large to display: 349 B]

================================================
FILE: src/layer/mips/sigmoid_mips.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/mips/sigmoid_mips.h
================================================
[File too large to display: 387 B]

================================================
FILE: src/layer/mips/slice_mips.cpp
================================================
[File too large to display: 15.7 KB]

================================================
FILE: src/layer/mips/slice_mips.h
================================================
[File too large to display: 393 B]

================================================
FILE: src/layer/mips/softmax_mips.cpp
================================================
[File too large to display: 3.5 KB]

================================================
FILE: src/layer/mips/softmax_mips.h
================================================
[File too large to display: 366 B]

================================================
FILE: src/layer/mips/swish_mips.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/mips/swish_mips.h
================================================
[File too large to display: 356 B]

================================================
FILE: src/layer/mips/tanh_mips.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: src/layer/mips/tanh_mips.h
================================================
[File too large to display: 366 B]

================================================
FILE: src/layer/mips/unaryop_mips.cpp
================================================
[File too large to display: 10.8 KB]

================================================
FILE: src/layer/mips/unaryop_mips.h
================================================
[File too large to display: 370 B]

================================================
FILE: src/layer/mish.cpp
================================================
[File too large to display: 965 B]

================================================
FILE: src/layer/mish.h
================================================
[File too large to display: 326 B]

================================================
FILE: src/layer/multiheadattention.cpp
================================================
[File too large to display: 27.6 KB]

================================================
FILE: src/layer/multiheadattention.h
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/mvn.cpp
================================================
[File too large to display: 4.0 KB]

================================================
FILE: src/layer/mvn.h
================================================
[File too large to display: 457 B]

================================================
FILE: src/layer/noop.cpp
================================================
[File too large to display: 445 B]

================================================
FILE: src/layer/noop.h
================================================
[File too large to display: 340 B]

================================================
FILE: src/layer/normalize.cpp
================================================
[File too large to display: 6.6 KB]

================================================
FILE: src/layer/normalize.h
================================================
[File too large to display: 753 B]

================================================
FILE: src/layer/packing.cpp
================================================
[File too large to display: 6.5 KB]

================================================
FILE: src/layer/packing.h
================================================
[File too large to display: 600 B]

================================================
FILE: src/layer/padding.cpp
================================================
[File too large to display: 12.2 KB]

================================================
FILE: src/layer/padding.h
================================================
[File too large to display: 709 B]

================================================
FILE: src/layer/permute.cpp
================================================
[File too large to display: 22.8 KB]

================================================
FILE: src/layer/permute.h
================================================
[File too large to display: 429 B]

================================================
FILE: src/layer/pixelshuffle.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: src/layer/pixelshuffle.h
================================================
[File too large to display: 472 B]

================================================
FILE: src/layer/pooling.cpp
================================================
[File too large to display: 12.3 KB]

================================================
FILE: src/layer/pooling.h
================================================
[File too large to display: 967 B]

================================================
FILE: src/layer/pooling1d.cpp
================================================
[File too large to display: 8.6 KB]

================================================
FILE: src/layer/pooling1d.h
================================================
[File too large to display: 889 B]

================================================
FILE: src/layer/pooling3d.cpp
================================================
[File too large to display: 15.7 KB]

================================================
FILE: src/layer/pooling3d.h
================================================
[File too large to display: 1.0 KB]

================================================
FILE: src/layer/power.cpp
================================================
[File too large to display: 842 B]

================================================
FILE: src/layer/power.h
================================================
[File too large to display: 441 B]

================================================
FILE: src/layer/prelu.cpp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: src/layer/prelu.h
================================================
[File too large to display: 478 B]

================================================
FILE: src/layer/priorbox.cpp
================================================
[File too large to display: 7.1 KB]

================================================
FILE: src/layer/priorbox.h
================================================
[File too large to display: 717 B]

================================================
FILE: src/layer/proposal.cpp
================================================
[File too large to display: 8.9 KB]

================================================
FILE: src/layer/proposal.h
================================================
[File too large to display: 632 B]

================================================
FILE: src/layer/psroipooling.cpp
================================================
[File too large to display: 3.1 KB]

================================================
FILE: src/layer/psroipooling.h
================================================
[File too large to display: 552 B]

================================================
FILE: src/layer/quantize.cpp
================================================
[File too large to display: 2.5 KB]

================================================
FILE: src/layer/quantize.h
================================================
[File too large to display: 508 B]

================================================
FILE: src/layer/reduction.cpp
================================================
[File too large to display: 25.5 KB]

================================================
FILE: src/layer/reduction.h
================================================
[File too large to display: 1.1 KB]

================================================
FILE: src/layer/relu.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/relu.h
================================================
[File too large to display: 402 B]

================================================
FILE: src/layer/reorg.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/reorg.h
================================================
[File too large to display: 429 B]

================================================
FILE: src/layer/requantize.cpp
================================================
[File too large to display: 4.0 KB]

================================================
FILE: src/layer/requantize.h
================================================
[File too large to display: 988 B]

================================================
FILE: src/layer/reshape.cpp
================================================
[File too large to display: 5.0 KB]

================================================
FILE: src/layer/reshape.h
================================================
[File too large to display: 893 B]

================================================
FILE: src/layer/riscv/absval_riscv.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: src/layer/riscv/absval_riscv.h
================================================
[File too large to display: 513 B]

================================================
FILE: src/layer/riscv/absval_riscv_zfh.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/riscv/batchnorm_riscv.cpp
================================================
[File too large to display: 6.1 KB]

================================================
FILE: src/layer/riscv/batchnorm_riscv.h
================================================
[File too large to display: 600 B]

================================================
FILE: src/layer/riscv/batchnorm_riscv_zfh.cpp
================================================
[File too large to display: 11.4 KB]

================================================
FILE: src/layer/riscv/bias_riscv.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: src/layer/riscv/bias_riscv.h
================================================
[File too large to display: 483 B]

================================================
FILE: src/layer/riscv/bias_riscv_zfh.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/riscv/binaryop_riscv.cpp
================================================
[File too large to display: 24.7 KB]

================================================
FILE: src/layer/riscv/binaryop_riscv.h
================================================
[File too large to display: 760 B]

================================================
FILE: src/layer/riscv/binaryop_riscv_zfh.cpp
================================================
[File too large to display: 25.1 KB]

================================================
FILE: src/layer/riscv/bnll_riscv.cpp
================================================
[File too large to display: 2.4 KB]

================================================
FILE: src/layer/riscv/bnll_riscv.h
================================================
[File too large to display: 483 B]

================================================
FILE: src/layer/riscv/bnll_riscv_zfh.cpp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: src/layer/riscv/cast_riscv.cpp
================================================
[File too large to display: 2.7 KB]

================================================
FILE: src/layer/riscv/cast_riscv.h
================================================
[File too large to display: 568 B]

================================================
FILE: src/layer/riscv/cast_riscv_zfh.cpp
================================================
[File too large to display: 2.6 KB]

================================================
FILE: src/layer/riscv/celu_riscv.cpp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: src/layer/riscv/celu_riscv.h
================================================
[File too large to display: 483 B]

================================================
FILE: src/layer/riscv/celu_riscv_zfh.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/riscv/clip_riscv.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/riscv/clip_riscv.h
================================================
[File too large to display: 464 B]

================================================
FILE: src/layer/riscv/clip_riscv_zfh.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/riscv/concat_riscv.cpp
================================================
[File too large to display: 25.6 KB]

================================================
FILE: src/layer/riscv/concat_riscv.h
================================================
[File too large to display: 538 B]

================================================
FILE: src/layer/riscv/convolution1d_riscv.cpp
================================================
[File too large to display: 12.0 KB]

================================================
FILE: src/layer/riscv/convolution1d_riscv.h
================================================
[File too large to display: 1020 B]

================================================
FILE: src/layer/riscv/convolution1d_riscv_zfh.cpp
================================================
[File too large to display: 16.4 KB]

================================================
FILE: src/layer/riscv/convolution_1x1.h
================================================
[File too large to display: 440 B]

================================================
FILE: src/layer/riscv/convolution_1x1_fp16s.h
================================================
[File too large to display: 454 B]

================================================
FILE: src/layer/riscv/convolution_1x1_pack1ton.h
================================================
[File too large to display: 458 B]

================================================
FILE: src/layer/riscv/convolution_1x1_pack1ton_fp16s.h
================================================
[File too large to display: 472 B]

================================================
FILE: src/layer/riscv/convolution_1x1_packn.h
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/riscv/convolution_1x1_packn_fp16s.h
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/riscv/convolution_1x1_packnto1.h
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/riscv/convolution_1x1_packnto1_fp16s.h
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/riscv/convolution_3x3.h
================================================
[File too large to display: 11.0 KB]

================================================
FILE: src/layer/riscv/convolution_3x3_pack1ton.h
================================================
[File too large to display: 30.4 KB]

================================================
FILE: src/layer/riscv/convolution_3x3_pack1ton_fp16s.h
================================================
[File too large to display: 30.5 KB]

================================================
FILE: src/layer/riscv/convolution_3x3_packn.h
================================================
[File too large to display: 12.9 KB]

================================================
FILE: src/layer/riscv/convolution_3x3_packn_fp16s.h
================================================
[File too large to display: 13.0 KB]

================================================
FILE: src/layer/riscv/convolution_7x7_pack1ton.h
================================================
[File too large to display: 65.4 KB]

================================================
FILE: src/layer/riscv/convolution_7x7_pack1ton_fp16s.h
================================================
[File too large to display: 65.6 KB]

================================================
FILE: src/layer/riscv/convolution_fp16s.h
================================================
[File too large to display: 2.2 KB]

================================================
FILE: src/layer/riscv/convolution_pack1ton.h
================================================
[File too large to display: 2.5 KB]

================================================
FILE: src/layer/riscv/convolution_pack1ton_fp16s.h
================================================
[File too large to display: 5.0 KB]

================================================
FILE: src/layer/riscv/convolution_packn.h
================================================
[File too large to display: 2.7 KB]

================================================
FILE: src/layer/riscv/convolution_packn_fp16s.h
================================================
[File too large to display: 5.5 KB]

================================================
FILE: src/layer/riscv/convolution_packnto1.h
================================================
[File too large to display: 2.6 KB]

================================================
FILE: src/layer/riscv/convolution_packnto1_fp16s.h
================================================
[File too large to display: 5.5 KB]

================================================
FILE: src/layer/riscv/convolution_riscv.cpp
================================================
[File too large to display: 23.6 KB]

================================================
FILE: src/layer/riscv/convolution_riscv.h
================================================
[File too large to display: 1.1 KB]

================================================
FILE: src/layer/riscv/convolution_riscv_zfh.cpp
================================================
[File too large to display: 17.5 KB]

================================================
FILE: src/layer/riscv/convolution_sgemm.h
================================================
[File too large to display: 19.9 KB]

================================================
FILE: src/layer/riscv/convolution_sgemm_fp16s.h
================================================
[File too large to display: 16.5 KB]

================================================
FILE: src/layer/riscv/convolution_sgemm_pack1ton.h
================================================
[File too large to display: 3.5 KB]

================================================
FILE: src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h
================================================
[File too large to display: 3.6 KB]

================================================
FILE: src/layer/riscv/convolution_sgemm_packn.h
================================================
[File too large to display: 14.9 KB]

================================================
FILE: src/layer/riscv/convolution_sgemm_packn_fp16s.h
================================================
[File too large to display: 18.5 KB]

================================================
FILE: src/layer/riscv/convolution_sgemm_packnto1.h
================================================
[File too large to display: 27.0 KB]

================================================
FILE: src/layer/riscv/convolution_sgemm_packnto1_fp16s.h
================================================
[File too large to display: 27.3 KB]

================================================
FILE: src/layer/riscv/convolution_winograd_dot.h
================================================
[File too large to display: 15.8 KB]

================================================
FILE: src/layer/riscv/convolution_winograd_dot_packn.h
================================================
[File too large to display: 11.7 KB]

================================================
FILE: src/layer/riscv/convolution_winograd_dot_packn_fp16s.h
================================================
[File too large to display: 11.9 KB]

================================================
FILE: src/layer/riscv/convolution_winograd_transform.h
================================================
[File too large to display: 12.9 KB]

================================================
FILE: src/layer/riscv/convolution_winograd_transform_packn.h
================================================
[File too large to display: 36.5 KB]

================================================
FILE: src/layer/riscv/convolution_winograd_transform_packn_fp16s.h
================================================
[File too large to display: 37.2 KB]

================================================
FILE: src/layer/riscv/convolutiondepthwise_3x3.h
================================================
[File too large to display: 4.5 KB]

================================================
FILE: src/layer/riscv/convolutiondepthwise_3x3_packn.h
================================================
[File too large to display: 19.5 KB]

================================================
FILE: src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h
================================================
[File too large to display: 19.6 KB]

================================================
FILE: src/layer/riscv/convolutiondepthwise_5x5_packn.h
================================================
[File too large to display: 23.2 KB]

================================================
FILE: src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h
================================================
[File too large to display: 23.4 KB]

================================================
FILE: src/layer/riscv/convolutiondepthwise_riscv.cpp
================================================
[File too large to display: 19.0 KB]

================================================
FILE: src/layer/riscv/convolutiondepthwise_riscv.h
================================================
[File too large to display: 1.1 KB]

================================================
FILE: src/layer/riscv/convolutiondepthwise_riscv_zfh.cpp
================================================
[File too large to display: 18.3 KB]

================================================
FILE: src/layer/riscv/crop_riscv.cpp
================================================
[File too large to display: 15.3 KB]

================================================
FILE: src/layer/riscv/crop_riscv.h
================================================
[File too large to display: 481 B]

================================================
FILE: src/layer/riscv/deconvolution_fp16s.h
================================================
[File too large to display: 2.7 KB]

================================================
FILE: src/layer/riscv/deconvolution_pack1ton.h
================================================
[File too large to display: 3.0 KB]

================================================
FILE: src/layer/riscv/deconvolution_pack1ton_fp16s.h
================================================
[File too large to display: 6.0 KB]

================================================
FILE: src/layer/riscv/deconvolution_packn.h
================================================
[File too large to display: 3.1 KB]

================================================
FILE: src/layer/riscv/deconvolution_packn_fp16s.h
================================================
[File too large to display: 6.3 KB]

================================================
FILE: src/layer/riscv/deconvolution_packnto1.h
================================================
[File too large to display: 3.1 KB]

================================================
FILE: src/layer/riscv/deconvolution_packnto1_fp16s.h
================================================
[File too large to display: 6.5 KB]

================================================
FILE: src/layer/riscv/deconvolution_riscv.cpp
================================================
[File too large to display: 12.4 KB]

================================================
FILE: src/layer/riscv/deconvolution_riscv.h
================================================
[File too large to display: 977 B]

================================================
FILE: src/layer/riscv/deconvolution_riscv_zfh.cpp
================================================
[File too large to display: 8.6 KB]

================================================
FILE: src/layer/riscv/deconvolutiondepthwise_riscv.cpp
================================================
[File too large to display: 16.9 KB]

================================================
FILE: src/layer/riscv/deconvolutiondepthwise_riscv.h
================================================
[File too large to display: 1.1 KB]

================================================
FILE: src/layer/riscv/deconvolutiondepthwise_riscv_zfh.cpp
================================================
[File too large to display: 19.0 KB]

================================================
FILE: src/layer/riscv/deformableconv2d_pack1ton.h
================================================
[File too large to display: 6.5 KB]

================================================
FILE: src/layer/riscv/deformableconv2d_packn.h
================================================
[File too large to display: 7.5 KB]

================================================
FILE: src/layer/riscv/deformableconv2d_packnto1.h
================================================
[File too large to display: 7.4 KB]

================================================
FILE: src/layer/riscv/deformableconv2d_riscv.cpp
================================================
[File too large to display: 22.0 KB]

================================================
FILE: src/layer/riscv/deformableconv2d_riscv.h
================================================
[File too large to display: 656 B]

================================================
FILE: src/layer/riscv/dropout_riscv.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/riscv/dropout_riscv.h
================================================
[File too large to display: 398 B]

================================================
FILE: src/layer/riscv/eltwise_riscv.cpp
================================================
[File too large to display: 10.6 KB]

================================================
FILE: src/layer/riscv/eltwise_riscv.h
================================================
[File too large to display: 580 B]

================================================
FILE: src/layer/riscv/eltwise_riscv_zfh.cpp
================================================
[File too large to display: 9.7 KB]

================================================
FILE: src/layer/riscv/flatten_riscv.cpp
================================================
[File too large to display: 12.7 KB]

================================================
FILE: src/layer/riscv/flatten_riscv.h
================================================
[File too large to display: 575 B]

================================================
FILE: src/layer/riscv/gelu_riscv.cpp
================================================
[File too large to display: 2.8 KB]

================================================
FILE: src/layer/riscv/gelu_riscv.h
================================================
[File too large to display: 377 B]

================================================
FILE: src/layer/riscv/gemm_bf16s_fp16s.h
================================================
[File too large to display: 17.0 KB]

================================================
FILE: src/layer/riscv/gemm_fp16s.h
================================================
[File too large to display: 53.8 KB]

================================================
FILE: src/layer/riscv/gemm_riscv.cpp
================================================
[File too large to display: 67.7 KB]

================================================
FILE: src/layer/riscv/gemm_riscv.h
================================================
[File too large to display: 713 B]

================================================
FILE: src/layer/riscv/gemm_riscv_zfh.cpp
================================================
[File too large to display: 22.2 KB]

================================================
FILE: src/layer/riscv/gru_riscv.cpp
================================================
[File too large to display: 12.2 KB]

================================================
FILE: src/layer/riscv/gru_riscv.h
================================================
[File too large to display: 1.1 KB]

================================================
FILE: src/layer/riscv/gru_riscv_zfh.cpp
================================================
[File too large to display: 24.1 KB]

================================================
FILE: src/layer/riscv/hardsigmoid_riscv.cpp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: src/layer/riscv/hardsigmoid_riscv.h
================================================
[File too large to display: 536 B]

================================================
FILE: src/layer/riscv/hardsigmoid_riscv_zfh.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: src/layer/riscv/hardswish_riscv.cpp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: src/layer/riscv/hardswish_riscv.h
================================================
[File too large to display: 522 B]

================================================
FILE: src/layer/riscv/hardswish_riscv_zfh.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: src/layer/riscv/innerproduct_riscv.cpp
================================================
[File too large to display: 14.4 KB]

================================================
FILE: src/layer/riscv/innerproduct_riscv.h
================================================
[File too large to display: 873 B]

================================================
FILE: src/layer/riscv/innerproduct_riscv_zfh.cpp
================================================
[File too large to display: 18.6 KB]

================================================
FILE: src/layer/riscv/instancenorm_riscv.cpp
================================================
[File too large to display: 6.8 KB]

================================================
FILE: src/layer/riscv/instancenorm_riscv.h
================================================
[File too large to display: 620 B]

================================================
FILE: src/layer/riscv/instancenorm_riscv_zfh.cpp
================================================
[File too large to display: 12.8 KB]

================================================
FILE: src/layer/riscv/interp_bicubic.h
================================================
[File too large to display: 7.4 KB]

================================================
FILE: src/layer/riscv/interp_bicubic_fp16s.h
================================================
[File too large to display: 13.6 KB]

================================================
FILE: src/layer/riscv/interp_bicubic_packn.h
================================================
[File too large to display: 10.5 KB]

================================================
FILE: src/layer/riscv/interp_bicubic_packn_fp16s.h
================================================
[File too large to display: 21.6 KB]

================================================
FILE: src/layer/riscv/interp_bilinear.h
================================================
[File too large to display: 6.0 KB]

================================================
FILE: src/layer/riscv/interp_bilinear_fp16s.h
================================================
[File too large to display: 6.3 KB]

================================================
FILE: src/layer/riscv/interp_bilinear_packn.h
================================================
[File too large to display: 3.3 KB]

================================================
FILE: src/layer/riscv/interp_bilinear_packn_fp16s.h
================================================
[File too large to display: 6.8 KB]

================================================
FILE: src/layer/riscv/interp_riscv.cpp
================================================
[File too large to display: 15.0 KB]

================================================
FILE: src/layer/riscv/interp_riscv.h
================================================
[File too large to display: 668 B]

================================================
FILE: src/layer/riscv/interp_riscv_zfh.cpp
================================================
[File too large to display: 25.0 KB]

================================================
FILE: src/layer/riscv/layernorm_riscv.cpp
================================================
[File too large to display: 15.0 KB]

================================================
FILE: src/layer/riscv/layernorm_riscv.h
================================================
[File too large to display: 499 B]

================================================
FILE: src/layer/riscv/layernorm_riscv_zfh.cpp
================================================
[File too large to display: 13.2 KB]

================================================
FILE: src/layer/riscv/mish_riscv.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: src/layer/riscv/mish_riscv.h
================================================
[File too large to display: 543 B]

================================================
FILE: src/layer/riscv/mish_riscv_zfh.cpp
================================================
[File too large to display: 2.5 KB]

================================================
FILE: src/layer/riscv/packing_riscv.cpp
================================================
[File too large to display: 62.9 KB]

================================================
FILE: src/layer/riscv/packing_riscv.h
================================================
[File too large to display: 575 B]

================================================
FILE: src/layer/riscv/padding_packn.h
================================================
[File too large to display: 29.6 KB]

================================================
FILE: src/layer/riscv/padding_riscv.cpp
================================================
[File too large to display: 21.9 KB]

================================================
FILE: src/layer/riscv/padding_riscv.h
================================================
[File too large to display: 868 B]

================================================
FILE: src/layer/riscv/pooling_riscv.cpp
================================================
[File too large to display: 9.6 KB]

================================================
FILE: src/layer/riscv/pooling_riscv.h
================================================
[File too large to display: 643 B]

================================================
FILE: src/layer/riscv/pooling_riscv_zfh.cpp
================================================
[File too large to display: 22.2 KB]

================================================
FILE: src/layer/riscv/prelu_riscv.cpp
================================================
[File too large to display: 7.0 KB]

================================================
FILE: src/layer/riscv/prelu_riscv.h
================================================
[File too large to display: 573 B]

================================================
FILE: src/layer/riscv/prelu_riscv_zfh.cpp
================================================
[File too large to display: 13.4 KB]

================================================
FILE: src/layer/riscv/relu_riscv.cpp
================================================
[File too large to display: 2.4 KB]

================================================
FILE: src/layer/riscv/relu_riscv.h
================================================
[File too large to display: 464 B]

================================================
FILE: src/layer/riscv/relu_riscv_zfh.cpp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: src/layer/riscv/riscv_activation.h
================================================
[File too large to display: 7.7 KB]

================================================
FILE: src/layer/riscv/riscv_usability.h
================================================
[File too large to display: 15.2 KB]

================================================
FILE: src/layer/riscv/rvv_mathfun.h
================================================
[File too large to display: 74.0 KB]

================================================
FILE: src/layer/riscv/rvv_mathfun_fp16s.h
================================================
[File too large to display: 46.5 KB]

================================================
FILE: src/layer/riscv/selu_riscv.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: src/layer/riscv/selu_riscv.h
================================================
[File too large to display: 358 B]

================================================
FILE: src/layer/riscv/shufflechannel_riscv.cpp
================================================
[File too large to display: 47.4 KB]

================================================
FILE: src/layer/riscv/shufflechannel_riscv.h
================================================
[File too large to display: 538 B]

================================================
FILE: src/layer/riscv/sigmoid_riscv.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: src/layer/riscv/sigmoid_riscv.h
================================================
[File too large to display: 564 B]

================================================
FILE: src/layer/riscv/sigmoid_riscv_zfh.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: src/layer/riscv/softmax_riscv.cpp
================================================
[File too large to display: 30.0 KB]

================================================
FILE: src/layer/riscv/softmax_riscv.h
================================================
[File too large to display: 398 B]

================================================
FILE: src/layer/riscv/swish_riscv.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: src/layer/riscv/swish_riscv.h
================================================
[File too large to display: 550 B]

================================================
FILE: src/layer/riscv/swish_riscv_zfh.cpp
================================================
[File too large to display: 2.5 KB]

================================================
FILE: src/layer/riscv/tanh_riscv.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/riscv/tanh_riscv.h
================================================
[File too large to display: 543 B]

================================================
FILE: src/layer/riscv/tanh_riscv_zfh.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: src/layer/riscv/unaryop_riscv.cpp
================================================
[File too large to display: 10.2 KB]

================================================
FILE: src/layer/riscv/unaryop_riscv.h
================================================
[File too large to display: 485 B]

================================================
FILE: src/layer/riscv/unaryop_riscv_zfh.cpp
================================================
[File too large to display: 13.4 KB]

================================================
FILE: src/layer/rmsnorm.cpp
================================================
[File too large to display: 2.6 KB]

================================================
FILE: src/layer/rmsnorm.h
================================================
[File too large to display: 522 B]

================================================
FILE: src/layer/rnn.cpp
================================================
[File too large to display: 12.7 KB]

================================================
FILE: src/layer/rnn.h
================================================
[File too large to display: 850 B]

================================================
FILE: src/layer/roialign.cpp
================================================
[File too large to display: 6.5 KB]

================================================
FILE: src/layer/roialign.h
================================================
[File too large to display: 571 B]

================================================
FILE: src/layer/roipooling.cpp
================================================
[File too large to display: 3.0 KB]

================================================
FILE: src/layer/roipooling.h
================================================
[File too large to display: 522 B]

================================================
FILE: src/layer/rotaryembed.cpp
================================================
[File too large to display: 2.6 KB]

================================================
FILE: src/layer/rotaryembed.h
================================================
[File too large to display: 478 B]

================================================
FILE: src/layer/scale.cpp
================================================
[File too large to display: 3.6 KB]

================================================
FILE: src/layer/scale.h
================================================
[File too large to display: 643 B]

================================================
FILE: src/layer/sdpa.cpp
================================================
[File too large to display: 15.3 KB]

================================================
FILE: src/layer/sdpa.h
================================================
[File too large to display: 649 B]

================================================
FILE: src/layer/selu.cpp
================================================
[File too large to display: 986 B]

================================================
FILE: src/layer/selu.h
================================================
[File too large to display: 420 B]

================================================
FILE: src/layer/shrink.cpp
================================================
[File too large to display: 918 B]

================================================
FILE: src/layer/shrink.h
================================================
[File too large to display: 427 B]

================================================
FILE: src/layer/shufflechannel.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/shufflechannel.h
================================================
[File too large to display: 476 B]

================================================
FILE: src/layer/sigmoid.cpp
================================================
[File too large to display: 850 B]

================================================
FILE: src/layer/sigmoid.h
================================================
[File too large to display: 341 B]

================================================
FILE: src/layer/slice.cpp
================================================
[File too large to display: 11.1 KB]

================================================
FILE: src/layer/slice.h
================================================
[File too large to display: 474 B]

================================================
FILE: src/layer/softmax.cpp
================================================
[File too large to display: 5.5 KB]

================================================
FILE: src/layer/softmax.h
================================================
[File too large to display: 414 B]

================================================
FILE: src/layer/softplus.cpp
================================================
[File too large to display: 759 B]

================================================
FILE: src/layer/softplus.h
================================================
[File too large to display: 346 B]

================================================
FILE: src/layer/spectrogram.cpp
================================================
[File too large to display: 5.6 KB]

================================================
FILE: src/layer/spectrogram.h
================================================
[File too large to display: 718 B]

================================================
FILE: src/layer/split.cpp
================================================
[File too large to display: 641 B]

================================================
FILE: src/layer/split.h
================================================
[File too large to display: 368 B]

================================================
FILE: src/layer/spp.cpp
================================================
[File too large to display: 4.2 KB]

================================================
FILE: src/layer/spp.h
================================================
[File too large to display: 537 B]

================================================
FILE: src/layer/squeeze.cpp
================================================
[File too large to display: 5.8 KB]

================================================
FILE: src/layer/squeeze.h
================================================
[File too large to display: 499 B]

================================================
FILE: src/layer/statisticspooling.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/statisticspooling.h
================================================
[File too large to display: 540 B]

================================================
FILE: src/layer/swish.cpp
================================================
[File too large to display: 743 B]

================================================
FILE: src/layer/swish.h
================================================
[File too large to display: 331 B]

================================================
FILE: src/layer/tanh.cpp
================================================
[File too large to display: 702 B]

================================================
FILE: src/layer/tanh.h
================================================
[File too large to display: 326 B]

================================================
FILE: src/layer/threshold.cpp
================================================
[File too large to display: 803 B]

================================================
FILE: src/layer/threshold.h
================================================
[File too large to display: 431 B]

================================================
FILE: src/layer/tile.cpp
================================================
[File too large to display: 4.8 KB]

================================================
FILE: src/layer/tile.h
================================================
[File too large to display: 440 B]

================================================
FILE: src/layer/unaryop.cpp
================================================
[File too large to display: 5.2 KB]

================================================
FILE: src/layer/unaryop.h
================================================
[File too large to display: 1.0 KB]

================================================
FILE: src/layer/unfold.cpp
================================================
[File too large to display: 4.2 KB]

================================================
FILE: src/layer/unfold.h
================================================
[File too large to display: 757 B]

================================================
FILE: src/layer/vulkan/absval_vulkan.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/absval_vulkan.h
================================================
[File too large to display: 572 B]

================================================
FILE: src/layer/vulkan/batchnorm_vulkan.cpp
================================================
[File too large to display: 3.3 KB]

================================================
FILE: src/layer/vulkan/batchnorm_vulkan.h
================================================
[File too large to display: 751 B]

================================================
FILE: src/layer/vulkan/binaryop_vulkan.cpp
================================================
[File too large to display: 19.0 KB]

================================================
FILE: src/layer/vulkan/binaryop_vulkan.h
================================================
[File too large to display: 967 B]

================================================
FILE: src/layer/vulkan/cast_vulkan.cpp
================================================
[File too large to display: 5.4 KB]

================================================
FILE: src/layer/vulkan/cast_vulkan.h
================================================
[File too large to display: 592 B]

================================================
FILE: src/layer/vulkan/celu_vulkan.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/celu_vulkan.h
================================================
[File too large to display: 554 B]

================================================
FILE: src/layer/vulkan/clip_vulkan.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/clip_vulkan.h
================================================
[File too large to display: 554 B]

================================================
FILE: src/layer/vulkan/concat_vulkan.cpp
================================================
[File too large to display: 27.7 KB]

================================================
FILE: src/layer/vulkan/concat_vulkan.h
================================================
[File too large to display: 689 B]

================================================
FILE: src/layer/vulkan/convolution1d_vulkan.cpp
================================================
[File too large to display: 7.8 KB]

================================================
FILE: src/layer/vulkan/convolution1d_vulkan.h
================================================
[File too large to display: 864 B]

================================================
FILE: src/layer/vulkan/convolution_vulkan.cpp
================================================
[File too large to display: 88.8 KB]

================================================
FILE: src/layer/vulkan/convolution_vulkan.h
================================================
[File too large to display: 1.8 KB]

================================================
FILE: src/layer/vulkan/convolutiondepthwise_vulkan.cpp
================================================
[File too large to display: 18.4 KB]

================================================
FILE: src/layer/vulkan/convolutiondepthwise_vulkan.h
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/crop_vulkan.cpp
================================================
[File too large to display: 17.3 KB]

================================================
FILE: src/layer/vulkan/crop_vulkan.h
================================================
[File too large to display: 806 B]

================================================
FILE: src/layer/vulkan/deconvolution_vulkan.cpp
================================================
[File too large to display: 29.5 KB]

================================================
FILE: src/layer/vulkan/deconvolution_vulkan.h
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/deconvolutiondepthwise_vulkan.cpp
================================================
[File too large to display: 21.4 KB]

================================================
FILE: src/layer/vulkan/deconvolutiondepthwise_vulkan.h
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/deepcopy_vulkan.cpp
================================================
[File too large to display: 2.9 KB]

================================================
FILE: src/layer/vulkan/deepcopy_vulkan.h
================================================
[File too large to display: 632 B]

================================================
FILE: src/layer/vulkan/dequantize_vulkan.cpp
================================================
[File too large to display: 5.1 KB]

================================================
FILE: src/layer/vulkan/dequantize_vulkan.h
================================================
[File too large to display: 771 B]

================================================
FILE: src/layer/vulkan/dropout_vulkan.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/dropout_vulkan.h
================================================
[File too large to display: 581 B]

================================================
FILE: src/layer/vulkan/eltwise_vulkan.cpp
================================================
[File too large to display: 2.7 KB]

================================================
FILE: src/layer/vulkan/eltwise_vulkan.h
================================================
[File too large to display: 615 B]

================================================
FILE: src/layer/vulkan/elu_vulkan.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/elu_vulkan.h
================================================
[File too large to display: 545 B]

================================================
FILE: src/layer/vulkan/erf_vulkan.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/erf_vulkan.h
================================================
[File too large to display: 545 B]

================================================
FILE: src/layer/vulkan/flatten_vulkan.cpp
================================================
[File too large to display: 4.5 KB]

================================================
FILE: src/layer/vulkan/flatten_vulkan.h
================================================
[File too large to display: 663 B]

================================================
FILE: src/layer/vulkan/gelu_vulkan.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/gelu_vulkan.h
================================================
[File too large to display: 554 B]

================================================
FILE: src/layer/vulkan/gemm_vulkan.cpp
================================================
[File too large to display: 30.6 KB]

================================================
FILE: src/layer/vulkan/gemm_vulkan.h
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/groupnorm_vulkan.cpp
================================================
[File too large to display: 23.1 KB]

================================================
FILE: src/layer/vulkan/groupnorm_vulkan.h
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/vulkan/hardsigmoid_vulkan.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: src/layer/vulkan/hardsigmoid_vulkan.h
================================================
[File too large to display: 617 B]

================================================
FILE: src/layer/vulkan/hardswish_vulkan.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/hardswish_vulkan.h
================================================
[File too large to display: 599 B]

================================================
FILE: src/layer/vulkan/innerproduct_vulkan.cpp
================================================
[File too large to display: 16.5 KB]

================================================
FILE: src/layer/vulkan/innerproduct_vulkan.h
================================================
[File too large to display: 940 B]

================================================
FILE: src/layer/vulkan/instancenorm_vulkan.cpp
================================================
[File too large to display: 19.6 KB]

================================================
FILE: src/layer/vulkan/instancenorm_vulkan.h
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/vulkan/interp_vulkan.cpp
================================================
[File too large to display: 16.1 KB]

================================================
FILE: src/layer/vulkan/interp_vulkan.h
================================================
[File too large to display: 930 B]

================================================
FILE: src/layer/vulkan/layernorm_vulkan.cpp
================================================
[File too large to display: 17.0 KB]

================================================
FILE: src/layer/vulkan/layernorm_vulkan.h
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/lrn_vulkan.cpp
================================================
[File too large to display: 8.8 KB]

================================================
FILE: src/layer/vulkan/lrn_vulkan.h
================================================
[File too large to display: 817 B]

================================================
FILE: src/layer/vulkan/memorydata_vulkan.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/vulkan/memorydata_vulkan.h
================================================
[File too large to display: 638 B]

================================================
FILE: src/layer/vulkan/mish_vulkan.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/mish_vulkan.h
================================================
[File too large to display: 554 B]

================================================
FILE: src/layer/vulkan/multiheadattention_vulkan.cpp
================================================
[File too large to display: 19.2 KB]

================================================
FILE: src/layer/vulkan/multiheadattention_vulkan.h
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/noop_vulkan.cpp
================================================
[File too large to display: 369 B]

================================================
FILE: src/layer/vulkan/noop_vulkan.h
================================================
[File too large to display: 416 B]

================================================
FILE: src/layer/vulkan/normalize_vulkan.cpp
================================================
[File too large to display: 12.4 KB]

================================================
FILE: src/layer/vulkan/normalize_vulkan.h
================================================
[File too large to display: 1.0 KB]

================================================
FILE: src/layer/vulkan/packing_vulkan.cpp
================================================
[File too large to display: 11.6 KB]

================================================
FILE: src/layer/vulkan/packing_vulkan.h
================================================
[File too large to display: 666 B]

================================================
FILE: src/layer/vulkan/padding_vulkan.cpp
================================================
[File too large to display: 17.5 KB]

================================================
FILE: src/layer/vulkan/padding_vulkan.h
================================================
[File too large to display: 1023 B]

================================================
FILE: src/layer/vulkan/permute_vulkan.cpp
================================================
[File too large to display: 11.6 KB]

================================================
FILE: src/layer/vulkan/permute_vulkan.h
================================================
[File too large to display: 704 B]

================================================
FILE: src/layer/vulkan/pixelshuffle_vulkan.cpp
================================================
[File too large to display: 4.6 KB]

================================================
FILE: src/layer/vulkan/pixelshuffle_vulkan.h
================================================
[File too large to display: 718 B]

================================================
FILE: src/layer/vulkan/pooling_vulkan.cpp
================================================
[File too large to display: 21.7 KB]

================================================
FILE: src/layer/vulkan/pooling_vulkan.h
================================================
[File too large to display: 1.1 KB]

================================================
FILE: src/layer/vulkan/prelu_vulkan.cpp
================================================
[File too large to display: 3.1 KB]

================================================
FILE: src/layer/vulkan/prelu_vulkan.h
================================================
[File too large to display: 693 B]

================================================
FILE: src/layer/vulkan/priorbox_vulkan.cpp
================================================
[File too large to display: 6.9 KB]

================================================
FILE: src/layer/vulkan/priorbox_vulkan.h
================================================
[File too large to display: 806 B]

================================================
FILE: src/layer/vulkan/quantize_vulkan.cpp
================================================
[File too large to display: 4.7 KB]

================================================
FILE: src/layer/vulkan/quantize_vulkan.h
================================================
[File too large to display: 726 B]

================================================
FILE: src/layer/vulkan/reduction_vulkan.cpp
================================================
[File too large to display: 5.6 KB]

================================================
FILE: src/layer/vulkan/reduction_vulkan.h
================================================
[File too large to display: 612 B]

================================================
FILE: src/layer/vulkan/relu_vulkan.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/relu_vulkan.h
================================================
[File too large to display: 554 B]

================================================
FILE: src/layer/vulkan/reorg_vulkan.cpp
================================================
[File too large to display: 4.1 KB]

================================================
FILE: src/layer/vulkan/reorg_vulkan.h
================================================
[File too large to display: 641 B]

================================================
FILE: src/layer/vulkan/requantize_vulkan.cpp
================================================
[File too large to display: 6.0 KB]

================================================
FILE: src/layer/vulkan/requantize_vulkan.h
================================================
[File too large to display: 804 B]

================================================
FILE: src/layer/vulkan/reshape_vulkan.cpp
================================================
[File too large to display: 10.1 KB]

================================================
FILE: src/layer/vulkan/reshape_vulkan.h
================================================
[File too large to display: 841 B]

================================================
FILE: src/layer/vulkan/rmsnorm_vulkan.cpp
================================================
[File too large to display: 13.7 KB]

================================================
FILE: src/layer/vulkan/rmsnorm_vulkan.h
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/rotaryembed_vulkan.cpp
================================================
[File too large to display: 3.7 KB]

================================================
FILE: src/layer/vulkan/rotaryembed_vulkan.h
================================================
[File too large to display: 726 B]

================================================
FILE: src/layer/vulkan/scale_vulkan.cpp
================================================
[File too large to display: 5.1 KB]

================================================
FILE: src/layer/vulkan/scale_vulkan.h
================================================
[File too large to display: 830 B]

================================================
FILE: src/layer/vulkan/sdpa_vulkan.cpp
================================================
[File too large to display: 22.6 KB]

================================================
FILE: src/layer/vulkan/sdpa_vulkan.h
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/selu_vulkan.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: src/layer/vulkan/selu_vulkan.h
================================================
[File too large to display: 564 B]

================================================
FILE: src/layer/vulkan/shader/.clang-format
================================================
[File too large to display: 3.6 KB]

================================================
FILE: src/layer/vulkan/shader/absval.comp
================================================
[File too large to display: 484 B]

================================================
FILE: src/layer/vulkan/shader/batchnorm.comp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/vulkan/shader/batchnorm_pack4.comp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/vulkan/shader/binaryop.comp
================================================
[File too large to display: 3.4 KB]

================================================
FILE: src/layer/vulkan/shader/binaryop_broadcast.comp
================================================
[File too large to display: 5.1 KB]

================================================
FILE: src/layer/vulkan/shader/binaryop_broadcast_pack1to4.comp
================================================
[File too large to display: 3.8 KB]

================================================
FILE: src/layer/vulkan/shader/binaryop_broadcast_pack4.comp
================================================
[File too large to display: 5.1 KB]

================================================
FILE: src/layer/vulkan/shader/binaryop_pack4.comp
================================================
[File too large to display: 3.4 KB]

================================================
FILE: src/layer/vulkan/shader/cast_fp16_to_fp32.comp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/shader/cast_fp16_to_fp32_pack4.comp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/shader/cast_fp32_to_fp16.comp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/shader/cast_fp32_to_fp16_pack4.comp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/shader/celu.comp
================================================
[File too large to display: 681 B]

================================================
FILE: src/layer/vulkan/shader/clip.comp
================================================
[File too large to display: 691 B]

================================================
FILE: src/layer/vulkan/shader/concat.comp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: src/layer/vulkan/shader/concat_pack4.comp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: src/layer/vulkan/shader/concat_pack4to1.comp
================================================
[File too large to display: 2.6 KB]

================================================
FILE: src/layer/vulkan/shader/convolution1d_packed.comp
================================================
[File too large to display: 6.8 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_1x1s1d1_cm.comp
================================================
[File too large to display: 47.8 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_input.comp
================================================
[File too large to display: 5.1 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_output.comp
================================================
[File too large to display: 4.4 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_input.comp
================================================
[File too large to display: 12.5 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_output.comp
================================================
[File too large to display: 10.5 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_3x3s1d1_winograd_gemm.comp
================================================
[File too large to display: 4.4 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_gemm_cm.comp
================================================
[File too large to display: 56.4 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_pack1to4_3x3s1d1_winograd_gemm.comp
================================================
[File too large to display: 4.5 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input.comp
================================================
[File too large to display: 5.3 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output.comp
================================================
[File too large to display: 4.5 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input.comp
================================================
[File too large to display: 12.9 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_output.comp
================================================
[File too large to display: 10.8 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm.comp
================================================
[File too large to display: 5.2 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_pack4to1_3x3s1d1_winograd_gemm.comp
================================================
[File too large to display: 4.6 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_packed.comp
================================================
[File too large to display: 12.0 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_packed_1x1s1d1.comp
================================================
[File too large to display: 11.4 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_packed_gemm.comp
================================================
[File too large to display: 14.7 KB]

================================================
FILE: src/layer/vulkan/shader/convolution_winograd_gemm_cm.comp
================================================
[File too large to display: 45.7 KB]

================================================
FILE: src/layer/vulkan/shader/convolutiondepthwise.comp
================================================
[File too large to display: 2.9 KB]

================================================
FILE: src/layer/vulkan/shader/convolutiondepthwise_group.comp
================================================
[File too large to display: 3.2 KB]

================================================
FILE: src/layer/vulkan/shader/convolutiondepthwise_group_pack1to4.comp
================================================
[File too large to display: 3.3 KB]

================================================
FILE: src/layer/vulkan/shader/convolutiondepthwise_group_pack4.comp
================================================
[File too large to display: 4.1 KB]

================================================
FILE: src/layer/vulkan/shader/convolutiondepthwise_group_pack4to1.comp
================================================
[File too large to display: 3.3 KB]

================================================
FILE: src/layer/vulkan/shader/convolutiondepthwise_pack4.comp
================================================
[File too large to display: 3.0 KB]

================================================
FILE: src/layer/vulkan/shader/crop.comp
================================================
[File too large to display: 2.8 KB]

================================================
FILE: src/layer/vulkan/shader/crop_pack1to4.comp
================================================
[File too large to display: 2.9 KB]

================================================
FILE: src/layer/vulkan/shader/crop_pack4.comp
================================================
[File too large to display: 2.8 KB]

================================================
FILE: src/layer/vulkan/shader/crop_pack4to1.comp
================================================
[File too large to display: 4.0 KB]

================================================
FILE: src/layer/vulkan/shader/deconvolution_col2im.comp
================================================
[File too large to display: 3.0 KB]

================================================
FILE: src/layer/vulkan/shader/deconvolution_gemm_cm.comp
================================================
[File too large to display: 45.3 KB]

================================================
FILE: src/layer/vulkan/shader/deconvolution_gemm_packed.comp
================================================
[File too large to display: 9.7 KB]

================================================
FILE: src/layer/vulkan/shader/deconvolution_pack4_col2im.comp
================================================
[File too large to display: 3.0 KB]

================================================
FILE: src/layer/vulkan/shader/deconvolution_packed.comp
================================================
[File too large to display: 14.3 KB]

================================================
FILE: src/layer/vulkan/shader/deconvolutiondepthwise.comp
================================================
[File too large to display: 3.5 KB]

================================================
FILE: src/layer/vulkan/shader/deconvolutiondepthwise_group.comp
================================================
[File too large to display: 3.8 KB]

================================================
FILE: src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to4.comp
================================================
[File too large to display: 3.9 KB]

================================================
FILE: src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4.comp
================================================
[File too large to display: 4.6 KB]

================================================
FILE: src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to1.comp
================================================
[File too large to display: 3.9 KB]

================================================
FILE: src/layer/vulkan/shader/deconvolutiondepthwise_pack4.comp
================================================
[File too large to display: 3.5 KB]

================================================
FILE: src/layer/vulkan/shader/deepcopy.comp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: src/layer/vulkan/shader/deepcopy_pack4.comp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: src/layer/vulkan/shader/dequantize.comp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/vulkan/shader/dequantize_pack4.comp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/vulkan/shader/dropout.comp
================================================
[File too large to display: 593 B]

================================================
FILE: src/layer/vulkan/shader/eltwise.comp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/shader/elu.comp
================================================
[File too large to display: 675 B]

================================================
FILE: src/layer/vulkan/shader/erf.comp
================================================
[File too large to display: 964 B]

================================================
FILE: src/layer/vulkan/shader/flatten.comp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: src/layer/vulkan/shader/flatten_pack1to4.comp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: src/layer/vulkan/shader/flatten_pack4.comp
================================================
[File too large to display: 2.9 KB]

================================================
FILE: src/layer/vulkan/shader/gelu.comp
================================================
[File too large to display: 831 B]

================================================
FILE: src/layer/vulkan/shader/gemm.comp
================================================
[File too large to display: 24.3 KB]

================================================
FILE: src/layer/vulkan/shader/gemm_cm.comp
================================================
[File too large to display: 165.2 KB]

================================================
FILE: src/layer/vulkan/shader/gemm_sg.comp
================================================
[File too large to display: 15.5 KB]

================================================
FILE: src/layer/vulkan/shader/groupnorm_coeffs.comp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/shader/groupnorm_coeffs_pack4.comp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: src/layer/vulkan/shader/groupnorm_norm.comp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/shader/groupnorm_norm_pack4.comp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/shader/groupnorm_reduce_mean.comp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/shader/groupnorm_reduce_mean_pack4.comp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/vulkan/shader/groupnorm_reduce_sum4_fp16_to_fp32.comp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/vulkan/shader/groupnorm_reduce_sum4_fp16_to_fp32_pack4.comp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: src/layer/vulkan/shader/groupnorm_reduce_sum4_fp32.comp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/shader/groupnorm_reduce_sum4_fp32_pack4.comp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/shader/groupnorm_sub_mean_square.comp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: src/layer/vulkan/shader/groupnorm_sub_mean_square_pack4.comp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: src/layer/vulkan/shader/hardsigmoid.comp
================================================
[File too large to display: 681 B]

================================================
FILE: src/layer/vulkan/shader/hardswish.comp
================================================
[File too large to display: 685 B]

================================================
FILE: src/layer/vulkan/shader/innerproduct.comp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: src/layer/vulkan/shader/innerproduct_gemm.comp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: src/layer/vulkan/shader/innerproduct_gemm_wp1to4.comp
================================================
[File too large to display: 2.5 KB]

================================================
FILE: src/layer/vulkan/shader/innerproduct_gemm_wp4.comp
================================================
[File too large to display: 3.4 KB]

================================================
FILE: src/layer/vulkan/shader/innerproduct_gemm_wp4to1.comp
================================================
[File too large to display: 2.6 KB]

================================================
FILE: src/layer/vulkan/shader/innerproduct_pack1to4.comp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: src/layer/vulkan/shader/innerproduct_pack4.comp
================================================
[File too large to display: 3.0 KB]

================================================
FILE: src/layer/vulkan/shader/innerproduct_pack4to1.comp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: src/layer/vulkan/shader/innerproduct_reduce_sum8.comp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/shader/innerproduct_reduce_sum8_pack4.comp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/shader/innerproduct_sum8.comp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/shader/innerproduct_sum8_pack1to4.comp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/shader/innerproduct_sum8_pack4.comp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: src/layer/vulkan/shader/innerproduct_sum8_pack4to1.comp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/shader/instancenorm_coeffs.comp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/vulkan/shader/instancenorm_coeffs_pack4.comp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/vulkan/shader/instancenorm_norm.comp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/shader/instancenorm_norm_pack4.comp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/shader/instancenorm_reduce_mean.comp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: src/layer/vulkan/shader/instancenorm_reduce_mean_pack4.comp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: src/layer/vulkan/shader/instancenorm_reduce_sum4_fp16_to_fp32.comp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/vulkan/shader/instancenorm_reduce_sum4_fp16_to_fp32_pack4.comp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: src/layer/vulkan/shader/instancenorm_reduce_sum4_fp32.comp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/shader/instancenorm_reduce_sum4_fp32_pack4.comp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/shader/instancenorm_sub_mean_square.comp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/vulkan/shader/instancenorm_sub_mean_square_pack4.comp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/vulkan/shader/interp.comp
================================================
[File too large to display: 4.9 KB]

================================================
FILE: src/layer/vulkan/shader/interp_bicubic.comp
================================================
[File too large to display: 3.9 KB]

================================================
FILE: src/layer/vulkan/shader/interp_bicubic_coeffs.comp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: src/layer/vulkan/shader/interp_bicubic_pack4.comp
================================================
[File too large to display: 4.2 KB]

================================================
FILE: src/layer/vulkan/shader/interp_pack4.comp
================================================
[File too large to display: 5.0 KB]

================================================
FILE: src/layer/vulkan/shader/layernorm_coeffs.comp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: src/layer/vulkan/shader/layernorm_coeffs_pack4.comp
================================================
[File too large to display: 990 B]

================================================
FILE: src/layer/vulkan/shader/layernorm_norm.comp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/shader/layernorm_norm_pack4.comp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/shader/layernorm_reduce_mean.comp
================================================
[File too large to display: 836 B]

================================================
FILE: src/layer/vulkan/shader/layernorm_reduce_mean_pack4.comp
================================================
[File too large to display: 839 B]

================================================
FILE: src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32_pack4.comp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: src/layer/vulkan/shader/layernorm_reduce_sum4_fp32_pack4.comp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: src/layer/vulkan/shader/layernorm_sub_mean_square.comp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/shader/layernorm_sub_mean_square_pack4.comp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: src/layer/vulkan/shader/lrn_norm.comp
================================================
[File too large to display: 2.7 KB]

================================================
FILE: src/layer/vulkan/shader/lrn_norm_across_channel_pack4.comp
================================================
[File too large to display: 2.4 KB]

================================================
FILE: src/layer/vulkan/shader/lrn_norm_within_channel_pack4.comp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: src/layer/vulkan/shader/lrn_square_pad.comp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: src/layer/vulkan/shader/lrn_square_pad_across_channel_pack4.comp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: src/layer/vulkan/shader/lrn_square_pad_within_channel_pack4.comp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: src/layer/vulkan/shader/mish.comp
================================================
[File too large to display: 600 B]

================================================
FILE: src/layer/vulkan/shader/multiheadattention_qk_cross.comp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/vulkan/shader/multiheadattention_qk_cross_pack1to4.comp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: src/layer/vulkan/shader/multiheadattention_qk_cross_pack4.comp
================================================
[File too large to display: 4.6 KB]

================================================
FILE: src/layer/vulkan/shader/multiheadattention_qk_cross_pack4to1.comp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: src/layer/vulkan/shader/multiheadattention_qkv_cross.comp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/shader/multiheadattention_qkv_cross_pack1to4.comp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/shader/multiheadattention_qkv_cross_pack4.comp
================================================
[File too large to display: 3.4 KB]

================================================
FILE: src/layer/vulkan/shader/multiheadattention_qkv_cross_pack4to1.comp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/shader/normalize_coeffs.comp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/shader/normalize_coeffs_pack4.comp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: src/layer/vulkan/shader/normalize_norm.comp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: src/layer/vulkan/shader/normalize_norm_pack4.comp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: src/layer/vulkan/shader/normalize_reduce_sum4_fp16_to_fp32.comp
================================================
[File too large to display: 4.9 KB]

================================================
FILE: src/layer/vulkan/shader/normalize_reduce_sum4_fp16_to_fp32_pack4.comp
================================================
[File too large to display: 4.7 KB]

================================================
FILE: src/layer/vulkan/shader/normalize_reduce_sum4_fp32.comp
================================================
[File too large to display: 3.9 KB]

================================================
FILE: src/layer/vulkan/shader/normalize_reduce_sum4_fp32_pack4.comp
================================================
[File too large to display: 3.9 KB]

================================================
FILE: src/layer/vulkan/shader/packing.comp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/shader/packing_int8.comp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/shader/packing_pack1to4.comp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: src/layer/vulkan/shader/packing_pack1to4_int8.comp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: src/layer/vulkan/shader/packing_pack4to1.comp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: src/layer/vulkan/shader/packing_pack4to1_int8.comp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: src/layer/vulkan/shader/padding.comp
================================================
[File too large to display: 5.5 KB]

================================================
FILE: src/layer/vulkan/shader/padding_3d.comp
================================================
[File too large to display: 2.7 KB]

================================================
FILE: src/layer/vulkan/shader/padding_3d_pack4.comp
================================================
[File too large to display: 2.7 KB]

================================================
FILE: src/layer/vulkan/shader/padding_pack1to4.comp
================================================
[File too large to display: 7.2 KB]

================================================
FILE: src/layer/vulkan/shader/padding_pack4.comp
================================================
[File too large to display: 7.4 KB]

================================================
FILE: src/layer/vulkan/shader/padding_pack4to1.comp
================================================
[File too large to display: 8.5 KB]

================================================
FILE: src/layer/vulkan/shader/permute.comp
================================================
[File too large to display: 6.5 KB]

================================================
FILE: src/layer/vulkan/shader/permute_pack1to4.comp
================================================
[File too large to display: 7.8 KB]

================================================
FILE: src/layer/vulkan/shader/permute_pack4.comp
================================================
[File too large to display: 9.6 KB]

================================================
FILE: src/layer/vulkan/shader/permute_pack4to1.comp
================================================
[File too large to display: 7.9 KB]

================================================
FILE: src/layer/vulkan/shader/pixelshuffle.comp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: src/layer/vulkan/shader/pixelshuffle_pack4.comp
================================================
[File too large to display: 2.7 KB]

================================================
FILE: src/layer/vulkan/shader/pixelshuffle_pack4to1.comp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: src/layer/vulkan/shader/pooling.comp
================================================
[File too large to display: 3.9 KB]

================================================
FILE: src/layer/vulkan/shader/pooling_adaptive.comp
================================================
[File too large to display: 2.8 KB]

================================================
FILE: src/layer/vulkan/shader/pooling_adaptive_pack4.comp
================================================
[File too large to display: 2.8 KB]

================================================
FILE: src/layer/vulkan/shader/pooling_global_reduce_max.comp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/vulkan/shader/pooling_global_reduce_max_first.comp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/shader/pooling_global_reduce_max_first_pack4.comp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/shader/pooling_global_reduce_max_last.comp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: src/layer/vulkan/shader/pooling_global_reduce_max_last_pack4.comp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: src/layer/vulkan/shader/pooling_global_reduce_max_pack4.comp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/vulkan/shader/pooling_global_reduce_sum.comp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/vulkan/shader/pooling_global_reduce_sum_first.comp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/shader/pooling_global_reduce_sum_first_pack4.comp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/shader/pooling_global_reduce_sum_last.comp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: src/layer/vulkan/shader/pooling_global_reduce_sum_last_pack4.comp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: src/layer/vulkan/shader/pooling_global_reduce_sum_pack4.comp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/vulkan/shader/pooling_pack4.comp
================================================
[File too large to display: 4.0 KB]

================================================
FILE: src/layer/vulkan/shader/prelu.comp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/shader/prelu_pack4.comp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/shader/priorbox.comp
================================================
[File too large to display: 4.5 KB]

================================================
FILE: src/layer/vulkan/shader/priorbox_mxnet.comp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: src/layer/vulkan/shader/quantize.comp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/vulkan/shader/quantize_pack4.comp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/shader/reduction.comp
================================================
[File too large to display: 9.2 KB]

================================================
FILE: src/layer/vulkan/shader/relu.comp
================================================
[File too large to display: 694 B]

================================================
FILE: src/layer/vulkan/shader/reorg.comp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: src/layer/vulkan/shader/reorg_pack1to4.comp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: src/layer/vulkan/shader/reorg_pack4.comp
================================================
[File too large to display: 2.7 KB]

================================================
FILE: src/layer/vulkan/shader/requantize.comp
================================================
[File too large to display: 2.5 KB]

================================================
FILE: src/layer/vulkan/shader/requantize_pack4.comp
================================================
[File too large to display: 2.6 KB]

================================================
FILE: src/layer/vulkan/shader/reshape.comp
================================================
[File too large to display: 2.8 KB]

================================================
FILE: src/layer/vulkan/shader/reshape_pack1to4.comp
================================================
[File too large to display: 3.0 KB]

================================================
FILE: src/layer/vulkan/shader/reshape_pack4.comp
================================================
[File too large to display: 4.3 KB]

================================================
FILE: src/layer/vulkan/shader/reshape_pack4to1.comp
================================================
[File too large to display: 3.0 KB]

================================================
FILE: src/layer/vulkan/shader/rmsnorm_coeffs.comp
================================================
[File too large to display: 796 B]

================================================
FILE: src/layer/vulkan/shader/rmsnorm_coeffs_pack4.comp
================================================
[File too large to display: 808 B]

================================================
FILE: src/layer/vulkan/shader/rmsnorm_norm.comp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/vulkan/shader/rmsnorm_norm_pack4.comp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/vulkan/shader/rmsnorm_square.comp
================================================
[File too large to display: 714 B]

================================================
FILE: src/layer/vulkan/shader/rmsnorm_square_pack4.comp
================================================
[File too large to display: 725 B]

================================================
FILE: src/layer/vulkan/shader/rotaryembed.comp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: src/layer/vulkan/shader/rotaryembed_pack4.comp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: src/layer/vulkan/shader/scale.comp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/shader/scale_pack4.comp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/shader/sdpa_cross.comp
================================================
[File too large to display: 11.7 KB]

================================================
FILE: src/layer/vulkan/shader/sdpa_cross_cm.comp
================================================
[File too large to display: 94.8 KB]

================================================
FILE: src/layer/vulkan/shader/sdpa_fa.comp
================================================
[File too large to display: 14.2 KB]

================================================
FILE: src/layer/vulkan/shader/sdpa_fa_cm.comp
================================================
[File too large to display: 35.7 KB]

================================================
FILE: src/layer/vulkan/shader/selu.comp
================================================
[File too large to display: 822 B]

================================================
FILE: src/layer/vulkan/shader/shrink.comp
================================================
[File too large to display: 874 B]

================================================
FILE: src/layer/vulkan/shader/shufflechannel.comp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: src/layer/vulkan/shader/shufflechannel_pack4.comp
================================================
[File too large to display: 2.8 KB]

================================================
FILE: src/layer/vulkan/shader/sigmoid.comp
================================================
[File too large to display: 517 B]

================================================
FILE: src/layer/vulkan/shader/slice.comp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: src/layer/vulkan/shader/slice_pack1to4.comp
================================================
[File too large to display: 2.6 KB]

================================================
FILE: src/layer/vulkan/shader/slice_pack4.comp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: src/layer/vulkan/shader/softmax_div_sum.comp
================================================
[File too large to display: 3.2 KB]

================================================
FILE: src/layer/vulkan/shader/softmax_div_sum_pack4.comp
================================================
[File too large to display: 3.5 KB]

================================================
FILE: src/layer/vulkan/shader/softmax_exp_sub_max.comp
================================================
[File too large to display: 3.3 KB]

================================================
FILE: src/layer/vulkan/shader/softmax_exp_sub_max_pack4.comp
================================================
[File too large to display: 3.6 KB]

================================================
FILE: src/layer/vulkan/shader/softmax_reduce_max.comp
================================================
[File too large to display: 5.4 KB]

================================================
FILE: src/layer/vulkan/shader/softmax_reduce_max_pack4.comp
================================================
[File too large to display: 5.8 KB]

================================================
FILE: src/layer/vulkan/shader/softmax_reduce_sum.comp
================================================
[File too large to display: 5.2 KB]

================================================
FILE: src/layer/vulkan/shader/softmax_reduce_sum_pack4.comp
================================================
[File too large to display: 5.6 KB]

================================================
FILE: src/layer/vulkan/shader/softplus.comp
================================================
[File too large to display: 514 B]

================================================
FILE: src/layer/vulkan/shader/swish.comp
================================================
[File too large to display: 500 B]

================================================
FILE: src/layer/vulkan/shader/tanh.comp
================================================
[File too large to display: 542 B]

================================================
FILE: src/layer/vulkan/shader/unaryop.comp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/shader/unfold_im2col.comp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/vulkan/shader/unfold_im2col_pack1to4.comp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: src/layer/vulkan/shader/unfold_im2col_pack4.comp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: src/layer/vulkan/shader/unfold_im2col_pack4to1.comp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: src/layer/vulkan/shader/vulkan_activation.comp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: src/layer/vulkan/shrink_vulkan.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/shrink_vulkan.h
================================================
[File too large to display: 582 B]

================================================
FILE: src/layer/vulkan/shufflechannel_vulkan.cpp
================================================
[File too large to display: 3.4 KB]

================================================
FILE: src/layer/vulkan/shufflechannel_vulkan.h
================================================
[File too large to display: 692 B]

================================================
FILE: src/layer/vulkan/sigmoid_vulkan.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/sigmoid_vulkan.h
================================================
[File too large to display: 581 B]

================================================
FILE: src/layer/vulkan/slice_vulkan.cpp
================================================
[File too large to display: 31.7 KB]

================================================
FILE: src/layer/vulkan/slice_vulkan.h
================================================
[File too large to display: 678 B]

================================================
FILE: src/layer/vulkan/softmax_vulkan.cpp
================================================
[File too large to display: 13.7 KB]

================================================
FILE: src/layer/vulkan/softmax_vulkan.h
================================================
[File too large to display: 914 B]

================================================
FILE: src/layer/vulkan/softplus_vulkan.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/softplus_vulkan.h
================================================
[File too large to display: 600 B]

================================================
FILE: src/layer/vulkan/split_vulkan.cpp
================================================
[File too large to display: 541 B]

================================================
FILE: src/layer/vulkan/split_vulkan.h
================================================
[File too large to display: 449 B]

================================================
FILE: src/layer/vulkan/swish_vulkan.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/swish_vulkan.h
================================================
[File too large to display: 563 B]

================================================
FILE: src/layer/vulkan/tanh_vulkan.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/vulkan/tanh_vulkan.h
================================================
[File too large to display: 554 B]

================================================
FILE: src/layer/vulkan/unaryop_vulkan.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/vulkan/unaryop_vulkan.h
================================================
[File too large to display: 581 B]

================================================
FILE: src/layer/vulkan/unfold_vulkan.cpp
================================================
[File too large to display: 5.8 KB]

================================================
FILE: src/layer/vulkan/unfold_vulkan.h
================================================
[File too large to display: 730 B]

================================================
FILE: src/layer/x86/absval_x86.cpp
================================================
[File too large to display: 4.6 KB]

================================================
FILE: src/layer/x86/absval_x86.h
================================================
[File too large to display: 454 B]

================================================
FILE: src/layer/x86/avx512_mathfun.h
================================================
[File too large to display: 36.7 KB]

================================================
FILE: src/layer/x86/avx_mathfun.h
================================================
[File too large to display: 46.1 KB]

================================================
FILE: src/layer/x86/batchnorm_bf16s.h
================================================
[File too large to display: 4.6 KB]

================================================
FILE: src/layer/x86/batchnorm_x86.cpp
================================================
[File too large to display: 8.6 KB]

================================================
FILE: src/layer/x86/batchnorm_x86.h
================================================
[File too large to display: 490 B]

================================================
FILE: src/layer/x86/batchnorm_x86_avx512bf16.cpp
================================================
[File too large to display: 568 B]

================================================
FILE: src/layer/x86/bias_x86.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/x86/bias_x86.h
================================================
[File too large to display: 327 B]

================================================
FILE: src/layer/x86/binaryop_bf16s.h
================================================
[File too large to display: 10.8 KB]

================================================
FILE: src/layer/x86/binaryop_functor.h
================================================
[File too large to display: 11.7 KB]

================================================
FILE: src/layer/x86/binaryop_x86.cpp
================================================
[File too large to display: 38.3 KB]

================================================
FILE: src/layer/x86/binaryop_x86.h
================================================
[File too large to display: 716 B]

================================================
FILE: src/layer/x86/binaryop_x86_avx512bf16.cpp
================================================
[File too large to display: 3.2 KB]

================================================
FILE: src/layer/x86/bnll_x86.cpp
================================================
[File too large to display: 3.0 KB]

================================================
FILE: src/layer/x86/bnll_x86.h
================================================
[File too large to display: 374 B]

================================================
FILE: src/layer/x86/cast_bf16.h
================================================
[File too large to display: 4.4 KB]

================================================
FILE: src/layer/x86/cast_fp16.h
================================================
[File too large to display: 3.8 KB]

================================================
FILE: src/layer/x86/cast_x86.cpp
================================================
[File too large to display: 2.7 KB]

================================================
FILE: src/layer/x86/cast_x86.h
================================================
[File too large to display: 353 B]

================================================
FILE: src/layer/x86/cast_x86_avx2.cpp
================================================
[File too large to display: 492 B]

================================================
FILE: src/layer/x86/cast_x86_avx512bf16.cpp
================================================
[File too large to display: 504 B]

================================================
FILE: src/layer/x86/cast_x86_f16c.cpp
================================================
[File too large to display: 492 B]

================================================
FILE: src/layer/x86/clip_bf16s.h
================================================
[File too large to display: 3.2 KB]

================================================
FILE: src/layer/x86/clip_x86.cpp
================================================
[File too large to display: 3.0 KB]

================================================
FILE: src/layer/x86/clip_x86.h
================================================
[File too large to display: 455 B]

================================================
FILE: src/layer/x86/clip_x86_avx512bf16.cpp
================================================
[File too large to display: 332 B]

================================================
FILE: src/layer/x86/concat_x86.cpp
================================================
[File too large to display: 56.0 KB]

================================================
FILE: src/layer/x86/concat_x86.h
================================================
[File too large to display: 528 B]

================================================
FILE: src/layer/x86/convolution1d_packed.h
================================================
[File too large to display: 116.7 KB]

================================================
FILE: src/layer/x86/convolution1d_x86.cpp
================================================
[File too large to display: 4.0 KB]

================================================
FILE: src/layer/x86/convolution1d_x86.h
================================================
[File too large to display: 673 B]

================================================
FILE: src/layer/x86/convolution_1x1.h
================================================
[File too large to display: 4.8 KB]

================================================
FILE: src/layer/x86/convolution_2x2_pack8.h
================================================
[File too large to display: 17.3 KB]

================================================
FILE: src/layer/x86/convolution_3x3.h
================================================
[File too large to display: 5.3 KB]

================================================
FILE: src/layer/x86/convolution_3x3_int8.h
================================================
[File too large to display: 3.6 KB]

================================================
FILE: src/layer/x86/convolution_3x3_pack16to1.h
================================================
[File too large to display: 3.0 KB]

================================================
FILE: src/layer/x86/convolution_3x3_pack1to4.h
================================================
[File too large to display: 59.4 KB]

================================================
FILE: src/layer/x86/convolution_3x3_pack1to8.h
================================================
[File too large to display: 78.1 KB]

================================================
FILE: src/layer/x86/convolution_3x3_pack8.h
================================================
[File too large to display: 37.1 KB]

================================================
FILE: src/layer/x86/convolution_3x3_pack8to1.h
================================================
[File too large to display: 3.0 KB]

================================================
FILE: src/layer/x86/convolution_3x3_winograd.h
================================================
[File too large to display: 284.6 KB]

================================================
FILE: src/layer/x86/convolution_3x3_winograd_int8.h
================================================
[File too large to display: 268.7 KB]

================================================
FILE: src/layer/x86/convolution_5x5.h
================================================
[File too large to display: 5.8 KB]

================================================
FILE: src/layer/x86/convolution_im2col_gemm.h
================================================
[File too large to display: 174.8 KB]

================================================
FILE: src/layer/x86/convolution_im2col_gemm_int8.h
================================================
[File too large to display: 204.6 KB]

================================================
FILE: src/layer/x86/convolution_packed.h
================================================
[File too large to display: 124.6 KB]

================================================
FILE: src/layer/x86/convolution_packed_int8.h
================================================
[File too large to display: 252.3 KB]

================================================
FILE: src/layer/x86/convolution_x86.cpp
================================================
[File too large to display: 42.0 KB]

================================================
FILE: src/layer/x86/convolution_x86.h
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/x86/convolution_x86_avx2.cpp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: src/layer/x86/convolution_x86_avx512vnni.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/x86/convolution_x86_avxvnni.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/x86/convolution_x86_avxvnniint8.cpp
================================================
[File too large to display: 576 B]

================================================
FILE: src/layer/x86/convolution_x86_xop.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/x86/convolutiondepthwise_3x3.h
================================================
[File too large to display: 4.6 KB]

================================================
FILE: src/layer/x86/convolutiondepthwise_3x3_int8.h
================================================
[File too large to display: 11.0 KB]

================================================
FILE: src/layer/x86/convolutiondepthwise_3x3_pack16.h
================================================
[File too large to display: 30.0 KB]

================================================
FILE: src/layer/x86/convolutiondepthwise_3x3_pack4.h
================================================
[File too large to display: 23.8 KB]

================================================
FILE: src/layer/x86/convolutiondepthwise_3x3_pack8.h
================================================
[File too large to display: 33.6 KB]

================================================
FILE: src/layer/x86/convolutiondepthwise_5x5_pack16.h
================================================
[File too large to display: 11.3 KB]

================================================
FILE: src/layer/x86/convolutiondepthwise_5x5_pack4.h
================================================
[File too large to display: 18.1 KB]

================================================
FILE: src/layer/x86/convolutiondepthwise_5x5_pack8.h
================================================
[File too large to display: 11.5 KB]

================================================
FILE: src/layer/x86/convolutiondepthwise_x86.cpp
================================================
[File too large to display: 42.8 KB]

================================================
FILE: src/layer/x86/convolutiondepthwise_x86.h
================================================
[File too large to display: 1008 B]

================================================
FILE: src/layer/x86/crop_x86.cpp
================================================
[File too large to display: 36.8 KB]

================================================
FILE: src/layer/x86/crop_x86.h
================================================
[File too large to display: 471 B]

================================================
FILE: src/layer/x86/deconvolution_packed.h
================================================
[File too large to display: 129.0 KB]

================================================
FILE: src/layer/x86/deconvolution_x86.cpp
================================================
[File too large to display: 15.4 KB]

================================================
FILE: src/layer/x86/deconvolution_x86.h
================================================
[File too large to display: 714 B]

================================================
FILE: src/layer/x86/deconvolutiondepthwise_x86.cpp
================================================
[File too large to display: 21.8 KB]

================================================
FILE: src/layer/x86/deconvolutiondepthwise_x86.h
================================================
[File too large to display: 835 B]

================================================
FILE: src/layer/x86/deformableconv2d_packed.h
================================================
[File too large to display: 15.0 KB]

================================================
FILE: src/layer/x86/deformableconv2d_x86.cpp
================================================
[File too large to display: 23.0 KB]

================================================
FILE: src/layer/x86/deformableconv2d_x86.h
================================================
[File too large to display: 646 B]

================================================
FILE: src/layer/x86/dequantize_x86.cpp
================================================
[File too large to display: 8.3 KB]

================================================
FILE: src/layer/x86/dequantize_x86.h
================================================
[File too large to display: 395 B]

================================================
FILE: src/layer/x86/dropout_x86.cpp
================================================
[File too large to display: 4.1 KB]

================================================
FILE: src/layer/x86/dropout_x86.h
================================================
[File too large to display: 365 B]

================================================
FILE: src/layer/x86/eltwise_x86.cpp
================================================
[File too large to display: 16.2 KB]

================================================
FILE: src/layer/x86/eltwise_x86.h
================================================
[File too large to display: 402 B]

================================================
FILE: src/layer/x86/elu_x86.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: src/layer/x86/elu_x86.h
================================================
[File too large to display: 337 B]

================================================
FILE: src/layer/x86/erf_x86.cpp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: src/layer/x86/erf_x86.h
================================================
[File too large to display: 347 B]

================================================
FILE: src/layer/x86/flatten_x86.cpp
================================================
[File too large to display: 38.1 KB]

================================================
FILE: src/layer/x86/flatten_x86.h
================================================
[File too large to display: 565 B]

================================================
FILE: src/layer/x86/gelu_x86.cpp
================================================
[File too large to display: 6.0 KB]

================================================
FILE: src/layer/x86/gelu_x86.h
================================================
[File too large to display: 396 B]

================================================
FILE: src/layer/x86/gemm_bf16s.h
================================================
[File too large to display: 101.7 KB]

================================================
FILE: src/layer/x86/gemm_int8.h
================================================
[File too large to display: 624.4 KB]

================================================
FILE: src/layer/x86/gemm_x86.cpp
================================================
[File too large to display: 320.0 KB]

================================================
FILE: src/layer/x86/gemm_x86.h
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/x86/gemm_x86_avx2.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: src/layer/x86/gemm_x86_avx512vnni.cpp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: src/layer/x86/gemm_x86_avxvnni.cpp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: src/layer/x86/gemm_x86_avxvnniint8.cpp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: src/layer/x86/gemm_x86_xop.cpp
================================================
[File too large to display: 602 B]

================================================
FILE: src/layer/x86/gridsample_bicubic_apply_interpolation.h
================================================
[File too large to display: 12.2 KB]

================================================
FILE: src/layer/x86/gridsample_bicubic_compute_blob.h
================================================
[File too large to display: 13.4 KB]

================================================
FILE: src/layer/x86/gridsample_bilinear_apply_interpolation.h
================================================
[File too large to display: 15.8 KB]

================================================
FILE: src/layer/x86/gridsample_bilinear_compute_blob.h
================================================
[File too large to display: 30.2 KB]

================================================
FILE: src/layer/x86/gridsample_compute_blob.h
================================================
[File too large to display: 3.4 KB]

================================================
FILE: src/layer/x86/gridsample_nearest_apply_interpolation.h
================================================
[File too large to display: 3.2 KB]

================================================
FILE: src/layer/x86/gridsample_nearest_compute_blob.h
================================================
[File too large to display: 11.5 KB]

================================================
FILE: src/layer/x86/gridsample_x86.cpp
================================================
[File too large to display: 16.2 KB]

================================================
FILE: src/layer/x86/gridsample_x86.h
================================================
[File too large to display: 423 B]

================================================
FILE: src/layer/x86/groupnorm_bf16s.h
================================================
[File too large to display: 10.1 KB]

================================================
FILE: src/layer/x86/groupnorm_x86.cpp
================================================
[File too large to display: 15.4 KB]

================================================
FILE: src/layer/x86/groupnorm_x86.h
================================================
[File too large to display: 490 B]

================================================
FILE: src/layer/x86/groupnorm_x86_avx512bf16.cpp
================================================
[File too large to display: 459 B]

================================================
FILE: src/layer/x86/hardsigmoid_x86.cpp
================================================
[File too large to display: 2.6 KB]

================================================
FILE: src/layer/x86/hardsigmoid_x86.h
================================================
[File too large to display: 393 B]

================================================
FILE: src/layer/x86/hardswish_x86.cpp
================================================
[File too large to display: 2.7 KB]

================================================
FILE: src/layer/x86/hardswish_x86.h
================================================
[File too large to display: 379 B]

================================================
FILE: src/layer/x86/innerproduct_fp.h
================================================
[File too large to display: 55.1 KB]

================================================
FILE: src/layer/x86/innerproduct_gemm_fp.h
================================================
[File too large to display: 50.4 KB]

================================================
FILE: src/layer/x86/innerproduct_x86.cpp
================================================
[File too large to display: 26.4 KB]

================================================
FILE: src/layer/x86/innerproduct_x86.h
================================================
[File too large to display: 959 B]

================================================
FILE: src/layer/x86/innerproduct_x86_f16c.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/x86/instancenorm_bf16s.h
================================================
[File too large to display: 4.7 KB]

================================================
FILE: src/layer/x86/instancenorm_x86.cpp
================================================
[File too large to display: 6.1 KB]

================================================
FILE: src/layer/x86/instancenorm_x86.h
================================================
[File too large to display: 511 B]

================================================
FILE: src/layer/x86/instancenorm_x86_avx512bf16.cpp
================================================
[File too large to display: 521 B]

================================================
FILE: src/layer/x86/interp_bicubic.h
================================================
[File too large to display: 33.7 KB]

================================================
FILE: src/layer/x86/interp_bicubic_pack16.h
================================================
[File too large to display: 9.7 KB]

================================================
FILE: src/layer/x86/interp_bicubic_pack4.h
================================================
[File too large to display: 32.8 KB]

================================================
FILE: src/layer/x86/interp_bicubic_pack8.h
================================================
[File too large to display: 21.1 KB]

================================================
FILE: src/layer/x86/interp_bilinear.h
================================================
[File too large to display: 12.6 KB]

================================================
FILE: src/layer/x86/interp_bilinear_pack16.h
================================================
[File too large to display: 2.7 KB]

================================================
FILE: src/layer/x86/interp_bilinear_pack4.h
================================================
[File too large to display: 8.5 KB]

================================================
FILE: src/layer/x86/interp_bilinear_pack8.h
================================================
[File too large to display: 5.2 KB]

================================================
FILE: src/layer/x86/interp_x86.cpp
================================================
[File too large to display: 16.1 KB]

================================================
FILE: src/layer/x86/interp_x86.h
================================================
[File too large to display: 395 B]

================================================
FILE: src/layer/x86/interp_x86_avx2.cpp
================================================
[File too large to display: 572 B]

================================================
FILE: src/layer/x86/layernorm_bf16s.h
================================================
[File too large to display: 16.7 KB]

================================================
FILE: src/layer/x86/layernorm_x86.cpp
================================================
[File too large to display: 18.8 KB]

================================================
FILE: src/layer/x86/layernorm_x86.h
================================================
[File too large to display: 490 B]

================================================
FILE: src/layer/x86/layernorm_x86_avx512bf16.cpp
================================================
[File too large to display: 424 B]

================================================
FILE: src/layer/x86/lrn_x86.cpp
================================================
[File too large to display: 5.5 KB]

================================================
FILE: src/layer/x86/lrn_x86.h
================================================
[File too large to display: 321 B]

================================================
FILE: src/layer/x86/lstm_int8.h
================================================
[File too large to display: 152.4 KB]

================================================
FILE: src/layer/x86/lstm_x86.cpp
================================================
[File too large to display: 32.6 KB]

================================================
FILE: src/layer/x86/lstm_x86.h
================================================
[File too large to display: 990 B]

================================================
FILE: src/layer/x86/lstm_x86_avx2.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer/x86/lstm_x86_avx512vnni.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/x86/lstm_x86_avxvnni.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: src/layer/x86/lstm_x86_xop.cpp
================================================
[File too large to display: 697 B]

================================================
FILE: src/layer/x86/matmul_x86.cpp
================================================
[File too large to display: 6.4 KB]

================================================
FILE: src/layer/x86/matmul_x86.h
================================================
[File too large to display: 527 B]

================================================
FILE: src/layer/x86/mish_x86.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/x86/mish_x86.h
================================================
[File too large to display: 344 B]

================================================
FILE: src/layer/x86/multiheadattention_x86.cpp
================================================
[File too large to display: 15.2 KB]

================================================
FILE: src/layer/x86/multiheadattention_x86.h
================================================
[File too large to display: 736 B]

================================================
FILE: src/layer/x86/packing_x86.cpp
================================================
[File too large to display: 97.2 KB]

================================================
FILE: src/layer/x86/packing_x86.h
================================================
[File too large to display: 565 B]

================================================
FILE: src/layer/x86/padding_pack16.h
================================================
[File too large to display: 5.1 KB]

================================================
FILE: src/layer/x86/padding_pack16_bf16s_fp16s.h
================================================
[File too large to display: 5.9 KB]

================================================
FILE: src/layer/x86/padding_pack4.h
================================================
[File too large to display: 4.9 KB]

================================================
FILE: src/layer/x86/padding_pack4_bf16s_fp16s.h
================================================
[File too large to display: 4.8 KB]

================================================
FILE: src/layer/x86/padding_pack8.h
================================================
[File too large to display: 5.1 KB]

================================================
FILE: src/layer/x86/padding_pack8_bf16s_fp16s.h
================================================
[File too large to display: 5.7 KB]

================================================
FILE: src/layer/x86/padding_pack8_int8.h
================================================
[File too large to display: 3.5 KB]

================================================
FILE: src/layer/x86/padding_x86.cpp
================================================
[File too large to display: 39.7 KB]

================================================
FILE: src/layer/x86/padding_x86.h
================================================
[File too large to display: 837 B]

================================================
FILE: src/layer/x86/pooling_2x2.h
================================================
[File too large to display: 2.1 KB]

================================================
FILE: src/layer/x86/pooling_2x2_pack16.h
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/x86/pooling_2x2_pack4.h
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/x86/pooling_2x2_pack8.h
================================================
[File too large to display: 1.3 KB]

================================================
FILE: src/layer/x86/pooling_3x3_pack16.h
================================================
[File too large to display: 7.2 KB]

================================================
FILE: src/layer/x86/pooling_3x3_pack4.h
================================================
[File too large to display: 3.6 KB]

================================================
FILE: src/layer/x86/pooling_3x3_pack8.h
================================================
[File too large to display: 7.2 KB]

================================================
FILE: src/layer/x86/pooling_x86.cpp
================================================
[File too large to display: 26.1 KB]

================================================
FILE: src/layer/x86/pooling_x86.h
================================================
[File too large to display: 450 B]

================================================
FILE: src/layer/x86/prelu_bf16s.h
================================================
[File too large to display: 7.9 KB]

================================================
FILE: src/layer/x86/prelu_x86.cpp
================================================
[File too large to display: 9.9 KB]

================================================
FILE: src/layer/x86/prelu_x86.h
================================================
[File too large to display: 462 B]

================================================
FILE: src/layer/x86/prelu_x86_avx512bf16.cpp
================================================
[File too large to display: 702 B]

================================================
FILE: src/layer/x86/quantize_x86.cpp
================================================
[File too large to display: 14.9 KB]

================================================
FILE: src/layer/x86/quantize_x86.h
================================================
[File too large to display: 381 B]

================================================
FILE: src/layer/x86/relu_bf16s.h
================================================
[File too large to display: 6.2 KB]

================================================
FILE: src/layer/x86/relu_x86.cpp
================================================
[File too large to display: 6.1 KB]

================================================
FILE: src/layer/x86/relu_x86.h
================================================
[File too large to display: 532 B]

================================================
FILE: src/layer/x86/relu_x86_avx512bf16.cpp
================================================
[File too large to display: 320 B]

================================================
FILE: src/layer/x86/requantize_x86.cpp
================================================
[File too large to display: 12.9 KB]

================================================
FILE: src/layer/x86/requantize_x86.h
================================================
[File too large to display: 395 B]

================================================
FILE: src/layer/x86/reshape_x86.cpp
================================================
[File too large to display: 43.9 KB]

================================================
FILE: src/layer/x86/reshape_x86.h
================================================
[File too large to display: 535 B]

================================================
FILE: src/layer/x86/rmsnorm_bf16s.h
================================================
[File too large to display: 10.9 KB]

================================================
FILE: src/layer/x86/rmsnorm_x86.cpp
================================================
[File too large to display: 13.1 KB]

================================================
FILE: src/layer/x86/rmsnorm_x86.h
================================================
[File too large to display: 476 B]

================================================
FILE: src/layer/x86/rmsnorm_x86_avx512bf16.cpp
================================================
[File too large to display: 385 B]

================================================
FILE: src/layer/x86/roialign_x86.cpp
================================================
[File too large to display: 12.4 KB]

================================================
FILE: src/layer/x86/roialign_x86.h
================================================
[File too large to display: 409 B]

================================================
FILE: src/layer/x86/rotaryembed_x86.cpp
================================================
[File too large to display: 13.5 KB]

================================================
FILE: src/layer/x86/rotaryembed_x86.h
================================================
[File too large to display: 431 B]

================================================
FILE: src/layer/x86/scale_bf16s.h
================================================
[File too large to display: 8.5 KB]

================================================
FILE: src/layer/x86/scale_x86.cpp
================================================
[File too large to display: 14.2 KB]

================================================
FILE: src/layer/x86/scale_x86.h
================================================
[File too large to display: 490 B]

================================================
FILE: src/layer/x86/scale_x86_avx512bf16.cpp
================================================
[File too large to display: 940 B]

================================================
FILE: src/layer/x86/sdpa_x86.cpp
================================================
[File too large to display: 9.5 KB]

================================================
FILE: src/layer/x86/sdpa_x86.h
================================================
[File too large to display: 561 B]

================================================
FILE: src/layer/x86/selu_x86.cpp
================================================
[File too large to display: 3.3 KB]

================================================
FILE: src/layer/x86/selu_x86.h
================================================
[File too large to display: 344 B]

================================================
FILE: src/layer/x86/shufflechannel_x86.cpp
================================================
[File too large to display: 27.7 KB]

================================================
FILE: src/layer/x86/shufflechannel_x86.h
================================================
[File too large to display: 423 B]

================================================
FILE: src/layer/x86/sigmoid_bf16s.h
================================================
[File too large to display: 3.2 KB]

================================================
FILE: src/layer/x86/sigmoid_x86.cpp
================================================
[File too large to display: 3.0 KB]

================================================
FILE: src/layer/x86/sigmoid_x86.h
================================================
[File too large to display: 476 B]

================================================
FILE: src/layer/x86/sigmoid_x86_avx512bf16.cpp
================================================
[File too large to display: 537 B]

================================================
FILE: src/layer/x86/slice_x86.cpp
================================================
[File too large to display: 63.6 KB]

================================================
FILE: src/layer/x86/slice_x86.h
================================================
[File too large to display: 521 B]

================================================
FILE: src/layer/x86/softmax_bf16s.h
================================================
[File too large to display: 25.8 KB]

================================================
FILE: src/layer/x86/softmax_x86.cpp
================================================
[File too large to display: 68.1 KB]

================================================
FILE: src/layer/x86/softmax_x86.h
================================================
[File too large to display: 476 B]

================================================
FILE: src/layer/x86/softmax_x86_avx512bf16.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: src/layer/x86/sse_mathfun.h
================================================
[File too large to display: 45.7 KB]

================================================
FILE: src/layer/x86/swish_bf16s.h
================================================
[File too large to display: 2.3 KB]

================================================
FILE: src/layer/x86/swish_x86.cpp
================================================
[File too large to display: 2.6 KB]

================================================
FILE: src/layer/x86/swish_x86.h
================================================
[File too large to display: 462 B]

================================================
FILE: src/layer/x86/swish_x86_avx512bf16.cpp
================================================
[File too large to display: 529 B]

================================================
FILE: src/layer/x86/tanh_x86.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: src/layer/x86/tanh_x86.h
================================================
[File too large to display: 344 B]

================================================
FILE: src/layer/x86/unaryop_bf16s.h
================================================
[File too large to display: 2.2 KB]

================================================
FILE: src/layer/x86/unaryop_functor.h
================================================
[File too large to display: 11.5 KB]

================================================
FILE: src/layer/x86/unaryop_x86.cpp
================================================
[File too large to display: 8.2 KB]

================================================
FILE: src/layer/x86/unaryop_x86.h
================================================
[File too large to display: 476 B]

================================================
FILE: src/layer/x86/unaryop_x86_avx512bf16.cpp
================================================
[File too large to display: 3.3 KB]

================================================
FILE: src/layer/x86/x86_activation.h
================================================
[File too large to display: 8.6 KB]

================================================
FILE: src/layer/x86/x86_usability.h
================================================
[File too large to display: 84.3 KB]

================================================
FILE: src/layer/x86/yolov3detectionoutput_x86.cpp
================================================
[File too large to display: 7.2 KB]

================================================
FILE: src/layer/x86/yolov3detectionoutput_x86.h
================================================
[File too large to display: 500 B]

================================================
FILE: src/layer/yolodetectionoutput.cpp
================================================
[File too large to display: 8.7 KB]

================================================
FILE: src/layer/yolodetectionoutput.h
================================================
[File too large to display: 716 B]

================================================
FILE: src/layer/yolov3detectionoutput.cpp
================================================
[File too large to display: 8.0 KB]

================================================
FILE: src/layer/yolov3detectionoutput.h
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/layer.cpp
================================================
[File too large to display: 12.6 KB]

================================================
FILE: src/layer.h
================================================
[File too large to display: 5.7 KB]

================================================
FILE: src/layer_declaration.h.in
================================================
[File too large to display: 108 B]

================================================
FILE: src/layer_registry.h.in
================================================
[File too large to display: 1.6 KB]

================================================
FILE: src/layer_shader_registry.h.in
================================================
[File too large to display: 116 B]

================================================
FILE: src/layer_shader_spv_data.h.in
================================================
[File too large to display: 116 B]

================================================
FILE: src/layer_shader_type.h
================================================
[File too large to display: 331 B]

================================================
FILE: src/layer_shader_type_enum.h.in
================================================
[File too large to display: 113 B]

================================================
FILE: src/layer_type.h
================================================
[File too large to display: 311 B]

================================================
FILE: src/layer_type_enum.h.in
================================================
[File too large to display: 104 B]

================================================
FILE: src/mat.cpp
================================================
[File too large to display: 40.7 KB]

================================================
FILE: src/mat.h
================================================
[File too large to display: 66.7 KB]

================================================
FILE: src/mat_pixel.cpp
================================================
[File too large to display: 86.8 KB]

================================================
FILE: src/mat_pixel_affine.cpp
================================================
[File too large to display: 77.1 KB]

================================================
FILE: src/mat_pixel_android.cpp
================================================
[File too large to display: 5.6 KB]

================================================
FILE: src/mat_pixel_drawing.cpp
================================================
[File too large to display: 53.6 KB]

================================================
FILE: src/mat_pixel_drawing_font.h
================================================
[File too large to display: 154.8 KB]

================================================
FILE: src/mat_pixel_resize.cpp
================================================
[File too large to display: 33.3 KB]

================================================
FILE: src/mat_pixel_rotate.cpp
================================================
[File too large to display: 229.0 KB]

================================================
FILE: src/modelbin.cpp
================================================
[File too large to display: 9.0 KB]

================================================
FILE: src/modelbin.h
================================================
[File too large to display: 1.6 KB]

================================================
FILE: src/ncnn.pc.in
================================================
[File too large to display: 382 B]

================================================
FILE: src/net.cpp
================================================
[File too large to display: 81.9 KB]

================================================
FILE: src/net.h
================================================
[File too large to display: 6.9 KB]

================================================
FILE: src/option.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: src/option.h
================================================
[File too large to display: 4.0 KB]

================================================
FILE: src/paramdict.cpp
================================================
[File too large to display: 14.2 KB]

================================================
FILE: src/paramdict.h
================================================
[File too large to display: 1.2 KB]

================================================
FILE: src/pipeline.cpp
================================================
[File too large to display: 17.0 KB]

================================================
FILE: src/pipeline.h
================================================
[File too large to display: 2.8 KB]

================================================
FILE: src/pipelinecache.cpp
================================================
[File too large to display: 15.7 KB]

================================================
FILE: src/pipelinecache.h
================================================
[File too large to display: 2.5 KB]

================================================
FILE: src/platform.h.in
================================================
[File too large to display: 12.3 KB]

================================================
FILE: src/ruapu.h
================================================
[File too large to display: 22.8 KB]

================================================
FILE: src/simplemath.cpp
================================================
[File too large to display: 14.9 KB]

================================================
FILE: src/simplemath.h
================================================
[File too large to display: 2.6 KB]

================================================
FILE: src/simpleocv.cpp
================================================
[File too large to display: 11.0 KB]

================================================
FILE: src/simpleocv.h
================================================
[File too large to display: 9.2 KB]

================================================
FILE: src/simpleomp.cpp
================================================
[File too large to display: 28.4 KB]

================================================
FILE: src/simpleomp.h
================================================
[File too large to display: 796 B]

================================================
FILE: src/simplestl.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: src/simplestl.h
================================================
[File too large to display: 11.3 KB]

================================================
FILE: src/simplevk.cpp
================================================
[File too large to display: 17.1 KB]

================================================
FILE: src/simplevk.h
================================================
[File too large to display: 86.7 KB]

================================================
FILE: src/simplevk.tbd
================================================
[File too large to display: 910 B]

================================================
FILE: src/stb_image.h
================================================
[File too large to display: 295.5 KB]

================================================
FILE: src/stb_image_write.h
================================================
[File too large to display: 69.5 KB]

================================================
FILE: src/vulkan_header_fix.h
================================================
[File too large to display: 88.7 KB]

================================================
FILE: tests/CMakeLists.txt
================================================
[File too large to display: 5.1 KB]

================================================
FILE: tests/perf/CMakeLists.txt
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tests/perf/perf_batchnorm.cpp
================================================
[File too large to display: 628 B]

================================================
FILE: tests/perf/perf_binaryop.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tests/perf/perf_concat.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tests/perf/perf_convolution.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tests/perf/perf_convolutiondepthwise.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tests/perf/perf_deconvolution.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tests/perf/perf_innerproduct.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tests/perf/perf_pooling.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tests/perf/perf_relu.cpp
================================================
[File too large to display: 621 B]

================================================
FILE: tests/perf/perf_sigmoid.cpp
================================================
[File too large to display: 478 B]

================================================
FILE: tests/perf/perf_softmax.cpp
================================================
[File too large to display: 587 B]

================================================
FILE: tests/perf/perfutil.cpp
================================================
[File too large to display: 23.7 KB]

================================================
FILE: tests/perf/perfutil.h
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tests/prng.h
================================================
[File too large to display: 2.5 KB]

================================================
FILE: tests/test_absval.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tests/test_batchnorm.cpp
================================================
[File too large to display: 2.9 KB]

================================================
FILE: tests/test_bias.cpp
================================================
[File too large to display: 961 B]

================================================
FILE: tests/test_binaryop.cpp
================================================
[File too large to display: 9.2 KB]

================================================
FILE: tests/test_binaryop_1.cpp
================================================
[File too large to display: 9.2 KB]

================================================
FILE: tests/test_binaryop_2.cpp
================================================
[File too large to display: 9.2 KB]

================================================
FILE: tests/test_binaryop_3.cpp
================================================
[File too large to display: 9.2 KB]

================================================
FILE: tests/test_binaryop_4.cpp
================================================
[File too large to display: 12.0 KB]

================================================
FILE: tests/test_bnll.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tests/test_c_api.cpp
================================================
[File too large to display: 10.1 KB]

================================================
FILE: tests/test_cast.cpp
================================================
[File too large to display: 7.5 KB]

================================================
FILE: tests/test_celu.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tests/test_clip.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tests/test_command.cpp
================================================
[File too large to display: 6.1 KB]

================================================
FILE: tests/test_concat.cpp
================================================
[File too large to display: 6.5 KB]

================================================
FILE: tests/test_concat_oom.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tests/test_convolution.cpp
================================================
[File too large to display: 6.2 KB]

================================================
FILE: tests/test_convolution1d.cpp
================================================
[File too large to display: 7.8 KB]

================================================
FILE: tests/test_convolution3d.cpp
================================================
[File too large to display: 2.6 KB]

================================================
FILE: tests/test_convolution_1.cpp
================================================
[File too large to display: 6.2 KB]

================================================
FILE: tests/test_convolution_2.cpp
================================================
[File too large to display: 7.2 KB]

================================================
FILE: tests/test_convolution_3.cpp
================================================
[File too large to display: 18.1 KB]

================================================
FILE: tests/test_convolution_oom.cpp
================================================
[File too large to display: 4.3 KB]

================================================
FILE: tests/test_convolutiondepthwise.cpp
================================================
[File too large to display: 5.2 KB]

================================================
FILE: tests/test_convolutiondepthwise1d.cpp
================================================
[File too large to display: 8.2 KB]

================================================
FILE: tests/test_convolutiondepthwise3d.cpp
================================================
[File too large to display: 3.2 KB]

================================================
FILE: tests/test_convolutiondepthwise_1.cpp
================================================
[File too large to display: 8.4 KB]

================================================
FILE: tests/test_copyto.cpp
================================================
[File too large to display: 5.1 KB]

================================================
FILE: tests/test_copyto_1.cpp
================================================
[File too large to display: 6.2 KB]

================================================
FILE: tests/test_cpu.cpp
================================================
[File too large to display: 2.9 KB]

================================================
FILE: tests/test_crop.cpp
================================================
[File too large to display: 12.9 KB]

================================================
FILE: tests/test_crop_1.cpp
================================================
[File too large to display: 16.4 KB]

================================================
FILE: tests/test_crop_2.cpp
================================================
[File too large to display: 3.4 KB]

================================================
FILE: tests/test_crop_3.cpp
================================================
[File too large to display: 3.7 KB]

================================================
FILE: tests/test_crop_oom.cpp
================================================
[File too large to display: 6.9 KB]

================================================
FILE: tests/test_cumulativesum.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tests/test_deconvolution.cpp
================================================
[File too large to display: 13.5 KB]

================================================
FILE: tests/test_deconvolution1d.cpp
================================================
[File too large to display: 5.5 KB]

================================================
FILE: tests/test_deconvolution3d.cpp
================================================
[File too large to display: 3.1 KB]

================================================
FILE: tests/test_deconvolutiondepthwise.cpp
================================================
[File too large to display: 3.8 KB]

================================================
FILE: tests/test_deconvolutiondepthwise1d.cpp
================================================
[File too large to display: 6.8 KB]

================================================
FILE: tests/test_deconvolutiondepthwise3d.cpp
================================================
[File too large to display: 3.9 KB]

================================================
FILE: tests/test_deconvolutiondepthwise_1.cpp
================================================
[File too large to display: 3.8 KB]

================================================
FILE: tests/test_deepcopy.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tests/test_deformableconv2d.cpp
================================================
[File too large to display: 6.0 KB]

================================================
FILE: tests/test_deformableconv2d_1.cpp
================================================
[File too large to display: 5.7 KB]

================================================
FILE: tests/test_deformableconv2d_2.cpp
================================================
[File too large to display: 5.7 KB]

================================================
FILE: tests/test_deformableconv2d_3.cpp
================================================
[File too large to display: 5.7 KB]

================================================
FILE: tests/test_deformableconv2d_4.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tests/test_dequantize.cpp
================================================
[File too large to display: 6.0 KB]

================================================
FILE: tests/test_diag.cpp
================================================
[File too large to display: 937 B]

================================================
FILE: tests/test_dropout.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tests/test_einsum.cpp
================================================
[File too large to display: 3.6 KB]

================================================
FILE: tests/test_eltwise.cpp
================================================
[File too large to display: 10.2 KB]

================================================
FILE: tests/test_elu.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tests/test_embed.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tests/test_erf.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tests/test_expanddims.cpp
================================================
[File too large to display: 3.3 KB]

================================================
FILE: tests/test_expression.cpp
================================================
[File too large to display: 4.2 KB]

================================================
FILE: tests/test_flatten.cpp
================================================
[File too large to display: 2.8 KB]

================================================
FILE: tests/test_flip.cpp
================================================
[File too large to display: 4.1 KB]

================================================
FILE: tests/test_fold.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tests/test_gelu.cpp
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tests/test_gemm_0.h
================================================
[File too large to display: 3.9 KB]

================================================
FILE: tests/test_gemm_0a.cpp
================================================
[File too large to display: 598 B]

================================================
FILE: tests/test_gemm_0b.cpp
================================================
[File too large to display: 594 B]

================================================
FILE: tests/test_gemm_0c.cpp
================================================
[File too large to display: 572 B]

================================================
FILE: tests/test_gemm_0d.cpp
================================================
[File too large to display: 512 B]

================================================
FILE: tests/test_gemm_0e.cpp
================================================
[File too large to display: 572 B]

================================================
FILE: tests/test_gemm_0f.cpp
================================================
[File too large to display: 554 B]

================================================
FILE: tests/test_gemm_1.h
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tests/test_gemm_1a.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tests/test_gemm_1b.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tests/test_gemm_2.h
================================================
[File too large to display: 6.2 KB]

================================================
FILE: tests/test_gemm_2a.cpp
================================================
[File too large to display: 598 B]

================================================
FILE: tests/test_gemm_2b.cpp
================================================
[File too large to display: 594 B]

================================================
FILE: tests/test_gemm_2c.cpp
================================================
[File too large to display: 572 B]

================================================
FILE: tests/test_gemm_2d.cpp
================================================
[File too large to display: 597 B]

================================================
FILE: tests/test_gemm_2e.cpp
================================================
[File too large to display: 595 B]

================================================
FILE: tests/test_gemm_3.cpp
================================================
[File too large to display: 18.7 KB]

================================================
FILE: tests/test_gemm_4.cpp
================================================
[File too large to display: 9.7 KB]

================================================
FILE: tests/test_gemm_nt.cpp
================================================
[File too large to display: 9.0 KB]

================================================
FILE: tests/test_gemm_oom.cpp
================================================
[File too large to display: 15.6 KB]

================================================
FILE: tests/test_glu.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tests/test_gridsample.cpp
================================================
[File too large to display: 11.7 KB]

================================================
FILE: tests/test_groupnorm.cpp
================================================
[File too large to display: 4.1 KB]

================================================
FILE: tests/test_gru.cpp
================================================
[File too large to display: 19.6 KB]

================================================
FILE: tests/test_hardsigmoid.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tests/test_hardswish.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tests/test_innerproduct.cpp
================================================
[File too large to display: 9.6 KB]

================================================
FILE: tests/test_instancenorm.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tests/test_interp.cpp
================================================
[File too large to display: 26.7 KB]

================================================
FILE: tests/test_interp_1.cpp
================================================
[File too large to display: 2.8 KB]

================================================
FILE: tests/test_inversespectrogram.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tests/test_layernorm.cpp
================================================
[File too large to display: 4.4 KB]

================================================
FILE: tests/test_lrn.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tests/test_lstm.cpp
================================================
[File too large to display: 23.4 KB]

================================================
FILE: tests/test_mat_pixel.cpp
================================================
[File too large to display: 13.1 KB]

================================================
FILE: tests/test_mat_pixel_affine.cpp
================================================
[File too large to display: 15.4 KB]

================================================
FILE: tests/test_mat_pixel_drawing.cpp
================================================
[File too large to display: 23.9 KB]

================================================
FILE: tests/test_mat_pixel_resize.cpp
================================================
[File too large to display: 11.4 KB]

================================================
FILE: tests/test_mat_pixel_rotate.cpp
================================================
[File too large to display: 6.3 KB]

================================================
FILE: tests/test_matmul.cpp
================================================
[File too large to display: 8.1 KB]

================================================
FILE: tests/test_memorydata.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tests/test_mish.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tests/test_multiheadattention.cpp
================================================
[File too large to display: 5.3 KB]

================================================
FILE: tests/test_multiheadattention_1.cpp
================================================
[File too large to display: 6.3 KB]

================================================
FILE: tests/test_multiheadattention_kvcache.cpp
================================================
[File too large to display: 13.7 KB]

================================================
FILE: tests/test_multiheadattention_oom.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tests/test_noop.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tests/test_normalize.cpp
================================================
[File too large to display: 3.2 KB]

================================================
FILE: tests/test_packing.cpp
================================================
[File too large to display: 11.3 KB]

================================================
FILE: tests/test_padding.cpp
================================================
[File too large to display: 17.3 KB]

================================================
FILE: tests/test_paramdict.cpp
================================================
[File too large to display: 17.2 KB]

================================================
FILE: tests/test_permute.cpp
================================================
[File too large to display: 2.7 KB]

================================================
FILE: tests/test_pixelshuffle.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tests/test_pooling.cpp
================================================
[File too large to display: 11.9 KB]

================================================
FILE: tests/test_pooling1d.cpp
================================================
[File too large to display: 11.3 KB]

================================================
FILE: tests/test_pooling3d.cpp
================================================
[File too large to display: 12.4 KB]

================================================
FILE: tests/test_power.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tests/test_prelu.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tests/test_priorbox.cpp
================================================
[File too large to display: 2.5 KB]

================================================
FILE: tests/test_quantize.cpp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tests/test_quantize_oom.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tests/test_reduction.cpp
================================================
[File too large to display: 6.2 KB]

================================================
FILE: tests/test_relu.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tests/test_reorg.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tests/test_requantize.cpp
================================================
[File too large to display: 9.9 KB]

================================================
FILE: tests/test_requantize_oom.cpp
================================================
[File too large to display: 5.3 KB]

================================================
FILE: tests/test_reshape.cpp
================================================
[File too large to display: 7.0 KB]

================================================
FILE: tests/test_reshape_1.cpp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tests/test_reshape_oom.cpp
================================================
[File too large to display: 3.3 KB]

================================================
FILE: tests/test_rmsnorm.cpp
================================================
[File too large to display: 4.2 KB]

================================================
FILE: tests/test_rnn.cpp
================================================
[File too large to display: 19.4 KB]

================================================
FILE: tests/test_roialign.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tests/test_roipooling.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tests/test_rotaryembed.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tests/test_rotaryembed_oom.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tests/test_scale.cpp
================================================
[File too large to display: 3.4 KB]

================================================
FILE: tests/test_sdpa.cpp
================================================
[File too large to display: 3.8 KB]

================================================
FILE: tests/test_sdpa_kvcache.cpp
================================================
[File too large to display: 4.5 KB]

================================================
FILE: tests/test_sdpa_oom.cpp
================================================
[File too large to display: 5.4 KB]

================================================
FILE: tests/test_selu.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tests/test_shrink.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tests/test_shufflechannel.cpp
================================================
[File too large to display: 2.8 KB]

================================================
FILE: tests/test_sigmoid.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tests/test_slice.cpp
================================================
[File too large to display: 6.2 KB]

================================================
FILE: tests/test_slice_oom.cpp
================================================
[File too large to display: 3.7 KB]

================================================
FILE: tests/test_softmax.cpp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tests/test_softmax_oom.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tests/test_softplus.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tests/test_spectrogram.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tests/test_squeeze.cpp
================================================
[File too large to display: 5.4 KB]

================================================
FILE: tests/test_squeezenet.cpp
================================================
[File too large to display: 15.8 KB]

================================================
FILE: tests/test_swish.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tests/test_tanh.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tests/test_tile.cpp
================================================
[File too large to display: 5.8 KB]

================================================
FILE: tests/test_tile_oom.cpp
================================================
[File too large to display: 3.2 KB]

================================================
FILE: tests/test_unaryop.cpp
================================================
[File too large to display: 2.6 KB]

================================================
FILE: tests/test_unfold.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tests/test_yolov3detectionoutput.cpp
================================================
[File too large to display: 4.2 KB]

================================================
FILE: tests/testutil.cpp
================================================
[File too large to display: 65.5 KB]

================================================
FILE: tests/testutil.h
================================================
[File too large to display: 5.1 KB]

================================================
FILE: toolchains/aarch64-linux-gnu-c.toolchain.cmake
================================================
[File too large to display: 901 B]

================================================
FILE: toolchains/aarch64-linux-gnu.toolchain.cmake
================================================
[File too large to display: 772 B]

================================================
FILE: toolchains/aarch64-qnx.toolchain.cmake
================================================
[File too large to display: 1.5 KB]

================================================
FILE: toolchains/anykav500.toolchain.cmake
================================================
[File too large to display: 1.1 KB]

================================================
FILE: toolchains/arm-linux-gnueabi-c.toolchain.cmake
================================================
[File too large to display: 936 B]

================================================
FILE: toolchains/arm-linux-gnueabi.toolchain.cmake
================================================
[File too large to display: 871 B]

================================================
FILE: toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake
================================================
[File too large to display: 838 B]

================================================
FILE: toolchains/arm-linux-gnueabihf.toolchain.cmake
================================================
[File too large to display: 828 B]

================================================
FILE: toolchains/c906-v310.toolchain.cmake
================================================
[File too large to display: 1.8 KB]

================================================
FILE: toolchains/c907-rv32-v310.toolchain.cmake
================================================
[File too large to display: 1.9 KB]

================================================
FILE: toolchains/c907-v310.toolchain.cmake
================================================
[File too large to display: 1.8 KB]

================================================
FILE: toolchains/c908-v310.toolchain.cmake
================================================
[File too large to display: 1.8 KB]

================================================
FILE: toolchains/c910-v310.toolchain.cmake
================================================
[File too large to display: 1.7 KB]

================================================
FILE: toolchains/esp32.toolchain.cmake
================================================
[File too large to display: 526 B]

================================================
FILE: toolchains/himix100.toolchain.cmake
================================================
[File too large to display: 1.2 KB]

================================================
FILE: toolchains/himix200.toolchain.cmake
================================================
[File too large to display: 1.2 KB]

================================================
FILE: toolchains/himix210.toolchain.cmake
================================================
[File too large to display: 1.2 KB]

================================================
FILE: toolchains/hisiv300.toolchain.cmake
================================================
[File too large to display: 1.2 KB]

================================================
FILE: toolchains/hisiv500.toolchain.cmake
================================================
[File too large to display: 1.2 KB]

================================================
FILE: toolchains/hisiv600.toolchain.cmake
================================================
[File too large to display: 1.2 KB]

================================================
FILE: toolchains/host-c.clang.toolchain.cmake
================================================
[File too large to display: 737 B]

================================================
FILE: toolchains/host-c.gcc.toolchain.cmake
================================================
[File too large to display: 731 B]

================================================
FILE: toolchains/host.clang-m32.toolchain.cmake
================================================
[File too large to display: 458 B]

================================================
FILE: toolchains/host.gcc-c++03.toolchain.cmake
================================================
[File too large to display: 459 B]

================================================
FILE: toolchains/host.gcc-m32.toolchain.cmake
================================================
[File too large to display: 452 B]

================================================
FILE: toolchains/host.gcc.toolchain.cmake
================================================
[File too large to display: 501 B]

================================================
FILE: toolchains/ingenic-x2000.toolchain.cmake
================================================
[File too large to display: 1.5 KB]

================================================
FILE: toolchains/ios.toolchain.cmake
================================================
[File too large to display: 50.6 KB]

================================================
FILE: toolchains/iossimxc-x64.toolchain.cmake
================================================
[File too large to display: 1.5 KB]

================================================
FILE: toolchains/iossimxc.toolchain.cmake
================================================
[File too large to display: 1.5 KB]

================================================
FILE: toolchains/iosxc-arm64.toolchain.cmake
================================================
[File too large to display: 1.5 KB]

================================================
FILE: toolchains/iosxc.toolchain.cmake
================================================
[File too large to display: 1.5 KB]

================================================
FILE: toolchains/jetson.toolchain.cmake
================================================
[File too large to display: 971 B]

================================================
FILE: toolchains/k1.llvm.toolchain.cmake
================================================
[File too large to display: 1.6 KB]

================================================
FILE: toolchains/k1.toolchain.cmake
================================================
[File too large to display: 1.6 KB]

================================================
FILE: toolchains/loongarch64-linux-gnu.toolchain.cmake
================================================
[File too large to display: 792 B]

================================================
FILE: toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake
================================================
[File too large to display: 1.3 KB]

================================================
FILE: toolchains/loongson2f-linux-gnuabi64.toolchain.cmake
================================================
[File too large to display: 865 B]

================================================
FILE: toolchains/mips-mti-linux-gnu.toolchain.cmake
================================================
[File too large to display: 773 B]

================================================
FILE: toolchains/mips32r2-linux-gnu.toolchain.cmake
================================================
[File too large to display: 823 B]

================================================
FILE: toolchains/mips64el-linux-gnuabi64.toolchain.cmake
================================================
[File too large to display: 783 B]

================================================
FILE: toolchains/mipsel-linux-gnu.toolchain.cmake
================================================
[File too large to display: 767 B]

================================================
FILE: toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake
================================================
[File too large to display: 792 B]

================================================
FILE: toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake
================================================
[File too large to display: 876 B]

================================================
FILE: toolchains/pi3.toolchain.cmake
================================================
[File too large to display: 825 B]

================================================
FILE: toolchains/power8le-linux-gnu-vsx.clang.toolchain.cmake
================================================
[File too large to display: 1.1 KB]

================================================
FILE: toolchains/power8le-linux-gnu-vsx.toolchain.cmake
================================================
[File too large to display: 976 B]

================================================
FILE: toolchains/power9le-linux-gnu-vsx.clang.toolchain.cmake
================================================
[File too large to display: 1.1 KB]

================================================
FILE: toolchains/power9le-linux-gnu-vsx.toolchain.cmake
================================================
[File too large to display: 976 B]

================================================
FILE: toolchains/powerpc-linux-gnu.toolchain.cmake
================================================
[File too large to display: 770 B]

================================================
FILE: toolchains/powerpc64le-linux-gnu.toolchain.cmake
================================================
[File too large to display: 790 B]

================================================
FILE: toolchains/riscv32-unknown-elf.toolchain.cmake
================================================
[File too large to display: 933 B]

================================================
FILE: toolchains/riscv64-linux-gnu.toolchain.cmake
================================================
[File too large to display: 770 B]

================================================
FILE: toolchains/riscv64-unknown-elf.toolchain.cmake
================================================
[File too large to display: 932 B]

================================================
FILE: toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake
================================================
[File too large to display: 1.4 KB]

================================================
FILE: toolchains/riscv64-unknown-linux-gnu.toolchain.cmake
================================================
[File too large to display: 1.2 KB]

================================================
FILE: toolchains/v831.toolchain.cmake
================================================
[File too large to display: 1.1 KB]

================================================
FILE: toolchains/windows-xp-clang.toolchain.cmake
================================================
[File too large to display: 1.4 KB]

================================================
FILE: toolchains/windows-xp-mingw.toolchain.cmake
================================================
[File too large to display: 1.6 KB]

================================================
FILE: toolchains/windows-xp-msvc.toolchain.cmake
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/CMakeLists.txt
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/caffe/CMakeLists.txt
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/caffe/caffe.proto
================================================
[File too large to display: 66.4 KB]

================================================
FILE: tools/caffe/caffe2ncnn.cpp
================================================
[File too large to display: 44.4 KB]

================================================
FILE: tools/darknet/CMakeLists.txt
================================================
[File too large to display: 146 B]

================================================
FILE: tools/darknet/README.md
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/darknet/darknet2ncnn.cpp
================================================
[File too large to display: 31.8 KB]

================================================
FILE: tools/keras/readme.md
================================================
[File too large to display: 587 B]

================================================
FILE: tools/mlir/CMakeLists.txt
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/mlir/fix_td.sh
================================================
[File too large to display: 637 B]

================================================
FILE: tools/mlir/mlir2ncnn.cpp
================================================
[File too large to display: 57.0 KB]

================================================
FILE: tools/mlir/ncnn_dialect.cpp
================================================
[File too large to display: 572 B]

================================================
FILE: tools/mlir/ncnn_dialect.h
================================================
[File too large to display: 656 B]

================================================
FILE: tools/mlir/ncnn_ops.td
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/mlir/ncnn_rewriter.cpp
================================================
[File too large to display: 959 B]

================================================
FILE: tools/mlir/ncnn_rewriter.td
================================================
[File too large to display: 5.6 KB]

================================================
FILE: tools/mlir/tf_attributes.cc
================================================
[File too large to display: 4.1 KB]

================================================
FILE: tools/mlir/tf_attributes.h
================================================
[File too large to display: 3.3 KB]

================================================
FILE: tools/mlir/tf_dialect.cpp
================================================
[File too large to display: 9.6 KB]

================================================
FILE: tools/mlir/tf_dialect.h
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/mlir/tf_generated_ops.td
================================================
[File too large to display: 649.4 KB]

================================================
FILE: tools/mlir/tf_op_base.td
================================================
[File too large to display: 25.0 KB]

================================================
FILE: tools/mlir/tf_ops.td
================================================
[File too large to display: 65.6 KB]

================================================
FILE: tools/mlir/tf_side_effects.h
================================================
[File too large to display: 2.6 KB]

================================================
FILE: tools/mlir/tf_traits.h
================================================
[File too large to display: 6.8 KB]

================================================
FILE: tools/mlir/tf_types.cc
================================================
[File too large to display: 16.4 KB]

================================================
FILE: tools/mlir/tf_types.def
================================================
[File too large to display: 3.1 KB]

================================================
FILE: tools/mlir/tf_types.h
================================================
[File too large to display: 13.0 KB]

================================================
FILE: tools/modelwriter.h
================================================
[File too large to display: 92.4 KB]

================================================
FILE: tools/mxnet/CMakeLists.txt
================================================
[File too large to display: 193 B]

================================================
FILE: tools/mxnet/mxnet2ncnn.cpp
================================================
[File too large to display: 80.2 KB]

================================================
FILE: tools/ncnn2mem.cpp
================================================
[File too large to display: 16.1 KB]

================================================
FILE: tools/ncnnmerge.cpp
================================================
[File too large to display: 4.6 KB]

================================================
FILE: tools/ncnnoptimize.cpp
================================================
[File too large to display: 83.0 KB]

================================================
FILE: tools/onnx/CMakeLists.txt
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/onnx/README.md
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/onnx/onnx.proto
================================================
[File too large to display: 19.5 KB]

================================================
FILE: tools/onnx/onnx2ncnn.cpp
================================================
[File too large to display: 206.9 KB]

================================================
FILE: tools/plugin/ImageWatchNCNN.natvis
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/plugin/ImageWatchNNIE.natvis
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/plugin/README.md
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/CMakeLists.txt
================================================
[File too large to display: 4.7 KB]

================================================
FILE: tools/pnnx/README.md
================================================
[File too large to display: 35.5 KB]

================================================
FILE: tools/pnnx/cmake/PNNXPyTorch.cmake
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/python/README.md
================================================
[File too large to display: 5.1 KB]

================================================
FILE: tools/pnnx/python/examples/convert.py
================================================
[File too large to display: 547 B]

================================================
FILE: tools/pnnx/python/examples/export.py
================================================
[File too large to display: 629 B]

================================================
FILE: tools/pnnx/python/pnnx/__init__.py
================================================
[File too large to display: 709 B]

================================================
FILE: tools/pnnx/python/pnnx/utils/__init__.py
================================================
[File too large to display: 121 B]

================================================
FILE: tools/pnnx/python/pnnx/utils/convert.py
================================================
[File too large to display: 6.6 KB]

================================================
FILE: tools/pnnx/python/pnnx/utils/export.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/python/pnnx/utils/utils.py
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/python/requirements.txt
================================================
[File too large to display: 5 B]

================================================
FILE: tools/pnnx/python/setup.py
================================================
[File too large to display: 6.6 KB]

================================================
FILE: tools/pnnx/python/tests/test_convert.py
================================================
[File too large to display: 987 B]

================================================
FILE: tools/pnnx/python/tests/test_dynamicinput_convert.py
================================================
[File too large to display: 736 B]

================================================
FILE: tools/pnnx/python/tests/test_dynamicinput_export.py
================================================
[File too large to display: 673 B]

================================================
FILE: tools/pnnx/python/tests/test_export.py
================================================
[File too large to display: 884 B]

================================================
FILE: tools/pnnx/python/tests/test_naiveinput_convert.py
================================================
[File too large to display: 645 B]

================================================
FILE: tools/pnnx/python/tests/test_naiveinput_export.py
================================================
[File too large to display: 578 B]

================================================
FILE: tools/pnnx/src/CMakeLists.txt
================================================
[File too large to display: 26.8 KB]

================================================
FILE: tools/pnnx/src/ir.cpp
================================================
[File too large to display: 96.8 KB]

================================================
FILE: tools/pnnx/src/ir.h
================================================
[File too large to display: 8.9 KB]

================================================
FILE: tools/pnnx/src/load_onnx.cpp
================================================
[File too large to display: 25.2 KB]

================================================
FILE: tools/pnnx/src/load_onnx.h
================================================
[File too large to display: 517 B]

================================================
FILE: tools/pnnx/src/load_tnn.cpp
================================================
[File too large to display: 16.0 KB]

================================================
FILE: tools/pnnx/src/load_tnn.h
================================================
[File too large to display: 252 B]

================================================
FILE: tools/pnnx/src/load_torchscript.cpp
================================================
[File too large to display: 23.0 KB]

================================================
FILE: tools/pnnx/src/load_torchscript.h
================================================
[File too large to display: 893 B]

================================================
FILE: tools/pnnx/src/main.cpp
================================================
[File too large to display: 11.7 KB]

================================================
FILE: tools/pnnx/src/onnx-data.proto
================================================
[File too large to display: 5.1 KB]

================================================
FILE: tools/pnnx/src/onnx-ml.proto
================================================
[File too large to display: 41.5 KB]

================================================
FILE: tools/pnnx/src/onnx-operators-ml.proto
================================================
[File too large to display: 5.2 KB]

================================================
FILE: tools/pnnx/src/pass_level0/constant_unpooling.cpp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/src/pass_level0/constant_unpooling.h
================================================
[File too large to display: 202 B]

================================================
FILE: tools/pnnx/src/pass_level0/convert_half_to_float.cpp
================================================
[File too large to display: 973 B]

================================================
FILE: tools/pnnx/src/pass_level0/convert_half_to_float.h
================================================
[File too large to display: 187 B]

================================================
FILE: tools/pnnx/src/pass_level0/flatten_input.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_level0/flatten_input.h
================================================
[File too large to display: 197 B]

================================================
FILE: tools/pnnx/src/pass_level0/inline_block.cpp
================================================
[File too large to display: 5.2 KB]

================================================
FILE: tools/pnnx/src/pass_level0/inline_block.h
================================================
[File too large to display: 246 B]

================================================
FILE: tools/pnnx/src/pass_level0/reset_device.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/src/pass_level0/reset_device.h
================================================
[File too large to display: 223 B]

================================================
FILE: tools/pnnx/src/pass_level0/shape_inference.cpp
================================================
[File too large to display: 14.7 KB]

================================================
FILE: tools/pnnx/src/pass_level0/shape_inference.h
================================================
[File too large to display: 548 B]

================================================
FILE: tools/pnnx/src/pass_level0.cpp
================================================
[File too large to display: 1023 B]

================================================
FILE: tools/pnnx/src/pass_level0.h
================================================
[File too large to display: 610 B]

================================================
FILE: tools/pnnx/src/pass_level1/fuse_module_pass.cpp
================================================
[File too large to display: 4.6 KB]

================================================
FILE: tools/pnnx/src/pass_level1/fuse_module_pass.h
================================================
[File too large to display: 3.0 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_AdaptiveAvgPool1d.cpp
================================================
[File too large to display: 733 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_AdaptiveAvgPool2d.cpp
================================================
[File too large to display: 733 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_AdaptiveAvgPool3d.cpp
================================================
[File too large to display: 733 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_AdaptiveMaxPool1d.cpp
================================================
[File too large to display: 927 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_AdaptiveMaxPool2d.cpp
================================================
[File too large to display: 927 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_AdaptiveMaxPool3d.cpp
================================================
[File too large to display: 927 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_AlphaDropout.cpp
================================================
[File too large to display: 444 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_AvgPool1d.cpp
================================================
[File too large to display: 964 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_AvgPool2d.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_AvgPool3d.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_BatchNorm1d.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_BatchNorm2d.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_BatchNorm3d.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_CELU.cpp
================================================
[File too large to display: 627 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_ChannelShuffle.cpp
================================================
[File too large to display: 706 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_ConstantPad1d.cpp
================================================
[File too large to display: 885 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_ConstantPad2d.cpp
================================================
[File too large to display: 885 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_ConstantPad3d.cpp
================================================
[File too large to display: 885 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Conv1d.cpp
================================================
[File too large to display: 4.9 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Conv2d.cpp
================================================
[File too large to display: 5.2 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Conv3d.cpp
================================================
[File too large to display: 5.6 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_ConvTranspose1d.cpp
================================================
[File too large to display: 2.5 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_ConvTranspose2d.cpp
================================================
[File too large to display: 2.5 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_ConvTranspose3d.cpp
================================================
[File too large to display: 2.6 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Dropout.cpp
================================================
[File too large to display: 424 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Dropout2d.cpp
================================================
[File too large to display: 432 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Dropout3d.cpp
================================================
[File too large to display: 432 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_ELU.cpp
================================================
[File too large to display: 620 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Embedding.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Fold.cpp
================================================
[File too large to display: 899 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_GELU.cpp
================================================
[File too large to display: 802 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_GLU.cpp
================================================
[File too large to display: 673 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_GRU.cpp
================================================
[File too large to display: 3.6 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_GroupNorm.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Hardshrink.cpp
================================================
[File too large to display: 669 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Hardsigmoid.cpp
================================================
[File too large to display: 443 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Hardswish.cpp
================================================
[File too large to display: 435 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Hardtanh.cpp
================================================
[File too large to display: 724 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_InstanceNorm1d.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_InstanceNorm2d.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_InstanceNorm3d.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_LPPool1d.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_LPPool2d.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_LSTM.cpp
================================================
[File too large to display: 4.4 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_LayerNorm.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_LeakyReLU.cpp
================================================
[File too large to display: 857 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Linear.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_LocalResponseNorm.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_LogSigmoid.cpp
================================================
[File too large to display: 439 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_LogSoftmax.cpp
================================================
[File too large to display: 668 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_MaxPool1d.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_MaxPool2d.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_MaxPool3d.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Mish.cpp
================================================
[File too large to display: 415 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_MultiheadAttention.cpp
================================================
[File too large to display: 6.9 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_PReLU.cpp
================================================
[File too large to display: 686 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_PixelShuffle.cpp
================================================
[File too large to display: 706 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_PixelUnshuffle.cpp
================================================
[File too large to display: 724 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_RMSNorm.cpp
================================================
[File too large to display: 926 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_RNN.cpp
================================================
[File too large to display: 3.9 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_RReLU.cpp
================================================
[File too large to display: 692 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_ReLU.cpp
================================================
[File too large to display: 415 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_ReLU6.cpp
================================================
[File too large to display: 419 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_ReflectionPad1d.cpp
================================================
[File too large to display: 916 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_ReflectionPad2d.cpp
================================================
[File too large to display: 916 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_ReplicationPad1d.cpp
================================================
[File too large to display: 923 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_ReplicationPad2d.cpp
================================================
[File too large to display: 923 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_ReplicationPad3d.cpp
================================================
[File too large to display: 923 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_SELU.cpp
================================================
[File too large to display: 415 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_SiLU.cpp
================================================
[File too large to display: 415 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Sigmoid.cpp
================================================
[File too large to display: 427 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Softmax.cpp
================================================
[File too large to display: 644 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Softmax2d.cpp
================================================
[File too large to display: 435 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Softmin.cpp
================================================
[File too large to display: 644 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Softplus.cpp
================================================
[File too large to display: 722 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Softshrink.cpp
================================================
[File too large to display: 669 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Softsign.cpp
================================================
[File too large to display: 431 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Tanh.cpp
================================================
[File too large to display: 415 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Tanhshrink.cpp
================================================
[File too large to display: 439 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Threshold.cpp
================================================
[File too large to display: 732 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Unfold.cpp
================================================
[File too large to display: 836 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_Upsample.cpp
================================================
[File too large to display: 5.9 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_UpsamplingBilinear2d.cpp
================================================
[File too large to display: 950 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_UpsamplingNearest2d.cpp
================================================
[File too large to display: 945 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_ZeroPad2d.cpp
================================================
[File too large to display: 813 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_maxunpool2d.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_quantized_Conv2d.cpp
================================================
[File too large to display: 7.6 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_quantized_DeQuantize.cpp
================================================
[File too large to display: 961 B]

================================================
FILE: tools/pnnx/src/pass_level1/nn_quantized_Linear.cpp
================================================
[File too large to display: 5.0 KB]

================================================
FILE: tools/pnnx/src/pass_level1/nn_quantized_Quantize.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_level1/torchvision_DeformConv2d.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/src/pass_level1/torchvision_RoIAlign.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_level1.cpp
================================================
[File too large to display: 15.9 KB]

================================================
FILE: tools/pnnx/src/pass_level1.h
================================================
[File too large to display: 364 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_adaptive_avg_pool1d.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_adaptive_avg_pool2d.cpp
================================================
[File too large to display: 3.8 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_adaptive_avg_pool3d.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_adaptive_max_pool1d.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_adaptive_max_pool2d.cpp
================================================
[File too large to display: 4.0 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_adaptive_max_pool3d.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_affine_grid.cpp
================================================
[File too large to display: 714 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_alpha_dropout.cpp
================================================
[File too large to display: 683 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_avg_pool1d.cpp
================================================
[File too large to display: 13.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_avg_pool2d.cpp
================================================
[File too large to display: 17.5 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_avg_pool3d.cpp
================================================
[File too large to display: 16.6 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_batch_norm.cpp
================================================
[File too large to display: 5.8 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_celu.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_conv1d.cpp
================================================
[File too large to display: 14.7 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_conv2d.cpp
================================================
[File too large to display: 18.6 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_conv3d.cpp
================================================
[File too large to display: 14.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_conv_transpose1d.cpp
================================================
[File too large to display: 7.7 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_conv_transpose2d.cpp
================================================
[File too large to display: 7.8 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_conv_transpose3d.cpp
================================================
[File too large to display: 7.8 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_dropout.cpp
================================================
[File too large to display: 665 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_dropout23d.cpp
================================================
[File too large to display: 671 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_elu.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_embedding.cpp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_feature_alpha_dropout.cpp
================================================
[File too large to display: 707 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_fold.cpp
================================================
[File too large to display: 850 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_gelu.cpp
================================================
[File too large to display: 10.4 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_glu.cpp
================================================
[File too large to display: 630 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_grid_sample.cpp
================================================
[File too large to display: 2.9 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_group_norm.cpp
================================================
[File too large to display: 10.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_hardshrink.cpp
================================================
[File too large to display: 3.9 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_hardsigmoid.cpp
================================================
[File too large to display: 10.8 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_hardswish.cpp
================================================
[File too large to display: 9.8 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_hardtanh.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_instance_norm.cpp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_interpolate.cpp
================================================
[File too large to display: 53.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_layer_norm.cpp
================================================
[File too large to display: 11.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_leaky_relu.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_linear.cpp
================================================
[File too large to display: 9.4 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_local_response_norm.cpp
================================================
[File too large to display: 10.7 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_log_softmax.cpp
================================================
[File too large to display: 2.8 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_logsigmoid.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_lp_pool1d.cpp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_lp_pool2d.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_max_pool1d.cpp
================================================
[File too large to display: 13.2 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_max_pool2d.cpp
================================================
[File too large to display: 18.0 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_max_pool3d.cpp
================================================
[File too large to display: 15.2 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_mish.cpp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_normalize.cpp
================================================
[File too large to display: 2.5 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_pad.cpp
================================================
[File too large to display: 14.7 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_pairwise_distance.cpp
================================================
[File too large to display: 844 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_pixel_shuffle.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_pixel_unshuffle.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_prelu.cpp
================================================
[File too large to display: 5.2 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_relu.cpp
================================================
[File too large to display: 986 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_relu6.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_rms_norm.cpp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_rrelu.cpp
================================================
[File too large to display: 802 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_scaled_dot_product_attention.cpp
================================================
[File too large to display: 9.9 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_selu.cpp
================================================
[File too large to display: 986 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_sigmoid.cpp
================================================
[File too large to display: 563 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_silu.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_softmax.cpp
================================================
[File too large to display: 3.6 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_softmin.cpp
================================================
[File too large to display: 616 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_softplus.cpp
================================================
[File too large to display: 4.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_softshrink.cpp
================================================
[File too large to display: 4.7 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_softsign.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_tanh.cpp
================================================
[File too large to display: 554 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_tanhshrink.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_threshold.cpp
================================================
[File too large to display: 681 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_unfold.cpp
================================================
[File too large to display: 792 B]

================================================
FILE: tools/pnnx/src/pass_level2/F_upsample.cpp
================================================
[File too large to display: 11.2 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_upsample_bilinear.cpp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/F_upsample_nearest.cpp
================================================
[File too large to display: 3.8 KB]

================================================
FILE: tools/pnnx/src/pass_level2/README.md
================================================
[File too large to display: 3.4 KB]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_copy.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_expand.cpp
================================================
[File too large to display: 3.0 KB]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_expand_as.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_fill.cpp
================================================
[File too large to display: 620 B]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_index.cpp
================================================
[File too large to display: 621 B]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_index_put.cpp
================================================
[File too large to display: 773 B]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_masked_fill.cpp
================================================
[File too large to display: 691 B]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_new_empty.cpp
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_new_ones.cpp
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_new_zeros.cpp
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_permute.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_repeat.cpp
================================================
[File too large to display: 626 B]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_reshape.cpp
================================================
[File too large to display: 5.9 KB]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_reshape_as.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_select.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_size.cpp
================================================
[File too large to display: 4.7 KB]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_slice.cpp
================================================
[File too large to display: 6.7 KB]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_to.cpp
================================================
[File too large to display: 6.9 KB]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_type_as.cpp
================================================
[File too large to display: 629 B]

================================================
FILE: tools/pnnx/src/pass_level2/Tensor_unflatten.cpp
================================================
[File too large to display: 683 B]

================================================
FILE: tools/pnnx/src/pass_level2/eliminate_contiguous.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/src/pass_level2/eliminate_contiguous.h
================================================
[File too large to display: 165 B]

================================================
FILE: tools/pnnx/src/pass_level2/eliminate_size_numtotensor_int.cpp
================================================
[File too large to display: 3.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/eliminate_size_numtotensor_int.h
================================================
[File too large to display: 157 B]

================================================
FILE: tools/pnnx/src/pass_level2/functionize.cpp
================================================
[File too large to display: 10.6 KB]

================================================
FILE: tools/pnnx/src/pass_level2/functionize.h
================================================
[File too large to display: 138 B]

================================================
FILE: tools/pnnx/src/pass_level2/fuse_constantlist.cpp
================================================
[File too large to display: 3.2 KB]

================================================
FILE: tools/pnnx/src/pass_level2/fuse_constantlist.h
================================================
[File too large to display: 144 B]

================================================
FILE: tools/pnnx/src/pass_level2/nn_GRU.cpp
================================================
[File too large to display: 22.2 KB]

================================================
FILE: tools/pnnx/src/pass_level2/nn_LSTM.cpp
================================================
[File too large to display: 36.8 KB]

================================================
FILE: tools/pnnx/src/pass_level2/nn_RNN.cpp
================================================
[File too large to display: 15.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/nn_quantized_FloatFunctional.cpp
================================================
[File too large to display: 781 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_addmm.cpp
================================================
[File too large to display: 5.8 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_amax.cpp
================================================
[File too large to display: 684 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_amin.cpp
================================================
[File too large to display: 684 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_arange.cpp
================================================
[File too large to display: 4.9 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_argmax.cpp
================================================
[File too large to display: 675 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_argmin.cpp
================================================
[File too large to display: 675 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_as_strided.cpp
================================================
[File too large to display: 765 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_baddbmm.cpp
================================================
[File too large to display: 784 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_bitwise_and.cpp
================================================
[File too large to display: 638 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_bitwise_left_shift.cpp
================================================
[File too large to display: 659 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_bitwise_not.cpp
================================================
[File too large to display: 586 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_bitwise_or.cpp
================================================
[File too large to display: 634 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_bitwise_right_shift.cpp
================================================
[File too large to display: 662 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_bitwise_xor.cpp
================================================
[File too large to display: 638 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_bmm.cpp
================================================
[File too large to display: 612 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_cat.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_chunk.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_clamp.cpp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_clone.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_complex.cpp
================================================
[File too large to display: 622 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_cross.cpp
================================================
[File too large to display: 668 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_cumprod.cpp
================================================
[File too large to display: 740 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_cumsum.cpp
================================================
[File too large to display: 736 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_dequantize.cpp
================================================
[File too large to display: 583 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_diag.cpp
================================================
[File too large to display: 623 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_einsum.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_empty.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_empty_like.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_eq.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_fft.cpp
================================================
[File too large to display: 716 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_fft2.cpp
================================================
[File too large to display: 719 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_fftn.cpp
================================================
[File too large to display: 719 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_hfft.cpp
================================================
[File too large to display: 719 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_hfft2.cpp
================================================
[File too large to display: 722 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_hfftn.cpp
================================================
[File too large to display: 722 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_ifft.cpp
================================================
[File too large to display: 719 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_ifft2.cpp
================================================
[File too large to display: 722 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_ifftn.cpp
================================================
[File too large to display: 722 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_ihfft.cpp
================================================
[File too large to display: 722 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_ihfft2.cpp
================================================
[File too large to display: 725 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_ihfftn.cpp
================================================
[File too large to display: 725 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_irfft.cpp
================================================
[File too large to display: 722 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_irfft2.cpp
================================================
[File too large to display: 725 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_irfftn.cpp
================================================
[File too large to display: 725 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_rfft.cpp
================================================
[File too large to display: 719 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_rfft2.cpp
================================================
[File too large to display: 722 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_fft_rfftn.cpp
================================================
[File too large to display: 722 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_flatten.cpp
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_flip.cpp
================================================
[File too large to display: 2.6 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_full.cpp
================================================
[File too large to display: 3.8 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_full_like.cpp
================================================
[File too large to display: 3.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_gather.cpp
================================================
[File too large to display: 743 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_ge.cpp
================================================
[File too large to display: 611 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_gt.cpp
================================================
[File too large to display: 611 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_imag.cpp
================================================
[File too large to display: 565 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_index_select.cpp
================================================
[File too large to display: 2.5 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_istft.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_le.cpp
================================================
[File too large to display: 611 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_lgamma.cpp
================================================
[File too large to display: 571 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_logical_and.cpp
================================================
[File too large to display: 638 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_logical_not.cpp
================================================
[File too large to display: 586 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_logical_or.cpp
================================================
[File too large to display: 634 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_logical_xor.cpp
================================================
[File too large to display: 638 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_logsumexp.cpp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_lt.cpp
================================================
[File too large to display: 611 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_masked_select.cpp
================================================
[File too large to display: 642 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_matmul.cpp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_max.cpp
================================================
[File too large to display: 6.0 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_mean.cpp
================================================
[File too large to display: 4.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_min.cpp
================================================
[File too large to display: 6.0 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_mm.cpp
================================================
[File too large to display: 4.4 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_mv.cpp
================================================
[File too large to display: 613 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_narrow.cpp
================================================
[File too large to display: 725 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_ne.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_norm.cpp
================================================
[File too large to display: 8.7 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_normal.cpp
================================================
[File too large to display: 729 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_ones.cpp
================================================
[File too large to display: 2.8 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_ones_like.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_positive.cpp
================================================
[File too large to display: 577 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_prod.cpp
================================================
[File too large to display: 2.5 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_quantize_per_tensor.cpp
================================================
[File too large to display: 932 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_randn.cpp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_randn_like.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_real.cpp
================================================
[File too large to display: 565 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_repeat_interleave.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_roll.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_scatter_add.cpp
================================================
[File too large to display: 734 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_slice_scatter.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_split.cpp
================================================
[File too large to display: 2.7 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_squeeze.cpp
================================================
[File too large to display: 4.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_stack.cpp
================================================
[File too large to display: 620 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_std.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_stft.cpp
================================================
[File too large to display: 5.6 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_sum.cpp
================================================
[File too large to display: 6.5 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_t.cpp
================================================
[File too large to display: 556 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_tensor_split.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_tile.cpp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_topk.cpp
================================================
[File too large to display: 789 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_transpose.cpp
================================================
[File too large to display: 680 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_unbind.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_unsqueeze.cpp
================================================
[File too large to display: 3.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_var.cpp
================================================
[File too large to display: 724 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_view_as_complex.cpp
================================================
[File too large to display: 598 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_view_as_real.cpp
================================================
[File too large to display: 589 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_where.cpp
================================================
[File too large to display: 680 B]

================================================
FILE: tools/pnnx/src/pass_level2/torch_zeros.cpp
================================================
[File too large to display: 2.8 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torch_zeros_like.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torchaudio_F_inverse_spectrogram.cpp
================================================
[File too large to display: 4.1 KB]

================================================
FILE: tools/pnnx/src/pass_level2/torchaudio_F_spectrogram.cpp
================================================
[File too large to display: 10.8 KB]

================================================
FILE: tools/pnnx/src/pass_level2.cpp
================================================
[File too large to display: 32.6 KB]

================================================
FILE: tools/pnnx/src/pass_level2.h
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/src/pass_level3/assign_unique_name.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/src/pass_level3/assign_unique_name.h
================================================
[File too large to display: 163 B]

================================================
FILE: tools/pnnx/src/pass_level3/eliminate_noop_math.cpp
================================================
[File too large to display: 10.0 KB]

================================================
FILE: tools/pnnx/src/pass_level3/eliminate_noop_math.h
================================================
[File too large to display: 164 B]

================================================
FILE: tools/pnnx/src/pass_level3/eliminate_squeeze_unsqueeze_pair.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/src/pass_level3/eliminate_squeeze_unsqueeze_pair.h
================================================
[File too large to display: 177 B]

================================================
FILE: tools/pnnx/src/pass_level3/eliminate_tuple_pair.cpp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/src/pass_level3/eliminate_tuple_pair.h
================================================
[File too large to display: 165 B]

================================================
FILE: tools/pnnx/src/pass_level3/expand_quantization_modules.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/src/pass_level3/expand_quantization_modules.h
================================================
[File too large to display: 172 B]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_dynamic_adaptive_pool.cpp
================================================
[File too large to display: 23.4 KB]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_dynamic_adaptive_pool.h
================================================
[File too large to display: 171 B]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_einsum_operands.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_einsum_operands.h
================================================
[File too large to display: 165 B]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_expression.cpp
================================================
[File too large to display: 29.5 KB]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_expression.h
================================================
[File too large to display: 256 B]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_index_expression.cpp
================================================
[File too large to display: 3.4 KB]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_index_expression.h
================================================
[File too large to display: 166 B]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_maxpool_unpack.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_maxpool_unpack.h
================================================
[File too large to display: 164 B]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_multiheadattention_unpack.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_multiheadattention_unpack.h
================================================
[File too large to display: 175 B]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_op1ton_unpack.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_op1ton_unpack.h
================================================
[File too large to display: 163 B]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_opnto1_tensors.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_opnto1_tensors.h
================================================
[File too large to display: 164 B]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_rnn_unpack.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/src/pass_level3/fuse_rnn_unpack.h
================================================
[File too large to display: 160 B]

================================================
FILE: tools/pnnx/src/pass_level3/rename_F_dropoutnd.cpp
================================================
[File too large to display: 790 B]

================================================
FILE: tools/pnnx/src/pass_level3/rename_F_dropoutnd.h
================================================
[File too large to display: 163 B]

================================================
FILE: tools/pnnx/src/pass_level3.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/src/pass_level3.h
================================================
[File too large to display: 333 B]

================================================
FILE: tools/pnnx/src/pass_level4/attribute_pooling.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/src/pass_level4/attribute_pooling.h
================================================
[File too large to display: 158 B]

================================================
FILE: tools/pnnx/src/pass_level4/canonicalize.cpp
================================================
[File too large to display: 363 B]

================================================
FILE: tools/pnnx/src/pass_level4/canonicalize.h
================================================
[File too large to display: 139 B]

================================================
FILE: tools/pnnx/src/pass_level4/dead_code_elimination.cpp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/src/pass_level4/dead_code_elimination.h
================================================
[File too large to display: 148 B]

================================================
FILE: tools/pnnx/src/pass_level4/fuse_custom_op.cpp
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/src/pass_level4/fuse_custom_op.h
================================================
[File too large to display: 159 B]

================================================
FILE: tools/pnnx/src/pass_level4.cpp
================================================
[File too large to display: 368 B]

================================================
FILE: tools/pnnx/src/pass_level4.h
================================================
[File too large to display: 237 B]

================================================
FILE: tools/pnnx/src/pass_level5/attribute_unpooling.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/src/pass_level5/attribute_unpooling.h
================================================
[File too large to display: 160 B]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_dropout.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_dropout.h
================================================
[File too large to display: 162 B]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_identity_operator.cpp
================================================
[File too large to display: 2.9 KB]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_identity_operator.h
================================================
[File too large to display: 172 B]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_maxpool_indices.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_maxpool_indices.h
================================================
[File too large to display: 170 B]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_cat.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_cat.h
================================================
[File too large to display: 163 B]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_einsum.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_einsum.h
================================================
[File too large to display: 166 B]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_expand.cpp
================================================
[File too large to display: 3.9 KB]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_expand.h
================================================
[File too large to display: 166 B]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_expression.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_expression.h
================================================
[File too large to display: 170 B]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_pad.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_pad.h
================================================
[File too large to display: 163 B]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_permute.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_permute.h
================================================
[File too large to display: 167 B]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_reshape.cpp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_reshape.h
================================================
[File too large to display: 167 B]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_slice.cpp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_slice.h
================================================
[File too large to display: 165 B]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_upsample.cpp
================================================
[File too large to display: 3.1 KB]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_noop_upsample.h
================================================
[File too large to display: 168 B]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_reshape_shape_expression.cpp
================================================
[File too large to display: 6.6 KB]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_reshape_shape_expression.h
================================================
[File too large to display: 179 B]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_type_as.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/src/pass_level5/eliminate_type_as.h
================================================
[File too large to display: 162 B]

================================================
FILE: tools/pnnx/src/pass_level5/eval_expression.cpp
================================================
[File too large to display: 21.9 KB]

================================================
FILE: tools/pnnx/src/pass_level5/eval_expression.h
================================================
[File too large to display: 160 B]

================================================
FILE: tools/pnnx/src/pass_level5/fold_constants.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fold_constants.h
================================================
[File too large to display: 255 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_adjacent_permute.cpp
================================================
[File too large to display: 5.0 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_adjacent_permute.h
================================================
[File too large to display: 166 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_adjacent_reshape.cpp
================================================
[File too large to display: 2.7 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_adjacent_reshape.h
================================================
[File too large to display: 166 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_channel_shuffle.cpp
================================================
[File too large to display: 2.6 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_channel_shuffle.h
================================================
[File too large to display: 165 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_constant_expression.cpp
================================================
[File too large to display: 3.1 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_constant_expression.h
================================================
[File too large to display: 169 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_conv1d_batchnorm1d.cpp
================================================
[File too large to display: 4.4 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_conv1d_batchnorm1d.h
================================================
[File too large to display: 168 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_conv2d_batchnorm2d.cpp
================================================
[File too large to display: 4.4 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_conv2d_batchnorm2d.h
================================================
[File too large to display: 168 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_conv3d_batchnorm3d.cpp
================================================
[File too large to display: 4.4 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_conv3d_batchnorm3d.h
================================================
[File too large to display: 168 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_convtranspose1d_batchnorm1d.cpp
================================================
[File too large to display: 5.0 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_convtranspose1d_batchnorm1d.h
================================================
[File too large to display: 177 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_convtranspose2d_batchnorm2d.cpp
================================================
[File too large to display: 5.1 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_convtranspose2d_batchnorm2d.h
================================================
[File too large to display: 177 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_convtranspose3d_batchnorm3d.cpp
================================================
[File too large to display: 5.1 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_convtranspose3d_batchnorm3d.h
================================================
[File too large to display: 177 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_layernorm.cpp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_layernorm.h
================================================
[File too large to display: 159 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_linear_batchnorm1d.cpp
================================================
[File too large to display: 3.8 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_linear_batchnorm1d.h
================================================
[File too large to display: 168 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_multiheadattention.cpp
================================================
[File too large to display: 115.6 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_multiheadattention.h
================================================
[File too large to display: 168 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_multiheadattention_sameqkv.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_multiheadattention_sameqkv.h
================================================
[File too large to display: 176 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_pad_conv1d.cpp
================================================
[File too large to display: 13.0 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_pad_conv1d.h
================================================
[File too large to display: 160 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_pad_conv2d.cpp
================================================
[File too large to display: 17.5 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_pad_conv2d.h
================================================
[File too large to display: 160 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_pixel_shuffle.cpp
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_pixel_shuffle.h
================================================
[File too large to display: 163 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_pixel_unshuffle.cpp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_pixel_unshuffle.h
================================================
[File too large to display: 165 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_rmsnorm.cpp
================================================
[File too large to display: 5.3 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_rmsnorm.h
================================================
[File too large to display: 157 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.cpp
================================================
[File too large to display: 9.8 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_scaled_dot_product_attention.h
================================================
[File too large to display: 178 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_select_to_unbind.cpp
================================================
[File too large to display: 3.0 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_select_to_unbind.h
================================================
[File too large to display: 166 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_silu.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_silu.h
================================================
[File too large to display: 154 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_slice_copy.cpp
================================================
[File too large to display: 7.4 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_slice_copy.h
================================================
[File too large to display: 160 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_slice_indices.cpp
================================================
[File too large to display: 19.9 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_slice_indices.h
================================================
[File too large to display: 163 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_slice_squeeze_to_select.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_slice_squeeze_to_select.h
================================================
[File too large to display: 173 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_slice_to_tensor_split.cpp
================================================
[File too large to display: 5.4 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_slice_to_tensor_split.h
================================================
[File too large to display: 171 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_batchnorm.cpp
================================================
[File too large to display: 8.9 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_batchnorm.h
================================================
[File too large to display: 166 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_conv.cpp
================================================
[File too large to display: 18.7 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_conv.h
================================================
[File too large to display: 161 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_convtranspose.cpp
================================================
[File too large to display: 9.5 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_convtranspose.h
================================================
[File too large to display: 170 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_embedding.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_embedding.h
================================================
[File too large to display: 166 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_groupnorm.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_groupnorm.h
================================================
[File too large to display: 166 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_instancenorm.cpp
================================================
[File too large to display: 3.5 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_instancenorm.h
================================================
[File too large to display: 169 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_layernorm.cpp
================================================
[File too large to display: 2.5 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_layernorm.h
================================================
[File too large to display: 166 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_linear.cpp
================================================
[File too large to display: 3.3 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_linear.h
================================================
[File too large to display: 163 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_prelu.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_prelu.h
================================================
[File too large to display: 162 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_rmsnorm.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_static_rmsnorm.h
================================================
[File too large to display: 164 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_transformers_multiheadattention.cpp
================================================
[File too large to display: 125.1 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_transformers_multiheadattention.h
================================================
[File too large to display: 181 B]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_transformers_scaled_dot_product_attention.cpp
================================================
[File too large to display: 7.7 KB]

================================================
FILE: tools/pnnx/src/pass_level5/fuse_transformers_scaled_dot_product_attention.h
================================================
[File too large to display: 191 B]

================================================
FILE: tools/pnnx/src/pass_level5/normalize_einsum_equation.cpp
================================================
[File too large to display: 4.6 KB]

================================================
FILE: tools/pnnx/src/pass_level5/normalize_einsum_equation.h
================================================
[File too large to display: 170 B]

================================================
FILE: tools/pnnx/src/pass_level5/unroll_rnn_op.cpp
================================================
[File too large to display: 8.3 KB]

================================================
FILE: tools/pnnx/src/pass_level5/unroll_rnn_op.h
================================================
[File too large to display: 167 B]

================================================
FILE: tools/pnnx/src/pass_level5.cpp
================================================
[File too large to display: 4.9 KB]

================================================
FILE: tools/pnnx/src/pass_level5.h
================================================
[File too large to display: 333 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_adaptive_avg_pool1d.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_adaptive_avg_pool2d.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_adaptive_avg_pool3d.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_adaptive_max_pool1d.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_adaptive_max_pool2d.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_adaptive_max_pool3d.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_avg_pool1d.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_avg_pool2d.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_avg_pool3d.cpp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_batch_norm.cpp
================================================
[File too large to display: 2.9 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_celu.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_conv1d.cpp
================================================
[File too large to display: 7.0 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_conv2d.cpp
================================================
[File too large to display: 8.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_conv3d.cpp
================================================
[File too large to display: 169 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_conv_transpose1d.cpp
================================================
[File too large to display: 6.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_conv_transpose2d.cpp
================================================
[File too large to display: 7.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_conv_transpose3d.cpp
================================================
[File too large to display: 169 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_elu.cpp
================================================
[File too large to display: 831 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_embedding.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_fold.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_gelu.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_glu.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_grid_sample.cpp
================================================
[File too large to display: 4.6 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_group_norm.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_hardshrink.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_hardsigmoid.cpp
================================================
[File too large to display: 863 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_hardswish.cpp
================================================
[File too large to display: 855 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_hardtanh.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_instance_norm.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_interpolate.cpp
================================================
[File too large to display: 7.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_layer_norm.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_leaky_relu.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_linear.cpp
================================================
[File too large to display: 169 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_local_response_norm.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_log_softmax.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_logsigmoid.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_max_pool1d.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_max_pool2d.cpp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_max_pool3d.cpp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_mish.cpp
================================================
[File too large to display: 662 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_normalize.cpp
================================================
[File too large to display: 2.5 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_pad.cpp
================================================
[File too large to display: 7.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_pixel_shuffle.cpp
================================================
[File too large to display: 926 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_pixel_unshuffle.cpp
================================================
[File too large to display: 930 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_prelu.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_relu.cpp
================================================
[File too large to display: 662 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_relu6.cpp
================================================
[File too large to display: 836 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_rms_norm.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_scaled_dot_product_attention.cpp
================================================
[File too large to display: 15.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_selu.cpp
================================================
[File too large to display: 662 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_sigmoid.cpp
================================================
[File too large to display: 674 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_silu.cpp
================================================
[File too large to display: 663 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_softmax.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_softplus.cpp
================================================
[File too large to display: 697 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_softshrink.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_tanh.cpp
================================================
[File too large to display: 662 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_unfold.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_upsample.cpp
================================================
[File too large to display: 5.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_upsample_bilinear.cpp
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/F_upsample_nearest.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/Tensor_expand.cpp
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/Tensor_permute.cpp
================================================
[File too large to display: 5.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/Tensor_repeat.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/Tensor_reshape.cpp
================================================
[File too large to display: 2.6 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/Tensor_reshape_as.cpp
================================================
[File too large to display: 2.5 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/Tensor_unflatten.cpp
================================================
[File too large to display: 2.9 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/chain_multi_output.cpp
================================================
[File too large to display: 5.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/chain_multi_output.h
================================================
[File too large to display: 209 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_Tensor_select.cpp
================================================
[File too large to display: 2.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_Tensor_select.h
================================================
[File too large to display: 212 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_Tensor_slice.cpp
================================================
[File too large to display: 5.9 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_Tensor_slice.h
================================================
[File too large to display: 211 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_Tensor_slice_copy.cpp
================================================
[File too large to display: 6.5 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_Tensor_slice_copy.h
================================================
[File too large to display: 216 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_attribute.cpp
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_attribute.h
================================================
[File too large to display: 208 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_custom_op.cpp
================================================
[File too large to display: 817 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_custom_op.h
================================================
[File too large to display: 208 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_half_to_float.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_half_to_float.h
================================================
[File too large to display: 212 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_input.cpp
================================================
[File too large to display: 557 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_input.h
================================================
[File too large to display: 204 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_module_op.cpp
================================================
[File too large to display: 690 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_module_op.h
================================================
[File too large to display: 258 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_reshape_interp_expression.cpp
================================================
[File too large to display: 18.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_reshape_interp_expression.h
================================================
[File too large to display: 224 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_slice_expression.cpp
================================================
[File too large to display: 42.0 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_slice_expression.h
================================================
[File too large to display: 215 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_torch_cat.cpp
================================================
[File too large to display: 991 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_torch_cat.h
================================================
[File too large to display: 208 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_torch_chunk.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_torch_chunk.h
================================================
[File too large to display: 210 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_torch_einsum.cpp
================================================
[File too large to display: 4.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_torch_einsum.h
================================================
[File too large to display: 211 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_torch_split.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_torch_split.h
================================================
[File too large to display: 210 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_torch_stack.cpp
================================================
[File too large to display: 3.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_torch_stack.h
================================================
[File too large to display: 210 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_torch_tensor_split.cpp
================================================
[File too large to display: 2.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_torch_tensor_split.h
================================================
[File too large to display: 217 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_torch_unbind.cpp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/convert_torch_unbind.h
================================================
[File too large to display: 211 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/eliminate_noop.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/eliminate_noop.h
================================================
[File too large to display: 205 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/eliminate_output.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/eliminate_output.h
================================================
[File too large to display: 207 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/expand_expression.cpp
================================================
[File too large to display: 13.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/expand_expression.h
================================================
[File too large to display: 208 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_binaryop_eltwise.cpp
================================================
[File too large to display: 5.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_binaryop_eltwise.h
================================================
[File too large to display: 205 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_convert_rotaryembed.cpp
================================================
[File too large to display: 6.0 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_convert_rotaryembed.h
================================================
[File too large to display: 208 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_convert_shufflechannel_slice.cpp
================================================
[File too large to display: 3.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_convert_shufflechannel_slice.h
================================================
[File too large to display: 217 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_convolution1d_activation.cpp
================================================
[File too large to display: 6.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_convolution1d_activation.h
================================================
[File too large to display: 213 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_convolution_activation.cpp
================================================
[File too large to display: 6.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_convolution_activation.h
================================================
[File too large to display: 211 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_convolutiondepthwise1d_activation.cpp
================================================
[File too large to display: 6.9 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_convolutiondepthwise1d_activation.h
================================================
[File too large to display: 222 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_convolutiondepthwise_activation.cpp
================================================
[File too large to display: 6.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_convolutiondepthwise_activation.h
================================================
[File too large to display: 220 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_deconvolution_activation.cpp
================================================
[File too large to display: 6.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_deconvolution_activation.h
================================================
[File too large to display: 213 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_deconvolutiondepthwise_activation.cpp
================================================
[File too large to display: 6.9 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_deconvolutiondepthwise_activation.h
================================================
[File too large to display: 222 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_innerproduct_activation.cpp
================================================
[File too large to display: 6.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_innerproduct_activation.h
================================================
[File too large to display: 212 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_padding_convolution.cpp
================================================
[File too large to display: 5.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_padding_convolution.h
================================================
[File too large to display: 208 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_padding_convolutiondepthwise.cpp
================================================
[File too large to display: 5.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_padding_convolutiondepthwise.h
================================================
[File too large to display: 217 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_transpose_matmul.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/fuse_transpose_matmul.h
================================================
[File too large to display: 205 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/insert_reshape_global_pooling.cpp
================================================
[File too large to display: 6.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/insert_reshape_global_pooling.h
================================================
[File too large to display: 220 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/insert_reshape_linear.cpp
================================================
[File too large to display: 3.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/insert_reshape_linear.h
================================================
[File too large to display: 212 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/insert_reshape_numpy_binaryop_broadcast.cpp
================================================
[File too large to display: 4.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/insert_reshape_numpy_binaryop_broadcast.h
================================================
[File too large to display: 230 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/insert_reshape_pooling.cpp
================================================
[File too large to display: 3.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/insert_reshape_pooling.h
================================================
[File too large to display: 213 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/insert_split.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/insert_split.h
================================================
[File too large to display: 203 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_AdaptiveAvgPool1d.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_AdaptiveAvgPool2d.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_AdaptiveAvgPool3d.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_AdaptiveMaxPool1d.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_AdaptiveMaxPool2d.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_AdaptiveMaxPool3d.cpp
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_AvgPool1d.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_AvgPool2d.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_AvgPool3d.cpp
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_BatchNorm1d.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_BatchNorm2d.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_BatchNorm3d.cpp
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_CELU.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_ChannelShuffle.cpp
================================================
[File too large to display: 909 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_ConstantPad1d.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_ConstantPad2d.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_ConstantPad3d.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Conv1d.cpp
================================================
[File too large to display: 9.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Conv2d.cpp
================================================
[File too large to display: 10.5 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Conv3d.cpp
================================================
[File too large to display: 11.9 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_ConvTranspose1d.cpp
================================================
[File too large to display: 5.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_ConvTranspose2d.cpp
================================================
[File too large to display: 6.5 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_ConvTranspose3d.cpp
================================================
[File too large to display: 7.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_ELU.cpp
================================================
[File too large to display: 833 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Embedding.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Fold.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_GELU.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_GLU.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_GRU.cpp
================================================
[File too large to display: 6.5 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_GroupNorm.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Hardshrink.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Hardsigmoid.cpp
================================================
[File too large to display: 865 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Hardswish.cpp
================================================
[File too large to display: 857 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Hardtanh.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_InstanceNorm2d.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_LSTM.cpp
================================================
[File too large to display: 15.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_LayerNorm.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_LeakyReLU.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Linear.cpp
================================================
[File too large to display: 7.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_LocalResponseNorm.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_LogSigmoid.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_LogSoftmax.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_MaxPool1d.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_MaxPool2d.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_MaxPool3d.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Mish.cpp
================================================
[File too large to display: 664 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_MultiheadAttention.cpp
================================================
[File too large to display: 17.9 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_PReLU.cpp
================================================
[File too large to display: 992 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_PixelShuffle.cpp
================================================
[File too large to display: 925 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_PixelUnshuffle.cpp
================================================
[File too large to display: 930 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_RMSNorm.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_RNN.cpp
================================================
[File too large to display: 6.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_ReLU.cpp
================================================
[File too large to display: 664 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_ReLU6.cpp
================================================
[File too large to display: 838 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_ReflectionPad1d.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_ReflectionPad2d.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_ReplicationPad1d.cpp
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_ReplicationPad2d.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_ReplicationPad3d.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_SELU.cpp
================================================
[File too large to display: 664 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_SiLU.cpp
================================================
[File too large to display: 665 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Sigmoid.cpp
================================================
[File too large to display: 676 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Softmax.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Softmax2d.cpp
================================================
[File too large to display: 849 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Softplus.cpp
================================================
[File too large to display: 699 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Softshrink.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Tanh.cpp
================================================
[File too large to display: 664 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Unfold.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_Upsample.cpp
================================================
[File too large to display: 7.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_UpsamplingBilinear2d.cpp
================================================
[File too large to display: 2.5 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_UpsamplingNearest2d.cpp
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/nn_ZeroPad2d.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/solve_batch_index.cpp
================================================
[File too large to display: 28.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/solve_batch_index.h
================================================
[File too large to display: 208 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_addmm.cpp
================================================
[File too large to display: 4.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_amax.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_amin.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_bmm.cpp
================================================
[File too large to display: 832 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_clamp.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_clone.cpp
================================================
[File too large to display: 792 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_cumsum.cpp
================================================
[File too large to display: 941 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_diag.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_flatten.cpp
================================================
[File too large to display: 6.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_flip.cpp
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_istft.cpp
================================================
[File too large to display: 6.0 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_logsumexp.cpp
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_matmul.cpp
================================================
[File too large to display: 843 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_max.cpp
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_mean.cpp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_min.cpp
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_mm.cpp
================================================
[File too large to display: 714 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_norm.cpp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_prod.cpp
================================================
[File too large to display: 2.6 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_roll.cpp
================================================
[File too large to display: 5.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_slice_scatter.cpp
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_squeeze.cpp
================================================
[File too large to display: 2.8 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_stft.cpp
================================================
[File too large to display: 5.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_sum.cpp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_t.cpp
================================================
[File too large to display: 798 B]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_transpose.cpp
================================================
[File too large to display: 3.2 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torch_unsqueeze.cpp
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torchaudio_F_inverse_spectrogram.cpp
================================================
[File too large to display: 3.4 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torchaudio_F_spectrogram.cpp
================================================
[File too large to display: 8.7 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn/torchvision_DeformConv2d.cpp
================================================
[File too large to display: 3.9 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn.cpp
================================================
[File too large to display: 4.9 KB]

================================================
FILE: tools/pnnx/src/pass_ncnn.h
================================================
[File too large to display: 700 B]

================================================
FILE: tools/pnnx/src/pass_onnx/canonicalize.cpp
================================================
[File too large to display: 12.2 KB]

================================================
FILE: tools/pnnx/src/pass_onnx/canonicalize.h
================================================
[File too large to display: 225 B]

================================================
FILE: tools/pnnx/src/pass_onnx/dead_code_elimination.cpp
================================================
[File too large to display: 10.2 KB]

================================================
FILE: tools/pnnx/src/pass_onnx/dead_code_elimination.h
================================================
[File too large to display: 234 B]

================================================
FILE: tools/pnnx/src/pass_onnx/eliminate_initializer_input.cpp
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/src/pass_onnx/eliminate_initializer_input.h
================================================
[File too large to display: 240 B]

================================================
FILE: tools/pnnx/src/pass_onnx/eliminate_noop.cpp
================================================
[File too large to display: 5.6 KB]

================================================
FILE: tools/pnnx/src/pass_onnx/eliminate_noop.h
================================================
[File too large to display: 285 B]

================================================
FILE: tools/pnnx/src/pass_onnx/fold_constants.cpp
================================================
[File too large to display: 43.0 KB]

================================================
FILE: tools/pnnx/src/pass_onnx/fold_constants.h
================================================
[File too large to display: 917 B]

================================================
FILE: tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.cpp
================================================
[File too large to display: 11.1 KB]

================================================
FILE: tools/pnnx/src/pass_onnx/fuse_constant_as_attribute.h
================================================
[File too large to display: 239 B]

================================================
FILE: tools/pnnx/src/pass_onnx/inline_containers.cpp
================================================
[File too large to display: 5.8 KB]

================================================
FILE: tools/pnnx/src/pass_onnx/inline_containers.h
================================================
[File too large to display: 230 B]

================================================
FILE: tools/pnnx/src/pass_onnx/inline_if_graph.cpp
================================================
[File too large to display: 4.8 KB]

================================================
FILE: tools/pnnx/src/pass_onnx/inline_if_graph.h
================================================
[File too large to display: 227 B]

================================================
FILE: tools/pnnx/src/pass_onnx/model_stat.cpp
================================================
[File too large to display: 20.7 KB]

================================================
FILE: tools/pnnx/src/pass_onnx/model_stat.h
================================================
[File too large to display: 959 B]

================================================
FILE: tools/pnnx/src/pass_onnx/shape_inference.cpp
================================================
[File too large to display: 25.2 KB]

================================================
FILE: tools/pnnx/src/pass_onnx/shape_inference.h
================================================
[File too large to display: 604 B]

================================================
FILE: tools/pnnx/src/pass_onnx.cpp
================================================
[File too large to display: 41.5 KB]

================================================
FILE: tools/pnnx/src/pass_onnx.h
================================================
[File too large to display: 3.5 KB]

================================================
FILE: tools/pnnx/src/pass_tnn/fuse_shape_list_construct.cpp
================================================
[File too large to display: 3.1 KB]

================================================
FILE: tools/pnnx/src/pass_tnn/fuse_shape_list_construct.h
================================================
[File too large to display: 217 B]

================================================
FILE: tools/pnnx/src/pass_tnn/fuse_shape_size.cpp
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/src/pass_tnn/fuse_shape_size.h
================================================
[File too large to display: 207 B]

================================================
FILE: tools/pnnx/src/pass_tnn/lower_concat.cpp
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/pass_tnn/lower_concat.h
================================================
[File too large to display: 204 B]

================================================
FILE: tools/pnnx/src/pass_tnn/lower_convolution_activation.cpp
================================================
[File too large to display: 8.3 KB]

================================================
FILE: tools/pnnx/src/pass_tnn/lower_convolution_activation.h
================================================
[File too large to display: 220 B]

================================================
FILE: tools/pnnx/src/pass_tnn/lower_power.cpp
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/src/pass_tnn/lower_power.h
================================================
[File too large to display: 203 B]

================================================
FILE: tools/pnnx/src/save_ncnn.cpp
================================================
[File too large to display: 15.2 KB]

================================================
FILE: tools/pnnx/src/save_ncnn.h
================================================
[File too large to display: 385 B]

================================================
FILE: tools/pnnx/src/save_onnx.cpp
================================================
[File too large to display: 6.9 KB]

================================================
FILE: tools/pnnx/src/save_onnx.h
================================================
[File too large to display: 266 B]

================================================
FILE: tools/pnnx/src/storezip.cpp
================================================
[File too large to display: 13.6 KB]

================================================
FILE: tools/pnnx/src/storezip.h
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/src/utils.cpp
================================================
[File too large to display: 4.5 KB]

================================================
FILE: tools/pnnx/src/utils.h
================================================
[File too large to display: 496 B]

================================================
FILE: tools/pnnx/tests/CMakeLists.txt
================================================
[File too large to display: 12.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/CMakeLists.txt
================================================
[File too large to display: 7.9 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_adaptive_avg_pool1d.py
================================================
[File too large to display: 993 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_adaptive_avg_pool2d.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_adaptive_avg_pool3d.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_adaptive_max_pool1d.py
================================================
[File too large to display: 993 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_adaptive_max_pool2d.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_adaptive_max_pool3d.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_alpha_dropout.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_avg_pool1d.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_avg_pool2d.py
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_avg_pool3d.py
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_batch_norm.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_celu.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_conv1d.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_conv2d.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_conv3d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_conv_transpose1d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_conv_transpose2d.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_conv_transpose3d.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_dropout.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_dropout2d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_dropout3d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_elu.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_embedding.py
================================================
[File too large to display: 952 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_feature_alpha_dropout.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_fold.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_gelu.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_glu.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_grid_sample.py
================================================
[File too large to display: 5.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_group_norm.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_hardshrink.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_hardsigmoid.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_hardswish.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_hardtanh.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_interpolate.py
================================================
[File too large to display: 3.7 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_layer_norm.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_leaky_relu.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_local_response_norm.py
================================================
[File too large to display: 984 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_log_softmax.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_logsigmoid.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_max_pool1d.py
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_max_pool2d.py
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_max_pool3d.py
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_mish.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_normalize.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_pad.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_pixel_shuffle.py
================================================
[File too large to display: 913 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_pixel_unshuffle.py
================================================
[File too large to display: 929 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_prelu.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_relu.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_relu6.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_rms_norm.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_scaled_dot_product_attention.py
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_selu.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_sigmoid.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_silu.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_softmax.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_softshrink.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_tanh.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_unfold.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_upsample.py
================================================
[File too large to display: 2.7 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_upsample_bilinear.py
================================================
[File too large to display: 963 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_F_upsample_nearest.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_Tensor_expand.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_Tensor_permute.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_Tensor_repeat.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_Tensor_reshape.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_Tensor_reshape_as.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_Tensor_slice.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_Tensor_slice_copy.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_Tensor_unflatten.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_Tensor_view.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_convnext_tiny.py
================================================
[File too large to display: 863 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_mobilenet_v2.py
================================================
[File too large to display: 717 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_mobilenet_v3_small.py
================================================
[File too large to display: 747 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_ncnn_fuse_binaryop_eltwise.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_ncnn_fuse_pad_conv.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_ncnn_fuse_shufflechannel_slice.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_ncnn_fuse_transpose_matmul.py
================================================
[File too large to display: 2.7 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_ncnn_interp_expr.py
================================================
[File too large to display: 3.5 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_ncnn_numpy_binaryop_broadcast.py
================================================
[File too large to display: 8.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_ncnn_reshape_expr.py
================================================
[File too large to display: 3.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_ncnn_slice_expr.py
================================================
[File too large to display: 2.9 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_ncnn_solve_batch_index.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_AdaptiveAvgPool1d.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_AdaptiveAvgPool2d.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_AdaptiveAvgPool3d.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_AdaptiveMaxPool1d.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_AdaptiveMaxPool2d.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_AdaptiveMaxPool3d.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_AlphaDropout.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_AvgPool1d.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_AvgPool2d.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_AvgPool3d.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_BatchNorm1d.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_BatchNorm2d.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_BatchNorm3d.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_CELU.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_ChannelShuffle.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_ConstantPad1d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_ConstantPad2d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_ConstantPad3d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Conv1d.py
================================================
[File too large to display: 2.7 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Conv2d.py
================================================
[File too large to display: 2.8 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Conv3d.py
================================================
[File too large to display: 3.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_ConvTranspose1d.py
================================================
[File too large to display: 2.7 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_ConvTranspose2d.py
================================================
[File too large to display: 2.8 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_ConvTranspose3d.py
================================================
[File too large to display: 2.8 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Dropout.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Dropout2d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Dropout3d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_ELU.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Embedding.py
================================================
[File too large to display: 964 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Fold.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_GELU.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_GLU.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_GRU.py
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_GroupNorm.py
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Hardshrink.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Hardsigmoid.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Hardswish.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Hardtanh.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Identity.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_InstanceNorm2d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_LSTM.py
================================================
[File too large to display: 2.6 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_LayerNorm.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_LeakyReLU.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Linear.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_LocalResponseNorm.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_LogSigmoid.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_LogSoftmax.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_MaxPool1d.py
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_MaxPool2d.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_MaxPool3d.py
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Mish.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_MultiheadAttention.py
================================================
[File too large to display: 2.8 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_PReLU.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_PixelShuffle.py
================================================
[File too large to display: 978 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_PixelUnshuffle.py
================================================
[File too large to display: 1002 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_RMSNorm.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_RNN.py
================================================
[File too large to display: 2.6 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_ReLU.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_ReLU6.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_ReflectionPad1d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_ReflectionPad2d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_ReplicationPad1d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_ReplicationPad2d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_ReplicationPad3d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_SELU.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_SiLU.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Sigmoid.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Softmax.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Softmax2d.py
================================================
[File too large to display: 920 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Softshrink.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Tanh.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Unfold.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_Upsample.py
================================================
[File too large to display: 4.6 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_UpsamplingBilinear2d.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_UpsamplingNearest2d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_nn_ZeroPad2d.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_resnet18.py
================================================
[File too large to display: 697 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_shufflenet_v2_x1_0.py
================================================
[File too large to display: 747 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_squeezenet1_1.py
================================================
[File too large to display: 722 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_abs.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_acos.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_addmm.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_amax.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_amin.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_asin.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_atan.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_atan2.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_bmm.py
================================================
[File too large to display: 907 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_cat.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_ceil.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_chunk.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_clamp.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_clone.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_cos.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_cumsum.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_diag.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_einsum.py
================================================
[File too large to display: 5.6 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_exp.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_flatten.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_flip.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_floor.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_istft.py
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_log.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_log10.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_logsumexp.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_matmul.py
================================================
[File too large to display: 2.9 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_max.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_maximum.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_mean.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_min.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_minimum.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_mm.py
================================================
[File too large to display: 890 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_neg.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_norm.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_pow.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_prod.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_reciprocal.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_roll.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_round.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_rsqrt.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_sin.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_slice_scatter.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_sqrt.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_square.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_squeeze.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_stack.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_stft.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_sum.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_t.py
================================================
[File too large to display: 1006 B]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_tan.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_tanh.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_tensor_split.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_transpose.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_trunc.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_unbind.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torch_unsqueeze.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torchaudio_F_inverse_spectrogram.py
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torchaudio_F_spectrogram.py
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torchaudio_InverseSpectrogram.py
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torchaudio_Spectrogram.py
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_torchvision_DeformConv2d.py
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_transformers_deepseek_v3_attention.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_transformers_qwen2_attention.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_transformers_qwen3_attention.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/ncnn/test_vit_b_32.py
================================================
[File too large to display: 977 B]

================================================
FILE: tools/pnnx/tests/onnx/CMakeLists.txt
================================================
[File too large to display: 8.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_adaptive_avg_pool1d.py
================================================
[File too large to display: 929 B]

================================================
FILE: tools/pnnx/tests/onnx/test_F_adaptive_avg_pool2d.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_adaptive_avg_pool3d.py
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_adaptive_max_pool1d.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_adaptive_max_pool2d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_adaptive_max_pool3d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_avg_pool1d.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_avg_pool2d.py
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_avg_pool3d.py
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_batch_norm.py
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_celu.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_conv1d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_conv2d.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_conv3d.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_conv_transpose1d.py
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_conv_transpose2d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_conv_transpose3d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_elu.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_gelu.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_group_norm.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_hardshrink.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_hardsigmoid.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_hardswish.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_hardtanh.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_interpolate.py
================================================
[File too large to display: 7.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_layer_norm.py
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_leaky_relu.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_linear.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_local_response_norm.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_log_softmax.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_logsigmoid.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_max_pool1d.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_max_pool2d.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_max_pool3d.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_mish.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_normalize.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_pad.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_pixel_shuffle.py
================================================
[File too large to display: 987 B]

================================================
FILE: tools/pnnx/tests/onnx/test_F_pixel_unshuffle.py
================================================
[File too large to display: 1003 B]

================================================
FILE: tools/pnnx/tests/onnx/test_F_prelu.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_relu.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_relu6.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_scaled_dot_product_attention.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_selu.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_sigmoid.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_silu.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_softmax.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_softmin.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_softplus.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_softshrink.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_softsign.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_tanh.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_tanhshrink.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_upsample.py
================================================
[File too large to display: 5.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_upsample_bilinear.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_F_upsample_nearest.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_Tensor_expand.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_Tensor_permute.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_Tensor_repeat.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_Tensor_reshape.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_Tensor_reshape_as.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_Tensor_select.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_Tensor_slice.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_Tensor_unflatten.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_Tensor_view.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_convnext_tiny.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_mobilenet_v2.py
================================================
[File too large to display: 678 B]

================================================
FILE: tools/pnnx/tests/onnx/test_mobilenet_v3_small.py
================================================
[File too large to display: 823 B]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_AdaptiveAvgPool1d.py
================================================
[File too large to display: 996 B]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_AdaptiveAvgPool2d.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_AdaptiveAvgPool3d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_AdaptiveMaxPool1d.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_AdaptiveMaxPool2d.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_AdaptiveMaxPool3d.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_AvgPool1d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_AvgPool2d.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_AvgPool3d.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_BatchNorm1d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_BatchNorm2d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_BatchNorm3d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_CELU.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_ConstantPad1d.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_ConstantPad2d.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_ConstantPad3d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Conv1d.py
================================================
[File too large to display: 2.5 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Conv2d.py
================================================
[File too large to display: 2.5 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Conv3d.py
================================================
[File too large to display: 2.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_ConvTranspose1d.py
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_ConvTranspose2d.py
================================================
[File too large to display: 2.5 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_ConvTranspose3d.py
================================================
[File too large to display: 2.5 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_ELU.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_GELU.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_GRU.py
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_GroupNorm.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Hardshrink.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Hardsigmoid.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Hardswish.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Hardtanh.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_InstanceNorm1d.py
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_InstanceNorm2d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_InstanceNorm3d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_LSTM.py
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_LayerNorm.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_LeakyReLU.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Linear.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_LocalResponseNorm.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_LogSigmoid.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_LogSoftmax.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_MaxPool1d.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_MaxPool2d.py
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_MaxPool3d.py
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Mish.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_MultiheadAttention.py
================================================
[File too large to display: 5.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_PReLU.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_PixelShuffle.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_PixelUnshuffle.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_RNN.py
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_ReLU.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_ReLU6.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_ReflectionPad1d.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_ReflectionPad2d.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_ReplicationPad1d.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_ReplicationPad2d.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_ReplicationPad3d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_SELU.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_SiLU.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Sigmoid.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Softmax.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Softmin.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Softplus.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Softshrink.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Softsign.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Tanh.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Tanhshrink.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_Upsample.py
================================================
[File too large to display: 9.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_UpsamplingBilinear2d.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_UpsamplingNearest2d.py
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_nn_ZeroPad2d.py
================================================
[File too large to display: 1019 B]

================================================
FILE: tools/pnnx/tests/onnx/test_onnx_activation_ops.py
================================================
[File too large to display: 2.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_onnx_conv_ops.py
================================================
[File too large to display: 3.5 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_onnx_dense_ops.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_onnx_fuse_channel_shuffle.py
================================================
[File too large to display: 5.5 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_onnx_fuse_pixel_shuffle.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_onnx_fuse_pixel_unshuffle.py
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_onnx_layout_ops.py
================================================
[File too large to display: 2.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_onnx_math_ops.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_onnx_normalize_ops.py
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_onnx_opset21_ops.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_onnx_pool_ops.py
================================================
[File too large to display: 2.9 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_onnx_reduce_ops.py
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_onnx_rnn_ops.py
================================================
[File too large to display: 2.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_resnet18.py
================================================
[File too large to display: 658 B]

================================================
FILE: tools/pnnx/tests/onnx/test_shufflenet_v2_x1_0.py
================================================
[File too large to display: 708 B]

================================================
FILE: tools/pnnx/tests/onnx/test_squeezenet1_1.py
================================================
[File too large to display: 683 B]

================================================
FILE: tools/pnnx/tests/onnx/test_swin_t.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_cat.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_ceil.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_chunk.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_clamp.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_flatten.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_flip.py
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_floor.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_logical_and.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_logical_not.py
================================================
[File too large to display: 1022 B]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_logical_or.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_logical_xor.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_max.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_maximum.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_mean.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_min.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_minimum.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_norm.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_prod.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_roll.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_split.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_squeeze.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_stack.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_sum.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_transpose.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_unbind.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_torch_unsqueeze.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_albert_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_bart_attention.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_bert_attention.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_bert_generation_attention.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_blenderbot_attention.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_camembert_attention.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_chinese_clip_attention.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_clip_attention.py
================================================
[File too large to display: 5.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_ctrl_attention.py
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_deberta_attention.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_distilbert_attention.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_electra_attention.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_flaubert_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_fsmt_attention.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_funnel_attention.py
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_gpt2_attention.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_layoutlm_attention.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_lxmert_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_m2m_100_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_marian_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_mbart_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_mobilebert_attention.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_mt5_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_openai_attention.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_pegasus_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_prophetnet_attention.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_reformer_attention.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_roberta_attention.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_squeezebert_attention.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_t5_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_xlm_attention.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_transformers_xlm_roberta_attention.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/onnx/test_vit_b_32.py
================================================
[File too large to display: 945 B]

================================================
FILE: tools/pnnx/tests/run_test.cmake
================================================
[File too large to display: 270 B]

================================================
FILE: tools/pnnx/tests/test_F_adaptive_avg_pool1d.py
================================================
[File too large to display: 950 B]

================================================
FILE: tools/pnnx/tests/test_F_adaptive_avg_pool2d.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_adaptive_avg_pool3d.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_adaptive_max_pool1d.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_F_adaptive_max_pool2d.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_adaptive_max_pool3d.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_affine_grid.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_F_alpha_dropout.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_avg_pool1d.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_F_avg_pool2d.py
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_avg_pool3d.py
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_batch_norm.py
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/tests/test_F_celu.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_conv1d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_F_conv2d.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/test_F_conv3d.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/test_F_conv_transpose1d.py
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/tests/test_F_conv_transpose2d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_F_conv_transpose3d.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/test_F_dropout.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_dropout2d.py
================================================
[File too large to display: 999 B]

================================================
FILE: tools/pnnx/tests/test_F_dropout3d.py
================================================
[File too large to display: 1009 B]

================================================
FILE: tools/pnnx/tests/test_F_elu.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_embedding.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_F_feature_alpha_dropout.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_F_fold.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_gelu.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_F_glu.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_grid_sample.py
================================================
[File too large to display: 4.1 KB]

================================================
FILE: tools/pnnx/tests/test_F_group_norm.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/test_F_hardshrink.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_hardsigmoid.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/test_F_hardswish.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_F_hardtanh.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_instance_norm.py
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/tests/test_F_interpolate.py
================================================
[File too large to display: 5.5 KB]

================================================
FILE: tools/pnnx/tests/test_F_layer_norm.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/test_F_leaky_relu.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_linear.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_local_response_norm.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_F_log_softmax.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_logsigmoid.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_lp_pool1d.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_lp_pool2d.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_max_pool1d.py
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_max_pool2d.py
================================================
[File too large to display: 2.4 KB]

================================================
FILE: tools/pnnx/tests/test_F_max_pool3d.py
================================================
[File too large to display: 2.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_mish.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_F_normalize.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_pad.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/test_F_pairwise_distance.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_pixel_shuffle.py
================================================
[File too large to display: 895 B]

================================================
FILE: tools/pnnx/tests/test_F_pixel_unshuffle.py
================================================
[File too large to display: 911 B]

================================================
FILE: tools/pnnx/tests/test_F_prelu.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/test_F_relu.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_relu6.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_rms_norm.py
================================================
[File too large to display: 3.0 KB]

================================================
FILE: tools/pnnx/tests/test_F_rrelu.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_scaled_dot_product_attention.py
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/tests/test_F_selu.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_sigmoid.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_silu.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_softmax.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_softmin.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_softplus.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_softshrink.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_softsign.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_tanh.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_tanhshrink.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_F_threshold.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_F_unfold.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_F_upsample.py
================================================
[File too large to display: 3.5 KB]

================================================
FILE: tools/pnnx/tests/test_F_upsample_bilinear.py
================================================
[File too large to display: 945 B]

================================================
FILE: tools/pnnx/tests/test_F_upsample_nearest.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_expand.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_fill.py
================================================
[File too large to display: 947 B]

================================================
FILE: tools/pnnx/tests/test_Tensor_index.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_index_put.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_masked_fill.py
================================================
[File too large to display: 931 B]

================================================
FILE: tools/pnnx/tests/test_Tensor_new_empty.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_new_full.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_new_ones.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_new_zeros.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_permute.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_repeat.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_reshape.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_reshape_as.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_select.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_slice.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_slice_copy.py
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_to.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_type_as.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_unflatten.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_Tensor_view.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_convnext_tiny.py
================================================
[File too large to display: 979 B]

================================================
FILE: tools/pnnx/tests/test_ir_complex.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_mobilenet_v2.py
================================================
[File too large to display: 862 B]

================================================
FILE: tools/pnnx/tests/test_mobilenet_v3_small.py
================================================
[File too large to display: 898 B]

================================================
FILE: tools/pnnx/tests/test_nn_AdaptiveAvgPool1d.py
================================================
[File too large to display: 1017 B]

================================================
FILE: tools/pnnx/tests/test_nn_AdaptiveAvgPool2d.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_AdaptiveAvgPool3d.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_AdaptiveMaxPool1d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_AdaptiveMaxPool2d.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_nn_AdaptiveMaxPool3d.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_nn_AlphaDropout.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_AvgPool1d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_nn_AvgPool2d.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/test_nn_AvgPool3d.py
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_BatchNorm1d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_BatchNorm2d.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_nn_BatchNorm3d.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_nn_CELU.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_ChannelShuffle.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_ConstantPad1d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_ConstantPad2d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_ConstantPad3d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Conv1d.py
================================================
[File too large to display: 2.9 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Conv2d.py
================================================
[File too large to display: 2.9 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Conv3d.py
================================================
[File too large to display: 3.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_ConvTranspose1d.py
================================================
[File too large to display: 2.7 KB]

================================================
FILE: tools/pnnx/tests/test_nn_ConvTranspose2d.py
================================================
[File too large to display: 2.8 KB]

================================================
FILE: tools/pnnx/tests/test_nn_ConvTranspose3d.py
================================================
[File too large to display: 2.8 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Dropout.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Dropout2d.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Dropout3d.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_nn_ELU.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Embedding.py
================================================
[File too large to display: 935 B]

================================================
FILE: tools/pnnx/tests/test_nn_Fold.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_GELU.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_GLU.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_nn_GRU.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/test_nn_GroupNorm.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Hardshrink.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Hardsigmoid.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Hardswish.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Hardtanh.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Identity.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_InstanceNorm1d.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_InstanceNorm2d.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_nn_InstanceNorm3d.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_nn_LPPool1d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_nn_LPPool2d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_nn_LSTM.py
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_LayerNorm.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_nn_LeakyReLU.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Linear.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/test_nn_LocalResponseNorm.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_LogSigmoid.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_LogSoftmax.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_nn_MaxPool1d.py
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_MaxPool2d.py
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_MaxPool3d.py
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Mish.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_nn_MultiheadAttention.py
================================================
[File too large to display: 5.5 KB]

================================================
FILE: tools/pnnx/tests/test_nn_PReLU.py
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/tests/test_nn_PixelShuffle.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_PixelUnshuffle.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_RMSNorm.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_nn_RNN.py
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_RReLU.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_ReLU.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_ReLU6.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_ReflectionPad1d.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_nn_ReflectionPad2d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_ReplicationPad1d.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_nn_ReplicationPad2d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_ReplicationPad3d.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_nn_SELU.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_SiLU.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Sigmoid.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Softmax.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Softmax2d.py
================================================
[File too large to display: 902 B]

================================================
FILE: tools/pnnx/tests/test_nn_Softmin.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Softplus.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Softshrink.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Softsign.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Tanh.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Tanhshrink.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Threshold.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Unfold.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_nn_Upsample.py
================================================
[File too large to display: 7.4 KB]

================================================
FILE: tools/pnnx/tests/test_nn_UpsamplingBilinear2d.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_nn_UpsamplingNearest2d.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_nn_ZeroPad2d.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_eliminate_noop_cat.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_eliminate_noop_expand.py
================================================
[File too large to display: 9.2 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_eliminate_noop_math.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_eliminate_noop_upsample.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_expression.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fold_constant.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_adjacent_permute.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_adjacent_reshape.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_channel_shuffle.py
================================================
[File too large to display: 5.5 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_conv1d_batchnorm1d.py
================================================
[File too large to display: 3.3 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_conv2d_batchnorm2d.py
================================================
[File too large to display: 3.3 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_conv3d_batchnorm3d.py
================================================
[File too large to display: 3.5 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_convtranspose1d_batchnorm1d.py
================================================
[File too large to display: 3.0 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_convtranspose2d_batchnorm2d.py
================================================
[File too large to display: 3.0 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_convtranspose3d_batchnorm3d.py
================================================
[File too large to display: 3.1 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_input_unpack.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_layernorm.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_linear_batchnorm1d.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_multiheadattention.py
================================================
[File too large to display: 19.1 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_pad_conv1d.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_pad_conv2d.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_pixel_shuffle.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_pixel_unshuffle.py
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_rmsnorm.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_scaled_dot_product_attention.py
================================================
[File too large to display: 3.5 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_select_to_unbind.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_pnnx_fuse_slice_to_tensor_split.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_quantization_shufflenet_v2_x1_0.py
================================================
[File too large to display: 807 B]

================================================
FILE: tools/pnnx/tests/test_resnet18.py
================================================
[File too large to display: 838 B]

================================================
FILE: tools/pnnx/tests/test_shufflenet_v2_x1_0.py
================================================
[File too large to display: 898 B]

================================================
FILE: tools/pnnx/tests/test_squeezenet1_1.py
================================================
[File too large to display: 853 B]

================================================
FILE: tools/pnnx/tests/test_swin_t.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_torch_abs.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_acos.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_acosh.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_addmm.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_torch_amax.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_amin.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_arange.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_argmax.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_argmin.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_asin.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_asinh.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_atan.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_atan2.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_atanh.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_bitwise_and.py
================================================
[File too large to display: 961 B]

================================================
FILE: tools/pnnx/tests/test_torch_bitwise_left_shift.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_bitwise_not.py
================================================
[File too large to display: 931 B]

================================================
FILE: tools/pnnx/tests/test_torch_bitwise_or.py
================================================
[File too large to display: 956 B]

================================================
FILE: tools/pnnx/tests/test_torch_bitwise_right_shift.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_bitwise_xor.py
================================================
[File too large to display: 961 B]

================================================
FILE: tools/pnnx/tests/test_torch_bmm.py
================================================
[File too large to display: 889 B]

================================================
FILE: tools/pnnx/tests/test_torch_cat.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_ceil.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_chunk.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_torch_clamp.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_clone.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_torch_complex.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_cos.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_cosh.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_cross.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_cumprod.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_torch_cumsum.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_torch_diag.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_einsum.py
================================================
[File too large to display: 4.8 KB]

================================================
FILE: tools/pnnx/tests/test_torch_eq.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_exp.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_fft.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_fft2.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_fftn.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_hfft.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_hfft2.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_hfftn.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_ifft.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_ifft2.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_ifftn.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_ihfft.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_ihfft2.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_ihfftn.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_irfft.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_irfft2.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_irfftn.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_rfft.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_rfft2.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_fft_rfftn.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_flatten.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_flip.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_floor.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_full.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_full_like.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_gather.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_ge.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_gt.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_imag.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_index_select.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_istft.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_torch_le.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_lgamma.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_log.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_log10.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_logaddexp.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_logical_and.py
================================================
[File too large to display: 961 B]

================================================
FILE: tools/pnnx/tests/test_torch_logical_not.py
================================================
[File too large to display: 931 B]

================================================
FILE: tools/pnnx/tests/test_torch_logical_or.py
================================================
[File too large to display: 956 B]

================================================
FILE: tools/pnnx/tests/test_torch_logical_xor.py
================================================
[File too large to display: 961 B]

================================================
FILE: tools/pnnx/tests/test_torch_logsumexp.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_torch_lt.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_masked_select.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_matmul.py
================================================
[File too large to display: 2.9 KB]

================================================
FILE: tools/pnnx/tests/test_torch_max.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_torch_maximum.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_mean.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_min.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_torch_minimum.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_mm.py
================================================
[File too large to display: 872 B]

================================================
FILE: tools/pnnx/tests/test_torch_mv.py
================================================
[File too large to display: 821 B]

================================================
FILE: tools/pnnx/tests/test_torch_narrow.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_torch_ne.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_neg.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_norm.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_ones.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_ones_like.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_positive.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_torch_pow.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_prod.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_real.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_reciprocal.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_repeat_interleave.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_torch_roll.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_round.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_rsqrt.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_scatter_add.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_torch_sign.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_sin.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_sinh.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_slice_scatter.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_torch_split.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_torch_sqrt.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_square.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_squeeze.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_stack.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_torch_std.py
================================================
[File too large to display: 1.3 KB]

================================================
FILE: tools/pnnx/tests/test_torch_stft.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_torch_sum.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_t.py
================================================
[File too large to display: 944 B]

================================================
FILE: tools/pnnx/tests/test_torch_tan.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_tanh.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_tensor_split.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/test_torch_tile.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_torch_topk.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_transpose.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_trunc.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_unbind.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_torch_unsqueeze.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_torch_view_as_complex.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_view_as_real.py
================================================
[File too large to display: 1.2 KB]

================================================
FILE: tools/pnnx/tests/test_torch_where.py
================================================
[File too large to display: 1.0 KB]

================================================
FILE: tools/pnnx/tests/test_torch_zeros.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torch_zeros_like.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_torchaudio_F_inverse_spectrogram.py
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/tests/test_torchaudio_F_spectrogram.py
================================================
[File too large to display: 3.2 KB]

================================================
FILE: tools/pnnx/tests/test_torchaudio_InverseSpectrogram.py
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tools/pnnx/tests/test_torchaudio_Spectrogram.py
================================================
[File too large to display: 2.2 KB]

================================================
FILE: tools/pnnx/tests/test_torchvision_DeformConv2d.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_torchvision_RoIAlign.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_albert_attention.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_bart_attention.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_bert_attention.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_bert_generation_attention.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_blenderbot_attention.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_camembert_attention.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_chinese_clip_attention.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_clip_attention.py
================================================
[File too large to display: 5.1 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_ctrl_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_deberta_attention.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_deepseek_v3_attention.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_distilbert_attention.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_electra_attention.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_flaubert_attention.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_fsmt_attention.py
================================================
[File too large to display: 1.4 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_funnel_attention.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_gpt2_attention.py
================================================
[File too large to display: 1.5 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_layoutlm_attention.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_longformer_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_lxmert_attention.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_m2m_100_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_marian_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_mbart_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_mobilebert_attention.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_mt5_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_openai_attention.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_pegasus_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_prophetnet_attention.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_qwen2_attention.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_qwen3_attention.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_reformer_attention.py
================================================
[File too large to display: 2.0 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_roberta_attention.py
================================================
[File too large to display: 1.7 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_squeezebert_attention.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_t5_attention.py
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_xlm_attention.py
================================================
[File too large to display: 1.9 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_xlm_roberta_attention.py
================================================
[File too large to display: 1.8 KB]

================================================
FILE: tools/pnnx/tests/test_transformers_xlnet_attention.py
================================================
[File too large to display: 2.1 KB]

================================================
FILE: tools/pnnx/tests/test_vit_b_32.py
================================================
[File too large to display: 1.1 KB]

================================================
FILE: tools/pytorch/README.md
================================================
[File too large to display: 3.4 KB]

================================================
FILE: tools/quantize/CMakeLists.txt
================================================
[File too large to display: 1.6 KB]

================================================
FILE: tools/quantize/README.md
================================================
[File too large to display: 90 B]

================================================
FILE: tools/quantize/imreadwrite.cpp
================================================
[File too large to display: 4.2 KB]

================================================
FILE: tools/quantize/imreadwrite.h
================================================
[File too large to display: 3.2 KB]

================================================
FILE: tools/quantize/ncnn2int8.cpp
================================================
[File too large to display: 37.2 KB]

================================================
FILE: tools/quantize/ncnn2table.cpp
================================================
[File too large to display: 57.1 KB]

================================================
FILE: tools/quantize/npy.hpp
================================================
[File too large to display: 19.3 KB]

================================================
FILE: tools/tensorflow/readme.txt
================================================
[File too large to display: 120 B]